author	ed <ed@FreeBSD.org>	2009-06-02 17:52:33 +0000
committer	ed <ed@FreeBSD.org>	2009-06-02 17:52:33 +0000
commit	3277b69d734b9c90b44ebde4ede005717e2c3b2e (patch)
tree	64ba909838c23261cace781ece27d106134ea451 /lib
Import LLVM, at r72732.
Diffstat (limited to 'lib'; each entry lists file mode, path, and lines added)
-rw-r--r--lib/Analysis/AliasAnalysis.cpp248
-rw-r--r--lib/Analysis/AliasAnalysisCounter.cpp173
-rw-r--r--lib/Analysis/AliasAnalysisEvaluator.cpp246
-rw-r--r--lib/Analysis/AliasDebugger.cpp123
-rw-r--r--lib/Analysis/AliasSetTracker.cpp608
-rw-r--r--lib/Analysis/Analysis.cpp44
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp838
-rw-r--r--lib/Analysis/CFGPrinter.cpp221
-rw-r--r--lib/Analysis/CMakeLists.txt34
-rw-r--r--lib/Analysis/CaptureTracking.cpp112
-rw-r--r--lib/Analysis/ConstantFolding.cpp829
-rw-r--r--lib/Analysis/DbgInfoPrinter.cpp167
-rw-r--r--lib/Analysis/DebugInfo.cpp1079
-rw-r--r--lib/Analysis/IPA/Andersens.cpp2878
-rw-r--r--lib/Analysis/IPA/CMakeLists.txt7
-rw-r--r--lib/Analysis/IPA/CallGraph.cpp314
-rw-r--r--lib/Analysis/IPA/CallGraphSCCPass.cpp207
-rw-r--r--lib/Analysis/IPA/FindUsedTypes.cpp104
-rw-r--r--lib/Analysis/IPA/GlobalsModRef.cpp567
-rw-r--r--lib/Analysis/IPA/Makefile14
-rw-r--r--lib/Analysis/IVUsers.cpp391
-rw-r--r--lib/Analysis/InstCount.cpp86
-rw-r--r--lib/Analysis/Interval.cpp57
-rw-r--r--lib/Analysis/IntervalPartition.cpp114
-rw-r--r--lib/Analysis/LibCallAliasAnalysis.cpp141
-rw-r--r--lib/Analysis/LibCallSemantics.cpp65
-rw-r--r--lib/Analysis/LiveValues.cpp191
-rw-r--r--lib/Analysis/LoopInfo.cpp50
-rw-r--r--lib/Analysis/LoopPass.cpp340
-rw-r--r--lib/Analysis/LoopVR.cpp291
-rw-r--r--lib/Analysis/Makefile16
-rw-r--r--lib/Analysis/MemoryDependenceAnalysis.cpp1142
-rw-r--r--lib/Analysis/PostDominators.cpp94
-rw-r--r--lib/Analysis/ProfileInfo.cpp100
-rw-r--r--lib/Analysis/ProfileInfoLoader.cpp277
-rw-r--r--lib/Analysis/ProfileInfoLoaderPass.cpp92
-rw-r--r--lib/Analysis/ScalarEvolution.cpp3824
-rw-r--r--lib/Analysis/ScalarEvolutionExpander.cpp646
-rw-r--r--lib/Analysis/SparsePropagation.cpp331
-rw-r--r--lib/Analysis/Trace.cpp50
-rw-r--r--lib/Analysis/ValueTracking.cpp1079
-rw-r--r--lib/Archive/Archive.cpp266
-rw-r--r--lib/Archive/ArchiveInternals.h85
-rw-r--r--lib/Archive/ArchiveReader.cpp627
-rw-r--r--lib/Archive/ArchiveWriter.cpp482
-rw-r--r--lib/Archive/CMakeLists.txt5
-rw-r--r--lib/Archive/Makefile17
-rw-r--r--lib/AsmParser/CMakeLists.txt6
-rw-r--r--lib/AsmParser/LLLexer.cpp835
-rw-r--r--lib/AsmParser/LLLexer.h84
-rw-r--r--lib/AsmParser/LLParser.cpp3279
-rw-r--r--lib/AsmParser/LLParser.h276
-rw-r--r--lib/AsmParser/LLToken.h130
-rw-r--r--lib/AsmParser/Makefile14
-rw-r--r--lib/AsmParser/Parser.cpp87
-rw-r--r--lib/Bitcode/Makefile14
-rw-r--r--lib/Bitcode/Reader/BitReader.cpp51
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp2126
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.h214
-rw-r--r--lib/Bitcode/Reader/CMakeLists.txt7
-rw-r--r--lib/Bitcode/Reader/Deserialize.cpp454
-rw-r--r--lib/Bitcode/Reader/DeserializeAPFloat.cpp24
-rw-r--r--lib/Bitcode/Reader/DeserializeAPInt.cpp33
-rw-r--r--lib/Bitcode/Reader/Makefile15
-rw-r--r--lib/Bitcode/Writer/BitWriter.cpp58
-rw-r--r--lib/Bitcode/Writer/BitcodeWriter.cpp1449
-rw-r--r--lib/Bitcode/Writer/BitcodeWriterPass.cpp56
-rw-r--r--lib/Bitcode/Writer/CMakeLists.txt9
-rw-r--r--lib/Bitcode/Writer/Makefile15
-rw-r--r--lib/Bitcode/Writer/Serialize.cpp118
-rw-r--r--lib/Bitcode/Writer/SerializeAPFloat.cpp21
-rw-r--r--lib/Bitcode/Writer/SerializeAPInt.cpp31
-rw-r--r--lib/Bitcode/Writer/ValueEnumerator.cpp347
-rw-r--r--lib/Bitcode/Writer/ValueEnumerator.h127
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinter.cpp1724
-rw-r--r--lib/CodeGen/AsmPrinter/CMakeLists.txt10
-rw-r--r--lib/CodeGen/AsmPrinter/DIE.cpp518
-rw-r--r--lib/CodeGen/AsmPrinter/DIE.h549
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.cpp2610
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.h561
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfException.cpp706
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfException.h178
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfLabel.cpp35
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfLabel.h56
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfPrinter.cpp235
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfPrinter.h153
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfWriter.cpp129
-rw-r--r--lib/CodeGen/AsmPrinter/Makefile15
-rw-r--r--lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp160
-rw-r--r--lib/CodeGen/BranchFolding.cpp1204
-rw-r--r--lib/CodeGen/CMakeLists.txt62
-rw-r--r--lib/CodeGen/CodePlacementOpt.cpp358
-rw-r--r--lib/CodeGen/DeadMachineInstructionElim.cpp161
-rw-r--r--lib/CodeGen/DwarfEHPrepare.cpp397
-rw-r--r--lib/CodeGen/ELFWriter.cpp575
-rw-r--r--lib/CodeGen/ELFWriter.h230
-rw-r--r--lib/CodeGen/GCMetadata.cpp212
-rw-r--r--lib/CodeGen/GCMetadataPrinter.cpp30
-rw-r--r--lib/CodeGen/GCStrategy.cpp392
-rw-r--r--lib/CodeGen/IfConversion.cpp1229
-rw-r--r--lib/CodeGen/IntrinsicLowering.cpp892
-rw-r--r--lib/CodeGen/LLVMTargetMachine.cpp289
-rw-r--r--lib/CodeGen/LatencyPriorityQueue.cpp114
-rw-r--r--lib/CodeGen/LiveInterval.cpp853
-rw-r--r--lib/CodeGen/LiveIntervalAnalysis.cpp2298
-rw-r--r--lib/CodeGen/LiveStackAnalysis.cpp66
-rw-r--r--lib/CodeGen/LiveVariables.cpp695
-rw-r--r--lib/CodeGen/LowerSubregs.cpp292
-rw-r--r--lib/CodeGen/MachOWriter.cpp976
-rw-r--r--lib/CodeGen/MachOWriter.h629
-rw-r--r--lib/CodeGen/MachineBasicBlock.cpp372
-rw-r--r--lib/CodeGen/MachineDominators.cpp53
-rw-r--r--lib/CodeGen/MachineFunction.cpp598
-rw-r--r--lib/CodeGen/MachineInstr.cpp1105
-rw-r--r--lib/CodeGen/MachineLICM.cpp406
-rw-r--r--lib/CodeGen/MachineLoopInfo.cpp40
-rw-r--r--lib/CodeGen/MachineModuleInfo.cpp368
-rw-r--r--lib/CodeGen/MachinePassRegistry.cpp41
-rw-r--r--lib/CodeGen/MachineRegisterInfo.cpp125
-rw-r--r--lib/CodeGen/MachineSink.cpp257
-rw-r--r--lib/CodeGen/MachineVerifier.cpp690
-rw-r--r--lib/CodeGen/Makefile22
-rw-r--r--lib/CodeGen/OcamlGC.cpp38
-rw-r--r--lib/CodeGen/PBQP.cpp1395
-rw-r--r--lib/CodeGen/PBQP.h284
-rw-r--r--lib/CodeGen/PHIElimination.cpp431
-rw-r--r--lib/CodeGen/Passes.cpp54
-rw-r--r--lib/CodeGen/PostRASchedulerList.cpp941
-rw-r--r--lib/CodeGen/PreAllocSplitting.cpp1485
-rw-r--r--lib/CodeGen/PrologEpilogInserter.cpp679
-rw-r--r--lib/CodeGen/PrologEpilogInserter.h167
-rw-r--r--lib/CodeGen/PseudoSourceValue.cpp92
-rw-r--r--lib/CodeGen/README.txt208
-rw-r--r--lib/CodeGen/RegAllocBigBlock.cpp892
-rw-r--r--lib/CodeGen/RegAllocLinearScan.cpp1535
-rw-r--r--lib/CodeGen/RegAllocLocal.cpp1068
-rw-r--r--lib/CodeGen/RegAllocPBQP.cpp871
-rw-r--r--lib/CodeGen/RegAllocSimple.cpp257
-rw-r--r--lib/CodeGen/RegisterCoalescer.cpp41
-rw-r--r--lib/CodeGen/RegisterScavenging.cpp480
-rw-r--r--lib/CodeGen/ScheduleDAG.cpp572
-rw-r--r--lib/CodeGen/ScheduleDAGEmit.cpp71
-rw-r--r--lib/CodeGen/ScheduleDAGInstrs.cpp468
-rw-r--r--lib/CodeGen/ScheduleDAGInstrs.h184
-rw-r--r--lib/CodeGen/ScheduleDAGPrinter.cpp97
-rw-r--r--lib/CodeGen/SelectionDAG/CMakeLists.txt22
-rw-r--r--lib/CodeGen/SelectionDAG/CallingConvLower.cpp148
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp6203
-rw-r--r--lib/CodeGen/SelectionDAG/FastISel.cpp1033
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp3091
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp1388
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp2382
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.cpp1074
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.h736
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp453
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp335
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp2151
-rw-r--r--lib/CodeGen/SelectionDAG/Makefile15
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp635
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGList.cpp268
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp1533
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp294
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h179
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodesEmit.cpp668
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp5743
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp6052
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuild.h558
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp1347
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp416
-rw-r--r--lib/CodeGen/SelectionDAG/TargetLowering.cpp2592
-rw-r--r--lib/CodeGen/ShadowStackGC.cpp439
-rw-r--r--lib/CodeGen/ShrinkWrapping.cpp1141
-rw-r--r--lib/CodeGen/SimpleRegisterCoalescing.cpp2827
-rw-r--r--lib/CodeGen/SimpleRegisterCoalescing.h313
-rw-r--r--lib/CodeGen/Spiller.cpp229
-rw-r--r--lib/CodeGen/Spiller.h37
-rw-r--r--lib/CodeGen/StackProtector.cpp224
-rw-r--r--lib/CodeGen/StackSlotColoring.cpp733
-rw-r--r--lib/CodeGen/StrongPHIElimination.cpp1053
-rw-r--r--lib/CodeGen/TargetInstrInfoImpl.cpp194
-rw-r--r--lib/CodeGen/TwoAddressInstructionPass.cpp997
-rw-r--r--lib/CodeGen/UnreachableBlockElim.cpp199
-rw-r--r--lib/CodeGen/VirtRegMap.cpp269
-rw-r--r--lib/CodeGen/VirtRegMap.h495
-rw-r--r--lib/CodeGen/VirtRegRewriter.cpp2225
-rw-r--r--lib/CodeGen/VirtRegRewriter.h56
-rw-r--r--lib/CompilerDriver/Action.cpp78
-rw-r--r--lib/CompilerDriver/CMakeLists.txt10
-rw-r--r--lib/CompilerDriver/CompilationGraph.cpp536
-rw-r--r--lib/CompilerDriver/Makefile19
-rw-r--r--lib/CompilerDriver/Plugin.cpp73
-rw-r--r--lib/CompilerDriver/Tool.cpp74
-rw-r--r--lib/Debugger/CMakeLists.txt10
-rw-r--r--lib/Debugger/Debugger.cpp230
-rw-r--r--lib/Debugger/Makefile16
-rw-r--r--lib/Debugger/ProgramInfo.cpp377
-rw-r--r--lib/Debugger/README.txt7
-rw-r--r--lib/Debugger/RuntimeInfo.cpp69
-rw-r--r--lib/Debugger/SourceFile.cpp82
-rw-r--r--lib/Debugger/SourceLanguage-CFamily.cpp28
-rw-r--r--lib/Debugger/SourceLanguage-CPlusPlus.cpp27
-rw-r--r--lib/Debugger/SourceLanguage-Unknown.cpp138
-rw-r--r--lib/Debugger/SourceLanguage.cpp54
-rw-r--r--lib/ExecutionEngine/CMakeLists.txt4
-rw-r--r--lib/ExecutionEngine/ExecutionEngine.cpp1010
-rw-r--r--lib/ExecutionEngine/ExecutionEngineBindings.cpp206
-rw-r--r--lib/ExecutionEngine/Interpreter/CMakeLists.txt5
-rw-r--r--lib/ExecutionEngine/Interpreter/Execution.cpp1382
-rw-r--r--lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp542
-rw-r--r--lib/ExecutionEngine/Interpreter/Interpreter.cpp104
-rw-r--r--lib/ExecutionEngine/Interpreter/Interpreter.h241
-rw-r--r--lib/ExecutionEngine/Interpreter/Makefile12
-rw-r--r--lib/ExecutionEngine/JIT/CMakeLists.txt11
-rw-r--r--lib/ExecutionEngine/JIT/Intercept.cpp148
-rw-r--r--lib/ExecutionEngine/JIT/JIT.cpp708
-rw-r--r--lib/ExecutionEngine/JIT/JIT.h176
-rw-r--r--lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp1056
-rw-r--r--lib/ExecutionEngine/JIT/JITDwarfEmitter.h87
-rw-r--r--lib/ExecutionEngine/JIT/JITEmitter.cpp1615
-rw-r--r--lib/ExecutionEngine/JIT/JITMemoryManager.cpp541
-rw-r--r--lib/ExecutionEngine/JIT/Makefile37
-rw-r--r--lib/ExecutionEngine/JIT/TargetSelect.cpp83
-rw-r--r--lib/ExecutionEngine/Makefile13
-rw-r--r--lib/Linker/CMakeLists.txt6
-rw-r--r--lib/Linker/LinkArchives.cpp201
-rw-r--r--lib/Linker/LinkItems.cpp238
-rw-r--r--lib/Linker/LinkModules.cpp1328
-rw-r--r--lib/Linker/Linker.cpp178
-rw-r--r--lib/Linker/Makefile15
-rw-r--r--lib/Makefile15
-rw-r--r--lib/Support/APFloat.cpp2950
-rw-r--r--lib/Support/APInt.cpp2816
-rw-r--r--lib/Support/APSInt.cpp23
-rw-r--r--lib/Support/Allocator.cpp141
-rw-r--r--lib/Support/Annotation.cpp115
-rw-r--r--lib/Support/CMakeLists.txt31
-rw-r--r--lib/Support/CommandLine.cpp1184
-rw-r--r--lib/Support/ConstantRange.cpp472
-rw-r--r--lib/Support/Debug.cpp77
-rw-r--r--lib/Support/Dwarf.cpp589
-rw-r--r--lib/Support/FileUtilities.cpp263
-rw-r--r--lib/Support/FoldingSet.cpp378
-rw-r--r--lib/Support/GraphWriter.cpp89
-rw-r--r--lib/Support/IsInf.cpp49
-rw-r--r--lib/Support/IsNAN.cpp33
-rw-r--r--lib/Support/Makefile17
-rw-r--r--lib/Support/ManagedStatic.cpp91
-rw-r--r--lib/Support/MemoryBuffer.cpp279
-rw-r--r--lib/Support/PluginLoader.cpp43
-rw-r--r--lib/Support/PrettyStackTrace.cpp108
-rw-r--r--lib/Support/SlowOperationInformer.cpp66
-rw-r--r--lib/Support/SmallPtrSet.cpp223
-rw-r--r--lib/Support/Statistic.cpp126
-rw-r--r--lib/Support/Streams.cpp30
-rw-r--r--lib/Support/StringExtras.cpp114
-rw-r--r--lib/Support/StringMap.cpp234
-rw-r--r--lib/Support/StringPool.cpp35
-rw-r--r--lib/Support/SystemUtils.cpp58
-rw-r--r--lib/Support/Timer.cpp387
-rw-r--r--lib/Support/Triple.cpp187
-rw-r--r--lib/Support/raw_ostream.cpp376
-rw-r--r--lib/System/Alarm.cpp33
-rw-r--r--lib/System/Atomic.cpp53
-rw-r--r--lib/System/CMakeLists.txt19
-rw-r--r--lib/System/Disassembler.cpp79
-rw-r--r--lib/System/DynamicLibrary.cpp165
-rw-r--r--lib/System/Host.cpp24
-rw-r--r--lib/System/IncludeFile.cpp20
-rw-r--r--lib/System/LICENSE.TXT6
-rw-r--r--lib/System/Makefile19
-rw-r--r--lib/System/Memory.cpp62
-rw-r--r--lib/System/Mutex.cpp160
-rw-r--r--lib/System/Path.cpp287
-rw-r--r--lib/System/Process.cpp33
-rw-r--r--lib/System/Program.cpp33
-rw-r--r--lib/System/README.txt43
-rw-r--r--lib/System/Signals.cpp34
-rw-r--r--lib/System/TimeValue.cpp58
-rw-r--r--lib/System/Unix/Alarm.inc72
-rw-r--r--lib/System/Unix/Host.inc58
-rw-r--r--lib/System/Unix/Memory.inc150
-rw-r--r--lib/System/Unix/Mutex.inc49
-rw-r--r--lib/System/Unix/Path.inc876
-rw-r--r--lib/System/Unix/Process.inc237
-rw-r--r--lib/System/Unix/Program.inc287
-rw-r--r--lib/System/Unix/README.txt16
-rw-r--r--lib/System/Unix/Signals.inc230
-rw-r--r--lib/System/Unix/TimeValue.inc56
-rw-r--r--lib/System/Unix/Unix.h104
-rw-r--r--lib/System/Win32/Alarm.inc43
-rw-r--r--lib/System/Win32/DynamicLibrary.inc219
-rw-r--r--lib/System/Win32/Host.inc23
-rw-r--r--lib/System/Win32/Memory.inc72
-rw-r--r--lib/System/Win32/Mutex.inc58
-rw-r--r--lib/System/Win32/Path.inc825
-rw-r--r--lib/System/Win32/Process.inc150
-rw-r--r--lib/System/Win32/Program.inc316
-rw-r--r--lib/System/Win32/Signals.inc270
-rw-r--r--lib/System/Win32/TimeValue.inc51
-rw-r--r--lib/System/Win32/Win32.h57
-rw-r--r--lib/Target/ARM/ARM.h121
-rw-r--r--lib/Target/ARM/ARM.td136
-rw-r--r--lib/Target/ARM/ARMAddressingModes.h394
-rw-r--r--lib/Target/ARM/ARMBuildAttrs.h64
-rw-r--r--lib/Target/ARM/ARMCallingConv.td87
-rw-r--r--lib/Target/ARM/ARMCodeEmitter.cpp1411
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp1285
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.cpp100
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.h92
-rw-r--r--lib/Target/ARM/ARMFrameInfo.h32
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp911
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp2346
-rw-r--r--lib/Target/ARM/ARMISelLowering.h184
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td868
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp1025
-rw-r--r--lib/Target/ARM/ARMInstrInfo.h258
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td1390
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td562
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td12
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td398
-rw-r--r--lib/Target/ARM/ARMJITInfo.cpp298
-rw-r--r--lib/Target/ARM/ARMJITInfo.h178
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp778
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h238
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.cpp1528
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.h102
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td221
-rw-r--r--lib/Target/ARM/ARMRelocations.h56
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp84
-rw-r--r--lib/Target/ARM/ARMSubtarget.h122
-rw-r--r--lib/Target/ARM/ARMTargetAsmInfo.cpp291
-rw-r--r--lib/Target/ARM/ARMTargetAsmInfo.h64
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp242
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h104
-rw-r--r--lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp1117
-rw-r--r--lib/Target/ARM/AsmPrinter/CMakeLists.txt9
-rw-r--r--lib/Target/ARM/AsmPrinter/Makefile15
-rw-r--r--lib/Target/ARM/CMakeLists.txt27
-rw-r--r--lib/Target/ARM/Makefile23
-rw-r--r--lib/Target/ARM/README-Thumb.txt228
-rw-r--r--lib/Target/ARM/README.txt554
-rw-r--r--lib/Target/Alpha/Alpha.h51
-rw-r--r--lib/Target/Alpha/Alpha.td66
-rw-r--r--lib/Target/Alpha/AlphaBranchSelector.cpp67
-rw-r--r--lib/Target/Alpha/AlphaCodeEmitter.cpp242
-rw-r--r--lib/Target/Alpha/AlphaISelDAGToDAG.cpp553
-rw-r--r--lib/Target/Alpha/AlphaISelLowering.cpp798
-rw-r--r--lib/Target/Alpha/AlphaISelLowering.h114
-rw-r--r--lib/Target/Alpha/AlphaInstrFormats.td268
-rw-r--r--lib/Target/Alpha/AlphaInstrInfo.cpp450
-rw-r--r--lib/Target/Alpha/AlphaInstrInfo.h97
-rw-r--r--lib/Target/Alpha/AlphaInstrInfo.td1137
-rw-r--r--lib/Target/Alpha/AlphaJITInfo.cpp307
-rw-r--r--lib/Target/Alpha/AlphaJITInfo.h47
-rw-r--r--lib/Target/Alpha/AlphaLLRP.cpp158
-rw-r--r--lib/Target/Alpha/AlphaRegisterInfo.cpp335
-rw-r--r--lib/Target/Alpha/AlphaRegisterInfo.h67
-rw-r--r--lib/Target/Alpha/AlphaRegisterInfo.td171
-rw-r--r--lib/Target/Alpha/AlphaRelocations.h31
-rw-r--r--lib/Target/Alpha/AlphaSchedule.td84
-rw-r--r--lib/Target/Alpha/AlphaSubtarget.cpp25
-rw-r--r--lib/Target/Alpha/AlphaSubtarget.h47
-rw-r--r--lib/Target/Alpha/AlphaTargetAsmInfo.cpp31
-rw-r--r--lib/Target/Alpha/AlphaTargetAsmInfo.h32
-rw-r--r--lib/Target/Alpha/AlphaTargetMachine.cpp126
-rw-r--r--lib/Target/Alpha/AlphaTargetMachine.h82
-rw-r--r--lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp305
-rw-r--r--lib/Target/Alpha/AsmPrinter/CMakeLists.txt9
-rw-r--r--lib/Target/Alpha/AsmPrinter/Makefile15
-rw-r--r--lib/Target/Alpha/CMakeLists.txt25
-rw-r--r--lib/Target/Alpha/Makefile22
-rw-r--r--lib/Target/Alpha/README.txt42
-rw-r--r--lib/Target/CBackend/CBackend.cpp3601
-rw-r--r--lib/Target/CBackend/CMakeLists.txt3
-rw-r--r--lib/Target/CBackend/CTargetMachine.h43
-rw-r--r--lib/Target/CBackend/Makefile14
-rw-r--r--lib/Target/CMakeLists.txt17
-rw-r--r--lib/Target/CellSPU/AsmPrinter/CMakeLists.txt12
-rw-r--r--lib/Target/CellSPU/AsmPrinter/Makefile17
-rw-r--r--lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp623
-rw-r--r--lib/Target/CellSPU/CMakeLists.txt24
-rw-r--r--lib/Target/CellSPU/CellSDKIntrinsics.td448
-rw-r--r--lib/Target/CellSPU/Makefile22
-rw-r--r--lib/Target/CellSPU/README.txt90
-rw-r--r--lib/Target/CellSPU/SPU.h102
-rw-r--r--lib/Target/CellSPU/SPU.td66
-rw-r--r--lib/Target/CellSPU/SPU128InstrInfo.td41
-rw-r--r--lib/Target/CellSPU/SPU64InstrInfo.td394
-rw-r--r--lib/Target/CellSPU/SPUCallingConv.td115
-rw-r--r--lib/Target/CellSPU/SPUFrameInfo.cpp29
-rw-r--r--lib/Target/CellSPU/SPUFrameInfo.h79
-rw-r--r--lib/Target/CellSPU/SPUHazardRecognizers.cpp138
-rw-r--r--lib/Target/CellSPU/SPUHazardRecognizers.h41
-rw-r--r--lib/Target/CellSPU/SPUISelDAGToDAG.cpp1244
-rw-r--r--lib/Target/CellSPU/SPUISelLowering.cpp2980
-rw-r--r--lib/Target/CellSPU/SPUISelLowering.h154
-rw-r--r--lib/Target/CellSPU/SPUInstrBuilder.h43
-rw-r--r--lib/Target/CellSPU/SPUInstrFormats.td298
-rw-r--r--lib/Target/CellSPU/SPUInstrInfo.cpp693
-rw-r--r--lib/Target/CellSPU/SPUInstrInfo.h114
-rw-r--r--lib/Target/CellSPU/SPUInstrInfo.td4614
-rw-r--r--lib/Target/CellSPU/SPUMachineFunction.h43
-rw-r--r--lib/Target/CellSPU/SPUMathInstr.td97
-rw-r--r--lib/Target/CellSPU/SPUNodes.td156
-rw-r--r--lib/Target/CellSPU/SPUOperands.td655
-rw-r--r--lib/Target/CellSPU/SPURegisterInfo.cpp614
-rw-r--r--lib/Target/CellSPU/SPURegisterInfo.h101
-rw-r--r--lib/Target/CellSPU/SPURegisterInfo.td429
-rw-r--r--lib/Target/CellSPU/SPURegisterNames.h18
-rw-r--r--lib/Target/CellSPU/SPUSchedule.td57
-rw-r--r--lib/Target/CellSPU/SPUSubtarget.cpp40
-rw-r--r--lib/Target/CellSPU/SPUSubtarget.h95
-rw-r--r--lib/Target/CellSPU/SPUTargetAsmInfo.cpp74
-rw-r--r--lib/Target/CellSPU/SPUTargetAsmInfo.h51
-rw-r--r--lib/Target/CellSPU/SPUTargetMachine.cpp98
-rw-r--r--lib/Target/CellSPU/SPUTargetMachine.h95
-rw-r--r--lib/Target/CppBackend/CMakeLists.txt3
-rw-r--r--lib/Target/CppBackend/CPPBackend.cpp2007
-rw-r--r--lib/Target/CppBackend/CPPTargetMachine.h44
-rw-r--r--lib/Target/CppBackend/Makefile14
-rw-r--r--lib/Target/DarwinTargetAsmInfo.cpp169
-rw-r--r--lib/Target/ELFTargetAsmInfo.cpp227
-rw-r--r--lib/Target/IA64/AsmPrinter/CMakeLists.txt12
-rw-r--r--lib/Target/IA64/AsmPrinter/IA64AsmPrinter.cpp376
-rw-r--r--lib/Target/IA64/AsmPrinter/Makefile17
-rw-r--r--lib/Target/IA64/CMakeLists.txt20
-rw-r--r--lib/Target/IA64/IA64.h58
-rw-r--r--lib/Target/IA64/IA64.td39
-rw-r--r--lib/Target/IA64/IA64Bundling.cpp118
-rw-r--r--lib/Target/IA64/IA64ISelDAGToDAG.cpp575
-rw-r--r--lib/Target/IA64/IA64ISelLowering.cpp622
-rw-r--r--lib/Target/IA64/IA64ISelLowering.h76
-rw-r--r--lib/Target/IA64/IA64InstrBuilder.h40
-rw-r--r--lib/Target/IA64/IA64InstrFormats.td80
-rw-r--r--lib/Target/IA64/IA64InstrInfo.cpp193
-rw-r--r--lib/Target/IA64/IA64InstrInfo.h70
-rw-r--r--lib/Target/IA64/IA64InstrInfo.td751
-rw-r--r--lib/Target/IA64/IA64MachineFunctionInfo.h34
-rw-r--r--lib/Target/IA64/IA64RegisterInfo.cpp319
-rw-r--r--lib/Target/IA64/IA64RegisterInfo.h63
-rw-r--r--lib/Target/IA64/IA64RegisterInfo.td509
-rw-r--r--lib/Target/IA64/IA64Subtarget.cpp18
-rw-r--r--lib/Target/IA64/IA64Subtarget.h28
-rw-r--r--lib/Target/IA64/IA64TargetAsmInfo.cpp44
-rw-r--r--lib/Target/IA64/IA64TargetAsmInfo.h33
-rw-r--r--lib/Target/IA64/IA64TargetMachine.cpp94
-rw-r--r--lib/Target/IA64/IA64TargetMachine.h64
-rw-r--r--lib/Target/IA64/Makefile20
-rw-r--r--lib/Target/IA64/README48
-rw-r--r--lib/Target/MSIL/CMakeLists.txt3
-rw-r--r--lib/Target/MSIL/MSILWriter.cpp1680
-rw-r--r--lib/Target/MSIL/MSILWriter.h255
-rw-r--r--lib/Target/MSIL/Makefile14
-rw-r--r--lib/Target/MSIL/README.TXT26
-rw-r--r--lib/Target/MSP430/CMakeLists.txt23
-rw-r--r--lib/Target/MSP430/MSP430.h40
-rw-r--r--lib/Target/MSP430/MSP430.td60
-rw-r--r--lib/Target/MSP430/MSP430AsmPrinter.cpp267
-rw-r--r--lib/Target/MSP430/MSP430CallingConv.td37
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp194
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp670
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.h103
-rw-r--r--lib/Target/MSP430/MSP430InstrFormats.td67
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.cpp177
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.h84
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.td901
-rw-r--r--lib/Target/MSP430/MSP430MachineFunctionInfo.h39
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp355
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.h70
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.td122
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.cpp27
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.h41
-rw-r--r--lib/Target/MSP430/MSP430TargetAsmInfo.cpp22
-rw-r--r--lib/Target/MSP430/MSP430TargetAsmInfo.h31
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.cpp76
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.h68
-rw-r--r--lib/Target/MSP430/Makefile21
-rw-r--r--lib/Target/MSP430/README.txt42
-rw-r--r--lib/Target/Makefile20
-rw-r--r--lib/Target/Mips/AsmPrinter/CMakeLists.txt12
-rw-r--r--lib/Target/Mips/AsmPrinter/Makefile17
-rw-r--r--lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp580
-rw-r--r--lib/Target/Mips/CMakeLists.txt22
-rw-r--r--lib/Target/Mips/Makefile23
-rw-r--r--lib/Target/Mips/Mips.h41
-rw-r--r--lib/Target/Mips/Mips.td88
-rw-r--r--lib/Target/Mips/MipsCallingConv.td86
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp77
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp392
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp1254
-rw-r--r--lib/Target/Mips/MipsISelLowering.h130
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td304
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td182
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp623
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h223
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td707
-rw-r--r--lib/Target/Mips/MipsMachineFunction.h131
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp535
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.h78
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td252
-rw-r--r--lib/Target/Mips/MipsSchedule.td63
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp77
-rw-r--r--lib/Target/Mips/MipsSubtarget.h139
-rw-r--r--lib/Target/Mips/MipsTargetAsmInfo.cpp98
-rw-r--r--lib/Target/Mips/MipsTargetAsmInfo.h51
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp133
-rw-r--r--lib/Target/Mips/MipsTargetMachine.h80
-rw-r--r--lib/Target/PIC16/CMakeLists.txt24
-rw-r--r--lib/Target/PIC16/Makefile21
-rw-r--r--lib/Target/PIC16/PIC16.h345
-rw-r--r--lib/Target/PIC16/PIC16.td40
-rw-r--r--lib/Target/PIC16/PIC16AsmPrinter.cpp404
-rw-r--r--lib/Target/PIC16/PIC16AsmPrinter.h70
-rw-r--r--lib/Target/PIC16/PIC16DebugInfo.cpp270
-rw-r--r--lib/Target/PIC16/PIC16DebugInfo.h114
-rw-r--r--lib/Target/PIC16/PIC16ISelDAGToDAG.cpp59
-rw-r--r--lib/Target/PIC16/PIC16ISelDAGToDAG.h60
-rw-r--r--lib/Target/PIC16/PIC16ISelLowering.cpp1756
-rw-r--r--lib/Target/PIC16/PIC16ISelLowering.h227
-rw-r--r--lib/Target/PIC16/PIC16InstrFormats.td117
-rw-r--r--lib/Target/PIC16/PIC16InstrInfo.cpp186
-rw-r--r--lib/Target/PIC16/PIC16InstrInfo.h70
-rw-r--r--lib/Target/PIC16/PIC16InstrInfo.td522
-rw-r--r--lib/Target/PIC16/PIC16MemSelOpt.cpp169
-rw-r--r--lib/Target/PIC16/PIC16RegisterInfo.cpp91
-rw-r--r--lib/Target/PIC16/PIC16RegisterInfo.h68
-rw-r--r--lib/Target/PIC16/PIC16RegisterInfo.td33
-rw-r--r--lib/Target/PIC16/PIC16Subtarget.cpp27
-rw-r--r--lib/Target/PIC16/PIC16Subtarget.h45
-rw-r--r--lib/Target/PIC16/PIC16TargetAsmInfo.cpp264
-rw-r--r--lib/Target/PIC16/PIC16TargetAsmInfo.h79
-rw-r--r--lib/Target/PIC16/PIC16TargetMachine.cpp79
-rw-r--r--lib/Target/PIC16/PIC16TargetMachine.h76
-rw-r--r--lib/Target/PowerPC/AsmPrinter/CMakeLists.txt9
-rw-r--r--lib/Target/PowerPC/AsmPrinter/Makefile15
-rw-r--r--lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp1204
-rw-r--r--lib/Target/PowerPC/CMakeLists.txt28
-rw-r--r--lib/Target/PowerPC/Makefile22
-rw-r--r--lib/Target/PowerPC/PPC.h49
-rw-r--r--lib/Target/PowerPC/PPC.td114
-rw-r--r--lib/Target/PowerPC/PPCBranchSelector.cpp174
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.td66
-rw-r--r--lib/Target/PowerPC/PPCCodeEmitter.cpp266
-rw-r--r--lib/Target/PowerPC/PPCFrameInfo.h93
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.cpp304
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.h73
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp1170
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp4878
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h394
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td723
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td668
-rw-r--r--lib/Target/PowerPC/PPCInstrBuilder.h43
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td875
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp818
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h168
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td1475
-rw-r--r--lib/Target/PowerPC/PPCJITInfo.cpp437
-rw-r--r--lib/Target/PowerPC/PPCJITInfo.h48
-rw-r--r--lib/Target/PowerPC/PPCMachOWriterInfo.cpp151
-rw-r--r--lib/Target/PowerPC/PPCMachOWriterInfo.h55
-rw-r--r--lib/Target/PowerPC/PPCMachineFunctionInfo.h104
-rw-r--r--lib/Target/PowerPC/PPCPerfectShuffle.h6586
-rw-r--r--lib/Target/PowerPC/PPCPredicates.cpp30
-rw-r--r--lib/Target/PowerPC/PPCPredicates.h39
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp1446
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h95
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.td360
-rw-r--r--lib/Target/PowerPC/PPCRelocations.h56
-rw-r--r--lib/Target/PowerPC/PPCSchedule.td508
-rw-r--r--lib/Target/PowerPC/PPCScheduleG3.td63
-rw-r--r--lib/Target/PowerPC/PPCScheduleG4.td73
-rw-r--r--lib/Target/PowerPC/PPCScheduleG4Plus.td76
-rw-r--r--lib/Target/PowerPC/PPCScheduleG5.td83
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp152
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h160
-rw-r--r--lib/Target/PowerPC/PPCTargetAsmInfo.cpp161
-rw-r--r--lib/Target/PowerPC/PPCTargetAsmInfo.h62
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp250
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.h120
-rw-r--r--lib/Target/PowerPC/README.txt799
-rw-r--r--lib/Target/PowerPC/README_ALTIVEC.txt211
-rw-r--r--lib/Target/README.txt1679
-rw-r--r--lib/Target/Sparc/AsmPrinter/CMakeLists.txt9
-rw-r--r--lib/Target/Sparc/AsmPrinter/Makefile15
-rw-r--r--lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp355
-rw-r--r--lib/Target/Sparc/CMakeLists.txt23
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp76
-rw-r--r--lib/Target/Sparc/FPMover.cpp139
-rw-r--r--lib/Target/Sparc/Makefile22
-rw-r--r--lib/Target/Sparc/README.txt58
-rw-r--r--lib/Target/Sparc/Sparc.h119
-rw-r--r--lib/Target/Sparc/Sparc.td76
-rw-r--r--lib/Target/Sparc/SparcCallingConv.td32
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp215
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp1049
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h79
-rw-r--r--lib/Target/Sparc/SparcInstrFormats.td114
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp277
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.h114
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td769
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp196
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.h67
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td158
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp43
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h43
-rw-r--r--lib/Target/Sparc/SparcTargetAsmInfo.cpp50
-rw-r--r--lib/Target/Sparc/SparcTargetAsmInfo.h33
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp94
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.h63
-rw-r--r--lib/Target/SubtargetFeature.cpp364
-rw-r--r--lib/Target/Target.cpp94
-rw-r--r--lib/Target/TargetAsmInfo.cpp461
-rw-r--r--lib/Target/TargetData.cpp603
-rw-r--r--lib/Target/TargetFrameInfo.cpp19
-rw-r--r--lib/Target/TargetInstrInfo.cpp50
-rw-r--r--lib/Target/TargetIntrinsicInfo.cpp22
-rw-r--r--lib/Target/TargetMachOWriterInfo.cpp25
-rw-r--r--lib/Target/TargetMachine.cpp229
-rw-r--r--lib/Target/TargetMachineRegistry.cpp78
-rw-r--r--lib/Target/TargetRegisterInfo.cpp144
-rw-r--r--lib/Target/TargetSubtarget.cpp22
-rw-r--r--lib/Target/X86/AsmPrinter/CMakeLists.txt11
-rw-r--r--lib/Target/X86/AsmPrinter/Makefile15
-rw-r--r--lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.cpp1075
-rw-r--r--lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.h164
-rw-r--r--lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp50
-rw-r--r--lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.cpp609
-rw-r--r--lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.h152
-rw-r--r--lib/Target/X86/CMakeLists.txt29
-rw-r--r--lib/Target/X86/Makefile23
-rw-r--r--lib/Target/X86/README-FPStack.txt85
-rw-r--r--lib/Target/X86/README-MMX.txt71
-rw-r--r--lib/Target/X86/README-SSE.txt918
-rw-r--r--lib/Target/X86/README-UNIMPLEMENTED.txt14
-rw-r--r--lib/Target/X86/README-X86-64.txt251
-rw-r--r--lib/Target/X86/README.txt1899
-rw-r--r--lib/Target/X86/X86.h84
-rw-r--r--lib/Target/X86/X86.td184
-rw-r--r--lib/Target/X86/X86COFF.h95
-rw-r--r--lib/Target/X86/X86CallingConv.td360
-rw-r--r--lib/Target/X86/X86CodeEmitter.cpp811
-rw-r--r--lib/Target/X86/X86CompilationCallback_Win64.asm67
-rw-r--r--lib/Target/X86/X86ELFWriterInfo.cpp18
-rw-r--r--lib/Target/X86/X86ELFWriterInfo.h29
-rw-r--r--lib/Target/X86/X86FastISel.cpp1549
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp1187
-rw-r--r--lib/Target/X86/X86FloatingPointRegKill.cpp139
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp1716
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp8794
-rw-r--r--lib/Target/X86/X86ISelLowering.h705
-rw-r--r--lib/Target/X86/X86Instr64bit.td1937
-rw-r--r--lib/Target/X86/X86InstrBuilder.h168
-rw-r--r--lib/Target/X86/X86InstrFPStack.td597
-rw-r--r--lib/Target/X86/X86InstrFormats.td285
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp3227
-rw-r--r--lib/Target/X86/X86InstrInfo.h461
-rw-r--r--lib/Target/X86/X86InstrInfo.td3961
-rw-r--r--lib/Target/X86/X86InstrMMX.td694
-rw-r--r--lib/Target/X86/X86InstrSSE.td3643
-rw-r--r--lib/Target/X86/X86JITInfo.cpp560
-rw-r--r--lib/Target/X86/X86JITInfo.h84
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h112
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp1280
-rw-r--r--lib/Target/X86/X86RegisterInfo.h163
-rw-r--r--lib/Target/X86/X86RegisterInfo.td762
-rw-r--r--lib/Target/X86/X86Relocations.h42
-rw-r--r--lib/Target/X86/X86Subtarget.cpp446
-rw-r--r--lib/Target/X86/X86Subtarget.h224
-rw-r--r--lib/Target/X86/X86TargetAsmInfo.cpp461
-rw-r--r--lib/Target/X86/X86TargetAsmInfo.h75
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp317
-rw-r--r--lib/Target/X86/X86TargetMachine.h124
-rw-r--r--lib/Target/XCore/CMakeLists.txt23
-rw-r--r--lib/Target/XCore/Makefile21
-rw-r--r--lib/Target/XCore/README.txt8
-rw-r--r--lib/Target/XCore/XCore.h42
-rw-r--r--lib/Target/XCore/XCore.td62
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp472
-rw-r--r--lib/Target/XCore/XCoreCallingConv.td33
-rw-r--r--lib/Target/XCore/XCoreFrameInfo.cpp27
-rw-r--r--lib/Target/XCore/XCoreFrameInfo.h34
-rw-r--r--lib/Target/XCore/XCoreISelDAGToDAG.cpp230
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp934
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h123
-rw-r--r--lib/Target/XCore/XCoreInstrFormats.td120
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.cpp524
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.h110
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.td991
-rw-r--r--lib/Target/XCore/XCoreMachineFunctionInfo.h69
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp598
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.h94
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.td91
-rw-r--r--lib/Target/XCore/XCoreSubtarget.cpp28
-rw-r--r--lib/Target/XCore/XCoreSubtarget.h46
-rw-r--r--lib/Target/XCore/XCoreTargetAsmInfo.cpp201
-rw-r--r--lib/Target/XCore/XCoreTargetAsmInfo.h45
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp71
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.h63
-rw-r--r--lib/Transforms/Hello/CMakeLists.txt3
-rw-r--r--lib/Transforms/Hello/Hello.cpp67
-rw-r--r--lib/Transforms/Hello/Makefile16
-rw-r--r--lib/Transforms/IPO/ArgumentPromotion.cpp863
-rw-r--r--lib/Transforms/IPO/CMakeLists.txt25
-rw-r--r--lib/Transforms/IPO/ConstantMerge.cpp114
-rw-r--r--lib/Transforms/IPO/DeadArgumentElimination.cpp944
-rw-r--r--lib/Transforms/IPO/DeadTypeElimination.cpp107
-rw-r--r--lib/Transforms/IPO/ExtractGV.cpp173
-rw-r--r--lib/Transforms/IPO/FunctionAttrs.cpp347
-rw-r--r--lib/Transforms/IPO/GlobalDCE.cpp227
-rw-r--r--lib/Transforms/IPO/GlobalOpt.cpp2485
-rw-r--r--lib/Transforms/IPO/IPConstantPropagation.cpp277
-rw-r--r--lib/Transforms/IPO/IPO.cpp75
-rw-r--r--lib/Transforms/IPO/IndMemRemoval.cpp89
-rw-r--r--lib/Transforms/IPO/InlineAlways.cpp75
-rw-r--r--lib/Transforms/IPO/InlineSimple.cpp106
-rw-r--r--lib/Transforms/IPO/Inliner.cpp278
-rw-r--r--lib/Transforms/IPO/Internalize.cpp184
-rw-r--r--lib/Transforms/IPO/LoopExtractor.cpp261
-rw-r--r--lib/Transforms/IPO/LowerSetJmp.cpp536
-rw-r--r--lib/Transforms/IPO/Makefile15
-rw-r--r--lib/Transforms/IPO/MergeFunctions.cpp377
-rw-r--r--lib/Transforms/IPO/PartialSpecialization.cpp191
-rw-r--r--lib/Transforms/IPO/PruneEH.cpp255
-rw-r--r--lib/Transforms/IPO/RaiseAllocations.cpp251
-rw-r--r--lib/Transforms/IPO/StripDeadPrototypes.cpp72
-rw-r--r--lib/Transforms/IPO/StripSymbols.cpp415
-rw-r--r--lib/Transforms/IPO/StructRetPromotion.cpp351
-rw-r--r--lib/Transforms/Instrumentation/BlockProfiling.cpp126
-rw-r--r--lib/Transforms/Instrumentation/CMakeLists.txt6
-rw-r--r--lib/Transforms/Instrumentation/EdgeProfiling.cpp101
-rw-r--r--lib/Transforms/Instrumentation/Makefile15
-rw-r--r--lib/Transforms/Instrumentation/ProfilingUtils.cpp120
-rw-r--r--lib/Transforms/Instrumentation/ProfilingUtils.h31
-rw-r--r--lib/Transforms/Instrumentation/RSProfiling.cpp653
-rw-r--r--lib/Transforms/Instrumentation/RSProfiling.h31
-rw-r--r--lib/Transforms/Makefile20
-rw-r--r--lib/Transforms/Scalar/ADCE.cpp98
-rw-r--r--lib/Transforms/Scalar/BasicBlockPlacement.cpp148
-rw-r--r--lib/Transforms/Scalar/CMakeLists.txt33
-rw-r--r--lib/Transforms/Scalar/CodeGenPrepare.cpp873
-rw-r--r--lib/Transforms/Scalar/CondPropagate.cpp295
-rw-r--r--lib/Transforms/Scalar/ConstantProp.cpp90
-rw-r--r--lib/Transforms/Scalar/DCE.cpp133
-rw-r--r--lib/Transforms/Scalar/DeadStoreElimination.cpp461
-rw-r--r--lib/Transforms/Scalar/GVN.cpp1738
-rw-r--r--lib/Transforms/Scalar/GVNPRE.cpp1885
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp880
-rw-r--r--lib/Transforms/Scalar/InstructionCombining.cpp12919
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp954
-rw-r--r--lib/Transforms/Scalar/LICM.cpp885
-rw-r--r--lib/Transforms/Scalar/LoopDeletion.cpp280
-rw-r--r--lib/Transforms/Scalar/LoopIndexSplit.cpp1237
-rw-r--r--lib/Transforms/Scalar/LoopRotation.cpp572
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp2605
-rw-r--r--lib/Transforms/Scalar/LoopUnroll.cpp183
-rw-r--r--lib/Transforms/Scalar/LoopUnswitch.cpp1098
-rw-r--r--lib/Transforms/Scalar/Makefile15
-rw-r--r--lib/Transforms/Scalar/MemCpyOptimizer.cpp741
-rw-r--r--lib/Transforms/Scalar/PredicateSimplifier.cpp2725
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp896
-rw-r--r--lib/Transforms/Scalar/Reg2Mem.cpp125
-rw-r--r--lib/Transforms/Scalar/SCCP.cpp1855
-rw-r--r--lib/Transforms/Scalar/Scalar.cpp111
-rw-r--r--lib/Transforms/Scalar/ScalarReplAggregates.cpp1820
-rw-r--r--lib/Transforms/Scalar/SimplifyCFGPass.cpp232
-rw-r--r--lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp159
-rw-r--r--lib/Transforms/Scalar/SimplifyLibCalls.cpp2429
-rw-r--r--lib/Transforms/Scalar/TailDuplication.cpp365
-rw-r--r--lib/Transforms/Scalar/TailRecursionElimination.cpp479
-rw-r--r--lib/Transforms/Utils/AddrModeMatcher.cpp594
-rw-r--r--lib/Transforms/Utils/BasicBlockUtils.cpp622
-rw-r--r--lib/Transforms/Utils/BasicInliner.cpp181
-rw-r--r--lib/Transforms/Utils/BreakCriticalEdges.cpp282
-rw-r--r--lib/Transforms/Utils/CMakeLists.txt27
-rw-r--r--lib/Transforms/Utils/CloneFunction.cpp533
-rw-r--r--lib/Transforms/Utils/CloneLoop.cpp152
-rw-r--r--lib/Transforms/Utils/CloneModule.cpp126
-rw-r--r--lib/Transforms/Utils/CloneTrace.cpp119
-rw-r--r--lib/Transforms/Utils/CodeExtractor.cpp746
-rw-r--r--lib/Transforms/Utils/DemoteRegToStack.cpp144
-rw-r--r--lib/Transforms/Utils/InlineCost.cpp315
-rw-r--r--lib/Transforms/Utils/InlineFunction.cpp656
-rw-r--r--lib/Transforms/Utils/InstructionNamer.cpp63
-rw-r--r--lib/Transforms/Utils/LCSSA.cpp276
-rw-r--r--lib/Transforms/Utils/Local.cpp338
-rw-r--r--lib/Transforms/Utils/LoopSimplify.cpp600
-rw-r--r--lib/Transforms/Utils/LowerAllocations.cpp177
-rw-r--r--lib/Transforms/Utils/LowerInvoke.cpp614
-rw-r--r--lib/Transforms/Utils/LowerSwitch.cpp323
-rw-r--r--lib/Transforms/Utils/Makefile15
-rw-r--r--lib/Transforms/Utils/Mem2Reg.cpp92
-rw-r--r--lib/Transforms/Utils/PromoteMemoryToRegister.cpp1003
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp2213
-rw-r--r--lib/Transforms/Utils/UnifyFunctionExitNodes.cpp139
-rw-r--r--lib/Transforms/Utils/UnrollLoop.cpp369
-rw-r--r--lib/Transforms/Utils/ValueMapper.cpp143
-rw-r--r--lib/VMCore/AsmWriter.cpp1880
-rw-r--r--lib/VMCore/Attributes.cpp310
-rw-r--r--lib/VMCore/AutoUpgrade.cpp430
-rw-r--r--lib/VMCore/BasicBlock.cpp274
-rw-r--r--lib/VMCore/CMakeLists.txt30
-rw-r--r--lib/VMCore/ConstantFold.cpp1681
-rw-r--r--lib/VMCore/ConstantFold.h60
-rw-r--r--lib/VMCore/Constants.cpp2832
-rw-r--r--lib/VMCore/Core.cpp1450
-rw-r--r--lib/VMCore/Dominators.cpp287
-rw-r--r--lib/VMCore/Function.cpp367
-rw-r--r--lib/VMCore/Globals.cpp273
-rw-r--r--lib/VMCore/InlineAsm.cpp231
-rw-r--r--lib/VMCore/Instruction.cpp387
-rw-r--r--lib/VMCore/Instructions.cpp2963
-rw-r--r--lib/VMCore/IntrinsicInst.cpp77
-rw-r--r--lib/VMCore/LeakDetector.cpp131
-rw-r--r--lib/VMCore/Makefile33
-rw-r--r--lib/VMCore/Mangler.cpp196
-rw-r--r--lib/VMCore/Module.cpp381
-rw-r--r--lib/VMCore/ModuleProvider.cpp26
-rw-r--r--lib/VMCore/Pass.cpp323
-rw-r--r--lib/VMCore/PassManager.cpp1710
-rw-r--r--lib/VMCore/PrintModulePass.cpp99
-rw-r--r--lib/VMCore/SymbolTableListTraitsImpl.h118
-rw-r--r--lib/VMCore/Type.cpp1457
-rw-r--r--lib/VMCore/TypeSymbolTable.cpp165
-rw-r--r--lib/VMCore/Use.cpp233
-rw-r--r--lib/VMCore/Value.cpp581
-rw-r--r--lib/VMCore/ValueSymbolTable.cpp137
-rw-r--r--lib/VMCore/ValueTypes.cpp185
-rw-r--r--lib/VMCore/Verifier.cpp1770
828 files changed, 372534 insertions, 0 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
new file mode 100644
index 0000000..c5523ec
--- /dev/null
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -0,0 +1,248 @@
+//===- AliasAnalysis.cpp - Generic Alias Analysis Interface Implementation -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the generic AliasAnalysis interface, which serves as the
+// common interface between all clients and implementations of alias analysis.
+//
+// This file also implements the default version of the AliasAnalysis interface
+// that is to be used when no other implementation is specified. This does some
+// simple tests that detect obvious cases: two different global pointers cannot
+// alias, a global cannot alias a malloc, two different mallocs cannot alias,
+// etc.
+//
+// This alias analysis implementation really isn't very good for anything, but
+// it is very fast, and makes a nice clean default implementation. Because it
+// handles lots of little corner cases, other, more complex, alias analysis
+// implementations may choose to rely on this pass to resolve these simple and
+// easy cases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Pass.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/Target/TargetData.h"
+using namespace llvm;
+
+// Register the AliasAnalysis interface, providing a nice name to refer to.
+static RegisterAnalysisGroup<AliasAnalysis> Z("Alias Analysis");
+char AliasAnalysis::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// Default chaining methods
+//===----------------------------------------------------------------------===//
+
+AliasAnalysis::AliasResult
+AliasAnalysis::alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size) {
+ assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+ return AA->alias(V1, V1Size, V2, V2Size);
+}
+
+void AliasAnalysis::getMustAliases(Value *P, std::vector<Value*> &RetVals) {
+ assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+ return AA->getMustAliases(P, RetVals);
+}
+
+bool AliasAnalysis::pointsToConstantMemory(const Value *P) {
+ assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+ return AA->pointsToConstantMemory(P);
+}
+
+bool AliasAnalysis::hasNoModRefInfoForCalls() const {
+ assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+ return AA->hasNoModRefInfoForCalls();
+}
+
+void AliasAnalysis::deleteValue(Value *V) {
+ assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+ AA->deleteValue(V);
+}
+
+void AliasAnalysis::copyValue(Value *From, Value *To) {
+ assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+ AA->copyValue(From, To);
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(CallSite CS1, CallSite CS2) {
+ // FIXME: we can do better.
+ assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+ return AA->getModRefInfo(CS1, CS2);
+}
+
+
+//===----------------------------------------------------------------------===//
+// AliasAnalysis non-virtual helper method implementation
+//===----------------------------------------------------------------------===//
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(LoadInst *L, Value *P, unsigned Size) {
+ return alias(L->getOperand(0), TD->getTypeStoreSize(L->getType()),
+ P, Size) ? Ref : NoModRef;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(StoreInst *S, Value *P, unsigned Size) {
+ // If the stored address cannot alias the pointer in question, then the
+ // pointer cannot be modified by the store.
+ if (!alias(S->getOperand(1),
+ TD->getTypeStoreSize(S->getOperand(0)->getType()), P, Size))
+ return NoModRef;
+
+ // If the pointer is a pointer to constant memory, then it could not have been
+ // modified by this store.
+ return pointsToConstantMemory(P) ? NoModRef : Mod;
+}
+
+AliasAnalysis::ModRefBehavior
+AliasAnalysis::getModRefBehavior(CallSite CS,
+ std::vector<PointerAccessInfo> *Info) {
+ if (CS.doesNotAccessMemory())
+ // Can't do better than this.
+ return DoesNotAccessMemory;
+ ModRefBehavior MRB = getModRefBehavior(CS.getCalledFunction(), Info);
+ if (MRB != DoesNotAccessMemory && CS.onlyReadsMemory())
+ return OnlyReadsMemory;
+ return MRB;
+}
+
+AliasAnalysis::ModRefBehavior
+AliasAnalysis::getModRefBehavior(Function *F,
+ std::vector<PointerAccessInfo> *Info) {
+ if (F) {
+ if (F->doesNotAccessMemory())
+ // Can't do better than this.
+ return DoesNotAccessMemory;
+ if (F->onlyReadsMemory())
+ return OnlyReadsMemory;
+ if (unsigned id = F->getIntrinsicID()) {
+#define GET_INTRINSIC_MODREF_BEHAVIOR
+#include "llvm/Intrinsics.gen"
+#undef GET_INTRINSIC_MODREF_BEHAVIOR
+ }
+ }
+ return UnknownModRefBehavior;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+ ModRefResult Mask = ModRef;
+ ModRefBehavior MRB = getModRefBehavior(CS);
+ if (MRB == DoesNotAccessMemory)
+ return NoModRef;
+ else if (MRB == OnlyReadsMemory)
+ Mask = Ref;
+ else if (MRB == AliasAnalysis::AccessesArguments) {
+ bool doesAlias = false;
+ for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+ AI != AE; ++AI)
+ if (alias(*AI, ~0U, P, Size) != NoAlias) {
+ doesAlias = true;
+ break;
+ }
+
+ if (!doesAlias)
+ return NoModRef;
+ }
+
+ if (!AA) return Mask;
+
+ // If P points to a constant memory location, the call definitely could not
+ // modify the memory location.
+ if ((Mask & Mod) && AA->pointsToConstantMemory(P))
+ Mask = ModRefResult(Mask & ~Mod);
+
+ return ModRefResult(Mask & AA->getModRefInfo(CS, P, Size));
+}
+
+// AliasAnalysis destructor: DO NOT move this to the header file for
+// AliasAnalysis or else clients of the AliasAnalysis class may not depend on
+// the AliasAnalysis.o file in the current .a file, causing alias analysis
+// support to not be included in the tool correctly!
+//
+AliasAnalysis::~AliasAnalysis() {}
+
+/// InitializeAliasAnalysis - Subclasses must call this method to initialize the
+/// AliasAnalysis interface before any other methods are called.
+///
+void AliasAnalysis::InitializeAliasAnalysis(Pass *P) {
+ TD = &P->getAnalysis<TargetData>();
+ AA = &P->getAnalysis<AliasAnalysis>();
+}
+
+// getAnalysisUsage - All alias analysis implementations should invoke this
+// directly (using AliasAnalysis::getAnalysisUsage(AU)) to make sure that
+// TargetData is required by the pass.
+void AliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>(); // All AA's need TargetData.
+ AU.addRequired<AliasAnalysis>(); // All AA's chain
+}
+
+/// canBasicBlockModify - Return true if it is possible for execution of the
+/// specified basic block to modify the value pointed to by Ptr.
+///
+bool AliasAnalysis::canBasicBlockModify(const BasicBlock &BB,
+ const Value *Ptr, unsigned Size) {
+ return canInstructionRangeModify(BB.front(), BB.back(), Ptr, Size);
+}
+
+/// canInstructionRangeModify - Return true if it is possible for the execution
+/// of the specified instructions to modify the value pointed to by Ptr. The
+/// instructions to consider are all of the instructions in the range of [I1,I2]
+/// INCLUSIVE. I1 and I2 must be in the same basic block.
+///
+bool AliasAnalysis::canInstructionRangeModify(const Instruction &I1,
+ const Instruction &I2,
+ const Value *Ptr, unsigned Size) {
+ assert(I1.getParent() == I2.getParent() &&
+ "Instructions not in same basic block!");
+ BasicBlock::iterator I = const_cast<Instruction*>(&I1);
+ BasicBlock::iterator E = const_cast<Instruction*>(&I2);
+ ++E; // Convert from inclusive to exclusive range.
+
+ for (; I != E; ++I) // Check every instruction in range
+ if (getModRefInfo(I, const_cast<Value*>(Ptr), Size) & Mod)
+ return true;
+ return false;
+}
+
+/// isNoAliasCall - Return true if this pointer is returned by a noalias
+/// function.
+bool llvm::isNoAliasCall(const Value *V) {
+ if (isa<CallInst>(V) || isa<InvokeInst>(V))
+ return CallSite(const_cast<Instruction*>(cast<Instruction>(V)))
+ .paramHasAttr(0, Attribute::NoAlias);
+ return false;
+}
+
+/// isIdentifiedObject - Return true if this pointer refers to a distinct and
+/// identifiable object. This returns true for:
+/// Global Variables and Functions
+/// Allocas and Mallocs
+/// ByVal and NoAlias Arguments
+/// NoAlias returns
+///
+bool llvm::isIdentifiedObject(const Value *V) {
+ if (isa<GlobalValue>(V) || isa<AllocationInst>(V) || isNoAliasCall(V))
+ return true;
+ if (const Argument *A = dyn_cast<Argument>(V))
+ return A->hasNoAliasAttr() || A->hasByValAttr();
+ return false;
+}
+
+// Because of the way .a files work, we must force the BasicAA implementation to
+// be pulled in if the AliasAnalysis classes are pulled in. Otherwise we run
+// the risk of AliasAnalysis being used, but the default implementation not
+// being linked into the tool that uses it.
+DEFINING_FILE_FOR(AliasAnalysis)
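The chaining protocol documented in the file above amounts to three obligations for an implementation: subclass both a pass and AliasAnalysis, call InitializeAliasAnalysis() from the pass's run method, and fall back to the chained analysis for any query it cannot sharpen. The following minimal sketch illustrates that pattern. The pass itself (SameValueAA, registered as "same-value-aa") is hypothetical and not part of this import; it mirrors the ModulePass shape of AliasAnalysisCounter in the next file.

#include "llvm/Pass.h"
#include "llvm/Module.h"
#include "llvm/Analysis/AliasAnalysis.h"
using namespace llvm;

namespace {
  // Hypothetical chained analysis: it only knows that a pointer must
  // alias itself, and defers every other query down the chain.
  struct SameValueAA : public ModulePass, public AliasAnalysis {
    static char ID;
    SameValueAA() : ModulePass(&ID) {}

    bool runOnModule(Module &M) {
      InitializeAliasAnalysis(this);  // resolves TargetData and the chained AA
      return false;                   // analyses never mutate the module
    }

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AliasAnalysis::getAnalysisUsage(AU); // requires TargetData + chained AA
      AU.setPreservesAll();
    }

    AliasResult alias(const Value *V1, unsigned V1Size,
                      const Value *V2, unsigned V2Size) {
      if (V1 == V2)
        return MustAlias;             // identical values trivially must-alias
      // Everything else: defer to the next implementation in the chain.
      return AliasAnalysis::alias(V1, V1Size, V2, V2Size);
    }
  };
}

char SameValueAA::ID = 0;
static RegisterPass<SameValueAA> P("same-value-aa", "Same-value alias analysis");
static RegisterAnalysisGroup<AliasAnalysis> G(P);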
diff --git a/lib/Analysis/AliasAnalysisCounter.cpp b/lib/Analysis/AliasAnalysisCounter.cpp
new file mode 100644
index 0000000..4362d7d
--- /dev/null
+++ b/lib/Analysis/AliasAnalysisCounter.cpp
@@ -0,0 +1,173 @@
+//===- AliasAnalysisCounter.cpp - Alias Analysis Query Counter ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass which can be used to count how many alias queries
+// are being made and how the alias analysis implementation being used responds.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+
+static cl::opt<bool>
+PrintAll("count-aa-print-all-queries", cl::ReallyHidden);
+static cl::opt<bool>
+PrintAllFailures("count-aa-print-all-failed-queries", cl::ReallyHidden);
+
+namespace {
+ class VISIBILITY_HIDDEN AliasAnalysisCounter
+ : public ModulePass, public AliasAnalysis {
+ unsigned No, May, Must;
+ unsigned NoMR, JustRef, JustMod, MR;
+ const char *Name;
+ Module *M;
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ AliasAnalysisCounter() : ModulePass(&ID) {
+ No = May = Must = 0;
+ NoMR = JustRef = JustMod = MR = 0;
+ }
+
+ void printLine(const char *Desc, unsigned Val, unsigned Sum) {
+ cerr << " " << Val << " " << Desc << " responses ("
+ << Val*100/Sum << "%)\n";
+ }
+ ~AliasAnalysisCounter() {
+ unsigned AASum = No+May+Must;
+ unsigned MRSum = NoMR+JustRef+JustMod+MR;
+ if (AASum + MRSum) { // Print a report if any counted queries occurred...
+ cerr << "\n===== Alias Analysis Counter Report =====\n"
+ << " Analysis counted: " << Name << "\n"
+ << " " << AASum << " Total Alias Queries Performed\n";
+ if (AASum) {
+ printLine("no alias", No, AASum);
+ printLine("may alias", May, AASum);
+ printLine("must alias", Must, AASum);
+ cerr << " Alias Analysis Counter Summary: " << No*100/AASum << "%/"
+ << May*100/AASum << "%/" << Must*100/AASum<<"%\n\n";
+ }
+
+ cerr << " " << MRSum << " Total Mod/Ref Queries Performed\n";
+ if (MRSum) {
+ printLine("no mod/ref", NoMR, MRSum);
+ printLine("ref", JustRef, MRSum);
+ printLine("mod", JustMod, MRSum);
+ printLine("mod/ref", MR, MRSum);
+ cerr << " Mod/Ref Analysis Counter Summary: " <<NoMR*100/MRSum<< "%/"
+ << JustRef*100/MRSum << "%/" << JustMod*100/MRSum << "%/"
+ << MR*100/MRSum <<"%\n\n";
+ }
+ }
+ }
+
+ bool runOnModule(Module &M) {
+ this->M = &M;
+ InitializeAliasAnalysis(this);
+ Name = dynamic_cast<Pass*>(&getAnalysis<AliasAnalysis>())->getPassName();
+ return false;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AliasAnalysis::getAnalysisUsage(AU);
+ AU.addRequired<AliasAnalysis>();
+ AU.setPreservesAll();
+ }
+
+ // FIXME: We could count these too...
+ bool pointsToConstantMemory(const Value *P) {
+ return getAnalysis<AliasAnalysis>().pointsToConstantMemory(P);
+ }
+ bool doesNotAccessMemory(CallSite CS) {
+ return getAnalysis<AliasAnalysis>().doesNotAccessMemory(CS);
+ }
+ bool doesNotAccessMemory(Function *F) {
+ return getAnalysis<AliasAnalysis>().doesNotAccessMemory(F);
+ }
+ bool onlyReadsMemory(CallSite CS) {
+ return getAnalysis<AliasAnalysis>().onlyReadsMemory(CS);
+ }
+ bool onlyReadsMemory(Function *F) {
+ return getAnalysis<AliasAnalysis>().onlyReadsMemory(F);
+ }
+
+
+ // Forwarding functions: just delegate to a real AA implementation, counting
+ // the number of responses...
+ AliasResult alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size);
+
+ ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size);
+ ModRefResult getModRefInfo(CallSite CS1, CallSite CS2) {
+ return AliasAnalysis::getModRefInfo(CS1,CS2);
+ }
+ };
+}
+
+char AliasAnalysisCounter::ID = 0;
+static RegisterPass<AliasAnalysisCounter>
+X("count-aa", "Count Alias Analysis Query Responses", false, true);
+static RegisterAnalysisGroup<AliasAnalysis> Y(X);
+
+ModulePass *llvm::createAliasAnalysisCounterPass() {
+ return new AliasAnalysisCounter();
+}
+
+AliasAnalysis::AliasResult
+AliasAnalysisCounter::alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size) {
+ AliasResult R = getAnalysis<AliasAnalysis>().alias(V1, V1Size, V2, V2Size);
+
+ const char *AliasString;
+ switch (R) {
+ default: assert(0 && "Unknown alias type!");
+ case NoAlias: No++; AliasString = "No alias"; break;
+ case MayAlias: May++; AliasString = "May alias"; break;
+ case MustAlias: Must++; AliasString = "Must alias"; break;
+ }
+
+ if (PrintAll || (PrintAllFailures && R == MayAlias)) {
+ cerr << AliasString << ":\t";
+ cerr << "[" << V1Size << "B] ";
+ WriteAsOperand(*cerr.stream(), V1, true, M);
+ cerr << ", ";
+ cerr << "[" << V2Size << "B] ";
+ WriteAsOperand(*cerr.stream(), V2, true, M);
+ cerr << "\n";
+ }
+
+ return R;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysisCounter::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+ ModRefResult R = getAnalysis<AliasAnalysis>().getModRefInfo(CS, P, Size);
+
+ const char *MRString;
+ switch (R) {
+ default: assert(0 && "Unknown mod/ref type!");
+ case NoModRef: NoMR++; MRString = "NoModRef"; break;
+ case Ref: JustRef++; MRString = "JustRef"; break;
+ case Mod: JustMod++; MRString = "JustMod"; break;
+ case ModRef: MR++; MRString = "ModRef"; break;
+ }
+
+ if (PrintAll || (PrintAllFailures && R == ModRef)) {
+ cerr << MRString << ": Ptr: ";
+ cerr << "[" << Size << "B] ";
+ WriteAsOperand(*cerr.stream(), P, true, M);
+ cerr << "\t<->" << *CS.getInstruction();
+ }
+ return R;
+}
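As a usage note, the counter above interposes on the AliasAnalysis group purely by pass ordering: schedule it after the analysis you want to measure and before the client. A hypothetical driver sketch follows; countQueries is an illustrative name, createBasicAliasAnalysisPass is assumed to be the factory declared in llvm/Analysis/Passes.h alongside the two factories defined in this commit, and the report is printed from the counter's destructor when the PassManager is torn down. From the command line, the equivalent stacking is roughly "opt -count-aa -aa-eval -disable-output input.bc", using the pass names registered above and in the next file.

#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Target/TargetData.h"
using namespace llvm;

// Hypothetical driver: stack the counter between a real analysis and a client.
void countQueries(Module *M) {
  PassManager PM;
  PM.add(new TargetData(M));                 // every AA requires TargetData
  PM.add(createBasicAliasAnalysisPass());    // the analysis being measured
  PM.add(createAliasAnalysisCounterPass());  // interposes and counts queries
  PM.add(createAAEvalPass());                // client issuing the n^2 queries
  PM.run(*M);
}  // destroying PM deletes the counter, whose destructor prints the report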
diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp
new file mode 100644
index 0000000..07820e3
--- /dev/null
+++ b/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -0,0 +1,246 @@
+//===- AliasAnalysisEvaluator.cpp - Alias Analysis Accuracy Evaluator -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple N^2 alias analysis accuracy evaluator.
+// Basically, for each function in the program, it simply queries to see how the
+// alias analysis implementation answers alias queries between each pair of
+// pointers in the function.
+//
+// This is inspired and adapted from code by: Naveen Neelakantam, Francesco
+// Spadini, and Wojciech Stryjewski.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+#include <set>
+#include <sstream>
+using namespace llvm;
+
+static cl::opt<bool> PrintAll("print-all-alias-modref-info", cl::ReallyHidden);
+
+static cl::opt<bool> PrintNoAlias("print-no-aliases", cl::ReallyHidden);
+static cl::opt<bool> PrintMayAlias("print-may-aliases", cl::ReallyHidden);
+static cl::opt<bool> PrintMustAlias("print-must-aliases", cl::ReallyHidden);
+
+static cl::opt<bool> PrintNoModRef("print-no-modref", cl::ReallyHidden);
+static cl::opt<bool> PrintMod("print-mod", cl::ReallyHidden);
+static cl::opt<bool> PrintRef("print-ref", cl::ReallyHidden);
+static cl::opt<bool> PrintModRef("print-modref", cl::ReallyHidden);
+
+namespace {
+ class VISIBILITY_HIDDEN AAEval : public FunctionPass {
+ unsigned NoAlias, MayAlias, MustAlias;
+ unsigned NoModRef, Mod, Ref, ModRef;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ AAEval() : FunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AliasAnalysis>();
+ AU.setPreservesAll();
+ }
+
+ bool doInitialization(Module &M) {
+ NoAlias = MayAlias = MustAlias = 0;
+ NoModRef = Mod = Ref = ModRef = 0;
+
+ if (PrintAll) {
+ PrintNoAlias = PrintMayAlias = PrintMustAlias = true;
+ PrintNoModRef = PrintMod = PrintRef = PrintModRef = true;
+ }
+ return false;
+ }
+
+ bool runOnFunction(Function &F);
+ bool doFinalization(Module &M);
+ };
+}
+
+char AAEval::ID = 0;
+static RegisterPass<AAEval>
+X("aa-eval", "Exhaustive Alias Analysis Precision Evaluator", false, true);
+
+FunctionPass *llvm::createAAEvalPass() { return new AAEval(); }
+
+static void PrintResults(const char *Msg, bool P, const Value *V1,
+                         const Value *V2, const Module *M) {
+ if (P) {
+ std::stringstream s1, s2;
+ WriteAsOperand(s1, V1, true, M);
+ WriteAsOperand(s2, V2, true, M);
+ std::string o1(s1.str()), o2(s2.str());
+ if (o2 < o1)
+ std::swap(o1, o2);
+ cerr << " " << Msg << ":\t"
+ << o1 << ", "
+ << o2 << "\n";
+ }
+}
+
+static inline void
+PrintModRefResults(const char *Msg, bool P, Instruction *I, Value *Ptr,
+ Module *M) {
+ if (P) {
+ cerr << " " << Msg << ": Ptr: ";
+ WriteAsOperand(*cerr.stream(), Ptr, true, M);
+ cerr << "\t<->" << *I;
+ }
+}
+
+bool AAEval::runOnFunction(Function &F) {
+ AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+ const TargetData &TD = AA.getTargetData();
+
+ std::set<Value *> Pointers;
+ std::set<CallSite> CallSites;
+
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+ if (isa<PointerType>(I->getType())) // Add all pointer arguments
+ Pointers.insert(I);
+
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+ if (isa<PointerType>(I->getType())) // Add all pointer instructions
+ Pointers.insert(&*I);
+ Instruction &Inst = *I;
+ User::op_iterator OI = Inst.op_begin();
+ CallSite CS = CallSite::get(&Inst);
+ if (CS.getInstruction() &&
+ isa<Function>(CS.getCalledValue()))
+ ++OI; // Skip actual functions for direct function calls.
+ for (; OI != Inst.op_end(); ++OI)
+ if (isa<PointerType>((*OI)->getType()) && !isa<ConstantPointerNull>(*OI))
+ Pointers.insert(*OI);
+
+ if (CS.getInstruction()) CallSites.insert(CS);
+ }
+
+ if (PrintNoAlias || PrintMayAlias || PrintMustAlias ||
+ PrintNoModRef || PrintMod || PrintRef || PrintModRef)
+ cerr << "Function: " << F.getName() << ": " << Pointers.size()
+ << " pointers, " << CallSites.size() << " call sites\n";
+
+  // Iterate over the collected pointers and run the full (n^2)/2
+  // disambiguations.
+ for (std::set<Value *>::iterator I1 = Pointers.begin(), E = Pointers.end();
+ I1 != E; ++I1) {
+ unsigned I1Size = 0;
+ const Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType();
+ if (I1ElTy->isSized()) I1Size = TD.getTypeStoreSize(I1ElTy);
+
+ for (std::set<Value *>::iterator I2 = Pointers.begin(); I2 != I1; ++I2) {
+ unsigned I2Size = 0;
+      const Type *I2ElTy =
+        cast<PointerType>((*I2)->getType())->getElementType();
+ if (I2ElTy->isSized()) I2Size = TD.getTypeStoreSize(I2ElTy);
+
+ switch (AA.alias(*I1, I1Size, *I2, I2Size)) {
+ case AliasAnalysis::NoAlias:
+ PrintResults("NoAlias", PrintNoAlias, *I1, *I2, F.getParent());
+ ++NoAlias; break;
+ case AliasAnalysis::MayAlias:
+ PrintResults("MayAlias", PrintMayAlias, *I1, *I2, F.getParent());
+ ++MayAlias; break;
+ case AliasAnalysis::MustAlias:
+ PrintResults("MustAlias", PrintMustAlias, *I1, *I2, F.getParent());
+ ++MustAlias; break;
+ default:
+ cerr << "Unknown alias query result!\n";
+ }
+ }
+ }
+
+ // Mod/ref alias analysis: compare all pairs of calls and values
+ for (std::set<CallSite>::iterator C = CallSites.begin(),
+ Ce = CallSites.end(); C != Ce; ++C) {
+ Instruction *I = C->getInstruction();
+
+ for (std::set<Value *>::iterator V = Pointers.begin(), Ve = Pointers.end();
+ V != Ve; ++V) {
+ unsigned Size = 0;
+ const Type *ElTy = cast<PointerType>((*V)->getType())->getElementType();
+ if (ElTy->isSized()) Size = TD.getTypeStoreSize(ElTy);
+
+ switch (AA.getModRefInfo(*C, *V, Size)) {
+ case AliasAnalysis::NoModRef:
+ PrintModRefResults("NoModRef", PrintNoModRef, I, *V, F.getParent());
+ ++NoModRef; break;
+ case AliasAnalysis::Mod:
+ PrintModRefResults(" Mod", PrintMod, I, *V, F.getParent());
+ ++Mod; break;
+ case AliasAnalysis::Ref:
+ PrintModRefResults(" Ref", PrintRef, I, *V, F.getParent());
+ ++Ref; break;
+ case AliasAnalysis::ModRef:
+ PrintModRefResults(" ModRef", PrintModRef, I, *V, F.getParent());
+ ++ModRef; break;
+ default:
+ cerr << "Unknown alias query result!\n";
+ }
+ }
+ }
+
+ return false;
+}
+
+static void PrintPercent(unsigned Num, unsigned Sum) {
+ cerr << "(" << Num*100ULL/Sum << "."
+ << ((Num*1000ULL/Sum) % 10) << "%)\n";
+}
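+
+// For example, PrintPercent(1, 3) emits "(33.3%)": 1*100/3 truncates to 33,
+// and (1*1000/3) % 10 == 3 supplies one decimal digit without floating
+// point. Callers guard against Sum being zero.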
+
+bool AAEval::doFinalization(Module &M) {
+ unsigned AliasSum = NoAlias + MayAlias + MustAlias;
+ cerr << "===== Alias Analysis Evaluator Report =====\n";
+ if (AliasSum == 0) {
+ cerr << " Alias Analysis Evaluator Summary: No pointers!\n";
+ } else {
+ cerr << " " << AliasSum << " Total Alias Queries Performed\n";
+ cerr << " " << NoAlias << " no alias responses ";
+ PrintPercent(NoAlias, AliasSum);
+ cerr << " " << MayAlias << " may alias responses ";
+ PrintPercent(MayAlias, AliasSum);
+ cerr << " " << MustAlias << " must alias responses ";
+ PrintPercent(MustAlias, AliasSum);
+ cerr << " Alias Analysis Evaluator Pointer Alias Summary: "
+ << NoAlias*100/AliasSum << "%/" << MayAlias*100/AliasSum << "%/"
+ << MustAlias*100/AliasSum << "%\n";
+ }
+
+ // Display the summary for mod/ref analysis
+ unsigned ModRefSum = NoModRef + Mod + Ref + ModRef;
+ if (ModRefSum == 0) {
+ cerr << " Alias Analysis Mod/Ref Evaluator Summary: no mod/ref!\n";
+ } else {
+ cerr << " " << ModRefSum << " Total ModRef Queries Performed\n";
+ cerr << " " << NoModRef << " no mod/ref responses ";
+ PrintPercent(NoModRef, ModRefSum);
+ cerr << " " << Mod << " mod responses ";
+ PrintPercent(Mod, ModRefSum);
+ cerr << " " << Ref << " ref responses ";
+ PrintPercent(Ref, ModRefSum);
+ cerr << " " << ModRef << " mod & ref responses ";
+ PrintPercent(ModRef, ModRefSum);
+ cerr << " Alias Analysis Evaluator Mod/Ref Summary: "
+ << NoModRef*100/ModRefSum << "%/" << Mod*100/ModRefSum << "%/"
+ << Ref*100/ModRefSum << "%/" << ModRef*100/ModRefSum << "%\n";
+ }
+
+ return false;
+}
diff --git a/lib/Analysis/AliasDebugger.cpp b/lib/Analysis/AliasDebugger.cpp
new file mode 100644
index 0000000..1e82621
--- /dev/null
+++ b/lib/Analysis/AliasDebugger.cpp
@@ -0,0 +1,123 @@
+//===- AliasDebugger.cpp - Simple Alias Analysis Use Checker --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This simple pass checks alias analysis users to ensure that if they
+// create a new value, they do not query AA without informing it of the value.
+// It acts as a shim over any other AA pass you want.
+//
+// Yes, keeping track of every value in the program is expensive, but this is
+// a debugging pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+using namespace llvm;
+
+namespace {
+
+ class VISIBILITY_HIDDEN AliasDebugger
+ : public ModulePass, public AliasAnalysis {
+
+    // What we do is simple: keep track of every value the AA could
+    // know about, and verify that queries are made against one of those
+    // values. A query on a value that didn't exist when the AA was created
+    // means someone forgot to update the AA when creating new values.
+
+ std::set<const Value*> Vals;
+
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ AliasDebugger() : ModulePass(&ID) {}
+
+ bool runOnModule(Module &M) {
+ InitializeAliasAnalysis(this); // set up super class
+
+    for (Module::global_iterator I = M.global_begin(),
+ E = M.global_end(); I != E; ++I)
+ Vals.insert(&*I);
+
+    for (Module::iterator I = M.begin(),
+          E = M.end(); I != E; ++I) {
+      Vals.insert(&*I);
+      if (!I->isDeclaration()) {
+ for (Function::arg_iterator AI = I->arg_begin(), AE = I->arg_end();
+ AI != AE; ++AI)
+ Vals.insert(&*AI);
+ for (Function::const_iterator FI = I->begin(), FE = I->end();
+ FI != FE; ++FI)
+ for (BasicBlock::const_iterator BI = FI->begin(), BE = FI->end();
+ BI != BE; ++BI)
+ Vals.insert(&*BI);
+ }
+
+ }
+ return false;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AliasAnalysis::getAnalysisUsage(AU);
+ AU.setPreservesAll(); // Does not transform code
+ }
+
+ //------------------------------------------------
+ // Implement the AliasAnalysis API
+ //
+ AliasResult alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size) {
+ assert(Vals.find(V1) != Vals.end() && "Never seen value in AA before");
+ assert(Vals.find(V2) != Vals.end() && "Never seen value in AA before");
+ return AliasAnalysis::alias(V1, V1Size, V2, V2Size);
+ }
+
+ ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+ assert(Vals.find(P) != Vals.end() && "Never seen value in AA before");
+ return AliasAnalysis::getModRefInfo(CS, P, Size);
+ }
+
+ ModRefResult getModRefInfo(CallSite CS1, CallSite CS2) {
+ return AliasAnalysis::getModRefInfo(CS1,CS2);
+ }
+
+ void getMustAliases(Value *P, std::vector<Value*> &RetVals) {
+ assert(Vals.find(P) != Vals.end() && "Never seen value in AA before");
+ return AliasAnalysis::getMustAliases(P, RetVals);
+ }
+
+ bool pointsToConstantMemory(const Value *P) {
+ assert(Vals.find(P) != Vals.end() && "Never seen value in AA before");
+ return AliasAnalysis::pointsToConstantMemory(P);
+ }
+
+ virtual void deleteValue(Value *V) {
+ assert(Vals.find(V) != Vals.end() && "Never seen value in AA before");
+ AliasAnalysis::deleteValue(V);
+ }
+ virtual void copyValue(Value *From, Value *To) {
+ Vals.insert(To);
+ AliasAnalysis::copyValue(From, To);
+ }
+
+ };
+}
+
+char AliasDebugger::ID = 0;
+static RegisterPass<AliasDebugger>
+X("debug-aa", "AA use debugger", false, true);
+static RegisterAnalysisGroup<AliasAnalysis> Y(X);
+
+Pass *llvm::createAliasDebugger() { return new AliasDebugger(); }
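+
+// Example of the kind of bug this pass catches (a sketch): a transform
+// clones an instruction but never calls copyValue(Old, New) on the AA; the
+// next alias() query involving the clone then fires the "Never seen value
+// in AA before" assertion above instead of silently returning stale results.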
+
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
new file mode 100644
index 0000000..18c2b665
--- /dev/null
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -0,0 +1,608 @@
+//===- AliasSetTracker.cpp - Alias Set Tracker implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AliasSetTracker and AliasSet classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+
+/// mergeSetIn - Merge the specified alias set into this alias set.
+///
+void AliasSet::mergeSetIn(AliasSet &AS, AliasSetTracker &AST) {
+ assert(!AS.Forward && "Alias set is already forwarding!");
+ assert(!Forward && "This set is a forwarding set!!");
+
+ // Update the alias and access types of this set...
+ AccessTy |= AS.AccessTy;
+ AliasTy |= AS.AliasTy;
+
+ if (AliasTy == MustAlias) {
+ // Check that these two merged sets really are must aliases. Since both
+ // used to be must-alias sets, we can just check any pointer from each set
+ // for aliasing.
+ AliasAnalysis &AA = AST.getAliasAnalysis();
+ PointerRec *L = getSomePointer();
+ PointerRec *R = AS.getSomePointer();
+
+ // If the pointers are not a must-alias pair, this set becomes a may alias.
+ if (AA.alias(L->getValue(), L->getSize(), R->getValue(), R->getSize())
+ != AliasAnalysis::MustAlias)
+ AliasTy = MayAlias;
+ }
+
+ if (CallSites.empty()) { // Merge call sites...
+ if (!AS.CallSites.empty())
+ std::swap(CallSites, AS.CallSites);
+ } else if (!AS.CallSites.empty()) {
+ CallSites.insert(CallSites.end(), AS.CallSites.begin(), AS.CallSites.end());
+ AS.CallSites.clear();
+ }
+
+ AS.Forward = this; // Forward across AS now...
+ addRef(); // AS is now pointing to us...
+
+ // Merge the list of constituent pointers...
+ if (AS.PtrList) {
+ *PtrListEnd = AS.PtrList;
+ AS.PtrList->setPrevInList(PtrListEnd);
+ PtrListEnd = AS.PtrListEnd;
+
+ AS.PtrList = 0;
+ AS.PtrListEnd = &AS.PtrList;
+ assert(*AS.PtrListEnd == 0 && "End of list is not null?");
+ }
+}
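+
+// Illustrative note: after mergeSetIn(AS, AST) returns, AS survives only as
+// a forwarding stub -- AS.Forward points at this set, getForwardedTarget()
+// chases that link, and the stub is reclaimed once its reference count
+// drops to zero (see removeAliasSet below).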
+
+void AliasSetTracker::removeAliasSet(AliasSet *AS) {
+ if (AliasSet *Fwd = AS->Forward) {
+ Fwd->dropRef(*this);
+ AS->Forward = 0;
+ }
+ AliasSets.erase(AS);
+}
+
+void AliasSet::removeFromTracker(AliasSetTracker &AST) {
+ assert(RefCount == 0 && "Cannot remove non-dead alias set from tracker!");
+ AST.removeAliasSet(this);
+}
+
+void AliasSet::addPointer(AliasSetTracker &AST, PointerRec &Entry,
+ unsigned Size, bool KnownMustAlias) {
+ assert(!Entry.hasAliasSet() && "Entry already in set!");
+
+ // Check to see if we have to downgrade to _may_ alias.
+ if (isMustAlias() && !KnownMustAlias)
+ if (PointerRec *P = getSomePointer()) {
+ AliasAnalysis &AA = AST.getAliasAnalysis();
+ AliasAnalysis::AliasResult Result =
+ AA.alias(P->getValue(), P->getSize(), Entry.getValue(), Size);
+ if (Result == AliasAnalysis::MayAlias)
+ AliasTy = MayAlias;
+ else // First entry of must alias must have maximum size!
+ P->updateSize(Size);
+ assert(Result != AliasAnalysis::NoAlias && "Cannot be part of must set!");
+ }
+
+ Entry.setAliasSet(this);
+ Entry.updateSize(Size);
+
+ // Add it to the end of the list...
+ assert(*PtrListEnd == 0 && "End of list is not null?");
+ *PtrListEnd = &Entry;
+ PtrListEnd = Entry.setPrevInList(PtrListEnd);
+ assert(*PtrListEnd == 0 && "End of list is not null?");
+ addRef(); // Entry points to alias set...
+}
+
+void AliasSet::addCallSite(CallSite CS, AliasAnalysis &AA) {
+ CallSites.push_back(CS);
+
+ AliasAnalysis::ModRefBehavior Behavior = AA.getModRefBehavior(CS);
+ if (Behavior == AliasAnalysis::DoesNotAccessMemory)
+ return;
+ else if (Behavior == AliasAnalysis::OnlyReadsMemory) {
+ AliasTy = MayAlias;
+ AccessTy |= Refs;
+ return;
+ }
+
+  // FIXME: This should use mod/ref information to be less conservative here.
+ AliasTy = MayAlias;
+ AccessTy = ModRef;
+}
+
+/// aliasesPointer - Return true if the specified pointer "may" (or must)
+/// alias one of the members in the set.
+///
+bool AliasSet::aliasesPointer(const Value *Ptr, unsigned Size,
+ AliasAnalysis &AA) const {
+ if (AliasTy == MustAlias) {
+ assert(CallSites.empty() && "Illegal must alias set!");
+
+ // If this is a set of MustAliases, only check to see if the pointer aliases
+ // SOME value in the set...
+ PointerRec *SomePtr = getSomePointer();
+ assert(SomePtr && "Empty must-alias set??");
+    return AA.alias(SomePtr->getValue(), SomePtr->getSize(), Ptr, Size)
+           != AliasAnalysis::NoAlias;
+ }
+
+ // If this is a may-alias set, we have to check all of the pointers in the set
+ // to be sure it doesn't alias the set...
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ if (AA.alias(Ptr, Size, I.getPointer(), I.getSize()))
+ return true;
+
+ // Check the call sites list and invoke list...
+ if (!CallSites.empty()) {
+ if (AA.hasNoModRefInfoForCalls())
+ return true;
+
+ for (unsigned i = 0, e = CallSites.size(); i != e; ++i)
+ if (AA.getModRefInfo(CallSites[i], const_cast<Value*>(Ptr), Size)
+ != AliasAnalysis::NoModRef)
+ return true;
+ }
+
+ return false;
+}
+
+bool AliasSet::aliasesCallSite(CallSite CS, AliasAnalysis &AA) const {
+ if (AA.doesNotAccessMemory(CS))
+ return false;
+
+ if (AA.hasNoModRefInfoForCalls())
+ return true;
+
+ for (unsigned i = 0, e = CallSites.size(); i != e; ++i)
+ if (AA.getModRefInfo(CallSites[i], CS) != AliasAnalysis::NoModRef ||
+ AA.getModRefInfo(CS, CallSites[i]) != AliasAnalysis::NoModRef)
+ return true;
+
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ if (AA.getModRefInfo(CS, I.getPointer(), I.getSize()) !=
+ AliasAnalysis::NoModRef)
+ return true;
+
+ return false;
+}
+
+void AliasSetTracker::clear() {
+ // Delete all the PointerRec entries.
+ for (DenseMap<Value*, AliasSet::PointerRec*>::iterator I = PointerMap.begin(),
+ E = PointerMap.end(); I != E; ++I)
+ I->second->eraseFromList();
+
+ PointerMap.clear();
+
+ // The alias sets should all be clear now.
+ AliasSets.clear();
+}
+
+
+/// findAliasSetForPointer - Given a pointer, find the one alias set to put the
+/// instruction referring to the pointer into. If there are multiple alias sets
+/// that may alias the pointer, merge them together and return the unified set.
+///
+AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr,
+ unsigned Size) {
+ AliasSet *FoundSet = 0;
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ if (!I->Forward && I->aliasesPointer(Ptr, Size, AA)) {
+ if (FoundSet == 0) { // If this is the first alias set ptr can go into.
+ FoundSet = I; // Remember it.
+ } else { // Otherwise, we must merge the sets.
+ FoundSet->mergeSetIn(*I, *this); // Merge in contents.
+ }
+ }
+
+ return FoundSet;
+}
+
+/// containsPointer - Return true if the specified location is represented by
+/// this alias set, false otherwise. This does not modify the AST object or
+/// alias sets.
+bool AliasSetTracker::containsPointer(Value *Ptr, unsigned Size) const {
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ if (!I->Forward && I->aliasesPointer(Ptr, Size, AA))
+ return true;
+ return false;
+}
+
+AliasSet *AliasSetTracker::findAliasSetForCallSite(CallSite CS) {
+ AliasSet *FoundSet = 0;
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ if (!I->Forward && I->aliasesCallSite(CS, AA)) {
+ if (FoundSet == 0) { // If this is the first alias set ptr can go into.
+ FoundSet = I; // Remember it.
+      } else {                  // Otherwise, we must merge the sets.
+ FoundSet->mergeSetIn(*I, *this); // Merge in contents.
+ }
+ }
+
+ return FoundSet;
+}
+
+/// getAliasSetForPointer - Return the alias set that the specified pointer
+/// lives in.
+AliasSet &AliasSetTracker::getAliasSetForPointer(Value *Pointer, unsigned Size,
+ bool *New) {
+ AliasSet::PointerRec &Entry = getEntryFor(Pointer);
+
+ // Check to see if the pointer is already known...
+ if (Entry.hasAliasSet()) {
+ Entry.updateSize(Size);
+ // Return the set!
+ return *Entry.getAliasSet(*this)->getForwardedTarget(*this);
+ } else if (AliasSet *AS = findAliasSetForPointer(Pointer, Size)) {
+ // Add it to the alias set it aliases...
+ AS->addPointer(*this, Entry, Size);
+ return *AS;
+ } else {
+ if (New) *New = true;
+ // Otherwise create a new alias set to hold the loaded pointer...
+ AliasSets.push_back(new AliasSet());
+ AliasSets.back().addPointer(*this, Entry, Size);
+ return AliasSets.back();
+ }
+}
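+
+// Typical client usage (a sketch; assumes an AliasAnalysis &AA is already
+// available, e.g. from getAnalysis<AliasAnalysis>()):
+//   AliasSetTracker AST(AA);
+//   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+//     AST.add(I);                        // or simply AST.add(*BB);
+//   bool New = false;
+//   AliasSet &AS = AST.getAliasSetForPointer(Ptr, Size, &New);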
+
+bool AliasSetTracker::add(Value *Ptr, unsigned Size) {
+ bool NewPtr;
+ addPointer(Ptr, Size, AliasSet::NoModRef, NewPtr);
+ return NewPtr;
+}
+
+
+bool AliasSetTracker::add(LoadInst *LI) {
+ bool NewPtr;
+ AliasSet &AS = addPointer(LI->getOperand(0),
+ AA.getTargetData().getTypeStoreSize(LI->getType()),
+ AliasSet::Refs, NewPtr);
+ if (LI->isVolatile()) AS.setVolatile();
+ return NewPtr;
+}
+
+bool AliasSetTracker::add(StoreInst *SI) {
+ bool NewPtr;
+ Value *Val = SI->getOperand(0);
+ AliasSet &AS = addPointer(SI->getOperand(1),
+ AA.getTargetData().getTypeStoreSize(Val->getType()),
+ AliasSet::Mods, NewPtr);
+ if (SI->isVolatile()) AS.setVolatile();
+ return NewPtr;
+}
+
+bool AliasSetTracker::add(FreeInst *FI) {
+ bool NewPtr;
+ addPointer(FI->getOperand(0), ~0, AliasSet::Mods, NewPtr);
+ return NewPtr;
+}
+
+bool AliasSetTracker::add(VAArgInst *VAAI) {
+ bool NewPtr;
+ addPointer(VAAI->getOperand(0), ~0, AliasSet::ModRef, NewPtr);
+ return NewPtr;
+}
+
+
+bool AliasSetTracker::add(CallSite CS) {
+ if (isa<DbgInfoIntrinsic>(CS.getInstruction()))
+ return true; // Ignore DbgInfo Intrinsics.
+ if (AA.doesNotAccessMemory(CS))
+ return true; // doesn't alias anything
+
+ AliasSet *AS = findAliasSetForCallSite(CS);
+ if (!AS) {
+ AliasSets.push_back(new AliasSet());
+ AS = &AliasSets.back();
+ AS->addCallSite(CS, AA);
+ return true;
+ } else {
+ AS->addCallSite(CS, AA);
+ return false;
+ }
+}
+
+bool AliasSetTracker::add(Instruction *I) {
+ // Dispatch to one of the other add methods...
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return add(LI);
+ else if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return add(SI);
+ else if (CallInst *CI = dyn_cast<CallInst>(I))
+ return add(CI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+ return add(II);
+ else if (FreeInst *FI = dyn_cast<FreeInst>(I))
+ return add(FI);
+ else if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
+ return add(VAAI);
+ return true;
+}
+
+void AliasSetTracker::add(BasicBlock &BB) {
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
+ add(I);
+}
+
+void AliasSetTracker::add(const AliasSetTracker &AST) {
+ assert(&AA == &AST.AA &&
+ "Merging AliasSetTracker objects with different Alias Analyses!");
+
+ // Loop over all of the alias sets in AST, adding the pointers contained
+ // therein into the current alias sets. This can cause alias sets to be
+ // merged together in the current AST.
+ for (const_iterator I = AST.begin(), E = AST.end(); I != E; ++I)
+ if (!I->Forward) { // Ignore forwarding alias sets
+ AliasSet &AS = const_cast<AliasSet&>(*I);
+
+ // If there are any call sites in the alias set, add them to this AST.
+ for (unsigned i = 0, e = AS.CallSites.size(); i != e; ++i)
+ add(AS.CallSites[i]);
+
+      // Loop over all of the pointers in this alias set.
+      bool NewPtr;
+      for (AliasSet::iterator ASI = AS.begin(), ASE = AS.end();
+           ASI != ASE; ++ASI) {
+        AliasSet &NewAS = addPointer(ASI.getPointer(), ASI.getSize(),
+                                     (AliasSet::AccessType)AS.AccessTy, NewPtr);
+        if (AS.isVolatile()) NewAS.setVolatile();
+      }
+ }
+}
+
+/// remove - Remove the specified (potentially non-empty) alias set from the
+/// tracker.
+void AliasSetTracker::remove(AliasSet &AS) {
+ // Drop all call sites.
+ AS.CallSites.clear();
+
+ // Clear the alias set.
+ unsigned NumRefs = 0;
+ while (!AS.empty()) {
+ AliasSet::PointerRec *P = AS.PtrList;
+
+ Value *ValToRemove = P->getValue();
+
+ // Unlink and delete entry from the list of values.
+ P->eraseFromList();
+
+ // Remember how many references need to be dropped.
+ ++NumRefs;
+
+ // Finally, remove the entry.
+ PointerMap.erase(ValToRemove);
+ }
+
+ // Stop using the alias set, removing it.
+ AS.RefCount -= NumRefs;
+ if (AS.RefCount == 0)
+ AS.removeFromTracker(*this);
+}
+
+bool AliasSetTracker::remove(Value *Ptr, unsigned Size) {
+ AliasSet *AS = findAliasSetForPointer(Ptr, Size);
+ if (!AS) return false;
+ remove(*AS);
+ return true;
+}
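+
+// Note that this removes the *entire* alias set containing Ptr, not just
+// Ptr's own entry: removing one pointer of a three-member may-alias set
+// discards all three PointerRec entries (see remove(AliasSet&) above).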
+
+bool AliasSetTracker::remove(LoadInst *LI) {
+ unsigned Size = AA.getTargetData().getTypeStoreSize(LI->getType());
+ AliasSet *AS = findAliasSetForPointer(LI->getOperand(0), Size);
+ if (!AS) return false;
+ remove(*AS);
+ return true;
+}
+
+bool AliasSetTracker::remove(StoreInst *SI) {
+ unsigned Size =
+ AA.getTargetData().getTypeStoreSize(SI->getOperand(0)->getType());
+ AliasSet *AS = findAliasSetForPointer(SI->getOperand(1), Size);
+ if (!AS) return false;
+ remove(*AS);
+ return true;
+}
+
+bool AliasSetTracker::remove(FreeInst *FI) {
+ AliasSet *AS = findAliasSetForPointer(FI->getOperand(0), ~0);
+ if (!AS) return false;
+ remove(*AS);
+ return true;
+}
+
+bool AliasSetTracker::remove(VAArgInst *VAAI) {
+ AliasSet *AS = findAliasSetForPointer(VAAI->getOperand(0), ~0);
+ if (!AS) return false;
+ remove(*AS);
+ return true;
+}
+
+bool AliasSetTracker::remove(CallSite CS) {
+ if (AA.doesNotAccessMemory(CS))
+ return false; // doesn't alias anything
+
+ AliasSet *AS = findAliasSetForCallSite(CS);
+ if (!AS) return false;
+ remove(*AS);
+ return true;
+}
+
+bool AliasSetTracker::remove(Instruction *I) {
+ // Dispatch to one of the other remove methods...
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return remove(LI);
+ else if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return remove(SI);
+ else if (CallInst *CI = dyn_cast<CallInst>(I))
+ return remove(CI);
+ else if (FreeInst *FI = dyn_cast<FreeInst>(I))
+ return remove(FI);
+ else if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
+ return remove(VAAI);
+ return true;
+}
+
+
+// deleteValue method - This method is used to remove a pointer value from the
+// AliasSetTracker entirely. It should be used when an instruction is deleted
+// from the program to update the AST. If you don't use this, you will have
+// dangling pointers to deleted instructions.
+//
+void AliasSetTracker::deleteValue(Value *PtrVal) {
+ // Notify the alias analysis implementation that this value is gone.
+ AA.deleteValue(PtrVal);
+
+ // If this is a call instruction, remove the callsite from the appropriate
+ // AliasSet.
+ CallSite CS = CallSite::get(PtrVal);
+ if (CS.getInstruction())
+ if (!AA.doesNotAccessMemory(CS))
+ if (AliasSet *AS = findAliasSetForCallSite(CS))
+ AS->removeCallSite(CS);
+
+ // First, look up the PointerRec for this pointer.
+ DenseMap<Value*, AliasSet::PointerRec*>::iterator I = PointerMap.find(PtrVal);
+ if (I == PointerMap.end()) return; // Noop
+
+ // If we found one, remove the pointer from the alias set it is in.
+ AliasSet::PointerRec *PtrValEnt = I->second;
+ AliasSet *AS = PtrValEnt->getAliasSet(*this);
+
+ // Unlink and delete from the list of values.
+ PtrValEnt->eraseFromList();
+
+ // Stop using the alias set.
+ AS->dropRef(*this);
+
+ PointerMap.erase(I);
+}
+
+// copyValue - This method should be used whenever a preexisting value in the
+// program is copied or cloned, introducing a new value. Note that it is ok for
+// clients that use this method to introduce the same value multiple times: if
+// the tracker already knows about a value, it will ignore the request.
+//
+void AliasSetTracker::copyValue(Value *From, Value *To) {
+ // Notify the alias analysis implementation that this value is copied.
+ AA.copyValue(From, To);
+
+ // First, look up the PointerRec for this pointer.
+ DenseMap<Value*, AliasSet::PointerRec*>::iterator I = PointerMap.find(From);
+ if (I == PointerMap.end())
+ return; // Noop
+ assert(I->second->hasAliasSet() && "Dead entry?");
+
+ AliasSet::PointerRec &Entry = getEntryFor(To);
+ if (Entry.hasAliasSet()) return; // Already in the tracker!
+
+ // Add it to the alias set it aliases...
+ I = PointerMap.find(From);
+ AliasSet *AS = I->second->getAliasSet(*this);
+ AS->addPointer(*this, Entry, I->second->getSize(), true);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// AliasSet/AliasSetTracker Printing Support
+//===----------------------------------------------------------------------===//
+
+void AliasSet::print(std::ostream &OS) const {
+ OS << " AliasSet[" << (void*)this << "," << RefCount << "] ";
+ OS << (AliasTy == MustAlias ? "must" : "may") << " alias, ";
+ switch (AccessTy) {
+ case NoModRef: OS << "No access "; break;
+ case Refs : OS << "Ref "; break;
+ case Mods : OS << "Mod "; break;
+ case ModRef : OS << "Mod/Ref "; break;
+ default: assert(0 && "Bad value for AccessTy!");
+ }
+ if (isVolatile()) OS << "[volatile] ";
+ if (Forward)
+ OS << " forwarding to " << (void*)Forward;
+
+
+ if (!empty()) {
+ OS << "Pointers: ";
+ for (iterator I = begin(), E = end(); I != E; ++I) {
+ if (I != begin()) OS << ", ";
+ WriteAsOperand(OS << "(", I.getPointer());
+ OS << ", " << I.getSize() << ")";
+ }
+ }
+ if (!CallSites.empty()) {
+ OS << "\n " << CallSites.size() << " Call Sites: ";
+ for (unsigned i = 0, e = CallSites.size(); i != e; ++i) {
+ if (i) OS << ", ";
+ WriteAsOperand(OS, CallSites[i].getCalledValue());
+ }
+ }
+ OS << "\n";
+}
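+
+// Sample output (values illustrative):
+//   AliasSet[0x8a0d600,2] may alias, Mod/Ref   Pointers: (%p, 4), (%q, 4)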
+
+void AliasSetTracker::print(std::ostream &OS) const {
+ OS << "Alias Set Tracker: " << AliasSets.size() << " alias sets for "
+ << PointerMap.size() << " pointer values.\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ I->print(OS);
+ OS << "\n";
+}
+
+void AliasSet::dump() const { print(cerr); }
+void AliasSetTracker::dump() const { print(cerr); }
+
+//===----------------------------------------------------------------------===//
+// AliasSetPrinter Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+ class VISIBILITY_HIDDEN AliasSetPrinter : public FunctionPass {
+ AliasSetTracker *Tracker;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ AliasSetPrinter() : FunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<AliasAnalysis>();
+ }
+
+ virtual bool runOnFunction(Function &F) {
+ Tracker = new AliasSetTracker(getAnalysis<AliasAnalysis>());
+
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ Tracker->add(&*I);
+ Tracker->print(cerr);
+ delete Tracker;
+ return false;
+ }
+ };
+}
+
+char AliasSetPrinter::ID = 0;
+static RegisterPass<AliasSetPrinter>
+X("print-alias-sets", "Alias Set Printer", false, true);
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
new file mode 100644
index 0000000..493c6e8
--- /dev/null
+++ b/lib/Analysis/Analysis.cpp
@@ -0,0 +1,44 @@
+//===-- Analysis.cpp ------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Analysis.h"
+#include "llvm/Analysis/Verifier.h"
+#include <fstream>
+#include <cstring>
+
+using namespace llvm;
+
+int LLVMVerifyModule(LLVMModuleRef M, LLVMVerifierFailureAction Action,
+ char **OutMessages) {
+ std::string Messages;
+
+ int Result = verifyModule(*unwrap(M),
+ static_cast<VerifierFailureAction>(Action),
+                            OutMessages ? &Messages : 0);
+
+ if (OutMessages)
+ *OutMessages = strdup(Messages.c_str());
+
+ return Result;
+}
+
+int LLVMVerifyFunction(LLVMValueRef Fn, LLVMVerifierFailureAction Action) {
+ return verifyFunction(*unwrap<Function>(Fn),
+ static_cast<VerifierFailureAction>(Action));
+}
+
+void LLVMViewFunctionCFG(LLVMValueRef Fn) {
+ Function *F = unwrap<Function>(Fn);
+ F->viewCFG();
+}
+
+void LLVMViewFunctionCFGOnly(LLVMValueRef Fn) {
+ Function *F = unwrap<Function>(Fn);
+ F->viewCFGOnly();
+}
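+
+// Example C client (a sketch; assumes a populated LLVMModuleRef M; the
+// LLVMReturnStatusAction constant comes from llvm-c/Analysis.h):
+//   char *Msg = 0;
+//   if (LLVMVerifyModule(M, LLVMReturnStatusAction, &Msg))
+//     fprintf(stderr, "module is broken: %s\n", Msg);
+//   free(Msg);  // the message was strdup'd above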
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
new file mode 100644
index 0000000..d062045
--- /dev/null
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -0,0 +1,838 @@
+//===- BasicAliasAnalysis.cpp - Local Alias Analysis Impl -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the default implementation of the Alias Analysis interface
+// that simply implements a few identities (two different globals cannot alias,
+// etc), but otherwise does no analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include <algorithm>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Useful predicates
+//===----------------------------------------------------------------------===//
+
+static const User *isGEP(const Value *V) {
+ if (isa<GetElementPtrInst>(V) ||
+ (isa<ConstantExpr>(V) &&
+ cast<ConstantExpr>(V)->getOpcode() == Instruction::GetElementPtr))
+ return cast<User>(V);
+ return 0;
+}
+
+static const Value *GetGEPOperands(const Value *V,
+ SmallVector<Value*, 16> &GEPOps) {
+ assert(GEPOps.empty() && "Expect empty list to populate!");
+ GEPOps.insert(GEPOps.end(), cast<User>(V)->op_begin()+1,
+ cast<User>(V)->op_end());
+
+ // Accumulate all of the chained indexes into the operand array
+ V = cast<User>(V)->getOperand(0);
+
+ while (const User *G = isGEP(V)) {
+ if (!isa<Constant>(GEPOps[0]) || isa<GlobalValue>(GEPOps[0]) ||
+ !cast<Constant>(GEPOps[0])->isNullValue())
+ break; // Don't handle folding arbitrary pointer offsets yet...
+ GEPOps.erase(GEPOps.begin()); // Drop the zero index
+ GEPOps.insert(GEPOps.begin(), G->op_begin()+1, G->op_end());
+ V = G->getOperand(0);
+ }
+ return V;
+}
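+
+// Worked example: for V = gep (gep %A, 0, 1), 0, 2 the loop above drops the
+// outer zero index and prepends the inner GEP's indices, so the function
+// returns %A with GEPOps = {0, 1, 2} -- the same indexing as the two-level
+// chain.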
+
+/// isKnownNonNull - Return true if we know that the specified value is never
+/// null.
+static bool isKnownNonNull(const Value *V) {
+ // Alloca never returns null, malloc might.
+ if (isa<AllocaInst>(V)) return true;
+
+ // A byval argument is never null.
+ if (const Argument *A = dyn_cast<Argument>(V))
+ return A->hasByValAttr();
+
+ // Global values are not null unless extern weak.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+ return !GV->hasExternalWeakLinkage();
+ return false;
+}
+
+/// isNonEscapingLocalObject - Return true if the pointer is to a function-local
+/// object that never escapes from the function.
+static bool isNonEscapingLocalObject(const Value *V) {
+ // If this is a local allocation, check to see if it escapes.
+ if (isa<AllocationInst>(V) || isNoAliasCall(V))
+ return !PointerMayBeCaptured(V, false);
+
+ // If this is an argument that corresponds to a byval or noalias argument,
+ // then it has not escaped before entering the function. Check if it escapes
+ // inside the function.
+ if (const Argument *A = dyn_cast<Argument>(V))
+ if (A->hasByValAttr() || A->hasNoAliasAttr()) {
+ // Don't bother analyzing arguments already known not to escape.
+ if (A->hasNoCaptureAttr())
+ return true;
+ return !PointerMayBeCaptured(V, false);
+ }
+ return false;
+}
+
+
+/// isObjectSmallerThan - Return true if we can prove that the object specified
+/// by V is smaller than Size.
+static bool isObjectSmallerThan(const Value *V, unsigned Size,
+ const TargetData &TD) {
+ const Type *AccessTy;
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
+ AccessTy = GV->getType()->getElementType();
+ } else if (const AllocationInst *AI = dyn_cast<AllocationInst>(V)) {
+ if (!AI->isArrayAllocation())
+ AccessTy = AI->getType()->getElementType();
+ else
+ return false;
+ } else if (const Argument *A = dyn_cast<Argument>(V)) {
+ if (A->hasByValAttr())
+ AccessTy = cast<PointerType>(A->getType())->getElementType();
+ else
+ return false;
+ } else {
+ return false;
+ }
+
+ if (AccessTy->isSized())
+ return TD.getTypeAllocSize(AccessTy) < Size;
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// NoAA Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// NoAA - This class implements the -no-aa pass, which always returns "I
+ /// don't know" for alias queries. NoAA is unlike other alias analysis
+  /// implementations in that it does not chain to a previous analysis. As
+  /// such, it doesn't follow many of the rules that other alias analyses must.
+ ///
+ struct VISIBILITY_HIDDEN NoAA : public ImmutablePass, public AliasAnalysis {
+ static char ID; // Class identification, replacement for typeinfo
+ NoAA() : ImmutablePass(&ID) {}
+ explicit NoAA(void *PID) : ImmutablePass(PID) { }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ }
+
+ virtual void initializePass() {
+ TD = &getAnalysis<TargetData>();
+ }
+
+ virtual AliasResult alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size) {
+ return MayAlias;
+ }
+
+ virtual void getArgumentAccesses(Function *F, CallSite CS,
+ std::vector<PointerAccessInfo> &Info) {
+ assert(0 && "This method may not be called on this function!");
+ }
+
+ virtual void getMustAliases(Value *P, std::vector<Value*> &RetVals) { }
+ virtual bool pointsToConstantMemory(const Value *P) { return false; }
+ virtual ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+ return ModRef;
+ }
+ virtual ModRefResult getModRefInfo(CallSite CS1, CallSite CS2) {
+ return ModRef;
+ }
+ virtual bool hasNoModRefInfoForCalls() const { return true; }
+
+ virtual void deleteValue(Value *V) {}
+ virtual void copyValue(Value *From, Value *To) {}
+ };
+} // End of anonymous namespace
+
+// Register this pass...
+char NoAA::ID = 0;
+static RegisterPass<NoAA>
+U("no-aa", "No Alias Analysis (always returns 'may' alias)", true, true);
+
+// Declare that we implement the AliasAnalysis interface
+static RegisterAnalysisGroup<AliasAnalysis> V(U);
+
+ImmutablePass *llvm::createNoAAPass() { return new NoAA(); }
+
+//===----------------------------------------------------------------------===//
+// BasicAA Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// BasicAliasAnalysis - This is the default alias analysis implementation.
+ /// Because it doesn't chain to a previous alias analysis (like -no-aa), it
+ /// derives from the NoAA class.
+ struct VISIBILITY_HIDDEN BasicAliasAnalysis : public NoAA {
+ static char ID; // Class identification, replacement for typeinfo
+ BasicAliasAnalysis() : NoAA(&ID) {}
+ AliasResult alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size);
+
+ ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size);
+ ModRefResult getModRefInfo(CallSite CS1, CallSite CS2);
+
+ /// hasNoModRefInfoForCalls - We can provide mod/ref information against
+ /// non-escaping allocations.
+ virtual bool hasNoModRefInfoForCalls() const { return false; }
+
+    /// pointsToConstantMemory - Chase the pointer to its underlying object
+    /// and return true if that object is a constant global.
+ bool pointsToConstantMemory(const Value *P);
+
+ private:
+ // CheckGEPInstructions - Check two GEP instructions with known
+ // must-aliasing base pointers. This checks to see if the index expressions
+ // preclude the pointers from aliasing...
+ AliasResult
+ CheckGEPInstructions(const Type* BasePtr1Ty,
+ Value **GEP1Ops, unsigned NumGEP1Ops, unsigned G1Size,
+ const Type *BasePtr2Ty,
+ Value **GEP2Ops, unsigned NumGEP2Ops, unsigned G2Size);
+ };
+} // End of anonymous namespace
+
+// Register this pass...
+char BasicAliasAnalysis::ID = 0;
+static RegisterPass<BasicAliasAnalysis>
+X("basicaa", "Basic Alias Analysis (default AA impl)", false, true);
+
+// Declare that we implement the AliasAnalysis interface
+static RegisterAnalysisGroup<AliasAnalysis, true> Y(X);
+
+ImmutablePass *llvm::createBasicAliasAnalysisPass() {
+ return new BasicAliasAnalysis();
+}
+
+
+/// pointsToConstantMemory - Chase the pointer to its underlying object and
+/// return true if that object is a constant global.
+bool BasicAliasAnalysis::pointsToConstantMemory(const Value *P) {
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(P->getUnderlyingObject()))
+ return GV->isConstant();
+ return false;
+}
+
+
+// getModRefInfo - Check to see if the specified callsite can clobber the
+// specified memory object. Since we only look at local properties of this
+// function, we really can't say much about this query. We do, however, use
+// simple "address taken" analysis on local objects.
+//
+AliasAnalysis::ModRefResult
+BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+ if (!isa<Constant>(P)) {
+ const Value *Object = P->getUnderlyingObject();
+
+ // If this is a tail call and P points to a stack location, we know that
+ // the tail call cannot access or modify the local stack.
+ // We cannot exclude byval arguments here; these belong to the caller of
+    // the current function, not to the current function itself, and a tail
+    // callee may reference them.
+ if (isa<AllocaInst>(Object))
+ if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction()))
+ if (CI->isTailCall())
+ return NoModRef;
+
+ // If the pointer is to a locally allocated object that does not escape,
+ // then the call can not mod/ref the pointer unless the call takes the
+ // argument without capturing it.
+ if (isNonEscapingLocalObject(Object) && CS.getInstruction() != Object) {
+ bool passedAsArg = false;
+ // TODO: Eventually only check 'nocapture' arguments.
+ for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
+ CI != CE; ++CI)
+ if (isa<PointerType>((*CI)->getType()) &&
+ alias(cast<Value>(CI), ~0U, P, ~0U) != NoAlias)
+ passedAsArg = true;
+
+ if (!passedAsArg)
+ return NoModRef;
+ }
+ }
+
+  // The AliasAnalysis base class has some smarts, let's use them.
+ return AliasAnalysis::getModRefInfo(CS, P, Size);
+}
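+
+// For example (a sketch in the IR of this era):
+//   %x = alloca i32
+//   ...
+//   tail call void @f()
+// %x is caller-local stack and the call is a tail call, so the rule above
+// answers NoModRef for this pair without examining @f at all.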
+
+
+AliasAnalysis::ModRefResult
+BasicAliasAnalysis::getModRefInfo(CallSite CS1, CallSite CS2) {
+ // If CS1 or CS2 are readnone, they don't interact.
+ ModRefBehavior CS1B = AliasAnalysis::getModRefBehavior(CS1);
+ if (CS1B == DoesNotAccessMemory) return NoModRef;
+
+ ModRefBehavior CS2B = AliasAnalysis::getModRefBehavior(CS2);
+ if (CS2B == DoesNotAccessMemory) return NoModRef;
+
+ // If they both only read from memory, just return ref.
+ if (CS1B == OnlyReadsMemory && CS2B == OnlyReadsMemory)
+ return Ref;
+
+ // Otherwise, fall back to NoAA (mod+ref).
+ return NoAA::getModRefInfo(CS1, CS2);
+}
+
+
+// alias - Provide a bunch of ad-hoc rules to disambiguate in common cases, such
+// as array references.
+//
+AliasAnalysis::AliasResult
+BasicAliasAnalysis::alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size) {
+ // Strip off any constant expression casts if they exist
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V1))
+ if (CE->isCast() && isa<PointerType>(CE->getOperand(0)->getType()))
+ V1 = CE->getOperand(0);
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V2))
+ if (CE->isCast() && isa<PointerType>(CE->getOperand(0)->getType()))
+ V2 = CE->getOperand(0);
+
+ // Are we checking for alias of the same value?
+ if (V1 == V2) return MustAlias;
+
+ if (!isa<PointerType>(V1->getType()) || !isa<PointerType>(V2->getType()))
+ return NoAlias; // Scalars cannot alias each other
+
+ // Strip off cast instructions. Since V1 and V2 are pointers, they must be
+ // pointer<->pointer bitcasts.
+ if (const BitCastInst *I = dyn_cast<BitCastInst>(V1))
+ return alias(I->getOperand(0), V1Size, V2, V2Size);
+ if (const BitCastInst *I = dyn_cast<BitCastInst>(V2))
+ return alias(V1, V1Size, I->getOperand(0), V2Size);
+
+ // Figure out what objects these things are pointing to if we can.
+ const Value *O1 = V1->getUnderlyingObject();
+ const Value *O2 = V2->getUnderlyingObject();
+
+ if (O1 != O2) {
+ // If V1/V2 point to two different objects we know that we have no alias.
+ if (isIdentifiedObject(O1) && isIdentifiedObject(O2))
+ return NoAlias;
+
+ // Arguments can't alias with local allocations or noalias calls.
+ if ((isa<Argument>(O1) && (isa<AllocationInst>(O2) || isNoAliasCall(O2))) ||
+ (isa<Argument>(O2) && (isa<AllocationInst>(O1) || isNoAliasCall(O1))))
+ return NoAlias;
+
+ // Most objects can't alias null.
+ if ((isa<ConstantPointerNull>(V2) && isKnownNonNull(O1)) ||
+ (isa<ConstantPointerNull>(V1) && isKnownNonNull(O2)))
+ return NoAlias;
+ }
+
+ // If the size of one access is larger than the entire object on the other
+ // side, then we know such behavior is undefined and can assume no alias.
+ const TargetData &TD = getTargetData();
+ if ((V1Size != ~0U && isObjectSmallerThan(O2, V1Size, TD)) ||
+ (V2Size != ~0U && isObjectSmallerThan(O1, V2Size, TD)))
+ return NoAlias;
+
+ // If one pointer is the result of a call/invoke and the other is a
+ // non-escaping local object, then we know the object couldn't escape to a
+ // point where the call could return it.
+ if ((isa<CallInst>(O1) || isa<InvokeInst>(O1)) &&
+ isNonEscapingLocalObject(O2) && O1 != O2)
+ return NoAlias;
+ if ((isa<CallInst>(O2) || isa<InvokeInst>(O2)) &&
+ isNonEscapingLocalObject(O1) && O1 != O2)
+ return NoAlias;
+
+ // If we have two gep instructions with must-alias'ing base pointers, figure
+ // out if the indexes to the GEP tell us anything about the derived pointer.
+ // Note that we also handle chains of getelementptr instructions as well as
+ // constant expression getelementptrs here.
+ //
+ if (isGEP(V1) && isGEP(V2)) {
+ const User *GEP1 = cast<User>(V1);
+ const User *GEP2 = cast<User>(V2);
+
+ // If V1 and V2 are identical GEPs, just recurse down on both of them.
+ // This allows us to analyze things like:
+ // P = gep A, 0, i, 1
+ // Q = gep B, 0, i, 1
+ // by just analyzing A and B. This is even safe for variable indices.
+ if (GEP1->getType() == GEP2->getType() &&
+ GEP1->getNumOperands() == GEP2->getNumOperands() &&
+ GEP1->getOperand(0)->getType() == GEP2->getOperand(0)->getType() &&
+ // All operands are the same, ignoring the base.
+ std::equal(GEP1->op_begin()+1, GEP1->op_end(), GEP2->op_begin()+1))
+ return alias(GEP1->getOperand(0), V1Size, GEP2->getOperand(0), V2Size);
+
+
+ // Drill down into the first non-gep value, to test for must-aliasing of
+ // the base pointers.
+ while (isGEP(GEP1->getOperand(0)) &&
+ GEP1->getOperand(1) ==
+ Constant::getNullValue(GEP1->getOperand(1)->getType()))
+ GEP1 = cast<User>(GEP1->getOperand(0));
+ const Value *BasePtr1 = GEP1->getOperand(0);
+
+ while (isGEP(GEP2->getOperand(0)) &&
+ GEP2->getOperand(1) ==
+ Constant::getNullValue(GEP2->getOperand(1)->getType()))
+ GEP2 = cast<User>(GEP2->getOperand(0));
+ const Value *BasePtr2 = GEP2->getOperand(0);
+
+ // Do the base pointers alias?
+ AliasResult BaseAlias = alias(BasePtr1, ~0U, BasePtr2, ~0U);
+ if (BaseAlias == NoAlias) return NoAlias;
+ if (BaseAlias == MustAlias) {
+ // If the base pointers alias each other exactly, check to see if we can
+ // figure out anything about the resultant pointers, to try to prove
+ // non-aliasing.
+
+ // Collect all of the chained GEP operands together into one simple place
+ SmallVector<Value*, 16> GEP1Ops, GEP2Ops;
+ BasePtr1 = GetGEPOperands(V1, GEP1Ops);
+ BasePtr2 = GetGEPOperands(V2, GEP2Ops);
+
+      // If GetGEPOperands was able to fold both chains down to the same
+      // must-aliased base pointer, do the comparison.
+ if (BasePtr1 == BasePtr2) {
+ AliasResult GAlias =
+ CheckGEPInstructions(BasePtr1->getType(),
+ &GEP1Ops[0], GEP1Ops.size(), V1Size,
+ BasePtr2->getType(),
+ &GEP2Ops[0], GEP2Ops.size(), V2Size);
+ if (GAlias != MayAlias)
+ return GAlias;
+ }
+ }
+ }
+
+ // Check to see if these two pointers are related by a getelementptr
+ // instruction. If one pointer is a GEP with a non-zero index of the other
+ // pointer, we know they cannot alias.
+ //
+ if (isGEP(V2)) {
+ std::swap(V1, V2);
+ std::swap(V1Size, V2Size);
+ }
+
+ if (V1Size != ~0U && V2Size != ~0U)
+ if (isGEP(V1)) {
+ SmallVector<Value*, 16> GEPOperands;
+ const Value *BasePtr = GetGEPOperands(V1, GEPOperands);
+
+ AliasResult R = alias(BasePtr, V1Size, V2, V2Size);
+ if (R == MustAlias) {
+ // If there is at least one non-zero constant index, we know they cannot
+ // alias.
+ bool ConstantFound = false;
+ bool AllZerosFound = true;
+ for (unsigned i = 0, e = GEPOperands.size(); i != e; ++i)
+ if (const Constant *C = dyn_cast<Constant>(GEPOperands[i])) {
+ if (!C->isNullValue()) {
+ ConstantFound = true;
+ AllZerosFound = false;
+ break;
+ }
+ } else {
+ AllZerosFound = false;
+ }
+
+      // If we have getelementptr <ptr>, 0, 0, 0, 0, ... and V2 must-aliases
+      // the pointer, then the end result is a must alias as well.
+ if (AllZerosFound)
+ return MustAlias;
+
+ if (ConstantFound) {
+ if (V2Size <= 1 && V1Size <= 1) // Just pointer check?
+ return NoAlias;
+
+ // Otherwise we have to check to see that the distance is more than
+ // the size of the argument... build an index vector that is equal to
+ // the arguments provided, except substitute 0's for any variable
+ // indexes we find...
+ if (cast<PointerType>(
+ BasePtr->getType())->getElementType()->isSized()) {
+ for (unsigned i = 0; i != GEPOperands.size(); ++i)
+ if (!isa<ConstantInt>(GEPOperands[i]))
+ GEPOperands[i] =
+ Constant::getNullValue(GEPOperands[i]->getType());
+ int64_t Offset =
+ getTargetData().getIndexedOffset(BasePtr->getType(),
+ &GEPOperands[0],
+ GEPOperands.size());
+
+ if (Offset >= (int64_t)V2Size || Offset <= -(int64_t)V1Size)
+ return NoAlias;
+ }
+ }
+ }
+ }
+
+ return MayAlias;
+}
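+
+// Worked example for the offset check above: with %A of type [10 x i32]*,
+// V1 = gep %A, 0, 4 and V2 = %A, the zero-substituted index offset is 16
+// bytes. With V2Size = 4, Offset (16) >= V2Size, so the two accesses cannot
+// overlap and we return NoAlias.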
+
+// This function is used to determine if the indices of two GEP
+// instructions are equal. V1 and V2 are the indices.
+static bool IndexOperandsEqual(Value *V1, Value *V2) {
+ if (V1->getType() == V2->getType())
+ return V1 == V2;
+ if (Constant *C1 = dyn_cast<Constant>(V1))
+ if (Constant *C2 = dyn_cast<Constant>(V2)) {
+ // Sign extend the constants to long types, if necessary
+ if (C1->getType() != Type::Int64Ty)
+ C1 = ConstantExpr::getSExt(C1, Type::Int64Ty);
+ if (C2->getType() != Type::Int64Ty)
+ C2 = ConstantExpr::getSExt(C2, Type::Int64Ty);
+ return C1 == C2;
+ }
+ return false;
+}
+
+/// CheckGEPInstructions - Check two GEP instructions with known must-aliasing
+/// base pointers. This checks to see if the index expressions preclude the
+/// pointers from aliasing...
+AliasAnalysis::AliasResult
+BasicAliasAnalysis::CheckGEPInstructions(
+ const Type* BasePtr1Ty, Value **GEP1Ops, unsigned NumGEP1Ops, unsigned G1S,
+ const Type *BasePtr2Ty, Value **GEP2Ops, unsigned NumGEP2Ops, unsigned G2S) {
+ // We currently can't handle the case when the base pointers have different
+ // primitive types. Since this is uncommon anyway, we are happy being
+ // extremely conservative.
+ if (BasePtr1Ty != BasePtr2Ty)
+ return MayAlias;
+
+ const PointerType *GEPPointerTy = cast<PointerType>(BasePtr1Ty);
+
+ // Find the (possibly empty) initial sequence of equal values... which are not
+ // necessarily constants.
+ unsigned NumGEP1Operands = NumGEP1Ops, NumGEP2Operands = NumGEP2Ops;
+ unsigned MinOperands = std::min(NumGEP1Operands, NumGEP2Operands);
+ unsigned MaxOperands = std::max(NumGEP1Operands, NumGEP2Operands);
+ unsigned UnequalOper = 0;
+ while (UnequalOper != MinOperands &&
+ IndexOperandsEqual(GEP1Ops[UnequalOper], GEP2Ops[UnequalOper])) {
+ // Advance through the type as we go...
+ ++UnequalOper;
+ if (const CompositeType *CT = dyn_cast<CompositeType>(BasePtr1Ty))
+ BasePtr1Ty = CT->getTypeAtIndex(GEP1Ops[UnequalOper-1]);
+ else {
+ // If all operands equal each other, then the derived pointers must
+ // alias each other...
+ BasePtr1Ty = 0;
+ assert(UnequalOper == NumGEP1Operands && UnequalOper == NumGEP2Operands &&
+ "Ran out of type nesting, but not out of operands?");
+ return MustAlias;
+ }
+ }
+
+  // If we have seen all constant operands and run out of indexes on one of
+  // the getelementptrs, check to see if the tail of the leftover one is all
+  // zeros. If so, return MustAlias.
+ if (UnequalOper == MinOperands) {
+ if (NumGEP1Ops < NumGEP2Ops) {
+ std::swap(GEP1Ops, GEP2Ops);
+ std::swap(NumGEP1Ops, NumGEP2Ops);
+ }
+
+ bool AllAreZeros = true;
+ for (unsigned i = UnequalOper; i != MaxOperands; ++i)
+ if (!isa<Constant>(GEP1Ops[i]) ||
+ !cast<Constant>(GEP1Ops[i])->isNullValue()) {
+ AllAreZeros = false;
+ break;
+ }
+ if (AllAreZeros) return MustAlias;
+ }
+
+
+ // So now we know that the indexes derived from the base pointers,
+ // which are known to alias, are different. We can still determine a
+ // no-alias result if there are differing constant pairs in the index
+ // chain. For example:
+ // A[i][0] != A[j][1] iff (&A[0][1]-&A[0][0] >= std::max(G1S, G2S))
+ //
+ // We have to be careful here about array accesses. In particular, consider:
+ // A[1][0] vs A[0][i]
+ // In this case, we don't *know* that the array will be accessed in bounds:
+ // the index could even be negative. Because of this, we have to
+ // conservatively *give up* and return may alias. We disregard differing
+ // array subscripts that are followed by a variable index without going
+ // through a struct.
+ //
+ unsigned SizeMax = std::max(G1S, G2S);
+ if (SizeMax == ~0U) return MayAlias; // Avoid frivolous work.
+
+ // Scan for the first operand that is constant and unequal in the
+ // two getelementptrs...
+ unsigned FirstConstantOper = UnequalOper;
+ for (; FirstConstantOper != MinOperands; ++FirstConstantOper) {
+ const Value *G1Oper = GEP1Ops[FirstConstantOper];
+ const Value *G2Oper = GEP2Ops[FirstConstantOper];
+
+ if (G1Oper != G2Oper) // Found non-equal constant indexes...
+ if (Constant *G1OC = dyn_cast<ConstantInt>(const_cast<Value*>(G1Oper)))
+ if (Constant *G2OC = dyn_cast<ConstantInt>(const_cast<Value*>(G2Oper))){
+ if (G1OC->getType() != G2OC->getType()) {
+ // Sign extend both operands to long.
+ if (G1OC->getType() != Type::Int64Ty)
+ G1OC = ConstantExpr::getSExt(G1OC, Type::Int64Ty);
+ if (G2OC->getType() != Type::Int64Ty)
+ G2OC = ConstantExpr::getSExt(G2OC, Type::Int64Ty);
+ GEP1Ops[FirstConstantOper] = G1OC;
+ GEP2Ops[FirstConstantOper] = G2OC;
+ }
+
+ if (G1OC != G2OC) {
+ // Handle the "be careful" case above: if this is an array/vector
+ // subscript, scan for a subsequent variable array index.
+ if (const SequentialType *STy =
+ dyn_cast<SequentialType>(BasePtr1Ty)) {
+ const Type *NextTy = STy;
+ bool isBadCase = false;
+
+ for (unsigned Idx = FirstConstantOper;
+ Idx != MinOperands && isa<SequentialType>(NextTy); ++Idx) {
+ const Value *V1 = GEP1Ops[Idx], *V2 = GEP2Ops[Idx];
+ if (!isa<Constant>(V1) || !isa<Constant>(V2)) {
+ isBadCase = true;
+ break;
+ }
+ // If the array is indexed beyond the bounds of the static type
+ // at this level, it will also fall into the "be careful" case.
+ // It would theoretically be possible to analyze these cases,
+ // but for now just be conservatively correct.
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+ if (cast<ConstantInt>(G1OC)->getZExtValue() >=
+ ATy->getNumElements() ||
+ cast<ConstantInt>(G2OC)->getZExtValue() >=
+ ATy->getNumElements()) {
+ isBadCase = true;
+ break;
+ }
+ if (const VectorType *VTy = dyn_cast<VectorType>(STy))
+ if (cast<ConstantInt>(G1OC)->getZExtValue() >=
+ VTy->getNumElements() ||
+ cast<ConstantInt>(G2OC)->getZExtValue() >=
+ VTy->getNumElements()) {
+ isBadCase = true;
+ break;
+ }
+ STy = cast<SequentialType>(NextTy);
+ NextTy = cast<SequentialType>(NextTy)->getElementType();
+ }
+
+ if (isBadCase) G1OC = 0;
+ }
+
+ // Make sure they are comparable (ie, not constant expressions), and
+ // make sure the GEP with the smaller leading constant is GEP1.
+ if (G1OC) {
+ Constant *Compare = ConstantExpr::getICmp(ICmpInst::ICMP_SGT,
+ G1OC, G2OC);
+ if (ConstantInt *CV = dyn_cast<ConstantInt>(Compare)) {
+ if (CV->getZExtValue()) { // If they are comparable and G2 > G1
+ std::swap(GEP1Ops, GEP2Ops); // Make GEP1 < GEP2
+ std::swap(NumGEP1Ops, NumGEP2Ops);
+ }
+ break;
+ }
+ }
+ }
+ }
+ BasePtr1Ty = cast<CompositeType>(BasePtr1Ty)->getTypeAtIndex(G1Oper);
+ }
+
+  // No unequal constant operands were found, and we ran out of common
+  // operands. At this point, the GEP instructions have run through all of
+  // their common operands, and we haven't found evidence of any deltas
+  // between the GEPs.
+ // However, one GEP may have more operands than the other. If this is the
+ // case, there may still be hope. Check this now.
+ if (FirstConstantOper == MinOperands) {
+ // Make GEP1Ops be the longer one if there is a longer one.
+ if (NumGEP1Ops < NumGEP2Ops) {
+ std::swap(GEP1Ops, GEP2Ops);
+ std::swap(NumGEP1Ops, NumGEP2Ops);
+ }
+
+ // Is there anything to check?
+ if (NumGEP1Ops > MinOperands) {
+ for (unsigned i = FirstConstantOper; i != MaxOperands; ++i)
+ if (isa<ConstantInt>(GEP1Ops[i]) &&
+ !cast<ConstantInt>(GEP1Ops[i])->isZero()) {
+ // Yup, there's a constant in the tail. Set all variables to
+ // constants in the GEP instruction to make it suitable for
+ // TargetData::getIndexedOffset.
+ for (i = 0; i != MaxOperands; ++i)
+ if (!isa<ConstantInt>(GEP1Ops[i]))
+ GEP1Ops[i] = Constant::getNullValue(GEP1Ops[i]->getType());
+ // Okay, now get the offset. This is the relative offset for the full
+ // instruction.
+ const TargetData &TD = getTargetData();
+ int64_t Offset1 = TD.getIndexedOffset(GEPPointerTy, GEP1Ops,
+ NumGEP1Ops);
+
+ // Now check without any constants at the end.
+ int64_t Offset2 = TD.getIndexedOffset(GEPPointerTy, GEP1Ops,
+ MinOperands);
+
+ // Make sure we compare the absolute difference.
+ if (Offset1 > Offset2)
+ std::swap(Offset1, Offset2);
+
+ // If the tail provided a big enough offset, return noalias!
+ if ((uint64_t)(Offset2-Offset1) >= SizeMax)
+ return NoAlias;
+ // Otherwise break - we don't look for another constant in the tail.
+ break;
+ }
+ }
+
+ // Couldn't find anything useful.
+ return MayAlias;
+ }
+
+ // If there are non-equal constant arguments, then we can figure
+ // out a minimum known delta between the two index expressions... at
+ // this point we know that the first constant index of GEP1 is less
+ // than the first constant index of GEP2.
+
+ // Advance BasePtr[12]Ty over this first differing constant operand.
+ BasePtr2Ty = cast<CompositeType>(BasePtr1Ty)->
+ getTypeAtIndex(GEP2Ops[FirstConstantOper]);
+ BasePtr1Ty = cast<CompositeType>(BasePtr1Ty)->
+ getTypeAtIndex(GEP1Ops[FirstConstantOper]);
+
+ // We are going to be using TargetData::getIndexedOffset to determine the
+ // offset that each of the GEPs is reaching. To do this, we have to convert
+ // all variable references to constant references, so we start by converting
+ // the initial sequence of array subscripts into constant zeros.
+ const Type *ZeroIdxTy = GEPPointerTy;
+ for (unsigned i = 0; i != FirstConstantOper; ++i) {
+ if (!isa<StructType>(ZeroIdxTy))
+ GEP1Ops[i] = GEP2Ops[i] = Constant::getNullValue(Type::Int32Ty);
+
+ if (const CompositeType *CT = dyn_cast<CompositeType>(ZeroIdxTy))
+ ZeroIdxTy = CT->getTypeAtIndex(GEP1Ops[i]);
+ }
+
+ // We know that GEP1Ops[FirstConstantOper] & GEP2Ops[FirstConstantOper] are ok
+
+ // Loop over the rest of the operands...
+ for (unsigned i = FirstConstantOper+1; i != MaxOperands; ++i) {
+ const Value *Op1 = i < NumGEP1Ops ? GEP1Ops[i] : 0;
+ const Value *Op2 = i < NumGEP2Ops ? GEP2Ops[i] : 0;
+ // If they are equal, use a zero index...
+ if (Op1 == Op2 && BasePtr1Ty == BasePtr2Ty) {
+ if (!isa<ConstantInt>(Op1))
+ GEP1Ops[i] = GEP2Ops[i] = Constant::getNullValue(Op1->getType());
+ // Otherwise, just keep the constants we have.
+ } else {
+ if (Op1) {
+ if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ // If this is an array index, make sure the array element is in range.
+ if (const ArrayType *AT = dyn_cast<ArrayType>(BasePtr1Ty)) {
+ if (Op1C->getZExtValue() >= AT->getNumElements())
+ return MayAlias; // Be conservative with out-of-range accesses
+ } else if (const VectorType *VT = dyn_cast<VectorType>(BasePtr1Ty)) {
+ if (Op1C->getZExtValue() >= VT->getNumElements())
+ return MayAlias; // Be conservative with out-of-range accesses
+ }
+
+ } else {
+ // GEP1 is known to produce a value less than GEP2. To be
+ // conservatively correct, we must assume the largest possible
+ // constant is used in this position. This cannot be the initial
+ // index to the GEP instructions (because we know we have at least one
+ // element before this one with the different constant arguments), so
+ // we know that the current index must be into either a struct or
+ // array. Because we know it's not constant, this cannot be a
+ // structure index. Because of this, we can calculate the maximum
+ // value possible.
+ //
+ if (const ArrayType *AT = dyn_cast<ArrayType>(BasePtr1Ty))
+ GEP1Ops[i] = ConstantInt::get(Type::Int64Ty,AT->getNumElements()-1);
+ else if (const VectorType *VT = dyn_cast<VectorType>(BasePtr1Ty))
+ GEP1Ops[i] = ConstantInt::get(Type::Int64Ty,VT->getNumElements()-1);
+ }
+ }
+
+ if (Op2) {
+ if (const ConstantInt *Op2C = dyn_cast<ConstantInt>(Op2)) {
+ // If this is an array index, make sure the array element is in range.
+ if (const ArrayType *AT = dyn_cast<ArrayType>(BasePtr2Ty)) {
+ if (Op2C->getZExtValue() >= AT->getNumElements())
+ return MayAlias; // Be conservative with out-of-range accesses
+ } else if (const VectorType *VT = dyn_cast<VectorType>(BasePtr2Ty)) {
+ if (Op2C->getZExtValue() >= VT->getNumElements())
+ return MayAlias; // Be conservative with out-of-range accesses
+ }
+ } else { // Conservatively assume the minimum value for this index
+ GEP2Ops[i] = Constant::getNullValue(Op2->getType());
+ }
+ }
+ }
+
+ if (BasePtr1Ty && Op1) {
+ if (const CompositeType *CT = dyn_cast<CompositeType>(BasePtr1Ty))
+ BasePtr1Ty = CT->getTypeAtIndex(GEP1Ops[i]);
+ else
+ BasePtr1Ty = 0;
+ }
+
+ if (BasePtr2Ty && Op2) {
+ if (const CompositeType *CT = dyn_cast<CompositeType>(BasePtr2Ty))
+ BasePtr2Ty = CT->getTypeAtIndex(GEP2Ops[i]);
+ else
+ BasePtr2Ty = 0;
+ }
+ }
+
+ if (GEPPointerTy->getElementType()->isSized()) {
+ int64_t Offset1 =
+ getTargetData().getIndexedOffset(GEPPointerTy, GEP1Ops, NumGEP1Ops);
+ int64_t Offset2 =
+ getTargetData().getIndexedOffset(GEPPointerTy, GEP2Ops, NumGEP2Ops);
+ assert(Offset1 != Offset2 &&
+ "There is at least one different constant here!");
+
+ // Make sure we compare the absolute difference.
+ if (Offset1 > Offset2)
+ std::swap(Offset1, Offset2);
+
+ if ((uint64_t)(Offset2-Offset1) >= SizeMax) {
+ //cerr << "Determined that these two GEP's don't alias ["
+ // << SizeMax << " bytes]: \n" << *GEP1 << *GEP2;
+ return NoAlias;
+ }
+ }
+ return MayAlias;
+}
+
+// Make sure that anything that uses AliasAnalysis pulls in this file...
+DEFINING_FILE_FOR(BasicAliasAnalysis)
diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp
new file mode 100644
index 0000000..143220c
--- /dev/null
+++ b/lib/Analysis/CFGPrinter.cpp
@@ -0,0 +1,221 @@
+//===- CFGPrinter.cpp - DOT printer for the control flow graph ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a '-dot-cfg' analysis pass, which emits the
+// cfg.<fnname>.dot file for each function in the program, with a graph of the
+// CFG for that function.
+//
+// The other main feature of this file is that it implements the
+// Function::viewCFG method, which is useful for debugging passes which operate
+// on the CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Config/config.h"
+#include <iosfwd>
+#include <sstream>
+#include <fstream>
+using namespace llvm;
+
+/// CFGOnly flag - This is used to control whether the CFG graph printer
+/// prints out the contents of basic blocks. This is acceptable because
+/// this code is only really used for debugging purposes.
+///
+static bool CFGOnly = false;
+
+namespace llvm {
+template<>
+struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
+ static std::string getGraphName(const Function *F) {
+ return "CFG for '" + F->getName() + "' function";
+ }
+
+ static std::string getNodeLabel(const BasicBlock *Node,
+ const Function *Graph) {
+ if (CFGOnly && !Node->getName().empty())
+ return Node->getName() + ":";
+
+ std::ostringstream Out;
+ if (CFGOnly) {
+ WriteAsOperand(Out, Node, false);
+ return Out.str();
+ }
+
+ if (Node->getName().empty()) {
+ WriteAsOperand(Out, Node, false);
+ Out << ":";
+ }
+
+ Out << *Node;
+ std::string OutStr = Out.str();
+ if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
+
+ // Process string output to make it nicer...
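+ // (For example, "bb:\n  ret void\n" becomes "bb:\l  ret void\l", so each
+ // instruction is left-justified on its own line in the DOT node, and any
+ // ";" comments are dropped through to the end of their line.)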
+ for (unsigned i = 0; i != OutStr.length(); ++i)
+ if (OutStr[i] == '\n') { // Left justify
+ OutStr[i] = '\\';
+ OutStr.insert(OutStr.begin()+i+1, 'l');
+ } else if (OutStr[i] == ';') { // Delete comments!
+ unsigned Idx = OutStr.find('\n', i+1); // Find end of line
+ OutStr.erase(OutStr.begin()+i, OutStr.begin()+Idx);
+ --i;
+ }
+
+ return OutStr;
+ }
+
+ static std::string getEdgeSourceLabel(const BasicBlock *Node,
+ succ_const_iterator I) {
+ // Label source of conditional branches with "T" or "F"
+ if (const BranchInst *BI = dyn_cast<BranchInst>(Node->getTerminator()))
+ if (BI->isConditional())
+ return (I == succ_begin(Node)) ? "T" : "F";
+ return "";
+ }
+};
+}
+
+namespace {
+ struct VISIBILITY_HIDDEN CFGViewer : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CFGViewer() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F) {
+ F.viewCFG();
+ return false;
+ }
+
+ void print(std::ostream &OS, const Module* = 0) const {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char CFGViewer::ID = 0;
+static RegisterPass<CFGViewer>
+V0("view-cfg", "View CFG of function", false, true);
+
+namespace {
+ struct VISIBILITY_HIDDEN CFGOnlyViewer : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CFGOnlyViewer() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F) {
+ CFGOnly = true;
+ F.viewCFG();
+ CFGOnly = false;
+ return false;
+ }
+
+ void print(std::ostream &OS, const Module* = 0) const {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char CFGOnlyViewer::ID = 0;
+static RegisterPass<CFGOnlyViewer>
+V1("view-cfg-only",
+ "View CFG of function (with no function bodies)", false, true);
+
+namespace {
+ struct VISIBILITY_HIDDEN CFGPrinter : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CFGPrinter() : FunctionPass(&ID) {}
+ explicit CFGPrinter(void *pid) : FunctionPass(pid) {}
+
+ virtual bool runOnFunction(Function &F) {
+ std::string Filename = "cfg." + F.getName() + ".dot";
+ cerr << "Writing '" << Filename << "'...";
+ std::ofstream File(Filename.c_str());
+
+ if (File.good())
+ WriteGraph(File, (const Function*)&F);
+ else
+ cerr << " error opening file for writing!";
+ cerr << "\n";
+ return false;
+ }
+
+ void print(std::ostream &OS, const Module* = 0) const {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char CFGPrinter::ID = 0;
+static RegisterPass<CFGPrinter>
+P1("dot-cfg", "Print CFG of function to 'dot' file", false, true);
+
+namespace {
+ struct VISIBILITY_HIDDEN CFGOnlyPrinter : public CFGPrinter {
+ static char ID; // Pass identification, replacement for typeid
+ CFGOnlyPrinter() : CFGPrinter(&ID) {}
+ virtual bool runOnFunction(Function &F) {
+ bool OldCFGOnly = CFGOnly;
+ CFGOnly = true;
+ CFGPrinter::runOnFunction(F);
+ CFGOnly = OldCFGOnly;
+ return false;
+ }
+ void print(std::ostream &OS, const Module* = 0) const {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char CFGOnlyPrinter::ID = 0;
+static RegisterPass<CFGOnlyPrinter>
+P2("dot-cfg-only",
+ "Print CFG of function to 'dot' file (with no function bodies)", false, true);
+
+/// viewCFG - This function is meant for use from the debugger. You can just
+/// say 'call F->viewCFG()' and a ghostview window should pop up from the
+/// program, displaying the CFG of the current function. This depends on there
+/// being a 'dot' and 'gv' program in your path.
+///
+void Function::viewCFG() const {
+ ViewGraph(this, "cfg" + getName());
+}
+
+/// viewCFGOnly - This function is meant for use from the debugger. It works
+/// just like viewCFG, but it does not include the contents of basic blocks
+/// into the nodes, just the label. If you are only interested in the CFG,
+/// this can make the graph smaller.
+///
+void Function::viewCFGOnly() const {
+ CFGOnly = true;
+ viewCFG();
+ CFGOnly = false;
+}
+
+FunctionPass *llvm::createCFGPrinterPass() {
+ return new CFGPrinter();
+}
+
+FunctionPass *llvm::createCFGOnlyPrinterPass() {
+ return new CFGOnlyPrinter();
+}
+
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
new file mode 100644
index 0000000..093aa69
--- /dev/null
+++ b/lib/Analysis/CMakeLists.txt
@@ -0,0 +1,34 @@
+add_llvm_library(LLVMAnalysis
+ AliasAnalysis.cpp
+ AliasAnalysisCounter.cpp
+ AliasAnalysisEvaluator.cpp
+ AliasDebugger.cpp
+ AliasSetTracker.cpp
+ Analysis.cpp
+ BasicAliasAnalysis.cpp
+ CaptureTracking.cpp
+ CFGPrinter.cpp
+ ConstantFolding.cpp
+ DbgInfoPrinter.cpp
+ DebugInfo.cpp
+ InstCount.cpp
+ Interval.cpp
+ IntervalPartition.cpp
+ IVUsers.cpp
+ LibCallAliasAnalysis.cpp
+ LibCallSemantics.cpp
+ LiveValues.cpp
+ LoopInfo.cpp
+ LoopPass.cpp
+ LoopVR.cpp
+ MemoryDependenceAnalysis.cpp
+ PostDominators.cpp
+ ProfileInfo.cpp
+ ProfileInfoLoader.cpp
+ ProfileInfoLoaderPass.cpp
+ ScalarEvolution.cpp
+ ScalarEvolutionExpander.cpp
+ SparsePropagation.cpp
+ Trace.cpp
+ ValueTracking.cpp
+ )
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
new file mode 100644
index 0000000..a19b8e4
--- /dev/null
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -0,0 +1,112 @@
+//===--- CaptureTracking.cpp - Determine whether a pointer is captured ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains routines that help determine which pointers are captured.
+// A pointer value is captured if the function makes a copy of any part of the
+// pointer that outlives the call. Not being captured means, more or less, that
+// the pointer is only dereferenced and not stored in a global. Returning part
+// of the pointer as the function return value may or may not count as capturing
+// the pointer, depending on the context.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Instructions.h"
+#include "llvm/Value.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/CallSite.h"
+using namespace llvm;
+
+/// PointerMayBeCaptured - Return true if this pointer value may be captured
+/// by the enclosing function (which is required to exist). This routine can
+/// be expensive, so consider caching the results. The boolean ReturnCaptures
+/// specifies whether returning the value (or part of it) from the function
+/// counts as capturing it or not.
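+///
+/// As an illustration (example not taken from this file): in
+///   void f(int *P) { g(P); use(*P); }
+/// P may be captured by the call to g unless g's parameter is marked
+/// 'nocapture'; the load through P by itself does not capture it.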
+bool llvm::PointerMayBeCaptured(const Value *V, bool ReturnCaptures) {
+ assert(isa<PointerType>(V->getType()) && "Capture is for pointers only!");
+ SmallVector<Use*, 16> Worklist;
+ SmallSet<Use*, 16> Visited;
+
+ for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end();
+ UI != UE; ++UI) {
+ Use *U = &UI.getUse();
+ Visited.insert(U);
+ Worklist.push_back(U);
+ }
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+ V = U->get();
+
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallSite CS = CallSite::get(I);
+ // Not captured if the callee is readonly, doesn't return a copy through
+ // its return value, and doesn't unwind (a readonly function can leak bits
+ // by throwing an exception or not depending on the input value).
+ if (CS.onlyReadsMemory() && CS.doesNotThrow() &&
+ I->getType() == Type::VoidTy)
+ break;
+
+ // Not captured if only passed via 'nocapture' arguments. Note that
+ // calling a function pointer does not in itself cause the pointer to
+ // be captured. This is a subtle point considering that (for example)
+ // the callee might return its own address. It is analogous to saying
+ // that loading a value from a pointer does not cause the pointer to be
+ // captured, even though the loaded value might be the pointer itself
+ // (think of self-referential objects).
+ CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
+ for (CallSite::arg_iterator A = B; A != E; ++A)
+ if (A->get() == V && !CS.paramHasAttr(A - B + 1, Attribute::NoCapture))
+ // The parameter is not marked 'nocapture' - captured.
+ return true;
+ // Only passed via 'nocapture' arguments, or is the called function - not
+ // captured.
+ break;
+ }
+ case Instruction::Free:
+ // Freeing a pointer does not cause it to be captured.
+ break;
+ case Instruction::Load:
+ // Loading from a pointer does not cause it to be captured.
+ break;
+ case Instruction::Ret:
+ if (ReturnCaptures)
+ return true;
+ break;
+ case Instruction::Store:
+ if (V == I->getOperand(0))
+ // Stored the pointer - it may be captured.
+ return true;
+ // Storing to the pointee does not cause the pointer to be captured.
+ break;
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ // The original value is not captured via this if the new value isn't.
+ for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI) {
+ Use *U = &UI.getUse();
+ if (Visited.insert(U))
+ Worklist.push_back(U);
+ }
+ break;
+ default:
+ // Something else - be conservative and say it is captured.
+ return true;
+ }
+ }
+
+ // All uses examined - not captured.
+ return false;
+}
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
new file mode 100644
index 0000000..e5ab322
--- /dev/null
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -0,0 +1,829 @@
+//===-- ConstantFolding.cpp - Analyze constant folding possibilities ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions determines the possibility of performing constant
+// folding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include <cerrno>
+#include <cmath>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Constant Folding internal helper functions
+//===----------------------------------------------------------------------===//
+
+/// IsConstantOffsetFromGlobal - If this constant is actually a constant offset
+/// from a global, return the global and the constant. Because of
+/// constantexprs, this function is recursive.
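+///
+/// For example (illustrative only), given the constant expression
+///   getelementptr (i8* bitcast ([4 x i32]* @g to i8*), i32 8)
+/// this returns GV = @g and Offset = 8.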
+static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
+ int64_t &Offset, const TargetData &TD) {
+ // Trivial case, constant is the global.
+ if ((GV = dyn_cast<GlobalValue>(C))) {
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, if this isn't a constant expr, bail out.
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+ if (!CE) return false;
+
+ // Look through ptr->int and ptr->ptr casts.
+ if (CE->getOpcode() == Instruction::PtrToInt ||
+ CE->getOpcode() == Instruction::BitCast)
+ return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD);
+
+ // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Cannot compute this if the element type of the pointer is missing size
+ // info.
+ if (!cast<PointerType>(CE->getOperand(0)->getType())
+ ->getElementType()->isSized())
+ return false;
+
+ // If the base isn't a global+constant, we aren't either.
+ if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD))
+ return false;
+
+ // Otherwise, add any offset that our operands provide.
+ gep_type_iterator GTI = gep_type_begin(CE);
+ for (User::const_op_iterator i = CE->op_begin() + 1, e = CE->op_end();
+ i != e; ++i, ++GTI) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(*i);
+ if (!CI) return false; // Index isn't a simple constant?
+ if (CI->getZExtValue() == 0) continue; // Not adding anything.
+
+ if (const StructType *ST = dyn_cast<StructType>(*GTI)) {
+ // N = N + Offset
+ Offset += TD.getStructLayout(ST)->getElementOffset(CI->getZExtValue());
+ } else {
+ const SequentialType *SQT = cast<SequentialType>(*GTI);
+ Offset += TD.getTypeAllocSize(SQT->getElementType())*CI->getSExtValue();
+ }
+ }
+ return true;
+ }
+
+ return false;
+}
+
+
+/// SymbolicallyEvaluateBinop - One of Op0/Op1 is a constant expression.
+/// Attempt to symbolically evaluate the result of a binary operator merging
+/// these together. If target data info is available, it is provided as TD,
+/// otherwise TD is null.
+static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
+ Constant *Op1, const TargetData *TD){
+ // SROA
+
+ // Fold (and 0xffffffff00000000, (shl x, 32)) -> shl.
+ // Fold (lshr (or X, Y), 32) -> (lshr [X/Y], 32) if one doesn't contribute
+ // bits.
+
+
+ // If the constant expr is something like &A[123] - &A[4].f, fold this into a
+ // constant. This happens frequently when iterating over a global array.
+ if (Opc == Instruction::Sub && TD) {
+ GlobalValue *GV1, *GV2;
+ int64_t Offs1, Offs2;
+
+ if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *TD))
+ if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *TD) &&
+ GV1 == GV2) {
+ // (&GV+C1) - (&GV+C2) -> C1-C2, pointer arithmetic cannot overflow.
+ return ConstantInt::get(Op0->getType(), Offs1-Offs2);
+ }
+ }
+
+ return 0;
+}
+
+/// SymbolicallyEvaluateGEP - If we can symbolically evaluate the specified GEP
+/// constant expression, do so.
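+///
+/// For instance (illustrative), with 32-bit pointers the expression
+///   getelementptr (i32* inttoptr (i32 16 to i32*), i32 2)
+/// folds to inttoptr (i32 24 to i32*): base address 16 plus 2 * sizeof(i32).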
+static Constant *SymbolicallyEvaluateGEP(Constant* const* Ops, unsigned NumOps,
+ const Type *ResultTy,
+ const TargetData *TD) {
+ Constant *Ptr = Ops[0];
+ if (!TD || !cast<PointerType>(Ptr->getType())->getElementType()->isSized())
+ return 0;
+
+ uint64_t BasePtr = 0;
+ if (!Ptr->isNullValue()) {
+ // If this is an inttoptr from a constant int, we can fold this as the base,
+ // otherwise we can't.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+ if (CE->getOpcode() == Instruction::IntToPtr)
+ if (ConstantInt *Base = dyn_cast<ConstantInt>(CE->getOperand(0)))
+ BasePtr = Base->getZExtValue();
+
+ if (BasePtr == 0)
+ return 0;
+ }
+
+ // If this is a constant expr gep that is effectively computing an
+ // "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
+ for (unsigned i = 1; i != NumOps; ++i)
+ if (!isa<ConstantInt>(Ops[i]))
+ return 0;
+
+ uint64_t Offset = TD->getIndexedOffset(Ptr->getType(),
+ (Value**)Ops+1, NumOps-1);
+ Constant *C = ConstantInt::get(TD->getIntPtrType(), Offset+BasePtr);
+ return ConstantExpr::getIntToPtr(C, ResultTy);
+}
+
+/// FoldBitCast - Constant fold bitcast, symbolically evaluating it with
+/// targetdata. Return 0 if unfoldable.
+static Constant *FoldBitCast(Constant *C, const Type *DestTy,
+ const TargetData &TD) {
+ // If this is a bitcast from constant vector -> vector, fold it.
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
+ if (const VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
+ // If the element counts match, VMCore can fold it.
+ unsigned NumDstElt = DestVTy->getNumElements();
+ unsigned NumSrcElt = CV->getNumOperands();
+ if (NumDstElt == NumSrcElt)
+ return 0;
+
+ const Type *SrcEltTy = CV->getType()->getElementType();
+ const Type *DstEltTy = DestVTy->getElementType();
+
+ // Otherwise, we're changing the number of elements in a vector, which
+ // requires endianness information to do the right thing. For example,
+ // bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>)
+ // folds to (little endian):
+ // <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+ // and to (big endian):
+ // <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+
+ // First things first. We only want to think about integers here, so if
+ // we have something in FP form, recast it as integer.
+ if (DstEltTy->isFloatingPoint()) {
+ // Fold to a vector of integers with the same size as our FP type.
+ unsigned FPWidth = DstEltTy->getPrimitiveSizeInBits();
+ const Type *DestIVTy = VectorType::get(IntegerType::get(FPWidth),
+ NumDstElt);
+ // Recursively handle this integer conversion, if possible.
+ C = FoldBitCast(C, DestIVTy, TD);
+ if (!C) return 0;
+
+ // Finally, VMCore can handle this now that #elts line up.
+ return ConstantExpr::getBitCast(C, DestTy);
+ }
+
+ // Okay, we know the destination is integer; if the input is FP, convert
+ // it to integer first.
+ if (SrcEltTy->isFloatingPoint()) {
+ unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
+ const Type *SrcIVTy = VectorType::get(IntegerType::get(FPWidth),
+ NumSrcElt);
+ // Ask VMCore to do the conversion now that #elts line up.
+ C = ConstantExpr::getBitCast(C, SrcIVTy);
+ CV = dyn_cast<ConstantVector>(C);
+ if (!CV) return 0; // If VMCore wasn't able to fold it, bail out.
+ }
+
+ // Now we know that the input and output vectors are both integer vectors
+ // of the same size, and that their #elements is not the same. Do the
+ // conversion here, which depends on whether the input or output has
+ // more elements.
+ bool isLittleEndian = TD.isLittleEndian();
+
+ SmallVector<Constant*, 32> Result;
+ if (NumDstElt < NumSrcElt) {
+ // Handle: bitcast (<4 x i32> <i32 0, i32 1, i32 2, i32 3> to <2 x i64>)
+ Constant *Zero = Constant::getNullValue(DstEltTy);
+ unsigned Ratio = NumSrcElt/NumDstElt;
+ unsigned SrcBitSize = SrcEltTy->getPrimitiveSizeInBits();
+ unsigned SrcElt = 0;
+ for (unsigned i = 0; i != NumDstElt; ++i) {
+ // Build each element of the result.
+ Constant *Elt = Zero;
+ unsigned ShiftAmt = isLittleEndian ? 0 : SrcBitSize*(Ratio-1);
+ for (unsigned j = 0; j != Ratio; ++j) {
+ Constant *Src = dyn_cast<ConstantInt>(CV->getOperand(SrcElt++));
+ if (!Src) return 0; // Reject constantexpr elements.
+
+ // Zero extend the element to the right size.
+ Src = ConstantExpr::getZExt(Src, Elt->getType());
+
+ // Shift it to the right place, depending on endianness.
+ Src = ConstantExpr::getShl(Src,
+ ConstantInt::get(Src->getType(), ShiftAmt));
+ ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize;
+
+ // Mix it in.
+ Elt = ConstantExpr::getOr(Elt, Src);
+ }
+ Result.push_back(Elt);
+ }
+ } else {
+ // Handle: bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>)
+ unsigned Ratio = NumDstElt/NumSrcElt;
+ unsigned DstBitSize = DstEltTy->getPrimitiveSizeInBits();
+
+ // Loop over each source value, expanding into multiple results.
+ for (unsigned i = 0; i != NumSrcElt; ++i) {
+ Constant *Src = dyn_cast<ConstantInt>(CV->getOperand(i));
+ if (!Src) return 0; // Reject constantexpr elements.
+
+ unsigned ShiftAmt = isLittleEndian ? 0 : DstBitSize*(Ratio-1);
+ for (unsigned j = 0; j != Ratio; ++j) {
+ // Shift the piece of the value into the right place, depending on
+ // endianness.
+ Constant *Elt = ConstantExpr::getLShr(Src,
+ ConstantInt::get(Src->getType(), ShiftAmt));
+ ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize;
+
+ // Truncate and remember this piece.
+ Result.push_back(ConstantExpr::getTrunc(Elt, DstEltTy));
+ }
+ }
+ }
+
+ return ConstantVector::get(Result.data(), Result.size());
+ }
+ }
+
+ return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Constant Folding public APIs
+//===----------------------------------------------------------------------===//
+
+
+/// ConstantFoldInstruction - Attempt to constant fold the specified
+/// instruction. If successful, the constant result is returned, if not, null
+/// is returned. Note that this function can only fail when attempting to fold
+/// instructions like loads and stores, which have no constant expression form.
+///
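+/// A typical use from a pass might look like (hypothetical caller, not part
+/// of this file):
+///   if (Constant *C = ConstantFoldInstruction(I, TD)) {
+///     I->replaceAllUsesWith(C);
+///     I->eraseFromParent();
+///   }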
+Constant *llvm::ConstantFoldInstruction(Instruction *I, const TargetData *TD) {
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (PN->getNumIncomingValues() == 0)
+ return UndefValue::get(PN->getType());
+
+ Constant *Result = dyn_cast<Constant>(PN->getIncomingValue(0));
+ if (Result == 0) return 0;
+
+ // Handle PHI nodes specially here...
+ for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) != Result && PN->getIncomingValue(i) != PN)
+ return 0; // Not all the same incoming constants...
+
+ // If we reach here, all incoming values are the same constant.
+ return Result;
+ }
+
+ // Scan the operand list, checking to see if they are all constants; if so,
+ // hand off to ConstantFoldInstOperands.
+ SmallVector<Constant*, 8> Ops;
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+ if (Constant *Op = dyn_cast<Constant>(*i))
+ Ops.push_back(Op);
+ else
+ return 0; // Not all operands are constant!
+
+ if (const CmpInst *CI = dyn_cast<CmpInst>(I))
+ return ConstantFoldCompareInstOperands(CI->getPredicate(),
+ Ops.data(), Ops.size(), TD);
+ else
+ return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
+ Ops.data(), Ops.size(), TD);
+}
+
+/// ConstantFoldConstantExpression - Attempt to fold the constant expression
+/// using the specified TargetData. If successful, the constant result is
+/// returned; if not, null is returned.
+Constant *llvm::ConstantFoldConstantExpression(ConstantExpr *CE,
+ const TargetData *TD) {
+ assert(TD && "ConstantFoldConstantExpression requires a valid TargetData.");
+
+ SmallVector<Constant*, 8> Ops;
+ for (User::op_iterator i = CE->op_begin(), e = CE->op_end(); i != e; ++i)
+ Ops.push_back(cast<Constant>(*i));
+
+ if (CE->isCompare())
+ return ConstantFoldCompareInstOperands(CE->getPredicate(),
+ Ops.data(), Ops.size(), TD);
+ else
+ return ConstantFoldInstOperands(CE->getOpcode(), CE->getType(),
+ Ops.data(), Ops.size(), TD);
+}
+
+/// ConstantFoldInstOperands - Attempt to constant fold an instruction with the
+/// specified opcode and operands. If successful, the constant result is
+/// returned, if not, null is returned. Note that this function can fail when
+/// attempting to fold instructions like loads and stores, which have no
+/// constant expression form.
+///
+Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, const Type *DestTy,
+ Constant* const* Ops, unsigned NumOps,
+ const TargetData *TD) {
+ // Handle easy binops first.
+ if (Instruction::isBinaryOp(Opcode)) {
+ if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1]))
+ if (Constant *C = SymbolicallyEvaluateBinop(Opcode, Ops[0], Ops[1], TD))
+ return C;
+
+ return ConstantExpr::get(Opcode, Ops[0], Ops[1]);
+ }
+
+ switch (Opcode) {
+ default: return 0;
+ case Instruction::Call:
+ if (Function *F = dyn_cast<Function>(Ops[0]))
+ if (canConstantFoldCallTo(F))
+ return ConstantFoldCall(F, Ops+1, NumOps-1);
+ return 0;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::VICmp:
+ case Instruction::VFCmp:
+ assert(0 &&"This function is invalid for compares: no predicate specified");
+ case Instruction::PtrToInt:
+ // If the input is an inttoptr, eliminate the pair. This requires knowing
+ // the width of a pointer, so it can't be done in ConstantExpr::getCast.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) {
+ if (TD && CE->getOpcode() == Instruction::IntToPtr) {
+ Constant *Input = CE->getOperand(0);
+ unsigned InWidth = Input->getType()->getPrimitiveSizeInBits();
+ if (TD->getPointerSizeInBits() < InWidth) {
+ Constant *Mask =
+ ConstantInt::get(APInt::getLowBitsSet(InWidth,
+ TD->getPointerSizeInBits()));
+ Input = ConstantExpr::getAnd(Input, Mask);
+ }
+ // Do a zext or trunc to get to the dest size.
+ return ConstantExpr::getIntegerCast(Input, DestTy, false);
+ }
+ }
+ return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
+ case Instruction::IntToPtr:
+ // If the input is a ptrtoint, turn the pair into a ptr to ptr bitcast if
+ // the int size is >= the ptr size. This requires knowing the width of a
+ // pointer, so it can't be done in ConstantExpr::getCast.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) {
+ if (TD &&
+ TD->getPointerSizeInBits() <=
+ CE->getType()->getPrimitiveSizeInBits()) {
+ if (CE->getOpcode() == Instruction::PtrToInt) {
+ Constant *Input = CE->getOperand(0);
+ Constant *C = FoldBitCast(Input, DestTy, *TD);
+ return C ? C : ConstantExpr::getBitCast(Input, DestTy);
+ }
+ // If there's a constant offset added to the integer value before
+ // it is casted back to a pointer, see if the expression can be
+ // converted into a GEP.
+ if (CE->getOpcode() == Instruction::Add)
+ if (ConstantInt *L = dyn_cast<ConstantInt>(CE->getOperand(0)))
+ if (ConstantExpr *R = dyn_cast<ConstantExpr>(CE->getOperand(1)))
+ if (R->getOpcode() == Instruction::PtrToInt)
+ if (GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(R->getOperand(0))) {
+ const PointerType *GVTy = cast<PointerType>(GV->getType());
+ if (const ArrayType *AT =
+ dyn_cast<ArrayType>(GVTy->getElementType())) {
+ const Type *ElTy = AT->getElementType();
+ uint64_t AllocSize = TD->getTypeAllocSize(ElTy);
+ APInt PSA(L->getValue().getBitWidth(), AllocSize);
+ if (ElTy == cast<PointerType>(DestTy)->getElementType() &&
+ L->getValue().urem(PSA) == 0) {
+ APInt ElemIdx = L->getValue().udiv(PSA);
+ if (ElemIdx.ult(APInt(ElemIdx.getBitWidth(),
+ AT->getNumElements()))) {
+ Constant *Index[] = {
+ Constant::getNullValue(CE->getType()),
+ ConstantInt::get(ElemIdx)
+ };
+ return ConstantExpr::getGetElementPtr(GV, &Index[0], 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
+ case Instruction::BitCast:
+ if (TD)
+ if (Constant *C = FoldBitCast(Ops[0], DestTy, *TD))
+ return C;
+ return ConstantExpr::getBitCast(Ops[0], DestTy);
+ case Instruction::Select:
+ return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
+ case Instruction::ExtractElement:
+ return ConstantExpr::getExtractElement(Ops[0], Ops[1]);
+ case Instruction::InsertElement:
+ return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]);
+ case Instruction::ShuffleVector:
+ return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
+ case Instruction::GetElementPtr:
+ if (Constant *C = SymbolicallyEvaluateGEP(Ops, NumOps, DestTy, TD))
+ return C;
+
+ return ConstantExpr::getGetElementPtr(Ops[0], Ops+1, NumOps-1);
+ }
+}
+
+/// ConstantFoldCompareInstOperands - Attempt to constant fold a compare
+/// instruction (icmp/fcmp) with the specified operands. If it fails, it
+/// returns a constant expression of the specified operands.
+///
+Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
+ Constant*const * Ops,
+ unsigned NumOps,
+ const TargetData *TD) {
+ // fold: icmp (inttoptr x), null -> icmp x, 0
+ // fold: icmp (ptrtoint x), 0 -> icmp x, null
+ // fold: icmp (inttoptr x), (inttoptr y) -> icmp trunc/zext x, trunc/zext y
+ // fold: icmp (ptrtoint x), (ptrtoint y) -> icmp x, y
+ //
+ // ConstantExpr::getCompare cannot do this, because it doesn't have TD
+ // around to know if bit truncation is happening.
+ if (ConstantExpr *CE0 = dyn_cast<ConstantExpr>(Ops[0])) {
+ if (TD && Ops[1]->isNullValue()) {
+ const Type *IntPtrTy = TD->getIntPtrType();
+ if (CE0->getOpcode() == Instruction::IntToPtr) {
+ // Convert the integer value to the right size to ensure we get the
+ // proper extension or truncation.
+ Constant *C = ConstantExpr::getIntegerCast(CE0->getOperand(0),
+ IntPtrTy, false);
+ Constant *NewOps[] = { C, Constant::getNullValue(C->getType()) };
+ return ConstantFoldCompareInstOperands(Predicate, NewOps, 2, TD);
+ }
+
+ // Only do this transformation if the int is IntPtrTy in size, otherwise
+ // there is a truncation or extension that we aren't modeling.
+ if (CE0->getOpcode() == Instruction::PtrToInt &&
+ CE0->getType() == IntPtrTy) {
+ Constant *C = CE0->getOperand(0);
+ Constant *NewOps[] = { C, Constant::getNullValue(C->getType()) };
+ // FIXME!
+ return ConstantFoldCompareInstOperands(Predicate, NewOps, 2, TD);
+ }
+ }
+
+ if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops[1])) {
+ if (TD && CE0->getOpcode() == CE1->getOpcode()) {
+ const Type *IntPtrTy = TD->getIntPtrType();
+
+ if (CE0->getOpcode() == Instruction::IntToPtr) {
+ // Convert the integer value to the right size to ensure we get the
+ // proper extension or truncation.
+ Constant *C0 = ConstantExpr::getIntegerCast(CE0->getOperand(0),
+ IntPtrTy, false);
+ Constant *C1 = ConstantExpr::getIntegerCast(CE1->getOperand(0),
+ IntPtrTy, false);
+ Constant *NewOps[] = { C0, C1 };
+ return ConstantFoldCompareInstOperands(Predicate, NewOps, 2, TD);
+ }
+
+ // Only do this transformation if the int is IntPtrTy in size, otherwise
+ // there is a truncation or extension that we aren't modeling.
+ if ((CE0->getOpcode() == Instruction::PtrToInt &&
+ CE0->getType() == IntPtrTy &&
+ CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType())) {
+ Constant *NewOps[] = {
+ CE0->getOperand(0), CE1->getOperand(0)
+ };
+ return ConstantFoldCompareInstOperands(Predicate, NewOps, 2, TD);
+ }
+ }
+ }
+ }
+ return ConstantExpr::getCompare(Predicate, Ops[0], Ops[1]);
+}
+
+
+/// ConstantFoldLoadThroughGEPConstantExpr - Given a constant and a
+/// getelementptr constantexpr, return the constant value being addressed by the
+/// constant expression, or null if something is funny and we can't decide.
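+///
+/// For example (illustrative): with C = [3 x i32] [i32 1, i32 2, i32 3] and
+/// CE = getelementptr ([3 x i32]* @g, i32 0, i32 1), this returns i32 2.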
+Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
+ ConstantExpr *CE) {
+ if (CE->getOperand(1) != Constant::getNullValue(CE->getOperand(1)->getType()))
+ return 0; // Do not allow stepping over the value!
+
+ // Loop over all of the operands, tracking down which value we are
+ // addressing...
+ gep_type_iterator I = gep_type_begin(CE), E = gep_type_end(CE);
+ for (++I; I != E; ++I)
+ if (const StructType *STy = dyn_cast<StructType>(*I)) {
+ ConstantInt *CU = cast<ConstantInt>(I.getOperand());
+ assert(CU->getZExtValue() < STy->getNumElements() &&
+ "Struct index out of range!");
+ unsigned El = (unsigned)CU->getZExtValue();
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) {
+ C = CS->getOperand(El);
+ } else if (isa<ConstantAggregateZero>(C)) {
+ C = Constant::getNullValue(STy->getElementType(El));
+ } else if (isa<UndefValue>(C)) {
+ C = UndefValue::get(STy->getElementType(El));
+ } else {
+ return 0;
+ }
+ } else if (ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand())) {
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(*I)) {
+ if (CI->getZExtValue() >= ATy->getNumElements())
+ return 0;
+ if (ConstantArray *CA = dyn_cast<ConstantArray>(C))
+ C = CA->getOperand(CI->getZExtValue());
+ else if (isa<ConstantAggregateZero>(C))
+ C = Constant::getNullValue(ATy->getElementType());
+ else if (isa<UndefValue>(C))
+ C = UndefValue::get(ATy->getElementType());
+ else
+ return 0;
+ } else if (const VectorType *PTy = dyn_cast<VectorType>(*I)) {
+ if (CI->getZExtValue() >= PTy->getNumElements())
+ return 0;
+ if (ConstantVector *CP = dyn_cast<ConstantVector>(C))
+ C = CP->getOperand(CI->getZExtValue());
+ else if (isa<ConstantAggregateZero>(C))
+ C = Constant::getNullValue(PTy->getElementType());
+ else if (isa<UndefValue>(C))
+ C = UndefValue::get(PTy->getElementType());
+ else
+ return 0;
+ } else {
+ return 0;
+ }
+ } else {
+ return 0;
+ }
+ return C;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Constant Folding for Calls
+//
+
+/// canConstantFoldCallTo - Return true if it's even possible to fold a call to
+/// the specified function.
+bool
+llvm::canConstantFoldCallTo(const Function *F) {
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::sqrt:
+ case Intrinsic::powi:
+ case Intrinsic::bswap:
+ case Intrinsic::ctpop:
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz:
+ return true;
+ default: break;
+ }
+
+ if (!F->hasName()) return false;
+ const char *Str = F->getNameStart();
+ unsigned Len = F->getNameLen();
+
+ // In these cases, the check of the length is required. We don't want to
+// return true for a name like "cos\0blah", which strcmp would report as equal
+// to "cos" even though it has length 8.
+ switch (Str[0]) {
+ default: return false;
+ case 'a':
+ if (Len == 4)
+ return !strcmp(Str, "acos") || !strcmp(Str, "asin") ||
+ !strcmp(Str, "atan");
+ else if (Len == 5)
+ return !strcmp(Str, "atan2");
+ return false;
+ case 'c':
+ if (Len == 3)
+ return !strcmp(Str, "cos");
+ else if (Len == 4)
+ return !strcmp(Str, "ceil") || !strcmp(Str, "cosf") ||
+ !strcmp(Str, "cosh");
+ return false;
+ case 'e':
+ if (Len == 3)
+ return !strcmp(Str, "exp");
+ return false;
+ case 'f':
+ if (Len == 4)
+ return !strcmp(Str, "fabs") || !strcmp(Str, "fmod");
+ else if (Len == 5)
+ return !strcmp(Str, "floor");
+ return false;
+ case 'l':
+ if (Len == 3 && !strcmp(Str, "log"))
+ return true;
+ if (Len == 5 && !strcmp(Str, "log10"))
+ return true;
+ return false;
+ case 'p':
+ if (Len == 3 && !strcmp(Str, "pow"))
+ return true;
+ return false;
+ case 's':
+ if (Len == 3)
+ return !strcmp(Str, "sin");
+ if (Len == 4)
+ return !strcmp(Str, "sinh") || !strcmp(Str, "sqrt") ||
+ !strcmp(Str, "sinf");
+ if (Len == 5)
+ return !strcmp(Str, "sqrtf");
+ return false;
+ case 't':
+ if (Len == 3 && !strcmp(Str, "tan"))
+ return true;
+ else if (Len == 4 && !strcmp(Str, "tanh"))
+ return true;
+ return false;
+ }
+}
+
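+/// ConstantFoldFP - Evaluate the given native double-precision function on V.
+/// If the call sets errno, reject the fold and return null; otherwise return
+/// the result as a ConstantFP of type Ty (which must be float or double).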
+static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
+ const Type *Ty) {
+ errno = 0;
+ V = NativeFP(V);
+ if (errno != 0) {
+ errno = 0;
+ return 0;
+ }
+
+ if (Ty == Type::FloatTy)
+ return ConstantFP::get(APFloat((float)V));
+ if (Ty == Type::DoubleTy)
+ return ConstantFP::get(APFloat(V));
+ assert(0 && "Can only constant fold float/double");
+ return 0; // dummy return to suppress warning
+}
+
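+/// ConstantFoldBinaryFP - Binary counterpart of ConstantFoldFP: evaluate the
+/// given two-argument native function on V and W, rejecting the fold (and
+/// returning null) if errno is set.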
+static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
+ double V, double W,
+ const Type *Ty) {
+ errno = 0;
+ V = NativeFP(V, W);
+ if (errno != 0) {
+ errno = 0;
+ return 0;
+ }
+
+ if (Ty == Type::FloatTy)
+ return ConstantFP::get(APFloat((float)V));
+ if (Ty == Type::DoubleTy)
+ return ConstantFP::get(APFloat(V));
+ assert(0 && "Can only constant fold float/double");
+ return 0; // dummy return to suppress warning
+}
+
+/// ConstantFoldCall - Attempt to constant fold a call to the specified function
+/// with the specified arguments, returning null if unsuccessful.
+Constant *
+llvm::ConstantFoldCall(Function *F,
+ Constant* const* Operands, unsigned NumOperands) {
+ if (!F->hasName()) return 0;
+ const char *Str = F->getNameStart();
+ unsigned Len = F->getNameLen();
+
+ const Type *Ty = F->getReturnType();
+ if (NumOperands == 1) {
+ if (ConstantFP *Op = dyn_cast<ConstantFP>(Operands[0])) {
+ if (Ty!=Type::FloatTy && Ty!=Type::DoubleTy)
+ return 0;
+ // Currently APFloat versions of these functions do not exist, so we use
+ // the host native double versions. Float versions are not called
+ // directly, but for all of these functions it is true that
+ // (float)(f((double)arg)) == f(arg). Long double is not supported yet.
+ double V = Ty==Type::FloatTy ? (double)Op->getValueAPF().convertToFloat():
+ Op->getValueAPF().convertToDouble();
+ switch (Str[0]) {
+ case 'a':
+ if (Len == 4 && !strcmp(Str, "acos"))
+ return ConstantFoldFP(acos, V, Ty);
+ else if (Len == 4 && !strcmp(Str, "asin"))
+ return ConstantFoldFP(asin, V, Ty);
+ else if (Len == 4 && !strcmp(Str, "atan"))
+ return ConstantFoldFP(atan, V, Ty);
+ break;
+ case 'c':
+ if (Len == 4 && !strcmp(Str, "ceil"))
+ return ConstantFoldFP(ceil, V, Ty);
+ else if (Len == 3 && !strcmp(Str, "cos"))
+ return ConstantFoldFP(cos, V, Ty);
+ else if (Len == 4 && !strcmp(Str, "cosh"))
+ return ConstantFoldFP(cosh, V, Ty);
+ else if (Len == 4 && !strcmp(Str, "cosf"))
+ return ConstantFoldFP(cos, V, Ty);
+ break;
+ case 'e':
+ if (Len == 3 && !strcmp(Str, "exp"))
+ return ConstantFoldFP(exp, V, Ty);
+ break;
+ case 'f':
+ if (Len == 4 && !strcmp(Str, "fabs"))
+ return ConstantFoldFP(fabs, V, Ty);
+ else if (Len == 5 && !strcmp(Str, "floor"))
+ return ConstantFoldFP(floor, V, Ty);
+ break;
+ case 'l':
+ if (Len == 3 && !strcmp(Str, "log") && V > 0)
+ return ConstantFoldFP(log, V, Ty);
+ else if (Len == 5 && !strcmp(Str, "log10") && V > 0)
+ return ConstantFoldFP(log10, V, Ty);
+ else if (!strcmp(Str, "llvm.sqrt.f32") ||
+ !strcmp(Str, "llvm.sqrt.f64")) {
+ if (V >= -0.0)
+ return ConstantFoldFP(sqrt, V, Ty);
+ else // Undefined
+ return Constant::getNullValue(Ty);
+ }
+ break;
+ case 's':
+ if (Len == 3 && !strcmp(Str, "sin"))
+ return ConstantFoldFP(sin, V, Ty);
+ else if (Len == 4 && !strcmp(Str, "sinh"))
+ return ConstantFoldFP(sinh, V, Ty);
+ else if (Len == 4 && !strcmp(Str, "sqrt") && V >= 0)
+ return ConstantFoldFP(sqrt, V, Ty);
+ else if (Len == 5 && !strcmp(Str, "sqrtf") && V >= 0)
+ return ConstantFoldFP(sqrt, V, Ty);
+ else if (Len == 4 && !strcmp(Str, "sinf"))
+ return ConstantFoldFP(sin, V, Ty);
+ break;
+ case 't':
+ if (Len == 3 && !strcmp(Str, "tan"))
+ return ConstantFoldFP(tan, V, Ty);
+ else if (Len == 4 && !strcmp(Str, "tanh"))
+ return ConstantFoldFP(tanh, V, Ty);
+ break;
+ default:
+ break;
+ }
+ } else if (ConstantInt *Op = dyn_cast<ConstantInt>(Operands[0])) {
+ if (Len > 11 && !memcmp(Str, "llvm.bswap", 10))
+ return ConstantInt::get(Op->getValue().byteSwap());
+ else if (Len > 11 && !memcmp(Str, "llvm.ctpop", 10))
+ return ConstantInt::get(Ty, Op->getValue().countPopulation());
+ else if (Len > 10 && !memcmp(Str, "llvm.cttz", 9))
+ return ConstantInt::get(Ty, Op->getValue().countTrailingZeros());
+ else if (Len > 10 && !memcmp(Str, "llvm.ctlz", 9))
+ return ConstantInt::get(Ty, Op->getValue().countLeadingZeros());
+ }
+ } else if (NumOperands == 2) {
+ if (ConstantFP *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
+ if (Ty!=Type::FloatTy && Ty!=Type::DoubleTy)
+ return 0;
+ double Op1V = Ty==Type::FloatTy ?
+ (double)Op1->getValueAPF().convertToFloat():
+ Op1->getValueAPF().convertToDouble();
+ if (ConstantFP *Op2 = dyn_cast<ConstantFP>(Operands[1])) {
+ double Op2V = Ty==Type::FloatTy ?
+ (double)Op2->getValueAPF().convertToFloat():
+ Op2->getValueAPF().convertToDouble();
+
+ if (Len == 3 && !strcmp(Str, "pow")) {
+ return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
+ } else if (Len == 4 && !strcmp(Str, "fmod")) {
+ return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty);
+ } else if (Len == 5 && !strcmp(Str, "atan2")) {
+ return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
+ }
+ } else if (ConstantInt *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
+ if (!strcmp(Str, "llvm.powi.f32")) {
+ return ConstantFP::get(APFloat((float)std::pow((float)Op1V,
+ (int)Op2C->getZExtValue())));
+ } else if (!strcmp(Str, "llvm.powi.f64")) {
+ return ConstantFP::get(APFloat((double)std::pow((double)Op1V,
+ (int)Op2C->getZExtValue())));
+ }
+ }
+ }
+ }
+ return 0;
+}
+
diff --git a/lib/Analysis/DbgInfoPrinter.cpp b/lib/Analysis/DbgInfoPrinter.cpp
new file mode 100644
index 0000000..d80d581
--- /dev/null
+++ b/lib/Analysis/DbgInfoPrinter.cpp
@@ -0,0 +1,167 @@
+//===- DbgInfoPrinter.cpp - Print debug info in a human-readable form ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that prints instructions and their associated
+// debug info:
+//
+// - source/line/col information
+// - original variable name
+// - original type name
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+PrintDirectory("print-fullpath",
+ cl::desc("Print fullpath when printing debug info"),
+ cl::Hidden);
+
+namespace {
+ class VISIBILITY_HIDDEN PrintDbgInfo : public FunctionPass {
+ raw_ostream &Out;
+ void printStopPoint(const DbgStopPointInst *DSI);
+ void printFuncStart(const DbgFuncStartInst *FS);
+ void printVariableDeclaration(const Value *V);
+ public:
+ static char ID; // Pass identification
+ PrintDbgInfo() : FunctionPass(&ID), Out(outs()) {}
+
+ virtual bool runOnFunction(Function &F);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+ char PrintDbgInfo::ID = 0;
+ static RegisterPass<PrintDbgInfo> X("print-dbginfo",
+ "Print debug info in human readable form");
+}
+
+FunctionPass *llvm::createDbgInfoPrinterPass() { return new PrintDbgInfo(); }
+
+void PrintDbgInfo::printVariableDeclaration(const Value *V) {
+ std::string DisplayName, File, Directory, Type;
+ unsigned LineNo;
+
+ if (!getLocationInfo(V, DisplayName, Type, LineNo, File, Directory))
+ return;
+
+ Out << "; ";
+ WriteAsOperand(Out, V, false, 0);
+ Out << " is variable " << DisplayName
+ << " of type " << Type << " declared at ";
+
+ if (PrintDirectory)
+ Out << Directory << "/";
+
+ Out << File << ":" << LineNo << "\n";
+}
+
+void PrintDbgInfo::printStopPoint(const DbgStopPointInst *DSI) {
+ if (PrintDirectory) {
+ std::string dir;
+ GetConstantStringInfo(DSI->getDirectory(), dir);
+ Out << dir << "/";
+ }
+
+ std::string file;
+ GetConstantStringInfo(DSI->getFileName(), file);
+ Out << file << ":" << DSI->getLine();
+
+ if (unsigned Col = DSI->getColumn())
+ Out << ":" << Col;
+}
+
+void PrintDbgInfo::printFuncStart(const DbgFuncStartInst *FS) {
+ DISubprogram Subprogram(cast<GlobalVariable>(FS->getSubprogram()));
+ std::string Res1, Res2;
+ Out << "; fully qualified function name: " << Subprogram.getDisplayName(Res1)
+ << " return type: " << Subprogram.getType().getName(Res2)
+ << " at line " << Subprogram.getLineNumber()
+ << "\n\n";
+}
+
+bool PrintDbgInfo::runOnFunction(Function &F) {
+ if (F.isDeclaration())
+ return false;
+
+ Out << "function " << F.getName() << "\n\n";
+
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ BasicBlock *BB = I;
+
+ if (I != F.begin() && (pred_begin(BB) == pred_end(BB)))
+ // Skip dead blocks.
+ continue;
+
+ const DbgStopPointInst *DSI = findBBStopPoint(BB);
+ Out << BB->getName();
+ Out << ":";
+
+ if (DSI) {
+ Out << "; (";
+ printStopPoint(DSI);
+ Out << ")";
+ }
+
+ Out << "\n";
+
+ // A dbgstoppoint's information is valid until we encounter a new one.
+ const DbgStopPointInst *LastDSP = DSI;
+ bool Printed = DSI != 0;
+ for (BasicBlock::const_iterator i = BB->begin(), e = BB->end();
+ i != e; ++i) {
+ if (isa<DbgInfoIntrinsic>(i)) {
+ if ((DSI = dyn_cast<DbgStopPointInst>(i))) {
+ if (LastDSP && DSI->getContext() == LastDSP->getContext() &&
+ DSI->getLineValue() == LastDSP->getLineValue() &&
+ DSI->getColumnValue() == LastDSP->getColumnValue())
+ // Don't print same location twice.
+ continue;
+
+ LastDSP = cast<DbgStopPointInst>(i);
+
+ // Don't print consecutive stoppoints; use a flag to know which one we
+ // printed.
+ Printed = false;
+ } else if (const DbgFuncStartInst *FS = dyn_cast<DbgFuncStartInst>(i)) {
+ printFuncStart(FS);
+ }
+ } else {
+ if (!Printed && LastDSP) {
+ Out << "; ";
+ printStopPoint(LastDSP);
+ Out << "\n";
+ Printed = true;
+ }
+
+ Out << *i;
+ printVariableDeclaration(i);
+
+ if (const User *U = dyn_cast<User>(i)) {
+ for (unsigned Op = 0; Op < U->getNumOperands(); ++Op)
+ printVariableDeclaration(U->getOperand(Op));
+ }
+ }
+ }
+ }
+
+ return false;
+}
diff --git a/lib/Analysis/DebugInfo.cpp b/lib/Analysis/DebugInfo.cpp
new file mode 100644
index 0000000..6bdb64c
--- /dev/null
+++ b/lib/Analysis/DebugInfo.cpp
@@ -0,0 +1,1079 @@
+//===--- DebugInfo.cpp - Debug Information Helper Classes -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the helper classes used to build and interpret debug
+// information in LLVM IR form.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Streams.h"
+
+using namespace llvm;
+using namespace llvm::dwarf;
+
+//===----------------------------------------------------------------------===//
+// DIDescriptor
+//===----------------------------------------------------------------------===//
+
+/// ValidDebugInfo - Return true if V represents a valid debug info value.
+bool DIDescriptor::ValidDebugInfo(Value *V, CodeGenOpt::Level OptLevel) {
+ if (!V)
+ return false;
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(V->stripPointerCasts());
+ if (!GV)
+ return false;
+
+ if (!GV->hasInternalLinkage() && !GV->hasLinkOnceLinkage())
+ return false;
+
+ DIDescriptor DI(GV);
+
+ // Check current version. Allow Version6 for now.
+ unsigned Version = DI.getVersion();
+ if (Version != LLVMDebugVersion && Version != LLVMDebugVersion6)
+ return false;
+
+ unsigned Tag = DI.getTag();
+ switch (Tag) {
+ case DW_TAG_variable:
+ assert(DIVariable(GV).Verify() && "Invalid DebugInfo value");
+ break;
+ case DW_TAG_compile_unit:
+ assert(DICompileUnit(GV).Verify() && "Invalid DebugInfo value");
+ break;
+ case DW_TAG_subprogram:
+ assert(DISubprogram(GV).Verify() && "Invalid DebugInfo value");
+ break;
+ case DW_TAG_lexical_block:
+ // FIXME: This interferes with the quality of generated code during
+ // optimization.
+ if (OptLevel != CodeGenOpt::None)
+ return false;
+ // FALLTHROUGH
+ default:
+ break;
+ }
+
+ return true;
+}
+
+DIDescriptor::DIDescriptor(GlobalVariable *gv, unsigned RequiredTag) {
+ GV = gv;
+
+ // If this is non-null, check to see if the Tag matches. If not, set to null.
+ if (GV && getTag() != RequiredTag)
+ GV = 0;
+}
+
+const std::string &
+DIDescriptor::getStringField(unsigned Elt, std::string &Result) const {
+ if (GV == 0) {
+ Result.clear();
+ return Result;
+ }
+
+ Constant *C = GV->getInitializer();
+ if (C == 0 || Elt >= C->getNumOperands()) {
+ Result.clear();
+ return Result;
+ }
+
+ // Fills in the string if it succeeds
+ if (!GetConstantStringInfo(C->getOperand(Elt), Result))
+ Result.clear();
+
+ return Result;
+}
+
+uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const {
+ if (GV == 0) return 0;
+
+ Constant *C = GV->getInitializer();
+ if (C == 0 || Elt >= C->getNumOperands())
+ return 0;
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(C->getOperand(Elt)))
+ return CI->getZExtValue();
+ return 0;
+}
+
+DIDescriptor DIDescriptor::getDescriptorField(unsigned Elt) const {
+ if (GV == 0) return DIDescriptor();
+
+ Constant *C = GV->getInitializer();
+ if (C == 0 || Elt >= C->getNumOperands())
+ return DIDescriptor();
+
+ C = C->getOperand(Elt);
+ return DIDescriptor(dyn_cast<GlobalVariable>(C->stripPointerCasts()));
+}
+
+GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const {
+ if (GV == 0) return 0;
+
+ Constant *C = GV->getInitializer();
+ if (C == 0 || Elt >= C->getNumOperands())
+ return 0;
+
+ C = C->getOperand(Elt);
+ return dyn_cast<GlobalVariable>(C->stripPointerCasts());
+}
+
+//===----------------------------------------------------------------------===//
+// Simple Descriptor Constructors and other Methods
+//===----------------------------------------------------------------------===//
+
+// Needed by DIVariable::getType().
+DIType::DIType(GlobalVariable *gv) : DIDescriptor(gv) {
+ if (!gv) return;
+ unsigned tag = getTag();
+ if (tag != dwarf::DW_TAG_base_type && !DIDerivedType::isDerivedType(tag) &&
+ !DICompositeType::isCompositeType(tag))
+ GV = 0;
+}
+
+/// isDerivedType - Return true if the specified tag is legal for
+/// DIDerivedType.
+bool DIType::isDerivedType(unsigned Tag) {
+ switch (Tag) {
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_pointer_type:
+ case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ case dwarf::DW_TAG_restrict_type:
+ case dwarf::DW_TAG_member:
+ case dwarf::DW_TAG_inheritance:
+ return true;
+ default:
+ // FIXME: Even though it doesn't make sense, CompositeTypes are currently
+ // modelled as DerivedTypes, so this should return true for them as well.
+ return false;
+ }
+}
+
+/// isCompositeType - Return true if the specified tag is legal for
+/// DICompositeType.
+bool DIType::isCompositeType(unsigned Tag) {
+ switch (Tag) {
+ case dwarf::DW_TAG_array_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_enumeration_type:
+ case dwarf::DW_TAG_vector_type:
+ case dwarf::DW_TAG_subroutine_type:
+ case dwarf::DW_TAG_class_type:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// isVariable - Return true if the specified tag is legal for DIVariable.
+bool DIVariable::isVariable(unsigned Tag) {
+ switch (Tag) {
+ case dwarf::DW_TAG_auto_variable:
+ case dwarf::DW_TAG_arg_variable:
+ case dwarf::DW_TAG_return_variable:
+ return true;
+ default:
+ return false;
+ }
+}
+
+unsigned DIArray::getNumElements() const {
+ assert (GV && "Invalid DIArray");
+ Constant *C = GV->getInitializer();
+ assert (C && "Invalid DIArray initializer");
+ return C->getNumOperands();
+}
+
+/// Verify - Verify that a compile unit is well formed.
+bool DICompileUnit::Verify() const {
+ if (isNull())
+ return false;
+ std::string Res;
+ if (getFilename(Res).empty())
+ return false;
+  // It is possible that the directory and producer strings are empty.
+ return true;
+}
+
+/// Verify - Verify that a type descriptor is well formed.
+bool DIType::Verify() const {
+ if (isNull())
+ return false;
+ if (getContext().isNull())
+ return false;
+
+ DICompileUnit CU = getCompileUnit();
+ if (!CU.isNull() && !CU.Verify())
+ return false;
+ return true;
+}
+
+/// Verify - Verify that a composite type descriptor is well formed.
+bool DICompositeType::Verify() const {
+ if (isNull())
+ return false;
+ if (getContext().isNull())
+ return false;
+
+ DICompileUnit CU = getCompileUnit();
+ if (!CU.isNull() && !CU.Verify())
+ return false;
+ return true;
+}
+
+/// Verify - Verify that a subprogram descriptor is well formed.
+bool DISubprogram::Verify() const {
+ if (isNull())
+ return false;
+
+ if (getContext().isNull())
+ return false;
+
+ DICompileUnit CU = getCompileUnit();
+ if (!CU.Verify())
+ return false;
+
+ DICompositeType Ty = getType();
+ if (!Ty.isNull() && !Ty.Verify())
+ return false;
+ return true;
+}
+
+/// Verify - Verify that a global variable descriptor is well formed.
+bool DIGlobalVariable::Verify() const {
+ if (isNull())
+ return false;
+
+ if (getContext().isNull())
+ return false;
+
+ DICompileUnit CU = getCompileUnit();
+ if (!CU.isNull() && !CU.Verify())
+ return false;
+
+ DIType Ty = getType();
+ if (!Ty.Verify())
+ return false;
+
+ if (!getGlobal())
+ return false;
+
+ return true;
+}
+
+/// Verify - Verify that a variable descriptor is well formed.
+bool DIVariable::Verify() const {
+ if (isNull())
+ return false;
+
+ if (getContext().isNull())
+ return false;
+
+ DIType Ty = getType();
+ if (!Ty.Verify())
+ return false;
+
+ return true;
+}
+
+/// getOriginalTypeSize - If this type is derived from a base type then
+/// return base type size.
+uint64_t DIDerivedType::getOriginalTypeSize() const {
+ if (getTag() != dwarf::DW_TAG_member)
+ return getSizeInBits();
+ DIType BT = getTypeDerivedFrom();
+ if (BT.getTag() != dwarf::DW_TAG_base_type)
+ return getSizeInBits();
+ return BT.getSizeInBits();
+}
+
+/// describes - Return true if this subprogram provides debugging
+/// information for the function F.
+bool DISubprogram::describes(const Function *F) {
+ assert (F && "Invalid function");
+ std::string Name;
+ getLinkageName(Name);
+ if (Name.empty())
+ getName(Name);
+  if (!Name.empty() && strcmp(Name.c_str(), F->getNameStart()) == 0)
+ return true;
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// DIFactory: Basic Helpers
+//===----------------------------------------------------------------------===//
+
+DIFactory::DIFactory(Module &m)
+ : M(m), StopPointFn(0), FuncStartFn(0), RegionStartFn(0), RegionEndFn(0),
+ DeclareFn(0) {
+ EmptyStructPtr = PointerType::getUnqual(StructType::get(NULL, NULL));
+}
+
+/// getCastToEmpty - Return this descriptor as a Constant* with type '{}*'.
+/// This is only valid when the descriptor is non-null.
+Constant *DIFactory::getCastToEmpty(DIDescriptor D) {
+ if (D.isNull()) return Constant::getNullValue(EmptyStructPtr);
+ return ConstantExpr::getBitCast(D.getGV(), EmptyStructPtr);
+}
+
+Constant *DIFactory::GetTagConstant(unsigned TAG) {
+ assert((TAG & LLVMDebugVersionMask) == 0 &&
+ "Tag too large for debug encoding!");
+ return ConstantInt::get(Type::Int32Ty, TAG | LLVMDebugVersion);
+}
+
+Constant *DIFactory::GetStringConstant(const std::string &String) {
+  // Check the string cache for a previously created constant.
+ Constant *&Slot = StringCache[String];
+
+ // Return Constant if previously defined.
+ if (Slot) return Slot;
+
+ const PointerType *DestTy = PointerType::getUnqual(Type::Int8Ty);
+
+  // If it is an empty string, use an i8* null instead.
+ if (String.empty())
+ return Slot = ConstantPointerNull::get(DestTy);
+
+ // Construct string as an llvm constant.
+ Constant *ConstStr = ConstantArray::get(String);
+
+ // Otherwise create and return a new string global.
+ GlobalVariable *StrGV = new GlobalVariable(ConstStr->getType(), true,
+ GlobalVariable::InternalLinkage,
+ ConstStr, ".str", &M);
+ StrGV->setSection("llvm.metadata");
+ return Slot = ConstantExpr::getBitCast(StrGV, DestTy);
+}
+
+/// GetOrCreateAnchor - Look up an anchor for the specified tag and name. If it
+/// already exists, return it. If not, create a new one and return it.
+DIAnchor DIFactory::GetOrCreateAnchor(unsigned TAG, const char *Name) {
+ const Type *EltTy = StructType::get(Type::Int32Ty, Type::Int32Ty, NULL);
+
+  // Create the global, or return it if it is already in the module.
+ Constant *C = M.getOrInsertGlobal(Name, EltTy);
+ assert(isa<GlobalVariable>(C) && "Incorrectly typed anchor?");
+ GlobalVariable *GV = cast<GlobalVariable>(C);
+
+ // If it has an initializer, it is already in the module.
+ if (GV->hasInitializer())
+    return DIAnchor(GV);
+
+ GV->setLinkage(GlobalValue::LinkOnceAnyLinkage);
+ GV->setSection("llvm.metadata");
+ GV->setConstant(true);
+ M.addTypeName("llvm.dbg.anchor.type", EltTy);
+
+ // Otherwise, set the initializer.
+ Constant *Elts[] = {
+ GetTagConstant(dwarf::DW_TAG_anchor),
+ ConstantInt::get(Type::Int32Ty, TAG)
+ };
+
+ GV->setInitializer(ConstantStruct::get(Elts, 2));
+ return DIAnchor(GV);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// DIFactory: Primary Constructors
+//===----------------------------------------------------------------------===//
+
+/// GetOrCreateCompileUnitAnchor - Return the anchor for compile units,
+/// creating a new one if there isn't already one in the module.
+DIAnchor DIFactory::GetOrCreateCompileUnitAnchor() {
+ // If we already created one, just return it.
+ if (!CompileUnitAnchor.isNull())
+ return CompileUnitAnchor;
+ return CompileUnitAnchor = GetOrCreateAnchor(dwarf::DW_TAG_compile_unit,
+ "llvm.dbg.compile_units");
+}
+
+/// GetOrCreateSubprogramAnchor - Return the anchor for subprograms,
+/// creating a new one if there isn't already one in the module.
+DIAnchor DIFactory::GetOrCreateSubprogramAnchor() {
+ // If we already created one, just return it.
+ if (!SubProgramAnchor.isNull())
+ return SubProgramAnchor;
+ return SubProgramAnchor = GetOrCreateAnchor(dwarf::DW_TAG_subprogram,
+ "llvm.dbg.subprograms");
+}
+
+/// GetOrCreateGlobalVariableAnchor - Return the anchor for globals,
+/// creating a new one if there isn't already one in the module.
+DIAnchor DIFactory::GetOrCreateGlobalVariableAnchor() {
+ // If we already created one, just return it.
+ if (!GlobalVariableAnchor.isNull())
+ return GlobalVariableAnchor;
+ return GlobalVariableAnchor = GetOrCreateAnchor(dwarf::DW_TAG_variable,
+ "llvm.dbg.global_variables");
+}
+
+/// GetOrCreateArray - Create a descriptor for an array of descriptors.
+/// This implicitly uniques the arrays created.
+DIArray DIFactory::GetOrCreateArray(DIDescriptor *Tys, unsigned NumTys) {
+ SmallVector<Constant*, 16> Elts;
+
+ for (unsigned i = 0; i != NumTys; ++i)
+ Elts.push_back(getCastToEmpty(Tys[i]));
+
+ Constant *Init = ConstantArray::get(ArrayType::get(EmptyStructPtr,
+ Elts.size()),
+ Elts.data(), Elts.size());
+ // If we already have this array, just return the uniqued version.
+ DIDescriptor &Entry = SimpleConstantCache[Init];
+ if (!Entry.isNull()) return DIArray(Entry.getGV());
+
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.array", &M);
+ GV->setSection("llvm.metadata");
+ Entry = DIDescriptor(GV);
+ return DIArray(GV);
+}
+
+/// GetOrCreateSubrange - Create a descriptor for a value range. This
+/// implicitly uniques the values returned.
+DISubrange DIFactory::GetOrCreateSubrange(int64_t Lo, int64_t Hi) {
+ Constant *Elts[] = {
+ GetTagConstant(dwarf::DW_TAG_subrange_type),
+ ConstantInt::get(Type::Int64Ty, Lo),
+ ConstantInt::get(Type::Int64Ty, Hi)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ // If we already have this range, just return the uniqued version.
+ DIDescriptor &Entry = SimpleConstantCache[Init];
+ if (!Entry.isNull()) return DISubrange(Entry.getGV());
+
+ M.addTypeName("llvm.dbg.subrange.type", Init->getType());
+
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.subrange", &M);
+ GV->setSection("llvm.metadata");
+ Entry = DIDescriptor(GV);
+ return DISubrange(GV);
+}
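+
+// A minimal sketch of how the two uniquing helpers above combine when
+// describing a fixed-size array such as "int A[10]".  The helper name and
+// the 32-bit int assumption are illustrative only:
+#if 0
+static DICompositeType describeIntArray(DIFactory &DF, DIDescriptor Context,
+                                        DICompileUnit CU, DIType IntTy) {
+  // One subrange descriptor covering indices [0, 9].
+  DIDescriptor Subscript = DF.GetOrCreateSubrange(0, 9);
+  DIArray Elements = DF.GetOrCreateArray(&Subscript, 1);
+  return DF.CreateCompositeType(dwarf::DW_TAG_array_type, Context, "", CU,
+                                0,         // line number
+                                10 * 32,   // size in bits, assuming 32-bit int
+                                32,        // alignment in bits
+                                0, 0,      // offset in bits, flags
+                                IntTy,     // element type
+                                Elements,
+                                0);        // runtime language
+}
+#endif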
+
+
+
+/// CreateCompileUnit - Create a new descriptor for the specified compile
+/// unit. Note that this does not unique compile units within the module.
+DICompileUnit DIFactory::CreateCompileUnit(unsigned LangID,
+ const std::string &Filename,
+ const std::string &Directory,
+ const std::string &Producer,
+ bool isMain,
+ bool isOptimized,
+ const char *Flags,
+ unsigned RunTimeVer) {
+ Constant *Elts[] = {
+ GetTagConstant(dwarf::DW_TAG_compile_unit),
+ getCastToEmpty(GetOrCreateCompileUnitAnchor()),
+ ConstantInt::get(Type::Int32Ty, LangID),
+ GetStringConstant(Filename),
+ GetStringConstant(Directory),
+ GetStringConstant(Producer),
+ ConstantInt::get(Type::Int1Ty, isMain),
+ ConstantInt::get(Type::Int1Ty, isOptimized),
+ GetStringConstant(Flags),
+ ConstantInt::get(Type::Int32Ty, RunTimeVer)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.compile_unit.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.compile_unit", &M);
+ GV->setSection("llvm.metadata");
+ return DICompileUnit(GV);
+}
+
+/// CreateEnumerator - Create a single enumerator value.
+DIEnumerator DIFactory::CreateEnumerator(const std::string &Name, uint64_t Val){
+ Constant *Elts[] = {
+ GetTagConstant(dwarf::DW_TAG_enumerator),
+ GetStringConstant(Name),
+ ConstantInt::get(Type::Int64Ty, Val)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.enumerator.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.enumerator", &M);
+ GV->setSection("llvm.metadata");
+ return DIEnumerator(GV);
+}
+
+
+/// CreateBasicType - Create a basic type like int, float, etc.
+DIBasicType DIFactory::CreateBasicType(DIDescriptor Context,
+ const std::string &Name,
+ DICompileUnit CompileUnit,
+ unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint64_t AlignInBits,
+ uint64_t OffsetInBits, unsigned Flags,
+ unsigned Encoding) {
+ Constant *Elts[] = {
+ GetTagConstant(dwarf::DW_TAG_base_type),
+ getCastToEmpty(Context),
+ GetStringConstant(Name),
+ getCastToEmpty(CompileUnit),
+ ConstantInt::get(Type::Int32Ty, LineNumber),
+ ConstantInt::get(Type::Int64Ty, SizeInBits),
+ ConstantInt::get(Type::Int64Ty, AlignInBits),
+ ConstantInt::get(Type::Int64Ty, OffsetInBits),
+ ConstantInt::get(Type::Int32Ty, Flags),
+ ConstantInt::get(Type::Int32Ty, Encoding)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.basictype.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.basictype", &M);
+ GV->setSection("llvm.metadata");
+ return DIBasicType(GV);
+}
+
+/// CreateDerivedType - Create a derived type like const qualified type,
+/// pointer, typedef, etc.
+DIDerivedType DIFactory::CreateDerivedType(unsigned Tag,
+ DIDescriptor Context,
+ const std::string &Name,
+ DICompileUnit CompileUnit,
+ unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint64_t AlignInBits,
+ uint64_t OffsetInBits,
+ unsigned Flags,
+ DIType DerivedFrom) {
+ Constant *Elts[] = {
+ GetTagConstant(Tag),
+ getCastToEmpty(Context),
+ GetStringConstant(Name),
+ getCastToEmpty(CompileUnit),
+ ConstantInt::get(Type::Int32Ty, LineNumber),
+ ConstantInt::get(Type::Int64Ty, SizeInBits),
+ ConstantInt::get(Type::Int64Ty, AlignInBits),
+ ConstantInt::get(Type::Int64Ty, OffsetInBits),
+ ConstantInt::get(Type::Int32Ty, Flags),
+ getCastToEmpty(DerivedFrom)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.derivedtype.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.derivedtype", &M);
+ GV->setSection("llvm.metadata");
+ return DIDerivedType(GV);
+}
+
+/// CreateCompositeType - Create a composite type like array, struct, etc.
+DICompositeType DIFactory::CreateCompositeType(unsigned Tag,
+ DIDescriptor Context,
+ const std::string &Name,
+ DICompileUnit CompileUnit,
+ unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint64_t AlignInBits,
+ uint64_t OffsetInBits,
+ unsigned Flags,
+ DIType DerivedFrom,
+ DIArray Elements,
+ unsigned RuntimeLang) {
+
+ Constant *Elts[] = {
+ GetTagConstant(Tag),
+ getCastToEmpty(Context),
+ GetStringConstant(Name),
+ getCastToEmpty(CompileUnit),
+ ConstantInt::get(Type::Int32Ty, LineNumber),
+ ConstantInt::get(Type::Int64Ty, SizeInBits),
+ ConstantInt::get(Type::Int64Ty, AlignInBits),
+ ConstantInt::get(Type::Int64Ty, OffsetInBits),
+ ConstantInt::get(Type::Int32Ty, Flags),
+ getCastToEmpty(DerivedFrom),
+ getCastToEmpty(Elements),
+ ConstantInt::get(Type::Int32Ty, RuntimeLang)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.composite.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.composite", &M);
+ GV->setSection("llvm.metadata");
+ return DICompositeType(GV);
+}
+
+
+/// CreateSubprogram - Create a new descriptor for the specified subprogram.
+/// See comments in DISubprogram for descriptions of these fields. This
+/// method does not unique the generated descriptors.
+DISubprogram DIFactory::CreateSubprogram(DIDescriptor Context,
+ const std::string &Name,
+ const std::string &DisplayName,
+ const std::string &LinkageName,
+ DICompileUnit CompileUnit,
+ unsigned LineNo, DIType Type,
+ bool isLocalToUnit,
+ bool isDefinition) {
+
+ Constant *Elts[] = {
+ GetTagConstant(dwarf::DW_TAG_subprogram),
+ getCastToEmpty(GetOrCreateSubprogramAnchor()),
+ getCastToEmpty(Context),
+ GetStringConstant(Name),
+ GetStringConstant(DisplayName),
+ GetStringConstant(LinkageName),
+ getCastToEmpty(CompileUnit),
+ ConstantInt::get(Type::Int32Ty, LineNo),
+ getCastToEmpty(Type),
+ ConstantInt::get(Type::Int1Ty, isLocalToUnit),
+ ConstantInt::get(Type::Int1Ty, isDefinition)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.subprogram.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.subprogram", &M);
+ GV->setSection("llvm.metadata");
+ return DISubprogram(GV);
+}
+
+/// CreateGlobalVariable - Create a new descriptor for the specified global.
+DIGlobalVariable
+DIFactory::CreateGlobalVariable(DIDescriptor Context, const std::string &Name,
+ const std::string &DisplayName,
+ const std::string &LinkageName,
+ DICompileUnit CompileUnit,
+ unsigned LineNo, DIType Type,bool isLocalToUnit,
+ bool isDefinition, llvm::GlobalVariable *Val) {
+ Constant *Elts[] = {
+ GetTagConstant(dwarf::DW_TAG_variable),
+ getCastToEmpty(GetOrCreateGlobalVariableAnchor()),
+ getCastToEmpty(Context),
+ GetStringConstant(Name),
+ GetStringConstant(DisplayName),
+ GetStringConstant(LinkageName),
+ getCastToEmpty(CompileUnit),
+ ConstantInt::get(Type::Int32Ty, LineNo),
+ getCastToEmpty(Type),
+ ConstantInt::get(Type::Int1Ty, isLocalToUnit),
+ ConstantInt::get(Type::Int1Ty, isDefinition),
+ ConstantExpr::getBitCast(Val, EmptyStructPtr)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.global_variable.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.global_variable", &M);
+ GV->setSection("llvm.metadata");
+ return DIGlobalVariable(GV);
+}
+
+
+/// CreateVariable - Create a new descriptor for the specified variable.
+DIVariable DIFactory::CreateVariable(unsigned Tag, DIDescriptor Context,
+ const std::string &Name,
+ DICompileUnit CompileUnit, unsigned LineNo,
+ DIType Type) {
+ Constant *Elts[] = {
+ GetTagConstant(Tag),
+ getCastToEmpty(Context),
+ GetStringConstant(Name),
+ getCastToEmpty(CompileUnit),
+ ConstantInt::get(Type::Int32Ty, LineNo),
+ getCastToEmpty(Type)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.variable.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.variable", &M);
+ GV->setSection("llvm.metadata");
+ return DIVariable(GV);
+}
+
+
+/// CreateBlock - This creates a descriptor for a lexical block with the
+/// specified parent context.
+DIBlock DIFactory::CreateBlock(DIDescriptor Context) {
+ Constant *Elts[] = {
+ GetTagConstant(dwarf::DW_TAG_lexical_block),
+ getCastToEmpty(Context)
+ };
+
+ Constant *Init = ConstantStruct::get(Elts, sizeof(Elts)/sizeof(Elts[0]));
+
+ M.addTypeName("llvm.dbg.block.type", Init->getType());
+ GlobalVariable *GV = new GlobalVariable(Init->getType(), true,
+ GlobalValue::InternalLinkage,
+ Init, "llvm.dbg.block", &M);
+ GV->setSection("llvm.metadata");
+ return DIBlock(GV);
+}
+
+
+//===----------------------------------------------------------------------===//
+// DIFactory: Routines for inserting code into a function
+//===----------------------------------------------------------------------===//
+
+/// InsertStopPoint - Create a new llvm.dbg.stoppoint intrinsic invocation,
+/// inserting it at the end of the specified basic block.
+void DIFactory::InsertStopPoint(DICompileUnit CU, unsigned LineNo,
+ unsigned ColNo, BasicBlock *BB) {
+
+ // Lazily construct llvm.dbg.stoppoint function.
+ if (!StopPointFn)
+ StopPointFn = llvm::Intrinsic::getDeclaration(&M,
+ llvm::Intrinsic::dbg_stoppoint);
+
+ // Invoke llvm.dbg.stoppoint
+ Value *Args[] = {
+ llvm::ConstantInt::get(llvm::Type::Int32Ty, LineNo),
+ llvm::ConstantInt::get(llvm::Type::Int32Ty, ColNo),
+ getCastToEmpty(CU)
+ };
+ CallInst::Create(StopPointFn, Args, Args+3, "", BB);
+}
+
+/// InsertSubprogramStart - Create a new llvm.dbg.func.start intrinsic to
+/// mark the start of the specified subprogram.
+void DIFactory::InsertSubprogramStart(DISubprogram SP, BasicBlock *BB) {
+ // Lazily construct llvm.dbg.func.start.
+ if (!FuncStartFn)
+ FuncStartFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_func_start);
+
+ // Call llvm.dbg.func.start which also implicitly sets a stoppoint.
+ CallInst::Create(FuncStartFn, getCastToEmpty(SP), "", BB);
+}
+
+/// InsertRegionStart - Insert a new llvm.dbg.region.start intrinsic call to
+/// mark the start of a region for the specified scoping descriptor.
+void DIFactory::InsertRegionStart(DIDescriptor D, BasicBlock *BB) {
+ // Lazily construct llvm.dbg.region.start function.
+ if (!RegionStartFn)
+ RegionStartFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_region_start);
+
+  // Call llvm.dbg.region.start.
+ CallInst::Create(RegionStartFn, getCastToEmpty(D), "", BB);
+}
+
+/// InsertRegionEnd - Insert a new llvm.dbg.region.end intrinsic call to
+/// mark the end of a region for the specified scoping descriptor.
+void DIFactory::InsertRegionEnd(DIDescriptor D, BasicBlock *BB) {
+ // Lazily construct llvm.dbg.region.end function.
+ if (!RegionEndFn)
+ RegionEndFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_region_end);
+
+ // Call llvm.dbg.region.end.
+ CallInst::Create(RegionEndFn, getCastToEmpty(D), "", BB);
+}
+
+/// InsertDeclare - Insert a new llvm.dbg.declare intrinsic call.
+void DIFactory::InsertDeclare(Value *Storage, DIVariable D, BasicBlock *BB) {
+ // Cast the storage to a {}* for the call to llvm.dbg.declare.
+ Storage = new BitCastInst(Storage, EmptyStructPtr, "", BB);
+
+ if (!DeclareFn)
+ DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
+
+ Value *Args[] = { Storage, getCastToEmpty(D) };
+ CallInst::Create(DeclareFn, Args, Args+2, "", BB);
+}
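+
+// A minimal sketch of how a frontend might drive the insertion routines
+// above when starting to emit a function.  The helper name and the
+// descriptor arguments are illustrative, not part of this file's API:
+#if 0
+static void emitFunctionEntryDebugInfo(DIFactory &DF, DICompileUnit CU,
+                                       DISubprogram SP, DIVariable Var,
+                                       Value *VarStorage, BasicBlock *Entry) {
+  // Mark the start of the subprogram, then its first source location.
+  DF.InsertSubprogramStart(SP, Entry);
+  DF.InsertStopPoint(CU, 1, 0, Entry);  // line 1, column 0
+  // Attach the variable descriptor to the variable's stack slot.
+  DF.InsertDeclare(VarStorage, Var, Entry);
+}
+#endif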
+
+namespace llvm {
+  /// findStopPoint - Find the stoppoint corresponding to this instruction,
+  /// that is, the stoppoint that dominates this instruction.
+ const DbgStopPointInst *findStopPoint(const Instruction *Inst) {
+ if (const DbgStopPointInst *DSI = dyn_cast<DbgStopPointInst>(Inst))
+ return DSI;
+
+ const BasicBlock *BB = Inst->getParent();
+ BasicBlock::const_iterator I = Inst, B;
+ while (BB) {
+ B = BB->begin();
+
+ // A BB consisting only of a terminator can't have a stoppoint.
+ while (I != B) {
+ --I;
+ if (const DbgStopPointInst *DSI = dyn_cast<DbgStopPointInst>(I))
+ return DSI;
+ }
+
+ // This BB didn't have a stoppoint: if there is only one predecessor, look
+ // for a stoppoint there. We could use getIDom(), but that would require
+ // dominator info.
+ BB = I->getParent()->getUniquePredecessor();
+ if (BB)
+ I = BB->getTerminator();
+ }
+
+ return 0;
+ }
+
+  /// findBBStopPoint - Find the stoppoint corresponding to the first real
+  /// (non-debug intrinsic) instruction in this Basic Block, and return the
+  /// stoppoint for it.
+ const DbgStopPointInst *findBBStopPoint(const BasicBlock *BB) {
+ for(BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (const DbgStopPointInst *DSI = dyn_cast<DbgStopPointInst>(I))
+ return DSI;
+
+    // Fall back to looking for the stoppoint of the unique predecessor.
+    // Useful if this BB contains no stoppoints, but its unique predecessor
+    // does.
+ BB = BB->getUniquePredecessor();
+ if (BB)
+ return findStopPoint(BB->getTerminator());
+
+ return 0;
+ }
+
+ Value *findDbgGlobalDeclare(GlobalVariable *V) {
+ const Module *M = V->getParent();
+ const Type *Ty = M->getTypeByName("llvm.dbg.global_variable.type");
+ if (!Ty) return 0;
+
+ Ty = PointerType::get(Ty, 0);
+
+ Value *Val = V->stripPointerCasts();
+ for (Value::use_iterator I = Val->use_begin(), E = Val->use_end();
+ I != E; ++I) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(I)) {
+ if (CE->getOpcode() == Instruction::BitCast) {
+ Value *VV = CE;
+
+ while (VV->hasOneUse())
+ VV = *VV->use_begin();
+
+ if (VV->getType() == Ty)
+ return VV;
+ }
+ }
+ }
+
+ if (Val->getType() == Ty)
+ return Val;
+
+ return 0;
+ }
+
+ /// Finds the llvm.dbg.declare intrinsic corresponding to this value if any.
+ /// It looks through pointer casts too.
+ const DbgDeclareInst *findDbgDeclare(const Value *V, bool stripCasts) {
+ if (stripCasts) {
+ V = V->stripPointerCasts();
+
+ // Look for the bitcast.
+ for (Value::use_const_iterator I = V->use_begin(), E =V->use_end();
+ I != E; ++I)
+ if (isa<BitCastInst>(I))
+ return findDbgDeclare(*I, false);
+
+ return 0;
+ }
+
+ // Find llvm.dbg.declare among uses of the instruction.
+ for (Value::use_const_iterator I = V->use_begin(), E =V->use_end();
+ I != E; ++I)
+ if (const DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I))
+ return DDI;
+
+ return 0;
+ }
+
+ bool getLocationInfo(const Value *V, std::string &DisplayName,
+ std::string &Type, unsigned &LineNo, std::string &File,
+ std::string &Dir) {
+ DICompileUnit Unit;
+ DIType TypeD;
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(const_cast<Value*>(V))) {
+ Value *DIGV = findDbgGlobalDeclare(GV);
+ if (!DIGV) return false;
+ DIGlobalVariable Var(cast<GlobalVariable>(DIGV));
+
+ Var.getDisplayName(DisplayName);
+ LineNo = Var.getLineNumber();
+ Unit = Var.getCompileUnit();
+ TypeD = Var.getType();
+ } else {
+ const DbgDeclareInst *DDI = findDbgDeclare(V);
+ if (!DDI) return false;
+ DIVariable Var(cast<GlobalVariable>(DDI->getVariable()));
+
+ Var.getName(DisplayName);
+ LineNo = Var.getLineNumber();
+ Unit = Var.getCompileUnit();
+ TypeD = Var.getType();
+ }
+
+ TypeD.getName(Type);
+ Unit.getFilename(File);
+ Unit.getDirectory(Dir);
+ return true;
+ }
+}
+
+/// dump - Print descriptor.
+void DIDescriptor::dump() const {
+ cerr << "[" << dwarf::TagString(getTag()) << "] ";
+ cerr << std::hex << "[GV:" << GV << "]" << std::dec;
+}
+
+/// dump - Print compile unit.
+void DICompileUnit::dump() const {
+ if (getLanguage())
+ cerr << " [" << dwarf::LanguageString(getLanguage()) << "] ";
+
+ std::string Res1, Res2;
+ cerr << " [" << getDirectory(Res1) << "/" << getFilename(Res2) << " ]";
+}
+
+/// dump - Print type.
+void DIType::dump() const {
+ if (isNull()) return;
+
+ std::string Res;
+ if (!getName(Res).empty())
+ cerr << " [" << Res << "] ";
+
+ unsigned Tag = getTag();
+ cerr << " [" << dwarf::TagString(Tag) << "] ";
+
+  // TODO: Print context
+ getCompileUnit().dump();
+ cerr << " ["
+ << getLineNumber() << ", "
+ << getSizeInBits() << ", "
+ << getAlignInBits() << ", "
+ << getOffsetInBits()
+ << "] ";
+
+ if (isPrivate())
+ cerr << " [private] ";
+ else if (isProtected())
+ cerr << " [protected] ";
+
+ if (isForwardDecl())
+ cerr << " [fwd] ";
+
+ if (isBasicType(Tag))
+ DIBasicType(GV).dump();
+ else if (isDerivedType(Tag))
+ DIDerivedType(GV).dump();
+ else if (isCompositeType(Tag))
+ DICompositeType(GV).dump();
+ else {
+ cerr << "Invalid DIType\n";
+ return;
+ }
+
+ cerr << "\n";
+}
+
+/// dump - Print basic type.
+void DIBasicType::dump() const {
+ cerr << " [" << dwarf::AttributeEncodingString(getEncoding()) << "] ";
+}
+
+/// dump - Print derived type.
+void DIDerivedType::dump() const {
+ cerr << "\n\t Derived From: "; getTypeDerivedFrom().dump();
+}
+
+/// dump - Print composite type.
+void DICompositeType::dump() const {
+ DIArray A = getTypeArray();
+ if (A.isNull())
+ return;
+ cerr << " [" << A.getNumElements() << " elements]";
+}
+
+/// dump - Print global.
+void DIGlobal::dump() const {
+ std::string Res;
+ if (!getName(Res).empty())
+ cerr << " [" << Res << "] ";
+
+ unsigned Tag = getTag();
+ cerr << " [" << dwarf::TagString(Tag) << "] ";
+
+  // TODO: Print context
+ getCompileUnit().dump();
+ cerr << " [" << getLineNumber() << "] ";
+
+ if (isLocalToUnit())
+ cerr << " [local] ";
+
+ if (isDefinition())
+ cerr << " [def] ";
+
+ if (isGlobalVariable(Tag))
+ DIGlobalVariable(GV).dump();
+
+ cerr << "\n";
+}
+
+/// dump - Print subprogram.
+void DISubprogram::dump() const {
+ DIGlobal::dump();
+}
+
+/// dump - Print global variable.
+void DIGlobalVariable::dump() const {
+ cerr << " ["; getGlobal()->dump(); cerr << "] ";
+}
+
+/// dump - Print variable.
+void DIVariable::dump() const {
+ std::string Res;
+ if (!getName(Res).empty())
+ cerr << " [" << Res << "] ";
+
+ getCompileUnit().dump();
+ cerr << " [" << getLineNumber() << "] ";
+ getType().dump();
+ cerr << "\n";
+}
diff --git a/lib/Analysis/IPA/Andersens.cpp b/lib/Analysis/IPA/Andersens.cpp
new file mode 100644
index 0000000..8584d06
--- /dev/null
+++ b/lib/Analysis/IPA/Andersens.cpp
@@ -0,0 +1,2878 @@
+//===- Andersens.cpp - Andersen's Interprocedural Alias Analysis ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an implementation of Andersen's interprocedural alias
+// analysis.
+//
+// In pointer analysis terms, this is a subset-based, flow-insensitive,
+// field-sensitive, and context-insensitive pointer analysis algorithm.
+//
+// This algorithm is implemented as four stages:
+//   1. Object identification.
+//   2. Inclusion constraint identification.
+//   3. Offline constraint graph optimization.
+//   4. Inclusion constraint solving.
+//
+// The object identification stage identifies all of the memory objects in the
+// program, which includes globals, heap allocated objects, and stack allocated
+// objects.
+//
+// The inclusion constraint identification stage finds all inclusion
+// constraints in the program by scanning it, looking for pointer assignments
+// and other statements that affect the points-to graph.  A statement like
+// "A = B" is processed to indicate that A can point to anything that B can
+// point to.  Constraints can handle copies, loads, stores, and
+// address-taking.
+//
+// The offline constraint graph optimization portion includes offline variable
+// substitution algorithms intended to compute pointer and location
+// equivalences. Pointer equivalences are those pointers that will have the
+// same points-to sets, and location equivalences are those variables that
+// always appear together in points-to sets. It also includes an offline
+// cycle detection algorithm that allows cycles to be collapsed sooner
+// during solving.
+//
+// The inclusion constraint solving phase iteratively propagates the inclusion
+// constraints until a fixed point is reached. This is an O(N^3) algorithm.
+//
+// Function constraints are handled as if they were structs with X fields.
+// Thus, an access to argument X of function Y is an access to node index
+// getNode(Y) + X. This representation allows handling of indirect calls
+// without any issues. To wit, an indirect call Y(a,b) is equivalent to
+// *(Y + 2) = a, *(Y + 3) = b.
+// The return node for a function is always located at getNode(F) +
+// CallReturnPos.  The arguments start at getNode(F) + CallFirstArgPos.
+//
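+// As a small worked example of the constraint forms described above (the
+// variable names are illustrative), the statements
+//
+//   A = &X;   B = A;   C = *B;   *C = A;
+//
+// produce, respectively, an AddressOf constraint (X is placed directly in
+// A's points-to set), a Copy constraint (A's points-to set flows into B's),
+// a Load constraint (the points-to sets of B's pointees flow into C), and a
+// Store constraint (A's points-to set flows into every pointee of C).
+//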
+// Future Improvements:
+//  Use of BDDs.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "anders-aa"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/DenseSet.h"
+#include <algorithm>
+#include <set>
+#include <list>
+#include <map>
+#include <stack>
+#include <vector>
+#include <queue>
+
+// Determining the actual set of nodes the universal set can consist of is very
+// expensive because it means propagating around very large sets. We rely on
+// other analyses being able to determine which nodes can never be pointed to
+// order to disambiguate further than "points-to anything".
+#define FULL_UNIVERSAL 0
+
+using namespace llvm;
+STATISTIC(NumIters , "Number of iterations to reach convergence");
+STATISTIC(NumConstraints, "Number of constraints");
+STATISTIC(NumNodes , "Number of nodes");
+STATISTIC(NumUnified , "Number of variables unified");
+STATISTIC(NumErased , "Number of redundant constraints erased");
+
+static const unsigned SelfRep = (unsigned)-1;
+static const unsigned Unvisited = (unsigned)-1;
+// Position of the function return node relative to the function node.
+static const unsigned CallReturnPos = 1;
+// Position of the function call node relative to the function node.
+static const unsigned CallFirstArgPos = 2;
+
+namespace {
+ struct BitmapKeyInfo {
+ static inline SparseBitVector<> *getEmptyKey() {
+ return reinterpret_cast<SparseBitVector<> *>(-1);
+ }
+ static inline SparseBitVector<> *getTombstoneKey() {
+ return reinterpret_cast<SparseBitVector<> *>(-2);
+ }
+ static unsigned getHashValue(const SparseBitVector<> *bitmap) {
+ return bitmap->getHashValue();
+ }
+ static bool isEqual(const SparseBitVector<> *LHS,
+ const SparseBitVector<> *RHS) {
+ if (LHS == RHS)
+ return true;
+ else if (LHS == getEmptyKey() || RHS == getEmptyKey()
+ || LHS == getTombstoneKey() || RHS == getTombstoneKey())
+ return false;
+
+ return *LHS == *RHS;
+ }
+
+ static bool isPod() { return true; }
+ };
+
+ class VISIBILITY_HIDDEN Andersens : public ModulePass, public AliasAnalysis,
+ private InstVisitor<Andersens> {
+ struct Node;
+
+ /// Constraint - Objects of this structure are used to represent the various
+ /// constraints identified by the algorithm. The constraints are 'copy',
+ /// for statements like "A = B", 'load' for statements like "A = *B",
+    /// 'store' for statements like "*A = B", and 'addressof' for statements
+    /// like "A = alloca".  The Offset is applied as *(A + K) = B for stores,
+    /// A = *(B + K) for loads, and A = B + K for copies.  It is illegal on
+    /// addressof constraints (because it is statically resolvable to A = &C
+    /// where C = B + K).
+
+ struct Constraint {
+ enum ConstraintType { Copy, Load, Store, AddressOf } Type;
+ unsigned Dest;
+ unsigned Src;
+ unsigned Offset;
+
+ Constraint(ConstraintType Ty, unsigned D, unsigned S, unsigned O = 0)
+ : Type(Ty), Dest(D), Src(S), Offset(O) {
+ assert((Offset == 0 || Ty != AddressOf) &&
+ "Offset is illegal on addressof constraints");
+ }
+
+ bool operator==(const Constraint &RHS) const {
+ return RHS.Type == Type
+ && RHS.Dest == Dest
+ && RHS.Src == Src
+ && RHS.Offset == Offset;
+ }
+
+ bool operator!=(const Constraint &RHS) const {
+ return !(*this == RHS);
+ }
+
+ bool operator<(const Constraint &RHS) const {
+ if (RHS.Type != Type)
+ return RHS.Type < Type;
+ else if (RHS.Dest != Dest)
+ return RHS.Dest < Dest;
+ else if (RHS.Src != Src)
+ return RHS.Src < Src;
+ return RHS.Offset < Offset;
+ }
+ };
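+
+    // For illustration, the encodings documented above correspond to
+    // constructor calls as follows (A, B, and K are placeholder node
+    // indices and offset):
+    //
+    //   Constraint(Constraint::Copy,      A, B)     // A = B
+    //   Constraint(Constraint::Load,      A, B, K)  // A = *(B + K)
+    //   Constraint(Constraint::Store,     A, B, K)  // *(A + K) = B
+    //   Constraint(Constraint::AddressOf, A, B)     // A = &B (no offset)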
+
+    // Information DenseSet requires to be implemented in order to be able to
+    // do its thing.
+ struct PairKeyInfo {
+ static inline std::pair<unsigned, unsigned> getEmptyKey() {
+ return std::make_pair(~0U, ~0U);
+ }
+ static inline std::pair<unsigned, unsigned> getTombstoneKey() {
+ return std::make_pair(~0U - 1, ~0U - 1);
+ }
+ static unsigned getHashValue(const std::pair<unsigned, unsigned> &P) {
+ return P.first ^ P.second;
+ }
+    static bool isEqual(const std::pair<unsigned, unsigned> &LHS,
+                        const std::pair<unsigned, unsigned> &RHS) {
+ return LHS == RHS;
+ }
+ };
+
+ struct ConstraintKeyInfo {
+ static inline Constraint getEmptyKey() {
+ return Constraint(Constraint::Copy, ~0U, ~0U, ~0U);
+ }
+ static inline Constraint getTombstoneKey() {
+ return Constraint(Constraint::Copy, ~0U - 1, ~0U - 1, ~0U - 1);
+ }
+ static unsigned getHashValue(const Constraint &C) {
+ return C.Src ^ C.Dest ^ C.Type ^ C.Offset;
+ }
+ static bool isEqual(const Constraint &LHS,
+ const Constraint &RHS) {
+ return LHS.Type == RHS.Type && LHS.Dest == RHS.Dest
+ && LHS.Src == RHS.Src && LHS.Offset == RHS.Offset;
+ }
+ };
+
+ // Node class - This class is used to represent a node in the constraint
+ // graph. Due to various optimizations, it is not always the case that
+ // there is a mapping from a Node to a Value. In particular, we add
+    // artificial Nodes that represent the set of pointed-to variables shared
+    // for each location-equivalent Node.
+ struct Node {
+ private:
+ static unsigned Counter;
+
+ public:
+ Value *Val;
+ SparseBitVector<> *Edges;
+ SparseBitVector<> *PointsTo;
+ SparseBitVector<> *OldPointsTo;
+ std::list<Constraint> Constraints;
+
+ // Pointer and location equivalence labels
+ unsigned PointerEquivLabel;
+ unsigned LocationEquivLabel;
+ // Predecessor edges, both real and implicit
+ SparseBitVector<> *PredEdges;
+ SparseBitVector<> *ImplicitPredEdges;
+      // Set of nodes that point to us, only used for location equivalence.
+ SparseBitVector<> *PointedToBy;
+      // Number of incoming edges, used during variable substitution to free
+      // the points-to sets early.
+ unsigned NumInEdges;
+ // True if our points-to set is in the Set2PEClass map
+ bool StoredInHash;
+ // True if our node has no indirect constraints (complex or otherwise)
+ bool Direct;
+ // True if the node is address taken, *or* it is part of a group of nodes
+ // that must be kept together. This is set to true for functions and
+ // their arg nodes, which must be kept at the same position relative to
+ // their base function node.
+ bool AddressTaken;
+
+ // Nodes in cycles (or in equivalence classes) are united together using a
+ // standard union-find representation with path compression. NodeRep
+ // gives the index into GraphNodes for the representative Node.
+ unsigned NodeRep;
+
+ // Modification timestamp. Assigned from Counter.
+ // Used for work list prioritization.
+ unsigned Timestamp;
+
+ explicit Node(bool direct = true) :
+ Val(0), Edges(0), PointsTo(0), OldPointsTo(0),
+ PointerEquivLabel(0), LocationEquivLabel(0), PredEdges(0),
+ ImplicitPredEdges(0), PointedToBy(0), NumInEdges(0),
+ StoredInHash(false), Direct(direct), AddressTaken(false),
+ NodeRep(SelfRep), Timestamp(0) { }
+
+ Node *setValue(Value *V) {
+ assert(Val == 0 && "Value already set for this node!");
+ Val = V;
+ return this;
+ }
+
+ /// getValue - Return the LLVM value corresponding to this node.
+ ///
+ Value *getValue() const { return Val; }
+
+ /// addPointerTo - Add a pointer to the list of pointees of this node,
+ /// returning true if this caused a new pointer to be added, or false if
+ /// we already knew about the points-to relation.
+ bool addPointerTo(unsigned Node) {
+ return PointsTo->test_and_set(Node);
+ }
+
+ /// intersects - Return true if the points-to set of this node intersects
+ /// with the points-to set of the specified node.
+ bool intersects(Node *N) const;
+
+ /// intersectsIgnoring - Return true if the points-to set of this node
+ /// intersects with the points-to set of the specified node on any nodes
+ /// except for the specified node to ignore.
+ bool intersectsIgnoring(Node *N, unsigned) const;
+
+ // Timestamp a node (used for work list prioritization)
+ void Stamp() {
+ Timestamp = Counter++;
+ }
+
+ bool isRep() const {
+ return( (int) NodeRep < 0 );
+ }
+ };
+
+ struct WorkListElement {
+ Node* node;
+ unsigned Timestamp;
+ WorkListElement(Node* n, unsigned t) : node(n), Timestamp(t) {}
+
+      // Note that we reverse the sense of the comparison because we
+      // actually want to give low timestamps priority over high ones;
+      // a priority queue normally treats the greatest element as having
+      // the highest priority.
+ bool operator<(const WorkListElement& that) const {
+ return( this->Timestamp > that.Timestamp );
+ }
+ };
+
+ // Priority-queue based work list specialized for Nodes.
+ class WorkList {
+ std::priority_queue<WorkListElement> Q;
+
+ public:
+ void insert(Node* n) {
+ Q.push( WorkListElement(n, n->Timestamp) );
+ }
+
+ // We automatically discard non-representative nodes and nodes
+ // that were in the work list twice (we keep a copy of the
+ // timestamp in the work list so we can detect this situation by
+ // comparing against the node's current timestamp).
+ Node* pop() {
+ while( !Q.empty() ) {
+ WorkListElement x = Q.top(); Q.pop();
+ Node* INode = x.node;
+
+ if( INode->isRep() &&
+ INode->Timestamp == x.Timestamp ) {
+ return(x.node);
+ }
+ }
+ return(0);
+ }
+
+ bool empty() {
+ return Q.empty();
+ }
+ };
+
+    /// GraphNodes - This vector is populated during the object identification
+    /// stage of the analysis, which adds a node for each memory object and
+    /// fills in the ValueNodes map.
+ std::vector<Node> GraphNodes;
+
+ /// ValueNodes - This map indicates the Node that a particular Value* is
+ /// represented by. This contains entries for all pointers.
+ DenseMap<Value*, unsigned> ValueNodes;
+
+ /// ObjectNodes - This map contains entries for each memory object in the
+    /// program: globals, allocas, and mallocs.
+ DenseMap<Value*, unsigned> ObjectNodes;
+
+ /// ReturnNodes - This map contains an entry for each function in the
+ /// program that returns a value.
+ DenseMap<Function*, unsigned> ReturnNodes;
+
+ /// VarargNodes - This map contains the entry used to represent all pointers
+ /// passed through the varargs portion of a function call for a particular
+ /// function. An entry is not present in this map for functions that do not
+ /// take variable arguments.
+ DenseMap<Function*, unsigned> VarargNodes;
+
+
+ /// Constraints - This vector contains a list of all of the constraints
+ /// identified by the program.
+ std::vector<Constraint> Constraints;
+
+ // Map from graph node to maximum K value that is allowed (for functions,
+ // this is equivalent to the number of arguments + CallFirstArgPos)
+ std::map<unsigned, unsigned> MaxK;
+
+ /// This enum defines the GraphNodes indices that correspond to important
+ /// fixed sets.
+ enum {
+ UniversalSet = 0,
+ NullPtr = 1,
+ NullObject = 2,
+ NumberSpecialNodes
+ };
+    // Stack for Tarjan's SCC algorithm.
+ std::stack<unsigned> SCCStack;
+ // Map from Graph Node to DFS number
+ std::vector<unsigned> Node2DFS;
+    // Map from graph node to whether it has been deleted from the graph.
+ std::vector<bool> Node2Deleted;
+ // Same as Node Maps, but implemented as std::map because it is faster to
+ // clear
+ std::map<unsigned, unsigned> Tarjan2DFS;
+ std::map<unsigned, bool> Tarjan2Deleted;
+ // Current DFS number
+ unsigned DFSNumber;
+
+ // Work lists.
+ WorkList w1, w2;
+ WorkList *CurrWL, *NextWL; // "current" and "next" work lists
+
+ // Offline variable substitution related things
+
+    // Temporary rep storage, used because we can't collapse SCCs in the
+    // predecessor graph by uniting the variables permanently; we can only
+    // do so for the successor graph.
+ std::vector<unsigned> VSSCCRep;
+ // Mapping from node to whether we have visited it during SCC finding yet.
+ std::vector<bool> Node2Visited;
+ // During variable substitution, we create unknowns to represent the unknown
+ // value that is a dereference of a variable. These nodes are known as
+ // "ref" nodes (since they represent the value of dereferences).
+ unsigned FirstRefNode;
+    // During HVN, we represent address-taken nodes as if they were
+    // unknown (since HVN, unlike HU, does not evaluate unions).
+ unsigned FirstAdrNode;
+ // Current pointer equivalence class number
+ unsigned PEClass;
+ // Mapping from points-to sets to equivalence classes
+ typedef DenseMap<SparseBitVector<> *, unsigned, BitmapKeyInfo> BitVectorMap;
+ BitVectorMap Set2PEClass;
+ // Mapping from pointer equivalences to the representative node. -1 if we
+ // have no representative node for this pointer equivalence class yet.
+ std::vector<int> PEClass2Node;
+ // Mapping from pointer equivalences to representative node. This includes
+ // pointer equivalent but not location equivalent variables. -1 if we have
+ // no representative node for this pointer equivalence class yet.
+ std::vector<int> PENLEClass2Node;
+ // Union/Find for HCD
+ std::vector<unsigned> HCDSCCRep;
+    // HCD's offline-detected cycles ("Statically DeTected").
+ // -1 if not part of such a cycle, otherwise a representative node.
+ std::vector<int> SDT;
+ // Whether to use SDT (UniteNodes can use it during solving, but not before)
+ bool SDTActive;
+
+ public:
+ static char ID;
+ Andersens() : ModulePass(&ID) {}
+
+ bool runOnModule(Module &M) {
+ InitializeAliasAnalysis(this);
+ IdentifyObjects(M);
+ CollectConstraints(M);
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa-constraints"
+ DEBUG(PrintConstraints());
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa"
+ SolveConstraints();
+ DEBUG(PrintPointsToGraph());
+
+ // Free the constraints list, as we don't need it to respond to alias
+ // requests.
+ std::vector<Constraint>().swap(Constraints);
+      // These are needed for Print() (-analyze in opt).
+ //ObjectNodes.clear();
+ //ReturnNodes.clear();
+ //VarargNodes.clear();
+ return false;
+ }
+
+ void releaseMemory() {
+ // FIXME: Until we have transitively required passes working correctly,
+ // this cannot be enabled! Otherwise, using -count-aa with the pass
+ // causes memory to be freed too early. :(
+#if 0
+      // The memory objects and ValueNodes data structures are the only ones
+      // that are still live after construction.
+ std::vector<Node>().swap(GraphNodes);
+ ValueNodes.clear();
+#endif
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AliasAnalysis::getAnalysisUsage(AU);
+ AU.setPreservesAll(); // Does not transform code
+ }
+
+ //------------------------------------------------
+ // Implement the AliasAnalysis API
+ //
+ AliasResult alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size);
+ virtual ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size);
+ virtual ModRefResult getModRefInfo(CallSite CS1, CallSite CS2);
+ void getMustAliases(Value *P, std::vector<Value*> &RetVals);
+ bool pointsToConstantMemory(const Value *P);
+
+ virtual void deleteValue(Value *V) {
+ ValueNodes.erase(V);
+ getAnalysis<AliasAnalysis>().deleteValue(V);
+ }
+
+ virtual void copyValue(Value *From, Value *To) {
+ ValueNodes[To] = ValueNodes[From];
+ getAnalysis<AliasAnalysis>().copyValue(From, To);
+ }
+
+ private:
+ /// getNode - Return the node corresponding to the specified pointer scalar.
+ ///
+ unsigned getNode(Value *V) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ if (!isa<GlobalValue>(C))
+ return getNodeForConstantPointer(C);
+
+ DenseMap<Value*, unsigned>::iterator I = ValueNodes.find(V);
+ if (I == ValueNodes.end()) {
+#ifndef NDEBUG
+ V->dump();
+#endif
+ assert(0 && "Value does not have a node in the points-to graph!");
+ }
+ return I->second;
+ }
+
+ /// getObject - Return the node corresponding to the memory object for the
+ /// specified global or allocation instruction.
+ unsigned getObject(Value *V) const {
+ DenseMap<Value*, unsigned>::iterator I = ObjectNodes.find(V);
+ assert(I != ObjectNodes.end() &&
+ "Value does not have an object in the points-to graph!");
+ return I->second;
+ }
+
+ /// getReturnNode - Return the node representing the return value for the
+ /// specified function.
+ unsigned getReturnNode(Function *F) const {
+ DenseMap<Function*, unsigned>::iterator I = ReturnNodes.find(F);
+ assert(I != ReturnNodes.end() && "Function does not return a value!");
+ return I->second;
+ }
+
+ /// getVarargNode - Return the node representing the variable arguments
+ /// formal for the specified function.
+ unsigned getVarargNode(Function *F) const {
+ DenseMap<Function*, unsigned>::iterator I = VarargNodes.find(F);
+ assert(I != VarargNodes.end() && "Function does not take var args!");
+ return I->second;
+ }
+
+ /// getNodeValue - Get the node for the specified LLVM value and set the
+ /// value for it to be the specified value.
+ unsigned getNodeValue(Value &V) {
+ unsigned Index = getNode(&V);
+ GraphNodes[Index].setValue(&V);
+ return Index;
+ }
+
+ unsigned UniteNodes(unsigned First, unsigned Second,
+ bool UnionByRank = true);
+ unsigned FindNode(unsigned Node);
+ unsigned FindNode(unsigned Node) const;
+
+ void IdentifyObjects(Module &M);
+ void CollectConstraints(Module &M);
+ bool AnalyzeUsesOfFunction(Value *);
+ void CreateConstraintGraph();
+ void OptimizeConstraints();
+ unsigned FindEquivalentNode(unsigned, unsigned);
+ void ClumpAddressTaken();
+ void RewriteConstraints();
+ void HU();
+ void HVN();
+ void HCD();
+ void Search(unsigned Node);
+ void UnitePointerEquivalences();
+ void SolveConstraints();
+ bool QueryNode(unsigned Node);
+ void Condense(unsigned Node);
+ void HUValNum(unsigned Node);
+ void HVNValNum(unsigned Node);
+ unsigned getNodeForConstantPointer(Constant *C);
+ unsigned getNodeForConstantPointerTarget(Constant *C);
+ void AddGlobalInitializerConstraints(unsigned, Constant *C);
+
+ void AddConstraintsForNonInternalLinkage(Function *F);
+ void AddConstraintsForCall(CallSite CS, Function *F);
+ bool AddConstraintsForExternalCall(CallSite CS, Function *F);
+
+
+ void PrintNode(const Node *N) const;
+ void PrintConstraints() const ;
+ void PrintConstraint(const Constraint &) const;
+ void PrintLabels() const;
+ void PrintPointsToGraph() const;
+
+ //===------------------------------------------------------------------===//
+ // Instruction visitation methods for adding constraints
+ //
+ friend class InstVisitor<Andersens>;
+ void visitReturnInst(ReturnInst &RI);
+ void visitInvokeInst(InvokeInst &II) { visitCallSite(CallSite(&II)); }
+ void visitCallInst(CallInst &CI) { visitCallSite(CallSite(&CI)); }
+ void visitCallSite(CallSite CS);
+ void visitAllocationInst(AllocationInst &AI);
+ void visitLoadInst(LoadInst &LI);
+ void visitStoreInst(StoreInst &SI);
+ void visitGetElementPtrInst(GetElementPtrInst &GEP);
+ void visitPHINode(PHINode &PN);
+ void visitCastInst(CastInst &CI);
+ void visitICmpInst(ICmpInst &ICI) {} // NOOP!
+ void visitFCmpInst(FCmpInst &ICI) {} // NOOP!
+ void visitSelectInst(SelectInst &SI);
+ void visitVAArg(VAArgInst &I);
+ void visitInstruction(Instruction &I);
+
+ //===------------------------------------------------------------------===//
+    // Implement the Analyze interface
+ //
+ void print(std::ostream &O, const Module* M) const {
+ PrintPointsToGraph();
+ }
+ };
+}
+
+char Andersens::ID = 0;
+static RegisterPass<Andersens>
+X("anders-aa", "Andersen's Interprocedural Alias Analysis", false, true);
+static RegisterAnalysisGroup<AliasAnalysis> Y(X);
+
+// Initialize Timestamp Counter (static).
+unsigned Andersens::Node::Counter = 0;
+
+ModulePass *llvm::createAndersensPass() { return new Andersens(); }
+
+//===----------------------------------------------------------------------===//
+// AliasAnalysis Interface Implementation
+//===----------------------------------------------------------------------===//
+
+AliasAnalysis::AliasResult Andersens::alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size) {
+ Node *N1 = &GraphNodes[FindNode(getNode(const_cast<Value*>(V1)))];
+ Node *N2 = &GraphNodes[FindNode(getNode(const_cast<Value*>(V2)))];
+
+ // Check to see if the two pointers are known to not alias. They don't alias
+ // if their points-to sets do not intersect.
+ if (!N1->intersectsIgnoring(N2, NullObject))
+ return NoAlias;
+
+ return AliasAnalysis::alias(V1, V1Size, V2, V2Size);
+}
+
+AliasAnalysis::ModRefResult
+Andersens::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+  // The only useful mod/ref information we can contribute is for calls to
+  // external functions: if we know that memory never escapes from the
+  // program, it cannot be modified by an external call.
+ //
+ // NOTE: This is not really safe, at least not when the entire program is not
+ // available. The deal is that the external function could call back into the
+ // program and modify stuff. We ignore this technical niggle for now. This
+ // is, after all, a "research quality" implementation of Andersen's analysis.
+ if (Function *F = CS.getCalledFunction())
+ if (F->isDeclaration()) {
+ Node *N1 = &GraphNodes[FindNode(getNode(P))];
+
+ if (N1->PointsTo->empty())
+ return NoModRef;
+#if FULL_UNIVERSAL
+ if (!UniversalSet->PointsTo->test(FindNode(getNode(P))))
+ return NoModRef; // Universal set does not contain P
+#else
+ if (!N1->PointsTo->test(UniversalSet))
+ return NoModRef; // P doesn't point to the universal set.
+#endif
+ }
+
+ return AliasAnalysis::getModRefInfo(CS, P, Size);
+}
+
+AliasAnalysis::ModRefResult
+Andersens::getModRefInfo(CallSite CS1, CallSite CS2) {
+ return AliasAnalysis::getModRefInfo(CS1,CS2);
+}
+
+/// getMustAliases - We can provide must-alias information if we know that a
+/// pointer can only point to a specific function or the null pointer.
+/// Unfortunately we cannot determine must-alias information for global
+/// variables or any other memory objects because we do not track whether
+/// a pointer points to the beginning of an object or a field of it.
+void Andersens::getMustAliases(Value *P, std::vector<Value*> &RetVals) {
+ Node *N = &GraphNodes[FindNode(getNode(P))];
+ if (N->PointsTo->count() == 1) {
+ Node *Pointee = &GraphNodes[N->PointsTo->find_first()];
+ // If a function is the only object in the points-to set, then it must be
+ // the destination. Note that we can't handle global variables here,
+ // because we don't know if the pointer is actually pointing to a field of
+ // the global or to the beginning of it.
+ if (Value *V = Pointee->getValue()) {
+ if (Function *F = dyn_cast<Function>(V))
+ RetVals.push_back(F);
+ } else {
+ // If the object in the points-to set is the null object, then the null
+ // pointer is a must alias.
+ if (Pointee == &GraphNodes[NullObject])
+ RetVals.push_back(Constant::getNullValue(P->getType()));
+ }
+ }
+ AliasAnalysis::getMustAliases(P, RetVals);
+}
+
+/// pointsToConstantMemory - If we can determine that this pointer only points
+/// to constant memory, return true. In practice, this means that if the
+/// pointer can only point to constant globals, functions, or the null pointer,
+/// return true.
+///
+bool Andersens::pointsToConstantMemory(const Value *P) {
+ Node *N = &GraphNodes[FindNode(getNode(const_cast<Value*>(P)))];
+ unsigned i;
+
+ for (SparseBitVector<>::iterator bi = N->PointsTo->begin();
+ bi != N->PointsTo->end();
+ ++bi) {
+ i = *bi;
+ Node *Pointee = &GraphNodes[i];
+ if (Value *V = Pointee->getValue()) {
+ if (!isa<GlobalValue>(V) || (isa<GlobalVariable>(V) &&
+ !cast<GlobalVariable>(V)->isConstant()))
+ return AliasAnalysis::pointsToConstantMemory(P);
+ } else {
+ if (i != NullObject)
+ return AliasAnalysis::pointsToConstantMemory(P);
+ }
+ }
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Object Identification Phase
+//===----------------------------------------------------------------------===//
+
+/// IdentifyObjects - This stage scans the program, adding an entry to the
+/// GraphNodes list for each memory object in the program (global, stack, or
+/// heap), and populates the ValueNodes and ObjectNodes maps for these objects.
+///
+void Andersens::IdentifyObjects(Module &M) {
+ unsigned NumObjects = 0;
+
+ // Object #0 is always the universal set: the object that we don't know
+ // anything about.
+ assert(NumObjects == UniversalSet && "Something changed!");
+ ++NumObjects;
+
+ // Object #1 always represents the null pointer.
+ assert(NumObjects == NullPtr && "Something changed!");
+ ++NumObjects;
+
+ // Object #2 always represents the null object (the object pointed to by null)
+ assert(NumObjects == NullObject && "Something changed!");
+ ++NumObjects;
+
+ // Add all the globals first.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ ObjectNodes[I] = NumObjects++;
+ ValueNodes[I] = NumObjects++;
+ }
+
+ // Add nodes for all of the functions and the instructions inside of them.
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ // The function itself is a memory object.
+ unsigned First = NumObjects;
+ ValueNodes[F] = NumObjects++;
+ if (isa<PointerType>(F->getFunctionType()->getReturnType()))
+ ReturnNodes[F] = NumObjects++;
+ if (F->getFunctionType()->isVarArg())
+ VarargNodes[F] = NumObjects++;
+
+
+ // Add nodes for all of the incoming pointer arguments.
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I)
+ {
+ if (isa<PointerType>(I->getType()))
+ ValueNodes[I] = NumObjects++;
+ }
+ MaxK[First] = NumObjects - First;
+
+ // Scan the function body, creating a memory object for each heap/stack
+ // allocation in the body of the function and a node to represent all
+ // pointer values defined by instructions and used as operands.
+ for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+      // If this is a heap or stack allocation, create a node for the memory
+ // object.
+ if (isa<PointerType>(II->getType())) {
+ ValueNodes[&*II] = NumObjects++;
+ if (AllocationInst *AI = dyn_cast<AllocationInst>(&*II))
+ ObjectNodes[AI] = NumObjects++;
+ }
+
+ // Calls to inline asm need to be added as well because the callee isn't
+ // referenced anywhere else.
+ if (CallInst *CI = dyn_cast<CallInst>(&*II)) {
+ Value *Callee = CI->getCalledValue();
+ if (isa<InlineAsm>(Callee))
+ ValueNodes[Callee] = NumObjects++;
+ }
+ }
+ }
+
+ // Now that we know how many objects to create, make them all now!
+ GraphNodes.resize(NumObjects);
+ NumNodes += NumObjects;
+}
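+
+// For illustration, a module consisting only of
+//
+//   @G = global i32 0
+//   define i32* @F(i32* %p) { ... }
+//
+// is numbered: 0 = UniversalSet, 1 = NullPtr, 2 = NullObject,
+// 3 = object(@G), 4 = value(@G), 5 = value(@F), 6 = return(@F),
+// 7 = value(%p), followed by nodes for any pointer-valued instructions
+// (and allocation objects) in @F's body.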
+
+//===----------------------------------------------------------------------===//
+// Constraint Identification Phase
+//===----------------------------------------------------------------------===//
+
+/// getNodeForConstantPointer - Return the node corresponding to the constant
+/// pointer itself.
+unsigned Andersens::getNodeForConstantPointer(Constant *C) {
+ assert(isa<PointerType>(C->getType()) && "Not a constant pointer!");
+
+ if (isa<ConstantPointerNull>(C) || isa<UndefValue>(C))
+ return NullPtr;
+ else if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return getNode(GV);
+ else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ switch (CE->getOpcode()) {
+ case Instruction::GetElementPtr:
+ return getNodeForConstantPointer(CE->getOperand(0));
+ case Instruction::IntToPtr:
+ return UniversalSet;
+ case Instruction::BitCast:
+ return getNodeForConstantPointer(CE->getOperand(0));
+ default:
+ cerr << "Constant Expr not yet handled: " << *CE << "\n";
+ assert(0);
+ }
+ } else {
+ assert(0 && "Unknown constant pointer!");
+ }
+ return 0;
+}
+
+/// getNodeForConstantPointerTarget - Return the node POINTED TO by the
+/// specified constant pointer.
+unsigned Andersens::getNodeForConstantPointerTarget(Constant *C) {
+ assert(isa<PointerType>(C->getType()) && "Not a constant pointer!");
+
+ if (isa<ConstantPointerNull>(C))
+ return NullObject;
+ else if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return getObject(GV);
+ else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ switch (CE->getOpcode()) {
+ case Instruction::GetElementPtr:
+ return getNodeForConstantPointerTarget(CE->getOperand(0));
+ case Instruction::IntToPtr:
+ return UniversalSet;
+ case Instruction::BitCast:
+ return getNodeForConstantPointerTarget(CE->getOperand(0));
+ default:
+ cerr << "Constant Expr not yet handled: " << *CE << "\n";
+ assert(0);
+ }
+ } else {
+ assert(0 && "Unknown constant pointer!");
+ }
+ return 0;
+}
+
+/// AddGlobalInitializerConstraints - Add inclusion constraints for the memory
+/// object at index NodeIndex, which contains values indicated by C.
+void Andersens::AddGlobalInitializerConstraints(unsigned NodeIndex,
+ Constant *C) {
+ if (C->getType()->isSingleValueType()) {
+ if (isa<PointerType>(C->getType()))
+ Constraints.push_back(Constraint(Constraint::Copy, NodeIndex,
+ getNodeForConstantPointer(C)));
+ } else if (C->isNullValue()) {
+ Constraints.push_back(Constraint(Constraint::Copy, NodeIndex,
+ NullObject));
+ return;
+ } else if (!isa<UndefValue>(C)) {
+ // If this is an array or struct, include constraints for each element.
+ assert(isa<ConstantArray>(C) || isa<ConstantStruct>(C));
+ for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
+ AddGlobalInitializerConstraints(NodeIndex,
+ cast<Constant>(C->getOperand(i)));
+ }
+}
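+
+// A minimal sketch of what the routine above produces (hypothetical globals,
+// not from any particular module): for
+//   @X = global i32 4
+//   @G = global i32* @X
+// the initializer of @G is a single-value pointer, so we record roughly
+//   <Copy/<G memory>/X>
+// i.e. the memory for @G holds whatever the node for @X points to (the
+// <X memory> object, via @X's AddressOf constraint).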
+
+/// AddConstraintsForNonInternalLinkage - If this function does not have
+/// internal linkage, realize that we can't trust anything passed into or
+/// returned by this function.
+void Andersens::AddConstraintsForNonInternalLinkage(Function *F) {
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
+ if (isa<PointerType>(I->getType()))
+ // If this is an argument of an externally accessible function, the
+ // incoming pointer might point to anything.
+ Constraints.push_back(Constraint(Constraint::Copy, getNode(I),
+ UniversalSet));
+}
+
+/// AddConstraintsForExternalCall - If this is a call to a "known" external
+/// function, add the constraints and return true. If this is a call to an
+/// unknown function, return false.
+bool Andersens::AddConstraintsForExternalCall(CallSite CS, Function *F) {
+ assert(F->isDeclaration() && "Not an external function!");
+
+ // These functions don't induce any points-to constraints.
+ if (F->getName() == "atoi" || F->getName() == "atof" ||
+ F->getName() == "atol" || F->getName() == "atoll" ||
+ F->getName() == "remove" || F->getName() == "unlink" ||
+ F->getName() == "rename" || F->getName() == "memcmp" ||
+ F->getName() == "llvm.memset" ||
+ F->getName() == "strcmp" || F->getName() == "strncmp" ||
+ F->getName() == "execl" || F->getName() == "execlp" ||
+ F->getName() == "execle" || F->getName() == "execv" ||
+ F->getName() == "execvp" || F->getName() == "chmod" ||
+ F->getName() == "puts" || F->getName() == "write" ||
+ F->getName() == "open" || F->getName() == "create" ||
+ F->getName() == "truncate" || F->getName() == "chdir" ||
+ F->getName() == "mkdir" || F->getName() == "rmdir" ||
+ F->getName() == "read" || F->getName() == "pipe" ||
+ F->getName() == "wait" || F->getName() == "time" ||
+ F->getName() == "stat" || F->getName() == "fstat" ||
+ F->getName() == "lstat" || F->getName() == "strtod" ||
+ F->getName() == "strtof" || F->getName() == "strtold" ||
+ F->getName() == "fopen" || F->getName() == "fdopen" ||
+ F->getName() == "freopen" ||
+ F->getName() == "fflush" || F->getName() == "feof" ||
+ F->getName() == "fileno" || F->getName() == "clearerr" ||
+ F->getName() == "rewind" || F->getName() == "ftell" ||
+ F->getName() == "ferror" || F->getName() == "fgetc" ||
+ F->getName() == "fgetc" || F->getName() == "_IO_getc" ||
+ F->getName() == "fwrite" || F->getName() == "fread" ||
+ F->getName() == "fgets" || F->getName() == "ungetc" ||
+ F->getName() == "fputc" ||
+ F->getName() == "fputs" || F->getName() == "putc" ||
+ F->getName() == "ftell" || F->getName() == "rewind" ||
+ F->getName() == "_IO_putc" || F->getName() == "fseek" ||
+ F->getName() == "fgetpos" || F->getName() == "fsetpos" ||
+ F->getName() == "printf" || F->getName() == "fprintf" ||
+ F->getName() == "sprintf" || F->getName() == "vprintf" ||
+ F->getName() == "vfprintf" || F->getName() == "vsprintf" ||
+ F->getName() == "scanf" || F->getName() == "fscanf" ||
+ F->getName() == "sscanf" || F->getName() == "__assert_fail" ||
+ F->getName() == "modf")
+ return true;
+
+ // These functions do induce points-to edges.
+ if (F->getName() == "llvm.memcpy" ||
+ F->getName() == "llvm.memmove" ||
+ F->getName() == "memmove") {
+
+ const FunctionType *FTy = F->getFunctionType();
+ if (FTy->getNumParams() > 1 &&
+ isa<PointerType>(FTy->getParamType(0)) &&
+ isa<PointerType>(FTy->getParamType(1))) {
+
+ // *Dest = *Src, which requires an artificial graph node to represent the
+ // constraint. It is broken up into *Dest = temp, temp = *Src
+ unsigned FirstArg = getNode(CS.getArgument(0));
+ unsigned SecondArg = getNode(CS.getArgument(1));
+ unsigned TempArg = GraphNodes.size();
+ GraphNodes.push_back(Node());
+ Constraints.push_back(Constraint(Constraint::Store,
+ FirstArg, TempArg));
+ Constraints.push_back(Constraint(Constraint::Load,
+ TempArg, SecondArg));
+ // In addition, Dest = Src
+ Constraints.push_back(Constraint(Constraint::Copy,
+ FirstArg, SecondArg));
+ return true;
+ }
+ }
+
+ // Result = Arg0
+ if (F->getName() == "realloc" || F->getName() == "strchr" ||
+ F->getName() == "strrchr" || F->getName() == "strstr" ||
+ F->getName() == "strtok") {
+ const FunctionType *FTy = F->getFunctionType();
+ if (FTy->getNumParams() > 0 &&
+ isa<PointerType>(FTy->getParamType(0))) {
+ Constraints.push_back(Constraint(Constraint::Copy,
+ getNode(CS.getInstruction()),
+ getNode(CS.getArgument(0))));
+ return true;
+ }
+ }
+
+ return false;
+}
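+
+// A minimal sketch of the llvm.memcpy handling above (hypothetical call, not
+// from a real module): for
+//   call void @llvm.memcpy(i8* %d, i8* %s, i64 %n, i32 1)
+// we emit, with T the fresh artificial node:
+//   <Store/%d/T>, <Load/T/%s>, <Copy/%d/%s>
+// i.e. *%d = T, T = *%s, and %d = %s.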
+
+/// AnalyzeUsesOfFunction - Look at all of the users of the specified function.
+/// If this is used by anything complex (i.e., the address escapes), return
+/// true.
+bool Andersens::AnalyzeUsesOfFunction(Value *V) {
+
+ if (!isa<PointerType>(V->getType())) return true;
+
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+    if (isa<LoadInst>(*UI)) {
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+ if (V == SI->getOperand(1)) {
+ return false;
+ } else if (SI->getOperand(1)) {
+ return true; // Storing the pointer
+ }
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) {
+ if (AnalyzeUsesOfFunction(GEP)) return true;
+ } else if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+      // Make sure that this is just the function being called, not that it is
+      // being passed into the function.
+ for (unsigned i = 1, e = CI->getNumOperands(); i != e; ++i)
+ if (CI->getOperand(i) == V) return true;
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) {
+      // Make sure that this is just the function being called, not that it is
+      // being passed into the function.
+ for (unsigned i = 3, e = II->getNumOperands(); i != e; ++i)
+ if (II->getOperand(i) == V) return true;
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(*UI)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr ||
+ CE->getOpcode() == Instruction::BitCast) {
+ if (AnalyzeUsesOfFunction(CE))
+ return true;
+ } else {
+ return true;
+ }
+ } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(*UI)) {
+ if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+ return true; // Allow comparison against null.
+    } else if (isa<FreeInst>(*UI)) {
+ return false;
+ } else {
+ return true;
+ }
+ return false;
+}
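+
+// Sketch of the intent above (hypothetical uses of a function @f):
+//   %c = icmp eq void()* @f, null        ; comparison against null: no escape
+//   call void @f()                       ; plain call of @f: no escape
+//   store void()* @f, void()** %slot     ; @f's address is stored: escapes
+// Escaping functions are then treated like externally visible ones in
+// CollectConstraints().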
+
+/// CollectConstraints - This stage scans the program, adding a constraint to
+/// the Constraints list for each instruction in the program that induces a
+/// constraint, and setting up the initial points-to graph.
+///
+void Andersens::CollectConstraints(Module &M) {
+ // First, the universal set points to itself.
+ Constraints.push_back(Constraint(Constraint::AddressOf, UniversalSet,
+ UniversalSet));
+ Constraints.push_back(Constraint(Constraint::Store, UniversalSet,
+ UniversalSet));
+
+ // Next, the null pointer points to the null object.
+ Constraints.push_back(Constraint(Constraint::AddressOf, NullPtr, NullObject));
+
+ // Next, add any constraints on global variables and their initializers.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ // Associate the address of the global object as pointing to the memory for
+ // the global: &G = <G memory>
+ unsigned ObjectIndex = getObject(I);
+ Node *Object = &GraphNodes[ObjectIndex];
+ Object->setValue(I);
+ Constraints.push_back(Constraint(Constraint::AddressOf, getNodeValue(*I),
+ ObjectIndex));
+
+ if (I->hasInitializer()) {
+ AddGlobalInitializerConstraints(ObjectIndex, I->getInitializer());
+ } else {
+ // If it doesn't have an initializer (i.e. it's defined in another
+ // translation unit), it points to the universal set.
+ Constraints.push_back(Constraint(Constraint::Copy, ObjectIndex,
+ UniversalSet));
+ }
+ }
+
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ // Set up the return value node.
+ if (isa<PointerType>(F->getFunctionType()->getReturnType()))
+ GraphNodes[getReturnNode(F)].setValue(F);
+ if (F->getFunctionType()->isVarArg())
+ GraphNodes[getVarargNode(F)].setValue(F);
+
+ // Set up incoming argument nodes.
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I)
+ if (isa<PointerType>(I->getType()))
+ getNodeValue(*I);
+
+ // At some point we should just add constraints for the escaping functions
+ // at solve time, but this slows down solving. For now, we simply mark
+    // address-taken functions as escaping and treat them as external.
+ if (!F->hasLocalLinkage() || AnalyzeUsesOfFunction(F))
+ AddConstraintsForNonInternalLinkage(F);
+
+ if (!F->isDeclaration()) {
+ // Scan the function body, creating a memory object for each heap/stack
+ // allocation in the body of the function and a node to represent all
+ // pointer values defined by instructions and used as operands.
+ visit(F);
+ } else {
+ // External functions that return pointers return the universal set.
+ if (isa<PointerType>(F->getFunctionType()->getReturnType()))
+ Constraints.push_back(Constraint(Constraint::Copy,
+ getReturnNode(F),
+ UniversalSet));
+
+ // Any pointers that are passed into the function have the universal set
+ // stored into them.
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I)
+ if (isa<PointerType>(I->getType())) {
+ // Pointers passed into external functions could have anything stored
+ // through them.
+ Constraints.push_back(Constraint(Constraint::Store, getNode(I),
+ UniversalSet));
+ // Memory objects passed into external function calls can have the
+ // universal set point to them.
+#if FULL_UNIVERSAL
+ Constraints.push_back(Constraint(Constraint::Copy,
+ UniversalSet,
+ getNode(I)));
+#else
+ Constraints.push_back(Constraint(Constraint::Copy,
+ getNode(I),
+ UniversalSet));
+#endif
+ }
+
+ // If this is an external varargs function, it can also store pointers
+ // into any pointers passed through the varargs section.
+ if (F->getFunctionType()->isVarArg())
+ Constraints.push_back(Constraint(Constraint::Store, getVarargNode(F),
+ UniversalSet));
+ }
+ }
+ NumConstraints += Constraints.size();
+}
+
+
+void Andersens::visitInstruction(Instruction &I) {
+#ifdef NDEBUG
+ return; // This function is just a big assert.
+#endif
+ if (isa<BinaryOperator>(I))
+ return;
+ // Most instructions don't have any effect on pointer values.
+ switch (I.getOpcode()) {
+ case Instruction::Br:
+ case Instruction::Switch:
+ case Instruction::Unwind:
+ case Instruction::Unreachable:
+ case Instruction::Free:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return;
+ default:
+ // Is this something we aren't handling yet?
+ cerr << "Unknown instruction: " << I;
+ abort();
+ }
+}
+
+void Andersens::visitAllocationInst(AllocationInst &AI) {
+ unsigned ObjectIndex = getObject(&AI);
+ GraphNodes[ObjectIndex].setValue(&AI);
+ Constraints.push_back(Constraint(Constraint::AddressOf, getNodeValue(AI),
+ ObjectIndex));
+}
+
+void Andersens::visitReturnInst(ReturnInst &RI) {
+ if (RI.getNumOperands() && isa<PointerType>(RI.getOperand(0)->getType()))
+ // return V --> <Copy/retval{F}/v>
+ Constraints.push_back(Constraint(Constraint::Copy,
+ getReturnNode(RI.getParent()->getParent()),
+ getNode(RI.getOperand(0))));
+}
+
+void Andersens::visitLoadInst(LoadInst &LI) {
+ if (isa<PointerType>(LI.getType()))
+ // P1 = load P2 --> <Load/P1/P2>
+ Constraints.push_back(Constraint(Constraint::Load, getNodeValue(LI),
+ getNode(LI.getOperand(0))));
+}
+
+void Andersens::visitStoreInst(StoreInst &SI) {
+ if (isa<PointerType>(SI.getOperand(0)->getType()))
+ // store P1, P2 --> <Store/P2/P1>
+ Constraints.push_back(Constraint(Constraint::Store,
+ getNode(SI.getOperand(1)),
+ getNode(SI.getOperand(0))));
+}
+
+void Andersens::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ // P1 = getelementptr P2, ... --> <Copy/P1/P2>
+ Constraints.push_back(Constraint(Constraint::Copy, getNodeValue(GEP),
+ getNode(GEP.getOperand(0))));
+}
+
+void Andersens::visitPHINode(PHINode &PN) {
+ if (isa<PointerType>(PN.getType())) {
+ unsigned PNN = getNodeValue(PN);
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ // P1 = phi P2, P3 --> <Copy/P1/P2>, <Copy/P1/P3>, ...
+ Constraints.push_back(Constraint(Constraint::Copy, PNN,
+ getNode(PN.getIncomingValue(i))));
+ }
+}
+
+void Andersens::visitCastInst(CastInst &CI) {
+ Value *Op = CI.getOperand(0);
+ if (isa<PointerType>(CI.getType())) {
+ if (isa<PointerType>(Op->getType())) {
+ // P1 = cast P2 --> <Copy/P1/P2>
+ Constraints.push_back(Constraint(Constraint::Copy, getNodeValue(CI),
+ getNode(CI.getOperand(0))));
+ } else {
+ // P1 = cast int --> <Copy/P1/Univ>
+#if 0
+ Constraints.push_back(Constraint(Constraint::Copy, getNodeValue(CI),
+ UniversalSet));
+#else
+ getNodeValue(CI);
+#endif
+ }
+ } else if (isa<PointerType>(Op->getType())) {
+ // int = cast P1 --> <Copy/Univ/P1>
+#if 0
+ Constraints.push_back(Constraint(Constraint::Copy,
+ UniversalSet,
+ getNode(CI.getOperand(0))));
+#else
+ getNode(CI.getOperand(0));
+#endif
+ }
+}
+
+void Andersens::visitSelectInst(SelectInst &SI) {
+ if (isa<PointerType>(SI.getType())) {
+ unsigned SIN = getNodeValue(SI);
+ // P1 = select C, P2, P3 ---> <Copy/P1/P2>, <Copy/P1/P3>
+ Constraints.push_back(Constraint(Constraint::Copy, SIN,
+ getNode(SI.getOperand(1))));
+ Constraints.push_back(Constraint(Constraint::Copy, SIN,
+ getNode(SI.getOperand(2))));
+ }
+}
+
+void Andersens::visitVAArg(VAArgInst &I) {
+ assert(0 && "vaarg not handled yet!");
+}
+
+/// AddConstraintsForCall - Add constraints for a call with actual arguments
+/// specified by CS to the function specified by F. Note that the types of
+/// arguments might not match up in the case where this is an indirect call and
+/// the function pointer has been cast. If this is the case, do something
+/// reasonable.
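+///
+/// As a rough sketch (illustrative, and ignoring the FULL_UNIVERSAL variants
+/// handled below): a direct call "%r = call i32* @g(i32* %a)" yields
+/// <Copy/%r/ret{g}> and <Copy/formal-of-g/%a>, while an indirect call through
+/// %fp expresses the same facts as offset constraints from the callee node: a
+/// Load at offset CallReturnPos for the return value, and Stores at offsets
+/// starting at CallFirstArgPos for the arguments.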
+void Andersens::AddConstraintsForCall(CallSite CS, Function *F) {
+ Value *CallValue = CS.getCalledValue();
+ bool IsDeref = F == NULL;
+
+ // If this is a call to an external function, try to handle it directly to get
+ // some taste of context sensitivity.
+ if (F && F->isDeclaration() && AddConstraintsForExternalCall(CS, F))
+ return;
+
+ if (isa<PointerType>(CS.getType())) {
+ unsigned CSN = getNode(CS.getInstruction());
+ if (!F || isa<PointerType>(F->getFunctionType()->getReturnType())) {
+ if (IsDeref)
+ Constraints.push_back(Constraint(Constraint::Load, CSN,
+ getNode(CallValue), CallReturnPos));
+ else
+ Constraints.push_back(Constraint(Constraint::Copy, CSN,
+ getNode(CallValue) + CallReturnPos));
+ } else {
+ // If the function returns a non-pointer value, handle this just like we
+ // treat a nonpointer cast to pointer.
+ Constraints.push_back(Constraint(Constraint::Copy, CSN,
+ UniversalSet));
+ }
+ } else if (F && isa<PointerType>(F->getFunctionType()->getReturnType())) {
+#if FULL_UNIVERSAL
+ Constraints.push_back(Constraint(Constraint::Copy,
+ UniversalSet,
+ getNode(CallValue) + CallReturnPos));
+#else
+ Constraints.push_back(Constraint(Constraint::Copy,
+ getNode(CallValue) + CallReturnPos,
+ UniversalSet));
+#endif
+ }
+
+ CallSite::arg_iterator ArgI = CS.arg_begin(), ArgE = CS.arg_end();
+ bool external = !F || F->isDeclaration();
+ if (F) {
+ // Direct Call
+ Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ for (; AI != AE && ArgI != ArgE; ++AI, ++ArgI)
+ {
+#if !FULL_UNIVERSAL
+ if (external && isa<PointerType>((*ArgI)->getType()))
+ {
+ // Add constraint that ArgI can now point to anything due to
+ // escaping, as can everything it points to. The second portion of
+ // this should be taken care of by universal = *universal
+ Constraints.push_back(Constraint(Constraint::Copy,
+ getNode(*ArgI),
+ UniversalSet));
+ }
+#endif
+ if (isa<PointerType>(AI->getType())) {
+ if (isa<PointerType>((*ArgI)->getType())) {
+ // Copy the actual argument into the formal argument.
+ Constraints.push_back(Constraint(Constraint::Copy, getNode(AI),
+ getNode(*ArgI)));
+ } else {
+ Constraints.push_back(Constraint(Constraint::Copy, getNode(AI),
+ UniversalSet));
+ }
+ } else if (isa<PointerType>((*ArgI)->getType())) {
+#if FULL_UNIVERSAL
+ Constraints.push_back(Constraint(Constraint::Copy,
+ UniversalSet,
+ getNode(*ArgI)));
+#else
+ Constraints.push_back(Constraint(Constraint::Copy,
+ getNode(*ArgI),
+ UniversalSet));
+#endif
+ }
+ }
+ } else {
+    // Indirect Call
+ unsigned ArgPos = CallFirstArgPos;
+ for (; ArgI != ArgE; ++ArgI) {
+ if (isa<PointerType>((*ArgI)->getType())) {
+ // Copy the actual argument into the formal argument.
+ Constraints.push_back(Constraint(Constraint::Store,
+ getNode(CallValue),
+ getNode(*ArgI), ArgPos++));
+ } else {
+ Constraints.push_back(Constraint(Constraint::Store,
+                                         getNode(CallValue),
+ UniversalSet, ArgPos++));
+ }
+ }
+ }
+ // Copy all pointers passed through the varargs section to the varargs node.
+ if (F && F->getFunctionType()->isVarArg())
+ for (; ArgI != ArgE; ++ArgI)
+ if (isa<PointerType>((*ArgI)->getType()))
+ Constraints.push_back(Constraint(Constraint::Copy, getVarargNode(F),
+ getNode(*ArgI)));
+ // If more arguments are passed in than we track, just drop them on the floor.
+}
+
+void Andersens::visitCallSite(CallSite CS) {
+ if (isa<PointerType>(CS.getType()))
+ getNodeValue(*CS.getInstruction());
+
+ if (Function *F = CS.getCalledFunction()) {
+ AddConstraintsForCall(CS, F);
+ } else {
+ AddConstraintsForCall(CS, NULL);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Constraint Solving Phase
+//===----------------------------------------------------------------------===//
+
+/// intersects - Return true if the points-to set of this node intersects
+/// with the points-to set of the specified node.
+bool Andersens::Node::intersects(Node *N) const {
+ return PointsTo->intersects(N->PointsTo);
+}
+
+/// intersectsIgnoring - Return true if the points-to set of this node
+/// intersects with the points-to set of the specified node on any nodes
+/// except for the specified node to ignore.
+bool Andersens::Node::intersectsIgnoring(Node *N, unsigned Ignoring) const {
+ // TODO: If we are only going to call this with the same value for Ignoring,
+ // we should move the special values out of the points-to bitmap.
+ bool WeHadIt = PointsTo->test(Ignoring);
+ bool NHadIt = N->PointsTo->test(Ignoring);
+ bool Result = false;
+ if (WeHadIt)
+ PointsTo->reset(Ignoring);
+ if (NHadIt)
+ N->PointsTo->reset(Ignoring);
+ Result = PointsTo->intersects(N->PointsTo);
+ if (WeHadIt)
+ PointsTo->set(Ignoring);
+ if (NHadIt)
+ N->PointsTo->set(Ignoring);
+ return Result;
+}
+
+void dumpToDOUT(SparseBitVector<> *bitmap) {
+#ifndef NDEBUG
+ dump(*bitmap, DOUT);
+#endif
+}
+
+
+/// ClumpAddressTaken - Clump together address-taken variables so that the
+/// points-to sets use up less space and can be operated on faster.
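+///
+/// Rough intuition (a sketch of the motivation, not a semantic change):
+/// points-to sets are SparseBitVectors over node indices, and only
+/// address-taken nodes can ever be members of them, so packing those nodes
+/// into one contiguous low-numbered range keeps the bitmaps small and their
+/// unions and intersections cheap. The renumbering is a pure permutation.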
+void Andersens::ClumpAddressTaken() {
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa-renumber"
+ std::vector<unsigned> Translate;
+ std::vector<Node> NewGraphNodes;
+
+ Translate.resize(GraphNodes.size());
+ unsigned NewPos = 0;
+
+ for (unsigned i = 0; i < Constraints.size(); ++i) {
+ Constraint &C = Constraints[i];
+ if (C.Type == Constraint::AddressOf) {
+ GraphNodes[C.Src].AddressTaken = true;
+ }
+ }
+ for (unsigned i = 0; i < NumberSpecialNodes; ++i) {
+ unsigned Pos = NewPos++;
+ Translate[i] = Pos;
+ NewGraphNodes.push_back(GraphNodes[i]);
+ DOUT << "Renumbering node " << i << " to node " << Pos << "\n";
+ }
+
+ // I believe this ends up being faster than making two vectors and splicing
+ // them.
+ for (unsigned i = NumberSpecialNodes; i < GraphNodes.size(); ++i) {
+ if (GraphNodes[i].AddressTaken) {
+ unsigned Pos = NewPos++;
+ Translate[i] = Pos;
+ NewGraphNodes.push_back(GraphNodes[i]);
+ DOUT << "Renumbering node " << i << " to node " << Pos << "\n";
+ }
+ }
+
+ for (unsigned i = NumberSpecialNodes; i < GraphNodes.size(); ++i) {
+ if (!GraphNodes[i].AddressTaken) {
+ unsigned Pos = NewPos++;
+ Translate[i] = Pos;
+ NewGraphNodes.push_back(GraphNodes[i]);
+ DOUT << "Renumbering node " << i << " to node " << Pos << "\n";
+ }
+ }
+
+ for (DenseMap<Value*, unsigned>::iterator Iter = ValueNodes.begin();
+ Iter != ValueNodes.end();
+ ++Iter)
+ Iter->second = Translate[Iter->second];
+
+ for (DenseMap<Value*, unsigned>::iterator Iter = ObjectNodes.begin();
+ Iter != ObjectNodes.end();
+ ++Iter)
+ Iter->second = Translate[Iter->second];
+
+ for (DenseMap<Function*, unsigned>::iterator Iter = ReturnNodes.begin();
+ Iter != ReturnNodes.end();
+ ++Iter)
+ Iter->second = Translate[Iter->second];
+
+ for (DenseMap<Function*, unsigned>::iterator Iter = VarargNodes.begin();
+ Iter != VarargNodes.end();
+ ++Iter)
+ Iter->second = Translate[Iter->second];
+
+ for (unsigned i = 0; i < Constraints.size(); ++i) {
+ Constraint &C = Constraints[i];
+ C.Src = Translate[C.Src];
+ C.Dest = Translate[C.Dest];
+ }
+
+ GraphNodes.swap(NewGraphNodes);
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa"
+}
+
+/// The technique used here is described in "Exploiting Pointer and Location
+/// Equivalence to Optimize Pointer Analysis. In the 14th International Static
+/// Analysis Symposium (SAS), August 2007." It is known as the "HVN" algorithm,
+/// and is equivalent to value numbering the collapsed constraint graph without
+/// evaluating unions. This is used as a pre-pass to HU in order to resolve
+/// first-order pointer dereferences and speed up/reduce memory usage of HU.
+/// Running both is equivalent to HRU without the iteration.
+/// HVN in more detail:
+/// Imagine the set of constraints was simply straight line code with no loops
+/// (we eliminate cycles, so there are no loops), such as:
+/// E = &D
+/// E = &C
+/// E = F
+/// F = G
+/// G = F
+/// Applying value numbering to this code tells us:
+/// G == F == E
+///
+/// For HVN, this is as far as it goes. We assign new value numbers to every
+/// "address node", and every "reference node".
+/// To get the optimal result for this, we use a DFS + SCC (since all nodes in a
+/// cycle must have the same value number since the = operation is really
+/// inclusion, not overwrite), and value number the nodes we receive points-to
+/// sets from before we value-number our own node.
+/// The advantage of HU over HVN is that HU considers the inclusion property, so
+/// that if you have
+/// E = &D
+/// E = &C
+/// E = F
+/// F = G
+/// F = &D
+/// G = F
+/// HU will determine that G == F == E. HVN will not, because it cannot prove
+/// that the points-to information ends up being the same because they all
+/// receive &D from E anyway.
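+///
+/// The labels computed here feed RewriteConstraints(): nodes that receive the
+/// same non-zero pointer equivalence label are united there, and a label of 0
+/// means "provably points to nothing", so constraints mentioning such nodes
+/// can be dropped entirely.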
+
+void Andersens::HVN() {
+ DOUT << "Beginning HVN\n";
+ // Build a predecessor graph. This is like our constraint graph with the
+ // edges going in the opposite direction, and there are edges for all the
+  // constraints, instead of just copy constraints. We also build implicit
+  // edges for constraints that are implied but not explicit. E.g., for the
+  // constraint a = &b, we add the implicit edge *a = b. This helps us capture
+  // more cycles.
+ for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+ Constraint &C = Constraints[i];
+ if (C.Type == Constraint::AddressOf) {
+ GraphNodes[C.Src].AddressTaken = true;
+ GraphNodes[C.Src].Direct = false;
+
+ // Dest = &src edge
+ unsigned AdrNode = C.Src + FirstAdrNode;
+ if (!GraphNodes[C.Dest].PredEdges)
+ GraphNodes[C.Dest].PredEdges = new SparseBitVector<>;
+ GraphNodes[C.Dest].PredEdges->set(AdrNode);
+
+ // *Dest = src edge
+ unsigned RefNode = C.Dest + FirstRefNode;
+ if (!GraphNodes[RefNode].ImplicitPredEdges)
+ GraphNodes[RefNode].ImplicitPredEdges = new SparseBitVector<>;
+ GraphNodes[RefNode].ImplicitPredEdges->set(C.Src);
+ } else if (C.Type == Constraint::Load) {
+ if (C.Offset == 0) {
+ // dest = *src edge
+ if (!GraphNodes[C.Dest].PredEdges)
+ GraphNodes[C.Dest].PredEdges = new SparseBitVector<>;
+ GraphNodes[C.Dest].PredEdges->set(C.Src + FirstRefNode);
+ } else {
+ GraphNodes[C.Dest].Direct = false;
+ }
+ } else if (C.Type == Constraint::Store) {
+ if (C.Offset == 0) {
+ // *dest = src edge
+ unsigned RefNode = C.Dest + FirstRefNode;
+ if (!GraphNodes[RefNode].PredEdges)
+ GraphNodes[RefNode].PredEdges = new SparseBitVector<>;
+ GraphNodes[RefNode].PredEdges->set(C.Src);
+ }
+ } else {
+ // Dest = Src edge and *Dest = *Src edge
+ if (!GraphNodes[C.Dest].PredEdges)
+ GraphNodes[C.Dest].PredEdges = new SparseBitVector<>;
+ GraphNodes[C.Dest].PredEdges->set(C.Src);
+ unsigned RefNode = C.Dest + FirstRefNode;
+ if (!GraphNodes[RefNode].ImplicitPredEdges)
+ GraphNodes[RefNode].ImplicitPredEdges = new SparseBitVector<>;
+ GraphNodes[RefNode].ImplicitPredEdges->set(C.Src + FirstRefNode);
+ }
+ }
+ PEClass = 1;
+ // Do SCC finding first to condense our predecessor graph
+ DFSNumber = 0;
+ Node2DFS.insert(Node2DFS.begin(), GraphNodes.size(), 0);
+ Node2Deleted.insert(Node2Deleted.begin(), GraphNodes.size(), false);
+ Node2Visited.insert(Node2Visited.begin(), GraphNodes.size(), false);
+
+ for (unsigned i = 0; i < FirstRefNode; ++i) {
+ unsigned Node = VSSCCRep[i];
+ if (!Node2Visited[Node])
+ HVNValNum(Node);
+ }
+ for (BitVectorMap::iterator Iter = Set2PEClass.begin();
+ Iter != Set2PEClass.end();
+ ++Iter)
+ delete Iter->first;
+ Set2PEClass.clear();
+ Node2DFS.clear();
+ Node2Deleted.clear();
+ Node2Visited.clear();
+ DOUT << "Finished HVN\n";
+}
+
+/// This is the workhorse of HVN value numbering. We combine SCC finding at the
+/// same time because it's easy.
+void Andersens::HVNValNum(unsigned NodeIndex) {
+ unsigned MyDFS = DFSNumber++;
+ Node *N = &GraphNodes[NodeIndex];
+ Node2Visited[NodeIndex] = true;
+ Node2DFS[NodeIndex] = MyDFS;
+
+ // First process all our explicit edges
+ if (N->PredEdges)
+ for (SparseBitVector<>::iterator Iter = N->PredEdges->begin();
+ Iter != N->PredEdges->end();
+ ++Iter) {
+ unsigned j = VSSCCRep[*Iter];
+ if (!Node2Deleted[j]) {
+ if (!Node2Visited[j])
+ HVNValNum(j);
+ if (Node2DFS[NodeIndex] > Node2DFS[j])
+ Node2DFS[NodeIndex] = Node2DFS[j];
+ }
+ }
+
+ // Now process all the implicit edges
+ if (N->ImplicitPredEdges)
+ for (SparseBitVector<>::iterator Iter = N->ImplicitPredEdges->begin();
+ Iter != N->ImplicitPredEdges->end();
+ ++Iter) {
+ unsigned j = VSSCCRep[*Iter];
+ if (!Node2Deleted[j]) {
+ if (!Node2Visited[j])
+ HVNValNum(j);
+ if (Node2DFS[NodeIndex] > Node2DFS[j])
+ Node2DFS[NodeIndex] = Node2DFS[j];
+ }
+ }
+
+ // See if we found any cycles
+ if (MyDFS == Node2DFS[NodeIndex]) {
+ while (!SCCStack.empty() && Node2DFS[SCCStack.top()] >= MyDFS) {
+ unsigned CycleNodeIndex = SCCStack.top();
+ Node *CycleNode = &GraphNodes[CycleNodeIndex];
+ VSSCCRep[CycleNodeIndex] = NodeIndex;
+ // Unify the nodes
+ N->Direct &= CycleNode->Direct;
+
+ if (CycleNode->PredEdges) {
+ if (!N->PredEdges)
+ N->PredEdges = new SparseBitVector<>;
+ *(N->PredEdges) |= CycleNode->PredEdges;
+ delete CycleNode->PredEdges;
+ CycleNode->PredEdges = NULL;
+ }
+ if (CycleNode->ImplicitPredEdges) {
+ if (!N->ImplicitPredEdges)
+ N->ImplicitPredEdges = new SparseBitVector<>;
+ *(N->ImplicitPredEdges) |= CycleNode->ImplicitPredEdges;
+ delete CycleNode->ImplicitPredEdges;
+ CycleNode->ImplicitPredEdges = NULL;
+ }
+
+ SCCStack.pop();
+ }
+
+ Node2Deleted[NodeIndex] = true;
+
+ if (!N->Direct) {
+ GraphNodes[NodeIndex].PointerEquivLabel = PEClass++;
+ return;
+ }
+
+ // Collect labels of successor nodes
+ bool AllSame = true;
+ unsigned First = ~0;
+ SparseBitVector<> *Labels = new SparseBitVector<>;
+ bool Used = false;
+
+ if (N->PredEdges)
+ for (SparseBitVector<>::iterator Iter = N->PredEdges->begin();
+ Iter != N->PredEdges->end();
+ ++Iter) {
+ unsigned j = VSSCCRep[*Iter];
+ unsigned Label = GraphNodes[j].PointerEquivLabel;
+ // Ignore labels that are equal to us or non-pointers
+ if (j == NodeIndex || Label == 0)
+ continue;
+ if (First == (unsigned)~0)
+ First = Label;
+ else if (First != Label)
+ AllSame = false;
+ Labels->set(Label);
+ }
+
+ // We either have a non-pointer, a copy of an existing node, or a new node.
+ // Assign the appropriate pointer equivalence label.
+ if (Labels->empty()) {
+ GraphNodes[NodeIndex].PointerEquivLabel = 0;
+ } else if (AllSame) {
+ GraphNodes[NodeIndex].PointerEquivLabel = First;
+ } else {
+ GraphNodes[NodeIndex].PointerEquivLabel = Set2PEClass[Labels];
+ if (GraphNodes[NodeIndex].PointerEquivLabel == 0) {
+ unsigned EquivClass = PEClass++;
+ Set2PEClass[Labels] = EquivClass;
+ GraphNodes[NodeIndex].PointerEquivLabel = EquivClass;
+ Used = true;
+ }
+ }
+ if (!Used)
+ delete Labels;
+ } else {
+ SCCStack.push(NodeIndex);
+ }
+}
+
+/// The technique used here is described in "Exploiting Pointer and Location
+/// Equivalence to Optimize Pointer Analysis. In the 14th International Static
+/// Analysis Symposium (SAS), August 2007." It is known as the "HU" algorithm,
+/// and is equivalent to value numbering the collapsed constraint graph
+/// including evaluating unions.
+void Andersens::HU() {
+ DOUT << "Beginning HU\n";
+ // Build a predecessor graph. This is like our constraint graph with the
+ // edges going in the opposite direction, and there are edges for all the
+  // constraints, instead of just copy constraints. We also build implicit
+  // edges for constraints that are implied but not explicit. E.g., for the
+  // constraint a = &b, we add the implicit edge *a = b. This helps us capture
+  // more cycles.
+ for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+ Constraint &C = Constraints[i];
+ if (C.Type == Constraint::AddressOf) {
+ GraphNodes[C.Src].AddressTaken = true;
+ GraphNodes[C.Src].Direct = false;
+
+ GraphNodes[C.Dest].PointsTo->set(C.Src);
+ // *Dest = src edge
+ unsigned RefNode = C.Dest + FirstRefNode;
+ if (!GraphNodes[RefNode].ImplicitPredEdges)
+ GraphNodes[RefNode].ImplicitPredEdges = new SparseBitVector<>;
+ GraphNodes[RefNode].ImplicitPredEdges->set(C.Src);
+ GraphNodes[C.Src].PointedToBy->set(C.Dest);
+ } else if (C.Type == Constraint::Load) {
+ if (C.Offset == 0) {
+ // dest = *src edge
+ if (!GraphNodes[C.Dest].PredEdges)
+ GraphNodes[C.Dest].PredEdges = new SparseBitVector<>;
+ GraphNodes[C.Dest].PredEdges->set(C.Src + FirstRefNode);
+ } else {
+ GraphNodes[C.Dest].Direct = false;
+ }
+ } else if (C.Type == Constraint::Store) {
+ if (C.Offset == 0) {
+ // *dest = src edge
+ unsigned RefNode = C.Dest + FirstRefNode;
+ if (!GraphNodes[RefNode].PredEdges)
+ GraphNodes[RefNode].PredEdges = new SparseBitVector<>;
+ GraphNodes[RefNode].PredEdges->set(C.Src);
+ }
+ } else {
+      // Dest = Src edge and *Dest = *Src edge
+ if (!GraphNodes[C.Dest].PredEdges)
+ GraphNodes[C.Dest].PredEdges = new SparseBitVector<>;
+ GraphNodes[C.Dest].PredEdges->set(C.Src);
+ unsigned RefNode = C.Dest + FirstRefNode;
+ if (!GraphNodes[RefNode].ImplicitPredEdges)
+ GraphNodes[RefNode].ImplicitPredEdges = new SparseBitVector<>;
+ GraphNodes[RefNode].ImplicitPredEdges->set(C.Src + FirstRefNode);
+ }
+ }
+ PEClass = 1;
+ // Do SCC finding first to condense our predecessor graph
+ DFSNumber = 0;
+ Node2DFS.insert(Node2DFS.begin(), GraphNodes.size(), 0);
+ Node2Deleted.insert(Node2Deleted.begin(), GraphNodes.size(), false);
+ Node2Visited.insert(Node2Visited.begin(), GraphNodes.size(), false);
+
+ for (unsigned i = 0; i < FirstRefNode; ++i) {
+ if (FindNode(i) == i) {
+ unsigned Node = VSSCCRep[i];
+ if (!Node2Visited[Node])
+ Condense(Node);
+ }
+ }
+
+ // Reset tables for actual labeling
+ Node2DFS.clear();
+ Node2Visited.clear();
+ Node2Deleted.clear();
+ // Pre-grow our densemap so that we don't get really bad behavior
+ Set2PEClass.resize(GraphNodes.size());
+
+ // Visit the condensed graph and generate pointer equivalence labels.
+ Node2Visited.insert(Node2Visited.begin(), GraphNodes.size(), false);
+ for (unsigned i = 0; i < FirstRefNode; ++i) {
+ if (FindNode(i) == i) {
+ unsigned Node = VSSCCRep[i];
+ if (!Node2Visited[Node])
+ HUValNum(Node);
+ }
+ }
+  // PEClass nodes will be deleted when N->PointsTo is deleted in our caller.
+ Set2PEClass.clear();
+ DOUT << "Finished HU\n";
+}
+
+
+/// Implementation of standard Tarjan SCC algorithm as modified by Nuutila.
+void Andersens::Condense(unsigned NodeIndex) {
+ unsigned MyDFS = DFSNumber++;
+ Node *N = &GraphNodes[NodeIndex];
+ Node2Visited[NodeIndex] = true;
+ Node2DFS[NodeIndex] = MyDFS;
+
+ // First process all our explicit edges
+ if (N->PredEdges)
+ for (SparseBitVector<>::iterator Iter = N->PredEdges->begin();
+ Iter != N->PredEdges->end();
+ ++Iter) {
+ unsigned j = VSSCCRep[*Iter];
+ if (!Node2Deleted[j]) {
+ if (!Node2Visited[j])
+ Condense(j);
+ if (Node2DFS[NodeIndex] > Node2DFS[j])
+ Node2DFS[NodeIndex] = Node2DFS[j];
+ }
+ }
+
+ // Now process all the implicit edges
+ if (N->ImplicitPredEdges)
+ for (SparseBitVector<>::iterator Iter = N->ImplicitPredEdges->begin();
+ Iter != N->ImplicitPredEdges->end();
+ ++Iter) {
+ unsigned j = VSSCCRep[*Iter];
+ if (!Node2Deleted[j]) {
+ if (!Node2Visited[j])
+ Condense(j);
+ if (Node2DFS[NodeIndex] > Node2DFS[j])
+ Node2DFS[NodeIndex] = Node2DFS[j];
+ }
+ }
+
+ // See if we found any cycles
+ if (MyDFS == Node2DFS[NodeIndex]) {
+ while (!SCCStack.empty() && Node2DFS[SCCStack.top()] >= MyDFS) {
+ unsigned CycleNodeIndex = SCCStack.top();
+ Node *CycleNode = &GraphNodes[CycleNodeIndex];
+ VSSCCRep[CycleNodeIndex] = NodeIndex;
+ // Unify the nodes
+ N->Direct &= CycleNode->Direct;
+
+ *(N->PointsTo) |= CycleNode->PointsTo;
+ delete CycleNode->PointsTo;
+ CycleNode->PointsTo = NULL;
+ if (CycleNode->PredEdges) {
+ if (!N->PredEdges)
+ N->PredEdges = new SparseBitVector<>;
+ *(N->PredEdges) |= CycleNode->PredEdges;
+ delete CycleNode->PredEdges;
+ CycleNode->PredEdges = NULL;
+ }
+ if (CycleNode->ImplicitPredEdges) {
+ if (!N->ImplicitPredEdges)
+ N->ImplicitPredEdges = new SparseBitVector<>;
+ *(N->ImplicitPredEdges) |= CycleNode->ImplicitPredEdges;
+ delete CycleNode->ImplicitPredEdges;
+ CycleNode->ImplicitPredEdges = NULL;
+ }
+ SCCStack.pop();
+ }
+
+ Node2Deleted[NodeIndex] = true;
+
+ // Set up number of incoming edges for other nodes
+ if (N->PredEdges)
+ for (SparseBitVector<>::iterator Iter = N->PredEdges->begin();
+ Iter != N->PredEdges->end();
+ ++Iter)
+ ++GraphNodes[VSSCCRep[*Iter]].NumInEdges;
+ } else {
+ SCCStack.push(NodeIndex);
+ }
+}
+
+void Andersens::HUValNum(unsigned NodeIndex) {
+ Node *N = &GraphNodes[NodeIndex];
+ Node2Visited[NodeIndex] = true;
+
+  // Eliminate dereferences of non-pointers for those non-pointers we have
+  // already identified. These are ref nodes whose non-ref node:
+  // 1. Has already been visited and determined to point to nothing (and
+  //    thus, a dereference of it must point to nothing), or
+  // 2. Is a direct node with no predecessor edges in our graph and with no
+  //    points-to set (since it can't point to anything either, being that it
+  //    receives no points-to sets and has none).
+ if (NodeIndex >= FirstRefNode) {
+ unsigned j = VSSCCRep[FindNode(NodeIndex - FirstRefNode)];
+ if ((Node2Visited[j] && !GraphNodes[j].PointerEquivLabel)
+ || (GraphNodes[j].Direct && !GraphNodes[j].PredEdges
+ && GraphNodes[j].PointsTo->empty())){
+ return;
+ }
+ }
+ // Process all our explicit edges
+ if (N->PredEdges)
+ for (SparseBitVector<>::iterator Iter = N->PredEdges->begin();
+ Iter != N->PredEdges->end();
+ ++Iter) {
+ unsigned j = VSSCCRep[*Iter];
+ if (!Node2Visited[j])
+ HUValNum(j);
+
+ // If this edge turned out to be the same as us, or got no pointer
+      // equivalence label (and thus points to nothing), just decrement our
+ // incoming edges and continue.
+ if (j == NodeIndex || GraphNodes[j].PointerEquivLabel == 0) {
+ --GraphNodes[j].NumInEdges;
+ continue;
+ }
+
+ *(N->PointsTo) |= GraphNodes[j].PointsTo;
+
+ // If we didn't end up storing this in the hash, and we're done with all
+ // the edges, we don't need the points-to set anymore.
+ --GraphNodes[j].NumInEdges;
+ if (!GraphNodes[j].NumInEdges && !GraphNodes[j].StoredInHash) {
+ delete GraphNodes[j].PointsTo;
+ GraphNodes[j].PointsTo = NULL;
+ }
+ }
+ // If this isn't a direct node, generate a fresh variable.
+ if (!N->Direct) {
+ N->PointsTo->set(FirstRefNode + NodeIndex);
+ }
+
+  // See if we have something equivalent to us; if not, generate a new
+  // equivalence class.
+ if (N->PointsTo->empty()) {
+ delete N->PointsTo;
+ N->PointsTo = NULL;
+ } else {
+ if (N->Direct) {
+ N->PointerEquivLabel = Set2PEClass[N->PointsTo];
+ if (N->PointerEquivLabel == 0) {
+ unsigned EquivClass = PEClass++;
+ N->StoredInHash = true;
+ Set2PEClass[N->PointsTo] = EquivClass;
+ N->PointerEquivLabel = EquivClass;
+ }
+ } else {
+ N->PointerEquivLabel = PEClass++;
+ }
+ }
+}
+
+/// Rewrite our list of constraints so that pointer equivalent nodes are
+/// replaced by their pointer equivalence class representative.
+void Andersens::RewriteConstraints() {
+ std::vector<Constraint> NewConstraints;
+ DenseSet<Constraint, ConstraintKeyInfo> Seen;
+
+ PEClass2Node.clear();
+ PENLEClass2Node.clear();
+
+  // We may have from 1 to GraphNodes.size() + 1 equivalence classes.
+ PEClass2Node.insert(PEClass2Node.begin(), GraphNodes.size() + 1, -1);
+ PENLEClass2Node.insert(PENLEClass2Node.begin(), GraphNodes.size() + 1, -1);
+
+ // Rewrite constraints, ignoring non-pointer constraints, uniting equivalent
+ // nodes, and rewriting constraints to use the representative nodes.
+ for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+ Constraint &C = Constraints[i];
+ unsigned RHSNode = FindNode(C.Src);
+ unsigned LHSNode = FindNode(C.Dest);
+ unsigned RHSLabel = GraphNodes[VSSCCRep[RHSNode]].PointerEquivLabel;
+ unsigned LHSLabel = GraphNodes[VSSCCRep[LHSNode]].PointerEquivLabel;
+
+ // First we try to eliminate constraints for things we can prove don't point
+ // to anything.
+ if (LHSLabel == 0) {
+ DEBUG(PrintNode(&GraphNodes[LHSNode]));
+ DOUT << " is a non-pointer, ignoring constraint.\n";
+ continue;
+ }
+ if (RHSLabel == 0) {
+ DEBUG(PrintNode(&GraphNodes[RHSNode]));
+ DOUT << " is a non-pointer, ignoring constraint.\n";
+ continue;
+ }
+ // This constraint may be useless, and it may become useless as we translate
+ // it.
+ if (C.Src == C.Dest && C.Type == Constraint::Copy)
+ continue;
+
+ C.Src = FindEquivalentNode(RHSNode, RHSLabel);
+ C.Dest = FindEquivalentNode(FindNode(LHSNode), LHSLabel);
+ if ((C.Src == C.Dest && C.Type == Constraint::Copy)
+ || Seen.count(C))
+ continue;
+
+ Seen.insert(C);
+ NewConstraints.push_back(C);
+ }
+ Constraints.swap(NewConstraints);
+ PEClass2Node.clear();
+}
+
+/// See if we have a node that is pointer equivalent to the one being asked
+/// about, and if so, unite them and return the equivalent node. Otherwise,
+/// return the original node.
+unsigned Andersens::FindEquivalentNode(unsigned NodeIndex,
+ unsigned NodeLabel) {
+ if (!GraphNodes[NodeIndex].AddressTaken) {
+ if (PEClass2Node[NodeLabel] != -1) {
+ // We found an existing node with the same pointer label, so unify them.
+ // We specifically request that Union-By-Rank not be used so that
+ // PEClass2Node[NodeLabel] U= NodeIndex and not the other way around.
+ return UniteNodes(PEClass2Node[NodeLabel], NodeIndex, false);
+ } else {
+ PEClass2Node[NodeLabel] = NodeIndex;
+ PENLEClass2Node[NodeLabel] = NodeIndex;
+ }
+ } else if (PENLEClass2Node[NodeLabel] == -1) {
+ PENLEClass2Node[NodeLabel] = NodeIndex;
+ }
+
+ return NodeIndex;
+}
+
+void Andersens::PrintLabels() const {
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ if (i < FirstRefNode) {
+ PrintNode(&GraphNodes[i]);
+ } else if (i < FirstAdrNode) {
+ DOUT << "REF(";
+ PrintNode(&GraphNodes[i-FirstRefNode]);
+ DOUT <<")";
+ } else {
+ DOUT << "ADR(";
+ PrintNode(&GraphNodes[i-FirstAdrNode]);
+ DOUT <<")";
+ }
+
+ DOUT << " has pointer label " << GraphNodes[i].PointerEquivLabel
+ << " and SCC rep " << VSSCCRep[i]
+ << " and is " << (GraphNodes[i].Direct ? "Direct" : "Not direct")
+ << "\n";
+ }
+}
+
+/// The technique used here is described in "The Ant and the
+/// Grasshopper: Fast and Accurate Pointer Analysis for Millions of
+/// Lines of Code. In Programming Language Design and Implementation
+/// (PLDI), June 2007." It is known as the "HCD" (Hybrid Cycle
+/// Detection) algorithm. It is called a hybrid because it performs an
+/// offline analysis and uses its results during the solving (online)
+/// phase. This is just the offline portion; the results of this
+/// operation are stored in SDT and are later used in SolveConstraints()
+/// and UniteNodes().
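+///
+/// As a sketch: if this offline pass proves that the ref node *X sits in one
+/// SCC with an ordinary node Y, SDT records Y as the representative for X.
+/// During solving, the moment some node P enters X's points-to set we already
+/// know P must be united with Y, with no further online cycle detection.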
+void Andersens::HCD() {
+ DOUT << "Starting HCD.\n";
+ HCDSCCRep.resize(GraphNodes.size());
+
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ GraphNodes[i].Edges = new SparseBitVector<>;
+ HCDSCCRep[i] = i;
+ }
+
+ for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+ Constraint &C = Constraints[i];
+ assert (C.Src < GraphNodes.size() && C.Dest < GraphNodes.size());
+ if (C.Type == Constraint::AddressOf) {
+ continue;
+ } else if (C.Type == Constraint::Load) {
+      if (C.Offset == 0)
+        GraphNodes[C.Dest].Edges->set(C.Src + FirstRefNode);
+    } else if (C.Type == Constraint::Store) {
+      if (C.Offset == 0)
+ GraphNodes[C.Dest + FirstRefNode].Edges->set(C.Src);
+ } else {
+ GraphNodes[C.Dest].Edges->set(C.Src);
+ }
+ }
+
+ Node2DFS.insert(Node2DFS.begin(), GraphNodes.size(), 0);
+ Node2Deleted.insert(Node2Deleted.begin(), GraphNodes.size(), false);
+ Node2Visited.insert(Node2Visited.begin(), GraphNodes.size(), false);
+ SDT.insert(SDT.begin(), GraphNodes.size() / 2, -1);
+
+ DFSNumber = 0;
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ unsigned Node = HCDSCCRep[i];
+ if (!Node2Deleted[Node])
+ Search(Node);
+ }
+
+ for (unsigned i = 0; i < GraphNodes.size(); ++i)
+ if (GraphNodes[i].Edges != NULL) {
+ delete GraphNodes[i].Edges;
+ GraphNodes[i].Edges = NULL;
+ }
+
+  while (!SCCStack.empty())
+ SCCStack.pop();
+
+ Node2DFS.clear();
+ Node2Visited.clear();
+ Node2Deleted.clear();
+ HCDSCCRep.clear();
+ DOUT << "HCD complete.\n";
+}
+
+// Component of HCD:
+// Use Nuutila's variant of Tarjan's algorithm to detect
+// Strongly-Connected Components (SCCs). For non-trivial SCCs
+// containing ref nodes, insert the appropriate information in SDT.
+void Andersens::Search(unsigned Node) {
+ unsigned MyDFS = DFSNumber++;
+
+ Node2Visited[Node] = true;
+ Node2DFS[Node] = MyDFS;
+
+ for (SparseBitVector<>::iterator Iter = GraphNodes[Node].Edges->begin(),
+ End = GraphNodes[Node].Edges->end();
+ Iter != End;
+ ++Iter) {
+ unsigned J = HCDSCCRep[*Iter];
+ assert(GraphNodes[J].isRep() && "Debug check; must be representative");
+ if (!Node2Deleted[J]) {
+ if (!Node2Visited[J])
+ Search(J);
+ if (Node2DFS[Node] > Node2DFS[J])
+ Node2DFS[Node] = Node2DFS[J];
+ }
+ }
+
+  if (MyDFS != Node2DFS[Node]) {
+ SCCStack.push(Node);
+ return;
+ }
+
+  // This node is the root of an SCC, so process it.
+ //
+ // If the SCC is "non-trivial" (not a singleton) and contains a reference
+ // node, we place this SCC into SDT. We unite the nodes in any case.
+ if (!SCCStack.empty() && Node2DFS[SCCStack.top()] >= MyDFS) {
+ SparseBitVector<> SCC;
+
+ SCC.set(Node);
+
+ bool Ref = (Node >= FirstRefNode);
+
+ Node2Deleted[Node] = true;
+
+ do {
+ unsigned P = SCCStack.top(); SCCStack.pop();
+ Ref |= (P >= FirstRefNode);
+ SCC.set(P);
+ HCDSCCRep[P] = Node;
+ } while (!SCCStack.empty() && Node2DFS[SCCStack.top()] >= MyDFS);
+
+ if (Ref) {
+ unsigned Rep = SCC.find_first();
+ assert(Rep < FirstRefNode && "The SCC didn't have a non-Ref node!");
+
+ SparseBitVector<>::iterator i = SCC.begin();
+
+      // Skip over the non-ref nodes
+      while (*i < FirstRefNode)
+        ++i;
+
+      while (i != SCC.end())
+        SDT[(*i++) - FirstRefNode] = Rep;
+ }
+ }
+}
+
+
+/// Optimize the constraints by performing offline variable substitution and
+/// other optimizations.
+void Andersens::OptimizeConstraints() {
+ DOUT << "Beginning constraint optimization\n";
+
+ SDTActive = false;
+
+  // Function-related nodes need to stay in the same relative position and can't
+ // be location equivalent.
+ for (std::map<unsigned, unsigned>::iterator Iter = MaxK.begin();
+ Iter != MaxK.end();
+ ++Iter) {
+ for (unsigned i = Iter->first;
+ i != Iter->first + Iter->second;
+ ++i) {
+ GraphNodes[i].AddressTaken = true;
+ GraphNodes[i].Direct = false;
+ }
+ }
+
+ ClumpAddressTaken();
+ FirstRefNode = GraphNodes.size();
+ FirstAdrNode = FirstRefNode + GraphNodes.size();
+ GraphNodes.insert(GraphNodes.end(), 2 * GraphNodes.size(),
+ Node(false));
+ VSSCCRep.resize(GraphNodes.size());
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ VSSCCRep[i] = i;
+ }
+ HVN();
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ Node *N = &GraphNodes[i];
+ delete N->PredEdges;
+ N->PredEdges = NULL;
+ delete N->ImplicitPredEdges;
+ N->ImplicitPredEdges = NULL;
+ }
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa-labels"
+ DEBUG(PrintLabels());
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa"
+ RewriteConstraints();
+ // Delete the adr nodes.
+ GraphNodes.resize(FirstRefNode * 2);
+
+ // Now perform HU
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ Node *N = &GraphNodes[i];
+ if (FindNode(i) == i) {
+ N->PointsTo = new SparseBitVector<>;
+ N->PointedToBy = new SparseBitVector<>;
+ // Reset our labels
+ }
+ VSSCCRep[i] = i;
+ N->PointerEquivLabel = 0;
+ }
+ HU();
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa-labels"
+ DEBUG(PrintLabels());
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa"
+ RewriteConstraints();
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ if (FindNode(i) == i) {
+ Node *N = &GraphNodes[i];
+ delete N->PointsTo;
+ N->PointsTo = NULL;
+ delete N->PredEdges;
+ N->PredEdges = NULL;
+ delete N->ImplicitPredEdges;
+ N->ImplicitPredEdges = NULL;
+ delete N->PointedToBy;
+ N->PointedToBy = NULL;
+ }
+ }
+
+  // Perform Hybrid Cycle Detection (HCD).
+ HCD();
+ SDTActive = true;
+
+ // No longer any need for the upper half of GraphNodes (for ref nodes).
+ GraphNodes.erase(GraphNodes.begin() + FirstRefNode, GraphNodes.end());
+
+ // HCD complete.
+
+ DOUT << "Finished constraint optimization\n";
+ FirstRefNode = 0;
+ FirstAdrNode = 0;
+}
+
+/// Unite pointer but not location equivalent variables, now that the constraint
+/// graph is built.
+void Andersens::UnitePointerEquivalences() {
+ DOUT << "Uniting remaining pointer equivalences\n";
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ if (GraphNodes[i].AddressTaken && GraphNodes[i].isRep()) {
+ unsigned Label = GraphNodes[i].PointerEquivLabel;
+
+ if (Label && PENLEClass2Node[Label] != -1)
+ UniteNodes(i, PENLEClass2Node[Label]);
+ }
+ }
+ DOUT << "Finished remaining pointer equivalences\n";
+ PENLEClass2Node.clear();
+}
+
+/// Create the constraint graph used for solving points-to analysis.
+///
+void Andersens::CreateConstraintGraph() {
+ for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+ Constraint &C = Constraints[i];
+ assert (C.Src < GraphNodes.size() && C.Dest < GraphNodes.size());
+ if (C.Type == Constraint::AddressOf)
+ GraphNodes[C.Dest].PointsTo->set(C.Src);
+ else if (C.Type == Constraint::Load)
+ GraphNodes[C.Src].Constraints.push_back(C);
+ else if (C.Type == Constraint::Store)
+ GraphNodes[C.Dest].Constraints.push_back(C);
+ else if (C.Offset != 0)
+ GraphNodes[C.Src].Constraints.push_back(C);
+ else
+ GraphNodes[C.Src].Edges->set(C.Dest);
+ }
+}
+
+// Perform DFS and cycle detection.
+bool Andersens::QueryNode(unsigned Node) {
+ assert(GraphNodes[Node].isRep() && "Querying a non-rep node");
+ unsigned OurDFS = ++DFSNumber;
+ SparseBitVector<> ToErase;
+ SparseBitVector<> NewEdges;
+ Tarjan2DFS[Node] = OurDFS;
+
+ // Changed denotes a change from a recursive call that we will bubble up.
+ // Merged is set if we actually merge a node ourselves.
+ bool Changed = false, Merged = false;
+
+ for (SparseBitVector<>::iterator bi = GraphNodes[Node].Edges->begin();
+ bi != GraphNodes[Node].Edges->end();
+ ++bi) {
+ unsigned RepNode = FindNode(*bi);
+ // If this edge points to a non-representative node but we are
+ // already planning to add an edge to its representative, we have no
+ // need for this edge anymore.
+ if (RepNode != *bi && NewEdges.test(RepNode)){
+ ToErase.set(*bi);
+ continue;
+ }
+
+ // Continue about our DFS.
+ if (!Tarjan2Deleted[RepNode]){
+ if (Tarjan2DFS[RepNode] == 0) {
+ Changed |= QueryNode(RepNode);
+ // May have been changed by QueryNode
+ RepNode = FindNode(RepNode);
+ }
+ if (Tarjan2DFS[RepNode] < Tarjan2DFS[Node])
+ Tarjan2DFS[Node] = Tarjan2DFS[RepNode];
+ }
+
+ // We may have just discovered that this node is part of a cycle, in
+ // which case we can also erase it.
+ if (RepNode != *bi) {
+ ToErase.set(*bi);
+ NewEdges.set(RepNode);
+ }
+ }
+
+ GraphNodes[Node].Edges->intersectWithComplement(ToErase);
+ GraphNodes[Node].Edges |= NewEdges;
+
+ // If this node is a root of a non-trivial SCC, place it on our
+ // worklist to be processed.
+ if (OurDFS == Tarjan2DFS[Node]) {
+ while (!SCCStack.empty() && Tarjan2DFS[SCCStack.top()] >= OurDFS) {
+ Node = UniteNodes(Node, SCCStack.top());
+
+ SCCStack.pop();
+ Merged = true;
+ }
+ Tarjan2Deleted[Node] = true;
+
+ if (Merged)
+ NextWL->insert(&GraphNodes[Node]);
+ } else {
+ SCCStack.push(Node);
+ }
+
+  return (Changed | Merged);
+}
+
+/// SolveConstraints - This stage iteratively processes the constraints list
+/// propagating constraints (adding edges to the Nodes in the points-to graph)
+/// until a fixed point is reached.
+///
+/// We use a variant of the technique called "Lazy Cycle Detection", which is
+/// described in "The Ant and the Grasshopper: Fast and Accurate Pointer
+/// Analysis for Millions of Lines of Code. In Programming Language Design and
+/// Implementation (PLDI), June 2007."
+/// The paper describes performing cycle detection one node at a time, which
+/// can be expensive when there are no cycles, but many long chains of nodes
+/// that it heuristically believes are cycles (because it will DFS from each
+/// node without state from previous nodes).
+/// Instead, we use the heuristic to build a worklist of nodes to check, then
+/// cycle detect them all at the same time to do this more cheaply. This
+/// catches cycles slightly later than the original technique did, but does so
+/// significantly more cheaply.
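+///
+/// Sketch of the heuristic as used below: while propagating along an edge
+/// (N, Rep), if the two endpoints currently have identical points-to sets and
+/// that edge has not been checked before, Rep is pushed onto TarjanWL; at the
+/// top of the next iteration every queued candidate is cycle-checked in one
+/// batch of QueryNode() DFS walks.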
+
+void Andersens::SolveConstraints() {
+ CurrWL = &w1;
+ NextWL = &w2;
+
+ OptimizeConstraints();
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa-constraints"
+ DEBUG(PrintConstraints());
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "anders-aa"
+
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ Node *N = &GraphNodes[i];
+ N->PointsTo = new SparseBitVector<>;
+ N->OldPointsTo = new SparseBitVector<>;
+ N->Edges = new SparseBitVector<>;
+ }
+ CreateConstraintGraph();
+ UnitePointerEquivalences();
+ assert(SCCStack.empty() && "SCC Stack should be empty by now!");
+ Node2DFS.clear();
+ Node2Deleted.clear();
+ Node2DFS.insert(Node2DFS.begin(), GraphNodes.size(), 0);
+ Node2Deleted.insert(Node2Deleted.begin(), GraphNodes.size(), false);
+ DFSNumber = 0;
+ DenseSet<Constraint, ConstraintKeyInfo> Seen;
+ DenseSet<std::pair<unsigned,unsigned>, PairKeyInfo> EdgesChecked;
+
+ // Order graph and add initial nodes to work list.
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ Node *INode = &GraphNodes[i];
+
+ // Add to work list if it's a representative and can contribute to the
+ // calculation right now.
+ if (INode->isRep() && !INode->PointsTo->empty()
+ && (!INode->Edges->empty() || !INode->Constraints.empty())) {
+ INode->Stamp();
+ CurrWL->insert(INode);
+ }
+ }
+ std::queue<unsigned int> TarjanWL;
+#if !FULL_UNIVERSAL
+ // "Rep and special variables" - in order for HCD to maintain conservative
+ // results when !FULL_UNIVERSAL, we need to treat the special variables in
+ // the same way that the !FULL_UNIVERSAL tweak does throughout the rest of
+ // the analysis - it's ok to add edges from the special nodes, but never
+ // *to* the special nodes.
+ std::vector<unsigned int> RSV;
+#endif
+  while (!CurrWL->empty()) {
+ DOUT << "Starting iteration #" << ++NumIters << "\n";
+
+ Node* CurrNode;
+ unsigned CurrNodeIndex;
+
+ // Actual cycle checking code. We cycle check all of the lazy cycle
+ // candidates from the last iteration in one go.
+ if (!TarjanWL.empty()) {
+ DFSNumber = 0;
+
+ Tarjan2DFS.clear();
+ Tarjan2Deleted.clear();
+ while (!TarjanWL.empty()) {
+ unsigned int ToTarjan = TarjanWL.front();
+ TarjanWL.pop();
+ if (!Tarjan2Deleted[ToTarjan]
+ && GraphNodes[ToTarjan].isRep()
+ && Tarjan2DFS[ToTarjan] == 0)
+ QueryNode(ToTarjan);
+ }
+ }
+
+ // Add to work list if it's a representative and can contribute to the
+ // calculation right now.
+    while ((CurrNode = CurrWL->pop()) != NULL) {
+ CurrNodeIndex = CurrNode - &GraphNodes[0];
+ CurrNode->Stamp();
+
+      // Figure out the changed points-to bits
+ SparseBitVector<> CurrPointsTo;
+ CurrPointsTo.intersectWithComplement(CurrNode->PointsTo,
+ CurrNode->OldPointsTo);
+ if (CurrPointsTo.empty())
+ continue;
+
+ *(CurrNode->OldPointsTo) |= CurrPointsTo;
+
+ // Check the offline-computed equivalencies from HCD.
+ bool SCC = false;
+ unsigned Rep;
+
+ if (SDT[CurrNodeIndex] >= 0) {
+ SCC = true;
+ Rep = FindNode(SDT[CurrNodeIndex]);
+
+#if !FULL_UNIVERSAL
+ RSV.clear();
+#endif
+ for (SparseBitVector<>::iterator bi = CurrPointsTo.begin();
+ bi != CurrPointsTo.end(); ++bi) {
+ unsigned Node = FindNode(*bi);
+#if !FULL_UNIVERSAL
+ if (Node < NumberSpecialNodes) {
+ RSV.push_back(Node);
+ continue;
+ }
+#endif
+ Rep = UniteNodes(Rep,Node);
+ }
+#if !FULL_UNIVERSAL
+ RSV.push_back(Rep);
+#endif
+
+ NextWL->insert(&GraphNodes[Rep]);
+
+        if (!CurrNode->isRep())
+ continue;
+ }
+
+ Seen.clear();
+
+      // Now process the constraints for this node.
+ for (std::list<Constraint>::iterator li = CurrNode->Constraints.begin();
+ li != CurrNode->Constraints.end(); ) {
+ li->Src = FindNode(li->Src);
+ li->Dest = FindNode(li->Dest);
+
+ // Delete redundant constraints
+        if (Seen.count(*li)) {
+ std::list<Constraint>::iterator lk = li; li++;
+
+ CurrNode->Constraints.erase(lk);
+ ++NumErased;
+ continue;
+ }
+ Seen.insert(*li);
+
+ // Src and Dest will be the vars we are going to process.
+ // This may look a bit ugly, but what it does is allow us to process
+ // both store and load constraints with the same code.
+ // Load constraints say that every member of our RHS solution has K
+ // added to it, and that variable gets an edge to LHS. We also union
+ // RHS+K's solution into the LHS solution.
+ // Store constraints say that every member of our LHS solution has K
+ // added to it, and that variable gets an edge from RHS. We also union
+ // RHS's solution into the LHS+K solution.
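+        //
+        // A small worked example (hypothetical nodes): for a load P = *Q
+        // with K == 0 and {a, b} newly in Q's points-to set, the general
+        // path below adds edges a -> P and b -> P; for a store *P = Q with
+        // {a, b} newly in P's points-to set, it adds edges Q -> a and Q -> b.
+        // CurrMember stands in for whichever side is being dereferenced.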
+ unsigned *Src;
+ unsigned *Dest;
+ unsigned K = li->Offset;
+ unsigned CurrMember;
+ if (li->Type == Constraint::Load) {
+ Src = &CurrMember;
+ Dest = &li->Dest;
+ } else if (li->Type == Constraint::Store) {
+ Src = &li->Src;
+ Dest = &CurrMember;
+ } else {
+ // TODO: Handle offset copy constraints.
+ li++;
+ continue;
+ }
+
+ // See if we can use Hybrid Cycle Detection (that is, check
+ // if it was a statically detected offline equivalence that
+ // involves pointers; if so, remove the redundant constraints).
+ if (SCC && K == 0) {
+#if FULL_UNIVERSAL
+ CurrMember = Rep;
+
+ if (GraphNodes[*Src].Edges->test_and_set(*Dest))
+ if (GraphNodes[*Dest].PointsTo |= *(GraphNodes[*Src].PointsTo))
+ NextWL->insert(&GraphNodes[*Dest]);
+#else
+ for (unsigned i=0; i < RSV.size(); ++i) {
+ CurrMember = RSV[i];
+
+ if (*Dest < NumberSpecialNodes)
+ continue;
+ if (GraphNodes[*Src].Edges->test_and_set(*Dest))
+ if (GraphNodes[*Dest].PointsTo |= *(GraphNodes[*Src].PointsTo))
+ NextWL->insert(&GraphNodes[*Dest]);
+ }
+#endif
+ // Since all future elements of the points-to set will be
+ // equivalent to the current ones, the complex constraints
+ // become redundant.
+ std::list<Constraint>::iterator lk = li; li++;
+#if !FULL_UNIVERSAL
+ // In this case, we can still erase the constraints when the
+ // elements of the points-to sets are referenced by *Dest,
+ // but not when they are referenced by *Src (i.e. for a Load
+ // constraint). This is because if another special variable is
+ // put into the points-to set later, we still need to add the
+ // new edge from that special variable.
+ if( lk->Type != Constraint::Load)
+#endif
+ GraphNodes[CurrNodeIndex].Constraints.erase(lk);
+ } else {
+ const SparseBitVector<> &Solution = CurrPointsTo;
+
+ for (SparseBitVector<>::iterator bi = Solution.begin();
+ bi != Solution.end();
+ ++bi) {
+ CurrMember = *bi;
+
+ // Need to increment the member by K since that is where we are
+ // supposed to copy to/from. Note that in positive weight cycles,
+ // which occur in address taking of fields, K can go past
+ // MaxK[CurrMember] elements, even though that is all it could point
+ // to.
+ if (K > 0 && K > MaxK[CurrMember])
+ continue;
+ else
+ CurrMember = FindNode(CurrMember + K);
+
+ // Add an edge to the graph, so we can just do regular
+ // bitmap ior next time. It may also let us notice a cycle.
+#if !FULL_UNIVERSAL
+ if (*Dest < NumberSpecialNodes)
+ continue;
+#endif
+ if (GraphNodes[*Src].Edges->test_and_set(*Dest))
+ if (GraphNodes[*Dest].PointsTo |= *(GraphNodes[*Src].PointsTo))
+ NextWL->insert(&GraphNodes[*Dest]);
+
+ }
+ li++;
+ }
+ }
+ SparseBitVector<> NewEdges;
+ SparseBitVector<> ToErase;
+
+ // Now all we have left to do is propagate points-to info along the
+ // edges, erasing the redundant edges.
+ for (SparseBitVector<>::iterator bi = CurrNode->Edges->begin();
+ bi != CurrNode->Edges->end();
+ ++bi) {
+
+ unsigned DestVar = *bi;
+ unsigned Rep = FindNode(DestVar);
+
+ // If we ended up with this node as our destination, or we've already
+ // got an edge for the representative, delete the current edge.
+ if (Rep == CurrNodeIndex ||
+ (Rep != DestVar && NewEdges.test(Rep))) {
+ ToErase.set(DestVar);
+ continue;
+ }
+
+ std::pair<unsigned,unsigned> edge(CurrNodeIndex,Rep);
+
+ // This is where we do lazy cycle detection.
+ // If this is a cycle candidate (equal points-to sets and this
+ // particular edge has not been cycle-checked previously), add to the
+ // list to check for cycles on the next iteration.
+ if (!EdgesChecked.count(edge) &&
+ *(GraphNodes[Rep].PointsTo) == *(CurrNode->PointsTo)) {
+ EdgesChecked.insert(edge);
+ TarjanWL.push(Rep);
+ }
+ // Union the points-to sets into the dest
+#if !FULL_UNIVERSAL
+ if (Rep >= NumberSpecialNodes)
+#endif
+ if (GraphNodes[Rep].PointsTo |= CurrPointsTo) {
+ NextWL->insert(&GraphNodes[Rep]);
+ }
+ // If this edge's destination was collapsed, rewrite the edge.
+ if (Rep != DestVar) {
+ ToErase.set(DestVar);
+ NewEdges.set(Rep);
+ }
+ }
+ CurrNode->Edges->intersectWithComplement(ToErase);
+ CurrNode->Edges |= NewEdges;
+ }
+
+ // Switch to other work list.
+ WorkList* t = CurrWL; CurrWL = NextWL; NextWL = t;
+ }
+
+
+ Node2DFS.clear();
+ Node2Deleted.clear();
+ for (unsigned i = 0; i < GraphNodes.size(); ++i) {
+ Node *N = &GraphNodes[i];
+ delete N->OldPointsTo;
+ delete N->Edges;
+ }
+ SDTActive = false;
+ SDT.clear();
+}
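+
+// A minimal sketch of the difference propagation used above, with the same
+// SparseBitVector API (Curr/Old/Delta are hypothetical names):
+//
+//   SparseBitVector<> Delta;
+//   Delta.intersectWithComplement(Curr, Old); // Delta = Curr & ~Old
+//   if (!Delta.empty()) {
+//     Old |= Delta;  // remember what has already been propagated
+//     // ... push only Delta along outgoing edges ...
+//   }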
+
+//===----------------------------------------------------------------------===//
+// Union-Find
+//===----------------------------------------------------------------------===//
+
+// Unite nodes First and Second, returning the one which is now the
+// representative node. First and Second are indices into GraphNodes.
+unsigned Andersens::UniteNodes(unsigned First, unsigned Second,
+ bool UnionByRank) {
+ assert (First < GraphNodes.size() && Second < GraphNodes.size() &&
+ "Attempting to merge nodes that don't exist");
+
+ Node *FirstNode = &GraphNodes[First];
+ Node *SecondNode = &GraphNodes[Second];
+
+ assert (SecondNode->isRep() && FirstNode->isRep() &&
+ "Trying to unite two non-representative nodes!");
+ if (First == Second)
+ return First;
+
+ if (UnionByRank) {
+ int RankFirst = (int) FirstNode ->NodeRep;
+ int RankSecond = (int) SecondNode->NodeRep;
+
+ // Rank starts at -1 and gets decremented as it increases.
+ // Translation: higher rank, lower NodeRep value, which is always negative.
+ if (RankFirst > RankSecond) {
+ unsigned t = First; First = Second; Second = t;
+ Node* tp = FirstNode; FirstNode = SecondNode; SecondNode = tp;
+ } else if (RankFirst == RankSecond) {
+ FirstNode->NodeRep = (unsigned) (RankFirst - 1);
+ }
+ }
+
+ SecondNode->NodeRep = First;
+#if !FULL_UNIVERSAL
+ if (First >= NumberSpecialNodes)
+#endif
+ if (FirstNode->PointsTo && SecondNode->PointsTo)
+ FirstNode->PointsTo |= *(SecondNode->PointsTo);
+ if (FirstNode->Edges && SecondNode->Edges)
+ FirstNode->Edges |= *(SecondNode->Edges);
+ if (!SecondNode->Constraints.empty())
+ FirstNode->Constraints.splice(FirstNode->Constraints.begin(),
+ SecondNode->Constraints);
+ if (FirstNode->OldPointsTo) {
+ delete FirstNode->OldPointsTo;
+ FirstNode->OldPointsTo = new SparseBitVector<>;
+ }
+
+ // Destroy interesting parts of the merged-from node.
+ delete SecondNode->OldPointsTo;
+ delete SecondNode->Edges;
+ delete SecondNode->PointsTo;
+ SecondNode->Edges = NULL;
+ SecondNode->PointsTo = NULL;
+ SecondNode->OldPointsTo = NULL;
+
+ NumUnified++;
+ DOUT << "Unified Node ";
+ DEBUG(PrintNode(FirstNode));
+ DOUT << " and Node ";
+ DEBUG(PrintNode(SecondNode));
+ DOUT << "\n";
+
+ if (SDTActive)
+ if (SDT[Second] >= 0) {
+ if (SDT[First] < 0)
+ SDT[First] = SDT[Second];
+ else {
+ UniteNodes( FindNode(SDT[First]), FindNode(SDT[Second]) );
+ First = FindNode(First);
+ }
+ }
+
+ return First;
+}
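+
+// Usage sketch (hypothetical indices A and B, for illustration):
+//
+//   unsigned RA = FindNode(A), RB = FindNode(B);
+//   if (RA != RB)
+//     RA = UniteNodes(RA, RB);
+//
+// Afterwards FindNode(A) == FindNode(B) == RA, and only GraphNodes[RA]
+// still owns points-to bits, edges and constraints; the merged-from node
+// was emptied above.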
+
+// Find the index into GraphNodes of the node representing Node, performing
+// path compression along the way
+unsigned Andersens::FindNode(unsigned NodeIndex) {
+ assert (NodeIndex < GraphNodes.size()
+ && "Attempting to find a node that can't exist");
+ Node *N = &GraphNodes[NodeIndex];
+ if (N->isRep())
+ return NodeIndex;
+ else
+ return (N->NodeRep = FindNode(N->NodeRep));
+}
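+
+// For example, if earlier unions left the chain 5 -> 9 -> 12 (with 12 the
+// representative), FindNode(5) returns 12 and rewrites the NodeRep of both
+// 5 and 9 to point directly at 12, so later lookups are constant time.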
+
+// Find the index into GraphNodes of the node representing Node, without
+// performing path compression along the way (used by the const printing code)
+unsigned Andersens::FindNode(unsigned NodeIndex) const {
+ assert (NodeIndex < GraphNodes.size()
+ && "Attempting to find a node that can't exist");
+ const Node *N = &GraphNodes[NodeIndex];
+ if (N->isRep())
+ return NodeIndex;
+ else
+ return FindNode(N->NodeRep);
+}
+
+//===----------------------------------------------------------------------===//
+// Debugging Output
+//===----------------------------------------------------------------------===//
+
+void Andersens::PrintNode(const Node *N) const {
+ if (N == &GraphNodes[UniversalSet]) {
+ cerr << "<universal>";
+ return;
+ } else if (N == &GraphNodes[NullPtr]) {
+ cerr << "<nullptr>";
+ return;
+ } else if (N == &GraphNodes[NullObject]) {
+ cerr << "<null>";
+ return;
+ }
+ if (!N->getValue()) {
+ cerr << "artificial" << (intptr_t) N;
+ return;
+ }
+
+ assert(N->getValue() != 0 && "Never set node label!");
+ Value *V = N->getValue();
+ if (Function *F = dyn_cast<Function>(V)) {
+ if (isa<PointerType>(F->getFunctionType()->getReturnType()) &&
+ N == &GraphNodes[getReturnNode(F)]) {
+ cerr << F->getName() << ":retval";
+ return;
+ } else if (F->getFunctionType()->isVarArg() &&
+ N == &GraphNodes[getVarargNode(F)]) {
+ cerr << F->getName() << ":vararg";
+ return;
+ }
+ }
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ cerr << I->getParent()->getParent()->getName() << ":";
+ else if (Argument *Arg = dyn_cast<Argument>(V))
+ cerr << Arg->getParent()->getName() << ":";
+
+ if (V->hasName())
+ cerr << V->getName();
+ else
+ cerr << "(unnamed)";
+
+ if (isa<GlobalValue>(V) || isa<AllocationInst>(V))
+ if (N == &GraphNodes[getObject(V)])
+ cerr << "<mem>";
+}
+void Andersens::PrintConstraint(const Constraint &C) const {
+ if (C.Type == Constraint::Store) {
+ cerr << "*";
+ if (C.Offset != 0)
+ cerr << "(";
+ }
+ PrintNode(&GraphNodes[C.Dest]);
+ if (C.Type == Constraint::Store && C.Offset != 0)
+ cerr << " + " << C.Offset << ")";
+ cerr << " = ";
+ if (C.Type == Constraint::Load) {
+ cerr << "*";
+ if (C.Offset != 0)
+ cerr << "(";
+ }
+ else if (C.Type == Constraint::AddressOf)
+ cerr << "&";
+ PrintNode(&GraphNodes[C.Src]);
+ if (C.Offset != 0 && C.Type != Constraint::Store)
+ cerr << " + " << C.Offset;
+ if (C.Type == Constraint::Load && C.Offset != 0)
+ cerr << ")";
+ cerr << "\n";
+}
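+
+// For reference, the forms printed above look like:
+//   AddressOf: A = &B
+//   Copy: A = B + 4
+//   Load: A = *(B + 4)
+//   Store: *(A + 4) = B
+// (offsets are omitted when zero).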
+
+void Andersens::PrintConstraints() const {
+ cerr << "Constraints:\n";
+
+ for (unsigned i = 0, e = Constraints.size(); i != e; ++i)
+ PrintConstraint(Constraints[i]);
+}
+
+void Andersens::PrintPointsToGraph() const {
+ cerr << "Points-to graph:\n";
+ for (unsigned i = 0, e = GraphNodes.size(); i != e; ++i) {
+ const Node *N = &GraphNodes[i];
+ if (FindNode(i) != i) {
+ PrintNode(N);
+ cerr << "\t--> same as ";
+ PrintNode(&GraphNodes[FindNode(i)]);
+ cerr << "\n";
+ } else {
+ cerr << "[" << (N->PointsTo->count()) << "] ";
+ PrintNode(N);
+ cerr << "\t--> ";
+
+ bool first = true;
+ for (SparseBitVector<>::iterator bi = N->PointsTo->begin();
+ bi != N->PointsTo->end();
+ ++bi) {
+ if (!first)
+ cerr << ", ";
+ PrintNode(&GraphNodes[*bi]);
+ first = false;
+ }
+ cerr << "\n";
+ }
+ }
+}
diff --git a/lib/Analysis/IPA/CMakeLists.txt b/lib/Analysis/IPA/CMakeLists.txt
new file mode 100644
index 0000000..1ebb0be
--- /dev/null
+++ b/lib/Analysis/IPA/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_llvm_library(LLVMipa
+ Andersens.cpp
+ CallGraph.cpp
+ CallGraphSCCPass.cpp
+ FindUsedTypes.cpp
+ GlobalsModRef.cpp
+ )
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
new file mode 100644
index 0000000..6dabcdb
--- /dev/null
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -0,0 +1,314 @@
+//===- CallGraph.cpp - Build a Module's call graph ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CallGraph class and provides the BasicCallGraph
+// default implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+#include <ostream>
+using namespace llvm;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// BasicCallGraph class definition
+//
+class VISIBILITY_HIDDEN BasicCallGraph : public CallGraph, public ModulePass {
+ // Root is root of the call graph, or the external node if a 'main' function
+ // couldn't be found.
+ //
+ CallGraphNode *Root;
+
+ // ExternalCallingNode - This node has edges to all external functions and
+ // those internal functions that have their address taken.
+ CallGraphNode *ExternalCallingNode;
+
+ // CallsExternalNode - This node has edges to it from all functions making
+ // indirect calls or calling an external function.
+ CallGraphNode *CallsExternalNode;
+
+public:
+ static char ID; // Class identification, replacement for typeinfo
+ BasicCallGraph() : ModulePass(&ID), Root(0),
+ ExternalCallingNode(0), CallsExternalNode(0) {}
+
+ // runOnModule - Compute the call graph for the specified module.
+ virtual bool runOnModule(Module &M) {
+ CallGraph::initialize(M);
+
+ ExternalCallingNode = getOrInsertFunction(0);
+ CallsExternalNode = new CallGraphNode(0);
+ Root = 0;
+
+ // Add every function to the call graph...
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ addToCallGraph(I);
+
+ // If we didn't find a main function, use the external call graph node
+ if (Root == 0) Root = ExternalCallingNode;
+
+ return false;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+
+ void print(std::ostream *o, const Module *M) const {
+ if (o) print(*o, M);
+ }
+
+ virtual void print(std::ostream &o, const Module *M) const {
+ o << "CallGraph Root is: ";
+ if (Function *F = getRoot()->getFunction())
+ o << F->getName() << "\n";
+ else
+ o << "<<null function: 0x" << getRoot() << ">>\n";
+
+ CallGraph::print(o, M);
+ }
+
+ virtual void releaseMemory() {
+ destroy();
+ }
+
+ /// dump - Print out this call graph.
+ ///
+ inline void dump() const {
+ print(cerr, Mod);
+ }
+
+ CallGraphNode* getExternalCallingNode() const { return ExternalCallingNode; }
+ CallGraphNode* getCallsExternalNode() const { return CallsExternalNode; }
+
+ // getRoot - Return the root of the call graph, which is either main, or if
+ // main cannot be found, the external node.
+ //
+ CallGraphNode *getRoot() { return Root; }
+ const CallGraphNode *getRoot() const { return Root; }
+
+private:
+ //===---------------------------------------------------------------------
+ // Implementation of CallGraph construction
+ //
+
+ // addToCallGraph - Add a function to the call graph, and link the node to all
+ // of the functions that it calls.
+ //
+ void addToCallGraph(Function *F) {
+ CallGraphNode *Node = getOrInsertFunction(F);
+
+ // If this function has external linkage, anything could call it.
+ if (!F->hasLocalLinkage()) {
+ ExternalCallingNode->addCalledFunction(CallSite(), Node);
+
+ // Found the entry point?
+ if (F->getName() == "main") {
+ if (Root) // Found multiple external mains? Don't pick one.
+ Root = ExternalCallingNode;
+ else
+ Root = Node; // Found a main, keep track of it!
+ }
+ }
+
+ // Loop over all of the users of the function, looking for non-call uses.
+ for (Value::use_iterator I = F->use_begin(), E = F->use_end(); I != E; ++I)
+ if ((!isa<CallInst>(I) && !isa<InvokeInst>(I))
+ || !CallSite(cast<Instruction>(I)).isCallee(I)) {
+ // Not a call, or being used as a parameter rather than as the callee.
+ ExternalCallingNode->addCalledFunction(CallSite(), Node);
+ break;
+ }
+
+ // If this function is not defined in this translation unit, it could call
+ // anything.
+ if (F->isDeclaration() && !F->isIntrinsic())
+ Node->addCalledFunction(CallSite(), CallsExternalNode);
+
+ // Look for calls by this function.
+ for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
+ II != IE; ++II) {
+ CallSite CS = CallSite::get(II);
+ if (CS.getInstruction() && !isa<DbgInfoIntrinsic>(II)) {
+ const Function *Callee = CS.getCalledFunction();
+ if (Callee)
+ Node->addCalledFunction(CS, getOrInsertFunction(Callee));
+ else
+ Node->addCalledFunction(CS, CallsExternalNode);
+ }
+ }
+ }
+
+ //
+ // destroy - Release memory for the call graph
+ virtual void destroy() {
+ /// CallsExternalNode is not in the function map, delete it explicitly.
+ delete CallsExternalNode;
+ CallsExternalNode = 0;
+ CallGraph::destroy();
+ }
+};
+
+} //End anonymous namespace
+
+static RegisterAnalysisGroup<CallGraph> X("Call Graph");
+static RegisterPass<BasicCallGraph>
+Y("basiccg", "Basic CallGraph Construction", false, true);
+static RegisterAnalysisGroup<CallGraph, true> Z(Y);
+
+char CallGraph::ID = 0;
+char BasicCallGraph::ID = 0;
+
+void CallGraph::initialize(Module &M) {
+ Mod = &M;
+}
+
+void CallGraph::destroy() {
+ if (!FunctionMap.empty()) {
+ for (FunctionMapTy::iterator I = FunctionMap.begin(), E = FunctionMap.end();
+ I != E; ++I)
+ delete I->second;
+ FunctionMap.clear();
+ }
+}
+
+void CallGraph::print(std::ostream &OS, const Module *M) const {
+ for (CallGraph::const_iterator I = begin(), E = end(); I != E; ++I)
+ I->second->print(OS);
+}
+
+void CallGraph::dump() const {
+ print(cerr, 0);
+}
+
+//===----------------------------------------------------------------------===//
+// Implementations of public modification methods
+//
+
+// removeFunctionFromModule - Unlink the function from this module, returning
+// it. Because this removes the function from the module, the call graph node
+// is destroyed. This is only valid if the function does not call any other
+// functions (i.e., there are no edges in its CGN). The easiest way to do this
+// is to dropAllReferences before calling this.
+//
+Function *CallGraph::removeFunctionFromModule(CallGraphNode *CGN) {
+ assert(CGN->CalledFunctions.empty() && "Cannot remove function from call "
+ "graph if it references other functions!");
+ Function *F = CGN->getFunction(); // Get the function for the call graph node
+ delete CGN; // Delete the call graph node for this func
+ FunctionMap.erase(F); // Remove the call graph node from the map
+
+ Mod->getFunctionList().remove(F);
+ return F;
+}
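+
+// Usage sketch (illustrative; assumes the node's call edges have already
+// been removed, e.g. via removeCallEdgeFor on each call site):
+//
+//   DeadF->dropAllReferences();               // detach the body first
+//   delete CG.removeFunctionFromModule(CGN);  // unlink and free it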
+
+// changeFunction - This method changes the function associated with this
+// CallGraphNode, for use by transformations that need to change the prototype
+// of a Function (thus they must create a new Function and move the old code
+// over).
+void CallGraph::changeFunction(Function *OldF, Function *NewF) {
+ iterator I = FunctionMap.find(OldF);
+ CallGraphNode *&New = FunctionMap[NewF];
+ assert(I != FunctionMap.end() && I->second && !New &&
+ "OldF didn't exist in CG or NewF already does!");
+ New = I->second;
+ New->F = NewF;
+ FunctionMap.erase(I);
+}
+
+// getOrInsertFunction - This method is identical to calling operator[], but
+// it will insert a new CallGraphNode for the specified function if one does
+// not already exist.
+CallGraphNode *CallGraph::getOrInsertFunction(const Function *F) {
+ CallGraphNode *&CGN = FunctionMap[F];
+ if (CGN) return CGN;
+
+ assert((!F || F->getParent() == Mod) && "Function not in current module!");
+ return CGN = new CallGraphNode(const_cast<Function*>(F));
+}
+
+void CallGraphNode::print(std::ostream &OS) const {
+ if (Function *F = getFunction())
+ OS << "Call graph node for function: '" << F->getName() <<"'\n";
+ else
+ OS << "Call graph node <<null function: 0x" << this << ">>:\n";
+
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ if (Function *FI = I->second->getFunction())
+ OS << " Calls function '" << FI->getName() <<"'\n";
+ else
+ OS << " Calls external node\n";
+ OS << "\n";
+}
+
+void CallGraphNode::dump() const { print(cerr); }
+
+/// removeCallEdgeFor - This method removes the edge in the node for the
+/// specified call site. Note that this method takes linear time, so it
+/// should be used sparingly.
+void CallGraphNode::removeCallEdgeFor(CallSite CS) {
+ for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
+ assert(I != CalledFunctions.end() && "Cannot find callsite to remove!");
+ if (I->first == CS) {
+ CalledFunctions.erase(I);
+ return;
+ }
+ }
+}
+
+
+// removeAnyCallEdgeTo - This method removes any call edges from this node to
+// the specified callee function. This takes more time to execute than
+// removeCallEdgeTo, so it should not be used unless necessary.
+void CallGraphNode::removeAnyCallEdgeTo(CallGraphNode *Callee) {
+ for (unsigned i = 0, e = CalledFunctions.size(); i != e; ++i)
+ if (CalledFunctions[i].second == Callee) {
+ CalledFunctions[i] = CalledFunctions.back();
+ CalledFunctions.pop_back();
+ --i; --e;
+ }
+}
+
+/// removeOneAbstractEdgeTo - Remove one edge associated with a null callsite
+/// from this node to the specified callee function.
+void CallGraphNode::removeOneAbstractEdgeTo(CallGraphNode *Callee) {
+ for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
+ assert(I != CalledFunctions.end() && "Cannot find callee to remove!");
+ CallRecord &CR = *I;
+ if (CR.second == Callee && !CR.first.getInstruction()) {
+ CalledFunctions.erase(I);
+ return;
+ }
+ }
+}
+
+/// replaceCallSite - Make the edge in the node for Old CallSite be for
+/// New CallSite instead. Note that this method takes linear time, so it
+/// should be used sparingly.
+void CallGraphNode::replaceCallSite(CallSite Old, CallSite New) {
+ for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
+ assert(I != CalledFunctions.end() && "Cannot find callsite to replace!");
+ if (I->first == Old) {
+ I->first = New;
+ return;
+ }
+ }
+}
+
+// Ensure that users of CallGraph.h also link with this file
+DEFINING_FILE_FOR(CallGraph)
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
new file mode 100644
index 0000000..3880d0a
--- /dev/null
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -0,0 +1,207 @@
+//===- CallGraphSCCPass.cpp - Pass that operates BU on call graph ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CallGraphSCCPass class, which is used for passes
+// which are implemented as bottom-up traversals on the call graph. Because
+// there may be cycles in the call graph, passes of this type operate on the
+// call-graph in SCC order: that is, they process functions bottom-up, except
+// recursive functions, which they process all at once.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/PassManagers.h"
+#include "llvm/Function.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// CGPassManager
+//
+/// CGPassManager manages FPPassManagers and CallGraphSCCPasses.
+
+namespace {
+
+class CGPassManager : public ModulePass, public PMDataManager {
+
+public:
+ static char ID;
+ explicit CGPassManager(int Depth)
+ : ModulePass(&ID), PMDataManager(Depth) { }
+
+ /// run - Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the module, and if so, return true.
+ bool runOnModule(Module &M);
+
+ bool doInitialization(CallGraph &CG);
+ bool doFinalization(CallGraph &CG);
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ // CGPassManager walks SCC and it needs CallGraph.
+ Info.addRequired<CallGraph>();
+ Info.setPreservesAll();
+ }
+
+ virtual const char *getPassName() const {
+ return "CallGraph Pass Manager";
+ }
+
+ // Print passes managed by this manager
+ void dumpPassStructure(unsigned Offset) {
+ llvm::cerr << std::string(Offset*2, ' ') << "Call Graph SCC Pass Manager\n";
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+ P->dumpPassStructure(Offset + 1);
+ dumpLastUses(P, Offset+1);
+ }
+ }
+
+ Pass *getContainedPass(unsigned N) {
+ assert ( N < PassVector.size() && "Pass number out of range!");
+ Pass *FP = static_cast<Pass *>(PassVector[N]);
+ return FP;
+ }
+
+ virtual PassManagerType getPassManagerType() const {
+ return PMT_CallGraphPassManager;
+ }
+};
+
+}
+
+char CGPassManager::ID = 0;
+/// run - Execute all of the passes scheduled for execution. Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool CGPassManager::runOnModule(Module &M) {
+ CallGraph &CG = getAnalysis<CallGraph>();
+ bool Changed = doInitialization(CG);
+
+ // Walk SCC
+ for (scc_iterator<CallGraph*> I = scc_begin(&CG), E = scc_end(&CG);
+ I != E; ++I) {
+
+ // Run all passes on current SCC
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+
+ dumpPassInfo(P, EXECUTION_MSG, ON_CG_MSG, "");
+ dumpRequiredSet(P);
+
+ initializeAnalysisImpl(P);
+
+ StartPassTimer(P);
+ if (CallGraphSCCPass *CGSP = dynamic_cast<CallGraphSCCPass *>(P))
+ Changed |= CGSP->runOnSCC(*I); // TODO : What if CG is changed ?
+ else {
+ FPPassManager *FPP = dynamic_cast<FPPassManager *>(P);
+ assert (FPP && "Invalid CGPassManager member");
+
+ // Run pass P on all functions current SCC
+ std::vector<CallGraphNode*> &SCC = *I;
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ Function *F = SCC[i]->getFunction();
+ if (F) {
+ dumpPassInfo(P, EXECUTION_MSG, ON_FUNCTION_MSG, F->getNameStart());
+ Changed |= FPP->runOnFunction(*F);
+ }
+ }
+ }
+ StopPassTimer(P);
+
+ if (Changed)
+ dumpPassInfo(P, MODIFICATION_MSG, ON_CG_MSG, "");
+ dumpPreservedSet(P);
+
+ verifyPreservedAnalysis(P);
+ removeNotPreservedAnalysis(P);
+ recordAvailableAnalysis(P);
+ removeDeadPasses(P, "", ON_CG_MSG);
+ }
+ }
+ Changed |= doFinalization(CG);
+ return Changed;
+}
+
+/// Initialize CG
+bool CGPassManager::doInitialization(CallGraph &CG) {
+ bool Changed = false;
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+ if (CallGraphSCCPass *CGSP = dynamic_cast<CallGraphSCCPass *>(P)) {
+ Changed |= CGSP->doInitialization(CG);
+ } else {
+ FPPassManager *FP = dynamic_cast<FPPassManager *>(P);
+ assert (FP && "Invalid CGPassManager member");
+ Changed |= FP->doInitialization(CG.getModule());
+ }
+ }
+ return Changed;
+}
+
+/// Finalize CG
+bool CGPassManager::doFinalization(CallGraph &CG) {
+ bool Changed = false;
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+ if (CallGraphSCCPass *CGSP = dynamic_cast<CallGraphSCCPass *>(P)) {
+ Changed |= CGSP->doFinalization(CG);
+ } else {
+ FPPassManager *FP = dynamic_cast<FPPassManager *>(P);
+ assert (FP && "Invalid CGPassManager member");
+ Changed |= FP->doFinalization(CG.getModule());
+ }
+ }
+ return Changed;
+}
+
+/// Assign pass manager to manage this pass.
+void CallGraphSCCPass::assignPassManager(PMStack &PMS,
+ PassManagerType PreferredType) {
+ // Find CGPassManager
+ while (!PMS.empty() &&
+ PMS.top()->getPassManagerType() > PMT_CallGraphPassManager)
+ PMS.pop();
+
+ assert (!PMS.empty() && "Unable to handle Call Graph Pass");
+ CGPassManager *CGP = dynamic_cast<CGPassManager *>(PMS.top());
+
+ // Create new Call Graph SCC Pass Manager if it does not exist.
+ if (!CGP) {
+
+ assert (!PMS.empty() && "Unable to create Call Graph Pass Manager");
+ PMDataManager *PMD = PMS.top();
+
+ // [1] Create new Call Graph Pass Manager
+ CGP = new CGPassManager(PMD->getDepth() + 1);
+
+ // [2] Set up new manager's top level manager
+ PMTopLevelManager *TPM = PMD->getTopLevelManager();
+ TPM->addIndirectPassManager(CGP);
+
+ // [3] Assign manager to manage this new manager. This may create
+ // and push new managers into PMS
+ Pass *P = dynamic_cast<Pass *>(CGP);
+ TPM->schedulePass(P);
+
+ // [4] Push new manager into PMS
+ PMS.push(CGP);
+ }
+
+ CGP->add(this);
+}
+
+/// getAnalysisUsage - For this class, we declare that we require and preserve
+/// the call graph. If the derived class implements this method, it should
+/// always explicitly call the implementation here.
+void CallGraphSCCPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<CallGraph>();
+ AU.addPreserved<CallGraph>();
+}
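+
+// A minimal client sketch (hypothetical pass; runOnSCC takes the SCC's
+// CallGraphNode vector, matching the call in CGPassManager above):
+//
+//   namespace {
+//     struct PrintSCCSizes : public CallGraphSCCPass {
+//       static char ID;
+//       PrintSCCSizes() : CallGraphSCCPass(&ID) {}
+//       virtual bool runOnSCC(std::vector<CallGraphNode*> &SCC) {
+//         llvm::cerr << "SCC of size " << SCC.size() << "\n";
+//         return false; // the call graph was not modified
+//       }
+//     };
+//   }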
diff --git a/lib/Analysis/IPA/FindUsedTypes.cpp b/lib/Analysis/IPA/FindUsedTypes.cpp
new file mode 100644
index 0000000..920ee37
--- /dev/null
+++ b/lib/Analysis/IPA/FindUsedTypes.cpp
@@ -0,0 +1,104 @@
+//===- FindUsedTypes.cpp - Find all Types used by a module ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to seek out all of the types in use by the program. Note
+// that this analysis explicitly does not include types only used by the symbol
+// table.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+char FindUsedTypes::ID = 0;
+static RegisterPass<FindUsedTypes>
+X("print-used-types", "Find Used Types", false, true);
+
+// IncorporateType - Incorporate one type and all of its subtypes into the
+// collection of used types.
+//
+void FindUsedTypes::IncorporateType(const Type *Ty) {
+ // If Ty doesn't already exist in the used types set, add it now, otherwise
+ // return.
+ if (!UsedTypes.insert(Ty).second) return; // Already contains Ty.
+
+ // Make sure to add any types this type references now.
+ //
+ for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+ I != E; ++I)
+ IncorporateType(*I);
+}
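+
+// For example, incorporating the type "{ i32, [4 x i8*] }" also records
+// i32, [4 x i8*], i8* and i8, since each is reachable through the
+// subtype iterators above.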
+
+void FindUsedTypes::IncorporateValue(const Value *V) {
+ IncorporateType(V->getType());
+
+ // If this is a constant, it could be using other types...
+ if (const Constant *C = dyn_cast<Constant>(V)) {
+ if (!isa<GlobalValue>(C))
+ for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
+ OI != OE; ++OI)
+ IncorporateValue(*OI);
+ }
+}
+
+
+// run - This incorporates all types used by the specified module
+//
+bool FindUsedTypes::runOnModule(Module &m) {
+ UsedTypes.clear(); // reset if run multiple times...
+
+ // Loop over global variables, incorporating their types
+ for (Module::const_global_iterator I = m.global_begin(), E = m.global_end();
+ I != E; ++I) {
+ IncorporateType(I->getType());
+ if (I->hasInitializer())
+ IncorporateValue(I->getInitializer());
+ }
+
+ for (Module::iterator MI = m.begin(), ME = m.end(); MI != ME; ++MI) {
+ IncorporateType(MI->getType());
+ const Function &F = *MI;
+
+ // Loop over all of the instructions in the function, adding their return
+ // type as well as the types of their operands.
+ //
+ for (const_inst_iterator II = inst_begin(F), IE = inst_end(F);
+ II != IE; ++II) {
+ const Instruction &I = *II;
+
+ IncorporateType(I.getType()); // Incorporate the type of the instruction
+ for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
+ OI != OE; ++OI)
+ IncorporateValue(*OI); // Insert inst operand types as well
+ }
+ }
+
+ return false;
+}
+
+// Print the types found in the module. If the optional Module parameter is
+// passed in, then the types are printed symbolically if possible, using the
+// symbol table from the module.
+//
+void FindUsedTypes::print(std::ostream &OS, const Module *M) const {
+ raw_os_ostream RO(OS);
+ RO << "Types in use by this module:\n";
+ for (std::set<const Type *>::const_iterator I = UsedTypes.begin(),
+ E = UsedTypes.end(); I != E; ++I) {
+ RO << " ";
+ WriteTypeSymbolic(RO, *I, M);
+ RO << '\n';
+ }
+}
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
new file mode 100644
index 0000000..2e9884a
--- /dev/null
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -0,0 +1,567 @@
+//===- GlobalsModRef.cpp - Simple Mod/Ref Analysis for Globals ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This simple pass provides alias and mod/ref information for global values
+// that do not have their address taken, and keeps track of whether functions
+// read or write memory (are "pure"). For this simple (but very common) case,
+// we can provide pretty accurate and useful information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globalsmodref-aa"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SCCIterator.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumNonAddrTakenGlobalVars,
+ "Number of global vars without address taken");
+STATISTIC(NumNonAddrTakenFunctions,"Number of functions without address taken");
+STATISTIC(NumNoMemFunctions, "Number of functions that do not access memory");
+STATISTIC(NumReadMemFunctions, "Number of functions that only read memory");
+STATISTIC(NumIndirectGlobalVars, "Number of indirect global objects");
+
+namespace {
+ /// FunctionRecord - One instance of this structure is stored for every
+ /// function in the program. Later, the entries for these functions are
+ /// removed if the function is found to call an external function (in which
+ /// case we know nothing about it).
+ struct VISIBILITY_HIDDEN FunctionRecord {
+ /// GlobalInfo - Maintain mod/ref info for all of the globals without
+ /// addresses taken that are read or written (transitively) by this
+ /// function.
+ std::map<GlobalValue*, unsigned> GlobalInfo;
+
+ /// MayReadAnyGlobal - May read global variables, but it is not known which.
+ bool MayReadAnyGlobal;
+
+ unsigned getInfoForGlobal(GlobalValue *GV) const {
+ unsigned Effect = MayReadAnyGlobal ? AliasAnalysis::Ref : 0;
+ std::map<GlobalValue*, unsigned>::const_iterator I = GlobalInfo.find(GV);
+ if (I != GlobalInfo.end())
+ Effect |= I->second;
+ return Effect;
+ }
+
+ /// FunctionEffect - Capture whether or not this function reads or writes to
+ /// ANY memory. If not, we can do a lot of aggressive analysis on it.
+ unsigned FunctionEffect;
+
+ FunctionRecord() : MayReadAnyGlobal (false), FunctionEffect(0) {}
+ };
+
+ /// GlobalsModRef - The actual analysis pass.
+ class VISIBILITY_HIDDEN GlobalsModRef
+ : public ModulePass, public AliasAnalysis {
+ /// NonAddressTakenGlobals - The globals that do not have their addresses
+ /// taken.
+ std::set<GlobalValue*> NonAddressTakenGlobals;
+
+ /// IndirectGlobals - The memory pointed to by this global is known to be
+ /// 'owned' by the global.
+ std::set<GlobalValue*> IndirectGlobals;
+
+ /// AllocsForIndirectGlobals - If an instruction allocates memory for an
+ /// indirect global, this map indicates which one.
+ std::map<Value*, GlobalValue*> AllocsForIndirectGlobals;
+
+ /// FunctionInfo - For each function, keep track of what globals are
+ /// modified or read.
+ std::map<Function*, FunctionRecord> FunctionInfo;
+
+ public:
+ static char ID;
+ GlobalsModRef() : ModulePass(&ID) {}
+
+ bool runOnModule(Module &M) {
+ InitializeAliasAnalysis(this); // set up super class
+ AnalyzeGlobals(M); // find non-addr taken globals
+ AnalyzeCallGraph(getAnalysis<CallGraph>(), M); // Propagate on CG
+ return false;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AliasAnalysis::getAnalysisUsage(AU);
+ AU.addRequired<CallGraph>();
+ AU.setPreservesAll(); // Does not transform code
+ }
+
+ //------------------------------------------------
+ // Implement the AliasAnalysis API
+ //
+ AliasResult alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size);
+ ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size);
+ ModRefResult getModRefInfo(CallSite CS1, CallSite CS2) {
+ return AliasAnalysis::getModRefInfo(CS1,CS2);
+ }
+ bool hasNoModRefInfoForCalls() const { return false; }
+
+ /// getModRefBehavior - Return the behavior of the specified function,
+ /// independent of any particular call site.
+ ModRefBehavior getModRefBehavior(Function *F,
+ std::vector<PointerAccessInfo> *Info) {
+ if (FunctionRecord *FR = getFunctionInfo(F)) {
+ if (FR->FunctionEffect == 0)
+ return DoesNotAccessMemory;
+ else if ((FR->FunctionEffect & Mod) == 0)
+ return OnlyReadsMemory;
+ }
+ return AliasAnalysis::getModRefBehavior(F, Info);
+ }
+
+ /// getModRefBehavior - Return the behavior of the specified function if
+ /// called from the specified call site. The call site may be null in which
+ /// case the most generic behavior of this function should be returned.
+ ModRefBehavior getModRefBehavior(CallSite CS,
+ std::vector<PointerAccessInfo> *Info) {
+ Function* F = CS.getCalledFunction();
+ if (!F) return AliasAnalysis::getModRefBehavior(CS, Info);
+ if (FunctionRecord *FR = getFunctionInfo(F)) {
+ if (FR->FunctionEffect == 0)
+ return DoesNotAccessMemory;
+ else if ((FR->FunctionEffect & Mod) == 0)
+ return OnlyReadsMemory;
+ }
+ return AliasAnalysis::getModRefBehavior(CS, Info);
+ }
+
+ virtual void deleteValue(Value *V);
+ virtual void copyValue(Value *From, Value *To);
+
+ private:
+ /// getFunctionInfo - Return the function info for the function, or null if
+ /// we don't have anything useful to say about it.
+ FunctionRecord *getFunctionInfo(Function *F) {
+ std::map<Function*, FunctionRecord>::iterator I = FunctionInfo.find(F);
+ if (I != FunctionInfo.end())
+ return &I->second;
+ return 0;
+ }
+
+ void AnalyzeGlobals(Module &M);
+ void AnalyzeCallGraph(CallGraph &CG, Module &M);
+ bool AnalyzeUsesOfPointer(Value *V, std::vector<Function*> &Readers,
+ std::vector<Function*> &Writers,
+ GlobalValue *OkayStoreDest = 0);
+ bool AnalyzeIndirectGlobalMemory(GlobalValue *GV);
+ };
+}
+
+char GlobalsModRef::ID = 0;
+static RegisterPass<GlobalsModRef>
+X("globalsmodref-aa", "Simple mod/ref analysis for globals", false, true);
+static RegisterAnalysisGroup<AliasAnalysis> Y(X);
+
+Pass *llvm::createGlobalsModRefPass() { return new GlobalsModRef(); }
+
+/// AnalyzeGlobals - Scan through the users of all of the internal
+/// GlobalValue's in the program. If none of them have their "address taken"
+/// (really, their address passed to something nontrivial), record this fact,
+/// and record the functions that they are used directly in.
+void GlobalsModRef::AnalyzeGlobals(Module &M) {
+ std::vector<Function*> Readers, Writers;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (I->hasLocalLinkage()) {
+ if (!AnalyzeUsesOfPointer(I, Readers, Writers)) {
+ // Remember that we are tracking this global.
+ NonAddressTakenGlobals.insert(I);
+ ++NumNonAddrTakenFunctions;
+ }
+ Readers.clear(); Writers.clear();
+ }
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ if (I->hasLocalLinkage()) {
+ if (!AnalyzeUsesOfPointer(I, Readers, Writers)) {
+ // Remember that we are tracking this global, and the mod/ref fns
+ NonAddressTakenGlobals.insert(I);
+
+ for (unsigned i = 0, e = Readers.size(); i != e; ++i)
+ FunctionInfo[Readers[i]].GlobalInfo[I] |= Ref;
+
+ if (!I->isConstant()) // No need to keep track of writers to constants
+ for (unsigned i = 0, e = Writers.size(); i != e; ++i)
+ FunctionInfo[Writers[i]].GlobalInfo[I] |= Mod;
+ ++NumNonAddrTakenGlobalVars;
+
+ // If this global holds a pointer type, see if it is an indirect global.
+ if (isa<PointerType>(I->getType()->getElementType()) &&
+ AnalyzeIndirectGlobalMemory(I))
+ ++NumIndirectGlobalVars;
+ }
+ Readers.clear(); Writers.clear();
+ }
+}
+
+/// AnalyzeUsesOfPointer - Look at all of the users of the specified pointer.
+/// If this is used by anything complex (i.e., the address escapes), return
+/// true. Also, while we are at it, keep track of those functions that read and
+/// write to the value.
+///
+/// If OkayStoreDest is non-null, stores into this global are allowed.
+bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
+ std::vector<Function*> &Readers,
+ std::vector<Function*> &Writers,
+ GlobalValue *OkayStoreDest) {
+ if (!isa<PointerType>(V->getType())) return true;
+
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ Readers.push_back(LI->getParent()->getParent());
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+ if (V == SI->getOperand(1)) {
+ Writers.push_back(SI->getParent()->getParent());
+ } else if (SI->getOperand(1) != OkayStoreDest) {
+ return true; // Storing the pointer
+ }
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) {
+ if (AnalyzeUsesOfPointer(GEP, Readers, Writers)) return true;
+ } else if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+ // Make sure that this is just the function being called, not that it is
+ // passing into the function.
+ for (unsigned i = 1, e = CI->getNumOperands(); i != e; ++i)
+ if (CI->getOperand(i) == V) return true;
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) {
+ // Make sure that this is just the function being called, not that it is
+ // passing into the function.
+ for (unsigned i = 3, e = II->getNumOperands(); i != e; ++i)
+ if (II->getOperand(i) == V) return true;
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(*UI)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr ||
+ CE->getOpcode() == Instruction::BitCast) {
+ if (AnalyzeUsesOfPointer(CE, Readers, Writers))
+ return true;
+ } else {
+ return true;
+ }
+ } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(*UI)) {
+ if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+ return true; // Allow comparison against null.
+ } else if (FreeInst *F = dyn_cast<FreeInst>(*UI)) {
+ Writers.push_back(F->getParent()->getParent());
+ } else {
+ return true;
+ }
+ return false;
+}
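+
+// For example (illustrative IR): "%v = load i32* @g" records a reader and
+// "store i32 0, i32* @g" records a writer, while "store i32* @g, i32** %p"
+// stores the pointer itself and is therefore treated as escaping.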
+
+/// AnalyzeIndirectGlobalMemory - We found a non-address-taken global variable
+/// which holds a pointer type. See if the global always points to non-aliased
+/// heap memory: that is, all initializers of the globals are allocations, and
+/// those allocations have no use other than initialization of the global.
+/// Further, all loads out of GV must directly use the memory, not store the
+/// pointer somewhere. If this is true, we consider the memory pointed to by
+/// GV to be owned by GV and can disambiguate other pointers from it.
+bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
+ // Keep track of values related to the allocation of the memory, e.g. the
+ // value produced by the malloc call and any casts.
+ std::vector<Value*> AllocRelatedValues;
+
+ // Walk the user list of the global. If we find anything other than a direct
+ // load or store, bail out.
+ for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I){
+ if (LoadInst *LI = dyn_cast<LoadInst>(*I)) {
+ // The pointer loaded from the global can only be used in simple ways:
+ // we allow addressing of it and loading/storing through it. We do *not* allow
+ // storing the loaded pointer somewhere else or passing to a function.
+ std::vector<Function*> ReadersWriters;
+ if (AnalyzeUsesOfPointer(LI, ReadersWriters, ReadersWriters))
+ return false; // Loaded pointer escapes.
+ // TODO: Could try some IP mod/ref of the loaded pointer.
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(*I)) {
+ // Storing the global itself.
+ if (SI->getOperand(0) == GV) return false;
+
+ // If storing the null pointer, ignore it.
+ if (isa<ConstantPointerNull>(SI->getOperand(0)))
+ continue;
+
+ // Check the value being stored.
+ Value *Ptr = SI->getOperand(0)->getUnderlyingObject();
+
+ if (isa<MallocInst>(Ptr)) {
+ // Okay, easy case.
+ } else if (CallInst *CI = dyn_cast<CallInst>(Ptr)) {
+ Function *F = CI->getCalledFunction();
+ if (!F || !F->isDeclaration()) return false; // Too hard to analyze.
+ if (F->getName() != "calloc") return false; // Not calloc.
+ } else {
+ return false; // Too hard to analyze.
+ }
+
+ // Analyze all uses of the allocation. If any of them are used in a
+ // non-simple way (e.g. stored to another global) bail out.
+ std::vector<Function*> ReadersWriters;
+ if (AnalyzeUsesOfPointer(Ptr, ReadersWriters, ReadersWriters, GV))
+ return false; // Loaded pointer escapes.
+
+ // Remember that this allocation is related to the indirect global.
+ AllocRelatedValues.push_back(Ptr);
+ } else {
+ // Something complex, bail out.
+ return false;
+ }
+ }
+
+ // Okay, this is an indirect global. Remember all of the allocations for
+ // this global in AllocsForIndirectGlobals.
+ while (!AllocRelatedValues.empty()) {
+ AllocsForIndirectGlobals[AllocRelatedValues.back()] = GV;
+ AllocRelatedValues.pop_back();
+ }
+ IndirectGlobals.insert(GV);
+ return true;
+}
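+
+// The shape being recognized, in illustrative (LLVM 2.x) IR:
+//
+//   @handle = internal global i8* null
+//   ...
+//   %mem = malloc i8, i32 %n      ; or a direct call to calloc
+//   store i8* %mem, i8** @handle  ; the only use of %mem
+//   ...
+//   %p = load i8** @handle        ; %p may be loaded from and stored to,
+//                                 ; but never stored away or passed along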
+
+/// AnalyzeCallGraph - At this point, we know the functions where globals are
+/// immediately stored to and read from. Propagate this information up the call
+/// graph to all callers and compute the mod/ref info for all memory for each
+/// function.
+void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
+ // We do a bottom-up SCC traversal of the call graph. In other words, we
+ // visit all callees before callers (leaf-first).
+ for (scc_iterator<CallGraph*> I = scc_begin(&CG), E = scc_end(&CG); I != E;
+ ++I) {
+ std::vector<CallGraphNode *> &SCC = *I;
+ assert(!SCC.empty() && "SCC with no functions?");
+
+ if (!SCC[0]->getFunction()) {
+ // Calls externally - can't say anything useful. Remove any existing
+ // function records (may have been created when scanning globals).
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ FunctionInfo.erase(SCC[i]->getFunction());
+ continue;
+ }
+
+ FunctionRecord &FR = FunctionInfo[SCC[0]->getFunction()];
+
+ bool KnowNothing = false;
+ unsigned FunctionEffect = 0;
+
+ // Collect the mod/ref properties due to called functions. We only compute
+ // one mod-ref set.
+ for (unsigned i = 0, e = SCC.size(); i != e && !KnowNothing; ++i) {
+ Function *F = SCC[i]->getFunction();
+ if (!F) {
+ KnowNothing = true;
+ break;
+ }
+
+ if (F->isDeclaration()) {
+ // Try to get mod/ref behaviour from function attributes.
+ if (F->doesNotAccessMemory()) {
+ // Can't do better than that!
+ } else if (F->onlyReadsMemory()) {
+ FunctionEffect |= Ref;
+ if (!F->isIntrinsic())
+ // This function might call back into the module and read a global -
+ // consider every global as possibly being read by this function.
+ FR.MayReadAnyGlobal = true;
+ } else {
+ FunctionEffect |= ModRef;
+ // Can't say anything useful unless it's an intrinsic - they don't
+ // read or write global variables of the kind considered here.
+ KnowNothing = !F->isIntrinsic();
+ }
+ continue;
+ }
+
+ for (CallGraphNode::iterator CI = SCC[i]->begin(), E = SCC[i]->end();
+ CI != E && !KnowNothing; ++CI)
+ if (Function *Callee = CI->second->getFunction()) {
+ if (FunctionRecord *CalleeFR = getFunctionInfo(Callee)) {
+ // Propagate function effect up.
+ FunctionEffect |= CalleeFR->FunctionEffect;
+
+ // Incorporate callee's effects on globals into our info.
+ for (std::map<GlobalValue*, unsigned>::iterator GI =
+ CalleeFR->GlobalInfo.begin(), E = CalleeFR->GlobalInfo.end();
+ GI != E; ++GI)
+ FR.GlobalInfo[GI->first] |= GI->second;
+ FR.MayReadAnyGlobal |= CalleeFR->MayReadAnyGlobal;
+ } else {
+ // Can't say anything about it. However, if it is inside our SCC,
+ // then nothing needs to be done.
+ CallGraphNode *CalleeNode = CG[Callee];
+ if (std::find(SCC.begin(), SCC.end(), CalleeNode) == SCC.end())
+ KnowNothing = true;
+ }
+ } else {
+ KnowNothing = true;
+ }
+ }
+
+ // If we can't say anything useful about this SCC, remove all SCC functions
+ // from the FunctionInfo map.
+ if (KnowNothing) {
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ FunctionInfo.erase(SCC[i]->getFunction());
+ continue;
+ }
+
+ // Scan the function bodies for explicit loads or stores.
+ for (unsigned i = 0, e = SCC.size(); i != e && FunctionEffect != ModRef;++i)
+ for (inst_iterator II = inst_begin(SCC[i]->getFunction()),
+ E = inst_end(SCC[i]->getFunction());
+ II != E && FunctionEffect != ModRef; ++II)
+ if (isa<LoadInst>(*II)) {
+ FunctionEffect |= Ref;
+ if (cast<LoadInst>(*II).isVolatile())
+ // Volatile loads may have side-effects, so mark them as writing
+ // memory (for example, a flag inside the processor).
+ FunctionEffect |= Mod;
+ } else if (isa<StoreInst>(*II)) {
+ FunctionEffect |= Mod;
+ if (cast<StoreInst>(*II).isVolatile())
+ // Treat volatile stores as reading memory somewhere.
+ FunctionEffect |= Ref;
+ } else if (isa<MallocInst>(*II) || isa<FreeInst>(*II)) {
+ FunctionEffect |= ModRef;
+ }
+
+ if ((FunctionEffect & Mod) == 0)
+ ++NumReadMemFunctions;
+ if (FunctionEffect == 0)
+ ++NumNoMemFunctions;
+ FR.FunctionEffect = FunctionEffect;
+
+ // Finally, now that we know the full effect on this SCC, clone the
+ // information to each function in the SCC.
+ for (unsigned i = 1, e = SCC.size(); i != e; ++i)
+ FunctionInfo[SCC[i]->getFunction()] = FR;
+ }
+}
+
+
+
+/// alias - If one of the pointers is to a global that we are tracking, and the
+/// other is some random pointer, we know there cannot be an alias, because the
+/// address of the global isn't taken.
+AliasAnalysis::AliasResult
+GlobalsModRef::alias(const Value *V1, unsigned V1Size,
+ const Value *V2, unsigned V2Size) {
+ // Get the base object these pointers point to.
+ Value *UV1 = const_cast<Value*>(V1->getUnderlyingObject());
+ Value *UV2 = const_cast<Value*>(V2->getUnderlyingObject());
+
+ // If either of the underlying values is a global, they may be non-addr-taken
+ // globals, which we can answer queries about.
+ GlobalValue *GV1 = dyn_cast<GlobalValue>(UV1);
+ GlobalValue *GV2 = dyn_cast<GlobalValue>(UV2);
+ if (GV1 || GV2) {
+ // If the global's address is taken, pretend we don't know it's a pointer to
+ // the global.
+ if (GV1 && !NonAddressTakenGlobals.count(GV1)) GV1 = 0;
+ if (GV2 && !NonAddressTakenGlobals.count(GV2)) GV2 = 0;
+
+ // If the two pointers are derived from two different non-addr-taken
+ // globals, or if one is and the other isn't, we know these can't alias.
+ if ((GV1 || GV2) && GV1 != GV2)
+ return NoAlias;
+
+ // Otherwise if they are both derived from the same addr-taken global, we
+ // can't know the two accesses don't overlap.
+ }
+
+ // These pointers may be based on the memory owned by an indirect global. If
+ // so, we may be able to handle this. First check to see if the base pointer
+ // is a direct load from an indirect global.
+ GV1 = GV2 = 0;
+ if (LoadInst *LI = dyn_cast<LoadInst>(UV1))
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0)))
+ if (IndirectGlobals.count(GV))
+ GV1 = GV;
+ if (LoadInst *LI = dyn_cast<LoadInst>(UV2))
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0)))
+ if (IndirectGlobals.count(GV))
+ GV2 = GV;
+
+ // These pointers may also be from an allocation for the indirect global. If
+ // so, also handle them.
+ if (AllocsForIndirectGlobals.count(UV1))
+ GV1 = AllocsForIndirectGlobals[UV1];
+ if (AllocsForIndirectGlobals.count(UV2))
+ GV2 = AllocsForIndirectGlobals[UV2];
+
+ // Now that we know whether the two pointers are related to indirect globals,
+ // use this to disambiguate the pointers. If either pointer is based on an
+ // indirect global and if they are not both based on the same indirect global,
+ // they cannot alias.
+ if ((GV1 || GV2) && GV1 != GV2)
+ return NoAlias;
+
+ return AliasAnalysis::alias(V1, V1Size, V2, V2Size);
+}
+
+AliasAnalysis::ModRefResult
+GlobalsModRef::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+ unsigned Known = ModRef;
+
+ // If we are asking for mod/ref info of a direct call with a pointer to a
+ // global we are tracking, return information if we have it.
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(P->getUnderlyingObject()))
+ if (GV->hasLocalLinkage())
+ if (Function *F = CS.getCalledFunction())
+ if (NonAddressTakenGlobals.count(GV))
+ if (FunctionRecord *FR = getFunctionInfo(F))
+ Known = FR->getInfoForGlobal(GV);
+
+ if (Known == NoModRef)
+ return NoModRef; // No need to query other mod/ref analyses
+ return ModRefResult(Known & AliasAnalysis::getModRefInfo(CS, P, Size));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Methods to update the analysis as a result of the client transformation.
+//
+void GlobalsModRef::deleteValue(Value *V) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ if (NonAddressTakenGlobals.erase(GV)) {
+ // This global might be an indirect global. If so, remove it and remove
+ // any AllocRelatedValues for it.
+ if (IndirectGlobals.erase(GV)) {
+ // Remove any entries in AllocsForIndirectGlobals for this global.
+ for (std::map<Value*, GlobalValue*>::iterator
+ I = AllocsForIndirectGlobals.begin(),
+ E = AllocsForIndirectGlobals.end(); I != E; ) {
+ if (I->second == GV) {
+ AllocsForIndirectGlobals.erase(I++);
+ } else {
+ ++I;
+ }
+ }
+ }
+ }
+ }
+
+ // Otherwise, if this is an allocation related to an indirect global, remove
+ // it.
+ AllocsForIndirectGlobals.erase(V);
+
+ AliasAnalysis::deleteValue(V);
+}
+
+void GlobalsModRef::copyValue(Value *From, Value *To) {
+ AliasAnalysis::copyValue(From, To);
+}
diff --git a/lib/Analysis/IPA/Makefile b/lib/Analysis/IPA/Makefile
new file mode 100644
index 0000000..adacb16
--- /dev/null
+++ b/lib/Analysis/IPA/Makefile
@@ -0,0 +1,14 @@
+##===- lib/Analysis/IPA/Makefile ---------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMipa
+BUILD_ARCHIVE = 1
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
new file mode 100644
index 0000000..7af9130
--- /dev/null
+++ b/lib/Analysis/IVUsers.cpp
@@ -0,0 +1,391 @@
+//===- IVUsers.cpp - Induction Variable Users -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bookkeeping for "interesting" users of expressions
+// computed from induction variables.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "iv-users"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+char IVUsers::ID = 0;
+static RegisterPass<IVUsers>
+X("iv-users", "Induction Variable Users", false, true);
+
+Pass *llvm::createIVUsersPass() {
+ return new IVUsers();
+}
+
+/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
+/// subexpression that is an AddRec from a loop other than L. An outer loop
+/// of L is OK, but not an inner loop nor a disjoint loop.
+static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
+ // This is very common, put it first.
+ if (isa<SCEVConstant>(S))
+ return false;
+ if (const SCEVCommutativeExpr *AE = dyn_cast<SCEVCommutativeExpr>(S)) {
+ for (unsigned int i=0; i< AE->getNumOperands(); i++)
+ if (containsAddRecFromDifferentLoop(AE->getOperand(i), L))
+ return true;
+ return false;
+ }
+ if (const SCEVAddRecExpr *AE = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (const Loop *newLoop = AE->getLoop()) {
+ if (newLoop == L)
+ return false;
+ // if newLoop is an outer loop of L, this is OK.
+ if (!LoopInfoBase<BasicBlock>::isNotAlreadyContainedIn(L, newLoop))
+ return false;
+ }
+ return true;
+ }
+ if (const SCEVUDivExpr *DE = dyn_cast<SCEVUDivExpr>(S))
+ return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
+ containsAddRecFromDifferentLoop(DE->getRHS(), L);
+#if 0
+ // SCEVSDivExpr has been backed out temporarily, but will be back; we'll
+ // need this when it is.
+ if (const SCEVSDivExpr *DE = dyn_cast<SCEVSDivExpr>(S))
+ return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
+ containsAddRecFromDifferentLoop(DE->getRHS(), L);
+#endif
+ if (const SCEVCastExpr *CE = dyn_cast<SCEVCastExpr>(S))
+ return containsAddRecFromDifferentLoop(CE->getOperand(), L);
+ return false;
+}
+
+/// getSCEVStartAndStride - Compute the start and stride of this expression,
+/// returning false if the expression is not a start/stride pair, or true if it
+/// is. The stride must be a loop invariant expression, but the start may be
+/// a mix of loop invariant and loop variant expressions. The start cannot,
+/// however, contain an AddRec from a different loop, unless that loop is an
+/// outer loop of the current loop.
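+///
+/// For example, assuming %x is invariant in loop L, the expression
+/// (4 + %x + {0,+,8}<L>) decomposes into Start = (4 + %x) and Stride = 8.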
+static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, Loop *UseLoop,
+ SCEVHandle &Start, SCEVHandle &Stride,
+ bool &isSigned,
+ ScalarEvolution *SE, DominatorTree *DT) {
+ SCEVHandle TheAddRec = Start; // Start is zero on entry, so this is zero too.
+ bool isSExt = false;
+ bool isZExt = false;
+
+ // If the outer level is an AddExpr, the operands are all start values except
+ // for a nested AddRecExpr.
+ if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(SH)) {
+ for (unsigned i = 0, e = AE->getNumOperands(); i != e; ++i)
+ if (const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(AE->getOperand(i))) {
+ if (AddRec->getLoop() == L)
+ TheAddRec = SE->getAddExpr(AddRec, TheAddRec);
+ else
+ return false; // Nested IV of some sort?
+ } else {
+ Start = SE->getAddExpr(Start, AE->getOperand(i));
+ }
+
+ } else if (const SCEVZeroExtendExpr *Z = dyn_cast<SCEVZeroExtendExpr>(SH)) {
+ TheAddRec = Z->getOperand();
+ isZExt = true;
+ } else if (const SCEVSignExtendExpr *S = dyn_cast<SCEVSignExtendExpr>(SH)) {
+ TheAddRec = S->getOperand();
+ isSExt = true;
+ } else if (isa<SCEVAddRecExpr>(SH)) {
+ TheAddRec = SH;
+ } else {
+ return false; // not analyzable.
+ }
+
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(TheAddRec);
+ if (!AddRec || AddRec->getLoop() != L) return false;
+
+ // Use getSCEVAtScope to attempt to simplify other loops out of
+ // the picture.
+ SCEVHandle AddRecStart = AddRec->getStart();
+ SCEVHandle BetterAddRecStart = SE->getSCEVAtScope(AddRecStart, UseLoop);
+ if (!isa<SCEVCouldNotCompute>(BetterAddRecStart))
+ AddRecStart = BetterAddRecStart;
+
+ // FIXME: If Start contains an SCEVAddRecExpr from a different loop, other
+ // than an outer loop of the current loop, reject it. LSR has no concept of
+ // operating on more than one loop at a time so don't confuse it with such
+ // expressions.
+ if (containsAddRecFromDifferentLoop(AddRecStart, L))
+ return false;
+
+ if (isSExt || isZExt)
+ Start = SE->getTruncateExpr(Start, AddRec->getType());
+
+ Start = SE->getAddExpr(Start, AddRecStart);
+
+ if (!isa<SCEVConstant>(AddRec->getStepRecurrence(*SE))) {
+ // If stride is an instruction, make sure it dominates the loop preheader.
+ // Otherwise we could end up with a use-before-def situation.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!AddRec->getStepRecurrence(*SE)->dominates(Preheader, DT))
+ return false;
+
+ DOUT << "[" << L->getHeader()->getName()
+ << "] Variable stride: " << *AddRec << "\n";
+ }
+
+ Stride = AddRec->getStepRecurrence(*SE);
+ isSigned = isSExt;
+ return true;
+}
+
+/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
+/// and now we need to decide whether the user should use the preinc or post-inc
+/// value. If this user should use the post-inc version of the IV, return true.
+///
+/// Choosing wrong here can break dominance properties (if we choose to use the
+/// post-inc value when we cannot) or it can end up adding extra live-ranges to
+/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
+/// should use the post-inc value).
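+///
+/// For example, a use in a block that executes only after the loop exits
+/// (and is dominated by the latch) sees the IV after its final increment,
+/// so it should use the post-inc value.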
+static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV,
+ Loop *L, LoopInfo *LI, DominatorTree *DT,
+ Pass *P) {
+ // If the user is in the loop, use the preinc value.
+ if (L->contains(User->getParent())) return false;
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+
+ // Ok, the user is outside of the loop. If it is dominated by the latch
+ // block, use the post-inc value.
+ if (DT->dominates(LatchBlock, User->getParent()))
+ return true;
+
+ // There is one case we have to be careful of: PHI nodes. These little guys
+ // can live in blocks that are not dominated by the latch block, but (since
+ // their uses occur in the predecessor block, not the block the PHI lives in)
+ // should still use the post-inc value. Check for this case now.
+ PHINode *PN = dyn_cast<PHINode>(User);
+ if (!PN) return false; // not a phi, not dominated by latch block.
+
+ // Look at all of the uses of IV by the PHI node. If any use corresponds to
+ // a block that is not dominated by the latch block, give up and use the
+ // preincremented value.
+ unsigned NumUses = 0;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == IV) {
+ ++NumUses;
+ if (!DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
+ return false;
+ }
+
+ // Okay, all uses of IV by PN are in predecessor blocks that really are
+ // dominated by the latch block. Use the post-incremented value.
+ return true;
+}
+
+/// AddUsersIfInteresting - Inspect the specified instruction. If it is a
+/// reducible SCEV, recursively add its users to the IVUsesByStride set and
+/// return true. Otherwise, return false.
+bool IVUsers::AddUsersIfInteresting(Instruction *I) {
+ if (!SE->isSCEVable(I->getType()))
+ return false; // Void and FP expressions cannot be reduced.
+
+ // LSR is not APInt-clean; do not touch integers bigger than 64 bits.
+ if (SE->getTypeSizeInBits(I->getType()) > 64)
+ return false;
+
+ if (!Processed.insert(I))
+ return true; // Instruction already handled.
+
+ // Get the symbolic expression for this instruction.
+ SCEVHandle ISE = SE->getSCEV(I);
+ if (isa<SCEVCouldNotCompute>(ISE)) return false;
+
+ // Get the start and stride for this expression.
+ Loop *UseLoop = LI->getLoopFor(I->getParent());
+ SCEVHandle Start = SE->getIntegerSCEV(0, ISE->getType());
+ SCEVHandle Stride = Start;
+ bool isSigned = false; // Arbitrary initial value - pacifies compiler.
+
+ if (!getSCEVStartAndStride(ISE, L, UseLoop, Start, Stride, isSigned, SE, DT))
+ return false; // Non-reducible symbolic expression, bail out.
+
+ SmallPtrSet<Instruction *, 4> UniqueUsers;
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (!UniqueUsers.insert(User))
+ continue;
+
+ // Do not infinitely recurse on PHI nodes.
+ if (isa<PHINode>(User) && Processed.count(User))
+ continue;
+
+ // Descend recursively, but not into PHI nodes outside the current loop.
+ // It's important to see the entire expression outside the loop to get
+ // choices that depend on addressing-mode use right, although we won't
+ // consider references outside the loop in all cases.
+ // If User is already in Processed, we don't want to recurse into it again,
+ // but do want to record a second reference in the same instruction.
+ bool AddUserToIVUsers = false;
+ if (LI->getLoopFor(User->getParent()) != L) {
+ if (isa<PHINode>(User) || Processed.count(User) ||
+ !AddUsersIfInteresting(User)) {
+ DOUT << "FOUND USER in other loop: " << *User
+ << " OF SCEV: " << *ISE << "\n";
+ AddUserToIVUsers = true;
+ }
+ } else if (Processed.count(User) ||
+ !AddUsersIfInteresting(User)) {
+ DOUT << "FOUND USER: " << *User
+ << " OF SCEV: " << *ISE << "\n";
+ AddUserToIVUsers = true;
+ }
+
+ if (AddUserToIVUsers) {
+ IVUsersOfOneStride *StrideUses = IVUsesByStride[Stride];
+ if (!StrideUses) { // First occurrence of this stride?
+ StrideOrder.push_back(Stride);
+ StrideUses = new IVUsersOfOneStride(Stride);
+ IVUses.push_back(StrideUses);
+ IVUsesByStride[Stride] = StrideUses;
+ }
+
+ // Okay, we found a user that we cannot reduce. Analyze the instruction
+ // and decide what to do with it. If the use is inside the loop, use the
+ // value before incrementation; otherwise use the value after incrementation.
+ if (IVUseShouldUsePostIncValue(User, I, L, LI, DT, this)) {
+ // The value used will be incremented by the stride more than we are
+ // expecting, so subtract this off.
+ SCEVHandle NewStart = SE->getMinusSCEV(Start, Stride);
+ StrideUses->addUser(NewStart, User, I, isSigned);
+ StrideUses->Users.back().setIsUseOfPostIncrementedValue(true);
+ DOUT << " USING POSTINC SCEV, START=" << *NewStart<< "\n";
+ } else {
+ StrideUses->addUser(Start, User, I, isSigned);
+ }
+ }
+ }
+ return true;
+}
+
+IVUsers::IVUsers()
+ : LoopPass(&ID) {
+}
+
+void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<ScalarEvolution>();
+ AU.setPreservesAll();
+}
+
+bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
+
+ L = l;
+ LI = &getAnalysis<LoopInfo>();
+ DT = &getAnalysis<DominatorTree>();
+ SE = &getAnalysis<ScalarEvolution>();
+
+ // Find all uses of induction variables in this loop, and categorize
+ // them by stride. Start by finding all of the PHI nodes in the header for
+ // this loop. If they are induction variables, inspect their uses.
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
+ AddUsersIfInteresting(I);
+
+ return false;
+}
+
+/// getReplacementExpr - Return a SCEV expression which computes the
+/// value of the OperandValToReplace of the given IVStrideUse.
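+///
+/// For a use with offset X and stride S in loop L, this effectively builds
+/// {X,+,S}<L> (the offset X is added separately since it may be
+/// loop-variant), plus one extra S for a use of the post-incremented value.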
+SCEVHandle IVUsers::getReplacementExpr(const IVStrideUse &U) const {
+ const Type *UseTy = U.getOperandValToReplace()->getType();
+ // Start with zero.
+ SCEVHandle RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
+ // Create the basic add recurrence.
+ RetVal = SE->getAddRecExpr(RetVal, U.getParent()->Stride, L);
+ // Add the offset in a separate step, because it may be loop-variant.
+ RetVal = SE->getAddExpr(RetVal, U.getOffset());
+ // For uses of post-incremented values, add an extra stride to compute
+ // the actual replacement value.
+ if (U.isUseOfPostIncrementedValue())
+ RetVal = SE->getAddExpr(RetVal, U.getParent()->Stride);
+ // Evaluate the expression out of the loop, if possible.
+ if (!L->contains(U.getUser()->getParent())) {
+ SCEVHandle ExitVal = SE->getSCEVAtScope(RetVal, L->getParentLoop());
+ if (!isa<SCEVCouldNotCompute>(ExitVal) && ExitVal->isLoopInvariant(L))
+ RetVal = ExitVal;
+ }
+ // Promote the result to the type of the use.
+ if (SE->getTypeSizeInBits(RetVal->getType()) !=
+ SE->getTypeSizeInBits(UseTy)) {
+ if (U.isSigned())
+ RetVal = SE->getSignExtendExpr(RetVal, UseTy);
+ else
+ RetVal = SE->getZeroExtendExpr(RetVal, UseTy);
+ }
+ return RetVal;
+}
+
+void IVUsers::print(raw_ostream &OS, const Module *M) const {
+ OS << "IV Users for loop ";
+ WriteAsOperand(OS, L->getHeader(), false);
+ if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+ OS << " with backedge-taken count "
+ << *SE->getBackedgeTakenCount(L);
+ }
+ OS << ":\n";
+
+ for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
+ std::map<SCEVHandle, IVUsersOfOneStride*>::const_iterator SI =
+ IVUsesByStride.find(StrideOrder[Stride]);
+ assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
+ OS << " Stride " << *SI->first->getType() << " " << *SI->first << ":\n";
+
+ for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(),
+ E = SI->second->Users.end(); UI != E; ++UI) {
+ OS << " ";
+ WriteAsOperand(OS, UI->getOperandValToReplace(), false);
+ OS << " = ";
+ OS << *getReplacementExpr(*UI);
+ if (UI->isUseOfPostIncrementedValue())
+ OS << " (post-inc)";
+ OS << " in ";
+ UI->getUser()->print(OS);
+ }
+ }
+}
+
+void IVUsers::print(std::ostream &o, const Module *M) const {
+ raw_os_ostream OS(o);
+ print(OS, M);
+}
+
+void IVUsers::dump() const {
+ print(errs());
+}
+
+void IVUsers::releaseMemory() {
+ IVUsesByStride.clear();
+ StrideOrder.clear();
+ Processed.clear();
+}
+
+void IVStrideUse::deleted() {
+ // Remove this user from the list.
+ Parent->Users.erase(this);
+ // this now dangles!
+}
diff --git a/lib/Analysis/InstCount.cpp b/lib/Analysis/InstCount.cpp
new file mode 100644
index 0000000..2dea7b3
--- /dev/null
+++ b/lib/Analysis/InstCount.cpp
@@ -0,0 +1,86 @@
+//===-- InstCount.cpp - Collects the count of all instructions ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass collects the count of all instructions and reports them.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instcount"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/Statistic.h"
+#include <ostream>
+using namespace llvm;
+
+STATISTIC(TotalInsts , "Number of instructions (of all types)");
+STATISTIC(TotalBlocks, "Number of basic blocks");
+STATISTIC(TotalFuncs , "Number of non-external functions");
+STATISTIC(TotalMemInst, "Number of memory instructions");
+
+#define HANDLE_INST(N, OPCODE, CLASS) \
+ STATISTIC(Num ## OPCODE ## Inst, "Number of " #OPCODE " insts");
+
+#include "llvm/Instruction.def"
+
+
+namespace {
+ class VISIBILITY_HIDDEN InstCount
+ : public FunctionPass, public InstVisitor<InstCount> {
+ friend class InstVisitor<InstCount>;
+
+ void visitFunction (Function &F) { ++TotalFuncs; }
+ void visitBasicBlock(BasicBlock &BB) { ++TotalBlocks; }
+
+#define HANDLE_INST(N, OPCODE, CLASS) \
+ void visit##OPCODE(CLASS &) { ++Num##OPCODE##Inst; ++TotalInsts; }
+
+#include "llvm/Instruction.def"
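+
+ // For illustration, each HANDLE_INST expansion above generates a visitor
+ // such as:
+ //   void visitLoad(LoadInst &) { ++NumLoadInst; ++TotalInsts; }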
+
+ void visitInstruction(Instruction &I) {
+ cerr << "Instruction Count does not know about " << I;
+ abort();
+ }
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ InstCount() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ virtual void print(std::ostream &O, const Module *M) const {}
+
+ };
+}
+
+char InstCount::ID = 0;
+static RegisterPass<InstCount>
+X("instcount", "Counts the various types of Instructions", false, true);
+
+FunctionPass *llvm::createInstCountPass() { return new InstCount(); }
+
+// InstCount::runOnFunction - This is the main analysis entry point for a
+// function.
+//
+bool InstCount::runOnFunction(Function &F) {
+ unsigned StartMemInsts =
+ NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
+ NumInvokeInst + NumAllocaInst + NumMallocInst + NumFreeInst;
+ visit(F);
+ unsigned EndMemInsts =
+ NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
+ NumInvokeInst + NumAllocaInst + NumMallocInst + NumFreeInst;
+ TotalMemInst += EndMemInsts-StartMemInsts;
+ return false;
+}
diff --git a/lib/Analysis/Interval.cpp b/lib/Analysis/Interval.cpp
new file mode 100644
index 0000000..16b1947
--- /dev/null
+++ b/lib/Analysis/Interval.cpp
@@ -0,0 +1,57 @@
+//===- Interval.cpp - Interval class code ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definition of the Interval class, which represents a
+// partition of a control flow graph of some kind.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Interval.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Support/CFG.h"
+#include <algorithm>
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Interval Implementation
+//===----------------------------------------------------------------------===//
+
+// isLoop - Find out if there is a back edge in this interval...
+//
+bool Interval::isLoop() const {
+ // There is a loop in this interval iff one of the predecessors of the header
+ // node lives in the interval.
+ for (::pred_iterator I = ::pred_begin(HeaderNode), E = ::pred_end(HeaderNode);
+ I != E; ++I) {
+ if (contains(*I)) return true;
+ }
+ return false;
+}
+
+
+void Interval::print(std::ostream &o) const {
+ o << "-------------------------------------------------------------\n"
+ << "Interval Contents:\n";
+
+ // Print out all of the basic blocks in the interval...
+ for (std::vector<BasicBlock*>::const_iterator I = Nodes.begin(),
+ E = Nodes.end(); I != E; ++I)
+ o << **I << "\n";
+
+ o << "Interval Predecessors:\n";
+ for (std::vector<BasicBlock*>::const_iterator I = Predecessors.begin(),
+ E = Predecessors.end(); I != E; ++I)
+ o << **I << "\n";
+
+ o << "Interval Successors:\n";
+ for (std::vector<BasicBlock*>::const_iterator I = Successors.begin(),
+ E = Successors.end(); I != E; ++I)
+ o << **I << "\n";
+}
diff --git a/lib/Analysis/IntervalPartition.cpp b/lib/Analysis/IntervalPartition.cpp
new file mode 100644
index 0000000..cb8a85d
--- /dev/null
+++ b/lib/Analysis/IntervalPartition.cpp
@@ -0,0 +1,114 @@
+//===- IntervalPartition.cpp - Interval Partition module code -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definition of the IntervalPartition class, which
+// calculates and represents the interval partition of a function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/IntervalIterator.h"
+using namespace llvm;
+
+char IntervalPartition::ID = 0;
+static RegisterPass<IntervalPartition>
+X("intervals", "Interval Partition Construction", true, true);
+
+//===----------------------------------------------------------------------===//
+// IntervalPartition Implementation
+//===----------------------------------------------------------------------===//
+
+// releaseMemory - Reset state back to before function was analyzed
+void IntervalPartition::releaseMemory() {
+ for (unsigned i = 0, e = Intervals.size(); i != e; ++i)
+ delete Intervals[i];
+ IntervalMap.clear();
+ Intervals.clear();
+ RootInterval = 0;
+}
+
+void IntervalPartition::print(std::ostream &O, const Module*) const {
+ for(unsigned i = 0, e = Intervals.size(); i != e; ++i)
+ Intervals[i]->print(O);
+}
+
+// addIntervalToPartition - Add an interval to the internal list of intervals,
+// and then add mappings from all of the basic blocks in the interval to the
+// interval itself (in the IntervalMap).
+//
+void IntervalPartition::addIntervalToPartition(Interval *I) {
+ Intervals.push_back(I);
+
+ // Add mappings for all of the basic blocks in I to the IntervalPartition
+ for (Interval::node_iterator It = I->Nodes.begin(), End = I->Nodes.end();
+ It != End; ++It)
+ IntervalMap.insert(std::make_pair(*It, I));
+}
+
+// updatePredecessors - Interval generation only sets the successor fields of
+// the interval data structures. After interval generation is complete,
+// run through all of the intervals and propagate successor info as
+// predecessor info.
+//
+void IntervalPartition::updatePredecessors(Interval *Int) {
+ BasicBlock *Header = Int->getHeaderNode();
+ for (Interval::succ_iterator I = Int->Successors.begin(),
+ E = Int->Successors.end(); I != E; ++I)
+ getBlockInterval(*I)->Predecessors.push_back(Header);
+}
+
+// IntervalPartition ctor - Build the first level interval partition for the
+// specified function...
+//
+bool IntervalPartition::runOnFunction(Function &F) {
+ // Pass false to intervals_begin because we take ownership of its memory.
+ function_interval_iterator I = intervals_begin(&F, false);
+ assert(I != intervals_end(&F) && "No intervals in function!?!?!");
+
+ addIntervalToPartition(RootInterval = *I);
+
+ ++I; // After the first one...
+
+ // Add the rest of the intervals to the partition.
+ for (function_interval_iterator E = intervals_end(&F); I != E; ++I)
+ addIntervalToPartition(*I);
+
+ // Now that we know all of the successor information, propagate this to the
+ // predecessors for each block.
+ for (unsigned i = 0, e = Intervals.size(); i != e; ++i)
+ updatePredecessors(Intervals[i]);
+ return false;
+}
+
+
+// IntervalPartition ctor - Build a reduced interval partition from an
+// existing interval graph. This takes an additional boolean parameter to
+// distinguish it from a copy constructor. Always pass in false for now.
+//
+IntervalPartition::IntervalPartition(IntervalPartition &IP, bool)
+ : FunctionPass(&ID) {
+ assert(IP.getRootInterval() && "Cannot operate on empty IntervalPartitions!");
+
+ // Pass false to intervals_begin because we take ownership of its memory.
+ interval_part_interval_iterator I = intervals_begin(IP, false);
+ assert(I != intervals_end(IP) && "No intervals in interval partition!?!?!");
+
+ addIntervalToPartition(RootInterval = *I);
+
+ ++I; // After the first one...
+
+ // Add the rest of the intervals to the partition.
+ for (interval_part_interval_iterator E = intervals_end(IP); I != E; ++I)
+ addIntervalToPartition(*I);
+
+ // Now that we know all of the successor information, propagate this to the
+ // predecessors for each block.
+ for (unsigned i = 0, e = Intervals.size(); i != e; ++i)
+ updatePredecessors(Intervals[i]);
+}
+
diff --git a/lib/Analysis/LibCallAliasAnalysis.cpp b/lib/Analysis/LibCallAliasAnalysis.cpp
new file mode 100644
index 0000000..971e6e7
--- /dev/null
+++ b/lib/Analysis/LibCallAliasAnalysis.cpp
@@ -0,0 +1,141 @@
+//===- LibCallAliasAnalysis.cpp - Implement AliasAnalysis for libcalls ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LibCallAliasAnalysis class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LibCallAliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
+using namespace llvm;
+
+// Register this pass...
+char LibCallAliasAnalysis::ID = 0;
+static RegisterPass<LibCallAliasAnalysis>
+X("libcall-aa", "LibCall Alias Analysis", false, true);
+
+// Declare that we implement the AliasAnalysis interface
+static RegisterAnalysisGroup<AliasAnalysis> Y(X);
+
+FunctionPass *llvm::createLibCallAliasAnalysisPass(LibCallInfo *LCI) {
+ return new LibCallAliasAnalysis(LCI);
+}
+
+LibCallAliasAnalysis::~LibCallAliasAnalysis() {
+ delete LCI;
+}
+
+void LibCallAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AliasAnalysis::getAnalysisUsage(AU);
+ AU.addRequired<TargetData>();
+ AU.setPreservesAll(); // Does not transform code
+}
+
+
+
+/// AnalyzeLibCallDetails - Given a call to a function with the specified
+/// LibCallFunctionInfo, see if we can improve the mod/ref footprint of the call
+/// vs the specified pointer/size.
+AliasAnalysis::ModRefResult
+LibCallAliasAnalysis::AnalyzeLibCallDetails(const LibCallFunctionInfo *FI,
+ CallSite CS, Value *P,
+ unsigned Size) {
+ // If we have a function, check to see what kind of mod/ref effects it
+ // has. Start by including any info globally known about the function.
+ AliasAnalysis::ModRefResult MRInfo = FI->UniversalBehavior;
+ if (MRInfo == NoModRef) return MRInfo;
+
+ // If that didn't tell us that the function is 'readnone', check to see
+ // if we have detailed info and if 'P' is any of the locations we know
+ // about.
+ const LibCallFunctionInfo::LocationMRInfo *Details = FI->LocationDetails;
+ if (Details == 0)
+ return MRInfo;
+
+ // If the details array is of the 'DoesNot' kind, we only know something if
+ // the pointer is a match for one of the locations in 'Details'. If we find a
+ // match, we can prove some interactions cannot happen.
+ //
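+ // For illustration: if the details record that the call 'DoesNot' Mod a
+ // particular location known to LCI (an errno-like location, say) and P is
+ // proven to be that location, the Mod bit is masked out of MRInfo below.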
+ if (FI->DetailsType == LibCallFunctionInfo::DoesNot) {
+ // Find out if the pointer refers to a known location.
+ for (unsigned i = 0; Details[i].LocationID != ~0U; ++i) {
+ const LibCallLocationInfo &Loc =
+ LCI->getLocationInfo(Details[i].LocationID);
+ LibCallLocationInfo::LocResult Res = Loc.isLocation(CS, P, Size);
+ if (Res != LibCallLocationInfo::Yes) continue;
+
+ // If we find a match against a location that we 'do not' interact with,
+ // fold this info into MRInfo.
+ return ModRefResult(MRInfo & ~Details[i].MRInfo);
+ }
+ return MRInfo;
+ }
+
+ // If the details are of the 'DoesOnly' sort, we know something if the pointer
+ // is a match for one of the locations in 'Details'. Also, if we can prove
+ // that the pointer is *not* one of the locations in 'Details', we know that
+ // the call is NoModRef.
+ assert(FI->DetailsType == LibCallFunctionInfo::DoesOnly);
+
+ // Find out if the pointer refers to a known location.
+ bool NoneMatch = true;
+ for (unsigned i = 0; Details[i].LocationID != ~0U; ++i) {
+ const LibCallLocationInfo &Loc =
+ LCI->getLocationInfo(Details[i].LocationID);
+ LibCallLocationInfo::LocResult Res = Loc.isLocation(CS, P, Size);
+ if (Res == LibCallLocationInfo::No) continue;
+
+ // If we don't know if this pointer points to the location, then we have to
+ // assume it might alias in some case.
+ if (Res == LibCallLocationInfo::Unknown) {
+ NoneMatch = false;
+ continue;
+ }
+
+ // If we know that this pointer definitely is pointing into the location,
+ // merge in this information.
+ return ModRefResult(MRInfo & Details[i].MRInfo);
+ }
+
+ // If we found that the pointer is guaranteed to not match any of the
+ // locations in our 'DoesOnly' rule, then we know that the pointer must point
+ // to some other location. Since the libcall doesn't mod/ref any other
+ // locations, return NoModRef.
+ if (NoneMatch)
+ return NoModRef;
+
+ // Otherwise, return any other info gained so far.
+ return MRInfo;
+}
+
+// getModRefInfo - Check to see if the specified callsite can clobber the
+// specified memory object.
+//
+AliasAnalysis::ModRefResult
+LibCallAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+ ModRefResult MRInfo = ModRef;
+
+ // If this is a direct call to a function that LCI knows about, get the
+ // information about the runtime function.
+ if (LCI) {
+ if (Function *F = CS.getCalledFunction()) {
+ if (const LibCallFunctionInfo *FI = LCI->getFunctionInfo(F)) {
+ MRInfo = ModRefResult(MRInfo & AnalyzeLibCallDetails(FI, CS, P, Size));
+ if (MRInfo == NoModRef) return NoModRef;
+ }
+ }
+ }
+
+ // The AliasAnalysis base class has some smarts; let's use them.
+ return (ModRefResult)(MRInfo | AliasAnalysis::getModRefInfo(CS, P, Size));
+}
diff --git a/lib/Analysis/LibCallSemantics.cpp b/lib/Analysis/LibCallSemantics.cpp
new file mode 100644
index 0000000..2985047
--- /dev/null
+++ b/lib/Analysis/LibCallSemantics.cpp
@@ -0,0 +1,65 @@
+//===- LibCallSemantics.cpp - Describe library semantics ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements interfaces that can be used to describe language
+// specific runtime library interfaces (e.g. libc, libm, etc) to LLVM
+// optimizers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Function.h"
+using namespace llvm;
+
+/// getMap - The Impl pointer in LibCallInfo is actually a StringMap. This
+/// helper does the cast.
+static StringMap<const LibCallFunctionInfo*> *getMap(void *Ptr) {
+ return static_cast<StringMap<const LibCallFunctionInfo*> *>(Ptr);
+}
+
+LibCallInfo::~LibCallInfo() {
+ delete getMap(Impl);
+}
+
+const LibCallLocationInfo &LibCallInfo::getLocationInfo(unsigned LocID) const {
+ // Get location info on the first call.
+ if (NumLocations == 0)
+ NumLocations = getLocationInfo(Locations);
+
+ assert(LocID < NumLocations && "Invalid location ID!");
+ return Locations[LocID];
+}
+
+
+/// getFunctionInfo - Return the LibCallFunctionInfo object corresponding to
+/// the specified function if we have it. If not, return null.
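+///
+/// The array returned by getFunctionInfoArray() is expected to be terminated
+/// by an entry whose Name is null; a hypothetical array might look like:
+///   { "memcpy", ... }, { "memset", ... }, { 0, ... }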
+const LibCallFunctionInfo *LibCallInfo::getFunctionInfo(Function *F) const {
+ StringMap<const LibCallFunctionInfo*> *Map = getMap(Impl);
+
+ /// If this is the first time we are querying for this info, lazily construct
+ /// the StringMap to index it.
+ if (Map == 0) {
+ Impl = Map = new StringMap<const LibCallFunctionInfo*>();
+
+ const LibCallFunctionInfo *Array = getFunctionInfoArray();
+ if (Array == 0) return 0;
+
+ // We now have the array of entries. Populate the StringMap.
+ for (unsigned i = 0; Array[i].Name; ++i)
+ (*Map)[Array[i].Name] = Array+i;
+ }
+
+ // Look up this function in the string map.
+ const char *ValueName = F->getNameStart();
+ StringMap<const LibCallFunctionInfo*>::iterator I =
+ Map->find(ValueName, ValueName+F->getNameLen());
+ return I != Map->end() ? I->second : 0;
+}
+
diff --git a/lib/Analysis/LiveValues.cpp b/lib/Analysis/LiveValues.cpp
new file mode 100644
index 0000000..2bbe98a
--- /dev/null
+++ b/lib/Analysis/LiveValues.cpp
@@ -0,0 +1,191 @@
+//===- LiveValues.cpp - Liveness information for LLVM IR Values. ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the implementation for the LLVM IR Value liveness
+// analysis pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LiveValues.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+using namespace llvm;
+
+FunctionPass *llvm::createLiveValuesPass() { return new LiveValues(); }
+
+char LiveValues::ID = 0;
+static RegisterPass<LiveValues>
+X("live-values", "Value Liveness Analysis", false, true);
+
+LiveValues::LiveValues() : FunctionPass(&ID) {}
+
+void LiveValues::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<LoopInfo>();
+ AU.setPreservesAll();
+}
+
+bool LiveValues::runOnFunction(Function &F) {
+ DT = &getAnalysis<DominatorTree>();
+ LI = &getAnalysis<LoopInfo>();
+
+ // This pass's values are computed lazily, so there's nothing to do here.
+
+ return false;
+}
+
+void LiveValues::releaseMemory() {
+ Memos.clear();
+}
+
+/// isUsedInBlock - Test if the given value is used in the given block.
+///
+bool LiveValues::isUsedInBlock(const Value *V, const BasicBlock *BB) {
+ Memo &M = getMemo(V);
+ return M.Used.count(BB);
+}
+
+/// isLiveThroughBlock - Test if the given value is known to be
+/// live-through the given block, meaning that the block is properly
+/// dominated by the value's definition, and there exists a block
+/// reachable from it that contains a use. This uses a conservative
+/// approximation that errs on the side of returning false.
+///
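+/// For example, if V is defined in block A and used in block C, blocks on
+/// the immediate-dominator chain between C and A may be recorded as
+/// live-through for V.
+///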
+bool LiveValues::isLiveThroughBlock(const Value *V,
+ const BasicBlock *BB) {
+ Memo &M = getMemo(V);
+ return M.LiveThrough.count(BB);
+}
+
+/// isKilledInBlock - Test if the given value is known to be killed in
+/// the given block, meaning that the block contains a use of the value,
+/// and no blocks reachable from the block contain a use. This uses a
+/// conservative approximation that errs on the side of returning false.
+///
+bool LiveValues::isKilledInBlock(const Value *V, const BasicBlock *BB) {
+ Memo &M = getMemo(V);
+ return M.Killed.count(BB);
+}
+
+/// getMemo - Retrieve an existing Memo for the given value if one
+/// is available, otherwise compute a new one.
+///
+LiveValues::Memo &LiveValues::getMemo(const Value *V) {
+ DenseMap<const Value *, Memo>::iterator I = Memos.find(V);
+ if (I != Memos.end())
+ return I->second;
+ return compute(V);
+}
+
+/// getImmediateDominator - A handy utility for the specific DominatorTree
+/// query that we need here.
+///
+static const BasicBlock *getImmediateDominator(const BasicBlock *BB,
+ const DominatorTree *DT) {
+ DomTreeNode *Node = DT->getNode(const_cast<BasicBlock *>(BB))->getIDom();
+ return Node ? Node->getBlock() : 0;
+}
+
+/// compute - Compute a new Memo for the given value.
+///
+LiveValues::Memo &LiveValues::compute(const Value *V) {
+ Memo &M = Memos[V];
+
+ // Determine the block containing the definition.
+ const BasicBlock *DefBB;
+ // Instructions define values with meaningful live ranges.
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ DefBB = I->getParent();
+ // Arguments can be analyzed as values defined in the entry block.
+ else if (const Argument *A = dyn_cast<Argument>(V))
+ DefBB = &A->getParent()->getEntryBlock();
+ // Constants and other things aren't meaningful here, so just
+ // return having computed an empty Memo so that we don't come
+ // here again. The assumption here is that client code won't
+ // be asking about such values very often.
+ else
+ return M;
+
+ // Determine if the value is defined inside a loop. This is used
+ // to track whether the value is ever used outside the loop, so
+ // it'll be set to null if the value is either not defined in a
+ // loop or used outside the loop in which it is defined.
+ const Loop *L = LI->getLoopFor(DefBB);
+
+ // Track whether the value is used anywhere outside of the block
+ // in which it is defined.
+ bool LiveOutOfDefBB = false;
+
+ // Examine each use of the value.
+ for (Value::use_const_iterator I = V->use_begin(), E = V->use_end();
+ I != E; ++I) {
+ const User *U = *I;
+ const BasicBlock *UseBB = cast<Instruction>(U)->getParent();
+
+ // Note the block in which this use occurs.
+ M.Used.insert(UseBB);
+
+ // If the use block doesn't have successors, the value can be
+ // considered killed.
+ if (succ_begin(UseBB) == succ_end(UseBB))
+ M.Killed.insert(UseBB);
+
+ // Observe whether the value is used outside of the loop in which
+ // it is defined. Switch to an enclosing loop if necessary.
+ for (; L; L = L->getParentLoop())
+ if (L->contains(UseBB))
+ break;
+
+ // Search for live-through blocks.
+ const BasicBlock *BB;
+ if (const PHINode *PHI = dyn_cast<PHINode>(U)) {
+ // For PHI nodes, start the search at the incoming block paired with the
+ // incoming value, which must be dominated by the definition.
+ unsigned Num = PHI->getIncomingValueNumForOperand(I.getOperandNo());
+ BB = PHI->getIncomingBlock(Num);
+
+ // A PHI-node use means the value is live-out of its defining block
+ // even if that block also contains the only use.
+ LiveOutOfDefBB = true;
+ } else {
+ // Otherwise just start the search at the use.
+ BB = UseBB;
+
+ // Note if the use is outside the defining block.
+ LiveOutOfDefBB |= UseBB != DefBB;
+ }
+
+ // Climb the immediate dominator tree from the use to the definition
+ // and mark all intermediate blocks as live-through.
+ for (; BB != DefBB; BB = getImmediateDominator(BB, DT)) {
+ if (BB != UseBB && !M.LiveThrough.insert(BB))
+ break;
+ }
+ }
+
+ // If the value is defined inside a loop and is not live outside
+ // the loop, then each exiting block of the loop in which the value
+ // is used is a kill block.
+ if (L) {
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ const BasicBlock *ExitingBlock = ExitingBlocks[i];
+ if (M.Used.count(ExitingBlock))
+ M.Killed.insert(ExitingBlock);
+ }
+ }
+
+ // If the value was never used outside the block in which it was
+ // defined, it's killed in that block.
+ if (!LiveOutOfDefBB)
+ M.Killed.insert(DefBB);
+
+ return M;
+}
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
new file mode 100644
index 0000000..de6480a
--- /dev/null
+++ b/lib/Analysis/LoopInfo.cpp
@@ -0,0 +1,50 @@
+//===- LoopInfo.cpp - Natural Loop Calculator -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the LoopInfo class that is used to identify natural loops
+// and determine the loop depth of various nodes of the CFG. Note that the
+// loops identified may actually be several natural loops that share the same
+// header node... not just a single natural loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include <algorithm>
+#include <ostream>
+using namespace llvm;
+
+char LoopInfo::ID = 0;
+static RegisterPass<LoopInfo>
+X("loops", "Natural Loop Information", true, true);
+
+//===----------------------------------------------------------------------===//
+// Loop implementation
+//
+
+//===----------------------------------------------------------------------===//
+// LoopInfo implementation
+//
+bool LoopInfo::runOnFunction(Function &) {
+ releaseMemory();
+ LI->Calculate(getAnalysis<DominatorTree>().getBase()); // Recompute loop info.
+ return false;
+}
+
+void LoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<DominatorTree>();
+}
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
new file mode 100644
index 0000000..08c25f4
--- /dev/null
+++ b/lib/Analysis/LoopPass.cpp
@@ -0,0 +1,340 @@
+//===- LoopPass.cpp - Loop Pass and Loop Pass Manager ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements LoopPass and LPPassManager. All loop optimization
+// and transformation passes are derived from LoopPass. LPPassManager is
+// responsible for managing LoopPasses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopPass.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// LPPassManager
+//
+
+char LPPassManager::ID = 0;
+/// LPPassManager manages LoopPasses.
+
+LPPassManager::LPPassManager(int Depth)
+ : FunctionPass(&ID), PMDataManager(Depth) {
+ skipThisLoop = false;
+ redoThisLoop = false;
+ LI = NULL;
+ CurrentLoop = NULL;
+}
+
+/// Delete loop from the loop queue and loop hierarchy (LoopInfo).
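+/// A LoopPass that has made its current loop dead (a hypothetical
+/// loop-deletion transform, say) would call this so that the remaining
+/// passes skip the deleted loop.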
+void LPPassManager::deleteLoopFromQueue(Loop *L) {
+
+ if (Loop *ParentLoop = L->getParentLoop()) { // Not a top-level loop.
+ // Reparent all of the blocks in this loop. Since L had a parent, they
+ // now all belong to that parent loop.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ if (LI->getLoopFor(*I) == L) // Don't change blocks in subloops.
+ LI->changeLoopFor(*I, ParentLoop);
+
+ // Remove the loop from its parent loop.
+ for (Loop::iterator I = ParentLoop->begin(), E = ParentLoop->end();;
+ ++I) {
+ assert(I != E && "Couldn't find loop");
+ if (*I == L) {
+ ParentLoop->removeChildLoop(I);
+ break;
+ }
+ }
+
+ // Move all subloops into the parent loop.
+ while (!L->empty())
+ ParentLoop->addChildLoop(L->removeChildLoop(L->end()-1));
+ } else {
+ // Reparent all of the blocks in this loop. Since L had no parent, they
+ // are no longer in a loop at all.
+
+ for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
+ // Don't change blocks in subloops.
+ if (LI->getLoopFor(L->getBlocks()[i]) == L) {
+ LI->removeBlock(L->getBlocks()[i]);
+ --i;
+ }
+ }
+
+ // Remove the loop from the top-level LoopInfo object.
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end();; ++I) {
+ assert(I != E && "Couldn't find loop");
+ if (*I == L) {
+ LI->removeLoop(I);
+ break;
+ }
+ }
+
+ // Move all of the subloops to the top-level.
+ while (!L->empty())
+ LI->addTopLevelLoop(L->removeChildLoop(L->end()-1));
+ }
+
+ delete L;
+
+ // If L is current loop then skip rest of the passes and let
+ // runOnFunction remove L from LQ. Otherwise, remove L from LQ now
+ // and continue applying other passes on CurrentLoop.
+ if (CurrentLoop == L) {
+ skipThisLoop = true;
+ return;
+ }
+
+ for (std::deque<Loop *>::iterator I = LQ.begin(),
+ E = LQ.end(); I != E; ++I) {
+ if (*I == L) {
+ LQ.erase(I);
+ break;
+ }
+ }
+}
+
+// Insert loop into loop nest (LoopInfo) and loop queue (LQ).
+void LPPassManager::insertLoop(Loop *L, Loop *ParentLoop) {
+
+ assert (CurrentLoop != L && "Cannot insert CurrentLoop");
+
+ // Insert into loop nest
+ if (ParentLoop)
+ ParentLoop->addChildLoop(L);
+ else
+ LI->addTopLevelLoop(L);
+
+ // Insert L into loop queue
+ if (L == CurrentLoop)
+ redoLoop(L);
+ else if (!ParentLoop)
+ // This is top level loop.
+ LQ.push_front(L);
+ else {
+ // Insert L after ParentLoop
+ for (std::deque<Loop *>::iterator I = LQ.begin(),
+ E = LQ.end(); I != E; ++I) {
+ if (*I == ParentLoop) {
+ // deque does not support insert after.
+ ++I;
+ LQ.insert(I, 1, L);
+ break;
+ }
+ }
+ }
+}
+
+// Reoptimize this loop. LPPassManager will re-insert this loop into the
+// queue. This allows LoopPass to change loop nest for the loop. This
+// utility may send LPPassManager into infinite loops so use caution.
+void LPPassManager::redoLoop(Loop *L) {
+ assert (CurrentLoop == L && "Can redo only CurrentLoop");
+ redoThisLoop = true;
+}
+
+/// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for
+/// all loop passes.
+void LPPassManager::cloneBasicBlockSimpleAnalysis(BasicBlock *From,
+ BasicBlock *To, Loop *L) {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+ LoopPass *LP = dynamic_cast<LoopPass *>(P);
+ LP->cloneBasicBlockAnalysis(From, To, L);
+ }
+}
+
+/// deleteSimpleAnalysisValue - Invoke deleteAnalysisValue hook for all passes.
+void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) {
+ if (BasicBlock *BB = dyn_cast<BasicBlock>(V)) {
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;
+ ++BI) {
+ Instruction &I = *BI;
+ deleteSimpleAnalysisValue(&I, L);
+ }
+ }
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+ LoopPass *LP = dynamic_cast<LoopPass *>(P);
+ LP->deleteAnalysisValue(V, L);
+ }
+}
+
+
+// Recursively add a loop and all of its subloops into LQ.
+static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) {
+ LQ.push_back(L);
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ addLoopIntoQueue(*I, LQ);
+}
+
+/// Pass Manager itself does not invalidate any analysis info.
+void LPPassManager::getAnalysisUsage(AnalysisUsage &Info) const {
+ // LPPassManager needs LoopInfo. In the long term LoopInfo class will
+ // become part of LPPassManager.
+ Info.addRequired<LoopInfo>();
+ Info.setPreservesAll();
+}
+
+/// run - Execute all of the passes scheduled for execution. Keep track of
+/// whether any of the passes modifies the function, and if so, return true.
+bool LPPassManager::runOnFunction(Function &F) {
+ LI = &getAnalysis<LoopInfo>();
+ bool Changed = false;
+
+ // Collect inherited analysis from Module level pass manager.
+ populateInheritedAnalysis(TPM->activeStack);
+
+ // Populate Loop Queue
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ addLoopIntoQueue(*I, LQ);
+
+ // Initialization
+ for (std::deque<Loop *>::const_iterator I = LQ.begin(), E = LQ.end();
+ I != E; ++I) {
+ Loop *L = *I;
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+ LoopPass *LP = dynamic_cast<LoopPass *>(P);
+ if (LP)
+ Changed |= LP->doInitialization(L, *this);
+ }
+ }
+
+ // Walk Loops
+ while (!LQ.empty()) {
+
+ CurrentLoop = LQ.back();
+ skipThisLoop = false;
+ redoThisLoop = false;
+
+ // Run all passes on the current loop.
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+
+ dumpPassInfo(P, EXECUTION_MSG, ON_LOOP_MSG, "");
+ dumpRequiredSet(P);
+
+ initializeAnalysisImpl(P);
+
+ LoopPass *LP = dynamic_cast<LoopPass *>(P);
+ {
+ PassManagerPrettyStackEntry X(LP, *CurrentLoop->getHeader());
+ StartPassTimer(P);
+ assert(LP && "Invalid LPPassManager member");
+ Changed |= LP->runOnLoop(CurrentLoop, *this);
+ StopPassTimer(P);
+ }
+
+ if (Changed)
+ dumpPassInfo(P, MODIFICATION_MSG, ON_LOOP_MSG, "");
+ dumpPreservedSet(P);
+
+ verifyPreservedAnalysis(LP);
+ removeNotPreservedAnalysis(P);
+ recordAvailableAnalysis(P);
+ removeDeadPasses(P, "", ON_LOOP_MSG);
+
+ // If dominator information is available then verify the info if requested.
+ verifyDomInfo(*LP, F);
+
+ if (skipThisLoop)
+ // Do not run other passes on this loop.
+ break;
+ }
+
+ // Pop the loop from queue after running all passes.
+ LQ.pop_back();
+
+ if (redoThisLoop)
+ LQ.push_back(CurrentLoop);
+ }
+
+ // Finalization
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+ LoopPass *LP = dynamic_cast <LoopPass *>(P);
+ if (LP)
+ Changed |= LP->doFinalization();
+ }
+
+ return Changed;
+}
+
+/// Print passes managed by this manager
+void LPPassManager::dumpPassStructure(unsigned Offset) {
+ llvm::cerr << std::string(Offset*2, ' ') << "Loop Pass Manager\n";
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ Pass *P = getContainedPass(Index);
+ P->dumpPassStructure(Offset + 1);
+ dumpLastUses(P, Offset+1);
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// LoopPass
+
+// Check if this pass is suitable for the current LPPassManager, if
+// available. This pass P is not suitable for a LPPassManager if P
+// is not preserving higher level analysis info used by other
+// LPPassManager passes. In such a case, pop the LPPassManager from the
+// stack. This will force assignPassManager() to create a new
+// LPPassManager as expected.
+void LoopPass::preparePassManager(PMStack &PMS) {
+
+ // Find LPPassManager
+ while (!PMS.empty() &&
+ PMS.top()->getPassManagerType() > PMT_LoopPassManager)
+ PMS.pop();
+
+ LPPassManager *LPPM = dynamic_cast<LPPassManager *>(PMS.top());
+
+ // If this pass destroys high-level information that is used by other
+ // passes managed by the LPM, do not insert this pass into the current
+ // LPM; use a new LPPassManager instead.
+ if (LPPM && !LPPM->preserveHigherLevelAnalysis(this))
+ PMS.pop();
+}
+
+/// Assign pass manager to manage this pass.
+void LoopPass::assignPassManager(PMStack &PMS,
+ PassManagerType PreferredType) {
+ // Find LPPassManager
+ while (!PMS.empty() &&
+ PMS.top()->getPassManagerType() > PMT_LoopPassManager)
+ PMS.pop();
+
+ LPPassManager *LPPM = dynamic_cast<LPPassManager *>(PMS.top());
+
+ // Create new Loop Pass Manager if it does not exist.
+ if (!LPPM) {
+
+ assert (!PMS.empty() && "Unable to create Loop Pass Manager");
+ PMDataManager *PMD = PMS.top();
+
+ // [1] Create new Loop Pass Manager
+ LPPM = new LPPassManager(PMD->getDepth() + 1);
+ LPPM->populateInheritedAnalysis(PMS);
+
+ // [2] Set up new manager's top level manager
+ PMTopLevelManager *TPM = PMD->getTopLevelManager();
+ TPM->addIndirectPassManager(LPPM);
+
+ // [3] Assign manager to manage this new manager. This may create
+ // and push new managers into PMS
+ Pass *P = dynamic_cast<Pass *>(LPPM);
+ TPM->schedulePass(P);
+
+ // [4] Push new manager into PMS
+ PMS.push(LPPM);
+ }
+
+ LPPM->add(this);
+}
diff --git a/lib/Analysis/LoopVR.cpp b/lib/Analysis/LoopVR.cpp
new file mode 100644
index 0000000..0a3d06b
--- /dev/null
+++ b/lib/Analysis/LoopVR.cpp
@@ -0,0 +1,291 @@
+//===- LoopVR.cpp - Value Range analysis driven by loop information -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple value range analysis: it computes constant
+// ranges for integer values inside loops, using loop trip counts from
+// ScalarEvolution to bound the SCEV expressions involved.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loopvr"
+#include "llvm/Analysis/LoopVR.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+char LoopVR::ID = 0;
+static RegisterPass<LoopVR> X("loopvr", "Loop Value Ranges", false, true);
+
+/// getRange - determine the range for a particular SCEV within a given Loop
+ConstantRange LoopVR::getRange(SCEVHandle S, Loop *L, ScalarEvolution &SE) {
+ SCEVHandle T = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(T))
+ return ConstantRange(cast<IntegerType>(S->getType())->getBitWidth(), true);
+
+ T = SE.getTruncateOrZeroExtend(T, S->getType());
+ return getRange(S, T, SE);
+}
+
+/// getRange - determine the range for a particular SCEV with a given trip count
+ConstantRange LoopVR::getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE){
+
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
+ return ConstantRange(C->getValue()->getValue());
+
+ ConstantRange FullSet(cast<IntegerType>(S->getType())->getBitWidth(), true);
+
+ // x + y + ... + z. We detect overflow by checking whether the set shrinks
+ // below either operand's size after summing the lower and upper bounds.
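+ // For example, adding the 8-bit ranges [0,200) and [0,200) gives a nominal
+ // upper bound of 399, which wraps to 143; the resulting set [0,143) is
+ // smaller than either input, so FullSet is returned.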
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ ConstantRange X = getRange(Add->getOperand(0), T, SE);
+ if (X.isFullSet()) return FullSet;
+ for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i) {
+ ConstantRange Y = getRange(Add->getOperand(i), T, SE);
+ if (Y.isFullSet()) return FullSet;
+
+ APInt Spread_X = X.getSetSize(), Spread_Y = Y.getSetSize();
+ APInt NewLower = X.getLower() + Y.getLower();
+ APInt NewUpper = X.getUpper() + Y.getUpper() - 1;
+ if (NewLower == NewUpper)
+ return FullSet;
+
+ X = ConstantRange(NewLower, NewUpper);
+ if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+ return FullSet; // we've wrapped, therefore, full set.
+ }
+ return X;
+ }
+
+ // x * y * ... * z. To detect overflow, we compute in k*bitwidth bits, where
+ // k is the number of terms being multiplied.
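+ // For example, multiplying the 8-bit ranges [2,4) and [3,5) is evaluated
+ // in a 16-bit extension as [2*3, (3*4)+1) = [6,13), then truncated back
+ // to 8 bits.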
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ ConstantRange X = getRange(Mul->getOperand(0), T, SE);
+ if (X.isFullSet()) return FullSet;
+
+ const IntegerType *Ty = IntegerType::get(X.getBitWidth());
+ const IntegerType *ExTy = IntegerType::get(X.getBitWidth() *
+ Mul->getNumOperands());
+ ConstantRange XExt = X.zeroExtend(ExTy->getBitWidth());
+
+ for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i) {
+ ConstantRange Y = getRange(Mul->getOperand(i), T, SE);
+ if (Y.isFullSet()) return FullSet;
+
+ ConstantRange YExt = Y.zeroExtend(ExTy->getBitWidth());
+ XExt = ConstantRange(XExt.getLower() * YExt.getLower(),
+ ((XExt.getUpper()-1) * (YExt.getUpper()-1)) + 1);
+ }
+ return XExt.truncate(Ty->getBitWidth());
+ }
+
+ // X smax Y smax ... Z is: range(smax(X_smin, Y_smin, ..., Z_smin),
+ // smax(X_smax, Y_smax, ..., Z_smax))
+ // It doesn't matter if one of the SCEVs has FullSet because we're taking
+ // a maximum of the minimums across all of them.
+ if (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(S)) {
+ ConstantRange X = getRange(SMax->getOperand(0), T, SE);
+ if (X.isFullSet()) return FullSet;
+
+ APInt smin = X.getSignedMin(), smax = X.getSignedMax();
+ for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i) {
+ ConstantRange Y = getRange(SMax->getOperand(i), T, SE);
+ smin = APIntOps::smax(smin, Y.getSignedMin());
+ smax = APIntOps::smax(smax, Y.getSignedMax());
+ }
+ if (smax + 1 == smin) return FullSet;
+ return ConstantRange(smin, smax + 1);
+ }
+
+ // X umax Y umax ... Z is: range(umax(X_umin, Y_umin, ..., Z_umin),
+ // umax(X_umax, Y_umax, ..., Z_umax))
+ // It doesn't matter if one of the SCEVs has FullSet because we're taking
+ // a maximum of the minimums across all of them.
+ if (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(S)) {
+ ConstantRange X = getRange(UMax->getOperand(0), T, SE);
+ if (X.isFullSet()) return FullSet;
+
+ APInt umin = X.getUnsignedMin(), umax = X.getUnsignedMax();
+ for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i) {
+ ConstantRange Y = getRange(UMax->getOperand(i), T, SE);
+ umin = APIntOps::umax(umin, Y.getUnsignedMin());
+ umax = APIntOps::umax(umax, Y.getUnsignedMax());
+ }
+ if (umax + 1 == umin) return FullSet;
+ return ConstantRange(umin, umax + 1);
+ }
+
+ // L udiv R. Luckily, there's only ever 2 sides to a udiv.
+ if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
+ ConstantRange L = getRange(UDiv->getLHS(), T, SE);
+ ConstantRange R = getRange(UDiv->getRHS(), T, SE);
+ if (L.isFullSet() && R.isFullSet()) return FullSet;
+
+ if (R.getUnsignedMax() == 0) {
+ // RHS must be single-element zero. Return an empty set.
+ return ConstantRange(R.getBitWidth(), false);
+ }
+
+ APInt Lower = L.getUnsignedMin().udiv(R.getUnsignedMax());
+
+ APInt Upper;
+
+ if (R.getUnsignedMin() == 0) {
+ // Just because it contains zero doesn't mean it will also contain one.
+ // Use maximalIntersectWith to get the right behaviour.
+ ConstantRange NotZero(APInt(L.getBitWidth(), 1),
+ APInt::getNullValue(L.getBitWidth()));
+ R = R.maximalIntersectWith(NotZero);
+ }
+
+ // But the maximal intersection might still include zero. If it does, then
+ // we know it also included one.
+ if (R.contains(APInt::getNullValue(L.getBitWidth())))
+ Upper = L.getUnsignedMax();
+ else
+ Upper = L.getUnsignedMax().udiv(R.getUnsignedMin());
+
+ return ConstantRange(Lower, Upper);
+ }
+
+ // ConstantRange already implements the cast operators.
+
+ if (const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(S)) {
+ T = SE.getTruncateOrZeroExtend(T, ZExt->getOperand()->getType());
+ ConstantRange X = getRange(ZExt->getOperand(), T, SE);
+ return X.zeroExtend(cast<IntegerType>(ZExt->getType())->getBitWidth());
+ }
+
+ if (const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(S)) {
+ T = SE.getTruncateOrZeroExtend(T, SExt->getOperand()->getType());
+ ConstantRange X = getRange(SExt->getOperand(), T, SE);
+ return X.signExtend(cast<IntegerType>(SExt->getType())->getBitWidth());
+ }
+
+ if (const SCEVTruncateExpr *Trunc = dyn_cast<SCEVTruncateExpr>(S)) {
+ T = SE.getTruncateOrZeroExtend(T, Trunc->getOperand()->getType());
+ ConstantRange X = getRange(Trunc->getOperand(), T, SE);
+ if (X.isFullSet()) return FullSet;
+ return X.truncate(cast<IntegerType>(Trunc->getType())->getBitWidth());
+ }
+
+ if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S)) {
+ const SCEVConstant *Trip = dyn_cast<SCEVConstant>(T);
+ if (!Trip) return FullSet;
+
+ if (AddRec->isAffine()) {
+ SCEVHandle StartHandle = AddRec->getStart();
+ SCEVHandle StepHandle = AddRec->getOperand(1);
+
+ const SCEVConstant *Step = dyn_cast<SCEVConstant>(StepHandle);
+ if (!Step) return FullSet;
+
+ uint32_t ExWidth = 2 * Trip->getValue()->getBitWidth();
+ APInt TripExt = Trip->getValue()->getValue(); TripExt.zext(ExWidth);
+ APInt StepExt = Step->getValue()->getValue(); StepExt.zext(ExWidth);
+ if ((TripExt * StepExt).ugt(APInt::getLowBitsSet(ExWidth, ExWidth >> 1)))
+ return FullSet;
+
+ SCEVHandle EndHandle = SE.getAddExpr(StartHandle,
+ SE.getMulExpr(T, StepHandle));
+ const SCEVConstant *Start = dyn_cast<SCEVConstant>(StartHandle);
+ const SCEVConstant *End = dyn_cast<SCEVConstant>(EndHandle);
+ if (!Start || !End) return FullSet;
+
+ const APInt &StartInt = Start->getValue()->getValue();
+ const APInt &EndInt = End->getValue()->getValue();
+ const APInt &StepInt = Step->getValue()->getValue();
+
+ if (StepInt.isNegative()) {
+ if (EndInt == StartInt + 1) return FullSet;
+ return ConstantRange(EndInt, StartInt + 1);
+ } else {
+ if (StartInt == EndInt + 1) return FullSet;
+ return ConstantRange(StartInt, EndInt + 1);
+ }
+ }
+ }
+
+ // TODO: non-affine addrec, udiv, SCEVUnknown (narrowed from elsewhere)?
+
+ return FullSet;
+}
+
+bool LoopVR::runOnFunction(Function &F) { Map.clear(); return false; }
+
+void LoopVR::print(std::ostream &os, const Module *) const {
+ raw_os_ostream OS(os);
+ for (std::map<Value *, ConstantRange *>::const_iterator I = Map.begin(),
+ E = Map.end(); I != E; ++I) {
+ OS << *I->first << ": " << *I->second << '\n';
+ }
+}
+
+void LoopVR::releaseMemory() {
+ for (std::map<Value *, ConstantRange *>::iterator I = Map.begin(),
+ E = Map.end(); I != E; ++I) {
+ delete I->second;
+ }
+
+ Map.clear();
+}
+
+ConstantRange LoopVR::compute(Value *V) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+ return ConstantRange(CI->getValue());
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return ConstantRange(cast<IntegerType>(V->getType())->getBitWidth(), false);
+
+ LoopInfo &LI = getAnalysis<LoopInfo>();
+
+ Loop *L = LI.getLoopFor(I->getParent());
+ if (!L || L->isLoopInvariant(I))
+ return ConstantRange(cast<IntegerType>(V->getType())->getBitWidth(), false);
+
+ ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
+
+ SCEVHandle S = SE.getSCEV(I);
+ if (isa<SCEVUnknown>(S) || isa<SCEVCouldNotCompute>(S))
+ return ConstantRange(cast<IntegerType>(V->getType())->getBitWidth(), false);
+
+ return ConstantRange(getRange(S, L, SE));
+}
+
+ConstantRange LoopVR::get(Value *V) {
+ std::map<Value *, ConstantRange *>::iterator I = Map.find(V);
+ if (I == Map.end()) {
+ ConstantRange *CR = new ConstantRange(compute(V));
+ Map[V] = CR;
+ return *CR;
+ }
+
+ return *I->second;
+}
+
+void LoopVR::remove(Value *V) {
+ std::map<Value *, ConstantRange *>::iterator I = Map.find(V);
+ if (I != Map.end()) {
+ delete I->second;
+ Map.erase(I);
+ }
+}
+
+void LoopVR::narrow(Value *V, const ConstantRange &CR) {
+ if (CR.isFullSet()) return;
+
+  std::map<Value *, ConstantRange *>::iterator I = Map.find(V);
+  if (I == Map.end()) {
+    Map[V] = new ConstantRange(CR);
+  } else {
+    // Replace the stored range with the intersection, freeing the old
+    // range so it isn't leaked.
+    ConstantRange *Narrowed =
+      new ConstantRange(I->second->maximalIntersectWith(CR));
+    delete I->second;
+    I->second = Narrowed;
+  }
+}
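+
+// Minimal usage sketch (hedged; assumes a client pass that has declared
+// LoopVR as a required analysis):
+//   LoopVR &LVR = getAnalysis<LoopVR>();
+//   ConstantRange R = LVR.get(V);   // computed and cached on first query
+//   if (!R.isFullSet()) {
+//     // V is known to stay within R inside its loop
+//   }
+//   LVR.narrow(V, OtherRange);      // intersect in externally derived facts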
diff --git a/lib/Analysis/Makefile b/lib/Analysis/Makefile
new file mode 100644
index 0000000..4af6d35
--- /dev/null
+++ b/lib/Analysis/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Analysis/Makefile -------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMAnalysis
+DIRS = IPA
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
new file mode 100644
index 0000000..3b21029
--- /dev/null
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -0,0 +1,1142 @@
+//===- MemoryDependenceAnalysis.cpp - Mem Deps Implementation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an analysis that determines, for a given memory
+// operation, what preceding memory operations it depends on. It builds on
+// alias analysis information, and tries to provide a lazy, caching interface to
+// a common kind of alias information query.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "memdep"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Function.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/PredIteratorCache.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetData.h"
+using namespace llvm;
+
+STATISTIC(NumCacheNonLocal, "Number of fully cached non-local responses");
+STATISTIC(NumCacheDirtyNonLocal, "Number of dirty cached non-local responses");
+STATISTIC(NumUncacheNonLocal, "Number of uncached non-local responses");
+
+STATISTIC(NumCacheNonLocalPtr,
+ "Number of fully cached non-local ptr responses");
+STATISTIC(NumCacheDirtyNonLocalPtr,
+ "Number of cached, but dirty, non-local ptr responses");
+STATISTIC(NumUncacheNonLocalPtr,
+ "Number of uncached non-local ptr responses");
+STATISTIC(NumCacheCompleteNonLocalPtr,
+ "Number of block queries that were completely cached");
+
+char MemoryDependenceAnalysis::ID = 0;
+
+// Register this pass...
+static RegisterPass<MemoryDependenceAnalysis> X("memdep",
+ "Memory Dependence Analysis", false, true);
+
+MemoryDependenceAnalysis::MemoryDependenceAnalysis()
+: FunctionPass(&ID), PredCache(0) {
+}
+MemoryDependenceAnalysis::~MemoryDependenceAnalysis() {
+}
+
+/// Clean up memory in between runs
+void MemoryDependenceAnalysis::releaseMemory() {
+ LocalDeps.clear();
+ NonLocalDeps.clear();
+ NonLocalPointerDeps.clear();
+ ReverseLocalDeps.clear();
+ ReverseNonLocalDeps.clear();
+ ReverseNonLocalPtrDeps.clear();
+ PredCache->clear();
+}
+
+
+
+/// getAnalysisUsage - Does not modify anything. It uses Alias Analysis.
+///
+void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<AliasAnalysis>();
+ AU.addRequiredTransitive<TargetData>();
+}
+
+bool MemoryDependenceAnalysis::runOnFunction(Function &) {
+ AA = &getAnalysis<AliasAnalysis>();
+ TD = &getAnalysis<TargetData>();
+ if (PredCache == 0)
+ PredCache.reset(new PredIteratorCache());
+ return false;
+}
+
+/// RemoveFromReverseMap - This is a helper function that removes Val from
+/// 'Inst's set in ReverseMap. If the set becomes empty, remove Inst's entry.
+template <typename KeyTy>
+static void RemoveFromReverseMap(DenseMap<Instruction*,
+ SmallPtrSet<KeyTy, 4> > &ReverseMap,
+ Instruction *Inst, KeyTy Val) {
+ typename DenseMap<Instruction*, SmallPtrSet<KeyTy, 4> >::iterator
+ InstIt = ReverseMap.find(Inst);
+ assert(InstIt != ReverseMap.end() && "Reverse map out of sync?");
+ bool Found = InstIt->second.erase(Val);
+  assert(Found && "Invalid reverse map!");
+  Found = Found;  // Silence 'unused variable' warnings in release builds.
+ if (InstIt->second.empty())
+ ReverseMap.erase(InstIt);
+}
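+// For example: if ReverseMap holds I1 -> {Q1, Q2}, removing (I1, Q1) shrinks
+// the set to {Q2}, and removing (I1, Q2) afterwards erases I1's entry from
+// the map entirely.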
+
+
+/// getCallSiteDependencyFrom - Private helper for finding the local
+/// dependencies of a call site.
+MemDepResult MemoryDependenceAnalysis::
+getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
+ BasicBlock::iterator ScanIt, BasicBlock *BB) {
+ // Walk backwards through the block, looking for dependencies
+ while (ScanIt != BB->begin()) {
+ Instruction *Inst = --ScanIt;
+
+ // If this inst is a memory op, get the pointer it accessed
+ Value *Pointer = 0;
+ uint64_t PointerSize = 0;
+ if (StoreInst *S = dyn_cast<StoreInst>(Inst)) {
+ Pointer = S->getPointerOperand();
+ PointerSize = TD->getTypeStoreSize(S->getOperand(0)->getType());
+ } else if (VAArgInst *V = dyn_cast<VAArgInst>(Inst)) {
+ Pointer = V->getOperand(0);
+ PointerSize = TD->getTypeStoreSize(V->getType());
+ } else if (FreeInst *F = dyn_cast<FreeInst>(Inst)) {
+ Pointer = F->getPointerOperand();
+
+ // FreeInsts erase the entire structure
+ PointerSize = ~0ULL;
+ } else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+ // Debug intrinsics don't cause dependences.
+ if (isa<DbgInfoIntrinsic>(Inst)) continue;
+ CallSite InstCS = CallSite::get(Inst);
+ // If these two calls do not interfere, look past it.
+ switch (AA->getModRefInfo(CS, InstCS)) {
+ case AliasAnalysis::NoModRef:
+ // If the two calls don't interact (e.g. InstCS is readnone) keep
+ // scanning.
+ continue;
+ case AliasAnalysis::Ref:
+ // If the two calls read the same memory locations and CS is a readonly
+ // function, then we have two cases: 1) the calls may not interfere with
+ // each other at all. 2) the calls may produce the same value. In case
+ // #1 we want to ignore the values, in case #2, we want to return Inst
+ // as a Def dependence. This allows us to CSE in cases like:
+ // X = strlen(P);
+ // memchr(...);
+ // Y = strlen(P); // Y = X
+ if (isReadOnlyCall) {
+ if (CS.getCalledFunction() != 0 &&
+ CS.getCalledFunction() == InstCS.getCalledFunction())
+ return MemDepResult::getDef(Inst);
+ // Ignore unrelated read/read call dependences.
+ continue;
+ }
+ // FALL THROUGH
+ default:
+ return MemDepResult::getClobber(Inst);
+ }
+ } else {
+ // Non-memory instruction.
+ continue;
+ }
+
+ if (AA->getModRefInfo(CS, Pointer, PointerSize) != AliasAnalysis::NoModRef)
+ return MemDepResult::getClobber(Inst);
+ }
+
+  // No dependence found. If this is the entry block of the function, it is a
+  // clobber; otherwise it is non-local.
+ if (BB != &BB->getParent()->getEntryBlock())
+ return MemDepResult::getNonLocal();
+ return MemDepResult::getClobber(ScanIt);
+}
+
+/// getPointerDependencyFrom - Return the instruction on which a memory
+/// location depends. If isLoad is true, this routine ignores may-aliases with
+/// read-only operations.
+MemDepResult MemoryDependenceAnalysis::
+getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad,
+ BasicBlock::iterator ScanIt, BasicBlock *BB) {
+
+ // Walk backwards through the basic block, looking for dependencies.
+ while (ScanIt != BB->begin()) {
+ Instruction *Inst = --ScanIt;
+
+ // Debug intrinsics don't cause dependences.
+ if (isa<DbgInfoIntrinsic>(Inst)) continue;
+
+ // Values depend on loads if the pointers are must aliased. This means that
+ // a load depends on another must aliased load from the same value.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ Value *Pointer = LI->getPointerOperand();
+ uint64_t PointerSize = TD->getTypeStoreSize(LI->getType());
+
+ // If we found a pointer, check if it could be the same as our pointer.
+ AliasAnalysis::AliasResult R =
+ AA->alias(Pointer, PointerSize, MemPtr, MemSize);
+ if (R == AliasAnalysis::NoAlias)
+ continue;
+
+      // For a load query, a load that merely may-aliases the pointer is not
+      // by itself a dependence; keep scanning.
+ if (isLoad && R == AliasAnalysis::MayAlias)
+ continue;
+ // Stores depend on may and must aliased loads, loads depend on must-alias
+ // loads.
+ return MemDepResult::getDef(Inst);
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ // If alias analysis can tell that this store is guaranteed to not modify
+ // the query pointer, ignore it. Use getModRefInfo to handle cases where
+ // the query pointer points to constant memory etc.
+ if (AA->getModRefInfo(SI, MemPtr, MemSize) == AliasAnalysis::NoModRef)
+ continue;
+
+ // Ok, this store might clobber the query pointer. Check to see if it is
+ // a must alias: in this case, we want to return this as a def.
+ Value *Pointer = SI->getPointerOperand();
+ uint64_t PointerSize = TD->getTypeStoreSize(SI->getOperand(0)->getType());
+
+ // If we found a pointer, check if it could be the same as our pointer.
+ AliasAnalysis::AliasResult R =
+ AA->alias(Pointer, PointerSize, MemPtr, MemSize);
+
+ if (R == AliasAnalysis::NoAlias)
+ continue;
+ if (R == AliasAnalysis::MayAlias)
+ return MemDepResult::getClobber(Inst);
+ return MemDepResult::getDef(Inst);
+ }
+
+ // If this is an allocation, and if we know that the accessed pointer is to
+ // the allocation, return Def. This means that there is no dependence and
+ // the access can be optimized based on that. For example, a load could
+ // turn into undef.
+ if (AllocationInst *AI = dyn_cast<AllocationInst>(Inst)) {
+ Value *AccessPtr = MemPtr->getUnderlyingObject();
+
+ if (AccessPtr == AI ||
+ AA->alias(AI, 1, AccessPtr, 1) == AliasAnalysis::MustAlias)
+ return MemDepResult::getDef(AI);
+ continue;
+ }
+
+ // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer.
+ switch (AA->getModRefInfo(Inst, MemPtr, MemSize)) {
+ case AliasAnalysis::NoModRef:
+ // If the call has no effect on the queried pointer, just ignore it.
+ continue;
+ case AliasAnalysis::Ref:
+ // If the call is known to never store to the pointer, and if this is a
+ // load query, we can safely ignore it (scan past it).
+ if (isLoad)
+ continue;
+ // FALL THROUGH.
+ default:
+ // Otherwise, there is a potential dependence. Return a clobber.
+ return MemDepResult::getClobber(Inst);
+ }
+ }
+
+  // No dependence found. If this is the entry block of the function, it is a
+  // clobber; otherwise it is non-local.
+ if (BB != &BB->getParent()->getEntryBlock())
+ return MemDepResult::getNonLocal();
+ return MemDepResult::getClobber(ScanIt);
+}
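+// Summary of the cases above (restated for reference): must-alias loads and
+// stores are Defs; a may-alias store is a Clobber; a may-alias load is
+// skipped for load queries but is a Def for store queries; and a must-alias
+// allocation is a Def, which lets a dependent load fold to undef.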
+
+/// getDependency - Return the instruction on which a memory operation
+/// depends.
+MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
+ Instruction *ScanPos = QueryInst;
+
+ // Check for a cached result
+ MemDepResult &LocalCache = LocalDeps[QueryInst];
+
+ // If the cached entry is non-dirty, just return it. Note that this depends
+ // on MemDepResult's default constructing to 'dirty'.
+ if (!LocalCache.isDirty())
+ return LocalCache;
+
+ // Otherwise, if we have a dirty entry, we know we can start the scan at that
+ // instruction, which may save us some work.
+ if (Instruction *Inst = LocalCache.getInst()) {
+ ScanPos = Inst;
+
+ RemoveFromReverseMap(ReverseLocalDeps, Inst, QueryInst);
+ }
+
+ BasicBlock *QueryParent = QueryInst->getParent();
+
+ Value *MemPtr = 0;
+ uint64_t MemSize = 0;
+
+ // Do the scan.
+ if (BasicBlock::iterator(QueryInst) == QueryParent->begin()) {
+    // No dependence found. If this is the entry block of the function, it is a
+    // clobber; otherwise it is non-local.
+ if (QueryParent != &QueryParent->getParent()->getEntryBlock())
+ LocalCache = MemDepResult::getNonLocal();
+ else
+ LocalCache = MemDepResult::getClobber(QueryInst);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(QueryInst)) {
+ // If this is a volatile store, don't mess around with it. Just return the
+ // previous instruction as a clobber.
+ if (SI->isVolatile())
+ LocalCache = MemDepResult::getClobber(--BasicBlock::iterator(ScanPos));
+ else {
+ MemPtr = SI->getPointerOperand();
+ MemSize = TD->getTypeStoreSize(SI->getOperand(0)->getType());
+ }
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(QueryInst)) {
+ // If this is a volatile load, don't mess around with it. Just return the
+ // previous instruction as a clobber.
+ if (LI->isVolatile())
+ LocalCache = MemDepResult::getClobber(--BasicBlock::iterator(ScanPos));
+ else {
+ MemPtr = LI->getPointerOperand();
+ MemSize = TD->getTypeStoreSize(LI->getType());
+ }
+ } else if (isa<CallInst>(QueryInst) || isa<InvokeInst>(QueryInst)) {
+ CallSite QueryCS = CallSite::get(QueryInst);
+ bool isReadOnly = AA->onlyReadsMemory(QueryCS);
+ LocalCache = getCallSiteDependencyFrom(QueryCS, isReadOnly, ScanPos,
+ QueryParent);
+ } else if (FreeInst *FI = dyn_cast<FreeInst>(QueryInst)) {
+ MemPtr = FI->getPointerOperand();
+ // FreeInsts erase the entire structure, not just a field.
+    MemSize = ~0ULL;
+ } else {
+ // Non-memory instruction.
+ LocalCache = MemDepResult::getClobber(--BasicBlock::iterator(ScanPos));
+ }
+
+ // If we need to do a pointer scan, make it happen.
+ if (MemPtr)
+ LocalCache = getPointerDependencyFrom(MemPtr, MemSize,
+ isa<LoadInst>(QueryInst),
+ ScanPos, QueryParent);
+
+ // Remember the result!
+ if (Instruction *I = LocalCache.getInst())
+ ReverseLocalDeps[I].insert(QueryInst);
+
+ return LocalCache;
+}
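+// Usage sketch (illustrative; 'MD' stands for a MemoryDependenceAnalysis
+// instance obtained by the client):
+//   MemDepResult Res = MD.getDependency(QueryInst);
+//   if (Res.isNonLocal()) {
+//     // fall back to the non-local query interfaces below
+//   } else if (Instruction *DepInst = Res.getInst()) {
+//     // DepInst is the Def or Clobber that QueryInst depends on
+//   }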
+
+#ifndef NDEBUG
+/// AssertSorted - This method is used when -debug is specified to verify that
+/// cache arrays are properly kept sorted.
+static void AssertSorted(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
+ int Count = -1) {
+ if (Count == -1) Count = Cache.size();
+ if (Count == 0) return;
+
+ for (unsigned i = 1; i != unsigned(Count); ++i)
+ assert(Cache[i-1] <= Cache[i] && "Cache isn't sorted!");
+}
+#endif
+
+/// getNonLocalCallDependency - Perform a full dependency query for the
+/// specified call, returning the set of blocks that the value is
+/// potentially live across. The returned set of results will include a
+/// "NonLocal" result for all blocks where the value is live across.
+///
+/// This method assumes the instruction returns a "NonLocal" dependency
+/// within its own block.
+///
+/// This returns a reference to an internal data structure that may be
+/// invalidated on the next non-local query or when an instruction is
+/// removed. Clients must copy this data if they want it around longer than
+/// that.
+const MemoryDependenceAnalysis::NonLocalDepInfo &
+MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
+ assert(getDependency(QueryCS.getInstruction()).isNonLocal() &&
+ "getNonLocalCallDependency should only be used on calls with non-local deps!");
+ PerInstNLInfo &CacheP = NonLocalDeps[QueryCS.getInstruction()];
+ NonLocalDepInfo &Cache = CacheP.first;
+
+ /// DirtyBlocks - This is the set of blocks that need to be recomputed. In
+ /// the cached case, this can happen due to instructions being deleted etc. In
+ /// the uncached case, this starts out as the set of predecessors we care
+ /// about.
+ SmallVector<BasicBlock*, 32> DirtyBlocks;
+
+ if (!Cache.empty()) {
+ // Okay, we have a cache entry. If we know it is not dirty, just return it
+ // with no computation.
+ if (!CacheP.second) {
+ NumCacheNonLocal++;
+ return Cache;
+ }
+
+ // If we already have a partially computed set of results, scan them to
+ // determine what is dirty, seeding our initial DirtyBlocks worklist.
+ for (NonLocalDepInfo::iterator I = Cache.begin(), E = Cache.end();
+ I != E; ++I)
+ if (I->second.isDirty())
+ DirtyBlocks.push_back(I->first);
+
+ // Sort the cache so that we can do fast binary search lookups below.
+ std::sort(Cache.begin(), Cache.end());
+
+ ++NumCacheDirtyNonLocal;
+ //cerr << "CACHED CASE: " << DirtyBlocks.size() << " dirty: "
+ // << Cache.size() << " cached: " << *QueryInst;
+ } else {
+ // Seed DirtyBlocks with each of the preds of QueryInst's block.
+ BasicBlock *QueryBB = QueryCS.getInstruction()->getParent();
+ for (BasicBlock **PI = PredCache->GetPreds(QueryBB); *PI; ++PI)
+ DirtyBlocks.push_back(*PI);
+ NumUncacheNonLocal++;
+ }
+
+ // isReadonlyCall - If this is a read-only call, we can be more aggressive.
+ bool isReadonlyCall = AA->onlyReadsMemory(QueryCS);
+
+ SmallPtrSet<BasicBlock*, 64> Visited;
+
+ unsigned NumSortedEntries = Cache.size();
+ DEBUG(AssertSorted(Cache));
+
+ // Iterate while we still have blocks to update.
+ while (!DirtyBlocks.empty()) {
+ BasicBlock *DirtyBB = DirtyBlocks.back();
+ DirtyBlocks.pop_back();
+
+ // Already processed this block?
+ if (!Visited.insert(DirtyBB))
+ continue;
+
+ // Do a binary search to see if we already have an entry for this block in
+ // the cache set. If so, find it.
+ DEBUG(AssertSorted(Cache, NumSortedEntries));
+ NonLocalDepInfo::iterator Entry =
+ std::upper_bound(Cache.begin(), Cache.begin()+NumSortedEntries,
+ std::make_pair(DirtyBB, MemDepResult()));
+ if (Entry != Cache.begin() && prior(Entry)->first == DirtyBB)
+ --Entry;
+
+ MemDepResult *ExistingResult = 0;
+ if (Entry != Cache.begin()+NumSortedEntries &&
+ Entry->first == DirtyBB) {
+ // If we already have an entry, and if it isn't already dirty, the block
+ // is done.
+ if (!Entry->second.isDirty())
+ continue;
+
+ // Otherwise, remember this slot so we can update the value.
+ ExistingResult = &Entry->second;
+ }
+
+ // If the dirty entry has a pointer, start scanning from it so we don't have
+ // to rescan the entire block.
+ BasicBlock::iterator ScanPos = DirtyBB->end();
+ if (ExistingResult) {
+ if (Instruction *Inst = ExistingResult->getInst()) {
+ ScanPos = Inst;
+ // We're removing QueryInst's use of Inst.
+ RemoveFromReverseMap(ReverseNonLocalDeps, Inst,
+ QueryCS.getInstruction());
+ }
+ }
+
+ // Find out if this block has a local dependency for QueryInst.
+ MemDepResult Dep;
+
+ if (ScanPos != DirtyBB->begin()) {
+ Dep = getCallSiteDependencyFrom(QueryCS, isReadonlyCall,ScanPos, DirtyBB);
+ } else if (DirtyBB != &DirtyBB->getParent()->getEntryBlock()) {
+      // No dependence found. If this is the entry block of the function, it is
+      // a clobber; otherwise it is non-local.
+ Dep = MemDepResult::getNonLocal();
+ } else {
+ Dep = MemDepResult::getClobber(ScanPos);
+ }
+
+ // If we had a dirty entry for the block, update it. Otherwise, just add
+ // a new entry.
+ if (ExistingResult)
+ *ExistingResult = Dep;
+ else
+ Cache.push_back(std::make_pair(DirtyBB, Dep));
+
+ // If the block has a dependency (i.e. it isn't completely transparent to
+ // the value), remember the association!
+ if (!Dep.isNonLocal()) {
+ // Keep the ReverseNonLocalDeps map up to date so we can efficiently
+ // update this when we remove instructions.
+ if (Instruction *Inst = Dep.getInst())
+ ReverseNonLocalDeps[Inst].insert(QueryCS.getInstruction());
+ } else {
+
+ // If the block *is* completely transparent to the load, we need to check
+ // the predecessors of this block. Add them to our worklist.
+ for (BasicBlock **PI = PredCache->GetPreds(DirtyBB); *PI; ++PI)
+ DirtyBlocks.push_back(*PI);
+ }
+ }
+
+ return Cache;
+}
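+// Usage sketch (illustrative; 'MD' and the call 'CI' are hypothetical client
+// values):
+//   const MemoryDependenceAnalysis::NonLocalDepInfo &Deps =
+//       MD.getNonLocalCallDependency(CallSite::get(CI));
+//   for (unsigned i = 0, e = Deps.size(); i != e; ++i)
+//     if (!Deps[i].second.isNonLocal())
+//       ; // Deps[i].first is a block holding a Def/Clobber in Deps[i].second
+// Copy the vector if it must outlive the next non-local query.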
+
+/// getNonLocalPointerDependency - Perform a full dependency query for an
+/// access to the specified (non-volatile) memory location, returning the
+/// set of instructions that either define or clobber the value.
+///
+/// This method assumes the pointer has a "NonLocal" dependency within its
+/// own block.
+///
+void MemoryDependenceAnalysis::
+getNonLocalPointerDependency(Value *Pointer, bool isLoad, BasicBlock *FromBB,
+ SmallVectorImpl<NonLocalDepEntry> &Result) {
+ assert(isa<PointerType>(Pointer->getType()) &&
+ "Can't get pointer deps of a non-pointer!");
+ Result.clear();
+
+  // We know that the pointer value is live into FromBB; find the def/clobbers
+  // reaching it from its predecessors.
+ const Type *EltTy = cast<PointerType>(Pointer->getType())->getElementType();
+ uint64_t PointeeSize = TD->getTypeStoreSize(EltTy);
+
+ // This is the set of blocks we've inspected, and the pointer we consider in
+ // each block. Because of critical edges, we currently bail out if querying
+ // a block with multiple different pointers. This can happen during PHI
+ // translation.
+ DenseMap<BasicBlock*, Value*> Visited;
+ if (!getNonLocalPointerDepFromBB(Pointer, PointeeSize, isLoad, FromBB,
+ Result, Visited, true))
+ return;
+ Result.clear();
+ Result.push_back(std::make_pair(FromBB,
+ MemDepResult::getClobber(FromBB->begin())));
+}
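+// Usage sketch (illustrative; 'MD' and the load 'LI' are hypothetical):
+//   SmallVector<MemoryDependenceAnalysis::NonLocalDepEntry, 16> Deps;
+//   MD.getNonLocalPointerDependency(LI->getPointerOperand(), true /*isLoad*/,
+//                                   LI->getParent(), Deps);
+// Each resulting entry pairs a BasicBlock* with the MemDepResult found there.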
+
+/// GetNonLocalInfoForBlock - Compute the memdep value for BB with
+/// Pointer/PointeeSize using either cached information in Cache or by doing a
+/// lookup (which may use dirty cache info if available). If we do a lookup,
+/// add the result to the cache.
+MemDepResult MemoryDependenceAnalysis::
+GetNonLocalInfoForBlock(Value *Pointer, uint64_t PointeeSize,
+ bool isLoad, BasicBlock *BB,
+ NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
+
+ // Do a binary search to see if we already have an entry for this block in
+ // the cache set. If so, find it.
+ NonLocalDepInfo::iterator Entry =
+ std::upper_bound(Cache->begin(), Cache->begin()+NumSortedEntries,
+ std::make_pair(BB, MemDepResult()));
+ if (Entry != Cache->begin() && prior(Entry)->first == BB)
+ --Entry;
+
+ MemDepResult *ExistingResult = 0;
+ if (Entry != Cache->begin()+NumSortedEntries && Entry->first == BB)
+ ExistingResult = &Entry->second;
+
+ // If we have a cached entry, and it is non-dirty, use it as the value for
+ // this dependency.
+ if (ExistingResult && !ExistingResult->isDirty()) {
+ ++NumCacheNonLocalPtr;
+ return *ExistingResult;
+ }
+
+ // Otherwise, we have to scan for the value. If we have a dirty cache
+ // entry, start scanning from its position, otherwise we scan from the end
+ // of the block.
+ BasicBlock::iterator ScanPos = BB->end();
+ if (ExistingResult && ExistingResult->getInst()) {
+ assert(ExistingResult->getInst()->getParent() == BB &&
+ "Instruction invalidated?");
+ ++NumCacheDirtyNonLocalPtr;
+ ScanPos = ExistingResult->getInst();
+
+ // Eliminating the dirty entry from 'Cache', so update the reverse info.
+ ValueIsLoadPair CacheKey(Pointer, isLoad);
+ RemoveFromReverseMap(ReverseNonLocalPtrDeps, ScanPos, CacheKey);
+ } else {
+ ++NumUncacheNonLocalPtr;
+ }
+
+ // Scan the block for the dependency.
+ MemDepResult Dep = getPointerDependencyFrom(Pointer, PointeeSize, isLoad,
+ ScanPos, BB);
+
+ // If we had a dirty entry for the block, update it. Otherwise, just add
+ // a new entry.
+ if (ExistingResult)
+ *ExistingResult = Dep;
+ else
+ Cache->push_back(std::make_pair(BB, Dep));
+
+ // If the block has a dependency (i.e. it isn't completely transparent to
+ // the value), remember the reverse association because we just added it
+ // to Cache!
+ if (Dep.isNonLocal())
+ return Dep;
+
+ // Keep the ReverseNonLocalPtrDeps map up to date so we can efficiently
+ // update MemDep when we remove instructions.
+ Instruction *Inst = Dep.getInst();
+ assert(Inst && "Didn't depend on anything?");
+ ValueIsLoadPair CacheKey(Pointer, isLoad);
+ ReverseNonLocalPtrDeps[Inst].insert(CacheKey);
+ return Dep;
+}
+
+
+/// getNonLocalPointerDepFromBB - Perform a dependency query based on
+/// pointer/pointeesize starting at the end of StartBB. Add any clobber/def
+/// results to the results vector and keep track of which blocks are visited in
+/// 'Visited'.
+///
+/// This has special behavior for the first block queries (when SkipFirstBlock
+/// is true). In this special case, it ignores the contents of the specified
+/// block and starts returning dependence info for its predecessors.
+///
+/// This function returns false on success, or true to indicate that it could
+/// not compute dependence information for some reason. This should be treated
+/// as a clobber dependence on the first instruction in the predecessor block.
+bool MemoryDependenceAnalysis::
+getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
+ bool isLoad, BasicBlock *StartBB,
+ SmallVectorImpl<NonLocalDepEntry> &Result,
+ DenseMap<BasicBlock*, Value*> &Visited,
+ bool SkipFirstBlock) {
+
+ // Look up the cached info for Pointer.
+ ValueIsLoadPair CacheKey(Pointer, isLoad);
+
+ std::pair<BBSkipFirstBlockPair, NonLocalDepInfo> *CacheInfo =
+ &NonLocalPointerDeps[CacheKey];
+ NonLocalDepInfo *Cache = &CacheInfo->second;
+
+ // If we have valid cached information for exactly the block we are
+ // investigating, just return it with no recomputation.
+ if (CacheInfo->first == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
+    // Since we have a fully cached result for this query, we can just return
+    // the cached results and populate the visited set. However, we have to
+    // verify that we don't already have conflicting results for these blocks.
+    // Check that any block in the result set that also appears in the visited
+    // set was visited with the same pointer query.
+ if (!Visited.empty()) {
+ for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
+ I != E; ++I) {
+ DenseMap<BasicBlock*, Value*>::iterator VI = Visited.find(I->first);
+ if (VI == Visited.end() || VI->second == Pointer) continue;
+
+ // We have a pointer mismatch in a block. Just return clobber, saying
+ // that something was clobbered in this result. We could also do a
+ // non-fully cached query, but there is little point in doing this.
+ return true;
+ }
+ }
+
+ for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
+ I != E; ++I) {
+ Visited.insert(std::make_pair(I->first, Pointer));
+ if (!I->second.isNonLocal())
+ Result.push_back(*I);
+ }
+ ++NumCacheCompleteNonLocalPtr;
+ return false;
+ }
+
+  // Otherwise, this is either a new block, a block with an invalid cache
+  // entry, or one whose cached info we're about to invalidate by putting more
+  // info into it than it currently holds. If the cache is empty, what we
+  // compute will be valid cache info; otherwise it won't be.
+ if (Cache->empty())
+ CacheInfo->first = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
+ else
+ CacheInfo->first = BBSkipFirstBlockPair();
+
+ SmallVector<BasicBlock*, 32> Worklist;
+ Worklist.push_back(StartBB);
+
+ // Keep track of the entries that we know are sorted. Previously cached
+ // entries will all be sorted. The entries we add we only sort on demand (we
+ // don't insert every element into its sorted position). We know that we
+ // won't get any reuse from currently inserted values, because we don't
+ // revisit blocks after we insert info for them.
+ unsigned NumSortedEntries = Cache->size();
+ DEBUG(AssertSorted(*Cache));
+
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+
+ // Skip the first block if we have it.
+ if (!SkipFirstBlock) {
+ // Analyze the dependency of *Pointer in FromBB. See if we already have
+ // been here.
+ assert(Visited.count(BB) && "Should check 'visited' before adding to WL");
+
+ // Get the dependency info for Pointer in BB. If we have cached
+ // information, we will use it, otherwise we compute it.
+ DEBUG(AssertSorted(*Cache, NumSortedEntries));
+ MemDepResult Dep = GetNonLocalInfoForBlock(Pointer, PointeeSize, isLoad,
+ BB, Cache, NumSortedEntries);
+
+ // If we got a Def or Clobber, add this to the list of results.
+ if (!Dep.isNonLocal()) {
+ Result.push_back(NonLocalDepEntry(BB, Dep));
+ continue;
+ }
+ }
+
+ // If 'Pointer' is an instruction defined in this block, then we need to do
+ // phi translation to change it into a value live in the predecessor block.
+ // If phi translation fails, then we can't continue dependence analysis.
+ Instruction *PtrInst = dyn_cast<Instruction>(Pointer);
+ bool NeedsPHITranslation = PtrInst && PtrInst->getParent() == BB;
+
+ // If no PHI translation is needed, just add all the predecessors of this
+ // block to scan them as well.
+ if (!NeedsPHITranslation) {
+ SkipFirstBlock = false;
+ for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
+ // Verify that we haven't looked at this block yet.
+ std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
+ InsertRes = Visited.insert(std::make_pair(*PI, Pointer));
+ if (InsertRes.second) {
+ // First time we've looked at *PI.
+ Worklist.push_back(*PI);
+ continue;
+ }
+
+        // If we have seen this block before, but it was with a different
+        // pointer, then we have a phi translation failure and have to treat
+        // this as a clobber.
+ if (InsertRes.first->second != Pointer)
+ goto PredTranslationFailure;
+ }
+ continue;
+ }
+
+ // If we do need to do phi translation, then there are a bunch of different
+ // cases, because we have to find a Value* live in the predecessor block. We
+ // know that PtrInst is defined in this block at least.
+
+ // If this is directly a PHI node, just use the incoming values for each
+ // pred as the phi translated version.
+ if (PHINode *PtrPHI = dyn_cast<PHINode>(PtrInst)) {
+ for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
+ BasicBlock *Pred = *PI;
+ Value *PredPtr = PtrPHI->getIncomingValueForBlock(Pred);
+
+ // Check to see if we have already visited this pred block with another
+ // pointer. If so, we can't do this lookup. This failure can occur
+ // with PHI translation when a critical edge exists and the PHI node in
+ // the successor translates to a pointer value different than the
+ // pointer the block was first analyzed with.
+ std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
+ InsertRes = Visited.insert(std::make_pair(Pred, PredPtr));
+
+ if (!InsertRes.second) {
+ // If the predecessor was visited with PredPtr, then we already did
+ // the analysis and can ignore it.
+ if (InsertRes.first->second == PredPtr)
+ continue;
+
+ // Otherwise, the block was previously analyzed with a different
+ // pointer. We can't represent the result of this case, so we just
+ // treat this as a phi translation failure.
+ goto PredTranslationFailure;
+ }
+
+ // We may have added values to the cache list before this PHI
+ // translation. If so, we haven't done anything to ensure that the
+ // cache remains sorted. Sort it now (if needed) so that recursive
+ // invocations of getNonLocalPointerDepFromBB that could reuse the cache
+ // value will only see properly sorted cache arrays.
+ if (Cache && NumSortedEntries != Cache->size())
+ std::sort(Cache->begin(), Cache->end());
+ Cache = 0;
+
+ // FIXME: it is entirely possible that PHI translating will end up with
+ // the same value. Consider PHI translating something like:
+ // X = phi [x, bb1], [y, bb2]. PHI translating for bb1 doesn't *need*
+ // to recurse here, pedantically speaking.
+
+ // If we have a problem phi translating, fall through to the code below
+ // to handle the failure condition.
+ if (getNonLocalPointerDepFromBB(PredPtr, PointeeSize, isLoad, Pred,
+ Result, Visited))
+ goto PredTranslationFailure;
+ }
+
+ // Refresh the CacheInfo/Cache pointer so that it isn't invalidated.
+ CacheInfo = &NonLocalPointerDeps[CacheKey];
+ Cache = &CacheInfo->second;
+ NumSortedEntries = Cache->size();
+
+ // Since we did phi translation, the "Cache" set won't contain all of the
+ // results for the query. This is ok (we can still use it to accelerate
+      // specific block queries), but we can't do the fastpath "return all
+      // results from the set". Clear out the indicator for this.
+ CacheInfo->first = BBSkipFirstBlockPair();
+ SkipFirstBlock = false;
+ continue;
+ }
+
+ // TODO: BITCAST, GEP.
+
+ // cerr << "MEMDEP: Could not PHI translate: " << *Pointer;
+ // if (isa<BitCastInst>(PtrInst) || isa<GetElementPtrInst>(PtrInst))
+ // cerr << "OP:\t\t\t\t" << *PtrInst->getOperand(0);
+ PredTranslationFailure:
+
+ if (Cache == 0) {
+ // Refresh the CacheInfo/Cache pointer if it got invalidated.
+ CacheInfo = &NonLocalPointerDeps[CacheKey];
+ Cache = &CacheInfo->second;
+ NumSortedEntries = Cache->size();
+ } else if (NumSortedEntries != Cache->size()) {
+ std::sort(Cache->begin(), Cache->end());
+ NumSortedEntries = Cache->size();
+ }
+
+ // Since we did phi translation, the "Cache" set won't contain all of the
+ // results for the query. This is ok (we can still use it to accelerate
+    // specific block queries), but we can't do the fastpath "return all
+    // results from the set". Clear out the indicator for this.
+ CacheInfo->first = BBSkipFirstBlockPair();
+
+ // If *nothing* works, mark the pointer as being clobbered by the first
+ // instruction in this block.
+ //
+ // If this is the magic first block, return this as a clobber of the whole
+ // incoming value. Since we can't phi translate to one of the predecessors,
+ // we have to bail out.
+ if (SkipFirstBlock)
+ return true;
+
+ for (NonLocalDepInfo::reverse_iterator I = Cache->rbegin(); ; ++I) {
+ assert(I != Cache->rend() && "Didn't find current block??");
+ if (I->first != BB)
+ continue;
+
+ assert(I->second.isNonLocal() &&
+ "Should only be here with transparent block");
+ I->second = MemDepResult::getClobber(BB->begin());
+ ReverseNonLocalPtrDeps[BB->begin()].insert(CacheKey);
+ Result.push_back(*I);
+ break;
+ }
+ }
+
+ // Okay, we're done now. If we added new values to the cache, re-sort it.
+ switch (Cache->size()-NumSortedEntries) {
+ case 0:
+ // done, no new entries.
+ break;
+ case 2: {
+ // Two new entries, insert the last one into place.
+ NonLocalDepEntry Val = Cache->back();
+ Cache->pop_back();
+ NonLocalDepInfo::iterator Entry =
+ std::upper_bound(Cache->begin(), Cache->end()-1, Val);
+ Cache->insert(Entry, Val);
+ // FALL THROUGH.
+ }
+ case 1:
+    // One new entry; just insert the new value at the appropriate position.
+ if (Cache->size() != 1) {
+ NonLocalDepEntry Val = Cache->back();
+ Cache->pop_back();
+ NonLocalDepInfo::iterator Entry =
+ std::upper_bound(Cache->begin(), Cache->end(), Val);
+ Cache->insert(Entry, Val);
+ }
+ break;
+ default:
+ // Added many values, do a full scale sort.
+ std::sort(Cache->begin(), Cache->end());
+ }
+ DEBUG(AssertSorted(*Cache));
+ return false;
+}
+
+/// RemoveCachedNonLocalPointerDependencies - If P exists in
+/// CachedNonLocalPointerInfo, remove it.
+void MemoryDependenceAnalysis::
+RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P) {
+ CachedNonLocalPointerInfo::iterator It =
+ NonLocalPointerDeps.find(P);
+ if (It == NonLocalPointerDeps.end()) return;
+
+ // Remove all of the entries in the BB->val map. This involves removing
+ // instructions from the reverse map.
+ NonLocalDepInfo &PInfo = It->second.second;
+
+ for (unsigned i = 0, e = PInfo.size(); i != e; ++i) {
+ Instruction *Target = PInfo[i].second.getInst();
+ if (Target == 0) continue; // Ignore non-local dep results.
+ assert(Target->getParent() == PInfo[i].first);
+
+ // Eliminating the dirty entry from 'Cache', so update the reverse info.
+ RemoveFromReverseMap(ReverseNonLocalPtrDeps, Target, P);
+ }
+
+ // Remove P from NonLocalPointerDeps (which deletes NonLocalDepInfo).
+ NonLocalPointerDeps.erase(It);
+}
+
+
+/// invalidateCachedPointerInfo - This method is used to invalidate cached
+/// information about the specified pointer, because it may be too
+/// conservative in memdep. This is an optional call that can be used when
+/// the client detects an equivalence between the pointer and some other
+/// value and replaces the other value with ptr. This can make Ptr available
+/// in places that the cached info does not necessarily reflect.
+void MemoryDependenceAnalysis::invalidateCachedPointerInfo(Value *Ptr) {
+ // If Ptr isn't really a pointer, just ignore it.
+ if (!isa<PointerType>(Ptr->getType())) return;
+ // Flush store info for the pointer.
+ RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(Ptr, false));
+ // Flush load info for the pointer.
+ RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(Ptr, true));
+}
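+// Illustrative scenario (hedged): after a client proves two pointers %A and
+// %B are equal and rewrites uses of %B to %A, the dependence info cached for
+// %A may be overly conservative, so the client calls
+// invalidateCachedPointerInfo(A) to flush both the load and store caches.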
+
+/// removeInstruction - Remove an instruction from the dependence analysis,
+/// updating the dependence of instructions that previously depended on it.
+/// This method attempts to keep the cache coherent using the reverse map.
+void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
+ // Walk through the Non-local dependencies, removing this one as the value
+ // for any cached queries.
+ NonLocalDepMapType::iterator NLDI = NonLocalDeps.find(RemInst);
+ if (NLDI != NonLocalDeps.end()) {
+ NonLocalDepInfo &BlockMap = NLDI->second.first;
+ for (NonLocalDepInfo::iterator DI = BlockMap.begin(), DE = BlockMap.end();
+ DI != DE; ++DI)
+ if (Instruction *Inst = DI->second.getInst())
+ RemoveFromReverseMap(ReverseNonLocalDeps, Inst, RemInst);
+ NonLocalDeps.erase(NLDI);
+ }
+
+ // If we have a cached local dependence query for this instruction, remove it.
+ //
+ LocalDepMapType::iterator LocalDepEntry = LocalDeps.find(RemInst);
+ if (LocalDepEntry != LocalDeps.end()) {
+ // Remove us from DepInst's reverse set now that the local dep info is gone.
+ if (Instruction *Inst = LocalDepEntry->second.getInst())
+ RemoveFromReverseMap(ReverseLocalDeps, Inst, RemInst);
+
+ // Remove this local dependency info.
+ LocalDeps.erase(LocalDepEntry);
+ }
+
+ // If we have any cached pointer dependencies on this instruction, remove
+ // them. If the instruction has non-pointer type, then it can't be a pointer
+ // base.
+
+ // Remove it from both the load info and the store info. The instruction
+ // can't be in either of these maps if it is non-pointer.
+ if (isa<PointerType>(RemInst->getType())) {
+ RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(RemInst, false));
+ RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(RemInst, true));
+ }
+
+ // Loop over all of the things that depend on the instruction we're removing.
+ //
+ SmallVector<std::pair<Instruction*, Instruction*>, 8> ReverseDepsToAdd;
+
+ // If we find RemInst as a clobber or Def in any of the maps for other values,
+ // we need to replace its entry with a dirty version of the instruction after
+ // it. If RemInst is a terminator, we use a null dirty value.
+ //
+  // Using a dirty version of the instruction after RemInst means a later
+  // recomputation can resume its scan there rather than rescanning the entire
+  // block.
+ MemDepResult NewDirtyVal;
+ if (!RemInst->isTerminator())
+ NewDirtyVal = MemDepResult::getDirty(++BasicBlock::iterator(RemInst));
+
+ ReverseDepMapType::iterator ReverseDepIt = ReverseLocalDeps.find(RemInst);
+ if (ReverseDepIt != ReverseLocalDeps.end()) {
+ SmallPtrSet<Instruction*, 4> &ReverseDeps = ReverseDepIt->second;
+ // RemInst can't be the terminator if it has local stuff depending on it.
+ assert(!ReverseDeps.empty() && !isa<TerminatorInst>(RemInst) &&
+ "Nothing can locally depend on a terminator");
+
+ for (SmallPtrSet<Instruction*, 4>::iterator I = ReverseDeps.begin(),
+ E = ReverseDeps.end(); I != E; ++I) {
+ Instruction *InstDependingOnRemInst = *I;
+ assert(InstDependingOnRemInst != RemInst &&
+ "Already removed our local dep info");
+
+ LocalDeps[InstDependingOnRemInst] = NewDirtyVal;
+
+      // Make sure to remember that new things depend on NewDirtyVal's
+      // instruction.
+ assert(NewDirtyVal.getInst() && "There is no way something else can have "
+ "a local dep on this if it is a terminator!");
+ ReverseDepsToAdd.push_back(std::make_pair(NewDirtyVal.getInst(),
+ InstDependingOnRemInst));
+ }
+
+ ReverseLocalDeps.erase(ReverseDepIt);
+
+ // Add new reverse deps after scanning the set, to avoid invalidating the
+ // 'ReverseDeps' reference.
+ while (!ReverseDepsToAdd.empty()) {
+ ReverseLocalDeps[ReverseDepsToAdd.back().first]
+ .insert(ReverseDepsToAdd.back().second);
+ ReverseDepsToAdd.pop_back();
+ }
+ }
+
+ ReverseDepIt = ReverseNonLocalDeps.find(RemInst);
+ if (ReverseDepIt != ReverseNonLocalDeps.end()) {
+ SmallPtrSet<Instruction*, 4> &Set = ReverseDepIt->second;
+ for (SmallPtrSet<Instruction*, 4>::iterator I = Set.begin(), E = Set.end();
+ I != E; ++I) {
+ assert(*I != RemInst && "Already removed NonLocalDep info for RemInst");
+
+ PerInstNLInfo &INLD = NonLocalDeps[*I];
+ // The information is now dirty!
+ INLD.second = true;
+
+ for (NonLocalDepInfo::iterator DI = INLD.first.begin(),
+ DE = INLD.first.end(); DI != DE; ++DI) {
+ if (DI->second.getInst() != RemInst) continue;
+
+ // Convert to a dirty entry for the subsequent instruction.
+ DI->second = NewDirtyVal;
+
+ if (Instruction *NextI = NewDirtyVal.getInst())
+ ReverseDepsToAdd.push_back(std::make_pair(NextI, *I));
+ }
+ }
+
+ ReverseNonLocalDeps.erase(ReverseDepIt);
+
+ // Add new reverse deps after scanning the set, to avoid invalidating 'Set'
+ while (!ReverseDepsToAdd.empty()) {
+ ReverseNonLocalDeps[ReverseDepsToAdd.back().first]
+ .insert(ReverseDepsToAdd.back().second);
+ ReverseDepsToAdd.pop_back();
+ }
+ }
+
+ // If the instruction is in ReverseNonLocalPtrDeps then it appears as a
+ // value in the NonLocalPointerDeps info.
+ ReverseNonLocalPtrDepTy::iterator ReversePtrDepIt =
+ ReverseNonLocalPtrDeps.find(RemInst);
+ if (ReversePtrDepIt != ReverseNonLocalPtrDeps.end()) {
+ SmallPtrSet<ValueIsLoadPair, 4> &Set = ReversePtrDepIt->second;
+ SmallVector<std::pair<Instruction*, ValueIsLoadPair>,8> ReversePtrDepsToAdd;
+
+ for (SmallPtrSet<ValueIsLoadPair, 4>::iterator I = Set.begin(),
+ E = Set.end(); I != E; ++I) {
+ ValueIsLoadPair P = *I;
+ assert(P.getPointer() != RemInst &&
+ "Already removed NonLocalPointerDeps info for RemInst");
+
+ NonLocalDepInfo &NLPDI = NonLocalPointerDeps[P].second;
+
+ // The cache is not valid for any specific block anymore.
+ NonLocalPointerDeps[P].first = BBSkipFirstBlockPair();
+
+ // Update any entries for RemInst to use the instruction after it.
+ for (NonLocalDepInfo::iterator DI = NLPDI.begin(), DE = NLPDI.end();
+ DI != DE; ++DI) {
+ if (DI->second.getInst() != RemInst) continue;
+
+ // Convert to a dirty entry for the subsequent instruction.
+ DI->second = NewDirtyVal;
+
+ if (Instruction *NewDirtyInst = NewDirtyVal.getInst())
+ ReversePtrDepsToAdd.push_back(std::make_pair(NewDirtyInst, P));
+ }
+
+ // Re-sort the NonLocalDepInfo. Changing the dirty entry to its
+ // subsequent value may invalidate the sortedness.
+ std::sort(NLPDI.begin(), NLPDI.end());
+ }
+
+ ReverseNonLocalPtrDeps.erase(ReversePtrDepIt);
+
+ while (!ReversePtrDepsToAdd.empty()) {
+ ReverseNonLocalPtrDeps[ReversePtrDepsToAdd.back().first]
+ .insert(ReversePtrDepsToAdd.back().second);
+ ReversePtrDepsToAdd.pop_back();
+ }
+ }
+
+
+ assert(!NonLocalDeps.count(RemInst) && "RemInst got reinserted?");
+ AA->deleteValue(RemInst);
+ DEBUG(verifyRemoved(RemInst));
+}
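+// Typical client pattern (a sketch, not mandated by the interface): notify
+// memdep before erasing a dead instruction so the reverse maps can be
+// patched up while the instruction is still valid, e.g.:
+//   MD.removeInstruction(DeadInst);
+//   DeadInst->eraseFromParent();
+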
+/// verifyRemoved - Verify that the specified instruction does not occur
+/// in our internal data structures.
+void MemoryDependenceAnalysis::verifyRemoved(Instruction *D) const {
+ for (LocalDepMapType::const_iterator I = LocalDeps.begin(),
+ E = LocalDeps.end(); I != E; ++I) {
+ assert(I->first != D && "Inst occurs in data structures");
+ assert(I->second.getInst() != D &&
+ "Inst occurs in data structures");
+ }
+
+ for (CachedNonLocalPointerInfo::const_iterator I =NonLocalPointerDeps.begin(),
+ E = NonLocalPointerDeps.end(); I != E; ++I) {
+ assert(I->first.getPointer() != D && "Inst occurs in NLPD map key");
+ const NonLocalDepInfo &Val = I->second.second;
+ for (NonLocalDepInfo::const_iterator II = Val.begin(), E = Val.end();
+ II != E; ++II)
+ assert(II->second.getInst() != D && "Inst occurs as NLPD value");
+ }
+
+ for (NonLocalDepMapType::const_iterator I = NonLocalDeps.begin(),
+ E = NonLocalDeps.end(); I != E; ++I) {
+ assert(I->first != D && "Inst occurs in data structures");
+ const PerInstNLInfo &INLD = I->second;
+ for (NonLocalDepInfo::const_iterator II = INLD.first.begin(),
+ EE = INLD.first.end(); II != EE; ++II)
+ assert(II->second.getInst() != D && "Inst occurs in data structures");
+ }
+
+ for (ReverseDepMapType::const_iterator I = ReverseLocalDeps.begin(),
+ E = ReverseLocalDeps.end(); I != E; ++I) {
+ assert(I->first != D && "Inst occurs in data structures");
+ for (SmallPtrSet<Instruction*, 4>::const_iterator II = I->second.begin(),
+ EE = I->second.end(); II != EE; ++II)
+ assert(*II != D && "Inst occurs in data structures");
+ }
+
+ for (ReverseDepMapType::const_iterator I = ReverseNonLocalDeps.begin(),
+ E = ReverseNonLocalDeps.end();
+ I != E; ++I) {
+ assert(I->first != D && "Inst occurs in data structures");
+ for (SmallPtrSet<Instruction*, 4>::const_iterator II = I->second.begin(),
+ EE = I->second.end(); II != EE; ++II)
+ assert(*II != D && "Inst occurs in data structures");
+ }
+
+ for (ReverseNonLocalPtrDepTy::const_iterator
+ I = ReverseNonLocalPtrDeps.begin(),
+ E = ReverseNonLocalPtrDeps.end(); I != E; ++I) {
+ assert(I->first != D && "Inst occurs in rev NLPD map");
+
+ for (SmallPtrSet<ValueIsLoadPair, 4>::const_iterator II = I->second.begin(),
+ E = I->second.end(); II != E; ++II)
+ assert(*II != ValueIsLoadPair(D, false) &&
+ *II != ValueIsLoadPair(D, true) &&
+ "Inst occurs in ReverseNonLocalPtrDeps map");
+ }
+
+}
diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp
new file mode 100644
index 0000000..4853c2a
--- /dev/null
+++ b/lib/Analysis/PostDominators.cpp
@@ -0,0 +1,94 @@
+//===- PostDominators.cpp - Post-Dominator Calculation --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the post-dominator construction algorithms.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "postdomtree"
+
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/Analysis/DominatorInternals.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// PostDominatorTree Implementation
+//===----------------------------------------------------------------------===//
+
+char PostDominatorTree::ID = 0;
+char PostDominanceFrontier::ID = 0;
+static RegisterPass<PostDominatorTree>
+F("postdomtree", "Post-Dominator Tree Construction", true, true);
+
+bool PostDominatorTree::runOnFunction(Function &F) {
+ DT->recalculate(F);
+ DEBUG(DT->dump());
+ return false;
+}
+
+PostDominatorTree::~PostDominatorTree()
+{
+ delete DT;
+}
+
+FunctionPass* llvm::createPostDomTree() {
+ return new PostDominatorTree();
+}
+
+//===----------------------------------------------------------------------===//
+// PostDominanceFrontier Implementation
+//===----------------------------------------------------------------------===//
+
+static RegisterPass<PostDominanceFrontier>
+H("postdomfrontier", "Post-Dominance Frontier Construction", true, true);
+
+const DominanceFrontier::DomSetType &
+PostDominanceFrontier::calculate(const PostDominatorTree &DT,
+ const DomTreeNode *Node) {
+ // Loop over CFG successors to calculate DFlocal[Node]
+ BasicBlock *BB = Node->getBlock();
+ DomSetType &S = Frontiers[BB]; // The new set to fill in...
+ if (getRoots().empty()) return S;
+
+ if (BB)
+ for (pred_iterator SI = pred_begin(BB), SE = pred_end(BB);
+ SI != SE; ++SI) {
+ // Does Node immediately dominate this predecessor?
+ DomTreeNode *SINode = DT[*SI];
+ if (SINode && SINode->getIDom() != Node)
+ S.insert(*SI);
+ }
+
+ // At this point, S is DFlocal. Now we union in DFup's of our children...
+  // Loop through and visit the nodes that Node immediately post-dominates
+  // (Node's children in the post-dominator tree).
+ //
+ for (DomTreeNode::const_iterator
+ NI = Node->begin(), NE = Node->end(); NI != NE; ++NI) {
+ DomTreeNode *IDominee = *NI;
+ const DomSetType &ChildDF = calculate(DT, IDominee);
+
+ DomSetType::const_iterator CDFI = ChildDF.begin(), CDFE = ChildDF.end();
+ for (; CDFI != CDFE; ++CDFI) {
+ if (!DT.properlyDominates(Node, DT[*CDFI]))
+ S.insert(*CDFI);
+ }
+ }
+
+ return S;
+}
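+// For reference, the recurrence implemented above is the standard frontier
+// formula, with X ranging over Node's children in the post-dominator tree:
+//   DF(Node) = DFlocal(Node) U { B in DF(X) : Node does not properly
+//                                post-dominate B }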
+
+FunctionPass* llvm::createPostDomFrontier() {
+ return new PostDominanceFrontier();
+}
diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp
new file mode 100644
index 0000000..a0965b6
--- /dev/null
+++ b/lib/Analysis/ProfileInfo.cpp
@@ -0,0 +1,100 @@
+//===- ProfileInfo.cpp - Profile Info Interface ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the abstract ProfileInfo interface, and the default
+// "no profile" implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+using namespace llvm;
+
+// Register the ProfileInfo interface, providing a nice name to refer to.
+static RegisterAnalysisGroup<ProfileInfo> Z("Profile Information");
+char ProfileInfo::ID = 0;
+
+ProfileInfo::~ProfileInfo() {}
+
+unsigned ProfileInfo::getExecutionCount(BasicBlock *BB) const {
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+ // Are there zero predecessors of this block?
+ if (PI == PE) {
+ // If this is the entry block, look for the Null -> Entry edge.
+ if (BB == &BB->getParent()->getEntryBlock())
+ return getEdgeWeight(0, BB);
+ else
+ return 0; // Otherwise, this is a dead block.
+ }
+
+ // Otherwise, if there are predecessors, the execution count of this block is
+  // the sum of the edge frequencies from the incoming edges.  Note that if
+  // there are multiple edges from a predecessor to this block, we don't want
+  // to count its weight multiple times.  For this reason, we keep track of
+ // the predecessors we've seen and only count them if we haven't run into them
+ // yet.
+ //
+ // We don't want to create an std::set unless we are dealing with a block that
+ // has a LARGE number of in-edges. Handle the common case of having only a
+ // few in-edges with special code.
+ //
+ BasicBlock *FirstPred = *PI;
+ unsigned Count = getEdgeWeight(FirstPred, BB);
+ ++PI;
+ if (PI == PE) return Count; // Quick exit for single predecessor blocks
+
+ BasicBlock *SecondPred = *PI;
+ if (SecondPred != FirstPred) Count += getEdgeWeight(SecondPred, BB);
+ ++PI;
+ if (PI == PE) return Count; // Quick exit for two predecessor blocks
+
+ BasicBlock *ThirdPred = *PI;
+ if (ThirdPred != FirstPred && ThirdPred != SecondPred)
+ Count += getEdgeWeight(ThirdPred, BB);
+ ++PI;
+ if (PI == PE) return Count; // Quick exit for three predecessor blocks
+
+ std::set<BasicBlock*> ProcessedPreds;
+ ProcessedPreds.insert(FirstPred);
+ ProcessedPreds.insert(SecondPred);
+ ProcessedPreds.insert(ThirdPred);
+ for (; PI != PE; ++PI)
+ if (ProcessedPreds.insert(*PI).second)
+ Count += getEdgeWeight(*PI, BB);
+ return Count;
+}
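+// Worked example (illustrative): if the predecessor list visits P1 twice (two
+// distinct edges) and P2 once, with getEdgeWeight(P1, BB) == 12 and
+// getEdgeWeight(P2, BB) == 3, the count returned is 12 + 3 == 15; P1's weight
+// is deliberately added only once.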
+
+
+
+//===----------------------------------------------------------------------===//
+// NoProfile ProfileInfo implementation
+//
+
+namespace {
+ struct VISIBILITY_HIDDEN NoProfileInfo
+ : public ImmutablePass, public ProfileInfo {
+ static char ID; // Class identification, replacement for typeinfo
+ NoProfileInfo() : ImmutablePass(&ID) {}
+ };
+} // End of anonymous namespace
+
+char NoProfileInfo::ID = 0;
+// Register this pass...
+static RegisterPass<NoProfileInfo>
+X("no-profile", "No Profile Information", false, true);
+
+// Declare that we implement the ProfileInfo interface
+static RegisterAnalysisGroup<ProfileInfo, true> Y(X);
+
+ImmutablePass *llvm::createNoProfileInfoPass() { return new NoProfileInfo(); }
diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp
new file mode 100644
index 0000000..3a0a740
--- /dev/null
+++ b/lib/Analysis/ProfileInfoLoader.cpp
@@ -0,0 +1,277 @@
+//===- ProfileInfoLoader.cpp - Load profile information from disk ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ProfileInfoLoader class is used to load and represent profiling
+// information read in from the dump file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ProfileInfoLoader.h"
+#include "llvm/Analysis/ProfileInfoTypes.h"
+#include "llvm/Module.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Support/Streams.h"
+#include <cstdio>
+#include <cstdlib>
+#include <map>
+using namespace llvm;
+
+// ByteSwap - Byteswap 'Var' if 'Really' is true.
+//
+static inline unsigned ByteSwap(unsigned Var, bool Really) {
+ if (!Really) return Var;
+ return ((Var & (255<< 0)) << 24) |
+ ((Var & (255<< 8)) << 8) |
+ ((Var & (255<<16)) >> 8) |
+ ((Var & (255<<24)) >> 24);
+}
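+// For example, ByteSwap(0x00000001, true) yields 0x01000000: each of the four
+// bytes is masked out and shifted to the mirrored position.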
+
+static void ReadProfilingBlock(const char *ToolName, FILE *F,
+ bool ShouldByteSwap,
+ std::vector<unsigned> &Data) {
+ // Read the number of entries...
+ unsigned NumEntries;
+ if (fread(&NumEntries, sizeof(unsigned), 1, F) != 1) {
+ cerr << ToolName << ": data packet truncated!\n";
+ perror(0);
+ exit(1);
+ }
+ NumEntries = ByteSwap(NumEntries, ShouldByteSwap);
+
+ // Read the counts...
+ std::vector<unsigned> TempSpace(NumEntries);
+
+ // Read in the block of data...
+ if (fread(&TempSpace[0], sizeof(unsigned)*NumEntries, 1, F) != 1) {
+ cerr << ToolName << ": data packet truncated!\n";
+ perror(0);
+ exit(1);
+ }
+
+ // Make sure we have enough space...
+ if (Data.size() < NumEntries)
+ Data.resize(NumEntries);
+
+ // Accumulate the data we just read into the data.
+ if (!ShouldByteSwap) {
+ for (unsigned i = 0; i != NumEntries; ++i)
+ Data[i] += TempSpace[i];
+ } else {
+ for (unsigned i = 0; i != NumEntries; ++i)
+ Data[i] += ByteSwap(TempSpace[i], true);
+ }
+}
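+// On-disk layout handled above (restated): a profiling block is a 32-bit
+// entry count followed by NumEntries 32-bit counters; repeated blocks of the
+// same kind accumulate element-wise into Data.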
+
+// ProfileInfoLoader ctor - Read the specified profiling data file, exiting the
+// program if the file is invalid or broken.
+//
+ProfileInfoLoader::ProfileInfoLoader(const char *ToolName,
+ const std::string &Filename,
+ Module &TheModule) : M(TheModule) {
+ FILE *F = fopen(Filename.c_str(), "r");
+ if (F == 0) {
+ cerr << ToolName << ": Error opening '" << Filename << "': ";
+ perror(0);
+ exit(1);
+ }
+
+ // Keep reading packets until we run out of them.
+ unsigned PacketType;
+ while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
+ // If the low eight bits of the packet are zero, we must be dealing with an
+ // endianness mismatch. Byteswap all words read from the profiling
+ // information.
+ bool ShouldByteSwap = (char)PacketType == 0;
+ PacketType = ByteSwap(PacketType, ShouldByteSwap);
+
+ switch (PacketType) {
+ case ArgumentInfo: {
+ unsigned ArgLength;
+ if (fread(&ArgLength, sizeof(unsigned), 1, F) != 1) {
+ cerr << ToolName << ": arguments packet truncated!\n";
+ perror(0);
+ exit(1);
+ }
+ ArgLength = ByteSwap(ArgLength, ShouldByteSwap);
+
+ // Read in the arguments...
+ std::vector<char> Chars(ArgLength+4);
+
+ if (ArgLength)
+ if (fread(&Chars[0], (ArgLength+3) & ~3, 1, F) != 1) {
+ cerr << ToolName << ": arguments packet truncated!\n";
+ perror(0);
+ exit(1);
+ }
+ CommandLines.push_back(std::string(&Chars[0], &Chars[ArgLength]));
+ break;
+ }
+
+ case FunctionInfo:
+ ReadProfilingBlock(ToolName, F, ShouldByteSwap, FunctionCounts);
+ break;
+
+ case BlockInfo:
+ ReadProfilingBlock(ToolName, F, ShouldByteSwap, BlockCounts);
+ break;
+
+ case EdgeInfo:
+ ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
+ break;
+
+ case BBTraceInfo:
+ ReadProfilingBlock(ToolName, F, ShouldByteSwap, BBTrace);
+ break;
+
+ default:
+ cerr << ToolName << ": Unknown packet type #" << PacketType << "!\n";
+ exit(1);
+ }
+ }
+
+ fclose(F);
+}
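+
+// Minimal usage sketch (hypothetical consumer; "mytool" and the variable
+// names are illustrative):
+//
+//   Module &M = ...;
+//   ProfileInfoLoader PIL("mytool", "llvmprof.out", M); // exits on bad file
+//   std::vector<std::pair<Function*, unsigned> > Counts;
+//   PIL.getFunctionCounts(Counts);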
+
+
+// getFunctionCounts - This method is used by consumers of function counting
+// information. If we do not directly have function count information, we
+// compute it from other, more refined, types of profile information.
+//
+void ProfileInfoLoader::getFunctionCounts(std::vector<std::pair<Function*,
+ unsigned> > &Counts) {
+ if (FunctionCounts.empty()) {
+ if (hasAccurateBlockCounts()) {
+ // Synthesize function frequency information from the number of times
+ // their entry blocks were executed.
+ std::vector<std::pair<BasicBlock*, unsigned> > BlockCounts;
+ getBlockCounts(BlockCounts);
+
+ for (unsigned i = 0, e = BlockCounts.size(); i != e; ++i)
+ if (&BlockCounts[i].first->getParent()->getEntryBlock() ==
+ BlockCounts[i].first)
+ Counts.push_back(std::make_pair(BlockCounts[i].first->getParent(),
+ BlockCounts[i].second));
+ } else {
+ cerr << "Function counts are not available!\n";
+ }
+ return;
+ }
+
+ unsigned Counter = 0;
+ for (Module::iterator I = M.begin(), E = M.end();
+ I != E && Counter != FunctionCounts.size(); ++I)
+ if (!I->isDeclaration())
+ Counts.push_back(std::make_pair(I, FunctionCounts[Counter++]));
+}
+
+// getBlockCounts - This method is used by consumers of block counting
+// information. If we do not directly have block count information, we
+// compute it from other, more refined, types of profile information.
+//
+void ProfileInfoLoader::getBlockCounts(std::vector<std::pair<BasicBlock*,
+ unsigned> > &Counts) {
+ if (BlockCounts.empty()) {
+ if (hasAccurateEdgeCounts()) {
+ // Synthesize block count information from edge frequency information.
+ // The block execution frequency is equal to the sum of the execution
+ // frequency of all outgoing edges from a block.
+ //
+ // If a block has no successors, this will not be correct, so we have to
+ // special case it. :(
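+      //
+      // For example (illustrative numbers): if block B has outgoing edges
+      // (B,0) executed 3 times and (B,1) executed 4 times, B's count is 7;
+      // a successor with no outgoing edges of its own instead receives the
+      // sum of its incoming edge counts, accumulated via InEdgeFreqs below.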
+ std::vector<std::pair<Edge, unsigned> > EdgeCounts;
+ getEdgeCounts(EdgeCounts);
+
+ std::map<BasicBlock*, unsigned> InEdgeFreqs;
+
+ BasicBlock *LastBlock = 0;
+ TerminatorInst *TI = 0;
+ for (unsigned i = 0, e = EdgeCounts.size(); i != e; ++i) {
+ if (EdgeCounts[i].first.first != LastBlock) {
+ LastBlock = EdgeCounts[i].first.first;
+ TI = LastBlock->getTerminator();
+ Counts.push_back(std::make_pair(LastBlock, 0));
+ }
+ Counts.back().second += EdgeCounts[i].second;
+ unsigned SuccNum = EdgeCounts[i].first.second;
+ if (SuccNum >= TI->getNumSuccessors()) {
+ static bool Warned = false;
+ if (!Warned) {
+ cerr << "WARNING: profile info doesn't seem to match"
+ << " the program!\n";
+ Warned = true;
+ }
+ } else {
+ // If this successor has no successors of its own, we will never
+ // compute an execution count for that block. Remember the incoming
+ // edge frequencies to add later.
+ BasicBlock *Succ = TI->getSuccessor(SuccNum);
+ if (Succ->getTerminator()->getNumSuccessors() == 0)
+ InEdgeFreqs[Succ] += EdgeCounts[i].second;
+ }
+ }
+
+ // Now we have to accumulate information for those blocks without
+ // successors into our table.
+ for (std::map<BasicBlock*, unsigned>::iterator I = InEdgeFreqs.begin(),
+ E = InEdgeFreqs.end(); I != E; ++I) {
+ unsigned i = 0;
+ for (; i != Counts.size() && Counts[i].first != I->first; ++i)
+ /*empty*/;
+ if (i == Counts.size()) Counts.push_back(std::make_pair(I->first, 0));
+ Counts[i].second += I->second;
+ }
+
+ } else {
+ cerr << "Block counts are not available!\n";
+ }
+ return;
+ }
+
+ unsigned Counter = 0;
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ Counts.push_back(std::make_pair(BB, BlockCounts[Counter++]));
+ if (Counter == BlockCounts.size())
+ return;
+ }
+}
+
+// getEdgeCounts - This method is used by consumers of edge counting
+// information. If we do not directly have edge count information, we compute
+// it from other, more refined, types of profile information.
+//
+void ProfileInfoLoader::getEdgeCounts(std::vector<std::pair<Edge,
+ unsigned> > &Counts) {
+ if (EdgeCounts.empty()) {
+ cerr << "Edge counts not available, and no synthesis "
+ << "is implemented yet!\n";
+ return;
+ }
+
+ unsigned Counter = 0;
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ for (unsigned i = 0, e = BB->getTerminator()->getNumSuccessors();
+ i != e; ++i) {
+ Counts.push_back(std::make_pair(Edge(BB, i), EdgeCounts[Counter++]));
+ if (Counter == EdgeCounts.size())
+ return;
+ }
+}
+
+// getBBTrace - This method is used by consumers of basic-block trace
+// information.
+//
+void ProfileInfoLoader::getBBTrace(std::vector<BasicBlock *> &Trace) {
+ if (BBTrace.empty ()) {
+ cerr << "Basic block trace is not available!\n";
+ return;
+ }
+ cerr << "Basic block trace loading is not implemented yet!\n";
+}
diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp
new file mode 100644
index 0000000..0a8a87b
--- /dev/null
+++ b/lib/Analysis/ProfileInfoLoaderPass.cpp
@@ -0,0 +1,92 @@
+//===- ProfileInfoLoaderPass.cpp - LLVM Pass to load profile info ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a concrete implementation of profiling information that
+// loads the information from a profile dump file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BasicBlock.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Analysis/ProfileInfoLoader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+
+static cl::opt<std::string>
+ProfileInfoFilename("profile-info-file", cl::init("llvmprof.out"),
+ cl::value_desc("filename"),
+ cl::desc("Profile file loaded by -profile-loader"));
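+
+// Typical invocation (a sketch; driver flags other than the two registered
+// in this file may vary):
+//
+//   opt -profile-loader -profile-info-file=llvmprof.out ...
+//
+// This loads the named profile and exposes it through the ProfileInfo
+// analysis group registered below.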
+
+namespace {
+ class VISIBILITY_HIDDEN LoaderPass : public ModulePass, public ProfileInfo {
+ std::string Filename;
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ explicit LoaderPass(const std::string &filename = "")
+ : ModulePass(&ID), Filename(filename) {
+ if (filename.empty()) Filename = ProfileInfoFilename;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+
+ virtual const char *getPassName() const {
+ return "Profiling information loader";
+ }
+
+ /// run - Load the profile information from the specified file.
+ virtual bool runOnModule(Module &M);
+ };
+} // End of anonymous namespace
+
+char LoaderPass::ID = 0;
+static RegisterPass<LoaderPass>
+X("profile-loader", "Load profile information from llvmprof.out", false, true);
+
+static RegisterAnalysisGroup<ProfileInfo> Y(X);
+
+ModulePass *llvm::createProfileLoaderPass() { return new LoaderPass(); }
+
+/// createProfileLoaderPass - This function returns a Pass that loads the
+/// profiling information for the module from the specified filename, making it
+/// available to the optimizers.
+Pass *llvm::createProfileLoaderPass(const std::string &Filename) {
+ return new LoaderPass(Filename);
+}
+
+bool LoaderPass::runOnModule(Module &M) {
+ ProfileInfoLoader PIL("profile-loader", Filename, M);
+ EdgeCounts.clear();
+ bool PrintedWarning = false;
+
+ std::vector<std::pair<ProfileInfoLoader::Edge, unsigned> > ECs;
+ PIL.getEdgeCounts(ECs);
+ for (unsigned i = 0, e = ECs.size(); i != e; ++i) {
+ BasicBlock *BB = ECs[i].first.first;
+ unsigned SuccNum = ECs[i].first.second;
+ TerminatorInst *TI = BB->getTerminator();
+ if (SuccNum >= TI->getNumSuccessors()) {
+ if (!PrintedWarning) {
+ cerr << "WARNING: profile information is inconsistent with "
+ << "the current program!\n";
+ PrintedWarning = true;
+ }
+ } else {
+ EdgeCounts[std::make_pair(BB, TI->getSuccessor(SuccNum))]+= ECs[i].second;
+ }
+ }
+
+ return false;
+}
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
new file mode 100644
index 0000000..f7f1849
--- /dev/null
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -0,0 +1,3824 @@
+//===- ScalarEvolution.cpp - Scalar Evolution Analysis ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the scalar evolution analysis
+// engine, which is used primarily to analyze expressions involving induction
+// variables in loops.
+//
+// There are several aspects to this library. First is the representation of
+// scalar expressions, which are represented as subclasses of the SCEV class.
+// These classes are used to represent certain types of subexpressions that we
+// can handle. These classes are reference counted, managed by the SCEVHandle
+// class. We only create one SCEV of a particular shape, so pointer-comparisons
+// for equality are legal.
+//
+// One important aspect of the SCEV objects is that they are never cyclic, even
+// if there is a cycle in the dataflow for an expression (i.e., a PHI node). If
+// the PHI node is one of the idioms that we can represent (e.g., a polynomial
+// recurrence) then we represent it directly as a recurrence node, otherwise we
+// represent it as a SCEVUnknown node.
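+//
+// For example (a sketch): the canonical induction variable of
+//   for (i = 0; i != n; ++i)
+// is represented as the affine recurrence {0,+,1}<loop> (start 0, step 1 per
+// iteration), while a value the analysis cannot decompose is wrapped in a
+// SCEVUnknown.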
+//
+// In addition to being able to represent expressions of various types, we also
+// have folders that are used to build the *canonical* representation for a
+// particular expression. These folders are capable of using a variety of
+// rewrite rules to simplify the expressions.
+//
+// Once the folders are defined, we can implement the more interesting
+// higher-level code, such as the code that recognizes PHI nodes of various
+// types, computes the execution count of a loop, etc.
+//
+// TODO: We should use these routines and value representations to implement
+// dependence analysis!
+//
+//===----------------------------------------------------------------------===//
+//
+// There are several good references for the techniques used in this analysis.
+//
+// Chains of recurrences -- a method to expedite the evaluation
+// of closed-form functions
+// Olaf Bachmann, Paul S. Wang, Eugene V. Zima
+//
+// On computational properties of chains of recurrences
+// Eugene V. Zima
+//
+// Symbolic Evaluation of Chains of Recurrences for Loop Optimization
+// Robert A. van Engelen
+//
+// Efficient Symbolic Analysis for Optimizing Compilers
+// Robert A. van Engelen
+//
+// Using the chains of recurrences algebra for data dependence testing and
+// induction variable substitution
+// MS Thesis, Johnie Birch
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scalar-evolution"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <ostream>
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumArrayLenItCounts,
+ "Number of trip counts computed with array length");
+STATISTIC(NumTripCountsComputed,
+ "Number of loops with predictable loop counts");
+STATISTIC(NumTripCountsNotComputed,
+ "Number of loops without predictable loop counts");
+STATISTIC(NumBruteForceTripCountsComputed,
+ "Number of loops with trip counts computed by force");
+
+static cl::opt<unsigned>
+MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden,
+ cl::desc("Maximum number of iterations SCEV will "
+ "symbolically execute a constant derived loop"),
+ cl::init(100));
+
+static RegisterPass<ScalarEvolution>
+R("scalar-evolution", "Scalar Evolution Analysis", false, true);
+char ScalarEvolution::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// SCEV class definitions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Implementation of the SCEV class.
+//
+SCEV::~SCEV() {}
+void SCEV::dump() const {
+ print(errs());
+ errs() << '\n';
+}
+
+void SCEV::print(std::ostream &o) const {
+ raw_os_ostream OS(o);
+ print(OS);
+}
+
+bool SCEV::isZero() const {
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
+ return SC->getValue()->isZero();
+ return false;
+}
+
+bool SCEV::isOne() const {
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
+ return SC->getValue()->isOne();
+ return false;
+}
+
+SCEVCouldNotCompute::SCEVCouldNotCompute() : SCEV(scCouldNotCompute) {}
+SCEVCouldNotCompute::~SCEVCouldNotCompute() {}
+
+bool SCEVCouldNotCompute::isLoopInvariant(const Loop *L) const {
+ assert(0 && "Attempt to use a SCEVCouldNotCompute object!");
+ return false;
+}
+
+const Type *SCEVCouldNotCompute::getType() const {
+ assert(0 && "Attempt to use a SCEVCouldNotCompute object!");
+ return 0;
+}
+
+bool SCEVCouldNotCompute::hasComputableLoopEvolution(const Loop *L) const {
+ assert(0 && "Attempt to use a SCEVCouldNotCompute object!");
+ return false;
+}
+
+SCEVHandle SCEVCouldNotCompute::
+replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
+ const SCEVHandle &Conc,
+ ScalarEvolution &SE) const {
+ return this;
+}
+
+void SCEVCouldNotCompute::print(raw_ostream &OS) const {
+ OS << "***COULDNOTCOMPUTE***";
+}
+
+bool SCEVCouldNotCompute::classof(const SCEV *S) {
+ return S->getSCEVType() == scCouldNotCompute;
+}
+
+
+// SCEVConstants - Only allow the creation of one SCEVConstant for any
+// particular value. Don't use a SCEVHandle here, or else the object will
+// never be deleted!
+static ManagedStatic<std::map<ConstantInt*, SCEVConstant*> > SCEVConstants;
+
+
+SCEVConstant::~SCEVConstant() {
+ SCEVConstants->erase(V);
+}
+
+SCEVHandle ScalarEvolution::getConstant(ConstantInt *V) {
+ SCEVConstant *&R = (*SCEVConstants)[V];
+ if (R == 0) R = new SCEVConstant(V);
+ return R;
+}
+
+SCEVHandle ScalarEvolution::getConstant(const APInt& Val) {
+ return getConstant(ConstantInt::get(Val));
+}
+
+const Type *SCEVConstant::getType() const { return V->getType(); }
+
+void SCEVConstant::print(raw_ostream &OS) const {
+ WriteAsOperand(OS, V, false);
+}
+
+SCEVCastExpr::SCEVCastExpr(unsigned SCEVTy,
+ const SCEVHandle &op, const Type *ty)
+ : SCEV(SCEVTy), Op(op), Ty(ty) {}
+
+SCEVCastExpr::~SCEVCastExpr() {}
+
+bool SCEVCastExpr::dominates(BasicBlock *BB, DominatorTree *DT) const {
+ return Op->dominates(BB, DT);
+}
+
+// SCEVTruncates - Only allow the creation of one SCEVTruncateExpr for any
+// particular input. Don't use a SCEVHandle here, or else the object will
+// never be deleted!
+static ManagedStatic<std::map<std::pair<const SCEV*, const Type*>,
+ SCEVTruncateExpr*> > SCEVTruncates;
+
+SCEVTruncateExpr::SCEVTruncateExpr(const SCEVHandle &op, const Type *ty)
+ : SCEVCastExpr(scTruncate, op, ty) {
+ assert((Op->getType()->isInteger() || isa<PointerType>(Op->getType())) &&
+ (Ty->isInteger() || isa<PointerType>(Ty)) &&
+ "Cannot truncate non-integer value!");
+}
+
+SCEVTruncateExpr::~SCEVTruncateExpr() {
+ SCEVTruncates->erase(std::make_pair(Op, Ty));
+}
+
+void SCEVTruncateExpr::print(raw_ostream &OS) const {
+ OS << "(trunc " << *Op->getType() << " " << *Op << " to " << *Ty << ")";
+}
+
+// SCEVZeroExtends - Only allow the creation of one SCEVZeroExtendExpr for any
+// particular input. Don't use a SCEVHandle here, or else the object will never
+// be deleted!
+static ManagedStatic<std::map<std::pair<const SCEV*, const Type*>,
+ SCEVZeroExtendExpr*> > SCEVZeroExtends;
+
+SCEVZeroExtendExpr::SCEVZeroExtendExpr(const SCEVHandle &op, const Type *ty)
+ : SCEVCastExpr(scZeroExtend, op, ty) {
+ assert((Op->getType()->isInteger() || isa<PointerType>(Op->getType())) &&
+ (Ty->isInteger() || isa<PointerType>(Ty)) &&
+ "Cannot zero extend non-integer value!");
+}
+
+SCEVZeroExtendExpr::~SCEVZeroExtendExpr() {
+ SCEVZeroExtends->erase(std::make_pair(Op, Ty));
+}
+
+void SCEVZeroExtendExpr::print(raw_ostream &OS) const {
+ OS << "(zext " << *Op->getType() << " " << *Op << " to " << *Ty << ")";
+}
+
+// SCEVSignExtends - Only allow the creation of one SCEVSignExtendExpr for any
+// particular input. Don't use a SCEVHandle here, or else the object will never
+// be deleted!
+static ManagedStatic<std::map<std::pair<const SCEV*, const Type*>,
+ SCEVSignExtendExpr*> > SCEVSignExtends;
+
+SCEVSignExtendExpr::SCEVSignExtendExpr(const SCEVHandle &op, const Type *ty)
+ : SCEVCastExpr(scSignExtend, op, ty) {
+ assert((Op->getType()->isInteger() || isa<PointerType>(Op->getType())) &&
+ (Ty->isInteger() || isa<PointerType>(Ty)) &&
+ "Cannot sign extend non-integer value!");
+}
+
+SCEVSignExtendExpr::~SCEVSignExtendExpr() {
+ SCEVSignExtends->erase(std::make_pair(Op, Ty));
+}
+
+void SCEVSignExtendExpr::print(raw_ostream &OS) const {
+ OS << "(sext " << *Op->getType() << " " << *Op << " to " << *Ty << ")";
+}
+
+// SCEVCommExprs - Only allow the creation of one SCEVCommutativeExpr for any
+// particular input. Don't use a SCEVHandle here, or else the object will never
+// be deleted!
+static ManagedStatic<std::map<std::pair<unsigned, std::vector<const SCEV*> >,
+ SCEVCommutativeExpr*> > SCEVCommExprs;
+
+SCEVCommutativeExpr::~SCEVCommutativeExpr() {
+ std::vector<const SCEV*> SCEVOps(Operands.begin(), Operands.end());
+ SCEVCommExprs->erase(std::make_pair(getSCEVType(), SCEVOps));
+}
+
+void SCEVCommutativeExpr::print(raw_ostream &OS) const {
+ assert(Operands.size() > 1 && "This plus expr shouldn't exist!");
+ const char *OpStr = getOperationStr();
+ OS << "(" << *Operands[0];
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i)
+ OS << OpStr << *Operands[i];
+ OS << ")";
+}
+
+SCEVHandle SCEVCommutativeExpr::
+replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
+ const SCEVHandle &Conc,
+ ScalarEvolution &SE) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ SCEVHandle H =
+ getOperand(i)->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
+ if (H != getOperand(i)) {
+ std::vector<SCEVHandle> NewOps;
+ NewOps.reserve(getNumOperands());
+ for (unsigned j = 0; j != i; ++j)
+ NewOps.push_back(getOperand(j));
+ NewOps.push_back(H);
+ for (++i; i != e; ++i)
+ NewOps.push_back(getOperand(i)->
+ replaceSymbolicValuesWithConcrete(Sym, Conc, SE));
+
+ if (isa<SCEVAddExpr>(this))
+ return SE.getAddExpr(NewOps);
+ else if (isa<SCEVMulExpr>(this))
+ return SE.getMulExpr(NewOps);
+ else if (isa<SCEVSMaxExpr>(this))
+ return SE.getSMaxExpr(NewOps);
+ else if (isa<SCEVUMaxExpr>(this))
+ return SE.getUMaxExpr(NewOps);
+ else
+ assert(0 && "Unknown commutative expr!");
+ }
+ }
+ return this;
+}
+
+bool SCEVNAryExpr::dominates(BasicBlock *BB, DominatorTree *DT) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ if (!getOperand(i)->dominates(BB, DT))
+ return false;
+ }
+ return true;
+}
+
+
+// SCEVUDivs - Only allow the creation of one SCEVUDivExpr for any particular
+// input. Don't use a SCEVHandle here, or else the object will never be
+// deleted!
+static ManagedStatic<std::map<std::pair<const SCEV*, const SCEV*>,
+ SCEVUDivExpr*> > SCEVUDivs;
+
+SCEVUDivExpr::~SCEVUDivExpr() {
+ SCEVUDivs->erase(std::make_pair(LHS, RHS));
+}
+
+bool SCEVUDivExpr::dominates(BasicBlock *BB, DominatorTree *DT) const {
+ return LHS->dominates(BB, DT) && RHS->dominates(BB, DT);
+}
+
+void SCEVUDivExpr::print(raw_ostream &OS) const {
+ OS << "(" << *LHS << " /u " << *RHS << ")";
+}
+
+const Type *SCEVUDivExpr::getType() const {
+ // In most cases the types of LHS and RHS will be the same, but in some
+ // crazy cases one or the other may be a pointer. ScalarEvolution doesn't
+ // depend on the type for correctness, but handling types carefully can
+ // avoid extra casts in the SCEVExpander. The LHS is more likely to be
+ // a pointer type than the RHS, so use the RHS' type here.
+ return RHS->getType();
+}
+
+// SCEVAddRecExprs - Only allow the creation of one SCEVAddRecExpr for any
+// particular input. Don't use a SCEVHandle here, or else the object will never
+// be deleted!
+static ManagedStatic<std::map<std::pair<const Loop *,
+ std::vector<const SCEV*> >,
+ SCEVAddRecExpr*> > SCEVAddRecExprs;
+
+SCEVAddRecExpr::~SCEVAddRecExpr() {
+ std::vector<const SCEV*> SCEVOps(Operands.begin(), Operands.end());
+ SCEVAddRecExprs->erase(std::make_pair(L, SCEVOps));
+}
+
+SCEVHandle SCEVAddRecExpr::
+replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
+ const SCEVHandle &Conc,
+ ScalarEvolution &SE) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ SCEVHandle H =
+ getOperand(i)->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
+ if (H != getOperand(i)) {
+ std::vector<SCEVHandle> NewOps;
+ NewOps.reserve(getNumOperands());
+ for (unsigned j = 0; j != i; ++j)
+ NewOps.push_back(getOperand(j));
+ NewOps.push_back(H);
+ for (++i; i != e; ++i)
+ NewOps.push_back(getOperand(i)->
+ replaceSymbolicValuesWithConcrete(Sym, Conc, SE));
+
+ return SE.getAddRecExpr(NewOps, L);
+ }
+ }
+ return this;
+}
+
+
+bool SCEVAddRecExpr::isLoopInvariant(const Loop *QueryLoop) const {
+  // This recurrence is invariant w.r.t. QueryLoop iff QueryLoop doesn't
+  // contain L and the start is invariant.
+ // Add recurrences are never invariant in the function-body (null loop).
+ return QueryLoop &&
+ !QueryLoop->contains(L->getHeader()) &&
+ getOperand(0)->isLoopInvariant(QueryLoop);
+}
+
+
+void SCEVAddRecExpr::print(raw_ostream &OS) const {
+ OS << "{" << *Operands[0];
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i)
+ OS << ",+," << *Operands[i];
+ OS << "}<" << L->getHeader()->getName() + ">";
+}
+
+// SCEVUnknowns - Only allow the creation of one SCEVUnknown for any particular
+// value. Don't use a SCEVHandle here, or else the object will never be
+// deleted!
+static ManagedStatic<std::map<Value*, SCEVUnknown*> > SCEVUnknowns;
+
+SCEVUnknown::~SCEVUnknown() { SCEVUnknowns->erase(V); }
+
+bool SCEVUnknown::isLoopInvariant(const Loop *L) const {
+ // All non-instruction values are loop invariant. All instructions are loop
+ // invariant if they are not contained in the specified loop.
+ // Instructions are never considered invariant in the function body
+ // (null loop) because they are defined within the "loop".
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return L && !L->contains(I->getParent());
+ return true;
+}
+
+bool SCEVUnknown::dominates(BasicBlock *BB, DominatorTree *DT) const {
+ if (Instruction *I = dyn_cast<Instruction>(getValue()))
+ return DT->dominates(I->getParent(), BB);
+ return true;
+}
+
+const Type *SCEVUnknown::getType() const {
+ return V->getType();
+}
+
+void SCEVUnknown::print(raw_ostream &OS) const {
+ WriteAsOperand(OS, V, false);
+}
+
+//===----------------------------------------------------------------------===//
+// SCEV Utilities
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// SCEVComplexityCompare - Return true if the complexity of the LHS is less
+ /// than the complexity of the RHS. This comparator is used to canonicalize
+ /// expressions.
+ class VISIBILITY_HIDDEN SCEVComplexityCompare {
+ LoopInfo *LI;
+ public:
+ explicit SCEVComplexityCompare(LoopInfo *li) : LI(li) {}
+
+ bool operator()(const SCEV *LHS, const SCEV *RHS) const {
+ // Primarily, sort the SCEVs by their getSCEVType().
+ if (LHS->getSCEVType() != RHS->getSCEVType())
+ return LHS->getSCEVType() < RHS->getSCEVType();
+
+ // Aside from the getSCEVType() ordering, the particular ordering
+ // isn't very important except that it's beneficial to be consistent,
+ // so that (a + b) and (b + a) don't end up as different expressions.
+
+ // Sort SCEVUnknown values with some loose heuristics. TODO: This is
+ // not as complete as it could be.
+ if (const SCEVUnknown *LU = dyn_cast<SCEVUnknown>(LHS)) {
+ const SCEVUnknown *RU = cast<SCEVUnknown>(RHS);
+
+ // Order pointer values after integer values. This helps SCEVExpander
+ // form GEPs.
+ if (isa<PointerType>(LU->getType()) && !isa<PointerType>(RU->getType()))
+ return false;
+ if (isa<PointerType>(RU->getType()) && !isa<PointerType>(LU->getType()))
+ return true;
+
+ // Compare getValueID values.
+ if (LU->getValue()->getValueID() != RU->getValue()->getValueID())
+ return LU->getValue()->getValueID() < RU->getValue()->getValueID();
+
+ // Sort arguments by their position.
+ if (const Argument *LA = dyn_cast<Argument>(LU->getValue())) {
+ const Argument *RA = cast<Argument>(RU->getValue());
+ return LA->getArgNo() < RA->getArgNo();
+ }
+
+ // For instructions, compare their loop depth, and their opcode.
+ // This is pretty loose.
+ if (Instruction *LV = dyn_cast<Instruction>(LU->getValue())) {
+ Instruction *RV = cast<Instruction>(RU->getValue());
+
+ // Compare loop depths.
+ if (LI->getLoopDepth(LV->getParent()) !=
+ LI->getLoopDepth(RV->getParent()))
+ return LI->getLoopDepth(LV->getParent()) <
+ LI->getLoopDepth(RV->getParent());
+
+ // Compare opcodes.
+ if (LV->getOpcode() != RV->getOpcode())
+ return LV->getOpcode() < RV->getOpcode();
+
+ // Compare the number of operands.
+ if (LV->getNumOperands() != RV->getNumOperands())
+ return LV->getNumOperands() < RV->getNumOperands();
+ }
+
+ return false;
+ }
+
+ // Constant sorting doesn't matter since they'll be folded.
+ if (isa<SCEVConstant>(LHS))
+ return false;
+
+ // Lexicographically compare n-ary expressions.
+ if (const SCEVNAryExpr *LC = dyn_cast<SCEVNAryExpr>(LHS)) {
+ const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS);
+ for (unsigned i = 0, e = LC->getNumOperands(); i != e; ++i) {
+ if (i >= RC->getNumOperands())
+ return false;
+ if (operator()(LC->getOperand(i), RC->getOperand(i)))
+ return true;
+ if (operator()(RC->getOperand(i), LC->getOperand(i)))
+ return false;
+ }
+ return LC->getNumOperands() < RC->getNumOperands();
+ }
+
+ // Lexicographically compare udiv expressions.
+ if (const SCEVUDivExpr *LC = dyn_cast<SCEVUDivExpr>(LHS)) {
+ const SCEVUDivExpr *RC = cast<SCEVUDivExpr>(RHS);
+ if (operator()(LC->getLHS(), RC->getLHS()))
+ return true;
+ if (operator()(RC->getLHS(), LC->getLHS()))
+ return false;
+ if (operator()(LC->getRHS(), RC->getRHS()))
+ return true;
+ if (operator()(RC->getRHS(), LC->getRHS()))
+ return false;
+ return false;
+ }
+
+ // Compare cast expressions by operand.
+ if (const SCEVCastExpr *LC = dyn_cast<SCEVCastExpr>(LHS)) {
+ const SCEVCastExpr *RC = cast<SCEVCastExpr>(RHS);
+ return operator()(LC->getOperand(), RC->getOperand());
+ }
+
+ assert(0 && "Unknown SCEV kind!");
+ return false;
+ }
+ };
+}
+
+/// GroupByComplexity - Given a list of SCEV objects, order them by their
+/// complexity, and group objects of the same complexity together by value.
+/// When this routine is finished, we know that any duplicates in the vector are
+/// consecutive and that complexity is monotonically increasing.
+///
+/// Note that we take special precautions to ensure that we get deterministic
+/// results from this routine. In other words, we don't want the results of
+/// this to depend on where the addresses of various SCEV objects happened to
+/// land in memory.
+///
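+/// For example (illustrative): the operand list (x, 2, x) is reordered to
+/// (2, x, x); constants have the lowest SCEVType so they sort first, and the
+/// duplicate x's become adjacent, letting getAddExpr fold them to (2 + 2*x).
+///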
+static void GroupByComplexity(std::vector<SCEVHandle> &Ops,
+ LoopInfo *LI) {
+ if (Ops.size() < 2) return; // Noop
+ if (Ops.size() == 2) {
+ // This is the common case, which also happens to be trivially simple.
+ // Special case it.
+ if (SCEVComplexityCompare(LI)(Ops[1], Ops[0]))
+ std::swap(Ops[0], Ops[1]);
+ return;
+ }
+
+ // Do the rough sort by complexity.
+ std::stable_sort(Ops.begin(), Ops.end(), SCEVComplexityCompare(LI));
+
+ // Now that we are sorted by complexity, group elements of the same
+ // complexity. Note that this is, at worst, N^2, but the vector is likely to
+ // be extremely short in practice. Note that we take this approach because we
+ // do not want to depend on the addresses of the objects we are grouping.
+ for (unsigned i = 0, e = Ops.size(); i != e-2; ++i) {
+ const SCEV *S = Ops[i];
+ unsigned Complexity = S->getSCEVType();
+
+ // If there are any objects of the same complexity and same value as this
+ // one, group them.
+ for (unsigned j = i+1; j != e && Ops[j]->getSCEVType() == Complexity; ++j) {
+ if (Ops[j] == S) { // Found a duplicate.
+ // Move it to immediately after i'th element.
+ std::swap(Ops[i+1], Ops[j]);
+ ++i; // no need to rescan it.
+ if (i == e-2) return; // Done!
+ }
+ }
+ }
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Simple SCEV method implementations
+//===----------------------------------------------------------------------===//
+
+/// BinomialCoefficient - Compute BC(It, K). The result has width W.
+/// Assumes K > 0.
+static SCEVHandle BinomialCoefficient(SCEVHandle It, unsigned K,
+ ScalarEvolution &SE,
+ const Type* ResultTy) {
+ // Handle the simplest case efficiently.
+ if (K == 1)
+ return SE.getTruncateOrZeroExtend(It, ResultTy);
+
+ // We are using the following formula for BC(It, K):
+ //
+ // BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / K!
+ //
+ // Suppose, W is the bitwidth of the return value. We must be prepared for
+ // overflow. Hence, we must assure that the result of our computation is
+ // equal to the accurate one modulo 2^W. Unfortunately, division isn't
+ // safe in modular arithmetic.
+ //
+ // However, this code doesn't use exactly that formula; the formula it uses
+ // is something like the following, where T is the number of factors of 2 in
+ // K! (i.e. trailing zeros in the binary representation of K!), and ^ is
+ // exponentiation:
+ //
+ // BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / 2^T / (K! / 2^T)
+ //
+ // This formula is trivially equivalent to the previous formula. However,
+ // this formula can be implemented much more efficiently. The trick is that
+ // K! / 2^T is odd, and exact division by an odd number *is* safe in modular
+ // arithmetic. To do exact division in modular arithmetic, all we have
+ // to do is multiply by the inverse. Therefore, this step can be done at
+ // width W.
+ //
+ // The next issue is how to safely do the division by 2^T. The way this
+ // is done is by doing the multiplication step at a width of at least W + T
+ // bits. This way, the bottom W+T bits of the product are accurate. Then,
+ // when we perform the division by 2^T (which is equivalent to a right shift
+ // by T), the bottom W bits are accurate. Extra bits are okay; they'll get
+ // truncated out after the division by 2^T.
+ //
+ // In comparison to just directly using the first formula, this technique
+ // is much more efficient; using the first formula requires W * K bits,
+// but this formula needs less than W + K bits. Also, the first formula requires
+ // a division step, whereas this formula only requires multiplies and shifts.
+ //
+ // It doesn't matter whether the subtraction step is done in the calculation
+ // width or the input iteration count's width; if the subtraction overflows,
+ // the result must be zero anyway. We prefer here to do it in the width of
+ // the induction variable because it helps a lot for certain cases; CodeGen
+ // isn't smart enough to ignore the overflow, which leads to much less
+ // efficient code if the width of the subtraction is wider than the native
+ // register width.
+ //
+ // (It's possible to not widen at all by pulling out factors of 2 before
+ // the multiplication; for example, K=2 can be calculated as
+ // It/2*(It+(It*INT_MIN/INT_MIN)+-1). However, it requires
+ // extra arithmetic, so it's not an obvious win, and it gets
+ // much more complicated for K > 3.)
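+  //
+  // Worked example (illustrative): for K = 3 and W = 32, K! = 6 = 2^1 * 3,
+  // so T = 1 and K!/2^T = 3. The product It*(It-1)*(It-2) is formed at
+  // W+T = 33 bits, divided by 2^1, truncated back to 32 bits, and multiplied
+  // by the multiplicative inverse of 3 modulo 2^32.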
+
+ // Protection from insane SCEVs; this bound is conservative,
+ // but it probably doesn't matter.
+ if (K > 1000)
+ return SE.getCouldNotCompute();
+
+ unsigned W = SE.getTypeSizeInBits(ResultTy);
+
+ // Calculate K! / 2^T and T; we divide out the factors of two before
+ // multiplying for calculating K! / 2^T to avoid overflow.
+ // Other overflow doesn't matter because we only care about the bottom
+ // W bits of the result.
+ APInt OddFactorial(W, 1);
+ unsigned T = 1;
+ for (unsigned i = 3; i <= K; ++i) {
+ APInt Mult(W, i);
+ unsigned TwoFactors = Mult.countTrailingZeros();
+ T += TwoFactors;
+ Mult = Mult.lshr(TwoFactors);
+ OddFactorial *= Mult;
+ }
+
+ // We need at least W + T bits for the multiplication step
+ unsigned CalculationBits = W + T;
+
+  // Calculate 2^T, at width T+W.
+ APInt DivFactor = APInt(CalculationBits, 1).shl(T);
+
+ // Calculate the multiplicative inverse of K! / 2^T;
+ // this multiplication factor will perform the exact division by
+ // K! / 2^T.
+ APInt Mod = APInt::getSignedMinValue(W+1);
+ APInt MultiplyFactor = OddFactorial.zext(W+1);
+ MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod);
+ MultiplyFactor = MultiplyFactor.trunc(W);
+
+ // Calculate the product, at width T+W
+ const IntegerType *CalculationTy = IntegerType::get(CalculationBits);
+ SCEVHandle Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy);
+ for (unsigned i = 1; i != K; ++i) {
+ SCEVHandle S = SE.getMinusSCEV(It, SE.getIntegerSCEV(i, It->getType()));
+ Dividend = SE.getMulExpr(Dividend,
+ SE.getTruncateOrZeroExtend(S, CalculationTy));
+ }
+
+ // Divide by 2^T
+ SCEVHandle DivResult = SE.getUDivExpr(Dividend, SE.getConstant(DivFactor));
+
+ // Truncate the result, and divide by K! / 2^T.
+
+ return SE.getMulExpr(SE.getConstant(MultiplyFactor),
+ SE.getTruncateOrZeroExtend(DivResult, ResultTy));
+}
+
+/// evaluateAtIteration - Return the value of this chain of recurrences at
+/// the specified iteration number. We can evaluate this recurrence by
+/// multiplying each element in the chain by the binomial coefficient
+/// corresponding to it. In other words, we can evaluate {A,+,B,+,C,+,D} as:
+///
+/// A*BC(It, 0) + B*BC(It, 1) + C*BC(It, 2) + D*BC(It, 3)
+///
+/// where BC(It, k) stands for binomial coefficient.
+///
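+/// For example (illustrative): {5,+,3} evaluates to 5 + 3*It, and
+/// {A,+,B,+,C} evaluates to A + B*It + C*(It*(It-1)/2), since
+/// BC(It,2) = It*(It-1)/2.
+///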
+SCEVHandle SCEVAddRecExpr::evaluateAtIteration(SCEVHandle It,
+ ScalarEvolution &SE) const {
+ SCEVHandle Result = getStart();
+ for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
+ // The computation is correct in the face of overflow provided that the
+ // multiplication is performed _after_ the evaluation of the binomial
+ // coefficient.
+ SCEVHandle Coeff = BinomialCoefficient(It, i, SE, getType());
+ if (isa<SCEVCouldNotCompute>(Coeff))
+ return Coeff;
+
+ Result = SE.getAddExpr(Result, SE.getMulExpr(getOperand(i), Coeff));
+ }
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// SCEV Expression folder implementations
+//===----------------------------------------------------------------------===//
+
+SCEVHandle ScalarEvolution::getTruncateExpr(const SCEVHandle &Op,
+ const Type *Ty) {
+ assert(getTypeSizeInBits(Op->getType()) > getTypeSizeInBits(Ty) &&
+ "This is not a truncating conversion!");
+ assert(isSCEVable(Ty) &&
+ "This is not a conversion to a SCEVable type!");
+ Ty = getEffectiveSCEVType(Ty);
+
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
+ return getUnknown(
+ ConstantExpr::getTrunc(SC->getValue(), Ty));
+
+ // trunc(trunc(x)) --> trunc(x)
+ if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op))
+ return getTruncateExpr(ST->getOperand(), Ty);
+
+ // trunc(sext(x)) --> sext(x) if widening or trunc(x) if narrowing
+ if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
+ return getTruncateOrSignExtend(SS->getOperand(), Ty);
+
+ // trunc(zext(x)) --> zext(x) if widening or trunc(x) if narrowing
+ if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
+ return getTruncateOrZeroExtend(SZ->getOperand(), Ty);
+
+ // If the input value is a chrec scev made out of constants, truncate
+ // all of the constants.
+ if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) {
+ std::vector<SCEVHandle> Operands;
+ for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i)
+ Operands.push_back(getTruncateExpr(AddRec->getOperand(i), Ty));
+ return getAddRecExpr(Operands, AddRec->getLoop());
+ }
+
+ SCEVTruncateExpr *&Result = (*SCEVTruncates)[std::make_pair(Op, Ty)];
+ if (Result == 0) Result = new SCEVTruncateExpr(Op, Ty);
+ return Result;
+}
+
+SCEVHandle ScalarEvolution::getZeroExtendExpr(const SCEVHandle &Op,
+ const Type *Ty) {
+ assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
+ "This is not an extending conversion!");
+ assert(isSCEVable(Ty) &&
+ "This is not a conversion to a SCEVable type!");
+ Ty = getEffectiveSCEVType(Ty);
+
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) {
+ const Type *IntTy = getEffectiveSCEVType(Ty);
+ Constant *C = ConstantExpr::getZExt(SC->getValue(), IntTy);
+ if (IntTy != Ty) C = ConstantExpr::getIntToPtr(C, Ty);
+ return getUnknown(C);
+ }
+
+ // zext(zext(x)) --> zext(x)
+ if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
+ return getZeroExtendExpr(SZ->getOperand(), Ty);
+
+ // If the input value is a chrec scev, and we can prove that the value
+ // did not overflow the old, smaller, value, we can zero extend all of the
+ // operands (often constants). This allows analysis of something like
+ // this: for (unsigned char X = 0; X < 100; ++X) { int Y = X; }
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op))
+ if (AR->isAffine()) {
+ // Check whether the backedge-taken count is SCEVCouldNotCompute.
+ // Note that this serves two purposes: It filters out loops that are
+ // simply not analyzable, and it covers the case where this code is
+ // being called from within backedge-taken count analysis, such that
+ // attempting to ask for the backedge-taken count would likely result
+      // in infinite recursion. In the latter case, the analysis code will
+ // cope with a conservative value, and it will take care to purge
+ // that value once it has finished.
+ SCEVHandle MaxBECount = getMaxBackedgeTakenCount(AR->getLoop());
+ if (!isa<SCEVCouldNotCompute>(MaxBECount)) {
+ // Manually compute the final value for AR, checking for
+ // overflow.
+ SCEVHandle Start = AR->getStart();
+ SCEVHandle Step = AR->getStepRecurrence(*this);
+
+ // Check whether the backedge-taken count can be losslessly casted to
+ // the addrec's type. The count is always unsigned.
+ SCEVHandle CastedMaxBECount =
+ getTruncateOrZeroExtend(MaxBECount, Start->getType());
+ SCEVHandle RecastedMaxBECount =
+ getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
+ if (MaxBECount == RecastedMaxBECount) {
+ const Type *WideTy =
+ IntegerType::get(getTypeSizeInBits(Start->getType()) * 2);
+ // Check whether Start+Step*MaxBECount has no unsigned overflow.
+ SCEVHandle ZMul =
+ getMulExpr(CastedMaxBECount,
+ getTruncateOrZeroExtend(Step, Start->getType()));
+ SCEVHandle Add = getAddExpr(Start, ZMul);
+ SCEVHandle OperandExtendedAdd =
+ getAddExpr(getZeroExtendExpr(Start, WideTy),
+ getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
+ getZeroExtendExpr(Step, WideTy)));
+ if (getZeroExtendExpr(Add, WideTy) == OperandExtendedAdd)
+ // Return the expression with the addrec on the outside.
+ return getAddRecExpr(getZeroExtendExpr(Start, Ty),
+ getZeroExtendExpr(Step, Ty),
+ AR->getLoop());
+
+ // Similar to above, only this time treat the step value as signed.
+ // This covers loops that count down.
+ SCEVHandle SMul =
+ getMulExpr(CastedMaxBECount,
+ getTruncateOrSignExtend(Step, Start->getType()));
+ Add = getAddExpr(Start, SMul);
+ OperandExtendedAdd =
+ getAddExpr(getZeroExtendExpr(Start, WideTy),
+ getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
+ getSignExtendExpr(Step, WideTy)));
+ if (getZeroExtendExpr(Add, WideTy) == OperandExtendedAdd)
+ // Return the expression with the addrec on the outside.
+ return getAddRecExpr(getZeroExtendExpr(Start, Ty),
+ getSignExtendExpr(Step, Ty),
+ AR->getLoop());
+ }
+ }
+ }
+
+ SCEVZeroExtendExpr *&Result = (*SCEVZeroExtends)[std::make_pair(Op, Ty)];
+ if (Result == 0) Result = new SCEVZeroExtendExpr(Op, Ty);
+ return Result;
+}
+
+SCEVHandle ScalarEvolution::getSignExtendExpr(const SCEVHandle &Op,
+ const Type *Ty) {
+ assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
+ "This is not an extending conversion!");
+ assert(isSCEVable(Ty) &&
+ "This is not a conversion to a SCEVable type!");
+ Ty = getEffectiveSCEVType(Ty);
+
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) {
+ const Type *IntTy = getEffectiveSCEVType(Ty);
+ Constant *C = ConstantExpr::getSExt(SC->getValue(), IntTy);
+ if (IntTy != Ty) C = ConstantExpr::getIntToPtr(C, Ty);
+ return getUnknown(C);
+ }
+
+ // sext(sext(x)) --> sext(x)
+ if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
+ return getSignExtendExpr(SS->getOperand(), Ty);
+
+ // If the input value is a chrec scev, and we can prove that the value
+ // did not overflow the old, smaller, value, we can sign extend all of the
+ // operands (often constants). This allows analysis of something like
+ // this: for (signed char X = 0; X < 100; ++X) { int Y = X; }
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op))
+ if (AR->isAffine()) {
+ // Check whether the backedge-taken count is SCEVCouldNotCompute.
+ // Note that this serves two purposes: It filters out loops that are
+ // simply not analyzable, and it covers the case where this code is
+ // being called from within backedge-taken count analysis, such that
+ // attempting to ask for the backedge-taken count would likely result
+      // in infinite recursion. In the latter case, the analysis code will
+ // cope with a conservative value, and it will take care to purge
+ // that value once it has finished.
+ SCEVHandle MaxBECount = getMaxBackedgeTakenCount(AR->getLoop());
+ if (!isa<SCEVCouldNotCompute>(MaxBECount)) {
+ // Manually compute the final value for AR, checking for
+ // overflow.
+ SCEVHandle Start = AR->getStart();
+ SCEVHandle Step = AR->getStepRecurrence(*this);
+
+ // Check whether the backedge-taken count can be losslessly casted to
+ // the addrec's type. The count is always unsigned.
+ SCEVHandle CastedMaxBECount =
+ getTruncateOrZeroExtend(MaxBECount, Start->getType());
+ SCEVHandle RecastedMaxBECount =
+ getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
+ if (MaxBECount == RecastedMaxBECount) {
+ const Type *WideTy =
+ IntegerType::get(getTypeSizeInBits(Start->getType()) * 2);
+ // Check whether Start+Step*MaxBECount has no signed overflow.
+ SCEVHandle SMul =
+ getMulExpr(CastedMaxBECount,
+ getTruncateOrSignExtend(Step, Start->getType()));
+ SCEVHandle Add = getAddExpr(Start, SMul);
+ SCEVHandle OperandExtendedAdd =
+ getAddExpr(getSignExtendExpr(Start, WideTy),
+ getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
+ getSignExtendExpr(Step, WideTy)));
+ if (getSignExtendExpr(Add, WideTy) == OperandExtendedAdd)
+ // Return the expression with the addrec on the outside.
+ return getAddRecExpr(getSignExtendExpr(Start, Ty),
+ getSignExtendExpr(Step, Ty),
+ AR->getLoop());
+ }
+ }
+ }
+
+ SCEVSignExtendExpr *&Result = (*SCEVSignExtends)[std::make_pair(Op, Ty)];
+ if (Result == 0) Result = new SCEVSignExtendExpr(Op, Ty);
+ return Result;
+}
+
+/// getAddExpr - Get a canonical add expression, or something simpler if
+/// possible.
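+/// For example (illustrative): the operands (2, 3, x) fold to (5 + x), and
+/// (x, x) folds to (2 * x).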
+SCEVHandle ScalarEvolution::getAddExpr(std::vector<SCEVHandle> &Ops) {
+ assert(!Ops.empty() && "Cannot get empty add!");
+ if (Ops.size() == 1) return Ops[0];
+#ifndef NDEBUG
+ for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+ assert(getEffectiveSCEVType(Ops[i]->getType()) ==
+ getEffectiveSCEVType(Ops[0]->getType()) &&
+ "SCEVAddExpr operand types don't match!");
+#endif
+
+ // Sort by complexity, this groups all similar expression types together.
+ GroupByComplexity(Ops, LI);
+
+ // If there are any constants, fold them together.
+ unsigned Idx = 0;
+ if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+ ++Idx;
+ assert(Idx < Ops.size());
+ while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+ // We found two constants, fold them together!
+ ConstantInt *Fold = ConstantInt::get(LHSC->getValue()->getValue() +
+ RHSC->getValue()->getValue());
+ Ops[0] = getConstant(Fold);
+ Ops.erase(Ops.begin()+1); // Erase the folded element
+ if (Ops.size() == 1) return Ops[0];
+ LHSC = cast<SCEVConstant>(Ops[0]);
+ }
+
+ // If we are left with a constant zero being added, strip it off.
+ if (cast<SCEVConstant>(Ops[0])->getValue()->isZero()) {
+ Ops.erase(Ops.begin());
+ --Idx;
+ }
+ }
+
+ if (Ops.size() == 1) return Ops[0];
+
+ // Okay, check to see if the same value occurs in the operand list twice. If
+  // so, merge them together into a multiply expression. Since we sorted the
+ // list, these values are required to be adjacent.
+ const Type *Ty = Ops[0]->getType();
+ for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
+ if (Ops[i] == Ops[i+1]) { // X + Y + Y --> X + Y*2
+ // Found a match, merge the two values into a multiply, and add any
+ // remaining values to the result.
+ SCEVHandle Two = getIntegerSCEV(2, Ty);
+ SCEVHandle Mul = getMulExpr(Ops[i], Two);
+ if (Ops.size() == 2)
+ return Mul;
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+ Ops.push_back(Mul);
+ return getAddExpr(Ops);
+ }
+
+ // Check for truncates. If all the operands are truncated from the same
+ // type, see if factoring out the truncate would permit the result to be
+  // folded. e.g., trunc(x) + m*trunc(n) --> trunc(x + trunc(m)*n)
+ // if the contents of the resulting outer trunc fold to something simple.
+ for (; Idx < Ops.size() && isa<SCEVTruncateExpr>(Ops[Idx]); ++Idx) {
+ const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(Ops[Idx]);
+ const Type *DstType = Trunc->getType();
+ const Type *SrcType = Trunc->getOperand()->getType();
+ std::vector<SCEVHandle> LargeOps;
+ bool Ok = true;
+ // Check all the operands to see if they can be represented in the
+ // source type of the truncate.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(Ops[i])) {
+ if (T->getOperand()->getType() != SrcType) {
+ Ok = false;
+ break;
+ }
+ LargeOps.push_back(T->getOperand());
+ } else if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[i])) {
+ // This could be either sign or zero extension, but sign extension
+ // is much more likely to be foldable here.
+ LargeOps.push_back(getSignExtendExpr(C, SrcType));
+ } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(Ops[i])) {
+ std::vector<SCEVHandle> LargeMulOps;
+ for (unsigned j = 0, f = M->getNumOperands(); j != f && Ok; ++j) {
+ if (const SCEVTruncateExpr *T =
+ dyn_cast<SCEVTruncateExpr>(M->getOperand(j))) {
+ if (T->getOperand()->getType() != SrcType) {
+ Ok = false;
+ break;
+ }
+ LargeMulOps.push_back(T->getOperand());
+ } else if (const SCEVConstant *C =
+ dyn_cast<SCEVConstant>(M->getOperand(j))) {
+ // This could be either sign or zero extension, but sign extension
+ // is much more likely to be foldable here.
+ LargeMulOps.push_back(getSignExtendExpr(C, SrcType));
+ } else {
+ Ok = false;
+ break;
+ }
+ }
+ if (Ok)
+ LargeOps.push_back(getMulExpr(LargeMulOps));
+ } else {
+ Ok = false;
+ break;
+ }
+ }
+ if (Ok) {
+ // Evaluate the expression in the larger type.
+ SCEVHandle Fold = getAddExpr(LargeOps);
+ // If it folds to something simple, use it. Otherwise, don't.
+ if (isa<SCEVConstant>(Fold) || isa<SCEVUnknown>(Fold))
+ return getTruncateExpr(Fold, DstType);
+ }
+ }
+
+ // Skip past any other cast SCEVs.
+ while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr)
+ ++Idx;
+
+ // If there are add operands they would be next.
+ if (Idx < Ops.size()) {
+ bool DeletedAdd = false;
+ while (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[Idx])) {
+ // If we have an add, expand the add operands onto the end of the operands
+ // list.
+ Ops.insert(Ops.end(), Add->op_begin(), Add->op_end());
+ Ops.erase(Ops.begin()+Idx);
+ DeletedAdd = true;
+ }
+
+ // If we deleted at least one add, we added operands to the end of the list,
+ // and they are not necessarily sorted. Recurse to resort and resimplify
+    // any operands we just acquired.
+ if (DeletedAdd)
+ return getAddExpr(Ops);
+ }
+
+ // Skip over the add expression until we get to a multiply.
+ while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr)
+ ++Idx;
+
+ // If we are adding something to a multiply expression, make sure the
+ // something is not already an operand of the multiply. If so, merge it into
+ // the multiply.
+ for (; Idx < Ops.size() && isa<SCEVMulExpr>(Ops[Idx]); ++Idx) {
+ const SCEVMulExpr *Mul = cast<SCEVMulExpr>(Ops[Idx]);
+ for (unsigned MulOp = 0, e = Mul->getNumOperands(); MulOp != e; ++MulOp) {
+ const SCEV *MulOpSCEV = Mul->getOperand(MulOp);
+ for (unsigned AddOp = 0, e = Ops.size(); AddOp != e; ++AddOp)
+ if (MulOpSCEV == Ops[AddOp] && !isa<SCEVConstant>(MulOpSCEV)) {
+ // Fold W + X + (X * Y * Z) --> W + (X * ((Y*Z)+1))
+ SCEVHandle InnerMul = Mul->getOperand(MulOp == 0);
+ if (Mul->getNumOperands() != 2) {
+ // If the multiply has more than two operands, we must get the
+ // Y*Z term.
+ std::vector<SCEVHandle> MulOps(Mul->op_begin(), Mul->op_end());
+ MulOps.erase(MulOps.begin()+MulOp);
+ InnerMul = getMulExpr(MulOps);
+ }
+ SCEVHandle One = getIntegerSCEV(1, Ty);
+ SCEVHandle AddOne = getAddExpr(InnerMul, One);
+ SCEVHandle OuterMul = getMulExpr(AddOne, Ops[AddOp]);
+ if (Ops.size() == 2) return OuterMul;
+ if (AddOp < Idx) {
+ Ops.erase(Ops.begin()+AddOp);
+ Ops.erase(Ops.begin()+Idx-1);
+ } else {
+ Ops.erase(Ops.begin()+Idx);
+ Ops.erase(Ops.begin()+AddOp-1);
+ }
+ Ops.push_back(OuterMul);
+ return getAddExpr(Ops);
+ }
+
+ // Check this multiply against other multiplies being added together.
+ for (unsigned OtherMulIdx = Idx+1;
+ OtherMulIdx < Ops.size() && isa<SCEVMulExpr>(Ops[OtherMulIdx]);
+ ++OtherMulIdx) {
+ const SCEVMulExpr *OtherMul = cast<SCEVMulExpr>(Ops[OtherMulIdx]);
+ // If MulOp occurs in OtherMul, we can fold the two multiplies
+ // together.
+ for (unsigned OMulOp = 0, e = OtherMul->getNumOperands();
+ OMulOp != e; ++OMulOp)
+ if (OtherMul->getOperand(OMulOp) == MulOpSCEV) {
+ // Fold X + (A*B*C) + (A*D*E) --> X + (A*(B*C+D*E))
+ SCEVHandle InnerMul1 = Mul->getOperand(MulOp == 0);
+ if (Mul->getNumOperands() != 2) {
+ std::vector<SCEVHandle> MulOps(Mul->op_begin(), Mul->op_end());
+ MulOps.erase(MulOps.begin()+MulOp);
+ InnerMul1 = getMulExpr(MulOps);
+ }
+ SCEVHandle InnerMul2 = OtherMul->getOperand(OMulOp == 0);
+ if (OtherMul->getNumOperands() != 2) {
+ std::vector<SCEVHandle> MulOps(OtherMul->op_begin(),
+ OtherMul->op_end());
+ MulOps.erase(MulOps.begin()+OMulOp);
+ InnerMul2 = getMulExpr(MulOps);
+ }
+ SCEVHandle InnerMulSum = getAddExpr(InnerMul1,InnerMul2);
+ SCEVHandle OuterMul = getMulExpr(MulOpSCEV, InnerMulSum);
+ if (Ops.size() == 2) return OuterMul;
+ Ops.erase(Ops.begin()+Idx);
+ Ops.erase(Ops.begin()+OtherMulIdx-1);
+ Ops.push_back(OuterMul);
+ return getAddExpr(Ops);
+ }
+ }
+ }
+ }
+
+ // If there are any add recurrences in the operands list, see if any other
+ // added values are loop invariant. If so, we can fold them into the
+ // recurrence.
+ while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr)
+ ++Idx;
+
+ // Scan over all recurrences, trying to fold loop invariants into them.
+ for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
+ // Scan all of the other operands to this add and add them to the vector if
+ // they are loop invariant w.r.t. the recurrence.
+ std::vector<SCEVHandle> LIOps;
+ const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (Ops[i]->isLoopInvariant(AddRec->getLoop())) {
+ LIOps.push_back(Ops[i]);
+ Ops.erase(Ops.begin()+i);
+ --i; --e;
+ }
+
+ // If we found some loop invariants, fold them into the recurrence.
+ if (!LIOps.empty()) {
+ // NLI + LI + {Start,+,Step} --> NLI + {LI+Start,+,Step}
+ LIOps.push_back(AddRec->getStart());
+
+ std::vector<SCEVHandle> AddRecOps(AddRec->op_begin(), AddRec->op_end());
+ AddRecOps[0] = getAddExpr(LIOps);
+
+ SCEVHandle NewRec = getAddRecExpr(AddRecOps, AddRec->getLoop());
+ // If all of the other operands were loop invariant, we are done.
+ if (Ops.size() == 1) return NewRec;
+
+      // Otherwise, add the folded AddRec to the non-liv parts.
+ for (unsigned i = 0;; ++i)
+ if (Ops[i] == AddRec) {
+ Ops[i] = NewRec;
+ break;
+ }
+ return getAddExpr(Ops);
+ }
+
+ // Okay, if there weren't any loop invariants to be folded, check to see if
+ // there are multiple AddRec's with the same loop induction variable being
+ // added together. If so, we can fold them.
+ for (unsigned OtherIdx = Idx+1;
+ OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);++OtherIdx)
+ if (OtherIdx != Idx) {
+ const SCEVAddRecExpr *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]);
+ if (AddRec->getLoop() == OtherAddRec->getLoop()) {
+ // Other + {A,+,B} + {C,+,D} --> Other + {A+C,+,B+D}
+ std::vector<SCEVHandle> NewOps(AddRec->op_begin(), AddRec->op_end());
+ for (unsigned i = 0, e = OtherAddRec->getNumOperands(); i != e; ++i) {
+ if (i >= NewOps.size()) {
+ NewOps.insert(NewOps.end(), OtherAddRec->op_begin()+i,
+ OtherAddRec->op_end());
+ break;
+ }
+ NewOps[i] = getAddExpr(NewOps[i], OtherAddRec->getOperand(i));
+ }
+ SCEVHandle NewAddRec = getAddRecExpr(NewOps, AddRec->getLoop());
+
+ if (Ops.size() == 2) return NewAddRec;
+
+ Ops.erase(Ops.begin()+Idx);
+ Ops.erase(Ops.begin()+OtherIdx-1);
+ Ops.push_back(NewAddRec);
+ return getAddExpr(Ops);
+ }
+ }
+
+ // Otherwise couldn't fold anything into this recurrence. Move onto the
+ // next one.
+ }
+
+ // Okay, it looks like we really DO need an add expr. Check to see if we
+ // already have one, otherwise create a new one.
+ std::vector<const SCEV*> SCEVOps(Ops.begin(), Ops.end());
+ SCEVCommutativeExpr *&Result = (*SCEVCommExprs)[std::make_pair(scAddExpr,
+ SCEVOps)];
+ if (Result == 0) Result = new SCEVAddExpr(Ops);
+ return Result;
+}
+
+
+/// getMulExpr - Get a canonical multiply expression, or something simpler if
+/// possible.
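+/// For example (illustrative): the operands (2, 3, x) fold to (6 * x), and
+/// 2 * (3 + x) is distributed to (6 + 2 * x).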
+SCEVHandle ScalarEvolution::getMulExpr(std::vector<SCEVHandle> &Ops) {
+ assert(!Ops.empty() && "Cannot get empty mul!");
+#ifndef NDEBUG
+ for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+ assert(getEffectiveSCEVType(Ops[i]->getType()) ==
+ getEffectiveSCEVType(Ops[0]->getType()) &&
+ "SCEVMulExpr operand types don't match!");
+#endif
+
+ // Sort by complexity, this groups all similar expression types together.
+ GroupByComplexity(Ops, LI);
+
+ // If there are any constants, fold them together.
+ unsigned Idx = 0;
+ if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+
+ // C1*(C2+V) -> C1*C2 + C1*V
+ if (Ops.size() == 2)
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
+ if (Add->getNumOperands() == 2 &&
+ isa<SCEVConstant>(Add->getOperand(0)))
+ return getAddExpr(getMulExpr(LHSC, Add->getOperand(0)),
+ getMulExpr(LHSC, Add->getOperand(1)));
+
+
+ ++Idx;
+ while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+ // We found two constants, fold them together!
+ ConstantInt *Fold = ConstantInt::get(LHSC->getValue()->getValue() *
+ RHSC->getValue()->getValue());
+ Ops[0] = getConstant(Fold);
+ Ops.erase(Ops.begin()+1); // Erase the folded element
+ if (Ops.size() == 1) return Ops[0];
+ LHSC = cast<SCEVConstant>(Ops[0]);
+ }
+
+ // If we are left with a constant one being multiplied, strip it off.
+ if (cast<SCEVConstant>(Ops[0])->getValue()->equalsInt(1)) {
+ Ops.erase(Ops.begin());
+ --Idx;
+ } else if (cast<SCEVConstant>(Ops[0])->getValue()->isZero()) {
+ // If we have a multiply of zero, it will always be zero.
+ return Ops[0];
+ }
+ }
+
+ // Skip over the add expression until we get to a multiply.
+ while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr)
+ ++Idx;
+
+ if (Ops.size() == 1)
+ return Ops[0];
+
+ // If there are mul operands inline them all into this expression.
+ if (Idx < Ops.size()) {
+ bool DeletedMul = false;
+ while (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[Idx])) {
+ // If we have a mul, expand the mul operands onto the end of the operands
+ // list.
+ Ops.insert(Ops.end(), Mul->op_begin(), Mul->op_end());
+ Ops.erase(Ops.begin()+Idx);
+ DeletedMul = true;
+ }
+
+ // If we deleted at least one mul, we added operands to the end of the list,
+ // and they are not necessarily sorted. Recurse to resort and resimplify
+ // any operands we just acquired.
+ if (DeletedMul)
+ return getMulExpr(Ops);
+ }
+
+ // If there are any add recurrences in the operands list, see if any other
+ // multiplied values are loop invariant. If so, we can fold them into the
+ // recurrence.
+ while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr)
+ ++Idx;
+
+ // Scan over all recurrences, trying to fold loop invariants into them.
+ for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
+ // Scan all of the other operands to this mul and add them to the vector if
+ // they are loop invariant w.r.t. the recurrence.
+ std::vector<SCEVHandle> LIOps;
+ const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (Ops[i]->isLoopInvariant(AddRec->getLoop())) {
+ LIOps.push_back(Ops[i]);
+ Ops.erase(Ops.begin()+i);
+ --i; --e;
+ }
+
+ // If we found some loop invariants, fold them into the recurrence.
+ if (!LIOps.empty()) {
+ // NLI * LI * {Start,+,Step} --> NLI * {LI*Start,+,LI*Step}
+ std::vector<SCEVHandle> NewOps;
+ NewOps.reserve(AddRec->getNumOperands());
+ if (LIOps.size() == 1) {
+ const SCEV *Scale = LIOps[0];
+ for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i)
+ NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i)));
+ } else {
+ for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) {
+ std::vector<SCEVHandle> MulOps(LIOps);
+ MulOps.push_back(AddRec->getOperand(i));
+ NewOps.push_back(getMulExpr(MulOps));
+ }
+ }
+
+ SCEVHandle NewRec = getAddRecExpr(NewOps, AddRec->getLoop());
+
+ // If all of the other operands were loop invariant, we are done.
+ if (Ops.size() == 1) return NewRec;
+
+ // Otherwise, multiply the folded AddRec by the non-liv parts.
+ for (unsigned i = 0;; ++i)
+ if (Ops[i] == AddRec) {
+ Ops[i] = NewRec;
+ break;
+ }
+ return getMulExpr(Ops);
+ }
+
+ // Okay, if there weren't any loop invariants to be folded, check to see if
+ // there are multiple AddRec's with the same loop induction variable being
+ // multiplied together. If so, we can fold them.
+ for (unsigned OtherIdx = Idx+1;
+ OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);++OtherIdx)
+ if (OtherIdx != Idx) {
+ const SCEVAddRecExpr *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]);
+ if (AddRec->getLoop() == OtherAddRec->getLoop()) {
+ // F * G --> {A,+,B} * {C,+,D} --> {A*C,+,F*D + G*B + B*D}
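+ // A sketch of the derivation: with F(n+1) = F(n) + B and
+ // G(n+1) = G(n) + D, the product P(n) = F(n)*G(n) steps by
+ // P(n+1) - P(n) = (F(n)+B)*(G(n)+D) - F(n)*G(n) = F*D + G*B + B*D.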
+ const SCEVAddRecExpr *F = AddRec, *G = OtherAddRec;
+ SCEVHandle NewStart = getMulExpr(F->getStart(),
+ G->getStart());
+ SCEVHandle B = F->getStepRecurrence(*this);
+ SCEVHandle D = G->getStepRecurrence(*this);
+ SCEVHandle NewStep = getAddExpr(getMulExpr(F, D),
+ getMulExpr(G, B),
+ getMulExpr(B, D));
+ SCEVHandle NewAddRec = getAddRecExpr(NewStart, NewStep,
+ F->getLoop());
+ if (Ops.size() == 2) return NewAddRec;
+
+ Ops.erase(Ops.begin()+Idx);
+ Ops.erase(Ops.begin()+OtherIdx-1);
+ Ops.push_back(NewAddRec);
+ return getMulExpr(Ops);
+ }
+ }
+
+ // Otherwise, we couldn't fold anything into this recurrence. Move on to the
+ // next one.
+ }
+
+ // Okay, it looks like we really DO need a mul expr. Check to see if we
+ // already have one, otherwise create a new one.
+ std::vector<const SCEV*> SCEVOps(Ops.begin(), Ops.end());
+ SCEVCommutativeExpr *&Result = (*SCEVCommExprs)[std::make_pair(scMulExpr,
+ SCEVOps)];
+ if (Result == 0)
+ Result = new SCEVMulExpr(Ops);
+ return Result;
+}
+
+/// getUDivExpr - Get a canonical unsigned division expression, or something
+/// simpler if possible.
+SCEVHandle ScalarEvolution::getUDivExpr(const SCEVHandle &LHS,
+ const SCEVHandle &RHS) {
+ assert(getEffectiveSCEVType(LHS->getType()) ==
+ getEffectiveSCEVType(RHS->getType()) &&
+ "SCEVUDivExpr operand types don't match!");
+
+ if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS)) {
+ if (RHSC->getValue()->equalsInt(1))
+ return LHS; // X udiv 1 --> X
+ if (RHSC->isZero())
+ return getIntegerSCEV(0, LHS->getType()); // value is undefined
+
+ // Determine if the division can be folded into the operands of
+ // its left-hand side.
+ // TODO: Generalize this to non-constants by using known-bits information.
+ const Type *Ty = LHS->getType();
+ unsigned LZ = RHSC->getValue()->getValue().countLeadingZeros();
+ unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ;
+ // For non-power-of-two values, effectively round the value up to the
+ // nearest power of two.
+ if (!RHSC->getValue()->getValue().isPowerOf2())
+ ++MaxShiftAmt;
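+ // For example, a udiv by 10 in i32 has LZ = 28, so MaxShiftAmt becomes
+ // 32 - 28 + 1 = 5 and ExtTy below is i37, wide enough that the
+ // zero-extended comparisons cannot be fooled by wrapping.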
+ const IntegerType *ExtTy =
+ IntegerType::get(getTypeSizeInBits(Ty) + MaxShiftAmt);
+ // {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded.
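+ // For example, {0,+,8} udiv 2 becomes {0,+,4}, provided the
+ // zero-extension comparison below shows the addrec cannot wrap.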
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS))
+ if (const SCEVConstant *Step =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this)))
+ if (!Step->getValue()->getValue()
+ .urem(RHSC->getValue()->getValue()) &&
+ getZeroExtendExpr(AR, ExtTy) ==
+ getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
+ getZeroExtendExpr(Step, ExtTy),
+ AR->getLoop())) {
+ std::vector<SCEVHandle> Operands;
+ for (unsigned i = 0, e = AR->getNumOperands(); i != e; ++i)
+ Operands.push_back(getUDivExpr(AR->getOperand(i), RHS));
+ return getAddRecExpr(Operands, AR->getLoop());
+ }
+ // (A*B)/C --> A*(B/C) if safe and B/C can be folded.
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(LHS)) {
+ std::vector<SCEVHandle> Operands;
+ for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i)
+ Operands.push_back(getZeroExtendExpr(M->getOperand(i), ExtTy));
+ if (getZeroExtendExpr(M, ExtTy) == getMulExpr(Operands))
+ // Find an operand that's safely divisible.
+ for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) {
+ SCEVHandle Op = M->getOperand(i);
+ SCEVHandle Div = getUDivExpr(Op, RHSC);
+ if (!isa<SCEVUDivExpr>(Div) && getMulExpr(Div, RHSC) == Op) {
+ Operands = M->getOperands();
+ Operands[i] = Div;
+ return getMulExpr(Operands);
+ }
+ }
+ }
+ // (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded.
+ if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(LHS)) {
+ std::vector<SCEVHandle> Operands;
+ for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i)
+ Operands.push_back(getZeroExtendExpr(A->getOperand(i), ExtTy));
+ if (getZeroExtendExpr(A, ExtTy) == getAddExpr(Operands)) {
+ Operands.clear();
+ for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) {
+ SCEVHandle Op = getUDivExpr(A->getOperand(i), RHS);
+ if (isa<SCEVUDivExpr>(Op) || getMulExpr(Op, RHS) != A->getOperand(i))
+ break;
+ Operands.push_back(Op);
+ }
+ if (Operands.size() == A->getNumOperands())
+ return getAddExpr(Operands);
+ }
+ }
+
+ // Fold if both operands are constant.
+ if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) {
+ Constant *LHSCV = LHSC->getValue();
+ Constant *RHSCV = RHSC->getValue();
+ return getUnknown(ConstantExpr::getUDiv(LHSCV, RHSCV));
+ }
+ }
+
+ SCEVUDivExpr *&Result = (*SCEVUDivs)[std::make_pair(LHS, RHS)];
+ if (Result == 0) Result = new SCEVUDivExpr(LHS, RHS);
+ return Result;
+}
+
+
+/// getAddRecExpr - Get an add recurrence expression for the specified loop.
+/// Simplify the expression as much as possible.
+SCEVHandle ScalarEvolution::getAddRecExpr(const SCEVHandle &Start,
+ const SCEVHandle &Step, const Loop *L) {
+ std::vector<SCEVHandle> Operands;
+ Operands.push_back(Start);
+ if (const SCEVAddRecExpr *StepChrec = dyn_cast<SCEVAddRecExpr>(Step))
+ if (StepChrec->getLoop() == L) {
+ Operands.insert(Operands.end(), StepChrec->op_begin(),
+ StepChrec->op_end());
+ return getAddRecExpr(Operands, L);
+ }
+
+ Operands.push_back(Step);
+ return getAddRecExpr(Operands, L);
+}
+
+/// getAddRecExpr - Get an add recurrence expression for the specified loop.
+/// Simplify the expression as much as possible.
+SCEVHandle ScalarEvolution::getAddRecExpr(std::vector<SCEVHandle> &Operands,
+ const Loop *L) {
+ if (Operands.size() == 1) return Operands[0];
+#ifndef NDEBUG
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i)
+ assert(getEffectiveSCEVType(Operands[i]->getType()) ==
+ getEffectiveSCEVType(Operands[0]->getType()) &&
+ "SCEVAddRecExpr operand types don't match!");
+#endif
+
+ if (Operands.back()->isZero()) {
+ Operands.pop_back();
+ return getAddRecExpr(Operands, L); // {X,+,0} --> X
+ }
+
+ // Canonicalize nested AddRecs by nesting them in order of loop depth.
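+ // For example, {{X,+,Y}<inner>,+,Z} built for an outer loop becomes
+ // {{X,+,Z}<outer>,+,Y} over the inner loop.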
+ if (const SCEVAddRecExpr *NestedAR = dyn_cast<SCEVAddRecExpr>(Operands[0])) {
+ const Loop* NestedLoop = NestedAR->getLoop();
+ if (L->getLoopDepth() < NestedLoop->getLoopDepth()) {
+ std::vector<SCEVHandle> NestedOperands(NestedAR->op_begin(),
+ NestedAR->op_end());
+ SCEVHandle NestedARHandle(NestedAR);
+ Operands[0] = NestedAR->getStart();
+ NestedOperands[0] = getAddRecExpr(Operands, L);
+ return getAddRecExpr(NestedOperands, NestedLoop);
+ }
+ }
+
+ std::vector<const SCEV*> SCEVOps(Operands.begin(), Operands.end());
+ SCEVAddRecExpr *&Result = (*SCEVAddRecExprs)[std::make_pair(L, SCEVOps)];
+ if (Result == 0) Result = new SCEVAddRecExpr(Operands, L);
+ return Result;
+}
+
+SCEVHandle ScalarEvolution::getSMaxExpr(const SCEVHandle &LHS,
+ const SCEVHandle &RHS) {
+ std::vector<SCEVHandle> Ops;
+ Ops.push_back(LHS);
+ Ops.push_back(RHS);
+ return getSMaxExpr(Ops);
+}
+
+SCEVHandle ScalarEvolution::getSMaxExpr(std::vector<SCEVHandle> Ops) {
+ assert(!Ops.empty() && "Cannot get empty smax!");
+ if (Ops.size() == 1) return Ops[0];
+#ifndef NDEBUG
+ for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+ assert(getEffectiveSCEVType(Ops[i]->getType()) ==
+ getEffectiveSCEVType(Ops[0]->getType()) &&
+ "SCEVSMaxExpr operand types don't match!");
+#endif
+
+ // Sort by complexity; this groups all similar expression types together.
+ GroupByComplexity(Ops, LI);
+
+ // If there are any constants, fold them together.
+ unsigned Idx = 0;
+ if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+ ++Idx;
+ assert(Idx < Ops.size());
+ while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+ // We found two constants, fold them together!
+ ConstantInt *Fold = ConstantInt::get(
+ APIntOps::smax(LHSC->getValue()->getValue(),
+ RHSC->getValue()->getValue()));
+ Ops[0] = getConstant(Fold);
+ Ops.erase(Ops.begin()+1); // Erase the folded element
+ if (Ops.size() == 1) return Ops[0];
+ LHSC = cast<SCEVConstant>(Ops[0]);
+ }
+
+ // If we are left with a constant -inf, strip it off.
+ if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(true)) {
+ Ops.erase(Ops.begin());
+ --Idx;
+ }
+ }
+
+ if (Ops.size() == 1) return Ops[0];
+
+ // Find the first SMax
+ while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scSMaxExpr)
+ ++Idx;
+
+ // Check to see if one of the operands is an SMax. If so, expand its operands
+ // onto our operand list, and recurse to simplify.
+ if (Idx < Ops.size()) {
+ bool DeletedSMax = false;
+ while (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(Ops[Idx])) {
+ Ops.insert(Ops.end(), SMax->op_begin(), SMax->op_end());
+ Ops.erase(Ops.begin()+Idx);
+ DeletedSMax = true;
+ }
+
+ if (DeletedSMax)
+ return getSMaxExpr(Ops);
+ }
+
+ // Okay, check to see if the same value occurs in the operand list twice. If
+ // so, delete one. Since we sorted the list, these values are required to
+ // be adjacent.
+ for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
+ if (Ops[i] == Ops[i+1]) { // X smax Y smax Y --> X smax Y
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
+ --i; --e;
+ }
+
+ if (Ops.size() == 1) return Ops[0];
+
+ assert(!Ops.empty() && "Reduced smax down to nothing!");
+
+ // Okay, it looks like we really DO need an smax expr. Check to see if we
+ // already have one, otherwise create a new one.
+ std::vector<const SCEV*> SCEVOps(Ops.begin(), Ops.end());
+ SCEVCommutativeExpr *&Result = (*SCEVCommExprs)[std::make_pair(scSMaxExpr,
+ SCEVOps)];
+ if (Result == 0) Result = new SCEVSMaxExpr(Ops);
+ return Result;
+}
+
+SCEVHandle ScalarEvolution::getUMaxExpr(const SCEVHandle &LHS,
+ const SCEVHandle &RHS) {
+ std::vector<SCEVHandle> Ops;
+ Ops.push_back(LHS);
+ Ops.push_back(RHS);
+ return getUMaxExpr(Ops);
+}
+
+SCEVHandle ScalarEvolution::getUMaxExpr(std::vector<SCEVHandle> Ops) {
+ assert(!Ops.empty() && "Cannot get empty umax!");
+ if (Ops.size() == 1) return Ops[0];
+#ifndef NDEBUG
+ for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+ assert(getEffectiveSCEVType(Ops[i]->getType()) ==
+ getEffectiveSCEVType(Ops[0]->getType()) &&
+ "SCEVUMaxExpr operand types don't match!");
+#endif
+
+ // Sort by complexity; this groups all similar expression types together.
+ GroupByComplexity(Ops, LI);
+
+ // If there are any constants, fold them together.
+ unsigned Idx = 0;
+ if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+ ++Idx;
+ assert(Idx < Ops.size());
+ while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+ // We found two constants, fold them together!
+ ConstantInt *Fold = ConstantInt::get(
+ APIntOps::umax(LHSC->getValue()->getValue(),
+ RHSC->getValue()->getValue()));
+ Ops[0] = getConstant(Fold);
+ Ops.erase(Ops.begin()+1); // Erase the folded element
+ if (Ops.size() == 1) return Ops[0];
+ LHSC = cast<SCEVConstant>(Ops[0]);
+ }
+
+ // If we are left with a constant zero, strip it off.
+ if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(false)) {
+ Ops.erase(Ops.begin());
+ --Idx;
+ }
+ }
+
+ if (Ops.size() == 1) return Ops[0];
+
+ // Find the first UMax
+ while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scUMaxExpr)
+ ++Idx;
+
+ // Check to see if one of the operands is a UMax. If so, expand its operands
+ // onto our operand list, and recurse to simplify.
+ if (Idx < Ops.size()) {
+ bool DeletedUMax = false;
+ while (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(Ops[Idx])) {
+ Ops.insert(Ops.end(), UMax->op_begin(), UMax->op_end());
+ Ops.erase(Ops.begin()+Idx);
+ DeletedUMax = true;
+ }
+
+ if (DeletedUMax)
+ return getUMaxExpr(Ops);
+ }
+
+ // Okay, check to see if the same value occurs in the operand list twice. If
+ // so, delete one. Since we sorted the list, these values are required to
+ // be adjacent.
+ for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
+ if (Ops[i] == Ops[i+1]) { // X umax Y umax Y --> X umax Y
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
+ --i; --e;
+ }
+
+ if (Ops.size() == 1) return Ops[0];
+
+ assert(!Ops.empty() && "Reduced umax down to nothing!");
+
+ // Okay, it looks like we really DO need a umax expr. Check to see if we
+ // already have one, otherwise create a new one.
+ std::vector<const SCEV*> SCEVOps(Ops.begin(), Ops.end());
+ SCEVCommutativeExpr *&Result = (*SCEVCommExprs)[std::make_pair(scUMaxExpr,
+ SCEVOps)];
+ if (Result == 0) Result = new SCEVUMaxExpr(Ops);
+ return Result;
+}
+
+SCEVHandle ScalarEvolution::getUnknown(Value *V) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+ return getConstant(CI);
+ if (isa<ConstantPointerNull>(V))
+ return getIntegerSCEV(0, V->getType());
+ SCEVUnknown *&Result = (*SCEVUnknowns)[V];
+ if (Result == 0) Result = new SCEVUnknown(V);
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Basic SCEV Analysis and PHI Idiom Recognition Code
+//
+
+/// isSCEVable - Test if values of the given type are analyzable within
+/// the SCEV framework. This primarily includes integer types, and it
+/// can optionally include pointer types if the ScalarEvolution class
+/// has access to target-specific information.
+bool ScalarEvolution::isSCEVable(const Type *Ty) const {
+ // Integers are always SCEVable.
+ if (Ty->isInteger())
+ return true;
+
+ // Pointers are SCEVable if TargetData information is available
+ // to provide pointer size information.
+ if (isa<PointerType>(Ty))
+ return TD != NULL;
+
+ // Otherwise it's not SCEVable.
+ return false;
+}
+
+/// getTypeSizeInBits - Return the size in bits of the specified type,
+/// for which isSCEVable must return true.
+uint64_t ScalarEvolution::getTypeSizeInBits(const Type *Ty) const {
+ assert(isSCEVable(Ty) && "Type is not SCEVable!");
+
+ // If we have a TargetData, use it!
+ if (TD)
+ return TD->getTypeSizeInBits(Ty);
+
+ // Otherwise, we support only integer types.
+ assert(Ty->isInteger() && "isSCEVable permitted a non-SCEVable type!");
+ return Ty->getPrimitiveSizeInBits();
+}
+
+/// getEffectiveSCEVType - Return a type with the same bitwidth as
+/// the given type and which represents how SCEV will treat the given
+/// type, for which isSCEVable must return true. For pointer types,
+/// this is the pointer-sized integer type.
+const Type *ScalarEvolution::getEffectiveSCEVType(const Type *Ty) const {
+ assert(isSCEVable(Ty) && "Type is not SCEVable!");
+
+ if (Ty->isInteger())
+ return Ty;
+
+ assert(isa<PointerType>(Ty) && "Unexpected non-pointer non-integer type!");
+ return TD->getIntPtrType();
+}
+
+SCEVHandle ScalarEvolution::getCouldNotCompute() {
+ return UnknownValue;
+}
+
+/// hasSCEV - Return true if the SCEV for this value has already been
+/// computed.
+bool ScalarEvolution::hasSCEV(Value *V) const {
+ return Scalars.count(V);
+}
+
+/// getSCEV - Return an existing SCEV if it exists, otherwise analyze the
+/// expression and create a new one.
+SCEVHandle ScalarEvolution::getSCEV(Value *V) {
+ assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
+
+ std::map<SCEVCallbackVH, SCEVHandle>::iterator I = Scalars.find(V);
+ if (I != Scalars.end()) return I->second;
+ SCEVHandle S = createSCEV(V);
+ Scalars.insert(std::make_pair(SCEVCallbackVH(V, this), S));
+ return S;
+}
+
+/// getIntegerSCEV - Given an integer or FP type, create a constant for the
+/// specified signed integer value and return a SCEV for the constant.
+SCEVHandle ScalarEvolution::getIntegerSCEV(int Val, const Type *Ty) {
+ Ty = getEffectiveSCEVType(Ty);
+ Constant *C;
+ if (Val == 0)
+ C = Constant::getNullValue(Ty);
+ else if (Ty->isFloatingPoint())
+ C = ConstantFP::get(APFloat(Ty==Type::FloatTy ? APFloat::IEEEsingle :
+ APFloat::IEEEdouble, Val));
+ else
+ C = ConstantInt::get(Ty, Val);
+ return getUnknown(C);
+}
+
+/// getNegativeSCEV - Return a SCEV corresponding to -V = -1*V
+///
+SCEVHandle ScalarEvolution::getNegativeSCEV(const SCEVHandle &V) {
+ if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
+ return getUnknown(ConstantExpr::getNeg(VC->getValue()));
+
+ const Type *Ty = V->getType();
+ Ty = getEffectiveSCEVType(Ty);
+ return getMulExpr(V, getConstant(ConstantInt::getAllOnesValue(Ty)));
+}
+
+/// getNotSCEV - Return a SCEV corresponding to ~V = -1-V
+SCEVHandle ScalarEvolution::getNotSCEV(const SCEVHandle &V) {
+ if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
+ return getUnknown(ConstantExpr::getNot(VC->getValue()));
+
+ const Type *Ty = V->getType();
+ Ty = getEffectiveSCEVType(Ty);
+ SCEVHandle AllOnes = getConstant(ConstantInt::getAllOnesValue(Ty));
+ return getMinusSCEV(AllOnes, V);
+}
+
+/// getMinusSCEV - Return a SCEV corresponding to LHS - RHS.
+///
+SCEVHandle ScalarEvolution::getMinusSCEV(const SCEVHandle &LHS,
+ const SCEVHandle &RHS) {
+ // X - Y --> X + -Y
+ return getAddExpr(LHS, getNegativeSCEV(RHS));
+}
+
+/// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type. If the type must be extended, it is zero
+/// extended.
+SCEVHandle
+ScalarEvolution::getTruncateOrZeroExtend(const SCEVHandle &V,
+ const Type *Ty) {
+ const Type *SrcTy = V->getType();
+ assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
+ (Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
+ "Cannot truncate or zero extend with non-integer arguments!");
+ if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+ return V; // No conversion
+ if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty))
+ return getTruncateExpr(V, Ty);
+ return getZeroExtendExpr(V, Ty);
+}
+
+/// getTruncateOrSignExtend - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type. If the type must be extended, it is sign
+/// extended.
+SCEVHandle
+ScalarEvolution::getTruncateOrSignExtend(const SCEVHandle &V,
+ const Type *Ty) {
+ const Type *SrcTy = V->getType();
+ assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
+ (Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
+ "Cannot truncate or zero extend with non-integer arguments!");
+ if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+ return V; // No conversion
+ if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty))
+ return getTruncateExpr(V, Ty);
+ return getSignExtendExpr(V, Ty);
+}
+
+/// getNoopOrZeroExtend - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type. If the type must be extended, it is zero
+/// extended. The conversion must not be narrowing.
+SCEVHandle
+ScalarEvolution::getNoopOrZeroExtend(const SCEVHandle &V, const Type *Ty) {
+ const Type *SrcTy = V->getType();
+ assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
+ (Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
+ "Cannot noop or zero extend with non-integer arguments!");
+ assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
+ "getNoopOrZeroExtend cannot truncate!");
+ if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+ return V; // No conversion
+ return getZeroExtendExpr(V, Ty);
+}
+
+/// getNoopOrSignExtend - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type. If the type must be extended, it is sign
+/// extended. The conversion must not be narrowing.
+SCEVHandle
+ScalarEvolution::getNoopOrSignExtend(const SCEVHandle &V, const Type *Ty) {
+ const Type *SrcTy = V->getType();
+ assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
+ (Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
+ "Cannot noop or sign extend with non-integer arguments!");
+ assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
+ "getNoopOrSignExtend cannot truncate!");
+ if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+ return V; // No conversion
+ return getSignExtendExpr(V, Ty);
+}
+
+/// getTruncateOrNoop - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type. The conversion must not be widening.
+SCEVHandle
+ScalarEvolution::getTruncateOrNoop(const SCEVHandle &V, const Type *Ty) {
+ const Type *SrcTy = V->getType();
+ assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
+ (Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
+ "Cannot truncate or noop with non-integer arguments!");
+ assert(getTypeSizeInBits(SrcTy) >= getTypeSizeInBits(Ty) &&
+ "getTruncateOrNoop cannot extend!");
+ if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+ return V; // No conversion
+ return getTruncateExpr(V, Ty);
+}
+
+/// ReplaceSymbolicValueWithConcrete - This looks up the computed SCEV value for
+/// the specified instruction and replaces any references to the symbolic value
+/// SymName with the specified value. This is used during PHI resolution.
+void ScalarEvolution::
+ReplaceSymbolicValueWithConcrete(Instruction *I, const SCEVHandle &SymName,
+ const SCEVHandle &NewVal) {
+ std::map<SCEVCallbackVH, SCEVHandle>::iterator SI =
+ Scalars.find(SCEVCallbackVH(I, this));
+ if (SI == Scalars.end()) return;
+
+ SCEVHandle NV =
+ SI->second->replaceSymbolicValuesWithConcrete(SymName, NewVal, *this);
+ if (NV == SI->second) return; // No change.
+
+ SI->second = NV; // Update the scalars map!
+
+ // Any instruction values that use this instruction might also need to be
+ // updated!
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI)
+ ReplaceSymbolicValueWithConcrete(cast<Instruction>(*UI), SymName, NewVal);
+}
+
+/// createNodeForPHI - PHI nodes have two cases. Either the PHI node exists in
+/// a loop header, making it a potential recurrence, or it doesn't.
+///
+SCEVHandle ScalarEvolution::createNodeForPHI(PHINode *PN) {
+ if (PN->getNumIncomingValues() == 2) // The loops have been canonicalized.
+ if (const Loop *L = LI->getLoopFor(PN->getParent()))
+ if (L->getHeader() == PN->getParent()) {
+ // If it lives in the loop header, it has two incoming values, one
+ // from outside the loop, and one from inside.
+ unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
+ unsigned BackEdge = IncomingEdge^1;
+
+ // While we are analyzing this PHI node, handle its value symbolically.
+ SCEVHandle SymbolicName = getUnknown(PN);
+ assert(Scalars.find(PN) == Scalars.end() &&
+ "PHI node already processed?");
+ Scalars.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName));
+
+ // Using this symbolic name for the PHI, analyze the value coming around
+ // the back-edge.
+ SCEVHandle BEValue = getSCEV(PN->getIncomingValue(BackEdge));
+
+ // NOTE: If BEValue is loop invariant, we know that the PHI node just
+ // has a special value for the first iteration of the loop.
+
+ // If the value coming around the backedge is an add with the symbolic
+ // value we just inserted, then we found a simple induction variable!
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) {
+ // If there is a single occurrence of the symbolic value, replace it
+ // with a recurrence.
+ unsigned FoundIndex = Add->getNumOperands();
+ for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+ if (Add->getOperand(i) == SymbolicName)
+ if (FoundIndex == e) {
+ FoundIndex = i;
+ break;
+ }
+
+ if (FoundIndex != Add->getNumOperands()) {
+ // Create an add with everything but the specified operand.
+ std::vector<SCEVHandle> Ops;
+ for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+ if (i != FoundIndex)
+ Ops.push_back(Add->getOperand(i));
+ SCEVHandle Accum = getAddExpr(Ops);
+
+ // This is not a valid addrec if the step amount is varying each
+ // loop iteration, but is not itself an addrec in this loop.
+ if (Accum->isLoopInvariant(L) ||
+ (isa<SCEVAddRecExpr>(Accum) &&
+ cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) {
+ SCEVHandle StartVal = getSCEV(PN->getIncomingValue(IncomingEdge));
+ SCEVHandle PHISCEV = getAddRecExpr(StartVal, Accum, L);
+
+ // Okay, for the entire analysis of this edge we assumed the PHI
+ // to be symbolic. We now need to go back and update all of the
+ // entries for the scalars that use the PHI (except for the PHI
+ // itself) to use the new analyzed value instead of the "symbolic"
+ // value.
+ ReplaceSymbolicValueWithConcrete(PN, SymbolicName, PHISCEV);
+ return PHISCEV;
+ }
+ }
+ } else if (const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(BEValue)) {
+ // Otherwise, this could be a loop like this:
+ // i = 0; for (j = 1; ..; ++j) { .... i = j; }
+ // In this case, j = {1,+,1} and BEValue is j.
+ // Because the other in-value of i (0) fits the evolution of BEValue,
+ // i really is an addrec evolution.
+ if (AddRec->getLoop() == L && AddRec->isAffine()) {
+ SCEVHandle StartVal = getSCEV(PN->getIncomingValue(IncomingEdge));
+
+ // If StartVal = j.start - j.stride, we can use StartVal as the
+ // start value of the addrec evolution.
+ if (StartVal == getMinusSCEV(AddRec->getOperand(0),
+ AddRec->getOperand(1))) {
+ SCEVHandle PHISCEV =
+ getAddRecExpr(StartVal, AddRec->getOperand(1), L);
+
+ // Okay, for the entire analysis of this edge we assumed the PHI
+ // to be symbolic. We now need to go back and update all of the
+ // entries for the scalars that use the PHI (except for the PHI
+ // itself) to use the new analyzed value instead of the "symbolic"
+ // value.
+ ReplaceSymbolicValueWithConcrete(PN, SymbolicName, PHISCEV);
+ return PHISCEV;
+ }
+ }
+ }
+
+ return SymbolicName;
+ }
+
+ // If it's not a loop phi, we can't handle it yet.
+ return getUnknown(PN);
+}
+
+/// createNodeForGEP - Expand GEP instructions into add and multiply
+/// operations. This allows them to be analyzed by regular SCEV code.
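+/// For example, assuming a target where i32 has a 4-byte allocation size,
+/// a gep of i32* %p by a single index %i maps to %p + 4*%i.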
+///
+SCEVHandle ScalarEvolution::createNodeForGEP(User *GEP) {
+
+ const Type *IntPtrTy = TD->getIntPtrType();
+ Value *Base = GEP->getOperand(0);
+ // Don't attempt to analyze GEPs over unsized objects.
+ if (!cast<PointerType>(Base->getType())->getElementType()->isSized())
+ return getUnknown(GEP);
+ SCEVHandle TotalOffset = getIntegerSCEV(0, IntPtrTy);
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (GetElementPtrInst::op_iterator I = next(GEP->op_begin()),
+ E = GEP->op_end();
+ I != E; ++I) {
+ Value *Index = *I;
+ // Compute the (potentially symbolic) offset in bytes for this index.
+ if (const StructType *STy = dyn_cast<StructType>(*GTI++)) {
+ // For a struct, add the member offset.
+ const StructLayout &SL = *TD->getStructLayout(STy);
+ unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
+ uint64_t Offset = SL.getElementOffset(FieldNo);
+ TotalOffset = getAddExpr(TotalOffset,
+ getIntegerSCEV(Offset, IntPtrTy));
+ } else {
+ // For an array, add the element offset, explicitly scaled.
+ SCEVHandle LocalOffset = getSCEV(Index);
+ if (!isa<PointerType>(LocalOffset->getType()))
+ // Getelementptr indices are signed.
+ LocalOffset = getTruncateOrSignExtend(LocalOffset,
+ IntPtrTy);
+ LocalOffset =
+ getMulExpr(LocalOffset,
+ getIntegerSCEV(TD->getTypeAllocSize(*GTI),
+ IntPtrTy));
+ TotalOffset = getAddExpr(TotalOffset, LocalOffset);
+ }
+ }
+ return getAddExpr(getSCEV(Base), TotalOffset);
+}
+
+/// GetMinTrailingZeros - Determine the minimum number of zero bits that S is
+/// guaranteed to end in (at every loop iteration). It is, at the same time,
+/// the minimum number of times S is divisible by 2. For example, given {4,+,8}
+/// it returns 2. If S is guaranteed to be 0, it returns the bitwidth of S.
+static uint32_t GetMinTrailingZeros(SCEVHandle S, const ScalarEvolution &SE) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
+ return C->getValue()->getValue().countTrailingZeros();
+
+ if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(S))
+ return std::min(GetMinTrailingZeros(T->getOperand(), SE),
+ (uint32_t)SE.getTypeSizeInBits(T->getType()));
+
+ if (const SCEVZeroExtendExpr *E = dyn_cast<SCEVZeroExtendExpr>(S)) {
+ uint32_t OpRes = GetMinTrailingZeros(E->getOperand(), SE);
+ return OpRes == SE.getTypeSizeInBits(E->getOperand()->getType()) ?
+ SE.getTypeSizeInBits(E->getType()) : OpRes;
+ }
+
+ if (const SCEVSignExtendExpr *E = dyn_cast<SCEVSignExtendExpr>(S)) {
+ uint32_t OpRes = GetMinTrailingZeros(E->getOperand(), SE);
+ return OpRes == SE.getTypeSizeInBits(E->getOperand()->getType()) ?
+ SE.getTypeSizeInBits(E->getType()) : OpRes;
+ }
+
+ if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) {
+ // The result is the min of all operands' results.
+ uint32_t MinOpRes = GetMinTrailingZeros(A->getOperand(0), SE);
+ for (unsigned i = 1, e = A->getNumOperands(); MinOpRes && i != e; ++i)
+ MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(A->getOperand(i), SE));
+ return MinOpRes;
+ }
+
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+ // The result is the sum of all operands' results.
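+ // For example, 4*3 = 12 ends in 2 zero bits: 2 (from 4) plus 0 (from 3).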
+ uint32_t SumOpRes = GetMinTrailingZeros(M->getOperand(0), SE);
+ uint32_t BitWidth = SE.getTypeSizeInBits(M->getType());
+ for (unsigned i = 1, e = M->getNumOperands();
+ SumOpRes != BitWidth && i != e; ++i)
+ SumOpRes = std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i), SE),
+ BitWidth);
+ return SumOpRes;
+ }
+
+ if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
+ // The result is the min of all operands' results.
+ uint32_t MinOpRes = GetMinTrailingZeros(A->getOperand(0), SE);
+ for (unsigned i = 1, e = A->getNumOperands(); MinOpRes && i != e; ++i)
+ MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(A->getOperand(i), SE));
+ return MinOpRes;
+ }
+
+ if (const SCEVSMaxExpr *M = dyn_cast<SCEVSMaxExpr>(S)) {
+ // The result is the min of all operands' results.
+ uint32_t MinOpRes = GetMinTrailingZeros(M->getOperand(0), SE);
+ for (unsigned i = 1, e = M->getNumOperands(); MinOpRes && i != e; ++i)
+ MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(M->getOperand(i), SE));
+ return MinOpRes;
+ }
+
+ if (const SCEVUMaxExpr *M = dyn_cast<SCEVUMaxExpr>(S)) {
+ // The result is the min of all operands' results.
+ uint32_t MinOpRes = GetMinTrailingZeros(M->getOperand(0), SE);
+ for (unsigned i = 1, e = M->getNumOperands(); MinOpRes && i != e; ++i)
+ MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(M->getOperand(i), SE));
+ return MinOpRes;
+ }
+
+ // SCEVUDivExpr, SCEVUnknown
+ return 0;
+}
+
+/// createSCEV - We know that there is no SCEV for the specified value.
+/// Analyze the expression.
+///
+SCEVHandle ScalarEvolution::createSCEV(Value *V) {
+ if (!isSCEVable(V->getType()))
+ return getUnknown(V);
+
+ unsigned Opcode = Instruction::UserOp1;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Opcode = I->getOpcode();
+ else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ Opcode = CE->getOpcode();
+ else
+ return getUnknown(V);
+
+ User *U = cast<User>(V);
+ switch (Opcode) {
+ case Instruction::Add:
+ return getAddExpr(getSCEV(U->getOperand(0)),
+ getSCEV(U->getOperand(1)));
+ case Instruction::Mul:
+ return getMulExpr(getSCEV(U->getOperand(0)),
+ getSCEV(U->getOperand(1)));
+ case Instruction::UDiv:
+ return getUDivExpr(getSCEV(U->getOperand(0)),
+ getSCEV(U->getOperand(1)));
+ case Instruction::Sub:
+ return getMinusSCEV(getSCEV(U->getOperand(0)),
+ getSCEV(U->getOperand(1)));
+ case Instruction::And:
+ // For an expression like x&255 that merely masks off the high bits,
+ // use zext(trunc(x)) as the SCEV expression.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ if (CI->isNullValue())
+ return getSCEV(U->getOperand(1));
+ if (CI->isAllOnesValue())
+ return getSCEV(U->getOperand(0));
+ const APInt &A = CI->getValue();
+ unsigned Ones = A.countTrailingOnes();
+ if (APIntOps::isMask(Ones, A))
+ return
+ getZeroExtendExpr(getTruncateExpr(getSCEV(U->getOperand(0)),
+ IntegerType::get(Ones)),
+ U->getType());
+ }
+ break;
+ case Instruction::Or:
+ // If the RHS of the Or is a constant, we may have something like:
+ // X*4+1 which got turned into X*4|1. Handle this as an Add so loop
+ // optimizations will transparently handle this case.
+ //
+ // In order for this transformation to be safe, the LHS must be of the
+ // form X*(2^n) and the Or constant must be less than 2^n.
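+ // For example, if X*4 is known to end in two zero bits and the constant
+ // is 1 (which is less than 4), no bits overlap, so X*4|1 == X*4+1.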
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ SCEVHandle LHS = getSCEV(U->getOperand(0));
+ const APInt &CIVal = CI->getValue();
+ if (GetMinTrailingZeros(LHS, *this) >=
+ (CIVal.getBitWidth() - CIVal.countLeadingZeros()))
+ return getAddExpr(LHS, getSCEV(U->getOperand(1)));
+ }
+ break;
+ case Instruction::Xor:
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ // If the RHS of the xor is a signbit, then this is just an add.
+ // Instcombine turns add of signbit into xor as a strength reduction step.
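+ // For example, in i8 arithmetic x ^ 0x80 equals x + 0x80 for every x:
+ // adding the sign bit never carries into the lower bits.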
+ if (CI->getValue().isSignBit())
+ return getAddExpr(getSCEV(U->getOperand(0)),
+ getSCEV(U->getOperand(1)));
+
+ // If the RHS of xor is -1, then this is a not operation.
+ if (CI->isAllOnesValue())
+ return getNotSCEV(getSCEV(U->getOperand(0)));
+
+ // Model xor(and(x, C), C) as and(~x, C), if C is a low-bits mask.
+ // This is a variant of the check for xor with -1, and it handles
+ // the case where instcombine has trimmed non-demanded bits out
+ // of an xor with -1.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U->getOperand(0)))
+ if (ConstantInt *LCI = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO->getOpcode() == Instruction::And &&
+ LCI->getValue() == CI->getValue())
+ if (const SCEVZeroExtendExpr *Z =
+ dyn_cast<SCEVZeroExtendExpr>(getSCEV(U->getOperand(0))))
+ return getZeroExtendExpr(getNotSCEV(Z->getOperand()),
+ U->getType());
+ }
+ break;
+
+ case Instruction::Shl:
+ // Turn shift left of a constant amount into a multiply.
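+ // For example, X << 3 becomes X * 8; the constant built below is 1 << 3.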
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+ Constant *X = ConstantInt::get(
+ APInt(BitWidth, 1).shl(SA->getLimitedValue(BitWidth)));
+ return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(X));
+ }
+ break;
+
+ case Instruction::LShr:
+ // Turn logical shift right of a constant into an unsigned divide.
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+ Constant *X = ConstantInt::get(
+ APInt(BitWidth, 1).shl(SA->getLimitedValue(BitWidth)));
+ return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(X));
+ }
+ break;
+
+ case Instruction::AShr:
+ // For a two-shift sext-inreg, use sext(trunc(x)) as the SCEV expression.
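+ // For example, in i32, (X << 24) >>s 24 has Amt = 32 - 24 = 8 and
+ // becomes sext(trunc(X to i8) to i32).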
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1)))
+ if (Instruction *L = dyn_cast<Instruction>(U->getOperand(0)))
+ if (L->getOpcode() == Instruction::Shl &&
+ L->getOperand(1) == U->getOperand(1)) {
+ unsigned BitWidth = getTypeSizeInBits(U->getType());
+ uint64_t Amt = BitWidth - CI->getZExtValue();
+ if (Amt == BitWidth)
+ return getSCEV(L->getOperand(0)); // shift by zero --> noop
+ if (Amt > BitWidth)
+ return getIntegerSCEV(0, U->getType()); // value is undefined
+ return
+ getSignExtendExpr(getTruncateExpr(getSCEV(L->getOperand(0)),
+ IntegerType::get(Amt)),
+ U->getType());
+ }
+ break;
+
+ case Instruction::Trunc:
+ return getTruncateExpr(getSCEV(U->getOperand(0)), U->getType());
+
+ case Instruction::ZExt:
+ return getZeroExtendExpr(getSCEV(U->getOperand(0)), U->getType());
+
+ case Instruction::SExt:
+ return getSignExtendExpr(getSCEV(U->getOperand(0)), U->getType());
+
+ case Instruction::BitCast:
+ // BitCasts are no-op casts so we just eliminate the cast.
+ if (isSCEVable(U->getType()) && isSCEVable(U->getOperand(0)->getType()))
+ return getSCEV(U->getOperand(0));
+ break;
+
+ case Instruction::IntToPtr:
+ if (!TD) break; // Without TD we can't analyze pointers.
+ return getTruncateOrZeroExtend(getSCEV(U->getOperand(0)),
+ TD->getIntPtrType());
+
+ case Instruction::PtrToInt:
+ if (!TD) break; // Without TD we can't analyze pointers.
+ return getTruncateOrZeroExtend(getSCEV(U->getOperand(0)),
+ U->getType());
+
+ case Instruction::GetElementPtr:
+ if (!TD) break; // Without TD we can't analyze pointers.
+ return createNodeForGEP(U);
+
+ case Instruction::PHI:
+ return createNodeForPHI(cast<PHINode>(U));
+
+ case Instruction::Select:
+ // This could be a smax or umax that was lowered earlier.
+ // Try to recover it.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(U->getOperand(0))) {
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+ switch (ICI->getPredicate()) {
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_SLE:
+ std::swap(LHS, RHS);
+ // fall through
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_SGE:
+ if (LHS == U->getOperand(1) && RHS == U->getOperand(2))
+ return getSMaxExpr(getSCEV(LHS), getSCEV(RHS));
+ else if (LHS == U->getOperand(2) && RHS == U->getOperand(1))
+ // ~smax(~x, ~y) == smin(x, y).
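+ // (Since ~z = -1-z, ~smax(~x, ~y) = -1 - max(-1-x, -1-y) = min(x, y).)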
+ return getNotSCEV(getSMaxExpr(
+ getNotSCEV(getSCEV(LHS)),
+ getNotSCEV(getSCEV(RHS))));
+ break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_ULE:
+ std::swap(LHS, RHS);
+ // fall through
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_UGE:
+ if (LHS == U->getOperand(1) && RHS == U->getOperand(2))
+ return getUMaxExpr(getSCEV(LHS), getSCEV(RHS));
+ else if (LHS == U->getOperand(2) && RHS == U->getOperand(1))
+ // ~umax(~x, ~y) == umin(x, y)
+ return getNotSCEV(getUMaxExpr(getNotSCEV(getSCEV(LHS)),
+ getNotSCEV(getSCEV(RHS))));
+ break;
+ default:
+ break;
+ }
+ }
+
+ default: // We cannot analyze this expression.
+ break;
+ }
+
+ return getUnknown(V);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Iteration Count Computation Code
+//
+
+/// getBackedgeTakenCount - If the specified loop has a predictable
+/// backedge-taken count, return it, otherwise return a SCEVCouldNotCompute
+/// object. The backedge-taken count is the number of times the loop header
+/// will be branched to from within the loop. This is one less than the
+/// trip count of the loop, since it doesn't count the first iteration,
+/// when the header is branched to from outside the loop.
+///
+/// Note that it is not valid to call this method on a loop without a
+/// loop-invariant backedge-taken count (see
+/// hasLoopInvariantBackedgeTakenCount).
+///
+SCEVHandle ScalarEvolution::getBackedgeTakenCount(const Loop *L) {
+ return getBackedgeTakenInfo(L).Exact;
+}
+
+/// getMaxBackedgeTakenCount - Similar to getBackedgeTakenCount, except
+/// return the least SCEV value that is known never to be less than the
+/// actual backedge taken count.
+SCEVHandle ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) {
+ return getBackedgeTakenInfo(L).Max;
+}
+
+const ScalarEvolution::BackedgeTakenInfo &
+ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
+ // Initially insert a CouldNotCompute for this loop. If the insertion
+ // succeeds, proceed to actually compute a backedge-taken count and
+ // update the value. The temporary CouldNotCompute value tells SCEV
+ // code elsewhere that it shouldn't attempt to request a new
+ // backedge-taken count, which could result in infinite recursion.
+ std::pair<std::map<const Loop*, BackedgeTakenInfo>::iterator, bool> Pair =
+ BackedgeTakenCounts.insert(std::make_pair(L, getCouldNotCompute()));
+ if (Pair.second) {
+ BackedgeTakenInfo ItCount = ComputeBackedgeTakenCount(L);
+ if (ItCount.Exact != UnknownValue) {
+ assert(ItCount.Exact->isLoopInvariant(L) &&
+ ItCount.Max->isLoopInvariant(L) &&
+ "Computed trip count isn't loop invariant for loop!");
+ ++NumTripCountsComputed;
+
+ // Update the value in the map.
+ Pair.first->second = ItCount;
+ } else if (isa<PHINode>(L->getHeader()->begin())) {
+ // Only count loops that have phi nodes as not being computable.
+ ++NumTripCountsNotComputed;
+ }
+
+ // Now that we know more about the trip count for this loop, forget any
+ // existing SCEV values for PHI nodes in this loop since they are only
+ // conservative estimates made without the benefit
+ // of trip count information.
+ if (ItCount.hasAnyInfo())
+ forgetLoopPHIs(L);
+ }
+ return Pair.first->second;
+}
+
+/// forgetLoopBackedgeTakenCount - This method should be called by the
+/// client when it has changed a loop in a way that may affect
+/// ScalarEvolution's ability to compute a trip count, or if the loop
+/// is deleted.
+void ScalarEvolution::forgetLoopBackedgeTakenCount(const Loop *L) {
+ BackedgeTakenCounts.erase(L);
+ forgetLoopPHIs(L);
+}
+
+/// forgetLoopPHIs - Delete the memoized SCEVs associated with the
+/// PHI nodes in the given loop. This is used when the trip count of
+/// the loop may have changed.
+void ScalarEvolution::forgetLoopPHIs(const Loop *L) {
+ BasicBlock *Header = L->getHeader();
+
+ // Push all Loop-header PHIs onto the Worklist stack, except those
+ // that are presently represented via a SCEVUnknown. SCEVUnknown for
+ // a PHI either means that it has an unrecognized structure, or it's
+ // a PHI that's in the process of being computed by createNodeForPHI.
+ // In the former case, additional loop trip count information isn't
+ // going to change anything. In the latter case, createNodeForPHI will
+ // perform the necessary updates on its own when it gets to that point.
+ SmallVector<Instruction *, 16> Worklist;
+ for (BasicBlock::iterator I = Header->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ std::map<SCEVCallbackVH, SCEVHandle>::iterator It = Scalars.find((Value*)I);
+ if (It != Scalars.end() && !isa<SCEVUnknown>(It->second))
+ Worklist.push_back(PN);
+ }
+
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ if (Scalars.erase(I))
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI)
+ Worklist.push_back(cast<Instruction>(UI));
+ }
+}
+
+/// ComputeBackedgeTakenCount - Compute the number of times the backedge
+/// of the specified loop will execute.
+ScalarEvolution::BackedgeTakenInfo
+ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) {
+ // If the loop doesn't have exactly one exit block, we can't analyze it.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() != 1) return UnknownValue;
+
+ // Okay, there is one exit block. Try to find the condition that causes the
+ // loop to be exited.
+ BasicBlock *ExitBlock = ExitBlocks[0];
+
+ BasicBlock *ExitingBlock = 0;
+ for (pred_iterator PI = pred_begin(ExitBlock), E = pred_end(ExitBlock);
+ PI != E; ++PI)
+ if (L->contains(*PI)) {
+ if (ExitingBlock == 0)
+ ExitingBlock = *PI;
+ else
+ return UnknownValue; // More than one block exiting!
+ }
+ assert(ExitingBlock && "No exits from loop, something is broken!");
+
+ // Okay, we've computed the exiting block. See what condition causes us to
+ // exit.
+ //
+ // FIXME: we should be able to handle switch instructions (with a single exit)
+ BranchInst *ExitBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (ExitBr == 0) return UnknownValue;
+ assert(ExitBr->isConditional() && "If unconditional, it can't be in loop!");
+
+ // At this point, we know we have a conditional branch that determines whether
+ // the loop is exited. However, we don't know if the branch is executed each
+ // time through the loop. If not, then the execution count of the branch will
+ // not be equal to the trip count of the loop.
+ //
+ // Currently we check for this by checking to see if the Exit branch goes to
+ // the loop header. If so, we know it will always execute the same number of
+ // times as the loop. We also handle the case where the exit block *is* the
+ // loop header. This is common for un-rotated loops. More extensive analysis
+ // could be done to handle more cases here.
+ if (ExitBr->getSuccessor(0) != L->getHeader() &&
+ ExitBr->getSuccessor(1) != L->getHeader() &&
+ ExitBr->getParent() != L->getHeader())
+ return UnknownValue;
+
+ ICmpInst *ExitCond = dyn_cast<ICmpInst>(ExitBr->getCondition());
+
+ // If it's not an integer or pointer comparison then compute it the hard way.
+ if (ExitCond == 0)
+ return ComputeBackedgeTakenCountExhaustively(L, ExitBr->getCondition(),
+ ExitBr->getSuccessor(0) == ExitBlock);
+
+ // If the condition was exit on true, convert the condition to exit on false
+ ICmpInst::Predicate Cond;
+ if (ExitBr->getSuccessor(1) == ExitBlock)
+ Cond = ExitCond->getPredicate();
+ else
+ Cond = ExitCond->getInversePredicate();
+
+ // Handle common loops like: for (X = "string"; *X; ++X)
+ if (LoadInst *LI = dyn_cast<LoadInst>(ExitCond->getOperand(0)))
+ if (Constant *RHS = dyn_cast<Constant>(ExitCond->getOperand(1))) {
+ SCEVHandle ItCnt =
+ ComputeLoadConstantCompareBackedgeTakenCount(LI, RHS, L, Cond);
+ if (!isa<SCEVCouldNotCompute>(ItCnt)) return ItCnt;
+ }
+
+ SCEVHandle LHS = getSCEV(ExitCond->getOperand(0));
+ SCEVHandle RHS = getSCEV(ExitCond->getOperand(1));
+
+ // Try to evaluate any dependencies out of the loop.
+ LHS = getSCEVAtScope(LHS, L);
+ RHS = getSCEVAtScope(RHS, L);
+
+ // At this point, we would like to compute how many iterations of the
+ // loop the predicate will return true for these inputs.
+ if (LHS->isLoopInvariant(L) && !RHS->isLoopInvariant(L)) {
+ // If only the LHS is loop invariant, swap it into the RHS.
+ std::swap(LHS, RHS);
+ Cond = ICmpInst::getSwappedPredicate(Cond);
+ }
+
+ // If we have a comparison of a chrec against a constant, try to use value
+ // ranges to answer this query.
+ if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS))
+ if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(LHS))
+ if (AddRec->getLoop() == L) {
+ // Form the constant range.
+ ConstantRange CompRange(
+ ICmpInst::makeConstantRange(Cond, RHSC->getValue()->getValue()));
+
+ SCEVHandle Ret = AddRec->getNumIterationsInRange(CompRange, *this);
+ if (!isa<SCEVCouldNotCompute>(Ret)) return Ret;
+ }
+
+ switch (Cond) {
+ case ICmpInst::ICMP_NE: { // while (X != Y)
+ // Convert to: while (X-Y != 0)
+ SCEVHandle TC = HowFarToZero(getMinusSCEV(LHS, RHS), L);
+ if (!isa<SCEVCouldNotCompute>(TC)) return TC;
+ break;
+ }
+ case ICmpInst::ICMP_EQ: { // while (X == Y)
+ // Convert to: while (X-Y == 0)
+ SCEVHandle TC = HowFarToNonZero(getMinusSCEV(LHS, RHS), L);
+ if (!isa<SCEVCouldNotCompute>(TC)) return TC;
+ break;
+ }
+ case ICmpInst::ICMP_SLT: {
+ BackedgeTakenInfo BTI = HowManyLessThans(LHS, RHS, L, true);
+ if (BTI.hasAnyInfo()) return BTI;
+ break;
+ }
+ case ICmpInst::ICMP_SGT: {
+ BackedgeTakenInfo BTI = HowManyLessThans(getNotSCEV(LHS),
+ getNotSCEV(RHS), L, true);
+ if (BTI.hasAnyInfo()) return BTI;
+ break;
+ }
+ case ICmpInst::ICMP_ULT: {
+ BackedgeTakenInfo BTI = HowManyLessThans(LHS, RHS, L, false);
+ if (BTI.hasAnyInfo()) return BTI;
+ break;
+ }
+ case ICmpInst::ICMP_UGT: {
+ BackedgeTakenInfo BTI = HowManyLessThans(getNotSCEV(LHS),
+ getNotSCEV(RHS), L, false);
+ if (BTI.hasAnyInfo()) return BTI;
+ break;
+ }
+ default:
+#if 0
+ errs() << "ComputeBackedgeTakenCount ";
+ if (ExitCond->getOperand(0)->getType()->isUnsigned())
+ errs() << "[unsigned] ";
+ errs() << *LHS << " "
+ << Instruction::getOpcodeName(Instruction::ICmp)
+ << " " << *RHS << "\n";
+#endif
+ break;
+ }
+ return
+ ComputeBackedgeTakenCountExhaustively(L, ExitCond,
+ ExitBr->getSuccessor(0) == ExitBlock);
+}
+
+static ConstantInt *
+EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C,
+ ScalarEvolution &SE) {
+ SCEVHandle InVal = SE.getConstant(C);
+ SCEVHandle Val = AddRec->evaluateAtIteration(InVal, SE);
+ assert(isa<SCEVConstant>(Val) &&
+ "Evaluation of SCEV at constant didn't fold correctly?");
+ return cast<SCEVConstant>(Val)->getValue();
+}
+
+/// GetAddressedElementFromGlobal - Given a global variable with an initializer
+/// and a GEP expression (missing the pointer index) indexing into it, return
+/// the addressed element of the initializer or null if the index expression is
+/// invalid.
+static Constant *
+GetAddressedElementFromGlobal(GlobalVariable *GV,
+ const std::vector<ConstantInt*> &Indices) {
+ Constant *Init = GV->getInitializer();
+ for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
+ uint64_t Idx = Indices[i]->getZExtValue();
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Init)) {
+ assert(Idx < CS->getNumOperands() && "Bad struct index!");
+ Init = cast<Constant>(CS->getOperand(Idx));
+ } else if (ConstantArray *CA = dyn_cast<ConstantArray>(Init)) {
+ if (Idx >= CA->getNumOperands()) return 0; // Bogus program
+ Init = cast<Constant>(CA->getOperand(Idx));
+ } else if (isa<ConstantAggregateZero>(Init)) {
+ if (const StructType *STy = dyn_cast<StructType>(Init->getType())) {
+ assert(Idx < STy->getNumElements() && "Bad struct index!");
+ Init = Constant::getNullValue(STy->getElementType(Idx));
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Init->getType())) {
+ if (Idx >= ATy->getNumElements()) return 0; // Bogus program
+ Init = Constant::getNullValue(ATy->getElementType());
+ } else {
+ assert(0 && "Unknown constant aggregate type!");
+ }
+ return 0;
+ } else {
+ return 0; // Unknown initializer type
+ }
+ }
+ return Init;
+}
+
+/// ComputeLoadConstantCompareBackedgeTakenCount - Given an exit condition of
+/// 'icmp op load X, cst', try to see if we can compute the backedge
+/// execution count.
+SCEVHandle ScalarEvolution::
+ComputeLoadConstantCompareBackedgeTakenCount(LoadInst *LI, Constant *RHS,
+ const Loop *L,
+ ICmpInst::Predicate predicate) {
+ if (LI->isVolatile()) return UnknownValue;
+
+ // Check to see if the loaded pointer is a getelementptr of a global.
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0));
+ if (!GEP) return UnknownValue;
+
+ // Make sure that it is really a constant global we are gepping, with an
+ // initializer, and make sure the first IDX is really 0.
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
+ if (!GV || !GV->isConstant() || !GV->hasInitializer() ||
+ GEP->getNumOperands() < 3 || !isa<Constant>(GEP->getOperand(1)) ||
+ !cast<Constant>(GEP->getOperand(1))->isNullValue())
+ return UnknownValue;
+
+ // Okay, we allow one non-constant index into the GEP instruction.
+ Value *VarIdx = 0;
+ std::vector<ConstantInt*> Indexes;
+ unsigned VarIdxNum = 0;
+ for (unsigned i = 2, e = GEP->getNumOperands(); i != e; ++i)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+ Indexes.push_back(CI);
+ } else if (!isa<ConstantInt>(GEP->getOperand(i))) {
+ if (VarIdx) return UnknownValue; // Multiple non-constant idx's.
+ VarIdx = GEP->getOperand(i);
+ VarIdxNum = i-2;
+ Indexes.push_back(0);
+ }
+
+ // Okay, we know we have a (load (gep GV, 0, X)) comparison with a constant.
+ // Check to see if X is now a loop-variant value.
+ SCEVHandle Idx = getSCEV(VarIdx);
+ Idx = getSCEVAtScope(Idx, L);
+
+ // We can only recognize very limited forms of loop index expressions, in
+ // particular, only affine AddRec's like {C1,+,C2}.
+ const SCEVAddRecExpr *IdxExpr = dyn_cast<SCEVAddRecExpr>(Idx);
+ if (!IdxExpr || !IdxExpr->isAffine() || IdxExpr->isLoopInvariant(L) ||
+ !isa<SCEVConstant>(IdxExpr->getOperand(0)) ||
+ !isa<SCEVConstant>(IdxExpr->getOperand(1)))
+ return UnknownValue;
+
+ unsigned MaxSteps = MaxBruteForceIterations;
+ for (unsigned IterationNum = 0; IterationNum != MaxSteps; ++IterationNum) {
+ ConstantInt *ItCst =
+ ConstantInt::get(IdxExpr->getType(), IterationNum);
+ ConstantInt *Val = EvaluateConstantChrecAtConstant(IdxExpr, ItCst, *this);
+
+ // Form the GEP offset.
+ Indexes[VarIdxNum] = Val;
+
+ Constant *Result = GetAddressedElementFromGlobal(GV, Indexes);
+ if (Result == 0) break; // Cannot compute!
+
+ // Evaluate the condition for this iteration.
+ Result = ConstantExpr::getICmp(predicate, Result, RHS);
+ if (!isa<ConstantInt>(Result)) break; // Couldn't decide for sure
+ if (cast<ConstantInt>(Result)->getValue().isMinValue()) {
+#if 0
+ errs() << "\n***\n*** Computed loop count " << *ItCst
+ << "\n*** From global " << *GV << "*** BB: " << *L->getHeader()
+ << "***\n";
+#endif
+ ++NumArrayLenItCounts;
+ return getConstant(ItCst); // Found terminating iteration!
+ }
+ }
+ return UnknownValue;
+}
+
+
+/// CanConstantFold - Return true if we can constant fold an instruction of the
+/// specified type, assuming that all operands were constants.
+static bool CanConstantFold(const Instruction *I) {
+ if (isa<BinaryOperator>(I) || isa<CmpInst>(I) ||
+ isa<SelectInst>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I))
+ return true;
+
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (const Function *F = CI->getCalledFunction())
+ return canConstantFoldCallTo(F);
+ return false;
+}
+
+/// getConstantEvolvingPHI - Given an LLVM value and a loop, return a PHI node
+/// in the loop that V is derived from. We allow arbitrary operations along the
+/// way, but the operands of an operation must be either constants or values
+/// derived from a constant PHI. If this expression does not fit with these
+/// constraints, return null.
+static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
+ // If this is not an instruction, or if this is an instruction outside of the
+ // loop, it can't be derived from a loop PHI.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I == 0 || !L->contains(I->getParent())) return 0;
+
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (L->getHeader() == I->getParent())
+ return PN;
+ else
+ // We don't currently keep track of the control flow needed to evaluate
+ // PHIs, so we cannot handle PHIs inside of loops.
+ return 0;
+ }
+
+ // If we won't be able to constant fold this expression even if the operands
+ // are constants, return early.
+ if (!CanConstantFold(I)) return 0;
+
+ // Otherwise, we can evaluate this instruction if all of its operands are
+ // constant or derived from a PHI node themselves.
+ PHINode *PHI = 0;
+ for (unsigned Op = 0, e = I->getNumOperands(); Op != e; ++Op)
+ if (!(isa<Constant>(I->getOperand(Op)) ||
+ isa<GlobalValue>(I->getOperand(Op)))) {
+ PHINode *P = getConstantEvolvingPHI(I->getOperand(Op), L);
+ if (P == 0) return 0; // Not evolving from PHI
+ if (PHI == 0)
+ PHI = P;
+ else if (PHI != P)
+ return 0; // Evolving from multiple different PHIs.
+ }
+
+ // This is an expression evolving from a constant PHI!
+ return PHI;
+}
+
+/// EvaluateExpression - Given an expression that passes the
+/// getConstantEvolvingPHI predicate, evaluate its value assuming the PHI node
+/// in the loop has the value PHIVal. If we can't fold this expression for some
+/// reason, return null.
+static Constant *EvaluateExpression(Value *V, Constant *PHIVal) {
+ if (isa<PHINode>(V)) return PHIVal;
+ if (Constant *C = dyn_cast<Constant>(V)) return C;
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) return GV;
+ Instruction *I = cast<Instruction>(V);
+
+ std::vector<Constant*> Operands;
+ Operands.resize(I->getNumOperands());
+
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Operands[i] = EvaluateExpression(I->getOperand(i), PHIVal);
+ if (Operands[i] == 0) return 0;
+ }
+
+ if (const CmpInst *CI = dyn_cast<CmpInst>(I))
+ return ConstantFoldCompareInstOperands(CI->getPredicate(),
+ &Operands[0], Operands.size());
+ else
+ return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
+ &Operands[0], Operands.size());
+}
+
+/// getConstantEvolutionLoopExitValue - If we know that the specified PHI is
+/// in the header of its containing loop, that the loop executes a
+/// constant number of times, and that the PHI node is just a recurrence
+/// involving constants, fold it.
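+///
+/// For example (illustrative): a PHI starting at 0 whose backedge value is
+/// (PHI * 2 + 1), in a loop whose backedge is taken 4 times, evolves through
+/// 0, 1, 3, 7 and folds to the exit value 15.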
+Constant *ScalarEvolution::
+getConstantEvolutionLoopExitValue(PHINode *PN, const APInt& BEs, const Loop *L){
+ std::map<PHINode*, Constant*>::iterator I =
+ ConstantEvolutionLoopExitValue.find(PN);
+ if (I != ConstantEvolutionLoopExitValue.end())
+ return I->second;
+
+  if (BEs.ugt(APInt(BEs.getBitWidth(), MaxBruteForceIterations)))
+ return ConstantEvolutionLoopExitValue[PN] = 0; // Not going to evaluate it.
+
+ Constant *&RetVal = ConstantEvolutionLoopExitValue[PN];
+
+ // Since the loop is canonicalized, the PHI node must have two entries. One
+ // entry must be a constant (coming in from outside of the loop), and the
+ // second must be derived from the same PHI.
+ bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1));
+ Constant *StartCST =
+ dyn_cast<Constant>(PN->getIncomingValue(!SecondIsBackedge));
+ if (StartCST == 0)
+ return RetVal = 0; // Must be a constant.
+
+ Value *BEValue = PN->getIncomingValue(SecondIsBackedge);
+ PHINode *PN2 = getConstantEvolvingPHI(BEValue, L);
+ if (PN2 != PN)
+ return RetVal = 0; // Not derived from same PHI.
+
+ // Execute the loop symbolically to determine the exit value.
+ if (BEs.getActiveBits() >= 32)
+ return RetVal = 0; // More than 2^32-1 iterations?? Not doing it!
+
+ unsigned NumIterations = BEs.getZExtValue(); // must be in range
+ unsigned IterationNum = 0;
+ for (Constant *PHIVal = StartCST; ; ++IterationNum) {
+ if (IterationNum == NumIterations)
+ return RetVal = PHIVal; // Got exit value!
+
+ // Compute the value of the PHI node for the next iteration.
+ Constant *NextPHI = EvaluateExpression(BEValue, PHIVal);
+ if (NextPHI == PHIVal)
+ return RetVal = NextPHI; // Stopped evolving!
+ if (NextPHI == 0)
+ return 0; // Couldn't evaluate!
+ PHIVal = NextPHI;
+ }
+}
+
+/// ComputeBackedgeTakenCountExhaustively - If the loop is known to execute a
+/// constant number of times (the condition evolves only from constants),
+/// try to evaluate a few iterations of the loop until the exit condition
+/// gets a value of ExitWhen (true or false). If we cannot evaluate the
+/// trip count of the loop, return UnknownValue.
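+///
+/// For example (illustrative): with a PHI starting at 1, a backedge value of
+/// (PHI * 3), and an exit condition of (PHI == 27) with ExitWhen true, the
+/// PHI evolves through 1, 3, 9, 27 and the backedge-taken count is 3.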
+SCEVHandle ScalarEvolution::
+ComputeBackedgeTakenCountExhaustively(const Loop *L, Value *Cond, bool ExitWhen) {
+ PHINode *PN = getConstantEvolvingPHI(Cond, L);
+ if (PN == 0) return UnknownValue;
+
+ // Since the loop is canonicalized, the PHI node must have two entries. One
+ // entry must be a constant (coming in from outside of the loop), and the
+ // second must be derived from the same PHI.
+ bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1));
+ Constant *StartCST =
+ dyn_cast<Constant>(PN->getIncomingValue(!SecondIsBackedge));
+ if (StartCST == 0) return UnknownValue; // Must be a constant.
+
+ Value *BEValue = PN->getIncomingValue(SecondIsBackedge);
+ PHINode *PN2 = getConstantEvolvingPHI(BEValue, L);
+ if (PN2 != PN) return UnknownValue; // Not derived from same PHI.
+
+  // Okay, we found a PHI node that defines the trip count of this loop. Execute
+ // the loop symbolically to determine when the condition gets a value of
+ // "ExitWhen".
+ unsigned IterationNum = 0;
+ unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis.
+ for (Constant *PHIVal = StartCST;
+ IterationNum != MaxIterations; ++IterationNum) {
+ ConstantInt *CondVal =
+ dyn_cast_or_null<ConstantInt>(EvaluateExpression(Cond, PHIVal));
+
+ // Couldn't symbolically evaluate.
+ if (!CondVal) return UnknownValue;
+
+ if (CondVal->getValue() == uint64_t(ExitWhen)) {
+ ConstantEvolutionLoopExitValue[PN] = PHIVal;
+ ++NumBruteForceTripCountsComputed;
+ return getConstant(ConstantInt::get(Type::Int32Ty, IterationNum));
+ }
+
+ // Compute the value of the PHI node for the next iteration.
+ Constant *NextPHI = EvaluateExpression(BEValue, PHIVal);
+ if (NextPHI == 0 || NextPHI == PHIVal)
+ return UnknownValue; // Couldn't evaluate or not making progress...
+ PHIVal = NextPHI;
+ }
+
+ // Too many iterations were needed to evaluate.
+ return UnknownValue;
+}
+
+/// getSCEVAtScope - Return a SCEV expression handle for the specified value
+/// at the specified scope in the program. The L value specifies the loop
+/// nest in which to evaluate the expression: null means the top level, and
+/// a non-null loop means the point immediately inside that loop.
+///
+/// This method can be used to compute the exit value for a variable defined
+/// in a loop by querying what the value will hold in the parent loop.
+///
+/// In the case that a relevant loop exit value cannot be computed, the
+/// original value V is returned.
+SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
+ // FIXME: this should be turned into a virtual method on SCEV!
+
+ if (isa<SCEVConstant>(V)) return V;
+
+ // If this instruction is evolved from a constant-evolving PHI, compute the
+ // exit value from the loop without using SCEVs.
+ if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V)) {
+ if (Instruction *I = dyn_cast<Instruction>(SU->getValue())) {
+ const Loop *LI = (*this->LI)[I->getParent()];
+ if (LI && LI->getParentLoop() == L) // Looking for loop exit value.
+ if (PHINode *PN = dyn_cast<PHINode>(I))
+ if (PN->getParent() == LI->getHeader()) {
+ // Okay, there is no closed form solution for the PHI node. Check
+ // to see if the loop that contains it has a known backedge-taken
+ // count. If so, we may be able to force computation of the exit
+ // value.
+ SCEVHandle BackedgeTakenCount = getBackedgeTakenCount(LI);
+ if (const SCEVConstant *BTCC =
+ dyn_cast<SCEVConstant>(BackedgeTakenCount)) {
+ // Okay, we know how many times the containing loop executes. If
+ // this is a constant evolving PHI node, get the final value at
+ // the specified iteration number.
+ Constant *RV = getConstantEvolutionLoopExitValue(PN,
+ BTCC->getValue()->getValue(),
+ LI);
+ if (RV) return getUnknown(RV);
+ }
+ }
+
+ // Okay, this is an expression that we cannot symbolically evaluate
+ // into a SCEV. Check to see if it's possible to symbolically evaluate
+ // the arguments into constants, and if so, try to constant propagate the
+ // result. This is particularly useful for computing loop exit values.
+ if (CanConstantFold(I)) {
+ // Check to see if we've folded this instruction at this loop before.
+ std::map<const Loop *, Constant *> &Values = ValuesAtScopes[I];
+ std::pair<std::map<const Loop *, Constant *>::iterator, bool> Pair =
+ Values.insert(std::make_pair(L, static_cast<Constant *>(0)));
+ if (!Pair.second)
+ return Pair.first->second ? &*getUnknown(Pair.first->second) : V;
+
+ std::vector<Constant*> Operands;
+ Operands.reserve(I->getNumOperands());
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *Op = I->getOperand(i);
+ if (Constant *C = dyn_cast<Constant>(Op)) {
+ Operands.push_back(C);
+ } else {
+          // If an operand is non-constant and its type is neither an
+          // integer nor a pointer, don't even try to analyze it with
+          // SCEV techniques.
+ if (!isSCEVable(Op->getType()))
+ return V;
+
+ SCEVHandle OpV = getSCEVAtScope(getSCEV(Op), L);
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(OpV)) {
+ Constant *C = SC->getValue();
+ if (C->getType() != Op->getType())
+ C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+ Op->getType(),
+ false),
+ C, Op->getType());
+ Operands.push_back(C);
+ } else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(OpV)) {
+ if (Constant *C = dyn_cast<Constant>(SU->getValue())) {
+ if (C->getType() != Op->getType())
+ C =
+ ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+ Op->getType(),
+ false),
+ C, Op->getType());
+ Operands.push_back(C);
+ } else
+ return V;
+ } else {
+ return V;
+ }
+ }
+ }
+
+ Constant *C;
+ if (const CmpInst *CI = dyn_cast<CmpInst>(I))
+ C = ConstantFoldCompareInstOperands(CI->getPredicate(),
+ &Operands[0], Operands.size());
+ else
+ C = ConstantFoldInstOperands(I->getOpcode(), I->getType(),
+ &Operands[0], Operands.size());
+ Pair.first->second = C;
+ return getUnknown(C);
+ }
+ }
+
+ // This is some other type of SCEVUnknown, just return it.
+ return V;
+ }
+
+ if (const SCEVCommutativeExpr *Comm = dyn_cast<SCEVCommutativeExpr>(V)) {
+ // Avoid performing the look-up in the common case where the specified
+ // expression has no loop-variant portions.
+ for (unsigned i = 0, e = Comm->getNumOperands(); i != e; ++i) {
+ SCEVHandle OpAtScope = getSCEVAtScope(Comm->getOperand(i), L);
+ if (OpAtScope != Comm->getOperand(i)) {
+ // Okay, at least one of these operands is loop variant but might be
+ // foldable. Build a new instance of the folded commutative expression.
+ std::vector<SCEVHandle> NewOps(Comm->op_begin(), Comm->op_begin()+i);
+ NewOps.push_back(OpAtScope);
+
+ for (++i; i != e; ++i) {
+ OpAtScope = getSCEVAtScope(Comm->getOperand(i), L);
+ NewOps.push_back(OpAtScope);
+ }
+ if (isa<SCEVAddExpr>(Comm))
+ return getAddExpr(NewOps);
+ if (isa<SCEVMulExpr>(Comm))
+ return getMulExpr(NewOps);
+ if (isa<SCEVSMaxExpr>(Comm))
+ return getSMaxExpr(NewOps);
+ if (isa<SCEVUMaxExpr>(Comm))
+ return getUMaxExpr(NewOps);
+ assert(0 && "Unknown commutative SCEV type!");
+ }
+ }
+ // If we got here, all operands are loop invariant.
+ return Comm;
+ }
+
+ if (const SCEVUDivExpr *Div = dyn_cast<SCEVUDivExpr>(V)) {
+ SCEVHandle LHS = getSCEVAtScope(Div->getLHS(), L);
+ SCEVHandle RHS = getSCEVAtScope(Div->getRHS(), L);
+ if (LHS == Div->getLHS() && RHS == Div->getRHS())
+ return Div; // must be loop invariant
+ return getUDivExpr(LHS, RHS);
+ }
+
+ // If this is a loop recurrence for a loop that does not contain L, then we
+ // are dealing with the final value computed by the loop.
+ if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V)) {
+ if (!L || !AddRec->getLoop()->contains(L->getHeader())) {
+ // To evaluate this recurrence, we need to know how many times the AddRec
+ // loop iterates. Compute this now.
+ SCEVHandle BackedgeTakenCount = getBackedgeTakenCount(AddRec->getLoop());
+ if (BackedgeTakenCount == UnknownValue) return AddRec;
+
+ // Then, evaluate the AddRec.
+ return AddRec->evaluateAtIteration(BackedgeTakenCount, *this);
+ }
+ return AddRec;
+ }
+
+ if (const SCEVZeroExtendExpr *Cast = dyn_cast<SCEVZeroExtendExpr>(V)) {
+ SCEVHandle Op = getSCEVAtScope(Cast->getOperand(), L);
+ if (Op == Cast->getOperand())
+ return Cast; // must be loop invariant
+ return getZeroExtendExpr(Op, Cast->getType());
+ }
+
+ if (const SCEVSignExtendExpr *Cast = dyn_cast<SCEVSignExtendExpr>(V)) {
+ SCEVHandle Op = getSCEVAtScope(Cast->getOperand(), L);
+ if (Op == Cast->getOperand())
+ return Cast; // must be loop invariant
+ return getSignExtendExpr(Op, Cast->getType());
+ }
+
+ if (const SCEVTruncateExpr *Cast = dyn_cast<SCEVTruncateExpr>(V)) {
+ SCEVHandle Op = getSCEVAtScope(Cast->getOperand(), L);
+ if (Op == Cast->getOperand())
+ return Cast; // must be loop invariant
+ return getTruncateExpr(Op, Cast->getType());
+ }
+
+ assert(0 && "Unknown SCEV type!");
+ return 0;
+}
+
+/// getSCEVAtScope - This is a convenience function which does
+/// getSCEVAtScope(getSCEV(V), L).
+SCEVHandle ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
+ return getSCEVAtScope(getSCEV(V), L);
+}
+
+/// SolveLinEquationWithOverflow - Finds the minimum unsigned root of the
+/// following equation:
+///
+/// A * X = B (mod N)
+///
+/// where N = 2^BW and BW is the common bit width of A and B. The signedness of
+/// A and B isn't important.
+///
+/// If the equation does not have a solution, SCEVCouldNotCompute is returned.
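+///
+/// Worked example: solve 6 * X = 4 (mod 16), i.e. A = 6, B = 4, BW = 4.
+/// D = gcd(6, 16) = 2 (Mult2 = 1), and B has two trailing zeros, so a
+/// solution exists. A/D = 3, N/D = 8, and the multiplicative inverse of 3
+/// modulo 8 is 3 (since 3 * 3 = 9 = 1 (mod 8)), so X = (3 * (4/2)) mod 8 = 6.
+/// Indeed 6 * 6 = 36 = 4 (mod 16), and 6 is the minimum unsigned solution;
+/// the others are 6 + k * 8.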
+static SCEVHandle SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
+ ScalarEvolution &SE) {
+ uint32_t BW = A.getBitWidth();
+ assert(BW == B.getBitWidth() && "Bit widths must be the same.");
+ assert(A != 0 && "A must be non-zero.");
+
+ // 1. D = gcd(A, N)
+ //
+  // The gcd of A and N can have only one prime factor: 2. The number of
+  // trailing zeros in A is its multiplicity.
+ uint32_t Mult2 = A.countTrailingZeros();
+ // D = 2^Mult2
+
+ // 2. Check if B is divisible by D.
+ //
+  // B is divisible by D if and only if the multiplicity of the prime factor 2
+  // in B is not less than its multiplicity in D.
+ if (B.countTrailingZeros() < Mult2)
+ return SE.getCouldNotCompute();
+
+ // 3. Compute I: the multiplicative inverse of (A / D) in arithmetic
+ // modulo (N / D).
+ //
+ // (N / D) may need BW+1 bits in its representation. Hence, we'll use this
+ // bit width during computations.
+ APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D
+ APInt Mod(BW + 1, 0);
+ Mod.set(BW - Mult2); // Mod = N / D
+ APInt I = AD.multiplicativeInverse(Mod);
+
+ // 4. Compute the minimum unsigned root of the equation:
+ // I * (B / D) mod (N / D)
+ APInt Result = (I * B.lshr(Mult2).zext(BW + 1)).urem(Mod);
+
+ // The result is guaranteed to be less than 2^BW so we may truncate it to BW
+ // bits.
+ return SE.getConstant(Result.trunc(BW));
+}
+
+/// SolveQuadraticEquation - Find the roots of the quadratic equation for the
+/// given quadratic chrec {L,+,M,+,N}. This returns either the two roots (which
+/// might be the same) or two SCEVCouldNotCompute objects.
+///
+static std::pair<SCEVHandle,SCEVHandle>
+SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
+ assert(AddRec->getNumOperands() == 3 && "This is not a quadratic chrec!");
+ const SCEVConstant *LC = dyn_cast<SCEVConstant>(AddRec->getOperand(0));
+ const SCEVConstant *MC = dyn_cast<SCEVConstant>(AddRec->getOperand(1));
+ const SCEVConstant *NC = dyn_cast<SCEVConstant>(AddRec->getOperand(2));
+
+ // We currently can only solve this if the coefficients are constants.
+ if (!LC || !MC || !NC) {
+ const SCEV *CNC = SE.getCouldNotCompute();
+ return std::make_pair(CNC, CNC);
+ }
+
+ uint32_t BitWidth = LC->getValue()->getValue().getBitWidth();
+ const APInt &L = LC->getValue()->getValue();
+ const APInt &M = MC->getValue()->getValue();
+ const APInt &N = NC->getValue()->getValue();
+ APInt Two(BitWidth, 2);
+ APInt Four(BitWidth, 4);
+
+ {
+ using namespace APIntOps;
+ const APInt& C = L;
+ // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
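+    // (The chrec {L,+,M,+,N} evaluated at iteration x is
+    //    L + M*x + N*x*(x-1)/2 = (N/2)*x^2 + (M - N/2)*x + L,
+    // which gives A = N/2, B = M - N/2, and C = L below.)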
+ // The B coefficient is M-N/2
+ APInt B(M);
+ B -= sdiv(N,Two);
+
+ // The A coefficient is N/2
+ APInt A(N.sdiv(Two));
+
+    // Compute the B^2-4AC term.
+ APInt SqrtTerm(B);
+ SqrtTerm *= B;
+ SqrtTerm -= Four * (A * C);
+
+    // Compute sqrt(B^2-4AC). This is guaranteed to be the nearest
+    // integer value or else APInt::sqrt() will assert.
+ APInt SqrtVal(SqrtTerm.sqrt());
+
+ // Compute the two solutions for the quadratic formula.
+ // The divisions must be performed as signed divisions.
+ APInt NegB(-B);
+ APInt TwoA( A << 1 );
+ if (TwoA.isMinValue()) {
+ const SCEV *CNC = SE.getCouldNotCompute();
+ return std::make_pair(CNC, CNC);
+ }
+
+ ConstantInt *Solution1 = ConstantInt::get((NegB + SqrtVal).sdiv(TwoA));
+ ConstantInt *Solution2 = ConstantInt::get((NegB - SqrtVal).sdiv(TwoA));
+
+ return std::make_pair(SE.getConstant(Solution1),
+ SE.getConstant(Solution2));
+  } // end of scope using APIntOps
+}
+
+/// HowFarToZero - Return the number of times a backedge comparing the specified
+/// value to zero will execute. If not computable, return UnknownValue.
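+///
+/// For example (illustrative): {10,+,-1} reaches zero after 10 backedges
+/// (the all-ones unitary-step case below), and {-5,+,1} after 5.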
+SCEVHandle ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
+ // If the value is a constant
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
+ // If the value is already zero, the branch will execute zero times.
+ if (C->getValue()->isZero()) return C;
+ return UnknownValue; // Otherwise it will loop infinitely.
+ }
+
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V);
+ if (!AddRec || AddRec->getLoop() != L)
+ return UnknownValue;
+
+ if (AddRec->isAffine()) {
+ // If this is an affine expression, the execution count of this branch is
+ // the minimum unsigned root of the following equation:
+ //
+ // Start + Step*N = 0 (mod 2^BW)
+ //
+ // equivalent to:
+ //
+ // Step*N = -Start (mod 2^BW)
+ //
+ // where BW is the common bit width of Start and Step.
+
+ // Get the initial value for the loop.
+ SCEVHandle Start = getSCEVAtScope(AddRec->getStart(), L->getParentLoop());
+ SCEVHandle Step = getSCEVAtScope(AddRec->getOperand(1), L->getParentLoop());
+
+ if (const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step)) {
+ // For now we handle only constant steps.
+
+ // First, handle unitary steps.
+ if (StepC->getValue()->equalsInt(1)) // 1*N = -Start (mod 2^BW), so:
+ return getNegativeSCEV(Start); // N = -Start (as unsigned)
+ if (StepC->getValue()->isAllOnesValue()) // -1*N = -Start (mod 2^BW), so:
+ return Start; // N = Start (as unsigned)
+
+ // Then, try to solve the above equation provided that Start is constant.
+ if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start))
+ return SolveLinEquationWithOverflow(StepC->getValue()->getValue(),
+ -StartC->getValue()->getValue(),
+ *this);
+ }
+ } else if (AddRec->isQuadratic() && AddRec->getType()->isInteger()) {
+ // If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of
+ // the quadratic equation to solve it.
+ std::pair<SCEVHandle,SCEVHandle> Roots = SolveQuadraticEquation(AddRec,
+ *this);
+ const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
+ const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
+ if (R1) {
+#if 0
+ errs() << "HFTZ: " << *V << " - sol#1: " << *R1
+ << " sol#2: " << *R2 << "\n";
+#endif
+ // Pick the smallest positive root value.
+ if (ConstantInt *CB =
+ dyn_cast<ConstantInt>(ConstantExpr::getICmp(ICmpInst::ICMP_ULT,
+ R1->getValue(), R2->getValue()))) {
+        if (!CB->getZExtValue())
+ std::swap(R1, R2); // R1 is the minimum root now.
+
+ // We can only use this value if the chrec ends up with an exact zero
+ // value at this index. When solving for "X*X != 5", for example, we
+ // should not accept a root of 2.
+ SCEVHandle Val = AddRec->evaluateAtIteration(R1, *this);
+ if (Val->isZero())
+ return R1; // We found a quadratic root!
+ }
+ }
+ }
+
+ return UnknownValue;
+}
+
+/// HowFarToNonZero - Return the number of times a backedge checking the
+/// specified value for nonzero will execute. If not computable, return
+/// UnknownValue.
+SCEVHandle ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) {
+ // Loops that look like: while (X == 0) are very strange indeed. We don't
+ // handle them yet except for the trivial case. This could be expanded in the
+ // future as needed.
+
+ // If the value is a constant, check to see if it is known to be non-zero
+ // already. If so, the backedge will execute zero times.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
+ if (!C->getValue()->isNullValue())
+ return getIntegerSCEV(0, C->getType());
+ return UnknownValue; // Otherwise it will loop infinitely.
+ }
+
+ // We could implement others, but I really doubt anyone writes loops like
+ // this, and if they did, they would already be constant folded.
+ return UnknownValue;
+}
+
+/// getLoopPredecessor - If the given loop's header has exactly one unique
+/// predecessor outside the loop, return it. Otherwise return null.
+///
+BasicBlock *ScalarEvolution::getLoopPredecessor(const Loop *L) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Pred = 0;
+ for (pred_iterator PI = pred_begin(Header), E = pred_end(Header);
+ PI != E; ++PI)
+ if (!L->contains(*PI)) {
+ if (Pred && Pred != *PI) return 0; // Multiple predecessors.
+ Pred = *PI;
+ }
+ return Pred;
+}
+
+/// getPredecessorWithUniqueSuccessorForBB - Return a predecessor of BB
+/// (which may not be an immediate predecessor) which has exactly one
+/// successor from which BB is reachable, or null if no such block is
+/// found.
+///
+BasicBlock *
+ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) {
+ // If the block has a unique predecessor, then there is no path from the
+ // predecessor to the block that does not go through the direct edge
+ // from the predecessor to the block.
+ if (BasicBlock *Pred = BB->getSinglePredecessor())
+ return Pred;
+
+ // A loop's header is defined to be a block that dominates the loop.
+ // If the header has a unique predecessor outside the loop, it must be
+ // a block that has exactly one successor that can reach the loop.
+ if (Loop *L = LI->getLoopFor(BB))
+ return getLoopPredecessor(L);
+
+ return 0;
+}
+
+/// isLoopGuardedByCond - Test whether entry to the loop is protected by
+/// a conditional between LHS and RHS. This is used to help avoid max
+/// expressions in loop trip counts.
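+///
+/// For example (illustrative): in code of the form
+///   if (0 < n) { i = 0; do { ... ++i; } while (i < n); }
+/// the guarding branch establishes 0 < n on entry, which lets callers such
+/// as HowManyLessThans avoid a max expression in the computed trip count.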
+bool ScalarEvolution::isLoopGuardedByCond(const Loop *L,
+ ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS) {
+ // Interpret a null as meaning no loop, where there is obviously no guard
+ // (interprocedural conditions notwithstanding).
+ if (!L) return false;
+
+ BasicBlock *Predecessor = getLoopPredecessor(L);
+ BasicBlock *PredecessorDest = L->getHeader();
+
+ // Starting at the loop predecessor, climb up the predecessor chain, as long
+ // as there are predecessors that can be found that have unique successors
+ // leading to the original header.
+ for (; Predecessor;
+ PredecessorDest = Predecessor,
+ Predecessor = getPredecessorWithUniqueSuccessorForBB(Predecessor)) {
+
+ BranchInst *LoopEntryPredicate =
+ dyn_cast<BranchInst>(Predecessor->getTerminator());
+ if (!LoopEntryPredicate ||
+ LoopEntryPredicate->isUnconditional())
+ continue;
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LoopEntryPredicate->getCondition());
+ if (!ICI) continue;
+
+ // Now that we found a conditional branch that dominates the loop, check to
+ // see if it is the comparison we are looking for.
+ Value *PreCondLHS = ICI->getOperand(0);
+ Value *PreCondRHS = ICI->getOperand(1);
+ ICmpInst::Predicate Cond;
+ if (LoopEntryPredicate->getSuccessor(0) == PredecessorDest)
+ Cond = ICI->getPredicate();
+ else
+ Cond = ICI->getInversePredicate();
+
+ if (Cond == Pred)
+ ; // An exact match.
+ else if (!ICmpInst::isTrueWhenEqual(Cond) && Pred == ICmpInst::ICMP_NE)
+ ; // The actual condition is beyond sufficient.
+ else
+ // Check a few special cases.
+ switch (Cond) {
+ case ICmpInst::ICMP_UGT:
+ if (Pred == ICmpInst::ICMP_ULT) {
+ std::swap(PreCondLHS, PreCondRHS);
+ Cond = ICmpInst::ICMP_ULT;
+ break;
+ }
+ continue;
+ case ICmpInst::ICMP_SGT:
+ if (Pred == ICmpInst::ICMP_SLT) {
+ std::swap(PreCondLHS, PreCondRHS);
+ Cond = ICmpInst::ICMP_SLT;
+ break;
+ }
+ continue;
+ case ICmpInst::ICMP_NE:
+ // Expressions like (x >u 0) are often canonicalized to (x != 0),
+ // so check for this case by checking if the NE is comparing against
+ // a minimum or maximum constant.
+ if (!ICmpInst::isTrueWhenEqual(Pred))
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(PreCondRHS)) {
+ const APInt &A = CI->getValue();
+ switch (Pred) {
+ case ICmpInst::ICMP_SLT:
+ if (A.isMaxSignedValue()) break;
+ continue;
+ case ICmpInst::ICMP_SGT:
+ if (A.isMinSignedValue()) break;
+ continue;
+ case ICmpInst::ICMP_ULT:
+ if (A.isMaxValue()) break;
+ continue;
+ case ICmpInst::ICMP_UGT:
+ if (A.isMinValue()) break;
+ continue;
+ default:
+ continue;
+ }
+ Cond = ICmpInst::ICMP_NE;
+ // NE is symmetric but the original comparison may not be. Swap
+ // the operands if necessary so that they match below.
+ if (isa<SCEVConstant>(LHS))
+ std::swap(PreCondLHS, PreCondRHS);
+ break;
+ }
+ continue;
+ default:
+ // We weren't able to reconcile the condition.
+ continue;
+ }
+
+ if (!PreCondLHS->getType()->isInteger()) continue;
+
+ SCEVHandle PreCondLHSSCEV = getSCEV(PreCondLHS);
+ SCEVHandle PreCondRHSSCEV = getSCEV(PreCondRHS);
+ if ((LHS == PreCondLHSSCEV && RHS == PreCondRHSSCEV) ||
+ (LHS == getNotSCEV(PreCondRHSSCEV) &&
+ RHS == getNotSCEV(PreCondLHSSCEV)))
+ return true;
+ }
+
+ return false;
+}
+
+/// HowManyLessThans - Return the number of times a backedge containing the
+/// specified less-than comparison will execute. If not computable, return
+/// UnknownValue.
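+///
+/// Worked example: for {0,+,2} <s 7 with a suitably guarded loop, Start = 0,
+/// Step = 2, End = 7, and the backedge-taken count is
+/// ((7 - 0) + (2 - 1)) /u 2 = 4: the condition holds for the values
+/// 0, 2, 4, and 6.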
+ScalarEvolution::BackedgeTakenInfo ScalarEvolution::
+HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
+ const Loop *L, bool isSigned) {
+ // Only handle: "ADDREC < LoopInvariant".
+ if (!RHS->isLoopInvariant(L)) return UnknownValue;
+
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(LHS);
+ if (!AddRec || AddRec->getLoop() != L)
+ return UnknownValue;
+
+ if (AddRec->isAffine()) {
+    // FORNOW: We only support constant, positive strides.
+ unsigned BitWidth = getTypeSizeInBits(AddRec->getType());
+ SCEVHandle Step = AddRec->getStepRecurrence(*this);
+ SCEVHandle NegOne = getIntegerSCEV(-1, AddRec->getType());
+
+ // TODO: handle non-constant strides.
+ const SCEVConstant *CStep = dyn_cast<SCEVConstant>(Step);
+ if (!CStep || CStep->isZero())
+ return UnknownValue;
+ if (CStep->isOne()) {
+ // With unit stride, the iteration never steps past the limit value.
+ } else if (CStep->getValue()->getValue().isStrictlyPositive()) {
+ if (const SCEVConstant *CLimit = dyn_cast<SCEVConstant>(RHS)) {
+        // Test whether a positive iteration can step past the limit
+        // value and past the maximum value for its type in a single step.
+ if (isSigned) {
+ APInt Max = APInt::getSignedMaxValue(BitWidth);
+ if ((Max - CStep->getValue()->getValue())
+ .slt(CLimit->getValue()->getValue()))
+ return UnknownValue;
+ } else {
+ APInt Max = APInt::getMaxValue(BitWidth);
+ if ((Max - CStep->getValue()->getValue())
+ .ult(CLimit->getValue()->getValue()))
+ return UnknownValue;
+ }
+ } else
+ // TODO: handle non-constant limit values below.
+ return UnknownValue;
+ } else
+ // TODO: handle negative strides below.
+ return UnknownValue;
+
+ // We know the LHS is of the form {n,+,s} and the RHS is some loop-invariant
+ // m. So, we count the number of iterations in which {n,+,s} < m is true.
+    // Note that we cannot simply return max(m-n,0)/s because it's not safe to
+    // treat m-n as either signed or unsigned due to the possibility of
+    // overflow.
+
+ // First, we get the value of the LHS in the first iteration: n
+ SCEVHandle Start = AddRec->getOperand(0);
+
+ // Determine the minimum constant start value.
+ SCEVHandle MinStart = isa<SCEVConstant>(Start) ? Start :
+ getConstant(isSigned ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth));
+
+ // If we know that the condition is true in order to enter the loop,
+ // then we know that it will run exactly (m-n)/s times. Otherwise, we
+ // only know that it will execute (max(m,n)-n)/s times. In both cases,
+ // the division must round up.
+ SCEVHandle End = RHS;
+ if (!isLoopGuardedByCond(L,
+ isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ getMinusSCEV(Start, Step), RHS))
+ End = isSigned ? getSMaxExpr(RHS, Start)
+ : getUMaxExpr(RHS, Start);
+
+ // Determine the maximum constant end value.
+ SCEVHandle MaxEnd = isa<SCEVConstant>(End) ? End :
+ getConstant(isSigned ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth));
+
+ // Finally, we subtract these two values and divide, rounding up, to get
+ // the number of times the backedge is executed.
+ SCEVHandle BECount = getUDivExpr(getAddExpr(getMinusSCEV(End, Start),
+ getAddExpr(Step, NegOne)),
+ Step);
+
+ // The maximum backedge count is similar, except using the minimum start
+ // value and the maximum end value.
+ SCEVHandle MaxBECount = getUDivExpr(getAddExpr(getMinusSCEV(MaxEnd,
+ MinStart),
+ getAddExpr(Step, NegOne)),
+ Step);
+
+ return BackedgeTakenInfo(BECount, MaxBECount);
+ }
+
+ return UnknownValue;
+}
+
+/// getNumIterationsInRange - Return the number of iterations of this loop that
+/// produce values in the specified constant range. Another way of looking at
+/// this is that it returns the first iteration number where the value is not
+/// in the range, thus computing the exit count. If the iteration count can't
+/// be computed, an instance of SCEVCouldNotCompute is returned.
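+///
+/// For example (illustrative): for {0,+,2} and the range [0,5), the exit
+/// value is (4 + 2) /u 2 = 3: iteration 2 produces 4, which is still in the
+/// range, while iteration 3 produces 6, which is not.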
+SCEVHandle SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
+ ScalarEvolution &SE) const {
+ if (Range.isFullSet()) // Infinite loop.
+ return SE.getCouldNotCompute();
+
+ // If the start is a non-zero constant, shift the range to simplify things.
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(getStart()))
+ if (!SC->getValue()->isZero()) {
+ std::vector<SCEVHandle> Operands(op_begin(), op_end());
+ Operands[0] = SE.getIntegerSCEV(0, SC->getType());
+ SCEVHandle Shifted = SE.getAddRecExpr(Operands, getLoop());
+ if (const SCEVAddRecExpr *ShiftedAddRec =
+ dyn_cast<SCEVAddRecExpr>(Shifted))
+ return ShiftedAddRec->getNumIterationsInRange(
+ Range.subtract(SC->getValue()->getValue()), SE);
+ // This is strange and shouldn't happen.
+ return SE.getCouldNotCompute();
+ }
+
+ // The only time we can solve this is when we have all constant indices.
+ // Otherwise, we cannot determine the overflow conditions.
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (!isa<SCEVConstant>(getOperand(i)))
+ return SE.getCouldNotCompute();
+
+
+  // Okay, at this point we know that all elements of the chrec are constants
+  // and that the start element is zero.
+
+ // First check to see if the range contains zero. If not, the first
+ // iteration exits.
+ unsigned BitWidth = SE.getTypeSizeInBits(getType());
+ if (!Range.contains(APInt(BitWidth, 0)))
+ return SE.getConstant(ConstantInt::get(getType(),0));
+
+ if (isAffine()) {
+ // If this is an affine expression then we have this situation:
+ // Solve {0,+,A} in Range === Ax in Range
+
+    // We know that zero is in the range. If A is positive then we know that
+    // the upper value of the range must be the first possible exit value.
+    // If A is negative then the lower end of the range is the last possible
+    // loop value. Also note that we already checked for a full range.
+ APInt One(BitWidth,1);
+ APInt A = cast<SCEVConstant>(getOperand(1))->getValue()->getValue();
+ APInt End = A.sge(One) ? (Range.getUpper() - One) : Range.getLower();
+
+ // The exit value should be (End+A)/A.
+ APInt ExitVal = (End + A).udiv(A);
+ ConstantInt *ExitValue = ConstantInt::get(ExitVal);
+
+ // Evaluate at the exit value. If we really did fall out of the valid
+ // range, then we computed our trip count, otherwise wrap around or other
+ // things must have happened.
+ ConstantInt *Val = EvaluateConstantChrecAtConstant(this, ExitValue, SE);
+ if (Range.contains(Val->getValue()))
+ return SE.getCouldNotCompute(); // Something strange happened
+
+ // Ensure that the previous value is in the range. This is a sanity check.
+ assert(Range.contains(
+ EvaluateConstantChrecAtConstant(this,
+ ConstantInt::get(ExitVal - One), SE)->getValue()) &&
+ "Linear scev computation is off in a bad way!");
+ return SE.getConstant(ExitValue);
+ } else if (isQuadratic()) {
+ // If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of the
+ // quadratic equation to solve it. To do this, we must frame our problem in
+ // terms of figuring out when zero is crossed, instead of when
+ // Range.getUpper() is crossed.
+ std::vector<SCEVHandle> NewOps(op_begin(), op_end());
+ NewOps[0] = SE.getNegativeSCEV(SE.getConstant(Range.getUpper()));
+ SCEVHandle NewAddRec = SE.getAddRecExpr(NewOps, getLoop());
+
+ // Next, solve the constructed addrec
+ std::pair<SCEVHandle,SCEVHandle> Roots =
+ SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE);
+ const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
+ const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
+ if (R1) {
+ // Pick the smallest positive root value.
+ if (ConstantInt *CB =
+ dyn_cast<ConstantInt>(ConstantExpr::getICmp(ICmpInst::ICMP_ULT,
+ R1->getValue(), R2->getValue()))) {
+        if (!CB->getZExtValue())
+ std::swap(R1, R2); // R1 is the minimum root now.
+
+ // Make sure the root is not off by one. The returned iteration should
+ // not be in the range, but the previous one should be. When solving
+ // for "X*X < 5", for example, we should not return a root of 2.
+ ConstantInt *R1Val = EvaluateConstantChrecAtConstant(this,
+ R1->getValue(),
+ SE);
+ if (Range.contains(R1Val->getValue())) {
+ // The next iteration must be out of the range...
+ ConstantInt *NextVal = ConstantInt::get(R1->getValue()->getValue()+1);
+
+ R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
+ if (!Range.contains(R1Val->getValue()))
+ return SE.getConstant(NextVal);
+ return SE.getCouldNotCompute(); // Something strange happened
+ }
+
+ // If R1 was not in the range, then it is a good return value. Make
+ // sure that R1-1 WAS in the range though, just in case.
+ ConstantInt *NextVal = ConstantInt::get(R1->getValue()->getValue()-1);
+ R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
+ if (Range.contains(R1Val->getValue()))
+ return R1;
+ return SE.getCouldNotCompute(); // Something strange happened
+ }
+ }
+ }
+
+ return SE.getCouldNotCompute();
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// SCEVCallbackVH Class Implementation
+//===----------------------------------------------------------------------===//
+
+void ScalarEvolution::SCEVCallbackVH::deleted() {
+  assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
+ if (PHINode *PN = dyn_cast<PHINode>(getValPtr()))
+ SE->ConstantEvolutionLoopExitValue.erase(PN);
+ if (Instruction *I = dyn_cast<Instruction>(getValPtr()))
+ SE->ValuesAtScopes.erase(I);
+ SE->Scalars.erase(getValPtr());
+ // this now dangles!
+}
+
+void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *) {
+  assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
+
+ // Forget all the expressions associated with users of the old value,
+ // so that future queries will recompute the expressions using the new
+ // value.
+ SmallVector<User *, 16> Worklist;
+ Value *Old = getValPtr();
+ bool DeleteOld = false;
+ for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end();
+ UI != UE; ++UI)
+ Worklist.push_back(*UI);
+ while (!Worklist.empty()) {
+ User *U = Worklist.pop_back_val();
+ // Deleting the Old value will cause this to dangle. Postpone
+ // that until everything else is done.
+ if (U == Old) {
+ DeleteOld = true;
+ continue;
+ }
+ if (PHINode *PN = dyn_cast<PHINode>(U))
+ SE->ConstantEvolutionLoopExitValue.erase(PN);
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ SE->ValuesAtScopes.erase(I);
+ if (SE->Scalars.erase(U))
+ for (Value::use_iterator UI = U->use_begin(), UE = U->use_end();
+ UI != UE; ++UI)
+ Worklist.push_back(*UI);
+ }
+ if (DeleteOld) {
+ if (PHINode *PN = dyn_cast<PHINode>(Old))
+ SE->ConstantEvolutionLoopExitValue.erase(PN);
+ if (Instruction *I = dyn_cast<Instruction>(Old))
+ SE->ValuesAtScopes.erase(I);
+ SE->Scalars.erase(Old);
+ // this now dangles!
+ }
+ // this may dangle!
+}
+
+ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
+ : CallbackVH(V), SE(se) {}
+
+//===----------------------------------------------------------------------===//
+// ScalarEvolution Class Implementation
+//===----------------------------------------------------------------------===//
+
+ScalarEvolution::ScalarEvolution()
+ : FunctionPass(&ID), UnknownValue(new SCEVCouldNotCompute()) {
+}
+
+bool ScalarEvolution::runOnFunction(Function &F) {
+ this->F = &F;
+ LI = &getAnalysis<LoopInfo>();
+ TD = getAnalysisIfAvailable<TargetData>();
+ return false;
+}
+
+void ScalarEvolution::releaseMemory() {
+ Scalars.clear();
+ BackedgeTakenCounts.clear();
+ ConstantEvolutionLoopExitValue.clear();
+ ValuesAtScopes.clear();
+}
+
+void ScalarEvolution::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<LoopInfo>();
+}
+
+bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) {
+ return !isa<SCEVCouldNotCompute>(getBackedgeTakenCount(L));
+}
+
+static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
+ const Loop *L) {
+ // Print all inner loops first
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ PrintLoopInfo(OS, SE, *I);
+
+ OS << "Loop " << L->getHeader()->getName() << ": ";
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() != 1)
+ OS << "<multiple exits> ";
+
+ if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+ OS << "backedge-taken count is " << *SE->getBackedgeTakenCount(L);
+ } else {
+ OS << "Unpredictable backedge-taken count. ";
+ }
+
+ OS << "\n";
+}
+
+void ScalarEvolution::print(raw_ostream &OS, const Module* ) const {
+  // ScalarEvolution's implementation of the print method is to print
+ // out SCEV values of all instructions that are interesting. Doing
+ // this potentially causes it to create new SCEV objects though,
+ // which technically conflicts with the const qualifier. This isn't
+ // observable from outside the class though (the hasSCEV function
+ // notwithstanding), so casting away the const isn't dangerous.
+ ScalarEvolution &SE = *const_cast<ScalarEvolution*>(this);
+
+ OS << "Classifying expressions for: " << F->getName() << "\n";
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (isSCEVable(I->getType())) {
+ OS << *I;
+ OS << " --> ";
+ SCEVHandle SV = SE.getSCEV(&*I);
+ SV->print(OS);
+ OS << "\t\t";
+
+ if (const Loop *L = LI->getLoopFor((*I).getParent())) {
+ OS << "Exits: ";
+ SCEVHandle ExitValue = SE.getSCEVAtScope(&*I, L->getParentLoop());
+ if (!ExitValue->isLoopInvariant(L)) {
+ OS << "<<Unknown>>";
+ } else {
+ OS << *ExitValue;
+ }
+ }
+
+ OS << "\n";
+ }
+
+ OS << "Determining loop execution counts for: " << F->getName() << "\n";
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ PrintLoopInfo(OS, &SE, *I);
+}
+
+void ScalarEvolution::print(std::ostream &o, const Module *M) const {
+ raw_os_ostream OS(o);
+ print(OS, M);
+}
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
new file mode 100644
index 0000000..7ba8268
--- /dev/null
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -0,0 +1,646 @@
+//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the scalar evolution expander,
+// which is used to generate the code corresponding to a given scalar evolution
+// expression.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Target/TargetData.h"
+using namespace llvm;
+
+/// InsertCastOfTo - Insert a cast of V to the specified type, doing what
+/// we can to share the casts.
+Value *SCEVExpander::InsertCastOfTo(Instruction::CastOps opcode, Value *V,
+ const Type *Ty) {
+ // Short-circuit unnecessary bitcasts.
+ if (opcode == Instruction::BitCast && V->getType() == Ty)
+ return V;
+
+ // Short-circuit unnecessary inttoptr<->ptrtoint casts.
+ if ((opcode == Instruction::PtrToInt || opcode == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(V->getType())) {
+ if (CastInst *CI = dyn_cast<CastInst>(V))
+ if ((CI->getOpcode() == Instruction::PtrToInt ||
+ CI->getOpcode() == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(CI->getType()) ==
+ SE.getTypeSizeInBits(CI->getOperand(0)->getType()))
+ return CI->getOperand(0);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if ((CE->getOpcode() == Instruction::PtrToInt ||
+ CE->getOpcode() == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(CE->getType()) ==
+ SE.getTypeSizeInBits(CE->getOperand(0)->getType()))
+ return CE->getOperand(0);
+ }
+
+ // FIXME: keep track of the cast instruction.
+ if (Constant *C = dyn_cast<Constant>(V))
+ return ConstantExpr::getCast(opcode, C, Ty);
+
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ // Check to see if there is already a cast!
+ for (Value::use_iterator UI = A->use_begin(), E = A->use_end();
+ UI != E; ++UI) {
+ if ((*UI)->getType() == Ty)
+ if (CastInst *CI = dyn_cast<CastInst>(cast<Instruction>(*UI)))
+ if (CI->getOpcode() == opcode) {
+ // If the cast isn't the first instruction of the function, move it.
+ if (BasicBlock::iterator(CI) !=
+ A->getParent()->getEntryBlock().begin()) {
+ // If the CastInst is the insert point, change the insert point.
+ if (CI == InsertPt) ++InsertPt;
+ // Splice the cast at the beginning of the entry block.
+ CI->moveBefore(A->getParent()->getEntryBlock().begin());
+ }
+ return CI;
+ }
+ }
+ Instruction *I = CastInst::Create(opcode, V, Ty, V->getName(),
+ A->getParent()->getEntryBlock().begin());
+ InsertedValues.insert(I);
+ return I;
+ }
+
+ Instruction *I = cast<Instruction>(V);
+
+ // Check to see if there is already a cast. If there is, use it.
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI) {
+ if ((*UI)->getType() == Ty)
+ if (CastInst *CI = dyn_cast<CastInst>(cast<Instruction>(*UI)))
+ if (CI->getOpcode() == opcode) {
+ BasicBlock::iterator It = I; ++It;
+ if (isa<InvokeInst>(I))
+ It = cast<InvokeInst>(I)->getNormalDest()->begin();
+ while (isa<PHINode>(It)) ++It;
+ if (It != BasicBlock::iterator(CI)) {
+ // If the CastInst is the insert point, change the insert point.
+ if (CI == InsertPt) ++InsertPt;
+ // Splice the cast immediately after the operand in question.
+ CI->moveBefore(It);
+ }
+ return CI;
+ }
+ }
+ BasicBlock::iterator IP = I; ++IP;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+ IP = II->getNormalDest()->begin();
+ while (isa<PHINode>(IP)) ++IP;
+ Instruction *CI = CastInst::Create(opcode, V, Ty, V->getName(), IP);
+ InsertedValues.insert(CI);
+ return CI;
+}
+
+/// InsertNoopCastOfTo - Insert a cast of V to the specified type,
+/// which must be possible with a noop cast.
+Value *SCEVExpander::InsertNoopCastOfTo(Value *V, const Type *Ty) {
+ Instruction::CastOps Op = CastInst::getCastOpcode(V, false, Ty, false);
+ assert((Op == Instruction::BitCast ||
+ Op == Instruction::PtrToInt ||
+ Op == Instruction::IntToPtr) &&
+ "InsertNoopCastOfTo cannot perform non-noop casts!");
+ assert(SE.getTypeSizeInBits(V->getType()) == SE.getTypeSizeInBits(Ty) &&
+ "InsertNoopCastOfTo cannot change sizes!");
+ return InsertCastOfTo(Op, V, Ty);
+}
+
+/// InsertBinop - Insert the specified binary operator, doing a small amount
+/// of work to avoid inserting an obviously redundant operation.
+Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, Value *LHS,
+ Value *RHS, BasicBlock::iterator InsertPt) {
+ // Fold a binop with constant operands.
+ if (Constant *CLHS = dyn_cast<Constant>(LHS))
+ if (Constant *CRHS = dyn_cast<Constant>(RHS))
+ return ConstantExpr::get(Opcode, CLHS, CRHS);
+
+ // Do a quick scan to see if we have this binop nearby. If so, reuse it.
+ unsigned ScanLimit = 6;
+ BasicBlock::iterator BlockBegin = InsertPt->getParent()->begin();
+ if (InsertPt != BlockBegin) {
+ // Scanning starts from the last instruction before InsertPt.
+ BasicBlock::iterator IP = InsertPt;
+ --IP;
+ for (; ScanLimit; --IP, --ScanLimit) {
+ if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS &&
+ IP->getOperand(1) == RHS)
+ return IP;
+ if (IP == BlockBegin) break;
+ }
+ }
+
+ // If we haven't found this binop, insert it.
+ Instruction *BO = BinaryOperator::Create(Opcode, LHS, RHS, "tmp", InsertPt);
+ InsertedValues.insert(BO);
+ return BO;
+}
+
+/// FactorOutConstant - Test if S is divisible by Factor, using signed
+/// division. If so, update S with Factor divided out and return true.
+/// S need not be evenly divisible if a reasonable remainder can be
+/// computed.
+/// TODO: When ScalarEvolution gets a SCEVSDivExpr, this can be made
+/// unnecessary; in its place, just signed-divide Ops[i] by the scale and
+/// check to see if the divide was folded.
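+///
+/// For example (illustrative): with Factor = 4, {8,+,4} becomes {2,+,1}
+/// with a zero remainder, and the constant 6 becomes 1 with a remainder
+/// of 2 (i.e. 6 = 4*1 + 2).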
+static bool FactorOutConstant(SCEVHandle &S,
+ SCEVHandle &Remainder,
+ const APInt &Factor,
+ ScalarEvolution &SE) {
+ // Everything is divisible by one.
+ if (Factor == 1)
+ return true;
+
+ // For a Constant, check for a multiple of the given factor.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+ ConstantInt *CI =
+ ConstantInt::get(C->getValue()->getValue().sdiv(Factor));
+ // If the quotient is zero and the remainder is non-zero, reject
+ // the value at this scale. It will be considered for subsequent
+ // smaller scales.
+ if (C->isZero() || !CI->isZero()) {
+ SCEVHandle Div = SE.getConstant(CI);
+ S = Div;
+ Remainder =
+ SE.getAddExpr(Remainder,
+ SE.getConstant(C->getValue()->getValue().srem(Factor)));
+ return true;
+ }
+ }
+
+ // In a Mul, check if there is a constant operand which is a multiple
+ // of the given factor.
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+ if (!C->getValue()->getValue().srem(Factor)) {
+ std::vector<SCEVHandle> NewMulOps(M->getOperands());
+ NewMulOps[0] =
+ SE.getConstant(C->getValue()->getValue().sdiv(Factor));
+ S = SE.getMulExpr(NewMulOps);
+ return true;
+ }
+
+ // In an AddRec, check if both start and step are divisible.
+ if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
+ SCEVHandle Step = A->getStepRecurrence(SE);
+ SCEVHandle StepRem = SE.getIntegerSCEV(0, Step->getType());
+ if (!FactorOutConstant(Step, StepRem, Factor, SE))
+ return false;
+ if (!StepRem->isZero())
+ return false;
+ SCEVHandle Start = A->getStart();
+ if (!FactorOutConstant(Start, Remainder, Factor, SE))
+ return false;
+ S = SE.getAddRecExpr(Start, Step, A->getLoop());
+ return true;
+ }
+
+ return false;
+}
+
+/// expandAddToGEP - Expand a SCEVAddExpr with a pointer type into a GEP
+/// instead of using ptrtoint+arithmetic+inttoptr. This helps
+/// BasicAliasAnalysis analyze the result. However, it suffers from the
+/// underlying bug described in PR2831. Addition in LLVM currently always
+/// has two's complement wrapping guaranteed. However, the semantics for
+/// getelementptr overflow are ambiguous. In the common case though, this
+/// expansion gets used when a GEP in the original code has been converted
+/// into integer arithmetic, in which case the resulting code will be no
+/// more undefined than it was originally.
+///
+/// Design note: It might seem desirable for this function to be more
+/// loop-aware. If some of the indices are loop-invariant while others
+/// aren't, it might seem desirable to emit multiple GEPs, keeping the
+/// loop-invariant portions of the overall computation outside the loop.
+/// However, there are a few reasons this is not done here. Hoisting simple
+/// arithmetic is a low-level optimization that often isn't very
+/// important until late in the optimization process. In fact, passes
+/// like InstructionCombining will combine GEPs, even if it means
+/// pushing loop-invariant computation down into loops, so even if the
+/// GEPs were split here, the work would quickly be undone. The
+/// LoopStrengthReduction pass, which is usually run quite late (and
+/// after the last InstructionCombining pass), takes care of hoisting
+/// loop-invariant portions of expressions, after considering what
+/// can be folded using target addressing modes.
+///
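+/// For example (illustrative): expanding (%p + 8 + 4 * %i), where %p has
+/// type i32* (4-byte elements), factors the 8 into 2 elements and the
+/// 4 * %i into %i elements, producing
+///   getelementptr i32* %p, i32 (2 + %i)
+/// rather than a ptrtoint/add/inttoptr sequence.
+///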
+Value *SCEVExpander::expandAddToGEP(const SCEVHandle *op_begin,
+ const SCEVHandle *op_end,
+ const PointerType *PTy,
+ const Type *Ty,
+ Value *V) {
+ const Type *ElTy = PTy->getElementType();
+ SmallVector<Value *, 4> GepIndices;
+ std::vector<SCEVHandle> Ops(op_begin, op_end);
+ bool AnyNonZeroIndices = false;
+
+  // Descend through the pointer's type and attempt to convert the other
+ // operands into GEP indices, at each level. The first index in a GEP
+ // indexes into the array implied by the pointer operand; the rest of
+ // the indices index into the element or field type selected by the
+ // preceding index.
+ for (;;) {
+ APInt ElSize = APInt(SE.getTypeSizeInBits(Ty),
+ ElTy->isSized() ? SE.TD->getTypeAllocSize(ElTy) : 0);
+ std::vector<SCEVHandle> NewOps;
+ std::vector<SCEVHandle> ScaledOps;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ // Split AddRecs up into parts as either of the parts may be usable
+ // without the other.
+ if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i]))
+ if (!A->getStart()->isZero()) {
+ SCEVHandle Start = A->getStart();
+ Ops.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0, A->getType()),
+ A->getStepRecurrence(SE),
+ A->getLoop()));
+ Ops[i] = Start;
+ ++e;
+ }
+ // If the scale size is not 0, attempt to factor out a scale.
+ if (ElSize != 0) {
+ SCEVHandle Op = Ops[i];
+ SCEVHandle Remainder = SE.getIntegerSCEV(0, Op->getType());
+ if (FactorOutConstant(Op, Remainder, ElSize, SE)) {
+ ScaledOps.push_back(Op); // Op now has ElSize factored out.
+ NewOps.push_back(Remainder);
+ continue;
+ }
+ }
+ // If the operand was not divisible, add it to the list of operands
+ // we'll scan next iteration.
+ NewOps.push_back(Ops[i]);
+ }
+ Ops = NewOps;
+ AnyNonZeroIndices |= !ScaledOps.empty();
+ Value *Scaled = ScaledOps.empty() ?
+ Constant::getNullValue(Ty) :
+ expandCodeFor(SE.getAddExpr(ScaledOps), Ty);
+ GepIndices.push_back(Scaled);
+
+ // Collect struct field index operands.
+ if (!Ops.empty())
+ while (const StructType *STy = dyn_cast<StructType>(ElTy)) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[0]))
+ if (SE.getTypeSizeInBits(C->getType()) <= 64) {
+ const StructLayout &SL = *SE.TD->getStructLayout(STy);
+ uint64_t FullOffset = C->getValue()->getZExtValue();
+ if (FullOffset < SL.getSizeInBytes()) {
+ unsigned ElIdx = SL.getElementContainingOffset(FullOffset);
+ GepIndices.push_back(ConstantInt::get(Type::Int32Ty, ElIdx));
+ ElTy = STy->getTypeAtIndex(ElIdx);
+ Ops[0] =
+ SE.getConstant(ConstantInt::get(Ty,
+ FullOffset -
+ SL.getElementOffset(ElIdx)));
+ AnyNonZeroIndices = true;
+ continue;
+ }
+ }
+ break;
+ }
+
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(ElTy)) {
+ ElTy = ATy->getElementType();
+ continue;
+ }
+ break;
+ }
+
+  // If none of the operands were convertible to proper GEP indices, cast
+ // the base to i8* and do an ugly getelementptr with that. It's still
+ // better than ptrtoint+arithmetic+inttoptr at least.
+ if (!AnyNonZeroIndices) {
+ V = InsertNoopCastOfTo(V,
+ Type::Int8Ty->getPointerTo(PTy->getAddressSpace()));
+ Value *Idx = expand(SE.getAddExpr(Ops));
+ Idx = InsertNoopCastOfTo(Idx, Ty);
+
+ // Fold a GEP with constant operands.
+ if (Constant *CLHS = dyn_cast<Constant>(V))
+ if (Constant *CRHS = dyn_cast<Constant>(Idx))
+ return ConstantExpr::getGetElementPtr(CLHS, &CRHS, 1);
+
+ // Do a quick scan to see if we have this GEP nearby. If so, reuse it.
+ unsigned ScanLimit = 6;
+ BasicBlock::iterator BlockBegin = InsertPt->getParent()->begin();
+ if (InsertPt != BlockBegin) {
+ // Scanning starts from the last instruction before InsertPt.
+ BasicBlock::iterator IP = InsertPt;
+ --IP;
+ for (; ScanLimit; --IP, --ScanLimit) {
+ if (IP->getOpcode() == Instruction::GetElementPtr &&
+ IP->getOperand(0) == V && IP->getOperand(1) == Idx)
+ return IP;
+ if (IP == BlockBegin) break;
+ }
+ }
+
+ Value *GEP = GetElementPtrInst::Create(V, Idx, "scevgep", InsertPt);
+ InsertedValues.insert(GEP);
+ return GEP;
+ }
+
+ // Insert a pretty getelementptr.
+ Value *GEP = GetElementPtrInst::Create(V,
+ GepIndices.begin(),
+ GepIndices.end(),
+ "scevgep", InsertPt);
+ Ops.push_back(SE.getUnknown(GEP));
+ InsertedValues.insert(GEP);
+ return expand(SE.getAddExpr(Ops));
+}
+
+Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Value *V = expand(S->getOperand(S->getNumOperands()-1));
+
+ // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
+ // comments on expandAddToGEP for details.
+ if (SE.TD)
+ if (const PointerType *PTy = dyn_cast<PointerType>(V->getType())) {
+ const std::vector<SCEVHandle> &Ops = S->getOperands();
+ return expandAddToGEP(&Ops[0], &Ops[Ops.size() - 1],
+ PTy, Ty, V);
+ }
+
+ V = InsertNoopCastOfTo(V, Ty);
+
+ // Emit a bunch of add instructions
+ for (int i = S->getNumOperands()-2; i >= 0; --i) {
+ Value *W = expand(S->getOperand(i));
+ W = InsertNoopCastOfTo(W, Ty);
+ V = InsertBinop(Instruction::Add, V, W, InsertPt);
+ }
+ return V;
+}
+
+Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ int FirstOp = 0; // Set if we should emit a subtract.
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(0)))
+ if (SC->getValue()->isAllOnesValue())
+ FirstOp = 1;
+
+ int i = S->getNumOperands()-2;
+ Value *V = expand(S->getOperand(i+1));
+ V = InsertNoopCastOfTo(V, Ty);
+
+ // Emit a bunch of multiply instructions
+ for (; i >= FirstOp; --i) {
+ Value *W = expand(S->getOperand(i));
+ W = InsertNoopCastOfTo(W, Ty);
+ V = InsertBinop(Instruction::Mul, V, W, InsertPt);
+ }
+
+ // -1 * ... ---> 0 - ...
+ if (FirstOp == 1)
+ V = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), V, InsertPt);
+ return V;
+}
+
+Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
+ Value *LHS = expand(S->getLHS());
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) {
+ const APInt &RHS = SC->getValue()->getValue();
+ if (RHS.isPowerOf2())
+ return InsertBinop(Instruction::LShr, LHS,
+ ConstantInt::get(Ty, RHS.logBase2()),
+ InsertPt);
+ }
+
+ Value *RHS = expand(S->getRHS());
+ RHS = InsertNoopCastOfTo(RHS, Ty);
+ return InsertBinop(Instruction::UDiv, LHS, RHS, InsertPt);
+}
+
+/// Move parts of Base into Rest to leave Base with the minimal
+/// expression that provides a pointer operand suitable for a
+/// GEP expansion.
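+///
+/// For example (illustrative): given Base = {(16 + %p),+,4}<L> and Rest = 0,
+/// this leaves Base = %p and Rest = {16,+,4}<L>.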
+static void ExposePointerBase(SCEVHandle &Base, SCEVHandle &Rest,
+ ScalarEvolution &SE) {
+ while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
+ Base = A->getStart();
+ Rest = SE.getAddExpr(Rest,
+ SE.getAddRecExpr(SE.getIntegerSCEV(0, A->getType()),
+ A->getStepRecurrence(SE),
+ A->getLoop()));
+ }
+ if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
+ Base = A->getOperand(A->getNumOperands()-1);
+ std::vector<SCEVHandle> NewAddOps(A->op_begin(), A->op_end());
+ NewAddOps.back() = Rest;
+ Rest = SE.getAddExpr(NewAddOps);
+ ExposePointerBase(Base, Rest, SE);
+ }
+}
+
+Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ const Loop *L = S->getLoop();
+
+ // {X,+,F} --> X + {0,+,F}
+ if (!S->getStart()->isZero()) {
+ std::vector<SCEVHandle> NewOps(S->getOperands());
+ NewOps[0] = SE.getIntegerSCEV(0, Ty);
+ SCEVHandle Rest = SE.getAddRecExpr(NewOps, L);
+
+ // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
+ // comments on expandAddToGEP for details.
+ if (SE.TD) {
+ SCEVHandle Base = S->getStart();
+ SCEVHandle RestArray[1] = { Rest };
+ // Dig into the expression to find the pointer base for a GEP.
+ ExposePointerBase(Base, RestArray[0], SE);
+ // If we found a pointer, expand the AddRec with a GEP.
+ if (const PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
+ // Make sure the Base isn't something exotic, such as a multiplied
+ // or divided pointer value. In those cases, the result type isn't
+ // actually a pointer type.
+ if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) {
+ Value *StartV = expand(Base);
+ assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
+ return expandAddToGEP(RestArray, RestArray+1, PTy, Ty, StartV);
+ }
+ }
+ }
+
+ Value *RestV = expand(Rest);
+ return expand(SE.getAddExpr(S->getStart(), SE.getUnknown(RestV)));
+ }
+
+ // {0,+,1} --> Insert a canonical induction variable into the loop!
+ if (S->isAffine() &&
+ S->getOperand(1) == SE.getIntegerSCEV(1, Ty)) {
+ // Create and insert the PHI node for the induction variable in the
+ // specified loop.
+ BasicBlock *Header = L->getHeader();
+ PHINode *PN = PHINode::Create(Ty, "indvar", Header->begin());
+ InsertedValues.insert(PN);
+ PN->addIncoming(Constant::getNullValue(Ty), L->getLoopPreheader());
+
+ pred_iterator HPI = pred_begin(Header);
+ assert(HPI != pred_end(Header) && "Loop with zero preds???");
+ if (!L->contains(*HPI)) ++HPI;
+ assert(HPI != pred_end(Header) && L->contains(*HPI) &&
+ "No backedge in loop?");
+
+ // Insert a unit add instruction right before the terminator corresponding
+ // to the back-edge.
+ Constant *One = ConstantInt::get(Ty, 1);
+ Instruction *Add = BinaryOperator::CreateAdd(PN, One, "indvar.next",
+ (*HPI)->getTerminator());
+ InsertedValues.insert(Add);
+
+ pred_iterator PI = pred_begin(Header);
+ if (*PI == L->getLoopPreheader())
+ ++PI;
+ PN->addIncoming(Add, *PI);
+ return PN;
+ }
+
+ // Get the canonical induction variable I for this loop.
+ Value *I = getOrInsertCanonicalInductionVariable(L, Ty);
+
+ // If this is a simple linear addrec, emit it now as a special case.
+ if (S->isAffine()) { // {0,+,F} --> i*F
+ Value *F = expand(S->getOperand(1));
+ F = InsertNoopCastOfTo(F, Ty);
+
+    // If the step is by one, just return the inserted IV.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(F))
+ if (CI->getValue() == 1)
+ return I;
+
+    // If the insert point is directly inside the loop, emit the multiply at
+    // the insert point. Otherwise, L is a loop that is a parent of the insert
+    // point loop; if we can, move the multiply to the outermost loop in which
+    // it is safe to reside.
+ BasicBlock::iterator MulInsertPt = getInsertionPoint();
+ Loop *InsertPtLoop = SE.LI->getLoopFor(MulInsertPt->getParent());
+ if (InsertPtLoop != L && InsertPtLoop &&
+ L->contains(InsertPtLoop->getHeader())) {
+ do {
+ // If we cannot hoist the multiply out of this loop, don't.
+ if (!InsertPtLoop->isLoopInvariant(F)) break;
+
+ BasicBlock *InsertPtLoopPH = InsertPtLoop->getLoopPreheader();
+
+ // If this loop hasn't got a preheader, we aren't able to hoist the
+ // multiply.
+ if (!InsertPtLoopPH)
+ break;
+
+ // Otherwise, move the insert point to the preheader.
+ MulInsertPt = InsertPtLoopPH->getTerminator();
+ InsertPtLoop = InsertPtLoop->getParentLoop();
+ } while (InsertPtLoop != L);
+ }
+
+ return InsertBinop(Instruction::Mul, I, F, MulInsertPt);
+ }
+
+ // If this is a chain of recurrences, turn it into a closed form, using the
+ // folders, then expandCodeFor the closed form. This allows the folders to
+ // simplify the expression without having to build a bunch of special code
+ // into this folder.
+ SCEVHandle IH = SE.getUnknown(I); // Get I as a "symbolic" SCEV.
+
+ SCEVHandle V = S->evaluateAtIteration(IH, SE);
+ //cerr << "Evaluated: " << *this << "\n to: " << *V << "\n";
+
+ return expand(V);
+}
+
+Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Value *V = expand(S->getOperand());
+ V = InsertNoopCastOfTo(V, SE.getEffectiveSCEVType(V->getType()));
+ Instruction *I = new TruncInst(V, Ty, "tmp.", InsertPt);
+ InsertedValues.insert(I);
+ return I;
+}
+
+Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Value *V = expand(S->getOperand());
+ V = InsertNoopCastOfTo(V, SE.getEffectiveSCEVType(V->getType()));
+ Instruction *I = new ZExtInst(V, Ty, "tmp.", InsertPt);
+ InsertedValues.insert(I);
+ return I;
+}
+
+Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Value *V = expand(S->getOperand());
+ V = InsertNoopCastOfTo(V, SE.getEffectiveSCEVType(V->getType()));
+ Instruction *I = new SExtInst(V, Ty, "tmp.", InsertPt);
+ InsertedValues.insert(I);
+ return I;
+}
+
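+// An N-operand smax is lowered as a chain of icmp/select pairs; a sketch for
+// two operands:
+//   %tmp = icmp sgt i32 %a, %b
+//   %smax = select i1 %tmp, i32 %a, i32 %b
+// Each further operand folds into the running maximum the same way, and
+// visitUMaxExpr below is identical with an unsigned comparison.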
+Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Value *LHS = expand(S->getOperand(0));
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ for (unsigned i = 1; i < S->getNumOperands(); ++i) {
+ Value *RHS = expand(S->getOperand(i));
+ RHS = InsertNoopCastOfTo(RHS, Ty);
+ Instruction *ICmp =
+ new ICmpInst(ICmpInst::ICMP_SGT, LHS, RHS, "tmp", InsertPt);
+ InsertedValues.insert(ICmp);
+ Instruction *Sel = SelectInst::Create(ICmp, LHS, RHS, "smax", InsertPt);
+ InsertedValues.insert(Sel);
+ LHS = Sel;
+ }
+ return LHS;
+}
+
+Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+ const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Value *LHS = expand(S->getOperand(0));
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ for (unsigned i = 1; i < S->getNumOperands(); ++i) {
+ Value *RHS = expand(S->getOperand(i));
+ RHS = InsertNoopCastOfTo(RHS, Ty);
+ Instruction *ICmp =
+ new ICmpInst(ICmpInst::ICMP_UGT, LHS, RHS, "tmp", InsertPt);
+ InsertedValues.insert(ICmp);
+ Instruction *Sel = SelectInst::Create(ICmp, LHS, RHS, "umax", InsertPt);
+ InsertedValues.insert(Sel);
+ LHS = Sel;
+ }
+ return LHS;
+}
+
+Value *SCEVExpander::expandCodeFor(SCEVHandle SH, const Type *Ty) {
+ // Expand the code for this SCEV.
+ Value *V = expand(SH);
+ if (Ty) {
+ assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) &&
+ "non-trivial casts should be done with the SCEVs directly!");
+ V = InsertNoopCastOfTo(V, Ty);
+ }
+ return V;
+}
+
+Value *SCEVExpander::expand(const SCEV *S) {
+ // Check to see if we already expanded this.
+ std::map<SCEVHandle, AssertingVH<Value> >::iterator I =
+ InsertedExpressions.find(S);
+ if (I != InsertedExpressions.end())
+ return I->second;
+
+ Value *V = visit(S);
+ InsertedExpressions[S] = V;
+ return V;
+}
diff --git a/lib/Analysis/SparsePropagation.cpp b/lib/Analysis/SparsePropagation.cpp
new file mode 100644
index 0000000..5433068
--- /dev/null
+++ b/lib/Analysis/SparsePropagation.cpp
@@ -0,0 +1,331 @@
+//===- SparsePropagation.cpp - Sparse Conditional Property Propagation ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an abstract sparse conditional propagation algorithm,
+// modeled after SCCP, but with a customizable lattice function.
+//
+//===----------------------------------------------------------------------===//
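+//
+// A client subclasses AbstractLatticeFunction, supplying the transfer
+// functions used below (ComputeConstant, ComputeArgument, MergeValues,
+// ComputeInstructionState), and then drives a solver over a function.
+// A rough sketch, assuming a constructor shape as declared in
+// SparsePropagation.h:
+//
+//   MyLatticeFunction LF;      // hypothetical subclass
+//   SparseSolver Solver(&LF);  // assumed constructor signature
+//   Solver.Solve(F);
+//   Solver.Print(F, std::cout);
+//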
+
+#define DEBUG_TYPE "sparseprop"
+#include "llvm/Analysis/SparsePropagation.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// AbstractLatticeFunction Implementation
+//===----------------------------------------------------------------------===//
+
+AbstractLatticeFunction::~AbstractLatticeFunction() {}
+
+/// PrintValue - Render the specified lattice value to the specified stream.
+void AbstractLatticeFunction::PrintValue(LatticeVal V, std::ostream &OS) {
+ if (V == UndefVal)
+ OS << "undefined";
+ else if (V == OverdefinedVal)
+ OS << "overdefined";
+ else if (V == UntrackedVal)
+ OS << "untracked";
+ else
+ OS << "unknown lattice value";
+}
+
+//===----------------------------------------------------------------------===//
+// SparseSolver Implementation
+//===----------------------------------------------------------------------===//
+
+/// getOrInitValueState - Return the LatticeVal object that corresponds to the
+/// value, initializing the value's state if it hasn't been entered into the
+/// map yet. This function is necessary because not all values should start
+/// out in the underdefined state... Arguments should be overdefined, and
+/// constants should be marked as constants.
+///
+SparseSolver::LatticeVal SparseSolver::getOrInitValueState(Value *V) {
+ DenseMap<Value*, LatticeVal>::iterator I = ValueState.find(V);
+ if (I != ValueState.end()) return I->second; // Common case, in the map
+
+ LatticeVal LV;
+ if (LatticeFunc->IsUntrackedValue(V))
+ return LatticeFunc->getUntrackedVal();
+ else if (Constant *C = dyn_cast<Constant>(V))
+ LV = LatticeFunc->ComputeConstant(C);
+ else if (Argument *A = dyn_cast<Argument>(V))
+ LV = LatticeFunc->ComputeArgument(A);
+ else if (!isa<Instruction>(V))
+ // All other non-instructions are overdefined.
+ LV = LatticeFunc->getOverdefinedVal();
+ else
+ // All instructions are underdefined by default.
+ LV = LatticeFunc->getUndefVal();
+
+ // If this value is untracked, don't add it to the map.
+ if (LV == LatticeFunc->getUntrackedVal())
+ return LV;
+ return ValueState[V] = LV;
+}
+
+/// UpdateState - When the state for some instruction is potentially updated,
+/// this function notices and adds Inst to the worklist if needed.
+void SparseSolver::UpdateState(Instruction &Inst, LatticeVal V) {
+ DenseMap<Value*, LatticeVal>::iterator I = ValueState.find(&Inst);
+ if (I != ValueState.end() && I->second == V)
+ return; // No change.
+
+ // An update. Visit uses of I.
+ ValueState[&Inst] = V;
+ InstWorkList.push_back(&Inst);
+}
+
+/// MarkBlockExecutable - This method can be used by clients to mark all of
+/// the blocks that are known to be intrinsically live in the processed unit.
+void SparseSolver::MarkBlockExecutable(BasicBlock *BB) {
+ DOUT << "Marking Block Executable: " << BB->getNameStart() << "\n";
+ BBExecutable.insert(BB); // Basic block is executable!
+ BBWorkList.push_back(BB); // Add the block to the work list!
+}
+
+/// markEdgeExecutable - Mark the edge from Source to Dest as feasible, adding
+/// Dest to the BB work list if it is not already executable.
+void SparseSolver::markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+ return; // This edge is already known to be executable!
+
+ DOUT << "Marking Edge Executable: " << Source->getNameStart()
+ << " -> " << Dest->getNameStart() << "\n";
+
+ if (BBExecutable.count(Dest)) {
+ // The destination is already executable, but we just made an edge
+ // feasible that wasn't before. Revisit the PHI nodes in the block
+ // because they have potentially new operands.
+ for (BasicBlock::iterator I = Dest->begin(); isa<PHINode>(I); ++I)
+ visitPHINode(*cast<PHINode>(I));
+
+ } else {
+ MarkBlockExecutable(Dest);
+ }
+}
+
+
+/// getFeasibleSuccessors - Return a vector of booleans to indicate which
+/// successors are reachable from a given terminator instruction.
+void SparseSolver::getFeasibleSuccessors(TerminatorInst &TI,
+ SmallVectorImpl<bool> &Succs,
+ bool AggressiveUndef) {
+ Succs.resize(TI.getNumSuccessors());
+ if (TI.getNumSuccessors() == 0) return;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
+ if (BI->isUnconditional()) {
+ Succs[0] = true;
+ return;
+ }
+
+ LatticeVal BCValue;
+ if (AggressiveUndef)
+ BCValue = getOrInitValueState(BI->getCondition());
+ else
+ BCValue = getLatticeState(BI->getCondition());
+
+ if (BCValue == LatticeFunc->getOverdefinedVal() ||
+ BCValue == LatticeFunc->getUntrackedVal()) {
+ // Overdefined condition variables can branch either way.
+ Succs[0] = Succs[1] = true;
+ return;
+ }
+
+ // If undefined, neither is feasible yet.
+ if (BCValue == LatticeFunc->getUndefVal())
+ return;
+
+ Constant *C = LatticeFunc->GetConstant(BCValue, BI->getCondition(), *this);
+ if (C == 0 || !isa<ConstantInt>(C)) {
+ // Non-constant values can go either way.
+ Succs[0] = Succs[1] = true;
+ return;
+ }
+
+    // Constant condition variables mean the branch can only go a single way.
+ Succs[C == ConstantInt::getFalse()] = true;
+ return;
+ }
+
+ if (isa<InvokeInst>(TI)) {
+    // Invoke instructions' successors are always executable.
+ // TODO: Could ask the lattice function if the value can throw.
+ Succs[0] = Succs[1] = true;
+ return;
+ }
+
+ SwitchInst &SI = cast<SwitchInst>(TI);
+ LatticeVal SCValue;
+ if (AggressiveUndef)
+ SCValue = getOrInitValueState(SI.getCondition());
+ else
+ SCValue = getLatticeState(SI.getCondition());
+
+ if (SCValue == LatticeFunc->getOverdefinedVal() ||
+ SCValue == LatticeFunc->getUntrackedVal()) {
+ // All destinations are executable!
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ // If undefined, neither is feasible yet.
+ if (SCValue == LatticeFunc->getUndefVal())
+ return;
+
+ Constant *C = LatticeFunc->GetConstant(SCValue, SI.getCondition(), *this);
+ if (C == 0 || !isa<ConstantInt>(C)) {
+ // All destinations are executable!
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ Succs[SI.findCaseValue(cast<ConstantInt>(C))] = true;
+}
+
+
+/// isEdgeFeasible - Return true if the control flow edge from the 'From'
+/// basic block to the 'To' basic block is currently feasible...
+bool SparseSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To,
+ bool AggressiveUndef) {
+ SmallVector<bool, 16> SuccFeasible;
+ TerminatorInst *TI = From->getTerminator();
+ getFeasibleSuccessors(*TI, SuccFeasible, AggressiveUndef);
+
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (TI->getSuccessor(i) == To && SuccFeasible[i])
+ return true;
+
+ return false;
+}
+
+void SparseSolver::visitTerminatorInst(TerminatorInst &TI) {
+ SmallVector<bool, 16> SuccFeasible;
+ getFeasibleSuccessors(TI, SuccFeasible, true);
+
+ BasicBlock *BB = TI.getParent();
+
+ // Mark all feasible successors executable...
+ for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+ if (SuccFeasible[i])
+ markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SparseSolver::visitPHINode(PHINode &PN) {
+ LatticeVal PNIV = getOrInitValueState(&PN);
+ LatticeVal Overdefined = LatticeFunc->getOverdefinedVal();
+
+ // If this value is already overdefined (common) just return.
+ if (PNIV == Overdefined || PNIV == LatticeFunc->getUntrackedVal())
+ return; // Quick exit
+
+ // Super-extra-high-degree PHI nodes are unlikely to ever be interesting,
+ // and slow us down a lot. Just mark them overdefined.
+ if (PN.getNumIncomingValues() > 64) {
+ UpdateState(PN, Overdefined);
+ return;
+ }
+
+ // Look at all of the executable operands of the PHI node. If any of them
+ // are overdefined, the PHI becomes overdefined as well. Otherwise, ask the
+ // transfer function to give us the merge of the incoming values.
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ // If the edge is not yet known to be feasible, it doesn't impact the PHI.
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent(), true))
+ continue;
+
+ // Merge in this value.
+ LatticeVal OpVal = getOrInitValueState(PN.getIncomingValue(i));
+ if (OpVal != PNIV)
+ PNIV = LatticeFunc->MergeValues(PNIV, OpVal);
+
+ if (PNIV == Overdefined)
+ break; // Rest of input values don't matter.
+ }
+
+  // Update the PHI with the computed value, which is the merge of the inputs.
+ UpdateState(PN, PNIV);
+}
+
+
+void SparseSolver::visitInst(Instruction &I) {
+ // PHIs are handled by the propagation logic, they are never passed into the
+ // transfer functions.
+ if (PHINode *PN = dyn_cast<PHINode>(&I))
+ return visitPHINode(*PN);
+
+ // Otherwise, ask the transfer function what the result is. If this is
+ // something that we care about, remember it.
+ LatticeVal IV = LatticeFunc->ComputeInstructionState(I, *this);
+ if (IV != LatticeFunc->getUntrackedVal())
+ UpdateState(I, IV);
+
+ if (TerminatorInst *TI = dyn_cast<TerminatorInst>(&I))
+ visitTerminatorInst(*TI);
+}
+
+void SparseSolver::Solve(Function &F) {
+ MarkBlockExecutable(&F.getEntryBlock());
+
+ // Process the work lists until they are empty!
+ while (!BBWorkList.empty() || !InstWorkList.empty()) {
+ // Process the instruction work list.
+ while (!InstWorkList.empty()) {
+ Instruction *I = InstWorkList.back();
+ InstWorkList.pop_back();
+
+ DOUT << "\nPopped off I-WL: " << *I;
+
+ // "I" got into the work list because it made a transition. See if any
+ // users are both live and in need of updating.
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI) {
+ Instruction *U = cast<Instruction>(*UI);
+ if (BBExecutable.count(U->getParent())) // Inst is executable?
+ visitInst(*U);
+ }
+ }
+
+ // Process the basic block work list.
+ while (!BBWorkList.empty()) {
+ BasicBlock *BB = BBWorkList.back();
+ BBWorkList.pop_back();
+
+ DOUT << "\nPopped off BBWL: " << *BB;
+
+ // Notify all instructions in this basic block that they are newly
+ // executable.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ visitInst(*I);
+ }
+ }
+}
+
+void SparseSolver::Print(Function &F, std::ostream &OS) const {
+ OS << "\nFUNCTION: " << F.getNameStr() << "\n";
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (!BBExecutable.count(BB))
+ OS << "INFEASIBLE: ";
+ OS << "\t";
+ if (BB->hasName())
+ OS << BB->getNameStr() << ":\n";
+ else
+ OS << "; anon bb\n";
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ LatticeFunc->PrintValue(getLatticeState(I), OS);
+ OS << *I;
+ }
+
+ OS << "\n";
+ }
+}
+
diff --git a/lib/Analysis/Trace.cpp b/lib/Analysis/Trace.cpp
new file mode 100644
index 0000000..8f19fda
--- /dev/null
+++ b/lib/Analysis/Trace.cpp
@@ -0,0 +1,50 @@
+//===- Trace.cpp - Implementation of Trace class --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class represents a single trace of LLVM basic blocks. A trace is a
+// single-entry, multiple-exit region of code that is often hot. Trace-based
+// optimizations treat traces almost like one large, strange basic block:
+// because the trace path is assumed to be hot, optimizations for the
+// fall-through path are made at the expense of the non-fall-through paths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Trace.h"
+#include "llvm/Function.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+
+Function *Trace::getFunction() const {
+ return getEntryBasicBlock()->getParent();
+}
+
+Module *Trace::getModule() const {
+ return getFunction()->getParent();
+}
+
+/// print - Write trace to output stream.
+///
+void Trace::print(std::ostream &O) const {
+  Function *F = getFunction();
+ O << "; Trace from function " << F->getName() << ", blocks:\n";
+ for (const_iterator i = begin(), e = end(); i != e; ++i) {
+ O << "; ";
+ WriteAsOperand(O, *i, true, getModule());
+ O << "\n";
+ }
+ O << "; Trace parent function: \n" << *F;
+}
+
+/// dump - Debugger convenience method; writes trace to standard error
+/// output stream.
+///
+void Trace::dump() const {
+ print(cerr);
+}
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
new file mode 100644
index 0000000..29ff8aa
--- /dev/null
+++ b/lib/Analysis/ValueTracking.cpp
@@ -0,0 +1,1079 @@
+//===- ValueTracking.cpp - Walk computations to compute properties --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains routines that help analyze properties that chains of
+// computations have.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include <cstring>
+using namespace llvm;
+
+/// getOpcode - If this is an Instruction or a ConstantExpr, return the
+/// opcode value. Otherwise return UserOp1.
+static unsigned getOpcode(const Value *V) {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return I->getOpcode();
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ return CE->getOpcode();
+ // Use UserOp1 to mean there's no opcode.
+ return Instruction::UserOp1;
+}
+
+
+/// ComputeMaskedBits - Determine which of the bits specified in Mask are
+/// known to be either zero or one and return them in the KnownZero/KnownOne
+/// bit sets. This code only analyzes bits in Mask, in order to short-circuit
+/// processing.
+/// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that
+/// we cannot optimize based on the assumption that it is zero without changing
+/// it to be an explicit zero. If we don't change it to zero, other code could
+/// be optimized based on the contradictory assumption that it is non-zero.
+/// Because instcombine aggressively folds operations with undef args anyway,
+/// this won't lose us code quality.
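+///
+/// For example (a sketch): for V = (X << 4) with an 8-bit Mask of 0xFF, the
+/// low four bits of KnownZero are set no matter what X is, because a shift
+/// left by four always produces trailing zeros.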
+void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
+ APInt &KnownZero, APInt &KnownOne,
+ TargetData *TD, unsigned Depth) {
+ const unsigned MaxDepth = 6;
+ assert(V && "No Value?");
+ assert(Depth <= MaxDepth && "Limit Search Depth");
+ unsigned BitWidth = Mask.getBitWidth();
+ assert((V->getType()->isInteger() || isa<PointerType>(V->getType())) &&
+ "Not integer or pointer type!");
+ assert((!TD || TD->getTypeSizeInBits(V->getType()) == BitWidth) &&
+ (!isa<IntegerType>(V->getType()) ||
+ V->getType()->getPrimitiveSizeInBits() == BitWidth) &&
+ KnownZero.getBitWidth() == BitWidth &&
+ KnownOne.getBitWidth() == BitWidth &&
+ "V, Mask, KnownOne and KnownZero should have same BitWidth");
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ // We know all of the bits for a constant!
+ KnownOne = CI->getValue() & Mask;
+ KnownZero = ~KnownOne & Mask;
+ return;
+ }
+ // Null is all-zeros.
+ if (isa<ConstantPointerNull>(V)) {
+ KnownOne.clear();
+ KnownZero = Mask;
+ return;
+ }
+ // The address of an aligned GlobalValue has trailing zeros.
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ unsigned Align = GV->getAlignment();
+ if (Align == 0 && TD && GV->getType()->getElementType()->isSized())
+ Align = TD->getPrefTypeAlignment(GV->getType()->getElementType());
+ if (Align > 0)
+ KnownZero = Mask & APInt::getLowBitsSet(BitWidth,
+ CountTrailingZeros_32(Align));
+ else
+ KnownZero.clear();
+ KnownOne.clear();
+ return;
+ }
+
+ KnownZero.clear(); KnownOne.clear(); // Start out not knowing anything.
+
+ if (Depth == MaxDepth || Mask == 0)
+ return; // Limit search depth.
+
+ User *I = dyn_cast<User>(V);
+ if (!I) return;
+
+ APInt KnownZero2(KnownZero), KnownOne2(KnownOne);
+ switch (getOpcode(I)) {
+ default: break;
+ case Instruction::And: {
+    // If either the LHS or the RHS is zero, the result is zero.
+ ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, TD, Depth+1);
+ APInt Mask2(Mask & ~KnownZero);
+ ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, TD,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ KnownOne &= KnownOne2;
+ // Output known-0 are known to be clear if zero in either the LHS | RHS.
+ KnownZero |= KnownZero2;
+ return;
+ }
+ case Instruction::Or: {
+ ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, TD, Depth+1);
+ APInt Mask2(Mask & ~KnownOne);
+ ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, TD,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ KnownZero &= KnownZero2;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ KnownOne |= KnownOne2;
+ return;
+ }
+ case Instruction::Xor: {
+ ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, TD, Depth+1);
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, TD,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+ KnownZero = KnownZeroOut;
+ return;
+ }
+ case Instruction::Mul: {
+ APInt Mask2 = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(I->getOperand(1), Mask2, KnownZero, KnownOne, TD,Depth+1);
+ ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, TD,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If low bits are zero in either operand, output low known-0 bits.
+    // Also compute a conservative estimate for high known-0 bits.
+ // More trickiness is possible, but this is sufficient for the
+ // interesting case of alignment computation.
+ KnownOne.clear();
+ unsigned TrailZ = KnownZero.countTrailingOnes() +
+ KnownZero2.countTrailingOnes();
+ unsigned LeadZ = std::max(KnownZero.countLeadingOnes() +
+ KnownZero2.countLeadingOnes(),
+ BitWidth) - BitWidth;
+
+ TrailZ = std::min(TrailZ, BitWidth);
+ LeadZ = std::min(LeadZ, BitWidth);
+ KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
+ APInt::getHighBitsSet(BitWidth, LeadZ);
+ KnownZero &= Mask;
+ return;
+ }
+ case Instruction::UDiv: {
+ // For the purposes of computing leading zeros we can conservatively
+ // treat a udiv as a logical right shift by the power of 2 known to
+ // be less than the denominator.
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(I->getOperand(0),
+ AllOnes, KnownZero2, KnownOne2, TD, Depth+1);
+ unsigned LeadZ = KnownZero2.countLeadingOnes();
+
+ KnownOne2.clear();
+ KnownZero2.clear();
+ ComputeMaskedBits(I->getOperand(1),
+ AllOnes, KnownZero2, KnownOne2, TD, Depth+1);
+ unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
+ if (RHSUnknownLeadingOnes != BitWidth)
+ LeadZ = std::min(BitWidth,
+ LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+
+ KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ) & Mask;
+ return;
+ }
+ case Instruction::Select:
+ ComputeMaskedBits(I->getOperand(2), Mask, KnownZero, KnownOne, TD, Depth+1);
+ ComputeMaskedBits(I->getOperand(1), Mask, KnownZero2, KnownOne2, TD,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ return;
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ return; // Can't work with floating point.
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ // We can't handle these if we don't know the pointer size.
+ if (!TD) return;
+ // FALL THROUGH and handle them the same as zext/trunc.
+ case Instruction::ZExt:
+ case Instruction::Trunc: {
+ // Note that we handle pointer operands here because of inttoptr/ptrtoint
+ // which fall through here.
+ const Type *SrcTy = I->getOperand(0)->getType();
+ unsigned SrcBitWidth = TD ?
+ TD->getTypeSizeInBits(SrcTy) :
+ SrcTy->getPrimitiveSizeInBits();
+ APInt MaskIn(Mask);
+ MaskIn.zextOrTrunc(SrcBitWidth);
+ KnownZero.zextOrTrunc(SrcBitWidth);
+ KnownOne.zextOrTrunc(SrcBitWidth);
+ ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, TD,
+ Depth+1);
+ KnownZero.zextOrTrunc(BitWidth);
+ KnownOne.zextOrTrunc(BitWidth);
+ // Any top bits are known to be zero.
+ if (BitWidth > SrcBitWidth)
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ return;
+ }
+ case Instruction::BitCast: {
+ const Type *SrcTy = I->getOperand(0)->getType();
+ if (SrcTy->isInteger() || isa<PointerType>(SrcTy)) {
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, TD,
+ Depth+1);
+ return;
+ }
+ break;
+ }
+ case Instruction::SExt: {
+ // Compute the bits in the result that are not present in the input.
+ const IntegerType *SrcTy = cast<IntegerType>(I->getOperand(0)->getType());
+ unsigned SrcBitWidth = SrcTy->getBitWidth();
+
+ APInt MaskIn(Mask);
+ MaskIn.trunc(SrcBitWidth);
+ KnownZero.trunc(SrcBitWidth);
+ KnownOne.trunc(SrcBitWidth);
+ ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, TD,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero.zext(BitWidth);
+ KnownOne.zext(BitWidth);
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+ if (KnownZero[SrcBitWidth-1]) // Input sign bit known zero
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ else if (KnownOne[SrcBitWidth-1]) // Input sign bit known set
+ KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ return;
+ }
+ case Instruction::Shl:
+ // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+ APInt Mask2(Mask.lshr(ShiftAmt));
+ ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne, TD,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero <<= ShiftAmt;
+ KnownOne <<= ShiftAmt;
+ KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); // low bits known 0
+ return;
+ }
+ break;
+ case Instruction::LShr:
+ // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ // Compute the new bits that are at the top now.
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+
+ // Unsigned shift right.
+ APInt Mask2(Mask.shl(ShiftAmt));
+ ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero,KnownOne, TD,
+ Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
+ KnownOne = APIntOps::lshr(KnownOne, ShiftAmt);
+ // high bits known zero.
+ KnownZero |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+ return;
+ }
+ break;
+ case Instruction::AShr:
+ // (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ // Compute the new bits that are at the top now.
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+
+ // Signed shift right.
+ APInt Mask2(Mask.shl(ShiftAmt));
+ ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne, TD,
+ Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
+ KnownOne = APIntOps::lshr(KnownOne, ShiftAmt);
+
+ APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+ if (KnownZero[BitWidth-ShiftAmt-1]) // New bits are known zero.
+ KnownZero |= HighBits;
+ else if (KnownOne[BitWidth-ShiftAmt-1]) // New bits are known one.
+ KnownOne |= HighBits;
+ return;
+ }
+ break;
+ case Instruction::Sub: {
+ if (ConstantInt *CLHS = dyn_cast<ConstantInt>(I->getOperand(0))) {
+      // We know that the top bits of C-X are clear if X contains fewer bits
+ // than C (i.e. no wrap-around can happen). For example, 20-X is
+ // positive if we can prove that X is >= 0 and < 16.
+ if (!CLHS->getValue().isNegative()) {
+ unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros();
+ // NLZ can't be BitWidth with no sign bit
+ APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
+ ComputeMaskedBits(I->getOperand(1), MaskV, KnownZero2, KnownOne2,
+ TD, Depth+1);
+
+ // If all of the MaskV bits are known to be zero, then we know the
+ // output top bits are zero, because we now know that the output is
+ // from [0-C].
+ if ((KnownZero2 & MaskV) == MaskV) {
+ unsigned NLZ2 = CLHS->getValue().countLeadingZeros();
+ // Top bits known zero.
+ KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask;
+ }
+ }
+ }
+ }
+ // fall through
+ case Instruction::Add: {
+    // If one of the operands has trailing zeros, then the bits that the
+ // other operand has in those bit positions will be preserved in the
+ // result. For an add, this works with either operand. For a subtract,
+ // this only works if the known zeros are in the right operand.
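+    // For example (a sketch): in (X << 4) + Y, the low four bits of the
+    // result are exactly the low four bits of Y, since no carries can be
+    // generated below bit 4.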
+ APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+ APInt Mask2 = APInt::getLowBitsSet(BitWidth,
+ BitWidth - Mask.countLeadingZeros());
+ ComputeMaskedBits(I->getOperand(0), Mask2, LHSKnownZero, LHSKnownOne, TD,
+ Depth+1);
+ assert((LHSKnownZero & LHSKnownOne) == 0 &&
+ "Bits known to be one AND zero?");
+ unsigned LHSKnownZeroOut = LHSKnownZero.countTrailingOnes();
+
+ ComputeMaskedBits(I->getOperand(1), Mask2, KnownZero2, KnownOne2, TD,
+ Depth+1);
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ unsigned RHSKnownZeroOut = KnownZero2.countTrailingOnes();
+
+ // Determine which operand has more trailing zeros, and use that
+ // many bits from the other operand.
+ if (LHSKnownZeroOut > RHSKnownZeroOut) {
+ if (getOpcode(I) == Instruction::Add) {
+ APInt Mask = APInt::getLowBitsSet(BitWidth, LHSKnownZeroOut);
+ KnownZero |= KnownZero2 & Mask;
+ KnownOne |= KnownOne2 & Mask;
+ } else {
+ // If the known zeros are in the left operand for a subtract,
+ // fall back to the minimum known zeros in both operands.
+ KnownZero |= APInt::getLowBitsSet(BitWidth,
+ std::min(LHSKnownZeroOut,
+ RHSKnownZeroOut));
+ }
+ } else if (RHSKnownZeroOut >= LHSKnownZeroOut) {
+ APInt Mask = APInt::getLowBitsSet(BitWidth, RHSKnownZeroOut);
+ KnownZero |= LHSKnownZero & Mask;
+ KnownOne |= LHSKnownOne & Mask;
+ }
+ return;
+ }
+ case Instruction::SRem:
+ if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ APInt RA = Rem->getValue();
+ if (RA.isPowerOf2() || (-RA).isPowerOf2()) {
+ APInt LowBits = RA.isStrictlyPositive() ? (RA - 1) : ~RA;
+ APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
+ ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, TD,
+ Depth+1);
+
+ // If the sign bit of the first operand is zero, the sign bit of
+ // the result is zero. If the first operand has no one bits below
+ // the second operand's single 1 bit, its sign will be zero.
+ if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
+ KnownZero2 |= ~LowBits;
+
+ KnownZero |= KnownZero2 & Mask;
+
+        assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ }
+ }
+ break;
+ case Instruction::URem: {
+ if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ APInt RA = Rem->getValue();
+ if (RA.isPowerOf2()) {
+ APInt LowBits = (RA - 1);
+ APInt Mask2 = LowBits & Mask;
+ KnownZero |= ~LowBits & Mask;
+ ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne, TD,
+ Depth+1);
+        assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ break;
+ }
+ }
+
+ // Since the result is less than or equal to either operand, any leading
+ // zero bits in either operand must also exist in the result.
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(I->getOperand(0), AllOnes, KnownZero, KnownOne,
+ TD, Depth+1);
+ ComputeMaskedBits(I->getOperand(1), AllOnes, KnownZero2, KnownOne2,
+ TD, Depth+1);
+
+ unsigned Leaders = std::max(KnownZero.countLeadingOnes(),
+ KnownZero2.countLeadingOnes());
+ KnownOne.clear();
+ KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & Mask;
+ break;
+ }
+
+ case Instruction::Alloca:
+ case Instruction::Malloc: {
+ AllocationInst *AI = cast<AllocationInst>(V);
+ unsigned Align = AI->getAlignment();
+ if (Align == 0 && TD) {
+ if (isa<AllocaInst>(AI))
+ Align = TD->getABITypeAlignment(AI->getType()->getElementType());
+ else if (isa<MallocInst>(AI)) {
+ // Malloc returns maximally aligned memory.
+ Align = TD->getABITypeAlignment(AI->getType()->getElementType());
+ Align =
+ std::max(Align,
+ (unsigned)TD->getABITypeAlignment(Type::DoubleTy));
+ Align =
+ std::max(Align,
+ (unsigned)TD->getABITypeAlignment(Type::Int64Ty));
+ }
+ }
+
+ if (Align > 0)
+ KnownZero = Mask & APInt::getLowBitsSet(BitWidth,
+ CountTrailingZeros_32(Align));
+ break;
+ }
+ case Instruction::GetElementPtr: {
+ // Analyze all of the subscripts of this getelementptr instruction
+ // to determine if we can prove known low zero bits.
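+    // For example (a sketch): a gep indexing an array of i32 scales the
+    // index by 4, so with an 8-byte-aligned base pointer and an unknown
+    // index, at least two trailing zero bits survive.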
+ APInt LocalMask = APInt::getAllOnesValue(BitWidth);
+ APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0);
+ ComputeMaskedBits(I->getOperand(0), LocalMask,
+ LocalKnownZero, LocalKnownOne, TD, Depth+1);
+ unsigned TrailZ = LocalKnownZero.countTrailingOnes();
+
+ gep_type_iterator GTI = gep_type_begin(I);
+ for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
+ Value *Index = I->getOperand(i);
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ // Handle struct member offset arithmetic.
+ if (!TD) return;
+ const StructLayout *SL = TD->getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+ uint64_t Offset = SL->getElementOffset(Idx);
+ TrailZ = std::min(TrailZ,
+ CountTrailingZeros_64(Offset));
+ } else {
+ // Handle array index arithmetic.
+ const Type *IndexedTy = GTI.getIndexedType();
+ if (!IndexedTy->isSized()) return;
+ unsigned GEPOpiBits = Index->getType()->getPrimitiveSizeInBits();
+ uint64_t TypeSize = TD ? TD->getTypeAllocSize(IndexedTy) : 1;
+ LocalMask = APInt::getAllOnesValue(GEPOpiBits);
+ LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0);
+ ComputeMaskedBits(Index, LocalMask,
+ LocalKnownZero, LocalKnownOne, TD, Depth+1);
+ TrailZ = std::min(TrailZ,
+ unsigned(CountTrailingZeros_64(TypeSize) +
+ LocalKnownZero.countTrailingOnes()));
+ }
+ }
+
+ KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) & Mask;
+ break;
+ }
+ case Instruction::PHI: {
+ PHINode *P = cast<PHINode>(I);
+ // Handle the case of a simple two-predecessor recurrence PHI.
+ // There's a lot more that could theoretically be done here, but
+ // this is sufficient to catch some interesting cases.
+ if (P->getNumIncomingValues() == 2) {
+ for (unsigned i = 0; i != 2; ++i) {
+ Value *L = P->getIncomingValue(i);
+ Value *R = P->getIncomingValue(!i);
+ User *LU = dyn_cast<User>(L);
+ if (!LU)
+ continue;
+ unsigned Opcode = getOpcode(LU);
+ // Check for operations that have the property that if
+ // both their operands have low zero bits, the result
+ // will have low zero bits.
+ if (Opcode == Instruction::Add ||
+ Opcode == Instruction::Sub ||
+ Opcode == Instruction::And ||
+ Opcode == Instruction::Or ||
+ Opcode == Instruction::Mul) {
+ Value *LL = LU->getOperand(0);
+ Value *LR = LU->getOperand(1);
+ // Find a recurrence.
+ if (LL == I)
+ L = LR;
+ else if (LR == I)
+ L = LL;
+ else
+ break;
+ // Ok, we have a PHI of the form L op= R. Check for low
+ // zero bits.
+ APInt Mask2 = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(R, Mask2, KnownZero2, KnownOne2, TD, Depth+1);
+ Mask2 = APInt::getLowBitsSet(BitWidth,
+ KnownZero2.countTrailingOnes());
+
+ // We need to take the minimum number of known bits
+ APInt KnownZero3(KnownZero), KnownOne3(KnownOne);
+ ComputeMaskedBits(L, Mask2, KnownZero3, KnownOne3, TD, Depth+1);
+
+ KnownZero = Mask &
+ APInt::getLowBitsSet(BitWidth,
+ std::min(KnownZero2.countTrailingOnes(),
+ KnownZero3.countTrailingOnes()));
+ break;
+ }
+ }
+ }
+
+    // Otherwise take the intersection of the known bit sets of the operands,
+ // taking conservative care to avoid excessive recursion.
+ if (Depth < MaxDepth - 1 && !KnownZero && !KnownOne) {
+ KnownZero = APInt::getAllOnesValue(BitWidth);
+ KnownOne = APInt::getAllOnesValue(BitWidth);
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
+ // Skip direct self references.
+ if (P->getIncomingValue(i) == P) continue;
+
+ KnownZero2 = APInt(BitWidth, 0);
+ KnownOne2 = APInt(BitWidth, 0);
+ // Recurse, but cap the recursion to one level, because we don't
+ // want to waste time spinning around in loops.
+ ComputeMaskedBits(P->getIncomingValue(i), KnownZero | KnownOne,
+ KnownZero2, KnownOne2, TD, MaxDepth-1);
+ KnownZero &= KnownZero2;
+ KnownOne &= KnownOne2;
+ // If all bits have been ruled out, there's no need to check
+ // more operands.
+ if (!KnownZero && !KnownOne)
+ break;
+ }
+ }
+ break;
+ }
+ case Instruction::Call:
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::ctpop:
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz: {
+ unsigned LowBits = Log2_32(BitWidth)+1;
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+ break;
+ }
+ }
+ }
+ break;
+ }
+}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
+/// this predicate to simplify operations downstream. Mask is known to be zero
+/// for bits that V cannot have.
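+///
+/// For example, MaskedValueIsZero(V, APInt(32, 7)) returns true when the low
+/// three bits of V are known clear; this is the typical shape of an
+/// "is this value 8-byte aligned?" query.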
+bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
+ TargetData *TD, unsigned Depth) {
+ APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0);
+ ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ return (KnownZero & Mask) == Mask;
+}
+
+
+
+/// ComputeNumSignBits - Return the number of times the sign bit of the
+/// register is replicated into the other bits. We know that at least 1 bit
+/// is always equal to the sign bit (itself), but other cases can give us
+/// information. For example, immediately after an "ashr X, 2", we know that
+/// the top 3 bits are all equal to each other, so we return 3.
+///
+/// 'V' must have a scalar integer type.
+///
+unsigned llvm::ComputeNumSignBits(Value *V, TargetData *TD, unsigned Depth) {
+ const IntegerType *Ty = cast<IntegerType>(V->getType());
+ unsigned TyBits = Ty->getBitWidth();
+ unsigned Tmp, Tmp2;
+ unsigned FirstAnswer = 1;
+
+ // Note that ConstantInt is handled by the general ComputeMaskedBits case
+ // below.
+
+ if (Depth == 6)
+ return 1; // Limit search depth.
+
+ User *U = dyn_cast<User>(V);
+ switch (getOpcode(V)) {
+ default: break;
+ case Instruction::SExt:
+ Tmp = TyBits-cast<IntegerType>(U->getOperand(0)->getType())->getBitWidth();
+ return ComputeNumSignBits(U->getOperand(0), TD, Depth+1) + Tmp;
+
+ case Instruction::AShr:
+ Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+ // ashr X, C -> adds C sign bits.
+ if (ConstantInt *C = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ Tmp += C->getZExtValue();
+ if (Tmp > TyBits) Tmp = TyBits;
+ }
+ return Tmp;
+ case Instruction::Shl:
+ if (ConstantInt *C = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ // shl destroys sign bits.
+ Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+ if (C->getZExtValue() >= TyBits || // Bad shift.
+ C->getZExtValue() >= Tmp) break; // Shifted all sign bits out.
+ return Tmp - C->getZExtValue();
+ }
+ break;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: // NOT is handled here.
+ // Logical binary ops preserve the number of sign bits at the worst.
+ Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+ if (Tmp != 1) {
+ Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+ FirstAnswer = std::min(Tmp, Tmp2);
+ // We computed what we know about the sign bits as our first
+ // answer. Now proceed to the generic code that uses
+ // ComputeMaskedBits, and pick whichever answer is better.
+ }
+ break;
+
+ case Instruction::Select:
+ Tmp = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(U->getOperand(2), TD, Depth+1);
+ return std::min(Tmp, Tmp2);
+
+ case Instruction::Add:
+ // Add can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+
+ // Special case decrementing a value (ADD X, -1):
+ if (ConstantInt *CRHS = dyn_cast<ConstantInt>(U->getOperand(1)))
+ if (CRHS->isAllOnesValue()) {
+ APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
+ APInt Mask = APInt::getAllOnesValue(TyBits);
+ ComputeMaskedBits(U->getOperand(0), Mask, KnownZero, KnownOne, TD,
+ Depth+1);
+
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((KnownZero | APInt(TyBits, 1)) == Mask)
+ return TyBits;
+
+ // If we are subtracting one from a positive number, there is no carry
+ // out of the result.
+ if (KnownZero.isNegative())
+ return Tmp;
+ }
+
+ Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+ if (Tmp2 == 1) return 1;
+ return std::min(Tmp, Tmp2)-1;
+ break;
+
+ case Instruction::Sub:
+ Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+ if (Tmp2 == 1) return 1;
+
+ // Handle NEG.
+ if (ConstantInt *CLHS = dyn_cast<ConstantInt>(U->getOperand(0)))
+ if (CLHS->isNullValue()) {
+ APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
+ APInt Mask = APInt::getAllOnesValue(TyBits);
+ ComputeMaskedBits(U->getOperand(1), Mask, KnownZero, KnownOne,
+ TD, Depth+1);
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((KnownZero | APInt(TyBits, 1)) == Mask)
+ return TyBits;
+
+ // If the input is known to be positive (the sign bit is known clear),
+ // the output of the NEG has the same number of sign bits as the input.
+ if (KnownZero.isNegative())
+ return Tmp2;
+
+ // Otherwise, we treat this like a SUB.
+ }
+
+ // Sub can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ return std::min(Tmp, Tmp2)-1;
+ break;
+ case Instruction::Trunc:
+ // FIXME: it's tricky to do anything useful for this, but it is an important
+ // case for targets like X86.
+ break;
+ }
+
+ // Finally, if we can prove that the top bits of the result are 0's or 1's,
+ // use this information.
+ APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
+ APInt Mask = APInt::getAllOnesValue(TyBits);
+ ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth);
+
+ if (KnownZero.isNegative()) { // sign bit is 0
+ Mask = KnownZero;
+ } else if (KnownOne.isNegative()) { // sign bit is 1;
+ Mask = KnownOne;
+ } else {
+ // Nothing known.
+ return FirstAnswer;
+ }
+
+ // Okay, we know that the sign bit in Mask is set. Use CLZ to determine
+ // the number of identical bits in the top of the input value.
+ Mask = ~Mask;
+ Mask <<= Mask.getBitWidth()-TyBits;
+ // Return # leading zeros. We use 'min' here in case Val was zero before
+ // shifting. We don't want to return '64' as for an i32 "0".
+ return std::max(FirstAnswer, std::min(TyBits, Mask.countLeadingZeros()));
+}
+
+/// CannotBeNegativeZero - Return true if we can prove that the specified FP
+/// value is never equal to -0.0.
+///
+/// NOTE: this function will need to be revisited when we support non-default
+/// rounding modes!
+///
+bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) {
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
+ return !CFP->getValueAPF().isNegZero();
+
+ if (Depth == 6)
+ return 1; // Limit search depth.
+
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (I == 0) return false;
+
+ // (add x, 0.0) is guaranteed to return +0.0, not -0.0.
+ if (I->getOpcode() == Instruction::Add &&
+ isa<ConstantFP>(I->getOperand(1)) &&
+ cast<ConstantFP>(I->getOperand(1))->isNullValue())
+ return true;
+
+ // sitofp and uitofp turn into +0.0 for zero.
+ if (isa<SIToFPInst>(I) || isa<UIToFPInst>(I))
+ return true;
+
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ // sqrt(-0.0) = -0.0, no other negative results are possible.
+ if (II->getIntrinsicID() == Intrinsic::sqrt)
+ return CannotBeNegativeZero(II->getOperand(1), Depth+1);
+
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (const Function *F = CI->getCalledFunction()) {
+ if (F->isDeclaration()) {
+ switch (F->getNameLen()) {
+ case 3: // abs(x) != -0.0
+ if (!strcmp(F->getNameStart(), "abs")) return true;
+ break;
+ case 4: // abs[lf](x) != -0.0
+ if (!strcmp(F->getNameStart(), "absf")) return true;
+ if (!strcmp(F->getNameStart(), "absl")) return true;
+ break;
+ }
+ }
+ }
+
+ return false;
+}
+
+// This is the recursive version of BuildSubAggregate. It takes a few different
+// arguments. Idxs is the index within the nested struct From that we are
+// looking at now (which is of type IndexedType). IdxSkip is the number of
+// indices from Idxs that should be left out when inserting into the resulting
+// struct. To is the result struct built so far, new insertvalue instructions
+// build on that.
+Value *BuildSubAggregate(Value *From, Value* To, const Type *IndexedType,
+ SmallVector<unsigned, 10> &Idxs,
+ unsigned IdxSkip,
+ Instruction *InsertBefore) {
+ const llvm::StructType *STy = llvm::dyn_cast<llvm::StructType>(IndexedType);
+ if (STy) {
+ // Save the original To argument so we can modify it
+ Value *OrigTo = To;
+ // General case, the type indexed by Idxs is a struct
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ // Process each struct element recursively
+ Idxs.push_back(i);
+ Value *PrevTo = To;
+ To = BuildSubAggregate(From, To, STy->getElementType(i), Idxs, IdxSkip,
+ InsertBefore);
+ Idxs.pop_back();
+ if (!To) {
+ // Couldn't find any inserted value for this index? Cleanup
+ while (PrevTo != OrigTo) {
+ InsertValueInst* Del = cast<InsertValueInst>(PrevTo);
+ PrevTo = Del->getAggregateOperand();
+ Del->eraseFromParent();
+ }
+ // Stop processing elements
+ break;
+ }
+ }
+    // If we successfully found a value for each of our subaggregates
+ if (To)
+ return To;
+ }
+  // Base case, the type indexed by Idxs is not a struct, or not all of
+ // the struct's elements had a value that was inserted directly. In the latter
+ // case, perhaps we can't determine each of the subelements individually, but
+ // we might be able to find the complete struct somewhere.
+
+ // Find the value that is at that particular spot
+ Value *V = FindInsertedValue(From, Idxs.begin(), Idxs.end());
+
+ if (!V)
+ return NULL;
+
+  // Insert the value in the new (sub) aggregate
+ return llvm::InsertValueInst::Create(To, V, Idxs.begin() + IdxSkip,
+ Idxs.end(), "tmp", InsertBefore);
+}
+
+// This helper takes a nested struct and extracts a part of it (which is again a
+// struct) into a new value. For example, given the struct:
+// { a, { b, { c, d }, e } }
+// and the indices "1, 1" this returns
+// { c, d }.
+//
+// It does this by inserting an insertvalue for each element in the resulting
+// struct, as opposed to just inserting a single struct. This will only work
+// if each of the elements of the substruct is known (i.e., inserted into
+// From by an insertvalue instruction somewhere).
+//
+// All inserted insertvalue instructions are inserted before InsertBefore
+Value *BuildSubAggregate(Value *From, const unsigned *idx_begin,
+ const unsigned *idx_end, Instruction *InsertBefore) {
+ assert(InsertBefore && "Must have someplace to insert!");
+ const Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(),
+ idx_begin,
+ idx_end);
+ Value *To = UndefValue::get(IndexedType);
+ SmallVector<unsigned, 10> Idxs(idx_begin, idx_end);
+ unsigned IdxSkip = Idxs.size();
+
+ return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore);
+}
+
+/// FindInsertedValue - Given an aggregate and a sequence of indices, see if
+/// the scalar value indexed is already around as a register, for example if
+/// it were inserted directly into the aggregate.
+///
+/// If InsertBefore is not null, this function will duplicate (modified)
+/// insertvalues when a part of a nested struct is extracted.
+Value *llvm::FindInsertedValue(Value *V, const unsigned *idx_begin,
+ const unsigned *idx_end, Instruction *InsertBefore) {
+ // Nothing to index? Just return V then (this is useful at the end of our
+ // recursion)
+ if (idx_begin == idx_end)
+ return V;
+ // We have indices, so V should have an indexable type
+ assert((isa<StructType>(V->getType()) || isa<ArrayType>(V->getType()))
+ && "Not looking at a struct or array?");
+ assert(ExtractValueInst::getIndexedType(V->getType(), idx_begin, idx_end)
+ && "Invalid indices for type?");
+ const CompositeType *PTy = cast<CompositeType>(V->getType());
+
+ if (isa<UndefValue>(V))
+ return UndefValue::get(ExtractValueInst::getIndexedType(PTy,
+ idx_begin,
+ idx_end));
+ else if (isa<ConstantAggregateZero>(V))
+ return Constant::getNullValue(ExtractValueInst::getIndexedType(PTy,
+ idx_begin,
+ idx_end));
+ else if (Constant *C = dyn_cast<Constant>(V)) {
+ if (isa<ConstantArray>(C) || isa<ConstantStruct>(C))
+ // Recursively process this constant
+ return FindInsertedValue(C->getOperand(*idx_begin), idx_begin + 1, idx_end,
+ InsertBefore);
+ } else if (InsertValueInst *I = dyn_cast<InsertValueInst>(V)) {
+ // Loop the indices for the insertvalue instruction in parallel with the
+ // requested indices
+ const unsigned *req_idx = idx_begin;
+ for (const unsigned *i = I->idx_begin(), *e = I->idx_end();
+ i != e; ++i, ++req_idx) {
+ if (req_idx == idx_end) {
+ if (InsertBefore)
+ // The requested index identifies a part of a nested aggregate. Handle
+ // this specially. For example,
+ // %A = insertvalue { i32, {i32, i32 } } undef, i32 10, 1, 0
+ // %B = insertvalue { i32, {i32, i32 } } %A, i32 11, 1, 1
+ // %C = extractvalue {i32, { i32, i32 } } %B, 1
+ // This can be changed into
+ // %A = insertvalue {i32, i32 } undef, i32 10, 0
+ // %C = insertvalue {i32, i32 } %A, i32 11, 1
+ // which allows the unused 0,0 element from the nested struct to be
+ // removed.
+ return BuildSubAggregate(V, idx_begin, req_idx, InsertBefore);
+ else
+ // We can't handle this without inserting insertvalues
+ return 0;
+ }
+
+      // This insertvalue inserts something other than what we are looking for.
+      // See if the (aggregate) value it was inserted into has the value we are
+      // looking for.
+ if (*req_idx != *i)
+ return FindInsertedValue(I->getAggregateOperand(), idx_begin, idx_end,
+ InsertBefore);
+ }
+ // If we end up here, the indices of the insertvalue match with those
+ // requested (though possibly only partially). Now we recursively look at
+ // the inserted value, passing any remaining indices.
+ return FindInsertedValue(I->getInsertedValueOperand(), req_idx, idx_end,
+ InsertBefore);
+ } else if (ExtractValueInst *I = dyn_cast<ExtractValueInst>(V)) {
+    // If we're extracting a value from an aggregate that was extracted from
+ // something else, we can extract from that something else directly instead.
+ // However, we will need to chain I's indices with the requested indices.
+
+ // Calculate the number of indices required
+ unsigned size = I->getNumIndices() + (idx_end - idx_begin);
+ // Allocate some space to put the new indices in
+ SmallVector<unsigned, 5> Idxs;
+ Idxs.reserve(size);
+ // Add indices from the extract value instruction
+ for (const unsigned *i = I->idx_begin(), *e = I->idx_end();
+ i != e; ++i)
+ Idxs.push_back(*i);
+
+ // Add requested indices
+ for (const unsigned *i = idx_begin, *e = idx_end; i != e; ++i)
+ Idxs.push_back(*i);
+
+ assert(Idxs.size() == size
+ && "Number of indices added not correct?");
+
+ return FindInsertedValue(I->getAggregateOperand(), Idxs.begin(), Idxs.end(),
+ InsertBefore);
+ }
+  // Otherwise, we don't know (such as extracting from a function return
+  // value or a load instruction).
+ return 0;
+}
+
+/// GetConstantStringInfo - This function extracts the bytes of the
+/// null-terminated C string pointed to by V. If successful, it returns true
+/// and returns the string in Str. If unsuccessful, it returns false.
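+///
+/// For example (a sketch): for a global @s = constant [4 x i8] c"abc\00",
+/// calling this with Offset 0 and StopAtNul true returns true and sets
+/// Str to "abc".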
+bool llvm::GetConstantStringInfo(Value *V, std::string &Str, uint64_t Offset,
+ bool StopAtNul) {
+  // If V is NULL then return false.
+ if (V == NULL) return false;
+
+ // Look through bitcast instructions.
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(V))
+ return GetConstantStringInfo(BCI->getOperand(0), Str, Offset, StopAtNul);
+
+  // If the value is neither a GEP instruction nor a constant expression that
+  // performs a GEP, then return false, because a ConstantArray can't be
+  // reached any other way.
+ User *GEP = 0;
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(V)) {
+ GEP = GEPI;
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->getOpcode() == Instruction::BitCast)
+ return GetConstantStringInfo(CE->getOperand(0), Str, Offset, StopAtNul);
+ if (CE->getOpcode() != Instruction::GetElementPtr)
+ return false;
+ GEP = CE;
+ }
+
+ if (GEP) {
+ // Make sure the GEP has exactly three arguments.
+ if (GEP->getNumOperands() != 3)
+ return false;
+
+    // Make sure the GEP's pointer operand is a pointer to an array of i8.
+ const PointerType *PT = cast<PointerType>(GEP->getOperand(0)->getType());
+ const ArrayType *AT = dyn_cast<ArrayType>(PT->getElementType());
+ if (AT == 0 || AT->getElementType() != Type::Int8Ty)
+ return false;
+
+ // Check to make sure that the first operand of the GEP is an integer and
+ // has value 0 so that we are sure we're indexing into the initializer.
+ ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
+ if (FirstIdx == 0 || !FirstIdx->isZero())
+ return false;
+
+ // If the second index isn't a ConstantInt, then this is a variable index
+ // into the array. If this occurs, we can't say anything meaningful about
+ // the string.
+ uint64_t StartIdx = 0;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(2)))
+ StartIdx = CI->getZExtValue();
+ else
+ return false;
+ return GetConstantStringInfo(GEP->getOperand(0), Str, StartIdx+Offset,
+ StopAtNul);
+ }
+
+  // Whether reached directly or through a GEP (constant expression or
+  // instruction), V must be a global variable that is a constant and has an
+  // initializer. The referenced constant initializer is the array that we'll
+  // use for the optimization.
+ GlobalVariable* GV = dyn_cast<GlobalVariable>(V);
+ if (!GV || !GV->isConstant() || !GV->hasInitializer())
+ return false;
+ Constant *GlobalInit = GV->getInitializer();
+
+ // Handle the ConstantAggregateZero case
+ if (isa<ConstantAggregateZero>(GlobalInit)) {
+ // This is a degenerate case. The initializer is constant zero so the
+ // length of the string must be zero.
+ Str.clear();
+ return true;
+ }
+
+  // Must be a ConstantArray
+ ConstantArray *Array = dyn_cast<ConstantArray>(GlobalInit);
+ if (Array == 0 || Array->getType()->getElementType() != Type::Int8Ty)
+ return false;
+
+ // Get the number of elements in the array
+ uint64_t NumElts = Array->getType()->getNumElements();
+
+ if (Offset > NumElts)
+ return false;
+
+ // Traverse the constant array from 'Offset' which is the place the GEP refers
+ // to in the array.
+ Str.reserve(NumElts-Offset);
+ for (unsigned i = Offset; i != NumElts; ++i) {
+ Constant *Elt = Array->getOperand(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI) // This array isn't suitable, non-int initializer.
+ return false;
+ if (StopAtNul && CI->isZero())
+ return true; // we found end of string, success!
+ Str += (char)CI->getZExtValue();
+ }
+
+ // The array isn't null terminated, but maybe this is a memcpy, not a strcpy.
+ return true;
+}
diff --git a/lib/Archive/Archive.cpp b/lib/Archive/Archive.cpp
new file mode 100644
index 0000000..c6c89d2
--- /dev/null
+++ b/lib/Archive/Archive.cpp
@@ -0,0 +1,266 @@
+//===-- Archive.cpp - Generic LLVM archive functions ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the Archive and ArchiveMember
+// classes that is common to both reading and writing archives.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ArchiveInternals.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/System/Process.h"
+#include <memory>
+#include <cstring>
+using namespace llvm;
+
+// getMemberSize - compute the actual physical size of the file member as seen
+// on disk. This isn't the size of the member's payload. Use getSize() for
+// that.
+unsigned
+ArchiveMember::getMemberSize() const {
+  // Basically it's the file size plus the header size
+ unsigned result = info.fileSize + sizeof(ArchiveMemberHeader);
+
+ // If it has a long filename, include the name length
+ if (hasLongFilename())
+ result += path.toString().length() + 1;
+
+  // If it's now odd-length, include the padding byte
+  if (result % 2 != 0)
+ result++;
+
+ return result;
+}
+
+// This default constructor is only used by the ilist when it creates its
+// sentinel node. We give it specific static values to make it stand out a bit.
+ArchiveMember::ArchiveMember()
+ : parent(0), path("--invalid--"), flags(0), data(0)
+{
+ info.user = sys::Process::GetCurrentUserId();
+ info.group = sys::Process::GetCurrentGroupId();
+ info.mode = 0777;
+ info.fileSize = 0;
+ info.modTime = sys::TimeValue::now();
+}
+
+// This is the constructor that the Archive class uses when it is building or
+// reading an archive. It just defaults a few things and ensures the parent is
+// set for the iplist. The Archive class fills in the ArchiveMember's data.
+// This is required because correctly setting the data may depend on other
+// things in the Archive.
+ArchiveMember::ArchiveMember(Archive* PAR)
+ : parent(PAR), path(), flags(0), data(0)
+{
+}
+
+// This method allows an ArchiveMember to be replaced with the data for a
+// different file, presumably as an update to the member. It also makes sure
+// the flags are reset correctly.
+bool ArchiveMember::replaceWith(const sys::Path& newFile, std::string* ErrMsg) {
+ if (!newFile.exists()) {
+ if (ErrMsg)
+ *ErrMsg = "Can not replace an archive member with a non-existent file";
+ return true;
+ }
+
+ data = 0;
+ path = newFile;
+
+ // SVR4 symbol tables have an empty name
+ if (path.toString() == ARFILE_SVR4_SYMTAB_NAME)
+ flags |= SVR4SymbolTableFlag;
+ else
+ flags &= ~SVR4SymbolTableFlag;
+
+ // BSD4.4 symbol tables have a special name
+ if (path.toString() == ARFILE_BSD4_SYMTAB_NAME)
+ flags |= BSD4SymbolTableFlag;
+ else
+ flags &= ~BSD4SymbolTableFlag;
+
+ // LLVM symbol tables have a very specific name
+ if (path.toString() == ARFILE_LLVM_SYMTAB_NAME)
+ flags |= LLVMSymbolTableFlag;
+ else
+ flags &= ~LLVMSymbolTableFlag;
+
+ // String table name
+ if (path.toString() == ARFILE_STRTAB_NAME)
+ flags |= StringTableFlag;
+ else
+ flags &= ~StringTableFlag;
+
+ // If it has a slash then it has a path
+ bool hasSlash = path.toString().find('/') != std::string::npos;
+ if (hasSlash)
+ flags |= HasPathFlag;
+ else
+ flags &= ~HasPathFlag;
+
+ // If it has a slash or its over 15 chars then its a long filename format
+ if (hasSlash || path.toString().length() > 15)
+ flags |= HasLongFilenameFlag;
+ else
+ flags &= ~HasLongFilenameFlag;
+
+ // Get the signature and status info
+ const char* signature = (const char*) data;
+ std::string magic;
+ if (!signature) {
+ path.getMagicNumber(magic,4);
+ signature = magic.c_str();
+ const sys::FileStatus *FSinfo = path.getFileStatus(false, ErrMsg);
+ if (FSinfo)
+ info = *FSinfo;
+ else
+ return true;
+ }
+
+  // Determine what kind of file it is.
+  switch (sys::IdentifyFileType(signature,4)) {
+    case sys::Bitcode_FileType:
+      flags |= BitcodeFlag;
+      break;
+    default:
+      flags &= ~BitcodeFlag;
+      break;
+  }
+ return false;
+}
+
+// Archive constructor - this is the only constructor that gets used for the
+// Archive class. Everything else (default,copy) is deprecated. This just
+// initializes and maps the file into memory, if requested.
+Archive::Archive(const sys::Path& filename)
+ : archPath(filename), members(), mapfile(0), base(0), symTab(), strtab(),
+ symTabSize(0), firstFileOffset(0), modules(), foreignST(0) {
+}
+
+bool
+Archive::mapToMemory(std::string* ErrMsg) {
+ mapfile = MemoryBuffer::getFile(archPath.c_str(), ErrMsg);
+ if (mapfile == 0)
+ return true;
+ base = mapfile->getBufferStart();
+ return false;
+}
+
+void Archive::cleanUpMemory() {
+ // Shutdown the file mapping
+ delete mapfile;
+ mapfile = 0;
+ base = 0;
+
+ // Forget the entire symbol table
+ symTab.clear();
+ symTabSize = 0;
+
+ firstFileOffset = 0;
+
+ // Free the foreign symbol table member
+ if (foreignST) {
+ delete foreignST;
+ foreignST = 0;
+ }
+
+  // Delete any ModuleProviders and ArchiveMembers we've allocated as a result
+ // of symbol table searches.
+ for (ModuleMap::iterator I=modules.begin(), E=modules.end(); I != E; ++I ) {
+ delete I->second.first;
+ delete I->second.second;
+ }
+}
+
+// Archive destructor - just clean up memory
+Archive::~Archive() {
+ cleanUpMemory();
+}
+
+
+
+static void getSymbols(Module*M, std::vector<std::string>& symbols) {
+ // Loop over global variables
+  for (Module::global_iterator GI = M->global_begin(), GE = M->global_end();
+       GI != GE; ++GI)
+ if (!GI->isDeclaration() && !GI->hasLocalLinkage())
+ if (!GI->getName().empty())
+ symbols.push_back(GI->getName());
+
+ // Loop over functions
+ for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI)
+ if (!FI->isDeclaration() && !FI->hasLocalLinkage())
+ if (!FI->getName().empty())
+ symbols.push_back(FI->getName());
+
+ // Loop over aliases
+ for (Module::alias_iterator AI = M->alias_begin(), AE = M->alias_end();
+ AI != AE; ++AI) {
+ if (AI->hasName())
+ symbols.push_back(AI->getName());
+ }
+}
+
+// Get just the externally visible defined symbols from the bitcode
+bool llvm::GetBitcodeSymbols(const sys::Path& fName,
+ std::vector<std::string>& symbols,
+ std::string* ErrMsg) {
+ std::auto_ptr<MemoryBuffer> Buffer(
+ MemoryBuffer::getFileOrSTDIN(fName.c_str()));
+ if (!Buffer.get()) {
+ if (ErrMsg) *ErrMsg = "Could not open file '" + fName.toString() + "'";
+ return true;
+ }
+
+ ModuleProvider *MP = getBitcodeModuleProvider(Buffer.get(), ErrMsg);
+ if (!MP)
+ return true;
+
+ // Get the module from the provider
+ Module* M = MP->materializeModule();
+ if (M == 0) {
+ delete MP;
+ return true;
+ }
+
+ // Get the symbols
+ getSymbols(M, symbols);
+
+  // Done with the module. Returning true indicates an error in this API (as
+  // in the paths above), so success returns false.
+  delete MP;
+  return false;
+}
+
+ModuleProvider*
+llvm::GetBitcodeSymbols(const unsigned char *BufPtr, unsigned Length,
+ const std::string& ModuleID,
+ std::vector<std::string>& symbols,
+ std::string* ErrMsg) {
+ // Get the module provider
+  MemoryBuffer *Buffer =
+    MemoryBuffer::getNewMemBuffer(Length, ModuleID.c_str());
+ memcpy((char*)Buffer->getBufferStart(), BufPtr, Length);
+
+ ModuleProvider *MP = getBitcodeModuleProvider(Buffer, ErrMsg);
+ if (!MP)
+ return 0;
+
+ // Get the module from the provider
+ Module* M = MP->materializeModule();
+ if (M == 0) {
+ delete MP;
+ return 0;
+ }
+
+ // Get the symbols
+ getSymbols(M, symbols);
+
+ // Done with the module. Note that ModuleProvider will delete the
+  // Module when it is deleted. Also note that it's the caller's responsibility
+ // to delete the ModuleProvider.
+ return MP;
+}
diff --git a/lib/Archive/ArchiveInternals.h b/lib/Archive/ArchiveInternals.h
new file mode 100644
index 0000000..7ba3024
--- /dev/null
+++ b/lib/Archive/ArchiveInternals.h
@@ -0,0 +1,85 @@
+//===-- lib/Archive/ArchiveInternals.h -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Internal implementation header for LLVM Archive files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIB_ARCHIVE_ARCHIVEINTERNALS_H
+#define LIB_ARCHIVE_ARCHIVEINTERNALS_H
+
+#include "llvm/Bitcode/Archive.h"
+#include "llvm/System/TimeValue.h"
+#include "llvm/ADT/StringExtras.h"
+
+#include <cstring>
+
+#define ARFILE_MAGIC "!<arch>\n" ///< magic string
+#define ARFILE_MAGIC_LEN (sizeof(ARFILE_MAGIC)-1) ///< length of magic string
+#define ARFILE_SVR4_SYMTAB_NAME "/ " ///< SVR4 symtab entry name
+#define ARFILE_LLVM_SYMTAB_NAME "#_LLVM_SYM_TAB_#" ///< LLVM symtab entry name
+#define ARFILE_BSD4_SYMTAB_NAME "__.SYMDEF SORTED" ///< BSD4 symtab entry name
+#define ARFILE_STRTAB_NAME "// " ///< Name of string table
+#define ARFILE_PAD "\n" ///< inter-file align padding
+#define ARFILE_MEMBER_MAGIC "`\n" ///< fmag field magic #
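+
+// A bitcode archive is thus the 8-byte ARFILE_MAGIC string followed by the
+// members, each a 60-byte ArchiveMemberHeader plus its data, with a single
+// ARFILE_PAD byte after any member of odd length.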
+
+namespace llvm {
+
+ /// The ArchiveMemberHeader structure is used internally for bitcode
+ /// archives.
+ /// The header precedes each file member in the archive. This structure is
+ /// defined using character arrays for direct and correct interpretation
+  /// regardless of the endianness of the machine that produced it.
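+  /// On disk the header occupies 60 bytes: name[16], date[12], uid[6],
+  /// gid[6], mode[8], size[10] and fmag[2], all stored as blank-padded ASCII.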
+ /// @brief Archive File Member Header
+ class ArchiveMemberHeader {
+ /// @name Data
+ /// @{
+ public:
+ char name[16]; ///< Name of the file member.
+ char date[12]; ///< File date, decimal seconds since Epoch
+ char uid[6]; ///< user id in ASCII decimal
+ char gid[6]; ///< group id in ASCII decimal
+ char mode[8]; ///< file mode in ASCII octal
+ char size[10]; ///< file size in ASCII decimal
+    char fmag[2];  ///< Always contains ARFILE_MEMBER_MAGIC
+
+ /// @}
+ /// @name Methods
+ /// @{
+ public:
+ void init() {
+ memset(name,' ',16);
+ memset(date,' ',12);
+ memset(uid,' ',6);
+ memset(gid,' ',6);
+ memset(mode,' ',8);
+ memset(size,' ',10);
+ fmag[0] = '`';
+ fmag[1] = '\n';
+ }
+
+ bool checkSignature() {
+ return 0 == memcmp(fmag, ARFILE_MEMBER_MAGIC,2);
+ }
+ };
+
+ // Get just the externally visible defined symbols from the bitcode
+ bool GetBitcodeSymbols(const sys::Path& fName,
+ std::vector<std::string>& symbols,
+ std::string* ErrMsg);
+
+ ModuleProvider* GetBitcodeSymbols(const unsigned char*Buffer,unsigned Length,
+ const std::string& ModuleID,
+ std::vector<std::string>& symbols,
+ std::string* ErrMsg);
+}
+
+#endif
+
+// vim: sw=2 ai
diff --git a/lib/Archive/ArchiveReader.cpp b/lib/Archive/ArchiveReader.cpp
new file mode 100644
index 0000000..b07e884
--- /dev/null
+++ b/lib/Archive/ArchiveReader.cpp
@@ -0,0 +1,627 @@
+//===-- ArchiveReader.cpp - Read LLVM archive files -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Reads standard Unix archive files (.a) containing LLVM bitcode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ArchiveInternals.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Module.h"
+#include <cstdlib>
+#include <memory>
+using namespace llvm;
+
+/// Read a variable-bit-rate encoded unsigned integer
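+/// (VBR): each byte carries seven payload bits, least-significant group
+/// first, with the high bit set on every byte except the last. For example,
+/// 300 (0x12C) is the byte sequence 0xAC 0x02.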
+static inline unsigned readInteger(const char*&At, const char*End) {
+ unsigned Shift = 0;
+ unsigned Result = 0;
+
+ do {
+ if (At == End)
+ return Result;
+ Result |= (unsigned)((*At++) & 0x7F) << Shift;
+ Shift += 7;
+ } while (At[-1] & 0x80);
+ return Result;
+}
+
+// Completely parse the Archive's symbol table and populate symTab member var.
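+// The table is a sequence of (offset, length, name) records with both
+// integers VBR-encoded; e.g. a record for the symbol "main" in the member at
+// offset 68 is the six bytes 0x44 0x04 'm' 'a' 'i' 'n'.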
+bool
+Archive::parseSymbolTable(const void* data, unsigned size, std::string* error) {
+ const char* At = (const char*) data;
+ const char* End = At + size;
+ while (At < End) {
+ unsigned offset = readInteger(At, End);
+ if (At == End) {
+ if (error)
+ *error = "Ran out of data reading vbr_uint for symtab offset!";
+ return false;
+ }
+ unsigned length = readInteger(At, End);
+ if (At == End) {
+ if (error)
+ *error = "Ran out of data reading vbr_uint for symtab length!";
+ return false;
+ }
+ if (At + length > End) {
+ if (error)
+ *error = "Malformed symbol table: length not consistent with size";
+ return false;
+ }
+ // we don't care if it can't be inserted (duplicate entry)
+ symTab.insert(std::make_pair(std::string(At, length), offset));
+ At += length;
+ }
+ symTabSize = size;
+ return true;
+}
+
+// This method parses an ArchiveMemberHeader that is presumed to be pointed to
+// by At. The At pointer is updated to the byte just after the header, which
+// can be variable in size.
+ArchiveMember*
+Archive::parseMemberHeader(const char*& At, const char* End, std::string* error)
+{
+ if (At + sizeof(ArchiveMemberHeader) >= End) {
+ if (error)
+ *error = "Unexpected end of file";
+ return 0;
+ }
+
+ // Cast archive member header
+ ArchiveMemberHeader* Hdr = (ArchiveMemberHeader*)At;
+ At += sizeof(ArchiveMemberHeader);
+
+ // Extract the size and determine if the file is
+ // compressed or not (negative length).
+ int flags = 0;
+ int MemberSize = atoi(Hdr->size);
+ if (MemberSize < 0) {
+ flags |= ArchiveMember::CompressedFlag;
+ MemberSize = -MemberSize;
+ }
+
+ // Check the size of the member for sanity
+ if (At + MemberSize > End) {
+ if (error)
+ *error = "invalid member length in archive file";
+ return 0;
+ }
+
+ // Check the member signature
+ if (!Hdr->checkSignature()) {
+ if (error)
+ *error = "invalid file member signature";
+ return 0;
+ }
+
+  // Convert and check the member name.
+  // The empty name ('/' and 15 blanks) is for a foreign (non-LLVM) symbol
+  // table. The special name "//" and 14 blanks is for a string table, used
+  // for long file names. This library doesn't generate either of those but
+  // it will accept them. If the name starts with #1/ and the remainder is
+  // digits, then those digits specify the length of the name that is
+  // stored immediately following the header. The special name
+  // #_LLVM_SYM_TAB_# identifies the symbol table for LLVM bitcode.
+  // Anything else is a regular, short filename that is terminated with
+  // a '/' and blanks.
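+  // For example: "foo.bc/        " is a short name, "#1/14" is followed by a
+  // 14-character name at the start of the member's data, and "/123" refers
+  // to the name at offset 123 in the string table.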
+
+ std::string pathname;
+ switch (Hdr->name[0]) {
+ case '#':
+    if (Hdr->name[1] == '1' && Hdr->name[2] == '/') {
+      // The member uses the long file name (>15 chars) format that is
+      // standard for 4.4BSD and Mac OS X: the digits after #1/ give the
+      // length of the name, which occupies the first bytes of the member's
+      // data.
+ if (isdigit(Hdr->name[3])) {
+ unsigned len = atoi(&Hdr->name[3]);
+ pathname.assign(At, len);
+ At += len;
+ MemberSize -= len;
+ flags |= ArchiveMember::HasLongFilenameFlag;
+ } else {
+ if (error)
+ *error = "invalid long filename";
+ return 0;
+ }
+ } else if (Hdr->name[1] == '_' &&
+ (0 == memcmp(Hdr->name, ARFILE_LLVM_SYMTAB_NAME, 16))) {
+        // This is the LLVM symbol table member. Its name is always the
+        // fixed string #_LLVM_SYM_TAB_#.
+ pathname.assign(ARFILE_LLVM_SYMTAB_NAME);
+ flags |= ArchiveMember::LLVMSymbolTableFlag;
+ }
+ break;
+ case '/':
+ if (Hdr->name[1]== '/') {
+ if (0 == memcmp(Hdr->name, ARFILE_STRTAB_NAME, 16)) {
+ pathname.assign(ARFILE_STRTAB_NAME);
+ flags |= ArchiveMember::StringTableFlag;
+ } else {
+ if (error)
+ *error = "invalid string table name";
+ return 0;
+ }
+ } else if (Hdr->name[1] == ' ') {
+ if (0 == memcmp(Hdr->name, ARFILE_SVR4_SYMTAB_NAME, 16)) {
+ pathname.assign(ARFILE_SVR4_SYMTAB_NAME);
+ flags |= ArchiveMember::SVR4SymbolTableFlag;
+ } else {
+ if (error)
+ *error = "invalid SVR4 symbol table name";
+ return 0;
+ }
+ } else if (isdigit(Hdr->name[1])) {
+ unsigned index = atoi(&Hdr->name[1]);
+ if (index < strtab.length()) {
+ const char* namep = strtab.c_str() + index;
+ const char* endp = strtab.c_str() + strtab.length();
+ const char* p = namep;
+ const char* last_p = p;
+ while (p < endp) {
+ if (*p == '\n' && *last_p == '/') {
+ pathname.assign(namep, last_p - namep);
+ flags |= ArchiveMember::HasLongFilenameFlag;
+ break;
+ }
+ last_p = p;
+ p++;
+ }
+ if (p >= endp) {
+ if (error)
+ *error = "missing name termiantor in string table";
+ return 0;
+ }
+ } else {
+ if (error)
+ *error = "name index beyond string table";
+ return 0;
+ }
+ }
+ break;
+ case '_':
+ if (Hdr->name[1] == '_' &&
+ (0 == memcmp(Hdr->name, ARFILE_BSD4_SYMTAB_NAME, 16))) {
+ pathname.assign(ARFILE_BSD4_SYMTAB_NAME);
+ flags |= ArchiveMember::BSD4SymbolTableFlag;
+ break;
+ }
+ /* FALL THROUGH */
+
+ default:
+ char* slash = (char*) memchr(Hdr->name, '/', 16);
+ if (slash == 0)
+ slash = Hdr->name + 16;
+ pathname.assign(Hdr->name, slash - Hdr->name);
+ break;
+ }
+
+ // Determine if this is a bitcode file
+ switch (sys::IdentifyFileType(At, 4)) {
+ case sys::Bitcode_FileType:
+ flags |= ArchiveMember::BitcodeFlag;
+ break;
+ default:
+ flags &= ~ArchiveMember::BitcodeFlag;
+ break;
+ }
+
+ // Instantiate the ArchiveMember to be filled
+ ArchiveMember* member = new ArchiveMember(this);
+
+ // Fill in fields of the ArchiveMember
+ member->parent = this;
+ member->path.set(pathname);
+ member->info.fileSize = MemberSize;
+ member->info.modTime.fromEpochTime(atoi(Hdr->date));
+ unsigned int mode;
+ sscanf(Hdr->mode, "%o", &mode);
+ member->info.mode = mode;
+ member->info.user = atoi(Hdr->uid);
+ member->info.group = atoi(Hdr->gid);
+ member->flags = flags;
+ member->data = At;
+
+ return member;
+}
+
+bool
+Archive::checkSignature(std::string* error) {
+ // Check the magic string at file's header
+ if (mapfile->getBufferSize() < 8 || memcmp(base, ARFILE_MAGIC, 8)) {
+ if (error)
+ *error = "invalid signature for an archive file";
+ return false;
+ }
+ return true;
+}
+
+// This function loads the entire archive and fully populates its ilist with
+// the members of the archive file. This is typically used in preparation for
+// editing the contents of the archive.
+bool
+Archive::loadArchive(std::string* error) {
+
+ // Set up parsing
+ members.clear();
+ symTab.clear();
+ const char *At = base;
+ const char *End = mapfile->getBufferEnd();
+
+ if (!checkSignature(error))
+ return false;
+
+ At += 8; // Skip the magic string.
+
+ bool seenSymbolTable = false;
+ bool foundFirstFile = false;
+ while (At < End) {
+ // parse the member header
+ const char* Save = At;
+ ArchiveMember* mbr = parseMemberHeader(At, End, error);
+ if (!mbr)
+ return false;
+
+ // check if this is the foreign symbol table
+ if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
+ // We just save this but don't do anything special
+ // with it. It doesn't count as the "first file".
+ if (foreignST) {
+ // What? Multiple foreign symbol tables? Just chuck it
+ // and retain the last one found.
+ delete foreignST;
+ }
+ foreignST = mbr;
+ At += mbr->getSize();
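+      // Members are 2-byte aligned; skip the pad byte if present.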
+ if ((intptr_t(At) & 1) == 1)
+ At++;
+ } else if (mbr->isStringTable()) {
+ // Simply suck the entire string table into a string
+ // variable. This will be used to get the names of the
+ // members that use the "/ddd" format for their names
+ // (SVR4 style long names).
+ strtab.assign(At, mbr->getSize());
+ At += mbr->getSize();
+ if ((intptr_t(At) & 1) == 1)
+ At++;
+ delete mbr;
+ } else if (mbr->isLLVMSymbolTable()) {
+ // This is the LLVM symbol table for the archive. If we've seen it
+      // already, it's an error. Otherwise, parse the symbol table and move on.
+ if (seenSymbolTable) {
+ if (error)
+ *error = "invalid archive: multiple symbol tables";
+ return false;
+ }
+ if (!parseSymbolTable(mbr->getData(), mbr->getSize(), error))
+ return false;
+ seenSymbolTable = true;
+ At += mbr->getSize();
+ if ((intptr_t(At) & 1) == 1)
+ At++;
+ delete mbr; // We don't need this member in the list of members.
+ } else {
+      // This is just a regular file. If it's the first one, save its offset.
+ // Otherwise just push it on the list and move on to the next file.
+ if (!foundFirstFile) {
+ firstFileOffset = Save - base;
+ foundFirstFile = true;
+ }
+ members.push_back(mbr);
+ At += mbr->getSize();
+ if ((intptr_t(At) & 1) == 1)
+ At++;
+ }
+ }
+ return true;
+}
+
+// Open and completely load the archive file.
+Archive*
+Archive::OpenAndLoad(const sys::Path& file, std::string* ErrorMessage)
+{
+ std::auto_ptr<Archive> result ( new Archive(file));
+ if (result->mapToMemory(ErrorMessage))
+ return 0;
+ if (!result->loadArchive(ErrorMessage))
+ return 0;
+ return result.release();
+}
+
+// Get all the bitcode modules from the archive
+bool
+Archive::getAllModules(std::vector<Module*>& Modules, std::string* ErrMessage) {
+
+ for (iterator I=begin(), E=end(); I != E; ++I) {
+ if (I->isBitcode()) {
+ std::string FullMemberName = archPath.toString() +
+ "(" + I->getPath().toString() + ")";
+ MemoryBuffer *Buffer =
+ MemoryBuffer::getNewMemBuffer(I->getSize(), FullMemberName.c_str());
+ memcpy((char*)Buffer->getBufferStart(), I->getData(), I->getSize());
+
+ Module *M = ParseBitcodeFile(Buffer, ErrMessage);
+ delete Buffer;
+ if (!M)
+ return true;
+
+ Modules.push_back(M);
+ }
+ }
+ return false;
+}
+
+// Load just the symbol table from the archive file
+bool
+Archive::loadSymbolTable(std::string* ErrorMsg) {
+
+ // Set up parsing
+ members.clear();
+ symTab.clear();
+ const char *At = base;
+ const char *End = mapfile->getBufferEnd();
+
+ // Make sure we're dealing with an archive
+ if (!checkSignature(ErrorMsg))
+ return false;
+
+ At += 8; // Skip signature
+
+ // Parse the first file member header
+ const char* FirstFile = At;
+ ArchiveMember* mbr = parseMemberHeader(At, End, ErrorMsg);
+ if (!mbr)
+ return false;
+
+ if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
+ // Skip the foreign symbol table, we don't do anything with it
+ At += mbr->getSize();
+ if ((intptr_t(At) & 1) == 1)
+ At++;
+ delete mbr;
+
+ // Read the next one
+ FirstFile = At;
+ mbr = parseMemberHeader(At, End, ErrorMsg);
+    if (!mbr)
+      return false;
+ }
+
+ if (mbr->isStringTable()) {
+ // Process the string table entry
+ strtab.assign((const char*)mbr->getData(), mbr->getSize());
+ At += mbr->getSize();
+ if ((intptr_t(At) & 1) == 1)
+ At++;
+ delete mbr;
+ // Get the next one
+ FirstFile = At;
+ mbr = parseMemberHeader(At, End, ErrorMsg);
+    if (!mbr)
+      return false;
+ }
+
+  // See if it's the symbol table
+ if (mbr->isLLVMSymbolTable()) {
+ if (!parseSymbolTable(mbr->getData(), mbr->getSize(), ErrorMsg)) {
+ delete mbr;
+ return false;
+ }
+
+ At += mbr->getSize();
+ if ((intptr_t(At) & 1) == 1)
+ At++;
+ delete mbr;
+ // Can't be any more symtab headers so just advance
+ FirstFile = At;
+ } else {
+ // There's no symbol table in the file. We have to rebuild it from scratch
+ // because the intent of this method is to get the symbol table loaded so
+ // it can be searched efficiently.
+ // Add the member to the members list
+ members.push_back(mbr);
+ }
+
+ firstFileOffset = FirstFile - base;
+ return true;
+}
+
+// Open the archive and load just the symbol tables
+Archive*
+Archive::OpenAndLoadSymbols(const sys::Path& file, std::string* ErrorMessage) {
+ std::auto_ptr<Archive> result ( new Archive(file) );
+ if (result->mapToMemory(ErrorMessage))
+ return 0;
+ if (!result->loadSymbolTable(ErrorMessage))
+ return 0;
+ return result.release();
+}
+
+// Look up one symbol in the symbol table and return a ModuleProvider for the
+// module that defines that symbol.
+ModuleProvider*
+Archive::findModuleDefiningSymbol(const std::string& symbol,
+ std::string* ErrMsg) {
+ SymTabType::iterator SI = symTab.find(symbol);
+ if (SI == symTab.end())
+ return 0;
+
+  // The symbol table was constructed as if the members were written without a
+  // symbol table header. The recorded offsets could not be fixed up after the
+  // fact: adjusting an offset can change its VBR-encoded length, which would
+  // in turn change the size of the symbol table itself. So we compensate here
+  // by adding the offset of the first "real" member, which accounts for the
+  // symbol table member and its header.
+ unsigned fileOffset =
+ SI->second + // offset in symbol-table-less file
+ firstFileOffset; // add offset to first "real" file in archive
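+  // (For example, with a firstFileOffset of 76, a symbol recorded at offset
+  // 0 resolves to the member header at base + 76.)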
+
+ // See if the module is already loaded
+ ModuleMap::iterator MI = modules.find(fileOffset);
+ if (MI != modules.end())
+ return MI->second.first;
+
+ // Module hasn't been loaded yet, we need to load it
+ const char* modptr = base + fileOffset;
+ ArchiveMember* mbr = parseMemberHeader(modptr, mapfile->getBufferEnd(),
+ ErrMsg);
+ if (!mbr)
+ return 0;
+
+ // Now, load the bitcode module to get the ModuleProvider
+ std::string FullMemberName = archPath.toString() + "(" +
+ mbr->getPath().toString() + ")";
+  MemoryBuffer *Buffer = MemoryBuffer::getNewMemBuffer(mbr->getSize(),
+                                                       FullMemberName.c_str());
+ memcpy((char*)Buffer->getBufferStart(), mbr->getData(), mbr->getSize());
+
+ ModuleProvider *mp = getBitcodeModuleProvider(Buffer, ErrMsg);
+ if (!mp)
+ return 0;
+
+ modules.insert(std::make_pair(fileOffset, std::make_pair(mp, mbr)));
+
+ return mp;
+}
+
+// Look up multiple symbols in the symbol table and return a set of
+// ModuleProviders that define those symbols.
+bool
+Archive::findModulesDefiningSymbols(std::set<std::string>& symbols,
+ std::set<ModuleProvider*>& result,
+ std::string* error) {
+ if (!mapfile || !base) {
+ if (error)
+ *error = "Empty archive invalid for finding modules defining symbols";
+ return false;
+ }
+
+ if (symTab.empty()) {
+    // We don't have a symbol table, so we must build it now. We also populate
+    // the modules table as we go, to ensure that members aren't loaded twice
+    // when findModuleDefiningSymbol is called below.
+
+ // Get a pointer to the first file
+ const char* At = base + firstFileOffset;
+ const char* End = mapfile->getBufferEnd();
+
+ while ( At < End) {
+ // Compute the offset to be put in the symbol table
+ unsigned offset = At - base - firstFileOffset;
+
+ // Parse the file's header
+ ArchiveMember* mbr = parseMemberHeader(At, End, error);
+ if (!mbr)
+ return false;
+
+      // If it's a bitcode member, index the symbols it defines
+ if (mbr->isBitcode()) {
+ // Get the symbols
+ std::vector<std::string> symbols;
+ std::string FullMemberName = archPath.toString() + "(" +
+ mbr->getPath().toString() + ")";
+ ModuleProvider* MP =
+ GetBitcodeSymbols((const unsigned char*)At, mbr->getSize(),
+ FullMemberName, symbols, error);
+
+ if (MP) {
+ // Insert the module's symbols into the symbol table
+ for (std::vector<std::string>::iterator I = symbols.begin(),
+ E=symbols.end(); I != E; ++I ) {
+ symTab.insert(std::make_pair(*I, offset));
+ }
+ // Insert the ModuleProvider and the ArchiveMember into the table of
+ // modules.
+ modules.insert(std::make_pair(offset, std::make_pair(MP, mbr)));
+ } else {
+ if (error)
+ *error = "Can't parse bitcode member: " +
+ mbr->getPath().toString() + ": " + *error;
+ delete mbr;
+ return false;
+ }
+ }
+
+ // Go to the next file location
+ At += mbr->getSize();
+ if ((intptr_t(At) & 1) == 1)
+ At++;
+ }
+ }
+
+ // At this point we have a valid symbol table (one way or another) so we
+ // just use it to quickly find the symbols requested.
+
+ for (std::set<std::string>::iterator I=symbols.begin(),
+ E=symbols.end(); I != E;) {
+ // See if this symbol exists
+ ModuleProvider* mp = findModuleDefiningSymbol(*I,error);
+ if (mp) {
+      // The symbol exists; insert the ModuleProvider into our result.
+      // Duplicates will be ignored.
+ result.insert(mp);
+
+      // Remove the symbol now that it's been resolved, being careful to
+ // post-increment the iterator.
+ symbols.erase(I++);
+ } else {
+ ++I;
+ }
+ }
+ return true;
+}
+
+bool Archive::isBitcodeArchive() {
+ // Make sure the symTab has been loaded. In most cases this should have been
+ // done when the archive was constructed, but still, this is just in case.
+ if (symTab.empty())
+ if (!loadSymbolTable(0))
+ return false;
+
+  // Now that we know it's been loaded, return true if it has a size.
+ if (symTab.size()) return true;
+
+ // We still can't be sure it isn't a bitcode archive
+ if (!loadArchive(0))
+ return false;
+
+ std::vector<Module *> Modules;
+ std::string ErrorMessage;
+
+ // Scan the archive, trying to load a bitcode member. We only load one to
+ // see if this works.
+ for (iterator I = begin(), E = end(); I != E; ++I) {
+ if (!I->isBitcode())
+ continue;
+
+ std::string FullMemberName =
+ archPath.toString() + "(" + I->getPath().toString() + ")";
+
+ MemoryBuffer *Buffer =
+ MemoryBuffer::getNewMemBuffer(I->getSize(), FullMemberName.c_str());
+ memcpy((char*)Buffer->getBufferStart(), I->getData(), I->getSize());
+ Module *M = ParseBitcodeFile(Buffer);
+ delete Buffer;
+ if (!M)
+ return false; // Couldn't parse bitcode, not a bitcode archive.
+ delete M;
+ return true;
+ }
+
+ return false;
+}
diff --git a/lib/Archive/ArchiveWriter.cpp b/lib/Archive/ArchiveWriter.cpp
new file mode 100644
index 0000000..336a2bd
--- /dev/null
+++ b/lib/Archive/ArchiveWriter.cpp
@@ -0,0 +1,482 @@
+//===-- ArchiveWriter.cpp - Write LLVM archive files ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Builds up an LLVM archive file (.a) containing LLVM bitcode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ArchiveInternals.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/System/Signals.h"
+#include "llvm/System/Process.h"
+#include "llvm/ModuleProvider.h"
+#include <fstream>
+#include <ostream>
+#include <iomanip>
+using namespace llvm;
+
+// Write an integer using variable bit rate encoding. This saves a few bytes
+// per entry in the symbol table.
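+// For example, 300 (0x12C) is written as the two bytes 0xAC 0x02: the low
+// seven bits with the continuation bit set, then the remaining bits.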
+static inline void writeInteger(unsigned num, std::ofstream& ARFile) {
+ while (1) {
+ if (num < 0x80) { // done?
+ ARFile << (unsigned char)num;
+ return;
+ }
+
+ // Nope, we are bigger than a character, output the next 7 bits and set the
+ // high bit to say that there is more coming...
+ ARFile << (unsigned char)(0x80 | ((unsigned char)num & 0x7F));
+ num >>= 7; // Shift out 7 bits now...
+ }
+}
+
+// Compute how many bytes are taken by a given VBR encoded value. This is needed
+// to pre-compute the size of the symbol table.
+static inline unsigned numVbrBytes(unsigned num) {
+
+  // Note that the following nested ifs are somewhat equivalent to a binary
+  // search. We split the range in half by comparing against 2^14 first, so
+  // most reasonable values take two comparisons rather than one for small
+  // values and four for large ones. We expect to see file offsets in the
+  // 2^10 to 2^24 range and symbol lengths in the 2^0 to 2^8 range, so this
+  // approach is reasonable.
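+  // For example, numVbrBytes(300) == 2 and numVbrBytes(1<<20) == 3.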
+ if (num < 1<<14) {
+ if (num < 1<<7)
+ return 1;
+ else
+ return 2;
+ }
+ if (num < 1<<21)
+ return 3;
+
+ if (num < 1<<28)
+ return 4;
+ return 5; // anything >= 2^28 takes 5 bytes
+}
+
+// Create an empty archive.
+Archive*
+Archive::CreateEmpty(const sys::Path& FilePath ) {
+ Archive* result = new Archive(FilePath);
+ return result;
+}
+
+// Fill the ArchiveMemberHeader with the information from a member. If
+// TruncateNames is true, names are flattened to 15 chars or less. The sz field
+// is provided here instead of coming from the mbr because the member might be
+// stored compressed and the compressed size is not the ArchiveMember's size.
+// Furthermore compressed files have negative size fields to identify them as
+// compressed.
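+// For example, a compressed member whose payload occupies 100 bytes arrives
+// here with sz == -100, and its size field is written as "-100".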
+bool
+Archive::fillHeader(const ArchiveMember &mbr, ArchiveMemberHeader& hdr,
+ int sz, bool TruncateNames) const {
+
+ // Set the permissions mode, uid and gid
+ hdr.init();
+ char buffer[32];
+ sprintf(buffer, "%-8o", mbr.getMode());
+ memcpy(hdr.mode,buffer,8);
+ sprintf(buffer, "%-6u", mbr.getUser());
+ memcpy(hdr.uid,buffer,6);
+ sprintf(buffer, "%-6u", mbr.getGroup());
+ memcpy(hdr.gid,buffer,6);
+
+ // Set the last modification date
+ uint64_t secondsSinceEpoch = mbr.getModTime().toEpochTime();
+ sprintf(buffer,"%-12u", unsigned(secondsSinceEpoch));
+ memcpy(hdr.date,buffer,12);
+
+ // Get rid of trailing blanks in the name
+ std::string mbrPath = mbr.getPath().toString();
+ size_t mbrLen = mbrPath.length();
+ while (mbrLen > 0 && mbrPath[mbrLen-1] == ' ') {
+ mbrPath.erase(mbrLen-1,1);
+ mbrLen--;
+ }
+
+ // Set the name field in one of its various flavors.
+ bool writeLongName = false;
+ if (mbr.isStringTable()) {
+ memcpy(hdr.name,ARFILE_STRTAB_NAME,16);
+ } else if (mbr.isSVR4SymbolTable()) {
+ memcpy(hdr.name,ARFILE_SVR4_SYMTAB_NAME,16);
+ } else if (mbr.isBSD4SymbolTable()) {
+ memcpy(hdr.name,ARFILE_BSD4_SYMTAB_NAME,16);
+ } else if (mbr.isLLVMSymbolTable()) {
+ memcpy(hdr.name,ARFILE_LLVM_SYMTAB_NAME,16);
+ } else if (TruncateNames) {
+ const char* nm = mbrPath.c_str();
+ unsigned len = mbrPath.length();
+ size_t slashpos = mbrPath.rfind('/');
+ if (slashpos != std::string::npos) {
+ nm += slashpos + 1;
+      len -= slashpos + 1;
+ }
+ if (len > 15)
+ len = 15;
+ memcpy(hdr.name,nm,len);
+ hdr.name[len] = '/';
+ } else if (mbrPath.length() < 16 && mbrPath.find('/') == std::string::npos) {
+ memcpy(hdr.name,mbrPath.c_str(),mbrPath.length());
+ hdr.name[mbrPath.length()] = '/';
+ } else {
+ std::string nm = "#1/";
+ nm += utostr(mbrPath.length());
+ memcpy(hdr.name,nm.data(),nm.length());
+ if (sz < 0)
+ sz -= mbrPath.length();
+ else
+ sz += mbrPath.length();
+ writeLongName = true;
+ }
+
+ // Set the size field
+ if (sz < 0) {
+ buffer[0] = '-';
+ sprintf(&buffer[1],"%-9u",(unsigned)-sz);
+ } else {
+ sprintf(buffer, "%-10u", (unsigned)sz);
+ }
+ memcpy(hdr.size,buffer,10);
+
+ return writeLongName;
+}
+
+// Insert a file into the archive before some other member. This also takes care
+// of extracting the necessary flags and information from the file.
+bool
+Archive::addFileBefore(const sys::Path& filePath, iterator where,
+ std::string* ErrMsg) {
+ if (!filePath.exists()) {
+ if (ErrMsg)
+ *ErrMsg = "Can not add a non-existent file to archive";
+ return true;
+ }
+
+ ArchiveMember* mbr = new ArchiveMember(this);
+
+ mbr->data = 0;
+ mbr->path = filePath;
+ const sys::FileStatus *FSInfo = mbr->path.getFileStatus(false, ErrMsg);
+ if (FSInfo)
+ mbr->info = *FSInfo;
+ else
+ return true;
+
+ unsigned flags = 0;
+ bool hasSlash = filePath.toString().find('/') != std::string::npos;
+ if (hasSlash)
+ flags |= ArchiveMember::HasPathFlag;
+ if (hasSlash || filePath.toString().length() > 15)
+ flags |= ArchiveMember::HasLongFilenameFlag;
+ std::string magic;
+ mbr->path.getMagicNumber(magic,4);
+ switch (sys::IdentifyFileType(magic.c_str(),4)) {
+ case sys::Bitcode_FileType:
+ flags |= ArchiveMember::BitcodeFlag;
+ break;
+ default:
+ break;
+ }
+ mbr->flags = flags;
+ members.insert(where,mbr);
+ return false;
+}
+
+// Write one member out to the file.
+bool
+Archive::writeMember(
+ const ArchiveMember& member,
+ std::ofstream& ARFile,
+ bool CreateSymbolTable,
+ bool TruncateNames,
+ bool ShouldCompress,
+ std::string* ErrMsg
+) {
+
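+  // Symbol table offsets are recorded relative to the end of the 8-byte
+  // archive magic, so compute the member's position with the magic excluded.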
+ unsigned filepos = ARFile.tellp();
+ filepos -= 8;
+
+ // Get the data and its size either from the
+ // member's in-memory data or directly from the file.
+ size_t fSize = member.getSize();
+ const char *data = (const char*)member.getData();
+ MemoryBuffer *mFile = 0;
+ if (!data) {
+ mFile = MemoryBuffer::getFile(member.getPath().c_str(), ErrMsg);
+ if (mFile == 0)
+ return true;
+ data = mFile->getBufferStart();
+ fSize = mFile->getBufferSize();
+ }
+
+ // Now that we have the data in memory, update the
+  // symbol table if it's a bitcode file.
+ if (CreateSymbolTable && member.isBitcode()) {
+ std::vector<std::string> symbols;
+ std::string FullMemberName = archPath.toString() + "(" +
+ member.getPath().toString()
+ + ")";
+ ModuleProvider* MP =
+ GetBitcodeSymbols((const unsigned char*)data,fSize,
+ FullMemberName, symbols, ErrMsg);
+
+ // If the bitcode parsed successfully
+  if (MP) {
+ for (std::vector<std::string>::iterator SI = symbols.begin(),
+ SE = symbols.end(); SI != SE; ++SI) {
+
+ std::pair<SymTabType::iterator,bool> Res =
+ symTab.insert(std::make_pair(*SI,filepos));
+
+ if (Res.second) {
+ symTabSize += SI->length() +
+ numVbrBytes(SI->length()) +
+ numVbrBytes(filepos);
+ }
+ }
+ // We don't need this module any more.
+ delete MP;
+ } else {
+ delete mFile;
+ if (ErrMsg)
+ *ErrMsg = "Can't parse bitcode member: " + member.getPath().toString()
+ + ": " + *ErrMsg;
+ return true;
+ }
+ }
+
+ int hdrSize = fSize;
+
+ // Compute the fields of the header
+ ArchiveMemberHeader Hdr;
+ bool writeLongName = fillHeader(member,Hdr,hdrSize,TruncateNames);
+
+ // Write header to archive file
+ ARFile.write((char*)&Hdr, sizeof(Hdr));
+
+  // Write the long filename if this member uses one
+ if (writeLongName) {
+ ARFile.write(member.getPath().toString().data(),
+ member.getPath().toString().length());
+ }
+
+ // Write the (possibly compressed) member's content to the file.
+ ARFile.write(data,fSize);
+
+ // Make sure the member is an even length
+ if ((ARFile.tellp() & 1) == 1)
+ ARFile << ARFILE_PAD;
+
+ // Close the mapped file if it was opened
+ delete mFile;
+ return false;
+}
+
+// Write out the LLVM symbol table as an archive member to the file.
+void
+Archive::writeSymbolTable(std::ofstream& ARFile) {
+
+ // Construct the symbol table's header
+ ArchiveMemberHeader Hdr;
+ Hdr.init();
+ memcpy(Hdr.name,ARFILE_LLVM_SYMTAB_NAME,16);
+ uint64_t secondsSinceEpoch = sys::TimeValue::now().toEpochTime();
+ char buffer[32];
+ sprintf(buffer, "%-8o", 0644);
+ memcpy(Hdr.mode,buffer,8);
+ sprintf(buffer, "%-6u", sys::Process::GetCurrentUserId());
+ memcpy(Hdr.uid,buffer,6);
+ sprintf(buffer, "%-6u", sys::Process::GetCurrentGroupId());
+ memcpy(Hdr.gid,buffer,6);
+ sprintf(buffer,"%-12u", unsigned(secondsSinceEpoch));
+ memcpy(Hdr.date,buffer,12);
+ sprintf(buffer,"%-10u",symTabSize);
+ memcpy(Hdr.size,buffer,10);
+
+ // Write the header
+ ARFile.write((char*)&Hdr, sizeof(Hdr));
+
+#ifndef NDEBUG
+  // Save the starting position of the symbol table's data content.
+ unsigned startpos = ARFile.tellp();
+#endif
+
+ // Write out the symbols sequentially
+ for ( Archive::SymTabType::iterator I = symTab.begin(), E = symTab.end();
+ I != E; ++I)
+ {
+ // Write out the file index
+ writeInteger(I->second, ARFile);
+ // Write out the length of the symbol
+ writeInteger(I->first.length(), ARFile);
+ // Write out the symbol
+ ARFile.write(I->first.data(), I->first.length());
+ }
+
+#ifndef NDEBUG
+ // Now that we're done with the symbol table, get the ending file position
+ unsigned endpos = ARFile.tellp();
+#endif
+
+ // Make sure that the amount we wrote is what we pre-computed. This is
+ // critical for file integrity purposes.
+ assert(endpos - startpos == symTabSize && "Invalid symTabSize computation");
+
+ // Make sure the symbol table is even sized
+ if (symTabSize % 2 != 0 )
+ ARFile << ARFILE_PAD;
+}
+
+// Write the entire archive to the file specified when the archive was created.
+// This writes to a temporary file first. Options are for creating a symbol
+// table, flattening the file names (no directories, 15 chars max) and
+// compressing each archive member.
+bool
+Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames, bool Compress,
+ std::string* ErrMsg)
+{
+  // Make sure the caller hasn't opened the archive without loading it and is
+  // now trying to write it, which would wipe out the existing file.
+ if (members.empty() && mapfile && mapfile->getBufferSize() > 8) {
+ if (ErrMsg)
+ *ErrMsg = "Can't write an archive not opened for writing";
+ return true;
+ }
+
+ // Create a temporary file to store the archive in
+ sys::Path TmpArchive = archPath;
+ if (TmpArchive.createTemporaryFileOnDisk(ErrMsg))
+ return true;
+
+ // Make sure the temporary gets removed if we crash
+ sys::RemoveFileOnSignal(TmpArchive);
+
+ // Create archive file for output.
+ std::ios::openmode io_mode = std::ios::out | std::ios::trunc |
+ std::ios::binary;
+ std::ofstream ArchiveFile(TmpArchive.c_str(), io_mode);
+
+ // Check for errors opening or creating archive file.
+ if (!ArchiveFile.is_open() || ArchiveFile.bad()) {
+ if (TmpArchive.exists())
+ TmpArchive.eraseFromDisk();
+ if (ErrMsg)
+ *ErrMsg = "Error opening archive file: " + archPath.toString();
+ return true;
+ }
+
+ // If we're creating a symbol table, reset it now
+ if (CreateSymbolTable) {
+ symTabSize = 0;
+ symTab.clear();
+ }
+
+ // Write magic string to archive.
+ ArchiveFile << ARFILE_MAGIC;
+
+ // Loop over all member files, and write them out. Note that this also
+ // builds the symbol table, symTab.
+ for (MembersList::iterator I = begin(), E = end(); I != E; ++I) {
+ if (writeMember(*I, ArchiveFile, CreateSymbolTable,
+ TruncateNames, Compress, ErrMsg)) {
+ if (TmpArchive.exists())
+ TmpArchive.eraseFromDisk();
+ ArchiveFile.close();
+ return true;
+ }
+ }
+
+ // Close archive file.
+ ArchiveFile.close();
+
+ // Write the symbol table
+ if (CreateSymbolTable) {
+ // At this point we have written a file that is a legal archive but it
+ // doesn't have a symbol table in it. To aid in faster reading and to
+ // ensure compatibility with other archivers we need to put the symbol
+ // table first in the file. Unfortunately, this means mapping the file
+ // we just wrote back in and copying it to the destination file.
+ sys::Path FinalFilePath = archPath;
+
+ // Map in the archive we just wrote.
+ {
+ OwningPtr<MemoryBuffer> arch(MemoryBuffer::getFile(TmpArchive.c_str()));
+ if (arch == 0) return true;
+ const char* base = arch->getBufferStart();
+
+ // Open another temporary file in order to avoid invalidating the
+ // mmapped data
+ if (FinalFilePath.createTemporaryFileOnDisk(ErrMsg))
+ return true;
+ sys::RemoveFileOnSignal(FinalFilePath);
+
+ std::ofstream FinalFile(FinalFilePath.c_str(), io_mode);
+ if (!FinalFile.is_open() || FinalFile.bad()) {
+ if (TmpArchive.exists())
+ TmpArchive.eraseFromDisk();
+ if (ErrMsg)
+ *ErrMsg = "Error opening archive file: " + FinalFilePath.toString();
+ return true;
+ }
+
+ // Write the file magic number
+ FinalFile << ARFILE_MAGIC;
+
+ // If there is a foreign symbol table, put it into the file now. Most
+ // ar(1) implementations require the symbol table to be first but llvm-ar
+ // can deal with it being after a foreign symbol table. This ensures
+ // compatibility with other ar(1) implementations as well as allowing the
+ // archive to store both native .o and LLVM .bc files, both indexed.
+ if (foreignST) {
+ if (writeMember(*foreignST, FinalFile, false, false, false, ErrMsg)) {
+ FinalFile.close();
+ if (TmpArchive.exists())
+ TmpArchive.eraseFromDisk();
+ return true;
+ }
+ }
+
+ // Put out the LLVM symbol table now.
+ writeSymbolTable(FinalFile);
+
+ // Copy the temporary file contents being sure to skip the file's magic
+ // number.
+ FinalFile.write(base + sizeof(ARFILE_MAGIC)-1,
+ arch->getBufferSize()-sizeof(ARFILE_MAGIC)+1);
+
+ // Close up shop
+ FinalFile.close();
+ } // free arch.
+
+ // Move the final file over top of TmpArchive
+ if (FinalFilePath.renamePathOnDisk(TmpArchive, ErrMsg))
+ return true;
+ }
+
+ // Before we replace the actual archive, we need to forget all the
+ // members, since they point to data in that old archive. We need to do
+ // this because we cannot replace an open file on Windows.
+ cleanUpMemory();
+
+ if (TmpArchive.renamePathOnDisk(archPath, ErrMsg))
+ return true;
+
+ // Set correct read and write permissions after temporary file is moved
+ // to final destination path.
+ if (archPath.makeReadableOnDisk(ErrMsg))
+ return true;
+ if (archPath.makeWriteableOnDisk(ErrMsg))
+ return true;
+
+ return false;
+}
diff --git a/lib/Archive/CMakeLists.txt b/lib/Archive/CMakeLists.txt
new file mode 100644
index 0000000..27698cb
--- /dev/null
+++ b/lib/Archive/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_llvm_library(LLVMArchive
+ Archive.cpp
+ ArchiveReader.cpp
+ ArchiveWriter.cpp
+  )
\ No newline at end of file
diff --git a/lib/Archive/Makefile b/lib/Archive/Makefile
new file mode 100644
index 0000000..da97804
--- /dev/null
+++ b/lib/Archive/Makefile
@@ -0,0 +1,17 @@
+##===- lib/Archive/Makefile --------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMArchive
+
+# We only want an archive so only those modules actually used by a tool are
+# included.
+BUILD_ARCHIVE := 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/AsmParser/CMakeLists.txt b/lib/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..985ebe2
--- /dev/null
+++ b/lib/AsmParser/CMakeLists.txt
@@ -0,0 +1,6 @@
+# AsmParser
+add_llvm_library(LLVMAsmParser
+ LLLexer.cpp
+ LLParser.cpp
+ Parser.cpp
+ )
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
new file mode 100644
index 0000000..f2e6890
--- /dev/null
+++ b/lib/AsmParser/LLLexer.cpp
@@ -0,0 +1,835 @@
+//===- LLLexer.cpp - Lexer for .ll Files ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement the Lexer for .ll files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LLLexer.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instruction.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Assembly/Parser.h"
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+bool LLLexer::Error(LocTy ErrorLoc, const std::string &Msg) const {
+ // Scan backward to find the start of the line.
+ const char *LineStart = ErrorLoc;
+ while (LineStart != CurBuf->getBufferStart() &&
+ LineStart[-1] != '\n' && LineStart[-1] != '\r')
+ --LineStart;
+ // Get the end of the line.
+ const char *LineEnd = ErrorLoc;
+ while (LineEnd != CurBuf->getBufferEnd() &&
+ LineEnd[0] != '\n' && LineEnd[0] != '\r')
+ ++LineEnd;
+
+ unsigned LineNo = 1;
+ for (const char *FP = CurBuf->getBufferStart(); FP != ErrorLoc; ++FP)
+ if (*FP == '\n') ++LineNo;
+
+ std::string LineContents(LineStart, LineEnd);
+ ErrorInfo.setError(Msg, LineNo, ErrorLoc-LineStart, LineContents);
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions.
+//===----------------------------------------------------------------------===//
+
+// atoull - Convert an ASCII string of decimal digits into the unsigned long
+// long representation. This does not have to do input error checking,
+// because we know that the input will be matched by a suitable regex.
+//
+uint64_t LLLexer::atoull(const char *Buffer, const char *End) {
+ uint64_t Result = 0;
+ for (; Buffer != End; Buffer++) {
+ uint64_t OldRes = Result;
+ Result *= 10;
+ Result += *Buffer-'0';
+ if (Result < OldRes) { // Uh, oh, overflow detected!!!
+ Error("constant bigger than 64 bits detected!");
+ return 0;
+ }
+ }
+ return Result;
+}
+
+uint64_t LLLexer::HexIntToVal(const char *Buffer, const char *End) {
+ uint64_t Result = 0;
+ for (; Buffer != End; ++Buffer) {
+ uint64_t OldRes = Result;
+ Result *= 16;
+ char C = *Buffer;
+ if (C >= '0' && C <= '9')
+ Result += C-'0';
+ else if (C >= 'A' && C <= 'F')
+ Result += C-'A'+10;
+ else if (C >= 'a' && C <= 'f')
+ Result += C-'a'+10;
+
+ if (Result < OldRes) { // Uh, oh, overflow detected!!!
+ Error("constant bigger than 64 bits detected!");
+ return 0;
+ }
+ }
+ return Result;
+}
+
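+/// HexToIntPair - translate up to 32 hexits into { low64, high64 }: the
+/// first 16 hexits fill Pair[0] and any remaining hexits fill Pair[1].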
+void LLLexer::HexToIntPair(const char *Buffer, const char *End,
+ uint64_t Pair[2]) {
+ Pair[0] = 0;
+ for (int i=0; i<16; i++, Buffer++) {
+ assert(Buffer != End);
+ Pair[0] *= 16;
+ char C = *Buffer;
+ if (C >= '0' && C <= '9')
+ Pair[0] += C-'0';
+ else if (C >= 'A' && C <= 'F')
+ Pair[0] += C-'A'+10;
+ else if (C >= 'a' && C <= 'f')
+ Pair[0] += C-'a'+10;
+ }
+ Pair[1] = 0;
+ for (int i=0; i<16 && Buffer != End; i++, Buffer++) {
+ Pair[1] *= 16;
+ char C = *Buffer;
+ if (C >= '0' && C <= '9')
+ Pair[1] += C-'0';
+ else if (C >= 'A' && C <= 'F')
+ Pair[1] += C-'A'+10;
+ else if (C >= 'a' && C <= 'f')
+ Pair[1] += C-'a'+10;
+ }
+ if (Buffer != End)
+ Error("constant bigger than 128 bits detected!");
+}
+
+/// FP80HexToIntPair - translate an 80 bit FP80 number (20 hexits) into
+/// { low64, high16 } as usual for an APInt.
+void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End,
+ uint64_t Pair[2]) {
+ Pair[1] = 0;
+ for (int i=0; i<4 && Buffer != End; i++, Buffer++) {
+ assert(Buffer != End);
+ Pair[1] *= 16;
+ char C = *Buffer;
+ if (C >= '0' && C <= '9')
+ Pair[1] += C-'0';
+ else if (C >= 'A' && C <= 'F')
+ Pair[1] += C-'A'+10;
+ else if (C >= 'a' && C <= 'f')
+ Pair[1] += C-'a'+10;
+ }
+ Pair[0] = 0;
+ for (int i=0; i<16; i++, Buffer++) {
+ Pair[0] *= 16;
+ char C = *Buffer;
+ if (C >= '0' && C <= '9')
+ Pair[0] += C-'0';
+ else if (C >= 'A' && C <= 'F')
+ Pair[0] += C-'A'+10;
+ else if (C >= 'a' && C <= 'f')
+ Pair[0] += C-'a'+10;
+ }
+ if (Buffer != End)
+ Error("constant bigger than 128 bits detected!");
+}
+
+// UnEscapeLexed - Run through the specified buffer and change \xx codes to the
+// appropriate character.
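+// For example, the three characters "\5A" become the single character 'Z',
+// and "\\" collapses to one backslash.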
+static void UnEscapeLexed(std::string &Str) {
+ if (Str.empty()) return;
+
+ char *Buffer = &Str[0], *EndBuffer = Buffer+Str.size();
+ char *BOut = Buffer;
+ for (char *BIn = Buffer; BIn != EndBuffer; ) {
+ if (BIn[0] == '\\') {
+ if (BIn < EndBuffer-1 && BIn[1] == '\\') {
+ *BOut++ = '\\'; // Two \ becomes one
+ BIn += 2;
+ } else if (BIn < EndBuffer-2 && isxdigit(BIn[1]) && isxdigit(BIn[2])) {
+ char Tmp = BIn[3]; BIn[3] = 0; // Terminate string
+ *BOut = (char)strtol(BIn+1, 0, 16); // Convert to number
+ BIn[3] = Tmp; // Restore character
+ BIn += 3; // Skip over handled chars
+ ++BOut;
+ } else {
+ *BOut++ = *BIn++;
+ }
+ } else {
+ *BOut++ = *BIn++;
+ }
+ }
+ Str.resize(BOut-Buffer);
+}
+
+/// isLabelChar - Return true for [-a-zA-Z$._0-9].
+static bool isLabelChar(char C) {
+ return isalnum(C) || C == '-' || C == '$' || C == '.' || C == '_';
+}
+
+
+/// isLabelTail - Return true if this pointer points to a valid end of a label.
+static const char *isLabelTail(const char *CurPtr) {
+ while (1) {
+ if (CurPtr[0] == ':') return CurPtr+1;
+ if (!isLabelChar(CurPtr[0])) return 0;
+ ++CurPtr;
+ }
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Lexer definition.
+//===----------------------------------------------------------------------===//
+
+LLLexer::LLLexer(MemoryBuffer *StartBuf, ParseError &Err)
+ : CurBuf(StartBuf), ErrorInfo(Err), APFloatVal(0.0) {
+ CurPtr = CurBuf->getBufferStart();
+}
+
+std::string LLLexer::getFilename() const {
+ return CurBuf->getBufferIdentifier();
+}
+
+int LLLexer::getNextChar() {
+ char CurChar = *CurPtr++;
+ switch (CurChar) {
+ default: return (unsigned char)CurChar;
+ case 0:
+ // A nul character in the stream is either the end of the current buffer or
+ // a random nul in the file. Disambiguate that here.
+ if (CurPtr-1 != CurBuf->getBufferEnd())
+ return 0; // Just whitespace.
+
+ // Otherwise, return end of file.
+ --CurPtr; // Another call to lex will return EOF again.
+ return EOF;
+ }
+}
+
+
+lltok::Kind LLLexer::LexToken() {
+ TokStart = CurPtr;
+
+ int CurChar = getNextChar();
+ switch (CurChar) {
+ default:
+ // Handle letters: [a-zA-Z_]
+ if (isalpha(CurChar) || CurChar == '_')
+ return LexIdentifier();
+
+ return lltok::Error;
+ case EOF: return lltok::Eof;
+ case 0:
+ case ' ':
+ case '\t':
+ case '\n':
+ case '\r':
+ // Ignore whitespace.
+ return LexToken();
+ case '+': return LexPositive();
+ case '@': return LexAt();
+ case '%': return LexPercent();
+ case '"': return LexQuote();
+ case '.':
+ if (const char *Ptr = isLabelTail(CurPtr)) {
+ CurPtr = Ptr;
+ StrVal.assign(TokStart, CurPtr-1);
+ return lltok::LabelStr;
+ }
+ if (CurPtr[0] == '.' && CurPtr[1] == '.') {
+ CurPtr += 2;
+ return lltok::dotdotdot;
+ }
+ return lltok::Error;
+ case '$':
+ if (const char *Ptr = isLabelTail(CurPtr)) {
+ CurPtr = Ptr;
+ StrVal.assign(TokStart, CurPtr-1);
+ return lltok::LabelStr;
+ }
+ return lltok::Error;
+ case ';':
+ SkipLineComment();
+ return LexToken();
+ case '!': return lltok::Metadata;
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ case '-':
+ return LexDigitOrNegative();
+ case '=': return lltok::equal;
+ case '[': return lltok::lsquare;
+ case ']': return lltok::rsquare;
+ case '{': return lltok::lbrace;
+ case '}': return lltok::rbrace;
+ case '<': return lltok::less;
+ case '>': return lltok::greater;
+ case '(': return lltok::lparen;
+ case ')': return lltok::rparen;
+ case ',': return lltok::comma;
+ case '*': return lltok::star;
+ case '\\': return lltok::backslash;
+ }
+}
+
+void LLLexer::SkipLineComment() {
+ while (1) {
+ if (CurPtr[0] == '\n' || CurPtr[0] == '\r' || getNextChar() == EOF)
+ return;
+ }
+}
+
+/// LexAt - Lex all tokens that start with an @ character:
+/// GlobalVar @\"[^\"]*\"
+/// GlobalVar @[-a-zA-Z$._][-a-zA-Z$._0-9]*
+/// GlobalVarID @[0-9]+
+lltok::Kind LLLexer::LexAt() {
+ // Handle AtStringConstant: @\"[^\"]*\"
+ if (CurPtr[0] == '"') {
+ ++CurPtr;
+
+ while (1) {
+ int CurChar = getNextChar();
+
+ if (CurChar == EOF) {
+ Error("end of file in global variable name");
+ return lltok::Error;
+ }
+ if (CurChar == '"') {
+ StrVal.assign(TokStart+2, CurPtr-1);
+ UnEscapeLexed(StrVal);
+ return lltok::GlobalVar;
+ }
+ }
+ }
+
+ // Handle GlobalVarName: @[-a-zA-Z$._][-a-zA-Z$._0-9]*
+ if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+ CurPtr[0] == '.' || CurPtr[0] == '_') {
+ ++CurPtr;
+ while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+ CurPtr[0] == '.' || CurPtr[0] == '_')
+ ++CurPtr;
+
+ StrVal.assign(TokStart+1, CurPtr); // Skip @
+ return lltok::GlobalVar;
+ }
+
+ // Handle GlobalVarID: @[0-9]+
+ if (isdigit(CurPtr[0])) {
+ for (++CurPtr; isdigit(CurPtr[0]); ++CurPtr)
+ /*empty*/;
+
+ uint64_t Val = atoull(TokStart+1, CurPtr);
+ if ((unsigned)Val != Val)
+ Error("invalid value number (too large)!");
+ UIntVal = unsigned(Val);
+ return lltok::GlobalID;
+ }
+
+ return lltok::Error;
+}
+
+
+/// LexPercent - Lex all tokens that start with a % character:
+/// LocalVar ::= %\"[^\"]*\"
+/// LocalVar ::= %[-a-zA-Z$._][-a-zA-Z$._0-9]*
+/// LocalVarID ::= %[0-9]+
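+///
+/// Illustrative examples (added commentary): %tmp lexes to lltok::LocalVar
+/// with StrVal == "tmp", and %7 lexes to lltok::LocalVarID with
+/// UIntVal == 7.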
+lltok::Kind LLLexer::LexPercent() {
+ // Handle LocalVarName: %\"[^\"]*\"
+ if (CurPtr[0] == '"') {
+ ++CurPtr;
+
+ while (1) {
+ int CurChar = getNextChar();
+
+ if (CurChar == EOF) {
+ Error("end of file in string constant");
+ return lltok::Error;
+ }
+ if (CurChar == '"') {
+ StrVal.assign(TokStart+2, CurPtr-1);
+ UnEscapeLexed(StrVal);
+ return lltok::LocalVar;
+ }
+ }
+ }
+
+ // Handle LocalVarName: %[-a-zA-Z$._][-a-zA-Z$._0-9]*
+ if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+ CurPtr[0] == '.' || CurPtr[0] == '_') {
+ ++CurPtr;
+ while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+ CurPtr[0] == '.' || CurPtr[0] == '_')
+ ++CurPtr;
+
+ StrVal.assign(TokStart+1, CurPtr); // Skip %
+ return lltok::LocalVar;
+ }
+
+ // Handle LocalVarID: %[0-9]+
+ if (isdigit(CurPtr[0])) {
+ for (++CurPtr; isdigit(CurPtr[0]); ++CurPtr)
+ /*empty*/;
+
+ uint64_t Val = atoull(TokStart+1, CurPtr);
+ if ((unsigned)Val != Val)
+ Error("invalid value number (too large)!");
+ UIntVal = unsigned(Val);
+ return lltok::LocalVarID;
+ }
+
+ return lltok::Error;
+}
+
+/// LexQuote - Lex all tokens that start with a " character:
+/// QuoteLabel "[^"]+":
+/// StringConstant "[^"]*"
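+///
+/// Illustrative examples (added commentary): "hello" lexes to
+/// lltok::StringConstant, while "my block": (with the trailing colon)
+/// lexes to lltok::LabelStr with the quotes and colon stripped.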
+lltok::Kind LLLexer::LexQuote() {
+ while (1) {
+ int CurChar = getNextChar();
+
+ if (CurChar == EOF) {
+ Error("end of file in quoted string");
+ return lltok::Error;
+ }
+
+ if (CurChar != '"') continue;
+
+ if (CurPtr[0] != ':') {
+ StrVal.assign(TokStart+1, CurPtr-1);
+ UnEscapeLexed(StrVal);
+ return lltok::StringConstant;
+ }
+
+ ++CurPtr;
+ StrVal.assign(TokStart+1, CurPtr-2);
+ UnEscapeLexed(StrVal);
+ return lltok::LabelStr;
+ }
+}
+
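+/// JustWhitespaceNewLine - Return true if the characters at Ptr are only
+/// spaces or tabs followed by a newline, advancing Ptr to the newline if so.
+/// Used below when auto-upgrading bare "sext"/"zext" attributes.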
+static bool JustWhitespaceNewLine(const char *&Ptr) {
+ const char *ThisPtr = Ptr;
+ while (*ThisPtr == ' ' || *ThisPtr == '\t')
+ ++ThisPtr;
+ if (*ThisPtr == '\n' || *ThisPtr == '\r') {
+ Ptr = ThisPtr;
+ return true;
+ }
+ return false;
+}
+
+
+/// LexIdentifier: Handle several related productions:
+/// Label [-a-zA-Z$._0-9]+:
+/// IntegerType i[0-9]+
+/// Keyword sdiv, float, ...
+/// HexIntConstant [us]0x[0-9A-Fa-f]+
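+///
+/// Illustrative examples (added commentary): "entry:" lexes to
+/// lltok::LabelStr, "i32" to lltok::Type, and "add" to the kw_add
+/// instruction keyword.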
+lltok::Kind LLLexer::LexIdentifier() {
+ const char *StartChar = CurPtr;
+ const char *IntEnd = CurPtr[-1] == 'i' ? 0 : StartChar;
+ const char *KeywordEnd = 0;
+
+ for (; isLabelChar(*CurPtr); ++CurPtr) {
+ // If we decide this is an integer, remember the end of the sequence.
+ if (!IntEnd && !isdigit(*CurPtr)) IntEnd = CurPtr;
+ if (!KeywordEnd && !isalnum(*CurPtr) && *CurPtr != '_') KeywordEnd = CurPtr;
+ }
+
+ // If we stopped due to a colon, this really is a label.
+ if (*CurPtr == ':') {
+ StrVal.assign(StartChar-1, CurPtr++);
+ return lltok::LabelStr;
+ }
+
+ // Otherwise, this wasn't a label. If this was valid as an integer type,
+ // return it.
+ if (IntEnd == 0) IntEnd = CurPtr;
+ if (IntEnd != StartChar) {
+ CurPtr = IntEnd;
+ uint64_t NumBits = atoull(StartChar, CurPtr);
+ if (NumBits < IntegerType::MIN_INT_BITS ||
+ NumBits > IntegerType::MAX_INT_BITS) {
+ Error("bitwidth for integer type out of range!");
+ return lltok::Error;
+ }
+ TyVal = IntegerType::get(NumBits);
+ return lltok::Type;
+ }
+
+ // Otherwise, this was a letter sequence. See which keyword this is.
+ if (KeywordEnd == 0) KeywordEnd = CurPtr;
+ CurPtr = KeywordEnd;
+ --StartChar;
+ unsigned Len = CurPtr-StartChar;
+#define KEYWORD(STR) \
+ if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) \
+ return lltok::kw_##STR;
+
+ KEYWORD(begin); KEYWORD(end);
+ KEYWORD(true); KEYWORD(false);
+ KEYWORD(declare); KEYWORD(define);
+ KEYWORD(global); KEYWORD(constant);
+
+ KEYWORD(private);
+ KEYWORD(internal);
+ KEYWORD(available_externally);
+ KEYWORD(linkonce);
+ KEYWORD(linkonce_odr);
+ KEYWORD(weak);
+ KEYWORD(weak_odr);
+ KEYWORD(appending);
+ KEYWORD(dllimport);
+ KEYWORD(dllexport);
+ KEYWORD(common);
+ KEYWORD(default);
+ KEYWORD(hidden);
+ KEYWORD(protected);
+ KEYWORD(extern_weak);
+ KEYWORD(external);
+ KEYWORD(thread_local);
+ KEYWORD(zeroinitializer);
+ KEYWORD(undef);
+ KEYWORD(null);
+ KEYWORD(to);
+ KEYWORD(tail);
+ KEYWORD(target);
+ KEYWORD(triple);
+ KEYWORD(deplibs);
+ KEYWORD(datalayout);
+ KEYWORD(volatile);
+ KEYWORD(align);
+ KEYWORD(addrspace);
+ KEYWORD(section);
+ KEYWORD(alias);
+ KEYWORD(module);
+ KEYWORD(asm);
+ KEYWORD(sideeffect);
+ KEYWORD(gc);
+
+ KEYWORD(ccc);
+ KEYWORD(fastcc);
+ KEYWORD(coldcc);
+ KEYWORD(x86_stdcallcc);
+ KEYWORD(x86_fastcallcc);
+ KEYWORD(cc);
+ KEYWORD(c);
+
+ KEYWORD(signext);
+ KEYWORD(zeroext);
+ KEYWORD(inreg);
+ KEYWORD(sret);
+ KEYWORD(nounwind);
+ KEYWORD(noreturn);
+ KEYWORD(noalias);
+ KEYWORD(nocapture);
+ KEYWORD(byval);
+ KEYWORD(nest);
+ KEYWORD(readnone);
+ KEYWORD(readonly);
+
+ KEYWORD(noinline);
+ KEYWORD(alwaysinline);
+ KEYWORD(optsize);
+ KEYWORD(ssp);
+ KEYWORD(sspreq);
+
+ KEYWORD(type);
+ KEYWORD(opaque);
+
+ KEYWORD(eq); KEYWORD(ne); KEYWORD(slt); KEYWORD(sgt); KEYWORD(sle);
+ KEYWORD(sge); KEYWORD(ult); KEYWORD(ugt); KEYWORD(ule); KEYWORD(uge);
+ KEYWORD(oeq); KEYWORD(one); KEYWORD(olt); KEYWORD(ogt); KEYWORD(ole);
+ KEYWORD(oge); KEYWORD(ord); KEYWORD(uno); KEYWORD(ueq); KEYWORD(une);
+
+ KEYWORD(x);
+#undef KEYWORD
+
+ // Keywords for types.
+#define TYPEKEYWORD(STR, LLVMTY) \
+ if (Len == strlen(STR) && !memcmp(StartChar, STR, strlen(STR))) { \
+ TyVal = LLVMTY; return lltok::Type; }
+ TYPEKEYWORD("void", Type::VoidTy);
+ TYPEKEYWORD("float", Type::FloatTy);
+ TYPEKEYWORD("double", Type::DoubleTy);
+ TYPEKEYWORD("x86_fp80", Type::X86_FP80Ty);
+ TYPEKEYWORD("fp128", Type::FP128Ty);
+ TYPEKEYWORD("ppc_fp128", Type::PPC_FP128Ty);
+ TYPEKEYWORD("label", Type::LabelTy);
+ TYPEKEYWORD("metadata", Type::MetadataTy);
+#undef TYPEKEYWORD
+
+ // Handle special forms for autoupgrading. Drop these in LLVM 3.0. This is
+ // to avoid conflicting with the sext/zext instructions, below.
+ if (Len == 4 && !memcmp(StartChar, "sext", 4)) {
+ // Scan CurPtr ahead, seeing if there is just whitespace before the newline.
+ if (JustWhitespaceNewLine(CurPtr))
+ return lltok::kw_signext;
+ } else if (Len == 4 && !memcmp(StartChar, "zext", 4)) {
+ // Scan CurPtr ahead, seeing if there is just whitespace before the newline.
+ if (JustWhitespaceNewLine(CurPtr))
+ return lltok::kw_zeroext;
+ }
+
+ // Keywords for instructions.
+#define INSTKEYWORD(STR, Enum) \
+ if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) { \
+ UIntVal = Instruction::Enum; return lltok::kw_##STR; }
+
+ INSTKEYWORD(add, Add); INSTKEYWORD(sub, Sub); INSTKEYWORD(mul, Mul);
+ INSTKEYWORD(udiv, UDiv); INSTKEYWORD(sdiv, SDiv); INSTKEYWORD(fdiv, FDiv);
+ INSTKEYWORD(urem, URem); INSTKEYWORD(srem, SRem); INSTKEYWORD(frem, FRem);
+ INSTKEYWORD(shl, Shl); INSTKEYWORD(lshr, LShr); INSTKEYWORD(ashr, AShr);
+ INSTKEYWORD(and, And); INSTKEYWORD(or, Or); INSTKEYWORD(xor, Xor);
+ INSTKEYWORD(icmp, ICmp); INSTKEYWORD(fcmp, FCmp);
+ INSTKEYWORD(vicmp, VICmp); INSTKEYWORD(vfcmp, VFCmp);
+
+ INSTKEYWORD(phi, PHI);
+ INSTKEYWORD(call, Call);
+ INSTKEYWORD(trunc, Trunc);
+ INSTKEYWORD(zext, ZExt);
+ INSTKEYWORD(sext, SExt);
+ INSTKEYWORD(fptrunc, FPTrunc);
+ INSTKEYWORD(fpext, FPExt);
+ INSTKEYWORD(uitofp, UIToFP);
+ INSTKEYWORD(sitofp, SIToFP);
+ INSTKEYWORD(fptoui, FPToUI);
+ INSTKEYWORD(fptosi, FPToSI);
+ INSTKEYWORD(inttoptr, IntToPtr);
+ INSTKEYWORD(ptrtoint, PtrToInt);
+ INSTKEYWORD(bitcast, BitCast);
+ INSTKEYWORD(select, Select);
+ INSTKEYWORD(va_arg, VAArg);
+ INSTKEYWORD(ret, Ret);
+ INSTKEYWORD(br, Br);
+ INSTKEYWORD(switch, Switch);
+ INSTKEYWORD(invoke, Invoke);
+ INSTKEYWORD(unwind, Unwind);
+ INSTKEYWORD(unreachable, Unreachable);
+
+ INSTKEYWORD(malloc, Malloc);
+ INSTKEYWORD(alloca, Alloca);
+ INSTKEYWORD(free, Free);
+ INSTKEYWORD(load, Load);
+ INSTKEYWORD(store, Store);
+ INSTKEYWORD(getelementptr, GetElementPtr);
+
+ INSTKEYWORD(extractelement, ExtractElement);
+ INSTKEYWORD(insertelement, InsertElement);
+ INSTKEYWORD(shufflevector, ShuffleVector);
+ INSTKEYWORD(getresult, ExtractValue);
+ INSTKEYWORD(extractvalue, ExtractValue);
+ INSTKEYWORD(insertvalue, InsertValue);
+#undef INSTKEYWORD
+
+  // Check for [us]0x[0-9A-Fa-f]+, hexadecimal integer constants generated by
+  // the CFE to avoid forcing it to deal with 64-bit numbers.
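+  // For example (added commentary): "u0xFF" lexes to the unsigned 8-bit
+  // APSInt value 255.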
+ if ((TokStart[0] == 'u' || TokStart[0] == 's') &&
+ TokStart[1] == '0' && TokStart[2] == 'x' && isxdigit(TokStart[3])) {
+ int len = CurPtr-TokStart-3;
+ uint32_t bits = len * 4;
+ APInt Tmp(bits, TokStart+3, len, 16);
+ uint32_t activeBits = Tmp.getActiveBits();
+ if (activeBits > 0 && activeBits < bits)
+ Tmp.trunc(activeBits);
+ APSIntVal = APSInt(Tmp, TokStart[0] == 'u');
+ return lltok::APSInt;
+ }
+
+ // If this is "cc1234", return this as just "cc".
+ if (TokStart[0] == 'c' && TokStart[1] == 'c') {
+ CurPtr = TokStart+2;
+ return lltok::kw_cc;
+ }
+
+ // If this starts with "call", return it as CALL. This is to support old
+ // broken .ll files. FIXME: remove this with LLVM 3.0.
+ if (CurPtr-TokStart > 4 && !memcmp(TokStart, "call", 4)) {
+ CurPtr = TokStart+4;
+ UIntVal = Instruction::Call;
+ return lltok::kw_call;
+ }
+
+ // Finally, if this isn't known, return an error.
+ CurPtr = TokStart+1;
+ return lltok::Error;
+}
+
+
+/// Lex0x: Handle productions that start with 0x, knowing that it matches and
+/// that this is not a label:
+/// HexFPConstant 0x[0-9A-Fa-f]+
+/// HexFP80Constant 0xK[0-9A-Fa-f]+
+/// HexFP128Constant 0xL[0-9A-Fa-f]+
+/// HexPPC128Constant 0xM[0-9A-Fa-f]+
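+///
+/// For example (added commentary): 0x3FF0000000000000 is the IEEE bit
+/// pattern of the double 1.0, and the K/L/M prefixes select the x87,
+/// IEEE-128, and PPC-128 formats handled below.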
+lltok::Kind LLLexer::Lex0x() {
+ CurPtr = TokStart + 2;
+
+ char Kind;
+ if (CurPtr[0] >= 'K' && CurPtr[0] <= 'M') {
+ Kind = *CurPtr++;
+ } else {
+ Kind = 'J';
+ }
+
+ if (!isxdigit(CurPtr[0])) {
+ // Bad token, return it as an error.
+ CurPtr = TokStart+1;
+ return lltok::Error;
+ }
+
+ while (isxdigit(CurPtr[0]))
+ ++CurPtr;
+
+ if (Kind == 'J') {
+ // HexFPConstant - Floating point constant represented in IEEE format as a
+ // hexadecimal number for when exponential notation is not precise enough.
+ // Float and double only.
+ APFloatVal = APFloat(BitsToDouble(HexIntToVal(TokStart+2, CurPtr)));
+ return lltok::APFloat;
+ }
+
+ uint64_t Pair[2];
+ switch (Kind) {
+ default: assert(0 && "Unknown kind!");
+ case 'K':
+ // F80HexFPConstant - x87 long double in hexadecimal format (10 bytes)
+ FP80HexToIntPair(TokStart+3, CurPtr, Pair);
+ APFloatVal = APFloat(APInt(80, 2, Pair));
+ return lltok::APFloat;
+ case 'L':
+ // F128HexFPConstant - IEEE 128-bit in hexadecimal format (16 bytes)
+ HexToIntPair(TokStart+3, CurPtr, Pair);
+ APFloatVal = APFloat(APInt(128, 2, Pair), true);
+ return lltok::APFloat;
+ case 'M':
+ // PPC128HexFPConstant - PowerPC 128-bit in hexadecimal format (16 bytes)
+ HexToIntPair(TokStart+3, CurPtr, Pair);
+ APFloatVal = APFloat(APInt(128, 2, Pair));
+ return lltok::APFloat;
+ }
+}
+
+/// LexDigitOrNegative: Handle several related productions:
+/// Label [-a-zA-Z$._0-9]+:
+/// NInteger -[0-9]+
+/// FPConstant [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
+/// PInteger [0-9]+
+/// HexFPConstant 0x[0-9A-Fa-f]+
+/// HexFP80Constant 0xK[0-9A-Fa-f]+
+/// HexFP128Constant 0xL[0-9A-Fa-f]+
+/// HexPPC128Constant 0xM[0-9A-Fa-f]+
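+///
+/// Illustrative examples (added commentary): "123" and "-4" lex to
+/// lltok::APSInt, "1.5e3" to lltok::APFloat, and "-1:" to lltok::LabelStr.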
+lltok::Kind LLLexer::LexDigitOrNegative() {
+  // If the character after the '-' is not a digit, this is probably a label.
+ if (!isdigit(TokStart[0]) && !isdigit(CurPtr[0])) {
+ // Okay, this is not a number after the -, it's probably a label.
+ if (const char *End = isLabelTail(CurPtr)) {
+ StrVal.assign(TokStart, End-1);
+ CurPtr = End;
+ return lltok::LabelStr;
+ }
+
+ return lltok::Error;
+ }
+
+ // At this point, it is either a label, int or fp constant.
+
+ // Skip digits, we have at least one.
+ for (; isdigit(CurPtr[0]); ++CurPtr)
+ /*empty*/;
+
+  // Check to see if this really is a label after all, e.g. "-1:".
+ if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
+ if (const char *End = isLabelTail(CurPtr)) {
+ StrVal.assign(TokStart, End-1);
+ CurPtr = End;
+ return lltok::LabelStr;
+ }
+ }
+
+  // If the next character is a '.', then this is a floating point value;
+  // otherwise it is an integer.
+ if (CurPtr[0] != '.') {
+ if (TokStart[0] == '0' && TokStart[1] == 'x')
+ return Lex0x();
+ unsigned Len = CurPtr-TokStart;
+ uint32_t numBits = ((Len * 64) / 19) + 2;
+ APInt Tmp(numBits, TokStart, Len, 10);
+ if (TokStart[0] == '-') {
+ uint32_t minBits = Tmp.getMinSignedBits();
+ if (minBits > 0 && minBits < numBits)
+ Tmp.trunc(minBits);
+ APSIntVal = APSInt(Tmp, false);
+ } else {
+ uint32_t activeBits = Tmp.getActiveBits();
+ if (activeBits > 0 && activeBits < numBits)
+ Tmp.trunc(activeBits);
+ APSIntVal = APSInt(Tmp, true);
+ }
+ return lltok::APSInt;
+ }
+
+ ++CurPtr;
+
+ // Skip over [0-9]*([eE][-+]?[0-9]+)?
+ while (isdigit(CurPtr[0])) ++CurPtr;
+
+ if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
+ if (isdigit(CurPtr[1]) ||
+ ((CurPtr[1] == '-' || CurPtr[1] == '+') && isdigit(CurPtr[2]))) {
+ CurPtr += 2;
+ while (isdigit(CurPtr[0])) ++CurPtr;
+ }
+ }
+
+ APFloatVal = APFloat(atof(TokStart));
+ return lltok::APFloat;
+}
+
+/// FPConstant [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
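+///
+/// For example (added commentary): "+1.5" and "+0.5e-2" lex to
+/// lltok::APFloat, while a bare "+1" is rejected because the '.' is
+/// mandatory here.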
+lltok::Kind LLLexer::LexPositive() {
+  // If the character after the '+' is not a digit, this cannot be a
+  // floating point constant; reject it.
+ if (!isdigit(CurPtr[0]))
+ return lltok::Error;
+
+ // Skip digits.
+ for (++CurPtr; isdigit(CurPtr[0]); ++CurPtr)
+ /*empty*/;
+
+ // At this point, we need a '.'.
+ if (CurPtr[0] != '.') {
+ CurPtr = TokStart+1;
+ return lltok::Error;
+ }
+
+ ++CurPtr;
+
+ // Skip over [0-9]*([eE][-+]?[0-9]+)?
+ while (isdigit(CurPtr[0])) ++CurPtr;
+
+ if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
+ if (isdigit(CurPtr[1]) ||
+ ((CurPtr[1] == '-' || CurPtr[1] == '+') && isdigit(CurPtr[2]))) {
+ CurPtr += 2;
+ while (isdigit(CurPtr[0])) ++CurPtr;
+ }
+ }
+
+ APFloatVal = APFloat(atof(TokStart));
+ return lltok::APFloat;
+}
diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h
new file mode 100644
index 0000000..995aa4e
--- /dev/null
+++ b/lib/AsmParser/LLLexer.h
@@ -0,0 +1,84 @@
+//===- LLLexer.h - Lexer for LLVM Assembly Files ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class represents the Lexer for .ll files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIB_ASMPARSER_LLLEXER_H
+#define LIB_ASMPARSER_LLLEXER_H
+
+#include "LLToken.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/APFloat.h"
+#include <string>
+
+namespace llvm {
+ class MemoryBuffer;
+ class Type;
+ class ParseError;
+
+ class LLLexer {
+ const char *CurPtr;
+ MemoryBuffer *CurBuf;
+ ParseError &ErrorInfo;
+
+ // Information about the current token.
+ const char *TokStart;
+ lltok::Kind CurKind;
+ std::string StrVal;
+ unsigned UIntVal;
+ const Type *TyVal;
+ APFloat APFloatVal;
+ APSInt APSIntVal;
+
+ std::string TheError;
+ public:
+ explicit LLLexer(MemoryBuffer *StartBuf, ParseError &);
+ ~LLLexer() {}
+
+ lltok::Kind Lex() {
+ return CurKind = LexToken();
+ }
+
+ typedef const char* LocTy;
+ LocTy getLoc() const { return TokStart; }
+ lltok::Kind getKind() const { return CurKind; }
+    const std::string &getStrVal() const { return StrVal; }
+ const Type *getTyVal() const { return TyVal; }
+ unsigned getUIntVal() const { return UIntVal; }
+ const APSInt &getAPSIntVal() const { return APSIntVal; }
+ const APFloat &getAPFloatVal() const { return APFloatVal; }
+
+
+ bool Error(LocTy L, const std::string &Msg) const;
+ bool Error(const std::string &Msg) const { return Error(CurPtr, Msg); }
+ std::string getFilename() const;
+
+ private:
+ lltok::Kind LexToken();
+
+ int getNextChar();
+ void SkipLineComment();
+ lltok::Kind LexIdentifier();
+ lltok::Kind LexDigitOrNegative();
+ lltok::Kind LexPositive();
+ lltok::Kind LexAt();
+ lltok::Kind LexPercent();
+ lltok::Kind LexQuote();
+ lltok::Kind Lex0x();
+
+ uint64_t atoull(const char *Buffer, const char *End);
+ uint64_t HexIntToVal(const char *Buffer, const char *End);
+ void HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
+ void FP80HexToIntPair(const char *Buff, const char *End, uint64_t Pair[2]);
+ };
+} // end namespace llvm
+
+#endif
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
new file mode 100644
index 0000000..8db4c71
--- /dev/null
+++ b/lib/AsmParser/LLParser.cpp
@@ -0,0 +1,3279 @@
+//===-- LLParser.cpp - Parser Class ---------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the parser class for .ll files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LLParser.h"
+#include "llvm/AutoUpgrade.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/MDNode.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace llvm {
+  /// ValID - Represents an untyped reference to a definition of some sort.
+ /// There are several cases where we have to parse the value but where the
+ /// type can depend on later context. This may either be a numeric reference
+ /// or a symbolic (%var) reference. This is just a discriminated union.
+ struct ValID {
+ enum {
+ t_LocalID, t_GlobalID, // ID in UIntVal.
+ t_LocalName, t_GlobalName, // Name in StrVal.
+ t_APSInt, t_APFloat, // Value in APSIntVal/APFloatVal.
+ t_Null, t_Undef, t_Zero, // No value.
+ t_EmptyArray, // No value: []
+ t_Constant, // Value in ConstantVal.
+ t_InlineAsm // Value in StrVal/StrVal2/UIntVal.
+ } Kind;
+
+ LLParser::LocTy Loc;
+ unsigned UIntVal;
+ std::string StrVal, StrVal2;
+ APSInt APSIntVal;
+ APFloat APFloatVal;
+ Constant *ConstantVal;
+ ValID() : APFloatVal(0.0) {}
+ };
+}
+
+/// Run: module ::= toplevelentity*
+bool LLParser::Run() {
+ // Prime the lexer.
+ Lex.Lex();
+
+ return ParseTopLevelEntities() ||
+ ValidateEndOfModule();
+}
+
+/// ValidateEndOfModule - Do final validity and sanity checks at the end of the
+/// module.
+bool LLParser::ValidateEndOfModule() {
+ if (!ForwardRefTypes.empty())
+ return Error(ForwardRefTypes.begin()->second.second,
+ "use of undefined type named '" +
+ ForwardRefTypes.begin()->first + "'");
+ if (!ForwardRefTypeIDs.empty())
+ return Error(ForwardRefTypeIDs.begin()->second.second,
+ "use of undefined type '%" +
+ utostr(ForwardRefTypeIDs.begin()->first) + "'");
+
+ if (!ForwardRefVals.empty())
+ return Error(ForwardRefVals.begin()->second.second,
+ "use of undefined value '@" + ForwardRefVals.begin()->first +
+ "'");
+
+ if (!ForwardRefValIDs.empty())
+ return Error(ForwardRefValIDs.begin()->second.second,
+ "use of undefined value '@" +
+ utostr(ForwardRefValIDs.begin()->first) + "'");
+
+  // Look for intrinsic functions and CallInsts that need to be upgraded.
+  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
+    UpgradeCallsToIntrinsic(FI++); // must be post-increment, as the upgrade
+                                   // may remove the old function here
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Level Entities
+//===----------------------------------------------------------------------===//
+
+bool LLParser::ParseTopLevelEntities() {
+ while (1) {
+ switch (Lex.getKind()) {
+ default: return TokError("expected top-level entity");
+ case lltok::Eof: return false;
+ case lltok::kw_declare: if (ParseDeclare()) return true; break;
+ case lltok::kw_define: if (ParseDefine()) return true; break;
+ case lltok::kw_module: if (ParseModuleAsm()) return true; break;
+ case lltok::kw_target: if (ParseTargetDefinition()) return true; break;
+ case lltok::kw_deplibs: if (ParseDepLibs()) return true; break;
+ case lltok::kw_type: if (ParseUnnamedType()) return true; break;
+ case lltok::StringConstant: // FIXME: REMOVE IN LLVM 3.0
+ case lltok::LocalVar: if (ParseNamedType()) return true; break;
+ case lltok::GlobalVar: if (ParseNamedGlobal()) return true; break;
+
+ // The Global variable production with no name can have many different
+ // optional leading prefixes, the production is:
+ // GlobalVar ::= OptionalLinkage OptionalVisibility OptionalThreadLocal
+ // OptionalAddrSpace ('constant'|'global') ...
+ case lltok::kw_private: // OptionalLinkage
+ case lltok::kw_internal: // OptionalLinkage
+ case lltok::kw_weak: // OptionalLinkage
+ case lltok::kw_weak_odr: // OptionalLinkage
+ case lltok::kw_linkonce: // OptionalLinkage
+ case lltok::kw_linkonce_odr: // OptionalLinkage
+ case lltok::kw_appending: // OptionalLinkage
+ case lltok::kw_dllexport: // OptionalLinkage
+ case lltok::kw_common: // OptionalLinkage
+ case lltok::kw_dllimport: // OptionalLinkage
+ case lltok::kw_extern_weak: // OptionalLinkage
+ case lltok::kw_external: { // OptionalLinkage
+ unsigned Linkage, Visibility;
+ if (ParseOptionalLinkage(Linkage) ||
+ ParseOptionalVisibility(Visibility) ||
+ ParseGlobal("", 0, Linkage, true, Visibility))
+ return true;
+ break;
+ }
+ case lltok::kw_default: // OptionalVisibility
+ case lltok::kw_hidden: // OptionalVisibility
+ case lltok::kw_protected: { // OptionalVisibility
+ unsigned Visibility;
+ if (ParseOptionalVisibility(Visibility) ||
+ ParseGlobal("", 0, 0, false, Visibility))
+ return true;
+ break;
+ }
+
+ case lltok::kw_thread_local: // OptionalThreadLocal
+ case lltok::kw_addrspace: // OptionalAddrSpace
+ case lltok::kw_constant: // GlobalType
+ case lltok::kw_global: // GlobalType
+ if (ParseGlobal("", 0, 0, false, 0)) return true;
+ break;
+ }
+ }
+}
+
+
+/// toplevelentity
+/// ::= 'module' 'asm' STRINGCONSTANT
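+///
+/// For example (added commentary):
+///   module asm ".globl my_sym"
+/// appends the quoted string to the module-level inline asm, one blob per
+/// line.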
+bool LLParser::ParseModuleAsm() {
+ assert(Lex.getKind() == lltok::kw_module);
+ Lex.Lex();
+
+ std::string AsmStr;
+ if (ParseToken(lltok::kw_asm, "expected 'module asm'") ||
+ ParseStringConstant(AsmStr)) return true;
+
+ const std::string &AsmSoFar = M->getModuleInlineAsm();
+ if (AsmSoFar.empty())
+ M->setModuleInlineAsm(AsmStr);
+ else
+ M->setModuleInlineAsm(AsmSoFar+"\n"+AsmStr);
+ return false;
+}
+
+/// toplevelentity
+/// ::= 'target' 'triple' '=' STRINGCONSTANT
+/// ::= 'target' 'datalayout' '=' STRINGCONSTANT
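+///
+/// For example (added commentary, with illustrative string values):
+///   target triple = "x86_64-unknown-linux-gnu"
+///   target datalayout = "e-p:64:64"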
+bool LLParser::ParseTargetDefinition() {
+ assert(Lex.getKind() == lltok::kw_target);
+ std::string Str;
+ switch (Lex.Lex()) {
+ default: return TokError("unknown target property");
+ case lltok::kw_triple:
+ Lex.Lex();
+ if (ParseToken(lltok::equal, "expected '=' after target triple") ||
+ ParseStringConstant(Str))
+ return true;
+ M->setTargetTriple(Str);
+ return false;
+ case lltok::kw_datalayout:
+ Lex.Lex();
+ if (ParseToken(lltok::equal, "expected '=' after target datalayout") ||
+ ParseStringConstant(Str))
+ return true;
+ M->setDataLayout(Str);
+ return false;
+ }
+}
+
+/// toplevelentity
+/// ::= 'deplibs' '=' '[' ']'
+/// ::= 'deplibs' '=' '[' STRINGCONSTANT (',' STRINGCONSTANT)* ']'
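+///
+/// For example (added commentary): deplibs = [ "m", "c" ] records the m and
+/// c libraries as module dependencies via M->addLibrary.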
+bool LLParser::ParseDepLibs() {
+ assert(Lex.getKind() == lltok::kw_deplibs);
+ Lex.Lex();
+ if (ParseToken(lltok::equal, "expected '=' after deplibs") ||
+ ParseToken(lltok::lsquare, "expected '=' after deplibs"))
+ return true;
+
+ if (EatIfPresent(lltok::rsquare))
+ return false;
+
+ std::string Str;
+ if (ParseStringConstant(Str)) return true;
+ M->addLibrary(Str);
+
+ while (EatIfPresent(lltok::comma)) {
+ if (ParseStringConstant(Str)) return true;
+ M->addLibrary(Str);
+ }
+
+ return ParseToken(lltok::rsquare, "expected ']' at end of list");
+}
+
+/// toplevelentity
+/// ::= 'type' type
+bool LLParser::ParseUnnamedType() {
+ assert(Lex.getKind() == lltok::kw_type);
+ LocTy TypeLoc = Lex.getLoc();
+ Lex.Lex(); // eat kw_type
+
+ PATypeHolder Ty(Type::VoidTy);
+ if (ParseType(Ty)) return true;
+
+ unsigned TypeID = NumberedTypes.size();
+
+ // See if this type was previously referenced.
+ std::map<unsigned, std::pair<PATypeHolder, LocTy> >::iterator
+ FI = ForwardRefTypeIDs.find(TypeID);
+ if (FI != ForwardRefTypeIDs.end()) {
+ if (FI->second.first.get() == Ty)
+ return Error(TypeLoc, "self referential type is invalid");
+
+ cast<DerivedType>(FI->second.first.get())->refineAbstractTypeTo(Ty);
+ Ty = FI->second.first.get();
+ ForwardRefTypeIDs.erase(FI);
+ }
+
+ NumberedTypes.push_back(Ty);
+
+ return false;
+}
+
+/// toplevelentity
+/// ::= LocalVar '=' 'type' type
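+///
+/// For example (added commentary): %pair = type { i32, i32 } binds the name
+/// "pair" to an anonymous struct type in the module symbol table.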
+bool LLParser::ParseNamedType() {
+ std::string Name = Lex.getStrVal();
+ LocTy NameLoc = Lex.getLoc();
+ Lex.Lex(); // eat LocalVar.
+
+ PATypeHolder Ty(Type::VoidTy);
+
+ if (ParseToken(lltok::equal, "expected '=' after name") ||
+ ParseToken(lltok::kw_type, "expected 'type' after name") ||
+ ParseType(Ty))
+ return true;
+
+ // Set the type name, checking for conflicts as we do so.
+ bool AlreadyExists = M->addTypeName(Name, Ty);
+ if (!AlreadyExists) return false;
+
+ // See if this type is a forward reference. We need to eagerly resolve
+ // types to allow recursive type redefinitions below.
+ std::map<std::string, std::pair<PATypeHolder, LocTy> >::iterator
+ FI = ForwardRefTypes.find(Name);
+ if (FI != ForwardRefTypes.end()) {
+ if (FI->second.first.get() == Ty)
+ return Error(NameLoc, "self referential type is invalid");
+
+ cast<DerivedType>(FI->second.first.get())->refineAbstractTypeTo(Ty);
+ Ty = FI->second.first.get();
+ ForwardRefTypes.erase(FI);
+ }
+
+  // We tried to insert a name that is already defined; get the existing type.
+ const Type *Existing = M->getTypeByName(Name);
+ assert(Existing && "Conflict but no matching type?!");
+
+ // Otherwise, this is an attempt to redefine a type. That's okay if
+ // the redefinition is identical to the original.
+ // FIXME: REMOVE REDEFINITIONS IN LLVM 3.0
+ if (Existing == Ty) return false;
+
+ // Any other kind of (non-equivalent) redefinition is an error.
+ return Error(NameLoc, "redefinition of type named '" + Name + "' of type '" +
+ Ty->getDescription() + "'");
+}
+
+
+/// toplevelentity
+/// ::= 'declare' FunctionHeader
+bool LLParser::ParseDeclare() {
+ assert(Lex.getKind() == lltok::kw_declare);
+ Lex.Lex();
+
+ Function *F;
+ return ParseFunctionHeader(F, false);
+}
+
+/// toplevelentity
+/// ::= 'define' FunctionHeader '{' ...
+bool LLParser::ParseDefine() {
+ assert(Lex.getKind() == lltok::kw_define);
+ Lex.Lex();
+
+ Function *F;
+ return ParseFunctionHeader(F, true) ||
+ ParseFunctionBody(*F);
+}
+
+/// ParseGlobalType
+/// ::= 'constant'
+/// ::= 'global'
+bool LLParser::ParseGlobalType(bool &IsConstant) {
+ if (Lex.getKind() == lltok::kw_constant)
+ IsConstant = true;
+ else if (Lex.getKind() == lltok::kw_global)
+ IsConstant = false;
+ else {
+ IsConstant = false;
+ return TokError("expected 'global' or 'constant'");
+ }
+ Lex.Lex();
+ return false;
+}
+
+/// ParseNamedGlobal:
+/// GlobalVar '=' OptionalVisibility ALIAS ...
+/// GlobalVar '=' OptionalLinkage OptionalVisibility ... -> global variable
+bool LLParser::ParseNamedGlobal() {
+ assert(Lex.getKind() == lltok::GlobalVar);
+ LocTy NameLoc = Lex.getLoc();
+ std::string Name = Lex.getStrVal();
+ Lex.Lex();
+
+ bool HasLinkage;
+ unsigned Linkage, Visibility;
+ if (ParseToken(lltok::equal, "expected '=' in global variable") ||
+ ParseOptionalLinkage(Linkage, HasLinkage) ||
+ ParseOptionalVisibility(Visibility))
+ return true;
+
+ if (HasLinkage || Lex.getKind() != lltok::kw_alias)
+ return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility);
+ return ParseAlias(Name, NameLoc, Visibility);
+}
+
+/// ParseAlias:
+/// ::= GlobalVar '=' OptionalVisibility 'alias' OptionalLinkage Aliasee
+/// Aliasee
+/// ::= TypeAndValue
+/// ::= 'bitcast' '(' TypeAndValue 'to' Type ')'
+/// ::= 'getelementptr' '(' ... ')'
+///
+/// Everything through visibility has already been parsed.
+///
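+/// For example (added commentary):
+///   @a = alias weak i32* @g
+/// makes @a a weak alias of the global @g; the aliasee must have pointer
+/// type.
+///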
+bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
+ unsigned Visibility) {
+ assert(Lex.getKind() == lltok::kw_alias);
+ Lex.Lex();
+ unsigned Linkage;
+ LocTy LinkageLoc = Lex.getLoc();
+ if (ParseOptionalLinkage(Linkage))
+ return true;
+
+ if (Linkage != GlobalValue::ExternalLinkage &&
+ Linkage != GlobalValue::WeakAnyLinkage &&
+ Linkage != GlobalValue::WeakODRLinkage &&
+ Linkage != GlobalValue::InternalLinkage &&
+ Linkage != GlobalValue::PrivateLinkage)
+ return Error(LinkageLoc, "invalid linkage type for alias");
+
+ Constant *Aliasee;
+ LocTy AliaseeLoc = Lex.getLoc();
+ if (Lex.getKind() != lltok::kw_bitcast &&
+ Lex.getKind() != lltok::kw_getelementptr) {
+ if (ParseGlobalTypeAndValue(Aliasee)) return true;
+ } else {
+ // The bitcast dest type is not present, it is implied by the dest type.
+ ValID ID;
+ if (ParseValID(ID)) return true;
+ if (ID.Kind != ValID::t_Constant)
+ return Error(AliaseeLoc, "invalid aliasee");
+ Aliasee = ID.ConstantVal;
+ }
+
+ if (!isa<PointerType>(Aliasee->getType()))
+ return Error(AliaseeLoc, "alias must have pointer type");
+
+ // Okay, create the alias but do not insert it into the module yet.
+ GlobalAlias* GA = new GlobalAlias(Aliasee->getType(),
+ (GlobalValue::LinkageTypes)Linkage, Name,
+ Aliasee);
+ GA->setVisibility((GlobalValue::VisibilityTypes)Visibility);
+
+ // See if this value already exists in the symbol table. If so, it is either
+ // a redefinition or a definition of a forward reference.
+ if (GlobalValue *Val =
+ cast_or_null<GlobalValue>(M->getValueSymbolTable().lookup(Name))) {
+ // See if this was a redefinition. If so, there is no entry in
+ // ForwardRefVals.
+ std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator
+ I = ForwardRefVals.find(Name);
+ if (I == ForwardRefVals.end())
+ return Error(NameLoc, "redefinition of global named '@" + Name + "'");
+
+ // Otherwise, this was a definition of forward ref. Verify that types
+ // agree.
+ if (Val->getType() != GA->getType())
+ return Error(NameLoc,
+ "forward reference and definition of alias have different types");
+
+ // If they agree, just RAUW the old value with the alias and remove the
+ // forward ref info.
+ Val->replaceAllUsesWith(GA);
+ Val->eraseFromParent();
+ ForwardRefVals.erase(I);
+ }
+
+ // Insert into the module, we know its name won't collide now.
+ M->getAliasList().push_back(GA);
+ assert(GA->getNameStr() == Name && "Should not be a name conflict!");
+
+ return false;
+}
+
+/// ParseGlobal
+/// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalThreadLocal
+/// OptionalAddrSpace GlobalType Type Const
+/// ::= OptionalLinkage OptionalVisibility OptionalThreadLocal
+/// OptionalAddrSpace GlobalType Type Const
+///
+/// Everything through visibility has been parsed already.
+///
+bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
+ unsigned Linkage, bool HasLinkage,
+ unsigned Visibility) {
+ unsigned AddrSpace;
+ bool ThreadLocal, IsConstant;
+ LocTy TyLoc;
+
+ PATypeHolder Ty(Type::VoidTy);
+ if (ParseOptionalToken(lltok::kw_thread_local, ThreadLocal) ||
+ ParseOptionalAddrSpace(AddrSpace) ||
+ ParseGlobalType(IsConstant) ||
+ ParseType(Ty, TyLoc))
+ return true;
+
+ // If the linkage is specified and is external, then no initializer is
+ // present.
+ Constant *Init = 0;
+ if (!HasLinkage || (Linkage != GlobalValue::DLLImportLinkage &&
+ Linkage != GlobalValue::ExternalWeakLinkage &&
+ Linkage != GlobalValue::ExternalLinkage)) {
+ if (ParseGlobalValue(Ty, Init))
+ return true;
+ }
+
+ if (isa<FunctionType>(Ty) || Ty == Type::LabelTy)
+ return Error(TyLoc, "invalid type for global variable");
+
+ GlobalVariable *GV = 0;
+
+ // See if the global was forward referenced, if so, use the global.
+ if (!Name.empty()) {
+ if ((GV = M->getGlobalVariable(Name, true)) &&
+ !ForwardRefVals.erase(Name))
+ return Error(NameLoc, "redefinition of global '@" + Name + "'");
+ } else {
+ std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator
+ I = ForwardRefValIDs.find(NumberedVals.size());
+ if (I != ForwardRefValIDs.end()) {
+ GV = cast<GlobalVariable>(I->second.first);
+ ForwardRefValIDs.erase(I);
+ }
+ }
+
+ if (GV == 0) {
+ GV = new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage, 0, Name,
+ M, false, AddrSpace);
+ } else {
+ if (GV->getType()->getElementType() != Ty)
+ return Error(TyLoc,
+ "forward reference and definition of global have different types");
+
+ // Move the forward-reference to the correct spot in the module.
+ M->getGlobalList().splice(M->global_end(), M->getGlobalList(), GV);
+ }
+
+ if (Name.empty())
+ NumberedVals.push_back(GV);
+
+ // Set the parsed properties on the global.
+ if (Init)
+ GV->setInitializer(Init);
+ GV->setConstant(IsConstant);
+ GV->setLinkage((GlobalValue::LinkageTypes)Linkage);
+ GV->setVisibility((GlobalValue::VisibilityTypes)Visibility);
+ GV->setThreadLocal(ThreadLocal);
+
+ // Parse attributes on the global.
+ while (Lex.getKind() == lltok::comma) {
+ Lex.Lex();
+
+ if (Lex.getKind() == lltok::kw_section) {
+ Lex.Lex();
+ GV->setSection(Lex.getStrVal());
+ if (ParseToken(lltok::StringConstant, "expected global section string"))
+ return true;
+ } else if (Lex.getKind() == lltok::kw_align) {
+ unsigned Alignment;
+ if (ParseOptionalAlignment(Alignment)) return true;
+ GV->setAlignment(Alignment);
+ } else {
+      return TokError("unknown global variable property!");
+ }
+ }
+
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// GlobalValue Reference/Resolution Routines.
+//===----------------------------------------------------------------------===//
+
+/// GetGlobalVal - Get a value with the specified name or ID, creating a
+/// forward reference record if needed. This can return null if the value
+/// exists but does not have the right type.
+GlobalValue *LLParser::GetGlobalVal(const std::string &Name, const Type *Ty,
+ LocTy Loc) {
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ if (PTy == 0) {
+ Error(Loc, "global variable reference must have pointer type");
+ return 0;
+ }
+
+ // Look this name up in the normal function symbol table.
+ GlobalValue *Val =
+ cast_or_null<GlobalValue>(M->getValueSymbolTable().lookup(Name));
+
+ // If this is a forward reference for the value, see if we already created a
+ // forward ref record.
+ if (Val == 0) {
+ std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator
+ I = ForwardRefVals.find(Name);
+ if (I != ForwardRefVals.end())
+ Val = I->second.first;
+ }
+
+ // If we have the value in the symbol table or fwd-ref table, return it.
+ if (Val) {
+ if (Val->getType() == Ty) return Val;
+ Error(Loc, "'@" + Name + "' defined with type '" +
+ Val->getType()->getDescription() + "'");
+ return 0;
+ }
+
+ // Otherwise, create a new forward reference for this value and remember it.
+ GlobalValue *FwdVal;
+ if (const FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType())) {
+ // Function types can return opaque but functions can't.
+ if (isa<OpaqueType>(FT->getReturnType())) {
+ Error(Loc, "function may not return opaque type");
+ return 0;
+ }
+
+ FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M);
+ } else {
+ FwdVal = new GlobalVariable(PTy->getElementType(), false,
+ GlobalValue::ExternalWeakLinkage, 0, Name, M);
+ }
+
+ ForwardRefVals[Name] = std::make_pair(FwdVal, Loc);
+ return FwdVal;
+}
+
+GlobalValue *LLParser::GetGlobalVal(unsigned ID, const Type *Ty, LocTy Loc) {
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ if (PTy == 0) {
+ Error(Loc, "global variable reference must have pointer type");
+ return 0;
+ }
+
+ GlobalValue *Val = ID < NumberedVals.size() ? NumberedVals[ID] : 0;
+
+ // If this is a forward reference for the value, see if we already created a
+ // forward ref record.
+ if (Val == 0) {
+ std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator
+ I = ForwardRefValIDs.find(ID);
+ if (I != ForwardRefValIDs.end())
+ Val = I->second.first;
+ }
+
+ // If we have the value in the symbol table or fwd-ref table, return it.
+ if (Val) {
+ if (Val->getType() == Ty) return Val;
+ Error(Loc, "'@" + utostr(ID) + "' defined with type '" +
+ Val->getType()->getDescription() + "'");
+ return 0;
+ }
+
+ // Otherwise, create a new forward reference for this value and remember it.
+ GlobalValue *FwdVal;
+ if (const FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType())) {
+ // Function types can return opaque but functions can't.
+ if (isa<OpaqueType>(FT->getReturnType())) {
+ Error(Loc, "function may not return opaque type");
+ return 0;
+ }
+ FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, "", M);
+ } else {
+ FwdVal = new GlobalVariable(PTy->getElementType(), false,
+ GlobalValue::ExternalWeakLinkage, 0, "", M);
+ }
+
+ ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc);
+ return FwdVal;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Helper Routines.
+//===----------------------------------------------------------------------===//
+
+/// ParseToken - If the current token has the specified kind, eat it and return
+/// success. Otherwise, emit the specified error and return failure.
+bool LLParser::ParseToken(lltok::Kind T, const char *ErrMsg) {
+ if (Lex.getKind() != T)
+ return TokError(ErrMsg);
+ Lex.Lex();
+ return false;
+}
+
+/// ParseStringConstant
+/// ::= StringConstant
+bool LLParser::ParseStringConstant(std::string &Result) {
+ if (Lex.getKind() != lltok::StringConstant)
+ return TokError("expected string constant");
+ Result = Lex.getStrVal();
+ Lex.Lex();
+ return false;
+}
+
+/// ParseUInt32
+/// ::= uint32
+bool LLParser::ParseUInt32(unsigned &Val) {
+ if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
+ return TokError("expected integer");
+ uint64_t Val64 = Lex.getAPSIntVal().getLimitedValue(0xFFFFFFFFULL+1);
+ if (Val64 != unsigned(Val64))
+ return TokError("expected 32-bit integer (too large)");
+ Val = Val64;
+ Lex.Lex();
+ return false;
+}
+
+
+/// ParseOptionalAddrSpace
+/// := /*empty*/
+/// := 'addrspace' '(' uint32 ')'
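+///
+/// For example (added commentary): "addrspace(1)" sets AddrSpace to 1;
+/// when absent, AddrSpace defaults to 0.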
+bool LLParser::ParseOptionalAddrSpace(unsigned &AddrSpace) {
+ AddrSpace = 0;
+ if (!EatIfPresent(lltok::kw_addrspace))
+ return false;
+ return ParseToken(lltok::lparen, "expected '(' in address space") ||
+ ParseUInt32(AddrSpace) ||
+ ParseToken(lltok::rparen, "expected ')' in address space");
+}
+
+/// ParseOptionalAttrs - Parse a potentially empty attribute list. AttrKind
+/// indicates what kind of attribute list this is: 0: function arg, 1: result,
+/// 2: function attr, 3: function arg after value (FIXME: REMOVE IN LLVM 3.0).
+bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) {
+ Attrs = Attribute::None;
+ LocTy AttrLoc = Lex.getLoc();
+
+ while (1) {
+ switch (Lex.getKind()) {
+ case lltok::kw_sext:
+ case lltok::kw_zext:
+ // Treat these as signext/zeroext if they occur in the argument list after
+ // the value, as in "call i8 @foo(i8 10 sext)". If they occur before the
+ // value, as in "call i8 @foo(i8 sext (" then it is part of a constant
+ // expr.
+ // FIXME: REMOVE THIS IN LLVM 3.0
+ if (AttrKind == 3) {
+ if (Lex.getKind() == lltok::kw_sext)
+ Attrs |= Attribute::SExt;
+ else
+ Attrs |= Attribute::ZExt;
+ break;
+ }
+ // FALL THROUGH.
+ default: // End of attributes.
+ if (AttrKind != 2 && (Attrs & Attribute::FunctionOnly))
+ return Error(AttrLoc, "invalid use of function-only attribute");
+
+ if (AttrKind != 0 && AttrKind != 3 && (Attrs & Attribute::ParameterOnly))
+ return Error(AttrLoc, "invalid use of parameter-only attribute");
+
+ return false;
+ case lltok::kw_zeroext: Attrs |= Attribute::ZExt; break;
+ case lltok::kw_signext: Attrs |= Attribute::SExt; break;
+ case lltok::kw_inreg: Attrs |= Attribute::InReg; break;
+ case lltok::kw_sret: Attrs |= Attribute::StructRet; break;
+ case lltok::kw_noalias: Attrs |= Attribute::NoAlias; break;
+ case lltok::kw_nocapture: Attrs |= Attribute::NoCapture; break;
+ case lltok::kw_byval: Attrs |= Attribute::ByVal; break;
+ case lltok::kw_nest: Attrs |= Attribute::Nest; break;
+
+ case lltok::kw_noreturn: Attrs |= Attribute::NoReturn; break;
+ case lltok::kw_nounwind: Attrs |= Attribute::NoUnwind; break;
+ case lltok::kw_noinline: Attrs |= Attribute::NoInline; break;
+ case lltok::kw_readnone: Attrs |= Attribute::ReadNone; break;
+ case lltok::kw_readonly: Attrs |= Attribute::ReadOnly; break;
+ case lltok::kw_alwaysinline: Attrs |= Attribute::AlwaysInline; break;
+ case lltok::kw_optsize: Attrs |= Attribute::OptimizeForSize; break;
+ case lltok::kw_ssp: Attrs |= Attribute::StackProtect; break;
+ case lltok::kw_sspreq: Attrs |= Attribute::StackProtectReq; break;
+
+
+ case lltok::kw_align: {
+ unsigned Alignment;
+ if (ParseOptionalAlignment(Alignment))
+ return true;
+ Attrs |= Attribute::constructAlignmentFromInt(Alignment);
+ continue;
+ }
+ }
+ Lex.Lex();
+ }
+}
+
+/// ParseOptionalLinkage
+/// ::= /*empty*/
+/// ::= 'private'
+/// ::= 'internal'
+/// ::= 'weak'
+/// ::= 'weak_odr'
+/// ::= 'linkonce'
+/// ::= 'linkonce_odr'
+/// ::= 'appending'
+/// ::= 'dllexport'
+/// ::= 'common'
+/// ::= 'dllimport'
+/// ::= 'extern_weak'
+/// ::= 'external'
+bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
+ HasLinkage = false;
+ switch (Lex.getKind()) {
+ default: Res = GlobalValue::ExternalLinkage; return false;
+ case lltok::kw_private: Res = GlobalValue::PrivateLinkage; break;
+ case lltok::kw_internal: Res = GlobalValue::InternalLinkage; break;
+ case lltok::kw_weak: Res = GlobalValue::WeakAnyLinkage; break;
+ case lltok::kw_weak_odr: Res = GlobalValue::WeakODRLinkage; break;
+ case lltok::kw_linkonce: Res = GlobalValue::LinkOnceAnyLinkage; break;
+ case lltok::kw_linkonce_odr: Res = GlobalValue::LinkOnceODRLinkage; break;
+ case lltok::kw_available_externally:
+ Res = GlobalValue::AvailableExternallyLinkage;
+ break;
+ case lltok::kw_appending: Res = GlobalValue::AppendingLinkage; break;
+ case lltok::kw_dllexport: Res = GlobalValue::DLLExportLinkage; break;
+ case lltok::kw_common: Res = GlobalValue::CommonLinkage; break;
+ case lltok::kw_dllimport: Res = GlobalValue::DLLImportLinkage; break;
+ case lltok::kw_extern_weak: Res = GlobalValue::ExternalWeakLinkage; break;
+ case lltok::kw_external: Res = GlobalValue::ExternalLinkage; break;
+ }
+ Lex.Lex();
+ HasLinkage = true;
+ return false;
+}
+
+/// ParseOptionalVisibility
+/// ::= /*empty*/
+/// ::= 'default'
+/// ::= 'hidden'
+/// ::= 'protected'
+///
+bool LLParser::ParseOptionalVisibility(unsigned &Res) {
+ switch (Lex.getKind()) {
+ default: Res = GlobalValue::DefaultVisibility; return false;
+ case lltok::kw_default: Res = GlobalValue::DefaultVisibility; break;
+ case lltok::kw_hidden: Res = GlobalValue::HiddenVisibility; break;
+ case lltok::kw_protected: Res = GlobalValue::ProtectedVisibility; break;
+ }
+ Lex.Lex();
+ return false;
+}
+
+/// ParseOptionalCallingConv
+/// ::= /*empty*/
+/// ::= 'ccc'
+/// ::= 'fastcc'
+/// ::= 'coldcc'
+/// ::= 'x86_stdcallcc'
+/// ::= 'x86_fastcallcc'
+/// ::= 'cc' UINT
+///
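+/// For example (added commentary): "fastcc" selects CallingConv::Fast, and
+/// "cc 10" selects the numbered convention 10 via ParseUInt32.
+///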
+bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
+ switch (Lex.getKind()) {
+ default: CC = CallingConv::C; return false;
+ case lltok::kw_ccc: CC = CallingConv::C; break;
+ case lltok::kw_fastcc: CC = CallingConv::Fast; break;
+ case lltok::kw_coldcc: CC = CallingConv::Cold; break;
+ case lltok::kw_x86_stdcallcc: CC = CallingConv::X86_StdCall; break;
+ case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break;
+ case lltok::kw_cc: Lex.Lex(); return ParseUInt32(CC);
+ }
+ Lex.Lex();
+ return false;
+}
+
+/// ParseOptionalAlignment
+/// ::= /* empty */
+/// ::= 'align' 4
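+///
+/// For example (added commentary): "align 8" yields Alignment == 8, while a
+/// non-power-of-two operand such as "align 6" is rejected below.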
+bool LLParser::ParseOptionalAlignment(unsigned &Alignment) {
+ Alignment = 0;
+ if (!EatIfPresent(lltok::kw_align))
+ return false;
+ LocTy AlignLoc = Lex.getLoc();
+ if (ParseUInt32(Alignment)) return true;
+ if (!isPowerOf2_32(Alignment))
+ return Error(AlignLoc, "alignment is not a power of two");
+ return false;
+}
+
+/// ParseOptionalCommaAlignment
+/// ::= /* empty */
+/// ::= ',' 'align' 4
+bool LLParser::ParseOptionalCommaAlignment(unsigned &Alignment) {
+ Alignment = 0;
+ if (!EatIfPresent(lltok::comma))
+ return false;
+ return ParseToken(lltok::kw_align, "expected 'align'") ||
+ ParseUInt32(Alignment);
+}
+
+/// ParseIndexList
+/// ::= (',' uint32)+
+bool LLParser::ParseIndexList(SmallVectorImpl<unsigned> &Indices) {
+ if (Lex.getKind() != lltok::comma)
+ return TokError("expected ',' as start of index list");
+
+ while (EatIfPresent(lltok::comma)) {
+ unsigned Idx;
+ if (ParseUInt32(Idx)) return true;
+ Indices.push_back(Idx);
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Type Parsing.
+//===----------------------------------------------------------------------===//
+
+/// ParseType - Parse and resolve a full type.
+bool LLParser::ParseType(PATypeHolder &Result, bool AllowVoid) {
+ LocTy TypeLoc = Lex.getLoc();
+ if (ParseTypeRec(Result)) return true;
+
+ // Verify no unresolved uprefs.
+ if (!UpRefs.empty())
+ return Error(UpRefs.back().Loc, "invalid unresolved type up reference");
+
+ if (!AllowVoid && Result.get() == Type::VoidTy)
+ return Error(TypeLoc, "void type only allowed for function results");
+
+ return false;
+}
+
+/// HandleUpRefs - Every time we finish a new layer of types, this function is
+/// called. It loops through the UpRefs vector, which is a list of the
+/// currently active types. For each type, if the up-reference is contained in
+/// the newly completed type, we decrement the level count. When the level
+/// count reaches zero, the up-referenced type is the type that is passed in:
+/// thus we can complete the cycle.
+///
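+/// For example (added commentary): in "{ \2* }" the up-reference \2 starts
+/// at nesting level 2; completing the pointer type drops it to 1 and
+/// completing the struct drops it to 0, so the opaque placeholder resolves
+/// to the struct itself, producing a recursive type.
+///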
+PATypeHolder LLParser::HandleUpRefs(const Type *ty) {
+ // If Ty isn't abstract, or if there are no up-references in it, then there is
+ // nothing to resolve here.
+ if (!ty->isAbstract() || UpRefs.empty()) return ty;
+
+ PATypeHolder Ty(ty);
+#if 0
+ errs() << "Type '" << Ty->getDescription()
+ << "' newly formed. Resolving upreferences.\n"
+ << UpRefs.size() << " upreferences active!\n";
+#endif
+
+ // If we find any resolvable upreferences (i.e., those whose NestingLevel goes
+ // to zero), we resolve them all together before we resolve them to Ty. At
+ // the end of the loop, if there is anything to resolve to Ty, it will be in
+ // this variable.
+ OpaqueType *TypeToResolve = 0;
+
+ for (unsigned i = 0; i != UpRefs.size(); ++i) {
+    // Determine if 'Ty' directly contains this up-reference's
+    // 'LastContainedTy'.
+ bool ContainsType =
+ std::find(Ty->subtype_begin(), Ty->subtype_end(),
+ UpRefs[i].LastContainedTy) != Ty->subtype_end();
+
+#if 0
+ errs() << " UR#" << i << " - TypeContains(" << Ty->getDescription() << ", "
+ << UpRefs[i].LastContainedTy->getDescription() << ") = "
+ << (ContainsType ? "true" : "false")
+ << " level=" << UpRefs[i].NestingLevel << "\n";
+#endif
+ if (!ContainsType)
+ continue;
+
+ // Decrement level of upreference
+ unsigned Level = --UpRefs[i].NestingLevel;
+ UpRefs[i].LastContainedTy = Ty;
+
+ // If the Up-reference has a non-zero level, it shouldn't be resolved yet.
+ if (Level != 0)
+ continue;
+
+#if 0
+ errs() << " * Resolving upreference for " << UpRefs[i].UpRefTy << "\n";
+#endif
+ if (!TypeToResolve)
+ TypeToResolve = UpRefs[i].UpRefTy;
+ else
+ UpRefs[i].UpRefTy->refineAbstractTypeTo(TypeToResolve);
+ UpRefs.erase(UpRefs.begin()+i); // Remove from upreference list.
+ --i; // Do not skip the next element.
+ }
+
+ if (TypeToResolve)
+ TypeToResolve->refineAbstractTypeTo(Ty);
+
+ return Ty;
+}
+
+
+/// ParseTypeRec - The recursive function used to process the internal
+/// implementation details of types.
+bool LLParser::ParseTypeRec(PATypeHolder &Result) {
+ switch (Lex.getKind()) {
+ default:
+ return TokError("expected type");
+ case lltok::Type:
+ // TypeRec ::= 'float' | 'void' (etc)
+ Result = Lex.getTyVal();
+ Lex.Lex();
+ break;
+ case lltok::kw_opaque:
+ // TypeRec ::= 'opaque'
+ Result = OpaqueType::get();
+ Lex.Lex();
+ break;
+ case lltok::lbrace:
+ // TypeRec ::= '{' ... '}'
+ if (ParseStructType(Result, false))
+ return true;
+ break;
+ case lltok::lsquare:
+ // TypeRec ::= '[' ... ']'
+ Lex.Lex(); // eat the lsquare.
+ if (ParseArrayVectorType(Result, false))
+ return true;
+ break;
+ case lltok::less: // Either vector or packed struct.
+ // TypeRec ::= '<' ... '>'
+ Lex.Lex();
+ if (Lex.getKind() == lltok::lbrace) {
+ if (ParseStructType(Result, true) ||
+ ParseToken(lltok::greater, "expected '>' at end of packed struct"))
+ return true;
+ } else if (ParseArrayVectorType(Result, true))
+ return true;
+ break;
+ case lltok::LocalVar:
+ case lltok::StringConstant: // FIXME: REMOVE IN LLVM 3.0
+ // TypeRec ::= %foo
+ if (const Type *T = M->getTypeByName(Lex.getStrVal())) {
+ Result = T;
+ } else {
+ Result = OpaqueType::get();
+ ForwardRefTypes.insert(std::make_pair(Lex.getStrVal(),
+ std::make_pair(Result,
+ Lex.getLoc())));
+ M->addTypeName(Lex.getStrVal(), Result.get());
+ }
+ Lex.Lex();
+ break;
+
+ case lltok::LocalVarID:
+ // TypeRec ::= %4
+ if (Lex.getUIntVal() < NumberedTypes.size())
+ Result = NumberedTypes[Lex.getUIntVal()];
+ else {
+ std::map<unsigned, std::pair<PATypeHolder, LocTy> >::iterator
+ I = ForwardRefTypeIDs.find(Lex.getUIntVal());
+ if (I != ForwardRefTypeIDs.end())
+ Result = I->second.first;
+ else {
+ Result = OpaqueType::get();
+ ForwardRefTypeIDs.insert(std::make_pair(Lex.getUIntVal(),
+ std::make_pair(Result,
+ Lex.getLoc())));
+ }
+ }
+ Lex.Lex();
+ break;
+ case lltok::backslash: {
+ // TypeRec ::= '\' 4
+ Lex.Lex();
+ unsigned Val;
+ if (ParseUInt32(Val)) return true;
+ OpaqueType *OT = OpaqueType::get(); // Use temporary placeholder.
+ UpRefs.push_back(UpRefRecord(Lex.getLoc(), Val, OT));
+ Result = OT;
+ break;
+ }
+ }
+
+ // Parse the type suffixes.
+ while (1) {
+ switch (Lex.getKind()) {
+ // End of type.
+ default: return false;
+
+ // TypeRec ::= TypeRec '*'
+ case lltok::star:
+ if (Result.get() == Type::LabelTy)
+ return TokError("basic block pointers are invalid");
+ if (Result.get() == Type::VoidTy)
+ return TokError("pointers to void are invalid; use i8* instead");
+ Result = HandleUpRefs(PointerType::getUnqual(Result.get()));
+ Lex.Lex();
+ break;
+
+ // TypeRec ::= TypeRec 'addrspace' '(' uint32 ')' '*'
+ case lltok::kw_addrspace: {
+ if (Result.get() == Type::LabelTy)
+ return TokError("basic block pointers are invalid");
+ if (Result.get() == Type::VoidTy)
+ return TokError("pointers to void are invalid; use i8* instead");
+ unsigned AddrSpace;
+ if (ParseOptionalAddrSpace(AddrSpace) ||
+ ParseToken(lltok::star, "expected '*' in address space"))
+ return true;
+
+ Result = HandleUpRefs(PointerType::get(Result.get(), AddrSpace));
+ break;
+ }
+
+ /// Types '(' ArgTypeListI ')' OptFuncAttrs
+ case lltok::lparen:
+ if (ParseFunctionType(Result))
+ return true;
+ break;
+ }
+ }
+}
+
+/// ParseParameterList
+/// ::= '(' ')'
+/// ::= '(' Arg (',' Arg)* ')'
+/// Arg
+/// ::= Type OptionalAttributes Value OptionalAttributes
+bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
+ PerFunctionState &PFS) {
+ if (ParseToken(lltok::lparen, "expected '(' in call"))
+ return true;
+
+ while (Lex.getKind() != lltok::rparen) {
+ // If this isn't the first argument, we need a comma.
+ if (!ArgList.empty() &&
+ ParseToken(lltok::comma, "expected ',' in argument list"))
+ return true;
+
+ // Parse the argument.
+ LocTy ArgLoc;
+ PATypeHolder ArgTy(Type::VoidTy);
+ unsigned ArgAttrs1, ArgAttrs2;
+ Value *V;
+ if (ParseType(ArgTy, ArgLoc) ||
+ ParseOptionalAttrs(ArgAttrs1, 0) ||
+ ParseValue(ArgTy, V, PFS) ||
+ // FIXME: Should not allow attributes after the argument, remove this in
+ // LLVM 3.0.
+ ParseOptionalAttrs(ArgAttrs2, 3))
+ return true;
+ ArgList.push_back(ParamInfo(ArgLoc, V, ArgAttrs1|ArgAttrs2));
+ }
+
+ Lex.Lex(); // Lex the ')'.
+ return false;
+}
+
+
+
+/// ParseArgumentList - Parse the argument list for a function type or function
+/// prototype. If 'inType' is true then we are parsing a FunctionType.
+/// ::= '(' ArgTypeListI ')'
+/// ArgTypeListI
+/// ::= /*empty*/
+/// ::= '...'
+/// ::= ArgTypeList ',' '...'
+/// ::= ArgType (',' ArgType)*
+///
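+/// For example (added commentary): "(i32, i8* nocapture, ...)" declares two
+/// typed arguments, an attribute on the second, and a vararg ellipsis.
+///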
+bool LLParser::ParseArgumentList(std::vector<ArgInfo> &ArgList,
+ bool &isVarArg, bool inType) {
+ isVarArg = false;
+ assert(Lex.getKind() == lltok::lparen);
+ Lex.Lex(); // eat the (.
+
+ if (Lex.getKind() == lltok::rparen) {
+ // empty
+ } else if (Lex.getKind() == lltok::dotdotdot) {
+ isVarArg = true;
+ Lex.Lex();
+ } else {
+ LocTy TypeLoc = Lex.getLoc();
+ PATypeHolder ArgTy(Type::VoidTy);
+ unsigned Attrs;
+ std::string Name;
+
+ // If we're parsing a type, use ParseTypeRec, because we allow recursive
+ // types (such as a function returning a pointer to itself). If parsing a
+ // function prototype, we require fully resolved types.
+ if ((inType ? ParseTypeRec(ArgTy) : ParseType(ArgTy)) ||
+ ParseOptionalAttrs(Attrs, 0)) return true;
+
+ if (ArgTy == Type::VoidTy)
+      return Error(TypeLoc, "argument cannot have void type");
+
+ if (Lex.getKind() == lltok::LocalVar ||
+ Lex.getKind() == lltok::StringConstant) { // FIXME: REMOVE IN LLVM 3.0
+ Name = Lex.getStrVal();
+ Lex.Lex();
+ }
+
+ if (!ArgTy->isFirstClassType() && !isa<OpaqueType>(ArgTy))
+ return Error(TypeLoc, "invalid type for function argument");
+
+ ArgList.push_back(ArgInfo(TypeLoc, ArgTy, Attrs, Name));
+
+ while (EatIfPresent(lltok::comma)) {
+ // Handle ... at end of arg list.
+ if (EatIfPresent(lltok::dotdotdot)) {
+ isVarArg = true;
+ break;
+ }
+
+ // Otherwise must be an argument type.
+ TypeLoc = Lex.getLoc();
+ if ((inType ? ParseTypeRec(ArgTy) : ParseType(ArgTy)) ||
+ ParseOptionalAttrs(Attrs, 0)) return true;
+
+ if (ArgTy == Type::VoidTy)
+      return Error(TypeLoc, "argument cannot have void type");
+
+ if (Lex.getKind() == lltok::LocalVar ||
+ Lex.getKind() == lltok::StringConstant) { // FIXME: REMOVE IN LLVM 3.0
+ Name = Lex.getStrVal();
+ Lex.Lex();
+ } else {
+ Name = "";
+ }
+
+ if (!ArgTy->isFirstClassType() && !isa<OpaqueType>(ArgTy))
+ return Error(TypeLoc, "invalid type for function argument");
+
+ ArgList.push_back(ArgInfo(TypeLoc, ArgTy, Attrs, Name));
+ }
+ }
+
+ return ParseToken(lltok::rparen, "expected ')' at end of argument list");
+}
+
+/// ParseFunctionType
+/// ::= Type ArgumentList OptionalAttrs
+bool LLParser::ParseFunctionType(PATypeHolder &Result) {
+ assert(Lex.getKind() == lltok::lparen);
+
+ if (!FunctionType::isValidReturnType(Result))
+ return TokError("invalid function return type");
+
+ std::vector<ArgInfo> ArgList;
+ bool isVarArg;
+ unsigned Attrs;
+ if (ParseArgumentList(ArgList, isVarArg, true) ||
+ // FIXME: Allow, but ignore attributes on function types!
+ // FIXME: Remove in LLVM 3.0
+ ParseOptionalAttrs(Attrs, 2))
+ return true;
+
+ // Reject names on the arguments lists.
+ for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
+ if (!ArgList[i].Name.empty())
+ return Error(ArgList[i].Loc, "argument name invalid in function type");
+    if (ArgList[i].Attrs != 0) {
+ // Allow but ignore attributes on function types; this permits
+ // auto-upgrade.
+ // FIXME: REJECT ATTRIBUTES ON FUNCTION TYPES in LLVM 3.0
+ }
+ }
+
+ std::vector<const Type*> ArgListTy;
+ for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
+ ArgListTy.push_back(ArgList[i].Type);
+
+ Result = HandleUpRefs(FunctionType::get(Result.get(), ArgListTy, isVarArg));
+ return false;
+}
+
+/// ParseStructType: Handles packed and unpacked types. </> parsed elsewhere.
+/// TypeRec
+/// ::= '{' '}'
+/// ::= '{' TypeRec (',' TypeRec)* '}'
+/// ::= '<' '{' '}' '>'
+/// ::= '<' '{' TypeRec (',' TypeRec)* '}' '>'
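+///
+/// For example (added commentary): "{ i32, float }" is a normal struct and
+/// "<{ i8, i32 }>" its packed variant with no inter-element padding.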
+bool LLParser::ParseStructType(PATypeHolder &Result, bool Packed) {
+ assert(Lex.getKind() == lltok::lbrace);
+ Lex.Lex(); // Consume the '{'
+
+ if (EatIfPresent(lltok::rbrace)) {
+ Result = StructType::get(std::vector<const Type*>(), Packed);
+ return false;
+ }
+
+ std::vector<PATypeHolder> ParamsList;
+ LocTy EltTyLoc = Lex.getLoc();
+ if (ParseTypeRec(Result)) return true;
+ ParamsList.push_back(Result);
+
+ if (Result == Type::VoidTy)
+    return Error(EltTyLoc, "struct element cannot have void type");
+
+ while (EatIfPresent(lltok::comma)) {
+ EltTyLoc = Lex.getLoc();
+ if (ParseTypeRec(Result)) return true;
+
+ if (Result == Type::VoidTy)
+      return Error(EltTyLoc, "struct element cannot have void type");
+
+ ParamsList.push_back(Result);
+ }
+
+ if (ParseToken(lltok::rbrace, "expected '}' at end of struct"))
+ return true;
+
+ std::vector<const Type*> ParamsListTy;
+ for (unsigned i = 0, e = ParamsList.size(); i != e; ++i)
+ ParamsListTy.push_back(ParamsList[i].get());
+ Result = HandleUpRefs(StructType::get(ParamsListTy, Packed));
+ return false;
+}
+
+/// ParseArrayVectorType - Parse an array or vector type, assuming the first
+/// token has already been consumed.
+/// TypeRec
+/// ::= '[' APSINTVAL 'x' Types ']'
+/// ::= '<' APSINTVAL 'x' Types '>'
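+/// e.g. "[4 x i32]" is an array type and "<4 x float>" is a vector type.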
+bool LLParser::ParseArrayVectorType(PATypeHolder &Result, bool isVector) {
+ if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned() ||
+ Lex.getAPSIntVal().getBitWidth() > 64)
+ return TokError("expected number in address space");
+
+ LocTy SizeLoc = Lex.getLoc();
+ uint64_t Size = Lex.getAPSIntVal().getZExtValue();
+ Lex.Lex();
+
+ if (ParseToken(lltok::kw_x, "expected 'x' after element count"))
+ return true;
+
+ LocTy TypeLoc = Lex.getLoc();
+ PATypeHolder EltTy(Type::VoidTy);
+ if (ParseTypeRec(EltTy)) return true;
+
+ if (EltTy == Type::VoidTy)
+ return Error(TypeLoc, "array and vector element type cannot be void");
+
+ if (ParseToken(isVector ? lltok::greater : lltok::rsquare,
+ "expected end of sequential type"))
+ return true;
+
+ if (isVector) {
+ if (Size == 0)
+ return Error(SizeLoc, "zero element vector is illegal");
+ if ((unsigned)Size != Size)
+ return Error(SizeLoc, "size too large for vector");
+ if (!EltTy->isFloatingPoint() && !EltTy->isInteger())
+ return Error(TypeLoc, "vector element type must be fp or integer");
+ Result = VectorType::get(EltTy, unsigned(Size));
+ } else {
+ if (!EltTy->isFirstClassType() && !isa<OpaqueType>(EltTy))
+ return Error(TypeLoc, "invalid array element type");
+ Result = HandleUpRefs(ArrayType::get(EltTy, Size));
+ }
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Function Semantic Analysis.
+//===----------------------------------------------------------------------===//
+
+LLParser::PerFunctionState::PerFunctionState(LLParser &p, Function &f)
+ : P(p), F(f) {
+
+ // Insert unnamed arguments into the NumberedVals list.
+ for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end();
+ AI != E; ++AI)
+ if (!AI->hasName())
+ NumberedVals.push_back(AI);
+}
+
+LLParser::PerFunctionState::~PerFunctionState() {
+ // If there were any forward referenced non-basicblock values, delete them.
+ for (std::map<std::string, std::pair<Value*, LocTy> >::iterator
+ I = ForwardRefVals.begin(), E = ForwardRefVals.end(); I != E; ++I)
+ if (!isa<BasicBlock>(I->second.first)) {
+ I->second.first->replaceAllUsesWith(UndefValue::get(I->second.first
+ ->getType()));
+ delete I->second.first;
+ I->second.first = 0;
+ }
+
+ for (std::map<unsigned, std::pair<Value*, LocTy> >::iterator
+ I = ForwardRefValIDs.begin(), E = ForwardRefValIDs.end(); I != E; ++I)
+ if (!isa<BasicBlock>(I->second.first)) {
+ I->second.first->replaceAllUsesWith(UndefValue::get(I->second.first
+ ->getType()));
+ delete I->second.first;
+ I->second.first = 0;
+ }
+}
+
+bool LLParser::PerFunctionState::VerifyFunctionComplete() {
+ if (!ForwardRefVals.empty())
+ return P.Error(ForwardRefVals.begin()->second.second,
+ "use of undefined value '%" + ForwardRefVals.begin()->first +
+ "'");
+ if (!ForwardRefValIDs.empty())
+ return P.Error(ForwardRefValIDs.begin()->second.second,
+ "use of undefined value '%" +
+ utostr(ForwardRefValIDs.begin()->first) + "'");
+ return false;
+}
+
+
+/// GetVal - Get a value with the specified name or ID, creating a
+/// forward reference record if needed. This can return null if the value
+/// exists but does not have the right type.
+Value *LLParser::PerFunctionState::GetVal(const std::string &Name,
+ const Type *Ty, LocTy Loc) {
+ // Look this name up in the normal function symbol table.
+ Value *Val = F.getValueSymbolTable().lookup(Name);
+
+ // If this is a forward reference for the value, see if we already created a
+ // forward ref record.
+ if (Val == 0) {
+ std::map<std::string, std::pair<Value*, LocTy> >::iterator
+ I = ForwardRefVals.find(Name);
+ if (I != ForwardRefVals.end())
+ Val = I->second.first;
+ }
+
+ // If we have the value in the symbol table or fwd-ref table, return it.
+ if (Val) {
+ if (Val->getType() == Ty) return Val;
+ if (Ty == Type::LabelTy)
+ P.Error(Loc, "'%" + Name + "' is not a basic block");
+ else
+ P.Error(Loc, "'%" + Name + "' defined with type '" +
+ Val->getType()->getDescription() + "'");
+ return 0;
+ }
+
+ // Don't make placeholders with invalid type.
+ if (!Ty->isFirstClassType() && !isa<OpaqueType>(Ty) && Ty != Type::LabelTy) {
+ P.Error(Loc, "invalid use of a non-first-class type");
+ return 0;
+ }
+
+ // Otherwise, create a new forward reference for this value and remember it.
+ Value *FwdVal;
+ if (Ty == Type::LabelTy)
+ FwdVal = BasicBlock::Create(Name, &F);
+ else
+ FwdVal = new Argument(Ty, Name);
+
+ ForwardRefVals[Name] = std::make_pair(FwdVal, Loc);
+ return FwdVal;
+}
+
+Value *LLParser::PerFunctionState::GetVal(unsigned ID, const Type *Ty,
+ LocTy Loc) {
+ // Look this name up in the normal function symbol table.
+ Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : 0;
+
+ // If this is a forward reference for the value, see if we already created a
+ // forward ref record.
+ if (Val == 0) {
+ std::map<unsigned, std::pair<Value*, LocTy> >::iterator
+ I = ForwardRefValIDs.find(ID);
+ if (I != ForwardRefValIDs.end())
+ Val = I->second.first;
+ }
+
+ // If we have the value in the symbol table or fwd-ref table, return it.
+ if (Val) {
+ if (Val->getType() == Ty) return Val;
+ if (Ty == Type::LabelTy)
+ P.Error(Loc, "'%" + utostr(ID) + "' is not a basic block");
+ else
+ P.Error(Loc, "'%" + utostr(ID) + "' defined with type '" +
+ Val->getType()->getDescription() + "'");
+ return 0;
+ }
+
+ if (!Ty->isFirstClassType() && !isa<OpaqueType>(Ty) && Ty != Type::LabelTy) {
+ P.Error(Loc, "invalid use of a non-first-class type");
+ return 0;
+ }
+
+ // Otherwise, create a new forward reference for this value and remember it.
+ Value *FwdVal;
+ if (Ty == Type::LabelTy)
+ FwdVal = BasicBlock::Create("", &F);
+ else
+ FwdVal = new Argument(Ty);
+
+ ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc);
+ return FwdVal;
+}
+
+/// SetInstName - After an instruction is parsed and inserted into its
+/// basic block, this installs its name.
+bool LLParser::PerFunctionState::SetInstName(int NameID,
+ const std::string &NameStr,
+ LocTy NameLoc, Instruction *Inst) {
+ // If this instruction has void type, it cannot have a name or ID specified.
+ if (Inst->getType() == Type::VoidTy) {
+ if (NameID != -1 || !NameStr.empty())
+ return P.Error(NameLoc, "instructions returning void cannot have a name");
+ return false;
+ }
+
+ // If this was a numbered instruction, verify that the instruction is the
+ // expected value and resolve any forward references.
+ if (NameStr.empty()) {
+ // If neither a name nor an ID was specified, just use the next ID.
+ if (NameID == -1)
+ NameID = NumberedVals.size();
+
+ if (unsigned(NameID) != NumberedVals.size())
+ return P.Error(NameLoc, "instruction expected to be numbered '%" +
+ utostr(NumberedVals.size()) + "'");
+
+ std::map<unsigned, std::pair<Value*, LocTy> >::iterator FI =
+ ForwardRefValIDs.find(NameID);
+ if (FI != ForwardRefValIDs.end()) {
+ if (FI->second.first->getType() != Inst->getType())
+ return P.Error(NameLoc, "instruction forward referenced with type '" +
+ FI->second.first->getType()->getDescription() + "'");
+ FI->second.first->replaceAllUsesWith(Inst);
+ ForwardRefValIDs.erase(FI);
+ }
+
+ NumberedVals.push_back(Inst);
+ return false;
+ }
+
+ // Otherwise, the instruction had a name. Resolve forward refs and set it.
+ std::map<std::string, std::pair<Value*, LocTy> >::iterator
+ FI = ForwardRefVals.find(NameStr);
+ if (FI != ForwardRefVals.end()) {
+ if (FI->second.first->getType() != Inst->getType())
+ return P.Error(NameLoc, "instruction forward referenced with type '" +
+ FI->second.first->getType()->getDescription() + "'");
+ FI->second.first->replaceAllUsesWith(Inst);
+ ForwardRefVals.erase(FI);
+ }
+
+ // Set the name on the instruction.
+ Inst->setName(NameStr);
+
+ if (Inst->getNameStr() != NameStr)
+ return P.Error(NameLoc, "multiple definition of local value named '" +
+ NameStr + "'");
+ return false;
+}
+
+/// GetBB - Get a basic block with the specified name or ID, creating a
+/// forward reference record if needed.
+BasicBlock *LLParser::PerFunctionState::GetBB(const std::string &Name,
+ LocTy Loc) {
+ return cast_or_null<BasicBlock>(GetVal(Name, Type::LabelTy, Loc));
+}
+
+BasicBlock *LLParser::PerFunctionState::GetBB(unsigned ID, LocTy Loc) {
+ return cast_or_null<BasicBlock>(GetVal(ID, Type::LabelTy, Loc));
+}
+
+/// DefineBB - Define the specified basic block, which is either named or
+/// unnamed. If there is an error, this returns null; otherwise it returns
+/// the block being defined.
+BasicBlock *LLParser::PerFunctionState::DefineBB(const std::string &Name,
+ LocTy Loc) {
+ BasicBlock *BB;
+ if (Name.empty())
+ BB = GetBB(NumberedVals.size(), Loc);
+ else
+ BB = GetBB(Name, Loc);
+ if (BB == 0) return 0; // Already diagnosed error.
+
+ // Move the block to the end of the function. Forward ref'd blocks are
+ // inserted wherever they happen to be referenced.
+ F.getBasicBlockList().splice(F.end(), F.getBasicBlockList(), BB);
+
+ // Remove the block from forward ref sets.
+ if (Name.empty()) {
+ ForwardRefValIDs.erase(NumberedVals.size());
+ NumberedVals.push_back(BB);
+ } else {
+ // BB forward references are already in the function symbol table.
+ ForwardRefVals.erase(Name);
+ }
+
+ return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Constants.
+//===----------------------------------------------------------------------===//
+
+/// ParseValID - Parse an abstract value that doesn't necessarily have a
+/// type implied. For example, if we parse "4" we don't know what integer type
+/// it has. The value will later be combined with its type and checked for
+/// sanity.
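+/// e.g. "42", "3.5", "null", and "@foo" are all ValIDs.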
+bool LLParser::ParseValID(ValID &ID) {
+ ID.Loc = Lex.getLoc();
+ switch (Lex.getKind()) {
+ default: return TokError("expected value token");
+ case lltok::GlobalID: // @42
+ ID.UIntVal = Lex.getUIntVal();
+ ID.Kind = ValID::t_GlobalID;
+ break;
+ case lltok::GlobalVar: // @foo
+ ID.StrVal = Lex.getStrVal();
+ ID.Kind = ValID::t_GlobalName;
+ break;
+ case lltok::LocalVarID: // %42
+ ID.UIntVal = Lex.getUIntVal();
+ ID.Kind = ValID::t_LocalID;
+ break;
+ case lltok::LocalVar: // %foo
+ case lltok::StringConstant: // "foo" - FIXME: REMOVE IN LLVM 3.0
+ ID.StrVal = Lex.getStrVal();
+ ID.Kind = ValID::t_LocalName;
+ break;
+ case lltok::Metadata: { // !{...} MDNode, !"foo" MDString
+ ID.Kind = ValID::t_Constant;
+ Lex.Lex();
+ if (Lex.getKind() == lltok::lbrace) {
+ SmallVector<Value*, 16> Elts;
+ if (ParseMDNodeVector(Elts) ||
+ ParseToken(lltok::rbrace, "expected end of metadata node"))
+ return true;
+
+ ID.ConstantVal = MDNode::get(Elts.data(), Elts.size());
+ return false;
+ }
+
+ // MDString:
+ // ::= '!' STRINGCONSTANT
+ std::string Str;
+ if (ParseStringConstant(Str)) return true;
+
+ ID.ConstantVal = MDString::get(Str.data(), Str.data() + Str.size());
+ return false;
+ }
+ case lltok::APSInt:
+ ID.APSIntVal = Lex.getAPSIntVal();
+ ID.Kind = ValID::t_APSInt;
+ break;
+ case lltok::APFloat:
+ ID.APFloatVal = Lex.getAPFloatVal();
+ ID.Kind = ValID::t_APFloat;
+ break;
+ case lltok::kw_true:
+ ID.ConstantVal = ConstantInt::getTrue();
+ ID.Kind = ValID::t_Constant;
+ break;
+ case lltok::kw_false:
+ ID.ConstantVal = ConstantInt::getFalse();
+ ID.Kind = ValID::t_Constant;
+ break;
+ case lltok::kw_null: ID.Kind = ValID::t_Null; break;
+ case lltok::kw_undef: ID.Kind = ValID::t_Undef; break;
+ case lltok::kw_zeroinitializer: ID.Kind = ValID::t_Zero; break;
+
+ case lltok::lbrace: {
+ // ValID ::= '{' ConstVector '}'
+ Lex.Lex();
+ SmallVector<Constant*, 16> Elts;
+ if (ParseGlobalValueVector(Elts) ||
+ ParseToken(lltok::rbrace, "expected end of struct constant"))
+ return true;
+
+ ID.ConstantVal = ConstantStruct::get(Elts.data(), Elts.size(), false);
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+ case lltok::less: {
+ // ValID ::= '<' ConstVector '>' --> Vector.
+ // ValID ::= '<' '{' ConstVector '}' '>' --> Packed Struct.
+ Lex.Lex();
+ bool isPackedStruct = EatIfPresent(lltok::lbrace);
+
+ SmallVector<Constant*, 16> Elts;
+ LocTy FirstEltLoc = Lex.getLoc();
+ if (ParseGlobalValueVector(Elts) ||
+ (isPackedStruct &&
+ ParseToken(lltok::rbrace, "expected end of packed struct")) ||
+ ParseToken(lltok::greater, "expected end of constant"))
+ return true;
+
+ if (isPackedStruct) {
+ ID.ConstantVal = ConstantStruct::get(Elts.data(), Elts.size(), true);
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+
+ if (Elts.empty())
+ return Error(ID.Loc, "constant vector must not be empty");
+
+ if (!Elts[0]->getType()->isInteger() &&
+ !Elts[0]->getType()->isFloatingPoint())
+ return Error(FirstEltLoc,
+ "vector elements must have integer or floating point type");
+
+ // Verify that all the vector elements have the same type.
+    for (unsigned i = 1, e = Elts.size(); i != e; ++i)
+      if (Elts[i]->getType() != Elts[0]->getType())
+        return Error(FirstEltLoc,
+                     "vector element #" + utostr(i) +
+                     " is not of type '" +
+                     Elts[0]->getType()->getDescription() + "'");
+
+ ID.ConstantVal = ConstantVector::get(Elts.data(), Elts.size());
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+ case lltok::lsquare: { // Array Constant
+ Lex.Lex();
+ SmallVector<Constant*, 16> Elts;
+ LocTy FirstEltLoc = Lex.getLoc();
+ if (ParseGlobalValueVector(Elts) ||
+ ParseToken(lltok::rsquare, "expected end of array constant"))
+ return true;
+
+ // Handle empty element.
+ if (Elts.empty()) {
+ // Use undef instead of an array because it's inconvenient to determine
+ // the element type at this point, there being no elements to examine.
+ ID.Kind = ValID::t_EmptyArray;
+ return false;
+ }
+
+ if (!Elts[0]->getType()->isFirstClassType())
+ return Error(FirstEltLoc, "invalid array element type: " +
+ Elts[0]->getType()->getDescription());
+
+ ArrayType *ATy = ArrayType::get(Elts[0]->getType(), Elts.size());
+
+ // Verify all elements are correct type!
+    for (unsigned i = 0, e = Elts.size(); i != e; ++i) {
+      if (Elts[i]->getType() != Elts[0]->getType())
+        return Error(FirstEltLoc,
+                     "array element #" + utostr(i) +
+                     " is not of type '" +
+                     Elts[0]->getType()->getDescription() + "'");
+    }
+
+ ID.ConstantVal = ConstantArray::get(ATy, Elts.data(), Elts.size());
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+ case lltok::kw_c: // c "foo"
+ Lex.Lex();
+ ID.ConstantVal = ConstantArray::get(Lex.getStrVal(), false);
+ if (ParseToken(lltok::StringConstant, "expected string")) return true;
+ ID.Kind = ValID::t_Constant;
+ return false;
+
+ case lltok::kw_asm: {
+ // ValID ::= 'asm' SideEffect? STRINGCONSTANT ',' STRINGCONSTANT
+ bool HasSideEffect;
+ Lex.Lex();
+ if (ParseOptionalToken(lltok::kw_sideeffect, HasSideEffect) ||
+ ParseStringConstant(ID.StrVal) ||
+ ParseToken(lltok::comma, "expected comma in inline asm expression") ||
+ ParseToken(lltok::StringConstant, "expected constraint string"))
+ return true;
+ ID.StrVal2 = Lex.getStrVal();
+ ID.UIntVal = HasSideEffect;
+ ID.Kind = ValID::t_InlineAsm;
+ return false;
+ }
+
+ case lltok::kw_trunc:
+ case lltok::kw_zext:
+ case lltok::kw_sext:
+ case lltok::kw_fptrunc:
+ case lltok::kw_fpext:
+ case lltok::kw_bitcast:
+ case lltok::kw_uitofp:
+ case lltok::kw_sitofp:
+ case lltok::kw_fptoui:
+ case lltok::kw_fptosi:
+ case lltok::kw_inttoptr:
+ case lltok::kw_ptrtoint: {
+ unsigned Opc = Lex.getUIntVal();
+ PATypeHolder DestTy(Type::VoidTy);
+ Constant *SrcVal;
+ Lex.Lex();
+ if (ParseToken(lltok::lparen, "expected '(' after constantexpr cast") ||
+ ParseGlobalTypeAndValue(SrcVal) ||
+ ParseToken(lltok::kw_to, "expected 'to' int constantexpr cast") ||
+ ParseType(DestTy) ||
+ ParseToken(lltok::rparen, "expected ')' at end of constantexpr cast"))
+ return true;
+ if (!CastInst::castIsValid((Instruction::CastOps)Opc, SrcVal, DestTy))
+ return Error(ID.Loc, "invalid cast opcode for cast from '" +
+ SrcVal->getType()->getDescription() + "' to '" +
+ DestTy->getDescription() + "'");
+ ID.ConstantVal = ConstantExpr::getCast((Instruction::CastOps)Opc, SrcVal,
+ DestTy);
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+ case lltok::kw_extractvalue: {
+ Lex.Lex();
+ Constant *Val;
+ SmallVector<unsigned, 4> Indices;
+ if (ParseToken(lltok::lparen, "expected '(' in extractvalue constantexpr")||
+ ParseGlobalTypeAndValue(Val) ||
+ ParseIndexList(Indices) ||
+ ParseToken(lltok::rparen, "expected ')' in extractvalue constantexpr"))
+ return true;
+ if (!isa<StructType>(Val->getType()) && !isa<ArrayType>(Val->getType()))
+ return Error(ID.Loc, "extractvalue operand must be array or struct");
+ if (!ExtractValueInst::getIndexedType(Val->getType(), Indices.begin(),
+ Indices.end()))
+ return Error(ID.Loc, "invalid indices for extractvalue");
+ ID.ConstantVal =
+ ConstantExpr::getExtractValue(Val, Indices.data(), Indices.size());
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+ case lltok::kw_insertvalue: {
+ Lex.Lex();
+ Constant *Val0, *Val1;
+ SmallVector<unsigned, 4> Indices;
+ if (ParseToken(lltok::lparen, "expected '(' in insertvalue constantexpr")||
+ ParseGlobalTypeAndValue(Val0) ||
+ ParseToken(lltok::comma, "expected comma in insertvalue constantexpr")||
+ ParseGlobalTypeAndValue(Val1) ||
+ ParseIndexList(Indices) ||
+ ParseToken(lltok::rparen, "expected ')' in insertvalue constantexpr"))
+ return true;
+ if (!isa<StructType>(Val0->getType()) && !isa<ArrayType>(Val0->getType()))
+ return Error(ID.Loc, "extractvalue operand must be array or struct");
+ if (!ExtractValueInst::getIndexedType(Val0->getType(), Indices.begin(),
+ Indices.end()))
+ return Error(ID.Loc, "invalid indices for insertvalue");
+ ID.ConstantVal =
+ ConstantExpr::getInsertValue(Val0, Val1, Indices.data(), Indices.size());
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+ case lltok::kw_icmp:
+ case lltok::kw_fcmp:
+ case lltok::kw_vicmp:
+ case lltok::kw_vfcmp: {
+ unsigned PredVal, Opc = Lex.getUIntVal();
+ Constant *Val0, *Val1;
+ Lex.Lex();
+ if (ParseCmpPredicate(PredVal, Opc) ||
+ ParseToken(lltok::lparen, "expected '(' in compare constantexpr") ||
+ ParseGlobalTypeAndValue(Val0) ||
+ ParseToken(lltok::comma, "expected comma in compare constantexpr") ||
+ ParseGlobalTypeAndValue(Val1) ||
+ ParseToken(lltok::rparen, "expected ')' in compare constantexpr"))
+ return true;
+
+ if (Val0->getType() != Val1->getType())
+ return Error(ID.Loc, "compare operands must have the same type");
+
+ CmpInst::Predicate Pred = (CmpInst::Predicate)PredVal;
+
+ if (Opc == Instruction::FCmp) {
+ if (!Val0->getType()->isFPOrFPVector())
+ return Error(ID.Loc, "fcmp requires floating point operands");
+ ID.ConstantVal = ConstantExpr::getFCmp(Pred, Val0, Val1);
+ } else if (Opc == Instruction::ICmp) {
+ if (!Val0->getType()->isIntOrIntVector() &&
+ !isa<PointerType>(Val0->getType()))
+ return Error(ID.Loc, "icmp requires pointer or integer operands");
+ ID.ConstantVal = ConstantExpr::getICmp(Pred, Val0, Val1);
+ } else if (Opc == Instruction::VFCmp) {
+ // FIXME: REMOVE VFCMP Support
+ if (!Val0->getType()->isFPOrFPVector() ||
+ !isa<VectorType>(Val0->getType()))
+ return Error(ID.Loc, "vfcmp requires vector floating point operands");
+ ID.ConstantVal = ConstantExpr::getVFCmp(Pred, Val0, Val1);
+ } else if (Opc == Instruction::VICmp) {
+ // FIXME: REMOVE VICMP Support
+ if (!Val0->getType()->isIntOrIntVector() ||
+ !isa<VectorType>(Val0->getType()))
+ return Error(ID.Loc, "vicmp requires vector floating point operands");
+ ID.ConstantVal = ConstantExpr::getVICmp(Pred, Val0, Val1);
+ }
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+
+ // Binary Operators.
+ case lltok::kw_add:
+ case lltok::kw_sub:
+ case lltok::kw_mul:
+ case lltok::kw_udiv:
+ case lltok::kw_sdiv:
+ case lltok::kw_fdiv:
+ case lltok::kw_urem:
+ case lltok::kw_srem:
+ case lltok::kw_frem: {
+ unsigned Opc = Lex.getUIntVal();
+ Constant *Val0, *Val1;
+ Lex.Lex();
+ if (ParseToken(lltok::lparen, "expected '(' in binary constantexpr") ||
+ ParseGlobalTypeAndValue(Val0) ||
+ ParseToken(lltok::comma, "expected comma in binary constantexpr") ||
+ ParseGlobalTypeAndValue(Val1) ||
+ ParseToken(lltok::rparen, "expected ')' in binary constantexpr"))
+ return true;
+ if (Val0->getType() != Val1->getType())
+ return Error(ID.Loc, "operands of constexpr must have same type");
+ if (!Val0->getType()->isIntOrIntVector() &&
+ !Val0->getType()->isFPOrFPVector())
+ return Error(ID.Loc,"constexpr requires integer, fp, or vector operands");
+ ID.ConstantVal = ConstantExpr::get(Opc, Val0, Val1);
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+
+ // Logical Operations
+ case lltok::kw_shl:
+ case lltok::kw_lshr:
+ case lltok::kw_ashr:
+ case lltok::kw_and:
+ case lltok::kw_or:
+ case lltok::kw_xor: {
+ unsigned Opc = Lex.getUIntVal();
+ Constant *Val0, *Val1;
+ Lex.Lex();
+ if (ParseToken(lltok::lparen, "expected '(' in logical constantexpr") ||
+ ParseGlobalTypeAndValue(Val0) ||
+ ParseToken(lltok::comma, "expected comma in logical constantexpr") ||
+ ParseGlobalTypeAndValue(Val1) ||
+ ParseToken(lltok::rparen, "expected ')' in logical constantexpr"))
+ return true;
+ if (Val0->getType() != Val1->getType())
+ return Error(ID.Loc, "operands of constexpr must have same type");
+ if (!Val0->getType()->isIntOrIntVector())
+ return Error(ID.Loc,
+ "constexpr requires integer or integer vector operands");
+ ID.ConstantVal = ConstantExpr::get(Opc, Val0, Val1);
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+
+ case lltok::kw_getelementptr:
+ case lltok::kw_shufflevector:
+ case lltok::kw_insertelement:
+ case lltok::kw_extractelement:
+ case lltok::kw_select: {
+ unsigned Opc = Lex.getUIntVal();
+ SmallVector<Constant*, 16> Elts;
+ Lex.Lex();
+ if (ParseToken(lltok::lparen, "expected '(' in constantexpr") ||
+ ParseGlobalValueVector(Elts) ||
+ ParseToken(lltok::rparen, "expected ')' in constantexpr"))
+ return true;
+
+ if (Opc == Instruction::GetElementPtr) {
+ if (Elts.size() == 0 || !isa<PointerType>(Elts[0]->getType()))
+ return Error(ID.Loc, "getelementptr requires pointer operand");
+
+ if (!GetElementPtrInst::getIndexedType(Elts[0]->getType(),
+ (Value**)&Elts[1], Elts.size()-1))
+ return Error(ID.Loc, "invalid indices for getelementptr");
+ ID.ConstantVal = ConstantExpr::getGetElementPtr(Elts[0],
+ &Elts[1], Elts.size()-1);
+ } else if (Opc == Instruction::Select) {
+ if (Elts.size() != 3)
+ return Error(ID.Loc, "expected three operands to select");
+ if (const char *Reason = SelectInst::areInvalidOperands(Elts[0], Elts[1],
+ Elts[2]))
+ return Error(ID.Loc, Reason);
+ ID.ConstantVal = ConstantExpr::getSelect(Elts[0], Elts[1], Elts[2]);
+ } else if (Opc == Instruction::ShuffleVector) {
+ if (Elts.size() != 3)
+ return Error(ID.Loc, "expected three operands to shufflevector");
+ if (!ShuffleVectorInst::isValidOperands(Elts[0], Elts[1], Elts[2]))
+ return Error(ID.Loc, "invalid operands to shufflevector");
+ ID.ConstantVal = ConstantExpr::getShuffleVector(Elts[0], Elts[1],Elts[2]);
+ } else if (Opc == Instruction::ExtractElement) {
+ if (Elts.size() != 2)
+ return Error(ID.Loc, "expected two operands to extractelement");
+ if (!ExtractElementInst::isValidOperands(Elts[0], Elts[1]))
+ return Error(ID.Loc, "invalid extractelement operands");
+ ID.ConstantVal = ConstantExpr::getExtractElement(Elts[0], Elts[1]);
+ } else {
+ assert(Opc == Instruction::InsertElement && "Unknown opcode");
+ if (Elts.size() != 3)
+ return Error(ID.Loc, "expected three operands to insertelement");
+ if (!InsertElementInst::isValidOperands(Elts[0], Elts[1], Elts[2]))
+ return Error(ID.Loc, "invalid insertelement operands");
+ ID.ConstantVal = ConstantExpr::getInsertElement(Elts[0], Elts[1],Elts[2]);
+ }
+
+ ID.Kind = ValID::t_Constant;
+ return false;
+ }
+ }
+
+ Lex.Lex();
+ return false;
+}
+
+/// ParseGlobalValue - Parse a global value with the specified type.
+bool LLParser::ParseGlobalValue(const Type *Ty, Constant *&V) {
+ V = 0;
+ ValID ID;
+ return ParseValID(ID) ||
+ ConvertGlobalValIDToValue(Ty, ID, V);
+}
+
+/// ConvertGlobalValIDToValue - Apply a type to a ValID to get a fully resolved
+/// constant.
+bool LLParser::ConvertGlobalValIDToValue(const Type *Ty, ValID &ID,
+ Constant *&V) {
+ if (isa<FunctionType>(Ty))
+ return Error(ID.Loc, "functions are not values, refer to them as pointers");
+
+ switch (ID.Kind) {
+ default: assert(0 && "Unknown ValID!");
+ case ValID::t_LocalID:
+ case ValID::t_LocalName:
+ return Error(ID.Loc, "invalid use of function-local name");
+ case ValID::t_InlineAsm:
+ return Error(ID.Loc, "inline asm can only be an operand of call/invoke");
+ case ValID::t_GlobalName:
+ V = GetGlobalVal(ID.StrVal, Ty, ID.Loc);
+ return V == 0;
+ case ValID::t_GlobalID:
+ V = GetGlobalVal(ID.UIntVal, Ty, ID.Loc);
+ return V == 0;
+ case ValID::t_APSInt:
+ if (!isa<IntegerType>(Ty))
+ return Error(ID.Loc, "integer constant must have integer type");
+ ID.APSIntVal.extOrTrunc(Ty->getPrimitiveSizeInBits());
+ V = ConstantInt::get(ID.APSIntVal);
+ return false;
+ case ValID::t_APFloat:
+ if (!Ty->isFloatingPoint() ||
+ !ConstantFP::isValueValidForType(Ty, ID.APFloatVal))
+ return Error(ID.Loc, "floating point constant invalid for type");
+
+    // The lexer has no type info, so it builds all float and double FP
+    // constants as double. Convert to float here if necessary; long doubles
+    // need no conversion.
+ if (&ID.APFloatVal.getSemantics() == &APFloat::IEEEdouble &&
+ Ty == Type::FloatTy) {
+ bool Ignored;
+ ID.APFloatVal.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven,
+ &Ignored);
+ }
+ V = ConstantFP::get(ID.APFloatVal);
+
+ if (V->getType() != Ty)
+ return Error(ID.Loc, "floating point constant does not have type '" +
+ Ty->getDescription() + "'");
+
+ return false;
+ case ValID::t_Null:
+ if (!isa<PointerType>(Ty))
+ return Error(ID.Loc, "null must be a pointer type");
+ V = ConstantPointerNull::get(cast<PointerType>(Ty));
+ return false;
+ case ValID::t_Undef:
+ // FIXME: LabelTy should not be a first-class type.
+ if ((!Ty->isFirstClassType() || Ty == Type::LabelTy) &&
+ !isa<OpaqueType>(Ty))
+ return Error(ID.Loc, "invalid type for undef constant");
+ V = UndefValue::get(Ty);
+ return false;
+ case ValID::t_EmptyArray:
+ if (!isa<ArrayType>(Ty) || cast<ArrayType>(Ty)->getNumElements() != 0)
+ return Error(ID.Loc, "invalid empty array initializer");
+ V = UndefValue::get(Ty);
+ return false;
+ case ValID::t_Zero:
+ // FIXME: LabelTy should not be a first-class type.
+ if (!Ty->isFirstClassType() || Ty == Type::LabelTy)
+ return Error(ID.Loc, "invalid type for null constant");
+ V = Constant::getNullValue(Ty);
+ return false;
+ case ValID::t_Constant:
+ if (ID.ConstantVal->getType() != Ty)
+ return Error(ID.Loc, "constant expression type mismatch");
+ V = ID.ConstantVal;
+ return false;
+ }
+}
+
+bool LLParser::ParseGlobalTypeAndValue(Constant *&V) {
+ PATypeHolder Type(Type::VoidTy);
+ return ParseType(Type) ||
+ ParseGlobalValue(Type, V);
+}
+
+/// ParseGlobalValueVector
+/// ::= /*empty*/
+/// ::= TypeAndValue (',' TypeAndValue)*
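+/// e.g. "i32 1, i32 2, i32 3" as the body of an array or struct constant.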
+bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts) {
+ // Empty list.
+ if (Lex.getKind() == lltok::rbrace ||
+ Lex.getKind() == lltok::rsquare ||
+ Lex.getKind() == lltok::greater ||
+ Lex.getKind() == lltok::rparen)
+ return false;
+
+ Constant *C;
+ if (ParseGlobalTypeAndValue(C)) return true;
+ Elts.push_back(C);
+
+ while (EatIfPresent(lltok::comma)) {
+ if (ParseGlobalTypeAndValue(C)) return true;
+ Elts.push_back(C);
+ }
+
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Function Parsing.
+//===----------------------------------------------------------------------===//
+
+bool LLParser::ConvertValIDToValue(const Type *Ty, ValID &ID, Value *&V,
+ PerFunctionState &PFS) {
+ if (ID.Kind == ValID::t_LocalID)
+ V = PFS.GetVal(ID.UIntVal, Ty, ID.Loc);
+ else if (ID.Kind == ValID::t_LocalName)
+ V = PFS.GetVal(ID.StrVal, Ty, ID.Loc);
+ else if (ID.Kind == ValID::t_InlineAsm) {
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ const FunctionType *FTy =
+ PTy ? dyn_cast<FunctionType>(PTy->getElementType()) : 0;
+ if (!FTy || !InlineAsm::Verify(FTy, ID.StrVal2))
+ return Error(ID.Loc, "invalid type for inline asm constraint string");
+ V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal);
+ return false;
+ } else {
+ Constant *C;
+ if (ConvertGlobalValIDToValue(Ty, ID, C)) return true;
+ V = C;
+ return false;
+ }
+
+ return V == 0;
+}
+
+bool LLParser::ParseValue(const Type *Ty, Value *&V, PerFunctionState &PFS) {
+ V = 0;
+ ValID ID;
+ return ParseValID(ID) ||
+ ConvertValIDToValue(Ty, ID, V, PFS);
+}
+
+bool LLParser::ParseTypeAndValue(Value *&V, PerFunctionState &PFS) {
+ PATypeHolder T(Type::VoidTy);
+ return ParseType(T) ||
+ ParseValue(T, V, PFS);
+}
+
+/// FunctionHeader
+/// ::= OptionalLinkage OptionalVisibility OptionalCallingConv OptRetAttrs
+/// Type GlobalName '(' ArgList ')' OptFuncAttrs OptSection
+/// OptionalAlign OptGC
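+/// e.g. "internal fastcc i32 @foo(i32 %x) nounwind".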
+bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
+ // Parse the linkage.
+ LocTy LinkageLoc = Lex.getLoc();
+ unsigned Linkage;
+
+ unsigned Visibility, CC, RetAttrs;
+ PATypeHolder RetType(Type::VoidTy);
+ LocTy RetTypeLoc = Lex.getLoc();
+ if (ParseOptionalLinkage(Linkage) ||
+ ParseOptionalVisibility(Visibility) ||
+ ParseOptionalCallingConv(CC) ||
+ ParseOptionalAttrs(RetAttrs, 1) ||
+ ParseType(RetType, RetTypeLoc, true /*void allowed*/))
+ return true;
+
+ // Verify that the linkage is ok.
+ switch ((GlobalValue::LinkageTypes)Linkage) {
+ case GlobalValue::ExternalLinkage:
+ break; // always ok.
+ case GlobalValue::DLLImportLinkage:
+ case GlobalValue::ExternalWeakLinkage:
+ if (isDefine)
+ return Error(LinkageLoc, "invalid linkage for function definition");
+ break;
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::AvailableExternallyLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::DLLExportLinkage:
+ if (!isDefine)
+ return Error(LinkageLoc, "invalid linkage for function declaration");
+ break;
+ case GlobalValue::AppendingLinkage:
+ case GlobalValue::GhostLinkage:
+ case GlobalValue::CommonLinkage:
+ return Error(LinkageLoc, "invalid function linkage type");
+ }
+
+ if (!FunctionType::isValidReturnType(RetType) ||
+ isa<OpaqueType>(RetType))
+ return Error(RetTypeLoc, "invalid function return type");
+
+ LocTy NameLoc = Lex.getLoc();
+
+ std::string FunctionName;
+ if (Lex.getKind() == lltok::GlobalVar) {
+ FunctionName = Lex.getStrVal();
+ } else if (Lex.getKind() == lltok::GlobalID) { // @42 is ok.
+ unsigned NameID = Lex.getUIntVal();
+
+ if (NameID != NumberedVals.size())
+ return TokError("function expected to be numbered '%" +
+ utostr(NumberedVals.size()) + "'");
+ } else {
+ return TokError("expected function name");
+ }
+
+ Lex.Lex();
+
+ if (Lex.getKind() != lltok::lparen)
+ return TokError("expected '(' in function argument list");
+
+ std::vector<ArgInfo> ArgList;
+ bool isVarArg;
+ unsigned FuncAttrs;
+ std::string Section;
+ unsigned Alignment;
+ std::string GC;
+
+ if (ParseArgumentList(ArgList, isVarArg, false) ||
+ ParseOptionalAttrs(FuncAttrs, 2) ||
+ (EatIfPresent(lltok::kw_section) &&
+ ParseStringConstant(Section)) ||
+ ParseOptionalAlignment(Alignment) ||
+ (EatIfPresent(lltok::kw_gc) &&
+ ParseStringConstant(GC)))
+ return true;
+
+ // If the alignment was parsed as an attribute, move to the alignment field.
+ if (FuncAttrs & Attribute::Alignment) {
+ Alignment = Attribute::getAlignmentFromAttrs(FuncAttrs);
+ FuncAttrs &= ~Attribute::Alignment;
+ }
+
+ // Okay, if we got here, the function is syntactically valid. Convert types
+ // and do semantic checks.
+ std::vector<const Type*> ParamTypeList;
+ SmallVector<AttributeWithIndex, 8> Attrs;
+  // FIXME: In 3.0, stop accepting zext, sext and inreg as optional function
+ // attributes.
+ unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg;
+ if (FuncAttrs & ObsoleteFuncAttrs) {
+ RetAttrs |= FuncAttrs & ObsoleteFuncAttrs;
+ FuncAttrs &= ~ObsoleteFuncAttrs;
+ }
+
+ if (RetAttrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
+
+ for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
+ ParamTypeList.push_back(ArgList[i].Type);
+ if (ArgList[i].Attrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
+ }
+
+ if (FuncAttrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(~0, FuncAttrs));
+
+ AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
+
+ if (PAL.paramHasAttr(1, Attribute::StructRet) &&
+ RetType != Type::VoidTy)
+ return Error(RetTypeLoc, "functions with 'sret' argument must return void");
+
+ const FunctionType *FT = FunctionType::get(RetType, ParamTypeList, isVarArg);
+ const PointerType *PFT = PointerType::getUnqual(FT);
+
+ Fn = 0;
+ if (!FunctionName.empty()) {
+ // If this was a definition of a forward reference, remove the definition
+ // from the forward reference table and fill in the forward ref.
+ std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator FRVI =
+ ForwardRefVals.find(FunctionName);
+ if (FRVI != ForwardRefVals.end()) {
+ Fn = M->getFunction(FunctionName);
+ ForwardRefVals.erase(FRVI);
+ } else if ((Fn = M->getFunction(FunctionName))) {
+ // If this function already exists in the symbol table, then it is
+ // multiply defined. We accept a few cases for old backwards compat.
+ // FIXME: Remove this stuff for LLVM 3.0.
+ if (Fn->getType() != PFT || Fn->getAttributes() != PAL ||
+ (!Fn->isDeclaration() && isDefine)) {
+ // If the redefinition has different type or different attributes,
+ // reject it. If both have bodies, reject it.
+ return Error(NameLoc, "invalid redefinition of function '" +
+ FunctionName + "'");
+ } else if (Fn->isDeclaration()) {
+ // Make sure to strip off any argument names so we can't get conflicts.
+ for (Function::arg_iterator AI = Fn->arg_begin(), AE = Fn->arg_end();
+ AI != AE; ++AI)
+ AI->setName("");
+ }
+ }
+
+  } else {
+ // If this is a definition of a forward referenced function, make sure the
+ // types agree.
+ std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator I
+ = ForwardRefValIDs.find(NumberedVals.size());
+ if (I != ForwardRefValIDs.end()) {
+ Fn = cast<Function>(I->second.first);
+ if (Fn->getType() != PFT)
+ return Error(NameLoc, "type of definition and forward reference of '@" +
+ utostr(NumberedVals.size()) +"' disagree");
+ ForwardRefValIDs.erase(I);
+ }
+ }
+
+ if (Fn == 0)
+ Fn = Function::Create(FT, GlobalValue::ExternalLinkage, FunctionName, M);
+ else // Move the forward-reference to the correct spot in the module.
+ M->getFunctionList().splice(M->end(), M->getFunctionList(), Fn);
+
+ if (FunctionName.empty())
+ NumberedVals.push_back(Fn);
+
+ Fn->setLinkage((GlobalValue::LinkageTypes)Linkage);
+ Fn->setVisibility((GlobalValue::VisibilityTypes)Visibility);
+ Fn->setCallingConv(CC);
+ Fn->setAttributes(PAL);
+ Fn->setAlignment(Alignment);
+ Fn->setSection(Section);
+ if (!GC.empty()) Fn->setGC(GC.c_str());
+
+ // Add all of the arguments we parsed to the function.
+ Function::arg_iterator ArgIt = Fn->arg_begin();
+ for (unsigned i = 0, e = ArgList.size(); i != e; ++i, ++ArgIt) {
+ // If the argument has a name, insert it into the argument symbol table.
+ if (ArgList[i].Name.empty()) continue;
+
+ // Set the name, if it conflicted, it will be auto-renamed.
+ ArgIt->setName(ArgList[i].Name);
+
+ if (ArgIt->getNameStr() != ArgList[i].Name)
+ return Error(ArgList[i].Loc, "redefinition of argument '%" +
+ ArgList[i].Name + "'");
+ }
+
+ return false;
+}
+
+
+/// ParseFunctionBody
+/// ::= '{' BasicBlock+ '}'
+/// ::= 'begin' BasicBlock+ 'end' // FIXME: remove in LLVM 3.0
+///
+bool LLParser::ParseFunctionBody(Function &Fn) {
+ if (Lex.getKind() != lltok::lbrace && Lex.getKind() != lltok::kw_begin)
+ return TokError("expected '{' in function body");
+ Lex.Lex(); // eat the {.
+
+ PerFunctionState PFS(*this, Fn);
+
+ while (Lex.getKind() != lltok::rbrace && Lex.getKind() != lltok::kw_end)
+ if (ParseBasicBlock(PFS)) return true;
+
+ // Eat the }.
+ Lex.Lex();
+
+ // Verify function is ok.
+ return PFS.VerifyFunctionComplete();
+}
+
+/// ParseBasicBlock
+/// ::= LabelStr? Instruction*
+bool LLParser::ParseBasicBlock(PerFunctionState &PFS) {
+ // If this basic block starts out with a name, remember it.
+ std::string Name;
+ LocTy NameLoc = Lex.getLoc();
+ if (Lex.getKind() == lltok::LabelStr) {
+ Name = Lex.getStrVal();
+ Lex.Lex();
+ }
+
+ BasicBlock *BB = PFS.DefineBB(Name, NameLoc);
+ if (BB == 0) return true;
+
+ std::string NameStr;
+
+ // Parse the instructions in this block until we get a terminator.
+ Instruction *Inst;
+ do {
+ // This instruction may have three possibilities for a name: a) none
+ // specified, b) name specified "%foo =", c) number specified: "%4 =".
+ LocTy NameLoc = Lex.getLoc();
+ int NameID = -1;
+ NameStr = "";
+
+ if (Lex.getKind() == lltok::LocalVarID) {
+ NameID = Lex.getUIntVal();
+ Lex.Lex();
+ if (ParseToken(lltok::equal, "expected '=' after instruction id"))
+ return true;
+ } else if (Lex.getKind() == lltok::LocalVar ||
+ // FIXME: REMOVE IN LLVM 3.0
+ Lex.getKind() == lltok::StringConstant) {
+ NameStr = Lex.getStrVal();
+ Lex.Lex();
+ if (ParseToken(lltok::equal, "expected '=' after instruction name"))
+ return true;
+ }
+
+ if (ParseInstruction(Inst, BB, PFS)) return true;
+
+ BB->getInstList().push_back(Inst);
+
+ // Set the name on the instruction.
+ if (PFS.SetInstName(NameID, NameStr, NameLoc, Inst)) return true;
+ } while (!isa<TerminatorInst>(Inst));
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Parsing.
+//===----------------------------------------------------------------------===//
+
+/// ParseInstruction - Parse one of the many different instructions.
+///
+bool LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
+ PerFunctionState &PFS) {
+ lltok::Kind Token = Lex.getKind();
+ if (Token == lltok::Eof)
+ return TokError("found end of file when expecting more instructions");
+ LocTy Loc = Lex.getLoc();
+ unsigned KeywordVal = Lex.getUIntVal();
+ Lex.Lex(); // Eat the keyword.
+
+ switch (Token) {
+ default: return Error(Loc, "expected instruction opcode");
+ // Terminator Instructions.
+ case lltok::kw_unwind: Inst = new UnwindInst(); return false;
+ case lltok::kw_unreachable: Inst = new UnreachableInst(); return false;
+ case lltok::kw_ret: return ParseRet(Inst, BB, PFS);
+ case lltok::kw_br: return ParseBr(Inst, PFS);
+ case lltok::kw_switch: return ParseSwitch(Inst, PFS);
+ case lltok::kw_invoke: return ParseInvoke(Inst, PFS);
+ // Binary Operators.
+ case lltok::kw_add:
+ case lltok::kw_sub:
+ case lltok::kw_mul: return ParseArithmetic(Inst, PFS, KeywordVal, 0);
+
+ case lltok::kw_udiv:
+ case lltok::kw_sdiv:
+ case lltok::kw_urem:
+ case lltok::kw_srem: return ParseArithmetic(Inst, PFS, KeywordVal, 1);
+ case lltok::kw_fdiv:
+ case lltok::kw_frem: return ParseArithmetic(Inst, PFS, KeywordVal, 2);
+ case lltok::kw_shl:
+ case lltok::kw_lshr:
+ case lltok::kw_ashr:
+ case lltok::kw_and:
+ case lltok::kw_or:
+ case lltok::kw_xor: return ParseLogical(Inst, PFS, KeywordVal);
+ case lltok::kw_icmp:
+ case lltok::kw_fcmp:
+ case lltok::kw_vicmp:
+ case lltok::kw_vfcmp: return ParseCompare(Inst, PFS, KeywordVal);
+ // Casts.
+ case lltok::kw_trunc:
+ case lltok::kw_zext:
+ case lltok::kw_sext:
+ case lltok::kw_fptrunc:
+ case lltok::kw_fpext:
+ case lltok::kw_bitcast:
+ case lltok::kw_uitofp:
+ case lltok::kw_sitofp:
+ case lltok::kw_fptoui:
+ case lltok::kw_fptosi:
+ case lltok::kw_inttoptr:
+ case lltok::kw_ptrtoint: return ParseCast(Inst, PFS, KeywordVal);
+ // Other.
+ case lltok::kw_select: return ParseSelect(Inst, PFS);
+ case lltok::kw_va_arg: return ParseVA_Arg(Inst, PFS);
+ case lltok::kw_extractelement: return ParseExtractElement(Inst, PFS);
+ case lltok::kw_insertelement: return ParseInsertElement(Inst, PFS);
+ case lltok::kw_shufflevector: return ParseShuffleVector(Inst, PFS);
+ case lltok::kw_phi: return ParsePHI(Inst, PFS);
+ case lltok::kw_call: return ParseCall(Inst, PFS, false);
+ case lltok::kw_tail: return ParseCall(Inst, PFS, true);
+ // Memory.
+ case lltok::kw_alloca:
+ case lltok::kw_malloc: return ParseAlloc(Inst, PFS, KeywordVal);
+ case lltok::kw_free: return ParseFree(Inst, PFS);
+ case lltok::kw_load: return ParseLoad(Inst, PFS, false);
+ case lltok::kw_store: return ParseStore(Inst, PFS, false);
+ case lltok::kw_volatile:
+ if (EatIfPresent(lltok::kw_load))
+ return ParseLoad(Inst, PFS, true);
+ else if (EatIfPresent(lltok::kw_store))
+ return ParseStore(Inst, PFS, true);
+ else
+ return TokError("expected 'load' or 'store'");
+ case lltok::kw_getresult: return ParseGetResult(Inst, PFS);
+ case lltok::kw_getelementptr: return ParseGetElementPtr(Inst, PFS);
+ case lltok::kw_extractvalue: return ParseExtractValue(Inst, PFS);
+ case lltok::kw_insertvalue: return ParseInsertValue(Inst, PFS);
+ }
+}
+
+/// ParseCmpPredicate - Parse an integer or fp predicate, based on Kind.
+bool LLParser::ParseCmpPredicate(unsigned &P, unsigned Opc) {
+ // FIXME: REMOVE vicmp/vfcmp!
+ if (Opc == Instruction::FCmp || Opc == Instruction::VFCmp) {
+ switch (Lex.getKind()) {
+ default: TokError("expected fcmp predicate (e.g. 'oeq')");
+ case lltok::kw_oeq: P = CmpInst::FCMP_OEQ; break;
+ case lltok::kw_one: P = CmpInst::FCMP_ONE; break;
+ case lltok::kw_olt: P = CmpInst::FCMP_OLT; break;
+ case lltok::kw_ogt: P = CmpInst::FCMP_OGT; break;
+ case lltok::kw_ole: P = CmpInst::FCMP_OLE; break;
+ case lltok::kw_oge: P = CmpInst::FCMP_OGE; break;
+ case lltok::kw_ord: P = CmpInst::FCMP_ORD; break;
+ case lltok::kw_uno: P = CmpInst::FCMP_UNO; break;
+ case lltok::kw_ueq: P = CmpInst::FCMP_UEQ; break;
+ case lltok::kw_une: P = CmpInst::FCMP_UNE; break;
+ case lltok::kw_ult: P = CmpInst::FCMP_ULT; break;
+ case lltok::kw_ugt: P = CmpInst::FCMP_UGT; break;
+ case lltok::kw_ule: P = CmpInst::FCMP_ULE; break;
+ case lltok::kw_uge: P = CmpInst::FCMP_UGE; break;
+ case lltok::kw_true: P = CmpInst::FCMP_TRUE; break;
+ case lltok::kw_false: P = CmpInst::FCMP_FALSE; break;
+ }
+ } else {
+ switch (Lex.getKind()) {
+ default: TokError("expected icmp predicate (e.g. 'eq')");
+ case lltok::kw_eq: P = CmpInst::ICMP_EQ; break;
+ case lltok::kw_ne: P = CmpInst::ICMP_NE; break;
+ case lltok::kw_slt: P = CmpInst::ICMP_SLT; break;
+ case lltok::kw_sgt: P = CmpInst::ICMP_SGT; break;
+ case lltok::kw_sle: P = CmpInst::ICMP_SLE; break;
+ case lltok::kw_sge: P = CmpInst::ICMP_SGE; break;
+ case lltok::kw_ult: P = CmpInst::ICMP_ULT; break;
+ case lltok::kw_ugt: P = CmpInst::ICMP_UGT; break;
+ case lltok::kw_ule: P = CmpInst::ICMP_ULE; break;
+ case lltok::kw_uge: P = CmpInst::ICMP_UGE; break;
+ }
+ }
+ Lex.Lex();
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Terminator Instructions.
+//===----------------------------------------------------------------------===//
+
+/// ParseRet - Parse a return instruction.
+/// ::= 'ret' void
+/// ::= 'ret' TypeAndValue
+/// ::= 'ret' TypeAndValue (',' TypeAndValue)+ [[obsolete: LLVM 3.0]]
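+/// e.g. "ret void", "ret i32 0", or the obsolete "ret i32 1, i32 2".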
+bool LLParser::ParseRet(Instruction *&Inst, BasicBlock *BB,
+ PerFunctionState &PFS) {
+ PATypeHolder Ty(Type::VoidTy);
+ if (ParseType(Ty, true /*void allowed*/)) return true;
+
+ if (Ty == Type::VoidTy) {
+ Inst = ReturnInst::Create();
+ return false;
+ }
+
+ Value *RV;
+ if (ParseValue(Ty, RV, PFS)) return true;
+
+ // The normal case is one return value.
+ if (Lex.getKind() == lltok::comma) {
+ // FIXME: LLVM 3.0 remove MRV support for 'ret i32 1, i32 2', requiring use
+ // of 'ret {i32,i32} {i32 1, i32 2}'
+ SmallVector<Value*, 8> RVs;
+ RVs.push_back(RV);
+
+ while (EatIfPresent(lltok::comma)) {
+ if (ParseTypeAndValue(RV, PFS)) return true;
+ RVs.push_back(RV);
+ }
+
+ RV = UndefValue::get(PFS.getFunction().getReturnType());
+ for (unsigned i = 0, e = RVs.size(); i != e; ++i) {
+ Instruction *I = InsertValueInst::Create(RV, RVs[i], i, "mrv");
+ BB->getInstList().push_back(I);
+ RV = I;
+ }
+ }
+ Inst = ReturnInst::Create(RV);
+ return false;
+}
+
+
+/// ParseBr
+/// ::= 'br' TypeAndValue
+/// ::= 'br' TypeAndValue ',' TypeAndValue ',' TypeAndValue
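+/// e.g. "br label %out" or "br i1 %cond, label %then, label %else".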
+bool LLParser::ParseBr(Instruction *&Inst, PerFunctionState &PFS) {
+ LocTy Loc, Loc2;
+ Value *Op0, *Op1, *Op2;
+ if (ParseTypeAndValue(Op0, Loc, PFS)) return true;
+
+ if (BasicBlock *BB = dyn_cast<BasicBlock>(Op0)) {
+ Inst = BranchInst::Create(BB);
+ return false;
+ }
+
+ if (Op0->getType() != Type::Int1Ty)
+ return Error(Loc, "branch condition must have 'i1' type");
+
+ if (ParseToken(lltok::comma, "expected ',' after branch condition") ||
+ ParseTypeAndValue(Op1, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after true destination") ||
+ ParseTypeAndValue(Op2, Loc2, PFS))
+ return true;
+
+ if (!isa<BasicBlock>(Op1))
+ return Error(Loc, "true destination of branch must be a basic block");
+ if (!isa<BasicBlock>(Op2))
+ return Error(Loc2, "true destination of branch must be a basic block");
+
+ Inst = BranchInst::Create(cast<BasicBlock>(Op1), cast<BasicBlock>(Op2), Op0);
+ return false;
+}
+
+/// ParseSwitch
+/// Instruction
+/// ::= 'switch' TypeAndValue ',' TypeAndValue '[' JumpTable ']'
+/// JumpTable
+/// ::= (TypeAndValue ',' TypeAndValue)*
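+/// e.g. "switch i32 %val, label %otherwise [ i32 0, label %zero ]".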
+bool LLParser::ParseSwitch(Instruction *&Inst, PerFunctionState &PFS) {
+ LocTy CondLoc, BBLoc;
+ Value *Cond, *DefaultBB;
+ if (ParseTypeAndValue(Cond, CondLoc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after switch condition") ||
+ ParseTypeAndValue(DefaultBB, BBLoc, PFS) ||
+ ParseToken(lltok::lsquare, "expected '[' with switch table"))
+ return true;
+
+ if (!isa<IntegerType>(Cond->getType()))
+ return Error(CondLoc, "switch condition must have integer type");
+ if (!isa<BasicBlock>(DefaultBB))
+ return Error(BBLoc, "default destination must be a basic block");
+
+ // Parse the jump table pairs.
+ SmallPtrSet<Value*, 32> SeenCases;
+ SmallVector<std::pair<ConstantInt*, BasicBlock*>, 32> Table;
+ while (Lex.getKind() != lltok::rsquare) {
+ Value *Constant, *DestBB;
+
+ if (ParseTypeAndValue(Constant, CondLoc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after case value") ||
+ ParseTypeAndValue(DestBB, BBLoc, PFS))
+ return true;
+
+ if (!SeenCases.insert(Constant))
+ return Error(CondLoc, "duplicate case value in switch");
+ if (!isa<ConstantInt>(Constant))
+ return Error(CondLoc, "case value is not a constant integer");
+ if (!isa<BasicBlock>(DestBB))
+ return Error(BBLoc, "case destination is not a basic block");
+
+ Table.push_back(std::make_pair(cast<ConstantInt>(Constant),
+ cast<BasicBlock>(DestBB)));
+ }
+
+ Lex.Lex(); // Eat the ']'.
+
+ SwitchInst *SI = SwitchInst::Create(Cond, cast<BasicBlock>(DefaultBB),
+ Table.size());
+ for (unsigned i = 0, e = Table.size(); i != e; ++i)
+ SI->addCase(Table[i].first, Table[i].second);
+ Inst = SI;
+ return false;
+}
+
+/// ParseInvoke
+/// ::= 'invoke' OptionalCallingConv OptionalAttrs Type Value ParamList
+/// OptionalAttrs 'to' TypeAndValue 'unwind' TypeAndValue
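+/// e.g. "invoke void @f() to label %normal unwind label %handler".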
+bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
+ LocTy CallLoc = Lex.getLoc();
+ unsigned CC, RetAttrs, FnAttrs;
+ PATypeHolder RetType(Type::VoidTy);
+ LocTy RetTypeLoc;
+ ValID CalleeID;
+ SmallVector<ParamInfo, 16> ArgList;
+
+ Value *NormalBB, *UnwindBB;
+ if (ParseOptionalCallingConv(CC) ||
+ ParseOptionalAttrs(RetAttrs, 1) ||
+ ParseType(RetType, RetTypeLoc, true /*void allowed*/) ||
+ ParseValID(CalleeID) ||
+ ParseParameterList(ArgList, PFS) ||
+ ParseOptionalAttrs(FnAttrs, 2) ||
+ ParseToken(lltok::kw_to, "expected 'to' in invoke") ||
+ ParseTypeAndValue(NormalBB, PFS) ||
+ ParseToken(lltok::kw_unwind, "expected 'unwind' in invoke") ||
+ ParseTypeAndValue(UnwindBB, PFS))
+ return true;
+
+ if (!isa<BasicBlock>(NormalBB))
+ return Error(CallLoc, "normal destination is not a basic block");
+ if (!isa<BasicBlock>(UnwindBB))
+ return Error(CallLoc, "unwind destination is not a basic block");
+
+ // If RetType is a non-function pointer type, then this is the short syntax
+ // for the call, which means that RetType is just the return type. Infer the
+ // rest of the function argument types from the arguments that are present.
+ const PointerType *PFTy = 0;
+ const FunctionType *Ty = 0;
+ if (!(PFTy = dyn_cast<PointerType>(RetType)) ||
+ !(Ty = dyn_cast<FunctionType>(PFTy->getElementType()))) {
+ // Pull out the types of all of the arguments...
+ std::vector<const Type*> ParamTypes;
+ for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
+ ParamTypes.push_back(ArgList[i].V->getType());
+
+ if (!FunctionType::isValidReturnType(RetType))
+ return Error(RetTypeLoc, "Invalid result type for LLVM function");
+
+ Ty = FunctionType::get(RetType, ParamTypes, false);
+ PFTy = PointerType::getUnqual(Ty);
+ }
+
+ // Look up the callee.
+ Value *Callee;
+ if (ConvertValIDToValue(PFTy, CalleeID, Callee, PFS)) return true;
+
+ // FIXME: In LLVM 3.0, stop accepting zext, sext and inreg as optional
+ // function attributes.
+ unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg;
+ if (FnAttrs & ObsoleteFuncAttrs) {
+ RetAttrs |= FnAttrs & ObsoleteFuncAttrs;
+ FnAttrs &= ~ObsoleteFuncAttrs;
+ }
+
+ // Set up the Attributes for the function.
+ SmallVector<AttributeWithIndex, 8> Attrs;
+ if (RetAttrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
+
+ SmallVector<Value*, 8> Args;
+
+ // Loop through FunctionType's arguments and ensure they are specified
+ // correctly. Also, gather any parameter attributes.
+ FunctionType::param_iterator I = Ty->param_begin();
+ FunctionType::param_iterator E = Ty->param_end();
+ for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
+ const Type *ExpectedTy = 0;
+ if (I != E) {
+ ExpectedTy = *I++;
+ } else if (!Ty->isVarArg()) {
+ return Error(ArgList[i].Loc, "too many arguments specified");
+ }
+
+ if (ExpectedTy && ExpectedTy != ArgList[i].V->getType())
+ return Error(ArgList[i].Loc, "argument is not of expected type '" +
+ ExpectedTy->getDescription() + "'");
+ Args.push_back(ArgList[i].V);
+ if (ArgList[i].Attrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
+ }
+
+ if (I != E)
+ return Error(CallLoc, "not enough parameters specified for call");
+
+ if (FnAttrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs));
+
+ // Finish off the Attributes and check them
+ AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
+
+ InvokeInst *II = InvokeInst::Create(Callee, cast<BasicBlock>(NormalBB),
+ cast<BasicBlock>(UnwindBB),
+ Args.begin(), Args.end());
+ II->setCallingConv(CC);
+ II->setAttributes(PAL);
+ Inst = II;
+ return false;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Binary Operators.
+//===----------------------------------------------------------------------===//
+
+/// ParseArithmetic
+/// ::= ArithmeticOps TypeAndValue ',' Value
+///
+/// If OperandType is 0, any FP or integer operand is allowed; if it is 1,
+/// any integer operand is allowed; if it is 2, any fp operand is allowed.
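+/// e.g. "add i32 %a, 1" or "fdiv double %x, 2.0".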
+bool LLParser::ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS,
+ unsigned Opc, unsigned OperandType) {
+ LocTy Loc; Value *LHS, *RHS;
+ if (ParseTypeAndValue(LHS, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' in arithmetic operation") ||
+ ParseValue(LHS->getType(), RHS, PFS))
+ return true;
+
+ bool Valid;
+ switch (OperandType) {
+ default: assert(0 && "Unknown operand type!");
+ case 0: // int or FP.
+ Valid = LHS->getType()->isIntOrIntVector() ||
+ LHS->getType()->isFPOrFPVector();
+ break;
+ case 1: Valid = LHS->getType()->isIntOrIntVector(); break;
+ case 2: Valid = LHS->getType()->isFPOrFPVector(); break;
+ }
+
+ if (!Valid)
+ return Error(Loc, "invalid operand type for instruction");
+
+ Inst = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+ return false;
+}
+
+/// ParseLogical
+///  ::= ArithmeticOps TypeAndValue ',' Value
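+/// e.g. "and i32 %a, 255" or "shl i32 %b, 4".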
+bool LLParser::ParseLogical(Instruction *&Inst, PerFunctionState &PFS,
+ unsigned Opc) {
+ LocTy Loc; Value *LHS, *RHS;
+ if (ParseTypeAndValue(LHS, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' in logical operation") ||
+ ParseValue(LHS->getType(), RHS, PFS))
+ return true;
+
+ if (!LHS->getType()->isIntOrIntVector())
+ return Error(Loc,"instruction requires integer or integer vector operands");
+
+ Inst = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+ return false;
+}
+
+
+/// ParseCompare
+/// ::= 'icmp' IPredicates TypeAndValue ',' Value
+/// ::= 'fcmp' FPredicates TypeAndValue ',' Value
+/// ::= 'vicmp' IPredicates TypeAndValue ',' Value
+/// ::= 'vfcmp' FPredicates TypeAndValue ',' Value
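+/// e.g. "icmp eq i32 %a, %b" or "fcmp olt double %x, 0.0".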
+bool LLParser::ParseCompare(Instruction *&Inst, PerFunctionState &PFS,
+ unsigned Opc) {
+ // Parse the integer/fp comparison predicate.
+ LocTy Loc;
+ unsigned Pred;
+ Value *LHS, *RHS;
+ if (ParseCmpPredicate(Pred, Opc) ||
+ ParseTypeAndValue(LHS, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after compare value") ||
+ ParseValue(LHS->getType(), RHS, PFS))
+ return true;
+
+ if (Opc == Instruction::FCmp) {
+ if (!LHS->getType()->isFPOrFPVector())
+ return Error(Loc, "fcmp requires floating point operands");
+ Inst = new FCmpInst(CmpInst::Predicate(Pred), LHS, RHS);
+ } else if (Opc == Instruction::ICmp) {
+ if (!LHS->getType()->isIntOrIntVector() &&
+ !isa<PointerType>(LHS->getType()))
+ return Error(Loc, "icmp requires integer operands");
+ Inst = new ICmpInst(CmpInst::Predicate(Pred), LHS, RHS);
+ } else if (Opc == Instruction::VFCmp) {
+ if (!LHS->getType()->isFPOrFPVector() || !isa<VectorType>(LHS->getType()))
+ return Error(Loc, "vfcmp requires vector floating point operands");
+ Inst = new VFCmpInst(CmpInst::Predicate(Pred), LHS, RHS);
+ } else if (Opc == Instruction::VICmp) {
+ if (!LHS->getType()->isIntOrIntVector() || !isa<VectorType>(LHS->getType()))
+ return Error(Loc, "vicmp requires vector floating point operands");
+ Inst = new VICmpInst(CmpInst::Predicate(Pred), LHS, RHS);
+ }
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Other Instructions.
+//===----------------------------------------------------------------------===//
+
+
+/// ParseCast
+/// ::= CastOpc TypeAndValue 'to' Type
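+/// e.g. "zext i8 %x to i32" or "bitcast i8* %p to i32*".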
+bool LLParser::ParseCast(Instruction *&Inst, PerFunctionState &PFS,
+ unsigned Opc) {
+ LocTy Loc; Value *Op;
+ PATypeHolder DestTy(Type::VoidTy);
+ if (ParseTypeAndValue(Op, Loc, PFS) ||
+ ParseToken(lltok::kw_to, "expected 'to' after cast value") ||
+ ParseType(DestTy))
+ return true;
+
+  if (!CastInst::castIsValid((Instruction::CastOps)Opc, Op, DestTy))
+    return Error(Loc, "invalid cast opcode for cast from '" +
+                 Op->getType()->getDescription() + "' to '" +
+                 DestTy->getDescription() + "'");
+ Inst = CastInst::Create((Instruction::CastOps)Opc, Op, DestTy);
+ return false;
+}
+
+/// ParseSelect
+/// ::= 'select' TypeAndValue ',' TypeAndValue ',' TypeAndValue
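+/// e.g. "select i1 %cond, i32 %a, i32 %b".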
+bool LLParser::ParseSelect(Instruction *&Inst, PerFunctionState &PFS) {
+ LocTy Loc;
+ Value *Op0, *Op1, *Op2;
+ if (ParseTypeAndValue(Op0, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after select condition") ||
+ ParseTypeAndValue(Op1, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after select value") ||
+ ParseTypeAndValue(Op2, PFS))
+ return true;
+
+ if (const char *Reason = SelectInst::areInvalidOperands(Op0, Op1, Op2))
+ return Error(Loc, Reason);
+
+ Inst = SelectInst::Create(Op0, Op1, Op2);
+ return false;
+}
+
+/// ParseVA_Arg
+/// ::= 'va_arg' TypeAndValue ',' Type
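+/// e.g. "va_arg i8** %ap, i32".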
+bool LLParser::ParseVA_Arg(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Op;
+ PATypeHolder EltTy(Type::VoidTy);
+ LocTy TypeLoc;
+ if (ParseTypeAndValue(Op, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after vaarg operand") ||
+ ParseType(EltTy, TypeLoc))
+ return true;
+
+ if (!EltTy->isFirstClassType())
+ return Error(TypeLoc, "va_arg requires operand with first class type");
+
+ Inst = new VAArgInst(Op, EltTy);
+ return false;
+}
+
+/// ParseExtractElement
+/// ::= 'extractelement' TypeAndValue ',' TypeAndValue
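+/// e.g. "extractelement <4 x i32> %v, i32 0".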
+bool LLParser::ParseExtractElement(Instruction *&Inst, PerFunctionState &PFS) {
+ LocTy Loc;
+ Value *Op0, *Op1;
+ if (ParseTypeAndValue(Op0, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after extract value") ||
+ ParseTypeAndValue(Op1, PFS))
+ return true;
+
+ if (!ExtractElementInst::isValidOperands(Op0, Op1))
+ return Error(Loc, "invalid extractelement operands");
+
+ Inst = new ExtractElementInst(Op0, Op1);
+ return false;
+}
+
+/// ParseInsertElement
+/// ::= 'insertelement' TypeAndValue ',' TypeAndValue ',' TypeAndValue
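+/// e.g. "insertelement <4 x i32> %v, i32 1, i32 0".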
+bool LLParser::ParseInsertElement(Instruction *&Inst, PerFunctionState &PFS) {
+ LocTy Loc;
+ Value *Op0, *Op1, *Op2;
+ if (ParseTypeAndValue(Op0, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after insertelement value") ||
+ ParseTypeAndValue(Op1, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after insertelement value") ||
+ ParseTypeAndValue(Op2, PFS))
+ return true;
+
+ if (!InsertElementInst::isValidOperands(Op0, Op1, Op2))
+ return Error(Loc, "invalid extractelement operands");
+
+ Inst = InsertElementInst::Create(Op0, Op1, Op2);
+ return false;
+}
+
+/// ParseShuffleVector
+/// ::= 'shufflevector' TypeAndValue ',' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS) {
+ LocTy Loc;
+ Value *Op0, *Op1, *Op2;
+ if (ParseTypeAndValue(Op0, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after shuffle mask") ||
+ ParseTypeAndValue(Op1, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after shuffle value") ||
+ ParseTypeAndValue(Op2, PFS))
+ return true;
+
+ if (!ShuffleVectorInst::isValidOperands(Op0, Op1, Op2))
+ return Error(Loc, "invalid extractelement operands");
+
+ Inst = new ShuffleVectorInst(Op0, Op1, Op2);
+ return false;
+}
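+
+// For example (hypothetical input):
+//   %s = shufflevector <2 x i32> %a, <2 x i32> %b,
+//                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>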
+
+/// ParsePHI
+/// ::= 'phi' Type '[' Value ',' Value ']' (',' '[' Value ',' Value ']')*
+bool LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
+ PATypeHolder Ty(Type::VoidTy);
+ Value *Op0, *Op1;
+ LocTy TypeLoc = Lex.getLoc();
+
+ if (ParseType(Ty) ||
+ ParseToken(lltok::lsquare, "expected '[' in phi value list") ||
+ ParseValue(Ty, Op0, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after insertelement value") ||
+ ParseValue(Type::LabelTy, Op1, PFS) ||
+ ParseToken(lltok::rsquare, "expected ']' in phi value list"))
+ return true;
+
+ SmallVector<std::pair<Value*, BasicBlock*>, 16> PHIVals;
+ while (1) {
+ PHIVals.push_back(std::make_pair(Op0, cast<BasicBlock>(Op1)));
+
+ if (!EatIfPresent(lltok::comma))
+ break;
+
+ if (ParseToken(lltok::lsquare, "expected '[' in phi value list") ||
+ ParseValue(Ty, Op0, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after insertelement value") ||
+ ParseValue(Type::LabelTy, Op1, PFS) ||
+ ParseToken(lltok::rsquare, "expected ']' in phi value list"))
+ return true;
+ }
+
+ if (!Ty->isFirstClassType())
+ return Error(TypeLoc, "phi node must have first class type");
+
+ PHINode *PN = PHINode::Create(Ty);
+ PN->reserveOperandSpace(PHIVals.size());
+ for (unsigned i = 0, e = PHIVals.size(); i != e; ++i)
+ PN->addIncoming(PHIVals[i].first, PHIVals[i].second);
+ Inst = PN;
+ return false;
+}
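+
+// For example (hypothetical input):
+//   %r = phi i32 [ %a, %entry ], [ %b, %loop ]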
+
+/// ParseCall
+/// ::= 'tail'? 'call' OptionalCallingConv OptionalAttrs Type Value
+/// ParameterList OptionalAttrs
+bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
+ bool isTail) {
+ unsigned CC, RetAttrs, FnAttrs;
+ PATypeHolder RetType(Type::VoidTy);
+ LocTy RetTypeLoc;
+ ValID CalleeID;
+ SmallVector<ParamInfo, 16> ArgList;
+ LocTy CallLoc = Lex.getLoc();
+
+ if ((isTail && ParseToken(lltok::kw_call, "expected 'tail call'")) ||
+ ParseOptionalCallingConv(CC) ||
+ ParseOptionalAttrs(RetAttrs, 1) ||
+ ParseType(RetType, RetTypeLoc, true /*void allowed*/) ||
+ ParseValID(CalleeID) ||
+ ParseParameterList(ArgList, PFS) ||
+ ParseOptionalAttrs(FnAttrs, 2))
+ return true;
+
+ // If RetType is a non-function pointer type, then this is the short syntax
+ // for the call, which means that RetType is just the return type. Infer the
+ // rest of the function argument types from the arguments that are present.
+ const PointerType *PFTy = 0;
+ const FunctionType *Ty = 0;
+ if (!(PFTy = dyn_cast<PointerType>(RetType)) ||
+ !(Ty = dyn_cast<FunctionType>(PFTy->getElementType()))) {
+ // Pull out the types of all of the arguments...
+ std::vector<const Type*> ParamTypes;
+ for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
+ ParamTypes.push_back(ArgList[i].V->getType());
+
+ if (!FunctionType::isValidReturnType(RetType))
+ return Error(RetTypeLoc, "Invalid result type for LLVM function");
+
+ Ty = FunctionType::get(RetType, ParamTypes, false);
+ PFTy = PointerType::getUnqual(Ty);
+ }
+
+ // Look up the callee.
+ Value *Callee;
+ if (ConvertValIDToValue(PFTy, CalleeID, Callee, PFS)) return true;
+
+ // FIXME: In LLVM 3.0, stop accepting zext, sext and inreg as optional
+ // function attributes.
+ unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg;
+ if (FnAttrs & ObsoleteFuncAttrs) {
+ RetAttrs |= FnAttrs & ObsoleteFuncAttrs;
+ FnAttrs &= ~ObsoleteFuncAttrs;
+ }
+
+ // Set up the Attributes for the function.
+ SmallVector<AttributeWithIndex, 8> Attrs;
+ if (RetAttrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
+
+ SmallVector<Value*, 8> Args;
+
+ // Loop through FunctionType's arguments and ensure they are specified
+ // correctly. Also, gather any parameter attributes.
+ FunctionType::param_iterator I = Ty->param_begin();
+ FunctionType::param_iterator E = Ty->param_end();
+ for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
+ const Type *ExpectedTy = 0;
+ if (I != E) {
+ ExpectedTy = *I++;
+ } else if (!Ty->isVarArg()) {
+ return Error(ArgList[i].Loc, "too many arguments specified");
+ }
+
+ if (ExpectedTy && ExpectedTy != ArgList[i].V->getType())
+ return Error(ArgList[i].Loc, "argument is not of expected type '" +
+ ExpectedTy->getDescription() + "'");
+ Args.push_back(ArgList[i].V);
+ if (ArgList[i].Attrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
+ }
+
+ if (I != E)
+ return Error(CallLoc, "not enough parameters specified for call");
+
+ if (FnAttrs != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs));
+
+ // Finish off the Attributes and check them
+ AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
+
+ CallInst *CI = CallInst::Create(Callee, Args.begin(), Args.end());
+ CI->setTailCall(isTail);
+ CI->setCallingConv(CC);
+ CI->setAttributes(PAL);
+ Inst = CI;
+ return false;
+}
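+
+// For example, both call forms are accepted (hypothetical input):
+//   %r = call i32 @f(i32 %x)                  ; short form, signature inferred
+//   %n = call i32 (i8*, ...)* @printf(i8* %s) ; explicit function pointer type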
+
+//===----------------------------------------------------------------------===//
+// Memory Instructions.
+//===----------------------------------------------------------------------===//
+
+/// ParseAlloc
+/// ::= 'malloc' Type (',' TypeAndValue)? (',' OptionalAlignment)?
+/// ::= 'alloca' Type (',' TypeAndValue)? (',' OptionalAlignment)?
+bool LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS,
+ unsigned Opc) {
+ PATypeHolder Ty(Type::VoidTy);
+ Value *Size = 0;
+ LocTy SizeLoc = 0;
+ unsigned Alignment = 0;
+ if (ParseType(Ty)) return true;
+
+ if (EatIfPresent(lltok::comma)) {
+ if (Lex.getKind() == lltok::kw_align) {
+ if (ParseOptionalAlignment(Alignment)) return true;
+ } else if (ParseTypeAndValue(Size, SizeLoc, PFS) ||
+ ParseOptionalCommaAlignment(Alignment)) {
+ return true;
+ }
+ }
+
+ if (Size && Size->getType() != Type::Int32Ty)
+ return Error(SizeLoc, "element count must be i32");
+
+ if (Opc == Instruction::Malloc)
+ Inst = new MallocInst(Ty, Size, Alignment);
+ else
+ Inst = new AllocaInst(Ty, Size, Alignment);
+ return false;
+}
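+
+// For example (hypothetical input):
+//   %p = alloca i32, i32 %n, align 8
+//   %m = malloc [16 x i8]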
+
+/// ParseFree
+/// ::= 'free' TypeAndValue
+bool LLParser::ParseFree(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Val; LocTy Loc;
+ if (ParseTypeAndValue(Val, Loc, PFS)) return true;
+ if (!isa<PointerType>(Val->getType()))
+ return Error(Loc, "operand to free must be a pointer");
+ Inst = new FreeInst(Val);
+ return false;
+}
+
+/// ParseLoad
+/// ::= 'volatile'? 'load' TypeAndValue (',' 'align' uint)?
+bool LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS,
+ bool isVolatile) {
+ Value *Val; LocTy Loc;
+ unsigned Alignment;
+ if (ParseTypeAndValue(Val, Loc, PFS) ||
+ ParseOptionalCommaAlignment(Alignment))
+ return true;
+
+ if (!isa<PointerType>(Val->getType()) ||
+ !cast<PointerType>(Val->getType())->getElementType()->isFirstClassType())
+ return Error(Loc, "load operand must be a pointer to a first class type");
+
+ Inst = new LoadInst(Val, "", isVolatile, Alignment);
+ return false;
+}
+
+/// ParseStore
+/// ::= 'volatile'? 'store' TypeAndValue ',' TypeAndValue (',' 'align' uint)?
+bool LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS,
+ bool isVolatile) {
+ Value *Val, *Ptr; LocTy Loc, PtrLoc;
+ unsigned Alignment;
+ if (ParseTypeAndValue(Val, Loc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after store operand") ||
+ ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
+ ParseOptionalCommaAlignment(Alignment))
+ return true;
+
+ if (!isa<PointerType>(Ptr->getType()))
+ return Error(PtrLoc, "store operand must be a pointer");
+ if (!Val->getType()->isFirstClassType())
+ return Error(Loc, "store operand must be a first class value");
+ if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
+ return Error(Loc, "stored value and pointer type do not match");
+
+ Inst = new StoreInst(Val, Ptr, isVolatile, Alignment);
+ return false;
+}
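+
+// For example (hypothetical input):
+//   store i32 %v, i32* %p, align 4
+//   volatile store i32 %v, i32* %p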
+
+/// ParseGetResult
+/// ::= 'getresult' TypeAndValue ',' uint
+/// FIXME: Remove support for getresult in LLVM 3.0
+bool LLParser::ParseGetResult(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Val; LocTy ValLoc, EltLoc;
+ unsigned Element;
+ if (ParseTypeAndValue(Val, ValLoc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after getresult operand") ||
+ ParseUInt32(Element, EltLoc))
+ return true;
+
+ if (!isa<StructType>(Val->getType()) && !isa<ArrayType>(Val->getType()))
+ return Error(ValLoc, "getresult inst requires an aggregate operand");
+ if (!ExtractValueInst::getIndexedType(Val->getType(), Element))
+ return Error(EltLoc, "invalid getresult index for value");
+ Inst = ExtractValueInst::Create(Val, Element);
+ return false;
+}
+
+/// ParseGetElementPtr
+/// ::= 'getelementptr' TypeAndValue (',' TypeAndValue)*
+bool LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Ptr, *Val; LocTy Loc, EltLoc;
+ if (ParseTypeAndValue(Ptr, Loc, PFS)) return true;
+
+ if (!isa<PointerType>(Ptr->getType()))
+ return Error(Loc, "base of getelementptr must be a pointer");
+
+ SmallVector<Value*, 16> Indices;
+ while (EatIfPresent(lltok::comma)) {
+ if (ParseTypeAndValue(Val, EltLoc, PFS)) return true;
+ if (!isa<IntegerType>(Val->getType()))
+ return Error(EltLoc, "getelementptr index must be an integer");
+ Indices.push_back(Val);
+ }
+
+ if (!GetElementPtrInst::getIndexedType(Ptr->getType(),
+ Indices.begin(), Indices.end()))
+ return Error(Loc, "invalid getelementptr indices");
+ Inst = GetElementPtrInst::Create(Ptr, Indices.begin(), Indices.end());
+ return false;
+}
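+
+// For example (hypothetical input):
+//   %f = getelementptr { i32, [4 x i8] }* %s, i32 0, i32 1, i32 2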
+
+/// ParseExtractValue
+/// ::= 'extractvalue' TypeAndValue (',' uint32)+
+bool LLParser::ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Val; LocTy Loc;
+ SmallVector<unsigned, 4> Indices;
+ if (ParseTypeAndValue(Val, Loc, PFS) ||
+ ParseIndexList(Indices))
+ return true;
+
+ if (!isa<StructType>(Val->getType()) && !isa<ArrayType>(Val->getType()))
+ return Error(Loc, "extractvalue operand must be array or struct");
+
+ if (!ExtractValueInst::getIndexedType(Val->getType(), Indices.begin(),
+ Indices.end()))
+ return Error(Loc, "invalid indices for extractvalue");
+ Inst = ExtractValueInst::Create(Val, Indices.begin(), Indices.end());
+ return false;
+}
+
+/// ParseInsertValue
+/// ::= 'insertvalue' TypeAndValue ',' TypeAndValue (',' uint32)+
+bool LLParser::ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Val0, *Val1; LocTy Loc0, Loc1;
+ SmallVector<unsigned, 4> Indices;
+ if (ParseTypeAndValue(Val0, Loc0, PFS) ||
+ ParseToken(lltok::comma, "expected comma after insertvalue operand") ||
+ ParseTypeAndValue(Val1, Loc1, PFS) ||
+ ParseIndexList(Indices))
+ return true;
+
+ if (!isa<StructType>(Val0->getType()) && !isa<ArrayType>(Val0->getType()))
+ return Error(Loc0, "extractvalue operand must be array or struct");
+
+ if (!ExtractValueInst::getIndexedType(Val0->getType(), Indices.begin(),
+ Indices.end()))
+ return Error(Loc0, "invalid indices for insertvalue");
+ Inst = InsertValueInst::Create(Val0, Val1, Indices.begin(), Indices.end());
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Embedded metadata.
+//===----------------------------------------------------------------------===//
+
+/// ParseMDNodeVector
+/// ::= Element (',' Element)*
+/// Element
+/// ::= 'null' | TypeAndValue
+bool LLParser::ParseMDNodeVector(SmallVectorImpl<Value*> &Elts) {
+ assert(Lex.getKind() == lltok::lbrace);
+ Lex.Lex();
+ do {
+ Value *V;
+ if (Lex.getKind() == lltok::kw_null) {
+ Lex.Lex();
+ V = 0;
+ } else {
+ Constant *C;
+ if (ParseGlobalTypeAndValue(C)) return true;
+ V = C;
+ }
+ Elts.push_back(V);
+ } while (EatIfPresent(lltok::comma));
+
+ return false;
+}
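+
+// For example (hypothetical input): !{i32 4, null}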
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
new file mode 100644
index 0000000..7106689
--- /dev/null
+++ b/lib/AsmParser/LLParser.h
@@ -0,0 +1,276 @@
+//===-- LLParser.h - Parser Class -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the parser class for .ll files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ASMPARSER_LLPARSER_H
+#define LLVM_ASMPARSER_LLPARSER_H
+
+#include "LLLexer.h"
+#include "llvm/Type.h"
+#include <map>
+
+namespace llvm {
+ class Module;
+ class OpaqueType;
+ class Function;
+ class Value;
+ class BasicBlock;
+ class Instruction;
+ class Constant;
+ class GlobalValue;
+ class MDString;
+ class MDNode;
+ struct ValID;
+
+ class LLParser {
+ public:
+ typedef LLLexer::LocTy LocTy;
+ private:
+
+ LLLexer Lex;
+ Module *M;
+
+ // Type resolution handling data structures.
+ std::map<std::string, std::pair<PATypeHolder, LocTy> > ForwardRefTypes;
+ std::map<unsigned, std::pair<PATypeHolder, LocTy> > ForwardRefTypeIDs;
+ std::vector<PATypeHolder> NumberedTypes;
+
+ struct UpRefRecord {
+ /// Loc - This is the location of the upref.
+ LocTy Loc;
+
+ /// NestingLevel - The number of nesting levels that need to be popped
+ /// before this type is resolved.
+ unsigned NestingLevel;
+
+ /// LastContainedTy - This is the type at the current binding level for
+ /// the type. Every time we reduce the nesting level, this gets updated.
+ const Type *LastContainedTy;
+
+ /// UpRefTy - This is the actual opaque type that the upreference is
+ /// represented with.
+ OpaqueType *UpRefTy;
+
+ UpRefRecord(LocTy L, unsigned NL, OpaqueType *URTy)
+ : Loc(L), NestingLevel(NL), LastContainedTy((Type*)URTy),
+ UpRefTy(URTy) {}
+ };
+ std::vector<UpRefRecord> UpRefs;
+
+ // Global Value reference information.
+ std::map<std::string, std::pair<GlobalValue*, LocTy> > ForwardRefVals;
+ std::map<unsigned, std::pair<GlobalValue*, LocTy> > ForwardRefValIDs;
+ std::vector<GlobalValue*> NumberedVals;
+ public:
+ LLParser(MemoryBuffer *F, ParseError &Err, Module *m) : Lex(F, Err), M(m) {}
+ bool Run();
+
+ private:
+
+ bool Error(LocTy L, const std::string &Msg) const {
+ return Lex.Error(L, Msg);
+ }
+ bool TokError(const std::string &Msg) const {
+ return Error(Lex.getLoc(), Msg);
+ }
+
+ /// GetGlobalVal - Get a value with the specified name or ID, creating a
+ /// forward reference record if needed. This can return null if the value
+ /// exists but does not have the right type.
+ GlobalValue *GetGlobalVal(const std::string &N, const Type *Ty, LocTy Loc);
+ GlobalValue *GetGlobalVal(unsigned ID, const Type *Ty, LocTy Loc);
+
+ // Helper Routines.
+ bool ParseToken(lltok::Kind T, const char *ErrMsg);
+ bool EatIfPresent(lltok::Kind T) {
+ if (Lex.getKind() != T) return false;
+ Lex.Lex();
+ return true;
+ }
+ bool ParseOptionalToken(lltok::Kind T, bool &Present) {
+ if (Lex.getKind() != T) {
+ Present = false;
+ } else {
+ Lex.Lex();
+ Present = true;
+ }
+ return false;
+ }
+ bool ParseStringConstant(std::string &Result);
+ bool ParseUInt32(unsigned &Val);
+ bool ParseUInt32(unsigned &Val, LocTy &Loc) {
+ Loc = Lex.getLoc();
+ return ParseUInt32(Val);
+ }
+ bool ParseOptionalAddrSpace(unsigned &AddrSpace);
+ bool ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind);
+ bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage);
+ bool ParseOptionalLinkage(unsigned &Linkage) {
+ bool HasLinkage; return ParseOptionalLinkage(Linkage, HasLinkage);
+ }
+ bool ParseOptionalVisibility(unsigned &Visibility);
+ bool ParseOptionalCallingConv(unsigned &CC);
+ bool ParseOptionalAlignment(unsigned &Alignment);
+ bool ParseOptionalCommaAlignment(unsigned &Alignment);
+ bool ParseIndexList(SmallVectorImpl<unsigned> &Indices);
+
+ // Top-Level Entities
+ bool ParseTopLevelEntities();
+ bool ValidateEndOfModule();
+ bool ParseTargetDefinition();
+ bool ParseDepLibs();
+ bool ParseModuleAsm();
+ bool ParseUnnamedType();
+ bool ParseNamedType();
+ bool ParseDeclare();
+ bool ParseDefine();
+
+ bool ParseGlobalType(bool &IsConstant);
+ bool ParseNamedGlobal();
+ bool ParseGlobal(const std::string &Name, LocTy Loc, unsigned Linkage,
+ bool HasLinkage, unsigned Visibility);
+ bool ParseAlias(const std::string &Name, LocTy Loc, unsigned Visibility);
+
+ // Type Parsing.
+ bool ParseType(PATypeHolder &Result, bool AllowVoid = false);
+ bool ParseType(PATypeHolder &Result, LocTy &Loc, bool AllowVoid = false) {
+ Loc = Lex.getLoc();
+ return ParseType(Result, AllowVoid);
+ }
+ bool ParseTypeRec(PATypeHolder &H);
+ bool ParseStructType(PATypeHolder &H, bool Packed);
+ bool ParseArrayVectorType(PATypeHolder &H, bool isVector);
+ bool ParseFunctionType(PATypeHolder &Result);
+ PATypeHolder HandleUpRefs(const Type *Ty);
+
+ // Constants.
+ bool ParseValID(ValID &ID);
+ bool ConvertGlobalValIDToValue(const Type *Ty, ValID &ID, Constant *&V);
+ bool ParseGlobalValue(const Type *Ty, Constant *&V);
+ bool ParseGlobalTypeAndValue(Constant *&V);
+ bool ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts);
+ bool ParseMDNodeVector(SmallVectorImpl<Value*> &);
+
+
+ // Function Semantic Analysis.
+ class PerFunctionState {
+ LLParser &P;
+ Function &F;
+ std::map<std::string, std::pair<Value*, LocTy> > ForwardRefVals;
+ std::map<unsigned, std::pair<Value*, LocTy> > ForwardRefValIDs;
+ std::vector<Value*> NumberedVals;
+ public:
+ PerFunctionState(LLParser &p, Function &f);
+ ~PerFunctionState();
+
+ Function &getFunction() const { return F; }
+
+ bool VerifyFunctionComplete();
+
+ /// GetVal - Get a value with the specified name or ID, creating a
+ /// forward reference record if needed. This can return null if the value
+ /// exists but does not have the right type.
+ Value *GetVal(const std::string &Name, const Type *Ty, LocTy Loc);
+ Value *GetVal(unsigned ID, const Type *Ty, LocTy Loc);
+
+ /// SetInstName - After an instruction is parsed and inserted into its
+ /// basic block, this installs its name.
+ bool SetInstName(int NameID, const std::string &NameStr, LocTy NameLoc,
+ Instruction *Inst);
+
+ /// GetBB - Get a basic block with the specified name or ID, creating a
+ /// forward reference record if needed. This can return null if the value
+ /// is not a BasicBlock.
+ BasicBlock *GetBB(const std::string &Name, LocTy Loc);
+ BasicBlock *GetBB(unsigned ID, LocTy Loc);
+
+ /// DefineBB - Define the specified basic block, which is either named or
+ /// unnamed. If there is an error, this returns null otherwise it returns
+ /// the block being defined.
+ BasicBlock *DefineBB(const std::string &Name, LocTy Loc);
+ };
+
+ bool ConvertValIDToValue(const Type *Ty, ValID &ID, Value *&V,
+ PerFunctionState &PFS);
+
+ bool ParseValue(const Type *Ty, Value *&V, PerFunctionState &PFS);
+ bool ParseValue(const Type *Ty, Value *&V, LocTy &Loc,
+ PerFunctionState &PFS) {
+ Loc = Lex.getLoc();
+ return ParseValue(Ty, V, PFS);
+ }
+
+ bool ParseTypeAndValue(Value *&V, PerFunctionState &PFS);
+ bool ParseTypeAndValue(Value *&V, LocTy &Loc, PerFunctionState &PFS) {
+ Loc = Lex.getLoc();
+ return ParseTypeAndValue(V, PFS);
+ }
+
+ struct ParamInfo {
+ LocTy Loc;
+ Value *V;
+ unsigned Attrs;
+ ParamInfo(LocTy loc, Value *v, unsigned attrs)
+ : Loc(loc), V(v), Attrs(attrs) {}
+ };
+ bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
+ PerFunctionState &PFS);
+
+ // Function Parsing.
+ struct ArgInfo {
+ LocTy Loc;
+ PATypeHolder Type;
+ unsigned Attrs;
+ std::string Name;
+ ArgInfo(LocTy L, PATypeHolder Ty, unsigned Attr, const std::string &N)
+ : Loc(L), Type(Ty), Attrs(Attr), Name(N) {}
+ };
+ bool ParseArgumentList(std::vector<ArgInfo> &ArgList,
+ bool &isVarArg, bool inType);
+ bool ParseFunctionHeader(Function *&Fn, bool isDefine);
+ bool ParseFunctionBody(Function &Fn);
+ bool ParseBasicBlock(PerFunctionState &PFS);
+
+ // Instruction Parsing.
+ bool ParseInstruction(Instruction *&Inst, BasicBlock *BB,
+ PerFunctionState &PFS);
+ bool ParseCmpPredicate(unsigned &Pred, unsigned Opc);
+
+ bool ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS);
+ bool ParseBr(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseSwitch(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS);
+
+ bool ParseArithmetic(Instruction *&I, PerFunctionState &PFS, unsigned Opc,
+ unsigned OperandType);
+ bool ParseLogical(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
+ bool ParseCompare(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
+ bool ParseCast(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
+ bool ParseSelect(Instruction *&I, PerFunctionState &PFS);
+ bool ParseVA_Arg(Instruction *&I, PerFunctionState &PFS);
+ bool ParseExtractElement(Instruction *&I, PerFunctionState &PFS);
+ bool ParseInsertElement(Instruction *&I, PerFunctionState &PFS);
+ bool ParseShuffleVector(Instruction *&I, PerFunctionState &PFS);
+ bool ParsePHI(Instruction *&I, PerFunctionState &PFS);
+ bool ParseCall(Instruction *&I, PerFunctionState &PFS, bool isTail);
+ bool ParseAlloc(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
+ bool ParseFree(Instruction *&I, PerFunctionState &PFS);
+ bool ParseLoad(Instruction *&I, PerFunctionState &PFS, bool isVolatile);
+ bool ParseStore(Instruction *&I, PerFunctionState &PFS, bool isVolatile);
+ bool ParseGetResult(Instruction *&I, PerFunctionState &PFS);
+ bool ParseGetElementPtr(Instruction *&I, PerFunctionState &PFS);
+ bool ParseExtractValue(Instruction *&I, PerFunctionState &PFS);
+ bool ParseInsertValue(Instruction *&I, PerFunctionState &PFS);
+ };
+} // End llvm namespace
+
+#endif
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
new file mode 100644
index 0000000..d8bd38a
--- /dev/null
+++ b/lib/AsmParser/LLToken.h
@@ -0,0 +1,130 @@
+//===- LLToken.h - Token Codes for LLVM Assembly Files ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the enums for the .ll lexer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBS_ASMPARSER_LLTOKEN_H
+#define LIBS_ASMPARSER_LLTOKEN_H
+
+namespace llvm {
+namespace lltok {
+ enum Kind {
+ // Markers
+ Eof, Error,
+
+ // Tokens with no info.
+ dotdotdot, // ...
+ equal, comma, // = ,
+ star, // *
+ lsquare, rsquare, // [ ]
+ lbrace, rbrace, // { }
+ less, greater, // < >
+ lparen, rparen, // ( )
+ backslash, // \ (not /)
+
+ kw_x,
+ kw_begin, kw_end,
+ kw_true, kw_false,
+ kw_declare, kw_define,
+ kw_global, kw_constant,
+
+ kw_private, kw_internal, kw_linkonce, kw_linkonce_odr, kw_weak, kw_weak_odr,
+ kw_appending, kw_dllimport, kw_dllexport, kw_common,
+ kw_available_externally,
+ kw_default, kw_hidden, kw_protected,
+ kw_extern_weak,
+ kw_external, kw_thread_local,
+ kw_zeroinitializer,
+ kw_undef, kw_null,
+ kw_to,
+ kw_tail,
+ kw_target,
+ kw_triple,
+ kw_deplibs,
+ kw_datalayout,
+ kw_volatile,
+ kw_align,
+ kw_addrspace,
+ kw_section,
+ kw_alias,
+ kw_module,
+ kw_asm,
+ kw_sideeffect,
+ kw_gc,
+ kw_c,
+
+ kw_cc, kw_ccc, kw_fastcc, kw_coldcc, kw_x86_stdcallcc, kw_x86_fastcallcc,
+
+ kw_signext,
+ kw_zeroext,
+ kw_inreg,
+ kw_sret,
+ kw_nounwind,
+ kw_noreturn,
+ kw_noalias,
+ kw_nocapture,
+ kw_byval,
+ kw_nest,
+ kw_readnone,
+ kw_readonly,
+
+ kw_noinline,
+ kw_alwaysinline,
+ kw_optsize,
+ kw_ssp,
+ kw_sspreq,
+
+ kw_type,
+ kw_opaque,
+
+ kw_eq, kw_ne, kw_slt, kw_sgt, kw_sle, kw_sge, kw_ult, kw_ugt, kw_ule,
+ kw_uge, kw_oeq, kw_one, kw_olt, kw_ogt, kw_ole, kw_oge, kw_ord, kw_uno,
+ kw_ueq, kw_une,
+
+ // Instruction Opcodes (Opcode in UIntVal).
+ kw_add, kw_sub, kw_mul, kw_udiv, kw_sdiv, kw_fdiv,
+ kw_urem, kw_srem, kw_frem, kw_shl, kw_lshr, kw_ashr,
+ kw_and, kw_or, kw_xor, kw_icmp, kw_fcmp, kw_vicmp, kw_vfcmp,
+
+ kw_phi, kw_call,
+ kw_trunc, kw_zext, kw_sext, kw_fptrunc, kw_fpext, kw_uitofp, kw_sitofp,
+ kw_fptoui, kw_fptosi, kw_inttoptr, kw_ptrtoint, kw_bitcast,
+ kw_select, kw_va_arg,
+
+ kw_ret, kw_br, kw_switch, kw_invoke, kw_unwind, kw_unreachable,
+
+ kw_malloc, kw_alloca, kw_free, kw_load, kw_store, kw_getelementptr,
+
+ kw_extractelement, kw_insertelement, kw_shufflevector, kw_getresult,
+ kw_extractvalue, kw_insertvalue,
+
+ // Unsigned Valued tokens (UIntVal).
+ GlobalID, // @42
+ LocalVarID, // %42
+
+ // String valued tokens (StrVal).
+ LabelStr, // foo:
+ GlobalVar, // @foo @"foo"
+ LocalVar, // %foo %"foo"
+ StringConstant, // "foo"
+
+ // Metadata valued tokens.
+ Metadata, // !"foo" !{i8 42}
+
+ // Type valued tokens (TyVal).
+ Type,
+
+ APFloat, // APFloatVal
+ APSInt // APSInt
+ };
+} // end namespace lltok
+} // end namespace llvm
+
+#endif
diff --git a/lib/AsmParser/Makefile b/lib/AsmParser/Makefile
new file mode 100644
index 0000000..995bb0e
--- /dev/null
+++ b/lib/AsmParser/Makefile
@@ -0,0 +1,14 @@
+##===- lib/AsmParser/Makefile ------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME := LLVMAsmParser
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp
new file mode 100644
index 0000000..759e00e
--- /dev/null
+++ b/lib/AsmParser/Parser.cpp
@@ -0,0 +1,87 @@
+//===- Parser.cpp - Main dispatch module for the Parser library -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library implements the functionality defined in llvm/Assembly/Parser.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Assembly/Parser.h"
+#include "LLParser.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+using namespace llvm;
+
+Module *llvm::ParseAssemblyFile(const std::string &Filename, ParseError &Err) {
+ Err.setFilename(Filename);
+
+ std::string ErrorStr;
+ OwningPtr<MemoryBuffer>
+ F(MemoryBuffer::getFileOrSTDIN(Filename.c_str(), &ErrorStr));
+ if (F == 0) {
+ Err.setError("Could not open input file '" + Filename + "'");
+ return 0;
+ }
+
+ OwningPtr<Module> M(new Module(Filename));
+ if (LLParser(F.get(), Err, M.get()).Run())
+ return 0;
+ return M.take();
+}
+
+Module *llvm::ParseAssemblyString(const char *AsmString, Module *M,
+ ParseError &Err) {
+ Err.setFilename("<string>");
+
+ OwningPtr<MemoryBuffer>
+ F(MemoryBuffer::getMemBuffer(AsmString, AsmString+strlen(AsmString),
+ "<string>"));
+
+ // If we are parsing into an existing module, do it.
+ if (M)
+ return LLParser(F.get(), Err, M).Run() ? 0 : M;
+
+ // Otherwise create a new module.
+ OwningPtr<Module> M2(new Module("<string>"));
+ if (LLParser(F.get(), Err, M2.get()).Run())
+ return 0;
+ return M2.take();
+}
+
+
+//===----------------------------------------------------------------------===//
+// ParseError Class
+//===----------------------------------------------------------------------===//
+
+void ParseError::PrintError(const char *ProgName, raw_ostream &S) {
+ S << ProgName << ": ";
+ if (Filename == "-")
+ S << "<stdin>";
+ else
+ S << Filename;
+
+ if (LineNo != -1) {
+ S << ':' << LineNo;
+ if (ColumnNo != -1)
+ S << ':' << (ColumnNo+1);
+ }
+
+ S << ": " << Message << '\n';
+
+ if (LineNo != -1 && ColumnNo != -1) {
+ S << LineContents << '\n';
+
+ // Print out spaces/tabs before the caret.
+ for (unsigned i = 0; i != unsigned(ColumnNo); ++i)
+ S << (LineContents[i] == '\t' ? '\t' : ' ');
+ S << "^\n";
+ }
+}
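+
+// Sample diagnostic for a hypothetical parse failure in 'bad.ll', as printed
+// by a client such as llvm-as:
+//   llvm-as: bad.ll:3:14: expected ',' after store operand
+//   store i32 %v i32* %p
+//                ^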
diff --git a/lib/Bitcode/Makefile b/lib/Bitcode/Makefile
new file mode 100644
index 0000000..2d6b5ad
--- /dev/null
+++ b/lib/Bitcode/Makefile
@@ -0,0 +1,14 @@
+##===- lib/Bitcode/Makefile --------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+PARALLEL_DIRS = Reader Writer
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Bitcode/Reader/BitReader.cpp b/lib/Bitcode/Reader/BitReader.cpp
new file mode 100644
index 0000000..52851cd
--- /dev/null
+++ b/lib/Bitcode/Reader/BitReader.cpp
@@ -0,0 +1,51 @@
+//===-- BitReader.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/BitReader.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <string>
+#include <cstring>
+
+using namespace llvm;
+
+/* Builds a module from the bitcode in the specified memory buffer, returning a
+ reference to the module via the OutModule parameter. Returns 0 on success.
+ Optionally returns a human-readable error message via OutMessage. */
+int LLVMParseBitcode(LLVMMemoryBufferRef MemBuf,
+ LLVMModuleRef *OutModule, char **OutMessage) {
+ std::string Message;
+
+ *OutModule = wrap(ParseBitcodeFile(unwrap(MemBuf), &Message));
+ if (!*OutModule) {
+ if (OutMessage)
+ *OutMessage = strdup(Message.c_str());
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Builds a lazily-deserializing module provider from the bitcode in the
+   specified memory buffer, returning a reference to it via the OutMP
+   parameter. Returns 0 on success. Optionally returns a human-readable
+   error message via OutMessage. */
+int LLVMGetBitcodeModuleProvider(LLVMMemoryBufferRef MemBuf,
+ LLVMModuleProviderRef *OutMP,
+ char **OutMessage) {
+ std::string Message;
+
+ *OutMP = wrap(getBitcodeModuleProvider(unwrap(MemBuf), &Message));
+ if (!*OutMP) {
+ if (OutMessage)
+ *OutMessage = strdup(Message.c_str());
+ return 1;
+ }
+
+ return 0;
+}
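+
+/* A minimal usage sketch from C (error handling abbreviated; assumes MemBuf
+   was created with, e.g., LLVMCreateMemoryBufferWithContentsOfFile):
+
+     LLVMModuleRef M;
+     char *Msg;
+     if (LLVMParseBitcode(MemBuf, &M, &Msg)) {
+       fprintf(stderr, "bitcode error: %s\n", Msg);
+       free(Msg);
+     }
+*/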
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
new file mode 100644
index 0000000..1dad04b
--- /dev/null
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -0,0 +1,2126 @@
+//===- BitcodeReader.cpp - Internal BitcodeReader implementation ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BitcodeReader class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "BitcodeReader.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/MDNode.h"
+#include "llvm/Module.h"
+#include "llvm/AutoUpgrade.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/OperandTraits.h"
+using namespace llvm;
+
+void BitcodeReader::FreeState() {
+ delete Buffer;
+ Buffer = 0;
+ std::vector<PATypeHolder>().swap(TypeList);
+ ValueList.clear();
+
+ std::vector<AttrListPtr>().swap(MAttributes);
+ std::vector<BasicBlock*>().swap(FunctionBBs);
+ std::vector<Function*>().swap(FunctionsWithBodies);
+ DeferredFunctionInfo.clear();
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions to implement forward reference resolution, etc.
+//===----------------------------------------------------------------------===//
+
+/// ConvertToString - Append the characters of a record to a string object
+/// (e.g. std::string or SmallString), returning true on failure.
+template<typename StrTy>
+static bool ConvertToString(SmallVector<uint64_t, 64> &Record, unsigned Idx,
+ StrTy &Result) {
+ if (Idx > Record.size())
+ return true;
+
+ for (unsigned i = Idx, e = Record.size(); i != e; ++i)
+ Result += (char)Record[i];
+ return false;
+}
+
+static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) {
+ switch (Val) {
+ default: // Map unknown/new linkages to external
+ case 0: return GlobalValue::ExternalLinkage;
+ case 1: return GlobalValue::WeakAnyLinkage;
+ case 2: return GlobalValue::AppendingLinkage;
+ case 3: return GlobalValue::InternalLinkage;
+ case 4: return GlobalValue::LinkOnceAnyLinkage;
+ case 5: return GlobalValue::DLLImportLinkage;
+ case 6: return GlobalValue::DLLExportLinkage;
+ case 7: return GlobalValue::ExternalWeakLinkage;
+ case 8: return GlobalValue::CommonLinkage;
+ case 9: return GlobalValue::PrivateLinkage;
+ case 10: return GlobalValue::WeakODRLinkage;
+ case 11: return GlobalValue::LinkOnceODRLinkage;
+ case 12: return GlobalValue::AvailableExternallyLinkage;
+ }
+}
+
+static GlobalValue::VisibilityTypes GetDecodedVisibility(unsigned Val) {
+ switch (Val) {
+ default: // Map unknown visibilities to default.
+ case 0: return GlobalValue::DefaultVisibility;
+ case 1: return GlobalValue::HiddenVisibility;
+ case 2: return GlobalValue::ProtectedVisibility;
+ }
+}
+
+static int GetDecodedCastOpcode(unsigned Val) {
+ switch (Val) {
+ default: return -1;
+ case bitc::CAST_TRUNC : return Instruction::Trunc;
+ case bitc::CAST_ZEXT : return Instruction::ZExt;
+ case bitc::CAST_SEXT : return Instruction::SExt;
+ case bitc::CAST_FPTOUI : return Instruction::FPToUI;
+ case bitc::CAST_FPTOSI : return Instruction::FPToSI;
+ case bitc::CAST_UITOFP : return Instruction::UIToFP;
+ case bitc::CAST_SITOFP : return Instruction::SIToFP;
+ case bitc::CAST_FPTRUNC : return Instruction::FPTrunc;
+ case bitc::CAST_FPEXT : return Instruction::FPExt;
+ case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
+ case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
+ case bitc::CAST_BITCAST : return Instruction::BitCast;
+ }
+}
+static int GetDecodedBinaryOpcode(unsigned Val, const Type *Ty) {
+ switch (Val) {
+ default: return -1;
+ case bitc::BINOP_ADD: return Instruction::Add;
+ case bitc::BINOP_SUB: return Instruction::Sub;
+ case bitc::BINOP_MUL: return Instruction::Mul;
+ case bitc::BINOP_UDIV: return Instruction::UDiv;
+ case bitc::BINOP_SDIV:
+ return Ty->isFPOrFPVector() ? Instruction::FDiv : Instruction::SDiv;
+ case bitc::BINOP_UREM: return Instruction::URem;
+ case bitc::BINOP_SREM:
+ return Ty->isFPOrFPVector() ? Instruction::FRem : Instruction::SRem;
+ case bitc::BINOP_SHL: return Instruction::Shl;
+ case bitc::BINOP_LSHR: return Instruction::LShr;
+ case bitc::BINOP_ASHR: return Instruction::AShr;
+ case bitc::BINOP_AND: return Instruction::And;
+ case bitc::BINOP_OR: return Instruction::Or;
+ case bitc::BINOP_XOR: return Instruction::Xor;
+ }
+}
+
+namespace llvm {
+namespace {
+ /// @brief A placeholder constant that occupies a slot until the actual
+ /// definition of a forward-referenced constant has been read.
+ class ConstantPlaceHolder : public ConstantExpr {
+ ConstantPlaceHolder(); // DO NOT IMPLEMENT
+ void operator=(const ConstantPlaceHolder &); // DO NOT IMPLEMENT
+ public:
+ // allocate space for exactly one operand
+ void *operator new(size_t s) {
+ return User::operator new(s, 1);
+ }
+ explicit ConstantPlaceHolder(const Type *Ty)
+ : ConstantExpr(Ty, Instruction::UserOp1, &Op<0>(), 1) {
+ Op<0>() = UndefValue::get(Type::Int32Ty);
+ }
+
+ /// @brief Methods to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const ConstantPlaceHolder *) { return true; }
+ static bool classof(const Value *V) {
+ return isa<ConstantExpr>(V) &&
+ cast<ConstantExpr>(V)->getOpcode() == Instruction::UserOp1;
+ }
+
+
+ /// Provide fast operand accessors
+ //DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+ };
+}
+
+// FIXME: can we inherit this from ConstantExpr?
+template <>
+struct OperandTraits<ConstantPlaceHolder> : FixedNumOperandTraits<1> {
+};
+}
+
+
+void BitcodeReaderValueList::AssignValue(Value *V, unsigned Idx) {
+ if (Idx == size()) {
+ push_back(V);
+ return;
+ }
+
+ if (Idx >= size())
+ resize(Idx+1);
+
+ WeakVH &OldV = ValuePtrs[Idx];
+ if (OldV == 0) {
+ OldV = V;
+ return;
+ }
+
+ // Handle constants and non-constants (e.g. instrs) differently for
+ // efficiency.
+ if (Constant *PHC = dyn_cast<Constant>(&*OldV)) {
+ ResolveConstants.push_back(std::make_pair(PHC, Idx));
+ OldV = V;
+ } else {
+ // If there was a forward reference to this value, replace it.
+ Value *PrevVal = OldV;
+ OldV->replaceAllUsesWith(V);
+ delete PrevVal;
+ }
+}
+
+
+Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx,
+ const Type *Ty) {
+ if (Idx >= size())
+ resize(Idx + 1);
+
+ if (Value *V = ValuePtrs[Idx]) {
+ assert(Ty == V->getType() && "Type mismatch in constant table!");
+ return cast<Constant>(V);
+ }
+
+ // Create and return a placeholder, which will later be RAUW'd.
+ Constant *C = new ConstantPlaceHolder(Ty);
+ ValuePtrs[Idx] = C;
+ return C;
+}
+
+Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, const Type *Ty) {
+ if (Idx >= size())
+ resize(Idx + 1);
+
+ if (Value *V = ValuePtrs[Idx]) {
+ assert((Ty == 0 || Ty == V->getType()) && "Type mismatch in value table!");
+ return V;
+ }
+
+ // No type specified, must be invalid reference.
+ if (Ty == 0) return 0;
+
+ // Create and return a placeholder, which will later be RAUW'd.
+ Value *V = new Argument(Ty);
+ ValuePtrs[Idx] = V;
+ return V;
+}
+
+/// ResolveConstantForwardRefs - Once all constants are read, this method bulk
+/// resolves any forward references. The idea behind this is that we sometimes
+/// get constants (such as large arrays) which reference *many* forward ref
+/// constants. Replacing each of these causes a lot of thrashing when
+/// building/reuniquing the constant. Instead of doing this, we look at all the
+/// uses and rewrite all the place holders at once for any constant that uses
+/// a placeholder.
+void BitcodeReaderValueList::ResolveConstantForwardRefs() {
+ // Sort the values by-pointer so that they are efficient to look up with a
+ // binary search.
+ std::sort(ResolveConstants.begin(), ResolveConstants.end());
+
+ SmallVector<Constant*, 64> NewOps;
+
+ while (!ResolveConstants.empty()) {
+ Value *RealVal = operator[](ResolveConstants.back().second);
+ Constant *Placeholder = ResolveConstants.back().first;
+ ResolveConstants.pop_back();
+
+ // Loop over all users of the placeholder, updating them to reference the
+ // new value. If they reference more than one placeholder, update them all
+ // at once.
+ while (!Placeholder->use_empty()) {
+ Value::use_iterator UI = Placeholder->use_begin();
+
+ // If the using object isn't uniqued, just update the operands. This
+ // handles instructions and initializers for global variables.
+ if (!isa<Constant>(*UI) || isa<GlobalValue>(*UI)) {
+ UI.getUse().set(RealVal);
+ continue;
+ }
+
+ // Otherwise, we have a constant that uses the placeholder. Replace that
+ // constant with a new constant that has *all* placeholder uses updated.
+ Constant *UserC = cast<Constant>(*UI);
+ for (User::op_iterator I = UserC->op_begin(), E = UserC->op_end();
+ I != E; ++I) {
+ Value *NewOp;
+ if (!isa<ConstantPlaceHolder>(*I)) {
+ // Not a placeholder reference.
+ NewOp = *I;
+ } else if (*I == Placeholder) {
+ // Common case is that it just references this one placeholder.
+ NewOp = RealVal;
+ } else {
+ // Otherwise, look up the placeholder in ResolveConstants.
+ ResolveConstantsTy::iterator It =
+ std::lower_bound(ResolveConstants.begin(), ResolveConstants.end(),
+ std::pair<Constant*, unsigned>(cast<Constant>(*I),
+ 0));
+ assert(It != ResolveConstants.end() && It->first == *I);
+ NewOp = operator[](It->second);
+ }
+
+ NewOps.push_back(cast<Constant>(NewOp));
+ }
+
+ // Make the new constant.
+ Constant *NewC;
+ if (ConstantArray *UserCA = dyn_cast<ConstantArray>(UserC)) {
+ NewC = ConstantArray::get(UserCA->getType(), &NewOps[0], NewOps.size());
+ } else if (ConstantStruct *UserCS = dyn_cast<ConstantStruct>(UserC)) {
+ NewC = ConstantStruct::get(&NewOps[0], NewOps.size(),
+ UserCS->getType()->isPacked());
+ } else if (isa<ConstantVector>(UserC)) {
+ NewC = ConstantVector::get(&NewOps[0], NewOps.size());
+ } else {
+ assert(isa<ConstantExpr>(UserC) && "Must be a ConstantExpr.");
+ NewC = cast<ConstantExpr>(UserC)->getWithOperands(&NewOps[0],
+ NewOps.size());
+ }
+
+ UserC->replaceAllUsesWith(NewC);
+ UserC->destroyConstant();
+ NewOps.clear();
+ }
+
+ // Update all ValueHandles, they should be the only users at this point.
+ Placeholder->replaceAllUsesWith(RealVal);
+ delete Placeholder;
+ }
+}
+
+
+const Type *BitcodeReader::getTypeByID(unsigned ID, bool isTypeTable) {
+ // If the TypeID is in range, return it.
+ if (ID < TypeList.size())
+ return TypeList[ID].get();
+ if (!isTypeTable) return 0;
+
+ // The type table allows forward references. Push as many Opaque types as
+ // needed to get up to ID.
+ while (TypeList.size() <= ID)
+ TypeList.push_back(OpaqueType::get());
+ return TypeList.back().get();
+}
+
+//===----------------------------------------------------------------------===//
+// Functions for parsing blocks from the bitcode file
+//===----------------------------------------------------------------------===//
+
+bool BitcodeReader::ParseAttributeBlock() {
+ if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID))
+ return Error("Malformed block record");
+
+ if (!MAttributes.empty())
+ return Error("Multiple PARAMATTR blocks found!");
+
+ SmallVector<uint64_t, 64> Record;
+
+ SmallVector<AttributeWithIndex, 8> Attrs;
+
+ // Read all the records.
+ while (1) {
+ unsigned Code = Stream.ReadCode();
+ if (Code == bitc::END_BLOCK) {
+ if (Stream.ReadBlockEnd())
+ return Error("Error at end of PARAMATTR block");
+ return false;
+ }
+
+ if (Code == bitc::ENTER_SUBBLOCK) {
+ // No known subblocks, always skip them.
+ Stream.ReadSubBlockID();
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ continue;
+ }
+
+ if (Code == bitc::DEFINE_ABBREV) {
+ Stream.ReadAbbrevRecord();
+ continue;
+ }
+
+ // Read a record.
+ Record.clear();
+ switch (Stream.ReadRecord(Code, Record)) {
+ default: // Default behavior: ignore.
+ break;
+ case bitc::PARAMATTR_CODE_ENTRY: { // ENTRY: [paramidx0, attr0, ...]
+ if (Record.size() & 1)
+ return Error("Invalid ENTRY record");
+
+ // FIXME: Remove this autoupgrade code in LLVM 3.0.
+ // If function attributes were stored at index 0, transfer them to
+ // index ~0: index 0 is now used for return-value attributes but was
+ // formerly used for function attributes.
+ Attributes RetAttribute = Attribute::None;
+ Attributes FnAttribute = Attribute::None;
+ for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
+ // FIXME: remove in LLVM 3.0
+ // The alignment is stored as a 16-bit raw value in bits 31-16.
+ // Attribute bits 47-32 are shifted down by 11 bits.
+
+ unsigned Alignment = (Record[i+1] & (0xffffull << 16)) >> 16;
+ if (Alignment && !isPowerOf2_32(Alignment))
+ return Error("Alignment is not a power of two.");
+
+ Attributes ReconstitutedAttr = Record[i+1] & 0xffff;
+ if (Alignment)
+ ReconstitutedAttr |= Attribute::constructAlignmentFromInt(Alignment);
+ ReconstitutedAttr |= (Record[i+1] & (0xffffull << 32)) >> 11;
+ Record[i+1] = ReconstitutedAttr;
+
+ if (Record[i] == 0)
+ RetAttribute = Record[i+1];
+ else if (Record[i] == ~0U)
+ FnAttribute = Record[i+1];
+ }
+
+ unsigned OldRetAttrs = (Attribute::NoUnwind|Attribute::NoReturn|
+ Attribute::ReadOnly|Attribute::ReadNone);
+
+ if (FnAttribute == Attribute::None && RetAttribute != Attribute::None &&
+ (RetAttribute & OldRetAttrs) != 0) {
+ // Add a function-attribute slot so the transferred bits get added below.
+ Record.push_back(~0U);
+ Record.push_back(0);
+
+ FnAttribute |= RetAttribute & OldRetAttrs;
+ RetAttribute &= ~OldRetAttrs;
+ }
+
+ for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
+ if (Record[i] == 0) {
+ if (RetAttribute != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(0, RetAttribute));
+ } else if (Record[i] == ~0U) {
+ if (FnAttribute != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(~0U, FnAttribute));
+ } else if (Record[i+1] != Attribute::None)
+ Attrs.push_back(AttributeWithIndex::get(Record[i], Record[i+1]));
+ }
+
+ MAttributes.push_back(AttrListPtr::get(Attrs.begin(), Attrs.end()));
+ Attrs.clear();
+ break;
+ }
+ }
+ }
+}
+
+
+bool BitcodeReader::ParseTypeTable() {
+ if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID))
+ return Error("Malformed block record");
+
+ if (!TypeList.empty())
+ return Error("Multiple TYPE_BLOCKs found!");
+
+ SmallVector<uint64_t, 64> Record;
+ unsigned NumRecords = 0;
+
+ // Read all the records for this type table.
+ while (1) {
+ unsigned Code = Stream.ReadCode();
+ if (Code == bitc::END_BLOCK) {
+ if (NumRecords != TypeList.size())
+ return Error("Invalid type forward reference in TYPE_BLOCK");
+ if (Stream.ReadBlockEnd())
+ return Error("Error at end of type table block");
+ return false;
+ }
+
+ if (Code == bitc::ENTER_SUBBLOCK) {
+ // No known subblocks, always skip them.
+ Stream.ReadSubBlockID();
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ continue;
+ }
+
+ if (Code == bitc::DEFINE_ABBREV) {
+ Stream.ReadAbbrevRecord();
+ continue;
+ }
+
+ // Read a record.
+ Record.clear();
+ const Type *ResultTy = 0;
+ switch (Stream.ReadRecord(Code, Record)) {
+ default: // Default behavior: unknown type.
+ ResultTy = 0;
+ break;
+ case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries]
+ // TYPE_CODE_NUMENTRY contains a count of the number of types in the
+ // type list. This allows us to reserve space.
+ if (Record.size() < 1)
+ return Error("Invalid TYPE_CODE_NUMENTRY record");
+ TypeList.reserve(Record[0]);
+ continue;
+ case bitc::TYPE_CODE_VOID: // VOID
+ ResultTy = Type::VoidTy;
+ break;
+ case bitc::TYPE_CODE_FLOAT: // FLOAT
+ ResultTy = Type::FloatTy;
+ break;
+ case bitc::TYPE_CODE_DOUBLE: // DOUBLE
+ ResultTy = Type::DoubleTy;
+ break;
+ case bitc::TYPE_CODE_X86_FP80: // X86_FP80
+ ResultTy = Type::X86_FP80Ty;
+ break;
+ case bitc::TYPE_CODE_FP128: // FP128
+ ResultTy = Type::FP128Ty;
+ break;
+ case bitc::TYPE_CODE_PPC_FP128: // PPC_FP128
+ ResultTy = Type::PPC_FP128Ty;
+ break;
+ case bitc::TYPE_CODE_LABEL: // LABEL
+ ResultTy = Type::LabelTy;
+ break;
+ case bitc::TYPE_CODE_OPAQUE: // OPAQUE
+ ResultTy = 0;
+ break;
+ case bitc::TYPE_CODE_METADATA: // METADATA
+ ResultTy = Type::MetadataTy;
+ break;
+ case bitc::TYPE_CODE_INTEGER: // INTEGER: [width]
+ if (Record.size() < 1)
+ return Error("Invalid Integer type record");
+
+ ResultTy = IntegerType::get(Record[0]);
+ break;
+ case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or
+ // [pointee type, address space]
+ if (Record.size() < 1)
+ return Error("Invalid POINTER type record");
+ unsigned AddressSpace = 0;
+ if (Record.size() == 2)
+ AddressSpace = Record[1];
+ ResultTy = PointerType::get(getTypeByID(Record[0], true), AddressSpace);
+ break;
+ }
+ case bitc::TYPE_CODE_FUNCTION: {
+ // FIXME: attrid is dead, remove it in LLVM 3.0
+ // FUNCTION: [vararg, attrid, retty, paramty x N]
+ if (Record.size() < 3)
+ return Error("Invalid FUNCTION type record");
+ std::vector<const Type*> ArgTys;
+ for (unsigned i = 3, e = Record.size(); i != e; ++i)
+ ArgTys.push_back(getTypeByID(Record[i], true));
+
+ ResultTy = FunctionType::get(getTypeByID(Record[2], true), ArgTys,
+ Record[0]);
+ break;
+ }
+ case bitc::TYPE_CODE_STRUCT: { // STRUCT: [ispacked, eltty x N]
+ if (Record.size() < 1)
+ return Error("Invalid STRUCT type record");
+ std::vector<const Type*> EltTys;
+ for (unsigned i = 1, e = Record.size(); i != e; ++i)
+ EltTys.push_back(getTypeByID(Record[i], true));
+ ResultTy = StructType::get(EltTys, Record[0]);
+ break;
+ }
+ case bitc::TYPE_CODE_ARRAY: // ARRAY: [numelts, eltty]
+ if (Record.size() < 2)
+ return Error("Invalid ARRAY type record");
+ ResultTy = ArrayType::get(getTypeByID(Record[1], true), Record[0]);
+ break;
+ case bitc::TYPE_CODE_VECTOR: // VECTOR: [numelts, eltty]
+ if (Record.size() < 2)
+ return Error("Invalid VECTOR type record");
+ ResultTy = VectorType::get(getTypeByID(Record[1], true), Record[0]);
+ break;
+ }
+
+ if (NumRecords == TypeList.size()) {
+ // If this is a new type slot, just append it.
+ TypeList.push_back(ResultTy ? ResultTy : OpaqueType::get());
+ ++NumRecords;
+ } else if (ResultTy == 0) {
+ // Otherwise, this was forward referenced, so an opaque type was created,
+ // but the result type is actually just an opaque. Leave the one we
+ // created previously.
+ ++NumRecords;
+ } else {
+ // Otherwise, this was forward referenced, so an opaque type was created.
+ // Resolve the opaque type to the real type now.
+ assert(NumRecords < TypeList.size() && "Typelist imbalance");
+ const OpaqueType *OldTy = cast<OpaqueType>(TypeList[NumRecords++].get());
+
+ // Don't directly push the new type on the Tab. Instead we want to replace
+ // the opaque type we previously inserted with the new concrete value. The
+ // refinement from the abstract (opaque) type to the new type causes all
+ // uses of the abstract type to use the concrete type (NewTy). This will
+ // also cause the opaque type to be deleted.
+ const_cast<OpaqueType*>(OldTy)->refineAbstractTypeTo(ResultTy);
+
+ // This should have replaced the old opaque type with the new type in the
+ // value table... or with a preexisting type that was already in the
+ // system. Let's just make sure it did.
+ assert(TypeList[NumRecords-1].get() != OldTy &&
+ "refineAbstractType didn't work!");
+ }
+ }
+}
+
+
+bool BitcodeReader::ParseTypeSymbolTable() {
+ if (Stream.EnterSubBlock(bitc::TYPE_SYMTAB_BLOCK_ID))
+ return Error("Malformed block record");
+
+ SmallVector<uint64_t, 64> Record;
+
+ // Read all the records for this type table.
+ std::string TypeName;
+ while (1) {
+ unsigned Code = Stream.ReadCode();
+ if (Code == bitc::END_BLOCK) {
+ if (Stream.ReadBlockEnd())
+ return Error("Error at end of type symbol table block");
+ return false;
+ }
+
+ if (Code == bitc::ENTER_SUBBLOCK) {
+ // No known subblocks, always skip them.
+ Stream.ReadSubBlockID();
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ continue;
+ }
+
+ if (Code == bitc::DEFINE_ABBREV) {
+ Stream.ReadAbbrevRecord();
+ continue;
+ }
+
+ // Read a record.
+ Record.clear();
+ switch (Stream.ReadRecord(Code, Record)) {
+ default: // Default behavior: unknown type.
+ break;
+ case bitc::TST_CODE_ENTRY: // TST_ENTRY: [typeid, namechar x N]
+ if (ConvertToString(Record, 1, TypeName))
+ return Error("Invalid TST_ENTRY record");
+ unsigned TypeID = Record[0];
+ if (TypeID >= TypeList.size())
+ return Error("Invalid Type ID in TST_ENTRY record");
+
+ TheModule->addTypeName(TypeName, TypeList[TypeID].get());
+ TypeName.clear();
+ break;
+ }
+ }
+}
+
+bool BitcodeReader::ParseValueSymbolTable() {
+ if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
+ return Error("Malformed block record");
+
+ SmallVector<uint64_t, 64> Record;
+
+ // Read all the records for this value table.
+ SmallString<128> ValueName;
+ while (1) {
+ unsigned Code = Stream.ReadCode();
+ if (Code == bitc::END_BLOCK) {
+ if (Stream.ReadBlockEnd())
+ return Error("Error at end of value symbol table block");
+ return false;
+ }
+ if (Code == bitc::ENTER_SUBBLOCK) {
+ // No known subblocks, always skip them.
+ Stream.ReadSubBlockID();
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ continue;
+ }
+
+ if (Code == bitc::DEFINE_ABBREV) {
+ Stream.ReadAbbrevRecord();
+ continue;
+ }
+
+ // Read a record.
+ Record.clear();
+ switch (Stream.ReadRecord(Code, Record)) {
+ default: // Default behavior: unknown type.
+ break;
+ case bitc::VST_CODE_ENTRY: { // VST_ENTRY: [valueid, namechar x N]
+ if (ConvertToString(Record, 1, ValueName))
+ return Error("Invalid VST_ENTRY record");
+ unsigned ValueID = Record[0];
+ if (ValueID >= ValueList.size())
+ return Error("Invalid Value ID in VST_ENTRY record");
+ Value *V = ValueList[ValueID];
+
+ V->setName(&ValueName[0], ValueName.size());
+ ValueName.clear();
+ break;
+ }
+ case bitc::VST_CODE_BBENTRY: {
+ if (ConvertToString(Record, 1, ValueName))
+ return Error("Invalid VST_BBENTRY record");
+ BasicBlock *BB = getBasicBlock(Record[0]);
+ if (BB == 0)
+ return Error("Invalid BB ID in VST_BBENTRY record");
+
+ BB->setName(&ValueName[0], ValueName.size());
+ ValueName.clear();
+ break;
+ }
+ }
+ }
+}
+
+/// DecodeSignRotatedValue - Decode a signed value stored with the sign bit in
+/// the LSB for dense VBR encoding.
+static uint64_t DecodeSignRotatedValue(uint64_t V) {
+ if ((V & 1) == 0)
+ return V >> 1;
+ if (V != 1)
+ return -(V >> 1);
+ // There is no such thing as -0 with integers. "-0" really means MININT.
+ return 1ULL << 63;
+}
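+
+// Worked examples: 0 -> 0, 2 -> 1, 4 -> 2 (even encodings are non-negative);
+// 3 -> -1, 5 -> -2 (odd encodings decode to negatives); the reserved
+// encoding 1 decodes to the minimum value, 1ULL << 63.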
+
+/// ResolveGlobalAndAliasInits - Resolve all of the initializers for global
+/// values and aliases that we can.
+bool BitcodeReader::ResolveGlobalAndAliasInits() {
+ std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInitWorklist;
+ std::vector<std::pair<GlobalAlias*, unsigned> > AliasInitWorklist;
+
+ GlobalInitWorklist.swap(GlobalInits);
+ AliasInitWorklist.swap(AliasInits);
+
+ while (!GlobalInitWorklist.empty()) {
+ unsigned ValID = GlobalInitWorklist.back().second;
+ if (ValID >= ValueList.size()) {
+ // Not ready to resolve this yet, it requires something later in the file.
+ GlobalInits.push_back(GlobalInitWorklist.back());
+ } else {
+ if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
+ GlobalInitWorklist.back().first->setInitializer(C);
+ else
+ return Error("Global variable initializer is not a constant!");
+ }
+ GlobalInitWorklist.pop_back();
+ }
+
+ while (!AliasInitWorklist.empty()) {
+ unsigned ValID = AliasInitWorklist.back().second;
+ if (ValID >= ValueList.size()) {
+ AliasInits.push_back(AliasInitWorklist.back());
+ } else {
+ if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
+ AliasInitWorklist.back().first->setAliasee(C);
+ else
+ return Error("Alias initializer is not a constant!");
+ }
+ AliasInitWorklist.pop_back();
+ }
+ return false;
+}
+
+
+bool BitcodeReader::ParseConstants() {
+ if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
+ return Error("Malformed block record");
+
+ SmallVector<uint64_t, 64> Record;
+
+ // Read all the records for this value table.
+ const Type *CurTy = Type::Int32Ty;
+ unsigned NextCstNo = ValueList.size();
+ while (1) {
+ unsigned Code = Stream.ReadCode();
+ if (Code == bitc::END_BLOCK)
+ break;
+
+ if (Code == bitc::ENTER_SUBBLOCK) {
+ // No known subblocks, always skip them.
+ Stream.ReadSubBlockID();
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ continue;
+ }
+
+ if (Code == bitc::DEFINE_ABBREV) {
+ Stream.ReadAbbrevRecord();
+ continue;
+ }
+
+ // Read a record.
+ Record.clear();
+ Value *V = 0;
+ switch (Stream.ReadRecord(Code, Record)) {
+ default: // Default behavior: unknown constant
+ case bitc::CST_CODE_UNDEF: // UNDEF
+ V = UndefValue::get(CurTy);
+ break;
+ case bitc::CST_CODE_SETTYPE: // SETTYPE: [typeid]
+ if (Record.empty())
+ return Error("Malformed CST_SETTYPE record");
+ if (Record[0] >= TypeList.size())
+ return Error("Invalid Type ID in CST_SETTYPE record");
+ CurTy = TypeList[Record[0]];
+ continue; // Skip the ValueList manipulation.
+ case bitc::CST_CODE_NULL: // NULL
+ V = Constant::getNullValue(CurTy);
+ break;
+ case bitc::CST_CODE_INTEGER: // INTEGER: [intval]
+ if (!isa<IntegerType>(CurTy) || Record.empty())
+ return Error("Invalid CST_INTEGER record");
+ V = ConstantInt::get(CurTy, DecodeSignRotatedValue(Record[0]));
+ break;
+ case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
+ if (!isa<IntegerType>(CurTy) || Record.empty())
+ return Error("Invalid WIDE_INTEGER record");
+
+ unsigned NumWords = Record.size();
+ SmallVector<uint64_t, 8> Words;
+ Words.resize(NumWords);
+ for (unsigned i = 0; i != NumWords; ++i)
+ Words[i] = DecodeSignRotatedValue(Record[i]);
+ V = ConstantInt::get(APInt(cast<IntegerType>(CurTy)->getBitWidth(),
+ NumWords, &Words[0]));
+ break;
+ }
+ case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval]
+ if (Record.empty())
+ return Error("Invalid FLOAT record");
+ if (CurTy == Type::FloatTy)
+ V = ConstantFP::get(APFloat(APInt(32, (uint32_t)Record[0])));
+ else if (CurTy == Type::DoubleTy)
+ V = ConstantFP::get(APFloat(APInt(64, Record[0])));
+ else if (CurTy == Type::X86_FP80Ty) {
+ // Bits are not stored the same way as a normal i80 APInt, compensate.
+ uint64_t Rearrange[2];
+ Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16);
+ Rearrange[1] = Record[0] >> 48;
+ V = ConstantFP::get(APFloat(APInt(80, 2, Rearrange)));
+ } else if (CurTy == Type::FP128Ty)
+ V = ConstantFP::get(APFloat(APInt(128, 2, &Record[0]), true));
+ else if (CurTy == Type::PPC_FP128Ty)
+ V = ConstantFP::get(APFloat(APInt(128, 2, &Record[0])));
+ else
+ V = UndefValue::get(CurTy);
+ break;
+ }
+
+ case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number]
+ if (Record.empty())
+ return Error("Invalid CST_AGGREGATE record");
+
+ unsigned Size = Record.size();
+ std::vector<Constant*> Elts;
+
+ if (const StructType *STy = dyn_cast<StructType>(CurTy)) {
+ for (unsigned i = 0; i != Size; ++i)
+ Elts.push_back(ValueList.getConstantFwdRef(Record[i],
+ STy->getElementType(i)));
+ V = ConstantStruct::get(STy, Elts);
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(CurTy)) {
+ const Type *EltTy = ATy->getElementType();
+ for (unsigned i = 0; i != Size; ++i)
+ Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy));
+ V = ConstantArray::get(ATy, Elts);
+ } else if (const VectorType *VTy = dyn_cast<VectorType>(CurTy)) {
+ const Type *EltTy = VTy->getElementType();
+ for (unsigned i = 0; i != Size; ++i)
+ Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy));
+ V = ConstantVector::get(Elts);
+ } else {
+ V = UndefValue::get(CurTy);
+ }
+ break;
+ }
+ case bitc::CST_CODE_STRING: { // STRING: [values]
+ if (Record.empty())
+        return Error("Invalid CST_STRING record");
+
+ const ArrayType *ATy = cast<ArrayType>(CurTy);
+ const Type *EltTy = ATy->getElementType();
+
+ unsigned Size = Record.size();
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0; i != Size; ++i)
+ Elts.push_back(ConstantInt::get(EltTy, Record[i]));
+ V = ConstantArray::get(ATy, Elts);
+ break;
+ }
+ case bitc::CST_CODE_CSTRING: { // CSTRING: [values]
+ if (Record.empty())
+        return Error("Invalid CST_CSTRING record");
+
+ const ArrayType *ATy = cast<ArrayType>(CurTy);
+ const Type *EltTy = ATy->getElementType();
+
+ unsigned Size = Record.size();
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0; i != Size; ++i)
+ Elts.push_back(ConstantInt::get(EltTy, Record[i]));
+ Elts.push_back(Constant::getNullValue(EltTy));
+ V = ConstantArray::get(ATy, Elts);
+ break;
+ }
+ case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval]
+ if (Record.size() < 3) return Error("Invalid CE_BINOP record");
+ int Opc = GetDecodedBinaryOpcode(Record[0], CurTy);
+ if (Opc < 0) {
+ V = UndefValue::get(CurTy); // Unknown binop.
+ } else {
+ Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy);
+ Constant *RHS = ValueList.getConstantFwdRef(Record[2], CurTy);
+ V = ConstantExpr::get(Opc, LHS, RHS);
+ }
+ break;
+ }
+ case bitc::CST_CODE_CE_CAST: { // CE_CAST: [opcode, opty, opval]
+ if (Record.size() < 3) return Error("Invalid CE_CAST record");
+ int Opc = GetDecodedCastOpcode(Record[0]);
+ if (Opc < 0) {
+ V = UndefValue::get(CurTy); // Unknown cast.
+ } else {
+ const Type *OpTy = getTypeByID(Record[1]);
+ if (!OpTy) return Error("Invalid CE_CAST record");
+ Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
+ V = ConstantExpr::getCast(Opc, Op, CurTy);
+ }
+ break;
+ }
+ case bitc::CST_CODE_CE_GEP: { // CE_GEP: [n x operands]
+ if (Record.size() & 1) return Error("Invalid CE_GEP record");
+ SmallVector<Constant*, 16> Elts;
+ for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
+ const Type *ElTy = getTypeByID(Record[i]);
+ if (!ElTy) return Error("Invalid CE_GEP record");
+ Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy));
+ }
+ V = ConstantExpr::getGetElementPtr(Elts[0], &Elts[1], Elts.size()-1);
+ break;
+ }
+ case bitc::CST_CODE_CE_SELECT: // CE_SELECT: [opval#, opval#, opval#]
+ if (Record.size() < 3) return Error("Invalid CE_SELECT record");
+ V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0],
+ Type::Int1Ty),
+ ValueList.getConstantFwdRef(Record[1],CurTy),
+ ValueList.getConstantFwdRef(Record[2],CurTy));
+ break;
+ case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opval]
+ if (Record.size() < 3) return Error("Invalid CE_EXTRACTELT record");
+ const VectorType *OpTy =
+ dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
+ if (OpTy == 0) return Error("Invalid CE_EXTRACTELT record");
+ Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
+ Constant *Op1 = ValueList.getConstantFwdRef(Record[2], Type::Int32Ty);
+ V = ConstantExpr::getExtractElement(Op0, Op1);
+ break;
+ }
+ case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opval]
+ const VectorType *OpTy = dyn_cast<VectorType>(CurTy);
+ if (Record.size() < 3 || OpTy == 0)
+ return Error("Invalid CE_INSERTELT record");
+ Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
+ Constant *Op1 = ValueList.getConstantFwdRef(Record[1],
+ OpTy->getElementType());
+ Constant *Op2 = ValueList.getConstantFwdRef(Record[2], Type::Int32Ty);
+ V = ConstantExpr::getInsertElement(Op0, Op1, Op2);
+ break;
+ }
+ case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval]
+ const VectorType *OpTy = dyn_cast<VectorType>(CurTy);
+ if (Record.size() < 3 || OpTy == 0)
+ return Error("Invalid CE_SHUFFLEVEC record");
+ Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
+ Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
+ const Type *ShufTy=VectorType::get(Type::Int32Ty, OpTy->getNumElements());
+ Constant *Op2 = ValueList.getConstantFwdRef(Record[2], ShufTy);
+ V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
+ break;
+ }
+ case bitc::CST_CODE_CE_SHUFVEC_EX: { // [opty, opval, opval, opval]
+ const VectorType *RTy = dyn_cast<VectorType>(CurTy);
+ const VectorType *OpTy = dyn_cast<VectorType>(getTypeByID(Record[0]));
+ if (Record.size() < 4 || RTy == 0 || OpTy == 0)
+ return Error("Invalid CE_SHUFVEC_EX record");
+ Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
+ Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
+ const Type *ShufTy=VectorType::get(Type::Int32Ty, RTy->getNumElements());
+ Constant *Op2 = ValueList.getConstantFwdRef(Record[3], ShufTy);
+ V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
+ break;
+ }
+ case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred]
+ if (Record.size() < 4) return Error("Invalid CE_CMP record");
+ const Type *OpTy = getTypeByID(Record[0]);
+ if (OpTy == 0) return Error("Invalid CE_CMP record");
+ Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
+ Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
+
+ if (OpTy->isFloatingPoint())
+ V = ConstantExpr::getFCmp(Record[3], Op0, Op1);
+ else if (!isa<VectorType>(OpTy))
+ V = ConstantExpr::getICmp(Record[3], Op0, Op1);
+ else if (OpTy->isFPOrFPVector())
+ V = ConstantExpr::getVFCmp(Record[3], Op0, Op1);
+ else
+ V = ConstantExpr::getVICmp(Record[3], Op0, Op1);
+ break;
+ }
+ case bitc::CST_CODE_INLINEASM: {
+ if (Record.size() < 2) return Error("Invalid INLINEASM record");
+ std::string AsmStr, ConstrStr;
+ bool HasSideEffects = Record[0];
+ unsigned AsmStrSize = Record[1];
+ if (2+AsmStrSize >= Record.size())
+ return Error("Invalid INLINEASM record");
+ unsigned ConstStrSize = Record[2+AsmStrSize];
+ if (3+AsmStrSize+ConstStrSize > Record.size())
+ return Error("Invalid INLINEASM record");
+
+ for (unsigned i = 0; i != AsmStrSize; ++i)
+ AsmStr += (char)Record[2+i];
+ for (unsigned i = 0; i != ConstStrSize; ++i)
+ ConstrStr += (char)Record[3+AsmStrSize+i];
+ const PointerType *PTy = cast<PointerType>(CurTy);
+ V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
+ AsmStr, ConstrStr, HasSideEffects);
+ break;
+ }
+ case bitc::CST_CODE_MDSTRING: {
+ if (Record.size() < 2) return Error("Invalid MDSTRING record");
+ unsigned MDStringLength = Record.size();
+ SmallString<8> String;
+ String.resize(MDStringLength);
+ for (unsigned i = 0; i != MDStringLength; ++i)
+ String[i] = Record[i];
+ V = MDString::get(String.c_str(), String.c_str() + MDStringLength);
+ break;
+ }
+ case bitc::CST_CODE_MDNODE: {
+ if (Record.empty() || Record.size() % 2 == 1)
+ return Error("Invalid CST_MDNODE record");
+
+ unsigned Size = Record.size();
+ SmallVector<Value*, 8> Elts;
+ for (unsigned i = 0; i != Size; i += 2) {
+ const Type *Ty = getTypeByID(Record[i], false);
+ if (Ty != Type::VoidTy)
+ Elts.push_back(ValueList.getValueFwdRef(Record[i+1], Ty));
+ else
+ Elts.push_back(NULL);
+ }
+ V = MDNode::get(&Elts[0], Elts.size());
+ break;
+ }
+ }
+
+ ValueList.AssignValue(V, NextCstNo);
+ ++NextCstNo;
+ }
+
+ if (NextCstNo != ValueList.size())
+ return Error("Invalid constant reference!");
+
+ if (Stream.ReadBlockEnd())
+ return Error("Error at end of constants block");
+
+ // Once all the constants have been read, go through and resolve forward
+ // references.
+ ValueList.ResolveConstantForwardRefs();
+ return false;
+}
+
+/// RememberAndSkipFunctionBody - When we see the block for a function body,
+/// remember where it is and then skip it. This lets us lazily deserialize the
+/// functions.
+bool BitcodeReader::RememberAndSkipFunctionBody() {
+ // Get the function we are talking about.
+ if (FunctionsWithBodies.empty())
+ return Error("Insufficient function protos");
+
+ Function *Fn = FunctionsWithBodies.back();
+ FunctionsWithBodies.pop_back();
+
+ // Save the current stream state.
+ uint64_t CurBit = Stream.GetCurrentBitNo();
+ DeferredFunctionInfo[Fn] = std::make_pair(CurBit, Fn->getLinkage());
+
+  // Set the function's linkage to GhostLinkage so we know it is lazily
+ // deserialized.
+ Fn->setLinkage(GlobalValue::GhostLinkage);
+
+ // Skip over the function block for now.
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ return false;
+}
+
+bool BitcodeReader::ParseModule(const std::string &ModuleID) {
+ // Reject multiple MODULE_BLOCK's in a single bitstream.
+ if (TheModule)
+ return Error("Multiple MODULE_BLOCKs in same stream");
+
+ if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
+ return Error("Malformed block record");
+
+ // Otherwise, create the module.
+ TheModule = new Module(ModuleID);
+
+ SmallVector<uint64_t, 64> Record;
+ std::vector<std::string> SectionTable;
+ std::vector<std::string> GCTable;
+
+ // Read all the records for this module.
+ while (!Stream.AtEndOfStream()) {
+ unsigned Code = Stream.ReadCode();
+ if (Code == bitc::END_BLOCK) {
+ if (Stream.ReadBlockEnd())
+ return Error("Error at end of module block");
+
+ // Patch the initializers for globals and aliases up.
+ ResolveGlobalAndAliasInits();
+ if (!GlobalInits.empty() || !AliasInits.empty())
+ return Error("Malformed global initializer set");
+ if (!FunctionsWithBodies.empty())
+ return Error("Too few function bodies found");
+
+ // Look for intrinsic functions which need to be upgraded at some point
+ for (Module::iterator FI = TheModule->begin(), FE = TheModule->end();
+ FI != FE; ++FI) {
+ Function* NewFn;
+ if (UpgradeIntrinsicFunction(FI, NewFn))
+ UpgradedIntrinsics.push_back(std::make_pair(FI, NewFn));
+ }
+
+      // Force deallocation of memory for these vectors to favor clients that
+      // want lazy deserialization.
+ std::vector<std::pair<GlobalVariable*, unsigned> >().swap(GlobalInits);
+ std::vector<std::pair<GlobalAlias*, unsigned> >().swap(AliasInits);
+ std::vector<Function*>().swap(FunctionsWithBodies);
+ return false;
+ }
+
+ if (Code == bitc::ENTER_SUBBLOCK) {
+ switch (Stream.ReadSubBlockID()) {
+ default: // Skip unknown content.
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ break;
+ case bitc::BLOCKINFO_BLOCK_ID:
+ if (Stream.ReadBlockInfoBlock())
+ return Error("Malformed BlockInfoBlock");
+ break;
+ case bitc::PARAMATTR_BLOCK_ID:
+ if (ParseAttributeBlock())
+ return true;
+ break;
+ case bitc::TYPE_BLOCK_ID:
+ if (ParseTypeTable())
+ return true;
+ break;
+ case bitc::TYPE_SYMTAB_BLOCK_ID:
+ if (ParseTypeSymbolTable())
+ return true;
+ break;
+ case bitc::VALUE_SYMTAB_BLOCK_ID:
+ if (ParseValueSymbolTable())
+ return true;
+ break;
+ case bitc::CONSTANTS_BLOCK_ID:
+ if (ParseConstants() || ResolveGlobalAndAliasInits())
+ return true;
+ break;
+ case bitc::FUNCTION_BLOCK_ID:
+ // If this is the first function body we've seen, reverse the
+ // FunctionsWithBodies list.
+ if (!HasReversedFunctionsWithBodies) {
+ std::reverse(FunctionsWithBodies.begin(), FunctionsWithBodies.end());
+ HasReversedFunctionsWithBodies = true;
+ }
+
+ if (RememberAndSkipFunctionBody())
+ return true;
+ break;
+ }
+ continue;
+ }
+
+ if (Code == bitc::DEFINE_ABBREV) {
+ Stream.ReadAbbrevRecord();
+ continue;
+ }
+
+ // Read a record.
+ switch (Stream.ReadRecord(Code, Record)) {
+ default: break; // Default behavior, ignore unknown content.
+ case bitc::MODULE_CODE_VERSION: // VERSION: [version#]
+ if (Record.size() < 1)
+ return Error("Malformed MODULE_CODE_VERSION");
+ // Only version #0 is supported so far.
+ if (Record[0] != 0)
+ return Error("Unknown bitstream version!");
+ break;
+ case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
+ std::string S;
+ if (ConvertToString(Record, 0, S))
+ return Error("Invalid MODULE_CODE_TRIPLE record");
+ TheModule->setTargetTriple(S);
+ break;
+ }
+ case bitc::MODULE_CODE_DATALAYOUT: { // DATALAYOUT: [strchr x N]
+ std::string S;
+ if (ConvertToString(Record, 0, S))
+ return Error("Invalid MODULE_CODE_DATALAYOUT record");
+ TheModule->setDataLayout(S);
+ break;
+ }
+ case bitc::MODULE_CODE_ASM: { // ASM: [strchr x N]
+ std::string S;
+ if (ConvertToString(Record, 0, S))
+ return Error("Invalid MODULE_CODE_ASM record");
+ TheModule->setModuleInlineAsm(S);
+ break;
+ }
+ case bitc::MODULE_CODE_DEPLIB: { // DEPLIB: [strchr x N]
+ std::string S;
+ if (ConvertToString(Record, 0, S))
+ return Error("Invalid MODULE_CODE_DEPLIB record");
+ TheModule->addLibrary(S);
+ break;
+ }
+ case bitc::MODULE_CODE_SECTIONNAME: { // SECTIONNAME: [strchr x N]
+ std::string S;
+ if (ConvertToString(Record, 0, S))
+ return Error("Invalid MODULE_CODE_SECTIONNAME record");
+ SectionTable.push_back(S);
+ break;
+ }
+    case bitc::MODULE_CODE_GCNAME: {  // GCNAME: [strchr x N]
+ std::string S;
+ if (ConvertToString(Record, 0, S))
+ return Error("Invalid MODULE_CODE_GCNAME record");
+ GCTable.push_back(S);
+ break;
+ }
+ // GLOBALVAR: [pointer type, isconst, initid,
+ // linkage, alignment, section, visibility, threadlocal]
+ case bitc::MODULE_CODE_GLOBALVAR: {
+ if (Record.size() < 6)
+ return Error("Invalid MODULE_CODE_GLOBALVAR record");
+ const Type *Ty = getTypeByID(Record[0]);
+ if (!isa<PointerType>(Ty))
+ return Error("Global not a pointer type!");
+ unsigned AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
+ Ty = cast<PointerType>(Ty)->getElementType();
+
+ bool isConstant = Record[1];
+ GlobalValue::LinkageTypes Linkage = GetDecodedLinkage(Record[3]);
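+      // The record stores log2(alignment) + 1, with 0 meaning "no alignment
+      // specified"; (1 << N) >> 1 decodes it (0 -> 0, 1 -> 1, 3 -> 4, ...).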
+ unsigned Alignment = (1 << Record[4]) >> 1;
+ std::string Section;
+ if (Record[5]) {
+ if (Record[5]-1 >= SectionTable.size())
+ return Error("Invalid section ID");
+ Section = SectionTable[Record[5]-1];
+ }
+ GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
+ if (Record.size() > 6)
+ Visibility = GetDecodedVisibility(Record[6]);
+ bool isThreadLocal = false;
+ if (Record.size() > 7)
+ isThreadLocal = Record[7];
+
+ GlobalVariable *NewGV =
+ new GlobalVariable(Ty, isConstant, Linkage, 0, "", TheModule,
+ isThreadLocal, AddressSpace);
+ NewGV->setAlignment(Alignment);
+ if (!Section.empty())
+ NewGV->setSection(Section);
+ NewGV->setVisibility(Visibility);
+ NewGV->setThreadLocal(isThreadLocal);
+
+ ValueList.push_back(NewGV);
+
+ // Remember which value to use for the global initializer.
+ if (unsigned InitID = Record[2])
+ GlobalInits.push_back(std::make_pair(NewGV, InitID-1));
+ break;
+ }
+ // FUNCTION: [type, callingconv, isproto, linkage, paramattr,
+ // alignment, section, visibility, gc]
+ case bitc::MODULE_CODE_FUNCTION: {
+ if (Record.size() < 8)
+ return Error("Invalid MODULE_CODE_FUNCTION record");
+ const Type *Ty = getTypeByID(Record[0]);
+ if (!isa<PointerType>(Ty))
+ return Error("Function not a pointer type!");
+ const FunctionType *FTy =
+ dyn_cast<FunctionType>(cast<PointerType>(Ty)->getElementType());
+ if (!FTy)
+ return Error("Function not a pointer to function type!");
+
+ Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
+ "", TheModule);
+
+ Func->setCallingConv(Record[1]);
+ bool isProto = Record[2];
+ Func->setLinkage(GetDecodedLinkage(Record[3]));
+ Func->setAttributes(getAttributes(Record[4]));
+
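+      // Same log2(alignment) + 1 encoding as in the GLOBALVAR record above.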
+ Func->setAlignment((1 << Record[5]) >> 1);
+ if (Record[6]) {
+ if (Record[6]-1 >= SectionTable.size())
+ return Error("Invalid section ID");
+ Func->setSection(SectionTable[Record[6]-1]);
+ }
+ Func->setVisibility(GetDecodedVisibility(Record[7]));
+ if (Record.size() > 8 && Record[8]) {
+        if (Record[8]-1 >= GCTable.size())
+ return Error("Invalid GC ID");
+ Func->setGC(GCTable[Record[8]-1].c_str());
+ }
+ ValueList.push_back(Func);
+
+ // If this is a function with a body, remember the prototype we are
+ // creating now, so that we can match up the body with them later.
+ if (!isProto)
+ FunctionsWithBodies.push_back(Func);
+ break;
+ }
+ // ALIAS: [alias type, aliasee val#, linkage]
+ // ALIAS: [alias type, aliasee val#, linkage, visibility]
+ case bitc::MODULE_CODE_ALIAS: {
+ if (Record.size() < 3)
+        return Error("Invalid MODULE_CODE_ALIAS record");
+ const Type *Ty = getTypeByID(Record[0]);
+ if (!isa<PointerType>(Ty))
+        return Error("Alias not a pointer type!");
+
+ GlobalAlias *NewGA = new GlobalAlias(Ty, GetDecodedLinkage(Record[2]),
+ "", 0, TheModule);
+ // Old bitcode files didn't have visibility field.
+ if (Record.size() > 3)
+ NewGA->setVisibility(GetDecodedVisibility(Record[3]));
+ ValueList.push_back(NewGA);
+ AliasInits.push_back(std::make_pair(NewGA, Record[1]));
+ break;
+ }
+ /// MODULE_CODE_PURGEVALS: [numvals]
+ case bitc::MODULE_CODE_PURGEVALS:
+ // Trim down the value list to the specified size.
+ if (Record.size() < 1 || Record[0] > ValueList.size())
+        return Error("Invalid MODULE_CODE_PURGEVALS record");
+ ValueList.shrinkTo(Record[0]);
+ break;
+ }
+ Record.clear();
+ }
+
+ return Error("Premature end of bitstream");
+}
+
+bool BitcodeReader::ParseBitcode() {
+ TheModule = 0;
+
+ if (Buffer->getBufferSize() & 3)
+ return Error("Bitcode stream should be a multiple of 4 bytes in length");
+
+ unsigned char *BufPtr = (unsigned char *)Buffer->getBufferStart();
+ unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
+
+ // If we have a wrapper header, parse it and ignore the non-bc file contents.
+ // The magic number is 0x0B17C0DE stored in little endian.
+ if (isBitcodeWrapper(BufPtr, BufEnd))
+ if (SkipBitcodeWrapperHeader(BufPtr, BufEnd))
+ return Error("Invalid bitcode wrapper header");
+
+ StreamFile.init(BufPtr, BufEnd);
+ Stream.init(StreamFile);
+
+ // Sniff for the signature.
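+  // The two trailing magic bytes 0xC0 0xDE are read as 4-bit fields, low
+  // nibble first, hence the sequence 0x0, 0xC, 0xE, 0xD below.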
+ if (Stream.Read(8) != 'B' ||
+ Stream.Read(8) != 'C' ||
+ Stream.Read(4) != 0x0 ||
+ Stream.Read(4) != 0xC ||
+ Stream.Read(4) != 0xE ||
+ Stream.Read(4) != 0xD)
+ return Error("Invalid bitcode signature");
+
+ // We expect a number of well-defined blocks, though we don't necessarily
+ // need to understand them all.
+ while (!Stream.AtEndOfStream()) {
+ unsigned Code = Stream.ReadCode();
+
+ if (Code != bitc::ENTER_SUBBLOCK)
+ return Error("Invalid record at top-level");
+
+ unsigned BlockID = Stream.ReadSubBlockID();
+
+ // We only know the MODULE subblock ID.
+ switch (BlockID) {
+ case bitc::BLOCKINFO_BLOCK_ID:
+ if (Stream.ReadBlockInfoBlock())
+ return Error("Malformed BlockInfoBlock");
+ break;
+ case bitc::MODULE_BLOCK_ID:
+ if (ParseModule(Buffer->getBufferIdentifier()))
+ return true;
+ break;
+ default:
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ break;
+ }
+ }
+
+ return false;
+}
+
+
+/// ParseFunctionBody - Lazily parse the specified function body block.
+bool BitcodeReader::ParseFunctionBody(Function *F) {
+ if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
+ return Error("Malformed block record");
+
+ unsigned ModuleValueListSize = ValueList.size();
+
+ // Add all the function arguments to the value table.
+ for(Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
+ ValueList.push_back(I);
+
+ unsigned NextValueNo = ValueList.size();
+ BasicBlock *CurBB = 0;
+ unsigned CurBBNo = 0;
+
+ // Read all the records.
+ SmallVector<uint64_t, 64> Record;
+ while (1) {
+ unsigned Code = Stream.ReadCode();
+ if (Code == bitc::END_BLOCK) {
+ if (Stream.ReadBlockEnd())
+ return Error("Error at end of function block");
+ break;
+ }
+
+ if (Code == bitc::ENTER_SUBBLOCK) {
+ switch (Stream.ReadSubBlockID()) {
+ default: // Skip unknown content.
+ if (Stream.SkipBlock())
+ return Error("Malformed block record");
+ break;
+ case bitc::CONSTANTS_BLOCK_ID:
+ if (ParseConstants()) return true;
+ NextValueNo = ValueList.size();
+ break;
+ case bitc::VALUE_SYMTAB_BLOCK_ID:
+ if (ParseValueSymbolTable()) return true;
+ break;
+ }
+ continue;
+ }
+
+ if (Code == bitc::DEFINE_ABBREV) {
+ Stream.ReadAbbrevRecord();
+ continue;
+ }
+
+ // Read a record.
+ Record.clear();
+ Instruction *I = 0;
+ switch (Stream.ReadRecord(Code, Record)) {
+ default: // Default behavior: reject
+ return Error("Unknown instruction");
+ case bitc::FUNC_CODE_DECLAREBLOCKS: // DECLAREBLOCKS: [nblocks]
+ if (Record.size() < 1 || Record[0] == 0)
+ return Error("Invalid DECLAREBLOCKS record");
+ // Create all the basic blocks for the function.
+ FunctionBBs.resize(Record[0]);
+ for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i)
+ FunctionBBs[i] = BasicBlock::Create("", F);
+ CurBB = FunctionBBs[0];
+ continue;
+
+ case bitc::FUNC_CODE_INST_BINOP: { // BINOP: [opval, ty, opval, opcode]
+ unsigned OpNum = 0;
+ Value *LHS, *RHS;
+ if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
+ getValue(Record, OpNum, LHS->getType(), RHS) ||
+ OpNum+1 != Record.size())
+ return Error("Invalid BINOP record");
+
+ int Opc = GetDecodedBinaryOpcode(Record[OpNum], LHS->getType());
+ if (Opc == -1) return Error("Invalid BINOP record");
+ I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_CAST: { // CAST: [opval, opty, destty, castopc]
+ unsigned OpNum = 0;
+ Value *Op;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
+ OpNum+2 != Record.size())
+ return Error("Invalid CAST record");
+
+ const Type *ResTy = getTypeByID(Record[OpNum]);
+ int Opc = GetDecodedCastOpcode(Record[OpNum+1]);
+ if (Opc == -1 || ResTy == 0)
+ return Error("Invalid CAST record");
+ I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_GEP: { // GEP: [n x operands]
+ unsigned OpNum = 0;
+ Value *BasePtr;
+ if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr))
+ return Error("Invalid GEP record");
+
+ SmallVector<Value*, 16> GEPIdx;
+ while (OpNum != Record.size()) {
+ Value *Op;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+ return Error("Invalid GEP record");
+ GEPIdx.push_back(Op);
+ }
+
+ I = GetElementPtrInst::Create(BasePtr, GEPIdx.begin(), GEPIdx.end());
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_EXTRACTVAL: {
+ // EXTRACTVAL: [opty, opval, n x indices]
+ unsigned OpNum = 0;
+ Value *Agg;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
+ return Error("Invalid EXTRACTVAL record");
+
+ SmallVector<unsigned, 4> EXTRACTVALIdx;
+ for (unsigned RecSize = Record.size();
+ OpNum != RecSize; ++OpNum) {
+ uint64_t Index = Record[OpNum];
+ if ((unsigned)Index != Index)
+ return Error("Invalid EXTRACTVAL index");
+ EXTRACTVALIdx.push_back((unsigned)Index);
+ }
+
+ I = ExtractValueInst::Create(Agg,
+ EXTRACTVALIdx.begin(), EXTRACTVALIdx.end());
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_INSERTVAL: {
+ // INSERTVAL: [opty, opval, opty, opval, n x indices]
+ unsigned OpNum = 0;
+ Value *Agg;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
+ return Error("Invalid INSERTVAL record");
+ Value *Val;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Val))
+ return Error("Invalid INSERTVAL record");
+
+ SmallVector<unsigned, 4> INSERTVALIdx;
+ for (unsigned RecSize = Record.size();
+ OpNum != RecSize; ++OpNum) {
+ uint64_t Index = Record[OpNum];
+ if ((unsigned)Index != Index)
+ return Error("Invalid INSERTVAL index");
+ INSERTVALIdx.push_back((unsigned)Index);
+ }
+
+ I = InsertValueInst::Create(Agg, Val,
+ INSERTVALIdx.begin(), INSERTVALIdx.end());
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_SELECT: { // SELECT: [opval, ty, opval, opval]
+ // obsolete form of select
+ // handles select i1 ... in old bitcode
+ unsigned OpNum = 0;
+ Value *TrueVal, *FalseVal, *Cond;
+ if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
+ getValue(Record, OpNum, TrueVal->getType(), FalseVal) ||
+ getValue(Record, OpNum, Type::Int1Ty, Cond))
+ return Error("Invalid SELECT record");
+
+ I = SelectInst::Create(Cond, TrueVal, FalseVal);
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_VSELECT: {// VSELECT: [ty,opval,opval,predty,pred]
+ // new form of select
+      // handles select i1 ... in old bitcode
+ unsigned OpNum = 0;
+ Value *TrueVal, *FalseVal, *Cond;
+ if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
+ getValue(Record, OpNum, TrueVal->getType(), FalseVal) ||
+ getValueTypePair(Record, OpNum, NextValueNo, Cond))
+ return Error("Invalid SELECT record");
+
+      // select condition can be either i1 or <N x i1>
+ if (const VectorType* vector_type =
+ dyn_cast<const VectorType>(Cond->getType())) {
+ // expect <n x i1>
+ if (vector_type->getElementType() != Type::Int1Ty)
+ return Error("Invalid SELECT condition type");
+ } else {
+ // expect i1
+ if (Cond->getType() != Type::Int1Ty)
+ return Error("Invalid SELECT condition type");
+ }
+
+ I = SelectInst::Create(Cond, TrueVal, FalseVal);
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_EXTRACTELT: { // EXTRACTELT: [opty, opval, opval]
+ unsigned OpNum = 0;
+ Value *Vec, *Idx;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
+ getValue(Record, OpNum, Type::Int32Ty, Idx))
+ return Error("Invalid EXTRACTELT record");
+ I = new ExtractElementInst(Vec, Idx);
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_INSERTELT: { // INSERTELT: [ty, opval,opval,opval]
+ unsigned OpNum = 0;
+ Value *Vec, *Elt, *Idx;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
+ getValue(Record, OpNum,
+ cast<VectorType>(Vec->getType())->getElementType(), Elt) ||
+ getValue(Record, OpNum, Type::Int32Ty, Idx))
+ return Error("Invalid INSERTELT record");
+ I = InsertElementInst::Create(Vec, Elt, Idx);
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_SHUFFLEVEC: {// SHUFFLEVEC: [opval,ty,opval,opval]
+ unsigned OpNum = 0;
+ Value *Vec1, *Vec2, *Mask;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) ||
+ getValue(Record, OpNum, Vec1->getType(), Vec2))
+ return Error("Invalid SHUFFLEVEC record");
+
+ if (getValueTypePair(Record, OpNum, NextValueNo, Mask))
+ return Error("Invalid SHUFFLEVEC record");
+ I = new ShuffleVectorInst(Vec1, Vec2, Mask);
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_CMP: { // CMP: [opty, opval, opval, pred]
+ // VFCmp/VICmp
+ // or old form of ICmp/FCmp returning bool
+ unsigned OpNum = 0;
+ Value *LHS, *RHS;
+ if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
+ getValue(Record, OpNum, LHS->getType(), RHS) ||
+ OpNum+1 != Record.size())
+ return Error("Invalid CMP record");
+
+ if (LHS->getType()->isFloatingPoint())
+ I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
+ else if (!isa<VectorType>(LHS->getType()))
+ I = new ICmpInst((ICmpInst::Predicate)Record[OpNum], LHS, RHS);
+ else if (LHS->getType()->isFPOrFPVector())
+ I = new VFCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
+ else
+ I = new VICmpInst((ICmpInst::Predicate)Record[OpNum], LHS, RHS);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_CMP2: { // CMP2: [opty, opval, opval, pred]
+ // Fcmp/ICmp returning bool or vector of bool
+ unsigned OpNum = 0;
+ Value *LHS, *RHS;
+ if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
+ getValue(Record, OpNum, LHS->getType(), RHS) ||
+ OpNum+1 != Record.size())
+ return Error("Invalid CMP2 record");
+
+ if (LHS->getType()->isFPOrFPVector())
+ I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
+ else
+ I = new ICmpInst((ICmpInst::Predicate)Record[OpNum], LHS, RHS);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_GETRESULT: { // GETRESULT: [ty, val, n]
+ if (Record.size() != 2)
+ return Error("Invalid GETRESULT record");
+ unsigned OpNum = 0;
+ Value *Op;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+        return Error("Invalid GETRESULT record");
+ unsigned Index = Record[1];
+ I = ExtractValueInst::Create(Op, Index);
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_RET: // RET: [opty,opval<optional>]
+ {
+ unsigned Size = Record.size();
+ if (Size == 0) {
+ I = ReturnInst::Create();
+ break;
+ }
+
+ unsigned OpNum = 0;
+ SmallVector<Value *,4> Vs;
+ do {
+ Value *Op = NULL;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+ return Error("Invalid RET record");
+ Vs.push_back(Op);
+ } while(OpNum != Record.size());
+
+ const Type *ReturnType = F->getReturnType();
+ if (Vs.size() > 1 ||
+ (isa<StructType>(ReturnType) &&
+ (Vs.empty() || Vs[0]->getType() != ReturnType))) {
+ Value *RV = UndefValue::get(ReturnType);
+ for (unsigned i = 0, e = Vs.size(); i != e; ++i) {
+ I = InsertValueInst::Create(RV, Vs[i], i, "mrv");
+ CurBB->getInstList().push_back(I);
+ ValueList.AssignValue(I, NextValueNo++);
+ RV = I;
+ }
+ I = ReturnInst::Create(RV);
+ break;
+ }
+
+ I = ReturnInst::Create(Vs[0]);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#]
+ if (Record.size() != 1 && Record.size() != 3)
+ return Error("Invalid BR record");
+ BasicBlock *TrueDest = getBasicBlock(Record[0]);
+ if (TrueDest == 0)
+ return Error("Invalid BR record");
+
+ if (Record.size() == 1)
+ I = BranchInst::Create(TrueDest);
+ else {
+ BasicBlock *FalseDest = getBasicBlock(Record[1]);
+ Value *Cond = getFnValueByID(Record[2], Type::Int1Ty);
+ if (FalseDest == 0 || Cond == 0)
+ return Error("Invalid BR record");
+ I = BranchInst::Create(TrueDest, FalseDest, Cond);
+ }
+ break;
+ }
+ case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, opval, n, n x ops]
+ if (Record.size() < 3 || (Record.size() & 1) == 0)
+ return Error("Invalid SWITCH record");
+ const Type *OpTy = getTypeByID(Record[0]);
+ Value *Cond = getFnValueByID(Record[1], OpTy);
+ BasicBlock *Default = getBasicBlock(Record[2]);
+ if (OpTy == 0 || Cond == 0 || Default == 0)
+ return Error("Invalid SWITCH record");
+ unsigned NumCases = (Record.size()-3)/2;
+ SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
+ for (unsigned i = 0, e = NumCases; i != e; ++i) {
+ ConstantInt *CaseVal =
+ dyn_cast_or_null<ConstantInt>(getFnValueByID(Record[3+i*2], OpTy));
+ BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]);
+ if (CaseVal == 0 || DestBB == 0) {
+ delete SI;
+ return Error("Invalid SWITCH record!");
+ }
+ SI->addCase(CaseVal, DestBB);
+ }
+ I = SI;
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_INVOKE: {
+ // INVOKE: [attrs, cc, normBB, unwindBB, fnty, op0,op1,op2, ...]
+ if (Record.size() < 4) return Error("Invalid INVOKE record");
+ AttrListPtr PAL = getAttributes(Record[0]);
+ unsigned CCInfo = Record[1];
+ BasicBlock *NormalBB = getBasicBlock(Record[2]);
+ BasicBlock *UnwindBB = getBasicBlock(Record[3]);
+
+ unsigned OpNum = 4;
+ Value *Callee;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
+ return Error("Invalid INVOKE record");
+
+ const PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
+ const FunctionType *FTy = !CalleeTy ? 0 :
+ dyn_cast<FunctionType>(CalleeTy->getElementType());
+
+ // Check that the right number of fixed parameters are here.
+ if (FTy == 0 || NormalBB == 0 || UnwindBB == 0 ||
+ Record.size() < OpNum+FTy->getNumParams())
+ return Error("Invalid INVOKE record");
+
+ SmallVector<Value*, 16> Ops;
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
+ Ops.push_back(getFnValueByID(Record[OpNum], FTy->getParamType(i)));
+ if (Ops.back() == 0) return Error("Invalid INVOKE record");
+ }
+
+ if (!FTy->isVarArg()) {
+ if (Record.size() != OpNum)
+ return Error("Invalid INVOKE record");
+ } else {
+ // Read type/value pairs for varargs params.
+ while (OpNum != Record.size()) {
+ Value *Op;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+ return Error("Invalid INVOKE record");
+ Ops.push_back(Op);
+ }
+ }
+
+ I = InvokeInst::Create(Callee, NormalBB, UnwindBB,
+ Ops.begin(), Ops.end());
+ cast<InvokeInst>(I)->setCallingConv(CCInfo);
+ cast<InvokeInst>(I)->setAttributes(PAL);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_UNWIND: // UNWIND
+ I = new UnwindInst();
+ break;
+ case bitc::FUNC_CODE_INST_UNREACHABLE: // UNREACHABLE
+ I = new UnreachableInst();
+ break;
+ case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
+ if (Record.size() < 1 || ((Record.size()-1)&1))
+ return Error("Invalid PHI record");
+ const Type *Ty = getTypeByID(Record[0]);
+ if (!Ty) return Error("Invalid PHI record");
+
+ PHINode *PN = PHINode::Create(Ty);
+ PN->reserveOperandSpace((Record.size()-1)/2);
+
+ for (unsigned i = 0, e = Record.size()-1; i != e; i += 2) {
+ Value *V = getFnValueByID(Record[1+i], Ty);
+ BasicBlock *BB = getBasicBlock(Record[2+i]);
+ if (!V || !BB) return Error("Invalid PHI record");
+ PN->addIncoming(V, BB);
+ }
+ I = PN;
+ break;
+ }
+
+ case bitc::FUNC_CODE_INST_MALLOC: { // MALLOC: [instty, op, align]
+ if (Record.size() < 3)
+ return Error("Invalid MALLOC record");
+ const PointerType *Ty =
+ dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
+ Value *Size = getFnValueByID(Record[1], Type::Int32Ty);
+ unsigned Align = Record[2];
+ if (!Ty || !Size) return Error("Invalid MALLOC record");
+ I = new MallocInst(Ty->getElementType(), Size, (1 << Align) >> 1);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_FREE: { // FREE: [op, opty]
+ unsigned OpNum = 0;
+ Value *Op;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
+ OpNum != Record.size())
+ return Error("Invalid FREE record");
+ I = new FreeInst(Op);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, op, align]
+ if (Record.size() < 3)
+ return Error("Invalid ALLOCA record");
+ const PointerType *Ty =
+ dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
+ Value *Size = getFnValueByID(Record[1], Type::Int32Ty);
+ unsigned Align = Record[2];
+ if (!Ty || !Size) return Error("Invalid ALLOCA record");
+ I = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_LOAD: { // LOAD: [opty, op, align, vol]
+ unsigned OpNum = 0;
+ Value *Op;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
+ OpNum+2 != Record.size())
+ return Error("Invalid LOAD record");
+
+ I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_STORE2: { // STORE2:[ptrty, ptr, val, align, vol]
+ unsigned OpNum = 0;
+ Value *Val, *Ptr;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+ getValue(Record, OpNum,
+ cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
+ OpNum+2 != Record.size())
+ return Error("Invalid STORE record");
+
+ I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_STORE: { // STORE:[val, valty, ptr, align, vol]
+ // FIXME: Legacy form of store instruction. Should be removed in LLVM 3.0.
+ unsigned OpNum = 0;
+ Value *Val, *Ptr;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Val) ||
+ getValue(Record, OpNum, PointerType::getUnqual(Val->getType()), Ptr)||
+ OpNum+2 != Record.size())
+ return Error("Invalid STORE record");
+
+ I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_CALL: {
+ // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...]
+ if (Record.size() < 3)
+ return Error("Invalid CALL record");
+
+ AttrListPtr PAL = getAttributes(Record[0]);
+ unsigned CCInfo = Record[1];
+
+ unsigned OpNum = 2;
+ Value *Callee;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
+ return Error("Invalid CALL record");
+
+ const PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
+ const FunctionType *FTy = 0;
+ if (OpTy) FTy = dyn_cast<FunctionType>(OpTy->getElementType());
+ if (!FTy || Record.size() < FTy->getNumParams()+OpNum)
+ return Error("Invalid CALL record");
+
+ SmallVector<Value*, 16> Args;
+ // Read the fixed params.
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
+ if (FTy->getParamType(i)->getTypeID()==Type::LabelTyID)
+ Args.push_back(getBasicBlock(Record[OpNum]));
+ else
+ Args.push_back(getFnValueByID(Record[OpNum], FTy->getParamType(i)));
+ if (Args.back() == 0) return Error("Invalid CALL record");
+ }
+
+ // Read type/value pairs for varargs params.
+ if (!FTy->isVarArg()) {
+ if (OpNum != Record.size())
+ return Error("Invalid CALL record");
+ } else {
+ while (OpNum != Record.size()) {
+ Value *Op;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+ return Error("Invalid CALL record");
+ Args.push_back(Op);
+ }
+ }
+
+ I = CallInst::Create(Callee, Args.begin(), Args.end());
+ cast<CallInst>(I)->setCallingConv(CCInfo>>1);
+ cast<CallInst>(I)->setTailCall(CCInfo & 1);
+ cast<CallInst>(I)->setAttributes(PAL);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty]
+ if (Record.size() < 3)
+ return Error("Invalid VAARG record");
+ const Type *OpTy = getTypeByID(Record[0]);
+ Value *Op = getFnValueByID(Record[1], OpTy);
+ const Type *ResTy = getTypeByID(Record[2]);
+ if (!OpTy || !Op || !ResTy)
+ return Error("Invalid VAARG record");
+ I = new VAArgInst(Op, ResTy);
+ break;
+ }
+ }
+
+ // Add instruction to end of current BB. If there is no current BB, reject
+ // this file.
+ if (CurBB == 0) {
+ delete I;
+ return Error("Invalid instruction with no BB");
+ }
+ CurBB->getInstList().push_back(I);
+
+ // If this was a terminator instruction, move to the next block.
+ if (isa<TerminatorInst>(I)) {
+ ++CurBBNo;
+ CurBB = CurBBNo < FunctionBBs.size() ? FunctionBBs[CurBBNo] : 0;
+ }
+
+ // Non-void values get registered in the value table for future use.
+ if (I && I->getType() != Type::VoidTy)
+ ValueList.AssignValue(I, NextValueNo++);
+ }
+
+ // Check the function list for unresolved values.
+ if (Argument *A = dyn_cast<Argument>(ValueList.back())) {
+ if (A->getParent() == 0) {
+ // We found at least one unresolved value. Nuke them all to avoid leaks.
+ for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e; ++i){
+        if ((A = dyn_cast_or_null<Argument>(ValueList[i])) && A->getParent() == 0) {
+ A->replaceAllUsesWith(UndefValue::get(A->getType()));
+ delete A;
+ }
+ }
+ return Error("Never resolved value found in function!");
+ }
+ }
+
+ // Trim the value list down to the size it was before we parsed this function.
+ ValueList.shrinkTo(ModuleValueListSize);
+ std::vector<BasicBlock*>().swap(FunctionBBs);
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// ModuleProvider implementation
+//===----------------------------------------------------------------------===//
+
+
+bool BitcodeReader::materializeFunction(Function *F, std::string *ErrInfo) {
+ // If it already is material, ignore the request.
+ if (!F->hasNotBeenReadFromBitcode()) return false;
+
+ DenseMap<Function*, std::pair<uint64_t, unsigned> >::iterator DFII =
+ DeferredFunctionInfo.find(F);
+ assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!");
+
+ // Move the bit stream to the saved position of the deferred function body and
+ // restore the real linkage type for the function.
+ Stream.JumpToBit(DFII->second.first);
+ F->setLinkage((GlobalValue::LinkageTypes)DFII->second.second);
+
+ if (ParseFunctionBody(F)) {
+ if (ErrInfo) *ErrInfo = ErrorString;
+ return true;
+ }
+
+ // Upgrade any old intrinsic calls in the function.
+ for (UpgradedIntrinsicMap::iterator I = UpgradedIntrinsics.begin(),
+ E = UpgradedIntrinsics.end(); I != E; ++I) {
+ if (I->first != I->second) {
+ for (Value::use_iterator UI = I->first->use_begin(),
+ UE = I->first->use_end(); UI != UE; ) {
+ if (CallInst* CI = dyn_cast<CallInst>(*UI++))
+ UpgradeIntrinsicCall(CI, I->second);
+ }
+ }
+ }
+
+ return false;
+}
+
+void BitcodeReader::dematerializeFunction(Function *F) {
+ // If this function isn't materialized, or if it is a proto, this is a noop.
+ if (F->hasNotBeenReadFromBitcode() || F->isDeclaration())
+ return;
+
+ assert(DeferredFunctionInfo.count(F) && "No info to read function later?");
+
+ // Just forget the function body, we can remat it later.
+ F->deleteBody();
+ F->setLinkage(GlobalValue::GhostLinkage);
+}
+
+
+Module *BitcodeReader::materializeModule(std::string *ErrInfo) {
+ for (DenseMap<Function*, std::pair<uint64_t, unsigned> >::iterator I =
+ DeferredFunctionInfo.begin(), E = DeferredFunctionInfo.end(); I != E;
+ ++I) {
+ Function *F = I->first;
+ if (F->hasNotBeenReadFromBitcode() &&
+ materializeFunction(F, ErrInfo))
+ return 0;
+ }
+
+ // Upgrade any intrinsic calls that slipped through (should not happen!) and
+ // delete the old functions to clean up. We can't do this unless the entire
+ // module is materialized because there could always be another function body
+ // with calls to the old function.
+ for (std::vector<std::pair<Function*, Function*> >::iterator I =
+ UpgradedIntrinsics.begin(), E = UpgradedIntrinsics.end(); I != E; ++I) {
+ if (I->first != I->second) {
+ for (Value::use_iterator UI = I->first->use_begin(),
+ UE = I->first->use_end(); UI != UE; ) {
+ if (CallInst* CI = dyn_cast<CallInst>(*UI++))
+ UpgradeIntrinsicCall(CI, I->second);
+ }
+ if (!I->first->use_empty())
+ I->first->replaceAllUsesWith(I->second);
+ I->first->eraseFromParent();
+ }
+ }
+ std::vector<std::pair<Function*, Function*> >().swap(UpgradedIntrinsics);
+
+ return TheModule;
+}
+
+
+/// This method is provided by the parent ModuleProvider class and overridden
+/// here. It simply releases the module from its provider and frees up our
+/// state.
+/// @brief Release our hold on the generated module.
+Module *BitcodeReader::releaseModule(std::string *ErrInfo) {
+ // Since we're losing control of this Module, we must hand it back complete
+ Module *M = ModuleProvider::releaseModule(ErrInfo);
+ FreeState();
+ return M;
+}
+
+
+//===----------------------------------------------------------------------===//
+// External interface
+//===----------------------------------------------------------------------===//
+
+/// getBitcodeModuleProvider - lazy function-at-a-time loading from a file.
+///
+ModuleProvider *llvm::getBitcodeModuleProvider(MemoryBuffer *Buffer,
+ std::string *ErrMsg) {
+ BitcodeReader *R = new BitcodeReader(Buffer);
+ if (R->ParseBitcode()) {
+ if (ErrMsg)
+ *ErrMsg = R->getErrorString();
+
+ // Don't let the BitcodeReader dtor delete 'Buffer'.
+ R->releaseMemoryBuffer();
+ delete R;
+ return 0;
+ }
+ return R;
+}
+
+/// ParseBitcodeFile - Read the specified bitcode file, returning the module.
+/// If an error occurs, return null and fill in *ErrMsg if non-null.
+Module *llvm::ParseBitcodeFile(MemoryBuffer *Buffer, std::string *ErrMsg){
+ BitcodeReader *R;
+ R = static_cast<BitcodeReader*>(getBitcodeModuleProvider(Buffer, ErrMsg));
+ if (!R) return 0;
+
+ // Read in the entire module.
+ Module *M = R->materializeModule(ErrMsg);
+
+ // Don't let the BitcodeReader dtor delete 'Buffer', regardless of whether
+ // there was an error.
+ R->releaseMemoryBuffer();
+
+ // If there was no error, tell ModuleProvider not to delete it when its dtor
+ // is run.
+ if (M)
+ M = R->releaseModule(ErrMsg);
+
+ delete R;
+ return M;
+}
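+
+// Illustrative usage (not part of the original source), assuming 'Buf' is a
+// MemoryBuffer obtained elsewhere, e.g. from MemoryBuffer::getFile():
+//
+//   std::string Err;
+//   if (Module *M = ParseBitcodeFile(Buf, &Err)) {
+//     // ... use M; the caller now owns the Module ...
+//   } else {
+//     // ... report Err ...
+//   }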
diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
new file mode 100644
index 0000000..0dc470b
--- /dev/null
+++ b/lib/Bitcode/Reader/BitcodeReader.h
@@ -0,0 +1,214 @@
+//===- BitcodeReader.h - Internal BitcodeReader impl ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header defines the BitcodeReader class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BITCODE_READER_H
+#define BITCODE_READER_H
+
+#include "llvm/ModuleProvider.h"
+#include "llvm/Attributes.h"
+#include "llvm/Type.h"
+#include "llvm/OperandTraits.h"
+#include "llvm/Bitcode/BitstreamReader.h"
+#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/ADT/DenseMap.h"
+#include <vector>
+
+namespace llvm {
+ class MemoryBuffer;
+
+//===----------------------------------------------------------------------===//
+// BitcodeReaderValueList Class
+//===----------------------------------------------------------------------===//
+
+class BitcodeReaderValueList {
+ std::vector<WeakVH> ValuePtrs;
+
+ /// ResolveConstants - As we resolve forward-referenced constants, we add
+ /// information about them to this vector. This allows us to resolve them in
+ /// bulk instead of resolving each reference at a time. See the code in
+ /// ResolveConstantForwardRefs for more information about this.
+ ///
+ /// The key of this vector is the placeholder constant, the value is the slot
+ /// number that holds the resolved value.
+ typedef std::vector<std::pair<Constant*, unsigned> > ResolveConstantsTy;
+ ResolveConstantsTy ResolveConstants;
+public:
+ BitcodeReaderValueList() {}
+ ~BitcodeReaderValueList() {
+ assert(ResolveConstants.empty() && "Constants not resolved?");
+ }
+
+ // vector compatibility methods
+ unsigned size() const { return ValuePtrs.size(); }
+ void resize(unsigned N) { ValuePtrs.resize(N); }
+ void push_back(Value *V) {
+ ValuePtrs.push_back(V);
+ }
+
+ void clear() {
+ assert(ResolveConstants.empty() && "Constants not resolved?");
+ ValuePtrs.clear();
+ }
+
+ Value *operator[](unsigned i) const {
+ assert(i < ValuePtrs.size());
+ return ValuePtrs[i];
+ }
+
+ Value *back() const { return ValuePtrs.back(); }
+ void pop_back() { ValuePtrs.pop_back(); }
+ bool empty() const { return ValuePtrs.empty(); }
+ void shrinkTo(unsigned N) {
+ assert(N <= size() && "Invalid shrinkTo request!");
+ ValuePtrs.resize(N);
+ }
+
+ Constant *getConstantFwdRef(unsigned Idx, const Type *Ty);
+ Value *getValueFwdRef(unsigned Idx, const Type *Ty);
+
+ void AssignValue(Value *V, unsigned Idx);
+
+ /// ResolveConstantForwardRefs - Once all constants are read, this method bulk
+ /// resolves any forward references.
+ void ResolveConstantForwardRefs();
+};
+
+class BitcodeReader : public ModuleProvider {
+ MemoryBuffer *Buffer;
+ BitstreamReader StreamFile;
+ BitstreamCursor Stream;
+
+ const char *ErrorString;
+
+ std::vector<PATypeHolder> TypeList;
+ BitcodeReaderValueList ValueList;
+ std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInits;
+ std::vector<std::pair<GlobalAlias*, unsigned> > AliasInits;
+
+ /// MAttributes - The set of attributes by index. Index zero in the
+ /// file is for null, and is thus not represented here. As such all indices
+ /// are off by one.
+ std::vector<AttrListPtr> MAttributes;
+
+ /// FunctionBBs - While parsing a function body, this is a list of the basic
+ /// blocks for the function.
+ std::vector<BasicBlock*> FunctionBBs;
+
+ // When reading the module header, this list is populated with functions that
+ // have bodies later in the file.
+ std::vector<Function*> FunctionsWithBodies;
+
+ // When intrinsic functions are encountered which require upgrading they are
+ // stored here with their replacement function.
+ typedef std::vector<std::pair<Function*, Function*> > UpgradedIntrinsicMap;
+ UpgradedIntrinsicMap UpgradedIntrinsics;
+
+ // After the module header has been read, the FunctionsWithBodies list is
+ // reversed. This keeps track of whether we've done this yet.
+ bool HasReversedFunctionsWithBodies;
+
+ /// DeferredFunctionInfo - When function bodies are initially scanned, this
+  /// map contains info about where to find the deferred function body (in the
+ /// stream) and what linkage the original function had.
+ DenseMap<Function*, std::pair<uint64_t, unsigned> > DeferredFunctionInfo;
+public:
+ explicit BitcodeReader(MemoryBuffer *buffer)
+ : Buffer(buffer), ErrorString(0) {
+ HasReversedFunctionsWithBodies = false;
+ }
+ ~BitcodeReader() {
+ FreeState();
+ }
+
+ void FreeState();
+
+ /// releaseMemoryBuffer - This causes the reader to completely forget about
+  /// the memory buffer it contains, which prevents the buffer from being
+  /// destroyed when the reader is deleted.
+ void releaseMemoryBuffer() {
+ Buffer = 0;
+ }
+
+ virtual bool materializeFunction(Function *F, std::string *ErrInfo = 0);
+ virtual Module *materializeModule(std::string *ErrInfo = 0);
+ virtual void dematerializeFunction(Function *F);
+ virtual Module *releaseModule(std::string *ErrInfo = 0);
+
+ bool Error(const char *Str) {
+ ErrorString = Str;
+ return true;
+ }
+ const char *getErrorString() const { return ErrorString; }
+
+ /// @brief Main interface to parsing a bitcode buffer.
+ /// @returns true if an error occurred.
+ bool ParseBitcode();
+private:
+ const Type *getTypeByID(unsigned ID, bool isTypeTable = false);
+ Value *getFnValueByID(unsigned ID, const Type *Ty) {
+ return ValueList.getValueFwdRef(ID, Ty);
+ }
+ BasicBlock *getBasicBlock(unsigned ID) const {
+ if (ID >= FunctionBBs.size()) return 0; // Invalid ID
+ return FunctionBBs[ID];
+ }
+ AttrListPtr getAttributes(unsigned i) const {
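+    // Attribute index 0 means "no attributes"; for i == 0 the unsigned wrap
+    // makes i-1 huge, so the range check below rejects it as well.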
+ if (i-1 < MAttributes.size())
+ return MAttributes[i-1];
+ return AttrListPtr();
+ }
+
+ /// getValueTypePair - Read a value/type pair out of the specified record from
+ /// slot 'Slot'. Increment Slot past the number of slots used in the record.
+ /// Return true on failure.
+ bool getValueTypePair(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
+ unsigned InstNum, Value *&ResVal) {
+ if (Slot == Record.size()) return true;
+ unsigned ValNo = (unsigned)Record[Slot++];
+ if (ValNo < InstNum) {
+ // If this is not a forward reference, just return the value we already
+ // have.
+ ResVal = getFnValueByID(ValNo, 0);
+ return ResVal == 0;
+ } else if (Slot == Record.size()) {
+ return true;
+ }
+
+ unsigned TypeNo = (unsigned)Record[Slot++];
+ ResVal = getFnValueByID(ValNo, getTypeByID(TypeNo));
+ return ResVal == 0;
+ }
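+  // Note (added for clarity): a back-reference is encoded as a bare value
+  // number, while a forward reference carries an explicit [valno, typeno]
+  // pair, because the type of a not-yet-seen value cannot be inferred.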
+ bool getValue(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
+ const Type *Ty, Value *&ResVal) {
+ if (Slot == Record.size()) return true;
+ unsigned ValNo = (unsigned)Record[Slot++];
+ ResVal = getFnValueByID(ValNo, Ty);
+ return ResVal == 0;
+ }
+
+
+ bool ParseModule(const std::string &ModuleID);
+ bool ParseAttributeBlock();
+ bool ParseTypeTable();
+ bool ParseTypeSymbolTable();
+ bool ParseValueSymbolTable();
+ bool ParseConstants();
+ bool RememberAndSkipFunctionBody();
+ bool ParseFunctionBody(Function *F);
+ bool ResolveGlobalAndAliasInits();
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Bitcode/Reader/CMakeLists.txt b/lib/Bitcode/Reader/CMakeLists.txt
new file mode 100644
index 0000000..a19c79a
--- /dev/null
+++ b/lib/Bitcode/Reader/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_llvm_library(LLVMBitReader
+ BitReader.cpp
+ BitcodeReader.cpp
+ Deserialize.cpp
+ DeserializeAPFloat.cpp
+ DeserializeAPInt.cpp
+  )
\ No newline at end of file
diff --git a/lib/Bitcode/Reader/Deserialize.cpp b/lib/Bitcode/Reader/Deserialize.cpp
new file mode 100644
index 0000000..06da6ce
--- /dev/null
+++ b/lib/Bitcode/Reader/Deserialize.cpp
@@ -0,0 +1,454 @@
+//==- Deserialize.cpp - Generic Object Serialization to Bitcode --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the internal methods used for object serialization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/Deserialize.h"
+
+#ifdef DEBUG_BACKPATCH
+#include "llvm/Support/Streams.h"
+#endif
+
+using namespace llvm;
+
+Deserializer::Deserializer(BitstreamReader& stream)
+ : Stream(stream), RecIdx(0), FreeList(NULL), AbbrevNo(0), RecordCode(0) {
+
+ StreamStart = Stream.GetCurrentBitNo();
+}
+
+Deserializer::~Deserializer() {
+ assert (RecIdx >= Record.size() &&
+ "Still scanning bitcode record when deserialization completed.");
+
+#ifdef DEBUG_BACKPATCH
+ for (MapTy::iterator I=BPatchMap.begin(), E=BPatchMap.end(); I!=E; ++I)
+ assert (I->first.hasFinalPtr() &&
+ "Some pointers were not backpatched.");
+#endif
+}
+
+
+bool Deserializer::inRecord() {
+ if (Record.size() > 0) {
+ if (RecIdx >= Record.size()) {
+ RecIdx = 0;
+ Record.clear();
+ AbbrevNo = 0;
+ return false;
+ }
+ else
+ return true;
+ }
+
+ return false;
+}
+
+bool Deserializer::AdvanceStream() {
+ assert (!inRecord() &&
+ "Cannot advance stream. Still processing a record.");
+
+ if (AbbrevNo == bitc::ENTER_SUBBLOCK ||
+ AbbrevNo >= bitc::UNABBREV_RECORD)
+ return true;
+
+ while (!Stream.AtEndOfStream()) {
+
+ uint64_t Pos = Stream.GetCurrentBitNo();
+ AbbrevNo = Stream.ReadCode();
+
+ switch (AbbrevNo) {
+ case bitc::ENTER_SUBBLOCK: {
+ unsigned id = Stream.ReadSubBlockID();
+
+ // Determine the extent of the block. This is useful for jumping around
+      // the stream. This is a hack: we read the header of the block, save
+ // the length, and then revert the bitstream to a location just before
+ // the block is entered.
+ uint64_t BPos = Stream.GetCurrentBitNo();
+ Stream.ReadVBR(bitc::CodeLenWidth); // Skip the code size.
+ Stream.SkipToWord();
+ unsigned NumWords = Stream.Read(bitc::BlockSizeWidth);
+ Stream.JumpToBit(BPos);
+
+ BlockStack.push_back(Location(Pos,id,NumWords));
+ break;
+ }
+
+ case bitc::END_BLOCK: {
+ bool x = Stream.ReadBlockEnd();
+      assert(!x && "Error at block end."); (void) x;
+ BlockStack.pop_back();
+ continue;
+ }
+
+ case bitc::DEFINE_ABBREV:
+ Stream.ReadAbbrevRecord();
+ continue;
+
+ default:
+ break;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+void Deserializer::ReadRecord() {
+
+ while (AdvanceStream() && AbbrevNo == bitc::ENTER_SUBBLOCK) {
+ assert (!BlockStack.empty());
+ Stream.EnterSubBlock(BlockStack.back().BlockID);
+ AbbrevNo = 0;
+ }
+
+ if (Stream.AtEndOfStream())
+ return;
+
+ assert (Record.empty());
+ assert (AbbrevNo >= bitc::UNABBREV_RECORD);
+ RecordCode = Stream.ReadRecord(AbbrevNo,Record);
+ assert (Record.size() > 0);
+}
+
+void Deserializer::SkipBlock() {
+ assert (!inRecord());
+
+ if (AtEnd())
+ return;
+
+ AdvanceStream();
+
+ assert (AbbrevNo == bitc::ENTER_SUBBLOCK);
+ BlockStack.pop_back();
+ Stream.SkipBlock();
+
+ AbbrevNo = 0;
+ AdvanceStream();
+}
+
+bool Deserializer::SkipToBlock(unsigned BlockID) {
+ assert (!inRecord());
+
+ AdvanceStream();
+ assert (AbbrevNo == bitc::ENTER_SUBBLOCK);
+
+ unsigned BlockLevel = BlockStack.size();
+
+ while (!AtEnd() &&
+ BlockLevel == BlockStack.size() &&
+ getCurrentBlockID() != BlockID)
+ SkipBlock();
+
+ return !(AtEnd() || BlockLevel != BlockStack.size());
+}
+
+Deserializer::Location Deserializer::getCurrentBlockLocation() {
+ if (!inRecord())
+ AdvanceStream();
+
+ return BlockStack.back();
+}
+
+bool Deserializer::JumpTo(const Location& Loc) {
+
+ assert (!inRecord());
+
+ AdvanceStream();
+
+ assert (!BlockStack.empty() || AtEnd());
+
+ uint64_t LastBPos = StreamStart;
+
+ while (!BlockStack.empty()) {
+
+ LastBPos = BlockStack.back().BitNo;
+
+    // Determine whether the current block contains the location of the block
+ // we are looking for.
+ if (BlockStack.back().contains(Loc)) {
+ // We found the enclosing block. We must first POP it off to
+ // destroy any accumulated context within the block scope. We then
+ // jump to the position of the block and enter it.
+ Stream.JumpToBit(LastBPos);
+
+ if (BlockStack.size() == Stream.BlockScope.size())
+ Stream.PopBlockScope();
+
+ BlockStack.pop_back();
+
+ AbbrevNo = 0;
+ AdvanceStream();
+ assert (AbbrevNo == bitc::ENTER_SUBBLOCK);
+
+ Stream.EnterSubBlock(BlockStack.back().BlockID);
+ break;
+ }
+
+ // This block does not contain the block we are looking for. Pop it.
+ if (BlockStack.size() == Stream.BlockScope.size())
+ Stream.PopBlockScope();
+
+ BlockStack.pop_back();
+
+ }
+
+ // Check if we have popped our way to the outermost scope. If so,
+ // we need to adjust our position.
+ if (BlockStack.empty()) {
+ assert (Stream.BlockScope.empty());
+
+ Stream.JumpToBit(Loc.BitNo < LastBPos ? StreamStart : LastBPos);
+ AbbrevNo = 0;
+ AdvanceStream();
+ }
+
+ assert (AbbrevNo == bitc::ENTER_SUBBLOCK);
+ assert (!BlockStack.empty());
+
+ while (!AtEnd() && BlockStack.back() != Loc) {
+ if (BlockStack.back().contains(Loc)) {
+ Stream.EnterSubBlock(BlockStack.back().BlockID);
+ AbbrevNo = 0;
+ AdvanceStream();
+ continue;
+ }
+ else
+ SkipBlock();
+ }
+
+ if (AtEnd())
+ return false;
+
+ assert (BlockStack.back() == Loc);
+
+ return true;
+}
+
+void Deserializer::Rewind() {
+ while (!Stream.BlockScope.empty())
+ Stream.PopBlockScope();
+
+ while (!BlockStack.empty())
+ BlockStack.pop_back();
+
+ Stream.JumpToBit(StreamStart);
+ AbbrevNo = 0;
+}
+
+
+unsigned Deserializer::getCurrentBlockID() {
+ if (!inRecord())
+ AdvanceStream();
+
+ return BlockStack.back().BlockID;
+}
+
+unsigned Deserializer::getRecordCode() {
+ if (!inRecord()) {
+ AdvanceStream();
+ assert (AbbrevNo >= bitc::UNABBREV_RECORD);
+ ReadRecord();
+ }
+
+ return RecordCode;
+}
+
+bool Deserializer::FinishedBlock(Location BlockLoc) {
+ if (!inRecord())
+ AdvanceStream();
+
+ for (llvm::SmallVector<Location,8>::reverse_iterator
+ I=BlockStack.rbegin(), E=BlockStack.rend(); I!=E; ++I)
+ if (*I == BlockLoc)
+ return false;
+
+ return true;
+}
+
+unsigned Deserializer::getAbbrevNo() {
+ if (!inRecord())
+ AdvanceStream();
+
+ return AbbrevNo;
+}
+
+bool Deserializer::AtEnd() {
+ if (inRecord())
+ return false;
+
+ if (!AdvanceStream())
+ return true;
+
+ return false;
+}
+
+uint64_t Deserializer::ReadInt() {
+ // FIXME: Any error recovery/handling with incomplete or bad files?
+ if (!inRecord())
+ ReadRecord();
+
+ return Record[RecIdx++];
+}
+
+int64_t Deserializer::ReadSInt() {
+ uint64_t x = ReadInt();
+ int64_t magnitude = x >> 1;
+ return x & 0x1 ? -magnitude : magnitude;
+}
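+// Note: ReadSInt assumes the writer stored (magnitude << 1) | sign-bit, so,
+// for example, 10 (binary 1010) decodes to +5 and 11 (1011) decodes to -5.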
+
+char* Deserializer::ReadCStr(char* cstr, unsigned MaxLen, bool isNullTerm) {
+ if (cstr == NULL)
+ MaxLen = 0; // Zero this just in case someone does something funny.
+
+ unsigned len = ReadInt();
+
+ assert (MaxLen == 0 || (len + (isNullTerm ? 1 : 0)) <= MaxLen);
+
+ if (!cstr)
+ cstr = new char[len + (isNullTerm ? 1 : 0)];
+
+ assert (cstr != NULL);
+
+ for (unsigned i = 0; i < len; ++i)
+ cstr[i] = (char) ReadInt();
+
+ if (isNullTerm)
+ cstr[len] = '\0';
+
+ return cstr;
+}
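+// Example (hypothetical caller): passing a null buffer makes ReadCStr
+// heap-allocate the result, which the caller then owns:
+//   char* s = D.ReadCStr(NULL, 0, true);
+//   ... use s ...
+//   delete [] s;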
+
+void Deserializer::ReadCStr(std::vector<char>& buff, bool isNullTerm,
+ unsigned Idx) {
+
+ unsigned len = ReadInt();
+
+  // If Idx is beyond the current buffer size, clamp it so that it refers to
+  // the position just past the last element.
+ if (Idx > buff.size())
+ Idx = buff.size();
+
+ buff.reserve(len+Idx);
+ buff.resize(Idx);
+
+ for (unsigned i = 0; i < len; ++i)
+ buff.push_back((char) ReadInt());
+
+ if (isNullTerm)
+ buff.push_back('\0');
+}
+
+void Deserializer::RegisterPtr(const SerializedPtrID& PtrId,
+ const void* Ptr) {
+
+ MapTy::value_type& E = BPatchMap.FindAndConstruct(BPKey(PtrId));
+
+ assert (!HasFinalPtr(E) && "Pointer already registered.");
+
+#ifdef DEBUG_BACKPATCH
+ llvm::cerr << "RegisterPtr: " << PtrId << " => " << Ptr << "\n";
+#endif
+
+ SetPtr(E,Ptr);
+}
+
+void Deserializer::ReadUIntPtr(uintptr_t& PtrRef,
+ const SerializedPtrID& PtrId,
+ bool AllowBackpatch) {
+ if (PtrId == 0) {
+ PtrRef = 0;
+ return;
+ }
+
+ MapTy::value_type& E = BPatchMap.FindAndConstruct(BPKey(PtrId));
+
+ if (HasFinalPtr(E)) {
+ PtrRef = GetFinalPtr(E);
+
+#ifdef DEBUG_BACKPATCH
+ llvm::cerr << "ReadUintPtr: " << PtrId
+ << " <-- " << (void*) GetFinalPtr(E) << '\n';
+#endif
+ }
+ else {
+ assert (AllowBackpatch &&
+ "Client forbids backpatching for this pointer.");
+
+#ifdef DEBUG_BACKPATCH
+ llvm::cerr << "ReadUintPtr: " << PtrId << " (NO PTR YET)\n";
+#endif
+
+ // Register backpatch. Check the freelist for a BPNode.
+ BPNode* N;
+
+ if (FreeList) {
+ N = FreeList;
+ FreeList = FreeList->Next;
+ }
+ else // No available BPNode. Allocate one.
+ N = (BPNode*) Allocator.Allocate<BPNode>();
+
+ new (N) BPNode(GetBPNode(E),PtrRef);
+ SetBPNode(E,N);
+ }
+}
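+// Illustration of the backpatch flow above: if object #7 is referenced before
+// it has been deserialized, ReadUIntPtr chains a BPNode recording &PtrRef;
+// when RegisterPtr(7, Ptr) later runs, BPEntry::SetPtr (below) walks that
+// chain, fills in every recorded reference, and returns the nodes to the
+// freelist.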
+
+uintptr_t Deserializer::ReadInternalRefPtr() {
+ SerializedPtrID PtrId = ReadPtrID();
+
+  assert (PtrId != 0 && "References cannot refer to the NULL address.");
+
+ MapTy::value_type& E = BPatchMap.FindAndConstruct(BPKey(PtrId));
+
+  assert (HasFinalPtr(E) &&
+          "Cannot backpatch references. Object must already be deserialized.");
+
+ return GetFinalPtr(E);
+}
+
+void Deserializer::BPEntry::SetPtr(BPNode*& FreeList, void* P) {
+ BPNode* Last = NULL;
+
+ for (BPNode* N = Head; N != NULL; N=N->Next) {
+ Last = N;
+ N->PtrRef |= reinterpret_cast<uintptr_t>(P);
+ }
+
+ if (Last) {
+ Last->Next = FreeList;
+ FreeList = Head;
+ }
+
+ Ptr = const_cast<void*>(P);
+}
+
+
+#define INT_READ(TYPE)\
+void SerializeTrait<TYPE>::Read(Deserializer& D, TYPE& X) {\
+ X = (TYPE) D.ReadInt(); }
+
+INT_READ(bool)
+INT_READ(unsigned char)
+INT_READ(unsigned short)
+INT_READ(unsigned int)
+INT_READ(unsigned long)
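+// For instance, INT_READ(bool) above expands to:
+//   void SerializeTrait<bool>::Read(Deserializer& D, bool& X) {
+//     X = (bool) D.ReadInt(); }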
+
+#define SINT_READ(TYPE)\
+void SerializeTrait<TYPE>::Read(Deserializer& D, TYPE& X) {\
+ X = (TYPE) D.ReadSInt(); }
+
+// Signed types must go through SINT_READ so that ReadSInt undoes the
+// sign-bit encoding used when they were written.
+SINT_READ(signed char)
+SINT_READ(signed short)
+SINT_READ(signed int)
+SINT_READ(signed long)
diff --git a/lib/Bitcode/Reader/DeserializeAPFloat.cpp b/lib/Bitcode/Reader/DeserializeAPFloat.cpp
new file mode 100644
index 0000000..ee24b68
--- /dev/null
+++ b/lib/Bitcode/Reader/DeserializeAPFloat.cpp
@@ -0,0 +1,24 @@
+//===-- DeserializeAPFloat.cpp - Deserialization for APFloat ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements deserialization of APFloat.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/Bitcode/Deserialize.h"
+
+using namespace llvm;
+
+APFloat APFloat::ReadVal(Deserializer& D) {
+ APInt x;
+ D.Read(x);
+ return APFloat(x);
+}
+
diff --git a/lib/Bitcode/Reader/DeserializeAPInt.cpp b/lib/Bitcode/Reader/DeserializeAPInt.cpp
new file mode 100644
index 0000000..1b5b2bf
--- /dev/null
+++ b/lib/Bitcode/Reader/DeserializeAPInt.cpp
@@ -0,0 +1,33 @@
+//===-- DeserializeAPInt.cpp - Deserialization for APInts ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements deserialization of APInts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/Bitcode/Deserialize.h"
+#include <cassert>
+
+using namespace llvm;
+
+void APInt::Read(Deserializer& D) {
+ BitWidth = D.ReadInt();
+
+ if (isSingleWord())
+ VAL = D.ReadInt();
+ else {
+ uint32_t NumWords = D.ReadInt();
+ assert (NumWords > 1);
+ pVal = new uint64_t[NumWords];
+ assert (pVal && "Allocation in deserialization of APInt failed.");
+ for (unsigned i = 0; i < NumWords; ++i)
+ pVal[i] = D.ReadInt();
+ }
+}
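+// Wire layout consumed above: [BitWidth, VAL] for widths <= 64 bits, else
+// [BitWidth, NumWords, word0, word1, ...]; e.g. a 128-bit APInt arrives as
+// two 64-bit words after the two leading integers.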
diff --git a/lib/Bitcode/Reader/Makefile b/lib/Bitcode/Reader/Makefile
new file mode 100644
index 0000000..59af8d53
--- /dev/null
+++ b/lib/Bitcode/Reader/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Bitcode/Reader/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMBitReader
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Bitcode/Writer/BitWriter.cpp b/lib/Bitcode/Writer/BitWriter.cpp
new file mode 100644
index 0000000..8834964
--- /dev/null
+++ b/lib/Bitcode/Writer/BitWriter.cpp
@@ -0,0 +1,58 @@
+//===-- BitWriter.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/BitWriter.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include <fstream>
+
+using namespace llvm;
+
+
+/*===-- Operations on modules ---------------------------------------------===*/
+
+int LLVMWriteBitcodeToFile(LLVMModuleRef M, const char *Path) {
+ std::ofstream OS(Path, std::ios_base::out|std::ios::trunc|std::ios::binary);
+
+ if (!OS.fail())
+ WriteBitcodeToFile(unwrap(M), OS);
+
+ if (OS.fail())
+ return -1;
+
+ return 0;
+}
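+/* Example (hypothetical C client); a zero return indicates success:
+ *   if (LLVMWriteBitcodeToFile(Mod, "out.bc") != 0)
+ *     fprintf(stderr, "bitcode write failed\n");
+ */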
+
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#include <ext/stdio_filebuf.h>
+
+// FIXME: Control this with configure? Provide some portable abstraction in
+// libSystem? As is, the user will just get a linker error if they use this on
+// non-GCC. Some C++ stdlibs even have ofstream::ofstream(int fd).
+int LLVMWriteBitcodeToFileHandle(LLVMModuleRef M, int FileHandle) {
+ __gnu_cxx::stdio_filebuf<char> Buffer(FileHandle, std::ios_base::out |
+ std::ios::trunc |
+ std::ios::binary);
+ std::ostream OS(&Buffer);
+
+ if (!OS.fail())
+ WriteBitcodeToFile(unwrap(M), OS);
+
+ if (OS.fail())
+ return -1;
+
+ return 0;
+}
+
+#else
+
+int LLVMWriteBitcodeToFileHandle(LLVMModuleRef M, int FileHandle) {
+ return -1; // Not supported.
+}
+
+#endif
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
new file mode 100644
index 0000000..bfc029c
--- /dev/null
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -0,0 +1,1449 @@
+//===--- Bitcode/Writer/BitcodeWriter.cpp - Bitcode Writer ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Bitcode writer implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Bitcode/BitstreamWriter.h"
+#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "ValueEnumerator.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/MDNode.h"
+#include "llvm/Module.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Program.h"
+using namespace llvm;
+
+/// These are manifest constants used by the bitcode writer. They do not need to
+/// be kept in sync with the reader, but need to be consistent within this file.
+enum {
+ CurVersion = 0,
+
+ // VALUE_SYMTAB_BLOCK abbrev id's.
+ VST_ENTRY_8_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+ VST_ENTRY_7_ABBREV,
+ VST_ENTRY_6_ABBREV,
+ VST_BBENTRY_6_ABBREV,
+
+ // CONSTANTS_BLOCK abbrev id's.
+ CONSTANTS_SETTYPE_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+ CONSTANTS_INTEGER_ABBREV,
+ CONSTANTS_CE_CAST_Abbrev,
+ CONSTANTS_NULL_Abbrev,
+
+ // FUNCTION_BLOCK abbrev id's.
+ FUNCTION_INST_LOAD_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+ FUNCTION_INST_BINOP_ABBREV,
+ FUNCTION_INST_CAST_ABBREV,
+ FUNCTION_INST_RET_VOID_ABBREV,
+ FUNCTION_INST_RET_VAL_ABBREV,
+ FUNCTION_INST_UNREACHABLE_ABBREV
+};
+
+
+static unsigned GetEncodedCastOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default: assert(0 && "Unknown cast instruction!");
+ case Instruction::Trunc : return bitc::CAST_TRUNC;
+ case Instruction::ZExt : return bitc::CAST_ZEXT;
+ case Instruction::SExt : return bitc::CAST_SEXT;
+ case Instruction::FPToUI : return bitc::CAST_FPTOUI;
+ case Instruction::FPToSI : return bitc::CAST_FPTOSI;
+ case Instruction::UIToFP : return bitc::CAST_UITOFP;
+ case Instruction::SIToFP : return bitc::CAST_SITOFP;
+ case Instruction::FPTrunc : return bitc::CAST_FPTRUNC;
+ case Instruction::FPExt : return bitc::CAST_FPEXT;
+ case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
+ case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
+ case Instruction::BitCast : return bitc::CAST_BITCAST;
+ }
+}
+
+static unsigned GetEncodedBinaryOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default: assert(0 && "Unknown binary instruction!");
+ case Instruction::Add: return bitc::BINOP_ADD;
+ case Instruction::Sub: return bitc::BINOP_SUB;
+ case Instruction::Mul: return bitc::BINOP_MUL;
+ case Instruction::UDiv: return bitc::BINOP_UDIV;
+ case Instruction::FDiv:
+ case Instruction::SDiv: return bitc::BINOP_SDIV;
+ case Instruction::URem: return bitc::BINOP_UREM;
+ case Instruction::FRem:
+ case Instruction::SRem: return bitc::BINOP_SREM;
+ case Instruction::Shl: return bitc::BINOP_SHL;
+ case Instruction::LShr: return bitc::BINOP_LSHR;
+ case Instruction::AShr: return bitc::BINOP_ASHR;
+ case Instruction::And: return bitc::BINOP_AND;
+ case Instruction::Or: return bitc::BINOP_OR;
+ case Instruction::Xor: return bitc::BINOP_XOR;
+ }
+}
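+// Note: FDiv and FRem fall through to the SDIV/SREM codes above; the reader
+// is expected to disambiguate them by the operand type (FP vs. integer).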
+
+
+
+static void WriteStringRecord(unsigned Code, const std::string &Str,
+ unsigned AbbrevToUse, BitstreamWriter &Stream) {
+ SmallVector<unsigned, 64> Vals;
+
+ // Code: [strchar x N]
+ for (unsigned i = 0, e = Str.size(); i != e; ++i)
+ Vals.push_back(Str[i]);
+
+ // Emit the finished record.
+ Stream.EmitRecord(Code, Vals, AbbrevToUse);
+}
+
+// Emit information about parameter attributes.
+static void WriteAttributeTable(const ValueEnumerator &VE,
+ BitstreamWriter &Stream) {
+ const std::vector<AttrListPtr> &Attrs = VE.getAttributes();
+ if (Attrs.empty()) return;
+
+ Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3);
+
+ SmallVector<uint64_t, 64> Record;
+ for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
+ const AttrListPtr &A = Attrs[i];
+ for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) {
+ const AttributeWithIndex &PAWI = A.getSlot(i);
+ Record.push_back(PAWI.Index);
+
+ // FIXME: remove in LLVM 3.0
+ // Store the alignment in the bitcode as a 16-bit raw value instead of a
+ // 5-bit log2 encoded value. Shift the bits above the alignment up by
+ // 11 bits.
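+      // Concretely, (1ull<<16) << (field-1) plants 2^(field-1) into the
+      // 16-bit slot at bit 16; assuming the usual log2(align)+1 field
+      // encoding, a field value of 5 becomes a raw alignment of 16.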
+ uint64_t FauxAttr = PAWI.Attrs & 0xffff;
+ if (PAWI.Attrs & Attribute::Alignment)
+ FauxAttr |= (1ull<<16)<<(((PAWI.Attrs & Attribute::Alignment)-1) >> 16);
+ FauxAttr |= (PAWI.Attrs & (0x3FFull << 21)) << 11;
+
+ Record.push_back(FauxAttr);
+ }
+
+ Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record);
+ Record.clear();
+ }
+
+ Stream.ExitBlock();
+}
+
+/// WriteTypeTable - Write out the type table for a module.
+static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
+ const ValueEnumerator::TypeList &TypeList = VE.getTypes();
+
+ Stream.EnterSubblock(bitc::TYPE_BLOCK_ID, 4 /*count from # abbrevs */);
+ SmallVector<uint64_t, 64> TypeVals;
+
+ // Abbrev for TYPE_CODE_POINTER.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(VE.getTypes().size()+1)));
+ Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0
+ unsigned PtrAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for TYPE_CODE_FUNCTION.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg
+ Abbv->Add(BitCodeAbbrevOp(0)); // FIXME: DEAD value, remove in LLVM 3.0
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(VE.getTypes().size()+1)));
+ unsigned FunctionAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for TYPE_CODE_STRUCT.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(VE.getTypes().size()+1)));
+ unsigned StructAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for TYPE_CODE_ARRAY.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(VE.getTypes().size()+1)));
+ unsigned ArrayAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Emit an entry count so the reader can reserve space.
+ TypeVals.push_back(TypeList.size());
+ Stream.EmitRecord(bitc::TYPE_CODE_NUMENTRY, TypeVals);
+ TypeVals.clear();
+
+ // Loop over all of the types, emitting each in turn.
+ for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
+ const Type *T = TypeList[i].first;
+ int AbbrevToUse = 0;
+ unsigned Code = 0;
+
+ switch (T->getTypeID()) {
+ default: assert(0 && "Unknown type!");
+ case Type::VoidTyID: Code = bitc::TYPE_CODE_VOID; break;
+ case Type::FloatTyID: Code = bitc::TYPE_CODE_FLOAT; break;
+ case Type::DoubleTyID: Code = bitc::TYPE_CODE_DOUBLE; break;
+ case Type::X86_FP80TyID: Code = bitc::TYPE_CODE_X86_FP80; break;
+ case Type::FP128TyID: Code = bitc::TYPE_CODE_FP128; break;
+ case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break;
+ case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break;
+ case Type::OpaqueTyID: Code = bitc::TYPE_CODE_OPAQUE; break;
+ case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break;
+ case Type::IntegerTyID:
+ // INTEGER: [width]
+ Code = bitc::TYPE_CODE_INTEGER;
+ TypeVals.push_back(cast<IntegerType>(T)->getBitWidth());
+ break;
+ case Type::PointerTyID: {
+ const PointerType *PTy = cast<PointerType>(T);
+ // POINTER: [pointee type, address space]
+ Code = bitc::TYPE_CODE_POINTER;
+ TypeVals.push_back(VE.getTypeID(PTy->getElementType()));
+ unsigned AddressSpace = PTy->getAddressSpace();
+ TypeVals.push_back(AddressSpace);
+ if (AddressSpace == 0) AbbrevToUse = PtrAbbrev;
+ break;
+ }
+ case Type::FunctionTyID: {
+ const FunctionType *FT = cast<FunctionType>(T);
+ // FUNCTION: [isvararg, attrid, retty, paramty x N]
+ Code = bitc::TYPE_CODE_FUNCTION;
+ TypeVals.push_back(FT->isVarArg());
+ TypeVals.push_back(0); // FIXME: DEAD: remove in llvm 3.0
+ TypeVals.push_back(VE.getTypeID(FT->getReturnType()));
+ for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i)
+ TypeVals.push_back(VE.getTypeID(FT->getParamType(i)));
+ AbbrevToUse = FunctionAbbrev;
+ break;
+ }
+ case Type::StructTyID: {
+ const StructType *ST = cast<StructType>(T);
+ // STRUCT: [ispacked, eltty x N]
+ Code = bitc::TYPE_CODE_STRUCT;
+ TypeVals.push_back(ST->isPacked());
+ // Output all of the element types.
+ for (StructType::element_iterator I = ST->element_begin(),
+ E = ST->element_end(); I != E; ++I)
+ TypeVals.push_back(VE.getTypeID(*I));
+ AbbrevToUse = StructAbbrev;
+ break;
+ }
+ case Type::ArrayTyID: {
+ const ArrayType *AT = cast<ArrayType>(T);
+ // ARRAY: [numelts, eltty]
+ Code = bitc::TYPE_CODE_ARRAY;
+ TypeVals.push_back(AT->getNumElements());
+ TypeVals.push_back(VE.getTypeID(AT->getElementType()));
+ AbbrevToUse = ArrayAbbrev;
+ break;
+ }
+ case Type::VectorTyID: {
+ const VectorType *VT = cast<VectorType>(T);
+ // VECTOR [numelts, eltty]
+ Code = bitc::TYPE_CODE_VECTOR;
+ TypeVals.push_back(VT->getNumElements());
+ TypeVals.push_back(VE.getTypeID(VT->getElementType()));
+ break;
+ }
+ }
+
+ // Emit the finished record.
+ Stream.EmitRecord(Code, TypeVals, AbbrevToUse);
+ TypeVals.clear();
+ }
+
+ Stream.ExitBlock();
+}
+
+static unsigned getEncodedLinkage(const GlobalValue *GV) {
+ switch (GV->getLinkage()) {
+ default: assert(0 && "Invalid linkage!");
+ case GlobalValue::GhostLinkage: // Map ghost linkage onto external.
+ case GlobalValue::ExternalLinkage: return 0;
+ case GlobalValue::WeakAnyLinkage: return 1;
+ case GlobalValue::AppendingLinkage: return 2;
+ case GlobalValue::InternalLinkage: return 3;
+ case GlobalValue::LinkOnceAnyLinkage: return 4;
+ case GlobalValue::DLLImportLinkage: return 5;
+ case GlobalValue::DLLExportLinkage: return 6;
+ case GlobalValue::ExternalWeakLinkage: return 7;
+ case GlobalValue::CommonLinkage: return 8;
+ case GlobalValue::PrivateLinkage: return 9;
+ case GlobalValue::WeakODRLinkage: return 10;
+ case GlobalValue::LinkOnceODRLinkage: return 11;
+ case GlobalValue::AvailableExternallyLinkage: return 12;
+ }
+}
+
+static unsigned getEncodedVisibility(const GlobalValue *GV) {
+ switch (GV->getVisibility()) {
+ default: assert(0 && "Invalid visibility!");
+ case GlobalValue::DefaultVisibility: return 0;
+ case GlobalValue::HiddenVisibility: return 1;
+ case GlobalValue::ProtectedVisibility: return 2;
+ }
+}
+
+// Emit top-level description of module, including target triple, inline asm,
+// descriptors for global variables, and function prototype info.
+static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
+ BitstreamWriter &Stream) {
+ // Emit the list of dependent libraries for the Module.
+ for (Module::lib_iterator I = M->lib_begin(), E = M->lib_end(); I != E; ++I)
+ WriteStringRecord(bitc::MODULE_CODE_DEPLIB, *I, 0/*TODO*/, Stream);
+
+ // Emit various pieces of data attached to a module.
+ if (!M->getTargetTriple().empty())
+ WriteStringRecord(bitc::MODULE_CODE_TRIPLE, M->getTargetTriple(),
+ 0/*TODO*/, Stream);
+ if (!M->getDataLayout().empty())
+ WriteStringRecord(bitc::MODULE_CODE_DATALAYOUT, M->getDataLayout(),
+ 0/*TODO*/, Stream);
+ if (!M->getModuleInlineAsm().empty())
+ WriteStringRecord(bitc::MODULE_CODE_ASM, M->getModuleInlineAsm(),
+ 0/*TODO*/, Stream);
+
+ // Emit information about sections and GC, computing how many there are. Also
+ // compute the maximum alignment value.
+ std::map<std::string, unsigned> SectionMap;
+ std::map<std::string, unsigned> GCMap;
+ unsigned MaxAlignment = 0;
+ unsigned MaxGlobalType = 0;
+ for (Module::const_global_iterator GV = M->global_begin(),E = M->global_end();
+ GV != E; ++GV) {
+ MaxAlignment = std::max(MaxAlignment, GV->getAlignment());
+ MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV->getType()));
+
+ if (!GV->hasSection()) continue;
+ // Give section names unique ID's.
+ unsigned &Entry = SectionMap[GV->getSection()];
+ if (Entry != 0) continue;
+ WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV->getSection(),
+ 0/*TODO*/, Stream);
+ Entry = SectionMap.size();
+ }
+ for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
+ MaxAlignment = std::max(MaxAlignment, F->getAlignment());
+ if (F->hasSection()) {
+ // Give section names unique ID's.
+ unsigned &Entry = SectionMap[F->getSection()];
+ if (!Entry) {
+ WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, F->getSection(),
+ 0/*TODO*/, Stream);
+ Entry = SectionMap.size();
+ }
+ }
+ if (F->hasGC()) {
+ // Same for GC names.
+ unsigned &Entry = GCMap[F->getGC()];
+ if (!Entry) {
+ WriteStringRecord(bitc::MODULE_CODE_GCNAME, F->getGC(),
+ 0/*TODO*/, Stream);
+ Entry = GCMap.size();
+ }
+ }
+ }
+
+ // Emit abbrev for globals, now that we know # sections and max alignment.
+ unsigned SimpleGVarAbbrev = 0;
+ if (!M->global_empty()) {
+ // Add an abbrev for common globals with no visibility or thread localness.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_GLOBALVAR));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(MaxGlobalType+1)));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Constant.
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Initializer.
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // Linkage.
+ if (MaxAlignment == 0) // Alignment.
+ Abbv->Add(BitCodeAbbrevOp(0));
+ else {
+ unsigned MaxEncAlignment = Log2_32(MaxAlignment)+1;
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(MaxEncAlignment+1)));
+ }
+ if (SectionMap.empty()) // Section.
+ Abbv->Add(BitCodeAbbrevOp(0));
+ else
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(SectionMap.size()+1)));
+ // Don't bother emitting vis + thread local.
+ SimpleGVarAbbrev = Stream.EmitAbbrev(Abbv);
+ }
+
+ // Emit the global variable information.
+ SmallVector<unsigned, 64> Vals;
+ for (Module::const_global_iterator GV = M->global_begin(),E = M->global_end();
+ GV != E; ++GV) {
+ unsigned AbbrevToUse = 0;
+
+ // GLOBALVAR: [type, isconst, initid,
+ // linkage, alignment, section, visibility, threadlocal]
+ Vals.push_back(VE.getTypeID(GV->getType()));
+ Vals.push_back(GV->isConstant());
+ Vals.push_back(GV->isDeclaration() ? 0 :
+ (VE.getValueID(GV->getInitializer()) + 1));
+ Vals.push_back(getEncodedLinkage(GV));
+ Vals.push_back(Log2_32(GV->getAlignment())+1);
+ Vals.push_back(GV->hasSection() ? SectionMap[GV->getSection()] : 0);
+ if (GV->isThreadLocal() ||
+ GV->getVisibility() != GlobalValue::DefaultVisibility) {
+ Vals.push_back(getEncodedVisibility(GV));
+ Vals.push_back(GV->isThreadLocal());
+ } else {
+ AbbrevToUse = SimpleGVarAbbrev;
+ }
+
+ Stream.EmitRecord(bitc::MODULE_CODE_GLOBALVAR, Vals, AbbrevToUse);
+ Vals.clear();
+ }
+
+ // Emit the function proto information.
+ for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
+ // FUNCTION: [type, callingconv, isproto, paramattr,
+ // linkage, alignment, section, visibility, gc]
+ Vals.push_back(VE.getTypeID(F->getType()));
+ Vals.push_back(F->getCallingConv());
+ Vals.push_back(F->isDeclaration());
+ Vals.push_back(getEncodedLinkage(F));
+ Vals.push_back(VE.getAttributeID(F->getAttributes()));
+ Vals.push_back(Log2_32(F->getAlignment())+1);
+ Vals.push_back(F->hasSection() ? SectionMap[F->getSection()] : 0);
+ Vals.push_back(getEncodedVisibility(F));
+ Vals.push_back(F->hasGC() ? GCMap[F->getGC()] : 0);
+
+ unsigned AbbrevToUse = 0;
+ Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse);
+ Vals.clear();
+ }
+
+
+ // Emit the alias information.
+ for (Module::const_alias_iterator AI = M->alias_begin(), E = M->alias_end();
+ AI != E; ++AI) {
+ Vals.push_back(VE.getTypeID(AI->getType()));
+ Vals.push_back(VE.getValueID(AI->getAliasee()));
+ Vals.push_back(getEncodedLinkage(AI));
+ Vals.push_back(getEncodedVisibility(AI));
+ unsigned AbbrevToUse = 0;
+ Stream.EmitRecord(bitc::MODULE_CODE_ALIAS, Vals, AbbrevToUse);
+ Vals.clear();
+ }
+}
+
+
+static void WriteConstants(unsigned FirstVal, unsigned LastVal,
+ const ValueEnumerator &VE,
+ BitstreamWriter &Stream, bool isGlobal) {
+ if (FirstVal == LastVal) return;
+
+ Stream.EnterSubblock(bitc::CONSTANTS_BLOCK_ID, 4);
+
+ unsigned AggregateAbbrev = 0;
+ unsigned String8Abbrev = 0;
+ unsigned CString7Abbrev = 0;
+ unsigned CString6Abbrev = 0;
+ unsigned MDString8Abbrev = 0;
+ unsigned MDString6Abbrev = 0;
+ // If this is a constant pool for the module, emit module-specific abbrevs.
+ if (isGlobal) {
+ // Abbrev for CST_CODE_AGGREGATE.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_AGGREGATE));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal+1)));
+ AggregateAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for CST_CODE_STRING.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_STRING));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+ String8Abbrev = Stream.EmitAbbrev(Abbv);
+ // Abbrev for CST_CODE_CSTRING.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
+ CString7Abbrev = Stream.EmitAbbrev(Abbv);
+ // Abbrev for CST_CODE_CSTRING.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+ CString6Abbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for CST_CODE_MDSTRING.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_MDSTRING));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+ MDString8Abbrev = Stream.EmitAbbrev(Abbv);
+ // Abbrev for CST_CODE_MDSTRING.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_MDSTRING));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+ MDString6Abbrev = Stream.EmitAbbrev(Abbv);
+ }
+
+ SmallVector<uint64_t, 64> Record;
+
+ const ValueEnumerator::ValueList &Vals = VE.getValues();
+ const Type *LastTy = 0;
+ for (unsigned i = FirstVal; i != LastVal; ++i) {
+ const Value *V = Vals[i].first;
+ // If we need to switch types, do so now.
+ if (V->getType() != LastTy) {
+ LastTy = V->getType();
+ Record.push_back(VE.getTypeID(LastTy));
+ Stream.EmitRecord(bitc::CST_CODE_SETTYPE, Record,
+ CONSTANTS_SETTYPE_ABBREV);
+ Record.clear();
+ }
+
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+ Record.push_back(unsigned(IA->hasSideEffects()));
+
+ // Add the asm string.
+ const std::string &AsmStr = IA->getAsmString();
+ Record.push_back(AsmStr.size());
+ for (unsigned i = 0, e = AsmStr.size(); i != e; ++i)
+ Record.push_back(AsmStr[i]);
+
+ // Add the constraint string.
+ const std::string &ConstraintStr = IA->getConstraintString();
+ Record.push_back(ConstraintStr.size());
+ for (unsigned i = 0, e = ConstraintStr.size(); i != e; ++i)
+ Record.push_back(ConstraintStr[i]);
+ Stream.EmitRecord(bitc::CST_CODE_INLINEASM, Record);
+ Record.clear();
+ continue;
+ }
+ const Constant *C = cast<Constant>(V);
+ unsigned Code = -1U;
+ unsigned AbbrevToUse = 0;
+ if (C->isNullValue()) {
+ Code = bitc::CST_CODE_NULL;
+ } else if (isa<UndefValue>(C)) {
+ Code = bitc::CST_CODE_UNDEF;
+ } else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) {
+ if (IV->getBitWidth() <= 64) {
+ int64_t V = IV->getSExtValue();
+ if (V >= 0)
+ Record.push_back(V << 1);
+ else
+ Record.push_back((-V << 1) | 1);
+ Code = bitc::CST_CODE_INTEGER;
+ AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
+ } else { // Wide integers, > 64 bits in size.
+ // We have an arbitrary precision integer value to write whose
+ // bit width is > 64. However, in canonical unsigned integer
+ // format it is likely that the high bits are going to be zero.
+ // So, we only write the number of active words.
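+        // Each active word is emitted below with the same sign-bit-in-LSB
+        // scheme used for CST_CODE_INTEGER, so the reader can reassemble
+        // the full-width value.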
+ unsigned NWords = IV->getValue().getActiveWords();
+ const uint64_t *RawWords = IV->getValue().getRawData();
+ for (unsigned i = 0; i != NWords; ++i) {
+ int64_t V = RawWords[i];
+ if (V >= 0)
+ Record.push_back(V << 1);
+ else
+ Record.push_back((-V << 1) | 1);
+ }
+ Code = bitc::CST_CODE_WIDE_INTEGER;
+ }
+ } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+ Code = bitc::CST_CODE_FLOAT;
+ const Type *Ty = CFP->getType();
+ if (Ty == Type::FloatTy || Ty == Type::DoubleTy) {
+ Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+ } else if (Ty == Type::X86_FP80Ty) {
+        // Keep the APInt in a local ("api") so the storage it owns stays
+        // alive while we read its raw words. The bits are not in the same
+        // order as a normal i80 APInt; compensate below.
+ APInt api = CFP->getValueAPF().bitcastToAPInt();
+ const uint64_t *p = api.getRawData();
+ Record.push_back((p[1] << 48) | (p[0] >> 16));
+ Record.push_back(p[0] & 0xffffLL);
+ } else if (Ty == Type::FP128Ty || Ty == Type::PPC_FP128Ty) {
+ APInt api = CFP->getValueAPF().bitcastToAPInt();
+ const uint64_t *p = api.getRawData();
+ Record.push_back(p[0]);
+ Record.push_back(p[1]);
+ } else {
+ assert (0 && "Unknown FP type!");
+ }
+ } else if (isa<ConstantArray>(C) && cast<ConstantArray>(C)->isString()) {
+ // Emit constant strings specially.
+ unsigned NumOps = C->getNumOperands();
+ // If this is a null-terminated string, use the denser CSTRING encoding.
+ if (C->getOperand(NumOps-1)->isNullValue()) {
+ Code = bitc::CST_CODE_CSTRING;
+ --NumOps; // Don't encode the null, which isn't allowed by char6.
+ } else {
+ Code = bitc::CST_CODE_STRING;
+ AbbrevToUse = String8Abbrev;
+ }
+ bool isCStr7 = Code == bitc::CST_CODE_CSTRING;
+ bool isCStrChar6 = Code == bitc::CST_CODE_CSTRING;
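+      // Char6 can only pack [a-zA-Z0-9._]; 7-bit handles any ASCII byte.
+      // Scan the characters below and fall back to a wider encoding if
+      // needed.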
+ for (unsigned i = 0; i != NumOps; ++i) {
+ unsigned char V = cast<ConstantInt>(C->getOperand(i))->getZExtValue();
+ Record.push_back(V);
+ isCStr7 &= (V & 128) == 0;
+ if (isCStrChar6)
+ isCStrChar6 = BitCodeAbbrevOp::isChar6(V);
+ }
+
+ if (isCStrChar6)
+ AbbrevToUse = CString6Abbrev;
+ else if (isCStr7)
+ AbbrevToUse = CString7Abbrev;
+ } else if (isa<ConstantArray>(C) || isa<ConstantStruct>(V) ||
+ isa<ConstantVector>(V)) {
+ Code = bitc::CST_CODE_AGGREGATE;
+ for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
+ Record.push_back(VE.getValueID(C->getOperand(i)));
+ AbbrevToUse = AggregateAbbrev;
+ } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ switch (CE->getOpcode()) {
+ default:
+ if (Instruction::isCast(CE->getOpcode())) {
+ Code = bitc::CST_CODE_CE_CAST;
+ Record.push_back(GetEncodedCastOpcode(CE->getOpcode()));
+ Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
+ Record.push_back(VE.getValueID(C->getOperand(0)));
+ AbbrevToUse = CONSTANTS_CE_CAST_Abbrev;
+ } else {
+ assert(CE->getNumOperands() == 2 && "Unknown constant expr!");
+ Code = bitc::CST_CODE_CE_BINOP;
+ Record.push_back(GetEncodedBinaryOpcode(CE->getOpcode()));
+ Record.push_back(VE.getValueID(C->getOperand(0)));
+ Record.push_back(VE.getValueID(C->getOperand(1)));
+ }
+ break;
+ case Instruction::GetElementPtr:
+ Code = bitc::CST_CODE_CE_GEP;
+ for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) {
+ Record.push_back(VE.getTypeID(C->getOperand(i)->getType()));
+ Record.push_back(VE.getValueID(C->getOperand(i)));
+ }
+ break;
+ case Instruction::Select:
+ Code = bitc::CST_CODE_CE_SELECT;
+ Record.push_back(VE.getValueID(C->getOperand(0)));
+ Record.push_back(VE.getValueID(C->getOperand(1)));
+ Record.push_back(VE.getValueID(C->getOperand(2)));
+ break;
+ case Instruction::ExtractElement:
+ Code = bitc::CST_CODE_CE_EXTRACTELT;
+ Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
+ Record.push_back(VE.getValueID(C->getOperand(0)));
+ Record.push_back(VE.getValueID(C->getOperand(1)));
+ break;
+ case Instruction::InsertElement:
+ Code = bitc::CST_CODE_CE_INSERTELT;
+ Record.push_back(VE.getValueID(C->getOperand(0)));
+ Record.push_back(VE.getValueID(C->getOperand(1)));
+ Record.push_back(VE.getValueID(C->getOperand(2)));
+ break;
+ case Instruction::ShuffleVector:
+ // If the return type and argument types are the same, this is a
+ // standard shufflevector instruction. If the types are different,
+ // then the shuffle is widening or truncating the input vectors, and
+ // the argument type must also be encoded.
+ if (C->getType() == C->getOperand(0)->getType()) {
+ Code = bitc::CST_CODE_CE_SHUFFLEVEC;
+ } else {
+ Code = bitc::CST_CODE_CE_SHUFVEC_EX;
+ Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
+ }
+ Record.push_back(VE.getValueID(C->getOperand(0)));
+ Record.push_back(VE.getValueID(C->getOperand(1)));
+ Record.push_back(VE.getValueID(C->getOperand(2)));
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::VICmp:
+ case Instruction::VFCmp:
+ if (isa<VectorType>(C->getOperand(0)->getType())
+ && (CE->getOpcode() == Instruction::ICmp
+ || CE->getOpcode() == Instruction::FCmp)) {
+ // compare returning vector of Int1Ty
+ assert(0 && "Unsupported constant!");
+ } else {
+ Code = bitc::CST_CODE_CE_CMP;
+ }
+ Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
+ Record.push_back(VE.getValueID(C->getOperand(0)));
+ Record.push_back(VE.getValueID(C->getOperand(1)));
+ Record.push_back(CE->getPredicate());
+ break;
+ }
+ } else if (const MDString *S = dyn_cast<MDString>(C)) {
+ Code = bitc::CST_CODE_MDSTRING;
+ AbbrevToUse = MDString6Abbrev;
+ for (unsigned i = 0, e = S->size(); i != e; ++i) {
+ char V = S->begin()[i];
+ Record.push_back(V);
+
+ if (!BitCodeAbbrevOp::isChar6(V))
+ AbbrevToUse = MDString8Abbrev;
+ }
+ } else if (const MDNode *N = dyn_cast<MDNode>(C)) {
+ Code = bitc::CST_CODE_MDNODE;
+ for (unsigned i = 0, e = N->getNumElements(); i != e; ++i) {
+ if (N->getElement(i)) {
+ Record.push_back(VE.getTypeID(N->getElement(i)->getType()));
+ Record.push_back(VE.getValueID(N->getElement(i)));
+ } else {
+ Record.push_back(VE.getTypeID(Type::VoidTy));
+ Record.push_back(0);
+ }
+ }
+ } else {
+ assert(0 && "Unknown constant!");
+ }
+ Stream.EmitRecord(Code, Record, AbbrevToUse);
+ Record.clear();
+ }
+
+ Stream.ExitBlock();
+}
+
+static void WriteModuleConstants(const ValueEnumerator &VE,
+ BitstreamWriter &Stream) {
+ const ValueEnumerator::ValueList &Vals = VE.getValues();
+
+ // Find the first constant to emit, which is the first non-globalvalue value.
+ // We know globalvalues have been emitted by WriteModuleInfo.
+ for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+ if (!isa<GlobalValue>(Vals[i].first)) {
+ WriteConstants(i, Vals.size(), VE, Stream, true);
+ return;
+ }
+ }
+}
+
+/// PushValueAndType - The file has to encode both the value and type id for
+/// many values, because we need to know what type to create for forward
+/// references. However, most operands are not forward references, so this type
+/// field is not needed.
+///
+/// This function adds V's value ID to Vals. If the value ID is greater than
+/// or equal to the instruction ID, then it is a forward reference, and the
+/// type ID is included as well.
+static bool PushValueAndType(const Value *V, unsigned InstID,
+ SmallVector<unsigned, 64> &Vals,
+ ValueEnumerator &VE) {
+ unsigned ValID = VE.getValueID(V);
+ Vals.push_back(ValID);
+ if (ValID >= InstID) {
+ Vals.push_back(VE.getTypeID(V->getType()));
+ return true;
+ }
+ return false;
+}
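+// Example: while emitting instruction #10, an operand with value ID 12 is not
+// yet defined, so its type ID is pushed as well; an operand with ID 7 is a
+// backward reference and needs no type.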
+
+/// WriteInstruction - Emit an instruction to the specified stream.
+static void WriteInstruction(const Instruction &I, unsigned InstID,
+ ValueEnumerator &VE, BitstreamWriter &Stream,
+ SmallVector<unsigned, 64> &Vals) {
+ unsigned Code = 0;
+ unsigned AbbrevToUse = 0;
+ switch (I.getOpcode()) {
+ default:
+ if (Instruction::isCast(I.getOpcode())) {
+ Code = bitc::FUNC_CODE_INST_CAST;
+ if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+ AbbrevToUse = FUNCTION_INST_CAST_ABBREV;
+ Vals.push_back(VE.getTypeID(I.getType()));
+ Vals.push_back(GetEncodedCastOpcode(I.getOpcode()));
+ } else {
+ assert(isa<BinaryOperator>(I) && "Unknown instruction!");
+ Code = bitc::FUNC_CODE_INST_BINOP;
+ if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+ AbbrevToUse = FUNCTION_INST_BINOP_ABBREV;
+ Vals.push_back(VE.getValueID(I.getOperand(1)));
+ Vals.push_back(GetEncodedBinaryOpcode(I.getOpcode()));
+ }
+ break;
+
+ case Instruction::GetElementPtr:
+ Code = bitc::FUNC_CODE_INST_GEP;
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+ PushValueAndType(I.getOperand(i), InstID, Vals, VE);
+ break;
+ case Instruction::ExtractValue: {
+ Code = bitc::FUNC_CODE_INST_EXTRACTVAL;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ const ExtractValueInst *EVI = cast<ExtractValueInst>(&I);
+ for (const unsigned *i = EVI->idx_begin(), *e = EVI->idx_end(); i != e; ++i)
+ Vals.push_back(*i);
+ break;
+ }
+ case Instruction::InsertValue: {
+ Code = bitc::FUNC_CODE_INST_INSERTVAL;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ PushValueAndType(I.getOperand(1), InstID, Vals, VE);
+ const InsertValueInst *IVI = cast<InsertValueInst>(&I);
+ for (const unsigned *i = IVI->idx_begin(), *e = IVI->idx_end(); i != e; ++i)
+ Vals.push_back(*i);
+ break;
+ }
+ case Instruction::Select:
+ Code = bitc::FUNC_CODE_INST_VSELECT;
+ PushValueAndType(I.getOperand(1), InstID, Vals, VE);
+ Vals.push_back(VE.getValueID(I.getOperand(2)));
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ break;
+ case Instruction::ExtractElement:
+ Code = bitc::FUNC_CODE_INST_EXTRACTELT;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ Vals.push_back(VE.getValueID(I.getOperand(1)));
+ break;
+ case Instruction::InsertElement:
+ Code = bitc::FUNC_CODE_INST_INSERTELT;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ Vals.push_back(VE.getValueID(I.getOperand(1)));
+ Vals.push_back(VE.getValueID(I.getOperand(2)));
+ break;
+ case Instruction::ShuffleVector:
+ Code = bitc::FUNC_CODE_INST_SHUFFLEVEC;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ Vals.push_back(VE.getValueID(I.getOperand(1)));
+ Vals.push_back(VE.getValueID(I.getOperand(2)));
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::VICmp:
+ case Instruction::VFCmp:
+ if (I.getOpcode() == Instruction::ICmp
+ || I.getOpcode() == Instruction::FCmp) {
+ // compare returning Int1Ty or vector of Int1Ty
+ Code = bitc::FUNC_CODE_INST_CMP2;
+ } else {
+ Code = bitc::FUNC_CODE_INST_CMP;
+ }
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ Vals.push_back(VE.getValueID(I.getOperand(1)));
+ Vals.push_back(cast<CmpInst>(I).getPredicate());
+ break;
+
+ case Instruction::Ret:
+ {
+ Code = bitc::FUNC_CODE_INST_RET;
+ unsigned NumOperands = I.getNumOperands();
+ if (NumOperands == 0)
+ AbbrevToUse = FUNCTION_INST_RET_VOID_ABBREV;
+ else if (NumOperands == 1) {
+ if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+ AbbrevToUse = FUNCTION_INST_RET_VAL_ABBREV;
+ } else {
+ for (unsigned i = 0, e = NumOperands; i != e; ++i)
+ PushValueAndType(I.getOperand(i), InstID, Vals, VE);
+ }
+ }
+ break;
+ case Instruction::Br:
+ {
+ Code = bitc::FUNC_CODE_INST_BR;
+ BranchInst &II(cast<BranchInst>(I));
+ Vals.push_back(VE.getValueID(II.getSuccessor(0)));
+ if (II.isConditional()) {
+ Vals.push_back(VE.getValueID(II.getSuccessor(1)));
+ Vals.push_back(VE.getValueID(II.getCondition()));
+ }
+ }
+ break;
+ case Instruction::Switch:
+ Code = bitc::FUNC_CODE_INST_SWITCH;
+ Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+ Vals.push_back(VE.getValueID(I.getOperand(i)));
+ break;
+ case Instruction::Invoke: {
+ const InvokeInst *II = cast<InvokeInst>(&I);
+ const Value *Callee(II->getCalledValue());
+ const PointerType *PTy = cast<PointerType>(Callee->getType());
+ const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ Code = bitc::FUNC_CODE_INST_INVOKE;
+
+ Vals.push_back(VE.getAttributeID(II->getAttributes()));
+ Vals.push_back(II->getCallingConv());
+ Vals.push_back(VE.getValueID(II->getNormalDest()));
+ Vals.push_back(VE.getValueID(II->getUnwindDest()));
+ PushValueAndType(Callee, InstID, Vals, VE);
+
+ // Emit value #'s for the fixed parameters.
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+ Vals.push_back(VE.getValueID(I.getOperand(i+3))); // fixed param.
+
+ // Emit type/value pairs for varargs params.
+ if (FTy->isVarArg()) {
+ for (unsigned i = 3+FTy->getNumParams(), e = I.getNumOperands();
+ i != e; ++i)
+ PushValueAndType(I.getOperand(i), InstID, Vals, VE); // vararg
+ }
+ break;
+ }
+ case Instruction::Unwind:
+ Code = bitc::FUNC_CODE_INST_UNWIND;
+ break;
+ case Instruction::Unreachable:
+ Code = bitc::FUNC_CODE_INST_UNREACHABLE;
+ AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV;
+ break;
+
+ case Instruction::PHI:
+ Code = bitc::FUNC_CODE_INST_PHI;
+ Vals.push_back(VE.getTypeID(I.getType()));
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+ Vals.push_back(VE.getValueID(I.getOperand(i)));
+ break;
+
+ case Instruction::Malloc:
+ Code = bitc::FUNC_CODE_INST_MALLOC;
+ Vals.push_back(VE.getTypeID(I.getType()));
+ Vals.push_back(VE.getValueID(I.getOperand(0))); // size.
+ Vals.push_back(Log2_32(cast<MallocInst>(I).getAlignment())+1);
+ break;
+
+ case Instruction::Free:
+ Code = bitc::FUNC_CODE_INST_FREE;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ break;
+
+ case Instruction::Alloca:
+ Code = bitc::FUNC_CODE_INST_ALLOCA;
+ Vals.push_back(VE.getTypeID(I.getType()));
+ Vals.push_back(VE.getValueID(I.getOperand(0))); // size.
+ Vals.push_back(Log2_32(cast<AllocaInst>(I).getAlignment())+1);
+ break;
+
+ case Instruction::Load:
+ Code = bitc::FUNC_CODE_INST_LOAD;
+ if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE)) // ptr
+ AbbrevToUse = FUNCTION_INST_LOAD_ABBREV;
+
+ Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment())+1);
+ Vals.push_back(cast<LoadInst>(I).isVolatile());
+ break;
+ case Instruction::Store:
+ Code = bitc::FUNC_CODE_INST_STORE2;
+ PushValueAndType(I.getOperand(1), InstID, Vals, VE); // ptrty + ptr
+ Vals.push_back(VE.getValueID(I.getOperand(0))); // val.
+ Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
+ Vals.push_back(cast<StoreInst>(I).isVolatile());
+ break;
+ case Instruction::Call: {
+ const PointerType *PTy = cast<PointerType>(I.getOperand(0)->getType());
+ const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+
+ Code = bitc::FUNC_CODE_INST_CALL;
+
+ const CallInst *CI = cast<CallInst>(&I);
+ Vals.push_back(VE.getAttributeID(CI->getAttributes()));
+ Vals.push_back((CI->getCallingConv() << 1) | unsigned(CI->isTailCall()));
+ PushValueAndType(CI->getOperand(0), InstID, Vals, VE); // Callee
+
+ // Emit value #'s for the fixed parameters.
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+ Vals.push_back(VE.getValueID(I.getOperand(i+1))); // fixed param.
+
+ // Emit type/value pairs for varargs params.
+ if (FTy->isVarArg()) {
+ unsigned NumVarargs = I.getNumOperands()-1-FTy->getNumParams();
+ for (unsigned i = I.getNumOperands()-NumVarargs, e = I.getNumOperands();
+ i != e; ++i)
+ PushValueAndType(I.getOperand(i), InstID, Vals, VE); // varargs
+ }
+ break;
+ }
+ case Instruction::VAArg:
+ Code = bitc::FUNC_CODE_INST_VAARG;
+ Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); // valistty
+ Vals.push_back(VE.getValueID(I.getOperand(0))); // valist.
+ Vals.push_back(VE.getTypeID(I.getType())); // restype.
+ break;
+ }
+
+ Stream.EmitRecord(Code, Vals, AbbrevToUse);
+ Vals.clear();
+}
+
+// Emit names for globals/functions etc.
+static void WriteValueSymbolTable(const ValueSymbolTable &VST,
+ const ValueEnumerator &VE,
+ BitstreamWriter &Stream) {
+ if (VST.empty()) return;
+ Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4);
+
+ // FIXME: Set up the abbrev, we know how many values there are!
+ // FIXME: We know if the type names can use 7-bit ascii.
+ SmallVector<unsigned, 64> NameVals;
+
+ for (ValueSymbolTable::const_iterator SI = VST.begin(), SE = VST.end();
+ SI != SE; ++SI) {
+
+ const ValueName &Name = *SI;
+
+ // Figure out the encoding to use for the name.
+ bool is7Bit = true;
+ bool isChar6 = true;
+ for (const char *C = Name.getKeyData(), *E = C+Name.getKeyLength();
+ C != E; ++C) {
+ if (isChar6)
+ isChar6 = BitCodeAbbrevOp::isChar6(*C);
+ if ((unsigned char)*C & 128) {
+ is7Bit = false;
+ break; // don't bother scanning the rest.
+ }
+ }
+
+ unsigned AbbrevToUse = VST_ENTRY_8_ABBREV;
+
+ // VST_ENTRY: [valueid, namechar x N]
+ // VST_BBENTRY: [bbid, namechar x N]
+ unsigned Code;
+ if (isa<BasicBlock>(SI->getValue())) {
+ Code = bitc::VST_CODE_BBENTRY;
+ if (isChar6)
+ AbbrevToUse = VST_BBENTRY_6_ABBREV;
+ } else {
+ Code = bitc::VST_CODE_ENTRY;
+ if (isChar6)
+ AbbrevToUse = VST_ENTRY_6_ABBREV;
+ else if (is7Bit)
+ AbbrevToUse = VST_ENTRY_7_ABBREV;
+ }
+
+ NameVals.push_back(VE.getValueID(SI->getValue()));
+ for (const char *P = Name.getKeyData(),
+ *E = Name.getKeyData()+Name.getKeyLength(); P != E; ++P)
+ NameVals.push_back((unsigned char)*P);
+
+ // Emit the finished record.
+ Stream.EmitRecord(Code, NameVals, AbbrevToUse);
+ NameVals.clear();
+ }
+ Stream.ExitBlock();
+}
+
+/// WriteFunction - Emit a function body to the module stream.
+static void WriteFunction(const Function &F, ValueEnumerator &VE,
+ BitstreamWriter &Stream) {
+ Stream.EnterSubblock(bitc::FUNCTION_BLOCK_ID, 4);
+ VE.incorporateFunction(F);
+
+ SmallVector<unsigned, 64> Vals;
+
+ // Emit the number of basic blocks, so the reader can create them ahead of
+ // time.
+ Vals.push_back(VE.getBasicBlocks().size());
+ Stream.EmitRecord(bitc::FUNC_CODE_DECLAREBLOCKS, Vals);
+ Vals.clear();
+
+ // If there are function-local constants, emit them now.
+ unsigned CstStart, CstEnd;
+ VE.getFunctionConstantRange(CstStart, CstEnd);
+ WriteConstants(CstStart, CstEnd, VE, Stream, false);
+
+  // Keep a running counter of the current instruction ID.
+ unsigned InstID = CstEnd;
+
+ // Finally, emit all the instructions, in order.
+ for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+ I != E; ++I) {
+ WriteInstruction(*I, InstID, VE, Stream, Vals);
+ if (I->getType() != Type::VoidTy)
+ ++InstID;
+ }
+
+ // Emit names for all the instructions etc.
+ WriteValueSymbolTable(F.getValueSymbolTable(), VE, Stream);
+
+ VE.purgeFunction();
+ Stream.ExitBlock();
+}
+
+/// WriteTypeSymbolTable - Emit a block for the specified type symtab.
+static void WriteTypeSymbolTable(const TypeSymbolTable &TST,
+ const ValueEnumerator &VE,
+ BitstreamWriter &Stream) {
+ if (TST.empty()) return;
+
+ Stream.EnterSubblock(bitc::TYPE_SYMTAB_BLOCK_ID, 3);
+
+ // 7-bit fixed width VST_CODE_ENTRY strings.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(VE.getTypes().size()+1)));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
+ unsigned V7Abbrev = Stream.EmitAbbrev(Abbv);
+
+ SmallVector<unsigned, 64> NameVals;
+
+ for (TypeSymbolTable::const_iterator TI = TST.begin(), TE = TST.end();
+ TI != TE; ++TI) {
+ // TST_ENTRY: [typeid, namechar x N]
+ NameVals.push_back(VE.getTypeID(TI->second));
+
+ const std::string &Str = TI->first;
+ bool is7Bit = true;
+ for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+ NameVals.push_back((unsigned char)Str[i]);
+ if (Str[i] & 128)
+ is7Bit = false;
+ }
+
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::VST_CODE_ENTRY, NameVals, is7Bit ? V7Abbrev : 0);
+ NameVals.clear();
+ }
+
+ Stream.ExitBlock();
+}
+
+// Emit blockinfo, which defines the standard abbreviations etc.
+static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
+ // We only want to emit block info records for blocks that have multiple
+ // instances: CONSTANTS_BLOCK, FUNCTION_BLOCK and VALUE_SYMTAB_BLOCK. Other
+  // blocks can define their abbrevs inline.
+ Stream.EnterBlockInfoBlock(2);
+
+ { // 8-bit fixed-width VST_ENTRY/VST_BBENTRY strings.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+ if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+ Abbv) != VST_ENTRY_8_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+
+ { // 7-bit fixed width VST_ENTRY strings.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
+ if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+ Abbv) != VST_ENTRY_7_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+ { // 6-bit char6 VST_ENTRY strings.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+ if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+ Abbv) != VST_ENTRY_6_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+ { // 6-bit char6 VST_BBENTRY strings.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_BBENTRY));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+ if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+ Abbv) != VST_BBENTRY_6_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+
+
+
+ { // SETTYPE abbrev for CONSTANTS_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ Log2_32_Ceil(VE.getTypes().size()+1)));
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+ Abbv) != CONSTANTS_SETTYPE_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+
+ { // INTEGER abbrev for CONSTANTS_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_INTEGER));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+ Abbv) != CONSTANTS_INTEGER_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+
+ { // CE_CAST abbrev for CONSTANTS_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid
+ Log2_32_Ceil(VE.getTypes().size()+1)));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
+
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+ Abbv) != CONSTANTS_CE_CAST_Abbrev)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+ { // NULL abbrev for CONSTANTS_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_NULL));
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+ Abbv) != CONSTANTS_NULL_Abbrev)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+
+ // FIXME: This should only use space for first class types!
+
+ { // INST_LOAD abbrev for FUNCTION_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+ Abbv) != FUNCTION_INST_LOAD_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+ { // INST_BINOP abbrev for FUNCTION_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+ Abbv) != FUNCTION_INST_BINOP_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+ { // INST_CAST abbrev for FUNCTION_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty
+ Log2_32_Ceil(VE.getTypes().size()+1)));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+ Abbv) != FUNCTION_INST_CAST_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+
+ { // INST_RET abbrev for FUNCTION_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET));
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+ Abbv) != FUNCTION_INST_RET_VOID_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+ { // INST_RET abbrev for FUNCTION_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ValID
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+ Abbv) != FUNCTION_INST_RET_VAL_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+ { // INST_UNREACHABLE abbrev for FUNCTION_BLOCK.
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNREACHABLE));
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+ Abbv) != FUNCTION_INST_UNREACHABLE_ABBREV)
+ assert(0 && "Unexpected abbrev ordering!");
+ }
+
+ Stream.ExitBlock();
+}
+
+
+/// WriteModule - Emit the specified module to the bitstream.
+static void WriteModule(const Module *M, BitstreamWriter &Stream) {
+ Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
+
+ // Emit the version number if it is non-zero.
+ if (CurVersion) {
+ SmallVector<unsigned, 1> Vals;
+ Vals.push_back(CurVersion);
+ Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
+ }
+
+ // Analyze the module, enumerating globals, functions, etc.
+ ValueEnumerator VE(M);
+
+ // Emit blockinfo, which defines the standard abbreviations etc.
+ WriteBlockInfo(VE, Stream);
+
+ // Emit information about parameter attributes.
+ WriteAttributeTable(VE, Stream);
+
+ // Emit information describing all of the types in the module.
+ WriteTypeTable(VE, Stream);
+
+ // Emit top-level description of module, including target triple, inline asm,
+ // descriptors for global variables, and function prototype info.
+ WriteModuleInfo(M, VE, Stream);
+
+ // Emit constants.
+ WriteModuleConstants(VE, Stream);
+
+ // If we have any aggregate values in the value table, purge them - these can
+ // only be used to initialize global variables. Doing so makes the value
+ // namespace smaller for code in functions.
+ int NumNonAggregates = VE.PurgeAggregateValues();
+ if (NumNonAggregates != -1) {
+ SmallVector<unsigned, 1> Vals;
+ Vals.push_back(NumNonAggregates);
+ Stream.EmitRecord(bitc::MODULE_CODE_PURGEVALS, Vals);
+ }
+
+ // Emit function bodies.
+ for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
+ if (!I->isDeclaration())
+ WriteFunction(*I, VE, Stream);
+
+ // Emit the type symbol table information.
+ WriteTypeSymbolTable(M->getTypeSymbolTable(), VE, Stream);
+
+ // Emit names for globals/functions etc.
+ WriteValueSymbolTable(M->getValueSymbolTable(), VE, Stream);
+
+ Stream.ExitBlock();
+}
+
+/// EmitDarwinBCHeader - If generating a bc file on darwin, we have to emit a
+/// header and trailer to make it compatible with the system archiver. To do
+/// this we emit the following header, and then emit a trailer that pads the
+/// file out to be a multiple of 16 bytes.
+///
+/// struct bc_header {
+/// uint32_t Magic; // 0x0B17C0DE
+/// uint32_t Version; // Version, currently always 0.
+/// uint32_t BitcodeOffset; // Offset to traditional bitcode file.
+/// uint32_t BitcodeSize; // Size of traditional bitcode file.
+/// uint32_t CPUType; // CPU specifier.
+/// ... potentially more later ...
+/// };
+enum {
+ DarwinBCSizeFieldOffset = 3*4, // Offset to bitcode_size.
+ DarwinBCHeaderSize = 5*4
+};
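+// (The size field is the fourth 32-bit word of the header, hence the 3*4
+// byte offset; the five words above give the 20-byte header size.)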
+
+static void EmitDarwinBCHeader(BitstreamWriter &Stream,
+ const std::string &TT) {
+ unsigned CPUType = ~0U;
+
+ // Match x86_64-*, i[3-9]86-*, powerpc-*, powerpc64-*. The CPUType is a
+ // magic number from /usr/include/mach/machine.h. It is ok to reproduce the
+ // specific constants here because they are implicitly part of the Darwin ABI.
+ enum {
+ DARWIN_CPU_ARCH_ABI64 = 0x01000000,
+ DARWIN_CPU_TYPE_X86 = 7,
+ DARWIN_CPU_TYPE_POWERPC = 18
+ };
+
+ if (TT.find("x86_64-") == 0)
+ CPUType = DARWIN_CPU_TYPE_X86 | DARWIN_CPU_ARCH_ABI64;
+  else if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
+           TT[4] == '-' && TT[1] >= '3' && TT[1] <= '9')
+ CPUType = DARWIN_CPU_TYPE_X86;
+ else if (TT.find("powerpc-") == 0)
+ CPUType = DARWIN_CPU_TYPE_POWERPC;
+ else if (TT.find("powerpc64-") == 0)
+ CPUType = DARWIN_CPU_TYPE_POWERPC | DARWIN_CPU_ARCH_ABI64;
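+
+  // For example, "x86_64-apple-darwin9" yields
+  // DARWIN_CPU_TYPE_X86 | DARWIN_CPU_ARCH_ABI64 = 0x01000007; an unmatched
+  // triple leaves CPUType as ~0U.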
+
+ // Traditional Bitcode starts after header.
+ unsigned BCOffset = DarwinBCHeaderSize;
+
+ Stream.Emit(0x0B17C0DE, 32);
+ Stream.Emit(0 , 32); // Version.
+ Stream.Emit(BCOffset , 32);
+ Stream.Emit(0 , 32); // Filled in later.
+ Stream.Emit(CPUType , 32);
+}
+
+/// EmitDarwinBCTrailer - Emit the darwin epilog after the bitcode file and
+/// finalize the header.
+static void EmitDarwinBCTrailer(BitstreamWriter &Stream, unsigned BufferSize) {
+ // Update the size field in the header.
+ Stream.BackpatchWord(DarwinBCSizeFieldOffset, BufferSize-DarwinBCHeaderSize);
+
+ // If the file is not a multiple of 16 bytes, insert dummy padding.
+ while (BufferSize & 15) {
+ Stream.Emit(0, 8);
+ ++BufferSize;
+ }
+}
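+// For example, a 43-byte buffer gets its size field backpatched to 23
+// (43 minus the 20-byte header) and is then padded with five zero bytes to
+// reach 48, the next multiple of 16.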
+
+
+/// WriteBitcodeToFile - Write the specified module to the specified output
+/// stream.
+void llvm::WriteBitcodeToFile(const Module *M, std::ostream &Out) {
+ raw_os_ostream RawOut(Out);
+ // If writing to stdout, set binary mode.
+ if (llvm::cout == Out)
+ sys::Program::ChangeStdoutToBinary();
+ WriteBitcodeToFile(M, RawOut);
+}
+
+/// WriteBitcodeToFile - Write the specified module to the specified output
+/// stream.
+void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out) {
+ std::vector<unsigned char> Buffer;
+ BitstreamWriter Stream(Buffer);
+
+ Buffer.reserve(256*1024);
+
+  WriteBitcodeToStream(M, Stream);
+
+ // If writing to stdout, set binary mode.
+ if (&llvm::outs() == &Out)
+ sys::Program::ChangeStdoutToBinary();
+
+ // Write the generated bitstream to "Out".
+ Out.write((char*)&Buffer.front(), Buffer.size());
+
+ // Make sure it hits disk now.
+ Out.flush();
+}
+
+/// WriteBitcodeToStream - Write the specified module to the specified output
+/// stream.
+void llvm::WriteBitcodeToStream(const Module *M, BitstreamWriter &Stream) {
+ // If this is darwin, emit a file header and trailer if needed.
+ bool isDarwin = M->getTargetTriple().find("-darwin") != std::string::npos;
+ if (isDarwin)
+ EmitDarwinBCHeader(Stream, M->getTargetTriple());
+
+ // Emit the file header.
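+  // The two chars and four nibbles below pack (low nibble first) into the
+  // 4-byte bitcode magic "BC\xC0\xDE".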
+ Stream.Emit((unsigned)'B', 8);
+ Stream.Emit((unsigned)'C', 8);
+ Stream.Emit(0x0, 4);
+ Stream.Emit(0xC, 4);
+ Stream.Emit(0xE, 4);
+ Stream.Emit(0xD, 4);
+
+ // Emit the module.
+ WriteModule(M, Stream);
+
+ if (isDarwin)
+ EmitDarwinBCTrailer(Stream, Stream.getBuffer().size());
+}
diff --git a/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/lib/Bitcode/Writer/BitcodeWriterPass.cpp
new file mode 100644
index 0000000..209cf09
--- /dev/null
+++ b/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -0,0 +1,56 @@
+//===--- Bitcode/Writer/BitcodeWriterPass.cpp - Bitcode Writer ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// BitcodeWriterPass implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+ class WriteBitcodePass : public ModulePass {
+ // FIXME: Kill off std::ostream
+ std::ostream *Out;
+ raw_ostream *RawOut; // raw_ostream to print on
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit WriteBitcodePass(std::ostream &o)
+ : ModulePass(&ID), Out(&o), RawOut(0) {}
+ explicit WriteBitcodePass(raw_ostream &o)
+ : ModulePass(&ID), Out(0), RawOut(&o) {}
+
+ const char *getPassName() const { return "Bitcode Writer"; }
+
+ bool runOnModule(Module &M) {
+ if (Out) {
+ WriteBitcodeToFile(&M, *Out);
+ } else {
+ WriteBitcodeToFile(&M, *RawOut);
+ }
+ return false;
+ }
+ };
+}
+
+char WriteBitcodePass::ID = 0;
+
+/// CreateBitcodeWriterPass - Create and return a pass that writes the module
+/// to the specified ostream.
+ModulePass *llvm::CreateBitcodeWriterPass(std::ostream &Str) {
+ return new WriteBitcodePass(Str);
+}
+
+
+/// createBitcodeWriterPass - Create and return a pass that writes the module
+/// to the specified ostream.
+ModulePass *llvm::createBitcodeWriterPass(raw_ostream &Str) {
+ return new WriteBitcodePass(Str);
+}
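+
+// Typical usage is to schedule the pass on a PassManager (a sketch; PM, OS,
+// and M are assumed to exist):
+//
+//   PassManager PM;
+//   PM.add(createBitcodeWriterPass(OS)); // OS is a raw_ostream
+//   PM.run(M);                           // serializes module M to OS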
diff --git a/lib/Bitcode/Writer/CMakeLists.txt b/lib/Bitcode/Writer/CMakeLists.txt
new file mode 100644
index 0000000..ac5bb99
--- /dev/null
+++ b/lib/Bitcode/Writer/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_llvm_library(LLVMBitWriter
+ BitWriter.cpp
+ BitcodeWriter.cpp
+ BitcodeWriterPass.cpp
+ Serialize.cpp
+ SerializeAPFloat.cpp
+ SerializeAPInt.cpp
+ ValueEnumerator.cpp
+ )
diff --git a/lib/Bitcode/Writer/Makefile b/lib/Bitcode/Writer/Makefile
new file mode 100644
index 0000000..7b0bd72
--- /dev/null
+++ b/lib/Bitcode/Writer/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Bitcode/Writer/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMBitWriter
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Bitcode/Writer/Serialize.cpp b/lib/Bitcode/Writer/Serialize.cpp
new file mode 100644
index 0000000..79464a6
--- /dev/null
+++ b/lib/Bitcode/Writer/Serialize.cpp
@@ -0,0 +1,118 @@
+//==- Serialize.cpp - Generic Object Serialization to Bitcode ----*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the internal methods used for object serialization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/Serialize.h"
+#include "string.h"
+
+#ifdef DEBUG_BACKPATCH
+#include "llvm/Support/Streams.h"
+#endif
+
+using namespace llvm;
+
+Serializer::Serializer(BitstreamWriter& stream)
+ : Stream(stream), BlockLevel(0) {}
+
+Serializer::~Serializer() {
+ if (inRecord())
+ EmitRecord();
+
+ while (BlockLevel > 0)
+ Stream.ExitBlock();
+
+ Stream.FlushToWord();
+}
+
+void Serializer::EmitRecord() {
+ assert(Record.size() > 0 && "Cannot emit empty record.");
+  Stream.EmitRecord(8, Record);
+ Record.clear();
+}
+
+void Serializer::EnterBlock(unsigned BlockID, unsigned CodeLen) {
+  FlushRecord();
+  Stream.EnterSubblock(BlockID, CodeLen);
+ ++BlockLevel;
+}
+
+void Serializer::ExitBlock() {
+ assert (BlockLevel > 0);
+ --BlockLevel;
+ FlushRecord();
+ Stream.ExitBlock();
+}
+
+void Serializer::EmitInt(uint64_t X) {
+ assert (BlockLevel > 0);
+ Record.push_back(X);
+}
+
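+// EmitSInt folds the sign into the low bit: for example, X = 3 is recorded
+// as 6 and X = -3 as 7, so small magnitudes stay small in the encoded record.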
+void Serializer::EmitSInt(int64_t X) {
+ if (X >= 0)
+ EmitInt(X << 1);
+ else
+ EmitInt((-X << 1) | 1);
+}
+
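+// Strings are recorded length-prefixed: the character count first, then one
+// record element per character.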
+void Serializer::EmitCStr(const char* s, const char* end) {
+ Record.push_back(end - s);
+
+ while(s != end) {
+ Record.push_back(*s);
+ ++s;
+ }
+}
+
+void Serializer::EmitCStr(const char* s) {
+  EmitCStr(s, s + strlen(s));
+}
+
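+// Pointer IDs are handed out densely starting at 1; ID 0 is reserved for
+// null pointers.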
+SerializedPtrID Serializer::getPtrId(const void* ptr) {
+ if (!ptr)
+ return 0;
+
+ MapTy::iterator I = PtrMap.find(ptr);
+
+ if (I == PtrMap.end()) {
+ unsigned id = PtrMap.size()+1;
+#ifdef DEBUG_BACKPATCH
+ llvm::cerr << "Registered PTR: " << ptr << " => " << id << "\n";
+#endif
+ PtrMap[ptr] = id;
+ return id;
+ }
+ else return I->second;
+}
+
+bool Serializer::isRegistered(const void* ptr) const {
+ MapTy::const_iterator I = PtrMap.find(ptr);
+ return I != PtrMap.end();
+}
+
+
+#define INT_EMIT(TYPE)\
+void SerializeTrait<TYPE>::Emit(Serializer&S, TYPE X) { S.EmitInt(X); }
+
+INT_EMIT(bool)
+INT_EMIT(unsigned char)
+INT_EMIT(unsigned short)
+INT_EMIT(unsigned int)
+INT_EMIT(unsigned long)
+
+#define SINT_EMIT(TYPE)\
+void SerializeTrait<TYPE>::Emit(Serializer&S, TYPE X) { S.EmitSInt(X); }
+
+SINT_EMIT(signed char)
+SINT_EMIT(signed short)
+SINT_EMIT(signed int)
+SINT_EMIT(signed long)
diff --git a/lib/Bitcode/Writer/SerializeAPFloat.cpp b/lib/Bitcode/Writer/SerializeAPFloat.cpp
new file mode 100644
index 0000000..25d954f
--- /dev/null
+++ b/lib/Bitcode/Writer/SerializeAPFloat.cpp
@@ -0,0 +1,21 @@
+//===-- SerializeAPFloat.cpp - Serialization for APFloat -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements serialization of APFloat.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/Bitcode/Serialize.h"
+
+using namespace llvm;
+
+void APFloat::Emit(Serializer& S) const {
+ S.Emit(bitcastToAPInt());
+}
diff --git a/lib/Bitcode/Writer/SerializeAPInt.cpp b/lib/Bitcode/Writer/SerializeAPInt.cpp
new file mode 100644
index 0000000..47792c7
--- /dev/null
+++ b/lib/Bitcode/Writer/SerializeAPInt.cpp
@@ -0,0 +1,31 @@
+//===-- SerializeAPInt.cpp - Serialization for APInts ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements serialization of APInts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/Bitcode/Serialize.h"
+#include <cassert>
+
+using namespace llvm;
+
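+// The wire format is: the bit width, then either the single 64-bit word (for
+// widths of 64 bits or less) or a word count followed by that many words.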
+void APInt::Emit(Serializer& S) const {
+ S.EmitInt(BitWidth);
+
+ if (isSingleWord())
+ S.EmitInt(VAL);
+ else {
+ uint32_t NumWords = getNumWords();
+ S.EmitInt(NumWords);
+ for (unsigned i = 0; i < NumWords; ++i)
+ S.EmitInt(pVal[i]);
+ }
+}
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
new file mode 100644
index 0000000..8002a36
--- /dev/null
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -0,0 +1,347 @@
+//===-- ValueEnumerator.cpp - Number values and types for bitcode writer --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ValueEnumerator class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueEnumerator.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/MDNode.h"
+#include "llvm/Module.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/Instructions.h"
+#include <algorithm>
+using namespace llvm;
+
+static bool isSingleValueType(const std::pair<const llvm::Type*,
+ unsigned int> &P) {
+ return P.first->isSingleValueType();
+}
+
+static bool isIntegerValue(const std::pair<const Value*, unsigned> &V) {
+ return isa<IntegerType>(V.first->getType());
+}
+
+static bool CompareByFrequency(const std::pair<const llvm::Type*,
+ unsigned int> &P1,
+ const std::pair<const llvm::Type*,
+ unsigned int> &P2) {
+ return P1.second > P2.second;
+}
+
+/// ValueEnumerator - Enumerate module-level information.
+ValueEnumerator::ValueEnumerator(const Module *M) {
+ // Enumerate the global variables.
+ for (Module::const_global_iterator I = M->global_begin(),
+ E = M->global_end(); I != E; ++I)
+ EnumerateValue(I);
+
+ // Enumerate the functions.
+ for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+ EnumerateValue(I);
+ EnumerateAttributes(cast<Function>(I)->getAttributes());
+ }
+
+ // Enumerate the aliases.
+ for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E; ++I)
+ EnumerateValue(I);
+
+  // Remember the cutoff between global values and other constants.
+ unsigned FirstConstant = Values.size();
+
+ // Enumerate the global variable initializers.
+ for (Module::const_global_iterator I = M->global_begin(),
+ E = M->global_end(); I != E; ++I)
+ if (I->hasInitializer())
+ EnumerateValue(I->getInitializer());
+
+ // Enumerate the aliasees.
+ for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E; ++I)
+ EnumerateValue(I->getAliasee());
+
+ // Enumerate types used by the type symbol table.
+ EnumerateTypeSymbolTable(M->getTypeSymbolTable());
+
+ // Insert constants that are named at module level into the slot pool so that
+ // the module symbol table can refer to them...
+ EnumerateValueSymbolTable(M->getValueSymbolTable());
+
+ // Enumerate types used by function bodies and argument lists.
+ for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
+
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I)
+ EnumerateType(I->getType());
+
+ for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;++I){
+ for (User::const_op_iterator OI = I->op_begin(), E = I->op_end();
+ OI != E; ++OI)
+ EnumerateOperandType(*OI);
+ EnumerateType(I->getType());
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ EnumerateAttributes(CI->getAttributes());
+ else if (const InvokeInst *II = dyn_cast<InvokeInst>(I))
+ EnumerateAttributes(II->getAttributes());
+ }
+ }
+
+ // Optimize constant ordering.
+ OptimizeConstants(FirstConstant, Values.size());
+
+  // Sort the type table by frequency so that the most commonly used types get
+  // the smallest IDs (and thus encode in the fewest bits).
+ std::stable_sort(Types.begin(), Types.end(), CompareByFrequency);
+
+ // Partition the Type ID's so that the single-value types occur before the
+ // aggregate types. This allows the aggregate types to be dropped from the
+ // type table after parsing the global variable initializers.
+ std::partition(Types.begin(), Types.end(), isSingleValueType);
+
+ // Now that we rearranged the type table, rebuild TypeMap.
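+  // (IDs in TypeMap are biased by one so that zero means "not yet seen".)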
+ for (unsigned i = 0, e = Types.size(); i != e; ++i)
+ TypeMap[Types[i].first] = i+1;
+}
+
+// Optimize constant ordering.
+namespace {
+ struct CstSortPredicate {
+ ValueEnumerator &VE;
+ explicit CstSortPredicate(ValueEnumerator &ve) : VE(ve) {}
+ bool operator()(const std::pair<const Value*, unsigned> &LHS,
+ const std::pair<const Value*, unsigned> &RHS) {
+ // Sort by plane.
+ if (LHS.first->getType() != RHS.first->getType())
+ return VE.getTypeID(LHS.first->getType()) <
+ VE.getTypeID(RHS.first->getType());
+ // Then by frequency.
+ return LHS.second > RHS.second;
+ }
+ };
+}
+
+/// OptimizeConstants - Reorder constant pool for denser encoding.
+void ValueEnumerator::OptimizeConstants(unsigned CstStart, unsigned CstEnd) {
+ if (CstStart == CstEnd || CstStart+1 == CstEnd) return;
+
+ CstSortPredicate P(*this);
+ std::stable_sort(Values.begin()+CstStart, Values.begin()+CstEnd, P);
+
+ // Ensure that integer constants are at the start of the constant pool. This
+ // is important so that GEP structure indices come before gep constant exprs.
+ std::partition(Values.begin()+CstStart, Values.begin()+CstEnd,
+ isIntegerValue);
+
+ // Rebuild the modified portion of ValueMap.
+ for (; CstStart != CstEnd; ++CstStart)
+ ValueMap[Values[CstStart].first] = CstStart+1;
+}
+
+
+/// EnumerateTypeSymbolTable - Insert all of the types in the specified symbol
+/// table.
+void ValueEnumerator::EnumerateTypeSymbolTable(const TypeSymbolTable &TST) {
+ for (TypeSymbolTable::const_iterator TI = TST.begin(), TE = TST.end();
+ TI != TE; ++TI)
+ EnumerateType(TI->second);
+}
+
+/// EnumerateValueSymbolTable - Insert all of the values in the specified symbol
+/// table into the values table.
+void ValueEnumerator::EnumerateValueSymbolTable(const ValueSymbolTable &VST) {
+ for (ValueSymbolTable::const_iterator VI = VST.begin(), VE = VST.end();
+ VI != VE; ++VI)
+ EnumerateValue(VI->getValue());
+}
+
+void ValueEnumerator::EnumerateValue(const Value *V) {
+ assert(V->getType() != Type::VoidTy && "Can't insert void values!");
+
+ // Check to see if it's already in!
+ unsigned &ValueID = ValueMap[V];
+ if (ValueID) {
+ // Increment use count.
+ Values[ValueID-1].second++;
+ return;
+ }
+
+ // Enumerate the type of this value.
+ EnumerateType(V->getType());
+
+ if (const Constant *C = dyn_cast<Constant>(V)) {
+ if (isa<GlobalValue>(C)) {
+ // Initializers for globals are handled explicitly elsewhere.
+ } else if (isa<ConstantArray>(C) && cast<ConstantArray>(C)->isString()) {
+ // Do not enumerate the initializers for an array of simple characters.
+      // The initializers just pollute the value table, and we emit the strings
+ // specially.
+ } else if (C->getNumOperands()) {
+ // If a constant has operands, enumerate them. This makes sure that if a
+ // constant has uses (for example an array of const ints), that they are
+ // inserted also.
+
+ // We prefer to enumerate them with values before we enumerate the user
+ // itself. This makes it more likely that we can avoid forward references
+ // in the reader. We know that there can be no cycles in the constants
+ // graph that don't go through a global variable.
+ for (User::const_op_iterator I = C->op_begin(), E = C->op_end();
+ I != E; ++I)
+ EnumerateValue(*I);
+
+      // Finally, add the value. Doing this could invalidate the ValueID
+      // reference, so don't reuse it.
+ Values.push_back(std::make_pair(V, 1U));
+ ValueMap[V] = Values.size();
+ return;
+ } else if (const MDNode *N = dyn_cast<MDNode>(C)) {
+ for (MDNode::const_elem_iterator I = N->elem_begin(), E = N->elem_end();
+ I != E; ++I) {
+ if (*I)
+ EnumerateValue(*I);
+ else
+ EnumerateType(Type::VoidTy);
+ }
+
+ Values.push_back(std::make_pair(V, 1U));
+ ValueMap[V] = Values.size();
+ return;
+ }
+ }
+
+ // Add the value.
+ Values.push_back(std::make_pair(V, 1U));
+ ValueID = Values.size();
+}
+
+
+void ValueEnumerator::EnumerateType(const Type *Ty) {
+ unsigned &TypeID = TypeMap[Ty];
+
+ if (TypeID) {
+ // If we've already seen this type, just increase its occurrence count.
+ Types[TypeID-1].second++;
+ return;
+ }
+
+ // First time we saw this type, add it.
+ Types.push_back(std::make_pair(Ty, 1U));
+ TypeID = Types.size();
+
+ // Enumerate subtypes.
+ for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+ I != E; ++I)
+ EnumerateType(*I);
+}
+
+// Enumerate the types for the specified value. If the value is a constant,
+// walk through it, enumerating the types of the constant.
+void ValueEnumerator::EnumerateOperandType(const Value *V) {
+ EnumerateType(V->getType());
+ if (const Constant *C = dyn_cast<Constant>(V)) {
+ // If this constant is already enumerated, ignore it, we know its type must
+ // be enumerated.
+ if (ValueMap.count(V)) return;
+
+ // This constant may have operands, make sure to enumerate the types in
+ // them.
+ for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
+ EnumerateOperandType(C->getOperand(i));
+
+ if (const MDNode *N = dyn_cast<MDNode>(V)) {
+ for (unsigned i = 0, e = N->getNumElements(); i != e; ++i)
+ EnumerateOperandType(N->getElement(i));
+ }
+ }
+}
+
+void ValueEnumerator::EnumerateAttributes(const AttrListPtr &PAL) {
+ if (PAL.isEmpty()) return; // null is always 0.
+ // Do a lookup.
+ unsigned &Entry = AttributeMap[PAL.getRawPointer()];
+ if (Entry == 0) {
+ // Never saw this before, add it.
+ Attributes.push_back(PAL);
+ Entry = Attributes.size();
+ }
+}
+
+
+/// PurgeAggregateValues - If there are any aggregate values at the end of the
+/// value list, remove them and return the count of the remaining values. If
+/// there are none, return -1.
+int ValueEnumerator::PurgeAggregateValues() {
+ // If there are no aggregate values at the end of the list, return -1.
+ if (Values.empty() || Values.back().first->getType()->isSingleValueType())
+ return -1;
+
+ // Otherwise, remove aggregate values...
+  while (!Values.empty() &&
+         !Values.back().first->getType()->isSingleValueType())
+ Values.pop_back();
+
+ // ... and return the new size.
+ return Values.size();
+}
+
+void ValueEnumerator::incorporateFunction(const Function &F) {
+ NumModuleValues = Values.size();
+
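+  // Function-local IDs continue after the module-level values: arguments
+  // first, then function-local constants, then instructions that produce
+  // values.
+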
+  // Add the function arguments to the value table.
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
+ I != E; ++I)
+ EnumerateValue(I);
+
+ FirstFuncConstantID = Values.size();
+
+ // Add all function-level constants to the value table.
+ for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I)
+ for (User::const_op_iterator OI = I->op_begin(), E = I->op_end();
+ OI != E; ++OI) {
+ if ((isa<Constant>(*OI) && !isa<GlobalValue>(*OI)) ||
+ isa<InlineAsm>(*OI))
+ EnumerateValue(*OI);
+ }
+ BasicBlocks.push_back(BB);
+ ValueMap[BB] = BasicBlocks.size();
+ }
+
+ // Optimize the constant layout.
+ OptimizeConstants(FirstFuncConstantID, Values.size());
+
+ // Add the function's parameter attributes so they are available for use in
+ // the function's instruction.
+ EnumerateAttributes(F.getAttributes());
+
+ FirstInstID = Values.size();
+
+ // Add all of the instructions.
+ for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) {
+ if (I->getType() != Type::VoidTy)
+ EnumerateValue(I);
+ }
+ }
+}
+
+void ValueEnumerator::purgeFunction() {
+  // Remove purged values from the ValueMap.
+ for (unsigned i = NumModuleValues, e = Values.size(); i != e; ++i)
+ ValueMap.erase(Values[i].first);
+ for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i)
+ ValueMap.erase(BasicBlocks[i]);
+
+ Values.resize(NumModuleValues);
+ BasicBlocks.clear();
+}
+
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
new file mode 100644
index 0000000..bb0324b
--- /dev/null
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -0,0 +1,127 @@
+//===-- Bitcode/Writer/ValueEnumerator.h - Number values --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class gives values and types Unique ID's.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef VALUE_ENUMERATOR_H
+#define VALUE_ENUMERATOR_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Attributes.h"
+#include <vector>
+
+namespace llvm {
+
+class Type;
+class Value;
+class BasicBlock;
+class Function;
+class Module;
+class AttrListPtr;
+class TypeSymbolTable;
+class ValueSymbolTable;
+
+class ValueEnumerator {
+public:
+ // For each type, we remember its Type* and occurrence frequency.
+ typedef std::vector<std::pair<const Type*, unsigned> > TypeList;
+
+ // For each value, we remember its Value* and occurrence frequency.
+ typedef std::vector<std::pair<const Value*, unsigned> > ValueList;
+private:
+ typedef DenseMap<const Type*, unsigned> TypeMapType;
+ TypeMapType TypeMap;
+ TypeList Types;
+
+ typedef DenseMap<const Value*, unsigned> ValueMapType;
+ ValueMapType ValueMap;
+ ValueList Values;
+
+ typedef DenseMap<void*, unsigned> AttributeMapType;
+ AttributeMapType AttributeMap;
+ std::vector<AttrListPtr> Attributes;
+
+ /// BasicBlocks - This contains all the basic blocks for the currently
+ /// incorporated function. Their reverse mapping is stored in ValueMap.
+ std::vector<const BasicBlock*> BasicBlocks;
+
+ /// When a function is incorporated, this is the size of the Values list
+ /// before incorporation.
+ unsigned NumModuleValues;
+ unsigned FirstFuncConstantID;
+ unsigned FirstInstID;
+
+ ValueEnumerator(const ValueEnumerator &); // DO NOT IMPLEMENT
+ void operator=(const ValueEnumerator &); // DO NOT IMPLEMENT
+public:
+ ValueEnumerator(const Module *M);
+
+ unsigned getValueID(const Value *V) const {
+ ValueMapType::const_iterator I = ValueMap.find(V);
+ assert(I != ValueMap.end() && "Value not in slotcalculator!");
+ return I->second-1;
+ }
+
+ unsigned getTypeID(const Type *T) const {
+ TypeMapType::const_iterator I = TypeMap.find(T);
+ assert(I != TypeMap.end() && "Type not in ValueEnumerator!");
+ return I->second-1;
+ }
+
+ unsigned getAttributeID(const AttrListPtr &PAL) const {
+ if (PAL.isEmpty()) return 0; // Null maps to zero.
+ AttributeMapType::const_iterator I = AttributeMap.find(PAL.getRawPointer());
+ assert(I != AttributeMap.end() && "Attribute not in ValueEnumerator!");
+ return I->second;
+ }
+
+ /// getFunctionConstantRange - Return the range of values that corresponds to
+ /// function-local constants.
+ void getFunctionConstantRange(unsigned &Start, unsigned &End) const {
+ Start = FirstFuncConstantID;
+ End = FirstInstID;
+ }
+
+ const ValueList &getValues() const { return Values; }
+ const TypeList &getTypes() const { return Types; }
+ const std::vector<const BasicBlock*> &getBasicBlocks() const {
+ return BasicBlocks;
+ }
+ const std::vector<AttrListPtr> &getAttributes() const {
+ return Attributes;
+ }
+
+ /// PurgeAggregateValues - If there are any aggregate values at the end of the
+ /// value list, remove them and return the count of the remaining values. If
+ /// there are none, return -1.
+ int PurgeAggregateValues();
+
+ /// incorporateFunction/purgeFunction - If you'd like to deal with a function,
+ /// use these two methods to get its data into the ValueEnumerator!
+ ///
+ void incorporateFunction(const Function &F);
+ void purgeFunction();
+
+private:
+ void OptimizeConstants(unsigned CstStart, unsigned CstEnd);
+
+ void EnumerateValue(const Value *V);
+ void EnumerateType(const Type *T);
+ void EnumerateOperandType(const Value *V);
+ void EnumerateAttributes(const AttrListPtr &PAL);
+
+ void EnumerateTypeSymbolTable(const TypeSymbolTable &ST);
+ void EnumerateValueSymbolTable(const ValueSymbolTable &ST);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
new file mode 100644
index 0000000..45462da
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -0,0 +1,1724 @@
+//===-- AsmPrinter.cpp - Common AsmPrinter code ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AsmPrinter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include <cerrno>
+using namespace llvm;
+
+static cl::opt<cl::boolOrDefault>
+AsmVerbose("asm-verbose", cl::desc("Add comments to directives."),
+ cl::init(cl::BOU_UNSET));
+
+char AsmPrinter::ID = 0;
+AsmPrinter::AsmPrinter(raw_ostream &o, TargetMachine &tm,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL, bool VDef)
+ : MachineFunctionPass(&ID), FunctionNumber(0), OptLevel(OL), O(o),
+ TM(tm), TAI(T), TRI(tm.getRegisterInfo()),
+ IsInTextSection(false)
+{
+ switch (AsmVerbose) {
+ case cl::BOU_UNSET: VerboseAsm = VDef; break;
+ case cl::BOU_TRUE: VerboseAsm = true; break;
+ case cl::BOU_FALSE: VerboseAsm = false; break;
+ }
+}
+
+AsmPrinter::~AsmPrinter() {
+ for (gcp_iterator I = GCMetadataPrinters.begin(),
+ E = GCMetadataPrinters.end(); I != E; ++I)
+ delete I->second;
+}
+
+/// SwitchToTextSection - Switch to the specified text section of the executable
+/// if we are not already in it!
+///
+void AsmPrinter::SwitchToTextSection(const char *NewSection,
+ const GlobalValue *GV) {
+ std::string NS;
+ if (GV && GV->hasSection())
+ NS = TAI->getSwitchToSectionDirective() + GV->getSection();
+ else
+ NS = NewSection;
+
+ // If we're already in this section, we're done.
+ if (CurrentSection == NS) return;
+
+ // Close the current section, if applicable.
+ if (TAI->getSectionEndDirectiveSuffix() && !CurrentSection.empty())
+ O << CurrentSection << TAI->getSectionEndDirectiveSuffix() << '\n';
+
+ CurrentSection = NS;
+
+ if (!CurrentSection.empty())
+ O << CurrentSection << TAI->getTextSectionStartSuffix() << '\n';
+
+ IsInTextSection = true;
+}
+
+/// SwitchToDataSection - Switch to the specified data section of the executable
+/// if we are not already in it!
+///
+void AsmPrinter::SwitchToDataSection(const char *NewSection,
+ const GlobalValue *GV) {
+ std::string NS;
+ if (GV && GV->hasSection())
+ NS = TAI->getSwitchToSectionDirective() + GV->getSection();
+ else
+ NS = NewSection;
+
+ // If we're already in this section, we're done.
+ if (CurrentSection == NS) return;
+
+ // Close the current section, if applicable.
+ if (TAI->getSectionEndDirectiveSuffix() && !CurrentSection.empty())
+ O << CurrentSection << TAI->getSectionEndDirectiveSuffix() << '\n';
+
+ CurrentSection = NS;
+
+ if (!CurrentSection.empty())
+ O << CurrentSection << TAI->getDataSectionStartSuffix() << '\n';
+
+ IsInTextSection = false;
+}
+
+/// SwitchToSection - Switch to the specified section of the executable if we
+/// are not already in it!
+void AsmPrinter::SwitchToSection(const Section* NS) {
+ const std::string& NewSection = NS->getName();
+
+ // If we're already in this section, we're done.
+ if (CurrentSection == NewSection) return;
+
+ // Close the current section, if applicable.
+ if (TAI->getSectionEndDirectiveSuffix() && !CurrentSection.empty())
+ O << CurrentSection << TAI->getSectionEndDirectiveSuffix() << '\n';
+
+ // FIXME: Make CurrentSection a Section* in the future
+ CurrentSection = NewSection;
+ CurrentSection_ = NS;
+
+ if (!CurrentSection.empty()) {
+    // If the section is named, we need to switch into it via the special
+    // '.section' directive and also append the section flags. Otherwise the
+    // section name is itself a magic assembler directive.
+ if (NS->isNamed())
+ O << TAI->getSwitchToSectionDirective()
+ << CurrentSection
+ << TAI->getSectionFlags(NS->getFlags());
+ else
+ O << CurrentSection;
+ O << TAI->getDataSectionStartSuffix() << '\n';
+ }
+
+ IsInTextSection = (NS->getFlags() & SectionFlags::Code);
+}
+
+void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<GCModuleInfo>();
+}
+
+bool AsmPrinter::doInitialization(Module &M) {
+ Mang = new Mangler(M, TAI->getGlobalPrefix(), TAI->getPrivateGlobalPrefix());
+
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+
+ if (TAI->hasSingleParameterDotFile()) {
+    /* Very minimal debug info. It is ignored if we emit actual
+       debug info. If we don't, this at least helps the user find where
+       a function came from. */
+ O << "\t.file\t\"" << M.getModuleIdentifier() << "\"\n";
+ }
+
+ for (GCModuleInfo::iterator I = MI->begin(), E = MI->end(); I != E; ++I)
+ if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
+ MP->beginAssembly(O, *this, *TAI);
+
+ if (!M.getModuleInlineAsm().empty())
+ O << TAI->getCommentString() << " Start of file scope inline assembly\n"
+ << M.getModuleInlineAsm()
+ << '\n' << TAI->getCommentString()
+ << " End of file scope inline assembly\n";
+
+ SwitchToDataSection(""); // Reset back to no section.
+
+ MachineModuleInfo *MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ if (MMI) MMI->AnalyzeModule(M);
+ DW = getAnalysisIfAvailable<DwarfWriter>();
+ return false;
+}
+
+bool AsmPrinter::doFinalization(Module &M) {
+ if (TAI->getWeakRefDirective()) {
+ if (!ExtWeakSymbols.empty())
+ SwitchToDataSection("");
+
+ for (std::set<const GlobalValue*>::iterator i = ExtWeakSymbols.begin(),
+ e = ExtWeakSymbols.end(); i != e; ++i)
+ O << TAI->getWeakRefDirective() << Mang->getValueName(*i) << '\n';
+ }
+
+ if (TAI->getSetDirective()) {
+ if (!M.alias_empty())
+ SwitchToSection(TAI->getTextSection());
+
+ O << '\n';
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I!=E; ++I) {
+ std::string Name = Mang->getValueName(I);
+ std::string Target;
+
+ const GlobalValue *GV = cast<GlobalValue>(I->getAliasedGlobal());
+ Target = Mang->getValueName(GV);
+
+ if (I->hasExternalLinkage() || !TAI->getWeakRefDirective())
+ O << "\t.globl\t" << Name << '\n';
+ else if (I->hasWeakLinkage())
+ O << TAI->getWeakRefDirective() << Name << '\n';
+ else if (!I->hasLocalLinkage())
+ assert(0 && "Invalid alias linkage");
+
+ printVisibility(Name, I->getVisibility());
+
+ O << TAI->getSetDirective() << ' ' << Name << ", " << Target << '\n';
+ }
+ }
+
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+ for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; )
+ if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*--I))
+ MP->finishAssembly(O, *this, *TAI);
+
+ // If we don't have any trampolines, then we don't require stack memory
+ // to be executable. Some targets have a directive to declare this.
+ Function* InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
+ if (!InitTrampolineIntrinsic || InitTrampolineIntrinsic->use_empty())
+ if (TAI->getNonexecutableStackDirective())
+ O << TAI->getNonexecutableStackDirective() << '\n';
+
+ delete Mang; Mang = 0;
+ return false;
+}
+
+const std::string &
+AsmPrinter::getCurrentFunctionEHName(const MachineFunction *MF,
+ std::string &Name) const {
+ assert(MF && "No machine function?");
+ Name = MF->getFunction()->getName();
+ if (Name.empty())
+ Name = Mang->getValueName(MF->getFunction());
+ Name = Mang->makeNameProper(TAI->getEHGlobalPrefix() +
+ Name + ".eh", TAI->getGlobalPrefix());
+ return Name;
+}
+
+void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
+ // What's my mangled name?
+ CurrentFnName = Mang->getValueName(MF.getFunction());
+ IncrementFunctionNumber();
+}
+
+namespace {
+  // SectionCPs - Keep track of the alignment and the constant pool entries
+  // for each Section.
+ struct SectionCPs {
+ const Section *S;
+ unsigned Alignment;
+ SmallVector<unsigned, 4> CPEs;
+    SectionCPs(const Section *s, unsigned a) : S(s), Alignment(a) {}
+ };
+}
+
+/// EmitConstantPool - Print to the current output stream assembly
+/// representations of the constants in the constant pool MCP. This is
+/// used to print out constants which have been "spilled to memory" by
+/// the code generator.
+///
+void AsmPrinter::EmitConstantPool(MachineConstantPool *MCP) {
+ const std::vector<MachineConstantPoolEntry> &CP = MCP->getConstants();
+ if (CP.empty()) return;
+
+  // Calculate sections for constant pool entries. We collect entries destined
+  // for the same section together to reduce the number of section switches.
+ SmallVector<SectionCPs, 4> CPSections;
+ for (unsigned i = 0, e = CP.size(); i != e; ++i) {
+ MachineConstantPoolEntry CPE = CP[i];
+ unsigned Align = CPE.getAlignment();
+ const Section* S = TAI->SelectSectionForMachineConst(CPE.getType());
+    // The number of sections is small, so just do a linear search from the
+ // last section to the first.
+ bool Found = false;
+ unsigned SecIdx = CPSections.size();
+ while (SecIdx != 0) {
+ if (CPSections[--SecIdx].S == S) {
+ Found = true;
+ break;
+ }
+ }
+ if (!Found) {
+ SecIdx = CPSections.size();
+ CPSections.push_back(SectionCPs(S, Align));
+ }
+
+ if (Align > CPSections[SecIdx].Alignment)
+ CPSections[SecIdx].Alignment = Align;
+ CPSections[SecIdx].CPEs.push_back(i);
+ }
+
+ // Now print stuff into the calculated sections.
+ for (unsigned i = 0, e = CPSections.size(); i != e; ++i) {
+ SwitchToSection(CPSections[i].S);
+ EmitAlignment(Log2_32(CPSections[i].Alignment));
+
+ unsigned Offset = 0;
+ for (unsigned j = 0, ee = CPSections[i].CPEs.size(); j != ee; ++j) {
+ unsigned CPI = CPSections[i].CPEs[j];
+ MachineConstantPoolEntry CPE = CP[CPI];
+
+ // Emit inter-object padding for alignment.
+ unsigned AlignMask = CPE.getAlignment() - 1;
+ unsigned NewOffset = (Offset + AlignMask) & ~AlignMask;
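+      // This rounds Offset up, e.g. offset 5 with 4-byte alignment pads to 8.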
+ EmitZeros(NewOffset - Offset);
+
+ const Type *Ty = CPE.getType();
+ Offset = NewOffset + TM.getTargetData()->getTypeAllocSize(Ty);
+
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << CPI << ":\t\t\t\t\t";
+ if (VerboseAsm) {
+ O << TAI->getCommentString() << ' ';
+ WriteTypeSymbolic(O, CPE.getType(), 0);
+ }
+ O << '\n';
+ if (CPE.isMachineConstantPoolEntry())
+ EmitMachineConstantPoolValue(CPE.Val.MachineCPVal);
+ else
+ EmitGlobalConstant(CPE.Val.ConstVal);
+ }
+ }
+}
+
+/// EmitJumpTableInfo - Print assembly representations of the jump tables used
+/// by the current function to the current output stream.
+///
+void AsmPrinter::EmitJumpTableInfo(MachineJumpTableInfo *MJTI,
+ MachineFunction &MF) {
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ if (JT.empty()) return;
+
+ bool IsPic = TM.getRelocationModel() == Reloc::PIC_;
+
+ // Pick the directive to use to print the jump table entries, and switch to
+ // the appropriate section.
+ TargetLowering *LoweringInfo = TM.getTargetLowering();
+
+ const char* JumpTableDataSection = TAI->getJumpTableDataSection();
+ const Function *F = MF.getFunction();
+ unsigned SectionFlags = TAI->SectionFlagsForGlobal(F);
+ if ((IsPic && !(LoweringInfo && LoweringInfo->usesGlobalOffsetTable())) ||
+ !JumpTableDataSection ||
+ SectionFlags & SectionFlags::Linkonce) {
+ // In PIC mode, we need to emit the jump table to the same section as the
+ // function body itself, otherwise the label differences won't make sense.
+    // We should also do this if the section name is NULL or the function is
+    // declared in a discardable section.
+ SwitchToSection(TAI->SectionForGlobal(F));
+ } else {
+ SwitchToDataSection(JumpTableDataSection);
+ }
+
+ EmitAlignment(Log2_32(MJTI->getAlignment()));
+
+ for (unsigned i = 0, e = JT.size(); i != e; ++i) {
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[i].MBBs;
+
+ // If this jump table was deleted, ignore it.
+ if (JTBBs.empty()) continue;
+
+ // For PIC codegen, if possible we want to use the SetDirective to reduce
+ // the number of relocations the assembler will generate for the jump table.
+ // Set directives are all printed before the jump table itself.
+ SmallPtrSet<MachineBasicBlock*, 16> EmittedSets;
+ if (TAI->getSetDirective() && IsPic)
+ for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii)
+ if (EmittedSets.insert(JTBBs[ii]))
+ printPICJumpTableSetLabel(i, JTBBs[ii]);
+
+    // On some targets (e.g. darwin) we want to emit two consecutive labels
+ // before each jump table. The first label is never referenced, but tells
+ // the assembler and linker the extents of the jump table object. The
+ // second label is actually referenced by the code.
+ if (const char *JTLabelPrefix = TAI->getJumpTableSpecialLabelPrefix())
+ O << JTLabelPrefix << "JTI" << getFunctionNumber() << '_' << i << ":\n";
+
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << i << ":\n";
+
+ for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) {
+ printPICJumpTableEntry(MJTI, JTBBs[ii], i);
+ O << '\n';
+ }
+ }
+}
+
+void AsmPrinter::printPICJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid) const {
+ bool IsPic = TM.getRelocationModel() == Reloc::PIC_;
+
+ // Use JumpTableDirective otherwise honor the entry size from the jump table
+ // info.
+ const char *JTEntryDirective = TAI->getJumpTableDirective();
+ bool HadJTEntryDirective = JTEntryDirective != NULL;
+ if (!HadJTEntryDirective) {
+ JTEntryDirective = MJTI->getEntrySize() == 4 ?
+ TAI->getData32bitsDirective() : TAI->getData64bitsDirective();
+ }
+
+ O << JTEntryDirective << ' ';
+
+ // If we have emitted set directives for the jump table entries, print
+ // them rather than the entries themselves. If we're emitting PIC, then
+ // emit the table entries as differences between two text section labels.
+ // If we're emitting non-PIC code, then emit the entries as direct
+ // references to the target basic blocks.
+ if (IsPic) {
+ if (TAI->getSetDirective()) {
+ O << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << '_' << uid << "_set_" << MBB->getNumber();
+ } else {
+ printBasicBlockLabel(MBB, false, false, false);
+      // If the arch uses custom jump table directives, don't calculate the
+      // entry relative to the jump table.
+ if (!HadJTEntryDirective)
+ O << '-' << TAI->getPrivateGlobalPrefix() << "JTI"
+ << getFunctionNumber() << '_' << uid;
+ }
+ } else {
+ printBasicBlockLabel(MBB, false, false, false);
+ }
+}
+
+
+/// EmitSpecialLLVMGlobal - Check to see if the specified global is a
+/// special global used by LLVM. If so, emit it and return true, otherwise
+/// do nothing and return false.
+bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
+ if (GV->getName() == "llvm.used") {
+ if (TAI->getUsedDirective() != 0) // No need to emit this at all.
+ EmitLLVMUsedList(GV->getInitializer());
+ return true;
+ }
+
+ // Ignore debug and non-emitted data.
+ if (GV->getSection() == "llvm.metadata" ||
+ GV->hasAvailableExternallyLinkage())
+ return true;
+
+ if (!GV->hasAppendingLinkage()) return false;
+
+ assert(GV->hasInitializer() && "Not a special LLVM global!");
+
+ const TargetData *TD = TM.getTargetData();
+ unsigned Align = Log2_32(TD->getPointerPrefAlignment());
+ if (GV->getName() == "llvm.global_ctors") {
+ SwitchToDataSection(TAI->getStaticCtorsSection());
+ EmitAlignment(Align, 0);
+ EmitXXStructorList(GV->getInitializer());
+ return true;
+ }
+
+ if (GV->getName() == "llvm.global_dtors") {
+ SwitchToDataSection(TAI->getStaticDtorsSection());
+ EmitAlignment(Align, 0);
+ EmitXXStructorList(GV->getInitializer());
+ return true;
+ }
+
+ return false;
+}
+
+/// findGlobalValue - if CV is an expression equivalent to a single
+/// global value, return that value.
+const GlobalValue * AsmPrinter::findGlobalValue(const Constant *CV) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+ return GV;
+ else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+ const TargetData *TD = TM.getTargetData();
+ unsigned Opcode = CE->getOpcode();
+ switch (Opcode) {
+ case Instruction::GetElementPtr: {
+ const Constant *ptrVal = CE->getOperand(0);
+ SmallVector<Value*, 8> idxVec(CE->op_begin()+1, CE->op_end());
+ if (TD->getIndexedOffset(ptrVal->getType(), &idxVec[0], idxVec.size()))
+ return 0;
+ return findGlobalValue(ptrVal);
+ }
+ case Instruction::BitCast:
+ return findGlobalValue(CE->getOperand(0));
+ default:
+ return 0;
+ }
+ }
+ return 0;
+}
+
+/// EmitLLVMUsedList - For targets that define a TAI::UsedDirective, mark each
+/// global in the specified llvm.used list for which emitUsedDirectiveFor
+/// is true, as being used with this directive.
+void AsmPrinter::EmitLLVMUsedList(Constant *List) {
+ const char *Directive = TAI->getUsedDirective();
+
+ // Should be an array of 'sbyte*'.
+ ConstantArray *InitList = dyn_cast<ConstantArray>(List);
+ if (InitList == 0) return;
+
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+ const GlobalValue *GV = findGlobalValue(InitList->getOperand(i));
+ if (TAI->emitUsedDirectiveFor(GV, Mang)) {
+ O << Directive;
+ EmitConstantValueOnly(InitList->getOperand(i));
+ O << '\n';
+ }
+ }
+}
+
+/// EmitXXStructorList - Emit the ctor or dtor list. This just prints out the
+/// function pointers, ignoring the init priority.
+void AsmPrinter::EmitXXStructorList(Constant *List) {
+ // Should be an array of '{ int, void ()* }' structs. The first value is the
+ // init priority, which we ignore.
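+  // (Roughly: @llvm.global_ctors = appending global [...], with entries like
+  // { i32 65535, void ()* @ctor }.)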
+  ConstantArray *InitList = dyn_cast<ConstantArray>(List);
+  if (!InitList) return;
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){
+ if (CS->getNumOperands() != 2) return; // Not array of 2-element structs.
+
+ if (CS->getOperand(1)->isNullValue())
+ return; // Found a null terminator, exit printing.
+ // Emit the function pointer.
+ EmitGlobalConstant(CS->getOperand(1));
+ }
+}
+
+/// getGlobalLinkName - Returns the asm/link name of the specified
+/// global variable. Should be overridden by each target asm printer to
+/// generate the appropriate value.
+const std::string &AsmPrinter::getGlobalLinkName(const GlobalVariable *GV,
+ std::string &LinkName) const {
+ if (isa<Function>(GV)) {
+ LinkName += TAI->getFunctionAddrPrefix();
+ LinkName += Mang->getValueName(GV);
+ LinkName += TAI->getFunctionAddrSuffix();
+ } else {
+ LinkName += TAI->getGlobalVarAddrPrefix();
+ LinkName += Mang->getValueName(GV);
+ LinkName += TAI->getGlobalVarAddrSuffix();
+ }
+
+ return LinkName;
+}
+
+/// EmitExternalGlobal - Emit the external reference to a global variable.
+/// Should be overridden if an indirect reference should be used.
+void AsmPrinter::EmitExternalGlobal(const GlobalVariable *GV) {
+ std::string GLN;
+ O << getGlobalLinkName(GV, GLN);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+/// LEB 128 number encoding.
+
+/// PrintULEB128 - Print a series of hexadecimal values (separated by commas)
+/// representing an unsigned leb128 value.
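+/// For example, PrintULEB128(624485) prints "0xe5, 0x8e, 0x26".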
+void AsmPrinter::PrintULEB128(unsigned Value) const {
+ char Buffer[20];
+ do {
+ unsigned char Byte = static_cast<unsigned char>(Value & 0x7f);
+ Value >>= 7;
+ if (Value) Byte |= 0x80;
+ O << "0x" << utohex_buffer(Byte, Buffer+20);
+ if (Value) O << ", ";
+ } while (Value);
+}
+
+/// PrintSLEB128 - Print a series of hexadecimal values (separated by commas)
+/// representing a signed leb128 value.
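+/// For example, PrintSLEB128(-624485) prints "0x9b, 0xf1, 0x59".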
+void AsmPrinter::PrintSLEB128(int Value) const {
+ int Sign = Value >> (8 * sizeof(Value) - 1);
+ bool IsMore;
+ char Buffer[20];
+
+ do {
+ unsigned char Byte = static_cast<unsigned char>(Value & 0x7f);
+ Value >>= 7;
+ IsMore = Value != Sign || ((Byte ^ Sign) & 0x40) != 0;
+ if (IsMore) Byte |= 0x80;
+ O << "0x" << utohex_buffer(Byte, Buffer+20);
+ if (IsMore) O << ", ";
+ } while (IsMore);
+}
+
+//===--------------------------------------------------------------------===//
+// Emission and print routines
+//
+
+/// PrintHex - Print a value as a hexadecimal value.
+///
+void AsmPrinter::PrintHex(int Value) const {
+ char Buffer[20];
+ O << "0x" << utohex_buffer(static_cast<unsigned>(Value), Buffer+20);
+}
+
+/// EOL - Print a newline character to asm stream. If a comment is present
+/// then it will be printed first. Comments should not contain '\n'.
+void AsmPrinter::EOL() const {
+ O << '\n';
+}
+
+void AsmPrinter::EOL(const std::string &Comment) const {
+ if (VerboseAsm && !Comment.empty()) {
+ O << '\t'
+ << TAI->getCommentString()
+ << ' '
+ << Comment;
+ }
+ O << '\n';
+}
+
+void AsmPrinter::EOL(const char* Comment) const {
+ if (VerboseAsm && *Comment) {
+ O << '\t'
+ << TAI->getCommentString()
+ << ' '
+ << Comment;
+ }
+ O << '\n';
+}
+
+/// EmitULEB128Bytes - Emit an assembler byte data directive to compose an
+/// unsigned leb128 value.
+void AsmPrinter::EmitULEB128Bytes(unsigned Value) const {
+ if (TAI->hasLEB128()) {
+ O << "\t.uleb128\t"
+ << Value;
+ } else {
+ O << TAI->getData8bitsDirective();
+ PrintULEB128(Value);
+ }
+}
+
+/// EmitSLEB128Bytes - Emit an assembler byte data directive to compose a
+/// signed leb128 value.
+void AsmPrinter::EmitSLEB128Bytes(int Value) const {
+ if (TAI->hasLEB128()) {
+ O << "\t.sleb128\t"
+ << Value;
+ } else {
+ O << TAI->getData8bitsDirective();
+ PrintSLEB128(Value);
+ }
+}
+
+/// EmitInt8 - Emit a byte directive and value.
+///
+void AsmPrinter::EmitInt8(int Value) const {
+ O << TAI->getData8bitsDirective();
+ PrintHex(Value & 0xFF);
+}
+
+/// EmitInt16 - Emit a short directive and value.
+///
+void AsmPrinter::EmitInt16(int Value) const {
+ O << TAI->getData16bitsDirective();
+ PrintHex(Value & 0xFFFF);
+}
+
+/// EmitInt32 - Emit a long directive and value.
+///
+void AsmPrinter::EmitInt32(int Value) const {
+ O << TAI->getData32bitsDirective();
+ PrintHex(Value);
+}
+
+/// EmitInt64 - Emit a long long directive and value.
+///
+void AsmPrinter::EmitInt64(uint64_t Value) const {
+ if (TAI->getData64bitsDirective()) {
+ O << TAI->getData64bitsDirective();
+ PrintHex(Value);
+ } else {
+ if (TM.getTargetData()->isBigEndian()) {
+ EmitInt32(unsigned(Value >> 32)); O << '\n';
+ EmitInt32(unsigned(Value));
+ } else {
+ EmitInt32(unsigned(Value)); O << '\n';
+ EmitInt32(unsigned(Value >> 32));
+ }
+ }
+}
+
+/// toOctal - Convert the low order bits of X into an octal digit.
+///
+static inline char toOctal(int X) {
+ return (X&7)+'0';
+}
+
+/// printStringChar - Print a char, escaped if necessary.
+///
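+/// Non-printing characters become octal escapes, e.g. 0x01 prints as "\001".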
+static void printStringChar(raw_ostream &O, unsigned char C) {
+ if (C == '"') {
+ O << "\\\"";
+ } else if (C == '\\') {
+ O << "\\\\";
+ } else if (isprint((unsigned char)C)) {
+ O << C;
+ } else {
+ switch(C) {
+ case '\b': O << "\\b"; break;
+ case '\f': O << "\\f"; break;
+ case '\n': O << "\\n"; break;
+ case '\r': O << "\\r"; break;
+ case '\t': O << "\\t"; break;
+ default:
+ O << '\\';
+ O << toOctal(C >> 6);
+ O << toOctal(C >> 3);
+ O << toOctal(C >> 0);
+ break;
+ }
+ }
+}
+
+/// EmitString - Emit a string with quotes and a null terminator.
+/// Special characters are emitted properly.
+/// \literal (Eg. '\t') \endliteral
+void AsmPrinter::EmitString(const std::string &String) const {
+ EmitString(String.c_str(), String.size());
+}
+
+void AsmPrinter::EmitString(const char *String, unsigned Size) const {
+ const char* AscizDirective = TAI->getAscizDirective();
+ if (AscizDirective)
+ O << AscizDirective;
+ else
+ O << TAI->getAsciiDirective();
+ O << '\"';
+ for (unsigned i = 0; i < Size; ++i)
+ printStringChar(O, String[i]);
+ if (AscizDirective)
+ O << '\"';
+ else
+ O << "\\0\"";
+}
+
+
+/// EmitFile - Emit a .file directive.
+void AsmPrinter::EmitFile(unsigned Number, const std::string &Name) const {
+ O << "\t.file\t" << Number << " \"";
+ for (unsigned i = 0, N = Name.size(); i < N; ++i)
+ printStringChar(O, Name[i]);
+ O << '\"';
+}
+
+
+//===----------------------------------------------------------------------===//
+
+// EmitAlignment - Emit an alignment directive to the specified power of
+// two boundary. For example, if you pass in 3 here, you will get an 8
+// byte alignment. If a global value is specified, and if that global has
+// an explicit alignment requested, it will unconditionally override the
+// alignment request. However, if ForcedAlignBits is specified, this value
+// has final say: the ultimate alignment will be the max of ForcedAlignBits
+// and the alignment computed with NumBits and the global.
+//
+// The algorithm is:
+// Align = NumBits;
+// if (GV && GV->hasalignment) Align = GV->getalignment();
+// Align = std::max(Align, ForcedAlignBits);
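+//
+// For example, EmitAlignment(3, GV) where GV requests 16-byte alignment emits
+// a 2^4 = 16 byte alignment, not 2^3 = 8.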
+//
+void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalValue *GV,
+ unsigned ForcedAlignBits,
+ bool UseFillExpr) const {
+ if (GV && GV->getAlignment())
+ NumBits = Log2_32(GV->getAlignment());
+ NumBits = std::max(NumBits, ForcedAlignBits);
+
+ if (NumBits == 0) return; // No need to emit alignment.
+ if (TAI->getAlignmentIsInBytes()) NumBits = 1 << NumBits;
+ O << TAI->getAlignDirective() << NumBits;
+
+ unsigned FillValue = TAI->getTextAlignFillValue();
+ UseFillExpr &= IsInTextSection && FillValue;
+ if (UseFillExpr) {
+ O << ',';
+ PrintHex(FillValue);
+ }
+ O << '\n';
+}
+
+
+/// EmitZeros - Emit a block of zeros.
+///
+void AsmPrinter::EmitZeros(uint64_t NumZeros, unsigned AddrSpace) const {
+ if (NumZeros) {
+ if (TAI->getZeroDirective()) {
+ O << TAI->getZeroDirective() << NumZeros;
+ if (TAI->getZeroDirectiveSuffix())
+ O << TAI->getZeroDirectiveSuffix();
+ O << '\n';
+ } else {
+ for (; NumZeros; --NumZeros)
+ O << TAI->getData8bitsDirective(AddrSpace) << "0\n";
+ }
+ }
+}
+
+// Print out the specified constant, without a storage class. Only the
+// constants valid in constant expressions can occur here.
+void AsmPrinter::EmitConstantValueOnly(const Constant *CV) {
+ if (CV->isNullValue() || isa<UndefValue>(CV))
+ O << '0';
+ else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ O << CI->getZExtValue();
+ } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+ // This is a constant address for a global variable or function. Use the
+ // name of the variable or function as the address value, possibly
+ // decorating it with GlobalVarAddrPrefix/Suffix or
+ // FunctionAddrPrefix/Suffix (these all default to "" )
+ if (isa<Function>(GV)) {
+ O << TAI->getFunctionAddrPrefix()
+ << Mang->getValueName(GV)
+ << TAI->getFunctionAddrSuffix();
+ } else {
+ O << TAI->getGlobalVarAddrPrefix()
+ << Mang->getValueName(GV)
+ << TAI->getGlobalVarAddrSuffix();
+ }
+ } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+ const TargetData *TD = TM.getTargetData();
+ unsigned Opcode = CE->getOpcode();
+ switch (Opcode) {
+ case Instruction::GetElementPtr: {
+ // generate a symbolic expression for the byte address
+ const Constant *ptrVal = CE->getOperand(0);
+ SmallVector<Value*, 8> idxVec(CE->op_begin()+1, CE->op_end());
+ if (int64_t Offset = TD->getIndexedOffset(ptrVal->getType(), &idxVec[0],
+ idxVec.size())) {
+ // Truncate/sext the offset to the pointer size.
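+        // E.g., with 32-bit pointers an offset whose low 32 bits are all
+        // ones sign-extends to -1, printing "(sym) - 1" rather than a huge
+        // positive constant.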
+ if (TD->getPointerSizeInBits() != 64) {
+ int SExtAmount = 64-TD->getPointerSizeInBits();
+ Offset = (Offset << SExtAmount) >> SExtAmount;
+ }
+
+ if (Offset)
+ O << '(';
+ EmitConstantValueOnly(ptrVal);
+ if (Offset > 0)
+ O << ") + " << Offset;
+ else if (Offset < 0)
+ O << ") - " << -Offset;
+ } else {
+ EmitConstantValueOnly(ptrVal);
+ }
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ assert(0 && "FIXME: Don't yet support this kind of constant cast expr");
+ break;
+ case Instruction::BitCast:
+ return EmitConstantValueOnly(CE->getOperand(0));
+
+ case Instruction::IntToPtr: {
+ // Handle casts to pointers by changing them into casts to the appropriate
+ // integer type. This promotes constant folding and simplifies this code.
+ Constant *Op = CE->getOperand(0);
+ Op = ConstantExpr::getIntegerCast(Op, TD->getIntPtrType(), false/*ZExt*/);
+ return EmitConstantValueOnly(Op);
+ }
+
+
+ case Instruction::PtrToInt: {
+ // Support only foldable casts to/from pointers that can be eliminated by
+ // changing the pointer to the appropriately sized integer type.
+ Constant *Op = CE->getOperand(0);
+ const Type *Ty = CE->getType();
+
+ // We can emit the pointer value into this slot if the slot is an
+ // integer slot greater or equal to the size of the pointer.
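+    // Otherwise, mask the value down to the slot's width; e.g. a 64-bit
+    // pointer emitted into an i32 slot prints as "((sym) & 4294967295)".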
+ if (TD->getTypeAllocSize(Ty) >= TD->getTypeAllocSize(Op->getType()))
+ return EmitConstantValueOnly(Op);
+
+ O << "((";
+ EmitConstantValueOnly(Op);
+ APInt ptrMask = APInt::getAllOnesValue(TD->getTypeAllocSizeInBits(Ty));
+
+ SmallString<40> S;
+ ptrMask.toStringUnsigned(S);
+ O << ") & " << S.c_str() << ')';
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ O << '(';
+ EmitConstantValueOnly(CE->getOperand(0));
+ O << ')';
+ switch (Opcode) {
+ case Instruction::Add:
+ O << " + ";
+ break;
+ case Instruction::Sub:
+ O << " - ";
+ break;
+ case Instruction::And:
+ O << " & ";
+ break;
+ case Instruction::Or:
+ O << " | ";
+ break;
+ case Instruction::Xor:
+ O << " ^ ";
+ break;
+ default:
+ break;
+ }
+ O << '(';
+ EmitConstantValueOnly(CE->getOperand(1));
+ O << ')';
+ break;
+ default:
+ assert(0 && "Unsupported operator!");
+ }
+ } else {
+ assert(0 && "Unknown constant value!");
+ }
+}
+
+/// printAsCString - Print the specified array as a C-compatible string; the
+/// array must satisfy the isString predicate.
+///
+static void printAsCString(raw_ostream &O, const ConstantArray *CVA,
+ unsigned LastElt) {
+ assert(CVA->isString() && "Array is not string compatible!");
+
+ O << '\"';
+ for (unsigned i = 0; i != LastElt; ++i) {
+ unsigned char C =
+ (unsigned char)cast<ConstantInt>(CVA->getOperand(i))->getZExtValue();
+ printStringChar(O, C);
+ }
+ O << '\"';
+}
+
+/// EmitString - Emit a zero-byte-terminated string constant.
+///
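+/// For example, the NUL-terminated bytes "hi\0" emit '.asciz "hi"' on
+/// targets that provide an asciz directive, and something like
+/// '.ascii "hi\000"' otherwise.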
+void AsmPrinter::EmitString(const ConstantArray *CVA) const {
+ unsigned NumElts = CVA->getNumOperands();
+ if (TAI->getAscizDirective() && NumElts &&
+ cast<ConstantInt>(CVA->getOperand(NumElts-1))->getZExtValue() == 0) {
+ O << TAI->getAscizDirective();
+ printAsCString(O, CVA, NumElts-1);
+ } else {
+ O << TAI->getAsciiDirective();
+ printAsCString(O, CVA, NumElts);
+ }
+ O << '\n';
+}
+
+void AsmPrinter::EmitGlobalConstantArray(const ConstantArray *CVA,
+ unsigned AddrSpace) {
+ if (CVA->isString()) {
+ EmitString(CVA);
+ } else { // Not a string. Print the values in successive locations
+ for (unsigned i = 0, e = CVA->getNumOperands(); i != e; ++i)
+ EmitGlobalConstant(CVA->getOperand(i), AddrSpace);
+ }
+}
+
+void AsmPrinter::EmitGlobalConstantVector(const ConstantVector *CP) {
+ const VectorType *PTy = CP->getType();
+
+ for (unsigned I = 0, E = PTy->getNumElements(); I < E; ++I)
+ EmitGlobalConstant(CP->getOperand(I));
+}
+
+void AsmPrinter::EmitGlobalConstantStruct(const ConstantStruct *CVS,
+ unsigned AddrSpace) {
+ // Print the fields in successive locations. Pad to align if needed!
+ const TargetData *TD = TM.getTargetData();
+ unsigned Size = TD->getTypeAllocSize(CVS->getType());
+ const StructLayout *cvsLayout = TD->getStructLayout(CVS->getType());
+ uint64_t sizeSoFar = 0;
+ for (unsigned i = 0, e = CVS->getNumOperands(); i != e; ++i) {
+ const Constant* field = CVS->getOperand(i);
+
+ // Check if padding is needed and insert one or more 0s.
+ uint64_t fieldSize = TD->getTypeAllocSize(field->getType());
+ uint64_t padSize = ((i == e-1 ? Size : cvsLayout->getElementOffset(i+1))
+ - cvsLayout->getElementOffset(i)) - fieldSize;
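+    // E.g., in { i8, i32 } with the i32 aligned to 4 bytes, the i8 field
+    // is followed by padSize == 3 bytes of zeros.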
+ sizeSoFar += fieldSize + padSize;
+
+ // Now print the actual field value.
+ EmitGlobalConstant(field, AddrSpace);
+
+ // Insert padding - this may include padding to increase the size of the
+ // current field up to the ABI size (if the struct is not packed) as well
+ // as padding to ensure that the next field starts at the right offset.
+ EmitZeros(padSize, AddrSpace);
+ }
+ assert(sizeSoFar == cvsLayout->getSizeInBytes() &&
+ "Layout of constant struct may be incorrect!");
+}
+
+void AsmPrinter::EmitGlobalConstantFP(const ConstantFP *CFP,
+ unsigned AddrSpace) {
+ // FP Constants are printed as integer constants to avoid losing
+ // precision...
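+  // E.g., the double 1.0 has bit pattern 0x3FF0000000000000, so a target
+  // with a 64-bit data directive such as ".quad" emits
+  // ".quad 4607182418800017408".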
+ const TargetData *TD = TM.getTargetData();
+ if (CFP->getType() == Type::DoubleTy) {
+ double Val = CFP->getValueAPF().convertToDouble(); // for comment only
+ uint64_t i = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+ if (TAI->getData64bitsDirective(AddrSpace)) {
+ O << TAI->getData64bitsDirective(AddrSpace) << i;
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString() << " double value: " << Val;
+ O << '\n';
+ } else if (TD->isBigEndian()) {
+ O << TAI->getData32bitsDirective(AddrSpace) << unsigned(i >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " double most significant word " << Val;
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << unsigned(i);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " double least significant word " << Val;
+ O << '\n';
+ } else {
+ O << TAI->getData32bitsDirective(AddrSpace) << unsigned(i);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " double least significant word " << Val;
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << unsigned(i >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " double most significant word " << Val;
+ O << '\n';
+ }
+ return;
+ } else if (CFP->getType() == Type::FloatTy) {
+ float Val = CFP->getValueAPF().convertToFloat(); // for comment only
+ O << TAI->getData32bitsDirective(AddrSpace)
+ << CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString() << " float " << Val;
+ O << '\n';
+ return;
+ } else if (CFP->getType() == Type::X86_FP80Ty) {
+    // All long double variants are printed as hex.
+    // api is a local copy, keeping the raw-data pointer below valid.
+ APInt api = CFP->getValueAPF().bitcastToAPInt();
+ const uint64_t *p = api.getRawData();
+ // Convert to double so we can print the approximate val as a comment.
+ APFloat DoubleVal = CFP->getValueAPF();
+ bool ignored;
+ DoubleVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+ &ignored);
+ if (TD->isBigEndian()) {
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[1]);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double most significant halfword of ~"
+ << DoubleVal.convertToDouble();
+ O << '\n';
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[0] >> 48);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString() << " long double next halfword";
+ O << '\n';
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[0] >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString() << " long double next halfword";
+ O << '\n';
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[0] >> 16);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString() << " long double next halfword";
+ O << '\n';
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[0]);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double least significant halfword";
+ O << '\n';
+ } else {
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[0]);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double least significant halfword of ~"
+ << DoubleVal.convertToDouble();
+ O << '\n';
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[0] >> 16);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double next halfword";
+ O << '\n';
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[0] >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double next halfword";
+ O << '\n';
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[0] >> 48);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double next halfword";
+ O << '\n';
+ O << TAI->getData16bitsDirective(AddrSpace) << uint16_t(p[1]);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double most significant halfword";
+ O << '\n';
+ }
+ EmitZeros(TD->getTypeAllocSize(Type::X86_FP80Ty) -
+ TD->getTypeStoreSize(Type::X86_FP80Ty), AddrSpace);
+ return;
+ } else if (CFP->getType() == Type::PPC_FP128Ty) {
+    // All long double variants are printed as hex.
+    // api is a local copy, keeping the raw-data pointer below valid.
+ APInt api = CFP->getValueAPF().bitcastToAPInt();
+ const uint64_t *p = api.getRawData();
+ if (TD->isBigEndian()) {
+ O << TAI->getData32bitsDirective(AddrSpace) << uint32_t(p[0] >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double most significant word";
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << uint32_t(p[0]);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double next word";
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << uint32_t(p[1] >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double next word";
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << uint32_t(p[1]);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double least significant word";
+ O << '\n';
+ } else {
+ O << TAI->getData32bitsDirective(AddrSpace) << uint32_t(p[1]);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double least significant word";
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << uint32_t(p[1] >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double next word";
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << uint32_t(p[0]);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double next word";
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << uint32_t(p[0] >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " long double most significant word";
+ O << '\n';
+ }
+ return;
+ } else assert(0 && "Floating point constant type not handled");
+}
+
+void AsmPrinter::EmitGlobalConstantLargeInt(const ConstantInt *CI,
+ unsigned AddrSpace) {
+ const TargetData *TD = TM.getTargetData();
+ unsigned BitWidth = CI->getBitWidth();
+ assert(isPowerOf2_32(BitWidth) &&
+ "Non-power-of-2-sized integers not handled!");
+
+ // We don't expect assemblers to support integer data directives
+ // for more than 64 bits, so we emit the data in at most 64-bit
+ // quantities at a time.
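+  // E.g., an i128 on a little-endian target with a 64-bit data directive
+  // is emitted as two 64-bit words, least significant word first.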
+ const uint64_t *RawData = CI->getValue().getRawData();
+ for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
+ uint64_t Val;
+ if (TD->isBigEndian())
+ Val = RawData[e - i - 1];
+ else
+ Val = RawData[i];
+
+ if (TAI->getData64bitsDirective(AddrSpace))
+ O << TAI->getData64bitsDirective(AddrSpace) << Val << '\n';
+ else if (TD->isBigEndian()) {
+ O << TAI->getData32bitsDirective(AddrSpace) << unsigned(Val >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " Double-word most significant word " << Val;
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << unsigned(Val);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " Double-word least significant word " << Val;
+ O << '\n';
+ } else {
+ O << TAI->getData32bitsDirective(AddrSpace) << unsigned(Val);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " Double-word least significant word " << Val;
+ O << '\n';
+ O << TAI->getData32bitsDirective(AddrSpace) << unsigned(Val >> 32);
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString()
+ << " Double-word most significant word " << Val;
+ O << '\n';
+ }
+ }
+}
+
+/// EmitGlobalConstant - Print a general LLVM constant to the .s file.
+void AsmPrinter::EmitGlobalConstant(const Constant *CV, unsigned AddrSpace) {
+ const TargetData *TD = TM.getTargetData();
+ const Type *type = CV->getType();
+ unsigned Size = TD->getTypeAllocSize(type);
+
+ if (CV->isNullValue() || isa<UndefValue>(CV)) {
+ EmitZeros(Size, AddrSpace);
+ return;
+ } else if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV)) {
+ EmitGlobalConstantArray(CVA , AddrSpace);
+ return;
+ } else if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV)) {
+ EmitGlobalConstantStruct(CVS, AddrSpace);
+ return;
+ } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+ EmitGlobalConstantFP(CFP, AddrSpace);
+ return;
+ } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ // Small integers are handled below; large integers are handled here.
+ if (Size > 4) {
+ EmitGlobalConstantLargeInt(CI, AddrSpace);
+ return;
+ }
+ } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) {
+ EmitGlobalConstantVector(CP);
+ return;
+ }
+
+ printDataDirective(type, AddrSpace);
+ EmitConstantValueOnly(CV);
+ if (VerboseAsm) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ SmallString<40> S;
+ CI->getValue().toStringUnsigned(S, 16);
+ O << "\t\t\t" << TAI->getCommentString() << " 0x" << S.c_str();
+ }
+ }
+ O << '\n';
+}
+
+void AsmPrinter::EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+ // Target doesn't support this yet!
+ abort();
+}
+
+/// PrintSpecial - Print information related to the specified machine instr
+/// that is independent of the operand, and may be independent of the instr
+/// itself. This can be useful for portably encoding the comment character
+/// or other bits of target-specific knowledge into the asmstrings. The
+/// syntax used is ${:comment}. Targets can override this to add support
+/// for their own strange codes.
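+/// For example, "${:comment}" expands to the target's comment string, and
+/// "${:uid}" to a counter that is unique per machine instruction.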
+void AsmPrinter::PrintSpecial(const MachineInstr *MI, const char *Code) const {
+ if (!strcmp(Code, "private")) {
+ O << TAI->getPrivateGlobalPrefix();
+ } else if (!strcmp(Code, "comment")) {
+ if (VerboseAsm)
+ O << TAI->getCommentString();
+ } else if (!strcmp(Code, "uid")) {
+ // Assign a unique ID to this machine instruction.
+ static const MachineInstr *LastMI = 0;
+ static const Function *F = 0;
+ static unsigned Counter = 0U-1;
+
+    // Comparing the address of MI isn't sufficient, because MachineInstrs
+    // may be allocated at the same address across functions.
+ const Function *ThisF = MI->getParent()->getParent()->getFunction();
+
+ // If this is a new machine instruction, bump the counter.
+ if (LastMI != MI || F != ThisF) {
+ ++Counter;
+ LastMI = MI;
+ F = ThisF;
+ }
+ O << Counter;
+ } else {
+ cerr << "Unknown special formatter '" << Code
+ << "' for machine instr: " << *MI;
+ exit(1);
+ }
+}
+
+/// processDebugLoc - Processes the debug information of each machine
+/// instruction's DebugLoc.
+void AsmPrinter::processDebugLoc(DebugLoc DL) {
+ if (TAI->doesSupportDebugInformation() && DW->ShouldEmitDwarfDebug()) {
+ if (!DL.isUnknown()) {
+ static DebugLocTuple PrevDLT(0, ~0U, ~0U);
+ DebugLocTuple CurDLT = MF->getDebugLocTuple(DL);
+
+ if (CurDLT.CompileUnit != 0 && PrevDLT != CurDLT)
+ printLabel(DW->RecordSourceLine(CurDLT.Line, CurDLT.Col,
+ DICompileUnit(CurDLT.CompileUnit)));
+
+ PrevDLT = CurDLT;
+ }
+ }
+}
+
+/// printInlineAsm - This method formats and prints the specified machine
+/// instruction that is an inline asm.
+void AsmPrinter::printInlineAsm(const MachineInstr *MI) const {
+ unsigned NumOperands = MI->getNumOperands();
+
+ // Count the number of register definitions.
+ unsigned NumDefs = 0;
+ for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
+ ++NumDefs)
+ assert(NumDefs != NumOperands-1 && "No asm string?");
+
+ assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
+
+ // Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
+ const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
+
+ // If this asmstr is empty, just print the #APP/#NOAPP markers.
+  // These are useful for seeing where empty asm blocks ended up.
+ if (AsmStr[0] == 0) {
+ O << TAI->getInlineAsmStart() << "\n\t" << TAI->getInlineAsmEnd() << '\n';
+ return;
+ }
+
+ O << TAI->getInlineAsmStart() << "\n\t";
+
+ // The variant of the current asmprinter.
+ int AsmPrinterVariant = TAI->getAssemblerDialect();
+
+ int CurVariant = -1; // The number of the {.|.|.} region we are in.
+ const char *LastEmitted = AsmStr; // One past the last character emitted.
+
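+  // Variant regions use GCC's {a|b} syntax, written here as $(a$|b$); e.g.
+  // in "$(movl $1, $0$|mov $0, $1$)" only the alternative selected by
+  // AsmPrinterVariant is emitted.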
+ while (*LastEmitted) {
+ switch (*LastEmitted) {
+ default: {
+ // Not a special case, emit the string section literally.
+ const char *LiteralEnd = LastEmitted+1;
+ while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
+ *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
+ ++LiteralEnd;
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ O.write(LastEmitted, LiteralEnd-LastEmitted);
+ LastEmitted = LiteralEnd;
+ break;
+ }
+ case '\n':
+ ++LastEmitted; // Consume newline character.
+ O << '\n'; // Indent code with newline.
+ break;
+ case '$': {
+ ++LastEmitted; // Consume '$' character.
+ bool Done = true;
+
+ // Handle escapes.
+ switch (*LastEmitted) {
+ default: Done = false; break;
+ case '$': // $$ -> $
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ O << '$';
+ ++LastEmitted; // Consume second '$' character.
+ break;
+ case '(': // $( -> same as GCC's { character.
+ ++LastEmitted; // Consume '(' character.
+ if (CurVariant != -1) {
+ cerr << "Nested variants found in inline asm string: '"
+ << AsmStr << "'\n";
+ exit(1);
+ }
+ CurVariant = 0; // We're in the first variant now.
+ break;
+ case '|':
+ ++LastEmitted; // consume '|' character.
+ if (CurVariant == -1)
+ O << '|'; // this is gcc's behavior for | outside a variant
+ else
+ ++CurVariant; // We're in the next variant.
+ break;
+ case ')': // $) -> same as GCC's } char.
+ ++LastEmitted; // consume ')' character.
+ if (CurVariant == -1)
+ O << '}'; // this is gcc's behavior for } outside a variant
+ else
+ CurVariant = -1;
+ break;
+ }
+ if (Done) break;
+
+ bool HasCurlyBraces = false;
+ if (*LastEmitted == '{') { // ${variable}
+ ++LastEmitted; // Consume '{' character.
+ HasCurlyBraces = true;
+ }
+
+ // If we have ${:foo}, then this is not a real operand reference, it is a
+ // "magic" string reference, just like in .td files. Arrange to call
+ // PrintSpecial.
+ if (HasCurlyBraces && *LastEmitted == ':') {
+ ++LastEmitted;
+ const char *StrStart = LastEmitted;
+ const char *StrEnd = strchr(StrStart, '}');
+ if (StrEnd == 0) {
+ cerr << "Unterminated ${:foo} operand in inline asm string: '"
+ << AsmStr << "'\n";
+ exit(1);
+ }
+
+ std::string Val(StrStart, StrEnd);
+ PrintSpecial(MI, Val.c_str());
+ LastEmitted = StrEnd+1;
+ break;
+ }
+
+ const char *IDStart = LastEmitted;
+ char *IDEnd;
+ errno = 0;
+ long Val = strtol(IDStart, &IDEnd, 10); // We only accept numbers for IDs.
+ if (!isdigit(*IDStart) || (Val == 0 && errno == EINVAL)) {
+ cerr << "Bad $ operand number in inline asm string: '"
+ << AsmStr << "'\n";
+ exit(1);
+ }
+ LastEmitted = IDEnd;
+
+ char Modifier[2] = { 0, 0 };
+
+ if (HasCurlyBraces) {
+ // If we have curly braces, check for a modifier character. This
+      // supports syntax like ${0:u}, which corresponds to "%u0" in GCC asm.
+ if (*LastEmitted == ':') {
+ ++LastEmitted; // Consume ':' character.
+ if (*LastEmitted == 0) {
+ cerr << "Bad ${:} expression in inline asm string: '"
+ << AsmStr << "'\n";
+ exit(1);
+ }
+
+ Modifier[0] = *LastEmitted;
+ ++LastEmitted; // Consume modifier character.
+ }
+
+ if (*LastEmitted != '}') {
+ cerr << "Bad ${} expression in inline asm string: '"
+ << AsmStr << "'\n";
+ exit(1);
+ }
+ ++LastEmitted; // Consume '}' character.
+ }
+
+ if ((unsigned)Val >= NumOperands-1) {
+ cerr << "Invalid $ operand number in inline asm string: '"
+ << AsmStr << "'\n";
+ exit(1);
+ }
+
+ // Okay, we finally have a value number. Ask the target to print this
+ // operand!
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant) {
+ unsigned OpNo = 1;
+
+ bool Error = false;
+
+ // Scan to find the machine operand number for the operand.
+ for (; Val; --Val) {
+ if (OpNo >= MI->getNumOperands()) break;
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+ }
+
+ if (OpNo >= MI->getNumOperands()) {
+ Error = true;
+ } else {
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ ++OpNo; // Skip over the ID number.
+
+ if (Modifier[0]=='l') // labels are target independent
+ printBasicBlockLabel(MI->getOperand(OpNo).getMBB(),
+ false, false, false);
+ else {
+ AsmPrinter *AP = const_cast<AsmPrinter*>(this);
+ if ((OpFlags & 7) == 4) {
+ Error = AP->PrintAsmMemoryOperand(MI, OpNo, AsmPrinterVariant,
+ Modifier[0] ? Modifier : 0);
+ } else {
+ Error = AP->PrintAsmOperand(MI, OpNo, AsmPrinterVariant,
+ Modifier[0] ? Modifier : 0);
+ }
+ }
+ }
+ if (Error) {
+ cerr << "Invalid operand found in inline asm: '"
+ << AsmStr << "'\n";
+ MI->dump();
+ exit(1);
+ }
+ }
+ break;
+ }
+ }
+ }
+ O << "\n\t" << TAI->getInlineAsmEnd() << '\n';
+}
+
+/// printImplicitDef - This method prints the specified machine instruction
+/// that is an implicit def.
+void AsmPrinter::printImplicitDef(const MachineInstr *MI) const {
+ if (VerboseAsm)
+ O << '\t' << TAI->getCommentString() << " implicit-def: "
+ << TRI->getAsmName(MI->getOperand(0).getReg()) << '\n';
+}
+
+/// printLabel - This method prints a local label used by debug and
+/// exception handling tables.
+void AsmPrinter::printLabel(const MachineInstr *MI) const {
+ printLabel(MI->getOperand(0).getImm());
+}
+
+void AsmPrinter::printLabel(unsigned Id) const {
+ O << TAI->getPrivateGlobalPrefix() << "label" << Id << ":\n";
+}
+
+/// printDeclare - This method prints a local variable declaration used by
+/// debug tables.
+/// FIXME: It doesn't really print anything; rather, it inserts a
+/// DebugVariable entry into the DWARF table.
+void AsmPrinter::printDeclare(const MachineInstr *MI) const {
+ unsigned FI = MI->getOperand(0).getIndex();
+ GlobalValue *GV = MI->getOperand(1).getGlobal();
+ DW->RecordVariable(cast<GlobalVariable>(GV), FI, MI);
+}
+
+/// PrintAsmOperand - Print the specified operand of MI, an INLINEASM
+/// instruction, using the specified assembler variant. Targets should
+/// override this to format as appropriate.
+bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode) {
+ // Target doesn't support this yet!
+ return true;
+}
+
+bool AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Target doesn't support this yet!
+ return true;
+}
+
+/// printBasicBlockLabel - This method prints the label for the specified
+/// MachineBasicBlock.
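+/// (e.g. ".LBB7_3" for function number 7, block number 3, given a ".L"
+/// private global prefix).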
+void AsmPrinter::printBasicBlockLabel(const MachineBasicBlock *MBB,
+ bool printAlign,
+ bool printColon,
+ bool printComment) const {
+ if (printAlign) {
+ unsigned Align = MBB->getAlignment();
+ if (Align)
+ EmitAlignment(Log2_32(Align));
+ }
+
+ O << TAI->getPrivateGlobalPrefix() << "BB" << getFunctionNumber() << '_'
+ << MBB->getNumber();
+ if (printColon)
+ O << ':';
+ if (printComment && MBB->getBasicBlock())
+ O << '\t' << TAI->getCommentString() << ' '
+ << MBB->getBasicBlock()->getNameStart();
+}
+
+/// printPICJumpTableSetLabel - This method prints a set label for the
+/// specified MachineBasicBlock for a jumptable entry.
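+/// E.g., with a ".L" private prefix, function number 7, uid 0, and block
+/// number 3, this emits ".set .L7_0_set_3,.LBB7_3-.LJTI7_0".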
+void AsmPrinter::printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const {
+ if (!TAI->getSetDirective())
+ return;
+
+ O << TAI->getSetDirective() << ' ' << TAI->getPrivateGlobalPrefix()
+ << getFunctionNumber() << '_' << uid << "_set_" << MBB->getNumber() << ',';
+ printBasicBlockLabel(MBB, false, false, false);
+ O << '-' << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << uid << '\n';
+}
+
+void AsmPrinter::printPICJumpTableSetLabel(unsigned uid, unsigned uid2,
+ const MachineBasicBlock *MBB) const {
+ if (!TAI->getSetDirective())
+ return;
+
+ O << TAI->getSetDirective() << ' ' << TAI->getPrivateGlobalPrefix()
+ << getFunctionNumber() << '_' << uid << '_' << uid2
+ << "_set_" << MBB->getNumber() << ',';
+ printBasicBlockLabel(MBB, false, false, false);
+ O << '-' << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << uid << '_' << uid2 << '\n';
+}
+
+/// printDataDirective - This method prints the asm directive for the
+/// specified type.
+void AsmPrinter::printDataDirective(const Type *type, unsigned AddrSpace) {
+ const TargetData *TD = TM.getTargetData();
+ switch (type->getTypeID()) {
+ case Type::IntegerTyID: {
+ unsigned BitWidth = cast<IntegerType>(type)->getBitWidth();
+ if (BitWidth <= 8)
+ O << TAI->getData8bitsDirective(AddrSpace);
+ else if (BitWidth <= 16)
+ O << TAI->getData16bitsDirective(AddrSpace);
+ else if (BitWidth <= 32)
+ O << TAI->getData32bitsDirective(AddrSpace);
+ else if (BitWidth <= 64) {
+ assert(TAI->getData64bitsDirective(AddrSpace) &&
+ "Target cannot handle 64-bit constant exprs!");
+ O << TAI->getData64bitsDirective(AddrSpace);
+ } else {
+ assert(0 && "Target cannot handle given data directive width!");
+ }
+ break;
+ }
+ case Type::PointerTyID:
+ if (TD->getPointerSize() == 8) {
+ assert(TAI->getData64bitsDirective(AddrSpace) &&
+ "Target cannot handle 64-bit pointer exprs!");
+ O << TAI->getData64bitsDirective(AddrSpace);
+ } else if (TD->getPointerSize() == 2) {
+ O << TAI->getData16bitsDirective(AddrSpace);
+ } else if (TD->getPointerSize() == 1) {
+ O << TAI->getData8bitsDirective(AddrSpace);
+ } else {
+ O << TAI->getData32bitsDirective(AddrSpace);
+ }
+ break;
+ case Type::FloatTyID: case Type::DoubleTyID:
+ case Type::X86_FP80TyID: case Type::FP128TyID: case Type::PPC_FP128TyID:
+ assert (0 && "Should have already output floating point constant.");
+ default:
+ assert (0 && "Can't handle printing this type of thing");
+ break;
+ }
+}
+
+void AsmPrinter::printSuffixedName(const char *Name, const char *Suffix,
+ const char *Prefix) {
+ if (Name[0]=='\"')
+ O << '\"';
+ O << TAI->getPrivateGlobalPrefix();
+ if (Prefix) O << Prefix;
+  if (Name[0]=='\"') // Strip the name's own quotes; we re-quote the result.
+    O << std::string(Name + 1, strlen(Name) - 2);
+  else
+    O << Name;
+ O << Suffix;
+ if (Name[0]=='\"')
+ O << '\"';
+}
+
+void AsmPrinter::printSuffixedName(const std::string &Name, const char* Suffix) {
+ printSuffixedName(Name.c_str(), Suffix);
+}
+
+void AsmPrinter::printVisibility(const std::string& Name,
+ unsigned Visibility) const {
+ if (Visibility == GlobalValue::HiddenVisibility) {
+ if (const char *Directive = TAI->getHiddenDirective())
+ O << Directive << Name << '\n';
+ } else if (Visibility == GlobalValue::ProtectedVisibility) {
+ if (const char *Directive = TAI->getProtectedDirective())
+ O << Directive << Name << '\n';
+ }
+}
+
+void AsmPrinter::printOffset(int64_t Offset) const {
+ if (Offset > 0)
+ O << '+' << Offset;
+ else if (Offset < 0)
+ O << Offset;
+}
+
+GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy *S) {
+ if (!S->usesMetadata())
+ return 0;
+
+ gcp_iterator GCPI = GCMetadataPrinters.find(S);
+ if (GCPI != GCMetadataPrinters.end())
+ return GCPI->second;
+
+ const char *Name = S->getName().c_str();
+
+ for (GCMetadataPrinterRegistry::iterator
+ I = GCMetadataPrinterRegistry::begin(),
+ E = GCMetadataPrinterRegistry::end(); I != E; ++I)
+ if (strcmp(Name, I->getName()) == 0) {
+ GCMetadataPrinter *GMP = I->instantiate();
+ GMP->S = S;
+ GCMetadataPrinters.insert(std::make_pair(S, GMP));
+ return GMP;
+ }
+
+ cerr << "no GCMetadataPrinter registered for GC: " << Name << "\n";
+ abort();
+}
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..066aaab
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_llvm_library(LLVMAsmPrinter
+ AsmPrinter.cpp
+ DIE.cpp
+ DwarfDebug.cpp
+ DwarfException.cpp
+ DwarfLabel.cpp
+ DwarfPrinter.cpp
+ DwarfWriter.cpp
+ OcamlGCPrinter.cpp
+ )
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
new file mode 100644
index 0000000..dc149cf
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -0,0 +1,518 @@
+//===--- lib/CodeGen/DIE.cpp - DWARF Info Entries -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structures for DWARF info entries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DIE.h"
+#include "DwarfPrinter.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include <ostream>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrevData Implementation
+//===----------------------------------------------------------------------===//
+
+/// Profile - Used to gather unique data for the abbreviation folding set.
+///
+void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
+ ID.AddInteger(Attribute);
+ ID.AddInteger(Form);
+}
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrev Implementation
+//===----------------------------------------------------------------------===//
+
+/// Profile - Used to gather unique data for the abbreviation folding set.
+///
+void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
+ ID.AddInteger(Tag);
+ ID.AddInteger(ChildrenFlag);
+
+ // For each attribute description.
+ for (unsigned i = 0, N = Data.size(); i < N; ++i)
+ Data[i].Profile(ID);
+}
+
+/// Emit - Print the abbreviation using the specified asm printer.
+///
+void DIEAbbrev::Emit(const AsmPrinter *Asm) const {
+ // Emit its Dwarf tag type.
+ Asm->EmitULEB128Bytes(Tag);
+ Asm->EOL(dwarf::TagString(Tag));
+
+ // Emit whether it has children DIEs.
+ Asm->EmitULEB128Bytes(ChildrenFlag);
+ Asm->EOL(dwarf::ChildrenString(ChildrenFlag));
+
+ // For each attribute description.
+ for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+ const DIEAbbrevData &AttrData = Data[i];
+
+ // Emit attribute type.
+ Asm->EmitULEB128Bytes(AttrData.getAttribute());
+ Asm->EOL(dwarf::AttributeString(AttrData.getAttribute()));
+
+ // Emit form type.
+ Asm->EmitULEB128Bytes(AttrData.getForm());
+ Asm->EOL(dwarf::FormEncodingString(AttrData.getForm()));
+ }
+
+ // Mark end of abbreviation.
+ Asm->EmitULEB128Bytes(0); Asm->EOL("EOM(1)");
+ Asm->EmitULEB128Bytes(0); Asm->EOL("EOM(2)");
+}
+
+#ifndef NDEBUG
+void DIEAbbrev::print(std::ostream &O) {
+ O << "Abbreviation @"
+ << std::hex << (intptr_t)this << std::dec
+ << " "
+ << dwarf::TagString(Tag)
+ << " "
+ << dwarf::ChildrenString(ChildrenFlag)
+ << "\n";
+
+ for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+ O << " "
+ << dwarf::AttributeString(Data[i].getAttribute())
+ << " "
+ << dwarf::FormEncodingString(Data[i].getForm())
+ << "\n";
+ }
+}
+void DIEAbbrev::dump() { print(cerr); }
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIE Implementation
+//===----------------------------------------------------------------------===//
+
+DIE::~DIE() {
+ for (unsigned i = 0, N = Children.size(); i < N; ++i)
+ delete Children[i];
+}
+
+/// AddSiblingOffset - Add a sibling offset field to the front of the DIE.
+///
+void DIE::AddSiblingOffset() {
+ DIEInteger *DI = new DIEInteger(0);
+ Values.insert(Values.begin(), DI);
+ Abbrev.AddFirstAttribute(dwarf::DW_AT_sibling, dwarf::DW_FORM_ref4);
+}
+
+/// Profile - Used to gather unique data for the value folding set.
+///
+void DIE::Profile(FoldingSetNodeID &ID) {
+ Abbrev.Profile(ID);
+
+ for (unsigned i = 0, N = Children.size(); i < N; ++i)
+ ID.AddPointer(Children[i]);
+
+ for (unsigned j = 0, M = Values.size(); j < M; ++j)
+ ID.AddPointer(Values[j]);
+}
+
+#ifndef NDEBUG
+void DIE::print(std::ostream &O, unsigned IncIndent) {
+ static unsigned IndentCount = 0;
+ IndentCount += IncIndent;
+ const std::string Indent(IndentCount, ' ');
+ bool isBlock = Abbrev.getTag() == 0;
+
+ if (!isBlock) {
+ O << Indent
+ << "Die: "
+ << "0x" << std::hex << (intptr_t)this << std::dec
+ << ", Offset: " << Offset
+ << ", Size: " << Size
+ << "\n";
+
+ O << Indent
+ << dwarf::TagString(Abbrev.getTag())
+ << " "
+ << dwarf::ChildrenString(Abbrev.getChildrenFlag());
+ } else {
+ O << "Size: " << Size;
+ }
+ O << "\n";
+
+ const SmallVector<DIEAbbrevData, 8> &Data = Abbrev.getData();
+
+ IndentCount += 2;
+ for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+ O << Indent;
+
+ if (!isBlock)
+ O << dwarf::AttributeString(Data[i].getAttribute());
+ else
+ O << "Blk[" << i << "]";
+
+ O << " "
+ << dwarf::FormEncodingString(Data[i].getForm())
+ << " ";
+ Values[i]->print(O);
+ O << "\n";
+ }
+ IndentCount -= 2;
+
+ for (unsigned j = 0, M = Children.size(); j < M; ++j) {
+ Children[j]->print(O, 4);
+ }
+
+ if (!isBlock) O << "\n";
+ IndentCount -= IncIndent;
+}
+
+void DIE::dump() {
+ print(cerr);
+}
+#endif
+
+
+#ifndef NDEBUG
+void DIEValue::dump() {
+ print(cerr);
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEInteger Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit integer of appropriate size.
+///
+void DIEInteger::EmitValue(Dwarf *D, unsigned Form) const {
+ const AsmPrinter *Asm = D->getAsm();
+ switch (Form) {
+ case dwarf::DW_FORM_flag: // Fall thru
+ case dwarf::DW_FORM_ref1: // Fall thru
+ case dwarf::DW_FORM_data1: Asm->EmitInt8(Integer); break;
+ case dwarf::DW_FORM_ref2: // Fall thru
+ case dwarf::DW_FORM_data2: Asm->EmitInt16(Integer); break;
+ case dwarf::DW_FORM_ref4: // Fall thru
+ case dwarf::DW_FORM_data4: Asm->EmitInt32(Integer); break;
+ case dwarf::DW_FORM_ref8: // Fall thru
+ case dwarf::DW_FORM_data8: Asm->EmitInt64(Integer); break;
+ case dwarf::DW_FORM_udata: Asm->EmitULEB128Bytes(Integer); break;
+ case dwarf::DW_FORM_sdata: Asm->EmitSLEB128Bytes(Integer); break;
+ default: assert(0 && "DIE Value form not supported yet"); break;
+ }
+}
+
+/// SizeOf - Determine size of integer value in bytes.
+///
+unsigned DIEInteger::SizeOf(const TargetData *TD, unsigned Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_flag: // Fall thru
+ case dwarf::DW_FORM_ref1: // Fall thru
+ case dwarf::DW_FORM_data1: return sizeof(int8_t);
+ case dwarf::DW_FORM_ref2: // Fall thru
+ case dwarf::DW_FORM_data2: return sizeof(int16_t);
+ case dwarf::DW_FORM_ref4: // Fall thru
+ case dwarf::DW_FORM_data4: return sizeof(int32_t);
+ case dwarf::DW_FORM_ref8: // Fall thru
+ case dwarf::DW_FORM_data8: return sizeof(int64_t);
+ case dwarf::DW_FORM_udata: return TargetAsmInfo::getULEB128Size(Integer);
+ case dwarf::DW_FORM_sdata: return TargetAsmInfo::getSLEB128Size(Integer);
+ default: assert(0 && "DIE Value form not supported yet"); break;
+ }
+ return 0;
+}
+
+/// Profile - Used to gather unique data for the value folding set.
+///
+void DIEInteger::Profile(FoldingSetNodeID &ID, unsigned Int) {
+ ID.AddInteger(isInteger);
+ ID.AddInteger(Int);
+}
+void DIEInteger::Profile(FoldingSetNodeID &ID) {
+ Profile(ID, Integer);
+}
+
+#ifndef NDEBUG
+void DIEInteger::print(std::ostream &O) {
+ O << "Int: " << (int64_t)Integer
+ << " 0x" << std::hex << Integer << std::dec;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEString Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit string value.
+///
+void DIEString::EmitValue(Dwarf *D, unsigned Form) const {
+ D->getAsm()->EmitString(Str);
+}
+
+/// Profile - Used to gather unique data for the value folding set.
+///
+void DIEString::Profile(FoldingSetNodeID &ID, const std::string &Str) {
+ ID.AddInteger(isString);
+ ID.AddString(Str);
+}
+void DIEString::Profile(FoldingSetNodeID &ID) {
+ Profile(ID, Str);
+}
+
+#ifndef NDEBUG
+void DIEString::print(std::ostream &O) {
+ O << "Str: \"" << Str << "\"";
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEDwarfLabel Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit label value.
+///
+void DIEDwarfLabel::EmitValue(Dwarf *D, unsigned Form) const {
+ bool IsSmall = Form == dwarf::DW_FORM_data4;
+ D->EmitReference(Label, false, IsSmall);
+}
+
+/// SizeOf - Determine size of label value in bytes.
+///
+unsigned DIEDwarfLabel::SizeOf(const TargetData *TD, unsigned Form) const {
+ if (Form == dwarf::DW_FORM_data4) return 4;
+ return TD->getPointerSize();
+}
+
+/// Profile - Used to gather unique data for the value folding set.
+///
+void DIEDwarfLabel::Profile(FoldingSetNodeID &ID, const DWLabel &Label) {
+ ID.AddInteger(isLabel);
+ Label.Profile(ID);
+}
+void DIEDwarfLabel::Profile(FoldingSetNodeID &ID) {
+ Profile(ID, Label);
+}
+
+#ifndef NDEBUG
+void DIEDwarfLabel::print(std::ostream &O) {
+ O << "Lbl: ";
+ Label.print(O);
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEObjectLabel Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit label value.
+///
+void DIEObjectLabel::EmitValue(Dwarf *D, unsigned Form) const {
+ bool IsSmall = Form == dwarf::DW_FORM_data4;
+ D->EmitReference(Label, false, IsSmall);
+}
+
+/// SizeOf - Determine size of label value in bytes.
+///
+unsigned DIEObjectLabel::SizeOf(const TargetData *TD, unsigned Form) const {
+ if (Form == dwarf::DW_FORM_data4) return 4;
+ return TD->getPointerSize();
+}
+
+/// Profile - Used to gather unique data for the value folding set.
+///
+void DIEObjectLabel::Profile(FoldingSetNodeID &ID, const std::string &Label) {
+ ID.AddInteger(isAsIsLabel);
+ ID.AddString(Label);
+}
+void DIEObjectLabel::Profile(FoldingSetNodeID &ID) {
+ Profile(ID, Label.c_str());
+}
+
+#ifndef NDEBUG
+void DIEObjectLabel::print(std::ostream &O) {
+ O << "Obj: " << Label;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIESectionOffset Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit delta value.
+///
+void DIESectionOffset::EmitValue(Dwarf *D, unsigned Form) const {
+ bool IsSmall = Form == dwarf::DW_FORM_data4;
+ D->EmitSectionOffset(Label.getTag(), Section.getTag(),
+ Label.getNumber(), Section.getNumber(),
+ IsSmall, IsEH, UseSet);
+}
+
+/// SizeOf - Determine size of delta value in bytes.
+///
+unsigned DIESectionOffset::SizeOf(const TargetData *TD, unsigned Form) const {
+ if (Form == dwarf::DW_FORM_data4) return 4;
+ return TD->getPointerSize();
+}
+
+/// Profile - Used to gather unique data for the value folding set.
+///
+void DIESectionOffset::Profile(FoldingSetNodeID &ID, const DWLabel &Label,
+ const DWLabel &Section) {
+ ID.AddInteger(isSectionOffset);
+ Label.Profile(ID);
+ Section.Profile(ID);
+ // IsEH and UseSet are specific to the Label/Section that we will emit the
+ // offset for; so Label/Section are enough for uniqueness.
+}
+void DIESectionOffset::Profile(FoldingSetNodeID &ID) {
+ Profile(ID, Label, Section);
+}
+
+#ifndef NDEBUG
+void DIESectionOffset::print(std::ostream &O) {
+ O << "Off: ";
+ Label.print(O);
+ O << "-";
+ Section.print(O);
+ O << "-" << IsEH << "-" << UseSet;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEDelta Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit delta value.
+///
+void DIEDelta::EmitValue(Dwarf *D, unsigned Form) const {
+ bool IsSmall = Form == dwarf::DW_FORM_data4;
+ D->EmitDifference(LabelHi, LabelLo, IsSmall);
+}
+
+/// SizeOf - Determine size of delta value in bytes.
+///
+unsigned DIEDelta::SizeOf(const TargetData *TD, unsigned Form) const {
+ if (Form == dwarf::DW_FORM_data4) return 4;
+ return TD->getPointerSize();
+}
+
+/// Profile - Used to gather unique data for the value folding set.
+///
+void DIEDelta::Profile(FoldingSetNodeID &ID, const DWLabel &LabelHi,
+ const DWLabel &LabelLo) {
+ ID.AddInteger(isDelta);
+ LabelHi.Profile(ID);
+ LabelLo.Profile(ID);
+}
+void DIEDelta::Profile(FoldingSetNodeID &ID) {
+ Profile(ID, LabelHi, LabelLo);
+}
+
+#ifndef NDEBUG
+void DIEDelta::print(std::ostream &O) {
+ O << "Del: ";
+ LabelHi.print(O);
+ O << "-";
+ LabelLo.print(O);
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEEntry Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit debug information entry offset.
+///
+void DIEEntry::EmitValue(Dwarf *D, unsigned Form) const {
+ D->getAsm()->EmitInt32(Entry->getOffset());
+}
+
+/// Profile - Used to gather unique data for the value folding set.
+///
+void DIEEntry::Profile(FoldingSetNodeID &ID, DIE *Entry) {
+ ID.AddInteger(isEntry);
+ ID.AddPointer(Entry);
+}
+void DIEEntry::Profile(FoldingSetNodeID &ID) {
+ ID.AddInteger(isEntry);
+
+ if (Entry)
+ ID.AddPointer(Entry);
+ else
+ ID.AddPointer(this);
+}
+
+#ifndef NDEBUG
+void DIEEntry::print(std::ostream &O) {
+ O << "Die: 0x" << std::hex << (intptr_t)Entry << std::dec;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEBlock Implementation
+//===----------------------------------------------------------------------===//
+
+/// ComputeSize - Calculate the size of the block.
+///
+unsigned DIEBlock::ComputeSize(const TargetData *TD) {
+ if (!Size) {
+ const SmallVector<DIEAbbrevData, 8> &AbbrevData = Abbrev.getData();
+ for (unsigned i = 0, N = Values.size(); i < N; ++i)
+ Size += Values[i]->SizeOf(TD, AbbrevData[i].getForm());
+ }
+
+ return Size;
+}
+
+/// EmitValue - Emit block data.
+///
+void DIEBlock::EmitValue(Dwarf *D, unsigned Form) const {
+ const AsmPrinter *Asm = D->getAsm();
+ switch (Form) {
+ case dwarf::DW_FORM_block1: Asm->EmitInt8(Size); break;
+ case dwarf::DW_FORM_block2: Asm->EmitInt16(Size); break;
+ case dwarf::DW_FORM_block4: Asm->EmitInt32(Size); break;
+ case dwarf::DW_FORM_block: Asm->EmitULEB128Bytes(Size); break;
+ default: assert(0 && "Improper form for block"); break;
+ }
+
+ const SmallVector<DIEAbbrevData, 8> &AbbrevData = Abbrev.getData();
+ for (unsigned i = 0, N = Values.size(); i < N; ++i) {
+ Asm->EOL();
+ Values[i]->EmitValue(D, AbbrevData[i].getForm());
+ }
+}
+
+/// SizeOf - Determine size of block data in bytes.
+///
+unsigned DIEBlock::SizeOf(const TargetData *TD, unsigned Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
+ case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
+ case dwarf::DW_FORM_block4: return Size + sizeof(int32_t);
+ case dwarf::DW_FORM_block: return Size + TargetAsmInfo::getULEB128Size(Size);
+ default: assert(0 && "Improper form for block"); break;
+ }
+ return 0;
+}
+
+void DIEBlock::Profile(FoldingSetNodeID &ID) {
+ ID.AddInteger(isBlock);
+ DIE::Profile(ID);
+}
+
+#ifndef NDEBUG
+void DIEBlock::print(std::ostream &O) {
+ O << "Blk: ";
+ DIE::print(O, 5);
+}
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
new file mode 100644
index 0000000..b14d91c
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DIE.h
@@ -0,0 +1,549 @@
+//===--- lib/CodeGen/DIE.h - DWARF Info Entries -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structures for DWARF info entries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DIE_H__
+#define CODEGEN_ASMPRINTER_DIE_H__
+
+#include "DwarfLabel.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/raw_ostream.h"
+#include <iosfwd>
+
+namespace llvm {
+ class AsmPrinter;
+ class Dwarf;
+ class TargetData;
+
+ //===--------------------------------------------------------------------===//
+ /// DIEAbbrevData - Dwarf abbreviation data, describes the one attribute of a
+ /// Dwarf abbreviation.
+ class VISIBILITY_HIDDEN DIEAbbrevData {
+ /// Attribute - Dwarf attribute code.
+ ///
+ unsigned Attribute;
+
+ /// Form - Dwarf form code.
+ ///
+ unsigned Form;
+ public:
+ DIEAbbrevData(unsigned A, unsigned F) : Attribute(A), Form(F) {}
+
+ // Accessors.
+ unsigned getAttribute() const { return Attribute; }
+ unsigned getForm() const { return Form; }
+
+ /// Profile - Used to gather unique data for the abbreviation folding set.
+ ///
+ void Profile(FoldingSetNodeID &ID) const;
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEAbbrev - Dwarf abbreviation, describes the organization of a debug
+ /// information object.
+ class VISIBILITY_HIDDEN DIEAbbrev : public FoldingSetNode {
+ /// Tag - Dwarf tag code.
+ ///
+ unsigned Tag;
+
+ /// Unique number for node.
+ ///
+ unsigned Number;
+
+ /// ChildrenFlag - Dwarf children flag.
+ ///
+ unsigned ChildrenFlag;
+
+ /// Data - Raw data bytes for abbreviation.
+ ///
+ SmallVector<DIEAbbrevData, 8> Data;
+ public:
+ DIEAbbrev(unsigned T, unsigned C) : Tag(T), ChildrenFlag(C), Data() {}
+ virtual ~DIEAbbrev() {}
+
+ // Accessors.
+ unsigned getTag() const { return Tag; }
+ unsigned getNumber() const { return Number; }
+ unsigned getChildrenFlag() const { return ChildrenFlag; }
+ const SmallVector<DIEAbbrevData, 8> &getData() const { return Data; }
+ void setTag(unsigned T) { Tag = T; }
+ void setChildrenFlag(unsigned CF) { ChildrenFlag = CF; }
+ void setNumber(unsigned N) { Number = N; }
+
+ /// AddAttribute - Adds another set of attribute information to the
+ /// abbreviation.
+ void AddAttribute(unsigned Attribute, unsigned Form) {
+ Data.push_back(DIEAbbrevData(Attribute, Form));
+ }
+
+ /// AddFirstAttribute - Adds a set of attribute information to the front
+ /// of the abbreviation.
+ void AddFirstAttribute(unsigned Attribute, unsigned Form) {
+ Data.insert(Data.begin(), DIEAbbrevData(Attribute, Form));
+ }
+
+ /// Profile - Used to gather unique data for the abbreviation folding set.
+ ///
+ void Profile(FoldingSetNodeID &ID) const;
+
+ /// Emit - Print the abbreviation using the specified asm printer.
+ ///
+ void Emit(const AsmPrinter *Asm) const;
+
+#ifndef NDEBUG
+ void print(std::ostream *O) {
+ if (O) print(*O);
+ }
+ void print(std::ostream &O);
+ void dump();
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIE - A structured debug information entry. Has an abbreviation which
+  /// describes its organization.
+ class CompileUnit;
+ class DIEValue;
+
+ class VISIBILITY_HIDDEN DIE : public FoldingSetNode {
+ protected:
+ /// Abbrev - Buffer for constructing abbreviation.
+ ///
+ DIEAbbrev Abbrev;
+
+ /// Offset - Offset in debug info section.
+ ///
+ unsigned Offset;
+
+ /// Size - Size of instance + children.
+ ///
+ unsigned Size;
+
+ /// Children DIEs.
+ ///
+ std::vector<DIE *> Children;
+
+ /// Attributes values.
+ ///
+ SmallVector<DIEValue*, 32> Values;
+
+ /// Abstract compile unit.
+ CompileUnit *AbstractCU;
+ public:
+    explicit DIE(unsigned Tag)
+      : Abbrev(Tag, dwarf::DW_CHILDREN_no), Offset(0), Size(0),
+        AbstractCU(0) {}
+ virtual ~DIE();
+
+ // Accessors.
+ DIEAbbrev &getAbbrev() { return Abbrev; }
+ unsigned getAbbrevNumber() const { return Abbrev.getNumber(); }
+ unsigned getTag() const { return Abbrev.getTag(); }
+ unsigned getOffset() const { return Offset; }
+ unsigned getSize() const { return Size; }
+ const std::vector<DIE *> &getChildren() const { return Children; }
+ SmallVector<DIEValue*, 32> &getValues() { return Values; }
+ CompileUnit *getAbstractCompileUnit() const { return AbstractCU; }
+
+ void setTag(unsigned Tag) { Abbrev.setTag(Tag); }
+ void setOffset(unsigned O) { Offset = O; }
+ void setSize(unsigned S) { Size = S; }
+ void setAbstractCompileUnit(CompileUnit *CU) { AbstractCU = CU; }
+
+ /// AddValue - Add a value and attributes to a DIE.
+ ///
+ void AddValue(unsigned Attribute, unsigned Form, DIEValue *Value) {
+ Abbrev.AddAttribute(Attribute, Form);
+ Values.push_back(Value);
+ }
+
+ /// SiblingOffset - Return the offset of the debug information entry's
+ /// sibling.
+ unsigned SiblingOffset() const { return Offset + Size; }
+
+ /// AddSiblingOffset - Add a sibling offset field to the front of the DIE.
+ ///
+ void AddSiblingOffset();
+
+ /// AddChild - Add a child to the DIE.
+ ///
+ void AddChild(DIE *Child) {
+ Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes);
+ Children.push_back(Child);
+ }
+
+ /// Detach - Detaches objects connected to it after copying.
+ ///
+ void Detach() {
+ Children.clear();
+ }
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+    void Profile(FoldingSetNodeID &ID);
+
+#ifndef NDEBUG
+ void print(std::ostream *O, unsigned IncIndent = 0) {
+ if (O) print(*O, IncIndent);
+ }
+ void print(std::ostream &O, unsigned IncIndent = 0);
+ void dump();
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEValue - A debug information entry value.
+ ///
+ class VISIBILITY_HIDDEN DIEValue : public FoldingSetNode {
+ public:
+ enum {
+ isInteger,
+ isString,
+ isLabel,
+ isAsIsLabel,
+ isSectionOffset,
+ isDelta,
+ isEntry,
+ isBlock
+ };
+ protected:
+ /// Type - Type of data stored in the value.
+ ///
+ unsigned Type;
+ public:
+ explicit DIEValue(unsigned T) : Type(T) {}
+ virtual ~DIEValue() {}
+
+ // Accessors
+ unsigned getType() const { return Type; }
+
+ /// EmitValue - Emit value via the Dwarf writer.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const = 0;
+
+ /// SizeOf - Return the size of a value in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const = 0;
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ virtual void Profile(FoldingSetNodeID &ID) = 0;
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIEValue *) { return true; }
+
+#ifndef NDEBUG
+ void print(std::ostream *O) {
+ if (O) print(*O);
+ }
+ virtual void print(std::ostream &O) = 0;
+ void dump();
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEInteger - An integer value DIE.
+ ///
+ class VISIBILITY_HIDDEN DIEInteger : public DIEValue {
+ uint64_t Integer;
+ public:
+ explicit DIEInteger(uint64_t I) : DIEValue(isInteger), Integer(I) {}
+
+ /// BestForm - Choose the best form for integer.
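+    /// E.g., BestForm(false, 300) returns DW_FORM_data2, since 300 needs
+    /// more than 8 but at most 16 bits.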
+ ///
+ static unsigned BestForm(bool IsSigned, uint64_t Int) {
+ if (IsSigned) {
+ if ((char)Int == (signed)Int) return dwarf::DW_FORM_data1;
+ if ((short)Int == (signed)Int) return dwarf::DW_FORM_data2;
+ if ((int)Int == (signed)Int) return dwarf::DW_FORM_data4;
+ } else {
+ if ((unsigned char)Int == Int) return dwarf::DW_FORM_data1;
+ if ((unsigned short)Int == Int) return dwarf::DW_FORM_data2;
+ if ((unsigned int)Int == Int) return dwarf::DW_FORM_data4;
+ }
+ return dwarf::DW_FORM_data8;
+ }
+
+ /// EmitValue - Emit integer of appropriate size.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const;
+
+ /// SizeOf - Determine size of integer value in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const;
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ static void Profile(FoldingSetNodeID &ID, unsigned Int);
+ virtual void Profile(FoldingSetNodeID &ID);
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIEInteger *) { return true; }
+ static bool classof(const DIEValue *I) { return I->getType() == isInteger; }
+
+#ifndef NDEBUG
+ virtual void print(std::ostream &O);
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEString - A string value DIE.
+ ///
+ class VISIBILITY_HIDDEN DIEString : public DIEValue {
+ const std::string Str;
+ public:
+ explicit DIEString(const std::string &S) : DIEValue(isString), Str(S) {}
+
+ /// EmitValue - Emit string value.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const;
+
+ /// SizeOf - Determine size of string value in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *, unsigned /*Form*/) const {
+ return Str.size() + sizeof(char); // sizeof('\0');
+ }
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ static void Profile(FoldingSetNodeID &ID, const std::string &Str);
+ virtual void Profile(FoldingSetNodeID &ID);
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIEString *) { return true; }
+ static bool classof(const DIEValue *S) { return S->getType() == isString; }
+
+#ifndef NDEBUG
+ virtual void print(std::ostream &O);
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEDwarfLabel - A Dwarf internal label expression DIE.
+ //
+ class VISIBILITY_HIDDEN DIEDwarfLabel : public DIEValue {
+ const DWLabel Label;
+ public:
+ explicit DIEDwarfLabel(const DWLabel &L) : DIEValue(isLabel), Label(L) {}
+
+ /// EmitValue - Emit label value.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const;
+
+ /// SizeOf - Determine size of label value in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const;
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ static void Profile(FoldingSetNodeID &ID, const DWLabel &Label);
+ virtual void Profile(FoldingSetNodeID &ID);
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIEDwarfLabel *) { return true; }
+ static bool classof(const DIEValue *L) { return L->getType() == isLabel; }
+
+#ifndef NDEBUG
+ virtual void print(std::ostream &O);
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEObjectLabel - A label to an object in code or data.
+ //
+ class VISIBILITY_HIDDEN DIEObjectLabel : public DIEValue {
+ const std::string Label;
+ public:
+ explicit DIEObjectLabel(const std::string &L)
+ : DIEValue(isAsIsLabel), Label(L) {}
+
+ /// EmitValue - Emit label value.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const;
+
+ /// SizeOf - Determine size of label value in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const;
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ static void Profile(FoldingSetNodeID &ID, const std::string &Label);
+ virtual void Profile(FoldingSetNodeID &ID);
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIEObjectLabel *) { return true; }
+ static bool classof(const DIEValue *L) {
+ return L->getType() == isAsIsLabel;
+ }
+
+#ifndef NDEBUG
+ virtual void print(std::ostream &O);
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIESectionOffset - A section offset DIE.
+ ///
+ class VISIBILITY_HIDDEN DIESectionOffset : public DIEValue {
+ const DWLabel Label;
+ const DWLabel Section;
+ bool IsEH : 1;
+ bool UseSet : 1;
+ public:
+ DIESectionOffset(const DWLabel &Lab, const DWLabel &Sec,
+ bool isEH = false, bool useSet = true)
+ : DIEValue(isSectionOffset), Label(Lab), Section(Sec),
+ IsEH(isEH), UseSet(useSet) {}
+
+ /// EmitValue - Emit section offset.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const;
+
+ /// SizeOf - Determine size of section offset value in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const;
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ static void Profile(FoldingSetNodeID &ID, const DWLabel &Label,
+ const DWLabel &Section);
+ virtual void Profile(FoldingSetNodeID &ID);
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIESectionOffset *) { return true; }
+ static bool classof(const DIEValue *D) {
+ return D->getType() == isSectionOffset;
+ }
+
+#ifndef NDEBUG
+ virtual void print(std::ostream &O);
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEDelta - A simple label difference DIE.
+ ///
+ class VISIBILITY_HIDDEN DIEDelta : public DIEValue {
+ const DWLabel LabelHi;
+ const DWLabel LabelLo;
+ public:
+ DIEDelta(const DWLabel &Hi, const DWLabel &Lo)
+ : DIEValue(isDelta), LabelHi(Hi), LabelLo(Lo) {}
+
+ /// EmitValue - Emit delta value.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const;
+
+ /// SizeOf - Determine size of delta value in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const;
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ static void Profile(FoldingSetNodeID &ID, const DWLabel &LabelHi,
+ const DWLabel &LabelLo);
+ virtual void Profile(FoldingSetNodeID &ID);
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIEDelta *) { return true; }
+ static bool classof(const DIEValue *D) { return D->getType() == isDelta; }
+
+#ifndef NDEBUG
+ virtual void print(std::ostream &O);
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEEntry - A pointer to another debug information entry. An instance of
+ /// this class can also be used as a proxy for a debug information entry not
+ /// yet defined (i.e. types).
+ class VISIBILITY_HIDDEN DIEEntry : public DIEValue {
+ DIE *Entry;
+ public:
+ explicit DIEEntry(DIE *E) : DIEValue(isEntry), Entry(E) {}
+
+ DIE *getEntry() const { return Entry; }
+ void setEntry(DIE *E) { Entry = E; }
+
+ /// EmitValue - Emit debug information entry offset.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const;
+
+ /// SizeOf - Determine size of debug information entry in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const {
+ return sizeof(int32_t);
+ }
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ static void Profile(FoldingSetNodeID &ID, DIE *Entry);
+ virtual void Profile(FoldingSetNodeID &ID);
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIEEntry *) { return true; }
+ static bool classof(const DIEValue *E) { return E->getType() == isEntry; }
+
+#ifndef NDEBUG
+ virtual void print(std::ostream &O);
+#endif
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// DIEBlock - A block of values. Primarily used for location expressions.
+ ///
+ class VISIBILITY_HIDDEN DIEBlock : public DIEValue, public DIE {
+ unsigned Size; // Size in bytes excluding size header.
+ public:
+ DIEBlock()
+ : DIEValue(isBlock), DIE(0), Size(0) {}
+ virtual ~DIEBlock() {}
+
+ /// ComputeSize - calculate the size of the block.
+ ///
+ unsigned ComputeSize(const TargetData *TD);
+
+ /// BestForm - Choose the best form for data.
+ ///
+ unsigned BestForm() const {
+ if ((unsigned char)Size == Size) return dwarf::DW_FORM_block1;
+ if ((unsigned short)Size == Size) return dwarf::DW_FORM_block2;
+ if ((unsigned int)Size == Size) return dwarf::DW_FORM_block4;
+ return dwarf::DW_FORM_block;
+ }
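+
+ // For example (illustrative): a 9-byte location expression fits in an
+ // unsigned char and gets DW_FORM_block1 (one-byte length prefix); a
+ // 300-byte block needs DW_FORM_block2. Since Size is itself an unsigned
+ // int, the ULEB128-prefixed DW_FORM_block case is only a fallback and is
+ // not normally reached.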
+
+ /// EmitValue - Emit block data.
+ ///
+ virtual void EmitValue(Dwarf *D, unsigned Form) const;
+
+ /// SizeOf - Determine size of block data in bytes.
+ ///
+ virtual unsigned SizeOf(const TargetData *TD, unsigned Form) const;
+
+ /// Profile - Used to gather unique data for the value folding set.
+ ///
+ virtual void Profile(FoldingSetNodeID &ID);
+
+ // Implement isa/cast/dyncast.
+ static bool classof(const DIEBlock *) { return true; }
+ static bool classof(const DIEValue *E) { return E->getType() == isBlock; }
+
+#ifndef NDEBUG
+ virtual void print(std::ostream &O);
+#endif
+ };
+
+} // end llvm namespace
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
new file mode 100644
index 0000000..25217b0
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -0,0 +1,2610 @@
+//===-- llvm/CodeGen/DwarfDebug.cpp - Dwarf Debug Framework ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfDebug.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/System/Path.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include <ostream>
+using namespace llvm;
+
+static TimerGroup &getDwarfTimerGroup() {
+ static TimerGroup DwarfTimerGroup("Dwarf Debugging");
+ return DwarfTimerGroup;
+}
+
+//===----------------------------------------------------------------------===//
+
+/// Configuration values for initial hash set sizes (log2).
+///
+static const unsigned InitDiesSetSize = 9; // log2(512)
+static const unsigned InitAbbreviationsSetSize = 9; // log2(512)
+static const unsigned InitValuesSetSize = 9; // log2(512)
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+/// CompileUnit - This dwarf writer support class manages information
+/// associated with a source file.
+class VISIBILITY_HIDDEN CompileUnit {
+ /// ID - File identifier for source.
+ ///
+ unsigned ID;
+
+ /// Die - Compile unit debug information entry.
+ ///
+ DIE *Die;
+
+ /// GVToDieMap - Tracks the mapping of unit level debug information
+ /// variables to debug information entries.
+ std::map<GlobalVariable *, DIE *> GVToDieMap;
+
+ /// GVToDIEEntryMap - Tracks the mapping of unit level debug information
+ /// descriptors to debug information entries using a DIEEntry proxy.
+ std::map<GlobalVariable *, DIEEntry *> GVToDIEEntryMap;
+
+ /// Globals - A map of globally visible named entities for this unit.
+ ///
+ StringMap<DIE*> Globals;
+
+ /// DiesSet - Used to uniquely define dies within the compile unit.
+ ///
+ FoldingSet<DIE> DiesSet;
+public:
+ CompileUnit(unsigned I, DIE *D)
+ : ID(I), Die(D), DiesSet(InitDiesSetSize) {}
+ ~CompileUnit() { delete Die; }
+
+ // Accessors.
+ unsigned getID() const { return ID; }
+ DIE* getDie() const { return Die; }
+ StringMap<DIE*> &getGlobals() { return Globals; }
+
+ /// hasContent - Return true if this compile unit has something to write out.
+ ///
+ bool hasContent() const { return !Die->getChildren().empty(); }
+
+ /// AddGlobal - Add a new global entity to the compile unit.
+ ///
+ void AddGlobal(const std::string &Name, DIE *Die) { Globals[Name] = Die; }
+
+ /// getDieMapSlotFor - Returns the debug information entry map slot for the
+ /// specified debug variable.
+ DIE *&getDieMapSlotFor(GlobalVariable *GV) { return GVToDieMap[GV]; }
+
+ /// getDIEEntrySlotFor - Returns the debug information entry proxy slot for the
+ /// specified debug variable.
+ DIEEntry *&getDIEEntrySlotFor(GlobalVariable *GV) {
+ return GVToDIEEntryMap[GV];
+ }
+
+ /// AddDie - Adds or interns the DIE to the compile unit.
+ ///
+ DIE *AddDie(DIE &Buffer) {
+ FoldingSetNodeID ID;
+ Buffer.Profile(ID);
+ void *Where;
+ DIE *Die = DiesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Die) {
+ Die = new DIE(Buffer);
+ DiesSet.InsertNode(Die, Where);
+ this->Die->AddChild(Die);
+ Buffer.Detach();
+ }
+
+ return Die;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// DbgVariable - This class is used to track local variable information.
+///
+class VISIBILITY_HIDDEN DbgVariable {
+ DIVariable Var; // Variable Descriptor.
+ unsigned FrameIndex; // Variable frame index.
+ bool InlinedFnVar; // Variable for an inlined function.
+public:
+ DbgVariable(DIVariable V, unsigned I, bool IFV)
+ : Var(V), FrameIndex(I), InlinedFnVar(IFV) {}
+
+ // Accessors.
+ DIVariable getVariable() const { return Var; }
+ unsigned getFrameIndex() const { return FrameIndex; }
+ bool isInlinedFnVar() const { return InlinedFnVar; }
+};
+
+class DbgConcreteScope;
+
+//===----------------------------------------------------------------------===//
+/// DbgScope - This class is used to track scope information.
+///
+class VISIBILITY_HIDDEN DbgScope {
+ DbgScope *Parent; // Parent to this scope.
+ DIDescriptor Desc; // Debug info descriptor for scope.
+ // Either subprogram or block.
+ unsigned StartLabelID; // Label ID of the beginning of scope.
+ unsigned EndLabelID; // Label ID of the end of scope.
+ SmallVector<DbgScope *, 4> Scopes; // Scopes defined in scope.
+ SmallVector<DbgVariable *, 8> Variables;// Variables declared in scope.
+ SmallVector<DbgConcreteScope *, 8> ConcreteInsts;// Concrete insts of funcs.
+public:
+ DbgScope(DbgScope *P, DIDescriptor D)
+ : Parent(P), Desc(D), StartLabelID(0), EndLabelID(0) {}
+ virtual ~DbgScope();
+
+ // Accessors.
+ DbgScope *getParent() const { return Parent; }
+ DIDescriptor getDesc() const { return Desc; }
+ unsigned getStartLabelID() const { return StartLabelID; }
+ unsigned getEndLabelID() const { return EndLabelID; }
+ SmallVector<DbgScope *, 4> &getScopes() { return Scopes; }
+ SmallVector<DbgVariable *, 8> &getVariables() { return Variables; }
+ SmallVector<DbgConcreteScope*,8> &getConcreteInsts() { return ConcreteInsts; }
+ void setStartLabelID(unsigned S) { StartLabelID = S; }
+ void setEndLabelID(unsigned E) { EndLabelID = E; }
+
+ /// AddScope - Add a scope to the scope.
+ ///
+ void AddScope(DbgScope *S) { Scopes.push_back(S); }
+
+ /// AddVariable - Add a variable to the scope.
+ ///
+ void AddVariable(DbgVariable *V) { Variables.push_back(V); }
+
+ /// AddConcreteInst - Add a concrete instance to the scope.
+ ///
+ void AddConcreteInst(DbgConcreteScope *C) { ConcreteInsts.push_back(C); }
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+#ifndef NDEBUG
+void DbgScope::dump() const {
+ static unsigned IndentLevel = 0;
+ std::string Indent(IndentLevel, ' ');
+
+ cerr << Indent; Desc.dump();
+ cerr << " [" << StartLabelID << ", " << EndLabelID << "]\n";
+
+ IndentLevel += 2;
+
+ for (unsigned i = 0, e = Scopes.size(); i != e; ++i)
+ if (Scopes[i] != this)
+ Scopes[i]->dump();
+
+ IndentLevel -= 2;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+/// DbgConcreteScope - This class is used to track a scope that holds concrete
+/// instance information.
+///
+class VISIBILITY_HIDDEN DbgConcreteScope : public DbgScope {
+ CompileUnit *Unit;
+ DIE *Die; // Debug info for this concrete scope.
+public:
+ explicit DbgConcreteScope(DIDescriptor D)
+ : DbgScope(NULL, D), Unit(NULL), Die(NULL) {}
+
+ // Accessors.
+ DIE *getDie() const { return Die; }
+ void setDie(DIE *D) { Die = D; }
+};
+
+DbgScope::~DbgScope() {
+ for (unsigned i = 0, N = Scopes.size(); i < N; ++i)
+ delete Scopes[i];
+ for (unsigned j = 0, M = Variables.size(); j < M; ++j)
+ delete Variables[j];
+ for (unsigned k = 0, O = ConcreteInsts.size(); k < O; ++k)
+ delete ConcreteInsts[k];
+}
+
+} // end llvm namespace
+
+DwarfDebug::DwarfDebug(raw_ostream &OS, AsmPrinter *A, const TargetAsmInfo *T)
+ : Dwarf(OS, A, T, "dbg"), MainCU(0),
+ AbbreviationsSet(InitAbbreviationsSetSize), Abbreviations(),
+ ValuesSet(InitValuesSetSize), Values(), StringPool(), SectionMap(),
+ SectionSourceLines(), didInitial(false), shouldEmit(false),
+ FunctionDbgScope(0), DebugTimer(0) {
+ if (TimePassesIsEnabled)
+ DebugTimer = new Timer("Dwarf Debug Writer",
+ getDwarfTimerGroup());
+}
+
+DwarfDebug::~DwarfDebug() {
+ for (unsigned j = 0, M = Values.size(); j < M; ++j)
+ delete Values[j];
+
+ for (DenseMap<const GlobalVariable *, DbgScope *>::iterator
+ I = AbstractInstanceRootMap.begin(),
+ E = AbstractInstanceRootMap.end(); I != E;++I)
+ delete I->second;
+
+ delete DebugTimer;
+}
+
+/// AssignAbbrevNumber - Define a unique number for the abbreviation.
+///
+void DwarfDebug::AssignAbbrevNumber(DIEAbbrev &Abbrev) {
+ // Profile the node so that we can make it unique.
+ FoldingSetNodeID ID;
+ Abbrev.Profile(ID);
+
+ // Check the set for priors.
+ DIEAbbrev *InSet = AbbreviationsSet.GetOrInsertNode(&Abbrev);
+
+ // If it's newly added.
+ if (InSet == &Abbrev) {
+ // Add to abbreviation list.
+ Abbreviations.push_back(&Abbrev);
+
+ // Assign the vector position + 1 as its number.
+ Abbrev.setNumber(Abbreviations.size());
+ } else {
+ // Assign existing abbreviation number.
+ Abbrev.setNumber(InSet->getNumber());
+ }
+}
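+
+// For example: the first distinct abbreviation profiled receives number 1,
+// the next distinct one number 2, and so on; an abbreviation that folds onto
+// an existing node simply reuses that node's number, so each distinct layout
+// appears exactly once in the emitted abbreviation table.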
+
+/// CreateDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+/// information entry.
+DIEEntry *DwarfDebug::CreateDIEEntry(DIE *Entry) {
+ DIEEntry *Value;
+
+ if (Entry) {
+ FoldingSetNodeID ID;
+ DIEEntry::Profile(ID, Entry);
+ void *Where;
+ Value = static_cast<DIEEntry *>(ValuesSet.FindNodeOrInsertPos(ID, Where));
+
+ if (Value) return Value;
+
+ Value = new DIEEntry(Entry);
+ ValuesSet.InsertNode(Value, Where);
+ } else {
+ Value = new DIEEntry(Entry);
+ }
+
+ Values.push_back(Value);
+ return Value;
+}
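+
+// Note: a DIEEntry created with a null Entry is deliberately left out of the
+// folding set; SetDIEEntry below patches in the target DIE (and interns the
+// proxy) once the referenced entry has been constructed. This is how forward
+// references to not-yet-built types are resolved.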
+
+/// SetDIEEntry - Set a DIEEntry once the debug information entry is defined.
+///
+void DwarfDebug::SetDIEEntry(DIEEntry *Value, DIE *Entry) {
+ Value->setEntry(Entry);
+
+ // Add to values set if not already there. If it is, we merely have a
+ // duplicate in the values list (no harm.)
+ ValuesSet.GetOrInsertNode(Value);
+}
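+
+// The Add* helpers below all follow the same interning pattern: profile the
+// prospective value, reuse the node already present in ValuesSet if there is
+// one, and otherwise allocate a new node, insert it, and record it in Values
+// so that it is freed in ~DwarfDebug.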
+
+/// AddUInt - Add an unsigned integer attribute data and value.
+///
+void DwarfDebug::AddUInt(DIE *Die, unsigned Attribute,
+ unsigned Form, uint64_t Integer) {
+ if (!Form) Form = DIEInteger::BestForm(false, Integer);
+
+ FoldingSetNodeID ID;
+ DIEInteger::Profile(ID, Integer);
+ void *Where;
+ DIEValue *Value = ValuesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Value) {
+ Value = new DIEInteger(Integer);
+ ValuesSet.InsertNode(Value, Where);
+ Values.push_back(Value);
+ }
+
+ Die->AddValue(Attribute, Form, Value);
+}
+
+/// AddSInt - Add a signed integer attribute data and value.
+///
+void DwarfDebug::AddSInt(DIE *Die, unsigned Attribute,
+ unsigned Form, int64_t Integer) {
+ if (!Form) Form = DIEInteger::BestForm(true, Integer);
+
+ FoldingSetNodeID ID;
+ DIEInteger::Profile(ID, (uint64_t)Integer);
+ void *Where;
+ DIEValue *Value = ValuesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Value) {
+ Value = new DIEInteger(Integer);
+ ValuesSet.InsertNode(Value, Where);
+ Values.push_back(Value);
+ }
+
+ Die->AddValue(Attribute, Form, Value);
+}
+
+/// AddString - Add a string attribute data and value.
+///
+void DwarfDebug::AddString(DIE *Die, unsigned Attribute, unsigned Form,
+ const std::string &String) {
+ FoldingSetNodeID ID;
+ DIEString::Profile(ID, String);
+ void *Where;
+ DIEValue *Value = ValuesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Value) {
+ Value = new DIEString(String);
+ ValuesSet.InsertNode(Value, Where);
+ Values.push_back(Value);
+ }
+
+ Die->AddValue(Attribute, Form, Value);
+}
+
+/// AddLabel - Add a Dwarf label attribute data and value.
+///
+void DwarfDebug::AddLabel(DIE *Die, unsigned Attribute, unsigned Form,
+ const DWLabel &Label) {
+ FoldingSetNodeID ID;
+ DIEDwarfLabel::Profile(ID, Label);
+ void *Where;
+ DIEValue *Value = ValuesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Value) {
+ Value = new DIEDwarfLabel(Label);
+ ValuesSet.InsertNode(Value, Where);
+ Values.push_back(Value);
+ }
+
+ Die->AddValue(Attribute, Form, Value);
+}
+
+/// AddObjectLabel - Add a non-Dwarf label attribute data and value.
+///
+void DwarfDebug::AddObjectLabel(DIE *Die, unsigned Attribute, unsigned Form,
+ const std::string &Label) {
+ FoldingSetNodeID ID;
+ DIEObjectLabel::Profile(ID, Label);
+ void *Where;
+ DIEValue *Value = ValuesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Value) {
+ Value = new DIEObjectLabel(Label);
+ ValuesSet.InsertNode(Value, Where);
+ Values.push_back(Value);
+ }
+
+ Die->AddValue(Attribute, Form, Value);
+}
+
+/// AddSectionOffset - Add a section offset label attribute data and value.
+///
+void DwarfDebug::AddSectionOffset(DIE *Die, unsigned Attribute, unsigned Form,
+ const DWLabel &Label, const DWLabel &Section,
+ bool isEH, bool useSet) {
+ FoldingSetNodeID ID;
+ DIESectionOffset::Profile(ID, Label, Section);
+ void *Where;
+ DIEValue *Value = ValuesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Value) {
+ Value = new DIESectionOffset(Label, Section, isEH, useSet);
+ ValuesSet.InsertNode(Value, Where);
+ Values.push_back(Value);
+ }
+
+ Die->AddValue(Attribute, Form, Value);
+}
+
+/// AddDelta - Add a label delta attribute data and value.
+///
+void DwarfDebug::AddDelta(DIE *Die, unsigned Attribute, unsigned Form,
+ const DWLabel &Hi, const DWLabel &Lo) {
+ FoldingSetNodeID ID;
+ DIEDelta::Profile(ID, Hi, Lo);
+ void *Where;
+ DIEValue *Value = ValuesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Value) {
+ Value = new DIEDelta(Hi, Lo);
+ ValuesSet.InsertNode(Value, Where);
+ Values.push_back(Value);
+ }
+
+ Die->AddValue(Attribute, Form, Value);
+}
+
+/// AddBlock - Add block data.
+///
+void DwarfDebug::AddBlock(DIE *Die, unsigned Attribute, unsigned Form,
+ DIEBlock *Block) {
+ Block->ComputeSize(TD);
+ FoldingSetNodeID ID;
+ Block->Profile(ID);
+ void *Where;
+ DIEValue *Value = ValuesSet.FindNodeOrInsertPos(ID, Where);
+
+ if (!Value) {
+ Value = Block;
+ ValuesSet.InsertNode(Value, Where);
+ Values.push_back(Value);
+ } else {
+ // Already exists, reuse the previous one.
+ delete Block;
+ Block = cast<DIEBlock>(Value);
+ }
+
+ Die->AddValue(Attribute, Block->BestForm(), Value);
+}
+
+/// AddSourceLine - Add location information to specified debug information
+/// entry.
+void DwarfDebug::AddSourceLine(DIE *Die, const DIVariable *V) {
+ // If there is no compile unit specified, don't add a line #.
+ if (V->getCompileUnit().isNull())
+ return;
+
+ unsigned Line = V->getLineNumber();
+ unsigned FileID = FindCompileUnit(V->getCompileUnit()).getID();
+ assert(FileID && "Invalid file id");
+ AddUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+ AddUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// AddSourceLine - Add location information to specified debug information
+/// entry.
+void DwarfDebug::AddSourceLine(DIE *Die, const DIGlobal *G) {
+ // If there is no compile unit specified, don't add a line #.
+ if (G->getCompileUnit().isNull())
+ return;
+
+ unsigned Line = G->getLineNumber();
+ unsigned FileID = FindCompileUnit(G->getCompileUnit()).getID();
+ assert(FileID && "Invalid file id");
+ AddUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+ AddUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// AddSourceLine - Add location information to specified debug information
+/// entry.
+void DwarfDebug::AddSourceLine(DIE *Die, const DIType *Ty) {
+ // If there is no compile unit specified, don't add a line #.
+ DICompileUnit CU = Ty->getCompileUnit();
+ if (CU.isNull())
+ return;
+
+ unsigned Line = Ty->getLineNumber();
+ unsigned FileID = FindCompileUnit(CU).getID();
+ assert(FileID && "Invalid file id");
+ AddUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+ AddUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// AddAddress - Add an address attribute to a die based on the location
+/// provided.
+void DwarfDebug::AddAddress(DIE *Die, unsigned Attribute,
+ const MachineLocation &Location) {
+ unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false);
+ DIEBlock *Block = new DIEBlock();
+
+ if (Location.isReg()) {
+ if (Reg < 32) {
+ AddUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg);
+ } else {
+ AddUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
+ AddUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
+ }
+ } else {
+ if (Reg < 32) {
+ AddUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg);
+ } else {
+ AddUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
+ AddUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
+ }
+
+ AddUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset());
+ }
+
+ AddBlock(Die, Attribute, 0, Block);
+}
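+
+// For example (illustrative register numbers): a value live in DWARF register
+// 5 is encoded as the single byte DW_OP_reg5; register 40 needs the two-part
+// DW_OP_regx 40; and a frame slot 8 bytes below register 6 becomes
+// DW_OP_breg6 -8, with the offset emitted as a signed LEB128 (DW_FORM_sdata).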
+
+/// AddType - Add a new type attribute to the specified entity.
+void DwarfDebug::AddType(CompileUnit *DW_Unit, DIE *Entity, DIType Ty) {
+ if (Ty.isNull())
+ return;
+
+ // Check for pre-existence.
+ DIEEntry *&Slot = DW_Unit->getDIEEntrySlotFor(Ty.getGV());
+
+ // If it exists then use the existing value.
+ if (Slot) {
+ Entity->AddValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Slot);
+ return;
+ }
+
+ // Set up proxy.
+ Slot = CreateDIEEntry();
+
+ // Construct type.
+ DIE Buffer(dwarf::DW_TAG_base_type);
+ if (Ty.isBasicType(Ty.getTag()))
+ ConstructTypeDIE(DW_Unit, Buffer, DIBasicType(Ty.getGV()));
+ else if (Ty.isDerivedType(Ty.getTag()))
+ ConstructTypeDIE(DW_Unit, Buffer, DIDerivedType(Ty.getGV()));
+ else {
+ assert(Ty.isCompositeType(Ty.getTag()) && "Unknown kind of DIType");
+ ConstructTypeDIE(DW_Unit, Buffer, DICompositeType(Ty.getGV()));
+ }
+
+ // Add debug information entry to entity and appropriate context.
+ DIE *Die = NULL;
+ DIDescriptor Context = Ty.getContext();
+ if (!Context.isNull())
+ Die = DW_Unit->getDieMapSlotFor(Context.getGV());
+
+ if (Die) {
+ DIE *Child = new DIE(Buffer);
+ Die->AddChild(Child);
+ Buffer.Detach();
+ SetDIEEntry(Slot, Child);
+ } else {
+ Die = DW_Unit->AddDie(Buffer);
+ SetDIEEntry(Slot, Die);
+ }
+
+ Entity->AddValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Slot);
+}
+
+/// ConstructTypeDIE - Construct basic type die from DIBasicType.
+void DwarfDebug::ConstructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+ DIBasicType BTy) {
+ // Get core information.
+ std::string Name;
+ BTy.getName(Name);
+ Buffer.setTag(dwarf::DW_TAG_base_type);
+ AddUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+ BTy.getEncoding());
+
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ AddString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+ uint64_t Size = BTy.getSizeInBits() >> 3;
+ AddUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+}
+
+/// ConstructTypeDIE - Construct derived type die from DIDerivedType.
+void DwarfDebug::ConstructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+ DIDerivedType DTy) {
+ // Get core information.
+ std::string Name;
+ DTy.getName(Name);
+ uint64_t Size = DTy.getSizeInBits() >> 3;
+ unsigned Tag = DTy.getTag();
+
+ // FIXME - Workaround for templates.
+ if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type;
+
+ Buffer.setTag(Tag);
+
+ // Map to the main type; void will not have a type.
+ DIType FromTy = DTy.getTypeDerivedFrom();
+ AddType(DW_Unit, &Buffer, FromTy);
+
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ AddString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+ // Add size if non-zero (derived types might be zero-sized.)
+ if (Size)
+ AddUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+
+ // Add source line info if available and TyDesc is not a forward declaration.
+ if (!DTy.isForwardDecl())
+ AddSourceLine(&Buffer, &DTy);
+}
+
+/// ConstructTypeDIE - Construct type DIE from DICompositeType.
+void DwarfDebug::ConstructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+ DICompositeType CTy) {
+ // Get core information.
+ std::string Name;
+ CTy.getName(Name);
+
+ uint64_t Size = CTy.getSizeInBits() >> 3;
+ unsigned Tag = CTy.getTag();
+ Buffer.setTag(Tag);
+
+ switch (Tag) {
+ case dwarf::DW_TAG_vector_type:
+ case dwarf::DW_TAG_array_type:
+ ConstructArrayTypeDIE(DW_Unit, Buffer, &CTy);
+ break;
+ case dwarf::DW_TAG_enumeration_type: {
+ DIArray Elements = CTy.getTypeArray();
+
+ // Add enumerators to enumeration type.
+ for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+ DIE *ElemDie = NULL;
+ DIEnumerator Enum(Elements.getElement(i).getGV());
+ ElemDie = ConstructEnumTypeDIE(DW_Unit, &Enum);
+ Buffer.AddChild(ElemDie);
+ }
+ }
+ break;
+ case dwarf::DW_TAG_subroutine_type: {
+ // Add return type.
+ DIArray Elements = CTy.getTypeArray();
+ DIDescriptor RTy = Elements.getElement(0);
+ AddType(DW_Unit, &Buffer, DIType(RTy.getGV()));
+
+ // Add prototype flag.
+ AddUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+
+ // Add arguments.
+ for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
+ DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+ DIDescriptor Ty = Elements.getElement(i);
+ AddType(DW_Unit, Arg, DIType(Ty.getGV()));
+ Buffer.AddChild(Arg);
+ }
+ }
+ break;
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_class_type: {
+ // Get the elements of the structure type.
+ DIArray Elements = CTy.getTypeArray();
+
+ // A forward-declared struct type may not have its elements available.
+ if (Elements.isNull())
+ break;
+
+ // Add elements to structure type.
+ for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+ DIDescriptor Element = Elements.getElement(i);
+ DIE *ElemDie = NULL;
+ if (Element.getTag() == dwarf::DW_TAG_subprogram)
+ ElemDie = CreateSubprogramDIE(DW_Unit,
+ DISubprogram(Element.getGV()));
+ else if (Element.getTag() == dwarf::DW_TAG_variable) // ??
+ ElemDie = CreateGlobalVariableDIE(DW_Unit,
+ DIGlobalVariable(Element.getGV()));
+ else
+ ElemDie = CreateMemberDIE(DW_Unit,
+ DIDerivedType(Element.getGV()));
+ Buffer.AddChild(ElemDie);
+ }
+
+ // FIXME: We'd like an API to register additional attributes for the
+ // frontend to use while synthesizing, and then we'd use that api in clang
+ // instead of this.
+ if (Name == "__block_literal_generic")
+ AddUInt(&Buffer, dwarf::DW_AT_APPLE_block, dwarf::DW_FORM_flag, 1);
+
+ unsigned RLang = CTy.getRunTimeLang();
+ if (RLang)
+ AddUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class,
+ dwarf::DW_FORM_data1, RLang);
+ break;
+ }
+ default:
+ break;
+ }
+
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ AddString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+ if (Tag == dwarf::DW_TAG_enumeration_type ||
+ Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) {
+ // Add size if non-zero (derived types might be zero-sized.)
+ if (Size)
+ AddUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+ else {
+ // Add zero size if it is not a forward declaration.
+ if (CTy.isForwardDecl())
+ AddUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+ else
+ AddUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
+ }
+
+ // Add source line info if available.
+ if (!CTy.isForwardDecl())
+ AddSourceLine(&Buffer, &CTy);
+ }
+}
+
+/// ConstructSubrangeDIE - Construct subrange DIE from DISubrange.
+void DwarfDebug::ConstructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
+ int64_t L = SR.getLo();
+ int64_t H = SR.getHi();
+ DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
+
+ if (L != H) {
+ AddDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
+ if (L)
+ AddSInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L);
+ AddSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H);
+ }
+
+ Buffer.AddChild(DW_Subrange);
+}
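+
+// For example: a C array 'int a[10]' arrives as a subrange with Lo = 0 and
+// Hi = 9, producing DW_AT_upper_bound 9 (the zero lower bound is implicit and
+// omitted); when Lo == Hi, as for an incomplete array type, no bounds are
+// emitted at all.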
+
+/// ConstructArrayTypeDIE - Construct array type DIE from DICompositeType.
+void DwarfDebug::ConstructArrayTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+ DICompositeType *CTy) {
+ Buffer.setTag(dwarf::DW_TAG_array_type);
+ if (CTy->getTag() == dwarf::DW_TAG_vector_type)
+ AddUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1);
+
+ // Emit derived type.
+ AddType(DW_Unit, &Buffer, CTy->getTypeDerivedFrom());
+ DIArray Elements = CTy->getTypeArray();
+
+ // Construct an anonymous type for index type.
+ DIE IdxBuffer(dwarf::DW_TAG_base_type);
+ AddUInt(&IdxBuffer, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
+ AddUInt(&IdxBuffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+ dwarf::DW_ATE_signed);
+ DIE *IndexTy = DW_Unit->AddDie(IdxBuffer);
+
+ // Add subranges to array type.
+ for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+ DIDescriptor Element = Elements.getElement(i);
+ if (Element.getTag() == dwarf::DW_TAG_subrange_type)
+ ConstructSubrangeDIE(Buffer, DISubrange(Element.getGV()), IndexTy);
+ }
+}
+
+/// ConstructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
+DIE *DwarfDebug::ConstructEnumTypeDIE(CompileUnit *DW_Unit, DIEnumerator *ETy) {
+ DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
+ std::string Name;
+ ETy->getName(Name);
+ AddString(Enumerator, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+ int64_t Value = ETy->getEnumValue();
+ AddSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
+ return Enumerator;
+}
+
+/// CreateGlobalVariableDIE - Create new DIE using GV.
+DIE *DwarfDebug::CreateGlobalVariableDIE(CompileUnit *DW_Unit,
+ const DIGlobalVariable &GV) {
+ DIE *GVDie = new DIE(dwarf::DW_TAG_variable);
+ std::string Name;
+ GV.getDisplayName(Name);
+ AddString(GVDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+ std::string LinkageName;
+ GV.getLinkageName(LinkageName);
+ if (!LinkageName.empty())
+ AddString(GVDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
+ LinkageName);
+ AddType(DW_Unit, GVDie, GV.getType());
+ if (!GV.isLocalToUnit())
+ AddUInt(GVDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+ AddSourceLine(GVDie, &GV);
+ return GVDie;
+}
+
+/// CreateMemberDIE - Create new member DIE.
+DIE *DwarfDebug::CreateMemberDIE(CompileUnit *DW_Unit, const DIDerivedType &DT){
+ DIE *MemberDie = new DIE(DT.getTag());
+ std::string Name;
+ DT.getName(Name);
+ if (!Name.empty())
+ AddString(MemberDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+ AddType(DW_Unit, MemberDie, DT.getTypeDerivedFrom());
+
+ AddSourceLine(MemberDie, &DT);
+
+ uint64_t Size = DT.getSizeInBits();
+ uint64_t FieldSize = DT.getOriginalTypeSize();
+
+ if (Size != FieldSize) {
+ // Handle bitfield.
+ AddUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3);
+ AddUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits());
+
+ uint64_t Offset = DT.getOffsetInBits();
+ uint64_t FieldOffset = Offset;
+ uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
+ uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+ FieldOffset = (HiMark - FieldSize);
+ Offset -= FieldOffset;
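+
+ // Worked example (illustrative): a 1-bit field whose container is a 32-bit,
+ // 32-bit-aligned word, located 34 bits from the start of the struct:
+ // HiMark = (34 + 32) & ~31 = 64, FieldOffset = 64 - 32 = 32, and Offset
+ // becomes 34 - 32 = 2; on a little-endian target the adjustment below
+ // rewrites this to 32 - (2 + 1) = 29.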
+
+ // Maybe we need to work from the other end.
+ if (TD->isLittleEndian()) Offset = FieldSize - (Offset + Size);
+ AddUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
+ }
+
+ DIEBlock *Block = new DIEBlock();
+ AddUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+ AddUInt(Block, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3);
+ AddBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, Block);
+
+ if (DT.isProtected())
+ AddUInt(MemberDie, dwarf::DW_AT_accessibility, 0,
+ dwarf::DW_ACCESS_protected);
+ else if (DT.isPrivate())
+ AddUInt(MemberDie, dwarf::DW_AT_accessibility, 0,
+ dwarf::DW_ACCESS_private);
+
+ return MemberDie;
+}
+
+/// CreateSubprogramDIE - Create new DIE using SP.
+DIE *DwarfDebug::CreateSubprogramDIE(CompileUnit *DW_Unit,
+ const DISubprogram &SP,
+ bool IsConstructor,
+ bool IsInlined) {
+ DIE *SPDie = new DIE(dwarf::DW_TAG_subprogram);
+
+ std::string Name;
+ SP.getName(Name);
+ AddString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+ std::string LinkageName;
+ SP.getLinkageName(LinkageName);
+
+ if (!LinkageName.empty())
+ AddString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
+ LinkageName);
+
+ AddSourceLine(SPDie, &SP);
+
+ DICompositeType SPTy = SP.getType();
+ DIArray Args = SPTy.getTypeArray();
+
+ // Add prototyped tag, if C or ObjC.
+ unsigned Lang = SP.getCompileUnit().getLanguage();
+ if (Lang == dwarf::DW_LANG_C99 || Lang == dwarf::DW_LANG_C89 ||
+ Lang == dwarf::DW_LANG_ObjC)
+ AddUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+
+ // Add Return Type.
+ unsigned SPTag = SPTy.getTag();
+ if (!IsConstructor) {
+ if (Args.isNull() || SPTag != dwarf::DW_TAG_subroutine_type)
+ AddType(DW_Unit, SPDie, SPTy);
+ else
+ AddType(DW_Unit, SPDie, DIType(Args.getElement(0).getGV()));
+ }
+
+ if (!SP.isDefinition()) {
+ AddUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+
+ // Add arguments. Do not add arguments for subprogram definition. They will
+ // be handled through RecordVariable.
+ if (SPTag == dwarf::DW_TAG_subroutine_type)
+ for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+ DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+ AddType(DW_Unit, Arg, DIType(Args.getElement(i).getGV()));
+ AddUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); // ??
+ SPDie->AddChild(Arg);
+ }
+ }
+
+ if (!SP.isLocalToUnit() && !IsInlined)
+ AddUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+
+ // DW_TAG_inlined_subroutine may refer to this DIE.
+ DIE *&Slot = DW_Unit->getDieMapSlotFor(SP.getGV());
+ Slot = SPDie;
+ return SPDie;
+}
+
+/// FindCompileUnit - Get the compile unit for the given descriptor.
+///
+CompileUnit &DwarfDebug::FindCompileUnit(DICompileUnit Unit) const {
+ DenseMap<Value *, CompileUnit *>::const_iterator I =
+ CompileUnitMap.find(Unit.getGV());
+ assert(I != CompileUnitMap.end() && "Missing compile unit.");
+ return *I->second;
+}
+
+/// CreateDbgScopeVariable - Create a new scope variable.
+///
+DIE *DwarfDebug::CreateDbgScopeVariable(DbgVariable *DV, CompileUnit *Unit) {
+ // Get the descriptor.
+ const DIVariable &VD = DV->getVariable();
+
+ // Translate tag to proper Dwarf tag. The result variable is dropped for
+ // now.
+ unsigned Tag;
+ switch (VD.getTag()) {
+ case dwarf::DW_TAG_return_variable:
+ return NULL;
+ case dwarf::DW_TAG_arg_variable:
+ Tag = dwarf::DW_TAG_formal_parameter;
+ break;
+ case dwarf::DW_TAG_auto_variable: // fall thru
+ default:
+ Tag = dwarf::DW_TAG_variable;
+ break;
+ }
+
+ // Define variable debug information entry.
+ DIE *VariableDie = new DIE(Tag);
+ std::string Name;
+ VD.getName(Name);
+ AddString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+ // Add source line info if available.
+ AddSourceLine(VariableDie, &VD);
+
+ // Add variable type.
+ AddType(Unit, VariableDie, VD.getType());
+
+ // Add variable address.
+ if (!DV->isInlinedFnVar()) {
+ // Variables for abstract instances of inlined functions don't get a
+ // location.
+ MachineLocation Location;
+ Location.set(RI->getFrameRegister(*MF),
+ RI->getFrameIndexOffset(*MF, DV->getFrameIndex()));
+ AddAddress(VariableDie, dwarf::DW_AT_location, Location);
+ }
+
+ return VariableDie;
+}
+
+/// getOrCreateScope - Returns the scope associated with the given descriptor.
+///
+DbgScope *DwarfDebug::getOrCreateScope(GlobalVariable *V) {
+ DbgScope *&Slot = DbgScopeMap[V];
+ if (Slot) return Slot;
+
+ DbgScope *Parent = NULL;
+ DIBlock Block(V);
+
+ // Don't create a new scope if we already created one for an inlined function.
+ DenseMap<const GlobalVariable *, DbgScope *>::iterator
+ II = AbstractInstanceRootMap.find(V);
+ if (II != AbstractInstanceRootMap.end())
+ return LexicalScopeStack.back();
+
+ if (!Block.isNull()) {
+ DIDescriptor ParentDesc = Block.getContext();
+ Parent =
+ ParentDesc.isNull() ? NULL : getOrCreateScope(ParentDesc.getGV());
+ }
+
+ Slot = new DbgScope(Parent, DIDescriptor(V));
+
+ if (Parent)
+ Parent->AddScope(Slot);
+ else
+ // First function is top level function.
+ FunctionDbgScope = Slot;
+
+ return Slot;
+}
+
+/// ConstructDbgScope - Construct the components of a scope.
+///
+void DwarfDebug::ConstructDbgScope(DbgScope *ParentScope,
+ unsigned ParentStartID,
+ unsigned ParentEndID,
+ DIE *ParentDie, CompileUnit *Unit) {
+ // Add variables to scope.
+ SmallVector<DbgVariable *, 8> &Variables = ParentScope->getVariables();
+ for (unsigned i = 0, N = Variables.size(); i < N; ++i) {
+ DIE *VariableDie = CreateDbgScopeVariable(Variables[i], Unit);
+ if (VariableDie) ParentDie->AddChild(VariableDie);
+ }
+
+ // Add concrete instances to scope.
+ SmallVector<DbgConcreteScope *, 8> &ConcreteInsts =
+ ParentScope->getConcreteInsts();
+ for (unsigned i = 0, N = ConcreteInsts.size(); i < N; ++i) {
+ DbgConcreteScope *ConcreteInst = ConcreteInsts[i];
+ DIE *Die = ConcreteInst->getDie();
+
+ unsigned StartID = ConcreteInst->getStartLabelID();
+ unsigned EndID = ConcreteInst->getEndLabelID();
+
+ // Add the scope bounds.
+ if (StartID)
+ AddLabel(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+ DWLabel("label", StartID));
+ else
+ AddLabel(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_begin", SubprogramCount));
+
+ if (EndID)
+ AddLabel(Die, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+ DWLabel("label", EndID));
+ else
+ AddLabel(Die, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_end", SubprogramCount));
+
+ ParentDie->AddChild(Die);
+ }
+
+ // Add nested scopes.
+ SmallVector<DbgScope *, 4> &Scopes = ParentScope->getScopes();
+ for (unsigned j = 0, M = Scopes.size(); j < M; ++j) {
+ // Define the Scope debug information entry.
+ DbgScope *Scope = Scopes[j];
+
+ unsigned StartID = MMI->MappedLabel(Scope->getStartLabelID());
+ unsigned EndID = MMI->MappedLabel(Scope->getEndLabelID());
+
+ // Ignore empty scopes.
+ if (StartID == EndID && StartID != 0) continue;
+
+ // Ignore scopes with nothing to emit. Inlined scopes are not ignored here:
+ // a scope holding concrete instances is kept even if it has no variables or
+ // nested scopes.
+ if (Scope->getScopes().empty() && Scope->getVariables().empty() &&
+ Scope->getConcreteInsts().empty())
+ continue;
+
+ if (StartID == ParentStartID && EndID == ParentEndID) {
+ // Just add stuff to the parent scope.
+ ConstructDbgScope(Scope, ParentStartID, ParentEndID, ParentDie, Unit);
+ } else {
+ DIE *ScopeDie = new DIE(dwarf::DW_TAG_lexical_block);
+
+ // Add the scope bounds.
+ if (StartID)
+ AddLabel(ScopeDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+ DWLabel("label", StartID));
+ else
+ AddLabel(ScopeDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_begin", SubprogramCount));
+
+ if (EndID)
+ AddLabel(ScopeDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+ DWLabel("label", EndID));
+ else
+ AddLabel(ScopeDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_end", SubprogramCount));
+
+ // Add the scope's contents.
+ ConstructDbgScope(Scope, StartID, EndID, ScopeDie, Unit);
+ ParentDie->AddChild(ScopeDie);
+ }
+ }
+}
+
+/// ConstructFunctionDbgScope - Construct the scope for the subprogram.
+///
+void DwarfDebug::ConstructFunctionDbgScope(DbgScope *RootScope,
+ bool AbstractScope) {
+ // Exit if there is no root scope.
+ if (!RootScope) return;
+ DIDescriptor Desc = RootScope->getDesc();
+ if (Desc.isNull())
+ return;
+
+ // Get the subprogram debug information entry.
+ DISubprogram SPD(Desc.getGV());
+
+ // Get the compile unit context.
+ CompileUnit *Unit = MainCU;
+ if (!Unit)
+ Unit = &FindCompileUnit(SPD.getCompileUnit());
+
+ // Get the subprogram die.
+ DIE *SPDie = Unit->getDieMapSlotFor(SPD.getGV());
+ assert(SPDie && "Missing subprogram descriptor");
+
+ if (!AbstractScope) {
+ // Add the function bounds.
+ AddLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_begin", SubprogramCount));
+ AddLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_end", SubprogramCount));
+ MachineLocation Location(RI->getFrameRegister(*MF));
+ AddAddress(SPDie, dwarf::DW_AT_frame_base, Location);
+ }
+
+ ConstructDbgScope(RootScope, 0, 0, SPDie, Unit);
+}
+
+/// ConstructDefaultDbgScope - Construct a default scope for the subprogram.
+///
+void DwarfDebug::ConstructDefaultDbgScope(MachineFunction *MF) {
+ const char *FnName = MF->getFunction()->getNameStart();
+ if (MainCU) {
+ StringMap<DIE*> &Globals = MainCU->getGlobals();
+ StringMap<DIE*>::iterator GI = Globals.find(FnName);
+ if (GI != Globals.end()) {
+ DIE *SPDie = GI->second;
+
+ // Add the function bounds.
+ AddLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_begin", SubprogramCount));
+ AddLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_end", SubprogramCount));
+
+ MachineLocation Location(RI->getFrameRegister(*MF));
+ AddAddress(SPDie, dwarf::DW_AT_frame_base, Location);
+ return;
+ }
+ } else {
+ for (unsigned i = 0, e = CompileUnits.size(); i != e; ++i) {
+ CompileUnit *Unit = CompileUnits[i];
+ StringMap<DIE*> &Globals = Unit->getGlobals();
+ StringMap<DIE*>::iterator GI = Globals.find(FnName);
+ if (GI != Globals.end()) {
+ DIE *SPDie = GI->second;
+
+ // Add the function bounds.
+ AddLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_begin", SubprogramCount));
+ AddLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+ DWLabel("func_end", SubprogramCount));
+
+ MachineLocation Location(RI->getFrameRegister(*MF));
+ AddAddress(SPDie, dwarf::DW_AT_frame_base, Location);
+ return;
+ }
+ }
+ }
+
+#if 0
+ // FIXME: This is causing an abort because C++ mangled names are compared with
+ // their unmangled counterparts. See PR2885. Don't do this assert.
+ assert(0 && "Couldn't find DIE for machine function!");
+#endif
+}
+
+/// GetOrCreateSourceID - Look up the source id with the given directory and
+/// source file names. If none currently exists, create a new id and insert it
+/// in the SourceIds map. This can update DirectoryNames and SourceFileNames
+/// maps as well.
+unsigned DwarfDebug::GetOrCreateSourceID(const std::string &DirName,
+ const std::string &FileName) {
+ unsigned DId;
+ StringMap<unsigned>::iterator DI = DirectoryIdMap.find(DirName);
+ if (DI != DirectoryIdMap.end()) {
+ DId = DI->getValue();
+ } else {
+ DId = DirectoryNames.size() + 1;
+ DirectoryIdMap[DirName] = DId;
+ DirectoryNames.push_back(DirName);
+ }
+
+ unsigned FId;
+ StringMap<unsigned>::iterator FI = SourceFileIdMap.find(FileName);
+ if (FI != SourceFileIdMap.end()) {
+ FId = FI->getValue();
+ } else {
+ FId = SourceFileNames.size() + 1;
+ SourceFileIdMap[FileName] = FId;
+ SourceFileNames.push_back(FileName);
+ }
+
+ DenseMap<std::pair<unsigned, unsigned>, unsigned>::iterator SI =
+ SourceIdMap.find(std::make_pair(DId, FId));
+ if (SI != SourceIdMap.end())
+ return SI->second;
+
+ unsigned SrcId = SourceIds.size() + 1; // DW_AT_decl_file cannot be 0.
+ SourceIdMap[std::make_pair(DId, FId)] = SrcId;
+ SourceIds.push_back(std::make_pair(DId, FId));
+
+ return SrcId;
+}
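+
+// For example: the first file seen, directory "/tmp" with file "a.c", gets
+// directory id 1, file id 1, and source id 1; a later "/tmp" + "b.c" reuses
+// directory id 1 and receives file id 2 and source id 2. Ids start at 1
+// because DW_AT_decl_file reserves 0 to mean "no file".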
+
+void DwarfDebug::ConstructCompileUnit(GlobalVariable *GV) {
+ DICompileUnit DIUnit(GV);
+ std::string Dir, FN, Prod;
+ unsigned ID = GetOrCreateSourceID(DIUnit.getDirectory(Dir),
+ DIUnit.getFilename(FN));
+
+ DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
+ AddSectionOffset(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
+ DWLabel("section_line", 0), DWLabel("section_line", 0),
+ false);
+ AddString(Die, dwarf::DW_AT_producer, dwarf::DW_FORM_string,
+ DIUnit.getProducer(Prod));
+ AddUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data1,
+ DIUnit.getLanguage());
+ AddString(Die, dwarf::DW_AT_name, dwarf::DW_FORM_string, FN);
+
+ if (!Dir.empty())
+ AddString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir);
+ if (DIUnit.isOptimized())
+ AddUInt(Die, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
+
+ std::string Flags;
+ DIUnit.getFlags(Flags);
+ if (!Flags.empty())
+ AddString(Die, dwarf::DW_AT_APPLE_flags, dwarf::DW_FORM_string, Flags);
+
+ unsigned RVer = DIUnit.getRunTimeVersion();
+ if (RVer)
+ AddUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
+ dwarf::DW_FORM_data1, RVer);
+
+ CompileUnit *Unit = new CompileUnit(ID, Die);
+ if (DIUnit.isMain()) {
+ assert(!MainCU && "Multiple main compile units are found!");
+ MainCU = Unit;
+ }
+
+ CompileUnitMap[DIUnit.getGV()] = Unit;
+ CompileUnits.push_back(Unit);
+}
+
+/// ConstructCompileUnits - Create compile unit DIEs.
+void DwarfDebug::ConstructCompileUnits() {
+ GlobalVariable *Root = M->getGlobalVariable("llvm.dbg.compile_units");
+ if (!Root)
+ return;
+ assert(Root->hasLinkOnceLinkage() && Root->hasOneUse() &&
+ "Malformed compile unit descriptor anchor type");
+ Constant *RootC = cast<Constant>(*Root->use_begin());
+ assert(RootC->hasNUsesOrMore(1) &&
+ "Malformed compile unit descriptor anchor type");
+
+ for (Value::use_iterator UI = RootC->use_begin(), UE = RootC->use_end();
+ UI != UE; ++UI)
+ for (Value::use_iterator UUI = UI->use_begin(), UUE = UI->use_end();
+ UUI != UUE; ++UUI) {
+ GlobalVariable *GV = cast<GlobalVariable>(*UUI);
+ ConstructCompileUnit(GV);
+ }
+}
+
+bool DwarfDebug::ConstructGlobalVariableDIE(GlobalVariable *GV) {
+ DIGlobalVariable DI_GV(GV);
+ CompileUnit *DW_Unit = MainCU;
+ if (!DW_Unit)
+ DW_Unit = &FindCompileUnit(DI_GV.getCompileUnit());
+
+ // Check for pre-existence.
+ DIE *&Slot = DW_Unit->getDieMapSlotFor(DI_GV.getGV());
+ if (Slot)
+ return false;
+
+ DIE *VariableDie = CreateGlobalVariableDIE(DW_Unit, DI_GV);
+
+ // Add address.
+ DIEBlock *Block = new DIEBlock();
+ AddUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+ std::string GLN;
+ AddObjectLabel(Block, 0, dwarf::DW_FORM_udata,
+ Asm->getGlobalLinkName(DI_GV.getGlobal(), GLN));
+ AddBlock(VariableDie, dwarf::DW_AT_location, 0, Block);
+
+ // Add to map.
+ Slot = VariableDie;
+
+ // Add to context owner.
+ DW_Unit->getDie()->AddChild(VariableDie);
+
+ // Expose as global. FIXME - need to check external flag.
+ std::string Name;
+ DW_Unit->AddGlobal(DI_GV.getName(Name), VariableDie);
+ return true;
+}
+
+/// ConstructGlobalVariableDIEs - Create DIEs for each of the externally visible
+/// global variables. Return true if at least one global DIE is created.
+bool DwarfDebug::ConstructGlobalVariableDIEs() {
+ GlobalVariable *Root = M->getGlobalVariable("llvm.dbg.global_variables");
+ if (!Root)
+ return false;
+
+ assert(Root->hasLinkOnceLinkage() && Root->hasOneUse() &&
+ "Malformed global variable descriptor anchor type");
+ Constant *RootC = cast<Constant>(*Root->use_begin());
+ assert(RootC->hasNUsesOrMore(1) &&
+ "Malformed global variable descriptor anchor type");
+
+ bool Result = false;
+ for (Value::use_iterator UI = RootC->use_begin(), UE = RootC->use_end();
+ UI != UE; ++UI)
+ for (Value::use_iterator UUI = UI->use_begin(), UUE = UI->use_end();
+ UUI != UUE; ++UUI)
+ Result |= ConstructGlobalVariableDIE(cast<GlobalVariable>(*UUI));
+
+ return Result;
+}
+
+bool DwarfDebug::ConstructSubprogram(GlobalVariable *GV) {
+ DISubprogram SP(GV);
+ CompileUnit *Unit = MainCU;
+ if (!Unit)
+ Unit = &FindCompileUnit(SP.getCompileUnit());
+
+ // Check for pre-existence.
+ DIE *&Slot = Unit->getDieMapSlotFor(GV);
+ if (Slot)
+ return false;
+
+ if (!SP.isDefinition())
+ // This is a method declaration which will be handled while constructing
+ // class type.
+ return false;
+
+ DIE *SubprogramDie = CreateSubprogramDIE(Unit, SP);
+
+ // Add to map.
+ Slot = SubprogramDie;
+
+ // Add to context owner.
+ Unit->getDie()->AddChild(SubprogramDie);
+
+ // Expose as global.
+ std::string Name;
+ Unit->AddGlobal(SP.getName(Name), SubprogramDie);
+ return true;
+}
+
+/// ConstructSubprograms - Create DIEs for each of the externally visible
+/// subprograms. Return true if at least one subprogram DIE is created.
+bool DwarfDebug::ConstructSubprograms() {
+ GlobalVariable *Root = M->getGlobalVariable("llvm.dbg.subprograms");
+ if (!Root)
+ return false;
+
+ assert(Root->hasLinkOnceLinkage() && Root->hasOneUse() &&
+ "Malformed subprogram descriptor anchor type");
+ Constant *RootC = cast<Constant>(*Root->use_begin());
+ assert(RootC->hasNUsesOrMore(1) &&
+ "Malformed subprogram descriptor anchor type");
+
+ bool Result = false;
+ for (Value::use_iterator UI = RootC->use_begin(), UE = RootC->use_end();
+ UI != UE; ++UI)
+ for (Value::use_iterator UUI = UI->use_begin(), UUE = UI->use_end();
+ UUI != UUE; ++UUI)
+ Result |= ConstructSubprogram(cast<GlobalVariable>(*UUI));
+
+ return Result;
+}
+
+/// SetDebugInfo - Create global DIEs and emit initial debug info sections.
+/// This is invoked by the target AsmPrinter.
+void DwarfDebug::SetDebugInfo(MachineModuleInfo *mmi) {
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ // Create all the compile unit DIEs.
+ ConstructCompileUnits();
+
+ if (CompileUnits.empty()) {
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return;
+ }
+
+ // Create DIEs for each of the externally visible global variables.
+ bool globalDIEs = ConstructGlobalVariableDIEs();
+
+ // Create DIEs for each of the externally visible subprograms.
+ bool subprogramDIEs = ConstructSubprograms();
+
+ // If no debug info is available for any global variable or any subprogram,
+ // then there is nothing to emit.
+ if (!globalDIEs && !subprogramDIEs) {
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return;
+ }
+
+ MMI = mmi;
+ shouldEmit = true;
+ MMI->setDebugInfoAvailability(true);
+
+ // Prime section data.
+ SectionMap.insert(TAI->getTextSection());
+
+ // Print out .file directives to specify files for .loc directives. These are
+ // printed out early so that they precede any .loc directives.
+ if (TAI->hasDotLocAndDotFile()) {
+ for (unsigned i = 1, e = getNumSourceIds()+1; i != e; ++i) {
+ // Remember source id starts at 1.
+ std::pair<unsigned, unsigned> Id = getSourceDirectoryAndFileIds(i);
+ sys::Path FullPath(getSourceDirectoryName(Id.first));
+ bool AppendOk =
+ FullPath.appendComponent(getSourceFileName(Id.second));
+ assert(AppendOk && "Could not append filename to directory!");
+ AppendOk = false; // Quiet an unused-variable warning in no-assert builds.
+ Asm->EmitFile(i, FullPath.toString());
+ Asm->EOL();
+ }
+ }
+
+ // Emit initial sections
+ EmitInitial();
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+}
+
+/// EndModule - Emit all Dwarf sections that should come after the content.
+///
+void DwarfDebug::EndModule() {
+ if (!ShouldEmitDwarfDebug())
+ return;
+
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ // Standard sections final addresses.
+ Asm->SwitchToSection(TAI->getTextSection());
+ EmitLabel("text_end", 0);
+ Asm->SwitchToSection(TAI->getDataSection());
+ EmitLabel("data_end", 0);
+
+ // End text sections.
+ for (unsigned i = 1, N = SectionMap.size(); i <= N; ++i) {
+ Asm->SwitchToSection(SectionMap[i]);
+ EmitLabel("section_end", i);
+ }
+
+ // Emit common frame information.
+ EmitCommonDebugFrame();
+
+ // Emit function debug frame information
+ for (std::vector<FunctionDebugFrameInfo>::iterator I = DebugFrames.begin(),
+ E = DebugFrames.end(); I != E; ++I)
+ EmitFunctionDebugFrame(*I);
+
+ // Compute DIE offsets and sizes.
+ SizeAndOffsets();
+
+ // Emit all the DIEs into a debug info section
+ EmitDebugInfo();
+
+ // Emit the corresponding abbreviations into an abbrev section.
+ EmitAbbreviations();
+
+ // Emit source line correspondence into a debug line section.
+ EmitDebugLines();
+
+ // Emit info into a debug pubnames section.
+ EmitDebugPubNames();
+
+ // Emit info into a debug str section.
+ EmitDebugStr();
+
+ // Emit info into a debug loc section.
+ EmitDebugLoc();
+
+ // Emit info into a debug aranges section.
+ EmitDebugARanges();
+
+ // Emit info into a debug ranges section.
+ EmitDebugRanges();
+
+ // Emit info into a debug macinfo section.
+ EmitDebugMacInfo();
+
+ // Emit inline info.
+ EmitDebugInlineInfo();
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+}
+
+/// BeginFunction - Gather pre-function debug information. Assumed to be
+/// emitted immediately after the function entry point.
+void DwarfDebug::BeginFunction(MachineFunction *MF) {
+ this->MF = MF;
+
+ if (!ShouldEmitDwarfDebug()) return;
+
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ // Begin accumulating function debug information.
+ MMI->BeginFunction(MF);
+
+ // Assumes we are in the correct section after the entry point.
+ EmitLabel("func_begin", ++SubprogramCount);
+
+ // Emit label for the implicitly defined dbg.stoppoint at the start of the
+ // function.
+ DebugLoc FDL = MF->getDefaultDebugLoc();
+ if (!FDL.isUnknown()) {
+ DebugLocTuple DLT = MF->getDebugLocTuple(FDL);
+ unsigned LabelID = RecordSourceLine(DLT.Line, DLT.Col,
+ DICompileUnit(DLT.CompileUnit));
+ Asm->printLabel(LabelID);
+ }
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+}
+
+/// EndFunction - Gather and emit post-function debug information.
+///
+void DwarfDebug::EndFunction(MachineFunction *MF) {
+ if (!ShouldEmitDwarfDebug()) return;
+
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ // Define end label for subprogram.
+ EmitLabel("func_end", SubprogramCount);
+
+ // Get function line info.
+ if (!Lines.empty()) {
+ // Get section line info.
+ unsigned ID = SectionMap.insert(Asm->CurrentSection_);
+ if (SectionSourceLines.size() < ID) SectionSourceLines.resize(ID);
+ std::vector<SrcLineInfo> &SectionLineInfos = SectionSourceLines[ID-1];
+ // Append the function info to section info.
+ SectionLineInfos.insert(SectionLineInfos.end(),
+ Lines.begin(), Lines.end());
+ }
+
+ // Construct the DbgScope for abstract instances.
+ for (SmallVector<DbgScope *, 32>::iterator
+ I = AbstractInstanceRootList.begin(),
+ E = AbstractInstanceRootList.end(); I != E; ++I)
+ ConstructFunctionDbgScope(*I);
+
+ // Construct scopes for subprogram.
+ if (FunctionDbgScope)
+ ConstructFunctionDbgScope(FunctionDbgScope);
+ else
+ // FIXME: This is wrong. We are essentially getting past a problem with
+ // debug information not being able to handle unreachable blocks that have
+ // debug information in them. In particular, those unreachable blocks that
+ // have "region end" info in them. That situation results in the "root
+ // scope" not being created. If that's the case, then emit a "default"
+ // scope, i.e., one that encompasses the whole function. This isn't
+ // desirable. And a better way of handling this (and all of the debugging
+ // information) needs to be explored.
+ ConstructDefaultDbgScope(MF);
+
+ DebugFrames.push_back(FunctionDebugFrameInfo(SubprogramCount,
+ MMI->getFrameMoves()));
+
+ // Clear debug info
+ if (FunctionDbgScope) {
+ delete FunctionDbgScope;
+ DbgScopeMap.clear();
+ DbgAbstractScopeMap.clear();
+ DbgConcreteScopeMap.clear();
+ InlinedVariableScopes.clear();
+ FunctionDbgScope = NULL;
+ LexicalScopeStack.clear();
+ AbstractInstanceRootList.clear();
+ }
+
+ Lines.clear();
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+}
+
+/// RecordSourceLine - Records location information and associates it with a
+/// label. Returns a unique label ID used to generate a label and provide
+/// correspondence to the source line list.
+unsigned DwarfDebug::RecordSourceLine(Value *V, unsigned Line, unsigned Col) {
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ CompileUnit *Unit = CompileUnitMap[V];
+ assert(Unit && "Unable to find CompileUnit");
+ unsigned ID = MMI->NextLabelID();
+ Lines.push_back(SrcLineInfo(Line, Col, Unit->getID(), ID));
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return ID;
+}
+
+/// RecordSourceLine - Records location information and associates it with a
+/// label. Returns a unique label ID used to generate a label and provide
+/// correspondence to the source line list.
+unsigned DwarfDebug::RecordSourceLine(unsigned Line, unsigned Col,
+ DICompileUnit CU) {
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ std::string Dir, Fn;
+ unsigned Src = GetOrCreateSourceID(CU.getDirectory(Dir),
+ CU.getFilename(Fn));
+ unsigned ID = MMI->NextLabelID();
+ Lines.push_back(SrcLineInfo(Line, Col, Src, ID));
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return ID;
+}
+
+/// getOrCreateSourceID - Public version of GetOrCreateSourceID. This can be
+/// timed. Look up the source id with the given directory and source file
+/// names. If none currently exists, create a new id and insert it in the
+/// SourceIds map. This can update DirectoryNames and SourceFileNames maps as
+/// well.
+unsigned DwarfDebug::getOrCreateSourceID(const std::string &DirName,
+ const std::string &FileName) {
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ unsigned SrcId = GetOrCreateSourceID(DirName, FileName);
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return SrcId;
+}
+
+/// RecordRegionStart - Indicate the start of a region.
+unsigned DwarfDebug::RecordRegionStart(GlobalVariable *V) {
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ DbgScope *Scope = getOrCreateScope(V);
+ unsigned ID = MMI->NextLabelID();
+ if (!Scope->getStartLabelID()) Scope->setStartLabelID(ID);
+ LexicalScopeStack.push_back(Scope);
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return ID;
+}
+
+/// RecordRegionEnd - Indicate the end of a region.
+unsigned DwarfDebug::RecordRegionEnd(GlobalVariable *V) {
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ DbgScope *Scope = getOrCreateScope(V);
+ unsigned ID = MMI->NextLabelID();
+ Scope->setEndLabelID(ID);
+ if (!LexicalScopeStack.empty())
+ LexicalScopeStack.pop_back();
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return ID;
+}
+
+/// RecordVariable - Indicate the declaration of a local variable.
+void DwarfDebug::RecordVariable(GlobalVariable *GV, unsigned FrameIndex,
+ const MachineInstr *MI) {
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ DIDescriptor Desc(GV);
+ DbgScope *Scope = NULL;
+ bool InlinedFnVar = false;
+
+ if (Desc.getTag() == dwarf::DW_TAG_variable) {
+ // GV is a global variable.
+ DIGlobalVariable DG(GV);
+ Scope = getOrCreateScope(DG.getContext().getGV());
+ } else {
+ DenseMap<const MachineInstr *, DbgScope *>::iterator
+ SI = InlinedVariableScopes.find(MI);
+
+ if (SI != InlinedVariableScopes.end()) {
+ // or GV is an inlined local variable.
+ Scope = SI->second;
+ } else {
+ DIVariable DV(GV);
+ GlobalVariable *V = DV.getContext().getGV();
+
+ // FIXME: The code that checks for the inlined local variable is a hack!
+ DenseMap<const GlobalVariable *, DbgScope *>::iterator
+ AI = AbstractInstanceRootMap.find(V);
+
+ if (AI != AbstractInstanceRootMap.end()) {
+ // This method is called each time a DECLARE node is encountered. For an
+ // inlined function, this could be many, many times. We don't want to
+        // re-add variables to that DIE each time; we just want to add them
+ // once. Check to make sure that we haven't added them already.
+ DenseMap<const GlobalVariable *,
+ SmallSet<const GlobalVariable *, 32> >::iterator
+ IP = InlinedParamMap.find(V);
+
+ if (IP != InlinedParamMap.end() && IP->second.count(GV) > 0) {
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+ return;
+ }
+
+ // or GV is an inlined local variable.
+ Scope = AI->second;
+ InlinedParamMap[V].insert(GV);
+ InlinedFnVar = true;
+ } else {
+ // or GV is a local variable.
+ Scope = getOrCreateScope(V);
+ }
+ }
+ }
+
+ assert(Scope && "Unable to find the variable's scope");
+ DbgVariable *DV = new DbgVariable(DIVariable(GV), FrameIndex, InlinedFnVar);
+ Scope->AddVariable(DV);
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+}
+
+/// RecordInlinedFnStart - Indicate the start of an inlined subroutine.
+unsigned DwarfDebug::RecordInlinedFnStart(DISubprogram &SP, DICompileUnit CU,
+ unsigned Line, unsigned Col) {
+ unsigned LabelID = MMI->NextLabelID();
+
+ if (!TAI->doesDwarfUsesInlineInfoSection())
+ return LabelID;
+
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ GlobalVariable *GV = SP.getGV();
+ DenseMap<const GlobalVariable *, DbgScope *>::iterator
+ II = AbstractInstanceRootMap.find(GV);
+
+ if (II == AbstractInstanceRootMap.end()) {
+ // Create an abstract instance entry for this inlined function if it doesn't
+ // already exist.
+ DbgScope *Scope = new DbgScope(NULL, DIDescriptor(GV));
+
+ // Get the compile unit context.
+ CompileUnit *Unit = &FindCompileUnit(SP.getCompileUnit());
+ DIE *SPDie = Unit->getDieMapSlotFor(GV);
+ if (!SPDie)
+ SPDie = CreateSubprogramDIE(Unit, SP, false, true);
+
+ // Mark as being inlined. This makes this subprogram entry an abstract
+ // instance root.
+ // FIXME: Our debugger doesn't care about the value of DW_AT_inline, only
+ // that it's defined. That probably won't change in the future. However,
+ // this could be more elegant.
+ AddUInt(SPDie, dwarf::DW_AT_inline, 0, dwarf::DW_INL_declared_not_inlined);
+
+ // Keep track of the abstract scope for this function.
+ DbgAbstractScopeMap[GV] = Scope;
+
+ AbstractInstanceRootMap[GV] = Scope;
+ AbstractInstanceRootList.push_back(Scope);
+ }
+
+ // Create a concrete inlined instance for this inlined function.
+ DbgConcreteScope *ConcreteScope = new DbgConcreteScope(DIDescriptor(GV));
+ DIE *ScopeDie = new DIE(dwarf::DW_TAG_inlined_subroutine);
+ CompileUnit *Unit = &FindCompileUnit(SP.getCompileUnit());
+ ScopeDie->setAbstractCompileUnit(Unit);
+
+ DIE *Origin = Unit->getDieMapSlotFor(GV);
+ AddDIEEntry(ScopeDie, dwarf::DW_AT_abstract_origin,
+ dwarf::DW_FORM_ref4, Origin);
+ AddUInt(ScopeDie, dwarf::DW_AT_call_file, 0, Unit->getID());
+ AddUInt(ScopeDie, dwarf::DW_AT_call_line, 0, Line);
+ AddUInt(ScopeDie, dwarf::DW_AT_call_column, 0, Col);
+
+ ConcreteScope->setDie(ScopeDie);
+ ConcreteScope->setStartLabelID(LabelID);
+ MMI->RecordUsedDbgLabel(LabelID);
+
+ LexicalScopeStack.back()->AddConcreteInst(ConcreteScope);
+
+ // Keep track of the concrete scope that's inlined into this function.
+ DenseMap<GlobalVariable *, SmallVector<DbgScope *, 8> >::iterator
+ SI = DbgConcreteScopeMap.find(GV);
+
+ if (SI == DbgConcreteScopeMap.end())
+ DbgConcreteScopeMap[GV].push_back(ConcreteScope);
+ else
+ SI->second.push_back(ConcreteScope);
+
+ // Track the start label for this inlined function.
+ DenseMap<GlobalVariable *, SmallVector<unsigned, 4> >::iterator
+ I = InlineInfo.find(GV);
+
+ if (I == InlineInfo.end())
+ InlineInfo[GV].push_back(LabelID);
+ else
+ I->second.push_back(LabelID);
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return LabelID;
+}
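+
+// Net effect of RecordInlinedFnStart, sketched with a made-up layout (the
+// real offsets and forms come from the emitter): one abstract instance root
+// per inlined function, plus one concrete DW_TAG_inlined_subroutine per
+// inlining site pointing back at it:
+//
+//   DW_TAG_subprogram               // abstract root, created once
+//     DW_AT_inline (DW_INL_declared_not_inlined)
+//   ...
+//   DW_TAG_inlined_subroutine       // one per call site
+//     DW_AT_abstract_origin -> the DW_TAG_subprogram above
+//     DW_AT_call_file / DW_AT_call_line / DW_AT_call_column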
+
+/// RecordInlinedFnEnd - Indicate the end of an inlined subroutine.
+unsigned DwarfDebug::RecordInlinedFnEnd(DISubprogram &SP) {
+ if (!TAI->doesDwarfUsesInlineInfoSection())
+ return 0;
+
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ GlobalVariable *GV = SP.getGV();
+ DenseMap<GlobalVariable *, SmallVector<DbgScope *, 8> >::iterator
+ I = DbgConcreteScopeMap.find(GV);
+
+ if (I == DbgConcreteScopeMap.end()) {
+ // FIXME: Can this situation actually happen? And if so, should it?
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return 0;
+ }
+
+ SmallVector<DbgScope *, 8> &Scopes = I->second;
+ assert(!Scopes.empty() && "We should have at least one debug scope!");
+ DbgScope *Scope = Scopes.back(); Scopes.pop_back();
+ unsigned ID = MMI->NextLabelID();
+ MMI->RecordUsedDbgLabel(ID);
+ Scope->setEndLabelID(ID);
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return ID;
+}
+
+/// RecordVariableScope - Record the scope for the variable declared by
+/// DeclareMI. DeclareMI must describe TargetInstrInfo::DECLARE. Scopes are
+/// recorded here only for inlined subroutine variables; other variables'
+/// scopes are determined during RecordVariable().
+void DwarfDebug::RecordVariableScope(DIVariable &DV,
+ const MachineInstr *DeclareMI) {
+ if (TimePassesIsEnabled)
+ DebugTimer->startTimer();
+
+ DISubprogram SP(DV.getContext().getGV());
+
+ if (SP.isNull()) {
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+
+ return;
+ }
+
+ DenseMap<GlobalVariable *, DbgScope *>::iterator
+ I = DbgAbstractScopeMap.find(SP.getGV());
+ if (I != DbgAbstractScopeMap.end())
+ InlinedVariableScopes[DeclareMI] = I->second;
+
+ if (TimePassesIsEnabled)
+ DebugTimer->stopTimer();
+}
+
+//===----------------------------------------------------------------------===//
+// Emit Methods
+//===----------------------------------------------------------------------===//
+
+/// SizeAndOffsetDie - Compute the size and offset of a DIE.
+///
+unsigned DwarfDebug::SizeAndOffsetDie(DIE *Die, unsigned Offset, bool Last) {
+ // Get the children.
+ const std::vector<DIE *> &Children = Die->getChildren();
+
+ // If not last sibling and has children then add sibling offset attribute.
+ if (!Last && !Children.empty()) Die->AddSiblingOffset();
+
+ // Record the abbreviation.
+ AssignAbbrevNumber(Die->getAbbrev());
+
+ // Get the abbreviation for this DIE.
+ unsigned AbbrevNumber = Die->getAbbrevNumber();
+ const DIEAbbrev *Abbrev = Abbreviations[AbbrevNumber - 1];
+
+ // Set DIE offset
+ Die->setOffset(Offset);
+
+ // Start the size with the size of abbreviation code.
+ Offset += TargetAsmInfo::getULEB128Size(AbbrevNumber);
+
+ const SmallVector<DIEValue*, 32> &Values = Die->getValues();
+ const SmallVector<DIEAbbrevData, 8> &AbbrevData = Abbrev->getData();
+
+ // Size the DIE attribute values.
+ for (unsigned i = 0, N = Values.size(); i < N; ++i)
+ // Size attribute value.
+ Offset += Values[i]->SizeOf(TD, AbbrevData[i].getForm());
+
+ // Size the DIE children if any.
+ if (!Children.empty()) {
+ assert(Abbrev->getChildrenFlag() == dwarf::DW_CHILDREN_yes &&
+ "Children flag not set");
+
+ for (unsigned j = 0, M = Children.size(); j < M; ++j)
+ Offset = SizeAndOffsetDie(Children[j], Offset, (j + 1) == M);
+
+ // End of children marker.
+ Offset += sizeof(int8_t);
+ }
+
+ Die->setSize(Offset - Die->getOffset());
+ return Offset;
+}
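+
+// Worked example (illustrative numbers, assuming Last is true so no
+// DW_AT_sibling is added): a DIE at Offset 11 with a one-byte abbrev code,
+// two 4-byte attribute values, and a single child that sizes to 6 bytes lays
+// out as
+//
+//   11: abbrev code            (+1)
+//   12: attribute values       (+8)
+//   20: child DIE              (+6)
+//   26: end-of-children marker (+1)
+//
+// so the call returns 27 and the DIE's size is 27 - 11 = 16.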
+
+/// SizeAndOffsets - Compute the size and offset of all the DIEs.
+///
+void DwarfDebug::SizeAndOffsets() {
+ // Compute size of compile unit header.
+ static unsigned Offset =
+ sizeof(int32_t) + // Length of Compilation Unit Info
+ sizeof(int16_t) + // DWARF version number
+ sizeof(int32_t) + // Offset Into Abbrev. Section
+ sizeof(int8_t); // Pointer Size (in bytes)
+
+ // Process base compile unit.
+ if (MainCU) {
+ SizeAndOffsetDie(MainCU->getDie(), Offset, true);
+ CompileUnitOffsets[MainCU] = 0;
+ return;
+ }
+
+ // Process all compile units.
+ unsigned PrevOffset = 0;
+
+ for (unsigned i = 0, e = CompileUnits.size(); i != e; ++i) {
+ CompileUnit *Unit = CompileUnits[i];
+ CompileUnitOffsets[Unit] = PrevOffset;
+ PrevOffset += SizeAndOffsetDie(Unit->getDie(), Offset, true)
+ + sizeof(int32_t); // FIXME - extra pad for gdb bug.
+ }
+}
+
+/// EmitInitial - Emit initial Dwarf declarations. This is necessary for cc
+/// tools to recognize that the object file contains Dwarf information.
+void DwarfDebug::EmitInitial() {
+  // Check to see if we already emitted initial headers.
+ if (didInitial) return;
+ didInitial = true;
+
+ // Dwarf sections base addresses.
+ if (TAI->doesDwarfRequireFrameSection()) {
+ Asm->SwitchToDataSection(TAI->getDwarfFrameSection());
+ EmitLabel("section_debug_frame", 0);
+ }
+
+ Asm->SwitchToDataSection(TAI->getDwarfInfoSection());
+ EmitLabel("section_info", 0);
+ Asm->SwitchToDataSection(TAI->getDwarfAbbrevSection());
+ EmitLabel("section_abbrev", 0);
+ Asm->SwitchToDataSection(TAI->getDwarfARangesSection());
+ EmitLabel("section_aranges", 0);
+
+ if (TAI->doesSupportMacInfoSection()) {
+ Asm->SwitchToDataSection(TAI->getDwarfMacInfoSection());
+ EmitLabel("section_macinfo", 0);
+ }
+
+ Asm->SwitchToDataSection(TAI->getDwarfLineSection());
+ EmitLabel("section_line", 0);
+ Asm->SwitchToDataSection(TAI->getDwarfLocSection());
+ EmitLabel("section_loc", 0);
+ Asm->SwitchToDataSection(TAI->getDwarfPubNamesSection());
+ EmitLabel("section_pubnames", 0);
+ Asm->SwitchToDataSection(TAI->getDwarfStrSection());
+ EmitLabel("section_str", 0);
+ Asm->SwitchToDataSection(TAI->getDwarfRangesSection());
+ EmitLabel("section_ranges", 0);
+
+ Asm->SwitchToSection(TAI->getTextSection());
+ EmitLabel("text_begin", 0);
+ Asm->SwitchToSection(TAI->getDataSection());
+ EmitLabel("data_begin", 0);
+}
+
+/// EmitDIE - Recursively emits a debug information entry.
+///
+void DwarfDebug::EmitDIE(DIE *Die) {
+ // Get the abbreviation for this DIE.
+ unsigned AbbrevNumber = Die->getAbbrevNumber();
+ const DIEAbbrev *Abbrev = Abbreviations[AbbrevNumber - 1];
+
+ Asm->EOL();
+
+ // Emit the code (index) for the abbreviation.
+ Asm->EmitULEB128Bytes(AbbrevNumber);
+
+ if (Asm->isVerbose())
+ Asm->EOL(std::string("Abbrev [" +
+ utostr(AbbrevNumber) +
+ "] 0x" + utohexstr(Die->getOffset()) +
+ ":0x" + utohexstr(Die->getSize()) + " " +
+ dwarf::TagString(Abbrev->getTag())));
+ else
+ Asm->EOL();
+
+ SmallVector<DIEValue*, 32> &Values = Die->getValues();
+ const SmallVector<DIEAbbrevData, 8> &AbbrevData = Abbrev->getData();
+
+ // Emit the DIE attribute values.
+ for (unsigned i = 0, N = Values.size(); i < N; ++i) {
+ unsigned Attr = AbbrevData[i].getAttribute();
+ unsigned Form = AbbrevData[i].getForm();
+ assert(Form && "Too many attributes for DIE (check abbreviation)");
+
+ switch (Attr) {
+ case dwarf::DW_AT_sibling:
+ Asm->EmitInt32(Die->SiblingOffset());
+ break;
+ case dwarf::DW_AT_abstract_origin: {
+ DIEEntry *E = cast<DIEEntry>(Values[i]);
+ DIE *Origin = E->getEntry();
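+      // The origin DIE's offset is relative to its own compile unit, so bias
+      // it by that unit's offset within the debug info section.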
+ unsigned Addr =
+ CompileUnitOffsets[Die->getAbstractCompileUnit()] +
+ Origin->getOffset();
+
+ Asm->EmitInt32(Addr);
+ break;
+ }
+ default:
+ // Emit an attribute using the defined form.
+ Values[i]->EmitValue(this, Form);
+ break;
+ }
+
+ Asm->EOL(dwarf::AttributeString(Attr));
+ }
+
+ // Emit the DIE children if any.
+ if (Abbrev->getChildrenFlag() == dwarf::DW_CHILDREN_yes) {
+ const std::vector<DIE *> &Children = Die->getChildren();
+
+ for (unsigned j = 0, M = Children.size(); j < M; ++j)
+ EmitDIE(Children[j]);
+
+ Asm->EmitInt8(0); Asm->EOL("End Of Children Mark");
+ }
+}
+
+/// EmitDebugInfo / EmitDebugInfoPerCU - Emit the debug info section.
+///
+void DwarfDebug::EmitDebugInfoPerCU(CompileUnit *Unit) {
+ DIE *Die = Unit->getDie();
+
+  // Emit the compile unit's header.
+ EmitLabel("info_begin", Unit->getID());
+
+ // Emit size of content not including length itself
+ unsigned ContentSize = Die->getSize() +
+ sizeof(int16_t) + // DWARF version number
+ sizeof(int32_t) + // Offset Into Abbrev. Section
+ sizeof(int8_t) + // Pointer Size (in bytes)
+ sizeof(int32_t); // FIXME - extra pad for gdb bug.
+
+ Asm->EmitInt32(ContentSize); Asm->EOL("Length of Compilation Unit Info");
+ Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->EOL("DWARF version number");
+ EmitSectionOffset("abbrev_begin", "section_abbrev", 0, 0, true, false);
+ Asm->EOL("Offset Into Abbrev. Section");
+ Asm->EmitInt8(TD->getPointerSize()); Asm->EOL("Address Size (in bytes)");
+
+ EmitDIE(Die);
+ // FIXME - extra padding for gdb bug.
+ Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
+ Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
+ Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
+ Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
+ EmitLabel("info_end", Unit->getID());
+
+ Asm->EOL();
+}
+
+void DwarfDebug::EmitDebugInfo() {
+ // Start debug info section.
+ Asm->SwitchToDataSection(TAI->getDwarfInfoSection());
+
+ if (MainCU) {
+ EmitDebugInfoPerCU(MainCU);
+ return;
+ }
+
+ for (unsigned i = 0, e = CompileUnits.size(); i != e; ++i)
+ EmitDebugInfoPerCU(CompileUnits[i]);
+}
+
+/// EmitAbbreviations - Emit the abbreviation section.
+///
+void DwarfDebug::EmitAbbreviations() const {
+ // Check to see if it is worth the effort.
+ if (!Abbreviations.empty()) {
+ // Start the debug abbrev section.
+ Asm->SwitchToDataSection(TAI->getDwarfAbbrevSection());
+
+ EmitLabel("abbrev_begin", 0);
+
+    // For each abbreviation.
+ for (unsigned i = 0, N = Abbreviations.size(); i < N; ++i) {
+ // Get abbreviation data
+ const DIEAbbrev *Abbrev = Abbreviations[i];
+
+      // Emit the abbreviation's code (base 1 index).
+ Asm->EmitULEB128Bytes(Abbrev->getNumber());
+ Asm->EOL("Abbreviation Code");
+
+ // Emit the abbreviations data.
+ Abbrev->Emit(Asm);
+
+ Asm->EOL();
+ }
+
+ // Mark end of abbreviations.
+ Asm->EmitULEB128Bytes(0); Asm->EOL("EOM(3)");
+
+ EmitLabel("abbrev_end", 0);
+ Asm->EOL();
+ }
+}
+
+/// EmitEndOfLineMatrix - Emit the last address of the section and the end of
+/// the line matrix.
+///
+void DwarfDebug::EmitEndOfLineMatrix(unsigned SectionEnd) {
+ // Define last address of section.
+ Asm->EmitInt8(0); Asm->EOL("Extended Op");
+ Asm->EmitInt8(TD->getPointerSize() + 1); Asm->EOL("Op size");
+ Asm->EmitInt8(dwarf::DW_LNE_set_address); Asm->EOL("DW_LNE_set_address");
+ EmitReference("section_end", SectionEnd); Asm->EOL("Section end label");
+
+ // Mark end of matrix.
+ Asm->EmitInt8(0); Asm->EOL("DW_LNE_end_sequence");
+ Asm->EmitULEB128Bytes(1); Asm->EOL();
+ Asm->EmitInt8(1); Asm->EOL();
+}
+
+/// EmitDebugLines - Emit source line information.
+///
+void DwarfDebug::EmitDebugLines() {
+ // If the target is using .loc/.file, the assembler will be emitting the
+ // .debug_line table automatically.
+ if (TAI->hasDotLocAndDotFile())
+ return;
+
+ // Minimum line delta, thus ranging from -10..(255-10).
+ const int MinLineDelta = -(dwarf::DW_LNS_fixed_advance_pc + 1);
+ // Maximum line delta, thus ranging from -10..(255-10).
+ const int MaxLineDelta = 255 + MinLineDelta;
+
+ // Start the dwarf line section.
+ Asm->SwitchToDataSection(TAI->getDwarfLineSection());
+
+ // Construct the section header.
+ EmitDifference("line_end", 0, "line_begin", 0, true);
+ Asm->EOL("Length of Source Line Info");
+ EmitLabel("line_begin", 0);
+
+ Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->EOL("DWARF version number");
+
+ EmitDifference("line_prolog_end", 0, "line_prolog_begin", 0, true);
+ Asm->EOL("Prolog Length");
+ EmitLabel("line_prolog_begin", 0);
+
+ Asm->EmitInt8(1); Asm->EOL("Minimum Instruction Length");
+
+ Asm->EmitInt8(1); Asm->EOL("Default is_stmt_start flag");
+
+ Asm->EmitInt8(MinLineDelta); Asm->EOL("Line Base Value (Special Opcodes)");
+
+ Asm->EmitInt8(MaxLineDelta); Asm->EOL("Line Range Value (Special Opcodes)");
+
+ Asm->EmitInt8(-MinLineDelta); Asm->EOL("Special Opcode Base");
+
+ // Line number standard opcode encodings argument count
+ Asm->EmitInt8(0); Asm->EOL("DW_LNS_copy arg count");
+ Asm->EmitInt8(1); Asm->EOL("DW_LNS_advance_pc arg count");
+ Asm->EmitInt8(1); Asm->EOL("DW_LNS_advance_line arg count");
+ Asm->EmitInt8(1); Asm->EOL("DW_LNS_set_file arg count");
+ Asm->EmitInt8(1); Asm->EOL("DW_LNS_set_column arg count");
+ Asm->EmitInt8(0); Asm->EOL("DW_LNS_negate_stmt arg count");
+ Asm->EmitInt8(0); Asm->EOL("DW_LNS_set_basic_block arg count");
+ Asm->EmitInt8(0); Asm->EOL("DW_LNS_const_add_pc arg count");
+ Asm->EmitInt8(1); Asm->EOL("DW_LNS_fixed_advance_pc arg count");
+
+ // Emit directories.
+ for (unsigned DI = 1, DE = getNumSourceDirectories()+1; DI != DE; ++DI) {
+ Asm->EmitString(getSourceDirectoryName(DI));
+ Asm->EOL("Directory");
+ }
+
+ Asm->EmitInt8(0); Asm->EOL("End of directories");
+
+ // Emit files.
+ for (unsigned SI = 1, SE = getNumSourceIds()+1; SI != SE; ++SI) {
+ // Remember source id starts at 1.
+ std::pair<unsigned, unsigned> Id = getSourceDirectoryAndFileIds(SI);
+ Asm->EmitString(getSourceFileName(Id.second));
+ Asm->EOL("Source");
+ Asm->EmitULEB128Bytes(Id.first);
+ Asm->EOL("Directory #");
+ Asm->EmitULEB128Bytes(0);
+ Asm->EOL("Mod date");
+ Asm->EmitULEB128Bytes(0);
+ Asm->EOL("File size");
+ }
+
+ Asm->EmitInt8(0); Asm->EOL("End of files");
+
+ EmitLabel("line_prolog_end", 0);
+
+ // A sequence for each text section.
+ unsigned SecSrcLinesSize = SectionSourceLines.size();
+
+ for (unsigned j = 0; j < SecSrcLinesSize; ++j) {
+    // Isolate the current section's line info.
+ const std::vector<SrcLineInfo> &LineInfos = SectionSourceLines[j];
+
+ if (Asm->isVerbose()) {
+ const Section* S = SectionMap[j + 1];
+      O << '\t' << TAI->getCommentString() << " Section "
+ << S->getName() << '\n';
+ } else {
+ Asm->EOL();
+ }
+
+ // Dwarf assumes we start with first line of first source file.
+ unsigned Source = 1;
+ unsigned Line = 1;
+
+ // Construct rows of the address, source, line, column matrix.
+ for (unsigned i = 0, N = LineInfos.size(); i < N; ++i) {
+ const SrcLineInfo &LineInfo = LineInfos[i];
+ unsigned LabelID = MMI->MappedLabel(LineInfo.getLabelID());
+ if (!LabelID) continue;
+
+ if (!Asm->isVerbose())
+ Asm->EOL();
+ else {
+ std::pair<unsigned, unsigned> SourceID =
+ getSourceDirectoryAndFileIds(LineInfo.getSourceID());
+ O << '\t' << TAI->getCommentString() << ' '
+ << getSourceDirectoryName(SourceID.first) << ' '
+ << getSourceFileName(SourceID.second)
+ <<" :" << utostr_32(LineInfo.getLine()) << '\n';
+ }
+
+ // Define the line address.
+ Asm->EmitInt8(0); Asm->EOL("Extended Op");
+ Asm->EmitInt8(TD->getPointerSize() + 1); Asm->EOL("Op size");
+ Asm->EmitInt8(dwarf::DW_LNE_set_address); Asm->EOL("DW_LNE_set_address");
+ EmitReference("label", LabelID); Asm->EOL("Location label");
+
+ // If change of source, then switch to the new source.
+ if (Source != LineInfo.getSourceID()) {
+ Source = LineInfo.getSourceID();
+ Asm->EmitInt8(dwarf::DW_LNS_set_file); Asm->EOL("DW_LNS_set_file");
+ Asm->EmitULEB128Bytes(Source); Asm->EOL("New Source");
+ }
+
+ // If change of line.
+ if (Line != LineInfo.getLine()) {
+ // Determine offset.
+ int Offset = LineInfo.getLine() - Line;
+ int Delta = Offset - MinLineDelta;
+
+ // Update line.
+ Line = LineInfo.getLine();
+
+ // If delta is small enough and in range...
+ if (Delta >= 0 && Delta < (MaxLineDelta - 1)) {
+ // ... then use fast opcode.
+ Asm->EmitInt8(Delta - MinLineDelta); Asm->EOL("Line Delta");
+ } else {
+ // ... otherwise use long hand.
+ Asm->EmitInt8(dwarf::DW_LNS_advance_line);
+ Asm->EOL("DW_LNS_advance_line");
+ Asm->EmitSLEB128Bytes(Offset); Asm->EOL("Line Offset");
+ Asm->EmitInt8(dwarf::DW_LNS_copy); Asm->EOL("DW_LNS_copy");
+ }
+ } else {
+ // Copy the previous row (different address or source)
+ Asm->EmitInt8(dwarf::DW_LNS_copy); Asm->EOL("DW_LNS_copy");
+ }
+ }
+
+ EmitEndOfLineMatrix(j + 1);
+ }
+
+ if (SecSrcLinesSize == 0)
+ // Because we're emitting a debug_line section, we still need a line
+ // table. The linker and friends expect it to exist. If there's nothing to
+ // put into it, emit an empty table.
+ EmitEndOfLineMatrix(1);
+
+ EmitLabel("line_end", 0);
+ Asm->EOL();
+}
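+
+// Worked example for the delta encoding above (illustrative): with
+// MinLineDelta = -10, the emitted opcode base is -MinLineDelta = 10. A line
+// advance of +3 gives Delta = 3 - (-10) = 13, which is within
+// [0, MaxLineDelta - 1), so the fast path emits Delta - MinLineDelta = 23,
+// i.e. the DWARF special opcode (advance - line_base) + opcode_base.
+// Advances outside that window fall back to DW_LNS_advance_line followed by
+// DW_LNS_copy.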
+
+/// EmitCommonDebugFrame - Emit common frame info into a debug frame section.
+///
+void DwarfDebug::EmitCommonDebugFrame() {
+ if (!TAI->doesDwarfRequireFrameSection())
+ return;
+
+ int stackGrowth =
+ Asm->TM.getFrameInfo()->getStackGrowthDirection() ==
+ TargetFrameInfo::StackGrowsUp ?
+ TD->getPointerSize() : -TD->getPointerSize();
+
+ // Start the dwarf frame section.
+ Asm->SwitchToDataSection(TAI->getDwarfFrameSection());
+
+ EmitLabel("debug_frame_common", 0);
+ EmitDifference("debug_frame_common_end", 0,
+ "debug_frame_common_begin", 0, true);
+ Asm->EOL("Length of Common Information Entry");
+
+ EmitLabel("debug_frame_common_begin", 0);
+ Asm->EmitInt32((int)dwarf::DW_CIE_ID);
+ Asm->EOL("CIE Identifier Tag");
+ Asm->EmitInt8(dwarf::DW_CIE_VERSION);
+ Asm->EOL("CIE Version");
+ Asm->EmitString("");
+ Asm->EOL("CIE Augmentation");
+ Asm->EmitULEB128Bytes(1);
+ Asm->EOL("CIE Code Alignment Factor");
+ Asm->EmitSLEB128Bytes(stackGrowth);
+ Asm->EOL("CIE Data Alignment Factor");
+ Asm->EmitInt8(RI->getDwarfRegNum(RI->getRARegister(), false));
+ Asm->EOL("CIE RA Column");
+
+ std::vector<MachineMove> Moves;
+ RI->getInitialFrameState(Moves);
+
+ EmitFrameMoves(NULL, 0, Moves, false);
+
+ Asm->EmitAlignment(2, 0, 0, false);
+ EmitLabel("debug_frame_common_end", 0);
+
+ Asm->EOL();
+}
+
+/// EmitFunctionDebugFrame - Emit per function frame info into a debug frame
+/// section.
+void
+DwarfDebug::EmitFunctionDebugFrame(const FunctionDebugFrameInfo&DebugFrameInfo){
+ if (!TAI->doesDwarfRequireFrameSection())
+ return;
+
+ // Start the dwarf frame section.
+ Asm->SwitchToDataSection(TAI->getDwarfFrameSection());
+
+ EmitDifference("debug_frame_end", DebugFrameInfo.Number,
+ "debug_frame_begin", DebugFrameInfo.Number, true);
+ Asm->EOL("Length of Frame Information Entry");
+
+ EmitLabel("debug_frame_begin", DebugFrameInfo.Number);
+
+ EmitSectionOffset("debug_frame_common", "section_debug_frame",
+ 0, 0, true, false);
+ Asm->EOL("FDE CIE offset");
+
+ EmitReference("func_begin", DebugFrameInfo.Number);
+ Asm->EOL("FDE initial location");
+ EmitDifference("func_end", DebugFrameInfo.Number,
+ "func_begin", DebugFrameInfo.Number);
+ Asm->EOL("FDE address range");
+
+ EmitFrameMoves("func_begin", DebugFrameInfo.Number, DebugFrameInfo.Moves,
+ false);
+
+ Asm->EmitAlignment(2, 0, 0, false);
+ EmitLabel("debug_frame_end", DebugFrameInfo.Number);
+
+ Asm->EOL();
+}
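+
+// Resulting .debug_frame shape (a sketch; the lengths are whatever the label
+// differences above resolve to):
+//
+//   debug_frame_common:   one shared CIE -- version, empty augmentation,
+//                         alignment factors, RA column, initial frame moves.
+//   debug_frame_begin_N:  one FDE per function; its "FDE CIE offset" points
+//                         back at debug_frame_common, and its location/range
+//                         are func_begin_N and func_end_N - func_begin_N.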
+
+void DwarfDebug::EmitDebugPubNamesPerCU(CompileUnit *Unit) {
+ EmitDifference("pubnames_end", Unit->getID(),
+ "pubnames_begin", Unit->getID(), true);
+ Asm->EOL("Length of Public Names Info");
+
+ EmitLabel("pubnames_begin", Unit->getID());
+
+ Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->EOL("DWARF Version");
+
+ EmitSectionOffset("info_begin", "section_info",
+ Unit->getID(), 0, true, false);
+ Asm->EOL("Offset of Compilation Unit Info");
+
+ EmitDifference("info_end", Unit->getID(), "info_begin", Unit->getID(),
+ true);
+ Asm->EOL("Compilation Unit Length");
+
+ StringMap<DIE*> &Globals = Unit->getGlobals();
+ for (StringMap<DIE*>::const_iterator
+ GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
+ const char *Name = GI->getKeyData();
+ DIE * Entity = GI->second;
+
+ Asm->EmitInt32(Entity->getOffset()); Asm->EOL("DIE offset");
+ Asm->EmitString(Name, strlen(Name)); Asm->EOL("External Name");
+ }
+
+ Asm->EmitInt32(0); Asm->EOL("End Mark");
+ EmitLabel("pubnames_end", Unit->getID());
+
+ Asm->EOL();
+}
+
+/// EmitDebugPubNames - Emit visible names into a debug pubnames section.
+///
+void DwarfDebug::EmitDebugPubNames() {
+ // Start the dwarf pubnames section.
+ Asm->SwitchToDataSection(TAI->getDwarfPubNamesSection());
+
+ if (MainCU) {
+ EmitDebugPubNamesPerCU(MainCU);
+ return;
+ }
+
+ for (unsigned i = 0, e = CompileUnits.size(); i != e; ++i)
+ EmitDebugPubNamesPerCU(CompileUnits[i]);
+}
+
+/// EmitDebugStr - Emit visible names into a debug str section.
+///
+void DwarfDebug::EmitDebugStr() {
+ // Check to see if it is worth the effort.
+ if (!StringPool.empty()) {
+ // Start the dwarf str section.
+ Asm->SwitchToDataSection(TAI->getDwarfStrSection());
+
+    // For each string in the string pool.
+ for (unsigned StringID = 1, N = StringPool.size();
+ StringID <= N; ++StringID) {
+ // Emit a label for reference from debug information entries.
+ EmitLabel("string", StringID);
+
+ // Emit the string itself.
+ const std::string &String = StringPool[StringID];
+ Asm->EmitString(String); Asm->EOL();
+ }
+
+ Asm->EOL();
+ }
+}
+
+/// EmitDebugLoc - Emit visible names into a debug loc section.
+///
+void DwarfDebug::EmitDebugLoc() {
+ // Start the dwarf loc section.
+ Asm->SwitchToDataSection(TAI->getDwarfLocSection());
+ Asm->EOL();
+}
+
+/// EmitDebugARanges - Emit visible names into a debug aranges section.
+///
+void DwarfDebug::EmitDebugARanges() {
+ // Start the dwarf aranges section.
+ Asm->SwitchToDataSection(TAI->getDwarfARangesSection());
+
+ // FIXME - Mock up
+#if 0
+ CompileUnit *Unit = GetBaseCompileUnit();
+
+ // Don't include size of length
+ Asm->EmitInt32(0x1c); Asm->EOL("Length of Address Ranges Info");
+
+ Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->EOL("Dwarf Version");
+
+ EmitReference("info_begin", Unit->getID());
+ Asm->EOL("Offset of Compilation Unit Info");
+
+ Asm->EmitInt8(TD->getPointerSize()); Asm->EOL("Size of Address");
+
+ Asm->EmitInt8(0); Asm->EOL("Size of Segment Descriptor");
+
+ Asm->EmitInt16(0); Asm->EOL("Pad (1)");
+ Asm->EmitInt16(0); Asm->EOL("Pad (2)");
+
+ // Range 1
+ EmitReference("text_begin", 0); Asm->EOL("Address");
+ EmitDifference("text_end", 0, "text_begin", 0, true); Asm->EOL("Length");
+
+ Asm->EmitInt32(0); Asm->EOL("EOM (1)");
+ Asm->EmitInt32(0); Asm->EOL("EOM (2)");
+#endif
+
+ Asm->EOL();
+}
+
+/// EmitDebugRanges - Emit visible names into a debug ranges section.
+///
+void DwarfDebug::EmitDebugRanges() {
+ // Start the dwarf ranges section.
+ Asm->SwitchToDataSection(TAI->getDwarfRangesSection());
+ Asm->EOL();
+}
+
+/// EmitDebugMacInfo - Emit visible names into a debug macinfo section.
+///
+void DwarfDebug::EmitDebugMacInfo() {
+ if (TAI->doesSupportMacInfoSection()) {
+ // Start the dwarf macinfo section.
+ Asm->SwitchToDataSection(TAI->getDwarfMacInfoSection());
+ Asm->EOL();
+ }
+}
+
+/// EmitDebugInlineInfo - Emit inline info using following format.
+/// Section Header:
+/// 1. length of section
+/// 2. Dwarf version number
+/// 3. address size.
+///
+/// Entries (one "entry" for each function that was inlined):
+///
+/// 1. offset into __debug_str section for MIPS linkage name, if it exists;
+/// otherwise offset into __debug_str for regular function name.
+/// 2. offset into __debug_str section for regular function name.
+/// 3. an unsigned LEB128 number indicating the number of distinct inlining
+/// instances for the function.
+///
+/// The rest of the entry consists of a {die_offset, low_pc} pair for each
+/// inlined instance; the die_offset points to the inlined_subroutine die in the
+/// __debug_info section, and the low_pc is the starting address for the
+/// inlining instance.
+void DwarfDebug::EmitDebugInlineInfo() {
+ if (!TAI->doesDwarfUsesInlineInfoSection())
+ return;
+
+ if (!MainCU)
+ return;
+
+ Asm->SwitchToDataSection(TAI->getDwarfDebugInlineSection());
+ Asm->EOL();
+ EmitDifference("debug_inlined_end", 1,
+ "debug_inlined_begin", 1, true);
+ Asm->EOL("Length of Debug Inlined Information Entry");
+
+ EmitLabel("debug_inlined_begin", 1);
+
+ Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->EOL("Dwarf Version");
+ Asm->EmitInt8(TD->getPointerSize()); Asm->EOL("Address Size (in bytes)");
+
+ for (DenseMap<GlobalVariable *, SmallVector<unsigned, 4> >::iterator
+ I = InlineInfo.begin(), E = InlineInfo.end(); I != E; ++I) {
+ GlobalVariable *GV = I->first;
+ SmallVector<unsigned, 4> &Labels = I->second;
+ DISubprogram SP(GV);
+ std::string Name;
+ std::string LName;
+
+ SP.getLinkageName(LName);
+ SP.getName(Name);
+
+ Asm->EmitString(LName.empty() ? Name : LName);
+ Asm->EOL("MIPS linkage name");
+
+ Asm->EmitString(Name); Asm->EOL("Function name");
+
+ Asm->EmitULEB128Bytes(Labels.size()); Asm->EOL("Inline count");
+
+ for (SmallVector<unsigned, 4>::iterator LI = Labels.begin(),
+ LE = Labels.end(); LI != LE; ++LI) {
+ DIE *SP = MainCU->getDieMapSlotFor(GV);
+ Asm->EmitInt32(SP->getOffset()); Asm->EOL("DIE offset");
+
+ if (TD->getPointerSize() == sizeof(int32_t))
+ O << TAI->getData32bitsDirective();
+ else
+ O << TAI->getData64bitsDirective();
+
+ PrintLabelName("label", *LI); Asm->EOL("low_pc");
+ }
+ }
+
+ EmitLabel("debug_inlined_end", 1);
+ Asm->EOL();
+}
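+
+// Shape of the emitted debug_inlined section, following the format comment
+// above (a sketch with placeholder values):
+//
+//   length, DWARF version, address size           // section header
+//   "linkage name\0" "name\0" ULEB(#instances)    // one entry per function
+//   { die_offset, low_pc } x #instances           // its inlined instances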
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
new file mode 100644
index 0000000..9824566
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -0,0 +1,561 @@
+//===-- llvm/CodeGen/DwarfDebug.h - Dwarf Debug Framework ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__
+#define CODEGEN_ASMPRINTER_DWARFDEBUG_H__
+
+#include "DIE.h"
+#include "DwarfPrinter.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/UniqueVector.h"
+#include <string>
+
+namespace llvm {
+
+class CompileUnit;
+class DbgVariable;
+class DbgScope;
+class DbgConcreteScope;
+class MachineFrameInfo;
+class MachineModuleInfo;
+class TargetAsmInfo;
+class Timer;
+
+//===----------------------------------------------------------------------===//
+/// SrcLineInfo - This class is used to record source line correspondence.
+///
+class VISIBILITY_HIDDEN SrcLineInfo {
+ unsigned Line; // Source line number.
+ unsigned Column; // Source column.
+ unsigned SourceID; // Source ID number.
+ unsigned LabelID; // Label in code ID number.
+public:
+ SrcLineInfo(unsigned L, unsigned C, unsigned S, unsigned I)
+ : Line(L), Column(C), SourceID(S), LabelID(I) {}
+
+ // Accessors
+ unsigned getLine() const { return Line; }
+ unsigned getColumn() const { return Column; }
+ unsigned getSourceID() const { return SourceID; }
+ unsigned getLabelID() const { return LabelID; }
+};
+
+class VISIBILITY_HIDDEN DwarfDebug : public Dwarf {
+ //===--------------------------------------------------------------------===//
+ // Attributes used to construct specific Dwarf sections.
+ //
+
+ /// CompileUnitMap - A map of global variables representing compile units to
+ /// compile units.
+ DenseMap<Value *, CompileUnit *> CompileUnitMap;
+
+ /// CompileUnits - All the compile units in this module.
+ ///
+ SmallVector<CompileUnit *, 8> CompileUnits;
+
+  /// MainCU - Some platforms prefer one compile unit per .o file. In such
+  /// cases, all DIEs are inserted into MainCU.
+ CompileUnit *MainCU;
+
+ /// AbbreviationsSet - Used to uniquely define abbreviations.
+ ///
+ FoldingSet<DIEAbbrev> AbbreviationsSet;
+
+ /// Abbreviations - A list of all the unique abbreviations in use.
+ ///
+ std::vector<DIEAbbrev *> Abbreviations;
+
+ /// DirectoryIdMap - Directory name to directory id map.
+ ///
+ StringMap<unsigned> DirectoryIdMap;
+
+ /// DirectoryNames - A list of directory names.
+ SmallVector<std::string, 8> DirectoryNames;
+
+ /// SourceFileIdMap - Source file name to source file id map.
+ ///
+ StringMap<unsigned> SourceFileIdMap;
+
+ /// SourceFileNames - A list of source file names.
+ SmallVector<std::string, 8> SourceFileNames;
+
+ /// SourceIdMap - Source id map, i.e. pair of directory id and source file
+ /// id mapped to a unique id.
+ DenseMap<std::pair<unsigned, unsigned>, unsigned> SourceIdMap;
+
+ /// SourceIds - Reverse map from source id to directory id + file id pair.
+ ///
+ SmallVector<std::pair<unsigned, unsigned>, 8> SourceIds;
+
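+  // Example of the id scheme (illustrative values): directory "/tmp" gets
+  // directory id 1, file "foo.c" gets file id 1, and the pair (1, 1) maps to
+  // source id 1; SourceIds[0] then stores (1, 1) so the mapping can be
+  // reversed.
+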
+  /// Lines - List of source line correspondences.
+ std::vector<SrcLineInfo> Lines;
+
+ /// ValuesSet - Used to uniquely define values.
+ ///
+ FoldingSet<DIEValue> ValuesSet;
+
+ /// Values - A list of all the unique values in use.
+ ///
+ std::vector<DIEValue *> Values;
+
+ /// StringPool - A UniqueVector of strings used by indirect references.
+ ///
+ UniqueVector<std::string> StringPool;
+
+ /// SectionMap - Provides a unique id per text section.
+ ///
+ UniqueVector<const Section*> SectionMap;
+
+ /// SectionSourceLines - Tracks line numbers per text section.
+ ///
+ std::vector<std::vector<SrcLineInfo> > SectionSourceLines;
+
+ /// didInitial - Flag to indicate if initial emission has been done.
+ ///
+ bool didInitial;
+
+ /// shouldEmit - Flag to indicate if debug information should be emitted.
+ ///
+ bool shouldEmit;
+
+ // FunctionDbgScope - Top level scope for the current function.
+ //
+ DbgScope *FunctionDbgScope;
+
+ /// DbgScopeMap - Tracks the scopes in the current function.
+ DenseMap<GlobalVariable *, DbgScope *> DbgScopeMap;
+
+ /// DbgAbstractScopeMap - Tracks abstract instance scopes in the current
+ /// function.
+ DenseMap<GlobalVariable *, DbgScope *> DbgAbstractScopeMap;
+
+ /// DbgConcreteScopeMap - Tracks concrete instance scopes in the current
+ /// function.
+ DenseMap<GlobalVariable *,
+ SmallVector<DbgScope *, 8> > DbgConcreteScopeMap;
+
+ /// InlineInfo - Keep track of inlined functions and their location. This
+ /// information is used to populate debug_inlined section.
+ DenseMap<GlobalVariable *, SmallVector<unsigned, 4> > InlineInfo;
+
+  /// InlinedVariableScopes - Scope information for the inlined subroutine
+  /// variables.
+ DenseMap<const MachineInstr *, DbgScope *> InlinedVariableScopes;
+
+ /// AbstractInstanceRootMap - Map of abstract instance roots of inlined
+ /// functions. These are subroutine entries that contain a DW_AT_inline
+ /// attribute.
+ DenseMap<const GlobalVariable *, DbgScope *> AbstractInstanceRootMap;
+
+ /// InlinedParamMap - A map keeping track of which parameters are assigned to
+ /// which abstract instance.
+ DenseMap<const GlobalVariable *,
+ SmallSet<const GlobalVariable *, 32> > InlinedParamMap;
+
+ /// AbstractInstanceRootList - List of abstract instance roots of inlined
+ /// functions. These are subroutine entries that contain a DW_AT_inline
+ /// attribute.
+ SmallVector<DbgScope *, 32> AbstractInstanceRootList;
+
+ /// LexicalScopeStack - A stack of lexical scopes. The top one is the current
+ /// scope.
+ SmallVector<DbgScope *, 16> LexicalScopeStack;
+
+  /// CompileUnitOffsets - A map of the offsets of the compile units. This is
+ /// used when calculating the "origin" of a concrete instance of an inlined
+ /// function.
+ DenseMap<CompileUnit *, unsigned> CompileUnitOffsets;
+
+ /// DebugTimer - Timer for the Dwarf debug writer.
+ Timer *DebugTimer;
+
+ struct FunctionDebugFrameInfo {
+ unsigned Number;
+ std::vector<MachineMove> Moves;
+
+ FunctionDebugFrameInfo(unsigned Num, const std::vector<MachineMove> &M)
+ : Number(Num), Moves(M) {}
+ };
+
+ std::vector<FunctionDebugFrameInfo> DebugFrames;
+
+ /// getSourceDirectoryAndFileIds - Return the directory and file ids that
+  /// map to the source id. Source ids start at 1.
+ std::pair<unsigned, unsigned>
+ getSourceDirectoryAndFileIds(unsigned SId) const {
+ return SourceIds[SId-1];
+ }
+
+ /// getNumSourceDirectories - Return the number of source directories in the
+ /// debug info.
+ unsigned getNumSourceDirectories() const {
+ return DirectoryNames.size();
+ }
+
+ /// getSourceDirectoryName - Return the name of the directory corresponding
+ /// to the id.
+ const std::string &getSourceDirectoryName(unsigned Id) const {
+ return DirectoryNames[Id - 1];
+ }
+
+ /// getSourceFileName - Return the name of the source file corresponding
+ /// to the id.
+ const std::string &getSourceFileName(unsigned Id) const {
+ return SourceFileNames[Id - 1];
+ }
+
+ /// getNumSourceIds - Return the number of unique source ids.
+ unsigned getNumSourceIds() const {
+ return SourceIds.size();
+ }
+
+ /// AssignAbbrevNumber - Define a unique number for the abbreviation.
+ ///
+ void AssignAbbrevNumber(DIEAbbrev &Abbrev);
+
+ /// CreateDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+ /// information entry.
+ DIEEntry *CreateDIEEntry(DIE *Entry = NULL);
+
+ /// SetDIEEntry - Set a DIEEntry once the debug information entry is defined.
+ ///
+ void SetDIEEntry(DIEEntry *Value, DIE *Entry);
+
+ /// AddUInt - Add an unsigned integer attribute data and value.
+ ///
+ void AddUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
+
+  /// AddSInt - Add a signed integer attribute data and value.
+ ///
+ void AddSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
+
+ /// AddString - Add a string attribute data and value.
+ ///
+ void AddString(DIE *Die, unsigned Attribute, unsigned Form,
+ const std::string &String);
+
+ /// AddLabel - Add a Dwarf label attribute data and value.
+ ///
+ void AddLabel(DIE *Die, unsigned Attribute, unsigned Form,
+ const DWLabel &Label);
+
+  /// AddObjectLabel - Add a non-Dwarf label attribute data and value.
+ ///
+ void AddObjectLabel(DIE *Die, unsigned Attribute, unsigned Form,
+ const std::string &Label);
+
+ /// AddSectionOffset - Add a section offset label attribute data and value.
+ ///
+ void AddSectionOffset(DIE *Die, unsigned Attribute, unsigned Form,
+ const DWLabel &Label, const DWLabel &Section,
+ bool isEH = false, bool useSet = true);
+
+ /// AddDelta - Add a label delta attribute data and value.
+ ///
+ void AddDelta(DIE *Die, unsigned Attribute, unsigned Form,
+ const DWLabel &Hi, const DWLabel &Lo);
+
+ /// AddDIEEntry - Add a DIE attribute data and value.
+ ///
+ void AddDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry) {
+ Die->AddValue(Attribute, Form, CreateDIEEntry(Entry));
+ }
+
+ /// AddBlock - Add block data.
+ ///
+ void AddBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
+
+ /// AddSourceLine - Add location information to specified debug information
+ /// entry.
+ void AddSourceLine(DIE *Die, const DIVariable *V);
+
+ /// AddSourceLine - Add location information to specified debug information
+ /// entry.
+ void AddSourceLine(DIE *Die, const DIGlobal *G);
+
+ void AddSourceLine(DIE *Die, const DIType *Ty);
+
+ /// AddAddress - Add an address attribute to a die based on the location
+ /// provided.
+ void AddAddress(DIE *Die, unsigned Attribute,
+ const MachineLocation &Location);
+
+ /// AddType - Add a new type attribute to the specified entity.
+ void AddType(CompileUnit *DW_Unit, DIE *Entity, DIType Ty);
+
+ /// ConstructTypeDIE - Construct basic type die from DIBasicType.
+ void ConstructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+ DIBasicType BTy);
+
+ /// ConstructTypeDIE - Construct derived type die from DIDerivedType.
+ void ConstructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+ DIDerivedType DTy);
+
+ /// ConstructTypeDIE - Construct type DIE from DICompositeType.
+ void ConstructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+ DICompositeType CTy);
+
+ /// ConstructSubrangeDIE - Construct subrange DIE from DISubrange.
+ void ConstructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
+
+ /// ConstructArrayTypeDIE - Construct array type DIE from DICompositeType.
+ void ConstructArrayTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+ DICompositeType *CTy);
+
+ /// ConstructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
+ DIE *ConstructEnumTypeDIE(CompileUnit *DW_Unit, DIEnumerator *ETy);
+
+ /// CreateGlobalVariableDIE - Create new DIE using GV.
+ DIE *CreateGlobalVariableDIE(CompileUnit *DW_Unit,
+ const DIGlobalVariable &GV);
+
+ /// CreateMemberDIE - Create new member DIE.
+ DIE *CreateMemberDIE(CompileUnit *DW_Unit, const DIDerivedType &DT);
+
+ /// CreateSubprogramDIE - Create new DIE using SP.
+ DIE *CreateSubprogramDIE(CompileUnit *DW_Unit,
+ const DISubprogram &SP,
+ bool IsConstructor = false,
+ bool IsInlined = false);
+
+ /// FindCompileUnit - Get the compile unit for the given descriptor.
+ ///
+ CompileUnit &FindCompileUnit(DICompileUnit Unit) const;
+
+ /// CreateDbgScopeVariable - Create a new scope variable.
+ ///
+ DIE *CreateDbgScopeVariable(DbgVariable *DV, CompileUnit *Unit);
+
+ /// getOrCreateScope - Returns the scope associated with the given descriptor.
+ ///
+ DbgScope *getOrCreateScope(GlobalVariable *V);
+
+ /// ConstructDbgScope - Construct the components of a scope.
+ ///
+ void ConstructDbgScope(DbgScope *ParentScope,
+ unsigned ParentStartID, unsigned ParentEndID,
+ DIE *ParentDie, CompileUnit *Unit);
+
+ /// ConstructFunctionDbgScope - Construct the scope for the subprogram.
+ ///
+ void ConstructFunctionDbgScope(DbgScope *RootScope,
+ bool AbstractScope = false);
+
+ /// ConstructDefaultDbgScope - Construct a default scope for the subprogram.
+ ///
+ void ConstructDefaultDbgScope(MachineFunction *MF);
+
+ /// EmitInitial - Emit initial Dwarf declarations. This is necessary for cc
+  /// tools to recognize that the object file contains Dwarf information.
+ void EmitInitial();
+
+  /// EmitDIE - Recursively emits a debug information entry.
+ ///
+ void EmitDIE(DIE *Die);
+
+ /// SizeAndOffsetDie - Compute the size and offset of a DIE.
+ ///
+ unsigned SizeAndOffsetDie(DIE *Die, unsigned Offset, bool Last);
+
+ /// SizeAndOffsets - Compute the size and offset of all the DIEs.
+ ///
+ void SizeAndOffsets();
+
+ /// EmitDebugInfo / EmitDebugInfoPerCU - Emit the debug info section.
+ ///
+ void EmitDebugInfoPerCU(CompileUnit *Unit);
+
+ void EmitDebugInfo();
+
+ /// EmitAbbreviations - Emit the abbreviation section.
+ ///
+ void EmitAbbreviations() const;
+
+ /// EmitEndOfLineMatrix - Emit the last address of the section and the end of
+ /// the line matrix.
+ ///
+ void EmitEndOfLineMatrix(unsigned SectionEnd);
+
+ /// EmitDebugLines - Emit source line information.
+ ///
+ void EmitDebugLines();
+
+ /// EmitCommonDebugFrame - Emit common frame info into a debug frame section.
+ ///
+ void EmitCommonDebugFrame();
+
+ /// EmitFunctionDebugFrame - Emit per function frame info into a debug frame
+ /// section.
+ void EmitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo);
+
+ void EmitDebugPubNamesPerCU(CompileUnit *Unit);
+
+ /// EmitDebugPubNames - Emit visible names into a debug pubnames section.
+ ///
+ void EmitDebugPubNames();
+
+ /// EmitDebugStr - Emit visible names into a debug str section.
+ ///
+ void EmitDebugStr();
+
+ /// EmitDebugLoc - Emit visible names into a debug loc section.
+ ///
+ void EmitDebugLoc();
+
+ /// EmitDebugARanges - Emit visible names into a debug aranges section.
+ ///
+ void EmitDebugARanges();
+
+ /// EmitDebugRanges - Emit visible names into a debug ranges section.
+ ///
+ void EmitDebugRanges();
+
+ /// EmitDebugMacInfo - Emit visible names into a debug macinfo section.
+ ///
+ void EmitDebugMacInfo();
+
+ /// EmitDebugInlineInfo - Emit inline info using following format.
+ /// Section Header:
+ /// 1. length of section
+ /// 2. Dwarf version number
+ /// 3. address size.
+ ///
+ /// Entries (one "entry" for each function that was inlined):
+ ///
+  /// 1. offset into __debug_str section for MIPS linkage name, if it exists;
+ /// otherwise offset into __debug_str for regular function name.
+ /// 2. offset into __debug_str section for regular function name.
+ /// 3. an unsigned LEB128 number indicating the number of distinct inlining
+ /// instances for the function.
+ ///
+ /// The rest of the entry consists of a {die_offset, low_pc} pair for each
+ /// inlined instance; the die_offset points to the inlined_subroutine die in
+ /// the __debug_info section, and the low_pc is the starting address for the
+ /// inlining instance.
+ void EmitDebugInlineInfo();
+
+ /// GetOrCreateSourceID - Look up the source id with the given directory and
+ /// source file names. If none currently exists, create a new id and insert it
+ /// in the SourceIds map. This can update DirectoryNames and SourceFileNames maps
+ /// as well.
+ unsigned GetOrCreateSourceID(const std::string &DirName,
+ const std::string &FileName);
+
+ void ConstructCompileUnit(GlobalVariable *GV);
+
+  /// ConstructCompileUnits - Create compile unit DIEs.
+ void ConstructCompileUnits();
+
+ bool ConstructGlobalVariableDIE(GlobalVariable *GV);
+
+ /// ConstructGlobalVariableDIEs - Create DIEs for each of the externally
+ /// visible global variables. Return true if at least one global DIE is
+ /// created.
+ bool ConstructGlobalVariableDIEs();
+
+ bool ConstructSubprogram(GlobalVariable *GV);
+
+ /// ConstructSubprograms - Create DIEs for each of the externally visible
+ /// subprograms. Return true if at least one subprogram DIE is created.
+ bool ConstructSubprograms();
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ DwarfDebug(raw_ostream &OS, AsmPrinter *A, const TargetAsmInfo *T);
+ virtual ~DwarfDebug();
+
+ /// ShouldEmitDwarfDebug - Returns true if Dwarf debugging declarations should
+ /// be emitted.
+ bool ShouldEmitDwarfDebug() const { return shouldEmit; }
+
+ /// SetDebugInfo - Create global DIEs and emit initial debug info sections.
+  /// This is invoked by the target AsmPrinter.
+ void SetDebugInfo(MachineModuleInfo *mmi);
+
+ /// BeginModule - Emit all Dwarf sections that should come prior to the
+ /// content.
+ void BeginModule(Module *M) {
+ this->M = M;
+ }
+
+ /// EndModule - Emit all Dwarf sections that should come after the content.
+ ///
+ void EndModule();
+
+  /// BeginFunction - Gather pre-function debug information. Assumes it is
+  /// emitted immediately after the function entry point.
+ void BeginFunction(MachineFunction *MF);
+
+ /// EndFunction - Gather and emit post-function debug information.
+ ///
+ void EndFunction(MachineFunction *MF);
+
+ /// RecordSourceLine - Records location information and associates it with a
+ /// label. Returns a unique label ID used to generate a label and provide
+ /// correspondence to the source line list.
+ unsigned RecordSourceLine(Value *V, unsigned Line, unsigned Col);
+
+ /// RecordSourceLine - Records location information and associates it with a
+ /// label. Returns a unique label ID used to generate a label and provide
+ /// correspondence to the source line list.
+ unsigned RecordSourceLine(unsigned Line, unsigned Col, DICompileUnit CU);
+
+ /// getRecordSourceLineCount - Return the number of source lines in the debug
+ /// info.
+ unsigned getRecordSourceLineCount() const {
+ return Lines.size();
+ }
+
+ /// getOrCreateSourceID - Public version of GetOrCreateSourceID. This can be
+ /// timed. Look up the source id with the given directory and source file
+ /// names. If none currently exists, create a new id and insert it in the
+ /// SourceIds map. This can update DirectoryNames and SourceFileNames maps as
+ /// well.
+ unsigned getOrCreateSourceID(const std::string &DirName,
+ const std::string &FileName);
+
+ /// RecordRegionStart - Indicate the start of a region.
+ unsigned RecordRegionStart(GlobalVariable *V);
+
+ /// RecordRegionEnd - Indicate the end of a region.
+ unsigned RecordRegionEnd(GlobalVariable *V);
+
+ /// RecordVariable - Indicate the declaration of a local variable.
+ void RecordVariable(GlobalVariable *GV, unsigned FrameIndex,
+ const MachineInstr *MI);
+
+  /// RecordInlinedFnStart - Indicate the start of an inlined subroutine.
+ unsigned RecordInlinedFnStart(DISubprogram &SP, DICompileUnit CU,
+ unsigned Line, unsigned Col);
+
+  /// RecordInlinedFnEnd - Indicate the end of an inlined subroutine.
+ unsigned RecordInlinedFnEnd(DISubprogram &SP);
+
+  /// RecordVariableScope - Record the scope for the variable declared by
+  /// DeclareMI. DeclareMI must describe TargetInstrInfo::DECLARE. Scopes are
+  /// recorded here only for inlined subroutine variables; other variables'
+  /// scopes are determined during RecordVariable().
+ void RecordVariableScope(DIVariable &DV, const MachineInstr *DeclareMI);
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp
new file mode 100644
index 0000000..37466ab
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp
@@ -0,0 +1,706 @@
+//===-- CodeGen/AsmPrinter/DwarfException.cpp - Dwarf Exception Impl ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+static TimerGroup &getDwarfTimerGroup() {
+ static TimerGroup DwarfTimerGroup("Dwarf Exception");
+ return DwarfTimerGroup;
+}
+
+DwarfException::DwarfException(raw_ostream &OS, AsmPrinter *A,
+ const TargetAsmInfo *T)
+ : Dwarf(OS, A, T, "eh"), shouldEmitTable(false), shouldEmitMoves(false),
+ shouldEmitTableModule(false), shouldEmitMovesModule(false),
+ ExceptionTimer(0) {
+ if (TimePassesIsEnabled)
+ ExceptionTimer = new Timer("Dwarf Exception Writer",
+ getDwarfTimerGroup());
+}
+
+DwarfException::~DwarfException() {
+ delete ExceptionTimer;
+}
+
+void DwarfException::EmitCommonEHFrame(const Function *Personality,
+ unsigned Index) {
+ // Size and sign of stack growth.
+ int stackGrowth =
+ Asm->TM.getFrameInfo()->getStackGrowthDirection() ==
+ TargetFrameInfo::StackGrowsUp ?
+ TD->getPointerSize() : -TD->getPointerSize();
+
+ // Begin eh frame section.
+ Asm->SwitchToTextSection(TAI->getDwarfEHFrameSection());
+
+ if (!TAI->doesRequireNonLocalEHFrameLabel())
+ O << TAI->getEHGlobalPrefix();
+
+ O << "EH_frame" << Index << ":\n";
+ EmitLabel("section_eh_frame", Index);
+
+ // Define base labels.
+ EmitLabel("eh_frame_common", Index);
+
+ // Define the eh frame length.
+ EmitDifference("eh_frame_common_end", Index,
+ "eh_frame_common_begin", Index, true);
+ Asm->EOL("Length of Common Information Entry");
+
+ // EH frame header.
+ EmitLabel("eh_frame_common_begin", Index);
+ Asm->EmitInt32((int)0);
+ Asm->EOL("CIE Identifier Tag");
+ Asm->EmitInt8(dwarf::DW_CIE_VERSION);
+ Asm->EOL("CIE Version");
+
+ // The personality presence indicates that language specific information will
+ // show up in the eh frame.
+ Asm->EmitString(Personality ? "zPLR" : "zR");
+ Asm->EOL("CIE Augmentation");
+
+  // Code and data alignment factors, and the return address column.
+ Asm->EmitULEB128Bytes(1);
+ Asm->EOL("CIE Code Alignment Factor");
+ Asm->EmitSLEB128Bytes(stackGrowth);
+ Asm->EOL("CIE Data Alignment Factor");
+ Asm->EmitInt8(RI->getDwarfRegNum(RI->getRARegister(), true));
+ Asm->EOL("CIE Return Address Column");
+
+  // If there is a personality, we need to indicate the function's location.
+ if (Personality) {
+ Asm->EmitULEB128Bytes(7);
+ Asm->EOL("Augmentation Size");
+
+ if (TAI->getNeedsIndirectEncoding()) {
+ Asm->EmitInt8(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 |
+ dwarf::DW_EH_PE_indirect);
+ Asm->EOL("Personality (pcrel sdata4 indirect)");
+ } else {
+ Asm->EmitInt8(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+ Asm->EOL("Personality (pcrel sdata4)");
+ }
+
+ PrintRelDirective(true);
+ O << TAI->getPersonalityPrefix();
+ Asm->EmitExternalGlobal((const GlobalVariable *)(Personality));
+ O << TAI->getPersonalitySuffix();
+ if (strcmp(TAI->getPersonalitySuffix(), "+4@GOTPCREL"))
+ O << "-" << TAI->getPCSymbol();
+ Asm->EOL("Personality");
+
+ Asm->EmitInt8(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+ Asm->EOL("LSDA Encoding (pcrel sdata4)");
+
+ Asm->EmitInt8(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+ Asm->EOL("FDE Encoding (pcrel sdata4)");
+ } else {
+ Asm->EmitULEB128Bytes(1);
+ Asm->EOL("Augmentation Size");
+
+ Asm->EmitInt8(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+ Asm->EOL("FDE Encoding (pcrel sdata4)");
+ }
+
+ // Indicate locations of general callee saved registers in frame.
+ std::vector<MachineMove> Moves;
+ RI->getInitialFrameState(Moves);
+ EmitFrameMoves(NULL, 0, Moves, true);
+
+  // On Darwin the linker honors the alignment of eh_frame, which means it
+  // must be 8-byte aligned on 64-bit targets to match what gcc does.
+  // Otherwise you get holes which confuse readers of eh_frame.
+ Asm->EmitAlignment(TD->getPointerSize() == sizeof(int32_t) ? 2 : 3,
+ 0, 0, false);
+ EmitLabel("eh_frame_common_end", Index);
+
+ Asm->EOL();
+}
+
+/// EmitEHFrame - Emit function exception frame information.
+///
+void DwarfException::EmitEHFrame(const FunctionEHFrameInfo &EHFrameInfo) {
+ assert(!EHFrameInfo.function->hasAvailableExternallyLinkage() &&
+ "Should not emit 'available externally' functions at all");
+
+ Function::LinkageTypes linkage = EHFrameInfo.function->getLinkage();
+ Asm->SwitchToTextSection(TAI->getDwarfEHFrameSection());
+
+  // Externally visible entry into the function's eh frame info. If the
+ // corresponding function is static, this should not be externally visible.
+ if (linkage != Function::InternalLinkage &&
+ linkage != Function::PrivateLinkage) {
+ if (const char *GlobalEHDirective = TAI->getGlobalEHDirective())
+ O << GlobalEHDirective << EHFrameInfo.FnName << "\n";
+ }
+
+  // If the corresponding function is a weak definition, this should be too.
+ if ((linkage == Function::WeakAnyLinkage ||
+ linkage == Function::WeakODRLinkage ||
+ linkage == Function::LinkOnceAnyLinkage ||
+ linkage == Function::LinkOnceODRLinkage) &&
+ TAI->getWeakDefDirective())
+ O << TAI->getWeakDefDirective() << EHFrameInfo.FnName << "\n";
+
+ // If there are no calls then you can't unwind. This may mean we can omit the
+ // EH Frame, but some environments do not handle weak absolute symbols. If
+ // UnwindTablesMandatory is set we cannot do this optimization; the unwind
+ // info is to be available for non-EH uses.
+ if (!EHFrameInfo.hasCalls &&
+ !UnwindTablesMandatory &&
+ ((linkage != Function::WeakAnyLinkage &&
+ linkage != Function::WeakODRLinkage &&
+ linkage != Function::LinkOnceAnyLinkage &&
+ linkage != Function::LinkOnceODRLinkage) ||
+ !TAI->getWeakDefDirective() ||
+ TAI->getSupportsWeakOmittedEHFrame())) {
+ O << EHFrameInfo.FnName << " = 0\n";
+ // This name has no connection to the function, so it might get
+ // dead-stripped when the function is not, erroneously. Prohibit
+ // dead-stripping unconditionally.
+ if (const char *UsedDirective = TAI->getUsedDirective())
+ O << UsedDirective << EHFrameInfo.FnName << "\n\n";
+ } else {
+ O << EHFrameInfo.FnName << ":\n";
+
+ // EH frame header.
+ EmitDifference("eh_frame_end", EHFrameInfo.Number,
+ "eh_frame_begin", EHFrameInfo.Number, true);
+ Asm->EOL("Length of Frame Information Entry");
+
+ EmitLabel("eh_frame_begin", EHFrameInfo.Number);
+
+ if (TAI->doesRequireNonLocalEHFrameLabel()) {
+ PrintRelDirective(true, true);
+ PrintLabelName("eh_frame_begin", EHFrameInfo.Number);
+
+ if (!TAI->isAbsoluteEHSectionOffsets())
+ O << "-EH_frame" << EHFrameInfo.PersonalityIndex;
+ } else {
+ EmitSectionOffset("eh_frame_begin", "eh_frame_common",
+ EHFrameInfo.Number, EHFrameInfo.PersonalityIndex,
+ true, true, false);
+ }
+
+ Asm->EOL("FDE CIE offset");
+
+ EmitReference("eh_func_begin", EHFrameInfo.Number, true, true);
+ Asm->EOL("FDE initial location");
+ EmitDifference("eh_func_end", EHFrameInfo.Number,
+ "eh_func_begin", EHFrameInfo.Number, true);
+ Asm->EOL("FDE address range");
+
+ // If there is a personality and landing pads then point to the language
+ // specific data area in the exception table.
+ if (EHFrameInfo.PersonalityIndex) {
+ Asm->EmitULEB128Bytes(4);
+ Asm->EOL("Augmentation size");
+
+ if (EHFrameInfo.hasLandingPads)
+ EmitReference("exception", EHFrameInfo.Number, true, true);
+ else
+ Asm->EmitInt32((int)0);
+ Asm->EOL("Language Specific Data Area");
+ } else {
+ Asm->EmitULEB128Bytes(0);
+ Asm->EOL("Augmentation size");
+ }
+
+    // Indicate locations of function-specific callee-saved registers in the
+    // frame.
+ EmitFrameMoves("eh_func_begin", EHFrameInfo.Number, EHFrameInfo.Moves,
+ true);
+
+    // On Darwin the linker honors the alignment of eh_frame, which means it
+    // must be 8-byte aligned on 64-bit targets to match what gcc does.
+    // Otherwise you get holes which confuse readers of eh_frame.
+ Asm->EmitAlignment(TD->getPointerSize() == sizeof(int32_t) ? 2 : 3,
+ 0, 0, false);
+ EmitLabel("eh_frame_end", EHFrameInfo.Number);
+
+    // If the function is marked used, this table should be also. We cannot
+    // make the mark unconditional here, since retaining the table also
+    // retains the function, and there is code around that depends on unused
+    // functions (calling undefined externals) being dead-stripped to link
+    // correctly. Yes, there really is.
+ if (MMI->getUsedFunctions().count(EHFrameInfo.function))
+ if (const char *UsedDirective = TAI->getUsedDirective())
+ O << UsedDirective << EHFrameInfo.FnName << "\n\n";
+ }
+}
+
+/// EmitExceptionTable - Emit landing pads and actions.
+///
+/// The general organization of the table is complex, but the basic concepts are
+/// easy. First there is a header which describes the location and organization
+/// of the three components that follow.
+///
+/// 1. The landing pad site information describes the range of code covered by
+///    the try. In our case it's an accumulation of the ranges covered by the
+///    invokes in the try. There is also a reference to the landing pad that
+///    handles the exception once processed. Finally, there is an index into
+///    the actions table.
+/// 2. The action table, in our case, is composed of pairs of type ids and next
+///    action offset. Starting with the action index from the landing pad
+///    site, each type id is checked for a match to the current exception. If
+///    it matches then the exception and type id are passed on to the landing
+///    pad. Otherwise the next action is looked up. This chain is terminated
+///    with a next action of zero. If no type id is found then the frame is
+///    unwound and handling continues.
+/// 3. The type id table contains references to all the C++ typeinfo for all
+///    catches in the function. This table is reverse indexed base 1.
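+///
+/// As an illustrative sketch (not emitted verbatim), the table laid out by
+/// this routine reads roughly as:
+///
+///   GCC_except_table<N>:
+///     LPStart format          (DW_EH_PE_omit)
+///     TType format            (DW_EH_PE_absptr)
+///     TType base offset       (ULEB128)
+///     Call site format        (DW_EH_PE_udata4)
+///     Call-site table length  (ULEB128)
+///     call-site entries:      {region start, region length, landing pad, action}
+///     action entries:         {type id, next action}
+///     type info pointers      (reverse indexed)
+///     filter type ids         (ULEB128)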
+
+/// SharedTypeIds - How many leading type ids two landing pads have in common.
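+/// For example, pads with type ids {1, 2, 3} and {1, 2, 7} share two leading
+/// ids.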
+unsigned DwarfException::SharedTypeIds(const LandingPadInfo *L,
+ const LandingPadInfo *R) {
+ const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+ unsigned LSize = LIds.size(), RSize = RIds.size();
+ unsigned MinSize = LSize < RSize ? LSize : RSize;
+ unsigned Count = 0;
+
+ for (; Count != MinSize; ++Count)
+ if (LIds[Count] != RIds[Count])
+ return Count;
+
+ return Count;
+}
+
+/// PadLT - Order landing pads lexicographically by type id.
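+/// For example, {1, 2} orders before {1, 3}, and {1, 2} before {1, 2, 5}.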
+bool DwarfException::PadLT(const LandingPadInfo *L, const LandingPadInfo *R) {
+ const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+ unsigned LSize = LIds.size(), RSize = RIds.size();
+ unsigned MinSize = LSize < RSize ? LSize : RSize;
+
+ for (unsigned i = 0; i != MinSize; ++i)
+ if (LIds[i] != RIds[i])
+ return LIds[i] < RIds[i];
+
+ return LSize < RSize;
+}
+
+void DwarfException::EmitExceptionTable() {
+ const std::vector<GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+ const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
+ if (PadInfos.empty()) return;
+
+ // Sort the landing pads in order of their type ids. This is used to fold
+ // duplicate actions.
+ SmallVector<const LandingPadInfo *, 64> LandingPads;
+ LandingPads.reserve(PadInfos.size());
+ for (unsigned i = 0, N = PadInfos.size(); i != N; ++i)
+ LandingPads.push_back(&PadInfos[i]);
+ std::sort(LandingPads.begin(), LandingPads.end(), PadLT);
+
+ // Negative type ids index into FilterIds, positive type ids index into
+ // TypeInfos. The value written for a positive type id is just the type id
+ // itself. For a negative type id, however, the value written is the
+ // (negative) byte offset of the corresponding FilterIds entry. The byte
+ // offset is usually equal to the type id, because the FilterIds entries are
+ // written using a variable width encoding which outputs one byte per entry as
+ // long as the value written is not too large, but can differ. This kind of
+ // complication does not occur for positive type ids because type infos are
+ // output using a fixed width encoding. FilterOffsets[i] holds the byte
+ // offset corresponding to FilterIds[i].
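+  // Illustrative example: FilterIds = {128, 1} yields FilterOffsets =
+  // {-1, -3}, because the ULEB128 encoding of 128 occupies two bytes.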
+ SmallVector<int, 16> FilterOffsets;
+ FilterOffsets.reserve(FilterIds.size());
+ int Offset = -1;
+ for(std::vector<unsigned>::const_iterator I = FilterIds.begin(),
+ E = FilterIds.end(); I != E; ++I) {
+ FilterOffsets.push_back(Offset);
+ Offset -= TargetAsmInfo::getULEB128Size(*I);
+ }
+
+ // Compute the actions table and gather the first action index for each
+ // landing pad site.
+ SmallVector<ActionEntry, 32> Actions;
+ SmallVector<unsigned, 64> FirstActions;
+ FirstActions.reserve(LandingPads.size());
+
+ int FirstAction = 0;
+ unsigned SizeActions = 0;
+ for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+ const LandingPadInfo *LP = LandingPads[i];
+ const std::vector<int> &TypeIds = LP->TypeIds;
+ const unsigned NumShared = i ? SharedTypeIds(LP, LandingPads[i-1]) : 0;
+ unsigned SizeSiteActions = 0;
+
+ if (NumShared < TypeIds.size()) {
+ unsigned SizeAction = 0;
+ ActionEntry *PrevAction = 0;
+
+ if (NumShared) {
+ const unsigned SizePrevIds = LandingPads[i-1]->TypeIds.size();
+ assert(Actions.size());
+ PrevAction = &Actions.back();
+ SizeAction = TargetAsmInfo::getSLEB128Size(PrevAction->NextAction) +
+ TargetAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
+
+ for (unsigned j = NumShared; j != SizePrevIds; ++j) {
+ SizeAction -=
+ TargetAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
+ SizeAction += -PrevAction->NextAction;
+ PrevAction = PrevAction->Previous;
+ }
+ }
+
+ // Compute the actions.
+ for (unsigned I = NumShared, M = TypeIds.size(); I != M; ++I) {
+ int TypeID = TypeIds[I];
+ assert(-1-TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
+ int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID;
+ unsigned SizeTypeID = TargetAsmInfo::getSLEB128Size(ValueForTypeID);
+
+ int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
+ SizeAction = SizeTypeID + TargetAsmInfo::getSLEB128Size(NextAction);
+ SizeSiteActions += SizeAction;
+
+ ActionEntry Action = {ValueForTypeID, NextAction, PrevAction};
+ Actions.push_back(Action);
+
+ PrevAction = &Actions.back();
+ }
+
+ // Record the first action of the landing pad site.
+ FirstAction = SizeActions + SizeSiteActions - SizeAction + 1;
+ } // else identical - re-use previous FirstAction
+
+ FirstActions.push_back(FirstAction);
+
+    // Compute this site's contribution to the size.
+ SizeActions += SizeSiteActions;
+ }
+
+ // Compute the call-site table. The entry for an invoke has a try-range
+ // containing the call, a non-zero landing pad and an appropriate action. The
+ // entry for an ordinary call has a try-range containing the call and zero for
+ // the landing pad and the action. Calls marked 'nounwind' have no entry and
+ // must not be contained in the try-range of any entry - they form gaps in the
+ // table. Entries must be ordered by try-range address.
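+  // Illustrative example (hypothetical label numbers): an invoke bracketed by
+  // labels 5..6 with landing pad 9 and first action 1 produces {5, 6, 9, 1};
+  // a plain call region produces {begin, end, 0, 0}; a nounwind call appears
+  // in no entry at all.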
+ SmallVector<CallSiteEntry, 64> CallSites;
+
+ RangeMapType PadMap;
+
+  // Invokes and nounwind calls have entries in PadMap (due to being bracketed
+  // by try-range labels when lowered). Ordinary calls do not, so appropriate
+  // try-ranges for them need to be deduced.
+ for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+ const LandingPadInfo *LandingPad = LandingPads[i];
+ for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+ unsigned BeginLabel = LandingPad->BeginLabels[j];
+ assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+ PadRange P = { i, j };
+ PadMap[BeginLabel] = P;
+ }
+ }
+
+ // The end label of the previous invoke or nounwind try-range.
+ unsigned LastLabel = 0;
+
+ // Whether there is a potentially throwing instruction (currently this means
+ // an ordinary call) between the end of the previous try-range and now.
+ bool SawPotentiallyThrowing = false;
+
+ // Whether the last callsite entry was for an invoke.
+ bool PreviousIsInvoke = false;
+
+ // Visit all instructions in order of address.
+ for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+ I != E; ++I) {
+ for (MachineBasicBlock::const_iterator MI = I->begin(), E = I->end();
+ MI != E; ++MI) {
+ if (!MI->isLabel()) {
+ SawPotentiallyThrowing |= MI->getDesc().isCall();
+ continue;
+ }
+
+ unsigned BeginLabel = MI->getOperand(0).getImm();
+ assert(BeginLabel && "Invalid label!");
+
+ // End of the previous try-range?
+ if (BeginLabel == LastLabel)
+ SawPotentiallyThrowing = false;
+
+ // Beginning of a new try-range?
+ RangeMapType::iterator L = PadMap.find(BeginLabel);
+ if (L == PadMap.end())
+ // Nope, it was just some random label.
+ continue;
+
+ PadRange P = L->second;
+ const LandingPadInfo *LandingPad = LandingPads[P.PadIndex];
+
+ assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] &&
+ "Inconsistent landing pad map!");
+
+ // If some instruction between the previous try-range and this one may
+ // throw, create a call-site entry with no landing pad for the region
+ // between the try-ranges.
+ if (SawPotentiallyThrowing) {
+ CallSiteEntry Site = {LastLabel, BeginLabel, 0, 0};
+ CallSites.push_back(Site);
+ PreviousIsInvoke = false;
+ }
+
+ LastLabel = LandingPad->EndLabels[P.RangeIndex];
+ assert(BeginLabel && LastLabel && "Invalid landing pad!");
+
+ if (LandingPad->LandingPadLabel) {
+ // This try-range is for an invoke.
+ CallSiteEntry Site = {BeginLabel, LastLabel,
+ LandingPad->LandingPadLabel,
+ FirstActions[P.PadIndex]};
+
+ // Try to merge with the previous call-site.
+ if (PreviousIsInvoke) {
+ CallSiteEntry &Prev = CallSites.back();
+ if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
+ // Extend the range of the previous entry.
+ Prev.EndLabel = Site.EndLabel;
+ continue;
+ }
+ }
+
+ // Otherwise, create a new call-site.
+ CallSites.push_back(Site);
+ PreviousIsInvoke = true;
+ } else {
+ // Create a gap.
+ PreviousIsInvoke = false;
+ }
+ }
+ }
+
+ // If some instruction between the previous try-range and the end of the
+ // function may throw, create a call-site entry with no landing pad for the
+ // region following the try-range.
+ if (SawPotentiallyThrowing) {
+ CallSiteEntry Site = {LastLabel, 0, 0, 0};
+ CallSites.push_back(Site);
+ }
+
+ // Final tallies.
+
+ // Call sites.
+ const unsigned SiteStartSize = sizeof(int32_t); // DW_EH_PE_udata4
+ const unsigned SiteLengthSize = sizeof(int32_t); // DW_EH_PE_udata4
+ const unsigned LandingPadSize = sizeof(int32_t); // DW_EH_PE_udata4
+ unsigned SizeSites = CallSites.size() * (SiteStartSize +
+ SiteLengthSize +
+ LandingPadSize);
+ for (unsigned i = 0, e = CallSites.size(); i < e; ++i)
+ SizeSites += TargetAsmInfo::getULEB128Size(CallSites[i].Action);
+
+ // Type infos.
+ const unsigned TypeInfoSize = TD->getPointerSize(); // DW_EH_PE_absptr
+ unsigned SizeTypes = TypeInfos.size() * TypeInfoSize;
+
+ unsigned TypeOffset = sizeof(int8_t) + // Call site format
+ TargetAsmInfo::getULEB128Size(SizeSites) + // Call-site table length
+ SizeSites + SizeActions + SizeTypes;
+
+ unsigned TotalSize = sizeof(int8_t) + // LPStart format
+ sizeof(int8_t) + // TType format
+ TargetAsmInfo::getULEB128Size(TypeOffset) + // TType base offset
+ TypeOffset;
+
+ unsigned SizeAlign = (4 - TotalSize) & 3;
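+  // The padding rounds TotalSize up to a multiple of 4; e.g. TotalSize = 9
+  // gives SizeAlign = 3, and 9 + 3 = 12 is 4-byte aligned.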
+
+ // Begin the exception table.
+ Asm->SwitchToDataSection(TAI->getDwarfExceptionSection());
+ Asm->EmitAlignment(2, 0, 0, false);
+ O << "GCC_except_table" << SubprogramCount << ":\n";
+
+ for (unsigned i = 0; i != SizeAlign; ++i) {
+ Asm->EmitInt8(0);
+ Asm->EOL("Padding");
+ }
+
+ EmitLabel("exception", SubprogramCount);
+
+ // Emit the header.
+ Asm->EmitInt8(dwarf::DW_EH_PE_omit);
+ Asm->EOL("LPStart format (DW_EH_PE_omit)");
+ Asm->EmitInt8(dwarf::DW_EH_PE_absptr);
+ Asm->EOL("TType format (DW_EH_PE_absptr)");
+ Asm->EmitULEB128Bytes(TypeOffset);
+ Asm->EOL("TType base offset");
+ Asm->EmitInt8(dwarf::DW_EH_PE_udata4);
+ Asm->EOL("Call site format (DW_EH_PE_udata4)");
+ Asm->EmitULEB128Bytes(SizeSites);
+ Asm->EOL("Call-site table length");
+
+ // Emit the landing pad site information.
+ for (unsigned i = 0; i < CallSites.size(); ++i) {
+ CallSiteEntry &S = CallSites[i];
+ const char *BeginTag;
+ unsigned BeginNumber;
+
+ if (!S.BeginLabel) {
+ BeginTag = "eh_func_begin";
+ BeginNumber = SubprogramCount;
+ } else {
+ BeginTag = "label";
+ BeginNumber = S.BeginLabel;
+ }
+
+ EmitSectionOffset(BeginTag, "eh_func_begin", BeginNumber, SubprogramCount,
+ true, true);
+ Asm->EOL("Region start");
+
+ if (!S.EndLabel)
+ EmitDifference("eh_func_end", SubprogramCount, BeginTag, BeginNumber,
+ true);
+ else
+ EmitDifference("label", S.EndLabel, BeginTag, BeginNumber, true);
+
+ Asm->EOL("Region length");
+
+ if (!S.PadLabel)
+ Asm->EmitInt32(0);
+ else
+ EmitSectionOffset("label", "eh_func_begin", S.PadLabel, SubprogramCount,
+ true, true);
+
+ Asm->EOL("Landing pad");
+
+ Asm->EmitULEB128Bytes(S.Action);
+ Asm->EOL("Action");
+ }
+
+ // Emit the actions.
+ for (unsigned I = 0, N = Actions.size(); I != N; ++I) {
+ ActionEntry &Action = Actions[I];
+
+ Asm->EmitSLEB128Bytes(Action.ValueForTypeID);
+ Asm->EOL("TypeInfo index");
+ Asm->EmitSLEB128Bytes(Action.NextAction);
+ Asm->EOL("Next action");
+ }
+
+ // Emit the type ids.
+ for (unsigned M = TypeInfos.size(); M; --M) {
+ GlobalVariable *GV = TypeInfos[M - 1];
+ PrintRelDirective();
+
+ if (GV) {
+ std::string GLN;
+ O << Asm->getGlobalLinkName(GV, GLN);
+ } else {
+ O << "0";
+ }
+
+ Asm->EOL("TypeInfo");
+ }
+
+ // Emit the filter typeids.
+ for (unsigned j = 0, M = FilterIds.size(); j < M; ++j) {
+ unsigned TypeID = FilterIds[j];
+ Asm->EmitULEB128Bytes(TypeID);
+ Asm->EOL("Filter TypeInfo index");
+ }
+
+ Asm->EmitAlignment(2, 0, 0, false);
+}
+
+/// EndModule - Emit all exception information that should come after the
+/// content.
+void DwarfException::EndModule() {
+ if (TimePassesIsEnabled)
+ ExceptionTimer->startTimer();
+
+ if (shouldEmitMovesModule || shouldEmitTableModule) {
+ const std::vector<Function *> Personalities = MMI->getPersonalities();
+ for (unsigned i = 0; i < Personalities.size(); ++i)
+ EmitCommonEHFrame(Personalities[i], i);
+
+ for (std::vector<FunctionEHFrameInfo>::iterator I = EHFrames.begin(),
+ E = EHFrames.end(); I != E; ++I)
+ EmitEHFrame(*I);
+ }
+
+ if (TimePassesIsEnabled)
+ ExceptionTimer->stopTimer();
+}
+
+/// BeginFunction - Gather pre-function exception information. Assumes being
+/// emitted immediately after the function entry point.
+void DwarfException::BeginFunction(MachineFunction *MF) {
+ if (TimePassesIsEnabled)
+ ExceptionTimer->startTimer();
+
+ this->MF = MF;
+ shouldEmitTable = shouldEmitMoves = false;
+
+ if (MMI && TAI->doesSupportExceptionHandling()) {
+ // Map all labels and get rid of any dead landing pads.
+ MMI->TidyLandingPads();
+
+ // If any landing pads survive, we need an EH table.
+ if (MMI->getLandingPads().size())
+ shouldEmitTable = true;
+
+ // See if we need frame move info.
+ if (!MF->getFunction()->doesNotThrow() || UnwindTablesMandatory)
+ shouldEmitMoves = true;
+
+ if (shouldEmitMoves || shouldEmitTable)
+ // Assumes in correct section after the entry point.
+ EmitLabel("eh_func_begin", ++SubprogramCount);
+ }
+
+ shouldEmitTableModule |= shouldEmitTable;
+ shouldEmitMovesModule |= shouldEmitMoves;
+
+ if (TimePassesIsEnabled)
+ ExceptionTimer->stopTimer();
+}
+
+/// EndFunction - Gather and emit post-function exception information.
+///
+void DwarfException::EndFunction() {
+ if (TimePassesIsEnabled)
+ ExceptionTimer->startTimer();
+
+ if (shouldEmitMoves || shouldEmitTable) {
+ EmitLabel("eh_func_end", SubprogramCount);
+ EmitExceptionTable();
+
+ // Save EH frame information
+ std::string Name;
+ EHFrames.push_back(
+ FunctionEHFrameInfo(getAsm()->getCurrentFunctionEHName(MF, Name),
+ SubprogramCount,
+ MMI->getPersonalityIndex(),
+ MF->getFrameInfo()->hasCalls(),
+ !MMI->getLandingPads().empty(),
+ MMI->getFrameMoves(),
+ MF->getFunction()));
+ }
+
+ if (TimePassesIsEnabled)
+ ExceptionTimer->stopTimer();
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
new file mode 100644
index 0000000..4479af2
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -0,0 +1,178 @@
+//===-- DwarfException.h - Dwarf Exception Framework -----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFEXCEPTION_H__
+#define CODEGEN_ASMPRINTER_DWARFEXCEPTION_H__
+
+#include "DIE.h"
+#include "DwarfPrinter.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/ADT/DenseMap.h"
+#include <string>
+
+namespace llvm {
+
+struct LandingPadInfo;
+class MachineModuleInfo;
+class TargetAsmInfo;
+class Timer;
+class raw_ostream;
+
+//===----------------------------------------------------------------------===//
+/// DwarfException - Emits Dwarf exception handling directives.
+///
+class VISIBILITY_HIDDEN DwarfException : public Dwarf {
+ struct FunctionEHFrameInfo {
+ std::string FnName;
+ unsigned Number;
+ unsigned PersonalityIndex;
+ bool hasCalls;
+ bool hasLandingPads;
+ std::vector<MachineMove> Moves;
+ const Function * function;
+
+ FunctionEHFrameInfo(const std::string &FN, unsigned Num, unsigned P,
+ bool hC, bool hL,
+ const std::vector<MachineMove> &M,
+ const Function *f):
+ FnName(FN), Number(Num), PersonalityIndex(P),
+ hasCalls(hC), hasLandingPads(hL), Moves(M), function (f) { }
+ };
+
+ std::vector<FunctionEHFrameInfo> EHFrames;
+
+ /// shouldEmitTable - Per-function flag to indicate if EH tables should
+ /// be emitted.
+ bool shouldEmitTable;
+
+ /// shouldEmitMoves - Per-function flag to indicate if frame moves info
+ /// should be emitted.
+ bool shouldEmitMoves;
+
+ /// shouldEmitTableModule - Per-module flag to indicate if EH tables
+ /// should be emitted.
+ bool shouldEmitTableModule;
+
+  /// shouldEmitMovesModule - Per-module flag to indicate if frame moves
+ /// should be emitted.
+ bool shouldEmitMovesModule;
+
+ /// ExceptionTimer - Timer for the Dwarf exception writer.
+ Timer *ExceptionTimer;
+
+ /// EmitCommonEHFrame - Emit the common eh unwind frame.
+ ///
+ void EmitCommonEHFrame(const Function *Personality, unsigned Index);
+
+ /// EmitEHFrame - Emit function exception frame information.
+ ///
+ void EmitEHFrame(const FunctionEHFrameInfo &EHFrameInfo);
+
+ /// EmitExceptionTable - Emit landing pads and actions.
+ ///
+ /// The general organization of the table is complex, but the basic concepts
+ /// are easy. First there is a header which describes the location and
+ /// organization of the three components that follow.
+  /// 1. The landing pad site information describes the range of code covered
+  ///    by the try. In our case it's an accumulation of the ranges covered
+  ///    by the invokes in the try. There is also a reference to the landing
+  ///    pad that handles the exception once processed. Finally, there is an
+  ///    index into the actions table.
+  /// 2. The action table, in our case, is composed of pairs of type ids
+  ///    and next action offset. Starting with the action index from the
+  ///    landing pad site, each type id is checked for a match to the current
+  ///    exception. If it matches then the exception and type id are passed
+  ///    on to the landing pad. Otherwise the next action is looked up. This
+  ///    chain is terminated with a next action of zero. If no type id is
+  ///    found then the frame is unwound and handling continues.
+  /// 3. The type id table contains references to all the C++ typeinfo for all
+  ///    catches in the function. This table is reverse indexed base 1.
+
+ /// SharedTypeIds - How many leading type ids two landing pads have in common.
+ static unsigned SharedTypeIds(const LandingPadInfo *L,
+ const LandingPadInfo *R);
+
+ /// PadLT - Order landing pads lexicographically by type id.
+ static bool PadLT(const LandingPadInfo *L, const LandingPadInfo *R);
+
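+  /// KeyInfo - DenseMap key traits for label IDs. Label numbers are small
+  /// positive integers, so ~0U and ~1U are safe sentinel keys.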
+ struct KeyInfo {
+ static inline unsigned getEmptyKey() { return -1U; }
+ static inline unsigned getTombstoneKey() { return -2U; }
+ static unsigned getHashValue(const unsigned &Key) { return Key; }
+ static bool isEqual(unsigned LHS, unsigned RHS) { return LHS == RHS; }
+ static bool isPod() { return true; }
+ };
+
+ /// ActionEntry - Structure describing an entry in the actions table.
+ struct ActionEntry {
+ int ValueForTypeID; // The value to write - may not be equal to the type id.
+ int NextAction;
+ struct ActionEntry *Previous;
+ };
+
+ /// PadRange - Structure holding a try-range and the associated landing pad.
+ struct PadRange {
+ // The index of the landing pad.
+ unsigned PadIndex;
+ // The index of the begin and end labels in the landing pad's label lists.
+ unsigned RangeIndex;
+ };
+
+ typedef DenseMap<unsigned, PadRange, KeyInfo> RangeMapType;
+
+ /// CallSiteEntry - Structure describing an entry in the call-site table.
+ struct CallSiteEntry {
+ // The 'try-range' is BeginLabel .. EndLabel.
+ unsigned BeginLabel; // zero indicates the start of the function.
+ unsigned EndLabel; // zero indicates the end of the function.
+ // The landing pad starts at PadLabel.
+ unsigned PadLabel; // zero indicates that there is no landing pad.
+ unsigned Action;
+ };
+
+ void EmitExceptionTable();
+
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ DwarfException(raw_ostream &OS, AsmPrinter *A, const TargetAsmInfo *T);
+ virtual ~DwarfException();
+
+  /// SetModuleInfo - Set machine module information when it's known that the
+  /// pass manager has created it. Set by the target AsmPrinter.
+ void SetModuleInfo(MachineModuleInfo *mmi) {
+ MMI = mmi;
+ }
+
+ /// BeginModule - Emit all exception information that should come prior to the
+ /// content.
+ void BeginModule(Module *M) {
+ this->M = M;
+ }
+
+ /// EndModule - Emit all exception information that should come after the
+ /// content.
+ void EndModule();
+
+ /// BeginFunction - Gather pre-function exception information. Assumes being
+ /// emitted immediately after the function entry point.
+ void BeginFunction(MachineFunction *MF);
+
+ /// EndFunction - Gather and emit post-function exception information.
+ void EndFunction();
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfLabel.cpp b/lib/CodeGen/AsmPrinter/DwarfLabel.cpp
new file mode 100644
index 0000000..8021b7c
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfLabel.cpp
@@ -0,0 +1,35 @@
+//===--- lib/CodeGen/DwarfLabel.cpp - Dwarf Label -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// DWARF Labels
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfLabel.h"
+#include "llvm/ADT/FoldingSet.h"
+#include <ostream>
+
+using namespace llvm;
+
+/// Profile - Used to gather unique data for the folding set.
+///
+void DWLabel::Profile(FoldingSetNodeID &ID) const {
+ ID.AddString(Tag);
+ ID.AddInteger(Number);
+}
+
+#ifndef NDEBUG
+void DWLabel::print(std::ostream *O) const {
+ if (O) print(*O);
+}
+void DWLabel::print(std::ostream &O) const {
+ O << "." << Tag;
+ if (Number) O << Number;
+}
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfLabel.h b/lib/CodeGen/AsmPrinter/DwarfLabel.h
new file mode 100644
index 0000000..b493903
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfLabel.h
@@ -0,0 +1,56 @@
+//===--- lib/CodeGen/DwarfLabel.h - Dwarf Label -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// DWARF Labels.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFLABEL_H__
+#define CODEGEN_ASMPRINTER_DWARFLABEL_H__
+
+#include "llvm/Support/Compiler.h"
+#include <iosfwd>
+#include <vector>
+
+namespace llvm {
+ class FoldingSetNodeID;
+
+ //===--------------------------------------------------------------------===//
+ /// DWLabel - Labels are used to track locations in the assembler file.
+ /// Labels appear in the form @verbatim <prefix><Tag><Number> @endverbatim,
+  /// where the tag is a category of label (e.g. location) and number is a
+  /// value unique in that category.
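+  /// For example (hypothetical prefix), tag "label" with number 42 prints as
+  /// "Llabel42" when the private global prefix is "L".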
+ class VISIBILITY_HIDDEN DWLabel {
+ /// Tag - Label category tag. Should always be a statically declared C
+ /// string.
+ ///
+ const char *Tag;
+
+ /// Number - Value to make label unique.
+ ///
+ unsigned Number;
+ public:
+ DWLabel(const char *T, unsigned N) : Tag(T), Number(N) {}
+
+ // Accessors.
+ const char *getTag() const { return Tag; }
+ unsigned getNumber() const { return Number; }
+
+ /// Profile - Used to gather unique data for the folding set.
+ ///
+ void Profile(FoldingSetNodeID &ID) const;
+
+#ifndef NDEBUG
+ void print(std::ostream *O) const;
+ void print(std::ostream &O) const;
+#endif
+ };
+} // end llvm namespace
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp b/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp
new file mode 100644
index 0000000..45e7dd3
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp
@@ -0,0 +1,235 @@
+//===--- lib/CodeGen/DwarfPrinter.cpp - Dwarf Printer ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Emit general DWARF directives.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfPrinter.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <ostream>
+
+using namespace llvm;
+
+Dwarf::Dwarf(raw_ostream &OS, AsmPrinter *A, const TargetAsmInfo *T,
+ const char *flavor)
+: O(OS), Asm(A), TAI(T), TD(Asm->TM.getTargetData()),
+ RI(Asm->TM.getRegisterInfo()), M(NULL), MF(NULL), MMI(NULL),
+ SubprogramCount(0), Flavor(flavor), SetCounter(1) {}
+
+void Dwarf::PrintRelDirective(bool Force32Bit, bool isInSection) const {
+ if (isInSection && TAI->getDwarfSectionOffsetDirective())
+ O << TAI->getDwarfSectionOffsetDirective();
+ else if (Force32Bit || TD->getPointerSize() == sizeof(int32_t))
+ O << TAI->getData32bitsDirective();
+ else
+ O << TAI->getData64bitsDirective();
+}
+
+/// PrintLabelName - Print label name in form used by Dwarf writer.
+///
+void Dwarf::PrintLabelName(const char *Tag, unsigned Number) const {
+ O << TAI->getPrivateGlobalPrefix() << Tag;
+ if (Number) O << Number;
+}
+void Dwarf::PrintLabelName(const char *Tag, unsigned Number,
+ const char *Suffix) const {
+ O << TAI->getPrivateGlobalPrefix() << Tag;
+ if (Number) O << Number;
+ O << Suffix;
+}
+
+/// EmitLabel - Emit location label for internal use by Dwarf.
+///
+void Dwarf::EmitLabel(const char *Tag, unsigned Number) const {
+ PrintLabelName(Tag, Number);
+ O << ":\n";
+}
+
+/// EmitReference - Emit a reference to a label.
+///
+void Dwarf::EmitReference(const char *Tag, unsigned Number,
+ bool IsPCRelative, bool Force32Bit) const {
+ PrintRelDirective(Force32Bit);
+ PrintLabelName(Tag, Number);
+ if (IsPCRelative) O << "-" << TAI->getPCSymbol();
+}
+void Dwarf::EmitReference(const std::string &Name, bool IsPCRelative,
+ bool Force32Bit) const {
+ PrintRelDirective(Force32Bit);
+ O << Name;
+ if (IsPCRelative) O << "-" << TAI->getPCSymbol();
+}
+
+/// EmitDifference - Emit the difference between two labels. Some assemblers
+/// do not behave correctly with absolute expressions in data directives, so
+/// there is an option (needsSet) to use an intermediary set expression.
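+/// For illustration (hypothetical label names and directives), with
+/// needsSet() the output resembles:
+///   .set Lset1debug, Llabel2-Llabel1
+///   .long Lset1debug
+/// and without it simply:
+///   .long Llabel2-Llabel1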
+void Dwarf::EmitDifference(const char *TagHi, unsigned NumberHi,
+ const char *TagLo, unsigned NumberLo,
+ bool IsSmall) {
+ if (TAI->needsSet()) {
+ O << "\t.set\t";
+ PrintLabelName("set", SetCounter, Flavor);
+ O << ",";
+ PrintLabelName(TagHi, NumberHi);
+ O << "-";
+ PrintLabelName(TagLo, NumberLo);
+ O << "\n";
+
+ PrintRelDirective(IsSmall);
+ PrintLabelName("set", SetCounter, Flavor);
+ ++SetCounter;
+ } else {
+ PrintRelDirective(IsSmall);
+ PrintLabelName(TagHi, NumberHi);
+ O << "-";
+ PrintLabelName(TagLo, NumberLo);
+ }
+}
+
+void Dwarf::EmitSectionOffset(const char* Label, const char* Section,
+ unsigned LabelNumber, unsigned SectionNumber,
+ bool IsSmall, bool isEH,
+ bool useSet) {
+ bool printAbsolute = false;
+ if (isEH)
+ printAbsolute = TAI->isAbsoluteEHSectionOffsets();
+ else
+ printAbsolute = TAI->isAbsoluteDebugSectionOffsets();
+
+ if (TAI->needsSet() && useSet) {
+ O << "\t.set\t";
+ PrintLabelName("set", SetCounter, Flavor);
+ O << ",";
+ PrintLabelName(Label, LabelNumber);
+
+ if (!printAbsolute) {
+ O << "-";
+ PrintLabelName(Section, SectionNumber);
+ }
+
+ O << "\n";
+ PrintRelDirective(IsSmall);
+ PrintLabelName("set", SetCounter, Flavor);
+ ++SetCounter;
+ } else {
+ PrintRelDirective(IsSmall, true);
+ PrintLabelName(Label, LabelNumber);
+
+ if (!printAbsolute) {
+ O << "-";
+ PrintLabelName(Section, SectionNumber);
+ }
+ }
+}
+
+/// EmitFrameMoves - Emit frame instructions to describe the layout of the
+/// frame.
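+/// For illustration, a prologue move that sets the CFA to an offset from the
+/// stack pointer lowers to DW_CFA_def_cfa / DW_CFA_def_cfa_offset, and a
+/// callee-saved register spill lowers to one of the DW_CFA_offset forms (a
+/// sketch of the cases handled below, not an exhaustive list).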
+void Dwarf::EmitFrameMoves(const char *BaseLabel, unsigned BaseLabelID,
+ const std::vector<MachineMove> &Moves, bool isEH) {
+ int stackGrowth =
+ Asm->TM.getFrameInfo()->getStackGrowthDirection() ==
+ TargetFrameInfo::StackGrowsUp ?
+ TD->getPointerSize() : -TD->getPointerSize();
+ bool IsLocal = BaseLabel && strcmp(BaseLabel, "label") == 0;
+
+ for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
+ const MachineMove &Move = Moves[i];
+ unsigned LabelID = Move.getLabelID();
+
+ if (LabelID) {
+ LabelID = MMI->MappedLabel(LabelID);
+
+ // Throw out move if the label is invalid.
+ if (!LabelID) continue;
+ }
+
+ const MachineLocation &Dst = Move.getDestination();
+ const MachineLocation &Src = Move.getSource();
+
+ // Advance row if new location.
+ if (BaseLabel && LabelID && (BaseLabelID != LabelID || !IsLocal)) {
+ Asm->EmitInt8(dwarf::DW_CFA_advance_loc4);
+ Asm->EOL("DW_CFA_advance_loc4");
+ EmitDifference("label", LabelID, BaseLabel, BaseLabelID, true);
+ Asm->EOL();
+
+ BaseLabelID = LabelID;
+ BaseLabel = "label";
+ IsLocal = true;
+ }
+
+ // If advancing cfa.
+ if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
+ if (!Src.isReg()) {
+ if (Src.getReg() == MachineLocation::VirtualFP) {
+ Asm->EmitInt8(dwarf::DW_CFA_def_cfa_offset);
+ Asm->EOL("DW_CFA_def_cfa_offset");
+ } else {
+ Asm->EmitInt8(dwarf::DW_CFA_def_cfa);
+ Asm->EOL("DW_CFA_def_cfa");
+ Asm->EmitULEB128Bytes(RI->getDwarfRegNum(Src.getReg(), isEH));
+ Asm->EOL("Register");
+ }
+
+ int Offset = -Src.getOffset();
+
+ Asm->EmitULEB128Bytes(Offset);
+ Asm->EOL("Offset");
+ } else {
+        assert(0 && "Machine move not supported yet.");
+ }
+ } else if (Src.isReg() &&
+ Src.getReg() == MachineLocation::VirtualFP) {
+ if (Dst.isReg()) {
+ Asm->EmitInt8(dwarf::DW_CFA_def_cfa_register);
+ Asm->EOL("DW_CFA_def_cfa_register");
+ Asm->EmitULEB128Bytes(RI->getDwarfRegNum(Dst.getReg(), isEH));
+ Asm->EOL("Register");
+ } else {
+        assert(0 && "Machine move not supported yet.");
+ }
+ } else {
+ unsigned Reg = RI->getDwarfRegNum(Src.getReg(), isEH);
+ int Offset = Dst.getOffset() / stackGrowth;
+
+ if (Offset < 0) {
+ Asm->EmitInt8(dwarf::DW_CFA_offset_extended_sf);
+ Asm->EOL("DW_CFA_offset_extended_sf");
+ Asm->EmitULEB128Bytes(Reg);
+ Asm->EOL("Reg");
+ Asm->EmitSLEB128Bytes(Offset);
+ Asm->EOL("Offset");
+ } else if (Reg < 64) {
+ Asm->EmitInt8(dwarf::DW_CFA_offset + Reg);
+ if (Asm->isVerbose())
+ Asm->EOL("DW_CFA_offset + Reg (" + utostr(Reg) + ")");
+ else
+ Asm->EOL();
+ Asm->EmitULEB128Bytes(Offset);
+ Asm->EOL("Offset");
+ } else {
+ Asm->EmitInt8(dwarf::DW_CFA_offset_extended);
+ Asm->EOL("DW_CFA_offset_extended");
+ Asm->EmitULEB128Bytes(Reg);
+ Asm->EOL("Reg");
+ Asm->EmitULEB128Bytes(Offset);
+ Asm->EOL("Offset");
+ }
+ }
+ }
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfPrinter.h b/lib/CodeGen/AsmPrinter/DwarfPrinter.h
new file mode 100644
index 0000000..6e75992
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfPrinter.h
@@ -0,0 +1,153 @@
+//===--- lib/CodeGen/DwarfPrinter.h - Dwarf Printer -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Emit general DWARF directives.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFPRINTER_H__
+#define CODEGEN_ASMPRINTER_DWARFPRINTER_H__
+
+#include "DwarfLabel.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+namespace llvm {
+ class AsmPrinter;
+ class MachineFunction;
+ class MachineModuleInfo;
+ class Module;
+ class TargetAsmInfo;
+ class TargetData;
+ class TargetRegisterInfo;
+
+ class VISIBILITY_HIDDEN Dwarf {
+ protected:
+    //===----------------------------------------------------------------===//
+ // Core attributes used by the DWARF printer.
+ //
+
+ /// O - Stream to .s file.
+ ///
+ raw_ostream &O;
+
+ /// Asm - Target of Dwarf emission.
+ ///
+ AsmPrinter *Asm;
+
+ /// TAI - Target asm information.
+ ///
+ const TargetAsmInfo *TAI;
+
+ /// TD - Target data.
+ ///
+ const TargetData *TD;
+
+ /// RI - Register Information.
+ ///
+ const TargetRegisterInfo *RI;
+
+ /// M - Current module.
+ ///
+ Module *M;
+
+ /// MF - Current machine function.
+ ///
+ MachineFunction *MF;
+
+ /// MMI - Collected machine module information.
+ ///
+ MachineModuleInfo *MMI;
+
+ /// SubprogramCount - The running count of functions being compiled.
+ ///
+ unsigned SubprogramCount;
+
+    /// Flavor - A unique string indicating what dwarf producer this is, used
+    /// to make labels unique.
+ ///
+ const char * const Flavor;
+
+ /// SetCounter - A unique number for each '.set' directive.
+ ///
+ unsigned SetCounter;
+
+ Dwarf(raw_ostream &OS, AsmPrinter *A, const TargetAsmInfo *T,
+ const char *flavor);
+ public:
+ //===------------------------------------------------------------------===//
+ // Accessors.
+ //
+ const AsmPrinter *getAsm() const { return Asm; }
+ MachineModuleInfo *getMMI() const { return MMI; }
+ const TargetAsmInfo *getTargetAsmInfo() const { return TAI; }
+ const TargetData *getTargetData() const { return TD; }
+
+ void PrintRelDirective(bool Force32Bit = false,
+ bool isInSection = false) const;
+
+
+ /// PrintLabelName - Print label name in form used by Dwarf writer.
+ ///
+ void PrintLabelName(const DWLabel &Label) const {
+ PrintLabelName(Label.getTag(), Label.getNumber());
+ }
+ void PrintLabelName(const char *Tag, unsigned Number) const;
+ void PrintLabelName(const char *Tag, unsigned Number,
+ const char *Suffix) const;
+
+ /// EmitLabel - Emit location label for internal use by Dwarf.
+ ///
+ void EmitLabel(const DWLabel &Label) const {
+ EmitLabel(Label.getTag(), Label.getNumber());
+ }
+ void EmitLabel(const char *Tag, unsigned Number) const;
+
+ /// EmitReference - Emit a reference to a label.
+ ///
+ void EmitReference(const DWLabel &Label, bool IsPCRelative = false,
+ bool Force32Bit = false) const {
+ EmitReference(Label.getTag(), Label.getNumber(),
+ IsPCRelative, Force32Bit);
+ }
+ void EmitReference(const char *Tag, unsigned Number,
+ bool IsPCRelative = false,
+ bool Force32Bit = false) const;
+ void EmitReference(const std::string &Name, bool IsPCRelative = false,
+ bool Force32Bit = false) const;
+
+    /// EmitDifference - Emit the difference between two labels. Some
+    /// assemblers do not behave correctly with absolute expressions in data
+    /// directives, so there is an option (needsSet) to use an intermediary
+    /// set expression.
+ void EmitDifference(const DWLabel &LabelHi, const DWLabel &LabelLo,
+ bool IsSmall = false) {
+ EmitDifference(LabelHi.getTag(), LabelHi.getNumber(),
+ LabelLo.getTag(), LabelLo.getNumber(),
+ IsSmall);
+ }
+ void EmitDifference(const char *TagHi, unsigned NumberHi,
+ const char *TagLo, unsigned NumberLo,
+ bool IsSmall = false);
+
+ void EmitSectionOffset(const char* Label, const char* Section,
+ unsigned LabelNumber, unsigned SectionNumber,
+ bool IsSmall = false, bool isEH = false,
+ bool useSet = true);
+
+ /// EmitFrameMoves - Emit frame instructions to describe the layout of the
+ /// frame.
+ void EmitFrameMoves(const char *BaseLabel, unsigned BaseLabelID,
+ const std::vector<MachineMove> &Moves, bool isEH);
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfWriter.cpp b/lib/CodeGen/AsmPrinter/DwarfWriter.cpp
new file mode 100644
index 0000000..483ee559
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfWriter.cpp
@@ -0,0 +1,129 @@
+//===-- llvm/CodeGen/DwarfWriter.cpp - Dwarf Framework --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "DwarfDebug.h"
+#include "DwarfException.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+using namespace llvm;
+
+static RegisterPass<DwarfWriter>
+X("dwarfwriter", "DWARF Information Writer");
+char DwarfWriter::ID = 0;
+
+//===----------------------------------------------------------------------===//
+/// DwarfWriter Implementation
+///
+
+DwarfWriter::DwarfWriter()
+ : ImmutablePass(&ID), DD(0), DE(0) {}
+
+DwarfWriter::~DwarfWriter() {
+ delete DE;
+ delete DD;
+}
+
+/// BeginModule - Emit all Dwarf sections that should come prior to the
+/// content.
+void DwarfWriter::BeginModule(Module *M,
+ MachineModuleInfo *MMI,
+ raw_ostream &OS, AsmPrinter *A,
+ const TargetAsmInfo *T) {
+ DE = new DwarfException(OS, A, T);
+ DD = new DwarfDebug(OS, A, T);
+ DE->BeginModule(M);
+ DD->BeginModule(M);
+ DD->SetDebugInfo(MMI);
+ DE->SetModuleInfo(MMI);
+}
+
+/// EndModule - Emit all Dwarf sections that should come after the content.
+///
+void DwarfWriter::EndModule() {
+ DE->EndModule();
+ DD->EndModule();
+}
+
+/// BeginFunction - Gather pre-function debug information. Assumes being
+/// emitted immediately after the function entry point.
+void DwarfWriter::BeginFunction(MachineFunction *MF) {
+ DE->BeginFunction(MF);
+ DD->BeginFunction(MF);
+}
+
+/// EndFunction - Gather and emit post-function debug information.
+///
+void DwarfWriter::EndFunction(MachineFunction *MF) {
+ DD->EndFunction(MF);
+ DE->EndFunction();
+
+ if (MachineModuleInfo *MMI = DD->getMMI() ? DD->getMMI() : DE->getMMI())
+ // Clear function debug information.
+ MMI->EndFunction();
+}
+
+/// RecordSourceLine - Records location information and associates it with a
+/// label. Returns a unique label ID used to generate a label and provide
+/// correspondence to the source line list.
+unsigned DwarfWriter::RecordSourceLine(unsigned Line, unsigned Col,
+ DICompileUnit CU) {
+ return DD->RecordSourceLine(Line, Col, CU);
+}
+
+/// RecordRegionStart - Indicate the start of a region.
+unsigned DwarfWriter::RecordRegionStart(GlobalVariable *V) {
+ return DD->RecordRegionStart(V);
+}
+
+/// RecordRegionEnd - Indicate the end of a region.
+unsigned DwarfWriter::RecordRegionEnd(GlobalVariable *V) {
+ return DD->RecordRegionEnd(V);
+}
+
+/// getRecordSourceLineCount - Count source lines.
+unsigned DwarfWriter::getRecordSourceLineCount() {
+ return DD->getRecordSourceLineCount();
+}
+
+/// RecordVariable - Indicate the declaration of a local variable.
+///
+void DwarfWriter::RecordVariable(GlobalVariable *GV, unsigned FrameIndex,
+ const MachineInstr *MI) {
+ DD->RecordVariable(GV, FrameIndex, MI);
+}
+
+/// ShouldEmitDwarfDebug - Returns true if Dwarf debugging declarations should
+/// be emitted.
+bool DwarfWriter::ShouldEmitDwarfDebug() const {
+ return DD && DD->ShouldEmitDwarfDebug();
+}
+
+/// RecordInlinedFnStart - Indicate that subprogram SP is inlined at the
+/// source location given by Line and Col within compile unit CU.
+unsigned DwarfWriter::RecordInlinedFnStart(DISubprogram SP, DICompileUnit CU,
+ unsigned Line, unsigned Col) {
+ return DD->RecordInlinedFnStart(SP, CU, Line, Col);
+}
+
+/// RecordInlinedFnEnd - Indicate the end of inlined subroutine.
+unsigned DwarfWriter::RecordInlinedFnEnd(DISubprogram SP) {
+ return DD->RecordInlinedFnEnd(SP);
+}
+
+/// RecordVariableScope - Record scope for the variable declared by
+/// DeclareMI. DeclareMI must describe TargetInstrInfo::DECLARE.
+void DwarfWriter::RecordVariableScope(DIVariable &DV,
+ const MachineInstr *DeclareMI) {
+ DD->RecordVariableScope(DV, DeclareMI);
+}
diff --git a/lib/CodeGen/AsmPrinter/Makefile b/lib/CodeGen/AsmPrinter/Makefile
new file mode 100644
index 0000000..cb5b3f6
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/CodeGen/AsmPrinter/Makefile ---------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMAsmPrinter
+PARALLEL_DIRS =
+BUILD_ARCHIVE = 1
+DONT_BUILD_RELINKED = 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
new file mode 100644
index 0000000..8ba903a
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -0,0 +1,160 @@
+//===-- OcamlGCPrinter.cpp - Ocaml frametable emitter ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements printing the assembly code for an Ocaml frametable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+ class VISIBILITY_HIDDEN OcamlGCMetadataPrinter : public GCMetadataPrinter {
+ public:
+ void beginAssembly(raw_ostream &OS, AsmPrinter &AP,
+ const TargetAsmInfo &TAI);
+
+ void finishAssembly(raw_ostream &OS, AsmPrinter &AP,
+ const TargetAsmInfo &TAI);
+ };
+
+}
+
+static GCMetadataPrinterRegistry::Add<OcamlGCMetadataPrinter>
+Y("ocaml", "ocaml 3.10-compatible collector");
+
+void llvm::linkOcamlGCPrinter() { }
+
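+// Illustrative example (the prefix is target-dependent): a module identifier
+// of "foo.ml" with Id "code_begin" and global prefix "_" mangles to
+// "_camlFoo__code_begin".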
+static void EmitCamlGlobal(const Module &M, raw_ostream &OS, AsmPrinter &AP,
+ const TargetAsmInfo &TAI, const char *Id) {
+ const std::string &MId = M.getModuleIdentifier();
+
+ std::string Mangled;
+ Mangled += TAI.getGlobalPrefix();
+ Mangled += "caml";
+ size_t Letter = Mangled.size();
+ Mangled.append(MId.begin(), std::find(MId.begin(), MId.end(), '.'));
+ Mangled += "__";
+ Mangled += Id;
+
+ // Capitalize the first letter of the module name.
+ Mangled[Letter] = toupper(Mangled[Letter]);
+
+ if (const char *GlobalDirective = TAI.getGlobalDirective())
+ OS << GlobalDirective << Mangled << "\n";
+ OS << Mangled << ":\n";
+}
+
+void OcamlGCMetadataPrinter::beginAssembly(raw_ostream &OS, AsmPrinter &AP,
+ const TargetAsmInfo &TAI) {
+ AP.SwitchToSection(TAI.getTextSection());
+ EmitCamlGlobal(getModule(), OS, AP, TAI, "code_begin");
+
+ AP.SwitchToSection(TAI.getDataSection());
+ EmitCamlGlobal(getModule(), OS, AP, TAI, "data_begin");
+}
+
+/// finishAssembly - Print the frametable. The ocaml frametable format is thus:
+///
+/// extern "C" struct align(sizeof(intptr_t)) {
+/// uint16_t NumDescriptors;
+/// struct align(sizeof(intptr_t)) {
+/// void *ReturnAddress;
+/// uint16_t FrameSize;
+/// uint16_t NumLiveOffsets;
+/// uint16_t LiveOffsets[NumLiveOffsets];
+/// } Descriptors[NumDescriptors];
+/// } caml${module}__frametable;
+///
+/// Note that this precludes programs from using stack frames larger than 64K
+/// (FrameSize and LiveOffsets would overflow). The printer aborts if either
+/// condition is detected in a function which uses the GC.
+///
+void OcamlGCMetadataPrinter::finishAssembly(raw_ostream &OS, AsmPrinter &AP,
+ const TargetAsmInfo &TAI) {
+ const char *AddressDirective;
+ int AddressAlignLog;
+ if (AP.TM.getTargetData()->getPointerSize() == sizeof(int32_t)) {
+ AddressDirective = TAI.getData32bitsDirective();
+ AddressAlignLog = 2;
+ } else {
+ AddressDirective = TAI.getData64bitsDirective();
+ AddressAlignLog = 3;
+ }
+
+ AP.SwitchToSection(TAI.getTextSection());
+ EmitCamlGlobal(getModule(), OS, AP, TAI, "code_end");
+
+ AP.SwitchToSection(TAI.getDataSection());
+ EmitCamlGlobal(getModule(), OS, AP, TAI, "data_end");
+
+ OS << AddressDirective << 0; // FIXME: Why does ocaml emit this??
+ AP.EOL();
+
+ AP.SwitchToSection(TAI.getDataSection());
+ EmitCamlGlobal(getModule(), OS, AP, TAI, "frametable");
+
+ for (iterator I = begin(), IE = end(); I != IE; ++I) {
+ GCFunctionInfo &FI = **I;
+
+ uint64_t FrameSize = FI.getFrameSize();
+ if (FrameSize >= 1<<16) {
+ cerr << "Function '" << FI.getFunction().getNameStart()
+ << "' is too large for the ocaml GC! "
+ << "Frame size " << FrameSize << " >= 65536.\n";
+ cerr << "(" << uintptr_t(&FI) << ")\n";
+ abort(); // Very rude!
+ }
+
+ OS << "\t" << TAI.getCommentString() << " live roots for "
+ << FI.getFunction().getNameStart() << "\n";
+
+ for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
+ size_t LiveCount = FI.live_size(J);
+ if (LiveCount >= 1<<16) {
+ cerr << "Function '" << FI.getFunction().getNameStart()
+ << "' is too large for the ocaml GC! "
+ << "Live root count " << LiveCount << " >= 65536.\n";
+ abort(); // Very rude!
+ }
+
+ OS << AddressDirective
+ << TAI.getPrivateGlobalPrefix() << "label" << J->Num;
+ AP.EOL("call return address");
+
+ AP.EmitInt16(FrameSize);
+ AP.EOL("stack frame size");
+
+ AP.EmitInt16(LiveCount);
+ AP.EOL("live root count");
+
+ for (GCFunctionInfo::live_iterator K = FI.live_begin(J),
+ KE = FI.live_end(J); K != KE; ++K) {
+ assert(K->StackOffset < 1<<16 &&
+ "GC root stack offset is outside of fixed stack frame and out "
+ "of range for ocaml GC!");
+
+ OS << "\t.word\t" << K->StackOffset;
+ AP.EOL("stack offset");
+ }
+
+ AP.EmitAlignment(AddressAlignLog);
+ }
+ }
+}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
new file mode 100644
index 0000000..2635303
--- /dev/null
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -0,0 +1,1204 @@
+//===-- BranchFolding.cpp - Fold machine code branch instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass forwards branches to unconditional branches to make them branch
+// directly to the target block. This pass often results in dead MBBs, which
+// it then removes.
+//
+// Note that this pass must be run after register allocation; it cannot handle
+// SSA form.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "branchfolding"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumBranchOpts, "Number of branches optimized");
+STATISTIC(NumTailMerge , "Number of block tails merged");
+static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
+ cl::init(cl::BOU_UNSET), cl::Hidden);
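+// cl::BOU_UNSET defers to the defaultEnableTailMerge argument passed to the
+// BranchFolder constructor; an explicit command-line value overrides it.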
+// Throttle for huge numbers of predecessors (compile speed problems)
+static cl::opt<unsigned>
+TailMergeThreshold("tail-merge-threshold",
+ cl::desc("Max number of predecessors to consider tail merging"),
+ cl::init(150), cl::Hidden);
+
+namespace {
+ struct VISIBILITY_HIDDEN BranchFolder : public MachineFunctionPass {
+ static char ID;
+ explicit BranchFolder(bool defaultEnableTailMerge) :
+ MachineFunctionPass(&ID) {
+ switch (FlagEnableTailMerge) {
+ case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break;
+ case cl::BOU_TRUE: EnableTailMerge = true; break;
+ case cl::BOU_FALSE: EnableTailMerge = false; break;
+ }
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual const char *getPassName() const { return "Control Flow Optimizer"; }
+ const TargetInstrInfo *TII;
+ MachineModuleInfo *MMI;
+ bool MadeChange;
+ private:
+ // Tail Merging.
+ bool EnableTailMerge;
+ bool TailMergeBlocks(MachineFunction &MF);
+ bool TryMergeBlocks(MachineBasicBlock* SuccBB,
+ MachineBasicBlock* PredBB);
+ void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+ MachineBasicBlock *NewDest);
+ MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
+ MachineBasicBlock::iterator BBI1);
+ unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength);
+ void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
+ MachineBasicBlock* PredBB);
+ unsigned CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+ unsigned maxCommonTailLength);
+
+ typedef std::pair<unsigned,MachineBasicBlock*> MergePotentialsElt;
+ typedef std::vector<MergePotentialsElt>::iterator MPIterator;
+ std::vector<MergePotentialsElt> MergePotentials;
+
+ typedef std::pair<MPIterator, MachineBasicBlock::iterator> SameTailElt;
+ std::vector<SameTailElt> SameTails;
+
+ const TargetRegisterInfo *RegInfo;
+ RegScavenger *RS;
+    // Branch optimization.
+ bool OptimizeBranches(MachineFunction &MF);
+ void OptimizeBlock(MachineBasicBlock *MBB);
+ void RemoveDeadBlock(MachineBasicBlock *MBB);
+ bool OptimizeImpDefsBlock(MachineBasicBlock *MBB);
+
+ bool CanFallThrough(MachineBasicBlock *CurBB);
+ bool CanFallThrough(MachineBasicBlock *CurBB, bool BranchUnAnalyzable,
+ MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond);
+ };
+ char BranchFolder::ID = 0;
+}
+
+FunctionPass *llvm::createBranchFoldingPass(bool DefaultEnableTailMerge) {
+  return new BranchFolder(DefaultEnableTailMerge);
+}
+
+/// RemoveDeadBlock - Remove the specified dead machine basic block from the
+/// function, updating the CFG.
+void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
+ assert(MBB->pred_empty() && "MBB must be dead!");
+ DOUT << "\nRemoving MBB: " << *MBB;
+
+ MachineFunction *MF = MBB->getParent();
+ // drop all successors.
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_end()-1);
+
+ // If there are any labels in the basic block, unregister them from
+ // MachineModuleInfo.
+ if (MMI && !MBB->empty()) {
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ if (I->isLabel())
+ // The label ID # is always operand #0, an immediate.
+ MMI->InvalidateLabel(I->getOperand(0).getImm());
+ }
+ }
+
+ // Remove the block.
+ MF->erase(MBB);
+}
+
+/// OptimizeImpDefsBlock - If a basic block is just a bunch of implicit_def
+/// followed by terminators, and if the implicitly defined registers are not
+/// used by the terminators, remove those implicit_def's. e.g.
+/// BB1:
+/// r0 = implicit_def
+/// r1 = implicit_def
+/// br
+/// This block can be optimized away later if the implicit instructions are
+/// removed.
+bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) {
+ SmallSet<unsigned, 4> ImpDefRegs;
+ MachineBasicBlock::iterator I = MBB->begin();
+ while (I != MBB->end()) {
+ if (I->getOpcode() != TargetInstrInfo::IMPLICIT_DEF)
+ break;
+ unsigned Reg = I->getOperand(0).getReg();
+ ImpDefRegs.insert(Reg);
+ for (const unsigned *SubRegs = RegInfo->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs)
+ ImpDefRegs.insert(SubReg);
+ ++I;
+ }
+ if (ImpDefRegs.empty())
+ return false;
+
+ MachineBasicBlock::iterator FirstTerm = I;
+ while (I != MBB->end()) {
+ if (!TII->isUnpredicatedTerminator(I))
+ return false;
+ // See if it uses any of the implicitly defined registers.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = I->getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (ImpDefRegs.count(Reg))
+ return false;
+ }
+ ++I;
+ }
+
+ I = MBB->begin();
+ while (I != FirstTerm) {
+ MachineInstr *ImpDefMI = &*I;
+ ++I;
+ MBB->erase(ImpDefMI);
+ }
+
+ return true;
+}
+
+bool BranchFolder::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+ if (!TII) return false;
+
+ RegInfo = MF.getTarget().getRegisterInfo();
+
+ // Fix CFG. The later algorithms expect it to be right.
+ bool EverMadeChange = false;
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; I++) {
+ MachineBasicBlock *MBB = I, *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true))
+ EverMadeChange |= MBB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
+ EverMadeChange |= OptimizeImpDefsBlock(MBB);
+ }
+
+ RS = RegInfo->requiresRegisterScavenging(MF) ? new RegScavenger() : NULL;
+
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+
+ bool MadeChangeThisIteration = true;
+ while (MadeChangeThisIteration) {
+ MadeChangeThisIteration = false;
+ MadeChangeThisIteration |= TailMergeBlocks(MF);
+ MadeChangeThisIteration |= OptimizeBranches(MF);
+ EverMadeChange |= MadeChangeThisIteration;
+ }
+
+  // See if any jump tables have become mergeable or dead as the code
+  // generator did its thing.
+ MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JTs = JTI->getJumpTables();
+ if (!JTs.empty()) {
+ // Figure out how these jump tables should be merged.
+ std::vector<unsigned> JTMapping;
+ JTMapping.reserve(JTs.size());
+
+ // We always keep the 0th jump table.
+ JTMapping.push_back(0);
+
+ // Scan the jump tables, seeing if there are any duplicates. Note that this
+ // is N^2, which should be fixed someday.
+ for (unsigned i = 1, e = JTs.size(); i != e; ++i)
+ JTMapping.push_back(JTI->getJumpTableIndex(JTs[i].MBBs));
+
+    // If a jump table was merged with another one, walk the function
+    // rewriting references to jump tables to reference the new JT IDs.
+    // Keep track of whether we see a jump table idx; if not, we can delete
+    // the JT.
+ BitVector JTIsLive(JTs.size());
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end();
+ BB != E; ++BB) {
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+ I != E; ++I)
+ for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+ MachineOperand &Op = I->getOperand(op);
+ if (!Op.isJTI()) continue;
+ unsigned NewIdx = JTMapping[Op.getIndex()];
+ Op.setIndex(NewIdx);
+
+ // Remember that this JT is live.
+ JTIsLive.set(NewIdx);
+ }
+ }
+
+ // Finally, remove dead jump tables. This happens either because the
+ // indirect jump was unreachable (and thus deleted) or because the jump
+ // table was merged with some other one.
+ for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i)
+ if (!JTIsLive.test(i)) {
+ JTI->RemoveJumpTable(i);
+ EverMadeChange = true;
+ }
+ }
+
+ delete RS;
+ return EverMadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Tail Merging of Blocks
+//===----------------------------------------------------------------------===//
+
+/// HashMachineInstr - Compute a hash value for MI and its operands.
+static unsigned HashMachineInstr(const MachineInstr *MI) {
+ unsigned Hash = MI->getOpcode();
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &Op = MI->getOperand(i);
+
+ // Merge in bits from the operand if easy.
+ unsigned OperandHash = 0;
+ switch (Op.getType()) {
+ case MachineOperand::MO_Register: OperandHash = Op.getReg(); break;
+ case MachineOperand::MO_Immediate: OperandHash = Op.getImm(); break;
+ case MachineOperand::MO_MachineBasicBlock:
+ OperandHash = Op.getMBB()->getNumber();
+ break;
+ case MachineOperand::MO_FrameIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ OperandHash = Op.getIndex();
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+      // Global addresses and external symbols are too hard to hash well;
+      // don't bother, but do pull in the offset.
+ OperandHash = Op.getOffset();
+ break;
+ default: break;
+ }
+
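+    // Fold the operand kind into the low bits and shift by the operand index
+    // (masked to 31 so the shift amount stays well defined) so that the same
+    // operands in a different order produce different hashes.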
+ Hash += ((OperandHash << 3) | Op.getType()) << (i&31);
+ }
+ return Hash;
+}
+
+/// HashEndOfMBB - Hash the last few instructions in the MBB. For blocks
+/// with no successors, we hash two instructions, because cross-jumping
+/// only saves code when at least two instructions are removed (since a
+/// branch must be inserted). For blocks with a successor, one of the
+/// two blocks to be tail-merged will end with a branch already, so
+/// it is profitable to cross-jump even for a single instruction.
+
+static unsigned HashEndOfMBB(const MachineBasicBlock *MBB,
+ unsigned minCommonTailLength) {
+ MachineBasicBlock::const_iterator I = MBB->end();
+ if (I == MBB->begin())
+ return 0; // Empty MBB.
+
+ --I;
+ unsigned Hash = HashMachineInstr(I);
+
+ if (I == MBB->begin() || minCommonTailLength == 1)
+ return Hash; // Single instr MBB.
+
+ --I;
+ // Hash in the second-to-last instruction.
+ Hash ^= HashMachineInstr(I) << 2;
+ return Hash;
+}
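+
+// The hash is only used to bucket candidate blocks; blocks that collide are
+// still compared instruction-by-instruction (via ComputeCommonTailLength
+// below), so a collision costs compile time, never correctness.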
+
+/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
+/// of instructions they actually have in common at their ends. Return
+/// iterators for the first shared instruction in each block.
+static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
+ MachineBasicBlock *MBB2,
+ MachineBasicBlock::iterator &I1,
+ MachineBasicBlock::iterator &I2) {
+ I1 = MBB1->end();
+ I2 = MBB2->end();
+
+ unsigned TailLen = 0;
+ while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
+ --I1; --I2;
+ if (!I1->isIdenticalTo(I2) ||
+ // FIXME: This check is dubious. It's used to get around a problem where
+ // people incorrectly expect inline asm directives to remain in the same
+ // relative order. This is untenable because normal compiler
+ // optimizations (like this one) may reorder and/or merge these
+ // directives.
+ I1->getOpcode() == TargetInstrInfo::INLINEASM) {
+ ++I1; ++I2;
+ break;
+ }
+ ++TailLen;
+ }
+ return TailLen;
+}
+
+/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
+/// after it, replacing it with an unconditional branch to NewDest.
+void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+ MachineBasicBlock *NewDest) {
+ MachineBasicBlock *OldBB = OldInst->getParent();
+
+ // Remove all the old successors of OldBB from the CFG.
+ while (!OldBB->succ_empty())
+ OldBB->removeSuccessor(OldBB->succ_begin());
+
+ // Remove all the dead instructions from the end of OldBB.
+ OldBB->erase(OldInst, OldBB->end());
+
+  // If NewDest isn't immediately after OldBB, insert a branch to it.
+ if (++MachineFunction::iterator(OldBB) != MachineFunction::iterator(NewDest))
+ TII->InsertBranch(*OldBB, NewDest, 0, SmallVector<MachineOperand, 0>());
+ OldBB->addSuccessor(NewDest);
+ ++NumTailMerge;
+}
+
+/// SplitMBBAt - Given a machine basic block and an iterator into it, split the
+/// MBB so that the part before the iterator falls through into the part
+/// starting at the iterator. This returns the new MBB.
+MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
+ MachineBasicBlock::iterator BBI1) {
+ MachineFunction &MF = *CurMBB.getParent();
+
+ // Create the fall-through block.
+ MachineFunction::iterator MBBI = &CurMBB;
+  MachineBasicBlock *NewMBB =
+    MF.CreateMachineBasicBlock(CurMBB.getBasicBlock());
+ CurMBB.getParent()->insert(++MBBI, NewMBB);
+
+ // Move all the successors of this block to the specified block.
+ NewMBB->transferSuccessors(&CurMBB);
+
+ // Add an edge from CurMBB to NewMBB for the fall-through.
+ CurMBB.addSuccessor(NewMBB);
+
+ // Splice the code over.
+ NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
+
+ // For targets that use the register scavenger, we must maintain LiveIns.
+ if (RS) {
+ RS->enterBasicBlock(&CurMBB);
+ if (!CurMBB.empty())
+ RS->forward(prior(CurMBB.end()));
+ BitVector RegsLiveAtExit(RegInfo->getNumRegs());
+ RS->getRegsUsed(RegsLiveAtExit, false);
+ for (unsigned int i=0, e=RegInfo->getNumRegs(); i!=e; i++)
+ if (RegsLiveAtExit[i])
+ NewMBB->addLiveIn(i);
+ }
+
+ return NewMBB;
+}
+
+/// EstimateRuntime - Make a rough estimate for how long it will take to run
+/// the specified code.
+static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator E) {
+ unsigned Time = 0;
+ for (; I != E; ++I) {
+ const TargetInstrDesc &TID = I->getDesc();
+ if (TID.isCall())
+ Time += 10;
+ else if (TID.mayLoad() || TID.mayStore())
+ Time += 2;
+ else
+ ++Time;
+ }
+ return Time;
+}
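+
+// The weights above (10 for a call, 2 for a load/store, 1 otherwise) are a
+// crude static cost model; CreateCommonTailOnlyBlock only uses them to rank
+// the candidate blocks against each other, not to predict real cycle counts.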
+
+// FixTail - CurMBB needs an unconditional branch to SuccBB added (we removed
+// these branches temporarily for tail merging). In the case where CurMBB ends
+// with a conditional branch to the next block, optimize by reversing the
+// test and conditionally branching to SuccBB instead.
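+// E.g. if CurMBB ends in "jcc NextBB" with no other branch, appending the
+// needed "jmp SuccBB" would leave two branches; instead the test is reversed
+// to "jncc SuccBB" and control falls through to NextBB.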
+
+static void FixTail(MachineBasicBlock* CurMBB, MachineBasicBlock *SuccBB,
+ const TargetInstrInfo *TII) {
+ MachineFunction *MF = CurMBB->getParent();
+ MachineFunction::iterator I = next(MachineFunction::iterator(CurMBB));
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ if (I != MF->end() &&
+ !TII->AnalyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
+ MachineBasicBlock *NextBB = I;
+ if (TBB == NextBB && !Cond.empty() && !FBB) {
+ if (!TII->ReverseBranchCondition(Cond)) {
+ TII->RemoveBranch(*CurMBB);
+ TII->InsertBranch(*CurMBB, SuccBB, NULL, Cond);
+ return;
+ }
+ }
+ }
+ TII->InsertBranch(*CurMBB, SuccBB, NULL, SmallVector<MachineOperand, 0>());
+}
+
+static bool MergeCompare(const std::pair<unsigned,MachineBasicBlock*> &p,
+ const std::pair<unsigned,MachineBasicBlock*> &q) {
+ if (p.first < q.first)
+ return true;
+ else if (p.first > q.first)
+ return false;
+ else if (p.second->getNumber() < q.second->getNumber())
+ return true;
+ else if (p.second->getNumber() > q.second->getNumber())
+ return false;
+ else {
+ // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
+ // an object with itself.
+#ifndef _GLIBCXX_DEBUG
+ assert(0 && "Predecessor appears twice");
+#endif
+ return false;
+ }
+}
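+
+// MergeCompare is the predicate passed to std::stable_sort in TryMergeBlocks:
+// sorting by hash first groups blocks with identical tails together, and the
+// block-number tie-break keeps the order deterministic from run to run.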
+
+/// ComputeSameTails - Look through all the blocks in MergePotentials that have
+/// hash CurHash (guaranteed to match the last element). Build the vector
+/// SameTails of all those that have the (same) largest number of instructions
+/// in common of any pair of these blocks. SameTails entries contain an
+/// iterator into MergePotentials (from which the MachineBasicBlock can be
+/// found) and a MachineBasicBlock::iterator into that MBB indicating the
+/// instruction where the matching code sequence begins.
+/// Order of elements in SameTails is the reverse of the order in which
+/// those blocks appear in MergePotentials (where they are not necessarily
+/// consecutive).
+unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
+ unsigned minCommonTailLength) {
+ unsigned maxCommonTailLength = 0U;
+ SameTails.clear();
+ MachineBasicBlock::iterator TrialBBI1, TrialBBI2;
+ MPIterator HighestMPIter = prior(MergePotentials.end());
+ for (MPIterator CurMPIter = prior(MergePotentials.end()),
+ B = MergePotentials.begin();
+ CurMPIter!=B && CurMPIter->first==CurHash;
+ --CurMPIter) {
+ for (MPIterator I = prior(CurMPIter); I->first==CurHash ; --I) {
+ unsigned CommonTailLen = ComputeCommonTailLength(
+ CurMPIter->second,
+ I->second,
+ TrialBBI1, TrialBBI2);
+ // If we will have to split a block, there should be at least
+ // minCommonTailLength instructions in common; if not, at worst
+ // we will be replacing a fallthrough into the common tail with a
+ // branch, which at worst breaks even with falling through into
+ // the duplicated common tail, so 1 instruction in common is enough.
+ // We will always pick a block we do not have to split as the common
+ // tail if there is one.
+ // (Empty blocks will get forwarded and need not be considered.)
+ if (CommonTailLen >= minCommonTailLength ||
+ (CommonTailLen > 0 &&
+ (TrialBBI1==CurMPIter->second->begin() ||
+ TrialBBI2==I->second->begin()))) {
+ if (CommonTailLen > maxCommonTailLength) {
+ SameTails.clear();
+ maxCommonTailLength = CommonTailLen;
+ HighestMPIter = CurMPIter;
+ SameTails.push_back(std::make_pair(CurMPIter, TrialBBI1));
+ }
+ if (HighestMPIter == CurMPIter &&
+ CommonTailLen == maxCommonTailLength)
+ SameTails.push_back(std::make_pair(I, TrialBBI2));
+ }
+ if (I==B)
+ break;
+ }
+ }
+ return maxCommonTailLength;
+}
+
+/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
+/// MergePotentials, restoring branches at ends of blocks as appropriate.
+void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
+ MachineBasicBlock* SuccBB,
+ MachineBasicBlock* PredBB) {
+ MPIterator CurMPIter, B;
+ for (CurMPIter = prior(MergePotentials.end()), B = MergePotentials.begin();
+ CurMPIter->first==CurHash;
+ --CurMPIter) {
+ // Put the unconditional branch back, if we need one.
+ MachineBasicBlock *CurMBB = CurMPIter->second;
+ if (SuccBB && CurMBB != PredBB)
+ FixTail(CurMBB, SuccBB, TII);
+ if (CurMPIter==B)
+ break;
+ }
+ if (CurMPIter->first!=CurHash)
+ CurMPIter++;
+ MergePotentials.erase(CurMPIter, MergePotentials.end());
+}
+
+/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
+/// only of the common tail. Create a block that does by splitting one.
+unsigned BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+ unsigned maxCommonTailLength) {
+ unsigned i, commonTailIndex;
+ unsigned TimeEstimate = ~0U;
+ for (i=0, commonTailIndex=0; i<SameTails.size(); i++) {
+ // Use PredBB if possible; that doesn't require a new branch.
+ if (SameTails[i].first->second==PredBB) {
+ commonTailIndex = i;
+ break;
+ }
+    // Otherwise, make a (fairly bogus) choice based on an estimate of how
+    // long it will take the various blocks to execute.
+ unsigned t = EstimateRuntime(SameTails[i].first->second->begin(),
+ SameTails[i].second);
+ if (t<=TimeEstimate) {
+ TimeEstimate = t;
+ commonTailIndex = i;
+ }
+ }
+
+ MachineBasicBlock::iterator BBI = SameTails[commonTailIndex].second;
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].first->second;
+
+ DOUT << "\nSplitting " << MBB->getNumber() << ", size " <<
+ maxCommonTailLength;
+
+ MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI);
+ SameTails[commonTailIndex].first->second = newMBB;
+ SameTails[commonTailIndex].second = newMBB->begin();
+ // If we split PredBB, newMBB is the new predecessor.
+ if (PredBB==MBB)
+ PredBB = newMBB;
+
+ return commonTailIndex;
+}
+
+// See if any of the blocks in MergePotentials (which all have a common single
+// successor, or all have no successor) can be tail-merged. If there is a
+// successor, any blocks in MergePotentials that are not tail-merged and
+// are not immediately before Succ must have an unconditional branch to
+// Succ added (but the predecessor/successor lists need no adjustment).
+// The lone predecessor of Succ that falls through into Succ,
+// if any, is given in PredBB.
+
+bool BranchFolder::TryMergeBlocks(MachineBasicBlock *SuccBB,
+ MachineBasicBlock* PredBB) {
+ // It doesn't make sense to save a single instruction since tail merging
+ // will add a jump.
+ // FIXME: Ask the target to provide the threshold?
+ unsigned minCommonTailLength = (SuccBB ? 1 : 2) + 1;
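+  // This works out to a threshold of 2 when the blocks share a successor (one
+  // of them already ends in a branch) and 3 when they do not (a fresh jump
+  // must be inserted). Shorter tails can still merge when no block needs to
+  // be split; see ComputeSameTails.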
+ MadeChange = false;
+
+ DOUT << "\nTryMergeBlocks " << MergePotentials.size() << '\n';
+
+ // Sort by hash value so that blocks with identical end sequences sort
+ // together.
+  std::stable_sort(MergePotentials.begin(), MergePotentials.end(),
+                   MergeCompare);
+
+ // Walk through equivalence sets looking for actual exact matches.
+ while (MergePotentials.size() > 1) {
+ unsigned CurHash = prior(MergePotentials.end())->first;
+
+ // Build SameTails, identifying the set of blocks with this hash code
+ // and with the maximum number of instructions in common.
+ unsigned maxCommonTailLength = ComputeSameTails(CurHash,
+ minCommonTailLength);
+
+ // If we didn't find any pair that has at least minCommonTailLength
+ // instructions in common, remove all blocks with this hash code and retry.
+ if (SameTails.empty()) {
+ RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+ continue;
+ }
+
+ // If one of the blocks is the entire common tail (and not the entry
+ // block, which we can't jump to), we can treat all blocks with this same
+ // tail at once. Use PredBB if that is one of the possibilities, as that
+ // will not introduce any extra branches.
+ MachineBasicBlock *EntryBB = MergePotentials.begin()->second->
+ getParent()->begin();
+ unsigned int commonTailIndex, i;
+ for (commonTailIndex=SameTails.size(), i=0; i<SameTails.size(); i++) {
+ MachineBasicBlock *MBB = SameTails[i].first->second;
+ if (MBB->begin() == SameTails[i].second && MBB != EntryBB) {
+ commonTailIndex = i;
+ if (MBB==PredBB)
+ break;
+ }
+ }
+
+ if (commonTailIndex==SameTails.size()) {
+ // None of the blocks consist entirely of the common tail.
+ // Split a block so that one does.
+ commonTailIndex = CreateCommonTailOnlyBlock(PredBB, maxCommonTailLength);
+ }
+
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].first->second;
+ // MBB is common tail. Adjust all other BB's to jump to this one.
+ // Traversal must be forwards so erases work.
+ DOUT << "\nUsing common tail " << MBB->getNumber() << " for ";
+ for (unsigned int i=0; i<SameTails.size(); ++i) {
+ if (commonTailIndex==i)
+ continue;
+ DOUT << SameTails[i].first->second->getNumber() << ",";
+ // Hack the end off BB i, making it jump to BB commonTailIndex instead.
+ ReplaceTailWithBranchTo(SameTails[i].second, MBB);
+ // BB i is no longer a predecessor of SuccBB; remove it from the worklist.
+ MergePotentials.erase(SameTails[i].first);
+ }
+ DOUT << "\n";
+ // We leave commonTailIndex in the worklist in case there are other blocks
+ // that match it with a smaller number of instructions.
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
+
+ if (!EnableTailMerge) return false;
+
+ MadeChange = false;
+
+ // First find blocks with no successors.
+ MergePotentials.clear();
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ if (I->succ_empty())
+ MergePotentials.push_back(std::make_pair(HashEndOfMBB(I, 2U), I));
+ }
+ // See if we can do any tail merging on those.
+ if (MergePotentials.size() < TailMergeThreshold &&
+ MergePotentials.size() >= 2)
+ MadeChange |= TryMergeBlocks(NULL, NULL);
+
+ // Look at blocks (IBB) with multiple predecessors (PBB).
+ // We change each predecessor to a canonical form, by
+ // (1) temporarily removing any unconditional branch from the predecessor
+ // to IBB, and
+ // (2) alter conditional branches so they branch to the other block
+ // not IBB; this may require adding back an unconditional branch to IBB
+ // later, where there wasn't one coming in. E.g.
+ // Bcc IBB
+ // fallthrough to QBB
+ // here becomes
+ // Bncc QBB
+ // with a conceptual B to IBB after that, which never actually exists.
+ // With those changes, we see whether the predecessors' tails match,
+ // and merge them if so. We change things out of canonical form and
+ // back to the way they were later in the process. (OptimizeBranches
+ // would undo some of this, but we can't use it, because we'd get into
+ // a compile-time infinite loop repeatedly doing and undoing the same
+ // transformations.)
+
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ if (I->pred_size() >= 2 && I->pred_size() < TailMergeThreshold) {
+ MachineBasicBlock *IBB = I;
+ MachineBasicBlock *PredBB = prior(I);
+ MergePotentials.clear();
+ for (MachineBasicBlock::pred_iterator P = I->pred_begin(),
+ E2 = I->pred_end();
+ P != E2; ++P) {
+ MachineBasicBlock* PBB = *P;
+        // Skip blocks that loop to themselves; we can't tail-merge these.
+ if (PBB==IBB)
+ continue;
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) {
+ // Failing case: IBB is the target of a cbr, and
+ // we cannot reverse the branch.
+ SmallVector<MachineOperand, 4> NewCond(Cond);
+ if (!Cond.empty() && TBB==IBB) {
+ if (TII->ReverseBranchCondition(NewCond))
+ continue;
+ // This is the QBB case described above
+ if (!FBB)
+ FBB = next(MachineFunction::iterator(PBB));
+ }
+ // Failing case: the only way IBB can be reached from PBB is via
+ // exception handling. Happens for landing pads. Would be nice
+ // to have a bit in the edge so we didn't have to do all this.
+ if (IBB->isLandingPad()) {
+ MachineFunction::iterator IP = PBB; IP++;
+ MachineBasicBlock* PredNextBB = NULL;
+ if (IP!=MF.end())
+ PredNextBB = IP;
+ if (TBB==NULL) {
+ if (IBB!=PredNextBB) // fallthrough
+ continue;
+ } else if (FBB) {
+ if (TBB!=IBB && FBB!=IBB) // cbr then ubr
+ continue;
+ } else if (Cond.empty()) {
+ if (TBB!=IBB) // ubr
+ continue;
+ } else {
+ if (TBB!=IBB && IBB!=PredNextBB) // cbr
+ continue;
+ }
+ }
+ // Remove the unconditional branch at the end, if any.
+ if (TBB && (Cond.empty() || FBB)) {
+ TII->RemoveBranch(*PBB);
+ if (!Cond.empty())
+ // reinsert conditional branch only, for now
+ TII->InsertBranch(*PBB, (TBB==IBB) ? FBB : TBB, 0, NewCond);
+ }
+ MergePotentials.push_back(std::make_pair(HashEndOfMBB(PBB, 1U), *P));
+ }
+ }
+ if (MergePotentials.size() >= 2)
+ MadeChange |= TryMergeBlocks(I, PredBB);
+ // Reinsert an unconditional branch if needed.
+ // The 1 below can occur as a result of removing blocks in TryMergeBlocks.
+ PredBB = prior(I); // this may have been changed in TryMergeBlocks
+ if (MergePotentials.size()==1 &&
+ MergePotentials.begin()->second != PredBB)
+ FixTail(MergePotentials.begin()->second, I, TII);
+ }
+ }
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Optimization
+//===----------------------------------------------------------------------===//
+
+bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
+ MadeChange = false;
+
+ // Make sure blocks are numbered in order
+ MF.RenumberBlocks();
+
+ for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
+ MachineBasicBlock *MBB = I++;
+ OptimizeBlock(MBB);
+
+ // If it is dead, remove it.
+ if (MBB->pred_empty()) {
+ RemoveDeadBlock(MBB);
+ MadeChange = true;
+ ++NumDeadBlocks;
+ }
+ }
+ return MadeChange;
+}
+
+
+/// CanFallThrough - Return true if the specified block (with the specified
+/// branch condition) can implicitly transfer control to the block after it by
+/// falling off the end of it. This should return false if it can reach the
+/// block after it, but it uses an explicit branch to do so (e.g. a table jump).
+///
+/// True is a conservative answer.
+///
+bool BranchFolder::CanFallThrough(MachineBasicBlock *CurBB,
+ bool BranchUnAnalyzable,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) {
+ MachineFunction::iterator Fallthrough = CurBB;
+ ++Fallthrough;
+ // If FallthroughBlock is off the end of the function, it can't fall through.
+ if (Fallthrough == CurBB->getParent()->end())
+ return false;
+
+ // If FallthroughBlock isn't a successor of CurBB, no fallthrough is possible.
+ if (!CurBB->isSuccessor(Fallthrough))
+ return false;
+
+ // If we couldn't analyze the branch, assume it could fall through.
+ if (BranchUnAnalyzable) return true;
+
+ // If there is no branch, control always falls through.
+ if (TBB == 0) return true;
+
+ // If there is some explicit branch to the fallthrough block, it can obviously
+ // reach, even though the branch should get folded to fall through implicitly.
+ if (MachineFunction::iterator(TBB) == Fallthrough ||
+ MachineFunction::iterator(FBB) == Fallthrough)
+ return true;
+
+ // If it's an unconditional branch to some block not the fall through, it
+ // doesn't fall through.
+ if (Cond.empty()) return false;
+
+ // Otherwise, if it is conditional and has no explicit false block, it falls
+ // through.
+ return FBB == 0;
+}
+
+/// CanFallThrough - Return true if the specified block can implicitly transfer
+/// control to the block after it by falling off the end of it. This should
+/// return false if it can reach the block after it, but it uses an explicit
+/// branch to do so (e.g. a table jump).
+///
+/// True is a conservative answer.
+///
+bool BranchFolder::CanFallThrough(MachineBasicBlock *CurBB) {
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ bool CurUnAnalyzable = TII->AnalyzeBranch(*CurBB, TBB, FBB, Cond, true);
+ return CanFallThrough(CurBB, CurUnAnalyzable, TBB, FBB, Cond);
+}
+
+/// IsBetterFallthrough - Return true if it would be clearly better to
+/// fall-through to MBB1 than to fall through into MBB2. This has to be a
+/// strict ordering; returning true for both (MBB1,MBB2) and (MBB2,MBB1)
+/// would result in infinite loops.
+static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
+ MachineBasicBlock *MBB2) {
+ // Right now, we use a simple heuristic. If MBB2 ends with a call, and
+ // MBB1 doesn't, we prefer to fall through into MBB1. This allows us to
+ // optimize branches that branch to either a return block or an assert block
+ // into a fallthrough to the return.
+ if (MBB1->empty() || MBB2->empty()) return false;
+
+  // If there is a clear successor ordering we make sure that one block
+  // will fall through to the next.
+ if (MBB1->isSuccessor(MBB2)) return true;
+ if (MBB2->isSuccessor(MBB1)) return false;
+
+ MachineInstr *MBB1I = --MBB1->end();
+ MachineInstr *MBB2I = --MBB2->end();
+ return MBB2I->getDesc().isCall() && !MBB1I->getDesc().isCall();
+}
+
+/// OptimizeBlock - Analyze and optimize control flow related to the specified
+/// block. This is never called on the entry block.
+void BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
+ MachineFunction::iterator FallThrough = MBB;
+ ++FallThrough;
+
+ // If this block is empty, make everyone use its fall-through, not the block
+ // explicitly. Landing pads should not do this since the landing-pad table
+ // points to this block.
+ if (MBB->empty() && !MBB->isLandingPad()) {
+ // Dead block? Leave for cleanup later.
+ if (MBB->pred_empty()) return;
+
+ if (FallThrough == MBB->getParent()->end()) {
+ // TODO: Simplify preds to not branch here if possible!
+ } else {
+ // Rewrite all predecessors of the old block to go to the fallthrough
+ // instead.
+ while (!MBB->pred_empty()) {
+ MachineBasicBlock *Pred = *(MBB->pred_end()-1);
+ Pred->ReplaceUsesOfBlockWith(MBB, FallThrough);
+ }
+
+ // If MBB was the target of a jump table, update jump tables to go to the
+ // fallthrough instead.
+ MBB->getParent()->getJumpTableInfo()->
+ ReplaceMBBInJumpTables(MBB, FallThrough);
+ MadeChange = true;
+ }
+ return;
+ }
+
+ // Check to see if we can simplify the terminator of the block before this
+ // one.
+ MachineBasicBlock &PrevBB = *prior(MachineFunction::iterator(MBB));
+
+ MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
+ SmallVector<MachineOperand, 4> PriorCond;
+ bool PriorUnAnalyzable =
+ TII->AnalyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+ if (!PriorUnAnalyzable) {
+ // If the CFG for the prior block has extra edges, remove them.
+ MadeChange |= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
+ !PriorCond.empty());
+
+ // If the previous branch is conditional and both conditions go to the same
+ // destination, remove the branch, replacing it with an unconditional one or
+ // a fall-through.
+ if (PriorTBB && PriorTBB == PriorFBB) {
+ TII->RemoveBranch(PrevBB);
+ PriorCond.clear();
+ if (PriorTBB != MBB)
+ TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond);
+ MadeChange = true;
+ ++NumBranchOpts;
+ return OptimizeBlock(MBB);
+ }
+
+ // If the previous branch *only* branches to *this* block (conditional or
+ // not) remove the branch.
+ if (PriorTBB == MBB && PriorFBB == 0) {
+ TII->RemoveBranch(PrevBB);
+ MadeChange = true;
+ ++NumBranchOpts;
+ return OptimizeBlock(MBB);
+ }
+
+ // If the prior block branches somewhere else on the condition and here if
+ // the condition is false, remove the uncond second branch.
+ if (PriorFBB == MBB) {
+ TII->RemoveBranch(PrevBB);
+ TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond);
+ MadeChange = true;
+ ++NumBranchOpts;
+ return OptimizeBlock(MBB);
+ }
+
+ // If the prior block branches here on true and somewhere else on false, and
+ // if the branch condition is reversible, reverse the branch to create a
+ // fall-through.
+ if (PriorTBB == MBB) {
+ SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+ if (!TII->ReverseBranchCondition(NewPriorCond)) {
+ TII->RemoveBranch(PrevBB);
+ TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond);
+ MadeChange = true;
+ ++NumBranchOpts;
+ return OptimizeBlock(MBB);
+ }
+ }
+
+ // If this block doesn't fall through (e.g. it ends with an uncond branch or
+ // has no successors) and if the pred falls through into this block, and if
+ // it would otherwise fall through into the block after this, move this
+ // block to the end of the function.
+ //
+ // We consider it more likely that execution will stay in the function (e.g.
+  // due to loops) than it is to exit it. This helps with cases such as asserts
+  // in loops by moving the assert condition out of the loop body.
+ if (!PriorCond.empty() && PriorFBB == 0 &&
+ MachineFunction::iterator(PriorTBB) == FallThrough &&
+ !CanFallThrough(MBB)) {
+ bool DoTransform = true;
+
+ // We have to be careful that the succs of PredBB aren't both no-successor
+ // blocks. If neither have successors and if PredBB is the second from
+    // last block in the function, we'd just keep swapping the two blocks into
+    // the last position. Only do the swap if one is clearly better to fall
+    // through than the other.
+ if (FallThrough == --MBB->getParent()->end() &&
+ !IsBetterFallthrough(PriorTBB, MBB))
+ DoTransform = false;
+
+ // We don't want to do this transformation if we have control flow like:
+ // br cond BB2
+ // BB1:
+ // ..
+ // jmp BBX
+ // BB2:
+ // ..
+ // ret
+ //
+ // In this case, we could actually be moving the return block *into* a
+ // loop!
+ if (DoTransform && !MBB->succ_empty() &&
+ (!CanFallThrough(PriorTBB) || PriorTBB->empty()))
+ DoTransform = false;
+
+
+ if (DoTransform) {
+ // Reverse the branch so we will fall through on the previous true cond.
+ SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+ if (!TII->ReverseBranchCondition(NewPriorCond)) {
+ DOUT << "\nMoving MBB: " << *MBB;
+ DOUT << "To make fallthrough to: " << *PriorTBB << "\n";
+
+ TII->RemoveBranch(PrevBB);
+ TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond);
+
+ // Move this block to the end of the function.
+ MBB->moveAfter(--MBB->getParent()->end());
+ MadeChange = true;
+ ++NumBranchOpts;
+ return;
+ }
+ }
+ }
+ }
+
+ // Analyze the branch in the current block.
+ MachineBasicBlock *CurTBB = 0, *CurFBB = 0;
+ SmallVector<MachineOperand, 4> CurCond;
+ bool CurUnAnalyzable= TII->AnalyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
+ if (!CurUnAnalyzable) {
+    // If the CFG for this block has extra edges, remove them.
+ MadeChange |= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());
+
+ // If this is a two-way branch, and the FBB branches to this block, reverse
+ // the condition so the single-basic-block loop is faster. Instead of:
+ // Loop: xxx; jcc Out; jmp Loop
+ // we want:
+ // Loop: xxx; jncc Loop; jmp Out
+ if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) {
+ SmallVector<MachineOperand, 4> NewCond(CurCond);
+ if (!TII->ReverseBranchCondition(NewCond)) {
+ TII->RemoveBranch(*MBB);
+ TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond);
+ MadeChange = true;
+ ++NumBranchOpts;
+ return OptimizeBlock(MBB);
+ }
+ }
+
+
+ // If this branch is the only thing in its block, see if we can forward
+ // other blocks across it.
+ if (CurTBB && CurCond.empty() && CurFBB == 0 &&
+ MBB->begin()->getDesc().isBranch() && CurTBB != MBB) {
+ // This block may contain just an unconditional branch. Because there can
+ // be 'non-branch terminators' in the block, try removing the branch and
+ // then seeing if the block is empty.
+ TII->RemoveBranch(*MBB);
+
+ // If this block is just an unconditional branch to CurTBB, we can
+ // usually completely eliminate the block. The only case we cannot
+ // completely eliminate the block is when the block before this one
+ // falls through into MBB and we can't understand the prior block's branch
+ // condition.
+ if (MBB->empty()) {
+ bool PredHasNoFallThrough = TII->BlockHasNoFallThrough(PrevBB);
+ if (PredHasNoFallThrough || !PriorUnAnalyzable ||
+ !PrevBB.isSuccessor(MBB)) {
+ // If the prior block falls through into us, turn it into an
+ // explicit branch to us to make updates simpler.
+ if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) &&
+ PriorTBB != MBB && PriorFBB != MBB) {
+ if (PriorTBB == 0) {
+ assert(PriorCond.empty() && PriorFBB == 0 &&
+ "Bad branch analysis");
+ PriorTBB = MBB;
+ } else {
+ assert(PriorFBB == 0 && "Machine CFG out of date!");
+ PriorFBB = MBB;
+ }
+ TII->RemoveBranch(PrevBB);
+ TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond);
+ }
+
+          // Iterate through all the predecessors, revectoring each in turn.
+ size_t PI = 0;
+ bool DidChange = false;
+ bool HasBranchToSelf = false;
+          while (PI != MBB->pred_size()) {
+ MachineBasicBlock *PMBB = *(MBB->pred_begin() + PI);
+ if (PMBB == MBB) {
+ // If this block has an uncond branch to itself, leave it.
+ ++PI;
+ HasBranchToSelf = true;
+ } else {
+ DidChange = true;
+ PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
+ // If this change resulted in PMBB ending in a conditional
+ // branch where both conditions go to the same destination,
+ // change this to an unconditional branch (and fix the CFG).
+ MachineBasicBlock *NewCurTBB = 0, *NewCurFBB = 0;
+ SmallVector<MachineOperand, 4> NewCurCond;
+ bool NewCurUnAnalyzable = TII->AnalyzeBranch(*PMBB, NewCurTBB,
+ NewCurFBB, NewCurCond, true);
+ if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
+ TII->RemoveBranch(*PMBB);
+ NewCurCond.clear();
+ TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond);
+ MadeChange = true;
+ ++NumBranchOpts;
+ PMBB->CorrectExtraCFGEdges(NewCurTBB, NewCurFBB, false);
+ }
+ }
+ }
+
+          // Change any jump tables to go to the new MBB.
+ MBB->getParent()->getJumpTableInfo()->
+ ReplaceMBBInJumpTables(MBB, CurTBB);
+ if (DidChange) {
+ ++NumBranchOpts;
+ MadeChange = true;
+ if (!HasBranchToSelf) return;
+ }
+ }
+ }
+
+ // Add the branch back if the block is more than just an uncond branch.
+ TII->InsertBranch(*MBB, CurTBB, 0, CurCond);
+ }
+ }
+
+ // If the prior block doesn't fall through into this block, and if this
+ // block doesn't fall through into some other block, see if we can find a
+ // place to move this block where a fall-through will happen.
+ if (!CanFallThrough(&PrevBB, PriorUnAnalyzable,
+ PriorTBB, PriorFBB, PriorCond)) {
+ // Now we know that there was no fall-through into this block, check to
+ // see if it has a fall-through into its successor.
+ bool CurFallsThru = CanFallThrough(MBB, CurUnAnalyzable, CurTBB, CurFBB,
+ CurCond);
+
+ if (!MBB->isLandingPad()) {
+ // Check all the predecessors of this block. If one of them has no fall
+ // throughs, move this block right after it.
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ E = MBB->pred_end(); PI != E; ++PI) {
+ // Analyze the branch at the end of the pred.
+ MachineBasicBlock *PredBB = *PI;
+ MachineFunction::iterator PredFallthrough = PredBB; ++PredFallthrough;
+ if (PredBB != MBB && !CanFallThrough(PredBB)
+ && (!CurFallsThru || !CurTBB || !CurFBB)
+ && (!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
+ // If the current block doesn't fall through, just move it.
+ // If the current block can fall through and does not end with a
+ // conditional branch, we need to append an unconditional jump to
+ // the (current) next block. To avoid a possible compile-time
+ // infinite loop, move blocks only backward in this case.
+ // Also, if there are already 2 branches here, we cannot add a third;
+ // this means we have the case
+ // Bcc next
+ // B elsewhere
+ // next:
+ if (CurFallsThru) {
+ MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB));
+ CurCond.clear();
+ TII->InsertBranch(*MBB, NextBB, 0, CurCond);
+ }
+ MBB->moveAfter(PredBB);
+ MadeChange = true;
+ return OptimizeBlock(MBB);
+ }
+ }
+ }
+
+ if (!CurFallsThru) {
+ // Check all successors to see if we can move this block before it.
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ E = MBB->succ_end(); SI != E; ++SI) {
+ // Analyze the branch at the end of the block before the succ.
+ MachineBasicBlock *SuccBB = *SI;
+ MachineFunction::iterator SuccPrev = SuccBB; --SuccPrev;
+ std::vector<MachineOperand> SuccPrevCond;
+
+ // If this block doesn't already fall-through to that successor, and if
+ // the succ doesn't already have a block that can fall through into it,
+ // and if the successor isn't an EH destination, we can arrange for the
+ // fallthrough to happen.
+ if (SuccBB != MBB && !CanFallThrough(SuccPrev) &&
+ !SuccBB->isLandingPad()) {
+ MBB->moveBefore(SuccBB);
+ MadeChange = true;
+ return OptimizeBlock(MBB);
+ }
+ }
+
+ // Okay, there is no really great place to put this block. If, however,
+ // the block before this one would be a fall-through if this block were
+ // removed, move this block to the end of the function.
+ if (FallThrough != MBB->getParent()->end() &&
+ PrevBB.isSuccessor(FallThrough)) {
+ MBB->moveAfter(--MBB->getParent()->end());
+ MadeChange = true;
+ return;
+ }
+ }
+ }
+}
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
new file mode 100644
index 0000000..ca4b31c
--- /dev/null
+++ b/lib/CodeGen/CMakeLists.txt
@@ -0,0 +1,62 @@
+add_llvm_library(LLVMCodeGen
+ BranchFolding.cpp
+ CodePlacementOpt.cpp
+ DeadMachineInstructionElim.cpp
+ DwarfEHPrepare.cpp
+ ELFWriter.cpp
+ GCMetadata.cpp
+ GCMetadataPrinter.cpp
+ GCStrategy.cpp
+ IfConversion.cpp
+ IntrinsicLowering.cpp
+ LLVMTargetMachine.cpp
+ LatencyPriorityQueue.cpp
+ LiveInterval.cpp
+ LiveIntervalAnalysis.cpp
+ LiveStackAnalysis.cpp
+ LiveVariables.cpp
+ LowerSubregs.cpp
+ MachOWriter.cpp
+ MachineBasicBlock.cpp
+ MachineDominators.cpp
+ MachineFunction.cpp
+ MachineInstr.cpp
+ MachineLICM.cpp
+ MachineLoopInfo.cpp
+ MachineModuleInfo.cpp
+ MachinePassRegistry.cpp
+ MachineRegisterInfo.cpp
+ MachineSink.cpp
+ MachineVerifier.cpp
+ OcamlGC.cpp
+ PBQP.cpp
+ PHIElimination.cpp
+ Passes.cpp
+ PostRASchedulerList.cpp
+ PreAllocSplitting.cpp
+ PrologEpilogInserter.cpp
+ PseudoSourceValue.cpp
+ RegAllocBigBlock.cpp
+ RegAllocLinearScan.cpp
+ RegAllocLocal.cpp
+ RegAllocPBQP.cpp
+ RegAllocSimple.cpp
+ RegisterCoalescer.cpp
+ RegisterScavenging.cpp
+ ScheduleDAG.cpp
+ ScheduleDAGEmit.cpp
+ ScheduleDAGInstrs.cpp
+ ScheduleDAGPrinter.cpp
+ ShadowStackGC.cpp
+ ShrinkWrapping.cpp
+ SimpleRegisterCoalescing.cpp
+ Spiller.cpp
+ StackProtector.cpp
+ StackSlotColoring.cpp
+ StrongPHIElimination.cpp
+ TargetInstrInfoImpl.cpp
+ TwoAddressInstructionPass.cpp
+ UnreachableBlockElim.cpp
+ VirtRegMap.cpp
+ VirtRegRewriter.cpp
+ )
diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
new file mode 100644
index 0000000..383098e
--- /dev/null
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -0,0 +1,358 @@
+//===-- CodePlacementOpt.cpp - Code Placement pass. -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass that optimizes code placement and aligns loop
+// headers to a target-specific alignment boundary.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "code-placement"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumHeaderAligned, "Number of loop headers aligned");
+STATISTIC(NumIntraElim, "Number of intra-loop branches eliminated");
+STATISTIC(NumIntraMoved, "Number of intra-loop branches moved");
+
+namespace {
+ class CodePlacementOpt : public MachineFunctionPass {
+ const MachineLoopInfo *MLI;
+ const TargetInstrInfo *TII;
+ const TargetLowering *TLI;
+
+ /// ChangedMBBs - BBs which are modified by OptimizeIntraLoopEdges.
+ SmallPtrSet<MachineBasicBlock*, 8> ChangedMBBs;
+
+ /// UncondJmpMBBs - A list of BBs which are in loops and end with
+ /// unconditional branches.
+ SmallVector<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 4>
+ UncondJmpMBBs;
+
+ /// LoopHeaders - A list of BBs which are loop headers.
+ SmallVector<MachineBasicBlock*, 4> LoopHeaders;
+
+ public:
+ static char ID;
+ CodePlacementOpt() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual const char *getPassName() const {
+ return "Code Placement Optimizater";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ bool OptimizeIntraLoopEdges();
+ bool HeaderShouldBeAligned(MachineBasicBlock *MBB, MachineLoop *L,
+ SmallPtrSet<MachineBasicBlock*, 4> &DoNotAlign);
+ bool AlignLoops(MachineFunction &MF);
+ };
+
+ char CodePlacementOpt::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createCodePlacementOptPass() {
+ return new CodePlacementOpt();
+}
+
+/// OptimizeIntraLoopEdges - Place loop back edges to move unconditional
+/// branches out of the loop.
+///
+/// A:
+/// ...
+/// <fallthrough to B>
+///
+/// B: --> loop header
+/// ...
+/// jcc <cond> C, [exit]
+///
+/// C:
+/// ...
+/// jmp B
+///
+/// ==>
+///
+/// A:
+/// ...
+/// jmp B
+///
+/// C: --> new loop header
+/// ...
+/// <fallthrough to B>
+///
+/// B:
+/// ...
+/// jcc <cond> C, [exit]
+///
+bool CodePlacementOpt::OptimizeIntraLoopEdges() {
+ if (!TLI->shouldOptimizeCodePlacement())
+ return false;
+
+ bool Changed = false;
+ for (unsigned i = 0, e = UncondJmpMBBs.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = UncondJmpMBBs[i].first;
+ MachineBasicBlock *SuccMBB = UncondJmpMBBs[i].second;
+ MachineLoop *L = MLI->getLoopFor(MBB);
+ assert(L && "BB is expected to be in a loop!");
+
+ if (ChangedMBBs.count(MBB)) {
+ // BB has been modified, re-analyze.
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond) || !Cond.empty())
+ continue;
+ if (MLI->getLoopFor(TBB) != L || TBB->isLandingPad())
+ continue;
+ SuccMBB = TBB;
+ } else {
+ assert(MLI->getLoopFor(SuccMBB) == L &&
+ "Successor is not in the same loop!");
+ }
+
+ if (MBB->isLayoutSuccessor(SuccMBB)) {
+ // Successor is right after MBB, just eliminate the unconditional jmp.
+ // Can this happen?
+ TII->RemoveBranch(*MBB);
+ ChangedMBBs.insert(MBB);
+ ++NumIntraElim;
+ Changed = true;
+ continue;
+ }
+
+    // Now check whether SuccMBB is reached by fallthrough from some BB. If it
+    // is, that BB should be from outside the loop, since the edge will become
+    // a jmp.
+ bool OkToMove = true;
+ MachineBasicBlock *FtMBB = 0, *FtTBB = 0, *FtFBB = 0;
+ SmallVector<MachineOperand, 4> FtCond;
+ for (MachineBasicBlock::pred_iterator PI = SuccMBB->pred_begin(),
+ PE = SuccMBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock *PredMBB = *PI;
+ if (PredMBB->isLayoutSuccessor(SuccMBB)) {
+ if (TII->AnalyzeBranch(*PredMBB, FtTBB, FtFBB, FtCond)) {
+ OkToMove = false;
+ break;
+ }
+ if (!FtTBB)
+ FtTBB = SuccMBB;
+        else if (!FtFBB) {
+          assert(FtTBB != SuccMBB && "Unexpected control flow!");
+ FtFBB = SuccMBB;
+ }
+
+ // A fallthrough.
+ FtMBB = PredMBB;
+ MachineLoop *PL = MLI->getLoopFor(PredMBB);
+ if (PL && (PL == L || PL->getLoopDepth() >= L->getLoopDepth()))
+ OkToMove = false;
+
+ break;
+ }
+ }
+
+ if (!OkToMove)
+ continue;
+
+    // Is it profitable? If SuccMBB falls through to its own layout successor,
+    // moving SuccMBB would turn that fallthrough into a jmp.
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ if (TII->AnalyzeBranch(*SuccMBB, TBB, FBB, Cond))
+ continue;
+ if (!TBB && Cond.empty())
+ TBB = next(MachineFunction::iterator(SuccMBB));
+ else if (!FBB && !Cond.empty())
+ FBB = next(MachineFunction::iterator(SuccMBB));
+
+    // This calculates the cost of the transformation. It also finds the *only*
+    // intra-loop edge if there is one.
+ int Cost = 0;
+ bool HasOneIntraSucc = true;
+ MachineBasicBlock *IntraSucc = 0;
+ for (MachineBasicBlock::succ_iterator SI = SuccMBB->succ_begin(),
+ SE = SuccMBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock *SSMBB = *SI;
+ if (MLI->getLoopFor(SSMBB) == L) {
+ if (!IntraSucc)
+ IntraSucc = SSMBB;
+ else
+ HasOneIntraSucc = false;
+ }
+
+ if (SuccMBB->isLayoutSuccessor(SSMBB))
+ // This will become a jmp.
+ ++Cost;
+ else if (MBB->isLayoutSuccessor(SSMBB)) {
+        // One of the successors will become the new fallthrough.
+ if (SSMBB == FBB) {
+ FBB = 0;
+ --Cost;
+ } else if (!FBB && SSMBB == TBB && Cond.empty()) {
+ TBB = 0;
+ --Cost;
+ } else if (!Cond.empty() && !TII->ReverseBranchCondition(Cond)) {
+ assert(SSMBB == TBB);
+ TBB = FBB;
+ FBB = 0;
+ --Cost;
+ }
+ }
+ }
+ if (Cost)
+ continue;
+
+ // Now, let's move the successor to below the BB to eliminate the jmp.
+ SuccMBB->moveAfter(MBB);
+ TII->RemoveBranch(*MBB);
+ TII->RemoveBranch(*SuccMBB);
+ if (TBB)
+ TII->InsertBranch(*SuccMBB, TBB, FBB, Cond);
+ ChangedMBBs.insert(MBB);
+ ChangedMBBs.insert(SuccMBB);
+ if (FtMBB) {
+ TII->RemoveBranch(*FtMBB);
+ TII->InsertBranch(*FtMBB, FtTBB, FtFBB, FtCond);
+ ChangedMBBs.insert(FtMBB);
+ }
+ Changed = true;
+
+    // If MBB is the loop latch, we may have a new loop header.
+ if (MBB == L->getLoopLatch()) {
+ assert(MLI->isLoopHeader(SuccMBB) &&
+ "Only succ of loop latch is not the header?");
+ if (HasOneIntraSucc && IntraSucc)
+        std::replace(LoopHeaders.begin(), LoopHeaders.end(),
+                     SuccMBB, IntraSucc);
+ }
+ }
+
+ ++NumIntraMoved;
+ return Changed;
+}
+
+/// HeaderShouldBeAligned - Return true if the specified loop header block
+/// should be aligned. For now, we will not align it if all the predecessors
+/// (i.e. loop back edges) are laid out above the header. FIXME: Do not
+/// align small loops.
+bool
+CodePlacementOpt::HeaderShouldBeAligned(MachineBasicBlock *MBB, MachineLoop *L,
+ SmallPtrSet<MachineBasicBlock*, 4> &DoNotAlign) {
+ if (DoNotAlign.count(MBB))
+ return false;
+
+ bool BackEdgeBelow = false;
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock *PredMBB = *PI;
+ if (PredMBB == MBB || PredMBB->getNumber() > MBB->getNumber()) {
+ BackEdgeBelow = true;
+ break;
+ }
+ }
+
+ if (!BackEdgeBelow)
+ return false;
+
+ // Ok, we are going to align this loop header. If it's an inner loop,
+ // do not align its outer loop.
+ MachineBasicBlock *PreHeader = L->getLoopPreheader();
+ if (PreHeader) {
+ MachineLoop *L = MLI->getLoopFor(PreHeader);
+ if (L) {
+ MachineBasicBlock *HeaderBlock = L->getHeader();
+ HeaderBlock->setAlignment(0);
+ DoNotAlign.insert(HeaderBlock);
+ }
+ }
+ return true;
+}
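+
+// For instance, with the layout [preheader, header, body, latch] the latch's
+// backward branch to the header comes from below, so the header qualifies;
+// if every predecessor sat above the header, it would not be reached by a
+// backward jump and padding it would mostly cost code size.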
+
+/// AlignLoops - Align loop headers to target preferred alignments.
+///
+bool CodePlacementOpt::AlignLoops(MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+ if (F->hasFnAttr(Attribute::OptimizeForSize))
+ return false;
+
+ unsigned Align = TLI->getPrefLoopAlignment();
+ if (!Align)
+ return false; // Don't care about loop alignment.
+
+ // Make sure blocks are numbered in order
+ MF.RenumberBlocks();
+
+ bool Changed = false;
+ SmallPtrSet<MachineBasicBlock*, 4> DoNotAlign;
+ for (unsigned i = 0, e = LoopHeaders.size(); i != e; ++i) {
+ MachineBasicBlock *HeaderMBB = LoopHeaders[i];
+ MachineBasicBlock *PredMBB = prior(MachineFunction::iterator(HeaderMBB));
+ MachineLoop *L = MLI->getLoopFor(HeaderMBB);
+ if (L == MLI->getLoopFor(PredMBB))
+      // If the previous BB is in the same loop, don't align this BB. We want
+      // to avoid inserting nops inside a loop.
+ continue;
+ if (HeaderShouldBeAligned(HeaderMBB, L, DoNotAlign)) {
+ HeaderMBB->setAlignment(Align);
+ Changed = true;
+ ++NumHeaderAligned;
+ }
+ }
+
+ return Changed;
+}
+
+bool CodePlacementOpt::runOnMachineFunction(MachineFunction &MF) {
+ MLI = &getAnalysis<MachineLoopInfo>();
+ if (MLI->empty())
+ return false; // No loops.
+
+ TLI = MF.getTarget().getTargetLowering();
+ TII = MF.getTarget().getInstrInfo();
+
+ // Analyze the BBs first and keep track of loop headers and BBs that
+ // end with an unconditional jmp to another block in the same loop.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock *MBB = I;
+ if (MBB->isLandingPad())
+ continue;
+ MachineLoop *L = MLI->getLoopFor(MBB);
+ if (!L)
+ continue;
+ if (MLI->isLoopHeader(MBB))
+ LoopHeaders.push_back(MBB);
+
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond) || !Cond.empty())
+ continue;
+ if (MLI->getLoopFor(TBB) == L && !TBB->isLandingPad())
+ UncondJmpMBBs.push_back(std::make_pair(MBB, TBB));
+ }
+
+ bool Changed = OptimizeIntraLoopEdges();
+
+ Changed |= AlignLoops(MF);
+
+ ChangedMBBs.clear();
+ UncondJmpMBBs.clear();
+ LoopHeaders.clear();
+
+ return Changed;
+}
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
new file mode 100644
index 0000000..4832a5e
--- /dev/null
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -0,0 +1,161 @@
+//===- DeadMachineInstructionElim.cpp - Remove dead machine instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an extremely simple MachineInstr-level dead-code-elimination pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+namespace {
+ class VISIBILITY_HIDDEN DeadMachineInstructionElim :
+ public MachineFunctionPass {
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ BitVector LivePhysRegs;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ DeadMachineInstructionElim() : MachineFunctionPass(&ID) {}
+
+ private:
+ bool isDead(MachineInstr *MI) const;
+ };
+}
+char DeadMachineInstructionElim::ID = 0;
+
+static RegisterPass<DeadMachineInstructionElim>
+Y("dead-mi-elimination",
+ "Remove dead machine instructions");
+
+FunctionPass *llvm::createDeadMachineInstructionElimPass() {
+ return new DeadMachineInstructionElim();
+}
+
+bool DeadMachineInstructionElim::isDead(MachineInstr *MI) const {
+ // Don't delete instructions with side effects.
+ bool SawStore = false;
+ if (!MI->isSafeToMove(TII, SawStore))
+ return false;
+
+ // Examine each operand.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) ?
+ LivePhysRegs[Reg] : !MRI->use_empty(Reg)) {
+ // This def has a use. Don't delete the instruction!
+ return false;
+ }
+ }
+ }
+
+ // If there are no defs with uses, the instruction is dead.
+ return true;
+}
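+
+// For example, a "%reg1024 = add %reg1025, 4" whose result has no uses is
+// dead; a def of a physical register is only dead if the register is not
+// live at this point of the bottom-up scan (tracked in LivePhysRegs).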
+
+bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
+ bool AnyChanges = false;
+ MRI = &MF.getRegInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ TII = MF.getTarget().getInstrInfo();
+
+ // Compute a bitvector to represent all non-allocatable physregs.
+ BitVector NonAllocatableRegs = TRI->getAllocatableSet(MF);
+ NonAllocatableRegs.flip();
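+  // Non-allocatable registers (e.g. the stack pointer) may be read by code
+  // this pass cannot see, so they are treated as live by default.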
+
+ // Loop over all instructions in all blocks, from bottom to top, so that it's
+ // more likely that chains of dependent but ultimately dead instructions will
+ // be cleaned up.
+ for (MachineFunction::reverse_iterator I = MF.rbegin(), E = MF.rend();
+ I != E; ++I) {
+ MachineBasicBlock *MBB = &*I;
+
+ // Start out assuming that all non-allocatable registers are live
+ // out of this block.
+ LivePhysRegs = NonAllocatableRegs;
+
+ // Also add any explicit live-out physregs for this block.
+ if (!MBB->empty() && MBB->back().getDesc().isReturn())
+ for (MachineRegisterInfo::liveout_iterator LOI = MRI->liveout_begin(),
+ LOE = MRI->liveout_end(); LOI != LOE; ++LOI) {
+ unsigned Reg = *LOI;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ LivePhysRegs.set(Reg);
+ }
+
+ // Now scan the instructions and delete dead ones, tracking physreg
+ // liveness as we go.
+ for (MachineBasicBlock::reverse_iterator MII = MBB->rbegin(),
+ MIE = MBB->rend(); MII != MIE; ) {
+ MachineInstr *MI = &*MII;
+
+ // If the instruction is dead, delete it!
+ if (isDead(MI)) {
+ DOUT << "DeadMachineInstructionElim: DELETING: " << *MI;
+ AnyChanges = true;
+ MI->eraseFromParent();
+ MIE = MBB->rend();
+ // MII is now pointing to the next instruction to process,
+ // so don't increment it.
+ continue;
+ }
+
+ // Record the physreg defs.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ unsigned Reg = MO.getReg();
+ if (Reg != 0 && TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ LivePhysRegs.reset(Reg);
+ // Check the subreg set, not the alias set, because a def
+ // of a super-register may still be partially live after
+ // this def.
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ *SubRegs; ++SubRegs)
+ LivePhysRegs.reset(*SubRegs);
+ }
+ }
+ }
+ // Record the physreg uses, after the defs, in case a physreg is
+ // both defined and used in the same instruction.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isUse()) {
+ unsigned Reg = MO.getReg();
+ if (Reg != 0 && TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ LivePhysRegs.set(Reg);
+ for (const unsigned *AliasSet = TRI->getAliasSet(Reg);
+ *AliasSet; ++AliasSet)
+ LivePhysRegs.set(*AliasSet);
+ }
+ }
+ }
+
+ // We didn't delete the current instruction, so increment MII to
+ // the next one.
+ ++MII;
+ }
+ }
+
+ LivePhysRegs.clear();
+ return AnyChanges;
+}
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
new file mode 100644
index 0000000..720e3d1
--- /dev/null
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -0,0 +1,397 @@
+//===-- DwarfEHPrepare - Prepare exception handling for code generation ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass mulches exception handling code into a form adapted to code
+// generation. Required if using dwarf exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfehprepare"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+using namespace llvm;
+
+STATISTIC(NumLandingPadsSplit, "Number of landing pads split");
+STATISTIC(NumUnwindsLowered, "Number of unwind instructions lowered");
+STATISTIC(NumExceptionValuesMoved, "Number of eh.exception calls moved");
+STATISTIC(NumStackTempsIntroduced, "Number of stack temporaries introduced");
+
+namespace {
+ class VISIBILITY_HIDDEN DwarfEHPrepare : public FunctionPass {
+ const TargetLowering *TLI;
+ bool CompileFast;
+
+ // The eh.exception intrinsic.
+ Function *ExceptionValueIntrinsic;
+
+ // _Unwind_Resume or the target equivalent.
+ Constant *RewindFunction;
+
+ // Dominator info is used when turning stack temporaries into registers.
+ DominatorTree *DT;
+ DominanceFrontier *DF;
+
+ // The function we are running on.
+ Function *F;
+
+ // The landing pads for this function.
+ typedef SmallPtrSet<BasicBlock*, 8> BBSet;
+ BBSet LandingPads;
+
+ // Stack temporary used to hold eh.exception values.
+ AllocaInst *ExceptionValueVar;
+
+ bool NormalizeLandingPads();
+ bool LowerUnwinds();
+ bool MoveExceptionValueCalls();
+ bool FinishStackTemporaries();
+ bool PromoteStackTemporaries();
+
+ Instruction *CreateExceptionValueCall(BasicBlock *BB);
+ Instruction *CreateValueLoad(BasicBlock *BB);
+
+ /// CreateReadOfExceptionValue - Return the result of the eh.exception
+ /// intrinsic by calling the intrinsic if in a landing pad, or loading
+ /// it from the exception value variable otherwise.
+ Instruction *CreateReadOfExceptionValue(BasicBlock *BB) {
+ return LandingPads.count(BB) ?
+ CreateExceptionValueCall(BB) : CreateValueLoad(BB);
+ }
+
+ public:
+ static char ID; // Pass identification, replacement for typeid.
+ DwarfEHPrepare(const TargetLowering *tli, bool fast) :
+ FunctionPass(&ID), TLI(tli), CompileFast(fast),
+ ExceptionValueIntrinsic(0), RewindFunction(0) {}
+
+ virtual bool runOnFunction(Function &Fn);
+
+ // getAnalysisUsage - We need dominance frontiers for memory promotion.
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ if (!CompileFast)
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<DominatorTree>();
+ if (!CompileFast)
+ AU.addRequired<DominanceFrontier>();
+ AU.addPreserved<DominanceFrontier>();
+ }
+
+ const char *getPassName() const {
+ return "Exception handling preparation";
+ }
+
+ };
+} // end anonymous namespace
+
+char DwarfEHPrepare::ID = 0;
+
+FunctionPass *llvm::createDwarfEHPass(const TargetLowering *tli, bool fast) {
+ return new DwarfEHPrepare(tli, fast);
+}
+
+/// NormalizeLandingPads - Normalize and discover landing pads, noting them
+/// in the LandingPads set. A landing pad is normal if the only CFG edges
+/// that end at it are unwind edges from invoke instructions.
+/// Abnormal landing pads are fixed up by redirecting all unwind edges to
+/// a new basic block which falls through to the original.
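+/// For example, if block %lpad is reached both by an invoke's unwind edge and
+/// by a normal branch, a new block %lpad_unwind_edge is created, all unwind
+/// edges are redirected to it, and it falls through into %lpad.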
+bool DwarfEHPrepare::NormalizeLandingPads() {
+ bool Changed = false;
+
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+ TerminatorInst *TI = I->getTerminator();
+ if (!isa<InvokeInst>(TI))
+ continue;
+ BasicBlock *LPad = TI->getSuccessor(1);
+ // Skip landing pads that have already been normalized.
+ if (LandingPads.count(LPad))
+ continue;
+
+ // Check that only invoke unwind edges end at the landing pad.
+ bool OnlyUnwoundTo = true;
+ for (pred_iterator PI = pred_begin(LPad), PE = pred_end(LPad);
+ PI != PE; ++PI) {
+ TerminatorInst *PT = (*PI)->getTerminator();
+ if (!isa<InvokeInst>(PT) || LPad == PT->getSuccessor(0)) {
+ OnlyUnwoundTo = false;
+ break;
+ }
+ }
+ if (OnlyUnwoundTo) {
+ // Only unwind edges lead to the landing pad. Remember the landing pad.
+ LandingPads.insert(LPad);
+ continue;
+ }
+
+ // At least one normal edge ends at the landing pad. Redirect the unwind
+ // edges to a new basic block which falls through into this one.
+
+ // Create the new basic block.
+ BasicBlock *NewBB = BasicBlock::Create(LPad->getName() + "_unwind_edge");
+
+ // Insert it into the function right before the original landing pad.
+ LPad->getParent()->getBasicBlockList().insert(LPad, NewBB);
+
+ // Redirect unwind edges from the original landing pad to NewBB.
+ for (pred_iterator PI = pred_begin(LPad), PE = pred_end(LPad); PI != PE; ) {
+ TerminatorInst *PT = (*PI++)->getTerminator();
+ if (isa<InvokeInst>(PT) && PT->getSuccessor(1) == LPad)
+ // Unwind to the new block.
+ PT->setSuccessor(1, NewBB);
+ }
+
+ // If there are any PHI nodes in LPad, we need to update them so that they
+ // merge incoming values from NewBB instead.
+ for (BasicBlock::iterator II = LPad->begin(); isa<PHINode>(II); ++II) {
+ PHINode *PN = cast<PHINode>(II);
+ pred_iterator PB = pred_begin(NewBB), PE = pred_end(NewBB);
+
+ // Check to see if all of the values coming in via unwind edges are the
+ // same. If so, we don't need to create a new PHI node.
+ Value *InVal = PN->getIncomingValueForBlock(*PB);
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ if (PI != PB && InVal != PN->getIncomingValueForBlock(*PI)) {
+ InVal = 0;
+ break;
+ }
+ }
+
+ if (InVal == 0) {
+ // Different unwind edges have different values. Create a new PHI node
+ // in NewBB.
+ PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName()+".unwind",
+ NewBB);
+ // Add an entry for each unwind edge, using the value from the old PHI.
+ for (pred_iterator PI = PB; PI != PE; ++PI)
+ NewPN->addIncoming(PN->getIncomingValueForBlock(*PI), *PI);
+
+ // Now use this new PHI as the common incoming value for NewBB in PN.
+ InVal = NewPN;
+ }
+
+ // Revector exactly one entry in the PHI node to come from NewBB
+ // and delete all other entries that come from unwind edges. If
+ // there are both normal and unwind edges from the same predecessor,
+ // this leaves an entry for the normal edge.
+ for (pred_iterator PI = PB; PI != PE; ++PI)
+ PN->removeIncomingValue(*PI);
+ PN->addIncoming(InVal, NewBB);
+ }
+
+ // Add a fallthrough from NewBB to the original landing pad.
+ BranchInst::Create(LPad, NewBB);
+
+ // Now update DominatorTree and DominanceFrontier analysis information.
+ if (DT)
+ DT->splitBlock(NewBB);
+ if (DF)
+ DF->splitBlock(NewBB);
+
+ // Remember the newly constructed landing pad. The original landing pad
+ // LPad is no longer a landing pad now that all unwind edges have been
+ // revectored to NewBB.
+ LandingPads.insert(NewBB);
+ ++NumLandingPadsSplit;
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// LowerUnwinds - Turn unwind instructions into calls to _Unwind_Resume,
+/// rethrowing any previously caught exception. This will crash horribly
+/// at runtime if there is no such exception: using unwind to throw a new
+/// exception is currently not supported.
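+/// As a sketch (hypothetical IR), a lone "unwind" terminator becomes roughly
+///
+///   call void @_Unwind_Resume(i8* %exn)
+///   unreachable
+///
+/// where %exn is the current exception value read via eh.exception (or, when
+/// outside a landing pad, loaded from the exception stack temporary).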
+bool DwarfEHPrepare::LowerUnwinds() {
+ bool Changed = false;
+
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+ TerminatorInst *TI = I->getTerminator();
+ if (!isa<UnwindInst>(TI))
+ continue;
+
+ // Replace the unwind instruction with a call to _Unwind_Resume (or the
+ // appropriate target equivalent) followed by an UnreachableInst.
+
+ // Find the rewind function if we didn't already.
+ if (!RewindFunction) {
+ std::vector<const Type*> Params(1, PointerType::getUnqual(Type::Int8Ty));
+ FunctionType *FTy = FunctionType::get(Type::VoidTy, Params, false);
+ const char *RewindName = TLI->getLibcallName(RTLIB::UNWIND_RESUME);
+ RewindFunction = F->getParent()->getOrInsertFunction(RewindName, FTy);
+ }
+
+ // Create the call...
+ CallInst::Create(RewindFunction, CreateReadOfExceptionValue(I), "", TI);
+ // ...followed by an UnreachableInst.
+ new UnreachableInst(TI);
+
+ // Nuke the unwind instruction.
+ TI->eraseFromParent();
+ ++NumUnwindsLowered;
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// MoveExceptionValueCalls - Ensure that eh.exception is only ever called from
+/// landing pads by replacing calls outside of landing pads with loads from a
+/// stack temporary. Move eh.exception calls inside landing pads to the start
+/// of the landing pad (optional, but may make things simpler for later passes).
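+/// For example, a call to eh.exception in an ordinary (non-landing-pad)
+/// block is replaced by a load of the "eh.value" stack temporary, which
+/// every landing pad initializes with its exception value (see
+/// FinishStackTemporaries).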
+bool DwarfEHPrepare::MoveExceptionValueCalls() {
+ // If the eh.exception intrinsic is not declared in the module then there is
+ // nothing to do. Speed up compilation by checking for this common case.
+ if (!ExceptionValueIntrinsic &&
+ !F->getParent()->getFunction(Intrinsic::getName(Intrinsic::eh_exception)))
+ return false;
+
+ bool Changed = false;
+
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
+ if (CI->getIntrinsicID() == Intrinsic::eh_exception) {
+ if (!CI->use_empty()) {
+ Value *ExceptionValue = CreateReadOfExceptionValue(BB);
+ if (CI == ExceptionValue) {
+ // The call was at the start of a landing pad - leave it alone.
+ assert(LandingPads.count(BB) &&
+ "Created eh.exception call outside landing pad!");
+ continue;
+ }
+ CI->replaceAllUsesWith(ExceptionValue);
+ }
+ CI->eraseFromParent();
+ ++NumExceptionValuesMoved;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+/// FinishStackTemporaries - If we introduced a stack variable to hold the
+/// exception value then initialize it in each landing pad.
+bool DwarfEHPrepare::FinishStackTemporaries() {
+ if (!ExceptionValueVar)
+ // Nothing to do.
+ return false;
+
+ bool Changed = false;
+
+ // Make sure that there is a store of the exception value at the start of
+ // each landing pad.
+ for (BBSet::iterator LI = LandingPads.begin(), LE = LandingPads.end();
+ LI != LE; ++LI) {
+ Instruction *ExceptionValue = CreateReadOfExceptionValue(*LI);
+ Instruction *Store = new StoreInst(ExceptionValue, ExceptionValueVar);
+ Store->insertAfter(ExceptionValue);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// PromoteStackTemporaries - Turn any stack temporaries we introduced into
+/// registers if possible.
+bool DwarfEHPrepare::PromoteStackTemporaries() {
+ if (ExceptionValueVar && DT && DF && isAllocaPromotable(ExceptionValueVar)) {
+ // Turn the exception temporary into registers and phi nodes if possible.
+ std::vector<AllocaInst*> Allocas(1, ExceptionValueVar);
+ PromoteMemToReg(Allocas, *DT, *DF);
+ return true;
+ }
+ return false;
+}
+
+/// CreateExceptionValueCall - Insert a call to the eh.exception intrinsic at
+/// the start of the basic block (unless there already is one, in which case
+/// the existing call is returned).
+Instruction *DwarfEHPrepare::CreateExceptionValueCall(BasicBlock *BB) {
+ Instruction *Start = BB->getFirstNonPHI();
+ // Is this a call to eh.exception?
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Start))
+ if (CI->getIntrinsicID() == Intrinsic::eh_exception)
+ // Reuse the existing call.
+ return Start;
+
+ // Find the eh.exception intrinsic if we didn't already.
+ if (!ExceptionValueIntrinsic)
+ ExceptionValueIntrinsic = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::eh_exception);
+
+ // Create the call.
+ return CallInst::Create(ExceptionValueIntrinsic, "eh.value.call", Start);
+}
+
+/// CreateValueLoad - Insert a load of the exception value stack variable
+/// (creating it if necessary) at the start of the basic block (unless
+/// there already is a load, in which case the existing load is returned).
+Instruction *DwarfEHPrepare::CreateValueLoad(BasicBlock *BB) {
+ Instruction *Start = BB->getFirstNonPHI();
+ // Is this a load of the exception temporary?
+ if (ExceptionValueVar)
+ if (LoadInst* LI = dyn_cast<LoadInst>(Start))
+ if (LI->getPointerOperand() == ExceptionValueVar)
+ // Reuse the existing load.
+ return Start;
+
+ // Create the temporary if we didn't already.
+ if (!ExceptionValueVar) {
+ ExceptionValueVar = new AllocaInst(PointerType::getUnqual(Type::Int8Ty),
+ "eh.value", F->begin()->begin());
+ ++NumStackTempsIntroduced;
+ }
+
+ // Load the value.
+ return new LoadInst(ExceptionValueVar, "eh.value.load", Start);
+}
+
+bool DwarfEHPrepare::runOnFunction(Function &Fn) {
+ bool Changed = false;
+
+ // Initialize internal state.
+ DT = getAnalysisIfAvailable<DominatorTree>();
+ DF = getAnalysisIfAvailable<DominanceFrontier>();
+ ExceptionValueVar = 0;
+ F = &Fn;
+
+ // Ensure that only unwind edges end at landing pads (a landing pad is a
+ // basic block where an invoke unwind edge ends).
+ Changed |= NormalizeLandingPads();
+
+ // Turn unwind instructions into libcalls.
+ Changed |= LowerUnwinds();
+
+ // TODO: Move eh.selector calls to landing pads and combine them.
+
+ // Move eh.exception calls to landing pads.
+ Changed |= MoveExceptionValueCalls();
+
+ // Initialize any stack temporaries we introduced.
+ Changed |= FinishStackTemporaries();
+
+ // Turn any stack temporaries into registers if possible.
+ if (!CompileFast)
+ Changed |= PromoteStackTemporaries();
+
+ LandingPads.clear();
+
+ return Changed;
+}
diff --git a/lib/CodeGen/ELFWriter.cpp b/lib/CodeGen/ELFWriter.cpp
new file mode 100644
index 0000000..7cc1162
--- /dev/null
+++ b/lib/CodeGen/ELFWriter.cpp
@@ -0,0 +1,575 @@
+//===-- ELFWriter.cpp - Target-independent ELF Writer code ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the target-independent ELF writer. This file writes out
+// the ELF file in the following order:
+//
+// #1. ELF Header
+// #2. '.text' section
+// #3. '.data' section
+// #4. '.bss' section (conceptual position in file)
+// ...
+// #X. '.shstrtab' section
+// #Y. Section Table
+//
+// The entries in the section table are laid out as:
+// #0. Null entry [required]
+// #1. ".text" entry - the program code
+// #2. ".data" entry - global variables with initializers. [ if needed ]
+// #3. ".bss" entry - global variables without initializers. [ if needed ]
+// ...
+// #N. ".shstrtab" entry - String table for the section names.
+//
+// NOTE: This code should eventually be extended to support 64-bit ELF (this
+// won't be hard), but we haven't done so yet!
+//
+//===----------------------------------------------------------------------===//
+
+#include "ELFWriter.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/FileWriters.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetELFWriterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/OutputBuffer.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/raw_ostream.h"
+#include <list>
+using namespace llvm;
+
+char ELFWriter::ID = 0;
+/// AddELFWriter - Concrete function to add the ELF writer to the function pass
+/// manager.
+MachineCodeEmitter *llvm::AddELFWriter(PassManagerBase &PM,
+ raw_ostream &O,
+ TargetMachine &TM) {
+ ELFWriter *EW = new ELFWriter(O, TM);
+ PM.add(EW);
+ return &EW->getMachineCodeEmitter();
+}
+
+//===----------------------------------------------------------------------===//
+// ELFCodeEmitter Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+ /// ELFCodeEmitter - This class is used by the ELFWriter to emit the code for
+ /// functions to the ELF file.
+ class ELFCodeEmitter : public MachineCodeEmitter {
+ ELFWriter &EW;
+ TargetMachine &TM;
+ ELFWriter::ELFSection *ES; // Section to write to.
+ std::vector<unsigned char> *OutBuffer;
+ size_t FnStart;
+ public:
+ explicit ELFCodeEmitter(ELFWriter &ew) : EW(ew), TM(EW.TM), OutBuffer(0) {}
+
+ void startFunction(MachineFunction &F);
+ bool finishFunction(MachineFunction &F);
+
+ void addRelocation(const MachineRelocation &MR) {
+ assert(0 && "relo not handled yet!");
+ }
+
+ virtual void StartMachineBasicBlock(MachineBasicBlock *MBB) {
+ }
+
+ virtual uintptr_t getConstantPoolEntryAddress(unsigned Index) const {
+ assert(0 && "CP not implementated yet!");
+ return 0;
+ }
+ virtual uintptr_t getJumpTableEntryAddress(unsigned Index) const {
+ assert(0 && "JT not implementated yet!");
+ return 0;
+ }
+
+ virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const {
+ assert(0 && "JT not implementated yet!");
+ return 0;
+ }
+
+ virtual uintptr_t getLabelAddress(uint64_t Label) const {
+ assert(0 && "Label address not implementated yet!");
+ abort();
+ return 0;
+ }
+
+ virtual void emitLabel(uint64_t LabelID) {
+ assert(0 && "emit Label not implementated yet!");
+ abort();
+ }
+
+    virtual void setModuleInfo(llvm::MachineModuleInfo* MMI) { }
+
+ /// JIT SPECIFIC FUNCTIONS - DO NOT IMPLEMENT THESE HERE!
+ void startGVStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment = 1) {
+ assert(0 && "JIT specific function called!");
+ abort();
+ }
+ void startGVStub(const GlobalValue* F, void *Buffer, unsigned StubSize) {
+ assert(0 && "JIT specific function called!");
+ abort();
+ }
+ void *finishGVStub(const GlobalValue *F) {
+ assert(0 && "JIT specific function called!");
+ abort();
+ return 0;
+ }
+ };
+}
+
+/// startFunction - This callback is invoked when a new machine function is
+/// about to be emitted.
+void ELFCodeEmitter::startFunction(MachineFunction &F) {
+ // Align the output buffer to the appropriate alignment.
+ unsigned Align = 16; // FIXME: GENERICIZE!!
+ // Get the ELF Section that this function belongs in.
+ ES = &EW.getSection(".text", ELFWriter::ELFSection::SHT_PROGBITS,
+ ELFWriter::ELFSection::SHF_EXECINSTR |
+ ELFWriter::ELFSection::SHF_ALLOC);
+ OutBuffer = &ES->SectionData;
+ cerr << "FIXME: This code needs to be updated for changes in the "
+ << "CodeEmitter interfaces. In particular, this should set "
+ << "BufferBegin/BufferEnd/CurBufferPtr, not deal with OutBuffer!";
+ abort();
+
+ // Upgrade the section alignment if required.
+ if (ES->Align < Align) ES->Align = Align;
+
+ // Add padding zeros to the end of the buffer to make sure that the
+ // function will start on the correct byte alignment within the section.
+ OutputBuffer OB(*OutBuffer,
+ TM.getTargetData()->getPointerSizeInBits() == 64,
+ TM.getTargetData()->isLittleEndian());
+ OB.align(Align);
+ FnStart = OutBuffer->size();
+}
+
+/// finishFunction - This callback is invoked after the function is completely
+/// finished.
+bool ELFCodeEmitter::finishFunction(MachineFunction &F) {
+ // We now know the size of the function, add a symbol to represent it.
+ ELFWriter::ELFSym FnSym(F.getFunction());
+
+ // Figure out the binding (linkage) of the symbol.
+ switch (F.getFunction()->getLinkage()) {
+ default:
+ // appending linkage is illegal for functions.
+ assert(0 && "Unknown linkage type!");
+ case GlobalValue::ExternalLinkage:
+ FnSym.SetBind(ELFWriter::ELFSym::STB_GLOBAL);
+ break;
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ FnSym.SetBind(ELFWriter::ELFSym::STB_WEAK);
+ break;
+ case GlobalValue::PrivateLinkage:
+ assert (0 && "PrivateLinkage should not be in the symbol table.");
+ case GlobalValue::InternalLinkage:
+ FnSym.SetBind(ELFWriter::ELFSym::STB_LOCAL);
+ break;
+ }
+
+ ES->Size = OutBuffer->size();
+
+ FnSym.SetType(ELFWriter::ELFSym::STT_FUNC);
+ FnSym.SectionIdx = ES->SectionIdx;
+ FnSym.Value = FnStart; // Value = Offset from start of Section.
+ FnSym.Size = OutBuffer->size()-FnStart;
+
+ // Finally, add it to the symtab.
+ EW.SymbolTable.push_back(FnSym);
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// ELFWriter Implementation
+//===----------------------------------------------------------------------===//
+
+ELFWriter::ELFWriter(raw_ostream &o, TargetMachine &tm)
+ : MachineFunctionPass(&ID), O(o), TM(tm) {
+ e_flags = 0; // e_flags defaults to 0, no flags.
+
+ is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64;
+ isLittleEndian = TM.getTargetData()->isLittleEndian();
+
+ // Create the machine code emitter object for this target.
+ MCE = new ELFCodeEmitter(*this);
+ NumSections = 0;
+}
+
+ELFWriter::~ELFWriter() {
+ delete MCE;
+}
+
+// doInitialization - Emit the file header and all of the global variables for
+// the module to the ELF file.
+bool ELFWriter::doInitialization(Module &M) {
+ Mang = new Mangler(M);
+
+  // Local alias to shorten the code that follows.
+ std::vector<unsigned char> &FH = FileHeader;
+ OutputBuffer FHOut(FH, is64Bit, isLittleEndian);
+
+ FHOut.outbyte(0x7F); // EI_MAG0
+ FHOut.outbyte('E'); // EI_MAG1
+ FHOut.outbyte('L'); // EI_MAG2
+ FHOut.outbyte('F'); // EI_MAG3
+ FHOut.outbyte(is64Bit ? 2 : 1); // EI_CLASS
+ FHOut.outbyte(isLittleEndian ? 1 : 2); // EI_DATA
+ FHOut.outbyte(1); // EI_VERSION
+ FH.resize(16); // EI_PAD up to 16 bytes.
+
+ // This should change for shared objects.
+ FHOut.outhalf(1); // e_type = ET_REL
+ FHOut.outhalf(TM.getELFWriterInfo()->getEMachine()); // target-defined
+ FHOut.outword(1); // e_version = 1
+ FHOut.outaddr(0); // e_entry = 0 -> no entry point in .o file
+ FHOut.outaddr(0); // e_phoff = 0 -> no program header for .o
+
+ ELFHeader_e_shoff_Offset = FH.size();
+ FHOut.outaddr(0); // e_shoff
+ FHOut.outword(e_flags); // e_flags = whatever the target wants
+
+ FHOut.outhalf(is64Bit ? 64 : 52); // e_ehsize = ELF header size
+ FHOut.outhalf(0); // e_phentsize = prog header entry size
+ FHOut.outhalf(0); // e_phnum = # prog header entries = 0
+ FHOut.outhalf(is64Bit ? 64 : 40); // e_shentsize = sect hdr entry size
+
+ ELFHeader_e_shnum_Offset = FH.size();
+ FHOut.outhalf(0); // e_shnum = # of section header ents
+ ELFHeader_e_shstrndx_Offset = FH.size();
+ FHOut.outhalf(0); // e_shstrndx = Section # of '.shstrtab'
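+
+  // Note: e_shoff, e_shnum and e_shstrndx are emitted as placeholders here
+  // and backpatched (via fixaddr/fixhalf) in EmitSectionTableStringTable and
+  // OutputSectionsAndSectionTable once their final values are known.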
+
+ // Add the null section, which is required to be first in the file.
+ getSection("", 0, 0);
+
+ // Start up the symbol table. The first entry in the symtab is the null
+ // entry.
+ SymbolTable.push_back(ELFSym(0));
+
+ return false;
+}
+
+void ELFWriter::EmitGlobal(GlobalVariable *GV) {
+ // If this is an external global, emit it now. TODO: Note that it would be
+ // better to ignore the symbol here and only add it to the symbol table if
+ // referenced.
+ if (!GV->hasInitializer()) {
+ ELFSym ExternalSym(GV);
+ ExternalSym.SetBind(ELFSym::STB_GLOBAL);
+ ExternalSym.SetType(ELFSym::STT_NOTYPE);
+ ExternalSym.SectionIdx = ELFSection::SHN_UNDEF;
+ SymbolTable.push_back(ExternalSym);
+ return;
+ }
+
+ unsigned Align = TM.getTargetData()->getPreferredAlignment(GV);
+ unsigned Size =
+ TM.getTargetData()->getTypeAllocSize(GV->getType()->getElementType());
+
+ // If this global has a zero initializer, it is part of the .bss or common
+ // section.
+ if (GV->getInitializer()->isNullValue()) {
+ // If this global is part of the common block, add it now. Variables are
+ // part of the common block if they are zero initialized and allowed to be
+ // merged with other symbols.
+ if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage() ||
+ GV->hasCommonLinkage()) {
+ ELFSym CommonSym(GV);
+ // Value for common symbols is the alignment required.
+ CommonSym.Value = Align;
+ CommonSym.Size = Size;
+ CommonSym.SetBind(ELFSym::STB_GLOBAL);
+ CommonSym.SetType(ELFSym::STT_OBJECT);
+ // TODO SOMEDAY: add ELF visibility.
+ CommonSym.SectionIdx = ELFSection::SHN_COMMON;
+ SymbolTable.push_back(CommonSym);
+ return;
+ }
+
+ // Otherwise, this symbol is part of the .bss section. Emit it now.
+
+ // Handle alignment. Ensure section is aligned at least as much as required
+ // by this symbol.
+ ELFSection &BSSSection = getBSSSection();
+ BSSSection.Align = std::max(BSSSection.Align, Align);
+
+ // Within the section, emit enough virtual padding to get us to an alignment
+ // boundary.
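+  // For example, Size = 13 with Align = 8 rounds up to 16; the masking
+  // trick requires Align to be a power of two.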
+ if (Align)
+ BSSSection.Size = (BSSSection.Size + Align - 1) & ~(Align-1);
+
+ ELFSym BSSSym(GV);
+ BSSSym.Value = BSSSection.Size;
+ BSSSym.Size = Size;
+ BSSSym.SetType(ELFSym::STT_OBJECT);
+
+ switch (GV->getLinkage()) {
+ default: // weak/linkonce/common handled above
+ assert(0 && "Unexpected linkage type!");
+ case GlobalValue::AppendingLinkage: // FIXME: This should be improved!
+ case GlobalValue::ExternalLinkage:
+ BSSSym.SetBind(ELFSym::STB_GLOBAL);
+ break;
+ case GlobalValue::InternalLinkage:
+ BSSSym.SetBind(ELFSym::STB_LOCAL);
+ break;
+ }
+
+ // Set the idx of the .bss section
+ BSSSym.SectionIdx = BSSSection.SectionIdx;
+ if (!GV->hasPrivateLinkage())
+ SymbolTable.push_back(BSSSym);
+
+ // Reserve space in the .bss section for this symbol.
+ BSSSection.Size += Size;
+ return;
+ }
+
+ // FIXME: handle .rodata
+ //assert(!GV->isConstant() && "unimp");
+
+ // FIXME: handle .data
+ //assert(0 && "unimp");
+}
+
+
+bool ELFWriter::runOnMachineFunction(MachineFunction &MF) {
+ // Nothing to do here, this is all done through the MCE object above.
+ return false;
+}
+
+/// doFinalization - Now that the module has been completely processed, emit
+/// the ELF file to 'O'.
+bool ELFWriter::doFinalization(Module &M) {
+ // Okay, the ELF header and .text sections have been completed, build the
+ // .data, .bss, and "common" sections next.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ EmitGlobal(I);
+
+ // Emit the symbol table now, if non-empty.
+ EmitSymbolTable();
+
+ // FIXME: Emit the relocations now.
+
+ // Emit the string table for the sections in the ELF file we have.
+ EmitSectionTableStringTable();
+
+ // Emit the sections to the .o file, and emit the section table for the file.
+ OutputSectionsAndSectionTable();
+
+ // We are done with the abstract symbols.
+ SectionList.clear();
+ NumSections = 0;
+
+ // Release the name mangler object.
+ delete Mang; Mang = 0;
+ return false;
+}
+
+/// EmitSymbolTable - If the current symbol table is non-empty, emit the string
+/// table for it and then the symbol table itself.
+void ELFWriter::EmitSymbolTable() {
+ if (SymbolTable.size() == 1) return; // Only the null entry.
+
+ // FIXME: compact all local symbols to the start of the symtab.
+ unsigned FirstNonLocalSymbol = 1;
+
+ ELFSection &StrTab = getSection(".strtab", ELFSection::SHT_STRTAB, 0);
+ StrTab.Align = 1;
+
+ DataBuffer &StrTabBuf = StrTab.SectionData;
+ OutputBuffer StrTabOut(StrTabBuf, is64Bit, isLittleEndian);
+
+  // The string table must begin with a null byte, which serves as the
+  // (empty) name of the null symbol.
+ StrTabOut.outbyte(0);
+ SymbolTable[0].NameIdx = 0;
+ unsigned Index = 1;
+ for (unsigned i = 1, e = SymbolTable.size(); i != e; ++i) {
+ // Use the name mangler to uniquify the LLVM symbol.
+ std::string Name = Mang->getValueName(SymbolTable[i].GV);
+
+ if (Name.empty()) {
+ SymbolTable[i].NameIdx = 0;
+ } else {
+ SymbolTable[i].NameIdx = Index;
+
+ // Add the name to the output buffer, including the null terminator.
+ StrTabBuf.insert(StrTabBuf.end(), Name.begin(), Name.end());
+
+ // Add a null terminator.
+ StrTabBuf.push_back(0);
+
+ // Keep track of the number of bytes emitted to this section.
+ Index += Name.size()+1;
+ }
+ }
+ assert(Index == StrTabBuf.size());
+ StrTab.Size = Index;
+
+ // Now that we have emitted the string table and know the offset into the
+ // string table of each symbol, emit the symbol table itself.
+ ELFSection &SymTab = getSection(".symtab", ELFSection::SHT_SYMTAB, 0);
+ SymTab.Align = is64Bit ? 8 : 4;
+  SymTab.Link = StrTab.SectionIdx;      // sh_link = section index of .strtab.
+ SymTab.Info = FirstNonLocalSymbol; // First non-STB_LOCAL symbol.
+  SymTab.EntSize = is64Bit ? 24 : 16;   // Size of each symtab entry.
+ DataBuffer &SymTabBuf = SymTab.SectionData;
+ OutputBuffer SymTabOut(SymTabBuf, is64Bit, isLittleEndian);
+
+ if (!is64Bit) { // 32-bit and 64-bit formats are shuffled a bit.
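+    // ELF32 symbol entries are laid out as: st_name, st_value, st_size,
+    // st_info, st_other, st_shndx (16 bytes each).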
+ for (unsigned i = 0, e = SymbolTable.size(); i != e; ++i) {
+ ELFSym &Sym = SymbolTable[i];
+ SymTabOut.outword(Sym.NameIdx);
+ SymTabOut.outaddr32(Sym.Value);
+ SymTabOut.outword(Sym.Size);
+ SymTabOut.outbyte(Sym.Info);
+ SymTabOut.outbyte(Sym.Other);
+ SymTabOut.outhalf(Sym.SectionIdx);
+ }
+ } else {
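+    // ELF64 symbol entries are laid out as: st_name, st_info, st_other,
+    // st_shndx, st_value, st_size (24 bytes each).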
+ for (unsigned i = 0, e = SymbolTable.size(); i != e; ++i) {
+ ELFSym &Sym = SymbolTable[i];
+ SymTabOut.outword(Sym.NameIdx);
+ SymTabOut.outbyte(Sym.Info);
+ SymTabOut.outbyte(Sym.Other);
+ SymTabOut.outhalf(Sym.SectionIdx);
+ SymTabOut.outaddr64(Sym.Value);
+ SymTabOut.outxword(Sym.Size);
+ }
+ }
+
+ SymTab.Size = SymTabBuf.size();
+}
+
+/// EmitSectionTableStringTable - This method adds and emits a section for the
+/// ELF Section Table string table: the string table that holds all of the
+/// section names.
+void ELFWriter::EmitSectionTableStringTable() {
+ // First step: add the section for the string table to the list of sections:
+ ELFSection &SHStrTab = getSection(".shstrtab", ELFSection::SHT_STRTAB, 0);
+
+ // Now that we know which section number is the .shstrtab section, update the
+ // e_shstrndx entry in the ELF header.
+ OutputBuffer FHOut(FileHeader, is64Bit, isLittleEndian);
+ FHOut.fixhalf(SHStrTab.SectionIdx, ELFHeader_e_shstrndx_Offset);
+
+ // Set the NameIdx of each section in the string table and emit the bytes for
+ // the string table.
+ unsigned Index = 0;
+ DataBuffer &Buf = SHStrTab.SectionData;
+
+ for (std::list<ELFSection>::iterator I = SectionList.begin(),
+ E = SectionList.end(); I != E; ++I) {
+ // Set the index into the table. Note if we have lots of entries with
+ // common suffixes, we could memoize them here if we cared.
+ I->NameIdx = Index;
+
+ // Add the name to the output buffer, including the null terminator.
+ Buf.insert(Buf.end(), I->Name.begin(), I->Name.end());
+
+ // Add a null terminator.
+ Buf.push_back(0);
+
+ // Keep track of the number of bytes emitted to this section.
+ Index += I->Name.size()+1;
+ }
+
+ // Set the size of .shstrtab now that we know what it is.
+ assert(Index == Buf.size());
+ SHStrTab.Size = Index;
+}
+
+/// OutputSectionsAndSectionTable - Now that we have constructed the file header
+/// and all of the sections, emit these to the ostream destination and emit the
+/// SectionTable.
+void ELFWriter::OutputSectionsAndSectionTable() {
+ // Pass #1: Compute the file offset for each section.
+ size_t FileOff = FileHeader.size(); // File header first.
+
+  // Compute the file offset of each section's data, in order.
+ for (std::list<ELFSection>::iterator I = SectionList.begin(),
+ E = SectionList.end(); I != E; ++I) {
+ // Align FileOff to whatever the alignment restrictions of the section are.
+ if (I->Align)
+ FileOff = (FileOff+I->Align-1) & ~(I->Align-1);
+ I->Offset = FileOff;
+ FileOff += I->SectionData.size();
+ }
+
+  // Align the file offset for the section header table.
+ unsigned TableAlign = is64Bit ? 8 : 4;
+ FileOff = (FileOff+TableAlign-1) & ~(TableAlign-1);
+
+ // Now that we know where all of the sections will be emitted, set the e_shnum
+ // entry in the ELF header.
+ OutputBuffer FHOut(FileHeader, is64Bit, isLittleEndian);
+ FHOut.fixhalf(NumSections, ELFHeader_e_shnum_Offset);
+
+ // Now that we know the offset in the file of the section table, update the
+ // e_shoff address in the ELF header.
+ FHOut.fixaddr(FileOff, ELFHeader_e_shoff_Offset);
+
+ // Now that we know all of the data in the file header, emit it and all of the
+ // sections!
+ O.write((char*)&FileHeader[0], FileHeader.size());
+ FileOff = FileHeader.size();
+ DataBuffer().swap(FileHeader);
+
+ DataBuffer Table;
+ OutputBuffer TableOut(Table, is64Bit, isLittleEndian);
+
+ // Emit all of the section data and build the section table itself.
+ while (!SectionList.empty()) {
+ const ELFSection &S = *SectionList.begin();
+
+ // Align FileOff to whatever the alignment restrictions of the section are.
+ if (S.Align)
+ for (size_t NewFileOff = (FileOff+S.Align-1) & ~(S.Align-1);
+ FileOff != NewFileOff; ++FileOff)
+ O << (char)0xAB;
+ O.write((char*)&S.SectionData[0], S.SectionData.size());
+ FileOff += S.SectionData.size();
+
+ TableOut.outword(S.NameIdx); // sh_name - Symbol table name idx
+ TableOut.outword(S.Type); // sh_type - Section contents & semantics
+ TableOut.outword(S.Flags); // sh_flags - Section flags.
+ TableOut.outaddr(S.Addr); // sh_addr - The mem addr this section is in.
+ TableOut.outaddr(S.Offset); // sh_offset - Offset from the file start.
+ TableOut.outword(S.Size); // sh_size - The section size.
+ TableOut.outword(S.Link); // sh_link - Section header table index link.
+    TableOut.outword(S.Info);    // sh_info - Auxiliary information.
+ TableOut.outword(S.Align); // sh_addralign - Alignment of section.
+ TableOut.outword(S.EntSize); // sh_entsize - Size of entries in the section
+
+ SectionList.pop_front();
+ }
+
+ // Align output for the section table.
+ for (size_t NewFileOff = (FileOff+TableAlign-1) & ~(TableAlign-1);
+ FileOff != NewFileOff; ++FileOff)
+ O << (char)0xAB;
+
+ // Emit the section table itself.
+ O.write((char*)&Table[0], Table.size());
+}
diff --git a/lib/CodeGen/ELFWriter.h b/lib/CodeGen/ELFWriter.h
new file mode 100644
index 0000000..31aa05a
--- /dev/null
+++ b/lib/CodeGen/ELFWriter.h
@@ -0,0 +1,230 @@
+//===-- ELFWriter.h - Target-independent ELF writer support -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ELFWriter class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ELFWRITER_H
+#define ELFWRITER_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include <list>
+#include <map>
+
+namespace llvm {
+ class GlobalVariable;
+ class Mangler;
+ class MachineCodeEmitter;
+ class ELFCodeEmitter;
+ class raw_ostream;
+
+ /// ELFWriter - This class implements the common target-independent code for
+ /// writing ELF files. Targets should derive a class from this to
+ /// parameterize the output format.
+ ///
+ class ELFWriter : public MachineFunctionPass {
+ friend class ELFCodeEmitter;
+ public:
+ static char ID;
+
+ MachineCodeEmitter &getMachineCodeEmitter() const {
+ return *(MachineCodeEmitter*)MCE;
+ }
+
+ ELFWriter(raw_ostream &O, TargetMachine &TM);
+ ~ELFWriter();
+
+ typedef std::vector<unsigned char> DataBuffer;
+
+ protected:
+ /// Output stream to send the resultant object file to.
+ ///
+ raw_ostream &O;
+
+ /// Target machine description.
+ ///
+ TargetMachine &TM;
+
+ /// Mang - The object used to perform name mangling for this module.
+ ///
+ Mangler *Mang;
+
+ /// MCE - The MachineCodeEmitter object that we are exposing to emit machine
+ /// code for functions to the .o file.
+ ELFCodeEmitter *MCE;
+
+ //===------------------------------------------------------------------===//
+ // Properties to be set by the derived class ctor, used to configure the
+ // ELFWriter.
+
+ // e_machine - This field is the target specific value to emit as the
+ // e_machine member of the ELF header.
+ unsigned short e_machine;
+
+ // e_flags - The machine flags for the target. This defaults to zero.
+ unsigned e_flags;
+
+ //===------------------------------------------------------------------===//
+ // Properties inferred automatically from the target machine.
+ //
+
+ /// is64Bit/isLittleEndian - This information is inferred from the target
+ /// machine directly, indicating whether to emit a 32- or 64-bit ELF file.
+ bool is64Bit, isLittleEndian;
+
+ /// doInitialization - Emit the file header and all of the global variables
+ /// for the module to the ELF file.
+ bool doInitialization(Module &M);
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+
+ /// doFinalization - Now that the module has been completely processed, emit
+ /// the ELF file to 'O'.
+ bool doFinalization(Module &M);
+
+ private:
+ // The buffer we accumulate the file header into. Note that this should be
+ // changed into something much more efficient later (and the bitcode writer
+ // as well!).
+ DataBuffer FileHeader;
+
+ /// ELFSection - This struct contains information about each section that is
+ /// emitted to the file. This is eventually turned into the section header
+ /// table at the end of the file.
+ struct ELFSection {
+ std::string Name; // Name of the section.
+ unsigned NameIdx; // Index in .shstrtab of name, once emitted.
+ unsigned Type;
+ unsigned Flags;
+ uint64_t Addr;
+ unsigned Offset;
+ unsigned Size;
+ unsigned Link;
+ unsigned Info;
+ unsigned Align;
+ unsigned EntSize;
+
+ /// SectionIdx - The number of the section in the Section Table.
+ ///
+ unsigned short SectionIdx;
+
+ /// SectionData - The actual data for this section which we are building
+ /// up for emission to the file.
+ DataBuffer SectionData;
+
+ enum { SHT_NULL = 0, SHT_PROGBITS = 1, SHT_SYMTAB = 2, SHT_STRTAB = 3,
+ SHT_RELA = 4, SHT_HASH = 5, SHT_DYNAMIC = 6, SHT_NOTE = 7,
+ SHT_NOBITS = 8, SHT_REL = 9, SHT_SHLIB = 10, SHT_DYNSYM = 11 };
+ enum { SHN_UNDEF = 0, SHN_ABS = 0xFFF1, SHN_COMMON = 0xFFF2 };
+ enum { // SHF - ELF Section Header Flags
+ SHF_WRITE = 1 << 0, // Writable
+ SHF_ALLOC = 1 << 1, // Mapped into the process addr space
+ SHF_EXECINSTR = 1 << 2, // Executable
+ SHF_MERGE = 1 << 4, // Might be merged if equal
+ SHF_STRINGS = 1 << 5, // Contains null-terminated strings
+ SHF_INFO_LINK = 1 << 6, // 'sh_info' contains SHT index
+ SHF_LINK_ORDER = 1 << 7, // Preserve order after combining
+ SHF_OS_NONCONFORMING = 1 << 8, // nonstandard OS support required
+ SHF_GROUP = 1 << 9, // Section is a member of a group
+ SHF_TLS = 1 << 10 // Section holds thread-local data
+ };
+
+ ELFSection(const std::string &name)
+ : Name(name), Type(0), Flags(0), Addr(0), Offset(0), Size(0),
+ Link(0), Info(0), Align(0), EntSize(0) {
+ }
+ };
+
+ /// SectionList - This is the list of sections that we have emitted to the
+ /// file. Once the file has been completely built, the section header table
+ /// is constructed from this info.
+ std::list<ELFSection> SectionList;
+ unsigned NumSections; // Always = SectionList.size()
+
+ /// SectionLookup - This is a mapping from section name to section number in
+ /// the SectionList.
+ std::map<std::string, ELFSection*> SectionLookup;
+
+ /// getSection - Return the section with the specified name, creating a new
+ /// section if one does not already exist.
+ ELFSection &getSection(const std::string &Name,
+ unsigned Type, unsigned Flags = 0) {
+ ELFSection *&SN = SectionLookup[Name];
+ if (SN) return *SN;
+
+ SectionList.push_back(Name);
+ SN = &SectionList.back();
+ SN->SectionIdx = NumSections++;
+ SN->Type = Type;
+ SN->Flags = Flags;
+ return *SN;
+ }
+
+ ELFSection &getDataSection() {
+ return getSection(".data", ELFSection::SHT_PROGBITS,
+ ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC);
+ }
+ ELFSection &getBSSSection() {
+ return getSection(".bss", ELFSection::SHT_NOBITS,
+ ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC);
+ }
+
+ /// ELFSym - This struct contains information about each symbol that is
+ /// added to logical symbol table for the module. This is eventually
+ /// turned into a real symbol table in the file.
+ struct ELFSym {
+ const GlobalValue *GV; // The global value this corresponds to.
+ unsigned NameIdx; // Index in .strtab of name, once emitted.
+ uint64_t Value;
+ unsigned Size;
+ unsigned char Info;
+ unsigned char Other;
+ unsigned short SectionIdx;
+
+ enum { STB_LOCAL = 0, STB_GLOBAL = 1, STB_WEAK = 2 };
+ enum { STT_NOTYPE = 0, STT_OBJECT = 1, STT_FUNC = 2, STT_SECTION = 3,
+ STT_FILE = 4 };
+ ELFSym(const GlobalValue *gv) : GV(gv), Value(0), Size(0), Info(0),
+ Other(0), SectionIdx(0) {}
+
+ void SetBind(unsigned X) {
+ assert(X == (X & 0xF) && "Bind value out of range!");
+ Info = (Info & 0x0F) | (X << 4);
+ }
+ void SetType(unsigned X) {
+ assert(X == (X & 0xF) && "Type value out of range!");
+ Info = (Info & 0xF0) | X;
+ }
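+
+      // For example, a global function symbol packs as
+      // Info = (STB_GLOBAL << 4) | STT_FUNC = (1 << 4) | 2 = 0x12,
+      // the standard ELF st_info encoding.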
+ };
+
+ /// SymbolTable - This is the list of symbols we have emitted to the file.
+ /// This actually gets rearranged before emission to the file (to put the
+ /// local symbols first in the list).
+ std::vector<ELFSym> SymbolTable;
+
+ // As we complete the ELF file, we need to update fields in the ELF header
+ // (e.g. the location of the section table). These members keep track of
+ // the offset in ELFHeader of these various pieces to update and other
+ // locations in the file.
+ unsigned ELFHeader_e_shoff_Offset; // e_shoff in ELF header.
+ unsigned ELFHeader_e_shstrndx_Offset; // e_shstrndx in ELF header.
+ unsigned ELFHeader_e_shnum_Offset; // e_shnum in ELF header.
+ private:
+ void EmitGlobal(GlobalVariable *GV);
+
+ void EmitSymbolTable();
+
+ void EmitSectionTableStringTable();
+ void OutputSectionsAndSectionTable();
+ };
+}
+
+#endif
diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp
new file mode 100644
index 0000000..cf2ebb3
--- /dev/null
+++ b/lib/CodeGen/GCMetadata.cpp
@@ -0,0 +1,212 @@
+//===-- GCMetadata.cpp - Garbage collector metadata -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GCFunctionInfo class and GCModuleInfo pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Compiler.h"
+
+using namespace llvm;
+
+namespace {
+
+ class VISIBILITY_HIDDEN Printer : public FunctionPass {
+ static char ID;
+ std::ostream &OS;
+
+ public:
+ explicit Printer(std::ostream &OS = *cerr);
+
+ const char *getPassName() const;
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ bool runOnFunction(Function &F);
+ };
+
+ class VISIBILITY_HIDDEN Deleter : public FunctionPass {
+ static char ID;
+
+ public:
+ Deleter();
+
+ const char *getPassName() const;
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ bool runOnFunction(Function &F);
+ bool doFinalization(Module &M);
+ };
+
+}
+
+static RegisterPass<GCModuleInfo>
+X("collector-metadata", "Create Garbage Collector Module Metadata");
+
+// -----------------------------------------------------------------------------
+
+GCFunctionInfo::GCFunctionInfo(const Function &F, GCStrategy &S)
+ : F(F), S(S), FrameSize(~0LL) {}
+
+GCFunctionInfo::~GCFunctionInfo() {}
+
+// -----------------------------------------------------------------------------
+
+char GCModuleInfo::ID = 0;
+
+GCModuleInfo::GCModuleInfo()
+ : ImmutablePass(&ID) {}
+
+GCModuleInfo::~GCModuleInfo() {
+ clear();
+}
+
+GCStrategy *GCModuleInfo::getOrCreateStrategy(const Module *M,
+ const std::string &Name) {
+ const char *Start = Name.c_str();
+
+ strategy_map_type::iterator NMI =
+ StrategyMap.find(Start, Start + Name.size());
+ if (NMI != StrategyMap.end())
+ return NMI->getValue();
+
+ for (GCRegistry::iterator I = GCRegistry::begin(),
+ E = GCRegistry::end(); I != E; ++I) {
+ if (strcmp(Start, I->getName()) == 0) {
+ GCStrategy *S = I->instantiate();
+ S->M = M;
+ S->Name = Name;
+ StrategyMap.GetOrCreateValue(Start, Start + Name.size()).setValue(S);
+ StrategyList.push_back(S);
+ return S;
+ }
+ }
+
+ cerr << "unsupported GC: " << Name << "\n";
+ abort();
+}
+
+GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) {
+ assert(!F.isDeclaration() && "Can only get GCFunctionInfo for a definition!");
+ assert(F.hasGC());
+
+ finfo_map_type::iterator I = FInfoMap.find(&F);
+ if (I != FInfoMap.end())
+ return *I->second;
+
+ GCStrategy *S = getOrCreateStrategy(F.getParent(), F.getGC());
+ GCFunctionInfo *GFI = S->insertFunctionInfo(F);
+ FInfoMap[&F] = GFI;
+ return *GFI;
+}
+
+void GCModuleInfo::clear() {
+ FInfoMap.clear();
+ StrategyMap.clear();
+
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ delete *I;
+ StrategyList.clear();
+}
+
+// -----------------------------------------------------------------------------
+
+char Printer::ID = 0;
+
+FunctionPass *llvm::createGCInfoPrinter(std::ostream &OS) {
+ return new Printer(OS);
+}
+
+Printer::Printer(std::ostream &OS)
+ : FunctionPass(&ID), OS(OS) {}
+
+const char *Printer::getPassName() const {
+ return "Print Garbage Collector Information";
+}
+
+void Printer::getAnalysisUsage(AnalysisUsage &AU) const {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<GCModuleInfo>();
+}
+
+static const char *DescKind(GC::PointKind Kind) {
+ switch (Kind) {
+ default: assert(0 && "Unknown GC point kind");
+ case GC::Loop: return "loop";
+ case GC::Return: return "return";
+ case GC::PreCall: return "pre-call";
+ case GC::PostCall: return "post-call";
+ }
+}
+
+bool Printer::runOnFunction(Function &F) {
+  if (F.hasGC()) {
+ GCFunctionInfo *FD = &getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+
+ OS << "GC roots for " << FD->getFunction().getNameStart() << ":\n";
+ for (GCFunctionInfo::roots_iterator RI = FD->roots_begin(),
+ RE = FD->roots_end(); RI != RE; ++RI)
+ OS << "\t" << RI->Num << "\t" << RI->StackOffset << "[sp]\n";
+
+ OS << "GC safe points for " << FD->getFunction().getNameStart() << ":\n";
+ for (GCFunctionInfo::iterator PI = FD->begin(),
+ PE = FD->end(); PI != PE; ++PI) {
+
+ OS << "\tlabel " << PI->Num << ": " << DescKind(PI->Kind) << ", live = {";
+
+      // Print the live roots, comma-separated; tolerate an empty live set.
+      for (GCFunctionInfo::live_iterator RI = FD->live_begin(PI),
+                                         RE = FD->live_end(PI); RI != RE;) {
+        OS << " " << RI->Num;
+        if (++RI != RE)
+          OS << ",";
+      }
+
+ OS << " }\n";
+ }
+ }
+
+ return false;
+}
+
+// -----------------------------------------------------------------------------
+
+char Deleter::ID = 0;
+
+FunctionPass *llvm::createGCInfoDeleter() {
+ return new Deleter();
+}
+
+Deleter::Deleter() : FunctionPass(&ID) {}
+
+const char *Deleter::getPassName() const {
+ return "Delete Garbage Collector Information";
+}
+
+void Deleter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<GCModuleInfo>();
+}
+
+bool Deleter::runOnFunction(Function &MF) {
+ return false;
+}
+
+bool Deleter::doFinalization(Module &M) {
+ GCModuleInfo *GMI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(GMI && "Deleter didn't require GCModuleInfo?!");
+ GMI->clear();
+ return false;
+}
diff --git a/lib/CodeGen/GCMetadataPrinter.cpp b/lib/CodeGen/GCMetadataPrinter.cpp
new file mode 100644
index 0000000..5a5ef84
--- /dev/null
+++ b/lib/CodeGen/GCMetadataPrinter.cpp
@@ -0,0 +1,30 @@
+//===-- GCMetadataPrinter.cpp - Garbage collection infrastructure ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the abstract base class GCMetadataPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+
+using namespace llvm;
+
+GCMetadataPrinter::GCMetadataPrinter() { }
+
+GCMetadataPrinter::~GCMetadataPrinter() { }
+
+void GCMetadataPrinter::beginAssembly(raw_ostream &OS, AsmPrinter &AP,
+ const TargetAsmInfo &TAI) {
+ // Default is no action.
+}
+
+void GCMetadataPrinter::finishAssembly(raw_ostream &OS, AsmPrinter &AP,
+ const TargetAsmInfo &TAI) {
+ // Default is no action.
+}
diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp
new file mode 100644
index 0000000..ad7421a
--- /dev/null
+++ b/lib/CodeGen/GCStrategy.cpp
@@ -0,0 +1,392 @@
+//===-- GCStrategy.cpp - Garbage collection infrastructure -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements target- and collector-independent garbage collection
+// infrastructure.
+//
+// MachineCodeAnalysis identifies the GC safe points in the machine code. Roots
+// are identified in SelectionDAGISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+
+using namespace llvm;
+
+namespace {
+
+ /// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or
+ /// llvm.gcwrite intrinsics, replacing them with simple loads and stores as
+ /// directed by the GCStrategy. It also performs automatic root initialization
+ /// and custom intrinsic lowering.
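+  /// For example (sketching the default lowering below): a call to
+  /// llvm.gcwrite(%v, %obj, %slot) becomes "store %v, %slot", and a call
+  /// to llvm.gcread(%obj, %slot) becomes a load of %slot.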
+ class VISIBILITY_HIDDEN LowerIntrinsics : public FunctionPass {
+ static bool NeedsDefaultLoweringPass(const GCStrategy &C);
+ static bool NeedsCustomLoweringPass(const GCStrategy &C);
+ static bool CouldBecomeSafePoint(Instruction *I);
+ bool PerformDefaultLowering(Function &F, GCStrategy &Coll);
+ static bool InsertRootInitializers(Function &F,
+ AllocaInst **Roots, unsigned Count);
+
+ public:
+ static char ID;
+
+ LowerIntrinsics();
+ const char *getPassName() const;
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ bool doInitialization(Module &M);
+ bool runOnFunction(Function &F);
+ };
+
+
+ /// MachineCodeAnalysis - This is a target-independent pass over the machine
+ /// function representation to identify safe points for the garbage collector
+ /// in the machine code. It inserts labels at safe points and populates a
+ /// GCMetadata record for each function.
+ class VISIBILITY_HIDDEN MachineCodeAnalysis : public MachineFunctionPass {
+ const TargetMachine *TM;
+ GCFunctionInfo *FI;
+ MachineModuleInfo *MMI;
+ const TargetInstrInfo *TII;
+
+ void FindSafePoints(MachineFunction &MF);
+ void VisitCallPoint(MachineBasicBlock::iterator MI);
+ unsigned InsertLabel(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+
+ void FindStackOffsets(MachineFunction &MF);
+
+ public:
+ static char ID;
+
+ MachineCodeAnalysis();
+ const char *getPassName() const;
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ bool runOnMachineFunction(MachineFunction &MF);
+ };
+
+}
+
+// -----------------------------------------------------------------------------
+
+GCStrategy::GCStrategy() :
+ NeededSafePoints(0),
+ CustomReadBarriers(false),
+ CustomWriteBarriers(false),
+ CustomRoots(false),
+ InitRoots(true),
+ UsesMetadata(false)
+{}
+
+GCStrategy::~GCStrategy() {
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ delete *I;
+
+ Functions.clear();
+}
+
+bool GCStrategy::initializeCustomLowering(Module &M) { return false; }
+
+bool GCStrategy::performCustomLowering(Function &F) {
+ cerr << "gc " << getName() << " must override performCustomLowering.\n";
+ abort();
+ return 0;
+}
+
+GCFunctionInfo *GCStrategy::insertFunctionInfo(const Function &F) {
+ GCFunctionInfo *FI = new GCFunctionInfo(F, *this);
+ Functions.push_back(FI);
+ return FI;
+}
+
+// -----------------------------------------------------------------------------
+
+FunctionPass *llvm::createGCLoweringPass() {
+ return new LowerIntrinsics();
+}
+
+char LowerIntrinsics::ID = 0;
+
+LowerIntrinsics::LowerIntrinsics()
+ : FunctionPass(&ID) {}
+
+const char *LowerIntrinsics::getPassName() const {
+ return "Lower Garbage Collection Instructions";
+}
+
+void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<GCModuleInfo>();
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now.
+bool LowerIntrinsics::doInitialization(Module &M) {
+ // FIXME: This is rather antisocial in the context of a JIT since it performs
+ // work against the entire module. But this cannot be done at
+ // runFunction time (initializeCustomLowering likely needs to change
+ // the module).
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?");
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!I->isDeclaration() && I->hasGC())
+ MI->getFunctionInfo(*I); // Instantiate the GC strategy.
+
+ bool MadeChange = false;
+ for (GCModuleInfo::iterator I = MI->begin(), E = MI->end(); I != E; ++I)
+ if (NeedsCustomLoweringPass(**I))
+ if ((*I)->initializeCustomLowering(M))
+ MadeChange = true;
+
+ return MadeChange;
+}
+
+bool LowerIntrinsics::InsertRootInitializers(Function &F, AllocaInst **Roots,
+ unsigned Count) {
+ // Scroll past alloca instructions.
+ BasicBlock::iterator IP = F.getEntryBlock().begin();
+ while (isa<AllocaInst>(IP)) ++IP;
+
+ // Search for initializers in the initial BB.
+ SmallPtrSet<AllocaInst*,16> InitedRoots;
+ for (; !CouldBecomeSafePoint(IP); ++IP)
+ if (StoreInst *SI = dyn_cast<StoreInst>(IP))
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts()))
+ InitedRoots.insert(AI);
+
+ // Add root initializers.
+ bool MadeChange = false;
+
+ for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
+ if (!InitedRoots.count(*I)) {
+ new StoreInst(ConstantPointerNull::get(cast<PointerType>(
+ cast<PointerType>((*I)->getType())->getElementType())),
+ *I, IP);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+bool LowerIntrinsics::NeedsDefaultLoweringPass(const GCStrategy &C) {
+ // Default lowering is necessary only if read or write barriers have a default
+ // action. The default for roots is no action.
+ return !C.customWriteBarrier()
+ || !C.customReadBarrier()
+ || C.initializeRoots();
+}
+
+bool LowerIntrinsics::NeedsCustomLoweringPass(const GCStrategy &C) {
+ // Custom lowering is only necessary if enabled for some action.
+ return C.customWriteBarrier()
+ || C.customReadBarrier()
+ || C.customRoots();
+}
+
+/// CouldBecomeSafePoint - Predicate to conservatively determine whether the
+/// instruction could introduce a safe point.
+bool LowerIntrinsics::CouldBecomeSafePoint(Instruction *I) {
+ // The natural definition of instructions which could introduce safe points
+ // are:
+ //
+ // - call, invoke (AfterCall, BeforeCall)
+ // - phis (Loops)
+ // - invoke, ret, unwind (Exit)
+ //
+  // However, instructions as seemingly innocuous as arithmetic can become
+ // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead
+ // it is necessary to take a conservative approach.
+
+ if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<StoreInst>(I) || isa<LoadInst>(I))
+ return false;
+
+ // llvm.gcroot is safe because it doesn't do anything at runtime.
+ if (CallInst *CI = dyn_cast<CallInst>(I))
+ if (Function *F = CI->getCalledFunction())
+ if (unsigned IID = F->getIntrinsicID())
+ if (IID == Intrinsic::gcroot)
+ return false;
+
+ return true;
+}
+
+/// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores.
+/// Leave gcroot intrinsics; the code generator needs to see those.
+bool LowerIntrinsics::runOnFunction(Function &F) {
+ // Quick exit for functions that do not use GC.
+ if (!F.hasGC())
+ return false;
+
+ GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+ GCStrategy &S = FI.getStrategy();
+
+ bool MadeChange = false;
+
+ if (NeedsDefaultLoweringPass(S))
+ MadeChange |= PerformDefaultLowering(F, S);
+
+ if (NeedsCustomLoweringPass(S))
+ MadeChange |= S.performCustomLowering(F);
+
+ return MadeChange;
+}
+
+bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) {
+ bool LowerWr = !S.customWriteBarrier();
+ bool LowerRd = !S.customReadBarrier();
+ bool InitRoots = S.initializeRoots();
+
+ SmallVector<AllocaInst*,32> Roots;
+
+ bool MadeChange = false;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) {
+ Function *F = CI->getCalledFunction();
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::gcwrite:
+ if (LowerWr) {
+ // Replace a write barrier with a simple store.
+ Value *St = new StoreInst(CI->getOperand(1), CI->getOperand(3), CI);
+ CI->replaceAllUsesWith(St);
+ CI->eraseFromParent();
+ }
+ break;
+ case Intrinsic::gcread:
+ if (LowerRd) {
+ // Replace a read barrier with a simple load.
+ Value *Ld = new LoadInst(CI->getOperand(2), "", CI);
+ Ld->takeName(CI);
+ CI->replaceAllUsesWith(Ld);
+ CI->eraseFromParent();
+ }
+ break;
+ case Intrinsic::gcroot:
+ if (InitRoots) {
+ // Initialize the GC root, but do not delete the intrinsic. The
+ // backend needs the intrinsic to flag the stack slot.
+ Roots.push_back(cast<AllocaInst>(
+ CI->getOperand(1)->stripPointerCasts()));
+ }
+ break;
+ default:
+ continue;
+ }
+
+ MadeChange = true;
+ }
+ }
+ }
+
+ if (Roots.size())
+ MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size());
+
+ return MadeChange;
+}
+
+// -----------------------------------------------------------------------------
+
+FunctionPass *llvm::createGCMachineCodeAnalysisPass() {
+ return new MachineCodeAnalysis();
+}
+
+char MachineCodeAnalysis::ID = 0;
+
+MachineCodeAnalysis::MachineCodeAnalysis()
+ : MachineFunctionPass(&ID) {}
+
+const char *MachineCodeAnalysis::getPassName() const {
+ return "Analyze Machine Code For Garbage Collection";
+}
+
+void MachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<GCModuleInfo>();
+}
+
+unsigned MachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ unsigned Label = MMI->NextLabelID();
+ // N.B. we assume that MI is *not* equal to the "end()" iterator.
+ BuildMI(MBB, MI, MI->getDebugLoc(),
+ TII->get(TargetInstrInfo::GC_LABEL)).addImm(Label);
+ return Label;
+}
+
+void MachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
+ // Find the return address (next instruction), too, so as to bracket the call
+ // instruction.
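+  // For example, when the strategy requests both kinds, the call at CI ends
+  // up bracketed: a GC_LABEL is inserted immediately before it (PreCall) and
+  // another immediately after it (PostCall).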
+ MachineBasicBlock::iterator RAI = CI;
+ ++RAI;
+
+ if (FI->getStrategy().needsSafePoint(GC::PreCall))
+ FI->addSafePoint(GC::PreCall, InsertLabel(*CI->getParent(), CI));
+
+ if (FI->getStrategy().needsSafePoint(GC::PostCall))
+ FI->addSafePoint(GC::PostCall, InsertLabel(*CI->getParent(), RAI));
+}
+
+void MachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
+ for (MachineFunction::iterator BBI = MF.begin(),
+ BBE = MF.end(); BBI != BBE; ++BBI)
+ for (MachineBasicBlock::iterator MI = BBI->begin(),
+ ME = BBI->end(); MI != ME; ++MI)
+ if (MI->getDesc().isCall())
+ VisitCallPoint(MI);
+}
+
+void MachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
+ const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+ assert(TRI && "TargetRegisterInfo not available!");
+
+ for (GCFunctionInfo::roots_iterator RI = FI->roots_begin(),
+ RE = FI->roots_end(); RI != RE; ++RI)
+ RI->StackOffset = TRI->getFrameIndexOffset(MF, RI->Num);
+}
+
+bool MachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
+ // Quick exit for functions that do not use GC.
+ if (!MF.getFunction()->hasGC())
+ return false;
+
+ FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(*MF.getFunction());
+ if (!FI->getStrategy().needsSafePoints())
+ return false;
+
+ TM = &MF.getTarget();
+ MMI = &getAnalysis<MachineModuleInfo>();
+ TII = TM->getInstrInfo();
+
+ // Find the size of the stack frame.
+ FI->setFrameSize(MF.getFrameInfo()->getStackSize());
+
+ // Find all safe points.
+ FindSafePoints(MF);
+
+ // Find the stack offsets for all roots.
+ FindStackOffsets(MF);
+
+ return false;
+}
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
new file mode 100644
index 0000000..1d0887f
--- /dev/null
+++ b/lib/CodeGen/IfConversion.cpp
@@ -0,0 +1,1229 @@
+//===-- IfConversion.cpp - Machine code if conversion pass. ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level if-conversion pass.
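+// If-conversion eliminates short conditional branches by predicating the
+// instructions on the branch paths; e.g. on a fully predicated target such
+// as ARM, "beq L; mov r0, #1; L:" can become "movne r0, #1", trading a
+// branch for conditionally executed instructions.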
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ifcvt"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+// Hidden options to help with debugging.
+static cl::opt<int> IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden);
+static cl::opt<int> IfCvtFnStop("ifcvt-fn-stop", cl::init(-1), cl::Hidden);
+static cl::opt<int> IfCvtLimit("ifcvt-limit", cl::init(-1), cl::Hidden);
+static cl::opt<bool> DisableSimple("disable-ifcvt-simple",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableSimpleF("disable-ifcvt-simple-false",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangle("disable-ifcvt-triangle",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleR("disable-ifcvt-triangle-rev",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleF("disable-ifcvt-triangle-false",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleFR("disable-ifcvt-triangle-false-rev",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableDiamond("disable-ifcvt-diamond",
+ cl::init(false), cl::Hidden);
+
+STATISTIC(NumSimple, "Number of simple if-conversions performed");
+STATISTIC(NumSimpleFalse, "Number of simple (F) if-conversions performed");
+STATISTIC(NumTriangle, "Number of triangle if-conversions performed");
+STATISTIC(NumTriangleRev, "Number of triangle (R) if-conversions performed");
+STATISTIC(NumTriangleFalse,"Number of triangle (F) if-conversions performed");
+STATISTIC(NumTriangleFRev, "Number of triangle (F/R) if-conversions performed");
+STATISTIC(NumDiamonds, "Number of diamond if-conversions performed");
+STATISTIC(NumIfConvBBs, "Number of if-converted blocks");
+STATISTIC(NumDupBBs, "Number of duplicated blocks");
+
+namespace {
+ class VISIBILITY_HIDDEN IfConverter : public MachineFunctionPass {
+ enum IfcvtKind {
+ ICNotClassfied, // BB data valid, but not classified.
+ ICSimpleFalse, // Same as ICSimple, but on the false path.
+    ICSimple,        // BB is entry of a one-split, no-rejoin sub-CFG.
+ ICTriangleFRev, // Same as ICTriangleFalse, but false path rev condition.
+ ICTriangleRev, // Same as ICTriangle, but true path rev condition.
+ ICTriangleFalse, // Same as ICTriangle, but on the false path.
+ ICTriangle, // BB is entry of a triangle sub-CFG.
+ ICDiamond // BB is entry of a diamond sub-CFG.
+ };
+
+    /// BBInfo - One per MachineBasicBlock, this is used to cache the result
+    /// of the if-conversion feasibility analysis. This includes results from
+    /// TargetInstrInfo::AnalyzeBranch() (i.e. TBB, FBB, and Cond), the
+    /// block's classification, the common tail block of its successors (if
+    /// it's a diamond shape), its size, whether it's predicable, and whether
+    /// any instruction can clobber the 'would-be' predicate.
+ ///
+ /// IsDone - True if BB is not to be considered for ifcvt.
+ /// IsBeingAnalyzed - True if BB is currently being analyzed.
+ /// IsAnalyzed - True if BB has been analyzed (info is still valid).
+ /// IsEnqueued - True if BB has been enqueued to be ifcvt'ed.
+    /// IsBrAnalyzable - True if AnalyzeBranch() returns false, i.e. the
+    ///                  block's branches were successfully analyzed.
+ /// HasFallThrough - True if BB may fallthrough to the following BB.
+ /// IsUnpredicable - True if BB is known to be unpredicable.
+ /// ClobbersPred - True if BB could modify predicates (e.g. has
+ /// cmp, call, etc.)
+ /// NonPredSize - Number of non-predicated instructions.
+ /// BB - Corresponding MachineBasicBlock.
+ /// TrueBB / FalseBB- See AnalyzeBranch().
+ /// BrCond - Conditions for end of block conditional branches.
+ /// Predicate - Predicate used in the BB.
+ struct BBInfo {
+ bool IsDone : 1;
+ bool IsBeingAnalyzed : 1;
+ bool IsAnalyzed : 1;
+ bool IsEnqueued : 1;
+ bool IsBrAnalyzable : 1;
+ bool HasFallThrough : 1;
+ bool IsUnpredicable : 1;
+ bool CannotBeCopied : 1;
+ bool ClobbersPred : 1;
+ unsigned NonPredSize;
+ MachineBasicBlock *BB;
+ MachineBasicBlock *TrueBB;
+ MachineBasicBlock *FalseBB;
+ SmallVector<MachineOperand, 4> BrCond;
+ SmallVector<MachineOperand, 4> Predicate;
+ BBInfo() : IsDone(false), IsBeingAnalyzed(false),
+ IsAnalyzed(false), IsEnqueued(false), IsBrAnalyzable(false),
+ HasFallThrough(false), IsUnpredicable(false),
+ CannotBeCopied(false), ClobbersPred(false), NonPredSize(0),
+ BB(0), TrueBB(0), FalseBB(0) {}
+ };
+
+  /// IfcvtToken - Record information about pending if-conversions to attempt:
+ /// BBI - Corresponding BBInfo.
+ /// Kind - Type of block. See IfcvtKind.
+ /// NeedSubsumption - True if the to-be-predicated BB has already been
+ /// predicated.
+ /// NumDups - Number of instructions that would be duplicated due
+ /// to this if-conversion. (For diamonds, the number of
+ /// identical instructions at the beginnings of both
+ /// paths).
+ /// NumDups2 - For diamonds, the number of identical instructions
+ /// at the ends of both paths.
+ struct IfcvtToken {
+ BBInfo &BBI;
+ IfcvtKind Kind;
+ bool NeedSubsumption;
+ unsigned NumDups;
+ unsigned NumDups2;
+ IfcvtToken(BBInfo &b, IfcvtKind k, bool s, unsigned d, unsigned d2 = 0)
+ : BBI(b), Kind(k), NeedSubsumption(s), NumDups(d), NumDups2(d2) {}
+ };
+
+ /// Roots - Basic blocks that do not have successors. These are the starting
+    /// points of the graph traversal.
+ std::vector<MachineBasicBlock*> Roots;
+
+ /// BBAnalysis - Results of if-conversion feasibility analysis indexed by
+ /// basic block number.
+ std::vector<BBInfo> BBAnalysis;
+
+ const TargetLowering *TLI;
+ const TargetInstrInfo *TII;
+ bool MadeChange;
+ public:
+ static char ID;
+ IfConverter() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual const char *getPassName() const { return "If Converter"; }
+
+ private:
+ bool ReverseBranchCondition(BBInfo &BBI);
+ bool ValidSimple(BBInfo &TrueBBI, unsigned &Dups) const;
+ bool ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ bool FalseBranch, unsigned &Dups) const;
+ bool ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned &Dups1, unsigned &Dups2) const;
+ void ScanInstructions(BBInfo &BBI);
+ BBInfo &AnalyzeBlock(MachineBasicBlock *BB,
+ std::vector<IfcvtToken*> &Tokens);
+ bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl<MachineOperand> &Cond,
+ bool isTriangle = false, bool RevBranch = false);
+ bool AnalyzeBlocks(MachineFunction &MF,
+ std::vector<IfcvtToken*> &Tokens);
+ void InvalidatePreds(MachineBasicBlock *BB);
+ void RemoveExtraEdges(BBInfo &BBI);
+ bool IfConvertSimple(BBInfo &BBI, IfcvtKind Kind);
+ bool IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind);
+ bool IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
+ unsigned NumDups1, unsigned NumDups2);
+ void PredicateBlock(BBInfo &BBI,
+ MachineBasicBlock::iterator E,
+ SmallVectorImpl<MachineOperand> &Cond);
+ void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool IgnoreBr = false);
+ void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI);
+
+ bool MeetIfcvtSizeLimit(unsigned Size) const {
+ return Size > 0 && Size <= TLI->getIfCvtBlockSizeLimit();
+ }
+
+ // blockAlwaysFallThrough - Block ends without a terminator.
+ bool blockAlwaysFallThrough(BBInfo &BBI) const {
+ return BBI.IsBrAnalyzable && BBI.TrueBB == NULL;
+ }
+
+ // IfcvtTokenCmp - Used to sort if-conversion candidates.
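+    // Tokens are sorted with this comparator and then processed from the
+    // back of the vector, so on ties the later-sorting kinds (diamond over
+    // triangle over simple) are attempted first.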
+ static bool IfcvtTokenCmp(IfcvtToken *C1, IfcvtToken *C2) {
+ int Incr1 = (C1->Kind == ICDiamond)
+ ? -(int)(C1->NumDups + C1->NumDups2) : (int)C1->NumDups;
+ int Incr2 = (C2->Kind == ICDiamond)
+ ? -(int)(C2->NumDups + C2->NumDups2) : (int)C2->NumDups;
+ if (Incr1 > Incr2)
+ return true;
+ else if (Incr1 == Incr2) {
+ // Favors subsumption.
+        if (!C1->NeedSubsumption && C2->NeedSubsumption)
+ return true;
+ else if (C1->NeedSubsumption == C2->NeedSubsumption) {
+ // Favors diamond over triangle, etc.
+ if ((unsigned)C1->Kind < (unsigned)C2->Kind)
+ return true;
+ else if (C1->Kind == C2->Kind)
+ return C1->BBI.BB->getNumber() < C2->BBI.BB->getNumber();
+ }
+ }
+ return false;
+ }
+ };
+
+ char IfConverter::ID = 0;
+}
+
+static RegisterPass<IfConverter>
+X("if-converter", "If Converter");
+
+FunctionPass *llvm::createIfConverterPass() { return new IfConverter(); }
+
+bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
+ TLI = MF.getTarget().getTargetLowering();
+ TII = MF.getTarget().getInstrInfo();
+ if (!TII) return false;
+
+ static int FnNum = -1;
+ DOUT << "\nIfcvt: function (" << ++FnNum << ") \'"
+ << MF.getFunction()->getName() << "\'";
+
+ if (FnNum < IfCvtFnStart || (IfCvtFnStop != -1 && FnNum > IfCvtFnStop)) {
+ DOUT << " skipped\n";
+ return false;
+ }
+ DOUT << "\n";
+
+ MF.RenumberBlocks();
+ BBAnalysis.resize(MF.getNumBlockIDs());
+
+ // Look for root nodes, i.e. blocks without successors.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+ if (I->succ_empty())
+ Roots.push_back(I);
+
+ std::vector<IfcvtToken*> Tokens;
+ MadeChange = false;
+ unsigned NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle +
+ NumTriangleRev + NumTriangleFalse + NumTriangleFRev + NumDiamonds;
+ while (IfCvtLimit == -1 || (int)NumIfCvts < IfCvtLimit) {
+ // Do an initial analysis for each basic block and find all the potential
+ // candidates to perform if-conversion.
+ bool Change = AnalyzeBlocks(MF, Tokens);
+ while (!Tokens.empty()) {
+ IfcvtToken *Token = Tokens.back();
+ Tokens.pop_back();
+ BBInfo &BBI = Token->BBI;
+ IfcvtKind Kind = Token->Kind;
+ unsigned NumDups = Token->NumDups;
+ unsigned NumDups2 = Token->NumDups2;
+
+ delete Token;
+
+      // If the block has been evicted from the queue or it has already been
+      // marked dead (due to it being predicated), then skip it.
+ if (BBI.IsDone)
+ BBI.IsEnqueued = false;
+ if (!BBI.IsEnqueued)
+ continue;
+
+ BBI.IsEnqueued = false;
+
+ bool RetVal = false;
+ switch (Kind) {
+ default: assert(false && "Unexpected!");
+ break;
+ case ICSimple:
+ case ICSimpleFalse: {
+ bool isFalse = Kind == ICSimpleFalse;
+ if ((isFalse && DisableSimpleF) || (!isFalse && DisableSimple)) break;
+ DOUT << "Ifcvt (Simple" << (Kind == ICSimpleFalse ? " false" :"")
+ << "): BB#" << BBI.BB->getNumber() << " ("
+ << ((Kind == ICSimpleFalse)
+ ? BBI.FalseBB->getNumber()
+ : BBI.TrueBB->getNumber()) << ") ";
+ RetVal = IfConvertSimple(BBI, Kind);
+ DOUT << (RetVal ? "succeeded!" : "failed!") << "\n";
+ if (RetVal) {
+ if (isFalse) NumSimpleFalse++;
+ else NumSimple++;
+ }
+ break;
+ }
+ case ICTriangle:
+ case ICTriangleRev:
+ case ICTriangleFalse:
+ case ICTriangleFRev: {
+ bool isFalse = Kind == ICTriangleFalse;
+ bool isRev = (Kind == ICTriangleRev || Kind == ICTriangleFRev);
+ if (DisableTriangle && !isFalse && !isRev) break;
+ if (DisableTriangleR && !isFalse && isRev) break;
+ if (DisableTriangleF && isFalse && !isRev) break;
+ if (DisableTriangleFR && isFalse && isRev) break;
+ DOUT << "Ifcvt (Triangle";
+ if (isFalse)
+ DOUT << " false";
+ if (isRev)
+ DOUT << " rev";
+ DOUT << "): BB#" << BBI.BB->getNumber() << " (T:"
+ << BBI.TrueBB->getNumber() << ",F:"
+ << BBI.FalseBB->getNumber() << ") ";
+ RetVal = IfConvertTriangle(BBI, Kind);
+ DOUT << (RetVal ? "succeeded!" : "failed!") << "\n";
+ if (RetVal) {
+ if (isFalse) {
+ if (isRev) NumTriangleFRev++;
+ else NumTriangleFalse++;
+ } else {
+ if (isRev) NumTriangleRev++;
+ else NumTriangle++;
+ }
+ }
+ break;
+ }
+ case ICDiamond: {
+ if (DisableDiamond) break;
+ DOUT << "Ifcvt (Diamond): BB#" << BBI.BB->getNumber() << " (T:"
+ << BBI.TrueBB->getNumber() << ",F:"
+ << BBI.FalseBB->getNumber() << ") ";
+ RetVal = IfConvertDiamond(BBI, Kind, NumDups, NumDups2);
+ DOUT << (RetVal ? "succeeded!" : "failed!") << "\n";
+ if (RetVal) NumDiamonds++;
+ break;
+ }
+ }
+
+ Change |= RetVal;
+
+ NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle + NumTriangleRev +
+ NumTriangleFalse + NumTriangleFRev + NumDiamonds;
+ if (IfCvtLimit != -1 && (int)NumIfCvts >= IfCvtLimit)
+ break;
+ }
+
+ if (!Change)
+ break;
+ MadeChange |= Change;
+ }
+
+ // Delete tokens in case of early exit.
+ while (!Tokens.empty()) {
+ IfcvtToken *Token = Tokens.back();
+ Tokens.pop_back();
+ delete Token;
+ }
+
+ Tokens.clear();
+ Roots.clear();
+ BBAnalysis.clear();
+
+ return MadeChange;
+}
+
+/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
+/// its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *TrueBB) {
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ E = BB->succ_end(); SI != E; ++SI) {
+ MachineBasicBlock *SuccBB = *SI;
+ if (SuccBB != TrueBB)
+ return SuccBB;
+ }
+ return NULL;
+}
+
+/// ReverseBranchCondition - Reverse the condition of the end of the block
+/// branch. Swap block's 'true' and 'false' successors.
+bool IfConverter::ReverseBranchCondition(BBInfo &BBI) {
+ if (!TII->ReverseBranchCondition(BBI.BrCond)) {
+ TII->RemoveBranch(*BBI.BB);
+ TII->InsertBranch(*BBI.BB, BBI.FalseBB, BBI.TrueBB, BBI.BrCond);
+ std::swap(BBI.TrueBB, BBI.FalseBB);
+ return true;
+ }
+ return false;
+}
+
+/// getNextBlock - Returns the next block in the function blocks ordering. If
+/// it is the end, returns NULL.
+static inline MachineBasicBlock *getNextBlock(MachineBasicBlock *BB) {
+ MachineFunction::iterator I = BB;
+ MachineFunction::iterator E = BB->getParent()->end();
+ if (++I == E)
+ return NULL;
+ return I;
+}
+
+/// ValidSimple - Returns true if the 'true' block (along with its
+/// predecessor) forms a valid simple shape for ifcvt. It also returns, in
+/// 'Dups', the number of instructions that the ifcvt would need to
+/// duplicate if performed.
+bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups) const {
+ Dups = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
+ return false;
+
+ if (TrueBBI.IsBrAnalyzable)
+ return false;
+
+ if (TrueBBI.BB->pred_size() > 1) {
+ if (TrueBBI.CannotBeCopied ||
+ TrueBBI.NonPredSize > TLI->getIfCvtDupBlockSizeLimit())
+ return false;
+ Dups = TrueBBI.NonPredSize;
+ }
+
+ return true;
+}
+
+/// ValidTriangle - Returns true if the 'true' and 'false' blocks (along
+/// with their common predecessor) form a valid triangle shape for ifcvt.
+/// If 'FalseBranch' is true, it checks whether the 'true' block's false
+/// branch branches to the 'false' block rather than the other way around.
+/// It also returns, in 'Dups', the number of instructions that the ifcvt
+/// would need to duplicate if performed.
+bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ bool FalseBranch, unsigned &Dups) const {
+ Dups = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
+ return false;
+
+ if (TrueBBI.BB->pred_size() > 1) {
+ if (TrueBBI.CannotBeCopied)
+ return false;
+
+ unsigned Size = TrueBBI.NonPredSize;
+ if (TrueBBI.IsBrAnalyzable) {
+ if (TrueBBI.TrueBB && TrueBBI.BrCond.empty())
+ // Ends with an unconditional branch. It will be removed.
+ --Size;
+ else {
+ MachineBasicBlock *FExit = FalseBranch
+ ? TrueBBI.TrueBB : TrueBBI.FalseBB;
+ if (FExit)
+ // Require a conditional branch
+ ++Size;
+ }
+ }
+ if (Size > TLI->getIfCvtDupBlockSizeLimit())
+ return false;
+ Dups = Size;
+ }
+
+ MachineBasicBlock *TExit = FalseBranch ? TrueBBI.FalseBB : TrueBBI.TrueBB;
+ if (!TExit && blockAlwaysFallThrough(TrueBBI)) {
+ MachineFunction::iterator I = TrueBBI.BB;
+ if (++I == TrueBBI.BB->getParent()->end())
+ return false;
+ TExit = I;
+ }
+ return TExit && TExit == FalseBBI.BB;
+}
+
+static
+MachineBasicBlock::iterator firstNonBranchInst(MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ MachineBasicBlock::iterator I = BB->end();
+ while (I != BB->begin()) {
+ --I;
+ if (!I->getDesc().isBranch())
+ break;
+ }
+ return I;
+}
+
+/// ValidDiamond - Returns true if the 'true' and 'false' blocks (along
+/// with their common predecessor) form a valid diamond shape for ifcvt.
+bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned &Dups1, unsigned &Dups2) const {
+ Dups1 = Dups2 = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone ||
+ FalseBBI.IsBeingAnalyzed || FalseBBI.IsDone)
+ return false;
+
+ MachineBasicBlock *TT = TrueBBI.TrueBB;
+ MachineBasicBlock *FT = FalseBBI.TrueBB;
+
+ if (!TT && blockAlwaysFallThrough(TrueBBI))
+ TT = getNextBlock(TrueBBI.BB);
+ if (!FT && blockAlwaysFallThrough(FalseBBI))
+ FT = getNextBlock(FalseBBI.BB);
+ if (TT != FT)
+ return false;
+ if (TT == NULL && (TrueBBI.IsBrAnalyzable || FalseBBI.IsBrAnalyzable))
+ return false;
+ if (TrueBBI.BB->pred_size() > 1 || FalseBBI.BB->pred_size() > 1)
+ return false;
+
+ // FIXME: Allow true block to have an early exit?
+ if (TrueBBI.FalseBB || FalseBBI.FalseBB ||
+ (TrueBBI.ClobbersPred && FalseBBI.ClobbersPred))
+ return false;
+
+ MachineBasicBlock::iterator TI = TrueBBI.BB->begin();
+ MachineBasicBlock::iterator FI = FalseBBI.BB->begin();
+ while (TI != TrueBBI.BB->end() && FI != FalseBBI.BB->end()) {
+ if (!TI->isIdenticalTo(FI))
+ break;
+ ++Dups1;
+ ++TI;
+ ++FI;
+ }
+
+ TI = firstNonBranchInst(TrueBBI.BB, TII);
+ FI = firstNonBranchInst(FalseBBI.BB, TII);
+ while (TI != TrueBBI.BB->begin() && FI != FalseBBI.BB->begin()) {
+ if (!TI->isIdenticalTo(FI))
+ break;
+ ++Dups2;
+ --TI;
+ --FI;
+ }
+
+ return true;
+}
+
+/// ScanInstructions - Scan all the instructions in the block to determine if
+/// the block is predicable. In most cases, that means all the instructions
+/// in the block are isPredicable(). Also checks if the block contains any
+/// instruction which can clobber a predicate (e.g. condition code register).
+/// If so, the block is not predicable unless it's the last instruction.
+void IfConverter::ScanInstructions(BBInfo &BBI) {
+ if (BBI.IsDone)
+ return;
+
+ bool AlreadyPredicated = BBI.Predicate.size() > 0;
+ // First analyze the end of BB branches.
+ BBI.TrueBB = BBI.FalseBB = NULL;
+ BBI.BrCond.clear();
+ BBI.IsBrAnalyzable =
+ !TII->AnalyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond);
+ BBI.HasFallThrough = BBI.IsBrAnalyzable && BBI.FalseBB == NULL;
+
+ if (BBI.BrCond.size()) {
+ // No false branch. This BB must end with a conditional branch and a
+ // fallthrough.
+ if (!BBI.FalseBB)
+ BBI.FalseBB = findFalseBlock(BBI.BB, BBI.TrueBB);
+ assert(BBI.FalseBB && "Expected to find the fallthrough block!");
+ }
+
+ // Then scan all the instructions.
+ BBI.NonPredSize = 0;
+ BBI.ClobbersPred = false;
+ for (MachineBasicBlock::iterator I = BBI.BB->begin(), E = BBI.BB->end();
+ I != E; ++I) {
+ const TargetInstrDesc &TID = I->getDesc();
+ if (TID.isNotDuplicable())
+ BBI.CannotBeCopied = true;
+
+ bool isPredicated = TII->isPredicated(I);
+ bool isCondBr = BBI.IsBrAnalyzable && TID.isConditionalBranch();
+
+ if (!isCondBr) {
+ if (!isPredicated)
+ BBI.NonPredSize++;
+ else if (!AlreadyPredicated) {
+ // FIXME: This instruction is already predicated before the
+ // if-conversion pass. It's probably something like a conditional move.
+ // Mark this block unpredicable for now.
+ BBI.IsUnpredicable = true;
+ return;
+ }
+ }
+
+ if (BBI.ClobbersPred && !isPredicated) {
+      // A predicate-modifying instruction should end the block (except for
+      // already-predicated instructions and end-of-block branches).
+ if (isCondBr) {
+ // A conditional branch is not predicable, but it may be eliminated.
+ continue;
+ }
+
+      // The predicate may have been modified; the subsequent (currently)
+      // unpredicated instructions cannot be correctly predicated.
+ BBI.IsUnpredicable = true;
+ return;
+ }
+
+    // FIXME: Make use of PredDefs? e.g. ADDC, SUBC set predicates but are
+    // still potentially predicable.
+ std::vector<MachineOperand> PredDefs;
+ if (TII->DefinesPredicate(I, PredDefs))
+ BBI.ClobbersPred = true;
+
+ if (!TID.isPredicable()) {
+ BBI.IsUnpredicable = true;
+ return;
+ }
+ }
+}
+
+/// FeasibilityAnalysis - Determine if the block is a suitable candidate to be
+/// predicated by the specified predicate.
+bool IfConverter::FeasibilityAnalysis(BBInfo &BBI,
+ SmallVectorImpl<MachineOperand> &Pred,
+ bool isTriangle, bool RevBranch) {
+ // If the block is dead or unpredicable, then it cannot be predicated.
+ if (BBI.IsDone || BBI.IsUnpredicable)
+ return false;
+
+ // If it is already predicated, check if its predicate subsumes the new
+ // predicate.
+ if (BBI.Predicate.size() && !TII->SubsumesPredicate(BBI.Predicate, Pred))
+ return false;
+
+ if (BBI.BrCond.size()) {
+ if (!isTriangle)
+ return false;
+
+ // Test predicate subsumption.
+ SmallVector<MachineOperand, 4> RevPred(Pred.begin(), Pred.end());
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (RevBranch) {
+ if (TII->ReverseBranchCondition(Cond))
+ return false;
+ }
+ if (TII->ReverseBranchCondition(RevPred) ||
+ !TII->SubsumesPredicate(Cond, RevPred))
+ return false;
+ }
+
+ return true;
+}
+
+/// AnalyzeBlock - Analyze the structure of the sub-CFG starting from
+/// the specified block. Record its successors and whether it looks like an
+/// if-conversion candidate.
+IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
+ std::vector<IfcvtToken*> &Tokens) {
+ BBInfo &BBI = BBAnalysis[BB->getNumber()];
+
+ if (BBI.IsAnalyzed || BBI.IsBeingAnalyzed)
+ return BBI;
+
+ BBI.BB = BB;
+ BBI.IsBeingAnalyzed = true;
+
+ ScanInstructions(BBI);
+
+ // Unanalyzable or ends with fallthrough or unconditional branch.
+ if (!BBI.IsBrAnalyzable || BBI.BrCond.empty()) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ return BBI;
+ }
+
+  // Do not ifcvt if either path is a back edge to the entry block of this
+  // sub-CFG, i.e. to BB itself.
+ if (BBI.TrueBB == BB || BBI.FalseBB == BB) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ return BBI;
+ }
+
+ BBInfo &TrueBBI = AnalyzeBlock(BBI.TrueBB, Tokens);
+ BBInfo &FalseBBI = AnalyzeBlock(BBI.FalseBB, Tokens);
+
+ if (TrueBBI.IsDone && FalseBBI.IsDone) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ return BBI;
+ }
+
+ SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+ bool CanRevCond = !TII->ReverseBranchCondition(RevCond);
+
+ unsigned Dups = 0;
+ unsigned Dups2 = 0;
+ bool TNeedSub = TrueBBI.Predicate.size() > 0;
+ bool FNeedSub = FalseBBI.Predicate.size() > 0;
+ bool Enqueued = false;
+ if (CanRevCond && ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2) &&
+ MeetIfcvtSizeLimit(TrueBBI.NonPredSize - (Dups + Dups2)) &&
+ MeetIfcvtSizeLimit(FalseBBI.NonPredSize - (Dups + Dups2)) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond) &&
+ FeasibilityAnalysis(FalseBBI, RevCond)) {
+ // Diamond:
+ // EBB
+ // / \_
+ // | |
+ // TBB FBB
+ // \ /
+ // TailBB
+ // Note TailBB can be empty.
+ Tokens.push_back(new IfcvtToken(BBI, ICDiamond, TNeedSub|FNeedSub, Dups,
+ Dups2));
+ Enqueued = true;
+ }
+
+ if (ValidTriangle(TrueBBI, FalseBBI, false, Dups) &&
+ MeetIfcvtSizeLimit(TrueBBI.NonPredSize) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond, true)) {
+ // Triangle:
+ // EBB
+ // | \_
+ // | |
+ // | TBB
+ // | /
+ // FBB
+ Tokens.push_back(new IfcvtToken(BBI, ICTriangle, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidTriangle(TrueBBI, FalseBBI, true, Dups) &&
+ MeetIfcvtSizeLimit(TrueBBI.NonPredSize) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) {
+ Tokens.push_back(new IfcvtToken(BBI, ICTriangleRev, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidSimple(TrueBBI, Dups) &&
+ MeetIfcvtSizeLimit(TrueBBI.NonPredSize) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond)) {
+ // Simple (split, no rejoin):
+ // EBB
+ // | \_
+ // | |
+ // | TBB---> exit
+ // |
+ // FBB
+ Tokens.push_back(new IfcvtToken(BBI, ICSimple, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (CanRevCond) {
+ // Try the other path...
+ if (ValidTriangle(FalseBBI, TrueBBI, false, Dups) &&
+ MeetIfcvtSizeLimit(FalseBBI.NonPredSize) &&
+ FeasibilityAnalysis(FalseBBI, RevCond, true)) {
+ Tokens.push_back(new IfcvtToken(BBI, ICTriangleFalse, FNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidTriangle(FalseBBI, TrueBBI, true, Dups) &&
+ MeetIfcvtSizeLimit(FalseBBI.NonPredSize) &&
+ FeasibilityAnalysis(FalseBBI, RevCond, true, true)) {
+ Tokens.push_back(new IfcvtToken(BBI, ICTriangleFRev, FNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidSimple(FalseBBI, Dups) &&
+ MeetIfcvtSizeLimit(FalseBBI.NonPredSize) &&
+ FeasibilityAnalysis(FalseBBI, RevCond)) {
+ Tokens.push_back(new IfcvtToken(BBI, ICSimpleFalse, FNeedSub, Dups));
+ Enqueued = true;
+ }
+ }
+
+ BBI.IsEnqueued = Enqueued;
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ return BBI;
+}
+
+/// AnalyzeBlocks - Analyze all blocks and find entries for all if-conversion
+/// candidates. It returns true if any CFG restructuring is done to expose more
+/// if-conversion opportunities.
+bool IfConverter::AnalyzeBlocks(MachineFunction &MF,
+ std::vector<IfcvtToken*> &Tokens) {
+ bool Change = false;
+ std::set<MachineBasicBlock*> Visited;
+ for (unsigned i = 0, e = Roots.size(); i != e; ++i) {
+ for (idf_ext_iterator<MachineBasicBlock*> I=idf_ext_begin(Roots[i],Visited),
+ E = idf_ext_end(Roots[i], Visited); I != E; ++I) {
+ MachineBasicBlock *BB = *I;
+ AnalyzeBlock(BB, Tokens);
+ }
+ }
+
+  // Sort to favor more complex ifcvt schemes.
+ std::stable_sort(Tokens.begin(), Tokens.end(), IfcvtTokenCmp);
+
+ return Change;
+}
+
+/// canFallThroughTo - Returns true if ToBB is the next block after BB, or
+/// if all the intervening blocks are empty (given that BB can fall through
+/// to its next block).
+static bool canFallThroughTo(MachineBasicBlock *BB, MachineBasicBlock *ToBB) {
+ MachineFunction::iterator I = BB;
+ MachineFunction::iterator TI = ToBB;
+ MachineFunction::iterator E = BB->getParent()->end();
+ while (++I != TI)
+ if (I == E || !I->empty())
+ return false;
+ return true;
+}
+
+/// InvalidatePreds - Invalidate predecessor BB info so it will be re-analyzed
+/// to determine if it can be if-converted. If a predecessor is already
+/// enqueued, dequeue it!
+void IfConverter::InvalidatePreds(MachineBasicBlock *BB) {
+ for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+ E = BB->pred_end(); PI != E; ++PI) {
+ BBInfo &PBBI = BBAnalysis[(*PI)->getNumber()];
+ if (PBBI.IsDone || PBBI.BB == BB)
+ continue;
+ PBBI.IsAnalyzed = false;
+ PBBI.IsEnqueued = false;
+ }
+}
+
+/// InsertUncondBranch - Inserts an unconditional branch from BB to ToBB.
+///
+static void InsertUncondBranch(MachineBasicBlock *BB, MachineBasicBlock *ToBB,
+ const TargetInstrInfo *TII) {
+ SmallVector<MachineOperand, 0> NoCond;
+ TII->InsertBranch(*BB, ToBB, NULL, NoCond);
+}
+
+/// RemoveExtraEdges - Remove true / false edges if either / both are no longer
+/// successors.
+void IfConverter::RemoveExtraEdges(BBInfo &BBI) {
+ MachineBasicBlock *TBB = NULL, *FBB = NULL;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->AnalyzeBranch(*BBI.BB, TBB, FBB, Cond))
+ BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
+}
+
+/// IfConvertSimple - If convert a simple (split, no rejoin) sub-CFG.
+///
+bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ BBInfo *CvtBBI = &TrueBBI;
+ BBInfo *NextBBI = &FalseBBI;
+
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (Kind == ICSimpleFalse)
+ std::swap(CvtBBI, NextBBI);
+
+ if (CvtBBI->IsDone ||
+ (CvtBBI->CannotBeCopied && CvtBBI->BB->pred_size() > 1)) {
+ // Something has changed. It's no longer safe to predicate this block.
+ BBI.IsAnalyzed = false;
+ CvtBBI->IsAnalyzed = false;
+ return false;
+ }
+
+ if (Kind == ICSimpleFalse)
+ if (TII->ReverseBranchCondition(Cond))
+ assert(false && "Unable to reverse branch condition!");
+
+ if (CvtBBI->BB->pred_size() > 1) {
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ // Copy instructions in the true block, predicate them, and add them to
+ // the entry block.
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond);
+ } else {
+ PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
+
+ // Merge converted block into entry block.
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ MergeBlocks(BBI, *CvtBBI);
+ }
+
+ bool IterIfcvt = true;
+ if (!canFallThroughTo(BBI.BB, NextBBI->BB)) {
+ InsertUncondBranch(BBI.BB, NextBBI->BB, TII);
+ BBI.HasFallThrough = false;
+    // The ifcvt'd block will now look like this:
+ // BB:
+ // ...
+ // t, f = cmp
+ // if t op
+ // b BBf
+ //
+    // We cannot further ifcvt this block because the unconditional branch
+    // will have to be predicated on the new condition, which will not be
+    // available if cmp executes.
+ IterIfcvt = false;
+ }
+
+ RemoveExtraEdges(BBI);
+
+ // Update block info. BB can be iteratively if-converted.
+ if (!IterIfcvt)
+ BBI.IsDone = true;
+ InvalidatePreds(BBI.BB);
+ CvtBBI->IsDone = true;
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+/// IfConvertTriangle - If convert a triangle sub-CFG.
+///
+bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ BBInfo *CvtBBI = &TrueBBI;
+ BBInfo *NextBBI = &FalseBBI;
+
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
+ std::swap(CvtBBI, NextBBI);
+
+ if (CvtBBI->IsDone ||
+ (CvtBBI->CannotBeCopied && CvtBBI->BB->pred_size() > 1)) {
+ // Something has changed. It's no longer safe to predicate this block.
+ BBI.IsAnalyzed = false;
+ CvtBBI->IsAnalyzed = false;
+ return false;
+ }
+
+ if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
+ if (TII->ReverseBranchCondition(Cond))
+ assert(false && "Unable to reverse branch condition!");
+
+ if (Kind == ICTriangleRev || Kind == ICTriangleFRev) {
+ if (ReverseBranchCondition(*CvtBBI)) {
+ // BB has been changed, modify its predecessors (except for this
+ // one) so they don't get ifcvt'ed based on bad intel.
+ for (MachineBasicBlock::pred_iterator PI = CvtBBI->BB->pred_begin(),
+ E = CvtBBI->BB->pred_end(); PI != E; ++PI) {
+ MachineBasicBlock *PBB = *PI;
+ if (PBB == BBI.BB)
+ continue;
+ BBInfo &PBBI = BBAnalysis[PBB->getNumber()];
+ if (PBBI.IsEnqueued) {
+ PBBI.IsAnalyzed = false;
+ PBBI.IsEnqueued = false;
+ }
+ }
+ }
+ }
+
+ bool HasEarlyExit = CvtBBI->FalseBB != NULL;
+ bool DupBB = CvtBBI->BB->pred_size() > 1;
+ if (DupBB) {
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ // Copy instructions in the true block, predicate them, and add them to
+ // the entry block.
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
+ } else {
+ // Predicate the 'true' block after removing its branch.
+ CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB);
+ PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
+
+ // Now merge the entry of the triangle with the true block.
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ MergeBlocks(BBI, *CvtBBI);
+ }
+
+ // If 'true' block has a 'false' successor, add an exit branch to it.
+ if (HasEarlyExit) {
+ SmallVector<MachineOperand, 4> RevCond(CvtBBI->BrCond.begin(),
+ CvtBBI->BrCond.end());
+ if (TII->ReverseBranchCondition(RevCond))
+ assert(false && "Unable to reverse branch condition!");
+ TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, NULL, RevCond);
+ BBI.BB->addSuccessor(CvtBBI->FalseBB);
+ }
+
+ // Merge in the 'false' block if the 'false' block has no other
+ // predecessors. Otherwise, add an unconditional branch to 'false'.
+ bool FalseBBDead = false;
+ bool IterIfcvt = true;
+ bool isFallThrough = canFallThroughTo(BBI.BB, NextBBI->BB);
+ if (!isFallThrough) {
+ // Only merge them if the true block does not fallthrough to the false
+ // block. By not merging them, we make it possible to iteratively
+ // ifcvt the blocks.
+ if (!HasEarlyExit &&
+ NextBBI->BB->pred_size() == 1 && !NextBBI->HasFallThrough) {
+ MergeBlocks(BBI, *NextBBI);
+ FalseBBDead = true;
+ } else {
+ InsertUncondBranch(BBI.BB, NextBBI->BB, TII);
+ BBI.HasFallThrough = false;
+ }
+ // Mixed predicated and unpredicated code. This cannot be iteratively
+ // predicated.
+ IterIfcvt = false;
+ }
+
+ RemoveExtraEdges(BBI);
+
+ // Update block info. BB can be iteratively if-converted.
+ if (!IterIfcvt)
+ BBI.IsDone = true;
+ InvalidatePreds(BBI.BB);
+ CvtBBI->IsDone = true;
+ if (FalseBBDead)
+ NextBBI->IsDone = true;
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+/// IfConvertDiamond - If convert a diamond sub-CFG.
+///
+bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
+ unsigned NumDups1, unsigned NumDups2) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ MachineBasicBlock *TailBB = TrueBBI.TrueBB;
+ // True block must fall through or end with an unanalyzable terminator.
+ if (!TailBB) {
+ if (blockAlwaysFallThrough(TrueBBI))
+ TailBB = FalseBBI.TrueBB;
+ assert((TailBB || !TrueBBI.IsBrAnalyzable) && "Unexpected!");
+ }
+
+ if (TrueBBI.IsDone || FalseBBI.IsDone ||
+ TrueBBI.BB->pred_size() > 1 ||
+ FalseBBI.BB->pred_size() > 1) {
+ // Something has changed. It's no longer safe to predicate these blocks.
+ BBI.IsAnalyzed = false;
+ TrueBBI.IsAnalyzed = false;
+ FalseBBI.IsAnalyzed = false;
+ return false;
+ }
+
+  // Merge the 'true' and 'false' blocks by copying the instructions
+  // from the 'false' block to the 'true' block; unless the true block
+  // would clobber the predicate, in which case we do the opposite.
+ BBInfo *BBI1 = &TrueBBI;
+ BBInfo *BBI2 = &FalseBBI;
+ SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (TII->ReverseBranchCondition(RevCond))
+ assert(false && "Unable to reverse branch condition!");
+ SmallVector<MachineOperand, 4> *Cond1 = &BBI.BrCond;
+ SmallVector<MachineOperand, 4> *Cond2 = &RevCond;
+
+ // Figure out the more profitable ordering.
+ bool DoSwap = false;
+ if (TrueBBI.ClobbersPred && !FalseBBI.ClobbersPred)
+ DoSwap = true;
+ else if (TrueBBI.ClobbersPred == FalseBBI.ClobbersPred) {
+ if (TrueBBI.NonPredSize > FalseBBI.NonPredSize)
+ DoSwap = true;
+ }
+ if (DoSwap) {
+ std::swap(BBI1, BBI2);
+ std::swap(Cond1, Cond2);
+ }
+
+ // Remove the conditional branch from entry to the blocks.
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+
+ // Remove the duplicated instructions at the beginnings of both paths.
+ MachineBasicBlock::iterator DI1 = BBI1->BB->begin();
+ MachineBasicBlock::iterator DI2 = BBI2->BB->begin();
+ BBI1->NonPredSize -= NumDups1;
+ BBI2->NonPredSize -= NumDups1;
+ while (NumDups1 != 0) {
+ ++DI1;
+ ++DI2;
+ --NumDups1;
+ }
+ BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1);
+ BBI2->BB->erase(BBI2->BB->begin(), DI2);
+
+ // Predicate the 'true' block after removing its branch.
+ BBI1->NonPredSize -= TII->RemoveBranch(*BBI1->BB);
+ DI1 = BBI1->BB->end();
+ for (unsigned i = 0; i != NumDups2; ++i)
+ --DI1;
+ BBI1->BB->erase(DI1, BBI1->BB->end());
+ PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1);
+
+ // Predicate the 'false' block.
+ BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB);
+ DI2 = BBI2->BB->end();
+ while (NumDups2 != 0) {
+ --DI2;
+ --NumDups2;
+ }
+ PredicateBlock(*BBI2, DI2, *Cond2);
+
+ // Merge the true block into the entry of the diamond.
+ MergeBlocks(BBI, *BBI1);
+ MergeBlocks(BBI, *BBI2);
+
+  // If the if-converted block falls through or unconditionally branches into
+  // the tail block, and the tail block does not have other predecessors, then
+  // fold the tail block in as well. Otherwise, unless it falls through to the
+  // tail, add an unconditional branch to it.
+ if (TailBB) {
+    // Take a reference so the IsDone update below actually sticks.
+    BBInfo &TailBBI = BBAnalysis[TailBB->getNumber()];
+ if (TailBB->pred_size() == 1 && !TailBBI.HasFallThrough) {
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ MergeBlocks(BBI, TailBBI);
+ TailBBI.IsDone = true;
+ } else {
+ InsertUncondBranch(BBI.BB, TailBB, TII);
+ BBI.HasFallThrough = false;
+ }
+ }
+
+ RemoveExtraEdges(BBI);
+
+ // Update block info.
+ BBI.IsDone = TrueBBI.IsDone = FalseBBI.IsDone = true;
+ InvalidatePreds(BBI.BB);
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+/// PredicateBlock - Predicate instructions from the start of the block to the
+/// specified end with the specified condition.
+void IfConverter::PredicateBlock(BBInfo &BBI,
+ MachineBasicBlock::iterator E,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ for (MachineBasicBlock::iterator I = BBI.BB->begin(); I != E; ++I) {
+ if (TII->isPredicated(I))
+ continue;
+ if (!TII->PredicateInstruction(I, Cond)) {
+ cerr << "Unable to predicate " << *I << "!\n";
+ abort();
+ }
+ }
+
+ std::copy(Cond.begin(), Cond.end(), std::back_inserter(BBI.Predicate));
+
+ BBI.IsAnalyzed = false;
+ BBI.NonPredSize = 0;
+
+ NumIfConvBBs++;
+}
+
+/// CopyAndPredicateBlock - Copy and predicate instructions from source BB to
+/// the destination block. Skip end of block branches if IgnoreBr is true.
+void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool IgnoreBr) {
+ MachineFunction &MF = *ToBBI.BB->getParent();
+
+ for (MachineBasicBlock::iterator I = FromBBI.BB->begin(),
+ E = FromBBI.BB->end(); I != E; ++I) {
+ const TargetInstrDesc &TID = I->getDesc();
+ bool isPredicated = TII->isPredicated(I);
+ // Do not copy the end of the block branches.
+ if (IgnoreBr && !isPredicated && TID.isBranch())
+ break;
+
+ MachineInstr *MI = MF.CloneMachineInstr(I);
+ ToBBI.BB->insert(ToBBI.BB->end(), MI);
+ ToBBI.NonPredSize++;
+
+ if (!isPredicated)
+ if (!TII->PredicateInstruction(MI, Cond)) {
+ cerr << "Unable to predicate " << *MI << "!\n";
+ abort();
+ }
+ }
+
+ std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(),
+ FromBBI.BB->succ_end());
+ MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
+ MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL;
+
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+ MachineBasicBlock *Succ = Succs[i];
+ // Fallthrough edge can't be transferred.
+ if (Succ == FallThrough)
+ continue;
+ ToBBI.BB->addSuccessor(Succ);
+ }
+
+ std::copy(FromBBI.Predicate.begin(), FromBBI.Predicate.end(),
+ std::back_inserter(ToBBI.Predicate));
+ std::copy(Cond.begin(), Cond.end(), std::back_inserter(ToBBI.Predicate));
+
+ ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
+ ToBBI.IsAnalyzed = false;
+
+ NumDupBBs++;
+}
+
+/// MergeBlocks - Move all instructions from FromBB to the end of ToBB.
+///
+void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI) {
+ ToBBI.BB->splice(ToBBI.BB->end(),
+ FromBBI.BB, FromBBI.BB->begin(), FromBBI.BB->end());
+
+ // Redirect all branches to FromBB to ToBB.
+ std::vector<MachineBasicBlock *> Preds(FromBBI.BB->pred_begin(),
+ FromBBI.BB->pred_end());
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ MachineBasicBlock *Pred = Preds[i];
+ if (Pred == ToBBI.BB)
+ continue;
+ Pred->ReplaceUsesOfBlockWith(FromBBI.BB, ToBBI.BB);
+ }
+
+ std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(),
+ FromBBI.BB->succ_end());
+ MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
+ MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL;
+
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+ MachineBasicBlock *Succ = Succs[i];
+ // Fallthrough edge can't be transferred.
+ if (Succ == FallThrough)
+ continue;
+ FromBBI.BB->removeSuccessor(Succ);
+ ToBBI.BB->addSuccessor(Succ);
+ }
+
+ // Now FromBBI always falls through to the next block!
+ if (NBB && !FromBBI.BB->isSuccessor(NBB))
+ FromBBI.BB->addSuccessor(NBB);
+
+ std::copy(FromBBI.Predicate.begin(), FromBBI.Predicate.end(),
+ std::back_inserter(ToBBI.Predicate));
+ FromBBI.Predicate.clear();
+
+ ToBBI.NonPredSize += FromBBI.NonPredSize;
+ FromBBI.NonPredSize = 0;
+
+ ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
+ ToBBI.HasFallThrough = FromBBI.HasFallThrough;
+ ToBBI.IsAnalyzed = false;
+ FromBBI.IsAnalyzed = false;
+}
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
new file mode 100644
index 0000000..e6912b8
--- /dev/null
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -0,0 +1,892 @@
+//===-- IntrinsicLowering.cpp - Intrinsic Lowering default implementation -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IntrinsicLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+template <class ArgIt>
+static void EnsureFunctionExists(Module &M, const char *Name,
+ ArgIt ArgBegin, ArgIt ArgEnd,
+ const Type *RetTy) {
+ // Insert a correctly-typed definition now.
+ std::vector<const Type *> ParamTys;
+ for (ArgIt I = ArgBegin; I != ArgEnd; ++I)
+ ParamTys.push_back(I->getType());
+ M.getOrInsertFunction(Name, FunctionType::get(RetTy, ParamTys, false));
+}
+
+static void EnsureFPIntrinsicsExist(Module &M, Function *Fn,
+ const char *FName,
+ const char *DName, const char *LDName) {
+ // Insert definitions for all the floating point types.
+ switch((int)Fn->arg_begin()->getType()->getTypeID()) {
+ case Type::FloatTyID:
+ EnsureFunctionExists(M, FName, Fn->arg_begin(), Fn->arg_end(),
+ Type::FloatTy);
+ break;
+ case Type::DoubleTyID:
+ EnsureFunctionExists(M, DName, Fn->arg_begin(), Fn->arg_end(),
+ Type::DoubleTy);
+ break;
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ EnsureFunctionExists(M, LDName, Fn->arg_begin(), Fn->arg_end(),
+ Fn->arg_begin()->getType());
+ break;
+ }
+}
+
+/// ReplaceCallWith - This function is used when we want to lower an intrinsic
+/// call to a call of an external function. This handles hard cases, such as
+/// when there was already a prototype for the external function but that
+/// prototype doesn't match the arguments we expect to pass in.
+template <class ArgIt>
+static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI,
+ ArgIt ArgBegin, ArgIt ArgEnd,
+ const Type *RetTy, Constant *&FCache) {
+ if (!FCache) {
+ // If we haven't already looked up this function, check to see if the
+ // program already contains a function with this name.
+ Module *M = CI->getParent()->getParent()->getParent();
+ // Get or insert the definition now.
+ std::vector<const Type *> ParamTys;
+ for (ArgIt I = ArgBegin; I != ArgEnd; ++I)
+ ParamTys.push_back((*I)->getType());
+ FCache = M->getOrInsertFunction(NewFn,
+ FunctionType::get(RetTy, ParamTys, false));
+ }
+
+ IRBuilder<> Builder(CI->getParent(), CI);
+ SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+ CallInst *NewCI = Builder.CreateCall(FCache, Args.begin(), Args.end());
+ NewCI->setName(CI->getName());
+ if (!CI->use_empty())
+ CI->replaceAllUsesWith(NewCI);
+ return NewCI;
+}
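+// For instance, lowering llvm.setjmp to the libc function could look like
+// this (illustrative sketch; in this IR the call's operand 0 is the callee,
+// so the actual arguments begin at op_begin()+1):
+//   static Constant *SetjmpFCache = 0;
+//   ReplaceCallWith("setjmp", CI, CI->op_begin()+1, CI->op_end(),
+//                   Type::Int32Ty, SetjmpFCache);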
+
+void IntrinsicLowering::AddPrototypes(Module &M) {
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (I->isDeclaration() && !I->use_empty())
+ switch (I->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::setjmp:
+ EnsureFunctionExists(M, "setjmp", I->arg_begin(), I->arg_end(),
+ Type::Int32Ty);
+ break;
+ case Intrinsic::longjmp:
+ EnsureFunctionExists(M, "longjmp", I->arg_begin(), I->arg_end(),
+ Type::VoidTy);
+ break;
+ case Intrinsic::siglongjmp:
+ EnsureFunctionExists(M, "abort", I->arg_end(), I->arg_end(),
+ Type::VoidTy);
+ break;
+ case Intrinsic::memcpy:
+ M.getOrInsertFunction("memcpy", PointerType::getUnqual(Type::Int8Ty),
+ PointerType::getUnqual(Type::Int8Ty),
+ PointerType::getUnqual(Type::Int8Ty),
+ TD.getIntPtrType(), (Type *)0);
+ break;
+ case Intrinsic::memmove:
+ M.getOrInsertFunction("memmove", PointerType::getUnqual(Type::Int8Ty),
+ PointerType::getUnqual(Type::Int8Ty),
+ PointerType::getUnqual(Type::Int8Ty),
+ TD.getIntPtrType(), (Type *)0);
+ break;
+ case Intrinsic::memset:
+ M.getOrInsertFunction("memset", PointerType::getUnqual(Type::Int8Ty),
+ PointerType::getUnqual(Type::Int8Ty),
+ Type::Int32Ty,
+ TD.getIntPtrType(), (Type *)0);
+ break;
+ case Intrinsic::sqrt:
+ EnsureFPIntrinsicsExist(M, I, "sqrtf", "sqrt", "sqrtl");
+ break;
+ case Intrinsic::sin:
+ EnsureFPIntrinsicsExist(M, I, "sinf", "sin", "sinl");
+ break;
+ case Intrinsic::cos:
+ EnsureFPIntrinsicsExist(M, I, "cosf", "cos", "cosl");
+ break;
+ case Intrinsic::pow:
+ EnsureFPIntrinsicsExist(M, I, "powf", "pow", "powl");
+ break;
+ case Intrinsic::log:
+ EnsureFPIntrinsicsExist(M, I, "logf", "log", "logl");
+ break;
+ case Intrinsic::log2:
+ EnsureFPIntrinsicsExist(M, I, "log2f", "log2", "log2l");
+ break;
+ case Intrinsic::log10:
+ EnsureFPIntrinsicsExist(M, I, "log10f", "log10", "log10l");
+ break;
+ case Intrinsic::exp:
+ EnsureFPIntrinsicsExist(M, I, "expf", "exp", "expl");
+ break;
+ case Intrinsic::exp2:
+ EnsureFPIntrinsicsExist(M, I, "exp2f", "exp2", "exp2l");
+ break;
+ }
+}
+
+/// LowerBSWAP - Emit the code to lower bswap of V before the specified
+/// instruction IP.
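+///
+/// For the 16-bit case the expansion reduces to (V << 8) | (V >> 8) with a
+/// logical right shift, turning e.g. 0xABCD into 0xCDAB; the wider cases
+/// shift and mask each byte lane into place the same way.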
+static Value *LowerBSWAP(Value *V, Instruction *IP) {
+ assert(V->getType()->isInteger() && "Can't bswap a non-integer type!");
+
+ unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+
+ IRBuilder<> Builder(IP->getParent(), IP);
+
+ switch(BitSize) {
+ default: assert(0 && "Unhandled type size of value to byteswap!");
+ case 16: {
+ Value *Tmp1 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.2");
+ Value *Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.1");
+ V = Builder.CreateOr(Tmp1, Tmp2, "bswap.i16");
+ break;
+ }
+ case 32: {
+ Value *Tmp4 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 24),
+ "bswap.4");
+ Value *Tmp3 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.3");
+ Value *Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.2");
+ Value *Tmp1 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 24),
+ "bswap.1");
+ Tmp3 = Builder.CreateAnd(Tmp3, ConstantInt::get(Type::Int32Ty, 0xFF0000),
+ "bswap.and3");
+ Tmp2 = Builder.CreateAnd(Tmp2, ConstantInt::get(Type::Int32Ty, 0xFF00),
+ "bswap.and2");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp3, "bswap.or1");
+ Tmp2 = Builder.CreateOr(Tmp2, Tmp1, "bswap.or2");
+ V = Builder.CreateOr(Tmp4, Tmp2, "bswap.i32");
+ break;
+ }
+ case 64: {
+ Value *Tmp8 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 56),
+ "bswap.8");
+ Value *Tmp7 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 40),
+ "bswap.7");
+ Value *Tmp6 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 24),
+ "bswap.6");
+ Value *Tmp5 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.5");
+ Value* Tmp4 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.4");
+ Value* Tmp3 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 24),
+ "bswap.3");
+ Value* Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 40),
+ "bswap.2");
+ Value* Tmp1 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 56),
+ "bswap.1");
+ Tmp7 = Builder.CreateAnd(Tmp7,
+ ConstantInt::get(Type::Int64Ty,
+ 0xFF000000000000ULL),
+ "bswap.and7");
+ Tmp6 = Builder.CreateAnd(Tmp6,
+ ConstantInt::get(Type::Int64Ty,
+ 0xFF0000000000ULL),
+ "bswap.and6");
+ Tmp5 = Builder.CreateAnd(Tmp5,
+ ConstantInt::get(Type::Int64Ty, 0xFF00000000ULL),
+ "bswap.and5");
+ Tmp4 = Builder.CreateAnd(Tmp4,
+ ConstantInt::get(Type::Int64Ty, 0xFF000000ULL),
+ "bswap.and4");
+ Tmp3 = Builder.CreateAnd(Tmp3,
+ ConstantInt::get(Type::Int64Ty, 0xFF0000ULL),
+ "bswap.and3");
+ Tmp2 = Builder.CreateAnd(Tmp2,
+ ConstantInt::get(Type::Int64Ty, 0xFF00ULL),
+ "bswap.and2");
+ Tmp8 = Builder.CreateOr(Tmp8, Tmp7, "bswap.or1");
+ Tmp6 = Builder.CreateOr(Tmp6, Tmp5, "bswap.or2");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp3, "bswap.or3");
+ Tmp2 = Builder.CreateOr(Tmp2, Tmp1, "bswap.or4");
+ Tmp8 = Builder.CreateOr(Tmp8, Tmp6, "bswap.or5");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp2, "bswap.or6");
+ V = Builder.CreateOr(Tmp8, Tmp4, "bswap.i64");
+ break;
+ }
+ }
+ return V;
+}
+
+/// LowerCTPOP - Emit the code to lower ctpop of V before the specified
+/// instruction IP.
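+///
+/// The expansion uses the classic parallel bit count: each step masks and
+/// adds adjacent bit groups, starting with
+///   V = (V & 0x5555...) + ((V >> 1) & 0x5555...),
+/// and doubles the group width each iteration; integers wider than 64 bits
+/// are processed one 64-bit word at a time.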
+static Value *LowerCTPOP(Value *V, Instruction *IP) {
+ assert(V->getType()->isInteger() && "Can't ctpop a non-integer type!");
+
+ static const uint64_t MaskValues[6] = {
+ 0x5555555555555555ULL, 0x3333333333333333ULL,
+ 0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
+ 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
+ };
+
+ IRBuilder<> Builder(IP->getParent(), IP);
+
+ unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+ unsigned WordSize = (BitSize + 63) / 64;
+ Value *Count = ConstantInt::get(V->getType(), 0);
+
+ for (unsigned n = 0; n < WordSize; ++n) {
+ Value *PartValue = V;
+ for (unsigned i = 1, ct = 0; i < (BitSize>64 ? 64 : BitSize);
+ i <<= 1, ++ct) {
+ Value *MaskCst = ConstantInt::get(V->getType(), MaskValues[ct]);
+      Value *LHS = Builder.CreateAnd(PartValue, MaskCst, "ctpop.and1");
+ Value *VShift = Builder.CreateLShr(PartValue,
+ ConstantInt::get(V->getType(), i),
+ "ctpop.sh");
+      Value *RHS = Builder.CreateAnd(VShift, MaskCst, "ctpop.and2");
+ PartValue = Builder.CreateAdd(LHS, RHS, "ctpop.step");
+ }
+ Count = Builder.CreateAdd(PartValue, Count, "ctpop.part");
+ if (BitSize > 64) {
+ V = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 64),
+ "ctpop.part.sh");
+ BitSize -= 64;
+ }
+ }
+
+ return Count;
+}
+
+/// LowerCTLZ - Emit the code to lower ctlz of V before the specified
+/// instruction IP.
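+///
+/// The expansion first smears the most significant set bit into all lower
+/// positions (V |= V >> 1; V |= V >> 2; ...), so the leading zero count
+/// becomes the population count of the complemented value.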
+static Value *LowerCTLZ(Value *V, Instruction *IP) {
+
+ IRBuilder<> Builder(IP->getParent(), IP);
+
+ unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+ for (unsigned i = 1; i < BitSize; i <<= 1) {
+ Value *ShVal = ConstantInt::get(V->getType(), i);
+ ShVal = Builder.CreateLShr(V, ShVal, "ctlz.sh");
+ V = Builder.CreateOr(V, ShVal, "ctlz.step");
+ }
+
+ V = Builder.CreateNot(V);
+ return LowerCTPOP(V, IP);
+}
+
+/// Convert the llvm.part.select.iX.iY intrinsic. This intrinsic takes
+/// three integer arguments. The first argument is the Value from which the
+/// bits will be selected. It may be of any bit width. The second and third
+/// arguments specify a range of bits to select, with the second argument
+/// specifying the low bit and the third argument specifying the high bit. Both
+/// must be type i32. The result is the corresponding selected bits from the
+/// Value in the same width as the Value (first argument). If the low bit index
+/// is higher than the high bit index then the inverse selection is done and
+/// the bits are returned in inverse order.
+/// @brief Lowering of llvm.part.select intrinsic.
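+///
+/// For example, with Lo = 8 and Hi = 15 applied to the i32 value 0xABCD1234,
+/// the result is 0x12 (bits 15..8 shifted down to the low bits); with the
+/// indices swapped, the same bits come back in reverse order.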
+static Instruction *LowerPartSelect(CallInst *CI) {
+ IRBuilder<> Builder;
+
+ // Make sure we're dealing with a part select intrinsic here
+ Function *F = CI->getCalledFunction();
+ const FunctionType *FT = F->getFunctionType();
+ if (!F->isDeclaration() || !FT->getReturnType()->isInteger() ||
+ FT->getNumParams() != 3 || !FT->getParamType(0)->isInteger() ||
+ !FT->getParamType(1)->isInteger() || !FT->getParamType(2)->isInteger())
+ return CI;
+
+ // Get the intrinsic implementation function by converting all the . to _
+ // in the intrinsic's function name and then reconstructing the function
+ // declaration.
+ std::string Name(F->getName());
+ for (unsigned i = 4; i < Name.length(); ++i)
+ if (Name[i] == '.')
+ Name[i] = '_';
+ Module* M = F->getParent();
+ F = cast<Function>(M->getOrInsertFunction(Name, FT));
+ F->setLinkage(GlobalValue::WeakAnyLinkage);
+
+ // If we haven't defined the impl function yet, do so now
+ if (F->isDeclaration()) {
+
+ // Get the arguments to the function
+ Function::arg_iterator args = F->arg_begin();
+ Value* Val = args++; Val->setName("Val");
+ Value* Lo = args++; Lo->setName("Lo");
+ Value* Hi = args++; Hi->setName("High");
+
+ // We want to select a range of bits here such that [Hi, Lo] is shifted
+ // down to the low bits. However, it is quite possible that Hi is smaller
+    // than Lo, in which case the bits have to be reversed.
+
+ // Create the blocks we will need for the two cases (forward, reverse)
+ BasicBlock* CurBB = BasicBlock::Create("entry", F);
+ BasicBlock *RevSize = BasicBlock::Create("revsize", CurBB->getParent());
+ BasicBlock *FwdSize = BasicBlock::Create("fwdsize", CurBB->getParent());
+ BasicBlock *Compute = BasicBlock::Create("compute", CurBB->getParent());
+ BasicBlock *Reverse = BasicBlock::Create("reverse", CurBB->getParent());
+ BasicBlock *RsltBlk = BasicBlock::Create("result", CurBB->getParent());
+
+ Builder.SetInsertPoint(CurBB);
+
+ // Cast Hi and Lo to the size of Val so the widths are all the same
+ if (Hi->getType() != Val->getType())
+ Hi = Builder.CreateIntCast(Hi, Val->getType(), /* isSigned */ false,
+ "tmp");
+ if (Lo->getType() != Val->getType())
+ Lo = Builder.CreateIntCast(Lo, Val->getType(), /* isSigned */ false,
+ "tmp");
+
+ // Compute a few things that both cases will need, up front.
+ Constant* Zero = ConstantInt::get(Val->getType(), 0);
+ Constant* One = ConstantInt::get(Val->getType(), 1);
+ Constant* AllOnes = ConstantInt::getAllOnesValue(Val->getType());
+
+ // Compare the Hi and Lo bit positions. This is used to determine
+ // which case we have (forward or reverse)
+ Value *Cmp = Builder.CreateICmpULT(Hi, Lo, "less");
+ Builder.CreateCondBr(Cmp, RevSize, FwdSize);
+
+ // First, compute the number of bits in the forward case.
+ Builder.SetInsertPoint(FwdSize);
+ Value* FBitSize = Builder.CreateSub(Hi, Lo, "fbits");
+ Builder.CreateBr(Compute);
+
+ // Second, compute the number of bits in the reverse case.
+ Builder.SetInsertPoint(RevSize);
+ Value* RBitSize = Builder.CreateSub(Lo, Hi, "rbits");
+ Builder.CreateBr(Compute);
+
+ // Now, compute the bit range. Start by getting the bitsize and the shift
+ // amount (either Hi or Lo) from PHI nodes. Then we compute a mask for
+ // the number of bits we want in the range. We shift the bits down to the
+ // least significant bits, apply the mask to zero out unwanted high bits,
+ // and we have computed the "forward" result. It may still need to be
+ // reversed.
+ Builder.SetInsertPoint(Compute);
+
+ // Get the BitSize from one of the two subtractions
+ PHINode *BitSize = Builder.CreatePHI(Val->getType(), "bits");
+ BitSize->reserveOperandSpace(2);
+ BitSize->addIncoming(FBitSize, FwdSize);
+ BitSize->addIncoming(RBitSize, RevSize);
+
+ // Get the ShiftAmount as the smaller of Hi/Lo
+ PHINode *ShiftAmt = Builder.CreatePHI(Val->getType(), "shiftamt");
+ ShiftAmt->reserveOperandSpace(2);
+ ShiftAmt->addIncoming(Lo, FwdSize);
+ ShiftAmt->addIncoming(Hi, RevSize);
+
+ // Increment the bit size
+ Value *BitSizePlusOne = Builder.CreateAdd(BitSize, One, "bits");
+
+ // Create a Mask to zero out the high order bits.
+ Value* Mask = Builder.CreateShl(AllOnes, BitSizePlusOne, "mask");
+ Mask = Builder.CreateNot(Mask, "mask");
+
+ // Shift the bits down and apply the mask
+ Value* FRes = Builder.CreateLShr(Val, ShiftAmt, "fres");
+ FRes = Builder.CreateAnd(FRes, Mask, "fres");
+ Builder.CreateCondBr(Cmp, Reverse, RsltBlk);
+
+  // In the Reverse block we already have the masked forward result in FRes,
+  // but we must reverse it by shifting bits out of FRes from the right and
+  // into RRes from the left.
+ Builder.SetInsertPoint(Reverse);
+
+ // First set up our loop counters
+ PHINode *Count = Builder.CreatePHI(Val->getType(), "count");
+ Count->reserveOperandSpace(2);
+ Count->addIncoming(BitSizePlusOne, Compute);
+
+ // Next, get the value that we are shifting.
+ PHINode *BitsToShift = Builder.CreatePHI(Val->getType(), "val");
+ BitsToShift->reserveOperandSpace(2);
+ BitsToShift->addIncoming(FRes, Compute);
+
+ // Finally, get the result of the last computation
+ PHINode *RRes = Builder.CreatePHI(Val->getType(), "rres");
+ RRes->reserveOperandSpace(2);
+ RRes->addIncoming(Zero, Compute);
+
+ // Decrement the counter
+ Value *Decr = Builder.CreateSub(Count, One, "decr");
+ Count->addIncoming(Decr, Reverse);
+
+ // Compute the Bit that we want to move
+ Value *Bit = Builder.CreateAnd(BitsToShift, One, "bit");
+
+ // Compute the new value for next iteration.
+ Value *NewVal = Builder.CreateLShr(BitsToShift, One, "rshift");
+ BitsToShift->addIncoming(NewVal, Reverse);
+
+ // Shift the bit into the low bits of the result.
+ Value *NewRes = Builder.CreateShl(RRes, One, "lshift");
+ NewRes = Builder.CreateOr(NewRes, Bit, "addbit");
+ RRes->addIncoming(NewRes, Reverse);
+
+ // Terminate loop if we've moved all the bits.
+ Value *Cond = Builder.CreateICmpEQ(Decr, Zero, "cond");
+ Builder.CreateCondBr(Cond, RsltBlk, Reverse);
+
+  // Finally, in the result block, select one of the two results with a PHI
+  // node and return the result.
+ Builder.SetInsertPoint(RsltBlk);
+ PHINode *BitSelect = Builder.CreatePHI(Val->getType(), "part_select");
+ BitSelect->reserveOperandSpace(2);
+ BitSelect->addIncoming(FRes, Compute);
+ BitSelect->addIncoming(NewRes, Reverse);
+ Builder.CreateRet(BitSelect);
+ }
+
+ // Return a call to the implementation function
+ Builder.SetInsertPoint(CI->getParent(), CI);
+ CallInst *NewCI = Builder.CreateCall3(F, CI->getOperand(1),
+ CI->getOperand(2), CI->getOperand(3));
+ NewCI->setName(CI->getName());
+ return NewCI;
+}
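
For reference, a minimal scalar sketch of what the expanded part.select
computes (illustrative C++ only, not part of the imported sources; the fixed
32-bit width and all names are assumed, and the selected field is taken to be
narrower than the full width so the mask shift stays defined):

    #include <cstdint>

    // Extract bits Lo..Hi (inclusive) from Val; when Hi < Lo, extract
    // Hi..Lo instead and return the bits in reverse order, mirroring the
    // FwdSize/RevSize/Compute/Reverse blocks built above.
    static uint32_t part_select32(uint32_t Val, uint32_t Lo, uint32_t Hi) {
      bool Rev = Hi < Lo;
      uint32_t Shift = Rev ? Hi : Lo;                 // smaller of Hi/Lo
      uint32_t Bits  = (Rev ? Lo - Hi : Hi - Lo) + 1; // field width
      uint32_t Mask  = ~(~0u << Bits);                // Bits low ones
      uint32_t Fwd   = (Val >> Shift) & Mask;         // "forward" result
      if (!Rev)
        return Fwd;
      uint32_t RRes = 0;           // shift bits out of Fwd, into RRes
      for (uint32_t i = 0; i != Bits; ++i) {
        RRes = (RRes << 1) | (Fwd & 1);
        Fwd >>= 1;
      }
      return RRes;
    }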
+
+/// Convert the llvm.part.set.iX.iY.iZ intrinsic. This intrinsic takes four
+/// integer arguments (iAny %Value, iAny %Replacement, i32 %Low, i32 %High).
+/// The first two arguments can be any bit width. The result is the same width
+/// as %Value. The operation replaces bits between %Low and %High with the
+/// value in %Replacement. If %Replacement is not the same width, it is
+/// truncated or zero extended as appropriate to fit the bits being replaced.
+/// If %Low is greater than %High, the replacement bits are inserted in
+/// reverse order, as the expansion below implements.
+/// @brief Lowering of llvm.part.set intrinsic.
+static Instruction *LowerPartSet(CallInst *CI) {
+ IRBuilder<> Builder;
+
+  // Make sure we're dealing with a part set intrinsic here.
+ Function *F = CI->getCalledFunction();
+ const FunctionType *FT = F->getFunctionType();
+ if (!F->isDeclaration() || !FT->getReturnType()->isInteger() ||
+ FT->getNumParams() != 4 || !FT->getParamType(0)->isInteger() ||
+ !FT->getParamType(1)->isInteger() || !FT->getParamType(2)->isInteger() ||
+ !FT->getParamType(3)->isInteger())
+ return CI;
+
+ // Get the intrinsic implementation function by converting all the . to _
+ // in the intrinsic's function name and then reconstructing the function
+ // declaration.
+ std::string Name(F->getName());
+ for (unsigned i = 4; i < Name.length(); ++i)
+ if (Name[i] == '.')
+ Name[i] = '_';
+ Module* M = F->getParent();
+ F = cast<Function>(M->getOrInsertFunction(Name, FT));
+ F->setLinkage(GlobalValue::WeakAnyLinkage);
+
+ // If we haven't defined the impl function yet, do so now
+ if (F->isDeclaration()) {
+ // Get the arguments for the function.
+ Function::arg_iterator args = F->arg_begin();
+ Value* Val = args++; Val->setName("Val");
+ Value* Rep = args++; Rep->setName("Rep");
+ Value* Lo = args++; Lo->setName("Lo");
+ Value* Hi = args++; Hi->setName("Hi");
+
+ // Get some types we need
+ const IntegerType* ValTy = cast<IntegerType>(Val->getType());
+ const IntegerType* RepTy = cast<IntegerType>(Rep->getType());
+ uint32_t RepBits = RepTy->getBitWidth();
+
+ // Constant Definitions
+ ConstantInt* RepBitWidth = ConstantInt::get(Type::Int32Ty, RepBits);
+ ConstantInt* RepMask = ConstantInt::getAllOnesValue(RepTy);
+ ConstantInt* ValMask = ConstantInt::getAllOnesValue(ValTy);
+ ConstantInt* One = ConstantInt::get(Type::Int32Ty, 1);
+ ConstantInt* ValOne = ConstantInt::get(ValTy, 1);
+ ConstantInt* Zero = ConstantInt::get(Type::Int32Ty, 0);
+ ConstantInt* ValZero = ConstantInt::get(ValTy, 0);
+
+ // Basic blocks we fill in below.
+ BasicBlock* entry = BasicBlock::Create("entry", F, 0);
+ BasicBlock* large = BasicBlock::Create("large", F, 0);
+ BasicBlock* small = BasicBlock::Create("small", F, 0);
+ BasicBlock* reverse = BasicBlock::Create("reverse", F, 0);
+ BasicBlock* result = BasicBlock::Create("result", F, 0);
+
+ // BASIC BLOCK: entry
+ Builder.SetInsertPoint(entry);
+ // First, get the number of bits that we're placing as an i32
+ Value* is_forward = Builder.CreateICmpULT(Lo, Hi);
+ Value* Hi_pn = Builder.CreateSelect(is_forward, Hi, Lo);
+ Value* Lo_pn = Builder.CreateSelect(is_forward, Lo, Hi);
+ Value* NumBits = Builder.CreateSub(Hi_pn, Lo_pn);
+ NumBits = Builder.CreateAdd(NumBits, One);
+    // Now, convert Lo (the smaller bit position) to the bit width of ValTy.
+ Lo = Builder.CreateIntCast(Lo_pn, ValTy, /* isSigned */ false);
+ // Determine if the replacement bits are larger than the number of bits we
+ // are replacing and deal with it.
+ Value* is_large = Builder.CreateICmpULT(NumBits, RepBitWidth);
+ Builder.CreateCondBr(is_large, large, small);
+
+ // BASIC BLOCK: large
+ Builder.SetInsertPoint(large);
+ Value* MaskBits = Builder.CreateSub(RepBitWidth, NumBits);
+ MaskBits = Builder.CreateIntCast(MaskBits, RepMask->getType(),
+ /* isSigned */ false);
+ Value* Mask1 = Builder.CreateLShr(RepMask, MaskBits);
+ Value* Rep2 = Builder.CreateAnd(Mask1, Rep);
+ Builder.CreateBr(small);
+
+ // BASIC BLOCK: small
+ Builder.SetInsertPoint(small);
+ PHINode* Rep3 = Builder.CreatePHI(RepTy);
+ Rep3->reserveOperandSpace(2);
+ Rep3->addIncoming(Rep2, large);
+ Rep3->addIncoming(Rep, entry);
+ Value* Rep4 = Builder.CreateIntCast(Rep3, ValTy, /* isSigned */ false);
+ Builder.CreateCondBr(is_forward, result, reverse);
+
+ // BASIC BLOCK: reverse (reverses the bits of the replacement)
+ Builder.SetInsertPoint(reverse);
+ // Set up our loop counter as a PHI so we can decrement on each iteration.
+ // We will loop for the number of bits in the replacement value.
+ PHINode *Count = Builder.CreatePHI(Type::Int32Ty, "count");
+ Count->reserveOperandSpace(2);
+ Count->addIncoming(NumBits, small);
+
+ // Get the value that we are shifting bits out of as a PHI because
+ // we'll change this with each iteration.
+ PHINode *BitsToShift = Builder.CreatePHI(Val->getType(), "val");
+ BitsToShift->reserveOperandSpace(2);
+ BitsToShift->addIncoming(Rep4, small);
+
+ // Get the result of the last computation or zero on first iteration
+ PHINode *RRes = Builder.CreatePHI(Val->getType(), "rres");
+ RRes->reserveOperandSpace(2);
+ RRes->addIncoming(ValZero, small);
+
+ // Decrement the loop counter by one
+ Value *Decr = Builder.CreateSub(Count, One);
+ Count->addIncoming(Decr, reverse);
+
+ // Get the bit that we want to move into the result
+ Value *Bit = Builder.CreateAnd(BitsToShift, ValOne);
+
+ // Compute the new value of the bits to shift for the next iteration.
+ Value *NewVal = Builder.CreateLShr(BitsToShift, ValOne);
+ BitsToShift->addIncoming(NewVal, reverse);
+
+ // Shift the bit we extracted into the low bit of the result.
+ Value *NewRes = Builder.CreateShl(RRes, ValOne);
+ NewRes = Builder.CreateOr(NewRes, Bit);
+ RRes->addIncoming(NewRes, reverse);
+
+ // Terminate loop if we've moved all the bits.
+ Value *Cond = Builder.CreateICmpEQ(Decr, Zero);
+ Builder.CreateCondBr(Cond, result, reverse);
+
+ // BASIC BLOCK: result
+ Builder.SetInsertPoint(result);
+ PHINode *Rplcmnt = Builder.CreatePHI(Val->getType());
+ Rplcmnt->reserveOperandSpace(2);
+ Rplcmnt->addIncoming(NewRes, reverse);
+ Rplcmnt->addIncoming(Rep4, small);
+ Value* t0 = Builder.CreateIntCast(NumBits, ValTy, /* isSigned */ false);
+ Value* t1 = Builder.CreateShl(ValMask, Lo);
+ Value* t2 = Builder.CreateNot(t1);
+ Value* t3 = Builder.CreateShl(t1, t0);
+ Value* t4 = Builder.CreateOr(t2, t3);
+ Value* t5 = Builder.CreateAnd(t4, Val);
+ Value* t6 = Builder.CreateShl(Rplcmnt, Lo);
+ Value* Rslt = Builder.CreateOr(t5, t6, "part_set");
+ Builder.CreateRet(Rslt);
+ }
+
+ // Return a call to the implementation function
+ Builder.SetInsertPoint(CI->getParent(), CI);
+ CallInst *NewCI = Builder.CreateCall4(F, CI->getOperand(1),
+ CI->getOperand(2), CI->getOperand(3),
+ CI->getOperand(4));
+ NewCI->setName(CI->getName());
+ return NewCI;
+}
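
Likewise, a scalar sketch of the part.set expansion (illustrative C++ only,
not part of the imported sources; 32-bit widths and all names are assumed).
The 64-bit intermediate keeps the field mask defined even when the field spans
the whole word:

    #include <cstdint>

    // Replace the field at bit positions min(Lo,Hi)..max(Lo,Hi) of Val
    // with the low bits of Rep; when Lo > Hi the replacement bits are
    // inserted in reverse order, as in the reverse loop above.
    static uint32_t part_set32(uint32_t Val, uint32_t Rep,
                               uint32_t Lo, uint32_t Hi) {
      bool Forward = Lo < Hi;
      uint32_t Pos  = Forward ? Lo : Hi;               // min(Lo, Hi)
      uint32_t Bits = (Forward ? Hi - Lo : Lo - Hi) + 1;
      uint64_t Field = ((1ull << Bits) - 1) << Pos;    // bits replaced
      Rep &= (uint32_t)((1ull << Bits) - 1);           // trim Rep to Bits
      if (!Forward) {                                  // insert reversed
        uint32_t R = 0;
        for (uint32_t i = 0; i != Bits; ++i) {
          R = (R << 1) | (Rep & 1);
          Rep >>= 1;
        }
        Rep = R;
      }
      return (Val & (uint32_t)~Field) | (Rep << Pos);
    }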
+
+static void ReplaceFPIntrinsicWithCall(CallInst *CI, Constant *FCache,
+ Constant *DCache, Constant *LDCache,
+ const char *Fname, const char *Dname,
+ const char *LDname) {
+ switch (CI->getOperand(1)->getType()->getTypeID()) {
+ default: assert(0 && "Invalid type in intrinsic"); abort();
+ case Type::FloatTyID:
+ ReplaceCallWith(Fname, CI, CI->op_begin() + 1, CI->op_end(),
+ Type::FloatTy, FCache);
+ break;
+ case Type::DoubleTyID:
+ ReplaceCallWith(Dname, CI, CI->op_begin() + 1, CI->op_end(),
+ Type::DoubleTy, DCache);
+ break;
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ ReplaceCallWith(LDname, CI, CI->op_begin() + 1, CI->op_end(),
+ CI->getOperand(1)->getType(), LDCache);
+ break;
+ }
+}
+
+void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
+ IRBuilder<> Builder(CI->getParent(), CI);
+
+ Function *Callee = CI->getCalledFunction();
+ assert(Callee && "Cannot lower an indirect call!");
+
+ switch (Callee->getIntrinsicID()) {
+ case Intrinsic::not_intrinsic:
+ cerr << "Cannot lower a call to a non-intrinsic function '"
+ << Callee->getName() << "'!\n";
+ abort();
+ default:
+ cerr << "Error: Code generator does not support intrinsic function '"
+ << Callee->getName() << "'!\n";
+ abort();
+
+ // The setjmp/longjmp intrinsics should only exist in the code if it was
+  // never optimized (i.e., right out of the CFE), or if it has been hacked on
+ // by the lowerinvoke pass. In both cases, the right thing to do is to
+ // convert the call to an explicit setjmp or longjmp call.
+ case Intrinsic::setjmp: {
+ static Constant *SetjmpFCache = 0;
+ Value *V = ReplaceCallWith("setjmp", CI, CI->op_begin() + 1, CI->op_end(),
+ Type::Int32Ty, SetjmpFCache);
+ if (CI->getType() != Type::VoidTy)
+ CI->replaceAllUsesWith(V);
+ break;
+ }
+ case Intrinsic::sigsetjmp:
+ if (CI->getType() != Type::VoidTy)
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ break;
+
+ case Intrinsic::longjmp: {
+ static Constant *LongjmpFCache = 0;
+ ReplaceCallWith("longjmp", CI, CI->op_begin() + 1, CI->op_end(),
+ Type::VoidTy, LongjmpFCache);
+ break;
+ }
+
+ case Intrinsic::siglongjmp: {
+ // Insert the call to abort
+ static Constant *AbortFCache = 0;
+ ReplaceCallWith("abort", CI, CI->op_end(), CI->op_end(),
+ Type::VoidTy, AbortFCache);
+ break;
+ }
+ case Intrinsic::ctpop:
+ CI->replaceAllUsesWith(LowerCTPOP(CI->getOperand(1), CI));
+ break;
+
+ case Intrinsic::bswap:
+ CI->replaceAllUsesWith(LowerBSWAP(CI->getOperand(1), CI));
+ break;
+
+ case Intrinsic::ctlz:
+ CI->replaceAllUsesWith(LowerCTLZ(CI->getOperand(1), CI));
+ break;
+
+ case Intrinsic::cttz: {
+ // cttz(x) -> ctpop(~X & (X-1))
+ Value *Src = CI->getOperand(1);
+ Value *NotSrc = Builder.CreateNot(Src);
+ NotSrc->setName(Src->getName() + ".not");
+ Value *SrcM1 = ConstantInt::get(Src->getType(), 1);
+ SrcM1 = Builder.CreateSub(Src, SrcM1);
+ Src = LowerCTPOP(Builder.CreateAnd(NotSrc, SrcM1), CI);
+ CI->replaceAllUsesWith(Src);
+ break;
+ }
+
+ case Intrinsic::part_select:
+ CI->replaceAllUsesWith(LowerPartSelect(CI));
+ break;
+
+ case Intrinsic::part_set:
+ CI->replaceAllUsesWith(LowerPartSet(CI));
+ break;
+
+ case Intrinsic::stacksave:
+ case Intrinsic::stackrestore: {
+ static bool Warned = false;
+ if (!Warned)
+ cerr << "WARNING: this target does not support the llvm.stack"
+ << (Callee->getIntrinsicID() == Intrinsic::stacksave ?
+ "save" : "restore") << " intrinsic.\n";
+ Warned = true;
+ if (Callee->getIntrinsicID() == Intrinsic::stacksave)
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ break;
+ }
+
+ case Intrinsic::returnaddress:
+ case Intrinsic::frameaddress:
+ cerr << "WARNING: this target does not support the llvm."
+ << (Callee->getIntrinsicID() == Intrinsic::returnaddress ?
+ "return" : "frame") << "address intrinsic.\n";
+ CI->replaceAllUsesWith(ConstantPointerNull::get(
+ cast<PointerType>(CI->getType())));
+ break;
+
+ case Intrinsic::prefetch:
+ break; // Simply strip out prefetches on unsupported architectures
+
+ case Intrinsic::pcmarker:
+ break; // Simply strip out pcmarker on unsupported architectures
+ case Intrinsic::readcyclecounter: {
+ cerr << "WARNING: this target does not support the llvm.readcyclecoun"
+ << "ter intrinsic. It is being lowered to a constant 0\n";
+ CI->replaceAllUsesWith(ConstantInt::get(Type::Int64Ty, 0));
+ break;
+ }
+
+ case Intrinsic::dbg_stoppoint:
+ case Intrinsic::dbg_region_start:
+ case Intrinsic::dbg_region_end:
+ case Intrinsic::dbg_func_start:
+ case Intrinsic::dbg_declare:
+ break; // Simply strip out debugging intrinsics
+
+ case Intrinsic::eh_exception:
+ case Intrinsic::eh_selector_i32:
+ case Intrinsic::eh_selector_i64:
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ break;
+
+ case Intrinsic::eh_typeid_for_i32:
+ case Intrinsic::eh_typeid_for_i64:
+ // Return something different to eh_selector.
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
+ break;
+
+ case Intrinsic::var_annotation:
+ break; // Strip out annotate intrinsic
+
+ case Intrinsic::memcpy: {
+ static Constant *MemcpyFCache = 0;
+ const IntegerType *IntPtr = TD.getIntPtrType();
+ Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = CI->getOperand(1);
+ Ops[1] = CI->getOperand(2);
+ Ops[2] = Size;
+ ReplaceCallWith("memcpy", CI, Ops, Ops+3, CI->getOperand(1)->getType(),
+ MemcpyFCache);
+ break;
+ }
+ case Intrinsic::memmove: {
+ static Constant *MemmoveFCache = 0;
+ const IntegerType *IntPtr = TD.getIntPtrType();
+ Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = CI->getOperand(1);
+ Ops[1] = CI->getOperand(2);
+ Ops[2] = Size;
+ ReplaceCallWith("memmove", CI, Ops, Ops+3, CI->getOperand(1)->getType(),
+ MemmoveFCache);
+ break;
+ }
+ case Intrinsic::memset: {
+ static Constant *MemsetFCache = 0;
+ const IntegerType *IntPtr = TD.getIntPtrType();
+ Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = CI->getOperand(1);
+ // Extend the amount to i32.
+ Ops[1] = Builder.CreateIntCast(CI->getOperand(2), Type::Int32Ty,
+ /* isSigned */ false);
+ Ops[2] = Size;
+ ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getOperand(1)->getType(),
+ MemsetFCache);
+ break;
+ }
+ case Intrinsic::sqrt: {
+ static Constant *sqrtFCache = 0;
+ static Constant *sqrtDCache = 0;
+ static Constant *sqrtLDCache = 0;
+ ReplaceFPIntrinsicWithCall(CI, sqrtFCache, sqrtDCache, sqrtLDCache,
+ "sqrtf", "sqrt", "sqrtl");
+ break;
+ }
+ case Intrinsic::log: {
+ static Constant *logFCache = 0;
+ static Constant *logDCache = 0;
+ static Constant *logLDCache = 0;
+ ReplaceFPIntrinsicWithCall(CI, logFCache, logDCache, logLDCache,
+ "logf", "log", "logl");
+ break;
+ }
+ case Intrinsic::log2: {
+ static Constant *log2FCache = 0;
+ static Constant *log2DCache = 0;
+ static Constant *log2LDCache = 0;
+ ReplaceFPIntrinsicWithCall(CI, log2FCache, log2DCache, log2LDCache,
+ "log2f", "log2", "log2l");
+ break;
+ }
+ case Intrinsic::log10: {
+ static Constant *log10FCache = 0;
+ static Constant *log10DCache = 0;
+ static Constant *log10LDCache = 0;
+ ReplaceFPIntrinsicWithCall(CI, log10FCache, log10DCache, log10LDCache,
+ "log10f", "log10", "log10l");
+ break;
+ }
+ case Intrinsic::exp: {
+ static Constant *expFCache = 0;
+ static Constant *expDCache = 0;
+ static Constant *expLDCache = 0;
+ ReplaceFPIntrinsicWithCall(CI, expFCache, expDCache, expLDCache,
+ "expf", "exp", "expl");
+ break;
+ }
+ case Intrinsic::exp2: {
+ static Constant *exp2FCache = 0;
+ static Constant *exp2DCache = 0;
+ static Constant *exp2LDCache = 0;
+ ReplaceFPIntrinsicWithCall(CI, exp2FCache, exp2DCache, exp2LDCache,
+ "exp2f", "exp2", "exp2l");
+ break;
+ }
+ case Intrinsic::pow: {
+ static Constant *powFCache = 0;
+ static Constant *powDCache = 0;
+ static Constant *powLDCache = 0;
+ ReplaceFPIntrinsicWithCall(CI, powFCache, powDCache, powLDCache,
+ "powf", "pow", "powl");
+ break;
+ }
+ case Intrinsic::flt_rounds:
+ // Lower to "round to the nearest"
+ if (CI->getType() != Type::VoidTy)
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
+ break;
+ }
+
+ assert(CI->use_empty() &&
+ "Lowering should have eliminated any uses of the intrinsic call!");
+ CI->eraseFromParent();
+}
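
As a side note on the cttz lowering earlier in this file: the identity
cttz(x) = ctpop(~x & (x-1)) works because ~x & (x-1) is all ones exactly in
the trailing-zero positions of x (and all ones when x is 0). A small
illustrative C++ check, not part of the imported sources:

    #include <cstdint>

    // Kernighan-style popcount: clears the lowest set bit per iteration.
    static unsigned popcount32(uint32_t v) {
      unsigned n = 0;
      for (; v; v &= v - 1)
        ++n;
      return n;
    }

    // Trailing-zero count via the identity used by LowerIntrinsicCall;
    // yields 32 for x == 0, matching a full-width result.
    static unsigned cttz32(uint32_t x) {
      return popcount32(~x & (x - 1));
    }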
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
new file mode 100644
index 0000000..b3c60e6
--- /dev/null
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -0,0 +1,289 @@
+//===-- LLVMTargetMachine.cpp - Implement the LLVMTargetMachine class -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVMTargetMachine class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace llvm {
+ bool EnableFastISel;
+}
+
+static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
+ cl::desc("Print LLVM IR produced by the loop-reduce pass"));
+static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
+ cl::desc("Print LLVM IR input to isel pass"));
+static cl::opt<bool> PrintEmittedAsm("print-emitted-asm", cl::Hidden,
+ cl::desc("Dump emitter generated instructions as assembly"));
+static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
+ cl::desc("Dump garbage collector data"));
+static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
+ cl::desc("Verify generated machine code"),
+ cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=NULL));
+
+// When this works it will be on by default.
+static cl::opt<bool>
+DisablePostRAScheduler("disable-post-RA-scheduler",
+ cl::desc("Disable scheduling after register allocation"),
+ cl::init(true));
+
+// Enable or disable FastISel. Both options are needed, because
+// FastISel is enabled by default with -fast, and we wish to be
+// able to enable or disable fast-isel independently from -fast.
+static cl::opt<cl::boolOrDefault>
+EnableFastISelOption("fast-isel", cl::Hidden,
+ cl::desc("Enable the experimental \"fast\" instruction selector"));
+
+FileModel::Model
+LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
+ raw_ostream &Out,
+ CodeGenFileType FileType,
+ CodeGenOpt::Level OptLevel) {
+ // Add common CodeGen passes.
+ if (addCommonCodeGenPasses(PM, OptLevel))
+ return FileModel::Error;
+
+ // Fold redundant debug labels.
+ PM.add(createDebugLabelFoldingPass());
+
+ if (PrintMachineCode)
+ PM.add(createMachineFunctionPrinterPass(cerr));
+
+ if (addPreEmitPass(PM, OptLevel) && PrintMachineCode)
+ PM.add(createMachineFunctionPrinterPass(cerr));
+
+ if (OptLevel != CodeGenOpt::None)
+ PM.add(createCodePlacementOptPass());
+
+ switch (FileType) {
+ default:
+ break;
+ case TargetMachine::AssemblyFile:
+ if (addAssemblyEmitter(PM, OptLevel, getAsmVerbosityDefault(), Out))
+ return FileModel::Error;
+ return FileModel::AsmFile;
+ case TargetMachine::ObjectFile:
+ if (getMachOWriterInfo())
+ return FileModel::MachOFile;
+ else if (getELFWriterInfo())
+ return FileModel::ElfFile;
+ }
+
+ return FileModel::Error;
+}
+
+/// addPassesToEmitFileFinish - If the passes to emit the specified file had to
+/// be split up (e.g., to add an object writer pass), this method can be used to
+/// finish up adding passes to emit the file, if necessary.
+bool LLVMTargetMachine::addPassesToEmitFileFinish(PassManagerBase &PM,
+ MachineCodeEmitter *MCE,
+ CodeGenOpt::Level OptLevel) {
+ if (MCE)
+ addSimpleCodeEmitter(PM, OptLevel, PrintEmittedAsm, *MCE);
+
+ PM.add(createGCInfoDeleter());
+
+ // Delete machine code for this function
+ PM.add(createMachineCodeDeleter());
+
+ return false; // success!
+}
+
+/// addPassesToEmitFileFinish - If the passes to emit the specified file had to
+/// be split up (e.g., to add an object writer pass), this method can be used to
+/// finish up adding passes to emit the file, if necessary.
+bool LLVMTargetMachine::addPassesToEmitFileFinish(PassManagerBase &PM,
+ JITCodeEmitter *JCE,
+ CodeGenOpt::Level OptLevel) {
+ if (JCE)
+ addSimpleCodeEmitter(PM, OptLevel, PrintEmittedAsm, *JCE);
+
+ PM.add(createGCInfoDeleter());
+
+ // Delete machine code for this function
+ PM.add(createMachineCodeDeleter());
+
+ return false; // success!
+}
+
+/// addPassesToEmitMachineCode - Add passes to the specified pass manager to
+/// get machine code emitted. This uses a MachineCodeEmitter object to handle
+/// actually outputting the machine code and resolving things like the address
+/// of functions. This method should return true if machine code emission is
+/// not supported.
+///
+bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
+ MachineCodeEmitter &MCE,
+ CodeGenOpt::Level OptLevel) {
+ // Add common CodeGen passes.
+ if (addCommonCodeGenPasses(PM, OptLevel))
+ return true;
+
+ if (addPreEmitPass(PM, OptLevel) && PrintMachineCode)
+ PM.add(createMachineFunctionPrinterPass(cerr));
+
+ addCodeEmitter(PM, OptLevel, PrintEmittedAsm, MCE);
+
+ PM.add(createGCInfoDeleter());
+
+ // Delete machine code for this function
+ PM.add(createMachineCodeDeleter());
+
+ return false; // success!
+}
+
+/// addPassesToEmitMachineCode - Add passes to the specified pass manager to
+/// get machine code emitted. This uses a MachineCodeEmitter object to handle
+/// actually outputting the machine code and resolving things like the address
+/// of functions. This method should return true if machine code emission is
+/// not supported.
+///
+bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
+ JITCodeEmitter &JCE,
+ CodeGenOpt::Level OptLevel) {
+ // Add common CodeGen passes.
+ if (addCommonCodeGenPasses(PM, OptLevel))
+ return true;
+
+ if (addPreEmitPass(PM, OptLevel) && PrintMachineCode)
+ PM.add(createMachineFunctionPrinterPass(cerr));
+
+ addCodeEmitter(PM, OptLevel, PrintEmittedAsm, JCE);
+
+ PM.add(createGCInfoDeleter());
+
+ // Delete machine code for this function
+ PM.add(createMachineCodeDeleter());
+
+ return false; // success!
+}
+
+static void printAndVerify(PassManagerBase &PM,
+ bool allowDoubleDefs = false) {
+ if (PrintMachineCode)
+ PM.add(createMachineFunctionPrinterPass(cerr));
+
+ if (VerifyMachineCode)
+ PM.add(createMachineVerifierPass(allowDoubleDefs));
+}
+
+/// addCommonCodeGenPasses - Add standard LLVM codegen passes used for both
+/// emitting to assembly files or machine code output.
+///
+bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Standard LLVM-Level Passes.
+
+ // Run loop strength reduction before anything else.
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createLoopStrengthReducePass(getTargetLowering()));
+ if (PrintLSR)
+ PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &errs()));
+ }
+
+ // Turn exception handling constructs into something the code generators can
+ // handle.
+ if (!getTargetAsmInfo()->doesSupportExceptionHandling())
+ PM.add(createLowerInvokePass(getTargetLowering()));
+ else
+ PM.add(createDwarfEHPass(getTargetLowering(), OptLevel==CodeGenOpt::None));
+
+ PM.add(createGCLoweringPass());
+
+ // Make sure that no unreachable blocks are instruction selected.
+ PM.add(createUnreachableBlockEliminationPass());
+
+ if (OptLevel != CodeGenOpt::None)
+ PM.add(createCodeGenPreparePass(getTargetLowering()));
+
+ PM.add(createStackProtectorPass(getTargetLowering()));
+
+ if (PrintISelInput)
+ PM.add(createPrintFunctionPass("\n\n"
+ "*** Final LLVM Code input to ISel ***\n",
+ &errs()));
+
+ // Standard Lower-Level Passes.
+
+ // Enable FastISel with -fast, but allow that to be overridden.
+ if (EnableFastISelOption == cl::BOU_TRUE ||
+ (OptLevel == CodeGenOpt::None && EnableFastISelOption != cl::BOU_FALSE))
+ EnableFastISel = true;
+
+ // Ask the target for an isel.
+ if (addInstSelector(PM, OptLevel))
+ return true;
+
+ // Print the instruction selected machine code...
+ printAndVerify(PM, /* allowDoubleDefs= */ true);
+
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createMachineLICMPass());
+ PM.add(createMachineSinkingPass());
+ printAndVerify(PM, /* allowDoubleDefs= */ true);
+ }
+
+ // Run pre-ra passes.
+ if (addPreRegAlloc(PM, OptLevel))
+ printAndVerify(PM);
+
+ // Perform register allocation.
+ PM.add(createRegisterAllocator());
+
+ // Perform stack slot coloring.
+ if (OptLevel != CodeGenOpt::None)
+ PM.add(createStackSlotColoringPass(OptLevel >= CodeGenOpt::Aggressive));
+
+ printAndVerify(PM); // Print the register-allocated code
+
+ // Run post-ra passes.
+ if (addPostRegAlloc(PM, OptLevel))
+ printAndVerify(PM);
+
+ PM.add(createLowerSubregsPass());
+ printAndVerify(PM);
+
+ // Insert prolog/epilog code. Eliminate abstract frame index references...
+ PM.add(createPrologEpilogCodeInserter());
+ printAndVerify(PM);
+
+ // Second pass scheduler.
+ if (OptLevel != CodeGenOpt::None && !DisablePostRAScheduler) {
+ PM.add(createPostRAScheduler());
+ printAndVerify(PM);
+ }
+
+ // Branch folding must be run after regalloc and prolog/epilog insertion.
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createBranchFoldingPass(getEnableTailMergeDefault()));
+ printAndVerify(PM);
+ }
+
+ PM.add(createGCMachineCodeAnalysisPass());
+ printAndVerify(PM);
+
+ if (PrintGCInfo)
+ PM.add(createGCInfoPrinter(*cerr));
+
+ return false;
+}
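
One subtlety in addCommonCodeGenPasses above is the tri-state -fast-isel
override. A toy model of the decision (illustrative C++ only; the enum and
function names are assumptions, not part of the imported sources):

    // -fast-isel forces the selector on or off; when left unset, fast
    // instruction selection defaults on only at CodeGenOpt::None.
    enum BoolOrDefault { BOU_UNSET, BOU_TRUE, BOU_FALSE };

    static bool shouldEnableFastISel(BoolOrDefault Opt, bool OptLevelIsNone) {
      return Opt == BOU_TRUE || (OptLevelIsNone && Opt != BOU_FALSE);
    }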
diff --git a/lib/CodeGen/LatencyPriorityQueue.cpp b/lib/CodeGen/LatencyPriorityQueue.cpp
new file mode 100644
index 0000000..2e7b89c
--- /dev/null
+++ b/lib/CodeGen/LatencyPriorityQueue.cpp
@@ -0,0 +1,114 @@
+//===---- LatencyPriorityQueue.cpp - A latency-oriented priority queue ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LatencyPriorityQueue class, which is a
+// SchedulingPriorityQueue that schedules using latency information to
+// reduce the length of the critical path through the basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scheduler"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+ // The isScheduleHigh flag allows nodes with wraparound dependencies that
+ // cannot easily be modeled as edges with latencies to be scheduled as
+ // soon as possible in a top-down schedule.
+ if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+ return false;
+ if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+ return true;
+
+ unsigned LHSNum = LHS->NodeNum;
+ unsigned RHSNum = RHS->NodeNum;
+
+ // The most important heuristic is scheduling the critical path.
+ unsigned LHSLatency = PQ->getLatency(LHSNum);
+ unsigned RHSLatency = PQ->getLatency(RHSNum);
+ if (LHSLatency < RHSLatency) return true;
+ if (LHSLatency > RHSLatency) return false;
+
+ // After that, if two nodes have identical latencies, look to see if one will
+ // unblock more other nodes than the other.
+ unsigned LHSBlocked = PQ->getNumSolelyBlockNodes(LHSNum);
+ unsigned RHSBlocked = PQ->getNumSolelyBlockNodes(RHSNum);
+ if (LHSBlocked < RHSBlocked) return true;
+ if (LHSBlocked > RHSBlocked) return false;
+
+ // Finally, just to provide a stable ordering, use the node number as a
+ // deciding factor.
+ return LHSNum < RHSNum;
+}
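
Stripped of the SUnit machinery, the comparator above is a cascade of
tie-breakers; a toy model of the same ordering (illustrative C++ only, names
assumed, and the isScheduleHigh special case omitted):

    // Returning true marks LHS as lower priority, so the queue prefers
    // higher critical-path latency, then nodes that solely block more
    // successors, then falls back on the node number for a stable order.
    struct ToyNode { unsigned Num, Latency, SolelyBlocked; };

    static bool lowerPriorityThan(const ToyNode &L, const ToyNode &R) {
      if (L.Latency != R.Latency) return L.Latency < R.Latency;
      if (L.SolelyBlocked != R.SolelyBlocked)
        return L.SolelyBlocked < R.SolelyBlocked;
      return L.Num < R.Num;
    }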
+
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it; otherwise return null.
+SUnit *LatencyPriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
+ SUnit *OnlyAvailablePred = 0;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ SUnit &Pred = *I->getSUnit();
+ if (!Pred.isScheduled) {
+ // We found an available, but not scheduled, predecessor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+ return 0;
+ OnlyAvailablePred = &Pred;
+ }
+ }
+
+ return OnlyAvailablePred;
+}
+
+void LatencyPriorityQueue::push_impl(SUnit *SU) {
+ // Look at all of the successors of this node. Count the number of nodes that
+ // this node is the sole unscheduled node for.
+ unsigned NumNodesBlocking = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I)
+ if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+ ++NumNodesBlocking;
+ NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
+
+ Queue.push(SU);
+}
+
+
+// ScheduledNode - As nodes are scheduled, we look to see if there are any
+// successor nodes that have a single unscheduled predecessor. If so, that
+// single predecessor has a higher priority, since scheduling it will make
+// the node available.
+void LatencyPriorityQueue::ScheduledNode(SUnit *SU) {
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I)
+ AdjustPriorityOfUnscheduledPreds(I->getSUnit());
+}
+
+/// AdjustPriorityOfUnscheduledPreds - One of the predecessors of SU was just
+/// scheduled. If SU is not itself available, then there is at least one
+/// predecessor node that has not been scheduled yet. If SU has exactly ONE
+/// unscheduled predecessor, we want to increase its priority: it getting
+/// scheduled will make this node available, so it is better than some other
+/// node of the same priority that will not make a node available.
+void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) {
+ if (SU->isAvailable) return; // All preds scheduled.
+
+ SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU);
+ if (OnlyAvailablePred == 0 || !OnlyAvailablePred->isAvailable) return;
+
+ // Okay, we found a single predecessor that is available, but not scheduled.
+ // Since it is available, it must be in the priority queue. First remove it.
+ remove(OnlyAvailablePred);
+
+ // Reinsert the node into the priority queue, which recomputes its
+ // NumNodesSolelyBlocking value.
+ push(OnlyAvailablePred);
+}
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
new file mode 100644
index 0000000..67120b8
--- /dev/null
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -0,0 +1,853 @@
+//===-- LiveInterval.cpp - Live Interval Representation -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveRange and LiveInterval classes. Given some
+// numbering of the machine instructions, an interval [i, j) is said to be a
+// live interval for register v if there is no instruction with number j' > j
+// such that v is live at j' and there is no instruction with number i' < i such
+// that v is live at i'. In this implementation intervals can have holes,
+// i.e. an interval might look like [1,20), [50,65), [1000,1001). Each
+// individual range is represented as an instance of LiveRange, and the whole
+// interval is represented as an instance of LiveInterval.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <ostream>
+using namespace llvm;
+
+// An example for liveAt():
+//
+// this = [1,4), liveAt(0) will return false. The instruction defining this
+// spans slots [0,3]. The interval belongs to a spilled definition of the
+// variable it represents. This is because slot 1 is used (def slot) and spans
+// up to slot 3 (store slot).
+//
+bool LiveInterval::liveAt(unsigned I) const {
+ Ranges::const_iterator r = std::upper_bound(ranges.begin(), ranges.end(), I);
+
+ if (r == ranges.begin())
+ return false;
+
+ --r;
+ return r->contains(I);
+}
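
This upper_bound-then-step-back idiom recurs throughout the file
(liveBeforeAndAt, overlaps, FindLiveRangeContaining, and removeRange all use
it). In isolation, on a plain sorted vector (illustrative C++ only, names
assumed, not part of the imported sources):

    #include <algorithm>
    #include <iterator>
    #include <vector>

    struct ToyRange {
      unsigned start, end;                 // half-open [start, end)
      bool contains(unsigned I) const { return start <= I && I < end; }
    };

    // Ranges are sorted by start point; upper_bound finds the first range
    // starting after I, so only its predecessor can contain I.
    static bool toyLiveAt(const std::vector<ToyRange> &Ranges, unsigned I) {
      auto It = std::upper_bound(
          Ranges.begin(), Ranges.end(), I,
          [](unsigned V, const ToyRange &R) { return V < R.start; });
      return It != Ranges.begin() && std::prev(It)->contains(I);
    }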
+
+// liveBeforeAndAt - Check if the interval is live at the index and the index
+// just before it. If index is liveAt, check if it starts a new live range.
+// If it does, then check if the previous live range ends at index-1.
+bool LiveInterval::liveBeforeAndAt(unsigned I) const {
+ Ranges::const_iterator r = std::upper_bound(ranges.begin(), ranges.end(), I);
+
+ if (r == ranges.begin())
+ return false;
+
+ --r;
+ if (!r->contains(I))
+ return false;
+ if (I != r->start)
+ return true;
+ // I is the start of a live range. Check if the previous live range ends
+ // at I-1.
+ if (r == ranges.begin())
+ return false;
+ return r->end == I;
+}
+
+// overlaps - Return true if the intersection of the two live intervals is
+// not empty.
+//
+// An example for overlaps():
+//
+// 0: A = ...
+// 4: B = ...
+// 8: C = A + B ;; last use of A
+//
+// The live intervals should look like:
+//
+// A = [3, 11)
+// B = [7, x)
+// C = [11, y)
+//
+// A->overlaps(C) should return false since we want to be able to join
+// A and C.
+//
+bool LiveInterval::overlapsFrom(const LiveInterval& other,
+ const_iterator StartPos) const {
+ const_iterator i = begin();
+ const_iterator ie = end();
+ const_iterator j = StartPos;
+ const_iterator je = other.end();
+
+ assert((StartPos->start <= i->start || StartPos == other.begin()) &&
+ StartPos != other.end() && "Bogus start position hint!");
+
+ if (i->start < j->start) {
+ i = std::upper_bound(i, ie, j->start);
+ if (i != ranges.begin()) --i;
+ } else if (j->start < i->start) {
+ ++StartPos;
+ if (StartPos != other.end() && StartPos->start <= i->start) {
+ assert(StartPos < other.end() && i < end());
+ j = std::upper_bound(j, je, i->start);
+ if (j != other.ranges.begin()) --j;
+ }
+ } else {
+ return true;
+ }
+
+ if (j == je) return false;
+
+ while (i != ie) {
+ if (i->start > j->start) {
+ std::swap(i, j);
+ std::swap(ie, je);
+ }
+
+ if (i->end > j->start)
+ return true;
+ ++i;
+ }
+
+ return false;
+}
+
+/// overlaps - Return true if the live interval overlaps a range specified
+/// by [Start, End).
+bool LiveInterval::overlaps(unsigned Start, unsigned End) const {
+ assert(Start < End && "Invalid range");
+ const_iterator I = begin();
+ const_iterator E = end();
+ const_iterator si = std::upper_bound(I, E, Start);
+ const_iterator ei = std::upper_bound(I, E, End);
+ if (si != ei)
+ return true;
+ if (si == I)
+ return false;
+ --si;
+ return si->contains(Start);
+}
+
+/// extendIntervalEndTo - This method is used when we want to extend the range
+/// specified by I to end at the specified endpoint. To do this, we should
+/// merge and eliminate all ranges that this will overlap with. The iterator is
+/// not invalidated.
+void LiveInterval::extendIntervalEndTo(Ranges::iterator I, unsigned NewEnd) {
+ assert(I != ranges.end() && "Not a valid interval!");
+ VNInfo *ValNo = I->valno;
+ unsigned OldEnd = I->end;
+
+ // Search for the first interval that we can't merge with.
+ Ranges::iterator MergeTo = next(I);
+ for (; MergeTo != ranges.end() && NewEnd >= MergeTo->end; ++MergeTo) {
+ assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+ }
+
+ // If NewEnd was in the middle of an interval, make sure to get its endpoint.
+ I->end = std::max(NewEnd, prior(MergeTo)->end);
+
+ // Erase any dead ranges.
+ ranges.erase(next(I), MergeTo);
+
+ // Update kill info.
+ removeKills(ValNo, OldEnd, I->end-1);
+
+ // If the newly formed range now touches the range after it and if they have
+ // the same value number, merge the two ranges into one range.
+ Ranges::iterator Next = next(I);
+ if (Next != ranges.end() && Next->start <= I->end && Next->valno == ValNo) {
+ I->end = Next->end;
+ ranges.erase(Next);
+ }
+}
+
+
+/// extendIntervalStartTo - This method is used when we want to extend the range
+/// specified by I to start at the specified endpoint. To do this, we should
+/// merge and eliminate all ranges that this will overlap with.
+LiveInterval::Ranges::iterator
+LiveInterval::extendIntervalStartTo(Ranges::iterator I, unsigned NewStart) {
+ assert(I != ranges.end() && "Not a valid interval!");
+ VNInfo *ValNo = I->valno;
+
+ // Search for the first interval that we can't merge with.
+ Ranges::iterator MergeTo = I;
+ do {
+ if (MergeTo == ranges.begin()) {
+ I->start = NewStart;
+ ranges.erase(MergeTo, I);
+ return I;
+ }
+ assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+ --MergeTo;
+ } while (NewStart <= MergeTo->start);
+
+ // If we start in the middle of another interval, just delete a range and
+ // extend that interval.
+ if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
+ MergeTo->end = I->end;
+ } else {
+ // Otherwise, extend the interval right after.
+ ++MergeTo;
+ MergeTo->start = NewStart;
+ MergeTo->end = I->end;
+ }
+
+ ranges.erase(next(MergeTo), next(I));
+ return MergeTo;
+}
+
+LiveInterval::iterator
+LiveInterval::addRangeFrom(LiveRange LR, iterator From) {
+ unsigned Start = LR.start, End = LR.end;
+ iterator it = std::upper_bound(From, ranges.end(), Start);
+
+ // If the inserted interval starts in the middle or right at the end of
+ // another interval, just extend that interval to contain the range of LR.
+ if (it != ranges.begin()) {
+ iterator B = prior(it);
+ if (LR.valno == B->valno) {
+ if (B->start <= Start && B->end >= Start) {
+ extendIntervalEndTo(B, End);
+ return B;
+ }
+ } else {
+ // Check to make sure that we are not overlapping two live ranges with
+ // different valno's.
+ assert(B->end <= Start &&
+ "Cannot overlap two LiveRanges with differing ValID's"
+ " (did you def the same reg twice in a MachineInstr?)");
+ }
+ }
+
+ // Otherwise, if this range ends in the middle of, or right next to, another
+ // interval, merge it into that interval.
+ if (it != ranges.end()) {
+ if (LR.valno == it->valno) {
+ if (it->start <= End) {
+ it = extendIntervalStartTo(it, Start);
+
+ // If LR is a complete superset of an interval, we may need to grow its
+ // endpoint as well.
+ if (End > it->end)
+ extendIntervalEndTo(it, End);
+ else if (End < it->end)
+ // Overlapping intervals, there might have been a kill here.
+ removeKill(it->valno, End);
+ return it;
+ }
+ } else {
+ // Check to make sure that we are not overlapping two live ranges with
+ // different valno's.
+ assert(it->start >= End &&
+ "Cannot overlap two LiveRanges with differing ValID's");
+ }
+ }
+
+ // Otherwise, this is just a new range that doesn't interact with anything.
+ // Insert it.
+ return ranges.insert(it, LR);
+}
+
+/// isInOneLiveRange - Return true if the specified range lies entirely within
+/// a single LiveRange of the live interval.
+bool LiveInterval::isInOneLiveRange(unsigned Start, unsigned End) {
+ Ranges::iterator I = std::upper_bound(ranges.begin(), ranges.end(), Start);
+ if (I == ranges.begin())
+ return false;
+ --I;
+ return I->contains(Start) && I->contains(End-1);
+}
+
+
+/// removeRange - Remove the specified range from this interval. Note that
+/// the range must be in a single LiveRange in its entirety.
+void LiveInterval::removeRange(unsigned Start, unsigned End,
+ bool RemoveDeadValNo) {
+ // Find the LiveRange containing this span.
+ Ranges::iterator I = std::upper_bound(ranges.begin(), ranges.end(), Start);
+ assert(I != ranges.begin() && "Range is not in interval!");
+ --I;
+ assert(I->contains(Start) && I->contains(End-1) &&
+ "Range is not entirely in interval!");
+
+ // If the span we are removing is at the start of the LiveRange, adjust it.
+ VNInfo *ValNo = I->valno;
+ if (I->start == Start) {
+ if (I->end == End) {
+ removeKills(I->valno, Start, End);
+ if (RemoveDeadValNo) {
+ // Check if val# is dead.
+ bool isDead = true;
+ for (const_iterator II = begin(), EE = end(); II != EE; ++II)
+ if (II != I && II->valno == ValNo) {
+ isDead = false;
+ break;
+ }
+ if (isDead) {
+ // Now that ValNo is dead, remove it. If it is the largest value
+ // number, just nuke it (and any other deleted values neighboring it),
+ // otherwise mark it as ~1U so it can be nuked later.
+ if (ValNo->id == getNumValNums()-1) {
+ do {
+ VNInfo *VNI = valnos.back();
+ valnos.pop_back();
+ VNI->~VNInfo();
+ } while (!valnos.empty() && valnos.back()->def == ~1U);
+ } else {
+ ValNo->def = ~1U;
+ }
+ }
+ }
+
+ ranges.erase(I); // Removed the whole LiveRange.
+ } else
+ I->start = End;
+ return;
+ }
+
+ // Otherwise if the span we are removing is at the end of the LiveRange,
+ // adjust the other way.
+ if (I->end == End) {
+ removeKills(ValNo, Start, End);
+ I->end = Start;
+ return;
+ }
+
+ // Otherwise, we are splitting the LiveRange into two pieces.
+ unsigned OldEnd = I->end;
+ I->end = Start; // Trim the old interval.
+
+ // Insert the new one.
+ ranges.insert(next(I), LiveRange(End, OldEnd, ValNo));
+}
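
The branches above reduce to three shapes of overlap. On a bare half-open
span the same case split looks like this (illustrative C++ only; the
value-number bookkeeping is omitted and all names are assumed):

    #include <vector>

    struct ToySpan { unsigned start, end; };   // half-open [start, end)

    // Removing [A, B) from a span trims its front, trims its back, or
    // splits it in two; an empty result means [A, B) covered it entirely.
    static std::vector<ToySpan> removeFromSpan(ToySpan S,
                                               unsigned A, unsigned B) {
      std::vector<ToySpan> Out;
      if (S.start < A)
        Out.push_back({S.start, A});   // left piece survives
      if (B < S.end)
        Out.push_back({B, S.end});     // right piece survives
      return Out;
    }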
+
+/// removeValNo - Remove all the ranges defined by the specified value#.
+/// Also remove the value# from value# list.
+void LiveInterval::removeValNo(VNInfo *ValNo) {
+ if (empty()) return;
+ Ranges::iterator I = ranges.end();
+ Ranges::iterator E = ranges.begin();
+ do {
+ --I;
+ if (I->valno == ValNo)
+ ranges.erase(I);
+ } while (I != E);
+ // Now that ValNo is dead, remove it. If it is the largest value
+ // number, just nuke it (and any other deleted values neighboring it),
+ // otherwise mark it as ~1U so it can be nuked later.
+ if (ValNo->id == getNumValNums()-1) {
+ do {
+ VNInfo *VNI = valnos.back();
+ valnos.pop_back();
+ VNI->~VNInfo();
+ } while (!valnos.empty() && valnos.back()->def == ~1U);
+ } else {
+ ValNo->def = ~1U;
+ }
+}
+
+/// scaleNumbering - Renumber VNI and ranges to provide gaps for new
+/// instructions.
+void LiveInterval::scaleNumbering(unsigned factor) {
+ // Scale ranges.
+ for (iterator RI = begin(), RE = end(); RI != RE; ++RI) {
+ RI->start = InstrSlots::scale(RI->start, factor);
+ RI->end = InstrSlots::scale(RI->end, factor);
+ }
+
+ // Scale VNI info.
+ for (vni_iterator VNI = vni_begin(), VNIE = vni_end(); VNI != VNIE; ++VNI) {
+ VNInfo *vni = *VNI;
+ if (vni->def != ~0U && vni->def != ~1U) {
+ vni->def = InstrSlots::scale(vni->def, factor);
+ }
+
+ for (unsigned i = 0; i < vni->kills.size(); ++i) {
+ if (vni->kills[i] != 0)
+ vni->kills[i] = InstrSlots::scale(vni->kills[i], factor);
+ }
+ }
+}
+
+/// FindLiveRangeContaining - Return the live range that contains the
+/// specified index, or null if there is none.
+LiveInterval::const_iterator
+LiveInterval::FindLiveRangeContaining(unsigned Idx) const {
+ const_iterator It = std::upper_bound(begin(), end(), Idx);
+ if (It != ranges.begin()) {
+ --It;
+ if (It->contains(Idx))
+ return It;
+ }
+
+ return end();
+}
+
+LiveInterval::iterator
+LiveInterval::FindLiveRangeContaining(unsigned Idx) {
+ iterator It = std::upper_bound(begin(), end(), Idx);
+ if (It != begin()) {
+ --It;
+ if (It->contains(Idx))
+ return It;
+ }
+
+ return end();
+}
+
+/// findDefinedVNInfo - Find the VNInfo that's defined at the specified index
+/// (register interval) or defined by the specified register (stack interval).
+VNInfo *LiveInterval::findDefinedVNInfo(unsigned DefIdxOrReg) const {
+ VNInfo *VNI = NULL;
+ for (LiveInterval::const_vni_iterator i = vni_begin(), e = vni_end();
+ i != e; ++i)
+ if ((*i)->def == DefIdxOrReg) {
+ VNI = *i;
+ break;
+ }
+ return VNI;
+}
+
+
+/// join - Join two live intervals (this, and other) together. This applies
+/// mappings to the value numbers in the LHS/RHS intervals as specified. If
+/// the intervals are not joinable, this aborts.
+void LiveInterval::join(LiveInterval &Other, const int *LHSValNoAssignments,
+ const int *RHSValNoAssignments,
+ SmallVector<VNInfo*, 16> &NewVNInfo) {
+ // Determine if any of our live range values are mapped. This is uncommon, so
+ // we want to avoid the interval scan if not.
+ bool MustMapCurValNos = false;
+ unsigned NumVals = getNumValNums();
+ unsigned NumNewVals = NewVNInfo.size();
+ for (unsigned i = 0; i != NumVals; ++i) {
+ unsigned LHSValID = LHSValNoAssignments[i];
+ if (i != LHSValID ||
+ (NewVNInfo[LHSValID] && NewVNInfo[LHSValID] != getValNumInfo(i)))
+ MustMapCurValNos = true;
+ }
+
+ // If we have to apply a mapping to our base interval assignment, rewrite it
+ // now.
+ if (MustMapCurValNos) {
+ // Map the first live range.
+ iterator OutIt = begin();
+ OutIt->valno = NewVNInfo[LHSValNoAssignments[OutIt->valno->id]];
+ ++OutIt;
+ for (iterator I = OutIt, E = end(); I != E; ++I) {
+ OutIt->valno = NewVNInfo[LHSValNoAssignments[I->valno->id]];
+
+ // If this live range has the same value # as its immediate predecessor,
+ // and if they are neighbors, remove one LiveRange. This happens when we
+ // have [0,3:0)[4,7:1) and map 0/1 onto the same value #.
+ if (OutIt->valno == (OutIt-1)->valno && (OutIt-1)->end == OutIt->start) {
+ (OutIt-1)->end = OutIt->end;
+ } else {
+ if (I != OutIt) {
+ OutIt->start = I->start;
+ OutIt->end = I->end;
+ }
+
+ // Didn't merge, on to the next one.
+ ++OutIt;
+ }
+ }
+
+ // If we merge some live ranges, chop off the end.
+ ranges.erase(OutIt, end());
+ }
+
+  // Remember assignments because val# IDs are changing.
+ SmallVector<unsigned, 16> OtherAssignments;
+ for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
+ OtherAssignments.push_back(RHSValNoAssignments[I->valno->id]);
+
+ // Update val# info. Renumber them and make sure they all belong to this
+ // LiveInterval now. Also remove dead val#'s.
+ unsigned NumValNos = 0;
+ for (unsigned i = 0; i < NumNewVals; ++i) {
+ VNInfo *VNI = NewVNInfo[i];
+ if (VNI) {
+ if (NumValNos >= NumVals)
+ valnos.push_back(VNI);
+ else
+ valnos[NumValNos] = VNI;
+ VNI->id = NumValNos++; // Renumber val#.
+ }
+ }
+ if (NumNewVals < NumVals)
+ valnos.resize(NumNewVals); // shrinkify
+
+ // Okay, now insert the RHS live ranges into the LHS.
+ iterator InsertPos = begin();
+ unsigned RangeNo = 0;
+ for (iterator I = Other.begin(), E = Other.end(); I != E; ++I, ++RangeNo) {
+ // Map the valno in the other live range to the current live range.
+ I->valno = NewVNInfo[OtherAssignments[RangeNo]];
+ assert(I->valno && "Adding a dead range?");
+ InsertPos = addRangeFrom(*I, InsertPos);
+ }
+
+ weight += Other.weight;
+ if (Other.preference && !preference)
+ preference = Other.preference;
+}
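
The remap loop in join above also coalesces neighbors that end up with the
same value number; the core of that compaction on toy data (illustrative C++
only, names assumed, not part of the imported sources):

    #include <cstddef>
    #include <vector>

    struct ToyLR { unsigned start, end, valno; };

    // After remapping value numbers, merge each range into its
    // predecessor when they touch and now share a value number.
    static void coalesce(std::vector<ToyLR> &Ranges) {
      if (Ranges.empty())
        return;
      std::size_t Out = 0;
      for (std::size_t I = 1; I != Ranges.size(); ++I) {
        if (Ranges[Out].valno == Ranges[I].valno &&
            Ranges[Out].end == Ranges[I].start)
          Ranges[Out].end = Ranges[I].end;   // extend predecessor
        else
          Ranges[++Out] = Ranges[I];         // keep as a separate range
      }
      Ranges.resize(Out + 1);
    }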
+
+/// MergeRangesInAsValue - Merge all of the intervals in RHS into this live
+/// interval as the specified value number. The LiveRanges in RHS are
+/// allowed to overlap with LiveRanges in the current interval, but only if
+/// the overlapping LiveRanges have the specified value number.
+void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS,
+ VNInfo *LHSValNo) {
+ // TODO: Make this more efficient.
+ iterator InsertPos = begin();
+ for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
+ // Map the valno in the other live range to the current live range.
+ LiveRange Tmp = *I;
+ Tmp.valno = LHSValNo;
+ InsertPos = addRangeFrom(Tmp, InsertPos);
+ }
+}
+
+
+/// MergeValueInAsValue - Merge all of the live ranges of a specific val#
+/// in RHS into this live interval as the specified value number.
+/// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
+/// current interval; the value numbers of any overlapped live ranges are
+/// replaced with the specified value number.
+void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
+ const VNInfo *RHSValNo, VNInfo *LHSValNo) {
+ SmallVector<VNInfo*, 4> ReplacedValNos;
+ iterator IP = begin();
+ for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
+ if (I->valno != RHSValNo)
+ continue;
+ unsigned Start = I->start, End = I->end;
+ IP = std::upper_bound(IP, end(), Start);
+ // If the start of this range overlaps with an existing liverange, trim it.
+ if (IP != begin() && IP[-1].end > Start) {
+ if (IP[-1].valno != LHSValNo) {
+ ReplacedValNos.push_back(IP[-1].valno);
+ IP[-1].valno = LHSValNo; // Update val#.
+ }
+ Start = IP[-1].end;
+ // Trimmed away the whole range?
+ if (Start >= End) continue;
+ }
+ // If the end of this range overlaps with an existing liverange, trim it.
+ if (IP != end() && End > IP->start) {
+ if (IP->valno != LHSValNo) {
+ ReplacedValNos.push_back(IP->valno);
+ IP->valno = LHSValNo; // Update val#.
+ }
+ End = IP->start;
+ // If this trimmed away the whole range, ignore it.
+ if (Start == End) continue;
+ }
+
+ // Map the valno in the other live range to the current live range.
+ IP = addRangeFrom(LiveRange(Start, End, LHSValNo), IP);
+ }
+
+
+ SmallSet<VNInfo*, 4> Seen;
+ for (unsigned i = 0, e = ReplacedValNos.size(); i != e; ++i) {
+ VNInfo *V1 = ReplacedValNos[i];
+ if (Seen.insert(V1)) {
+ bool isDead = true;
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ if (I->valno == V1) {
+ isDead = false;
+ break;
+ }
+ if (isDead) {
+ // Now that V1 is dead, remove it. If it is the largest value number,
+ // just nuke it (and any other deleted values neighboring it), otherwise
+ // mark it as ~1U so it can be nuked later.
+ if (V1->id == getNumValNums()-1) {
+ do {
+ VNInfo *VNI = valnos.back();
+ valnos.pop_back();
+ VNI->~VNInfo();
+ } while (!valnos.empty() && valnos.back()->def == ~1U);
+ } else {
+ V1->def = ~1U;
+ }
+ }
+ }
+ }
+}
+
+
+/// MergeInClobberRanges - For any live ranges that are not defined in the
+/// current interval, but are defined in the Clobbers interval, mark them
+/// used with an unknown definition value.
+void LiveInterval::MergeInClobberRanges(const LiveInterval &Clobbers,
+ BumpPtrAllocator &VNInfoAllocator) {
+ if (Clobbers.empty()) return;
+
+ DenseMap<VNInfo*, VNInfo*> ValNoMaps;
+ VNInfo *UnusedValNo = 0;
+ iterator IP = begin();
+ for (const_iterator I = Clobbers.begin(), E = Clobbers.end(); I != E; ++I) {
+ // For every val# in the Clobbers interval, create a new "unknown" val#.
+ VNInfo *ClobberValNo = 0;
+ DenseMap<VNInfo*, VNInfo*>::iterator VI = ValNoMaps.find(I->valno);
+ if (VI != ValNoMaps.end())
+ ClobberValNo = VI->second;
+ else if (UnusedValNo)
+ ClobberValNo = UnusedValNo;
+ else {
+ UnusedValNo = ClobberValNo = getNextValue(~0U, 0, VNInfoAllocator);
+ ValNoMaps.insert(std::make_pair(I->valno, ClobberValNo));
+ }
+
+ bool Done = false;
+ unsigned Start = I->start, End = I->end;
+ // If a clobber range starts before an existing range and ends after
+ // it, the clobber range will need to be split into multiple ranges.
+ // Loop until the entire clobber range is handled.
+ while (!Done) {
+ Done = true;
+ IP = std::upper_bound(IP, end(), Start);
+ unsigned SubRangeStart = Start;
+ unsigned SubRangeEnd = End;
+
+ // If the start of this range overlaps with an existing liverange, trim it.
+ if (IP != begin() && IP[-1].end > SubRangeStart) {
+ SubRangeStart = IP[-1].end;
+ // Trimmed away the whole range?
+ if (SubRangeStart >= SubRangeEnd) continue;
+ }
+ // If the end of this range overlaps with an existing liverange, trim it.
+ if (IP != end() && SubRangeEnd > IP->start) {
+ // If the clobber live range extends beyond the existing live range,
+ // it'll need at least another live range, so set the flag to keep
+ // iterating.
+ if (SubRangeEnd > IP->end) {
+ Start = IP->end;
+ Done = false;
+ }
+ SubRangeEnd = IP->start;
+ // If this trimmed away the whole range, ignore it.
+ if (SubRangeStart == SubRangeEnd) continue;
+ }
+
+ // Insert the clobber interval.
+ IP = addRangeFrom(LiveRange(SubRangeStart, SubRangeEnd, ClobberValNo),
+ IP);
+ UnusedValNo = 0;
+ }
+ }
+
+ if (UnusedValNo) {
+ // Delete the last unused val#.
+ valnos.pop_back();
+ UnusedValNo->~VNInfo();
+ }
+}
+
+void LiveInterval::MergeInClobberRange(unsigned Start, unsigned End,
+ BumpPtrAllocator &VNInfoAllocator) {
+ // Find a value # to use for the clobber ranges. If there is already a value#
+ // for unknown values, use it.
+ VNInfo *ClobberValNo = getNextValue(~0U, 0, VNInfoAllocator);
+
+ iterator IP = begin();
+ IP = std::upper_bound(IP, end(), Start);
+
+ // If the start of this range overlaps with an existing liverange, trim it.
+ if (IP != begin() && IP[-1].end > Start) {
+ Start = IP[-1].end;
+ // Trimmed away the whole range?
+ if (Start >= End) return;
+ }
+ // If the end of this range overlaps with an existing liverange, trim it.
+ if (IP != end() && End > IP->start) {
+ End = IP->start;
+ // If this trimmed away the whole range, ignore it.
+ if (Start == End) return;
+ }
+
+ // Insert the clobber interval.
+ addRangeFrom(LiveRange(Start, End, ClobberValNo), IP);
+}
+
+/// MergeValueNumberInto - This method is called when two value numbers
+/// are found to be equivalent. This eliminates V1, replacing all
+/// LiveRanges with the V1 value number with the V2 value number. This can
+/// cause merging of V1/V2 value numbers and compaction of the value space.
+VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
+ assert(V1 != V2 && "Identical value#'s are always equivalent!");
+
+ // This code actually merges the (numerically) larger value number into the
+ // smaller value number, which is likely to allow us to compactify the value
+ // space. The only thing we have to be careful of is to preserve the
+ // instruction that defines the result value.
+
+ // Make sure V2 is smaller than V1.
+ if (V1->id < V2->id) {
+ copyValNumInfo(V1, V2);
+ std::swap(V1, V2);
+ }
+
+ // Merge V1 live ranges into V2.
+ for (iterator I = begin(); I != end(); ) {
+ iterator LR = I++;
+ if (LR->valno != V1) continue; // Not a V1 LiveRange.
+
+ // Okay, we found a V1 live range. If it had a previous, touching, V2 live
+ // range, extend it.
+ if (LR != begin()) {
+ iterator Prev = LR-1;
+ if (Prev->valno == V2 && Prev->end == LR->start) {
+ Prev->end = LR->end;
+
+ // Erase this live-range.
+ ranges.erase(LR);
+ I = Prev+1;
+ LR = Prev;
+ }
+ }
+
+ // Okay, now we have a V1 or V2 live range that is maximally merged forward.
+ // Ensure that it is a V2 live-range.
+ LR->valno = V2;
+
+ // If we can merge it into later V2 live ranges, do so now. We ignore any
+ // following V1 live ranges, as they will be merged in subsequent iterations
+ // of the loop.
+ if (I != end()) {
+ if (I->start == LR->end && I->valno == V2) {
+ LR->end = I->end;
+ ranges.erase(I);
+ I = LR+1;
+ }
+ }
+ }
+
+ // Now that V1 is dead, remove it. If it is the largest value number, just
+ // nuke it (and any other deleted values neighboring it), otherwise mark it as
+ // ~1U so it can be nuked later.
+ if (V1->id == getNumValNums()-1) {
+ do {
+ VNInfo *VNI = valnos.back();
+ valnos.pop_back();
+ VNI->~VNInfo();
+    } while (!valnos.empty() && valnos.back()->def == ~1U);
+ } else {
+ V1->def = ~1U;
+ }
+
+ return V2;
+}
+
+void LiveInterval::Copy(const LiveInterval &RHS,
+ BumpPtrAllocator &VNInfoAllocator) {
+ ranges.clear();
+ valnos.clear();
+ preference = RHS.preference;
+ weight = RHS.weight;
+ for (unsigned i = 0, e = RHS.getNumValNums(); i != e; ++i) {
+ const VNInfo *VNI = RHS.getValNumInfo(i);
+ VNInfo *NewVNI = getNextValue(~0U, 0, VNInfoAllocator);
+ copyValNumInfo(NewVNI, VNI);
+ }
+ for (unsigned i = 0, e = RHS.ranges.size(); i != e; ++i) {
+ const LiveRange &LR = RHS.ranges[i];
+ addRange(LiveRange(LR.start, LR.end, getValNumInfo(LR.valno->id)));
+ }
+}
+
+unsigned LiveInterval::getSize() const {
+ unsigned Sum = 0;
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ Sum += I->end - I->start;
+ return Sum;
+}
+
+std::ostream& llvm::operator<<(std::ostream& os, const LiveRange &LR) {
+ return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
+}
+
+void LiveRange::dump() const {
+ cerr << *this << "\n";
+}
+
+void LiveInterval::print(std::ostream &OS,
+ const TargetRegisterInfo *TRI) const {
+ if (isStackSlot())
+ OS << "SS#" << getStackSlotIndex();
+ else if (TRI && TargetRegisterInfo::isPhysicalRegister(reg))
+ OS << TRI->getName(reg);
+ else
+ OS << "%reg" << reg;
+
+ OS << ',' << weight;
+
+ if (empty())
+ OS << " EMPTY";
+ else {
+ OS << " = ";
+ for (LiveInterval::Ranges::const_iterator I = ranges.begin(),
+ E = ranges.end(); I != E; ++I)
+ OS << *I;
+ }
+
+ // Print value number info.
+ if (getNumValNums()) {
+ OS << " ";
+ unsigned vnum = 0;
+ for (const_vni_iterator i = vni_begin(), e = vni_end(); i != e;
+ ++i, ++vnum) {
+ const VNInfo *vni = *i;
+ if (vnum) OS << " ";
+ OS << vnum << "@";
+ if (vni->def == ~1U) {
+ OS << "x";
+ } else {
+ if (vni->def == ~0U)
+ OS << "?";
+ else
+ OS << vni->def;
+ unsigned ee = vni->kills.size();
+ if (ee || vni->hasPHIKill) {
+ OS << "-(";
+ for (unsigned j = 0; j != ee; ++j) {
+ OS << vni->kills[j];
+ if (j != ee-1)
+ OS << " ";
+ }
+ if (vni->hasPHIKill) {
+ if (ee)
+ OS << " ";
+ OS << "phi";
+ }
+ OS << ")";
+ }
+ }
+ }
+ }
+}
+
+void LiveInterval::dump() const {
+ cerr << *this << "\n";
+}
+
+
+void LiveRange::print(std::ostream &os) const {
+ os << *this;
+}
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
new file mode 100644
index 0000000..cf0a648
--- /dev/null
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -0,0 +1,2298 @@
+//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveInterval analysis pass which is used
+// by the Linear Scan Register allocator. This pass linearizes the
+// basic blocks of the function in DFS order and uses the
+// LiveVariables pass to conservatively compute live intervals for
+// each virtual and physical register.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "liveintervals"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "VirtRegMap.h"
+#include "llvm/Value.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <limits>
+#include <cmath>
+using namespace llvm;
+
+// Hidden options for help debugging.
+static cl::opt<bool> DisableReMat("disable-rematerialization",
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> SplitAtBB("split-intervals-at-bb",
+ cl::init(true), cl::Hidden);
+static cl::opt<int> SplitLimit("split-limit",
+ cl::init(-1), cl::Hidden);
+
+static cl::opt<bool> EnableAggressiveRemat("aggressive-remat", cl::Hidden);
+
+static cl::opt<bool> EnableFastSpilling("fast-spill",
+ cl::init(false), cl::Hidden);
+
+STATISTIC(numIntervals, "Number of original intervals");
+STATISTIC(numFolds , "Number of loads/stores folded into instructions");
+STATISTIC(numSplits , "Number of intervals split");
+
+char LiveIntervals::ID = 0;
+static RegisterPass<LiveIntervals> X("liveintervals", "Live Interval Analysis");
+
+void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AliasAnalysis>();
+ AU.addPreserved<AliasAnalysis>();
+ AU.addPreserved<LiveVariables>();
+ AU.addRequired<LiveVariables>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+
+ if (!StrongPHIElim) {
+ AU.addPreservedID(PHIEliminationID);
+ AU.addRequiredID(PHIEliminationID);
+ }
+
+ AU.addRequiredID(TwoAddressInstructionPassID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LiveIntervals::releaseMemory() {
+ // Free the live intervals themselves.
+ for (DenseMap<unsigned, LiveInterval*>::iterator I = r2iMap_.begin(),
+ E = r2iMap_.end(); I != E; ++I)
+ delete I->second;
+
+ MBB2IdxMap.clear();
+ Idx2MBBMap.clear();
+ mi2iMap_.clear();
+ i2miMap_.clear();
+ r2iMap_.clear();
+  // Release VNInfo memory regions after all VNInfo objects are dtor'd.
+ VNInfoAllocator.Reset();
+ while (!ClonedMIs.empty()) {
+ MachineInstr *MI = ClonedMIs.back();
+ ClonedMIs.pop_back();
+ mf_->DeleteMachineInstr(MI);
+ }
+}
+
+void LiveIntervals::computeNumbering() {
+ Index2MiMap OldI2MI = i2miMap_;
+ std::vector<IdxMBBPair> OldI2MBB = Idx2MBBMap;
+
+ Idx2MBBMap.clear();
+ MBB2IdxMap.clear();
+ mi2iMap_.clear();
+ i2miMap_.clear();
+
+ FunctionSize = 0;
+
+ // Number MachineInstrs and MachineBasicBlocks.
+  // Initialize MBB indexes to a sentinel.
+ MBB2IdxMap.resize(mf_->getNumBlockIDs(), std::make_pair(~0U,~0U));
+
+ unsigned MIIndex = 0;
+ for (MachineFunction::iterator MBB = mf_->begin(), E = mf_->end();
+ MBB != E; ++MBB) {
+ unsigned StartIdx = MIIndex;
+
+ // Insert an empty slot at the beginning of each block.
+ MIIndex += InstrSlots::NUM;
+ i2miMap_.push_back(0);
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ bool inserted = mi2iMap_.insert(std::make_pair(I, MIIndex)).second;
+      assert(inserted && "multiple MachineInstr -> index mappings");
+      inserted = true; // Keep 'inserted' referenced when asserts compile out.
+ i2miMap_.push_back(I);
+ MIIndex += InstrSlots::NUM;
+ FunctionSize++;
+
+ // Insert max(1, numdefs) empty slots after every instruction.
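+      // As a hypothetical example, assuming the usual four-slot layout
+      // (LOAD, USE, DEF, STORE; InstrSlots::NUM == 4): an instruction at
+      // base index 8 with one def occupies [8,12), the empty group [12,16)
+      // follows it, and the next instruction is numbered 16.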
+ unsigned Slots = I->getDesc().getNumDefs();
+ if (Slots == 0)
+ Slots = 1;
+ MIIndex += InstrSlots::NUM * Slots;
+ while (Slots--)
+ i2miMap_.push_back(0);
+ }
+
+ // Set the MBB2IdxMap entry for this MBB.
+ MBB2IdxMap[MBB->getNumber()] = std::make_pair(StartIdx, MIIndex - 1);
+ Idx2MBBMap.push_back(std::make_pair(StartIdx, MBB));
+ }
+ std::sort(Idx2MBBMap.begin(), Idx2MBBMap.end(), Idx2MBBCompare());
+
+ if (!OldI2MI.empty())
+ for (iterator OI = begin(), OE = end(); OI != OE; ++OI) {
+ for (LiveInterval::iterator LI = OI->second->begin(),
+ LE = OI->second->end(); LI != LE; ++LI) {
+
+ // Remap the start index of the live range to the corresponding new
+ // number, or our best guess at what it _should_ correspond to if the
+ // original instruction has been erased. This is either the following
+ // instruction or its predecessor.
+ unsigned index = LI->start / InstrSlots::NUM;
+ unsigned offset = LI->start % InstrSlots::NUM;
+ if (offset == InstrSlots::LOAD) {
+ std::vector<IdxMBBPair>::const_iterator I =
+ std::lower_bound(OldI2MBB.begin(), OldI2MBB.end(), LI->start);
+ // Take the pair containing the index
+ std::vector<IdxMBBPair>::const_iterator J =
+ (I == OldI2MBB.end() && OldI2MBB.size()>0) ? (I-1): I;
+
+ LI->start = getMBBStartIdx(J->second);
+ } else {
+ LI->start = mi2iMap_[OldI2MI[index]] + offset;
+ }
+
+ // Remap the ending index in the same way that we remapped the start,
+ // except for the final step where we always map to the immediately
+ // following instruction.
+ index = (LI->end - 1) / InstrSlots::NUM;
+ offset = LI->end % InstrSlots::NUM;
+ if (offset == InstrSlots::LOAD) {
+ // VReg dies at end of block.
+ std::vector<IdxMBBPair>::const_iterator I =
+ std::lower_bound(OldI2MBB.begin(), OldI2MBB.end(), LI->end);
+ --I;
+
+ LI->end = getMBBEndIdx(I->second) + 1;
+ } else {
+ unsigned idx = index;
+ while (index < OldI2MI.size() && !OldI2MI[index]) ++index;
+
+ if (index != OldI2MI.size())
+ LI->end = mi2iMap_[OldI2MI[index]] + (idx == index ? offset : 0);
+ else
+ LI->end = InstrSlots::NUM * i2miMap_.size();
+ }
+ }
+
+ for (LiveInterval::vni_iterator VNI = OI->second->vni_begin(),
+ VNE = OI->second->vni_end(); VNI != VNE; ++VNI) {
+ VNInfo* vni = *VNI;
+
+ // Remap the VNInfo def index, which works the same as the
+ // start indices above. VN's with special sentinel defs
+ // don't need to be remapped.
+ if (vni->def != ~0U && vni->def != ~1U) {
+ unsigned index = vni->def / InstrSlots::NUM;
+ unsigned offset = vni->def % InstrSlots::NUM;
+ if (offset == InstrSlots::LOAD) {
+ std::vector<IdxMBBPair>::const_iterator I =
+ std::lower_bound(OldI2MBB.begin(), OldI2MBB.end(), vni->def);
+ // Take the pair containing the index
+ std::vector<IdxMBBPair>::const_iterator J =
+ (I == OldI2MBB.end() && OldI2MBB.size()>0) ? (I-1): I;
+
+ vni->def = getMBBStartIdx(J->second);
+ } else {
+ vni->def = mi2iMap_[OldI2MI[index]] + offset;
+ }
+ }
+
+ // Remap the VNInfo kill indices, which works the same as
+ // the end indices above.
+ for (size_t i = 0; i < vni->kills.size(); ++i) {
+ // PHI kills don't need to be remapped.
+ if (!vni->kills[i]) continue;
+
+ unsigned index = (vni->kills[i]-1) / InstrSlots::NUM;
+ unsigned offset = vni->kills[i] % InstrSlots::NUM;
+ if (offset == InstrSlots::LOAD) {
+ std::vector<IdxMBBPair>::const_iterator I =
+ std::lower_bound(OldI2MBB.begin(), OldI2MBB.end(), vni->kills[i]);
+ --I;
+
+ vni->kills[i] = getMBBEndIdx(I->second);
+ } else {
+ unsigned idx = index;
+ while (index < OldI2MI.size() && !OldI2MI[index]) ++index;
+
+ if (index != OldI2MI.size())
+ vni->kills[i] = mi2iMap_[OldI2MI[index]] +
+ (idx == index ? offset : 0);
+ else
+ vni->kills[i] = InstrSlots::NUM * i2miMap_.size();
+ }
+ }
+ }
+ }
+}
+
+void LiveIntervals::scaleNumbering(int factor) {
+  // Need to:
+  //   * Scale MBB begin and end points.
+  //   * Scale all ranges.
+  //   * Update VNI structures.
+  //   * Scale instruction numberings.
+
+ // Scale the MBB indices.
+ Idx2MBBMap.clear();
+ for (MachineFunction::iterator MBB = mf_->begin(), MBBE = mf_->end();
+ MBB != MBBE; ++MBB) {
+ std::pair<unsigned, unsigned> &mbbIndices = MBB2IdxMap[MBB->getNumber()];
+ mbbIndices.first = InstrSlots::scale(mbbIndices.first, factor);
+ mbbIndices.second = InstrSlots::scale(mbbIndices.second, factor);
+ Idx2MBBMap.push_back(std::make_pair(mbbIndices.first, MBB));
+ }
+ std::sort(Idx2MBBMap.begin(), Idx2MBBMap.end(), Idx2MBBCompare());
+
+ // Scale the intervals.
+ for (iterator LI = begin(), LE = end(); LI != LE; ++LI) {
+ LI->second->scaleNumbering(factor);
+ }
+
+ // Scale MachineInstrs.
+ Mi2IndexMap oldmi2iMap = mi2iMap_;
+ unsigned highestSlot = 0;
+ for (Mi2IndexMap::iterator MI = oldmi2iMap.begin(), ME = oldmi2iMap.end();
+ MI != ME; ++MI) {
+ unsigned newSlot = InstrSlots::scale(MI->second, factor);
+ mi2iMap_[MI->first] = newSlot;
+ highestSlot = std::max(highestSlot, newSlot);
+ }
+
+ i2miMap_.clear();
+ i2miMap_.resize(highestSlot + 1);
+ for (Mi2IndexMap::iterator MI = mi2iMap_.begin(), ME = mi2iMap_.end();
+ MI != ME; ++MI) {
+ i2miMap_[MI->second] = MI->first;
+ }
+
+}
+
+
+/// runOnMachineFunction - Compute live intervals for the whole function.
+///
+bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
+ mf_ = &fn;
+ mri_ = &mf_->getRegInfo();
+ tm_ = &fn.getTarget();
+ tri_ = tm_->getRegisterInfo();
+ tii_ = tm_->getInstrInfo();
+ aa_ = &getAnalysis<AliasAnalysis>();
+ lv_ = &getAnalysis<LiveVariables>();
+ allocatableRegs_ = tri_->getAllocatableSet(fn);
+
+ computeNumbering();
+ computeIntervals();
+
+ numIntervals += getNumIntervals();
+
+ DEBUG(dump());
+ return true;
+}
+
+/// print - Implement the dump method.
+void LiveIntervals::print(std::ostream &O, const Module* ) const {
+ O << "********** INTERVALS **********\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ I->second->print(O, tri_);
+ O << "\n";
+ }
+
+ O << "********** MACHINEINSTRS **********\n";
+ for (MachineFunction::iterator mbbi = mf_->begin(), mbbe = mf_->end();
+ mbbi != mbbe; ++mbbi) {
+ O << ((Value*)mbbi->getBasicBlock())->getName() << ":\n";
+ for (MachineBasicBlock::iterator mii = mbbi->begin(),
+ mie = mbbi->end(); mii != mie; ++mii) {
+ O << getInstructionIndex(mii) << '\t' << *mii;
+ }
+ }
+}
+
+/// conflictsWithPhysRegDef - Returns true if the specified register
+/// is defined during the specified interval.
+bool LiveIntervals::conflictsWithPhysRegDef(const LiveInterval &li,
+ VirtRegMap &vrm, unsigned reg) {
+ for (LiveInterval::Ranges::const_iterator
+ I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
+ for (unsigned index = getBaseIndex(I->start),
+ end = getBaseIndex(I->end-1) + InstrSlots::NUM; index != end;
+ index += InstrSlots::NUM) {
+ // skip deleted instructions
+ while (index != end && !getInstructionFromIndex(index))
+ index += InstrSlots::NUM;
+ if (index == end) break;
+
+ MachineInstr *MI = getInstructionFromIndex(index);
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+ if (SrcReg == li.reg || DstReg == li.reg)
+ continue;
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+ MachineOperand& mop = MI->getOperand(i);
+ if (!mop.isReg())
+ continue;
+ unsigned PhysReg = mop.getReg();
+ if (PhysReg == 0 || PhysReg == li.reg)
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(PhysReg)) {
+ if (!vrm.hasPhys(PhysReg))
+ continue;
+ PhysReg = vrm.getPhys(PhysReg);
+ }
+ if (PhysReg && tri_->regsOverlap(PhysReg, reg))
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/// conflictsWithPhysRegRef - Similar to conflictsWithPhysRegDef except
+/// it can check uses as well.
+bool LiveIntervals::conflictsWithPhysRegRef(LiveInterval &li,
+ unsigned Reg, bool CheckUse,
+ SmallPtrSet<MachineInstr*,32> &JoinedCopies) {
+ for (LiveInterval::Ranges::const_iterator
+ I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
+ for (unsigned index = getBaseIndex(I->start),
+ end = getBaseIndex(I->end-1) + InstrSlots::NUM; index != end;
+ index += InstrSlots::NUM) {
+ // Skip deleted instructions.
+ MachineInstr *MI = 0;
+ while (index != end) {
+ MI = getInstructionFromIndex(index);
+ if (MI)
+ break;
+ index += InstrSlots::NUM;
+ }
+ if (index == end) break;
+
+ if (JoinedCopies.count(MI))
+ continue;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse() && !CheckUse)
+ continue;
+ unsigned PhysReg = MO.getReg();
+ if (PhysReg == 0 || TargetRegisterInfo::isVirtualRegister(PhysReg))
+ continue;
+ if (tri_->isSubRegister(Reg, PhysReg))
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+
+void LiveIntervals::printRegName(unsigned reg) const {
+ if (TargetRegisterInfo::isPhysicalRegister(reg))
+ cerr << tri_->getName(reg);
+ else
+ cerr << "%reg" << reg;
+}
+
+void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
+ MachineBasicBlock::iterator mi,
+ unsigned MIIdx, MachineOperand& MO,
+ unsigned MOIdx,
+ LiveInterval &interval) {
+ DOUT << "\t\tregister: "; DEBUG(printRegName(interval.reg));
+ LiveVariables::VarInfo& vi = lv_->getVarInfo(interval.reg);
+
+ if (mi->getOpcode() == TargetInstrInfo::IMPLICIT_DEF) {
+    DOUT << "is an implicit_def\n";
+ return;
+ }
+
+ // Virtual registers may be defined multiple times (due to phi
+ // elimination and 2-addr elimination). Much of what we do only has to be
+ // done once for the vreg. We use an empty interval to detect the first
+ // time we see a vreg.
+ if (interval.empty()) {
+ // Get the Idx of the defining instructions.
+ unsigned defIndex = getDefIndex(MIIdx);
+ // Earlyclobbers move back one.
+ if (MO.isEarlyClobber())
+ defIndex = getUseIndex(MIIdx);
+ VNInfo *ValNo;
+ MachineInstr *CopyMI = NULL;
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (mi->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG ||
+ mi->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+ mi->getOpcode() == TargetInstrInfo::SUBREG_TO_REG ||
+ tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg))
+ CopyMI = mi;
+ // Earlyclobbers move back one.
+ ValNo = interval.getNextValue(defIndex, CopyMI, VNInfoAllocator);
+
+ assert(ValNo->id == 0 && "First value in interval is not 0?");
+
+ // Loop over all of the blocks that the vreg is defined in. There are
+ // two cases we have to handle here. The most common case is a vreg
+ // whose lifetime is contained within a basic block. In this case there
+ // will be a single kill, in MBB, which comes after the definition.
+ if (vi.Kills.size() == 1 && vi.Kills[0]->getParent() == mbb) {
+ // FIXME: what about dead vars?
+ unsigned killIdx;
+ if (vi.Kills[0] != mi)
+ killIdx = getUseIndex(getInstructionIndex(vi.Kills[0]))+1;
+ else
+ killIdx = defIndex+1;
+
+ // If the kill happens after the definition, we have an intra-block
+ // live range.
+ if (killIdx > defIndex) {
+ assert(vi.AliveBlocks.empty() &&
+ "Shouldn't be alive across any blocks!");
+ LiveRange LR(defIndex, killIdx, ValNo);
+ interval.addRange(LR);
+ DOUT << " +" << LR << "\n";
+ interval.addKill(ValNo, killIdx);
+ return;
+ }
+ }
+
+ // The other case we handle is when a virtual register lives to the end
+ // of the defining block, potentially live across some blocks, then is
+ // live into some number of blocks, but gets killed. Start by adding a
+ // range that goes from this definition to the end of the defining block.
+ LiveRange NewLR(defIndex, getMBBEndIdx(mbb)+1, ValNo);
+ DOUT << " +" << NewLR;
+ interval.addRange(NewLR);
+
+ // Iterate over all of the blocks that the variable is completely
+    // live in, adding [instrIndex(begin), instrIndex(end)+4) to the
+ // live interval.
+ for (SparseBitVector<>::iterator I = vi.AliveBlocks.begin(),
+ E = vi.AliveBlocks.end(); I != E; ++I) {
+ LiveRange LR(getMBBStartIdx(*I),
+ getMBBEndIdx(*I)+1, // MBB ends at -1.
+ ValNo);
+ interval.addRange(LR);
+ DOUT << " +" << LR;
+ }
+
+ // Finally, this virtual register is live from the start of any killing
+ // block to the 'use' slot of the killing instruction.
+ for (unsigned i = 0, e = vi.Kills.size(); i != e; ++i) {
+ MachineInstr *Kill = vi.Kills[i];
+ unsigned killIdx = getUseIndex(getInstructionIndex(Kill))+1;
+ LiveRange LR(getMBBStartIdx(Kill->getParent()),
+ killIdx, ValNo);
+ interval.addRange(LR);
+ interval.addKill(ValNo, killIdx);
+ DOUT << " +" << LR;
+ }
+
+ } else {
+ // If this is the second time we see a virtual register definition, it
+ // must be due to phi elimination or two addr elimination. If this is
+ // the result of two address elimination, then the vreg is one of the
+    // def-and-use register operands.
+ if (mi->isRegTiedToUseOperand(MOIdx)) {
+ // If this is a two-address definition, then we have already processed
+ // the live range. The only problem is that we didn't realize there
+ // are actually two values in the live interval. Because of this we
+      // need to take the live range that defines this register and split it
+ // into two values.
+ assert(interval.containsOneValue());
+ unsigned DefIndex = getDefIndex(interval.getValNumInfo(0)->def);
+ unsigned RedefIndex = getDefIndex(MIIdx);
+ if (MO.isEarlyClobber())
+ RedefIndex = getUseIndex(MIIdx);
+
+ const LiveRange *OldLR = interval.getLiveRangeContaining(RedefIndex-1);
+ VNInfo *OldValNo = OldLR->valno;
+
+ // Delete the initial value, which should be short and continuous,
+ // because the 2-addr copy must be in the same MBB as the redef.
+ interval.removeRange(DefIndex, RedefIndex);
+
+      // A two-address vreg should only be redefined once. This means
+ // that at this point, there should be exactly one value number in it.
+ assert(interval.containsOneValue() && "Unexpected 2-addr liveint!");
+
+ // The new value number (#1) is defined by the instruction we claimed
+ // defined value #0.
+ VNInfo *ValNo = interval.getNextValue(OldValNo->def, OldValNo->copy,
+ VNInfoAllocator);
+
+ // Value#0 is now defined by the 2-addr instruction.
+ OldValNo->def = RedefIndex;
+ OldValNo->copy = 0;
+ if (MO.isEarlyClobber())
+ OldValNo->redefByEC = true;
+
+ // Add the new live interval which replaces the range for the input copy.
+ LiveRange LR(DefIndex, RedefIndex, ValNo);
+ DOUT << " replace range with " << LR;
+ interval.addRange(LR);
+ interval.addKill(ValNo, RedefIndex);
+
+ // If this redefinition is dead, we need to add a dummy unit live
+ // range covering the def slot.
+ if (MO.isDead())
+ interval.addRange(LiveRange(RedefIndex, RedefIndex+1, OldValNo));
+
+ DOUT << " RESULT: ";
+ interval.print(DOUT, tri_);
+
+ } else {
+ // Otherwise, this must be because of phi elimination. If this is the
+ // first redefinition of the vreg that we have seen, go back and change
+ // the live range in the PHI block to be a different value number.
+ if (interval.containsOneValue()) {
+ assert(vi.Kills.size() == 1 &&
+ "PHI elimination vreg should have one kill, the PHI itself!");
+
+ // Remove the old range that we now know has an incorrect number.
+ VNInfo *VNI = interval.getValNumInfo(0);
+ MachineInstr *Killer = vi.Kills[0];
+ unsigned Start = getMBBStartIdx(Killer->getParent());
+ unsigned End = getUseIndex(getInstructionIndex(Killer))+1;
+ DOUT << " Removing [" << Start << "," << End << "] from: ";
+ interval.print(DOUT, tri_); DOUT << "\n";
+ interval.removeRange(Start, End);
+ VNI->hasPHIKill = true;
+ DOUT << " RESULT: "; interval.print(DOUT, tri_);
+
+ // Replace the interval with one of a NEW value number. Note that this
+ // value number isn't actually defined by an instruction, weird huh? :)
+ LiveRange LR(Start, End, interval.getNextValue(~0, 0, VNInfoAllocator));
+ DOUT << " replace range with " << LR;
+ interval.addRange(LR);
+ interval.addKill(LR.valno, End);
+ DOUT << " RESULT: "; interval.print(DOUT, tri_);
+ }
+
+ // In the case of PHI elimination, each variable definition is only
+ // live until the end of the block. We've already taken care of the
+ // rest of the live range.
+ unsigned defIndex = getDefIndex(MIIdx);
+ if (MO.isEarlyClobber())
+ defIndex = getUseIndex(MIIdx);
+
+ VNInfo *ValNo;
+ MachineInstr *CopyMI = NULL;
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (mi->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG ||
+ mi->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+ mi->getOpcode() == TargetInstrInfo::SUBREG_TO_REG ||
+ tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg))
+ CopyMI = mi;
+ ValNo = interval.getNextValue(defIndex, CopyMI, VNInfoAllocator);
+
+ unsigned killIndex = getMBBEndIdx(mbb) + 1;
+ LiveRange LR(defIndex, killIndex, ValNo);
+ interval.addRange(LR);
+ interval.addKill(ValNo, killIndex);
+ ValNo->hasPHIKill = true;
+ DOUT << " +" << LR;
+ }
+ }
+
+ DOUT << '\n';
+}
+
+void LiveIntervals::handlePhysicalRegisterDef(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator mi,
+ unsigned MIIdx,
+ MachineOperand& MO,
+ LiveInterval &interval,
+ MachineInstr *CopyMI) {
+  // A physical register cannot be live across basic blocks, so its
+ // lifetime must end somewhere in its defining basic block.
+ DOUT << "\t\tregister: "; DEBUG(printRegName(interval.reg));
+
+ unsigned baseIndex = MIIdx;
+ unsigned start = getDefIndex(baseIndex);
+ // Earlyclobbers move back one.
+ if (MO.isEarlyClobber())
+ start = getUseIndex(MIIdx);
+ unsigned end = start;
+
+ // If it is not used after definition, it is considered dead at
+ // the instruction defining it. Hence its interval is:
+ // [defSlot(def), defSlot(def)+1)
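+  // For example, assuming the DEF slot is at offset 2 within the group,
+  // a dead def at base index 8 yields the unit range [10, 11).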
+ if (MO.isDead()) {
+ DOUT << " dead";
+ end = start + 1;
+ goto exit;
+ }
+
+ // If it is not dead on definition, it must be killed by a
+ // subsequent instruction. Hence its interval is:
+ // [defSlot(def), useSlot(kill)+1)
+ baseIndex += InstrSlots::NUM;
+ while (++mi != MBB->end()) {
+ while (baseIndex / InstrSlots::NUM < i2miMap_.size() &&
+ getInstructionFromIndex(baseIndex) == 0)
+ baseIndex += InstrSlots::NUM;
+ if (mi->killsRegister(interval.reg, tri_)) {
+ DOUT << " killed";
+ end = getUseIndex(baseIndex) + 1;
+ goto exit;
+ } else {
+ int DefIdx = mi->findRegisterDefOperandIdx(interval.reg, false, tri_);
+ if (DefIdx != -1) {
+ if (mi->isRegTiedToUseOperand(DefIdx)) {
+ // Two-address instruction.
+ end = getDefIndex(baseIndex);
+ if (mi->getOperand(DefIdx).isEarlyClobber())
+ end = getUseIndex(baseIndex);
+ } else {
+ // Another instruction redefines the register before it is ever read.
+ // Then the register is essentially dead at the instruction that defines
+ // it. Hence its interval is:
+ // [defSlot(def), defSlot(def)+1)
+ DOUT << " dead";
+ end = start + 1;
+ }
+ goto exit;
+ }
+ }
+
+ baseIndex += InstrSlots::NUM;
+ }
+
+  // The only cases where we should see a dead physreg here with no killing
+  // instruction are when the register is live-in to the function and never
+  // used, or when its implicit use has been deleted by the two-address pass.
+ end = start + 1;
+
+exit:
+ assert(start < end && "did not find end of interval?");
+
+ // Already exists? Extend old live interval.
+ LiveInterval::iterator OldLR = interval.FindLiveRangeContaining(start);
+ bool Extend = OldLR != interval.end();
+ VNInfo *ValNo = Extend
+ ? OldLR->valno : interval.getNextValue(start, CopyMI, VNInfoAllocator);
+ if (MO.isEarlyClobber() && Extend)
+ ValNo->redefByEC = true;
+ LiveRange LR(start, end, ValNo);
+ interval.addRange(LR);
+ interval.addKill(LR.valno, end);
+ DOUT << " +" << LR << '\n';
+}
+
+void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned MIIdx,
+ MachineOperand& MO,
+ unsigned MOIdx) {
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ handleVirtualRegisterDef(MBB, MI, MIIdx, MO, MOIdx,
+ getOrCreateInterval(MO.getReg()));
+ else if (allocatableRegs_[MO.getReg()]) {
+ MachineInstr *CopyMI = NULL;
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (MI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG ||
+ MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+ MI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG ||
+ tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+ CopyMI = MI;
+ handlePhysicalRegisterDef(MBB, MI, MIIdx, MO,
+ getOrCreateInterval(MO.getReg()), CopyMI);
+ // Def of a register also defines its sub-registers.
+ for (const unsigned* AS = tri_->getSubRegisters(MO.getReg()); *AS; ++AS)
+ // If MI also modifies the sub-register explicitly, avoid processing it
+ // more than once. Do not pass in TRI here so it checks for exact match.
+ if (!MI->modifiesRegister(*AS))
+ handlePhysicalRegisterDef(MBB, MI, MIIdx, MO,
+ getOrCreateInterval(*AS), 0);
+ }
+}
+
+void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB,
+ unsigned MIIdx,
+ LiveInterval &interval, bool isAlias) {
+ DOUT << "\t\tlivein register: "; DEBUG(printRegName(interval.reg));
+
+  // Look for kills; if the value reaches a def before it is killed, it
+  // shouldn't be considered a live-in.
+ MachineBasicBlock::iterator mi = MBB->begin();
+ unsigned baseIndex = MIIdx;
+ unsigned start = baseIndex;
+ while (baseIndex / InstrSlots::NUM < i2miMap_.size() &&
+ getInstructionFromIndex(baseIndex) == 0)
+ baseIndex += InstrSlots::NUM;
+ unsigned end = baseIndex;
+ bool SeenDefUse = false;
+
+ while (mi != MBB->end()) {
+ if (mi->killsRegister(interval.reg, tri_)) {
+ DOUT << " killed";
+ end = getUseIndex(baseIndex) + 1;
+ SeenDefUse = true;
+ goto exit;
+ } else if (mi->modifiesRegister(interval.reg, tri_)) {
+ // Another instruction redefines the register before it is ever read.
+ // Then the register is essentially dead at the instruction that defines
+ // it. Hence its interval is:
+ // [defSlot(def), defSlot(def)+1)
+ DOUT << " dead";
+ end = getDefIndex(start) + 1;
+ SeenDefUse = true;
+ goto exit;
+ }
+
+ baseIndex += InstrSlots::NUM;
+ ++mi;
+ if (mi != MBB->end()) {
+ while (baseIndex / InstrSlots::NUM < i2miMap_.size() &&
+ getInstructionFromIndex(baseIndex) == 0)
+ baseIndex += InstrSlots::NUM;
+ }
+ }
+
+exit:
+ // Live-in register might not be used at all.
+ if (!SeenDefUse) {
+ if (isAlias) {
+ DOUT << " dead";
+ end = getDefIndex(MIIdx) + 1;
+ } else {
+ DOUT << " live through";
+ end = baseIndex;
+ }
+ }
+
+ LiveRange LR(start, end, interval.getNextValue(~0U, 0, VNInfoAllocator));
+ interval.addRange(LR);
+ interval.addKill(LR.valno, end);
+ DOUT << " +" << LR << '\n';
+}
+
+/// computeIntervals - Compute the live intervals for virtual
+/// registers. For some numbering of the machine instructions [1, N], a
+/// live interval is an interval [i, j), where 1 <= i <= j < N, during
+/// which a variable is live.
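+///
+/// For example, under such a numbering a virtual register defined at
+/// index i and last used at index j would be live over [i, j+1).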
+void LiveIntervals::computeIntervals() {
+
+ DOUT << "********** COMPUTING LIVE INTERVALS **********\n"
+ << "********** Function: "
+ << ((Value*)mf_->getFunction())->getName() << '\n';
+
+ for (MachineFunction::iterator MBBI = mf_->begin(), E = mf_->end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+ // Track the index of the current machine instr.
+ unsigned MIIndex = getMBBStartIdx(MBB);
+ DOUT << ((Value*)MBB->getBasicBlock())->getName() << ":\n";
+
+ MachineBasicBlock::iterator MI = MBB->begin(), miEnd = MBB->end();
+
+ // Create intervals for live-ins to this BB first.
+ for (MachineBasicBlock::const_livein_iterator LI = MBB->livein_begin(),
+ LE = MBB->livein_end(); LI != LE; ++LI) {
+ handleLiveInRegister(MBB, MIIndex, getOrCreateInterval(*LI));
+ // Multiple live-ins can alias the same register.
+ for (const unsigned* AS = tri_->getSubRegisters(*LI); *AS; ++AS)
+ if (!hasInterval(*AS))
+ handleLiveInRegister(MBB, MIIndex, getOrCreateInterval(*AS),
+ true);
+ }
+
+ // Skip over empty initial indices.
+ while (MIIndex / InstrSlots::NUM < i2miMap_.size() &&
+ getInstructionFromIndex(MIIndex) == 0)
+ MIIndex += InstrSlots::NUM;
+
+ for (; MI != miEnd; ++MI) {
+ DOUT << MIIndex << "\t" << *MI;
+
+ // Handle defs.
+ for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
+ MachineOperand &MO = MI->getOperand(i);
+ // handle register defs - build intervals
+ if (MO.isReg() && MO.getReg() && MO.isDef()) {
+ handleRegisterDef(MBB, MI, MIIndex, MO, i);
+ }
+ }
+
+ // Skip over the empty slots after each instruction.
+ unsigned Slots = MI->getDesc().getNumDefs();
+ if (Slots == 0)
+ Slots = 1;
+ MIIndex += InstrSlots::NUM * Slots;
+
+ // Skip over empty indices.
+ while (MIIndex / InstrSlots::NUM < i2miMap_.size() &&
+ getInstructionFromIndex(MIIndex) == 0)
+ MIIndex += InstrSlots::NUM;
+ }
+ }
+}
+
+bool LiveIntervals::findLiveInMBBs(unsigned Start, unsigned End,
+ SmallVectorImpl<MachineBasicBlock*> &MBBs) const {
+ std::vector<IdxMBBPair>::const_iterator I =
+ std::lower_bound(Idx2MBBMap.begin(), Idx2MBBMap.end(), Start);
+
+ bool ResVal = false;
+ while (I != Idx2MBBMap.end()) {
+ if (I->first >= End)
+ break;
+ MBBs.push_back(I->second);
+ ResVal = true;
+ ++I;
+ }
+ return ResVal;
+}
+
+bool LiveIntervals::findReachableMBBs(unsigned Start, unsigned End,
+ SmallVectorImpl<MachineBasicBlock*> &MBBs) const {
+ std::vector<IdxMBBPair>::const_iterator I =
+ std::lower_bound(Idx2MBBMap.begin(), Idx2MBBMap.end(), Start);
+
+ bool ResVal = false;
+ while (I != Idx2MBBMap.end()) {
+ if (I->first > End)
+ break;
+ MachineBasicBlock *MBB = I->second;
+ if (getMBBEndIdx(MBB) > End)
+ break;
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ MBBs.push_back(*SI);
+ ResVal = true;
+ ++I;
+ }
+ return ResVal;
+}
+
+LiveInterval* LiveIntervals::createInterval(unsigned reg) {
+ float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? HUGE_VALF : 0.0F;
+ return new LiveInterval(reg, Weight);
+}
+
+/// dupInterval - Duplicate a live interval. The caller is responsible for
+/// managing the allocated memory.
+LiveInterval* LiveIntervals::dupInterval(LiveInterval *li) {
+ LiveInterval *NewLI = createInterval(li->reg);
+ NewLI->Copy(*li, getVNInfoAllocator());
+ return NewLI;
+}
+
+/// getVNInfoSourceReg - Helper function that parses the specified VNInfo
+/// copy field and returns the source register that defines it.
+unsigned LiveIntervals::getVNInfoSourceReg(const VNInfo *VNI) const {
+ if (!VNI->copy)
+ return 0;
+
+ if (VNI->copy->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
+ // If it's extracting out of a physical register, return the sub-register.
+ unsigned Reg = VNI->copy->getOperand(1).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ Reg = tri_->getSubReg(Reg, VNI->copy->getOperand(2).getImm());
+ return Reg;
+ } else if (VNI->copy->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+ VNI->copy->getOpcode() == TargetInstrInfo::SUBREG_TO_REG)
+ return VNI->copy->getOperand(2).getReg();
+
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (tii_->isMoveInstr(*VNI->copy, SrcReg, DstReg, SrcSubReg, DstSubReg))
+ return SrcReg;
+ assert(0 && "Unrecognized copy instruction!");
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Register allocator hooks.
+//
+
+/// getReMatImplicitUse - If the remat definition MI has one (for now, we only
+/// allow one) virtual register operand, then its uses are implicitly using
+/// the register. Returns the virtual register.
+unsigned LiveIntervals::getReMatImplicitUse(const LiveInterval &li,
+ MachineInstr *MI) const {
+ unsigned RegOp = 0;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0 || Reg == li.reg)
+ continue;
+ // FIXME: For now, only remat MI with at most one register operand.
+ assert(!RegOp &&
+ "Can't rematerialize instruction with multiple register operand!");
+ RegOp = MO.getReg();
+#ifndef NDEBUG
+ break;
+#endif
+ }
+ return RegOp;
+}
+
+/// isValNoAvailableAt - Return true if the val# of the specified interval
+/// which reaches the given instruction also reaches the specified use index.
+bool LiveIntervals::isValNoAvailableAt(const LiveInterval &li, MachineInstr *MI,
+ unsigned UseIdx) const {
+ unsigned Index = getInstructionIndex(MI);
+ VNInfo *ValNo = li.FindLiveRangeContaining(Index)->valno;
+ LiveInterval::const_iterator UI = li.FindLiveRangeContaining(UseIdx);
+ return UI != li.end() && UI->valno == ValNo;
+}
+
+/// isReMaterializable - Returns true if the definition MI of the specified
+/// val# of the specified interval is re-materializable.
+bool LiveIntervals::isReMaterializable(const LiveInterval &li,
+ const VNInfo *ValNo, MachineInstr *MI,
+ SmallVectorImpl<LiveInterval*> &SpillIs,
+ bool &isLoad) {
+ if (DisableReMat)
+ return false;
+
+ if (MI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF)
+ return true;
+
+ int FrameIdx = 0;
+ if (tii_->isLoadFromStackSlot(MI, FrameIdx) &&
+ mf_->getFrameInfo()->isImmutableObjectIndex(FrameIdx))
+    // FIXME: Let the target-specific isReallyTriviallyReMaterializable
+    // determine this, but remember it is not safe to fold into a
+    // two-address instruction.
+    // This is a load from a fixed stack slot. It can be rematerialized.
+ return true;
+
+ // If the target-specific rules don't identify an instruction as
+ // being trivially rematerializable, use some target-independent
+ // rules.
+ if (!MI->getDesc().isRematerializable() ||
+ !tii_->isTriviallyReMaterializable(MI)) {
+ if (!EnableAggressiveRemat)
+ return false;
+
+ // If the instruction accesses memory but the memoperands have been lost,
+ // we can't analyze it.
+ const TargetInstrDesc &TID = MI->getDesc();
+ if ((TID.mayLoad() || TID.mayStore()) && MI->memoperands_empty())
+ return false;
+
+ // Avoid instructions obviously unsafe for remat.
+ if (TID.hasUnmodeledSideEffects() || TID.isNotDuplicable())
+ return false;
+
+ // If the instruction accesses memory and the memory could be non-constant,
+ // assume the instruction is not rematerializable.
+ for (std::list<MachineMemOperand>::const_iterator
+ I = MI->memoperands_begin(), E = MI->memoperands_end(); I != E; ++I){
+ const MachineMemOperand &MMO = *I;
+ if (MMO.isVolatile() || MMO.isStore())
+ return false;
+ const Value *V = MMO.getValue();
+ if (!V)
+ return false;
+ if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+ if (!PSV->isConstant(mf_->getFrameInfo()))
+ return false;
+ } else if (!aa_->pointsToConstantMemory(V))
+ return false;
+ }
+
+ // If any of the registers accessed are non-constant, conservatively assume
+ // the instruction is not rematerializable.
+ unsigned ImpUse = 0;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false;
+
+ // Only allow one def, and that in the first operand.
+ if (MO.isDef() != (i == 0))
+ return false;
+
+ // Only allow constant-valued registers.
+ bool IsLiveIn = mri_->isLiveIn(Reg);
+ MachineRegisterInfo::def_iterator I = mri_->def_begin(Reg),
+ E = mri_->def_end();
+
+ // For the def, it should be the only def of that register.
+ if (MO.isDef() && (next(I) != E || IsLiveIn))
+ return false;
+
+ if (MO.isUse()) {
+          // Only allow one other register use, as that's all the remat
+          // mechanisms currently support.
+ if (Reg != li.reg) {
+ if (ImpUse == 0)
+ ImpUse = Reg;
+ else if (Reg != ImpUse)
+ return false;
+ }
+ // For the use, there should be only one associated def.
+ if (I != E && (next(I) != E || IsLiveIn))
+ return false;
+ }
+ }
+ }
+ }
+
+ unsigned ImpUse = getReMatImplicitUse(li, MI);
+ if (ImpUse) {
+ const LiveInterval &ImpLi = getInterval(ImpUse);
+ for (MachineRegisterInfo::use_iterator ri = mri_->use_begin(li.reg),
+ re = mri_->use_end(); ri != re; ++ri) {
+ MachineInstr *UseMI = &*ri;
+ unsigned UseIdx = getInstructionIndex(UseMI);
+ if (li.FindLiveRangeContaining(UseIdx)->valno != ValNo)
+ continue;
+ if (!isValNoAvailableAt(ImpLi, MI, UseIdx))
+ return false;
+ }
+
+ // If a register operand of the re-materialized instruction is going to
+ // be spilled next, then it's not legal to re-materialize this instruction.
+ for (unsigned i = 0, e = SpillIs.size(); i != e; ++i)
+ if (ImpUse == SpillIs[i]->reg)
+ return false;
+ }
+ return true;
+}
+
+/// isReMaterializable - Returns true if the definition MI of the specified
+/// val# of the specified interval is re-materializable.
+bool LiveIntervals::isReMaterializable(const LiveInterval &li,
+ const VNInfo *ValNo, MachineInstr *MI) {
+ SmallVector<LiveInterval*, 4> Dummy1;
+ bool Dummy2;
+ return isReMaterializable(li, ValNo, MI, Dummy1, Dummy2);
+}
+
+/// isReMaterializable - Returns true if the definition MI of every val#
+/// of the specified interval is re-materializable.
+bool LiveIntervals::isReMaterializable(const LiveInterval &li,
+ SmallVectorImpl<LiveInterval*> &SpillIs,
+ bool &isLoad) {
+ isLoad = false;
+ for (LiveInterval::const_vni_iterator i = li.vni_begin(), e = li.vni_end();
+ i != e; ++i) {
+ const VNInfo *VNI = *i;
+ unsigned DefIdx = VNI->def;
+ if (DefIdx == ~1U)
+ continue; // Dead val#.
+ // Is the def for the val# rematerializable?
+ if (DefIdx == ~0u)
+ return false;
+ MachineInstr *ReMatDefMI = getInstructionFromIndex(DefIdx);
+ bool DefIsLoad = false;
+ if (!ReMatDefMI ||
+ !isReMaterializable(li, VNI, ReMatDefMI, SpillIs, DefIsLoad))
+ return false;
+ isLoad |= DefIsLoad;
+ }
+ return true;
+}
+
+/// FilterFoldedOps - Filter out two-address use operands. Return
+/// true if it finds any issue with the operands that ought to prevent
+/// folding.
+static bool FilterFoldedOps(MachineInstr *MI,
+ SmallVector<unsigned, 2> &Ops,
+ unsigned &MRInfo,
+ SmallVector<unsigned, 2> &FoldOps) {
+ MRInfo = 0;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ unsigned OpIdx = Ops[i];
+ MachineOperand &MO = MI->getOperand(OpIdx);
+ // FIXME: fold subreg use.
+ if (MO.getSubReg())
+ return true;
+ if (MO.isDef())
+ MRInfo |= (unsigned)VirtRegMap::isMod;
+ else {
+ // Filter out two-address use operand(s).
+ if (MI->isRegTiedToDefOperand(OpIdx)) {
+ MRInfo = VirtRegMap::isModRef;
+ continue;
+ }
+ MRInfo |= (unsigned)VirtRegMap::isRef;
+ }
+ FoldOps.push_back(OpIdx);
+ }
+ return false;
+}
+
+
+/// tryFoldMemoryOperand - Attempts to fold either a spill / restore from
+/// slot / to reg or any rematerialized load into the ith operand of the
+/// specified MI. If successful, MI is updated with the newly created MI
+/// and true is returned.
+bool LiveIntervals::tryFoldMemoryOperand(MachineInstr* &MI,
+ VirtRegMap &vrm, MachineInstr *DefMI,
+ unsigned InstrIdx,
+ SmallVector<unsigned, 2> &Ops,
+ bool isSS, int Slot, unsigned Reg) {
+ // If it is an implicit def instruction, just delete it.
+ if (MI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF) {
+ RemoveMachineInstrFromMaps(MI);
+ vrm.RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
+ ++numFolds;
+ return true;
+ }
+
+ // Filter the list of operand indexes that are to be folded. Abort if
+ // any operand will prevent folding.
+ unsigned MRInfo = 0;
+ SmallVector<unsigned, 2> FoldOps;
+ if (FilterFoldedOps(MI, Ops, MRInfo, FoldOps))
+ return false;
+
+ // The only time it's safe to fold into a two address instruction is when
+ // it's folding reload and spill from / into a spill stack slot.
+ if (DefMI && (MRInfo & VirtRegMap::isMod))
+ return false;
+
+ MachineInstr *fmi = isSS ? tii_->foldMemoryOperand(*mf_, MI, FoldOps, Slot)
+ : tii_->foldMemoryOperand(*mf_, MI, FoldOps, DefMI);
+ if (fmi) {
+ // Remember this instruction uses the spill slot.
+ if (isSS) vrm.addSpillSlotUse(Slot, fmi);
+
+ // Attempt to fold the memory reference into the instruction. If
+ // we can do this, we don't need to insert spill code.
+ MachineBasicBlock &MBB = *MI->getParent();
+ if (isSS && !mf_->getFrameInfo()->isImmutableObjectIndex(Slot))
+ vrm.virtFolded(Reg, MI, fmi, (VirtRegMap::ModRef)MRInfo);
+ vrm.transferSpillPts(MI, fmi);
+ vrm.transferRestorePts(MI, fmi);
+ vrm.transferEmergencySpills(MI, fmi);
+ mi2iMap_.erase(MI);
+ i2miMap_[InstrIdx /InstrSlots::NUM] = fmi;
+ mi2iMap_[fmi] = InstrIdx;
+ MI = MBB.insert(MBB.erase(MI), fmi);
+ ++numFolds;
+ return true;
+ }
+ return false;
+}
+
+/// canFoldMemoryOperand - Returns true if the specified load / store
+/// folding is possible.
+bool LiveIntervals::canFoldMemoryOperand(MachineInstr *MI,
+ SmallVector<unsigned, 2> &Ops,
+ bool ReMat) const {
+ // Filter the list of operand indexes that are to be folded. Abort if
+ // any operand will prevent folding.
+ unsigned MRInfo = 0;
+ SmallVector<unsigned, 2> FoldOps;
+ if (FilterFoldedOps(MI, Ops, MRInfo, FoldOps))
+ return false;
+
+ // It's only legal to remat for a use, not a def.
+ if (ReMat && (MRInfo & VirtRegMap::isMod))
+ return false;
+
+ return tii_->canFoldMemoryOperand(MI, FoldOps);
+}
+
+bool LiveIntervals::intervalIsInOneMBB(const LiveInterval &li) const {
+ SmallPtrSet<MachineBasicBlock*, 4> MBBs;
+ for (LiveInterval::Ranges::const_iterator
+ I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
+ std::vector<IdxMBBPair>::const_iterator II =
+ std::lower_bound(Idx2MBBMap.begin(), Idx2MBBMap.end(), I->start);
+ if (II == Idx2MBBMap.end())
+ continue;
+ if (I->end > II->first) // crossing a MBB.
+ return false;
+ MBBs.insert(II->second);
+ if (MBBs.size() > 1)
+ return false;
+ }
+ return true;
+}
+
+/// rewriteImplicitOps - Rewrite implicit use operands of MI (i.e. uses of
+/// interval on to-be re-materialized operands of MI) with the new register.
+void LiveIntervals::rewriteImplicitOps(const LiveInterval &li,
+ MachineInstr *MI, unsigned NewVReg,
+ VirtRegMap &vrm) {
+  // There is an implicit use. That means one of the other operands is
+  // being remat'ed and the remat'ed instruction has li.reg as a
+  // use operand. Make sure we rewrite that as well.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0 || TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ if (!vrm.isReMaterialized(Reg))
+ continue;
+ MachineInstr *ReMatMI = vrm.getReMaterializedMI(Reg);
+ MachineOperand *UseMO = ReMatMI->findRegisterUseOperand(li.reg);
+ if (UseMO)
+ UseMO->setReg(NewVReg);
+ }
+}
+
+/// rewriteInstructionForSpills, rewriteInstructionsForSpills - Helper functions
+/// for addIntervalsForSpills to rewrite uses / defs for the given live range.
+bool LiveIntervals::
+rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI,
+ bool TrySplit, unsigned index, unsigned end, MachineInstr *MI,
+ MachineInstr *ReMatOrigDefMI, MachineInstr *ReMatDefMI,
+ unsigned Slot, int LdSlot,
+ bool isLoad, bool isLoadSS, bool DefIsReMat, bool CanDelete,
+ VirtRegMap &vrm,
+ const TargetRegisterClass* rc,
+ SmallVector<int, 4> &ReMatIds,
+ const MachineLoopInfo *loopInfo,
+ unsigned &NewVReg, unsigned ImpUse, bool &HasDef, bool &HasUse,
+ DenseMap<unsigned,unsigned> &MBBVRegsMap,
+ std::vector<LiveInterval*> &NewLIs) {
+ bool CanFold = false;
+ RestartInstruction:
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+ MachineOperand& mop = MI->getOperand(i);
+ if (!mop.isReg())
+ continue;
+ unsigned Reg = mop.getReg();
+ unsigned RegI = Reg;
+ if (Reg == 0 || TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ if (Reg != li.reg)
+ continue;
+
+ bool TryFold = !DefIsReMat;
+ bool FoldSS = true; // Default behavior unless it's a remat.
+ int FoldSlot = Slot;
+ if (DefIsReMat) {
+ // If this is the rematerializable definition MI itself and
+ // all of its uses are rematerialized, simply delete it.
+ if (MI == ReMatOrigDefMI && CanDelete) {
+        DOUT << "\t\t\t\tErasing re-materializable def: ";
+ DOUT << MI << '\n';
+ RemoveMachineInstrFromMaps(MI);
+ vrm.RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
+ break;
+ }
+
+ // If def for this use can't be rematerialized, then try folding.
+ // If def is rematerializable and it's a load, also try folding.
+ TryFold = !ReMatDefMI || (ReMatDefMI && (MI == ReMatOrigDefMI || isLoad));
+ if (isLoad) {
+ // Try fold loads (from stack slot, constant pool, etc.) into uses.
+ FoldSS = isLoadSS;
+ FoldSlot = LdSlot;
+ }
+ }
+
+ // Scan all of the operands of this instruction rewriting operands
+ // to use NewVReg instead of li.reg as appropriate. We do this for
+ // two reasons:
+ //
+ // 1. If the instr reads the same spilled vreg multiple times, we
+ // want to reuse the NewVReg.
+ // 2. If the instr is a two-addr instruction, we are required to
+ // keep the src/dst regs pinned.
+ //
+ // Keep track of whether we replace a use and/or def so that we can
+ // create the spill interval with the appropriate range.
+
+ HasUse = mop.isUse();
+ HasDef = mop.isDef();
+ SmallVector<unsigned, 2> Ops;
+ Ops.push_back(i);
+ for (unsigned j = i+1, e = MI->getNumOperands(); j != e; ++j) {
+ const MachineOperand &MOj = MI->getOperand(j);
+ if (!MOj.isReg())
+ continue;
+ unsigned RegJ = MOj.getReg();
+ if (RegJ == 0 || TargetRegisterInfo::isPhysicalRegister(RegJ))
+ continue;
+ if (RegJ == RegI) {
+ Ops.push_back(j);
+ HasUse |= MOj.isUse();
+ HasDef |= MOj.isDef();
+ }
+ }
+
+ if (HasUse && !li.liveAt(getUseIndex(index)))
+ // Must be defined by an implicit def. It should not be spilled. Note,
+      // this is for correctness reasons. e.g.
+ // 8 %reg1024<def> = IMPLICIT_DEF
+ // 12 %reg1024<def> = INSERT_SUBREG %reg1024<kill>, %reg1025, 2
+      // The live range [12, 14) is not part of the r1024 live interval since
+      // it's defined by an implicit def. It will not conflict with the live
+      // interval of r1025. Now suppose both registers are spilled, you can
+      // easily see a situation where both registers are reloaded before
+      // the INSERT_SUBREG and the two target registers would overlap.
+ HasUse = false;
+
+ // Create a new virtual register for the spill interval.
+ // Create the new register now so we can map the fold instruction
+ // to the new register so when it is unfolded we get the correct
+ // answer.
+ bool CreatedNewVReg = false;
+ if (NewVReg == 0) {
+ NewVReg = mri_->createVirtualRegister(rc);
+ vrm.grow();
+ CreatedNewVReg = true;
+ }
+
+ if (!TryFold)
+ CanFold = false;
+ else {
+ // Do not fold load / store here if we are splitting. We'll find an
+ // optimal point to insert a load / store later.
+ if (!TrySplit) {
+ if (tryFoldMemoryOperand(MI, vrm, ReMatDefMI, index,
+ Ops, FoldSS, FoldSlot, NewVReg)) {
+ // Folding the load/store can completely change the instruction in
+ // unpredictable ways, rescan it from the beginning.
+
+ if (FoldSS) {
+ // We need to give the new vreg the same stack slot as the
+ // spilled interval.
+ vrm.assignVirt2StackSlot(NewVReg, FoldSlot);
+ }
+
+ HasUse = false;
+ HasDef = false;
+ CanFold = false;
+ if (isNotInMIMap(MI))
+ break;
+ goto RestartInstruction;
+ }
+ } else {
+ // We'll try to fold it later if it's profitable.
+ CanFold = canFoldMemoryOperand(MI, Ops, DefIsReMat);
+ }
+ }
+
+ mop.setReg(NewVReg);
+ if (mop.isImplicit())
+ rewriteImplicitOps(li, MI, NewVReg, vrm);
+
+ // Reuse NewVReg for other reads.
+ for (unsigned j = 0, e = Ops.size(); j != e; ++j) {
+ MachineOperand &mopj = MI->getOperand(Ops[j]);
+ mopj.setReg(NewVReg);
+ if (mopj.isImplicit())
+ rewriteImplicitOps(li, MI, NewVReg, vrm);
+ }
+
+ if (CreatedNewVReg) {
+ if (DefIsReMat) {
+ vrm.setVirtIsReMaterialized(NewVReg, ReMatDefMI/*, CanDelete*/);
+ if (ReMatIds[VNI->id] == VirtRegMap::MAX_STACK_SLOT) {
+ // Each valnum may have its own remat id.
+ ReMatIds[VNI->id] = vrm.assignVirtReMatId(NewVReg);
+ } else {
+ vrm.assignVirtReMatId(NewVReg, ReMatIds[VNI->id]);
+ }
+ if (!CanDelete || (HasUse && HasDef)) {
+ // If this is a two-addr instruction then its use operands are
+ // rematerializable but its def is not. It should be assigned a
+ // stack slot.
+ vrm.assignVirt2StackSlot(NewVReg, Slot);
+ }
+ } else {
+ vrm.assignVirt2StackSlot(NewVReg, Slot);
+ }
+ } else if (HasUse && HasDef &&
+ vrm.getStackSlot(NewVReg) == VirtRegMap::NO_STACK_SLOT) {
+ // If this interval hasn't been assigned a stack slot (because earlier
+ // def is a deleted remat def), do it now.
+ assert(Slot != VirtRegMap::NO_STACK_SLOT);
+ vrm.assignVirt2StackSlot(NewVReg, Slot);
+ }
+
+ // Re-matting an instruction with virtual register use. Add the
+ // register as an implicit use on the use MI.
+ if (DefIsReMat && ImpUse)
+ MI->addOperand(MachineOperand::CreateReg(ImpUse, false, true));
+
+ // Create a new register interval for this spill / remat.
+ LiveInterval &nI = getOrCreateInterval(NewVReg);
+ if (CreatedNewVReg) {
+ NewLIs.push_back(&nI);
+ MBBVRegsMap.insert(std::make_pair(MI->getParent()->getNumber(), NewVReg));
+ if (TrySplit)
+ vrm.setIsSplitFromReg(NewVReg, li.reg);
+ }
+
+ if (HasUse) {
+ if (CreatedNewVReg) {
+ LiveRange LR(getLoadIndex(index), getUseIndex(index)+1,
+ nI.getNextValue(~0U, 0, VNInfoAllocator));
+ DOUT << " +" << LR;
+ nI.addRange(LR);
+ } else {
+ // Extend the split live interval to this def / use.
+ unsigned End = getUseIndex(index)+1;
+ LiveRange LR(nI.ranges[nI.ranges.size()-1].end, End,
+ nI.getValNumInfo(nI.getNumValNums()-1));
+ DOUT << " +" << LR;
+ nI.addRange(LR);
+ }
+ }
+ if (HasDef) {
+ LiveRange LR(getDefIndex(index), getStoreIndex(index),
+ nI.getNextValue(~0U, 0, VNInfoAllocator));
+ DOUT << " +" << LR;
+ nI.addRange(LR);
+ }
+
+ DOUT << "\t\t\t\tAdded new interval: ";
+ nI.print(DOUT, tri_);
+ DOUT << '\n';
+ }
+ return CanFold;
+}
+
+bool LiveIntervals::anyKillInMBBAfterIdx(const LiveInterval &li,
+ const VNInfo *VNI,
+ MachineBasicBlock *MBB, unsigned Idx) const {
+ unsigned End = getMBBEndIdx(MBB);
+ for (unsigned j = 0, ee = VNI->kills.size(); j != ee; ++j) {
+ unsigned KillIdx = VNI->kills[j];
+ if (KillIdx > Idx && KillIdx < End)
+ return true;
+ }
+ return false;
+}
+
+/// RewriteInfo - Keep track of machine instrs that will be rewritten
+/// during spilling.
+namespace {
+ struct RewriteInfo {
+ unsigned Index;
+ MachineInstr *MI;
+ bool HasUse;
+ bool HasDef;
+ RewriteInfo(unsigned i, MachineInstr *mi, bool u, bool d)
+ : Index(i), MI(mi), HasUse(u), HasDef(d) {}
+ };
+
+ struct RewriteInfoCompare {
+ bool operator()(const RewriteInfo &LHS, const RewriteInfo &RHS) const {
+ return LHS.Index < RHS.Index;
+ }
+ };
+}
+
+void LiveIntervals::
+rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit,
+ LiveInterval::Ranges::const_iterator &I,
+ MachineInstr *ReMatOrigDefMI, MachineInstr *ReMatDefMI,
+ unsigned Slot, int LdSlot,
+ bool isLoad, bool isLoadSS, bool DefIsReMat, bool CanDelete,
+ VirtRegMap &vrm,
+ const TargetRegisterClass* rc,
+ SmallVector<int, 4> &ReMatIds,
+ const MachineLoopInfo *loopInfo,
+ BitVector &SpillMBBs,
+ DenseMap<unsigned, std::vector<SRInfo> > &SpillIdxes,
+ BitVector &RestoreMBBs,
+ DenseMap<unsigned, std::vector<SRInfo> > &RestoreIdxes,
+ DenseMap<unsigned,unsigned> &MBBVRegsMap,
+ std::vector<LiveInterval*> &NewLIs) {
+ bool AllCanFold = true;
+ unsigned NewVReg = 0;
+ unsigned start = getBaseIndex(I->start);
+ unsigned end = getBaseIndex(I->end-1) + InstrSlots::NUM;
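+  // The scan window is widened to whole-instruction boundaries; e.g.
+  // assuming InstrSlots::NUM == 4, a range [10, 19) is scanned as [8, 20).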
+
+ // First collect all the def / use in this live range that will be rewritten.
+ // Make sure they are sorted according to instruction index.
+ std::vector<RewriteInfo> RewriteMIs;
+ for (MachineRegisterInfo::reg_iterator ri = mri_->reg_begin(li.reg),
+ re = mri_->reg_end(); ri != re; ) {
+ MachineInstr *MI = &*ri;
+ MachineOperand &O = ri.getOperand();
+ ++ri;
+ assert(!O.isImplicit() && "Spilling register that's used as implicit use?");
+ unsigned index = getInstructionIndex(MI);
+ if (index < start || index >= end)
+ continue;
+ if (O.isUse() && !li.liveAt(getUseIndex(index)))
+ // Must be defined by an implicit def. It should not be spilled. Note,
+      // this is for correctness reasons. e.g.
+ // 8 %reg1024<def> = IMPLICIT_DEF
+ // 12 %reg1024<def> = INSERT_SUBREG %reg1024<kill>, %reg1025, 2
+      // The live range [12, 14) is not part of the r1024 live interval since
+      // it's defined by an implicit def. It will not conflict with the live
+      // interval of r1025. Now suppose both registers are spilled, you can
+      // easily see a situation where both registers are reloaded before
+      // the INSERT_SUBREG and the two target registers would overlap.
+ continue;
+ RewriteMIs.push_back(RewriteInfo(index, MI, O.isUse(), O.isDef()));
+ }
+ std::sort(RewriteMIs.begin(), RewriteMIs.end(), RewriteInfoCompare());
+
+ unsigned ImpUse = DefIsReMat ? getReMatImplicitUse(li, ReMatDefMI) : 0;
+ // Now rewrite the defs and uses.
+ for (unsigned i = 0, e = RewriteMIs.size(); i != e; ) {
+ RewriteInfo &rwi = RewriteMIs[i];
+ ++i;
+ unsigned index = rwi.Index;
+ bool MIHasUse = rwi.HasUse;
+ bool MIHasDef = rwi.HasDef;
+ MachineInstr *MI = rwi.MI;
+    // If MI defines and/or uses the same register multiple times, then
+    // there are multiple entries.
+ unsigned NumUses = MIHasUse;
+ while (i != e && RewriteMIs[i].MI == MI) {
+ assert(RewriteMIs[i].Index == index);
+ bool isUse = RewriteMIs[i].HasUse;
+ if (isUse) ++NumUses;
+ MIHasUse |= isUse;
+ MIHasDef |= RewriteMIs[i].HasDef;
+ ++i;
+ }
+ MachineBasicBlock *MBB = MI->getParent();
+
+ if (ImpUse && MI != ReMatDefMI) {
+ // Re-matting an instruction with virtual register use. Update the
+ // register interval's spill weight to HUGE_VALF to prevent it from
+ // being spilled.
+ LiveInterval &ImpLi = getInterval(ImpUse);
+ ImpLi.weight = HUGE_VALF;
+ }
+
+ unsigned MBBId = MBB->getNumber();
+ unsigned ThisVReg = 0;
+ if (TrySplit) {
+ DenseMap<unsigned,unsigned>::iterator NVI = MBBVRegsMap.find(MBBId);
+ if (NVI != MBBVRegsMap.end()) {
+ ThisVReg = NVI->second;
+ // One common case:
+ // x = use
+ // ...
+ // ...
+ // def = ...
+ // = use
+        // It's better to start a new interval to avoid artificially
+        // extending the new interval.
+ if (MIHasDef && !MIHasUse) {
+ MBBVRegsMap.erase(MBB->getNumber());
+ ThisVReg = 0;
+ }
+ }
+ }
+
+ bool IsNew = ThisVReg == 0;
+ if (IsNew) {
+ // This ends the previous live interval. If all of its def / use
+ // can be folded, give it a low spill weight.
+ if (NewVReg && TrySplit && AllCanFold) {
+ LiveInterval &nI = getOrCreateInterval(NewVReg);
+ nI.weight /= 10.0F;
+ }
+ AllCanFold = true;
+ }
+ NewVReg = ThisVReg;
+
+ bool HasDef = false;
+ bool HasUse = false;
+ bool CanFold = rewriteInstructionForSpills(li, I->valno, TrySplit,
+ index, end, MI, ReMatOrigDefMI, ReMatDefMI,
+ Slot, LdSlot, isLoad, isLoadSS, DefIsReMat,
+ CanDelete, vrm, rc, ReMatIds, loopInfo, NewVReg,
+ ImpUse, HasDef, HasUse, MBBVRegsMap, NewLIs);
+ if (!HasDef && !HasUse)
+ continue;
+
+ AllCanFold &= CanFold;
+
+ // Update weight of spill interval.
+ LiveInterval &nI = getOrCreateInterval(NewVReg);
+ if (!TrySplit) {
+ // The spill weight is now infinity as it cannot be spilled again.
+ nI.weight = HUGE_VALF;
+ continue;
+ }
+
+ // Keep track of the last def and first use in each MBB.
+ if (HasDef) {
+ if (MI != ReMatOrigDefMI || !CanDelete) {
+ bool HasKill = false;
+ if (!HasUse)
+ HasKill = anyKillInMBBAfterIdx(li, I->valno, MBB, getDefIndex(index));
+ else {
+        // If this is a two-address instruction, then this index starts a
+        // new VNInfo.
+ const VNInfo *VNI = li.findDefinedVNInfo(getDefIndex(index));
+ if (VNI)
+ HasKill = anyKillInMBBAfterIdx(li, VNI, MBB, getDefIndex(index));
+ }
+ DenseMap<unsigned, std::vector<SRInfo> >::iterator SII =
+ SpillIdxes.find(MBBId);
+ if (!HasKill) {
+ if (SII == SpillIdxes.end()) {
+ std::vector<SRInfo> S;
+ S.push_back(SRInfo(index, NewVReg, true));
+ SpillIdxes.insert(std::make_pair(MBBId, S));
+ } else if (SII->second.back().vreg != NewVReg) {
+ SII->second.push_back(SRInfo(index, NewVReg, true));
+ } else if ((int)index > SII->second.back().index) {
+ // If there is an earlier def and this is a two-address
+ // instruction, then it's not possible to fold the store (which
+ // would also fold the load).
+ SRInfo &Info = SII->second.back();
+ Info.index = index;
+ Info.canFold = !HasUse;
+ }
+ SpillMBBs.set(MBBId);
+ } else if (SII != SpillIdxes.end() &&
+ SII->second.back().vreg == NewVReg &&
+ (int)index > SII->second.back().index) {
+ // There is an earlier def that's not killed (must be two-address).
+ // The spill is no longer needed.
+ SII->second.pop_back();
+ if (SII->second.empty()) {
+ SpillIdxes.erase(MBBId);
+ SpillMBBs.reset(MBBId);
+ }
+ }
+ }
+ }
+
+ if (HasUse) {
+ DenseMap<unsigned, std::vector<SRInfo> >::iterator SII =
+ SpillIdxes.find(MBBId);
+ if (SII != SpillIdxes.end() &&
+ SII->second.back().vreg == NewVReg &&
+ (int)index > SII->second.back().index)
+ // There are use(s) following the last def; it's not safe to fold the spill.
+ SII->second.back().canFold = false;
+ DenseMap<unsigned, std::vector<SRInfo> >::iterator RII =
+ RestoreIdxes.find(MBBId);
+ if (RII != RestoreIdxes.end() && RII->second.back().vreg == NewVReg)
+ // If we are splitting live intervals, only fold if it's the first
+ // use and there isn't another use later in the MBB.
+ RII->second.back().canFold = false;
+ else if (IsNew) {
+ // Only need a reload if there isn't an earlier def / use.
+ if (RII == RestoreIdxes.end()) {
+ std::vector<SRInfo> Infos;
+ Infos.push_back(SRInfo(index, NewVReg, true));
+ RestoreIdxes.insert(std::make_pair(MBBId, Infos));
+ } else {
+ RII->second.push_back(SRInfo(index, NewVReg, true));
+ }
+ RestoreMBBs.set(MBBId);
+ }
+ }
+
+ // Update spill weight.
+ unsigned loopDepth = loopInfo->getLoopDepth(MBB);
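+ // (Heuristic sketch: in this revision getSpillWeight is understood to grow
+ // roughly as (HasDef + HasUse) * 10^loopDepth, so references inside deeper
+ // loops make the interval far more expensive to spill.)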
+ nI.weight += getSpillWeight(HasDef, HasUse, loopDepth);
+ }
+
+ if (NewVReg && TrySplit && AllCanFold) {
+ // If all of its defs / uses can be folded, give it a low spill weight.
+ LiveInterval &nI = getOrCreateInterval(NewVReg);
+ nI.weight /= 10.0F;
+ }
+}
+
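+/// alsoFoldARestore - Return true if a restore of vr is recorded at the given
+/// index in basic block Id and is still marked as foldable.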
+bool LiveIntervals::alsoFoldARestore(int Id, int index, unsigned vr,
+ BitVector &RestoreMBBs,
+ DenseMap<unsigned,std::vector<SRInfo> > &RestoreIdxes) {
+ if (!RestoreMBBs[Id])
+ return false;
+ std::vector<SRInfo> &Restores = RestoreIdxes[Id];
+ for (unsigned i = 0, e = Restores.size(); i != e; ++i)
+ if (Restores[i].index == index &&
+ Restores[i].vreg == vr &&
+ Restores[i].canFold)
+ return true;
+ return false;
+}
+
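+/// eraseRestoreInfo - Invalidate any restore of vr recorded at the given
+/// index in basic block Id, so that no reload is issued for it.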
+void LiveIntervals::eraseRestoreInfo(int Id, int index, unsigned vr,
+ BitVector &RestoreMBBs,
+ DenseMap<unsigned,std::vector<SRInfo> > &RestoreIdxes) {
+ if (!RestoreMBBs[Id])
+ return;
+ std::vector<SRInfo> &Restores = RestoreIdxes[Id];
+ for (unsigned i = 0, e = Restores.size(); i != e; ++i)
+ if (Restores[i].index == index && Restores[i].vreg == vr)
+ Restores[i].index = -1;
+}
+
+/// handleSpilledImpDefs - Remove IMPLICIT_DEF instructions which are being
+/// spilled and create empty intervals for their uses.
+void
+LiveIntervals::handleSpilledImpDefs(const LiveInterval &li, VirtRegMap &vrm,
+ const TargetRegisterClass* rc,
+ std::vector<LiveInterval*> &NewLIs) {
+ for (MachineRegisterInfo::reg_iterator ri = mri_->reg_begin(li.reg),
+ re = mri_->reg_end(); ri != re; ) {
+ MachineOperand &O = ri.getOperand();
+ MachineInstr *MI = &*ri;
+ ++ri;
+ if (O.isDef()) {
+ assert(MI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF &&
+ "Register def was not rewritten?");
+ RemoveMachineInstrFromMaps(MI);
+ vrm.RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
+ } else {
+ // This must be a use of an implicit_def, so it's not part of the live
+ // interval. Create a new empty live interval for it.
+ // FIXME: Can we simply erase some of the instructions? e.g. Stores?
+ unsigned NewVReg = mri_->createVirtualRegister(rc);
+ vrm.grow();
+ vrm.setIsImplicitlyDefined(NewVReg);
+ NewLIs.push_back(&getOrCreateInterval(NewVReg));
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == li.reg)
+ MO.setReg(NewVReg);
+ }
+ }
+ }
+}
+
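+/// addIntervalsForSpillsFast - Spill the interval the simple way: assign one
+/// stack slot, try to fold each referencing instruction into memory, and
+/// otherwise rewrite it to use a fresh register that is reloaded before uses
+/// and stored after defs.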
+std::vector<LiveInterval*> LiveIntervals::
+addIntervalsForSpillsFast(const LiveInterval &li,
+ const MachineLoopInfo *loopInfo,
+ VirtRegMap &vrm) {
+ unsigned slot = vrm.assignVirt2StackSlot(li.reg);
+
+ std::vector<LiveInterval*> added;
+
+ assert(li.weight != HUGE_VALF &&
+ "attempt to spill already spilled interval!");
+
+ DOUT << "\t\t\t\tadding intervals for spills for interval: ";
+ DEBUG(li.dump());
+ DOUT << '\n';
+
+ const TargetRegisterClass* rc = mri_->getRegClass(li.reg);
+
+ MachineRegisterInfo::reg_iterator RI = mri_->reg_begin(li.reg);
+ while (RI != mri_->reg_end()) {
+ MachineInstr* MI = &*RI;
+
+ SmallVector<unsigned, 2> Indices;
+ bool HasUse = false;
+ bool HasDef = false;
+
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+ MachineOperand& mop = MI->getOperand(i);
+ if (!mop.isReg() || mop.getReg() != li.reg) continue;
+
+ HasUse |= MI->getOperand(i).isUse();
+ HasDef |= MI->getOperand(i).isDef();
+
+ Indices.push_back(i);
+ }
+
+ if (!tryFoldMemoryOperand(MI, vrm, NULL, getInstructionIndex(MI),
+ Indices, true, slot, li.reg)) {
+ unsigned NewVReg = mri_->createVirtualRegister(rc);
+ vrm.grow();
+ vrm.assignVirt2StackSlot(NewVReg, slot);
+
+ // Create a new register for this spill.
+ LiveInterval &nI = getOrCreateInterval(NewVReg);
+
+ // The spill weight is now infinity, as it cannot be spilled again.
+ nI.weight = HUGE_VALF;
+
+ // Rewrite register operands to use the new vreg.
+ for (SmallVectorImpl<unsigned>::iterator I = Indices.begin(),
+ E = Indices.end(); I != E; ++I) {
+ MI->getOperand(*I).setReg(NewVReg);
+
+ if (MI->getOperand(*I).isUse())
+ MI->getOperand(*I).setIsKill(true);
+ }
+
+ // Fill in the new live interval.
+ unsigned index = getInstructionIndex(MI);
+ if (HasUse) {
+ LiveRange LR(getLoadIndex(index), getUseIndex(index),
+ nI.getNextValue(~0U, 0, getVNInfoAllocator()));
+ DOUT << " +" << LR;
+ nI.addRange(LR);
+ vrm.addRestorePoint(NewVReg, MI);
+ }
+ if (HasDef) {
+ LiveRange LR(getDefIndex(index), getStoreIndex(index),
+ nI.getNextValue(~0U, 0, getVNInfoAllocator()));
+ DOUT << " +" << LR;
+ nI.addRange(LR);
+ vrm.addSpillPoint(NewVReg, true, MI);
+ }
+
+ added.push_back(&nI);
+
+ DOUT << "\t\t\t\tadded new interval: ";
+ DEBUG(nI.dump());
+ DOUT << '\n';
+ }
+
+
+ RI = mri_->reg_begin(li.reg);
+ }
+
+ return added;
+}
+
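+/// addIntervalsForSpills - Spill li, possibly splitting it at basic block
+/// boundaries first, and return the new live intervals created for the
+/// rewritten defs and uses.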
+std::vector<LiveInterval*> LiveIntervals::
+addIntervalsForSpills(const LiveInterval &li,
+ SmallVectorImpl<LiveInterval*> &SpillIs,
+ const MachineLoopInfo *loopInfo, VirtRegMap &vrm) {
+
+ if (EnableFastSpilling)
+ return addIntervalsForSpillsFast(li, loopInfo, vrm);
+
+ assert(li.weight != HUGE_VALF &&
+ "attempt to spill already spilled interval!");
+
+ DOUT << "\t\t\t\tadding intervals for spills for interval: ";
+ li.print(DOUT, tri_);
+ DOUT << '\n';
+
+ // Each bit specifies whether a spill is required in the MBB.
+ BitVector SpillMBBs(mf_->getNumBlockIDs());
+ DenseMap<unsigned, std::vector<SRInfo> > SpillIdxes;
+ BitVector RestoreMBBs(mf_->getNumBlockIDs());
+ DenseMap<unsigned, std::vector<SRInfo> > RestoreIdxes;
+ DenseMap<unsigned,unsigned> MBBVRegsMap;
+ std::vector<LiveInterval*> NewLIs;
+ const TargetRegisterClass* rc = mri_->getRegClass(li.reg);
+
+ unsigned NumValNums = li.getNumValNums();
+ SmallVector<MachineInstr*, 4> ReMatDefs;
+ ReMatDefs.resize(NumValNums, NULL);
+ SmallVector<MachineInstr*, 4> ReMatOrigDefs;
+ ReMatOrigDefs.resize(NumValNums, NULL);
+ SmallVector<int, 4> ReMatIds;
+ ReMatIds.resize(NumValNums, VirtRegMap::MAX_STACK_SLOT);
+ BitVector ReMatDelete(NumValNums);
+ unsigned Slot = VirtRegMap::MAX_STACK_SLOT;
+
+ // Spilling a split live interval. It cannot be split any further. It is
+ // also guaranteed to be a single val# / range interval.
+ if (vrm.getPreSplitReg(li.reg)) {
+ vrm.setIsSplitFromReg(li.reg, 0);
+ // Unset the split kill marker on the last use.
+ unsigned KillIdx = vrm.getKillPoint(li.reg);
+ if (KillIdx) {
+ MachineInstr *KillMI = getInstructionFromIndex(KillIdx);
+ assert(KillMI && "Last use disappeared?");
+ int KillOp = KillMI->findRegisterUseOperandIdx(li.reg, true);
+ assert(KillOp != -1 && "Last use disappeared?");
+ KillMI->getOperand(KillOp).setIsKill(false);
+ }
+ vrm.removeKillPoint(li.reg);
+ bool DefIsReMat = vrm.isReMaterialized(li.reg);
+ Slot = vrm.getStackSlot(li.reg);
+ assert(Slot != VirtRegMap::MAX_STACK_SLOT);
+ MachineInstr *ReMatDefMI = DefIsReMat ?
+ vrm.getReMaterializedMI(li.reg) : NULL;
+ int LdSlot = 0;
+ bool isLoadSS = DefIsReMat && tii_->isLoadFromStackSlot(ReMatDefMI, LdSlot);
+ bool isLoad = isLoadSS ||
+ (DefIsReMat && (ReMatDefMI->getDesc().canFoldAsLoad()));
+ bool IsFirstRange = true;
+ for (LiveInterval::Ranges::const_iterator
+ I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
+ // If this is a split live interval with multiple ranges, it means there
+ // are two-address instructions that re-defined the value. Only the
+ // first def can be rematerialized!
+ if (IsFirstRange) {
+ // Note ReMatOrigDefMI has already been deleted.
+ rewriteInstructionsForSpills(li, false, I, NULL, ReMatDefMI,
+ Slot, LdSlot, isLoad, isLoadSS, DefIsReMat,
+ false, vrm, rc, ReMatIds, loopInfo,
+ SpillMBBs, SpillIdxes, RestoreMBBs, RestoreIdxes,
+ MBBVRegsMap, NewLIs);
+ } else {
+ rewriteInstructionsForSpills(li, false, I, NULL, 0,
+ Slot, 0, false, false, false,
+ false, vrm, rc, ReMatIds, loopInfo,
+ SpillMBBs, SpillIdxes, RestoreMBBs, RestoreIdxes,
+ MBBVRegsMap, NewLIs);
+ }
+ IsFirstRange = false;
+ }
+
+ handleSpilledImpDefs(li, vrm, rc, NewLIs);
+ return NewLIs;
+ }
+
+ bool TrySplit = SplitAtBB && !intervalIsInOneMBB(li);
+ if (SplitLimit != -1 && (int)numSplits >= SplitLimit)
+ TrySplit = false;
+ if (TrySplit)
+ ++numSplits;
+ bool NeedStackSlot = false;
+ for (LiveInterval::const_vni_iterator i = li.vni_begin(), e = li.vni_end();
+ i != e; ++i) {
+ const VNInfo *VNI = *i;
+ unsigned VN = VNI->id;
+ unsigned DefIdx = VNI->def;
+ if (DefIdx == ~1U)
+ continue; // Dead val#.
+ // Is the def for the val# rematerializable?
+ MachineInstr *ReMatDefMI = (DefIdx == ~0u)
+ ? 0 : getInstructionFromIndex(DefIdx);
+ bool dummy;
+ if (ReMatDefMI && isReMaterializable(li, VNI, ReMatDefMI, SpillIs, dummy)) {
+ // Remember how to remat the def of this val#.
+ ReMatOrigDefs[VN] = ReMatDefMI;
+ // Original def may be modified so we have to make a copy here.
+ MachineInstr *Clone = mf_->CloneMachineInstr(ReMatDefMI);
+ ClonedMIs.push_back(Clone);
+ ReMatDefs[VN] = Clone;
+
+ bool CanDelete = true;
+ if (VNI->hasPHIKill) {
+ // A kill is a phi node; not all of its uses can be rematerialized.
+ // It must not be deleted.
+ CanDelete = false;
+ // Need a stack slot if there is any live range where uses cannot be
+ // rematerialized.
+ NeedStackSlot = true;
+ }
+ if (CanDelete)
+ ReMatDelete.set(VN);
+ } else {
+ // Need a stack slot if there is any live range where uses cannot be
+ // rematerialized.
+ NeedStackSlot = true;
+ }
+ }
+
+ // One stack slot per live interval.
+ if (NeedStackSlot && vrm.getPreSplitReg(li.reg) == 0) {
+ if (vrm.getStackSlot(li.reg) == VirtRegMap::NO_STACK_SLOT)
+ Slot = vrm.assignVirt2StackSlot(li.reg);
+
+ // This case only occurs when the prealloc splitter has already assigned
+ // a stack slot to this vreg.
+ else
+ Slot = vrm.getStackSlot(li.reg);
+ }
+
+ // Create new intervals and rewrite defs and uses.
+ for (LiveInterval::Ranges::const_iterator
+ I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
+ MachineInstr *ReMatDefMI = ReMatDefs[I->valno->id];
+ MachineInstr *ReMatOrigDefMI = ReMatOrigDefs[I->valno->id];
+ bool DefIsReMat = ReMatDefMI != NULL;
+ bool CanDelete = ReMatDelete[I->valno->id];
+ int LdSlot = 0;
+ bool isLoadSS = DefIsReMat && tii_->isLoadFromStackSlot(ReMatDefMI, LdSlot);
+ bool isLoad = isLoadSS ||
+ (DefIsReMat && ReMatDefMI->getDesc().canFoldAsLoad());
+ rewriteInstructionsForSpills(li, TrySplit, I, ReMatOrigDefMI, ReMatDefMI,
+ Slot, LdSlot, isLoad, isLoadSS, DefIsReMat,
+ CanDelete, vrm, rc, ReMatIds, loopInfo,
+ SpillMBBs, SpillIdxes, RestoreMBBs, RestoreIdxes,
+ MBBVRegsMap, NewLIs);
+ }
+
+ // Insert spills / restores if we are splitting.
+ if (!TrySplit) {
+ handleSpilledImpDefs(li, vrm, rc, NewLIs);
+ return NewLIs;
+ }
+
+ SmallPtrSet<LiveInterval*, 4> AddedKill;
+ SmallVector<unsigned, 2> Ops;
+ if (NeedStackSlot) {
+ int Id = SpillMBBs.find_first();
+ while (Id != -1) {
+ std::vector<SRInfo> &spills = SpillIdxes[Id];
+ for (unsigned i = 0, e = spills.size(); i != e; ++i) {
+ int index = spills[i].index;
+ unsigned VReg = spills[i].vreg;
+ LiveInterval &nI = getOrCreateInterval(VReg);
+ bool isReMat = vrm.isReMaterialized(VReg);
+ MachineInstr *MI = getInstructionFromIndex(index);
+ bool CanFold = false;
+ bool FoundUse = false;
+ Ops.clear();
+ if (spills[i].canFold) {
+ CanFold = true;
+ for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
+ MachineOperand &MO = MI->getOperand(j);
+ if (!MO.isReg() || MO.getReg() != VReg)
+ continue;
+
+ Ops.push_back(j);
+ if (MO.isDef())
+ continue;
+ if (isReMat ||
+ (!FoundUse && !alsoFoldARestore(Id, index, VReg,
+ RestoreMBBs, RestoreIdxes))) {
+ // MI has two-address uses of the same register. If the use
+ // isn't the first and only use in the BB, then we can't fold
+ // it. FIXME: Move this to rewriteInstructionsForSpills.
+ CanFold = false;
+ break;
+ }
+ FoundUse = true;
+ }
+ }
+ // Fold the store into the def if possible.
+ bool Folded = false;
+ if (CanFold && !Ops.empty()) {
+ if (tryFoldMemoryOperand(MI, vrm, NULL, index, Ops, true, Slot,VReg)){
+ Folded = true;
+ if (FoundUse) {
+ // Also folded uses, do not issue a load.
+ eraseRestoreInfo(Id, index, VReg, RestoreMBBs, RestoreIdxes);
+ nI.removeRange(getLoadIndex(index), getUseIndex(index)+1);
+ }
+ nI.removeRange(getDefIndex(index), getStoreIndex(index));
+ }
+ }
+
+ // Otherwise tell the spiller to issue a spill.
+ if (!Folded) {
+ LiveRange *LR = &nI.ranges[nI.ranges.size()-1];
+ bool isKill = LR->end == getStoreIndex(index);
+ if (!MI->registerDefIsDead(nI.reg))
+ // No need to spill a dead def.
+ vrm.addSpillPoint(VReg, isKill, MI);
+ if (isKill)
+ AddedKill.insert(&nI);
+ }
+ }
+ Id = SpillMBBs.find_next(Id);
+ }
+ }
+
+ int Id = RestoreMBBs.find_first();
+ while (Id != -1) {
+ std::vector<SRInfo> &restores = RestoreIdxes[Id];
+ for (unsigned i = 0, e = restores.size(); i != e; ++i) {
+ int index = restores[i].index;
+ if (index == -1)
+ continue;
+ unsigned VReg = restores[i].vreg;
+ LiveInterval &nI = getOrCreateInterval(VReg);
+ bool isReMat = vrm.isReMaterialized(VReg);
+ MachineInstr *MI = getInstructionFromIndex(index);
+ bool CanFold = false;
+ Ops.clear();
+ if (restores[i].canFold) {
+ CanFold = true;
+ for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
+ MachineOperand &MO = MI->getOperand(j);
+ if (!MO.isReg() || MO.getReg() != VReg)
+ continue;
+
+ if (MO.isDef()) {
+ // If this restore were to be folded, it would have been folded
+ // already.
+ CanFold = false;
+ break;
+ }
+ Ops.push_back(j);
+ }
+ }
+
+ // Fold the load into the use if possible.
+ bool Folded = false;
+ if (CanFold && !Ops.empty()) {
+ if (!isReMat)
+ Folded = tryFoldMemoryOperand(MI, vrm, NULL,index,Ops,true,Slot,VReg);
+ else {
+ MachineInstr *ReMatDefMI = vrm.getReMaterializedMI(VReg);
+ int LdSlot = 0;
+ bool isLoadSS = tii_->isLoadFromStackSlot(ReMatDefMI, LdSlot);
+ // If the rematerializable def is a load, also try to fold it.
+ if (isLoadSS || ReMatDefMI->getDesc().canFoldAsLoad())
+ Folded = tryFoldMemoryOperand(MI, vrm, ReMatDefMI, index,
+ Ops, isLoadSS, LdSlot, VReg);
+ if (!Folded) {
+ unsigned ImpUse = getReMatImplicitUse(li, ReMatDefMI);
+ if (ImpUse) {
+ // Re-matting an instruction with virtual register use. Add the
+ // register as an implicit use on the use MI and update the register
+ // interval's spill weight to HUGE_VALF to prevent it from being
+ // spilled.
+ LiveInterval &ImpLi = getInterval(ImpUse);
+ ImpLi.weight = HUGE_VALF;
+ MI->addOperand(MachineOperand::CreateReg(ImpUse, false, true));
+ }
+ }
+ }
+ }
+ // If folding is not possible / failed, then tell the spiller to issue a
+ // load / rematerialization for us.
+ if (Folded)
+ nI.removeRange(getLoadIndex(index), getUseIndex(index)+1);
+ else
+ vrm.addRestorePoint(VReg, MI);
+ }
+ Id = RestoreMBBs.find_next(Id);
+ }
+
+ // Finalize intervals: add kills, finalize spill weights, and filter out
+ // dead intervals.
+ std::vector<LiveInterval*> RetNewLIs;
+ for (unsigned i = 0, e = NewLIs.size(); i != e; ++i) {
+ LiveInterval *LI = NewLIs[i];
+ if (!LI->empty()) {
+ LI->weight /= InstrSlots::NUM * getApproximateInstructionCount(*LI);
+ if (!AddedKill.count(LI)) {
+ LiveRange *LR = &LI->ranges[LI->ranges.size()-1];
+ unsigned LastUseIdx = getBaseIndex(LR->end);
+ MachineInstr *LastUse = getInstructionFromIndex(LastUseIdx);
+ int UseIdx = LastUse->findRegisterUseOperandIdx(LI->reg, false);
+ assert(UseIdx != -1);
+ if (!LastUse->isRegTiedToDefOperand(UseIdx)) {
+ LastUse->getOperand(UseIdx).setIsKill();
+ vrm.addKillPoint(LI->reg, LastUseIdx);
+ }
+ }
+ RetNewLIs.push_back(LI);
+ }
+ }
+
+ handleSpilledImpDefs(li, vrm, rc, RetNewLIs);
+ return RetNewLIs;
+}
+
+/// hasAllocatableSuperReg - Return true if the specified physical register has
+/// any super register that's allocatable.
+bool LiveIntervals::hasAllocatableSuperReg(unsigned Reg) const {
+ for (const unsigned* AS = tri_->getSuperRegisters(Reg); *AS; ++AS)
+ if (allocatableRegs_[*AS] && hasInterval(*AS))
+ return true;
+ return false;
+}
+
+/// getRepresentativeReg - Find the largest super register of the specified
+/// physical register.
+unsigned LiveIntervals::getRepresentativeReg(unsigned Reg) const {
+ // Find the largest super-register that is allocatable.
+ unsigned BestReg = Reg;
+ for (const unsigned* AS = tri_->getSuperRegisters(Reg); *AS; ++AS) {
+ unsigned SuperReg = *AS;
+ if (!hasAllocatableSuperReg(SuperReg) && hasInterval(SuperReg)) {
+ BestReg = SuperReg;
+ break;
+ }
+ }
+ return BestReg;
+}
+
+/// getNumConflictsWithPhysReg - Return the number of uses and defs of the
+/// specified interval that conflict with the specified physical register.
+unsigned LiveIntervals::getNumConflictsWithPhysReg(const LiveInterval &li,
+ unsigned PhysReg) const {
+ unsigned NumConflicts = 0;
+ const LiveInterval &pli = getInterval(getRepresentativeReg(PhysReg));
+ for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li.reg),
+ E = mri_->reg_end(); I != E; ++I) {
+ MachineOperand &O = I.getOperand();
+ MachineInstr *MI = O.getParent();
+ unsigned Index = getInstructionIndex(MI);
+ if (pli.liveAt(Index))
+ ++NumConflicts;
+ }
+ return NumConflicts;
+}
+
+/// spillPhysRegAroundRegDefsUses - Spill the specified physical register
+/// around all defs and uses of the specified interval. Return true if it
+/// was able to cut its interval.
+bool LiveIntervals::spillPhysRegAroundRegDefsUses(const LiveInterval &li,
+ unsigned PhysReg, VirtRegMap &vrm) {
+ unsigned SpillReg = getRepresentativeReg(PhysReg);
+
+ for (const unsigned *AS = tri_->getAliasSet(PhysReg); *AS; ++AS)
+ // If there are registers which alias PhysReg but which are not
+ // sub-registers of the chosen representative super register, assert,
+ // since we can't handle that case yet.
+ assert(*AS == SpillReg || !allocatableRegs_[*AS] || !hasInterval(*AS) ||
+ tri_->isSuperRegister(*AS, SpillReg));
+
+ bool Cut = false;
+ LiveInterval &pli = getInterval(SpillReg);
+ SmallPtrSet<MachineInstr*, 8> SeenMIs;
+ for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li.reg),
+ E = mri_->reg_end(); I != E; ++I) {
+ MachineOperand &O = I.getOperand();
+ MachineInstr *MI = O.getParent();
+ if (SeenMIs.count(MI))
+ continue;
+ SeenMIs.insert(MI);
+ unsigned Index = getInstructionIndex(MI);
+ if (pli.liveAt(Index)) {
+ vrm.addEmergencySpill(SpillReg, MI);
+ unsigned StartIdx = getLoadIndex(Index);
+ unsigned EndIdx = getStoreIndex(Index)+1;
+ if (pli.isInOneLiveRange(StartIdx, EndIdx)) {
+ pli.removeRange(StartIdx, EndIdx);
+ Cut = true;
+ } else {
+ cerr << "Ran out of registers during register allocation!\n";
+ if (MI->getOpcode() == TargetInstrInfo::INLINEASM) {
+ cerr << "Please check your inline asm statement for invalid "
+ << "constraints:\n";
+ MI->print(cerr.stream(), tm_);
+ }
+ exit(1);
+ }
+ for (const unsigned* AS = tri_->getSubRegisters(SpillReg); *AS; ++AS) {
+ if (!hasInterval(*AS))
+ continue;
+ LiveInterval &spli = getInterval(*AS);
+ if (spli.liveAt(Index))
+ spli.removeRange(getLoadIndex(Index), getStoreIndex(Index)+1);
+ }
+ }
+ }
+ return Cut;
+}
+
+LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg,
+ MachineInstr* startInst) {
+ LiveInterval& Interval = getOrCreateInterval(reg);
+ VNInfo* VN = Interval.getNextValue(
+ getInstructionIndex(startInst) + InstrSlots::DEF,
+ startInst, getVNInfoAllocator());
+ VN->hasPHIKill = true;
+ VN->kills.push_back(getMBBEndIdx(startInst->getParent()));
+ LiveRange LR(getInstructionIndex(startInst) + InstrSlots::DEF,
+ getMBBEndIdx(startInst->getParent()) + 1, VN);
+ Interval.addRange(LR);
+
+ return LR;
+}
diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp
new file mode 100644
index 0000000..86f7ea2
--- /dev/null
+++ b/lib/CodeGen/LiveStackAnalysis.cpp
@@ -0,0 +1,66 @@
+//===-- LiveStackAnalysis.cpp - Live Stack Slot Analysis ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the live stack slot analysis pass. It is analogous to
+// live interval analysis except it's analyzing liveness of stack slots rather
+// than registers.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "livestacks"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <limits>
+using namespace llvm;
+
+char LiveStacks::ID = 0;
+static RegisterPass<LiveStacks> X("livestacks", "Live Stack Slot Analysis");
+
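+/// scaleNumbering - Rescale each stack slot interval after the instruction
+/// numbering of the function has been scaled by the given factor.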
+void LiveStacks::scaleNumbering(int factor) {
+ // Scale the intervals.
+ for (iterator LI = begin(), LE = end(); LI != LE; ++LI) {
+ LI->second.scaleNumbering(factor);
+ }
+}
+
+void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LiveStacks::releaseMemory() {
+ // Release VNInfo memory regions after all VNInfo objects are dtor'd.
+ VNInfoAllocator.Reset();
+ S2IMap.clear();
+ S2RCMap.clear();
+}
+
+bool LiveStacks::runOnMachineFunction(MachineFunction &) {
+ // FIXME: No analysis is being done right now. We are relying on the
+ // register allocators to provide the information.
+ return false;
+}
+
+/// print - Implement the dump method.
+void LiveStacks::print(std::ostream &O, const Module*) const {
+ O << "********** INTERVALS **********\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ I->second.print(O);
+ int Slot = I->first;
+ const TargetRegisterClass *RC = getIntervalRegClass(Slot);
+ if (RC)
+ O << " [" << RC->getName() << "]\n";
+ else
+ O << " [Unknown]\n";
+ }
+}
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
new file mode 100644
index 0000000..c33d81e
--- /dev/null
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -0,0 +1,695 @@
+//===-- LiveVariables.cpp - Live Variable Analysis for Machine Code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveVariable analysis pass. For each machine
+// instruction in the function, this pass calculates the set of registers that
+// are immediately dead after the instruction (i.e., the instruction calculates
+// the value, but it is never used) and the set of registers that are used by
+// the instruction, but are never used after the instruction (i.e., they are
+// killed).
+//
+// This class computes live variables using a sparse implementation based on
+// the machine code SSA form. This class computes live variable information for
+// each virtual and _register allocatable_ physical register in a function. It
+// uses the dominance properties of SSA form to efficiently compute live
+// variables for virtual registers, and assumes that physical registers are only
+// live within a single basic block (allowing it to do a single local analysis
+// to resolve physical register lifetimes in each basic block). If a physical
+// register is not register allocatable, it is not tracked. This is useful for
+// things like the stack pointer and condition codes.
+//
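+// For example (an illustrative sketch in pseudo machine code, not any real
+// target's syntax):
+//
+//   t1 = add a, b     ; if a and b have no later uses, both are killed here
+//   t2 = mul t1, t1   ; last use of t1, so t1 is killed here
+//   ...               ; if t2 is never read, its def above is marked dead
+//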
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Config/alloca.h"
+#include <algorithm>
+using namespace llvm;
+
+char LiveVariables::ID = 0;
+static RegisterPass<LiveVariables> X("livevars", "Live Variable Analysis");
+
+
+void LiveVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(UnreachableMachineBlockElimID);
+ AU.setPreservesAll();
+}
+
+void LiveVariables::VarInfo::dump() const {
+ cerr << " Alive in blocks: ";
+ for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
+ E = AliveBlocks.end(); I != E; ++I)
+ cerr << *I << ", ";
+ cerr << "\n Killed by:";
+ if (Kills.empty())
+ cerr << " No instructions.\n";
+ else {
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+ cerr << "\n #" << i << ": " << *Kills[i];
+ cerr << "\n";
+ }
+}
+
+/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
+LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) {
+ assert(TargetRegisterInfo::isVirtualRegister(RegIdx) &&
+ "getVarInfo: not a virtual register!");
+ RegIdx -= TargetRegisterInfo::FirstVirtualRegister;
+ if (RegIdx >= VirtRegInfo.size()) {
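+ // Grow geometrically (at least doubling) so that creating VarInfo
+ // objects for N virtual registers stays amortized O(N).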
+ if (RegIdx >= 2*VirtRegInfo.size())
+ VirtRegInfo.resize(RegIdx*2);
+ else
+ VirtRegInfo.resize(2*VirtRegInfo.size());
+ }
+ return VirtRegInfo[RegIdx];
+}
+
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo& VRInfo,
+ MachineBasicBlock *DefBlock,
+ MachineBasicBlock *MBB,
+ std::vector<MachineBasicBlock*> &WorkList) {
+ unsigned BBNum = MBB->getNumber();
+
+ // Check to see if this basic block is one of the killing blocks. If so,
+ // remove it.
+ for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+ if (VRInfo.Kills[i]->getParent() == MBB) {
+ VRInfo.Kills.erase(VRInfo.Kills.begin()+i); // Erase entry
+ break;
+ }
+
+ if (MBB == DefBlock) return; // Terminate recursion
+
+ if (VRInfo.AliveBlocks.test(BBNum))
+ return; // We already know the block is live
+
+ // Mark the variable known alive in this bb
+ VRInfo.AliveBlocks.set(BBNum);
+
+ for (MachineBasicBlock::const_pred_reverse_iterator PI = MBB->pred_rbegin(),
+ E = MBB->pred_rend(); PI != E; ++PI)
+ WorkList.push_back(*PI);
+}
+
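+/// MarkVirtRegAliveInBlock - Drive the marking above with an explicit
+/// worklist; this avoids deep recursion on long predecessor chains.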
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo &VRInfo,
+ MachineBasicBlock *DefBlock,
+ MachineBasicBlock *MBB) {
+ std::vector<MachineBasicBlock*> WorkList;
+ MarkVirtRegAliveInBlock(VRInfo, DefBlock, MBB, WorkList);
+
+ while (!WorkList.empty()) {
+ MachineBasicBlock *Pred = WorkList.back();
+ WorkList.pop_back();
+ MarkVirtRegAliveInBlock(VRInfo, DefBlock, Pred, WorkList);
+ }
+}
+
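+/// HandleVirtRegUse - Update the VarInfo for a use of the given virtual
+/// register: either extend the live range to this new kill or mark the
+/// predecessor blocks as live back toward the def.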
+void LiveVariables::HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB,
+ MachineInstr *MI) {
+ assert(MRI->getVRegDef(reg) && "Register use before def!");
+
+ unsigned BBNum = MBB->getNumber();
+
+ VarInfo& VRInfo = getVarInfo(reg);
+ VRInfo.NumUses++;
+
+ // Check to see if this basic block is already a kill block.
+ if (!VRInfo.Kills.empty() && VRInfo.Kills.back()->getParent() == MBB) {
+ // Yes, this register is killed in this basic block already. Increase the
+ // live range by updating the kill instruction.
+ VRInfo.Kills.back() = MI;
+ return;
+ }
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+ assert(VRInfo.Kills[i]->getParent() != MBB && "entry should be at end!");
+#endif
+
+ // This situation can occur:
+ //
+ // ,------.
+ // | |
+ // | v
+ // | t2 = phi ... t1 ...
+ // | |
+ // | v
+ // | t1 = ...
+ // | ... = ... t1 ...
+ // | |
+ // `------'
+ //
+ // where there is a use in a PHI node that's a predecessor to the defining
+ // block. We don't want to mark all predecessors as having the value "alive"
+ // in this case.
+ if (MBB == MRI->getVRegDef(reg)->getParent()) return;
+
+ // Add a new kill entry for this basic block. If this virtual register is
+ // already marked as alive in this basic block, that means it is alive in at
+ // least one of the successor blocks, so it's not a kill.
+ if (!VRInfo.AliveBlocks.test(BBNum))
+ VRInfo.Kills.push_back(MI);
+
+ // Update all dominating blocks to mark them as "known live".
+ for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+ E = MBB->pred_end(); PI != E; ++PI)
+ MarkVirtRegAliveInBlock(VRInfo, MRI->getVRegDef(reg)->getParent(), *PI);
+}
+
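+/// HandleVirtRegDef - Record a def of the given virtual register. If the
+/// register is not marked alive in any block, the def tentatively becomes
+/// its own kill (i.e., it is dead).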
+void LiveVariables::HandleVirtRegDef(unsigned Reg, MachineInstr *MI) {
+ VarInfo &VRInfo = getVarInfo(Reg);
+
+ if (VRInfo.AliveBlocks.empty())
+ // If vr is not alive in any block, then it defaults to dead.
+ VRInfo.Kills.push_back(MI);
+}
+
+/// FindLastPartialDef - Return the last partial def of the specified register.
+/// Also returns the sub-register that's defined.
+MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
+ unsigned &PartDefReg) {
+ unsigned LastDefReg = 0;
+ unsigned LastDefDist = 0;
+ MachineInstr *LastDef = NULL;
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs) {
+ MachineInstr *Def = PhysRegDef[SubReg];
+ if (!Def)
+ continue;
+ unsigned Dist = DistanceMap[Def];
+ if (Dist > LastDefDist) {
+ LastDefReg = SubReg;
+ LastDef = Def;
+ LastDefDist = Dist;
+ }
+ }
+ PartDefReg = LastDefReg;
+ return LastDef;
+}
+
+/// HandlePhysRegUse - Turn previous partial def's into read/mod/writes. Add
+/// implicit defs to a machine instruction if there was an earlier def of its
+/// super-register.
+void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
+ // If there was a previous use or a "full" def, all is well.
+ if (!PhysRegDef[Reg] && !PhysRegUse[Reg]) {
+ // Otherwise, the last sub-register def implicitly defines this register.
+ // e.g.
+ // AH =
+ // AL = ... <imp-def EAX>, <imp-kill AH>
+ // = AH
+ // ...
+ // = EAX
+ // All of the sub-registers must have been defined before the use of Reg!
+ unsigned PartDefReg = 0;
+ MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefReg);
+ // If LastPartialDef is NULL, it must be using a livein register.
+ if (LastPartialDef) {
+ LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
+ true/*IsImp*/));
+ PhysRegDef[Reg] = LastPartialDef;
+ SmallSet<unsigned, 8> Processed;
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs) {
+ if (Processed.count(SubReg))
+ continue;
+ if (SubReg == PartDefReg || TRI->isSubRegister(PartDefReg, SubReg))
+ continue;
+ // This part of Reg was defined before the last partial def. It's killed
+ // here.
+ LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg,
+ false/*IsDef*/,
+ true/*IsImp*/));
+ PhysRegDef[SubReg] = LastPartialDef;
+ for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ Processed.insert(*SS);
+ }
+ }
+ }
+
+ // There was an earlier def of a super-register. Add implicit def to that MI.
+ //
+ // A: EAX = ...
+ // B: ... = AX
+ //
+ // Add implicit def to A if there isn't a use of AX (or EAX) before B.
+ if (!PhysRegUse[Reg]) {
+ MachineInstr *Def = PhysRegDef[Reg];
+ if (Def && !Def->modifiesRegister(Reg))
+ Def->addOperand(MachineOperand::CreateReg(Reg,
+ true /*IsDef*/,
+ true /*IsImp*/));
+ }
+
+ // Remember this use.
+ PhysRegUse[Reg] = MI;
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs)
+ PhysRegUse[SubReg] = MI;
+}
+
+/// hasRegisterUseBelow - Return true if the specified register is used after
+/// the current instruction and before its next definition.
+bool LiveVariables::hasRegisterUseBelow(unsigned Reg,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock *MBB) {
+ if (I == MBB->end())
+ return false;
+
+ // First find out if there are any uses / defs below.
+ bool hasDistInfo = true;
+ unsigned CurDist = DistanceMap[I];
+ SmallVector<MachineInstr*, 4> Uses;
+ SmallVector<MachineInstr*, 4> Defs;
+ for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(Reg),
+ RE = MRI->reg_end(); RI != RE; ++RI) {
+ MachineOperand &UDO = RI.getOperand();
+ MachineInstr *UDMI = &*RI;
+ if (UDMI->getParent() != MBB)
+ continue;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UDMI);
+ bool isBelow = false;
+ if (DI == DistanceMap.end()) {
+ // Must be below if it hasn't been assigned a distance yet.
+ isBelow = true;
+ hasDistInfo = false;
+ } else if (DI->second > CurDist)
+ isBelow = true;
+ if (isBelow) {
+ if (UDO.isUse())
+ Uses.push_back(UDMI);
+ if (UDO.isDef())
+ Defs.push_back(UDMI);
+ }
+ }
+
+ if (Uses.empty())
+ // No uses below.
+ return false;
+ if (Defs.empty())
+ // There are uses below but no defs below.
+ return true;
+ // There are both uses and defs below. We need to know which comes first.
+ if (!hasDistInfo) {
+ // Complete DistanceMap for this MBB. This information is computed only
+ // once per MBB.
+ ++I;
+ ++CurDist;
+ for (MachineBasicBlock::iterator E = MBB->end(); I != E; ++I, ++CurDist)
+ DistanceMap.insert(std::make_pair(I, CurDist));
+ }
+
+ unsigned EarliestUse = DistanceMap[Uses[0]];
+ for (unsigned i = 1, e = Uses.size(); i != e; ++i) {
+ unsigned Dist = DistanceMap[Uses[i]];
+ if (Dist < EarliestUse)
+ EarliestUse = Dist;
+ }
+ for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+ unsigned Dist = DistanceMap[Defs[i]];
+ if (Dist < EarliestUse)
+ // The register is defined before its first use below.
+ return false;
+ }
+ return true;
+}
+
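+/// HandlePhysRegKill - Add kill / dead flags at the last reference of Reg or
+/// of any of its sub-registers. Returns false if no part of Reg is live.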
+bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
+ if (!PhysRegUse[Reg] && !PhysRegDef[Reg])
+ return false;
+
+ MachineInstr *LastRefOrPartRef = PhysRegUse[Reg]
+ ? PhysRegUse[Reg] : PhysRegDef[Reg];
+ unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+ // The whole register is used.
+ // AL =
+ // AH =
+ //
+ // = AX
+ // = AL, AX<imp-use, kill>
+ // AX =
+ //
+ // Or whole register is defined, but not used at all.
+ // AX<dead> =
+ // ...
+ // AX =
+ //
+ // Or whole register is defined, but only partly used.
+ // AX<dead> = AL<imp-def>
+ // = AL<kill>
+ // AX =
+ SmallSet<unsigned, 8> PartUses;
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs) {
+ if (MachineInstr *Use = PhysRegUse[SubReg]) {
+ PartUses.insert(SubReg);
+ for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ PartUses.insert(*SS);
+ unsigned Dist = DistanceMap[Use];
+ if (Dist > LastRefOrPartRefDist) {
+ LastRefOrPartRefDist = Dist;
+ LastRefOrPartRef = Use;
+ }
+ }
+ }
+
+ if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI)
+ // If the last reference is the last def, then it's not used at all.
+ // That is, unless we are currently processing the last reference itself.
+ LastRefOrPartRef->addRegisterDead(Reg, TRI, true);
+
+ /* Partial uses. Mark register def dead and add implicit def of
+ sub-registers which are used.
+ FIXME: LiveIntervalAnalysis can't handle this yet!
+ EAX<dead> = op AL<imp-def>
+ That is, the EAX def is dead but the AL def extends past it.
+ Enable this after live interval analysis is fixed to improve codegen!
+ else if (!PhysRegUse[Reg]) {
+ PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true);
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs) {
+ if (PartUses.count(SubReg)) {
+ PhysRegDef[Reg]->addOperand(MachineOperand::CreateReg(SubReg,
+ true, true));
+ LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true);
+ for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ PartUses.erase(*SS);
+ }
+ }
+ } */
+ else
+ LastRefOrPartRef->addRegisterKilled(Reg, TRI, true);
+ return true;
+}
+
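+/// HandlePhysRegDef - Kill whatever parts of Reg and its sub-registers were
+/// live, extend any re-defined super-register live ranges, and record MI as
+/// the new def. MI may be null when processing block live-ins / live-outs.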
+void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI) {
+ // What parts of the register are previously defined?
+ SmallSet<unsigned, 32> Live;
+ if (PhysRegDef[Reg] || PhysRegUse[Reg]) {
+ Live.insert(Reg);
+ for (const unsigned *SS = TRI->getSubRegisters(Reg); *SS; ++SS)
+ Live.insert(*SS);
+ } else {
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs) {
+ // If a register isn't itself defined, but all of the parts that make
+ // it up are defined, then consider it also defined.
+ // e.g.
+ // AL =
+ // AH =
+ // = AX
+ if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) {
+ Live.insert(SubReg);
+ for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ Live.insert(*SS);
+ }
+ }
+ }
+
+ // Starting from the largest piece, find the last time any part of the
+ // register is referenced.
+ if (!HandlePhysRegKill(Reg, MI)) {
+ // Only some of the sub-registers are used.
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs) {
+ if (!Live.count(SubReg))
+ // Skip if this sub-register isn't defined.
+ continue;
+ if (HandlePhysRegKill(SubReg, MI)) {
+ Live.erase(SubReg);
+ for (const unsigned *SS = TRI->getSubRegisters(SubReg); *SS; ++SS)
+ Live.erase(*SS);
+ }
+ }
+ assert(Live.empty() && "Not all defined registers are killed / dead?");
+ }
+
+ if (MI) {
+ // Does this extend the live range of a super-register?
+ SmallSet<unsigned, 8> Processed;
+ for (const unsigned *SuperRegs = TRI->getSuperRegisters(Reg);
+ unsigned SuperReg = *SuperRegs; ++SuperRegs) {
+ if (Processed.count(SuperReg))
+ continue;
+ MachineInstr *LastRef = PhysRegUse[SuperReg]
+ ? PhysRegUse[SuperReg] : PhysRegDef[SuperReg];
+ if (LastRef && LastRef != MI) {
+ // The larger register is previously defined. Now a smaller part is
+ // being re-defined. Treat it as read/mod/write if there are uses
+ // below.
+ // EAX =
+ // AX = EAX<imp-use,kill>, EAX<imp-def>
+ // ...
+ // = EAX
+ if (hasRegisterUseBelow(SuperReg, MI, MI->getParent())) {
+ MI->addOperand(MachineOperand::CreateReg(SuperReg, false/*IsDef*/,
+ true/*IsImp*/,true/*IsKill*/));
+ MI->addOperand(MachineOperand::CreateReg(SuperReg, true/*IsDef*/,
+ true/*IsImp*/));
+ PhysRegDef[SuperReg] = MI;
+ PhysRegUse[SuperReg] = NULL;
+ Processed.insert(SuperReg);
+ for (const unsigned *SS = TRI->getSubRegisters(SuperReg); *SS; ++SS) {
+ PhysRegDef[*SS] = MI;
+ PhysRegUse[*SS] = NULL;
+ Processed.insert(*SS);
+ }
+ } else {
+ // Otherwise, the super register is killed.
+ if (HandlePhysRegKill(SuperReg, MI)) {
+ PhysRegDef[SuperReg] = NULL;
+ PhysRegUse[SuperReg] = NULL;
+ for (const unsigned *SS = TRI->getSubRegisters(SuperReg); *SS; ++SS) {
+ PhysRegDef[*SS] = NULL;
+ PhysRegUse[*SS] = NULL;
+ Processed.insert(*SS);
+ }
+ }
+ }
+ }
+ }
+
+ // Remember this def.
+ PhysRegDef[Reg] = MI;
+ PhysRegUse[Reg] = NULL;
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs) {
+ PhysRegDef[SubReg] = MI;
+ PhysRegUse[SubReg] = NULL;
+ }
+ }
+}
+
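+/// runOnMachineFunction - Compute kill and dead information for every virtual
+/// and allocatable physical register with a single depth-first walk over the
+/// CFG of the function.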
+bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ TRI = MF->getTarget().getRegisterInfo();
+
+ ReservedRegisters = TRI->getReservedRegs(mf);
+
+ unsigned NumRegs = TRI->getNumRegs();
+ PhysRegDef = new MachineInstr*[NumRegs];
+ PhysRegUse = new MachineInstr*[NumRegs];
+ PHIVarInfo = new SmallVector<unsigned, 4>[MF->getNumBlockIDs()];
+ std::fill(PhysRegDef, PhysRegDef + NumRegs, (MachineInstr*)0);
+ std::fill(PhysRegUse, PhysRegUse + NumRegs, (MachineInstr*)0);
+
+ // Get some space for a respectable number of registers.
+ VirtRegInfo.resize(64);
+
+ analyzePHINodes(mf);
+
+ // Calculate live variable information in depth first order on the CFG of the
+ // function. This guarantees that we will see the definition of a virtual
+ // register before its uses due to dominance properties of SSA (except for PHI
+ // nodes, which are treated as a special case).
+ MachineBasicBlock *Entry = MF->begin();
+ SmallPtrSet<MachineBasicBlock*,16> Visited;
+
+ for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> >
+ DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
+ DFI != E; ++DFI) {
+ MachineBasicBlock *MBB = *DFI;
+
+ // Mark live-in registers as live-in.
+ for (MachineBasicBlock::const_livein_iterator II = MBB->livein_begin(),
+ EE = MBB->livein_end(); II != EE; ++II) {
+ assert(TargetRegisterInfo::isPhysicalRegister(*II) &&
+ "Cannot have a live-in virtual register!");
+ HandlePhysRegDef(*II, 0);
+ }
+
+ // Loop over all of the instructions, processing them.
+ DistanceMap.clear();
+ unsigned Dist = 0;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ MachineInstr *MI = I;
+ DistanceMap.insert(std::make_pair(MI, Dist++));
+
+ // Process all of the operands of the instruction...
+ unsigned NumOperandsToProcess = MI->getNumOperands();
+
+ // Unless it is a PHI node. In this case, ONLY process the DEF, not any
+ // of the uses. They will be handled in other basic blocks.
+ if (MI->getOpcode() == TargetInstrInfo::PHI)
+ NumOperandsToProcess = 1;
+
+ SmallVector<unsigned, 4> UseRegs;
+ SmallVector<unsigned, 4> DefRegs;
+ for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (MO.isUse())
+ UseRegs.push_back(MOReg);
+ if (MO.isDef())
+ DefRegs.push_back(MOReg);
+ }
+
+ // Process all uses.
+ for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) {
+ unsigned MOReg = UseRegs[i];
+ if (TargetRegisterInfo::isVirtualRegister(MOReg))
+ HandleVirtRegUse(MOReg, MBB, MI);
+ else if (!ReservedRegisters[MOReg])
+ HandlePhysRegUse(MOReg, MI);
+ }
+
+ // Process all defs.
+ for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) {
+ unsigned MOReg = DefRegs[i];
+ if (TargetRegisterInfo::isVirtualRegister(MOReg))
+ HandleVirtRegDef(MOReg, MI);
+ else if (!ReservedRegisters[MOReg])
+ HandlePhysRegDef(MOReg, MI);
+ }
+ }
+
+ // Handle any virtual assignments from PHI nodes which might be at the
+ // bottom of this basic block. We check all of our successor blocks to see
+ // if they have PHI nodes, and if so, we simulate an assignment at the end
+ // of the current block.
+ if (!PHIVarInfo[MBB->getNumber()].empty()) {
+ SmallVector<unsigned, 4>& VarInfoVec = PHIVarInfo[MBB->getNumber()];
+
+ for (SmallVector<unsigned, 4>::iterator I = VarInfoVec.begin(),
+ E = VarInfoVec.end(); I != E; ++I)
+ // Mark it alive only in the block we are representing.
+ MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(),
+ MBB);
+ }
+
+ // Finally, if the last instruction in the block is a return, make sure to
+ // mark it as using all of the live-out values in the function.
+ if (!MBB->empty() && MBB->back().getDesc().isReturn()) {
+ MachineInstr *Ret = &MBB->back();
+
+ for (MachineRegisterInfo::liveout_iterator
+ I = MF->getRegInfo().liveout_begin(),
+ E = MF->getRegInfo().liveout_end(); I != E; ++I) {
+ assert(TargetRegisterInfo::isPhysicalRegister(*I) &&
+ "Cannot have a live-out virtual register!");
+ HandlePhysRegUse(*I, Ret);
+
+ // Add live-out registers as implicit uses.
+ if (!Ret->readsRegister(*I))
+ Ret->addOperand(MachineOperand::CreateReg(*I, false, true));
+ }
+ }
+
+ // Loop over PhysRegDef / PhysRegUse, killing any registers that are
+ // available at the end of the basic block.
+ for (unsigned i = 0; i != NumRegs; ++i)
+ if (PhysRegDef[i] || PhysRegUse[i])
+ HandlePhysRegDef(i, 0);
+
+ std::fill(PhysRegDef, PhysRegDef + NumRegs, (MachineInstr*)0);
+ std::fill(PhysRegUse, PhysRegUse + NumRegs, (MachineInstr*)0);
+ }
+
+ // Convert and transfer the dead / killed information we have gathered into
+ // VirtRegInfo onto MI's.
+ for (unsigned i = 0, e1 = VirtRegInfo.size(); i != e1; ++i)
+ for (unsigned j = 0, e2 = VirtRegInfo[i].Kills.size(); j != e2; ++j)
+ if (VirtRegInfo[i].Kills[j] ==
+ MRI->getVRegDef(i + TargetRegisterInfo::FirstVirtualRegister))
+ VirtRegInfo[i]
+ .Kills[j]->addRegisterDead(i +
+ TargetRegisterInfo::FirstVirtualRegister,
+ TRI);
+ else
+ VirtRegInfo[i]
+ .Kills[j]->addRegisterKilled(i +
+ TargetRegisterInfo::FirstVirtualRegister,
+ TRI);
+
+ // Check to make sure there are no unreachable blocks in the MC CFG for the
+ // function. If there are, it is due to a bug in the instruction selector
+ // or some other part of the code generator.
+#ifndef NDEBUG
+ for(MachineFunction::iterator i = MF->begin(), e = MF->end(); i != e; ++i)
+ assert(Visited.count(&*i) != 0 && "unreachable basic block found");
+#endif
+
+ delete[] PhysRegDef;
+ delete[] PhysRegUse;
+ delete[] PHIVarInfo;
+
+ return false;
+}
+
+/// replaceKillInstruction - Update register kill info by replacing a kill
+/// instruction with a new one.
+void LiveVariables::replaceKillInstruction(unsigned Reg, MachineInstr *OldMI,
+ MachineInstr *NewMI) {
+ VarInfo &VI = getVarInfo(Reg);
+ std::replace(VI.Kills.begin(), VI.Kills.end(), OldMI, NewMI);
+}
+
+/// removeVirtualRegistersKilled - Remove all killed info for the specified
+/// instruction.
+void LiveVariables::removeVirtualRegistersKilled(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isKill()) {
+ MO.setIsKill(false);
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ bool removed = getVarInfo(Reg).removeKill(MI);
+ assert(removed && "kill not in register's VarInfo?");
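+ // Reassign so 'removed' is still used when asserts compile away (NDEBUG).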
+ removed = true;
+ }
+ }
+ }
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes in here. In
+/// particular, we want to map the variable information of a virtual register
+/// which is used in a PHI node. We map that to the BB the vreg is coming from.
+///
+void LiveVariables::analyzePHINodes(const MachineFunction& Fn) {
+ for (MachineFunction::const_iterator I = Fn.begin(), E = Fn.end();
+ I != E; ++I)
+ for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end();
+ BBI != BBE && BBI->getOpcode() == TargetInstrInfo::PHI; ++BBI)
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+ PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()]
+ .push_back(BBI->getOperand(i).getReg());
+}
diff --git a/lib/CodeGen/LowerSubregs.cpp b/lib/CodeGen/LowerSubregs.cpp
new file mode 100644
index 0000000..14acb71
--- /dev/null
+++ b/lib/CodeGen/LowerSubregs.cpp
@@ -0,0 +1,292 @@
+//===-- LowerSubregs.cpp - Subregister Lowering instruction pass ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a MachineFunction pass that runs after register
+// allocation and turns subreg insert/extract instructions into register
+// copies, as needed. This ensures correct codegen even if the coalescer
+// isn't able to remove all subreg instructions.
+//
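+// For example (illustrative operands only):
+//   %reg1 = EXTRACT_SUBREG %reg2, 1
+// is rewritten as a plain copy from the addressed sub-register of %reg2 into
+// %reg1, or erased entirely when the two registers already coincide.
+//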
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowersubregs"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+namespace {
+ struct VISIBILITY_HIDDEN LowerSubregsInstructionPass
+ : public MachineFunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LowerSubregsInstructionPass() : MachineFunctionPass(&ID) {}
+
+ const char *getPassName() const {
+ return "Subregister lowering instruction pass";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// runOnMachineFunction - pass entry point
+ bool runOnMachineFunction(MachineFunction&);
+
+ bool LowerExtract(MachineInstr *MI);
+ bool LowerInsert(MachineInstr *MI);
+ bool LowerSubregToReg(MachineInstr *MI);
+
+ void TransferDeadFlag(MachineInstr *MI, unsigned DstReg,
+ const TargetRegisterInfo &TRI);
+ void TransferKillFlag(MachineInstr *MI, unsigned SrcReg,
+ const TargetRegisterInfo &TRI);
+ };
+
+ char LowerSubregsInstructionPass::ID = 0;
+}
+
+FunctionPass *llvm::createLowerSubregsPass() {
+ return new LowerSubregsInstructionPass();
+}
+
+/// TransferDeadFlag - MI is a pseudo-instruction with DstReg dead,
+/// and the lowered replacement instructions immediately precede it.
+/// Mark the replacement instructions with the dead flag.
+void
+LowerSubregsInstructionPass::TransferDeadFlag(MachineInstr *MI,
+ unsigned DstReg,
+ const TargetRegisterInfo &TRI) {
+ for (MachineBasicBlock::iterator MII =
+ prior(MachineBasicBlock::iterator(MI)); ; --MII) {
+ if (MII->addRegisterDead(DstReg, &TRI))
+ break;
+ assert(MII != MI->getParent()->begin() &&
+ "copyRegToReg output doesn't reference destination register!");
+ }
+}
+
+/// TransferKillFlag - MI is a pseudo-instruction with SrcReg killed,
+/// and the lowered replacement instructions immediately precede it.
+/// Mark the replacement instructions with the kill flag.
+void
+LowerSubregsInstructionPass::TransferKillFlag(MachineInstr *MI,
+ unsigned SrcReg,
+ const TargetRegisterInfo &TRI) {
+ for (MachineBasicBlock::iterator MII =
+ prior(MachineBasicBlock::iterator(MI)); ; --MII) {
+ if (MII->addRegisterKilled(SrcReg, &TRI))
+ break;
+ assert(MII != MI->getParent()->begin() &&
+ "copyRegToReg output doesn't reference source register!");
+ }
+}
+
+bool LowerSubregsInstructionPass::LowerExtract(MachineInstr *MI) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ const TargetRegisterInfo &TRI = *MF.getTarget().getRegisterInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+ assert(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
+ MI->getOperand(1).isReg() && MI->getOperand(1).isUse() &&
+ MI->getOperand(2).isImm() && "Malformed extract_subreg");
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SuperReg = MI->getOperand(1).getReg();
+ unsigned SubIdx = MI->getOperand(2).getImm();
+ unsigned SrcReg = TRI.getSubReg(SuperReg, SubIdx);
+
+ assert(TargetRegisterInfo::isPhysicalRegister(SuperReg) &&
+ "Extract supperg source must be a physical register");
+ assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ "Extract destination must be in a physical register");
+
+ DOUT << "subreg: CONVERTING: " << *MI;
+
+ if (SrcReg == DstReg) {
+ // No need to insert an identity copy instruction.
+ DOUT << "subreg: eliminated!";
+ // Find the kill of the destination register's live range, and insert
+ // a kill of the source register at that point.
+ if (MI->getOperand(1).isKill() && !MI->getOperand(0).isDead())
+ for (MachineBasicBlock::iterator MII =
+ next(MachineBasicBlock::iterator(MI));
+ MII != MBB->end(); ++MII)
+ if (MII->killsRegister(DstReg, &TRI)) {
+ MII->addRegisterKilled(SuperReg, &TRI, /*AddIfNotFound=*/true);
+ break;
+ }
+ } else {
+ // Insert copy
+ const TargetRegisterClass *TRC = TRI.getPhysicalRegisterRegClass(DstReg);
+ assert(TRC == TRI.getPhysicalRegisterRegClass(SrcReg) &&
+ "Extract subreg and Dst must be of same register class");
+ TII.copyRegToReg(*MBB, MI, DstReg, SrcReg, TRC, TRC);
+ // Transfer the kill/dead flags, if needed.
+ if (MI->getOperand(0).isDead())
+ TransferDeadFlag(MI, DstReg, TRI);
+ if (MI->getOperand(1).isKill())
+ TransferKillFlag(MI, SrcReg, TRI);
+
+#ifndef NDEBUG
+ MachineBasicBlock::iterator dMI = MI;
+ DOUT << "subreg: " << *(--dMI);
+#endif
+ }
+
+ DOUT << "\n";
+ MBB->erase(MI);
+ return true;
+}
+
+bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ const TargetRegisterInfo &TRI = *MF.getTarget().getRegisterInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) &&
+ MI->getOperand(1).isImm() &&
+ (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) &&
+ MI->getOperand(3).isImm() && "Invalid subreg_to_reg");
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned InsReg = MI->getOperand(2).getReg();
+ unsigned InsSIdx = MI->getOperand(2).getSubReg();
+ unsigned SubIdx = MI->getOperand(3).getImm();
+
+ assert(SubIdx != 0 && "Invalid index for insert_subreg");
+ unsigned DstSubReg = TRI.getSubReg(DstReg, SubIdx);
+
+ assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ "Insert destination must be in a physical register");
+ assert(TargetRegisterInfo::isPhysicalRegister(InsReg) &&
+ "Inserted value must be in a physical register");
+
+ DOUT << "subreg: CONVERTING: " << *MI;
+
+ if (DstSubReg == InsReg && InsSIdx == 0) {
+ // No need to insert an identity copy instruction.
+ // Watch out for case like this:
+ // %RAX<def> = ...
+ // %RAX<def> = SUBREG_TO_REG 0, %EAX:3<kill>, 3
+ // The first def is defining RAX, not EAX, so the top bits were not
+ // zero-extended.
+ DOUT << "subreg: eliminated!";
+ } else {
+ // Insert sub-register copy
+ const TargetRegisterClass *TRC0= TRI.getPhysicalRegisterRegClass(DstSubReg);
+ const TargetRegisterClass *TRC1= TRI.getPhysicalRegisterRegClass(InsReg);
+ TII.copyRegToReg(*MBB, MI, DstSubReg, InsReg, TRC0, TRC1);
+ // Transfer the kill/dead flags, if needed.
+ if (MI->getOperand(0).isDead())
+ TransferDeadFlag(MI, DstSubReg, TRI);
+ if (MI->getOperand(2).isKill())
+ TransferKillFlag(MI, InsReg, TRI);
+
+#ifndef NDEBUG
+ MachineBasicBlock::iterator dMI = MI;
+ DOUT << "subreg: " << *(--dMI);
+#endif
+ }
+
+ DOUT << "\n";
+ MBB->erase(MI);
+ return true;
+}
+
+bool LowerSubregsInstructionPass::LowerInsert(MachineInstr *MI) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ const TargetRegisterInfo &TRI = *MF.getTarget().getRegisterInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) &&
+ (MI->getOperand(1).isReg() && MI->getOperand(1).isUse()) &&
+ (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) &&
+ MI->getOperand(3).isImm() && "Invalid insert_subreg");
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+#ifndef NDEBUG
+ unsigned SrcReg = MI->getOperand(1).getReg();
+#endif
+ unsigned InsReg = MI->getOperand(2).getReg();
+ unsigned SubIdx = MI->getOperand(3).getImm();
+
+ assert(DstReg == SrcReg && "insert_subreg not a two-address instruction?");
+ assert(SubIdx != 0 && "Invalid index for insert_subreg");
+ unsigned DstSubReg = TRI.getSubReg(DstReg, SubIdx);
+
+ assert(TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ "Insert superreg source must be in a physical register");
+ assert(TargetRegisterInfo::isPhysicalRegister(InsReg) &&
+ "Inserted value must be in a physical register");
+
+ DOUT << "subreg: CONVERTING: " << *MI;
+
+ if (DstSubReg == InsReg) {
+ // No need to insert an identity copy instruction.
+ DOUT << "subreg: eliminated!";
+ } else {
+ // Insert sub-register copy
+ const TargetRegisterClass *TRC0= TRI.getPhysicalRegisterRegClass(DstSubReg);
+ const TargetRegisterClass *TRC1= TRI.getPhysicalRegisterRegClass(InsReg);
+ TII.copyRegToReg(*MBB, MI, DstSubReg, InsReg, TRC0, TRC1);
+ // Transfer the kill/dead flags, if needed.
+ if (MI->getOperand(0).isDead())
+ TransferDeadFlag(MI, DstSubReg, TRI);
+ if (MI->getOperand(1).isKill())
+ TransferKillFlag(MI, InsReg, TRI);
+
+#ifndef NDEBUG
+ MachineBasicBlock::iterator dMI = MI;
+ DOUT << "subreg: " << *(--dMI);
+#endif
+ }
+
+ DOUT << "\n";
+ MBB->erase(MI);
+ return true;
+}
+
+/// runOnMachineFunction - Reduce subregister inserts and extracts to register
+/// copies.
+///
+bool LowerSubregsInstructionPass::runOnMachineFunction(MachineFunction &MF) {
+ DOUT << "Machine Function\n";
+
+ bool MadeChange = false;
+
+ DOUT << "********** LOWERING SUBREG INSTRS **********\n";
+ DOUT << "********** Function: " << MF.getFunction()->getName() << '\n';
+
+ for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end();
+ mbbi != mbbe; ++mbbi) {
+ for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
+ mi != me;) {
+ MachineInstr *MI = mi++;
+
+ if (MI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
+ MadeChange |= LowerExtract(MI);
+ } else if (MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG) {
+ MadeChange |= LowerInsert(MI);
+ } else if (MI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
+ MadeChange |= LowerSubregToReg(MI);
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/lib/CodeGen/MachOWriter.cpp b/lib/CodeGen/MachOWriter.cpp
new file mode 100644
index 0000000..4332627
--- /dev/null
+++ b/lib/CodeGen/MachOWriter.cpp
@@ -0,0 +1,976 @@
+//===-- MachOWriter.cpp - Target-independent Mach-O Writer code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the target-independent Mach-O writer. This file writes
+// out the Mach-O file in the following order:
+//
+// #1 FatHeader (universal-only)
+// #2 FatArch (universal-only, 1 per universal arch)
+// Per arch:
+// #3 Header
+// #4 Load Commands
+// #5 Sections
+// #6 Relocations
+// #7 Symbols
+// #8 Strings
+//
+//===----------------------------------------------------------------------===//
+
+#include "MachOWriter.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/FileWriters.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/OutputBuffer.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstring>
+using namespace llvm;
+
+/// AddMachOWriter - Concrete function to add the Mach-O writer to the function
+/// pass manager.
+MachineCodeEmitter *llvm::AddMachOWriter(PassManagerBase &PM,
+ raw_ostream &O,
+ TargetMachine &TM) {
+ MachOWriter *MOW = new MachOWriter(O, TM);
+ PM.add(MOW);
+ return &MOW->getMachineCodeEmitter();
+}
+
+//===----------------------------------------------------------------------===//
+// MachOCodeEmitter Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+ /// MachOCodeEmitter - This class is used by the MachOWriter to emit the code
+ /// for functions to the Mach-O file.
+ class MachOCodeEmitter : public MachineCodeEmitter {
+ MachOWriter &MOW;
+
+ /// Target machine description.
+ TargetMachine &TM;
+
+ /// is64Bit/isLittleEndian - This information is inferred from the target
+ /// machine directly, indicating what header values and flags to set.
+ bool is64Bit, isLittleEndian;
+
+ /// Relocations - These are the relocations that the function needs, as
+ /// emitted.
+ std::vector<MachineRelocation> Relocations;
+
+ /// CPLocations - This is a map of constant pool indices to offsets from the
+ /// start of the section for that constant pool index.
+ std::vector<uintptr_t> CPLocations;
+
+ /// CPSections - This is a map of constant pool indices to the MachOSection
+ /// containing the constant pool entry for that index.
+ std::vector<unsigned> CPSections;
+
+ /// JTLocations - This is a map of jump table indices to offsets from the
+ /// start of the section for that jump table index.
+ std::vector<uintptr_t> JTLocations;
+
+ /// MBBLocations - This vector is a mapping from MBB ID's to their address.
+ /// It is filled in by the StartMachineBasicBlock callback and queried by
+ /// the getMachineBasicBlockAddress callback.
+ std::vector<uintptr_t> MBBLocations;
+
+ public:
+ MachOCodeEmitter(MachOWriter &mow) : MOW(mow), TM(MOW.TM) {
+ is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64;
+ isLittleEndian = TM.getTargetData()->isLittleEndian();
+ }
+
+ virtual void startFunction(MachineFunction &MF);
+ virtual bool finishFunction(MachineFunction &MF);
+
+ virtual void addRelocation(const MachineRelocation &MR) {
+ Relocations.push_back(MR);
+ }
+
+ void emitConstantPool(MachineConstantPool *MCP);
+ void emitJumpTables(MachineJumpTableInfo *MJTI);
+
+ virtual uintptr_t getConstantPoolEntryAddress(unsigned Index) const {
+ assert(CPLocations.size() > Index && "CP not emitted!");
+ return CPLocations[Index];
+ }
+ virtual uintptr_t getJumpTableEntryAddress(unsigned Index) const {
+ assert(JTLocations.size() > Index && "JT not emitted!");
+ return JTLocations[Index];
+ }
+
+ virtual void StartMachineBasicBlock(MachineBasicBlock *MBB) {
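+ // Grow the table geometrically to amortize the cost of repeated resizes.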
+ if (MBBLocations.size() <= (unsigned)MBB->getNumber())
+ MBBLocations.resize((MBB->getNumber()+1)*2);
+ MBBLocations[MBB->getNumber()] = getCurrentPCOffset();
+ }
+
+ virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const {
+ assert(MBBLocations.size() > (unsigned)MBB->getNumber() &&
+ MBBLocations[MBB->getNumber()] && "MBB not emitted!");
+ return MBBLocations[MBB->getNumber()];
+ }
+
+ virtual uintptr_t getLabelAddress(uint64_t Label) const {
+ assert(0 && "get Label not implemented");
+ abort();
+ return 0;
+ }
+
+ virtual void emitLabel(uint64_t LabelID) {
+ assert(0 && "emit Label not implemented");
+ abort();
+ }
+
+
+ virtual void setModuleInfo(llvm::MachineModuleInfo* MMI) { }
+
+ /// JIT SPECIFIC FUNCTIONS - DO NOT IMPLEMENT THESE HERE!
+ virtual void startGVStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment = 1) {
+ assert(0 && "JIT specific function called!");
+ abort();
+ }
+ virtual void startGVStub(const GlobalValue* F, void *Buffer,
+ unsigned StubSize) {
+ assert(0 && "JIT specific function called!");
+ abort();
+ }
+ virtual void *finishGVStub(const GlobalValue* F) {
+ assert(0 && "JIT specific function called!");
+ abort();
+ return 0;
+ }
+ };
+}
+
+/// startFunction - This callback is invoked when a new machine function is
+/// about to be emitted.
+void MachOCodeEmitter::startFunction(MachineFunction &MF) {
+ const TargetData *TD = TM.getTargetData();
+ const Function *F = MF.getFunction();
+
+ // Align the output buffer to the appropriate alignment, power of 2.
+ unsigned FnAlign = F->getAlignment();
+ unsigned TDAlign = TD->getPrefTypeAlignment(F->getType());
+ unsigned Align = std::max(FnAlign, TDAlign);
+ assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
+ Align = Log2_32(Align);
+
+ // Get the Mach-O Section that this function belongs in.
+ MachOWriter::MachOSection *MOS = MOW.getTextSection();
+
+ // FIXME: better memory management
+ MOS->SectionData.reserve(4096);
+ BufferBegin = &MOS->SectionData[0];
+ BufferEnd = BufferBegin + MOS->SectionData.capacity();
+
+ // Upgrade the section alignment if required.
+ if (MOS->align < Align) MOS->align = Align;
+
+ // Round the size up to the correct alignment for starting the new function.
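+ // (Illustrative: with Align == 4, i.e. 16-byte alignment, a size of 10 is
+ // first bumped to 26 and then masked down to 16, the next multiple of 16.)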
+ if ((MOS->size & ((1 << Align) - 1)) != 0) {
+ MOS->size += (1 << Align);
+ MOS->size &= ~((1 << Align) - 1);
+ }
+
+ // FIXME: Using MOS->size directly here instead of calculating it from the
+ // output buffer size (impossible because the code emitter deals only in raw
+ // bytes) forces us to manually synchronize size and write padding zero bytes
+ // to the output buffer for all non-text sections. For text sections, we do
+ // not synchonize the output buffer, and we just blow up if anyone tries to
+ // write non-code to it. An assert should probably be added to
+ // AddSymbolToSection to prevent calling it on the text section.
+ CurBufferPtr = BufferBegin + MOS->size;
+
+ // Clear per-function data structures.
+ CPLocations.clear();
+ CPSections.clear();
+ JTLocations.clear();
+ MBBLocations.clear();
+}
+
+/// finishFunction - This callback is invoked after the function is completely
+/// finished.
+bool MachOCodeEmitter::finishFunction(MachineFunction &MF) {
+ // Get the Mach-O Section that this function belongs in.
+ MachOWriter::MachOSection *MOS = MOW.getTextSection();
+
+ // Get a symbol for the function to add to the symbol table
+ // FIXME: it seems like we should call something like AddSymbolToSection
+ // in startFunction rather than changing the section size and symbol n_value
+ // here.
+ const GlobalValue *FuncV = MF.getFunction();
+ MachOSym FnSym(FuncV, MOW.Mang->getValueName(FuncV), MOS->Index, TM);
+ FnSym.n_value = MOS->size;
+ MOS->size = CurBufferPtr - BufferBegin;
+
+ // Emit constant pool to appropriate section(s)
+ emitConstantPool(MF.getConstantPool());
+
+ // Emit jump tables to appropriate section
+ emitJumpTables(MF.getJumpTableInfo());
+
+ // If we have emitted any relocations to function-specific objects such as
+ // basic blocks, constant pools entries, or jump tables, record their
+ // addresses now so that we can rewrite them with the correct addresses
+ // later.
+ for (unsigned i = 0, e = Relocations.size(); i != e; ++i) {
+ MachineRelocation &MR = Relocations[i];
+ intptr_t Addr;
+
+ if (MR.isBasicBlock()) {
+ Addr = getMachineBasicBlockAddress(MR.getBasicBlock());
+ MR.setConstantVal(MOS->Index);
+ MR.setResultPointer((void*)Addr);
+ } else if (MR.isJumpTableIndex()) {
+ Addr = getJumpTableEntryAddress(MR.getJumpTableIndex());
+ MR.setConstantVal(MOW.getJumpTableSection()->Index);
+ MR.setResultPointer((void*)Addr);
+ } else if (MR.isConstantPoolIndex()) {
+ Addr = getConstantPoolEntryAddress(MR.getConstantPoolIndex());
+ MR.setConstantVal(CPSections[MR.getConstantPoolIndex()]);
+ MR.setResultPointer((void*)Addr);
+ } else if (MR.isGlobalValue()) {
+ // FIXME: This should be a set or something that uniques
+ MOW.PendingGlobals.push_back(MR.getGlobalValue());
+ } else {
+ assert(0 && "Unhandled relocation type");
+ }
+ MOS->Relocations.push_back(MR);
+ }
+ Relocations.clear();
+
+ // Finally, add it to the symtab.
+ MOW.SymbolTable.push_back(FnSym);
+ return false;
+}
+
+/// emitConstantPool - For each constant pool entry, figure out which section
+/// the constant should live in, allocate space for it, and emit it to the
+/// Section data buffer.
+void MachOCodeEmitter::emitConstantPool(MachineConstantPool *MCP) {
+ const std::vector<MachineConstantPoolEntry> &CP = MCP->getConstants();
+ if (CP.empty()) return;
+
+ // FIXME: handle PIC codegen
+ assert(TM.getRelocationModel() != Reloc::PIC_ &&
+ "PIC codegen not yet handled for mach-o jump tables!");
+
+ // Although there is no strict necessity that I am aware of, we will do what
+ // gcc for OS X does and put each constant pool entry in a section of constant
+ // objects of a certain size. That means that float constants go in the
+ // literal4 section, and double objects go in literal8, etc.
+ //
+ // FIXME: revisit this decision if we ever do the "stick everything into one
+ // giant object for PIC" optimization.
+ for (unsigned i = 0, e = CP.size(); i != e; ++i) {
+ const Type *Ty = CP[i].getType();
+ unsigned Size = TM.getTargetData()->getTypeAllocSize(Ty);
+
+ MachOWriter::MachOSection *Sec = MOW.getConstSection(CP[i].Val.ConstVal);
+ OutputBuffer SecDataOut(Sec->SectionData, is64Bit, isLittleEndian);
+
+ CPLocations.push_back(Sec->SectionData.size());
+ CPSections.push_back(Sec->Index);
+
+ // FIXME: remove when we have unified size + output buffer
+ Sec->size += Size;
+
+ // Allocate space in the section for the global.
+ // FIXME: need alignment?
+ // FIXME: share between here and AddSymbolToSection?
+ for (unsigned j = 0; j < Size; ++j)
+ SecDataOut.outbyte(0);
+
+ MOW.InitMem(CP[i].Val.ConstVal, &Sec->SectionData[0], CPLocations[i],
+ TM.getTargetData(), Sec->Relocations);
+ }
+}
+
+/// emitJumpTables - Emit all the jump tables for a given jump table info
+/// record to the appropriate section.
+void MachOCodeEmitter::emitJumpTables(MachineJumpTableInfo *MJTI) {
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ if (JT.empty()) return;
+
+ // FIXME: handle PIC codegen
+ assert(TM.getRelocationModel() != Reloc::PIC_ &&
+ "PIC codegen not yet handled for mach-o jump tables!");
+
+ MachOWriter::MachOSection *Sec = MOW.getJumpTableSection();
+ unsigned TextSecIndex = MOW.getTextSection()->Index;
+ OutputBuffer SecDataOut(Sec->SectionData, is64Bit, isLittleEndian);
+
+ for (unsigned i = 0, e = JT.size(); i != e; ++i) {
+ // For each jump table, record its offset from the start of the section,
+ // reserve space for the relocations to the MBBs, and add the relocations.
+ const std::vector<MachineBasicBlock*> &MBBs = JT[i].MBBs;
+ JTLocations.push_back(Sec->SectionData.size());
+ for (unsigned mi = 0, me = MBBs.size(); mi != me; ++mi) {
+ MachineRelocation MR(MOW.GetJTRelocation(Sec->SectionData.size(),
+ MBBs[mi]));
+ MR.setResultPointer((void *)JTLocations[i]);
+ MR.setConstantVal(TextSecIndex);
+ Sec->Relocations.push_back(MR);
+ SecDataOut.outaddr(0);
+ }
+ }
+ // FIXME: remove when we have unified size + output buffer
+ Sec->size = Sec->SectionData.size();
+}
+
+//===----------------------------------------------------------------------===//
+// MachOWriter Implementation
+//===----------------------------------------------------------------------===//
+
+char MachOWriter::ID = 0;
+MachOWriter::MachOWriter(raw_ostream &o, TargetMachine &tm)
+ : MachineFunctionPass(&ID), O(o), TM(tm) {
+ is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64;
+ isLittleEndian = TM.getTargetData()->isLittleEndian();
+
+ // Create the machine code emitter object for this target.
+ MCE = new MachOCodeEmitter(*this);
+}
+
+MachOWriter::~MachOWriter() {
+ delete MCE;
+}
+
+void MachOWriter::AddSymbolToSection(MachOSection *Sec, GlobalVariable *GV) {
+ const Type *Ty = GV->getType()->getElementType();
+ unsigned Size = TM.getTargetData()->getTypeAllocSize(Ty);
+ unsigned Align = TM.getTargetData()->getPreferredAlignment(GV);
+
+ // Reserve space in the .bss section for this symbol while maintaining the
+ // desired section alignment, which must be at least as much as required by
+ // this symbol.
+ OutputBuffer SecDataOut(Sec->SectionData, is64Bit, isLittleEndian);
+
+ if (Align) {
+ uint64_t OrigSize = Sec->size;
+ Align = Log2_32(Align);
+ Sec->align = std::max(unsigned(Sec->align), Align);
+ // Align now holds the log2 of the alignment; round the section size up to
+ // the next multiple of (1 << Align) bytes.
+ Sec->size = (Sec->size + (1 << Align) - 1) & ~(uint64_t)((1 << Align) - 1);
+
+ // Add alignment padding to buffer as well.
+ // FIXME: remove when we have unified size + output buffer
+ unsigned AlignedSize = Sec->size - OrigSize;
+ for (unsigned i = 0; i < AlignedSize; ++i)
+ SecDataOut.outbyte(0);
+ }
+ // Globals without external linkage apparently do not go in the symbol table.
+ if (!GV->hasLocalLinkage()) {
+ MachOSym Sym(GV, Mang->getValueName(GV), Sec->Index, TM);
+ Sym.n_value = Sec->size;
+ SymbolTable.push_back(Sym);
+ }
+
+ // Record the offset of the symbol, and then allocate space for it.
+ // FIXME: remove when we have unified size + output buffer
+ Sec->size += Size;
+
+ // Now that we know what section the GlobalVariable is going to be emitted
+ // into, update our mappings.
+ // FIXME: We may also need to update this when outputting non-GlobalVariable
+ // GlobalValues such as functions.
+ GVSection[GV] = Sec;
+ GVOffset[GV] = Sec->SectionData.size();
+
+ // Allocate space in the section for the global.
+ for (unsigned i = 0; i < Size; ++i)
+ SecDataOut.outbyte(0);
+}
+
+void MachOWriter::EmitGlobal(GlobalVariable *GV) {
+ const Type *Ty = GV->getType()->getElementType();
+ unsigned Size = TM.getTargetData()->getTypeAllocSize(Ty);
+ bool NoInit = !GV->hasInitializer();
+
+ // If this global has a zero initializer, it is part of the .bss or common
+ // section.
+ if (NoInit || GV->getInitializer()->isNullValue()) {
+ // If this global is part of the common block, add it now. Variables are
+ // part of the common block if they are zero initialized and allowed to be
+ // merged with other symbols.
+ if (NoInit || GV->hasLinkOnceLinkage() || GV->hasWeakLinkage() ||
+ GV->hasCommonLinkage()) {
+ MachOSym ExtOrCommonSym(GV, Mang->getValueName(GV), MachOSym::NO_SECT,TM);
+ // For undefined (N_UNDF) external (N_EXT) types, n_value is the size in
+ // bytes of the symbol.
+ ExtOrCommonSym.n_value = Size;
+ SymbolTable.push_back(ExtOrCommonSym);
+ // Remember that we've seen this symbol
+ GVOffset[GV] = Size;
+ return;
+ }
+ // Otherwise, this symbol is part of the .bss section.
+ MachOSection *BSS = getBSSSection();
+ AddSymbolToSection(BSS, GV);
+ return;
+ }
+
+ // Scalar read-only data goes in a literal section if the scalar is 4, 8, or
+ // 16 bytes, or a cstring. Other read only data goes into a regular const
+ // section. Read-write data goes in the data section.
+ MachOSection *Sec = GV->isConstant() ? getConstSection(GV->getInitializer()) :
+ getDataSection();
+ AddSymbolToSection(Sec, GV);
+ InitMem(GV->getInitializer(), &Sec->SectionData[0], GVOffset[GV],
+ TM.getTargetData(), Sec->Relocations);
+}
+
+
+bool MachOWriter::runOnMachineFunction(MachineFunction &MF) {
+ // Nothing to do here, this is all done through the MCE object.
+ return false;
+}
+
+bool MachOWriter::doInitialization(Module &M) {
+ // Set the magic value, now that we know the pointer size and endianness
+ Header.setMagic(isLittleEndian, is64Bit);
+
+ // Set the file type
+ // FIXME: this only works for object files, we do not support the creation
+ // of dynamic libraries or executables at this time.
+ Header.filetype = MachOHeader::MH_OBJECT;
+
+ Mang = new Mangler(M);
+ return false;
+}
+
+/// doFinalization - Now that the module has been completely processed, emit
+/// the Mach-O file to 'O'.
+bool MachOWriter::doFinalization(Module &M) {
+ // FIXME: we don't handle debug info yet, we should probably do that.
+
+ // Okay, the .text section has been completed, build the .data, .bss, and
+ // "common" sections next.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ EmitGlobal(I);
+
+ // Emit the header and load commands.
+ EmitHeaderAndLoadCommands();
+
+ // Emit the various sections and their relocation info.
+ EmitSections();
+
+ // Write the symbol table and the string table to the end of the file.
+ O.write((char*)&SymT[0], SymT.size());
+ O.write((char*)&StrT[0], StrT.size());
+
+ // We are done with the abstract symbols.
+ SectionList.clear();
+ SymbolTable.clear();
+ DynamicSymbolTable.clear();
+
+ // Release the name mangler object.
+ delete Mang; Mang = 0;
+ return false;
+}
+
+void MachOWriter::EmitHeaderAndLoadCommands() {
+ // Step #0: Fill in the segment load command size, since we need it to figure
+ // out the rest of the header fields
+ MachOSegment SEG("", is64Bit);
+ SEG.nsects = SectionList.size();
+ SEG.cmdsize = SEG.cmdSize(is64Bit) +
+ SEG.nsects * SectionList[0]->cmdSize(is64Bit);
+
+ // Step #1: calculate the number of load commands. We always have at least
+ // one, for the LC_SEGMENT load command, plus two for the normal
+ // and dynamic symbol tables, if there are any symbols.
+ Header.ncmds = SymbolTable.empty() ? 1 : 3;
+
+ // Step #2: calculate the size of the load commands
+ Header.sizeofcmds = SEG.cmdsize;
+ if (!SymbolTable.empty())
+ Header.sizeofcmds += SymTab.cmdsize + DySymTab.cmdsize;
+
+ // Step #3: write the header to the file
+ // Local alias to shorten the code that follows.
+ DataBuffer &FH = Header.HeaderData;
+ OutputBuffer FHOut(FH, is64Bit, isLittleEndian);
+
+ FHOut.outword(Header.magic);
+ FHOut.outword(TM.getMachOWriterInfo()->getCPUType());
+ FHOut.outword(TM.getMachOWriterInfo()->getCPUSubType());
+ FHOut.outword(Header.filetype);
+ FHOut.outword(Header.ncmds);
+ FHOut.outword(Header.sizeofcmds);
+ FHOut.outword(Header.flags);
+ if (is64Bit)
+ FHOut.outword(Header.reserved);
+
+ // Step #4: Finish filling in the segment load command and write it out
+ for (std::vector<MachOSection*>::iterator I = SectionList.begin(),
+ E = SectionList.end(); I != E; ++I)
+ SEG.filesize += (*I)->size;
+
+ SEG.vmsize = SEG.filesize;
+ SEG.fileoff = Header.cmdSize(is64Bit) + Header.sizeofcmds;
+
+ FHOut.outword(SEG.cmd);
+ FHOut.outword(SEG.cmdsize);
+ FHOut.outstring(SEG.segname, 16);
+ FHOut.outaddr(SEG.vmaddr);
+ FHOut.outaddr(SEG.vmsize);
+ FHOut.outaddr(SEG.fileoff);
+ FHOut.outaddr(SEG.filesize);
+ FHOut.outword(SEG.maxprot);
+ FHOut.outword(SEG.initprot);
+ FHOut.outword(SEG.nsects);
+ FHOut.outword(SEG.flags);
+
+ // Step #5: Finish filling in the fields of the MachOSections
+ uint64_t currentAddr = 0;
+ for (std::vector<MachOSection*>::iterator I = SectionList.begin(),
+ E = SectionList.end(); I != E; ++I) {
+ MachOSection *MOS = *I;
+ MOS->addr = currentAddr;
+ MOS->offset = currentAddr + SEG.fileoff;
+
+ // FIXME: do we need to do something with alignment here?
+ currentAddr += MOS->size;
+ }
+
+ // Step #6: Emit the symbol table to temporary buffers, so that we know the
+ // size of the string table when we write the next load command. This also
+ // sorts and assigns indices to each of the symbols, which is necessary for
+ // emitting relocations to externally-defined objects.
+ BufferSymbolAndStringTable();
+
+ // Step #7: Calculate the number of relocations for each section, and write
+ // out the section command for each one.
+ currentAddr += SEG.fileoff;
+ for (std::vector<MachOSection*>::iterator I = SectionList.begin(),
+ E = SectionList.end(); I != E; ++I) {
+ MachOSection *MOS = *I;
+ // Convert the relocations to target-specific relocations, and fill in the
+ // relocation offset for this section.
+ CalculateRelocations(*MOS);
+ MOS->reloff = MOS->nreloc ? currentAddr : 0;
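+ // Each Mach-O relocation_info entry occupies 8 bytes on disk.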
+ currentAddr += MOS->nreloc * 8;
+
+ // write the finalized section command to the output buffer
+ FHOut.outstring(MOS->sectname, 16);
+ FHOut.outstring(MOS->segname, 16);
+ FHOut.outaddr(MOS->addr);
+ FHOut.outaddr(MOS->size);
+ FHOut.outword(MOS->offset);
+ FHOut.outword(MOS->align);
+ FHOut.outword(MOS->reloff);
+ FHOut.outword(MOS->nreloc);
+ FHOut.outword(MOS->flags);
+ FHOut.outword(MOS->reserved1);
+ FHOut.outword(MOS->reserved2);
+ if (is64Bit)
+ FHOut.outword(MOS->reserved3);
+ }
+
+ // Step #8: Emit LC_SYMTAB/LC_DYSYMTAB load commands
+ SymTab.symoff = currentAddr;
+ SymTab.nsyms = SymbolTable.size();
+ SymTab.stroff = SymTab.symoff + SymT.size();
+ SymTab.strsize = StrT.size();
+ FHOut.outword(SymTab.cmd);
+ FHOut.outword(SymTab.cmdsize);
+ FHOut.outword(SymTab.symoff);
+ FHOut.outword(SymTab.nsyms);
+ FHOut.outword(SymTab.stroff);
+ FHOut.outword(SymTab.strsize);
+
+ // FIXME: set DySymTab fields appropriately
+ // We should probably just update these in BufferSymbolAndStringTable since
+ // that's where we partition the different kinds of symbols.
+ FHOut.outword(DySymTab.cmd);
+ FHOut.outword(DySymTab.cmdsize);
+ FHOut.outword(DySymTab.ilocalsym);
+ FHOut.outword(DySymTab.nlocalsym);
+ FHOut.outword(DySymTab.iextdefsym);
+ FHOut.outword(DySymTab.nextdefsym);
+ FHOut.outword(DySymTab.iundefsym);
+ FHOut.outword(DySymTab.nundefsym);
+ FHOut.outword(DySymTab.tocoff);
+ FHOut.outword(DySymTab.ntoc);
+ FHOut.outword(DySymTab.modtaboff);
+ FHOut.outword(DySymTab.nmodtab);
+ FHOut.outword(DySymTab.extrefsymoff);
+ FHOut.outword(DySymTab.nextrefsyms);
+ FHOut.outword(DySymTab.indirectsymoff);
+ FHOut.outword(DySymTab.nindirectsyms);
+ FHOut.outword(DySymTab.extreloff);
+ FHOut.outword(DySymTab.nextrel);
+ FHOut.outword(DySymTab.locreloff);
+ FHOut.outword(DySymTab.nlocrel);
+
+ O.write((char*)&FH[0], FH.size());
+}
+
+/// EmitSections - Now that we have constructed the file header and load
+/// commands, emit the data for each section to the file.
+void MachOWriter::EmitSections() {
+ for (std::vector<MachOSection*>::iterator I = SectionList.begin(),
+ E = SectionList.end(); I != E; ++I)
+ // Emit the contents of each section
+ O.write((char*)&(*I)->SectionData[0], (*I)->size);
+ for (std::vector<MachOSection*>::iterator I = SectionList.begin(),
+ E = SectionList.end(); I != E; ++I)
+ // Emit the relocation entry data for each section.
+ O.write((char*)&(*I)->RelocBuffer[0], (*I)->RelocBuffer.size());
+}
+
+/// PartitionByLocal - Simple boolean predicate that returns true if Sym is
+/// a local symbol rather than an external symbol.
+bool MachOWriter::PartitionByLocal(const MachOSym &Sym) {
+ return (Sym.n_type & (MachOSym::N_EXT | MachOSym::N_PEXT)) == 0;
+}
+
+/// PartitionByDefined - Simple boolean predicate that returns true if Sym is
+/// defined in this module.
+bool MachOWriter::PartitionByDefined(const MachOSym &Sym) {
+ // FIXME: Do N_ABS or N_INDR count as defined?
+ return (Sym.n_type & MachOSym::N_SECT) == MachOSym::N_SECT;
+}
+
+/// BufferSymbolAndStringTable - Sort the symbols we encountered and assign them
+/// each a string table index so that they appear in the correct order in the
+/// output file.
+void MachOWriter::BufferSymbolAndStringTable() {
+ // The order of the symbol table is:
+ // 1. local symbols
+ // 2. defined external symbols (sorted by name)
+ // 3. undefined external symbols (sorted by name)
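+ // (Illustrative layout after sorting and partitioning:
+ // { locals | defined externals, by name | undefined externals, by name }.)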
+
+ // Before sorting the symbols, check the PendingGlobals for any undefined
+ // globals that need to be put in the symbol table.
+ for (std::vector<GlobalValue*>::iterator I = PendingGlobals.begin(),
+ E = PendingGlobals.end(); I != E; ++I) {
+ if (GVOffset[*I] == 0 && GVSection[*I] == 0) {
+ MachOSym UndfSym(*I, Mang->getValueName(*I), MachOSym::NO_SECT, TM);
+ SymbolTable.push_back(UndfSym);
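+ // Mark the global with a -1 sentinel offset; it is replaced by the
+ // symbol's table index when the nlist entries are emitted below.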
+ GVOffset[*I] = -1;
+ }
+ }
+
+ // Sort the symbols by name, so that when we partition the symbols by scope
+ // of definition, we won't have to sort by name within each partition.
+ std::sort(SymbolTable.begin(), SymbolTable.end(), MachOSymCmp());
+
+ // Partition the symbol table entries so that all local symbols come before
+ // all symbols with external linkage. { 1 | 2 3 }
+ std::partition(SymbolTable.begin(), SymbolTable.end(), PartitionByLocal);
+
+ // Advance iterator to beginning of external symbols and partition so that
+ // all external symbols defined in this module come before all external
+ // symbols defined elsewhere. { 1 | 2 | 3 }
+ for (std::vector<MachOSym>::iterator I = SymbolTable.begin(),
+ E = SymbolTable.end(); I != E; ++I) {
+ if (!PartitionByLocal(*I)) {
+ std::partition(I, E, PartitionByDefined);
+ break;
+ }
+ }
+
+ // Calculate the starting index for each of the local, extern defined, and
+ // undefined symbols, as well as the number of each to put in the LC_DYSYMTAB
+ // load command.
+ for (std::vector<MachOSym>::iterator I = SymbolTable.begin(),
+ E = SymbolTable.end(); I != E; ++I) {
+ if (PartitionByLocal(*I)) {
+ ++DySymTab.nlocalsym;
+ ++DySymTab.iextdefsym;
+ ++DySymTab.iundefsym;
+ } else if (PartitionByDefined(*I)) {
+ ++DySymTab.nextdefsym;
+ ++DySymTab.iundefsym;
+ } else {
+ ++DySymTab.nundefsym;
+ }
+ }
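+ // (Illustrative: with 2 locals, 3 defined externals, and 1 undefined symbol
+ // this yields ilocalsym=0, iextdefsym=2, iundefsym=5.)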
+
+ // Write out a leading zero byte when emitting the string table, so that
+ // n_strx == 0 denotes the empty string.
+ OutputBuffer StrTOut(StrT, is64Bit, isLittleEndian);
+ StrTOut.outbyte(0);
+
+ // The order of the string table is:
+ // 1. strings for external symbols
+ // 2. strings for local symbols
+ // Since this is the opposite order from the symbol table, which we have just
+ // sorted, we can walk the symbol table backwards to output the string table.
+ for (std::vector<MachOSym>::reverse_iterator I = SymbolTable.rbegin(),
+ E = SymbolTable.rend(); I != E; ++I) {
+ if (I->GVName == "") {
+ I->n_strx = 0;
+ } else {
+ I->n_strx = StrT.size();
+ StrTOut.outstring(I->GVName, I->GVName.length()+1);
+ }
+ }
+
+ OutputBuffer SymTOut(SymT, is64Bit, isLittleEndian);
+
+ unsigned index = 0;
+ for (std::vector<MachOSym>::iterator I = SymbolTable.begin(),
+ E = SymbolTable.end(); I != E; ++I, ++index) {
+ // Add the section base address to the section offset in the n_value field
+ // to calculate the full address.
+ // FIXME: handle symbols where the n_value field is not the address
+ GlobalValue *GV = const_cast<GlobalValue*>(I->GV);
+ if (GV && GVSection[GV])
+ I->n_value += GVSection[GV]->addr;
+ if (GV && (GVOffset[GV] == -1))
+ GVOffset[GV] = index;
+
+ // Emit nlist to buffer
+ SymTOut.outword(I->n_strx);
+ SymTOut.outbyte(I->n_type);
+ SymTOut.outbyte(I->n_sect);
+ SymTOut.outhalf(I->n_desc);
+ SymTOut.outaddr(I->n_value);
+ }
+}
+
+/// CalculateRelocations - For each MachineRelocation in the current section,
+/// calculate the index of the section containing the object to be relocated,
+/// and the offset into that section. From this information, create the
+/// appropriate target-specific MachORelocation type and buffer it to be
+/// written out after we are finished writing out sections.
+void MachOWriter::CalculateRelocations(MachOSection &MOS) {
+ for (unsigned i = 0, e = MOS.Relocations.size(); i != e; ++i) {
+ MachineRelocation &MR = MOS.Relocations[i];
+ unsigned TargetSection = MR.getConstantVal();
+ unsigned TargetAddr = 0;
+ unsigned TargetIndex = 0;
+
+ // This is a scattered relocation entry if it points to a global value with
+ // a non-zero offset.
+ bool Scattered = false;
+ bool Extern = false;
+
+ // Since we may not have seen the GlobalValue we were interested in yet at
+ // the time we emitted the relocation for it, fix it up now so that it
+ // points to the offset into the correct section.
+ if (MR.isGlobalValue()) {
+ GlobalValue *GV = MR.getGlobalValue();
+ MachOSection *MOSPtr = GVSection[GV];
+ intptr_t Offset = GVOffset[GV];
+
+ // If we have never seen the global before, it must refer to a symbol
+ // defined in another module (N_UNDF).
+ if (!MOSPtr) {
+ // FIXME: need to append stub suffix
+ Extern = true;
+ TargetAddr = 0;
+ TargetIndex = GVOffset[GV];
+ } else {
+ Scattered = TargetSection != 0;
+ TargetSection = MOSPtr->Index;
+ }
+ MR.setResultPointer((void*)Offset);
+ }
+
+ // If the symbol is locally defined, pass in the address of the section and
+ // the section index to the code which will generate the target relocation.
+ if (!Extern) {
+ MachOSection &To = *SectionList[TargetSection - 1];
+ TargetAddr = To.addr;
+ TargetIndex = To.Index;
+ }
+
+ OutputBuffer RelocOut(MOS.RelocBuffer, is64Bit, isLittleEndian);
+ OutputBuffer SecOut(MOS.SectionData, is64Bit, isLittleEndian);
+
+ MOS.nreloc += GetTargetRelocation(MR, MOS.Index, TargetAddr, TargetIndex,
+ RelocOut, SecOut, Scattered, Extern);
+ }
+}
+
+// InitMem - Write the value of a Constant to the specified memory location,
+// converting it into bytes and relocations.
+void MachOWriter::InitMem(const Constant *C, void *Addr, intptr_t Offset,
+ const TargetData *TD,
+ std::vector<MachineRelocation> &MRs) {
+ typedef std::pair<const Constant*, intptr_t> CPair;
+ std::vector<CPair> WorkList;
+
+ WorkList.push_back(CPair(C,(intptr_t)Addr + Offset));
+
+ intptr_t ScatteredOffset = 0;
+
+ while (!WorkList.empty()) {
+ const Constant *PC = WorkList.back().first;
+ intptr_t PA = WorkList.back().second;
+ WorkList.pop_back();
+
+ if (isa<UndefValue>(PC)) {
+ continue;
+ } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(PC)) {
+ unsigned ElementSize =
+ TD->getTypeAllocSize(CP->getType()->getElementType());
+ for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
+ WorkList.push_back(CPair(CP->getOperand(i), PA+i*ElementSize));
+ } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(PC)) {
+ //
+ // FIXME: Handle ConstantExpression. See EE::getConstantValue()
+ //
+ switch (CE->getOpcode()) {
+ case Instruction::GetElementPtr: {
+ SmallVector<Value*, 8> Indices(CE->op_begin()+1, CE->op_end());
+ ScatteredOffset = TD->getIndexedOffset(CE->getOperand(0)->getType(),
+ &Indices[0], Indices.size());
+ WorkList.push_back(CPair(CE->getOperand(0), PA));
+ break;
+ }
+ case Instruction::Add:
+ default:
+ cerr << "ConstantExpr not handled as global var init: " << *CE << "\n";
+ abort();
+ break;
+ }
+ } else if (PC->getType()->isSingleValueType()) {
+ uint8_t *ptr = (uint8_t *)PA;
+ switch (PC->getType()->getTypeID()) {
+ case Type::IntegerTyID: {
+ unsigned NumBits = cast<IntegerType>(PC->getType())->getBitWidth();
+ uint64_t val = cast<ConstantInt>(PC)->getZExtValue();
+ if (NumBits <= 8)
+ ptr[0] = val;
+ else if (NumBits <= 16) {
+ if (TD->isBigEndian())
+ val = ByteSwap_16(val);
+ ptr[0] = val;
+ ptr[1] = val >> 8;
+ } else if (NumBits <= 32) {
+ if (TD->isBigEndian())
+ val = ByteSwap_32(val);
+ ptr[0] = val;
+ ptr[1] = val >> 8;
+ ptr[2] = val >> 16;
+ ptr[3] = val >> 24;
+ } else if (NumBits <= 64) {
+ if (TD->isBigEndian())
+ val = ByteSwap_64(val);
+ ptr[0] = val;
+ ptr[1] = val >> 8;
+ ptr[2] = val >> 16;
+ ptr[3] = val >> 24;
+ ptr[4] = val >> 32;
+ ptr[5] = val >> 40;
+ ptr[6] = val >> 48;
+ ptr[7] = val >> 56;
+ } else {
+ assert(0 && "Not implemented: bit widths > 64");
+ }
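+ // The stores above always write the least-significant byte first, so the
+ // ByteSwap on big-endian targets lays the value out in big-endian byte
+ // order; e.g. 0x11223344 is emitted as the bytes 11 22 33 44.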
+ break;
+ }
+ case Type::FloatTyID: {
+ uint32_t val = cast<ConstantFP>(PC)->getValueAPF().bitcastToAPInt().
+ getZExtValue();
+ if (TD->isBigEndian())
+ val = ByteSwap_32(val);
+ ptr[0] = val;
+ ptr[1] = val >> 8;
+ ptr[2] = val >> 16;
+ ptr[3] = val >> 24;
+ break;
+ }
+ case Type::DoubleTyID: {
+ uint64_t val = cast<ConstantFP>(PC)->getValueAPF().bitcastToAPInt().
+ getZExtValue();
+ if (TD->isBigEndian())
+ val = ByteSwap_64(val);
+ ptr[0] = val;
+ ptr[1] = val >> 8;
+ ptr[2] = val >> 16;
+ ptr[3] = val >> 24;
+ ptr[4] = val >> 32;
+ ptr[5] = val >> 40;
+ ptr[6] = val >> 48;
+ ptr[7] = val >> 56;
+ break;
+ }
+ case Type::PointerTyID:
+ if (isa<ConstantPointerNull>(PC))
+ memset(ptr, 0, TD->getPointerSize());
+ else if (const GlobalValue* GV = dyn_cast<GlobalValue>(PC)) {
+ // FIXME: what about function stubs?
+ MRs.push_back(MachineRelocation::getGV(PA-(intptr_t)Addr,
+ MachineRelocation::VANILLA,
+ const_cast<GlobalValue*>(GV),
+ ScatteredOffset));
+ ScatteredOffset = 0;
+ } else
+ assert(0 && "Unknown constant pointer type!");
+ break;
+ default:
+ cerr << "ERROR: Constant unimp for type: " << *PC->getType() << "\n";
+ abort();
+ }
+ } else if (isa<ConstantAggregateZero>(PC)) {
+ memset((void*)PA, 0, (size_t)TD->getTypeAllocSize(PC->getType()));
+ } else if (const ConstantArray *CPA = dyn_cast<ConstantArray>(PC)) {
+ unsigned ElementSize =
+ TD->getTypeAllocSize(CPA->getType()->getElementType());
+ for (unsigned i = 0, e = CPA->getNumOperands(); i != e; ++i)
+ WorkList.push_back(CPair(CPA->getOperand(i), PA+i*ElementSize));
+ } else if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(PC)) {
+ const StructLayout *SL =
+ TD->getStructLayout(cast<StructType>(CPS->getType()));
+ for (unsigned i = 0, e = CPS->getNumOperands(); i != e; ++i)
+ WorkList.push_back(CPair(CPS->getOperand(i),
+ PA+SL->getElementOffset(i)));
+ } else {
+ cerr << "Bad Type: " << *PC->getType() << "\n";
+ assert(0 && "Unknown constant type to initialize memory with!");
+ }
+ }
+}
+
+MachOSym::MachOSym(const GlobalValue *gv, std::string name, uint8_t sect,
+ TargetMachine &TM) :
+ GV(gv), n_strx(0), n_type(sect == NO_SECT ? N_UNDF : N_SECT), n_sect(sect),
+ n_desc(0), n_value(0) {
+
+ const TargetAsmInfo *TAI = TM.getTargetAsmInfo();
+
+ switch (GV->getLinkage()) {
+ default:
+ assert(0 && "Unexpected linkage type!");
+ break;
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::CommonLinkage:
+ assert(!isa<Function>(gv) && "Unexpected linkage type for Function!");
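+ // FALL THROUGH: weak and common globals are named and flagged like
+ // external symbols.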
+ case GlobalValue::ExternalLinkage:
+ GVName = TAI->getGlobalPrefix() + name;
+ n_type |= GV->hasHiddenVisibility() ? N_PEXT : N_EXT;
+ break;
+ case GlobalValue::PrivateLinkage:
+ GVName = TAI->getPrivateGlobalPrefix() + name;
+ break;
+ case GlobalValue::InternalLinkage:
+ GVName = TAI->getGlobalPrefix() + name;
+ break;
+ }
+}
diff --git a/lib/CodeGen/MachOWriter.h b/lib/CodeGen/MachOWriter.h
new file mode 100644
index 0000000..6ab66ee
--- /dev/null
+++ b/lib/CodeGen/MachOWriter.h
@@ -0,0 +1,629 @@
+//=== MachOWriter.h - Target-independent Mach-O writer support --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachOWriter class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MACHOWRITER_H
+#define MACHOWRITER_H
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRelocation.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetMachOWriterInfo.h"
+#include <map>
+
+namespace llvm {
+ class GlobalVariable;
+ class Mangler;
+ class MachineCodeEmitter;
+ class MachOCodeEmitter;
+ class OutputBuffer;
+ class raw_ostream;
+
+ /// MachOSym - This struct contains information about each symbol that is
+ /// added to the logical symbol table for the module. This is eventually
+ /// turned into a real symbol table in the file.
+ struct MachOSym {
+ const GlobalValue *GV; // The global value this corresponds to.
+ std::string GVName; // The mangled name of the global value.
+ uint32_t n_strx; // index into the string table
+ uint8_t n_type; // type flag
+ uint8_t n_sect; // section number or NO_SECT
+ int16_t n_desc; // see <mach-o/stab.h>
+ uint64_t n_value; // value for this symbol (or stab offset)
+
+ // Constants for the n_sect field
+ // see <mach-o/nlist.h>
+ enum { NO_SECT = 0 }; // symbol is not in any section
+
+ // Constants for the n_type field
+ // see <mach-o/nlist.h>
+ enum { N_UNDF = 0x0, // undefined, n_sect == NO_SECT
+ N_ABS = 0x2, // absolute, n_sect == NO_SECT
+ N_SECT = 0xe, // defined in section number n_sect
+ N_PBUD = 0xc, // prebound undefined (defined in a dylib)
+ N_INDR = 0xa // indirect
+ };
+ // The following bits are OR'd into the types above. For example, a type
+ // of 0x0f would be an external N_SECT symbol (0x0e | 0x01).
+ enum { N_EXT = 0x01, // external symbol bit
+ N_PEXT = 0x10 // private external symbol bit
+ };
+
+ // Constants for the n_desc field
+ // see <mach-o/loader.h>
+ enum { REFERENCE_FLAG_UNDEFINED_NON_LAZY = 0,
+ REFERENCE_FLAG_UNDEFINED_LAZY = 1,
+ REFERENCE_FLAG_DEFINED = 2,
+ REFERENCE_FLAG_PRIVATE_DEFINED = 3,
+ REFERENCE_FLAG_PRIVATE_UNDEFINED_NON_LAZY = 4,
+ REFERENCE_FLAG_PRIVATE_UNDEFINED_LAZY = 5
+ };
+ enum { N_NO_DEAD_STRIP = 0x0020, // symbol is not to be dead stripped
+ N_WEAK_REF = 0x0040, // symbol is weak referenced
+ N_WEAK_DEF = 0x0080 // coalesced symbol is a weak definition
+ };
+
+ MachOSym(const GlobalValue *gv, std::string name, uint8_t sect,
+ TargetMachine &TM);
+ };
+
+ /// MachOWriter - This class implements the common target-independent code for
+ /// writing Mach-O files. Targets should derive a class from this to
+ /// parameterize the output format.
+ ///
+ class MachOWriter : public MachineFunctionPass {
+ friend class MachOCodeEmitter;
+ public:
+ static char ID;
+ MachineCodeEmitter &getMachineCodeEmitter() const {
+ return *(MachineCodeEmitter*)MCE;
+ }
+
+ MachOWriter(raw_ostream &O, TargetMachine &TM);
+ virtual ~MachOWriter();
+
+ virtual const char *getPassName() const {
+ return "Mach-O Writer";
+ }
+
+ typedef std::vector<uint8_t> DataBuffer;
+ protected:
+ /// Output stream to send the resultant object file to.
+ ///
+ raw_ostream &O;
+
+ /// Target machine description.
+ ///
+ TargetMachine &TM;
+
+ /// Mang - The object used to perform name mangling for this module.
+ ///
+ Mangler *Mang;
+
+ /// MCE - The MachineCodeEmitter object that we are exposing to emit machine
+ /// code for functions to the .o file.
+ MachOCodeEmitter *MCE;
+
+ /// is64Bit/isLittleEndian - This information is inferred from the target
+ /// machine directly, indicating what header values and flags to set.
+ bool is64Bit, isLittleEndian;
+
+ /// doInitialization - Emit the file header and all of the global variables
+ /// for the module to the Mach-O file.
+ bool doInitialization(Module &M);
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ /// doFinalization - Now that the module has been completely processed, emit
+ /// the Mach-O file to 'O'.
+ bool doFinalization(Module &M);
+
+ /// MachOHeader - This struct contains the header information about a
+ /// specific architecture type/subtype pair that is emitted to the file.
+ struct MachOHeader {
+ uint32_t magic; // mach magic number identifier
+ uint32_t filetype; // type of file
+ uint32_t ncmds; // number of load commands
+ uint32_t sizeofcmds; // the size of all the load commands
+ uint32_t flags; // flags
+ uint32_t reserved; // 64-bit only
+
+ /// HeaderData - The actual data for the header which we are building
+ /// up for emission to the file.
+ DataBuffer HeaderData;
+
+ // Constants for the filetype field
+ // see <mach-o/loader.h> for additional info on the various types
+ enum { MH_OBJECT = 1, // relocatable object file
+ MH_EXECUTE = 2, // demand paged executable file
+ MH_FVMLIB = 3, // fixed VM shared library file
+ MH_CORE = 4, // core file
+ MH_PRELOAD = 5, // preloaded executable file
+ MH_DYLIB = 6, // dynamically bound shared library
+ MH_DYLINKER = 7, // dynamic link editor
+ MH_BUNDLE = 8, // dynamically bound bundle file
+ MH_DYLIB_STUB = 9, // shared library stub for static linking only
+ MH_DSYM = 10 // companion file with only debug sections
+ };
+
+ // Constants for the flags field
+ enum { MH_NOUNDEFS = 1 << 0,
+ // the object file has no undefined references
+ MH_INCRLINK = 1 << 1,
+ // the object file is the output of an incremental link against
+ // a base file and cannot be link edited again
+ MH_DYLDLINK = 1 << 2,
+ // the object file is input for the dynamic linker and cannot be
+ // statically link edited again.
+ MH_BINDATLOAD = 1 << 3,
+ // the object file's undefined references are bound by the
+ // dynamic linker when loaded.
+ MH_PREBOUND = 1 << 4,
+ // the file has its dynamic undefined references prebound
+ MH_SPLIT_SEGS = 1 << 5,
+ // the file has its read-only and read-write segments split
+ // see <mach/shared_memory_server.h>
+ MH_LAZY_INIT = 1 << 6,
+ // the shared library init routine is to be run lazily via
+ // catching memory faults to its writable segments (obsolete)
+ MH_TWOLEVEL = 1 << 7,
+ // the image is using two-level namespace bindings
+ MH_FORCE_FLAT = 1 << 8,
+ // the executable is forcing all images to use flat namespace
+ // bindings.
+ MH_NOMULTIDEFS = 1 << 9,
+ // this umbrella guarantees no multiple definitions of symbols
+ // in its sub-images so the two-level namespace hints can
+ // always be used.
+ MH_NOFIXPREBINDING = 1 << 10,
+ // do not have dyld notify the prebinding agent about this
+ // executable.
+ MH_PREBINDABLE = 1 << 11,
+ // the binary is not prebound but can have its prebinding
+ // redone. only used when MH_PREBOUND is not set.
+ MH_ALLMODSBOUND = 1 << 12,
+ // indicates that this binary binds to all two-level namespace
+ // modules of its dependent libraries. Only used when
+ // MH_PREBINDABLE and MH_TWOLEVEL are both set.
+ MH_SUBSECTIONS_VIA_SYMBOLS = 1 << 13,
+ // safe to divide up the sections into sub-sections via symbols
+ // for dead code stripping.
+ MH_CANONICAL = 1 << 14,
+ // the binary has been canonicalized via the unprebind operation
+ MH_WEAK_DEFINES = 1 << 15,
+ // the final linked image contains external weak symbols
+ MH_BINDS_TO_WEAK = 1 << 16,
+ // the final linked image uses weak symbols
+ MH_ALLOW_STACK_EXECUTION = 1 << 17
+ // When this bit is set, all stacks in the task will be given
+ // stack execution privilege. Only used in MH_EXECUTE filetype
+ };
+
+ MachOHeader() : magic(0), filetype(0), ncmds(0), sizeofcmds(0), flags(0),
+ reserved(0) { }
+
+ /// cmdSize - This routine returns the size of the MachOSection as written
+ /// to disk, depending on whether the destination is a 64 bit Mach-O file.
+ unsigned cmdSize(bool is64Bit) const {
+ if (is64Bit)
+ return 8 * sizeof(uint32_t);
+ else
+ return 7 * sizeof(uint32_t);
+ }
+
+ /// setMagic - This routine sets the appropriate value for the 'magic'
+ /// field based on pointer size and endianness.
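+ /// For example, a 32-bit big-endian target gets 0xfeedface, while a
+ /// 64-bit little-endian target gets the byte-swapped form 0xcffaedfe.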
+ void setMagic(bool isLittleEndian, bool is64Bit) {
+ if (isLittleEndian)
+ if (is64Bit) magic = 0xcffaedfe;
+ else magic = 0xcefaedfe;
+ else
+ if (is64Bit) magic = 0xfeedfacf;
+ else magic = 0xfeedface;
+ }
+ };
+
+ /// Header - An instance of MachOHeader that we will update while we build
+ /// the file, and then emit during finalization.
+ MachOHeader Header;
+
+ /// MachOSegment - This struct contains the necessary information to
+ /// emit the load commands for each section in the file.
+ struct MachOSegment {
+ uint32_t cmd; // LC_SEGMENT or LC_SEGMENT_64
+ uint32_t cmdsize; // Total size of this struct and section commands
+ std::string segname; // segment name
+ uint64_t vmaddr; // address of this segment
+ uint64_t vmsize; // size of this segment, may be larger than filesize
+ uint64_t fileoff; // offset in file
+ uint64_t filesize; // amount to read from file
+ uint32_t maxprot; // maximum VM protection
+ uint32_t initprot; // initial VM protection
+ uint32_t nsects; // number of sections in this segment
+ uint32_t flags; // flags
+
+ // The following constants are getting pulled in by one of the
+ // system headers, which creates a neat clash with the enum.
+#if !defined(VM_PROT_NONE)
+#define VM_PROT_NONE 0x00
+#endif
+#if !defined(VM_PROT_READ)
+#define VM_PROT_READ 0x01
+#endif
+#if !defined(VM_PROT_WRITE)
+#define VM_PROT_WRITE 0x02
+#endif
+#if !defined(VM_PROT_EXECUTE)
+#define VM_PROT_EXECUTE 0x04
+#endif
+#if !defined(VM_PROT_ALL)
+#define VM_PROT_ALL 0x07
+#endif
+
+ // Constants for the vm protection fields
+ // see <mach-o/vm_prot.h>
+ enum { SEG_VM_PROT_NONE = VM_PROT_NONE,
+ SEG_VM_PROT_READ = VM_PROT_READ, // read permission
+ SEG_VM_PROT_WRITE = VM_PROT_WRITE, // write permission
+ SEG_VM_PROT_EXECUTE = VM_PROT_EXECUTE,
+ SEG_VM_PROT_ALL = VM_PROT_ALL
+ };
+
+ // Constants for the cmd field
+ // see <mach-o/loader.h>
+ enum { LC_SEGMENT = 0x01, // segment of this file to be mapped
+ LC_SEGMENT_64 = 0x19 // 64-bit segment of this file to be mapped
+ };
+
+ /// cmdSize - This routine returns the size of the MachOSection as written
+ /// to disk, depending on whether the destination is a 64 bit Mach-O file.
+ unsigned cmdSize(bool is64Bit) const {
+ if (is64Bit)
+ return 6 * sizeof(uint32_t) + 4 * sizeof(uint64_t) + 16;
+ else
+ return 10 * sizeof(uint32_t) + 16; // addresses only 32 bits
+ }
+
+ MachOSegment(const std::string &seg, bool is64Bit)
+ : cmd(is64Bit ? LC_SEGMENT_64 : LC_SEGMENT), cmdsize(0), segname(seg),
+ vmaddr(0), vmsize(0), fileoff(0), filesize(0), maxprot(VM_PROT_ALL),
+ initprot(VM_PROT_ALL), nsects(0), flags(0) { }
+ };
+
+ /// MachOSection - This struct contains information about each section in a
+ /// particular segment that is emitted to the file. This is eventually
+ /// turned into the SectionCommand in the load command for a particular
+ /// segment.
+ struct MachOSection {
+ std::string sectname; // name of this section,
+ std::string segname; // segment this section goes in
+ uint64_t addr; // memory address of this section
+ uint64_t size; // size in bytes of this section
+ uint32_t offset; // file offset of this section
+ uint32_t align; // section alignment (power of 2)
+ uint32_t reloff; // file offset of relocation entries
+ uint32_t nreloc; // number of relocation entries
+ uint32_t flags; // flags (section type and attributes)
+ uint32_t reserved1; // reserved (for offset or index)
+ uint32_t reserved2; // reserved (for count or sizeof)
+ uint32_t reserved3; // reserved (64 bit only)
+
+ /// A unique number for this section, which will be used to match symbols
+ /// to the correct section.
+ uint32_t Index;
+
+ /// SectionData - The actual data for this section which we are building
+ /// up for emission to the file.
+ DataBuffer SectionData;
+
+ /// RelocBuffer - A buffer to hold the mach-o relocations before we write
+ /// them out at the appropriate location in the file.
+ DataBuffer RelocBuffer;
+
+ /// Relocations - The relocations that we have encountered so far in this
+ /// section that we will need to convert to MachORelocation entries when
+ /// the file is written.
+ std::vector<MachineRelocation> Relocations;
+
+ // Constants for the section types (low 8 bits of flags field)
+ // see <mach-o/loader.h>
+ enum { S_REGULAR = 0,
+ // regular section
+ S_ZEROFILL = 1,
+ // zero fill on demand section
+ S_CSTRING_LITERALS = 2,
+ // section with only literal C strings
+ S_4BYTE_LITERALS = 3,
+ // section with only 4 byte literals
+ S_8BYTE_LITERALS = 4,
+ // section with only 8 byte literals
+ S_LITERAL_POINTERS = 5,
+ // section with only pointers to literals
+ S_NON_LAZY_SYMBOL_POINTERS = 6,
+ // section with only non-lazy symbol pointers
+ S_LAZY_SYMBOL_POINTERS = 7,
+ // section with only lazy symbol pointers
+ S_SYMBOL_STUBS = 8,
+ // section with only symbol stubs
+ // byte size of stub in the reserved2 field
+ S_MOD_INIT_FUNC_POINTERS = 9,
+ // section with only function pointers for initialization
+ S_MOD_TERM_FUNC_POINTERS = 10,
+ // section with only function pointers for termination
+ S_COALESCED = 11,
+ // section contains symbols that are coalesced
+ S_GB_ZEROFILL = 12,
+ // zero fill on demand section (that can be larger than 4GB)
+ S_INTERPOSING = 13,
+ // section with only pairs of function pointers for interposing
+ S_16BYTE_LITERALS = 14
+ // section with only 16 byte literals
+ };
+
+ // Constants for the section flags (high 24 bits of flags field)
+ // see <mach-o/loader.h>
+ enum { S_ATTR_PURE_INSTRUCTIONS = 1 << 31,
+ // section contains only true machine instructions
+ S_ATTR_NO_TOC = 1 << 30,
+ // section contains coalesced symbols that are not to be in a
+ // ranlib table of contents
+ S_ATTR_STRIP_STATIC_SYMS = 1 << 29,
+ // ok to strip static symbols in this section in files with the
+ // MH_DYLDLINK flag
+ S_ATTR_NO_DEAD_STRIP = 1 << 28,
+ // no dead stripping
+ S_ATTR_LIVE_SUPPORT = 1 << 27,
+ // blocks are live if they reference live blocks
+ S_ATTR_SELF_MODIFYING_CODE = 1 << 26,
+ // used with i386 code stubs written on by dyld
+ S_ATTR_DEBUG = 1 << 25,
+ // a debug section
+ S_ATTR_SOME_INSTRUCTIONS = 1 << 10,
+ // section contains some machine instructions
+ S_ATTR_EXT_RELOC = 1 << 9,
+ // section has external relocation entries
+ S_ATTR_LOC_RELOC = 1 << 8
+ // section has local relocation entries
+ };
+
+ /// cmdSize - This routine returns the size of the MachOSection as written
+ /// to disk, depending on whether the destination is a 64 bit Mach-O file.
+ unsigned cmdSize(bool is64Bit) const {
+ if (is64Bit)
+ return 7 * sizeof(uint32_t) + 2 * sizeof(uint64_t) + 32;
+ else
+ return 9 * sizeof(uint32_t) + 32; // addresses only 32 bits
+ }
+
+ MachOSection(const std::string &seg, const std::string &sect)
+ : sectname(sect), segname(seg), addr(0), size(0), offset(0), align(2),
+ reloff(0), nreloc(0), flags(0), reserved1(0), reserved2(0),
+ reserved3(0) { }
+ };
+
+ private:
+
+ /// SectionList - This is the list of sections that we have emitted to the
+ /// file. Once the file has been completely built, the segment load command
+ /// SectionCommands are constructed from this info.
+ std::vector<MachOSection*> SectionList;
+
+ /// SectionLookup - This is a mapping from section name to SectionList entry
+ std::map<std::string, MachOSection*> SectionLookup;
+
+ /// GVSection - This is a mapping from a GlobalValue to a MachOSection,
+ /// to aid in emitting relocations.
+ std::map<GlobalValue*, MachOSection*> GVSection;
+
+ /// GVOffset - This is a mapping from a GlobalValue to an offset from the
+ /// start of the section in which the GV resides, to aid in emitting
+ /// relocations.
+ std::map<GlobalValue*, intptr_t> GVOffset;
+
+ /// getSection - Return the section with the specified name, creating a new
+ /// section if one does not already exist.
+ MachOSection *getSection(const std::string &seg, const std::string &sect,
+ unsigned Flags = 0) {
+ MachOSection *MOS = SectionLookup[seg+sect];
+ if (MOS) return MOS;
+
+ MOS = new MachOSection(seg, sect);
+ SectionList.push_back(MOS);
+ MOS->Index = SectionList.size();
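+ // Section indices are 1-based (0 is reserved for NO_SECT), so consumers
+ // such as CalculateRelocations index SectionList with Index - 1.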
+ MOS->flags = MachOSection::S_REGULAR | Flags;
+ SectionLookup[seg+sect] = MOS;
+ return MOS;
+ }
+ MachOSection *getTextSection(bool isCode = true) {
+ if (isCode)
+ return getSection("__TEXT", "__text",
+ MachOSection::S_ATTR_PURE_INSTRUCTIONS |
+ MachOSection::S_ATTR_SOME_INSTRUCTIONS);
+ else
+ return getSection("__TEXT", "__text");
+ }
+ MachOSection *getBSSSection() {
+ return getSection("__DATA", "__bss", MachOSection::S_ZEROFILL);
+ }
+ MachOSection *getDataSection() {
+ return getSection("__DATA", "__data");
+ }
+ MachOSection *getConstSection(Constant *C) {
+ const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+ if (CVA && CVA->isCString())
+ return getSection("__TEXT", "__cstring",
+ MachOSection::S_CSTRING_LITERALS);
+
+ const Type *Ty = C->getType();
+ if (Ty->isPrimitiveType() || Ty->isInteger()) {
+ unsigned Size = TM.getTargetData()->getTypeAllocSize(Ty);
+ switch(Size) {
+ default: break; // Fall through to __TEXT,__const
+ case 4:
+ return getSection("__TEXT", "__literal4",
+ MachOSection::S_4BYTE_LITERALS);
+ case 8:
+ return getSection("__TEXT", "__literal8",
+ MachOSection::S_8BYTE_LITERALS);
+ case 16:
+ return getSection("__TEXT", "__literal16",
+ MachOSection::S_16BYTE_LITERALS);
+ }
+ }
+ return getSection("__TEXT", "__const");
+ }
+ MachOSection *getJumpTableSection() {
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return getTextSection(false);
+ else
+ return getSection("__TEXT", "__const");
+ }
+
+ /// MachOSymTab - This struct contains information about the offsets and
+ /// size of the symbol table information in the LC_SYMTAB load command.
+ struct MachOSymTab {
+ uint32_t cmd; // LC_SYMTAB
+ uint32_t cmdsize; // sizeof( MachOSymTab )
+ uint32_t symoff; // symbol table offset
+ uint32_t nsyms; // number of symbol table entries
+ uint32_t stroff; // string table offset
+ uint32_t strsize; // string table size in bytes
+
+ // Constants for the cmd field
+ // see <mach-o/loader.h>
+ enum { LC_SYMTAB = 0x02 // link-edit stab symbol table info
+ };
+
+ MachOSymTab() : cmd(LC_SYMTAB), cmdsize(6 * sizeof(uint32_t)), symoff(0),
+ nsyms(0), stroff(0), strsize(0) { }
+ };
+
+ /// MachODySymTab - This struct contains information about the offsets and
+ /// sizes of the dynamic symbol table information in the LC_DYSYMTAB load
+ /// command.
+ struct MachODySymTab {
+ uint32_t cmd; // LC_DYSYMTAB
+ uint32_t cmdsize; // sizeof( MachODySymTab )
+ uint32_t ilocalsym; // index to local symbols
+ uint32_t nlocalsym; // number of local symbols
+ uint32_t iextdefsym; // index to externally defined symbols
+ uint32_t nextdefsym; // number of externally defined symbols
+ uint32_t iundefsym; // index to undefined symbols
+ uint32_t nundefsym; // number of undefined symbols
+ uint32_t tocoff; // file offset to table of contents
+ uint32_t ntoc; // number of entries in table of contents
+ uint32_t modtaboff; // file offset to module table
+ uint32_t nmodtab; // number of module table entries
+ uint32_t extrefsymoff; // offset to referenced symbol table
+ uint32_t nextrefsyms; // number of referenced symbol table entries
+ uint32_t indirectsymoff; // file offset to the indirect symbol table
+ uint32_t nindirectsyms; // number of indirect symbol table entries
+ uint32_t extreloff; // offset to external relocation entries
+ uint32_t nextrel; // number of external relocation entries
+ uint32_t locreloff; // offset to local relocation entries
+ uint32_t nlocrel; // number of local relocation entries
+
+ // Constants for the cmd field
+ // see <mach-o/loader.h>
+ enum { LC_DYSYMTAB = 0x0B // dynamic link-edit symbol table info
+ };
+
+ MachODySymTab() : cmd(LC_DYSYMTAB), cmdsize(20 * sizeof(uint32_t)),
+ ilocalsym(0), nlocalsym(0), iextdefsym(0), nextdefsym(0),
+ iundefsym(0), nundefsym(0), tocoff(0), ntoc(0), modtaboff(0),
+ nmodtab(0), extrefsymoff(0), nextrefsyms(0), indirectsymoff(0),
+ nindirectsyms(0), extreloff(0), nextrel(0), locreloff(0), nlocrel(0) { }
+ };
+
+ /// SymTab - The "stab" style symbol table information
+ MachOSymTab SymTab;
+ /// DySymTab - symbol table info for the dynamic link editor
+ MachODySymTab DySymTab;
+
+ struct MachOSymCmp {
+ // FIXME: this does not appear to be sorting 'f' after 'F'
+ bool operator()(const MachOSym &LHS, const MachOSym &RHS) {
+ return LHS.GVName < RHS.GVName;
+ }
+ };
+
+ /// PartitionByLocal - Simple boolean predicate that returns true if Sym is
+ /// a local symbol rather than an external symbol.
+ static bool PartitionByLocal(const MachOSym &Sym);
+
+ /// PartitionByDefined - Simple boolean predicate that returns true if Sym
+ /// is defined in this module.
+ static bool PartitionByDefined(const MachOSym &Sym);
+
+ protected:
+
+ /// SymbolTable - This is the list of symbols we have emitted to the file.
+ /// This actually gets rearranged before emission to the file (to put the
+ /// local symbols first in the list).
+ std::vector<MachOSym> SymbolTable;
+
+ /// SymT - A buffer to hold the symbol table before we write it out at the
+ /// appropriate location in the file.
+ DataBuffer SymT;
+
+ /// StrT - A buffer to hold the string table before we write it out at the
+ /// appropriate location in the file.
+ DataBuffer StrT;
+
+    /// PendingGlobals - This is a list of externally defined symbols that we
+    /// have been asked to emit, but have not yet seen a reference to. When a
+    /// reference is seen, the symbol will move from this list to SymbolTable.
+ std::vector<GlobalValue*> PendingGlobals;
+
+ /// DynamicSymbolTable - This is just a vector of indices into
+ /// SymbolTable to aid in emitting the DYSYMTAB load command.
+ std::vector<unsigned> DynamicSymbolTable;
+
+ static void InitMem(const Constant *C, void *Addr, intptr_t Offset,
+ const TargetData *TD,
+ std::vector<MachineRelocation> &MRs);
+
+ private:
+ void AddSymbolToSection(MachOSection *MOS, GlobalVariable *GV);
+ void EmitGlobal(GlobalVariable *GV);
+ void EmitHeaderAndLoadCommands();
+ void EmitSections();
+ void BufferSymbolAndStringTable();
+ void CalculateRelocations(MachOSection &MOS);
+
+ MachineRelocation GetJTRelocation(unsigned Offset,
+ MachineBasicBlock *MBB) const {
+ return TM.getMachOWriterInfo()->GetJTRelocation(Offset, MBB);
+ }
+
+    /// GetTargetRelocation - Have the target write out this relocation;
+    /// returns the number of relocation entries emitted.
+ unsigned GetTargetRelocation(MachineRelocation &MR,
+ unsigned FromIdx,
+ unsigned ToAddr,
+ unsigned ToIndex,
+ OutputBuffer &RelocOut,
+ OutputBuffer &SecOut,
+ bool Scattered,
+ bool Extern) {
+ return TM.getMachOWriterInfo()->GetTargetRelocation(MR, FromIdx, ToAddr,
+ ToIndex, RelocOut,
+ SecOut, Scattered,
+ Extern);
+ }
+ };
+}
+
+#endif
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
new file mode 100644
index 0000000..71e6b3e
--- /dev/null
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -0,0 +1,372 @@
+//===-- llvm/CodeGen/MachineBasicBlock.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect the sequence of machine instructions for a basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrDesc.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/LeakDetector.h"
+#include <algorithm>
+using namespace llvm;
+
+MachineBasicBlock::MachineBasicBlock(MachineFunction &mf, const BasicBlock *bb)
+ : BB(bb), Number(-1), xParent(&mf), Alignment(0), IsLandingPad(false) {
+ Insts.Parent = this;
+}
+
+MachineBasicBlock::~MachineBasicBlock() {
+ LeakDetector::removeGarbageObject(this);
+}
+
+std::ostream& llvm::operator<<(std::ostream &OS, const MachineBasicBlock &MBB) {
+ MBB.print(OS);
+ return OS;
+}
+
+/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the
+/// parent pointer of the MBB, the MBB numbering, and any instructions in the
+/// MBB to be on the right operand list for registers.
+///
+/// MBBs start out as #-1. When an MBB is added to a MachineFunction, it
+/// gets the next available unique MBB number. If it is removed from a
+/// MachineFunction, it goes back to being #-1.
+void ilist_traits<MachineBasicBlock>::addNodeToList(MachineBasicBlock* N) {
+ MachineFunction &MF = *N->getParent();
+ N->Number = MF.addToMBBNumbering(N);
+
+ // Make sure the instructions have their operands in the reginfo lists.
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ for (MachineBasicBlock::iterator I = N->begin(), E = N->end(); I != E; ++I)
+ I->AddRegOperandsToUseLists(RegInfo);
+
+ LeakDetector::removeGarbageObject(N);
+}
+
+void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock* N) {
+ N->getParent()->removeFromMBBNumbering(N->Number);
+ N->Number = -1;
+ LeakDetector::addGarbageObject(N);
+}
+
+
+/// addNodeToList (MI) - When we add an instruction to a basic block
+/// list, we update its parent pointer and add its operands to the reg use/def
+/// lists if appropriate.
+void ilist_traits<MachineInstr>::addNodeToList(MachineInstr* N) {
+ assert(N->getParent() == 0 && "machine instruction already in a basic block");
+ N->setParent(Parent);
+
+ // Add the instruction's register operands to their corresponding
+ // use/def lists.
+ MachineFunction *MF = Parent->getParent();
+ N->AddRegOperandsToUseLists(MF->getRegInfo());
+
+ LeakDetector::removeGarbageObject(N);
+}
+
+/// removeNodeFromList (MI) - When we remove an instruction from a basic block
+/// list, we update its parent pointer and remove its operands from reg use/def
+/// lists if appropriate.
+void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr* N) {
+ assert(N->getParent() != 0 && "machine instruction not in a basic block");
+
+ // Remove from the use/def lists.
+ N->RemoveRegOperandsFromUseLists();
+
+ N->setParent(0);
+
+ LeakDetector::addGarbageObject(N);
+}
+
+/// transferNodesFromList (MI) - When moving a range of instructions from one
+/// MBB list to another, we need to update the parent pointers and the use/def
+/// lists.
+void ilist_traits<MachineInstr>::transferNodesFromList(
+ ilist_traits<MachineInstr>& fromList,
+ MachineBasicBlock::iterator first,
+ MachineBasicBlock::iterator last) {
+ assert(Parent->getParent() == fromList.Parent->getParent() &&
+ "MachineInstr parent mismatch!");
+
+ // Splice within the same MBB -> no change.
+ if (Parent == fromList.Parent) return;
+
+ // If splicing between two blocks within the same function, just update the
+ // parent pointers.
+ for (; first != last; ++first)
+ first->setParent(Parent);
+}
+
+void ilist_traits<MachineInstr>::deleteNode(MachineInstr* MI) {
+ assert(!MI->getParent() && "MI is still in a block!");
+ Parent->getParent()->DeleteMachineInstr(MI);
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() {
+ iterator I = end();
+ while (I != begin() && (--I)->getDesc().isTerminator())
+    ; /* noop */
+ if (I != end() && !I->getDesc().isTerminator()) ++I;
+ return I;
+}
+
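+/// isOnlyReachableByFallthrough - Return true if this basic block has exactly
+/// one predecessor and control can only reach it by falling through: the
+/// block is not a landing pad, its single predecessor is its layout
+/// predecessor, and that predecessor does not end in a barrier instruction.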
+bool
+MachineBasicBlock::isOnlyReachableByFallthrough() const {
+ return !isLandingPad() &&
+ !pred_empty() &&
+ next(pred_begin()) == pred_end() &&
+ (*pred_begin())->isLayoutSuccessor(this) &&
+ ((*pred_begin())->empty() ||
+ !(*pred_begin())->back().getDesc().isBarrier());
+}
+
+void MachineBasicBlock::dump() const {
+ print(*cerr.stream());
+}
+
+static inline void OutputReg(std::ostream &os, unsigned RegNo,
+ const TargetRegisterInfo *TRI = 0) {
+ if (!RegNo || TargetRegisterInfo::isPhysicalRegister(RegNo)) {
+ if (TRI)
+ os << " %" << TRI->get(RegNo).Name;
+ else
+ os << " %mreg(" << RegNo << ")";
+ } else
+ os << " %reg" << RegNo;
+}
+
+void MachineBasicBlock::print(std::ostream &OS) const {
+ const MachineFunction *MF = getParent();
+  if (!MF) {
+ OS << "Can't print out MachineBasicBlock because parent MachineFunction"
+ << " is null\n";
+ return;
+ }
+
+ const BasicBlock *LBB = getBasicBlock();
+ OS << "\n";
+ if (LBB) OS << LBB->getName() << ": ";
+ OS << (const void*)this
+ << ", LLVM BB @" << (const void*) LBB << ", ID#" << getNumber();
+ if (Alignment) OS << ", Alignment " << Alignment;
+ if (isLandingPad()) OS << ", EH LANDING PAD";
+ OS << ":\n";
+
+ const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+ if (!livein_empty()) {
+ OS << "Live Ins:";
+ for (const_livein_iterator I = livein_begin(),E = livein_end(); I != E; ++I)
+ OutputReg(OS, *I, TRI);
+ OS << "\n";
+ }
+ // Print the preds of this block according to the CFG.
+ if (!pred_empty()) {
+ OS << " Predecessors according to CFG:";
+ for (const_pred_iterator PI = pred_begin(), E = pred_end(); PI != E; ++PI)
+ OS << " " << *PI << " (#" << (*PI)->getNumber() << ")";
+ OS << "\n";
+ }
+
+ for (const_iterator I = begin(); I != end(); ++I) {
+ OS << "\t";
+ I->print(OS, &getParent()->getTarget());
+ }
+
+ // Print the successors of this block according to the CFG.
+ if (!succ_empty()) {
+ OS << " Successors according to CFG:";
+ for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI)
+ OS << " " << *SI << " (#" << (*SI)->getNumber() << ")";
+ OS << "\n";
+ }
+}
+
+void MachineBasicBlock::removeLiveIn(unsigned Reg) {
+ livein_iterator I = std::find(livein_begin(), livein_end(), Reg);
+ assert(I != livein_end() && "Not a live in!");
+ LiveIns.erase(I);
+}
+
+bool MachineBasicBlock::isLiveIn(unsigned Reg) const {
+ const_livein_iterator I = std::find(livein_begin(), livein_end(), Reg);
+ return I != livein_end();
+}
+
+void MachineBasicBlock::moveBefore(MachineBasicBlock *NewAfter) {
+ getParent()->splice(NewAfter, this);
+}
+
+void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
+ MachineFunction::iterator BBI = NewBefore;
+ getParent()->splice(++BBI, this);
+}
+
+
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ) {
+ Successors.push_back(succ);
+ succ->addPredecessor(this);
+}
+
+void MachineBasicBlock::removeSuccessor(MachineBasicBlock *succ) {
+ succ->removePredecessor(this);
+ succ_iterator I = std::find(Successors.begin(), Successors.end(), succ);
+ assert(I != Successors.end() && "Not a current successor!");
+ Successors.erase(I);
+}
+
+MachineBasicBlock::succ_iterator
+MachineBasicBlock::removeSuccessor(succ_iterator I) {
+ assert(I != Successors.end() && "Not a current successor!");
+ (*I)->removePredecessor(this);
+ return Successors.erase(I);
+}
+
+void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) {
+ Predecessors.push_back(pred);
+}
+
+void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) {
+ std::vector<MachineBasicBlock *>::iterator I =
+ std::find(Predecessors.begin(), Predecessors.end(), pred);
+ assert(I != Predecessors.end() && "Pred is not a predecessor of this block!");
+ Predecessors.erase(I);
+}
+
+void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB)
+{
+ if (this == fromMBB)
+ return;
+
+  for (MachineBasicBlock::succ_iterator iter = fromMBB->succ_begin(),
+       end = fromMBB->succ_end(); iter != end; ++iter) {
+    addSuccessor(*iter);
+  }
+  while (!fromMBB->succ_empty())
+    fromMBB->removeSuccessor(fromMBB->succ_begin());
+}
+
+bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
+ std::vector<MachineBasicBlock *>::const_iterator I =
+ std::find(Successors.begin(), Successors.end(), MBB);
+ return I != Successors.end();
+}
+
+bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
+ MachineFunction::const_iterator I(this);
+ return next(I) == MachineFunction::const_iterator(MBB);
+}
+
+/// removeFromParent - This method unlinks 'this' from the containing function,
+/// and returns it, but does not delete it.
+MachineBasicBlock *MachineBasicBlock::removeFromParent() {
+ assert(getParent() && "Not embedded in a function!");
+ getParent()->remove(this);
+ return this;
+}
+
+
+/// eraseFromParent - This method unlinks 'this' from the containing function,
+/// and deletes it.
+void MachineBasicBlock::eraseFromParent() {
+ assert(getParent() && "Not embedded in a function!");
+ getParent()->erase(this);
+}
+
+
+/// ReplaceUsesOfBlockWith - Given a machine basic block that branched to
+/// 'Old', change the code and CFG so that it branches to 'New' instead.
+void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ assert(Old != New && "Cannot replace self with self!");
+
+ MachineBasicBlock::iterator I = end();
+ while (I != begin()) {
+ --I;
+ if (!I->getDesc().isTerminator()) break;
+
+ // Scan the operands of this machine instruction, replacing any uses of Old
+ // with New.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (I->getOperand(i).isMBB() &&
+ I->getOperand(i).getMBB() == Old)
+ I->getOperand(i).setMBB(New);
+ }
+
+ // Update the successor information.
+ removeSuccessor(Old);
+ addSuccessor(New);
+}
+
+/// CorrectExtraCFGEdges - Various pieces of code can cause excess edges in the
+/// CFG to be inserted. If we have proven that MBB can only branch to DestA and
+/// DestB, remove any other MBB successors from the CFG. DestA and DestB can
+/// be null.
+/// Besides DestA and DestB, retain other edges leading to LandingPads
+/// (currently there can be only one; we don't check or require that here).
+/// Note it is possible that DestA and/or DestB are LandingPads.
+bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA,
+ MachineBasicBlock *DestB,
+ bool isCond) {
+ bool MadeChange = false;
+ bool AddedFallThrough = false;
+
+ MachineFunction::iterator FallThru = next(MachineFunction::iterator(this));
+
+ // If this block ends with a conditional branch that falls through to its
+ // successor, set DestB as the successor.
+ if (isCond) {
+ if (DestB == 0 && FallThru != getParent()->end()) {
+ DestB = FallThru;
+ AddedFallThrough = true;
+ }
+ } else {
+ // If this is an unconditional branch with no explicit dest, it must just be
+ // a fallthrough into DestB.
+ if (DestA == 0 && FallThru != getParent()->end()) {
+ DestA = FallThru;
+ AddedFallThrough = true;
+ }
+ }
+
+ MachineBasicBlock::succ_iterator SI = succ_begin();
+ MachineBasicBlock *OrigDestA = DestA, *OrigDestB = DestB;
+ while (SI != succ_end()) {
+ if (*SI == DestA && DestA == DestB) {
+ DestA = DestB = 0;
+ ++SI;
+ } else if (*SI == DestA) {
+ DestA = 0;
+ ++SI;
+ } else if (*SI == DestB) {
+ DestB = 0;
+ ++SI;
+ } else if ((*SI)->isLandingPad() &&
+ *SI!=OrigDestA && *SI!=OrigDestB) {
+ ++SI;
+ } else {
+ // Otherwise, this is a superfluous edge, remove it.
+ SI = removeSuccessor(SI);
+ MadeChange = true;
+ }
+ }
+ if (!AddedFallThrough) {
+ assert(DestA == 0 && DestB == 0 &&
+ "MachineCFG is missing edges!");
+ } else if (isCond) {
+ assert(DestA == 0 && "MachineCFG is missing edges!");
+ }
+ return MadeChange;
+}
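+// Worked example (illustrative): if this block ends in an unconditional
+// branch to DestA but a stale successor S survives from an earlier
+// transformation, CorrectExtraCFGEdges(DestA, 0, false) deletes the edge to
+// S while preserving the DestA edge and any landing-pad successors.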
diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp
new file mode 100644
index 0000000..37c8601
--- /dev/null
+++ b/lib/CodeGen/MachineDominators.cpp
@@ -0,0 +1,53 @@
+//===- MachineDominators.cpp - Machine Dominator Calculation --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// forward dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+
+using namespace llvm;
+
+TEMPLATE_INSTANTIATION(class DomTreeNodeBase<MachineBasicBlock>);
+TEMPLATE_INSTANTIATION(class DominatorTreeBase<MachineBasicBlock>);
+
+char MachineDominatorTree::ID = 0;
+
+static RegisterPass<MachineDominatorTree>
+E("machinedomtree", "MachineDominator Tree Construction", true);
+
+const PassInfo *const llvm::MachineDominatorsID = &E;
+
+void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
+ DT->recalculate(F);
+
+ return false;
+}
+
+MachineDominatorTree::MachineDominatorTree()
+ : MachineFunctionPass(&ID) {
+ DT = new DominatorTreeBase<MachineBasicBlock>(false);
+}
+
+MachineDominatorTree::~MachineDominatorTree() {
+ DT->releaseMemory();
+ delete DT;
+}
+
+void MachineDominatorTree::releaseMemory() {
+ DT->releaseMemory();
+}
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
new file mode 100644
index 0000000..cacfed1
--- /dev/null
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -0,0 +1,598 @@
+//===-- MachineFunction.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect native machine code information for a function. This allows
+// target-specific information about the generated code to be stored with each
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Config/config.h"
+#include <fstream>
+#include <sstream>
+using namespace llvm;
+
+bool MachineFunctionPass::runOnFunction(Function &F) {
+ // Do not codegen any 'available_externally' functions at all, they have
+ // definitions outside the translation unit.
+ if (F.hasAvailableExternallyLinkage())
+ return false;
+
+ return runOnMachineFunction(MachineFunction::get(&F));
+}
+
+namespace {
+ struct VISIBILITY_HIDDEN Printer : public MachineFunctionPass {
+ static char ID;
+
+ std::ostream *OS;
+ const std::string Banner;
+
+ Printer (std::ostream *os, const std::string &banner)
+ : MachineFunctionPass(&ID), OS(os), Banner(banner) {}
+
+ const char *getPassName() const { return "MachineFunction Printer"; }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) {
+ (*OS) << Banner;
+ MF.print (*OS);
+ return false;
+ }
+ };
+ char Printer::ID = 0;
+}
+
+/// Returns a newly-created MachineFunction Printer pass. The default output
+/// stream is std::cerr; the default banner is empty.
+///
+FunctionPass *llvm::createMachineFunctionPrinterPass(std::ostream *OS,
+ const std::string &Banner){
+ return new Printer(OS, Banner);
+}
+
+namespace {
+ struct VISIBILITY_HIDDEN Deleter : public MachineFunctionPass {
+ static char ID;
+ Deleter() : MachineFunctionPass(&ID) {}
+
+ const char *getPassName() const { return "Machine Code Deleter"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) {
+ // Delete the annotation from the function now.
+ MachineFunction::destruct(MF.getFunction());
+ return true;
+ }
+ };
+ char Deleter::ID = 0;
+}
+
+/// MachineCodeDeletion Pass - This pass deletes all of the machine code for
+/// the current function, which should happen after the function has been
+/// emitted to a .s file or to memory.
+FunctionPass *llvm::createMachineCodeDeleter() {
+ return new Deleter();
+}
+
+
+
+//===---------------------------------------------------------------------===//
+// MachineFunction implementation
+//===---------------------------------------------------------------------===//
+
+void ilist_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
+ MBB->getParent()->DeleteMachineBasicBlock(MBB);
+}
+
+MachineFunction::MachineFunction(const Function *F,
+ const TargetMachine &TM)
+ : Annotation(AnnotationManager::getID("CodeGen::MachineCodeForFunction")),
+ Fn(F), Target(TM) {
+ if (TM.getRegisterInfo())
+ RegInfo = new (Allocator.Allocate<MachineRegisterInfo>())
+ MachineRegisterInfo(*TM.getRegisterInfo());
+ else
+ RegInfo = 0;
+ MFInfo = 0;
+ FrameInfo = new (Allocator.Allocate<MachineFrameInfo>())
+ MachineFrameInfo(*TM.getFrameInfo());
+ ConstantPool = new (Allocator.Allocate<MachineConstantPool>())
+ MachineConstantPool(TM.getTargetData());
+
+ // Set up jump table.
+ const TargetData &TD = *TM.getTargetData();
+ bool IsPic = TM.getRelocationModel() == Reloc::PIC_;
+ unsigned EntrySize = IsPic ? 4 : TD.getPointerSize();
+ unsigned Alignment = IsPic ? TD.getABITypeAlignment(Type::Int32Ty)
+ : TD.getPointerABIAlignment();
+ JumpTableInfo = new (Allocator.Allocate<MachineJumpTableInfo>())
+ MachineJumpTableInfo(EntrySize, Alignment);
+}
+
+MachineFunction::~MachineFunction() {
+ BasicBlocks.clear();
+ InstructionRecycler.clear(Allocator);
+ BasicBlockRecycler.clear(Allocator);
+  if (RegInfo) {
+    RegInfo->~MachineRegisterInfo();
+    Allocator.Deallocate(RegInfo);
+  }
+ if (MFInfo) {
+ MFInfo->~MachineFunctionInfo(); Allocator.Deallocate(MFInfo);
+ }
+ FrameInfo->~MachineFrameInfo(); Allocator.Deallocate(FrameInfo);
+ ConstantPool->~MachineConstantPool(); Allocator.Deallocate(ConstantPool);
+ JumpTableInfo->~MachineJumpTableInfo(); Allocator.Deallocate(JumpTableInfo);
+}
+
+
+/// RenumberBlocks - This discards all of the MachineBasicBlock numbers and
+/// recomputes them. This guarantees that the MBB numbers are sequential,
+/// dense, and match the ordering of the blocks within the function. If a
+/// specific MachineBasicBlock is specified, only that block and those after
+/// it are renumbered.
+void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
+ if (empty()) { MBBNumbering.clear(); return; }
+ MachineFunction::iterator MBBI, E = end();
+ if (MBB == 0)
+ MBBI = begin();
+ else
+ MBBI = MBB;
+
+ // Figure out the block number this should have.
+ unsigned BlockNo = 0;
+ if (MBBI != begin())
+ BlockNo = prior(MBBI)->getNumber()+1;
+
+ for (; MBBI != E; ++MBBI, ++BlockNo) {
+ if (MBBI->getNumber() != (int)BlockNo) {
+ // Remove use of the old number.
+ if (MBBI->getNumber() != -1) {
+ assert(MBBNumbering[MBBI->getNumber()] == &*MBBI &&
+ "MBB number mismatch!");
+ MBBNumbering[MBBI->getNumber()] = 0;
+ }
+
+ // If BlockNo is already taken, set that block's number to -1.
+ if (MBBNumbering[BlockNo])
+ MBBNumbering[BlockNo]->setNumber(-1);
+
+ MBBNumbering[BlockNo] = MBBI;
+ MBBI->setNumber(BlockNo);
+ }
+ }
+
+ // Okay, all the blocks are renumbered. If we have compactified the block
+ // numbering, shrink MBBNumbering now.
+ assert(BlockNo <= MBBNumbering.size() && "Mismatch!");
+ MBBNumbering.resize(BlockNo);
+}
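+// Example (illustrative): erasing block #2 from a function numbered 0..4 and
+// then calling RenumberBlocks() renumbers the remaining blocks 0..3 and
+// shrinks MBBNumbering to match.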
+
+/// CreateMachineInstr - Allocate a new MachineInstr. Use this instead
+/// of `new MachineInstr'.
+///
+MachineInstr *
+MachineFunction::CreateMachineInstr(const TargetInstrDesc &TID,
+ DebugLoc DL, bool NoImp) {
+ return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+ MachineInstr(TID, DL, NoImp);
+}
+
+/// CloneMachineInstr - Create a new MachineInstr which is a copy of the
+/// 'Orig' instruction, identical in all ways except that the instruction
+/// has no parent, prev, or next.
+///
+MachineInstr *
+MachineFunction::CloneMachineInstr(const MachineInstr *Orig) {
+ return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+ MachineInstr(*this, *Orig);
+}
+
+/// DeleteMachineInstr - Delete the given MachineInstr.
+///
+void
+MachineFunction::DeleteMachineInstr(MachineInstr *MI) {
+  // Clear the instruction's memoperands. This must be done manually because
+ // the instruction's parent pointer is now null, so it can't properly
+ // deallocate them on its own.
+ MI->clearMemOperands(*this);
+
+ MI->~MachineInstr();
+ InstructionRecycler.Deallocate(Allocator, MI);
+}
+
+/// CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this
+/// instead of `new MachineBasicBlock'.
+///
+MachineBasicBlock *
+MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) {
+ return new (BasicBlockRecycler.Allocate<MachineBasicBlock>(Allocator))
+ MachineBasicBlock(*this, bb);
+}
+
+/// DeleteMachineBasicBlock - Delete the given MachineBasicBlock.
+///
+void
+MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) {
+ assert(MBB->getParent() == this && "MBB parent mismatch!");
+ MBB->~MachineBasicBlock();
+ BasicBlockRecycler.Deallocate(Allocator, MBB);
+}
+
+void MachineFunction::dump() const {
+ print(*cerr.stream());
+}
+
+void MachineFunction::print(std::ostream &OS) const {
+ OS << "# Machine code for " << Fn->getName () << "():\n";
+
+ // Print Frame Information
+ FrameInfo->print(*this, OS);
+
+ // Print JumpTable Information
+ JumpTableInfo->print(OS);
+
+ // Print Constant Pool
+ {
+ raw_os_ostream OSS(OS);
+ ConstantPool->print(OSS);
+ }
+
+ const TargetRegisterInfo *TRI = getTarget().getRegisterInfo();
+
+ if (RegInfo && !RegInfo->livein_empty()) {
+ OS << "Live Ins:";
+ for (MachineRegisterInfo::livein_iterator
+ I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) {
+ if (TRI)
+ OS << " " << TRI->getName(I->first);
+ else
+ OS << " Reg #" << I->first;
+
+ if (I->second)
+ OS << " in VR#" << I->second << " ";
+ }
+ OS << "\n";
+ }
+ if (RegInfo && !RegInfo->liveout_empty()) {
+ OS << "Live Outs:";
+ for (MachineRegisterInfo::liveout_iterator
+ I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I)
+ if (TRI)
+ OS << " " << TRI->getName(*I);
+ else
+ OS << " Reg #" << *I;
+ OS << "\n";
+ }
+
+ for (const_iterator BB = begin(); BB != end(); ++BB)
+ BB->print(OS);
+
+ OS << "\n# End machine code for " << Fn->getName () << "().\n\n";
+}
+
+/// CFGOnly flag - This is used to control whether or not the CFG graph printer
+/// prints out the contents of basic blocks or not. This is acceptable because
+/// this code is only really used for debugging purposes.
+///
+static bool CFGOnly = false;
+
+namespace llvm {
+ template<>
+ struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
+ static std::string getGraphName(const MachineFunction *F) {
+ return "CFG for '" + F->getFunction()->getName() + "' function";
+ }
+
+ static std::string getNodeLabel(const MachineBasicBlock *Node,
+ const MachineFunction *Graph) {
+ if (CFGOnly && Node->getBasicBlock() &&
+ !Node->getBasicBlock()->getName().empty())
+ return Node->getBasicBlock()->getName() + ":";
+
+ std::ostringstream Out;
+ if (CFGOnly) {
+ Out << Node->getNumber() << ':';
+ return Out.str();
+ }
+
+ Node->print(Out);
+
+ std::string OutStr = Out.str();
+ if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
+
+ // Process string output to make it nicer...
+ for (unsigned i = 0; i != OutStr.length(); ++i)
+ if (OutStr[i] == '\n') { // Left justify
+ OutStr[i] = '\\';
+ OutStr.insert(OutStr.begin()+i+1, 'l');
+ }
+ return OutStr;
+ }
+ };
+}
+
+void MachineFunction::viewCFG() const
+{
+#ifndef NDEBUG
+ ViewGraph(this, "mf" + getFunction()->getName());
+#else
+ cerr << "SelectionDAG::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+void MachineFunction::viewCFGOnly() const
+{
+ CFGOnly = true;
+ viewCFG();
+ CFGOnly = false;
+}
+
+// The next two methods are used to construct and to retrieve
+// the MachineCodeForFunction object for the given function.
+// construct() -- Allocates and initializes for a given function and target
+// get() -- Returns a handle to the object.
+// This should not be called before "construct()"
+// for a given Function.
+//
+MachineFunction&
+MachineFunction::construct(const Function *Fn, const TargetMachine &Tar)
+{
+ AnnotationID MF_AID =
+ AnnotationManager::getID("CodeGen::MachineCodeForFunction");
+ assert(Fn->getAnnotation(MF_AID) == 0 &&
+ "Object already exists for this function!");
+ MachineFunction* mcInfo = new MachineFunction(Fn, Tar);
+ Fn->addAnnotation(mcInfo);
+ return *mcInfo;
+}
+
+void MachineFunction::destruct(const Function *Fn) {
+ AnnotationID MF_AID =
+ AnnotationManager::getID("CodeGen::MachineCodeForFunction");
+ bool Deleted = Fn->deleteAnnotation(MF_AID);
+ assert(Deleted && "Machine code did not exist for function!");
+ Deleted = Deleted; // silence warning when no assertions.
+}
+
+MachineFunction& MachineFunction::get(const Function *F)
+{
+ AnnotationID MF_AID =
+ AnnotationManager::getID("CodeGen::MachineCodeForFunction");
+ MachineFunction *mc = (MachineFunction*)F->getAnnotation(MF_AID);
+ assert(mc && "Call construct() method first to allocate the object");
+ return *mc;
+}
+
+/// addLiveIn - Add the specified physical register as a live-in value and
+/// create a corresponding virtual register for it.
+unsigned MachineFunction::addLiveIn(unsigned PReg,
+ const TargetRegisterClass *RC) {
+ assert(RC->contains(PReg) && "Not the correct regclass!");
+ unsigned VReg = getRegInfo().createVirtualRegister(RC);
+ getRegInfo().addLiveIn(PReg, VReg);
+ return VReg;
+}
+
+/// getOrCreateDebugLocID - Look up the DebugLocTuple index with the given
+/// source file, line, and column. If none currently exists, create a new
+/// DebugLocTuple, and insert it into the DebugIdMap.
+unsigned MachineFunction::getOrCreateDebugLocID(GlobalVariable *CompileUnit,
+ unsigned Line, unsigned Col) {
+ DebugLocTuple Tuple(CompileUnit, Line, Col);
+ DenseMap<DebugLocTuple, unsigned>::iterator II
+ = DebugLocInfo.DebugIdMap.find(Tuple);
+ if (II != DebugLocInfo.DebugIdMap.end())
+ return II->second;
+ // Add a new tuple.
+ unsigned Id = DebugLocInfo.DebugLocations.size();
+ DebugLocInfo.DebugLocations.push_back(Tuple);
+ DebugLocInfo.DebugIdMap[Tuple] = Id;
+ return Id;
+}
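+// Illustrative use (hypothetical values): repeated calls with the same
+// (CompileUnit, Line, Col) tuple return the same index, so a DebugLoc can be
+// stored as a single unsigned and compared without touching the tuple map.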
+
+/// getDebugLocTuple - Get the DebugLocTuple for a given DebugLoc object.
+DebugLocTuple MachineFunction::getDebugLocTuple(DebugLoc DL) const {
+ unsigned Idx = DL.getIndex();
+ assert(Idx < DebugLocInfo.DebugLocations.size() &&
+ "Invalid index into debug locations!");
+ return DebugLocInfo.DebugLocations[Idx];
+}
+
+//===----------------------------------------------------------------------===//
+// MachineFrameInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// CreateFixedObject - Create a new object at a fixed location on the stack.
+/// All fixed objects should be created before other objects are created for
+/// efficiency. By default, fixed objects are immutable. This returns an
+/// index with a negative value.
+///
+int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
+ bool Immutable) {
+ assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
+ Objects.insert(Objects.begin(), StackObject(Size, 1, SPOffset, Immutable));
+ return -++NumFixedObjects;
+}
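+// For example (following the code above): the first fixed object created
+// receives index -1, the second -2, and so on; non-fixed stack objects use
+// indices >= 0.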
+
+
+void MachineFrameInfo::print(const MachineFunction &MF, std::ostream &OS) const{
+ const TargetFrameInfo *FI = MF.getTarget().getFrameInfo();
+ int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
+
+ for (unsigned i = 0, e = Objects.size(); i != e; ++i) {
+ const StackObject &SO = Objects[i];
+ OS << " <fi#" << (int)(i-NumFixedObjects) << ">: ";
+ if (SO.Size == ~0ULL) {
+ OS << "dead\n";
+ continue;
+ }
+ if (SO.Size == 0)
+ OS << "variable sized";
+ else
+ OS << "size is " << SO.Size << " byte" << (SO.Size != 1 ? "s," : ",");
+ OS << " alignment is " << SO.Alignment << " byte"
+ << (SO.Alignment != 1 ? "s," : ",");
+
+ if (i < NumFixedObjects)
+ OS << " fixed";
+ if (i < NumFixedObjects || SO.SPOffset != -1) {
+ int64_t Off = SO.SPOffset - ValOffset;
+ OS << " at location [SP";
+ if (Off > 0)
+ OS << "+" << Off;
+ else if (Off < 0)
+ OS << Off;
+ OS << "]";
+ }
+ OS << "\n";
+ }
+
+ if (HasVarSizedObjects)
+ OS << " Stack frame contains variable sized objects\n";
+}
+
+void MachineFrameInfo::dump(const MachineFunction &MF) const {
+ print(MF, *cerr.stream());
+}
+
+
+//===----------------------------------------------------------------------===//
+// MachineJumpTableInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// getJumpTableIndex - Create a new jump table entry in the jump table info
+/// or return an existing one.
+///
+unsigned MachineJumpTableInfo::getJumpTableIndex(
+ const std::vector<MachineBasicBlock*> &DestBBs) {
+ assert(!DestBBs.empty() && "Cannot create an empty jump table!");
+ for (unsigned i = 0, e = JumpTables.size(); i != e; ++i)
+ if (JumpTables[i].MBBs == DestBBs)
+ return i;
+
+ JumpTables.push_back(MachineJumpTableEntry(DestBBs));
+ return JumpTables.size()-1;
+}
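+// Example (illustrative): two calls with an identical DestBBs vector return
+// the same jump table index; a vector with any different destination appends
+// a new MachineJumpTableEntry and returns its index.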
+
+/// ReplaceMBBInJumpTables - If Old is the target of any jump tables, update
+/// the jump tables to branch to New instead.
+bool
+MachineJumpTableInfo::ReplaceMBBInJumpTables(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ assert(Old != New && "Not making a change?");
+ bool MadeChange = false;
+ for (size_t i = 0, e = JumpTables.size(); i != e; ++i) {
+ MachineJumpTableEntry &JTE = JumpTables[i];
+ for (size_t j = 0, e = JTE.MBBs.size(); j != e; ++j)
+ if (JTE.MBBs[j] == Old) {
+ JTE.MBBs[j] = New;
+ MadeChange = true;
+ }
+ }
+ return MadeChange;
+}
+
+void MachineJumpTableInfo::print(std::ostream &OS) const {
+ // FIXME: this is lame, maybe we could print out the MBB numbers or something
+ // like {1, 2, 4, 5, 3, 0}
+ for (unsigned i = 0, e = JumpTables.size(); i != e; ++i) {
+ OS << " <jt#" << i << "> has " << JumpTables[i].MBBs.size()
+ << " entries\n";
+ }
+}
+
+void MachineJumpTableInfo::dump() const { print(*cerr.stream()); }
+
+
+//===----------------------------------------------------------------------===//
+// MachineConstantPool implementation
+//===----------------------------------------------------------------------===//
+
+const Type *MachineConstantPoolEntry::getType() const {
+ if (isMachineConstantPoolEntry())
+ return Val.MachineCPVal->getType();
+ return Val.ConstVal->getType();
+}
+
+MachineConstantPool::~MachineConstantPool() {
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+ if (Constants[i].isMachineConstantPoolEntry())
+ delete Constants[i].Val.MachineCPVal;
+}
+
+/// getConstantPoolIndex - Create a new entry in the constant pool or return
+/// an existing one.  User must specify the minimum required alignment for
+/// the object, in bytes, as a power of two.
+///
+unsigned MachineConstantPool::getConstantPoolIndex(Constant *C,
+ unsigned Alignment) {
+ assert(Alignment && "Alignment must be specified!");
+ if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+ // Check to see if we already have this constant.
+ //
+ // FIXME, this could be made much more efficient for large constant pools.
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+ if (Constants[i].Val.ConstVal == C &&
+ (Constants[i].getAlignment() & (Alignment - 1)) == 0)
+ return i;
+
+ Constants.push_back(MachineConstantPoolEntry(C, Alignment));
+ return Constants.size()-1;
+}
+
+unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V,
+ unsigned Alignment) {
+ assert(Alignment && "Alignment must be specified!");
+ if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+ // Check to see if we already have this constant.
+ //
+ // FIXME, this could be made much more efficient for large constant pools.
+ int Idx = V->getExistingMachineCPValue(this, Alignment);
+ if (Idx != -1)
+ return (unsigned)Idx;
+
+ Constants.push_back(MachineConstantPoolEntry(V, Alignment));
+ return Constants.size()-1;
+}
+
+void MachineConstantPool::print(raw_ostream &OS) const {
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ OS << " <cp#" << i << "> is";
+ if (Constants[i].isMachineConstantPoolEntry())
+ Constants[i].Val.MachineCPVal->print(OS);
+ else
+ OS << *(Value*)Constants[i].Val.ConstVal;
+ OS << " , alignment=" << Constants[i].getAlignment();
+ OS << "\n";
+ }
+}
+
+void MachineConstantPool::dump() const { print(errs()); }
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
new file mode 100644
index 0000000..b8c8563
--- /dev/null
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -0,0 +1,1105 @@
+//===-- lib/CodeGen/MachineInstr.cpp --------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Methods common to all machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Constants.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Value.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetInstrDesc.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/FoldingSet.h"
+#include <ostream>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineOperand Implementation
+//===----------------------------------------------------------------------===//
+
+/// AddRegOperandToRegInfo - Add this register operand to the specified
+/// MachineRegisterInfo. If it is null, then the next/prev fields should be
+/// explicitly nulled out.
+void MachineOperand::AddRegOperandToRegInfo(MachineRegisterInfo *RegInfo) {
+ assert(isReg() && "Can only add reg operand to use lists");
+
+  // If the reginfo pointer is null, just explicitly null out our next/prev
+ // pointers, to ensure they are not garbage.
+ if (RegInfo == 0) {
+ Contents.Reg.Prev = 0;
+ Contents.Reg.Next = 0;
+ return;
+ }
+
+  // Otherwise, add this operand to the head of the register's use/def list.
+ MachineOperand **Head = &RegInfo->getRegUseDefListHead(getReg());
+
+ // For SSA values, we prefer to keep the definition at the start of the list.
+  // We do this by skipping over the definition if it is at the head of the
+ // list.
+ if (*Head && (*Head)->isDef())
+ Head = &(*Head)->Contents.Reg.Next;
+
+ Contents.Reg.Next = *Head;
+ if (Contents.Reg.Next) {
+ assert(getReg() == Contents.Reg.Next->getReg() &&
+ "Different regs on the same list!");
+ Contents.Reg.Next->Contents.Reg.Prev = &Contents.Reg.Next;
+ }
+
+ Contents.Reg.Prev = Head;
+ *Head = this;
+}
+
+/// RemoveRegOperandFromRegInfo - Remove this register operand from the
+/// MachineRegisterInfo it is linked with.
+void MachineOperand::RemoveRegOperandFromRegInfo() {
+ assert(isOnRegUseList() && "Reg operand is not on a use list");
+ // Unlink this from the doubly linked list of operands.
+ MachineOperand *NextOp = Contents.Reg.Next;
+ *Contents.Reg.Prev = NextOp;
+ if (NextOp) {
+ assert(NextOp->getReg() == getReg() && "Corrupt reg use/def chain!");
+ NextOp->Contents.Reg.Prev = Contents.Reg.Prev;
+ }
+ Contents.Reg.Prev = 0;
+ Contents.Reg.Next = 0;
+}
+
+void MachineOperand::setReg(unsigned Reg) {
+ if (getReg() == Reg) return; // No change.
+
+ // Otherwise, we have to change the register. If this operand is embedded
+ // into a machine function, we need to update the old and new register's
+ // use/def lists.
+ if (MachineInstr *MI = getParent())
+ if (MachineBasicBlock *MBB = MI->getParent())
+ if (MachineFunction *MF = MBB->getParent()) {
+ RemoveRegOperandFromRegInfo();
+ Contents.Reg.RegNo = Reg;
+ AddRegOperandToRegInfo(&MF->getRegInfo());
+ return;
+ }
+
+ // Otherwise, just change the register, no problem. :)
+ Contents.Reg.RegNo = Reg;
+}
+
+/// ChangeToImmediate - Replace this operand with a new immediate operand of
+/// the specified value. If an operand is known to be an immediate already,
+/// the setImm method should be used.
+void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
+ // If this operand is currently a register operand, and if this is in a
+ // function, deregister the operand from the register's use/def list.
+ if (isReg() && getParent() && getParent()->getParent() &&
+ getParent()->getParent()->getParent())
+ RemoveRegOperandFromRegInfo();
+
+ OpKind = MO_Immediate;
+ Contents.ImmVal = ImmVal;
+}
+
+/// ChangeToRegister - Replace this operand with a new register operand of
+/// the specified value. If an operand is known to be a register already,
+/// the setReg method should be used.
+void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
+ bool isKill, bool isDead) {
+ // If this operand is already a register operand, use setReg to update the
+ // register's use/def lists.
+ if (isReg()) {
+ assert(!isEarlyClobber());
+ setReg(Reg);
+ } else {
+ // Otherwise, change this to a register and set the reg#.
+ OpKind = MO_Register;
+ Contents.Reg.RegNo = Reg;
+
+ // If this operand is embedded in a function, add the operand to the
+ // register's use/def list.
+ if (MachineInstr *MI = getParent())
+ if (MachineBasicBlock *MBB = MI->getParent())
+ if (MachineFunction *MF = MBB->getParent())
+ AddRegOperandToRegInfo(&MF->getRegInfo());
+ }
+
+ IsDef = isDef;
+ IsImp = isImp;
+ IsKill = isKill;
+ IsDead = isDead;
+ IsEarlyClobber = false;
+ SubReg = 0;
+}
+
+/// isIdenticalTo - Return true if this operand is identical to the specified
+/// operand.
+bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
+ if (getType() != Other.getType()) return false;
+
+ switch (getType()) {
+ default: assert(0 && "Unrecognized operand type");
+ case MachineOperand::MO_Register:
+ return getReg() == Other.getReg() && isDef() == Other.isDef() &&
+ getSubReg() == Other.getSubReg();
+ case MachineOperand::MO_Immediate:
+ return getImm() == Other.getImm();
+ case MachineOperand::MO_FPImmediate:
+ return getFPImm() == Other.getFPImm();
+ case MachineOperand::MO_MachineBasicBlock:
+ return getMBB() == Other.getMBB();
+ case MachineOperand::MO_FrameIndex:
+ return getIndex() == Other.getIndex();
+ case MachineOperand::MO_ConstantPoolIndex:
+ return getIndex() == Other.getIndex() && getOffset() == Other.getOffset();
+ case MachineOperand::MO_JumpTableIndex:
+ return getIndex() == Other.getIndex();
+ case MachineOperand::MO_GlobalAddress:
+ return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset();
+ case MachineOperand::MO_ExternalSymbol:
+ return !strcmp(getSymbolName(), Other.getSymbolName()) &&
+ getOffset() == Other.getOffset();
+ }
+}
+
+/// print - Print the specified machine operand.
+///
+void MachineOperand::print(std::ostream &OS, const TargetMachine *TM) const {
+ raw_os_ostream RawOS(OS);
+ print(RawOS, TM);
+}
+
+void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
+ switch (getType()) {
+ case MachineOperand::MO_Register:
+ if (getReg() == 0 || TargetRegisterInfo::isVirtualRegister(getReg())) {
+ OS << "%reg" << getReg();
+ } else {
+ // If the instruction is embedded into a basic block, we can find the
+ // target info for the instruction.
+ if (TM == 0)
+ if (const MachineInstr *MI = getParent())
+ if (const MachineBasicBlock *MBB = MI->getParent())
+ if (const MachineFunction *MF = MBB->getParent())
+ TM = &MF->getTarget();
+
+ if (TM)
+ OS << "%" << TM->getRegisterInfo()->get(getReg()).Name;
+ else
+ OS << "%mreg" << getReg();
+ }
+
+ if (getSubReg() != 0) {
+ OS << ":" << getSubReg();
+ }
+
+ if (isDef() || isKill() || isDead() || isImplicit() || isEarlyClobber()) {
+ OS << "<";
+ bool NeedComma = false;
+ if (isImplicit()) {
+ if (NeedComma) OS << ",";
+ OS << (isDef() ? "imp-def" : "imp-use");
+ NeedComma = true;
+ } else if (isDef()) {
+ if (NeedComma) OS << ",";
+ if (isEarlyClobber())
+ OS << "earlyclobber,";
+ OS << "def";
+ NeedComma = true;
+ }
+ if (isKill() || isDead()) {
+ if (NeedComma) OS << ",";
+ if (isKill()) OS << "kill";
+ if (isDead()) OS << "dead";
+ }
+ OS << ">";
+ }
+ break;
+ case MachineOperand::MO_Immediate:
+ OS << getImm();
+ break;
+ case MachineOperand::MO_FPImmediate:
+ if (getFPImm()->getType() == Type::FloatTy) {
+ OS << getFPImm()->getValueAPF().convertToFloat();
+ } else {
+ OS << getFPImm()->getValueAPF().convertToDouble();
+ }
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ OS << "mbb<"
+ << ((Value*)getMBB()->getBasicBlock())->getName()
+ << "," << (void*)getMBB() << ">";
+ break;
+ case MachineOperand::MO_FrameIndex:
+ OS << "<fi#" << getIndex() << ">";
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ OS << "<cp#" << getIndex();
+ if (getOffset()) OS << "+" << getOffset();
+ OS << ">";
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ OS << "<jt#" << getIndex() << ">";
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ OS << "<ga:" << ((Value*)getGlobal())->getName();
+ if (getOffset()) OS << "+" << getOffset();
+ OS << ">";
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ OS << "<es:" << getSymbolName();
+ if (getOffset()) OS << "+" << getOffset();
+ OS << ">";
+ break;
+ default:
+ assert(0 && "Unrecognized operand type");
+ }
+}
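+// Sample renderings (illustrative): a virtual register def prints as
+// "%reg1024<def>", a frame index as "<fi#0>", and a global address with a
+// byte offset as "<ga:foo+4>".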
+
+//===----------------------------------------------------------------------===//
+// MachineMemOperand Implementation
+//===----------------------------------------------------------------------===//
+
+MachineMemOperand::MachineMemOperand(const Value *v, unsigned int f,
+ int64_t o, uint64_t s, unsigned int a)
+ : Offset(o), Size(s), V(v),
+ Flags((f & 7) | ((Log2_32(a) + 1) << 3)) {
+ assert(isPowerOf2_32(a) && "Alignment is not a power of 2!");
+ assert((isLoad() || isStore()) && "Not a load/store!");
+}
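+// Worked example (illustrative, assuming the load flag occupies bit 0): for
+// a 4-byte-aligned load, f = 1 and a = 4, so Flags = (1 & 7) |
+// ((Log2_32(4) + 1) << 3) = 1 | (3 << 3) = 25; the alignment is recovered
+// later as 1 << ((Flags >> 3) - 1) = 4.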
+
+/// Profile - Gather unique data for the object.
+///
+void MachineMemOperand::Profile(FoldingSetNodeID &ID) const {
+ ID.AddInteger(Offset);
+ ID.AddInteger(Size);
+ ID.AddPointer(V);
+ ID.AddInteger(Flags);
+}
+
+//===----------------------------------------------------------------------===//
+// MachineInstr Implementation
+//===----------------------------------------------------------------------===//
+
+/// MachineInstr ctor - This constructor creates a dummy MachineInstr with
+/// TID NULL and no operands.
+MachineInstr::MachineInstr()
+ : TID(0), NumImplicitOps(0), Parent(0), debugLoc(DebugLoc::getUnknownLoc()) {
+ // Make sure that we get added to a machine basicblock
+ LeakDetector::addGarbageObject(this);
+}
+
+void MachineInstr::addImplicitDefUseOperands() {
+ if (TID->ImplicitDefs)
+ for (const unsigned *ImpDefs = TID->ImplicitDefs; *ImpDefs; ++ImpDefs)
+ addOperand(MachineOperand::CreateReg(*ImpDefs, true, true));
+ if (TID->ImplicitUses)
+ for (const unsigned *ImpUses = TID->ImplicitUses; *ImpUses; ++ImpUses)
+ addOperand(MachineOperand::CreateReg(*ImpUses, false, true));
+}
+
+/// MachineInstr ctor - This constructor creates a MachineInstr and adds the
+/// implicit operands. It reserves space for the number of operands specified
+/// by the TargetInstrDesc, plus any implicit def/use operands.
+MachineInstr::MachineInstr(const TargetInstrDesc &tid, bool NoImp)
+ : TID(&tid), NumImplicitOps(0), Parent(0),
+ debugLoc(DebugLoc::getUnknownLoc()) {
+ if (!NoImp && TID->getImplicitDefs())
+ for (const unsigned *ImpDefs = TID->getImplicitDefs(); *ImpDefs; ++ImpDefs)
+ NumImplicitOps++;
+ if (!NoImp && TID->getImplicitUses())
+ for (const unsigned *ImpUses = TID->getImplicitUses(); *ImpUses; ++ImpUses)
+ NumImplicitOps++;
+ Operands.reserve(NumImplicitOps + TID->getNumOperands());
+ if (!NoImp)
+ addImplicitDefUseOperands();
+ // Make sure that we get added to a machine basicblock
+ LeakDetector::addGarbageObject(this);
+}
+
+/// MachineInstr ctor - As above, but with a DebugLoc.
+MachineInstr::MachineInstr(const TargetInstrDesc &tid, const DebugLoc dl,
+ bool NoImp)
+ : TID(&tid), NumImplicitOps(0), Parent(0), debugLoc(dl) {
+ if (!NoImp && TID->getImplicitDefs())
+ for (const unsigned *ImpDefs = TID->getImplicitDefs(); *ImpDefs; ++ImpDefs)
+ NumImplicitOps++;
+ if (!NoImp && TID->getImplicitUses())
+ for (const unsigned *ImpUses = TID->getImplicitUses(); *ImpUses; ++ImpUses)
+ NumImplicitOps++;
+ Operands.reserve(NumImplicitOps + TID->getNumOperands());
+ if (!NoImp)
+ addImplicitDefUseOperands();
+ // Make sure that we get added to a machine basicblock
+ LeakDetector::addGarbageObject(this);
+}
+
+/// MachineInstr ctor - Works exactly the same as the ctor two above, except
+/// that the MachineInstr is created and added to the end of the specified
+/// basic block.
+///
+MachineInstr::MachineInstr(MachineBasicBlock *MBB, const TargetInstrDesc &tid)
+ : TID(&tid), NumImplicitOps(0), Parent(0),
+ debugLoc(DebugLoc::getUnknownLoc()) {
+ assert(MBB && "Cannot use inserting ctor with null basic block!");
+ if (TID->ImplicitDefs)
+ for (const unsigned *ImpDefs = TID->getImplicitDefs(); *ImpDefs; ++ImpDefs)
+ NumImplicitOps++;
+ if (TID->ImplicitUses)
+ for (const unsigned *ImpUses = TID->getImplicitUses(); *ImpUses; ++ImpUses)
+ NumImplicitOps++;
+ Operands.reserve(NumImplicitOps + TID->getNumOperands());
+ addImplicitDefUseOperands();
+ // Make sure that we get added to a machine basicblock
+ LeakDetector::addGarbageObject(this);
+ MBB->push_back(this); // Add instruction to end of basic block!
+}
+
+/// MachineInstr ctor - As above, but with a DebugLoc.
+///
+MachineInstr::MachineInstr(MachineBasicBlock *MBB, const DebugLoc dl,
+ const TargetInstrDesc &tid)
+ : TID(&tid), NumImplicitOps(0), Parent(0), debugLoc(dl) {
+ assert(MBB && "Cannot use inserting ctor with null basic block!");
+ if (TID->ImplicitDefs)
+ for (const unsigned *ImpDefs = TID->getImplicitDefs(); *ImpDefs; ++ImpDefs)
+ NumImplicitOps++;
+ if (TID->ImplicitUses)
+ for (const unsigned *ImpUses = TID->getImplicitUses(); *ImpUses; ++ImpUses)
+ NumImplicitOps++;
+ Operands.reserve(NumImplicitOps + TID->getNumOperands());
+ addImplicitDefUseOperands();
+ // Make sure that we get added to a machine basicblock
+ LeakDetector::addGarbageObject(this);
+ MBB->push_back(this); // Add instruction to end of basic block!
+}
+
+/// MachineInstr ctor - Copies MachineInstr arg exactly
+///
+MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
+ : TID(&MI.getDesc()), NumImplicitOps(0), Parent(0),
+ debugLoc(MI.getDebugLoc()) {
+ Operands.reserve(MI.getNumOperands());
+
+ // Add operands
+ for (unsigned i = 0; i != MI.getNumOperands(); ++i)
+ addOperand(MI.getOperand(i));
+ NumImplicitOps = MI.NumImplicitOps;
+
+ // Add memory operands.
+ for (std::list<MachineMemOperand>::const_iterator i = MI.memoperands_begin(),
+ j = MI.memoperands_end(); i != j; ++i)
+ addMemOperand(MF, *i);
+
+ // Set parent to null.
+ Parent = 0;
+
+ LeakDetector::addGarbageObject(this);
+}
+
+MachineInstr::~MachineInstr() {
+ LeakDetector::removeGarbageObject(this);
+ assert(MemOperands.empty() &&
+ "MachineInstr being deleted with live memoperands!");
+#ifndef NDEBUG
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ assert(Operands[i].ParentMI == this && "ParentMI mismatch!");
+ assert((!Operands[i].isReg() || !Operands[i].isOnRegUseList()) &&
+ "Reg operand def/use list corrupted");
+ }
+#endif
+}
+
+/// getRegInfo - If this instruction is embedded into a MachineFunction,
+/// return the MachineRegisterInfo object for the current function, otherwise
+/// return null.
+MachineRegisterInfo *MachineInstr::getRegInfo() {
+ if (MachineBasicBlock *MBB = getParent())
+ return &MBB->getParent()->getRegInfo();
+ return 0;
+}
+
+/// RemoveRegOperandsFromUseLists - Unlink all of the register operands in
+/// this instruction from their respective use lists. This requires that the
+/// operands already be on their use lists.
+void MachineInstr::RemoveRegOperandsFromUseLists() {
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ if (Operands[i].isReg())
+ Operands[i].RemoveRegOperandFromRegInfo();
+ }
+}
+
+/// AddRegOperandsToUseLists - Add all of the register operands in
+/// this instruction to their respective use lists. This requires that the
+/// operands not be on their use lists yet.
+void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &RegInfo) {
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ if (Operands[i].isReg())
+ Operands[i].AddRegOperandToRegInfo(&RegInfo);
+ }
+}
+
+
+/// addOperand - Add the specified operand to the instruction. If it is an
+/// implicit operand, it is added to the end of the operand list. If it is
+/// an explicit operand it is added at the end of the explicit operand list
+/// (before the first implicit operand).
+void MachineInstr::addOperand(const MachineOperand &Op) {
+ bool isImpReg = Op.isReg() && Op.isImplicit();
+ assert((isImpReg || !OperandsComplete()) &&
+ "Trying to add an operand to a machine instr that is already done!");
+
+ MachineRegisterInfo *RegInfo = getRegInfo();
+
+ // If we are adding the operand to the end of the list, our job is simpler.
+ // This is true most of the time, so this is a reasonable optimization.
+ if (isImpReg || NumImplicitOps == 0) {
+ // We can only do this optimization if we know that the operand list won't
+ // reallocate.
+ if (Operands.empty() || Operands.size()+1 <= Operands.capacity()) {
+ Operands.push_back(Op);
+
+ // Set the parent of the operand.
+ Operands.back().ParentMI = this;
+
+ // If the operand is a register, update the operand's use list.
+ if (Op.isReg())
+ Operands.back().AddRegOperandToRegInfo(RegInfo);
+ return;
+ }
+ }
+
+ // Otherwise, we have to insert a real operand before any implicit ones.
+ unsigned OpNo = Operands.size()-NumImplicitOps;
+
+ // If this instruction isn't embedded into a function, then we don't need to
+ // update any operand lists.
+ if (RegInfo == 0) {
+ // Simple insertion, no reginfo update needed for other register operands.
+ Operands.insert(Operands.begin()+OpNo, Op);
+ Operands[OpNo].ParentMI = this;
+
+ // Do explicitly set the reginfo for this operand though, to ensure the
+ // next/prev fields are properly nulled out.
+ if (Operands[OpNo].isReg())
+ Operands[OpNo].AddRegOperandToRegInfo(0);
+
+ } else if (Operands.size()+1 <= Operands.capacity()) {
+ // Otherwise, we have to remove register operands from their register use
+ // list, add the operand, then add the register operands back to their use
+ // list. This also must handle the case when the operand list reallocates
+ // to somewhere else.
+
+ // If insertion of this operand won't cause reallocation of the operand
+ // list, just remove the implicit operands, add the operand, then re-add all
+ // the rest of the operands.
+ for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
+ assert(Operands[i].isReg() && "Should only be an implicit reg!");
+ Operands[i].RemoveRegOperandFromRegInfo();
+ }
+
+ // Add the operand. If it is a register, add it to the reg list.
+ Operands.insert(Operands.begin()+OpNo, Op);
+ Operands[OpNo].ParentMI = this;
+
+ if (Operands[OpNo].isReg())
+ Operands[OpNo].AddRegOperandToRegInfo(RegInfo);
+
+ // Re-add all the implicit ops.
+ for (unsigned i = OpNo+1, e = Operands.size(); i != e; ++i) {
+ assert(Operands[i].isReg() && "Should only be an implicit reg!");
+ Operands[i].AddRegOperandToRegInfo(RegInfo);
+ }
+ } else {
+ // Otherwise, we will be reallocating the operand list. Remove all reg
+ // operands from their list, then readd them after the operand list is
+ // reallocated.
+ RemoveRegOperandsFromUseLists();
+
+ Operands.insert(Operands.begin()+OpNo, Op);
+ Operands[OpNo].ParentMI = this;
+
+ // Re-add all the operands.
+ AddRegOperandsToUseLists(*RegInfo);
+ }
+}
+
+/// RemoveOperand - Erase an operand from an instruction, leaving it with one
+/// fewer operand than it started with.
+///
+void MachineInstr::RemoveOperand(unsigned OpNo) {
+ assert(OpNo < Operands.size() && "Invalid operand number");
+
+ // Special case removing the last one.
+ if (OpNo == Operands.size()-1) {
+ // If needed, remove from the reg def/use list.
+ if (Operands.back().isReg() && Operands.back().isOnRegUseList())
+ Operands.back().RemoveRegOperandFromRegInfo();
+
+ Operands.pop_back();
+ return;
+ }
+
+ // Otherwise, we are removing an interior operand. If we have reginfo to
+ // update, remove all operands that will be shifted down from their reg lists,
+ // move everything down, then re-add them.
+ MachineRegisterInfo *RegInfo = getRegInfo();
+ if (RegInfo) {
+ for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
+ if (Operands[i].isReg())
+ Operands[i].RemoveRegOperandFromRegInfo();
+ }
+ }
+
+ Operands.erase(Operands.begin()+OpNo);
+
+ if (RegInfo) {
+ for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
+ if (Operands[i].isReg())
+ Operands[i].AddRegOperandToRegInfo(RegInfo);
+ }
+ }
+}
+
+/// addMemOperand - Add a MachineMemOperand to the machine instruction,
+/// referencing arbitrary storage.
+void MachineInstr::addMemOperand(MachineFunction &MF,
+ const MachineMemOperand &MO) {
+ MemOperands.push_back(MO);
+}
+
+/// clearMemOperands - Erase all of this MachineInstr's MachineMemOperands.
+void MachineInstr::clearMemOperands(MachineFunction &MF) {
+ MemOperands.clear();
+}
+
+
+/// removeFromParent - This method unlinks 'this' from the containing basic
+/// block, and returns it, but does not delete it.
+MachineInstr *MachineInstr::removeFromParent() {
+ assert(getParent() && "Not embedded in a basic block!");
+ getParent()->remove(this);
+ return this;
+}
+
+
+/// eraseFromParent - This method unlinks 'this' from the containing basic
+/// block, and deletes it.
+void MachineInstr::eraseFromParent() {
+ assert(getParent() && "Not embedded in a basic block!");
+ getParent()->erase(this);
+}
+
+
+/// OperandsComplete - Return true if it's illegal to add a new operand.
+///
+bool MachineInstr::OperandsComplete() const {
+ unsigned short NumOperands = TID->getNumOperands();
+ if (!TID->isVariadic() && getNumOperands()-NumImplicitOps >= NumOperands)
+ return true; // We already have all of this instruction's operands!
+ return false;
+}
+
+/// getNumExplicitOperands - Returns the number of non-implicit operands.
+///
+unsigned MachineInstr::getNumExplicitOperands() const {
+ unsigned NumOperands = TID->getNumOperands();
+ if (!TID->isVariadic())
+ return NumOperands;
+
+ for (unsigned i = NumOperands, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isImplicit())
+ NumOperands++;
+ }
+ return NumOperands;
+}
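+// For illustration (a hypothetical variadic instruction): with 2 fixed
+// operands in the descriptor, 3 additional explicit register arguments, and 2
+// trailing implicit-use operands, the loop above counts 2 + 3 = 5 explicit
+// operands.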
+
+
+/// isLabel - Returns true if the MachineInstr represents a label.
+///
+bool MachineInstr::isLabel() const {
+ return getOpcode() == TargetInstrInfo::DBG_LABEL ||
+ getOpcode() == TargetInstrInfo::EH_LABEL ||
+ getOpcode() == TargetInstrInfo::GC_LABEL;
+}
+
+/// isDebugLabel - Returns true if the MachineInstr represents a debug label.
+///
+bool MachineInstr::isDebugLabel() const {
+ return getOpcode() == TargetInstrInfo::DBG_LABEL;
+}
+
+/// findRegisterUseOperandIdx() - Returns the operand index that is a use of
+/// the specified register or -1 if it is not found. It further tightens the
+/// search criteria to a use that kills the register if isKill is true.
+int MachineInstr::findRegisterUseOperandIdx(unsigned Reg, bool isKill,
+ const TargetRegisterInfo *TRI) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MOReg == Reg ||
+ (TRI &&
+ TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+ TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ TRI->isSubRegister(MOReg, Reg)))
+ if (!isKill || MO.isKill())
+ return i;
+ }
+ return -1;
+}
+
+/// findRegisterDefOperandIdx() - Returns the operand index that is a def of
+/// the specified register or -1 if it is not found. If isDead is true, defs
+/// that are not dead are skipped. If TargetRegisterInfo is non-null, then it
+/// also checks if there is a def of a super-register.
+int MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead,
+ const TargetRegisterInfo *TRI) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (MOReg == Reg ||
+ (TRI &&
+ TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+ TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ TRI->isSubRegister(MOReg, Reg)))
+ if (!isDead || MO.isDead())
+ return i;
+ }
+ return -1;
+}
+
+/// findFirstPredOperandIdx() - Find the index of the first operand in the
+/// operand list that is used to represent the predicate. It returns -1 if
+/// none is found.
+int MachineInstr::findFirstPredOperandIdx() const {
+ const TargetInstrDesc &TID = getDesc();
+ if (TID.isPredicable()) {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (TID.OpInfo[i].isPredicate())
+ return i;
+ }
+
+ return -1;
+}
+
+/// isRegTiedToUseOperand - Given the index of a register def operand,
+/// check if the register def is tied to a source operand, due to either
+/// two-address elimination or inline assembly constraints. Returns the
+/// first tied use operand index by reference if UseOpIdx is not null.
+bool MachineInstr::
+isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const {
+ if (getOpcode() == TargetInstrInfo::INLINEASM) {
+ assert(DefOpIdx >= 2);
+ const MachineOperand &MO = getOperand(DefOpIdx);
+ if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0)
+ return false;
+ // Determine the actual operand number corresponding to this index.
+ unsigned DefNo = 0;
+ for (unsigned i = 1, e = getNumOperands(); i < e; ) {
+ const MachineOperand &FMO = getOperand(i);
+ assert(FMO.isImm());
+ // Skip over this def.
+ i += InlineAsm::getNumOperandRegisters(FMO.getImm()) + 1;
+ if (i > DefOpIdx)
+ break;
+ ++DefNo;
+ }
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &FMO = getOperand(i);
+ if (!FMO.isImm())
+ continue;
+ if (i+1 >= e || !getOperand(i+1).isReg() || !getOperand(i+1).isUse())
+ continue;
+ unsigned Idx;
+ if (InlineAsm::isUseOperandTiedToDef(FMO.getImm(), Idx) &&
+ Idx == DefNo) {
+ if (UseOpIdx)
+ *UseOpIdx = (unsigned)i + 1;
+ return true;
+ }
+ }
+ }
+
+ assert(getOperand(DefOpIdx).isDef() && "DefOpIdx is not a def!");
+ const TargetInstrDesc &TID = getDesc();
+ for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (MO.isReg() && MO.isUse() &&
+ TID.getOperandConstraint(i, TOI::TIED_TO) == (int)DefOpIdx) {
+ if (UseOpIdx)
+ *UseOpIdx = (unsigned)i;
+ return true;
+ }
+ }
+ return false;
+}
+
+/// isRegTiedToDefOperand - Return true if the operand of the specified index
+/// is a register use and it is tied to a def operand. It also returns the def
+/// operand index by reference.
+bool MachineInstr::
+isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const {
+ if (getOpcode() == TargetInstrInfo::INLINEASM) {
+ const MachineOperand &MO = getOperand(UseOpIdx);
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0)
+ return false;
+ assert(UseOpIdx > 0);
+ const MachineOperand &UFMO = getOperand(UseOpIdx-1);
+ if (!UFMO.isImm())
+ return false; // Must be physreg uses.
+ unsigned DefNo;
+ if (InlineAsm::isUseOperandTiedToDef(UFMO.getImm(), DefNo)) {
+ if (!DefOpIdx)
+ return true;
+
+ unsigned DefIdx = 1;
+ // Remember to adjust the index. First operand is asm string, then there
+ // is a flag for each.
+ while (DefNo) {
+ const MachineOperand &FMO = getOperand(DefIdx);
+ assert(FMO.isImm());
+ // Skip over this def.
+ DefIdx += InlineAsm::getNumOperandRegisters(FMO.getImm()) + 1;
+ --DefNo;
+ }
+ *DefOpIdx = DefIdx+1;
+ return true;
+ }
+ return false;
+ }
+
+ const TargetInstrDesc &TID = getDesc();
+ if (UseOpIdx >= TID.getNumOperands())
+ return false;
+ const MachineOperand &MO = getOperand(UseOpIdx);
+ if (!MO.isReg() || !MO.isUse())
+ return false;
+ int DefIdx = TID.getOperandConstraint(UseOpIdx, TOI::TIED_TO);
+ if (DefIdx == -1)
+ return false;
+ if (DefOpIdx)
+ *DefOpIdx = (unsigned)DefIdx;
+ return true;
+}
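+// A worked INLINEASM operand layout (hypothetical, assuming one register per
+// operand group):
+//   op 0: asm string
+//   op 1: flag imm for a def group   op 2: def register
+//   op 3: flag imm for a use group   op 4: use register tied to def #0
+// For UseOpIdx == 4 the flag at op 3 reports the tie to def #0, and the walk
+// above skips one group (ops 1-2) to compute *DefOpIdx == 2; conversely,
+// isRegTiedToUseOperand(2) recovers UseOpIdx == 4 from the same flags.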
+
+/// copyKillDeadInfo - Copies kill / dead operand properties from MI.
+///
+void MachineInstr::copyKillDeadInfo(const MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || (!MO.isKill() && !MO.isDead()))
+ continue;
+ for (unsigned j = 0, ee = getNumOperands(); j != ee; ++j) {
+ MachineOperand &MOp = getOperand(j);
+ if (!MOp.isIdenticalTo(MO))
+ continue;
+ if (MO.isKill())
+ MOp.setIsKill();
+ else
+ MOp.setIsDead();
+ break;
+ }
+ }
+}
+
+/// copyPredicates - Copies predicate operand(s) from MI.
+void MachineInstr::copyPredicates(const MachineInstr *MI) {
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.isPredicable())
+ return;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (TID.OpInfo[i].isPredicate()) {
+ // Predicated operands must be last operands.
+ addOperand(MI->getOperand(i));
+ }
+ }
+}
+
+/// isSafeToMove - Return true if it is safe to move this instruction. If
+/// SawStore is set to true, it means that there is a store (or call) between
+/// the instruction's location and its intended destination.
+bool MachineInstr::isSafeToMove(const TargetInstrInfo *TII,
+ bool &SawStore) const {
+ // Ignore stuff that we obviously can't move.
+ if (TID->mayStore() || TID->isCall()) {
+ SawStore = true;
+ return false;
+ }
+ if (TID->isTerminator() || TID->hasUnmodeledSideEffects())
+ return false;
+
+ // See if this instruction does a load. If so, we have to guarantee that the
+ // loaded value doesn't change between the load and its intended destination.
+ // The check for isInvariantLoad gives the target the chance to classify the
+ // load as always returning a constant, e.g. a constant pool load.
+ if (TID->mayLoad() && !TII->isInvariantLoad(this))
+ // Otherwise, this is a real load. If there is a store between the load and
+ // the end of the block, or if the load is volatile, we can't move it.
+ return !SawStore && !hasVolatileMemoryRef();
+
+ return true;
+}
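+// A minimal usage sketch (hypothetical caller; MachineSink.cpp later in this
+// patch applies the same bottom-up pattern):
+//   bool SawStore = false;
+//   MachineBasicBlock::iterator I = MBB.end();
+//   while (I != MBB.begin())
+//     if ((--I)->isSafeToMove(TII, SawStore))
+//       ; // candidate for moving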
+
+/// isSafeToReMat - Return true if it's safe to rematerialize the specified
+/// instruction which defined the specified register instead of copying it.
+bool MachineInstr::isSafeToReMat(const TargetInstrInfo *TII,
+ unsigned DstReg) const {
+ bool SawStore = false;
+ if (!getDesc().isRematerializable() ||
+ !TII->isTriviallyReMaterializable(this) ||
+ !isSafeToMove(TII, SawStore))
+ return false;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg())
+ continue;
+ // FIXME: For now, do not remat any instruction with register operands.
+ // Later on, we can loosen the restriction if the register operands have
+ // not been modified between the def and the use. Note that this is
+ // different from MachineSink because the code is no longer in two-address
+ // form (at least partially).
+ if (MO.isUse())
+ return false;
+ else if (!MO.isDead() && MO.getReg() != DstReg)
+ return false;
+ }
+ return true;
+}
+
+/// hasVolatileMemoryRef - Return true if this instruction may have a
+/// volatile memory reference, or if the information describing the
+/// memory reference is not available. Return false if it is known to
+/// have no volatile memory references.
+bool MachineInstr::hasVolatileMemoryRef() const {
+ // An instruction known never to access memory won't have a volatile access.
+ if (!TID->mayStore() &&
+ !TID->mayLoad() &&
+ !TID->isCall() &&
+ !TID->hasUnmodeledSideEffects())
+ return false;
+
+ // Otherwise, if the instruction has no memory reference information,
+ // conservatively assume it wasn't preserved.
+ if (memoperands_empty())
+ return true;
+
+ // Check the memory reference information for volatile references.
+ for (std::list<MachineMemOperand>::const_iterator I = memoperands_begin(),
+ E = memoperands_end(); I != E; ++I)
+ if (I->isVolatile())
+ return true;
+
+ return false;
+}
+
+void MachineInstr::dump() const {
+ cerr << " " << *this;
+}
+
+void MachineInstr::print(std::ostream &OS, const TargetMachine *TM) const {
+ raw_os_ostream RawOS(OS);
+ print(RawOS, TM);
+}
+
+void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
+ // Specialize printing if op#0 is definition
+ unsigned StartOp = 0;
+ if (getNumOperands() && getOperand(0).isReg() && getOperand(0).isDef()) {
+ getOperand(0).print(OS, TM);
+ OS << " = ";
+ ++StartOp; // Don't print this operand again!
+ }
+
+ OS << getDesc().getName();
+
+ for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) {
+ if (i != StartOp)
+ OS << ",";
+ OS << " ";
+ getOperand(i).print(OS, TM);
+ }
+
+ if (!memoperands_empty()) {
+ OS << ", Mem:";
+ for (std::list<MachineMemOperand>::const_iterator i = memoperands_begin(),
+ e = memoperands_end(); i != e; ++i) {
+ const MachineMemOperand &MRO = *i;
+ const Value *V = MRO.getValue();
+
+ assert((MRO.isLoad() || MRO.isStore()) &&
+ "SV has to be a load, store or both.");
+
+ if (MRO.isVolatile())
+ OS << "Volatile ";
+
+ if (MRO.isLoad())
+ OS << "LD";
+ if (MRO.isStore())
+ OS << "ST";
+
+ OS << "(" << MRO.getSize() << "," << MRO.getAlignment() << ") [";
+
+ if (!V)
+ OS << "<unknown>";
+ else if (!V->getName().empty())
+ OS << V->getName();
+ else if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+ PSV->print(OS);
+ } else
+ OS << V;
+
+ OS << " + " << MRO.getOffset() << "]";
+ }
+ }
+
+ if (!debugLoc.isUnknown()) {
+ const MachineFunction *MF = getParent()->getParent();
+ DebugLocTuple DLT = MF->getDebugLocTuple(debugLoc);
+ DICompileUnit CU(DLT.CompileUnit);
+ std::string Dir, Fn;
+ OS << " [dbg: "
+ << CU.getDirectory(Dir) << '/' << CU.getFilename(Fn) << ","
+ << DLT.Line << ","
+ << DLT.Col << "]";
+ }
+
+ OS << "\n";
+}
+
+bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
+ const TargetRegisterInfo *RegInfo,
+ bool AddIfNotFound) {
+ bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+ bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg);
+ bool Found = false;
+ SmallVector<unsigned,4> DeadOps;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ if (Reg == IncomingReg) {
+ if (!Found) {
+ if (MO.isKill())
+ // The register is already marked kill.
+ return true;
+ MO.setIsKill();
+ Found = true;
+ }
+ } else if (hasAliases && MO.isKill() &&
+ TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // A super-register kill already exists.
+ if (RegInfo->isSuperRegister(IncomingReg, Reg))
+ return true;
+ if (RegInfo->isSubRegister(IncomingReg, Reg))
+ DeadOps.push_back(i);
+ }
+ }
+
+ // Trim unneeded kill operands.
+ while (!DeadOps.empty()) {
+ unsigned OpIdx = DeadOps.back();
+ if (getOperand(OpIdx).isImplicit())
+ RemoveOperand(OpIdx);
+ else
+ getOperand(OpIdx).setIsKill(false);
+ DeadOps.pop_back();
+ }
+
+ // If not found, this means an alias of one of the operands is killed. Add a
+ // new implicit operand if required.
+ if (!Found && AddIfNotFound) {
+ addOperand(MachineOperand::CreateReg(IncomingReg,
+ false /*IsDef*/,
+ true /*IsImp*/,
+ true /*IsKill*/));
+ return true;
+ }
+ return Found;
+}
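+// Example (hypothetical situation): a liveness pass that determines
+// IncomingReg dies at this instruction can mark the kill, appending an
+// implicit-kill operand when no explicit use of the register exists here:
+//   MI->addRegisterKilled(Reg, TRI, /*AddIfNotFound=*/true);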
+
+bool MachineInstr::addRegisterDead(unsigned IncomingReg,
+ const TargetRegisterInfo *RegInfo,
+ bool AddIfNotFound) {
+ bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+ bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg);
+ bool Found = false;
+ SmallVector<unsigned,4> DeadOps;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ if (Reg == IncomingReg) {
+ if (!Found) {
+ if (MO.isDead())
+ // The register is already marked dead.
+ return true;
+ MO.setIsDead();
+ Found = true;
+ }
+ } else if (hasAliases && MO.isDead() &&
+ TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // There exists a super-register that's marked dead.
+ if (RegInfo->isSuperRegister(IncomingReg, Reg))
+ return true;
+ if (RegInfo->getSubRegisters(IncomingReg) &&
+ RegInfo->getSuperRegisters(Reg) &&
+ RegInfo->isSubRegister(IncomingReg, Reg))
+ DeadOps.push_back(i);
+ }
+ }
+
+ // Trim unneeded dead operands.
+ while (!DeadOps.empty()) {
+ unsigned OpIdx = DeadOps.back();
+ if (getOperand(OpIdx).isImplicit())
+ RemoveOperand(OpIdx);
+ else
+ getOperand(OpIdx).setIsDead(false);
+ DeadOps.pop_back();
+ }
+
+ // If not found, this means an alias of one of the operands is dead. Add a
+ // new implicit operand if required.
+ if (!Found && AddIfNotFound) {
+ addOperand(MachineOperand::CreateReg(IncomingReg,
+ true /*IsDef*/,
+ true /*IsImp*/,
+ false /*IsKill*/,
+ true /*IsDead*/));
+ return true;
+ }
+ return Found;
+}
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
new file mode 100644
index 0000000..aaa4de4
--- /dev/null
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -0,0 +1,406 @@
+//===-- MachineLICM.cpp - Machine Loop Invariant Code Motion Pass ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion on machine instructions. We
+// attempt to remove as much code from the body of a loop as possible.
+//
+// This pass does not attempt to throttle itself to limit register pressure.
+// The register allocation phases are expected to perform rematerialization
+// to recover when register pressure is high.
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for the LLVM-IR-level LICM pass. It is only designed to hoist simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-licm"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+STATISTIC(NumHoisted, "Number of machine instructions hoisted out of loops");
+STATISTIC(NumCSEed, "Number of hoisted machine instructions CSEed");
+
+namespace {
+ class VISIBILITY_HIDDEN MachineLICM : public MachineFunctionPass {
+ const TargetMachine *TM;
+ const TargetInstrInfo *TII;
+
+ // Various analyses that we use...
+ MachineLoopInfo *LI; // Current MachineLoopInfo
+ MachineDominatorTree *DT; // Machine dominator tree for the cur loop
+ MachineRegisterInfo *RegInfo; // Machine register information
+
+ // State that is updated as we process loops
+ bool Changed; // True if a loop is changed.
+ MachineLoop *CurLoop; // The current loop we are working on.
+ MachineBasicBlock *CurPreheader; // The preheader for CurLoop.
+
+ // For each BB and opcode pair, keep a list of hoisted instructions.
+ DenseMap<std::pair<unsigned, unsigned>,
+ std::vector<const MachineInstr*> > CSEMap;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ MachineLICM() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ const char *getPassName() const { return "Machine Instruction LICM"; }
+
+ // FIXME: Loop preheaders?
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual void releaseMemory() {
+ CSEMap.clear();
+ }
+
+ private:
+ /// IsLoopInvariantInst - Returns true if the instruction is loop
+ /// invariant. I.e., all virtual register operands are defined outside of
+ /// the loop, physical registers aren't accessed (explicitly or implicitly),
+ /// and the instruction is hoistable.
+ ///
+ bool IsLoopInvariantInst(MachineInstr &I);
+
+ /// IsProfitableToHoist - Return true if it is potentially profitable to
+ /// hoist the given loop invariant.
+ bool IsProfitableToHoist(MachineInstr &MI);
+
+ /// HoistRegion - Walk the specified region of the CFG (defined by all
+ /// blocks dominated by the specified block, and that are in the current
+ /// loop) in depth first order w.r.t the DominatorTree. This allows us to
+ /// visit definitions before uses, allowing us to hoist a loop body in one
+ /// pass without iteration.
+ ///
+ void HoistRegion(MachineDomTreeNode *N);
+
+ /// Hoist - When an instruction is found to use only loop invariant operands
+ /// that are safe to hoist, this function is called to do the dirty work.
+ ///
+ void Hoist(MachineInstr &MI);
+ };
+} // end anonymous namespace
+
+char MachineLICM::ID = 0;
+static RegisterPass<MachineLICM>
+X("machinelicm", "Machine Loop Invariant Code Motion");
+
+FunctionPass *llvm::createMachineLICMPass() { return new MachineLICM(); }
+
+/// LoopIsOuterMostWithPreheader - Test if the given loop is the outer-most
+/// loop that has a preheader.
+static bool LoopIsOuterMostWithPreheader(MachineLoop *CurLoop) {
+ for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop())
+ if (L->getLoopPreheader())
+ return false;
+ return true;
+}
+
+/// Hoist expressions out of the specified loop. Note that alias info for the
+/// inner loop is not preserved, so it is not a good idea to run LICM multiple
+/// times on one loop.
+///
+bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+ if (F->hasFnAttr(Attribute::OptimizeForSize))
+ return false;
+
+ DOUT << "******** Machine LICM ********\n";
+
+ Changed = false;
+ TM = &MF.getTarget();
+ TII = TM->getInstrInfo();
+ RegInfo = &MF.getRegInfo();
+
+ // Get our Loop information...
+ LI = &getAnalysis<MachineLoopInfo>();
+ DT = &getAnalysis<MachineDominatorTree>();
+
+ for (MachineLoopInfo::iterator
+ I = LI->begin(), E = LI->end(); I != E; ++I) {
+ CurLoop = *I;
+
+ // Only visit outer-most preheader-sporting loops.
+ if (!LoopIsOuterMostWithPreheader(CurLoop))
+ continue;
+
+ // Determine the block to which to hoist instructions. If we can't find a
+ // suitable loop preheader, we can't do any hoisting.
+ //
+ // FIXME: We are only hoisting if the basic block coming into this loop
+ // has only one successor. This isn't the case in general because we haven't
+ // broken critical edges or added preheaders.
+ CurPreheader = CurLoop->getLoopPreheader();
+ if (!CurPreheader)
+ continue;
+
+ HoistRegion(DT->getNode(CurLoop->getHeader()));
+ }
+
+ return Changed;
+}
+
+/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in depth
+/// first order w.r.t the DominatorTree. This allows us to visit definitions
+/// before uses, allowing us to hoist a loop body in one pass without iteration.
+///
+void MachineLICM::HoistRegion(MachineDomTreeNode *N) {
+ assert(N != 0 && "Null dominator tree node?");
+ MachineBasicBlock *BB = N->getBlock();
+
+ // If this subregion is not in the top level loop at all, exit.
+ if (!CurLoop->contains(BB)) return;
+
+ for (MachineBasicBlock::iterator
+ MII = BB->begin(), E = BB->end(); MII != E; ) {
+ MachineBasicBlock::iterator NextMII = MII; ++NextMII;
+ MachineInstr &MI = *MII;
+
+ Hoist(MI);
+
+ MII = NextMII;
+ }
+
+ const std::vector<MachineDomTreeNode*> &Children = N->getChildren();
+
+ for (unsigned I = 0, E = Children.size(); I != E; ++I)
+ HoistRegion(Children[I]);
+}
+
+/// IsLoopInvariantInst - Returns true if the instruction is loop
+/// invariant. I.e., all virtual register operands are defined outside of the
+/// loop, physical registers aren't accessed explicitly, and there are no side
+/// effects that aren't captured by the operands or other flags.
+///
+bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) {
+ const TargetInstrDesc &TID = I.getDesc();
+
+ // Ignore stuff that we obviously can't hoist.
+ if (TID.mayStore() || TID.isCall() || TID.isTerminator() ||
+ TID.hasUnmodeledSideEffects())
+ return false;
+
+ if (TID.mayLoad()) {
+ // Okay, this instruction does a load. As a refinement, we allow the target
+ // to decide whether the loaded value is actually a constant. If so, the
+ // load can be hoisted like any other invariant computation.
+ if (!TII->isInvariantLoad(&I))
+ // FIXME: we should be able to sink loads with no other side effects if
+ // there is nothing that can change memory from here until the end of
+ // block. This is a trivial form of alias analysis.
+ return false;
+ }
+
+ DEBUG({
+ DOUT << "--- Checking if we can hoist " << I;
+ if (I.getDesc().getImplicitUses()) {
+ DOUT << " * Instruction has implicit uses:\n";
+
+ const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+ for (const unsigned *ImpUses = I.getDesc().getImplicitUses();
+ *ImpUses; ++ImpUses)
+ DOUT << " -> " << TRI->getName(*ImpUses) << "\n";
+ }
+
+ if (I.getDesc().getImplicitDefs()) {
+ DOUT << " * Instruction has implicit defines:\n";
+
+ const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+ for (const unsigned *ImpDefs = I.getDesc().getImplicitDefs();
+ *ImpDefs; ++ImpDefs)
+ DOUT << " -> " << TRI->getName(*ImpDefs) << "\n";
+ }
+ });
+
+ if (I.getDesc().getImplicitDefs() || I.getDesc().getImplicitUses()) {
+ DOUT << "Cannot hoist with implicit defines or uses\n";
+ return false;
+ }
+
+ // The instruction is loop invariant if all of its operands are.
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = I.getOperand(i);
+
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ // Don't hoist an instruction that uses or defines a physical register.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false;
+
+ if (!MO.isUse())
+ continue;
+
+ assert(RegInfo->getVRegDef(Reg) &&
+ "Machine instr not mapped for this vreg?!");
+
+ // If the loop contains the definition of an operand, then the instruction
+ // isn't loop invariant.
+ if (CurLoop->contains(RegInfo->getVRegDef(Reg)->getParent()))
+ return false;
+ }
+
+ // If we got this far, the instruction is loop invariant!
+ return true;
+}
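+// (The physical-register check above is deliberately conservative: even a
+// plain read of a physreg, e.g. a condition-flags register, may observe
+// different values on each iteration, so only pure vreg computations are
+// treated as invariant here.)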
+
+
+/// HasPHIUses - Return true if the specified register has any PHI use.
+static bool HasPHIUses(unsigned Reg, MachineRegisterInfo *RegInfo) {
+ for (MachineRegisterInfo::use_iterator UI = RegInfo->use_begin(Reg),
+ UE = RegInfo->use_end(); UI != UE; ++UI) {
+ MachineInstr *UseMI = &*UI;
+ if (UseMI->getOpcode() == TargetInstrInfo::PHI)
+ return true;
+ }
+ return false;
+}
+
+/// IsProfitableToHoist - Return true if it is potentially profitable to hoist
+/// the given loop invariant.
+bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
+ if (MI.getOpcode() == TargetInstrInfo::IMPLICIT_DEF)
+ return false;
+
+ const TargetInstrDesc &TID = MI.getDesc();
+
+ // FIXME: For now, only hoist re-materializable instructions. LICM can
+ // increase register pressure, and we want to make sure it doesn't increase
+ // spilling.
+ if (!TID.mayLoad() && (!TID.isRematerializable() ||
+ !TII->isTriviallyReMaterializable(&MI)))
+ return false;
+
+ // If the result(s) of this instruction are used by PHIs, then don't hoist
+ // it. The presence of joins makes it difficult for the current register
+ // allocator implementation to perform remat.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ if (HasPHIUses(MO.getReg(), RegInfo))
+ return false;
+ }
+
+ return true;
+}
+
+static const MachineInstr *LookForDuplicate(const MachineInstr *MI,
+ std::vector<const MachineInstr*> &PrevMIs,
+ MachineRegisterInfo *RegInfo) {
+ unsigned NumOps = MI->getNumOperands();
+ for (unsigned i = 0, e = PrevMIs.size(); i != e; ++i) {
+ const MachineInstr *PrevMI = PrevMIs[i];
+ unsigned NumOps2 = PrevMI->getNumOperands();
+ if (NumOps != NumOps2)
+ continue;
+ bool IsSame = true;
+ for (unsigned j = 0; j != NumOps; ++j) {
+ const MachineOperand &MO = MI->getOperand(j);
+ if (MO.isReg() && MO.isDef()) {
+ if (RegInfo->getRegClass(MO.getReg()) !=
+ RegInfo->getRegClass(PrevMI->getOperand(j).getReg())) {
+ IsSame = false;
+ break;
+ }
+ continue;
+ }
+ if (!MO.isIdenticalTo(PrevMI->getOperand(j))) {
+ IsSame = false;
+ break;
+ }
+ }
+ if (IsSame)
+ return PrevMI;
+ }
+ return 0;
+}
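+// Note that def operands are compared by register class rather than by
+// register, since separately hoisted instructions define distinct virtual
+// registers; every other operand must match exactly via isIdenticalTo().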
+
+/// Hoist - When an instruction is found to use only loop invariant operands
+/// that are safe to hoist, this function is called to do the dirty work.
+///
+void MachineLICM::Hoist(MachineInstr &MI) {
+ if (!IsLoopInvariantInst(MI)) return;
+ if (!IsProfitableToHoist(MI)) return;
+
+ // Now move the instruction to the preheader, inserting it before any
+ // terminator instructions.
+ DEBUG({
+ DOUT << "Hoisting " << MI;
+ if (CurPreheader->getBasicBlock())
+ DOUT << " to MachineBasicBlock "
+ << CurPreheader->getBasicBlock()->getName();
+ if (MI.getParent()->getBasicBlock())
+ DOUT << " from MachineBasicBlock "
+ << MI.getParent()->getBasicBlock()->getName();
+ DOUT << "\n";
+ });
+
+ // Look for opportunity to CSE the hoisted instruction.
+ std::pair<unsigned, unsigned> BBOpcPair =
+ std::make_pair(CurPreheader->getNumber(), MI.getOpcode());
+ DenseMap<std::pair<unsigned, unsigned>,
+ std::vector<const MachineInstr*> >::iterator CI = CSEMap.find(BBOpcPair);
+ bool DoneCSE = false;
+ if (CI != CSEMap.end()) {
+ const MachineInstr *Dup = LookForDuplicate(&MI, CI->second, RegInfo);
+ if (Dup) {
+ DOUT << "CSEing " << MI;
+ DOUT << " with " << *Dup;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isDef())
+ RegInfo->replaceRegWith(MO.getReg(), Dup->getOperand(i).getReg());
+ }
+ MI.eraseFromParent();
+ DoneCSE = true;
+ ++NumCSEed;
+ }
+ }
+
+ // Otherwise, splice the instruction to the preheader.
+ if (!DoneCSE) {
+ CurPreheader->splice(CurPreheader->getFirstTerminator(),
+ MI.getParent(), &MI);
+ // Add to the CSE map.
+ if (CI != CSEMap.end())
+ CI->second.push_back(&MI);
+ else {
+ std::vector<const MachineInstr*> CSEMIs;
+ CSEMIs.push_back(&MI);
+ CSEMap.insert(std::make_pair(BBOpcPair, CSEMIs));
+ }
+ }
+
+ ++NumHoisted;
+ Changed = true;
+}
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
new file mode 100644
index 0000000..68ddb7b
--- /dev/null
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -0,0 +1,40 @@
+//===- MachineLoopInfo.cpp - Natural Loop Calculator ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachineLoopInfo class that is used to identify natural
+// loops and determine the loop depth of various nodes of the CFG. Note that
+// the loops identified may actually be several natural loops that share the
+// same header node... not just a single natural loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+using namespace llvm;
+
+TEMPLATE_INSTANTIATION(class LoopBase<MachineBasicBlock>);
+TEMPLATE_INSTANTIATION(class LoopInfoBase<MachineBasicBlock>);
+
+char MachineLoopInfo::ID = 0;
+static RegisterPass<MachineLoopInfo>
+X("machine-loops", "Machine Natural Loop Construction", true);
+
+const PassInfo *const llvm::MachineLoopInfoID = &X;
+
+bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) {
+ releaseMemory();
+ LI->Calculate(getAnalysis<MachineDominatorTree>().getBase()); // Update
+ return false;
+}
+
+void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+}
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
new file mode 100644
index 0000000..1d8109e
--- /dev/null
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -0,0 +1,368 @@
+//===-- llvm/CodeGen/MachineModuleInfo.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+#include "llvm/Constants.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+using namespace llvm::dwarf;
+
+// Handle the Pass registration stuff necessary to use MachineModuleInfo.
+static RegisterPass<MachineModuleInfo>
+X("machinemoduleinfo", "Module Information");
+char MachineModuleInfo::ID = 0;
+
+//===----------------------------------------------------------------------===//
+
+MachineModuleInfo::MachineModuleInfo()
+: ImmutablePass(&ID)
+, LabelIDList()
+, FrameMoves()
+, LandingPads()
+, Personalities()
+, CallsEHReturn(0)
+, CallsUnwindInit(0)
+, DbgInfoAvailable(false)
+{
+ // Always emit "no personality" info
+ Personalities.push_back(NULL);
+}
+MachineModuleInfo::~MachineModuleInfo() {
+
+}
+
+/// doInitialization - Initialize the state for a new module.
+///
+bool MachineModuleInfo::doInitialization() {
+ return false;
+}
+
+/// doFinalization - Tear down the state after completion of a module.
+///
+bool MachineModuleInfo::doFinalization() {
+ return false;
+}
+
+/// BeginFunction - Begin gathering function meta information.
+///
+void MachineModuleInfo::BeginFunction(MachineFunction *MF) {
+ // Coming soon.
+}
+
+/// EndFunction - Discard function meta information.
+///
+void MachineModuleInfo::EndFunction() {
+ // Clean up frame info.
+ FrameMoves.clear();
+
+ // Clean up exception info.
+ LandingPads.clear();
+ TypeInfos.clear();
+ FilterIds.clear();
+ FilterEnds.clear();
+ CallsEHReturn = 0;
+ CallsUnwindInit = 0;
+}
+
+/// AnalyzeModule - Scan the module for global debug information.
+///
+void MachineModuleInfo::AnalyzeModule(Module &M) {
+ // Insert functions in the llvm.used array into UsedFunctions.
+ GlobalVariable *GV = M.getGlobalVariable("llvm.used");
+ if (!GV || !GV->hasInitializer()) return;
+
+ // Should be an array of 'i8*'.
+ ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (InitList == 0) return;
+
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InitList->getOperand(i)))
+ if (CE->getOpcode() == Instruction::BitCast)
+ if (Function *F = dyn_cast<Function>(CE->getOperand(0)))
+ UsedFunctions.insert(F);
+ }
+}
+
+//===-EH-------------------------------------------------------------------===//
+
+/// getOrCreateLandingPadInfo - Find or create a LandingPadInfo for the
+/// specified MachineBasicBlock.
+LandingPadInfo &MachineModuleInfo::getOrCreateLandingPadInfo
+ (MachineBasicBlock *LandingPad) {
+ unsigned N = LandingPads.size();
+ for (unsigned i = 0; i < N; ++i) {
+ LandingPadInfo &LP = LandingPads[i];
+ if (LP.LandingPadBlock == LandingPad)
+ return LP;
+ }
+
+ LandingPads.push_back(LandingPadInfo(LandingPad));
+ return LandingPads[N];
+}
+
+/// addInvoke - Provide the begin and end labels of an invoke style call and
+/// associate it with a try landing pad block.
+void MachineModuleInfo::addInvoke(MachineBasicBlock *LandingPad,
+ unsigned BeginLabel, unsigned EndLabel) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.BeginLabels.push_back(BeginLabel);
+ LP.EndLabels.push_back(EndLabel);
+}
+
+/// addLandingPad - Provide the label of a try LandingPad block.
+///
+unsigned MachineModuleInfo::addLandingPad(MachineBasicBlock *LandingPad) {
+ unsigned LandingPadLabel = NextLabelID();
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.LandingPadLabel = LandingPadLabel;
+ return LandingPadLabel;
+}
+
+/// addPersonality - Provide the personality function for the exception
+/// information.
+void MachineModuleInfo::addPersonality(MachineBasicBlock *LandingPad,
+ Function *Personality) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.Personality = Personality;
+
+ for (unsigned i = 0; i < Personalities.size(); ++i)
+ if (Personalities[i] == Personality)
+ return;
+
+ Personalities.push_back(Personality);
+}
+
+/// addCatchTypeInfo - Provide the catch typeinfo for a landing pad.
+///
+void MachineModuleInfo::addCatchTypeInfo(MachineBasicBlock *LandingPad,
+ std::vector<GlobalVariable *> &TyInfo) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ for (unsigned N = TyInfo.size(); N; --N)
+ LP.TypeIds.push_back(getTypeIDFor(TyInfo[N - 1]));
+}
+
+/// addFilterTypeInfo - Provide the filter typeinfo for a landing pad.
+///
+void MachineModuleInfo::addFilterTypeInfo(MachineBasicBlock *LandingPad,
+ std::vector<GlobalVariable *> &TyInfo) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ std::vector<unsigned> IdsInFilter(TyInfo.size());
+ for (unsigned I = 0, E = TyInfo.size(); I != E; ++I)
+ IdsInFilter[I] = getTypeIDFor(TyInfo[I]);
+ LP.TypeIds.push_back(getFilterIDFor(IdsInFilter));
+}
+
+/// addCleanup - Add a cleanup action for a landing pad.
+///
+void MachineModuleInfo::addCleanup(MachineBasicBlock *LandingPad) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.TypeIds.push_back(0);
+}
+
+/// TidyLandingPads - Remap landing pad labels and remove any deleted landing
+/// pads.
+void MachineModuleInfo::TidyLandingPads() {
+ for (unsigned i = 0; i != LandingPads.size(); ) {
+ LandingPadInfo &LandingPad = LandingPads[i];
+ LandingPad.LandingPadLabel = MappedLabel(LandingPad.LandingPadLabel);
+
+ // Special case: we *should* emit LPs with a null LP MBB; this indicates the
+ // "nounwind" case.
+ if (!LandingPad.LandingPadLabel && LandingPad.LandingPadBlock) {
+ LandingPads.erase(LandingPads.begin() + i);
+ continue;
+ }
+
+ for (unsigned j=0; j != LandingPads[i].BeginLabels.size(); ) {
+ unsigned BeginLabel = MappedLabel(LandingPad.BeginLabels[j]);
+ unsigned EndLabel = MappedLabel(LandingPad.EndLabels[j]);
+
+ if (!BeginLabel || !EndLabel) {
+ LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
+ LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
+ continue;
+ }
+
+ LandingPad.BeginLabels[j] = BeginLabel;
+ LandingPad.EndLabels[j] = EndLabel;
+ ++j;
+ }
+
+ // Remove landing pads with no try-ranges.
+ if (LandingPads[i].BeginLabels.empty()) {
+ LandingPads.erase(LandingPads.begin() + i);
+ continue;
+ }
+
+ // If there is no landing pad, ensure that the list of typeids is empty.
+ // If the only typeid is a cleanup, this is the same as having no typeids.
+ if (!LandingPad.LandingPadBlock ||
+ (LandingPad.TypeIds.size() == 1 && !LandingPad.TypeIds[0]))
+ LandingPad.TypeIds.clear();
+
+ ++i;
+ }
+}
+
+/// getTypeIDFor - Return the type id for the specified typeinfo. This is
+/// function wide.
+unsigned MachineModuleInfo::getTypeIDFor(GlobalVariable *TI) {
+ for (unsigned i = 0, N = TypeInfos.size(); i != N; ++i)
+ if (TypeInfos[i] == TI) return i + 1;
+
+ TypeInfos.push_back(TI);
+ return TypeInfos.size();
+}
+
+/// getFilterIDFor - Return the filter id for the specified typeinfos. This is
+/// function wide.
+int MachineModuleInfo::getFilterIDFor(std::vector<unsigned> &TyIds) {
+ // If the new filter coincides with the tail of an existing filter, then
+ // re-use the existing filter. Folding filters more than this requires
+ // re-ordering filters and/or their elements - probably not worth it.
+ for (std::vector<unsigned>::iterator I = FilterEnds.begin(),
+ E = FilterEnds.end(); I != E; ++I) {
+ unsigned i = *I, j = TyIds.size();
+
+ while (i && j)
+ if (FilterIds[--i] != TyIds[--j])
+ goto try_next;
+
+ if (!j)
+ // The new filter coincides with range [i, end) of the existing filter.
+ return -(1 + i);
+
+try_next:;
+ }
+
+ // Add the new filter.
+ int FilterID = -(1 + FilterIds.size());
+ FilterIds.reserve(FilterIds.size() + TyIds.size() + 1);
+ for (unsigned I = 0, N = TyIds.size(); I != N; ++I)
+ FilterIds.push_back(TyIds[I]);
+ FilterEnds.push_back(FilterIds.size());
+ FilterIds.push_back(0); // terminator
+ return FilterID;
+}
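+// Worked example (hypothetical type ids): adding the filter {2, 3} to empty
+// tables pushes FilterIds = [2, 3, 0] (0 terminates the filter), records
+// FilterEnds = [2], and returns -(1 + 0) = -1. A later filter {3} coincides
+// with the tail of the existing one starting at FilterIds[1], so the scan
+// above returns -(1 + 1) = -2 without growing the tables.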
+
+/// getPersonality - Return the personality function for the current function.
+Function *MachineModuleInfo::getPersonality() const {
+ // FIXME: Until PR1414 is fixed, we're using one personality function per
+ // function.
+ return !LandingPads.empty() ? LandingPads[0].Personality : NULL;
+}
+
+/// getPersonalityIndex - Return a unique index for the current personality
+/// function. A NULL personality function should always get index zero.
+unsigned MachineModuleInfo::getPersonalityIndex() const {
+ const Function* Personality = NULL;
+
+ // Scan landing pads. If there is at least one non-NULL personality, use it.
+ for (unsigned i = 0; i != LandingPads.size(); ++i)
+ if (LandingPads[i].Personality) {
+ Personality = LandingPads[i].Personality;
+ break;
+ }
+
+ for (unsigned i = 0; i < Personalities.size(); ++i) {
+ if (Personalities[i] == Personality)
+ return i;
+ }
+
+ // This should never happen
+ assert(0 && "Personality function should be set!");
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+/// DebugLabelFolding pass - This pass prunes out redundant labels. This allows
+/// an info consumer to determine if the range between two labels is empty, by
+/// seeing if the labels map to the same reduced label.
+
+namespace llvm {
+
+struct DebugLabelFolder : public MachineFunctionPass {
+ static char ID;
+ DebugLabelFolder() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual const char *getPassName() const { return "Label Folder"; }
+};
+
+char DebugLabelFolder::ID = 0;
+
+bool DebugLabelFolder::runOnMachineFunction(MachineFunction &MF) {
+ // Get machine module info.
+ MachineModuleInfo *MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ if (!MMI) return false;
+
+ // Track if change is made.
+ bool MadeChange = false;
+ // No prior label to begin.
+ unsigned PriorLabel = 0;
+
+ // Iterate through basic blocks.
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end();
+ BB != E; ++BB) {
+ // Iterate through instructions.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+ // Is it an unused debug label?
+ if (I->isDebugLabel() && !MMI->isDbgLabelUsed(I->getOperand(0).getImm())){
+ // The label ID # is always operand #0, an immediate.
+ unsigned NextLabel = I->getOperand(0).getImm();
+
+ // If there was an immediately preceding label.
+ if (PriorLabel) {
+ // Remap the current label to prior label.
+ MMI->RemapLabel(NextLabel, PriorLabel);
+ // Delete the current label.
+ I = BB->erase(I);
+ // Indicate a change has been made.
+ MadeChange = true;
+ continue;
+ } else {
+ // Start a new round.
+ PriorLabel = NextLabel;
+ }
+ } else {
+ // No consecutive labels.
+ PriorLabel = 0;
+ }
+
+ ++I;
+ }
+ }
+
+ return MadeChange;
+}
+
+FunctionPass *createDebugLabelFoldingPass() { return new DebugLabelFolder(); }
+
+}
+
diff --git a/lib/CodeGen/MachinePassRegistry.cpp b/lib/CodeGen/MachinePassRegistry.cpp
new file mode 100644
index 0000000..9f4ef12
--- /dev/null
+++ b/lib/CodeGen/MachinePassRegistry.cpp
@@ -0,0 +1,41 @@
+//===-- lib/CodeGen/MachinePassRegistry.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the machine function pass registry for register allocators
+// and instruction schedulers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePassRegistry.h"
+
+using namespace llvm;
+
+
+/// Add - Adds a function pass to the registration list.
+///
+void MachinePassRegistry::Add(MachinePassRegistryNode *Node) {
+ Node->setNext(List);
+ List = Node;
+ if (Listener) Listener->NotifyAdd(Node->getName(),
+ Node->getCtor(),
+ Node->getDescription());
+}
+
+
+/// Remove - Removes a function pass from the registration list.
+///
+void MachinePassRegistry::Remove(MachinePassRegistryNode *Node) {
+ for (MachinePassRegistryNode **I = &List; *I; I = (*I)->getNextAddress()) {
+ if (*I == Node) {
+ if (Listener) Listener->NotifyRemove(Node->getName());
+ *I = (*I)->getNext();
+ break;
+ }
+ }
+}
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
new file mode 100644
index 0000000..4f5ab1f
--- /dev/null
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -0,0 +1,125 @@
+//===-- lib/CodeGen/MachineRegisterInfo.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the MachineRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI) {
+ VRegInfo.reserve(256);
+ RegClass2VRegMap.resize(TRI.getNumRegClasses()+1); // RC ID starts at 1.
+ UsedPhysRegs.resize(TRI.getNumRegs());
+
+ // Create the physreg use/def lists.
+ PhysRegUseDefLists = new MachineOperand*[TRI.getNumRegs()];
+ memset(PhysRegUseDefLists, 0, sizeof(MachineOperand*)*TRI.getNumRegs());
+}
+
+MachineRegisterInfo::~MachineRegisterInfo() {
+#ifndef NDEBUG
+ for (unsigned i = 0, e = VRegInfo.size(); i != e; ++i)
+ assert(VRegInfo[i].second == 0 && "Vreg use list non-empty still?");
+ for (unsigned i = 0, e = UsedPhysRegs.size(); i != e; ++i)
+ assert(!PhysRegUseDefLists[i] &&
+ "PhysRegUseDefLists has entries after all instructions are deleted");
+#endif
+ delete [] PhysRegUseDefLists;
+}
+
+/// setRegClass - Set the register class of the specified virtual register.
+///
+void
+MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) {
+ unsigned VR = Reg;
+ Reg -= TargetRegisterInfo::FirstVirtualRegister;
+ assert(Reg < VRegInfo.size() && "Invalid vreg!");
+ const TargetRegisterClass *OldRC = VRegInfo[Reg].first;
+ VRegInfo[Reg].first = RC;
+
+ // Remove from old register class's vregs list. This may be slow but
+ // fortunately this operation is rarely needed.
+ std::vector<unsigned> &VRegs = RegClass2VRegMap[OldRC->getID()];
+ std::vector<unsigned>::iterator I=std::find(VRegs.begin(), VRegs.end(), VR);
+ VRegs.erase(I);
+
+ // Add to new register class's vregs list.
+ RegClass2VRegMap[RC->getID()].push_back(VR);
+}
+
+/// createVirtualRegister - Create and return a new virtual register in the
+/// function with the specified register class.
+///
+unsigned
+MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
+ assert(RegClass && "Cannot create register without RegClass!");
+ // Add a reg, but keep track of whether the vector reallocated or not.
+ void *ArrayBase = VRegInfo.empty() ? 0 : &VRegInfo[0];
+ VRegInfo.push_back(std::make_pair(RegClass, (MachineOperand*)0));
+
+ if (&VRegInfo[0] != ArrayBase && VRegInfo.size() != 1)
+ // The vector reallocated, handle this now.
+ HandleVRegListReallocation();
+ unsigned VR = getLastVirtReg();
+ RegClass2VRegMap[RegClass->getID()].push_back(VR);
+ return VR;
+}
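+// Typical use (sketch): passes create fresh vregs during lowering, e.g.
+//   unsigned NewVR = MF.getRegInfo().createVirtualRegister(RC);
+// where RC is the TargetRegisterClass the new value must live in.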
+
+/// HandleVRegListReallocation - We just added a virtual register to the
+/// VRegInfo info list and it reallocated. Update the use/def lists info
+/// pointers.
+void MachineRegisterInfo::HandleVRegListReallocation() {
+ // The back pointers for the vreg lists point into the previous vector.
+ // Update them to point to their correct slots.
+ for (unsigned i = 0, e = VRegInfo.size(); i != e; ++i) {
+ MachineOperand *List = VRegInfo[i].second;
+ if (!List) continue;
+ // Update the back-pointer to be accurate once more.
+ List->Contents.Reg.Prev = &VRegInfo[i].second;
+ }
+}
+
+/// replaceRegWith - Replace all instances of FromReg with ToReg in the
+/// machine function. This is like llvm-level X->replaceAllUsesWith(Y),
+/// except that it also changes any definitions of the register as well.
+void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
+ assert(FromReg != ToReg && "Cannot replace a reg with itself");
+
+ // TODO: This could be more efficient by bulk changing the operands.
+ for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) {
+ MachineOperand &O = I.getOperand();
+ ++I;
+ O.setReg(ToReg);
+ }
+}
+
+
+/// getVRegDef - Return the machine instr that defines the specified virtual
+/// register or null if none is found. This assumes that the code is in SSA
+/// form, so there should only be one definition.
+MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const {
+ assert(Reg-TargetRegisterInfo::FirstVirtualRegister < VRegInfo.size() &&
+ "Invalid vreg!");
+ for (reg_iterator I = reg_begin(Reg), E = reg_end(); I != E; ++I) {
+ // Since we are in SSA form, we can stop at the first definition.
+ if (I.getOperand().isDef())
+ return &*I;
+ }
+ return 0;
+}
+
+
+#ifndef NDEBUG
+void MachineRegisterInfo::dumpUses(unsigned Reg) const {
+ for (use_iterator I = use_begin(Reg), E = use_end(); I != E; ++I)
+ I.getOperand().getParent()->dump();
+}
+#endif
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
new file mode 100644
index 0000000..0e18fa7
--- /dev/null
+++ b/lib/CodeGen/MachineSink.cpp
@@ -0,0 +1,257 @@
+//===-- MachineSink.cpp - Sinking for machine instructions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves (sinks) machine instructions into successor blocks, when
+// possible, so that they aren't executed on paths where their results aren't
+// needed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-sink"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+STATISTIC(NumSunk, "Number of machine instructions sunk");
+
+namespace {
+ class VISIBILITY_HIDDEN MachineSinking : public MachineFunctionPass {
+ const TargetMachine *TM;
+ const TargetInstrInfo *TII;
+ MachineFunction *CurMF; // Current MachineFunction
+ MachineRegisterInfo *RegInfo; // Machine register information
+ MachineDominatorTree *DT; // Machine dominator tree for the current Loop
+
+ public:
+ static char ID; // Pass identification
+ MachineSinking() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ private:
+ bool ProcessBlock(MachineBasicBlock &MBB);
+ bool SinkInstruction(MachineInstr *MI, bool &SawStore);
+ bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB) const;
+ };
+} // end anonymous namespace
+
+char MachineSinking::ID = 0;
+static RegisterPass<MachineSinking>
+X("machine-sink", "Machine code sinking");
+
+FunctionPass *llvm::createMachineSinkingPass() { return new MachineSinking(); }
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified register
+/// occur in blocks dominated by the specified block.
+bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg,
+ MachineBasicBlock *MBB) const {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+ "Only makes sense for vregs");
+ for (MachineRegisterInfo::reg_iterator I = RegInfo->reg_begin(Reg),
+ E = RegInfo->reg_end(); I != E; ++I) {
+ if (I.getOperand().isDef()) continue; // ignore def.
+
+ // Determine the block of the use.
+ MachineInstr *UseInst = &*I;
+ MachineBasicBlock *UseBlock = UseInst->getParent();
+ if (UseInst->getOpcode() == TargetInstrInfo::PHI) {
+ // PHI nodes use the operand in the predecessor block, not the block with
+ // the PHI.
+ UseBlock = UseInst->getOperand(I.getOperandNo()+1).getMBB();
+ }
+ // Check that it dominates.
+ if (!DT->dominates(MBB, UseBlock))
+ return false;
+ }
+ return true;
+}
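+// PHI operand layout, for reference: in "%res = PHI %a, <bb#1>, %b, <bb#2>"
+// each incoming register is paired with its predecessor block, so a use at
+// operand index k has its block at operand k+1; hence the I.getOperandNo()+1
+// above.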
+
+
+
+bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
+ DOUT << "******** Machine Sinking ********\n";
+
+ CurMF = &MF;
+ TM = &CurMF->getTarget();
+ TII = TM->getInstrInfo();
+ RegInfo = &CurMF->getRegInfo();
+ DT = &getAnalysis<MachineDominatorTree>();
+
+ bool EverMadeChange = false;
+
+ while (1) {
+ bool MadeChange = false;
+
+ // Process all basic blocks.
+ for (MachineFunction::iterator I = CurMF->begin(), E = CurMF->end();
+ I != E; ++I)
+ MadeChange |= ProcessBlock(*I);
+
+ // If this iteration over the code changed anything, keep iterating.
+ if (!MadeChange) break;
+ EverMadeChange = true;
+ }
+ return EverMadeChange;
+}
+
+bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
+ // Can't sink anything out of a block that has fewer than two successors.
+ if (MBB.succ_size() <= 1 || MBB.empty()) return false;
+
+ bool MadeChange = false;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ MachineBasicBlock::iterator I = MBB.end();
+ --I;
+ bool ProcessedBegin, SawStore = false;
+ do {
+ MachineInstr *MI = I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == MBB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ if (SinkInstruction(MI, SawStore))
+ ++NumSunk, MadeChange = true;
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ return MadeChange;
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified machine
+/// instruction out of its current block into a successor.
+bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
+ // Check if it's safe to move the instruction.
+ if (!MI->isSafeToMove(TII, SawStore))
+ return false;
+
+ // FIXME: This should include support for sinking instructions within the
+ // block they are currently in to shorten the live ranges. We often get
+ // instructions sunk into the top of a large block, but it would be better to
+ // also sink them down before their first use in the block. This xform has to
+ // be careful not to *increase* register pressure though, e.g. sinking
+ // "x = y + z" down when it kills y and z would increase the live ranges of y
+ // and z while only shrinking the live range of x.
+
+ // Loop over all the operands of the specified instruction. If there is
+ // anything we can't handle, bail out.
+ MachineBasicBlock *ParentBlock = MI->getParent();
+
+ // SuccToSinkTo - This is the successor to sink this instruction to, once we
+ // decide.
+ MachineBasicBlock *SuccToSinkTo = 0;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue; // Ignore non-register operands.
+
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // If this is a physical register use, we can't move it. If it is a def,
+ // we can move it, but only if the def is dead.
+ if (MO.isUse() || !MO.isDead())
+ return false;
+ } else {
+ // Virtual register uses are always safe to sink.
+ if (MO.isUse()) continue;
+
+ // If it's not safe to move defs of the register class, then abort.
+ if (!TII->isSafeToMoveRegClassDefs(RegInfo->getRegClass(Reg)))
+ return false;
+
+ // FIXME: This picks a successor to sink into based on having one
+ // successor that dominates all the uses. However, there are cases where
+ // sinking can happen but where the sink point isn't a successor. For
+ // example:
+ // x = computation
+ // if () {} else {}
+ // use x
+ // the instruction could be sunk over the whole diamond for the
+ // if/then/else (or loop, etc), allowing it to be sunk into other blocks
+ // after that.
+
+ // Virtual register defs can only be sunk if all their uses are in blocks
+ // dominated by one of the successors.
+ if (SuccToSinkTo) {
+ // If a previous operand picked a block to sink to, then this operand
+ // must be sinkable to the same block.
+ if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo))
+ return false;
+ continue;
+ }
+
+ // Otherwise, we should look at all the successors and decide which one
+ // we should sink to.
+ for (MachineBasicBlock::succ_iterator SI = ParentBlock->succ_begin(),
+ E = ParentBlock->succ_end(); SI != E; ++SI) {
+ if (AllUsesDominatedByBlock(Reg, *SI)) {
+ SuccToSinkTo = *SI;
+ break;
+ }
+ }
+
+ // If we couldn't find a block to sink to, ignore this instruction.
+ if (SuccToSinkTo == 0)
+ return false;
+ }
+ }
+
+ // If there are no outputs, it must have side-effects.
+ if (SuccToSinkTo == 0)
+ return false;
+
+ // It's not safe to sink instructions to an EH landing pad. Control flow
+ // into a landing pad is implicitly defined.
+ if (SuccToSinkTo->isLandingPad())
+ return false;
+
+ // It is not possible to sink an instruction into its own block. This can
+ // happen with loops.
+ if (MI->getParent() == SuccToSinkTo)
+ return false;
+
+ DEBUG(cerr << "Sink instr " << *MI);
+ DEBUG(cerr << "to block " << *SuccToSinkTo);
+
+ // If the block has multiple predecessors, this would introduce computation on
+ // a path where it doesn't already exist. We could split the critical edge,
+ // but for now we just punt.
+ // FIXME: Split critical edges if not backedges.
+ if (SuccToSinkTo->pred_size() > 1) {
+ DEBUG(cerr << " *** PUNTING: Critical edge found\n");
+ return false;
+ }
+
+ // Determine where to insert. Skip PHI nodes.
+ MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin();
+ while (InsertPos != SuccToSinkTo->end() &&
+ InsertPos->getOpcode() == TargetInstrInfo::PHI)
+ ++InsertPos;
+
+ // Move the instruction.
+ SuccToSinkTo->splice(InsertPos, ParentBlock, MI,
+ ++MachineBasicBlock::iterator(MI));
+ return true;
+}
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
new file mode 100644
index 0000000..be1396c
--- /dev/null
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -0,0 +1,690 @@
+//===-- MachineVerifier.cpp - Machine Code Verifier -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass to verify generated machine code. The following is checked:
+//
+// Operand counts: All explicit operands must be present.
+//
+// Register classes: All physical and virtual register operands must be
+// compatible with the register class required by the instruction descriptor.
+//
+// Register live intervals: Registers must be defined only once, and must be
+// defined before use.
+//
+// The machine code verifier is enabled from LLVMTargetMachine.cpp with the
+// command-line option -verify-machineinstrs, or by defining the environment
+// variable LLVM_VERIFY_MACHINEINSTRS to the name of a file that will receive
+// the verifier errors.
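+//
+// For illustration only (the flag and variable names above come from this
+// file; the llc invocations themselves are assumptions, not part of this
+// pass):
+//
+//   llc -verify-machineinstrs foo.bc
+//   LLVM_VERIFY_MACHINEINSTRS=verify.log llc foo.bc
+//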
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <fstream>
+
+using namespace llvm;
+
+namespace {
+ struct VISIBILITY_HIDDEN MachineVerifier : public MachineFunctionPass {
+ static char ID; // Pass ID, replacement for typeid
+
+ MachineVerifier(bool allowDoubleDefs = false) :
+ MachineFunctionPass(&ID),
+ allowVirtDoubleDefs(allowDoubleDefs),
+ allowPhysDoubleDefs(allowDoubleDefs),
+ OutFileName(getenv("LLVM_VERIFY_MACHINEINSTRS"))
+ {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ const bool allowVirtDoubleDefs;
+ const bool allowPhysDoubleDefs;
+
+ const char *const OutFileName;
+ std::ostream *OS;
+ const MachineFunction *MF;
+ const TargetMachine *TM;
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+
+ unsigned foundErrors;
+
+ typedef SmallVector<unsigned, 16> RegVector;
+ typedef DenseSet<unsigned> RegSet;
+ typedef DenseMap<unsigned, const MachineInstr*> RegMap;
+
+ BitVector regsReserved;
+ RegSet regsLive;
+ RegVector regsDefined, regsImpDefined, regsDead, regsKilled;
+
+ // Add Reg and any sub-registers to RV
+ void addRegWithSubRegs(RegVector &RV, unsigned Reg) {
+ RV.push_back(Reg);
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ for (const unsigned *R = TRI->getSubRegisters(Reg); *R; R++)
+ RV.push_back(*R);
+ }
+
+ // Does RS contain any super-registers of Reg?
+ bool anySuperRegisters(const RegSet &RS, unsigned Reg) {
+ for (const unsigned *R = TRI->getSuperRegisters(Reg); *R; R++)
+ if (RS.count(*R))
+ return true;
+ return false;
+ }
+
+ struct BBInfo {
+ // Is this MBB reachable from the MF entry point?
+ bool reachable;
+
+ // Vregs that must be live in because they are used without being
+ // defined. Map value is the user.
+ RegMap vregsLiveIn;
+
+ // Vregs that must be dead in because they are defined without being
+ // killed first. Map value is the defining instruction.
+ RegMap vregsDeadIn;
+
+ // Regs killed in MBB. They may be defined again, and will then be in both
+ // regsKilled and regsLiveOut.
+ RegSet regsKilled;
+
+ // Regs defined in MBB and live out. Note that vregs passing through may
+ // be live out without being mentioned here.
+ RegSet regsLiveOut;
+
+ // Vregs that pass through MBB untouched. This set is disjoint from
+ // regsKilled and regsLiveOut.
+ RegSet vregsPassed;
+
+ BBInfo() : reachable(false) {}
+
+ // Add register to vregsPassed if it belongs there. Return true if
+ // anything changed.
+ bool addPassed(unsigned Reg) {
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return false;
+ if (regsKilled.count(Reg) || regsLiveOut.count(Reg))
+ return false;
+ return vregsPassed.insert(Reg).second;
+ }
+
+ // Same for a full set.
+ bool addPassed(const RegSet &RS) {
+ bool changed = false;
+ for (RegSet::const_iterator I = RS.begin(), E = RS.end(); I != E; ++I)
+ if (addPassed(*I))
+ changed = true;
+ return changed;
+ }
+
+ // Live-out registers are either in regsLiveOut or vregsPassed.
+ bool isLiveOut(unsigned Reg) const {
+ return regsLiveOut.count(Reg) || vregsPassed.count(Reg);
+ }
+ };
+
+ // Extra register info per MBB.
+ DenseMap<const MachineBasicBlock*, BBInfo> MBBInfoMap;
+
+ bool isReserved(unsigned Reg) {
+ return Reg < regsReserved.size() && regsReserved[Reg];
+ }
+
+ void visitMachineFunctionBefore();
+ void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB);
+ void visitMachineInstrBefore(const MachineInstr *MI);
+ void visitMachineOperand(const MachineOperand *MO, unsigned MONum);
+ void visitMachineInstrAfter(const MachineInstr *MI);
+ void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB);
+ void visitMachineFunctionAfter();
+
+ void report(const char *msg, const MachineFunction *MF);
+ void report(const char *msg, const MachineBasicBlock *MBB);
+ void report(const char *msg, const MachineInstr *MI);
+ void report(const char *msg, const MachineOperand *MO, unsigned MONum);
+
+ void markReachable(const MachineBasicBlock *MBB);
+ void calcMaxRegsPassed();
+ void calcMinRegsPassed();
+ void checkPHIOps(const MachineBasicBlock *MBB);
+ };
+}
+
+char MachineVerifier::ID = 0;
+static RegisterPass<MachineVerifier>
+MachineVer("machineverifier", "Verify generated machine code");
+static const PassInfo *const MachineVerifyID = &MachineVer;
+
+FunctionPass *
+llvm::createMachineVerifierPass(bool allowPhysDoubleDefs)
+{
+ return new MachineVerifier(allowPhysDoubleDefs);
+}
+
+bool
+MachineVerifier::runOnMachineFunction(MachineFunction &MF)
+{
+ std::ofstream OutFile;
+ if (OutFileName) {
+ OutFile.open(OutFileName, std::ios::out | std::ios::app);
+ OS = &OutFile;
+ } else {
+ OS = cerr.stream();
+ }
+
+ foundErrors = 0;
+
+ this->MF = &MF;
+ TM = &MF.getTarget();
+ TRI = TM->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ visitMachineFunctionBefore();
+ for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end();
+ MFI!=MFE; ++MFI) {
+ visitMachineBasicBlockBefore(MFI);
+ for (MachineBasicBlock::const_iterator MBBI = MFI->begin(),
+ MBBE = MFI->end(); MBBI != MBBE; ++MBBI) {
+ visitMachineInstrBefore(MBBI);
+ for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I)
+ visitMachineOperand(&MBBI->getOperand(I), I);
+ visitMachineInstrAfter(MBBI);
+ }
+ visitMachineBasicBlockAfter(MFI);
+ }
+ visitMachineFunctionAfter();
+
+ if (OutFileName)
+ OutFile.close();
+
+ if (foundErrors) {
+ cerr << "\nStopping with " << foundErrors << " machine code errors.\n";
+ exit(1);
+ }
+
+ return false; // no changes
+}
+
+void
+MachineVerifier::report(const char *msg, const MachineFunction *MF)
+{
+ assert(MF);
+ *OS << "\n";
+ if (!foundErrors++)
+ MF->print(OS);
+ *OS << "*** Bad machine code: " << msg << " ***\n"
+ << "- function: " << MF->getFunction()->getName() << "\n";
+}
+
+void
+MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB)
+{
+ assert(MBB);
+ report(msg, MBB->getParent());
+ *OS << "- basic block: " << MBB->getBasicBlock()->getName()
+ << " " << (void*)MBB
+ << " (#" << MBB->getNumber() << ")\n";
+}
+
+void
+MachineVerifier::report(const char *msg, const MachineInstr *MI)
+{
+ assert(MI);
+ report(msg, MI->getParent());
+ *OS << "- instruction: ";
+ MI->print(OS, TM);
+}
+
+void
+MachineVerifier::report(const char *msg,
+ const MachineOperand *MO, unsigned MONum)
+{
+ assert(MO);
+ report(msg, MO->getParent());
+ *OS << "- operand " << MONum << ": ";
+ MO->print(*OS, TM);
+ *OS << "\n";
+}
+
+void
+MachineVerifier::markReachable(const MachineBasicBlock *MBB)
+{
+ BBInfo &MInfo = MBBInfoMap[MBB];
+ if (!MInfo.reachable) {
+ MInfo.reachable = true;
+ for (MachineBasicBlock::const_succ_iterator SuI = MBB->succ_begin(),
+ SuE = MBB->succ_end(); SuI != SuE; ++SuI)
+ markReachable(*SuI);
+ }
+}
+
+void
+MachineVerifier::visitMachineFunctionBefore()
+{
+ regsReserved = TRI->getReservedRegs(*MF);
+ markReachable(&MF->front());
+}
+
+void
+MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB)
+{
+ regsLive.clear();
+ for (MachineBasicBlock::const_livein_iterator I = MBB->livein_begin(),
+ E = MBB->livein_end(); I != E; ++I) {
+ if (!TargetRegisterInfo::isPhysicalRegister(*I)) {
+ report("MBB live-in list contains non-physical register", MBB);
+ continue;
+ }
+ regsLive.insert(*I);
+ for (const unsigned *R = TRI->getSubRegisters(*I); *R; R++)
+ regsLive.insert(*R);
+ }
+ regsKilled.clear();
+ regsDefined.clear();
+ regsImpDefined.clear();
+}
+
+void
+MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI)
+{
+ const TargetInstrDesc &TI = MI->getDesc();
+ if (MI->getNumExplicitOperands() < TI.getNumOperands()) {
+ report("Too few operands", MI);
+ *OS << TI.getNumOperands() << " operands expected, but "
+ << MI->getNumExplicitOperands() << " given.\n";
+ }
+ if (!TI.isVariadic()) {
+ if (MI->getNumExplicitOperands() > TI.getNumOperands()) {
+ report("Too many operands", MI);
+ *OS << TI.getNumOperands() << " operands expected, but "
+ << MI->getNumExplicitOperands() << " given.\n";
+ }
+ }
+}
+
+void
+MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum)
+{
+ const MachineInstr *MI = MO->getParent();
+ const TargetInstrDesc &TI = MI->getDesc();
+
+ // The first TI.NumDefs operands must be explicit register defines
+ if (MONum < TI.getNumDefs()) {
+ if (!MO->isReg())
+ report("Explicit definition must be a register", MO, MONum);
+ else if (!MO->isDef())
+ report("Explicit definition marked as use", MO, MONum);
+ else if (MO->isImplicit())
+ report("Explicit definition marked as implicit", MO, MONum);
+ }
+
+ switch (MO->getType()) {
+ case MachineOperand::MO_Register: {
+ const unsigned Reg = MO->getReg();
+ if (!Reg)
+ return;
+
+ // Check Live Variables.
+ if (MO->isUse()) {
+ if (MO->isKill()) {
+ addRegWithSubRegs(regsKilled, Reg);
+ } else {
+ // A two-address instruction modifying a reg is treated as kill+def.
+ unsigned defIdx;
+ if (MI->isRegTiedToDefOperand(MONum, &defIdx) &&
+ MI->getOperand(defIdx).getReg() == Reg)
+ addRegWithSubRegs(regsKilled, Reg);
+ }
+ // Explicit use of a dead register.
+ if (!MO->isImplicit() && !regsLive.count(Reg)) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // Reserved registers may be used even when 'dead'.
+ if (!isReserved(Reg))
+ report("Using an undefined physical register", MO, MONum);
+ } else {
+ BBInfo &MInfo = MBBInfoMap[MI->getParent()];
+ // We don't know which virtual registers are live in, so only complain
+ // if vreg was killed in this MBB. Otherwise keep track of vregs that
+ // must be live in. PHI instructions are handled separately.
+ if (MInfo.regsKilled.count(Reg))
+ report("Using a killed virtual register", MO, MONum);
+ else if (MI->getOpcode() != TargetInstrInfo::PHI)
+ MInfo.vregsLiveIn.insert(std::make_pair(Reg, MI));
+ }
+ }
+ } else {
+ // Register defined.
+ // TODO: verify that earlyclobber ops are not used.
+ if (MO->isImplicit())
+ addRegWithSubRegs(regsImpDefined, Reg);
+ else
+ addRegWithSubRegs(regsDefined, Reg);
+
+ if (MO->isDead())
+ addRegWithSubRegs(regsDead, Reg);
+ }
+
+ // Check register classes.
+ if (MONum < TI.getNumOperands() && !MO->isImplicit()) {
+ const TargetOperandInfo &TOI = TI.OpInfo[MONum];
+ unsigned SubIdx = MO->getSubReg();
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ unsigned sr = Reg;
+ if (SubIdx) {
+ unsigned s = TRI->getSubReg(Reg, SubIdx);
+ if (!s) {
+ report("Invalid subregister index for physical register",
+ MO, MONum);
+ return;
+ }
+ sr = s;
+ }
+ if (TOI.RegClass) {
+ const TargetRegisterClass *DRC = TRI->getRegClass(TOI.RegClass);
+ if (!DRC->contains(sr)) {
+ report("Illegal physical register for instruction", MO, MONum);
+ *OS << TRI->getName(sr) << " is not a "
+ << DRC->getName() << " register.\n";
+ }
+ }
+ } else {
+ // Virtual register.
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ if (SubIdx) {
+ if (RC->subregclasses_begin()+SubIdx >= RC->subregclasses_end()) {
+ report("Invalid subregister index for virtual register", MO, MONum);
+ return;
+ }
+ RC = *(RC->subregclasses_begin()+SubIdx);
+ }
+ if (TOI.RegClass) {
+ const TargetRegisterClass *DRC = TRI->getRegClass(TOI.RegClass);
+ if (RC != DRC && !RC->hasSuperClass(DRC)) {
+ report("Illegal virtual register for instruction", MO, MONum);
+ *OS << "Expected a " << DRC->getName() << " register, but got a "
+ << RC->getName() << " register\n";
+ }
+ }
+ }
+ }
+ break;
+ }
+ // Can PHI instrs refer to MBBs not in the CFG? X86 and ARM do.
+ // case MachineOperand::MO_MachineBasicBlock:
+ // if (MI->getOpcode() == TargetInstrInfo::PHI) {
+ // if (!MO->getMBB()->isSuccessor(MI->getParent()))
+ // report("PHI operand is not in the CFG", MO, MONum);
+ // }
+ // break;
+ default:
+ break;
+ }
+}
+
+void
+MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI)
+{
+ BBInfo &MInfo = MBBInfoMap[MI->getParent()];
+ set_union(MInfo.regsKilled, regsKilled);
+ set_subtract(regsLive, regsKilled);
+ regsKilled.clear();
+
+ for (RegVector::const_iterator I = regsDefined.begin(),
+ E = regsDefined.end(); I != E; ++I) {
+ if (regsLive.count(*I)) {
+ if (TargetRegisterInfo::isPhysicalRegister(*I)) {
+ // We allow double defines to physical registers with live
+ // super-registers.
+ if (!allowPhysDoubleDefs && !isReserved(*I) &&
+ !anySuperRegisters(regsLive, *I)) {
+ report("Redefining a live physical register", MI);
+ *OS << "Register " << TRI->getName(*I)
+ << " was defined but already live.\n";
+ }
+ } else {
+ if (!allowVirtDoubleDefs) {
+ report("Redefining a live virtual register", MI);
+ *OS << "Virtual register %reg" << *I
+ << " was defined but already live.\n";
+ }
+ }
+ } else if (TargetRegisterInfo::isVirtualRegister(*I) &&
+ !MInfo.regsKilled.count(*I)) {
+ // A virtual register defined without being killed first must be dead on
+ // entry.
+ MInfo.vregsDeadIn.insert(std::make_pair(*I, MI));
+ }
+ }
+
+ set_union(regsLive, regsDefined); regsDefined.clear();
+ set_union(regsLive, regsImpDefined); regsImpDefined.clear();
+ set_subtract(regsLive, regsDead); regsDead.clear();
+}
+
+void
+MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB)
+{
+ MBBInfoMap[MBB].regsLiveOut = regsLive;
+ regsLive.clear();
+}
+
+// Calculate the largest possible vregsPassed sets. These are the registers that
+// can pass through an MBB live, but are not necessarily live on every path.
+// It is assumed that all vregsPassed sets are empty before the call.
+void
+MachineVerifier::calcMaxRegsPassed()
+{
+ // First push live-out regs to successors' vregsPassed. Remember the MBBs that
+ // have any vregsPassed.
+ DenseSet<const MachineBasicBlock*> todo;
+ for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+ MFI != MFE; ++MFI) {
+ const MachineBasicBlock &MBB(*MFI);
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+ if (!MInfo.reachable)
+ continue;
+ for (MachineBasicBlock::const_succ_iterator SuI = MBB.succ_begin(),
+ SuE = MBB.succ_end(); SuI != SuE; ++SuI) {
+ BBInfo &SInfo = MBBInfoMap[*SuI];
+ if (SInfo.addPassed(MInfo.regsLiveOut))
+ todo.insert(*SuI);
+ }
+ }
+
+ // Iteratively push vregsPassed to successors. This will converge to the same
+ // final state regardless of DenseSet iteration order.
+ while (!todo.empty()) {
+ const MachineBasicBlock *MBB = *todo.begin();
+ todo.erase(MBB);
+ BBInfo &MInfo = MBBInfoMap[MBB];
+ for (MachineBasicBlock::const_succ_iterator SuI = MBB->succ_begin(),
+ SuE = MBB->succ_end(); SuI != SuE; ++SuI) {
+ if (*SuI == MBB)
+ continue;
+ BBInfo &SInfo = MBBInfoMap[*SuI];
+ if (SInfo.addPassed(MInfo.vregsPassed))
+ todo.insert(*SuI);
+ }
+ }
+}
+
+// Calculate the minimum vregsPassed set. These are the registers that always
+// pass live through an MBB. The calculation assumes that calcMaxRegsPassed has
+// been called earlier.
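+//
+// Informally (a restatement of the two passes, not new behaviour):
+// calcMaxRegsPassed grows vregsPassed as a union over predecessors (registers
+// that *may* pass through), while this function shrinks it toward an
+// intersection (registers that *must* pass through) by deleting any register
+// that some reachable predecessor does not carry live-out.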
+void
+MachineVerifier::calcMinRegsPassed()
+{
+ DenseSet<const MachineBasicBlock*> todo;
+ for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+ MFI != MFE; ++MFI)
+ todo.insert(MFI);
+
+ while (!todo.empty()) {
+ const MachineBasicBlock *MBB = *todo.begin();
+ todo.erase(MBB);
+ BBInfo &MInfo = MBBInfoMap[MBB];
+
+ // Remove entries from vregsPassed that are not live out from all
+ // reachable predecessors.
+ RegSet dead;
+ for (RegSet::iterator I = MInfo.vregsPassed.begin(),
+ E = MInfo.vregsPassed.end(); I != E; ++I) {
+ for (MachineBasicBlock::const_pred_iterator PrI = MBB->pred_begin(),
+ PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
+ BBInfo &PrInfo = MBBInfoMap[*PrI];
+ if (PrInfo.reachable && !PrInfo.isLiveOut(*I)) {
+ dead.insert(*I);
+ break;
+ }
+ }
+ }
+ // If any regs removed, we need to recheck successors.
+ if (!dead.empty()) {
+ set_subtract(MInfo.vregsPassed, dead);
+ todo.insert(MBB->succ_begin(), MBB->succ_end());
+ }
+ }
+}
+
+// Check PHI instructions at the beginning of MBB. It is assumed that
+// calcMinRegsPassed has been run so BBInfo::isLiveOut is valid.
+void
+MachineVerifier::checkPHIOps(const MachineBasicBlock *MBB)
+{
+ for (MachineBasicBlock::const_iterator BBI = MBB->begin(), BBE = MBB->end();
+ BBI != BBE && BBI->getOpcode() == TargetInstrInfo::PHI; ++BBI) {
+ DenseSet<const MachineBasicBlock*> seen;
+
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
+ unsigned Reg = BBI->getOperand(i).getReg();
+ const MachineBasicBlock *Pre = BBI->getOperand(i + 1).getMBB();
+ if (!Pre->isSuccessor(MBB))
+ continue;
+ seen.insert(Pre);
+ BBInfo &PrInfo = MBBInfoMap[Pre];
+ if (PrInfo.reachable && !PrInfo.isLiveOut(Reg))
+ report("PHI operand is not live-out from predecessor",
+ &BBI->getOperand(i), i);
+ }
+
+ // Did we see all predecessors?
+ for (MachineBasicBlock::const_pred_iterator PrI = MBB->pred_begin(),
+ PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
+ if (!seen.count(*PrI)) {
+ report("Missing PHI operand", BBI);
+ *OS << "MBB #" << (*PrI)->getNumber()
+ << " is a predecessor according to the CFG.\n";
+ }
+ }
+ }
+}
+
+void
+MachineVerifier::visitMachineFunctionAfter()
+{
+ calcMaxRegsPassed();
+
+ // With the maximal set of vregsPassed we can verify dead-in registers.
+ for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+ MFI != MFE; ++MFI) {
+ BBInfo &MInfo = MBBInfoMap[MFI];
+
+ // Skip unreachable MBBs.
+ if (!MInfo.reachable)
+ continue;
+
+ for (MachineBasicBlock::const_pred_iterator PrI = MFI->pred_begin(),
+ PrE = MFI->pred_end(); PrI != PrE; ++PrI) {
+ BBInfo &PrInfo = MBBInfoMap[*PrI];
+ if (!PrInfo.reachable)
+ continue;
+
+ // Verify physical live-ins. EH landing pads have magic live-ins so we
+ // ignore them.
+ if (!MFI->isLandingPad()) {
+ for (MachineBasicBlock::const_livein_iterator I = MFI->livein_begin(),
+ E = MFI->livein_end(); I != E; ++I) {
+ if (TargetRegisterInfo::isPhysicalRegister(*I) &&
+ !isReserved (*I) && !PrInfo.isLiveOut(*I)) {
+ report("Live-in physical register is not live-out from predecessor",
+ MFI);
+ *OS << "Register " << TRI->getName(*I)
+ << " is not live-out from MBB #" << (*PrI)->getNumber()
+ << ".\n";
+ }
+ }
+ }
+
+
+ // Verify dead-in virtual registers.
+ if (!allowVirtDoubleDefs) {
+ for (RegMap::iterator I = MInfo.vregsDeadIn.begin(),
+ E = MInfo.vregsDeadIn.end(); I != E; ++I) {
+ // A dead-in register must be in neither regsLiveOut nor vregsPassed of
+ // any predecessor.
+ if (PrInfo.isLiveOut(I->first)) {
+ report("Live-in virtual register redefined", I->second);
+ *OS << "Register %reg" << I->first
+ << " was live-out from predecessor MBB #"
+ << (*PrI)->getNumber() << ".\n";
+ }
+ }
+ }
+ }
+ }
+
+ calcMinRegsPassed();
+
+ // With the minimal set of vregsPassed we can verify live-in virtual
+ // registers, including PHI instructions.
+ for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+ MFI != MFE; ++MFI) {
+ BBInfo &MInfo = MBBInfoMap[MFI];
+
+ // Skip unreachable MBBs.
+ if (!MInfo.reachable)
+ continue;
+
+ checkPHIOps(MFI);
+
+ for (MachineBasicBlock::const_pred_iterator PrI = MFI->pred_begin(),
+ PrE = MFI->pred_end(); PrI != PrE; ++PrI) {
+ BBInfo &PrInfo = MBBInfoMap[*PrI];
+ if (!PrInfo.reachable)
+ continue;
+
+ for (RegMap::iterator I = MInfo.vregsLiveIn.begin(),
+ E = MInfo.vregsLiveIn.end(); I != E; ++I) {
+ if (!PrInfo.isLiveOut(I->first)) {
+ report("Used virtual register is not live-in", I->second);
+ *OS << "Register %reg" << I->first
+ << " is not live-out from predecessor MBB #"
+ << (*PrI)->getNumber()
+ << ".\n";
+ }
+ }
+ }
+ }
+}
diff --git a/lib/CodeGen/Makefile b/lib/CodeGen/Makefile
new file mode 100644
index 0000000..4ab3e3c
--- /dev/null
+++ b/lib/CodeGen/Makefile
@@ -0,0 +1,22 @@
+##===- lib/CodeGen/Makefile --------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMCodeGen
+PARALLEL_DIRS = SelectionDAG AsmPrinter
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
+# Xcode prior to 2.4 generates an error in -pedantic mode with use of HUGE_VAL
+# in this directory. Disable -pedantic for this broken compiler.
+ifneq ($(HUGE_VAL_SANITY),yes)
+CompileCommonOpts := $(filter-out -pedantic, $(CompileCommonOpts))
+endif
+
diff --git a/lib/CodeGen/OcamlGC.cpp b/lib/CodeGen/OcamlGC.cpp
new file mode 100644
index 0000000..f7bc9f3
--- /dev/null
+++ b/lib/CodeGen/OcamlGC.cpp
@@ -0,0 +1,38 @@
+//===-- OcamlGC.cpp - Ocaml frametable GC strategy ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements lowering for the llvm.gc* intrinsics compatible with
+// Objective Caml 3.10.0, which uses a liveness-accurate static stack map.
+//
+// The frametable emitter is in OcamlGCPrinter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/Support/Compiler.h"
+
+using namespace llvm;
+
+namespace {
+ class VISIBILITY_HIDDEN OcamlGC : public GCStrategy {
+ public:
+ OcamlGC();
+ };
+}
+
+static GCRegistry::Add<OcamlGC>
+X("ocaml", "ocaml 3.10-compatible GC");
+
+void llvm::linkOcamlGC() { }
+
+OcamlGC::OcamlGC() {
+ NeededSafePoints = 1 << GC::PostCall;
+ UsesMetadata = true;
+}
diff --git a/lib/CodeGen/PBQP.cpp b/lib/CodeGen/PBQP.cpp
new file mode 100644
index 0000000..562300f
--- /dev/null
+++ b/lib/CodeGen/PBQP.cpp
@@ -0,0 +1,1395 @@
+//===---------------- PBQP.cpp --------- PBQP Solver ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Developed by: Bernhard Scholz
+// The University of Sydney
+// http://www.it.usyd.edu.au/~scholz
+//===----------------------------------------------------------------------===//
+
+#include "PBQP.h"
+#include "llvm/Config/alloca.h"
+#include <limits>
+#include <cassert>
+#include <cstring>
+
+namespace llvm {
+
+/**************************************************************************
+ * Data Structures
+ **************************************************************************/
+
+/* edge of PBQP graph */
+typedef struct adjnode {
+ struct adjnode *prev, /* doubly chained list */
+ *succ,
+ *reverse; /* reverse edge */
+ int adj; /* adj. node */
+ PBQPMatrix *costs; /* cost matrix of edge */
+
+ bool tc_valid; /* flag whether following fields are valid */
+ int *tc_safe_regs; /* safe registers */
+ int tc_impact; /* impact */
+} adjnode;
+
+/* bucket node */
+typedef struct bucketnode {
+ struct bucketnode *prev; /* doubly chained list */
+ struct bucketnode *succ;
+ int u; /* node */
+} bucketnode;
+
+/* data structure of partitioned boolean quadratic problem */
+struct pbqp {
+ int num_nodes; /* number of nodes */
+ int max_deg; /* maximal degree of a node */
+ bool solved; /* flag that indicates whether PBQP has been solved yet */
+ bool optimal; /* flag that indicates whether PBQP is optimal */
+ PBQPNum min;
+ bool changed; /* flag whether graph has changed in simplification */
+
+ /* node fields */
+ PBQPVector **node_costs; /* cost vectors of nodes */
+ int *node_deg; /* node degree of nodes */
+ int *solution; /* solution for node */
+ adjnode **adj_list; /* adj. list */
+ bucketnode **bucket_ptr; /* bucket pointer of a node */
+
+ /* node stack */
+ int *stack; /* stack of nodes */
+ int stack_ptr; /* stack pointer */
+
+ /* bucket fields */
+ bucketnode **bucket_list; /* bucket list */
+
+ int num_r0; /* counters for number statistics */
+ int num_ri;
+ int num_rii;
+ int num_rn;
+ int num_rn_special;
+};
+
+bool isInf(PBQPNum n) { return n == std::numeric_limits<PBQPNum>::infinity(); }
+
+/*****************************************************************************
+ * allocation/de-allocation of pbqp problem
+ ****************************************************************************/
+
+/* allocate new partitioned boolean quadratic program problem */
+pbqp *alloc_pbqp(int num_nodes)
+{
+ pbqp *this_;
+ int u;
+
+ assert(num_nodes > 0);
+
+ /* allocate memory for pbqp data structure */
+ this_ = (pbqp *)malloc(sizeof(pbqp));
+ assert(this_ != NULL);
+
+ /* Initialize pbqp fields */
+ this_->num_nodes = num_nodes;
+ this_->solved = false;
+ this_->optimal = true;
+ this_->min = 0.0;
+ this_->max_deg = 0;
+ this_->changed = false;
+ this_->num_r0 = 0;
+ this_->num_ri = 0;
+ this_->num_rii = 0;
+ this_->num_rn = 0;
+ this_->num_rn_special = 0;
+
+ /* initialize/allocate stack fields of pbqp */
+ this_->stack = (int *) malloc(sizeof(int)*num_nodes);
+ this_->stack_ptr = 0;
+
+ /* initialize/allocate node fields of pbqp */
+ this_->adj_list = (adjnode **) malloc(sizeof(adjnode *)*num_nodes);
+ this_->node_deg = (int *) malloc(sizeof(int)*num_nodes);
+ this_->solution = (int *) malloc(sizeof(int)*num_nodes);
+ this_->bucket_ptr = (bucketnode **) malloc(sizeof(bucketnode *)*num_nodes);
+ this_->node_costs = (PBQPVector**) malloc(sizeof(PBQPVector*) * num_nodes);
+ for(u=0;u<num_nodes;u++) {
+ this_->solution[u]=-1;
+ this_->adj_list[u]=NULL;
+ this_->node_deg[u]=0;
+ this_->bucket_ptr[u]=NULL;
+ this_->node_costs[u]=NULL;
+ }
+
+ /* initialize bucket list */
+ this_->bucket_list = NULL;
+
+ return this_;
+}
+
+/* free pbqp problem */
+void free_pbqp(pbqp *this_)
+{
+ int u;
+ int deg;
+ adjnode *adj_ptr,*adj_next;
+ bucketnode *bucket,*bucket_next;
+
+ assert(this_ != NULL);
+
+ /* free node cost fields */
+ for(u=0;u < this_->num_nodes;u++) {
+ delete this_->node_costs[u];
+ }
+ free(this_->node_costs);
+
+ /* free bucket list */
+ for(deg=0;deg<=this_->max_deg;deg++) {
+ for(bucket=this_->bucket_list[deg];bucket!=NULL;bucket=bucket_next) {
+ this_->bucket_ptr[bucket->u] = NULL;
+ bucket_next = bucket-> succ;
+ free(bucket);
+ }
+ }
+ free(this_->bucket_list);
+
+ /* free adj. list */
+ assert(this_->adj_list != NULL);
+ for(u=0;u < this_->num_nodes; u++) {
+ for(adj_ptr = this_->adj_list[u]; adj_ptr != NULL; adj_ptr = adj_next) {
+ adj_next = adj_ptr -> succ;
+ if (u < adj_ptr->adj) {
+ assert(adj_ptr != NULL);
+ delete adj_ptr->costs;
+ }
+ if (adj_ptr -> tc_safe_regs != NULL) {
+ free(adj_ptr -> tc_safe_regs);
+ }
+ free(adj_ptr);
+ }
+ }
+ free(this_->adj_list);
+
+ /* free other node fields */
+ free(this_->node_deg);
+ free(this_->solution);
+ free(this_->bucket_ptr);
+
+ /* free stack */
+ free(this_->stack);
+
+ /* free pbqp data structure itself */
+ free(this_);
+}
+
+
+/****************************************************************************
+ * adj. node routines
+ ****************************************************************************/
+
+/* find data structure of adj. node of a given node */
+static
+adjnode *find_adjnode(pbqp *this_,int u,int v)
+{
+ adjnode *adj_ptr;
+
+ assert (this_ != NULL);
+ assert (u >= 0 && u < this_->num_nodes);
+ assert (v >= 0 && v < this_->num_nodes);
+ assert(this_->adj_list != NULL);
+
+ for(adj_ptr = this_ -> adj_list[u];adj_ptr != NULL; adj_ptr = adj_ptr -> succ) {
+ if (adj_ptr->adj == v) {
+ return adj_ptr;
+ }
+ }
+ return NULL;
+}
+
+/* allocate a new data structure for adj. node */
+static
+adjnode *alloc_adjnode(pbqp *this_,int u, PBQPMatrix *costs)
+{
+ adjnode *p;
+
+ assert(this_ != NULL);
+ assert(costs != NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+
+ p = (adjnode *)malloc(sizeof(adjnode));
+ assert(p != NULL);
+
+ p->adj = u;
+ p->costs = costs;
+
+ p->tc_valid= false;
+ p->tc_safe_regs = NULL;
+ p->tc_impact = 0;
+
+ return p;
+}
+
+/* insert an adjacency node into the adj. list */
+static
+void insert_adjnode(pbqp *this_, int u, adjnode *adj_ptr)
+{
+
+ assert(this_ != NULL);
+ assert(adj_ptr != NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+
+ /* if the adjacency list of the node is not empty, update the
+ first node of the list */
+ if (this_ -> adj_list[u] != NULL) {
+ assert(this_->adj_list[u]->prev == NULL);
+ this_->adj_list[u] -> prev = adj_ptr;
+ }
+
+ /* update doubly chained list pointers of pointers */
+ adj_ptr -> succ = this_->adj_list[u];
+ adj_ptr -> prev = NULL;
+
+ /* update adjacency list pointer of node u */
+ this_->adj_list[u] = adj_ptr;
+}
+
+/* remove entry in an adj. list */
+static
+void remove_adjnode(pbqp *this_, int u, adjnode *adj_ptr)
+{
+ assert(this_!= NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+ assert(this_->adj_list != NULL);
+ assert(adj_ptr != NULL);
+
+ if (adj_ptr -> prev == NULL) {
+ this_->adj_list[u] = adj_ptr -> succ;
+ } else {
+ adj_ptr -> prev -> succ = adj_ptr -> succ;
+ }
+
+ if (adj_ptr -> succ != NULL) {
+ adj_ptr -> succ -> prev = adj_ptr -> prev;
+ }
+
+ if(adj_ptr->reverse != NULL) {
+ adjnode *rev = adj_ptr->reverse;
+ rev->reverse = NULL;
+ }
+
+ if (adj_ptr -> tc_safe_regs != NULL) {
+ free(adj_ptr -> tc_safe_regs);
+ }
+
+ free(adj_ptr);
+}
+
+/*****************************************************************************
+ * node functions
+ ****************************************************************************/
+
+/* get degree of a node */
+static
+int get_deg(pbqp *this_,int u)
+{
+ adjnode *adj_ptr;
+ int deg = 0;
+
+ assert(this_ != NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+ assert(this_->adj_list != NULL);
+
+ for(adj_ptr = this_ -> adj_list[u];adj_ptr != NULL; adj_ptr = adj_ptr -> succ) {
+ deg ++;
+ }
+ return deg;
+}
+
+/* reinsert node */
+static
+void reinsert_node(pbqp *this_,int u)
+{
+ adjnode *adj_u,
+ *adj_v;
+
+ assert(this_!= NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+ assert(this_->adj_list != NULL);
+
+ for(adj_u = this_ -> adj_list[u]; adj_u != NULL; adj_u = adj_u -> succ) {
+ int v = adj_u -> adj;
+ adj_v = alloc_adjnode(this_,u,adj_u->costs);
+ insert_adjnode(this_,v,adj_v);
+ }
+}
+
+/* remove node */
+static
+void remove_node(pbqp *this_,int u)
+{
+ adjnode *adj_ptr;
+
+ assert(this_!= NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+ assert(this_->adj_list != NULL);
+
+ for(adj_ptr = this_ -> adj_list[u]; adj_ptr != NULL; adj_ptr = adj_ptr -> succ) {
+ remove_adjnode(this_,adj_ptr->adj,adj_ptr -> reverse);
+ }
+}
+
+/*****************************************************************************
+ * edge functions
+ ****************************************************************************/
+
+/* insert edge to graph */
+/* (does not check whether the edge already exists in the graph) */
+static
+void insert_edge(pbqp *this_, int u, int v, PBQPMatrix *costs)
+{
+ adjnode *adj_u,
+ *adj_v;
+
+ /* create adjacency entry for u */
+ adj_u = alloc_adjnode(this_,v,costs);
+ insert_adjnode(this_,u,adj_u);
+
+
+ /* create adjacency entry for v */
+ adj_v = alloc_adjnode(this_,u,costs);
+ insert_adjnode(this_,v,adj_v);
+
+ /* create link for reverse edge */
+ adj_u -> reverse = adj_v;
+ adj_v -> reverse = adj_u;
+}
+
+/* delete edge */
+static
+void delete_edge(pbqp *this_,int u,int v)
+{
+ adjnode *adj_ptr;
+ adjnode *rev;
+
+ assert(this_ != NULL);
+ assert( u >= 0 && u < this_->num_nodes);
+ assert( v >= 0 && v < this_->num_nodes);
+
+ adj_ptr=find_adjnode(this_,u,v);
+ assert(adj_ptr != NULL);
+ assert(adj_ptr->reverse != NULL);
+
+ delete adj_ptr -> costs;
+
+ rev = adj_ptr->reverse;
+ remove_adjnode(this_,u,adj_ptr);
+ remove_adjnode(this_,v,rev);
+}
+
+/*****************************************************************************
+ * cost functions
+ ****************************************************************************/
+
+/* Note: Since cost(u,v) = transpose(cost(v,u)), it would be necessary to store
+ two matrices for both edges (u,v) and (v,u). However, we only store the
+ matrix for the case u < v. For the other case we transpose the stored matrix
+ if required.
+*/
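+
+/* Illustrative example (numbers invented for exposition): if the stored
+   matrix for the edge (u,v) with u < v is
+
+       cost(u,v) = | 1 2 3 |
+                   | 4 5 6 |
+
+   then a query for cost(v,u) is answered with its transpose
+
+                   | 1 4 |
+       cost(v,u) = | 2 5 |
+                   | 3 6 |
+
+   so only one matrix per edge has to be stored. */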
+
+/* add costs to cost vector of a node */
+void add_pbqp_nodecosts(pbqp *this_,int u, PBQPVector *costs)
+{
+ assert(this_ != NULL);
+ assert(costs != NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+
+ if (!this_->node_costs[u]) {
+ this_->node_costs[u] = new PBQPVector(*costs);
+ } else {
+ *this_->node_costs[u] += *costs;
+ }
+}
+
+/* get cost matrix ptr */
+static
+PBQPMatrix *get_costmatrix_ptr(pbqp *this_, int u, int v)
+{
+ adjnode *adj_ptr;
+ PBQPMatrix *m = NULL;
+
+ assert (this_ != NULL);
+ assert (u >= 0 && u < this_->num_nodes);
+ assert (v >= 0 && v < this_->num_nodes);
+
+ adj_ptr = find_adjnode(this_,u,v);
+
+ if (adj_ptr != NULL) {
+ m = adj_ptr -> costs;
+ }
+
+ return m;
+}
+
+/* get cost matrix ptr */
+/* Note: only the pointer is returned for
+ cost(u,v), if u < v.
+*/
+static
+PBQPMatrix *pbqp_get_costmatrix(pbqp *this_, int u, int v)
+{
+ adjnode *adj_ptr = find_adjnode(this_,u,v);
+
+ if (adj_ptr != NULL) {
+ if ( u < v) {
+ return new PBQPMatrix(*adj_ptr->costs);
+ } else {
+ return new PBQPMatrix(adj_ptr->costs->transpose());
+ }
+ } else {
+ return NULL;
+ }
+}
+
+/* add costs to cost matrix of an edge */
+void add_pbqp_edgecosts(pbqp *this_,int u,int v, PBQPMatrix *costs)
+{
+ PBQPMatrix *adj_costs;
+
+ assert(this_!= NULL);
+ assert(costs != NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+ assert(v >= 0 && v < this_->num_nodes);
+
+ /* does the edge u-v exist? */
+ if (u == v) {
+ PBQPVector *diag = new PBQPVector(costs->diagonalize());
+ add_pbqp_nodecosts(this_,v,diag);
+ delete diag;
+ } else if ((adj_costs = get_costmatrix_ptr(this_,u,v))!=NULL) {
+ if ( u < v) {
+ *adj_costs += *costs;
+ } else {
+ *adj_costs += costs->transpose();
+ }
+ } else {
+ adj_costs = new PBQPMatrix((u < v) ? *costs : costs->transpose());
+ insert_edge(this_,u,v,adj_costs);
+ }
+}
+
+/* remove bucket from bucket list */
+static
+void pbqp_remove_bucket(pbqp *this_, bucketnode *bucket)
+{
+ int u = bucket->u;
+
+ assert(this_ != NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+ assert(this_->bucket_list != NULL);
+ assert(this_->bucket_ptr[u] != NULL);
+
+ /* update predecessor node in bucket list
+ (if no preceding bucket exists, then
+ the bucket_list pointer needs to be
+ updated.)
+ */
+ if (bucket->prev != NULL) {
+ bucket->prev-> succ = bucket->succ;
+ } else {
+ this_->bucket_list[this_->node_deg[u]] = bucket -> succ;
+ }
+
+ /* update successor node in bucket list */
+ if (bucket->succ != NULL) {
+ bucket->succ-> prev = bucket->prev;
+ }
+}
+
+/**********************************************************************************
+ * pop functions
+ **********************************************************************************/
+
+/* pop node of given degree */
+static
+int pop_node(pbqp *this_,int deg)
+{
+ bucketnode *bucket;
+ int u;
+
+ assert(this_ != NULL);
+ assert(deg >= 0 && deg <= this_->max_deg);
+ assert(this_->bucket_list != NULL);
+
+ /* get first bucket of bucket list */
+ bucket = this_->bucket_list[deg];
+ assert(bucket != NULL);
+
+ /* remove bucket */
+ pbqp_remove_bucket(this_,bucket);
+ u = bucket->u;
+ free(bucket);
+ return u;
+}
+
+/**********************************************************************************
+ * reorder functions
+ **********************************************************************************/
+
+/* add bucket to bucketlist */
+static
+void add_to_bucketlist(pbqp *this_,bucketnode *bucket, int deg)
+{
+ bucketnode *old_head;
+
+ assert(bucket != NULL);
+ assert(this_ != NULL);
+ assert(deg >= 0 && deg <= this_->max_deg);
+ assert(this_->bucket_list != NULL);
+
+ /* store node degree (for re-ordering purposes)*/
+ this_->node_deg[bucket->u] = deg;
+
+ /* put bucket to front of doubly chained list */
+ old_head = this_->bucket_list[deg];
+ bucket -> prev = NULL;
+ bucket -> succ = old_head;
+ this_ -> bucket_list[deg] = bucket;
+ if (bucket -> succ != NULL ) {
+ assert ( old_head -> prev == NULL);
+ old_head -> prev = bucket;
+ }
+}
+
+
+/* reorder node in bucket list according to
+ current node degree */
+static
+void reorder_node(pbqp *this_, int u)
+{
+ int deg;
+
+ assert(this_ != NULL);
+ assert(u>= 0 && u < this_->num_nodes);
+ assert(this_->bucket_list != NULL);
+ assert(this_->bucket_ptr[u] != NULL);
+
+ /* get current node degree */
+ deg = get_deg(this_,u);
+
+ /* remove bucket from old bucket list only
+ if degree of node has changed. */
+ if (deg != this_->node_deg[u]) {
+ pbqp_remove_bucket(this_,this_->bucket_ptr[u]);
+ add_to_bucketlist(this_,this_->bucket_ptr[u],deg);
+ }
+}
+
+/* reorder adj. nodes of a node */
+static
+void reorder_adjnodes(pbqp *this_,int u)
+{
+ adjnode *adj_ptr;
+
+ assert(this_!= NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+ assert(this_->adj_list != NULL);
+
+ for(adj_ptr = this_ -> adj_list[u]; adj_ptr != NULL; adj_ptr = adj_ptr -> succ) {
+ reorder_node(this_,adj_ptr->adj);
+ }
+}
+
+/**********************************************************************************
+ * creation functions
+ **********************************************************************************/
+
+/* create new bucket entry */
+/* consistency of the bucket list is not checked! */
+static
+void create_bucket(pbqp *this_,int u,int deg)
+{
+ bucketnode *bucket;
+
+ assert(this_ != NULL);
+ assert(u >= 0 && u < this_->num_nodes);
+ assert(this_->bucket_list != NULL);
+
+ bucket = (bucketnode *)malloc(sizeof(bucketnode));
+ assert(bucket != NULL);
+
+ bucket -> u = u;
+ this_->bucket_ptr[u] = bucket;
+
+ add_to_bucketlist(this_,bucket,deg);
+}
+
+/* create bucket list */
+static
+void create_bucketlist(pbqp *this_)
+{
+ int u;
+ int max_deg;
+ int deg;
+
+ assert(this_ != NULL);
+ assert(this_->bucket_list == NULL);
+
+ /* determine max. degree of the nodes */
+ max_deg = 2; /* at least of degree two! */
+ for(u=0;u<this_->num_nodes;u++) {
+ deg = this_->node_deg[u] = get_deg(this_,u);
+ if (deg > max_deg) {
+ max_deg = deg;
+ }
+ }
+ this_->max_deg = max_deg;
+
+ /* allocate bucket list */
+ this_ -> bucket_list = (bucketnode **)malloc(sizeof(bucketnode *)*(max_deg + 1));
+ assert(this_->bucket_list != NULL);
+ memset(this_->bucket_list,0,sizeof(bucketnode *)*(max_deg + 1));
+
+ /* insert nodes to the list */
+ for(u=0;u<this_->num_nodes;u++) {
+ create_bucket(this_,u,this_->node_deg[u]);
+ }
+}
+
+/*****************************************************************************
+ * PBQP simplification for trivial nodes
+ ****************************************************************************/
+
+/* remove trivial node with cost vector length of one */
+static
+void disconnect_trivialnode(pbqp *this_,int u)
+{
+ int v;
+ adjnode *adj_ptr,
+ *next;
+ PBQPMatrix *c_uv;
+ PBQPVector *c_v;
+
+ assert(this_ != NULL);
+ assert(this_->node_costs != NULL);
+ assert(u >= 0 && u < this_ -> num_nodes);
+ assert(this_->node_costs[u]->getLength() == 1);
+
+ /* add edge costs to node costs of adj. nodes */
+ for(adj_ptr = this_->adj_list[u]; adj_ptr != NULL; adj_ptr = next){
+ next = adj_ptr -> succ;
+ v = adj_ptr -> adj;
+ assert(v >= 0 && v < this_ -> num_nodes);
+
+ /* convert matrix to cost vector offset for adj. node */
+ c_uv = pbqp_get_costmatrix(this_,u,v);
+ c_v = new PBQPVector(c_uv->getRowAsVector(0));
+ *this_->node_costs[v] += *c_v;
+
+ /* delete edge & free vec/mat */
+ delete c_v;
+ delete c_uv;
+ delete_edge(this_,u,v);
+ }
+}
+
+/* find all trivial nodes and disconnect them */
+static
+void eliminate_trivial_nodes(pbqp *this_)
+{
+ int u;
+
+ assert(this_ != NULL);
+ assert(this_ -> node_costs != NULL);
+
+ for(u=0;u < this_ -> num_nodes; u++) {
+ if (this_->node_costs[u]->getLength() == 1) {
+ disconnect_trivialnode(this_,u);
+ }
+ }
+}
+
+/*****************************************************************************
+ * Normal form for PBQP
+ ****************************************************************************/
+
+/* Normalize a cost matrix. If the matrix
+ is independent, then normalize_matrix
+ returns true - otherwise false. The offset
+ values of the decomposition are stored in
+ the vectors u and v.
+*/
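+
+/* Worked example (numbers invented for exposition): for
+
+       m = | 3 4 |   the row minima 3 and 5 move into u, leaving | 0 1 |
+           | 5 7 |                                               | 0 2 |
+
+   then the column minima 0 and 1 move into v, leaving | 0 0 |
+                                                       | 0 1 |
+
+   which is not the zero matrix, so this m is not independent and the
+   edge carrying it cannot be deleted. */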
+
+static
+bool normalize_matrix(PBQPMatrix *m, PBQPVector *u, PBQPVector *v)
+{
+ assert( m != NULL);
+ assert( u != NULL);
+ assert( v != NULL);
+ assert( u->getLength() > 0);
+ assert( v->getLength() > 0);
+
+ assert(m->getRows() == u->getLength());
+ assert(m->getCols() == v->getLength());
+
+ /* determine u vector */
+ for(unsigned r = 0; r < m->getRows(); ++r) {
+ PBQPNum min = m->getRowMin(r);
+ (*u)[r] += min;
+ if (!isInf(min)) {
+ m->subFromRow(r, min);
+ } else {
+ m->setRow(r, 0);
+ }
+ }
+
+ /* determine v vector */
+ for(unsigned c = 0; c < m->getCols(); ++c) {
+ PBQPNum min = m->getColMin(c);
+ (*v)[c] += min;
+ if (!isInf(min)) {
+ m->subFromCol(c, min);
+ } else {
+ m->setCol(c, 0);
+ }
+ }
+
+ /* determine whether matrix is
+ independent or not.
+ */
+ return m->isZero();
+}
+
+/* simplify single edge */
+static
+void simplify_edge(pbqp *this_,int u,int v)
+{
+ PBQPMatrix *costs;
+ bool is_zero;
+
+ assert (this_ != NULL);
+ assert (u >= 0 && u <this_->num_nodes);
+ assert (v >= 0 && v <this_->num_nodes);
+ assert (u != v);
+
+ /* swap u and v if u > v in order to avoid unnecessary
+ transpositions of the cost matrix */
+
+ if (u > v) {
+ int swap = u;
+ u = v;
+ v = swap;
+ }
+
+ /* get cost matrix and simplify it */
+ costs = get_costmatrix_ptr(this_,u,v);
+ is_zero=normalize_matrix(costs,this_->node_costs[u],this_->node_costs[v]);
+
+ /* delete edge */
+ if(is_zero){
+ delete_edge(this_,u,v);
+ this_->changed = true;
+ }
+}
+
+/* normalize cost matrices and remove
+ edges in PBQP if they are independent,
+ i.e. can be decomposed into two
+ cost vectors.
+*/
+static
+void eliminate_independent_edges(pbqp *this_)
+{
+ int u,v;
+ adjnode *adj_ptr,*next;
+
+ assert(this_ != NULL);
+ assert(this_ -> adj_list != NULL);
+
+ this_->changed = false;
+ for(u=0;u < this_->num_nodes;u++) {
+ for (adj_ptr = this_ -> adj_list[u]; adj_ptr != NULL; adj_ptr = next) {
+ next = adj_ptr -> succ;
+ v = adj_ptr -> adj;
+ assert(v >= 0 && v < this_->num_nodes);
+ if (u < v) {
+ simplify_edge(this_,u,v);
+ }
+ }
+ }
+}
+
+
+/*****************************************************************************
+ * PBQP reduction rules
+ ****************************************************************************/
+
+/* RI reduction
+ This reduction rule is applied for nodes
+ of degree one. */
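+
+/* In symbols (a restatement of the loop below, not a change to it): with y
+   the sole neighbour of x,
+
+       delta[i] = min over j of ( c_yx[i][j] + c_x[j] )
+
+   is added to c_y, after which x can safely be removed from the graph. */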
+
+static
+void apply_RI(pbqp *this_,int x)
+{
+ int y;
+ unsigned xlen,
+ ylen;
+ PBQPMatrix *c_yx;
+ PBQPVector *c_x, *delta;
+
+ assert(this_ != NULL);
+ assert(x >= 0 && x < this_->num_nodes);
+ assert(this_ -> adj_list[x] != NULL);
+ assert(this_ -> adj_list[x] -> succ == NULL);
+
+ /* get the adjacent node */
+ y = this_ -> adj_list[x] -> adj;
+ assert(y >= 0 && y < this_->num_nodes);
+
+ /* determine lengths of the cost vectors for nodes x and y */
+ xlen = this_ -> node_costs[x]->getLength();
+ ylen = this_ -> node_costs[y]->getLength();
+
+ /* get cost vector c_x and matrix c_yx */
+ c_x = this_ -> node_costs[x];
+ c_yx = pbqp_get_costmatrix(this_,y,x);
+ assert (c_yx != NULL);
+
+
+ /* allocate delta vector */
+ delta = new PBQPVector(ylen);
+
+ /* compute delta vector */
+ for(unsigned i = 0; i < ylen; ++i) {
+ PBQPNum min = (*c_yx)[i][0] + (*c_x)[0];
+ for(unsigned j = 1; j < xlen; ++j) {
+ PBQPNum c = (*c_yx)[i][j] + (*c_x)[j];
+ if ( c < min )
+ min = c;
+ }
+ (*delta)[i] = min;
+ }
+
+ /* add delta vector */
+ *this_ -> node_costs[y] += *delta;
+
+ /* delete node x */
+ remove_node(this_,x);
+
+ /* reorder adj. nodes of node x */
+ reorder_adjnodes(this_,x);
+
+ /* push node x on stack */
+ assert(this_ -> stack_ptr < this_ -> num_nodes);
+ this_->stack[this_ -> stack_ptr++] = x;
+
+ /* free vec/mat */
+ delete c_yx;
+ delete delta;
+
+ /* increment counter for number statistic */
+ this_->num_ri++;
+}
+
+/* RII reduction
+ This reduction rule is applied for nodes
+ of degree two. */
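+
+/* In symbols (a restatement of the loops below): with y and z the two
+   neighbours of x,
+
+       delta[i][j] = min over k of ( c_yx[i][k] + c_zx[j][k] + c_x[k] )
+
+   is added to the edge cost matrix of (y,z), after which x can be removed. */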
+
+static
+void apply_RII(pbqp *this_,int x)
+{
+ int y,z;
+ unsigned xlen,ylen,zlen;
+ adjnode *adj_yz;
+
+ PBQPMatrix *c_yx, *c_zx;
+ PBQPVector *cx;
+ PBQPMatrix *delta;
+
+ assert(this_ != NULL);
+ assert(x >= 0 && x < this_->num_nodes);
+ assert(this_ -> adj_list[x] != NULL);
+ assert(this_ -> adj_list[x] -> succ != NULL);
+ assert(this_ -> adj_list[x] -> succ -> succ == NULL);
+
+ /* get the adjacent nodes */
+ y = this_ -> adj_list[x] -> adj;
+ z = this_ -> adj_list[x] -> succ -> adj;
+ assert(y >= 0 && y < this_->num_nodes);
+ assert(z >= 0 && z < this_->num_nodes);
+
+ /* determine lengths of the cost vectors for nodes x, y, and z */
+ xlen = this_ -> node_costs[x]->getLength();
+ ylen = this_ -> node_costs[y]->getLength();
+ zlen = this_ -> node_costs[z]->getLength();
+
+ /* get cost vector c_x and matrices c_yx, c_zx */
+ cx = this_ -> node_costs[x];
+ c_yx = pbqp_get_costmatrix(this_,y,x);
+ c_zx = pbqp_get_costmatrix(this_,z,x);
+ assert(c_yx != NULL);
+ assert(c_zx != NULL);
+
+ /* Colour Heuristic */
+ if ( (adj_yz = find_adjnode(this_,y,z)) != NULL) {
+ adj_yz->tc_valid = false;
+ adj_yz->reverse->tc_valid = false;
+ }
+
+ /* allocate delta matrix */
+ delta = new PBQPMatrix(ylen, zlen);
+
+ /* compute delta matrix */
+ for(unsigned i=0;i<ylen;i++) {
+ for(unsigned j=0;j<zlen;j++) {
+ PBQPNum min = (*c_yx)[i][0] + (*c_zx)[j][0] + (*cx)[0];
+ for(unsigned k=1;k<xlen;k++) {
+ PBQPNum c = (*c_yx)[i][k] + (*c_zx)[j][k] + (*cx)[k];
+ if ( c < min ) {
+ min = c;
+ }
+ }
+ (*delta)[i][j] = min;
+ }
+ }
+
+ /* add delta matrix */
+ add_pbqp_edgecosts(this_,y,z,delta);
+
+ /* delete node x */
+ remove_node(this_,x);
+
+ /* simplify cost matrix c_yz */
+ simplify_edge(this_,y,z);
+
+ /* reorder adj. nodes */
+ reorder_adjnodes(this_,x);
+
+ /* push node x on stack */
+ assert(this_ -> stack_ptr < this_ -> num_nodes);
+ this_->stack[this_ -> stack_ptr++] = x;
+
+ /* free vec/mat */
+ delete c_yx;
+ delete c_zx;
+ delete delta;
+
+ /* increment counter for number statistic */
+ this_->num_rii++;
+
+}
+
+/* RN reduction */
+static
+void apply_RN(pbqp *this_,int x)
+{
+ unsigned xlen;
+
+ assert(this_ != NULL);
+ assert(x >= 0 && x < this_->num_nodes);
+ assert(this_ -> node_costs[x] != NULL);
+
+ xlen = this_ -> node_costs[x] -> getLength();
+
+ /* after application of RN rule no optimality
+ can be guaranteed! */
+ this_ -> optimal = false;
+
+ /* push node x on stack */
+ assert(this_ -> stack_ptr < this_ -> num_nodes);
+ this_->stack[this_ -> stack_ptr++] = x;
+
+ /* delete node x */
+ remove_node(this_,x);
+
+ /* reorder adj. nodes of node x */
+ reorder_adjnodes(this_,x);
+
+ /* increment counter for number statistic */
+ this_->num_rn++;
+}
+
+
+static
+void compute_tc_info(pbqp *this_, adjnode *p)
+{
+ adjnode *r;
+ PBQPMatrix *m;
+ int x,y;
+ PBQPVector *c_x, *c_y;
+ int *row_inf_counts;
+
+ assert(p->reverse != NULL);
+
+ /* set flags */
+ r = p->reverse;
+ p->tc_valid = true;
+ r->tc_valid = true;
+
+ /* get edge */
+ x = r->adj;
+ y = p->adj;
+
+ /* get cost vectors */
+ c_x = this_ -> node_costs[x];
+ c_y = this_ -> node_costs[y];
+
+ /* get cost matrix */
+ m = pbqp_get_costmatrix(this_, x, y);
+
+
+ /* allocate allowed set for edge (x,y) and (y,x) */
+ if (p->tc_safe_regs == NULL) {
+ p->tc_safe_regs = (int *) malloc(sizeof(int) * c_x->getLength());
+ }
+
+ if (r->tc_safe_regs == NULL ) {
+ r->tc_safe_regs = (int *) malloc(sizeof(int) * c_y->getLength());
+ }
+
+ p->tc_impact = r->tc_impact = 0;
+
+ row_inf_counts = (int *) alloca(sizeof(int) * c_x->getLength());
+
+ /* init arrays */
+ p->tc_safe_regs[0] = 0;
+ row_inf_counts[0] = 0;
+ for(unsigned i = 1; i < c_x->getLength(); ++i){
+ p->tc_safe_regs[i] = 1;
+ row_inf_counts[i] = 0;
+ }
+
+ r->tc_safe_regs[0] = 0;
+ for(unsigned j = 1; j < c_y->getLength(); ++j){
+ r->tc_safe_regs[j] = 1;
+ }
+
+ for(unsigned j = 0; j < c_y->getLength(); ++j) {
+ int col_inf_counts = 0;
+ for (unsigned i = 0; i < c_x->getLength(); ++i) {
+ if (isInf((*m)[i][j])) {
+ ++col_inf_counts;
+ ++row_inf_counts[i];
+
+ p->tc_safe_regs[i] = 0;
+ r->tc_safe_regs[j] = 0;
+ }
+ }
+ if (col_inf_counts > p->tc_impact) {
+ p->tc_impact = col_inf_counts;
+ }
+ }
+
+ for(unsigned i = 0; i < c_x->getLength(); ++i){
+ if (row_inf_counts[i] > r->tc_impact)
+ {
+ r->tc_impact = row_inf_counts[i];
+ }
+ }
+
+ delete m;
+}
+
+/*
+ * Checks whether node x can be locally coloured.
+ */
+static
+int is_colorable(pbqp *this_,int x)
+{
+ adjnode *adj_ptr;
+ PBQPVector *c_x;
+ int result = 1;
+ int *allowed;
+ int num_allowed = 0;
+ unsigned total_impact = 0;
+
+ assert(this_ != NULL);
+ assert(x >= 0 && x < this_->num_nodes);
+ assert(this_ -> node_costs[x] != NULL);
+
+ c_x = this_ -> node_costs[x];
+
+ /* allocate allowed set */
+ allowed = (int *)malloc(sizeof(int) * c_x->getLength());
+ for(unsigned i = 0; i < c_x->getLength(); ++i){
+ if (!isInf((*c_x)[i]) && i > 0) {
+ allowed[i] = 1;
+ ++num_allowed;
+ } else {
+ allowed[i] = 0;
+ }
+ }
+
+ /* determine local minimum */
+ for(adj_ptr=this_->adj_list[x] ;adj_ptr != NULL; adj_ptr = adj_ptr -> succ) {
+ if (!adj_ptr -> tc_valid) {
+ compute_tc_info(this_, adj_ptr);
+ }
+
+ total_impact += adj_ptr->tc_impact;
+
+ if (num_allowed > 0) {
+ for (unsigned i = 1; i < c_x->getLength(); ++i){
+ if (allowed[i]){
+ if (!adj_ptr->tc_safe_regs[i]){
+ allowed[i] = 0;
+ --num_allowed;
+ if (num_allowed == 0)
+ break;
+ }
+ }
+ }
+ }
+
+ if ( total_impact >= c_x->getLength() - 1 && num_allowed == 0 ) {
+ result = 0;
+ break;
+ }
+ }
+ free(allowed);
+
+ return result;
+}
+
+/* use the Briggs heuristic
+ note: this is not a general heuristic; it is only useful for
+ interference graphs.
+ */
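+/* (Descriptive note restating the selection below: the first pass returns
+   any node of degree > 2 that is still guaranteed locally colourable;
+   failing that, the second pass picks the node minimising the ratio of its
+   cost at index 0, (*node_costs[u])[0], to its degree.) */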
+int pop_colorablenode(pbqp *this_)
+{
+ int deg;
+ bucketnode *min_bucket=NULL;
+ PBQPNum min = std::numeric_limits<PBQPNum>::infinity();
+
+ /* select node where the number of colors is less than the node degree */
+ for(deg=this_->max_deg;deg > 2;deg--) {
+ bucketnode *bucket;
+ for(bucket=this_->bucket_list[deg];bucket!= NULL;bucket = bucket -> succ) {
+ int u = bucket->u;
+ if (is_colorable(this_,u)) {
+ pbqp_remove_bucket(this_,bucket);
+ this_->num_rn_special++;
+ free(bucket);
+ return u;
+ }
+ }
+ }
+
+ /* select node with minimal ratio between average node costs and degree of node */
+ for(deg=this_->max_deg;deg >2; deg--) {
+ bucketnode *bucket;
+ for(bucket=this_->bucket_list[deg];bucket!= NULL;bucket = bucket -> succ) {
+ PBQPNum h;
+ int u;
+
+ u = bucket->u;
+ assert(u>=0 && u < this_->num_nodes);
+ h = (*this_->node_costs[u])[0] / (PBQPNum) deg;
+ if (h < min) {
+ min_bucket = bucket;
+ min = h;
+ }
+ }
+ }
+
+ /* return node and free bucket */
+ if (min_bucket != NULL) {
+ int u;
+
+ pbqp_remove_bucket(this_,min_bucket);
+ u = min_bucket->u;
+ free(min_bucket);
+ return u;
+ } else {
+ return -1;
+ }
+}
+
+
+/*****************************************************************************
+ * PBQP graph parsing
+ ****************************************************************************/
+
+/* reduce pbqp problem (first phase) */
+static
+void reduce_pbqp(pbqp *this_)
+{
+ int u;
+
+ assert(this_ != NULL);
+ assert(this_->bucket_list != NULL);
+
+ for(;;){
+
+ if (this_->bucket_list[1] != NULL) {
+ u = pop_node(this_,1);
+ apply_RI(this_,u);
+ } else if (this_->bucket_list[2] != NULL) {
+ u = pop_node(this_,2);
+ apply_RII(this_,u);
+ } else if ((u = pop_colorablenode(this_)) != -1) {
+ apply_RN(this_,u);
+ } else {
+ break;
+ }
+ }
+}
+
+/*****************************************************************************
+ * PBQP back propagation
+ ****************************************************************************/
+
+/* determine solution of a reduced node. Either
+ RI or RII was applied to this node. */
+static
+void determine_solution(pbqp *this_,int x)
+{
+ PBQPVector *v = new PBQPVector(*this_ -> node_costs[x]);
+ adjnode *adj_ptr;
+
+ assert(this_ != NULL);
+ assert(x >= 0 && x < this_->num_nodes);
+ assert(this_ -> adj_list != NULL);
+ assert(this_ -> solution != NULL);
+
+ for(adj_ptr=this_->adj_list[x] ;adj_ptr != NULL; adj_ptr = adj_ptr -> succ) {
+ int y = adj_ptr -> adj;
+ int y_sol = this_ -> solution[y];
+
+ PBQPMatrix *c_yx = pbqp_get_costmatrix(this_,y,x);
+ assert(y_sol >= 0 && y_sol < (int)this_->node_costs[y]->getLength());
+ (*v) += c_yx->getRowAsVector(y_sol);
+ delete c_yx;
+ }
+ this_ -> solution[x] = v->minIndex();
+
+ delete v;
+}
+
+/* back propagation phase of PBQP */
+static
+void back_propagate(pbqp *this_)
+{
+ int i;
+
+ assert(this_ != NULL);
+ assert(this_->stack != NULL);
+ assert(this_->stack_ptr < this_->num_nodes);
+
+ for(i=this_ -> stack_ptr-1;i>=0;i--) {
+ int x = this_ -> stack[i];
+ assert( x >= 0 && x < this_ -> num_nodes);
+ reinsert_node(this_,x);
+ determine_solution(this_,x);
+ }
+}
+
+/* solve trivial nodes of degree zero */
+static
+void determine_trivialsolution(pbqp *this_)
+{
+ int u;
+ PBQPNum delta;
+
+ assert( this_ != NULL);
+ assert( this_ -> bucket_list != NULL);
+
+ /* determine trivial solution */
+ while (this_->bucket_list[0] != NULL) {
+ u = pop_node(this_,0);
+
+ assert( u >= 0 && u < this_ -> num_nodes);
+
+ this_->solution[u] = this_->node_costs[u]->minIndex();
+ delta = (*this_->node_costs[u])[this_->solution[u]];
+ this_->min = this_->min + delta;
+
+ /* increment counter for number statistic */
+ this_->num_r0++;
+ }
+}
+
+/*****************************************************************************
+ * debug facilities
+ ****************************************************************************/
+static
+void check_pbqp(pbqp *this_)
+{
+ int u,v;
+ PBQPMatrix *costs;
+ adjnode *adj_ptr;
+
+ assert( this_ != NULL);
+
+ for(u=0;u< this_->num_nodes; u++) {
+ assert (this_ -> node_costs[u] != NULL);
+ for(adj_ptr = this_ -> adj_list[u];adj_ptr != NULL; adj_ptr = adj_ptr -> succ) {
+ v = adj_ptr -> adj;
+ assert( v>= 0 && v < this_->num_nodes);
+ if (u < v ) {
+ costs = adj_ptr -> costs;
+ assert( costs->getRows() == this_->node_costs[u]->getLength() &&
+ costs->getCols() == this_->node_costs[v]->getLength());
+ }
+ }
+ }
+}
+
+/*****************************************************************************
+ * PBQP solve routines
+ ****************************************************************************/
+
+/* solve PBQP problem */
+void solve_pbqp(pbqp *this_)
+{
+ assert(this_ != NULL);
+ assert(!this_->solved);
+
+ /* check vector & matrix dimensions */
+ check_pbqp(this_);
+
+ /* simplify PBQP problem */
+
+ /* eliminate trivial nodes, i.e.
+ nodes with cost vectors of length one. */
+ eliminate_trivial_nodes(this_);
+
+ /* eliminate edges with independent
+ cost matrices and normalize matrices */
+ eliminate_independent_edges(this_);
+
+ /* create bucket list for graph parsing */
+ create_bucketlist(this_);
+
+ /* reduce phase */
+ reduce_pbqp(this_);
+
+ /* solve trivial nodes */
+ determine_trivialsolution(this_);
+
+ /* back propagation phase */
+ back_propagate(this_);
+
+ this_->solved = true;
+}
+
+/* get solution of a node */
+int get_pbqp_solution(pbqp *this_,int x)
+{
+ assert(this_ != NULL);
+ assert(this_->solution != NULL);
+ assert(this_ -> solved);
+
+ return this_->solution[x];
+}
+
+/* is solution optimal? */
+bool is_pbqp_optimal(pbqp *this_)
+{
+ assert(this_ -> solved);
+ return this_->optimal;
+}
+
+}
+
+/* end of PBQP.cpp */
diff --git a/lib/CodeGen/PBQP.h b/lib/CodeGen/PBQP.h
new file mode 100644
index 0000000..5fd2c06
--- /dev/null
+++ b/lib/CodeGen/PBQP.h
@@ -0,0 +1,284 @@
+//===----------------- PBQP.h ----------- PBQP Solver -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Developed by: Bernhard Scholz
+// The University of Sydney
+// http://www.it.usyd.edu.au/~scholz
+//===----------------------------------------------------------------------===//
+
+// TODO:
+//
+// * Default to null costs on vector initialisation?
+// * C++-ify the rest of the solver.
+
+#ifndef LLVM_CODEGEN_PBQPSOLVER_H
+#define LLVM_CODEGEN_PBQPSOLVER_H
+
+#include <cassert>
+#include <algorithm>
+#include <functional>
+
+namespace llvm {
+
+//! \brief Floating point type to use in PBQP solver.
+typedef double PBQPNum;
+
+//! \brief PBQP Vector class.
+class PBQPVector {
+public:
+
+ //! \brief Construct a PBQP vector of the given size.
+ explicit PBQPVector(unsigned length) :
+ length(length), data(new PBQPNum[length]) {
+ std::fill(data, data + length, 0);
+ }
+
+ //! \brief Copy construct a PBQP vector.
+ PBQPVector(const PBQPVector &v) :
+ length(v.length), data(new PBQPNum[length]) {
+ std::copy(v.data, v.data + length, data);
+ }
+
+ ~PBQPVector() { delete[] data; }
+
+ //! \brief Assignment operator.
+ PBQPVector& operator=(const PBQPVector &v) {
+ delete[] data;
+ length = v.length;
+ data = new PBQPNum[length];
+ std::copy(v.data, v.data + length, data);
+ return *this;
+ }
+
+ //! \brief Return the length of the vector
+ unsigned getLength() const throw () {
+ return length;
+ }
+
+ //! \brief Element access.
+ PBQPNum& operator[](unsigned index) {
+ assert(index < length && "PBQPVector element access out of bounds.");
+ return data[index];
+ }
+
+ //! \brief Const element access.
+ const PBQPNum& operator[](unsigned index) const {
+ assert(index < length && "PBQPVector element access out of bounds.");
+ return data[index];
+ }
+
+ //! \brief Add another vector to this one.
+ PBQPVector& operator+=(const PBQPVector &v) {
+ assert(length == v.length && "PBQPVector length mismatch.");
+ std::transform(data, data + length, v.data, data, std::plus<PBQPNum>());
+ return *this;
+ }
+
+ //! \brief Subtract another vector from this one.
+ PBQPVector& operator-=(const PBQPVector &v) {
+ assert(length == v.length && "PBQPVector length mismatch.");
+ std::transform(data, data + length, v.data, data, std::minus<PBQPNum>());
+ return *this;
+ }
+
+ //! \brief Returns the index of the minimum value in this vector
+ unsigned minIndex() const {
+ return std::min_element(data, data + length) - data;
+ }
+
+private:
+ unsigned length;
+ PBQPNum *data;
+};
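+
+//! Illustrative usage sketch (not part of the imported sources; the values
+//! are invented):
+//!
+//!   PBQPVector v(3);               // (0, 0, 0)
+//!   v[1] = 5; v[2] = 2;            // (0, 5, 2)
+//!   PBQPVector w(v);
+//!   w += v;                        // (0, 10, 4)
+//!   unsigned best = v.minIndex();  // 0, the index of the smallest entry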
+
+
+//! \brief PBQP Matrix class
+class PBQPMatrix {
+public:
+
+ //! \brief Construct a PBQP Matrix with the given dimensions.
+ PBQPMatrix(unsigned rows, unsigned cols) :
+ rows(rows), cols(cols), data(new PBQPNum[rows * cols]) {
+ std::fill(data, data + (rows * cols), 0);
+ }
+
+ //! \brief Copy construct a PBQP matrix.
+ PBQPMatrix(const PBQPMatrix &m) :
+ rows(m.rows), cols(m.cols), data(new PBQPNum[rows * cols]) {
+ std::copy(m.data, m.data + (rows * cols), data);
+ }
+
+ ~PBQPMatrix() { delete[] data; }
+
+ //! \brief Assignment operator.
+ PBQPMatrix& operator=(const PBQPMatrix &m) {
+ delete[] data;
+ rows = m.rows; cols = m.cols;
+ data = new PBQPNum[rows * cols];
+ std::copy(m.data, m.data + (rows * cols), data);
+ return *this;
+ }
+
+ //! \brief Return the number of rows in this matrix.
+ unsigned getRows() const throw () { return rows; }
+
+ //! \brief Return the number of cols in this matrix.
+ unsigned getCols() const throw () { return cols; }
+
+ //! \brief Matrix element access.
+ PBQPNum* operator[](unsigned r) {
+ assert(r < rows && "Row out of bounds.");
+ return data + (r * cols);
+ }
+
+ //! \brief Matrix element access.
+ const PBQPNum* operator[](unsigned r) const {
+ assert(r < rows && "Row out of bounds.");
+ return data + (r * cols);
+ }
+
+ //! \brief Returns the given row as a vector.
+ PBQPVector getRowAsVector(unsigned r) const {
+ PBQPVector v(cols);
+ for (unsigned c = 0; c < cols; ++c)
+ v[c] = (*this)[r][c];
+ return v;
+ }
+
+ //! \brief Reset the matrix to the given value.
+ PBQPMatrix& reset(PBQPNum val = 0) {
+ std::fill(data, data + (rows * cols), val);
+ return *this;
+ }
+
+ //! \brief Set a single row of this matrix to the given value.
+ PBQPMatrix& setRow(unsigned r, PBQPNum val) {
+ assert(r < rows && "Row out of bounds.");
+ std::fill(data + (r * cols), data + ((r + 1) * cols), val);
+ return *this;
+ }
+
+ //! \brief Set a single column of this matrix to the given value.
+ PBQPMatrix& setCol(unsigned c, PBQPNum val) {
+ assert(c < cols && "Column out of bounds.");
+ for (unsigned r = 0; r < rows; ++r)
+ (*this)[r][c] = val;
+ return *this;
+ }
+
+ //! \brief Matrix transpose.
+ PBQPMatrix transpose() const {
+ PBQPMatrix m(cols, rows);
+ for (unsigned r = 0; r < rows; ++r)
+ for (unsigned c = 0; c < cols; ++c)
+ m[c][r] = (*this)[r][c];
+ return m;
+ }
+
+ //! \brief Returns the diagonal of the matrix as a vector.
+ //!
+ //! Matrix must be square.
+ PBQPVector diagonalize() const {
+ assert(rows == cols && "Attempt to diagonalize non-square matrix.");
+
+ PBQPVector v(rows);
+ for (unsigned r = 0; r < rows; ++r)
+ v[r] = (*this)[r][r];
+ return v;
+ }
+
+ //! \brief Add the given matrix to this one.
+ PBQPMatrix& operator+=(const PBQPMatrix &m) {
+ assert(rows == m.rows && cols == m.cols &&
+ "Matrix dimensions mismatch.");
+ std::transform(data, data + (rows * cols), m.data, data,
+ std::plus<PBQPNum>());
+ return *this;
+ }
+
+ //! \brief Returns the minimum of the given row
+ PBQPNum getRowMin(unsigned r) const {
+ assert(r < rows && "Row out of bounds");
+ return *std::min_element(data + (r * cols), data + ((r + 1) * cols));
+ }
+
+ //! \brief Returns the minimum of the given column
+ PBQPNum getColMin(unsigned c) const {
+ PBQPNum minElem = (*this)[0][c];
+ for (unsigned r = 1; r < rows; ++r)
+ if ((*this)[r][c] < minElem) minElem = (*this)[r][c];
+ return minElem;
+ }
+
+ //! \brief Subtracts the given scalar from the elements of the given row.
+ PBQPMatrix& subFromRow(unsigned r, PBQPNum val) {
+ assert(r < rows && "Row out of bounds");
+ std::transform(data + (r * cols), data + ((r + 1) * cols),
+ data + (r * cols),
+ std::bind2nd(std::minus<PBQPNum>(), val));
+ return *this;
+ }
+
+ //! \brief Subtracts the given scalar from the elements of the given column.
+ PBQPMatrix& subFromCol(unsigned c, PBQPNum val) {
+ for (unsigned r = 0; r < rows; ++r)
+ (*this)[r][c] -= val;
+ return *this;
+ }
+
+ //! \brief Returns true if this is a zero matrix.
+ bool isZero() const {
+    return std::find_if(data, data + (rows * cols),
+                        std::bind2nd(std::not_equal_to<PBQPNum>(), 0)) ==
+                          data + (rows * cols);
+ }
+
+private:
+ unsigned rows, cols;
+ PBQPNum *data;
+};
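+
+//! Illustrative usage sketch (not part of the imported sources; the values
+//! are invented):
+//!
+//!   PBQPMatrix m(2, 2);
+//!   m[0][0] = 1; m[0][1] = 4;           // row 0: (1, 4)
+//!   m.setRow(1, 2);                     // row 1: (2, 2)
+//!   PBQPVector r(m.getRowAsVector(0));  // (1, 4)
+//!   m.subFromRow(0, m.getRowMin(0));    // row 0 becomes (0, 3)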
+
+#define EPS (1E-8)
+
+#ifndef PBQP_TYPE
+#define PBQP_TYPE
+struct pbqp;
+typedef struct pbqp pbqp;
+#endif
+
+/*****************
+ * PBQP routines *
+ *****************/
+
+/* allocate pbqp problem */
+pbqp *alloc_pbqp(int num);
+
+/* add node costs */
+void add_pbqp_nodecosts(pbqp *this_,int u, PBQPVector *costs);
+
+/* add edge mat */
+void add_pbqp_edgecosts(pbqp *this_,int u,int v,PBQPMatrix *costs);
+
+/* solve PBQP problem */
+void solve_pbqp(pbqp *this_);
+
+/* get solution of a node */
+int get_pbqp_solution(pbqp *this_,int u);
+
+
+/* free PBQP */
+void free_pbqp(pbqp *this_);
+
+/* is optimal */
+bool is_pbqp_optimal(pbqp *this_);
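+
+/* Illustrative driver sketch (not part of the imported sources): it wires
+   the routines above together for a two-node problem with invented costs.
+   Whether the add_* calls take ownership of the heap-allocated vectors and
+   matrices is an assumption here; consult the implementation first.
+
+     pbqp *p = alloc_pbqp(2);
+     PBQPVector *c0 = new PBQPVector(2);
+     (*c0)[0] = 1; (*c0)[1] = 4;
+     add_pbqp_nodecosts(p, 0, c0);
+     PBQPVector *c1 = new PBQPVector(2);
+     (*c1)[0] = 3; (*c1)[1] = 0;
+     add_pbqp_nodecosts(p, 1, c1);
+     PBQPMatrix *e = new PBQPMatrix(2, 2);
+     (*e)[0][1] = 5; (*e)[1][0] = 5;
+     add_pbqp_edgecosts(p, 0, 1, e);
+     solve_pbqp(p);
+     int s0 = get_pbqp_solution(p, 0);  // chosen alternative for node 0
+     free_pbqp(p);
+*/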
+
+}
+#endif
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
new file mode 100644
index 0000000..c5c76fc
--- /dev/null
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -0,0 +1,431 @@
+//===-- PHIElimination.cpp - Eliminate PHI nodes by inserting copies -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates machine instruction PHI nodes by inserting copy
+// instructions. This destroys SSA information, but is the desired input for
+// some register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "phielim"
+#include "llvm/BasicBlock.h"
+#include "llvm/Instructions.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumAtomic, "Number of atomic phis lowered");
+
+namespace {
+ class VISIBILITY_HIDDEN PNE : public MachineFunctionPass {
+ MachineRegisterInfo *MRI; // Machine register information
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ PNE() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<LiveVariables>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions
+ /// in predecessor basic blocks.
+ ///
+ bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB);
+ void LowerAtomicPHINode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator AfterPHIsIt);
+
+    /// analyzePHINodes - Gather information about the PHI nodes in this
+    /// function. In particular, we want to count the number of uses of each
+    /// virtual register used in a PHI node, keyed by the BB the vreg is
+    /// coming from. This is used later to determine when the vreg is killed
+    /// in the BB.
+ ///
+ void analyzePHINodes(const MachineFunction& Fn);
+
+ // FindCopyInsertPoint - Find a safe place in MBB to insert a copy from
+ // SrcReg. This needs to be after any def or uses of SrcReg, but before
+ // any subsequent point where control flow might jump out of the basic
+ // block.
+ MachineBasicBlock::iterator FindCopyInsertPoint(MachineBasicBlock &MBB,
+ unsigned SrcReg);
+
+ // SkipPHIsAndLabels - Copies need to be inserted after phi nodes and
+ // also after any exception handling labels: in landing pads execution
+ // starts at the label, so any copies placed before it won't be executed!
+ MachineBasicBlock::iterator SkipPHIsAndLabels(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ // Rather than assuming that EH labels come before other kinds of labels,
+ // just skip all labels.
+ while (I != MBB.end() &&
+ (I->getOpcode() == TargetInstrInfo::PHI || I->isLabel()))
+ ++I;
+ return I;
+ }
+
+ typedef std::pair<const MachineBasicBlock*, unsigned> BBVRegPair;
+ typedef std::map<BBVRegPair, unsigned> VRegPHIUse;
+
+ VRegPHIUse VRegPHIUseCount;
+
+ // Defs of PHI sources which are implicit_def.
+ SmallPtrSet<MachineInstr*, 4> ImpDefs;
+ };
+}
+
+char PNE::ID = 0;
+static RegisterPass<PNE>
+X("phi-node-elimination", "Eliminate PHI nodes for register allocation");
+
+const PassInfo *const llvm::PHIEliminationID = &X;
+
+bool PNE::runOnMachineFunction(MachineFunction &Fn) {
+ MRI = &Fn.getRegInfo();
+
+ analyzePHINodes(Fn);
+
+ bool Changed = false;
+
+ // Eliminate PHI instructions by inserting copies into predecessor blocks.
+ for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I)
+ Changed |= EliminatePHINodes(Fn, *I);
+
+ // Remove dead IMPLICIT_DEF instructions.
+ for (SmallPtrSet<MachineInstr*,4>::iterator I = ImpDefs.begin(),
+ E = ImpDefs.end(); I != E; ++I) {
+ MachineInstr *DefMI = *I;
+ unsigned DefReg = DefMI->getOperand(0).getReg();
+ if (MRI->use_empty(DefReg))
+ DefMI->eraseFromParent();
+ }
+
+ ImpDefs.clear();
+ VRegPHIUseCount.clear();
+ return Changed;
+}
+
+
+/// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions in
+/// predecessor basic blocks.
+///
+bool PNE::EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB) {
+ if (MBB.empty() || MBB.front().getOpcode() != TargetInstrInfo::PHI)
+ return false; // Quick exit for basic blocks without PHIs.
+
+ // Get an iterator to the first instruction after the last PHI node (this may
+ // also be the end of the basic block).
+ MachineBasicBlock::iterator AfterPHIsIt = SkipPHIsAndLabels(MBB, MBB.begin());
+
+ while (MBB.front().getOpcode() == TargetInstrInfo::PHI)
+ LowerAtomicPHINode(MBB, AfterPHIsIt);
+
+ return true;
+}
+
+/// isSourceDefinedByImplicitDef - Return true if all sources of the phi node
+/// are implicit_def's.
+static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi,
+ const MachineRegisterInfo *MRI) {
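+  // PHI operands are laid out as the def followed by (value, predecessor
+  // MBB) pairs, so the incoming values sit at odd operand indices 1, 3, 5...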
+ for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
+ unsigned SrcReg = MPhi->getOperand(i).getReg();
+ const MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+ if (!DefMI || DefMI->getOpcode() != TargetInstrInfo::IMPLICIT_DEF)
+ return false;
+ }
+ return true;
+}
+
+// FindCopyInsertPoint - Find a safe place in MBB to insert a copy from SrcReg.
+// This needs to be after any def or uses of SrcReg, but before any subsequent
+// point where control flow might jump out of the basic block.
+MachineBasicBlock::iterator PNE::FindCopyInsertPoint(MachineBasicBlock &MBB,
+ unsigned SrcReg) {
+ // Handle the trivial case trivially.
+ if (MBB.empty())
+ return MBB.begin();
+
+ // If this basic block does not contain an invoke, then control flow always
+ // reaches the end of it, so place the copy there. The logic below works in
+ // this case too, but is more expensive.
+ if (!isa<InvokeInst>(MBB.getBasicBlock()->getTerminator()))
+ return MBB.getFirstTerminator();
+
+ // Discover any definition/uses in this basic block.
+ SmallPtrSet<MachineInstr*, 8> DefUsesInMBB;
+ for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg),
+ RE = MRI->reg_end(); RI != RE; ++RI) {
+ MachineInstr *DefUseMI = &*RI;
+ if (DefUseMI->getParent() == &MBB)
+ DefUsesInMBB.insert(DefUseMI);
+ }
+
+ MachineBasicBlock::iterator InsertPoint;
+ if (DefUsesInMBB.empty()) {
+ // No def/uses. Insert the copy at the start of the basic block.
+ InsertPoint = MBB.begin();
+ } else if (DefUsesInMBB.size() == 1) {
+ // Insert the copy immediately after the definition/use.
+ InsertPoint = *DefUsesInMBB.begin();
+ ++InsertPoint;
+ } else {
+ // Insert the copy immediately after the last definition/use.
+ InsertPoint = MBB.end();
+ while (!DefUsesInMBB.count(&*--InsertPoint)) {}
+ ++InsertPoint;
+ }
+
+ // Make sure the copy goes after any phi nodes however.
+ return SkipPHIsAndLabels(MBB, InsertPoint);
+}
+
+/// LowerAtomicPHINode - Lower the PHI node at the top of the specified block,
+/// under the assumption that it needs to be lowered in a way that supports
+/// the atomic execution semantics of PHIs. This lowering method is always
+/// correct.
+///
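+/// Illustrative sketch (block and register names invented):
+///
+///   bb2: %dst = PHI [%a, bb0], [%b, bb1]
+///
+/// becomes
+///
+///   bb0: %tmp = copy %a      (inserted into each predecessor)
+///   bb1: %tmp = copy %b
+///   bb2: %dst = copy %tmp    (inserted after any remaining PHIs)
+///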
+void PNE::LowerAtomicPHINode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator AfterPHIsIt) {
+ // Unlink the PHI node from the basic block, but don't delete the PHI yet.
+ MachineInstr *MPhi = MBB.remove(MBB.begin());
+
+ unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2;
+ unsigned DestReg = MPhi->getOperand(0).getReg();
+ bool isDead = MPhi->getOperand(0).isDead();
+
+ // Create a new register for the incoming PHI arguments.
+ MachineFunction &MF = *MBB.getParent();
+ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg);
+ unsigned IncomingReg = 0;
+
+ // Insert a register to register copy at the top of the current block (but
+ // after any remaining phi nodes) which copies the new incoming register
+ // into the phi node destination.
+ const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+ if (isSourceDefinedByImplicitDef(MPhi, MRI))
+ // If all sources of a PHI node are implicit_def, just emit an
+ // implicit_def instead of a copy.
+ BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+ TII->get(TargetInstrInfo::IMPLICIT_DEF), DestReg);
+ else {
+ IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
+ TII->copyRegToReg(MBB, AfterPHIsIt, DestReg, IncomingReg, RC, RC);
+ }
+
+ // Update live variable information if there is any.
+ LiveVariables *LV = getAnalysisIfAvailable<LiveVariables>();
+ if (LV) {
+ MachineInstr *PHICopy = prior(AfterPHIsIt);
+
+ if (IncomingReg) {
+ // Increment use count of the newly created virtual register.
+ LV->getVarInfo(IncomingReg).NumUses++;
+
+ // Add information to LiveVariables to know that the incoming value is
+ // killed. Note that because the value is defined in several places (once
+ // each for each incoming block), the "def" block and instruction fields
+ // for the VarInfo is not filled in.
+ LV->addVirtualRegisterKilled(IncomingReg, PHICopy);
+ }
+
+ // Since we are going to be deleting the PHI node, if it is the last use of
+ // any registers, or if the value itself is dead, we need to move this
+ // information over to the new copy we just inserted.
+ LV->removeVirtualRegistersKilled(MPhi);
+
+ // If the result is dead, update LV.
+ if (isDead) {
+ LV->addVirtualRegisterDead(DestReg, PHICopy);
+ LV->removeVirtualRegisterDead(DestReg, MPhi);
+ }
+ }
+
+ // Adjust the VRegPHIUseCount map to account for the removal of this PHI node.
+ for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
+ --VRegPHIUseCount[BBVRegPair(MPhi->getOperand(i + 1).getMBB(),
+ MPhi->getOperand(i).getReg())];
+
+ // Now loop over all of the incoming arguments, changing them to copy into the
+ // IncomingReg register in the corresponding predecessor basic block.
+ SmallPtrSet<MachineBasicBlock*, 8> MBBsInsertedInto;
+ for (int i = NumSrcs - 1; i >= 0; --i) {
+ unsigned SrcReg = MPhi->getOperand(i*2+1).getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+ "Machine PHI Operands must all be virtual registers!");
+
+ // If source is defined by an implicit def, there is no need to insert a
+ // copy.
+ MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+ if (DefMI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF) {
+ ImpDefs.insert(DefMI);
+ continue;
+ }
+
+    // Get the MachineBasicBlock equivalent of the BasicBlock that is the
+    // source of this PHI operand.
+ MachineBasicBlock &opBlock = *MPhi->getOperand(i*2+2).getMBB();
+
+ // Check to make sure we haven't already emitted the copy for this block.
+ // This can happen because PHI nodes may have multiple entries for the same
+ // basic block.
+ if (!MBBsInsertedInto.insert(&opBlock))
+ continue; // If the copy has already been emitted, we're done.
+
+    // Find a safe location to insert the copy; this may be the first
+    // terminator in the block (or end()).
+ MachineBasicBlock::iterator InsertPos = FindCopyInsertPoint(opBlock, SrcReg);
+
+ // Insert the copy.
+ TII->copyRegToReg(opBlock, InsertPos, IncomingReg, SrcReg, RC, RC);
+
+    // Now update live variable information if we have it; otherwise we're done.
+ if (!LV) continue;
+
+ // We want to be able to insert a kill of the register if this PHI (aka, the
+ // copy we just inserted) is the last use of the source value. Live
+ // variable analysis conservatively handles this by saying that the value is
+ // live until the end of the block the PHI entry lives in. If the value
+ // really is dead at the PHI copy, there will be no successor blocks which
+ // have the value live-in.
+ //
+ // Check to see if the copy is the last use, and if so, update the live
+ // variables information so that it knows the copy source instruction kills
+ // the incoming value.
+ LiveVariables::VarInfo &InRegVI = LV->getVarInfo(SrcReg);
+
+ // Loop over all of the successors of the basic block, checking to see if
+ // the value is either live in the block, or if it is killed in the block.
+ // Also check to see if this register is in use by another PHI node which
+ // has not yet been eliminated. If so, it will be killed at an appropriate
+ // point later.
+
+ // Is it used by any PHI instructions in this block?
+ bool ValueIsLive = VRegPHIUseCount[BBVRegPair(&opBlock, SrcReg)] != 0;
+
+ std::vector<MachineBasicBlock*> OpSuccBlocks;
+
+ // Otherwise, scan successors, including the BB the PHI node lives in.
+ for (MachineBasicBlock::succ_iterator SI = opBlock.succ_begin(),
+ E = opBlock.succ_end(); SI != E && !ValueIsLive; ++SI) {
+ MachineBasicBlock *SuccMBB = *SI;
+
+ // Is it alive in this successor?
+ unsigned SuccIdx = SuccMBB->getNumber();
+ if (InRegVI.AliveBlocks.test(SuccIdx)) {
+ ValueIsLive = true;
+ break;
+ }
+
+ OpSuccBlocks.push_back(SuccMBB);
+ }
+
+ // Check to see if this value is live because there is a use in a successor
+ // that kills it.
+ if (!ValueIsLive) {
+ switch (OpSuccBlocks.size()) {
+ case 1: {
+ MachineBasicBlock *MBB = OpSuccBlocks[0];
+ for (unsigned i = 0, e = InRegVI.Kills.size(); i != e; ++i)
+ if (InRegVI.Kills[i]->getParent() == MBB) {
+ ValueIsLive = true;
+ break;
+ }
+ break;
+ }
+ case 2: {
+ MachineBasicBlock *MBB1 = OpSuccBlocks[0], *MBB2 = OpSuccBlocks[1];
+ for (unsigned i = 0, e = InRegVI.Kills.size(); i != e; ++i)
+ if (InRegVI.Kills[i]->getParent() == MBB1 ||
+ InRegVI.Kills[i]->getParent() == MBB2) {
+ ValueIsLive = true;
+ break;
+ }
+ break;
+ }
+ default:
+ std::sort(OpSuccBlocks.begin(), OpSuccBlocks.end());
+ for (unsigned i = 0, e = InRegVI.Kills.size(); i != e; ++i)
+ if (std::binary_search(OpSuccBlocks.begin(), OpSuccBlocks.end(),
+ InRegVI.Kills[i]->getParent())) {
+ ValueIsLive = true;
+ break;
+ }
+ }
+ }
+
+ // Okay, if we now know that the value is not live out of the block, we can
+ // add a kill marker in this block saying that it kills the incoming value!
+ if (!ValueIsLive) {
+ // In our final twist, we have to decide which instruction kills the
+ // register. In most cases this is the copy, however, the first
+ // terminator instruction at the end of the block may also use the value.
+ // In this case, we should mark *it* as being the killing block, not the
+ // copy.
+ MachineBasicBlock::iterator KillInst = prior(InsertPos);
+ MachineBasicBlock::iterator Term = opBlock.getFirstTerminator();
+ if (Term != opBlock.end()) {
+ if (Term->readsRegister(SrcReg))
+ KillInst = Term;
+
+        // Check that no other terminators read the register.
+#ifndef NDEBUG
+ for (MachineBasicBlock::iterator TI = next(Term); TI != opBlock.end();
+ ++TI) {
+ assert(!TI->readsRegister(SrcReg) &&
+                 "Terminator instructions cannot use virtual registers unless "
+                 "they are the first terminator in a block!");
+ }
+#endif
+ }
+
+ // Finally, mark it killed.
+ LV->addVirtualRegisterKilled(SrcReg, KillInst);
+
+ // This vreg no longer lives all of the way through opBlock.
+ unsigned opBlockNum = opBlock.getNumber();
+ InRegVI.AliveBlocks.reset(opBlockNum);
+ }
+ }
+
+ // Really delete the PHI instruction now!
+ MF.DeleteMachineInstr(MPhi);
+ ++NumAtomic;
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes in this function.
+/// In particular, we want to count the number of uses of each virtual register
+/// used in a PHI node, keyed by the BB the vreg is coming from. This is used
+/// later to determine when the vreg is killed in the BB.
+///
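+/// For instance (illustrative), if vreg %v feeds two different PHI nodes
+/// from the same predecessor bb, VRegPHIUseCount[(bb, %v)] is 2, so the
+/// copy inserted into bb for the first PHI is not marked as killing %v:
+/// the copy inserted for the second PHI still reads it.
+///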
+void PNE::analyzePHINodes(const MachineFunction& Fn) {
+ for (MachineFunction::const_iterator I = Fn.begin(), E = Fn.end();
+ I != E; ++I)
+ for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end();
+ BBI != BBE && BBI->getOpcode() == TargetInstrInfo::PHI; ++BBI)
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+ ++VRegPHIUseCount[BBVRegPair(BBI->getOperand(i + 1).getMBB(),
+ BBI->getOperand(i).getReg())];
+}
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
new file mode 100644
index 0000000..f67eb79
--- /dev/null
+++ b/lib/CodeGen/Passes.cpp
@@ -0,0 +1,54 @@
+//===-- Passes.cpp - Target independent code generation passes ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines interfaces to access the target independent code
+// generation passes provided by the LLVM backend.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/Passes.h"
+
+using namespace llvm;
+
+//===---------------------------------------------------------------------===//
+///
+/// RegisterRegAlloc class - Track the registration of register allocators.
+///
+//===---------------------------------------------------------------------===//
+MachinePassRegistry RegisterRegAlloc::Registry;
+
+
+//===---------------------------------------------------------------------===//
+///
+/// RegAlloc command line options.
+///
+//===---------------------------------------------------------------------===//
+static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<RegisterRegAlloc> >
+RegAlloc("regalloc",
+ cl::init(&createLinearScanRegisterAllocator),
+ cl::desc("Register allocator to use: (default = linearscan)"));
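+
+// Illustrative note (not part of the imported sources): any tool that runs
+// the code generator can now pick an allocator on its command line, e.g.
+//
+//   llc -regalloc=linearscan foo.bc
+//
+// The accepted names are whatever allocators have been registered through
+// RegisterRegAlloc.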
+
+
+//===---------------------------------------------------------------------===//
+///
+/// createRegisterAllocator - choose the appropriate register allocator.
+///
+//===---------------------------------------------------------------------===//
+FunctionPass *llvm::createRegisterAllocator() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = RegAlloc;
+ RegisterRegAlloc::setDefault(RegAlloc);
+ }
+
+ return Ctor();
+}
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
new file mode 100644
index 0000000..de774685
--- /dev/null
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -0,0 +1,941 @@
+//===----- PostRASchedulerList.cpp - list scheduler -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "post-RA-sched"
+#include "ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumNoops, "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+
+static cl::opt<bool>
+EnableAntiDepBreaking("break-anti-dependencies",
+ cl::desc("Break post-RA scheduling anti-dependencies"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnablePostRAHazardAvoidance("avoid-hazards",
+ cl::desc("Enable simple hazard-avoidance"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+ class VISIBILITY_HIDDEN PostRAScheduler : public MachineFunctionPass {
+ public:
+ static char ID;
+ PostRAScheduler() : MachineFunctionPass(&ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const char *getPassName() const {
+ return "Post RA top-down list latency scheduler";
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn);
+ };
+ char PostRAScheduler::ID = 0;
+
+ class VISIBILITY_HIDDEN SchedulePostRATDList : public ScheduleDAGInstrs {
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ ///
+ LatencyPriorityQueue AvailableQueue;
+
+ /// PendingQueue - This contains all of the instructions whose operands have
+ /// been issued, but their results are not ready yet (due to the latency of
+    /// the operation). Once the operands become available, the instruction is
+ /// added to the AvailableQueue.
+ std::vector<SUnit*> PendingQueue;
+
+ /// Topo - A topological ordering for SUnits.
+ ScheduleDAGTopologicalSort Topo;
+
+ /// AllocatableSet - The set of allocatable registers.
+ /// We'll be ignoring anti-dependencies on non-allocatable registers,
+ /// because they may not be safe to break.
+ const BitVector AllocatableSet;
+
+ /// HazardRec - The hazard recognizer to use.
+ ScheduleHazardRecognizer *HazardRec;
+
+ /// Classes - For live regs that are only used in one register class in a
+ /// live range, the register class. If the register is not live, the
+ /// corresponding value is null. If the register is live but used in
+ /// multiple register classes, the corresponding value is -1 casted to a
+ /// pointer.
+ const TargetRegisterClass *
+ Classes[TargetRegisterInfo::FirstVirtualRegister];
+
+ /// RegRegs - Map registers to all their references within a live range.
+ std::multimap<unsigned, MachineOperand *> RegRefs;
+
+    /// The index of the most recent kill (proceeding bottom-up), or ~0u if
+ /// the register is not live.
+ unsigned KillIndices[TargetRegisterInfo::FirstVirtualRegister];
+
+    /// The index of the most recent complete def (proceeding bottom up), or ~0u
+ /// if the register is live.
+ unsigned DefIndices[TargetRegisterInfo::FirstVirtualRegister];
+
+ public:
+ SchedulePostRATDList(MachineFunction &MF,
+ const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT,
+ ScheduleHazardRecognizer *HR)
+ : ScheduleDAGInstrs(MF, MLI, MDT), Topo(SUnits),
+ AllocatableSet(TRI->getAllocatableSet(MF)),
+ HazardRec(HR) {}
+
+ ~SchedulePostRATDList() {
+ delete HazardRec;
+ }
+
+ /// StartBlock - Initialize register live-range state for scheduling in
+ /// this block.
+ ///
+ void StartBlock(MachineBasicBlock *BB);
+
+ /// Schedule - Schedule the instruction range using list scheduling.
+ ///
+ void Schedule();
+
+ /// Observe - Update liveness information to account for the current
+ /// instruction, which will not be scheduled.
+ ///
+ void Observe(MachineInstr *MI, unsigned Count);
+
+ /// FinishBlock - Clean up register live-range state.
+ ///
+ void FinishBlock();
+
+ private:
+ void PrescanInstruction(MachineInstr *MI);
+ void ScanInstruction(MachineInstr *MI, unsigned Count);
+ void ReleaseSucc(SUnit *SU, SDep *SuccEdge);
+ void ReleaseSuccessors(SUnit *SU);
+ void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+ void ListScheduleTopDown();
+ bool BreakAntiDependencies();
+ };
+
+ /// SimpleHazardRecognizer - A *very* simple hazard recognizer. It uses
+  /// a coarse classification and attempts to avoid grouping instructions
+  /// of a given class too densely together.
+ class SimpleHazardRecognizer : public ScheduleHazardRecognizer {
+ /// Class - A simple classification for SUnits.
+ enum Class {
+ Other, Load, Store
+ };
+
+ /// Window - The Class values of the most recently issued
+ /// instructions.
+ Class Window[8];
+
+ /// getClass - Classify the given SUnit.
+ Class getClass(const SUnit *SU) {
+ const MachineInstr *MI = SU->getInstr();
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (TID.mayLoad())
+ return Load;
+ if (TID.mayStore())
+ return Store;
+ return Other;
+ }
+
+ /// Step - Rotate the existing entries in Window and insert the
+ /// given class value in position as the most recent.
+ void Step(Class C) {
+ std::copy(Window+1, array_endof(Window), Window);
+ Window[array_lengthof(Window)-1] = C;
+ }
+
+ public:
+ SimpleHazardRecognizer() : Window() {}
+
+ virtual HazardType getHazardType(SUnit *SU) {
+ Class C = getClass(SU);
+ if (C == Other)
+ return NoHazard;
+ unsigned Score = 0;
+ for (unsigned i = 0; i != array_lengthof(Window); ++i)
+ if (Window[i] == C)
+ Score += i + 1;
+ if (Score > array_lengthof(Window) * 2)
+ return Hazard;
+ return NoHazard;
+ }
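+
+    // Worked example (illustrative): with the 8-entry window and weights
+    // i+1, the hazard threshold is 2*8 = 16. If the two most recently
+    // issued instructions were loads, a load candidate scores 8+7 = 15
+    // (no hazard); once it issues, the next load candidate would score
+    // 8+7+6 = 21 and be reported as a hazard.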
+
+ virtual void EmitInstruction(SUnit *SU) {
+ Step(getClass(SU));
+ }
+
+ virtual void AdvanceCycle() {
+ Step(Other);
+ }
+ };
+}
+
+/// isSchedulingBoundary - Test if the given instruction should be
+/// considered a scheduling boundary. This primarily includes labels
+/// and terminators.
+///
+static bool isSchedulingBoundary(const MachineInstr *MI,
+ const MachineFunction &MF) {
+ // Terminators and labels can't be scheduled around.
+ if (MI->getDesc().isTerminator() || MI->isLabel())
+ return true;
+
+ // Don't attempt to schedule around any instruction that modifies
+ // a stack-oriented pointer, as it's unlikely to be profitable. This
+ // saves compile time, because it doesn't require every single
+ // stack slot reference to depend on the instruction that does the
+ // modification.
+ const TargetLowering &TLI = *MF.getTarget().getTargetLowering();
+ if (MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore()))
+ return true;
+
+ return false;
+}
+
+bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
+ DOUT << "PostRAScheduler\n";
+
+ const MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ const MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+ ScheduleHazardRecognizer *HR = EnablePostRAHazardAvoidance ?
+ new SimpleHazardRecognizer :
+ new ScheduleHazardRecognizer();
+
+ SchedulePostRATDList Scheduler(Fn, MLI, MDT, HR);
+
+ // Loop over all of the basic blocks
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB) {
+ // Initialize register live-range state for scheduling in this block.
+ Scheduler.StartBlock(MBB);
+
+ // Schedule each sequence of instructions not interrupted by a label
+ // or anything else that effectively needs to shut down scheduling.
+ MachineBasicBlock::iterator Current = MBB->end();
+ unsigned Count = MBB->size(), CurrentCount = Count;
+ for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) {
+ MachineInstr *MI = prior(I);
+ if (isSchedulingBoundary(MI, Fn)) {
+ Scheduler.Run(MBB, I, Current, CurrentCount);
+ Scheduler.EmitSchedule();
+ Current = MI;
+ CurrentCount = Count - 1;
+ Scheduler.Observe(MI, CurrentCount);
+ }
+ I = MI;
+ --Count;
+ }
+ assert(Count == 0 && "Instruction count mismatch!");
+ assert((MBB->begin() == Current || CurrentCount != 0) &&
+ "Instruction count mismatch!");
+ Scheduler.Run(MBB, MBB->begin(), Current, CurrentCount);
+ Scheduler.EmitSchedule();
+
+ // Clean up register live-range state.
+ Scheduler.FinishBlock();
+ }
+
+ return true;
+}
+
+/// StartBlock - Initialize register live-range state for scheduling in
+/// this block.
+///
+void SchedulePostRATDList::StartBlock(MachineBasicBlock *BB) {
+ // Call the superclass.
+ ScheduleDAGInstrs::StartBlock(BB);
+
+ // Clear out the register class data.
+ std::fill(Classes, array_endof(Classes),
+ static_cast<const TargetRegisterClass *>(0));
+
+ // Initialize the indices to indicate that no registers are live.
+ std::fill(KillIndices, array_endof(KillIndices), ~0u);
+ std::fill(DefIndices, array_endof(DefIndices), BB->size());
+
+ // Determine the live-out physregs for this block.
+ if (!BB->empty() && BB->back().getDesc().isReturn())
+ // In a return block, examine the function live-out regs.
+ for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(),
+ E = MRI.liveout_end(); I != E; ++I) {
+ unsigned Reg = *I;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BB->size();
+ DefIndices[Reg] = ~0u;
+ // Repeat, for all aliases.
+ for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+ unsigned AliasReg = *Alias;
+ Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[AliasReg] = BB->size();
+ DefIndices[AliasReg] = ~0u;
+ }
+ }
+ else
+ // In a non-return block, examine the live-in regs of all successors.
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+ E = (*SI)->livein_end(); I != E; ++I) {
+ unsigned Reg = *I;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BB->size();
+ DefIndices[Reg] = ~0u;
+ // Repeat, for all aliases.
+ for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+ unsigned AliasReg = *Alias;
+ Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[AliasReg] = BB->size();
+ DefIndices[AliasReg] = ~0u;
+ }
+ }
+
+ // Consider callee-saved registers as live-out, since we're running after
+ // prologue/epilogue insertion so there's no way to add additional
+ // saved registers.
+ //
+ // TODO: If the callee saves and restores these, then we can potentially
+ // use them between the save and the restore. To do that, we could scan
+ // the exit blocks to see which of these registers are defined.
+ // Alternatively, callee-saved registers that aren't saved and restored
+ // could be marked live-in in every block.
+ for (const unsigned *I = TRI->getCalleeSavedRegs(); *I; ++I) {
+ unsigned Reg = *I;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BB->size();
+ DefIndices[Reg] = ~0u;
+ // Repeat, for all aliases.
+ for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+ unsigned AliasReg = *Alias;
+ Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[AliasReg] = BB->size();
+ DefIndices[AliasReg] = ~0u;
+ }
+ }
+}
+
+/// Schedule - Schedule the instruction range using list scheduling.
+///
+void SchedulePostRATDList::Schedule() {
+ DOUT << "********** List Scheduling **********\n";
+
+ // Build the scheduling graph.
+ BuildSchedGraph();
+
+ if (EnableAntiDepBreaking) {
+ if (BreakAntiDependencies()) {
+ // We made changes. Update the dependency graph.
+ // Theoretically we could update the graph in place:
+ // When a live range is changed to use a different register, remove
+ // the def's anti-dependence *and* output-dependence edges due to
+ // that register, and add new anti-dependence and output-dependence
+ // edges based on the next live range of the register.
+ SUnits.clear();
+ EntrySU = SUnit();
+ ExitSU = SUnit();
+ BuildSchedGraph();
+ }
+ }
+
+ AvailableQueue.initNodes(SUnits);
+
+ ListScheduleTopDown();
+
+ AvailableQueue.releaseState();
+}
+
+/// Observe - Update liveness information to account for the current
+/// instruction, which will not be scheduled.
+///
+void SchedulePostRATDList::Observe(MachineInstr *MI, unsigned Count) {
+ assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+ // Any register which was defined within the previous scheduling region
+ // may have been rescheduled and its lifetime may overlap with registers
+ // in ways not reflected in our current liveness state. For each such
+ // register, adjust the liveness state to be conservatively correct.
+ for (unsigned Reg = 0; Reg != TargetRegisterInfo::FirstVirtualRegister; ++Reg)
+ if (DefIndices[Reg] < InsertPosIndex && DefIndices[Reg] >= Count) {
+ assert(KillIndices[Reg] == ~0u && "Clobbered register is live!");
+ // Mark this register to be non-renamable.
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ // Move the def index to the end of the previous region, to reflect
+ // that the def could theoretically have been scheduled at the end.
+ DefIndices[Reg] = InsertPosIndex;
+ }
+
+ PrescanInstruction(MI);
+ ScanInstruction(MI, Count);
+}
+
+/// FinishBlock - Clean up register live-range state.
+///
+void SchedulePostRATDList::FinishBlock() {
+ RegRefs.clear();
+
+ // Call the superclass.
+ ScheduleDAGInstrs::FinishBlock();
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static SDep *CriticalPathStep(SUnit *SU) {
+ SDep *Next = 0;
+ unsigned NextDepth = 0;
+ // Find the predecessor edge with the greatest depth.
+ for (SUnit::pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+ P != PE; ++P) {
+ SUnit *PredSU = P->getSUnit();
+ unsigned PredLatency = P->getLatency();
+ unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+ // In the case of a latency tie, prefer an anti-dependency edge over
+ // other types of edges.
+ if (NextDepth < PredTotalLatency ||
+ (NextDepth == PredTotalLatency && P->getKind() == SDep::Anti)) {
+ NextDepth = PredTotalLatency;
+ Next = &*P;
+ }
+ }
+ return Next;
+}
+
+void SchedulePostRATDList::PrescanInstruction(MachineInstr *MI) {
+ // Scan the register operands for this instruction and update
+ // Classes and RegRefs.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ const TargetRegisterClass *NewRC =
+ getInstrOperandRegClass(TRI, MI->getDesc(), i);
+
+ // For now, only allow the register to be changed if its register
+ // class is consistent across all uses.
+ if (!Classes[Reg] && NewRC)
+ Classes[Reg] = NewRC;
+ else if (!NewRC || Classes[Reg] != NewRC)
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+ // Now check for aliases.
+ for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+ // If an alias of the reg is used during the live range, give up.
+ // Note that this allows us to skip checking if AntiDepReg
+ // overlaps with any of the aliases, among other things.
+ unsigned AliasReg = *Alias;
+ if (Classes[AliasReg]) {
+ Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ }
+ }
+
+ // If we're still willing to consider this register, note the reference.
+ if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
+ RegRefs.insert(std::make_pair(Reg, &MO));
+ }
+}
+
+void SchedulePostRATDList::ScanInstruction(MachineInstr *MI,
+ unsigned Count) {
+ // Update liveness.
+  // Proceeding upwards, registers that are def'd but not used in this
+ // instruction are now dead.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (!MO.isDef()) continue;
+ // Ignore two-addr defs.
+ if (MI->isRegTiedToUseOperand(i)) continue;
+
+ DefIndices[Reg] = Count;
+ KillIndices[Reg] = ~0u;
+ assert(((KillIndices[Reg] == ~0u) !=
+ (DefIndices[Reg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for Reg!");
+ Classes[Reg] = 0;
+ RegRefs.erase(Reg);
+ // Repeat, for all subregs.
+ for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+ *Subreg; ++Subreg) {
+ unsigned SubregReg = *Subreg;
+ DefIndices[SubregReg] = Count;
+ KillIndices[SubregReg] = ~0u;
+ Classes[SubregReg] = 0;
+ RegRefs.erase(SubregReg);
+ }
+ // Conservatively mark super-registers as unusable.
+ for (const unsigned *Super = TRI->getSuperRegisters(Reg);
+ *Super; ++Super) {
+ unsigned SuperReg = *Super;
+ Classes[SuperReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ }
+ }
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (!MO.isUse()) continue;
+
+ const TargetRegisterClass *NewRC =
+ getInstrOperandRegClass(TRI, MI->getDesc(), i);
+
+ // For now, only allow the register to be changed if its register
+ // class is consistent across all uses.
+ if (!Classes[Reg] && NewRC)
+ Classes[Reg] = NewRC;
+ else if (!NewRC || Classes[Reg] != NewRC)
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+ RegRefs.insert(std::make_pair(Reg, &MO));
+
+    // It wasn't previously live but now it is; this is a kill.
+ if (KillIndices[Reg] == ~0u) {
+ KillIndices[Reg] = Count;
+ DefIndices[Reg] = ~0u;
+ assert(((KillIndices[Reg] == ~0u) !=
+ (DefIndices[Reg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for Reg!");
+ }
+ // Repeat, for all aliases.
+ for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+ unsigned AliasReg = *Alias;
+ if (KillIndices[AliasReg] == ~0u) {
+ KillIndices[AliasReg] = Count;
+ DefIndices[AliasReg] = ~0u;
+ }
+ }
+ }
+}
+
+/// BreakAntiDependencies - Identify anti-dependencies along the critical path
+/// of the ScheduleDAG and break them by renaming registers.
+///
+bool SchedulePostRATDList::BreakAntiDependencies() {
+ // The code below assumes that there is at least one instruction,
+ // so just duck out immediately if the block is empty.
+ if (SUnits.empty()) return false;
+
+ // Find the node at the bottom of the critical path.
+ SUnit *Max = 0;
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency)
+ Max = SU;
+ }
+
+ DOUT << "Critical path has total latency "
+ << (Max->getDepth() + Max->Latency) << "\n";
+
+ // Track progress along the critical path through the SUnit graph as we walk
+ // the instructions.
+ SUnit *CriticalPathSU = Max;
+ MachineInstr *CriticalPathMI = CriticalPathSU->getInstr();
+
+ // Consider this pattern:
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // There are three anti-dependencies here, and without special care,
+ // we'd break all of them using the same register:
+ // A = ...
+ // ... = A
+ // B = ...
+ // ... = B
+ // B = ...
+ // ... = B
+ // B = ...
+ // ... = B
+ // because at each anti-dependence, B is the first register that
+ // isn't A which is free. This re-introduces anti-dependencies
+ // at all but one of the original anti-dependencies that we were
+ // trying to break. To avoid this, keep track of the most recent
+  // register that each register was replaced with, and avoid
+  // using it to repair an anti-dependence on the same register.
+ // This lets us produce this:
+ // A = ...
+ // ... = A
+ // B = ...
+ // ... = B
+ // C = ...
+ // ... = C
+ // B = ...
+ // ... = B
+ // This still has an anti-dependence on B, but at least it isn't on the
+ // original critical path.
+ //
+ // TODO: If we tracked more than one register here, we could potentially
+ // fix that remaining critical edge too. This is a little more involved,
+ // because unlike the most recent register, less recent registers should
+ // still be considered, though only if no other registers are available.
+ unsigned LastNewReg[TargetRegisterInfo::FirstVirtualRegister] = {};
+
+ // Attempt to break anti-dependence edges on the critical path. Walk the
+ // instructions from the bottom up, tracking information about liveness
+ // as we go to help determine which registers are available.
+ bool Changed = false;
+ unsigned Count = InsertPosIndex - 1;
+ for (MachineBasicBlock::iterator I = InsertPos, E = Begin;
+ I != E; --Count) {
+ MachineInstr *MI = --I;
+
+ // After regalloc, IMPLICIT_DEF instructions aren't safe to treat as
+ // dependence-breaking. In the case of an INSERT_SUBREG, the IMPLICIT_DEF
+ // is left behind appearing to clobber the super-register, while the
+ // subregister needs to remain live. So we just ignore them.
+ if (MI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF)
+ continue;
+
+ // Check if this instruction has a dependence on the critical path that
+ // is an anti-dependence that we may be able to break. If it is, set
+ // AntiDepReg to the non-zero register associated with the anti-dependence.
+ //
+ // We limit our attention to the critical path as a heuristic to avoid
+ // breaking anti-dependence edges that aren't going to significantly
+ // impact the overall schedule. There are a limited number of registers
+ // and we want to save them for the important edges.
+ //
+ // TODO: Instructions with multiple defs could have multiple
+ // anti-dependencies. The current code here only knows how to break one
+ // edge per instruction. Note that we'd have to be able to break all of
+ // the anti-dependencies in an instruction in order to be effective.
+ unsigned AntiDepReg = 0;
+ if (MI == CriticalPathMI) {
+ if (SDep *Edge = CriticalPathStep(CriticalPathSU)) {
+ SUnit *NextSU = Edge->getSUnit();
+
+ // Only consider anti-dependence edges.
+ if (Edge->getKind() == SDep::Anti) {
+ AntiDepReg = Edge->getReg();
+ assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+ // Don't break anti-dependencies on non-allocatable registers.
+ if (!AllocatableSet.test(AntiDepReg))
+ AntiDepReg = 0;
+ else {
+ // If the SUnit has other dependencies on the SUnit that it
+ // anti-depends on, don't bother breaking the anti-dependency
+ // since those edges would prevent such units from being
+ // scheduled past each other regardless.
+ //
+ // Also, if there are dependencies on other SUnits with the
+ // same register as the anti-dependency, don't attempt to
+ // break it.
+ for (SUnit::pred_iterator P = CriticalPathSU->Preds.begin(),
+ PE = CriticalPathSU->Preds.end(); P != PE; ++P)
+ if (P->getSUnit() == NextSU ?
+ (P->getKind() != SDep::Anti || P->getReg() != AntiDepReg) :
+ (P->getKind() == SDep::Data && P->getReg() == AntiDepReg)) {
+ AntiDepReg = 0;
+ break;
+ }
+ }
+ }
+ CriticalPathSU = NextSU;
+ CriticalPathMI = CriticalPathSU->getInstr();
+ } else {
+ // We've reached the end of the critical path.
+ CriticalPathSU = 0;
+ CriticalPathMI = 0;
+ }
+ }
+
+ PrescanInstruction(MI);
+
+ // If this instruction has a use of AntiDepReg, breaking it
+ // is invalid.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (MO.isUse() && AntiDepReg == Reg) {
+ AntiDepReg = 0;
+ break;
+ }
+ }
+
+ // Determine AntiDepReg's register class, if it is live and is
+ // consistently used within a single class.
+ const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg] : 0;
+ assert((AntiDepReg == 0 || RC != NULL) &&
+ "Register should be live if it's causing an anti-dependence!");
+ if (RC == reinterpret_cast<TargetRegisterClass *>(-1))
+ AntiDepReg = 0;
+
+    // Look for a suitable register to use to break the anti-dependence.
+ //
+ // TODO: Instead of picking the first free register, consider which might
+ // be the best.
+ if (AntiDepReg != 0) {
+ for (TargetRegisterClass::iterator R = RC->allocation_order_begin(MF),
+ RE = RC->allocation_order_end(MF); R != RE; ++R) {
+ unsigned NewReg = *R;
+ // Don't replace a register with itself.
+ if (NewReg == AntiDepReg) continue;
+ // Don't replace a register with one that was recently used to repair
+ // an anti-dependence with this AntiDepReg, because that would
+ // re-introduce that anti-dependence.
+ if (NewReg == LastNewReg[AntiDepReg]) continue;
+ // If NewReg is dead and NewReg's most recent def is not before
+ // AntiDepReg's kill, it's safe to replace AntiDepReg with NewReg.
+ assert(((KillIndices[AntiDepReg] == ~0u) != (DefIndices[AntiDepReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for AntiDepReg!");
+ assert(((KillIndices[NewReg] == ~0u) != (DefIndices[NewReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for NewReg!");
+ if (KillIndices[NewReg] == ~0u &&
+ Classes[NewReg] != reinterpret_cast<TargetRegisterClass *>(-1) &&
+ KillIndices[AntiDepReg] <= DefIndices[NewReg]) {
+ DOUT << "Breaking anti-dependence edge on "
+ << TRI->getName(AntiDepReg)
+ << " with " << RegRefs.count(AntiDepReg) << " references"
+ << " using " << TRI->getName(NewReg) << "!\n";
+
+ // Update the references to the old register to refer to the new
+ // register.
+ std::pair<std::multimap<unsigned, MachineOperand *>::iterator,
+ std::multimap<unsigned, MachineOperand *>::iterator>
+ Range = RegRefs.equal_range(AntiDepReg);
+ for (std::multimap<unsigned, MachineOperand *>::iterator
+ Q = Range.first, QE = Range.second; Q != QE; ++Q)
+ Q->second->setReg(NewReg);
+
+ // We just went back in time and modified history; the
+          // liveness information for the anti-dependence reg is now
+ // inconsistent. Set the state as if it were dead.
+ Classes[NewReg] = Classes[AntiDepReg];
+ DefIndices[NewReg] = DefIndices[AntiDepReg];
+ KillIndices[NewReg] = KillIndices[AntiDepReg];
+ assert(((KillIndices[NewReg] == ~0u) !=
+ (DefIndices[NewReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for NewReg!");
+
+ Classes[AntiDepReg] = 0;
+ DefIndices[AntiDepReg] = KillIndices[AntiDepReg];
+ KillIndices[AntiDepReg] = ~0u;
+ assert(((KillIndices[AntiDepReg] == ~0u) !=
+ (DefIndices[AntiDepReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for AntiDepReg!");
+
+ RegRefs.erase(AntiDepReg);
+ Changed = true;
+ LastNewReg[AntiDepReg] = NewReg;
+ break;
+ }
+ }
+ }
+
+ ScanInstruction(MI, Count);
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero. Also update its cycle bound.
+void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+ --SuccSU->NumPredsLeft;
+
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft < 0) {
+ cerr << "*** Scheduling failed! ***\n";
+ SuccSU->dump(this);
+ cerr << " has been released too many times!\n";
+ assert(0);
+ }
+#endif
+
+ // Compute how many cycles it will be before this actually becomes
+ // available. This is the max of the start time of all predecessors plus
+ // their latencies.
+ SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+
+ // If all the node's predecessors are scheduled, this node is ready
+ // to be scheduled. Ignore the special ExitSU node.
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+ PendingQueue.push_back(SuccSU);
+}
+
+/// ReleaseSuccessors - Call ReleaseSucc on each of SU's successors.
+void SchedulePostRATDList::ReleaseSuccessors(SUnit *SU) {
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I)
+ ReleaseSucc(SU, &*I);
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+ DOUT << "*** Scheduling [" << CurCycle << "]: ";
+ DEBUG(SU->dump(this));
+
+ Sequence.push_back(SU);
+ assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
+ SU->setDepthToAtLeast(CurCycle);
+
+ ReleaseSuccessors(SU);
+ SU->isScheduled = true;
+ AvailableQueue.ScheduledNode(SU);
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void SchedulePostRATDList::ListScheduleTopDown() {
+ unsigned CurCycle = 0;
+
+ // Release any successors of the special Entry node.
+ ReleaseSuccessors(&EntrySU);
+
+ // Add all leaves to the Available queue.
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ // It is available if it has no predecessors.
+ if (SUnits[i].Preds.empty()) {
+ AvailableQueue.push(&SUnits[i]);
+ SUnits[i].isAvailable = true;
+ }
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready, put it back. Schedule the node.
+ std::vector<SUnit*> NotReady;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue.empty() || !PendingQueue.empty()) {
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ unsigned MinDepth = ~0u;
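+ // Note that ready instructions are removed from the middle of PendingQueue
+ // with a swap-and-pop; i and e are adjusted below so that the element
+ // swapped into slot i is revisited.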
+ for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+ if (PendingQueue[i]->getDepth() <= CurCycle) {
+ AvailableQueue.push(PendingQueue[i]);
+ PendingQueue[i]->isAvailable = true;
+ PendingQueue[i] = PendingQueue.back();
+ PendingQueue.pop_back();
+ --i; --e;
+ } else if (PendingQueue[i]->getDepth() < MinDepth)
+ MinDepth = PendingQueue[i]->getDepth();
+ }
+
+ // If there are no instructions available, don't try to issue anything, and
+ // don't advance the hazard recognizer.
+ if (AvailableQueue.empty()) {
+ CurCycle = MinDepth != ~0u ? MinDepth : CurCycle + 1;
+ continue;
+ }
+
+ SUnit *FoundSUnit = 0;
+
+ bool HasNoopHazards = false;
+ while (!AvailableQueue.empty()) {
+ SUnit *CurSUnit = AvailableQueue.pop();
+
+ ScheduleHazardRecognizer::HazardType HT =
+ HazardRec->getHazardType(CurSUnit);
+ if (HT == ScheduleHazardRecognizer::NoHazard) {
+ FoundSUnit = CurSUnit;
+ break;
+ }
+
+ // Remember if this is a noop hazard.
+ HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+ NotReady.push_back(CurSUnit);
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ if (!NotReady.empty()) {
+ AvailableQueue.push_all(NotReady);
+ NotReady.clear();
+ }
+
+ // If we found a node to schedule, do it now.
+ if (FoundSUnit) {
+ ScheduleNodeTopDown(FoundSUnit, CurCycle);
+ HazardRec->EmitInstruction(FoundSUnit);
+
+ // If this is a pseudo-op node, we don't want to increment the current
+ // cycle.
+ if (FoundSUnit->Latency) // Don't increment CurCycle for pseudo-ops!
+ ++CurCycle;
+ } else if (!HasNoopHazards) {
+ // Otherwise, we have a pipeline stall but no other problem; just advance
+ // the current cycle and try again.
+ DOUT << "*** Advancing cycle, no work to do\n";
+ HazardRec->AdvanceCycle();
+ ++NumStalls;
+ ++CurCycle;
+ } else {
+ // Otherwise, we have no instructions to issue and we have instructions
+ // that will fault if we don't do this right. This is the case for
+ // processors without pipeline interlocks and other cases.
+ DOUT << "*** Emitting noop\n";
+ HazardRec->EmitNoop();
+ Sequence.push_back(0); // NULL here means noop
+ ++NumNoops;
+ ++CurCycle;
+ }
+ }
+
+#ifndef NDEBUG
+ VerifySchedule(/*isBottomUp=*/false);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
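+// Note: a typical backend pipeline (a sketch; the actual call site lives in
+// target-independent code outside this file) adds this pass after register
+// allocation, e.g. PM.add(createPostRAScheduler()).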
+FunctionPass *llvm::createPostRAScheduler() {
+ return new PostRAScheduler();
+}
diff --git a/lib/CodeGen/PreAllocSplitting.cpp b/lib/CodeGen/PreAllocSplitting.cpp
new file mode 100644
index 0000000..97d4728
--- /dev/null
+++ b/lib/CodeGen/PreAllocSplitting.cpp
@@ -0,0 +1,1485 @@
+//===-- PreAllocSplitting.cpp - Pre-allocation Interval Splitting Pass ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level pre-register allocation
+// live interval splitting pass. It finds live interval barriers, i.e.
+// instructions which will kill all physical registers in certain register
+// classes, and split all live intervals which cross the barrier.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-alloc-split"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+static cl::opt<int> PreSplitLimit("pre-split-limit", cl::init(-1), cl::Hidden);
+static cl::opt<int> DeadSplitLimit("dead-split-limit", cl::init(-1), cl::Hidden);
+static cl::opt<int> RestoreFoldLimit("restore-fold-limit", cl::init(-1), cl::Hidden);
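+// For each limit above, -1 (the default) means unlimited; these hidden
+// options appear intended as debugging knobs for bisecting splitting
+// decisions.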
+
+STATISTIC(NumSplits, "Number of intervals split");
+STATISTIC(NumRemats, "Number of intervals split by rematerialization");
+STATISTIC(NumFolds, "Number of intervals split with spill folding");
+STATISTIC(NumRestoreFolds, "Number of intervals split with restore folding");
+STATISTIC(NumRenumbers, "Number of intervals renumbered into new registers");
+STATISTIC(NumDeadSpills, "Number of dead spills removed");
+
+namespace {
+ class VISIBILITY_HIDDEN PreAllocSplitting : public MachineFunctionPass {
+ MachineFunction *CurrMF;
+ const TargetMachine *TM;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo* TRI;
+ MachineFrameInfo *MFI;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIs;
+ LiveStacks *LSs;
+ VirtRegMap *VRM;
+
+ // Barrier - Current barrier being processed.
+ MachineInstr *Barrier;
+
+ // BarrierMBB - Basic block where the barrier resides.
+ MachineBasicBlock *BarrierMBB;
+
+ // BarrierIdx - Current barrier index.
+ unsigned BarrierIdx;
+
+ // CurrLI - Current live interval being split.
+ LiveInterval *CurrLI;
+
+ // CurrSLI - Current stack slot live interval.
+ LiveInterval *CurrSLI;
+
+ // CurrSValNo - Current val# for the stack slot live interval.
+ VNInfo *CurrSValNo;
+
+ // IntervalSSMap - A map from live interval to spill slots.
+ DenseMap<unsigned, int> IntervalSSMap;
+
+ // Def2SpillMap - A map from a def instruction index to spill index.
+ DenseMap<unsigned, unsigned> Def2SpillMap;
+
+ public:
+ static char ID;
+ PreAllocSplitting() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
+ AU.addPreserved<RegisterCoalescer>();
+ if (StrongPHIElim)
+ AU.addPreservedID(StrongPHIEliminationID);
+ else
+ AU.addPreservedID(PHIEliminationID);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<VirtRegMap>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual void releaseMemory() {
+ IntervalSSMap.clear();
+ Def2SpillMap.clear();
+ }
+
+ virtual const char *getPassName() const {
+ return "Pre-Register Allocaton Live Interval Splitting";
+ }
+
+ /// print - Implement the dump method.
+ virtual void print(std::ostream &O, const Module* M = 0) const {
+ LIs->print(O, M);
+ }
+
+ void print(std::ostream *O, const Module* M = 0) const {
+ if (O) print(*O, M);
+ }
+
+ private:
+ MachineBasicBlock::iterator
+ findNextEmptySlot(MachineBasicBlock*, MachineInstr*,
+ unsigned&);
+
+ MachineBasicBlock::iterator
+ findSpillPoint(MachineBasicBlock*, MachineInstr*, MachineInstr*,
+ SmallPtrSet<MachineInstr*, 4>&, unsigned&);
+
+ MachineBasicBlock::iterator
+ findRestorePoint(MachineBasicBlock*, MachineInstr*, unsigned,
+ SmallPtrSet<MachineInstr*, 4>&, unsigned&);
+
+ int CreateSpillStackSlot(unsigned, const TargetRegisterClass *);
+
+ bool IsAvailableInStack(MachineBasicBlock*, unsigned, unsigned, unsigned,
+ unsigned&, int&) const;
+
+ void UpdateSpillSlotInterval(VNInfo*, unsigned, unsigned);
+
+ bool SplitRegLiveInterval(LiveInterval*);
+
+ bool SplitRegLiveIntervals(const TargetRegisterClass **,
+ SmallPtrSet<LiveInterval*, 8>&);
+
+ bool createsNewJoin(LiveRange* LR, MachineBasicBlock* DefMBB,
+ MachineBasicBlock* BarrierMBB);
+ bool Rematerialize(unsigned vreg, VNInfo* ValNo,
+ MachineInstr* DefMI,
+ MachineBasicBlock::iterator RestorePt,
+ unsigned RestoreIdx,
+ SmallPtrSet<MachineInstr*, 4>& RefsInMBB);
+ MachineInstr* FoldSpill(unsigned vreg, const TargetRegisterClass* RC,
+ MachineInstr* DefMI,
+ MachineInstr* Barrier,
+ MachineBasicBlock* MBB,
+ int& SS,
+ SmallPtrSet<MachineInstr*, 4>& RefsInMBB);
+ MachineInstr* FoldRestore(unsigned vreg,
+ const TargetRegisterClass* RC,
+ MachineInstr* Barrier,
+ MachineBasicBlock* MBB,
+ int SS,
+ SmallPtrSet<MachineInstr*, 4>& RefsInMBB);
+ void RenumberValno(VNInfo* VN);
+ void ReconstructLiveInterval(LiveInterval* LI);
+ bool removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split);
+ unsigned getNumberOfNonSpills(SmallPtrSet<MachineInstr*, 4>& MIs,
+ unsigned Reg, int FrameIndex, bool& TwoAddr);
+ VNInfo* PerformPHIConstruction(MachineBasicBlock::iterator Use,
+ MachineBasicBlock* MBB, LiveInterval* LI,
+ SmallPtrSet<MachineInstr*, 4>& Visited,
+ DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Defs,
+ DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Uses,
+ DenseMap<MachineInstr*, VNInfo*>& NewVNs,
+ DenseMap<MachineBasicBlock*, VNInfo*>& LiveOut,
+ DenseMap<MachineBasicBlock*, VNInfo*>& Phis,
+ bool IsTopLevel, bool IsIntraBlock);
+ VNInfo* PerformPHIConstructionFallBack(MachineBasicBlock::iterator Use,
+ MachineBasicBlock* MBB, LiveInterval* LI,
+ SmallPtrSet<MachineInstr*, 4>& Visited,
+ DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Defs,
+ DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Uses,
+ DenseMap<MachineInstr*, VNInfo*>& NewVNs,
+ DenseMap<MachineBasicBlock*, VNInfo*>& LiveOut,
+ DenseMap<MachineBasicBlock*, VNInfo*>& Phis,
+ bool IsTopLevel, bool IsIntraBlock);
+};
+} // end anonymous namespace
+
+char PreAllocSplitting::ID = 0;
+
+static RegisterPass<PreAllocSplitting>
+X("pre-alloc-splitting", "Pre-Register Allocation Live Interval Splitting");
+
+const PassInfo *const llvm::PreAllocSplittingID = &X;
+
+
+/// findNextEmptySlot - Find a gap after the given machine instruction in the
+/// instruction index map. If there isn't one, return end().
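+/// (A "gap" is an unused number in LiveIntervals' instruction index map, e.g.
+/// one left behind by a deleted instruction, into which a new instruction can
+/// be inserted without renumbering.)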
+MachineBasicBlock::iterator
+PreAllocSplitting::findNextEmptySlot(MachineBasicBlock *MBB, MachineInstr *MI,
+ unsigned &SpotIndex) {
+ MachineBasicBlock::iterator MII = MI;
+ if (++MII != MBB->end()) {
+ unsigned Index = LIs->findGapBeforeInstr(LIs->getInstructionIndex(MII));
+ if (Index) {
+ SpotIndex = Index;
+ return MII;
+ }
+ }
+ return MBB->end();
+}
+
+/// findSpillPoint - Find a gap, as far away from the given MI as possible,
+/// that is suitable for spilling the current live interval. The index must
+/// be before any
+/// defs and uses of the live interval register in the mbb. Return begin() if
+/// none is found.
+MachineBasicBlock::iterator
+PreAllocSplitting::findSpillPoint(MachineBasicBlock *MBB, MachineInstr *MI,
+ MachineInstr *DefMI,
+ SmallPtrSet<MachineInstr*, 4> &RefsInMBB,
+ unsigned &SpillIndex) {
+ MachineBasicBlock::iterator Pt = MBB->begin();
+
+ MachineBasicBlock::iterator MII = MI;
+ MachineBasicBlock::iterator EndPt = DefMI
+ ? MachineBasicBlock::iterator(DefMI) : MBB->begin();
+
+ while (MII != EndPt && !RefsInMBB.count(MII) &&
+ MII->getOpcode() != TRI->getCallFrameSetupOpcode())
+ --MII;
+ if (MII == EndPt || RefsInMBB.count(MII)) return Pt;
+
+ while (MII != EndPt && !RefsInMBB.count(MII)) {
+ unsigned Index = LIs->getInstructionIndex(MII);
+
+ // We can't insert the spill between the barrier (a call) and its
+ // corresponding call frame setup.
+ if (MII->getOpcode() == TRI->getCallFrameDestroyOpcode()) {
+ while (MII->getOpcode() != TRI->getCallFrameSetupOpcode()) {
+ --MII;
+ if (MII == EndPt) {
+ return Pt;
+ }
+ }
+ continue;
+ } else if (LIs->hasGapBeforeInstr(Index)) {
+ Pt = MII;
+ SpillIndex = LIs->findGapBeforeInstr(Index, true);
+ }
+
+ if (RefsInMBB.count(MII))
+ return Pt;
+
+ --MII;
+ }
+
+ return Pt;
+}
+
+/// findRestorePoint - Find a gap in the instruction index map that's suitable
+/// for restoring the current live interval value. The index must be before any
+/// uses of the live interval register in the mbb. Return end() if none is
+/// found.
+MachineBasicBlock::iterator
+PreAllocSplitting::findRestorePoint(MachineBasicBlock *MBB, MachineInstr *MI,
+ unsigned LastIdx,
+ SmallPtrSet<MachineInstr*, 4> &RefsInMBB,
+ unsigned &RestoreIndex) {
+ // FIXME: Allow spill to be inserted to the beginning of the mbb. Update mbb
+ // begin index accordingly.
+ MachineBasicBlock::iterator Pt = MBB->end();
+ MachineBasicBlock::iterator EndPt = MBB->getFirstTerminator();
+
+ // We start at the call, so walk forward until we find the call frame teardown
+ // since we can't insert restores before that. Bail if we encounter a use
+ // during this time.
+ MachineBasicBlock::iterator MII = MI;
+ if (MII == EndPt) return Pt;
+
+ while (MII != EndPt && !RefsInMBB.count(MII) &&
+ MII->getOpcode() != TRI->getCallFrameDestroyOpcode())
+ ++MII;
+ if (MII == EndPt || RefsInMBB.count(MII)) return Pt;
+ ++MII;
+
+ // FIXME: Limit the number of instructions to examine to reduce
+ // compile time?
+ while (MII != EndPt) {
+ unsigned Index = LIs->getInstructionIndex(MII);
+ if (Index > LastIdx)
+ break;
+ unsigned Gap = LIs->findGapBeforeInstr(Index);
+
+ // We can't insert a restore between the barrier (a call) and its
+ // corresponding call frame teardown.
+ if (MII->getOpcode() == TRI->getCallFrameSetupOpcode()) {
+ do {
+ if (MII == EndPt || RefsInMBB.count(MII)) return Pt;
+ ++MII;
+ } while (MII->getOpcode() != TRI->getCallFrameDestroyOpcode());
+ } else if (Gap) {
+ Pt = MII;
+ RestoreIndex = Gap;
+ }
+
+ if (RefsInMBB.count(MII))
+ return Pt;
+
+ ++MII;
+ }
+
+ return Pt;
+}
+
+/// CreateSpillStackSlot - Create a stack slot for the live interval being
+/// split. If the live interval was previously split, just reuse the same
+/// slot.
+int PreAllocSplitting::CreateSpillStackSlot(unsigned Reg,
+ const TargetRegisterClass *RC) {
+ int SS;
+ DenseMap<unsigned, int>::iterator I = IntervalSSMap.find(Reg);
+ if (I != IntervalSSMap.end()) {
+ SS = I->second;
+ } else {
+ SS = MFI->CreateStackObject(RC->getSize(), RC->getAlignment());
+ IntervalSSMap[Reg] = SS;
+ }
+
+ // Create live interval for stack slot.
+ CurrSLI = &LSs->getOrCreateInterval(SS, RC);
+ if (CurrSLI->hasAtLeastOneValue())
+ CurrSValNo = CurrSLI->getValNumInfo(0);
+ else
+ CurrSValNo = CurrSLI->getNextValue(~0U, 0, LSs->getVNInfoAllocator());
+ return SS;
+}
+
+/// IsAvailableInStack - Return true if register is available in a split stack
+/// slot at the specified index.
+bool
+PreAllocSplitting::IsAvailableInStack(MachineBasicBlock *DefMBB,
+ unsigned Reg, unsigned DefIndex,
+ unsigned RestoreIndex, unsigned &SpillIndex,
+ int& SS) const {
+ if (!DefMBB)
+ return false;
+
+ DenseMap<unsigned, int>::iterator I = IntervalSSMap.find(Reg);
+ if (I == IntervalSSMap.end())
+ return false;
+ DenseMap<unsigned, unsigned>::iterator II = Def2SpillMap.find(DefIndex);
+ if (II == Def2SpillMap.end())
+ return false;
+
+ // If last spill of def is in the same mbb as barrier mbb (where restore will
+ // be), make sure it's not below the intended restore index.
+ // FIXME: Undo the previous spill?
+ assert(LIs->getMBBFromIndex(II->second) == DefMBB);
+ if (DefMBB == BarrierMBB && II->second >= RestoreIndex)
+ return false;
+
+ SS = I->second;
+ SpillIndex = II->second;
+ return true;
+}
+
+/// UpdateSpillSlotInterval - Given the specified val# of the register live
+/// interval being split, and the spill and restore indices, update the live
+/// interval of the spill stack slot.
+void
+PreAllocSplitting::UpdateSpillSlotInterval(VNInfo *ValNo, unsigned SpillIndex,
+ unsigned RestoreIndex) {
+ assert(LIs->getMBBFromIndex(RestoreIndex) == BarrierMBB &&
+ "Expect restore in the barrier mbb");
+
+ MachineBasicBlock *MBB = LIs->getMBBFromIndex(SpillIndex);
+ if (MBB == BarrierMBB) {
+ // Intra-block spill + restore. We are done.
+ LiveRange SLR(SpillIndex, RestoreIndex, CurrSValNo);
+ CurrSLI->addRange(SLR);
+ return;
+ }
+
+ SmallPtrSet<MachineBasicBlock*, 4> Processed;
+ unsigned EndIdx = LIs->getMBBEndIdx(MBB);
+ LiveRange SLR(SpillIndex, EndIdx+1, CurrSValNo);
+ CurrSLI->addRange(SLR);
+ Processed.insert(MBB);
+
+ // Start from the spill mbb, figure out the extent of the spill slot's
+ // live interval.
+ SmallVector<MachineBasicBlock*, 4> WorkList;
+ const LiveRange *LR = CurrLI->getLiveRangeContaining(SpillIndex);
+ if (LR->end > EndIdx)
+ // If live range extends beyond end of mbb, add successors to work list.
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ WorkList.push_back(*SI);
+
+ while (!WorkList.empty()) {
+ MachineBasicBlock *MBB = WorkList.back();
+ WorkList.pop_back();
+ if (Processed.count(MBB))
+ continue;
+ unsigned Idx = LIs->getMBBStartIdx(MBB);
+ LR = CurrLI->getLiveRangeContaining(Idx);
+ if (LR && LR->valno == ValNo) {
+ EndIdx = LIs->getMBBEndIdx(MBB);
+ if (Idx <= RestoreIndex && RestoreIndex < EndIdx) {
+ // Spill slot live interval stops at the restore.
+ LiveRange SLR(Idx, RestoreIndex, CurrSValNo);
+ CurrSLI->addRange(SLR);
+ } else if (LR->end > EndIdx) {
+ // Live range extends beyond end of mbb, process successors.
+ LiveRange SLR(Idx, EndIdx+1, CurrSValNo);
+ CurrSLI->addRange(SLR);
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ WorkList.push_back(*SI);
+ } else {
+ LiveRange SLR(Idx, LR->end, CurrSValNo);
+ CurrSLI->addRange(SLR);
+ }
+ Processed.insert(MBB);
+ }
+ }
+}
+
+/// PerformPHIConstruction - From properly set up use and def lists, use a PHI
+/// construction algorithm to compute the ranges and valnos for an interval.
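+/// The recursion is memoized through the NewVNs, LiveOut, and Phis maps, so
+/// each instruction and block is processed at most once.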
+VNInfo*
+PreAllocSplitting::PerformPHIConstruction(MachineBasicBlock::iterator UseI,
+ MachineBasicBlock* MBB, LiveInterval* LI,
+ SmallPtrSet<MachineInstr*, 4>& Visited,
+ DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Defs,
+ DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Uses,
+ DenseMap<MachineInstr*, VNInfo*>& NewVNs,
+ DenseMap<MachineBasicBlock*, VNInfo*>& LiveOut,
+ DenseMap<MachineBasicBlock*, VNInfo*>& Phis,
+ bool IsTopLevel, bool IsIntraBlock) {
+ // Return memoized result if it's available.
+ if (IsTopLevel && Visited.count(UseI) && NewVNs.count(UseI))
+ return NewVNs[UseI];
+ else if (!IsTopLevel && IsIntraBlock && NewVNs.count(UseI))
+ return NewVNs[UseI];
+ else if (!IsIntraBlock && LiveOut.count(MBB))
+ return LiveOut[MBB];
+
+ // Check if our block contains any uses or defs.
+ bool ContainsDefs = Defs.count(MBB);
+ bool ContainsUses = Uses.count(MBB);
+
+ VNInfo* RetVNI = 0;
+
+ // Enumerate the cases of use/def containing blocks.
+ if (!ContainsDefs && !ContainsUses) {
+ return PerformPHIConstructionFallBack(UseI, MBB, LI, Visited, Defs, Uses,
+ NewVNs, LiveOut, Phis,
+ IsTopLevel, IsIntraBlock);
+ } else if (ContainsDefs && !ContainsUses) {
+ SmallPtrSet<MachineInstr*, 2>& BlockDefs = Defs[MBB];
+
+ // Search for the def in this block. If we don't find it before the
+ // instruction we care about, go to the fallback case. Note that this
+ // should never happen: this cannot be intrablock, so UseI should
+ // always be an end() iterator.
+ assert(UseI == MBB->end() && "No use marked in intrablock");
+
+ MachineBasicBlock::iterator Walker = UseI;
+ --Walker;
+ while (Walker != MBB->begin()) {
+ if (BlockDefs.count(Walker))
+ break;
+ --Walker;
+ }
+
+ // Once we've found it, extend its VNInfo to our instruction.
+ unsigned DefIndex = LIs->getInstructionIndex(Walker);
+ DefIndex = LiveIntervals::getDefIndex(DefIndex);
+ unsigned EndIndex = LIs->getMBBEndIdx(MBB);
+
+ RetVNI = NewVNs[Walker];
+ LI->addRange(LiveRange(DefIndex, EndIndex+1, RetVNI));
+ } else if (!ContainsDefs && ContainsUses) {
+ SmallPtrSet<MachineInstr*, 2>& BlockUses = Uses[MBB];
+
+ // Search for the use in this block that precedes the instruction we care
+ // about, going to the fallback case if we don't find it.
+ if (UseI == MBB->begin())
+ return PerformPHIConstructionFallBack(UseI, MBB, LI, Visited, Defs,
+ Uses, NewVNs, LiveOut, Phis,
+ IsTopLevel, IsIntraBlock);
+
+ MachineBasicBlock::iterator Walker = UseI;
+ --Walker;
+ bool found = false;
+ while (Walker != MBB->begin()) {
+ if (BlockUses.count(Walker)) {
+ found = true;
+ break;
+ }
+ --Walker;
+ }
+
+ // Must check begin() too.
+ if (!found) {
+ if (BlockUses.count(Walker))
+ found = true;
+ else
+ return PerformPHIConstructionFallBack(UseI, MBB, LI, Visited, Defs,
+ Uses, NewVNs, LiveOut, Phis,
+ IsTopLevel, IsIntraBlock);
+ }
+
+ unsigned UseIndex = LIs->getInstructionIndex(Walker);
+ UseIndex = LiveIntervals::getUseIndex(UseIndex);
+ unsigned EndIndex = 0;
+ if (IsIntraBlock) {
+ EndIndex = LIs->getInstructionIndex(UseI);
+ EndIndex = LiveIntervals::getUseIndex(EndIndex);
+ } else
+ EndIndex = LIs->getMBBEndIdx(MBB);
+
+ // Now, recursively phi construct the VNInfo for the use we found,
+ // and then extend it to include the instruction we care about
+ RetVNI = PerformPHIConstruction(Walker, MBB, LI, Visited, Defs, Uses,
+ NewVNs, LiveOut, Phis, false, true);
+
+ LI->addRange(LiveRange(UseIndex, EndIndex+1, RetVNI));
+
+ // FIXME: Need to set kills properly for inter-block stuff.
+ if (LI->isKill(RetVNI, UseIndex)) LI->removeKill(RetVNI, UseIndex);
+ if (IsIntraBlock)
+ LI->addKill(RetVNI, EndIndex);
+ } else if (ContainsDefs && ContainsUses) {
+ SmallPtrSet<MachineInstr*, 2>& BlockDefs = Defs[MBB];
+ SmallPtrSet<MachineInstr*, 2>& BlockUses = Uses[MBB];
+
+ // This case is basically a merging of the two preceding cases, with the
+ // special note that checking for defs must take precedence over checking
+ // for uses, because of two-address instructions.
+
+ if (UseI == MBB->begin())
+ return PerformPHIConstructionFallBack(UseI, MBB, LI, Visited, Defs, Uses,
+ NewVNs, LiveOut, Phis,
+ IsTopLevel, IsIntraBlock);
+
+ MachineBasicBlock::iterator Walker = UseI;
+ --Walker;
+ bool foundDef = false;
+ bool foundUse = false;
+ while (Walker != MBB->begin()) {
+ if (BlockDefs.count(Walker)) {
+ foundDef = true;
+ break;
+ } else if (BlockUses.count(Walker)) {
+ foundUse = true;
+ break;
+ }
+ --Walker;
+ }
+
+ // Must check begin() too.
+ if (!foundDef && !foundUse) {
+ if (BlockDefs.count(Walker))
+ foundDef = true;
+ else if (BlockUses.count(Walker))
+ foundUse = true;
+ else
+ return PerformPHIConstructionFallBack(UseI, MBB, LI, Visited, Defs,
+ Uses, NewVNs, LiveOut, Phis,
+ IsTopLevel, IsIntraBlock);
+ }
+
+ unsigned StartIndex = LIs->getInstructionIndex(Walker);
+ StartIndex = foundDef ? LiveIntervals::getDefIndex(StartIndex) :
+ LiveIntervals::getUseIndex(StartIndex);
+ unsigned EndIndex = 0;
+ if (IsIntraBlock) {
+ EndIndex = LIs->getInstructionIndex(UseI);
+ EndIndex = LiveIntervals::getUseIndex(EndIndex);
+ } else
+ EndIndex = LIs->getMBBEndIdx(MBB);
+
+ if (foundDef)
+ RetVNI = NewVNs[Walker];
+ else
+ RetVNI = PerformPHIConstruction(Walker, MBB, LI, Visited, Defs, Uses,
+ NewVNs, LiveOut, Phis, false, true);
+
+ LI->addRange(LiveRange(StartIndex, EndIndex+1, RetVNI));
+
+ if (foundUse && LI->isKill(RetVNI, StartIndex))
+ LI->removeKill(RetVNI, StartIndex);
+ if (IsIntraBlock) {
+ LI->addKill(RetVNI, EndIndex);
+ }
+ }
+
+ // Memoize results so we don't have to recompute them.
+ if (!IsIntraBlock) LiveOut[MBB] = RetVNI;
+ else {
+ if (!NewVNs.count(UseI))
+ NewVNs[UseI] = RetVNI;
+ Visited.insert(UseI);
+ }
+
+ return RetVNI;
+}
+
+/// PerformPHIConstructionFallBack - PerformPHIConstruction fall back path.
+///
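+/// It handles the case where the current block contains no relevant uses or
+/// defs before the point of interest: a new value number is created at the
+/// block start and PHI construction recurses into the predecessors, whose
+/// incoming values are then merged (single predecessor) or joined with a phi.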
+VNInfo*
+PreAllocSplitting::PerformPHIConstructionFallBack(MachineBasicBlock::iterator UseI,
+ MachineBasicBlock* MBB, LiveInterval* LI,
+ SmallPtrSet<MachineInstr*, 4>& Visited,
+ DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Defs,
+ DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> >& Uses,
+ DenseMap<MachineInstr*, VNInfo*>& NewVNs,
+ DenseMap<MachineBasicBlock*, VNInfo*>& LiveOut,
+ DenseMap<MachineBasicBlock*, VNInfo*>& Phis,
+ bool IsTopLevel, bool IsIntraBlock) {
+ // NOTE: Because this is the fallback case from other cases, we do NOT
+ // assume that we are not intrablock here.
+ if (Phis.count(MBB)) return Phis[MBB];
+
+ unsigned StartIndex = LIs->getMBBStartIdx(MBB);
+ VNInfo *RetVNI = Phis[MBB] = LI->getNextValue(~0U, /*FIXME*/ 0,
+ LIs->getVNInfoAllocator());
+ if (!IsIntraBlock) LiveOut[MBB] = RetVNI;
+
+ // If there are no uses or defs between our starting point and the
+ // beginning of the block, then recursively perform PHI construction
+ // on our predecessors.
+ DenseMap<MachineBasicBlock*, VNInfo*> IncomingVNs;
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ VNInfo* Incoming = PerformPHIConstruction((*PI)->end(), *PI, LI,
+ Visited, Defs, Uses, NewVNs,
+ LiveOut, Phis, false, false);
+ if (Incoming != 0)
+ IncomingVNs[*PI] = Incoming;
+ }
+
+ if (MBB->pred_size() == 1 && !RetVNI->hasPHIKill) {
+ VNInfo* OldVN = RetVNI;
+ VNInfo* NewVN = IncomingVNs.begin()->second;
+ VNInfo* MergedVN = LI->MergeValueNumberInto(OldVN, NewVN);
+ if (MergedVN == OldVN) std::swap(OldVN, NewVN);
+
+ for (DenseMap<MachineBasicBlock*, VNInfo*>::iterator LOI = LiveOut.begin(),
+ LOE = LiveOut.end(); LOI != LOE; ++LOI)
+ if (LOI->second == OldVN)
+ LOI->second = MergedVN;
+ for (DenseMap<MachineInstr*, VNInfo*>::iterator NVI = NewVNs.begin(),
+ NVE = NewVNs.end(); NVI != NVE; ++NVI)
+ if (NVI->second == OldVN)
+ NVI->second = MergedVN;
+ for (DenseMap<MachineBasicBlock*, VNInfo*>::iterator PI = Phis.begin(),
+ PE = Phis.end(); PI != PE; ++PI)
+ if (PI->second == OldVN)
+ PI->second = MergedVN;
+ RetVNI = MergedVN;
+ } else {
+ // Otherwise, merge the incoming VNInfos with a phi join. Create a new
+ // VNInfo to represent the joined value.
+ for (DenseMap<MachineBasicBlock*, VNInfo*>::iterator I =
+ IncomingVNs.begin(), E = IncomingVNs.end(); I != E; ++I) {
+ I->second->hasPHIKill = true;
+ unsigned KillIndex = LIs->getMBBEndIdx(I->first);
+ if (!LiveInterval::isKill(I->second, KillIndex))
+ LI->addKill(I->second, KillIndex);
+ }
+ }
+
+ unsigned EndIndex = 0;
+ if (IsIntraBlock) {
+ EndIndex = LIs->getInstructionIndex(UseI);
+ EndIndex = LiveIntervals::getUseIndex(EndIndex);
+ } else
+ EndIndex = LIs->getMBBEndIdx(MBB);
+ LI->addRange(LiveRange(StartIndex, EndIndex+1, RetVNI));
+ if (IsIntraBlock)
+ LI->addKill(RetVNI, EndIndex);
+
+ // Memoize results so we don't have to recompute them.
+ if (!IsIntraBlock)
+ LiveOut[MBB] = RetVNI;
+ else {
+ if (!NewVNs.count(UseI))
+ NewVNs[UseI] = RetVNI;
+ Visited.insert(UseI);
+ }
+
+ return RetVNI;
+}
+
+/// ReconstructLiveInterval - Recompute a live interval from scratch.
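+/// Defs and uses are first cached per block and a fresh VNInfo is created for
+/// each def; PHI construction is then run from every use back to its reaching
+/// definitions, and finally ranges are added for dead defs.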
+void PreAllocSplitting::ReconstructLiveInterval(LiveInterval* LI) {
+ BumpPtrAllocator& Alloc = LIs->getVNInfoAllocator();
+
+ // Clear the old ranges and valnos.
+ LI->clear();
+
+ // Cache the uses and defs of the register
+ typedef DenseMap<MachineBasicBlock*, SmallPtrSet<MachineInstr*, 2> > RegMap;
+ RegMap Defs, Uses;
+
+ // Keep track of the new VNs we're creating.
+ DenseMap<MachineInstr*, VNInfo*> NewVNs;
+ SmallPtrSet<VNInfo*, 2> PhiVNs;
+
+ // Cache defs, and create a new VNInfo for each def.
+ for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(LI->reg),
+ DE = MRI->def_end(); DI != DE; ++DI) {
+ Defs[(*DI).getParent()].insert(&*DI);
+
+ unsigned DefIdx = LIs->getInstructionIndex(&*DI);
+ DefIdx = LiveIntervals::getDefIndex(DefIdx);
+
+ VNInfo* NewVN = LI->getNextValue(DefIdx, 0, Alloc);
+
+ // If the def is a move, set the copy field.
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (TII->isMoveInstr(*DI, SrcReg, DstReg, SrcSubIdx, DstSubIdx))
+ if (DstReg == LI->reg)
+ NewVN->copy = &*DI;
+
+ NewVNs[&*DI] = NewVN;
+ }
+
+ // Cache uses as a separate pass from actually processing them.
+ for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(LI->reg),
+ UE = MRI->use_end(); UI != UE; ++UI)
+ Uses[(*UI).getParent()].insert(&*UI);
+
+ // Now, actually process every use and use a phi construction algorithm
+ // to walk from it to its reaching definitions, building VNInfos along
+ // the way.
+ DenseMap<MachineBasicBlock*, VNInfo*> LiveOut;
+ DenseMap<MachineBasicBlock*, VNInfo*> Phis;
+ SmallPtrSet<MachineInstr*, 4> Visited;
+ for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(LI->reg),
+ UE = MRI->use_end(); UI != UE; ++UI) {
+ PerformPHIConstruction(&*UI, UI->getParent(), LI, Visited, Defs,
+ Uses, NewVNs, LiveOut, Phis, true, true);
+ }
+
+ // Add ranges for dead defs
+ for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(LI->reg),
+ DE = MRI->def_end(); DI != DE; ++DI) {
+ unsigned DefIdx = LIs->getInstructionIndex(&*DI);
+ DefIdx = LiveIntervals::getDefIndex(DefIdx);
+
+ if (LI->liveAt(DefIdx)) continue;
+
+ VNInfo* DeadVN = NewVNs[&*DI];
+ LI->addRange(LiveRange(DefIdx, DefIdx+1, DeadVN));
+ LI->addKill(DeadVN, DefIdx);
+ }
+}
+
+/// RenumberValno - Split the given valno out into a new vreg, allowing it to
+/// be allocated to a different register. This function creates a new vreg,
+/// copies the valno and its live ranges over to the new vreg's interval,
+/// removes them from the old interval, and rewrites all uses and defs of
+/// the original reg to the new vreg within those ranges.
+void PreAllocSplitting::RenumberValno(VNInfo* VN) {
+ SmallVector<VNInfo*, 4> Stack;
+ SmallVector<VNInfo*, 4> VNsToCopy;
+ Stack.push_back(VN);
+
+ // Walk through and copy the valno we care about, and any other valnos
+ // that are two-address redefinitions of the one we care about. These
+ // will need to be rewritten as well. We also check for safety of the
+ // renumbering here, by making sure that none of the valnos involved has
+ // PHI kills.
+ while (!Stack.empty()) {
+ VNInfo* OldVN = Stack.back();
+ Stack.pop_back();
+
+ // Bail out if we ever encounter a valno that has a PHI kill. We can't
+ // renumber these.
+ if (OldVN->hasPHIKill) return;
+
+ VNsToCopy.push_back(OldVN);
+
+ // Locate two-address redefinitions
+ for (SmallVector<unsigned, 4>::iterator KI = OldVN->kills.begin(),
+ KE = OldVN->kills.end(); KI != KE; ++KI) {
+ MachineInstr* MI = LIs->getInstructionFromIndex(*KI);
+ unsigned DefIdx = MI->findRegisterDefOperandIdx(CurrLI->reg);
+ if (DefIdx == ~0U) continue;
+ if (MI->isRegTiedToUseOperand(DefIdx)) {
+ VNInfo* NextVN =
+ CurrLI->findDefinedVNInfo(LiveIntervals::getDefIndex(*KI));
+ if (NextVN == OldVN) continue;
+ Stack.push_back(NextVN);
+ }
+ }
+ }
+
+ // Create the new vreg
+ unsigned NewVReg = MRI->createVirtualRegister(MRI->getRegClass(CurrLI->reg));
+
+ // Create the new live interval
+ LiveInterval& NewLI = LIs->getOrCreateInterval(NewVReg);
+
+ for (SmallVector<VNInfo*, 4>::iterator OI = VNsToCopy.begin(), OE =
+ VNsToCopy.end(); OI != OE; ++OI) {
+ VNInfo* OldVN = *OI;
+
+ // Copy the valno over
+ VNInfo* NewVN = NewLI.getNextValue(OldVN->def, OldVN->copy,
+ LIs->getVNInfoAllocator());
+ NewLI.copyValNumInfo(NewVN, OldVN);
+ NewLI.MergeValueInAsValue(*CurrLI, OldVN, NewVN);
+
+ // Remove the valno from the old interval
+ CurrLI->removeValNo(OldVN);
+ }
+
+ // Rewrite defs and uses. This is done in two stages to avoid invalidating
+ // the reg_iterator.
+ SmallVector<std::pair<MachineInstr*, unsigned>, 8> OpsToChange;
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(CurrLI->reg),
+ E = MRI->reg_end(); I != E; ++I) {
+ MachineOperand& MO = I.getOperand();
+ unsigned InstrIdx = LIs->getInstructionIndex(&*I);
+
+ if ((MO.isUse() && NewLI.liveAt(LiveIntervals::getUseIndex(InstrIdx))) ||
+ (MO.isDef() && NewLI.liveAt(LiveIntervals::getDefIndex(InstrIdx))))
+ OpsToChange.push_back(std::make_pair(&*I, I.getOperandNo()));
+ }
+
+ for (SmallVector<std::pair<MachineInstr*, unsigned>, 8>::iterator I =
+ OpsToChange.begin(), E = OpsToChange.end(); I != E; ++I) {
+ MachineInstr* Inst = I->first;
+ unsigned OpIdx = I->second;
+ MachineOperand& MO = Inst->getOperand(OpIdx);
+ MO.setReg(NewVReg);
+ }
+
+ // Grow the VirtRegMap, since we've created a new vreg.
+ VRM->grow();
+
+ // The renumbered vreg shares a stack slot with the old register.
+ if (IntervalSSMap.count(CurrLI->reg))
+ IntervalSSMap[NewVReg] = IntervalSSMap[CurrLI->reg];
+
+ NumRenumbers++;
+}
+
+bool PreAllocSplitting::Rematerialize(unsigned vreg, VNInfo* ValNo,
+ MachineInstr* DefMI,
+ MachineBasicBlock::iterator RestorePt,
+ unsigned RestoreIdx,
+ SmallPtrSet<MachineInstr*, 4>& RefsInMBB) {
+ MachineBasicBlock& MBB = *RestorePt->getParent();
+
+ MachineBasicBlock::iterator KillPt = BarrierMBB->end();
+ unsigned KillIdx = 0;
+ if (ValNo->def == ~0U || DefMI->getParent() == BarrierMBB)
+ KillPt = findSpillPoint(BarrierMBB, Barrier, NULL, RefsInMBB, KillIdx);
+ else
+ KillPt = findNextEmptySlot(DefMI->getParent(), DefMI, KillIdx);
+
+ if (KillPt == DefMI->getParent()->end())
+ return false;
+
+ TII->reMaterialize(MBB, RestorePt, vreg, DefMI);
+ LIs->InsertMachineInstrInMaps(prior(RestorePt), RestoreIdx);
+
+ ReconstructLiveInterval(CurrLI);
+ unsigned RematIdx = LIs->getInstructionIndex(prior(RestorePt));
+ RematIdx = LiveIntervals::getDefIndex(RematIdx);
+ RenumberValno(CurrLI->findDefinedVNInfo(RematIdx));
+
+ ++NumSplits;
+ ++NumRemats;
+ return true;
+}
+
+MachineInstr* PreAllocSplitting::FoldSpill(unsigned vreg,
+ const TargetRegisterClass* RC,
+ MachineInstr* DefMI,
+ MachineInstr* Barrier,
+ MachineBasicBlock* MBB,
+ int& SS,
+ SmallPtrSet<MachineInstr*, 4>& RefsInMBB) {
+ MachineBasicBlock::iterator Pt = MBB->begin();
+
+ // Bail if there are no references in this block to fold the spill into.
+ if (RefsInMBB.empty())
+ return 0;
+
+ MachineBasicBlock::iterator FoldPt = Barrier;
+ while (&*FoldPt != DefMI && FoldPt != MBB->begin() &&
+ !RefsInMBB.count(FoldPt))
+ --FoldPt;
+
+ int OpIdx = FoldPt->findRegisterDefOperandIdx(vreg, false);
+ if (OpIdx == -1)
+ return 0;
+
+ SmallVector<unsigned, 1> Ops;
+ Ops.push_back(OpIdx);
+
+ if (!TII->canFoldMemoryOperand(FoldPt, Ops))
+ return 0;
+
+ DenseMap<unsigned, int>::iterator I = IntervalSSMap.find(vreg);
+ if (I != IntervalSSMap.end()) {
+ SS = I->second;
+ } else {
+ SS = MFI->CreateStackObject(RC->getSize(), RC->getAlignment());
+ }
+
+ MachineInstr* FMI = TII->foldMemoryOperand(*MBB->getParent(),
+ FoldPt, Ops, SS);
+
+ if (FMI) {
+ LIs->ReplaceMachineInstrInMaps(FoldPt, FMI);
+ FMI = MBB->insert(MBB->erase(FoldPt), FMI);
+ ++NumFolds;
+
+ IntervalSSMap[vreg] = SS;
+ CurrSLI = &LSs->getOrCreateInterval(SS, RC);
+ if (CurrSLI->hasAtLeastOneValue())
+ CurrSValNo = CurrSLI->getValNumInfo(0);
+ else
+ CurrSValNo = CurrSLI->getNextValue(~0U, 0, LSs->getVNInfoAllocator());
+ }
+
+ return FMI;
+}
+
+MachineInstr* PreAllocSplitting::FoldRestore(unsigned vreg,
+ const TargetRegisterClass* RC,
+ MachineInstr* Barrier,
+ MachineBasicBlock* MBB,
+ int SS,
+ SmallPtrSet<MachineInstr*, 4>& RefsInMBB) {
+ if ((int)RestoreFoldLimit != -1 && RestoreFoldLimit == (int)NumRestoreFolds)
+ return 0;
+
+ // Bail if there are no references in this block to fold the restore into.
+ if (RefsInMBB.empty())
+ return 0;
+
+ // Can't fold a restore between a call stack setup and teardown.
+ MachineBasicBlock::iterator FoldPt = Barrier;
+
+ // Advance from barrier to call frame teardown.
+ while (FoldPt != MBB->getFirstTerminator() &&
+ FoldPt->getOpcode() != TRI->getCallFrameDestroyOpcode()) {
+ if (RefsInMBB.count(FoldPt))
+ return 0;
+
+ ++FoldPt;
+ }
+
+ if (FoldPt == MBB->getFirstTerminator())
+ return 0;
+ else
+ ++FoldPt;
+
+ // Now find the restore point.
+ while (FoldPt != MBB->getFirstTerminator() && !RefsInMBB.count(FoldPt)) {
+ if (FoldPt->getOpcode() == TRI->getCallFrameSetupOpcode()) {
+ while (FoldPt != MBB->getFirstTerminator() &&
+ FoldPt->getOpcode() != TRI->getCallFrameDestroyOpcode()) {
+ if (RefsInMBB.count(FoldPt))
+ return 0;
+
+ ++FoldPt;
+ }
+
+ if (FoldPt == MBB->getFirstTerminator())
+ return 0;
+ }
+
+ ++FoldPt;
+ }
+
+ if (FoldPt == MBB->getFirstTerminator())
+ return 0;
+
+ int OpIdx = FoldPt->findRegisterUseOperandIdx(vreg, true);
+ if (OpIdx == -1)
+ return 0;
+
+ SmallVector<unsigned, 1> Ops;
+ Ops.push_back(OpIdx);
+
+ if (!TII->canFoldMemoryOperand(FoldPt, Ops))
+ return 0;
+
+ MachineInstr* FMI = TII->foldMemoryOperand(*MBB->getParent(),
+ FoldPt, Ops, SS);
+
+ if (FMI) {
+ LIs->ReplaceMachineInstrInMaps(FoldPt, FMI);
+ FMI = MBB->insert(MBB->erase(FoldPt), FMI);
+ ++NumRestoreFolds;
+ }
+
+ return FMI;
+}
+
+/// SplitRegLiveInterval - Split (spill and restore) the given live interval
+/// so that it does not cross the barrier that's being processed. Shrink wrap
+/// (minimize) the live interval to the last uses.
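+/// The split first finds a restore point after the barrier and tries to
+/// rematerialize the value there; otherwise it inserts (or folds) a spill
+/// after the def or before the barrier and a restore after the barrier, then
+/// reconstructs and renumbers the live interval.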
+bool PreAllocSplitting::SplitRegLiveInterval(LiveInterval *LI) {
+ CurrLI = LI;
+
+ // Find the live range where the current interval crosses the barrier.
+ LiveInterval::iterator LR =
+ CurrLI->FindLiveRangeContaining(LIs->getUseIndex(BarrierIdx));
+ VNInfo *ValNo = LR->valno;
+
+ if (ValNo->def == ~1U) {
+ // Defined by a dead def? How can this be?
+ assert(0 && "Val# is defined by a dead def?");
+ abort();
+ }
+
+ MachineInstr *DefMI = (ValNo->def != ~0U)
+ ? LIs->getInstructionFromIndex(ValNo->def) : NULL;
+
+ // If this would create a new join point, do not split.
+ if (DefMI && createsNewJoin(LR, DefMI->getParent(), Barrier->getParent()))
+ return false;
+
+ // Find all references in the barrier mbb.
+ SmallPtrSet<MachineInstr*, 4> RefsInMBB;
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(CurrLI->reg),
+ E = MRI->reg_end(); I != E; ++I) {
+ MachineInstr *RefMI = &*I;
+ if (RefMI->getParent() == BarrierMBB)
+ RefsInMBB.insert(RefMI);
+ }
+
+ // Find a point to restore the value after the barrier.
+ unsigned RestoreIndex = 0;
+ MachineBasicBlock::iterator RestorePt =
+ findRestorePoint(BarrierMBB, Barrier, LR->end, RefsInMBB, RestoreIndex);
+ if (RestorePt == BarrierMBB->end())
+ return false;
+
+ if (DefMI && LIs->isReMaterializable(*LI, ValNo, DefMI))
+ if (Rematerialize(LI->reg, ValNo, DefMI, RestorePt,
+ RestoreIndex, RefsInMBB))
+ return true;
+
+ // Add a spill either before the barrier or after the definition.
+ MachineBasicBlock *DefMBB = DefMI ? DefMI->getParent() : NULL;
+ const TargetRegisterClass *RC = MRI->getRegClass(CurrLI->reg);
+ unsigned SpillIndex = 0;
+ MachineInstr *SpillMI = NULL;
+ int SS = -1;
+ if (ValNo->def == ~0U) {
+ // If it's defined by a phi, we must split just before the barrier.
+ if ((SpillMI = FoldSpill(LI->reg, RC, 0, Barrier,
+ BarrierMBB, SS, RefsInMBB))) {
+ SpillIndex = LIs->getInstructionIndex(SpillMI);
+ } else {
+ MachineBasicBlock::iterator SpillPt =
+ findSpillPoint(BarrierMBB, Barrier, NULL, RefsInMBB, SpillIndex);
+ if (SpillPt == BarrierMBB->begin())
+ return false; // No gap to insert spill.
+
+ // Add spill.
+ SS = CreateSpillStackSlot(CurrLI->reg, RC);
+ TII->storeRegToStackSlot(*BarrierMBB, SpillPt, CurrLI->reg, true, SS, RC);
+ SpillMI = prior(SpillPt);
+ LIs->InsertMachineInstrInMaps(SpillMI, SpillIndex);
+ }
+ } else if (!IsAvailableInStack(DefMBB, CurrLI->reg, ValNo->def,
+ RestoreIndex, SpillIndex, SS)) {
+ // If it's already split, just restore the value. There is no need to spill
+ // the def again.
+ if (!DefMI)
+ return false; // Def is dead. Do nothing.
+
+ if ((SpillMI = FoldSpill(LI->reg, RC, DefMI, Barrier,
+ BarrierMBB, SS, RefsInMBB))) {
+ SpillIndex = LIs->getInstructionIndex(SpillMI);
+ } else {
+ // Check if it's possible to insert a spill after the def MI.
+ MachineBasicBlock::iterator SpillPt;
+ if (DefMBB == BarrierMBB) {
+ // Add spill after the def and the last use before the barrier.
+ SpillPt = findSpillPoint(BarrierMBB, Barrier, DefMI,
+ RefsInMBB, SpillIndex);
+ if (SpillPt == DefMBB->begin())
+ return false; // No gap to insert spill.
+ } else {
+ SpillPt = findNextEmptySlot(DefMBB, DefMI, SpillIndex);
+ if (SpillPt == DefMBB->end())
+ return false; // No gap to insert spill.
+ }
+ // Add spill. The store instruction kills the register if def is before
+ // the barrier in the barrier block.
+ SS = CreateSpillStackSlot(CurrLI->reg, RC);
+ TII->storeRegToStackSlot(*DefMBB, SpillPt, CurrLI->reg,
+ DefMBB == BarrierMBB, SS, RC);
+ SpillMI = prior(SpillPt);
+ LIs->InsertMachineInstrInMaps(SpillMI, SpillIndex);
+ }
+ }
+
+ // Remember def instruction index to spill index mapping.
+ if (DefMI && SpillMI)
+ Def2SpillMap[ValNo->def] = SpillIndex;
+
+ // Add restore.
+ bool FoldedRestore = false;
+ if (MachineInstr* LMI = FoldRestore(CurrLI->reg, RC, Barrier,
+ BarrierMBB, SS, RefsInMBB)) {
+ RestorePt = LMI;
+ RestoreIndex = LIs->getInstructionIndex(RestorePt);
+ FoldedRestore = true;
+ } else {
+ TII->loadRegFromStackSlot(*BarrierMBB, RestorePt, CurrLI->reg, SS, RC);
+ MachineInstr *LoadMI = prior(RestorePt);
+ LIs->InsertMachineInstrInMaps(LoadMI, RestoreIndex);
+ }
+
+ // Update spill stack slot live interval.
+ UpdateSpillSlotInterval(ValNo, LIs->getUseIndex(SpillIndex)+1,
+ LIs->getDefIndex(RestoreIndex));
+
+ ReconstructLiveInterval(CurrLI);
+
+ if (!FoldedRestore) {
+ unsigned RestoreIdx = LIs->getInstructionIndex(prior(RestorePt));
+ RestoreIdx = LiveIntervals::getDefIndex(RestoreIdx);
+ RenumberValno(CurrLI->findDefinedVNInfo(RestoreIdx));
+ }
+
+ ++NumSplits;
+ return true;
+}
+
+/// SplitRegLiveIntervals - Split all register live intervals that cross the
+/// barrier that's being processed.
+bool
+PreAllocSplitting::SplitRegLiveIntervals(const TargetRegisterClass **RCs,
+ SmallPtrSet<LiveInterval*, 8>& Split) {
+ // First find all the virtual registers whose live intervals are intercepted
+ // by the current barrier.
+ SmallVector<LiveInterval*, 8> Intervals;
+ for (const TargetRegisterClass **RC = RCs; *RC; ++RC) {
+ // FIXME: If it's not safe to move any instruction that defines the barrier
+ // register class, then it means there are some special dependencies which
+ // codegen is not modelling. Ignore these barriers for now.
+ if (!TII->isSafeToMoveRegClassDefs(*RC))
+ continue;
+ std::vector<unsigned> &VRs = MRI->getRegClassVirtRegs(*RC);
+ for (unsigned i = 0, e = VRs.size(); i != e; ++i) {
+ unsigned Reg = VRs[i];
+ if (!LIs->hasInterval(Reg))
+ continue;
+ LiveInterval *LI = &LIs->getInterval(Reg);
+ if (LI->liveAt(BarrierIdx) && !Barrier->readsRegister(Reg))
+ // Virtual register live interval is intercepted by the barrier. We
+ // should split and shrink wrap its interval if possible.
+ Intervals.push_back(LI);
+ }
+ }
+
+ // Process the affected live intervals.
+ bool Change = false;
+ while (!Intervals.empty()) {
+ if (PreSplitLimit != -1 && (int)NumSplits == PreSplitLimit)
+ break;
+ LiveInterval *LI = Intervals.back();
+ Intervals.pop_back();
+ bool result = SplitRegLiveInterval(LI);
+ if (result) Split.insert(LI);
+ Change |= result;
+ }
+
+ return Change;
+}
+
+unsigned PreAllocSplitting::getNumberOfNonSpills(
+ SmallPtrSet<MachineInstr*, 4>& MIs,
+ unsigned Reg, int FrameIndex,
+ bool& FeedsTwoAddr) {
+ unsigned NonSpills = 0;
+ for (SmallPtrSet<MachineInstr*, 4>::iterator UI = MIs.begin(), UE = MIs.end();
+ UI != UE; ++UI) {
+ int StoreFrameIndex;
+ unsigned StoreVReg = TII->isStoreToStackSlot(*UI, StoreFrameIndex);
+ if (StoreVReg != Reg || StoreFrameIndex != FrameIndex)
+ NonSpills++;
+
+ int DefIdx = (*UI)->findRegisterDefOperandIdx(Reg);
+ if (DefIdx != -1 && (*UI)->isRegTiedToUseOperand(DefIdx))
+ FeedsTwoAddr = true;
+ }
+
+ return NonSpills;
+}
+
+/// removeDeadSpills - After doing splitting, filter through all intervals we've
+/// split, and see if any of the spills are unnecessary. If so, remove them.
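+/// Two patterns are handled: a splitter-inserted load whose value has no
+/// remaining uses (plain DCE), and a load with exactly one non-store use,
+/// which may be folded into that use as a memory operand (load-use-store).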
+bool PreAllocSplitting::removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split) {
+ bool changed = false;
+
+ // Walk over all of the live intervals that were touched by the splitter,
+ // and see if we can do any DCE and/or folding.
+ for (SmallPtrSet<LiveInterval*, 8>::iterator LI = split.begin(),
+ LE = split.end(); LI != LE; ++LI) {
+ DenseMap<VNInfo*, SmallPtrSet<MachineInstr*, 4> > VNUseCount;
+
+ // First, collect all the uses of the vreg, and sort them by their
+ // reaching definition (VNInfo).
+ for (MachineRegisterInfo::use_iterator UI = MRI->use_begin((*LI)->reg),
+ UE = MRI->use_end(); UI != UE; ++UI) {
+ unsigned index = LIs->getInstructionIndex(&*UI);
+ index = LiveIntervals::getUseIndex(index);
+
+ const LiveRange* LR = (*LI)->getLiveRangeContaining(index);
+ VNUseCount[LR->valno].insert(&*UI);
+ }
+
+ // Now, take the definitions (VNInfo's) one at a time and try to DCE
+ // and/or fold them away.
+ for (LiveInterval::vni_iterator VI = (*LI)->vni_begin(),
+ VE = (*LI)->vni_end(); VI != VE; ++VI) {
+
+ if (DeadSplitLimit != -1 && (int)NumDeadSpills == DeadSplitLimit)
+ return changed;
+
+ VNInfo* CurrVN = *VI;
+
+ // We don't currently try to handle definitions with PHI kills, because
+ // it would involve processing more than one VNInfo at once.
+ if (CurrVN->hasPHIKill) continue;
+
+ // We also don't try to handle the results of PHI joins, since there's
+ // no defining instruction to analyze.
+ unsigned DefIdx = CurrVN->def;
+ if (DefIdx == ~0U || DefIdx == ~1U) continue;
+
+ // We're only interested in eliminating cruft introduced by the splitter,
+ // which is of the form load-use or load-use-store. First, check that the
+ // definition is a load, and remember what stack slot we loaded it from.
+ MachineInstr* DefMI = LIs->getInstructionFromIndex(DefIdx);
+ int FrameIndex;
+ if (!TII->isLoadFromStackSlot(DefMI, FrameIndex)) continue;
+
+ // If the definition has no uses at all, just DCE it.
+ if (VNUseCount[CurrVN].size() == 0) {
+ LIs->RemoveMachineInstrFromMaps(DefMI);
+ (*LI)->removeValNo(CurrVN);
+ DefMI->eraseFromParent();
+ VNUseCount.erase(CurrVN);
+ NumDeadSpills++;
+ changed = true;
+ continue;
+ }
+
+ // Second, get the number of non-store uses of the definition, as well as
+ // a flag indicating whether it feeds into a later two-address definition.
+ bool FeedsTwoAddr = false;
+ unsigned NonSpillCount = getNumberOfNonSpills(VNUseCount[CurrVN],
+ (*LI)->reg, FrameIndex,
+ FeedsTwoAddr);
+
+ // If there's one non-store use and it doesn't feed a two-addr, then
+ // this is a load-use-store case that we can try to fold.
+ if (NonSpillCount == 1 && !FeedsTwoAddr) {
+ // Start by finding the non-store use MachineInstr.
+ SmallPtrSet<MachineInstr*, 4>::iterator UI = VNUseCount[CurrVN].begin();
+ int StoreFrameIndex;
+ unsigned StoreVReg = TII->isStoreToStackSlot(*UI, StoreFrameIndex);
+ while (UI != VNUseCount[CurrVN].end() &&
+ (StoreVReg == (*LI)->reg && StoreFrameIndex == FrameIndex)) {
+ ++UI;
+ if (UI != VNUseCount[CurrVN].end())
+ StoreVReg = TII->isStoreToStackSlot(*UI, StoreFrameIndex);
+ }
+ if (UI == VNUseCount[CurrVN].end()) continue;
+
+ MachineInstr* use = *UI;
+
+ // Attempt to fold it away!
+ int OpIdx = use->findRegisterUseOperandIdx((*LI)->reg, false);
+ if (OpIdx == -1) continue;
+ SmallVector<unsigned, 1> Ops;
+ Ops.push_back(OpIdx);
+ if (!TII->canFoldMemoryOperand(use, Ops)) continue;
+
+ MachineInstr* NewMI =
+ TII->foldMemoryOperand(*use->getParent()->getParent(),
+ use, Ops, FrameIndex);
+
+ if (!NewMI) continue;
+
+ // Update relevant analyses.
+ LIs->RemoveMachineInstrFromMaps(DefMI);
+ LIs->ReplaceMachineInstrInMaps(use, NewMI);
+ (*LI)->removeValNo(CurrVN);
+
+ DefMI->eraseFromParent();
+ MachineBasicBlock* MBB = use->getParent();
+ NewMI = MBB->insert(MBB->erase(use), NewMI);
+ VNUseCount[CurrVN].erase(use);
+
+ // Remove deleted instructions. Note that we need to remove them from
+ // the VNInfo->use map as well, just to be safe.
+ for (SmallPtrSet<MachineInstr*, 4>::iterator II =
+ VNUseCount[CurrVN].begin(), IE = VNUseCount[CurrVN].end();
+ II != IE; ++II) {
+ for (DenseMap<VNInfo*, SmallPtrSet<MachineInstr*, 4> >::iterator
+ VNI = VNUseCount.begin(), VNE = VNUseCount.end(); VNI != VNE;
+ ++VNI)
+ if (VNI->first != CurrVN)
+ VNI->second.erase(*II);
+ LIs->RemoveMachineInstrFromMaps(*II);
+ (*II)->eraseFromParent();
+ }
+
+ VNUseCount.erase(CurrVN);
+
+ for (DenseMap<VNInfo*, SmallPtrSet<MachineInstr*, 4> >::iterator
+ VI = VNUseCount.begin(), VE = VNUseCount.end(); VI != VE; ++VI)
+ if (VI->second.erase(use))
+ VI->second.insert(NewMI);
+
+ NumDeadSpills++;
+ changed = true;
+ continue;
+ }
+
+ // If there's more than one non-store instruction, we can't profitably
+ // fold it, so bail.
+ if (NonSpillCount) continue;
+
+ // Otherwise, this is a load-store case, so DCE them.
+ for (SmallPtrSet<MachineInstr*, 4>::iterator UI =
+ VNUseCount[CurrVN].begin(), UE = VNUseCount[CurrVN].end();
+ UI != UE; ++UI) {
+ LIs->RemoveMachineInstrFromMaps(*UI);
+ (*UI)->eraseFromParent();
+ }
+
+ VNUseCount.erase(CurrVN);
+
+ LIs->RemoveMachineInstrFromMaps(DefMI);
+ (*LI)->removeValNo(CurrVN);
+ DefMI->eraseFromParent();
+ NumDeadSpills++;
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
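+/// createsNewJoin - Conservatively determine whether splitting the live range
+/// at the barrier would introduce a new join point for the value, i.e. a CFG
+/// merge where distinct values of the interval would meet. If it would, the
+/// caller declines to split.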
+bool PreAllocSplitting::createsNewJoin(LiveRange* LR,
+ MachineBasicBlock* DefMBB,
+ MachineBasicBlock* BarrierMBB) {
+ if (DefMBB == BarrierMBB)
+ return false;
+
+ if (LR->valno->hasPHIKill)
+ return false;
+
+ unsigned MBBEnd = LIs->getMBBEndIdx(BarrierMBB);
+ if (LR->end < MBBEnd)
+ return false;
+
+ MachineLoopInfo& MLI = getAnalysis<MachineLoopInfo>();
+ if (MLI.getLoopFor(DefMBB) != MLI.getLoopFor(BarrierMBB))
+ return true;
+
+ MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>();
+ SmallPtrSet<MachineBasicBlock*, 4> Visited;
+ typedef std::pair<MachineBasicBlock*,
+ MachineBasicBlock::succ_iterator> ItPair;
+ SmallVector<ItPair, 4> Stack;
+ Stack.push_back(std::make_pair(BarrierMBB, BarrierMBB->succ_begin()));
+
+ while (!Stack.empty()) {
+ ItPair P = Stack.back();
+ Stack.pop_back();
+
+ MachineBasicBlock* PredMBB = P.first;
+ MachineBasicBlock::succ_iterator S = P.second;
+
+ if (S == PredMBB->succ_end())
+ continue;
+ else if (Visited.count(*S)) {
+ Stack.push_back(std::make_pair(PredMBB, ++S));
+ continue;
+ } else
+ Stack.push_back(std::make_pair(PredMBB, S+1));
+
+ MachineBasicBlock* MBB = *S;
+ Visited.insert(MBB);
+
+ if (MBB == BarrierMBB)
+ return true;
+
+ MachineDomTreeNode* DefMDTN = MDT.getNode(DefMBB);
+ MachineDomTreeNode* BarrierMDTN = MDT.getNode(BarrierMBB);
+ MachineDomTreeNode* MDTN = MDT.getNode(MBB)->getIDom();
+ while (MDTN) {
+ if (MDTN == DefMDTN)
+ return true;
+ else if (MDTN == BarrierMDTN)
+ break;
+ MDTN = MDTN->getIDom();
+ }
+
+ MBBEnd = LIs->getMBBEndIdx(MBB);
+ if (LR->end > MBBEnd)
+ Stack.push_back(std::make_pair(MBB, MBB->succ_begin()));
+ }
+
+ return false;
+}
+
+
+bool PreAllocSplitting::runOnMachineFunction(MachineFunction &MF) {
+ CurrMF = &MF;
+ TM = &MF.getTarget();
+ TRI = TM->getRegisterInfo();
+ TII = TM->getInstrInfo();
+ MFI = MF.getFrameInfo();
+ MRI = &MF.getRegInfo();
+ LIs = &getAnalysis<LiveIntervals>();
+ LSs = &getAnalysis<LiveStacks>();
+ VRM = &getAnalysis<VirtRegMap>();
+
+ bool MadeChange = false;
+
+ // Make sure blocks are numbered in order.
+ MF.RenumberBlocks();
+
+ MachineBasicBlock *Entry = MF.begin();
+ SmallPtrSet<MachineBasicBlock*,16> Visited;
+
+ SmallPtrSet<LiveInterval*, 8> Split;
+
+ for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> >
+ DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
+ DFI != E; ++DFI) {
+ BarrierMBB = *DFI;
+ for (MachineBasicBlock::iterator I = BarrierMBB->begin(),
+ E = BarrierMBB->end(); I != E; ++I) {
+ Barrier = &*I;
+ const TargetRegisterClass **BarrierRCs =
+ Barrier->getDesc().getRegClassBarriers();
+ if (!BarrierRCs)
+ continue;
+ BarrierIdx = LIs->getInstructionIndex(Barrier);
+ MadeChange |= SplitRegLiveIntervals(BarrierRCs, Split);
+ }
+ }
+
+ MadeChange |= removeDeadSpills(Split);
+
+ return MadeChange;
+}
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
new file mode 100644
index 0000000..9e7ad67
--- /dev/null
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -0,0 +1,679 @@
+//===-- PrologEpilogInserter.cpp - Insert Prolog/Epilog code in function --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation. After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This pass provides an optional shrink wrapping variant of prolog/epilog
+// insertion, enabled via --shrink-wrap. See ShrinkWrapping.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PrologEpilogInserter.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/STLExtras.h"
+#include <climits>
+
+using namespace llvm;
+
+char PEI::ID = 0;
+
+static RegisterPass<PEI>
+X("prologepilog", "Prologue/Epilogue Insertion");
+
+/// createPrologEpilogCodeInserter - This function returns a pass that inserts
+/// prolog and epilog code, and eliminates abstract frame references.
+///
+FunctionPass *llvm::createPrologEpilogCodeInserter() { return new PEI(); }
+
+/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+/// frame indexes with appropriate references.
+///
+bool PEI::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+ RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : NULL;
+
+ // Get MachineModuleInfo so that we can track the construction of the
+ // frame.
+ if (MachineModuleInfo *MMI = getAnalysisIfAvailable<MachineModuleInfo>())
+ Fn.getFrameInfo()->setMachineModuleInfo(MMI);
+
+ // Allow the target machine to make some adjustments to the function
+ // e.g. UsedPhysRegs before calculateCalleeSavedRegisters.
+ TRI->processFunctionBeforeCalleeSavedScan(Fn, RS);
+
+ // Scan the function for modified callee saved registers and insert spill
+ // code for any callee saved registers that are modified. Also calculate
+ // the MaxCallFrameSize and HasCalls variables for the function's frame
+ // information, and eliminate call frame pseudo instructions.
+ calculateCalleeSavedRegisters(Fn);
+
+ // Determine placement of CSR spill/restore code:
+ // - with shrink wrapping, place spills and restores to tightly
+ // enclose regions in the Machine CFG of the function where
+ // they are used. Without shrink wrapping
+ // - default (no shrink wrapping), place all spills in the
+ // entry block, all restores in return blocks.
+ placeCSRSpillsAndRestores(Fn);
+
+ // Add the code to save and restore the callee saved registers
+ insertCSRSpillsAndRestores(Fn);
+
+ // Allow the target machine to make final modifications to the function
+ // before the frame layout is finalized.
+ TRI->processFunctionBeforeFrameFinalized(Fn);
+
+ // Calculate actual frame offsets for all abstract stack objects...
+ calculateFrameObjectOffsets(Fn);
+
+ // Add prolog and epilog code to the function. This function is required
+ // to align the stack frame as necessary for any stack variables or
+ // called functions. Because of this, calculateCalleeSavedRegisters
+ // must be called before this function in order to set the HasCalls
+ // and MaxCallFrameSize variables.
+ insertPrologEpilogCode(Fn);
+
+ // Replace all MO_FrameIndex operands with physical register references
+ // and actual offsets.
+ //
+ replaceFrameIndices(Fn);
+
+ delete RS;
+ clearAllSets();
+ return true;
+}
+
+#if 0
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ if (ShrinkWrapping || ShrinkWrapFunc != "") {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ }
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+#endif
+
+/// calculateCalleeSavedRegisters - Scan the function for modified callee saved
+/// registers. Also calculate the MaxCallFrameSize and HasCalls variables for
+/// the function's frame information, and eliminate call frame pseudo
+/// instructions.
+///
+void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) {
+ const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+ const TargetFrameInfo *TFI = Fn.getTarget().getFrameInfo();
+
+ // Get the callee saved register list...
+ const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(&Fn);
+
+ // Get the function call frame set-up and tear-down instruction opcode
+ int FrameSetupOpcode = RegInfo->getCallFrameSetupOpcode();
+ int FrameDestroyOpcode = RegInfo->getCallFrameDestroyOpcode();
+
+  // These are used to keep track of the callee-save area. Initialize them.
+ MinCSFrameIndex = INT_MAX;
+ MaxCSFrameIndex = 0;
+
+ // Early exit for targets which have no callee saved registers and no call
+ // frame setup/destroy pseudo instructions.
+ if ((CSRegs == 0 || CSRegs[0] == 0) &&
+ FrameSetupOpcode == -1 && FrameDestroyOpcode == -1)
+ return;
+
+ unsigned MaxCallFrameSize = 0;
+ bool HasCalls = false;
+
+ std::vector<MachineBasicBlock::iterator> FrameSDOps;
+ for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+ if (I->getOpcode() == FrameSetupOpcode ||
+ I->getOpcode() == FrameDestroyOpcode) {
+ assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
+ " instructions should have a single immediate argument!");
+ unsigned Size = I->getOperand(0).getImm();
+ if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
+ HasCalls = true;
+ FrameSDOps.push_back(I);
+ }
+
+ MachineFrameInfo *FFI = Fn.getFrameInfo();
+ FFI->setHasCalls(HasCalls);
+ FFI->setMaxCallFrameSize(MaxCallFrameSize);
+
+ for (unsigned i = 0, e = FrameSDOps.size(); i != e; ++i) {
+ MachineBasicBlock::iterator I = FrameSDOps[i];
+ // If call frames are not being included as part of the stack frame,
+ // and there is no dynamic allocation (therefore referencing frame slots
+ // off sp), leave the pseudo ops alone. We'll eliminate them later.
+ if (RegInfo->hasReservedCallFrame(Fn) || RegInfo->hasFP(Fn))
+ RegInfo->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+ }
+
+ // Now figure out which *callee saved* registers are modified by the current
+ // function, thus needing to be saved and restored in the prolog/epilog.
+ //
+ const TargetRegisterClass* const *CSRegClasses =
+ RegInfo->getCalleeSavedRegClasses(&Fn);
+ std::vector<CalleeSavedInfo> CSI;
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ if (Fn.getRegInfo().isPhysRegUsed(Reg)) {
+ // If the reg is modified, save it!
+ CSI.push_back(CalleeSavedInfo(Reg, CSRegClasses[i]));
+ } else {
+ for (const unsigned *AliasSet = RegInfo->getAliasSet(Reg);
+ *AliasSet; ++AliasSet) { // Check alias registers too.
+ if (Fn.getRegInfo().isPhysRegUsed(*AliasSet)) {
+ CSI.push_back(CalleeSavedInfo(Reg, CSRegClasses[i]));
+ break;
+ }
+ }
+ }
+ }
+
+ if (CSI.empty())
+ return; // Early exit if no callee saved registers are modified!
+
+ unsigned NumFixedSpillSlots;
+ const std::pair<unsigned,int> *FixedSpillSlots =
+ TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+
+ // Now that we know which registers need to be saved and restored, allocate
+ // stack slots for them.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RC = CSI[i].getRegClass();
+
+ // Check to see if this physreg must be spilled to a particular stack slot
+ // on this target.
+ const std::pair<unsigned,int> *FixedSlot = FixedSpillSlots;
+ while (FixedSlot != FixedSpillSlots+NumFixedSpillSlots &&
+ FixedSlot->first != Reg)
+ ++FixedSlot;
+
+ int FrameIdx;
+ if (FixedSlot == FixedSpillSlots+NumFixedSpillSlots) {
+ // Nope, just spill it anywhere convenient.
+ unsigned Align = RC->getAlignment();
+ unsigned StackAlign = TFI->getStackAlignment();
+      // We may not be able to satisfy the desired alignment specification of
+ // the TargetRegisterClass if the stack alignment is smaller.
+ // Use the min.
+ Align = std::min(Align, StackAlign);
+ FrameIdx = FFI->CreateStackObject(RC->getSize(), Align);
+ if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+ if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+ } else {
+ // Spill it to the stack where we must.
+ FrameIdx = FFI->CreateFixedObject(RC->getSize(), FixedSlot->second);
+ }
+ CSI[i].setFrameIdx(FrameIdx);
+ }
+
+ FFI->setCalleeSavedInfo(CSI);
+}
+
+/// insertCSRSpillsAndRestores - Insert spill and restore code for
+/// callee saved registers used in the function, handling shrink wrapping.
+///
+void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
+ // Get callee saved register information.
+ MachineFrameInfo *FFI = Fn.getFrameInfo();
+ const std::vector<CalleeSavedInfo> &CSI = FFI->getCalleeSavedInfo();
+
+ // Early exit if no callee saved registers are modified!
+ if (CSI.empty())
+ return;
+
+ const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
+ MachineBasicBlock::iterator I;
+
+ if (! ShrinkWrapThisFunction) {
+ // Spill using target interface.
+ I = EntryBlock->begin();
+ if (!TII.spillCalleeSavedRegisters(*EntryBlock, I, CSI)) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ // Add the callee-saved register as live-in.
+ // It's killed at the spill.
+ EntryBlock->addLiveIn(CSI[i].getReg());
+
+ // Insert the spill to the stack frame.
+ TII.storeRegToStackSlot(*EntryBlock, I, CSI[i].getReg(), true,
+ CSI[i].getFrameIdx(), CSI[i].getRegClass());
+ }
+ }
+
+ // Restore using target interface.
+ for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
+ MachineBasicBlock* MBB = ReturnBlocks[ri];
+ I = MBB->end(); --I;
+
+ // Skip over all terminator instructions, which are part of the return
+ // sequence.
+ MachineBasicBlock::iterator I2 = I;
+ while (I2 != MBB->begin() && (--I2)->getDesc().isTerminator())
+ I = I2;
+
+ bool AtStart = I == MBB->begin();
+ MachineBasicBlock::iterator BeforeI = I;
+ if (!AtStart)
+ --BeforeI;
+
+ // Restore all registers immediately before the return and any
+      // terminators that precede it.
+ if (!TII.restoreCalleeSavedRegisters(*MBB, I, CSI)) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ TII.loadRegFromStackSlot(*MBB, I, CSI[i].getReg(),
+ CSI[i].getFrameIdx(),
+ CSI[i].getRegClass());
+ assert(I != MBB->begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert
+ // multiple instructions.
+ if (AtStart)
+ I = MBB->begin();
+ else {
+ I = BeforeI;
+ ++I;
+ }
+ }
+ }
+ }
+ return;
+ }
+
+ // Insert spills.
+ std::vector<CalleeSavedInfo> blockCSI;
+ for (CSRegBlockMap::iterator BI = CSRSave.begin(),
+ BE = CSRSave.end(); BI != BE; ++BI) {
+ MachineBasicBlock* MBB = BI->first;
+ CSRegSet save = BI->second;
+
+ if (save.empty())
+ continue;
+
+ blockCSI.clear();
+ for (CSRegSet::iterator RI = save.begin(),
+ RE = save.end(); RI != RE; ++RI) {
+ blockCSI.push_back(CSI[*RI]);
+ }
+ assert(blockCSI.size() > 0 &&
+ "Could not collect callee saved register info");
+
+ I = MBB->begin();
+
+ // When shrink wrapping, use stack slot stores/loads.
+ for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+ // Add the callee-saved register as live-in.
+ // It's killed at the spill.
+ MBB->addLiveIn(blockCSI[i].getReg());
+
+ // Insert the spill to the stack frame.
+ TII.storeRegToStackSlot(*MBB, I, blockCSI[i].getReg(),
+ true,
+ blockCSI[i].getFrameIdx(),
+ blockCSI[i].getRegClass());
+ }
+ }
+
+ for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
+ BE = CSRRestore.end(); BI != BE; ++BI) {
+ MachineBasicBlock* MBB = BI->first;
+ CSRegSet restore = BI->second;
+
+ if (restore.empty())
+ continue;
+
+ blockCSI.clear();
+ for (CSRegSet::iterator RI = restore.begin(),
+ RE = restore.end(); RI != RE; ++RI) {
+ blockCSI.push_back(CSI[*RI]);
+ }
+ assert(blockCSI.size() > 0 &&
+ "Could not find callee saved register info");
+
+ // If MBB is empty and needs restores, insert at the _beginning_.
+ if (MBB->empty()) {
+ I = MBB->begin();
+ } else {
+ I = MBB->end();
+ --I;
+
+ // Skip over all terminator instructions, which are part of the
+ // return sequence.
+ if (! I->getDesc().isTerminator()) {
+ ++I;
+ } else {
+ MachineBasicBlock::iterator I2 = I;
+ while (I2 != MBB->begin() && (--I2)->getDesc().isTerminator())
+ I = I2;
+ }
+ }
+
+ bool AtStart = I == MBB->begin();
+ MachineBasicBlock::iterator BeforeI = I;
+ if (!AtStart)
+ --BeforeI;
+
+ // Restore all registers immediately before the return and any
+    // terminators that precede it.
+ for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+ TII.loadRegFromStackSlot(*MBB, I, blockCSI[i].getReg(),
+ blockCSI[i].getFrameIdx(),
+ blockCSI[i].getRegClass());
+ assert(I != MBB->begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert
+ // multiple instructions.
+ if (AtStart)
+ I = MBB->begin();
+ else {
+ I = BeforeI;
+ ++I;
+ }
+ }
+ }
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *FFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ unsigned &MaxAlign) {
+  // If the stack grows down, we need to add the size to find the lowest
+  // address of the object.
+ if (StackGrowsDown)
+ Offset += FFI->getObjectSize(FrameIdx);
+
+ unsigned Align = FFI->getObjectAlignment(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
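+  // e.g. Offset 13, Align 8: (13 + 8 - 1) / 8 * 8 == 16.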
+
+ if (StackGrowsDown) {
+ FFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+ } else {
+ FFI->setObjectOffset(FrameIdx, Offset);
+ Offset += FFI->getObjectSize(FrameIdx);
+ }
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+///
+void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ const TargetFrameInfo &TFI = *Fn.getTarget().getFrameInfo();
+
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameInfo::StackGrowsDown;
+
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo *FFI = Fn.getFrameInfo();
+
+ unsigned MaxAlign = FFI->getMaxAlignment();
+
+ // Start at the beginning of the local area.
+ // The Offset is the distance from the stack top in the direction
+ // of stack growth -- so it's always nonnegative.
+ int64_t Offset = TFI.getOffsetOfLocalArea();
+ if (StackGrowsDown)
+ Offset = -Offset;
+ assert(Offset >= 0
+ && "Local area offset should be in direction of stack growth");
+
+ // If there are fixed sized objects that are preallocated in the local area,
+ // non-fixed objects can't be allocated right at the start of local area.
+ // We currently don't support filling in holes in between fixed sized
+ // objects, so we adjust 'Offset' to point to the end of last fixed sized
+ // preallocated object.
+ for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+ int64_t FixedOff;
+ if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at the lower address
+      // of the object -- which is given by the offset. For a down-growing
+      // stack the offset is negative, so we negate it to get the distance.
+ FixedOff = -FFI->getObjectOffset(i);
+ } else {
+      // The maximum distance from the stack pointer is at the upper
+ // address of the object.
+ FixedOff = FFI->getObjectOffset(i) + FFI->getObjectSize(i);
+ }
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+
+ // First assign frame offsets to stack objects that are used to spill
+ // callee saved registers.
+ if (StackGrowsDown) {
+ for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) {
+      // If the stack grows down, we need to add the size to find the
+      // lowest address of the object.
+ Offset += FFI->getObjectSize(i);
+
+ unsigned Align = FFI->getObjectAlignment(i);
+ // If the alignment of this object is greater than that of the stack,
+ // then increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+ // Adjust to alignment boundary
+ Offset = (Offset+Align-1)/Align*Align;
+
+ FFI->setObjectOffset(i, -Offset); // Set the computed offset
+ }
+ } else {
+ int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex;
+ for (int i = MaxCSFI; i >= MinCSFI ; --i) {
+ unsigned Align = FFI->getObjectAlignment(i);
+ // If the alignment of this object is greater than that of the stack,
+ // then increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+ // Adjust to alignment boundary
+ Offset = (Offset+Align-1)/Align*Align;
+
+ FFI->setObjectOffset(i, Offset);
+ Offset += FFI->getObjectSize(i);
+ }
+ }
+
+ // Make sure the special register scavenging spill slot is closest to the
+ // frame pointer if a frame pointer is required.
+ const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+ if (RS && RegInfo->hasFP(Fn)) {
+ int SFI = RS->getScavengingFrameIndex();
+ if (SFI >= 0)
+ AdjustStackOffset(FFI, SFI, StackGrowsDown, Offset, MaxAlign);
+ }
+
+ // Make sure that the stack protector comes before the local variables on the
+ // stack.
+ if (FFI->getStackProtectorIndex() >= 0)
+ AdjustStackOffset(FFI, FFI->getStackProtectorIndex(), StackGrowsDown,
+ Offset, MaxAlign);
+
+ // Then assign frame offsets to stack objects that are not used to spill
+ // callee saved registers.
+ for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+ if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+ continue;
+ if (RS && (int)i == RS->getScavengingFrameIndex())
+ continue;
+ if (FFI->isDeadObjectIndex(i))
+ continue;
+ if (FFI->getStackProtectorIndex() == (int)i)
+ continue;
+
+ AdjustStackOffset(FFI, i, StackGrowsDown, Offset, MaxAlign);
+ }
+
+ // Make sure the special register scavenging spill slot is closest to the
+ // stack pointer.
+ if (RS && !RegInfo->hasFP(Fn)) {
+ int SFI = RS->getScavengingFrameIndex();
+ if (SFI >= 0)
+ AdjustStackOffset(FFI, SFI, StackGrowsDown, Offset, MaxAlign);
+ }
+
+ // Round up the size to a multiple of the alignment, but only if there are
+  // calls or allocas in the function. This ensures that any calls to
+  // subroutines have their stack frames suitably aligned.
+ // Also do this if we need runtime alignment of the stack. In this case
+ // offsets will be relative to SP not FP; round up the stack size so this
+ // works.
+ if (!RegInfo->targetHandlesStackFrameRounding() &&
+ (FFI->hasCalls() || FFI->hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(Fn) &&
+ FFI->getObjectIndexEnd() != 0))) {
+ // If we have reserved argument space for call sites in the function
+ // immediately on entry to the current function, count it as part of the
+ // overall stack size.
+ if (RegInfo->hasReservedCallFrame(Fn))
+ Offset += FFI->getMaxCallFrameSize();
+
+ unsigned AlignMask = std::max(TFI.getStackAlignment(),MaxAlign) - 1;
+ Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
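+    // e.g. Offset 52, alignment 16: AlignMask == 15, (52 + 15) & ~15 == 64.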
+ }
+
+ // Update frame info to pretend that this is part of the stack...
+ FFI->setStackSize(Offset+TFI.getOffsetOfLocalArea());
+
+ // Remember the required stack alignment in case targets need it to perform
+ // dynamic stack alignment.
+ FFI->setMaxAlignment(MaxAlign);
+}
+
+
+/// insertPrologEpilogCode - Add prolog and epilog code to the function. (The
+/// callee saved register spill/restore code has already been inserted by
+/// insertCSRSpillsAndRestores at this point.)
+///
+void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
+ const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+
+ // Add prologue to the function...
+ TRI->emitPrologue(Fn);
+
+ // Add epilogue to restore the callee-save registers in each exiting block
+ for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
+    // If the last instruction is a return instruction, add an epilogue
+ if (!I->empty() && I->back().getDesc().isReturn())
+ TRI->emitEpilogue(Fn, *I);
+ }
+}
+
+
+/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
+/// register references and actual offsets.
+///
+void PEI::replaceFrameIndices(MachineFunction &Fn) {
+ if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+
+ const TargetMachine &TM = Fn.getTarget();
+ assert(TM.getRegisterInfo() && "TM::getRegisterInfo() must be implemented!");
+ const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+ const TargetFrameInfo *TFI = TM.getFrameInfo();
+ bool StackGrowsDown =
+ TFI->getStackGrowthDirection() == TargetFrameInfo::StackGrowsDown;
+ int FrameSetupOpcode = TRI.getCallFrameSetupOpcode();
+ int FrameDestroyOpcode = TRI.getCallFrameDestroyOpcode();
+
+ for (MachineFunction::iterator BB = Fn.begin(),
+ E = Fn.end(); BB != E; ++BB) {
+ int SPAdj = 0; // SP offset due to call frame setup / destroy.
+ if (RS) RS->enterBasicBlock(BB);
+
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+ if (I->getOpcode() == TargetInstrInfo::DECLARE) {
+ // Ignore it.
+ ++I;
+ continue;
+ }
+
+ if (I->getOpcode() == FrameSetupOpcode ||
+ I->getOpcode() == FrameDestroyOpcode) {
+ // Remember how much SP has been adjusted to create the call
+ // frame.
+ int Size = I->getOperand(0).getImm();
+
+ if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
+ (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
+ Size = -Size;
+
+ SPAdj += Size;
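+        // SPAdj now records the net SP displacement due to the open call
+        // frame; it is passed to eliminateFrameIndex() below so SP-relative
+        // frame references can be corrected while the frame is open.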
+
+ MachineBasicBlock::iterator PrevI = BB->end();
+ if (I != BB->begin()) PrevI = prior(I);
+ TRI.eliminateCallFramePseudoInstr(Fn, *BB, I);
+
+ // Visit the instructions created by eliminateCallFramePseudoInstr().
+ if (PrevI == BB->end())
+ I = BB->begin(); // The replaced instr was the first in the block.
+ else
+ I = next(PrevI);
+ continue;
+ }
+
+ MachineInstr *MI = I;
+ bool DoIncr = true;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+ if (MI->getOperand(i).isFI()) {
+ // Some instructions (e.g. inline asm instructions) can have
+ // multiple frame indices and/or cause eliminateFrameIndex
+ // to insert more than one instruction. We need the register
+ // scavenger to go through all of these instructions so that
+ // it can update its register information. We keep the
+ // iterator at the point before insertion so that we can
+ // revisit them in full.
+ bool AtBeginning = (I == BB->begin());
+ if (!AtBeginning) --I;
+
+ // If this instruction has a FrameIndex operand, we need to
+ // use that target machine register info object to eliminate
+ // it.
+
+ TRI.eliminateFrameIndex(MI, SPAdj, RS);
+
+ // Reset the iterator if we were at the beginning of the BB.
+ if (AtBeginning) {
+ I = BB->begin();
+ DoIncr = false;
+ }
+
+ MI = 0;
+ break;
+ }
+
+ if (DoIncr && I != BB->end()) ++I;
+
+ // Update register states.
+ if (RS && MI) RS->forward(MI);
+ }
+
+ assert(SPAdj == 0 && "Unbalanced call frame setup / destroy pairs?");
+ }
+}
+
diff --git a/lib/CodeGen/PrologEpilogInserter.h b/lib/CodeGen/PrologEpilogInserter.h
new file mode 100644
index 0000000..c158dd8
--- /dev/null
+++ b/lib/CodeGen/PrologEpilogInserter.h
@@ -0,0 +1,167 @@
+//===-- PrologEpilogInserter.h - Prolog/Epilog code insertion --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// callee saved registers, and emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation. After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This pass also implements a shrink wrapping variant of prolog/epilog
+// insertion.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PEI_H
+#define LLVM_CODEGEN_PEI_H
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+ class RegScavenger;
+ class MachineBasicBlock;
+
+ class PEI : public MachineFunctionPass {
+ public:
+ static char ID;
+ PEI() : MachineFunctionPass(&ID) {}
+
+ const char *getPassName() const {
+ return "Prolog/Epilog Insertion & Frame Finalization";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+ /// frame indexes with appropriate references.
+ ///
+ bool runOnMachineFunction(MachineFunction &Fn);
+
+ private:
+ RegScavenger *RS;
+
+ // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
+ // stack frame indexes.
+ unsigned MinCSFrameIndex, MaxCSFrameIndex;
+
+ // Analysis info for spill/restore placement.
+ // "CSR": "callee saved register".
+
+ // CSRegSet contains indices into the Callee Saved Register Info
+ // vector built by calculateCalleeSavedRegisters() and accessed
+ // via MF.getFrameInfo()->getCalleeSavedInfo().
+ typedef SparseBitVector<> CSRegSet;
+
+ // CSRegBlockMap maps MachineBasicBlocks to sets of callee
+ // saved register indices.
+ typedef DenseMap<MachineBasicBlock*, CSRegSet> CSRegBlockMap;
+
+ // Set and maps for computing CSR spill/restore placement:
+ // used in function (UsedCSRegs)
+ // used in a basic block (CSRUsed)
+ // anticipatable in a basic block (Antic{In,Out})
+ // available in a basic block (Avail{In,Out})
+ // to be spilled at the entry to a basic block (CSRSave)
+ // to be restored at the end of a basic block (CSRRestore)
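+    // (Anticipatability and availability are the standard backward and
+    // forward dataflow properties used by code-placement algorithms.)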
+ CSRegSet UsedCSRegs;
+ CSRegBlockMap CSRUsed;
+ CSRegBlockMap AnticIn, AnticOut;
+ CSRegBlockMap AvailIn, AvailOut;
+ CSRegBlockMap CSRSave;
+ CSRegBlockMap CSRRestore;
+
+ // Entry and return blocks of the current function.
+ MachineBasicBlock* EntryBlock;
+ SmallVector<MachineBasicBlock*, 4> ReturnBlocks;
+
+ // Map of MBBs to top level MachineLoops.
+ DenseMap<MachineBasicBlock*, MachineLoop*> TLLoops;
+
+    // Flag to control shrink wrapping per-function: the pass
+    // may choose to skip shrink wrapping for certain functions.
+ bool ShrinkWrapThisFunction;
+
+#ifndef NDEBUG
+ // Machine function handle.
+ MachineFunction* MF;
+
+ // Flag indicating that the current function
+ // has at least one "short" path in the machine
+ // CFG from the entry block to an exit block.
+ bool HasFastExitPath;
+#endif
+
+ bool calculateSets(MachineFunction &Fn);
+ bool calcAnticInOut(MachineBasicBlock* MBB);
+ bool calcAvailInOut(MachineBasicBlock* MBB);
+ void calculateAnticAvail(MachineFunction &Fn);
+ bool addUsesForMEMERegion(MachineBasicBlock* MBB,
+ SmallVector<MachineBasicBlock*, 4>& blks);
+ bool addUsesForTopLevelLoops(SmallVector<MachineBasicBlock*, 4>& blks);
+ bool calcSpillPlacements(MachineBasicBlock* MBB,
+ SmallVector<MachineBasicBlock*, 4> &blks,
+ CSRegBlockMap &prevSpills);
+ bool calcRestorePlacements(MachineBasicBlock* MBB,
+ SmallVector<MachineBasicBlock*, 4> &blks,
+ CSRegBlockMap &prevRestores);
+ void placeSpillsAndRestores(MachineFunction &Fn);
+ void placeCSRSpillsAndRestores(MachineFunction &Fn);
+ void calculateCalleeSavedRegisters(MachineFunction &Fn);
+ void insertCSRSpillsAndRestores(MachineFunction &Fn);
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+ void replaceFrameIndices(MachineFunction &Fn);
+ void insertPrologEpilogCode(MachineFunction &Fn);
+
+ // Initialize DFA sets, called before iterations.
+ void clearAnticAvailSets();
+ // Clear all sets constructed by shrink wrapping.
+ void clearAllSets();
+
+ // Initialize all shrink wrapping data.
+ void initShrinkWrappingInfo();
+
+    // Conveniences for dealing with machine loops.
+ MachineBasicBlock* getTopLevelLoopPreheader(MachineLoop* LP);
+ MachineLoop* getTopLevelLoopParent(MachineLoop *LP);
+
+    // Propagate CSRs used in MBB to all MBBs of loop LP.
+ void propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP);
+
+ // Convenience for recognizing return blocks.
+ bool isReturnBlock(MachineBasicBlock* MBB);
+
+#ifndef NDEBUG
+ // Debugging methods.
+
+ // Mark this function as having fast exit paths.
+ void findFastExitPath();
+
+ // Verify placement of spills/restores.
+ void verifySpillRestorePlacement();
+
+ std::string getBasicBlockName(const MachineBasicBlock* MBB);
+ std::string stringifyCSRegSet(const CSRegSet& s);
+ void dumpSet(const CSRegSet& s);
+ void dumpUsed(MachineBasicBlock* MBB);
+ void dumpAllUsed();
+ void dumpSets(MachineBasicBlock* MBB);
+ void dumpSets1(MachineBasicBlock* MBB);
+ void dumpAllSets();
+ void dumpSRSets();
+#endif
+
+ };
+} // End llvm namespace
+#endif
diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
new file mode 100644
index 0000000..b4c20e6
--- /dev/null
+++ b/lib/CodeGen/PseudoSourceValue.cpp
@@ -0,0 +1,92 @@
+//===-- llvm/CodeGen/PseudoSourceValue.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PseudoSourceValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+using namespace llvm;
+
+static ManagedStatic<PseudoSourceValue[4]> PSVs;
+
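+// Note: print() below identifies each of these four singletons by its
+// offset within the PSVs array, so this order must match PSVNames.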
+const PseudoSourceValue *PseudoSourceValue::getStack()
+{ return &(*PSVs)[0]; }
+const PseudoSourceValue *PseudoSourceValue::getGOT()
+{ return &(*PSVs)[1]; }
+const PseudoSourceValue *PseudoSourceValue::getJumpTable()
+{ return &(*PSVs)[2]; }
+const PseudoSourceValue *PseudoSourceValue::getConstantPool()
+{ return &(*PSVs)[3]; }
+
+static const char *const PSVNames[] = {
+ "Stack",
+ "GOT",
+ "JumpTable",
+ "ConstantPool"
+};
+
+PseudoSourceValue::PseudoSourceValue() :
+ Value(PointerType::getUnqual(Type::Int8Ty), PseudoSourceValueVal) {}
+
+void PseudoSourceValue::dump() const {
+ print(errs()); errs() << '\n';
+}
+
+void PseudoSourceValue::print(raw_ostream &OS) const {
+ OS << PSVNames[this - *PSVs];
+}
+
+namespace {
+ /// FixedStackPseudoSourceValue - A specialized PseudoSourceValue
+ /// for holding FixedStack values, which must include a frame
+ /// index.
+ class VISIBILITY_HIDDEN FixedStackPseudoSourceValue
+ : public PseudoSourceValue {
+ const int FI;
+ public:
+ explicit FixedStackPseudoSourceValue(int fi) : FI(fi) {}
+
+ virtual bool isConstant(const MachineFrameInfo *MFI) const;
+
+ virtual void print(raw_ostream &OS) const {
+ OS << "FixedStack" << FI;
+ }
+ };
+}
+
+static ManagedStatic<std::map<int, const PseudoSourceValue *> > FSValues;
+
+const PseudoSourceValue *PseudoSourceValue::getFixedStack(int FI) {
+ const PseudoSourceValue *&V = (*FSValues)[FI];
+ if (!V)
+ V = new FixedStackPseudoSourceValue(FI);
+ return V;
+}
+
+bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
+ if (this == getStack())
+ return false;
+ if (this == getGOT() ||
+ this == getConstantPool() ||
+ this == getJumpTable())
+ return true;
+ assert(0 && "Unknown PseudoSourceValue!");
+ return false;
+}
+
+bool FixedStackPseudoSourceValue::isConstant(const MachineFrameInfo *MFI) const{
+ return MFI && MFI->isImmutableObjectIndex(FI);
+}
diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt
new file mode 100644
index 0000000..64374ce
--- /dev/null
+++ b/lib/CodeGen/README.txt
@@ -0,0 +1,208 @@
+//===---------------------------------------------------------------------===//
+
+Common register allocation / spilling problem:
+
+ mul lr, r4, lr
+ str lr, [sp, #+52]
+ ldr lr, [r1, #+32]
+ sxth r3, r3
+ ldr r4, [sp, #+52]
+ mla r4, r3, lr, r4
+
+can be:
+
+ mul lr, r4, lr
+ mov r4, lr
+ str lr, [sp, #+52]
+ ldr lr, [r1, #+32]
+ sxth r3, r3
+ mla r4, r3, lr, r4
+
+and then "merge" mul and mov:
+
+ mul r4, r4, lr
+ str lr, [sp, #+52]
+ ldr lr, [r1, #+32]
+ sxth r3, r3
+ mla r4, r3, lr, r4
+
+It also increases the likelihood that the store may become dead.
+
+//===---------------------------------------------------------------------===//
+
+I think we should have a "hasSideEffects" flag (which is automatically set for
+stuff that "isLoad" "isCall" etc), and the remat pass should eventually be able
+to remat any instruction that has no side effects, if it can handle it and if
+profitable.
+
+For now, I'd suggest having the remat stuff work like this:
+
+1. I need to spill/reload this thing.
+2. Check to see if it has side effects.
+3. Check to see if it is simple enough: e.g. it only has one register
+destination and no register input.
+4. If so, clone the instruction, do the xform, etc.
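+
+A tiny sketch of that decision, with hypothetical stand-in flags (the real
+checks would query the instruction's .td flags and operands):
+
+  // Hypothetical illustration only, not LLVM API.
+  static bool shouldRemat(bool HasSideEffects, bool OneRegDefNoRegInputs) {
+    if (HasSideEffects) return false;        // step 2: must have no side effects
+    if (!OneRegDefNoRegInputs) return false; // step 3: simple enough to clone
+    return true;                             // step 4: clone and do the xform
+  }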
+
+Advantages of this are:
+
+1. the .td file describes the behavior of the instructions, not the way the
+ algorithm should work.
+2. as remat gets smarter in the future, we shouldn't have to be changing the .td
+ files.
+3. it is easier to explain what the flag means in the .td file, because you
+ don't have to pull in the explanation of how the current remat algo works.
+
+Some potential added complexities:
+
+1. Some instructions have to be glued to their predecessor or successor: all
+   of the PC-relative instructions and the condition-code-setting
+   instructions. We could mark them as hasSideEffects, but that's not quite
+   right. PC-relative loads from constant pools can be remat'ed, for example,
+   but it requires more than just cloning the instruction. Some instructions
+   can be remat'ed but expand to more than one instruction, and the allocator
+   will have to make a decision.
+
+2. As stated in 1, remat is not as simple as cloning in some cases. The
+   target will have to decide how to remat it. For example, an ARM 2-piece
+   constant generation instruction is remat'ed as a load from the constant
+   pool.
+
+//===---------------------------------------------------------------------===//
+
+bb27 ...
+ ...
+ %reg1037 = ADDri %reg1039, 1
+ %reg1038 = ADDrs %reg1032, %reg1039, %NOREG, 10
+ Successors according to CFG: 0x8b03bf0 (#5)
+
+bb76 (0x8b03bf0, LLVM BB @0x8b032d0, ID#5):
+ Predecessors according to CFG: 0x8b0c5f0 (#3) 0x8b0a7c0 (#4)
+ %reg1039 = PHI %reg1070, mbb<bb76.outer,0x8b0c5f0>, %reg1037, mbb<bb27,0x8b0a7c0>
+
+Note ADDri is not a two-address instruction. However, its result %reg1037 is an
+operand of the PHI node in bb76 and its operand %reg1039 is the result of the
+PHI node. We should treat it as two-address code and make sure the ADDri is
+scheduled after any node that reads %reg1039.
+
+//===---------------------------------------------------------------------===//
+
+Use local info (i.e. register scavenger) to assign it a free register to allow
+reuse:
+ ldr r3, [sp, #+4]
+ add r3, r3, #3
+ ldr r2, [sp, #+8]
+ add r2, r2, #2
+ ldr r1, [sp, #+4] <==
+ add r1, r1, #1
+ ldr r0, [sp, #+4]
+ add r0, r0, #2
+
+//===---------------------------------------------------------------------===//
+
+LLVM aggressively lifts CSE out of loops. Sometimes this can have negative
+side effects:
+
+R1 = X + 4
+R2 = X + 7
+R3 = X + 15
+
+loop:
+load [i + R1]
+...
+load [i + R2]
+...
+load [i + R3]
+
+Suppose there is high register pressure, R1, R2, R3, can be spilled. We need
+to implement proper re-materialization to handle this:
+
+R1 = X + 4
+R2 = X + 7
+R3 = X + 15
+
+loop:
+R1 = X + 4 @ re-materialized
+load [i + R1]
+...
+R2 = X + 7 @ re-materialized
+load [i + R2]
+...
+R3 = X + 15 @ re-materialized
+load [i + R3]
+
+Furthermore, with re-association, we can enable sharing:
+
+R1 = X + 4
+R2 = X + 7
+R3 = X + 15
+
+loop:
+T = i + X
+load [T + 4]
+...
+load [T + 7]
+...
+load [T + 15]
+//===---------------------------------------------------------------------===//
+
+It's not always a good idea to choose rematerialization over spilling. If all
+the load / store instructions would be folded then spilling is cheaper because
+it won't require new live intervals / registers. See 2003-05-31-LongShifts for
+an example.
+
+//===---------------------------------------------------------------------===//
+
+With a copying garbage collector, derived pointers must not be retained across
+collector safe points; the collector could move the objects and invalidate the
+derived pointer. This is bad enough in the first place, but safe points can
+crop up unpredictably. Consider:
+
+ %array = load { i32, [0 x %obj] }** %array_addr
+ %nth_el = getelementptr { i32, [0 x %obj] }* %array, i32 0, i32 %n
+ %old = load %obj** %nth_el
+ %z = div i64 %x, %y
+ store %obj* %new, %obj** %nth_el
+
+If the i64 division is lowered to a libcall, then a safe point will (must)
+appear for the call site. If a collection occurs, %array and %nth_el no longer
+point into the correct object.
+
+The fix for this is to copy address calculations so that dependent pointers
+are never live across safe point boundaries. But the loads cannot be copied
+like this if there was an intervening store, so this may be hard to get right.
+
+Only a concurrent mutator can trigger a collection at the libcall safe point.
+So single-threaded programs do not have this requirement, even with a copying
+collector. Still, LLVM optimizations would probably undo a front-end's careful
+work.
+
+//===---------------------------------------------------------------------===//
+
+The ocaml frametable structure supports liveness information. It would be good
+to support it.
+
+//===---------------------------------------------------------------------===//
+
+The FIXME in ComputeCommonTailLength in BranchFolding.cpp needs to be
+revisited. The check is there to work around a misuse of directives in inline
+assembly.
+
+//===---------------------------------------------------------------------===//
+
+It would be good to detect collector/target compatibility instead of silently
+doing the wrong thing.
+
+//===---------------------------------------------------------------------===//
+
+It would be really nice to be able to write patterns in .td files for copies,
+which would eliminate a bunch of explicit predicates on them (e.g. no side
+effects). Once this is in place, it would be even better to have tblgen
+synthesize the various copy insertion/inspection methods in TargetInstrInfo.
+
+//===---------------------------------------------------------------------===//
+
+Stack coloring improvements:
+
+1. Do proper LiveStackAnalysis on all stack objects including those which are
+ not spill slots.
+2. Reorder objects to fill in gaps between objects.
+ e.g. 4, 1, <gap>, 4, 1, 1, 1, <gap>, 4 => 4, 1, 1, 1, 1, 4, 4
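+
+A toy illustration of item 2, packing by size only (a real implementation
+must also respect alignment and liveness):
+
+  #include <algorithm>
+  #include <vector>
+  // Grouping equal-sized objects removes the padding gaps that appear
+  // between mixed-size neighbors.
+  static std::vector<unsigned> packBySize(std::vector<unsigned> Sizes) {
+    std::stable_sort(Sizes.begin(), Sizes.end()); // 4,1,4,1,1,1,4 -> 1,1,1,1,4,4,4
+    return Sizes;
+  }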
diff --git a/lib/CodeGen/RegAllocBigBlock.cpp b/lib/CodeGen/RegAllocBigBlock.cpp
new file mode 100644
index 0000000..91e4099
--- /dev/null
+++ b/lib/CodeGen/RegAllocBigBlock.cpp
@@ -0,0 +1,892 @@
+//===- RegAllocBigBlock.cpp - A register allocator for large basic blocks -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RABigBlock class
+//
+//===----------------------------------------------------------------------===//
+
+// This register allocator is derived from RegAllocLocal.cpp. Like it, this
+// allocator works on one basic block at a time, oblivious to others.
+// However, the algorithm used here is suited for long blocks of
+// instructions - registers are spilled by greedily choosing those holding
+// values that will not be needed for the longest amount of time. This works
+// particularly well for blocks with 10 or more times as many instructions
+// as machine registers, but can be used for general code.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: - automagically invoke linearscan for (groups of) small BBs?
+// - break ties when picking regs? (probably not worth it in a
+// JIT context)
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/BasicBlock.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumStores, "Number of stores added");
+STATISTIC(NumLoads , "Number of loads added");
+STATISTIC(NumFolded, "Number of loads/stores folded into instructions");
+
+static RegisterRegAlloc
+ bigBlockRegAlloc("bigblock", "Big-block register allocator",
+ createBigBlockRegisterAllocator);
+
+namespace {
+/// VRegKeyInfo - Defines magic values required to use VirtRegs as DenseMap
+/// keys.
+ struct VRegKeyInfo {
+ static inline unsigned getEmptyKey() { return -1U; }
+ static inline unsigned getTombstoneKey() { return -2U; }
+ static bool isEqual(unsigned LHS, unsigned RHS) { return LHS == RHS; }
+ static unsigned getHashValue(const unsigned &Key) { return Key; }
+ };
+
+
+/// This register allocator is derived from RegAllocLocal.cpp. Like it, this
+/// allocator works on one basic block at a time, oblivious to others.
+/// However, the algorithm used here is suited for long blocks of
+/// instructions - registers are spilled by greedily choosing those holding
+/// values that will not be needed for the longest amount of time. This works
+/// particularly well for blocks with 10 or more times as many instructions
+/// as machine registers, but can be used for general code.
+///
+/// TODO: - automagically invoke linearscan for (groups of) small BBs?
+/// - break ties when picking regs? (probably not worth it in a
+/// JIT context)
+///
+ class VISIBILITY_HIDDEN RABigBlock : public MachineFunctionPass {
+ public:
+ static char ID;
+ RABigBlock() : MachineFunctionPass(&ID) {}
+ private:
+ /// TM - For getting at TargetMachine info
+ ///
+ const TargetMachine *TM;
+
+ /// MF - Our generic MachineFunction pointer
+ ///
+ MachineFunction *MF;
+
+ /// RegInfo - For dealing with machine register info (aliases, folds
+ /// etc)
+ const TargetRegisterInfo *RegInfo;
+
+ typedef SmallVector<unsigned, 2> VRegTimes;
+
+ /// VRegReadTable - maps VRegs in a BB to the set of times they are read
+ ///
+ DenseMap<unsigned, VRegTimes*, VRegKeyInfo> VRegReadTable;
+
+ /// VRegReadIdx - keeps track of the "current time" in terms of
+ /// positions in VRegReadTable
+ DenseMap<unsigned, unsigned , VRegKeyInfo> VRegReadIdx;
+
+ /// StackSlotForVirtReg - Maps virtual regs to the frame index where these
+ /// values are spilled.
+ IndexedMap<unsigned, VirtReg2IndexFunctor> StackSlotForVirtReg;
+
+ /// Virt2PhysRegMap - This map contains entries for each virtual register
+ /// that is currently available in a physical register.
+ IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2PhysRegMap;
+
+ /// PhysRegsUsed - This array is effectively a map, containing entries for
+    /// each physical register that currently has a value (i.e., it is in
+ /// Virt2PhysRegMap). The value mapped to is the virtual register
+ /// corresponding to the physical register (the inverse of the
+ /// Virt2PhysRegMap), or 0. The value is set to 0 if this register is pinned
+ /// because it is used by a future instruction, and to -2 if it is not
+ /// allocatable. If the entry for a physical register is -1, then the
+ /// physical register is "not in the map".
+ ///
+ std::vector<int> PhysRegsUsed;
+
+ /// VirtRegModified - This bitset contains information about which virtual
+ /// registers need to be spilled back to memory when their registers are
+ /// scavenged. If a virtual register has simply been rematerialized, there
+ /// is no reason to spill it to memory when we need the register back.
+ ///
+ std::vector<int> VirtRegModified;
+
+    /// MBBLastInsnTime - the number of the last instruction in MBB
+ ///
+ int MBBLastInsnTime;
+
+    /// MBBCurTime - the number of the instruction currently being processed
+ ///
+ int MBBCurTime;
+
+ unsigned &getVirt2PhysRegMapSlot(unsigned VirtReg) {
+ return Virt2PhysRegMap[VirtReg];
+ }
+
+ unsigned &getVirt2StackSlot(unsigned VirtReg) {
+ return StackSlotForVirtReg[VirtReg];
+ }
+
+ /// markVirtRegModified - Lets us flip bits in the VirtRegModified bitset
+ ///
+ void markVirtRegModified(unsigned Reg, bool Val = true) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!");
+ Reg -= TargetRegisterInfo::FirstVirtualRegister;
+ if (VirtRegModified.size() <= Reg)
+ VirtRegModified.resize(Reg+1);
+ VirtRegModified[Reg] = Val;
+ }
+
+ /// isVirtRegModified - Lets us query the VirtRegModified bitset
+ ///
+ bool isVirtRegModified(unsigned Reg) const {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!");
+ assert(Reg - TargetRegisterInfo::FirstVirtualRegister < VirtRegModified.size()
+ && "Illegal virtual register!");
+ return VirtRegModified[Reg - TargetRegisterInfo::FirstVirtualRegister];
+ }
+
+ public:
+ /// getPassName - returns the BigBlock allocator's name
+ ///
+ virtual const char *getPassName() const {
+ return "BigBlock Register Allocator";
+ }
+
+    /// getAnalysisUsage - declares the required analyses
+ ///
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(PHIEliminationID);
+ AU.addRequiredID(TwoAddressInstructionPassID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ /// runOnMachineFunction - Register allocate the whole function
+ ///
+ bool runOnMachineFunction(MachineFunction &Fn);
+
+ /// AllocateBasicBlock - Register allocate the specified basic block.
+ ///
+ void AllocateBasicBlock(MachineBasicBlock &MBB);
+
+ /// FillVRegReadTable - Fill out the table of vreg read times given a BB
+ ///
+ void FillVRegReadTable(MachineBasicBlock &MBB);
+
+ /// areRegsEqual - This method returns true if the specified registers are
+ /// related to each other. To do this, it checks to see if they are equal
+ /// or if the first register is in the alias set of the second register.
+ ///
+ bool areRegsEqual(unsigned R1, unsigned R2) const {
+ if (R1 == R2) return true;
+ for (const unsigned *AliasSet = RegInfo->getAliasSet(R2);
+ *AliasSet; ++AliasSet) {
+ if (*AliasSet == R1) return true;
+ }
+ return false;
+ }
+
+ /// getStackSpaceFor - This returns the frame index of the specified virtual
+ /// register on the stack, allocating space if necessary.
+ int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC);
+
+ /// removePhysReg - This method marks the specified physical register as no
+ /// longer being in use.
+ ///
+ void removePhysReg(unsigned PhysReg);
+
+ /// spillVirtReg - This method spills the value specified by PhysReg into
+ /// the virtual register slot specified by VirtReg. It then updates the RA
+ /// data structures to indicate the fact that PhysReg is now available.
+ ///
+ void spillVirtReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned VirtReg, unsigned PhysReg);
+
+ /// spillPhysReg - This method spills the specified physical register into
+ /// the virtual register slot associated with it. If OnlyVirtRegs is set to
+ /// true, then the request is ignored if the physical register does not
+ /// contain a virtual register.
+ ///
+ void spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I,
+ unsigned PhysReg, bool OnlyVirtRegs = false);
+
+ /// assignVirtToPhysReg - This method updates local state so that we know
+ /// that PhysReg is the proper container for VirtReg now. The physical
+ /// register must not be used for anything else when this is called.
+ ///
+ void assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg);
+
+ /// isPhysRegAvailable - Return true if the specified physical register is
+ /// free and available for use. This also includes checking to see if
+ /// aliased registers are all free...
+ ///
+ bool isPhysRegAvailable(unsigned PhysReg) const;
+
+ /// getFreeReg - Look to see if there is a free register available in the
+ /// specified register class. If not, return 0.
+ ///
+ unsigned getFreeReg(const TargetRegisterClass *RC);
+
+ /// chooseReg - Pick a physical register to hold the specified
+ /// virtual register by choosing the one which will be read furthest
+ /// in the future.
+ ///
+ unsigned chooseReg(MachineBasicBlock &MBB, MachineInstr *MI,
+ unsigned VirtReg);
+
+    /// reloadVirtReg - This method transforms the specified virtual
+ /// register use to refer to a physical register. This method may do this
+ /// in one of several ways: if the register is available in a physical
+ /// register already, it uses that physical register. If the value is not
+ /// in a physical register, and if there are physical registers available,
+ /// it loads it into a register. If register pressure is high, and it is
+ /// possible, it tries to fold the load of the virtual register into the
+ /// instruction itself. It avoids doing this if register pressure is low to
+ /// improve the chance that subsequent instructions can use the reloaded
+ /// value. This method returns the modified instruction.
+ ///
+ MachineInstr *reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI,
+ unsigned OpNum);
+
+ };
+ char RABigBlock::ID = 0;
+}
+
+/// getStackSpaceFor - This allocates space for the specified virtual register
+/// to be held on the stack.
+int RABigBlock::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
+ // Find the location Reg would belong...
+ int FrameIdx = getVirt2StackSlot(VirtReg);
+
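+  // Slot numbers are stored biased by one so that the map's default value
+  // of zero can mean "no stack slot assigned yet".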
+ if (FrameIdx)
+ return FrameIdx - 1; // Already has space allocated?
+
+ // Allocate a new stack object for this spill location...
+ FrameIdx = MF->getFrameInfo()->CreateStackObject(RC->getSize(),
+ RC->getAlignment());
+
+ // Assign the slot...
+ getVirt2StackSlot(VirtReg) = FrameIdx + 1;
+ return FrameIdx;
+}
+
+
+/// removePhysReg - This method marks the specified physical register as no
+/// longer being in use.
+///
+void RABigBlock::removePhysReg(unsigned PhysReg) {
+ PhysRegsUsed[PhysReg] = -1; // PhyReg no longer used
+}
+
+
+/// spillVirtReg - This method spills the value specified by PhysReg into the
+/// virtual register slot specified by VirtReg. It then updates the RA data
+/// structures to indicate the fact that PhysReg is now available.
+///
+void RABigBlock::spillVirtReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned VirtReg, unsigned PhysReg) {
+ assert(VirtReg && "Spilling a physical register is illegal!"
+ " Must not have appropriate kill for the register or use exists beyond"
+ " the intended one.");
+ DOUT << " Spilling register " << RegInfo->getName(PhysReg)
+ << " containing %reg" << VirtReg;
+
+ const TargetInstrInfo* TII = MBB.getParent()->getTarget().getInstrInfo();
+
+ if (!isVirtRegModified(VirtReg))
+ DOUT << " which has not been modified, so no store necessary!";
+
+ // Otherwise, there is a virtual register corresponding to this physical
+ // register. We only need to spill it into its stack slot if it has been
+ // modified.
+ if (isVirtRegModified(VirtReg)) {
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg);
+ int FrameIndex = getStackSpaceFor(VirtReg, RC);
+ DOUT << " to stack slot #" << FrameIndex;
+ TII->storeRegToStackSlot(MBB, I, PhysReg, true, FrameIndex, RC);
+ ++NumStores; // Update statistics
+ }
+
+ getVirt2PhysRegMapSlot(VirtReg) = 0; // VirtReg no longer available
+
+ DOUT << "\n";
+ removePhysReg(PhysReg);
+}
+
+
+/// spillPhysReg - This method spills the specified physical register into the
+/// virtual register slot associated with it. If OnlyVirtRegs is set to true,
+/// then the request is ignored if the physical register does not contain a
+/// virtual register.
+///
+void RABigBlock::spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I,
+ unsigned PhysReg, bool OnlyVirtRegs) {
+ if (PhysRegsUsed[PhysReg] != -1) { // Only spill it if it's used!
+ assert(PhysRegsUsed[PhysReg] != -2 && "Non allocable reg used!");
+ if (PhysRegsUsed[PhysReg] || !OnlyVirtRegs)
+ spillVirtReg(MBB, I, PhysRegsUsed[PhysReg], PhysReg);
+ } else {
+ // If the selected register aliases any other registers, we must make
+ // sure that one of the aliases isn't alive.
+ for (const unsigned *AliasSet = RegInfo->getAliasSet(PhysReg);
+ *AliasSet; ++AliasSet)
+ if (PhysRegsUsed[*AliasSet] != -1 && // Spill aliased register.
+ PhysRegsUsed[*AliasSet] != -2) // If allocatable.
+ if (PhysRegsUsed[*AliasSet])
+ spillVirtReg(MBB, I, PhysRegsUsed[*AliasSet], *AliasSet);
+ }
+}
+
+
+/// assignVirtToPhysReg - This method updates local state so that we know
+/// that PhysReg is the proper container for VirtReg now. The physical
+/// register must not be used for anything else when this is called.
+///
+void RABigBlock::assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg) {
+ assert(PhysRegsUsed[PhysReg] == -1 && "Phys reg already assigned!");
+ // Update information to note the fact that this register was just used, and
+ // it holds VirtReg.
+ PhysRegsUsed[PhysReg] = VirtReg;
+ getVirt2PhysRegMapSlot(VirtReg) = PhysReg;
+}
+
+
+/// isPhysRegAvailable - Return true if the specified physical register is free
+/// and available for use. This also includes checking to see if aliased
+/// registers are all free...
+///
+bool RABigBlock::isPhysRegAvailable(unsigned PhysReg) const {
+ if (PhysRegsUsed[PhysReg] != -1) return false;
+
+ // If the selected register aliases any other allocated registers, it is
+ // not free!
+ for (const unsigned *AliasSet = RegInfo->getAliasSet(PhysReg);
+ *AliasSet; ++AliasSet)
+ if (PhysRegsUsed[*AliasSet] >= 0) // Aliased register in use?
+ return false; // Can't use this reg then.
+ return true;
+}
+
+
+/// getFreeReg - Look to see if there is a free register available in the
+/// specified register class. If not, return 0.
+///
+unsigned RABigBlock::getFreeReg(const TargetRegisterClass *RC) {
+ // Get iterators defining the range of registers that are valid to allocate in
+ // this class, which also specifies the preferred allocation order.
+ TargetRegisterClass::iterator RI = RC->allocation_order_begin(*MF);
+ TargetRegisterClass::iterator RE = RC->allocation_order_end(*MF);
+
+ for (; RI != RE; ++RI)
+ if (isPhysRegAvailable(*RI)) { // Is reg unused?
+ assert(*RI != 0 && "Cannot use register!");
+ return *RI; // Found an unused register!
+ }
+ return 0;
+}
+
+
+/// chooseReg - Pick a physical register to hold the specified
+/// virtual register by choosing the one whose value will be read
+/// furthest in the future.
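+/// (This is the classic "furthest next use" eviction heuristic, in the
+/// spirit of Belady's algorithm.)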
+///
+unsigned RABigBlock::chooseReg(MachineBasicBlock &MBB, MachineInstr *I,
+ unsigned VirtReg) {
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg);
+ // First check to see if we have a free register of the requested type...
+ unsigned PhysReg = getFreeReg(RC);
+
+ // If we didn't find an unused register, find the one which will be
+ // read at the most distant point in time.
+ if (PhysReg == 0) {
+ unsigned delay=0, longest_delay=0;
+ VRegTimes* ReadTimes;
+
+ unsigned curTime = MBBCurTime;
+
+ // for all physical regs in the RC,
+ for(TargetRegisterClass::iterator pReg = RC->begin();
+ pReg != RC->end(); ++pReg) {
+ // how long until they're read?
+      if(PhysRegsUsed[*pReg]>0) { // only regs currently holding a vreg
+ ReadTimes = VRegReadTable[PhysRegsUsed[*pReg]];
+ if(ReadTimes && !ReadTimes->empty()) {
+ unsigned& pt = VRegReadIdx[PhysRegsUsed[*pReg]];
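+            // ReadTimes is sorted ascending (FillVRegReadTable walks the
+            // block in order), so skip reads that are already behind us.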
+ while(pt < ReadTimes->size() && (*ReadTimes)[pt] < curTime) {
+ ++pt;
+ }
+
+ if(pt < ReadTimes->size())
+ delay = (*ReadTimes)[pt] - curTime;
+ else
+ delay = MBBLastInsnTime + 1 - curTime;
+ } else {
+          // This register is only defined, never read, in this MBB;
+          // therefore the next read happens after the end of this MBB.
+ delay = MBBLastInsnTime + 1 - curTime;
+ }
+
+ if(delay > longest_delay) {
+ longest_delay = delay;
+ PhysReg = *pReg;
+ }
+ }
+ }
+
+ if(PhysReg == 0) { // ok, now we're desperate. We couldn't choose
+ // a register to spill by looking through the
+ // read timetable, so now we just spill the
+ // first allocatable register we find.
+
+ // for all physical regs in the RC,
+ for(TargetRegisterClass::iterator pReg = RC->begin();
+ pReg != RC->end(); ++pReg) {
+ // if we find a register we can spill
+ if(PhysRegsUsed[*pReg]>=-1)
+ PhysReg = *pReg; // choose it to be spilled
+ }
+ }
+
+ assert(PhysReg && "couldn't choose a register to spill :( ");
+ // TODO: assert that RC->contains(PhysReg) / handle aliased registers?
+
+ // since we needed to look in the table we need to spill this register.
+ spillPhysReg(MBB, I, PhysReg);
+ }
+
+ // assign the vreg to our chosen physical register
+ assignVirtToPhysReg(VirtReg, PhysReg);
+ return PhysReg; // and return it
+}
+
+
+/// reloadVirtReg - This method transforms an instruction with a virtual
+/// register use to one that references a physical register. It does this as
+/// follows:
+///
+/// 1) If the register is already in a physical register, it uses it.
+/// 2) Otherwise, if there is a free physical register, it uses that.
+/// 3) Otherwise, it calls chooseReg() to get the physical register
+/// holding the most distantly needed value, generating a spill in
+/// the process.
+///
+/// This method returns the modified instruction.
+MachineInstr *RABigBlock::reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI,
+ unsigned OpNum) {
+ unsigned VirtReg = MI->getOperand(OpNum).getReg();
+ const TargetInstrInfo* TII = MBB.getParent()->getTarget().getInstrInfo();
+
+ // If the virtual register is already available in a physical register,
+ // just update the instruction and return.
+ if (unsigned PR = getVirt2PhysRegMapSlot(VirtReg)) {
+ MI->getOperand(OpNum).setReg(PR);
+ return MI;
+ }
+
+ // Otherwise, if we have free physical registers available to hold the
+ // value, use them.
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg);
+ unsigned PhysReg = getFreeReg(RC);
+ int FrameIndex = getStackSpaceFor(VirtReg, RC);
+
+ if (PhysReg) { // we have a free register, so use it.
+ assignVirtToPhysReg(VirtReg, PhysReg);
+ } else { // no free registers available.
+ // try to fold the spill into the instruction
+ SmallVector<unsigned, 1> Ops;
+ Ops.push_back(OpNum);
+ if(MachineInstr* FMI = TII->foldMemoryOperand(*MF, MI, Ops, FrameIndex)) {
+ ++NumFolded;
+ FMI->copyKillDeadInfo(MI);
+ return MBB.insert(MBB.erase(MI), FMI);
+ }
+
+ // determine which of the physical registers we'll kill off, since we
+ // couldn't fold.
+ PhysReg = chooseReg(MBB, MI, VirtReg);
+ }
+
+ // this virtual register is now unmodified (since we just reloaded it)
+ markVirtRegModified(VirtReg, false);
+
+ DOUT << " Reloading %reg" << VirtReg << " into "
+ << RegInfo->getName(PhysReg) << "\n";
+
+ // Add move instruction(s)
+ TII->loadRegFromStackSlot(MBB, MI, PhysReg, FrameIndex, RC);
+ ++NumLoads; // Update statistics
+
+ MF->getRegInfo().setPhysRegUsed(PhysReg);
+ MI->getOperand(OpNum).setReg(PhysReg); // Assign the input register
+ return MI;
+}
+
+/// Fill out the vreg read timetable. Since ReadTime increases
+/// monotonically, the individual readtime sets will be sorted
+/// in ascending order.
+void RABigBlock::FillVRegReadTable(MachineBasicBlock &MBB) {
+ // loop over each instruction
+ MachineBasicBlock::iterator MII;
+ unsigned ReadTime;
+
+ for(ReadTime=0, MII = MBB.begin(); MII != MBB.end(); ++ReadTime, ++MII) {
+ MachineInstr *MI = MII;
+
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ // look for vreg reads..
+ if (MO.isReg() && !MO.isDef() && MO.getReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ // ..and add them to the read table.
+ VRegTimes* &Times = VRegReadTable[MO.getReg()];
+ if(!VRegReadTable[MO.getReg()]) {
+ Times = new VRegTimes;
+ VRegReadIdx[MO.getReg()] = 0;
+ }
+ Times->push_back(ReadTime);
+ }
+ }
+
+ }
+
+ MBBLastInsnTime = ReadTime;
+
+ for(DenseMap<unsigned, VRegTimes*, VRegKeyInfo>::iterator Reads = VRegReadTable.begin();
+ Reads != VRegReadTable.end(); ++Reads) {
+ if(Reads->second) {
+ DOUT << "Reads[" << Reads->first << "]=" << Reads->second->size() << "\n";
+ }
+ }
+}
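+
+// Because each VRegTimes vector is filled in increasing ReadTime order, a
+// "next read at or after the current time" query can be answered by walking
+// forward from the saved per-vreg index -- a sketch, using the containers
+// declared for this pass:
+//
+//   VRegTimes &T = *VRegReadTable[VReg];
+//   unsigned  &i = VRegReadIdx[VReg];
+//   while (i < T.size() && T[i] < (unsigned)MBBCurTime) ++i;
+//   unsigned NextRead = (i < T.size()) ? T[i] : MBBLastInsnTime;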
+
+/// isReadModWriteImplicitKill - True if this is an implicit kill for a
+/// read/mod/write register, i.e. a partial-register update.
+static bool isReadModWriteImplicitKill(MachineInstr *MI, unsigned Reg) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() &&
+ MO.isDef() && !MO.isDead())
+ return true;
+ }
+ return false;
+}
+
+/// isReadModWriteImplicitDef - True if this is an implicit def for a
+/// read/mod/write register, i.e. a partial-register update.
+static bool isReadModWriteImplicitDef(MachineInstr *MI, unsigned Reg) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() &&
+ !MO.isDef() && MO.isKill())
+ return true;
+ }
+ return false;
+}
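+
+// Illustration of the pattern both helpers above look for (x86 used as an
+// assumed example): a sub-register def such as "AX = ..." is modeled as a
+// read/mod/write of the enclosing super-register, so the instruction carries
+// an implicit use (marked kill) and an implicit def of EAX alongside the
+// explicit def of AX.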
+
+
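+// Summary of the PhysRegsUsed encoding the code below relies on: -2 marks an
+// unallocatable register (e.g. ESP), -1 a free register, 0 a register that
+// is reserved but holds no virtual register (live-ins, fresh defs), and a
+// positive value names the virtual register currently held.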
+void RABigBlock::AllocateBasicBlock(MachineBasicBlock &MBB) {
+ // loop over each instruction
+ MachineBasicBlock::iterator MII = MBB.begin();
+ const TargetInstrInfo &TII = *TM->getInstrInfo();
+
+ DEBUG(const BasicBlock *LBB = MBB.getBasicBlock();
+ if (LBB) DOUT << "\nStarting RegAlloc of BB: " << LBB->getName());
+
+ // If this is the first basic block in the machine function, add live-in
+ // registers as active.
+ if (&MBB == &*MF->begin()) {
+ for (MachineRegisterInfo::livein_iterator
+ I = MF->getRegInfo().livein_begin(),
+ E = MF->getRegInfo().livein_end(); I != E; ++I) {
+ unsigned Reg = I->first;
+ MF->getRegInfo().setPhysRegUsed(Reg);
+ PhysRegsUsed[Reg] = 0; // It is free and reserved now
+ for (const unsigned *AliasSet = RegInfo->getSubRegisters(Reg);
+ *AliasSet; ++AliasSet) {
+ if (PhysRegsUsed[*AliasSet] != -2) {
+ PhysRegsUsed[*AliasSet] = 0; // It is free and reserved now
+ MF->getRegInfo().setPhysRegUsed(*AliasSet);
+ }
+ }
+ }
+ }
+
+  // Now sequentially allocate each instruction in the MBB.
+ MBBCurTime = -1;
+ while (MII != MBB.end()) {
+ MachineInstr *MI = MII++;
+ MBBCurTime++;
+ const TargetInstrDesc &TID = MI->getDesc();
+ DEBUG(DOUT << "\nTime=" << MBBCurTime << " Starting RegAlloc of: " << *MI;
+ DOUT << " Regs have values: ";
+ for (unsigned i = 0; i != RegInfo->getNumRegs(); ++i)
+ if (PhysRegsUsed[i] != -1 && PhysRegsUsed[i] != -2)
+ DOUT << "[" << RegInfo->getName(i)
+ << ",%reg" << PhysRegsUsed[i] << "] ";
+ DOUT << "\n");
+
+ SmallVector<unsigned, 8> Kills;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isKill()) {
+ if (!MO.isImplicit())
+ Kills.push_back(MO.getReg());
+ else if (!isReadModWriteImplicitKill(MI, MO.getReg()))
+ // These are extra physical register kills when a sub-register
+ // is defined (def of a sub-register is a read/mod/write of the
+ // larger registers). Ignore.
+ Kills.push_back(MO.getReg());
+ }
+ }
+
+ // Get the used operands into registers. This has the potential to spill
+ // incoming values if we are out of registers. Note that we completely
+ // ignore physical register uses here. We assume that if an explicit
+  // physical register is referenced by the instruction, it is guaranteed
+  // to be live-in, or the input is badly hosed.
+ //
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ // here we are looking for only used operands (never def&use)
+ if (MO.isReg() && !MO.isDef() && MO.getReg() && !MO.isImplicit() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ MI = reloadVirtReg(MBB, MI, i);
+ }
+
+ // If this instruction is the last user of this register, kill the
+ // value, freeing the register being used, so it doesn't need to be
+ // spilled to memory.
+ //
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
+ unsigned VirtReg = Kills[i];
+ unsigned PhysReg = VirtReg;
+ if (TargetRegisterInfo::isVirtualRegister(VirtReg)) {
+ // If the virtual register was never materialized into a register, it
+ // might not be in the map, but it won't hurt to zero it out anyway.
+ unsigned &PhysRegSlot = getVirt2PhysRegMapSlot(VirtReg);
+ PhysReg = PhysRegSlot;
+ PhysRegSlot = 0;
+ } else if (PhysRegsUsed[PhysReg] == -2) {
+ // Unallocatable register dead, ignore.
+ continue;
+ } else {
+ assert((!PhysRegsUsed[PhysReg] || PhysRegsUsed[PhysReg] == -1) &&
+ "Silently clearing a virtual register?");
+ }
+
+ if (PhysReg) {
+ DOUT << " Last use of " << RegInfo->getName(PhysReg)
+ << "[%reg" << VirtReg <<"], removing it from live set\n";
+ removePhysReg(PhysReg);
+ for (const unsigned *AliasSet = RegInfo->getSubRegisters(PhysReg);
+ *AliasSet; ++AliasSet) {
+ if (PhysRegsUsed[*AliasSet] != -2) {
+ DOUT << " Last use of "
+ << RegInfo->getName(*AliasSet)
+ << "[%reg" << VirtReg <<"], removing it from live set\n";
+ removePhysReg(*AliasSet);
+ }
+ }
+ }
+ }
+
+ // Loop over all of the operands of the instruction, spilling registers that
+ // are defined, and marking explicit destinations in the PhysRegsUsed map.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() && !MO.isImplicit() && MO.getReg() &&
+ TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+ if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP.
+ // These are extra physical register defs when a sub-register
+ // is defined (def of a sub-register is a read/mod/write of the
+ // larger registers). Ignore.
+ if (isReadModWriteImplicitDef(MI, MO.getReg())) continue;
+
+ MF->getRegInfo().setPhysRegUsed(Reg);
+ spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg
+ PhysRegsUsed[Reg] = 0; // It is free and reserved now
+ for (const unsigned *AliasSet = RegInfo->getSubRegisters(Reg);
+ *AliasSet; ++AliasSet) {
+ if (PhysRegsUsed[*AliasSet] != -2) {
+ PhysRegsUsed[*AliasSet] = 0; // It is free and reserved now
+ MF->getRegInfo().setPhysRegUsed(*AliasSet);
+ }
+ }
+ }
+ }
+
+ // Loop over the implicit defs, spilling them as well.
+ if (TID.getImplicitDefs()) {
+ for (const unsigned *ImplicitDefs = TID.getImplicitDefs();
+ *ImplicitDefs; ++ImplicitDefs) {
+ unsigned Reg = *ImplicitDefs;
+ if (PhysRegsUsed[Reg] != -2) {
+ spillPhysReg(MBB, MI, Reg, true);
+ PhysRegsUsed[Reg] = 0; // It is free and reserved now
+ }
+ MF->getRegInfo().setPhysRegUsed(Reg);
+ for (const unsigned *AliasSet = RegInfo->getSubRegisters(Reg);
+ *AliasSet; ++AliasSet) {
+ if (PhysRegsUsed[*AliasSet] != -2) {
+ PhysRegsUsed[*AliasSet] = 0; // It is free and reserved now
+ MF->getRegInfo().setPhysRegUsed(*AliasSet);
+ }
+ }
+ }
+ }
+
+ SmallVector<unsigned, 8> DeadDefs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadDefs.push_back(MO.getReg());
+ }
+
+ // Okay, we have allocated all of the source operands and spilled any values
+ // that would be destroyed by defs of this instruction. Loop over the
+ // explicit defs and assign them to a register, spilling incoming values if
+ // we need to scavenge a register.
+ //
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned DestVirtReg = MO.getReg();
+ unsigned DestPhysReg;
+
+ // If DestVirtReg already has a value, use it.
+ if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg)))
+ DestPhysReg = chooseReg(MBB, MI, DestVirtReg);
+ MF->getRegInfo().setPhysRegUsed(DestPhysReg);
+ markVirtRegModified(DestVirtReg);
+ MI->getOperand(i).setReg(DestPhysReg); // Assign the output register
+ }
+ }
+
+ // If this instruction defines any registers that are immediately dead,
+ // kill them now.
+ //
+ for (unsigned i = 0, e = DeadDefs.size(); i != e; ++i) {
+ unsigned VirtReg = DeadDefs[i];
+ unsigned PhysReg = VirtReg;
+ if (TargetRegisterInfo::isVirtualRegister(VirtReg)) {
+ unsigned &PhysRegSlot = getVirt2PhysRegMapSlot(VirtReg);
+ PhysReg = PhysRegSlot;
+ assert(PhysReg != 0);
+ PhysRegSlot = 0;
+ } else if (PhysRegsUsed[PhysReg] == -2) {
+ // Unallocatable register dead, ignore.
+ continue;
+ }
+
+ if (PhysReg) {
+ DOUT << " Register " << RegInfo->getName(PhysReg)
+ << " [%reg" << VirtReg
+ << "] is never used, removing it from live set\n";
+ removePhysReg(PhysReg);
+ for (const unsigned *AliasSet = RegInfo->getAliasSet(PhysReg);
+ *AliasSet; ++AliasSet) {
+ if (PhysRegsUsed[*AliasSet] != -2) {
+ DOUT << " Register " << RegInfo->getName(*AliasSet)
+ << " [%reg" << *AliasSet
+ << "] is never used, removing it from live set\n";
+ removePhysReg(*AliasSet);
+ }
+ }
+ }
+ }
+
+ // Finally, if this is a noop copy instruction, zap it.
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (TII.isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg) &&
+ SrcReg == DstReg)
+ MBB.erase(MI);
+ }
+
+ MachineBasicBlock::iterator MI = MBB.getFirstTerminator();
+
+ // Spill all physical registers holding virtual registers now.
+ for (unsigned i = 0, e = RegInfo->getNumRegs(); i != e; ++i)
+ if (PhysRegsUsed[i] != -1 && PhysRegsUsed[i] != -2) {
+ if (unsigned VirtReg = PhysRegsUsed[i])
+ spillVirtReg(MBB, MI, VirtReg, i);
+ else
+ removePhysReg(i);
+ }
+}
+
+/// runOnMachineFunction - Register allocate the whole function
+///
+bool RABigBlock::runOnMachineFunction(MachineFunction &Fn) {
+ DOUT << "Machine Function " << "\n";
+ MF = &Fn;
+ TM = &Fn.getTarget();
+ RegInfo = TM->getRegisterInfo();
+
+ PhysRegsUsed.assign(RegInfo->getNumRegs(), -1);
+
+ // At various places we want to efficiently check to see whether a register
+ // is allocatable. To handle this, we mark all unallocatable registers as
+ // being pinned down, permanently.
+ {
+ BitVector Allocable = RegInfo->getAllocatableSet(Fn);
+ for (unsigned i = 0, e = Allocable.size(); i != e; ++i)
+ if (!Allocable[i])
+ PhysRegsUsed[i] = -2; // Mark the reg unallocable.
+ }
+
+ // initialize the virtual->physical register map to have a 'null'
+ // mapping for all virtual registers
+ Virt2PhysRegMap.grow(MF->getRegInfo().getLastVirtReg());
+ StackSlotForVirtReg.grow(MF->getRegInfo().getLastVirtReg());
+ VirtRegModified.resize(MF->getRegInfo().getLastVirtReg() -
+ TargetRegisterInfo::FirstVirtualRegister + 1, 0);
+
+ // Loop over all of the basic blocks, eliminating virtual register references
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB) {
+ // fill out the read timetable
+ FillVRegReadTable(*MBB);
+ // use it to allocate the BB
+ AllocateBasicBlock(*MBB);
+ // clear it
+ VRegReadTable.clear();
+ }
+
+ StackSlotForVirtReg.clear();
+ PhysRegsUsed.clear();
+ VirtRegModified.clear();
+ Virt2PhysRegMap.clear();
+ return true;
+}
+
+FunctionPass *llvm::createBigBlockRegisterAllocator() {
+ return new RABigBlock();
+}
+
diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp
new file mode 100644
index 0000000..ee118de
--- /dev/null
+++ b/lib/CodeGen/RegAllocLinearScan.cpp
@@ -0,0 +1,1535 @@
+//===-- RegAllocLinearScan.cpp - Linear Scan register allocator -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a linear scan register allocator.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "VirtRegMap.h"
+#include "VirtRegRewriter.h"
+#include "Spiller.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+#include <set>
+#include <queue>
+#include <memory>
+#include <cmath>
+#include <iostream>
+
+using namespace llvm;
+
+STATISTIC(NumIters , "Number of iterations performed");
+STATISTIC(NumBacktracks, "Number of times we had to backtrack");
+STATISTIC(NumCoalesce, "Number of copies coalesced");
+STATISTIC(NumDowngrade, "Number of registers downgraded");
+
+static cl::opt<bool>
+NewHeuristic("new-spilling-heuristic",
+ cl::desc("Use new spilling heuristic"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+PreSplitIntervals("pre-alloc-split",
+ cl::desc("Pre-register allocation live interval splitting"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+NewSpillFramework("new-spill-framework",
+ cl::desc("New spilling framework"),
+ cl::init(false), cl::Hidden);
+
+static RegisterRegAlloc
+linearscanRegAlloc("linearscan", "linear scan register allocator",
+ createLinearScanRegisterAllocator);
+
+namespace {
+ struct VISIBILITY_HIDDEN RALinScan : public MachineFunctionPass {
+ static char ID;
+ RALinScan() : MachineFunctionPass(&ID) {}
+
+ typedef std::pair<LiveInterval*, LiveInterval::iterator> IntervalPtr;
+ typedef SmallVector<IntervalPtr, 32> IntervalPtrs;
+ private:
+ /// RelatedRegClasses - This structure is built the first time a function is
+ /// compiled, and keeps track of which register classes have registers that
+ /// belong to multiple classes or have aliases that are in other classes.
+ EquivalenceClasses<const TargetRegisterClass*> RelatedRegClasses;
+ DenseMap<unsigned, const TargetRegisterClass*> OneClassForEachPhysReg;
+
+    // NextReloadMap - Maps a register to another register that is defined
+    // by a reload from the same stack slot, where both reloads occur in the
+    // same basic block.
+ DenseMap<unsigned, unsigned> NextReloadMap;
+
+ // DowngradedRegs - A set of registers which are being "downgraded", i.e.
+ // un-favored for allocation.
+ SmallSet<unsigned, 8> DowngradedRegs;
+
+ // DowngradeMap - A map from virtual registers to physical registers being
+ // downgraded for the virtual registers.
+ DenseMap<unsigned, unsigned> DowngradeMap;
+
+ MachineFunction* mf_;
+ MachineRegisterInfo* mri_;
+ const TargetMachine* tm_;
+ const TargetRegisterInfo* tri_;
+ const TargetInstrInfo* tii_;
+ BitVector allocatableRegs_;
+ LiveIntervals* li_;
+ LiveStacks* ls_;
+ const MachineLoopInfo *loopInfo;
+
+ /// handled_ - Intervals are added to the handled_ set in the order of their
+    /// start value. This is used for backtracking.
+ std::vector<LiveInterval*> handled_;
+
+ /// fixed_ - Intervals that correspond to machine registers.
+ ///
+ IntervalPtrs fixed_;
+
+ /// active_ - Intervals that are currently being processed, and which have a
+ /// live range active for the current point.
+ IntervalPtrs active_;
+
+ /// inactive_ - Intervals that are currently being processed, but which have
+    /// a hole in their live range at the current point.
+ IntervalPtrs inactive_;
+
+ typedef std::priority_queue<LiveInterval*,
+ SmallVector<LiveInterval*, 64>,
+ greater_ptr<LiveInterval> > IntervalHeap;
+ IntervalHeap unhandled_;
+
+ /// regUse_ - Tracks register usage.
+ SmallVector<unsigned, 32> regUse_;
+ SmallVector<unsigned, 32> regUseBackUp_;
+
+ /// vrm_ - Tracks register assignments.
+ VirtRegMap* vrm_;
+
+ std::auto_ptr<VirtRegRewriter> rewriter_;
+
+ std::auto_ptr<Spiller> spiller_;
+
+ public:
+ virtual const char* getPassName() const {
+ return "Linear Scan Register Allocator";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LiveIntervals>();
+ if (StrongPHIElim)
+ AU.addRequiredID(StrongPHIEliminationID);
+ // Make sure PassManager knows which analyses to make available
+ // to coalescing and which analyses coalescing invalidates.
+ AU.addRequiredTransitive<RegisterCoalescer>();
+ if (PreSplitIntervals)
+ AU.addRequiredID(PreAllocSplittingID);
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<VirtRegMap>();
+ AU.addPreserved<VirtRegMap>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// runOnMachineFunction - register allocate the whole function
+ bool runOnMachineFunction(MachineFunction&);
+
+ private:
+ /// linearScan - the linear scan algorithm
+ void linearScan();
+
+ /// initIntervalSets - initialize the interval sets.
+ ///
+ void initIntervalSets();
+
+ /// processActiveIntervals - expire old intervals and move non-overlapping
+ /// ones to the inactive list.
+ void processActiveIntervals(unsigned CurPoint);
+
+ /// processInactiveIntervals - expire old intervals and move overlapping
+ /// ones to the active list.
+ void processInactiveIntervals(unsigned CurPoint);
+
+    /// hasNextReloadInterval - Return the next live interval that's being
+ /// defined by a reload from the same SS as the specified one.
+ LiveInterval *hasNextReloadInterval(LiveInterval *cur);
+
+ /// DowngradeRegister - Downgrade a register for allocation.
+ void DowngradeRegister(LiveInterval *li, unsigned Reg);
+
+ /// UpgradeRegister - Upgrade a register for allocation.
+ void UpgradeRegister(unsigned Reg);
+
+ /// assignRegOrStackSlotAtInterval - assign a register if one
+ /// is available, or spill.
+ void assignRegOrStackSlotAtInterval(LiveInterval* cur);
+
+ void updateSpillWeights(std::vector<float> &Weights,
+ unsigned reg, float weight,
+ const TargetRegisterClass *RC);
+
+ /// findIntervalsToSpill - Determine the intervals to spill for the
+ /// specified interval. It's passed the physical registers whose spill
+ /// weight is the lowest among all the registers whose live intervals
+ /// conflict with the interval.
+ void findIntervalsToSpill(LiveInterval *cur,
+ std::vector<std::pair<unsigned,float> > &Candidates,
+ unsigned NumCands,
+ SmallVector<LiveInterval*, 8> &SpillIntervals);
+
+ /// attemptTrivialCoalescing - If a simple interval is defined by a copy,
+    /// try to allocate the definition the same register as the source
+    /// register, if that register is not redefined during the live time of
+    /// the interval. This eliminates a copy. This is used to coalesce copies which were not
+ /// coalesced away before allocation either due to dest and src being in
+ /// different register classes or because the coalescer was overly
+ /// conservative.
+ unsigned attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg);
+
+ ///
+ /// Register usage / availability tracking helpers.
+ ///
+
+ void initRegUses() {
+ regUse_.resize(tri_->getNumRegs(), 0);
+ regUseBackUp_.resize(tri_->getNumRegs(), 0);
+ }
+
+ void finalizeRegUses() {
+#ifndef NDEBUG
+ // Verify all the registers are "freed".
+ bool Error = false;
+ for (unsigned i = 0, e = tri_->getNumRegs(); i != e; ++i) {
+ if (regUse_[i] != 0) {
+ cerr << tri_->getName(i) << " is still in use!\n";
+ Error = true;
+ }
+ }
+ if (Error)
+ abort();
+#endif
+ regUse_.clear();
+ regUseBackUp_.clear();
+ }
+
+ void addRegUse(unsigned physReg) {
+ assert(TargetRegisterInfo::isPhysicalRegister(physReg) &&
+ "should be physical register!");
+ ++regUse_[physReg];
+ for (const unsigned* as = tri_->getAliasSet(physReg); *as; ++as)
+ ++regUse_[*as];
+ }
+
+ void delRegUse(unsigned physReg) {
+ assert(TargetRegisterInfo::isPhysicalRegister(physReg) &&
+ "should be physical register!");
+ assert(regUse_[physReg] != 0);
+ --regUse_[physReg];
+ for (const unsigned* as = tri_->getAliasSet(physReg); *as; ++as) {
+ assert(regUse_[*as] != 0);
+ --regUse_[*as];
+ }
+ }
+
+ bool isRegAvail(unsigned physReg) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(physReg) &&
+ "should be physical register!");
+ return regUse_[physReg] == 0;
+ }
+
+ void backUpRegUses() {
+ regUseBackUp_ = regUse_;
+ }
+
+ void restoreRegUses() {
+ regUse_ = regUseBackUp_;
+ }
+
+ ///
+ /// Register handling helpers.
+ ///
+
+ /// getFreePhysReg - return a free physical register for this virtual
+ /// register interval if we have one, otherwise return 0.
+ unsigned getFreePhysReg(LiveInterval* cur);
+ unsigned getFreePhysReg(const TargetRegisterClass *RC,
+ unsigned MaxInactiveCount,
+ SmallVector<unsigned, 256> &inactiveCounts,
+ bool SkipDGRegs);
+
+ /// assignVirt2StackSlot - assigns this virtual register to a
+ /// stack slot. returns the stack slot
+ int assignVirt2StackSlot(unsigned virtReg);
+
+ void ComputeRelatedRegClasses();
+
+ template <typename ItTy>
+ void printIntervals(const char* const str, ItTy i, ItTy e) const {
+ if (str) DOUT << str << " intervals:\n";
+ for (; i != e; ++i) {
+ DOUT << "\t" << *i->first << " -> ";
+ unsigned reg = i->first->reg;
+ if (TargetRegisterInfo::isVirtualRegister(reg)) {
+ reg = vrm_->getPhys(reg);
+ }
+ DOUT << tri_->getName(reg) << '\n';
+ }
+ }
+ };
+ char RALinScan::ID = 0;
+}
+
+static RegisterPass<RALinScan>
+X("linearscan-regalloc", "Linear Scan Register Allocator");
+
+bool validateRegAlloc(MachineFunction *mf, LiveIntervals *lis,
+ VirtRegMap *vrm) {
+
+ MachineRegisterInfo *mri = &mf->getRegInfo();
+ const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo();
+ bool allocationValid = true;
+
+
+ for (LiveIntervals::iterator itr = lis->begin(), end = lis->end();
+ itr != end; ++itr) {
+
+ LiveInterval *li = itr->second;
+
+ if (TargetRegisterInfo::isPhysicalRegister(li->reg)) {
+ continue;
+ }
+
+ if (vrm->hasPhys(li->reg)) {
+ const TargetRegisterClass *trc = mri->getRegClass(li->reg);
+
+ if (lis->hasInterval(vrm->getPhys(li->reg))) {
+ if (li->overlaps(lis->getInterval(vrm->getPhys(li->reg)))) {
+ std::cerr << "vreg " << li->reg << " overlaps its assigned preg "
+ << vrm->getPhys(li->reg) << "(" << tri->getName(vrm->getPhys(li->reg)) << ")\n";
+ }
+ }
+
+ TargetRegisterClass::iterator fReg =
+ std::find(trc->allocation_order_begin(*mf), trc->allocation_order_end(*mf),
+ vrm->getPhys(li->reg));
+
+ if (fReg == trc->allocation_order_end(*mf)) {
+ std::cerr << "preg " << vrm->getPhys(li->reg)
+ << "(" << tri->getName(vrm->getPhys(li->reg)) << ") is not in the allocation set for vreg "
+ << li->reg << "\n";
+ allocationValid &= false;
+ }
+ }
+ else {
+ std::cerr << "No preg for vreg " << li->reg << "\n";
+ // What about conflicting loads/stores?
+ continue;
+ }
+
+ for (LiveIntervals::iterator itr2 = next(itr); itr2 != end; ++itr2) {
+
+ LiveInterval *li2 = itr2->second;
+
+ if (li2->empty())
+ continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(li2->reg)) {
+ if (li->overlaps(*li2)) {
+ if (vrm->getPhys(li->reg) == li2->reg ||
+ tri->areAliases(vrm->getPhys(li->reg), li2->reg)) {
+ std::cerr << "vreg " << li->reg << " overlaps preg "
+ << li2->reg << "(" << tri->getName(li2->reg) << ") which aliases "
+ << vrm->getPhys(li->reg) << "(" << tri->getName(vrm->getPhys(li->reg)) << ")\n";
+ allocationValid &= false;
+ }
+ }
+ }
+ else {
+
+ if (!vrm->hasPhys(li2->reg)) {
+ continue;
+ }
+
+ if (li->overlaps(*li2)) {
+ if (vrm->getPhys(li->reg) == vrm->getPhys(li2->reg) ||
+ tri->areAliases(vrm->getPhys(li->reg), vrm->getPhys(li2->reg))) {
+ std::cerr << "vreg " << li->reg << " (preg " << vrm->getPhys(li->reg)
+ << ") overlaps vreg " << li2->reg << " (preg " << vrm->getPhys(li2->reg)
+ << ") and " << vrm->getPhys(li->reg) << " aliases " << vrm->getPhys(li2->reg) << "\n";
+ allocationValid &= false;
+ }
+ }
+ }
+ }
+
+ }
+
+ return allocationValid;
+
+}
+
+
+void RALinScan::ComputeRelatedRegClasses() {
+ // First pass, add all reg classes to the union, and determine at least one
+ // reg class that each register is in.
+ bool HasAliases = false;
+ for (TargetRegisterInfo::regclass_iterator RCI = tri_->regclass_begin(),
+ E = tri_->regclass_end(); RCI != E; ++RCI) {
+ RelatedRegClasses.insert(*RCI);
+ for (TargetRegisterClass::iterator I = (*RCI)->begin(), E = (*RCI)->end();
+ I != E; ++I) {
+ HasAliases = HasAliases || *tri_->getAliasSet(*I) != 0;
+
+ const TargetRegisterClass *&PRC = OneClassForEachPhysReg[*I];
+ if (PRC) {
+ // Already processed this register. Just make sure we know that
+ // multiple register classes share a register.
+ RelatedRegClasses.unionSets(PRC, *RCI);
+ } else {
+ PRC = *RCI;
+ }
+ }
+ }
+
+ // Second pass, now that we know conservatively what register classes each reg
+ // belongs to, add info about aliases. We don't need to do this for targets
+ // without register aliases.
+ if (HasAliases)
+ for (DenseMap<unsigned, const TargetRegisterClass*>::iterator
+ I = OneClassForEachPhysReg.begin(), E = OneClassForEachPhysReg.end();
+ I != E; ++I)
+ for (const unsigned *AS = tri_->getAliasSet(I->first); *AS; ++AS)
+ RelatedRegClasses.unionSets(I->second, OneClassForEachPhysReg[*AS]);
+}
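+
+// Concrete effect, taking x86 as an example: AX belongs to GR16 and aliases
+// EAX in GR32, so the alias pass above unions GR16 and GR32 into one related
+// set; later leader comparisons then treat intervals in either class as able
+// to interfere with each other.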
+
+/// attemptTrivialCoalescing - If a simple interval is defined by a copy,
+/// try to allocate the definition the same register as the source register,
+/// if that register is not redefined during the live time of the interval.
+/// This eliminates a copy. This is used to coalesce copies which were not
+/// coalesced away before allocation either due to dest and src being in
+/// different register classes or because the coalescer was overly
+/// conservative.
+unsigned RALinScan::attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg) {
+ if ((cur.preference && cur.preference == Reg) || !cur.containsOneValue())
+ return Reg;
+
+ VNInfo *vni = cur.begin()->valno;
+ if (!vni->def || vni->def == ~1U || vni->def == ~0U)
+ return Reg;
+ MachineInstr *CopyMI = li_->getInstructionFromIndex(vni->def);
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg, PhysReg;
+ if (!CopyMI ||
+ !tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+ return Reg;
+ PhysReg = SrcReg;
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (!vrm_->isAssignedReg(SrcReg))
+ return Reg;
+ PhysReg = vrm_->getPhys(SrcReg);
+ }
+ if (Reg == PhysReg)
+ return Reg;
+
+ const TargetRegisterClass *RC = mri_->getRegClass(cur.reg);
+ if (!RC->contains(PhysReg))
+ return Reg;
+
+ // Try to coalesce.
+ if (!li_->conflictsWithPhysRegDef(cur, *vrm_, PhysReg)) {
+ DOUT << "Coalescing: " << cur << " -> " << tri_->getName(PhysReg)
+ << '\n';
+ vrm_->clearVirt(cur.reg);
+ vrm_->assignVirt2Phys(cur.reg, PhysReg);
+
+ // Remove unnecessary kills since a copy does not clobber the register.
+ if (li_->hasInterval(SrcReg)) {
+ LiveInterval &SrcLI = li_->getInterval(SrcReg);
+ for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(cur.reg),
+ E = mri_->reg_end(); I != E; ++I) {
+ MachineOperand &O = I.getOperand();
+ if (!O.isUse() || !O.isKill())
+ continue;
+ MachineInstr *MI = &*I;
+ if (SrcLI.liveAt(li_->getDefIndex(li_->getInstructionIndex(MI))))
+ O.setIsKill(false);
+ }
+ }
+
+ ++NumCoalesce;
+ return SrcReg;
+ }
+
+ return Reg;
+}
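+
+// The shape this targets, sketched in pseudo machine code (register numbers
+// hypothetical):
+//
+//   %reg1024 = MOV32rr %EAX      ; cur is defined by a single copy
+//   ...                          ; EAX has no conflicting def while
+//   ... = use %reg1024           ; %reg1024 is live
+//
+// Assigning %reg1024 to EAX turns the MOV into an identity copy that can
+// later be deleted.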
+
+bool RALinScan::runOnMachineFunction(MachineFunction &fn) {
+ mf_ = &fn;
+ mri_ = &fn.getRegInfo();
+ tm_ = &fn.getTarget();
+ tri_ = tm_->getRegisterInfo();
+ tii_ = tm_->getInstrInfo();
+ allocatableRegs_ = tri_->getAllocatableSet(fn);
+ li_ = &getAnalysis<LiveIntervals>();
+ ls_ = &getAnalysis<LiveStacks>();
+ loopInfo = &getAnalysis<MachineLoopInfo>();
+
+ // We don't run the coalescer here because we have no reason to
+ // interact with it. If the coalescer requires interaction, it
+ // won't do anything. If it doesn't require interaction, we assume
+ // it was run as a separate pass.
+
+ // If this is the first function compiled, compute the related reg classes.
+ if (RelatedRegClasses.empty())
+ ComputeRelatedRegClasses();
+
+ // Also resize register usage trackers.
+ initRegUses();
+
+ vrm_ = &getAnalysis<VirtRegMap>();
+ if (!rewriter_.get()) rewriter_.reset(createVirtRegRewriter());
+
+ if (NewSpillFramework) {
+ spiller_.reset(createSpiller(mf_, li_, ls_, vrm_));
+ }
+
+ initIntervalSets();
+
+ linearScan();
+
+ if (NewSpillFramework) {
+ bool allocValid = validateRegAlloc(mf_, li_, vrm_);
+ }
+
+ // Rewrite spill code and update the PhysRegsUsed set.
+ rewriter_->runOnMachineFunction(*mf_, *vrm_, li_);
+
+ assert(unhandled_.empty() && "Unhandled live intervals remain!");
+
+ finalizeRegUses();
+
+ fixed_.clear();
+ active_.clear();
+ inactive_.clear();
+ handled_.clear();
+ NextReloadMap.clear();
+ DowngradedRegs.clear();
+ DowngradeMap.clear();
+ spiller_.reset(0);
+
+ return true;
+}
+
+/// initIntervalSets - initialize the interval sets.
+///
+void RALinScan::initIntervalSets()
+{
+ assert(unhandled_.empty() && fixed_.empty() &&
+ active_.empty() && inactive_.empty() &&
+ "interval sets should be empty on initialization");
+
+ handled_.reserve(li_->getNumIntervals());
+
+ for (LiveIntervals::iterator i = li_->begin(), e = li_->end(); i != e; ++i) {
+ if (TargetRegisterInfo::isPhysicalRegister(i->second->reg)) {
+ mri_->setPhysRegUsed(i->second->reg);
+ fixed_.push_back(std::make_pair(i->second, i->second->begin()));
+ } else
+ unhandled_.push(i->second);
+ }
+}
+
+void RALinScan::linearScan()
+{
+ // linear scan algorithm
+ DOUT << "********** LINEAR SCAN **********\n";
+ DOUT << "********** Function: " << mf_->getFunction()->getName() << '\n';
+
+ DEBUG(printIntervals("fixed", fixed_.begin(), fixed_.end()));
+
+ while (!unhandled_.empty()) {
+ // pick the interval with the earliest start point
+ LiveInterval* cur = unhandled_.top();
+ unhandled_.pop();
+ ++NumIters;
+ DOUT << "\n*** CURRENT ***: " << *cur << '\n';
+
+ if (!cur->empty()) {
+ processActiveIntervals(cur->beginNumber());
+ processInactiveIntervals(cur->beginNumber());
+
+ assert(TargetRegisterInfo::isVirtualRegister(cur->reg) &&
+ "Can only allocate virtual registers!");
+ }
+
+    // Allocating a virtual register: try to find a free
+    // physical register, or spill an interval (possibly this one) in order to
+ // assign it one.
+ assignRegOrStackSlotAtInterval(cur);
+
+ DEBUG(printIntervals("active", active_.begin(), active_.end()));
+ DEBUG(printIntervals("inactive", inactive_.begin(), inactive_.end()));
+ }
+
+ // Expire any remaining active intervals
+ while (!active_.empty()) {
+ IntervalPtr &IP = active_.back();
+ unsigned reg = IP.first->reg;
+ DOUT << "\tinterval " << *IP.first << " expired\n";
+ assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+ "Can only allocate virtual registers!");
+ reg = vrm_->getPhys(reg);
+ delRegUse(reg);
+ active_.pop_back();
+ }
+
+ // Expire any remaining inactive intervals
+ DEBUG(for (IntervalPtrs::reverse_iterator
+ i = inactive_.rbegin(); i != inactive_.rend(); ++i)
+ DOUT << "\tinterval " << *i->first << " expired\n");
+ inactive_.clear();
+
+ // Add live-ins to every BB except for entry. Also perform trivial coalescing.
+ MachineFunction::iterator EntryMBB = mf_->begin();
+ SmallVector<MachineBasicBlock*, 8> LiveInMBBs;
+ for (LiveIntervals::iterator i = li_->begin(), e = li_->end(); i != e; ++i) {
+ LiveInterval &cur = *i->second;
+ unsigned Reg = 0;
+ bool isPhys = TargetRegisterInfo::isPhysicalRegister(cur.reg);
+ if (isPhys)
+ Reg = cur.reg;
+ else if (vrm_->isAssignedReg(cur.reg))
+ Reg = attemptTrivialCoalescing(cur, vrm_->getPhys(cur.reg));
+ if (!Reg)
+ continue;
+    // Ignore split live intervals.
+ if (!isPhys && vrm_->getPreSplitReg(cur.reg))
+ continue;
+ for (LiveInterval::Ranges::const_iterator I = cur.begin(), E = cur.end();
+ I != E; ++I) {
+ const LiveRange &LR = *I;
+ if (li_->findLiveInMBBs(LR.start, LR.end, LiveInMBBs)) {
+ for (unsigned i = 0, e = LiveInMBBs.size(); i != e; ++i)
+ if (LiveInMBBs[i] != EntryMBB)
+ LiveInMBBs[i]->addLiveIn(Reg);
+ LiveInMBBs.clear();
+ }
+ }
+ }
+
+ DOUT << *vrm_;
+
+ // Look for physical registers that end up not being allocated even though
+  // the register allocator had to spill other registers in their class.
+ if (ls_->getNumIntervals() == 0)
+ return;
+ if (!vrm_->FindUnusedRegisters(tri_, li_))
+ return;
+}
+
+/// processActiveIntervals - expire old intervals and move non-overlapping ones
+/// to the inactive list.
+void RALinScan::processActiveIntervals(unsigned CurPoint)
+{
+ DOUT << "\tprocessing active intervals:\n";
+
+ for (unsigned i = 0, e = active_.size(); i != e; ++i) {
+ LiveInterval *Interval = active_[i].first;
+ LiveInterval::iterator IntervalPos = active_[i].second;
+ unsigned reg = Interval->reg;
+
+ IntervalPos = Interval->advanceTo(IntervalPos, CurPoint);
+
+ if (IntervalPos == Interval->end()) { // Remove expired intervals.
+ DOUT << "\t\tinterval " << *Interval << " expired\n";
+ assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+ "Can only allocate virtual registers!");
+ reg = vrm_->getPhys(reg);
+ delRegUse(reg);
+
+ // Pop off the end of the list.
+ active_[i] = active_.back();
+ active_.pop_back();
+ --i; --e;
+
+ } else if (IntervalPos->start > CurPoint) {
+ // Move inactive intervals to inactive list.
+ DOUT << "\t\tinterval " << *Interval << " inactive\n";
+ assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+ "Can only allocate virtual registers!");
+ reg = vrm_->getPhys(reg);
+ delRegUse(reg);
+ // add to inactive.
+ inactive_.push_back(std::make_pair(Interval, IntervalPos));
+
+ // Pop off the end of the list.
+ active_[i] = active_.back();
+ active_.pop_back();
+ --i; --e;
+ } else {
+ // Otherwise, just update the iterator position.
+ active_[i].second = IntervalPos;
+ }
+ }
+}
+
+/// processInactiveIntervals - expire old intervals and move overlapping
+/// ones to the active list.
+void RALinScan::processInactiveIntervals(unsigned CurPoint)
+{
+ DOUT << "\tprocessing inactive intervals:\n";
+
+ for (unsigned i = 0, e = inactive_.size(); i != e; ++i) {
+ LiveInterval *Interval = inactive_[i].first;
+ LiveInterval::iterator IntervalPos = inactive_[i].second;
+ unsigned reg = Interval->reg;
+
+ IntervalPos = Interval->advanceTo(IntervalPos, CurPoint);
+
+ if (IntervalPos == Interval->end()) { // remove expired intervals.
+ DOUT << "\t\tinterval " << *Interval << " expired\n";
+
+ // Pop off the end of the list.
+ inactive_[i] = inactive_.back();
+ inactive_.pop_back();
+ --i; --e;
+ } else if (IntervalPos->start <= CurPoint) {
+ // move re-activated intervals in active list
+ DOUT << "\t\tinterval " << *Interval << " active\n";
+ assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+ "Can only allocate virtual registers!");
+ reg = vrm_->getPhys(reg);
+ addRegUse(reg);
+ // add to active
+ active_.push_back(std::make_pair(Interval, IntervalPos));
+
+ // Pop off the end of the list.
+ inactive_[i] = inactive_.back();
+ inactive_.pop_back();
+ --i; --e;
+ } else {
+ // Otherwise, just update the iterator position.
+ inactive_[i].second = IntervalPos;
+ }
+ }
+}
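+
+// Taken together, the two routines above implement these transitions at
+// scan point CurPoint:
+//
+//   active   -> expired   : interval ended before CurPoint    (delRegUse)
+//   active   -> inactive  : next range starts after CurPoint  (delRegUse)
+//   inactive -> expired   : interval ended before CurPoint
+//   inactive -> active    : current range covers CurPoint     (addRegUse)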
+
+/// updateSpillWeights - adds the given weight to the spill weights of the
+/// specified physical register and its aliases.
+void RALinScan::updateSpillWeights(std::vector<float> &Weights,
+ unsigned reg, float weight,
+ const TargetRegisterClass *RC) {
+ SmallSet<unsigned, 4> Processed;
+ SmallSet<unsigned, 4> SuperAdded;
+ SmallVector<unsigned, 4> Supers;
+ Weights[reg] += weight;
+ Processed.insert(reg);
+ for (const unsigned* as = tri_->getAliasSet(reg); *as; ++as) {
+ Weights[*as] += weight;
+ Processed.insert(*as);
+ if (tri_->isSubRegister(*as, reg) &&
+ SuperAdded.insert(*as) &&
+ RC->contains(*as)) {
+ Supers.push_back(*as);
+ }
+ }
+
+ // If the alias is a super-register, and the super-register is in the
+  // register class we are trying to allocate, then add the weight to all
+  // sub-registers of the super-register, even if they are not aliases.
+  // E.g. when allocating for GR32 with bh not used, update bl's spill weight
+  // too: bl should get the same spill weight, otherwise it will be chosen
+  // as a spill candidate, since spilling bh doesn't make ebx available.
+ for (unsigned i = 0, e = Supers.size(); i != e; ++i) {
+ for (const unsigned *sr = tri_->getSubRegisters(Supers[i]); *sr; ++sr)
+ if (!Processed.count(*sr))
+ Weights[*sr] += weight;
+ }
+}
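+
+// Worked example on x86 (a sketch of the case the comment above describes):
+// updateSpillWeights(W, BH, w, GR32) bumps BH plus its aliases BX and EBX;
+// EBX is a super-register contained in GR32, so its remaining sub-register
+// BL also receives w, keeping BL and BH equally attractive spill candidates.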
+
+static
+RALinScan::IntervalPtrs::iterator
+FindIntervalInVector(RALinScan::IntervalPtrs &IP, LiveInterval *LI) {
+ for (RALinScan::IntervalPtrs::iterator I = IP.begin(), E = IP.end();
+ I != E; ++I)
+ if (I->first == LI) return I;
+ return IP.end();
+}
+
+static void RevertVectorIteratorsTo(RALinScan::IntervalPtrs &V, unsigned Point){
+ for (unsigned i = 0, e = V.size(); i != e; ++i) {
+ RALinScan::IntervalPtr &IP = V[i];
+ LiveInterval::iterator I = std::upper_bound(IP.first->begin(),
+ IP.second, Point);
+ if (I != IP.first->begin()) --I;
+ IP.second = I;
+ }
+}
+
+/// addStackInterval - Create a LiveInterval for stack if the specified live
+/// interval has been spilled.
+static void addStackInterval(LiveInterval *cur, LiveStacks *ls_,
+ LiveIntervals *li_,
+ MachineRegisterInfo* mri_, VirtRegMap &vrm_) {
+ int SS = vrm_.getStackSlot(cur->reg);
+ if (SS == VirtRegMap::NO_STACK_SLOT)
+ return;
+
+ const TargetRegisterClass *RC = mri_->getRegClass(cur->reg);
+ LiveInterval &SI = ls_->getOrCreateInterval(SS, RC);
+
+ VNInfo *VNI;
+ if (SI.hasAtLeastOneValue())
+ VNI = SI.getValNumInfo(0);
+ else
+ VNI = SI.getNextValue(~0U, 0, ls_->getVNInfoAllocator());
+
+ LiveInterval &RI = li_->getInterval(cur->reg);
+ // FIXME: This may be overly conservative.
+ SI.MergeRangesInAsValue(RI, VNI);
+}
+
+/// getConflictWeight - Return the number of conflicts between cur
+/// live interval and the defs and uses of Reg, weighted by loop depth.
+static
+float getConflictWeight(LiveInterval *cur, unsigned Reg, LiveIntervals *li_,
+ MachineRegisterInfo *mri_,
+ const MachineLoopInfo *loopInfo) {
+ float Conflicts = 0;
+ for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(Reg),
+ E = mri_->reg_end(); I != E; ++I) {
+ MachineInstr *MI = &*I;
+ if (cur->liveAt(li_->getInstructionIndex(MI))) {
+ unsigned loopDepth = loopInfo->getLoopDepth(MI->getParent());
+ Conflicts += powf(10.0f, (float)loopDepth);
+ }
+ }
+ return Conflicts;
+}
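+
+// Worked example: a def or use of Reg inside a loop nest two deep that
+// overlaps cur contributes powf(10, 2) = 100.0, so a single conflict in a
+// hot inner loop outweighs many conflicts in straight-line code (depth 0,
+// 1.0 each).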
+
+/// findIntervalsToSpill - Determine the intervals to spill for the
+/// specified interval. It's passed the physical registers whose spill
+/// weight is the lowest among all the registers whose live intervals
+/// conflict with the interval.
+void RALinScan::findIntervalsToSpill(LiveInterval *cur,
+ std::vector<std::pair<unsigned,float> > &Candidates,
+ unsigned NumCands,
+ SmallVector<LiveInterval*, 8> &SpillIntervals) {
+ // We have figured out the *best* register to spill. But there are other
+  // registers that are pretty good as well (see weightsAreClose). Spill
+ // the one that has fewest defs and uses that conflict with cur.
+ float Conflicts[3] = { 0.0f, 0.0f, 0.0f };
+ SmallVector<LiveInterval*, 8> SLIs[3];
+
+ DOUT << "\tConsidering " << NumCands << " candidates: ";
+ DEBUG(for (unsigned i = 0; i != NumCands; ++i)
+ DOUT << tri_->getName(Candidates[i].first) << " ";
+ DOUT << "\n";);
+
+ // Calculate the number of conflicts of each candidate.
+ for (IntervalPtrs::iterator i = active_.begin(); i != active_.end(); ++i) {
+ unsigned Reg = i->first->reg;
+ unsigned PhysReg = vrm_->getPhys(Reg);
+ if (!cur->overlapsFrom(*i->first, i->second))
+ continue;
+ for (unsigned j = 0; j < NumCands; ++j) {
+ unsigned Candidate = Candidates[j].first;
+ if (tri_->regsOverlap(PhysReg, Candidate)) {
+ if (NumCands > 1)
+ Conflicts[j] += getConflictWeight(cur, Reg, li_, mri_, loopInfo);
+ SLIs[j].push_back(i->first);
+ }
+ }
+ }
+
+ for (IntervalPtrs::iterator i = inactive_.begin(); i != inactive_.end(); ++i){
+ unsigned Reg = i->first->reg;
+ unsigned PhysReg = vrm_->getPhys(Reg);
+ if (!cur->overlapsFrom(*i->first, i->second-1))
+ continue;
+ for (unsigned j = 0; j < NumCands; ++j) {
+ unsigned Candidate = Candidates[j].first;
+ if (tri_->regsOverlap(PhysReg, Candidate)) {
+ if (NumCands > 1)
+ Conflicts[j] += getConflictWeight(cur, Reg, li_, mri_, loopInfo);
+ SLIs[j].push_back(i->first);
+ }
+ }
+ }
+
+ // Which is the best candidate?
+ unsigned BestCandidate = 0;
+ float MinConflicts = Conflicts[0];
+ for (unsigned i = 1; i != NumCands; ++i) {
+ if (Conflicts[i] < MinConflicts) {
+ BestCandidate = i;
+ MinConflicts = Conflicts[i];
+ }
+ }
+
+ std::copy(SLIs[BestCandidate].begin(), SLIs[BestCandidate].end(),
+ std::back_inserter(SpillIntervals));
+}
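+
+// Hypothetical run: candidates EBX (2.02), ESI (2.03), EDI (2.05) are all
+// close in weight, so conflict counts break the tie -- if EBX's overlapping
+// intervals conflict with cur's defs/uses for a weight of 100.0 but ESI's
+// only 10.0, the intervals recorded against ESI are returned for spilling.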
+
+namespace {
+ struct WeightCompare {
+ typedef std::pair<unsigned, float> RegWeightPair;
+ bool operator()(const RegWeightPair &LHS, const RegWeightPair &RHS) const {
+ return LHS.second < RHS.second;
+ }
+ };
+}
+
+static bool weightsAreClose(float w1, float w2) {
+ if (!NewHeuristic)
+ return false;
+
+ float diff = w1 - w2;
+ if (diff <= 0.02f) // Within 0.02f
+ return true;
+ return (diff / w2) <= 0.05f; // Within 5%.
+}
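+
+// Worked example (only reachable with -new-spilling-heuristic): w1 = 1.04
+// and w2 = 1.00 give diff = 0.04, which fails the 0.02 absolute test but
+// passes the relative one (0.04 / 1.00 = 4% <= 5%), so the weights count
+// as close.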
+
+LiveInterval *RALinScan::hasNextReloadInterval(LiveInterval *cur) {
+ DenseMap<unsigned, unsigned>::iterator I = NextReloadMap.find(cur->reg);
+ if (I == NextReloadMap.end())
+ return 0;
+ return &li_->getInterval(I->second);
+}
+
+void RALinScan::DowngradeRegister(LiveInterval *li, unsigned Reg) {
+ bool isNew = DowngradedRegs.insert(Reg);
+ isNew = isNew; // Silence compiler warning.
+ assert(isNew && "Multiple reloads holding the same register?");
+ DowngradeMap.insert(std::make_pair(li->reg, Reg));
+ for (const unsigned *AS = tri_->getAliasSet(Reg); *AS; ++AS) {
+ isNew = DowngradedRegs.insert(*AS);
+ isNew = isNew; // Silence compiler warning.
+ assert(isNew && "Multiple reloads holding the same register?");
+ DowngradeMap.insert(std::make_pair(li->reg, *AS));
+ }
+ ++NumDowngrade;
+}
+
+void RALinScan::UpgradeRegister(unsigned Reg) {
+ if (Reg) {
+ DowngradedRegs.erase(Reg);
+ for (const unsigned *AS = tri_->getAliasSet(Reg); *AS; ++AS)
+ DowngradedRegs.erase(*AS);
+ }
+}
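+
+// Scenario the downgrade machinery serves, sketched with hypothetical vregs:
+//
+//   %reg2000 = reload from SS   ; allocated EBX; EBX is then downgraded
+//   ...
+//   %reg2001 = reload from SS   ; NextReloadMap maps 2000 -> 2001 and
+//                               ; %reg2001's preference is set to EBX
+//
+// Keeping EBX un-favored in between raises the odds that both reloads land
+// in EBX with EBX unclobbered, so the reloaded value can be reused.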
+
+namespace {
+ struct LISorter {
+ bool operator()(LiveInterval* A, LiveInterval* B) {
+ return A->beginNumber() < B->beginNumber();
+ }
+ };
+}
+
+/// assignRegOrStackSlotAtInterval - assign a register if one is available, or
+/// spill.
+void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur)
+{
+ DOUT << "\tallocating current interval: ";
+
+ // This is an implicitly defined live interval, just assign any register.
+ const TargetRegisterClass *RC = mri_->getRegClass(cur->reg);
+ if (cur->empty()) {
+ unsigned physReg = cur->preference;
+ if (!physReg)
+ physReg = *RC->allocation_order_begin(*mf_);
+ DOUT << tri_->getName(physReg) << '\n';
+ // Note the register is not really in use.
+ vrm_->assignVirt2Phys(cur->reg, physReg);
+ return;
+ }
+
+ backUpRegUses();
+
+ std::vector<std::pair<unsigned, float> > SpillWeightsToAdd;
+ unsigned StartPosition = cur->beginNumber();
+ const TargetRegisterClass *RCLeader = RelatedRegClasses.getLeaderValue(RC);
+
+ // If start of this live interval is defined by a move instruction and its
+ // source is assigned a physical register that is compatible with the target
+ // register class, then we should try to assign it the same register.
+ // This can happen when the move is from a larger register class to a smaller
+ // one, e.g. X86::mov32to32_. These move instructions are not coalescable.
+ if (!cur->preference && cur->hasAtLeastOneValue()) {
+ VNInfo *vni = cur->begin()->valno;
+ if (vni->def && vni->def != ~1U && vni->def != ~0U) {
+ MachineInstr *CopyMI = li_->getInstructionFromIndex(vni->def);
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (CopyMI &&
+ tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg)) {
+ unsigned Reg = 0;
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg))
+ Reg = SrcReg;
+ else if (vrm_->isAssignedReg(SrcReg))
+ Reg = vrm_->getPhys(SrcReg);
+ if (Reg) {
+ if (SrcSubReg)
+ Reg = tri_->getSubReg(Reg, SrcSubReg);
+ if (DstSubReg)
+ Reg = tri_->getMatchingSuperReg(Reg, DstSubReg, RC);
+ if (Reg && allocatableRegs_[Reg] && RC->contains(Reg))
+ cur->preference = Reg;
+ }
+ }
+ }
+ }
+
+ // For every interval in inactive we overlap with, mark the
+ // register as not free and update spill weights.
+ for (IntervalPtrs::const_iterator i = inactive_.begin(),
+ e = inactive_.end(); i != e; ++i) {
+ unsigned Reg = i->first->reg;
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+ "Can only allocate virtual registers!");
+ const TargetRegisterClass *RegRC = mri_->getRegClass(Reg);
+ // If this is not in a related reg class to the register we're allocating,
+ // don't check it.
+ if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader &&
+ cur->overlapsFrom(*i->first, i->second-1)) {
+ Reg = vrm_->getPhys(Reg);
+ addRegUse(Reg);
+ SpillWeightsToAdd.push_back(std::make_pair(Reg, i->first->weight));
+ }
+ }
+
+ // Speculatively check to see if we can get a register right now. If not,
+ // we know we won't be able to by adding more constraints. If so, we can
+ // check to see if it is valid. Doing an exhaustive search of the fixed_ list
+ // is very bad (it contains all callee clobbered registers for any functions
+ // with a call), so we want to avoid doing that if possible.
+ unsigned physReg = getFreePhysReg(cur);
+ unsigned BestPhysReg = physReg;
+ if (physReg) {
+ // We got a register. However, if it's in the fixed_ list, we might
+ // conflict with it. Check to see if we conflict with it or any of its
+ // aliases.
+ SmallSet<unsigned, 8> RegAliases;
+ for (const unsigned *AS = tri_->getAliasSet(physReg); *AS; ++AS)
+ RegAliases.insert(*AS);
+
+ bool ConflictsWithFixed = false;
+ for (unsigned i = 0, e = fixed_.size(); i != e; ++i) {
+ IntervalPtr &IP = fixed_[i];
+ if (physReg == IP.first->reg || RegAliases.count(IP.first->reg)) {
+ // Okay, this reg is on the fixed list. Check to see if we actually
+ // conflict.
+ LiveInterval *I = IP.first;
+ if (I->endNumber() > StartPosition) {
+ LiveInterval::iterator II = I->advanceTo(IP.second, StartPosition);
+ IP.second = II;
+ if (II != I->begin() && II->start > StartPosition)
+ --II;
+ if (cur->overlapsFrom(*I, II)) {
+ ConflictsWithFixed = true;
+ break;
+ }
+ }
+ }
+ }
+
+ // Okay, the register picked by our speculative getFreePhysReg call turned
+ // out to be in use. Actually add all of the conflicting fixed registers to
+ // regUse_ so we can do an accurate query.
+ if (ConflictsWithFixed) {
+ // For every interval in fixed we overlap with, mark the register as not
+ // free and update spill weights.
+ for (unsigned i = 0, e = fixed_.size(); i != e; ++i) {
+ IntervalPtr &IP = fixed_[i];
+ LiveInterval *I = IP.first;
+
+ const TargetRegisterClass *RegRC = OneClassForEachPhysReg[I->reg];
+ if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader &&
+ I->endNumber() > StartPosition) {
+ LiveInterval::iterator II = I->advanceTo(IP.second, StartPosition);
+ IP.second = II;
+ if (II != I->begin() && II->start > StartPosition)
+ --II;
+ if (cur->overlapsFrom(*I, II)) {
+ unsigned reg = I->reg;
+ addRegUse(reg);
+ SpillWeightsToAdd.push_back(std::make_pair(reg, I->weight));
+ }
+ }
+ }
+
+ // Using the newly updated regUse_ object, which includes conflicts in the
+ // future, see if there are any registers available.
+ physReg = getFreePhysReg(cur);
+ }
+ }
+
+ // Restore the physical register tracker, removing information about the
+ // future.
+ restoreRegUses();
+
+ // If we find a free register, we are done: assign this virtual to
+ // the free physical register and add this interval to the active
+ // list.
+ if (physReg) {
+ DOUT << tri_->getName(physReg) << '\n';
+ vrm_->assignVirt2Phys(cur->reg, physReg);
+ addRegUse(physReg);
+ active_.push_back(std::make_pair(cur, cur->begin()));
+ handled_.push_back(cur);
+
+ // "Upgrade" the physical register since it has been allocated.
+ UpgradeRegister(physReg);
+ if (LiveInterval *NextReloadLI = hasNextReloadInterval(cur)) {
+ // "Downgrade" physReg to try to keep physReg from being allocated until
+ // the next reload from the same SS is allocated.
+ NextReloadLI->preference = physReg;
+ DowngradeRegister(cur, physReg);
+ }
+ return;
+ }
+ DOUT << "no free registers\n";
+
+ // Compile the spill weights into an array that is better for scanning.
+ std::vector<float> SpillWeights(tri_->getNumRegs(), 0.0f);
+ for (std::vector<std::pair<unsigned, float> >::iterator
+ I = SpillWeightsToAdd.begin(), E = SpillWeightsToAdd.end(); I != E; ++I)
+ updateSpillWeights(SpillWeights, I->first, I->second, RC);
+
+ // for each interval in active, update spill weights.
+ for (IntervalPtrs::const_iterator i = active_.begin(), e = active_.end();
+ i != e; ++i) {
+ unsigned reg = i->first->reg;
+ assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+ "Can only allocate virtual registers!");
+ reg = vrm_->getPhys(reg);
+ updateSpillWeights(SpillWeights, reg, i->first->weight, RC);
+ }
+
+ DOUT << "\tassigning stack slot at interval "<< *cur << ":\n";
+
+ // Find a register to spill.
+ float minWeight = HUGE_VALF;
+  unsigned minReg = 0; /*cur->preference*/ // Try the pref register first.
+
+ bool Found = false;
+ std::vector<std::pair<unsigned,float> > RegsWeights;
+ if (!minReg || SpillWeights[minReg] == HUGE_VALF)
+ for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_),
+ e = RC->allocation_order_end(*mf_); i != e; ++i) {
+ unsigned reg = *i;
+ float regWeight = SpillWeights[reg];
+ if (minWeight > regWeight)
+ Found = true;
+ RegsWeights.push_back(std::make_pair(reg, regWeight));
+ }
+
+ // If we didn't find a register that is spillable, try aliases?
+ if (!Found) {
+ for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_),
+ e = RC->allocation_order_end(*mf_); i != e; ++i) {
+ unsigned reg = *i;
+      // No need to worry about whether the alias register size < regsize of RC.
+ // We are going to spill all registers that alias it anyway.
+ for (const unsigned* as = tri_->getAliasSet(reg); *as; ++as)
+ RegsWeights.push_back(std::make_pair(*as, SpillWeights[*as]));
+ }
+ }
+
+ // Sort all potential spill candidates by weight.
+ std::sort(RegsWeights.begin(), RegsWeights.end(), WeightCompare());
+ minReg = RegsWeights[0].first;
+ minWeight = RegsWeights[0].second;
+ if (minWeight == HUGE_VALF) {
+ // All registers must have inf weight. Just grab one!
+ minReg = BestPhysReg ? BestPhysReg : *RC->allocation_order_begin(*mf_);
+ if (cur->weight == HUGE_VALF ||
+ li_->getApproximateInstructionCount(*cur) == 0) {
+ // Spill a physical register around defs and uses.
+ if (li_->spillPhysRegAroundRegDefsUses(*cur, minReg, *vrm_)) {
+ // spillPhysRegAroundRegDefsUses may have invalidated iterator stored
+ // in fixed_. Reset them.
+ for (unsigned i = 0, e = fixed_.size(); i != e; ++i) {
+ IntervalPtr &IP = fixed_[i];
+ LiveInterval *I = IP.first;
+ if (I->reg == minReg || tri_->isSubRegister(minReg, I->reg))
+ IP.second = I->advanceTo(I->begin(), StartPosition);
+ }
+
+ DowngradedRegs.clear();
+ assignRegOrStackSlotAtInterval(cur);
+ } else {
+ cerr << "Ran out of registers during register allocation!\n";
+ exit(1);
+ }
+ return;
+ }
+ }
+
+ // Find up to 3 registers to consider as spill candidates.
+ unsigned LastCandidate = RegsWeights.size() >= 3 ? 3 : 1;
+ while (LastCandidate > 1) {
+ if (weightsAreClose(RegsWeights[LastCandidate-1].second, minWeight))
+ break;
+ --LastCandidate;
+ }
+
+ DOUT << "\t\tregister(s) with min weight(s): ";
+ DEBUG(for (unsigned i = 0; i != LastCandidate; ++i)
+ DOUT << tri_->getName(RegsWeights[i].first)
+ << " (" << RegsWeights[i].second << ")\n");
+
+  // If the current interval has the minimum weight, we need to spill it,
+  // add any intervals created by the spill back to unhandled, and restart
+  // linearscan.
+ if (cur->weight != HUGE_VALF && cur->weight <= minWeight) {
+ DOUT << "\t\t\tspilling(c): " << *cur << '\n';
+ SmallVector<LiveInterval*, 8> spillIs;
+ std::vector<LiveInterval*> added;
+
+ if (!NewSpillFramework) {
+ added = li_->addIntervalsForSpills(*cur, spillIs, loopInfo, *vrm_);
+ } else {
+ added = spiller_->spill(cur);
+ }
+
+ std::sort(added.begin(), added.end(), LISorter());
+ addStackInterval(cur, ls_, li_, mri_, *vrm_);
+ if (added.empty())
+ return; // Early exit if all spills were folded.
+
+ // Merge added with unhandled. Note that we have already sorted
+ // intervals returned by addIntervalsForSpills by their starting
+ // point.
+    // This also updates the NextReloadMap. That is, it adds a mapping from a
+ // register defined by a reload from SS to the next reload from SS in the
+ // same basic block.
+ MachineBasicBlock *LastReloadMBB = 0;
+ LiveInterval *LastReload = 0;
+ int LastReloadSS = VirtRegMap::NO_STACK_SLOT;
+ for (unsigned i = 0, e = added.size(); i != e; ++i) {
+ LiveInterval *ReloadLi = added[i];
+ if (ReloadLi->weight == HUGE_VALF &&
+ li_->getApproximateInstructionCount(*ReloadLi) == 0) {
+ unsigned ReloadIdx = ReloadLi->beginNumber();
+ MachineBasicBlock *ReloadMBB = li_->getMBBFromIndex(ReloadIdx);
+ int ReloadSS = vrm_->getStackSlot(ReloadLi->reg);
+ if (LastReloadMBB == ReloadMBB && LastReloadSS == ReloadSS) {
+ // Last reload of same SS is in the same MBB. We want to try to
+ // allocate both reloads the same register and make sure the reg
+ // isn't clobbered in between if at all possible.
+ assert(LastReload->beginNumber() < ReloadIdx);
+ NextReloadMap.insert(std::make_pair(LastReload->reg, ReloadLi->reg));
+ }
+ LastReloadMBB = ReloadMBB;
+ LastReload = ReloadLi;
+ LastReloadSS = ReloadSS;
+ }
+ unhandled_.push(ReloadLi);
+ }
+ return;
+ }
+
+ ++NumBacktracks;
+
+ // Push the current interval back to unhandled since we are going
+  // to re-run at least this iteration. Since we didn't modify it, it
+  // should go back right at the front of the list.
+ unhandled_.push(cur);
+
+ assert(TargetRegisterInfo::isPhysicalRegister(minReg) &&
+ "did not choose a register to spill?");
+
+  // We spill all intervals aliasing the register with
+  // minimum weight, roll back to the interval with the earliest
+  // start point, and let the linear scan algorithm run again.
+ SmallVector<LiveInterval*, 8> spillIs;
+
+ // Determine which intervals have to be spilled.
+ findIntervalsToSpill(cur, RegsWeights, LastCandidate, spillIs);
+
+ // Set of spilled vregs (used later to rollback properly)
+ SmallSet<unsigned, 8> spilled;
+
+  // The earliest start of a spilled interval indicates up to where in
+  // handled_ we need to roll back.
+
+ unsigned earliestStart = cur->beginNumber();
+ LiveInterval *earliestStartInterval = cur;
+
+ // Spill live intervals of virtual regs mapped to the physical register we
+ // want to clear (and its aliases). We only spill those that overlap with the
+  // current interval, as the rest do not affect its allocation. We also keep
+ // track of the earliest start of all spilled live intervals since this will
+ // mark our rollback point.
+ std::vector<LiveInterval*> added;
+ while (!spillIs.empty()) {
+ bool epicFail = false;
+ LiveInterval *sli = spillIs.back();
+ spillIs.pop_back();
+ DOUT << "\t\t\tspilling(a): " << *sli << '\n';
+ earliestStart = std::min(earliestStart, sli->beginNumber());
+ earliestStartInterval =
+ (earliestStartInterval->beginNumber() < sli->beginNumber()) ?
+ earliestStartInterval : sli;
+
+ if (earliestStartInterval->beginNumber()!=earliestStart) {
+ epicFail |= true;
+ std::cerr << "What the 1 - "
+ << "earliestStart = " << earliestStart
+ << "earliestStartInterval = " << earliestStartInterval->beginNumber()
+ << "\n";
+ }
+
+ std::vector<LiveInterval*> newIs;
+ if (!NewSpillFramework) {
+ newIs = li_->addIntervalsForSpills(*sli, spillIs, loopInfo, *vrm_);
+ } else {
+ newIs = spiller_->spill(sli);
+ }
+ addStackInterval(sli, ls_, li_, mri_, *vrm_);
+ std::copy(newIs.begin(), newIs.end(), std::back_inserter(added));
+ spilled.insert(sli->reg);
+
+ if (earliestStartInterval->beginNumber()!=earliestStart) {
+ epicFail |= true;
+ std::cerr << "What the 2 - "
+ << "earliestStart = " << earliestStart
+ << "earliestStartInterval = " << earliestStartInterval->beginNumber()
+ << "\n";
+ }
+
+ if (epicFail) {
+ //abort();
+ }
+ }
+
+ earliestStart = earliestStartInterval->beginNumber();
+
+ DOUT << "\t\trolling back to: " << earliestStart << '\n';
+
+ // Scan handled in reverse order up to the earliest start of a
+ // spilled live interval and undo each one, restoring the state of
+ // unhandled.
+ while (!handled_.empty()) {
+ LiveInterval* i = handled_.back();
+    // If this interval starts before the rollback point, we are done.
+ if (i->beginNumber() < earliestStart)
+ break;
+ DOUT << "\t\t\tundo changes for: " << *i << '\n';
+ handled_.pop_back();
+
+ // When undoing a live interval allocation we must know if it is active or
+ // inactive to properly update regUse_ and the VirtRegMap.
+ IntervalPtrs::iterator it;
+ if ((it = FindIntervalInVector(active_, i)) != active_.end()) {
+ active_.erase(it);
+ assert(!TargetRegisterInfo::isPhysicalRegister(i->reg));
+ if (!spilled.count(i->reg))
+ unhandled_.push(i);
+ delRegUse(vrm_->getPhys(i->reg));
+ vrm_->clearVirt(i->reg);
+ } else if ((it = FindIntervalInVector(inactive_, i)) != inactive_.end()) {
+ inactive_.erase(it);
+ assert(!TargetRegisterInfo::isPhysicalRegister(i->reg));
+ if (!spilled.count(i->reg))
+ unhandled_.push(i);
+ vrm_->clearVirt(i->reg);
+ } else {
+ assert(TargetRegisterInfo::isVirtualRegister(i->reg) &&
+ "Can only allocate virtual registers!");
+ vrm_->clearVirt(i->reg);
+ unhandled_.push(i);
+ }
+
+ DenseMap<unsigned, unsigned>::iterator ii = DowngradeMap.find(i->reg);
+ if (ii == DowngradeMap.end())
+      // If the interval has a preference, it must be defined by a copy. Clear
+      // the preference now since the source interval allocation may have been
+ // undone as well.
+ i->preference = 0;
+ else {
+ UpgradeRegister(ii->second);
+ }
+ }
+
+ // Rewind the iterators in the active, inactive, and fixed lists back to the
+ // point we reverted to.
+ RevertVectorIteratorsTo(active_, earliestStart);
+ RevertVectorIteratorsTo(inactive_, earliestStart);
+ RevertVectorIteratorsTo(fixed_, earliestStart);
+
+  // Scan the rest and undo each interval that expired after the rollback
+  // point, inserting it in active (the next iteration of the algorithm will
+  // move it to inactive if required).
+ for (unsigned i = 0, e = handled_.size(); i != e; ++i) {
+ LiveInterval *HI = handled_[i];
+ if (!HI->expiredAt(earliestStart) &&
+ HI->expiredAt(cur->beginNumber())) {
+ DOUT << "\t\t\tundo changes for: " << *HI << '\n';
+ active_.push_back(std::make_pair(HI, HI->begin()));
+ assert(!TargetRegisterInfo::isPhysicalRegister(HI->reg));
+ addRegUse(vrm_->getPhys(HI->reg));
+ }
+ }
+
+ // Merge added with unhandled.
+  // This also updates the NextReloadMap. That is, it adds a mapping from a
+ // register defined by a reload from SS to the next reload from SS in the
+ // same basic block.
+ MachineBasicBlock *LastReloadMBB = 0;
+ LiveInterval *LastReload = 0;
+ int LastReloadSS = VirtRegMap::NO_STACK_SLOT;
+ std::sort(added.begin(), added.end(), LISorter());
+ for (unsigned i = 0, e = added.size(); i != e; ++i) {
+ LiveInterval *ReloadLi = added[i];
+ if (ReloadLi->weight == HUGE_VALF &&
+ li_->getApproximateInstructionCount(*ReloadLi) == 0) {
+ unsigned ReloadIdx = ReloadLi->beginNumber();
+ MachineBasicBlock *ReloadMBB = li_->getMBBFromIndex(ReloadIdx);
+ int ReloadSS = vrm_->getStackSlot(ReloadLi->reg);
+ if (LastReloadMBB == ReloadMBB && LastReloadSS == ReloadSS) {
+        // The last reload of the same SS is in the same MBB. We want to try
+        // to allocate both reloads to the same register and make sure the reg
+ // isn't clobbered in between if at all possible.
+ assert(LastReload->beginNumber() < ReloadIdx);
+ NextReloadMap.insert(std::make_pair(LastReload->reg, ReloadLi->reg));
+ }
+ LastReloadMBB = ReloadMBB;
+ LastReload = ReloadLi;
+ LastReloadSS = ReloadSS;
+ }
+ unhandled_.push(ReloadLi);
+ }
+}
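+
+// A sketch of the rollback above, using illustrative instruction indices:
+// suppose cur starts at index 20 and the cheapest conflicting interval
+// chosen by findIntervalsToSpill starts at index 12. After spilling,
+// earliestStart is 12, so every interval in handled_ beginning at or after
+// 12 is popped, its assignment in vrm_ is cleared, and (unless it was
+// itself spilled) it is pushed back onto unhandled_; linear scan then
+// resumes as if those allocations had never happened.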
+
+unsigned RALinScan::getFreePhysReg(const TargetRegisterClass *RC,
+ unsigned MaxInactiveCount,
+ SmallVector<unsigned, 256> &inactiveCounts,
+ bool SkipDGRegs) {
+ unsigned FreeReg = 0;
+ unsigned FreeRegInactiveCount = 0;
+
+ TargetRegisterClass::iterator I = RC->allocation_order_begin(*mf_);
+ TargetRegisterClass::iterator E = RC->allocation_order_end(*mf_);
+ assert(I != E && "No allocatable register in this register class!");
+
+ // Scan for the first available register.
+ for (; I != E; ++I) {
+ unsigned Reg = *I;
+ // Ignore "downgraded" registers.
+ if (SkipDGRegs && DowngradedRegs.count(Reg))
+ continue;
+ if (isRegAvail(Reg)) {
+ FreeReg = Reg;
+ if (FreeReg < inactiveCounts.size())
+ FreeRegInactiveCount = inactiveCounts[FreeReg];
+ else
+ FreeRegInactiveCount = 0;
+ break;
+ }
+ }
+
+ // If there are no free regs, or if this reg has the max inactive count,
+ // return this register.
+ if (FreeReg == 0 || FreeRegInactiveCount == MaxInactiveCount)
+ return FreeReg;
+
+ // Continue scanning the registers, looking for the one with the highest
+ // inactive count. Alkis found that this reduced register pressure very
+ // slightly on X86 (in rev 1.94 of this file), though this should probably be
+ // reevaluated now.
+ for (; I != E; ++I) {
+ unsigned Reg = *I;
+ // Ignore "downgraded" registers.
+ if (SkipDGRegs && DowngradedRegs.count(Reg))
+ continue;
+ if (isRegAvail(Reg) && Reg < inactiveCounts.size() &&
+ FreeRegInactiveCount < inactiveCounts[Reg]) {
+ FreeReg = Reg;
+ FreeRegInactiveCount = inactiveCounts[Reg];
+ if (FreeRegInactiveCount == MaxInactiveCount)
+ break; // We found the one with the max inactive count.
+ }
+ }
+
+ return FreeReg;
+}
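+
+// A worked example of the two-pass scan above, with an illustrative
+// allocation order {R0, R1, R2}: if R0 is in use, R1 is free with one
+// inactive interval mapped to it, and R2 is free with three, the first
+// loop settles on R1 and the second loop upgrades the choice to R2, the
+// free register with the higher inactive count (stopping early once that
+// count reaches MaxInactiveCount).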
+
+/// getFreePhysReg - return a free physical register for this virtual register
+/// interval if we have one, otherwise return 0.
+unsigned RALinScan::getFreePhysReg(LiveInterval *cur) {
+ SmallVector<unsigned, 256> inactiveCounts;
+ unsigned MaxInactiveCount = 0;
+
+ const TargetRegisterClass *RC = mri_->getRegClass(cur->reg);
+ const TargetRegisterClass *RCLeader = RelatedRegClasses.getLeaderValue(RC);
+
+ for (IntervalPtrs::iterator i = inactive_.begin(), e = inactive_.end();
+ i != e; ++i) {
+ unsigned reg = i->first->reg;
+ assert(TargetRegisterInfo::isVirtualRegister(reg) &&
+ "Can only allocate virtual registers!");
+
+ // If this is not in a related reg class to the register we're allocating,
+ // don't check it.
+ const TargetRegisterClass *RegRC = mri_->getRegClass(reg);
+ if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader) {
+ reg = vrm_->getPhys(reg);
+ if (inactiveCounts.size() <= reg)
+ inactiveCounts.resize(reg+1);
+ ++inactiveCounts[reg];
+ MaxInactiveCount = std::max(MaxInactiveCount, inactiveCounts[reg]);
+ }
+ }
+
+  // If the copy coalescer has assigned a "preferred" register, check if it's
+ // available first.
+ if (cur->preference) {
+ DOUT << "(preferred: " << tri_->getName(cur->preference) << ") ";
+ if (isRegAvail(cur->preference) &&
+ RC->contains(cur->preference))
+ return cur->preference;
+ }
+
+ if (!DowngradedRegs.empty()) {
+ unsigned FreeReg = getFreePhysReg(RC, MaxInactiveCount, inactiveCounts,
+ true);
+ if (FreeReg)
+ return FreeReg;
+ }
+ return getFreePhysReg(RC, MaxInactiveCount, inactiveCounts, false);
+}
+
+FunctionPass* llvm::createLinearScanRegisterAllocator() {
+ return new RALinScan();
+}
diff --git a/lib/CodeGen/RegAllocLocal.cpp b/lib/CodeGen/RegAllocLocal.cpp
new file mode 100644
index 0000000..e1cc20c
--- /dev/null
+++ b/lib/CodeGen/RegAllocLocal.cpp
@@ -0,0 +1,1068 @@
+//===-- RegAllocLocal.cpp - A BasicBlock generic register allocator -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This register allocator allocates registers to a basic block at a time,
+// attempting to keep values in registers and reusing registers as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/BasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumStores, "Number of stores added");
+STATISTIC(NumLoads , "Number of loads added");
+
+static RegisterRegAlloc
+ localRegAlloc("local", "local register allocator",
+ createLocalRegisterAllocator);
+
+namespace {
+ class VISIBILITY_HIDDEN RALocal : public MachineFunctionPass {
+ public:
+ static char ID;
+ RALocal() : MachineFunctionPass(&ID), StackSlotForVirtReg(-1) {}
+ private:
+ const TargetMachine *TM;
+ MachineFunction *MF;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+
+ // StackSlotForVirtReg - Maps virtual regs to the frame index where these
+ // values are spilled.
+ IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;
+
+ // Virt2PhysRegMap - This map contains entries for each virtual register
+ // that is currently available in a physical register.
+ IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2PhysRegMap;
+
+ unsigned &getVirt2PhysRegMapSlot(unsigned VirtReg) {
+ return Virt2PhysRegMap[VirtReg];
+ }
+
+ // PhysRegsUsed - This array is effectively a map, containing entries for
+    // each physical register that currently has a value (i.e., it is in
+ // Virt2PhysRegMap). The value mapped to is the virtual register
+ // corresponding to the physical register (the inverse of the
+ // Virt2PhysRegMap), or 0. The value is set to 0 if this register is pinned
+ // because it is used by a future instruction, and to -2 if it is not
+ // allocatable. If the entry for a physical register is -1, then the
+ // physical register is "not in the map".
+ //
+ std::vector<int> PhysRegsUsed;
+
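+    // An illustrative snapshot of this encoding (register names and vreg
+    // numbers are examples only): PhysRegsUsed[EAX] == 1025 means EAX holds
+    // %reg1025; PhysRegsUsed[ECX] == 0 means ECX is pinned for a future
+    // use; PhysRegsUsed[ESP] == -2 means ESP is never allocatable; and
+    // PhysRegsUsed[EDX] == -1 means EDX is currently free.
+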
+ // PhysRegsUseOrder - This contains a list of the physical registers that
+ // currently have a virtual register value in them. This list provides an
+ // ordering of registers, imposing a reallocation order. This list is only
+ // used if all registers are allocated and we have to spill one, in which
+ // case we spill the least recently used register. Entries at the front of
+ // the list are the least recently used registers, entries at the back are
+ // the most recently used.
+ //
+ std::vector<unsigned> PhysRegsUseOrder;
+
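+    // For instance (illustrative registers only): after uses of EAX, ECX,
+    // then EAX again, PhysRegsUseOrder holds [ECX, EAX]; if all registers
+    // are taken and one must be spilled, ECX at the front is the least
+    // recently used and is evicted first.
+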
+ // Virt2LastUseMap - This maps each virtual register to its last use
+ // (MachineInstr*, operand index pair).
+ IndexedMap<std::pair<MachineInstr*, unsigned>, VirtReg2IndexFunctor>
+ Virt2LastUseMap;
+
+ std::pair<MachineInstr*,unsigned>& getVirtRegLastUse(unsigned Reg) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!");
+ return Virt2LastUseMap[Reg];
+ }
+
+ // VirtRegModified - This bitset contains information about which virtual
+ // registers need to be spilled back to memory when their registers are
+ // scavenged. If a virtual register has simply been rematerialized, there
+ // is no reason to spill it to memory when we need the register back.
+ //
+ BitVector VirtRegModified;
+
+ // UsedInMultipleBlocks - Tracks whether a particular register is used in
+ // more than one block.
+ BitVector UsedInMultipleBlocks;
+
+ void markVirtRegModified(unsigned Reg, bool Val = true) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!");
+ Reg -= TargetRegisterInfo::FirstVirtualRegister;
+ if (Val)
+ VirtRegModified.set(Reg);
+ else
+ VirtRegModified.reset(Reg);
+ }
+
+ bool isVirtRegModified(unsigned Reg) const {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!");
+ assert(Reg - TargetRegisterInfo::FirstVirtualRegister < VirtRegModified.size()
+ && "Illegal virtual register!");
+ return VirtRegModified[Reg - TargetRegisterInfo::FirstVirtualRegister];
+ }
+
+ void AddToPhysRegsUseOrder(unsigned Reg) {
+ std::vector<unsigned>::iterator It =
+ std::find(PhysRegsUseOrder.begin(), PhysRegsUseOrder.end(), Reg);
+ if (It != PhysRegsUseOrder.end())
+ PhysRegsUseOrder.erase(It);
+ PhysRegsUseOrder.push_back(Reg);
+ }
+
+ void MarkPhysRegRecentlyUsed(unsigned Reg) {
+ if (PhysRegsUseOrder.empty() ||
+ PhysRegsUseOrder.back() == Reg) return; // Already most recently used
+
+ for (unsigned i = PhysRegsUseOrder.size(); i != 0; --i)
+ if (areRegsEqual(Reg, PhysRegsUseOrder[i-1])) {
+ unsigned RegMatch = PhysRegsUseOrder[i-1]; // remove from middle
+ PhysRegsUseOrder.erase(PhysRegsUseOrder.begin()+i-1);
+ // Add it to the end of the list
+ PhysRegsUseOrder.push_back(RegMatch);
+ if (RegMatch == Reg)
+ return; // Found an exact match, exit early
+ }
+ }
+
+ public:
+ virtual const char *getPassName() const {
+ return "Local Register Allocator";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(PHIEliminationID);
+ AU.addRequiredID(TwoAddressInstructionPassID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ /// runOnMachineFunction - Register allocate the whole function
+ bool runOnMachineFunction(MachineFunction &Fn);
+
+ /// AllocateBasicBlock - Register allocate the specified basic block.
+ void AllocateBasicBlock(MachineBasicBlock &MBB);
+
+
+ /// areRegsEqual - This method returns true if the specified registers are
+ /// related to each other. To do this, it checks to see if they are equal
+ /// or if the first register is in the alias set of the second register.
+ ///
+ bool areRegsEqual(unsigned R1, unsigned R2) const {
+ if (R1 == R2) return true;
+ for (const unsigned *AliasSet = TRI->getAliasSet(R2);
+ *AliasSet; ++AliasSet) {
+ if (*AliasSet == R1) return true;
+ }
+ return false;
+ }
+
+ /// getStackSpaceFor - This returns the frame index of the specified virtual
+ /// register on the stack, allocating space if necessary.
+ int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC);
+
+ /// removePhysReg - This method marks the specified physical register as no
+ /// longer being in use.
+ ///
+ void removePhysReg(unsigned PhysReg);
+
+ /// spillVirtReg - This method spills the value specified by PhysReg into
+ /// the virtual register slot specified by VirtReg. It then updates the RA
+ /// data structures to indicate the fact that PhysReg is now available.
+ ///
+ void spillVirtReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned VirtReg, unsigned PhysReg);
+
+ /// spillPhysReg - This method spills the specified physical register into
+ /// the virtual register slot associated with it. If OnlyVirtRegs is set to
+ /// true, then the request is ignored if the physical register does not
+ /// contain a virtual register.
+ ///
+ void spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I,
+ unsigned PhysReg, bool OnlyVirtRegs = false);
+
+ /// assignVirtToPhysReg - This method updates local state so that we know
+ /// that PhysReg is the proper container for VirtReg now. The physical
+ /// register must not be used for anything else when this is called.
+ ///
+ void assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg);
+
+ /// isPhysRegAvailable - Return true if the specified physical register is
+ /// free and available for use. This also includes checking to see if
+ /// aliased registers are all free...
+ ///
+ bool isPhysRegAvailable(unsigned PhysReg) const;
+
+ /// getFreeReg - Look to see if there is a free register available in the
+ /// specified register class. If not, return 0.
+ ///
+ unsigned getFreeReg(const TargetRegisterClass *RC);
+
+ /// getReg - Find a physical register to hold the specified virtual
+ /// register. If all compatible physical registers are used, this method
+ /// spills the last used virtual register to the stack, and uses that
+    /// register. If NoFree is true, that means the caller knows there isn't
+    /// a free register, so getFreeReg() should not be called.
+ unsigned getReg(MachineBasicBlock &MBB, MachineInstr *MI,
+ unsigned VirtReg, bool NoFree = false);
+
+ /// reloadVirtReg - This method transforms the specified virtual
+ /// register use to refer to a physical register. This method may do this
+ /// in one of several ways: if the register is available in a physical
+ /// register already, it uses that physical register. If the value is not
+ /// in a physical register, and if there are physical registers available,
+ /// it loads it into a register. If register pressure is high, and it is
+ /// possible, it tries to fold the load of the virtual register into the
+ /// instruction itself. It avoids doing this if register pressure is low to
+ /// improve the chance that subsequent instructions can use the reloaded
+ /// value. This method returns the modified instruction.
+ ///
+ MachineInstr *reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI,
+ unsigned OpNum, SmallSet<unsigned, 4> &RRegs);
+
+ /// ComputeLocalLiveness - Computes liveness of registers within a basic
+ /// block, setting the killed/dead flags as appropriate.
+ void ComputeLocalLiveness(MachineBasicBlock& MBB);
+
+ void reloadPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I,
+ unsigned PhysReg);
+ };
+ char RALocal::ID = 0;
+}
+
+/// getStackSpaceFor - This allocates space for the specified virtual register
+/// to be held on the stack.
+int RALocal::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
+  // Find the location where VirtReg would belong...
+ int SS = StackSlotForVirtReg[VirtReg];
+ if (SS != -1)
+ return SS; // Already has space allocated?
+
+ // Allocate a new stack object for this spill location...
+ int FrameIdx = MF->getFrameInfo()->CreateStackObject(RC->getSize(),
+ RC->getAlignment());
+
+ // Assign the slot...
+ StackSlotForVirtReg[VirtReg] = FrameIdx;
+ return FrameIdx;
+}
+
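+// A usage sketch (vreg number and frame index are illustrative): the first
+// getStackSpaceFor(1024, RC) call creates a fresh frame index, say FI#3,
+// sized and aligned for RC; every later call for %reg1024 returns the cached
+// FI#3, so each virtual register spills to a single, stable slot.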
+
+/// removePhysReg - This method marks the specified physical register as no
+/// longer being in use.
+///
+void RALocal::removePhysReg(unsigned PhysReg) {
+  PhysRegsUsed[PhysReg] = -1;  // PhysReg no longer used
+
+ std::vector<unsigned>::iterator It =
+ std::find(PhysRegsUseOrder.begin(), PhysRegsUseOrder.end(), PhysReg);
+ if (It != PhysRegsUseOrder.end())
+ PhysRegsUseOrder.erase(It);
+}
+
+
+/// spillVirtReg - This method spills the value specified by PhysReg into the
+/// virtual register slot specified by VirtReg. It then updates the RA data
+/// structures to indicate the fact that PhysReg is now available.
+///
+void RALocal::spillVirtReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned VirtReg, unsigned PhysReg) {
+  assert(VirtReg && "Spilling a physical register is illegal!"
+         " The register must be missing a kill, or a use must exist beyond"
+         " the intended one.");
+ DOUT << " Spilling register " << TRI->getName(PhysReg)
+ << " containing %reg" << VirtReg;
+
+ if (!isVirtRegModified(VirtReg)) {
+ DOUT << " which has not been modified, so no store necessary!";
+ std::pair<MachineInstr*, unsigned> &LastUse = getVirtRegLastUse(VirtReg);
+ if (LastUse.first)
+ LastUse.first->getOperand(LastUse.second).setIsKill();
+ } else {
+ // Otherwise, there is a virtual register corresponding to this physical
+ // register. We only need to spill it into its stack slot if it has been
+ // modified.
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg);
+ int FrameIndex = getStackSpaceFor(VirtReg, RC);
+ DOUT << " to stack slot #" << FrameIndex;
+    // If the instruction reads the register that's spilled (e.g. this can
+ // happen if it is a move to a physical register), then the spill
+ // instruction is not a kill.
+ bool isKill = !(I != MBB.end() && I->readsRegister(PhysReg));
+ TII->storeRegToStackSlot(MBB, I, PhysReg, isKill, FrameIndex, RC);
+ ++NumStores; // Update statistics
+ }
+
+ getVirt2PhysRegMapSlot(VirtReg) = 0; // VirtReg no longer available
+
+ DOUT << "\n";
+ removePhysReg(PhysReg);
+}
+
+
+/// spillPhysReg - This method spills the specified physical register into the
+/// virtual register slot associated with it. If OnlyVirtRegs is set to true,
+/// then the request is ignored if the physical register does not contain a
+/// virtual register.
+///
+void RALocal::spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I,
+ unsigned PhysReg, bool OnlyVirtRegs) {
+ if (PhysRegsUsed[PhysReg] != -1) { // Only spill it if it's used!
+    assert(PhysRegsUsed[PhysReg] != -2 && "Non-allocatable reg used!");
+ if (PhysRegsUsed[PhysReg] || !OnlyVirtRegs)
+ spillVirtReg(MBB, I, PhysRegsUsed[PhysReg], PhysReg);
+ } else {
+ // If the selected register aliases any other registers, we must make
+ // sure that one of the aliases isn't alive.
+ for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg);
+ *AliasSet; ++AliasSet)
+ if (PhysRegsUsed[*AliasSet] != -1 && // Spill aliased register.
+ PhysRegsUsed[*AliasSet] != -2) // If allocatable.
+ if (PhysRegsUsed[*AliasSet])
+ spillVirtReg(MBB, I, PhysRegsUsed[*AliasSet], *AliasSet);
+ }
+}
+
+
+/// assignVirtToPhysReg - This method updates local state so that we know
+/// that PhysReg is the proper container for VirtReg now. The physical
+/// register must not be used for anything else when this is called.
+///
+void RALocal::assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg) {
+ assert(PhysRegsUsed[PhysReg] == -1 && "Phys reg already assigned!");
+ // Update information to note the fact that this register was just used, and
+ // it holds VirtReg.
+ PhysRegsUsed[PhysReg] = VirtReg;
+ getVirt2PhysRegMapSlot(VirtReg) = PhysReg;
+ AddToPhysRegsUseOrder(PhysReg); // New use of PhysReg
+}
+
+
+/// isPhysRegAvailable - Return true if the specified physical register is free
+/// and available for use. This also includes checking to see if aliased
+/// registers are all free...
+///
+bool RALocal::isPhysRegAvailable(unsigned PhysReg) const {
+ if (PhysRegsUsed[PhysReg] != -1) return false;
+
+ // If the selected register aliases any other allocated registers, it is
+ // not free!
+ for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg);
+ *AliasSet; ++AliasSet)
+ if (PhysRegsUsed[*AliasSet] >= 0) // Aliased register in use?
+ return false; // Can't use this reg then.
+ return true;
+}
+
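+// An aliasing example (x86 names for illustration): EAX and its sub-register
+// AX alias, so even when PhysRegsUsed[AX] is -1, AX is not available while
+// EAX holds a value; the alias walk above catches exactly this case.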
+
+/// getFreeReg - Look to see if there is a free register available in the
+/// specified register class. If not, return 0.
+///
+unsigned RALocal::getFreeReg(const TargetRegisterClass *RC) {
+ // Get iterators defining the range of registers that are valid to allocate in
+ // this class, which also specifies the preferred allocation order.
+ TargetRegisterClass::iterator RI = RC->allocation_order_begin(*MF);
+ TargetRegisterClass::iterator RE = RC->allocation_order_end(*MF);
+
+ for (; RI != RE; ++RI)
+ if (isPhysRegAvailable(*RI)) { // Is reg unused?
+ assert(*RI != 0 && "Cannot use register!");
+ return *RI; // Found an unused register!
+ }
+ return 0;
+}
+
+
+/// getReg - Find a physical register to hold the specified virtual
+/// register. If all compatible physical registers are used, this method spills
+/// the last used virtual register to the stack, and uses that register.
+///
+unsigned RALocal::getReg(MachineBasicBlock &MBB, MachineInstr *I,
+ unsigned VirtReg, bool NoFree) {
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg);
+
+ // First check to see if we have a free register of the requested type...
+ unsigned PhysReg = NoFree ? 0 : getFreeReg(RC);
+
+ // If we didn't find an unused register, scavenge one now!
+ if (PhysReg == 0) {
+ assert(!PhysRegsUseOrder.empty() && "No allocated registers??");
+
+ // Loop over all of the preallocated registers from the least recently used
+ // to the most recently used. When we find one that is capable of holding
+ // our register, use it.
+ for (unsigned i = 0; PhysReg == 0; ++i) {
+ assert(i != PhysRegsUseOrder.size() &&
+ "Couldn't find a register of the appropriate class!");
+
+ unsigned R = PhysRegsUseOrder[i];
+
+      // We can only use this register if it holds a virtual register (i.e., it
+ // can be spilled). Do not use it if it is an explicitly allocated
+ // physical register!
+ assert(PhysRegsUsed[R] != -1 &&
+ "PhysReg in PhysRegsUseOrder, but is not allocated?");
+ if (PhysRegsUsed[R] && PhysRegsUsed[R] != -2) {
+ // If the current register is compatible, use it.
+ if (RC->contains(R)) {
+ PhysReg = R;
+ break;
+ } else {
+ // If one of the registers aliased to the current register is
+ // compatible, use it.
+ for (const unsigned *AliasIt = TRI->getAliasSet(R);
+ *AliasIt; ++AliasIt) {
+ if (RC->contains(*AliasIt) &&
+              // If this is pinned down for some reason, don't use it. For
+              // example, if CL is pinned and we run across CH, don't use
+              // CH as justification for scavenging ECX (which would
+              // fail).
+ PhysRegsUsed[*AliasIt] != 0 &&
+
+ // Make sure the register is allocatable. Don't allocate SIL on
+ // x86-32.
+ PhysRegsUsed[*AliasIt] != -2) {
+ PhysReg = *AliasIt; // Take an aliased register
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ assert(PhysReg && "Physical register not assigned!?!?");
+
+ // At this point PhysRegsUseOrder[i] is the least recently used register of
+    // a compatible register class. Spill it to memory and reap its remains.
+ spillPhysReg(MBB, I, PhysReg);
+ }
+
+ // Now that we know which register we need to assign this to, do it now!
+ assignVirtToPhysReg(VirtReg, PhysReg);
+ return PhysReg;
+}
+
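+// A spill-selection sketch with illustrative contents: if PhysRegsUseOrder
+// is [EBX, ECX, EAX], all three hold virtual registers, and the requested
+// class is assumed to contain only ECX and EAX, the scan above skips EBX
+// (wrong class, no compatible alias) and evicts ECX, the least recently
+// used compatible register, freeing it for the new virtual register.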
+
+/// reloadVirtReg - This method transforms the specified virtual
+/// register use to refer to a physical register. This method may do this in
+/// one of several ways: if the register is available in a physical register
+/// already, it uses that physical register. If the value is not in a physical
+/// register, and if there are physical registers available, it loads it into a
+/// register. If register pressure is high, and it is possible, it tries to
+/// fold the load of the virtual register into the instruction itself. It
+/// avoids doing this if register pressure is low to improve the chance that
+/// subsequent instructions can use the reloaded value. This method returns the
+/// modified instruction.
+///
+MachineInstr *RALocal::reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI,
+ unsigned OpNum,
+ SmallSet<unsigned, 4> &ReloadedRegs) {
+ unsigned VirtReg = MI->getOperand(OpNum).getReg();
+
+ // If the virtual register is already available, just update the instruction
+ // and return.
+ if (unsigned PR = getVirt2PhysRegMapSlot(VirtReg)) {
+ MarkPhysRegRecentlyUsed(PR); // Already have this value available!
+ MI->getOperand(OpNum).setReg(PR); // Assign the input register
+ getVirtRegLastUse(VirtReg) = std::make_pair(MI, OpNum);
+ return MI;
+ }
+
+ // Otherwise, we need to fold it into the current instruction, or reload it.
+ // If we have registers available to hold the value, use them.
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg);
+ unsigned PhysReg = getFreeReg(RC);
+ int FrameIndex = getStackSpaceFor(VirtReg, RC);
+
+ if (PhysReg) { // Register is available, allocate it!
+ assignVirtToPhysReg(VirtReg, PhysReg);
+ } else { // No registers available.
+ // Force some poor hapless value out of the register file to
+ // make room for the new register, and reload it.
+ PhysReg = getReg(MBB, MI, VirtReg, true);
+ }
+
+ markVirtRegModified(VirtReg, false); // Note that this reg was just reloaded
+
+ DOUT << " Reloading %reg" << VirtReg << " into "
+ << TRI->getName(PhysReg) << "\n";
+
+ // Add move instruction(s)
+ TII->loadRegFromStackSlot(MBB, MI, PhysReg, FrameIndex, RC);
+ ++NumLoads; // Update statistics
+
+ MF->getRegInfo().setPhysRegUsed(PhysReg);
+ MI->getOperand(OpNum).setReg(PhysReg); // Assign the input register
+ getVirtRegLastUse(VirtReg) = std::make_pair(MI, OpNum);
+
+ if (!ReloadedRegs.insert(PhysReg)) {
+ cerr << "Ran out of registers during register allocation!\n";
+ if (MI->getOpcode() == TargetInstrInfo::INLINEASM) {
+ cerr << "Please check your inline asm statement for invalid "
+ << "constraints:\n";
+ MI->print(cerr.stream(), TM);
+ }
+ exit(1);
+ }
+ for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg);
+ *SubRegs; ++SubRegs) {
+ if (!ReloadedRegs.insert(*SubRegs)) {
+ cerr << "Ran out of registers during register allocation!\n";
+ if (MI->getOpcode() == TargetInstrInfo::INLINEASM) {
+ cerr << "Please check your inline asm statement for invalid "
+ << "constraints:\n";
+ MI->print(cerr.stream(), TM);
+ }
+ exit(1);
+ }
+ }
+
+ return MI;
+}
+
+/// isReadModWriteImplicitKill - True if this is an implicit kill for a
+/// read/mod/write register, i.e. update partial register.
+static bool isReadModWriteImplicitKill(MachineInstr *MI, unsigned Reg) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() &&
+ MO.isDef() && !MO.isDead())
+ return true;
+ }
+ return false;
+}
+
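+// An example of the pattern these two helpers look for (x86 flavored,
+// illustrative): an instruction defining AX implicitly reads and writes the
+// wider EAX, so it carries an implicit non-dead def of EAX and an implicit
+// kill-marked use of it. Such operands describe a partial-register update
+// rather than a genuine last use or a genuine new definition.
+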
+/// isReadModWriteImplicitDef - True if this is an implicit def for a
+/// read/mod/write register, i.e. update partial register.
+static bool isReadModWriteImplicitDef(MachineInstr *MI, unsigned Reg) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() &&
+ !MO.isDef() && MO.isKill())
+ return true;
+ }
+ return false;
+}
+
+// precedes - Helper function to determine whether MachineInstr A
+// precedes MachineInstr B within the same MBB.
+static bool precedes(MachineBasicBlock::iterator A,
+ MachineBasicBlock::iterator B) {
+ if (A == B)
+ return false;
+
+ MachineBasicBlock::iterator I = A->getParent()->begin();
+ while (I != A->getParent()->end()) {
+ if (I == A)
+ return true;
+ else if (I == B)
+ return false;
+
+ ++I;
+ }
+
+ return false;
+}
+
+/// ComputeLocalLiveness - Computes liveness of registers within a basic
+/// block, setting the killed/dead flags as appropriate.
+void RALocal::ComputeLocalLiveness(MachineBasicBlock& MBB) {
+ MachineRegisterInfo& MRI = MBB.getParent()->getRegInfo();
+ // Keep track of the most recently seen previous use or def of each reg,
+ // so that we can update them with dead/kill markers.
+ DenseMap<unsigned, std::pair<MachineInstr*, unsigned> > LastUseDef;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = I->getOperand(i);
+ // Uses don't trigger any flags, but we need to save
+ // them for later. Also, we have to process these
+ // _before_ processing the defs, since an instr
+ // uses regs before it defs them.
+ if (MO.isReg() && MO.getReg() && MO.isUse()) {
+ LastUseDef[MO.getReg()] = std::make_pair(I, i);
+
+
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue;
+
+ const unsigned* Aliases = TRI->getAliasSet(MO.getReg());
+ if (Aliases) {
+ while (*Aliases) {
+ DenseMap<unsigned, std::pair<MachineInstr*, unsigned> >::iterator
+ alias = LastUseDef.find(*Aliases);
+
+ if (alias != LastUseDef.end() && alias->second.first != I)
+ LastUseDef[*Aliases] = std::make_pair(I, i);
+
+ ++Aliases;
+ }
+ }
+ }
+ }
+
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = I->getOperand(i);
+      // Defs other than 2-addr redefs _do_ trigger flag changes:
+ // - A def followed by a def is dead
+ // - A use followed by a def is a kill
+ if (MO.isReg() && MO.getReg() && MO.isDef()) {
+ DenseMap<unsigned, std::pair<MachineInstr*, unsigned> >::iterator
+ last = LastUseDef.find(MO.getReg());
+ if (last != LastUseDef.end()) {
+ // Check if this is a two address instruction. If so, then
+ // the def does not kill the use.
+ if (last->second.first == I &&
+ I->isRegTiedToUseOperand(i))
+ continue;
+
+ MachineOperand& lastUD =
+ last->second.first->getOperand(last->second.second);
+ if (lastUD.isDef())
+ lastUD.setIsDead(true);
+ else
+ lastUD.setIsKill(true);
+ }
+
+ LastUseDef[MO.getReg()] = std::make_pair(I, i);
+ }
+ }
+ }
+
+  // Registers that are live out of the function contain its return values,
+  // so we need to make sure they are alive at return time.
+ if (!MBB.empty() && MBB.back().getDesc().isReturn()) {
+ MachineInstr* Ret = &MBB.back();
+ for (MachineRegisterInfo::liveout_iterator
+ I = MF->getRegInfo().liveout_begin(),
+ E = MF->getRegInfo().liveout_end(); I != E; ++I)
+ if (!Ret->readsRegister(*I)) {
+ Ret->addOperand(MachineOperand::CreateReg(*I, false, true));
+ LastUseDef[*I] = std::make_pair(Ret, Ret->getNumOperands()-1);
+ }
+ }
+
+ // Finally, loop over the final use/def of each reg
+ // in the block and determine if it is dead.
+ for (DenseMap<unsigned, std::pair<MachineInstr*, unsigned> >::iterator
+ I = LastUseDef.begin(), E = LastUseDef.end(); I != E; ++I) {
+ MachineInstr* MI = I->second.first;
+ unsigned idx = I->second.second;
+ MachineOperand& MO = MI->getOperand(idx);
+
+ bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(MO.getReg());
+
+ // A crude approximation of "live-out" calculation
+ bool usedOutsideBlock = isPhysReg ? false :
+ UsedInMultipleBlocks.test(MO.getReg() -
+ TargetRegisterInfo::FirstVirtualRegister);
+ if (!isPhysReg && !usedOutsideBlock)
+ for (MachineRegisterInfo::reg_iterator UI = MRI.reg_begin(MO.getReg()),
+ UE = MRI.reg_end(); UI != UE; ++UI)
+ // Two cases:
+ // - used in another block
+ // - used in the same block before it is defined (loop)
+ if (UI->getParent() != &MBB ||
+ (MO.isDef() && UI.getOperand().isUse() && precedes(&*UI, MI))) {
+ UsedInMultipleBlocks.set(MO.getReg() -
+ TargetRegisterInfo::FirstVirtualRegister);
+ usedOutsideBlock = true;
+ break;
+ }
+
+ // Physical registers and those that are not live-out of the block
+ // are killed/dead at their last use/def within this block.
+ if (isPhysReg || !usedOutsideBlock) {
+ if (MO.isUse()) {
+ // Don't mark uses that are tied to defs as kills.
+ if (!MI->isRegTiedToDefOperand(idx))
+ MO.setIsKill(true);
+ } else
+ MO.setIsDead(true);
+ }
+ }
+}
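+
+// A worked liveness example for the rules above, on an illustrative block
+// where %reg1024 is not used in any other block:
+//   (1) %reg1024 = ...        ; def
+//   (2) ...      = %reg1024   ; use
+//   (3) %reg1024 = ...        ; def, never read afterwards
+// The use at (2) followed by the def at (3) marks (2) as a kill, and since
+// nothing reads the value defined at (3), that operand is marked dead.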
+
+void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) {
+  // Loop over each instruction.
+ MachineBasicBlock::iterator MII = MBB.begin();
+
+ DEBUG(const BasicBlock *LBB = MBB.getBasicBlock();
+ if (LBB) DOUT << "\nStarting RegAlloc of BB: " << LBB->getName());
+
+ // Add live-in registers as active.
+ for (MachineBasicBlock::livein_iterator I = MBB.livein_begin(),
+ E = MBB.livein_end(); I != E; ++I) {
+ unsigned Reg = *I;
+ MF->getRegInfo().setPhysRegUsed(Reg);
+ PhysRegsUsed[Reg] = 0; // It is free and reserved now
+ AddToPhysRegsUseOrder(Reg);
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ *SubRegs; ++SubRegs) {
+ if (PhysRegsUsed[*SubRegs] != -2) {
+ AddToPhysRegsUseOrder(*SubRegs);
+ PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now
+ MF->getRegInfo().setPhysRegUsed(*SubRegs);
+ }
+ }
+ }
+
+ ComputeLocalLiveness(MBB);
+
+  // Now, sequentially allocate each instruction in the MBB.
+ while (MII != MBB.end()) {
+ MachineInstr *MI = MII++;
+ const TargetInstrDesc &TID = MI->getDesc();
+ DEBUG(DOUT << "\nStarting RegAlloc of: " << *MI;
+ DOUT << " Regs have values: ";
+ for (unsigned i = 0; i != TRI->getNumRegs(); ++i)
+ if (PhysRegsUsed[i] != -1 && PhysRegsUsed[i] != -2)
+ DOUT << "[" << TRI->getName(i)
+ << ",%reg" << PhysRegsUsed[i] << "] ";
+ DOUT << "\n");
+
+    // Loop over the implicit uses, making sure they are at the end of the
+    // use order list (i.e., most recently used), so they don't get reallocated.
+ if (TID.ImplicitUses) {
+ for (const unsigned *ImplicitUses = TID.ImplicitUses;
+ *ImplicitUses; ++ImplicitUses)
+ MarkPhysRegRecentlyUsed(*ImplicitUses);
+ }
+
+ SmallVector<unsigned, 8> Kills;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isKill()) {
+ if (!MO.isImplicit())
+ Kills.push_back(MO.getReg());
+ else if (!isReadModWriteImplicitKill(MI, MO.getReg()))
+ // These are extra physical register kills when a sub-register
+ // is defined (def of a sub-register is a read/mod/write of the
+ // larger registers). Ignore.
+ Kills.push_back(MO.getReg());
+ }
+ }
+
+ // If any physical regs are earlyclobber, spill any value they might
+ // have in them, then mark them unallocatable.
+ // If any virtual regs are earlyclobber, allocate them now (before
+ // freeing inputs that are killed).
+ if (MI->getOpcode()==TargetInstrInfo::INLINEASM) {
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.isEarlyClobber() &&
+ MO.getReg()) {
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned DestVirtReg = MO.getReg();
+ unsigned DestPhysReg;
+
+ // If DestVirtReg already has a value, use it.
+ if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg)))
+ DestPhysReg = getReg(MBB, MI, DestVirtReg);
+ MF->getRegInfo().setPhysRegUsed(DestPhysReg);
+ markVirtRegModified(DestVirtReg);
+ getVirtRegLastUse(DestVirtReg) =
+ std::make_pair((MachineInstr*)0, 0);
+ DOUT << " Assigning " << TRI->getName(DestPhysReg)
+ << " to %reg" << DestVirtReg << "\n";
+ MO.setReg(DestPhysReg); // Assign the earlyclobber register
+ } else {
+ unsigned Reg = MO.getReg();
+ if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP.
+ // These are extra physical register defs when a sub-register
+ // is defined (def of a sub-register is a read/mod/write of the
+ // larger registers). Ignore.
+ if (isReadModWriteImplicitDef(MI, MO.getReg())) continue;
+
+ MF->getRegInfo().setPhysRegUsed(Reg);
+ spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg
+ PhysRegsUsed[Reg] = 0; // It is free and reserved now
+ AddToPhysRegsUseOrder(Reg);
+
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ *SubRegs; ++SubRegs) {
+ if (PhysRegsUsed[*SubRegs] != -2) {
+ MF->getRegInfo().setPhysRegUsed(*SubRegs);
+ PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now
+ AddToPhysRegsUseOrder(*SubRegs);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Get the used operands into registers. This has the potential to spill
+ // incoming values if we are out of registers. Note that we completely
+ // ignore physical register uses here. We assume that if an explicit
+    // physical register is referenced by the instruction, it is guaranteed
+ // to be live-in, or the input is badly hosed.
+ //
+ SmallSet<unsigned, 4> ReloadedRegs;
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+      // Here we are looking only for use operands (never def&use).
+ if (MO.isReg() && !MO.isDef() && MO.getReg() && !MO.isImplicit() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ MI = reloadVirtReg(MBB, MI, i, ReloadedRegs);
+ }
+
+ // If this instruction is the last user of this register, kill the
+ // value, freeing the register being used, so it doesn't need to be
+ // spilled to memory.
+ //
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
+ unsigned VirtReg = Kills[i];
+ unsigned PhysReg = VirtReg;
+ if (TargetRegisterInfo::isVirtualRegister(VirtReg)) {
+ // If the virtual register was never materialized into a register, it
+ // might not be in the map, but it won't hurt to zero it out anyway.
+ unsigned &PhysRegSlot = getVirt2PhysRegMapSlot(VirtReg);
+ PhysReg = PhysRegSlot;
+ PhysRegSlot = 0;
+ } else if (PhysRegsUsed[PhysReg] == -2) {
+ // Unallocatable register dead, ignore.
+ continue;
+ } else {
+ assert((!PhysRegsUsed[PhysReg] || PhysRegsUsed[PhysReg] == -1) &&
+ "Silently clearing a virtual register?");
+ }
+
+ if (PhysReg) {
+ DOUT << " Last use of " << TRI->getName(PhysReg)
+ << "[%reg" << VirtReg <<"], removing it from live set\n";
+ removePhysReg(PhysReg);
+ for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg);
+ *SubRegs; ++SubRegs) {
+ if (PhysRegsUsed[*SubRegs] != -2) {
+ DOUT << " Last use of "
+ << TRI->getName(*SubRegs)
+ << "[%reg" << VirtReg <<"], removing it from live set\n";
+ removePhysReg(*SubRegs);
+ }
+ }
+ }
+ }
+
+ // Loop over all of the operands of the instruction, spilling registers that
+ // are defined, and marking explicit destinations in the PhysRegsUsed map.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() && !MO.isImplicit() && MO.getReg() &&
+ !MO.isEarlyClobber() &&
+ TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+ if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP.
+ // These are extra physical register defs when a sub-register
+ // is defined (def of a sub-register is a read/mod/write of the
+ // larger registers). Ignore.
+ if (isReadModWriteImplicitDef(MI, MO.getReg())) continue;
+
+ MF->getRegInfo().setPhysRegUsed(Reg);
+ spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg
+ PhysRegsUsed[Reg] = 0; // It is free and reserved now
+ AddToPhysRegsUseOrder(Reg);
+
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ *SubRegs; ++SubRegs) {
+ if (PhysRegsUsed[*SubRegs] != -2) {
+ MF->getRegInfo().setPhysRegUsed(*SubRegs);
+ PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now
+ AddToPhysRegsUseOrder(*SubRegs);
+ }
+ }
+ }
+ }
+
+ // Loop over the implicit defs, spilling them as well.
+ if (TID.ImplicitDefs) {
+ for (const unsigned *ImplicitDefs = TID.ImplicitDefs;
+ *ImplicitDefs; ++ImplicitDefs) {
+ unsigned Reg = *ImplicitDefs;
+ if (PhysRegsUsed[Reg] != -2) {
+ spillPhysReg(MBB, MI, Reg, true);
+ AddToPhysRegsUseOrder(Reg);
+ PhysRegsUsed[Reg] = 0; // It is free and reserved now
+ }
+ MF->getRegInfo().setPhysRegUsed(Reg);
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ *SubRegs; ++SubRegs) {
+ if (PhysRegsUsed[*SubRegs] != -2) {
+ AddToPhysRegsUseOrder(*SubRegs);
+ PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now
+ MF->getRegInfo().setPhysRegUsed(*SubRegs);
+ }
+ }
+ }
+ }
+
+ SmallVector<unsigned, 8> DeadDefs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadDefs.push_back(MO.getReg());
+ }
+
+ // Okay, we have allocated all of the source operands and spilled any values
+ // that would be destroyed by defs of this instruction. Loop over the
+ // explicit defs and assign them to a register, spilling incoming values if
+ // we need to scavenge a register.
+ //
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand& MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() &&
+ !MO.isEarlyClobber() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned DestVirtReg = MO.getReg();
+ unsigned DestPhysReg;
+
+ // If DestVirtReg already has a value, use it.
+ if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg)))
+ DestPhysReg = getReg(MBB, MI, DestVirtReg);
+ MF->getRegInfo().setPhysRegUsed(DestPhysReg);
+ markVirtRegModified(DestVirtReg);
+ getVirtRegLastUse(DestVirtReg) = std::make_pair((MachineInstr*)0, 0);
+ DOUT << " Assigning " << TRI->getName(DestPhysReg)
+ << " to %reg" << DestVirtReg << "\n";
+ MO.setReg(DestPhysReg); // Assign the output register
+ }
+ }
+
+ // If this instruction defines any registers that are immediately dead,
+ // kill them now.
+ //
+ for (unsigned i = 0, e = DeadDefs.size(); i != e; ++i) {
+ unsigned VirtReg = DeadDefs[i];
+ unsigned PhysReg = VirtReg;
+ if (TargetRegisterInfo::isVirtualRegister(VirtReg)) {
+ unsigned &PhysRegSlot = getVirt2PhysRegMapSlot(VirtReg);
+ PhysReg = PhysRegSlot;
+ assert(PhysReg != 0);
+ PhysRegSlot = 0;
+ } else if (PhysRegsUsed[PhysReg] == -2) {
+ // Unallocatable register dead, ignore.
+ continue;
+ }
+
+ if (PhysReg) {
+ DOUT << " Register " << TRI->getName(PhysReg)
+ << " [%reg" << VirtReg
+ << "] is never used, removing it from live set\n";
+ removePhysReg(PhysReg);
+ for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg);
+ *AliasSet; ++AliasSet) {
+ if (PhysRegsUsed[*AliasSet] != -2) {
+ DOUT << " Register " << TRI->getName(*AliasSet)
+ << " [%reg" << *AliasSet
+ << "] is never used, removing it from live set\n";
+ removePhysReg(*AliasSet);
+ }
+ }
+ }
+ }
+
+ // Finally, if this is a noop copy instruction, zap it. (Except that if
+ // the copy is dead, it must be kept to avoid messing up liveness info for
+ // the register scavenger. See pr4100.)
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg) &&
+ SrcReg == DstReg && DeadDefs.empty())
+ MBB.erase(MI);
+ }
+
+ MachineBasicBlock::iterator MI = MBB.getFirstTerminator();
+
+ // Spill all physical registers holding virtual registers now.
+ for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i)
+ if (PhysRegsUsed[i] != -1 && PhysRegsUsed[i] != -2) {
+ if (unsigned VirtReg = PhysRegsUsed[i])
+ spillVirtReg(MBB, MI, VirtReg, i);
+ else
+ removePhysReg(i);
+ }
+
+#if 0
+ // This checking code is very expensive.
+ bool AllOk = true;
+ for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
+ e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i)
+ if (unsigned PR = Virt2PhysRegMap[i]) {
+ cerr << "Register still mapped: " << i << " -> " << PR << "\n";
+ AllOk = false;
+ }
+ assert(AllOk && "Virtual registers still in phys regs?");
+#endif
+
+  // Clear any physical registers which appear live at the end of the basic
+  // block but which do not hold any virtual registers, e.g., the stack
+  // pointer.
+ PhysRegsUseOrder.clear();
+}
+
+/// runOnMachineFunction - Register allocate the whole function
+///
+bool RALocal::runOnMachineFunction(MachineFunction &Fn) {
+ DOUT << "Machine Function " << "\n";
+ MF = &Fn;
+ TM = &Fn.getTarget();
+ TRI = TM->getRegisterInfo();
+ TII = TM->getInstrInfo();
+
+ PhysRegsUsed.assign(TRI->getNumRegs(), -1);
+
+ // At various places we want to efficiently check to see whether a register
+ // is allocatable. To handle this, we mark all unallocatable registers as
+ // being pinned down, permanently.
+ {
+ BitVector Allocable = TRI->getAllocatableSet(Fn);
+ for (unsigned i = 0, e = Allocable.size(); i != e; ++i)
+ if (!Allocable[i])
+ PhysRegsUsed[i] = -2; // Mark the reg unallocable.
+ }
+
+  // Initialize the virtual->physical register map to have a 'null'
+  // mapping for all virtual registers.
+ unsigned LastVirtReg = MF->getRegInfo().getLastVirtReg();
+ StackSlotForVirtReg.grow(LastVirtReg);
+ Virt2PhysRegMap.grow(LastVirtReg);
+ Virt2LastUseMap.grow(LastVirtReg);
+ VirtRegModified.resize(LastVirtReg+1-TargetRegisterInfo::FirstVirtualRegister);
+ UsedInMultipleBlocks.resize(LastVirtReg+1-TargetRegisterInfo::FirstVirtualRegister);
+
+ // Loop over all of the basic blocks, eliminating virtual register references
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB)
+ AllocateBasicBlock(*MBB);
+
+ StackSlotForVirtReg.clear();
+ PhysRegsUsed.clear();
+ VirtRegModified.clear();
+ UsedInMultipleBlocks.clear();
+ Virt2PhysRegMap.clear();
+ Virt2LastUseMap.clear();
+ return true;
+}
+
+FunctionPass *llvm::createLocalRegisterAllocator() {
+ return new RALocal();
+}
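+
+// Usage note: the RegisterRegAlloc entry near the top of this file registers
+// this allocator under the name "local", so it can typically be selected on
+// the llc command line with -regalloc=local.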
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
new file mode 100644
index 0000000..61450a7
--- /dev/null
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -0,0 +1,871 @@
+//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a Partitioned Boolean Quadratic Programming (PBQP) based
+// register allocator for LLVM. This allocator works by constructing a PBQP
+// problem representing the register allocation problem under consideration,
+// solving this using a PBQP solver, and mapping the solution back to a
+// register assignment. If any variables are selected for spilling then spill
+// code is inserted and the process repeated.
+//
+// The PBQP solver (pbqp.c) provided for this allocator uses a heuristic tuned
+// for register allocation. For more information on PBQP for register
+// allocation, see the following papers:
+//
+// (1) Hames, L. and Scholz, B. 2006. Nearly optimal register allocation with
+// PBQP. In Proceedings of the 7th Joint Modular Languages Conference
+// (JMLC'06). LNCS, vol. 4228. Springer, New York, NY, USA. 346-361.
+//
+// (2) Scholz, B., Eckstein, E. 2002. Register allocation for irregular
+// architectures. In Proceedings of the Joint Conference on Languages,
+// Compilers and Tools for Embedded Systems (LCTES'02), ACM Press, New York,
+// NY, USA, 139-148.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+
+#include "PBQP.h"
+#include "VirtRegMap.h"
+#include "VirtRegRewriter.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <limits>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+
+using namespace llvm;
+
+static RegisterRegAlloc
+registerPBQPRepAlloc("pbqp", "PBQP register allocator",
+ createPBQPRegisterAllocator);
+
+namespace {
+
+ //!
+ //! PBQP based allocators solve the register allocation problem by mapping
+ //! register allocation problems to Partitioned Boolean Quadratic
+ //! Programming problems.
+ class VISIBILITY_HIDDEN PBQPRegAlloc : public MachineFunctionPass {
+ public:
+
+ static char ID;
+
+ //! Construct a PBQP register allocator.
+ PBQPRegAlloc() : MachineFunctionPass((intptr_t)&ID) {}
+
+ //! Return the pass name.
+ virtual const char* getPassName() const throw() {
+ return "PBQP Register Allocator";
+ }
+
+ //! PBQP analysis usage.
+ virtual void getAnalysisUsage(AnalysisUsage &au) const {
+ au.addRequired<LiveIntervals>();
+ au.addRequiredTransitive<RegisterCoalescer>();
+ au.addRequired<LiveStacks>();
+ au.addPreserved<LiveStacks>();
+ au.addRequired<MachineLoopInfo>();
+ au.addPreserved<MachineLoopInfo>();
+ au.addRequired<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(au);
+ }
+
+ //! Perform register allocation
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ private:
+ typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;
+ typedef std::vector<const LiveInterval*> Node2LIMap;
+ typedef std::vector<unsigned> AllowedSet;
+ typedef std::vector<AllowedSet> AllowedSetMap;
+ typedef std::set<unsigned> RegSet;
+ typedef std::pair<unsigned, unsigned> RegPair;
+ typedef std::map<RegPair, PBQPNum> CoalesceMap;
+
+ typedef std::set<LiveInterval*> LiveIntervalSet;
+
+ MachineFunction *mf;
+ const TargetMachine *tm;
+ const TargetRegisterInfo *tri;
+ const TargetInstrInfo *tii;
+ const MachineLoopInfo *loopInfo;
+ MachineRegisterInfo *mri;
+
+ LiveIntervals *lis;
+ LiveStacks *lss;
+ VirtRegMap *vrm;
+
+ LI2NodeMap li2Node;
+ Node2LIMap node2LI;
+ AllowedSetMap allowedSets;
+ LiveIntervalSet vregIntervalsToAlloc,
+ emptyVRegIntervals;
+
+
+ //! Builds a PBQP cost vector.
+ template <typename RegContainer>
+ PBQPVector* buildCostVector(unsigned vReg,
+ const RegContainer &allowed,
+                                const CoalesceMap &coalesces,
+ PBQPNum spillCost) const;
+
+ //! \brief Builds a PBQP interference matrix.
+ //!
+ //! @return Either a pointer to a non-zero PBQP matrix representing the
+ //! allocation option costs, or a null pointer for a zero matrix.
+ //!
+ //! Expects allowed sets for two interfering LiveIntervals. These allowed
+ //! sets should contain only allocable registers from the LiveInterval's
+ //! register class, with any interfering pre-colored registers removed.
+ template <typename RegContainer>
+ PBQPMatrix* buildInterferenceMatrix(const RegContainer &allowed1,
+ const RegContainer &allowed2) const;
+
+    //! \brief Builds a PBQP coalescing matrix.
+    //!
+ //! Expects allowed sets for two potentially coalescable LiveIntervals,
+ //! and an estimated benefit due to coalescing. The allowed sets should
+ //! contain only allocable registers from the LiveInterval's register
+ //! classes, with any interfering pre-colored registers removed.
+ template <typename RegContainer>
+ PBQPMatrix* buildCoalescingMatrix(const RegContainer &allowed1,
+ const RegContainer &allowed2,
+ PBQPNum cBenefit) const;
+
+ //! \brief Finds coalescing opportunities and returns them as a map.
+ //!
+ //! Any entries in the map are guaranteed coalescable, even if their
+ //! corresponding live intervals overlap.
+ CoalesceMap findCoalesces();
+
+ //! \brief Finds the initial set of vreg intervals to allocate.
+ void findVRegIntervalsToAlloc();
+
+ //! \brief Constructs a PBQP problem representation of the register
+ //! allocation problem for this function.
+ //!
+ //! @return a PBQP solver object for the register allocation problem.
+ pbqp* constructPBQPProblem();
+
+ //! \brief Adds a stack interval if the given live interval has been
+ //! spilled. Used to support stack slot coloring.
+    void addStackInterval(const LiveInterval *spilled,
+                          MachineRegisterInfo *mri);
+
+ //! \brief Given a solved PBQP problem maps this solution back to a register
+ //! assignment.
+ bool mapPBQPToRegAlloc(pbqp *problem);
+
+ //! \brief Postprocessing before final spilling. Sets basic block "live in"
+ //! variables.
+ void finalizeAlloc() const;
+
+ };
+
+ char PBQPRegAlloc::ID = 0;
+}
+
+
+template <typename RegContainer>
+PBQPVector* PBQPRegAlloc::buildCostVector(unsigned vReg,
+ const RegContainer &allowed,
+ const CoalesceMap &coalesces,
+ PBQPNum spillCost) const {
+
+ typedef typename RegContainer::const_iterator AllowedItr;
+
+ // Allocate vector. Additional element (0th) used for spill option
+ PBQPVector *v = new PBQPVector(allowed.size() + 1);
+
+ (*v)[0] = spillCost;
+
+ // Iterate over the allowed registers inserting coalesce benefits if there
+ // are any.
+ unsigned ai = 0;
+ for (AllowedItr itr = allowed.begin(), end = allowed.end();
+ itr != end; ++itr, ++ai) {
+
+ unsigned pReg = *itr;
+
+ CoalesceMap::const_iterator cmItr =
+ coalesces.find(RegPair(vReg, pReg));
+
+ // No coalesce - on to the next preg.
+ if (cmItr == coalesces.end())
+ continue;
+
+ // We have a coalesce - insert the benefit.
+ (*v)[ai + 1] = -cmItr->second;
+ }
+
+ return v;
+}
+
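+// A worked cost-vector example (values illustrative): for a vreg with
+// allowed = {R0, R1}, spillCost 4.0, and a coalesce benefit of 1.5 with R1,
+// the vector built above is [4.0, 0.0, -1.5]: element 0 is always the spill
+// option, element i+1 corresponds to allowed[i], and entries with no
+// coalesce benefit are left at zero.
+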
+template <typename RegContainer>
+PBQPMatrix* PBQPRegAlloc::buildInterferenceMatrix(
+ const RegContainer &allowed1, const RegContainer &allowed2) const {
+
+ typedef typename RegContainer::const_iterator RegContainerIterator;
+
+ // Construct a PBQP matrix representing the cost of allocation options. The
+ // rows and columns correspond to the allocation options for the two live
+ // intervals. Elements will be infinite where corresponding registers alias,
+ // since we cannot allocate aliasing registers to interfering live intervals.
+ // All other elements (non-aliasing combinations) will have zero cost. Note
+ // that the spill option (element 0,0) has zero cost, since we can allocate
+ // both intervals to memory safely (the cost for each individual allocation
+ // to memory is accounted for by the cost vectors for each live interval).
+ PBQPMatrix *m = new PBQPMatrix(allowed1.size() + 1, allowed2.size() + 1);
+
+ // Assume this is a zero matrix until proven otherwise. Zero matrices occur
+ // between interfering live ranges with non-overlapping register sets (e.g.
+ // non-overlapping reg classes, or disjoint sets of allowed regs within the
+ // same class). The term "overlapping" is used advisedly: sets which do not
+ // intersect, but contain registers which alias, will have non-zero matrices.
+ // We optimize zero matrices away to improve solver speed.
+ bool isZeroMatrix = true;
+
+
+ // Row index. Starts at 1, since the 0th row is for the spill option, which
+ // is always zero.
+ unsigned ri = 1;
+
+ // Iterate over allowed sets, insert infinities where required.
+ for (RegContainerIterator a1Itr = allowed1.begin(), a1End = allowed1.end();
+ a1Itr != a1End; ++a1Itr) {
+
+ // Column index, starts at 1 as for row index.
+ unsigned ci = 1;
+ unsigned reg1 = *a1Itr;
+
+ for (RegContainerIterator a2Itr = allowed2.begin(), a2End = allowed2.end();
+ a2Itr != a2End; ++a2Itr) {
+
+ unsigned reg2 = *a2Itr;
+
+ // If the row/column regs are identical or alias insert an infinity.
+ if ((reg1 == reg2) || tri->areAliases(reg1, reg2)) {
+ (*m)[ri][ci] = std::numeric_limits<PBQPNum>::infinity();
+ isZeroMatrix = false;
+ }
+
+ ++ci;
+ }
+
+ ++ri;
+ }
+
+ // If this turns out to be a zero matrix...
+ if (isZeroMatrix) {
+    // ...free it and return null.
+ delete m;
+ return 0;
+ }
+
+ // ...otherwise return the cost matrix.
+ return m;
+}
+
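+// A worked interference-matrix example (illustrative registers), with
+// allowed1 = {EAX, EBX} and allowed2 = {EBX, ECX}, unset entries at zero:
+//
+//            spill   EBX    ECX
+//   spill      0      0      0
+//   EAX        0      0      0
+//   EBX        0     inf     0
+//
+// Only the (EBX, EBX) option pairs identical (or aliasing) registers; a
+// matrix with no infinities at all is deleted and reported as zero above.
+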
+template <typename RegContainer>
+PBQPMatrix* PBQPRegAlloc::buildCoalescingMatrix(
+ const RegContainer &allowed1, const RegContainer &allowed2,
+ PBQPNum cBenefit) const {
+
+ typedef typename RegContainer::const_iterator RegContainerIterator;
+
+ // Construct a PBQP Matrix representing the benefits of coalescing. As with
+ // interference matrices the rows and columns represent allowed registers
+ // for the LiveIntervals which are (potentially) to be coalesced. The amount
+ // -cBenefit will be placed in any element representing the same register
+ // for both intervals.
+ PBQPMatrix *m = new PBQPMatrix(allowed1.size() + 1, allowed2.size() + 1);
+
+ // Reset costs to zero.
+ m->reset(0);
+
+ // Assume the matrix is zero till proven otherwise. Zero matrices will be
+ // optimized away as in the interference case.
+ bool isZeroMatrix = true;
+
+ // Row index. Starts at 1, since the 0th row is for the spill option, which
+ // is always zero.
+ unsigned ri = 1;
+
+ // Iterate over the allowed sets, insert coalescing benefits where
+ // appropriate.
+ for (RegContainerIterator a1Itr = allowed1.begin(), a1End = allowed1.end();
+ a1Itr != a1End; ++a1Itr) {
+
+ // Column index, starts at 1 as for row index.
+ unsigned ci = 1;
+ unsigned reg1 = *a1Itr;
+
+ for (RegContainerIterator a2Itr = allowed2.begin(), a2End = allowed2.end();
+ a2Itr != a2End; ++a2Itr) {
+
+      // If the row and column represent the same register, insert a
+      // beneficial cost to prefer this allocation - it would allow us to
+      // eliminate a move instruction.
+ if (reg1 == *a2Itr) {
+ (*m)[ri][ci] = -cBenefit;
+ isZeroMatrix = false;
+ }
+
+ ++ci;
+ }
+
+ ++ri;
+ }
+
+ // If this turns out to be a zero matrix...
+ if (isZeroMatrix) {
+ // ...free it and return null.
+ delete m;
+ return 0;
+ }
+
+ return m;
+}
+
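+// Illustrative sketch only: for the same allowed sets as in the example
+// above and a coalesce benefit of 5.0, the coalescing matrix would instead
+// hold -5.0 at the (R1, R1) element and zero elsewhere, biasing the solver
+// towards assigning both intervals to R1 so the copy between them can be
+// eliminated.
+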
+PBQPRegAlloc::CoalesceMap PBQPRegAlloc::findCoalesces() {
+
+ typedef MachineFunction::const_iterator MFIterator;
+ typedef MachineBasicBlock::const_iterator MBBIterator;
+ typedef LiveInterval::const_vni_iterator VNIIterator;
+
+ CoalesceMap coalescesFound;
+
+ // To find coalesces we need to iterate over the function looking for
+ // copy instructions.
+ for (MFIterator bbItr = mf->begin(), bbEnd = mf->end();
+ bbItr != bbEnd; ++bbItr) {
+
+ const MachineBasicBlock *mbb = &*bbItr;
+
+ for (MBBIterator iItr = mbb->begin(), iEnd = mbb->end();
+ iItr != iEnd; ++iItr) {
+
+ const MachineInstr *instr = &*iItr;
+ unsigned srcReg, dstReg, srcSubReg, dstSubReg;
+
+ // If this isn't a copy then continue to the next instruction.
+ if (!tii->isMoveInstr(*instr, srcReg, dstReg, srcSubReg, dstSubReg))
+ continue;
+
+ // If the registers are already the same our job is nice and easy.
+ if (dstReg == srcReg)
+ continue;
+
+ bool srcRegIsPhysical = TargetRegisterInfo::isPhysicalRegister(srcReg),
+ dstRegIsPhysical = TargetRegisterInfo::isPhysicalRegister(dstReg);
+
+ // If both registers are physical then we can't coalesce.
+ if (srcRegIsPhysical && dstRegIsPhysical)
+ continue;
+
+ // If it's a copy that includes a virtual register but the source and
+ // destination classes differ then we can't coalesce, so continue with
+ // the next instruction.
+ const TargetRegisterClass *srcRegClass = srcRegIsPhysical ?
+ tri->getPhysicalRegisterRegClass(srcReg) : mri->getRegClass(srcReg);
+
+ const TargetRegisterClass *dstRegClass = dstRegIsPhysical ?
+ tri->getPhysicalRegisterRegClass(dstReg) : mri->getRegClass(dstReg);
+
+ if (srcRegClass != dstRegClass)
+ continue;
+
+      // We also need any physical regs to be allocable; coalescing with
+      // a non-allocable register is invalid.
+ if (srcRegIsPhysical) {
+ if (std::find(srcRegClass->allocation_order_begin(*mf),
+ srcRegClass->allocation_order_end(*mf), srcReg) ==
+ srcRegClass->allocation_order_end(*mf))
+ continue;
+ }
+
+ if (dstRegIsPhysical) {
+ if (std::find(dstRegClass->allocation_order_begin(*mf),
+ dstRegClass->allocation_order_end(*mf), dstReg) ==
+ dstRegClass->allocation_order_end(*mf))
+ continue;
+ }
+
+ // If we've made it here we have a copy with compatible register classes.
+ // We can probably coalesce, but we need to consider overlap.
+ const LiveInterval *srcLI = &lis->getInterval(srcReg),
+ *dstLI = &lis->getInterval(dstReg);
+
+ if (srcLI->overlaps(*dstLI)) {
+ // Even in the case of an overlap we might still be able to coalesce,
+ // but we need to make sure that no definition of either range occurs
+ // while the other range is live.
+
+        // Start by assuming we're ok.
+ bool badDef = false;
+
+ // Test all defs of the source range.
+ for (VNIIterator
+ vniItr = srcLI->vni_begin(), vniEnd = srcLI->vni_end();
+ vniItr != vniEnd; ++vniItr) {
+
+ // If we find a def that kills the coalescing opportunity then
+ // record it and break from the loop.
+ if (dstLI->liveAt((*vniItr)->def)) {
+ badDef = true;
+ break;
+ }
+ }
+
+        // If we found a bad def, give up and continue to the next instruction.
+ if (badDef)
+ continue;
+
+ // Otherwise test definitions of the destination range.
+ for (VNIIterator
+ vniItr = dstLI->vni_begin(), vniEnd = dstLI->vni_end();
+ vniItr != vniEnd; ++vniItr) {
+
+ // We want to make sure we skip the copy instruction itself.
+ if ((*vniItr)->copy == instr)
+ continue;
+
+ if (srcLI->liveAt((*vniItr)->def)) {
+ badDef = true;
+ break;
+ }
+ }
+
+        // As before, if we found a bad def we give up and continue to the
+        // next instruction.
+ if (badDef)
+ continue;
+ }
+
+ // If we make it to here then either the ranges didn't overlap, or they
+ // did, but none of their definitions would prevent us from coalescing.
+ // We're good to go with the coalesce.
+
+ float cBenefit = powf(10.0f, loopInfo->getLoopDepth(mbb)) / 5.0;
+
+ coalescesFound[RegPair(srcReg, dstReg)] = cBenefit;
+ coalescesFound[RegPair(dstReg, srcReg)] = cBenefit;
+ }
+
+ }
+
+ return coalescesFound;
+}
+
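+// A quick sanity check on the benefit formula above (illustrative numbers):
+// cBenefit = powf(10.0f, loopDepth) / 5.0, so a copy at loop depth 0 earns
+// a benefit of 0.2, depth 1 earns 2.0, and depth 2 earns 20.0 - coalescing
+// copies in deeply nested loops is rewarded exponentially.
+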
+void PBQPRegAlloc::findVRegIntervalsToAlloc() {
+
+ // Iterate over all live ranges.
+ for (LiveIntervals::iterator itr = lis->begin(), end = lis->end();
+ itr != end; ++itr) {
+
+ // Ignore physical ones.
+ if (TargetRegisterInfo::isPhysicalRegister(itr->first))
+ continue;
+
+ LiveInterval *li = itr->second;
+
+ // If this live interval is non-empty we will use pbqp to allocate it.
+ // Empty intervals we allocate in a simple post-processing stage in
+ // finalizeAlloc.
+ if (!li->empty()) {
+ vregIntervalsToAlloc.insert(li);
+ }
+ else {
+ emptyVRegIntervals.insert(li);
+ }
+ }
+}
+
+pbqp* PBQPRegAlloc::constructPBQPProblem() {
+
+ typedef std::vector<const LiveInterval*> LIVector;
+ typedef std::vector<unsigned> RegVector;
+
+ // This will store the physical intervals for easy reference.
+ LIVector physIntervals;
+
+ // Start by clearing the old node <-> live interval mappings & allowed sets
+ li2Node.clear();
+ node2LI.clear();
+ allowedSets.clear();
+
+ // Populate physIntervals, update preg use:
+ for (LiveIntervals::iterator itr = lis->begin(), end = lis->end();
+ itr != end; ++itr) {
+
+ if (TargetRegisterInfo::isPhysicalRegister(itr->first)) {
+ physIntervals.push_back(itr->second);
+ mri->setPhysRegUsed(itr->second->reg);
+ }
+ }
+
+ // Iterate over vreg intervals, construct live interval <-> node number
+ // mappings.
+ for (LiveIntervalSet::const_iterator
+ itr = vregIntervalsToAlloc.begin(), end = vregIntervalsToAlloc.end();
+ itr != end; ++itr) {
+ const LiveInterval *li = *itr;
+
+ li2Node[li] = node2LI.size();
+ node2LI.push_back(li);
+ }
+
+ // Get the set of potential coalesces.
+ CoalesceMap coalesces(findCoalesces());
+
+ // Construct a PBQP solver for this problem
+ pbqp *solver = alloc_pbqp(vregIntervalsToAlloc.size());
+
+ // Resize allowedSets container appropriately.
+ allowedSets.resize(vregIntervalsToAlloc.size());
+
+ // Iterate over virtual register intervals to compute allowed sets...
+ for (unsigned node = 0; node < node2LI.size(); ++node) {
+
+ // Grab pointers to the interval and its register class.
+ const LiveInterval *li = node2LI[node];
+ const TargetRegisterClass *liRC = mri->getRegClass(li->reg);
+
+ // Start by assuming all allocable registers in the class are allowed...
+ RegVector liAllowed(liRC->allocation_order_begin(*mf),
+ liRC->allocation_order_end(*mf));
+
+ // Eliminate the physical registers which overlap with this range, along
+ // with all their aliases.
+ for (LIVector::iterator pItr = physIntervals.begin(),
+ pEnd = physIntervals.end(); pItr != pEnd; ++pItr) {
+
+ if (!li->overlaps(**pItr))
+ continue;
+
+ unsigned pReg = (*pItr)->reg;
+
+ // If we get here then the live intervals overlap, but we're still ok
+ // if they're coalescable.
+ if (coalesces.find(RegPair(li->reg, pReg)) != coalesces.end())
+ continue;
+
+ // If we get here then we have a genuine exclusion.
+
+ // Remove the overlapping reg...
+ RegVector::iterator eraseItr =
+ std::find(liAllowed.begin(), liAllowed.end(), pReg);
+
+ if (eraseItr != liAllowed.end())
+ liAllowed.erase(eraseItr);
+
+ const unsigned *aliasItr = tri->getAliasSet(pReg);
+
+ if (aliasItr != 0) {
+ // ...and its aliases.
+ for (; *aliasItr != 0; ++aliasItr) {
+ RegVector::iterator eraseItr =
+ std::find(liAllowed.begin(), liAllowed.end(), *aliasItr);
+
+ if (eraseItr != liAllowed.end()) {
+ liAllowed.erase(eraseItr);
+ }
+ }
+ }
+ }
+
+ // Copy the allowed set into a member vector for use when constructing cost
+ // vectors & matrices, and mapping PBQP solutions back to assignments.
+ allowedSets[node] = AllowedSet(liAllowed.begin(), liAllowed.end());
+
+ // Set the spill cost to the interval weight, or epsilon if the
+ // interval weight is zero
+ PBQPNum spillCost = (li->weight != 0.0) ?
+ li->weight : std::numeric_limits<PBQPNum>::min();
+
+ // Build a cost vector for this interval.
+ add_pbqp_nodecosts(solver, node,
+ buildCostVector(li->reg, allowedSets[node], coalesces,
+ spillCost));
+
+ }
+
+
+ // Now add the cost matrices...
+ for (unsigned node1 = 0; node1 < node2LI.size(); ++node1) {
+ const LiveInterval *li = node2LI[node1];
+
+ // Test for live range overlaps and insert interference matrices.
+ for (unsigned node2 = node1 + 1; node2 < node2LI.size(); ++node2) {
+ const LiveInterval *li2 = node2LI[node2];
+
+ CoalesceMap::const_iterator cmItr =
+ coalesces.find(RegPair(li->reg, li2->reg));
+
+ PBQPMatrix *m = 0;
+
+ if (cmItr != coalesces.end()) {
+ m = buildCoalescingMatrix(allowedSets[node1], allowedSets[node2],
+ cmItr->second);
+ }
+ else if (li->overlaps(*li2)) {
+ m = buildInterferenceMatrix(allowedSets[node1], allowedSets[node2]);
+ }
+
+ if (m != 0) {
+ add_pbqp_edgecosts(solver, node1, node2, m);
+ delete m;
+ }
+ }
+ }
+
+ // We're done, PBQP problem constructed - return it.
+ return solver;
+}
+
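+// Illustrative sketch only: the PBQP problem built above is a graph in which
+// each node is an unallocated vreg carrying a cost vector over its options
+// (spill + allowed registers) and each edge carries a matrix coupling the
+// choices of two vregs. For two hypothetical interfering vregs A and B the
+// construction reduces to:
+//
+//   add_pbqp_nodecosts(solver, nodeA, costVectorA);
+//   add_pbqp_nodecosts(solver, nodeB, costVectorB);
+//   add_pbqp_edgecosts(solver, nodeA, nodeB, interferenceMatrixAB);
+//
+// Zero matrices are never added, which keeps the graph sparse.
+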
+void PBQPRegAlloc::addStackInterval(const LiveInterval *spilled,
+ MachineRegisterInfo* mri) {
+ int stackSlot = vrm->getStackSlot(spilled->reg);
+
+ if (stackSlot == VirtRegMap::NO_STACK_SLOT)
+ return;
+
+ const TargetRegisterClass *RC = mri->getRegClass(spilled->reg);
+ LiveInterval &stackInterval = lss->getOrCreateInterval(stackSlot, RC);
+
+ VNInfo *vni;
+ if (stackInterval.getNumValNums() != 0)
+ vni = stackInterval.getValNumInfo(0);
+ else
+ vni = stackInterval.getNextValue(-0U, 0, lss->getVNInfoAllocator());
+
+ LiveInterval &rhsInterval = lis->getInterval(spilled->reg);
+ stackInterval.MergeRangesInAsValue(rhsInterval, vni);
+}
+
+bool PBQPRegAlloc::mapPBQPToRegAlloc(pbqp *problem) {
+
+ // Set to true if we have any spills
+ bool anotherRoundNeeded = false;
+
+ // Clear the existing allocation.
+ vrm->clearAllVirt();
+
+ // Iterate over the nodes mapping the PBQP solution to a register assignment.
+ for (unsigned node = 0; node < node2LI.size(); ++node) {
+ unsigned virtReg = node2LI[node]->reg,
+ allocSelection = get_pbqp_solution(problem, node);
+
+ // If the PBQP solution is non-zero it's a physical register...
+ if (allocSelection != 0) {
+ // Get the physical reg, subtracting 1 to account for the spill option.
+ unsigned physReg = allowedSets[node][allocSelection - 1];
+
+ DOUT << "VREG " << virtReg << " -> " << tri->getName(physReg) << "\n";
+
+ assert(physReg != 0);
+
+ // Add to the virt reg map and update the used phys regs.
+ vrm->assignVirt2Phys(virtReg, physReg);
+ }
+ // ...Otherwise it's a spill.
+ else {
+
+ // Make sure we ignore this virtual reg on the next round
+ // of allocation
+ vregIntervalsToAlloc.erase(&lis->getInterval(virtReg));
+
+ // Insert spill ranges for this live range
+ const LiveInterval *spillInterval = node2LI[node];
+ double oldSpillWeight = spillInterval->weight;
+ SmallVector<LiveInterval*, 8> spillIs;
+ std::vector<LiveInterval*> newSpills =
+ lis->addIntervalsForSpills(*spillInterval, spillIs, loopInfo, *vrm);
+ addStackInterval(spillInterval, mri);
+
+ DOUT << "VREG " << virtReg << " -> SPILLED (Cost: "
+ << oldSpillWeight << ", New vregs: ";
+
+ // Copy any newly inserted live intervals into the list of regs to
+ // allocate.
+ for (std::vector<LiveInterval*>::const_iterator
+ itr = newSpills.begin(), end = newSpills.end();
+ itr != end; ++itr) {
+
+ assert(!(*itr)->empty() && "Empty spill range.");
+
+ DOUT << (*itr)->reg << " ";
+
+ vregIntervalsToAlloc.insert(*itr);
+ }
+
+ DOUT << ")\n";
+
+ // We need another round if spill intervals were added.
+ anotherRoundNeeded |= !newSpills.empty();
+ }
+ }
+
+ return !anotherRoundNeeded;
+}
+
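+// Decoding reminder (illustrative): a PBQP solution of 0 for a node means
+// "spill", while a solution of k > 0 selects allowedSets[node][k - 1]. So
+// with allowedSets[node] = {R0, R1}, a solution of 2 assigns R1 to the vreg.
+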
+void PBQPRegAlloc::finalizeAlloc() const {
+ typedef LiveIntervals::iterator LIIterator;
+ typedef LiveInterval::Ranges::const_iterator LRIterator;
+
+ // First allocate registers for the empty intervals.
+ for (LiveIntervalSet::const_iterator
+ itr = emptyVRegIntervals.begin(), end = emptyVRegIntervals.end();
+ itr != end; ++itr) {
+ LiveInterval *li = *itr;
+
+ unsigned physReg = li->preference;
+
+ if (physReg == 0) {
+ const TargetRegisterClass *liRC = mri->getRegClass(li->reg);
+ physReg = *liRC->allocation_order_begin(*mf);
+ }
+
+ vrm->assignVirt2Phys(li->reg, physReg);
+ }
+
+ // Finally iterate over the basic blocks to compute and set the live-in sets.
+ SmallVector<MachineBasicBlock*, 8> liveInMBBs;
+ MachineBasicBlock *entryMBB = &*mf->begin();
+
+ for (LIIterator liItr = lis->begin(), liEnd = lis->end();
+ liItr != liEnd; ++liItr) {
+
+ const LiveInterval *li = liItr->second;
+ unsigned reg = 0;
+
+ // Get the physical register for this interval
+ if (TargetRegisterInfo::isPhysicalRegister(li->reg)) {
+ reg = li->reg;
+ }
+ else if (vrm->isAssignedReg(li->reg)) {
+ reg = vrm->getPhys(li->reg);
+ }
+ else {
+ // Ranges which are assigned a stack slot only are ignored.
+ continue;
+ }
+
+ // Ignore unallocated vregs:
+ if (reg == 0) {
+ continue;
+ }
+
+ // Iterate over the ranges of the current interval...
+ for (LRIterator lrItr = li->begin(), lrEnd = li->end();
+ lrItr != lrEnd; ++lrItr) {
+
+ // Find the set of basic blocks which this range is live into...
+ if (lis->findLiveInMBBs(lrItr->start, lrItr->end, liveInMBBs)) {
+ // And add the physreg for this interval to their live-in sets.
+ for (unsigned i = 0; i < liveInMBBs.size(); ++i) {
+ if (liveInMBBs[i] != entryMBB) {
+ if (!liveInMBBs[i]->isLiveIn(reg)) {
+ liveInMBBs[i]->addLiveIn(reg);
+ }
+ }
+ }
+ liveInMBBs.clear();
+ }
+ }
+ }
+
+}
+
+bool PBQPRegAlloc::runOnMachineFunction(MachineFunction &MF) {
+
+ mf = &MF;
+ tm = &mf->getTarget();
+ tri = tm->getRegisterInfo();
+ tii = tm->getInstrInfo();
+ mri = &mf->getRegInfo();
+
+ lis = &getAnalysis<LiveIntervals>();
+ lss = &getAnalysis<LiveStacks>();
+ loopInfo = &getAnalysis<MachineLoopInfo>();
+
+ vrm = &getAnalysis<VirtRegMap>();
+
+ DOUT << "PBQP Register Allocating for " << mf->getFunction()->getName() << "\n";
+
+ // Allocator main loop:
+ //
+ // * Map current regalloc problem to a PBQP problem
+ // * Solve the PBQP problem
+ // * Map the solution back to a register allocation
+ // * Spill if necessary
+ //
+  // This process is repeated until no more spills are generated.
+
+ // Find the vreg intervals in need of allocation.
+ findVRegIntervalsToAlloc();
+
+ // If there aren't any then we're done here.
+ if (vregIntervalsToAlloc.empty() && emptyVRegIntervals.empty())
+ return true;
+
+ // If there are non-empty intervals allocate them using pbqp.
+ if (!vregIntervalsToAlloc.empty()) {
+
+ bool pbqpAllocComplete = false;
+ unsigned round = 0;
+
+ while (!pbqpAllocComplete) {
+ DOUT << " PBQP Regalloc round " << round << ":\n";
+
+ pbqp *problem = constructPBQPProblem();
+
+ solve_pbqp(problem);
+
+ pbqpAllocComplete = mapPBQPToRegAlloc(problem);
+
+ free_pbqp(problem);
+
+ ++round;
+ }
+ }
+
+ // Finalise allocation, allocate empty ranges.
+ finalizeAlloc();
+
+ vregIntervalsToAlloc.clear();
+ emptyVRegIntervals.clear();
+ li2Node.clear();
+ node2LI.clear();
+ allowedSets.clear();
+
+ DOUT << "Post alloc VirtRegMap:\n" << *vrm << "\n";
+
+ // Run rewriter
+ std::auto_ptr<VirtRegRewriter> rewriter(createVirtRegRewriter());
+
+ rewriter->runOnMachineFunction(*mf, *vrm, lis);
+
+ return true;
+}
+
+FunctionPass* llvm::createPBQPRegisterAllocator() {
+ return new PBQPRegAlloc();
+}
+
+
+#undef DEBUG_TYPE
diff --git a/lib/CodeGen/RegAllocSimple.cpp b/lib/CodeGen/RegAllocSimple.cpp
new file mode 100644
index 0000000..447e54c
--- /dev/null
+++ b/lib/CodeGen/RegAllocSimple.cpp
@@ -0,0 +1,257 @@
+//===-- RegAllocSimple.cpp - A simple generic register allocator ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple register allocator. *Very* simple: it
+// immediately spills every value right after it is computed, and it reloads
+// all used operands from the spill area to temporary registers before each
+// instruction. It does not keep values in registers across instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumStores, "Number of stores added");
+STATISTIC(NumLoads , "Number of loads added");
+
+namespace {
+ static RegisterRegAlloc
+ simpleRegAlloc("simple", "simple register allocator",
+ createSimpleRegisterAllocator);
+
+ class VISIBILITY_HIDDEN RegAllocSimple : public MachineFunctionPass {
+ public:
+ static char ID;
+ RegAllocSimple() : MachineFunctionPass(&ID) {}
+ private:
+ MachineFunction *MF;
+ const TargetMachine *TM;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+
+ // StackSlotForVirtReg - Maps SSA Regs => frame index on the stack where
+ // these values are spilled
+ std::map<unsigned, int> StackSlotForVirtReg;
+
+ // RegsUsed - Keep track of what registers are currently in use. This is a
+ // bitset.
+ std::vector<bool> RegsUsed;
+
+ // RegClassIdx - Maps RegClass => which index we can take a register
+ // from. Since this is a simple register allocator, when we need a register
+ // of a certain class, we just take the next available one.
+ std::map<const TargetRegisterClass*, unsigned> RegClassIdx;
+
+ public:
+ virtual const char *getPassName() const {
+ return "Simple Register Allocator";
+ }
+
+ /// runOnMachineFunction - Register allocate the whole function
+ bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(PHIEliminationID); // Eliminate PHI nodes
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ private:
+ /// AllocateBasicBlock - Register allocate the specified basic block.
+ void AllocateBasicBlock(MachineBasicBlock &MBB);
+
+ /// getStackSpaceFor - This returns the offset of the specified virtual
+ /// register on the stack, allocating space if necessary.
+ int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC);
+
+ /// Given a virtual register, return a compatible physical register that is
+ /// currently unused.
+ ///
+ /// Side effect: marks that register as being used until manually cleared
+ ///
+ unsigned getFreeReg(unsigned virtualReg);
+
+ /// Moves value from memory into that register
+ unsigned reloadVirtReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, unsigned VirtReg);
+
+ /// Saves reg value on the stack (maps virtual register to stack value)
+ void spillVirtReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned VirtReg, unsigned PhysReg);
+ };
+ char RegAllocSimple::ID = 0;
+}
+
+/// getStackSpaceFor - This allocates space for the specified virtual
+/// register to be held on the stack.
+int RegAllocSimple::getStackSpaceFor(unsigned VirtReg,
+ const TargetRegisterClass *RC) {
+  // Find the location where VirtReg would belong...
+ std::map<unsigned, int>::iterator I = StackSlotForVirtReg.find(VirtReg);
+
+ if (I != StackSlotForVirtReg.end())
+ return I->second; // Already has space allocated?
+
+ // Allocate a new stack object for this spill location...
+ int FrameIdx = MF->getFrameInfo()->CreateStackObject(RC->getSize(),
+ RC->getAlignment());
+
+ // Assign the slot...
+ StackSlotForVirtReg.insert(I, std::make_pair(VirtReg, FrameIdx));
+
+ return FrameIdx;
+}
+
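+// Usage sketch (hypothetical values): the first call for a vreg creates a
+// fresh frame index, e.g. getStackSpaceFor(vreg, RC) -> FI#2, and every
+// later call for the same vreg returns FI#2 from StackSlotForVirtReg, so
+// all spills and reloads of a vreg share one slot.
+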
+unsigned RegAllocSimple::getFreeReg(unsigned virtualReg) {
+ const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtualReg);
+ TargetRegisterClass::iterator RI = RC->allocation_order_begin(*MF);
+#ifndef NDEBUG
+ TargetRegisterClass::iterator RE = RC->allocation_order_end(*MF);
+#endif
+
+ while (1) {
+ unsigned regIdx = RegClassIdx[RC]++;
+ assert(RI+regIdx != RE && "Not enough registers!");
+ unsigned PhysReg = *(RI+regIdx);
+
+ if (!RegsUsed[PhysReg]) {
+ MF->getRegInfo().setPhysRegUsed(PhysReg);
+ return PhysReg;
+ }
+ }
+}
+
+unsigned RegAllocSimple::reloadVirtReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned VirtReg) {
+ const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(VirtReg);
+ int FrameIdx = getStackSpaceFor(VirtReg, RC);
+ unsigned PhysReg = getFreeReg(VirtReg);
+
+ // Add move instruction(s)
+ ++NumLoads;
+ TII->loadRegFromStackSlot(MBB, I, PhysReg, FrameIdx, RC);
+ return PhysReg;
+}
+
+void RegAllocSimple::spillVirtReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned VirtReg, unsigned PhysReg) {
+ const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(VirtReg);
+
+ int FrameIdx = getStackSpaceFor(VirtReg, RC);
+
+ // Add move instruction(s)
+ ++NumStores;
+ TII->storeRegToStackSlot(MBB, I, PhysReg, true, FrameIdx, RC);
+}
+
+
+void RegAllocSimple::AllocateBasicBlock(MachineBasicBlock &MBB) {
+ // loop over each instruction
+ for (MachineBasicBlock::iterator MI = MBB.begin(); MI != MBB.end(); ++MI) {
+ // Made to combat the incorrect allocation of r2 = add r1, r1
+ std::map<unsigned, unsigned> Virt2PhysRegMap;
+
+ RegsUsed.resize(TRI->getNumRegs());
+
+ // This is a preliminary pass that will invalidate any registers that are
+ // used by the instruction (including implicit uses).
+ const TargetInstrDesc &Desc = MI->getDesc();
+ const unsigned *Regs;
+ if (Desc.ImplicitUses) {
+ for (Regs = Desc.ImplicitUses; *Regs; ++Regs)
+ RegsUsed[*Regs] = true;
+ }
+
+ if (Desc.ImplicitDefs) {
+ for (Regs = Desc.ImplicitDefs; *Regs; ++Regs) {
+ RegsUsed[*Regs] = true;
+ MF->getRegInfo().setPhysRegUsed(*Regs);
+ }
+ }
+
+ // Loop over uses, move from memory into registers.
+ for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
+ MachineOperand &MO = MI->getOperand(i);
+
+ if (MO.isReg() && MO.getReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned virtualReg = (unsigned) MO.getReg();
+ DOUT << "op: " << MO << "\n";
+ DOUT << "\t inst[" << i << "]: ";
+ DEBUG(MI->print(*cerr.stream(), TM));
+
+ // make sure the same virtual register maps to the same physical
+ // register in any given instruction
+ unsigned physReg = Virt2PhysRegMap[virtualReg];
+ if (physReg == 0) {
+ if (MO.isDef()) {
+ unsigned TiedOp;
+ if (!MI->isRegTiedToUseOperand(i, &TiedOp)) {
+ physReg = getFreeReg(virtualReg);
+ } else {
+            // Must be the same register number as the source operand it is
+            // tied to. This maps a = b + c into b = b + c, and saves b into
+            // a's spot.
+ assert(MI->getOperand(TiedOp).isReg() &&
+ MI->getOperand(TiedOp).getReg() &&
+ MI->getOperand(TiedOp).isUse() &&
+ "Two address instruction invalid!");
+
+ physReg = MI->getOperand(TiedOp).getReg();
+ }
+ spillVirtReg(MBB, next(MI), virtualReg, physReg);
+ } else {
+ physReg = reloadVirtReg(MBB, MI, virtualReg);
+ Virt2PhysRegMap[virtualReg] = physReg;
+ }
+ }
+ MO.setReg(physReg);
+ DOUT << "virt: " << virtualReg << ", phys: " << MO.getReg() << "\n";
+ }
+ }
+ RegClassIdx.clear();
+ RegsUsed.clear();
+ }
+}
+
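+// Illustrative example of the rewrite above (hypothetical vregs and
+// x86-style register names): for the input
+//
+//   %v2 = add %v0, %v1
+//
+// the allocator emits roughly
+//
+//   eax = load [slot(%v0)]    ; reload first use
+//   ecx = load [slot(%v1)]    ; reload second use
+//   edx = add eax, ecx        ; the def gets a fresh free register
+//   store edx -> [slot(%v2)]  ; spill the def immediately afterwards
+//
+// so no value stays in a register across instructions.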
+
+/// runOnMachineFunction - Register allocate the whole function
+///
+bool RegAllocSimple::runOnMachineFunction(MachineFunction &Fn) {
+ DOUT << "Machine Function\n";
+ MF = &Fn;
+ TM = &MF->getTarget();
+ TRI = TM->getRegisterInfo();
+ TII = TM->getInstrInfo();
+
+ // Loop over all of the basic blocks, eliminating virtual register references
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB)
+ AllocateBasicBlock(*MBB);
+
+ StackSlotForVirtReg.clear();
+ return true;
+}
+
+FunctionPass *llvm::createSimpleRegisterAllocator() {
+ return new RegAllocSimple();
+}
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
new file mode 100644
index 0000000..1131e3d
--- /dev/null
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -0,0 +1,41 @@
+//===- RegisterCoalescer.cpp - Generic Register Coalescing Interface -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the generic RegisterCoalescer interface which
+// is used as the common interface used by all clients and
+// implementations of register coalescing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+// Register the RegisterCoalescer interface, providing a nice name to refer to.
+static RegisterAnalysisGroup<RegisterCoalescer> Z("Register Coalescer");
+char RegisterCoalescer::ID = 0;
+
+// RegisterCoalescer destructor: DO NOT move this to the header file
+// for RegisterCoalescer or else clients of the RegisterCoalescer
+// class may not depend on the RegisterCoalescer.o file in the current
+// .a file, causing register coalescer support to not be included in the
+// tool correctly!
+//
+RegisterCoalescer::~RegisterCoalescer() {}
+
+// Because of the way .a files work, we must force the SimpleRC
+// implementation to be pulled in if the RegisterCoalescer classes are
+// pulled in. Otherwise we run the risk of RegisterCoalescer being
+// used, but the default implementation not being linked into the tool
+// that uses it.
+DEFINING_FILE_FOR(RegisterCoalescer)
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
new file mode 100644
index 0000000..944468e
--- /dev/null
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -0,0 +1,480 @@
+//===-- RegisterScavenging.cpp - Machine register scavenging --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine register scavenger. It can provide
+// information, such as unused registers, at any point in a machine basic block.
+// It also provides a mechanism to make registers available by evicting them to
+// spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reg-scavenging"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+/// RedefinesSuperRegPart - Return true if the specified register is redefining
+/// part of a super-register.
+static bool RedefinesSuperRegPart(const MachineInstr *MI, unsigned SubReg,
+ const TargetRegisterInfo *TRI) {
+ bool SeenSuperUse = false;
+ bool SeenSuperDef = false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (TRI->isSuperRegister(SubReg, MO.getReg())) {
+ if (MO.isUse())
+ SeenSuperUse = true;
+ else if (MO.isImplicit())
+ SeenSuperDef = true;
+ }
+ }
+
+ return SeenSuperDef && SeenSuperUse;
+}
+
+static bool RedefinesSuperRegPart(const MachineInstr *MI,
+ const MachineOperand &MO,
+ const TargetRegisterInfo *TRI) {
+ assert(MO.isReg() && MO.isDef() && "Not a register def!");
+ return RedefinesSuperRegPart(MI, MO.getReg(), TRI);
+}
+
+/// setUsed - Set the register and its sub-registers as being used.
+void RegScavenger::setUsed(unsigned Reg, bool ImpDef) {
+ RegsAvailable.reset(Reg);
+ ImplicitDefed[Reg] = ImpDef;
+
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs) {
+ RegsAvailable.reset(SubReg);
+ ImplicitDefed[SubReg] = ImpDef;
+ }
+}
+
+/// setUnused - Set the register and its sub-registers as being unused.
+void RegScavenger::setUnused(unsigned Reg, const MachineInstr *MI) {
+ RegsAvailable.set(Reg);
+ ImplicitDefed.reset(Reg);
+
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs)
+ if (!RedefinesSuperRegPart(MI, Reg, TRI)) {
+ RegsAvailable.set(SubReg);
+ ImplicitDefed.reset(SubReg);
+ }
+}
+
+void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
+ MachineFunction &MF = *mbb->getParent();
+ const TargetMachine &TM = MF.getTarget();
+ TII = TM.getInstrInfo();
+ TRI = TM.getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ assert((NumPhysRegs == 0 || NumPhysRegs == TRI->getNumRegs()) &&
+ "Target changed?");
+
+ if (!MBB) {
+ NumPhysRegs = TRI->getNumRegs();
+ RegsAvailable.resize(NumPhysRegs);
+ ImplicitDefed.resize(NumPhysRegs);
+
+ // Create reserved registers bitvector.
+ ReservedRegs = TRI->getReservedRegs(MF);
+
+ // Create callee-saved registers bitvector.
+ CalleeSavedRegs.resize(NumPhysRegs);
+ const unsigned *CSRegs = TRI->getCalleeSavedRegs();
+ if (CSRegs != NULL)
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ CalleeSavedRegs.set(CSRegs[i]);
+ }
+
+ MBB = mbb;
+ ScavengedReg = 0;
+ ScavengedRC = NULL;
+ ScavengeRestore = NULL;
+ CurrDist = 0;
+ DistanceMap.clear();
+ ImplicitDefed.reset();
+
+ // All registers started out unused.
+ RegsAvailable.set();
+
+ // Reserved registers are always used.
+ RegsAvailable ^= ReservedRegs;
+
+ // Live-in registers are in use.
+ if (!MBB->livein_empty())
+ for (MachineBasicBlock::const_livein_iterator I = MBB->livein_begin(),
+ E = MBB->livein_end(); I != E; ++I)
+ setUsed(*I);
+
+ Tracking = false;
+}
+
+void RegScavenger::restoreScavengedReg() {
+ TII->loadRegFromStackSlot(*MBB, MBBI, ScavengedReg,
+ ScavengingFrameIndex, ScavengedRC);
+ MachineBasicBlock::iterator II = prior(MBBI);
+ TRI->eliminateFrameIndex(II, 0, this);
+ setUsed(ScavengedReg);
+ ScavengedReg = 0;
+ ScavengedRC = NULL;
+}
+
+#ifndef NDEBUG
+/// isLiveInButUnusedBefore - Return true if the register is live-in to the
+/// MBB and is not used before it reaches the MI that defines the register.
+static bool isLiveInButUnusedBefore(unsigned Reg, MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ const TargetRegisterInfo *TRI,
+ MachineRegisterInfo* MRI) {
+ // First check if register is livein.
+ bool isLiveIn = false;
+ for (MachineBasicBlock::const_livein_iterator I = MBB->livein_begin(),
+ E = MBB->livein_end(); I != E; ++I)
+ if (Reg == *I || TRI->isSuperRegister(Reg, *I)) {
+ isLiveIn = true;
+ break;
+ }
+ if (!isLiveIn)
+ return false;
+
+ // Is there any use of it before the specified MI?
+ SmallPtrSet<MachineInstr*, 4> UsesInMBB;
+ for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
+ UE = MRI->use_end(); UI != UE; ++UI) {
+ MachineInstr *UseMI = &*UI;
+ if (UseMI->getParent() == MBB)
+ UsesInMBB.insert(UseMI);
+ }
+ if (UsesInMBB.empty())
+ return true;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MI; I != E; ++I)
+ if (UsesInMBB.count(&*I))
+ return false;
+ return true;
+}
+#endif
+
+void RegScavenger::forward() {
+ // Move ptr forward.
+ if (!Tracking) {
+ MBBI = MBB->begin();
+ Tracking = true;
+ } else {
+ assert(MBBI != MBB->end() && "Already at the end of the basic block!");
+ MBBI = next(MBBI);
+ }
+
+ MachineInstr *MI = MBBI;
+ DistanceMap.insert(std::make_pair(MI, CurrDist++));
+
+ if (MI == ScavengeRestore) {
+ ScavengedReg = 0;
+ ScavengedRC = NULL;
+ ScavengeRestore = NULL;
+ }
+
+ bool IsImpDef = MI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF;
+
+ // Separate register operands into 3 classes: uses, defs, earlyclobbers.
+ SmallVector<std::pair<const MachineOperand*,unsigned>, 4> UseMOs;
+ SmallVector<std::pair<const MachineOperand*,unsigned>, 4> DefMOs;
+ SmallVector<std::pair<const MachineOperand*,unsigned>, 4> EarlyClobberMOs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ if (MO.isUse())
+ UseMOs.push_back(std::make_pair(&MO,i));
+ else if (MO.isEarlyClobber())
+ EarlyClobberMOs.push_back(std::make_pair(&MO,i));
+ else
+ DefMOs.push_back(std::make_pair(&MO,i));
+ }
+
+ // Process uses first.
+ BitVector UseRegs(NumPhysRegs);
+ for (unsigned i = 0, e = UseMOs.size(); i != e; ++i) {
+ const MachineOperand MO = *UseMOs[i].first;
+ unsigned Reg = MO.getReg();
+
+ assert(isUsed(Reg) && "Using an undefined register!");
+
+ if (MO.isKill() && !isReserved(Reg)) {
+ UseRegs.set(Reg);
+
+ // Mark sub-registers as used.
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs)
+ UseRegs.set(SubReg);
+ }
+ }
+
+ // Change states of all registers after all the uses are processed to guard
+ // against multiple uses.
+ setUnused(UseRegs);
+
+  // Process early clobber defs, then process defs. We can have an early
+  // clobber that is dead; it should not conflict with a def that happens one
+  // "slot" (see InstrSlots in LiveIntervalAnalysis.h) later.
+ unsigned NumECs = EarlyClobberMOs.size();
+ unsigned NumDefs = DefMOs.size();
+
+ for (unsigned i = 0, e = NumECs + NumDefs; i != e; ++i) {
+ const MachineOperand &MO = (i < NumECs)
+ ? *EarlyClobberMOs[i].first : *DefMOs[i-NumECs].first;
+ unsigned Idx = (i < NumECs)
+ ? EarlyClobberMOs[i].second : DefMOs[i-NumECs].second;
+ unsigned Reg = MO.getReg();
+
+ // If it's dead upon def, then it is now free.
+ if (MO.isDead()) {
+ setUnused(Reg, MI);
+ continue;
+ }
+
+ // Skip two-address destination operand.
+ if (MI->isRegTiedToUseOperand(Idx)) {
+ assert(isUsed(Reg) && "Using an undefined register!");
+ continue;
+ }
+
+ // Skip if this is merely redefining part of a super-register.
+ if (RedefinesSuperRegPart(MI, MO, TRI))
+ continue;
+
+ // Implicit def is allowed to "re-define" any register. Similarly,
+ // implicitly defined registers can be clobbered.
+ assert((isReserved(Reg) || isUnused(Reg) ||
+ IsImpDef || isImplicitlyDefined(Reg) ||
+ isLiveInButUnusedBefore(Reg, MI, MBB, TRI, MRI)) &&
+ "Re-defining a live register!");
+ setUsed(Reg, IsImpDef);
+ }
+}
+
+void RegScavenger::backward() {
+ assert(Tracking && "Not tracking states!");
+ assert(MBBI != MBB->begin() && "Already at start of basic block!");
+ // Move ptr backward.
+ MBBI = prior(MBBI);
+
+ MachineInstr *MI = MBBI;
+ DistanceMap.erase(MI);
+ --CurrDist;
+
+ // Separate register operands into 3 classes: uses, defs, earlyclobbers.
+ SmallVector<std::pair<const MachineOperand*,unsigned>, 4> UseMOs;
+ SmallVector<std::pair<const MachineOperand*,unsigned>, 4> DefMOs;
+ SmallVector<std::pair<const MachineOperand*,unsigned>, 4> EarlyClobberMOs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ if (MO.isUse())
+ UseMOs.push_back(std::make_pair(&MO,i));
+ else if (MO.isEarlyClobber())
+ EarlyClobberMOs.push_back(std::make_pair(&MO,i));
+ else
+ DefMOs.push_back(std::make_pair(&MO,i));
+ }
+
+
+ // Process defs first.
+ unsigned NumECs = EarlyClobberMOs.size();
+ unsigned NumDefs = DefMOs.size();
+ for (unsigned i = 0, e = NumECs + NumDefs; i != e; ++i) {
+ const MachineOperand &MO = (i < NumDefs)
+ ? *DefMOs[i].first : *EarlyClobberMOs[i-NumDefs].first;
+    unsigned Idx = (i < NumDefs)
+ ? DefMOs[i].second : EarlyClobberMOs[i-NumDefs].second;
+
+ // Skip two-address destination operand.
+ if (MI->isRegTiedToUseOperand(Idx))
+ continue;
+
+ unsigned Reg = MO.getReg();
+ assert(isUsed(Reg));
+ if (!isReserved(Reg))
+ setUnused(Reg, MI);
+ }
+
+ // Process uses.
+ BitVector UseRegs(NumPhysRegs);
+ for (unsigned i = 0, e = UseMOs.size(); i != e; ++i) {
+ const MachineOperand MO = *UseMOs[i].first;
+ unsigned Reg = MO.getReg();
+ assert(isUnused(Reg) || isReserved(Reg));
+ UseRegs.set(Reg);
+
+ // Set the sub-registers as "used".
+ for (const unsigned *SubRegs = TRI->getSubRegisters(Reg);
+ unsigned SubReg = *SubRegs; ++SubRegs)
+ UseRegs.set(SubReg);
+ }
+ setUsed(UseRegs);
+}
+
+void RegScavenger::getRegsUsed(BitVector &used, bool includeReserved) {
+ if (includeReserved)
+ used = ~RegsAvailable;
+ else
+ used = ~RegsAvailable & ~ReservedRegs;
+}
+
+/// CreateRegClassMask - Set the bits that represent the registers in the
+/// TargetRegisterClass.
+static void CreateRegClassMask(const TargetRegisterClass *RC, BitVector &Mask) {
+ for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E;
+ ++I)
+ Mask.set(*I);
+}
+
+unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RegClass,
+ const BitVector &Candidates) const {
+ // Mask off the registers which are not in the TargetRegisterClass.
+ BitVector RegsAvailableCopy(NumPhysRegs, false);
+ CreateRegClassMask(RegClass, RegsAvailableCopy);
+ RegsAvailableCopy &= RegsAvailable;
+
+ // Restrict the search to candidates.
+ RegsAvailableCopy &= Candidates;
+
+  // Returns the first unused (bit is set) register, or 0 if none is found.
+ int Reg = RegsAvailableCopy.find_first();
+ return (Reg == -1) ? 0 : Reg;
+}
+
+unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RegClass,
+ bool ExCalleeSaved) const {
+ // Mask off the registers which are not in the TargetRegisterClass.
+ BitVector RegsAvailableCopy(NumPhysRegs, false);
+ CreateRegClassMask(RegClass, RegsAvailableCopy);
+ RegsAvailableCopy &= RegsAvailable;
+
+ // If looking for a non-callee-saved register, mask off all the callee-saved
+ // registers.
+ if (ExCalleeSaved)
+ RegsAvailableCopy &= ~CalleeSavedRegs;
+
+  // Returns the first unused (bit is set) register, or 0 if none is found.
+ int Reg = RegsAvailableCopy.find_first();
+ return (Reg == -1) ? 0 : Reg;
+}
+
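+// Bit-twiddling sketch (illustrative): with NumPhysRegs = 8, a class mask of
+// 0b00111100 and RegsAvailable = 0b00101010, the candidate set is their AND,
+// 0b00101000, and find_first() returns the lowest set bit - register 3.
+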
+/// findFirstUse - Find the first use of the specified register below the
+/// current position in the basic block, returning the use instruction and
+/// its distance in Dist.
+MachineInstr*
+RegScavenger::findFirstUse(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, unsigned Reg,
+ unsigned &Dist) {
+ MachineInstr *UseMI = 0;
+ Dist = ~0U;
+ for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(Reg),
+ RE = MRI->reg_end(); RI != RE; ++RI) {
+ MachineInstr *UDMI = &*RI;
+ if (UDMI->getParent() != MBB)
+ continue;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UDMI);
+ if (DI == DistanceMap.end()) {
+      // If it's not in the map, it's below the current MI; extend the map
+      // to cover the rest of the block.
+ I = next(I);
+      unsigned MapDist = CurrDist + 1;
+      while (I != MBB->end()) {
+        DistanceMap.insert(std::make_pair(I, MapDist++));
+ I = next(I);
+ }
+ }
+ DI = DistanceMap.find(UDMI);
+ if (DI->second > CurrDist && DI->second < Dist) {
+ Dist = DI->second;
+ UseMI = UDMI;
+ }
+ }
+ return UseMI;
+}
+
+unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
+ MachineBasicBlock::iterator I,
+ int SPAdj) {
+ assert(ScavengingFrameIndex >= 0 &&
+ "Cannot scavenge a register without an emergency spill slot!");
+
+ // Mask off the registers which are not in the TargetRegisterClass.
+ BitVector Candidates(NumPhysRegs, false);
+ CreateRegClassMask(RC, Candidates);
+ Candidates ^= ReservedRegs; // Do not include reserved registers.
+
+ // Exclude all the registers being used by the instruction.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = I->getOperand(i);
+ if (MO.isReg())
+ Candidates.reset(MO.getReg());
+ }
+
+ // Find the register whose use is furthest away.
+ unsigned SReg = 0;
+ unsigned MaxDist = 0;
+ MachineInstr *MaxUseMI = 0;
+ int Reg = Candidates.find_first();
+ while (Reg != -1) {
+ unsigned Dist;
+ MachineInstr *UseMI = findFirstUse(MBB, I, Reg, Dist);
+ for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
+ unsigned AsDist;
+ MachineInstr *AsUseMI = findFirstUse(MBB, I, *AS, AsDist);
+ if (AsDist < Dist) {
+ Dist = AsDist;
+ UseMI = AsUseMI;
+ }
+ }
+ if (Dist >= MaxDist) {
+ MaxDist = Dist;
+ MaxUseMI = UseMI;
+ SReg = Reg;
+ }
+ Reg = Candidates.find_next(Reg);
+ }
+
+ if (ScavengedReg != 0) {
+ assert(0 && "Scavenger slot is live, unable to scavenge another register!");
+ abort();
+ }
+
+ // Spill the scavenged register before I.
+ TII->storeRegToStackSlot(*MBB, I, SReg, true, ScavengingFrameIndex, RC);
+ MachineBasicBlock::iterator II = prior(I);
+ TRI->eliminateFrameIndex(II, SPAdj, this);
+
+ // Restore the scavenged register before its use (or first terminator).
+ II = MaxUseMI
+ ? MachineBasicBlock::iterator(MaxUseMI) : MBB->getFirstTerminator();
+ TII->loadRegFromStackSlot(*MBB, II, SReg, ScavengingFrameIndex, RC);
+ ScavengeRestore = prior(II);
+ ScavengedReg = SReg;
+ ScavengedRC = RC;
+
+ return SReg;
+}
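+
+// Illustrative timeline of a scavenge (hypothetical register R and the
+// emergency slot FI):
+//
+//   store R -> [FI]   ; emitted before I: evict the register whose next
+//                     ; use is furthest away
+//   ...               ; I and the following instructions may use R freely
+//   R = load [FI]     ; ScavengeRestore: reinserted just before R's next
+//                     ; use (or the first terminator)
+//
+// Only one register may be scavenged at a time; the assert above fires if a
+// second scavenge is attempted while the slot is still live.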
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
new file mode 100644
index 0000000..a8452df
--- /dev/null
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -0,0 +1,572 @@
+//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG class, which is a base class used by
+// scheduling implementation classes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include <climits>
+using namespace llvm;
+
+ScheduleDAG::ScheduleDAG(MachineFunction &mf)
+ : TM(mf.getTarget()),
+ TII(TM.getInstrInfo()),
+ TRI(TM.getRegisterInfo()),
+ TLI(TM.getTargetLowering()),
+ MF(mf), MRI(mf.getRegInfo()),
+ ConstPool(MF.getConstantPool()),
+ EntrySU(), ExitSU() {
+}
+
+ScheduleDAG::~ScheduleDAG() {}
+
+/// dump - dump the schedule.
+void ScheduleDAG::dumpSchedule() const {
+ for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+ if (SUnit *SU = Sequence[i])
+ SU->dump(this);
+ else
+ cerr << "**** NOOP ****\n";
+ }
+}
+
+
+/// Run - perform scheduling.
+///
+void ScheduleDAG::Run(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator insertPos) {
+ BB = bb;
+ InsertPos = insertPos;
+
+ SUnits.clear();
+ Sequence.clear();
+ EntrySU = SUnit();
+ ExitSU = SUnit();
+
+ Schedule();
+
+ DOUT << "*** Final schedule ***\n";
+ DEBUG(dumpSchedule());
+ DOUT << "\n";
+}
+
+/// addPred - This adds the specified edge as a pred of the current node if
+/// it is not already present. It also adds the current node as a successor
+/// of the specified node.
+void SUnit::addPred(const SDep &D) {
+  // If this node already has this dependence, don't add a redundant one.
+ for (SmallVector<SDep, 4>::const_iterator I = Preds.begin(), E = Preds.end();
+ I != E; ++I)
+ if (*I == D)
+ return;
+ // Now add a corresponding succ to N.
+ SDep P = D;
+ P.setSUnit(this);
+ SUnit *N = D.getSUnit();
+ // Update the bookkeeping.
+ if (D.getKind() == SDep::Data) {
+ ++NumPreds;
+ ++N->NumSuccs;
+ }
+ if (!N->isScheduled)
+ ++NumPredsLeft;
+ if (!isScheduled)
+ ++N->NumSuccsLeft;
+ Preds.push_back(D);
+ N->Succs.push_back(P);
+ if (P.getLatency() != 0) {
+ this->setDepthDirty();
+ N->setHeightDirty();
+ }
+}
+
+/// removePred - This removes the specified edge as a pred of the current
+/// node if it exists. It also removes the current node as a successor of
+/// the specified node.
+void SUnit::removePred(const SDep &D) {
+ // Find the matching predecessor.
+ for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
+ I != E; ++I)
+ if (*I == D) {
+ bool FoundSucc = false;
+ // Find the corresponding successor in N.
+ SDep P = D;
+ P.setSUnit(this);
+ SUnit *N = D.getSUnit();
+ for (SmallVector<SDep, 4>::iterator II = N->Succs.begin(),
+ EE = N->Succs.end(); II != EE; ++II)
+ if (*II == P) {
+ FoundSucc = true;
+ N->Succs.erase(II);
+ break;
+ }
+ assert(FoundSucc && "Mismatching preds / succs lists!");
+ Preds.erase(I);
+ // Update the bookkeeping.
+ if (P.getKind() == SDep::Data) {
+ --NumPreds;
+ --N->NumSuccs;
+ }
+ if (!N->isScheduled)
+ --NumPredsLeft;
+ if (!isScheduled)
+ --N->NumSuccsLeft;
+ if (P.getLatency() != 0) {
+ this->setDepthDirty();
+ N->setHeightDirty();
+ }
+ return;
+ }
+}
+
+void SUnit::setDepthDirty() {
+ if (!isDepthCurrent) return;
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *SU = WorkList.pop_back_val();
+ SU->isDepthCurrent = false;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+ E = SU->Succs.end(); I != E; ++I) {
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isDepthCurrent)
+ WorkList.push_back(SuccSU);
+ }
+ } while (!WorkList.empty());
+}
+
+void SUnit::setHeightDirty() {
+ if (!isHeightCurrent) return;
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *SU = WorkList.pop_back_val();
+ SU->isHeightCurrent = false;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(),
+ E = SU->Preds.end(); I != E; ++I) {
+ SUnit *PredSU = I->getSUnit();
+ if (PredSU->isHeightCurrent)
+ WorkList.push_back(PredSU);
+ }
+ } while (!WorkList.empty());
+}
+
+/// setDepthToAtLeast - If NewDepth is greater than this node's current depth,
+/// set the depth to NewDepth and invalidate the cached depths of this node's
+/// successors.
+///
+void SUnit::setDepthToAtLeast(unsigned NewDepth) {
+ if (NewDepth <= getDepth())
+ return;
+ setDepthDirty();
+ Depth = NewDepth;
+ isDepthCurrent = true;
+}
+
+/// setHeightToAtLeast - If NewHeight is greater than this node's current
+/// height, set the height to NewHeight and invalidate the cached heights of
+/// this node's predecessors.
+///
+void SUnit::setHeightToAtLeast(unsigned NewHeight) {
+ if (NewHeight <= getHeight())
+ return;
+ setHeightDirty();
+ Height = NewHeight;
+ isHeightCurrent = true;
+}
+
+/// ComputeDepth - Calculate the maximal path from the entry to the node.
+///
+void SUnit::ComputeDepth() {
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *Cur = WorkList.back();
+
+ bool Done = true;
+ unsigned MaxPredDepth = 0;
+ for (SUnit::const_pred_iterator I = Cur->Preds.begin(),
+ E = Cur->Preds.end(); I != E; ++I) {
+ SUnit *PredSU = I->getSUnit();
+ if (PredSU->isDepthCurrent)
+ MaxPredDepth = std::max(MaxPredDepth,
+ PredSU->Depth + I->getLatency());
+ else {
+ Done = false;
+ WorkList.push_back(PredSU);
+ }
+ }
+
+ if (Done) {
+ WorkList.pop_back();
+ if (MaxPredDepth != Cur->Depth) {
+ Cur->setDepthDirty();
+ Cur->Depth = MaxPredDepth;
+ }
+ Cur->isDepthCurrent = true;
+ }
+ } while (!WorkList.empty());
+}
+
+/// ComputeHeight - Calculate the maximal path from the node to the exit.
+///
+void SUnit::ComputeHeight() {
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *Cur = WorkList.back();
+
+ bool Done = true;
+ unsigned MaxSuccHeight = 0;
+ for (SUnit::const_succ_iterator I = Cur->Succs.begin(),
+ E = Cur->Succs.end(); I != E; ++I) {
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isHeightCurrent)
+ MaxSuccHeight = std::max(MaxSuccHeight,
+ SuccSU->Height + I->getLatency());
+ else {
+ Done = false;
+ WorkList.push_back(SuccSU);
+ }
+ }
+
+ if (Done) {
+ WorkList.pop_back();
+ if (MaxSuccHeight != Cur->Height) {
+ Cur->setHeightDirty();
+ Cur->Height = MaxSuccHeight;
+ }
+ Cur->isHeightCurrent = true;
+ }
+ } while (!WorkList.empty());
+}
+
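+// Worked example (illustrative): for the chain A -> B -> C with unit
+// latencies, depth is the longest path from the entry and height the
+// longest path to the exit, so:
+//
+//   node:    A   B   C
+//   depth:   0   1   2
+//   height:  2   1   0
+//
+// Both routines above compute these lazily with an explicit worklist to
+// avoid deep recursion on large DAGs.
+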
+/// dump - Print this scheduling unit. An SUnit is a wrapper around either a
+/// single SDNode or a group of nodes flagged together.
+void SUnit::dump(const ScheduleDAG *G) const {
+ cerr << "SU(" << NodeNum << "): ";
+ G->dumpNode(this);
+}
+
+void SUnit::dumpAll(const ScheduleDAG *G) const {
+ dump(G);
+
+ cerr << " # preds left : " << NumPredsLeft << "\n";
+ cerr << " # succs left : " << NumSuccsLeft << "\n";
+ cerr << " Latency : " << Latency << "\n";
+ cerr << " Depth : " << Depth << "\n";
+ cerr << " Height : " << Height << "\n";
+
+ if (Preds.size() != 0) {
+ cerr << " Predecessors:\n";
+    for (SUnit::const_pred_iterator I = Preds.begin(), E = Preds.end();
+ I != E; ++I) {
+ cerr << " ";
+ switch (I->getKind()) {
+ case SDep::Data: cerr << "val "; break;
+ case SDep::Anti: cerr << "anti"; break;
+ case SDep::Output: cerr << "out "; break;
+ case SDep::Order: cerr << "ch "; break;
+ }
+ cerr << "#";
+ cerr << I->getSUnit() << " - SU(" << I->getSUnit()->NodeNum << ")";
+ if (I->isArtificial())
+ cerr << " *";
+ cerr << "\n";
+ }
+ }
+ if (Succs.size() != 0) {
+ cerr << " Successors:\n";
+ for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end();
+ I != E; ++I) {
+ cerr << " ";
+ switch (I->getKind()) {
+ case SDep::Data: cerr << "val "; break;
+ case SDep::Anti: cerr << "anti"; break;
+ case SDep::Output: cerr << "out "; break;
+ case SDep::Order: cerr << "ch "; break;
+ }
+ cerr << "#";
+ cerr << I->getSUnit() << " - SU(" << I->getSUnit()->NodeNum << ")";
+ if (I->isArtificial())
+ cerr << " *";
+ cerr << "\n";
+ }
+ }
+ cerr << "\n";
+}
+
+#ifndef NDEBUG
+/// VerifySchedule - Verify that all SUnits were scheduled and that
+/// their state is consistent.
+///
+void ScheduleDAG::VerifySchedule(bool isBottomUp) {
+ bool AnyNotSched = false;
+ unsigned DeadNodes = 0;
+ unsigned Noops = 0;
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ if (!SUnits[i].isScheduled) {
+ if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) {
+ ++DeadNodes;
+ continue;
+ }
+ if (!AnyNotSched)
+ cerr << "*** Scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ cerr << "has not been scheduled!\n";
+ AnyNotSched = true;
+ }
+ if (SUnits[i].isScheduled &&
+        (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) >
+ unsigned(INT_MAX)) {
+ if (!AnyNotSched)
+ cerr << "*** Scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ cerr << "has an unexpected "
+ << (isBottomUp ? "Height" : "Depth") << " value!\n";
+ AnyNotSched = true;
+ }
+ if (isBottomUp) {
+ if (SUnits[i].NumSuccsLeft != 0) {
+ if (!AnyNotSched)
+ cerr << "*** Scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ cerr << "has successors left!\n";
+ AnyNotSched = true;
+ }
+ } else {
+ if (SUnits[i].NumPredsLeft != 0) {
+ if (!AnyNotSched)
+ cerr << "*** Scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ cerr << "has predecessors left!\n";
+ AnyNotSched = true;
+ }
+ }
+ }
+ for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
+ if (!Sequence[i])
+ ++Noops;
+ assert(!AnyNotSched);
+ assert(Sequence.size() + DeadNodes - Noops == SUnits.size() &&
+ "The number of nodes scheduled doesn't match the expected number!");
+}
+#endif
+
+/// InitDAGTopologicalSorting - create the initial topological
+/// ordering from the DAG to be scheduled.
+///
+/// The idea of the algorithm is taken from
+/// "Online algorithms for managing the topological order of
+/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
+/// This is the MNR algorithm, which was first introduced by
+/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+/// "Maintaining a topological order under edge insertions".
+///
+/// Short description of the algorithm:
+///
+/// Topological ordering, ord, of a DAG maps each node to a topological
+/// index so that for all edges X->Y it is the case that ord(X) < ord(Y).
+///
+/// This means that if there is a path from the node X to the node Z,
+/// then ord(X) < ord(Z).
+///
+/// This property can be used to check for reachability of nodes:
+/// if Z is reachable from X, then an insertion of the edge Z->X would
+/// create a cycle.
+///
+/// The algorithm first computes a topological ordering for the DAG by
+/// initializing the Index2Node and Node2Index arrays and then tries to keep
+/// the ordering up-to-date after edge insertions by reordering the DAG.
+///
+/// On insertion of the edge X->Y, the algorithm first marks by calling DFS
+/// the nodes reachable from Y, and then shifts them using Shift to lie
+/// immediately after X in Index2Node.
+void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
+ unsigned DAGSize = SUnits.size();
+ std::vector<SUnit*> WorkList;
+ WorkList.reserve(DAGSize);
+
+ Index2Node.resize(DAGSize);
+ Node2Index.resize(DAGSize);
+
+ // Initialize the data structures.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ int NodeNum = SU->NodeNum;
+ unsigned Degree = SU->Succs.size();
+ // Temporarily use the Node2Index array as scratch space for degree counts.
+ Node2Index[NodeNum] = Degree;
+
+ // Is it a node without dependencies?
+ if (Degree == 0) {
+ assert(SU->Succs.empty() && "SUnit should have no successors");
+ // Collect leaf nodes.
+ WorkList.push_back(SU);
+ }
+ }
+
+ int Id = DAGSize;
+ while (!WorkList.empty()) {
+ SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ Allocate(SU->NodeNum, --Id);
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ SUnit *SU = I->getSUnit();
+ if (!--Node2Index[SU->NodeNum])
+ // If all dependencies of the node are processed already,
+ // then the node can be computed now.
+ WorkList.push_back(SU);
+ }
+ }
+
+ Visited.resize(DAGSize);
+
+#ifndef NDEBUG
+ // Check correctness of the ordering
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] &&
+ "Wrong topological sorting");
+ }
+ }
+#endif
+}
+
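+// Illustrative example: with edges A -> B and A -> C (so B and C have no
+// successors), the worklist sweep above hands out the largest indexes to
+// the sinks first, e.g. Node2Index = { A:0, B:1, C:2 }, and every edge
+// X -> Y then satisfies ord(X) < ord(Y) as required.
+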
+/// AddPred - Updates the topological ordering to accommodate an edge
+/// to be added from SUnit X to SUnit Y.
+void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
+ int UpperBound, LowerBound;
+ LowerBound = Node2Index[Y->NodeNum];
+ UpperBound = Node2Index[X->NodeNum];
+ bool HasLoop = false;
+ // Is Ord(X) < Ord(Y) ?
+ if (LowerBound < UpperBound) {
+ // Update the topological order.
+ Visited.reset();
+ DFS(Y, UpperBound, HasLoop);
+ assert(!HasLoop && "Inserted edge creates a loop!");
+ // Recompute topological indexes.
+ Shift(Visited, LowerBound, UpperBound);
+ }
+}
+
+/// RemovePred - Updates the topological ordering to accommodate an
+/// edge to be removed from the specified node N from the predecessors
+/// of the current node M.
+void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
+ // InitDAGTopologicalSorting();
+}
+
+/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark
+/// all nodes affected by the edge insertion. These nodes will later get new
+/// topological indexes by means of the Shift method.
+void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
+ bool& HasLoop) {
+ std::vector<const SUnit*> WorkList;
+ WorkList.reserve(SUnits.size());
+
+ WorkList.push_back(SU);
+ do {
+ SU = WorkList.back();
+ WorkList.pop_back();
+ Visited.set(SU->NodeNum);
+ for (int I = SU->Succs.size()-1; I >= 0; --I) {
+ int s = SU->Succs[I].getSUnit()->NodeNum;
+ if (Node2Index[s] == UpperBound) {
+ HasLoop = true;
+ return;
+ }
+      // Visit successors if not already visited and in the affected region.
+ if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+ WorkList.push_back(SU->Succs[I].getSUnit());
+ }
+ }
+ } while (!WorkList.empty());
+}
+
+/// Shift - Renumber the nodes so that the topological ordering is
+/// preserved.
+void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
+ int UpperBound) {
+ std::vector<int> L;
+ int shift = 0;
+ int i;
+
+ for (i = LowerBound; i <= UpperBound; ++i) {
+ // w is node at topological index i.
+ int w = Index2Node[i];
+ if (Visited.test(w)) {
+ // Unmark.
+ Visited.reset(w);
+ L.push_back(w);
+ shift = shift + 1;
+ } else {
+ Allocate(w, i - shift);
+ }
+ }
+
+ for (unsigned j = 0; j < L.size(); ++j) {
+ Allocate(L[j], i - shift);
+ i = i + 1;
+ }
+}
+
+
+/// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
+/// create a cycle.
+bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
+ if (IsReachable(TargetSU, SU))
+ return true;
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I)
+ if (I->isAssignedRegDep() &&
+ IsReachable(TargetSU, I->getSUnit()))
+ return true;
+ return false;
+}
+
+/// IsReachable - Checks if SU is reachable from TargetSU.
+bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
+ const SUnit *TargetSU) {
+ // If insertion of the edge SU->TargetSU would create a cycle
+ // then there is a path from TargetSU to SU.
+ int UpperBound, LowerBound;
+ LowerBound = Node2Index[TargetSU->NodeNum];
+ UpperBound = Node2Index[SU->NodeNum];
+ bool HasLoop = false;
+ // Is Ord(TargetSU) < Ord(SU) ?
+ if (LowerBound < UpperBound) {
+ Visited.reset();
+ // There may be a path from TargetSU to SU. Check for it.
+ DFS(TargetSU, UpperBound, HasLoop);
+ }
+ return HasLoop;
+}
+
+/// Allocate - assign the topological index to the node n.
+void ScheduleDAGTopologicalSort::Allocate(int n, int index) {
+ Node2Index[n] = index;
+ Index2Node[index] = n;
+}
+
+ScheduleDAGTopologicalSort::ScheduleDAGTopologicalSort(
+ std::vector<SUnit> &sunits)
+ : SUnits(sunits) {}
+
+ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {}
diff --git a/lib/CodeGen/ScheduleDAGEmit.cpp b/lib/CodeGen/ScheduleDAGEmit.cpp
new file mode 100644
index 0000000..770f5bb
--- /dev/null
+++ b/lib/CodeGen/ScheduleDAGEmit.cpp
@@ -0,0 +1,71 @@
+//===---- ScheduleDAGEmit.cpp - Emit routines for the ScheduleDAG class ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the Emit routines for the ScheduleDAG class, which creates
+// MachineInstrs according to the computed schedule.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+void ScheduleDAG::AddMemOperand(MachineInstr *MI, const MachineMemOperand &MO) {
+ MI->addMemOperand(MF, MO);
+}
+
+void ScheduleDAG::EmitNoop() {
+ TII->insertNoop(*BB, InsertPos);
+}
+
+void ScheduleDAG::EmitPhysRegCopy(SUnit *SU,
+ DenseMap<SUnit*, unsigned> &VRBaseMap) {
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue; // ignore chain preds
+ if (I->getSUnit()->CopyDstRC) {
+ // Copy to physical register.
+ DenseMap<SUnit*, unsigned>::iterator VRI = VRBaseMap.find(I->getSUnit());
+ assert(VRI != VRBaseMap.end() && "Node emitted out of order - late");
+ // Find the destination physical register.
+ unsigned Reg = 0;
+ for (SUnit::const_succ_iterator II = SU->Succs.begin(),
+ EE = SU->Succs.end(); II != EE; ++II) {
+ if (II->getReg()) {
+ Reg = II->getReg();
+ break;
+ }
+ }
+ TII->copyRegToReg(*BB, InsertPos, Reg, VRI->second,
+ SU->CopyDstRC, SU->CopySrcRC);
+ } else {
+ // Copy from physical register.
+ assert(I->getReg() && "Unknown physical register!");
+ unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
+ bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
+ isNew = isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ TII->copyRegToReg(*BB, InsertPos, VRBase, I->getReg(),
+ SU->CopyDstRC, SU->CopySrcRC);
+ }
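+ // Only the first non-chain predecessor is relevant for a copy node, so
+ // stop after handling it.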
+ break;
+ }
+}
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
new file mode 100644
index 0000000..8e18b3d
--- /dev/null
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -0,0 +1,468 @@
+//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAGInstrs class, which implements re-scheduling
+// of MachineInstrs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sched-instrs"
+#include "ScheduleDAGInstrs.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallSet.h"
+using namespace llvm;
+
+ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
+ const MachineLoopInfo &mli,
+ const MachineDominatorTree &mdt)
+ : ScheduleDAG(mf), MLI(mli), MDT(mdt), LoopRegs(MLI, MDT) {}
+
+/// Run - perform scheduling.
+///
+void ScheduleDAGInstrs::Run(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned endcount) {
+ BB = bb;
+ Begin = begin;
+ InsertPosIndex = endcount;
+
+ ScheduleDAG::Run(bb, end);
+}
+
+/// getOpcode - If this is an Instruction or a ConstantExpr, return the
+/// opcode value. Otherwise return UserOp1.
+static unsigned getOpcode(const Value *V) {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return I->getOpcode();
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ return CE->getOpcode();
+ // Use UserOp1 to mean there's no opcode.
+ return Instruction::UserOp1;
+}
+
+/// getUnderlyingObjectFromInt - This is the function that does the work of
+/// looking through basic ptrtoint+arithmetic+inttoptr sequences.
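+/// For example, given the integer value (add (ptrtoint %obj), %c), this
+/// walks through the add and hands %obj back to the caller for further
+/// pointer stripping.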
+static const Value *getUnderlyingObjectFromInt(const Value *V) {
+ do {
+ if (const User *U = dyn_cast<User>(V)) {
+ // If we find a ptrtoint, we can transfer control back to the
+ // regular getUnderlyingObject.
+ if (getOpcode(U) == Instruction::PtrToInt)
+ return U->getOperand(0);
+ // If we find an add of a constant or a multiplied value, it's
+ // likely that the other operand will lead us to the base
+ // object. We don't have to worry about the case where the
+ // object address is somehow being computed by the multiply,
+ // because our callers only care when the result is an
+ // identifiable object.
+ if (getOpcode(U) != Instruction::Add ||
+ (!isa<ConstantInt>(U->getOperand(1)) &&
+ getOpcode(U->getOperand(1)) != Instruction::Mul))
+ return V;
+ V = U->getOperand(0);
+ } else {
+ return V;
+ }
+ assert(isa<IntegerType>(V->getType()) && "Unexpected operand type!");
+ } while (1);
+}
+
+/// getUnderlyingObject - This is a wrapper around Value::getUnderlyingObject
+/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences.
+static const Value *getUnderlyingObject(const Value *V) {
+ // First just call Value::getUnderlyingObject to let it do what it does.
+ do {
+ V = V->getUnderlyingObject();
+ // If it found an inttoptr, use special code to continue climbing.
+ if (getOpcode(V) != Instruction::IntToPtr)
+ break;
+ const Value *O = getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
+ // If that succeeded in finding a pointer, continue the search.
+ if (!isa<PointerType>(O->getType()))
+ break;
+ V = O;
+ } while (1);
+ return V;
+}
+
+/// getUnderlyingObjectForInstr - If this machine instr has memory reference
+/// information and it can be tracked to a normal reference to a known
+/// object, return the Value for that object. Otherwise return null.
+static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI) {
+ if (!MI->hasOneMemOperand() ||
+ !MI->memoperands_begin()->getValue() ||
+ MI->memoperands_begin()->isVolatile())
+ return 0;
+
+ const Value *V = MI->memoperands_begin()->getValue();
+ if (!V)
+ return 0;
+
+ V = getUnderlyingObject(V);
+ if (!isa<PseudoSourceValue>(V) && !isIdentifiedObject(V))
+ return 0;
+
+ return V;
+}
+
+void ScheduleDAGInstrs::StartBlock(MachineBasicBlock *BB) {
+ if (MachineLoop *ML = MLI.getLoopFor(BB))
+ if (BB == ML->getLoopLatch()) {
+ MachineBasicBlock *Header = ML->getHeader();
+ for (MachineBasicBlock::livein_iterator I = Header->livein_begin(),
+ E = Header->livein_end(); I != E; ++I)
+ LoopLiveInRegs.insert(*I);
+ LoopRegs.VisitLoop(ML);
+ }
+}
+
+void ScheduleDAGInstrs::BuildSchedGraph() {
+ // We'll be allocating one SUnit for each instruction, plus one for
+ // the region exit node.
+ SUnits.reserve(BB->size());
+
+ // We build scheduling units by walking a block's instruction list from bottom
+ // to top.
+
+ // Remember where a generic side-effecting instruction is as we proceed.
+ // If ChainMMO is null, Chain is assumed to have arbitrary side-effects;
+ // if ChainMMO is non-null, then Chain makes only a single memory reference.
+ SUnit *Chain = 0;
+ MachineMemOperand *ChainMMO = 0;
+
+ // Memory references to specific known memory locations are tracked so that
+ // they can be given more precise dependencies.
+ std::map<const Value *, SUnit *> MemDefs;
+ std::map<const Value *, std::vector<SUnit *> > MemUses;
+
+ // Check to see if the scheduler cares about latencies.
+ bool UnitLatencies = ForceUnitLatencies();
+
+ // Ask the target if address-backscheduling is desirable, and if so how much.
+ unsigned SpecialAddressLatency =
+ TM.getSubtarget<TargetSubtarget>().getSpecialAddressLatency();
+
+ // Walk the list of instructions, from bottom moving up.
+ for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin;
+ MII != MIE; --MII) {
+ MachineInstr *MI = prior(MII);
+ const TargetInstrDesc &TID = MI->getDesc();
+ assert(!TID.isTerminator() && !MI->isLabel() &&
+ "Cannot schedule terminators or labels!");
+ // Create the SUnit for this MI.
+ SUnit *SU = NewSUnit(MI);
+
+ // Assign the Latency field of SU using target-provided information.
+ if (UnitLatencies)
+ SU->Latency = 1;
+ else
+ ComputeLatency(SU);
+
+ // Add register-based dependencies (data, anti, and output).
+ for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
+ const MachineOperand &MO = MI->getOperand(j);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ assert(TRI->isPhysicalRegister(Reg) && "Virtual register encountered!");
+ std::vector<SUnit *> &UseList = Uses[Reg];
+ std::vector<SUnit *> &DefList = Defs[Reg];
+ // Optionally add output and anti dependencies.
+ // TODO: Using a latency of 1 here assumes there's no cost for
+ // reusing registers.
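+ // Since the block is walked bottom-up, DefList holds later instructions
+ // that define Reg: a use here yields an anti-dependence, a def an output
+ // dependence.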
+ SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
+ for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
+ SUnit *DefSU = DefList[i];
+ if (DefSU != SU &&
+ (Kind != SDep::Output || !MO.isDead() ||
+ !DefSU->getInstr()->registerDefIsDead(Reg)))
+ DefSU->addPred(SDep(SU, Kind, /*Latency=*/1, /*Reg=*/Reg));
+ }
+ for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+ std::vector<SUnit *> &DefList = Defs[*Alias];
+ for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
+ SUnit *DefSU = DefList[i];
+ if (DefSU != SU &&
+ (Kind != SDep::Output || !MO.isDead() ||
+ !DefSU->getInstr()->registerDefIsDead(Reg)))
+ DefSU->addPred(SDep(SU, Kind, /*Latency=*/1, /*Reg=*/ *Alias));
+ }
+ }
+
+ if (MO.isDef()) {
+ // Add any data dependencies.
+ unsigned DataLatency = SU->Latency;
+ for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
+ SUnit *UseSU = UseList[i];
+ if (UseSU != SU) {
+ unsigned LDataLatency = DataLatency;
+ // Optionally add in a special extra latency for nodes that
+ // feed addresses.
+ // TODO: Do this for register aliases too.
+ if (SpecialAddressLatency != 0 && !UnitLatencies) {
+ MachineInstr *UseMI = UseSU->getInstr();
+ const TargetInstrDesc &UseTID = UseMI->getDesc();
+ int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg);
+ assert(RegUseIndex >= 0 && "UseMI doesn't use register!");
+ if ((UseTID.mayLoad() || UseTID.mayStore()) &&
+ (unsigned)RegUseIndex < UseTID.getNumOperands() &&
+ UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass())
+ LDataLatency += SpecialAddressLatency;
+ }
+ UseSU->addPred(SDep(SU, SDep::Data, LDataLatency, Reg));
+ }
+ }
+ for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+ std::vector<SUnit *> &UseList = Uses[*Alias];
+ for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
+ SUnit *UseSU = UseList[i];
+ if (UseSU != SU)
+ UseSU->addPred(SDep(SU, SDep::Data, DataLatency, *Alias));
+ }
+ }
+
+ // If a def is going to wrap back around to the top of the loop,
+ // backschedule it.
+ if (!UnitLatencies && DefList.empty()) {
+ LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg);
+ if (I != LoopRegs.Deps.end()) {
+ const MachineOperand *UseMO = I->second.first;
+ unsigned Count = I->second.second;
+ const MachineInstr *UseMI = UseMO->getParent();
+ unsigned UseMOIdx = UseMO - &UseMI->getOperand(0);
+ const TargetInstrDesc &UseTID = UseMI->getDesc();
+ // TODO: If we knew the total depth of the region here, we could
+ // handle the case where the whole loop is inside the region but
+ // is large enough that the isScheduleHigh trick isn't needed.
+ if (UseMOIdx < UseTID.getNumOperands()) {
+ // Currently, we only support scheduling regions consisting of
+ // single basic blocks, so an instruction is in the same region
+ // iff it has the same parent block.
+ if (UseMI->getParent() != MI->getParent()) {
+ unsigned Latency = SU->Latency;
+ if (UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass())
+ Latency += SpecialAddressLatency;
+ // This is a wild guess as to the portion of the latency which
+ // will be overlapped by work done outside the current
+ // scheduling region.
+ Latency -= std::min(Latency, Count);
+ // Add the artificial edge.
+ ExitSU.addPred(SDep(SU, SDep::Order, Latency,
+ /*Reg=*/0, /*isNormalMemory=*/false,
+ /*isMustAlias=*/false,
+ /*isArtificial=*/true));
+ } else if (SpecialAddressLatency > 0 &&
+ UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) {
+ // The entire loop body is within the current scheduling region
+ // and the latency of this operation is assumed to be greater
+ // than the latency of the loop.
+ // TODO: Recursively mark data-edge predecessors as
+ // isScheduleHigh too.
+ SU->isScheduleHigh = true;
+ }
+ }
+ LoopRegs.Deps.erase(I);
+ }
+ }
+
+ UseList.clear();
+ if (!MO.isDead())
+ DefList.clear();
+ DefList.push_back(SU);
+ } else {
+ UseList.push_back(SU);
+ }
+ }
+
+ // Add chain dependencies.
+ // Note that isStoreToStackSlot and isLoadFromStackSlot are not usable
+ // after stack slots are lowered to actual addresses.
+ // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
+ // produce more precise dependence information.
+ if (TID.isCall() || TID.hasUnmodeledSideEffects()) {
+ new_chain:
+ // This is the conservative case. Add dependencies on all memory
+ // references.
+ if (Chain)
+ Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
+ Chain = SU;
+ for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
+ PendingLoads[k]->addPred(SDep(SU, SDep::Order, SU->Latency));
+ PendingLoads.clear();
+ for (std::map<const Value *, SUnit *>::iterator I = MemDefs.begin(),
+ E = MemDefs.end(); I != E; ++I) {
+ I->second->addPred(SDep(SU, SDep::Order, SU->Latency));
+ I->second = SU;
+ }
+ for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
+ MemUses.begin(), E = MemUses.end(); I != E; ++I) {
+ for (unsigned i = 0, e = I->second.size(); i != e; ++i)
+ I->second[i]->addPred(SDep(SU, SDep::Order, SU->Latency));
+ I->second.clear();
+ }
+ // See if it is known to just have a single memory reference.
+ MachineInstr *ChainMI = Chain->getInstr();
+ const TargetInstrDesc &ChainTID = ChainMI->getDesc();
+ if (!ChainTID.isCall() &&
+ !ChainTID.hasUnmodeledSideEffects() &&
+ ChainMI->hasOneMemOperand() &&
+ !ChainMI->memoperands_begin()->isVolatile() &&
+ ChainMI->memoperands_begin()->getValue())
+ // We know that the Chain accesses one specific memory location.
+ ChainMMO = &*ChainMI->memoperands_begin();
+ else
+ // Unknown memory accesses. Assume the worst.
+ ChainMMO = 0;
+ } else if (TID.mayStore()) {
+ if (const Value *V = getUnderlyingObjectForInstr(MI)) {
+ // A store to a specific PseudoSourceValue. Add precise dependencies.
+ // Handle the def in MemDefs, if there is one.
+ std::map<const Value *, SUnit *>::iterator I = MemDefs.find(V);
+ if (I != MemDefs.end()) {
+ I->second->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
+ /*isNormalMemory=*/true));
+ I->second = SU;
+ } else {
+ MemDefs[V] = SU;
+ }
+ // Handle the uses in MemUses, if there are any.
+ std::map<const Value *, std::vector<SUnit *> >::iterator J =
+ MemUses.find(V);
+ if (J != MemUses.end()) {
+ for (unsigned i = 0, e = J->second.size(); i != e; ++i)
+ J->second[i]->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
+ /*isNormalMemory=*/true));
+ J->second.clear();
+ }
+ // Add dependencies from all the PendingLoads, since without
+ // memoperands we must assume they alias anything.
+ for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
+ PendingLoads[k]->addPred(SDep(SU, SDep::Order, SU->Latency));
+ // Add a general dependence too, if needed.
+ if (Chain)
+ Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
+ } else
+ // Treat all other stores conservatively.
+ goto new_chain;
+ } else if (TID.mayLoad()) {
+ if (TII->isInvariantLoad(MI)) {
+ // Invariant load, no chain dependencies needed!
+ } else if (const Value *V = getUnderlyingObjectForInstr(MI)) {
+ // A load from a specific PseudoSourceValue. Add precise dependencies.
+ std::map<const Value *, SUnit *>::iterator I = MemDefs.find(V);
+ if (I != MemDefs.end())
+ I->second->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
+ /*isNormalMemory=*/true));
+ MemUses[V].push_back(SU);
+
+ // Add a general dependence too, if needed.
+ if (Chain && (!ChainMMO ||
+ (ChainMMO->isStore() || ChainMMO->isVolatile())))
+ Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
+ } else if (MI->hasVolatileMemoryRef()) {
+ // Treat volatile loads conservatively. Note that this includes
+ // cases where memoperand information is unavailable.
+ goto new_chain;
+ } else {
+ // A normal load. Depend on the general chain, as well as on
+ // all stores. In the absence of MachineMemOperand information,
+ // we can't even assume that the load doesn't alias well-behaved
+ // memory locations.
+ if (Chain)
+ Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
+ for (std::map<const Value *, SUnit *>::iterator I = MemDefs.begin(),
+ E = MemDefs.end(); I != E; ++I)
+ I->second->addPred(SDep(SU, SDep::Order, SU->Latency));
+ PendingLoads.push_back(SU);
+ }
+ }
+ }
+
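+ // All instructions have been visited; reset the per-register state for
+ // the next scheduling region.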
+ for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+ Defs[i].clear();
+ Uses[i].clear();
+ }
+ PendingLoads.clear();
+}
+
+void ScheduleDAGInstrs::FinishBlock() {
+ // Nothing to do.
+}
+
+void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
+ const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+
+ // Compute the latency for the node. We use the sum of the latencies for
+ // all nodes flagged together into this SUnit.
+ SU->Latency =
+ InstrItins.getLatency(SU->getInstr()->getDesc().getSchedClass());
+
+ // Simplistic target-independent heuristic: assume that loads take
+ // extra time.
+ if (InstrItins.isEmpty())
+ if (SU->getInstr()->getDesc().mayLoad())
+ SU->Latency += 2;
+}
+
+void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
+ SU->getInstr()->dump();
+}
+
+std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
+ std::string s;
+ raw_string_ostream oss(s);
+ if (SU == &EntrySU)
+ oss << "<entry>";
+ else if (SU == &ExitSU)
+ oss << "<exit>";
+ else
+ SU->getInstr()->print(oss);
+ return oss.str();
+}
+
+// EmitSchedule - Emit the machine code in scheduled order.
+MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() {
+ // For MachineInstr-based scheduling, we're rescheduling the instructions in
+ // the block, so start by removing them from the block.
+ while (Begin != InsertPos) {
+ MachineBasicBlock::iterator I = Begin;
+ ++Begin;
+ BB->remove(I);
+ }
+
+ // Then re-insert them according to the given schedule.
+ for (unsigned i = 0, e = Sequence.size(); i != e; ++i) {
+ SUnit *SU = Sequence[i];
+ if (!SU) {
+ // Null SUnit* is a noop.
+ EmitNoop();
+ continue;
+ }
+
+ BB->insert(InsertPos, SU->getInstr());
+ }
+
+ // Update the Begin iterator, as the first instruction in the block
+ // may have been scheduled later.
+ if (!Sequence.empty())
+ Begin = Sequence[0]->getInstr();
+
+ return BB;
+}
diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h
new file mode 100644
index 0000000..00d6268
--- /dev/null
+++ b/lib/CodeGen/ScheduleDAGInstrs.h
@@ -0,0 +1,184 @@
+//==- ScheduleDAGInstrs.h - MachineInstr Scheduling --------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScheduleDAGInstrs class, which implements
+// scheduling for a MachineInstr-based dependency graph.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCHEDULEDAGINSTRS_H
+#define SCHEDULEDAGINSTRS_H
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <map>
+
+namespace llvm {
+ class MachineLoopInfo;
+ class MachineDominatorTree;
+
+ /// LoopDependencies - This class analyzes loop-oriented register
+ /// dependencies, which are used to guide scheduling decisions.
+ /// For example, loop induction variable increments should be
+ /// scheduled as soon as possible after the variable's last use.
+ ///
+ class VISIBILITY_HIDDEN LoopDependencies {
+ const MachineLoopInfo &MLI;
+ const MachineDominatorTree &MDT;
+
+ public:
+ typedef std::map<unsigned, std::pair<const MachineOperand *, unsigned> >
+ LoopDeps;
+ LoopDeps Deps;
+
+ LoopDependencies(const MachineLoopInfo &mli,
+ const MachineDominatorTree &mdt) :
+ MLI(mli), MDT(mdt) {}
+
+ /// VisitLoop - Clear out any previous state and analyze the given loop.
+ ///
+ void VisitLoop(const MachineLoop *Loop) {
+ Deps.clear();
+ MachineBasicBlock *Header = Loop->getHeader();
+ SmallSet<unsigned, 8> LoopLiveIns;
+ for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(),
+ LE = Header->livein_end(); LI != LE; ++LI)
+ LoopLiveIns.insert(*LI);
+
+ const MachineDomTreeNode *Node = MDT.getNode(Header);
+ const MachineBasicBlock *MBB = Node->getBlock();
+ assert(Loop->contains(MBB) &&
+ "Loop does not contain header!");
+ VisitRegion(Node, MBB, Loop, LoopLiveIns);
+ }
+
+ private:
+ void VisitRegion(const MachineDomTreeNode *Node,
+ const MachineBasicBlock *MBB,
+ const MachineLoop *Loop,
+ const SmallSet<unsigned, 8> &LoopLiveIns) {
+ unsigned Count = 0;
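+ // Record, for each loop live-in register, the first use encountered
+ // (std::map::insert keeps the earliest entry) along with the
+ // instruction's index within the block.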
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I, ++Count) {
+ const MachineInstr *MI = I;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (LoopLiveIns.count(MOReg))
+ Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count)));
+ }
+ }
+
+ const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+ for (std::vector<MachineDomTreeNode*>::const_iterator I =
+ Children.begin(), E = Children.end(); I != E; ++I) {
+ const MachineDomTreeNode *ChildNode = *I;
+ MachineBasicBlock *ChildBlock = ChildNode->getBlock();
+ if (Loop->contains(ChildBlock))
+ VisitRegion(ChildNode, ChildBlock, Loop, LoopLiveIns);
+ }
+ }
+ };
+
+ /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
+ /// MachineInstrs.
+ class VISIBILITY_HIDDEN ScheduleDAGInstrs : public ScheduleDAG {
+ const MachineLoopInfo &MLI;
+ const MachineDominatorTree &MDT;
+
+ /// Defs, Uses - Remember where defs and uses of each physical register
+ /// are as we iterate upward through the instructions. This is allocated
+ /// here instead of inside BuildSchedGraph to avoid the need for it to be
+ /// initialized and destructed for each block.
+ std::vector<SUnit *> Defs[TargetRegisterInfo::FirstVirtualRegister];
+ std::vector<SUnit *> Uses[TargetRegisterInfo::FirstVirtualRegister];
+
+ /// PendingLoads - Remember where unknown loads are after the most recent
+ /// unknown store, as we iterate. As with Defs and Uses, this is here
+ /// to minimize construction/destruction.
+ std::vector<SUnit *> PendingLoads;
+
+ /// LoopRegs - Track which registers are used for loop-carried dependencies.
+ ///
+ LoopDependencies LoopRegs;
+
+ /// LoopLiveInRegs - Track which regs are live into a loop, to help guide
+ /// back-edge-aware scheduling.
+ ///
+ SmallSet<unsigned, 8> LoopLiveInRegs;
+
+ public:
+ MachineBasicBlock *BB; // Current basic block
+ MachineBasicBlock::iterator Begin; // The beginning of the range to
+ // be scheduled. The range extends
+ // to InsertPos.
+ unsigned InsertPosIndex; // The index in BB of InsertPos.
+
+ explicit ScheduleDAGInstrs(MachineFunction &mf,
+ const MachineLoopInfo &mli,
+ const MachineDominatorTree &mdt);
+
+ virtual ~ScheduleDAGInstrs() {}
+
+ /// NewSUnit - Creates a new SUnit and returns a pointer to it.
+ ///
+ SUnit *NewSUnit(MachineInstr *MI) {
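+ // Remember the address of the first element so the assert below can
+ // detect a reallocation, which would invalidate outstanding SUnit
+ // pointers.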
+#ifndef NDEBUG
+ const SUnit *Addr = SUnits.empty() ? 0 : &SUnits[0];
+#endif
+ SUnits.push_back(SUnit(MI, (unsigned)SUnits.size()));
+ assert((Addr == 0 || Addr == &SUnits[0]) &&
+ "SUnits std::vector reallocated on the fly!");
+ SUnits.back().OrigNode = &SUnits.back();
+ return &SUnits.back();
+ }
+
+ /// Run - perform scheduling.
+ ///
+ void Run(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned endindex);
+
+ /// BuildSchedGraph - Build SUnits from the MachineBasicBlock that we are
+ /// given as input.
+ virtual void BuildSchedGraph();
+
+ /// ComputeLatency - Compute node latency.
+ ///
+ virtual void ComputeLatency(SUnit *SU);
+
+ virtual MachineBasicBlock *EmitSchedule();
+
+ /// StartBlock - Prepare to perform scheduling in the given block.
+ ///
+ virtual void StartBlock(MachineBasicBlock *BB);
+
+ /// Schedule - Order nodes according to selected style, filling
+ /// in the Sequence member.
+ ///
+ virtual void Schedule() = 0;
+
+ /// FinishBlock - Clean up after scheduling in the given block.
+ ///
+ virtual void FinishBlock();
+
+ virtual void dumpNode(const SUnit *SU) const;
+
+ virtual std::string getGraphNodeLabel(const SUnit *SU) const;
+ };
+}
+
+#endif
diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp
new file mode 100644
index 0000000..594c24d
--- /dev/null
+++ b/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -0,0 +1,97 @@
+//===-- ScheduleDAGPrinter.cpp - Implement ScheduleDAG::viewGraph() -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+#include <fstream>
+using namespace llvm;
+
+namespace llvm {
+ template<>
+ struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+ static std::string getGraphName(const ScheduleDAG *G) {
+ return G->MF.getFunction()->getName();
+ }
+
+ static bool renderGraphFromBottomUp() {
+ return true;
+ }
+
+ static bool hasNodeAddressLabel(const SUnit *Node,
+ const ScheduleDAG *Graph) {
+ return true;
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node,
+ SUnitIterator EI) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+
+ static std::string getNodeLabel(const SUnit *Node,
+ const ScheduleDAG *Graph);
+ static std::string getNodeAttributes(const SUnit *N,
+ const ScheduleDAG *Graph) {
+ return "shape=Mrecord";
+ }
+
+ static void addCustomGraphFeatures(ScheduleDAG *G,
+ GraphWriter<ScheduleDAG*> &GW) {
+ return G->addCustomGraphFeatures(GW);
+ }
+ };
+}
+
+std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
+ const ScheduleDAG *G) {
+ return G->getGraphNodeLabel(SU);
+}
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ScheduleDAG::viewGraph() {
+// This code is only for debugging!
+#ifndef NDEBUG
+ if (BB->getBasicBlock())
+ ViewGraph(this, "dag." + MF.getFunction()->getName(),
+ "Scheduling-Units Graph for " + MF.getFunction()->getName() + ':' +
+ BB->getBasicBlock()->getName());
+ else
+ ViewGraph(this, "dag." + MF.getFunction()->getName(),
+ "Scheduling-Units Graph for " + MF.getFunction()->getName());
+#else
+ cerr << "ScheduleDAG::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt
new file mode 100644
index 0000000..9ea59ea
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_llvm_library(LLVMSelectionDAG
+ CallingConvLower.cpp
+ DAGCombiner.cpp
+ FastISel.cpp
+ LegalizeDAG.cpp
+ LegalizeFloatTypes.cpp
+ LegalizeIntegerTypes.cpp
+ LegalizeTypes.cpp
+ LegalizeTypesGeneric.cpp
+ LegalizeVectorOps.cpp
+ LegalizeVectorTypes.cpp
+ ScheduleDAGSDNodes.cpp
+ ScheduleDAGSDNodesEmit.cpp
+ ScheduleDAGFast.cpp
+ ScheduleDAGList.cpp
+ ScheduleDAGRRList.cpp
+ SelectionDAGBuild.cpp
+ SelectionDAG.cpp
+ SelectionDAGISel.cpp
+ SelectionDAGPrinter.cpp
+ TargetLowering.cpp
+ )
diff --git a/lib/CodeGen/SelectionDAG/CallingConvLower.cpp b/lib/CodeGen/SelectionDAG/CallingConvLower.cpp
new file mode 100644
index 0000000..7cd2b73
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/CallingConvLower.cpp
@@ -0,0 +1,148 @@
+//===-- CallingConvLower.cpp - Calling Conventions ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CCState class, used for lowering and implementing
+// calling conventions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+CCState::CCState(unsigned CC, bool isVarArg, const TargetMachine &tm,
+ SmallVector<CCValAssign, 16> &locs)
+ : CallingConv(CC), IsVarArg(isVarArg), TM(tm),
+ TRI(*TM.getRegisterInfo()), Locs(locs) {
+ // No stack is used.
+ StackOffset = 0;
+
+ UsedRegs.resize((TRI.getNumRegs()+31)/32);
+}
+
+// HandleByVal - Allocate a stack slot large enough to pass an argument by
+// value. The size and alignment information of the argument is encoded in its
+// parameter attribute.
+void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ int MinSize, int MinAlign,
+ ISD::ArgFlagsTy ArgFlags) {
+ unsigned Align = ArgFlags.getByValAlign();
+ unsigned Size = ArgFlags.getByValSize();
+ if (MinSize > (int)Size)
+ Size = MinSize;
+ if (MinAlign > (int)Align)
+ Align = MinAlign;
+ unsigned Offset = AllocateStack(Size, Align);
+
+ addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+}
+
+/// MarkAllocated - Mark a register and all of its aliases as allocated.
+void CCState::MarkAllocated(unsigned Reg) {
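+ // UsedRegs packs one bit per register, 32 registers to a word.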
+ UsedRegs[Reg/32] |= 1 << (Reg&31);
+
+ if (const unsigned *RegAliases = TRI.getAliasSet(Reg))
+ for (; (Reg = *RegAliases); ++RegAliases)
+ UsedRegs[Reg/32] |= 1 << (Reg&31);
+}
+
+/// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node,
+/// incorporating info about the formals into this state.
+void CCState::AnalyzeFormalArguments(SDNode *TheArgs, CCAssignFn Fn) {
+ unsigned NumArgs = TheArgs->getNumValues()-1;
+
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = TheArgs->getValueType(i);
+ ISD::ArgFlagsTy ArgFlags =
+ cast<ARG_FLAGSSDNode>(TheArgs->getOperand(3+i))->getArgFlags();
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+ cerr << "Formal argument #" << i << " has unhandled type "
+ << ArgVT.getMVTString() << "\n";
+ abort();
+ }
+ }
+}
+
+/// AnalyzeReturn - Analyze the returned values of an ISD::RET node,
+/// incorporating info about the result values into this state.
+void CCState::AnalyzeReturn(SDNode *TheRet, CCAssignFn Fn) {
+ // Determine which register each value should be copied into.
+ for (unsigned i = 0, e = TheRet->getNumOperands() / 2; i != e; ++i) {
+ MVT VT = TheRet->getOperand(i*2+1).getValueType();
+ ISD::ArgFlagsTy ArgFlags =
+ cast<ARG_FLAGSSDNode>(TheRet->getOperand(i*2+2))->getArgFlags();
+ if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)){
+ cerr << "Return operand #" << i << " has unhandled type "
+ << VT.getMVTString() << "\n";
+ abort();
+ }
+ }
+}
+
+
+/// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info
+/// about the passed values into this state.
+void CCState::AnalyzeCallOperands(CallSDNode *TheCall, CCAssignFn Fn) {
+ unsigned NumOps = TheCall->getNumArgs();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MVT ArgVT = TheCall->getArg(i).getValueType();
+ ISD::ArgFlagsTy ArgFlags = TheCall->getArgFlags(i);
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+ cerr << "Call operand #" << i << " has unhandled type "
+ << ArgVT.getMVTString() << "\n";
+ abort();
+ }
+ }
+}
+
+/// AnalyzeCallOperands - Same as above except it takes vectors of types
+/// and argument flags.
+void CCState::AnalyzeCallOperands(SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+ CCAssignFn Fn) {
+ unsigned NumOps = ArgVTs.size();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MVT ArgVT = ArgVTs[i];
+ ISD::ArgFlagsTy ArgFlags = Flags[i];
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+ cerr << "Call operand #" << i << " has unhandled type "
+ << ArgVT.getMVTString() << "\n";
+ abort();
+ }
+ }
+}
+
+/// AnalyzeCallResult - Analyze the return values of an ISD::CALL node,
+/// incorporating info about the passed values into this state.
+void CCState::AnalyzeCallResult(CallSDNode *TheCall, CCAssignFn Fn) {
+ for (unsigned i = 0, e = TheCall->getNumRetVals(); i != e; ++i) {
+ MVT VT = TheCall->getRetValType(i);
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ if (TheCall->isInreg())
+ Flags.setInReg();
+ if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) {
+ cerr << "Call result #" << i << " has unhandled type "
+ << VT.getMVTString() << "\n";
+ abort();
+ }
+ }
+}
+
+/// AnalyzeCallResult - Same as above except it's specialized for calls which
+/// produce a single value.
+void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) {
+ if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) {
+ cerr << "Call result has unhandled type "
+ << VT.getMVTString() << "\n";
+ abort();
+ }
+}
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
new file mode 100644
index 0000000..4c1710d
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -0,0 +1,6203 @@
+//===-- DAGCombiner.cpp - Implement a DAG node combiner -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
+// both before and after the DAG is legalized.
+//
+// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
+// primarily intended to handle simplification opportunities that are implicit
+// in the LLVM IR and exposed by the various codegen lowering phases.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dagcombine"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NodesCombined , "Number of dag nodes combined");
+STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
+STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
+STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
+
+namespace {
+ static cl::opt<bool>
+ CombinerAA("combiner-alias-analysis", cl::Hidden,
+ cl::desc("Turn on alias analysis during testing"));
+
+ static cl::opt<bool>
+ CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
+ cl::desc("Include global information in alias analysis"));
+
+//------------------------------ DAGCombiner ---------------------------------//
+
+ class VISIBILITY_HIDDEN DAGCombiner {
+ SelectionDAG &DAG;
+ const TargetLowering &TLI;
+ CombineLevel Level;
+ CodeGenOpt::Level OptLevel;
+ bool LegalOperations;
+ bool LegalTypes;
+
+ // Worklist of all of the nodes that need to be simplified.
+ std::vector<SDNode*> WorkList;
+
+ // AA - Used for DAG load/store alias analysis.
+ AliasAnalysis &AA;
+
+ /// AddUsersToWorkList - When an instruction is simplified, add all users of
+ /// the instruction to the work lists because they might get more simplified
+ /// now.
+ ///
+ void AddUsersToWorkList(SDNode *N) {
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI)
+ AddToWorkList(*UI);
+ }
+
+ /// visit - call the node-specific routine that knows how to fold each
+ /// particular type of node.
+ SDValue visit(SDNode *N);
+
+ public:
+ /// AddToWorkList - Add to the work list, making sure its instance is at
+ /// the back (next to be processed).
+ void AddToWorkList(SDNode *N) {
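+ // Remove any existing instance first so the node ends up exactly once,
+ // at the back of the list (next to be processed).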
+ removeFromWorkList(N);
+ WorkList.push_back(N);
+ }
+
+ /// removeFromWorkList - remove all instances of N from the worklist.
+ ///
+ void removeFromWorkList(SDNode *N) {
+ WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), N),
+ WorkList.end());
+ }
+
+ SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+ bool AddTo = true);
+
+ SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
+ return CombineTo(N, &Res, 1, AddTo);
+ }
+
+ SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
+ bool AddTo = true) {
+ SDValue To[] = { Res0, Res1 };
+ return CombineTo(N, To, 2, AddTo);
+ }
+
+ void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
+
+ private:
+
+ /// SimplifyDemandedBits - Check the specified integer node value to see if
+ /// it can be simplified or if things it uses can be simplified by bit
+ /// propagation. If so, return true.
+ bool SimplifyDemandedBits(SDValue Op) {
+ APInt Demanded = APInt::getAllOnesValue(Op.getValueSizeInBits());
+ return SimplifyDemandedBits(Op, Demanded);
+ }
+
+ bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
+
+ bool CombineToPreIndexedLoadStore(SDNode *N);
+ bool CombineToPostIndexedLoadStore(SDNode *N);
+
+
+ /// combine - call the node-specific routine that knows how to fold each
+ /// particular type of node. If that doesn't do anything, try the
+ /// target-specific DAG combines.
+ SDValue combine(SDNode *N);
+
+ // Visitation implementation - Implement dag node combining for different
+ // node types. The semantics are as follows:
+ // Return Value:
+ // SDValue.getNode() == 0 - No change was made
+ // SDValue.getNode() == N - N was replaced, is dead and has been handled.
+ // otherwise - N should be replaced by the returned Operand.
+ //
+ SDValue visitTokenFactor(SDNode *N);
+ SDValue visitMERGE_VALUES(SDNode *N);
+ SDValue visitADD(SDNode *N);
+ SDValue visitSUB(SDNode *N);
+ SDValue visitADDC(SDNode *N);
+ SDValue visitADDE(SDNode *N);
+ SDValue visitMUL(SDNode *N);
+ SDValue visitSDIV(SDNode *N);
+ SDValue visitUDIV(SDNode *N);
+ SDValue visitSREM(SDNode *N);
+ SDValue visitUREM(SDNode *N);
+ SDValue visitMULHU(SDNode *N);
+ SDValue visitMULHS(SDNode *N);
+ SDValue visitSMUL_LOHI(SDNode *N);
+ SDValue visitUMUL_LOHI(SDNode *N);
+ SDValue visitSDIVREM(SDNode *N);
+ SDValue visitUDIVREM(SDNode *N);
+ SDValue visitAND(SDNode *N);
+ SDValue visitOR(SDNode *N);
+ SDValue visitXOR(SDNode *N);
+ SDValue SimplifyVBinOp(SDNode *N);
+ SDValue visitSHL(SDNode *N);
+ SDValue visitSRA(SDNode *N);
+ SDValue visitSRL(SDNode *N);
+ SDValue visitCTLZ(SDNode *N);
+ SDValue visitCTTZ(SDNode *N);
+ SDValue visitCTPOP(SDNode *N);
+ SDValue visitSELECT(SDNode *N);
+ SDValue visitSELECT_CC(SDNode *N);
+ SDValue visitSETCC(SDNode *N);
+ SDValue visitSIGN_EXTEND(SDNode *N);
+ SDValue visitZERO_EXTEND(SDNode *N);
+ SDValue visitANY_EXTEND(SDNode *N);
+ SDValue visitSIGN_EXTEND_INREG(SDNode *N);
+ SDValue visitTRUNCATE(SDNode *N);
+ SDValue visitBIT_CONVERT(SDNode *N);
+ SDValue visitBUILD_PAIR(SDNode *N);
+ SDValue visitFADD(SDNode *N);
+ SDValue visitFSUB(SDNode *N);
+ SDValue visitFMUL(SDNode *N);
+ SDValue visitFDIV(SDNode *N);
+ SDValue visitFREM(SDNode *N);
+ SDValue visitFCOPYSIGN(SDNode *N);
+ SDValue visitSINT_TO_FP(SDNode *N);
+ SDValue visitUINT_TO_FP(SDNode *N);
+ SDValue visitFP_TO_SINT(SDNode *N);
+ SDValue visitFP_TO_UINT(SDNode *N);
+ SDValue visitFP_ROUND(SDNode *N);
+ SDValue visitFP_ROUND_INREG(SDNode *N);
+ SDValue visitFP_EXTEND(SDNode *N);
+ SDValue visitFNEG(SDNode *N);
+ SDValue visitFABS(SDNode *N);
+ SDValue visitBRCOND(SDNode *N);
+ SDValue visitBR_CC(SDNode *N);
+ SDValue visitLOAD(SDNode *N);
+ SDValue visitSTORE(SDNode *N);
+ SDValue visitINSERT_VECTOR_ELT(SDNode *N);
+ SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue visitBUILD_VECTOR(SDNode *N);
+ SDValue visitCONCAT_VECTORS(SDNode *N);
+ SDValue visitVECTOR_SHUFFLE(SDNode *N);
+
+ SDValue XformToShuffleWithZero(SDNode *N);
+ SDValue ReassociateOps(unsigned Opc, DebugLoc DL, SDValue LHS, SDValue RHS);
+
+ SDValue visitShiftByConstant(SDNode *N, unsigned Amt);
+
+ bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
+ SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
+ SDValue SimplifySelect(DebugLoc DL, SDValue N0, SDValue N1, SDValue N2);
+ SDValue SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1, SDValue N2,
+ SDValue N3, ISD::CondCode CC,
+ bool NotExtCompare = false);
+ SDValue SimplifySetCC(MVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
+ DebugLoc DL, bool foldBooleans = true);
+ SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+ unsigned HiOp);
+ SDValue CombineConsecutiveLoads(SDNode *N, MVT VT);
+ SDValue ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *, MVT);
+ SDValue BuildSDIV(SDNode *N);
+ SDValue BuildUDIV(SDNode *N);
+ SDNode *MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL);
+ SDValue ReduceLoadWidth(SDNode *N);
+ SDValue ReduceLoadOpStoreWidth(SDNode *N);
+
+ SDValue GetDemandedBits(SDValue V, const APInt &Mask);
+
+ /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
+ /// looking for aliasing nodes and adding them to the Aliases vector.
+ void GatherAllAliases(SDNode *N, SDValue OriginalChain,
+ SmallVector<SDValue, 8> &Aliases);
+
+ /// isAlias - Return true if there is any possibility that the two addresses
+ /// overlap.
+ bool isAlias(SDValue Ptr1, int64_t Size1,
+ const Value *SrcValue1, int SrcValueOffset1,
+ SDValue Ptr2, int64_t Size2,
+ const Value *SrcValue2, int SrcValueOffset2) const;
+
+ /// FindAliasInfo - Extracts the relevant alias information from the memory
+ /// node. Returns true if the operand was a load.
+ bool FindAliasInfo(SDNode *N,
+ SDValue &Ptr, int64_t &Size,
+ const Value *&SrcValue, int &SrcValueOffset) const;
+
+ /// FindBetterChain - Walk up chain skipping non-aliasing memory nodes,
+ /// looking for a better chain (aliasing node.)
+ SDValue FindBetterChain(SDNode *N, SDValue Chain);
+
+ /// getShiftAmountTy - Returns a type large enough to hold any valid
+ /// shift amount - before type legalization these can be huge.
+ MVT getShiftAmountTy() {
+ return LegalTypes ? TLI.getShiftAmountTy() : TLI.getPointerTy();
+ }
+
+public:
+ DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
+ : DAG(D),
+ TLI(D.getTargetLoweringInfo()),
+ Level(Unrestricted),
+ OptLevel(OL),
+ LegalOperations(false),
+ LegalTypes(false),
+ AA(A) {}
+
+ /// Run - runs the dag combiner on all nodes in the work list
+ void Run(CombineLevel AtLevel);
+ };
+}
+
+
+namespace {
+/// WorkListRemover - This class is a DAGUpdateListener that removes any deleted
+/// nodes from the worklist.
+class VISIBILITY_HIDDEN WorkListRemover :
+ public SelectionDAG::DAGUpdateListener {
+ DAGCombiner &DC;
+public:
+ explicit WorkListRemover(DAGCombiner &dc) : DC(dc) {}
+
+ virtual void NodeDeleted(SDNode *N, SDNode *E) {
+ DC.removeFromWorkList(N);
+ }
+
+ virtual void NodeUpdated(SDNode *N) {
+ // Ignore updates.
+ }
+};
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering::DAGCombinerInfo implementation
+//===----------------------------------------------------------------------===//
+
+void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
+ ((DAGCombiner*)DC)->AddToWorkList(N);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, const std::vector<SDValue> &To, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
+}
+
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
+}
+
+void TargetLowering::DAGCombinerInfo::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+ return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// isNegatibleForFree - Return 1 if we can compute the negated form of the
+/// specified expression for the same cost as the expression itself, or 2 if we
+/// can compute the negated form more cheaply than the expression itself.
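+/// For example, negating an FNEG just drops the node (cheaper, so 2), while
+/// a ConstantFP can be negated in place before legalization (same cost, 1).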
+static char isNegatibleForFree(SDValue Op, bool LegalOperations,
+ unsigned Depth = 0) {
+ // No compile time optimizations on this type.
+ if (Op.getValueType() == MVT::ppcf128)
+ return 0;
+
+ // fneg is removable even if it has multiple uses.
+ if (Op.getOpcode() == ISD::FNEG) return 2;
+
+ // Don't allow anything with multiple uses.
+ if (!Op.hasOneUse()) return 0;
+
+ // Don't recurse exponentially.
+ if (Depth > 6) return 0;
+
+ switch (Op.getOpcode()) {
+ default: return 0;
+ case ISD::ConstantFP:
+ // Don't invert constant FP values after legalize. The negated constant
+ // isn't necessarily legal.
+ return LegalOperations ? 0 : 1;
+ case ISD::FADD:
+ // FIXME: determine better conditions for this xform.
+ if (!UnsafeFPMath) return 0;
+
+ // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+ if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1))
+ return V;
+ // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+ return isNegatibleForFree(Op.getOperand(1), LegalOperations, Depth+1);
+ case ISD::FSUB:
+ // We can't turn -(A-B) into B-A when we honor signed zeros.
+ if (!UnsafeFPMath) return 0;
+
+ // fold (fneg (fsub A, B)) -> (fsub B, A)
+ return 1;
+
+ case ISD::FMUL:
+ case ISD::FDIV:
+ if (HonorSignDependentRoundingFPMath()) return 0;
+
+ // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
+ if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1))
+ return V;
+
+ return isNegatibleForFree(Op.getOperand(1), LegalOperations, Depth+1);
+
+ case ISD::FP_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::FSIN:
+ return isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1);
+ }
+}
+
+/// GetNegatedExpression - If isNegatibleForFree returns true, this function
+/// returns the newly negated expression.
+static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, unsigned Depth = 0) {
+ // fneg is removable even if it has multiple uses.
+ if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0);
+
+ // Don't allow anything with multiple uses.
+ assert(Op.hasOneUse() && "Unknown reuse!");
+
+ assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Unknown code");
+ case ISD::ConstantFP: {
+ APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
+ V.changeSign();
+ return DAG.getConstantFP(V, Op.getValueType());
+ }
+ case ISD::FADD:
+ // FIXME: determine better conditions for this xform.
+ assert(UnsafeFPMath);
+
+ // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+ if (isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1))
+ return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, Depth+1),
+ Op.getOperand(1));
+ // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+ return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(1), DAG,
+ LegalOperations, Depth+1),
+ Op.getOperand(0));
+ case ISD::FSUB:
+ // We can't turn -(A-B) into B-A when we honor signed zeros.
+ assert(UnsafeFPMath);
+
+ // fold (fneg (fsub 0, B)) -> B
+ if (ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Op.getOperand(0)))
+ if (N0CFP->getValueAPF().isZero())
+ return Op.getOperand(1);
+
+ // fold (fneg (fsub A, B)) -> (fsub B, A)
+ return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(0));
+
+ case ISD::FMUL:
+ case ISD::FDIV:
+ assert(!HonorSignDependentRoundingFPMath());
+
+ // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
+ if (isNegatibleForFree(Op.getOperand(0), LegalOperations, Depth+1))
+ return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, Depth+1),
+ Op.getOperand(1));
+
+ // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
+ return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+ Op.getOperand(0),
+ GetNegatedExpression(Op.getOperand(1), DAG,
+ LegalOperations, Depth+1));
+
+ case ISD::FP_EXTEND:
+ case ISD::FSIN:
+ return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, Depth+1));
+ case ISD::FP_ROUND:
+ return DAG.getNode(ISD::FP_ROUND, Op.getDebugLoc(), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, Depth+1),
+ Op.getOperand(1));
+ }
+}
+
+
+// isSetCCEquivalent - Return true if this node is a setcc, or is a select_cc
+// that selects between the values 1 and 0, making it equivalent to a setcc.
+// Also, set the incoming LHS, RHS, and CC references to the appropriate
+// nodes based on the type of node we are checking. This simplifies life a
+// bit for the callers.
+static bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
+ SDValue &CC) {
+ if (N.getOpcode() == ISD::SETCC) {
+ LHS = N.getOperand(0);
+ RHS = N.getOperand(1);
+ CC = N.getOperand(2);
+ return true;
+ }
+ if (N.getOpcode() == ISD::SELECT_CC &&
+ N.getOperand(2).getOpcode() == ISD::Constant &&
+ N.getOperand(3).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(N.getOperand(2))->getAPIntValue() == 1 &&
+ cast<ConstantSDNode>(N.getOperand(3))->isNullValue()) {
+ LHS = N.getOperand(0);
+ RHS = N.getOperand(1);
+ CC = N.getOperand(4);
+ return true;
+ }
+ return false;
+}
+
+// isOneUseSetCC - Return true if this is a SetCC-equivalent operation with only
+// one use. If this is true, it allows the users to invert the operation for
+// free when it is profitable to do so.
+static bool isOneUseSetCC(SDValue N) {
+ SDValue N0, N1, N2;
+ if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse())
+ return true;
+ return false;
+}
+
+SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
+ SDValue N0, SDValue N1) {
+ MVT VT = N0.getValueType();
+ if (N0.getOpcode() == Opc && isa<ConstantSDNode>(N0.getOperand(1))) {
+ if (isa<ConstantSDNode>(N1)) {
+ // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
+ SDValue OpNode =
+ DAG.FoldConstantArithmetic(Opc, VT,
+ cast<ConstantSDNode>(N0.getOperand(1)),
+ cast<ConstantSDNode>(N1));
+ return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+ } else if (N0.hasOneUse()) {
+ // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one use
+ SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
+ N0.getOperand(0), N1);
+ AddToWorkList(OpNode.getNode());
+ return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
+ }
+ }
+
+ if (N1.getOpcode() == Opc && isa<ConstantSDNode>(N1.getOperand(1))) {
+ if (isa<ConstantSDNode>(N0)) {
+ // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
+ SDValue OpNode =
+ DAG.FoldConstantArithmetic(Opc, VT,
+ cast<ConstantSDNode>(N1.getOperand(1)),
+ cast<ConstantSDNode>(N0));
+ return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+ } else if (N1.hasOneUse()) {
+ // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one use
+ SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
+ N1.getOperand(0), N0);
+ AddToWorkList(OpNode.getNode());
+ return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+ bool AddTo) {
+ assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
+ ++NodesCombined;
+ DOUT << "\nReplacing.1 "; DEBUG(N->dump(&DAG));
+ DOUT << "\nWith: "; DEBUG(To[0].getNode()->dump(&DAG));
+ DOUT << " and " << NumTo-1 << " other values\n";
+ DEBUG(for (unsigned i = 0, e = NumTo; i != e; ++i)
+ assert(N->getValueType(i) == To[i].getValueType() &&
+ "Cannot combine value to value of different type!"));
+ WorkListRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesWith(N, To, &DeadNodes);
+
+ if (AddTo) {
+ // Push the new nodes and any users onto the worklist
+ for (unsigned i = 0, e = NumTo; i != e; ++i) {
+ if (To[i].getNode()) {
+ AddToWorkList(To[i].getNode());
+ AddUsersToWorkList(To[i].getNode());
+ }
+ }
+ }
+
+ // Finally, if the node is now dead, remove it from the graph. The node
+ // may not be dead if the replacement process recursively simplified to
+ // something else needing this node.
+ if (N->use_empty()) {
+ // Nodes can be reintroduced into the worklist. Make sure we do not
+ // process a node that has been replaced.
+ removeFromWorkList(N);
+
+ // Finally, since the node is now dead, remove it from the graph.
+ DAG.DeleteNode(N);
+ }
+ return SDValue(N, 0);
+}
+
+void
+DAGCombiner::CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &
+ TLO) {
+ // Replace all uses. If any nodes become isomorphic to other nodes and
+ // are deleted, make sure to remove them from our worklist.
+ WorkListRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes);
+
+ // Push the new node and any (possibly new) users onto the worklist.
+ AddToWorkList(TLO.New.getNode());
+ AddUsersToWorkList(TLO.New.getNode());
+
+ // Finally, if the node is now dead, remove it from the graph. The node
+ // may not be dead if the replacement process recursively simplified to
+ // something else needing this node.
+ if (TLO.Old.getNode()->use_empty()) {
+ removeFromWorkList(TLO.Old.getNode());
+
+ // If the operands of this node are only used by the node, they will now
+ // be dead. Make sure to visit them first to delete dead nodes early.
+ for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i)
+ if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse())
+ AddToWorkList(TLO.Old.getNode()->getOperand(i).getNode());
+
+ DAG.DeleteNode(TLO.Old.getNode());
+ }
+}
+
+/// SimplifyDemandedBits - Check the specified integer node value to see if
+/// it can be simplified or if things it uses can be simplified by bit
+/// propagation. If so, return true.
+bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
+ TargetLowering::TargetLoweringOpt TLO(DAG);
+ APInt KnownZero, KnownOne;
+ if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+ return false;
+
+ // Revisit the node.
+ AddToWorkList(Op.getNode());
+
+ // Replace the old value with the new one.
+ ++NodesCombined;
+ DOUT << "\nReplacing.2 "; DEBUG(TLO.Old.getNode()->dump(&DAG));
+ DOUT << "\nWith: "; DEBUG(TLO.New.getNode()->dump(&DAG));
+ DOUT << '\n';
+
+ CommitTargetLoweringOpt(TLO);
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Main DAG Combiner implementation
+//===----------------------------------------------------------------------===//
+
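+// Run - Process the whole DAG with a worklist algorithm: every node starts on
+// the worklist, and each successful combine pushes the replacement nodes (and
+// anything whose use count may have changed) back on, so the pass iterates to
+// a fixed point before restoring the root.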
+void DAGCombiner::Run(CombineLevel AtLevel) {
+ // Set the instance variables, so that the various visit routines may use them.
+ Level = AtLevel;
+ LegalOperations = Level >= NoIllegalOperations;
+ LegalTypes = Level >= NoIllegalTypes;
+
+ // Add all the dag nodes to the worklist.
+ WorkList.reserve(DAG.allnodes_size());
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = DAG.allnodes_end(); I != E; ++I)
+ WorkList.push_back(I);
+
+ // Create a dummy node (which is not added to allnodes) that adds a
+ // reference to the root node, preventing it from being deleted and
+ // tracking any changes of the root.
+ HandleSDNode Dummy(DAG.getRoot());
+
+ // The root of the dag may become a dangling reference to deleted nodes
+ // while the dag combiner runs. Set it to null to avoid confusion.
+ DAG.setRoot(SDValue());
+
+ // While the worklist isn't empty, inspect the node at the end of it and
+ // try to combine it.
+ while (!WorkList.empty()) {
+ SDNode *N = WorkList.back();
+ WorkList.pop_back();
+
+ // If N has no uses, it is dead. Make sure to revisit all N's operands once
+ // N is deleted from the DAG, since they too may now be dead or may have a
+ // reduced number of uses, allowing other xforms.
+ if (N->use_empty() && N != &Dummy) {
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ AddToWorkList(N->getOperand(i).getNode());
+
+ DAG.DeleteNode(N);
+ continue;
+ }
+
+ SDValue RV = combine(N);
+
+ if (RV.getNode() == 0)
+ continue;
+
+ ++NodesCombined;
+
+ // If we get back the same node we passed in, rather than a new node or
+ // zero, we know that the node must have defined multiple values and
+ // CombineTo was used. Since CombineTo takes care of the worklist
+ // mechanics for us, we have no work to do in this case.
+ if (RV.getNode() == N)
+ continue;
+
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
+ "Node was deleted but visit returned new node!");
+
+ DOUT << "\nReplacing.3 "; DEBUG(N->dump(&DAG));
+ DOUT << "\nWith: "; DEBUG(RV.getNode()->dump(&DAG));
+ DOUT << '\n';
+ WorkListRemover DeadNodes(*this);
+ if (N->getNumValues() == RV.getNode()->getNumValues())
+ DAG.ReplaceAllUsesWith(N, RV.getNode(), &DeadNodes);
+ else {
+ assert(N->getValueType(0) == RV.getValueType() &&
+ N->getNumValues() == 1 && "Type mismatch");
+ SDValue OpV = RV;
+ DAG.ReplaceAllUsesWith(N, &OpV, &DeadNodes);
+ }
+
+ // Push the new node and any users onto the worklist
+ AddToWorkList(RV.getNode());
+ AddUsersToWorkList(RV.getNode());
+
+ // Add any uses of the old node to the worklist in case this node is the
+ // last one that uses them. They may become dead after this node is
+ // deleted.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ AddToWorkList(N->getOperand(i).getNode());
+
+ // Finally, if the node is now dead, remove it from the graph. The node
+ // may not be dead if the replacement process recursively simplified to
+ // something else needing this node.
+ if (N->use_empty()) {
+ // Nodes can be reintroduced into the worklist. Make sure we do not
+ // process a node that has been replaced.
+ removeFromWorkList(N);
+
+ // Finally, since the node is now dead, remove it from the graph.
+ DAG.DeleteNode(N);
+ }
+ }
+
+ // If the root changed (e.g. it was a dead load), update the root.
+ DAG.setRoot(Dummy.getValue());
+}
+
+SDValue DAGCombiner::visit(SDNode *N) {
+ switch(N->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor: return visitTokenFactor(N);
+ case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
+ case ISD::ADD: return visitADD(N);
+ case ISD::SUB: return visitSUB(N);
+ case ISD::ADDC: return visitADDC(N);
+ case ISD::ADDE: return visitADDE(N);
+ case ISD::MUL: return visitMUL(N);
+ case ISD::SDIV: return visitSDIV(N);
+ case ISD::UDIV: return visitUDIV(N);
+ case ISD::SREM: return visitSREM(N);
+ case ISD::UREM: return visitUREM(N);
+ case ISD::MULHU: return visitMULHU(N);
+ case ISD::MULHS: return visitMULHS(N);
+ case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
+ case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
+ case ISD::SDIVREM: return visitSDIVREM(N);
+ case ISD::UDIVREM: return visitUDIVREM(N);
+ case ISD::AND: return visitAND(N);
+ case ISD::OR: return visitOR(N);
+ case ISD::XOR: return visitXOR(N);
+ case ISD::SHL: return visitSHL(N);
+ case ISD::SRA: return visitSRA(N);
+ case ISD::SRL: return visitSRL(N);
+ case ISD::CTLZ: return visitCTLZ(N);
+ case ISD::CTTZ: return visitCTTZ(N);
+ case ISD::CTPOP: return visitCTPOP(N);
+ case ISD::SELECT: return visitSELECT(N);
+ case ISD::SELECT_CC: return visitSELECT_CC(N);
+ case ISD::SETCC: return visitSETCC(N);
+ case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
+ case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
+ case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
+ case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
+ case ISD::TRUNCATE: return visitTRUNCATE(N);
+ case ISD::BIT_CONVERT: return visitBIT_CONVERT(N);
+ case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
+ case ISD::FADD: return visitFADD(N);
+ case ISD::FSUB: return visitFSUB(N);
+ case ISD::FMUL: return visitFMUL(N);
+ case ISD::FDIV: return visitFDIV(N);
+ case ISD::FREM: return visitFREM(N);
+ case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
+ case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
+ case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
+ case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
+ case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
+ case ISD::FP_ROUND: return visitFP_ROUND(N);
+ case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N);
+ case ISD::FP_EXTEND: return visitFP_EXTEND(N);
+ case ISD::FNEG: return visitFNEG(N);
+ case ISD::FABS: return visitFABS(N);
+ case ISD::BRCOND: return visitBRCOND(N);
+ case ISD::BR_CC: return visitBR_CC(N);
+ case ISD::LOAD: return visitLOAD(N);
+ case ISD::STORE: return visitSTORE(N);
+ case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
+ case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
+ case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
+ case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
+ case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
+ }
+ return SDValue();
+}
+
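+// combine - Run the per-opcode visit routine on N; if it makes no change,
+// offer N to the target's PerformDAGCombine hook, and as a last resort try
+// the commuted operand order to catch an existing CSE'd node. For example,
+// if both (add x, y) and (add y, x) exist in the DAG, the latter is replaced
+// by the former rather than kept as a duplicate.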
+SDValue DAGCombiner::combine(SDNode *N) {
+ SDValue RV = visit(N);
+
+ // If nothing happened, try a target-specific DAG combine.
+ if (RV.getNode() == 0) {
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ "Node was deleted but visit returned NULL!");
+
+ if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
+ TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
+
+ // Expose the DAG combiner to the target combiner impls.
+ TargetLowering::DAGCombinerInfo
+ DagCombineInfo(DAG, Level == Unrestricted, false, this);
+
+ RV = TLI.PerformDAGCombine(N, DagCombineInfo);
+ }
+ }
+
+ // If N is a commutative binary node, try commuting it to enable more
+ // sdisel CSE.
+ if (RV.getNode() == 0 &&
+ SelectionDAG::isCommutativeBinOp(N->getOpcode()) &&
+ N->getNumValues() == 1) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Constant operands are canonicalized to RHS.
+ if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) {
+ SDValue Ops[] = { N1, N0 };
+ SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(),
+ Ops, 2);
+ if (CSENode)
+ return SDValue(CSENode, 0);
+ }
+ }
+
+ return RV;
+}
+
+/// getInputChainForNode - Given a node, return its input chain if it has one,
+/// otherwise return a null SDValue.
+static SDValue getInputChainForNode(SDNode *N) {
+ if (unsigned NumOps = N->getNumOperands()) {
+ if (N->getOperand(0).getValueType() == MVT::Other)
+ return N->getOperand(0);
+ else if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
+ return N->getOperand(NumOps-1);
+ for (unsigned i = 1; i < NumOps-1; ++i)
+ if (N->getOperand(i).getValueType() == MVT::Other)
+ return N->getOperand(i);
+ }
+ return SDValue();
+}
+
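+// visitTokenFactor - Flatten nested TokenFactors into one and drop redundant
+// operands (entry tokens and duplicates), e.g.
+// (TokenFactor (TokenFactor a, b), b, EntryToken) -> (TokenFactor a, b).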
+SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
+ // If N has two operands, where one has an input chain equal to the other,
+ // the 'other' chain is redundant.
+ if (N->getNumOperands() == 2) {
+ if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
+ return N->getOperand(0);
+ if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
+ return N->getOperand(1);
+ }
+
+ SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
+ SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
+ SmallPtrSet<SDNode*, 16> SeenOps;
+ bool Changed = false; // If we should replace this token factor.
+
+ // Start out with this token factor.
+ TFs.push_back(N);
+
+ // Iterate through token factors. The TFs list grows as new token factors
+ // are encountered.
+ for (unsigned i = 0; i < TFs.size(); ++i) {
+ SDNode *TF = TFs[i];
+
+ // Check each of the operands.
+ for (unsigned i = 0, ie = TF->getNumOperands(); i != ie; ++i) {
+ SDValue Op = TF->getOperand(i);
+
+ switch (Op.getOpcode()) {
+ case ISD::EntryToken:
+ // Entry tokens don't need to be added to the list. They are
+ // redundant.
+ Changed = true;
+ break;
+
+ case ISD::TokenFactor:
+ if ((CombinerAA || Op.hasOneUse()) &&
+ std::find(TFs.begin(), TFs.end(), Op.getNode()) == TFs.end()) {
+ // Queue up for processing.
+ TFs.push_back(Op.getNode());
+ // Clean up in case the token factor is removed.
+ AddToWorkList(Op.getNode());
+ Changed = true;
+ break;
+ }
+ // Fall through.
+
+ default:
+ // Only add if it isn't already in the list.
+ if (SeenOps.insert(Op.getNode()))
+ Ops.push_back(Op);
+ else
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ SDValue Result;
+
+ // If we've changed things around, replace the token factor.
+ if (Changed) {
+ if (Ops.empty()) {
+ // The entry token is the only possible outcome.
+ Result = DAG.getEntryNode();
+ } else {
+ // New and improved token factor.
+ Result = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+ MVT::Other, &Ops[0], Ops.size());
+ }
+
+ // Don't add users to work list.
+ return CombineTo(N, Result, false);
+ }
+
+ return Result;
+}
+
+/// MERGE_VALUES can always be eliminated.
+SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
+ WorkListRemover DeadNodes(*this);
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i),
+ &DeadNodes);
+ removeFromWorkList(N);
+ DAG.DeleteNode(N);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
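+// combineShlAddConstant - Distribute a shift over an inner add-of-constant:
+// shifting left by c2 multiplies by 2^c2, so (shl (add x, c1), c2) equals
+// (add (shl x, c2), c1<<c2); the caller's remaining addend is then added on.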
+static
+SDValue combineShlAddConstant(DebugLoc DL, SDValue N0, SDValue N1,
+ SelectionDAG &DAG) {
+ MVT VT = N0.getValueType();
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N01);
+
+ if (N01C && N00.getOpcode() == ISD::ADD && N00.getNode()->hasOneUse() &&
+ isa<ConstantSDNode>(N00.getOperand(1))) {
+ // fold (add (shl (add x, c1), c2), y) -> (add (add (shl x, c2), c1<<c2), y)
+ N0 = DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT,
+ DAG.getNode(ISD::SHL, N00.getDebugLoc(), VT,
+ N00.getOperand(0), N01),
+ DAG.getNode(ISD::SHL, N01.getDebugLoc(), VT,
+ N00.getOperand(1), N01));
+ return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N0.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (add x, undef) -> undef
+ if (N0.getOpcode() == ISD::UNDEF)
+ return N0;
+ if (N1.getOpcode() == ISD::UNDEF)
+ return N1;
+ // fold (add c1, c2) -> c1+c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::ADD, VT, N0C, N1C);
+ // canonicalize constant to RHS
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1, N0);
+ // fold (add x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+ // fold (add Sym, c) -> Sym+c
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
+ if (!LegalOperations && TLI.isOffsetFoldingLegal(GA) && N1C &&
+ GA->getOpcode() == ISD::GlobalAddress)
+ return DAG.getGlobalAddress(GA->getGlobal(), VT,
+ GA->getOffset() +
+ (uint64_t)N1C->getSExtValue());
+ // fold ((c1-A)+c2) -> (c1+c2)-A
+ if (N1C && N0.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getOperand(0)))
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+ DAG.getConstant(N1C->getAPIntValue()+
+ N0C->getAPIntValue(), VT),
+ N0.getOperand(1));
+ // reassociate add
+ SDValue RADD = ReassociateOps(ISD::ADD, N->getDebugLoc(), N0, N1);
+ if (RADD.getNode() != 0)
+ return RADD;
+ // fold ((0-A) + B) -> B-A
+ if (N0.getOpcode() == ISD::SUB && isa<ConstantSDNode>(N0.getOperand(0)) &&
+ cast<ConstantSDNode>(N0.getOperand(0))->isNullValue())
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1, N0.getOperand(1));
+ // fold (A + (0-B)) -> A-B
+ if (N1.getOpcode() == ISD::SUB && isa<ConstantSDNode>(N1.getOperand(0)) &&
+ cast<ConstantSDNode>(N1.getOperand(0))->isNullValue())
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, N1.getOperand(1));
+ // fold (A+(B-A)) -> B
+ if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
+ return N1.getOperand(0);
+ // fold ((B-A)+A) -> B
+ if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
+ return N0.getOperand(0);
+ // fold (A+(B-(A+C))) to (B-C)
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+ N0 == N1.getOperand(1).getOperand(0))
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1.getOperand(0),
+ N1.getOperand(1).getOperand(1));
+ // fold (A+(B-(C+A))) to (B-C)
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+ N0 == N1.getOperand(1).getOperand(1))
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1.getOperand(0),
+ N1.getOperand(1).getOperand(0));
+ // fold (A+((B-A)+or-C)) to (B+or-C)
+ if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
+ N1.getOperand(0).getOpcode() == ISD::SUB &&
+ N0 == N1.getOperand(0).getOperand(1))
+ return DAG.getNode(N1.getOpcode(), N->getDebugLoc(), VT,
+ N1.getOperand(0).getOperand(0), N1.getOperand(1));
+
+ // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
+ if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+
+ if (isa<ConstantSDNode>(N00) || isa<ConstantSDNode>(N10))
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+ DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT, N00, N10),
+ DAG.getNode(ISD::ADD, N1.getDebugLoc(), VT, N01, N11));
+ }
+
+ if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (a+b) -> (a|b) iff a and b share no bits.
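+ // For example, (add (and x, 0xF0), (and y, 0x0F)) can produce no carry out
+ // of any bit position, so it is exactly (or (and x, 0xF0), (and y, 0x0F)).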
+ if (VT.isInteger() && !VT.isVector()) {
+ APInt LHSZero, LHSOne;
+ APInt RHSZero, RHSOne;
+ APInt Mask = APInt::getAllOnesValue(VT.getSizeInBits());
+ DAG.ComputeMaskedBits(N0, Mask, LHSZero, LHSOne);
+
+ if (LHSZero.getBoolValue()) {
+ DAG.ComputeMaskedBits(N1, Mask, RHSZero, RHSOne);
+
+ // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
+ // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
+ if ((RHSZero & (~LHSZero & Mask)) == (~LHSZero & Mask) ||
+ (LHSZero & (~RHSZero & Mask)) == (~RHSZero & Mask))
+ return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1);
+ }
+ }
+
+ // fold (add (shl (add x, c1), c2), y) -> (add (add (shl x, c2), c1<<c2), y)
+ if (N0.getOpcode() == ISD::SHL && N0.getNode()->hasOneUse()) {
+ SDValue Result = combineShlAddConstant(N->getDebugLoc(), N0, N1, DAG);
+ if (Result.getNode()) return Result;
+ }
+ if (N1.getOpcode() == ISD::SHL && N1.getNode()->hasOneUse()) {
+ SDValue Result = combineShlAddConstant(N->getDebugLoc(), N1, N0, DAG);
+ if (Result.getNode()) return Result;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADDC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N0.getValueType();
+
+ // If the flag result is dead, turn this into an ADD.
+ if (N->hasNUsesOfValue(0, 1))
+ return CombineTo(N, DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1, N0),
+ DAG.getNode(ISD::CARRY_FALSE,
+ N->getDebugLoc(), MVT::Flag));
+
+ // canonicalize constant to RHS.
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::ADDC, N->getDebugLoc(), N->getVTList(), N1, N0);
+
+ // fold (addc x, 0) -> x + no carry out
+ if (N1C && N1C->isNullValue())
+ return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
+ N->getDebugLoc(), MVT::Flag));
+
+ // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits.
+ APInt LHSZero, LHSOne;
+ APInt RHSZero, RHSOne;
+ APInt Mask = APInt::getAllOnesValue(VT.getSizeInBits());
+ DAG.ComputeMaskedBits(N0, Mask, LHSZero, LHSOne);
+
+ if (LHSZero.getBoolValue()) {
+ DAG.ComputeMaskedBits(N1, Mask, RHSZero, RHSOne);
+
+ // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
+ // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
+ if ((RHSZero & (~LHSZero & Mask)) == (~LHSZero & Mask) ||
+ (LHSZero & (~RHSZero & Mask)) == (~RHSZero & Mask))
+ return CombineTo(N, DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE,
+ N->getDebugLoc(), MVT::Flag));
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADDE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+
+ // canonicalize constant to RHS
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::ADDE, N->getDebugLoc(), N->getVTList(),
+ N1, N0, CarryIn);
+
+ // fold (adde x, y, false) -> (addc x, y)
+ if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
+ return DAG.getNode(ISD::ADDC, N->getDebugLoc(), N->getVTList(), N1, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUB(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ MVT VT = N0.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (sub x, x) -> 0
+ if (N0 == N1)
+ return DAG.getConstant(0, N->getValueType(0));
+ // fold (sub c1, c2) -> c1-c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::SUB, VT, N0C, N1C);
+ // fold (sub x, c) -> (add x, -c)
+ if (N1C)
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0,
+ DAG.getConstant(-N1C->getAPIntValue(), VT));
+ // fold (A+B)-A -> B
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
+ return N0.getOperand(1);
+ // fold (A+B)-B -> A
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
+ return N0.getOperand(0);
+ // fold ((A+(B+or-C))-B) -> A+or-C
+ if (N0.getOpcode() == ISD::ADD &&
+ (N0.getOperand(1).getOpcode() == ISD::SUB ||
+ N0.getOperand(1).getOpcode() == ISD::ADD) &&
+ N0.getOperand(1).getOperand(0) == N1)
+ return DAG.getNode(N0.getOperand(1).getOpcode(), N->getDebugLoc(), VT,
+ N0.getOperand(0), N0.getOperand(1).getOperand(1));
+ // fold ((A+(C+B))-B) -> A+C
+ if (N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1).getOpcode() == ISD::ADD &&
+ N0.getOperand(1).getOperand(1) == N1)
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT,
+ N0.getOperand(0), N0.getOperand(1).getOperand(0));
+ // fold ((A-(B-C))-C) -> A-B
+ if (N0.getOpcode() == ISD::SUB &&
+ N0.getOperand(1).getOpcode() == ISD::SUB &&
+ N0.getOperand(1).getOperand(1) == N1)
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+ N0.getOperand(0), N0.getOperand(1).getOperand(0));
+
+ // If either operand of a sub is undef, the result is undef
+ if (N0.getOpcode() == ISD::UNDEF)
+ return N0;
+ if (N1.getOpcode() == ISD::UNDEF)
+ return N1;
+
+ // If the relocation model supports it, consider symbol offsets.
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
+ if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
+ // fold (sub Sym, c) -> Sym-c
+ if (N1C && GA->getOpcode() == ISD::GlobalAddress)
+ return DAG.getGlobalAddress(GA->getGlobal(), VT,
+ GA->getOffset() -
+ (uint64_t)N1C->getSExtValue());
+ // fold (sub Sym+c1, Sym+c2) -> c1-c2
+ if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
+ if (GA->getGlobal() == GB->getGlobal())
+ return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
+ VT);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMUL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N0.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (mul x, undef) -> 0
+ if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+ // fold (mul c1, c2) -> c1*c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::MUL, VT, N0C, N1C);
+ // canonicalize constant to RHS
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::MUL, N->getDebugLoc(), VT, N1, N0);
+ // fold (mul x, 0) -> 0
+ if (N1C && N1C->isNullValue())
+ return N1;
+ // fold (mul x, -1) -> 0-x
+ if (N1C && N1C->isAllOnesValue())
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+ DAG.getConstant(0, VT), N0);
+ // fold (mul x, (1 << c)) -> x << c
+ if (N1C && N1C->getAPIntValue().isPowerOf2())
+ return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+ DAG.getConstant(N1C->getAPIntValue().logBase2(),
+ getShiftAmountTy()));
+ // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
+ if (N1C && (-N1C->getAPIntValue()).isPowerOf2()) {
+ unsigned Log2Val = (-N1C->getAPIntValue()).logBase2();
+ // FIXME: If the input is something that is easily negated (e.g. a
+ // single-use add), we should put the negate there.
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+ DAG.getConstant(0, VT),
+ DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+ DAG.getConstant(Log2Val, getShiftAmountTy())));
+ }
+ // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
+ if (N1C && N0.getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ SDValue C3 = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+ N1, N0.getOperand(1));
+ AddToWorkList(C3.getNode());
+ return DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+ N0.getOperand(0), C3);
+ }
+
+ // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
+ // use.
+ {
+ SDValue Sh(0,0), Y(0,0);
+ // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
+ if (N0.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N0.getOperand(1)) &&
+ N0.getNode()->hasOneUse()) {
+ Sh = N0; Y = N1;
+ } else if (N1.getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(N1.getOperand(1)) &&
+ N1.getNode()->hasOneUse()) {
+ Sh = N1; Y = N0;
+ }
+
+ if (Sh.getNode()) {
+ SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+ Sh.getOperand(0), Y);
+ return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+ Mul, Sh.getOperand(1));
+ }
+ }
+
+ // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
+ if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1)))
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT,
+ DAG.getNode(ISD::MUL, N0.getDebugLoc(), VT,
+ N0.getOperand(0), N1),
+ DAG.getNode(ISD::MUL, N1.getDebugLoc(), VT,
+ N0.getOperand(1), N1));
+
+ // reassociate mul
+ SDValue RMUL = ReassociateOps(ISD::MUL, N->getDebugLoc(), N0, N1);
+ if (RMUL.getNode() != 0)
+ return RMUL;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ MVT VT = N->getValueType(0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (sdiv c1, c2) -> c1/c2
+ if (N0C && N1C && !N1C->isNullValue())
+ return DAG.FoldConstantArithmetic(ISD::SDIV, VT, N0C, N1C);
+ // fold (sdiv X, 1) -> X
+ if (N1C && N1C->getSExtValue() == 1LL)
+ return N0;
+ // fold (sdiv X, -1) -> 0-X
+ if (N1C && N1C->isAllOnesValue())
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+ DAG.getConstant(0, VT), N0);
+ // If we know the sign bits of both operands are zero, strength reduce to a
+ // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
+ if (!VT.isVector()) {
+ if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UDIV, N->getDebugLoc(), N1.getValueType(),
+ N0, N1);
+ }
+ // fold (sdiv X, pow2) -> simple ops after legalize
+ if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap() &&
+ (isPowerOf2_64(N1C->getSExtValue()) ||
+ isPowerOf2_64(-N1C->getSExtValue()))) {
+ // If dividing by powers of two is cheap, then don't perform the following
+ // fold.
+ if (TLI.isPow2DivCheap())
+ return SDValue();
+
+ int64_t pow2 = N1C->getSExtValue();
+ int64_t abs2 = pow2 > 0 ? pow2 : -pow2;
+ unsigned lg2 = Log2_64(abs2);
+
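+ // Worked example for i32 (sdiv x, 4), i.e. lg2 == 2:
+ // SGN = (sra x, 31) -- 0 if x >= 0, -1 if x < 0
+ // SRL = (srl SGN, 30) -- 0 or 3 (abs2 - 1)
+ // ADD = (add x, SRL) -- bias negative x so the shift rounds toward zero
+ // SRA = (sra ADD, 2) -- the quotient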
+ // Splat the sign bit into the register
+ SDValue SGN = DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
+ DAG.getConstant(VT.getSizeInBits()-1,
+ getShiftAmountTy()));
+ AddToWorkList(SGN.getNode());
+
+ // Add (N0 < 0) ? abs2 - 1 : 0;
+ SDValue SRL = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, SGN,
+ DAG.getConstant(VT.getSizeInBits() - lg2,
+ getShiftAmountTy()));
+ SDValue ADD = DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, SRL);
+ AddToWorkList(SRL.getNode());
+ AddToWorkList(ADD.getNode()); // Divide by pow2
+ SDValue SRA = DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, ADD,
+ DAG.getConstant(lg2, getShiftAmountTy()));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (pow2 > 0)
+ return SRA;
+
+ AddToWorkList(SRA.getNode());
+ return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+ DAG.getConstant(0, VT), SRA);
+ }
+
+ // if integer divide is expensive and we satisfy the requirements, emit an
+ // alternate sequence.
+ if (N1C && (N1C->getSExtValue() < -1 || N1C->getSExtValue() > 1) &&
+ !TLI.isIntDivCheap()) {
+ SDValue Op = BuildSDIV(N);
+ if (Op.getNode()) return Op;
+ }
+
+ // undef / X -> 0
+ if (N0.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+ // X / undef -> undef
+ if (N1.getOpcode() == ISD::UNDEF)
+ return N1;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ MVT VT = N->getValueType(0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (udiv c1, c2) -> c1/c2
+ if (N0C && N1C && !N1C->isNullValue())
+ return DAG.FoldConstantArithmetic(ISD::UDIV, VT, N0C, N1C);
+ // fold (udiv x, (1 << c)) -> x >>u c
+ if (N1C && N1C->getAPIntValue().isPowerOf2())
+ return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0,
+ DAG.getConstant(N1C->getAPIntValue().logBase2(),
+ getShiftAmountTy()));
+ // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+ if (N1.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *SHC = dyn_cast<ConstantSDNode>(N1.getOperand(0))) {
+ if (SHC->getAPIntValue().isPowerOf2()) {
+ MVT ADDVT = N1.getOperand(1).getValueType();
+ SDValue Add = DAG.getNode(ISD::ADD, N->getDebugLoc(), ADDVT,
+ N1.getOperand(1),
+ DAG.getConstant(SHC->getAPIntValue()
+ .logBase2(),
+ ADDVT));
+ AddToWorkList(Add.getNode());
+ return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0, Add);
+ }
+ }
+ }
+ // fold (udiv x, c) -> alternate
+ if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap()) {
+ SDValue Op = BuildUDIV(N);
+ if (Op.getNode()) return Op;
+ }
+
+ // undef / X -> 0
+ if (N0.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+ // X / undef -> undef
+ if (N1.getOpcode() == ISD::UNDEF)
+ return N1;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSREM(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold (srem c1, c2) -> c1%c2
+ if (N0C && N1C && !N1C->isNullValue())
+ return DAG.FoldConstantArithmetic(ISD::SREM, VT, N0C, N1C);
+ // If we know the sign bits of both operands are zero, strength reduce to a
+ // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
+ if (!VT.isVector()) {
+ if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UREM, N->getDebugLoc(), VT, N0, N1);
+ }
+
+ // If X/C can be simplified by the division-by-constant logic, lower
+ // X%C to the equivalent of X-X/C*C.
+ if (N1C && !N1C->isNullValue()) {
+ SDValue Div = DAG.getNode(ISD::SDIV, N->getDebugLoc(), VT, N0, N1);
+ AddToWorkList(Div.getNode());
+ SDValue OptimizedDiv = combine(Div.getNode());
+ if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
+ SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+ OptimizedDiv, N1);
+ SDValue Sub = DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, Mul);
+ AddToWorkList(Mul.getNode());
+ return Sub;
+ }
+ }
+
+ // undef % X -> 0
+ if (N0.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+ // X % undef -> undef
+ if (N1.getOpcode() == ISD::UNDEF)
+ return N1;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUREM(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold (urem c1, c2) -> c1%c2
+ if (N0C && N1C && !N1C->isNullValue())
+ return DAG.FoldConstantArithmetic(ISD::UREM, VT, N0C, N1C);
+ // fold (urem x, pow2) -> (and x, pow2-1)
+ if (N1C && !N1C->isNullValue() && N1C->getAPIntValue().isPowerOf2())
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0,
+ DAG.getConstant(N1C->getAPIntValue()-1,VT));
+ // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+ if (N1.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *SHC = dyn_cast<ConstantSDNode>(N1.getOperand(0))) {
+ if (SHC->getAPIntValue().isPowerOf2()) {
+ SDValue Add =
+ DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1,
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()),
+ VT));
+ AddToWorkList(Add.getNode());
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, Add);
+ }
+ }
+ }
+
+ // If X/C can be simplified by the division-by-constant logic, lower
+ // X%C to the equivalent of X-X/C*C.
+ if (N1C && !N1C->isNullValue()) {
+ SDValue Div = DAG.getNode(ISD::UDIV, N->getDebugLoc(), VT, N0, N1);
+ AddToWorkList(Div.getNode());
+ SDValue OptimizedDiv = combine(Div.getNode());
+ if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
+ SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+ OptimizedDiv, N1);
+ SDValue Sub = DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, Mul);
+ AddToWorkList(Mul.getNode());
+ return Sub;
+ }
+ }
+
+ // undef % X -> 0
+ if (N0.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+ // X % undef -> undef
+ if (N1.getOpcode() == ISD::UNDEF)
+ return N1;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold (mulhs x, 0) -> 0
+ if (N1C && N1C->isNullValue())
+ return N1;
+ // fold (mulhs x, 1) -> (sra x, size(x)-1)
+ if (N1C && N1C->getAPIntValue() == 1)
+ return DAG.getNode(ISD::SRA, N->getDebugLoc(), N0.getValueType(), N0,
+ DAG.getConstant(N0.getValueType().getSizeInBits() - 1,
+ getShiftAmountTy()));
+ // fold (mulhs x, undef) -> 0
+ if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHU(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold (mulhu x, 0) -> 0
+ if (N1C && N1C->isNullValue())
+ return N1;
+ // fold (mulhu x, 1) -> 0
+ if (N1C && N1C->getAPIntValue() == 1)
+ return DAG.getConstant(0, N0.getValueType());
+ // fold (mulhu x, undef) -> 0
+ if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+
+ return SDValue();
+}
+
+/// SimplifyNodeWithTwoResults - Perform optimizations common to nodes that
+/// compute two values. LoOp and HiOp give the opcodes for the two computations
+/// that are being performed. Return the simplified value if a simplification
+/// was made, or a null SDValue otherwise.
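+/// For example, if only the low half of an SMUL_LOHI is used, the node is
+/// replaced by a plain MUL of the same operands.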
+///
+SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+ unsigned HiOp) {
+ // If the high half is not needed, just compute the low half.
+ bool HiExists = N->hasAnyUseOfValue(1);
+ if (!HiExists &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(LoOp, N->getValueType(0)))) {
+ SDValue Res = DAG.getNode(LoOp, N->getDebugLoc(), N->getValueType(0),
+ N->op_begin(), N->getNumOperands());
+ return CombineTo(N, Res, Res);
+ }
+
+ // If the low half is not needed, just compute the high half.
+ bool LoExists = N->hasAnyUseOfValue(0);
+ if (!LoExists &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
+ SDValue Res = DAG.getNode(HiOp, N->getDebugLoc(), N->getValueType(1),
+ N->op_begin(), N->getNumOperands());
+ return CombineTo(N, Res, Res);
+ }
+
+ // If both halves are used, there is nothing to simplify here.
+ if (LoExists && HiExists)
+ return SDValue();
+
+ // If the two computed results can be simplified separately, separate them.
+ if (LoExists) {
+ SDValue Lo = DAG.getNode(LoOp, N->getDebugLoc(), N->getValueType(0),
+ N->op_begin(), N->getNumOperands());
+ AddToWorkList(Lo.getNode());
+ SDValue LoOpt = combine(Lo.getNode());
+ if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType())))
+ return CombineTo(N, LoOpt, LoOpt);
+ }
+
+ if (HiExists) {
+ SDValue Hi = DAG.getNode(HiOp, N->getDebugLoc(), N->getValueType(1),
+ N->op_begin(), N->getNumOperands());
+ AddToWorkList(Hi.getNode());
+ SDValue HiOpt = combine(Hi.getNode());
+ if (HiOpt.getNode() && HiOpt != Hi &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType())))
+ return CombineTo(N, HiOpt, HiOpt);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
+ SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS);
+ if (Res.getNode()) return Res;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
+ SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU);
+ if (Res.getNode()) return Res;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSDIVREM(SDNode *N) {
+ SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM);
+ if (Res.getNode()) return Res;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIVREM(SDNode *N) {
+ SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM);
+ if (Res.getNode()) return Res;
+
+ return SDValue();
+}
+
+/// SimplifyBinOpWithSameOpcodeHands - If this is a binary operator with
+/// two operands of the same opcode, try to simplify it.
+SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ MVT VT = N0.getValueType();
+ assert(N0.getOpcode() == N1.getOpcode() && "Bad input!");
+
+ // For each of OP in AND/OR/XOR:
+ // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
+ // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
+ // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
+ // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
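+ // For example, (and (zext i8 x to i32), (zext i8 y to i32)) becomes
+ // (zext (and x, y) to i32), performing the bitwise op in the narrow type.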
+ if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND||
+ N0.getOpcode() == ISD::SIGN_EXTEND ||
+ (N0.getOpcode() == ISD::TRUNCATE &&
+ !TLI.isTruncateFree(N0.getOperand(0).getValueType(), VT))) &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+ SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
+ N0.getOperand(0).getValueType(),
+ N0.getOperand(0), N1.getOperand(0));
+ AddToWorkList(ORNode.getNode());
+ return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, ORNode);
+ }
+
+ // For each of OP in SHL/SRL/SRA/AND...
+ // fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
+ // fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z)
+ // fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
+ if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
+ N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
+ N0.getOperand(1) == N1.getOperand(1)) {
+ SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
+ N0.getOperand(0).getValueType(),
+ N0.getOperand(0), N1.getOperand(0));
+ AddToWorkList(ORNode.getNode());
+ return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+ ORNode, N0.getOperand(1));
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitAND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue LL, LR, RL, RR, CC0, CC1;
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N1.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (and x, undef) -> 0
+ if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+ // fold (and c1, c2) -> c1&c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::AND, VT, N0C, N1C);
+ // canonicalize constant to RHS
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N1, N0);
+ // fold (and x, -1) -> x
+ if (N1C && N1C->isAllOnesValue())
+ return N0;
+ // if (and x, c) is known to be zero, return 0
+ if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
+ APInt::getAllOnesValue(BitWidth)))
+ return DAG.getConstant(0, VT);
+ // reassociate and
+ SDValue RAND = ReassociateOps(ISD::AND, N->getDebugLoc(), N0, N1);
+ if (RAND.getNode() != 0)
+ return RAND;
+ // fold (and (or x, 0xFFFF), 0xFF) -> 0xFF
+ if (N1C && N0.getOpcode() == ISD::OR)
+ if (ConstantSDNode *ORI = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+ if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue())
+ return N1;
+ // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
+ if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+ SDValue N0Op0 = N0.getOperand(0);
+ APInt Mask = ~N1C->getAPIntValue();
+ Mask.trunc(N0Op0.getValueSizeInBits());
+ if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(),
+ N0.getValueType(), N0Op0);
+
+ // Replace uses of the AND with uses of the Zero extend node.
+ CombineTo(N, Zext);
+
+ // We actually want to replace all uses of the any_extend with the
+ // zero_extend, to avoid duplicating things. This will later cause this
+ // AND to be folded.
+ CombineTo(N0.getNode(), Zext);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
+ if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
+ ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
+ ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
+
+ if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
+ LL.getValueType().isInteger()) {
+ // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0)
+ if (cast<ConstantSDNode>(LR)->isNullValue() && Op1 == ISD::SETEQ) {
+ SDValue ORNode = DAG.getNode(ISD::OR, N0.getDebugLoc(),
+ LR.getValueType(), LL, RL);
+ AddToWorkList(ORNode.getNode());
+ return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+ }
+ // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
+ if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETEQ) {
+ SDValue ANDNode = DAG.getNode(ISD::AND, N0.getDebugLoc(),
+ LR.getValueType(), LL, RL);
+ AddToWorkList(ANDNode.getNode());
+ return DAG.getSetCC(N->getDebugLoc(), VT, ANDNode, LR, Op1);
+ }
+ // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1)
+ if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETGT) {
+ SDValue ORNode = DAG.getNode(ISD::OR, N0.getDebugLoc(),
+ LR.getValueType(), LL, RL);
+ AddToWorkList(ORNode.getNode());
+ return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+ }
+ }
+ // canonicalize equivalent to ll == rl
+ if (LL == RR && LR == RL) {
+ Op1 = ISD::getSetCCSwappedOperands(Op1);
+ std::swap(RL, RR);
+ }
+ if (LL == RL && LR == RR) {
+ bool isInteger = LL.getValueType().isInteger();
+ ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger);
+ if (Result != ISD::SETCC_INVALID &&
+ (!LegalOperations || TLI.isCondCodeLegal(Result, LL.getValueType())))
+ return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(),
+ LL, LR, Result);
+ }
+ }
+
+ // Simplify: (and (op x...), (op y...)) -> (op (and x, y))
+ if (N0.getOpcode() == N1.getOpcode()) {
+ SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+ if (Tmp.getNode()) return Tmp;
+ }
+
+ // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
+ // fold (and (sra)) -> (and (srl)) when possible.
+ if (!VT.isVector() &&
+ SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+ // fold (zext_inreg (extload x)) -> (zextload x)
+ if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ MVT EVT = LN0->getMemoryVT();
+ // If we zero all the possible extended bits, then we can turn this into
+ // a zextload if we are running before legalize or the operation is legal.
+ unsigned BitWidth = N1.getValueSizeInBits();
+ if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
+ BitWidth - EVT.getSizeInBits())) &&
+ ((!LegalOperations && !LN0->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, EVT))) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT,
+ LN0->getChain(), LN0->getBasePtr(),
+ LN0->getSrcValue(),
+ LN0->getSrcValueOffset(), EVT,
+ LN0->isVolatile(), LN0->getAlignment());
+ AddToWorkList(N);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
+ if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ MVT EVT = LN0->getMemoryVT();
+ // If we zero all the possible extended bits, then we can turn this into
+ // a zextload if we are running before legalize or the operation is legal.
+ unsigned BitWidth = N1.getValueSizeInBits();
+ if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
+ BitWidth - EVT.getSizeInBits())) &&
+ ((!LegalOperations && !LN0->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, EVT))) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(), EVT,
+ LN0->isVolatile(), LN0->getAlignment());
+ AddToWorkList(N);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (and (load x), 255) -> (zextload x, i8)
+ // fold (and (extload x, i16), 255) -> (zextload x, i8)
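+ // For example, (and (load i32 p), 255) only needs the low byte, so it can
+ // become (zextload i8). Little-endian targets read that byte at p itself;
+ // big-endian targets must advance the pointer by LVTStoreBytes -
+ // EVTStoreBytes (3 here), which is the PtrOff adjustment made below.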
+ if (N1C && N0.getOpcode() == ISD::LOAD) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ if (LN0->getExtensionType() != ISD::SEXTLOAD &&
+ LN0->isUnindexed() && N0.hasOneUse() &&
+ // Do not change the width of a volatile load.
+ !LN0->isVolatile()) {
+ MVT EVT = MVT::Other;
+ uint32_t ActiveBits = N1C->getAPIntValue().getActiveBits();
+ if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue()))
+ EVT = MVT::getIntegerVT(ActiveBits);
+
+ MVT LoadedVT = LN0->getMemoryVT();
+
+ // Do not generate loads of non-round integer types since these can
+ // be expensive (and would be wrong if the type is not byte sized).
+ if (EVT != MVT::Other && LoadedVT.bitsGT(EVT) && EVT.isRound() &&
+ (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, EVT))) {
+ MVT PtrType = N0.getOperand(1).getValueType();
+
+ // For big endian targets, we need to add an offset to the pointer to
+ // load the correct bytes. For little endian systems, we merely need to
+ // read fewer bytes from the same pointer.
+ unsigned LVTStoreBytes = LoadedVT.getStoreSizeInBits()/8;
+ unsigned EVTStoreBytes = EVT.getStoreSizeInBits()/8;
+ unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
+ unsigned Alignment = LN0->getAlignment();
+ SDValue NewPtr = LN0->getBasePtr();
+
+ if (TLI.isBigEndian()) {
+ NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(), PtrType,
+ NewPtr, DAG.getConstant(PtrOff, PtrType));
+ Alignment = MinAlign(Alignment, PtrOff);
+ }
+
+ AddToWorkList(NewPtr.getNode());
+ SDValue Load =
+ DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), VT, LN0->getChain(),
+ NewPtr, LN0->getSrcValue(), LN0->getSrcValueOffset(),
+ EVT, LN0->isVolatile(), Alignment);
+ AddToWorkList(N);
+ CombineTo(N0.getNode(), Load, Load.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue LL, LR, RL, RR, CC0, CC1;
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N1.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (or x, undef) -> -1
+ if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(~0ULL, VT);
+ // fold (or c1, c2) -> c1|c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C);
+ // canonicalize constant to RHS
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N1, N0);
+ // fold (or x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+ // fold (or x, -1) -> -1
+ if (N1C && N1C->isAllOnesValue())
+ return N1;
+ // fold (or x, c) -> c iff (x & ~c) == 0
+ if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
+ return N1;
+ // reassociate or
+ SDValue ROR = ReassociateOps(ISD::OR, N->getDebugLoc(), N0, N1);
+ if (ROR.getNode() != 0)
+ return ROR;
+ // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
+ if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ DAG.getNode(ISD::OR, N0.getDebugLoc(), VT,
+ N0.getOperand(0), N1),
+ DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1));
+ }
+ // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
+ if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
+ ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
+ ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
+
+ if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
+ LL.getValueType().isInteger()) {
+ // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0)
+ // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0)
+ if (cast<ConstantSDNode>(LR)->isNullValue() &&
+ (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
+ SDValue ORNode = DAG.getNode(ISD::OR, LR.getDebugLoc(),
+ LR.getValueType(), LL, RL);
+ AddToWorkList(ORNode.getNode());
+ return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+ }
+ // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
+ // fold (or (setgt X, -1), (setgt Y, -1)) -> (setgt (and X, Y), -1)
+ if (cast<ConstantSDNode>(LR)->isAllOnesValue() &&
+ (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
+ SDValue ANDNode = DAG.getNode(ISD::AND, LR.getDebugLoc(),
+ LR.getValueType(), LL, RL);
+ AddToWorkList(ANDNode.getNode());
+ return DAG.getSetCC(N->getDebugLoc(), VT, ANDNode, LR, Op1);
+ }
+ }
+ // canonicalize equivalent to ll == rl
+ if (LL == RR && LR == RL) {
+ Op1 = ISD::getSetCCSwappedOperands(Op1);
+ std::swap(RL, RR);
+ }
+ if (LL == RL && LR == RR) {
+ bool isInteger = LL.getValueType().isInteger();
+ ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger);
+ if (Result != ISD::SETCC_INVALID &&
+ (!LegalOperations || TLI.isCondCodeLegal(Result, LL.getValueType())))
+ return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(),
+ LL, LR, Result);
+ }
+ }
+
+ // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
+ if (N0.getOpcode() == N1.getOpcode()) {
+ SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+ if (Tmp.getNode()) return Tmp;
+ }
+
+ // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
+ if (N0.getOpcode() == ISD::AND &&
+ N1.getOpcode() == ISD::AND &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ N1.getOperand(1).getOpcode() == ISD::Constant &&
+ // Don't increase # computations.
+ (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+ // We can only do this xform if we know that bits from X that are set in C2
+ // but not in C1 are already zero. Likewise for Y.
+ const APInt &LHSMask =
+ cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ const APInt &RHSMask =
+ cast<ConstantSDNode>(N1.getOperand(1))->getAPIntValue();
+
+ if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
+ DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
+ SDValue X = DAG.getNode(ISD::OR, N0.getDebugLoc(), VT,
+ N0.getOperand(0), N1.getOperand(0));
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, X,
+ DAG.getConstant(LHSMask | RHSMask, VT));
+ }
+ }
+
+ // See if this is some rotate idiom.
+ if (SDNode *Rot = MatchRotate(N0, N1, N->getDebugLoc()))
+ return SDValue(Rot, 0);
+
+ return SDValue();
+}
+
+/// MatchRotateHalf - Match "(X shl/srl V1) & V2" where V2 may not be present.
+static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
+ if (Op.getOpcode() == ISD::AND) {
+ if (isa<ConstantSDNode>(Op.getOperand(1))) {
+ Mask = Op.getOperand(1);
+ Op = Op.getOperand(0);
+ } else {
+ return false;
+ }
+ }
+
+ if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
+ Shift = Op;
+ return true;
+ }
+
+ return false;
+}
+
+// MatchRotate - Handle an 'or' of two operands. If this is one of the many
+// idioms for rotate, and if the target supports rotation instructions, generate
+// a rot[lr].
+SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
+ // Must be a legal type. Expanded and promoted types won't work with rotates.
+ MVT VT = LHS.getValueType();
+ if (!TLI.isTypeLegal(VT)) return 0;
+
+ // The target must have at least one rotate flavor.
+ bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT);
+ bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
+ if (!HasROTL && !HasROTR) return 0;
+
+ // Match "(X shl/srl V1) & V2" where V2 may not be present.
+ SDValue LHSShift; // The shift.
+ SDValue LHSMask; // AND value if any.
+ if (!MatchRotateHalf(LHS, LHSShift, LHSMask))
+ return 0; // Not part of a rotate.
+
+ SDValue RHSShift; // The shift.
+ SDValue RHSMask; // AND value if any.
+ if (!MatchRotateHalf(RHS, RHSShift, RHSMask))
+ return 0; // Not part of a rotate.
+
+ if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
+ return 0; // Not shifting the same value.
+
+ if (LHSShift.getOpcode() == RHSShift.getOpcode())
+ return 0; // Shifts must disagree.
+
+ // Canonicalize shl to left side in a shl/srl pair.
+ if (RHSShift.getOpcode() == ISD::SHL) {
+ std::swap(LHS, RHS);
+ std::swap(LHSShift, RHSShift);
+ std::swap(LHSMask, RHSMask);
+ }
+
+ unsigned OpSizeInBits = VT.getSizeInBits();
+ SDValue LHSShiftArg = LHSShift.getOperand(0);
+ SDValue LHSShiftAmt = LHSShift.getOperand(1);
+ SDValue RHSShiftAmt = RHSShift.getOperand(1);
+
+ // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
+ // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
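+ // For i32, (or (shl x, 8), (srl x, 24)) meets LShVal + RShVal == 32 and
+ // becomes (rotl x, 8) or, equivalently, (rotr x, 24), depending on which
+ // rotate the target supports.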
+ if (LHSShiftAmt.getOpcode() == ISD::Constant &&
+ RHSShiftAmt.getOpcode() == ISD::Constant) {
+ uint64_t LShVal = cast<ConstantSDNode>(LHSShiftAmt)->getZExtValue();
+ uint64_t RShVal = cast<ConstantSDNode>(RHSShiftAmt)->getZExtValue();
+ if ((LShVal + RShVal) != OpSizeInBits)
+ return 0;
+
+ SDValue Rot;
+ if (HasROTL)
+ Rot = DAG.getNode(ISD::ROTL, DL, VT, LHSShiftArg, LHSShiftAmt);
+ else
+ Rot = DAG.getNode(ISD::ROTR, DL, VT, LHSShiftArg, RHSShiftAmt);
+
+ // If there is an AND of either shifted operand, apply it to the result.
+ if (LHSMask.getNode() || RHSMask.getNode()) {
+ APInt Mask = APInt::getAllOnesValue(OpSizeInBits);
+
+ if (LHSMask.getNode()) {
+ APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal);
+ Mask &= cast<ConstantSDNode>(LHSMask)->getAPIntValue() | RHSBits;
+ }
+ if (RHSMask.getNode()) {
+ APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal);
+ Mask &= cast<ConstantSDNode>(RHSMask)->getAPIntValue() | LHSBits;
+ }
+
+ Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, VT));
+ }
+
+ return Rot.getNode();
+ }
+
+ // If there is a mask here, and we have a variable shift, we can't be sure
+ // that we're masking out the right bits.
+ if (LHSMask.getNode() || RHSMask.getNode())
+ return 0;
+
+ // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotl x, y)
+ // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotr x, (sub 32, y))
+ if (RHSShiftAmt.getOpcode() == ISD::SUB &&
+ LHSShiftAmt == RHSShiftAmt.getOperand(1)) {
+ if (ConstantSDNode *SUBC =
+ dyn_cast<ConstantSDNode>(RHSShiftAmt.getOperand(0))) {
+ if (SUBC->getAPIntValue() == OpSizeInBits) {
+ if (HasROTL)
+ return DAG.getNode(ISD::ROTL, DL, VT,
+ LHSShiftArg, LHSShiftAmt).getNode();
+ else
+ return DAG.getNode(ISD::ROTR, DL, VT,
+ LHSShiftArg, RHSShiftAmt).getNode();
+ }
+ }
+ }
+
+ // fold (or (shl x, (sub 32, y)), (srl x, y)) -> (rotr x, y)
+ // fold (or (shl x, (sub 32, y)), (srl x, y)) -> (rotl x, (sub 32, y))
+ if (LHSShiftAmt.getOpcode() == ISD::SUB &&
+ RHSShiftAmt == LHSShiftAmt.getOperand(1)) {
+ if (ConstantSDNode *SUBC =
+ dyn_cast<ConstantSDNode>(LHSShiftAmt.getOperand(0))) {
+ if (SUBC->getAPIntValue() == OpSizeInBits) {
+ if (HasROTR)
+ return DAG.getNode(ISD::ROTR, DL, VT,
+ LHSShiftArg, RHSShiftAmt).getNode();
+ else
+ return DAG.getNode(ISD::ROTL, DL, VT,
+ LHSShiftArg, LHSShiftAmt).getNode();
+ }
+ }
+ }
+
+  // Look for shift amounts that are sign/zero/any-extended or truncated:
+ if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND
+ || LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND
+ || LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND
+ || LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
+ (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND
+ || RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND
+ || RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND
+ || RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
+ SDValue LExtOp0 = LHSShiftAmt.getOperand(0);
+ SDValue RExtOp0 = RHSShiftAmt.getOperand(0);
+ if (RExtOp0.getOpcode() == ISD::SUB &&
+ RExtOp0.getOperand(1) == LExtOp0) {
+ // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+ // (rotl x, y)
+ // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+ // (rotr x, (sub 32, y))
+ if (ConstantSDNode *SUBC =
+ dyn_cast<ConstantSDNode>(RExtOp0.getOperand(0))) {
+ if (SUBC->getAPIntValue() == OpSizeInBits) {
+ return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
+ LHSShiftArg,
+ HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
+ }
+ }
+ } else if (LExtOp0.getOpcode() == ISD::SUB &&
+ RExtOp0 == LExtOp0.getOperand(1)) {
+ // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+ // (rotr x, y)
+ // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+ // (rotl x, (sub 32, y))
+ if (ConstantSDNode *SUBC =
+ dyn_cast<ConstantSDNode>(LExtOp0.getOperand(0))) {
+ if (SUBC->getAPIntValue() == OpSizeInBits) {
+ return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT,
+ LHSShiftArg,
+ HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+SDValue DAGCombiner::visitXOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue LHS, RHS, CC;
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N0.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
+ if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF)
+ return DAG.getConstant(0, VT);
+ // fold (xor x, undef) -> undef
+ if (N0.getOpcode() == ISD::UNDEF)
+ return N0;
+ if (N1.getOpcode() == ISD::UNDEF)
+ return N1;
+ // fold (xor c1, c2) -> c1^c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::XOR, VT, N0C, N1C);
+ // canonicalize constant to RHS
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0);
+ // fold (xor x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+ // reassociate xor
+ SDValue RXOR = ReassociateOps(ISD::XOR, N->getDebugLoc(), N0, N1);
+ if (RXOR.getNode() != 0)
+ return RXOR;
+
+ // fold !(x cc y) -> (x !cc y)
+ if (N1C && N1C->getAPIntValue() == 1 && isSetCCEquivalent(N0, LHS, RHS, CC)) {
+ bool isInt = LHS.getValueType().isInteger();
+ ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ isInt);
+
+ if (!LegalOperations || TLI.isCondCodeLegal(NotCC, LHS.getValueType())) {
+ switch (N0.getOpcode()) {
+ default:
+ assert(0 && "Unhandled SetCC Equivalent!");
+ abort();
+ case ISD::SETCC:
+ return DAG.getSetCC(N->getDebugLoc(), VT, LHS, RHS, NotCC);
+ case ISD::SELECT_CC:
+ return DAG.getSelectCC(N->getDebugLoc(), LHS, RHS, N0.getOperand(2),
+ N0.getOperand(3), NotCC);
+ }
+ }
+ }
+
+ // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
+ if (N1C && N1C->getAPIntValue() == 1 && N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getNode()->hasOneUse() &&
+ isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
+ SDValue V = N0.getOperand(0);
+ V = DAG.getNode(ISD::XOR, N0.getDebugLoc(), V.getValueType(), V,
+ DAG.getConstant(1, V.getValueType()));
+ AddToWorkList(V.getNode());
+ return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, V);
+ }
+
+  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y is a setcc;
+  // likewise (not (and x, y)) -> (or (not x), (not y)).
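+  // e.g. for i1: (xor (or a, b), 1) -> (and (xor a, 1), (xor b, 1)) when a
+  // or b is a setcc; this is De Morgan's law with NOT written as (xor _, 1).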
+ if (N1C && N1C->getAPIntValue() == 1 && VT == MVT::i1 &&
+ (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+ SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+ if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
+ unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+ LHS = DAG.getNode(ISD::XOR, LHS.getDebugLoc(), VT, LHS, N1); // LHS = ~LHS
+ RHS = DAG.getNode(ISD::XOR, RHS.getDebugLoc(), VT, RHS, N1); // RHS = ~RHS
+ AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
+ return DAG.getNode(NewOpcode, N->getDebugLoc(), VT, LHS, RHS);
+ }
+ }
+  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y is a constant;
+  // likewise (not (and x, y)) -> (or (not x), (not y)).
+ if (N1C && N1C->isAllOnesValue() &&
+ (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+ SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+ if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
+ unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+ LHS = DAG.getNode(ISD::XOR, LHS.getDebugLoc(), VT, LHS, N1); // LHS = ~LHS
+ RHS = DAG.getNode(ISD::XOR, RHS.getDebugLoc(), VT, RHS, N1); // RHS = ~RHS
+ AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
+ return DAG.getNode(NewOpcode, N->getDebugLoc(), VT, LHS, RHS);
+ }
+ }
+ // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2))
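+  // e.g. (xor (xor x, 5), 3) -> (xor x, 6), since 5 ^ 3 == 6.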
+ if (N1C && N0.getOpcode() == ISD::XOR) {
+ ConstantSDNode *N00C = dyn_cast<ConstantSDNode>(N0.getOperand(0));
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (N00C)
+ return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N0.getOperand(1),
+ DAG.getConstant(N1C->getAPIntValue() ^
+ N00C->getAPIntValue(), VT));
+ if (N01C)
+ return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getConstant(N1C->getAPIntValue() ^
+ N01C->getAPIntValue(), VT));
+ }
+ // fold (xor x, x) -> 0
+ if (N0 == N1) {
+ if (!VT.isVector()) {
+ return DAG.getConstant(0, VT);
+ } else if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)){
+ // Produce a vector of zeros.
+ SDValue El = DAG.getConstant(0, VT.getVectorElementType());
+ std::vector<SDValue> Ops(VT.getVectorNumElements(), El);
+ return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+ &Ops[0], Ops.size());
+ }
+ }
+
+ // Simplify: xor (op x...), (op y...) -> (op (xor x, y))
+ if (N0.getOpcode() == N1.getOpcode()) {
+ SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+ if (Tmp.getNode()) return Tmp;
+ }
+
+ // Simplify the expression using non-local knowledge.
+ if (!VT.isVector() &&
+ SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// visitShiftByConstant - Handle transforms common to the three shifts, when
+/// the shift amount is a constant.
+SDValue DAGCombiner::visitShiftByConstant(SDNode *N, unsigned Amt) {
+ SDNode *LHS = N->getOperand(0).getNode();
+ if (!LHS->hasOneUse()) return SDValue();
+
+ // We want to pull some binops through shifts, so that we have (and (shift))
+ // instead of (shift (and)), likewise for add, or, xor, etc. This sort of
+ // thing happens with address calculations, so it's important to canonicalize
+ // it.
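+  // e.g. (shl (add (shl x, 4), 16), 2) -> (add (shl (shl x, 4), 2), 64); the
+  // constant 16 is shifted along, and the inner shifts fold together later.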
+ bool HighBitSet = false; // Can we transform this if the high bit is set?
+
+ switch (LHS->getOpcode()) {
+ default: return SDValue();
+ case ISD::OR:
+ case ISD::XOR:
+ HighBitSet = false; // We can only transform sra if the high bit is clear.
+ break;
+ case ISD::AND:
+ HighBitSet = true; // We can only transform sra if the high bit is set.
+ break;
+ case ISD::ADD:
+ if (N->getOpcode() != ISD::SHL)
+ return SDValue(); // only shl(add) not sr[al](add).
+ HighBitSet = false; // We can only transform sra if the high bit is clear.
+ break;
+ }
+
+ // We require the RHS of the binop to be a constant as well.
+ ConstantSDNode *BinOpCst = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+ if (!BinOpCst) return SDValue();
+
+ // FIXME: disable this unless the input to the binop is a shift by a constant.
+ // If it is not a shift, it pessimizes some common cases like:
+ //
+ // void foo(int *X, int i) { X[i & 1235] = 1; }
+ // int bar(int *X, int i) { return X[i & 255]; }
+ SDNode *BinOpLHSVal = LHS->getOperand(0).getNode();
+ if ((BinOpLHSVal->getOpcode() != ISD::SHL &&
+ BinOpLHSVal->getOpcode() != ISD::SRA &&
+ BinOpLHSVal->getOpcode() != ISD::SRL) ||
+ !isa<ConstantSDNode>(BinOpLHSVal->getOperand(1)))
+ return SDValue();
+
+ MVT VT = N->getValueType(0);
+
+ // If this is a signed shift right, and the high bit is modified by the
+ // logical operation, do not perform the transformation. The highBitSet
+ // boolean indicates the value of the high bit of the constant which would
+ // cause it to be modified for this operation.
+ if (N->getOpcode() == ISD::SRA) {
+ bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative();
+ if (BinOpRHSSignSet != HighBitSet)
+ return SDValue();
+ }
+
+ // Fold the constants, shifting the binop RHS by the shift amount.
+ SDValue NewRHS = DAG.getNode(N->getOpcode(), LHS->getOperand(1).getDebugLoc(),
+ N->getValueType(0),
+ LHS->getOperand(1), N->getOperand(1));
+
+ // Create the new shift.
+ SDValue NewShift = DAG.getNode(N->getOpcode(), LHS->getOperand(0).getDebugLoc(),
+ VT, LHS->getOperand(0), N->getOperand(1));
+
+ // Create the new binop.
+ return DAG.getNode(LHS->getOpcode(), N->getDebugLoc(), VT, NewShift, NewRHS);
+}
+
+SDValue DAGCombiner::visitSHL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getSizeInBits();
+
+ // fold (shl c1, c2) -> c1<<c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::SHL, VT, N0C, N1C);
+ // fold (shl 0, x) -> 0
+ if (N0C && N0C->isNullValue())
+ return N0;
+ // fold (shl x, c >= size(x)) -> undef
+ if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+ return DAG.getUNDEF(VT);
+ // fold (shl x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+ // if (shl x, c) is known to be zero, return 0
+ if (DAG.MaskedValueIsZero(SDValue(N, 0),
+ APInt::getAllOnesValue(VT.getSizeInBits())))
+ return DAG.getConstant(0, VT);
+ // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND &&
+ N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+ SDValue N101 = N1.getOperand(0).getOperand(1);
+ if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+ MVT TruncVT = N1.getValueType();
+ SDValue N100 = N1.getOperand(0).getOperand(0);
+ APInt TruncC = N101C->getAPIntValue();
+ TruncC.trunc(TruncVT.getSizeInBits());
+ return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+ DAG.getNode(ISD::AND, N->getDebugLoc(), TruncVT,
+ DAG.getNode(ISD::TRUNCATE,
+ N->getDebugLoc(),
+ TruncVT, N100),
+ DAG.getConstant(TruncC, TruncVT)));
+ }
+ }
+
+ if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
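+  // e.g. (shl (shl x, 2), 3) -> (shl x, 5).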
+ if (N1C && N0.getOpcode() == ISD::SHL &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ if (c1 + c2 > OpSizeInBits)
+ return DAG.getConstant(0, VT);
+ return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getConstant(c1 + c2, N1.getValueType()));
+ }
+ // fold (shl (srl x, c1), c2) -> (shl (and x, (shl -1, c1)), (sub c2, c1)) or
+ // (srl (and x, (shl -1, c1)), (sub c1, c2))
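+  // e.g. for i32: (shl (srl x, 4), 6) -> (shl (and x, 0xFFFFFFF0), 2).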
+ if (N1C && N0.getOpcode() == ISD::SRL &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ SDValue Mask = DAG.getNode(ISD::AND, N0.getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getConstant(~0ULL << c1, VT));
+ if (c2 > c1)
+ return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, Mask,
+ DAG.getConstant(c2-c1, N1.getValueType()));
+ else
+ return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, Mask,
+ DAG.getConstant(c1-c2, N1.getValueType()));
+ }
+ // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
+ if (N1C && N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1))
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getConstant(~0ULL << N1C->getZExtValue(), VT));
+
+ return N1C ? visitShiftByConstant(N, N1C->getZExtValue()) : SDValue();
+}
+
+SDValue DAGCombiner::visitSRA(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N0.getValueType();
+
+  // fold (sra c1, c2) -> c1 >>s c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C);
+ // fold (sra 0, x) -> 0
+ if (N0C && N0C->isNullValue())
+ return N0;
+ // fold (sra -1, x) -> -1
+ if (N0C && N0C->isAllOnesValue())
+ return N0;
+  // fold (sra x, c >= size(x)) -> undef
+ if (N1C && N1C->getZExtValue() >= VT.getSizeInBits())
+ return DAG.getUNDEF(VT);
+ // fold (sra x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+  // fold (sra (shl x, c1), c1) -> sext_inreg if the target supports
+  // sext_inreg.
+ if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
+ unsigned LowBits = VT.getSizeInBits() - (unsigned)N1C->getZExtValue();
+ MVT EVT = MVT::getIntegerVT(LowBits);
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, EVT)))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+ N0.getOperand(0), DAG.getValueType(EVT));
+ }
+
+ // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
+ if (N1C && N0.getOpcode() == ISD::SRA) {
+ if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ unsigned Sum = N1C->getZExtValue() + C1->getZExtValue();
+ if (Sum >= VT.getSizeInBits()) Sum = VT.getSizeInBits()-1;
+ return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getConstant(Sum, N1C->getValueType(0)));
+ }
+ }
+
+  // fold (sra (shl X, m), (sub result_size, n))
+  //   -> (sign_extend (trunc (srl X, (sub (sub result_size, n), m)))) for
+  //   result_size - n != m.
+  // If truncate is free for the target, sext(trunc) is likely to result in
+  // better code.
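+  // e.g. for i32: (sra (shl X, 16), 24) -> (sext (trunc i8 (srl X, 8))),
+  // i.e. bits [15:8] of X sign-extended to 32 bits.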
+ if (N0.getOpcode() == ISD::SHL) {
+    // Get the two constants of the shifts, CN0 = m, CN = n.
+ const ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (N01C && N1C) {
+ // Determine what the truncate's result bitsize and type would be.
+ unsigned VTValSize = VT.getSizeInBits();
+ MVT TruncVT =
+ MVT::getIntegerVT(VTValSize - N1C->getZExtValue());
+ // Determine the residual right-shift amount.
+ signed ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
+
+      // If the shift is not a no-op (in which case this should be just a sign
+      // extend already), the truncated-to type is legal, sign_extend is legal
+      // on that type, and the truncate to that type is both legal and free,
+      // perform the transform.
+ if ((ShiftAmt > 0) &&
+ TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
+ TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
+ TLI.isTruncateFree(VT, TruncVT)) {
+
+ SDValue Amt = DAG.getConstant(ShiftAmt, getShiftAmountTy());
+ SDValue Shift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), VT,
+ N0.getOperand(0), Amt);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), TruncVT,
+ Shift);
+ return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(),
+ N->getValueType(0), Trunc);
+ }
+ }
+ }
+
+ // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND &&
+ N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+ SDValue N101 = N1.getOperand(0).getOperand(1);
+ if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+ MVT TruncVT = N1.getValueType();
+ SDValue N100 = N1.getOperand(0).getOperand(0);
+ APInt TruncC = N101C->getAPIntValue();
+ TruncC.trunc(TruncVT.getSizeInBits());
+ return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
+ DAG.getNode(ISD::AND, N->getDebugLoc(),
+ TruncVT,
+ DAG.getNode(ISD::TRUNCATE,
+ N->getDebugLoc(),
+ TruncVT, N100),
+ DAG.getConstant(TruncC, TruncVT)));
+ }
+ }
+
+ // Simplify, based on bits shifted out of the LHS.
+ if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // If the sign bit is known to be zero, switch this to a SRL.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0, N1);
+
+ return N1C ? visitShiftByConstant(N, N1C->getZExtValue()) : SDValue();
+}
+
+SDValue DAGCombiner::visitSRL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ MVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getSizeInBits();
+
+ // fold (srl c1, c2) -> c1 >>u c2
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C);
+ // fold (srl 0, x) -> 0
+ if (N0C && N0C->isNullValue())
+ return N0;
+ // fold (srl x, c >= size(x)) -> undef
+ if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+ return DAG.getUNDEF(VT);
+ // fold (srl x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+ // if (srl x, c) is known to be zero, return 0
+ if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
+ APInt::getAllOnesValue(OpSizeInBits)))
+ return DAG.getConstant(0, VT);
+
+ // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
+ if (N1C && N0.getOpcode() == ISD::SRL &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ if (c1 + c2 > OpSizeInBits)
+ return DAG.getConstant(0, VT);
+ return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getConstant(c1 + c2, N1.getValueType()));
+ }
+
+ // fold (srl (anyextend x), c) -> (anyextend (srl x, c))
+ if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+ // Shifting in all undef bits?
+ MVT SmallVT = N0.getOperand(0).getValueType();
+ if (N1C->getZExtValue() >= SmallVT.getSizeInBits())
+ return DAG.getUNDEF(VT);
+
+ SDValue SmallShift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), SmallVT,
+ N0.getOperand(0), N1);
+ AddToWorkList(SmallShift.getNode());
+ return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, SmallShift);
+ }
+
+ // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
+ // bit, which is unmodified by sra.
+ if (N1C && N1C->getZExtValue() + 1 == VT.getSizeInBits()) {
+ if (N0.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0), N1);
+ }
+
+  // fold (srl (ctlz x), "5") -> (x == 0), where "5" is log2 of the bit
+  // width; with at most one possibly-set input bit this simplifies further.
+ if (N1C && N0.getOpcode() == ISD::CTLZ &&
+ N1C->getAPIntValue() == Log2_32(VT.getSizeInBits())) {
+ APInt KnownZero, KnownOne;
+ APInt Mask = APInt::getAllOnesValue(VT.getSizeInBits());
+ DAG.ComputeMaskedBits(N0.getOperand(0), Mask, KnownZero, KnownOne);
+
+ // If any of the input bits are KnownOne, then the input couldn't be all
+ // zeros, thus the result of the srl will always be zero.
+ if (KnownOne.getBoolValue()) return DAG.getConstant(0, VT);
+
+    // If all of the bits input to the ctlz node are known to be zero, then
+ // the result of the ctlz is "32" and the result of the shift is one.
+ APInt UnknownBits = ~KnownZero & Mask;
+ if (UnknownBits == 0) return DAG.getConstant(1, VT);
+
+ // Otherwise, check to see if there is exactly one bit input to the ctlz.
+ if ((UnknownBits & (UnknownBits - 1)) == 0) {
+      // Okay, we know that only the single bit specified by UnknownBits
+ // could be set on input to the CTLZ node. If this bit is set, the SRL
+ // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
+ // to an SRL/XOR pair, which is likely to simplify more.
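+      // e.g. if only bit 3 of the ctlz input may be set, this i32 CTLZ/SRL
+      // pair becomes (xor (srl x, 3), 1).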
+ unsigned ShAmt = UnknownBits.countTrailingZeros();
+ SDValue Op = N0.getOperand(0);
+
+ if (ShAmt) {
+ Op = DAG.getNode(ISD::SRL, N0.getDebugLoc(), VT, Op,
+ DAG.getConstant(ShAmt, getShiftAmountTy()));
+ AddToWorkList(Op.getNode());
+ }
+
+ return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+ Op, DAG.getConstant(1, VT));
+ }
+ }
+
+ // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND &&
+ N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+ SDValue N101 = N1.getOperand(0).getOperand(1);
+ if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+ MVT TruncVT = N1.getValueType();
+ SDValue N100 = N1.getOperand(0).getOperand(0);
+ APInt TruncC = N101C->getAPIntValue();
+ TruncC.trunc(TruncVT.getSizeInBits());
+ return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0,
+ DAG.getNode(ISD::AND, N->getDebugLoc(),
+ TruncVT,
+ DAG.getNode(ISD::TRUNCATE,
+ N->getDebugLoc(),
+ TruncVT, N100),
+ DAG.getConstant(TruncC, TruncVT)));
+ }
+ }
+
+ // fold operands of srl based on knowledge that the low bits are not
+ // demanded.
+ if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return N1C ? visitShiftByConstant(N, N1C->getZExtValue()) : SDValue();
+}
+
+SDValue DAGCombiner::visitCTLZ(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+
+ // fold (ctlz c1) -> c2
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(ISD::CTLZ, N->getDebugLoc(), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTTZ(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+
+ // fold (cttz c1) -> c2
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(ISD::CTTZ, N->getDebugLoc(), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTPOP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+
+ // fold (ctpop c1) -> c2
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(ISD::CTPOP, N->getDebugLoc(), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSELECT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+ MVT VT = N->getValueType(0);
+ MVT VT0 = N0.getValueType();
+
+ // fold (select C, X, X) -> X
+ if (N1 == N2)
+ return N1;
+ // fold (select true, X, Y) -> X
+ if (N0C && !N0C->isNullValue())
+ return N1;
+ // fold (select false, X, Y) -> Y
+ if (N0C && N0C->isNullValue())
+ return N2;
+ // fold (select C, 1, X) -> (or C, X)
+ if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1)
+ return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N2);
+ // fold (select C, 0, 1) -> (xor C, 1)
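+  // e.g. with an i1 condition and an i32 result, (select C, 0, 1) becomes
+  // (zext (xor C, 1)) via the XORNode path below.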
+ if (VT.isInteger() &&
+ (VT0 == MVT::i1 ||
+ (VT0.isInteger() &&
+ TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent)) &&
+ N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) {
+ SDValue XORNode;
+ if (VT == VT0)
+ return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT0,
+ N0, DAG.getConstant(1, VT0));
+ XORNode = DAG.getNode(ISD::XOR, N0.getDebugLoc(), VT0,
+ N0, DAG.getConstant(1, VT0));
+ AddToWorkList(XORNode.getNode());
+ if (VT.bitsGT(VT0))
+ return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, XORNode);
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, XORNode);
+ }
+ // fold (select C, 0, X) -> (and (not C), X)
+ if (VT == VT0 && VT == MVT::i1 && N1C && N1C->isNullValue()) {
+ SDValue NOTNode = DAG.getNOT(N0.getDebugLoc(), N0, VT);
+ AddToWorkList(NOTNode.getNode());
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, NOTNode, N2);
+ }
+ // fold (select C, X, 1) -> (or (not C), X)
+ if (VT == VT0 && VT == MVT::i1 && N2C && N2C->getAPIntValue() == 1) {
+ SDValue NOTNode = DAG.getNOT(N0.getDebugLoc(), N0, VT);
+ AddToWorkList(NOTNode.getNode());
+ return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, NOTNode, N1);
+ }
+ // fold (select C, X, 0) -> (and C, X)
+ if (VT == MVT::i1 && N2C && N2C->isNullValue())
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, N1);
+ // fold (select X, X, Y) -> (or X, Y)
+ // fold (select X, 1, Y) -> (or X, Y)
+ if (VT == MVT::i1 && (N0 == N1 || (N1C && N1C->getAPIntValue() == 1)))
+ return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N2);
+ // fold (select X, Y, X) -> (and X, Y)
+ // fold (select X, Y, 0) -> (and X, Y)
+ if (VT == MVT::i1 && (N0 == N2 || (N2C && N2C->getAPIntValue() == 0)))
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, N1);
+
+ // If we can fold this based on the true/false value, do so.
+ if (SimplifySelectOps(N, N1, N2))
+ return SDValue(N, 0); // Don't revisit N.
+
+ // fold selects based on a setcc into other things, such as min/max/abs
+ if (N0.getOpcode() == ISD::SETCC) {
+ // FIXME:
+ // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+ // having to say they don't support SELECT_CC on every type the DAG knows
+    // about, since there is no way to mark an opcode illegal at all value
+    // types.
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other))
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT,
+ N0.getOperand(0), N0.getOperand(1),
+ N1, N2, N0.getOperand(2));
+ return SimplifySelect(N->getDebugLoc(), N0, N1, N2);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDValue N3 = N->getOperand(3);
+ SDValue N4 = N->getOperand(4);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
+
+ // fold select_cc lhs, rhs, x, x, cc -> x
+ if (N2 == N3)
+ return N2;
+
+ // Determine if the condition we're dealing with is constant
+ SDValue SCC = SimplifySetCC(TLI.getSetCCResultType(N0.getValueType()),
+ N0, N1, CC, N->getDebugLoc(), false);
+ if (SCC.getNode()) AddToWorkList(SCC.getNode());
+
+ if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
+ if (!SCCC->isNullValue())
+ return N2; // cond always true -> true val
+ else
+ return N3; // cond always false -> false val
+ }
+
+ // Fold to a simpler select_cc
+ if (SCC.getNode() && SCC.getOpcode() == ISD::SETCC)
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N2.getValueType(),
+ SCC.getOperand(0), SCC.getOperand(1), N2, N3,
+ SCC.getOperand(2));
+
+ // If we can fold this based on the true/false value, do so.
+ if (SimplifySelectOps(N, N2, N3))
+ return SDValue(N, 0); // Don't revisit N.
+
+ // fold select_cc into other things, such as min/max/abs
+ return SimplifySelectCC(N->getDebugLoc(), N0, N1, N2, N3, CC);
+}
+
+SDValue DAGCombiner::visitSETCC(SDNode *N) {
+ return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1),
+ cast<CondCodeSDNode>(N->getOperand(2))->get(),
+ N->getDebugLoc());
+}
+
+// ExtendUsesToFormExtLoad - Try to extend uses of a load to enable this:
+// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
+// transformation. Returns true if extension is possible and the above-
+// mentioned transformation is profitable.
+static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
+ unsigned ExtOpc,
+ SmallVector<SDNode*, 4> &ExtendNodes,
+ const TargetLowering &TLI) {
+ bool HasCopyToRegUses = false;
+ bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType());
+ for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+ UE = N0.getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User == N)
+ continue;
+ if (UI.getUse().getResNo() != N0.getResNo())
+ continue;
+ // FIXME: Only extend SETCC N, N and SETCC N, c for now.
+ if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
+ if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
+ // Sign bits will be lost after a zext.
+ return false;
+ bool Add = false;
+ for (unsigned i = 0; i != 2; ++i) {
+ SDValue UseOp = User->getOperand(i);
+ if (UseOp == N0)
+ continue;
+ if (!isa<ConstantSDNode>(UseOp))
+ return false;
+ Add = true;
+ }
+ if (Add)
+ ExtendNodes.push_back(User);
+ continue;
+ }
+ // If truncates aren't free and there are users we can't
+ // extend, it isn't worthwhile.
+ if (!isTruncFree)
+ return false;
+ // Remember if this value is live-out.
+ if (User->getOpcode() == ISD::CopyToReg)
+ HasCopyToRegUses = true;
+ }
+
+ if (HasCopyToRegUses) {
+ bool BothLiveOut = false;
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI) {
+ SDUse &Use = UI.getUse();
+ if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
+ BothLiveOut = true;
+ break;
+ }
+ }
+ if (BothLiveOut)
+      // Both unextended and extended values are live out. There had better be
+      // a good reason for the transformation.
+ return ExtendNodes.size();
+ }
+ return true;
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+
+ // fold (sext c1) -> c1
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N0);
+
+ // fold (sext (sext x)) -> (sext x)
+ // fold (sext (aext x)) -> (sext x)
+ if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+ return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT,
+ N0.getOperand(0));
+
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ // fold (sext (truncate (load x))) -> (sext (smaller load x))
+ // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
+ SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+ if (NarrowLoad.getNode()) {
+ if (NarrowLoad.getNode() != N0.getNode())
+ CombineTo(N0.getNode(), NarrowLoad);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ // See if the value being truncated is already sign extended. If so, just
+ // eliminate the trunc/sext pair.
+ SDValue Op = N0.getOperand(0);
+ unsigned OpBits = Op.getValueType().getSizeInBits();
+ unsigned MidBits = N0.getValueType().getSizeInBits();
+ unsigned DestBits = VT.getSizeInBits();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
+
+ if (OpBits == DestBits) {
+ // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
+      // bits, it is already sign extended.
+ if (NumSignBits > DestBits-MidBits)
+ return Op;
+ } else if (OpBits < DestBits) {
+ // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
+ // bits, just sext from i32.
+ if (NumSignBits > OpBits-MidBits)
+ return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, Op);
+ } else {
+ // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
+ // bits, just truncate to i32.
+ if (NumSignBits > OpBits-MidBits)
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+ }
+
+ // fold (sext (truncate x)) -> (sextinreg x).
+ if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
+ N0.getValueType())) {
+ if (Op.getValueType().bitsLT(VT))
+ Op = DAG.getNode(ISD::ANY_EXTEND, N0.getDebugLoc(), VT, Op);
+ else if (Op.getValueType().bitsGT(VT))
+ Op = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), VT, Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, Op,
+ DAG.getValueType(N0.getValueType()));
+ }
+ }
+
+ // fold (sext (load x)) -> (sext (truncate (sextload x)))
+ if (ISD::isNON_EXTLoad(N0.getNode()) &&
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()))) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+ if (DoXform) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(),
+ N0.getValueType(),
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+ N0.getValueType(), ExtLoad);
+ CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+
+ // Extend SetCC uses if necessary.
+ for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+ SDNode *SetCC = SetCCs[i];
+ SmallVector<SDValue, 4> Ops;
+
+ for (unsigned j = 0; j != 2; ++j) {
+ SDValue SOp = SetCC->getOperand(j);
+ if (SOp == Trunc)
+ Ops.push_back(ExtLoad);
+ else
+ Ops.push_back(DAG.getNode(ISD::SIGN_EXTEND,
+ N->getDebugLoc(), VT, SOp));
+ }
+
+ Ops.push_back(SetCC->getOperand(2));
+ CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+ SetCC->getValueType(0),
+ &Ops[0], Ops.size()));
+ }
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
+ // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
+ if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ MVT EVT = LN0->getMemoryVT();
+ if ((!LegalOperations && !LN0->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT)) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(), EVT,
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+ N0.getValueType(), ExtLoad),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // sext(setcc x, y, cc) -> (select_cc x, y, -1, 0, cc)
+ if (N0.getOpcode() == ISD::SETCC) {
+ SDValue SCC =
+ SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstant(~0ULL, VT), DAG.getConstant(0, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+ if (SCC.getNode()) return SCC;
+ }
+
+ // fold (sext x) -> (zext x) if the sign bit is known zero.
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
+ DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+
+ // fold (zext c1) -> c1
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, N0);
+ // fold (zext (zext x)) -> (zext x)
+ // fold (zext (aext x)) -> (zext x)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+ return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT,
+ N0.getOperand(0));
+
+ // fold (zext (truncate (load x))) -> (zext (smaller load x))
+ // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n)))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+ if (NarrowLoad.getNode()) {
+ if (NarrowLoad.getNode() != N0.getNode())
+ CombineTo(N0.getNode(), NarrowLoad);
+ return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, NarrowLoad);
+ }
+ }
+
+ // fold (zext (truncate x)) -> (and x, mask)
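+  // e.g. (zext i32 (trunc i16 x)) with x of type i32 -> (and x, 0xFFFF).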
+ if (N0.getOpcode() == ISD::TRUNCATE &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) {
+ SDValue Op = N0.getOperand(0);
+ if (Op.getValueType().bitsLT(VT)) {
+ Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op);
+ } else if (Op.getValueType().bitsGT(VT)) {
+ Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+ }
+ return DAG.getZeroExtendInReg(Op, N->getDebugLoc(), N0.getValueType());
+ }
+
+ // Fold (zext (and (trunc x), cst)) -> (and x, cst),
+ // if either of the casts is not free.
+ if (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+ N0.getValueType()) ||
+ !TLI.isZExtFree(N0.getValueType(), VT))) {
+ SDValue X = N0.getOperand(0).getOperand(0);
+ if (X.getValueType().bitsLT(VT)) {
+ X = DAG.getNode(ISD::ANY_EXTEND, X.getDebugLoc(), VT, X);
+ } else if (X.getValueType().bitsGT(VT)) {
+ X = DAG.getNode(ISD::TRUNCATE, X.getDebugLoc(), VT, X);
+ }
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ Mask.zext(VT.getSizeInBits());
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ X, DAG.getConstant(Mask, VT));
+ }
+
+ // fold (zext (load x)) -> (zext (truncate (zextload x)))
+ if (ISD::isNON_EXTLoad(N0.getNode()) &&
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()))) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+ if (DoXform) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(),
+ N0.getValueType(),
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+ N0.getValueType(), ExtLoad);
+ CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+
+ // Extend SetCC uses if necessary.
+ for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+ SDNode *SetCC = SetCCs[i];
+ SmallVector<SDValue, 4> Ops;
+
+ for (unsigned j = 0; j != 2; ++j) {
+ SDValue SOp = SetCC->getOperand(j);
+ if (SOp == Trunc)
+ Ops.push_back(ExtLoad);
+ else
+ Ops.push_back(DAG.getNode(ISD::ZERO_EXTEND,
+ N->getDebugLoc(), VT, SOp));
+ }
+
+ Ops.push_back(SetCC->getOperand(2));
+ CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+ SetCC->getValueType(0),
+ &Ops[0], Ops.size()));
+ }
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (zext (zextload x)) -> (zext (truncate (zextload x)))
+ // fold (zext ( extload x)) -> (zext (truncate (zextload x)))
+ if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ MVT EVT = LN0->getMemoryVT();
+ if ((!LegalOperations && !LN0->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, EVT)) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(), EVT,
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), N0.getValueType(),
+ ExtLoad),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+ if (N0.getOpcode() == ISD::SETCC) {
+ SDValue SCC =
+ SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstant(1, VT), DAG.getConstant(0, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+ if (SCC.getNode()) return SCC;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+
+ // fold (aext c1) -> c1
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, N0);
+ // fold (aext (aext x)) -> (aext x)
+ // fold (aext (zext x)) -> (zext x)
+ // fold (aext (sext x)) -> (sext x)
+ if (N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND)
+ return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, N0.getOperand(0));
+
+ // fold (aext (truncate (load x))) -> (aext (smaller load x))
+ // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+ if (NarrowLoad.getNode()) {
+ if (NarrowLoad.getNode() != N0.getNode())
+ CombineTo(N0.getNode(), NarrowLoad);
+ return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, NarrowLoad);
+ }
+ }
+
+ // fold (aext (truncate x))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ SDValue TruncOp = N0.getOperand(0);
+ if (TruncOp.getValueType() == VT)
+ return TruncOp; // x iff x size == zext size.
+ if (TruncOp.getValueType().bitsGT(VT))
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, TruncOp);
+ return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, TruncOp);
+ }
+
+ // Fold (aext (and (trunc x), cst)) -> (and x, cst)
+ // if the trunc is not free.
+ if (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+ N0.getValueType())) {
+ SDValue X = N0.getOperand(0).getOperand(0);
+ if (X.getValueType().bitsLT(VT)) {
+ X = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, X);
+ } else if (X.getValueType().bitsGT(VT)) {
+ X = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, X);
+ }
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ Mask.zext(VT.getSizeInBits());
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ X, DAG.getConstant(Mask, VT));
+ }
+
+ // fold (aext (load x)) -> (aext (truncate (extload x)))
+ if (ISD::isNON_EXTLoad(N0.getNode()) &&
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
+ if (DoXform) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(),
+ N0.getValueType(),
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+ N0.getValueType(), ExtLoad);
+ CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+
+ // Extend SetCC uses if necessary.
+ for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+ SDNode *SetCC = SetCCs[i];
+ SmallVector<SDValue, 4> Ops;
+
+ for (unsigned j = 0; j != 2; ++j) {
+ SDValue SOp = SetCC->getOperand(j);
+ if (SOp == Trunc)
+ Ops.push_back(ExtLoad);
+ else
+ Ops.push_back(DAG.getNode(ISD::ANY_EXTEND,
+ N->getDebugLoc(), VT, SOp));
+ }
+
+ Ops.push_back(SetCC->getOperand(2));
+ CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+ SetCC->getValueType(0),
+ &Ops[0], Ops.size()));
+ }
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
+ // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
+ // fold (aext ( extload x)) -> (aext (truncate (extload x)))
+ if (N0.getOpcode() == ISD::LOAD &&
+ !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ MVT EVT = LN0->getMemoryVT();
+ SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), N->getDebugLoc(),
+ VT, LN0->getChain(), LN0->getBasePtr(),
+ LN0->getSrcValue(),
+ LN0->getSrcValueOffset(), EVT,
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+ N0.getValueType(), ExtLoad),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+ if (N0.getOpcode() == ISD::SETCC) {
+ SDValue SCC =
+ SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstant(1, VT), DAG.getConstant(0, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+ if (SCC.getNode())
+ return SCC;
+ }
+
+ return SDValue();
+}
+
+/// GetDemandedBits - See if the specified operand can be simplified with the
+/// knowledge that only the bits specified by Mask are used. If so, return the
+/// simpler operand, otherwise return a null SDValue.
+SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
+ switch (V.getOpcode()) {
+ default: break;
+ case ISD::OR:
+ case ISD::XOR:
+ // If the LHS or RHS don't contribute bits to the or, drop them.
+ if (DAG.MaskedValueIsZero(V.getOperand(0), Mask))
+ return V.getOperand(1);
+ if (DAG.MaskedValueIsZero(V.getOperand(1), Mask))
+ return V.getOperand(0);
+ break;
+ case ISD::SRL:
+ // Only look at single-use SRLs.
+ if (!V.getNode()->hasOneUse())
+ break;
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
+ // See if we can recursively simplify the LHS.
+ unsigned Amt = RHSC->getZExtValue();
+
+ // Watch out for shift count overflow though.
+ if (Amt >= Mask.getBitWidth()) break;
+ APInt NewMask = Mask << Amt;
+ SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask);
+ if (SimplifyLHS.getNode())
+ return DAG.getNode(ISD::SRL, V.getDebugLoc(), V.getValueType(),
+ SimplifyLHS, V.getOperand(1));
+ }
+ }
+ return SDValue();
+}
+
+/// ReduceLoadWidth - If the result of a wider load is shifted right by N
+/// bits and then truncated to a narrower type, where N is a multiple of
+/// the number of bits of the narrower type, transform it to a narrower load
+/// from the original address plus N/8 bytes. If the result is to be
+/// extended, also fold the extension to form an extending load.
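+/// e.g. on a little-endian target, (trunc i16 (srl (load i32 p), 16)) can
+/// become (load i16 from p+2).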
+SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
+ unsigned Opc = N->getOpcode();
+ ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+ MVT EVT = VT;
+
+ // This transformation isn't valid for vector loads.
+ if (VT.isVector())
+ return SDValue();
+
+ // Special case: SIGN_EXTEND_INREG is basically truncating to EVT then
+  // extending to VT.
+ if (Opc == ISD::SIGN_EXTEND_INREG) {
+ ExtType = ISD::SEXTLOAD;
+ EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ if (LegalOperations && !TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))
+ return SDValue();
+ }
+
+ unsigned EVTBits = EVT.getSizeInBits();
+ unsigned ShAmt = 0;
+ if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
+ if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ ShAmt = N01->getZExtValue();
+      // Is the shift amount a multiple of the size of EVT?
+ if ((ShAmt & (EVTBits-1)) == 0) {
+ N0 = N0.getOperand(0);
+ if (N0.getValueType().getSizeInBits() <= EVTBits)
+ return SDValue();
+ }
+ }
+ }
+
+ // Do not generate loads of non-round integer types since these can
+ // be expensive (and would be wrong if the type is not byte sized).
+ if (isa<LoadSDNode>(N0) && N0.hasOneUse() && EVT.isRound() &&
+ cast<LoadSDNode>(N0)->getMemoryVT().getSizeInBits() > EVTBits &&
+ // Do not change the width of a volatile load.
+ !cast<LoadSDNode>(N0)->isVolatile()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ MVT PtrType = N0.getOperand(1).getValueType();
+
+ // For big endian targets, we need to adjust the offset to the pointer to
+ // load the correct bytes.
+ if (TLI.isBigEndian()) {
+ unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
+ unsigned EVTStoreBits = EVT.getStoreSizeInBits();
+ ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
+ }
+
+ uint64_t PtrOff = ShAmt / 8;
+ unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
+ SDValue NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(),
+ PtrType, LN0->getBasePtr(),
+ DAG.getConstant(PtrOff, PtrType));
+ AddToWorkList(NewPtr.getNode());
+
+ SDValue Load = (ExtType == ISD::NON_EXTLOAD)
+ ? DAG.getLoad(VT, N0.getDebugLoc(), LN0->getChain(), NewPtr,
+ LN0->getSrcValue(), LN0->getSrcValueOffset() + PtrOff,
+ LN0->isVolatile(), NewAlign)
+ : DAG.getExtLoad(ExtType, N0.getDebugLoc(), VT, LN0->getChain(), NewPtr,
+ LN0->getSrcValue(), LN0->getSrcValueOffset() + PtrOff,
+ EVT, LN0->isVolatile(), NewAlign);
+
+ // Replace the old load's chain with the new load's chain.
+ WorkListRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1),
+ &DeadNodes);
+
+ // Return the new loaded value.
+ return Load;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ MVT VT = N->getValueType(0);
+ MVT EVT = cast<VTSDNode>(N1)->getVT();
+ unsigned VTBits = VT.getSizeInBits();
+ unsigned EVTBits = EVT.getSizeInBits();
+
+ // fold (sext_in_reg c1) -> c1
+ if (isa<ConstantSDNode>(N0) || N0.getOpcode() == ISD::UNDEF)
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, N0, N1);
+
+ // If the input is already sign extended, just drop the extension.
+ if (DAG.ComputeNumSignBits(N0) >= VT.getSizeInBits()-EVTBits+1)
+ return N0;
+
+ // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT())) {
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+ N0.getOperand(0), N1);
+ }
+
+ // fold (sext_in_reg (sext x)) -> (sext x)
+ // fold (sext_in_reg (aext x)) -> (sext x)
+ // if x is small enough.
+ if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getValueType().getSizeInBits() < EVTBits)
+ return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N00, N1);
+ }
+
+ // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
+ if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
+ return DAG.getZeroExtendInReg(N0, N->getDebugLoc(), EVT);
+
+ // fold operands of sext_in_reg based on knowledge that the top bits are not
+ // demanded.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (sext_in_reg (load x)) -> (smaller sextload x)
+ // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
+ SDValue NarrowLoad = ReduceLoadWidth(N);
+ if (NarrowLoad.getNode())
+ return NarrowLoad;
+
+ // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
+ // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
+ // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
+ if (N0.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+ if (ShAmt->getZExtValue()+EVTBits <= VT.getSizeInBits()) {
+ // We can turn this into an SRA iff the input to the SRL is already sign
+ // extended enough.
+ unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
+ if (VT.getSizeInBits()-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
+ return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT,
+ N0.getOperand(0), N0.getOperand(1));
+ }
+ }
+
+ // fold (sext_inreg (extload x)) -> (sextload x)
+ if (ISD::isEXTLoad(N0.getNode()) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(), EVT,
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
+ if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ N0.hasOneUse() &&
+ EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(), EVT,
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+
+ // noop truncate
+ if (N0.getValueType() == N->getValueType(0))
+ return N0;
+ // fold (truncate c1) -> c1
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0);
+ // fold (truncate (truncate x)) -> (truncate x)
+ if (N0.getOpcode() == ISD::TRUNCATE)
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
+ // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
+ if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND||
+ N0.getOpcode() == ISD::ANY_EXTEND) {
+ if (N0.getOperand(0).getValueType().bitsLT(VT))
+ // if the source is smaller than the dest, we still need an extend
+ return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+ N0.getOperand(0));
+ else if (N0.getOperand(0).getValueType().bitsGT(VT))
+      // if the source is larger than the dest, then we just need the truncate
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
+ else
+ // if the source and dest are the same type, we can drop both the extend
+ // and the truncate
+ return N0.getOperand(0);
+ }
+
+ // See if we can simplify the input to this truncate through knowledge that
+ // only the low bits are being used. For example "trunc (or (shl x, 8), y)"
+ // -> trunc y
+ SDValue Shorter =
+ GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
+ VT.getSizeInBits()));
+ if (Shorter.getNode())
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Shorter);
+
+ // fold (truncate (load x)) -> (smaller load x)
+ // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
+ return ReduceLoadWidth(N);
+}
+
+static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
+ SDValue Elt = N->getOperand(i);
+ if (Elt.getOpcode() != ISD::MERGE_VALUES)
+ return Elt.getNode();
+ return Elt.getOperand(Elt.getResNo()).getNode();
+}
+
+/// CombineConsecutiveLoads - build_pair (load, load) -> load
+/// if load locations are consecutive.
+SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, MVT VT) {
+ assert(N->getOpcode() == ISD::BUILD_PAIR);
+
+ SDNode *LD1 = getBuildPairElt(N, 0);
+ if (!ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse())
+ return SDValue();
+ MVT LD1VT = LD1->getValueType(0);
+ SDNode *LD2 = getBuildPairElt(N, 1);
+ const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+ if (ISD::isNON_EXTLoad(LD2) &&
+ LD2->hasOneUse() &&
+ // If both are volatile this would reduce the number of volatile loads.
+      // If one is volatile it might be ok, but be conservative and bail out.
+ !cast<LoadSDNode>(LD1)->isVolatile() &&
+ !cast<LoadSDNode>(LD2)->isVolatile() &&
+ TLI.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1, MFI)) {
+ LoadSDNode *LD = cast<LoadSDNode>(LD1);
+ unsigned Align = LD->getAlignment();
+ unsigned NewAlign = TLI.getTargetData()->
+ getABITypeAlignment(VT.getTypeForMVT());
+
+ if (NewAlign <= Align &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
+ return DAG.getLoad(VT, N->getDebugLoc(), LD->getChain(), LD->getBasePtr(),
+ LD->getSrcValue(), LD->getSrcValueOffset(),
+ false, Align);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBIT_CONVERT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+
+ // If the input is a BUILD_VECTOR with all constant elements, fold this now.
+ // Only do this before legalize, since afterward the target may be depending
+ // on the bitconvert.
+ // First check to see if this is all constant.
+ if (!LegalTypes &&
+ N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
+ VT.isVector()) {
+ bool isSimple = true;
+ for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i)
+ if (N0.getOperand(i).getOpcode() != ISD::UNDEF &&
+ N0.getOperand(i).getOpcode() != ISD::Constant &&
+ N0.getOperand(i).getOpcode() != ISD::ConstantFP) {
+ isSimple = false;
+ break;
+ }
+
+ MVT DestEltVT = N->getValueType(0).getVectorElementType();
+ assert(!DestEltVT.isVector() &&
+ "Element type of vector ValueType must not be vector!");
+ if (isSimple)
+ return ConstantFoldBIT_CONVERTofBUILD_VECTOR(N0.getNode(), DestEltVT);
+ }
+
+ // If the input is a constant, let getNode fold it.
+ if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
+ SDValue Res = DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, N0);
+ if (Res.getNode() != N) return Res;
+ }
+
+ // (conv (conv x, t1), t2) -> (conv x, t2)
+ if (N0.getOpcode() == ISD::BIT_CONVERT)
+ return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT,
+ N0.getOperand(0));
+
+ // fold (conv (load x)) -> (load (conv*)x)
+  // if the resulting load doesn't need a higher alignment than the original.
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ // Do not change the width of a volatile load.
+ !cast<LoadSDNode>(N0)->isVolatile() &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ unsigned Align = TLI.getTargetData()->
+ getABITypeAlignment(VT.getTypeForMVT());
+ unsigned OrigAlign = LN0->getAlignment();
+
+ if (Align <= OrigAlign) {
+ SDValue Load = DAG.getLoad(VT, N->getDebugLoc(), LN0->getChain(),
+ LN0->getBasePtr(),
+ LN0->getSrcValue(), LN0->getSrcValueOffset(),
+ LN0->isVolatile(), OrigAlign);
+ AddToWorkList(N);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(),
+ N0.getValueType(), Load),
+ Load.getValue(1));
+ return Load;
+ }
+ }
+
+ // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
+ // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
+ // This often reduces constant pool loads.
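+ // e.g. for f32, fneg flips only bit 31, so the fold produces
+ // (xor (bitconvert x), 0x80000000); fabs clears it via 0x7fffffff.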
+ if ((N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FABS) &&
+ N0.getNode()->hasOneUse() && VT.isInteger() && !VT.isVector()) {
+ SDValue NewConv = DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(), VT,
+ N0.getOperand(0));
+ AddToWorkList(NewConv.getNode());
+
+ APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+ if (N0.getOpcode() == ISD::FNEG)
+ return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+ NewConv, DAG.getConstant(SignBit, VT));
+ assert(N0.getOpcode() == ISD::FABS);
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ NewConv, DAG.getConstant(~SignBit, VT));
+ }
+
+ // fold (bitconvert (fcopysign cst, x)) ->
+ // (or (and (bitconvert x), sign), (and cst, (not sign)))
+ // Note that we don't handle (copysign x, cst) because this can always be
+ // folded to an fneg or fabs.
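+ // e.g. for f32 with cst = 1.0 (bits 0x3f800000) this produces
+ // (or (and (bitconvert x), 0x80000000), 0x3f800000).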
+ if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
+ isa<ConstantFPSDNode>(N0.getOperand(0)) &&
+ VT.isInteger() && !VT.isVector()) {
+ unsigned OrigXWidth = N0.getOperand(1).getValueType().getSizeInBits();
+ MVT IntXVT = MVT::getIntegerVT(OrigXWidth);
+ if (TLI.isTypeLegal(IntXVT) || !LegalTypes) {
+ SDValue X = DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(),
+ IntXVT, N0.getOperand(1));
+ AddToWorkList(X.getNode());
+
+ // If X has a different width than the result/lhs, sext it or truncate it.
+ unsigned VTWidth = VT.getSizeInBits();
+ if (OrigXWidth < VTWidth) {
+ X = DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, X);
+ AddToWorkList(X.getNode());
+ } else if (OrigXWidth > VTWidth) {
+ // To get the sign bit in the right place, we have to shift it right
+ // before truncating.
+ X = DAG.getNode(ISD::SRL, X.getDebugLoc(),
+ X.getValueType(), X,
+ DAG.getConstant(OrigXWidth-VTWidth, X.getValueType()));
+ AddToWorkList(X.getNode());
+ X = DAG.getNode(ISD::TRUNCATE, X.getDebugLoc(), VT, X);
+ AddToWorkList(X.getNode());
+ }
+
+ APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+ X = DAG.getNode(ISD::AND, X.getDebugLoc(), VT,
+ X, DAG.getConstant(SignBit, VT));
+ AddToWorkList(X.getNode());
+
+ SDValue Cst = DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(),
+ VT, N0.getOperand(0));
+ Cst = DAG.getNode(ISD::AND, Cst.getDebugLoc(), VT,
+ Cst, DAG.getConstant(~SignBit, VT));
+ AddToWorkList(Cst.getNode());
+
+ return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, X, Cst);
+ }
+ }
+
+ // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
+ if (N0.getOpcode() == ISD::BUILD_PAIR) {
+ SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT);
+ if (CombineLD.getNode())
+ return CombineLD;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
+ MVT VT = N->getValueType(0);
+ return CombineConsecutiveLoads(N, VT);
+}
+
+/// ConstantFoldBIT_CONVERTofBUILD_VECTOR - We know that BV is a build_vector
+/// node with Constant, ConstantFP or Undef operands. DstEltVT indicates the
+/// destination element value type.
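+ /// e.g. a constant v2i32 build_vector can be folded to a v8i8 build_vector
+ /// by splitting each i32 into four i8 pieces (piece order is
+ /// endian-dependent).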
+SDValue DAGCombiner::
+ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *BV, MVT DstEltVT) {
+ MVT SrcEltVT = BV->getValueType(0).getVectorElementType();
+
+ // If this is already the right type, we're done.
+ if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
+
+ unsigned SrcBitSize = SrcEltVT.getSizeInBits();
+ unsigned DstBitSize = DstEltVT.getSizeInBits();
+
+ // If this is a conversion of N elements of one type to N elements of another
+ // type, convert each element. This handles FP<->INT cases.
+ if (SrcBitSize == DstBitSize) {
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+ SDValue Op = BV->getOperand(i);
+ // If the vector element type is not legal, the BUILD_VECTOR operands
+ // are promoted and implicitly truncated. Make that explicit here.
+ if (Op.getValueType() != SrcEltVT)
+ Op = DAG.getNode(ISD::TRUNCATE, BV->getDebugLoc(), SrcEltVT, Op);
+ Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, BV->getDebugLoc(),
+ DstEltVT, Op));
+ AddToWorkList(Ops.back().getNode());
+ }
+ MVT VT = MVT::getVectorVT(DstEltVT,
+ BV->getValueType(0).getVectorNumElements());
+ return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+ &Ops[0], Ops.size());
+ }
+
+ // Otherwise, we're growing or shrinking the elements. To avoid having to
+ // handle annoying details of growing/shrinking FP values, we convert them to
+ // int first.
+ if (SrcEltVT.isFloatingPoint()) {
+ // Convert the input float vector to an int vector whose elements have the
+ // same size.
+ assert((SrcEltVT == MVT::f32 || SrcEltVT == MVT::f64) && "Unknown FP VT!");
+ MVT IntVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits());
+ BV = ConstantFoldBIT_CONVERTofBUILD_VECTOR(BV, IntVT).getNode();
+ SrcEltVT = IntVT;
+ }
+
+ // Now we know the input is an integer vector. If the output is an FP type,
+ // convert to integer first, then to FP of the right size.
+ if (DstEltVT.isFloatingPoint()) {
+ assert((DstEltVT == MVT::f32 || DstEltVT == MVT::f64) && "Unknown FP VT!");
+ MVT TmpVT = MVT::getIntegerVT(DstEltVT.getSizeInBits());
+ SDNode *Tmp = ConstantFoldBIT_CONVERTofBUILD_VECTOR(BV, TmpVT).getNode();
+
+ // Next, convert to FP elements of the same size.
+ return ConstantFoldBIT_CONVERTofBUILD_VECTOR(Tmp, DstEltVT);
+ }
+
+ // Okay, we know the src/dst types are both integers of differing widths.
+ // Handle growing first.
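+ // e.g. v4i8 -> v2i16: each pair of i8 inputs packs into one i16 output;
+ // on little-endian the higher-indexed input supplies the high bits.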
+ assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
+ if (SrcBitSize < DstBitSize) {
+ unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
+
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = BV->getNumOperands(); i != e;
+ i += NumInputsPerOutput) {
+ bool isLE = TLI.isLittleEndian();
+ APInt NewBits = APInt(DstBitSize, 0);
+ bool EltIsUndef = true;
+ for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
+ // Shift the previously computed bits over.
+ NewBits <<= SrcBitSize;
+ SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
+ if (Op.getOpcode() == ISD::UNDEF) continue;
+ EltIsUndef = false;
+
+ NewBits |= (APInt(cast<ConstantSDNode>(Op)->getAPIntValue()).
+ zextOrTrunc(SrcBitSize).zext(DstBitSize));
+ }
+
+ if (EltIsUndef)
+ Ops.push_back(DAG.getUNDEF(DstEltVT));
+ else
+ Ops.push_back(DAG.getConstant(NewBits, DstEltVT));
+ }
+
+ MVT VT = MVT::getVectorVT(DstEltVT, Ops.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+ &Ops[0], Ops.size());
+ }
+
+ // Finally, this must be the case where we are shrinking elements: each input
+ // turns into multiple outputs.
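+ // e.g. v2i32 -> v4i16: each i32 yields two i16 pieces, low piece first;
+ // big-endian targets then swap the pieces of each element.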
+ bool isS2V = ISD::isScalarToVector(BV);
+ unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
+ MVT VT = MVT::getVectorVT(DstEltVT, NumOutputsPerInput*BV->getNumOperands());
+ SmallVector<SDValue, 8> Ops;
+
+ for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+ if (BV->getOperand(i).getOpcode() == ISD::UNDEF) {
+ for (unsigned j = 0; j != NumOutputsPerInput; ++j)
+ Ops.push_back(DAG.getUNDEF(DstEltVT));
+ continue;
+ }
+
+ APInt OpVal = APInt(cast<ConstantSDNode>(BV->getOperand(i))->
+ getAPIntValue()).zextOrTrunc(SrcBitSize);
+
+ for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
+ APInt ThisVal = APInt(OpVal).trunc(DstBitSize);
+ Ops.push_back(DAG.getConstant(ThisVal, DstEltVT));
+ if (isS2V && i == 0 && j == 0 && APInt(ThisVal).zext(SrcBitSize) == OpVal)
+ // Simply turn this into a SCALAR_TO_VECTOR of the new type.
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, BV->getDebugLoc(), VT,
+ Ops[0]);
+ OpVal = OpVal.lshr(DstBitSize);
+ }
+
+ // For big endian targets, swap the order of the pieces of each element.
+ if (TLI.isBigEndian())
+ std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+ &Ops[0], Ops.size());
+}
+
+SDValue DAGCombiner::visitFADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (fadd c1, c2) -> c1+c2
+ if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N1);
+ // canonicalize constant to RHS
+ if (N0CFP && !N1CFP)
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N0);
+ // fold (fadd A, 0) -> A
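+ // (UnsafeFPMath only: if A is -0.0, (fadd A, 0.0) is +0.0, not A)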
+ if (UnsafeFPMath && N1CFP && N1CFP->getValueAPF().isZero())
+ return N0;
+ // fold (fadd A, (fneg B)) -> (fsub A, B)
+ if (isNegatibleForFree(N1, LegalOperations) == 2)
+ return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0,
+ GetNegatedExpression(N1, DAG, LegalOperations));
+ // fold (fadd (fneg A), B) -> (fsub B, A)
+ if (isNegatibleForFree(N0, LegalOperations) == 2)
+ return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N1,
+ GetNegatedExpression(N0, DAG, LegalOperations));
+
+ // If allowed, fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
+ if (UnsafeFPMath && N1CFP && N0.getOpcode() == ISD::FADD &&
+ N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(1)))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ N0.getOperand(1), N1));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFSUB(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (fsub c1, c2) -> c1-c2
+ if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0, N1);
+ // fold (fsub A, 0) -> A
+ if (UnsafeFPMath && N1CFP && N1CFP->getValueAPF().isZero())
+ return N0;
+ // fold (fsub 0, B) -> -B
+ if (UnsafeFPMath && N0CFP && N0CFP->getValueAPF().isZero()) {
+ if (isNegatibleForFree(N1, LegalOperations))
+ return GetNegatedExpression(N1, DAG, LegalOperations);
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N1);
+ }
+ // fold (fsub A, (fneg B)) -> (fadd A, B)
+ if (isNegatibleForFree(N1, LegalOperations))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0,
+ GetNegatedExpression(N1, DAG, LegalOperations));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFMUL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (fmul c1, c2) -> c1*c2
+ if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0, N1);
+ // canonicalize constant to RHS
+ if (N0CFP && !N1CFP)
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N1, N0);
+ // fold (fmul A, 0) -> 0
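+ // (UnsafeFPMath only: wrong when A is NaN, infinity, or -0.0)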
+ if (UnsafeFPMath && N1CFP && N1CFP->getValueAPF().isZero())
+ return N1;
+ // fold (fmul X, 2.0) -> (fadd X, X)
+ if (N1CFP && N1CFP->isExactlyValue(+2.0))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N0);
+ // fold (fmul X, -1.0) -> (fneg X)
+ if (N1CFP && N1CFP->isExactlyValue(-1.0))
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N0);
+
+ // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
+ if (char LHSNeg = isNegatibleForFree(N0, LegalOperations)) {
+ if (char RHSNeg = isNegatibleForFree(N1, LegalOperations)) {
+ // Both can be negated for free, check to see if at least one is cheaper
+ // negated.
+ if (LHSNeg == 2 || RHSNeg == 2)
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ GetNegatedExpression(N0, DAG, LegalOperations),
+ GetNegatedExpression(N1, DAG, LegalOperations));
+ }
+ }
+
+ // If allowed, fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
+ if (UnsafeFPMath && N1CFP && N0.getOpcode() == ISD::FMUL &&
+ N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(1)))
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(1), N1));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
+ // fold (fdiv c1, c2) -> c1/c2
+ if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT, N0, N1);
+
+ // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
+ if (char LHSNeg = isNegatibleForFree(N0, LegalOperations)) {
+ if (char RHSNeg = isNegatibleForFree(N1, LegalOperations)) {
+ // Both can be negated for free, check to see if at least one is cheaper
+ // negated.
+ if (LHSNeg == 2 || RHSNeg == 2)
+ return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT,
+ GetNegatedExpression(N0, DAG, LegalOperations),
+ GetNegatedExpression(N1, DAG, LegalOperations));
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFREM(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ // fold (frem c1, c2) -> fmod(c1,c2)
+ if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FREM, N->getDebugLoc(), VT, N0, N1);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ MVT VT = N->getValueType(0);
+
+ if (N0CFP && N1CFP && VT != MVT::ppcf128) // Constant fold
+ return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT, N0, N1);
+
+ if (N1CFP) {
+ const APFloat& V = N1CFP->getValueAPF();
+ // copysign(x, c1) -> fabs(x) iff ispos(c1)
+ // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
+ if (!V.isNegative()) {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
+ return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+ } else {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT,
+ DAG.getNode(ISD::FABS, N0.getDebugLoc(), VT, N0));
+ }
+ }
+
+ // copysign(fabs(x), y) -> copysign(x, y)
+ // copysign(fneg(x), y) -> copysign(x, y)
+ // copysign(copysign(x,z), y) -> copysign(x, y)
+ if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
+ N0.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+ N0.getOperand(0), N1);
+
+ // copysign(x, abs(y)) -> abs(x)
+ if (N1.getOpcode() == ISD::FABS)
+ return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+
+ // copysign(x, copysign(y,z)) -> copysign(x, z)
+ if (N1.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+ N0, N1.getOperand(1));
+
+ // copysign(x, fp_extend(y)) -> copysign(x, y)
+ // copysign(x, fp_round(y)) -> copysign(x, y)
+ if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND)
+ return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+ N0, N1.getOperand(0));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ MVT VT = N->getValueType(0);
+ MVT OpVT = N0.getValueType();
+
+ // fold (sint_to_fp c1) -> c1fp
+ if (N0C && OpVT != MVT::ppcf128)
+ return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
+
+ // If the input is a legal type, and SINT_TO_FP is not legal on this target,
+ // but UINT_TO_FP is legal on this target, try to convert.
+ if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT) &&
+ TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) {
+ // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ MVT VT = N->getValueType(0);
+ MVT OpVT = N0.getValueType();
+
+ // fold (uint_to_fp c1) -> c1fp
+ if (N0C && OpVT != MVT::ppcf128)
+ return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
+
+ // If the input is a legal type, and UINT_TO_FP is not legal on this target,
+ // but SINT_TO_FP is legal on this target, try to convert.
+ if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) {
+ // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ MVT VT = N->getValueType(0);
+
+ // fold (fp_to_sint c1fp) -> c1
+ if (N0CFP)
+ return DAG.getNode(ISD::FP_TO_SINT, N->getDebugLoc(), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ MVT VT = N->getValueType(0);
+
+ // fold (fp_to_uint c1fp) -> c1
+ if (N0CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FP_TO_UINT, N->getDebugLoc(), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ MVT VT = N->getValueType(0);
+
+ // fold (fp_round c1fp) -> c1fp
+ if (N0CFP && N0.getValueType() != MVT::ppcf128)
+ return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0, N1);
+
+ // fold (fp_round (fp_extend x)) -> x
+ if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
+ return N0.getOperand(0);
+
+ // fold (fp_round (fp_round x)) -> (fp_round x)
+ if (N0.getOpcode() == ISD::FP_ROUND) {
+ // This is a value-preserving truncation if both rounds are.
+ bool IsTrunc = N->getConstantOperandVal(1) == 1 &&
+ N0.getNode()->getConstantOperandVal(1) == 1;
+ return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getIntPtrConstant(IsTrunc));
+ }
+
+ // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
+ if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
+ SDValue Tmp = DAG.getNode(ISD::FP_ROUND, N0.getDebugLoc(), VT,
+ N0.getOperand(0), N1);
+ AddToWorkList(Tmp.getNode());
+ return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+ Tmp, N0.getOperand(1));
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ MVT VT = N->getValueType(0);
+ MVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+
+ // fold (fp_round_inreg c1fp) -> c1fp
+ if (N0CFP && (TLI.isTypeLegal(EVT) || !LegalTypes)) {
+ SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), EVT);
+ return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, Round);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ MVT VT = N->getValueType(0);
+
+ // If this is fp_round(fp_extend), don't fold it; allow ourselves to be folded.
+ if (N->hasOneUse() &&
+ N->use_begin()->getOpcode() == ISD::FP_ROUND)
+ return SDValue();
+
+ // fold (fp_extend c1fp) -> c1fp
+ if (N0CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, N0);
+
+ // Turn fp_extend(fp_round(X, 1)) -> X since the fp_round doesn't affect the
+ // value of X.
+ if (N0.getOpcode() == ISD::FP_ROUND
+ && N0.getNode()->getConstantOperandVal(1) == 1) {
+ SDValue In = N0.getOperand(0);
+ if (In.getValueType() == VT) return In;
+ if (VT.bitsLT(In.getValueType()))
+ return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT,
+ In, N0.getOperand(1));
+ return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, In);
+ }
+
+ // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
+ if (ISD::isNON_EXTLoad(N0.getNode()) && N0.hasOneUse() &&
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), LN0->getSrcValue(),
+ LN0->getSrcValueOffset(),
+ N0.getValueType(),
+ LN0->isVolatile(), LN0->getAlignment());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::FP_ROUND, N0.getDebugLoc(),
+ N0.getValueType(), ExtLoad, DAG.getIntPtrConstant(1)),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFNEG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ if (isNegatibleForFree(N0, LegalOperations))
+ return GetNegatedExpression(N0, DAG, LegalOperations);
+
+ // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading
+ // constant pool values.
+ if (N0.getOpcode() == ISD::BIT_CONVERT && N0.getNode()->hasOneUse() &&
+ N0.getOperand(0).getValueType().isInteger() &&
+ !N0.getOperand(0).getValueType().isVector()) {
+ SDValue Int = N0.getOperand(0);
+ MVT IntVT = Int.getValueType();
+ if (IntVT.isInteger() && !IntVT.isVector()) {
+ Int = DAG.getNode(ISD::XOR, N0.getDebugLoc(), IntVT, Int,
+ DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
+ AddToWorkList(Int.getNode());
+ return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
+ N->getValueType(0), Int);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFABS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ MVT VT = N->getValueType(0);
+
+ // fold (fabs c1) -> fabs(c1)
+ if (N0CFP && VT != MVT::ppcf128)
+ return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+ // fold (fabs (fabs x)) -> (fabs x)
+ if (N0.getOpcode() == ISD::FABS)
+ return N->getOperand(0);
+ // fold (fabs (fneg x)) -> (fabs x)
+ // fold (fabs (fcopysign x, y)) -> (fabs x)
+ if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0.getOperand(0));
+
+ // Transform fabs(bitconvert(x)) -> bitconvert(x&~sign) to avoid loading
+ // constant pool values.
+ if (N0.getOpcode() == ISD::BIT_CONVERT && N0.getNode()->hasOneUse() &&
+ N0.getOperand(0).getValueType().isInteger() &&
+ !N0.getOperand(0).getValueType().isVector()) {
+ SDValue Int = N0.getOperand(0);
+ MVT IntVT = Int.getValueType();
+ if (IntVT.isInteger() && !IntVT.isVector()) {
+ Int = DAG.getNode(ISD::AND, N0.getDebugLoc(), IntVT, Int,
+ DAG.getConstant(~APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
+ AddToWorkList(Int.getNode());
+ return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
+ N->getValueType(0), Int);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBRCOND(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+
+ // never taken branch, fold to chain
+ if (N1C && N1C->isNullValue())
+ return Chain;
+ // unconditional branch
+ if (N1C && N1C->getAPIntValue() == 1)
+ return DAG.getNode(ISD::BR, N->getDebugLoc(), MVT::Other, Chain, N2);
+ // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
+ // on the target.
+ if (N1.getOpcode() == ISD::SETCC &&
+ TLI.isOperationLegalOrCustom(ISD::BR_CC, MVT::Other)) {
+ return DAG.getNode(ISD::BR_CC, N->getDebugLoc(), MVT::Other,
+ Chain, N1.getOperand(2),
+ N1.getOperand(0), N1.getOperand(1), N2);
+ }
+
+ if (N1.hasOneUse() && N1.getOpcode() == ISD::SRL) {
+ // Match this pattern so that we can generate simpler code:
+ //
+ // %a = ...
+ // %b = and i32 %a, 2
+ // %c = srl i32 %b, 1
+ // brcond i32 %c ...
+ //
+ // into
+ //
+ // %a = ...
+ // %b = and %a, 2
+ // %c = setcc eq %b, 0
+ // brcond %c ...
+ //
+ // This applies only when the AND constant value has one bit set and the
+ // SRL constant is equal to the log2 of the AND constant. The back-end is
+ // smart enough to convert the result into a TEST/JMP sequence.
+ SDValue Op0 = N1.getOperand(0);
+ SDValue Op1 = N1.getOperand(1);
+
+ if (Op0.getOpcode() == ISD::AND &&
+ Op0.hasOneUse() &&
+ Op1.getOpcode() == ISD::Constant) {
+ SDValue AndOp0 = Op0.getOperand(0);
+ SDValue AndOp1 = Op0.getOperand(1);
+
+ if (AndOp1.getOpcode() == ISD::Constant) {
+ const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
+
+ if (AndConst.isPowerOf2() &&
+ cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) {
+ SDValue SetCC =
+ DAG.getSetCC(N->getDebugLoc(),
+ TLI.getSetCCResultType(Op0.getValueType()),
+ Op0, DAG.getConstant(0, Op0.getValueType()),
+ ISD::SETNE);
+
+ // Replace the uses of SRL with SETCC
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
+ removeFromWorkList(N1.getNode());
+ DAG.DeleteNode(N1.getNode());
+ return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+ MVT::Other, Chain, SetCC, N2);
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
+//
+SDValue DAGCombiner::visitBR_CC(SDNode *N) {
+ CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
+ SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
+
+ // Use SimplifySetCC to simplify SETCC's.
+ SDValue Simp = SimplifySetCC(TLI.getSetCCResultType(CondLHS.getValueType()),
+ CondLHS, CondRHS, CC->get(), N->getDebugLoc(),
+ false);
+ if (Simp.getNode()) AddToWorkList(Simp.getNode());
+
+ ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(Simp.getNode());
+
+ // fold br_cc true, dest -> br dest (unconditional branch)
+ if (SCCC && !SCCC->isNullValue())
+ return DAG.getNode(ISD::BR, N->getDebugLoc(), MVT::Other,
+ N->getOperand(0), N->getOperand(4));
+ // fold br_cc false, dest -> unconditional fall through
+ if (SCCC && SCCC->isNullValue())
+ return N->getOperand(0);
+
+ // fold to a simpler setcc
+ if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
+ return DAG.getNode(ISD::BR_CC, N->getDebugLoc(), MVT::Other,
+ N->getOperand(0), Simp.getOperand(2),
+ Simp.getOperand(0), Simp.getOperand(1),
+ N->getOperand(4));
+
+ return SDValue();
+}
+
+/// CombineToPreIndexedLoadStore - Try turning a load / store into a
+/// pre-indexed load / store when the base pointer is an add or subtract
+/// and it has other uses besides the load / store. After the
+/// transformation, the new indexed load / store has effectively folded
+/// the add / subtract in and all of its other uses are redirected to the
+/// new load / store.
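+ /// For example, on a target with pre-increment addressing, a load from
+ /// (add x, 4) whose pointer has other uses can become a pre_inc load that
+ /// also produces x+4 for those uses.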
+bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
+ if (!LegalOperations)
+ return false;
+
+ bool isLoad = true;
+ SDValue Ptr;
+ MVT VT;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ if (LD->isIndexed())
+ return false;
+ VT = LD->getMemoryVT();
+ if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) &&
+ !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT))
+ return false;
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ if (ST->isIndexed())
+ return false;
+ VT = ST->getMemoryVT();
+ if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) &&
+ !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT))
+ return false;
+ Ptr = ST->getBasePtr();
+ isLoad = false;
+ } else {
+ return false;
+ }
+
+ // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
+ // out. There is no reason to make this a preinc/predec.
+ if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
+ Ptr.getNode()->hasOneUse())
+ return false;
+
+ // Ask the target to do addressing mode selection.
+ SDValue BasePtr;
+ SDValue Offset;
+ ISD::MemIndexedMode AM = ISD::UNINDEXED;
+ if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
+ return false;
+ // Don't create an indexed load / store with zero offset.
+ if (isa<ConstantSDNode>(Offset) &&
+ cast<ConstantSDNode>(Offset)->isNullValue())
+ return false;
+
+ // Try turning it into a pre-indexed load / store except when:
+ // 1) The new base ptr is a frame index.
+ // 2) If N is a store and the new base ptr is either the same as or is a
+ // predecessor of the value being stored.
+ // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
+ // that would create a cycle.
+ // 4) All uses are load / store ops that use it as old base ptr.
+
+ // Check #1. Preinc'ing a frame index would require copying the stack pointer
+ // (plus the implicit offset) to a register to preinc anyway.
+ if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+ return false;
+
+ // Check #2.
+ if (!isLoad) {
+ SDValue Val = cast<StoreSDNode>(N)->getValue();
+ if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode()))
+ return false;
+ }
+
+ // Now check for #3 and #4.
+ bool RealUse = false;
+ for (SDNode::use_iterator I = Ptr.getNode()->use_begin(),
+ E = Ptr.getNode()->use_end(); I != E; ++I) {
+ SDNode *Use = *I;
+ if (Use == N)
+ continue;
+ if (Use->isPredecessorOf(N))
+ return false;
+
+ if (!((Use->getOpcode() == ISD::LOAD &&
+ cast<LoadSDNode>(Use)->getBasePtr() == Ptr) ||
+ (Use->getOpcode() == ISD::STORE &&
+ cast<StoreSDNode>(Use)->getBasePtr() == Ptr)))
+ RealUse = true;
+ }
+
+ if (!RealUse)
+ return false;
+
+ SDValue Result;
+ if (isLoad)
+ Result = DAG.getIndexedLoad(SDValue(N,0), N->getDebugLoc(),
+ BasePtr, Offset, AM);
+ else
+ Result = DAG.getIndexedStore(SDValue(N,0), N->getDebugLoc(),
+ BasePtr, Offset, AM);
+ ++PreIndexedNodes;
+ ++NodesCombined;
+ DOUT << "\nReplacing.4 "; DEBUG(N->dump(&DAG));
+ DOUT << "\nWith: "; DEBUG(Result.getNode()->dump(&DAG));
+ DOUT << '\n';
+ WorkListRemover DeadNodes(*this);
+ if (isLoad) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
+ &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
+ &DeadNodes);
+ } else {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
+ &DeadNodes);
+ }
+
+ // Finally, since the node is now dead, remove it from the graph.
+ DAG.DeleteNode(N);
+
+ // Replace the uses of Ptr with uses of the updated base value.
+ DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0),
+ &DeadNodes);
+ removeFromWorkList(Ptr.getNode());
+ DAG.DeleteNode(Ptr.getNode());
+
+ return true;
+}
+
+ /// CombineToPostIndexedLoadStore - Try to combine a load / store with an
+ /// add / sub of the base pointer node into a post-indexed load / store.
+ /// The transformation effectively folds the add / subtract into the new
+ /// indexed load / store, and all uses of the add / subtract are redirected
+ /// to the new load / store.
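+ /// For example, (load x) followed by (add x, 4) feeding later address
+ /// arithmetic can become a post_inc load that yields both the value and x+4.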
+bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
+ if (!LegalOperations)
+ return false;
+
+ bool isLoad = true;
+ SDValue Ptr;
+ MVT VT;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ if (LD->isIndexed())
+ return false;
+ VT = LD->getMemoryVT();
+ if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
+ !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
+ return false;
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ if (ST->isIndexed())
+ return false;
+ VT = ST->getMemoryVT();
+ if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
+ !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
+ return false;
+ Ptr = ST->getBasePtr();
+ isLoad = false;
+ } else {
+ return false;
+ }
+
+ if (Ptr.getNode()->hasOneUse())
+ return false;
+
+ for (SDNode::use_iterator I = Ptr.getNode()->use_begin(),
+ E = Ptr.getNode()->use_end(); I != E; ++I) {
+ SDNode *Op = *I;
+ if (Op == N ||
+ (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
+ continue;
+
+ SDValue BasePtr;
+ SDValue Offset;
+ ISD::MemIndexedMode AM = ISD::UNINDEXED;
+ if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
+ if (Ptr == Offset)
+ std::swap(BasePtr, Offset);
+ if (Ptr != BasePtr)
+ continue;
+ // Don't create an indexed load / store with zero offset.
+ if (isa<ConstantSDNode>(Offset) &&
+ cast<ConstantSDNode>(Offset)->isNullValue())
+ continue;
+
+ // Try turning it into a post-indexed load / store except when
+ // 1) All uses are load / store ops that use it as base ptr.
+ // 2) Op must be independent of N, i.e. Op is neither a predecessor
+ // nor a successor of N. Otherwise, if Op is folded that would
+ // create a cycle.
+
+ if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+ continue;
+
+ // Check for #1.
+ bool TryNext = false;
+ for (SDNode::use_iterator II = BasePtr.getNode()->use_begin(),
+ EE = BasePtr.getNode()->use_end(); II != EE; ++II) {
+ SDNode *Use = *II;
+ if (Use == Ptr.getNode())
+ continue;
+
+ // If all the uses are load / store addresses, then don't do the
+ // transformation.
+ if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
+ bool RealUse = false;
+ for (SDNode::use_iterator III = Use->use_begin(),
+ EEE = Use->use_end(); III != EEE; ++III) {
+ SDNode *UseUse = *III;
+ if (!((UseUse->getOpcode() == ISD::LOAD &&
+ cast<LoadSDNode>(UseUse)->getBasePtr().getNode() == Use) ||
+ (UseUse->getOpcode() == ISD::STORE &&
+ cast<StoreSDNode>(UseUse)->getBasePtr().getNode() == Use)))
+ RealUse = true;
+ }
+
+ if (!RealUse) {
+ TryNext = true;
+ break;
+ }
+ }
+ }
+
+ if (TryNext)
+ continue;
+
+ // Check for #2
+ if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) {
+ SDValue Result = isLoad
+ ? DAG.getIndexedLoad(SDValue(N,0), N->getDebugLoc(),
+ BasePtr, Offset, AM)
+ : DAG.getIndexedStore(SDValue(N,0), N->getDebugLoc(),
+ BasePtr, Offset, AM);
+ ++PostIndexedNodes;
+ ++NodesCombined;
+ DOUT << "\nReplacing.5 "; DEBUG(N->dump(&DAG));
+ DOUT << "\nWith: "; DEBUG(Result.getNode()->dump(&DAG));
+ DOUT << '\n';
+ WorkListRemover DeadNodes(*this);
+ if (isLoad) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
+ &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
+ &DeadNodes);
+ } else {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
+ &DeadNodes);
+ }
+
+ // Finally, since the node is now dead, remove it from the graph.
+ DAG.DeleteNode(N);
+
+ // Replace the uses of Use with uses of the updated base value.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
+ Result.getValue(isLoad ? 1 : 0),
+ &DeadNodes);
+ removeFromWorkList(Op);
+ DAG.DeleteNode(Op);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/// InferAlignment - If we can infer some alignment information from this
+/// pointer, return it.
+static unsigned InferAlignment(SDValue Ptr, SelectionDAG &DAG) {
+ // If this is a direct reference to a stack slot, use information about the
+ // stack slot's alignment.
+ int FrameIdx = 1 << 31;
+ int64_t FrameOffset = 0;
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
+ FrameIdx = FI->getIndex();
+ } else if (Ptr.getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+ isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+ FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ FrameOffset = Ptr.getConstantOperandVal(1);
+ }
+
+ if (FrameIdx != (1 << 31)) {
+ // FIXME: Handle FI+CST.
+ const MachineFrameInfo &MFI = *DAG.getMachineFunction().getFrameInfo();
+ if (MFI.isFixedObjectIndex(FrameIdx)) {
+ int64_t ObjectOffset = MFI.getObjectOffset(FrameIdx) + FrameOffset;
+
+ // The alignment of the frame index can be determined from its offset from
+ // the incoming frame position. If the frame object is at offset 32 and
+ // the stack is guaranteed to be 16-byte aligned, then we know that the
+ // object is 16-byte aligned.
+ unsigned StackAlign = DAG.getTarget().getFrameInfo()->getStackAlignment();
+ unsigned Align = MinAlign(ObjectOffset, StackAlign);
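+ // e.g. ObjectOffset 36 with StackAlign 16 gives MinAlign(36, 16) == 4.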
+
+ // Finally, the frame object itself may have a known alignment. Factor
+ // the alignment + offset into a new alignment. For example, if we know
+ // the FI is 8 byte aligned, but the pointer is 4 off, we really have a
+ // 4-byte alignment of the resultant pointer. Likewise align 4 + 4-byte
+ // offset = 4-byte alignment, align 4 + 1-byte offset = align 1, etc.
+ unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
+ FrameOffset);
+ return std::max(Align, FIInfoAlign);
+ }
+ }
+
+ return 0;
+}
+
+SDValue DAGCombiner::visitLOAD(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+
+ // Try to infer better alignment information than the load already has.
+ if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
+ if (unsigned Align = InferAlignment(Ptr, DAG)) {
+ if (Align > LD->getAlignment())
+ return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(),
+ LD->getValueType(0),
+ Chain, Ptr, LD->getSrcValue(),
+ LD->getSrcValueOffset(), LD->getMemoryVT(),
+ LD->isVolatile(), Align);
+ }
+ }
+
+ // If load is not volatile and there are no uses of the loaded value (and
+ // the updated indexed value in case of indexed loads), change uses of the
+ // chain value into uses of the chain input (i.e. delete the dead load).
+ if (!LD->isVolatile()) {
+ if (N->getValueType(1) == MVT::Other) {
+ // Unindexed loads.
+ if (N->hasNUsesOfValue(0, 0)) {
+ // It's not safe to use the two value CombineTo variant here. e.g.
+ // v1, chain2 = load chain1, loc
+ // v2, chain3 = load chain2, loc
+ // v3 = add v2, c
+ // Now we replace use of chain2 with chain1. This makes the second load
+ // isomorphic to the one we are deleting, and thus makes this load live.
+ DOUT << "\nReplacing.6 "; DEBUG(N->dump(&DAG));
+ DOUT << "\nWith chain: "; DEBUG(Chain.getNode()->dump(&DAG));
+ DOUT << "\n";
+ WorkListRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain, &DeadNodes);
+
+ if (N->use_empty()) {
+ removeFromWorkList(N);
+ DAG.DeleteNode(N);
+ }
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ } else {
+ // Indexed loads.
+ assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
+ if (N->hasNUsesOfValue(0, 0) && N->hasNUsesOfValue(0, 1)) {
+ SDValue Undef = DAG.getUNDEF(N->getValueType(0));
+ DOUT << "\nReplacing.6 "; DEBUG(N->dump(&DAG));
+ DOUT << "\nWith: "; DEBUG(Undef.getNode()->dump(&DAG));
+ DOUT << " and 2 other values\n";
+ WorkListRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
+ DAG.getUNDEF(N->getValueType(1)),
+ &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain, &DeadNodes);
+ removeFromWorkList(N);
+ DAG.DeleteNode(N);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // If this load is directly stored, replace the load value with the stored
+ // value.
+ // TODO: Handle store large -> read small portion.
+ // TODO: Handle TRUNCSTORE/LOADEXT
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ !LD->isVolatile()) {
+ if (ISD::isNON_TRUNCStore(Chain.getNode())) {
+ StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
+ if (PrevST->getBasePtr() == Ptr &&
+ PrevST->getValue().getValueType() == N->getValueType(0))
+ return CombineTo(N, Chain.getOperand(1), Chain);
+ }
+ }
+
+ if (CombinerAA) {
+ // Walk up chain skipping non-aliasing memory nodes.
+ SDValue BetterChain = FindBetterChain(N, Chain);
+
+ // If there is a better chain.
+ if (Chain != BetterChain) {
+ SDValue ReplLoad;
+
+ // Replace the chain to avoid dependency.
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
+ ReplLoad = DAG.getLoad(N->getValueType(0), LD->getDebugLoc(),
+ BetterChain, Ptr,
+ LD->getSrcValue(), LD->getSrcValueOffset(),
+ LD->isVolatile(), LD->getAlignment());
+ } else {
+ ReplLoad = DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(),
+ LD->getValueType(0),
+ BetterChain, Ptr, LD->getSrcValue(),
+ LD->getSrcValueOffset(),
+ LD->getMemoryVT(),
+ LD->isVolatile(),
+ LD->getAlignment());
+ }
+
+ // Create token factor to keep old chain connected.
+ SDValue Token = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+ MVT::Other, Chain, ReplLoad.getValue(1));
+
+ // Replace uses with load result and token factor. Don't add users
+ // to work list.
+ return CombineTo(N, ReplLoad.getValue(0), Token, false);
+ }
+ }
+
+ // Try transforming N to an indexed load.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+
+ /// ReduceLoadOpStoreWidth - Look for a sequence of load / op / store where op
+ /// is one of 'or', 'xor', or 'and' with an immediate. If 'op' only touches some
+/// of the loaded bits, try narrowing the load and store if it would end up
+/// being a win for performance or code size.
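+ /// e.g. on little-endian, (store (or (i32 (load p)), 0x00ff0000), p) can be
+ /// narrowed to an i8 load / or / store at p+2, touching only the changed byte.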
+SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (ST->isVolatile())
+ return SDValue();
+
+ SDValue Chain = ST->getChain();
+ SDValue Value = ST->getValue();
+ SDValue Ptr = ST->getBasePtr();
+ MVT VT = Value.getValueType();
+
+ if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+ return SDValue();
+
+ unsigned Opc = Value.getOpcode();
+ if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
+ Value.getOperand(1).getOpcode() != ISD::Constant)
+ return SDValue();
+
+ SDValue N0 = Value.getOperand(0);
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
+ LoadSDNode *LD = cast<LoadSDNode>(N0);
+ if (LD->getBasePtr() != Ptr)
+ return SDValue();
+
+ // Find the type to narrow the load / op / store to.
+ SDValue N1 = Value.getOperand(1);
+ unsigned BitWidth = N1.getValueSizeInBits();
+ APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
+ if (Opc == ISD::AND)
+ Imm ^= APInt::getAllOnesValue(BitWidth);
+ if (Imm == 0 || Imm.isAllOnesValue())
+ return SDValue();
+ unsigned ShAmt = Imm.countTrailingZeros();
+ unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
+ unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+ MVT NewVT = MVT::getIntegerVT(NewBW);
+ while (NewBW < BitWidth &&
+ !(TLI.isOperationLegalOrCustom(Opc, NewVT) &&
+ TLI.isNarrowingProfitable(VT, NewVT))) {
+ NewBW = NextPowerOf2(NewBW);
+ NewVT = MVT::getIntegerVT(NewBW);
+ }
+ if (NewBW >= BitWidth)
+ return SDValue();
+
+ // If the lowest changed bit does not fall on a NewBW boundary, round
+ // ShAmt down to the previous boundary.
+ if (ShAmt % NewBW)
+ ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+ APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, ShAmt + NewBW);
+ if ((Imm & Mask) == Imm) {
+ APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
+ if (Opc == ISD::AND)
+ NewImm ^= APInt::getAllOnesValue(NewBW);
+ uint64_t PtrOff = ShAmt / 8;
+ // For big endian targets, we need to adjust the offset to the pointer to
+ // load the correct bytes.
+ if (TLI.isBigEndian())
+ PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+
+ unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
+ if (NewAlign <
+ TLI.getTargetData()->getABITypeAlignment(NewVT.getTypeForMVT()))
+ return SDValue();
+
+ SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
+ Ptr.getValueType(), Ptr,
+ DAG.getConstant(PtrOff, Ptr.getValueType()));
+ SDValue NewLD = DAG.getLoad(NewVT, N0.getDebugLoc(),
+ LD->getChain(), NewPtr,
+ LD->getSrcValue(), LD->getSrcValueOffset(),
+ LD->isVolatile(), NewAlign);
+ SDValue NewVal = DAG.getNode(Opc, Value.getDebugLoc(), NewVT, NewLD,
+ DAG.getConstant(NewImm, NewVT));
+ SDValue NewST = DAG.getStore(Chain, N->getDebugLoc(),
+ NewVal, NewPtr,
+ ST->getSrcValue(), ST->getSrcValueOffset(),
+ false, NewAlign);
+
+ AddToWorkList(NewPtr.getNode());
+ AddToWorkList(NewLD.getNode());
+ AddToWorkList(NewVal.getNode());
+ WorkListRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1),
+ &DeadNodes);
+ ++OpsNarrowed;
+ return NewST;
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSTORE(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Chain = ST->getChain();
+ SDValue Value = ST->getValue();
+ SDValue Ptr = ST->getBasePtr();
+
+ // Try to infer better alignment information than the store already has.
+ if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
+ if (unsigned Align = InferAlignment(Ptr, DAG)) {
+ if (Align > ST->getAlignment())
+ return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
+ Ptr, ST->getSrcValue(),
+ ST->getSrcValueOffset(), ST->getMemoryVT(),
+ ST->isVolatile(), Align);
+ }
+ }
+
+ // If this is a store of a bit convert, store the input value if the
+ // resultant store does not need a higher alignment than the original.
+ if (Value.getOpcode() == ISD::BIT_CONVERT && !ST->isTruncatingStore() &&
+ ST->isUnindexed()) {
+ unsigned OrigAlign = ST->getAlignment();
+ MVT SVT = Value.getOperand(0).getValueType();
+ unsigned Align = TLI.getTargetData()->
+ getABITypeAlignment(SVT.getTypeForMVT());
+ if (Align <= OrigAlign &&
+ ((!LegalOperations && !ST->isVolatile()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
+ return DAG.getStore(Chain, N->getDebugLoc(), Value.getOperand(0),
+ Ptr, ST->getSrcValue(),
+ ST->getSrcValueOffset(), ST->isVolatile(), OrigAlign);
+ }
+
+ // Turn 'store float 1.0, Ptr' -> 'store int 0x3f800000, Ptr'
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Value)) {
+ // NOTE: If the original store is volatile, this transform must not increase
+ // the number of stores. For example, on x86-32 an f64 can be stored in one
+ // processor operation but an i64 (which is not legal) requires two. So the
+ // transform should not be done in this case.
+ if (Value.getOpcode() != ISD::TargetConstantFP) {
+ SDValue Tmp;
+ switch (CFP->getValueType(0).getSimpleVT()) {
+ default: assert(0 && "Unknown FP type");
+ case MVT::f80: // We don't do this for these yet.
+ case MVT::f128:
+ case MVT::ppcf128:
+ break;
+ case MVT::f32:
+ if (((TLI.isTypeLegal(MVT::i32) || !LegalTypes) && !LegalOperations &&
+ !ST->isVolatile()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+ Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
+ bitcastToAPInt().getZExtValue(), MVT::i32);
+ return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
+ Ptr, ST->getSrcValue(),
+ ST->getSrcValueOffset(), ST->isVolatile(),
+ ST->getAlignment());
+ }
+ break;
+ case MVT::f64:
+ if (((TLI.isTypeLegal(MVT::i64) || !LegalTypes) && !LegalOperations &&
+ !ST->isVolatile()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
+ Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+ getZExtValue(), MVT::i64);
+ return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
+ Ptr, ST->getSrcValue(),
+ ST->getSrcValueOffset(), ST->isVolatile(),
+ ST->getAlignment());
+ } else if (!ST->isVolatile() &&
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+ // Many FP stores are not made apparent until after legalize, e.g. for
+ // argument passing. Since this is so common, custom legalize the
+ // 64-bit integer store into two 32-bit stores.
+ uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+ SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, MVT::i32);
+ SDValue Hi = DAG.getConstant(Val >> 32, MVT::i32);
+ if (TLI.isBigEndian()) std::swap(Lo, Hi);
+
+ int SVOffset = ST->getSrcValueOffset();
+ unsigned Alignment = ST->getAlignment();
+ bool isVolatile = ST->isVolatile();
+
+ SDValue St0 = DAG.getStore(Chain, ST->getDebugLoc(), Lo,
+ Ptr, ST->getSrcValue(),
+ ST->getSrcValueOffset(),
+ isVolatile, ST->getAlignment());
+ Ptr = DAG.getNode(ISD::ADD, N->getDebugLoc(), Ptr.getValueType(), Ptr,
+ DAG.getConstant(4, Ptr.getValueType()));
+ SVOffset += 4;
+ Alignment = MinAlign(Alignment, 4U);
+ SDValue St1 = DAG.getStore(Chain, ST->getDebugLoc(), Hi,
+ Ptr, ST->getSrcValue(),
+ SVOffset, isVolatile, Alignment);
+ return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
+ St0, St1);
+ }
+
+ break;
+ }
+ }
+ }
+
+ if (CombinerAA) {
+ // Walk up chain skipping non-aliasing memory nodes.
+ SDValue BetterChain = FindBetterChain(N, Chain);
+
+ // If there is a better chain.
+ if (Chain != BetterChain) {
+ // Replace the chain to avoid dependency.
+ SDValue ReplStore;
+ if (ST->isTruncatingStore()) {
+ ReplStore = DAG.getTruncStore(BetterChain, N->getDebugLoc(), Value, Ptr,
+ ST->getSrcValue(),ST->getSrcValueOffset(),
+ ST->getMemoryVT(),
+ ST->isVolatile(), ST->getAlignment());
+ } else {
+ ReplStore = DAG.getStore(BetterChain, N->getDebugLoc(), Value, Ptr,
+ ST->getSrcValue(), ST->getSrcValueOffset(),
+ ST->isVolatile(), ST->getAlignment());
+ }
+
+ // Create token to keep both nodes around.
+ SDValue Token = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+ MVT::Other, Chain, ReplStore);
+
+ // Don't add users to work list.
+ return CombineTo(N, Token, false);
+ }
+ }
+
+ // Try transforming N to an indexed store.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
+ // FIXME: is there such a thing as a truncating indexed store?
+ if (ST->isTruncatingStore() && ST->isUnindexed() &&
+ Value.getValueType().isInteger()) {
+ // See if we can simplify the input to this truncstore with knowledge that
+ // only the low bits are being used. For example:
+ // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
+ SDValue Shorter =
+ GetDemandedBits(Value,
+ APInt::getLowBitsSet(Value.getValueSizeInBits(),
+ ST->getMemoryVT().getSizeInBits()));
+ AddToWorkList(Value.getNode());
+ if (Shorter.getNode())
+ return DAG.getTruncStore(Chain, N->getDebugLoc(), Shorter,
+ Ptr, ST->getSrcValue(),
+ ST->getSrcValueOffset(), ST->getMemoryVT(),
+ ST->isVolatile(), ST->getAlignment());
+
+ // Otherwise, see if we can simplify the operation with
+ // SimplifyDemandedBits, which only works if the value has a single use.
+ if (SimplifyDemandedBits(Value,
+ APInt::getLowBitsSet(
+ Value.getValueSizeInBits(),
+ ST->getMemoryVT().getSizeInBits())))
+ return SDValue(N, 0);
+ }
+
+ // If this is a load followed by a store to the same location, then the store
+ // is dead/noop.
+ if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
+ if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
+ ST->isUnindexed() && !ST->isVolatile() &&
+ // There can't be any side effects between the load and store, such as
+ // a call or store.
+ Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
+ // The store is dead, remove it.
+ return Chain;
+ }
+ }
+
+ // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
+ // truncating store. We can do this even if this is already a truncstore.
+ if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
+ && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ ST->getMemoryVT())) {
+ return DAG.getTruncStore(Chain, N->getDebugLoc(), Value.getOperand(0),
+ Ptr, ST->getSrcValue(),
+ ST->getSrcValueOffset(), ST->getMemoryVT(),
+ ST->isVolatile(), ST->getAlignment());
+ }
+
+ return ReduceLoadOpStoreWidth(N);
+}
+
+SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
+ SDValue InVec = N->getOperand(0);
+ SDValue InVal = N->getOperand(1);
+ SDValue EltNo = N->getOperand(2);
+
+ // If the invec is a BUILD_VECTOR and if EltNo is a constant, build a new
+ // vector with the inserted element.
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR && isa<ConstantSDNode>(EltNo)) {
+ unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ SmallVector<SDValue, 8> Ops(InVec.getNode()->op_begin(),
+ InVec.getNode()->op_end());
+ if (Elt < Ops.size())
+ Ops[Elt] = InVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+ InVec.getValueType(), &Ops[0], Ops.size());
+ }
+ // If the invec is an UNDEF and if EltNo is a constant, create a new
+ // BUILD_VECTOR with undef elements and the inserted element.
+ if (!LegalOperations && InVec.getOpcode() == ISD::UNDEF &&
+ isa<ConstantSDNode>(EltNo)) {
+ MVT VT = InVec.getValueType();
+ MVT EVT = VT.getVectorElementType();
+ unsigned NElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Ops(NElts, DAG.getUNDEF(EVT));
+
+ unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ if (Elt < Ops.size())
+ Ops[Elt] = InVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+ InVec.getValueType(), &Ops[0], Ops.size());
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
+ // (vextract (scalar_to_vector val, 0) -> val
+ SDValue InVec = N->getOperand(0);
+
+ if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ // If the operand is wider than the vector element type then it is implicitly
+ // truncated. Make that explicit here.
+ MVT EltVT = InVec.getValueType().getVectorElementType();
+ SDValue InOp = InVec.getOperand(0);
+ if (InOp.getValueType() != EltVT)
+ return DAG.getNode(ISD::TRUNCATE, InVec.getDebugLoc(), EltVT, InOp);
+ return InOp;
+ }
+
+ // Perform only after legalization to ensure build_vector / vector_shuffle
+ // optimizations have already been done.
+ if (!LegalOperations) return SDValue();
+
+ // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
+ // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
+ // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
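+ // e.g. (vextract (v4f32 load $addr), 2) -> (f32 load $addr+8)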
+ SDValue EltNo = N->getOperand(1);
+
+ if (isa<ConstantSDNode>(EltNo)) {
+ unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ bool NewLoad = false;
+ bool BCNumEltsChanged = false;
+ MVT VT = InVec.getValueType();
+ MVT EVT = VT.getVectorElementType();
+ MVT LVT = EVT;
+
+ if (InVec.getOpcode() == ISD::BIT_CONVERT) {
+ MVT BCVT = InVec.getOperand(0).getValueType();
+ if (!BCVT.isVector() || EVT.bitsGT(BCVT.getVectorElementType()))
+ return SDValue();
+ if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
+ BCNumEltsChanged = true;
+ InVec = InVec.getOperand(0);
+ EVT = BCVT.getVectorElementType();
+ NewLoad = true;
+ }
+
+ LoadSDNode *LN0 = NULL;
+ const ShuffleVectorSDNode *SVN = NULL;
+ if (ISD::isNormalLoad(InVec.getNode())) {
+ LN0 = cast<LoadSDNode>(InVec);
+ } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ InVec.getOperand(0).getValueType() == EVT &&
+ ISD::isNormalLoad(InVec.getOperand(0).getNode())) {
+ LN0 = cast<LoadSDNode>(InVec.getOperand(0));
+ } else if ((SVN = dyn_cast<ShuffleVectorSDNode>(InVec))) {
+ // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
+ // =>
+ // (load $addr+1*size)
+
+ // If the bit convert changed the number of elements, it is unsafe
+ // to examine the mask.
+ if (BCNumEltsChanged)
+ return SDValue();
+
+ // Select the input vector, guarding against an out-of-range extract index.
+ unsigned NumElems = VT.getVectorNumElements();
+ int Idx = (Elt >= NumElems) ? -1 : SVN->getMaskElt(Elt);
+ InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1);
+
+ if (InVec.getOpcode() == ISD::BIT_CONVERT)
+ InVec = InVec.getOperand(0);
+ if (ISD::isNormalLoad(InVec.getNode())) {
+ LN0 = cast<LoadSDNode>(InVec);
+ Elt = (Idx < (int)NumElems) ? Idx : Idx - NumElems;
+ }
+ }
+
+ if (!LN0 || !LN0->hasOneUse() || LN0->isVolatile())
+ return SDValue();
+
+ unsigned Align = LN0->getAlignment();
+ if (NewLoad) {
+ // Check the resultant load doesn't need a higher alignment than the
+ // original load.
+ unsigned NewAlign =
+ TLI.getTargetData()->getABITypeAlignment(LVT.getTypeForMVT());
+
+ if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, LVT))
+ return SDValue();
+
+ Align = NewAlign;
+ }
+
+ SDValue NewPtr = LN0->getBasePtr();
+ if (Elt) {
+ unsigned PtrOff = LVT.getSizeInBits() * Elt / 8;
+ MVT PtrType = NewPtr.getValueType();
+ if (TLI.isBigEndian())
+ PtrOff = VT.getSizeInBits() / 8 - PtrOff;
+ NewPtr = DAG.getNode(ISD::ADD, N->getDebugLoc(), PtrType, NewPtr,
+ DAG.getConstant(PtrOff, PtrType));
+ }
+
+ return DAG.getLoad(LVT, N->getDebugLoc(), LN0->getChain(), NewPtr,
+ LN0->getSrcValue(), LN0->getSrcValueOffset(),
+ LN0->isVolatile(), Align);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
+ unsigned NumInScalars = N->getNumOperands();
+ MVT VT = N->getValueType(0);
+ MVT EltType = VT.getVectorElementType();
+
+ // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
+ // operations. If so, and if the EXTRACT_VECTOR_ELT vector inputs come from
+ // at most two distinct vectors, turn this into a shuffle node.
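+  // For example, a four-element build_vector of
+  //   (extract_elt A, 0), (extract_elt B, 1), (extract_elt A, 2),
+  //   (extract_elt B, 3)
+  // can become (vector_shuffle A, B, <0, 5, 2, 7>); indices into the second
+  // input vector are biased by the number of elements in the result.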
+ SDValue VecIn1, VecIn2;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ // Ignore undef inputs.
+ if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+
+ // If this input is something other than a EXTRACT_VECTOR_ELT with a
+ // constant index, bail out.
+ if (N->getOperand(i).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(N->getOperand(i).getOperand(1))) {
+ VecIn1 = VecIn2 = SDValue(0, 0);
+ break;
+ }
+
+ // If the input vector type disagrees with the result of the build_vector,
+ // we can't make a shuffle.
+ SDValue ExtractedFromVec = N->getOperand(i).getOperand(0);
+ if (ExtractedFromVec.getValueType() != VT) {
+ VecIn1 = VecIn2 = SDValue(0, 0);
+ break;
+ }
+
+ // Otherwise, remember this. We allow up to two distinct input vectors.
+ if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2)
+ continue;
+
+ if (VecIn1.getNode() == 0) {
+ VecIn1 = ExtractedFromVec;
+ } else if (VecIn2.getNode() == 0) {
+ VecIn2 = ExtractedFromVec;
+ } else {
+ // Too many inputs.
+ VecIn1 = VecIn2 = SDValue(0, 0);
+ break;
+ }
+ }
+
+ // If everything is good, we can make a shuffle operation.
+ if (VecIn1.getNode()) {
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ if (N->getOperand(i).getOpcode() == ISD::UNDEF) {
+ Mask.push_back(-1);
+ continue;
+ }
+
+ // If extracting from the first vector, just use the index directly.
+ SDValue Extract = N->getOperand(i);
+ SDValue ExtVal = Extract.getOperand(1);
+ if (Extract.getOperand(0) == VecIn1) {
+ unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
+ if (ExtIndex > VT.getVectorNumElements())
+ return SDValue();
+
+ Mask.push_back(ExtIndex);
+ continue;
+ }
+
+      // Otherwise, it's from the second vector; use Idx + NumInScalars.
+ unsigned Idx = cast<ConstantSDNode>(ExtVal)->getZExtValue();
+ Mask.push_back(Idx+NumInScalars);
+ }
+
+    // If types have been legalized and the result vector type isn't legal,
+    // we can't emit a shuffle of this type.
+    if (!TLI.isTypeLegal(VT) && LegalTypes)
+      return SDValue();
+
+ // Return the new VECTOR_SHUFFLE node.
+ SDValue Ops[2];
+ Ops[0] = VecIn1;
+ Ops[1] = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+ return DAG.getVectorShuffle(VT, N->getDebugLoc(), Ops[0], Ops[1], &Mask[0]);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
+ // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of
+ // EXTRACT_SUBVECTOR operations. If so, and if the EXTRACT_SUBVECTOR vector
+ // inputs come from at most two distinct vectors, turn this into a shuffle
+ // node.
+
+ // If we only have one input vector, we don't need to do any concatenation.
+ if (N->getNumOperands() == 1)
+ return N->getOperand(0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
+  // FIXME: This combine is disabled for now; the early return below makes
+  // the splat-handling code that follows unreachable until the
+  // canonicalizations mentioned below are implemented.
+  return SDValue();
+
+ MVT VT = N->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ assert(N0.getValueType().getVectorNumElements() == NumElts &&
+ "Vector shuffle must be normalized in DAG");
+
+ // FIXME: implement canonicalizations from DAG.getVectorShuffle()
+
+ // If it is a splat, check if the argument vector is a build_vector with
+ // all scalar elements the same.
+ if (cast<ShuffleVectorSDNode>(N)->isSplat()) {
+ SDNode *V = N0.getNode();
+
+ // If this is a bit convert that changes the element type of the vector but
+ // not the number of vector elements, look through it. Be careful not to
+ // look though conversions that change things like v4f32 to v2f64.
+ if (V->getOpcode() == ISD::BIT_CONVERT) {
+ SDValue ConvInput = V->getOperand(0);
+ if (ConvInput.getValueType().isVector() &&
+ ConvInput.getValueType().getVectorNumElements() == NumElts)
+ V = ConvInput.getNode();
+ }
+
+ if (V->getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElems = V->getNumOperands();
+ unsigned BaseIdx = cast<ShuffleVectorSDNode>(N)->getSplatIndex();
+ if (NumElems > BaseIdx) {
+ SDValue Base;
+ bool AllSame = true;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (V->getOperand(i).getOpcode() != ISD::UNDEF) {
+ Base = V->getOperand(i);
+ break;
+ }
+ }
+ // Splat of <u, u, u, u>, return <u, u, u, u>
+ if (!Base.getNode())
+ return N0;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (V->getOperand(i) != Base) {
+ AllSame = false;
+ break;
+ }
+ }
+ // Splat of <x, x, x, x>, return <x, x, x, x>
+ if (AllSame)
+ return N0;
+ }
+ }
+ }
+ return SDValue();
+}
+
+/// XformToShuffleWithZero - Returns a vector_shuffle if it is able to
+/// transform an AND to a vector_shuffle with the destination vector and a
+/// zero vector. e.g. AND V, <0xffffffff, 0, 0xffffffff, 0> ==>
+/// vector_shuffle V, Zero, <0, 4, 2, 4>
+SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (N->getOpcode() == ISD::AND) {
+ if (RHS.getOpcode() == ISD::BIT_CONVERT)
+ RHS = RHS.getOperand(0);
+ if (RHS.getOpcode() == ISD::BUILD_VECTOR) {
+ SmallVector<int, 8> Indices;
+ unsigned NumElts = RHS.getNumOperands();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Elt = RHS.getOperand(i);
+ if (!isa<ConstantSDNode>(Elt))
+ return SDValue();
+ else if (cast<ConstantSDNode>(Elt)->isAllOnesValue())
+ Indices.push_back(i);
+ else if (cast<ConstantSDNode>(Elt)->isNullValue())
+ Indices.push_back(NumElts);
+ else
+ return SDValue();
+ }
+
+ // Let's see if the target supports this vector_shuffle.
+ MVT RVT = RHS.getValueType();
+ if (!TLI.isVectorClearMaskLegal(Indices, RVT))
+ return SDValue();
+
+ // Return the new VECTOR_SHUFFLE node.
+ MVT EVT = RVT.getVectorElementType();
+ SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(),
+ DAG.getConstant(0, EVT));
+ SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+ RVT, &ZeroOps[0], ZeroOps.size());
+ LHS = DAG.getNode(ISD::BIT_CONVERT, dl, RVT, LHS);
+ SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuf);
+ }
+ }
+
+ return SDValue();
+}
+
+/// SimplifyVBinOp - Visit a binary vector operation, like ADD.
+SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
+ // After legalize, the target may be depending on adds and other
+ // binary ops to provide legal ways to construct constants or other
+ // things. Simplifying them may result in a loss of legality.
+ if (LegalOperations) return SDValue();
+
+ MVT VT = N->getValueType(0);
+ assert(VT.isVector() && "SimplifyVBinOp only works on vectors!");
+
+ MVT EltType = VT.getVectorElementType();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Shuffle = XformToShuffleWithZero(N);
+ if (Shuffle.getNode()) return Shuffle;
+
+ // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold
+ // this operation.
+ if (LHS.getOpcode() == ISD::BUILD_VECTOR &&
+ RHS.getOpcode() == ISD::BUILD_VECTOR) {
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
+ SDValue LHSOp = LHS.getOperand(i);
+ SDValue RHSOp = RHS.getOperand(i);
+ // If these two elements can't be folded, bail out.
+ if ((LHSOp.getOpcode() != ISD::UNDEF &&
+ LHSOp.getOpcode() != ISD::Constant &&
+ LHSOp.getOpcode() != ISD::ConstantFP) ||
+ (RHSOp.getOpcode() != ISD::UNDEF &&
+ RHSOp.getOpcode() != ISD::Constant &&
+ RHSOp.getOpcode() != ISD::ConstantFP))
+ break;
+
+ // Can't fold divide by zero.
+ if (N->getOpcode() == ISD::SDIV || N->getOpcode() == ISD::UDIV ||
+ N->getOpcode() == ISD::FDIV) {
+ if ((RHSOp.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(RHSOp.getNode())->isNullValue()) ||
+ (RHSOp.getOpcode() == ISD::ConstantFP &&
+ cast<ConstantFPSDNode>(RHSOp.getNode())->getValueAPF().isZero()))
+ break;
+ }
+
+ Ops.push_back(DAG.getNode(N->getOpcode(), LHS.getDebugLoc(),
+ EltType, LHSOp, RHSOp));
+ AddToWorkList(Ops.back().getNode());
+ assert((Ops.back().getOpcode() == ISD::UNDEF ||
+ Ops.back().getOpcode() == ISD::Constant ||
+ Ops.back().getOpcode() == ISD::ConstantFP) &&
+ "Scalar binop didn't fold!");
+ }
+
+ if (Ops.size() == LHS.getNumOperands()) {
+ MVT VT = LHS.getValueType();
+ return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+ &Ops[0], Ops.size());
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::SimplifySelect(DebugLoc DL, SDValue N0,
+ SDValue N1, SDValue N2){
+  assert(N0.getOpcode() == ISD::SETCC &&
+         "First argument must be a SetCC node!");
+
+ SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
+ // If we got a simplified select_cc node back from SimplifySelectCC, then
+ // break it down into a new SETCC node, and a new SELECT node, and then return
+ // the SELECT node, since we were called with a SELECT node.
+ if (SCC.getNode()) {
+ // Check to see if we got a select_cc back (to turn into setcc/select).
+ // Otherwise, just return whatever node we got back, like fabs.
+ if (SCC.getOpcode() == ISD::SELECT_CC) {
+ SDValue SETCC = DAG.getNode(ISD::SETCC, N0.getDebugLoc(),
+ N0.getValueType(),
+ SCC.getOperand(0), SCC.getOperand(1),
+ SCC.getOperand(4));
+ AddToWorkList(SETCC.getNode());
+ return DAG.getNode(ISD::SELECT, SCC.getDebugLoc(), SCC.getValueType(),
+ SCC.getOperand(2), SCC.getOperand(3), SETCC);
+ }
+
+ return SCC;
+ }
+ return SDValue();
+}
+
+/// SimplifySelectOps - Given a SELECT or a SELECT_CC node, where LHS and RHS
+/// are the two values being selected between, see if we can simplify the
+/// select. Callers of this should assume that TheSelect is deleted if this
+/// returns true. As such, they should return the appropriate thing (e.g. the
+/// node) back to the top-level of the DAG combiner loop to avoid it being
+/// looked at.
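+///
+/// For example, (select C, (load P1), (load P2)) with identical token chains
+/// can be simplified to (load (select C, P1, P2)), replacing two loads with
+/// one.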
+bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
+ SDValue RHS) {
+
+ // If this is a select from two identical things, try to pull the operation
+ // through the select.
+ if (LHS.getOpcode() == RHS.getOpcode() && LHS.hasOneUse() && RHS.hasOneUse()){
+ // If this is a load and the token chain is identical, replace the select
+ // of two loads with a load through a select of the address to load from.
+ // This triggers in things like "select bool X, 10.0, 123.0" after the FP
+ // constants have been dropped into the constant pool.
+ if (LHS.getOpcode() == ISD::LOAD &&
+ // Do not let this transformation reduce the number of volatile loads.
+ !cast<LoadSDNode>(LHS)->isVolatile() &&
+ !cast<LoadSDNode>(RHS)->isVolatile() &&
+ // Token chains must be identical.
+ LHS.getOperand(0) == RHS.getOperand(0)) {
+ LoadSDNode *LLD = cast<LoadSDNode>(LHS);
+ LoadSDNode *RLD = cast<LoadSDNode>(RHS);
+
+      // If this is an EXTLOAD, the VTs must match.
+ if (LLD->getMemoryVT() == RLD->getMemoryVT()) {
+ // FIXME: this conflates two src values, discarding one. This is not
+ // the right thing to do, but nothing uses srcvalues now. When they do,
+ // turn SrcValue into a list of locations.
+ SDValue Addr;
+ if (TheSelect->getOpcode() == ISD::SELECT) {
+ // Check that the condition doesn't reach either load. If so, folding
+ // this will induce a cycle into the DAG.
+ if (!LLD->isPredecessorOf(TheSelect->getOperand(0).getNode()) &&
+ !RLD->isPredecessorOf(TheSelect->getOperand(0).getNode())) {
+ Addr = DAG.getNode(ISD::SELECT, TheSelect->getDebugLoc(),
+ LLD->getBasePtr().getValueType(),
+ TheSelect->getOperand(0), LLD->getBasePtr(),
+ RLD->getBasePtr());
+ }
+ } else {
+ // Check that the condition doesn't reach either load. If so, folding
+ // this will induce a cycle into the DAG.
+ if (!LLD->isPredecessorOf(TheSelect->getOperand(0).getNode()) &&
+ !RLD->isPredecessorOf(TheSelect->getOperand(0).getNode()) &&
+ !LLD->isPredecessorOf(TheSelect->getOperand(1).getNode()) &&
+ !RLD->isPredecessorOf(TheSelect->getOperand(1).getNode())) {
+ Addr = DAG.getNode(ISD::SELECT_CC, TheSelect->getDebugLoc(),
+ LLD->getBasePtr().getValueType(),
+ TheSelect->getOperand(0),
+ TheSelect->getOperand(1),
+ LLD->getBasePtr(), RLD->getBasePtr(),
+ TheSelect->getOperand(4));
+ }
+ }
+
+ if (Addr.getNode()) {
+ SDValue Load;
+ if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
+ Load = DAG.getLoad(TheSelect->getValueType(0),
+ TheSelect->getDebugLoc(),
+ LLD->getChain(),
+ Addr,LLD->getSrcValue(),
+ LLD->getSrcValueOffset(),
+ LLD->isVolatile(),
+ LLD->getAlignment());
+ } else {
+ Load = DAG.getExtLoad(LLD->getExtensionType(),
+ TheSelect->getDebugLoc(),
+ TheSelect->getValueType(0),
+ LLD->getChain(), Addr, LLD->getSrcValue(),
+ LLD->getSrcValueOffset(),
+ LLD->getMemoryVT(),
+ LLD->isVolatile(),
+ LLD->getAlignment());
+ }
+
+ // Users of the select now use the result of the load.
+ CombineTo(TheSelect, Load);
+
+ // Users of the old loads now use the new load's chain. We know the
+ // old-load value is dead now.
+ CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
+ CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// SimplifySelectCC - Simplify an expression of the form (N0 cond N1) ? N2 : N3
+/// where 'cond' is the comparison specified by CC.
+SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
+ SDValue N2, SDValue N3,
+ ISD::CondCode CC, bool NotExtCompare) {
+ // (x ? y : y) -> y.
+ if (N2 == N3) return N2;
+
+ MVT VT = N2.getValueType();
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+ ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
+
+ // Determine if the condition we're dealing with is constant
+ SDValue SCC = SimplifySetCC(TLI.getSetCCResultType(N0.getValueType()),
+ N0, N1, CC, DL, false);
+ if (SCC.getNode()) AddToWorkList(SCC.getNode());
+ ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode());
+
+ // fold select_cc true, x, y -> x
+ if (SCCC && !SCCC->isNullValue())
+ return N2;
+ // fold select_cc false, x, y -> y
+ if (SCCC && SCCC->isNullValue())
+ return N3;
+
+ // Check to see if we can simplify the select into an fabs node
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) {
+ // Allow either -0.0 or 0.0
+ if (CFP->getValueAPF().isZero()) {
+ // select (setg[te] X, +/-0.0), X, fneg(X) -> fabs
+ if ((CC == ISD::SETGE || CC == ISD::SETGT) &&
+ N0 == N2 && N3.getOpcode() == ISD::FNEG &&
+ N2 == N3.getOperand(0))
+ return DAG.getNode(ISD::FABS, DL, VT, N0);
+
+ // select (setl[te] X, +/-0.0), fneg(X), X -> fabs
+ if ((CC == ISD::SETLT || CC == ISD::SETLE) &&
+ N0 == N3 && N2.getOpcode() == ISD::FNEG &&
+ N2.getOperand(0) == N3)
+ return DAG.getNode(ISD::FABS, DL, VT, N3);
+ }
+ }
+
+ // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
+ // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
+ // in it. This is a win when the constant is not otherwise available because
+ // it replaces two constant pool loads with one. We only do this if the FP
+ // type is known to be legal, because if it isn't, then we are before legalize
+ // types an we want the other legalization to happen first (e.g. to avoid
+ // messing with soft float) and if the ConstantFP is not legal, because if
+ // it is legal, we may not need to store the FP constant in a constant pool.
+ if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
+ if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
+ if (TLI.isTypeLegal(N2.getValueType()) &&
+ (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) !=
+ TargetLowering::Legal) &&
+          // If both constants have multiple uses, then we won't need to do an
+          // extra load; they are likely around in registers for other users.
+ (TV->hasOneUse() || FV->hasOneUse())) {
+ Constant *Elts[] = {
+ const_cast<ConstantFP*>(FV->getConstantFPValue()),
+ const_cast<ConstantFP*>(TV->getConstantFPValue())
+ };
+ const Type *FPTy = Elts[0]->getType();
+ const TargetData &TD = *TLI.getTargetData();
+
+ // Create a ConstantArray of the two constants.
+ Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts, 2);
+ SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(),
+ TD.getPrefTypeAlignment(FPTy));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+
+ // Get the offsets to the 0 and 1 element of the array so that we can
+ // select between them.
+ SDValue Zero = DAG.getIntPtrConstant(0);
+ unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
+ SDValue One = DAG.getIntPtrConstant(EltSize);
+
+ SDValue Cond = DAG.getSetCC(DL,
+ TLI.getSetCCResultType(N0.getValueType()),
+ N0, N1, CC);
+ SDValue CstOffset = DAG.getNode(ISD::SELECT, DL, Zero.getValueType(),
+ Cond, One, Zero);
+ CPIdx = DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), CPIdx,
+ CstOffset);
+ return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0, false,
+ Alignment);
+
+ }
+ }
+
+ // Check to see if we can perform the "gzip trick", transforming
+ // (select_cc setlt X, 0, A, 0) -> (and (sra X, (sub size(X), 1), A)
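+  //
+  // For example, with i32 X, (select_cc setlt X, 0, 4, 0) becomes
+  // (and (srl X, 29), 4), since bit 2 of the shifted value is X's sign bit.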
+ if (N1C && N3C && N3C->isNullValue() && CC == ISD::SETLT &&
+ N0.getValueType().isInteger() &&
+ N2.getValueType().isInteger() &&
+ (N1C->isNullValue() || // (a < 0) ? b : 0
+ (N1C->getAPIntValue() == 1 && N0 == N2))) { // (a < 1) ? a : 0
+ MVT XType = N0.getValueType();
+ MVT AType = N2.getValueType();
+ if (XType.bitsGE(AType)) {
+ // and (sra X, size(X)-1, A) -> "and (srl X, C2), A" iff A is a
+ // single-bit constant.
+ if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue()-1)) == 0)) {
+ unsigned ShCtV = N2C->getAPIntValue().logBase2();
+ ShCtV = XType.getSizeInBits()-ShCtV-1;
+ SDValue ShCt = DAG.getConstant(ShCtV, getShiftAmountTy());
+ SDValue Shift = DAG.getNode(ISD::SRL, N0.getDebugLoc(),
+ XType, N0, ShCt);
+ AddToWorkList(Shift.getNode());
+
+ if (XType.bitsGT(AType)) {
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+ AddToWorkList(Shift.getNode());
+ }
+
+ return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+ }
+
+ SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(),
+ XType, N0,
+ DAG.getConstant(XType.getSizeInBits()-1,
+ getShiftAmountTy()));
+ AddToWorkList(Shift.getNode());
+
+ if (XType.bitsGT(AType)) {
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+ AddToWorkList(Shift.getNode());
+ }
+
+ return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+ }
+ }
+
+ // fold select C, 16, 0 -> shl C, 4
+ if (N2C && N3C && N3C->isNullValue() && N2C->getAPIntValue().isPowerOf2() &&
+ TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent) {
+
+ // If the caller doesn't want us to simplify this into a zext of a compare,
+ // don't do it.
+ if (NotExtCompare && N2C->getAPIntValue() == 1)
+ return SDValue();
+
+ // Get a SetCC of the condition
+ // FIXME: Should probably make sure that setcc is legal if we ever have a
+ // target where it isn't.
+ SDValue Temp, SCC;
+ // cast from setcc result type to select result type
+ if (LegalTypes) {
+ SCC = DAG.getSetCC(DL, TLI.getSetCCResultType(N0.getValueType()),
+ N0, N1, CC);
+ if (N2.getValueType().bitsLT(SCC.getValueType()))
+ Temp = DAG.getZeroExtendInReg(SCC, N2.getDebugLoc(), N2.getValueType());
+ else
+ Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+ N2.getValueType(), SCC);
+ } else {
+ SCC = DAG.getSetCC(N0.getDebugLoc(), MVT::i1, N0, N1, CC);
+ Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+ N2.getValueType(), SCC);
+ }
+
+ AddToWorkList(SCC.getNode());
+ AddToWorkList(Temp.getNode());
+
+ if (N2C->getAPIntValue() == 1)
+ return Temp;
+
+ // shl setcc result by log2 n2c
+ return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
+ DAG.getConstant(N2C->getAPIntValue().logBase2(),
+ getShiftAmountTy()));
+ }
+
+  // Check to see if this is the equivalent of setcc
+  // FIXME: Turn all of these into setcc if setcc is legal;
+  // otherwise, go ahead with the folds.
+ if (0 && N3C && N3C->isNullValue() && N2C && (N2C->getAPIntValue() == 1ULL)) {
+ MVT XType = N0.getValueType();
+ if (!LegalOperations ||
+ TLI.isOperationLegal(ISD::SETCC, TLI.getSetCCResultType(XType))) {
+ SDValue Res = DAG.getSetCC(DL, TLI.getSetCCResultType(XType), N0, N1, CC);
+ if (Res.getValueType() != VT)
+ Res = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res);
+ return Res;
+ }
+
+ // fold (seteq X, 0) -> (srl (ctlz X, log2(size(X))))
+ if (N1C && N1C->isNullValue() && CC == ISD::SETEQ &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(ISD::CTLZ, XType))) {
+ SDValue Ctlz = DAG.getNode(ISD::CTLZ, N0.getDebugLoc(), XType, N0);
+ return DAG.getNode(ISD::SRL, DL, XType, Ctlz,
+ DAG.getConstant(Log2_32(XType.getSizeInBits()),
+ getShiftAmountTy()));
+ }
+ // fold (setgt X, 0) -> (srl (and (-X, ~X), size(X)-1))
+ if (N1C && N1C->isNullValue() && CC == ISD::SETGT) {
+ SDValue NegN0 = DAG.getNode(ISD::SUB, N0.getDebugLoc(),
+ XType, DAG.getConstant(0, XType), N0);
+ SDValue NotN0 = DAG.getNOT(N0.getDebugLoc(), N0, XType);
+ return DAG.getNode(ISD::SRL, DL, XType,
+ DAG.getNode(ISD::AND, DL, XType, NegN0, NotN0),
+ DAG.getConstant(XType.getSizeInBits()-1,
+ getShiftAmountTy()));
+ }
+ // fold (setgt X, -1) -> (xor (srl (X, size(X)-1), 1))
+ if (N1C && N1C->isAllOnesValue() && CC == ISD::SETGT) {
+ SDValue Sign = DAG.getNode(ISD::SRL, N0.getDebugLoc(), XType, N0,
+ DAG.getConstant(XType.getSizeInBits()-1,
+ getShiftAmountTy()));
+ return DAG.getNode(ISD::XOR, DL, XType, Sign, DAG.getConstant(1, XType));
+ }
+ }
+
+ // Check to see if this is an integer abs. select_cc setl[te] X, 0, -X, X ->
+ // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+ if (N1C && N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE) &&
+ N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1) &&
+ N2.getOperand(0) == N1 && N0.getValueType().isInteger()) {
+ MVT XType = N0.getValueType();
+ SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType, N0,
+ DAG.getConstant(XType.getSizeInBits()-1,
+ getShiftAmountTy()));
+ SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(), XType,
+ N0, Shift);
+ AddToWorkList(Shift.getNode());
+ AddToWorkList(Add.getNode());
+ return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
+ }
+ // Check to see if this is an integer abs. select_cc setgt X, -1, X, -X ->
+ // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+ if (N1C && N1C->isAllOnesValue() && CC == ISD::SETGT &&
+ N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) {
+ if (ConstantSDNode *SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0))) {
+ MVT XType = N0.getValueType();
+ if (SubC->isNullValue() && XType.isInteger()) {
+ SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType,
+ N0,
+ DAG.getConstant(XType.getSizeInBits()-1,
+ getShiftAmountTy()));
+ SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(),
+ XType, N0, Shift);
+ AddToWorkList(Shift.getNode());
+ AddToWorkList(Add.getNode());
+ return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// SimplifySetCC - This is a stub for TargetLowering::SimplifySetCC.
+SDValue DAGCombiner::SimplifySetCC(MVT VT, SDValue N0,
+ SDValue N1, ISD::CondCode Cond,
+ DebugLoc DL, bool foldBooleans) {
+ TargetLowering::DAGCombinerInfo
+ DagCombineInfo(DAG, Level == Unrestricted, false, this);
+ return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+}
+
+/// BuildSDIVSequence - Given an ISD::SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number. See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
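+///
+/// For example, on i32 a divide by 3 becomes, roughly,
+///   Q = mulhs X, 0x55555556;  Q = Q + (Q srl 31)
+/// though the exact sequence is chosen by TLI.BuildSDIV.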
+SDValue DAGCombiner::BuildSDIV(SDNode *N) {
+ std::vector<SDNode*> Built;
+ SDValue S = TLI.BuildSDIV(N, DAG, &Built);
+
+ for (std::vector<SDNode*>::iterator ii = Built.begin(), ee = Built.end();
+ ii != ee; ++ii)
+ AddToWorkList(*ii);
+ return S;
+}
+
+/// BuildUDIVSequence - Given an ISD::UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number. See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
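+///
+/// For example, on i32 an unsigned divide by 3 becomes, roughly,
+///   Q = mulhu X, 0xAAAAAAAB;  Q = Q srl 1
+/// though the exact sequence is chosen by TLI.BuildUDIV.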
+SDValue DAGCombiner::BuildUDIV(SDNode *N) {
+ std::vector<SDNode*> Built;
+ SDValue S = TLI.BuildUDIV(N, DAG, &Built);
+
+ for (std::vector<SDNode*>::iterator ii = Built.begin(), ee = Built.end();
+ ii != ee; ++ii)
+ AddToWorkList(*ii);
+ return S;
+}
+
+/// FindBaseOffset - Return true if base is known not to alias with anything
+/// but itself. Provides base object and offset as results.
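+///
+/// For example, (add (FrameIndex 1), 16) yields Base = FrameIndex 1 and
+/// Offset = 16, and returns true since a frame index aliases only itself.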
+static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset) {
+ // Assume it is a primitive operation.
+ Base = Ptr; Offset = 0;
+
+  // If it's adding a simple constant, then integrate the offset.
+ if (Base.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
+ Base = Base.getOperand(0);
+ Offset += C->getZExtValue();
+ }
+ }
+
+ // If it's any of the following then it can't alias with anything but itself.
+ return isa<FrameIndexSDNode>(Base) ||
+ isa<ConstantPoolSDNode>(Base) ||
+ isa<GlobalAddressSDNode>(Base);
+}
+
+/// isAlias - Return true if there is any possibility that the two addresses
+/// overlap.
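+///
+/// For example, two 4-byte accesses at offsets 0 and 4 of the same base do
+/// not overlap, while 4-byte accesses at offsets 0 and 2 do.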
+bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
+ const Value *SrcValue1, int SrcValueOffset1,
+ SDValue Ptr2, int64_t Size2,
+ const Value *SrcValue2, int SrcValueOffset2) const {
+ // If they are the same then they must be aliases.
+ if (Ptr1 == Ptr2) return true;
+
+ // Gather base node and offset information.
+ SDValue Base1, Base2;
+ int64_t Offset1, Offset2;
+ bool KnownBase1 = FindBaseOffset(Ptr1, Base1, Offset1);
+ bool KnownBase2 = FindBaseOffset(Ptr2, Base2, Offset2);
+
+  // If they have the same base address, then...
+ if (Base1 == Base2)
+ // Check to see if the addresses overlap.
+ return !((Offset1 + Size1) <= Offset2 || (Offset2 + Size2) <= Offset1);
+
+  // If both bases are known distinct non-aliasing objects, they can't alias.
+ if (KnownBase1 && KnownBase2) return false;
+
+ if (CombinerGlobalAA) {
+ // Use alias analysis information.
+ int64_t MinOffset = std::min(SrcValueOffset1, SrcValueOffset2);
+ int64_t Overlap1 = Size1 + SrcValueOffset1 - MinOffset;
+ int64_t Overlap2 = Size2 + SrcValueOffset2 - MinOffset;
+ AliasAnalysis::AliasResult AAResult =
+ AA.alias(SrcValue1, Overlap1, SrcValue2, Overlap2);
+ if (AAResult == AliasAnalysis::NoAlias)
+ return false;
+ }
+
+ // Otherwise we have to assume they alias.
+ return true;
+}
+
+/// FindAliasInfo - Extracts the relevant alias information from the memory
+/// node. Returns true if the operand was a load.
+bool DAGCombiner::FindAliasInfo(SDNode *N,
+ SDValue &Ptr, int64_t &Size,
+ const Value *&SrcValue, int &SrcValueOffset) const {
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ Ptr = LD->getBasePtr();
+ Size = LD->getMemoryVT().getSizeInBits() >> 3;
+ SrcValue = LD->getSrcValue();
+ SrcValueOffset = LD->getSrcValueOffset();
+ return true;
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ Ptr = ST->getBasePtr();
+ Size = ST->getMemoryVT().getSizeInBits() >> 3;
+ SrcValue = ST->getSrcValue();
+ SrcValueOffset = ST->getSrcValueOffset();
+ } else {
+ assert(0 && "FindAliasInfo expected a memory operand");
+ }
+
+ return false;
+}
+
+/// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
+/// looking for aliasing nodes and adding them to the Aliases vector.
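+///
+/// For example, when starting from a load, other loads on the chain are
+/// walked past (two loads never conflict), while any store that may overlap
+/// the loaded address is recorded in Aliases.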
+void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
+ SmallVector<SDValue, 8> &Aliases) {
+ SmallVector<SDValue, 8> Chains; // List of chains to visit.
+ std::set<SDNode *> Visited; // Visited node set.
+
+ // Get alias information for node.
+ SDValue Ptr;
+ int64_t Size = 0;
+ const Value *SrcValue = 0;
+ int SrcValueOffset = 0;
+ bool IsLoad = FindAliasInfo(N, Ptr, Size, SrcValue, SrcValueOffset);
+
+ // Starting off.
+ Chains.push_back(OriginalChain);
+
+ // Look at each chain and determine if it is an alias. If so, add it to the
+ // aliases list. If not, then continue up the chain looking for the next
+ // candidate.
+ while (!Chains.empty()) {
+ SDValue Chain = Chains.back();
+ Chains.pop_back();
+
+    // Don't bother if we've been here before.
+ if (Visited.find(Chain.getNode()) != Visited.end()) continue;
+ Visited.insert(Chain.getNode());
+
+ switch (Chain.getOpcode()) {
+ case ISD::EntryToken:
+ // Entry token is ideal chain operand, but handled in FindBetterChain.
+ break;
+
+ case ISD::LOAD:
+ case ISD::STORE: {
+ // Get alias information for Chain.
+ SDValue OpPtr;
+ int64_t OpSize = 0;
+ const Value *OpSrcValue = 0;
+ int OpSrcValueOffset = 0;
+ bool IsOpLoad = FindAliasInfo(Chain.getNode(), OpPtr, OpSize,
+ OpSrcValue, OpSrcValueOffset);
+
+      // If this chain node aliases, record it and don't look past it.
+ if (!(IsLoad && IsOpLoad) &&
+ isAlias(Ptr, Size, SrcValue, SrcValueOffset,
+ OpPtr, OpSize, OpSrcValue, OpSrcValueOffset)) {
+ Aliases.push_back(Chain);
+ } else {
+ // Look further up the chain.
+ Chains.push_back(Chain.getOperand(0));
+ // Clean up old chain.
+ AddToWorkList(Chain.getNode());
+ }
+ break;
+ }
+
+ case ISD::TokenFactor:
+      // We have to check each of the operands of the token factor, so we queue
+      // them up. Adding the operands to the queue (stack) in reverse order
+      // maintains the original order and increases the likelihood that getNode
+      // will find a matching token factor (CSE).
+ for (unsigned n = Chain.getNumOperands(); n;)
+ Chains.push_back(Chain.getOperand(--n));
+ // Eliminate the token factor if we can.
+ AddToWorkList(Chain.getNode());
+ break;
+
+ default:
+ // For all other instructions we will just have to take what we can get.
+ Aliases.push_back(Chain);
+ break;
+ }
+ }
+}
+
+/// FindBetterChain - Walk up chain skipping non-aliasing memory nodes, looking
+/// for a better chain (aliasing node).
+SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
+ SmallVector<SDValue, 8> Aliases; // Ops for replacing token factor.
+
+ // Accumulate all the aliases to this node.
+ GatherAllAliases(N, OldChain, Aliases);
+
+ if (Aliases.size() == 0) {
+ // If no operands then chain to entry token.
+ return DAG.getEntryNode();
+ } else if (Aliases.size() == 1) {
+ // If a single operand then chain to it. We don't need to revisit it.
+ return Aliases[0];
+ }
+
+ // Construct a custom tailored token factor.
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
+ &Aliases[0], Aliases.size());
+
+ // Make sure the old chain gets cleaned up.
+ if (NewChain != OldChain) AddToWorkList(OldChain.getNode());
+
+ return NewChain;
+}
+
+// SelectionDAG::Combine - This is the entry point for the file.
+//
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
+ CodeGenOpt::Level OptLevel) {
+  // Construct a DAGCombiner and run it over this DAG.
+  DAGCombiner(*this, AA, OptLevel).Run(Level);
+}
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
new file mode 100644
index 0000000..6becff3
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -0,0 +1,1033 @@
+//===-- FastISel.cpp - Implementation of the FastISel class --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the FastISel class.
+//
+// "Fast" instruction selection is designed to emit very poor code quickly.
+// Also, it is not designed to be able to do much lowering, so most illegal
+// types (e.g. i64 on 32-bit targets) and operations are not supported. It is
+// also not intended to be able to do much optimization, except in a few cases
+// where doing optimizations reduces overall compile time. For example, folding
+// constants into immediate fields is often done, because it's cheap and it
+// reduces the number of instructions later phases have to examine.
+//
+// "Fast" instruction selection is able to fail gracefully and transfer
+// control to the SelectionDAG selector for operations that it doesn't
+// support. In many cases, this allows us to avoid duplicating a lot of
+// the complicated lowering logic that SelectionDAG currently has.
+//
+// The intended use for "fast" instruction selection is "-O0" mode
+// compilation, where the quality of the generated code is irrelevant when
+// weighed against the speed at which the code can be generated. Also,
+// at -O0, the LLVM optimizers are not running, and this makes the
+// compile time of codegen a much higher portion of the overall compile
+// time. Despite its limitations, "fast" instruction selection is able to
+// handle enough code on its own to provide noticeable overall speedups
+// in -O0 compiles.
+//
+// Basic operations are supported in a target-independent way, by reading
+// the same instruction descriptions that the SelectionDAG selector reads,
+// and identifying simple arithmetic operations that can be directly selected
+// from simple operators. More complicated operations currently require
+// target-specific code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/DebugLoc.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "SelectionDAGBuild.h"
+using namespace llvm;
+
+unsigned FastISel::getRegForValue(Value *V) {
+ MVT RealVT = TLI.getValueType(V->getType(), /*AllowUnknown=*/true);
+ // Don't handle non-simple values in FastISel.
+ if (!RealVT.isSimple())
+ return 0;
+
+ // Ignore illegal types. We must do this before looking up the value
+ // in ValueMap because Arguments are given virtual registers regardless
+ // of whether FastISel can handle them.
+ MVT::SimpleValueType VT = RealVT.getSimpleVT();
+ if (!TLI.isTypeLegal(VT)) {
+ // Promote MVT::i1 to a legal type though, because it's common and easy.
+ if (VT == MVT::i1)
+ VT = TLI.getTypeToTransformTo(VT).getSimpleVT();
+ else
+ return 0;
+ }
+
+ // Look up the value to see if we already have a register for it. We
+ // cache values defined by Instructions across blocks, and other values
+ // only locally. This is because Instructions already have the SSA
+  // def-dominates-use requirement enforced.
+ if (ValueMap.count(V))
+ return ValueMap[V];
+ unsigned Reg = LocalValueMap[V];
+ if (Reg != 0)
+ return Reg;
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->getValue().getActiveBits() <= 64)
+ Reg = FastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+ } else if (isa<AllocaInst>(V)) {
+ Reg = TargetMaterializeAlloca(cast<AllocaInst>(V));
+ } else if (isa<ConstantPointerNull>(V)) {
+ // Translate this as an integer zero so that it can be
+ // local-CSE'd with actual integer zeros.
+ Reg = getRegForValue(Constant::getNullValue(TD.getIntPtrType()));
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+ Reg = FastEmit_f(VT, VT, ISD::ConstantFP, CF);
+
+ if (!Reg) {
+ const APFloat &Flt = CF->getValueAPF();
+ MVT IntVT = TLI.getPointerTy();
+
+ uint64_t x[2];
+ uint32_t IntBitWidth = IntVT.getSizeInBits();
+ bool isExact;
+ (void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
+ APFloat::rmTowardZero, &isExact);
+ if (isExact) {
+ APInt IntVal(IntBitWidth, 2, x);
+
+ unsigned IntegerReg = getRegForValue(ConstantInt::get(IntVal));
+ if (IntegerReg != 0)
+ Reg = FastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg);
+ }
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ if (!SelectOperator(CE, CE->getOpcode())) return 0;
+ Reg = LocalValueMap[CE];
+ } else if (isa<UndefValue>(V)) {
+ Reg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(MBB, DL, TII.get(TargetInstrInfo::IMPLICIT_DEF), Reg);
+ }
+
+ // If target-independent code couldn't handle the value, give target-specific
+ // code a try.
+ if (!Reg && isa<Constant>(V))
+ Reg = TargetMaterializeConstant(cast<Constant>(V));
+
+ // Don't cache constant materializations in the general ValueMap.
+ // To do so would require tracking what uses they dominate.
+ if (Reg != 0)
+ LocalValueMap[V] = Reg;
+ return Reg;
+}
+
+unsigned FastISel::lookUpRegForValue(Value *V) {
+ // Look up the value to see if we already have a register for it. We
+ // cache values defined by Instructions across blocks, and other values
+ // only locally. This is because Instructions already have the SSA
+  // def-dominates-use requirement enforced.
+ if (ValueMap.count(V))
+ return ValueMap[V];
+ return LocalValueMap[V];
+}
+
+/// UpdateValueMap - Update the value map to include the new mapping for this
+/// instruction, or insert an extra copy to get the result in a previous
+/// determined register.
+/// NOTE: This is only necessary because we might select a block that uses
+/// a value before we select the block that defines the value. It might be
+/// possible to fix this by selecting blocks in reverse postorder.
+unsigned FastISel::UpdateValueMap(Value* I, unsigned Reg) {
+ if (!isa<Instruction>(I)) {
+ LocalValueMap[I] = Reg;
+ return Reg;
+ }
+
+ unsigned &AssignedReg = ValueMap[I];
+ if (AssignedReg == 0)
+ AssignedReg = Reg;
+ else if (Reg != AssignedReg) {
+ const TargetRegisterClass *RegClass = MRI.getRegClass(Reg);
+ TII.copyRegToReg(*MBB, MBB->end(), AssignedReg,
+ Reg, RegClass, RegClass);
+ }
+ return AssignedReg;
+}
+
+unsigned FastISel::getRegForGEPIndex(Value *Idx) {
+ unsigned IdxN = getRegForValue(Idx);
+ if (IdxN == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return 0;
+
+ // If the index is smaller or larger than intptr_t, truncate or extend it.
+ MVT PtrVT = TLI.getPointerTy();
+ MVT IdxVT = MVT::getMVT(Idx->getType(), /*HandleUnknown=*/false);
+ if (IdxVT.bitsLT(PtrVT))
+ IdxN = FastEmit_r(IdxVT.getSimpleVT(), PtrVT.getSimpleVT(),
+ ISD::SIGN_EXTEND, IdxN);
+ else if (IdxVT.bitsGT(PtrVT))
+ IdxN = FastEmit_r(IdxVT.getSimpleVT(), PtrVT.getSimpleVT(),
+ ISD::TRUNCATE, IdxN);
+ return IdxN;
+}
+
+/// SelectBinaryOp - Select and emit code for a binary operator instruction,
+/// which has an opcode which directly corresponds to the given ISD opcode.
+///
+bool FastISel::SelectBinaryOp(User *I, ISD::NodeType ISDOpcode) {
+ MVT VT = MVT::getMVT(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !VT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ if (!TLI.isTypeLegal(VT)) {
+ // MVT::i1 is special. Allow AND, OR, or XOR because they
+ // don't require additional zeroing, which makes them easy.
+ if (VT == MVT::i1 &&
+ (ISDOpcode == ISD::AND || ISDOpcode == ISD::OR ||
+ ISDOpcode == ISD::XOR))
+ VT = TLI.getTypeToTransformTo(VT);
+ else
+ return false;
+ }
+
+ unsigned Op0 = getRegForValue(I->getOperand(0));
+ if (Op0 == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // Check if the second operand is a constant and handle it appropriately.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ unsigned ResultReg = FastEmit_ri(VT.getSimpleVT(), VT.getSimpleVT(),
+ ISDOpcode, Op0, CI->getZExtValue());
+ if (ResultReg != 0) {
+ // We successfully emitted code for the given LLVM Instruction.
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ // Check if the second operand is a constant float.
+ if (ConstantFP *CF = dyn_cast<ConstantFP>(I->getOperand(1))) {
+ unsigned ResultReg = FastEmit_rf(VT.getSimpleVT(), VT.getSimpleVT(),
+ ISDOpcode, Op0, CF);
+ if (ResultReg != 0) {
+ // We successfully emitted code for the given LLVM Instruction.
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ unsigned Op1 = getRegForValue(I->getOperand(1));
+ if (Op1 == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // Now we have both operands in registers. Emit the instruction.
+ unsigned ResultReg = FastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(),
+ ISDOpcode, Op0, Op1);
+ if (ResultReg == 0)
+ // Target-specific code wasn't able to find a machine opcode for
+ // the given ISD opcode and type. Halt "fast" selection and bail.
+ return false;
+
+ // We successfully emitted code for the given LLVM Instruction.
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool FastISel::SelectGetElementPtr(User *I) {
+ unsigned N = getRegForValue(I->getOperand(0));
+ if (N == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ const Type *Ty = I->getOperand(0)->getType();
+ MVT::SimpleValueType VT = TLI.getPointerTy().getSimpleVT();
+ for (GetElementPtrInst::op_iterator OI = I->op_begin()+1, E = I->op_end();
+ OI != E; ++OI) {
+ Value *Idx = *OI;
+ if (const StructType *StTy = dyn_cast<StructType>(Ty)) {
+ unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+ if (Field) {
+ // N = N + Offset
+ uint64_t Offs = TD.getStructLayout(StTy)->getElementOffset(Field);
+ // FIXME: This can be optimized by combining the add with a
+ // subsequent one.
+ N = FastEmit_ri_(VT, ISD::ADD, N, Offs, VT);
+ if (N == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ }
+ Ty = StTy->getElementType(Field);
+ } else {
+ Ty = cast<SequentialType>(Ty)->getElementType();
+
+ // If this is a constant subscript, handle it quickly.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
+ if (CI->getZExtValue() == 0) continue;
+ uint64_t Offs =
+          TD.getTypeAllocSize(Ty)*CI->getSExtValue();
+ N = FastEmit_ri_(VT, ISD::ADD, N, Offs, VT);
+ if (N == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ continue;
+ }
+
+ // N = N + Idx * ElementSize;
+ uint64_t ElementSize = TD.getTypeAllocSize(Ty);
+ unsigned IdxN = getRegForGEPIndex(Idx);
+ if (IdxN == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ if (ElementSize != 1) {
+ IdxN = FastEmit_ri_(VT, ISD::MUL, IdxN, ElementSize, VT);
+ if (IdxN == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ }
+ N = FastEmit_rr(VT, VT, ISD::ADD, N, IdxN);
+ if (N == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ }
+ }
+
+ // We successfully emitted code for the given LLVM Instruction.
+ UpdateValueMap(I, N);
+ return true;
+}
+
+bool FastISel::SelectCall(User *I) {
+ Function *F = cast<CallInst>(I)->getCalledFunction();
+ if (!F) return false;
+
+ unsigned IID = F->getIntrinsicID();
+ switch (IID) {
+ default: break;
+ case Intrinsic::dbg_stoppoint: {
+ DbgStopPointInst *SPI = cast<DbgStopPointInst>(I);
+ if (DIDescriptor::ValidDebugInfo(SPI->getContext(), CodeGenOpt::None)) {
+ DICompileUnit CU(cast<GlobalVariable>(SPI->getContext()));
+ unsigned Line = SPI->getLine();
+ unsigned Col = SPI->getColumn();
+ unsigned Idx = MF.getOrCreateDebugLocID(CU.getGV(), Line, Col);
+ setCurDebugLoc(DebugLoc::get(Idx));
+ }
+ return true;
+ }
+ case Intrinsic::dbg_region_start: {
+ DbgRegionStartInst *RSI = cast<DbgRegionStartInst>(I);
+ if (DIDescriptor::ValidDebugInfo(RSI->getContext(), CodeGenOpt::None) &&
+ DW && DW->ShouldEmitDwarfDebug()) {
+ unsigned ID =
+ DW->RecordRegionStart(cast<GlobalVariable>(RSI->getContext()));
+ const TargetInstrDesc &II = TII.get(TargetInstrInfo::DBG_LABEL);
+ BuildMI(MBB, DL, II).addImm(ID);
+ }
+ return true;
+ }
+ case Intrinsic::dbg_region_end: {
+ DbgRegionEndInst *REI = cast<DbgRegionEndInst>(I);
+ if (DIDescriptor::ValidDebugInfo(REI->getContext(), CodeGenOpt::None) &&
+ DW && DW->ShouldEmitDwarfDebug()) {
+ unsigned ID = 0;
+ DISubprogram Subprogram(cast<GlobalVariable>(REI->getContext()));
+ if (!Subprogram.isNull() && !Subprogram.describes(MF.getFunction())) {
+ // This is end of an inlined function.
+ const TargetInstrDesc &II = TII.get(TargetInstrInfo::DBG_LABEL);
+ ID = DW->RecordInlinedFnEnd(Subprogram);
+        // Returned ID is 0 if this is an unbalanced "end of inlined scope".
+        // This could happen if the optimizer eats dbg intrinsics or if the
+        // "beginning of inlined scope" was not recognized due to missing
+        // location info. In such cases, ignore this region.end.
+        if (ID)
+          BuildMI(MBB, DL, II).addImm(ID);
+ } else {
+ const TargetInstrDesc &II = TII.get(TargetInstrInfo::DBG_LABEL);
+ ID = DW->RecordRegionEnd(cast<GlobalVariable>(REI->getContext()));
+ BuildMI(MBB, DL, II).addImm(ID);
+ }
+ }
+ return true;
+ }
+ case Intrinsic::dbg_func_start: {
+ DbgFuncStartInst *FSI = cast<DbgFuncStartInst>(I);
+ Value *SP = FSI->getSubprogram();
+ if (!DIDescriptor::ValidDebugInfo(SP, CodeGenOpt::None))
+ return true;
+
+ // llvm.dbg.func.start implicitly defines a dbg_stoppoint which is what
+ // (most?) gdb expects.
+ DebugLoc PrevLoc = DL;
+ DISubprogram Subprogram(cast<GlobalVariable>(SP));
+ DICompileUnit CompileUnit = Subprogram.getCompileUnit();
+
+ if (!Subprogram.describes(MF.getFunction())) {
+ // This is a beginning of an inlined function.
+
+ // If llvm.dbg.func.start is seen in a new block before any
+ // llvm.dbg.stoppoint intrinsic then the location info is unknown.
+      // FIXME: Why is DebugLoc reset at the beginning of each block?
+ if (PrevLoc.isUnknown())
+ return true;
+ // Record the source line.
+ unsigned Line = Subprogram.getLineNumber();
+ setCurDebugLoc(DebugLoc::get(MF.getOrCreateDebugLocID(
+ CompileUnit.getGV(), Line, 0)));
+
+ if (DW && DW->ShouldEmitDwarfDebug()) {
+ DebugLocTuple PrevLocTpl = MF.getDebugLocTuple(PrevLoc);
+ unsigned LabelID = DW->RecordInlinedFnStart(Subprogram,
+ DICompileUnit(PrevLocTpl.CompileUnit),
+ PrevLocTpl.Line,
+ PrevLocTpl.Col);
+ const TargetInstrDesc &II = TII.get(TargetInstrInfo::DBG_LABEL);
+ BuildMI(MBB, DL, II).addImm(LabelID);
+ }
+ } else {
+ // Record the source line.
+ unsigned Line = Subprogram.getLineNumber();
+ MF.setDefaultDebugLoc(DebugLoc::get(MF.getOrCreateDebugLocID(
+ CompileUnit.getGV(), Line, 0)));
+ if (DW && DW->ShouldEmitDwarfDebug()) {
+ // llvm.dbg.func_start also defines beginning of function scope.
+ DW->RecordRegionStart(cast<GlobalVariable>(FSI->getSubprogram()));
+ }
+ }
+
+ return true;
+ }
+ case Intrinsic::dbg_declare: {
+ DbgDeclareInst *DI = cast<DbgDeclareInst>(I);
+ Value *Variable = DI->getVariable();
+ if (DIDescriptor::ValidDebugInfo(Variable, CodeGenOpt::None) &&
+ DW && DW->ShouldEmitDwarfDebug()) {
+ // Determine the address of the declared object.
+ Value *Address = DI->getAddress();
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
+ Address = BCI->getOperand(0);
+ AllocaInst *AI = dyn_cast<AllocaInst>(Address);
+ // Don't handle byval struct arguments or VLAs, for example.
+ if (!AI) break;
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ StaticAllocaMap.find(AI);
+ if (SI == StaticAllocaMap.end()) break; // VLAs.
+ int FI = SI->second;
+
+      // Determine the debug global variable.
+ GlobalValue *GV = cast<GlobalVariable>(Variable);
+
+ // Build the DECLARE instruction.
+ const TargetInstrDesc &II = TII.get(TargetInstrInfo::DECLARE);
+ MachineInstr *DeclareMI
+ = BuildMI(MBB, DL, II).addFrameIndex(FI).addGlobalAddress(GV);
+ DIVariable DV(cast<GlobalVariable>(GV));
+ if (!DV.isNull()) {
+ // This is a local variable
+ DW->RecordVariableScope(DV, DeclareMI);
+ }
+ }
+ return true;
+ }
+ case Intrinsic::eh_exception: {
+ MVT VT = TLI.getValueType(I->getType());
+ switch (TLI.getOperationAction(ISD::EXCEPTIONADDR, VT)) {
+ default: break;
+ case TargetLowering::Expand: {
+ assert(MBB->isLandingPad() && "Call to eh.exception not in landing pad!");
+ unsigned Reg = TLI.getExceptionAddressRegister();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+ unsigned ResultReg = createResultReg(RC);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ Reg, RC, RC);
+ assert(InsertedCopy && "Can't copy address registers!");
+      (void)InsertedCopy;  // Silence unused-variable warnings.
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+ break;
+ }
+ case Intrinsic::eh_selector_i32:
+ case Intrinsic::eh_selector_i64: {
+ MVT VT = TLI.getValueType(I->getType());
+ switch (TLI.getOperationAction(ISD::EHSELECTION, VT)) {
+ default: break;
+ case TargetLowering::Expand: {
+ MVT VT = (IID == Intrinsic::eh_selector_i32 ?
+ MVT::i32 : MVT::i64);
+
+ if (MMI) {
+ if (MBB->isLandingPad())
+ AddCatchInfo(*cast<CallInst>(I), MMI, MBB);
+ else {
+#ifndef NDEBUG
+ CatchInfoLost.insert(cast<CallInst>(I));
+#endif
+ // FIXME: Mark exception selector register as live in. Hack for PR1508.
+ unsigned Reg = TLI.getExceptionSelectorRegister();
+ if (Reg) MBB->addLiveIn(Reg);
+ }
+
+ unsigned Reg = TLI.getExceptionSelectorRegister();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+ unsigned ResultReg = createResultReg(RC);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ Reg, RC, RC);
+ assert(InsertedCopy && "Can't copy address registers!");
+        (void)InsertedCopy;  // Silence unused-variable warnings.
+ UpdateValueMap(I, ResultReg);
+ } else {
+ unsigned ResultReg =
+ getRegForValue(Constant::getNullValue(I->getType()));
+ UpdateValueMap(I, ResultReg);
+ }
+ return true;
+ }
+ }
+ break;
+ }
+ }
+ return false;
+}
+
+bool FastISel::SelectCast(User *I, ISD::NodeType Opcode) {
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ MVT DstVT = TLI.getValueType(I->getType());
+
+ if (SrcVT == MVT::Other || !SrcVT.isSimple() ||
+ DstVT == MVT::Other || !DstVT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // Check if the destination type is legal. Or as a special case,
+ // it may be i1 if we're doing a truncate because that's
+ // easy and somewhat common.
+ if (!TLI.isTypeLegal(DstVT))
+ if (DstVT != MVT::i1 || Opcode != ISD::TRUNCATE)
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // Check if the source operand is legal. Or as a special case,
+ // it may be i1 if we're doing zero-extension because that's
+ // easy and somewhat common.
+ if (!TLI.isTypeLegal(SrcVT))
+ if (SrcVT != MVT::i1 || Opcode != ISD::ZERO_EXTEND)
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ unsigned InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // If the operand is i1, arrange for the high bits in the register to be zero.
+ if (SrcVT == MVT::i1) {
+ SrcVT = TLI.getTypeToTransformTo(SrcVT);
+ InputReg = FastEmitZExtFromI1(SrcVT.getSimpleVT(), InputReg);
+ if (!InputReg)
+ return false;
+ }
+ // If the result is i1, truncate to the target's type for i1 first.
+ if (DstVT == MVT::i1)
+ DstVT = TLI.getTypeToTransformTo(DstVT);
+
+ unsigned ResultReg = FastEmit_r(SrcVT.getSimpleVT(),
+ DstVT.getSimpleVT(),
+ Opcode,
+ InputReg);
+ if (!ResultReg)
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool FastISel::SelectBitCast(User *I) {
+ // If the bitcast doesn't change the type, just use the operand value.
+ if (I->getType() == I->getOperand(0)->getType()) {
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0)
+ return false;
+ UpdateValueMap(I, Reg);
+ return true;
+ }
+
+ // Bitcasts of other values become reg-reg copies or BIT_CONVERT operators.
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ MVT DstVT = TLI.getValueType(I->getType());
+
+ if (SrcVT == MVT::Other || !SrcVT.isSimple() ||
+ DstVT == MVT::Other || !DstVT.isSimple() ||
+ !TLI.isTypeLegal(SrcVT) || !TLI.isTypeLegal(DstVT))
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ unsigned Op0 = getRegForValue(I->getOperand(0));
+ if (Op0 == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // First, try to perform the bitcast by inserting a reg-reg copy.
+ unsigned ResultReg = 0;
+ if (SrcVT.getSimpleVT() == DstVT.getSimpleVT()) {
+ TargetRegisterClass* SrcClass = TLI.getRegClassFor(SrcVT);
+ TargetRegisterClass* DstClass = TLI.getRegClassFor(DstVT);
+ ResultReg = createResultReg(DstClass);
+
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ Op0, DstClass, SrcClass);
+ if (!InsertedCopy)
+ ResultReg = 0;
+ }
+
+ // If the reg-reg copy failed, select a BIT_CONVERT opcode.
+ if (!ResultReg)
+ ResultReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(),
+ ISD::BIT_CONVERT, Op0);
+
+ if (!ResultReg)
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool
+FastISel::SelectInstruction(Instruction *I) {
+ return SelectOperator(I, I->getOpcode());
+}
+
+/// FastEmitBranch - Emit an unconditional branch to the given block,
+/// unless it is the immediate (fall-through) successor, and update
+/// the CFG.
+void
+FastISel::FastEmitBranch(MachineBasicBlock *MSucc) {
+ MachineFunction::iterator NextMBB =
+ next(MachineFunction::iterator(MBB));
+
+ if (MBB->isLayoutSuccessor(MSucc)) {
+ // The unconditional fall-through case, which needs no instructions.
+ } else {
+ // The unconditional branch case.
+ TII.InsertBranch(*MBB, MSucc, NULL, SmallVector<MachineOperand, 0>());
+ }
+ MBB->addSuccessor(MSucc);
+}
+
+bool
+FastISel::SelectOperator(User *I, unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Add: {
+ ISD::NodeType Opc = I->getType()->isFPOrFPVector() ? ISD::FADD : ISD::ADD;
+ return SelectBinaryOp(I, Opc);
+ }
+ case Instruction::Sub: {
+ ISD::NodeType Opc = I->getType()->isFPOrFPVector() ? ISD::FSUB : ISD::SUB;
+ return SelectBinaryOp(I, Opc);
+ }
+ case Instruction::Mul: {
+ ISD::NodeType Opc = I->getType()->isFPOrFPVector() ? ISD::FMUL : ISD::MUL;
+ return SelectBinaryOp(I, Opc);
+ }
+ case Instruction::SDiv:
+ return SelectBinaryOp(I, ISD::SDIV);
+ case Instruction::UDiv:
+ return SelectBinaryOp(I, ISD::UDIV);
+ case Instruction::FDiv:
+ return SelectBinaryOp(I, ISD::FDIV);
+ case Instruction::SRem:
+ return SelectBinaryOp(I, ISD::SREM);
+ case Instruction::URem:
+ return SelectBinaryOp(I, ISD::UREM);
+ case Instruction::FRem:
+ return SelectBinaryOp(I, ISD::FREM);
+ case Instruction::Shl:
+ return SelectBinaryOp(I, ISD::SHL);
+ case Instruction::LShr:
+ return SelectBinaryOp(I, ISD::SRL);
+ case Instruction::AShr:
+ return SelectBinaryOp(I, ISD::SRA);
+ case Instruction::And:
+ return SelectBinaryOp(I, ISD::AND);
+ case Instruction::Or:
+ return SelectBinaryOp(I, ISD::OR);
+ case Instruction::Xor:
+ return SelectBinaryOp(I, ISD::XOR);
+
+ case Instruction::GetElementPtr:
+ return SelectGetElementPtr(I);
+
+ case Instruction::Br: {
+ BranchInst *BI = cast<BranchInst>(I);
+
+ if (BI->isUnconditional()) {
+ BasicBlock *LLVMSucc = BI->getSuccessor(0);
+ MachineBasicBlock *MSucc = MBBMap[LLVMSucc];
+ FastEmitBranch(MSucc);
+ return true;
+ }
+
+    // Conditional branches are not handled yet.
+ // Halt "fast" selection and bail.
+ return false;
+ }
+
+ case Instruction::Unreachable:
+ // Nothing to emit.
+ return true;
+
+ case Instruction::PHI:
+ // PHI nodes are already emitted.
+ return true;
+
+ case Instruction::Alloca:
+ // FunctionLowering has the static-sized case covered.
+ if (StaticAllocaMap.count(cast<AllocaInst>(I)))
+ return true;
+
+ // Dynamic-sized alloca is not handled yet.
+ return false;
+
+ case Instruction::Call:
+ return SelectCall(I);
+
+ case Instruction::BitCast:
+ return SelectBitCast(I);
+
+ case Instruction::FPToSI:
+ return SelectCast(I, ISD::FP_TO_SINT);
+ case Instruction::ZExt:
+ return SelectCast(I, ISD::ZERO_EXTEND);
+ case Instruction::SExt:
+ return SelectCast(I, ISD::SIGN_EXTEND);
+ case Instruction::Trunc:
+ return SelectCast(I, ISD::TRUNCATE);
+ case Instruction::SIToFP:
+ return SelectCast(I, ISD::SINT_TO_FP);
+
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ MVT DstVT = TLI.getValueType(I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return SelectCast(I, ISD::ZERO_EXTEND);
+ if (DstVT.bitsLT(SrcVT))
+ return SelectCast(I, ISD::TRUNCATE);
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0) return false;
+ UpdateValueMap(I, Reg);
+ return true;
+ }
+
+ default:
+ // Unhandled instruction. Halt "fast" selection and bail.
+ return false;
+ }
+}
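+
+// A minimal illustrative sketch of the IntToPtr/PtrToInt rule above: the
+// three cases reduce to zext, trunc, or a plain copy of the bits. Shown on
+// the host assuming 64-bit pointers and 32-bit ints; intToPtr64 and
+// ptrToInt32 are hypothetical names, not part of this file.
+static unsigned long long intToPtr64(unsigned V) {
+ return V; // DstVT.bitsGT(SrcVT): ZERO_EXTEND
+}
+static unsigned ptrToInt32(unsigned long long P) {
+ return (unsigned)P; // DstVT.bitsLT(SrcVT): TRUNCATE
+}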
+
+FastISel::FastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi,
+ DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &cil
+#endif
+ )
+ : MBB(0),
+ ValueMap(vm),
+ MBBMap(bm),
+ StaticAllocaMap(am),
+#ifndef NDEBUG
+ CatchInfoLost(cil),
+#endif
+ MF(mf),
+ MMI(mmi),
+ DW(dw),
+ MRI(MF.getRegInfo()),
+ MFI(*MF.getFrameInfo()),
+ MCP(*MF.getConstantPool()),
+ TM(MF.getTarget()),
+ TD(*TM.getTargetData()),
+ TII(*TM.getInstrInfo()),
+ TLI(*TM.getTargetLowering()) {
+}
+
+FastISel::~FastISel() {}
+
+unsigned FastISel::FastEmit_(MVT::SimpleValueType, MVT::SimpleValueType,
+ ISD::NodeType) {
+ return 0;
+}
+
+unsigned FastISel::FastEmit_r(MVT::SimpleValueType, MVT::SimpleValueType,
+ ISD::NodeType, unsigned /*Op0*/) {
+ return 0;
+}
+
+unsigned FastISel::FastEmit_rr(MVT::SimpleValueType, MVT::SimpleValueType,
+ ISD::NodeType, unsigned /*Op0*/,
+ unsigned /*Op1*/) {
+ return 0;
+}
+
+unsigned FastISel::FastEmit_i(MVT::SimpleValueType, MVT::SimpleValueType,
+ ISD::NodeType, uint64_t /*Imm*/) {
+ return 0;
+}
+
+unsigned FastISel::FastEmit_f(MVT::SimpleValueType, MVT::SimpleValueType,
+ ISD::NodeType, ConstantFP * /*FPImm*/) {
+ return 0;
+}
+
+unsigned FastISel::FastEmit_ri(MVT::SimpleValueType, MVT::SimpleValueType,
+ ISD::NodeType, unsigned /*Op0*/,
+ uint64_t /*Imm*/) {
+ return 0;
+}
+
+unsigned FastISel::FastEmit_rf(MVT::SimpleValueType, MVT::SimpleValueType,
+ ISD::NodeType, unsigned /*Op0*/,
+ ConstantFP * /*FPImm*/) {
+ return 0;
+}
+
+unsigned FastISel::FastEmit_rri(MVT::SimpleValueType, MVT::SimpleValueType,
+ ISD::NodeType,
+ unsigned /*Op0*/, unsigned /*Op1*/,
+ uint64_t /*Imm*/) {
+ return 0;
+}
+
+/// FastEmit_ri_ - This method is a wrapper of FastEmit_ri. It first tries
+/// to emit an instruction with an immediate operand using FastEmit_ri.
+/// If that fails, it materializes the immediate into a register and tries
+/// FastEmit_rr instead.
+unsigned FastISel::FastEmit_ri_(MVT::SimpleValueType VT, ISD::NodeType Opcode,
+ unsigned Op0, uint64_t Imm,
+ MVT::SimpleValueType ImmType) {
+ // First check if immediate type is legal. If not, we can't use the ri form.
+ unsigned ResultReg = FastEmit_ri(VT, VT, Opcode, Op0, Imm);
+ if (ResultReg != 0)
+ return ResultReg;
+ unsigned MaterialReg = FastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
+ if (MaterialReg == 0)
+ return 0;
+ return FastEmit_rr(VT, VT, Opcode, Op0, MaterialReg);
+}
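+
+// A minimal illustrative sketch of the try-immediate-then-materialize
+// fallback above, with hypothetical emitRI/emitI/emitRR stubs standing in
+// for the target's FastEmit hooks; none of these names are part of this
+// file.
+static unsigned emitRI(unsigned, unsigned long long) { return 0; } // stub
+static unsigned emitI(unsigned long long) { return 0; } // stub
+static unsigned emitRR(unsigned, unsigned) { return 0; } // stub
+static unsigned emitRIWithFallback(unsigned Op0, unsigned long long Imm) {
+ if (unsigned R = emitRI(Op0, Imm))
+ return R; // the ri form was legal
+ unsigned M = emitI(Imm); // materialize Imm into a register
+ return M ? emitRR(Op0, M) : 0; // then fall back to the rr form
+}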
+
+/// FastEmit_rf_ - This method is a wrapper of FastEmit_rf. It first tries
+/// to emit an instruction with a floating-point immediate operand using
+/// FastEmit_rf. If that fails, it materializes the immediate into a register
+/// and tries FastEmit_rr instead.
+unsigned FastISel::FastEmit_rf_(MVT::SimpleValueType VT, ISD::NodeType Opcode,
+ unsigned Op0, ConstantFP *FPImm,
+ MVT::SimpleValueType ImmType) {
+ // First check if immediate type is legal. If not, we can't use the rf form.
+ unsigned ResultReg = FastEmit_rf(VT, VT, Opcode, Op0, FPImm);
+ if (ResultReg != 0)
+ return ResultReg;
+
+ // Materialize the constant in a register.
+ unsigned MaterialReg = FastEmit_f(ImmType, ImmType, ISD::ConstantFP, FPImm);
+ if (MaterialReg == 0) {
+ // If the target doesn't have a way to directly enter a floating-point
+ // value into a register, use an alternate approach.
+ // TODO: The current approach only supports floating-point constants
+ // that can be constructed by conversion from integer values. This should
+ // be replaced by code that creates a load from a constant-pool entry,
+ // which will require some target-specific work.
+ const APFloat &Flt = FPImm->getValueAPF();
+ MVT IntVT = TLI.getPointerTy();
+
+ uint64_t x[2];
+ uint32_t IntBitWidth = IntVT.getSizeInBits();
+ bool isExact;
+ (void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
+ APFloat::rmTowardZero, &isExact);
+ if (!isExact)
+ return 0;
+ APInt IntVal(IntBitWidth, 2, x);
+
+ unsigned IntegerReg = FastEmit_i(IntVT.getSimpleVT(), IntVT.getSimpleVT(),
+ ISD::Constant, IntVal.getZExtValue());
+ if (IntegerReg == 0)
+ return 0;
+ MaterialReg = FastEmit_r(IntVT.getSimpleVT(), VT,
+ ISD::SINT_TO_FP, IntegerReg);
+ if (MaterialReg == 0)
+ return 0;
+ }
+ return FastEmit_rr(VT, VT, Opcode, Op0, MaterialReg);
+}
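+
+// A minimal illustrative sketch of the exactness test above, on host
+// doubles; rebuildableViaSIntToFP is a hypothetical helper, and the value
+// is assumed to be in range of long long. A constant is rebuilt via
+// SINT_TO_FP only when truncating toward zero loses nothing.
+static bool rebuildableViaSIntToFP(double FPImm, long long &IntVal) {
+ IntVal = (long long)FPImm; // truncates toward zero, like rmTowardZero
+ return (double)IntVal == FPImm; // exact iff no bits were lost
+}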
+
+unsigned FastISel::createResultReg(const TargetRegisterClass* RC) {
+ return MRI.createVirtualRegister(RC);
+}
+
+unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode,
+ const TargetRegisterClass* RC) {
+ unsigned ResultReg = createResultReg(RC);
+ const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+ BuildMI(MBB, DL, II, ResultReg);
+ return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0) {
+ unsigned ResultReg = createResultReg(RC);
+ const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(MBB, DL, II, ResultReg).addReg(Op0);
+ else {
+ BuildMI(MBB, DL, II).addReg(Op0);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ II.ImplicitDefs[0], RC, RC);
+ if (!InsertedCopy)
+ ResultReg = 0;
+ }
+
+ return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, unsigned Op1) {
+ unsigned ResultReg = createResultReg(RC);
+ const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(MBB, DL, II, ResultReg).addReg(Op0).addReg(Op1);
+ else {
+ BuildMI(MBB, DL, II).addReg(Op0).addReg(Op1);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ II.ImplicitDefs[0], RC, RC);
+ if (!InsertedCopy)
+ ResultReg = 0;
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, uint64_t Imm) {
+ unsigned ResultReg = createResultReg(RC);
+ const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(MBB, DL, II, ResultReg).addReg(Op0).addImm(Imm);
+ else {
+ BuildMI(MBB, DL, II).addReg(Op0).addImm(Imm);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ II.ImplicitDefs[0], RC, RC);
+ if (!InsertedCopy)
+ ResultReg = 0;
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rf(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, ConstantFP *FPImm) {
+ unsigned ResultReg = createResultReg(RC);
+ const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(MBB, DL, II, ResultReg).addReg(Op0).addFPImm(FPImm);
+ else {
+ BuildMI(MBB, DL, II).addReg(Op0).addFPImm(FPImm);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ II.ImplicitDefs[0], RC, RC);
+ if (!InsertedCopy)
+ ResultReg = 0;
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, unsigned Op1, uint64_t Imm) {
+ unsigned ResultReg = createResultReg(RC);
+ const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(MBB, DL, II, ResultReg).addReg(Op0).addReg(Op1).addImm(Imm);
+ else {
+ BuildMI(MBB, DL, II).addReg(Op0).addReg(Op1).addImm(Imm);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ II.ImplicitDefs[0], RC, RC);
+ if (!InsertedCopy)
+ ResultReg = 0;
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ uint64_t Imm) {
+ unsigned ResultReg = createResultReg(RC);
+ const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(MBB, DL, II, ResultReg).addImm(Imm);
+ else {
+ BuildMI(MBB, DL, II).addImm(Imm);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ II.ImplicitDefs[0], RC, RC);
+ if (!InsertedCopy)
+ ResultReg = 0;
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_extractsubreg(MVT::SimpleValueType RetVT,
+ unsigned Op0, uint32_t Idx) {
+ const TargetRegisterClass* RC = MRI.getRegClass(Op0);
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+ const TargetInstrDesc &II = TII.get(TargetInstrInfo::EXTRACT_SUBREG);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(MBB, DL, II, ResultReg).addReg(Op0).addImm(Idx);
+ else {
+ BuildMI(MBB, DL, II).addReg(Op0).addImm(Idx);
+ bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ II.ImplicitDefs[0], RC, RC);
+ if (!InsertedCopy)
+ ResultReg = 0;
+ }
+ return ResultReg;
+}
+
+/// FastEmitZExtFromI1 - Emit MachineInstrs to compute the value of Op
+/// with all but the least significant bit set to zero.
+unsigned FastISel::FastEmitZExtFromI1(MVT::SimpleValueType VT, unsigned Op) {
+ return FastEmit_ri(VT, VT, ISD::AND, Op, 1);
+}
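+
+// A minimal illustrative sketch of what the emitted AND computes, on a host
+// integer; zextFromI1 is a hypothetical name, not part of this file.
+static unsigned zextFromI1(unsigned Op) {
+ return Op & 1; // clear all but the least significant bit
+}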
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
new file mode 100644
index 0000000..2cd67e6
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -0,0 +1,3091 @@
+//===-- LegalizeDAG.cpp - Implement SelectionDAG::Legalize ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::Legalize method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include <map>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+/// SelectionDAGLegalize - This takes an arbitrary SelectionDAG as input and
+/// hacks on it until the target machine can handle it. This involves
+/// eliminating value sizes the machine cannot handle (promoting small sizes to
+/// large sizes or splitting up large values into small values) as well as
+/// eliminating operations the machine cannot handle.
+///
+/// This code also does a small amount of optimization and recognition of idioms
+/// as part of its processing. For example, if a target does not support a
+/// 'setcc' instruction efficiently, but does support the 'brcc' instruction,
+/// this will attempt to merge the setcc and branch instructions into brcc's.
+///
+namespace {
+class VISIBILITY_HIDDEN SelectionDAGLegalize {
+ TargetLowering &TLI;
+ SelectionDAG &DAG;
+ CodeGenOpt::Level OptLevel;
+
+ // Libcall insertion helpers.
+
+ /// LastCALLSEQ_END - This keeps track of the CALLSEQ_END node that has been
+ /// legalized. We use this to ensure that calls are properly serialized
+ /// against each other, including inserted libcalls.
+ SDValue LastCALLSEQ_END;
+
+ /// IsLegalizingCall - This member is used *only* for purposes of providing
+ /// helpful assertions that a libcall isn't created while another call is
+ /// being legalized (which could lead to non-serialized call sequences).
+ bool IsLegalizingCall;
+
+ enum LegalizeAction {
+ Legal, // The target natively supports this operation.
+ Promote, // This operation should be executed in a larger type.
+ Expand // Try to expand this to other ops, otherwise use a libcall.
+ };
+
+ /// ValueTypeActions - This is a bitvector that contains two bits for each
+ /// value type, where the two bits correspond to the LegalizeAction enum.
+ /// This can be queried with "getTypeAction(VT)".
+ TargetLowering::ValueTypeActionImpl ValueTypeActions;
+
+ /// LegalizedNodes - For nodes that are of legal width, and that have more
+ /// than one use, this map indicates what legalized operand to use. This
+ /// allows us to avoid legalizing the same thing more than once.
+ DenseMap<SDValue, SDValue> LegalizedNodes;
+
+ void AddLegalizedOperand(SDValue From, SDValue To) {
+ LegalizedNodes.insert(std::make_pair(From, To));
+ // If someone requests legalization of the new node, return it unchanged.
+ if (From != To)
+ LegalizedNodes.insert(std::make_pair(To, To));
+ }
+
+public:
+ SelectionDAGLegalize(SelectionDAG &DAG, CodeGenOpt::Level ol);
+
+ /// getTypeAction - Return how we should legalize values of this type: either
+ /// it is already legal, or we need to expand it into multiple registers of
+ /// smaller integer type, or we need to promote it to a larger type.
+ LegalizeAction getTypeAction(MVT VT) const {
+ return (LegalizeAction)ValueTypeActions.getTypeAction(VT);
+ }
+
+ /// isTypeLegal - Return true if this type is legal on this target.
+ ///
+ bool isTypeLegal(MVT VT) const {
+ return getTypeAction(VT) == Legal;
+ }
+
+ void LegalizeDAG();
+
+private:
+ /// LegalizeOp - We know that the specified value has a legal type.
+ /// Recursively ensure that the operands have legal types, then return the
+ /// result.
+ SDValue LegalizeOp(SDValue O);
+
+ /// PerformInsertVectorEltInMemory - Some targets cannot handle a variable
+ /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it
+ /// is necessary to spill the vector being inserted into to memory, perform
+ /// the insert there, and then read the result back.
+ SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val,
+ SDValue Idx, DebugLoc dl);
+ SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
+ SDValue Idx, DebugLoc dl);
+
+ /// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+ /// performs the same shuffle in terms of order of result bytes, but on a type
+ /// whose vector element type is narrower than the original shuffle type.
+ /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+ SDValue ShuffleWithNarrowerEltType(MVT NVT, MVT VT, DebugLoc dl,
+ SDValue N1, SDValue N2,
+ SmallVectorImpl<int> &Mask) const;
+
+ bool LegalizeAllNodesNotLeadingTo(SDNode *N, SDNode *Dest,
+ SmallPtrSet<SDNode*, 32> &NodesLeadingTo);
+
+ void LegalizeSetCCCondCode(MVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
+ DebugLoc dl);
+
+ SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
+ SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_PPCF128);
+ SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, RTLIB::Libcall Call_I16,
+ RTLIB::Libcall Call_I32, RTLIB::Libcall Call_I64,
+ RTLIB::Libcall Call_I128);
+
+ SDValue EmitStackConvert(SDValue SrcOp, MVT SlotVT, MVT DestVT, DebugLoc dl);
+ SDValue ExpandBUILD_VECTOR(SDNode *Node);
+ SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
+ SDValue ExpandDBG_STOPPOINT(SDNode *Node);
+ void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results);
+ SDValue ExpandFCOPYSIGN(SDNode *Node);
+ SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, MVT DestVT,
+ DebugLoc dl);
+ SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, MVT DestVT, bool isSigned,
+ DebugLoc dl);
+ SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, MVT DestVT, bool isSigned,
+ DebugLoc dl);
+
+ SDValue ExpandBSWAP(SDValue Op, DebugLoc dl);
+ SDValue ExpandBitCount(unsigned Opc, SDValue Op, DebugLoc dl);
+
+ SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
+
+ void ExpandNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void PromoteNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+};
+}
+
+/// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+/// performs the same shuffle in terms of order of result bytes, but on a type
+/// whose vector element type is narrower than the original shuffle type.
+/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+SDValue
+SelectionDAGLegalize::ShuffleWithNarrowerEltType(MVT NVT, MVT VT, DebugLoc dl,
+ SDValue N1, SDValue N2,
+ SmallVectorImpl<int> &Mask) const {
+ MVT EltVT = NVT.getVectorElementType();
+ unsigned NumMaskElts = VT.getVectorNumElements();
+ unsigned NumDestElts = NVT.getVectorNumElements();
+ unsigned NumEltsGrowth = NumDestElts / NumMaskElts;
+
+ assert(NumEltsGrowth && "Cannot promote to vector type with fewer elts!");
+
+ if (NumEltsGrowth == 1)
+ return DAG.getVectorShuffle(NVT, dl, N1, N2, &Mask[0]);
+
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int Idx = Mask[i];
+ for (unsigned j = 0; j != NumEltsGrowth; ++j) {
+ if (Idx < 0)
+ NewMask.push_back(-1);
+ else
+ NewMask.push_back(Idx * NumEltsGrowth + j);
+ }
+ }
+ assert(NewMask.size() == NumDestElts && "Non-integer NumEltsGrowth?");
+ assert(TLI.isShuffleMaskLegal(NewMask, NVT) && "Shuffle not legal?");
+ return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
+}
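+
+// A minimal illustrative sketch of the mask widening above, on plain
+// arrays; growShuffleMask is a hypothetical helper, not part of this file.
+// With Growth == 2, the mask <0, 1, -1, 3> becomes <0, 1, 2, 3, -1, -1, 6, 7>.
+static void growShuffleMask(const int *Mask, unsigned NumMaskElts,
+ unsigned Growth, int *NewMask) {
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ for (unsigned j = 0; j != Growth; ++j)
+ NewMask[i * Growth + j] =
+ Mask[i] < 0 ? -1 : Mask[i] * (int)Growth + (int)j;
+}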
+
+SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag,
+ CodeGenOpt::Level ol)
+ : TLI(dag.getTargetLoweringInfo()), DAG(dag), OptLevel(ol),
+ ValueTypeActions(TLI.getValueTypeActions()) {
+ assert(MVT::LAST_VALUETYPE <= 32 &&
+ "Too many value types for ValueTypeActions to hold!");
+}
+
+void SelectionDAGLegalize::LegalizeDAG() {
+ LastCALLSEQ_END = DAG.getEntryNode();
+ IsLegalizingCall = false;
+
+ // The legalize process is inherently a bottom-up recursive process (users
+ // legalize their uses before themselves). Given infinite stack space, we
+ // could just start legalizing on the root and traverse the whole graph. In
+ // practice, however, this causes us to run out of stack space on large basic
+ // blocks. To avoid this problem, compute an ordering of the nodes where each
+ // node is only legalized after all of its operands are legalized.
+ DAG.AssignTopologicalOrder();
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = prior(DAG.allnodes_end()); I != next(E); ++I)
+ LegalizeOp(SDValue(I, 0));
+
+ // Finally, it's possible the root changed. Get the new root.
+ SDValue OldRoot = DAG.getRoot();
+ assert(LegalizedNodes.count(OldRoot) && "Root didn't get legalized?");
+ DAG.setRoot(LegalizedNodes[OldRoot]);
+
+ LegalizedNodes.clear();
+
+ // Remove dead nodes now.
+ DAG.RemoveDeadNodes();
+}
+
+
+/// FindCallEndFromCallStart - Given a chained node that is part of a call
+/// sequence, find the CALLSEQ_END node that terminates the call sequence.
+static SDNode *FindCallEndFromCallStart(SDNode *Node) {
+ if (Node->getOpcode() == ISD::CALLSEQ_END)
+ return Node;
+ if (Node->use_empty())
+ return 0; // No CallSeqEnd
+
+ // The chain is usually at the end.
+ SDValue TheChain(Node, Node->getNumValues()-1);
+ if (TheChain.getValueType() != MVT::Other) {
+ // Sometimes it's at the beginning.
+ TheChain = SDValue(Node, 0);
+ if (TheChain.getValueType() != MVT::Other) {
+ // Otherwise, hunt for it.
+ for (unsigned i = 1, e = Node->getNumValues(); i != e; ++i)
+ if (Node->getValueType(i) == MVT::Other) {
+ TheChain = SDValue(Node, i);
+ break;
+ }
+
+ // Otherwise, we walked into a node without a chain.
+ if (TheChain.getValueType() != MVT::Other)
+ return 0;
+ }
+ }
+
+ for (SDNode::use_iterator UI = Node->use_begin(),
+ E = Node->use_end(); UI != E; ++UI) {
+
+ // Make sure to only follow users of our token chain.
+ SDNode *User = *UI;
+ for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i)
+ if (User->getOperand(i) == TheChain)
+ if (SDNode *Result = FindCallEndFromCallStart(User))
+ return Result;
+ }
+ return 0;
+}
+
+/// FindCallStartFromCallEnd - Given a chained node that is part of a call
+/// sequence, find the CALLSEQ_START node that initiates the call sequence.
+static SDNode *FindCallStartFromCallEnd(SDNode *Node) {
+ assert(Node && "Didn't find callseq_start for a call??");
+ if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+
+ assert(Node->getOperand(0).getValueType() == MVT::Other &&
+ "Node doesn't have a token chain argument!");
+ return FindCallStartFromCallEnd(Node->getOperand(0).getNode());
+}
+
+/// LegalizeAllNodesNotLeadingTo - Recursively walk the operands of N, looking
+/// to see if any of them lead to Dest. If N cannot reach Dest, legalize it
+/// and return false; otherwise, return true.
+///
+/// Keep track of the nodes we find that actually do lead to Dest in
+/// NodesLeadingTo. This avoids retraversing them an exponential number of
+/// times.
+///
+bool SelectionDAGLegalize::LegalizeAllNodesNotLeadingTo(SDNode *N, SDNode *Dest,
+ SmallPtrSet<SDNode*, 32> &NodesLeadingTo) {
+ if (N == Dest) return true; // N certainly leads to Dest :)
+
+ // If we've already processed this node and it does lead to Dest, there is no
+ // need to reprocess it.
+ if (NodesLeadingTo.count(N)) return true;
+
+ // If the first result of this node has already been legalized, then it
+ // cannot lead to Dest.
+ if (LegalizedNodes.count(SDValue(N, 0))) return false;
+
+ // Okay, this node has not already been legalized. Check and legalize all
+ // operands. If none lead to Dest, then we can legalize this node.
+ bool OperandsLeadToDest = false;
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ OperandsLeadToDest |= // If an operand leads to Dest, so do we.
+ LegalizeAllNodesNotLeadingTo(N->getOperand(i).getNode(), Dest, NodesLeadingTo);
+
+ if (OperandsLeadToDest) {
+ NodesLeadingTo.insert(N);
+ return true;
+ }
+
+ // Okay, this node looks safe, legalize it and return false.
+ LegalizeOp(SDValue(N, 0));
+ return false;
+}
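+
+// A minimal illustrative sketch: the walk above is, in essence, a
+// depth-first reachability search that memoizes positive answers so shared
+// subgraphs are not retraversed. ToyNode and leadsTo are hypothetical
+// stand-ins and omit the legalization side effects.
+struct ToyNode {
+ ToyNode **Ops; // operand edges
+ unsigned NumOps;
+ bool KnownLeads; // memo: already proven to lead to Dest
+};
+static bool leadsTo(ToyNode *N, ToyNode *Dest) {
+ if (N == Dest || N->KnownLeads) return true;
+ bool Leads = false;
+ for (unsigned i = 0; i != N->NumOps; ++i)
+ Leads |= leadsTo(N->Ops[i], Dest);
+ if (Leads) N->KnownLeads = true; // remember positive answers
+ return Leads;
+}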
+
+/// ExpandConstantFP - Expands the ConstantFP node to an integer constant or
+/// a load from the constant pool.
+static SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP,
+ SelectionDAG &DAG, const TargetLowering &TLI) {
+ bool Extend = false;
+ DebugLoc dl = CFP->getDebugLoc();
+
+ // If a FP immediate is precise when represented as a float and if the
+ // target can do an extending load from float to double, we put it into
+ // the constant pool as a float, even if it is statically typed as a
+ // double. This shrinks FP constants and canonicalizes them for targets where
+ // an FP extending load is the same cost as a normal load (such as on the x87
+ // fp stack or PPC FP unit).
+ MVT VT = CFP->getValueType(0);
+ ConstantFP *LLVMC = const_cast<ConstantFP*>(CFP->getConstantFPValue());
+ if (!UseCP) {
+ assert((VT == MVT::f64 || VT == MVT::f32) && "Invalid type expansion");
+ return DAG.getConstant(LLVMC->getValueAPF().bitcastToAPInt(),
+ (VT == MVT::f64) ? MVT::i64 : MVT::i32);
+ }
+
+ MVT OrigVT = VT;
+ MVT SVT = VT;
+ while (SVT != MVT::f32) {
+ SVT = (MVT::SimpleValueType)(SVT.getSimpleVT() - 1);
+ if (CFP->isValueValidForType(SVT, CFP->getValueAPF()) &&
+ // Only do this if the target has a native EXTLOAD instruction from
+ // smaller type.
+ TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) &&
+ TLI.ShouldShrinkFPConstant(OrigVT)) {
+ const Type *SType = SVT.getTypeForMVT();
+ LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
+ VT = SVT;
+ Extend = true;
+ }
+ }
+
+ SDValue CPIdx = DAG.getConstantPool(LLVMC, TLI.getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ if (Extend)
+ return DAG.getExtLoad(ISD::EXTLOAD, dl,
+ OrigVT, DAG.getEntryNode(),
+ CPIdx, PseudoSourceValue::getConstantPool(),
+ 0, VT, false, Alignment);
+ return DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0, false, Alignment);
+}
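+
+// A minimal illustrative sketch: "precise when represented as a float"
+// reduces to a round-trip test (NaN aside); shrinksToFloat is a
+// hypothetical host-side helper, not part of this file.
+static bool shrinksToFloat(double D) {
+ float F = (float)D; // the FPTrunc performed above
+ return (double)F == D; // true iff the value survives unchanged
+}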
+
+/// ExpandUnalignedStore - Expands an unaligned store to 2 half-size stores.
+static
+SDValue ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ SDValue Val = ST->getValue();
+ MVT VT = Val.getValueType();
+ int Alignment = ST->getAlignment();
+ int SVOffset = ST->getSrcValueOffset();
+ DebugLoc dl = ST->getDebugLoc();
+ if (ST->getMemoryVT().isFloatingPoint() ||
+ ST->getMemoryVT().isVector()) {
+ MVT intVT = MVT::getIntegerVT(VT.getSizeInBits());
+ if (TLI.isTypeLegal(intVT)) {
+ // Expand to a bitconvert of the value to the integer type of the
+ // same size, then a (misaligned) int store.
+ // FIXME: Does not handle truncating floating point stores!
+ SDValue Result = DAG.getNode(ISD::BIT_CONVERT, dl, intVT, Val);
+ return DAG.getStore(Chain, dl, Result, Ptr, ST->getSrcValue(),
+ SVOffset, ST->isVolatile(), Alignment);
+ } else {
+ // Do an (aligned) store to a stack slot, then copy from the stack slot
+ // to the final destination using (unaligned) integer loads and stores.
+ MVT StoredVT = ST->getMemoryVT();
+ MVT RegVT =
+ TLI.getRegisterType(MVT::getIntegerVT(StoredVT.getSizeInBits()));
+ unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
+ unsigned RegBytes = RegVT.getSizeInBits() / 8;
+ unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
+
+ // Make sure the stack slot is also aligned for the register type.
+ SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
+
+ // Perform the original store, only redirected to the stack slot.
+ SDValue Store = DAG.getTruncStore(Chain, dl,
+ Val, StackPtr, NULL, 0, StoredVT);
+ SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+ SmallVector<SDValue, 8> Stores;
+ unsigned Offset = 0;
+
+ // Do all but the last copy using the full register width.
+ for (unsigned i = 1; i < NumRegs; i++) {
+ // Load one integer register's worth from the stack slot.
+ SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr, NULL, 0);
+ // Store it to the final location. Remember the store.
+ Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
+ ST->getSrcValue(), SVOffset + Offset,
+ ST->isVolatile(),
+ MinAlign(ST->getAlignment(), Offset)));
+ // Increment the pointers.
+ Offset += RegBytes;
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ Increment);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ }
+
+ // The last store may be partial. Do a truncating store. On big-endian
+ // machines this requires an extending load from the stack slot to ensure
+ // that the bits are in the right place.
+ MVT MemVT = MVT::getIntegerVT(8 * (StoredBytes - Offset));
+
+ // Load from the stack slot.
+ SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
+ NULL, 0, MemVT);
+
+ Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
+ ST->getSrcValue(), SVOffset + Offset,
+ MemVT, ST->isVolatile(),
+ MinAlign(ST->getAlignment(), Offset)));
+ // The order of the stores doesn't matter - say it with a TokenFactor.
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
+ Stores.size());
+ }
+ }
+ assert(ST->getMemoryVT().isInteger() &&
+ !ST->getMemoryVT().isVector() &&
+ "Unaligned store of unknown type.");
+ // Get the half-size VT
+ MVT NewStoredVT =
+ (MVT::SimpleValueType)(ST->getMemoryVT().getSimpleVT() - 1);
+ int NumBits = NewStoredVT.getSizeInBits();
+ int IncrementSize = NumBits / 8;
+
+ // Divide the stored value in two parts.
+ SDValue ShiftAmount = DAG.getConstant(NumBits, TLI.getShiftAmountTy());
+ SDValue Lo = Val;
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
+
+ // Store the two parts
+ SDValue Store1, Store2;
+ Store1 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Lo:Hi, Ptr,
+ ST->getSrcValue(), SVOffset, NewStoredVT,
+ ST->isVolatile(), Alignment);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+ Alignment = MinAlign(Alignment, IncrementSize);
+ Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr,
+ ST->getSrcValue(), SVOffset + IncrementSize,
+ NewStoredVT, ST->isVolatile(), Alignment);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+}
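+
+// A minimal illustrative sketch of the final split above, on host bytes and
+// assuming little-endian layout; store16 and storeUnaligned32 are
+// hypothetical helpers playing the role of the two truncating stores.
+static void store16(unsigned char *P, unsigned short V) {
+ P[0] = (unsigned char)V;
+ P[1] = (unsigned char)(V >> 8);
+}
+static void storeUnaligned32(unsigned char *P, unsigned V) {
+ store16(P, (unsigned short)V); // Lo part at Ptr
+ store16(P + 2, (unsigned short)(V >> 16)); // Hi part at Ptr + IncrementSize
+}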
+
+/// ExpandUnalignedLoad - Expands an unaligned load to 2 half-size loads.
+static
+SDValue ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ int SVOffset = LD->getSrcValueOffset();
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ MVT VT = LD->getValueType(0);
+ MVT LoadedVT = LD->getMemoryVT();
+ DebugLoc dl = LD->getDebugLoc();
+ if (VT.isFloatingPoint() || VT.isVector()) {
+ MVT intVT = MVT::getIntegerVT(LoadedVT.getSizeInBits());
+ if (TLI.isTypeLegal(intVT)) {
+ // Expand to a (misaligned) integer load of the same size,
+ // then bitconvert to floating point or vector.
+ SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getSrcValue(),
+ SVOffset, LD->isVolatile(),
+ LD->getAlignment());
+ SDValue Result = DAG.getNode(ISD::BIT_CONVERT, dl, LoadedVT, newLoad);
+ if (VT.isFloatingPoint() && LoadedVT != VT)
+ Result = DAG.getNode(ISD::FP_EXTEND, dl, VT, Result);
+
+ SDValue Ops[] = { Result, Chain };
+ return DAG.getMergeValues(Ops, 2, dl);
+ } else {
+ // Copy the value to an (aligned) stack slot using (unaligned) integer
+ // loads and stores, then do a (aligned) load from the stack slot.
+ MVT RegVT = TLI.getRegisterType(intVT);
+ unsigned LoadedBytes = LoadedVT.getSizeInBits() / 8;
+ unsigned RegBytes = RegVT.getSizeInBits() / 8;
+ unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;
+
+ // Make sure the stack slot is also aligned for the register type.
+ SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
+
+ SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+ SmallVector<SDValue, 8> Stores;
+ SDValue StackPtr = StackBase;
+ unsigned Offset = 0;
+
+ // Do all but the last copy using the full register width.
+ for (unsigned i = 1; i < NumRegs; i++) {
+ // Load one integer register's worth from the original location.
+ SDValue Load = DAG.getLoad(RegVT, dl, Chain, Ptr, LD->getSrcValue(),
+ SVOffset + Offset, LD->isVolatile(),
+ MinAlign(LD->getAlignment(), Offset));
+ // Follow the load with a store to the stack slot. Remember the store.
+ Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
+ NULL, 0));
+ // Increment the pointers.
+ Offset += RegBytes;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ Increment);
+ }
+
+ // The last copy may be partial. Do an extending load.
+ MVT MemVT = MVT::getIntegerVT(8 * (LoadedBytes - Offset));
+ SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr,
+ LD->getSrcValue(), SVOffset + Offset,
+ MemVT, LD->isVolatile(),
+ MinAlign(LD->getAlignment(), Offset));
+ // Follow the load with a store to the stack slot. Remember the store.
+ // On big-endian machines this requires a truncating store to ensure
+ // that the bits end up in the right place.
+ Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, StackPtr,
+ NULL, 0, MemVT));
+
+ // The order of the stores doesn't matter - say it with a TokenFactor.
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
+ Stores.size());
+
+ // Finally, perform the original load only redirected to the stack slot.
+ Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
+ NULL, 0, LoadedVT);
+
+ // Callers expect a MERGE_VALUES node.
+ SDValue Ops[] = { Load, TF };
+ return DAG.getMergeValues(Ops, 2, dl);
+ }
+ }
+ assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
+ "Unaligned load of unsupported type.");
+
+ // Compute the new VT that is half the size of the old one. This is an
+ // integer MVT.
+ unsigned NumBits = LoadedVT.getSizeInBits();
+ MVT NewLoadedVT;
+ NewLoadedVT = MVT::getIntegerVT(NumBits/2);
+ NumBits >>= 1;
+
+ unsigned Alignment = LD->getAlignment();
+ unsigned IncrementSize = NumBits / 8;
+ ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+ // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+ if (HiExtType == ISD::NON_EXTLOAD)
+ HiExtType = ISD::ZEXTLOAD;
+
+ // Load the value in two parts
+ SDValue Lo, Hi;
+ if (TLI.isLittleEndian()) {
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getSrcValue(),
+ SVOffset, NewLoadedVT, LD->isVolatile(), Alignment);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+ Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getSrcValue(),
+ SVOffset + IncrementSize, NewLoadedVT, LD->isVolatile(),
+ MinAlign(Alignment, IncrementSize));
+ } else {
+ Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getSrcValue(),
+ SVOffset, NewLoadedVT, LD->isVolatile(), Alignment);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getSrcValue(),
+ SVOffset + IncrementSize, NewLoadedVT, LD->isVolatile(),
+ MinAlign(Alignment, IncrementSize));
+ }
+
+ // Aggregate the two parts.
+ SDValue ShiftAmount = DAG.getConstant(NumBits, TLI.getShiftAmountTy());
+ SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
+ Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ SDValue Ops[] = { Result, TF };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
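+
+// A minimal illustrative sketch of the aggregation above, on host bytes and
+// assuming little-endian layout; load16 and loadUnaligned32 are
+// hypothetical helpers. The shift-and-or mirrors the SHL/OR nodes built
+// above.
+static unsigned load16(const unsigned char *P) {
+ return (unsigned)P[0] | ((unsigned)P[1] << 8);
+}
+static unsigned loadUnaligned32(const unsigned char *P) {
+ unsigned Lo = load16(P); // zero-extending load of the low half
+ unsigned Hi = load16(P + 2); // load of the high half
+ return Lo | (Hi << 16); // SHL the high part, then OR
+}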
+
+/// PerformInsertVectorEltInMemory - Some targets cannot handle a variable
+/// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it
+/// is necessary to spill the vector being inserted into to memory, perform
+/// the insert there, and then read the result back.
+SDValue SelectionDAGLegalize::
+PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
+ DebugLoc dl) {
+ SDValue Tmp1 = Vec;
+ SDValue Tmp2 = Val;
+ SDValue Tmp3 = Idx;
+
+ // If the target doesn't support this, we have to spill the input vector
+ // to a temporary stack slot, update the element, then reload it. This is
+ // badness. We could also load the value into a vector register (either
+ // with a "move to register" or an "extload into register" instruction),
+ // then permute it into place, if the idx is a constant and is supported
+ // by the target.
+ MVT VT = Tmp1.getValueType();
+ MVT EltVT = VT.getVectorElementType();
+ MVT IdxVT = Tmp3.getValueType();
+ MVT PtrVT = TLI.getPointerTy();
+ SDValue StackPtr = DAG.CreateStackTemporary(VT);
+
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+
+ // Store the vector.
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Tmp1, StackPtr,
+ PseudoSourceValue::getFixedStack(SPFI), 0);
+
+ // Truncate or zero extend offset to target pointer type.
+ unsigned CastOpc = IdxVT.bitsGT(PtrVT) ? ISD::TRUNCATE : ISD::ZERO_EXTEND;
+ Tmp3 = DAG.getNode(CastOpc, dl, PtrVT, Tmp3);
+ // Add the offset to the index.
+ unsigned EltSize = EltVT.getSizeInBits()/8;
+ Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3, DAG.getConstant(EltSize, IdxVT));
+ SDValue StackPtr2 = DAG.getNode(ISD::ADD, dl, IdxVT, Tmp3, StackPtr);
+ // Store the scalar value.
+ Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2,
+ PseudoSourceValue::getFixedStack(SPFI), 0, EltVT);
+ // Load the updated vector.
+ return DAG.getLoad(VT, dl, Ch, StackPtr,
+ PseudoSourceValue::getFixedStack(SPFI), 0);
+}
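+
+// A minimal illustrative sketch of the spill-update-reload dance above,
+// shrunk to a host-side 4-lane vector; V4 and insertViaStack are
+// hypothetical, and the index is masked only to keep the sketch in bounds.
+struct V4 { float Lane[4]; };
+static V4 insertViaStack(V4 Vec, unsigned Idx, float Val) {
+ float Slot[4]; // the stack temporary
+ for (unsigned i = 0; i != 4; ++i)
+ Slot[i] = Vec.Lane[i]; // store the whole vector
+ Slot[Idx & 3] = Val; // scalar store at Base + Idx * EltSize
+ for (unsigned i = 0; i != 4; ++i)
+ Vec.Lane[i] = Slot[i]; // load the updated vector back
+ return Vec;
+}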
+
+
+SDValue SelectionDAGLegalize::
+ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, DebugLoc dl) {
+ if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
+ // SCALAR_TO_VECTOR requires that the type of the value being inserted
+ // match the element type of the vector being created, except for
+ // integers, in which case the inserted value can be wider than the element.
+ MVT EltVT = Vec.getValueType().getVectorElementType();
+ if (Val.getValueType() == EltVT ||
+ (EltVT.isInteger() && Val.getValueType().bitsGE(EltVT))) {
+ SDValue ScVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ Vec.getValueType(), Val);
+
+ unsigned NumElts = Vec.getValueType().getVectorNumElements();
+ // We generate a shuffle of InVec and ScVec, so the shuffle mask
+ // should be 0,1,2,3,4,5... with the appropriate element replaced with
+ // elt 0 of the RHS.
+ SmallVector<int, 8> ShufOps;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShufOps.push_back(i != InsertPos->getZExtValue() ? i : NumElts);
+
+ return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec,
+ &ShufOps[0]);
+ }
+ }
+ return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);
+}
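+
+// A minimal illustrative sketch: the constant-index path above builds a
+// "replace one lane" mask; insertEltMask is a hypothetical helper.
+// Inserting into element 2 of a 4-wide vector yields <0, 1, 4, 3>: lanes
+// 0, 1 and 3 come from Vec, lane 2 comes from element 0 of ScVec.
+static void insertEltMask(unsigned NumElts, unsigned Pos, int *Mask) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask[i] = (i == Pos) ? (int)NumElts : (int)i;
+}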
+
+/// LegalizeOp - We know that the specified value has a legal type, and
+/// that its operands are legal. Now ensure that the operation itself
+/// is legal, recursively ensuring that the operands' operations remain
+/// legal.
+SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
+ if (Op.getOpcode() == ISD::TargetConstant) // Allow illegal target nodes.
+ return Op;
+
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+ assert(getTypeAction(Node->getValueType(i)) == Legal &&
+ "Unexpected illegal type!");
+
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+ assert((isTypeLegal(Node->getOperand(i).getValueType()) ||
+ Node->getOperand(i).getOpcode() == ISD::TargetConstant) &&
+ "Unexpected illegal type!");
+
+ // Note that LegalizeOp may be reentered even from single-use nodes, which
+ // means that we always must cache transformed nodes.
+ DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+ if (I != LegalizedNodes.end()) return I->second;
+
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+ SDValue Result = Op;
+ bool isCustom = false;
+
+ // Figure out the correct action; the way to query this varies by opcode
+ TargetLowering::LegalizeAction Action;
+ bool SimpleFinishLegalizing = true;
+ switch (Node->getOpcode()) {
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ case ISD::VAARG:
+ case ISD::STACKSAVE:
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
+ break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::EXTRACT_VECTOR_ELT:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(0).getValueType());
+ break;
+ case ISD::FP_ROUND_INREG:
+ case ISD::SIGN_EXTEND_INREG: {
+ MVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
+ break;
+ }
+ case ISD::SELECT_CC:
+ case ISD::SETCC:
+ case ISD::BR_CC: {
+ unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 :
+ Node->getOpcode() == ISD::SETCC ? 2 : 1;
+ unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0;
+ MVT OpVT = Node->getOperand(CompareOperand).getValueType();
+ ISD::CondCode CCCode =
+ cast<CondCodeSDNode>(Node->getOperand(CCOperand))->get();
+ Action = TLI.getCondCodeAction(CCCode, OpVT);
+ if (Action == TargetLowering::Legal) {
+ if (Node->getOpcode() == ISD::SELECT_CC)
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
+ else
+ Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
+ }
+ break;
+ }
+ case ISD::LOAD:
+ case ISD::STORE:
+ // FIXME: Model these properly. LOAD and STORE are complicated, and
+ // STORE expects the unlegalized operand in some cases.
+ SimpleFinishLegalizing = false;
+ break;
+ case ISD::CALLSEQ_START:
+ case ISD::CALLSEQ_END:
+ // FIXME: This shouldn't be necessary. These nodes have special properties
+ // dealing with the recursive nature of legalization. Removing this
+ // special case should be done as part of making LegalizeDAG non-recursive.
+ SimpleFinishLegalizing = false;
+ break;
+ case ISD::CALL:
+ // FIXME: Legalization for calls requires custom-lowering the call before
+ // legalizing the operands! (I haven't looked into precisely why.)
+ SimpleFinishLegalizing = false;
+ break;
+ case ISD::EXTRACT_ELEMENT:
+ case ISD::FLT_ROUNDS_:
+ case ISD::SADDO:
+ case ISD::SSUBO:
+ case ISD::UADDO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ case ISD::FPOWI:
+ case ISD::MERGE_VALUES:
+ case ISD::EH_RETURN:
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ // These operations lie about being legal: when they claim to be legal,
+ // they should actually be expanded.
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Legal)
+ Action = TargetLowering::Expand;
+ break;
+ case ISD::TRAMPOLINE:
+ case ISD::FRAMEADDR:
+ case ISD::RETURNADDR:
+ case ISD::FORMAL_ARGUMENTS:
+ // These operations lie about being legal: when they claim to be legal,
+ // they should actually be custom-lowered.
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Legal)
+ Action = TargetLowering::Custom;
+ break;
+ case ISD::BUILD_VECTOR:
+ // A weird case: legalization for BUILD_VECTOR never legalizes the
+ // operands!
+ // FIXME: This really sucks... changing it isn't semantically incorrect,
+ // but it massively pessimizes the code for floating-point BUILD_VECTORs
+ // because ConstantFP operands get legalized into constant pool loads
+ // before the BUILD_VECTOR code can see them. It doesn't usually bite,
+ // though, because BUILD_VECTORS usually get lowered into other nodes
+ // which get legalized properly.
+ SimpleFinishLegalizing = false;
+ break;
+ default:
+ if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
+ Action = TargetLowering::Legal;
+ } else {
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ }
+ break;
+ }
+
+ if (SimpleFinishLegalizing) {
+ SmallVector<SDValue, 8> Ops, ResultVals;
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+ Ops.push_back(LegalizeOp(Node->getOperand(i)));
+ switch (Node->getOpcode()) {
+ default: break;
+ case ISD::BR:
+ case ISD::BRIND:
+ case ISD::BR_JT:
+ case ISD::BR_CC:
+ case ISD::BRCOND:
+ case ISD::RET:
+ // Branches tweak the chain to include LastCALLSEQ_END
+ Ops[0] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ops[0],
+ LastCALLSEQ_END);
+ Ops[0] = LegalizeOp(Ops[0]);
+ LastCALLSEQ_END = DAG.getEntryNode();
+ break;
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ // Legalizing shifts/rotates requires adjusting the shift amount
+ // to the appropriate width.
+ if (!Ops[1].getValueType().isVector())
+ Ops[1] = LegalizeOp(DAG.getShiftAmountOperand(Ops[1]));
+ break;
+ }
+
+ Result = DAG.UpdateNodeOperands(Result.getValue(0), Ops.data(),
+ Ops.size());
+ switch (Action) {
+ case TargetLowering::Legal:
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+ ResultVals.push_back(Result.getValue(i));
+ break;
+ case TargetLowering::Custom:
+ // FIXME: The handling for custom lowering with multiple results is
+ // a complete mess.
+ Tmp1 = TLI.LowerOperation(Result, DAG);
+ if (Tmp1.getNode()) {
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) {
+ if (e == 1)
+ ResultVals.push_back(Tmp1);
+ else
+ ResultVals.push_back(Tmp1.getValue(i));
+ }
+ break;
+ }
+
+ // FALL THROUGH
+ case TargetLowering::Expand:
+ ExpandNode(Result.getNode(), ResultVals);
+ break;
+ case TargetLowering::Promote:
+ PromoteNode(Result.getNode(), ResultVals);
+ break;
+ }
+ if (!ResultVals.empty()) {
+ for (unsigned i = 0, e = ResultVals.size(); i != e; ++i) {
+ if (ResultVals[i] != SDValue(Node, i))
+ ResultVals[i] = LegalizeOp(ResultVals[i]);
+ AddLegalizedOperand(SDValue(Node, i), ResultVals[i]);
+ }
+ return ResultVals[Op.getResNo()];
+ }
+ }
+
+ switch (Node->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "NODE: "; Node->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to legalize this operator!");
+ abort();
+ case ISD::CALL:
+ // The only option for this is to custom lower it.
+ Tmp3 = TLI.LowerOperation(Result.getValue(0), DAG);
+ assert(Tmp3.getNode() && "Target didn't custom lower this node!");
+ // A call within a calling sequence must be legalized to something
+ // other than the normal CALLSEQ_END. Violating this gets Legalize
+ // into an infinite loop.
+ assert((!IsLegalizingCall ||
+ Node->getOpcode() != ISD::CALL ||
+ Tmp3.getNode()->getOpcode() != ISD::CALLSEQ_END) &&
+ "Nested CALLSEQ_START..CALLSEQ_END not supported.");
+
+ // The number of incoming and outgoing values should match, unless the final
+ // outgoing value is a flag.
+ assert((Tmp3.getNode()->getNumValues() == Result.getNode()->getNumValues() ||
+ (Tmp3.getNode()->getNumValues() == Result.getNode()->getNumValues() + 1 &&
+ Tmp3.getNode()->getValueType(Tmp3.getNode()->getNumValues() - 1) ==
+ MVT::Flag)) &&
+ "Lowering call/formal_arguments produced unexpected # results!");
+
+ // Since CALL/FORMAL_ARGUMENTS nodes produce multiple values, make sure to
+ // remember that we legalized all of them, so it doesn't get relegalized.
+ for (unsigned i = 0, e = Tmp3.getNode()->getNumValues(); i != e; ++i) {
+ if (Tmp3.getNode()->getValueType(i) == MVT::Flag)
+ continue;
+ Tmp1 = LegalizeOp(Tmp3.getValue(i));
+ if (Op.getResNo() == i)
+ Tmp2 = Tmp1;
+ AddLegalizedOperand(SDValue(Node, i), Tmp1);
+ }
+ return Tmp2;
+ case ISD::BUILD_VECTOR:
+ switch (TLI.getOperationAction(ISD::BUILD_VECTOR, Node->getValueType(0))) {
+ default: assert(0 && "This action is not supported yet!");
+ case TargetLowering::Custom:
+ Tmp3 = TLI.LowerOperation(Result, DAG);
+ if (Tmp3.getNode()) {
+ Result = Tmp3;
+ break;
+ }
+ // FALLTHROUGH
+ case TargetLowering::Expand:
+ Result = ExpandBUILD_VECTOR(Result.getNode());
+ break;
+ }
+ break;
+ case ISD::CALLSEQ_START: {
+ SDNode *CallEnd = FindCallEndFromCallStart(Node);
+
+ // Recursively legalize all of the inputs of the call end that do not lead
+ // to this call start. This ensures that any libcalls that need to be inserted
+ // are inserted *before* the CALLSEQ_START.
+ {SmallPtrSet<SDNode*, 32> NodesLeadingTo;
+ for (unsigned i = 0, e = CallEnd->getNumOperands(); i != e; ++i)
+ LegalizeAllNodesNotLeadingTo(CallEnd->getOperand(i).getNode(), Node,
+ NodesLeadingTo);
+ }
+
+ // Now that we legalized all of the inputs (which may have inserted
+ // libcalls) create the new CALLSEQ_START node.
+ Tmp1 = LegalizeOp(Node->getOperand(0)); // Legalize the chain.
+
+ // Merge in the last call to ensure that this call starts after the last
+ // call ended.
+ if (LastCALLSEQ_END.getOpcode() != ISD::EntryToken) {
+ Tmp1 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Tmp1, LastCALLSEQ_END);
+ Tmp1 = LegalizeOp(Tmp1);
+ }
+
+ // Do not try to legalize the target-specific arguments (#1+).
+ if (Tmp1 != Node->getOperand(0)) {
+ SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end());
+ Ops[0] = Tmp1;
+ Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size());
+ }
+
+ // Remember that the CALLSEQ_START is legalized.
+ AddLegalizedOperand(Op.getValue(0), Result);
+ if (Node->getNumValues() == 2) // If this has a flag result, remember it.
+ AddLegalizedOperand(Op.getValue(1), Result.getValue(1));
+
+ // Now that the callseq_start and all of the non-call nodes above this call
+ // sequence have been legalized, legalize the call itself. During this
+ // process, no libcalls can/will be inserted, guaranteeing that no calls
+ // can overlap.
+ assert(!IsLegalizingCall && "Inconsistent sequentialization of calls!");
+ // Note that we are selecting this call!
+ LastCALLSEQ_END = SDValue(CallEnd, 0);
+ IsLegalizingCall = true;
+
+ // Legalize the call, starting from the CALLSEQ_END.
+ LegalizeOp(LastCALLSEQ_END);
+ assert(!IsLegalizingCall && "CALLSEQ_END should have cleared this!");
+ return Result;
+ }
+ case ISD::CALLSEQ_END:
+ // If the CALLSEQ_START node hasn't been legalized first, legalize it. This
+ // will cause this node to be legalized as well, and handles libcalls right.
+ if (LastCALLSEQ_END.getNode() != Node) {
+ LegalizeOp(SDValue(FindCallStartFromCallEnd(Node), 0));
+ DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+ assert(I != LegalizedNodes.end() &&
+ "Legalizing the call start should have legalized this node!");
+ return I->second;
+ }
+
+ // Otherwise, the call start has been legalized and everything is going
+ // according to plan. Just legalize ourselves normally here.
+ Tmp1 = LegalizeOp(Node->getOperand(0)); // Legalize the chain.
+ // Do not try to legalize the target-specific arguments (#1+), except for
+ // an optional flag input.
+ if (Node->getOperand(Node->getNumOperands()-1).getValueType() != MVT::Flag){
+ if (Tmp1 != Node->getOperand(0)) {
+ SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end());
+ Ops[0] = Tmp1;
+ Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size());
+ }
+ } else {
+ Tmp2 = LegalizeOp(Node->getOperand(Node->getNumOperands()-1));
+ if (Tmp1 != Node->getOperand(0) ||
+ Tmp2 != Node->getOperand(Node->getNumOperands()-1)) {
+ SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end());
+ Ops[0] = Tmp1;
+ Ops.back() = Tmp2;
+ Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size());
+ }
+ }
+ assert(IsLegalizingCall && "Call sequence imbalance between start/end?");
+ // This finishes up call legalization.
+ IsLegalizingCall = false;
+
+ // If the CALLSEQ_END node has a flag, remember that we legalized it.
+ AddLegalizedOperand(SDValue(Node, 0), Result.getValue(0));
+ if (Node->getNumValues() == 2)
+ AddLegalizedOperand(SDValue(Node, 1), Result.getValue(1));
+ return Result.getValue(Op.getResNo());
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Node);
+ Tmp1 = LegalizeOp(LD->getChain()); // Legalize the chain.
+ Tmp2 = LegalizeOp(LD->getBasePtr()); // Legalize the base pointer.
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD) {
+ MVT VT = Node->getValueType(0);
+ Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp2, LD->getOffset());
+ Tmp3 = Result.getValue(0);
+ Tmp4 = Result.getValue(1);
+
+ switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
+ default: assert(0 && "This action is not supported yet!");
+ case TargetLowering::Legal:
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses()) {
+ unsigned ABIAlignment = TLI.getTargetData()->
+ getABITypeAlignment(LD->getMemoryVT().getTypeForMVT());
+ if (LD->getAlignment() < ABIAlignment) {
+ Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()), DAG,
+ TLI);
+ Tmp3 = Result.getOperand(0);
+ Tmp4 = Result.getOperand(1);
+ Tmp3 = LegalizeOp(Tmp3);
+ Tmp4 = LegalizeOp(Tmp4);
+ }
+ }
+ break;
+ case TargetLowering::Custom:
+ Tmp1 = TLI.LowerOperation(Tmp3, DAG);
+ if (Tmp1.getNode()) {
+ Tmp3 = LegalizeOp(Tmp1);
+ Tmp4 = LegalizeOp(Tmp1.getValue(1));
+ }
+ break;
+ case TargetLowering::Promote: {
+ // Only promote a load of vector type to another vector type.
+ assert(VT.isVector() && "Cannot promote this load!");
+ // Change base type to a different vector type.
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+
+ Tmp1 = DAG.getLoad(NVT, dl, Tmp1, Tmp2, LD->getSrcValue(),
+ LD->getSrcValueOffset(),
+ LD->isVolatile(), LD->getAlignment());
+ Tmp3 = LegalizeOp(DAG.getNode(ISD::BIT_CONVERT, dl, VT, Tmp1));
+ Tmp4 = LegalizeOp(Tmp1.getValue(1));
+ break;
+ }
+ }
+ // Since loads produce two values, make sure to remember that we
+ // legalized both of them.
+ AddLegalizedOperand(SDValue(Node, 0), Tmp3);
+ AddLegalizedOperand(SDValue(Node, 1), Tmp4);
+ return Op.getResNo() ? Tmp4 : Tmp3;
+ } else {
+ MVT SrcVT = LD->getMemoryVT();
+ unsigned SrcWidth = SrcVT.getSizeInBits();
+ int SVOffset = LD->getSrcValueOffset();
+ unsigned Alignment = LD->getAlignment();
+ bool isVolatile = LD->isVolatile();
+
+ if (SrcWidth != SrcVT.getStoreSizeInBits() &&
+ // Some targets pretend to have an i1 loading operation, and actually
+ // load an i8. This trick is correct for ZEXTLOAD because the top 7
+ // bits are guaranteed to be zero; it helps the optimizers understand
+ // that these bits are zero. It is also useful for EXTLOAD, since it
+ // tells the optimizers that those bits are undefined. It would be
+ // nice to have an effective generic way of getting these benefits...
+ // Until such a way is found, don't insist on promoting i1 here.
+ (SrcVT != MVT::i1 ||
+ TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) {
+ // Promote to a byte-sized load if not loading an integral number of
+ // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
+ unsigned NewWidth = SrcVT.getStoreSizeInBits();
+ MVT NVT = MVT::getIntegerVT(NewWidth);
+ SDValue Ch;
+
+ // The extra bits are guaranteed to be zero, since we stored them that
+ // way. A zext load from NVT thus automatically gives zext from SrcVT.
+
+ ISD::LoadExtType NewExtType =
+ ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
+
+ Result = DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
+ Tmp1, Tmp2, LD->getSrcValue(), SVOffset,
+ NVT, isVolatile, Alignment);
+
+ Ch = Result.getValue(1); // The chain.
+
+ if (ExtType == ISD::SEXTLOAD)
+ // Having the top bits zero doesn't help when sign extending.
+ Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+ Result.getValueType(),
+ Result, DAG.getValueType(SrcVT));
+ else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType())
+ // All the top bits are guaranteed to be zero - inform the optimizers.
+ Result = DAG.getNode(ISD::AssertZext, dl,
+ Result.getValueType(), Result,
+ DAG.getValueType(SrcVT));
+
+ Tmp1 = LegalizeOp(Result);
+ Tmp2 = LegalizeOp(Ch);
+ } else if (SrcWidth & (SrcWidth - 1)) {
+ // If not loading a power-of-2 number of bits, expand as two loads.
+ assert(SrcVT.isExtended() && !SrcVT.isVector() &&
+ "Unsupported extload!");
+ unsigned RoundWidth = 1 << Log2_32(SrcWidth);
+ assert(RoundWidth < SrcWidth);
+ unsigned ExtraWidth = SrcWidth - RoundWidth;
+ assert(ExtraWidth < RoundWidth);
+ assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+ "Load size not an integral number of bytes!");
+ MVT RoundVT = MVT::getIntegerVT(RoundWidth);
+ MVT ExtraVT = MVT::getIntegerVT(ExtraWidth);
+ SDValue Lo, Hi, Ch;
+ unsigned IncrementSize;
+
+ if (TLI.isLittleEndian()) {
+ // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
+ // Load the bottom RoundWidth bits.
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl,
+ Node->getValueType(0), Tmp1, Tmp2,
+ LD->getSrcValue(), SVOffset, RoundVT, isVolatile,
+ Alignment);
+
+ // Load the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getIntPtrConstant(IncrementSize));
+ Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2,
+ LD->getSrcValue(), SVOffset + IncrementSize,
+ ExtraVT, isVolatile,
+ MinAlign(Alignment, IncrementSize));
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Move the top bits to the right place.
+ Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(RoundWidth, TLI.getShiftAmountTy()));
+
+ // Join the hi and lo parts.
+ Result = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+ } else {
+ // Big endian - avoid unaligned loads.
+ // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
+ // Load the top RoundWidth bits.
+ Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2,
+ LD->getSrcValue(), SVOffset, RoundVT, isVolatile,
+ Alignment);
+
+ // Load the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getIntPtrConstant(IncrementSize));
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl,
+ Node->getValueType(0), Tmp1, Tmp2,
+ LD->getSrcValue(), SVOffset + IncrementSize,
+ ExtraVT, isVolatile,
+ MinAlign(Alignment, IncrementSize));
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Move the top bits to the right place.
+ Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(ExtraWidth, TLI.getShiftAmountTy()));
+
+ // Join the hi and lo parts.
+ Result = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+ }
+
+ Tmp1 = LegalizeOp(Result);
+ Tmp2 = LegalizeOp(Ch);
+ } else {
+ switch (TLI.getLoadExtAction(ExtType, SrcVT)) {
+ default: assert(0 && "This action is not supported yet!");
+ case TargetLowering::Custom:
+ isCustom = true;
+ // FALLTHROUGH
+ case TargetLowering::Legal:
+ Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp2, LD->getOffset());
+ Tmp1 = Result.getValue(0);
+ Tmp2 = Result.getValue(1);
+
+ if (isCustom) {
+ Tmp3 = TLI.LowerOperation(Result, DAG);
+ if (Tmp3.getNode()) {
+ Tmp1 = LegalizeOp(Tmp3);
+ Tmp2 = LegalizeOp(Tmp3.getValue(1));
+ }
+ } else {
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses()) {
+ unsigned ABIAlignment = TLI.getTargetData()->
+ getABITypeAlignment(LD->getMemoryVT().getTypeForMVT());
+ if (LD->getAlignment() < ABIAlignment){
+ Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()), DAG,
+ TLI);
+ Tmp1 = Result.getOperand(0);
+ Tmp2 = Result.getOperand(1);
+ Tmp1 = LegalizeOp(Tmp1);
+ Tmp2 = LegalizeOp(Tmp2);
+ }
+ }
+ }
+ break;
+ case TargetLowering::Expand:
+ // f64 = EXTLOAD f32 should expand to LOAD, FP_EXTEND
+ if (SrcVT == MVT::f32 && Node->getValueType(0) == MVT::f64) {
+ SDValue Load = DAG.getLoad(SrcVT, dl, Tmp1, Tmp2, LD->getSrcValue(),
+ LD->getSrcValueOffset(),
+ LD->isVolatile(), LD->getAlignment());
+ Result = DAG.getNode(ISD::FP_EXTEND, dl,
+ Node->getValueType(0), Load);
+ Tmp1 = LegalizeOp(Result); // Relegalize new nodes.
+ Tmp2 = LegalizeOp(Load.getValue(1));
+ break;
+ }
+ assert(ExtType != ISD::EXTLOAD && "EXTLOAD should always be supported!");
+ // Turn the unsupported load into an EXTLOAD followed by an explicit
+ // zero/sign extend inreg.
+ Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0),
+ Tmp1, Tmp2, LD->getSrcValue(),
+ LD->getSrcValueOffset(), SrcVT,
+ LD->isVolatile(), LD->getAlignment());
+ SDValue ValRes;
+ if (ExtType == ISD::SEXTLOAD)
+ ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+ Result.getValueType(),
+ Result, DAG.getValueType(SrcVT));
+ else
+ ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT);
+ Tmp1 = LegalizeOp(ValRes); // Relegalize new nodes.
+ Tmp2 = LegalizeOp(Result.getValue(1)); // Relegalize new nodes.
+ break;
+ }
+ }
+
+ // Since loads produce two values, make sure to remember that we legalized
+ // both of them.
+ AddLegalizedOperand(SDValue(Node, 0), Tmp1);
+ AddLegalizedOperand(SDValue(Node, 1), Tmp2);
+ return Op.getResNo() ? Tmp2 : Tmp1;
+ }
+ }
+ case ISD::STORE: {
+ StoreSDNode *ST = cast<StoreSDNode>(Node);
+ Tmp1 = LegalizeOp(ST->getChain()); // Legalize the chain.
+ Tmp2 = LegalizeOp(ST->getBasePtr()); // Legalize the pointer.
+ int SVOffset = ST->getSrcValueOffset();
+ unsigned Alignment = ST->getAlignment();
+ bool isVolatile = ST->isVolatile();
+
+ if (!ST->isTruncatingStore()) {
+ // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
+ // FIXME: We shouldn't do this for TargetConstantFP's.
+ // FIXME: move this to the DAG Combiner! Note that we can't regress due
+ // to phase ordering between legalized code and the dag combiner. This
+ // probably means that we need to integrate dag combiner and legalizer
+ // together.
+ // We generally can't do this one for long doubles.
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
+ if (CFP->getValueType(0) == MVT::f32 &&
+ getTypeAction(MVT::i32) == Legal) {
+ Tmp3 = DAG.getConstant(CFP->getValueAPF().
+ bitcastToAPInt().zextOrTrunc(32),
+ MVT::i32);
+ Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+ SVOffset, isVolatile, Alignment);
+ break;
+ } else if (CFP->getValueType(0) == MVT::f64) {
+ // If this target supports 64-bit registers, do a single 64-bit store.
+ if (getTypeAction(MVT::i64) == Legal) {
+ Tmp3 = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+ zextOrTrunc(64), MVT::i64);
+ Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+ SVOffset, isVolatile, Alignment);
+ break;
+ } else if (getTypeAction(MVT::i32) == Legal && !ST->isVolatile()) {
+ // Otherwise, if the target supports 32-bit registers, use 2 32-bit
+ // stores. If the target supports neither 32- nor 64-bits, this
+ // xform is certainly not worth it.
+ const APInt &IntVal =CFP->getValueAPF().bitcastToAPInt();
+ SDValue Lo = DAG.getConstant(APInt(IntVal).trunc(32), MVT::i32);
+ SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32);
+ if (TLI.isBigEndian()) std::swap(Lo, Hi);
+
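+ // For example, storing the f64 1.0 becomes two i32 stores of the words of
+ // its bit pattern 0x3FF0000000000000: Lo = 0x00000000 at offset 0 and
+ // Hi = 0x3FF00000 at offset 4 (word order fixed up above for big endian).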
+ Lo = DAG.getStore(Tmp1, dl, Lo, Tmp2, ST->getSrcValue(),
+ SVOffset, isVolatile, Alignment);
+ Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getIntPtrConstant(4));
+ Hi = DAG.getStore(Tmp1, dl, Hi, Tmp2, ST->getSrcValue(), SVOffset+4,
+ isVolatile, MinAlign(Alignment, 4U));
+
+ Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ break;
+ }
+ }
+ }
+
+ {
+ Tmp3 = LegalizeOp(ST->getValue());
+ Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp3, Tmp2,
+ ST->getOffset());
+
+ MVT VT = Tmp3.getValueType();
+ switch (TLI.getOperationAction(ISD::STORE, VT)) {
+ default: assert(0 && "This action is not supported yet!");
+ case TargetLowering::Legal:
+ // If this is an unaligned store and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses()) {
+ unsigned ABIAlignment = TLI.getTargetData()->
+ getABITypeAlignment(ST->getMemoryVT().getTypeForMVT());
+ if (ST->getAlignment() < ABIAlignment)
+ Result = ExpandUnalignedStore(cast<StoreSDNode>(Result.getNode()), DAG,
+ TLI);
+ }
+ break;
+ case TargetLowering::Custom:
+ Tmp1 = TLI.LowerOperation(Result, DAG);
+ if (Tmp1.getNode()) Result = Tmp1;
+ break;
+ case TargetLowering::Promote:
+ assert(VT.isVector() && "Unknown legal promote case!");
+ Tmp3 = DAG.getNode(ISD::BIT_CONVERT, dl,
+ TLI.getTypeToPromoteTo(ISD::STORE, VT), Tmp3);
+ Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2,
+ ST->getSrcValue(), SVOffset, isVolatile,
+ Alignment);
+ break;
+ }
+ break;
+ }
+ } else {
+ Tmp3 = LegalizeOp(ST->getValue());
+
+ MVT StVT = ST->getMemoryVT();
+ unsigned StWidth = StVT.getSizeInBits();
+
+ if (StWidth != StVT.getStoreSizeInBits()) {
+ // Promote to a byte-sized store with upper bits zero if not
+ // storing an integral number of bytes. For example, promote
+ // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
+ MVT NVT = MVT::getIntegerVT(StVT.getStoreSizeInBits());
+ Tmp3 = DAG.getZeroExtendInReg(Tmp3, dl, StVT);
+ Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+ SVOffset, NVT, isVolatile, Alignment);
+ } else if (StWidth & (StWidth - 1)) {
+ // If not storing a power-of-2 number of bits, expand as two stores.
+ assert(StVT.isExtended() && !StVT.isVector() &&
+ "Unsupported truncstore!");
+ unsigned RoundWidth = 1 << Log2_32(StWidth);
+ assert(RoundWidth < StWidth);
+ unsigned ExtraWidth = StWidth - RoundWidth;
+ assert(ExtraWidth < RoundWidth);
+ assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+ "Store size not an integral number of bytes!");
+ MVT RoundVT = MVT::getIntegerVT(RoundWidth);
+ MVT ExtraVT = MVT::getIntegerVT(ExtraWidth);
+ SDValue Lo, Hi;
+ unsigned IncrementSize;
+
+ if (TLI.isLittleEndian()) {
+ // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
+ // Store the bottom RoundWidth bits.
+ Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+ SVOffset, RoundVT,
+ isVolatile, Alignment);
+
+ // Store the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getIntPtrConstant(IncrementSize));
+ Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3,
+ DAG.getConstant(RoundWidth, TLI.getShiftAmountTy()));
+ Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, ST->getSrcValue(),
+ SVOffset + IncrementSize, ExtraVT, isVolatile,
+ MinAlign(Alignment, IncrementSize));
+ } else {
+ // Big endian - avoid unaligned stores.
+ // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
+ // Store the top RoundWidth bits.
+ Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3,
+ DAG.getConstant(ExtraWidth, TLI.getShiftAmountTy()));
+ Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, ST->getSrcValue(),
+ SVOffset, RoundVT, isVolatile, Alignment);
+
+ // Store the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getIntPtrConstant(IncrementSize));
+ Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+ SVOffset + IncrementSize, ExtraVT, isVolatile,
+ MinAlign(Alignment, IncrementSize));
+ }
+
+ // The order of the stores doesn't matter.
+ Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ } else {
+ if (Tmp1 != ST->getChain() || Tmp3 != ST->getValue() ||
+ Tmp2 != ST->getBasePtr())
+ Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp3, Tmp2,
+ ST->getOffset());
+
+ switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) {
+ default: assert(0 && "This action is not supported yet!");
+ case TargetLowering::Legal:
+ // If this is an unaligned store and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses()) {
+ unsigned ABIAlignment = TLI.getTargetData()->
+ getABITypeAlignment(ST->getMemoryVT().getTypeForMVT());
+ if (ST->getAlignment() < ABIAlignment)
+ Result = ExpandUnalignedStore(cast<StoreSDNode>(Result.getNode()), DAG,
+ TLI);
+ }
+ break;
+ case TargetLowering::Custom:
+ Result = TLI.LowerOperation(Result, DAG);
+ break;
+ case TargetLowering::Expand:
+ // TRUNCSTORE:i16 i32 -> STORE i16
+ assert(isTypeLegal(StVT) && "Do not know how to expand this store!");
+ Tmp3 = DAG.getNode(ISD::TRUNCATE, dl, StVT, Tmp3);
+ Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(),
+ SVOffset, isVolatile, Alignment);
+ break;
+ }
+ }
+ }
+ break;
+ }
+ }
+ assert(Result.getValueType() == Op.getValueType() &&
+ "Bad legalization!");
+
+ // Make sure that the generated code is itself legal.
+ if (Result != Op)
+ Result = LegalizeOp(Result);
+
+ // Note that LegalizeOp may be reentered even from single-use nodes, which
+ // means that we always must cache transformed nodes.
+ AddLegalizedOperand(Op, Result);
+ return Result;
+}
+
+SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ // Store the value to a temporary stack slot, then LOAD the returned part.
+ SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, NULL, 0);
+
+ // Add the offset to the index.
+ unsigned EltSize =
+ Vec.getValueType().getVectorElementType().getSizeInBits()/8;
+ Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
+ DAG.getConstant(EltSize, Idx.getValueType()));
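+ // For example, extracting element 2 of a <4 x i32> vector gives
+ // EltSize = 4 and a byte offset of 8 from the stack temporary.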
+
+ if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
+ Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
+ else
+ Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+
+ StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
+
+ return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, NULL, 0);
+}
+
+SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
+ DebugLoc dl = Node->getDebugLoc();
+ SDValue Tmp1 = Node->getOperand(0);
+ SDValue Tmp2 = Node->getOperand(1);
+ assert((Tmp2.getValueType() == MVT::f32 ||
+ Tmp2.getValueType() == MVT::f64) &&
+ "Ugly special-cased code!");
+ // Get the sign bit of the RHS.
+ SDValue SignBit;
+ MVT IVT = Tmp2.getValueType() == MVT::f64 ? MVT::i64 : MVT::i32;
+ if (isTypeLegal(IVT)) {
+ SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, IVT, Tmp2);
+ } else {
+ assert(isTypeLegal(TLI.getPointerTy()) &&
+ (TLI.getPointerTy() == MVT::i32 ||
+ TLI.getPointerTy() == MVT::i64) &&
+ "Legal type for load?!");
+ SDValue StackPtr = DAG.CreateStackTemporary(Tmp2.getValueType());
+ SDValue StorePtr = StackPtr, LoadPtr = StackPtr;
+ SDValue Ch =
+ DAG.getStore(DAG.getEntryNode(), dl, Tmp2, StorePtr, NULL, 0);
+ if (Tmp2.getValueType() == MVT::f64 && TLI.isLittleEndian())
+ LoadPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(),
+ LoadPtr, DAG.getIntPtrConstant(4));
+ SignBit = DAG.getExtLoad(ISD::SEXTLOAD, dl, TLI.getPointerTy(),
+ Ch, LoadPtr, NULL, 0, MVT::i32);
+ }
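+ // Either way, SignBit now holds an integer that is negative exactly when
+ // Tmp2's sign bit is set, so a signed compare against zero below recovers
+ // the sign of the RHS.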
+ SignBit =
+ DAG.getSetCC(dl, TLI.getSetCCResultType(SignBit.getValueType()),
+ SignBit, DAG.getConstant(0, SignBit.getValueType()),
+ ISD::SETLT);
+ // Get the absolute value of the result.
+ SDValue AbsVal = DAG.getNode(ISD::FABS, dl, Tmp1.getValueType(), Tmp1);
+ // Select between the nabs and abs value based on the sign bit of
+ // the input.
+ return DAG.getNode(ISD::SELECT, dl, AbsVal.getValueType(), SignBit,
+ DAG.getNode(ISD::FNEG, dl, AbsVal.getValueType(), AbsVal),
+ AbsVal);
+}
+
+SDValue SelectionDAGLegalize::ExpandDBG_STOPPOINT(SDNode* Node) {
+ DebugLoc dl = Node->getDebugLoc();
+ DwarfWriter *DW = DAG.getDwarfWriter();
+ bool useDEBUG_LOC = TLI.isOperationLegalOrCustom(ISD::DEBUG_LOC,
+ MVT::Other);
+ bool useLABEL = TLI.isOperationLegalOrCustom(ISD::DBG_LABEL, MVT::Other);
+
+ const DbgStopPointSDNode *DSP = cast<DbgStopPointSDNode>(Node);
+ GlobalVariable *CU_GV = cast<GlobalVariable>(DSP->getCompileUnit());
+ if (DW && (useDEBUG_LOC || useLABEL) && !CU_GV->isDeclaration()) {
+ DICompileUnit CU(cast<GlobalVariable>(DSP->getCompileUnit()));
+
+ unsigned Line = DSP->getLine();
+ unsigned Col = DSP->getColumn();
+
+ if (OptLevel == CodeGenOpt::None) {
+ // A bit self-referential to have DebugLoc on Debug_Loc nodes, but it
+ // won't hurt anything.
+ if (useDEBUG_LOC) {
+ return DAG.getNode(ISD::DEBUG_LOC, dl, MVT::Other, Node->getOperand(0),
+ DAG.getConstant(Line, MVT::i32),
+ DAG.getConstant(Col, MVT::i32),
+ DAG.getSrcValue(CU.getGV()));
+ } else {
+ unsigned ID = DW->RecordSourceLine(Line, Col, CU);
+ return DAG.getLabel(ISD::DBG_LABEL, dl, Node->getOperand(0), ID);
+ }
+ }
+ }
+ return Node->getOperand(0);
+}
+
+void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
+ SmallVectorImpl<SDValue> &Results) {
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+ DebugLoc dl = Node->getDebugLoc();
+ MVT VT = Node->getValueType(0);
+ SDValue Tmp1 = SDValue(Node, 0);
+ SDValue Tmp2 = SDValue(Node, 1);
+ SDValue Tmp3 = Node->getOperand(2);
+ SDValue Chain = Tmp1.getOperand(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+ SDValue Size = Tmp2.getOperand(1);
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+ unsigned StackAlign =
+ TLI.getTargetMachine().getFrameInfo()->getStackAlignment();
+ if (Align > StackAlign)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP,
+ DAG.getConstant(-(uint64_t)Align, VT));
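+ // The AND with -(uint64_t)Align clears the low log2(Align) bits, rounding
+ // SP down to the requested alignment (the stack grows toward lower
+ // addresses).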
+ Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+
+ Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+ DAG.getIntPtrConstant(0, true), SDValue());
+
+ Results.push_back(Tmp1);
+ Results.push_back(Tmp2);
+}
+
+/// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
+ /// condition code CC on the current target. This routine assumes LHS and RHS
+/// have already been legalized by LegalizeSetCCOperands. It expands SETCC with
+/// illegal condition code into AND / OR of multiple SETCC values.
+void SelectionDAGLegalize::LegalizeSetCCCondCode(MVT VT,
+ SDValue &LHS, SDValue &RHS,
+ SDValue &CC,
+ DebugLoc dl) {
+ MVT OpVT = LHS.getValueType();
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+ switch (TLI.getCondCodeAction(CCCode, OpVT)) {
+ default: assert(0 && "Unknown condition code action!");
+ case TargetLowering::Legal:
+ // Nothing to do.
+ break;
+ case TargetLowering::Expand: {
+ ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
+ unsigned Opc = 0;
+ switch (CCCode) {
+ default: assert(0 && "Don't know how to expand this condition!"); abort();
+ case ISD::SETOEQ: CC1 = ISD::SETEQ; CC2 = ISD::SETO; Opc = ISD::AND; break;
+ case ISD::SETOGT: CC1 = ISD::SETGT; CC2 = ISD::SETO; Opc = ISD::AND; break;
+ case ISD::SETOGE: CC1 = ISD::SETGE; CC2 = ISD::SETO; Opc = ISD::AND; break;
+ case ISD::SETOLT: CC1 = ISD::SETLT; CC2 = ISD::SETO; Opc = ISD::AND; break;
+ case ISD::SETOLE: CC1 = ISD::SETLE; CC2 = ISD::SETO; Opc = ISD::AND; break;
+ case ISD::SETONE: CC1 = ISD::SETNE; CC2 = ISD::SETO; Opc = ISD::AND; break;
+ case ISD::SETUEQ: CC1 = ISD::SETEQ; CC2 = ISD::SETUO; Opc = ISD::OR; break;
+ case ISD::SETUGT: CC1 = ISD::SETGT; CC2 = ISD::SETUO; Opc = ISD::OR; break;
+ case ISD::SETUGE: CC1 = ISD::SETGE; CC2 = ISD::SETUO; Opc = ISD::OR; break;
+ case ISD::SETULT: CC1 = ISD::SETLT; CC2 = ISD::SETUO; Opc = ISD::OR; break;
+ case ISD::SETULE: CC1 = ISD::SETLE; CC2 = ISD::SETUO; Opc = ISD::OR; break;
+ case ISD::SETUNE: CC1 = ISD::SETNE; CC2 = ISD::SETUO; Opc = ISD::OR; break;
+ // FIXME: Implement more expansions.
+ }
+
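+ // Each expansion pairs the plain comparison with an explicit
+ // ordered/unordered check: e.g. SETOGT becomes
+ // (SETGT LHS, RHS) AND (SETO LHS, RHS), while SETUGT becomes
+ // (SETGT LHS, RHS) OR (SETUO LHS, RHS).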
+ SDValue SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1);
+ SDValue SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2);
+ LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
+ RHS = SDValue();
+ CC = SDValue();
+ break;
+ }
+ }
+}
+
+/// EmitStackConvert - Emit a store/load combination to the stack. This stores
+/// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does
+/// a load from the stack slot to DestVT, extending it if needed.
+/// The resultant code need not be legal.
+SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
+ MVT SlotVT,
+ MVT DestVT,
+ DebugLoc dl) {
+ // Create the stack frame object.
+ unsigned SrcAlign =
+ TLI.getTargetData()->getPrefTypeAlignment(SrcOp.getValueType().
+ getTypeForMVT());
+ SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);
+
+ FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
+ int SPFI = StackPtrFI->getIndex();
+ const Value *SV = PseudoSourceValue::getFixedStack(SPFI);
+
+ unsigned SrcSize = SrcOp.getValueType().getSizeInBits();
+ unsigned SlotSize = SlotVT.getSizeInBits();
+ unsigned DestSize = DestVT.getSizeInBits();
+ unsigned DestAlign =
+ TLI.getTargetData()->getPrefTypeAlignment(DestVT.getTypeForMVT());
+
+ // Emit a store to the stack slot. Use a truncstore if the input value is
+ // larger than SlotVT.
+ SDValue Store;
+
+ if (SrcSize > SlotSize)
+ Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr,
+ SV, 0, SlotVT, false, SrcAlign);
+ else {
+ assert(SrcSize == SlotSize && "Invalid store");
+ Store = DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr,
+ SV, 0, false, SrcAlign);
+ }
+
+ // Result is a load from the stack slot.
+ if (SlotSize == DestSize)
+ return DAG.getLoad(DestVT, dl, Store, FIPtr, SV, 0, false, DestAlign);
+
+ assert(SlotSize < DestSize && "Unknown extension!");
+ return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, SV, 0, SlotVT,
+ false, DestAlign);
+}
+
+SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
+ DebugLoc dl = Node->getDebugLoc();
+ // Create a vector sized/aligned stack slot, store the value to element #0,
+ // then load the whole vector back out.
+ SDValue StackPtr = DAG.CreateStackTemporary(Node->getValueType(0));
+
+ FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(StackPtr);
+ int SPFI = StackPtrFI->getIndex();
+
+ SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), dl, Node->getOperand(0),
+ StackPtr,
+ PseudoSourceValue::getFixedStack(SPFI), 0,
+ Node->getValueType(0).getVectorElementType());
+ return DAG.getLoad(Node->getValueType(0), dl, Ch, StackPtr,
+ PseudoSourceValue::getFixedStack(SPFI), 0);
+}
+
+
+/// ExpandBUILD_VECTOR - Expand a BUILD_VECTOR node on targets that don't
+/// support the operation, but do support the resultant vector type.
+SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
+ unsigned NumElems = Node->getNumOperands();
+ SDValue SplatValue = Node->getOperand(0);
+ DebugLoc dl = Node->getDebugLoc();
+ MVT VT = Node->getValueType(0);
+ MVT OpVT = SplatValue.getValueType();
+ MVT EltVT = VT.getVectorElementType();
+
+ // If the only non-undef value is the low element, turn this into a
+ // SCALAR_TO_VECTOR node. If this is { X, X, X, X }, determine X.
+ bool isOnlyLowElement = true;
+
+ // FIXME: it would be far nicer to change this into map<SDValue,uint64_t>
+ // and use a bitmask instead of a list of elements.
+ // FIXME: this doesn't treat <0, u, 0, u>, for example, as a splat.
+ std::map<SDValue, std::vector<unsigned> > Values;
+ Values[SplatValue].push_back(0);
+ bool isConstant = true;
+ if (!isa<ConstantFPSDNode>(SplatValue) && !isa<ConstantSDNode>(SplatValue) &&
+ SplatValue.getOpcode() != ISD::UNDEF)
+ isConstant = false;
+
+ for (unsigned i = 1; i < NumElems; ++i) {
+ SDValue V = Node->getOperand(i);
+ Values[V].push_back(i);
+ if (V.getOpcode() != ISD::UNDEF)
+ isOnlyLowElement = false;
+ if (SplatValue != V)
+ SplatValue = SDValue(0, 0);
+
+ // If this isn't a constant element or an undef, we can't use a constant
+ // pool load.
+ if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V) &&
+ V.getOpcode() != ISD::UNDEF)
+ isConstant = false;
+ }
+
+ if (isOnlyLowElement) {
+ // If the low element is an undef too, then this whole thing is undef.
+ if (Node->getOperand(0).getOpcode() == ISD::UNDEF)
+ return DAG.getUNDEF(VT);
+ // Otherwise, turn this into a scalar_to_vector node.
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));
+ }
+
+ // If all elements are constants, create a load from the constant pool.
+ if (isConstant) {
+ std::vector<Constant*> CV;
+ for (unsigned i = 0, e = NumElems; i != e; ++i) {
+ if (ConstantFPSDNode *V =
+ dyn_cast<ConstantFPSDNode>(Node->getOperand(i))) {
+ CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+ } else if (ConstantSDNode *V =
+ dyn_cast<ConstantSDNode>(Node->getOperand(i))) {
+ CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+ } else {
+ assert(Node->getOperand(i).getOpcode() == ISD::UNDEF);
+ const Type *OpNTy = OpVT.getTypeForMVT();
+ CV.push_back(UndefValue::get(OpNTy));
+ }
+ }
+ Constant *CP = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(CP, TLI.getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ return DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, Alignment);
+ }
+
+ if (SplatValue.getNode()) { // Splat of one value?
+ // Build the shuffle constant vector: <0, 0, 0, 0>
+ SmallVector<int, 8> ZeroVec(NumElems, 0);
+
+ // If the target supports VECTOR_SHUFFLE and this shuffle mask, use it.
+ if (TLI.isShuffleMaskLegal(ZeroVec, Node->getValueType(0))) {
+ // Get the splatted value into the low element of a vector register.
+ SDValue LowValVec =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, SplatValue);
+
+ // Return shuffle(LowValVec, undef, <0,0,0,0>)
+ return DAG.getVectorShuffle(VT, dl, LowValVec, DAG.getUNDEF(VT),
+ &ZeroVec[0]);
+ }
+ }
+
+ // If there are only two unique elements, we may be able to turn this into a
+ // vector shuffle.
+ if (Values.size() == 2) {
+ // Get the two values in deterministic order.
+ SDValue Val1 = Node->getOperand(1);
+ SDValue Val2;
+ std::map<SDValue, std::vector<unsigned> >::iterator MI = Values.begin();
+ if (MI->first != Val1)
+ Val2 = MI->first;
+ else
+ Val2 = (++MI)->first;
+
+ // If Val1 is an undef, make sure it ends up as Val2, to ensure that our
+ // vector shuffle has the undef vector on the RHS.
+ if (Val1.getOpcode() == ISD::UNDEF)
+ std::swap(Val1, Val2);
+
+ // Build the shuffle constant vector: e.g. <0, 4, 0, 4>
+ SmallVector<int, 8> ShuffleMask(NumElems, -1);
+
+ // Set elements of the shuffle mask for Val1.
+ std::vector<unsigned> &Val1Elts = Values[Val1];
+ for (unsigned i = 0, e = Val1Elts.size(); i != e; ++i)
+ ShuffleMask[Val1Elts[i]] = 0;
+
+ // Set elements of the shuffle mask for Val2.
+ std::vector<unsigned> &Val2Elts = Values[Val2];
+ for (unsigned i = 0, e = Val2Elts.size(); i != e; ++i)
+ if (Val2.getOpcode() != ISD::UNDEF)
+ ShuffleMask[Val2Elts[i]] = NumElems;
+
+ // If the target supports SCALAR_TO_VECTOR and this shuffle mask, use it.
+ if (TLI.isOperationLegalOrCustom(ISD::SCALAR_TO_VECTOR, VT) &&
+ TLI.isShuffleMaskLegal(ShuffleMask, VT)) {
+ Val1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Val1);
+ Val2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Val2);
+ return DAG.getVectorShuffle(VT, dl, Val1, Val2, &ShuffleMask[0]);
+ }
+ }
+
+ // Otherwise, we can't handle this case efficiently. Allocate a sufficiently
+ // aligned object on the stack, store each element into it, then load
+ // the result as a vector.
+ // Create the stack frame object.
+ SDValue FIPtr = DAG.CreateStackTemporary(VT);
+ int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
+ const Value *SV = PseudoSourceValue::getFixedStack(FI);
+
+ // Emit a store of each element to the stack slot.
+ SmallVector<SDValue, 8> Stores;
+ unsigned TypeByteSize = OpVT.getSizeInBits() / 8;
+ // Store (in the right endianness) the elements to memory.
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+ // Ignore undef elements.
+ if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+
+ unsigned Offset = TypeByteSize*i;
+
+ SDValue Idx = DAG.getConstant(Offset, FIPtr.getValueType());
+ Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx);
+
+ Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i),
+ Idx, SV, Offset));
+ }
+
+ SDValue StoreChain;
+ if (!Stores.empty()) // Not all undef elements?
+ StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &Stores[0], Stores.size());
+ else
+ StoreChain = DAG.getEntryNode();
+
+ // Result is a load from the stack slot.
+ return DAG.getLoad(VT, dl, StoreChain, FIPtr, SV, 0);
+}
+
+ // ExpandLibCall - Expand a node into a call to a libcall and return the
+ // call's result value. Note that this interface returns the result in a
+ // single SDValue; there is no separate hi part.
+SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+ bool isSigned) {
+ assert(!IsLegalizingCall && "Cannot overlap legalization of calls!");
+ // The input chain to this libcall is the entry node of the function.
+ // Legalizing the call will automatically add the previous call to the
+ // dependence.
+ SDValue InChain = DAG.getEntryNode();
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+ MVT ArgVT = Node->getOperand(i).getValueType();
+ const Type *ArgTy = ArgVT.getTypeForMVT();
+ Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy;
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy());
+
+ // Splice the libcall in wherever FindInputOutputChains tells us to.
+ const Type *RetTy = Node->getValueType(0).getTypeForMVT();
+ std::pair<SDValue, SDValue> CallInfo =
+ TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+ CallingConv::C, false, Callee, Args, DAG,
+ Node->getDebugLoc());
+
+ // Legalize the call sequence, starting with the chain. This will advance
+ // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
+ // was added by LowerCallTo (guaranteeing proper serialization of calls).
+ LegalizeOp(CallInfo.second);
+ return CallInfo.first;
+}
+
+SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
+ RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_PPCF128) {
+ RTLIB::Libcall LC;
+ switch (Node->getValueType(0).getSimpleVT()) {
+ default: assert(0 && "Unexpected request for libcall!");
+ case MVT::f32: LC = Call_F32; break;
+ case MVT::f64: LC = Call_F64; break;
+ case MVT::f80: LC = Call_F80; break;
+ case MVT::ppcf128: LC = Call_PPCF128; break;
+ }
+ return ExpandLibCall(LC, Node, false);
+}
+
+SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
+ RTLIB::Libcall Call_I16,
+ RTLIB::Libcall Call_I32,
+ RTLIB::Libcall Call_I64,
+ RTLIB::Libcall Call_I128) {
+ RTLIB::Libcall LC;
+ switch (Node->getValueType(0).getSimpleVT()) {
+ default: assert(0 && "Unexpected request for libcall!");
+ case MVT::i16: LC = Call_I16; break;
+ case MVT::i32: LC = Call_I32; break;
+ case MVT::i64: LC = Call_I64; break;
+ case MVT::i128: LC = Call_I128; break;
+ }
+ return ExpandLibCall(LC, Node, isSigned);
+}
+
+/// ExpandLegalINT_TO_FP - This function is responsible for legalizing a
+/// INT_TO_FP operation of the specified operand when the target requests that
+/// we expand it. At this point, we know that the result and operand types are
+/// legal for the target.
+SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
+ SDValue Op0,
+ MVT DestVT,
+ DebugLoc dl) {
+ if (Op0.getValueType() == MVT::i32) {
+ // simple 32-bit [signed|unsigned] integer to float/double expansion
+
+ // Get the stack frame index of an 8-byte buffer.
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
+
+ // word offset constant for Hi/Lo address computation
+ SDValue WordOff = DAG.getConstant(sizeof(int), TLI.getPointerTy());
+ // set up Hi and Lo (into buffer) address based on endian
+ SDValue Hi = StackSlot;
+ SDValue Lo = DAG.getNode(ISD::ADD, dl,
+ TLI.getPointerTy(), StackSlot, WordOff);
+ if (TLI.isLittleEndian())
+ std::swap(Hi, Lo);
+
+ // if signed map to unsigned space
+ SDValue Op0Mapped;
+ if (isSigned) {
+ // constant used to invert sign bit (signed to unsigned mapping)
+ SDValue SignBit = DAG.getConstant(0x80000000u, MVT::i32);
+ Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit);
+ } else {
+ Op0Mapped = Op0;
+ }
+ // store the lo of the constructed double - based on integer input
+ SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl,
+ Op0Mapped, Lo, NULL, 0);
+ // initial hi portion of constructed double
+ SDValue InitialHi = DAG.getConstant(0x43300000u, MVT::i32);
+ // store the hi of the constructed double - biased exponent
+ SDValue Store2=DAG.getStore(Store1, dl, InitialHi, Hi, NULL, 0);
+ // load the constructed double
+ SDValue Load = DAG.getLoad(MVT::f64, dl, Store2, StackSlot, NULL, 0);
+ // FP constant to bias correct the final result
+ SDValue Bias = DAG.getConstantFP(isSigned ?
+ BitsToDouble(0x4330000080000000ULL) :
+ BitsToDouble(0x4330000000000000ULL),
+ MVT::f64);
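+ // This is the classic exponent-bias trick: the buffer now holds the bit
+ // pattern 0x43300000:<input word>, which as an f64 has the value
+ // 2^52 + (input treated as unsigned). Subtracting the matching bias
+ // (2^52, or 2^52 + 2^31 for the sign-flipped signed case) recovers the
+ // original integer value exactly, since f64 has a 52-bit mantissa.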
+ // subtract the bias
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias);
+ // final result
+ SDValue Result;
+ // handle final rounding
+ if (DestVT == MVT::f64) {
+ // do nothing
+ Result = Sub;
+ } else if (DestVT.bitsLT(MVT::f64)) {
+ Result = DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+ DAG.getIntPtrConstant(0));
+ } else if (DestVT.bitsGT(MVT::f64)) {
+ Result = DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+ }
+ return Result;
+ }
+ assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
+ SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
+
+ SDValue SignSet = DAG.getSetCC(dl, TLI.getSetCCResultType(Op0.getValueType()),
+ Op0, DAG.getConstant(0, Op0.getValueType()),
+ ISD::SETLT);
+ SDValue Zero = DAG.getIntPtrConstant(0), Four = DAG.getIntPtrConstant(4);
+ SDValue CstOffset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(),
+ SignSet, Four, Zero);
+
+ // If the sign bit of the integer is set, the large number will be treated
+ // as a negative number. To counteract this, the dynamic code adds an
+ // offset depending on the data type.
+ uint64_t FF;
+ switch (Op0.getValueType().getSimpleVT()) {
+ default: assert(0 && "Unsupported integer type!");
+ case MVT::i8 : FF = 0x43800000ULL; break; // 2^8 (as a float)
+ case MVT::i16: FF = 0x47800000ULL; break; // 2^16 (as a float)
+ case MVT::i32: FF = 0x4F800000ULL; break; // 2^32 (as a float)
+ case MVT::i64: FF = 0x5F800000ULL; break; // 2^64 (as a float)
+ }
+ if (TLI.isLittleEndian()) FF <<= 32;
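+ // FudgeFactor packs two f32 bit patterns into one i64 constant-pool
+ // entry: 0.0f in one half and 2^N in the other. The shift above places
+ // 2^N at byte offset 4 on little-endian targets (it already sits there on
+ // big-endian ones), so the CstOffset of 0 or 4 selects 0.0f for
+ // non-negative inputs and 2^N when the sign bit was set.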
+ Constant *FudgeFactor = ConstantInt::get(Type::Int64Ty, FF);
+
+ SDValue CPIdx = DAG.getConstantPool(FudgeFactor, TLI.getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ CPIdx = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), CPIdx, CstOffset);
+ Alignment = std::min(Alignment, 4u);
+ SDValue FudgeInReg;
+ if (DestVT == MVT::f32)
+ FudgeInReg = DAG.getLoad(MVT::f32, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, Alignment);
+ else {
+ FudgeInReg =
+ LegalizeOp(DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT,
+ DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ MVT::f32, false, Alignment));
+ }
+
+ return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
+}
+
+/// PromoteLegalINT_TO_FP - This function is responsible for legalizing a
+/// *INT_TO_FP operation of the specified operand when the target requests that
+/// we promote it. At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP
+/// operation that takes a larger input.
+SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp,
+ MVT DestVT,
+ bool isSigned,
+ DebugLoc dl) {
+ // First step, figure out the appropriate *INT_TO_FP operation to use.
+ MVT NewInTy = LegalOp.getValueType();
+
+ unsigned OpToUse = 0;
+
+ // Scan for the appropriate larger type to use.
+ while (1) {
+ NewInTy = (MVT::SimpleValueType)(NewInTy.getSimpleVT()+1);
+ assert(NewInTy.isInteger() && "Ran out of possibilities!");
+
+ // If the target supports SINT_TO_FP of this type, use it.
+ if (TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, NewInTy)) {
+ OpToUse = ISD::SINT_TO_FP;
+ break;
+ }
+ if (isSigned) continue;
+
+ // If the target supports UINT_TO_FP of this type, use it.
+ if (TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, NewInTy)) {
+ OpToUse = ISD::UINT_TO_FP;
+ break;
+ }
+
+ // Otherwise, try a larger type.
+ }
+
+ // Okay, we found the operation and type to use. Sign- or zero-extend our
+ // input to the desired type, then run the operation on it.
+ return DAG.getNode(OpToUse, dl, DestVT,
+ DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+ dl, NewInTy, LegalOp));
+}
+
+/// PromoteLegalFP_TO_INT - This function is responsible for legalizing a
+/// FP_TO_*INT operation of the specified operand when the target requests that
+/// we promote it. At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT
+/// operation that returns a larger result.
+SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
+ MVT DestVT,
+ bool isSigned,
+ DebugLoc dl) {
+ // First step, figure out the appropriate FP_TO*INT operation to use.
+ MVT NewOutTy = DestVT;
+
+ unsigned OpToUse = 0;
+
+ // Scan for the appropriate larger type to use.
+ while (1) {
+ NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT()+1);
+ assert(NewOutTy.isInteger() && "Ran out of possibilities!");
+
+ if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewOutTy)) {
+ OpToUse = ISD::FP_TO_SINT;
+ break;
+ }
+
+ if (TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) {
+ OpToUse = ISD::FP_TO_UINT;
+ break;
+ }
+
+ // Otherwise, try a larger type.
+ }
+
+
+ // Okay, we found the operation and type to use.
+ SDValue Operation = DAG.getNode(OpToUse, dl, NewOutTy, LegalOp);
+
+ // Truncate the result of the extended FP_TO_*INT operation to the desired
+ // size.
+ return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
+}
+
+/// ExpandBSWAP - Open code the operations for BSWAP of the specified operation.
+///
+SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, DebugLoc dl) {
+ MVT VT = Op.getValueType();
+ MVT SHVT = TLI.getShiftAmountTy();
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
+ switch (VT.getSimpleVT()) {
+ default: assert(0 && "Unhandled Expand type in BSWAP!"); abort();
+ case MVT::i16:
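+ // Swap the two bytes: (x << 8) | (x >> 8), e.g. 0xAABB -> 0xBBAA.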
+ Tmp2 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+ return DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ case MVT::i32:
+ Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, SHVT));
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, SHVT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3, DAG.getConstant(0xFF0000, VT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(0xFF00, VT));
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+ Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+ return DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+ case MVT::i64:
+ Tmp8 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(56, SHVT));
+ Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(40, SHVT));
+ Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, SHVT));
+ Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+ Tmp4 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+ Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, SHVT));
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, SHVT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, SHVT));
+ Tmp7 = DAG.getNode(ISD::AND, dl, VT, Tmp7, DAG.getConstant(255ULL<<48, VT));
+ Tmp6 = DAG.getNode(ISD::AND, dl, VT, Tmp6, DAG.getConstant(255ULL<<40, VT));
+ Tmp5 = DAG.getNode(ISD::AND, dl, VT, Tmp5, DAG.getConstant(255ULL<<32, VT));
+ Tmp4 = DAG.getNode(ISD::AND, dl, VT, Tmp4, DAG.getConstant(255ULL<<24, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3, DAG.getConstant(255ULL<<16, VT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(255ULL<<8 , VT));
+ Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp7);
+ Tmp6 = DAG.getNode(ISD::OR, dl, VT, Tmp6, Tmp5);
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+ Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+ Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp6);
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+ return DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp4);
+ }
+}
+
+/// ExpandBitCount - Expand the specified bitcount instruction into operations.
+///
+SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
+ DebugLoc dl) {
+ switch (Opc) {
+ default: assert(0 && "Cannot expand this yet!");
+ case ISD::CTPOP: {
+ static const uint64_t mask[6] = {
+ 0x5555555555555555ULL, 0x3333333333333333ULL,
+ 0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
+ 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
+ };
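+ // Classic parallel popcount: pass i sums adjacent (1 << i)-bit fields,
+ // so after log2(len) passes the whole word holds the population count.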
+ MVT VT = Op.getValueType();
+ MVT ShVT = TLI.getShiftAmountTy();
+ unsigned len = VT.getSizeInBits();
+ for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
+ //x = (x & mask[i][len/8]) + (x >> (1 << i) & mask[i][len/8])
+ unsigned EltSize = VT.isVector() ?
+ VT.getVectorElementType().getSizeInBits() : len;
+ SDValue Tmp2 = DAG.getConstant(APInt(EltSize, mask[i]), VT);
+ SDValue Tmp3 = DAG.getConstant(1ULL << i, ShVT);
+ Op = DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::AND, dl, VT, Op, Tmp2),
+ DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3),
+ Tmp2));
+ }
+ return Op;
+ }
+ case ISD::CTLZ: {
+ // for now, we do this:
+ // x = x | (x >> 1);
+ // x = x | (x >> 2);
+ // ...
+ // x = x | (x >>16);
+ // x = x | (x >>32); // for 64-bit input
+ // return popcount(~x);
+ //
+ // but see also: http://www.hackersdelight.org/HDcode/nlz.cc
+ MVT VT = Op.getValueType();
+ MVT ShVT = TLI.getShiftAmountTy();
+ unsigned len = VT.getSizeInBits();
+ for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
+ SDValue Tmp3 = DAG.getConstant(1ULL << i, ShVT);
+ Op = DAG.getNode(ISD::OR, dl, VT, Op,
+ DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3));
+ }
+ Op = DAG.getNOT(dl, Op, VT);
+ return DAG.getNode(ISD::CTPOP, dl, VT, Op);
+ }
+ case ISD::CTTZ: {
+ // for now, we use: { return popcount(~x & (x - 1)); }
+ // unless the target has ctlz but not ctpop, in which case we use:
+ // { return 32 - nlz(~x & (x-1)); }
+ // see also http://www.hackersdelight.org/HDcode/ntz.cc
+ MVT VT = Op.getValueType();
+ SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNOT(dl, Op, VT),
+ DAG.getNode(ISD::SUB, dl, VT, Op,
+ DAG.getConstant(1, VT)));
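+ // ~x & (x - 1) yields a mask of exactly the trailing zero bits:
+ // e.g. x = 0b10100 -> x-1 = 0b10011, ~x = ...01011, AND = 0b00011,
+ // whose popcount (2) is the number of trailing zeros.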
+ // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
+ if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::CTLZ, VT))
+ return DAG.getNode(ISD::SUB, dl, VT,
+ DAG.getConstant(VT.getSizeInBits(), VT),
+ DAG.getNode(ISD::CTLZ, dl, VT, Tmp3));
+ return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3);
+ }
+ }
+}
+
+void SelectionDAGLegalize::ExpandNode(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ DebugLoc dl = Node->getDebugLoc();
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+ switch (Node->getOpcode()) {
+ case ISD::CTPOP:
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::BSWAP:
+ Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
+ break;
+ case ISD::FRAMEADDR:
+ case ISD::RETURNADDR:
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ Results.push_back(DAG.getConstant(0, Node->getValueType(0)));
+ break;
+ case ISD::FLT_ROUNDS_:
+ Results.push_back(DAG.getConstant(1, Node->getValueType(0)));
+ break;
+ case ISD::EH_RETURN:
+ case ISD::DECLARE:
+ case ISD::DBG_LABEL:
+ case ISD::EH_LABEL:
+ case ISD::PREFETCH:
+ case ISD::MEMBARRIER:
+ case ISD::VAEND:
+ Results.push_back(Node->getOperand(0));
+ break;
+ case ISD::DBG_STOPPOINT:
+ Results.push_back(ExpandDBG_STOPPOINT(Node));
+ break;
+ case ISD::DYNAMIC_STACKALLOC:
+ ExpandDYNAMIC_STACKALLOC(Node, Results);
+ break;
+ case ISD::MERGE_VALUES:
+ for (unsigned i = 0; i < Node->getNumValues(); i++)
+ Results.push_back(Node->getOperand(i));
+ break;
+ case ISD::UNDEF: {
+ MVT VT = Node->getValueType(0);
+ if (VT.isInteger())
+ Results.push_back(DAG.getConstant(0, VT));
+ else if (VT.isFloatingPoint())
+ Results.push_back(DAG.getConstantFP(0, VT));
+ else
+ assert(0 && "Unknown value type!");
+ break;
+ }
+ case ISD::TRAP: {
+ // If this operation is not supported, lower it to an 'abort()' call.
+ TargetLowering::ArgListTy Args;
+ std::pair<SDValue, SDValue> CallResult =
+ TLI.LowerCallTo(Node->getOperand(0), Type::VoidTy,
+ false, false, false, false, CallingConv::C, false,
+ DAG.getExternalSymbol("abort", TLI.getPointerTy()),
+ Args, DAG, dl);
+ Results.push_back(CallResult.second);
+ break;
+ }
+ case ISD::FP_ROUND:
+ case ISD::BIT_CONVERT:
+ Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
+ Node->getValueType(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::FP_EXTEND:
+ Tmp1 = EmitStackConvert(Node->getOperand(0),
+ Node->getOperand(0).getValueType(),
+ Node->getValueType(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::SIGN_EXTEND_INREG: {
+ // NOTE: we could fall back on load/store here too for targets without
+ // SAR. However, it is doubtful that any exist.
+ MVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ unsigned BitsDiff = Node->getValueType(0).getSizeInBits() -
+ ExtraVT.getSizeInBits();
+ SDValue ShiftCst = DAG.getConstant(BitsDiff, TLI.getShiftAmountTy());
+ Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
+ Node->getOperand(0), ShiftCst);
+ Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::FP_ROUND_INREG: {
+ // The only way we can lower this is to turn it into a TRUNCSTORE,
+ // EXTLOAD pair, targeting a temporary location (a stack slot).
+
+ // NOTE: there is a choice here between constantly creating new stack
+ // slots and always reusing the same one. We currently always create
+ // new ones, as reuse may inhibit scheduling.
+ MVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT,
+ Node->getValueType(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
+ Node->getOperand(0), Node->getValueType(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::FP_TO_UINT: {
+ SDValue True, False;
+ MVT VT = Node->getOperand(0).getValueType();
+ MVT NVT = Node->getValueType(0);
+ const uint64_t zero[] = {0, 0};
+ APFloat apf = APFloat(APInt(VT.getSizeInBits(), 2, zero));
+ APInt x = APInt::getSignBit(NVT.getSizeInBits());
+ (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
+ Tmp1 = DAG.getConstantFP(apf, VT);
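+ // Tmp1 is 2^(N-1) as a float, N being the result width. Inputs below it
+ // convert exactly via FP_TO_SINT; inputs at or above it are reduced by
+ // 2^(N-1) before converting, and the XOR below adds that bit back into
+ // the integer result.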
+ Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
+ Node->getOperand(0),
+ Tmp1, ISD::SETLT);
+ True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
+ False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
+ DAG.getNode(ISD::FSUB, dl, VT,
+ Node->getOperand(0), Tmp1));
+ False = DAG.getNode(ISD::XOR, dl, NVT, False,
+ DAG.getConstant(x, NVT));
+ Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp2, True, False);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::VAARG: {
+ const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ MVT VT = Node->getValueType(0);
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ SDValue VAList = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2, V, 0);
+ // Increment the pointer, VAList, to the next vaarg
+ Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+ DAG.getConstant(TLI.getTargetData()->
+ getTypeAllocSize(VT.getTypeForMVT()),
+ TLI.getPointerTy()));
+ // Store the incremented VAList to the legalized pointer
+ Tmp3 = DAG.getStore(VAList.getValue(1), dl, Tmp3, Tmp2, V, 0);
+ // Load the actual argument out of the pointer VAList
+ Results.push_back(DAG.getLoad(VT, dl, Tmp3, VAList, NULL, 0));
+ Results.push_back(Results[0].getValue(1));
+ break;
+ }
+ case ISD::VACOPY: {
+ // This defaults to loading a pointer from the input and storing it to the
+ // output, returning the chain.
+ const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
+ const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
+ Tmp1 = DAG.getLoad(TLI.getPointerTy(), dl, Node->getOperand(0),
+ Node->getOperand(2), VS, 0);
+ Tmp1 = DAG.getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1), VD, 0);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (Node->getOperand(0).getValueType().getVectorNumElements() == 1)
+ // This must be an access of the only element. Return it.
+ Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, Node->getValueType(0),
+ Node->getOperand(0));
+ else
+ Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0));
+ Results.push_back(Tmp1);
+ break;
+ case ISD::EXTRACT_SUBVECTOR:
+ Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0)));
+ break;
+ case ISD::CONCAT_VECTORS: {
+ // Use extract/insert/build vector for now. We might try to be
+ // more clever later.
+ SmallVector<SDValue, 8> Ops;
+ unsigned NumOperands = Node->getNumOperands();
+ for (unsigned i=0; i < NumOperands; ++i) {
+ SDValue SubOp = Node->getOperand(i);
+ MVT VVT = SubOp.getNode()->getValueType(0);
+ MVT EltVT = VVT.getVectorElementType();
+ unsigned NumSubElem = VVT.getVectorNumElements();
+ for (unsigned j=0; j < NumSubElem; ++j) {
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+ DAG.getIntPtrConstant(j)));
+ }
+ }
+ Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0),
+ &Ops[0], Ops.size());
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SCALAR_TO_VECTOR:
+ Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
+ break;
+ case ISD::INSERT_VECTOR_ELT:
+ Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0),
+ Node->getOperand(1),
+ Node->getOperand(2), dl));
+ break;
+ case ISD::VECTOR_SHUFFLE: {
+ SmallVector<int, 8> Mask;
+ cast<ShuffleVectorSDNode>(Node)->getMask(Mask);
+
+ MVT VT = Node->getValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (Mask[i] < 0) {
+ Ops.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+ unsigned Idx = Mask[i];
+ if (Idx < NumElems)
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ Node->getOperand(0),
+ DAG.getIntPtrConstant(Idx)));
+ else
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ Node->getOperand(1),
+ DAG.getIntPtrConstant(Idx - NumElems)));
+ }
+ Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::EXTRACT_ELEMENT: {
+ MVT OpTy = Node->getOperand(0).getValueType();
+ if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+ // 1 -> Hi
+ Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
+ DAG.getConstant(OpTy.getSizeInBits()/2,
+ TLI.getShiftAmountTy()));
+ Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
+ } else {
+ // 0 -> Lo
+ Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0),
+ Node->getOperand(0));
+ }
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::STACKSAVE:
+ // Expand to CopyFromReg if the target set
+ // StackPointerRegisterToSaveRestore.
+ if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
+ Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, SP,
+ Node->getValueType(0)));
+ Results.push_back(Results[0].getValue(1));
+ } else {
+ Results.push_back(DAG.getUNDEF(Node->getValueType(0)));
+ Results.push_back(Node->getOperand(0));
+ }
+ break;
+ case ISD::STACKRESTORE:
+ // Expand to CopyToReg if the target set
+ // StackPointerRegisterToSaveRestore.
+ if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
+ Results.push_back(DAG.getCopyToReg(Node->getOperand(0), dl, SP,
+ Node->getOperand(1)));
+ } else {
+ Results.push_back(Node->getOperand(0));
+ }
+ break;
+ case ISD::FCOPYSIGN:
+ Results.push_back(ExpandFCOPYSIGN(Node));
+ break;
+ case ISD::FNEG:
+ // Expand Y = FNEG(X) -> Y = SUB -0.0, X
+ Tmp1 = DAG.getConstantFP(-0.0, Node->getValueType(0));
+ Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1,
+ Node->getOperand(0));
+ Results.push_back(Tmp1);
+ break;
+ case ISD::FABS: {
+ // Expand Y = FABS(X) -> Y = (X >u 0.0) ? X : fneg(X).
+ MVT VT = Node->getValueType(0);
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = DAG.getConstantFP(0.0, VT);
+ Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(Tmp1.getValueType()),
+ Tmp1, Tmp2, ISD::SETUGT);
+ Tmp3 = DAG.getNode(ISD::FNEG, dl, VT, Tmp1);
+ Tmp1 = DAG.getNode(ISD::SELECT, dl, VT, Tmp2, Tmp1, Tmp3);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::FSQRT:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80, RTLIB::SQRT_PPCF128));
+ break;
+ case ISD::FSIN:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64,
+ RTLIB::SIN_F80, RTLIB::SIN_PPCF128));
+ break;
+ case ISD::FCOS:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64,
+ RTLIB::COS_F80, RTLIB::COS_PPCF128));
+ break;
+ case ISD::FLOG:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64,
+ RTLIB::LOG_F80, RTLIB::LOG_PPCF128));
+ break;
+ case ISD::FLOG2:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80, RTLIB::LOG2_PPCF128));
+ break;
+ case ISD::FLOG10:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+ RTLIB::LOG10_F80, RTLIB::LOG10_PPCF128));
+ break;
+ case ISD::FEXP:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64,
+ RTLIB::EXP_F80, RTLIB::EXP_PPCF128));
+ break;
+ case ISD::FEXP2:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80, RTLIB::EXP2_PPCF128));
+ break;
+ case ISD::FTRUNC:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80, RTLIB::TRUNC_PPCF128));
+ break;
+ case ISD::FFLOOR:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+ RTLIB::FLOOR_F80, RTLIB::FLOOR_PPCF128));
+ break;
+ case ISD::FCEIL:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80, RTLIB::CEIL_PPCF128));
+ break;
+ case ISD::FRINT:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64,
+ RTLIB::RINT_F80, RTLIB::RINT_PPCF128));
+ break;
+ case ISD::FNEARBYINT:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_PPCF128));
+ break;
+ case ISD::FPOWI:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64,
+ RTLIB::POWI_F80, RTLIB::POWI_PPCF128));
+ break;
+ case ISD::FPOW:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64,
+ RTLIB::POW_F80, RTLIB::POW_PPCF128));
+ break;
+ case ISD::FDIV:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
+ RTLIB::DIV_F80, RTLIB::DIV_PPCF128));
+ break;
+ case ISD::FREM:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64,
+ RTLIB::REM_F80, RTLIB::REM_PPCF128));
+ break;
+ case ISD::ConstantFP: {
+ ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
+ // Check to see if this FP immediate is already legal.
+ bool isLegal = false;
+ for (TargetLowering::legal_fpimm_iterator I = TLI.legal_fpimm_begin(),
+ E = TLI.legal_fpimm_end(); I != E; ++I) {
+ if (CFP->isExactlyValue(*I)) {
+ isLegal = true;
+ break;
+ }
+ }
+ // If this is a legal constant, turn it into a TargetConstantFP node.
+ if (isLegal)
+ Results.push_back(SDValue(Node, 0));
+ else
+ Results.push_back(ExpandConstantFP(CFP, true, DAG, TLI));
+ break;
+ }
+ case ISD::EHSELECTION: {
+ unsigned Reg = TLI.getExceptionSelectorRegister();
+ assert(Reg && "Can't expand to unknown register!");
+ Results.push_back(DAG.getCopyFromReg(Node->getOperand(1), dl, Reg,
+ Node->getValueType(0)));
+ Results.push_back(Results[0].getValue(1));
+ break;
+ }
+ case ISD::EXCEPTIONADDR: {
+ unsigned Reg = TLI.getExceptionAddressRegister();
+ assert(Reg && "Can't expand to unknown register!");
+ Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, Reg,
+ Node->getValueType(0)));
+ Results.push_back(Results[0].getValue(1));
+ break;
+ }
+ case ISD::SUB: {
+ MVT VT = Node->getValueType(0);
+ assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
+ "Don't know how to expand this subtraction!");
+ Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1),
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT));
+ Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, VT));
+ Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
+ break;
+ }
+ case ISD::UREM:
+ case ISD::SREM: {
+ MVT VT = Node->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ bool isSigned = Node->getOpcode() == ISD::SREM;
+ unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+ Tmp2 = Node->getOperand(0);
+ Tmp3 = Node->getOperand(1);
+ if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
+ Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
+ } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
+ // X % Y -> X-X/Y*Y
+ Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3);
+ Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3);
+ Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1);
+ } else if (isSigned) {
+ Tmp1 = ExpandIntLibCall(Node, true, RTLIB::SREM_I16, RTLIB::SREM_I32,
+ RTLIB::SREM_I64, RTLIB::SREM_I128);
+ } else {
+ Tmp1 = ExpandIntLibCall(Node, false, RTLIB::UREM_I16, RTLIB::UREM_I32,
+ RTLIB::UREM_I64, RTLIB::UREM_I128);
+ }
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::UDIV:
+ case ISD::SDIV: {
+ bool isSigned = Node->getOpcode() == ISD::SDIV;
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+ MVT VT = Node->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ if (TLI.isOperationLegalOrCustom(DivRemOpc, VT))
+ Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1));
+ else if (isSigned)
+ Tmp1 = ExpandIntLibCall(Node, true, RTLIB::SDIV_I16, RTLIB::SDIV_I32,
+ RTLIB::SDIV_I64, RTLIB::SDIV_I128);
+ else
+ Tmp1 = ExpandIntLibCall(Node, false, RTLIB::UDIV_I16, RTLIB::UDIV_I32,
+ RTLIB::UDIV_I64, RTLIB::UDIV_I128);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::MULHU:
+ case ISD::MULHS: {
+ unsigned ExpandOpcode = Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI :
+ ISD::SMUL_LOHI;
+ MVT VT = Node->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ assert(TLI.isOperationLegalOrCustom(ExpandOpcode, VT) &&
+ "If this wasn't legal, it shouldn't have been created!");
+ Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1));
+ Results.push_back(Tmp1.getValue(1));
+ break;
+ }
+ case ISD::MUL: {
+ MVT VT = Node->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ // See if multiply or divide can be lowered using two-result operations.
+ // We just need the low half of the multiply; try both the signed
+ // and unsigned forms. If the target supports both SMUL_LOHI and
+ // UMUL_LOHI, form a preference by checking which forms of plain
+ // MULH it supports.
+ bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, VT);
+ bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, VT);
+ bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, VT);
+ bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, VT);
+ unsigned OpToUse = 0;
+ if (HasSMUL_LOHI && !HasMULHS) {
+ OpToUse = ISD::SMUL_LOHI;
+ } else if (HasUMUL_LOHI && !HasMULHU) {
+ OpToUse = ISD::UMUL_LOHI;
+ } else if (HasSMUL_LOHI) {
+ OpToUse = ISD::SMUL_LOHI;
+ } else if (HasUMUL_LOHI) {
+ OpToUse = ISD::UMUL_LOHI;
+ }
+ if (OpToUse) {
+ Results.push_back(DAG.getNode(OpToUse, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1)));
+ break;
+ }
+ Tmp1 = ExpandIntLibCall(Node, false, RTLIB::MUL_I16, RTLIB::MUL_I32,
+ RTLIB::MUL_I64, RTLIB::MUL_I128);
+ Results.push_back(Tmp1);
+ break;
+ }
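+  // The preference order above, restated as a standalone predicate
+  // (editorial sketch; the Has* flags stand in for the target queries):
+  //
+  //   unsigned PickMulOp(bool HasSLOHI, bool HasULOHI,
+  //                      bool HasMULHS, bool HasMULHU) {
+  //     if (HasSLOHI && !HasMULHS) return ISD::SMUL_LOHI;
+  //     if (HasULOHI && !HasMULHU) return ISD::UMUL_LOHI;
+  //     if (HasSLOHI)              return ISD::SMUL_LOHI;
+  //     if (HasULOHI)              return ISD::UMUL_LOHI;
+  //     return 0;  // no two-result form: fall back to the MUL libcall
+  //   }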
+ case ISD::SADDO:
+ case ISD::SSUBO: {
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
+ ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+ LHS, RHS);
+ Results.push_back(Sum);
+ MVT OType = Node->getValueType(1);
+
+ SDValue Zero = DAG.getConstant(0, LHS.getValueType());
+
+ // LHSSign -> LHS >= 0
+ // RHSSign -> RHS >= 0
+ // SumSign -> Sum >= 0
+ //
+ // Add:
+ // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+ // Sub:
+ // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+ //
+ SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
+ SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
+ SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
+ Node->getOpcode() == ISD::SADDO ?
+ ISD::SETEQ : ISD::SETNE);
+
+ SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
+ SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
+
+ SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
+ Results.push_back(Cmp);
+ break;
+ }
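+  // The sign rule above on plain two's-complement values (editorial sketch;
+  // SAddOverflows is a hypothetical helper, showing the add case):
+  //
+  //   #include <cstdint>
+  //   bool SAddOverflows(int32_t LHS, int32_t RHS) {
+  //     int32_t Sum = (int32_t)((uint32_t)LHS + (uint32_t)RHS);
+  //     bool LHSSign = LHS >= 0, RHSSign = RHS >= 0, SumSign = Sum >= 0;
+  //     return (LHSSign == RHSSign) && (LHSSign != SumSign);
+  //   }
+  //
+  // For SSUBO the first test flips to (LHSSign != RHSSign), matching the
+  // SETNE condition chosen above.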
+ case ISD::UADDO:
+ case ISD::USUBO: {
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ?
+ ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+ LHS, RHS);
+ Results.push_back(Sum);
+ Results.push_back(DAG.getSetCC(dl, Node->getValueType(1), Sum, LHS,
+ Node->getOpcode () == ISD::UADDO ?
+ ISD::SETULT : ISD::SETUGT));
+ break;
+ }
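+  // Why comparing Sum against LHS suffices (editorial sketch): with
+  // wraparound arithmetic an unsigned add overflowed exactly when the
+  // result is smaller than an operand, and a subtract when it is larger:
+  //
+  //   #include <cstdint>
+  //   bool UAddOverflows(uint32_t L, uint32_t R) { return L + R < L; }
+  //   bool USubOverflows(uint32_t L, uint32_t R) { return L - R > L; }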
+ case ISD::BUILD_PAIR: {
+ MVT PairTy = Node->getValueType(0);
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
+ Tmp2 = DAG.getNode(ISD::SHL, dl, PairTy, Tmp2,
+ DAG.getConstant(PairTy.getSizeInBits()/2,
+ TLI.getShiftAmountTy()));
+ Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
+ break;
+ }
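+  // The zext/shl/or combination above is the usual packing of two halves
+  // into a double-wide integer (editorial sketch for a 32+32 -> 64 pair):
+  //
+  //   #include <cstdint>
+  //   uint64_t BuildPair(uint32_t Lo, uint32_t Hi) {
+  //     return (uint64_t)Lo | ((uint64_t)Hi << 32);
+  //   }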
+ case ISD::SELECT:
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ Tmp3 = Node->getOperand(2);
+ if (Tmp1.getOpcode() == ISD::SETCC) {
+ Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
+ Tmp2, Tmp3,
+ cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
+ } else {
+ Tmp1 = DAG.getSelectCC(dl, Tmp1,
+ DAG.getConstant(0, Tmp1.getValueType()),
+ Tmp2, Tmp3, ISD::SETNE);
+ }
+ Results.push_back(Tmp1);
+ break;
+ case ISD::BR_JT: {
+ SDValue Chain = Node->getOperand(0);
+ SDValue Table = Node->getOperand(1);
+ SDValue Index = Node->getOperand(2);
+
+ MVT PTy = TLI.getPointerTy();
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned EntrySize = MF.getJumpTableInfo()->getEntrySize();
+    Index = DAG.getNode(ISD::MUL, dl, PTy,
+                        Index, DAG.getConstant(EntrySize, PTy));
+ SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+
+ MVT MemVT = MVT::getIntegerVT(EntrySize * 8);
+ SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr,
+ PseudoSourceValue::getJumpTable(), 0, MemVT);
+ Addr = LD;
+ if (TLI.getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ // For PIC, the sequence is:
+ // BRIND(load(Jumptable + index) + RelocBase)
+ // RelocBase can be JumpTable, GOT or some sort of global base.
+ Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr,
+ TLI.getPICJumpTableRelocBase(Table, DAG));
+ }
+ Tmp1 = DAG.getNode(ISD::BRIND, dl, MVT::Other, LD.getValue(1), Addr);
+ Results.push_back(Tmp1);
+ break;
+ }
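+  // In pseudo-C the sequence built above is roughly (editorial sketch):
+  //
+  //   Entry = *(intptr_t *)(Table + Index * EntrySize);  // sext load
+  //   if (PIC) Entry += RelocBase;    // entries are offsets under PIC
+  //   goto *Entry;                    // BRIND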
+ case ISD::BRCOND:
+ // Expand brcond's setcc into its constituent parts and create a BR_CC
+ // Node.
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ if (Tmp2.getOpcode() == ISD::SETCC) {
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other,
+ Tmp1, Tmp2.getOperand(2),
+ Tmp2.getOperand(0), Tmp2.getOperand(1),
+ Node->getOperand(2));
+ } else {
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
+ DAG.getCondCode(ISD::SETNE), Tmp2,
+ DAG.getConstant(0, Tmp2.getValueType()),
+ Node->getOperand(2));
+ }
+ Results.push_back(Tmp1);
+ break;
+ case ISD::SETCC: {
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ Tmp3 = Node->getOperand(2);
+ LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2, Tmp3, dl);
+
+ // If we expanded the SETCC into an AND/OR, return the new node
+ if (Tmp2.getNode() == 0) {
+ Results.push_back(Tmp1);
+ break;
+ }
+
+ // Otherwise, SETCC for the given comparison type must be completely
+ // illegal; expand it into a SELECT_CC.
+ MVT VT = Node->getValueType(0);
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
+ DAG.getConstant(1, VT), DAG.getConstant(0, VT), Tmp3);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SELECT_CC: {
+ Tmp1 = Node->getOperand(0); // LHS
+ Tmp2 = Node->getOperand(1); // RHS
+ Tmp3 = Node->getOperand(2); // True
+ Tmp4 = Node->getOperand(3); // False
+ SDValue CC = Node->getOperand(4);
+
+ LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp1.getValueType()),
+ Tmp1, Tmp2, CC, dl);
+
+ assert(!Tmp2.getNode() && "Can't legalize SELECT_CC with legal condition!");
+ Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
+ CC = DAG.getCondCode(ISD::SETNE);
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
+ Tmp3, Tmp4, CC);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::BR_CC: {
+ Tmp1 = Node->getOperand(0); // Chain
+ Tmp2 = Node->getOperand(2); // LHS
+ Tmp3 = Node->getOperand(3); // RHS
+ Tmp4 = Node->getOperand(1); // CC
+
+ LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp2.getValueType()),
+ Tmp2, Tmp3, Tmp4, dl);
+ LastCALLSEQ_END = DAG.getEntryNode();
+
+ assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!");
+ Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
+ Tmp4 = DAG.getCondCode(ISD::SETNE);
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
+ Tmp3, Node->getOperand(4));
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::GLOBAL_OFFSET_TABLE:
+ case ISD::GlobalAddress:
+ case ISD::GlobalTLSAddress:
+ case ISD::ExternalSymbol:
+ case ISD::ConstantPool:
+ case ISD::JumpTable:
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ // FIXME: Custom lowering for these operations shouldn't return null!
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+ Results.push_back(SDValue(Node, i));
+ break;
+ }
+}
+
+void SelectionDAGLegalize::PromoteNode(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ MVT OVT = Node->getValueType(0);
+ if (Node->getOpcode() == ISD::UINT_TO_FP ||
+ Node->getOpcode() == ISD::SINT_TO_FP) {
+ OVT = Node->getOperand(0).getValueType();
+ }
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
+ DebugLoc dl = Node->getDebugLoc();
+ SDValue Tmp1, Tmp2, Tmp3;
+ switch (Node->getOpcode()) {
+ case ISD::CTTZ:
+ case ISD::CTLZ:
+ case ISD::CTPOP:
+ // Zero extend the argument.
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+    // Perform the larger operation in the promoted type.
+    Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
+ if (Node->getOpcode() == ISD::CTTZ) {
+      // If Tmp1 == sizeinbits(NVT) then Tmp1 = sizeinbits(Old VT).
+ Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(Tmp1.getValueType()),
+ Tmp1, DAG.getConstant(NVT.getSizeInBits(), NVT),
+ ISD::SETEQ);
+ Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp2,
+ DAG.getConstant(OVT.getSizeInBits(), NVT), Tmp1);
+ } else if (Node->getOpcode() == ISD::CTLZ) {
+ // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
+ Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
+ DAG.getConstant(NVT.getSizeInBits() -
+ OVT.getSizeInBits(), NVT));
+ }
+    // Truncate the result back to the original type.
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+ break;
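+  // Worked example of the fix-ups above (editorial, i8 promoted to i32):
+  //   cttz(i8 0) -> cttz(i32 0) = 32, remapped to 8 by the SELECT;
+  //   ctlz(i8 1) -> ctlz(i32 1) = 31, minus (32 - 8) gives the correct 7.
+  // CTPOP needs no adjustment since zero extension adds no set bits.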
+ case ISD::BSWAP: {
+ unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
+    Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp1 = DAG.getNode(ISD::BSWAP, dl, NVT, Tmp1);
+ Tmp1 = DAG.getNode(ISD::SRL, dl, NVT, Tmp1,
+ DAG.getConstant(DiffBits, TLI.getShiftAmountTy()));
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+ break;
+ }
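+  // Worked example (editorial, i16 -> i32): bswap32(zext(0xAABB)) =
+  // bswap32(0x0000AABB) = 0xBBAA0000; shifting right by DiffBits = 16
+  // yields 0x0000BBAA, i.e. bswap16(0xAABB) in the low half.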
+ case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ Tmp1 = PromoteLegalFP_TO_INT(Node->getOperand(0), Node->getValueType(0),
+ Node->getOpcode() == ISD::FP_TO_SINT, dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ Tmp1 = PromoteLegalINT_TO_FP(Node->getOperand(0), Node->getValueType(0),
+ Node->getOpcode() == ISD::SINT_TO_FP, dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ assert(OVT.isVector() && "Don't know how to promote scalar logic ops");
+ // Bit convert each of the values to the new type.
+ Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, Node->getOperand(1));
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
+ // Bit convert the result back the original type.
+ Results.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, OVT, Tmp1));
+ break;
+ case ISD::SELECT:
+ unsigned ExtOp, TruncOp;
+ if (Node->getValueType(0).isVector()) {
+ ExtOp = ISD::BIT_CONVERT;
+ TruncOp = ISD::BIT_CONVERT;
+ } else if (Node->getValueType(0).isInteger()) {
+ ExtOp = ISD::ANY_EXTEND;
+ TruncOp = ISD::TRUNCATE;
+ } else {
+ ExtOp = ISD::FP_EXTEND;
+ TruncOp = ISD::FP_ROUND;
+ }
+ Tmp1 = Node->getOperand(0);
+ // Promote each of the values to the new type.
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
+ // Perform the larger operation, then round down.
+ Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp1, Tmp2, Tmp3);
+ if (TruncOp != ISD::FP_ROUND)
+ Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
+ else
+ Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1,
+ DAG.getIntPtrConstant(0));
+ Results.push_back(Tmp1);
+ break;
+ case ISD::VECTOR_SHUFFLE: {
+ SmallVector<int, 8> Mask;
+ cast<ShuffleVectorSDNode>(Node)->getMask(Mask);
+
+ // Cast the two input vectors.
+ Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, Node->getOperand(1));
+
+ // Convert the shuffle mask to the right # elements.
+ Tmp1 = ShuffleWithNarrowerEltType(NVT, OVT, dl, Tmp1, Tmp2, Mask);
+ Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, OVT, Tmp1);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SETCC: {
+    // First step: figure out the appropriate operation to use.  SETCC need
+    // not be supported for all legal data types; mostly this targets FP.
+    MVT NewInTy = Node->getOperand(0).getValueType();
+    MVT OldVT = NewInTy;
+    OldVT = OldVT;  // Silence a "set but unused" warning in NDEBUG builds.
+
+ // Scan for the appropriate larger type to use.
+ while (1) {
+ NewInTy = (MVT::SimpleValueType)(NewInTy.getSimpleVT()+1);
+
+ assert(NewInTy.isInteger() == OldVT.isInteger() &&
+ "Fell off of the edge of the integer world");
+ assert(NewInTy.isFloatingPoint() == OldVT.isFloatingPoint() &&
+ "Fell off of the edge of the floating point world");
+
+ // If the target supports SETCC of this type, use it.
+ if (TLI.isOperationLegalOrCustom(ISD::SETCC, NewInTy))
+ break;
+ }
+ if (NewInTy.isInteger())
+ assert(0 && "Cannot promote Legal Integer SETCC yet");
+ else {
+      Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NewInTy, Node->getOperand(0));
+      Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NewInTy, Node->getOperand(1));
+ }
+ Results.push_back(DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
+ Tmp1, Tmp2, Node->getOperand(2)));
+ break;
+ }
+ }
+}
+
+// SelectionDAG::Legalize - This is the entry point for the file.
+//
+void SelectionDAG::Legalize(bool TypesNeedLegalizing,
+                            CodeGenOpt::Level OptLevel) {
+  SelectionDAGLegalize(*this, OptLevel).LegalizeDAG();
+}
+
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
new file mode 100644
index 0000000..c3c1bea
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -0,0 +1,1388 @@
+//===-------- LegalizeFloatTypes.cpp - Legalization of float types --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements float type expansion and softening for LegalizeTypes.
+// Softening is the act of turning a computation in an illegal floating point
+// type into a computation in an integer type of the same size; also known as
+// "soft float". For example, turning f32 arithmetic into operations using i32.
+// The resulting integer value is the same as what you would get by performing
+// the floating point operation and bitcasting the result to the integer type.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type. For example,
+// implementing ppcf128 arithmetic in two f64 registers.
+//
+//===----------------------------------------------------------------------===//
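+
+// For intuition, softening relies on the invariant below (editorial sketch,
+// f32/i32 shown; AddAsBits is a hypothetical stand-in for RTLIB::ADD_F32):
+//
+//   #include <cstdint>
+//   #include <cstring>
+//   uint32_t AddAsBits(uint32_t ABits, uint32_t BBits) {
+//     float A, B, R;
+//     std::memcpy(&A, &ABits, 4);   // bitcast i32 -> f32
+//     std::memcpy(&B, &BBits, 4);
+//     R = A + B;                    // what the hardware FADD would compute
+//     uint32_t RBits;
+//     std::memcpy(&RBits, &R, 4);   // bitcast f32 -> i32
+//     return RBits;
+//   }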
+
+#include "LegalizeTypes.h"
+using namespace llvm;
+
+/// GetFPLibCall - Return the right libcall for the given floating point type.
+static RTLIB::Libcall GetFPLibCall(MVT VT,
+ RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_PPCF128) {
+ return
+ VT == MVT::f32 ? Call_F32 :
+ VT == MVT::f64 ? Call_F64 :
+ VT == MVT::f80 ? Call_F80 :
+ VT == MVT::ppcf128 ? Call_PPCF128 :
+ RTLIB::UNKNOWN_LIBCALL;
+}
+
+//===----------------------------------------------------------------------===//
+// Result Float to Integer Conversion.
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
+ DEBUG(cerr << "Soften float result " << ResNo << ": "; N->dump(&DAG);
+ cerr << "\n");
+ SDValue R = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "SoftenFloatResult #" << ResNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to soften the result of this operator!");
+ abort();
+
+ case ISD::BIT_CONVERT: R = SoftenFloatRes_BIT_CONVERT(N); break;
+ case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break;
+ case ISD::ConstantFP:
+ R = SoftenFloatRes_ConstantFP(cast<ConstantFPSDNode>(N));
+ break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::FABS: R = SoftenFloatRes_FABS(N); break;
+ case ISD::FADD: R = SoftenFloatRes_FADD(N); break;
+ case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break;
+ case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break;
+ case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break;
+ case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break;
+ case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break;
+ case ISD::FEXP2: R = SoftenFloatRes_FEXP2(N); break;
+ case ISD::FFLOOR: R = SoftenFloatRes_FFLOOR(N); break;
+ case ISD::FLOG: R = SoftenFloatRes_FLOG(N); break;
+ case ISD::FLOG2: R = SoftenFloatRes_FLOG2(N); break;
+ case ISD::FLOG10: R = SoftenFloatRes_FLOG10(N); break;
+ case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break;
+ case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break;
+ case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break;
+ case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break;
+ case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break;
+ case ISD::FPOW: R = SoftenFloatRes_FPOW(N); break;
+ case ISD::FPOWI: R = SoftenFloatRes_FPOWI(N); break;
+ case ISD::FREM: R = SoftenFloatRes_FREM(N); break;
+ case ISD::FRINT: R = SoftenFloatRes_FRINT(N); break;
+ case ISD::FSIN: R = SoftenFloatRes_FSIN(N); break;
+ case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break;
+ case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break;
+ case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break;
+ case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break;
+ case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break;
+ case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break;
+ case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break;
+ }
+
+ // If R is null, the sub-method took care of registering the result.
+ if (R.getNode())
+ SetSoftenedFloat(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BIT_CONVERT(SDNode *N) {
+ return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) {
+ // Convert the inputs to integers, and build a new pair out of them.
+ return DAG.getNode(ISD::BUILD_PAIR, N->getDebugLoc(),
+ TLI.getTypeToTransformTo(N->getValueType(0)),
+ BitConvertToInteger(N->getOperand(0)),
+ BitConvertToInteger(N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) {
+ return DAG.getConstant(N->getValueAPF().bitcastToAPInt(),
+ TLI.getTypeToTransformTo(N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+ NewOp.getValueType().getVectorElementType(),
+ NewOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ unsigned Size = NVT.getSizeInBits();
+
+ // Mask = ~(1 << (Size-1))
+ SDValue Mask = DAG.getConstant(APInt::getAllOnesValue(Size).clear(Size-1),
+ NVT);
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), NVT, Op, Mask);
+}
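+
+// The mask trick above on raw bits (editorial sketch for the f32/i32 case):
+//
+//   #include <cstdint>
+//   uint32_t FabsBits(uint32_t Bits) {
+//     return Bits & ~(UINT32_C(1) << 31);  // clear the IEEE sign bit
+//   }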
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::ADD_F32,
+ RTLIB::ADD_F64,
+ RTLIB::ADD_F80,
+ RTLIB::ADD_PPCF128),
+ NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::CEIL_F32,
+ RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80,
+ RTLIB::CEIL_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
+ SDValue LHS = GetSoftenedFloat(N->getOperand(0));
+ SDValue RHS = BitConvertToInteger(N->getOperand(1));
+ DebugLoc dl = N->getDebugLoc();
+
+ MVT LVT = LHS.getValueType();
+ MVT RVT = RHS.getValueType();
+
+ unsigned LSize = LVT.getSizeInBits();
+ unsigned RSize = RVT.getSizeInBits();
+
+ // First get the sign bit of second operand.
+ SDValue SignBit = DAG.getNode(ISD::SHL, dl, RVT, DAG.getConstant(1, RVT),
+ DAG.getConstant(RSize - 1,
+ TLI.getShiftAmountTy()));
+ SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+ // Shift right or sign-extend it if the two operands have different types.
+ int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+ if (SizeDiff > 0) {
+ SignBit = DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+ DAG.getConstant(SizeDiff, TLI.getShiftAmountTy()));
+ SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+ } else if (SizeDiff < 0) {
+ SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+ SignBit = DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+ DAG.getConstant(-SizeDiff, TLI.getShiftAmountTy()));
+ }
+
+ // Clear the sign bit of the first operand.
+ SDValue Mask = DAG.getNode(ISD::SHL, dl, LVT, DAG.getConstant(1, LVT),
+ DAG.getConstant(LSize - 1,
+ TLI.getShiftAmountTy()));
+ Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, LVT));
+ LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
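+
+// The same bit manipulation on concrete types (editorial sketch, both
+// operands f32 so no size adjustment is needed):
+//
+//   #include <cstdint>
+//   uint32_t CopySignBits(uint32_t Mag, uint32_t Sgn) {
+//     uint32_t SignBit = Sgn & (UINT32_C(1) << 31);        // sign of op 2
+//     uint32_t Abs     = Mag & ((UINT32_C(1) << 31) - 1);  // clear sign bit
+//     return Abs | SignBit;
+//   }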
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::COS_F32,
+ RTLIB::COS_F64,
+ RTLIB::COS_F80,
+ RTLIB::COS_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::DIV_F32,
+ RTLIB::DIV_F64,
+ RTLIB::DIV_F80,
+ RTLIB::DIV_PPCF128),
+ NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP_F32,
+ RTLIB::EXP_F64,
+ RTLIB::EXP_F80,
+ RTLIB::EXP_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP2_F32,
+ RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80,
+ RTLIB::EXP2_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::FLOOR_F32,
+ RTLIB::FLOOR_F64,
+ RTLIB::FLOOR_F80,
+ RTLIB::FLOOR_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG_F32,
+ RTLIB::LOG_F64,
+ RTLIB::LOG_F80,
+ RTLIB::LOG_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG2_F32,
+ RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80,
+ RTLIB::LOG2_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG10_F32,
+ RTLIB::LOG10_F64,
+ RTLIB::LOG10_F80,
+ RTLIB::LOG10_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::MUL_F32,
+ RTLIB::MUL_F64,
+ RTLIB::MUL_F80,
+ RTLIB::MUL_PPCF128),
+ NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ // Expand Y = FNEG(X) -> Y = SUB -0.0, X
+ SDValue Ops[2] = { DAG.getConstantFP(-0.0, N->getValueType(0)),
+ GetSoftenedFloat(N->getOperand(0)) };
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::SUB_F32,
+ RTLIB::SUB_F64,
+ RTLIB::SUB_F80,
+ RTLIB::SUB_PPCF128),
+ NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = N->getOperand(0);
+ RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+ return MakeLibCall(LC, NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = N->getOperand(0);
+ RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
+ return MakeLibCall(LC, NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::POW_F32,
+ RTLIB::POW_F64,
+ RTLIB::POW_F80,
+ RTLIB::POW_PPCF128),
+ NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
+ assert(N->getOperand(1).getValueType() == MVT::i32 &&
+ "Unsupported power type!");
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) };
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::POWI_F32,
+ RTLIB::POWI_F64,
+ RTLIB::POWI_F80,
+ RTLIB::POWI_PPCF128),
+ NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::REM_F32,
+ RTLIB::REM_F64,
+ RTLIB::REM_F80,
+ RTLIB::REM_PPCF128),
+ NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::RINT_F32,
+ RTLIB::RINT_F64,
+ RTLIB::RINT_F80,
+ RTLIB::RINT_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::SIN_F32,
+ RTLIB::SIN_F64,
+ RTLIB::SIN_F80,
+ RTLIB::SIN_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::SQRT_F32,
+ RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80,
+ RTLIB::SQRT_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::SUB_F32,
+ RTLIB::SUB_F64,
+ RTLIB::SUB_F80,
+ RTLIB::SUB_PPCF128),
+ NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::TRUNC_F32,
+ RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80,
+ RTLIB::TRUNC_PPCF128),
+ NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ MVT VT = N->getValueType(0);
+ MVT NVT = TLI.getTypeToTransformTo(VT);
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue NewL;
+ if (L->getExtensionType() == ISD::NON_EXTLOAD) {
+ NewL = DAG.getLoad(L->getAddressingMode(), dl, L->getExtensionType(),
+ NVT, L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getSrcValue(), L->getSrcValueOffset(), NVT,
+ L->isVolatile(), L->getAlignment());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ return NewL;
+ }
+
+ // Do a non-extending load followed by FP_EXTEND.
+ NewL = DAG.getLoad(L->getAddressingMode(), dl, ISD::NON_EXTLOAD,
+ L->getMemoryVT(), L->getChain(),
+ L->getBasePtr(), L->getOffset(),
+ L->getSrcValue(), L->getSrcValueOffset(),
+ L->getMemoryVT(),
+ L->isVolatile(), L->getAlignment());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ return BitConvertToInteger(DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
+ SDValue LHS = GetSoftenedFloat(N->getOperand(1));
+ SDValue RHS = GetSoftenedFloat(N->getOperand(2));
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) {
+ SDValue LHS = GetSoftenedFloat(N->getOperand(2));
+ SDValue RHS = GetSoftenedFloat(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+ LHS.getValueType(), N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(TLI.getTypeToTransformTo(N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) {
+ SDValue Chain = N->getOperand(0); // Get the chain.
+ SDValue Ptr = N->getOperand(1); // Get the pointer.
+ MVT VT = N->getValueType(0);
+ MVT NVT = TLI.getTypeToTransformTo(VT);
+ DebugLoc dl = N->getDebugLoc();
+
+  SDValue NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2));
+
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1));
+ return NewVAARG;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
+ bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
+ MVT SVT = N->getOperand(0).getValueType();
+ MVT RVT = N->getValueType(0);
+ MVT NVT = MVT();
+ DebugLoc dl = N->getDebugLoc();
+
+ // If the input is not legal, eg: i1 -> fp, then it needs to be promoted to
+ // a larger type, eg: i8 -> fp. Even if it is legal, no libcall may exactly
+ // match. Look for an appropriate libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE;
+ t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; ++t) {
+ NVT = (MVT::SimpleValueType)t;
+    // The source type needs to be big enough to hold the operand.
+    if (NVT.bitsGE(SVT))
+      LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT)
+                  : RTLIB::getUINTTOFP(NVT, RVT);
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+ // Sign/zero extend the argument if the libcall takes a larger type.
+ SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ NVT, N->getOperand(0));
+ return MakeLibCall(LC, TLI.getTypeToTransformTo(RVT), &Op, 1, false, dl);
+}
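+
+// Example (editorial): for an i8 -> f32 conversion there is no i8 libcall,
+// so the loop above settles on the first integer type with a matching call
+// (i32 -> f32, __floatsisf in compiler-rt) and the operand is sign/zero
+// extended to i32 first.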
+
+
+//===----------------------------------------------------------------------===//
+// Operand Float to Integer Conversion.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(cerr << "Soften float operand " << OpNo << ": "; N->dump(&DAG);
+ cerr << "\n");
+ SDValue Res = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "SoftenFloatOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to soften this operator's operand!");
+ abort();
+
+ case ISD::BIT_CONVERT: Res = SoftenFloatOp_BIT_CONVERT(N); break;
+ case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break;
+ case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break;
+ case ISD::FP_TO_SINT: Res = SoftenFloatOp_FP_TO_SINT(N); break;
+ case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_UINT(N); break;
+ case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break;
+ case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break;
+ case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// SoftenSetCCOperands - Soften the operands of a comparison. This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::SoftenSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode, DebugLoc dl) {
+ SDValue LHSInt = GetSoftenedFloat(NewLHS);
+ SDValue RHSInt = GetSoftenedFloat(NewRHS);
+ MVT VT = NewLHS.getValueType();
+
+ assert((VT == MVT::f32 || VT == MVT::f64) && "Unsupported setcc type!");
+
+ // Expand into one or more soft-fp libcall(s).
+ RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL;
+ switch (CCCode) {
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : RTLIB::OEQ_F64;
+ break;
+ case ISD::SETNE:
+ case ISD::SETUNE:
+ LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 : RTLIB::UNE_F64;
+ break;
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : RTLIB::OGE_F64;
+ break;
+ case ISD::SETLT:
+ case ISD::SETOLT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64;
+ break;
+ case ISD::SETLE:
+ case ISD::SETOLE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : RTLIB::OLE_F64;
+ break;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : RTLIB::OGT_F64;
+ break;
+ case ISD::SETUO:
+ LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : RTLIB::UO_F64;
+ break;
+ case ISD::SETO:
+ LC1 = (VT == MVT::f32) ? RTLIB::O_F32 : RTLIB::O_F64;
+ break;
+ default:
+ LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : RTLIB::UO_F64;
+ switch (CCCode) {
+ case ISD::SETONE:
+ // SETONE = SETOLT | SETOGT
+ LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64;
+ // Fallthrough
+ case ISD::SETUGT:
+ LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : RTLIB::OGT_F64;
+ break;
+ case ISD::SETUGE:
+ LC2 = (VT == MVT::f32) ? RTLIB::OGE_F32 : RTLIB::OGE_F64;
+ break;
+ case ISD::SETULT:
+ LC2 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64;
+ break;
+ case ISD::SETULE:
+ LC2 = (VT == MVT::f32) ? RTLIB::OLE_F32 : RTLIB::OLE_F64;
+ break;
+ case ISD::SETUEQ:
+ LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : RTLIB::OEQ_F64;
+ break;
+ default: assert(false && "Do not know how to soften this setcc!");
+ }
+ }
+
+ MVT RetVT = MVT::i32; // FIXME: is this the correct return type?
+ SDValue Ops[2] = { LHSInt, RHSInt };
+ NewLHS = MakeLibCall(LC1, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+ NewRHS = DAG.getConstant(0, RetVT);
+ CCCode = TLI.getCmpLibcallCC(LC1);
+ if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
+ SDValue Tmp = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(RetVT),
+ NewLHS, NewRHS, DAG.getCondCode(CCCode));
+ NewLHS = MakeLibCall(LC2, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+ NewLHS = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(RetVT), NewLHS,
+ NewRHS, DAG.getCondCode(TLI.getCmpLibcallCC(LC2)));
+ NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS);
+ NewRHS = SDValue();
+ }
+}
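+
+// Example of the two-call expansion above (editorial): SETUEQ ("unordered
+// or equal") has no single comparison libcall, so LC1 = UO_F32 and
+// LC2 = OEQ_F32 are combined as
+//
+//   (__unordsf2(a, b) != 0) | (__eqsf2(a, b) == 0)
+//
+// which is exactly the OR of the two SETCC nodes built above.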
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BIT_CONVERT(SDNode *N) {
+ return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), N->getValueType(0),
+ GetSoftenedFloat(N->getOperand(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
+ MVT SVT = N->getOperand(0).getValueType();
+ MVT RVT = N->getValueType(0);
+
+ RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, RVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
+
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+ // If SoftenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (NewRHS.getNode() == 0) {
+ NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) {
+ MVT RVT = N->getValueType(0);
+ RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
+ MVT RVT = N->getValueType(0);
+ RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+ // If SoftenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (NewRHS.getNode() == 0) {
+ NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+ // If SoftenSetCCOperands returned a scalar, use it.
+ if (NewRHS.getNode() == 0) {
+ assert(NewLHS.getValueType() == N->getValueType(0) &&
+ "Unexpected setcc expansion!");
+ return NewLHS;
+ }
+
+ // Otherwise, update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS,
+ DAG.getCondCode(CCCode));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only soften the stored value!");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Val = ST->getValue();
+ DebugLoc dl = N->getDebugLoc();
+
+ if (ST->isTruncatingStore())
+ // Do an FP_ROUND followed by a non-truncating store.
+ Val = BitConvertToInteger(DAG.getNode(ISD::FP_ROUND, dl, ST->getMemoryVT(),
+ Val, DAG.getIntPtrConstant(0)));
+ else
+ Val = GetSoftenedFloat(Val);
+
+ return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(),
+ ST->getSrcValue(), ST->getSrcValueOffset(),
+ ST->isVolatile(), ST->getAlignment());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Float Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatResult - This method is called when the specified result of the
+/// specified node is found to need expansion.  At this point, the node may
+/// also have invalid operands or other results that need promotion; we just
+/// know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
+ DEBUG(cerr << "Expand float result: "; N->dump(&DAG); cerr << "\n");
+ SDValue Lo, Hi;
+ Lo = Hi = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "ExpandFloatResult #" << ResNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to expand the result of this operator!");
+ abort();
+
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+
+ case ISD::BIT_CONVERT: ExpandRes_BIT_CONVERT(N, Lo, Hi); break;
+ case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+ case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+ case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
+
+ case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+ case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break;
+ case ISD::FADD: ExpandFloatRes_FADD(N, Lo, Hi); break;
+ case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break;
+ case ISD::FCOS: ExpandFloatRes_FCOS(N, Lo, Hi); break;
+ case ISD::FDIV: ExpandFloatRes_FDIV(N, Lo, Hi); break;
+ case ISD::FEXP: ExpandFloatRes_FEXP(N, Lo, Hi); break;
+ case ISD::FEXP2: ExpandFloatRes_FEXP2(N, Lo, Hi); break;
+ case ISD::FFLOOR: ExpandFloatRes_FFLOOR(N, Lo, Hi); break;
+ case ISD::FLOG: ExpandFloatRes_FLOG(N, Lo, Hi); break;
+ case ISD::FLOG2: ExpandFloatRes_FLOG2(N, Lo, Hi); break;
+ case ISD::FLOG10: ExpandFloatRes_FLOG10(N, Lo, Hi); break;
+ case ISD::FMUL: ExpandFloatRes_FMUL(N, Lo, Hi); break;
+ case ISD::FNEARBYINT: ExpandFloatRes_FNEARBYINT(N, Lo, Hi); break;
+ case ISD::FNEG: ExpandFloatRes_FNEG(N, Lo, Hi); break;
+ case ISD::FP_EXTEND: ExpandFloatRes_FP_EXTEND(N, Lo, Hi); break;
+ case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break;
+ case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break;
+ case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break;
+ case ISD::FSIN: ExpandFloatRes_FSIN(N, Lo, Hi); break;
+ case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break;
+ case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break;
+ case ISD::FTRUNC: ExpandFloatRes_FTRUNC(N, Lo, Hi); break;
+ case ISD::LOAD: ExpandFloatRes_LOAD(N, Lo, Hi); break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetExpandedFloat(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ assert(NVT.getSizeInBits() == integerPartWidth &&
+ "Do not know how to expand this float constant!");
+ APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
+ Lo = DAG.getConstantFP(APFloat(APInt(integerPartWidth, 1,
+ &C.getRawData()[1])), NVT);
+ Hi = DAG.getConstantFP(APFloat(APInt(integerPartWidth, 1,
+ &C.getRawData()[0])), NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(N->getValueType(0) == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Tmp;
+ GetExpandedFloat(N->getOperand(0), Lo, Tmp);
+ Hi = DAG.getNode(ISD::FABS, dl, Tmp.getValueType(), Tmp);
+ // Lo = Hi==fabs(Hi) ? Lo : -Lo;
+ Lo = DAG.getNode(ISD::SELECT_CC, dl, Lo.getValueType(), Tmp, Hi, Lo,
+ DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo),
+ DAG.getCondCode(ISD::SETEQ));
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::ADD_F32, RTLIB::ADD_F64,
+ RTLIB::ADD_F80, RTLIB::ADD_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCEIL(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80, RTLIB::CEIL_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::COS_F32, RTLIB::COS_F64,
+ RTLIB::COS_F80, RTLIB::COS_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::DIV_F32,
+ RTLIB::DIV_F64,
+ RTLIB::DIV_F80,
+ RTLIB::DIV_PPCF128),
+ N->getValueType(0), Ops, 2, false,
+ N->getDebugLoc());
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP_F32, RTLIB::EXP_F64,
+ RTLIB::EXP_F80, RTLIB::EXP_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP2(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80, RTLIB::EXP2_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FFLOOR(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+                                         RTLIB::FLOOR_F80, RTLIB::FLOOR_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG_F32, RTLIB::LOG_F64,
+ RTLIB::LOG_F80, RTLIB::LOG_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG2(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80, RTLIB::LOG2_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+                                         RTLIB::LOG10_F80, RTLIB::LOG10_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::MUL_F32,
+ RTLIB::MUL_F64,
+ RTLIB::MUL_F80,
+ RTLIB::MUL_PPCF128),
+ N->getValueType(0), Ops, 2, false,
+ N->getDebugLoc());
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEARBYINT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ GetExpandedFloat(N->getOperand(0), Lo, Hi);
+ Lo = DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ Hi = DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), NVT, N->getOperand(0));
+ Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOW(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::POW_F32, RTLIB::POW_F64,
+ RTLIB::POW_F80, RTLIB::POW_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::POWI_F32, RTLIB::POWI_F64,
+ RTLIB::POWI_F80, RTLIB::POWI_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::RINT_F32, RTLIB::RINT_F64,
+ RTLIB::RINT_F80, RTLIB::RINT_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::SIN_F32, RTLIB::SIN_F64,
+ RTLIB::SIN_F80, RTLIB::SIN_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80, RTLIB::SQRT_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0),
+ RTLIB::SUB_F32,
+ RTLIB::SUB_F64,
+ RTLIB::SUB_F80,
+ RTLIB::SUB_PPCF128),
+ N->getValueType(0), Ops, 2, false,
+ N->getDebugLoc());
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80, RTLIB::TRUNC_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ if (ISD::isNormalLoad(N)) {
+ ExpandRes_NormalLoad(N, Lo, Hi);
+ return;
+ }
+
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ DebugLoc dl = N->getDebugLoc();
+
+ MVT NVT = TLI.getTypeToTransformTo(LD->getValueType(0));
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+
+ Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr,
+ LD->getSrcValue(), LD->getSrcValueOffset(),
+ LD->getMemoryVT(),
+ LD->isVolatile(), LD->getAlignment());
+
+ // Remember the chain.
+ Chain = Hi.getValue(1);
+
+ // The low part is zero.
+ Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT);
+
+ // Modified the chain - switch anything that used the old chain to use the
+ // new one.
+ ReplaceValueWith(SDValue(LD, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(N->getValueType(0) == MVT::ppcf128 && "Unsupported XINT_TO_FP!");
+ MVT VT = N->getValueType(0);
+ MVT NVT = TLI.getTypeToTransformTo(VT);
+ SDValue Src = N->getOperand(0);
+ MVT SrcVT = Src.getValueType();
+ bool isSigned = N->getOpcode() == ISD::SINT_TO_FP;
+ DebugLoc dl = N->getDebugLoc();
+
+ // First do an SINT_TO_FP, whether the original was signed or unsigned.
+ // When promoting partial word types to i32 we must honor the signedness,
+ // though.
+ if (SrcVT.bitsLE(MVT::i32)) {
+ // The integer can be represented exactly in an f64.
+ Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ MVT::i32, Src);
+ Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT);
+ Hi = DAG.getNode(ISD::SINT_TO_FP, dl, NVT, Src);
+ } else {
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (SrcVT.bitsLE(MVT::i64)) {
+ Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ MVT::i64, Src);
+ LC = RTLIB::SINTTOFP_I64_PPCF128;
+ } else if (SrcVT.bitsLE(MVT::i128)) {
+ Src = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i128, Src);
+ LC = RTLIB::SINTTOFP_I128_PPCF128;
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+ Hi = MakeLibCall(LC, VT, &Src, 1, true, dl);
+ GetPairElements(Hi, Lo, Hi);
+ }
+
+ if (isSigned)
+ return;
+
+ // Unsigned - fix up the SINT_TO_FP value just calculated.
+ Hi = DAG.getNode(ISD::BUILD_PAIR, dl, VT, Lo, Hi);
+ SrcVT = Src.getValueType();
+
+ // x>=0 ? (ppcf128)(iN)x : (ppcf128)(iN)x + 2^N; N=32,64,128.
+ static const uint64_t TwoE32[] = { 0x41f0000000000000LL, 0 };
+ static const uint64_t TwoE64[] = { 0x43f0000000000000LL, 0 };
+ static const uint64_t TwoE128[] = { 0x47f0000000000000LL, 0 };
+ const uint64_t *Parts = 0;
+
+ switch (SrcVT.getSimpleVT()) {
+ default:
+ assert(false && "Unsupported UINT_TO_FP!");
+ case MVT::i32:
+ Parts = TwoE32;
+ break;
+ case MVT::i64:
+ Parts = TwoE64;
+ break;
+ case MVT::i128:
+ Parts = TwoE128;
+ break;
+ }
+
+ Lo = DAG.getNode(ISD::FADD, dl, VT, Hi,
+ DAG.getConstantFP(APFloat(APInt(128, 2, Parts)),
+ MVT::ppcf128));
+ Lo = DAG.getNode(ISD::SELECT_CC, dl, VT, Src, DAG.getConstant(0, SrcVT),
+ Lo, Hi, DAG.getCondCode(ISD::SETLT));
+ GetPairElements(Lo, Lo, Hi);
+}
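+
+// Rationale for the 2^N fix-up above (editorial): the value was converted
+// as if signed, so a source with its top bit set came out 2^N too small.
+// A scalar analogue for N = 32:
+//
+//   #include <cstdint>
+//   double UIntToFP(uint32_t X) {
+//     double D = (double)(int32_t)X;                  // signed conversion
+//     return (int32_t)X >= 0 ? D : D + 4294967296.0;  // add 2^32 back
+//   }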
+
+
+//===----------------------------------------------------------------------===//
+// Float Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatOperand - This method is called when the specified operand of the
+/// specified node is found to need expansion. At this point, all of the result
+/// types of the node are known to be legal, but other operands of the node may
+/// need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(cerr << "Expand float operand: "; N->dump(&DAG); cerr << "\n");
+ SDValue Res = SDValue();
+
+ if (TLI.getOperationAction(N->getOpcode(), N->getOperand(OpNo).getValueType())
+ == TargetLowering::Custom)
+ Res = TLI.LowerOperation(SDValue(N, 0), DAG);
+
+ if (Res.getNode() == 0) {
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ cerr << "ExpandFloatOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+ #endif
+ assert(0 && "Do not know how to expand this operator's operand!");
+ abort();
+
+ case ISD::BIT_CONVERT: Res = ExpandOp_BIT_CONVERT(N); break;
+ case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break;
+ case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+
+ case ISD::BR_CC: Res = ExpandFloatOp_BR_CC(N); break;
+ case ISD::FP_ROUND: Res = ExpandFloatOp_FP_ROUND(N); break;
+ case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
+ case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
+ case ISD::SELECT_CC: Res = ExpandFloatOp_SELECT_CC(N); break;
+ case ISD::SETCC: Res = ExpandFloatOp_SETCC(N); break;
+ case ISD::STORE: Res = ExpandFloatOp_STORE(cast<StoreSDNode>(N),
+ OpNo); break;
+ }
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// FloatExpandSetCCOperands - Expand the operands of a comparison. This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
+ SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ DebugLoc dl) {
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedFloat(NewLHS, LHSLo, LHSHi);
+ GetExpandedFloat(NewRHS, RHSLo, RHSHi);
+
+ MVT VT = NewLHS.getValueType();
+ assert(VT == MVT::ppcf128 && "Unsupported setcc type!");
+
+ // FIXME: This generated code sucks. We want to generate
+ // FCMPU crN, hi1, hi2
+ // BNE crN, L:
+ // FCMPU crN, lo1, lo2
+ // The following can be improved, but not that much.
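+  // In boolean terms, the sequence below computes
+  //   (hi1 == hi2 && lo1 CC lo2) || (hi1 != hi2 && hi1 CC hi2),
+  // i.e. compare the low halves when the high halves are equal, and the
+  // high halves themselves otherwise.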
+ SDValue Tmp1, Tmp2, Tmp3;
+ Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, ISD::SETOEQ);
+ Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSLo.getValueType()),
+ LHSLo, RHSLo, CCCode);
+ Tmp3 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+ Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, ISD::SETUNE);
+ Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, CCCode);
+ Tmp1 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+ NewLHS = DAG.getNode(ISD::OR, dl, Tmp1.getValueType(), Tmp1, Tmp3);
+ NewRHS = SDValue(); // LHS is the result, not a compare.
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If FloatExpandSetCCOperands returned a scalar, we need to compare the
+  // result against zero to select between true and false values.
+ if (NewRHS.getNode() == 0) {
+ NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) {
+ assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDValue Lo, Hi;
+ GetExpandedFloat(N->getOperand(0), Lo, Hi);
+ // Round it the rest of the way (e.g. to f32) if needed.
+ return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(),
+ N->getValueType(0), Hi, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
+ MVT RVT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+ // PPC (the libcall is not available). FIXME: Do this in a less hacky way.
+ if (RVT == MVT::i32) {
+ assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDValue Res = DAG.getNode(ISD::FP_ROUND_INREG, dl, MVT::ppcf128,
+ N->getOperand(0), DAG.getValueType(MVT::f64));
+ Res = DAG.getNode(ISD::FP_ROUND, dl, MVT::f64, Res,
+ DAG.getIntPtrConstant(1));
+ return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
+ }
+
+ RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
+ return MakeLibCall(LC, RVT, &N->getOperand(0), 1, false, dl);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
+ MVT RVT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+ // PPC (the libcall is not available). FIXME: Do this in a less hacky way.
+ if (RVT == MVT::i32) {
+ assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
+ APFloat APF = APFloat(APInt(128, 2, TwoE31));
+ SDValue Tmp = DAG.getConstantFP(APF, MVT::ppcf128);
+ // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
+ // FIXME: generated code sucks.
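+    // For example, X = 0xFFFFFFFF: X-2^31 converts exactly to 0x7FFFFFFF,
+    // and adding 0x80000000 back recovers 0xFFFFFFFF.  (0x41e0000000000000
+    // is the IEEE-754 double 2^31, biased exponent 1023+31.)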
+ return DAG.getNode(ISD::SELECT_CC, dl, MVT::i32, N->getOperand(0), Tmp,
+ DAG.getNode(ISD::ADD, dl, MVT::i32,
+ DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
+ DAG.getNode(ISD::FSUB, dl,
+ MVT::ppcf128,
+ N->getOperand(0),
+ Tmp)),
+ DAG.getConstant(0x80000000, MVT::i32)),
+ DAG.getNode(ISD::FP_TO_SINT, dl,
+ MVT::i32, N->getOperand(0)),
+ DAG.getCondCode(ISD::SETGE));
+ }
+
+ RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
+ return MakeLibCall(LC, N->getValueType(0), &N->getOperand(0), 1, false, dl);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If FloatExpandSetCCOperands returned a scalar, we need to compare the
+  // result against zero to select between true and false values.
+ if (NewRHS.getNode() == 0) {
+ NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode));
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If FloatExpandSetCCOperands returned a scalar, use it.
+ if (NewRHS.getNode() == 0) {
+ assert(NewLHS.getValueType() == N->getValueType(0) &&
+ "Unexpected setcc expansion!");
+ return NewLHS;
+ }
+
+ // Otherwise, update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS,
+ DAG.getCondCode(CCCode));
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
+ if (ISD::isNormalStore(N))
+ return ExpandOp_NormalStore(N, OpNo);
+
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+
+ MVT NVT = TLI.getTypeToTransformTo(ST->getValue().getValueType());
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ assert(ST->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+
+ SDValue Lo, Hi;
+ GetExpandedOp(ST->getValue(), Lo, Hi);
+
+ return DAG.getTruncStore(Chain, N->getDebugLoc(), Hi, Ptr,
+ ST->getSrcValue(), ST->getSrcValueOffset(),
+ ST->getMemoryVT(),
+ ST->isVolatile(), ST->getAlignment());
+}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
new file mode 100644
index 0000000..eb9342c
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -0,0 +1,2382 @@
+//===----- LegalizeIntegerTypes.cpp - Legalization of integer types -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements integer type expansion and promotion for LegalizeTypes.
+// Promotion is the act of changing a computation in an illegal type into a
+// computation in a larger type. For example, implementing i8 arithmetic in an
+// i32 register (often needed on powerpc).
+// Expansion is the act of changing a computation in an illegal type into a
+// computation in two identical registers of a smaller type. For example,
+// implementing i64 arithmetic in two i32 registers (often needed on 32-bit
+// targets).
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Integer Result Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerResult - This method is called when a result of a node is
+/// found to be in need of promotion to a larger type. At this point, the node
+/// may also have invalid operands or may have other results that need
+/// expansion; we just know that (at least) one result needs promotion.
+void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
+ DEBUG(cerr << "Promote integer result: "; N->dump(&DAG); cerr << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "PromoteIntegerResult #" << ResNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to promote this operator!");
+ abort();
+ case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break;
+ case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break;
+ case ISD::BIT_CONVERT: Res = PromoteIntRes_BIT_CONVERT(N); break;
+ case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break;
+ case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break;
+ case ISD::Constant: Res = PromoteIntRes_Constant(N); break;
+ case ISD::CONVERT_RNDSAT:
+ Res = PromoteIntRes_CONVERT_RNDSAT(N); break;
+ case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break;
+ case ISD::CTPOP: Res = PromoteIntRes_CTPOP(N); break;
+ case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
+ case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break;
+ case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break;
+ case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break;
+ case ISD::SHL: Res = PromoteIntRes_SHL(N); break;
+ case ISD::SIGN_EXTEND_INREG:
+ Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
+ case ISD::SRA: Res = PromoteIntRes_SRA(N); break;
+ case ISD::SRL: Res = PromoteIntRes_SRL(N); break;
+ case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break;
+ case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break;
+ case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break;
+
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break;
+
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = PromoteIntRes_FP_TO_XINT(N); break;
+
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break;
+
+ case ISD::SDIV:
+ case ISD::SREM: Res = PromoteIntRes_SDIV(N); break;
+
+ case ISD::UDIV:
+ case ISD::UREM: Res = PromoteIntRes_UDIV(N); break;
+
+ case ISD::SADDO:
+ case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break;
+ case ISD::UADDO:
+ case ISD::USUBO: Res = PromoteIntRes_UADDSUBO(N, ResNo); break;
+ case ISD::SMULO:
+ case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break;
+
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_SWAP:
+ Res = PromoteIntRes_Atomic1(cast<AtomicSDNode>(N)); break;
+
+ case ISD::ATOMIC_CMP_SWAP:
+ Res = PromoteIntRes_Atomic2(cast<AtomicSDNode>(N)); break;
+ }
+
+ // If the result is null then the sub-method took care of registering it.
+ if (Res.getNode())
+ SetPromotedInteger(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
+ // Sign-extend the new bits, and continue the assertion.
+ SDValue Op = SExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::AssertSext, N->getDebugLoc(),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) {
+ // Zero the new bits, and continue the assertion.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::AssertZext, N->getDebugLoc(),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
+ SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+ N->getMemoryVT(),
+ N->getChain(), N->getBasePtr(),
+ Op2, N->getSrcValue(), N->getAlignment());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic2(AtomicSDNode *N) {
+ SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ SDValue Op3 = GetPromotedInteger(N->getOperand(3));
+ SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+ N->getMemoryVT(), N->getChain(), N->getBasePtr(),
+ Op2, Op3, N->getSrcValue(), N->getAlignment());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BIT_CONVERT(SDNode *N) {
+ SDValue InOp = N->getOperand(0);
+ MVT InVT = InOp.getValueType();
+ MVT NInVT = TLI.getTypeToTransformTo(InVT);
+ MVT OutVT = N->getValueType(0);
+ MVT NOutVT = TLI.getTypeToTransformTo(OutVT);
+ DebugLoc dl = N->getDebugLoc();
+
+ switch (getTypeAction(InVT)) {
+ default:
+ assert(false && "Unknown type action!");
+ break;
+ case Legal:
+ break;
+ case PromoteInteger:
+ if (NOutVT.bitsEq(NInVT))
+ // The input promotes to the same size. Convert the promoted value.
+ return DAG.getNode(ISD::BIT_CONVERT, dl,
+ NOutVT, GetPromotedInteger(InOp));
+ break;
+ case SoftenFloat:
+ // Promote the integer operand by hand.
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
+ case ExpandInteger:
+ case ExpandFloat:
+ break;
+ case ScalarizeVector:
+ // Convert the element to an integer and promote it by hand.
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+ BitConvertToInteger(GetScalarizedVector(InOp)));
+ case SplitVector: {
+    // For example, i32 = BIT_CONVERT v2i16 on Alpha.  Convert the split
+ // pieces of the input into integers and reassemble in the final type.
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ Lo = BitConvertToInteger(Lo);
+ Hi = BitConvertToInteger(Hi);
+
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+
+ InOp = DAG.getNode(ISD::ANY_EXTEND, dl,
+ MVT::getIntegerVT(NOutVT.getSizeInBits()),
+ JoinIntegers(Lo, Hi));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, InOp);
+ }
+ case WidenVector:
+ if (OutVT.bitsEq(NInVT))
+ // The input is widened to the same size. Convert to the widened value.
+ return DAG.getNode(ISD::BIT_CONVERT, dl, OutVT, GetWidenedVector(InOp));
+ }
+
+ // Otherwise, lower the bit-convert to a store/load from the stack.
+ // Create the stack frame object. Make sure it is aligned for both
+ // the source and destination types.
+ SDValue FIPtr = DAG.CreateStackTemporary(InVT, OutVT);
+ int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
+ const Value *SV = PseudoSourceValue::getFixedStack(FI);
+
+ // Emit a store to the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, FIPtr, SV, 0);
+
+ // Result is an extending load from the stack slot.
+ return DAG.getExtLoad(ISD::EXTLOAD, dl, NOutVT, Store, FIPtr, SV, 0, OutVT);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ MVT OVT = N->getValueType(0);
+ MVT NVT = Op.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+ unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
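+  // For example, an i16 promoted to i32: BSWAP turns 0x????AABB into
+  // 0xBBAA????, and the SRL by DiffBits = 16 brings the two meaningful
+  // bytes back down, giving 0x0000BBAA.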
+ return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
+ DAG.getConstant(DiffBits, TLI.getPointerTy()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
+ // The pair element type may be legal, or may not promote to the same type as
+ // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases.
+ return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(),
+ TLI.getTypeToTransformTo(N->getValueType(0)),
+ JoinIntegers(N->getOperand(0), N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
+ MVT VT = N->getValueType(0);
+  // FIXME: there is no actual debug info here.
+ DebugLoc dl = N->getDebugLoc();
+  // Zero extend things like i1, sign extend everything else.  In theory it
+  // shouldn't matter which one we pick, but in practice this choice seems to
+  // give better code.
+ unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue Result = DAG.getNode(Opc, dl, TLI.getTypeToTransformTo(VT),
+ SDValue(N, 0));
+ assert(isa<ConstantSDNode>(Result) && "Didn't constant fold ext?");
+ return Result;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) {
+ ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+ assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU ||
+ CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU ||
+ CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) &&
+ "can only promote integers");
+ MVT OutVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ return DAG.getConvertRndSat(OutVT, N->getDebugLoc(), N->getOperand(0),
+ N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), N->getOperand(4), CvtCode);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
+ // Zero extend to the promoted type and do the count there.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ DebugLoc dl = N->getDebugLoc();
+ MVT OVT = N->getValueType(0);
+ MVT NVT = Op.getValueType();
+ Op = DAG.getNode(ISD::CTLZ, dl, NVT, Op);
+ // Subtract off the extra leading bits in the bigger type.
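+  // For example, i8 0x10 zero extended to i32: CTLZ gives 27 leading zeros,
+  // and subtracting 32-8 = 24 yields the correct i8 answer of 3.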
+ return DAG.getNode(ISD::SUB, dl, NVT, Op,
+ DAG.getConstant(NVT.getSizeInBits() -
+ OVT.getSizeInBits(), NVT));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) {
+ // Zero extend to the promoted type and do the count there.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::CTPOP, N->getDebugLoc(), Op.getValueType(), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ MVT OVT = N->getValueType(0);
+ MVT NVT = Op.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+ // The count is the same in the promoted type except if the original
+ // value was zero. This can be handled by setting the bit just off
+ // the top of the original type.
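+  // For example, an i8 zero promoted to i32: OR-ing in bit 8 makes CTTZ
+  // return 8, which is exactly CTTZ(0) in the original 8-bit type.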
+ APInt TopBit(NVT.getSizeInBits(), 0);
+ TopBit.set(OVT.getSizeInBits());
+ Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, NVT));
+ return DAG.getNode(ISD::CTTZ, dl, NVT, Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ MVT OldVT = N->getValueType(0);
+ SDValue OldVec = N->getOperand(0);
+ if (getTypeAction(OldVec.getValueType()) == WidenVector)
+ OldVec = GetWidenedVector(N->getOperand(0));
+ unsigned OldElts = OldVec.getValueType().getVectorNumElements();
+ DebugLoc dl = N->getDebugLoc();
+
+ if (OldElts == 1) {
+ assert(!isTypeLegal(OldVec.getValueType()) &&
+ "Legal one-element vector of a type needing promotion!");
+ // It is tempting to follow GetScalarizedVector by a call to
+ // GetPromotedInteger, but this would be wrong because the
+ // scalarized value may not yet have been processed.
+ return DAG.getNode(ISD::ANY_EXTEND, dl, TLI.getTypeToTransformTo(OldVT),
+ GetScalarizedVector(OldVec));
+ }
+
+ // Convert to a vector half as long with an element type of twice the width,
+ // for example <4 x i16> -> <2 x i32>.
+ assert(!(OldElts & 1) && "Odd length vectors not supported!");
+ MVT NewVT = MVT::getIntegerVT(2 * OldVT.getSizeInBits());
+ assert(OldVT.isSimple() && NewVT.isSimple());
+
+ SDValue NewVec = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::getVectorVT(NewVT, OldElts / 2),
+ OldVec);
+
+ // Extract the element at OldIdx / 2 from the new vector.
+ SDValue OldIdx = N->getOperand(1);
+ SDValue NewIdx = DAG.getNode(ISD::SRL, dl, OldIdx.getValueType(), OldIdx,
+ DAG.getConstant(1, TLI.getPointerTy()));
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, NewIdx);
+
+ // Select the appropriate half of the element: Lo if OldIdx was even,
+ // Hi if it was odd.
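+  // For example, extracting element 3 of <4 x i16>: NewIdx = 3/2 = 1 picks
+  // the second i32, and since 3 is odd the i16 sits in its high half (Lo
+  // and Hi having been swapped first on big-endian targets).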
+ SDValue Lo = Elt;
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, NewVT, Elt,
+ DAG.getConstant(OldVT.getSizeInBits(),
+ TLI.getPointerTy()));
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+
+ // Extend to the promoted type.
+ SDValue Odd = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, OldIdx);
+ SDValue Res = DAG.getNode(ISD::SELECT, dl, NewVT, Odd, Hi, Lo);
+ return DAG.getNode(ISD::ANY_EXTEND, dl, TLI.getTypeToTransformTo(OldVT), Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ unsigned NewOpc = N->getOpcode();
+ DebugLoc dl = N->getDebugLoc();
+
+ // If we're promoting a UINT to a larger size, check to see if the new node
+ // will be legal. If it isn't, check to see if FP_TO_SINT is legal, since
+ // we can use that instead. This allows us to generate better code for
+ // FP_TO_UINT for small destination sizes on targets where FP_TO_UINT is not
+ // legal, such as PowerPC.
+ if (N->getOpcode() == ISD::FP_TO_UINT &&
+ !TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NVT) &&
+ TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
+ NewOpc = ISD::FP_TO_SINT;
+
+ SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0));
+
+ // Assert that the converted value fits in the original type. If it doesn't
+ // (eg: because the value being converted is too big), then the result of the
+ // original operation was undefined anyway, so the assert is still correct.
+ return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ?
+ ISD::AssertZext : ISD::AssertSext, dl,
+ NVT, Res, DAG.getValueType(N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ DebugLoc dl = N->getDebugLoc();
+
+ if (getTypeAction(N->getOperand(0).getValueType()) == PromoteInteger) {
+ SDValue Res = GetPromotedInteger(N->getOperand(0));
+ assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!");
+
+ // If the result and operand types are the same after promotion, simplify
+ // to an in-register extension.
+ if (NVT == Res.getValueType()) {
+ // The high bits are not guaranteed to be anything. Insert an extend.
+ if (N->getOpcode() == ISD::SIGN_EXTEND)
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+ DAG.getValueType(N->getOperand(0).getValueType()));
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ return DAG.getZeroExtendInReg(Res, dl, N->getOperand(0).getValueType());
+ assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!");
+ return Res;
+ }
+ }
+
+ // Otherwise, just extend the original operand all the way to the larger type.
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ ISD::LoadExtType ExtType =
+ ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
+ N->getSrcValue(), N->getSrcValueOffset(),
+ N->getMemoryVT(), N->isVolatile(),
+ N->getAlignment());
+
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+/// PromoteIntRes_Overflow - Promote the overflow flag of an overflowing
+/// arithmetic node.
+SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
+ // Simply change the return type of the boolean result.
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(1));
+ MVT ValueVTs[] = { N->getValueType(0), NVT };
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Res = DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+ DAG.getVTList(ValueVTs, 2), Ops, 2);
+
+ // Modified the sum result - switch anything that used the old sum to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 0), Res);
+
+ return SDValue(Res.getNode(), 1);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ // The operation overflowed iff the result in the larger type is not the
+ // sign extension of its truncation to the original type.
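+  // For example, i8 SADDO promoted to i32: 100 + 100 gives 200, whose sign
+  // extended i8 truncation is -56, so overflow is detected.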
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+ MVT OVT = N->getOperand(0).getValueType();
+ MVT NVT = LHS.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+ // Do the arithmetic in the larger type.
+ unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB;
+ SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+ // Calculate the overflow flag: sign extend the arithmetic result from
+ // the original type.
+ SDValue Ofl = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+ DAG.getValueType(OVT));
+ // Overflowed if and only if this is not equal to Res.
+ Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SDIV(SDNode *N) {
+ // Sign extend the input.
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+ LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) {
+ SDValue LHS = GetPromotedInteger(N->getOperand(1));
+ SDValue RHS = GetPromotedInteger(N->getOperand(2));
+ return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+ LHS.getValueType(), N->getOperand(0),LHS,RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
+ SDValue LHS = GetPromotedInteger(N->getOperand(2));
+ SDValue RHS = GetPromotedInteger(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+ LHS.getValueType(), N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
+ MVT SVT = TLI.getSetCCResultType(N->getOperand(0).getValueType());
+ assert(isTypeLegal(SVT) && "Illegal SetCC type!");
+ DebugLoc dl = N->getDebugLoc();
+
+ // Get the SETCC result using the canonical SETCC type.
+ SDValue SetCC = DAG.getNode(ISD::SETCC, dl, SVT, N->getOperand(0),
+ N->getOperand(1), N->getOperand(2));
+
+ // Convert to the expected type.
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ assert(NVT.bitsLE(SVT) && "Integer type overpromoted?");
+ return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
+ return DAG.getNode(ISD::SHL, N->getDebugLoc(),
+ TLI.getTypeToTransformTo(N->getValueType(0)),
+ GetPromotedInteger(N->getOperand(0)), N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
+ // The input may have strange things in the top bits of the registers, but
+ // these operations don't care. They may have weird bits going out, but
+ // that too is okay if they are integer operations.
+ SDValue LHS = GetPromotedInteger(N->getOperand(0));
+ SDValue RHS = GetPromotedInteger(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+ LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
+ // The input value must be properly sign extended.
+ SDValue Res = SExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::SRA, N->getDebugLoc(),
+ Res.getValueType(), Res, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
+ // The input value must be properly zero extended.
+ MVT VT = N->getValueType(0);
+ MVT NVT = TLI.getTypeToTransformTo(VT);
+ SDValue Res = ZExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::SRL, N->getDebugLoc(), NVT, Res, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Res;
+
+ switch (getTypeAction(N->getOperand(0).getValueType())) {
+ default: assert(0 && "Unknown type action!");
+ case Legal:
+ case ExpandInteger:
+ Res = N->getOperand(0);
+ break;
+ case PromoteInteger:
+ Res = GetPromotedInteger(N->getOperand(0));
+ break;
+ }
+
+ // Truncate to NVT instead of VT
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), NVT, Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ // The operation overflowed iff the result in the larger type is not the
+ // zero extension of its truncation to the original type.
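+  // For example, i8 UADDO promoted to i32: 200 + 100 gives 300, whose zero
+  // extended i8 truncation is 44, so overflow is detected.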
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+ MVT OVT = N->getOperand(0).getValueType();
+ MVT NVT = LHS.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+ // Do the arithmetic in the larger type.
+ unsigned Opcode = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB;
+ SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+ // Calculate the overflow flag: zero extend the arithmetic result from
+ // the original type.
+ SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+ // Overflowed if and only if this is not equal to Res.
+ Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UDIV(SDNode *N) {
+ // Zero extend the input.
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+ LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(TLI.getTypeToTransformTo(N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
+ SDValue Chain = N->getOperand(0); // Get the chain.
+ SDValue Ptr = N->getOperand(1); // Get the pointer.
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ MVT RegVT = TLI.getRegisterType(VT);
+ unsigned NumRegs = TLI.getNumRegisters(VT);
+ // The argument is passed as NumRegs registers of type RegVT.
+
+ SmallVector<SDValue, 8> Parts(NumRegs);
+ for (unsigned i = 0; i < NumRegs; ++i) {
+ Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2));
+ Chain = Parts[i].getValue(1);
+ }
+
+ // Handle endianness of the load.
+ if (TLI.isBigEndian())
+ std::reverse(Parts.begin(), Parts.end());
+
+ // Assemble the parts in the promoted type.
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Res = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[0]);
+ for (unsigned i = 1; i < NumRegs; ++i) {
+ SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]);
+ // Shift it to the right position and "or" it in.
+ Part = DAG.getNode(ISD::SHL, dl, NVT, Part,
+ DAG.getConstant(i * RegVT.getSizeInBits(),
+ TLI.getPointerTy()));
+ Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part);
+ }
+
+ // Modified the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
+ assert(ResNo == 1 && "Only boolean result promotion currently supported!");
+ return PromoteIntRes_Overflow(N);
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Operand Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need promotion. At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(cerr << "Promote integer operand: "; N->dump(&DAG); cerr << "\n");
+ SDValue Res = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ cerr << "PromoteIntegerOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+ #endif
+ assert(0 && "Do not know how to promote this operator's operand!");
+ abort();
+
+ case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break;
+ case ISD::BIT_CONVERT: Res = PromoteIntOp_BIT_CONVERT(N); break;
+ case ISD::BR_CC: Res = PromoteIntOp_BR_CC(N, OpNo); break;
+ case ISD::BRCOND: Res = PromoteIntOp_BRCOND(N, OpNo); break;
+ case ISD::BUILD_PAIR: Res = PromoteIntOp_BUILD_PAIR(N); break;
+ case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break;
+ case ISD::CONVERT_RNDSAT:
+ Res = PromoteIntOp_CONVERT_RNDSAT(N); break;
+ case ISD::INSERT_VECTOR_ELT:
+ Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break;
+ case ISD::MEMBARRIER: Res = PromoteIntOp_MEMBARRIER(N); break;
+ case ISD::SCALAR_TO_VECTOR:
+ Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break;
+ case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break;
+ case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
+ case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break;
+ case ISD::SIGN_EXTEND: Res = PromoteIntOp_SIGN_EXTEND(N); break;
+ case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break;
+ case ISD::STORE: Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
+ OpNo); break;
+ case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
+ case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break;
+ case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// PromoteSetCCOperands - Promote the operands of a comparison. This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode CCCode) {
+ // We have to insert explicit sign or zero extends. Note that we could
+ // insert sign extends for ALL conditions, but zero extend is cheaper on
+ // many machines (an AND instead of two shifts), so prefer it.
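+  // For example, two i8 values compared in i32: zero extension maps [0,255]
+  // to the same numeric values, so equality and the unsigned orderings are
+  // preserved; the signed orderings need sign extension instead.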
+ switch (CCCode) {
+ default: assert(0 && "Unknown integer comparison!");
+ case ISD::SETEQ:
+ case ISD::SETNE:
+ case ISD::SETUGE:
+ case ISD::SETUGT:
+ case ISD::SETULE:
+ case ISD::SETULT:
+ // ALL of these operations will work if we either sign or zero extend
+ // the operands (including the unsigned comparisons!). Zero extend is
+ // usually a simpler/cheaper operation, so prefer it.
+ NewLHS = ZExtPromotedInteger(NewLHS);
+ NewRHS = ZExtPromotedInteger(NewRHS);
+ break;
+ case ISD::SETGE:
+ case ISD::SETGT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ NewLHS = SExtPromotedInteger(NewLHS);
+ NewRHS = SExtPromotedInteger(NewRHS);
+ break;
+ }
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BIT_CONVERT(SDNode *N) {
+ // This should only occur in unusual situations like bitcasting to an
+ // x86_fp80, so just turn it into a store+load
+ return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 2 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(2);
+ SDValue RHS = N->getOperand(3);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(1))->get());
+
+  // The chain (Op#0), CC (Op#1) and basic block destination (Op#4) always
+  // have legal types.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "only know how to promote condition");
+
+ // Promote all the way up to the canonical SetCC type.
+ MVT SVT = TLI.getSetCCResultType(MVT::Other);
+ SDValue Cond = PromoteTargetBoolean(N->getOperand(1), SVT);
+
+  // The chain (Op#0) and basic block destination (Op#2) always have legal
+  // types.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), Cond,
+ N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
+ // Since the result type is legal, the operands must promote to it.
+ MVT OVT = N->getOperand(0).getValueType();
+ SDValue Lo = ZExtPromotedInteger(N->getOperand(0));
+ SDValue Hi = GetPromotedInteger(N->getOperand(1));
+ assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?");
+ DebugLoc dl = N->getDebugLoc();
+
+ Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi,
+ DAG.getConstant(OVT.getSizeInBits(), TLI.getPointerTy()));
+ return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) {
+ // The vector type is legal but the element type is not. This implies
+ // that the vector is a power-of-two in length and that the element
+ // type does not have a strange size (eg: it is not i1).
+ MVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ assert(!(NumElts & 1) && "Legal vector of one illegal element?");
+
+ // Promote the inserted value. The type does not need to match the
+ // vector element type. Check that any extra bits introduced will be
+ // truncated away.
+ assert(N->getOperand(0).getValueType().getSizeInBits() >=
+ N->getValueType(0).getVectorElementType().getSizeInBits() &&
+ "Type of inserted value narrower than vector element type!");
+
+ SmallVector<SDValue, 16> NewOps;
+ for (unsigned i = 0; i < NumElts; ++i)
+ NewOps.push_back(GetPromotedInteger(N->getOperand(i)));
+
+ return DAG.UpdateNodeOperands(SDValue(N, 0), &NewOps[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) {
+ ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+ assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU ||
+ CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU ||
+ CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) &&
+ "can only promote integer arguments");
+ SDValue InOp = GetPromotedInteger(N->getOperand(0));
+ return DAG.getConvertRndSat(N->getValueType(0), N->getDebugLoc(), InOp,
+ N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), N->getOperand(4), CvtCode);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N,
+ unsigned OpNo) {
+ if (OpNo == 1) {
+ // Promote the inserted value. This is valid because the type does not
+ // have to match the vector element type.
+
+ // Check that any extra bits introduced will be truncated away.
+ assert(N->getOperand(1).getValueType().getSizeInBits() >=
+ N->getValueType(0).getVectorElementType().getSizeInBits() &&
+ "Type of inserted value narrower than vector element type!");
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0),
+ GetPromotedInteger(N->getOperand(1)),
+ N->getOperand(2));
+ }
+
+ assert(OpNo == 2 && "Different operand and result vector types?");
+
+ // Promote the index.
+ SDValue Idx = ZExtPromotedInteger(N->getOperand(2));
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0),
+ N->getOperand(1), Idx);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MEMBARRIER(SDNode *N) {
+ SDValue NewOps[6];
+ DebugLoc dl = N->getDebugLoc();
+ NewOps[0] = N->getOperand(0);
+ for (unsigned i = 1; i < array_lengthof(NewOps); ++i) {
+ SDValue Flag = GetPromotedInteger(N->getOperand(i));
+ NewOps[i] = DAG.getZeroExtendInReg(Flag, dl, MVT::i1);
+ }
+  return DAG.UpdateNodeOperands(SDValue(N, 0), NewOps,
+                                array_lengthof(NewOps));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) {
+ // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote
+ // the operand in place.
+ return DAG.UpdateNodeOperands(SDValue(N, 0),
+ GetPromotedInteger(N->getOperand(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Only know how to promote condition");
+
+ // Promote all the way up to the canonical SetCC type.
+ MVT SVT = TLI.getSetCCResultType(N->getOperand(1).getValueType());
+ SDValue Cond = PromoteTargetBoolean(N->getOperand(0), SVT);
+
+ return DAG.UpdateNodeOperands(SDValue(N, 0), Cond,
+ N->getOperand(1), N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(4))->get());
+
+ // The CC (#4) and the possible return values (#2 and #3) have legal types.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), LHS, RHS, N->getOperand(2),
+ N->getOperand(3), N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(2))->get());
+
+ // The CC (#2) is always legal.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), LHS, RHS, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0),
+ ZExtPromotedInteger(N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ DebugLoc dl = N->getDebugLoc();
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(),
+ Op, DAG.getValueType(N->getOperand(0).getValueType()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
+ return DAG.UpdateNodeOperands(SDValue(N, 0),
+ SExtPromotedInteger(N->getOperand(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
+ int SVOffset = N->getSrcValueOffset();
+ unsigned Alignment = N->getAlignment();
+ bool isVolatile = N->isVolatile();
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue Val = GetPromotedInteger(N->getValue()); // Get promoted value.
+
+ // Truncate the value and store the result.
+ return DAG.getTruncStore(Ch, dl, Val, Ptr, N->getSrcValue(),
+ SVOffset, N->getMemoryVT(),
+ isVolatile, Alignment);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) {
+ return DAG.UpdateNodeOperands(SDValue(N, 0),
+ ZExtPromotedInteger(N->getOperand(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+ return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Integer Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerResult - This method is called when the specified result of the
+/// specified node is found to need expansion. At this point, the node may also
+/// have invalid operands or may have other results that need promotion; we
+/// just know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
+ DEBUG(cerr << "Expand integer result: "; N->dump(&DAG); cerr << "\n");
+ SDValue Lo, Hi;
+ Lo = Hi = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "ExpandIntegerResult #" << ResNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to expand the result of this operator!");
+ abort();
+
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+
+ case ISD::BIT_CONVERT: ExpandRes_BIT_CONVERT(N, Lo, Hi); break;
+ case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+ case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+ case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
+
+ case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break;
+ case ISD::AssertSext: ExpandIntRes_AssertSext(N, Lo, Hi); break;
+ case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break;
+ case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break;
+ case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break;
+ case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break;
+ case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break;
+ case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break;
+ case ISD::FP_TO_SINT: ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break;
+ case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break;
+ case ISD::LOAD: ExpandIntRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break;
+ case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break;
+ case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break;
+ case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
+ case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
+ case ISD::SREM: ExpandIntRes_SREM(N, Lo, Hi); break;
+ case ISD::TRUNCATE: ExpandIntRes_TRUNCATE(N, Lo, Hi); break;
+ case ISD::UDIV: ExpandIntRes_UDIV(N, Lo, Hi); break;
+ case ISD::UREM: ExpandIntRes_UREM(N, Lo, Hi); break;
+ case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break;
+
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break;
+
+ case ISD::ADD:
+ case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break;
+
+ case ISD::ADDC:
+ case ISD::SUBC: ExpandIntRes_ADDSUBC(N, Lo, Hi); break;
+
+ case ISD::ADDE:
+ case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetExpandedInteger(SDValue(N, ResNo), Lo, Hi);
+}
+
+/// ExpandShiftByConstant - N is a shift by a value that needs to be expanded,
+/// and the shift amount is a constant 'Amt'. Expand the operation.
+void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ // Expand the incoming operand to be shifted, so that we have its parts
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+ MVT NVT = InL.getValueType();
+ unsigned VTBits = N->getValueType(0).getSizeInBits();
+ unsigned NVTBits = NVT.getSizeInBits();
+ MVT ShTy = N->getOperand(1).getValueType();
+
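+  // For example, an i64 SHL by a constant 40 on a 32-bit target takes the
+  // Amt > NVTBits path: the low word becomes zero and the high word is the
+  // original low word shifted left by 40-32 = 8 bits.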
+ if (N->getOpcode() == ISD::SHL) {
+ if (Amt > VTBits) {
+ Lo = Hi = DAG.getConstant(0, NVT);
+ } else if (Amt > NVTBits) {
+ Lo = DAG.getConstant(0, NVT);
+ Hi = DAG.getNode(ISD::SHL, dl,
+ NVT, InL, DAG.getConstant(Amt-NVTBits,ShTy));
+ } else if (Amt == NVTBits) {
+ Lo = DAG.getConstant(0, NVT);
+ Hi = InL;
+ } else if (Amt == 1 &&
+ TLI.isOperationLegalOrCustom(ISD::ADDC,
+ TLI.getTypeToExpandTo(NVT))) {
+ // Emit this X << 1 as X+X.
+ SDVTList VTList = DAG.getVTList(NVT, MVT::Flag);
+ SDValue LoOps[2] = { InL, InL };
+ Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2);
+ SDValue HiOps[3] = { InH, InH, Lo.getValue(1) };
+ Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3);
+ } else {
+ Lo = DAG.getNode(ISD::SHL, dl, NVT, InL, DAG.getConstant(Amt, ShTy));
+ Hi = DAG.getNode(ISD::OR, dl, NVT,
+ DAG.getNode(ISD::SHL, dl, NVT, InH,
+ DAG.getConstant(Amt, ShTy)),
+ DAG.getNode(ISD::SRL, dl, NVT, InL,
+ DAG.getConstant(NVTBits-Amt, ShTy)));
+ }
+ return;
+ }
+
+ if (N->getOpcode() == ISD::SRL) {
+ if (Amt > VTBits) {
+ Lo = DAG.getConstant(0, NVT);
+ Hi = DAG.getConstant(0, NVT);
+ } else if (Amt > NVTBits) {
+ Lo = DAG.getNode(ISD::SRL, dl,
+ NVT, InH, DAG.getConstant(Amt-NVTBits,ShTy));
+ Hi = DAG.getConstant(0, NVT);
+ } else if (Amt == NVTBits) {
+ Lo = InH;
+ Hi = DAG.getConstant(0, NVT);
+ } else {
+ Lo = DAG.getNode(ISD::OR, dl, NVT,
+ DAG.getNode(ISD::SRL, dl, NVT, InL,
+ DAG.getConstant(Amt, ShTy)),
+ DAG.getNode(ISD::SHL, dl, NVT, InH,
+ DAG.getConstant(NVTBits-Amt, ShTy)));
+ Hi = DAG.getNode(ISD::SRL, dl, NVT, InH, DAG.getConstant(Amt, ShTy));
+ }
+ return;
+ }
+
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ if (Amt > VTBits) {
+ Hi = Lo = DAG.getNode(ISD::SRA, dl, NVT, InH,
+ DAG.getConstant(NVTBits-1, ShTy));
+ } else if (Amt > NVTBits) {
+ Lo = DAG.getNode(ISD::SRA, dl, NVT, InH,
+ DAG.getConstant(Amt-NVTBits, ShTy));
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, InH,
+ DAG.getConstant(NVTBits-1, ShTy));
+ } else if (Amt == NVTBits) {
+ Lo = InH;
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, InH,
+ DAG.getConstant(NVTBits-1, ShTy));
+ } else {
+ Lo = DAG.getNode(ISD::OR, dl, NVT,
+ DAG.getNode(ISD::SRL, dl, NVT, InL,
+ DAG.getConstant(Amt, ShTy)),
+ DAG.getNode(ISD::SHL, dl, NVT, InH,
+ DAG.getConstant(NVTBits-Amt, ShTy)));
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, InH, DAG.getConstant(Amt, ShTy));
+ }
+}
+
+/// ExpandShiftWithKnownAmountBit - Try to determine whether we can simplify
+/// this shift based on knowledge of the high bits of the shift amount.  If we
+/// can tell this, we know that the amount is >= NVTBits or < NVTBits, without
+/// knowing the actual shift amount.
+bool DAGTypeLegalizer::
+ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue Amt = N->getOperand(1);
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ MVT ShTy = Amt.getValueType();
+ unsigned ShBits = ShTy.getSizeInBits();
+ unsigned NVTBits = NVT.getSizeInBits();
+ assert(isPowerOf2_32(NVTBits) &&
+ "Expanded integer type size not a power of two!");
+ DebugLoc dl = N->getDebugLoc();
+
+ APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits));
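+  // For example, with NVTBits = 32 and an i8 shift amount this masks bits
+  // 7..5: if any of them is known one the amount is >= 32, and if all are
+  // known zero it is < 32.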
+ APInt KnownZero, KnownOne;
+ DAG.ComputeMaskedBits(N->getOperand(1), HighBitMask, KnownZero, KnownOne);
+
+ // If we don't know anything about the high bits, exit.
+ if (((KnownZero|KnownOne) & HighBitMask) == 0)
+ return false;
+
+ // Get the incoming operand to be shifted.
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+ // If we know that any of the high bits of the shift amount are one, then we
+ // can do this as a couple of simple shifts.
+ if (KnownOne.intersects(HighBitMask)) {
+ // Mask out the high bit, which we know is set.
+ Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt,
+ DAG.getConstant(~HighBitMask, ShTy));
+
+ switch (N->getOpcode()) {
+ default: assert(0 && "Unknown shift");
+ case ISD::SHL:
+ Lo = DAG.getConstant(0, NVT); // Low part is zero.
+ Hi = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part.
+ return true;
+ case ISD::SRL:
+ Hi = DAG.getConstant(0, NVT); // Hi part is zero.
+ Lo = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part.
+ return true;
+ case ISD::SRA:
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign extend high part.
+ DAG.getConstant(NVTBits-1, ShTy));
+ Lo = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part.
+ return true;
+ }
+ }
+
+#if 0
+ // FIXME: This code is broken for shifts with a zero amount!
+ // If we know that all of the high bits of the shift amount are zero, then we
+ // can do this as a couple of simple shifts.
+ if ((KnownZero & HighBitMask) == HighBitMask) {
+ // Compute 32-amt.
+ SDValue Amt2 = DAG.getNode(ISD::SUB, ShTy,
+ DAG.getConstant(NVTBits, ShTy),
+ Amt);
+ unsigned Op1, Op2;
+ switch (N->getOpcode()) {
+ default: assert(0 && "Unknown shift");
+ case ISD::SHL: Op1 = ISD::SHL; Op2 = ISD::SRL; break;
+ case ISD::SRL:
+ case ISD::SRA: Op1 = ISD::SRL; Op2 = ISD::SHL; break;
+ }
+
+ Lo = DAG.getNode(N->getOpcode(), NVT, InL, Amt);
+ Hi = DAG.getNode(ISD::OR, NVT,
+ DAG.getNode(Op1, NVT, InH, Amt),
+ DAG.getNode(Op2, NVT, InL, Amt2));
+ return true;
+ }
+#endif
+
+ return false;
+}
+
+/// ExpandShiftWithUnknownAmountBit - Fully general expansion of integer shift
+/// of any size.
+bool DAGTypeLegalizer::
+ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue Amt = N->getOperand(1);
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ MVT ShTy = Amt.getValueType();
+ unsigned NVTBits = NVT.getSizeInBits();
+ assert(isPowerOf2_32(NVTBits) &&
+ "Expanded integer type size not a power of two!");
+ DebugLoc dl = N->getDebugLoc();
+
+ // Get the incoming operand to be shifted.
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+ SDValue NVBitsNode = DAG.getConstant(NVTBits, ShTy);
+ SDValue Amt2 = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
+ SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(ShTy),
+ Amt, NVBitsNode, ISD::SETULT);
+
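+  // Compute both the "short" result (Amt < NVTBits) and the "long" result
+  // (Amt >= NVTBits), and let Cmp select between them.  The long forms
+  // shift by the unadjusted amount, which relies on the target reducing
+  // shift amounts of NVTBits or more modulo NVTBits.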
+ SDValue Lo1, Hi1, Lo2, Hi2;
+ switch (N->getOpcode()) {
+ default: assert(0 && "Unknown shift");
+  case ISD::SHL:
+    // ShAmt < NVTBits
+    Lo1 = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
+    Hi1 = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, Amt),
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt2));
+
+    // ShAmt >= NVTBits
+    Lo2 = DAG.getConstant(0, NVT);                  // Low part is zero.
+    Hi2 = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    return true;
+  case ISD::SRL:
+    // ShAmt < NVTBits
+    Lo1 = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, Amt2));
+    Hi1 = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
+
+    // ShAmt >= NVTBits
+    Lo2 = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part.
+    Hi2 = DAG.getConstant(0, NVT);                  // Hi part is zero.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    return true;
+  case ISD::SRA:
+    // ShAmt < NVTBits
+    Lo1 = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, Amt2));
+    Hi1 = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
+
+    // ShAmt >= NVTBits
+    Lo2 = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part.
+    Hi2 = DAG.getNode(ISD::SRA, dl, NVT, InH,       // Sign extend high part.
+                      DAG.getConstant(NVTBits-1, ShTy));
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    return true;
+ }
+
+ return false;
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+
+ MVT NVT = LHSL.getValueType();
+ SDValue LoOps[2] = { LHSL, RHSL };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support
+ // them. TODO: Teach operation legalization how to expand unsupported
+ // ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate
+ // a carry of type MVT::Flag, but there doesn't seem to be any way to
+ // generate a value of this type in the expanded code sequence.
+ bool hasCarry =
+ TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ?
+ ISD::ADDC : ISD::SUBC,
+ TLI.getTypeToExpandTo(NVT));
+
+ if (hasCarry) {
+ SDVTList VTList = DAG.getVTList(NVT, MVT::Flag);
+ if (N->getOpcode() == ISD::ADD) {
+ Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3);
+ } else {
+ Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps, 2);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3);
+ }
+ } else {
+ if (N->getOpcode() == ISD::ADD) {
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps, 2);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, HiOps, 2);
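+ // Detect a carry out of the low half without a carry flag: Lo wraps modulo
+ // 2^NVTBits, so a carry occurred exactly when the truncated sum is
+ // (unsigned) smaller than either addend; hence the SETULT compares below.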
+ SDValue Cmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo, LoOps[0],
+ ISD::SETULT);
+ SDValue Carry1 = DAG.getNode(ISD::SELECT, dl, NVT, Cmp1,
+ DAG.getConstant(1, NVT),
+ DAG.getConstant(0, NVT));
+ SDValue Cmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo, LoOps[1],
+ ISD::SETULT);
+ SDValue Carry2 = DAG.getNode(ISD::SELECT, dl, NVT, Cmp2,
+ DAG.getConstant(1, NVT), Carry1);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2);
+ } else {
+ Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps, 2);
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps, 2);
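+ // A borrow out of the low half occurred exactly when the low half of the
+ // LHS is (unsigned) smaller than the low half of the RHS.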
+ SDValue Cmp =
+ DAG.getSetCC(dl, TLI.getSetCCResultType(LoOps[0].getValueType()),
+ LoOps[0], LoOps[1], ISD::SETULT);
+ SDValue Borrow = DAG.getNode(ISD::SELECT, dl, NVT, Cmp,
+ DAG.getConstant(1, NVT),
+ DAG.getConstant(0, NVT));
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow);
+ }
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ DebugLoc dl = N->getDebugLoc();
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Flag);
+ SDValue LoOps[2] = { LHSL, RHSL };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ if (N->getOpcode() == ISD::ADDC) {
+ Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3);
+ } else {
+ Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps, 2);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3);
+ }
+
+ // The flag result has been legalized; switch anything that used the old
+ // flag to use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ DebugLoc dl = N->getDebugLoc();
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Flag);
+ SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) };
+ SDValue HiOps[3] = { LHSH, RHSH };
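+ // The low halves consume the node's incoming carry (operand 2); the carry
+ // produced by the low-half ADDE/SUBE then feeds the high halves.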
+
+ Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps, 3);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps, 3);
+
+ // The flag result has been legalized; switch anything that used the old
+ // flag to use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is any extension of the input (which degenerates to a copy).
+ Lo = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Op);
+ Hi = DAG.getUNDEF(NVT); // The high part is undefined.
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ MVT NVT = Lo.getValueType();
+ MVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ unsigned NVTBits = NVT.getSizeInBits();
+ unsigned EVTBits = EVT.getSizeInBits();
+
+ if (NVTBits < EVTBits) {
+ Hi = DAG.getNode(ISD::AssertSext, dl, NVT, Hi,
+ DAG.getValueType(MVT::getIntegerVT(EVTBits - NVTBits)));
+ } else {
+ Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT));
+ // The high part replicates the sign bit of Lo, make it explicit.
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(NVTBits-1, TLI.getPointerTy()));
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ MVT NVT = Lo.getValueType();
+ MVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ unsigned NVTBits = NVT.getSizeInBits();
+ unsigned EVTBits = EVT.getSizeInBits();
+
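+ // If the asserted width exceeds the low half, the assertion straddles both
+ // halves: the high half is itself zero-extended from EVTBits - NVTBits
+ // bits. Otherwise every asserted bit lives in the low half, so the high
+ // half must be zero.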
+ if (NVTBits < EVTBits) {
+ Hi = DAG.getNode(ISD::AssertZext, dl, NVT, Hi,
+ DAG.getValueType(MVT::getIntegerVT(EVTBits - NVTBits)));
+ } else {
+ Lo = DAG.getNode(ISD::AssertZext, dl, NVT, Lo, DAG.getValueType(EVT));
+ // The high part must be zero, make it explicit.
+ Hi = DAG.getConstant(0, NVT);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands.
+ Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ unsigned NBitWidth = NVT.getSizeInBits();
+ const APInt &Cst = cast<ConstantSDNode>(N)->getAPIntValue();
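+ // Split the constant into two half-width constants. For instance, an i64
+ // constant 0x0123456789ABCDEF expanded to i32 halves becomes
+ // Lo = 0x89ABCDEF and Hi = 0x01234567.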
+ Lo = DAG.getConstant(APInt(Cst).trunc(NBitWidth), NVT);
+ Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ // ctlz(HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+NVTBits)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ MVT NVT = Lo.getValueType();
+
+ SDValue HiNotZero = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Hi,
+ DAG.getConstant(0, NVT), ISD::SETNE);
+
+ SDValue LoLZ = DAG.getNode(ISD::CTLZ, dl, NVT, Lo);
+ SDValue HiLZ = DAG.getNode(ISD::CTLZ, dl, NVT, Hi);
+
+ Lo = DAG.getNode(ISD::SELECT, dl, NVT, HiNotZero, HiLZ,
+ DAG.getNode(ISD::ADD, dl, NVT, LoLZ,
+ DAG.getConstant(NVT.getSizeInBits(), NVT)));
+ Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ MVT NVT = Lo.getValueType();
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo),
+ DAG.getNode(ISD::CTPOP, dl, NVT, Hi));
+ Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ // cttz(HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+NVTBits)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ MVT NVT = Lo.getValueType();
+
+ SDValue LoNotZero = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo,
+ DAG.getConstant(0, NVT), ISD::SETNE);
+
+ SDValue LoTZ = DAG.getNode(ISD::CTTZ, dl, NVT, Lo); // Trailing zeros of Lo.
+ SDValue HiTZ = DAG.getNode(ISD::CTTZ, dl, NVT, Hi); // Trailing zeros of Hi.
+
+ Lo = DAG.getNode(ISD::SELECT, dl, NVT, LoNotZero, LoTZ,
+ DAG.getNode(ISD::ADD, dl, NVT, HiTZ,
+ DAG.getConstant(NVT.getSizeInBits(), NVT)));
+ Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
+ SplitInteger(MakeLibCall(LC, VT, &Op, 1, true/*irrelevant*/, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
+ SplitInteger(MakeLibCall(LC, VT, &Op, 1, false/*irrelevant*/, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ if (ISD::isNormalLoad(N)) {
+ ExpandRes_NormalLoad(N, Lo, Hi);
+ return;
+ }
+
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+
+ MVT VT = N->getValueType(0);
+ MVT NVT = TLI.getTypeToTransformTo(VT);
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ int SVOffset = N->getSrcValueOffset();
+ unsigned Alignment = N->getAlignment();
+ bool isVolatile = N->isVolatile();
+ DebugLoc dl = N->getDebugLoc();
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ if (N->getMemoryVT().bitsLE(NVT)) {
+ MVT EVT = N->getMemoryVT();
+
+ Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(), SVOffset,
+ EVT, isVolatile, Alignment);
+
+ // Remember the chain.
+ Ch = Lo.getValue(1);
+
+ if (ExtType == ISD::SEXTLOAD) {
+ // The high part is obtained by shifting the low part right arithmetically
+ // by all of its bits but one, replicating the sign bit across the half.
+ unsigned LoSize = Lo.getValueType().getSizeInBits();
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+ } else if (ExtType == ISD::ZEXTLOAD) {
+ // The high part is just a zero.
+ Hi = DAG.getConstant(0, NVT);
+ } else {
+ assert(ExtType == ISD::EXTLOAD && "Unknown extload!");
+ // The high part is undefined.
+ Hi = DAG.getUNDEF(NVT);
+ }
+ } else if (TLI.isLittleEndian()) {
+ // Little-endian - low bits are at low addresses.
+ Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset,
+ isVolatile, Alignment);
+
+ unsigned ExcessBits =
+ N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+ MVT NEVT = MVT::getIntegerVT(ExcessBits);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(),
+ SVOffset+IncrementSize, NEVT,
+ isVolatile, MinAlign(Alignment, IncrementSize));
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+ } else {
+ // Big-endian - high bits are at low addresses. Favor aligned loads at
+ // the cost of some bit-fiddling.
+ MVT EVT = N->getMemoryVT();
+ unsigned EBytes = EVT.getStoreSizeInBits()/8;
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ unsigned ExcessBits = (EBytes - IncrementSize)*8;
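+ // For instance (a sketch, not a path every target exercises): an i48
+ // extending load expanded to i32 parts has EBytes = 6, IncrementSize = 4
+ // and ExcessBits = 16, so the first load below reads the top 32 bits and
+ // the second reads the remaining low 16 bits.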
+
+ // Load both the high bits and maybe some of the low bits.
+ Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(), SVOffset,
+ MVT::getIntegerVT(EVT.getSizeInBits() - ExcessBits),
+ isVolatile, Alignment);
+
+ // Increment the pointer to the other half.
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ // Load the rest of the low bits.
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr, N->getSrcValue(),
+ SVOffset+IncrementSize,
+ MVT::getIntegerVT(ExcessBits),
+ isVolatile, MinAlign(Alignment, IncrementSize));
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ if (ExcessBits < NVT.getSizeInBits()) {
+ // Transfer low bits from the bottom of Hi to the top of Lo.
+ Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
+ DAG.getNode(ISD::SHL, dl, NVT, Hi,
+ DAG.getConstant(ExcessBits,
+ TLI.getPointerTy())));
+ // Move high bits to the right position in Hi.
+ Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl,
+ NVT, Hi,
+ DAG.getConstant(NVT.getSizeInBits() - ExcessBits,
+ TLI.getPointerTy()));
+ }
+ }
+
+ // The chain result has been legalized; switch anything that used the old
+ // chain to use the new one.
+ ReplaceValueWith(SDValue(N, 1), Ch);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ SDValue LL, LH, RL, RH;
+ GetExpandedInteger(N->getOperand(0), LL, LH);
+ GetExpandedInteger(N->getOperand(1), RL, RH);
+ Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LL, RL);
+ Hi = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LH, RH);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT VT = N->getValueType(0);
+ MVT NVT = TLI.getTypeToTransformTo(VT);
+ DebugLoc dl = N->getDebugLoc();
+
+ bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, NVT);
+ bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, NVT);
+ bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, NVT);
+ bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, NVT);
+ if (HasMULHU || HasMULHS || HasUMUL_LOHI || HasSMUL_LOHI) {
+ SDValue LL, LH, RL, RH;
+ GetExpandedInteger(N->getOperand(0), LL, LH);
+ GetExpandedInteger(N->getOperand(1), RL, RH);
+ unsigned OuterBitSize = VT.getSizeInBits();
+ unsigned InnerBitSize = NVT.getSizeInBits();
+ unsigned LHSSB = DAG.ComputeNumSignBits(N->getOperand(0));
+ unsigned RHSSB = DAG.ComputeNumSignBits(N->getOperand(1));
+
+ APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
+ if (DAG.MaskedValueIsZero(N->getOperand(0), HighMask) &&
+ DAG.MaskedValueIsZero(N->getOperand(1), HighMask)) {
+ // The inputs are both zero-extended.
+ if (HasUMUL_LOHI) {
+ // We can emit a umul_lohi.
+ Lo = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(NVT, NVT), LL, RL);
+ Hi = SDValue(Lo.getNode(), 1);
+ return;
+ }
+ if (HasMULHU) {
+ // We can emit a mulhu+mul.
+ Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+ Hi = DAG.getNode(ISD::MULHU, dl, NVT, LL, RL);
+ return;
+ }
+ }
+ if (LHSSB > InnerBitSize && RHSSB > InnerBitSize) {
+ // The input values are both sign-extended.
+ if (HasSMUL_LOHI) {
+ // We can emit a smul_lohi.
+ Lo = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(NVT, NVT), LL, RL);
+ Hi = SDValue(Lo.getNode(), 1);
+ return;
+ }
+ if (HasMULHS) {
+ // We can emit a mulhs+mul.
+ Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+ Hi = DAG.getNode(ISD::MULHS, dl, NVT, LL, RL);
+ return;
+ }
+ }
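+ // General case: writing B = 2^NVTBits, the product is
+ // (LH*B + LL) * (RH*B + RL) = LL*RL + (LL*RH + LH*RL)*B (mod B^2),
+ // so the high half is mulhu(LL, RL) + LL*RH + LH*RL and the low half is
+ // the truncation of LL*RL.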
+ if (HasUMUL_LOHI) {
+ // Lo,Hi = umul LHS, RHS.
+ SDValue UMulLOHI = DAG.getNode(ISD::UMUL_LOHI, dl,
+ DAG.getVTList(NVT, NVT), LL, RL);
+ Lo = UMulLOHI;
+ Hi = UMulLOHI.getValue(1);
+ RH = DAG.getNode(ISD::MUL, dl, NVT, LL, RH);
+ LH = DAG.getNode(ISD::MUL, dl, NVT, LH, RL);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, RH);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, LH);
+ return;
+ }
+ if (HasMULHU) {
+ Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+ Hi = DAG.getNode(ISD::MULHU, dl, NVT, LL, RL);
+ RH = DAG.getNode(ISD::MUL, dl, NVT, LL, RH);
+ LH = DAG.getNode(ISD::MUL, dl, NVT, LH, RL);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, RH);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, LH);
+ return;
+ }
+ }
+
+ // If nothing else, we can make a libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::MUL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::MUL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::MUL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::MUL_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!");
+
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SplitInteger(MakeLibCall(LC, VT, Ops, 2, true/*irrelevant*/, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i32)
+ LC = RTLIB::SDIV_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SDIV_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SDIV_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SplitInteger(MakeLibCall(LC, VT, Ops, 2, true, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ // If we can emit an efficient shift operation, do so now. Check to see if
+ // the RHS is a constant.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return ExpandShiftByConstant(N, CN->getZExtValue(), Lo, Hi);
+
+ // If we can determine that the high bit of the shift is zero or one, even if
+ // the low bits are variable, emit this shift in an optimized form.
+ if (ExpandShiftWithKnownAmountBit(N, Lo, Hi))
+ return;
+
+ // If this target supports shift_PARTS, use it. First, map to the _PARTS opc.
+ unsigned PartsOpc;
+ if (N->getOpcode() == ISD::SHL) {
+ PartsOpc = ISD::SHL_PARTS;
+ } else if (N->getOpcode() == ISD::SRL) {
+ PartsOpc = ISD::SRL_PARTS;
+ } else {
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ PartsOpc = ISD::SRA_PARTS;
+ }
+
+ // Next check to see if the target supports this SHL_PARTS operation or if it
+ // will custom expand it.
+ MVT NVT = TLI.getTypeToTransformTo(VT);
+ TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT);
+ if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
+ Action == TargetLowering::Custom) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH;
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+
+ SDValue Ops[] = { LHSL, LHSH, N->getOperand(1) };
+ MVT PartsVT = LHSL.getValueType(); // The half-width type of each part.
+ Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(PartsVT, PartsVT), Ops, 3);
+ Hi = Lo.getValue(1);
+ return;
+ }
+
+ // Otherwise, emit a libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ bool isSigned;
+ if (N->getOpcode() == ISD::SHL) {
+ isSigned = false; /*sign irrelevant*/
+ if (VT == MVT::i16)
+ LC = RTLIB::SHL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SHL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SHL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SHL_I128;
+ } else if (N->getOpcode() == ISD::SRL) {
+ isSigned = false;
+ if (VT == MVT::i16)
+ LC = RTLIB::SRL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SRL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SRL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SRL_I128;
+ } else {
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ isSigned = true;
+ if (VT == MVT::i16)
+ LC = RTLIB::SRA_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SRA_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SRA_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SRA_I128;
+ }
+
+ if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SplitInteger(MakeLibCall(LC, VT, Ops, 2, isSigned, dl), Lo, Hi);
+ return;
+ }
+
+ if (!ExpandShiftWithUnknownAmountBit(N, Lo, Hi))
+ assert(0 && "Unsupported shift!");
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is sign extension of the input (degenerates to a copy).
+ Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0));
+ // The high part is the low part shifted right arithmetically by all of
+ // its bits but one, i.e. every bit of Hi equals the sign bit.
+ unsigned LoSize = NVT.getSizeInBits();
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ unsigned ExcessBits =
+ Op.getValueType().getSizeInBits() - NVT.getSizeInBits();
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+ DAG.getValueType(MVT::getIntegerVT(ExcessBits)));
+ }
+}
+
+void DAGTypeLegalizer::
+ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ MVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+ if (EVT.bitsLE(Lo.getValueType())) {
+ // sext_inreg the low part if needed.
+ Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Lo.getValueType(), Lo,
+ N->getOperand(1));
+
+ // The high part gets the sign extension from the lo-part. This handles
+ // things like sextinreg V:i64 from i8.
+ Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo,
+ DAG.getConstant(Hi.getValueType().getSizeInBits()-1,
+ TLI.getPointerTy()));
+ } else {
+ // For example, extension of an i48 to an i64. Leave the low part alone,
+ // sext_inreg the high part.
+ unsigned ExcessBits =
+ EVT.getSizeInBits() - Lo.getValueType().getSizeInBits();
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+ DAG.getValueType(MVT::getIntegerVT(ExcessBits)));
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i32)
+ LC = RTLIB::SREM_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SREM_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SREM_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SplitInteger(MakeLibCall(LC, VT, Ops, 2, true, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ DebugLoc dl = N->getDebugLoc();
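+ // The low result half is the truncation of the source; the high half is
+ // the source shifted right by NVT's width and then truncated. For
+ // instance, truncating to an i64 that itself expands to i32 takes bits
+ // [31:0] and [63:32] of the source.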
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0));
+ Hi = DAG.getNode(ISD::SRL, dl,
+ N->getOperand(0).getValueType(), N->getOperand(0),
+ DAG.getConstant(NVT.getSizeInBits(), TLI.getPointerTy()));
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i32)
+ LC = RTLIB::UDIV_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::UDIV_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::UDIV_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
+
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SplitInteger(MakeLibCall(LC, VT, Ops, 2, false, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i32)
+ LC = RTLIB::UREM_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::UREM_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::UREM_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
+
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SplitInteger(MakeLibCall(LC, VT, Ops, 2, false, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is zero extension of the input (degenerates to a copy).
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0));
+ Hi = DAG.getConstant(0, NVT); // The high part is just a zero.
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ unsigned ExcessBits =
+ Op.getValueType().getSizeInBits() - NVT.getSizeInBits();
+ Hi = DAG.getZeroExtendInReg(Hi, dl, MVT::getIntegerVT(ExcessBits));
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Integer Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need expansion. At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(cerr << "Expand integer operand: "; N->dump(&DAG); cerr << "\n");
+ SDValue Res = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ cerr << "ExpandIntegerOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+ #endif
+ assert(0 && "Do not know how to expand this operator's operand!");
+ abort();
+
+ case ISD::BIT_CONVERT: Res = ExpandOp_BIT_CONVERT(N); break;
+ case ISD::BR_CC: Res = ExpandIntOp_BR_CC(N); break;
+ case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break;
+ case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+ case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break;
+ case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break;
+ case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break;
+ case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break;
+ case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break;
+ case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break;
+ case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break;
+ case ISD::UINT_TO_FP: Res = ExpandIntOp_UINT_TO_FP(N); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR: Res = ExpandIntOp_Shift(N); break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// IntegerExpandSetCCOperands - Expand the operands of a comparison. This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
+ SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ DebugLoc dl) {
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedInteger(NewLHS, LHSLo, LHSHi);
+ GetExpandedInteger(NewRHS, RHSLo, RHSHi);
+
+ MVT VT = NewLHS.getValueType();
+
+ if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) {
+ if (RHSLo == RHSHi) {
+ if (ConstantSDNode *RHSCST = dyn_cast<ConstantSDNode>(RHSLo)) {
+ if (RHSCST->isAllOnesValue()) {
+ // Equality comparison to -1.
+ NewLHS = DAG.getNode(ISD::AND, dl,
+ LHSLo.getValueType(), LHSLo, LHSHi);
+ NewRHS = RHSLo;
+ return;
+ }
+ }
+ }
+
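+ // General equality: XOR the halves pairwise, OR the results together, and
+ // compare the whole against zero, since
+ // a == b <=> ((aLo ^ bLo) | (aHi ^ bHi)) == 0.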
+ NewLHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSLo, RHSLo);
+ NewRHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSHi, RHSHi);
+ NewLHS = DAG.getNode(ISD::OR, dl, NewLHS.getValueType(), NewLHS, NewRHS);
+ NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+ return;
+ }
+
+ // If this is a comparison of the sign bit, just look at the top part:
+ // both X > -1 and X < 0 depend only on the high half.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(NewRHS))
+ if ((CCCode == ISD::SETLT && CST->isNullValue()) || // X < 0
+ (CCCode == ISD::SETGT && CST->isAllOnesValue())) { // X > -1
+ NewLHS = LHSHi;
+ NewRHS = RHSHi;
+ return;
+ }
+
+ // FIXME: This generated code sucks.
+ ISD::CondCode LowCC;
+ switch (CCCode) {
+ default: assert(0 && "Unknown integer setcc!");
+ case ISD::SETLT:
+ case ISD::SETULT: LowCC = ISD::SETULT; break;
+ case ISD::SETGT:
+ case ISD::SETUGT: LowCC = ISD::SETUGT; break;
+ case ISD::SETLE:
+ case ISD::SETULE: LowCC = ISD::SETULE; break;
+ case ISD::SETGE:
+ case ISD::SETUGE: LowCC = ISD::SETUGE; break;
+ }
+
+ // Tmp1 = lo(op1) < lo(op2) // Always unsigned comparison
+ // Tmp2 = hi(op1) < hi(op2) // Signedness depends on operands
+ // dest = hi(op1) == hi(op2) ? Tmp1 : Tmp2;
+
+ // NOTE: on targets without efficient SELECT of bools, we can always use
+ // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3)
+ TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, false, true, NULL);
+ SDValue Tmp1, Tmp2;
+ Tmp1 = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSLo.getValueType()),
+ LHSLo, RHSLo, LowCC, false, DagCombineInfo, dl);
+ if (!Tmp1.getNode())
+ Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSLo.getValueType()),
+ LHSLo, RHSLo, LowCC);
+ Tmp2 = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, CCCode, false, DagCombineInfo, dl);
+ if (!Tmp2.getNode())
+ Tmp2 = DAG.getNode(ISD::SETCC, dl,
+ TLI.getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, DAG.getCondCode(CCCode));
+
+ ConstantSDNode *Tmp1C = dyn_cast<ConstantSDNode>(Tmp1.getNode());
+ ConstantSDNode *Tmp2C = dyn_cast<ConstantSDNode>(Tmp2.getNode());
+ if ((Tmp1C && Tmp1C->isNullValue()) ||
+ (Tmp2C && Tmp2C->isNullValue() &&
+ (CCCode == ISD::SETLE || CCCode == ISD::SETGE ||
+ CCCode == ISD::SETUGE || CCCode == ISD::SETULE)) ||
+ (Tmp2C && Tmp2C->getAPIntValue() == 1 &&
+ (CCCode == ISD::SETLT || CCCode == ISD::SETGT ||
+ CCCode == ISD::SETUGT || CCCode == ISD::SETULT))) {
+ // If the low-part comparison is known false, the result is just the
+ // high-part comparison. For LE / GE, if the high part is known false,
+ // ignore the low part. For LT / GT, if the high part is known true,
+ // ignore the low part.
+ NewLHS = Tmp2;
+ NewRHS = SDValue();
+ return;
+ }
+
+ NewLHS = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, ISD::SETEQ, false,
+ DagCombineInfo, dl);
+ if (!NewLHS.getNode())
+ NewLHS = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, ISD::SETEQ);
+ NewLHS = DAG.getNode(ISD::SELECT, dl, Tmp1.getValueType(),
+ NewLHS, Tmp1, Tmp2);
+ NewRHS = SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+ // If IntegerExpandSetCCOperands returned a single value, compare it
+ // against zero to select between true and false values.
+ if (NewRHS.getNode() == 0) {
+ NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+ // If IntegerExpandSetCCOperands returned a single value, compare it
+ // against zero to select between true and false values.
+ if (NewRHS.getNode() == 0) {
+ NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode));
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+ // If IntegerExpandSetCCOperands returned a single value, use it.
+ if (NewRHS.getNode() == 0) {
+ assert(NewLHS.getValueType() == N->getValueType(0) &&
+ "Unexpected setcc expansion!");
+ return NewLHS;
+ }
+
+ // Otherwise, update N to have the operands specified.
+ return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS,
+ DAG.getCondCode(CCCode));
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) {
+ // The value being shifted is legal, but the shift amount is too big.
+ // It follows that either the result of the shift is undefined, or the
+ // upper half of the shift amount is zero. Just use the lower half.
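+ // (Any defined shift of an N-bit value uses an amount below N, which
+ // always fits in the low half of the expanded amount.)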
+ SDValue Lo, Hi;
+ GetExpandedInteger(N->getOperand(1), Lo, Hi);
+ return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), Lo);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ MVT DstVT = N->getValueType(0);
+ RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Don't know how to expand this SINT_TO_FP!");
+ return MakeLibCall(LC, DstVT, &Op, 1, true, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
+ if (ISD::isNormalStore(N))
+ return ExpandOp_NormalStore(N, OpNo);
+
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+
+ MVT VT = N->getOperand(1).getValueType();
+ MVT NVT = TLI.getTypeToTransformTo(VT);
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ int SVOffset = N->getSrcValueOffset();
+ unsigned Alignment = N->getAlignment();
+ bool isVolatile = N->isVolatile();
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Lo, Hi;
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ if (N->getMemoryVT().bitsLE(NVT)) {
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+ return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+ N->getMemoryVT(), isVolatile, Alignment);
+ } else if (TLI.isLittleEndian()) {
+ // Little-endian - low bits are at low addresses.
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+
+ Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+ isVolatile, Alignment);
+
+ unsigned ExcessBits =
+ N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+ MVT NEVT = MVT::getIntegerVT(ExcessBits);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getSrcValue(),
+ SVOffset+IncrementSize, NEVT,
+ isVolatile, MinAlign(Alignment, IncrementSize));
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ } else {
+ // Big-endian - high bits are at low addresses. Favor aligned stores at
+ // the cost of some bit-fiddling.
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+
+ MVT EVT = N->getMemoryVT();
+ unsigned EBytes = EVT.getStoreSizeInBits()/8;
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ unsigned ExcessBits = (EBytes - IncrementSize)*8;
+ MVT HiVT = MVT::getIntegerVT(EVT.getSizeInBits() - ExcessBits);
+
+ if (ExcessBits < NVT.getSizeInBits()) {
+ // Transfer high bits from the top of Lo to the bottom of Hi.
+ Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi,
+ DAG.getConstant(NVT.getSizeInBits() - ExcessBits,
+ TLI.getPointerTy()));
+ Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
+ DAG.getNode(ISD::SRL, dl, NVT, Lo,
+ DAG.getConstant(ExcessBits,
+ TLI.getPointerTy())));
+ }
+
+ // Store both the high bits and maybe some of the low bits.
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getSrcValue(),
+ SVOffset, HiVT, isVolatile, Alignment);
+
+ // Increment the pointer to the other half.
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ // Store the lowest ExcessBits bits in the second half.
+ Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getSrcValue(),
+ SVOffset+IncrementSize,
+ MVT::getIntegerVT(ExcessBits),
+ isVolatile, MinAlign(Alignment, IncrementSize));
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ }
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) {
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+ // Just truncate the low part of the source.
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), InL);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ MVT SrcVT = Op.getValueType();
+ MVT DstVT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ if (TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) == TargetLowering::Custom){
+ // Do a signed conversion then adjust the result.
+ SDValue SignedConv = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Op);
+ SignedConv = TLI.LowerOperation(SignedConv, DAG);
+
+ // The result of the signed conversion needs adjusting if the 'sign bit' of
+ // the incoming integer was set. To handle this, we dynamically test to see
+ // if it is set, and, if so, add a fudge factor.
+
+ const uint64_t F32TwoE32 = 0x4F800000ULL;
+ const uint64_t F32TwoE64 = 0x5F800000ULL;
+ const uint64_t F32TwoE128 = 0x7F800000ULL;
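+ // These are 2^32 and 2^64 encoded as IEEE single precision; 2^128
+ // overflows f32, so 0x7F800000 is actually the +infinity bit pattern.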
+
+ APInt FF(32, 0);
+ if (SrcVT == MVT::i32)
+ FF = APInt(32, F32TwoE32);
+ else if (SrcVT == MVT::i64)
+ FF = APInt(32, F32TwoE64);
+ else if (SrcVT == MVT::i128)
+ FF = APInt(32, F32TwoE128);
+ else
+ assert(false && "Unsupported UINT_TO_FP!");
+
+ // Check whether the sign bit is set.
+ SDValue Lo, Hi;
+ GetExpandedInteger(Op, Lo, Hi);
+ SDValue SignSet = DAG.getSetCC(dl,
+ TLI.getSetCCResultType(Hi.getValueType()),
+ Hi, DAG.getConstant(0, Hi.getValueType()),
+ ISD::SETLT);
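+ // SignSet is true exactly when the source's top bit is set, in which case
+ // the signed conversion saw V - 2^N instead of V and its result is 2^N
+ // too low; the fudge factor added below compensates.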
+
+ // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+ SDValue FudgePtr = DAG.getConstantPool(ConstantInt::get(FF.zext(64)),
+ TLI.getPointerTy());
+
+ // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+ SDValue Zero = DAG.getIntPtrConstant(0);
+ SDValue Four = DAG.getIntPtrConstant(4);
+ if (TLI.isBigEndian()) std::swap(Zero, Four);
+ SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+ Zero, Four);
+ unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
+ FudgePtr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), FudgePtr, Offset);
+ Alignment = std::min(Alignment, 4u);
+
+ // Load the value out, extending it from f32 to the destination float type.
+ // FIXME: Avoid the extend by constructing the right constant pool?
+ SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(),
+ FudgePtr, NULL, 0, MVT::f32,
+ false, Alignment);
+ return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
+ }
+
+ // Otherwise, use a libcall.
+ RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Don't know how to expand this UINT_TO_FP!");
+ return MakeLibCall(LC, DstVT, &Op, 1, true, dl);
+}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
new file mode 100644
index 0000000..00d71e1
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -0,0 +1,1074 @@
+//===-- LegalizeTypes.cpp - Common code for DAG type legalizer ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeTypes method. It transforms
+// an arbitrary well-formed SelectionDAG to only consist of legal types. This
+// is common code shared among the LegalizeTypes*.cpp files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/CallingConv.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetData.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden);
+
+/// PerformExpensiveChecks - Do extensive, expensive sanity checking.
+void DAGTypeLegalizer::PerformExpensiveChecks() {
+ // If a node is not processed, then none of its values should be mapped by any
+ // of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+
+ // If a node is processed, then each value with an illegal type must be mapped
+ // by exactly one of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+ // Values with a legal type may be mapped by ReplacedValues, but not by any of
+ // the other maps.
+
+ // Note that these invariants may momentarily fail to hold while a node is
+ // being processed: the node being processed may be put in a map before being
+ // marked Processed.
+
+ // Note that it is possible to have nodes marked NewNode in the DAG. This can
+ // occur in two ways. Firstly, a node may be created during legalization but
+ // never passed to the legalization core. This is usually due to the implicit
+ // folding that occurs when using the DAG.getNode operators. Secondly, a new
+ // node may be passed to the legalization core, but when analyzed may morph
+ // into a different node, leaving the original node as a NewNode in the DAG.
+ // A node may morph if one of its operands changes during analysis. Whether
+ // it actually morphs or not depends on whether, after updating its operands,
+ // it is equivalent to an existing node: if so, it morphs into that existing
+ // node (CSE). An operand can change during analysis if the operand is a new
+ // node that morphs, or it is a processed value that was mapped to some other
+ // value (as recorded in ReplacedValues) in which case the operand is turned
+ // into that other value. If a node morphs then the node it morphed into will
+ // be used instead of it for legalization, however the original node continues
+ // to live on in the DAG.
+ // The conclusion is that though there may be nodes marked NewNode in the DAG,
+ // all uses of such nodes are also marked NewNode: the result is a fungus of
+ // NewNodes growing on top of the useful nodes, and perhaps using them, but
+ // not used by them.
+
+ // If a value is mapped by ReplacedValues, then it must have no uses, except
+ // by nodes marked NewNode (see above).
+
+ // The final node obtained by mapping by ReplacedValues is not marked NewNode.
+ // Note that ReplacedValues should be applied iteratively.
+
+ // Note that the ReplacedValues map may also map deleted nodes. By iterating
+ // over the DAG we only consider non-deleted nodes.
+ SmallVector<SDNode*, 16> NewNodes;
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = DAG.allnodes_end(); I != E; ++I) {
+ // Remember nodes marked NewNode - they are subject to extra checking below.
+ if (I->getNodeId() == NewNode)
+ NewNodes.push_back(I);
+
+ for (unsigned i = 0, e = I->getNumValues(); i != e; ++i) {
+ SDValue Res(I, i);
+ bool Failed = false;
+
+ unsigned Mapped = 0;
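+ // Accumulate one bit per map that contains this value; the bits are used
+ // below to diagnose values that are unmapped or multiply mapped.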
+ if (ReplacedValues.find(Res) != ReplacedValues.end()) {
+ Mapped |= 1;
+ // Check that remapped values are only used by nodes marked NewNode.
+ for (SDNode::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI)
+ if (UI.getUse().getResNo() == i)
+ assert(UI->getNodeId() == NewNode &&
+ "Remapped value has non-trivial use!");
+
+ // Check that the final result of applying ReplacedValues is not
+ // marked NewNode.
+ SDValue NewVal = ReplacedValues[Res];
+ DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(NewVal);
+ while (I != ReplacedValues.end()) {
+ NewVal = I->second;
+ I = ReplacedValues.find(NewVal);
+ }
+ assert(NewVal.getNode()->getNodeId() != NewNode &&
+ "ReplacedValues maps to a new node!");
+ }
+ if (PromotedIntegers.find(Res) != PromotedIntegers.end())
+ Mapped |= 2;
+ if (SoftenedFloats.find(Res) != SoftenedFloats.end())
+ Mapped |= 4;
+ if (ScalarizedVectors.find(Res) != ScalarizedVectors.end())
+ Mapped |= 8;
+ if (ExpandedIntegers.find(Res) != ExpandedIntegers.end())
+ Mapped |= 16;
+ if (ExpandedFloats.find(Res) != ExpandedFloats.end())
+ Mapped |= 32;
+ if (SplitVectors.find(Res) != SplitVectors.end())
+ Mapped |= 64;
+ if (WidenedVectors.find(Res) != WidenedVectors.end())
+ Mapped |= 128;
+
+ if (I->getNodeId() != Processed) {
+ if (Mapped != 0) {
+ cerr << "Unprocessed value in a map!";
+ Failed = true;
+ }
+ } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(I)) {
+ if (Mapped > 1) {
+ cerr << "Value with legal type was transformed!";
+ Failed = true;
+ }
+ } else {
+ if (Mapped == 0) {
+ cerr << "Processed value not in any map!";
+ Failed = true;
+ } else if (Mapped & (Mapped - 1)) {
+ cerr << "Value in multiple maps!";
+ Failed = true;
+ }
+ }
+
+ if (Failed) {
+ if (Mapped & 1)
+ cerr << " ReplacedValues";
+ if (Mapped & 2)
+ cerr << " PromotedIntegers";
+ if (Mapped & 4)
+ cerr << " SoftenedFloats";
+ if (Mapped & 8)
+ cerr << " ScalarizedVectors";
+ if (Mapped & 16)
+ cerr << " ExpandedIntegers";
+ if (Mapped & 32)
+ cerr << " ExpandedFloats";
+ if (Mapped & 64)
+ cerr << " SplitVectors";
+ if (Mapped & 128)
+ cerr << " WidenedVectors";
+ cerr << "\n";
+ abort();
+ }
+ }
+ }
+
+ // Check that NewNodes are only used by other NewNodes.
+ for (unsigned i = 0, e = NewNodes.size(); i != e; ++i) {
+ SDNode *N = NewNodes[i];
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI)
+ assert(UI->getNodeId() == NewNode && "NewNode used by non-NewNode!");
+ }
+}
+
+/// run - This is the main entry point for the type legalizer. This does a
+/// top-down traversal of the dag, legalizing types as it goes. Returns "true"
+/// if it made any changes.
+bool DAGTypeLegalizer::run() {
+ bool Changed = false;
+
+ // Create a dummy node (which is not added to allnodes), that adds a reference
+ // to the root node, preventing it from being deleted, and tracking any
+ // changes of the root.
+ HandleSDNode Dummy(DAG.getRoot());
+ Dummy.setNodeId(Unanalyzed);
+
+ // The root of the dag may dangle to deleted nodes until the type legalizer is
+ // done. Set it to null to avoid confusion.
+ DAG.setRoot(SDValue());
+
+ // Walk all nodes in the graph, assigning them a NodeId of 'ReadyToProcess'
+ // (and remembering them) if they are leaves and assigning 'Unanalyzed' if
+ // non-leaves.
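+ // (ReadyToProcess is NodeId zero; NewNode, Unanalyzed and Processed are
+ // negative sentinels. During the walk, a positive NodeId counts how many
+ // of a node's operands have not yet been processed.)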
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = DAG.allnodes_end(); I != E; ++I) {
+ if (I->getNumOperands() == 0) {
+ I->setNodeId(ReadyToProcess);
+ Worklist.push_back(I);
+ } else {
+ I->setNodeId(Unanalyzed);
+ }
+ }
+
+ // Now that we have a set of nodes to process, handle them all.
+ while (!Worklist.empty()) {
+#ifndef XDEBUG
+ if (EnableExpensiveChecks)
+#endif
+ PerformExpensiveChecks();
+
+ SDNode *N = Worklist.back();
+ Worklist.pop_back();
+ assert(N->getNodeId() == ReadyToProcess &&
+ "Node should be ready if on worklist!");
+
+ if (IgnoreNodeResults(N))
+ goto ScanOperands;
+
+ // Scan the values produced by the node, checking to see if any result
+ // types are illegal.
+ for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) {
+ MVT ResultVT = N->getValueType(i);
+ switch (getTypeAction(ResultVT)) {
+ default:
+ assert(false && "Unknown action!");
+ case Legal:
+ break;
+ // The following calls must take care of *all* of the node's results,
+ // not just the illegal result they were passed (this includes results
+ // with a legal type). Results can be remapped using ReplaceValueWith,
+ // or their promoted/expanded/etc values registered in PromotedIntegers,
+ // ExpandedIntegers etc.
+ case PromoteInteger:
+ PromoteIntegerResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case ExpandInteger:
+ ExpandIntegerResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case SoftenFloat:
+ SoftenFloatResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case ExpandFloat:
+ ExpandFloatResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case ScalarizeVector:
+ ScalarizeVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case SplitVector:
+ SplitVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case WidenVector:
+ WidenVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ }
+ }
+
+ScanOperands:
+ // Scan the operand list for the node, handling any nodes with operands that
+ // are illegal.
+ {
+ unsigned NumOperands = N->getNumOperands();
+ bool NeedsReanalyzing = false;
+ unsigned i;
+ for (i = 0; i != NumOperands; ++i) {
+ if (IgnoreNodeResults(N->getOperand(i).getNode()))
+ continue;
+
+ MVT OpVT = N->getOperand(i).getValueType();
+ switch (getTypeAction(OpVT)) {
+ default:
+ assert(false && "Unknown action!");
+ case Legal:
+ continue;
+ // The following calls must either replace all of the node's results
+ // using ReplaceValueWith, and return "false"; or update the node's
+ // operands in place, and return "true".
+ case PromoteInteger:
+ NeedsReanalyzing = PromoteIntegerOperand(N, i);
+ Changed = true;
+ break;
+ case ExpandInteger:
+ NeedsReanalyzing = ExpandIntegerOperand(N, i);
+ Changed = true;
+ break;
+ case SoftenFloat:
+ NeedsReanalyzing = SoftenFloatOperand(N, i);
+ Changed = true;
+ break;
+ case ExpandFloat:
+ NeedsReanalyzing = ExpandFloatOperand(N, i);
+ Changed = true;
+ break;
+ case ScalarizeVector:
+ NeedsReanalyzing = ScalarizeVectorOperand(N, i);
+ Changed = true;
+ break;
+ case SplitVector:
+ NeedsReanalyzing = SplitVectorOperand(N, i);
+ Changed = true;
+ break;
+ case WidenVector:
+ NeedsReanalyzing = WidenVectorOperand(N, i);
+ Changed = true;
+ break;
+ }
+ break;
+ }
+
+ // The sub-method updated N in place. Check to see if any operands are new,
+ // and if so, mark them. If the node needs revisiting, don't add all users
+ // to the worklist etc.
+ if (NeedsReanalyzing) {
+ assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+ N->setNodeId(NewNode);
+ // Recompute the NodeId and correct processed operands, adding the node to
+ // the worklist if ready.
+ SDNode *M = AnalyzeNewNode(N);
+ if (M == N)
+ // The node didn't morph - nothing special to do, it will be revisited.
+ continue;
+
+ // The node morphed - this is equivalent to legalizing by replacing every
+ // value of N with the corresponding value of M. So do that now. However
+ // there is no need to remember the replacement - morphing will make sure
+ // it is never used non-trivially.
+ assert(N->getNumValues() == M->getNumValues() &&
+ "Node morphing changed the number of results!");
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+ // Replacing the value takes care of remapping the new value. Do the
+ // replacement without recording it in ReplacedValues. This does not
+ // expunge From but that is fine - it is not really a new node.
+ ReplaceValueWithHelper(SDValue(N, i), SDValue(M, i));
+ assert(N->getNodeId() == NewNode && "Unexpected node state!");
+ // The node continues to live on as part of the NewNode fungus that
+ // grows on top of the useful nodes. Nothing more needs to be done
+ // with it - move on to the next node.
+ continue;
+ }
+
+ if (i == NumOperands) {
+ DEBUG(cerr << "Legally typed node: "; N->dump(&DAG); cerr << "\n");
+ }
+ }
+NodeDone:
+
+ // If we reach here, the node was processed, potentially creating new nodes.
+ // Mark it as processed and add its users to the worklist as appropriate.
+ assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+ N->setNodeId(Processed);
+
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+ UI != E; ++UI) {
+ SDNode *User = *UI;
+ int NodeId = User->getNodeId();
+
+ // There are three possibilities: the user's NodeId may be a positive count
+ // of its operands that are not yet ready, or the user may be a new node,
+ // or it may not have been analyzed yet.
+ if (NodeId > 0) {
+ User->setNodeId(NodeId-1);
+
+ // If this was the last use it was waiting on, add it to the ready list.
+ if (NodeId-1 == ReadyToProcess)
+ Worklist.push_back(User);
+ continue;
+ }
+
+ // If this is an unreachable new node, then ignore it. If it ever becomes
+ // reachable by being used by a newly created node then it will be handled
+ // by AnalyzeNewNode.
+ if (NodeId == NewNode)
+ continue;
+
+ // Otherwise, this user has not been analyzed yet: this is the first of its
+ // operands to become ready. Its new NodeId is the number of operands it
+ // has minus 1 (one operand, N, has just been processed).
+ assert(NodeId == Unanalyzed && "Unknown node ID!");
+ User->setNodeId(User->getNumOperands() - 1);
+
+ // If the node only has a single operand, it is now ready.
+ if (User->getNumOperands() == 1)
+ Worklist.push_back(User);
+ }
+ }
+
+#ifndef XDEBUG
+ if (EnableExpensiveChecks)
+#endif
+ PerformExpensiveChecks();
+
+ // If the root changed (e.g. it was a dead load) update the root.
+ DAG.setRoot(Dummy.getValue());
+
+ // Remove dead nodes. This is important to do both for cleanliness and before
+ // the checking loop below. Implicit folding by the DAG.getNode operators and
+ // node morphing can leave unreachable nodes around with their NodeId still
+ // set to NewNode.
+ DAG.RemoveDeadNodes();
+
+ // In a debug build, scan all the nodes to make sure we found them all. This
+ // ensures that there are no cycles and that everything got processed.
+#ifndef NDEBUG
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = DAG.allnodes_end(); I != E; ++I) {
+ bool Failed = false;
+
+ // Check that all result types are legal.
+ if (!IgnoreNodeResults(I))
+ for (unsigned i = 0, NumVals = I->getNumValues(); i < NumVals; ++i)
+ if (!isTypeLegal(I->getValueType(i))) {
+ cerr << "Result type " << i << " illegal!\n";
+ Failed = true;
+ }
+
+ // Check that all operand types are legal.
+ for (unsigned i = 0, NumOps = I->getNumOperands(); i < NumOps; ++i)
+ if (!IgnoreNodeResults(I->getOperand(i).getNode()) &&
+ !isTypeLegal(I->getOperand(i).getValueType())) {
+ cerr << "Operand type " << i << " illegal!\n";
+ Failed = true;
+ }
+
+ if (I->getNodeId() != Processed) {
+ if (I->getNodeId() == NewNode)
+ cerr << "New node not analyzed?\n";
+ else if (I->getNodeId() == Unanalyzed)
+ cerr << "Unanalyzed node not noticed?\n";
+ else if (I->getNodeId() > 0)
+ cerr << "Operand not processed?\n";
+ else if (I->getNodeId() == ReadyToProcess)
+ cerr << "Not added to worklist?\n";
+ Failed = true;
+ }
+
+ if (Failed) {
+ I->dump(&DAG); cerr << "\n";
+ abort();
+ }
+ }
+#endif
+
+ return Changed;
+}
+
+/// AnalyzeNewNode - The specified node is the root of a subtree of potentially
+/// new nodes. Correct any processed operands (this may change the node) and
+/// calculate the NodeId. If the node itself changes to a processed node, it
+/// is not remapped - the caller needs to take care of this.
+/// Returns the potentially changed node.
+SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
+ // If this was an existing node that is already done, we're done.
+ if (N->getNodeId() != NewNode && N->getNodeId() != Unanalyzed)
+ return N;
+
+ // Remove any stale map entries.
+ ExpungeNode(N);
+
+ // Okay, we know that this node is new. Recursively walk all of its operands
+ // to see if they are new also. The depth of this walk is bounded by the size
+ // of the new tree that was constructed (usually 2-3 nodes), so we don't worry
+ // about revisiting of nodes.
+ //
+ // As we walk the operands, keep track of the number of nodes that are
+ // processed. If non-zero, this will become the new nodeid of this node.
+ // Operands may morph when they are analyzed. If so, the node will be
+ // updated after all operands have been analyzed. Since this is rare,
+ // the code tries to minimize overhead in the non-morphing case.
+
+ SmallVector<SDValue, 8> NewOps;
+ unsigned NumProcessed = 0;
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDValue OrigOp = N->getOperand(i);
+ SDValue Op = OrigOp;
+
+ AnalyzeNewValue(Op); // Op may morph.
+
+ if (Op.getNode()->getNodeId() == Processed)
+ ++NumProcessed;
+
+ if (!NewOps.empty()) {
+ // Some previous operand changed. Add this one to the list.
+ NewOps.push_back(Op);
+ } else if (Op != OrigOp) {
+ // This is the first operand to change - add all operands so far.
+ for (unsigned j = 0; j < i; ++j)
+ NewOps.push_back(N->getOperand(j));
+ NewOps.push_back(Op);
+ }
+ }
+
+ // Some operands changed - update the node.
+ if (!NewOps.empty()) {
+ SDNode *M = DAG.UpdateNodeOperands(SDValue(N, 0), &NewOps[0],
+ NewOps.size()).getNode();
+ if (M != N) {
+ // The node morphed into a different node. Normally for this to happen
+ // the original node would have to be marked NewNode. However this can
+ // in theory momentarily not be the case while ReplaceValueWith is doing
+ // its stuff. Mark the original node NewNode to help sanity checking.
+ N->setNodeId(NewNode);
+ if (M->getNodeId() != NewNode && M->getNodeId() != Unanalyzed)
+ // It morphed into a previously analyzed node - nothing more to do.
+ return M;
+
+ // It morphed into a different new node. Do the equivalent of passing
+ // it to AnalyzeNewNode: expunge it and calculate the NodeId. No need
+ // to remap the operands, since they are the same as the operands we
+ // remapped above.
+ N = M;
+ ExpungeNode(N);
+ }
+ }
+
+ // Calculate the NodeId.
+ N->setNodeId(N->getNumOperands() - NumProcessed);
+ if (N->getNodeId() == ReadyToProcess)
+ Worklist.push_back(N);
+
+ return N;
+}
+
+/// AnalyzeNewValue - Call AnalyzeNewNode, updating the node in Val if needed.
+/// If the node changes to a processed node, then remap it.
+void DAGTypeLegalizer::AnalyzeNewValue(SDValue &Val) {
+ Val.setNode(AnalyzeNewNode(Val.getNode()));
+ if (Val.getNode()->getNodeId() == Processed)
+ // We were passed a processed node, or it morphed into one - remap it.
+ RemapValue(Val);
+}
+
+/// ExpungeNode - If N has a bogus mapping in ReplacedValues, eliminate it.
+/// This can occur when a node is deleted then reallocated as a new node -
+/// the mapping in ReplacedValues applies to the deleted node, not the new
+/// one.
+/// The only map that can have a deleted node as a source is ReplacedValues.
+/// Other maps can have deleted nodes as targets, but since their looked-up
+/// values are always immediately remapped using RemapValue, resulting in a
+/// not-deleted node, this is harmless as long as ReplacedValues/RemapValue
+/// always performs correct mappings. In order to keep the mapping correct,
+/// ExpungeNode should be called on any new nodes *before* adding them as
+/// either source or target to ReplacedValues (which typically means calling
+/// Expunge when a new node is first seen, since it may no longer be marked
+/// NewNode by the time it is added to ReplacedValues).
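+/// For example (illustrative): if node A is deleted and its memory is reused
+/// for a new node B, a stale entry "A -> X" in ReplacedValues would now read
+/// as "B -> X" and silently corrupt later remappings unless B is expunged.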
+void DAGTypeLegalizer::ExpungeNode(SDNode *N) {
+ if (N->getNodeId() != NewNode)
+ return;
+
+ // If N is not remapped by ReplacedValues then there is nothing to do.
+ unsigned i, e;
+ for (i = 0, e = N->getNumValues(); i != e; ++i)
+ if (ReplacedValues.find(SDValue(N, i)) != ReplacedValues.end())
+ break;
+
+ if (i == e)
+ return;
+
+ // Remove N from all maps - this is expensive but rare.
+
+ for (DenseMap<SDValue, SDValue>::iterator I = PromotedIntegers.begin(),
+ E = PromotedIntegers.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second);
+ }
+
+ for (DenseMap<SDValue, SDValue>::iterator I = SoftenedFloats.begin(),
+ E = SoftenedFloats.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second);
+ }
+
+ for (DenseMap<SDValue, SDValue>::iterator I = ScalarizedVectors.begin(),
+ E = ScalarizedVectors.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second);
+ }
+
+ for (DenseMap<SDValue, SDValue>::iterator I = WidenedVectors.begin(),
+ E = WidenedVectors.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second);
+ }
+
+ for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+ I = ExpandedIntegers.begin(), E = ExpandedIntegers.end(); I != E; ++I){
+ assert(I->first.getNode() != N);
+ RemapValue(I->second.first);
+ RemapValue(I->second.second);
+ }
+
+ for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+ I = ExpandedFloats.begin(), E = ExpandedFloats.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second.first);
+ RemapValue(I->second.second);
+ }
+
+ for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+ I = SplitVectors.begin(), E = SplitVectors.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second.first);
+ RemapValue(I->second.second);
+ }
+
+ for (DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.begin(),
+ E = ReplacedValues.end(); I != E; ++I)
+ RemapValue(I->second);
+
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+ ReplacedValues.erase(SDValue(N, i));
+}
+
+/// RemapValue - If the specified value was already legalized to another value,
+/// replace it by that value.
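+/// Mapping chains are compressed as they are followed: given entries A -> B
+/// and B -> C in ReplacedValues, remapping A rewrites the stored target to C
+/// (leaving A -> C), so later lookups of A resolve in a single step.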
+void DAGTypeLegalizer::RemapValue(SDValue &N) {
+ DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(N);
+ if (I != ReplacedValues.end()) {
+    // Use path compression to speed up future lookups if values get replaced
+    // with other values multiple times.
+ RemapValue(I->second);
+ N = I->second;
+ assert(N.getNode()->getNodeId() != NewNode && "Mapped to new node!");
+ }
+}
+
+namespace {
+ /// NodeUpdateListener - This class is a DAGUpdateListener that listens for
+ /// updates to nodes and recomputes their ready state.
+ class VISIBILITY_HIDDEN NodeUpdateListener :
+ public SelectionDAG::DAGUpdateListener {
+ DAGTypeLegalizer &DTL;
+ SmallSetVector<SDNode*, 16> &NodesToAnalyze;
+ public:
+ explicit NodeUpdateListener(DAGTypeLegalizer &dtl,
+ SmallSetVector<SDNode*, 16> &nta)
+ : DTL(dtl), NodesToAnalyze(nta) {}
+
+ virtual void NodeDeleted(SDNode *N, SDNode *E) {
+ assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+ N->getNodeId() != DAGTypeLegalizer::Processed &&
+ "Invalid node ID for RAUW deletion!");
+ // It is possible, though rare, for the deleted node N to occur as a
+ // target in a map, so note the replacement N -> E in ReplacedValues.
+ assert(E && "Node not replaced?");
+ DTL.NoteDeletion(N, E);
+
+ // In theory the deleted node could also have been scheduled for analysis.
+ // So remove it from the set of nodes which will be analyzed.
+ NodesToAnalyze.remove(N);
+
+ // In general nothing needs to be done for E, since it didn't change but
+ // only gained new uses. However N -> E was just added to ReplacedValues,
+ // and the result of a ReplacedValues mapping is not allowed to be marked
+ // NewNode. So if E is marked NewNode, then it needs to be analyzed.
+ if (E->getNodeId() == DAGTypeLegalizer::NewNode)
+ NodesToAnalyze.insert(E);
+ }
+
+ virtual void NodeUpdated(SDNode *N) {
+      // Node updates can mean pretty much anything. It is possible, for
+      // example, that an operand was set to something already processed, in
+      // which case this node could become ready. Recompute its flags.
+      assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+             N->getNodeId() != DAGTypeLegalizer::Processed &&
+             "Invalid node ID for RAUW update!");
+ N->setNodeId(DAGTypeLegalizer::NewNode);
+ NodesToAnalyze.insert(N);
+ }
+ };
+}
+
+
+/// ReplaceValueWithHelper - Internal helper for ReplaceValueWith. Updates the
+/// DAG causing any uses of From to use To instead, but without expunging From
+/// or recording the replacement in ReplacedValues. Do not call directly unless
+/// you really know what you are doing!
+void DAGTypeLegalizer::ReplaceValueWithHelper(SDValue From, SDValue To) {
+ assert(From.getNode() != To.getNode() && "Potential legalization loop!");
+
+ // If expansion produced new nodes, make sure they are properly marked.
+ AnalyzeNewValue(To); // Expunges To.
+
+ // Anything that used the old node should now use the new one. Note that this
+ // can potentially cause recursive merging.
+ SmallSetVector<SDNode*, 16> NodesToAnalyze;
+ NodeUpdateListener NUL(*this, NodesToAnalyze);
+ DAG.ReplaceAllUsesOfValueWith(From, To, &NUL);
+
+ // Process the list of nodes that need to be reanalyzed.
+ while (!NodesToAnalyze.empty()) {
+ SDNode *N = NodesToAnalyze.back();
+ NodesToAnalyze.pop_back();
+ if (N->getNodeId() != DAGTypeLegalizer::NewNode)
+ // The node was analyzed while reanalyzing an earlier node - it is safe to
+ // skip. Note that this is not a morphing node - otherwise it would still
+ // be marked NewNode.
+ continue;
+
+ // Analyze the node's operands and recalculate the node ID.
+ SDNode *M = AnalyzeNewNode(N);
+ if (M != N) {
+ // The node morphed into a different node. Make everyone use the new node
+ // instead.
+ assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!");
+ assert(N->getNumValues() == M->getNumValues() &&
+ "Node morphing changed the number of results!");
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ SDValue OldVal(N, i);
+ SDValue NewVal(M, i);
+ if (M->getNodeId() == Processed)
+ RemapValue(NewVal);
+ DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL);
+ }
+ // The original node continues to exist in the DAG, marked NewNode.
+ }
+ }
+}
+
+/// ReplaceValueWith - The specified value was legalized to the specified other
+/// value. Update the DAG and NodeIds so that any uses of From use To
+/// instead.
+void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
+ assert(From.getNode()->getNodeId() == ReadyToProcess &&
+ "Only the node being processed may be remapped!");
+
+ // If expansion produced new nodes, make sure they are properly marked.
+ ExpungeNode(From.getNode());
+ AnalyzeNewValue(To); // Expunges To.
+
+ // The old node may still be present in a map like ExpandedIntegers or
+ // PromotedIntegers. Inform maps about the replacement.
+ ReplacedValues[From] = To;
+
+ // Do the replacement.
+ ReplaceValueWithHelper(From, To);
+}
+
+void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = PromotedIntegers[Op];
+ assert(OpEntry.getNode() == 0 && "Node is already promoted!");
+ OpEntry = Result;
+}
+
+void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = SoftenedFloats[Op];
+ assert(OpEntry.getNode() == 0 && "Node is already converted to integer!");
+ OpEntry = Result;
+}
+
+void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = ScalarizedVectors[Op];
+ assert(OpEntry.getNode() == 0 && "Node is already scalarized!");
+ OpEntry = Result;
+}
+
+void DAGTypeLegalizer::GetExpandedInteger(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
+ RemapValue(Entry.first);
+ RemapValue(Entry.second);
+ assert(Entry.first.getNode() && "Operand isn't expanded");
+ Lo = Entry.first;
+ Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+  // Lo/Hi may be newly allocated nodes; if so, assign NodeIds as needed.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ // Remember that this is the result of the node.
+ std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
+ assert(Entry.first.getNode() == 0 && "Node already expanded");
+ Entry.first = Lo;
+ Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::GetExpandedFloat(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
+ RemapValue(Entry.first);
+ RemapValue(Entry.second);
+ assert(Entry.first.getNode() && "Operand isn't expanded");
+ Lo = Entry.first;
+ Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+  // Lo/Hi may be newly allocated nodes; if so, assign NodeIds as needed.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ // Remember that this is the result of the node.
+ std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
+ assert(Entry.first.getNode() == 0 && "Node already expanded");
+ Entry.first = Lo;
+ Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::GetSplitVector(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
+ RemapValue(Entry.first);
+ RemapValue(Entry.second);
+ assert(Entry.first.getNode() && "Operand isn't split");
+ Lo = Entry.first;
+ Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+  // Lo/Hi may be newly allocated nodes; if so, assign NodeIds as needed.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ // Remember that this is the result of the node.
+ std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
+ assert(Entry.first.getNode() == 0 && "Node already split");
+ Entry.first = Lo;
+ Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = WidenedVectors[Op];
+ assert(OpEntry.getNode() == 0 && "Node already widened!");
+ OpEntry = Result;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Utilities.
+//===----------------------------------------------------------------------===//
+
+/// BitConvertToInteger - Convert to an integer of the same size.
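+/// For example (illustrative): an f32 is reinterpreted as an i32 with the
+/// same bit pattern, and a v2f32 as an i64.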
+SDValue DAGTypeLegalizer::BitConvertToInteger(SDValue Op) {
+ unsigned BitWidth = Op.getValueType().getSizeInBits();
+ return DAG.getNode(ISD::BIT_CONVERT, Op.getDebugLoc(),
+ MVT::getIntegerVT(BitWidth), Op);
+}
+
+/// BitConvertVectorToIntegerVector - Convert to a vector of integers of the
+/// same size.
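+/// For example (illustrative): a v4f32 becomes a v4i32 with identical bits.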
+SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) {
+ assert(Op.getValueType().isVector() && "Only applies to vectors!");
+ unsigned EltWidth = Op.getValueType().getVectorElementType().getSizeInBits();
+ MVT EltNVT = MVT::getIntegerVT(EltWidth);
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ return DAG.getNode(ISD::BIT_CONVERT, Op.getDebugLoc(),
+ MVT::getVectorVT(EltNVT, NumElts), Op);
+}
+
+SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
+ MVT DestVT) {
+ DebugLoc dl = Op.getDebugLoc();
+ // Create the stack frame object. Make sure it is aligned for both
+ // the source and destination types.
+ SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
+ // Emit a store to the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, NULL, 0);
+ // Result is a load from the stack slot.
+ return DAG.getLoad(DestVT, dl, Store, StackPtr, NULL, 0);
+}
+
+/// CustomLowerNode - Replace the node's results with custom code provided
+/// by the target and return "true", or do nothing and return "false".
+/// If LegalizeResult is true, the node has illegal result types and VT is the
+/// type of the illegal result. Otherwise the node has legal result types but
+/// an illegal operand, and VT is the type of that illegal operand.
+bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, MVT VT, bool LegalizeResult) {
+ // See if the target wants to custom lower this node.
+ if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+ return false;
+
+ SmallVector<SDValue, 8> Results;
+ if (LegalizeResult)
+ TLI.ReplaceNodeResults(N, Results, DAG);
+ else
+ TLI.LowerOperationWrapper(N, Results, DAG);
+
+ if (Results.empty())
+ // The target didn't want to custom lower it after all.
+ return false;
+
+ // Make everything that once used N's values now use those in Results instead.
+ assert(Results.size() == N->getNumValues() &&
+ "Custom lowering returned the wrong number of results!");
+ for (unsigned i = 0, e = Results.size(); i != e; ++i)
+ ReplaceValueWith(SDValue(N, i), Results[i]);
+ return true;
+}
+
+/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+/// which is split into two not necessarily identical pieces.
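+/// For example (illustrative): v8i32 splits evenly into two v4i32's, while
+/// the non-power-of-two v7i32 splits into LoVT = v4i32 and HiVT = v3i32.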
+void DAGTypeLegalizer::GetSplitDestVTs(MVT InVT, MVT &LoVT, MVT &HiVT) {
+ if (!InVT.isVector()) {
+ LoVT = HiVT = TLI.getTypeToTransformTo(InVT);
+ } else {
+ MVT NewEltVT = InVT.getVectorElementType();
+ unsigned NumElements = InVT.getVectorNumElements();
+ if ((NumElements & (NumElements-1)) == 0) { // Simple power of two vector.
+ NumElements >>= 1;
+ LoVT = HiVT = MVT::getVectorVT(NewEltVT, NumElements);
+ } else { // Non-power-of-two vectors.
+ unsigned NewNumElts_Lo = 1 << Log2_32(NumElements);
+ unsigned NewNumElts_Hi = NumElements - NewNumElts_Lo;
+ LoVT = MVT::getVectorVT(NewEltVT, NewNumElts_Lo);
+ HiVT = MVT::getVectorVT(NewEltVT, NewNumElts_Hi);
+ }
+ }
+}
+
+/// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
+/// high parts of the given value.
+void DAGTypeLegalizer::GetPairElements(SDValue Pair,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = Pair.getDebugLoc();
+ MVT NVT = TLI.getTypeToTransformTo(Pair.getValueType());
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
+ DAG.getIntPtrConstant(0));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
+ DAG.getIntPtrConstant(1));
+}
+
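+/// GetVectorElementPointer - Compute the address of the Index'th element of
+/// the vector starting at VecPtr, where each element has type EltVT. As an
+/// illustration, element 3 of a vector of i32 (4 bytes per element) lives at
+/// byte offset 12 from VecPtr.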
+SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, MVT EltVT,
+ SDValue Index) {
+ DebugLoc dl = Index.getDebugLoc();
+ // Make sure the index type is big enough to compute in.
+ if (Index.getValueType().bitsGT(TLI.getPointerTy()))
+ Index = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Index);
+ else
+ Index = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Index);
+
+ // Calculate the element offset and add it to the pointer.
+ unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
+
+ Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
+ DAG.getConstant(EltSize, Index.getValueType()));
+ return DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, VecPtr);
+}
+
+/// JoinIntegers - Build an integer with low bits Lo and high bits Hi.
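+/// For example (illustrative): joining an i32 Lo = 0x89ABCDEF with an i32
+/// Hi = 0x01234567 yields the i64 0x0123456789ABCDEF: Hi is shifted left by
+/// 32 and OR'd with the zero-extended Lo.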
+SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
+ // Arbitrarily use dlHi for result DebugLoc
+ DebugLoc dlHi = Hi.getDebugLoc();
+ DebugLoc dlLo = Lo.getDebugLoc();
+ MVT LVT = Lo.getValueType();
+ MVT HVT = Hi.getValueType();
+ MVT NVT = MVT::getIntegerVT(LVT.getSizeInBits() + HVT.getSizeInBits());
+
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi);
+ Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi,
+ DAG.getConstant(LVT.getSizeInBits(), TLI.getPointerTy()));
+ return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
+}
+
+/// LibCallify - Convert the node into a libcall with the same prototype.
+SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
+ bool isSigned) {
+ unsigned NumOps = N->getNumOperands();
+ DebugLoc dl = N->getDebugLoc();
+ if (NumOps == 0) {
+ return MakeLibCall(LC, N->getValueType(0), 0, 0, isSigned, dl);
+ } else if (NumOps == 1) {
+ SDValue Op = N->getOperand(0);
+ return MakeLibCall(LC, N->getValueType(0), &Op, 1, isSigned, dl);
+ } else if (NumOps == 2) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ return MakeLibCall(LC, N->getValueType(0), Ops, 2, isSigned, dl);
+ }
+ SmallVector<SDValue, 8> Ops(NumOps);
+ for (unsigned i = 0; i < NumOps; ++i)
+ Ops[i] = N->getOperand(i);
+
+ return MakeLibCall(LC, N->getValueType(0), &Ops[0], NumOps, isSigned, dl);
+}
+
+/// MakeLibCall - Generate a libcall taking the given operands as arguments and
+/// returning a result of type RetVT.
+SDValue DAGTypeLegalizer::MakeLibCall(RTLIB::Libcall LC, MVT RetVT,
+ const SDValue *Ops, unsigned NumOps,
+ bool isSigned, DebugLoc dl) {
+ TargetLowering::ArgListTy Args;
+ Args.reserve(NumOps);
+
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ Entry.Node = Ops[i];
+ Entry.Ty = Entry.Node.getValueType().getTypeForMVT();
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy());
+
+ const Type *RetTy = RetVT.getTypeForMVT();
+ std::pair<SDValue,SDValue> CallInfo =
+ TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+ false, CallingConv::C, false, Callee, Args, DAG, dl);
+ return CallInfo.first;
+}
+
+/// PromoteTargetBoolean - Promote the given target boolean to a target boolean
+/// of the given type. A target boolean is an integer value, not necessarily of
+/// type i1, the bits of which conform to getBooleanContents.
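+/// For example (illustrative): promoting an i1 "true" to i32 yields 1 under
+/// ZeroOrOneBooleanContent, but 0xFFFFFFFF under
+/// ZeroOrNegativeOneBooleanContent, since the sign bit is copied.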
+SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, MVT VT) {
+ DebugLoc dl = Bool.getDebugLoc();
+ ISD::NodeType ExtendCode;
+ switch (TLI.getBooleanContents()) {
+ default:
+ assert(false && "Unknown BooleanContent!");
+ case TargetLowering::UndefinedBooleanContent:
+ // Extend to VT by adding rubbish bits.
+ ExtendCode = ISD::ANY_EXTEND;
+ break;
+ case TargetLowering::ZeroOrOneBooleanContent:
+ // Extend to VT by adding zero bits.
+ ExtendCode = ISD::ZERO_EXTEND;
+ break;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent: {
+ // Extend to VT by copying the sign bit.
+ ExtendCode = ISD::SIGN_EXTEND;
+ break;
+ }
+ }
+ return DAG.getNode(ExtendCode, dl, VT, Bool);
+}
+
+/// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT
+/// bits in Hi.
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+ MVT LoVT, MVT HiVT,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = Op.getDebugLoc();
+ assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
+ Op.getValueType().getSizeInBits() && "Invalid integer splitting!");
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op);
+ Hi = DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op,
+ DAG.getConstant(LoVT.getSizeInBits(), TLI.getPointerTy()));
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
+}
+
+/// SplitInteger - Return the lower and upper halves of Op's bits in a value
+/// type half the size of Op's.
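+/// For example (illustrative): an i64 Op splits into two i32's, with
+/// Lo = trunc(Op) and Hi = trunc(Op >> 32).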
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+ SDValue &Lo, SDValue &Hi) {
+ MVT HalfVT = MVT::getIntegerVT(Op.getValueType().getSizeInBits()/2);
+ SplitInteger(Op, HalfVT, HalfVT, Lo, Hi);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Entry Point
+//===----------------------------------------------------------------------===//
+
+/// LegalizeTypes - This transforms the SelectionDAG into a SelectionDAG that
+/// only uses types natively supported by the target. Returns "true" if it made
+/// any changes.
+///
+/// Note that this is an involved process that may invalidate pointers into
+/// the graph.
+bool SelectionDAG::LegalizeTypes() {
+ return DAGTypeLegalizer(*this).run();
+}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
new file mode 100644
index 0000000..75c8924
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -0,0 +1,736 @@
+//===-- LegalizeTypes.h - Definition of the DAG Type Legalizer class ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DAGTypeLegalizer class. This is a private interface
+// shared by the code that implements the SelectionDAG::LegalizeTypes
+// method.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SELECTIONDAG_LEGALIZETYPES_H
+#define SELECTIONDAG_LEGALIZETYPES_H
+
+#define DEBUG_TYPE "legalize-types"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+/// DAGTypeLegalizer - This takes an arbitrary SelectionDAG as input and hacks
+/// on it until only value types the target machine can handle are left. This
+/// involves promoting small sizes to large sizes or splitting up large values
+/// into small values.
+///
+class VISIBILITY_HIDDEN DAGTypeLegalizer {
+ TargetLowering &TLI;
+ SelectionDAG &DAG;
+public:
+ // NodeIdFlags - This pass uses the NodeId on the SDNodes to hold information
+  // about the state of the node. The enum below lists the possible states.
+ enum NodeIdFlags {
+ /// ReadyToProcess - All operands have been processed, so this node is ready
+ /// to be handled.
+ ReadyToProcess = 0,
+
+ /// NewNode - This is a new node, not before seen, that was created in the
+ /// process of legalizing some other node.
+ NewNode = -1,
+
+ /// Unanalyzed - This node's ID needs to be set to the number of its
+ /// unprocessed operands.
+ Unanalyzed = -2,
+
+ /// Processed - This is a node that has already been processed.
+ Processed = -3
+
+ // 1+ - This is a node which has this many unprocessed operands.
+ };
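+
+  // Illustrative lifecycle (a sketch of how these flags are used below): a
+  // node created during legalization starts out as NewNode; AnalyzeNewNode
+  // sets its ID to the number of still-unprocessed operands, making it
+  // ReadyToProcess once that count reaches zero; after being legalized it is
+  // marked Processed.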
+private:
+ enum LegalizeAction {
+ Legal, // The target natively supports this type.
+ PromoteInteger, // Replace this integer type with a larger one.
+ ExpandInteger, // Split this integer type into two of half the size.
+ SoftenFloat, // Convert this float type to a same size integer type.
+ ExpandFloat, // Split this float type into two of half the size.
+ ScalarizeVector, // Replace this one-element vector with its element type.
+ SplitVector, // This vector type should be split into smaller vectors.
+ WidenVector // This vector type should be widened into a larger vector.
+ };
+
+ /// ValueTypeActions - This is a bitvector that contains two bits for each
+ /// simple value type, where the two bits correspond to the LegalizeAction
+ /// enum from TargetLowering. This can be queried with "getTypeAction(VT)".
+ TargetLowering::ValueTypeActionImpl ValueTypeActions;
+
+ /// getTypeAction - Return how we should legalize values of this type.
+ LegalizeAction getTypeAction(MVT VT) const {
+ switch (ValueTypeActions.getTypeAction(VT)) {
+ default:
+ assert(false && "Unknown legalize action!");
+ case TargetLowering::Legal:
+ return Legal;
+ case TargetLowering::Promote:
+ // Promote can mean
+ // 1) For integers, use a larger integer type (e.g. i8 -> i32).
+ // 2) For vectors, use a wider vector type (e.g. v3i32 -> v4i32).
+ if (!VT.isVector())
+ return PromoteInteger;
+ else
+ return WidenVector;
+ case TargetLowering::Expand:
+ // Expand can mean
+ // 1) split scalar in half, 2) convert a float to an integer,
+ // 3) scalarize a single-element vector, 4) split a vector in two.
+ if (!VT.isVector()) {
+ if (VT.isInteger())
+ return ExpandInteger;
+ else if (VT.getSizeInBits() ==
+ TLI.getTypeToTransformTo(VT).getSizeInBits())
+ return SoftenFloat;
+ else
+ return ExpandFloat;
+ } else if (VT.getVectorNumElements() == 1) {
+ return ScalarizeVector;
+ } else {
+ return SplitVector;
+ }
+ }
+ }
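+
+  // For instance (an illustrative, target-dependent sketch): a 32-bit target
+  // usually maps i64 to ExpandInteger and v1i64 to ScalarizeVector, while a
+  // soft-float target maps f64 to SoftenFloat since its transformed type
+  // (i64) has the same size.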
+
+ /// isTypeLegal - Return true if this type is legal on this target.
+ bool isTypeLegal(MVT VT) const {
+ return ValueTypeActions.getTypeAction(VT) == TargetLowering::Legal;
+ }
+
+ /// IgnoreNodeResults - Pretend all of this node's results are legal.
+ bool IgnoreNodeResults(SDNode *N) const {
+ return N->getOpcode() == ISD::TargetConstant;
+ }
+
+ /// PromotedIntegers - For integer nodes that are below legal width, this map
+ /// indicates what promoted value to use.
+ DenseMap<SDValue, SDValue> PromotedIntegers;
+
+  /// ExpandedIntegers - For integer nodes that need to be expanded this map
+  /// indicates the Lo/Hi pair that is the expanded version of the input.
+ DenseMap<SDValue, std::pair<SDValue, SDValue> > ExpandedIntegers;
+
+ /// SoftenedFloats - For floating point nodes converted to integers of
+ /// the same size, this map indicates the converted value to use.
+ DenseMap<SDValue, SDValue> SoftenedFloats;
+
+  /// ExpandedFloats - For float nodes that need to be expanded this map
+  /// indicates the Lo/Hi pair that is the expanded version of the input.
+ DenseMap<SDValue, std::pair<SDValue, SDValue> > ExpandedFloats;
+
+ /// ScalarizedVectors - For nodes that are <1 x ty>, this map indicates the
+ /// scalar value of type 'ty' to use.
+ DenseMap<SDValue, SDValue> ScalarizedVectors;
+
+  /// SplitVectors - For vector nodes that need to be split this map indicates
+  /// the Lo/Hi pair that is the split version of the input.
+ DenseMap<SDValue, std::pair<SDValue, SDValue> > SplitVectors;
+
+ /// WidenedVectors - For vector nodes that need to be widened, indicates
+ /// the widened value to use.
+ DenseMap<SDValue, SDValue> WidenedVectors;
+
+ /// ReplacedValues - For values that have been replaced with another,
+ /// indicates the replacement value to use.
+ DenseMap<SDValue, SDValue> ReplacedValues;
+
+ /// Worklist - This defines a worklist of nodes to process. In order to be
+ /// pushed onto this worklist, all operands of a node must have already been
+ /// processed.
+ SmallVector<SDNode*, 128> Worklist;
+
+public:
+ explicit DAGTypeLegalizer(SelectionDAG &dag)
+ : TLI(dag.getTargetLoweringInfo()), DAG(dag),
+ ValueTypeActions(TLI.getValueTypeActions()) {
+ assert(MVT::LAST_VALUETYPE <= 32 &&
+ "Too many value types for ValueTypeActions to hold!");
+ }
+
+ /// run - This is the main entry point for the type legalizer. This does a
+ /// top-down traversal of the dag, legalizing types as it goes. Returns
+ /// "true" if it made any changes.
+ bool run();
+
+ void NoteDeletion(SDNode *Old, SDNode *New) {
+ ExpungeNode(Old);
+ ExpungeNode(New);
+ for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i)
+ ReplacedValues[SDValue(Old, i)] = SDValue(New, i);
+ }
+
+private:
+ SDNode *AnalyzeNewNode(SDNode *N);
+ void AnalyzeNewValue(SDValue &Val);
+ void ExpungeNode(SDNode *N);
+ void PerformExpensiveChecks();
+ void RemapValue(SDValue &N);
+
+ // Common routines.
+ SDValue BitConvertToInteger(SDValue Op);
+ SDValue BitConvertVectorToIntegerVector(SDValue Op);
+ SDValue CreateStackStoreLoad(SDValue Op, MVT DestVT);
+ bool CustomLowerNode(SDNode *N, MVT VT, bool LegalizeResult);
+ SDValue GetVectorElementPointer(SDValue VecPtr, MVT EltVT, SDValue Index);
+ SDValue JoinIntegers(SDValue Lo, SDValue Hi);
+ SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned);
+ SDValue MakeLibCall(RTLIB::Libcall LC, MVT RetVT,
+ const SDValue *Ops, unsigned NumOps, bool isSigned,
+ DebugLoc dl);
+ SDValue PromoteTargetBoolean(SDValue Bool, MVT VT);
+ void ReplaceValueWith(SDValue From, SDValue To);
+ void ReplaceValueWithHelper(SDValue From, SDValue To);
+ void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SplitInteger(SDValue Op, MVT LoVT, MVT HiVT,
+ SDValue &Lo, SDValue &Hi);
+
+ //===--------------------------------------------------------------------===//
+ // Integer Promotion Support: LegalizeIntegerTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// GetPromotedInteger - Given a processed operand Op which was promoted to a
+ /// larger integer type, this returns the promoted value. The low bits of the
+ /// promoted value corresponding to the original type are exactly equal to Op.
+ /// The extra bits contain rubbish, so the promoted value may need to be zero-
+ /// or sign-extended from the original type before it is usable (the helpers
+ /// SExtPromotedInteger and ZExtPromotedInteger can do this for you).
+ /// For example, if Op is an i16 and was promoted to an i32, then this method
+ /// returns an i32, the lower 16 bits of which coincide with Op, and the upper
+ /// 16 bits of which contain rubbish.
+ SDValue GetPromotedInteger(SDValue Op) {
+ SDValue &PromotedOp = PromotedIntegers[Op];
+ RemapValue(PromotedOp);
+ assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+ return PromotedOp;
+ }
+ void SetPromotedInteger(SDValue Op, SDValue Result);
+
+ /// SExtPromotedInteger - Get a promoted operand and sign extend it to the
+ /// final size.
+ SDValue SExtPromotedInteger(SDValue Op) {
+ MVT OldVT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ Op = GetPromotedInteger(Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op,
+ DAG.getValueType(OldVT));
+ }
+
+ /// ZExtPromotedInteger - Get a promoted operand and zero extend it to the
+ /// final size.
+ SDValue ZExtPromotedInteger(SDValue Op) {
+ MVT OldVT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ Op = GetPromotedInteger(Op);
+ return DAG.getZeroExtendInReg(Op, dl, OldVT);
+ }
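+
+  // Illustrative: if an i8 value 0x80 was promoted to i32, the upper 24 bits
+  // of the promoted value are rubbish; SExtPromotedInteger yields 0xFFFFFF80
+  // while ZExtPromotedInteger yields 0x00000080.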
+
+ // Integer Result Promotion.
+ void PromoteIntegerResult(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_AssertSext(SDNode *N);
+ SDValue PromoteIntRes_AssertZext(SDNode *N);
+ SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
+ SDValue PromoteIntRes_Atomic2(AtomicSDNode *N);
+ SDValue PromoteIntRes_BIT_CONVERT(SDNode *N);
+ SDValue PromoteIntRes_BSWAP(SDNode *N);
+ SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
+ SDValue PromoteIntRes_Constant(SDNode *N);
+ SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N);
+ SDValue PromoteIntRes_CTLZ(SDNode *N);
+ SDValue PromoteIntRes_CTPOP(SDNode *N);
+ SDValue PromoteIntRes_CTTZ(SDNode *N);
+ SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
+ SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
+ SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+ SDValue PromoteIntRes_Overflow(SDNode *N);
+ SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_SDIV(SDNode *N);
+ SDValue PromoteIntRes_SELECT(SDNode *N);
+ SDValue PromoteIntRes_SELECT_CC(SDNode *N);
+ SDValue PromoteIntRes_SETCC(SDNode *N);
+ SDValue PromoteIntRes_SHL(SDNode *N);
+ SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N);
+ SDValue PromoteIntRes_SRA(SDNode *N);
+ SDValue PromoteIntRes_SRL(SDNode *N);
+ SDValue PromoteIntRes_TRUNCATE(SDNode *N);
+ SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_UDIV(SDNode *N);
+ SDValue PromoteIntRes_UNDEF(SDNode *N);
+ SDValue PromoteIntRes_VAARG(SDNode *N);
+ SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+
+ // Integer Operand Promotion.
+ bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
+ SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_BIT_CONVERT(SDNode *N);
+ SDValue PromoteIntOp_BUILD_PAIR(SDNode *N);
+ SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N);
+ SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N);
+ SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MEMBARRIER(SDNode *N);
+ SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N);
+ SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_Shift(SDNode *N);
+ SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_SINT_TO_FP(SDNode *N);
+ SDValue PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_TRUNCATE(SDNode *N);
+ SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
+ SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+
+ void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
+
+ //===--------------------------------------------------------------------===//
+ // Integer Expansion Support: LegalizeIntegerTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// GetExpandedInteger - Given a processed operand Op which was expanded into
+ /// two integers of half the size, this returns the two halves. The low bits
+ /// of Op are exactly equal to the bits of Lo; the high bits exactly equal Hi.
+ /// For example, if Op is an i64 which was expanded into two i32's, then this
+ /// method returns the two i32's, with Lo being equal to the lower 32 bits of
+ /// Op, and Hi being equal to the upper 32 bits.
+ void GetExpandedInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetExpandedInteger(SDValue Op, SDValue Lo, SDValue Hi);
+
+ // Integer Result Expansion.
+ void ExpandIntegerResult(SDNode *N, unsigned ResNo);
+ void ExpandIntRes_ANY_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_AssertSext (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_Constant (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTLZ (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SIGN_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_TRUNCATE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ZERO_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FP_TO_SINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FP_TO_UINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_Logical (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SREM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandShiftByConstant(SDNode *N, unsigned Amt,
+ SDValue &Lo, SDValue &Hi);
+ bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+ bool ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Integer Operand Expansion.
+ bool ExpandIntegerOperand(SDNode *N, unsigned OperandNo);
+ SDValue ExpandIntOp_BIT_CONVERT(SDNode *N);
+ SDValue ExpandIntOp_BR_CC(SDNode *N);
+ SDValue ExpandIntOp_BUILD_VECTOR(SDNode *N);
+ SDValue ExpandIntOp_EXTRACT_ELEMENT(SDNode *N);
+ SDValue ExpandIntOp_SELECT_CC(SDNode *N);
+ SDValue ExpandIntOp_SETCC(SDNode *N);
+ SDValue ExpandIntOp_Shift(SDNode *N);
+ SDValue ExpandIntOp_SINT_TO_FP(SDNode *N);
+ SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_TRUNCATE(SDNode *N);
+ SDValue ExpandIntOp_UINT_TO_FP(SDNode *N);
+
+ void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode, DebugLoc dl);
+
+ //===--------------------------------------------------------------------===//
+ // Float to Integer Conversion Support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// GetSoftenedFloat - Given a processed operand Op which was converted to an
+ /// integer of the same size, this returns the integer. The integer contains
+ /// exactly the same bits as Op - only the type changed. For example, if Op
+ /// is an f32 which was softened to an i32, then this method returns an i32,
+ /// the bits of which coincide with those of Op.
+ SDValue GetSoftenedFloat(SDValue Op) {
+ SDValue &SoftenedOp = SoftenedFloats[Op];
+ RemapValue(SoftenedOp);
+ assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?");
+ return SoftenedOp;
+ }
+ void SetSoftenedFloat(SDValue Op, SDValue Result);
+
+ // Result Float to Integer Conversion.
+ void SoftenFloatResult(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatRes_BIT_CONVERT(SDNode *N);
+ SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
+ SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N);
+ SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SoftenFloatRes_FABS(SDNode *N);
+ SDValue SoftenFloatRes_FADD(SDNode *N);
+ SDValue SoftenFloatRes_FCEIL(SDNode *N);
+ SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N);
+ SDValue SoftenFloatRes_FCOS(SDNode *N);
+ SDValue SoftenFloatRes_FDIV(SDNode *N);
+ SDValue SoftenFloatRes_FEXP(SDNode *N);
+ SDValue SoftenFloatRes_FEXP2(SDNode *N);
+ SDValue SoftenFloatRes_FFLOOR(SDNode *N);
+ SDValue SoftenFloatRes_FLOG(SDNode *N);
+ SDValue SoftenFloatRes_FLOG2(SDNode *N);
+ SDValue SoftenFloatRes_FLOG10(SDNode *N);
+ SDValue SoftenFloatRes_FMUL(SDNode *N);
+ SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
+ SDValue SoftenFloatRes_FNEG(SDNode *N);
+ SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
+ SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
+ SDValue SoftenFloatRes_FPOW(SDNode *N);
+ SDValue SoftenFloatRes_FPOWI(SDNode *N);
+ SDValue SoftenFloatRes_FREM(SDNode *N);
+ SDValue SoftenFloatRes_FRINT(SDNode *N);
+ SDValue SoftenFloatRes_FSIN(SDNode *N);
+ SDValue SoftenFloatRes_FSQRT(SDNode *N);
+ SDValue SoftenFloatRes_FSUB(SDNode *N);
+ SDValue SoftenFloatRes_FTRUNC(SDNode *N);
+ SDValue SoftenFloatRes_LOAD(SDNode *N);
+ SDValue SoftenFloatRes_SELECT(SDNode *N);
+ SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
+ SDValue SoftenFloatRes_UNDEF(SDNode *N);
+ SDValue SoftenFloatRes_VAARG(SDNode *N);
+ SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
+
+ // Operand Float to Integer Conversion.
+ bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatOp_BIT_CONVERT(SDNode *N);
+ SDValue SoftenFloatOp_BR_CC(SDNode *N);
+ SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
+ SDValue SoftenFloatOp_FP_TO_SINT(SDNode *N);
+ SDValue SoftenFloatOp_FP_TO_UINT(SDNode *N);
+ SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
+ SDValue SoftenFloatOp_SETCC(SDNode *N);
+ SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+ void SoftenSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode, DebugLoc dl);
+
+ //===--------------------------------------------------------------------===//
+ // Float Expansion Support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// GetExpandedFloat - Given a processed operand Op which was expanded into
+ /// two floating point values of half the size, this returns the two halves.
+ /// The low bits of Op are exactly equal to the bits of Lo; the high bits
+ /// exactly equal Hi. For example, if Op is a ppcf128 which was expanded
+ /// into two f64's, then this method returns the two f64's, with Lo being
+ /// equal to the lower 64 bits of Op, and Hi to the upper 64 bits.
+ void GetExpandedFloat(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetExpandedFloat(SDValue Op, SDValue Lo, SDValue Hi);
+
+ // Float Result Expansion.
+ void ExpandFloatResult(SDNode *N, unsigned ResNo);
+ void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FADD (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCEIL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCOS (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FEXP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FEXP2 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FFLOOR (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG2 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG10 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMUL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FNEG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FPOW (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FPOWI (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FRINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FTRUNC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Float Operand Expansion.
+ bool ExpandFloatOperand(SDNode *N, unsigned OperandNo);
+ SDValue ExpandFloatOp_BR_CC(SDNode *N);
+ SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
+ SDValue ExpandFloatOp_FP_TO_SINT(SDNode *N);
+ SDValue ExpandFloatOp_FP_TO_UINT(SDNode *N);
+ SDValue ExpandFloatOp_SELECT_CC(SDNode *N);
+ SDValue ExpandFloatOp_SETCC(SDNode *N);
+ SDValue ExpandFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+ void FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode, DebugLoc dl);
+
+ //===--------------------------------------------------------------------===//
+ // Scalarization Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// GetScalarizedVector - Given a processed one-element vector Op which was
+ /// scalarized to its element type, this returns the element. For example,
+ /// if Op is a v1i32, Op = < i32 val >, this method returns val, an i32.
+ SDValue GetScalarizedVector(SDValue Op) {
+ SDValue &ScalarizedOp = ScalarizedVectors[Op];
+ RemapValue(ScalarizedOp);
+ assert(ScalarizedOp.getNode() && "Operand wasn't scalarized?");
+ return ScalarizedOp;
+ }
+ void SetScalarizedVector(SDValue Op, SDValue Result);
+
+ // Vector Result Scalarization: <1 x ty> -> ty.
+ void ScalarizeVectorResult(SDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecRes_BinOp(SDNode *N);
+ SDValue ScalarizeVecRes_ShiftOp(SDNode *N);
+ SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
+
+ SDValue ScalarizeVecRes_BIT_CONVERT(SDNode *N);
+ SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N);
+ SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue ScalarizeVecRes_FPOWI(SDNode *N);
+ SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
+ SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
+ SDValue ScalarizeVecRes_SELECT(SDNode *N);
+ SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
+ SDValue ScalarizeVecRes_UNDEF(SDNode *N);
+ SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N);
+ SDValue ScalarizeVecRes_VSETCC(SDNode *N);
+
+ // Vector Operand Scalarization: <1 x ty> -> ty.
+ bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecOp_BIT_CONVERT(SDNode *N);
+ SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Splitting Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// GetSplitVector - Given a processed vector Op which was split into smaller
+ /// vectors, this method returns the smaller vectors. The first elements of
+ /// Op coincide with the elements of Lo; the remaining elements of Op coincide
+ /// with the elements of Hi: Op is what you would get by concatenating Lo and
+ /// Hi. For example, if Op is a v8i32 that was split into two v4i32's, then
+ /// this method returns the two v4i32's, with Lo corresponding to the first 4
+ /// elements of Op, and Hi to the last 4 elements.
+ void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
+
+ // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
+ void SplitVectorResult(SDNode *N, unsigned OpNo);
+ void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void SplitVecRes_BIT_CONVERT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_BUILD_PAIR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_CONVERT_RNDSAT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
+ SDValue &Hi);
+ void SplitVecRes_VSETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
+ bool SplitVectorOperand(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_UnaryOp(SDNode *N);
+
+ SDValue SplitVecOp_BIT_CONVERT(SDNode *N);
+ SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Widening Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// GetWidenedVector - Given a processed vector Op which was widened into a
+ /// larger vector, this method returns the larger vector. The elements of
+ /// the returned vector consist of the elements of Op followed by elements
+ /// containing rubbish. For example, if Op is a v2i32 that was widened to a
+ /// v4i32, then this method returns a v4i32 for which the first two elements
+ /// are the same as those of Op, while the last two elements contain rubbish.
+ SDValue GetWidenedVector(SDValue Op) {
+ SDValue &WidenedOp = WidenedVectors[Op];
+ RemapValue(WidenedOp);
+ assert(WidenedOp.getNode() && "Operand wasn't widened?");
+ return WidenedOp;
+ }
+ void SetWidenedVector(SDValue Op, SDValue Result);
+
+ // Widen Vector Result Promotion.
+ void WidenVectorResult(SDNode *N, unsigned ResNo);
+ SDValue WidenVecRes_BIT_CONVERT(SDNode* N);
+ SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
+ SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
+ SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N);
+ SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
+ SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
+ SDValue WidenVecRes_LOAD(SDNode* N);
+ SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
+ SDValue WidenVecRes_SELECT(SDNode* N);
+ SDValue WidenVecRes_SELECT_CC(SDNode* N);
+ SDValue WidenVecRes_UNDEF(SDNode *N);
+ SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
+ SDValue WidenVecRes_VSETCC(SDNode* N);
+
+ SDValue WidenVecRes_Binary(SDNode *N);
+ SDValue WidenVecRes_Convert(SDNode *N);
+ SDValue WidenVecRes_Shift(SDNode *N);
+ SDValue WidenVecRes_Unary(SDNode *N);
+
+ // Widen Vector Operand.
+ bool WidenVectorOperand(SDNode *N, unsigned ResNo);
+ SDValue WidenVecOp_BIT_CONVERT(SDNode *N);
+ SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue WidenVecOp_STORE(SDNode* N);
+
+ SDValue WidenVecOp_Convert(SDNode *N);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+  /// GenWidenVectorLoads - Helper function to generate a set of loads that
+  /// load a vector with a resulting wider type. It takes:
+  ///   LdChain: list of chains for the loads we have generated.
+  ///   Chain: incoming chain for the load vector.
+  ///   BasePtr: base pointer to load from.
+  ///   SV: memory disambiguation source value.
+  ///   SVOffset: memory disambiguation offset.
+  ///   Alignment: alignment of the memory.
+  ///   isVolatile: true if the load is volatile.
+  ///   LdWidth: width of memory that we want to load.
+  ///   ResType: the wider result type for the resulting vector.
+  ///   dl: DebugLoc to be applied to new nodes.
+ SDValue GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain, SDValue Chain,
+ SDValue BasePtr, const Value *SV,
+ int SVOffset, unsigned Alignment,
+ bool isVolatile, unsigned LdWidth,
+ MVT ResType, DebugLoc dl);
+
+  /// GenWidenVectorStores - Helper function to generate a set of stores that
+  /// store a widened vector into non-widened memory. It takes:
+  ///   StChain: list of chains for the stores we have generated.
+  ///   Chain: incoming chain for the store vector.
+  ///   BasePtr: base pointer to store to.
+  ///   SV: memory disambiguation source value.
+  ///   SVOffset: memory disambiguation offset.
+  ///   Alignment: alignment of the memory.
+  ///   isVolatile: true if the store is volatile.
+  ///   ValOp: value to store.
+  ///   StWidth: width of memory that we want to store.
+  ///   dl: DebugLoc to be applied to new nodes.
+ void GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, SDValue Chain,
+ SDValue BasePtr, const Value *SV,
+ int SVOffset, unsigned Alignment,
+ bool isVolatile, SDValue ValOp,
+ unsigned StWidth, DebugLoc dl);
+
+  /// ModifyToType - Modifies a vector input (widens or narrows it) to a
+  /// vector of type WidenVT. The input vector must have the same element
+  /// type as WidenVT.
+ SDValue ModifyToType(SDValue InOp, MVT WidenVT);
+
+
+ //===--------------------------------------------------------------------===//
+ // Generic Splitting: LegalizeTypesGeneric.cpp
+ //===--------------------------------------------------------------------===//
+
+  // Legalization methods which rely only on the fact that the illegal type
+  // is split into two not necessarily identical types. As such they can be
+  // used for splitting vectors and expanding integers and floats.
+
+ void GetSplitOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+ if (Op.getValueType().isVector())
+ GetSplitVector(Op, Lo, Hi);
+ else if (Op.getValueType().isInteger())
+ GetExpandedInteger(Op, Lo, Hi);
+ else
+ GetExpandedFloat(Op, Lo, Hi);
+ }
+
+ /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+ /// which is split (or expanded) into two not necessarily identical pieces.
+ void GetSplitDestVTs(MVT InVT, MVT &LoVT, MVT &HiVT);
+
+ /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
+ /// high parts of the given value.
+ void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
+
+ // Generic Result Splitting.
+ void SplitRes_MERGE_VALUES(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ //===--------------------------------------------------------------------===//
+ // Generic Expansion: LegalizeTypesGeneric.cpp
+ //===--------------------------------------------------------------------===//
+
+  // Legalization methods which rely only on the fact that the illegal type
+  // is split into two identical types of half the size, and that the Lo/Hi
+  // part is stored first in memory on little/big-endian machines, followed
+  // by the Hi/Lo part. As such they can be used for expanding integers and
+  // floats.
+
+ void GetExpandedOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+ if (Op.getValueType().isInteger())
+ GetExpandedInteger(Op, Lo, Hi);
+ else
+ GetExpandedFloat(Op, Lo, Hi);
+ }
+
+ // Generic Result Expansion.
+ void ExpandRes_BIT_CONVERT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Generic Operand Expansion.
+ SDValue ExpandOp_BIT_CONVERT (SDNode *N);
+ SDValue ExpandOp_BUILD_VECTOR (SDNode *N);
+ SDValue ExpandOp_EXTRACT_ELEMENT (SDNode *N);
+ SDValue ExpandOp_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue ExpandOp_SCALAR_TO_VECTOR (SDNode *N);
+ SDValue ExpandOp_NormalStore (SDNode *N, unsigned OpNo);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
new file mode 100644
index 0000000..e8ff3fc
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -0,0 +1,453 @@
+//===-------- LegalizeTypesGeneric.cpp - Generic type legalization --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements generic type expansion and splitting for LegalizeTypes.
+// The routines here perform legalization when the details of the type (such as
+// whether it is an integer or a float) do not matter.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type.
+// Splitting is the act of changing a computation in an illegal type to be a
+// computation in two not necessarily identical registers of a smaller type.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Result Expansion.
+//===----------------------------------------------------------------------===//
+
+// These routines assume that the Lo/Hi part is stored first in memory on
+// little/big-endian machines, followed by the Hi/Lo part. This means that
+// they cannot be used as is on vectors, for which Lo is always stored first.
+
+void DAGTypeLegalizer::ExpandRes_BIT_CONVERT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ MVT OutVT = N->getValueType(0);
+ MVT NOutVT = TLI.getTypeToTransformTo(OutVT);
+ SDValue InOp = N->getOperand(0);
+ MVT InVT = InOp.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+ // Handle some special cases efficiently.
+ switch (getTypeAction(InVT)) {
+ default:
+ assert(false && "Unknown type action!");
+ case Legal:
+ case PromoteInteger:
+ break;
+ case SoftenFloat:
+ // Convert the integer operand instead.
+ SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+ return;
+ case ExpandInteger:
+ case ExpandFloat:
+ // Convert the expanded pieces of the input.
+ GetExpandedOp(InOp, Lo, Hi);
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+ return;
+ case SplitVector:
+ // Convert the split parts of the input if it was split in two.
+ GetSplitVector(InOp, Lo, Hi);
+ if (Lo.getValueType() == Hi.getValueType()) {
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+ return;
+ }
+ break;
+ case ScalarizeVector:
+ // Convert the element instead.
+ SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+ return;
+ case WidenVector: {
+ assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BIT_CONVERT");
+ InOp = GetWidenedVector(InOp);
+ MVT InNVT = MVT::getVectorVT(InVT.getVectorElementType(),
+ InVT.getVectorNumElements()/2);
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+ DAG.getIntPtrConstant(0));
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+ DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
+ return;
+ }
+ }
+
+ // Lower the bit-convert to a store/load from the stack.
+ assert(NOutVT.isByteSized() && "Expanded type not byte sized!");
+
+ // Create the stack frame object. Make sure it is aligned for both
+ // the source and expanded destination types.
+ unsigned Alignment =
+ TLI.getTargetData()->getPrefTypeAlignment(NOutVT.getTypeForMVT());
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ const Value *SV = PseudoSourceValue::getFixedStack(SPFI);
+
+ // Emit a store to the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, SV, 0);
+
+ // Load the first half from the stack slot.
+ Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, SV, 0);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ DAG.getIntPtrConstant(IncrementSize));
+
+ // Load the second half from the stack slot.
+ Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr, SV, IncrementSize, false,
+ MinAlign(Alignment, IncrementSize));
+
+ // Handle endianness of the load.
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+}
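+
+// A worked example of the store/load fallback above (hypothetical types):
+// expanding i64 = BIT_CONVERT f64 on a 32-bit little-endian target where
+// f64 is legal produces
+//
+//   ch = store f64 %InOp, %slot
+//   Lo = i32 load %slot            (offset 0)
+//   Hi = i32 load %slot + 4        (alignment MinAlign(Alignment, 4))
+//
+// On a big-endian target the two loaded halves are swapped afterwards.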
+
+void DAGTypeLegalizer::ExpandRes_BUILD_PAIR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Return the operands.
+ Lo = N->getOperand(0);
+ Hi = N->getOperand(1);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_ELEMENT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ GetExpandedOp(N->getOperand(0), Lo, Hi);
+ SDValue Part = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() ?
+ Hi : Lo;
+
+ assert(Part.getValueType() == N->getValueType(0) &&
+ "Type twice as big as expanded type not itself expanded!");
+
+ GetPairElements(Part, Lo, Hi);
+}
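+
+// For example, on a 32-bit target where both i128 and i64 are illegal, the
+// i128 operand is first expanded into i64 Lo/Hi parts; EXTRACT_ELEMENT 1
+// selects the Hi i64, which is itself expanded, so GetPairElements returns
+// its two i32 halves.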
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue OldVec = N->getOperand(0);
+ unsigned OldElts = OldVec.getValueType().getVectorNumElements();
+ DebugLoc dl = N->getDebugLoc();
+
+ // Convert to a vector of the expanded element type, for example
+ // <3 x i64> -> <6 x i32>.
+ MVT OldVT = N->getValueType(0);
+ MVT NewVT = TLI.getTypeToTransformTo(OldVT);
+
+ SDValue NewVec = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::getVectorVT(NewVT, 2*OldElts),
+ OldVec);
+
+ // Extract the elements at 2 * Idx and 2 * Idx + 1 from the new vector.
+ SDValue Idx = N->getOperand(1);
+
+ // Make sure the type of Idx is big enough to hold the new values.
+ if (Idx.getValueType().bitsLT(TLI.getPointerTy()))
+ Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+ Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+ DAG.getConstant(1, Idx.getValueType()));
+ Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
+ DebugLoc dl = N->getDebugLoc();
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ MVT NVT = TLI.getTypeToTransformTo(LD->getValueType(0));
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ int SVOffset = LD->getSrcValueOffset();
+ unsigned Alignment = LD->getAlignment();
+ bool isVolatile = LD->isVolatile();
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset,
+ isVolatile, Alignment);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits() / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ Hi = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getSrcValue(),
+ SVOffset+IncrementSize,
+ isVolatile, MinAlign(Alignment, IncrementSize));
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Handle endianness of the load.
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
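+
+// For illustration (hypothetical types): an i64 load on a 32-bit target
+// becomes
+//
+//   Lo = i32 load %ptr             (offset 0)
+//   Hi = i32 load %ptr + 4         (alignment MinAlign(Alignment, 4))
+//   ch = TokenFactor Lo.1, Hi.1
+//
+// with Lo/Hi swapped on big-endian targets, and all users of the original
+// load's chain rewired to the TokenFactor.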
+
+void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ MVT NVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue Chain = N->getOperand(0);
+ SDValue Ptr = N->getOperand(1);
+ DebugLoc dl = N->getDebugLoc();
+
+ Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2));
+ Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2));
+
+ // Handle endianness of the load.
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+
+//===--------------------------------------------------------------------===//
+// Generic Operand Expansion.
+//===--------------------------------------------------------------------===//
+
+SDValue DAGTypeLegalizer::ExpandOp_BIT_CONVERT(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ if (N->getValueType(0).isVector()) {
+ // An illegal type that needs expansion is being converted to a legal
+ // vector type. Make a two-element vector out of the expanded parts and
+ // convert that instead, but only if the new vector type is legal
+ // (otherwise there is no point, and it might create expansion loops).
+ // For example, on x86 this turns v1i64 = BIT_CONVERT i64 into
+ // v1i64 = BIT_CONVERT v2i32.
+ MVT OVT = N->getOperand(0).getValueType();
+ MVT NVT = MVT::getVectorVT(TLI.getTypeToTransformTo(OVT), 2);
+
+ if (isTypeLegal(NVT)) {
+ SDValue Parts[2];
+ GetExpandedOp(N->getOperand(0), Parts[0], Parts[1]);
+
+ if (TLI.isBigEndian())
+ std::swap(Parts[0], Parts[1]);
+
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Parts, 2);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, N->getValueType(0), Vec);
+ }
+ }
+
+ // Otherwise, store to a temporary and load out again as the new type.
+ return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
+ // The vector type is legal but the element type needs expansion.
+ MVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ MVT OldVT = N->getOperand(0).getValueType();
+ MVT NewVT = TLI.getTypeToTransformTo(OldVT);
+ DebugLoc dl = N->getDebugLoc();
+
+ assert(OldVT == VecVT.getVectorElementType() &&
+ "BUILD_VECTOR operand type doesn't match vector element type!");
+
+ // Build a vector of twice the length out of the expanded elements.
+ // For example <3 x i64> -> <6 x i32>.
+ std::vector<SDValue> NewElts;
+ NewElts.reserve(NumElts*2);
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Lo, Hi;
+ GetExpandedOp(N->getOperand(i), Lo, Hi);
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+ NewElts.push_back(Lo);
+ NewElts.push_back(Hi);
+ }
+
+ SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::getVectorVT(NewVT, NewElts.size()),
+ &NewElts[0], NewElts.size());
+
+ // Convert the new vector to the old vector type.
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, NewVec);
+}
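+
+// As a concrete example: BUILD_VECTOR v2i64 <a, b> on a 32-bit little-endian
+// target becomes
+//
+//   t      = BUILD_VECTOR v4i32 <aLo, aHi, bLo, bHi>
+//   result = BIT_CONVERT v2i64 t
+//
+// On big-endian targets each (Lo, Hi) pair is swapped instead.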
+
+SDValue DAGTypeLegalizer::ExpandOp_EXTRACT_ELEMENT(SDNode *N) {
+ SDValue Lo, Hi;
+ GetExpandedOp(N->getOperand(0), Lo, Hi);
+ return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() ? Hi : Lo;
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
+ // The vector type is legal but the element type needs expansion.
+ MVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue Val = N->getOperand(1);
+ MVT OldEVT = Val.getValueType();
+ MVT NewEVT = TLI.getTypeToTransformTo(OldEVT);
+
+ assert(OldEVT == VecVT.getVectorElementType() &&
+ "Inserted element type doesn't match vector element type!");
+
+ // Bitconvert to a vector of twice the length with elements of the expanded
+ // type, insert the expanded vector elements, and then convert back.
+ MVT NewVecVT = MVT::getVectorVT(NewEVT, NumElts*2);
+ SDValue NewVec = DAG.getNode(ISD::BIT_CONVERT, dl,
+ NewVecVT, N->getOperand(0));
+
+ SDValue Lo, Hi;
+ GetExpandedOp(Val, Lo, Hi);
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+
+ SDValue Idx = N->getOperand(2);
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+ NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
+ Idx = DAG.getNode(ISD::ADD, dl,
+ Idx.getValueType(), Idx, DAG.getIntPtrConstant(1));
+ NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);
+
+ // Convert the new vector to the old vector type.
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, NewVec);
+}
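+
+// For example, inserting an expanded i64 value at index Idx of a v2i64
+// vector bitconverts the vector to v4i32 and inserts the two halves at
+// indices 2*Idx and 2*Idx+1 (with Lo/Hi swapped on big-endian targets)
+// before converting back to v2i64.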
+
+SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT VT = N->getValueType(0);
+ assert(VT.getVectorElementType() == N->getOperand(0).getValueType() &&
+ "SCALAR_TO_VECTOR operand type doesn't match vector element type!");
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(NumElts);
+ Ops[0] = N->getOperand(0);
+ SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType());
+ for (unsigned i = 1; i < NumElts; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
+ assert(ISD::isNormalStore(N) && "This routine only for normal stores!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+ DebugLoc dl = N->getDebugLoc();
+
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ MVT NVT = TLI.getTypeToTransformTo(St->getValue().getValueType());
+ SDValue Chain = St->getChain();
+ SDValue Ptr = St->getBasePtr();
+ int SVOffset = St->getSrcValueOffset();
+ unsigned Alignment = St->getAlignment();
+ bool isVolatile = St->isVolatile();
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ unsigned IncrementSize = NVT.getSizeInBits() / 8;
+
+ SDValue Lo, Hi;
+ GetExpandedOp(St->getValue(), Lo, Hi);
+
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getSrcValue(), SVOffset,
+ isVolatile, Alignment);
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ assert(isTypeLegal(Ptr.getValueType()) && "Pointers must be legal!");
+ Hi = DAG.getStore(Chain, dl, Hi, Ptr, St->getSrcValue(),
+ SVOffset + IncrementSize,
+ isVolatile, MinAlign(Alignment, IncrementSize));
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
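+
+// For illustration (hypothetical types): storing an expanded i64 on a 32-bit
+// target becomes
+//
+//   chLo = i32 store %Lo, %ptr     (offset 0)
+//   chHi = i32 store %Hi, %ptr + 4 (alignment MinAlign(Alignment, 4))
+//   ch   = TokenFactor chLo, chHi
+//
+// Note that big-endian targets swap which half is stored first, not the
+// addresses the halves are stored to.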
+
+
+//===--------------------------------------------------------------------===//
+// Generic Result Splitting.
+//===--------------------------------------------------------------------===//
+
+// Be careful to make no assumptions about which of Lo/Hi is stored first in
+// memory (for vectors it is always Lo first followed by Hi in the following
+// bytes; for integers and floats it is Lo first if and only if the machine is
+// little-endian).
+
+void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // A MERGE_VALUES node can produce any number of values. We know that the
+ // first illegal one needs to be split into Lo/Hi.
+ unsigned i;
+
+ // The leading run of legal results is replaced directly by the
+ // corresponding input operands, which have the same types.
+ for (i = 0; isTypeLegal(N->getValueType(i)); ++i)
+ ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+
+ // The first illegal result must be the one that needs to be split.
+ GetSplitOp(N->getOperand(i), Lo, Hi);
+
+ // Legalize the rest of the results into the input operands whether they are
+ // legal or not.
+ unsigned e = N->getNumValues();
+ for (++i; i != e; ++i)
+ ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LL, LH, RL, RH;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitOp(N->getOperand(1), LL, LH);
+ GetSplitOp(N->getOperand(2), RL, RH);
+
+ SDValue Cond = N->getOperand(0);
+ Lo = DAG.getNode(ISD::SELECT, dl, LL.getValueType(), Cond, LL, RL);
+ Hi = DAG.getNode(ISD::SELECT, dl, LH.getValueType(), Cond, LH, RH);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LL, LH, RL, RH;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitOp(N->getOperand(2), LL, LH);
+ GetSplitOp(N->getOperand(3), RL, RH);
+
+ Lo = DAG.getNode(ISD::SELECT_CC, dl, LL.getValueType(), N->getOperand(0),
+ N->getOperand(1), LL, RL, N->getOperand(4));
+ Hi = DAG.getNode(ISD::SELECT_CC, dl, LH.getValueType(), N->getOperand(0),
+ N->getOperand(1), LH, RH, N->getOperand(4));
+}
+
+void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ MVT LoVT, HiVT;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ Lo = DAG.getUNDEF(LoVT);
+ Hi = DAG.getUNDEF(HiVT);
+}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
new file mode 100644
index 0000000..df9af21
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -0,0 +1,335 @@
+//===-- LegalizeVectorOps.cpp - Implement SelectionDAG::LegalizeVectors ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeVectors method.
+//
+// The vector legalizer looks for vector operations which might need to be
+// scalarized and legalizes them. This is a separate step from Legalize because
+// scalarizing can introduce illegal types. For example, suppose we have an
+// ISD::SDIV of type v2i64 on x86-32. The type is legal (for example, addition
+// on a v2i64 is legal), but ISD::SDIV isn't legal, so we have to unroll the
+// operation, which introduces nodes with the illegal type i64 which must be
+// expanded. Similarly, suppose we have an ISD::SRA of type v16i8 on PowerPC;
+// the operation must be unrolled, which introduces nodes with the illegal
+// type i8 which must be promoted.
+//
+// This does not legalize vector manipulations like ISD::BUILD_VECTOR, or
+// operations that happen to take a vector and are custom-lowered, like
+// ISD::CALL; the legalization of such operations never produces nodes with
+// illegal types, so it is okay to put off legalizing them until
+// SelectionDAG::Legalize runs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+namespace {
+class VectorLegalizer {
+ SelectionDAG& DAG;
+ TargetLowering& TLI;
+ bool Changed; // Keep track of whether anything changed
+
+ /// LegalizedNodes - For nodes that are of legal width, and that have more
+ /// than one use, this map indicates what legalized operand to use. This
+ /// allows us to avoid legalizing the same thing more than once.
+ DenseMap<SDValue, SDValue> LegalizedNodes;
+
+ // Adds a node to the translation cache
+ void AddLegalizedOperand(SDValue From, SDValue To) {
+ LegalizedNodes.insert(std::make_pair(From, To));
+ // If someone requests legalization of the new node, return itself.
+ if (From != To)
+ LegalizedNodes.insert(std::make_pair(To, To));
+ }
+
+ // Legalizes the given node
+ SDValue LegalizeOp(SDValue Op);
+ // Assuming the node is legal, "legalize" the results
+ SDValue TranslateLegalizeResults(SDValue Op, SDValue Result);
+ // Implements unrolling a generic vector operation, i.e. turning it into
+ // scalar operations.
+ SDValue UnrollVectorOp(SDValue Op);
+ // Implements unrolling a VSETCC.
+ SDValue UnrollVSETCC(SDValue Op);
+ // Implements expansion for FNEG; falls back to UnrollVectorOp if FSUB
+ // isn't legal.
+ SDValue ExpandFNEG(SDValue Op);
+ // Implements vector promotion; this is essentially just bitcasting the
+ // operands to a different type and bitcasting the result back to the
+ // original type.
+ SDValue PromoteVectorOp(SDValue Op);
+
+ public:
+ bool Run();
+ VectorLegalizer(SelectionDAG& dag) :
+ DAG(dag), TLI(dag.getTargetLoweringInfo()), Changed(false) {}
+};
+
+bool VectorLegalizer::Run() {
+ // The legalize process is inherently a bottom-up recursive process (users
+ // legalize their uses before themselves). Given infinite stack space, we
+ // could just start legalizing on the root and traverse the whole graph. In
+ // practice however, this causes us to run out of stack space on large basic
+ // blocks. To avoid this problem, compute an ordering of the nodes where each
+ // node is only legalized after all of its operands are legalized.
+ DAG.AssignTopologicalOrder();
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = prior(DAG.allnodes_end()); I != next(E); ++I)
+ LegalizeOp(SDValue(I, 0));
+
+ // Finally, it's possible the root changed. Get the new root.
+ SDValue OldRoot = DAG.getRoot();
+ assert(LegalizedNodes.count(OldRoot) && "Root didn't get legalized?");
+ DAG.setRoot(LegalizedNodes[OldRoot]);
+
+ LegalizedNodes.clear();
+
+ // Remove dead nodes now.
+ DAG.RemoveDeadNodes();
+
+ return Changed;
+}
+
+SDValue VectorLegalizer::TranslateLegalizeResults(SDValue Op, SDValue Result) {
+ // Generic legalization: just pass the operand through.
+ for (unsigned i = 0, e = Op.getNode()->getNumValues(); i != e; ++i)
+ AddLegalizedOperand(Op.getValue(i), Result.getValue(i));
+ return Result.getValue(Op.getResNo());
+}
+
+SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
+ // Note that LegalizeOp may be reentered even from single-use nodes, which
+ // means that we must always cache transformed nodes.
+ DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+ if (I != LegalizedNodes.end()) return I->second;
+
+ SDNode* Node = Op.getNode();
+
+ // Legalize the operands
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+ Ops.push_back(LegalizeOp(Node->getOperand(i)));
+
+ SDValue Result =
+ DAG.UpdateNodeOperands(Op.getValue(0), Ops.data(), Ops.size());
+
+ bool HasVectorValue = false;
+ for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end();
+ J != E;
+ ++J)
+ HasVectorValue |= J->isVector();
+ if (!HasVectorValue)
+ return TranslateLegalizeResults(Op, Result);
+
+ switch (Op.getOpcode()) {
+ default:
+ return TranslateLegalizeResults(Op, Result);
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::CTTZ:
+ case ISD::CTLZ:
+ case ISD::CTPOP:
+ case ISD::SELECT:
+ case ISD::SELECT_CC:
+ case ISD::VSETCC:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::TRUNCATE:
+ case ISD::SIGN_EXTEND:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::FNEG:
+ case ISD::FABS:
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FFLOOR:
+ break;
+ }
+
+ switch (TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0))) {
+ case TargetLowering::Promote:
+ // "Promote" the operation by bitcasting
+ Result = PromoteVectorOp(Op);
+ Changed = true;
+ break;
+ case TargetLowering::Legal: break;
+ case TargetLowering::Custom: {
+ SDValue Tmp1 = TLI.LowerOperation(Op, DAG);
+ if (Tmp1.getNode()) {
+ Result = Tmp1;
+ break;
+ }
+ // FALL THROUGH
+ }
+ case TargetLowering::Expand:
+ if (Node->getOpcode() == ISD::FNEG)
+ Result = ExpandFNEG(Op);
+ else if (Node->getOpcode() == ISD::VSETCC)
+ Result = UnrollVSETCC(Op);
+ else
+ Result = UnrollVectorOp(Op);
+ break;
+ }
+
+ // Make sure that the generated code is itself legal.
+ if (Result != Op) {
+ Result = LegalizeOp(Result);
+ Changed = true;
+ }
+
+ // Note that LegalizeOp may be reentered even from single-use nodes, which
+ // means that we must always cache transformed nodes.
+ AddLegalizedOperand(Op, Result);
+ return Result;
+}
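+
+// To illustrate the three non-trivial actions above (hypothetical targets):
+// an ISD::AND on v2i32 that the target marks Promote is bitcast to v1i64,
+// performed there, and bitcast back; an operation marked Custom is handed
+// to TLI.LowerOperation, falling back to Expand if it declines; and an
+// ISD::SDIV on v2i64 marked Expand is unrolled into scalar i64 divides,
+// whose illegal i64 type is legalized later by the type legalizer.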
+
+SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
+ // Vector "promotion" is basically just bitcasting and doing the operation
+ // in a different type. For example, x86 promotes ISD::AND on v2i32 to
+ // v1i64.
+ MVT VT = Op.getValueType();
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "Can't promote a vector with multiple results!");
+ MVT NVT = TLI.getTypeToPromoteTo(Op.getOpcode(), VT);
+ DebugLoc dl = Op.getDebugLoc();
+ SmallVector<SDValue, 4> Operands(Op.getNumOperands());
+
+ for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
+ if (Op.getOperand(j).getValueType().isVector())
+ Operands[j] = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, Op.getOperand(j));
+ else
+ Operands[j] = Op.getOperand(j);
+ }
+
+ Op = DAG.getNode(Op.getOpcode(), dl, NVT, &Operands[0], Operands.size());
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Op);
+}
+
+SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
+ if (TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) {
+ SDValue Zero = DAG.getConstantFP(-0.0, Op.getValueType());
+ return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+ Zero, Op.getOperand(0));
+ }
+ return UnrollVectorOp(Op);
+}
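+
+// The -0.0 constant above matters: IEEE subtraction (-0.0) - x negates every
+// value including zero (e.g. (-0.0) - (+0.0) == -0.0), whereas (0.0) - x
+// would map +0.0 to +0.0 rather than to -0.0.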
+
+SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
+ MVT VT = Op.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType();
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1), CC = Op.getOperand(2);
+ MVT TmpEltVT = LHS.getValueType().getVectorElementType();
+ DebugLoc dl = Op.getDebugLoc();
+ SmallVector<SDValue, 8> Ops(NumElems);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getIntPtrConstant(i));
+ SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getIntPtrConstant(i));
+ Ops[i] = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(TmpEltVT),
+ LHSElem, RHSElem, CC);
+ Ops[i] = DAG.getNode(ISD::SELECT, dl, EltVT, Ops[i],
+ DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()),
+ EltVT),
+ DAG.getConstant(0, EltVT));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElems);
+}
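+
+// For example, unrolling a v4i32 VSETCC produces, for each element i,
+//
+//   b[i] = setcc lhs[i], rhs[i], cc
+//   r[i] = select b[i], 0xFFFFFFFF, 0
+//
+// followed by a BUILD_VECTOR of the four results, preserving VSETCC's
+// all-ones/all-zeros per-element semantics.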
+
+/// UnrollVectorOp - We know that the given vector has a legal type, however
+/// the operation it performs is not legal, and the target has requested that
+/// the operation be expanded. "Unroll" the vector, splitting out the scalars
+/// and operating on each element individually.
+SDValue VectorLegalizer::UnrollVectorOp(SDValue Op) {
+ MVT VT = Op.getValueType();
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "Can't unroll a vector with multiple results!");
+ unsigned NE = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SmallVector<SDValue, 8> Scalars;
+ SmallVector<SDValue, 4> Operands(Op.getNumOperands());
+ for (unsigned i = 0; i != NE; ++i) {
+ for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
+ SDValue Operand = Op.getOperand(j);
+ MVT OperandVT = Operand.getValueType();
+ if (OperandVT.isVector()) {
+ // A vector operand; extract a single element.
+ MVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ OperandEltVT,
+ Operand,
+ DAG.getConstant(i, MVT::i32));
+ } else {
+ // A scalar operand; just use it as is.
+ Operands[j] = Operand;
+ }
+ }
+
+ switch (Op.getOpcode()) {
+ default:
+ Scalars.push_back(DAG.getNode(Op.getOpcode(), dl, EltVT,
+ &Operands[0], Operands.size()));
+ break;
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ Scalars.push_back(DAG.getNode(Op.getOpcode(), dl, EltVT, Operands[0],
+ DAG.getShiftAmountOperand(Operands[1])));
+ break;
+ }
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Scalars[0], Scalars.size());
+}
+
+} // end anonymous namespace
+
+bool SelectionDAG::LegalizeVectors() {
+ return VectorLegalizer(*this).Run();
+}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
new file mode 100644
index 0000000..68967cc
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -0,0 +1,2151 @@
+//===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file performs vector type splitting and scalarization for LegalizeTypes.
+// Scalarization is the act of changing a computation in an illegal one-element
+// vector type to be a computation in its scalar element type. For example,
+// implementing <1 x f32> arithmetic in a scalar f32 register. This is needed
+// as a base case when scalarizing vector arithmetic like <4 x f32>, which
+// eventually decomposes to scalars if the target doesn't support v4f32 or v2f32
+// types.
+// Splitting is the act of changing a computation in an invalid vector type to
+// be a computation in multiple vectors of a smaller type. For example,
+// implementing <128 x f32> operations in terms of two <64 x f32> operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetData.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Result Vector Scalarization: <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
+ DEBUG(cerr << "Scalarize node result " << ResNo << ": "; N->dump(&DAG);
+ cerr << "\n");
+ SDValue R = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "ScalarizeVectorResult #" << ResNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to scalarize the result of this operator!");
+ abort();
+
+ case ISD::BIT_CONVERT: R = ScalarizeVecRes_BIT_CONVERT(N); break;
+ case ISD::BUILD_VECTOR: R = N->getOperand(0); break;
+ case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break;
+ case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
+ case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break;
+ case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
+ case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
+ case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break;
+ case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
+ case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
+ case ISD::VSETCC: R = ScalarizeVecRes_VSETCC(N); break;
+
+ case ISD::CTLZ:
+ case ISD::CTPOP:
+ case ISD::CTTZ:
+ case ISD::FABS:
+ case ISD::FCOS:
+ case ISD::FNEG:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ case ISD::FFLOOR:
+ case ISD::FCEIL:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP: R = ScalarizeVecRes_UnaryOp(N); break;
+
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::FADD:
+ case ISD::FDIV:
+ case ISD::FMUL:
+ case ISD::FPOW:
+ case ISD::FREM:
+ case ISD::FSUB:
+ case ISD::MUL:
+ case ISD::OR:
+ case ISD::SDIV:
+ case ISD::SREM:
+ case ISD::SUB:
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::XOR: R = ScalarizeVecRes_BinOp(N); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: R = ScalarizeVecRes_ShiftOp(N); break;
+ }
+
+ // If R is null, the sub-method took care of registering the result.
+ if (R.getNode())
+ SetScalarizedVector(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ SDValue RHS = GetScalarizedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+ LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_ShiftOp(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ SDValue ShiftAmt = GetScalarizedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+ LHS.getValueType(), LHS, ShiftAmt);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BIT_CONVERT(SDNode *N) {
+ MVT NewVT = N->getValueType(0).getVectorElementType();
+ return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
+ NewVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) {
+ MVT NewVT = N->getValueType(0).getVectorElementType();
+ SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+ return DAG.getConvertRndSat(NewVT, N->getDebugLoc(),
+ Op0, DAG.getValueType(NewVT),
+ DAG.getValueType(Op0.getValueType()),
+ N->getOperand(3),
+ N->getOperand(4),
+ cast<CvtRndSatSDNode>(N)->getCvtCode());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+ N->getValueType(0).getVectorElementType(),
+ N->getOperand(0), N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) {
+ SDValue Op = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(ISD::FPOWI, N->getDebugLoc(),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+ // The value to insert may have a wider type than the vector element type,
+ // so be sure to truncate it to the element type if necessary.
+ SDValue Op = N->getOperand(1);
+ MVT EltVT = N->getValueType(0).getVectorElementType();
+ if (Op.getValueType() != EltVT)
+ // FIXME: Can this happen for floating point types?
+ Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, Op);
+ return Op;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
+ assert(N->isUnindexed() && "Indexed vector load?");
+
+ SDValue Result = DAG.getLoad(ISD::UNINDEXED, N->getDebugLoc(),
+ N->getExtensionType(),
+ N->getValueType(0).getVectorElementType(),
+ N->getChain(), N->getBasePtr(),
+ DAG.getUNDEF(N->getBasePtr().getValueType()),
+ N->getSrcValue(), N->getSrcValueOffset(),
+ N->getMemoryVT().getVectorElementType(),
+ N->isVolatile(), N->getAlignment());
+
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+ return Result;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
+ // Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
+ MVT DestVT = N->getValueType(0).getVectorElementType();
+ SDValue Op = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(), DestVT, Op);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+ // If the operand is wider than the vector element type then it is implicitly
+ // truncated. Make that explicit here.
+ MVT EltVT = N->getValueType(0).getVectorElementType();
+ SDValue InOp = N->getOperand(0);
+ if (InOp.getValueType() != EltVT)
+ return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp);
+ return InOp;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(1));
+ return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+ LHS.getValueType(), N->getOperand(0), LHS,
+ GetScalarizedVector(N->getOperand(2)));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(2));
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), LHS.getValueType(),
+ N->getOperand(0), N->getOperand(1),
+ LHS, GetScalarizedVector(N->getOperand(3)),
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
+ // Figure out if the scalar is the LHS or RHS and return it.
+ SDValue Arg = N->getOperand(2).getOperand(0);
+ if (Arg.getOpcode() == ISD::UNDEF)
+ return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
+ unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
+ return GetScalarizedVector(N->getOperand(Op));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ SDValue RHS = GetScalarizedVector(N->getOperand(1));
+ MVT NVT = N->getValueType(0).getVectorElementType();
+ MVT SVT = TLI.getSetCCResultType(LHS.getValueType());
+ DebugLoc dl = N->getDebugLoc();
+
+ // Turn it into a scalar SETCC.
+ SDValue Res = DAG.getNode(ISD::SETCC, dl, SVT, LHS, RHS, N->getOperand(2));
+
+ // VSETCC always returns a sign-extended value, while SETCC may not. The
+ // SETCC result type may not match the vector element type. Correct these.
+ if (NVT.bitsLE(SVT)) {
+ // The SETCC result type is bigger than the vector element type.
+ // Ensure the SETCC result is sign-extended.
+ if (TLI.getBooleanContents() !=
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, SVT, Res,
+ DAG.getValueType(MVT::i1));
+ // Truncate to the final type.
+ return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res);
+ } else {
+ // The SETCC result type is smaller than the vector element type.
+ // If the SetCC result is not sign-extended, chop it down to MVT::i1.
+ if (TLI.getBooleanContents() !=
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Res);
+ // Sign extend to the final type.
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, Res);
+ }
+}
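+
+// For illustration (hypothetical types): with a v1i8 VSETCC whose scalar
+// SETCC result type is i32, the i32 result is sign-extended in-register
+// from bit 0 if the target's booleans are not already all-ones/all-zeros,
+// then truncated to i8; if instead the SETCC result type were i1, it would
+// be sign-extended up to i8.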
+
+
+//===----------------------------------------------------------------------===//
+// Operand Vector Scalarization <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(cerr << "Scalarize node operand " << OpNo << ": "; N->dump(&DAG);
+ cerr << "\n");
+ SDValue Res = SDValue();
+
+ if (Res.getNode() == 0) {
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "ScalarizeVectorOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to scalarize this operator's operand!");
+ abort();
+
+ case ISD::BIT_CONVERT:
+ Res = ScalarizeVecOp_BIT_CONVERT(N); break;
+
+ case ISD::CONCAT_VECTORS:
+ Res = ScalarizeVecOp_CONCAT_VECTORS(N); break;
+
+ case ISD::EXTRACT_VECTOR_ELT:
+ Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N); break;
+
+ case ISD::STORE:
+ Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo); break;
+ }
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// ScalarizeVecOp_BIT_CONVERT - If the value to convert is a vector that needs
+/// to be scalarized, it must be <1 x ty>. Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_BIT_CONVERT(SDNode *N) {
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
+ N->getValueType(0), Elt);
+}
+
+/// ScalarizeVecOp_CONCAT_VECTORS - The vectors to concatenate have length one -
+/// use a BUILD_VECTOR instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
+ SmallVector<SDValue, 8> Ops(N->getNumOperands());
+ for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
+ Ops[i] = GetScalarizedVector(N->getOperand(i));
+ return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), N->getValueType(0),
+ &Ops[0], Ops.size());
+}
+
+/// ScalarizeVecOp_EXTRACT_VECTOR_ELT - If the input is a vector that needs to
+/// be scalarized, it must be <1 x ty>, so just return the element, ignoring the
+/// index.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ return GetScalarizedVector(N->getOperand(0));
+}
+
+/// ScalarizeVecOp_STORE - If the value to store is a vector that needs to be
+/// scalarized, it must be <1 x ty>. Just store the element.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
+ assert(N->isUnindexed() && "Indexed store of one-element vector?");
+ assert(OpNo == 1 && "Do not know how to scalarize this operand!");
+ DebugLoc dl = N->getDebugLoc();
+
+ if (N->isTruncatingStore())
+ return DAG.getTruncStore(N->getChain(), dl,
+ GetScalarizedVector(N->getOperand(1)),
+ N->getBasePtr(),
+ N->getSrcValue(), N->getSrcValueOffset(),
+ N->getMemoryVT().getVectorElementType(),
+ N->isVolatile(), N->getAlignment());
+
+ return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
+ N->getBasePtr(), N->getSrcValue(), N->getSrcValueOffset(),
+ N->isVolatile(), N->getAlignment());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Result Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// SplitVectorResult - This method is called when the specified result of the
+/// specified node is found to need vector splitting. At this point, the node
+/// may also have invalid operands or may have other results that need
+/// legalization; we just know that (at least) one result needs vector
+/// splitting.
+void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
+ DEBUG(cerr << "Split node result: "; N->dump(&DAG); cerr << "\n");
+ SDValue Lo, Hi;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "SplitVectorResult #" << ResNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to split the result of this operator!");
+ abort();
+
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+
+ case ISD::BIT_CONVERT: SplitVecRes_BIT_CONVERT(N, Lo, Hi); break;
+ case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
+ case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
+ case ISD::CONVERT_RNDSAT: SplitVecRes_CONVERT_RNDSAT(N, Lo, Hi); break;
+ case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
+ case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break;
+ case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
+ case ISD::LOAD: SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);break;
+ case ISD::VECTOR_SHUFFLE:
+ SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi); break;
+ case ISD::VSETCC: SplitVecRes_VSETCC(N, Lo, Hi); break;
+
+ case ISD::CTTZ:
+ case ISD::CTLZ:
+ case ISD::CTPOP:
+ case ISD::FNEG:
+ case ISD::FABS:
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FTRUNC:
+ case ISD::FFLOOR:
+ case ISD::FCEIL:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP: SplitVecRes_UnaryOp(N, Lo, Hi); break;
+
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::FDIV:
+ case ISD::FPOW:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::UREM:
+ case ISD::SREM:
+ case ISD::FREM: SplitVecRes_BinOp(N, Lo, Hi); break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetSplitVector(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LHSLo, LHSHi;
+ GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+ SDValue RHSLo, RHSHi;
+ GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
+ DebugLoc dl = N->getDebugLoc();
+
+ Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, RHSLo);
+ Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BIT_CONVERT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // We know the result is a vector. The input may be either a vector or a
+ // scalar value.
+ MVT LoVT, HiVT;
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue InOp = N->getOperand(0);
+ MVT InVT = InOp.getValueType();
+
+ // Handle some special cases efficiently.
+ switch (getTypeAction(InVT)) {
+ default:
+ assert(false && "Unknown type action!");
+ case Legal:
+ case PromoteInteger:
+ case SoftenFloat:
+ case ScalarizeVector:
+ break;
+ case ExpandInteger:
+ case ExpandFloat:
+ // A scalar to vector conversion, where the scalar needs expansion.
+ // If the vector is being split in two then we can just convert the
+ // expanded pieces.
+ if (LoVT == HiVT) {
+ GetExpandedOp(InOp, Lo, Hi);
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HiVT, Hi);
+ return;
+ }
+ break;
+ case SplitVector:
+ // If the input is a vector that needs to be split, convert each split
+ // piece of the input now.
+ GetSplitVector(InOp, Lo, Hi);
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HiVT, Hi);
+ return;
+ }
+
+ // In the general case, convert the input to an integer and split it by hand.
+ MVT LoIntVT = MVT::getIntegerVT(LoVT.getSizeInBits());
+ MVT HiIntVT = MVT::getIntegerVT(HiVT.getSizeInBits());
+ if (TLI.isBigEndian())
+ std::swap(LoIntVT, HiIntVT);
+
+ SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
+
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HiVT, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ MVT LoVT, HiVT;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ unsigned LoNumElts = LoVT.getVectorNumElements();
+ SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
+ Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, &LoOps[0], LoOps.size());
+
+ SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
+ Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, &HiOps[0], HiOps.size());
+}
+
+void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
+ DebugLoc dl = N->getDebugLoc();
+ unsigned NumSubvectors = N->getNumOperands() / 2;
+ if (NumSubvectors == 1) {
+ Lo = N->getOperand(0);
+ Hi = N->getOperand(1);
+ return;
+ }
+
+ MVT LoVT, HiVT;
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+ SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
+ Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, &LoOps[0], LoOps.size());
+
+ SmallVector<SDValue, 8> HiOps(N->op_begin()+NumSubvectors, N->op_end());
+ Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, &HiOps[0], HiOps.size());
+}
+
+void DAGTypeLegalizer::SplitVecRes_CONVERT_RNDSAT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ MVT LoVT, HiVT;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+ SDValue DTyOpLo = DAG.getValueType(LoVT);
+ SDValue DTyOpHi = DAG.getValueType(HiVT);
+
+ SDValue RndOp = N->getOperand(3);
+ SDValue SatOp = N->getOperand(4);
+ ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+
+ // Split the input.
+ SDValue VLo, VHi;
+ MVT InVT = N->getOperand(0).getValueType();
+ switch (getTypeAction(InVT)) {
+ default: assert(0 && "Unexpected type action!");
+ case Legal: {
+ assert(LoVT == HiVT && "Legal non-power-of-two vector type?");
+ MVT InNVT = MVT::getVectorVT(InVT.getVectorElementType(),
+ LoVT.getVectorNumElements());
+ VLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+ DAG.getIntPtrConstant(0));
+ VHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+ DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ break;
+ }
+ case SplitVector:
+ GetSplitVector(N->getOperand(0), VLo, VHi);
+ break;
+ case WidenVector: {
+ // If the result needs to be split and the input needs to be widened,
+ // the two types must have different lengths. Use the widened result
+ // and extract from it to do the split.
+ assert(LoVT == HiVT && "Legal non-power-of-two vector type?");
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ MVT InNVT = MVT::getVectorVT(InVT.getVectorElementType(),
+ LoVT.getVectorNumElements());
+ VLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+ DAG.getIntPtrConstant(0));
+ VHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+ DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ break;
+ }
+ }
+
+ SDValue STyOpLo = DAG.getValueType(VLo.getValueType());
+ SDValue STyOpHi = DAG.getValueType(VHi.getValueType());
+
+ Lo = DAG.getConvertRndSat(LoVT, dl, VLo, DTyOpLo, STyOpLo, RndOp, SatOp,
+ CvtCode);
+ Hi = DAG.getConvertRndSat(HiVT, dl, VHi, DTyOpHi, STyOpHi, RndOp, SatOp,
+ CvtCode);
+}
+
+void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ MVT IdxVT = Idx.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+ MVT LoVT, HiVT;
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ // The indices are not guaranteed to be a multiple of the new vector
+ // size unless the original vector type was split in two.
+ assert(LoVT == HiVT && "Non power-of-two vectors not supported!");
+
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
+ Idx = DAG.getNode(ISD::ADD, dl, IdxVT, Idx,
+ DAG.getConstant(LoVT.getVectorNumElements(), IdxVT));
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec, Idx);
+}
+
+void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1));
+ Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1));
+}
+
+void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Elt = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitVector(Vec, Lo, Hi);
+
+ if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = CIdx->getZExtValue();
+ unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
+ if (IdxVal < LoNumElts)
+ Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ Lo.getValueType(), Lo, Elt, Idx);
+ else
+ Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
+ DAG.getIntPtrConstant(IdxVal - LoNumElts));
+ return;
+ }
+
+ // Spill the vector to the stack.
+ MVT VecVT = Vec.getValueType();
+ MVT EltVT = VecVT.getVectorElementType();
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, NULL, 0);
+
+ // Store the new element. This may be larger than the vector element type,
+ // so use a truncating store.
+ SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+ unsigned Alignment =
+ TLI.getTargetData()->getPrefTypeAlignment(VecVT.getTypeForMVT());
+ Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, NULL, 0, EltVT);
+
+ // Load the Lo part from the stack slot.
+ Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, NULL, 0);
+
+ // Increment the pointer to the other part.
+ unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ DAG.getIntPtrConstant(IncrementSize));
+
+ // Load the Hi part from the stack slot.
+ Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, NULL, 0, false,
+ MinAlign(Alignment, IncrementSize));
+}
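+
+// For example, with a non-constant index the split halves cannot be updated
+// piecewise, so the whole vector is spilled; for v4i32 this produces
+//
+//   ch = store v4i32 %vec, %slot
+//   ch = truncstore %elt, %slot + 4*%idx   (via GetVectorElementPointer)
+//   Lo = v2i32 load %slot
+//   Hi = v2i32 load %slot + 8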
+
+void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ MVT LoVT, HiVT;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
+ Hi = DAG.getUNDEF(HiVT);
+}
+
+void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
+ MVT LoVT, HiVT;
+ DebugLoc dl = LD->getDebugLoc();
+ GetSplitDestVTs(LD->getValueType(0), LoVT, HiVT);
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+ const Value *SV = LD->getSrcValue();
+ int SVOffset = LD->getSrcValueOffset();
+ MVT MemoryVT = LD->getMemoryVT();
+ unsigned Alignment = LD->getAlignment();
+ bool isVolatile = LD->isVolatile();
+
+ MVT LoMemVT, HiMemVT;
+ GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+
+ Lo = DAG.getLoad(ISD::UNINDEXED, dl, ExtType, LoVT, Ch, Ptr, Offset,
+ SV, SVOffset, LoMemVT, isVolatile, Alignment);
+
+ unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+ SVOffset += IncrementSize;
+ Alignment = MinAlign(Alignment, IncrementSize);
+ Hi = DAG.getLoad(ISD::UNINDEXED, dl, ExtType, HiVT, Ch, Ptr, Offset,
+ SV, SVOffset, HiMemVT, isVolatile, Alignment);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Get the dest types - they may not match the input types, e.g. int_to_fp.
+ MVT LoVT, HiVT;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+ // Split the input.
+ MVT InVT = N->getOperand(0).getValueType();
+ switch (getTypeAction(InVT)) {
+ default: assert(0 && "Unexpected type action!");
+ case Legal: {
+ assert(LoVT == HiVT && "Legal non-power-of-two vector type?");
+ MVT InNVT = MVT::getVectorVT(InVT.getVectorElementType(),
+ LoVT.getVectorNumElements());
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+ DAG.getIntPtrConstant(0));
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+ DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ break;
+ }
+ case SplitVector:
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ break;
+ case WidenVector: {
+ // If the result needs to be split and the input needs to be widened,
+ // the two types must have different lengths. Use the widened result
+ // and extract from it to do the split.
+ assert(LoVT == HiVT && "Legal non-power-of-two vector type?");
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ MVT InNVT = MVT::getVectorVT(InVT.getVectorElementType(),
+ LoVT.getVectorNumElements());
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+ DAG.getIntPtrConstant(0));
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+ DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ break;
+ }
+ }
+
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // The low and high parts of the original input give four input vectors.
+ SDValue Inputs[4];
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
+ GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
+ MVT NewVT = Inputs[0].getValueType();
+ unsigned NewElts = NewVT.getVectorNumElements();
+ assert(NewVT == Inputs[1].getValueType() &&
+ "Non power-of-two vectors not supported!");
+
+ // If Lo or Hi uses elements from at most two of the four input vectors, then
+ // express it as a vector shuffle of those two inputs. Otherwise extract the
+ // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
+ SmallVector<int, 16> Ops;
+ for (unsigned High = 0; High < 2; ++High) {
+ SDValue &Output = High ? Hi : Lo;
+
+ // Build a shuffle mask for the output, discovering on the fly which
+ // input vectors to use as shuffle operands (recorded in InputUsed).
+ // If building a suitable shuffle vector proves too hard, then bail
+ // out with useBuildVector set.
+ unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered.
+ unsigned FirstMaskIdx = High * NewElts;
+ bool useBuildVector = false;
+ for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+ // The mask element. This indexes into the input.
+ int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
+
+ // The input vector this mask element indexes into.
+ unsigned Input = (unsigned)Idx / NewElts;
+
+ if (Input >= array_lengthof(Inputs)) {
+ // The mask element does not index into any input vector.
+ Ops.push_back(-1);
+ continue;
+ }
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NewElts;
+
+ // Find or create a shuffle vector operand to hold this input.
+ unsigned OpNo;
+ for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+ if (InputUsed[OpNo] == Input) {
+ // This input vector is already an operand.
+ break;
+ } else if (InputUsed[OpNo] == -1U) {
+ // Create a new operand for this input vector.
+ InputUsed[OpNo] = Input;
+ break;
+ }
+ }
+
+ if (OpNo >= array_lengthof(InputUsed)) {
+ // More than two input vectors used! Give up on trying to create a
+ // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
+ useBuildVector = true;
+ break;
+ }
+
+ // Add the mask index for the new shuffle vector.
+ Ops.push_back(Idx + OpNo * NewElts);
+ }
+
+ if (useBuildVector) {
+ MVT EltVT = NewVT.getVectorElementType();
+ SmallVector<SDValue, 16> SVOps;
+
+ // Extract the input elements by hand.
+ for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+ // The mask element. This indexes into the input.
+ int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
+
+ // The input vector this mask element indexes into.
+ unsigned Input = (unsigned)Idx / NewElts;
+
+ if (Input >= array_lengthof(Inputs)) {
+ // The mask element is "undef" or indexes off the end of the input.
+ SVOps.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NewElts;
+
+ // Extract the vector element by hand.
+ SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ Inputs[Input], DAG.getIntPtrConstant(Idx)));
+ }
+
+ // Construct the Lo/Hi output using a BUILD_VECTOR.
+ Output = DAG.getNode(ISD::BUILD_VECTOR,dl,NewVT, &SVOps[0], SVOps.size());
+ } else if (InputUsed[0] == -1U) {
+ // No input vectors were used! The result is undefined.
+ Output = DAG.getUNDEF(NewVT);
+ } else {
+ SDValue Op0 = Inputs[InputUsed[0]];
+ // If only one input was used, use an undefined vector for the other.
+ SDValue Op1 = InputUsed[1] == -1U ?
+ DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
+ // At least one input vector was used. Create a new shuffle vector.
+ Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, &Ops[0]);
+ }
+
+ Ops.clear();
+ }
+}
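+
+// Example: splitting a v4i32 shuffle of X and Y with mask <0,4,1,5> yields
+// two v2i32 halves.  The low half's mask elements <0,4> touch only split
+// inputs 0 and 2, so it becomes a two-operand shuffle of those inputs; a
+// half whose mask touches three or more inputs takes the BUILD_VECTOR path.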
+
+void DAGTypeLegalizer::SplitVecRes_VSETCC(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ MVT LoVT, HiVT;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+ SDValue LL, LH, RL, RH;
+ GetSplitVector(N->getOperand(0), LL, LH);
+ GetSplitVector(N->getOperand(1), RL, RH);
+
+ Lo = DAG.getNode(ISD::VSETCC, dl, LoVT, LL, RL, N->getOperand(2));
+ Hi = DAG.getNode(ISD::VSETCC, dl, HiVT, LH, RH, N->getOperand(2));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Operand Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// SplitVectorOperand - This method is called when the specified operand of the
+/// specified node is found to need vector splitting. At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need legalization as well as the specified one.
+bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(cerr << "Split node operand: "; N->dump(&DAG); cerr << "\n");
+ SDValue Res = SDValue();
+
+ if (Res.getNode() == 0) {
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "SplitVectorOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to split this operator's operand!");
+ abort();
+
+ case ISD::BIT_CONVERT: Res = SplitVecOp_BIT_CONVERT(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::STORE: Res = SplitVecOp_STORE(cast<StoreSDNode>(N),
+ OpNo); break;
+
+ case ISD::CTTZ:
+ case ISD::CTLZ:
+ case ISD::CTPOP:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP: Res = SplitVecOp_UnaryOp(N); break;
+ }
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
+ // The result has a legal vector type, but the input needs splitting.
+ MVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ DebugLoc dl = N->getDebugLoc();
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ assert(Lo.getValueType() == Hi.getValueType() &&
+ "Returns legal non-power-of-two vector type?");
+ MVT InVT = Lo.getValueType();
+
+ MVT OutVT = MVT::getVectorVT(ResVT.getVectorElementType(),
+ InVT.getVectorNumElements());
+
+ Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+}
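+
+// Example: for "v4i16 = trunc v4i32 X" where only the input is split, each
+// v2i32 half of X is truncated to v2i16 and the halves are glued back
+// together with CONCAT_VECTORS to form the legal v4i16 result.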
+
+SDValue DAGTypeLegalizer::SplitVecOp_BIT_CONVERT(SDNode *N) {
+ // For example, i64 = BIT_CONVERT v4i16 on alpha. Typically the vector will
+ // end up being split all the way down to individual components. Convert the
+ // split pieces into integers and reassemble.
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ Lo = BitConvertToInteger(Lo);
+ Hi = BitConvertToInteger(Hi);
+
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+
+ return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), N->getValueType(0),
+ JoinIntegers(Lo, Hi));
+}
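+
+// Example: "i64 = bit_convert v4i16 X" with X split into two v2i16 halves:
+// each half is bit-converted to an i32, the halves are swapped on
+// big-endian targets, and JoinIntegers pastes them into the final i64.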
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+ // We know that the extracted result type is legal. For now, assume the index
+ // is a constant.
+ MVT SubVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+
+ uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ if (IdxVal < LoElts) {
+ assert(IdxVal + SubVT.getVectorNumElements() <= LoElts &&
+ "Extracted subvector crosses vector split!");
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
+ } else {
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
+ DAG.getConstant(IdxVal - LoElts, Idx.getValueType()));
+ }
+}
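+
+// Example: extracting v2i32 at constant index 6 from a v8i32 split into two
+// v4i32 halves lands entirely in Hi; the index is rebased to 6 - 4 = 2 and
+// the extract is redone on the high half.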
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ MVT VecVT = Vec.getValueType();
+
+ if (isa<ConstantSDNode>(Idx)) {
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ assert(IdxVal < VecVT.getVectorNumElements() && "Invalid vector index!");
+
+ SDValue Lo, Hi;
+ GetSplitVector(Vec, Lo, Hi);
+
+ uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+
+ if (IdxVal < LoElts)
+ return DAG.UpdateNodeOperands(SDValue(N, 0), Lo, Idx);
+ else
+ return DAG.UpdateNodeOperands(SDValue(N, 0), Hi,
+ DAG.getConstant(IdxVal - LoElts,
+ Idx.getValueType()));
+ }
+
+ // Store the vector to the stack.
+ MVT EltVT = VecVT.getVectorElementType();
+ DebugLoc dl = N->getDebugLoc();
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ const Value *SV = PseudoSourceValue::getFixedStack(SPFI);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, SV, 0);
+
+ // Load back the required element.
+ StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+ return DAG.getLoad(EltVT, dl, Store, StackPtr, SV, 0);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed store of vector?");
+ assert(OpNo == 1 && "Can only split the stored value");
+ DebugLoc dl = N->getDebugLoc();
+
+ bool isTruncating = N->isTruncatingStore();
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ int SVOffset = N->getSrcValueOffset();
+ MVT MemoryVT = N->getMemoryVT();
+ unsigned Alignment = N->getAlignment();
+ bool isVol = N->isVolatile();
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(1), Lo, Hi);
+
+ MVT LoMemVT, HiMemVT;
+ GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+
+ unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+
+ if (isTruncating)
+ Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+ LoMemVT, isVol, Alignment);
+ else
+ Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+ isVol, Alignment);
+
+ // Increment the pointer to the other half.
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getIntPtrConstant(IncrementSize));
+
+ if (isTruncating)
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
+ N->getSrcValue(), SVOffset+IncrementSize,
+ HiMemVT,
+ isVol, MinAlign(Alignment, IncrementSize));
+ else
+ Hi = DAG.getStore(Ch, dl, Hi, Ptr, N->getSrcValue(), SVOffset+IncrementSize,
+ isVol, MinAlign(Alignment, IncrementSize));
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
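+
+// Example: storing a split v8i32 becomes two v4i32 stores; the second
+// store's address is the base pointer plus IncrementSize = 4*32/8 = 16
+// bytes, and the two chains are joined with a TokenFactor.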
+
+
+//===----------------------------------------------------------------------===//
+// Result Vector Widening
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
+ DEBUG(cerr << "Widen node result " << ResNo << ": "; N->dump(&DAG);
+ cerr << "\n");
+ SDValue Res = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "WidenVectorResult #" << ResNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to widen the result of this operator!");
+ abort();
+
+ case ISD::BIT_CONVERT: Res = WidenVecRes_BIT_CONVERT(N); break;
+ case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
+ case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
+ case ISD::CONVERT_RNDSAT: Res = WidenVecRes_CONVERT_RNDSAT(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
+ case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
+ case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
+ case ISD::SELECT: Res = WidenVecRes_SELECT(N); break;
+ case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break;
+ case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break;
+ case ISD::VECTOR_SHUFFLE:
+ Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N)); break;
+ case ISD::VSETCC: Res = WidenVecRes_VSETCC(N); break;
+
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::BSWAP:
+ case ISD::FADD:
+ case ISD::FCOPYSIGN:
+ case ISD::FDIV:
+ case ISD::FMUL:
+ case ISD::FPOW:
+ case ISD::FPOWI:
+ case ISD::FREM:
+ case ISD::FSUB:
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::OR:
+ case ISD::SDIV:
+ case ISD::SREM:
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::SUB:
+ case ISD::XOR: Res = WidenVecRes_Binary(N); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: Res = WidenVecRes_Shift(N); break;
+
+ case ISD::ANY_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SIGN_EXTEND:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::ZERO_EXTEND:
+ case ISD::UINT_TO_FP: Res = WidenVecRes_Convert(N); break;
+
+ case ISD::CTLZ:
+ case ISD::CTPOP:
+ case ISD::CTTZ:
+ case ISD::FABS:
+ case ISD::FCOS:
+ case ISD::FNEG:
+ case ISD::FSIN:
+ case ISD::FSQRT: Res = WidenVecRes_Unary(N); break;
+ }
+
+ // If Res is null, the sub-method took care of registering the result.
+ if (Res.getNode())
+ SetWidenedVector(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
+ // Binary op widening.
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp1, InOp2);
+}
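+
+// Example (assuming the narrowest legal vector is v4i32): "v3i32 = add X, Y"
+// is rewritten as a v4i32 add of the widened operands; the extra lane holds
+// an undefined value that is never used.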
+
+SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
+ SDValue InOp = N->getOperand(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ MVT InVT = InOp.getValueType();
+ MVT InEltVT = InVT.getVectorElementType();
+ MVT InWidenVT = MVT::getVectorVT(InEltVT, WidenNumElts);
+
+ unsigned Opcode = N->getOpcode();
+ unsigned InVTNumElts = InVT.getVectorNumElements();
+
+ if (getTypeAction(InVT) == WidenVector) {
+ InOp = GetWidenedVector(N->getOperand(0));
+ InVT = InOp.getValueType();
+ InVTNumElts = InVT.getVectorNumElements();
+ if (InVTNumElts == WidenNumElts)
+ return DAG.getNode(Opcode, dl, WidenVT, InOp);
+ }
+
+ if (TLI.isTypeLegal(InWidenVT)) {
+ // Because the result and the input are different vector types, widening
+ // the result could create a legal type but widening the input might make
+ // it an illegal type that might lead to repeatedly splitting the input
+ // and then widening it. To avoid this, we widen the input only if
+ // it results in a legal type.
+ if (WidenNumElts % InVTNumElts == 0) {
+ // Widen the input and call convert on the widened input vector.
+ unsigned NumConcat = WidenNumElts/InVTNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ Ops[0] = InOp;
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getNode(Opcode, dl, WidenVT,
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT,
+ &Ops[0], NumConcat));
+ }
+
+ if (InVTNumElts % WidenNumElts == 0) {
+      // Extract a subvector of the input and convert the shortened vector.
+ return DAG.getNode(Opcode, dl, WidenVT,
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT,
+ InOp, DAG.getIntPtrConstant(0)));
+ }
+ }
+
+ // Otherwise unroll into some nasty scalar code and rebuild the vector.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ MVT EltVT = WidenVT.getVectorElementType();
+ unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
+ unsigned i;
+ for (i=0; i < MinElts; ++i)
+ Ops[i] = DAG.getNode(Opcode, dl, EltVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getIntPtrConstant(i)));
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i < WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
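+
+// Example: widening "v2f32 = sint_to_fp v2i32 X" to v4f32 concatenates X
+// with an undef v2i32 to form a v4i32 (when v4i32 is legal) and converts
+// that in one node; when no such legal input type exists, the convert is
+// unrolled into per-element scalar converts padded out with undefs.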
+
+SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ SDValue ShOp = N->getOperand(1);
+
+ MVT ShVT = ShOp.getValueType();
+ if (getTypeAction(ShVT) == WidenVector) {
+ ShOp = GetWidenedVector(ShOp);
+ ShVT = ShOp.getValueType();
+ }
+ MVT ShWidenVT = MVT::getVectorVT(ShVT.getVectorElementType(),
+ WidenVT.getVectorNumElements());
+ if (ShVT != ShWidenVT)
+ ShOp = ModifyToType(ShOp, ShWidenVT);
+
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
+ // Unary op widening.
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BIT_CONVERT(SDNode *N) {
+ SDValue InOp = N->getOperand(0);
+ MVT InVT = InOp.getValueType();
+ MVT VT = N->getValueType(0);
+ MVT WidenVT = TLI.getTypeToTransformTo(VT);
+ DebugLoc dl = N->getDebugLoc();
+
+ switch (getTypeAction(InVT)) {
+ default:
+ assert(false && "Unknown type action!");
+ break;
+ case Legal:
+ break;
+ case PromoteInteger:
+ // If the InOp is promoted to the same size, convert it. Otherwise,
+ // fall out of the switch and widen the promoted input.
+ InOp = GetPromotedInteger(InOp);
+ InVT = InOp.getValueType();
+ if (WidenVT.bitsEq(InVT))
+ return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, InOp);
+ break;
+ case SoftenFloat:
+ case ExpandInteger:
+ case ExpandFloat:
+ case ScalarizeVector:
+ case SplitVector:
+ break;
+ case WidenVector:
+ // If the InOp is widened to the same size, convert it. Otherwise, fall
+ // out of the switch and widen the widened input.
+ InOp = GetWidenedVector(InOp);
+ InVT = InOp.getValueType();
+ if (WidenVT.bitsEq(InVT))
+ // The input widens to the same size. Convert to the widen value.
+ return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, InOp);
+ break;
+ }
+
+ unsigned WidenSize = WidenVT.getSizeInBits();
+ unsigned InSize = InVT.getSizeInBits();
+ if (WidenSize % InSize == 0) {
+    // Determine the new input vector type.  The new input vector type will
+    // use the same element type (if it's a vector) or use the input type as
+    // a vector.  It is the same size as the type to widen to.
+ MVT NewInVT;
+ unsigned NewNumElts = WidenSize / InSize;
+ if (InVT.isVector()) {
+ MVT InEltVT = InVT.getVectorElementType();
+ NewInVT= MVT::getVectorVT(InEltVT, WidenSize / InEltVT.getSizeInBits());
+ } else {
+ NewInVT = MVT::getVectorVT(InVT, NewNumElts);
+ }
+
+ if (TLI.isTypeLegal(NewInVT)) {
+ // Because the result and the input are different vector types, widening
+ // the result could create a legal type but widening the input might make
+ // it an illegal type that might lead to repeatedly splitting the input
+ // and then widening it. To avoid this, we widen the input only if
+ // it results in a legal type.
+ SmallVector<SDValue, 16> Ops(NewNumElts);
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ Ops[0] = InOp;
+ for (unsigned i = 1; i < NewNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ SDValue NewVec;
+ if (InVT.isVector())
+ NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl,
+ NewInVT, &Ops[0], NewNumElts);
+ else
+ NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
+ NewInVT, &Ops[0], NewNumElts);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, NewVec);
+ }
+ }
+
+ // This should occur rarely. Lower the bit-convert to a store/load
+ // from the stack. Create the stack frame object. Make sure it is aligned
+ // for both the source and destination types.
+ SDValue FIPtr = DAG.CreateStackTemporary(InVT, WidenVT);
+ int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
+ const Value *SV = PseudoSourceValue::getFixedStack(FI);
+
+ // Emit a store to the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, FIPtr, SV, 0);
+
+ // Result is a load from the stack slot.
+ return DAG.getLoad(WidenVT, dl, Store, FIPtr, SV, 0);
+}
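+
+// Example: widening "v2i32 = bit_convert i64 X" to v4i32 builds the v2i64
+// vector <X, undef> (when v2i64 is legal) and bit-converts it to v4i32,
+// avoiding the store/load fallback through a stack slot.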
+
+SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ // Build a vector with undefined for the new nodes.
+ MVT VT = N->getValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT WidenVT = TLI.getTypeToTransformTo(VT);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
+ NewOps.reserve(WidenNumElts);
+ for (unsigned i = NumElts; i < WidenNumElts; ++i)
+ NewOps.push_back(DAG.getUNDEF(EltVT));
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &NewOps[0], NewOps.size());
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
+ MVT InVT = N->getOperand(0).getValueType();
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ DebugLoc dl = N->getDebugLoc();
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumOperands = N->getNumOperands();
+
+ bool InputWidened = false; // Indicates we need to widen the input.
+ if (getTypeAction(InVT) != WidenVector) {
+ if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
+ // Add undef vectors to widen to correct length.
+ unsigned NumConcat = WidenVT.getVectorNumElements() /
+ InVT.getVectorNumElements();
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ for (unsigned i=0; i < NumOperands; ++i)
+ Ops[i] = N->getOperand(i);
+ for (unsigned i = NumOperands; i != NumConcat; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &Ops[0], NumConcat);
+ }
+ } else {
+ InputWidened = true;
+ if (WidenVT == TLI.getTypeToTransformTo(InVT)) {
+      // The inputs and the result are widened to the same type.
+ unsigned i;
+ for (i=1; i < NumOperands; ++i)
+ if (N->getOperand(i).getOpcode() != ISD::UNDEF)
+ break;
+
+      if (i == NumOperands)
+ // Everything but the first operand is an UNDEF so just return the
+ // widened first operand.
+ return GetWidenedVector(N->getOperand(0));
+
+ if (NumOperands == 2) {
+ // Replace concat of two operands with a shuffle.
+ SmallVector<int, 16> MaskOps(WidenNumElts);
+ for (unsigned i=0; i < WidenNumElts/2; ++i) {
+ MaskOps[i] = i;
+ MaskOps[i+WidenNumElts/2] = i+WidenNumElts;
+ }
+ return DAG.getVectorShuffle(WidenVT, dl,
+ GetWidenedVector(N->getOperand(0)),
+ GetWidenedVector(N->getOperand(1)),
+ &MaskOps[0]);
+ }
+ }
+ }
+
+  // Fall back to extracting each element and rebuilding with BUILD_VECTOR.
+ MVT EltVT = WidenVT.getVectorElementType();
+ unsigned NumInElts = InVT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ unsigned Idx = 0;
+ for (unsigned i=0; i < NumOperands; ++i) {
+ SDValue InOp = N->getOperand(i);
+ if (InputWidened)
+ InOp = GetWidenedVector(InOp);
+ for (unsigned j=0; j < NumInElts; ++j)
+ Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getIntPtrConstant(j));
+ }
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; Idx < WidenNumElts; ++Idx)
+ Ops[Idx] = UndefVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
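+
+// Example: "v4i32 = concat_vectors v2i32 X, v2i32 Y" widened to v8i32, with
+// v2i32 still legal, pads with undef operands since 8 % 2 == 0:
+//   v8i32 = concat_vectors X, Y, undef, undef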
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ SDValue InOp = N->getOperand(0);
+ SDValue RndOp = N->getOperand(3);
+ SDValue SatOp = N->getOperand(4);
+
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ MVT InVT = InOp.getValueType();
+ MVT InEltVT = InVT.getVectorElementType();
+ MVT InWidenVT = MVT::getVectorVT(InEltVT, WidenNumElts);
+
+ SDValue DTyOp = DAG.getValueType(WidenVT);
+ SDValue STyOp = DAG.getValueType(InWidenVT);
+ ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+
+ unsigned InVTNumElts = InVT.getVectorNumElements();
+ if (getTypeAction(InVT) == WidenVector) {
+ InOp = GetWidenedVector(InOp);
+ InVT = InOp.getValueType();
+ InVTNumElts = InVT.getVectorNumElements();
+ if (InVTNumElts == WidenNumElts)
+ return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+ SatOp, CvtCode);
+ }
+
+ if (TLI.isTypeLegal(InWidenVT)) {
+ // Because the result and the input are different vector types, widening
+ // the result could create a legal type but widening the input might make
+ // it an illegal type that might lead to repeatedly splitting the input
+ // and then widening it. To avoid this, we widen the input only if
+ // it results in a legal type.
+ if (WidenNumElts % InVTNumElts == 0) {
+ // Widen the input and call convert on the widened input vector.
+ unsigned NumConcat = WidenNumElts/InVTNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ Ops[0] = InOp;
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ for (unsigned i = 1; i != NumConcat; ++i) {
+ Ops[i] = UndefVal;
+ }
+ InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, &Ops[0],NumConcat);
+ return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+ SatOp, CvtCode);
+ }
+
+ if (InVTNumElts % WidenNumElts == 0) {
+      // Extract a subvector of the input and convert the shortened vector.
+ InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp,
+ DAG.getIntPtrConstant(0));
+ return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+ SatOp, CvtCode);
+ }
+ }
+
+ // Otherwise unroll into some nasty scalar code and rebuild the vector.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ MVT EltVT = WidenVT.getVectorElementType();
+ DTyOp = DAG.getValueType(EltVT);
+ STyOp = DAG.getValueType(InEltVT);
+
+ unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
+ unsigned i;
+ for (i=0; i < MinElts; ++i) {
+ SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getIntPtrConstant(i));
+ Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp,
+ SatOp, CvtCode);
+ }
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i < WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+ MVT VT = N->getValueType(0);
+ MVT WidenVT = TLI.getTypeToTransformTo(VT);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ SDValue InOp = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ DebugLoc dl = N->getDebugLoc();
+
+ if (getTypeAction(InOp.getValueType()) == WidenVector)
+ InOp = GetWidenedVector(InOp);
+
+ MVT InVT = InOp.getValueType();
+
+ ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx);
+ if (CIdx) {
+ unsigned IdxVal = CIdx->getZExtValue();
+ // Check if we can just return the input vector after widening.
+ if (IdxVal == 0 && InVT == WidenVT)
+ return InOp;
+
+ // Check if we can extract from the vector.
+ unsigned InNumElts = InVT.getVectorNumElements();
+ if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
+ }
+
+ // We could try widening the input to the right length but for now, extract
+ // the original elements, fill the rest with undefs and build a vector.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ MVT EltVT = VT.getVectorElementType();
+ MVT IdxVT = Idx.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned i;
+ if (CIdx) {
+ unsigned IdxVal = CIdx->getZExtValue();
+ for (i=0; i < NumElts; ++i)
+ Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(IdxVal+i, IdxVT));
+ } else {
+ Ops[0] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, Idx);
+ for (i=1; i < NumElts; ++i) {
+ SDValue NewIdx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+ DAG.getConstant(i, IdxVT));
+ Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, NewIdx);
+ }
+ }
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i < WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
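+
+// Example: extracting v2i32 from a v8i32 at index 0, with the result widened
+// to v4i32: the index is a multiple of the widened length and 0 + 4 < 8, so
+// a single v4i32 EXTRACT_SUBVECTOR of the input suffices; otherwise the
+// elements are pulled out one at a time and rebuilt with BUILD_VECTOR.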
+
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, N->getDebugLoc(),
+ InOp.getValueType(), InOp,
+ N->getOperand(1), N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ MVT WidenVT = TLI.getTypeToTransformTo(LD->getValueType(0));
+ MVT LdVT = LD->getMemoryVT();
+ DebugLoc dl = N->getDebugLoc();
+ assert(LdVT.isVector() && WidenVT.isVector());
+
+ // Load information
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ int SVOffset = LD->getSrcValueOffset();
+ unsigned Align = LD->getAlignment();
+ bool isVolatile = LD->isVolatile();
+ const Value *SV = LD->getSrcValue();
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+
+ SDValue Result;
+  SmallVector<SDValue, 16> LdChain; // Chain for the series of loads
+ if (ExtType != ISD::NON_EXTLOAD) {
+    // For extension loads, we cannot play the trick of chopping into legal
+    // vector types and bit-casting to the right type.  Instead, we unroll
+    // the load and build a vector.
+ MVT EltVT = WidenVT.getVectorElementType();
+ MVT LdEltVT = LdVT.getVectorElementType();
+ unsigned NumElts = LdVT.getVectorNumElements();
+
+ // Load each element and widen
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ unsigned Increment = LdEltVT.getSizeInBits() / 8;
+ Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, SV, SVOffset,
+ LdEltVT, isVolatile, Align);
+ LdChain.push_back(Ops[0].getValue(1));
+ unsigned i = 0, Offset = Increment;
+ for (i=1; i < NumElts; ++i, Offset += Increment) {
+ SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+ BasePtr, DAG.getIntPtrConstant(Offset));
+ Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, SV,
+ SVOffset + Offset, LdEltVT, isVolatile, Align);
+ LdChain.push_back(Ops[i].getValue(1));
+ }
+
+ // Fill the rest with undefs
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i != WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ Result = DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], Ops.size());
+ } else {
+ assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
+ unsigned int LdWidth = LdVT.getSizeInBits();
+ Result = GenWidenVectorLoads(LdChain, Chain, BasePtr, SV, SVOffset,
+ Align, isVolatile, LdWidth, WidenVT, dl);
+ }
+
+ // If we generate a single load, we can use that for the chain. Otherwise,
+ // build a factor node to remember the multiple loads are independent and
+ // chain to that.
+ SDValue NewChain;
+ if (LdChain.size() == 1)
+ NewChain = LdChain[0];
+ else
+ NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &LdChain[0],
+ LdChain.size());
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+  ReplaceValueWith(SDValue(N, 1), NewChain);
+
+ return Result;
+}
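+
+// Example: an extending load of v2i16 widened to v4i32 is unrolled into two
+// i16 extloads two bytes apart, padded with undefs, and rebuilt with
+// BUILD_VECTOR; a plain load goes through GenWidenVectorLoads instead.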
+
+SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, N->getDebugLoc(),
+ WidenVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SDValue Cond1 = N->getOperand(0);
+ MVT CondVT = Cond1.getValueType();
+ if (CondVT.isVector()) {
+ MVT CondEltVT = CondVT.getVectorElementType();
+ MVT CondWidenVT = MVT::getVectorVT(CondEltVT, WidenNumElts);
+ if (getTypeAction(CondVT) == WidenVector)
+ Cond1 = GetWidenedVector(Cond1);
+
+ if (Cond1.getValueType() != CondWidenVT)
+ Cond1 = ModifyToType(Cond1, CondWidenVT);
+ }
+
+ SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(2));
+ assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
+ return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+ WidenVT, Cond1, InOp1, InOp2);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
+ SDValue InOp1 = GetWidenedVector(N->getOperand(2));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+ InOp1.getValueType(), N->getOperand(0),
+ N->getOperand(1), InOp1, InOp2, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ return DAG.getUNDEF(WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ MVT WidenVT = TLI.getTypeToTransformTo(VT);
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+ // Adjust mask based on new input vector length.
+ SmallVector<int, 16> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = N->getMaskElt(i);
+ if (Idx < (int)NumElts)
+ NewMask.push_back(Idx);
+ else
+ NewMask.push_back(Idx - NumElts + WidenNumElts);
+ }
+ for (unsigned i = NumElts; i != WidenNumElts; ++i)
+ NewMask.push_back(-1);
+ return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, &NewMask[0]);
+}
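+
+// Example: widening a v2i32 shuffle with mask <1,2> to v4i32 remaps mask
+// element 2 (which points into the second input) to 2 - 2 + 4 = 4 and pads
+// with -1, giving <1,4,-1,-1> over the two widened inputs.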
+
+SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
+ MVT WidenVT = TLI.getTypeToTransformTo(N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SDValue InOp1 = N->getOperand(0);
+ MVT InVT = InOp1.getValueType();
+  assert(InVT.isVector() && "Cannot widen a non-vector type");
+ MVT WidenInVT = MVT::getVectorVT(InVT.getVectorElementType(), WidenNumElts);
+ InOp1 = GetWidenedVector(InOp1);
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+  // Assume that the input and output will be widened appropriately.  If not,
+  // we will have to unroll it at some point.
+ assert(InOp1.getValueType() == WidenInVT &&
+ InOp2.getValueType() == WidenInVT &&
+ "Input not widened to expected type!");
+ return DAG.getNode(ISD::VSETCC, N->getDebugLoc(),
+ WidenVT, InOp1, InOp2, N->getOperand(2));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Widen Vector Operand
+//===----------------------------------------------------------------------===//
+bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(cerr << "Widen node operand " << OpNo << ": "; N->dump(&DAG);
+ cerr << "\n");
+ SDValue Res = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ cerr << "WidenVectorOperand op #" << ResNo << ": ";
+ N->dump(&DAG); cerr << "\n";
+#endif
+ assert(0 && "Do not know how to widen this operator's operand!");
+ abort();
+
+ case ISD::BIT_CONVERT: Res = WidenVecOp_BIT_CONVERT(N); break;
+ case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break;
+ case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::STORE: Res = WidenVecOp_STORE(N); break;
+
+ case ISD::FP_ROUND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP: Res = WidenVecOp_Convert(N); break;
+ }
+
+ // If Res is null, the sub-method took care of registering the result.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
+  // Since the result is legal and the input is illegal, it is unlikely that
+  // we can fix the input to a legal type, so unroll the convert into some
+  // scalar code and create a nasty build vector.
+ MVT VT = N->getValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ DebugLoc dl = N->getDebugLoc();
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue InOp = N->getOperand(0);
+ if (getTypeAction(InOp.getValueType()) == WidenVector)
+ InOp = GetWidenedVector(InOp);
+ MVT InVT = InOp.getValueType();
+ MVT InEltVT = InVT.getVectorElementType();
+
+ unsigned Opcode = N->getOpcode();
+ SmallVector<SDValue, 16> Ops(NumElts);
+ for (unsigned i=0; i < NumElts; ++i)
+ Ops[i] = DAG.getNode(Opcode, dl, EltVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getIntPtrConstant(i)));
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
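+
+// Example: "v2i32 = fp_to_sint v2f32 X" where X was widened to v4f32 is
+// unrolled into two scalar fp_to_sint nodes on the first two lanes and
+// rebuilt as a v2i32 BUILD_VECTOR; no attempt is made to legalize the
+// widened convert directly.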
+
+SDValue DAGTypeLegalizer::WidenVecOp_BIT_CONVERT(SDNode *N) {
+ MVT VT = N->getValueType(0);
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ MVT InWidenVT = InOp.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+ // Check if we can convert between two legal vector types and extract.
+ unsigned InWidenSize = InWidenVT.getSizeInBits();
+ unsigned Size = VT.getSizeInBits();
+ if (InWidenSize % Size == 0 && !VT.isVector()) {
+ unsigned NewNumElts = InWidenSize / Size;
+ MVT NewVT = MVT::getVectorVT(VT, NewNumElts);
+ if (TLI.isTypeLegal(NewVT)) {
+ SDValue BitOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, InOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
+ DAG.getIntPtrConstant(0));
+ }
+ }
+
+ // Lower the bit-convert to a store/load from the stack. Create the stack
+ // frame object. Make sure it is aligned for both the source and destination
+ // types.
+ SDValue FIPtr = DAG.CreateStackTemporary(InWidenVT, VT);
+ int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
+ const Value *SV = PseudoSourceValue::getFixedStack(FI);
+
+ // Emit a store to the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, FIPtr, SV, 0);
+
+ // Result is a load from the stack slot.
+ return DAG.getLoad(VT, dl, Store, FIPtr, SV, 0);
+}
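+
+// Example: "i32 = bit_convert v2i16 X" where X is widened to v8i16: the
+// widened vector is bit-converted to v4i32 (when that type is legal) and
+// element 0, the lanes holding the original 32 bits, is extracted;
+// otherwise the value takes a round trip through a stack slot.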
+
+SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
+ // If the input vector is not legal, it is likely that we will not find a
+  // legal vector of the same size.  Replace the concatenation with a nasty
+  // build vector.
+ MVT VT = N->getValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ DebugLoc dl = N->getDebugLoc();
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(NumElts);
+
+ MVT InVT = N->getOperand(0).getValueType();
+ unsigned NumInElts = InVT.getVectorNumElements();
+
+ unsigned Idx = 0;
+ unsigned NumOperands = N->getNumOperands();
+ for (unsigned i=0; i < NumOperands; ++i) {
+ SDValue InOp = N->getOperand(i);
+ if (getTypeAction(InOp.getValueType()) == WidenVector)
+ InOp = GetWidenedVector(InOp);
+ for (unsigned j=0; j < NumInElts; ++j)
+ Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getIntPtrConstant(j));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ MVT EltVT = InOp.getValueType().getVectorElementType();
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+ EltVT, InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
+  // We have to widen the value, but we only want to store the original
+  // vector type.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ const Value *SV = ST->getSrcValue();
+ int SVOffset = ST->getSrcValueOffset();
+ unsigned Align = ST->getAlignment();
+ bool isVolatile = ST->isVolatile();
+ SDValue ValOp = GetWidenedVector(ST->getValue());
+ DebugLoc dl = N->getDebugLoc();
+
+ MVT StVT = ST->getMemoryVT();
+ MVT ValVT = ValOp.getValueType();
+  // It must be the case that the widened vector type is bigger than the
+  // memory type we need to store.
+ assert(StVT.isVector() && ValOp.getValueType().isVector());
+ assert(StVT.bitsLT(ValOp.getValueType()));
+
+ SmallVector<SDValue, 16> StChain;
+ if (ST->isTruncatingStore()) {
+    // For truncating stores, we cannot play the trick of chopping into legal
+    // vector types and bit-casting to the right type.  Instead, we unroll
+    // the store.
+ MVT StEltVT = StVT.getVectorElementType();
+ MVT ValEltVT = ValVT.getVectorElementType();
+ unsigned Increment = ValEltVT.getSizeInBits() / 8;
+ unsigned NumElts = StVT.getVectorNumElements();
+ SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+ DAG.getIntPtrConstant(0));
+ StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, SV,
+ SVOffset, StEltVT,
+ isVolatile, Align));
+ unsigned Offset = Increment;
+ for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
+ SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+ BasePtr, DAG.getIntPtrConstant(Offset));
+      SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                                DAG.getIntPtrConstant(i));
+ StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr, SV,
+ SVOffset + Offset, StEltVT,
+ isVolatile, MinAlign(Align, Offset)));
+ }
+ }
+ else {
+ assert(StVT.getVectorElementType() == ValVT.getVectorElementType());
+ // Store value
+ GenWidenVectorStores(StChain, Chain, BasePtr, SV, SVOffset,
+ Align, isVolatile, ValOp, StVT.getSizeInBits(), dl);
+ }
+ if (StChain.size() == 1)
+ return StChain[0];
+ else
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                       &StChain[0], StChain.size());
+}
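+
+// Example: a non-truncating store of a v3i32 value widened to v4i32 writes
+// only 96 bits: GenWidenVectorStores emits, say, an i64 store of the first
+// two lanes (assuming v2i64 is legal) followed by an i32 store of the third
+// lane, never touching the widened fourth lane.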
+
+//===----------------------------------------------------------------------===//
+// Vector Widening Utilities
+//===----------------------------------------------------------------------===//
+
+// Utility function to find a legal vector type and its associated element
+// type, given a preferred element width; the resulting vector type must be
+// the same size as VecVT.
+// TLI:   Target lowering used to determine legal types.
+// Width: Preferred width of the element type to load or store.
+// VecVT: Vector value type whose size must be matched.
+// Returns NewVecVT and NewEltVT - the vector type and its associated
+// element type.
+static void FindAssocWidenVecType(const TargetLowering &TLI, unsigned Width,
+ MVT VecVT,
+ MVT& NewEltVT, MVT& NewVecVT) {
+ unsigned EltWidth = Width + 1;
+ if (TLI.isTypeLegal(VecVT)) {
+    // We start with the preferred width, round it down to a power of 2, and
+    // look for a legal vector type of that width.  If that fails, we halve
+    // the width and try again.  Since the incoming type is legal, this
+    // process must terminate: a vector of the smallest loadable type should
+    // always be legal.
+ do {
+ assert(EltWidth > 0);
+ EltWidth = 1 << Log2_32(EltWidth - 1);
+ NewEltVT = MVT::getIntegerVT(EltWidth);
+ unsigned NumElts = VecVT.getSizeInBits() / EltWidth;
+ NewVecVT = MVT::getVectorVT(NewEltVT, NumElts);
+ } while (!TLI.isTypeLegal(NewVecVT) ||
+ VecVT.getSizeInBits() != NewVecVT.getSizeInBits());
+ } else {
+    // The incoming vector type is illegal and is the result of widening
+    // a vector to a power of 2.  In this case, we will use the preferred
+    // width as long as it is a multiple of the incoming vector length.
+    // The legalization process will eventually make this into a legal type
+    // and remove the illegal bit converts (which would turn into stack
+    // converts if they were allowed to exist).
+ do {
+ assert(EltWidth > 0);
+ EltWidth = 1 << Log2_32(EltWidth - 1);
+ NewEltVT = MVT::getIntegerVT(EltWidth);
+ unsigned NumElts = VecVT.getSizeInBits() / EltWidth;
+ NewVecVT = MVT::getVectorVT(NewEltVT, NumElts);
+ } while (!TLI.isTypeLegal(NewEltVT) ||
+ VecVT.getSizeInBits() != NewVecVT.getSizeInBits());
+ }
+}
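+
+// Example: FindAssocWidenVecType(TLI, 96, v4i32, ...) first tries element
+// width 1 << Log2_32(96) == 64; if v2i64 is legal and 128 bits wide, it
+// returns NewEltVT = i64 and NewVecVT = v2i64, otherwise it keeps halving
+// the width (i32, i16, ...) until a matching legal vector type is found.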
+
+SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
+ SDValue Chain,
+ SDValue BasePtr,
+ const Value *SV,
+ int SVOffset,
+ unsigned Alignment,
+ bool isVolatile,
+ unsigned LdWidth,
+ MVT ResType,
+ DebugLoc dl) {
+  // The strategy assumes that we can efficiently load power-of-two widths.
+  // The routine chops the vector into loads of the largest power-of-2 width
+  // that can be inserted into a legal vector, and then casts the result
+  // into the vector type we want.  This avoids unnecessary stack converts.
+
+  // TODO: If LdWidth is legal, the alignment is the same as LdWidth, and
+  // the load is non-volatile, we can use a wider load for the value.
+
+  // Find a vector type we can load from.
+ MVT NewEltVT, NewVecVT;
+ unsigned NewEltVTWidth;
+ FindAssocWidenVecType(TLI, LdWidth, ResType, NewEltVT, NewVecVT);
+ NewEltVTWidth = NewEltVT.getSizeInBits();
+
+ SDValue LdOp = DAG.getLoad(NewEltVT, dl, Chain, BasePtr, SV, SVOffset,
+ isVolatile, Alignment);
+ SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+ LdChain.push_back(LdOp.getValue(1));
+
+ // Check if we can load the element with one instruction
+ if (LdWidth == NewEltVTWidth) {
+ return DAG.getNode(ISD::BIT_CONVERT, dl, ResType, VecOp);
+ }
+
+ unsigned Idx = 1;
+ LdWidth -= NewEltVTWidth;
+ unsigned Offset = 0;
+
+ while (LdWidth > 0) {
+ unsigned Increment = NewEltVTWidth / 8;
+ Offset += Increment;
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getIntPtrConstant(Increment));
+
+ if (LdWidth < NewEltVTWidth) {
+      // The type we are currently using is too large; switch to the next
+      // smaller power-of-2 width.
+ unsigned oNewEltVTWidth = NewEltVTWidth;
+ FindAssocWidenVecType(TLI, LdWidth, ResType, NewEltVT, NewVecVT);
+ NewEltVTWidth = NewEltVT.getSizeInBits();
+      // Readjust the insertion index and the vector for the new load type.
+ Idx = Idx * (oNewEltVTWidth/NewEltVTWidth);
+ VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
+ }
+
+ SDValue LdOp = DAG.getLoad(NewEltVT, dl, Chain, BasePtr, SV,
+ SVOffset+Offset, isVolatile,
+ MinAlign(Alignment, Offset));
+ LdChain.push_back(LdOp.getValue(1));
+ VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOp,
+ DAG.getIntPtrConstant(Idx++));
+
+ LdWidth -= NewEltVTWidth;
+ }
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, ResType, VecOp);
+}
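+
+// Example: loading 96 bits into a widened v4i32 (assuming v2i64 is legal)
+// starts with an i64 load inserted into a v2i64 via SCALAR_TO_VECTOR, then
+// bit-converts to v4i32 and inserts an i32 load of the remaining 32 bits at
+// index 2 before the final bit_convert to the result type.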
+
+void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
+ SDValue Chain,
+ SDValue BasePtr,
+ const Value *SV,
+ int SVOffset,
+ unsigned Alignment,
+ bool isVolatile,
+ SDValue ValOp,
+ unsigned StWidth,
+ DebugLoc dl) {
+  // Break the store into a series of power-of-2-width stores.  For each
+  // width, we convert the vector to a vector of the element size that we
+  // want to store.  This avoids requiring a stack convert.
+
+ // Find a width of the element type we can store with
+ MVT WidenVT = ValOp.getValueType();
+ MVT NewEltVT, NewVecVT;
+
+ FindAssocWidenVecType(TLI, StWidth, WidenVT, NewEltVT, NewVecVT);
+ unsigned NewEltVTWidth = NewEltVT.getSizeInBits();
+
+ SDValue VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, ValOp);
+ SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewEltVT, VecOp,
+ DAG.getIntPtrConstant(0));
+ SDValue StOp = DAG.getStore(Chain, dl, EOp, BasePtr, SV, SVOffset,
+ isVolatile, Alignment);
+ StChain.push_back(StOp);
+
+ // Check if we are done
+ if (StWidth == NewEltVTWidth) {
+ return;
+ }
+
+ unsigned Idx = 1;
+ StWidth -= NewEltVTWidth;
+ unsigned Offset = 0;
+
+ while (StWidth > 0) {
+ unsigned Increment = NewEltVTWidth / 8;
+ Offset += Increment;
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getIntPtrConstant(Increment));
+
+ if (StWidth < NewEltVTWidth) {
+      // The type we are currently using is too large; switch to the next
+      // smaller power-of-2 width.
+ unsigned oNewEltVTWidth = NewEltVTWidth;
+ FindAssocWidenVecType(TLI, StWidth, WidenVT, NewEltVT, NewVecVT);
+ NewEltVTWidth = NewEltVT.getSizeInBits();
+      // Readjust the extraction index and the vector for the new store type.
+ Idx = Idx * (oNewEltVTWidth/NewEltVTWidth);
+ VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVecVT, VecOp);
+ }
+
+ EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewEltVT, VecOp,
+ DAG.getIntPtrConstant(Idx++));
+ StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, SV,
+ SVOffset + Offset, isVolatile,
+ MinAlign(Alignment, Offset)));
+ StWidth -= NewEltVTWidth;
+ }
+}
+
+/// Modifies a vector input (widens or narrows) to a vector of NVT.  The
+/// input vector must have the same element type as NVT.
+SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, MVT NVT) {
+  // Note that InOp might have been widened, so it might already have
+  // the right width or it might need to be narrowed.
+ MVT InVT = InOp.getValueType();
+ assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+ "input and widen element type must match");
+ DebugLoc dl = InOp.getDebugLoc();
+
+ // Check if InOp already has the right width.
+ if (InVT == NVT)
+ return InOp;
+
+ unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned WidenNumElts = NVT.getVectorNumElements();
+ if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
+ unsigned NumConcat = WidenNumElts / InNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ Ops[0] = InOp;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = UndefVal;
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, &Ops[0], NumConcat);
+ }
+
+ if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+ DAG.getIntPtrConstant(0));
+
+ // Fall back to extract and build.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ MVT EltVT = NVT.getVectorElementType();
+ unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
+ unsigned Idx;
+ for (Idx = 0; Idx < MinNumElts; ++Idx)
+ Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getIntPtrConstant(Idx));
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for ( ; Idx < WidenNumElts; ++Idx)
+ Ops[Idx] = UndefVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &Ops[0], WidenNumElts);
+}
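+
+// Example: ModifyToType(v2i32 X, v4i32) concatenates X with an undef v2i32;
+// in the narrowing direction, a wider input is either trimmed with
+// EXTRACT_SUBVECTOR or rebuilt element by element when the lane counts do
+// not line up.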
diff --git a/lib/CodeGen/SelectionDAG/Makefile b/lib/CodeGen/SelectionDAG/Makefile
new file mode 100644
index 0000000..185222a
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/Makefile
@@ -0,0 +1,15 @@
+##===- lib/CodeGen/SelectionDAG/Makefile -------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMSelectionDAG
+PARALLEL_DIRS =
+BUILD_ARCHIVE = 1
+DONT_BUILD_RELINKED = 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
new file mode 100644
index 0000000..af73b28
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -0,0 +1,635 @@
+//===----- ScheduleDAGFast.cpp - Fast poor list scheduler -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a fast scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+STATISTIC(NumUnfolds, "Number of nodes unfolded");
+STATISTIC(NumDups, "Number of duplicated nodes");
+STATISTIC(NumPRCopies, "Number of physical copies");
+
+static RegisterScheduler
+ fastDAGScheduler("fast", "Fast suboptimal list scheduling",
+ createFastDAGScheduler);
+
+namespace {
+ /// FastPriorityQueue - A degenerate priority queue that considers
+ /// all nodes to have the same priority.
+ ///
+ struct VISIBILITY_HIDDEN FastPriorityQueue {
+ SmallVector<SUnit *, 16> Queue;
+
+ bool empty() const { return Queue.empty(); }
+
+ void push(SUnit *U) {
+ Queue.push_back(U);
+ }
+
+ SUnit *pop() {
+ if (empty()) return NULL;
+ SUnit *V = Queue.back();
+ Queue.pop_back();
+ return V;
+ }
+ };
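+
+  // Note: this is effectively a LIFO stack; pop() returns the most recently
+  // pushed SUnit, so "priority" here is just reverse insertion order.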
+
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGFast - The actual "fast" list scheduler implementation.
+///
+class VISIBILITY_HIDDEN ScheduleDAGFast : public ScheduleDAGSDNodes {
+private:
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ FastPriorityQueue AvailableQueue;
+
+  /// LiveRegDefs - A set of physical registers and the SUnits that define
+  /// them which are currently "live".  These defining nodes must be
+  /// scheduled before any other node that modifies one of these registers.
+ unsigned NumLiveRegs;
+ std::vector<SUnit*> LiveRegDefs;
+ std::vector<unsigned> LiveRegCycles;
+
+public:
+ ScheduleDAGFast(MachineFunction &mf)
+ : ScheduleDAGSDNodes(mf) {}
+
+ void Schedule();
+
+  /// AddPred - adds a predecessor edge to SUnit SU.
+ void AddPred(SUnit *SU, const SDep &D) {
+ SU->addPred(D);
+ }
+
+  /// RemovePred - removes a predecessor edge from SUnit SU.
+ void RemovePred(SUnit *SU, const SDep &D) {
+ SU->removePred(D);
+ }
+
+private:
+ void ReleasePred(SUnit *SU, SDep *PredEdge);
+ void ReleasePredecessors(SUnit *SU, unsigned CurCycle);
+ void ScheduleNodeBottomUp(SUnit*, unsigned);
+ SUnit *CopyAndMoveSuccessors(SUnit*);
+ void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+ const TargetRegisterClass*,
+ const TargetRegisterClass*,
+ SmallVector<SUnit*, 2>&);
+ bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&);
+ void ListScheduleBottomUp();
+
+ /// ForceUnitLatencies - The fast scheduler doesn't care about real latencies.
+ bool ForceUnitLatencies() const { return true; }
+};
+} // end anonymous namespace
+
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGFast::Schedule() {
+ DOUT << "********** List Scheduling **********\n";
+
+ NumLiveRegs = 0;
+ LiveRegDefs.resize(TRI->getNumRegs(), NULL);
+ LiveRegCycles.resize(TRI->getNumRegs(), 0);
+
+ // Build the scheduling graph.
+ BuildSchedGraph();
+
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+
+ // Execute the actual scheduling loop.
+ ListScheduleBottomUp();
+}
+
+//===----------------------------------------------------------------------===//
+// Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+  // NumSuccsLeft is unsigned, so check for over-release before decrementing
+  // rather than testing for a negative count afterwards.
+  if (PredSU->NumSuccsLeft == 0) {
+    cerr << "*** Scheduling failed! ***\n";
+    PredSU->dump(this);
+    cerr << " has been released too many times!\n";
+    assert(0);
+  }
+#endif
+  --PredSU->NumSuccsLeft;
+
+ // If all the node's successors are scheduled, this node is ready
+ // to be scheduled. Ignore the special EntrySU node.
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+ PredSU->isAvailable = true;
+ AvailableQueue.push(PredSU);
+ }
+}
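+
+// Example: if node A is the only unscheduled successor of node B, then
+// scheduling A drives B's NumSuccsLeft to zero here and B is pushed onto
+// the available queue (bottom-up: successors are scheduled before their
+// predecessors).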
+
+void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
+ // Bottom up: release predecessors
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ ReleasePred(SU, &*I);
+ if (I->isAssignedRegDep()) {
+ // This is a physical register dependency and it's impossible or
+ // expensive to copy the register. Make sure nothing that can
+ // clobber the register is scheduled between the predecessor and
+ // this node.
+ if (!LiveRegDefs[I->getReg()]) {
+ ++NumLiveRegs;
+ LiveRegDefs[I->getReg()] = I->getSUnit();
+ LiveRegCycles[I->getReg()] = CurCycle;
+ }
+ }
+ }
+}
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
+ DOUT << "*** Scheduling [" << CurCycle << "]: ";
+ DEBUG(SU->dump(this));
+
+ assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
+ SU->setHeightToAtLeast(CurCycle);
+ Sequence.push_back(SU);
+
+ ReleasePredecessors(SU, CurCycle);
+
+ // Release all the implicit physical register defs that are live.
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isAssignedRegDep()) {
+ if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ assert(LiveRegDefs[I->getReg()] == SU &&
+ "Physical register dependency violated?");
+ --NumLiveRegs;
+ LiveRegDefs[I->getReg()] = NULL;
+ LiveRegCycles[I->getReg()] = 0;
+ }
+ }
+ }
+
+ SU->isScheduled = true;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
+  SDNode *N = SU->getNode();
+  if (!N)
+    return NULL;
+
+  if (N->getFlaggedNode())
+    return NULL;
+
+ SUnit *NewSU;
+ bool TryUnfold = false;
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getValueType(i);
+ if (VT == MVT::Flag)
+ return NULL;
+ else if (VT == MVT::Other)
+ TryUnfold = true;
+ }
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ const SDValue &Op = N->getOperand(i);
+ MVT VT = Op.getNode()->getValueType(Op.getResNo());
+ if (VT == MVT::Flag)
+ return NULL;
+ }
+
+ if (TryUnfold) {
+ SmallVector<SDNode*, 2> NewNodes;
+ if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+ return NULL;
+
+ DOUT << "Unfolding SU # " << SU->NodeNum << "\n";
+ assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+ N = NewNodes[1];
+ SDNode *LoadNode = NewNodes[0];
+ unsigned NumVals = N->getNumValues();
+ unsigned OldNumVals = SU->getNode()->getNumValues();
+ for (unsigned i = 0; i != NumVals; ++i)
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+ SDValue(LoadNode, 1));
+
+ SUnit *NewSU = NewSUnit(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NewSU->NodeNum);
+
+ const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+ for (unsigned i = 0; i != TID.getNumOperands(); ++i) {
+ if (TID.getOperandConstraint(i, TOI::TIED_TO) != -1) {
+ NewSU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (TID.isCommutable())
+ NewSU->isCommutable = true;
+
+    // LoadNode may already exist. This can happen when there is another
+    // load from the same location that produces the same type of value
+    // but with different alignment or volatility.
+ bool isNewLoad = true;
+ SUnit *LoadSU;
+ if (LoadNode->getNodeId() != -1) {
+ LoadSU = &SUnits[LoadNode->getNodeId()];
+ isNewLoad = false;
+ } else {
+ LoadSU = NewSUnit(LoadNode);
+ LoadNode->setNodeId(LoadSU->NodeNum);
+ }
+
+ SDep ChainPred;
+ SmallVector<SDep, 4> ChainSuccs;
+ SmallVector<SDep, 4> LoadPreds;
+ SmallVector<SDep, 4> NodePreds;
+ SmallVector<SDep, 4> NodeSuccs;
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl())
+ ChainPred = *I;
+ else if (I->getSUnit()->getNode() &&
+ I->getSUnit()->getNode()->isOperandOf(LoadNode))
+ LoadPreds.push_back(*I);
+ else
+ NodePreds.push_back(*I);
+ }
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isCtrl())
+ ChainSuccs.push_back(*I);
+ else
+ NodeSuccs.push_back(*I);
+ }
+
+ if (ChainPred.getSUnit()) {
+ RemovePred(SU, ChainPred);
+ if (isNewLoad)
+ AddPred(LoadSU, ChainPred);
+ }
+ for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
+ const SDep &Pred = LoadPreds[i];
+ RemovePred(SU, Pred);
+ if (isNewLoad) {
+ AddPred(LoadSU, Pred);
+ }
+ }
+ for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
+ const SDep &Pred = NodePreds[i];
+ RemovePred(SU, Pred);
+ AddPred(NewSU, Pred);
+ }
+ for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
+ SDep D = NodeSuccs[i];
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ D.setSUnit(NewSU);
+ AddPred(SuccDep, D);
+ }
+ for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
+ SDep D = ChainSuccs[i];
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ if (isNewLoad) {
+ D.setSUnit(LoadSU);
+ AddPred(SuccDep, D);
+ }
+ }
+ if (isNewLoad) {
+ AddPred(NewSU, SDep(LoadSU, SDep::Order, LoadSU->Latency));
+ }
+
+ ++NumUnfolds;
+
+ if (NewSU->NumSuccsLeft == 0) {
+ NewSU->isAvailable = true;
+ return NewSU;
+ }
+ SU = NewSU;
+ }
+
+ DOUT << "Duplicating SU # " << SU->NodeNum << "\n";
+ NewSU = Clone(SU);
+
+ // New SUnit has the exact same predecessors.
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I)
+ if (!I->isArtificial())
+ AddPred(NewSU, *I);
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isArtificial())
+ continue;
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = *I;
+ D.setSUnit(NewSU);
+ AddPred(SuccSU, D);
+ D.setSUnit(SU);
+ DelDeps.push_back(std::make_pair(SuccSU, D));
+ }
+ }
+ for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+ RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+ ++NumDups;
+ return NewSU;
+}
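+
+// Illustrative sketch (not from the original source): the unfold path above
+// splits a load-folded instruction such as a hypothetical (add x, [mem])
+// back into
+//   t = load [mem]
+//   add x, t
+// so that LoadSU and NewSU become separately schedulable units, with an
+// order edge keeping the operation after the load.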
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC,
+ SmallVector<SUnit*, 2> &Copies) {
+ SUnit *CopyFromSU = NewSUnit(static_cast<SDNode *>(NULL));
+ CopyFromSU->CopySrcRC = SrcRC;
+ CopyFromSU->CopyDstRC = DestRC;
+
+ SUnit *CopyToSU = NewSUnit(static_cast<SDNode *>(NULL));
+ CopyToSU->CopySrcRC = DestRC;
+ CopyToSU->CopyDstRC = SrcRC;
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isArtificial())
+ continue;
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = *I;
+ D.setSUnit(CopyToSU);
+ AddPred(SuccSU, D);
+ DelDeps.push_back(std::make_pair(SuccSU, *I));
+ }
+ }
+ for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) {
+ RemovePred(DelDeps[i].first, DelDeps[i].second);
+ }
+
+ AddPred(CopyFromSU, SDep(SU, SDep::Data, SU->Latency, Reg));
+ AddPred(CopyToSU, SDep(CopyFromSU, SDep::Data, CopyFromSU->Latency, 0));
+
+ Copies.push_back(CopyFromSU);
+ Copies.push_back(CopyToSU);
+
+ ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+ const TargetInstrInfo *TII) {
+ const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+ assert(TID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+ unsigned NumRes = TID.getNumDefs();
+ for (const unsigned *ImpDef = TID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+ if (Reg == *ImpDef)
+ break;
+ ++NumRes;
+ }
+ return N->getValueType(NumRes);
+}
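+
+// Illustrative sketch (hypothetical registers): for an instruction with one
+// explicit def and the implicit def list {RA, RB}, asking for RB starts at
+// NumRes = 1, walks past RA (NumRes = 2), and returns N->getValueType(2),
+// the value type at RB's position among the node's results.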
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specified node is the last one that's available to schedule, the
+/// caller does whatever is necessary (i.e. duplication or copy insertion) to
+/// make scheduling it possible.
+bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
+ SmallVector<unsigned, 4> &LRegs){
+ if (NumLiveRegs == 0)
+ return false;
+
+ SmallSet<unsigned, 4> RegAdded;
+ // If this node would clobber any "live" register, then it's not ready.
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isAssignedRegDep()) {
+ unsigned Reg = I->getReg();
+ if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != I->getSUnit()) {
+ if (RegAdded.insert(Reg))
+ LRegs.push_back(Reg);
+ }
+ for (const unsigned *Alias = TRI->getAliasSet(Reg);
+ *Alias; ++Alias)
+ if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != I->getSUnit()) {
+ if (RegAdded.insert(*Alias))
+ LRegs.push_back(*Alias);
+ }
+ }
+ }
+
+ for (SDNode *Node = SU->getNode(); Node; Node = Node->getFlaggedNode()) {
+ if (!Node->isMachineOpcode())
+ continue;
+ const TargetInstrDesc &TID = TII->get(Node->getMachineOpcode());
+ if (!TID.ImplicitDefs)
+ continue;
+ for (const unsigned *Reg = TID.ImplicitDefs; *Reg; ++Reg) {
+ if (LiveRegDefs[*Reg] && LiveRegDefs[*Reg] != SU) {
+ if (RegAdded.insert(*Reg))
+ LRegs.push_back(*Reg);
+ }
+ for (const unsigned *Alias = TRI->getAliasSet(*Reg);
+ *Alias; ++Alias)
+ if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) {
+ if (RegAdded.insert(*Alias))
+ LRegs.push_back(*Alias);
+ }
+ }
+ }
+ return !LRegs.empty();
+}
+
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGFast::ListScheduleBottomUp() {
+ unsigned CurCycle = 0;
+
+ // Release any predecessors of the special Exit node.
+ ReleasePredecessors(&ExitSU, CurCycle);
+
+ // Add root to Available queue.
+ if (!SUnits.empty()) {
+ SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+ assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+ RootSU->isAvailable = true;
+ AvailableQueue.push(RootSU);
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ SmallVector<SUnit*, 4> NotReady;
+ DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMap;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue.empty()) {
+ bool Delayed = false;
+ LRegsMap.clear();
+ SUnit *CurSU = AvailableQueue.pop();
+ while (CurSU) {
+ SmallVector<unsigned, 4> LRegs;
+ if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+ break;
+ Delayed = true;
+ LRegsMap.insert(std::make_pair(CurSU, LRegs));
+
+ CurSU->isPending = true; // This SU is not in AvailableQueue right now.
+ NotReady.push_back(CurSU);
+ CurSU = AvailableQueue.pop();
+ }
+
+ // All candidates are delayed due to live physical reg dependencies.
+ // Try code duplication or inserting cross class copies
+ // to resolve it.
+    if (Delayed && !CurSU) {
+      // Try duplicating the nodes that produce these
+      // "expensive to copy" values to break the dependency. In case even
+      // that doesn't work, insert cross class copies.
+      SUnit *TrySU = NotReady[0];
+      SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+      assert(LRegs.size() == 1 && "Can't handle this yet!");
+      unsigned Reg = LRegs[0];
+      SUnit *LRDef = LiveRegDefs[Reg];
+      MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+      const TargetRegisterClass *RC =
+        TRI->getPhysicalRegisterRegClass(Reg, VT);
+      const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+      // If the cross-copy register class is null, then it must be possible
+      // to copy the value directly. Do not try to duplicate the def.
+      SUnit *NewDef = 0;
+      if (DestRC)
+        NewDef = CopyAndMoveSuccessors(LRDef);
+      else
+        DestRC = RC;
+      if (!NewDef) {
+        // Issue copies; these can be expensive cross register class copies.
+        SmallVector<SUnit*, 2> Copies;
+        InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+        DOUT << "Adding an edge from SU #" << TrySU->NodeNum
+             << " to SU #" << Copies.front()->NodeNum << "\n";
+        AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
+                            /*Reg=*/0, /*isNormalMemory=*/false,
+                            /*isMustAlias=*/false, /*isArtificial=*/true));
+        NewDef = Copies.back();
+      }
+
+      DOUT << "Adding an edge from SU #" << NewDef->NodeNum
+           << " to SU #" << TrySU->NodeNum << "\n";
+      LiveRegDefs[Reg] = NewDef;
+      AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
+                           /*Reg=*/0, /*isNormalMemory=*/false,
+                           /*isMustAlias=*/false, /*isArtificial=*/true));
+      TrySU->isAvailable = false;
+      CurSU = NewDef;
+
+      if (!CurSU) {
+        assert(false && "Unable to resolve live physical register dependencies!");
+        abort();
+      }
+    }
+
+ // Add the nodes that aren't ready back onto the available list.
+ for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+ NotReady[i]->isPending = false;
+ // May no longer be available due to backtracking.
+ if (NotReady[i]->isAvailable)
+ AvailableQueue.push(NotReady[i]);
+ }
+ NotReady.clear();
+
+ if (CurSU)
+ ScheduleNodeBottomUp(CurSU, CurCycle);
+ ++CurCycle;
+ }
+
+  // Reverse the order, since we scheduled bottom-up.
+  std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+ // Verify that all SUnits were scheduled.
+ bool AnyNotSched = false;
+ unsigned DeadNodes = 0;
+ unsigned Noops = 0;
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ if (!SUnits[i].isScheduled) {
+ if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) {
+ ++DeadNodes;
+ continue;
+ }
+ if (!AnyNotSched)
+ cerr << "*** List scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ cerr << "has not been scheduled!\n";
+ AnyNotSched = true;
+ }
+ if (SUnits[i].NumSuccsLeft != 0) {
+ if (!AnyNotSched)
+ cerr << "*** List scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ cerr << "has successors left!\n";
+ AnyNotSched = true;
+ }
+ }
+ for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
+ if (!Sequence[i])
+ ++Noops;
+ assert(!AnyNotSched);
+ assert(Sequence.size() + DeadNodes - Noops == SUnits.size() &&
+ "The number of nodes scheduled doesn't match the expected number!");
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+llvm::ScheduleDAGSDNodes *
+llvm::createFastDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGFast(*IS->MF);
+}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGList.cpp
new file mode 100644
index 0000000..c432534
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGList.cpp
@@ -0,0 +1,268 @@
+//===---- ScheduleDAGList.cpp - Implement a list scheduler for isel DAG ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
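+
+// A rough sketch of the loop this file implements (illustrative only, with
+// hypothetical helper names; the real loop below also handles pipeline
+// hazards and may emit noops):
+//
+//   while (!Available.empty()) {
+//     SUnit *SU = Available.pop();        // highest priority first
+//     if (hasHazard(SU)) { defer(SU); continue; }
+//     emit(SU);                           // append SU to the schedule
+//     releaseSuccessors(SU);              // may make new SUnits available
+//   }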
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/Statistic.h"
+#include <climits>
+using namespace llvm;
+
+STATISTIC(NumNoops , "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+
+static RegisterScheduler
+ tdListDAGScheduler("list-td", "Top-down list scheduler",
+ createTDListDAGScheduler);
+
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGList - The actual list scheduler implementation. This supports
+/// top-down scheduling.
+///
+class VISIBILITY_HIDDEN ScheduleDAGList : public ScheduleDAGSDNodes {
+private:
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ ///
+ SchedulingPriorityQueue *AvailableQueue;
+
+ /// PendingQueue - This contains all of the instructions whose operands have
+ /// been issued, but their results are not ready yet (due to the latency of
+ /// the operation). Once the operands become available, the instruction is
+ /// added to the AvailableQueue.
+ std::vector<SUnit*> PendingQueue;
+
+ /// HazardRec - The hazard recognizer to use.
+ ScheduleHazardRecognizer *HazardRec;
+
+public:
+ ScheduleDAGList(MachineFunction &mf,
+ SchedulingPriorityQueue *availqueue,
+ ScheduleHazardRecognizer *HR)
+ : ScheduleDAGSDNodes(mf),
+ AvailableQueue(availqueue), HazardRec(HR) {
+ }
+
+ ~ScheduleDAGList() {
+ delete HazardRec;
+ delete AvailableQueue;
+ }
+
+ void Schedule();
+
+private:
+ void ReleaseSucc(SUnit *SU, const SDep &D);
+ void ReleaseSuccessors(SUnit *SU);
+ void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+ void ListScheduleTopDown();
+};
+} // end anonymous namespace
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGList::Schedule() {
+ DOUT << "********** List Scheduling **********\n";
+
+ // Build the scheduling graph.
+ BuildSchedGraph();
+
+ AvailableQueue->initNodes(SUnits);
+
+ ListScheduleTopDown();
+
+ AvailableQueue->releaseState();
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGList::ReleaseSucc(SUnit *SU, const SDep &D) {
+ SUnit *SuccSU = D.getSUnit();
+ --SuccSU->NumPredsLeft;
+
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft < 0) {
+ cerr << "*** Scheduling failed! ***\n";
+ SuccSU->dump(this);
+ cerr << " has been released too many times!\n";
+ assert(0);
+ }
+#endif
+
+ SuccSU->setDepthToAtLeast(SU->getDepth() + D.getLatency());
+
+ // If all the node's predecessors are scheduled, this node is ready
+ // to be scheduled. Ignore the special ExitSU node.
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+ PendingQueue.push_back(SuccSU);
+}
+
+void ScheduleDAGList::ReleaseSuccessors(SUnit *SU) {
+ // Top down: release successors.
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ assert(!I->isAssignedRegDep() &&
+ "The list-td scheduler doesn't yet support physreg dependencies!");
+
+ ReleaseSucc(SU, *I);
+ }
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+ DOUT << "*** Scheduling [" << CurCycle << "]: ";
+ DEBUG(SU->dump(this));
+
+ Sequence.push_back(SU);
+ assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
+ SU->setDepthToAtLeast(CurCycle);
+
+ ReleaseSuccessors(SU);
+ SU->isScheduled = true;
+ AvailableQueue->ScheduledNode(SU);
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void ScheduleDAGList::ListScheduleTopDown() {
+ unsigned CurCycle = 0;
+
+ // Release any successors of the special Entry node.
+ ReleaseSuccessors(&EntrySU);
+
+  // Add all leaves to the Available queue.
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ // It is available if it has no predecessors.
+ if (SUnits[i].Preds.empty()) {
+ AvailableQueue->push(&SUnits[i]);
+ SUnits[i].isAvailable = true;
+ }
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ std::vector<SUnit*> NotReady;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue->empty() || !PendingQueue.empty()) {
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+ if (PendingQueue[i]->getDepth() == CurCycle) {
+ AvailableQueue->push(PendingQueue[i]);
+ PendingQueue[i]->isAvailable = true;
+ PendingQueue[i] = PendingQueue.back();
+ PendingQueue.pop_back();
+ --i; --e;
+ } else {
+ assert(PendingQueue[i]->getDepth() > CurCycle && "Negative latency?");
+ }
+ }
+
+ // If there are no instructions available, don't try to issue anything, and
+ // don't advance the hazard recognizer.
+ if (AvailableQueue->empty()) {
+ ++CurCycle;
+ continue;
+ }
+
+ SUnit *FoundSUnit = 0;
+
+ bool HasNoopHazards = false;
+ while (!AvailableQueue->empty()) {
+ SUnit *CurSUnit = AvailableQueue->pop();
+
+ ScheduleHazardRecognizer::HazardType HT =
+ HazardRec->getHazardType(CurSUnit);
+ if (HT == ScheduleHazardRecognizer::NoHazard) {
+ FoundSUnit = CurSUnit;
+ break;
+ }
+
+ // Remember if this is a noop hazard.
+ HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+ NotReady.push_back(CurSUnit);
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ if (!NotReady.empty()) {
+ AvailableQueue->push_all(NotReady);
+ NotReady.clear();
+ }
+
+ // If we found a node to schedule, do it now.
+ if (FoundSUnit) {
+ ScheduleNodeTopDown(FoundSUnit, CurCycle);
+ HazardRec->EmitInstruction(FoundSUnit);
+
+ // If this is a pseudo-op node, we don't want to increment the current
+ // cycle.
+ if (FoundSUnit->Latency) // Don't increment CurCycle for pseudo-ops!
+ ++CurCycle;
+ } else if (!HasNoopHazards) {
+ // Otherwise, we have a pipeline stall, but no other problem, just advance
+ // the current cycle and try again.
+ DOUT << "*** Advancing cycle, no work to do\n";
+ HazardRec->AdvanceCycle();
+ ++NumStalls;
+ ++CurCycle;
+ } else {
+ // Otherwise, we have no instructions to issue and we have instructions
+ // that will fault if we don't do this right. This is the case for
+ // processors without pipeline interlocks and other cases.
+ DOUT << "*** Emitting noop\n";
+ HazardRec->EmitNoop();
+ Sequence.push_back(0); // NULL here means noop
+ ++NumNoops;
+ ++CurCycle;
+ }
+ }
+
+#ifndef NDEBUG
+ VerifySchedule(/*isBottomUp=*/false);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+/// createTDListDAGScheduler - This creates a top-down list scheduler with a
+/// new hazard recognizer. This scheduler takes ownership of the hazard
+/// recognizer and deletes it when done.
+ScheduleDAGSDNodes *
+llvm::createTDListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGList(*IS->MF,
+ new LatencyPriorityQueue(),
+ IS->CreateTargetHazardRecognizer());
+}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
new file mode 100644
index 0000000..c97e2a8
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -0,0 +1,1533 @@
+//===----- ScheduleDAGRRList.cpp - Reg pressure reduction list scheduler --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements bottom-up and top-down register pressure reduction list
+// schedulers, using standard algorithms. The basic approach uses a priority
+// queue of available nodes to schedule. One at a time, nodes are taken from
+// the priority queue (thus in priority order), checked for legality to
+// schedule, and emitted if legal.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <climits>
+using namespace llvm;
+
+STATISTIC(NumBacktracks, "Number of times scheduler backtracked");
+STATISTIC(NumUnfolds, "Number of nodes unfolded");
+STATISTIC(NumDups, "Number of duplicated nodes");
+STATISTIC(NumPRCopies, "Number of physical register copies");
+
+static RegisterScheduler
+ burrListDAGScheduler("list-burr",
+ "Bottom-up register reduction list scheduling",
+ createBURRListDAGScheduler);
+static RegisterScheduler
+ tdrListrDAGScheduler("list-tdrr",
+ "Top-down register reduction list scheduling",
+ createTDRRListDAGScheduler);
+
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGRRList - The actual register reduction list scheduler
+/// implementation. This supports both top-down and bottom-up scheduling.
+///
+class VISIBILITY_HIDDEN ScheduleDAGRRList : public ScheduleDAGSDNodes {
+private:
+ /// isBottomUp - This is true if the scheduling problem is bottom-up, false if
+ /// it is top-down.
+ bool isBottomUp;
+
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ SchedulingPriorityQueue *AvailableQueue;
+
+  /// LiveRegDefs - A set of physical registers and their definitions
+  /// that are "live". These nodes must be scheduled before any other nodes
+  /// that modify the registers can be scheduled.
+ unsigned NumLiveRegs;
+ std::vector<SUnit*> LiveRegDefs;
+ std::vector<unsigned> LiveRegCycles;
+
+ /// Topo - A topological ordering for SUnits which permits fast IsReachable
+ /// and similar queries.
+ ScheduleDAGTopologicalSort Topo;
+
+public:
+ ScheduleDAGRRList(MachineFunction &mf,
+ bool isbottomup,
+ SchedulingPriorityQueue *availqueue)
+ : ScheduleDAGSDNodes(mf), isBottomUp(isbottomup),
+ AvailableQueue(availqueue), Topo(SUnits) {
+ }
+
+ ~ScheduleDAGRRList() {
+ delete AvailableQueue;
+ }
+
+ void Schedule();
+
+ /// IsReachable - Checks if SU is reachable from TargetSU.
+ bool IsReachable(const SUnit *SU, const SUnit *TargetSU) {
+ return Topo.IsReachable(SU, TargetSU);
+ }
+
+ /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
+ /// create a cycle.
+ bool WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
+ return Topo.WillCreateCycle(SU, TargetSU);
+ }
+
+  /// AddPred - adds a predecessor edge to SUnit SU.
+  /// Updates the topological ordering if required.
+ void AddPred(SUnit *SU, const SDep &D) {
+ Topo.AddPred(SU, D.getSUnit());
+ SU->addPred(D);
+ }
+
+  /// RemovePred - removes a predecessor edge from SUnit SU.
+  /// Updates the topological ordering if required.
+ void RemovePred(SUnit *SU, const SDep &D) {
+ Topo.RemovePred(SU, D.getSUnit());
+ SU->removePred(D);
+ }
+
+private:
+ void ReleasePred(SUnit *SU, const SDep *PredEdge);
+ void ReleasePredecessors(SUnit *SU, unsigned CurCycle);
+ void ReleaseSucc(SUnit *SU, const SDep *SuccEdge);
+ void ReleaseSuccessors(SUnit *SU);
+ void CapturePred(SDep *PredEdge);
+ void ScheduleNodeBottomUp(SUnit*, unsigned);
+ void ScheduleNodeTopDown(SUnit*, unsigned);
+ void UnscheduleNodeBottomUp(SUnit*);
+ void BacktrackBottomUp(SUnit*, unsigned, unsigned&);
+ SUnit *CopyAndMoveSuccessors(SUnit*);
+ void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+ const TargetRegisterClass*,
+ const TargetRegisterClass*,
+ SmallVector<SUnit*, 2>&);
+ bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&);
+ void ListScheduleTopDown();
+ void ListScheduleBottomUp();
+
+
+ /// CreateNewSUnit - Creates a new SUnit and returns a pointer to it.
+ /// Updates the topological ordering if required.
+ SUnit *CreateNewSUnit(SDNode *N) {
+ unsigned NumSUnits = SUnits.size();
+ SUnit *NewNode = NewSUnit(N);
+ // Update the topological ordering.
+ if (NewNode->NodeNum >= NumSUnits)
+ Topo.InitDAGTopologicalSorting();
+ return NewNode;
+ }
+
+ /// CreateClone - Creates a new SUnit from an existing one.
+ /// Updates the topological ordering if required.
+ SUnit *CreateClone(SUnit *N) {
+ unsigned NumSUnits = SUnits.size();
+ SUnit *NewNode = Clone(N);
+ // Update the topological ordering.
+ if (NewNode->NodeNum >= NumSUnits)
+ Topo.InitDAGTopologicalSorting();
+ return NewNode;
+ }
+
+ /// ForceUnitLatencies - Return true, since register-pressure-reducing
+ /// scheduling doesn't need actual latency information.
+ bool ForceUnitLatencies() const { return true; }
+};
+} // end anonymous namespace
+
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGRRList::Schedule() {
+ DOUT << "********** List Scheduling **********\n";
+
+ NumLiveRegs = 0;
+ LiveRegDefs.resize(TRI->getNumRegs(), NULL);
+ LiveRegCycles.resize(TRI->getNumRegs(), 0);
+
+ // Build the scheduling graph.
+ BuildSchedGraph();
+
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+ Topo.InitDAGTopologicalSorting();
+
+ AvailableQueue->initNodes(SUnits);
+
+ // Execute the actual scheduling loop Top-Down or Bottom-Up as appropriate.
+ if (isBottomUp)
+ ListScheduleBottomUp();
+ else
+ ListScheduleTopDown();
+
+ AvailableQueue->releaseState();
+}
+
+//===----------------------------------------------------------------------===//
+// Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+ --PredSU->NumSuccsLeft;
+
+#ifndef NDEBUG
+ if (PredSU->NumSuccsLeft < 0) {
+ cerr << "*** Scheduling failed! ***\n";
+ PredSU->dump(this);
+ cerr << " has been released too many times!\n";
+ assert(0);
+ }
+#endif
+
+ // If all the node's successors are scheduled, this node is ready
+ // to be scheduled. Ignore the special EntrySU node.
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+ PredSU->isAvailable = true;
+ AvailableQueue->push(PredSU);
+ }
+}
+
+void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
+ // Bottom up: release predecessors
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ ReleasePred(SU, &*I);
+ if (I->isAssignedRegDep()) {
+ // This is a physical register dependency and it's impossible or
+ // expensive to copy the register. Make sure nothing that can
+ // clobber the register is scheduled between the predecessor and
+ // this node.
+ if (!LiveRegDefs[I->getReg()]) {
+ ++NumLiveRegs;
+ LiveRegDefs[I->getReg()] = I->getSUnit();
+ LiveRegCycles[I->getReg()] = CurCycle;
+ }
+ }
+ }
+}
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
+ DOUT << "*** Scheduling [" << CurCycle << "]: ";
+ DEBUG(SU->dump(this));
+
+ assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
+ SU->setHeightToAtLeast(CurCycle);
+ Sequence.push_back(SU);
+
+ ReleasePredecessors(SU, CurCycle);
+
+ // Release all the implicit physical register defs that are live.
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isAssignedRegDep()) {
+ if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ assert(LiveRegDefs[I->getReg()] == SU &&
+ "Physical register dependency violated?");
+ --NumLiveRegs;
+ LiveRegDefs[I->getReg()] = NULL;
+ LiveRegCycles[I->getReg()] = 0;
+ }
+ }
+ }
+
+ SU->isScheduled = true;
+ AvailableQueue->ScheduledNode(SU);
+}
+
+/// CapturePred - This does the opposite of ReleasePred. Since SU is being
+/// unscheduled, increase the NumSuccsLeft count of its predecessors. Remove
+/// them from AvailableQueue if necessary.
+void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+ if (PredSU->isAvailable) {
+ PredSU->isAvailable = false;
+ if (!PredSU->isPending)
+ AvailableQueue->remove(PredSU);
+ }
+
+ ++PredSU->NumSuccsLeft;
+}
+
+/// UnscheduleNodeBottomUp - Remove the node from the schedule, and update its
+/// state and its predecessors' states to reflect the change.
+void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
+ DOUT << "*** Unscheduling [" << SU->getHeight() << "]: ";
+ DEBUG(SU->dump(this));
+
+ AvailableQueue->UnscheduledNode(SU);
+
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ CapturePred(&*I);
+ if (I->isAssignedRegDep() && SU->getHeight() == LiveRegCycles[I->getReg()]) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ assert(LiveRegDefs[I->getReg()] == I->getSUnit() &&
+ "Physical register dependency violated?");
+ --NumLiveRegs;
+ LiveRegDefs[I->getReg()] = NULL;
+ LiveRegCycles[I->getReg()] = 0;
+ }
+ }
+
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isAssignedRegDep()) {
+ if (!LiveRegDefs[I->getReg()]) {
+ LiveRegDefs[I->getReg()] = SU;
+ ++NumLiveRegs;
+ }
+ if (I->getSUnit()->getHeight() < LiveRegCycles[I->getReg()])
+ LiveRegCycles[I->getReg()] = I->getSUnit()->getHeight();
+ }
+ }
+
+ SU->setHeightDirty();
+ SU->isScheduled = false;
+ SU->isAvailable = true;
+ AvailableQueue->push(SU);
+}
+
+/// BacktrackBottomUp - Backtrack scheduling to a previous cycle specified in
+/// BtCycle in order to schedule a specific node.
+void ScheduleDAGRRList::BacktrackBottomUp(SUnit *SU, unsigned BtCycle,
+ unsigned &CurCycle) {
+ SUnit *OldSU = NULL;
+ while (CurCycle > BtCycle) {
+ OldSU = Sequence.back();
+ Sequence.pop_back();
+ if (SU->isSucc(OldSU))
+ // Don't try to remove SU from AvailableQueue.
+ SU->isAvailable = false;
+ UnscheduleNodeBottomUp(OldSU);
+ --CurCycle;
+ }
+
+ assert(!SU->isSucc(OldSU) && "Something is wrong!");
+
+ ++NumBacktracks;
+}
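+
+// Illustrative sketch: with CurCycle == 6 and BtCycle == 3, the loop above
+// pops and unschedules the three most recently scheduled nodes, winding
+// CurCycle back to 3 so that SU can be scheduled at that cycle instead.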
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
+  SDNode *N = SU->getNode();
+  if (!N)
+    return NULL;
+
+  if (N->getFlaggedNode())
+    return NULL;
+
+ SUnit *NewSU;
+ bool TryUnfold = false;
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getValueType(i);
+ if (VT == MVT::Flag)
+ return NULL;
+ else if (VT == MVT::Other)
+ TryUnfold = true;
+ }
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ const SDValue &Op = N->getOperand(i);
+ MVT VT = Op.getNode()->getValueType(Op.getResNo());
+ if (VT == MVT::Flag)
+ return NULL;
+ }
+
+ if (TryUnfold) {
+ SmallVector<SDNode*, 2> NewNodes;
+ if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+ return NULL;
+
+ DOUT << "Unfolding SU # " << SU->NodeNum << "\n";
+ assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+ N = NewNodes[1];
+ SDNode *LoadNode = NewNodes[0];
+ unsigned NumVals = N->getNumValues();
+ unsigned OldNumVals = SU->getNode()->getNumValues();
+ for (unsigned i = 0; i != NumVals; ++i)
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+ SDValue(LoadNode, 1));
+
+    // LoadNode may already exist. This can happen when there is another
+    // load from the same location that produces the same type of value
+    // but with different alignment or volatility.
+ bool isNewLoad = true;
+ SUnit *LoadSU;
+ if (LoadNode->getNodeId() != -1) {
+ LoadSU = &SUnits[LoadNode->getNodeId()];
+ isNewLoad = false;
+ } else {
+ LoadSU = CreateNewSUnit(LoadNode);
+ LoadNode->setNodeId(LoadSU->NodeNum);
+ ComputeLatency(LoadSU);
+ }
+
+    NewSU = CreateNewSUnit(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NewSU->NodeNum);
+
+ const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+ for (unsigned i = 0; i != TID.getNumOperands(); ++i) {
+ if (TID.getOperandConstraint(i, TOI::TIED_TO) != -1) {
+ NewSU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (TID.isCommutable())
+ NewSU->isCommutable = true;
+ ComputeLatency(NewSU);
+
+ // Record all the edges to and from the old SU, by category.
+ SmallVector<SDep, 4> ChainPreds;
+ SmallVector<SDep, 4> ChainSuccs;
+ SmallVector<SDep, 4> LoadPreds;
+ SmallVector<SDep, 4> NodePreds;
+ SmallVector<SDep, 4> NodeSuccs;
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl())
+ ChainPreds.push_back(*I);
+ else if (I->getSUnit()->getNode() &&
+ I->getSUnit()->getNode()->isOperandOf(LoadNode))
+ LoadPreds.push_back(*I);
+ else
+ NodePreds.push_back(*I);
+ }
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isCtrl())
+ ChainSuccs.push_back(*I);
+ else
+ NodeSuccs.push_back(*I);
+ }
+
+ // Now assign edges to the newly-created nodes.
+ for (unsigned i = 0, e = ChainPreds.size(); i != e; ++i) {
+ const SDep &Pred = ChainPreds[i];
+ RemovePred(SU, Pred);
+ if (isNewLoad)
+ AddPred(LoadSU, Pred);
+ }
+ for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
+ const SDep &Pred = LoadPreds[i];
+ RemovePred(SU, Pred);
+ if (isNewLoad)
+ AddPred(LoadSU, Pred);
+ }
+ for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
+ const SDep &Pred = NodePreds[i];
+ RemovePred(SU, Pred);
+ AddPred(NewSU, Pred);
+ }
+ for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
+ SDep D = NodeSuccs[i];
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ D.setSUnit(NewSU);
+ AddPred(SuccDep, D);
+ }
+ for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
+ SDep D = ChainSuccs[i];
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ if (isNewLoad) {
+ D.setSUnit(LoadSU);
+ AddPred(SuccDep, D);
+ }
+ }
+
+ // Add a data dependency to reflect that NewSU reads the value defined
+ // by LoadSU.
+ AddPred(NewSU, SDep(LoadSU, SDep::Data, LoadSU->Latency));
+
+ if (isNewLoad)
+ AvailableQueue->addNode(LoadSU);
+ AvailableQueue->addNode(NewSU);
+
+ ++NumUnfolds;
+
+ if (NewSU->NumSuccsLeft == 0) {
+ NewSU->isAvailable = true;
+ return NewSU;
+ }
+ SU = NewSU;
+ }
+
+ DOUT << "Duplicating SU # " << SU->NodeNum << "\n";
+ NewSU = CreateClone(SU);
+
+ // New SUnit has the exact same predecessors.
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I)
+ if (!I->isArtificial())
+ AddPred(NewSU, *I);
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isArtificial())
+ continue;
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = *I;
+ D.setSUnit(NewSU);
+ AddPred(SuccSU, D);
+ D.setSUnit(SU);
+ DelDeps.push_back(std::make_pair(SuccSU, D));
+ }
+ }
+ for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+ RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+ AvailableQueue->updateNode(SU);
+ AvailableQueue->addNode(NewSU);
+
+ ++NumDups;
+ return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC,
+ SmallVector<SUnit*, 2> &Copies) {
+ SUnit *CopyFromSU = CreateNewSUnit(NULL);
+ CopyFromSU->CopySrcRC = SrcRC;
+ CopyFromSU->CopyDstRC = DestRC;
+
+ SUnit *CopyToSU = CreateNewSUnit(NULL);
+ CopyToSU->CopySrcRC = DestRC;
+ CopyToSU->CopyDstRC = SrcRC;
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isArtificial())
+ continue;
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = *I;
+ D.setSUnit(CopyToSU);
+ AddPred(SuccSU, D);
+ DelDeps.push_back(std::make_pair(SuccSU, *I));
+ }
+ }
+ for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+ RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+ AddPred(CopyFromSU, SDep(SU, SDep::Data, SU->Latency, Reg));
+ AddPred(CopyToSU, SDep(CopyFromSU, SDep::Data, CopyFromSU->Latency, 0));
+
+ AvailableQueue->updateNode(SU);
+ AvailableQueue->addNode(CopyFromSU);
+ AvailableQueue->addNode(CopyToSU);
+ Copies.push_back(CopyFromSU);
+ Copies.push_back(CopyToSU);
+
+ ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+ const TargetInstrInfo *TII) {
+ const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+ assert(TID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+ unsigned NumRes = TID.getNumDefs();
+ for (const unsigned *ImpDef = TID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+ if (Reg == *ImpDef)
+ break;
+ ++NumRes;
+ }
+ return N->getValueType(NumRes);
+}
+
+/// CheckForLiveRegDef - Return true and update live register vector if the
+/// specified register def of the specified SUnit clobbers any "live" registers.
+static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg,
+ std::vector<SUnit*> &LiveRegDefs,
+ SmallSet<unsigned, 4> &RegAdded,
+ SmallVector<unsigned, 4> &LRegs,
+ const TargetRegisterInfo *TRI) {
+ bool Added = false;
+ if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != SU) {
+ if (RegAdded.insert(Reg)) {
+ LRegs.push_back(Reg);
+ Added = true;
+ }
+ }
+ for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias)
+ if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) {
+ if (RegAdded.insert(*Alias)) {
+ LRegs.push_back(*Alias);
+ Added = true;
+ }
+ }
+ return Added;
+}
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specified node is the last one that's available to schedule, the
+/// caller does whatever is necessary (i.e. backtracking or cloning) to make
+/// scheduling it possible.
+bool ScheduleDAGRRList::DelayForLiveRegsBottomUp(SUnit *SU,
+ SmallVector<unsigned, 4> &LRegs){
+ if (NumLiveRegs == 0)
+ return false;
+
+ SmallSet<unsigned, 4> RegAdded;
+ // If this node would clobber any "live" register, then it's not ready.
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isAssignedRegDep())
+ CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs,
+ RegAdded, LRegs, TRI);
+ }
+
+ for (SDNode *Node = SU->getNode(); Node; Node = Node->getFlaggedNode()) {
+ if (Node->getOpcode() == ISD::INLINEASM) {
+ // Inline asm can clobber physical defs.
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Flag)
+ --NumOps; // Ignore the flag operand.
+
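+      // Each operand group begins with a flag word: as decoded below, the low
+      // three bits encode the operand kind (2 = register def, 6 = earlyclobber
+      // register def) and bits 3-15 hold the number of values in the group.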
+ for (unsigned i = 2; i != NumOps;) {
+ unsigned Flags =
+ cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned NumVals = (Flags & 0xffff) >> 3;
+
+ ++i; // Skip the ID value.
+ if ((Flags & 7) == 2 || (Flags & 7) == 6) {
+ // Check for def of register or earlyclobber register.
+ for (; NumVals; --NumVals, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+ }
+ } else
+ i += NumVals;
+ }
+ continue;
+ }
+
+ if (!Node->isMachineOpcode())
+ continue;
+ const TargetInstrDesc &TID = TII->get(Node->getMachineOpcode());
+ if (!TID.ImplicitDefs)
+ continue;
+ for (const unsigned *Reg = TID.ImplicitDefs; *Reg; ++Reg)
+ CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+ }
+ return !LRegs.empty();
+}
+
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGRRList::ListScheduleBottomUp() {
+ unsigned CurCycle = 0;
+
+ // Release any predecessors of the special Exit node.
+ ReleasePredecessors(&ExitSU, CurCycle);
+
+ // Add root to Available queue.
+ if (!SUnits.empty()) {
+ SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+ assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+ RootSU->isAvailable = true;
+ AvailableQueue->push(RootSU);
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ SmallVector<SUnit*, 4> NotReady;
+ DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMap;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue->empty()) {
+ bool Delayed = false;
+ LRegsMap.clear();
+ SUnit *CurSU = AvailableQueue->pop();
+ while (CurSU) {
+ SmallVector<unsigned, 4> LRegs;
+ if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+ break;
+ Delayed = true;
+ LRegsMap.insert(std::make_pair(CurSU, LRegs));
+
+ CurSU->isPending = true; // This SU is not in AvailableQueue right now.
+ NotReady.push_back(CurSU);
+ CurSU = AvailableQueue->pop();
+ }
+
+ // All candidates are delayed due to live physical reg dependencies.
+ // Try backtracking, code duplication, or inserting cross class copies
+ // to resolve it.
+ if (Delayed && !CurSU) {
+ for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+ SUnit *TrySU = NotReady[i];
+ SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+
+ // Try unscheduling up to the point where it's safe to schedule
+ // this node.
+ unsigned LiveCycle = CurCycle;
+ for (unsigned j = 0, ee = LRegs.size(); j != ee; ++j) {
+ unsigned Reg = LRegs[j];
+ unsigned LCycle = LiveRegCycles[Reg];
+ LiveCycle = std::min(LiveCycle, LCycle);
+ }
+ SUnit *OldSU = Sequence[LiveCycle];
+ if (!WillCreateCycle(TrySU, OldSU)) {
+ BacktrackBottomUp(TrySU, LiveCycle, CurCycle);
+ // Force the current node to be scheduled before the node that
+ // requires the physical reg dep.
+ if (OldSU->isAvailable) {
+ OldSU->isAvailable = false;
+ AvailableQueue->remove(OldSU);
+ }
+ AddPred(TrySU, SDep(OldSU, SDep::Order, /*Latency=*/1,
+ /*Reg=*/0, /*isNormalMemory=*/false,
+ /*isMustAlias=*/false, /*isArtificial=*/true));
+            // If one or more successors have been unscheduled, then the
+            // current node is no longer available. Schedule a successor
+            // that's now available instead.
+ if (!TrySU->isAvailable)
+ CurSU = AvailableQueue->pop();
+ else {
+ CurSU = TrySU;
+ TrySU->isPending = false;
+ NotReady.erase(NotReady.begin()+i);
+ }
+ break;
+ }
+ }
+
+ if (!CurSU) {
+        // Can't backtrack. If it's too expensive to copy the value, then try
+        // duplicating the nodes that produce these "too expensive to copy"
+        // values to break the dependency. In case even that doesn't work,
+        // insert cross class copies.
+        // If it's not too expensive, i.e. cost != -1, issue copies.
+ SUnit *TrySU = NotReady[0];
+ SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+ assert(LRegs.size() == 1 && "Can't handle this yet!");
+ unsigned Reg = LRegs[0];
+ SUnit *LRDef = LiveRegDefs[Reg];
+ MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+ const TargetRegisterClass *RC =
+ TRI->getPhysicalRegisterRegClass(Reg, VT);
+ const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+        // If the cross-copy register class is null, then it must be possible
+        // to copy the value directly. Do not try to duplicate the def.
+ SUnit *NewDef = 0;
+ if (DestRC)
+ NewDef = CopyAndMoveSuccessors(LRDef);
+ else
+ DestRC = RC;
+ if (!NewDef) {
+ // Issue copies, these can be expensive cross register class copies.
+ SmallVector<SUnit*, 2> Copies;
+ InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+ DOUT << "Adding an edge from SU #" << TrySU->NodeNum
+ << " to SU #" << Copies.front()->NodeNum << "\n";
+ AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
+ /*Reg=*/0, /*isNormalMemory=*/false,
+ /*isMustAlias=*/false,
+ /*isArtificial=*/true));
+ NewDef = Copies.back();
+ }
+
+ DOUT << "Adding an edge from SU #" << NewDef->NodeNum
+ << " to SU #" << TrySU->NodeNum << "\n";
+ LiveRegDefs[Reg] = NewDef;
+ AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
+ /*Reg=*/0, /*isNormalMemory=*/false,
+ /*isMustAlias=*/false,
+ /*isArtificial=*/true));
+ TrySU->isAvailable = false;
+ CurSU = NewDef;
+ }
+
+ assert(CurSU && "Unable to resolve live physical register dependencies!");
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+ NotReady[i]->isPending = false;
+ // May no longer be available due to backtracking.
+ if (NotReady[i]->isAvailable)
+ AvailableQueue->push(NotReady[i]);
+ }
+ NotReady.clear();
+
+ if (CurSU)
+ ScheduleNodeBottomUp(CurSU, CurCycle);
+ ++CurCycle;
+ }
+
+  // Reverse the order, since we scheduled bottom-up.
+ std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+ VerifySchedule(isBottomUp);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGRRList::ReleaseSucc(SUnit *SU, const SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+ --SuccSU->NumPredsLeft;
+
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft < 0) {
+ cerr << "*** Scheduling failed! ***\n";
+ SuccSU->dump(this);
+ cerr << " has been released too many times!\n";
+ assert(0);
+ }
+#endif
+
+ // If all the node's predecessors are scheduled, this node is ready
+ // to be scheduled. Ignore the special ExitSU node.
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) {
+ SuccSU->isAvailable = true;
+ AvailableQueue->push(SuccSU);
+ }
+}
+
+void ScheduleDAGRRList::ReleaseSuccessors(SUnit *SU) {
+ // Top down: release successors
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ assert(!I->isAssignedRegDep() &&
+ "The list-tdrr scheduler doesn't yet support physreg dependencies!");
+
+ ReleaseSucc(SU, &*I);
+ }
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGRRList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+ DOUT << "*** Scheduling [" << CurCycle << "]: ";
+ DEBUG(SU->dump(this));
+
+ assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
+ SU->setDepthToAtLeast(CurCycle);
+ Sequence.push_back(SU);
+
+ ReleaseSuccessors(SU);
+ SU->isScheduled = true;
+ AvailableQueue->ScheduledNode(SU);
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void ScheduleDAGRRList::ListScheduleTopDown() {
+ unsigned CurCycle = 0;
+
+ // Release any successors of the special Entry node.
+ ReleaseSuccessors(&EntrySU);
+
+  // Add all leaves to the Available queue.
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ // It is available if it has no predecessors.
+ if (SUnits[i].Preds.empty()) {
+ AvailableQueue->push(&SUnits[i]);
+ SUnits[i].isAvailable = true;
+ }
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue->empty()) {
+ SUnit *CurSU = AvailableQueue->pop();
+
+ if (CurSU)
+ ScheduleNodeTopDown(CurSU, CurCycle);
+ ++CurCycle;
+ }
+
+#ifndef NDEBUG
+ VerifySchedule(isBottomUp);
+#endif
+}
+
+
+//===----------------------------------------------------------------------===//
+// RegReductionPriorityQueue Implementation
+//===----------------------------------------------------------------------===//
+//
+// This is a SchedulingPriorityQueue that schedules using Sethi Ullman numbers
+// to reduce register pressure.
+//
+namespace {
+ template<class SF>
+ class RegReductionPriorityQueue;
+
+ /// Sorting functions for the Available queue.
+ struct bu_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+ RegReductionPriorityQueue<bu_ls_rr_sort> *SPQ;
+ bu_ls_rr_sort(RegReductionPriorityQueue<bu_ls_rr_sort> *spq) : SPQ(spq) {}
+ bu_ls_rr_sort(const bu_ls_rr_sort &RHS) : SPQ(RHS.SPQ) {}
+
+ bool operator()(const SUnit* left, const SUnit* right) const;
+ };
+
+ struct td_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+ RegReductionPriorityQueue<td_ls_rr_sort> *SPQ;
+ td_ls_rr_sort(RegReductionPriorityQueue<td_ls_rr_sort> *spq) : SPQ(spq) {}
+ td_ls_rr_sort(const td_ls_rr_sort &RHS) : SPQ(RHS.SPQ) {}
+
+ bool operator()(const SUnit* left, const SUnit* right) const;
+ };
+} // end anonymous namespace
+
+/// CalcNodeSethiUllmanNumber - Compute the Sethi-Ullman number.
+/// A smaller number means higher priority.
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+ unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
+ if (SethiUllmanNumber != 0)
+ return SethiUllmanNumber;
+
+ unsigned Extra = 0;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = I->getSUnit();
+ unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
+ if (PredSethiUllman > SethiUllmanNumber) {
+ SethiUllmanNumber = PredSethiUllman;
+ Extra = 0;
+ } else if (PredSethiUllman == SethiUllmanNumber)
+ ++Extra;
+ }
+
+ SethiUllmanNumber += Extra;
+
+ if (SethiUllmanNumber == 0)
+ SethiUllmanNumber = 1;
+
+ return SethiUllmanNumber;
+}
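+
+// Worked example (illustrative): a leaf with no non-chain predecessors gets
+// number 1.  A node whose two predecessors both have number 1 gets
+// max(1, 1) + 1 = 2, because the tie bumps Extra; a node with predecessor
+// numbers 2 and 1 stays at max(2, 1) = 2, since only ties add to Extra.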
+
+namespace {
+ template<class SF>
+ class VISIBILITY_HIDDEN RegReductionPriorityQueue
+ : public SchedulingPriorityQueue {
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SF> Queue;
+ unsigned currentQueueId;
+
+ protected:
+ // SUnits - The SUnits for the current graph.
+ std::vector<SUnit> *SUnits;
+
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ ScheduleDAGRRList *scheduleDAG;
+
+ // SethiUllmanNumbers - The SethiUllman number for each node.
+ std::vector<unsigned> SethiUllmanNumbers;
+
+ public:
+ RegReductionPriorityQueue(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri) :
+ Queue(SF(this)), currentQueueId(0),
+ TII(tii), TRI(tri), scheduleDAG(NULL) {}
+
+ void initNodes(std::vector<SUnit> &sunits) {
+ SUnits = &sunits;
+ // Add pseudo dependency edges for two-address nodes.
+ AddPseudoTwoAddrDeps();
+ // Reroute edges to nodes with multiple uses.
+ PrescheduleNodesWithMultipleUses();
+ // Calculate node priorities.
+ CalculateSethiUllmanNumbers();
+ }
+
+ void addNode(const SUnit *SU) {
+ unsigned SUSize = SethiUllmanNumbers.size();
+ if (SUnits->size() > SUSize)
+ SethiUllmanNumbers.resize(SUSize*2, 0);
+ CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+ }
+
+ void updateNode(const SUnit *SU) {
+ SethiUllmanNumbers[SU->NodeNum] = 0;
+ CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+ }
+
+ void releaseState() {
+ SUnits = 0;
+ SethiUllmanNumbers.clear();
+ }
+
+ unsigned getNodePriority(const SUnit *SU) const {
+ assert(SU->NodeNum < SethiUllmanNumbers.size());
+ unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
+ if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
+ // CopyToReg should be close to its uses to facilitate coalescing and
+ // avoid spilling.
+ return 0;
+ if (Opc == TargetInstrInfo::EXTRACT_SUBREG ||
+ Opc == TargetInstrInfo::SUBREG_TO_REG ||
+ Opc == TargetInstrInfo::INSERT_SUBREG)
+ // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
+ // close to their uses to facilitate coalescing.
+ return 0;
+ if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+        // If SU does not have a register use, i.e. it doesn't produce a value
+        // that would be consumed (e.g. a store), then it terminates a chain of
+        // computation. Give it a large Sethi-Ullman number so it will be
+        // scheduled right before its predecessors, so that it doesn't
+        // lengthen their live ranges.
+ return 0xffff;
+ if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+ // If SU does not have a register def, schedule it close to its uses
+ // because it does not lengthen any live ranges.
+ return 0;
+ return SethiUllmanNumbers[SU->NodeNum];
+ }
+
+ unsigned size() const { return Queue.size(); }
+
+ bool empty() const { return Queue.empty(); }
+
+ void push(SUnit *U) {
+ assert(!U->NodeQueueId && "Node in the queue already");
+ U->NodeQueueId = ++currentQueueId;
+ Queue.push(U);
+ }
+
+ void push_all(const std::vector<SUnit *> &Nodes) {
+ for (unsigned i = 0, e = Nodes.size(); i != e; ++i)
+ push(Nodes[i]);
+ }
+
+ SUnit *pop() {
+ if (empty()) return NULL;
+ SUnit *V = Queue.top();
+ Queue.pop();
+ V->NodeQueueId = 0;
+ return V;
+ }
+
+ void remove(SUnit *SU) {
+ assert(!Queue.empty() && "Queue is empty!");
+ assert(SU->NodeQueueId != 0 && "Not in queue!");
+ Queue.erase_one(SU);
+ SU->NodeQueueId = 0;
+ }
+
+ void setScheduleDAG(ScheduleDAGRRList *scheduleDag) {
+ scheduleDAG = scheduleDag;
+ }
+
+ protected:
+ bool canClobber(const SUnit *SU, const SUnit *Op);
+ void AddPseudoTwoAddrDeps();
+ void PrescheduleNodesWithMultipleUses();
+ void CalculateSethiUllmanNumbers();
+ };
+
+ typedef RegReductionPriorityQueue<bu_ls_rr_sort>
+ BURegReductionPriorityQueue;
+
+ typedef RegReductionPriorityQueue<td_ls_rr_sort>
+ TDRegReductionPriorityQueue;
+}
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+ unsigned MaxHeight = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue; // ignore chain succs
+ unsigned Height = I->getSUnit()->getHeight();
+    // If there are a bunch of CopyToRegs stacked up, they should be considered
+ // to be at the same position.
+ if (I->getSUnit()->getNode() &&
+ I->getSUnit()->getNode()->getOpcode() == ISD::CopyToReg)
+ Height = closestSucc(I->getSUnit())+1;
+ if (Height > MaxHeight)
+ MaxHeight = Height;
+ }
+ return MaxHeight;
+}
+
+/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
+/// for scratch registers, i.e. the number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+ unsigned Scratches = 0;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue; // ignore chain preds
+ Scratches++;
+ }
+ return Scratches;
+}
+
+// Bottom up
+bool bu_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+ unsigned LPriority = SPQ->getNodePriority(left);
+ unsigned RPriority = SPQ->getNodePriority(right);
+ if (LPriority != RPriority)
+ return LPriority > RPriority;
+
+  // Try to schedule def + use closer when Sethi-Ullman numbers are the same.
+ // e.g.
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // and the following instructions are both ready.
+ // t2 = op c3
+ // t4 = op c4
+ //
+ // Then schedule t2 = op first.
+ // i.e.
+ // t4 = op c4
+ // t2 = op c3
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // This creates more short live intervals.
+ unsigned LDist = closestSucc(left);
+ unsigned RDist = closestSucc(right);
+ if (LDist != RDist)
+ return LDist < RDist;
+
+  // How many registers become live when the node is scheduled.
+ unsigned LScratch = calcMaxScratches(left);
+ unsigned RScratch = calcMaxScratches(right);
+ if (LScratch != RScratch)
+ return LScratch > RScratch;
+
+ if (left->getHeight() != right->getHeight())
+ return left->getHeight() > right->getHeight();
+
+ if (left->getDepth() != right->getDepth())
+ return left->getDepth() < right->getDepth();
+
+ assert(left->NodeQueueId && right->NodeQueueId &&
+ "NodeQueueId cannot be zero");
+ return (left->NodeQueueId > right->NodeQueueId);
+}
+
+template<class SF>
+bool
+RegReductionPriorityQueue<SF>::canClobber(const SUnit *SU, const SUnit *Op) {
+ if (SU->isTwoAddress) {
+ unsigned Opc = SU->getNode()->getMachineOpcode();
+ const TargetInstrDesc &TID = TII->get(Opc);
+ unsigned NumRes = TID.getNumDefs();
+ unsigned NumOps = TID.getNumOperands() - NumRes;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ if (TID.getOperandConstraint(i+NumRes, TOI::TIED_TO) != -1) {
+ SDNode *DU = SU->getNode()->getOperand(i).getNode();
+ if (DU->getNodeId() != -1 &&
+ Op->OrigNode == &(*SUnits)[DU->getNodeId()])
+ return true;
+ }
+ }
+ }
+ return false;
+}
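+
+// For illustration (hypothetical x86-flavored example): given the
+// two-address instruction t1 = ADD32rr t0, t2, operand t0 is tied to the
+// result, so canClobber(SU(add), SU(def of t0)) returns true -- the add
+// will overwrite t0's register when the copy is coalesced away.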
+
+/// hasCopyToRegUse - Return true if SU has a value successor that is a
+/// CopyToReg node.
+static bool hasCopyToRegUse(const SUnit *SU) {
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue;
+ const SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg)
+ return true;
+ }
+ return false;
+}
+
+/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
+/// physical register defs.
+static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ SDNode *N = SuccSU->getNode();
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ const unsigned *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs();
+ assert(ImpDefs && "Caller should check hasPhysRegDefs");
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getFlaggedNode()) {
+ if (!SUNode->isMachineOpcode())
+ continue;
+ const unsigned *SUImpDefs =
+ TII->get(SUNode->getMachineOpcode()).getImplicitDefs();
+ if (!SUImpDefs)
+ return false;
+ for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getValueType(i);
+ if (VT == MVT::Flag || VT == MVT::Other)
+ continue;
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ unsigned Reg = ImpDefs[i - NumDefs];
+ for (;*SUImpDefs; ++SUImpDefs) {
+ unsigned SUReg = *SUImpDefs;
+ if (TRI->regsOverlap(Reg, SUReg))
+ return true;
+ }
+ }
+ }
+ return false;
+}
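+
+// Illustrative example (assumed): if one of SuccSU's extra results lives in
+// a physical register such as EFLAGS, and a node glued into SU also
+// implicitly defines EFLAGS, the registers overlap and this returns true.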
+
+/// PrescheduleNodesWithMultipleUses - Nodes with multiple uses
+/// are not handled well by the general register pressure reduction
+/// heuristics. When presented with code like this:
+///
+/// N
+/// / |
+/// / |
+/// U store
+/// |
+/// ...
+///
+/// the heuristics tend to push the store up, but since the
+/// operand of the store has another use (U), this would increase
+/// the length of that other use (the U->N edge).
+///
+/// This function transforms code like the above to route U's
+/// dependence through the store when possible, like this:
+///
+/// N
+/// ||
+/// ||
+/// store
+/// |
+/// U
+/// |
+/// ...
+///
+/// This results in the store being scheduled immediately
+/// after N, which shortens the U->N live range, reducing
+/// register pressure.
+///
+template<class SF>
+void RegReductionPriorityQueue<SF>::PrescheduleNodesWithMultipleUses() {
+ // Visit all the nodes in topological order, working top-down.
+ for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
+ SUnit *SU = &(*SUnits)[i];
+ // For now, only look at nodes with no data successors, such as stores.
+ // These are especially important, due to the heuristics in
+ // getNodePriority for nodes with no data successors.
+ if (SU->NumSuccs != 0)
+ continue;
+ // For now, only look at nodes with exactly one data predecessor.
+ if (SU->NumPreds != 1)
+ continue;
+ // Avoid prescheduling copies to virtual registers, which don't behave
+ // like other nodes from the perspective of scheduling heuristics.
+ if (SDNode *N = SU->getNode())
+ if (N->getOpcode() == ISD::CopyToReg &&
+ TargetRegisterInfo::isVirtualRegister
+ (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+ continue;
+
+ // Locate the single data predecessor.
+ SUnit *PredSU = 0;
+ for (SUnit::const_pred_iterator II = SU->Preds.begin(),
+ EE = SU->Preds.end(); II != EE; ++II)
+ if (!II->isCtrl()) {
+ PredSU = II->getSUnit();
+ break;
+ }
+ assert(PredSU);
+
+ // Don't rewrite edges that carry physregs, because that requires additional
+ // support infrastructure.
+ if (PredSU->hasPhysRegDefs)
+ continue;
+ // Short-circuit the case where SU is PredSU's only data successor.
+ if (PredSU->NumSuccs == 1)
+ continue;
+    // Avoid prescheduling copies from virtual registers, which don't behave
+    // like other nodes from the perspective of scheduling heuristics.
+ if (SDNode *N = SU->getNode())
+ if (N->getOpcode() == ISD::CopyFromReg &&
+ TargetRegisterInfo::isVirtualRegister
+ (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+ continue;
+
+ // Perform checks on the successors of PredSU.
+ for (SUnit::const_succ_iterator II = PredSU->Succs.begin(),
+ EE = PredSU->Succs.end(); II != EE; ++II) {
+ SUnit *PredSuccSU = II->getSUnit();
+ if (PredSuccSU == SU) continue;
+ // If PredSU has another successor with no data successors, for
+ // now don't attempt to choose either over the other.
+ if (PredSuccSU->NumSuccs == 0)
+ goto outer_loop_continue;
+ // Don't break physical register dependencies.
+ if (SU->hasPhysRegClobbers && PredSuccSU->hasPhysRegDefs)
+ if (canClobberPhysRegDefs(PredSuccSU, SU, TII, TRI))
+ goto outer_loop_continue;
+ // Don't introduce graph cycles.
+ if (scheduleDAG->IsReachable(SU, PredSuccSU))
+ goto outer_loop_continue;
+ }
+
+ // Ok, the transformation is safe and the heuristics suggest it is
+ // profitable. Update the graph.
+ DOUT << "Prescheduling SU # " << SU->NodeNum
+ << " next to PredSU # " << PredSU->NodeNum
+ << " to guide scheduling in the presence of multiple uses\n";
+ for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
+ SDep Edge = PredSU->Succs[i];
+ assert(!Edge.isAssignedRegDep());
+ SUnit *SuccSU = Edge.getSUnit();
+ if (SuccSU != SU) {
+ Edge.setSUnit(PredSU);
+ scheduleDAG->RemovePred(SuccSU, Edge);
+ scheduleDAG->AddPred(SU, Edge);
+ Edge.setSUnit(SU);
+ scheduleDAG->AddPred(SuccSU, Edge);
+ --i;
+ }
+ }
+ outer_loop_continue:;
+ }
+}
+
+/// AddPseudoTwoAddrDeps - If two nodes share an operand and one of them uses
+/// it as a def&use operand, add a pseudo control edge from it to the other
+/// node (if it won't create a cycle) so the two-address one will be scheduled
+/// first (lower in the schedule). If both nodes are two-address, favor the
+/// one that has a CopyToReg use (more likely to be a loop induction update).
+/// If both are two-address, but one is commutable while the other is not
+/// commutable, favor the one that's not commutable.
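+///
+/// For illustration (hedged, x86-flavored example): if t0 has two uses,
+///   t1 = ADD32ri t0, 1   // two-address: t1 is tied to t0
+///   t2 = MOV32rr t0      // a plain use
+/// the artificial edge forces the MOV above the ADD in the final order, so
+/// the ADD is the last use of t0 and t0/t1 can share a register after
+/// coalescing.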
+template<class SF>
+void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() {
+ for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
+ SUnit *SU = &(*SUnits)[i];
+ if (!SU->isTwoAddress)
+ continue;
+
+ SDNode *Node = SU->getNode();
+ if (!Node || !Node->isMachineOpcode() || SU->getNode()->getFlaggedNode())
+ continue;
+
+ unsigned Opc = Node->getMachineOpcode();
+ const TargetInstrDesc &TID = TII->get(Opc);
+ unsigned NumRes = TID.getNumDefs();
+ unsigned NumOps = TID.getNumOperands() - NumRes;
+ for (unsigned j = 0; j != NumOps; ++j) {
+ if (TID.getOperandConstraint(j+NumRes, TOI::TIED_TO) == -1)
+ continue;
+ SDNode *DU = SU->getNode()->getOperand(j).getNode();
+ if (DU->getNodeId() == -1)
+ continue;
+ const SUnit *DUSU = &(*SUnits)[DU->getNodeId()];
+ if (!DUSU) continue;
+ for (SUnit::const_succ_iterator I = DUSU->Succs.begin(),
+ E = DUSU->Succs.end(); I != E; ++I) {
+ if (I->isCtrl()) continue;
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU == SU)
+ continue;
+ // Be conservative. Ignore if nodes aren't at roughly the same
+ // depth and height.
+ if (SuccSU->getHeight() < SU->getHeight() &&
+ (SU->getHeight() - SuccSU->getHeight()) > 1)
+ continue;
+ // Skip past COPY_TO_REGCLASS nodes, so that the pseudo edge
+ // constrains whatever is using the copy, instead of the copy
+ // itself. In the case that the copy is coalesced, this
+        // preserves the intent of the pseudo two-address heuristics.
+ while (SuccSU->Succs.size() == 1 &&
+ SuccSU->getNode()->isMachineOpcode() &&
+ SuccSU->getNode()->getMachineOpcode() ==
+ TargetInstrInfo::COPY_TO_REGCLASS)
+ SuccSU = SuccSU->Succs.front().getSUnit();
+ // Don't constrain non-instruction nodes.
+ if (!SuccSU->getNode() || !SuccSU->getNode()->isMachineOpcode())
+ continue;
+ // Don't constrain nodes with physical register defs if the
+ // predecessor can clobber them.
+ if (SuccSU->hasPhysRegDefs && SU->hasPhysRegClobbers) {
+ if (canClobberPhysRegDefs(SuccSU, SU, TII, TRI))
+ continue;
+ }
+ // Don't constrain EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG;
+ // these may be coalesced away. We want them close to their uses.
+ unsigned SuccOpc = SuccSU->getNode()->getMachineOpcode();
+ if (SuccOpc == TargetInstrInfo::EXTRACT_SUBREG ||
+ SuccOpc == TargetInstrInfo::INSERT_SUBREG ||
+ SuccOpc == TargetInstrInfo::SUBREG_TO_REG)
+ continue;
+ if ((!canClobber(SuccSU, DUSU) ||
+ (hasCopyToRegUse(SU) && !hasCopyToRegUse(SuccSU)) ||
+ (!SU->isCommutable && SuccSU->isCommutable)) &&
+ !scheduleDAG->IsReachable(SuccSU, SU)) {
+ DOUT << "Adding a pseudo-two-addr edge from SU # " << SU->NodeNum
+ << " to SU #" << SuccSU->NodeNum << "\n";
+ scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Order, /*Latency=*/0,
+ /*Reg=*/0, /*isNormalMemory=*/false,
+ /*isMustAlias=*/false,
+ /*isArtificial=*/true));
+ }
+ }
+ }
+ }
+}
+
+/// CalculateSethiUllmanNumbers - Calculate Sethi-Ullman numbers of all
+/// scheduling units.
+template<class SF>
+void RegReductionPriorityQueue<SF>::CalculateSethiUllmanNumbers() {
+ SethiUllmanNumbers.assign(SUnits->size(), 0);
+
+ for (unsigned i = 0, e = SUnits->size(); i != e; ++i)
+ CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers);
+}
+
+/// LimitedSumOfUnscheduledPredsOfSuccs - Compute the sum of the unscheduled
+/// predecessors of the successors of the SUnit SU. Stop when the provided
+/// limit is exceeded.
+static unsigned LimitedSumOfUnscheduledPredsOfSuccs(const SUnit *SU,
+ unsigned Limit) {
+ unsigned Sum = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ const SUnit *SuccSU = I->getSUnit();
+ for (SUnit::const_pred_iterator II = SuccSU->Preds.begin(),
+ EE = SuccSU->Preds.end(); II != EE; ++II) {
+ SUnit *PredSU = II->getSUnit();
+ if (!PredSU->isScheduled)
+ if (++Sum > Limit)
+ return Sum;
+ }
+ }
+ return Sum;
+}
+
+// Top down
+bool td_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+ unsigned LPriority = SPQ->getNodePriority(left);
+ unsigned RPriority = SPQ->getNodePriority(right);
+ bool LIsTarget = left->getNode() && left->getNode()->isMachineOpcode();
+ bool RIsTarget = right->getNode() && right->getNode()->isMachineOpcode();
+ bool LIsFloater = LIsTarget && left->NumPreds == 0;
+ bool RIsFloater = RIsTarget && right->NumPreds == 0;
+ unsigned LBonus = (LimitedSumOfUnscheduledPredsOfSuccs(left,1) == 1) ? 2 : 0;
+ unsigned RBonus = (LimitedSumOfUnscheduledPredsOfSuccs(right,1) == 1) ? 2 : 0;
+
+ if (left->NumSuccs == 0 && right->NumSuccs != 0)
+ return false;
+ else if (left->NumSuccs != 0 && right->NumSuccs == 0)
+ return true;
+
+ if (LIsFloater)
+ LBonus -= 2;
+ if (RIsFloater)
+ RBonus -= 2;
+ if (left->NumSuccs == 1)
+ LBonus += 2;
+ if (right->NumSuccs == 1)
+ RBonus += 2;
+
+ if (LPriority+LBonus != RPriority+RBonus)
+ return LPriority+LBonus < RPriority+RBonus;
+
+ if (left->getDepth() != right->getDepth())
+ return left->getDepth() < right->getDepth();
+
+ if (left->NumSuccsLeft != right->NumSuccsLeft)
+ return left->NumSuccsLeft > right->NumSuccsLeft;
+
+ assert(left->NodeQueueId && right->NodeQueueId &&
+ "NodeQueueId cannot be zero");
+ return (left->NodeQueueId > right->NodeQueueId);
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
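+
+// Illustrative usage (assumed, not part of this file): these factories are
+// normally registered with the scheduler registry and selected via llc's
+// -pre-RA-sched option, e.g. -pre-RA-sched=list-burr for the bottom-up
+// register-reduction scheduler created below.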
+
+llvm::ScheduleDAGSDNodes *
+llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ const TargetMachine &TM = IS->TM;
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+ BURegReductionPriorityQueue *PQ = new BURegReductionPriorityQueue(TII, TRI);
+
+ ScheduleDAGRRList *SD =
+ new ScheduleDAGRRList(*IS->MF, true, PQ);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createTDRRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ const TargetMachine &TM = IS->TM;
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+ TDRegReductionPriorityQueue *PQ = new TDRegReductionPriorityQueue(TII, TRI);
+
+ ScheduleDAGRRList *SD =
+ new ScheduleDAGRRList(*IS->MF, false, PQ);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
new file mode 100644
index 0000000..7aa15bc
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -0,0 +1,294 @@
+//===--- ScheduleDAGSDNodes.cpp - Implement the ScheduleDAGSDNodes class --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAGSDNodes class, which is a base class used
+// by the SDNode-based scheduling implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
+ : ScheduleDAG(mf) {
+}
+
+/// Run - perform scheduling.
+///
+void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb,
+ MachineBasicBlock::iterator insertPos) {
+ DAG = dag;
+ ScheduleDAG::Run(bb, insertPos);
+}
+
+SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
+ SUnit *SU = NewSUnit(Old->getNode());
+ SU->OrigNode = Old->OrigNode;
+ SU->Latency = Old->Latency;
+ SU->isTwoAddress = Old->isTwoAddress;
+ SU->isCommutable = Old->isCommutable;
+ SU->hasPhysRegDefs = Old->hasPhysRegDefs;
+ SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
+ Old->isCloned = true;
+ return SU;
+}
+
+/// CheckForPhysRegDependency - Check if the dependency between def and use of
+/// a specified operand is a physical register dependency. If so, returns the
+/// register and the cost of copying the register.
+static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
+ const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII,
+ unsigned &PhysReg, int &Cost) {
+ if (Op != 2 || User->getOpcode() != ISD::CopyToReg)
+ return;
+
+ unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return;
+
+ unsigned ResNo = User->getOperand(2).getResNo();
+ if (Def->isMachineOpcode()) {
+ const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
+ if (ResNo >= II.getNumDefs() &&
+ II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) {
+ PhysReg = Reg;
+ const TargetRegisterClass *RC =
+ TRI->getPhysicalRegisterRegClass(Reg, Def->getValueType(ResNo));
+ Cost = RC->getCopyCost();
+ }
+ }
+}
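+
+// Illustrative example (x86-flavored, assumed): IDIV32r implicitly defines
+// EAX and EDX; if its remainder result is consumed by a CopyToReg of EDX,
+// the edge is recorded as a physical register dependency with GR32's copy
+// cost.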
+
+void ScheduleDAGSDNodes::BuildSchedUnits() {
+ // During scheduling, the NodeId field of SDNode is used to map SDNodes
+ // to their associated SUnits by holding SUnits table indices. A value
+ // of -1 means the SDNode does not yet have an associated SUnit.
+ unsigned NumNodes = 0;
+ for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
+ E = DAG->allnodes_end(); NI != E; ++NI) {
+ NI->setNodeId(-1);
+ ++NumNodes;
+ }
+
+ // Reserve entries in the vector for each of the SUnits we are creating. This
+  // ensures that reallocation of the vector won't happen, so SUnit*'s won't get
+ // invalidated.
+ // FIXME: Multiply by 2 because we may clone nodes during scheduling.
+ // This is a temporary workaround.
+ SUnits.reserve(NumNodes * 2);
+
+ // Check to see if the scheduler cares about latencies.
+ bool UnitLatencies = ForceUnitLatencies();
+
+ for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
+ E = DAG->allnodes_end(); NI != E; ++NI) {
+ if (isPassiveNode(NI)) // Leaf node, e.g. a TargetImmediate.
+ continue;
+
+ // If this node has already been processed, stop now.
+ if (NI->getNodeId() != -1) continue;
+
+ SUnit *NodeSUnit = NewSUnit(NI);
+
+    // See if anything is flagged to this node; if so, add it to the flagged
+    // nodes. Nodes can have at most one flag input and one flag output. Flags
+ // are required to be the last operand and result of a node.
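+    //
+    // For example (assumed illustration): an ISD::ADDC node feeds its carry,
+    // an MVT::Flag value, into an ISD::ADDE node; the two nodes are grouped
+    // into a single SUnit here so they are always scheduled together.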
+
+ // Scan up to find flagged preds.
+ SDNode *N = NI;
+ while (N->getNumOperands() &&
+ N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Flag) {
+ N = N->getOperand(N->getNumOperands()-1).getNode();
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+ }
+
+ // Scan down to find any flagged succs.
+ N = NI;
+ while (N->getValueType(N->getNumValues()-1) == MVT::Flag) {
+ SDValue FlagVal(N, N->getNumValues()-1);
+
+ // There are either zero or one users of the Flag result.
+ bool HasFlagUse = false;
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+ UI != E; ++UI)
+ if (FlagVal.isOperandOf(*UI)) {
+ HasFlagUse = true;
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+ N = *UI;
+ break;
+ }
+ if (!HasFlagUse) break;
+ }
+
+ // If there are flag operands involved, N is now the bottom-most node
+ // of the sequence of nodes that are flagged together.
+ // Update the SUnit.
+ NodeSUnit->setNode(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+
+ // Assign the Latency field of NodeSUnit using target-provided information.
+ if (UnitLatencies)
+ NodeSUnit->Latency = 1;
+ else
+ ComputeLatency(NodeSUnit);
+ }
+}
+
+void ScheduleDAGSDNodes::AddSchedEdges() {
+ // Pass 2: add the preds, succs, etc.
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+ SUnit *SU = &SUnits[su];
+ SDNode *MainNode = SU->getNode();
+
+ if (MainNode->isMachineOpcode()) {
+ unsigned Opc = MainNode->getMachineOpcode();
+ const TargetInstrDesc &TID = TII->get(Opc);
+ for (unsigned i = 0; i != TID.getNumOperands(); ++i) {
+ if (TID.getOperandConstraint(i, TOI::TIED_TO) != -1) {
+ SU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (TID.isCommutable())
+ SU->isCommutable = true;
+ }
+
+ // Find all predecessors and successors of the group.
+ for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode()) {
+ if (N->isMachineOpcode() &&
+ TII->get(N->getMachineOpcode()).getImplicitDefs()) {
+ SU->hasPhysRegClobbers = true;
+ unsigned NumUsed = CountResults(N);
+ while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
+ --NumUsed; // Skip over unused values at the end.
+ if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
+ SU->hasPhysRegDefs = true;
+ }
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDNode *OpN = N->getOperand(i).getNode();
+ if (isPassiveNode(OpN)) continue; // Not scheduled.
+ SUnit *OpSU = &SUnits[OpN->getNodeId()];
+ assert(OpSU && "Node has no SUnit!");
+ if (OpSU == SU) continue; // In the same group.
+
+ MVT OpVT = N->getOperand(i).getValueType();
+ assert(OpVT != MVT::Flag && "Flagged nodes should be in same sunit!");
+ bool isChain = OpVT == MVT::Other;
+
+ unsigned PhysReg = 0;
+ int Cost = 1;
+ // Determine if this is a physical register dependency.
+ CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
+ assert((PhysReg == 0 || !isChain) &&
+ "Chain dependence via physreg data?");
+        // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, the
+        // scheduler emits a copy from the physical register to a virtual
+        // register unless it requires a cross-class copy (cost < 0). That
+        // means we only treat an "expensive to copy" register dependency as
+        // a physical register dependency. This may change in the future.
+ if (Cost >= 0)
+ PhysReg = 0;
+ SU->addPred(SDep(OpSU, isChain ? SDep::Order : SDep::Data,
+ OpSU->Latency, PhysReg));
+ }
+ }
+ }
+}
+
+/// BuildSchedGraph - Build the SUnit graph from the selection dag that we
+/// are given as input. This SUnit graph is similar to the SelectionDAG, but
+/// excludes nodes that aren't interesting to scheduling, and represents
+/// flagged together nodes with a single SUnit.
+void ScheduleDAGSDNodes::BuildSchedGraph() {
+ // Populate the SUnits array.
+ BuildSchedUnits();
+ // Compute all the scheduling dependencies between nodes.
+ AddSchedEdges();
+}
+
+void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
+ const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+
+ // Compute the latency for the node. We use the sum of the latencies for
+ // all nodes flagged together into this SUnit.
+ SU->Latency = 0;
+ bool SawMachineOpcode = false;
+ for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode())
+ if (N->isMachineOpcode()) {
+ SawMachineOpcode = true;
+ SU->Latency +=
+ InstrItins.getLatency(TII->get(N->getMachineOpcode()).getSchedClass());
+ }
+}
+
+/// CountResults - The results of target nodes have register or immediate
+/// values first, then an optional chain, and optional flag results (which do
+/// not go into the resulting MachineInstr).
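+/// For illustration: a node with the value list (i32, i32, ch, flag) has
+/// CountResults(Node) == 2 -- the chain and flag results are skipped.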
+unsigned ScheduleDAGSDNodes::CountResults(SDNode *Node) {
+ unsigned N = Node->getNumValues();
+ while (N && Node->getValueType(N - 1) == MVT::Flag)
+ --N;
+ if (N && Node->getValueType(N - 1) == MVT::Other)
+ --N; // Skip over chain result.
+ return N;
+}
+
+/// CountOperands - The inputs to target nodes have any actual inputs first,
+/// followed by special operands that describe memory references, then an
+/// optional chain operand, then an optional flag operand. Compute the number
+/// of actual operands that will go into the resulting MachineInstr.
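+/// For illustration: for the operand list (t0, t1, MemOperand, ch, flag)
+/// this returns 2 -- the memory-reference, chain, and flag operands are not
+/// counted.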
+unsigned ScheduleDAGSDNodes::CountOperands(SDNode *Node) {
+ unsigned N = ComputeMemOperandsEnd(Node);
+ while (N && isa<MemOperandSDNode>(Node->getOperand(N - 1).getNode()))
+ --N; // Ignore MEMOPERAND nodes
+ return N;
+}
+
+/// ComputeMemOperandsEnd - Find the index one past the last MemOperandSDNode
+/// operand
+unsigned ScheduleDAGSDNodes::ComputeMemOperandsEnd(SDNode *Node) {
+ unsigned N = Node->getNumOperands();
+ while (N && Node->getOperand(N - 1).getValueType() == MVT::Flag)
+ --N;
+ if (N && Node->getOperand(N - 1).getValueType() == MVT::Other)
+ --N; // Ignore chain if it exists.
+ return N;
+}
+
+void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
+ if (!SU->getNode()) {
+ cerr << "PHYS REG COPY\n";
+ return;
+ }
+
+ SU->getNode()->dump(DAG);
+ cerr << "\n";
+ SmallVector<SDNode *, 4> FlaggedNodes;
+ for (SDNode *N = SU->getNode()->getFlaggedNode(); N; N = N->getFlaggedNode())
+ FlaggedNodes.push_back(N);
+ while (!FlaggedNodes.empty()) {
+ cerr << " ";
+ FlaggedNodes.back()->dump(DAG);
+ cerr << "\n";
+ FlaggedNodes.pop_back();
+ }
+}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
new file mode 100644
index 0000000..2a278b7
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -0,0 +1,179 @@
+//===---- ScheduleDAGSDNodes.h - SDNode Scheduling --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScheduleDAGSDNodes class, which implements
+// scheduling for an SDNode-based dependency graph.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCHEDULEDAGSDNODES_H
+#define SCHEDULEDAGSDNODES_H
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+namespace llvm {
+ /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs.
+ ///
+ /// Edges between SUnits are initially based on edges in the SelectionDAG,
+ /// and additional edges can be added by the schedulers as heuristics.
+ /// SDNodes such as Constants, Registers, and a few others that are not
+ /// interesting to schedulers are not allocated SUnits.
+ ///
+ /// SDNodes with MVT::Flag operands are grouped along with the flagged
+ /// nodes into a single SUnit so that they are scheduled together.
+ ///
+ /// SDNode-based scheduling graphs do not use SDep::Anti or SDep::Output
+ /// edges. Physical register dependence information is not carried in
+ /// the DAG and must be handled explicitly by schedulers.
+ ///
+ class ScheduleDAGSDNodes : public ScheduleDAG {
+ public:
+ SelectionDAG *DAG; // DAG of the current basic block
+
+ explicit ScheduleDAGSDNodes(MachineFunction &mf);
+
+ virtual ~ScheduleDAGSDNodes() {}
+
+ /// Run - perform scheduling.
+ ///
+ void Run(SelectionDAG *dag, MachineBasicBlock *bb,
+ MachineBasicBlock::iterator insertPos);
+
+ /// isPassiveNode - Return true if the node is a non-scheduled leaf.
+ ///
+ static bool isPassiveNode(SDNode *Node) {
+ if (isa<ConstantSDNode>(Node)) return true;
+ if (isa<ConstantFPSDNode>(Node)) return true;
+ if (isa<RegisterSDNode>(Node)) return true;
+ if (isa<GlobalAddressSDNode>(Node)) return true;
+ if (isa<BasicBlockSDNode>(Node)) return true;
+ if (isa<FrameIndexSDNode>(Node)) return true;
+ if (isa<ConstantPoolSDNode>(Node)) return true;
+ if (isa<JumpTableSDNode>(Node)) return true;
+ if (isa<ExternalSymbolSDNode>(Node)) return true;
+ if (isa<MemOperandSDNode>(Node)) return true;
+ if (Node->getOpcode() == ISD::EntryToken) return true;
+ return false;
+ }
+
+    /// NewSUnit - Creates a new SUnit and returns a pointer to it.
+ ///
+ SUnit *NewSUnit(SDNode *N) {
+#ifndef NDEBUG
+ const SUnit *Addr = 0;
+ if (!SUnits.empty())
+ Addr = &SUnits[0];
+#endif
+ SUnits.push_back(SUnit(N, (unsigned)SUnits.size()));
+ assert((Addr == 0 || Addr == &SUnits[0]) &&
+ "SUnits std::vector reallocated on the fly!");
+ SUnits.back().OrigNode = &SUnits.back();
+ return &SUnits.back();
+ }
+
+ /// Clone - Creates a clone of the specified SUnit. It does not copy the
+ /// predecessors / successors info nor the temporary scheduling states.
+ ///
+ SUnit *Clone(SUnit *N);
+
+ /// BuildSchedGraph - Build the SUnit graph from the selection dag that we
+    /// are given as input. This SUnit graph is similar to the SelectionDAG, but
+ /// excludes nodes that aren't interesting to scheduling, and represents
+ /// flagged together nodes with a single SUnit.
+ virtual void BuildSchedGraph();
+
+ /// ComputeLatency - Compute node latency.
+ ///
+ virtual void ComputeLatency(SUnit *SU);
+
+ /// CountResults - The results of target nodes have register or immediate
+    /// values first, then an optional chain, and optional flag results
+    /// (which do not go into the machine instrs).
+ static unsigned CountResults(SDNode *Node);
+
+ /// CountOperands - The inputs to target nodes have any actual inputs first,
+ /// followed by special operands that describe memory references, then an
+ /// optional chain operand, then flag operands. Compute the number of
+ /// actual operands that will go into the resulting MachineInstr.
+ static unsigned CountOperands(SDNode *Node);
+
+ /// ComputeMemOperandsEnd - Find the index one past the last
+ /// MemOperandSDNode operand
+ static unsigned ComputeMemOperandsEnd(SDNode *Node);
+
+    /// EmitNode - Generate machine code for a node and needed dependencies.
+ /// VRBaseMap contains, for each already emitted node, the first virtual
+ /// register number for the results of the node.
+ ///
+ void EmitNode(SDNode *Node, bool IsClone, bool HasClone,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ virtual MachineBasicBlock *EmitSchedule();
+
+ /// Schedule - Order nodes according to selected style, filling
+ /// in the Sequence member.
+ ///
+ virtual void Schedule() = 0;
+
+ virtual void dumpNode(const SUnit *SU) const;
+
+ virtual std::string getGraphNodeLabel(const SUnit *SU) const;
+
+ virtual void getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const;
+
+ private:
+ /// EmitSubregNode - Generate machine code for subreg nodes.
+ ///
+ void EmitSubregNode(SDNode *Node,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS
+ /// nodes.
+ ///
+ void EmitCopyToRegClassNode(SDNode *Node,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// getVR - Return the virtual register corresponding to the specified result
+ /// of the specified node.
+ unsigned getVR(SDValue Op, DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// getDstOfCopyToRegUse - If the only use of the specified result number of
+ /// node is a CopyToReg, return its destination register. Return 0 otherwise.
+ unsigned getDstOfOnlyCopyToRegUse(SDNode *Node, unsigned ResNo) const;
+
+ void AddOperand(MachineInstr *MI, SDValue Op, unsigned IIOpNum,
+ const TargetInstrDesc *II,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// AddRegisterOperand - Add the specified register as an operand to the
+ /// specified machine instr. Insert register copies if the register is
+ /// not in the required register class.
+ void AddRegisterOperand(MachineInstr *MI, SDValue Op,
+ unsigned IIOpNum, const TargetInstrDesc *II,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+    /// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
+ /// implicit physical register output.
+ void EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
+ bool IsCloned, unsigned SrcReg,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ void CreateVirtualRegisters(SDNode *Node, MachineInstr *MI,
+ const TargetInstrDesc &II, bool IsClone,
+ bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// BuildSchedUnits, AddSchedEdges - Helper functions for BuildSchedGraph.
+ void BuildSchedUnits();
+ void AddSchedEdges();
+ };
+}
+
+#endif
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodesEmit.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodesEmit.cpp
new file mode 100644
index 0000000..fb5e207
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodesEmit.cpp
@@ -0,0 +1,668 @@
+//===- ScheduleDAGSDNodesEmit.cpp - Emit routines for ScheduleDAGSDNodes -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the Emit routines for the ScheduleDAGSDNodes class, which
+// creates MachineInstrs according to the computed schedule.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+/// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
+/// implicit physical register output.
+void ScheduleDAGSDNodes::EmitCopyFromReg(SDNode *Node, unsigned ResNo,
+ bool IsClone, bool IsCloned,
+ unsigned SrcReg,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ unsigned VRBase = 0;
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ // Just use the input register directly!
+ SDValue Op(Node, ResNo);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
+ isNew = isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ return;
+ }
+
+ // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+ // the CopyToReg'd destination register instead of creating a new vreg.
+ bool MatchReg = true;
+ const TargetRegisterClass *UseRC = NULL;
+ if (!IsClone && !IsCloned)
+ for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+ UI != E; ++UI) {
+ SDNode *User = *UI;
+ bool Match = true;
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node &&
+ User->getOperand(2).getResNo() == ResNo) {
+ unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ VRBase = DestReg;
+ Match = false;
+ } else if (DestReg != SrcReg)
+ Match = false;
+ } else {
+ for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+ SDValue Op = User->getOperand(i);
+ if (Op.getNode() != Node || Op.getResNo() != ResNo)
+ continue;
+ MVT VT = Node->getValueType(Op.getResNo());
+ if (VT == MVT::Other || VT == MVT::Flag)
+ continue;
+ Match = false;
+ if (User->isMachineOpcode()) {
+ const TargetInstrDesc &II = TII->get(User->getMachineOpcode());
+ const TargetRegisterClass *RC =
+ getInstrOperandRegClass(TRI, II, i+II.getNumDefs());
+ if (!UseRC)
+ UseRC = RC;
+ else if (RC) {
+ if (UseRC->hasSuperClass(RC))
+ UseRC = RC;
+ else
+ assert((UseRC == RC || RC->hasSuperClass(UseRC)) &&
+ "Multiple uses expecting different register classes!");
+ }
+ }
+ }
+ }
+ MatchReg &= Match;
+ if (VRBase)
+ break;
+ }
+
+ MVT VT = Node->getValueType(ResNo);
+ const TargetRegisterClass *SrcRC = 0, *DstRC = 0;
+ SrcRC = TRI->getPhysicalRegisterRegClass(SrcReg, VT);
+
+ // Figure out the register class to create for the destreg.
+ if (VRBase) {
+ DstRC = MRI.getRegClass(VRBase);
+ } else if (UseRC) {
+ assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!");
+ DstRC = UseRC;
+ } else {
+ DstRC = TLI->getRegClassFor(VT);
+ }
+
+ // If all uses are reading from the src physical register and copying the
+ // register is either impossible or very expensive, then don't create a copy.
+ if (MatchReg && SrcRC->getCopyCost() < 0) {
+ VRBase = SrcReg;
+ } else {
+ // Create the reg, emit the copy.
+ VRBase = MRI.createVirtualRegister(DstRC);
+ bool Emitted = TII->copyRegToReg(*BB, InsertPos, VRBase, SrcReg,
+ DstRC, SrcRC);
+
+ assert(Emitted && "Unable to issue a copy instruction!\n");
+ (void) Emitted;
+ }
+
+ SDValue Op(Node, ResNo);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ isNew = isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
+
+/// getDstOfCopyToRegUse - If the only use of the specified result number of
+/// node is a CopyToReg, return its destination register. Return 0 otherwise.
+unsigned ScheduleDAGSDNodes::getDstOfOnlyCopyToRegUse(SDNode *Node,
+ unsigned ResNo) const {
+ if (!Node->hasOneUse())
+ return 0;
+
+ SDNode *User = *Node->use_begin();
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node &&
+ User->getOperand(2).getResNo() == ResNo) {
+ unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return Reg;
+ }
+ return 0;
+}
+
+void ScheduleDAGSDNodes::CreateVirtualRegisters(SDNode *Node, MachineInstr *MI,
+ const TargetInstrDesc &II,
+ bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ assert(Node->getMachineOpcode() != TargetInstrInfo::IMPLICIT_DEF &&
+ "IMPLICIT_DEF should have been handled as a special case elsewhere!");
+
+ for (unsigned i = 0; i < II.getNumDefs(); ++i) {
+ // If the specific node value is only used by a CopyToReg and the dest reg
+ // is a vreg in the same register class, use the CopyToReg'd destination
+ // register instead of creating a new vreg.
+ unsigned VRBase = 0;
+ const TargetRegisterClass *RC = getInstrOperandRegClass(TRI, II, i);
+
+ if (!IsClone && !IsCloned)
+ for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+ UI != E; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node &&
+ User->getOperand(2).getResNo() == i) {
+ unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
+ if (RegRC == RC) {
+ VRBase = Reg;
+ MI->addOperand(MachineOperand::CreateReg(Reg, true));
+ break;
+ }
+ }
+ }
+ }
+
+ // Create the result registers for this node and add the result regs to
+ // the machine instruction.
+ if (VRBase == 0) {
+ assert(RC && "Isn't a register operand!");
+ VRBase = MRI.createVirtualRegister(RC);
+ MI->addOperand(MachineOperand::CreateReg(VRBase, true));
+ }
+
+ SDValue Op(Node, i);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ isNew = isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ }
+}
+
+/// getVR - Return the virtual register corresponding to the specified result
+/// of the specified node.
+unsigned ScheduleDAGSDNodes::getVR(SDValue Op,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ if (Op.isMachineOpcode() &&
+ Op.getMachineOpcode() == TargetInstrInfo::IMPLICIT_DEF) {
+ // Add an IMPLICIT_DEF instruction before every use.
+ unsigned VReg = getDstOfOnlyCopyToRegUse(Op.getNode(), Op.getResNo());
+ // IMPLICIT_DEF can produce any type of result so its TargetInstrDesc
+ // does not include operand register class info.
+ if (!VReg) {
+ const TargetRegisterClass *RC = TLI->getRegClassFor(Op.getValueType());
+ VReg = MRI.createVirtualRegister(RC);
+ }
+ BuildMI(BB, Op.getDebugLoc(), TII->get(TargetInstrInfo::IMPLICIT_DEF),VReg);
+ return VReg;
+ }
+
+ DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+ assert(I != VRBaseMap.end() && "Node emitted out of order - late");
+ return I->second;
+}
+
+
+/// AddRegisterOperand - Add the specified register as an operand to the
+/// specified machine instr. Insert register copies if the register is
+/// not in the required register class.
+void
+ScheduleDAGSDNodes::AddRegisterOperand(MachineInstr *MI, SDValue Op,
+ unsigned IIOpNum,
+ const TargetInstrDesc *II,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ assert(Op.getValueType() != MVT::Other &&
+ Op.getValueType() != MVT::Flag &&
+ "Chain and flag operands should occur at end of operand list!");
+ // Get/emit the operand.
+ unsigned VReg = getVR(Op, VRBaseMap);
+ assert(TargetRegisterInfo::isVirtualRegister(VReg) && "Not a vreg?");
+
+ const TargetInstrDesc &TID = MI->getDesc();
+ bool isOptDef = IIOpNum < TID.getNumOperands() &&
+ TID.OpInfo[IIOpNum].isOptionalDef();
+
+ // If the instruction requires a register in a different class, create
+ // a new virtual register and copy the value into it.
+ if (II) {
+ const TargetRegisterClass *SrcRC =
+ MRI.getRegClass(VReg);
+ const TargetRegisterClass *DstRC =
+ getInstrOperandRegClass(TRI, *II, IIOpNum);
+ assert((DstRC || (TID.isVariadic() && IIOpNum >= TID.getNumOperands())) &&
+ "Don't have operand info for this instruction!");
+ if (DstRC && SrcRC != DstRC && !SrcRC->hasSuperClass(DstRC)) {
+ unsigned NewVReg = MRI.createVirtualRegister(DstRC);
+ bool Emitted = TII->copyRegToReg(*BB, InsertPos, NewVReg, VReg,
+ DstRC, SrcRC);
+ assert(Emitted && "Unable to issue a copy instruction!\n");
+ (void) Emitted;
+ VReg = NewVReg;
+ }
+ }
+
+ MI->addOperand(MachineOperand::CreateReg(VReg, isOptDef));
+}
+
+/// AddOperand - Add the specified operand to the specified machine instr. II
+/// specifies the instruction information for the node, and IIOpNum is the
+/// operand number (in the II) that we are adding. IIOpNum and II are used for
+/// assertions only.
+void ScheduleDAGSDNodes::AddOperand(MachineInstr *MI, SDValue Op,
+ unsigned IIOpNum,
+ const TargetInstrDesc *II,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ if (Op.isMachineOpcode()) {
+ AddRegisterOperand(MI, Op, IIOpNum, II, VRBaseMap);
+ } else if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ MI->addOperand(MachineOperand::CreateImm(C->getZExtValue()));
+ } else if (ConstantFPSDNode *F = dyn_cast<ConstantFPSDNode>(Op)) {
+ const ConstantFP *CFP = F->getConstantFPValue();
+ MI->addOperand(MachineOperand::CreateFPImm(CFP));
+ } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
+ MI->addOperand(MachineOperand::CreateReg(R->getReg(), false));
+ } else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ MI->addOperand(MachineOperand::CreateGA(TGA->getGlobal(),TGA->getOffset()));
+ } else if (BasicBlockSDNode *BBNode = dyn_cast<BasicBlockSDNode>(Op)) {
+ MI->addOperand(MachineOperand::CreateMBB(BBNode->getBasicBlock()));
+ } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) {
+ MI->addOperand(MachineOperand::CreateFI(FI->getIndex()));
+ } else if (JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op)) {
+ MI->addOperand(MachineOperand::CreateJTI(JT->getIndex()));
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op)) {
+ int Offset = CP->getOffset();
+ unsigned Align = CP->getAlignment();
+ const Type *Type = CP->getType();
+ // MachineConstantPool wants an explicit alignment.
+ if (Align == 0) {
+ Align = TM.getTargetData()->getPrefTypeAlignment(Type);
+ if (Align == 0) {
+ // Alignment of vector types. FIXME!
+ Align = TM.getTargetData()->getTypeAllocSize(Type);
+ }
+ }
+
+ unsigned Idx;
+ if (CP->isMachineConstantPoolEntry())
+ Idx = ConstPool->getConstantPoolIndex(CP->getMachineCPVal(), Align);
+ else
+ Idx = ConstPool->getConstantPoolIndex(CP->getConstVal(), Align);
+ MI->addOperand(MachineOperand::CreateCPI(Idx, Offset));
+ } else if (ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op)) {
+ MI->addOperand(MachineOperand::CreateES(ES->getSymbol()));
+ } else {
+ assert(Op.getValueType() != MVT::Other &&
+ Op.getValueType() != MVT::Flag &&
+ "Chain and flag operands should occur at end of operand list!");
+ AddRegisterOperand(MI, Op, IIOpNum, II, VRBaseMap);
+ }
+}
+
+/// getSuperRegisterRegClass - Returns the register class of a superreg A whose
+/// "SubIdx"'th sub-register class is the specified register class and whose
+/// type matches the specified type.
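+/// For illustration (x86-flavored, assumed): with TRC = GR32, SubIdx = the
+/// 32-bit sub-register index, and VT = i64, this returns GR64, since GR64's
+/// 32-bit sub-register class is GR32 and GR64 has type i64.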
+static const TargetRegisterClass*
+getSuperRegisterRegClass(const TargetRegisterClass *TRC,
+ unsigned SubIdx, MVT VT) {
+  // Pick the register class of the superregister for this type.
+ for (TargetRegisterInfo::regclass_iterator I = TRC->superregclasses_begin(),
+ E = TRC->superregclasses_end(); I != E; ++I)
+ if ((*I)->hasType(VT) && (*I)->getSubRegisterRegClass(SubIdx) == TRC)
+ return *I;
+ assert(false && "Couldn't find the register class");
+ return 0;
+}
+
+/// EmitSubregNode - Generate machine code for subreg nodes.
+///
+void ScheduleDAGSDNodes::EmitSubregNode(SDNode *Node,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ unsigned VRBase = 0;
+ unsigned Opc = Node->getMachineOpcode();
+
+ // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+ // the CopyToReg'd destination register instead of creating a new vreg.
+ for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+ UI != E; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node) {
+ unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ VRBase = DestReg;
+ break;
+ }
+ }
+ }
+
+ if (Opc == TargetInstrInfo::EXTRACT_SUBREG) {
+ unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+
+ // Create the extract_subreg machine instruction.
+ MachineInstr *MI = BuildMI(MF, Node->getDebugLoc(),
+ TII->get(TargetInstrInfo::EXTRACT_SUBREG));
+
+ // Figure out the register class to create for the destreg.
+ unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+ const TargetRegisterClass *TRC = MRI.getRegClass(VReg);
+ const TargetRegisterClass *SRC = TRC->getSubRegisterRegClass(SubIdx);
+ assert(SRC && "Invalid subregister index in EXTRACT_SUBREG");
+
+ // Figure out the register class to create for the destreg.
+ // Note that if we're going to directly use an existing register,
+ // it must be precisely the required class, and not a subclass
+ // thereof.
+ if (VRBase == 0 || SRC != MRI.getRegClass(VRBase)) {
+ // Create the reg
+ assert(SRC && "Couldn't find source register class");
+ VRBase = MRI.createVirtualRegister(SRC);
+ }
+
+ // Add def, source, and subreg index
+ MI->addOperand(MachineOperand::CreateReg(VRBase, true));
+ AddOperand(MI, Node->getOperand(0), 0, 0, VRBaseMap);
+ MI->addOperand(MachineOperand::CreateImm(SubIdx));
+ BB->insert(InsertPos, MI);
+ } else if (Opc == TargetInstrInfo::INSERT_SUBREG ||
+ Opc == TargetInstrInfo::SUBREG_TO_REG) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue N2 = Node->getOperand(2);
+ unsigned SubReg = getVR(N1, VRBaseMap);
+ unsigned SubIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+ const TargetRegisterClass *TRC = MRI.getRegClass(SubReg);
+ const TargetRegisterClass *SRC =
+ getSuperRegisterRegClass(TRC, SubIdx,
+ Node->getValueType(0));
+
+ // Figure out the register class to create for the destreg.
+ // Note that if we're going to directly use an existing register,
+ // it must be precisely the required class, and not a subclass
+ // thereof.
+ if (VRBase == 0 || SRC != MRI.getRegClass(VRBase)) {
+ // Create the reg
+ assert(SRC && "Couldn't find source register class");
+ VRBase = MRI.createVirtualRegister(SRC);
+ }
+
+ // Create the insert_subreg or subreg_to_reg machine instruction.
+ MachineInstr *MI = BuildMI(MF, Node->getDebugLoc(), TII->get(Opc));
+ MI->addOperand(MachineOperand::CreateReg(VRBase, true));
+
+ // If creating a subreg_to_reg, then the first input operand
+ // is an implicit value immediate, otherwise it's a register
+ if (Opc == TargetInstrInfo::SUBREG_TO_REG) {
+ const ConstantSDNode *SD = cast<ConstantSDNode>(N0);
+ MI->addOperand(MachineOperand::CreateImm(SD->getZExtValue()));
+ } else
+ AddOperand(MI, N0, 0, 0, VRBaseMap);
+    // Add the subregister being inserted.
+ AddOperand(MI, N1, 0, 0, VRBaseMap);
+ MI->addOperand(MachineOperand::CreateImm(SubIdx));
+ BB->insert(InsertPos, MI);
+ } else
+ assert(0 && "Node is not insert_subreg, extract_subreg, or subreg_to_reg");
+
+ SDValue Op(Node, 0);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ isNew = isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
+
+/// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+/// COPY_TO_REGCLASS is just a normal copy, except that the destination
+/// register is constrained to be in a particular register class.
+///
+void
+ScheduleDAGSDNodes::EmitCopyToRegClassNode(SDNode *Node,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(VReg);
+
+ unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ const TargetRegisterClass *DstRC = TRI->getRegClass(DstRCIdx);
+
+ // Create the new VReg in the destination class and emit a copy.
+ unsigned NewVReg = MRI.createVirtualRegister(DstRC);
+ bool Emitted = TII->copyRegToReg(*BB, InsertPos, NewVReg, VReg,
+ DstRC, SrcRC);
+ assert(Emitted &&
+ "Unable to issue a copy instruction for a COPY_TO_REGCLASS node!\n");
+ (void) Emitted;
+
+ SDValue Op(Node, 0);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+ isNew = isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
+
+/// EmitNode - Generate machine code for a node and needed dependencies.
+///
+void ScheduleDAGSDNodes::EmitNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+  // If this is a machine instruction, emit it directly.
+ if (Node->isMachineOpcode()) {
+ unsigned Opc = Node->getMachineOpcode();
+
+ // Handle subreg insert/extract specially
+ if (Opc == TargetInstrInfo::EXTRACT_SUBREG ||
+ Opc == TargetInstrInfo::INSERT_SUBREG ||
+ Opc == TargetInstrInfo::SUBREG_TO_REG) {
+ EmitSubregNode(Node, VRBaseMap);
+ return;
+ }
+
+ // Handle COPY_TO_REGCLASS specially.
+ if (Opc == TargetInstrInfo::COPY_TO_REGCLASS) {
+ EmitCopyToRegClassNode(Node, VRBaseMap);
+ return;
+ }
+
+ if (Opc == TargetInstrInfo::IMPLICIT_DEF)
+ // We want a unique VR for each IMPLICIT_DEF use.
+ return;
+
+ const TargetInstrDesc &II = TII->get(Opc);
+ unsigned NumResults = CountResults(Node);
+ unsigned NodeOperands = CountOperands(Node);
+ unsigned MemOperandsEnd = ComputeMemOperandsEnd(Node);
+ bool HasPhysRegOuts = (NumResults > II.getNumDefs()) &&
+ II.getImplicitDefs() != 0;
+#ifndef NDEBUG
+ unsigned NumMIOperands = NodeOperands + NumResults;
+ assert((II.getNumOperands() == NumMIOperands ||
+ HasPhysRegOuts || II.isVariadic()) &&
+ "#operands for dag node doesn't match .td file!");
+#endif
+
+ // Create the new machine instruction.
+ MachineInstr *MI = BuildMI(MF, Node->getDebugLoc(), II);
+
+ // Add result register values for things that are defined by this
+ // instruction.
+ if (NumResults)
+ CreateVirtualRegisters(Node, MI, II, IsClone, IsCloned, VRBaseMap);
+
+ // Emit all of the actual operands of this instruction, adding them to the
+ // instruction as appropriate.
+ for (unsigned i = 0; i != NodeOperands; ++i)
+ AddOperand(MI, Node->getOperand(i), i+II.getNumDefs(), &II, VRBaseMap);
+
+ // Emit all of the memory operands of this instruction
+ for (unsigned i = NodeOperands; i != MemOperandsEnd; ++i)
+ AddMemOperand(MI, cast<MemOperandSDNode>(Node->getOperand(i))->MO);
+
+ if (II.usesCustomDAGSchedInsertionHook()) {
+      // Insert this instruction into the basic block using a target-specific
+      // inserter, which may return a new basic block.
+ BB = TLI->EmitInstrWithCustomInserter(MI, BB);
+ InsertPos = BB->end();
+ } else {
+ BB->insert(InsertPos, MI);
+ }
+
+    // Additional results must be physical register defs.
+ if (HasPhysRegOuts) {
+ for (unsigned i = II.getNumDefs(); i < NumResults; ++i) {
+ unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()];
+ if (Node->hasAnyUseOfValue(i))
+ EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap);
+ }
+ }
+ return;
+ }
+
+ switch (Node->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ Node->dump(DAG);
+#endif
+ assert(0 && "This target-independent node should have been selected!");
+ break;
+ case ISD::EntryToken:
+ assert(0 && "EntryToken should have been excluded from the schedule!");
+ break;
+ case ISD::TokenFactor: // fall thru
+ break;
+ case ISD::CopyToReg: {
+ unsigned SrcReg;
+ SDValue SrcVal = Node->getOperand(2);
+ if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(SrcVal))
+ SrcReg = R->getReg();
+ else
+ SrcReg = getVR(SrcVal, VRBaseMap);
+
+ unsigned DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ if (SrcReg == DestReg) // Coalesced away the copy? Ignore.
+ break;
+
+ const TargetRegisterClass *SrcTRC = 0, *DstTRC = 0;
+ // Get the register classes of the src/dst.
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ SrcTRC = MRI.getRegClass(SrcReg);
+ else
+ SrcTRC = TRI->getPhysicalRegisterRegClass(SrcReg,SrcVal.getValueType());
+
+ if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ DstTRC = MRI.getRegClass(DestReg);
+ else
+ DstTRC = TRI->getPhysicalRegisterRegClass(DestReg,
+ Node->getOperand(1).getValueType());
+
+ bool Emitted = TII->copyRegToReg(*BB, InsertPos, DestReg, SrcReg,
+ DstTRC, SrcTRC);
+ assert(Emitted && "Unable to issue a copy instruction!\n");
+ (void) Emitted;
+ break;
+ }
+ case ISD::CopyFromReg: {
+ unsigned SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ EmitCopyFromReg(Node, 0, IsClone, IsCloned, SrcReg, VRBaseMap);
+ break;
+ }
+ case ISD::INLINEASM: {
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Flag)
+ --NumOps; // Ignore the flag operand.
+
+ // Create the inline asm machine instruction.
+ MachineInstr *MI = BuildMI(MF, Node->getDebugLoc(),
+ TII->get(TargetInstrInfo::INLINEASM));
+
+ // Add the asm string as an external symbol operand.
+ const char *AsmStr =
+ cast<ExternalSymbolSDNode>(Node->getOperand(1))->getSymbol();
+ MI->addOperand(MachineOperand::CreateES(AsmStr));
+
+ // Add all of the operand registers to the instruction.
+ for (unsigned i = 2; i != NumOps;) {
+ unsigned Flags =
+ cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+ MI->addOperand(MachineOperand::CreateImm(Flags));
+ ++i; // Skip the ID value.
+
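+      // Flag word encoding (as consumed by the cases below): the low 3 bits
+      // hold the operand kind (1 = register use, 2 = register def,
+      // 3 = immediate, 4 = addressing mode, 6 = earlyclobber def) and the
+      // remaining bits hold the number of registers in the group.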
+ switch (Flags & 7) {
+ default: assert(0 && "Bad flags!");
+ case 2: // Def of register.
+ for (; NumVals; --NumVals, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ MI->addOperand(MachineOperand::CreateReg(Reg, true));
+ }
+ break;
+ case 6: // Def of earlyclobber register.
+ for (; NumVals; --NumVals, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ MI->addOperand(MachineOperand::CreateReg(Reg, true, false, false,
+ false, 0, true));
+ }
+ break;
+ case 1: // Use of register.
+ case 3: // Immediate.
+ case 4: // Addressing mode.
+ // The addressing mode has been selected, just add all of the
+ // operands to the machine instruction.
+ for (; NumVals; --NumVals, ++i)
+ AddOperand(MI, Node->getOperand(i), 0, 0, VRBaseMap);
+ break;
+ }
+ }
+ BB->insert(InsertPos, MI);
+ break;
+ }
+ }
+}
+
+/// EmitSchedule - Emit the machine code in scheduled order.
+MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() {
+ DenseMap<SDValue, unsigned> VRBaseMap;
+ DenseMap<SUnit*, unsigned> CopyVRBaseMap;
+ for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+ SUnit *SU = Sequence[i];
+ if (!SU) {
+ // Null SUnit* is a noop.
+ EmitNoop();
+ continue;
+ }
+
+ // For pre-regalloc scheduling, create instructions corresponding to the
+ // SDNode and any flagged SDNodes and append them to the block.
+ if (!SU->getNode()) {
+ // Emit a copy.
+ EmitPhysRegCopy(SU, CopyVRBaseMap);
+ continue;
+ }
+
+ SmallVector<SDNode *, 4> FlaggedNodes;
+ for (SDNode *N = SU->getNode()->getFlaggedNode(); N;
+ N = N->getFlaggedNode())
+ FlaggedNodes.push_back(N);
+ while (!FlaggedNodes.empty()) {
+ EmitNode(FlaggedNodes.back(), SU->OrigNode != SU, SU->isCloned,VRBaseMap);
+ FlaggedNodes.pop_back();
+ }
+ EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned, VRBaseMap);
+ }
+
+ return BB;
+}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
new file mode 100644
index 0000000..195896e
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -0,0 +1,5743 @@
+//===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Constants.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+/// makeVTList - Return an instance of the SDVTList struct initialized with the
+/// specified members.
+static SDVTList makeVTList(const MVT *VTs, unsigned NumVTs) {
+ SDVTList Res = {VTs, NumVTs};
+ return Res;
+}
+
+static const fltSemantics *MVTToAPFloatSemantics(MVT VT) {
+ switch (VT.getSimpleVT()) {
+ default: assert(0 && "Unknown FP format");
+ case MVT::f32: return &APFloat::IEEEsingle;
+ case MVT::f64: return &APFloat::IEEEdouble;
+ case MVT::f80: return &APFloat::x87DoubleExtended;
+ case MVT::f128: return &APFloat::IEEEquad;
+ case MVT::ppcf128: return &APFloat::PPCDoubleDouble;
+ }
+}
+
+SelectionDAG::DAGUpdateListener::~DAGUpdateListener() {}
+
+//===----------------------------------------------------------------------===//
+// ConstantFPSDNode Class
+//===----------------------------------------------------------------------===//
+
+/// isExactlyValue - We don't rely on operator== working on double values, as
+/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
+/// As such, this method can be used to do an exact bit-for-bit comparison of
+/// two floating point values.
+bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
+ return getValueAPF().bitwiseIsEqual(V);
+}
+
+bool ConstantFPSDNode::isValueValidForType(MVT VT,
+ const APFloat& Val) {
+ assert(VT.isFloatingPoint() && "Can only convert between FP types");
+
+ // PPC long double cannot be converted to any other type.
+ if (VT == MVT::ppcf128 ||
+ &Val.getSemantics() == &APFloat::PPCDoubleDouble)
+ return false;
+
+ // convert modifies in place, so make a copy.
+ APFloat Val2 = APFloat(Val);
+ bool losesInfo;
+ (void) Val2.convert(*MVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven,
+ &losesInfo);
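+  // e.g. an f64 0.5 converts to f32 exactly (losesInfo stays false), while
+  // an f64 0.1 is inexact in f32 and reports lost information.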
+ return !losesInfo;
+}
+
+//===----------------------------------------------------------------------===//
+// ISD Namespace
+//===----------------------------------------------------------------------===//
+
+/// isBuildVectorAllOnes - Return true if the specified node is a
+/// BUILD_VECTOR where all of the elements are ~0 or undef.
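+/// For example, (BUILD_VECTOR -1, undef, -1, -1) qualifies: undef elements
+/// are skipped and every other element must match the ~0 constant.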
+bool ISD::isBuildVectorAllOnes(const SDNode *N) {
+ // Look through a bit convert.
+ if (N->getOpcode() == ISD::BIT_CONVERT)
+ N = N->getOperand(0).getNode();
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+ unsigned i = 0, e = N->getNumOperands();
+
+ // Skip over all of the undef values.
+ while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
+ ++i;
+
+ // Do not accept an all-undef vector.
+ if (i == e) return false;
+
+ // Do not accept build_vectors that aren't all constants or which have non-~0
+ // elements.
+ SDValue NotZero = N->getOperand(i);
+ if (isa<ConstantSDNode>(NotZero)) {
+ if (!cast<ConstantSDNode>(NotZero)->isAllOnesValue())
+ return false;
+ } else if (isa<ConstantFPSDNode>(NotZero)) {
+ if (!cast<ConstantFPSDNode>(NotZero)->getValueAPF().
+ bitcastToAPInt().isAllOnesValue())
+ return false;
+ } else
+ return false;
+
+ // Okay, we have at least one ~0 value, check to see if the rest match or are
+ // undefs.
+ for (++i; i != e; ++i)
+ if (N->getOperand(i) != NotZero &&
+ N->getOperand(i).getOpcode() != ISD::UNDEF)
+ return false;
+ return true;
+}
+
+
+/// isBuildVectorAllZeros - Return true if the specified node is a
+/// BUILD_VECTOR where all of the elements are 0 or undef.
+bool ISD::isBuildVectorAllZeros(const SDNode *N) {
+ // Look through a bit convert.
+ if (N->getOpcode() == ISD::BIT_CONVERT)
+ N = N->getOperand(0).getNode();
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+ unsigned i = 0, e = N->getNumOperands();
+
+ // Skip over all of the undef values.
+ while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
+ ++i;
+
+ // Do not accept an all-undef vector.
+ if (i == e) return false;
+
+  // Do not accept build_vectors that aren't all constants or which have non-0
+  // elements.
+ SDValue Zero = N->getOperand(i);
+ if (isa<ConstantSDNode>(Zero)) {
+ if (!cast<ConstantSDNode>(Zero)->isNullValue())
+ return false;
+ } else if (isa<ConstantFPSDNode>(Zero)) {
+ if (!cast<ConstantFPSDNode>(Zero)->getValueAPF().isPosZero())
+ return false;
+ } else
+ return false;
+
+  // Okay, we have at least one 0 value, check to see if the rest match or are
+  // undefs.
+ for (++i; i != e; ++i)
+ if (N->getOperand(i) != Zero &&
+ N->getOperand(i).getOpcode() != ISD::UNDEF)
+ return false;
+ return true;
+}
+
+/// isScalarToVector - Return true if the specified node is a
+/// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low
+/// element is not an undef.
+bool ISD::isScalarToVector(const SDNode *N) {
+ if (N->getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return true;
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+ if (N->getOperand(0).getOpcode() == ISD::UNDEF)
+ return false;
+ unsigned NumElems = N->getNumOperands();
+ for (unsigned i = 1; i < NumElems; ++i) {
+ SDValue V = N->getOperand(i);
+ if (V.getOpcode() != ISD::UNDEF)
+ return false;
+ }
+ return true;
+}
+
+
+/// isDebugLabel - Return true if the specified node represents a debug
+/// label (i.e. ISD::DBG_LABEL or TargetInstrInfo::DBG_LABEL node).
+bool ISD::isDebugLabel(const SDNode *N) {
+ if (N->getOpcode() == ISD::DBG_LABEL)
+ return true;
+ if (N->isMachineOpcode() &&
+ N->getMachineOpcode() == TargetInstrInfo::DBG_LABEL)
+ return true;
+ return false;
+}
+
+/// getSetCCSwappedOperands - Return the operation corresponding to (Y op X)
+/// when given the operation for (X op Y).
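+/// For example, getSetCCSwappedOperands(ISD::SETOLT) returns ISD::SETOGT,
+/// since (X < Y) holds exactly when (Y > X) does.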
+ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
+ // To perform this operation, we just need to swap the L and G bits of the
+ // operation.
+ unsigned OldL = (Operation >> 2) & 1;
+ unsigned OldG = (Operation >> 1) & 1;
+ return ISD::CondCode((Operation & ~6) | // Keep the N, U, E bits
+ (OldL << 1) | // New G bit
+ (OldG << 2)); // New L bit.
+}
+
+/// getSetCCInverse - Return the operation corresponding to !(X op Y), where
+/// 'op' is a valid SetCC operation.
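+/// For example, the integer inverse of ISD::SETEQ is ISD::SETNE, and the
+/// floating point inverse of ISD::SETOLT is ISD::SETUGE (the unordered
+/// complement of an ordered comparison).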
+ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
+ unsigned Operation = Op;
+ if (isInteger)
+ Operation ^= 7; // Flip L, G, E bits, but not U.
+ else
+ Operation ^= 15; // Flip all of the condition bits.
+
+ if (Operation > ISD::SETTRUE2)
+ Operation &= ~8; // Don't let N and U bits get set.
+
+ return ISD::CondCode(Operation);
+}
+
+
+/// isSignedOp - For an integer comparison, return 1 if the comparison is a
+/// signed operation, 2 if it is an unsigned comparison, and 0 if the
+/// operation does not depend on the sign of the input (setne and seteq).
+static int isSignedOp(ISD::CondCode Opcode) {
+ switch (Opcode) {
+ default: assert(0 && "Illegal integer setcc operation!");
+ case ISD::SETEQ:
+ case ISD::SETNE: return 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETGT:
+ case ISD::SETGE: return 1;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ case ISD::SETUGE: return 2;
+ }
+}
+
+/// getSetCCOrOperation - Return the result of a logical OR between different
+/// comparisons of identical values: ((X op1 Y) | (X op2 Y)). This function
+/// returns SETCC_INVALID if it is not possible to represent the resultant
+/// comparison.
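+/// For example, getSetCCOrOperation(ISD::SETGT, ISD::SETEQ, true) ORs the
+/// G and E bits to yield ISD::SETGE, since (X > Y) | (X == Y) == (X >= Y).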
+ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+ bool isInteger) {
+ if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ // Cannot fold a signed integer setcc with an unsigned integer setcc.
+ return ISD::SETCC_INVALID;
+
+ unsigned Op = Op1 | Op2; // Combine all of the condition bits.
+
+ // If the N and U bits get set then the resultant comparison DOES suddenly
+ // care about orderedness, and is true when ordered.
+ if (Op > ISD::SETTRUE2)
+    Op &= ~16;     // Clear the N bit.
+
+ // Canonicalize illegal integer setcc's.
+ if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
+ Op = ISD::SETNE;
+
+ return ISD::CondCode(Op);
+}
+
+/// getSetCCAndOperation - Return the result of a logical AND between
+/// different comparisons of identical values: ((X op1 Y) & (X op2 Y)). This
+/// function returns SETCC_INVALID if it is not possible to represent the
+/// resultant comparison.
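+/// For example, getSetCCAndOperation(ISD::SETGE, ISD::SETNE, true) ANDs the
+/// condition bits to yield ISD::SETGT, since (X >= Y) & (X != Y) == (X > Y).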
+ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+ bool isInteger) {
+ if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ // Cannot fold a signed setcc with an unsigned setcc.
+ return ISD::SETCC_INVALID;
+
+ // Combine all of the condition bits.
+ ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
+
+ // Canonicalize illegal integer setcc's.
+ if (isInteger) {
+ switch (Result) {
+ default: break;
+ case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT
+ case ISD::SETOEQ: // SETEQ & SETU[LG]E
+ case ISD::SETUEQ: Result = ISD::SETEQ ; break; // SETUGE & SETULE
+ case ISD::SETOLT: Result = ISD::SETULT ; break; // SETULT & SETNE
+ case ISD::SETOGT: Result = ISD::SETUGT ; break; // SETUGT & SETNE
+ }
+ }
+
+ return Result;
+}
+
+const TargetMachine &SelectionDAG::getTarget() const {
+ return MF->getTarget();
+}
+
+//===----------------------------------------------------------------------===//
+// SDNode Profile Support
+//===----------------------------------------------------------------------===//
+
+/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
+///
+static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
+ ID.AddInteger(OpC);
+}
+
+/// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
+/// solely with their pointer.
+static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
+ ID.AddPointer(VTList.VTs);
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+///
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+ const SDValue *Ops, unsigned NumOps) {
+ for (; NumOps; --NumOps, ++Ops) {
+ ID.AddPointer(Ops->getNode());
+ ID.AddInteger(Ops->getResNo());
+ }
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+///
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+ const SDUse *Ops, unsigned NumOps) {
+ for (; NumOps; --NumOps, ++Ops) {
+ ID.AddPointer(Ops->getNode());
+ ID.AddInteger(Ops->getResNo());
+ }
+}
+
+static void AddNodeIDNode(FoldingSetNodeID &ID,
+ unsigned short OpC, SDVTList VTList,
+ const SDValue *OpList, unsigned N) {
+ AddNodeIDOpcode(ID, OpC);
+ AddNodeIDValueTypes(ID, VTList);
+ AddNodeIDOperands(ID, OpList, N);
+}
+
+/// AddNodeIDCustom - If this is an SDNode with special info, add this info to
+/// the NodeID data.
+static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
+ switch (N->getOpcode()) {
+ default: break; // Normal nodes don't need extra info.
+ case ISD::ARG_FLAGS:
+ ID.AddInteger(cast<ARG_FLAGSSDNode>(N)->getArgFlags().getRawBits());
+ break;
+ case ISD::TargetConstant:
+ case ISD::Constant:
+ ID.AddPointer(cast<ConstantSDNode>(N)->getConstantIntValue());
+ break;
+ case ISD::TargetConstantFP:
+ case ISD::ConstantFP: {
+ ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
+ break;
+ }
+ case ISD::TargetGlobalAddress:
+ case ISD::GlobalAddress:
+ case ISD::TargetGlobalTLSAddress:
+ case ISD::GlobalTLSAddress: {
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+ ID.AddPointer(GA->getGlobal());
+ ID.AddInteger(GA->getOffset());
+ break;
+ }
+ case ISD::BasicBlock:
+ ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
+ break;
+ case ISD::Register:
+ ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
+ break;
+ case ISD::DBG_STOPPOINT: {
+ const DbgStopPointSDNode *DSP = cast<DbgStopPointSDNode>(N);
+ ID.AddInteger(DSP->getLine());
+ ID.AddInteger(DSP->getColumn());
+ ID.AddPointer(DSP->getCompileUnit());
+ break;
+ }
+ case ISD::SRCVALUE:
+ ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
+ break;
+ case ISD::MEMOPERAND: {
+ const MachineMemOperand &MO = cast<MemOperandSDNode>(N)->MO;
+ MO.Profile(ID);
+ break;
+ }
+ case ISD::FrameIndex:
+ case ISD::TargetFrameIndex:
+ ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
+ break;
+ case ISD::JumpTable:
+ case ISD::TargetJumpTable:
+ ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
+ break;
+ case ISD::ConstantPool:
+ case ISD::TargetConstantPool: {
+ const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+ ID.AddInteger(CP->getAlignment());
+ ID.AddInteger(CP->getOffset());
+ if (CP->isMachineConstantPoolEntry())
+ CP->getMachineCPVal()->AddSelectionDAGCSEId(ID);
+ else
+ ID.AddPointer(CP->getConstVal());
+ break;
+ }
+ case ISD::CALL: {
+ const CallSDNode *Call = cast<CallSDNode>(N);
+ ID.AddInteger(Call->getCallingConv());
+ ID.AddInteger(Call->isVarArg());
+ break;
+ }
+ case ISD::LOAD: {
+ const LoadSDNode *LD = cast<LoadSDNode>(N);
+ ID.AddInteger(LD->getMemoryVT().getRawBits());
+ ID.AddInteger(LD->getRawSubclassData());
+ break;
+ }
+ case ISD::STORE: {
+ const StoreSDNode *ST = cast<StoreSDNode>(N);
+ ID.AddInteger(ST->getMemoryVT().getRawBits());
+ ID.AddInteger(ST->getRawSubclassData());
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX: {
+ const AtomicSDNode *AT = cast<AtomicSDNode>(N);
+ ID.AddInteger(AT->getMemoryVT().getRawBits());
+ ID.AddInteger(AT->getRawSubclassData());
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+ for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
+ i != e; ++i)
+ ID.AddInteger(SVN->getMaskElt(i));
+ break;
+ }
+ } // end switch (N->getOpcode())
+}
+
+/// AddNodeIDNode - Generic routine for adding a node's info to the NodeID
+/// data.
+static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
+ AddNodeIDOpcode(ID, N->getOpcode());
+ // Add the return value info.
+ AddNodeIDValueTypes(ID, N->getVTList());
+ // Add the operand info.
+ AddNodeIDOperands(ID, N->op_begin(), N->getNumOperands());
+
+  // Handle SDNode leaves with special info.
+ AddNodeIDCustom(ID, N);
+}
+
+/// encodeMemSDNodeFlags - Generic routine for computing a value for use in
+/// the CSE map that carries alignment, volatility, indexing mode, and
+/// extension/truncation information.
+///
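+/// For example, an unindexed (AM == 0), non-extending (ConvType == 0),
+/// non-volatile access with 4-byte alignment encodes as
+/// (Log2_32(4) + 1) << 6 == 0xC0; making it volatile also sets bit 5.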
+static inline unsigned
+encodeMemSDNodeFlags(int ConvType, ISD::MemIndexedMode AM,
+ bool isVolatile, unsigned Alignment) {
+ assert((ConvType & 3) == ConvType &&
+ "ConvType may not require more than 2 bits!");
+ assert((AM & 7) == AM &&
+ "AM may not require more than 3 bits!");
+ return ConvType |
+ (AM << 2) |
+ (isVolatile << 5) |
+ ((Log2_32(Alignment) + 1) << 6);
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAG Class
+//===----------------------------------------------------------------------===//
+
+/// doNotCSE - Return true if CSE should not be performed for this node.
+static bool doNotCSE(SDNode *N) {
+ if (N->getValueType(0) == MVT::Flag)
+ return true; // Never CSE anything that produces a flag.
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::HANDLENODE:
+ case ISD::DBG_LABEL:
+ case ISD::DBG_STOPPOINT:
+ case ISD::EH_LABEL:
+ case ISD::DECLARE:
+ return true; // Never CSE these nodes.
+ }
+
+ // Check that remaining values produced are not flags.
+ for (unsigned i = 1, e = N->getNumValues(); i != e; ++i)
+ if (N->getValueType(i) == MVT::Flag)
+ return true; // Never CSE anything that produces a flag.
+
+ return false;
+}
+
+/// RemoveDeadNodes - This method deletes all unreachable nodes in the
+/// SelectionDAG.
+void SelectionDAG::RemoveDeadNodes() {
+  // Create a dummy node (which is not added to allnodes) that adds a
+  // reference to the root node, preventing it from being deleted.
+ HandleSDNode Dummy(getRoot());
+
+ SmallVector<SDNode*, 128> DeadNodes;
+
+ // Add all obviously-dead nodes to the DeadNodes worklist.
+ for (allnodes_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ++I)
+ if (I->use_empty())
+ DeadNodes.push_back(I);
+
+ RemoveDeadNodes(DeadNodes);
+
+  // If the root changed (e.g. it was a dead load), update the root.
+ setRoot(Dummy.getValue());
+}
+
+/// RemoveDeadNodes - This method deletes the unreachable nodes in the
+/// given list, and any nodes that become unreachable as a result.
+void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes,
+ DAGUpdateListener *UpdateListener) {
+
+ // Process the worklist, deleting the nodes and adding their uses to the
+ // worklist.
+ while (!DeadNodes.empty()) {
+ SDNode *N = DeadNodes.pop_back_val();
+
+ if (UpdateListener)
+ UpdateListener->NodeDeleted(N, 0);
+
+ // Take the node out of the appropriate CSE map.
+ RemoveNodeFromCSEMaps(N);
+
+ // Next, brutally remove the operand list. This is safe to do, as there are
+ // no cycles in the graph.
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ SDNode *Operand = Use.getNode();
+ Use.set(SDValue());
+
+ // Now that we removed this operand, see if there are no uses of it left.
+ if (Operand->use_empty())
+ DeadNodes.push_back(Operand);
+ }
+
+ DeallocateNode(N);
+ }
+}
+
+void SelectionDAG::RemoveDeadNode(SDNode *N, DAGUpdateListener *UpdateListener){
+ SmallVector<SDNode*, 16> DeadNodes(1, N);
+ RemoveDeadNodes(DeadNodes, UpdateListener);
+}
+
+void SelectionDAG::DeleteNode(SDNode *N) {
+ // First take this out of the appropriate CSE map.
+ RemoveNodeFromCSEMaps(N);
+
+ // Finally, remove uses due to operands of this node, remove from the
+ // AllNodes list, and delete the node.
+ DeleteNodeNotInCSEMaps(N);
+}
+
+void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
+ assert(N != AllNodes.begin() && "Cannot delete the entry node!");
+ assert(N->use_empty() && "Cannot delete a node that is not dead!");
+
+ // Drop all of the operands and decrement used node's use counts.
+ N->DropOperands();
+
+ DeallocateNode(N);
+}
+
+void SelectionDAG::DeallocateNode(SDNode *N) {
+ if (N->OperandsNeedDelete)
+ delete[] N->OperandList;
+
+ // Set the opcode to DELETED_NODE to help catch bugs when node
+ // memory is reallocated.
+ N->NodeType = ISD::DELETED_NODE;
+
+ NodeAllocator.Deallocate(AllNodes.remove(N));
+}
+
+/// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
+/// corresponds to it. This is useful when we're about to delete or repurpose
+/// the node. We don't want future requests for structurally identical nodes
+/// to return N anymore.
+bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
+ bool Erased = false;
+ switch (N->getOpcode()) {
+ case ISD::EntryToken:
+ assert(0 && "EntryToken should not be in CSEMaps!");
+ return false;
+ case ISD::HANDLENODE: return false; // noop.
+ case ISD::CONDCODE:
+ assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
+ "Cond code doesn't exist!");
+ Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != 0;
+ CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = 0;
+ break;
+ case ISD::ExternalSymbol:
+ Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
+ break;
+ case ISD::TargetExternalSymbol:
+ Erased =
+ TargetExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
+ break;
+ case ISD::VALUETYPE: {
+ MVT VT = cast<VTSDNode>(N)->getVT();
+ if (VT.isExtended()) {
+ Erased = ExtendedValueTypeNodes.erase(VT);
+ } else {
+ Erased = ValueTypeNodes[VT.getSimpleVT()] != 0;
+ ValueTypeNodes[VT.getSimpleVT()] = 0;
+ }
+ break;
+ }
+ default:
+ // Remove it from the CSE Map.
+ Erased = CSEMap.RemoveNode(N);
+ break;
+ }
+#ifndef NDEBUG
+ // Verify that the node was actually in one of the CSE maps, unless it has a
+ // flag result (which cannot be CSE'd) or is one of the special cases that are
+ // not subject to CSE.
+ if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Flag &&
+ !N->isMachineOpcode() && !doNotCSE(N)) {
+ N->dump(this);
+ cerr << "\n";
+ assert(0 && "Node is not in map!");
+ }
+#endif
+ return Erased;
+}
+
+/// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
+/// maps and modified in place. Add it back to the CSE maps, unless an identical
+/// node already exists, in which case transfer all its users to the existing
+/// node. This transfer can potentially trigger recursive merging.
+///
+void
+SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N,
+ DAGUpdateListener *UpdateListener) {
+ // For node types that aren't CSE'd, just act as if no identical node
+ // already exists.
+ if (!doNotCSE(N)) {
+ SDNode *Existing = CSEMap.GetOrInsertNode(N);
+ if (Existing != N) {
+ // If there was already an existing matching node, use ReplaceAllUsesWith
+ // to replace the dead one with the existing one. This can cause
+ // recursive merging of other unrelated nodes down the line.
+ ReplaceAllUsesWith(N, Existing, UpdateListener);
+
+ // N is now dead. Inform the listener if it exists and delete it.
+ if (UpdateListener)
+ UpdateListener->NodeDeleted(N, Existing);
+ DeleteNodeNotInCSEMaps(N);
+ return;
+ }
+ }
+
+ // If the node doesn't already exist, we updated it. Inform a listener if
+ // it exists.
+ if (UpdateListener)
+ UpdateListener->NodeUpdated(N);
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return 0;
+
+ SDValue Ops[] = { Op };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 1);
+ AddNodeIDCustom(ID, N);
+ return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+ SDValue Op1, SDValue Op2,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return 0;
+
+ SDValue Ops[] = { Op1, Op2 };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 2);
+ AddNodeIDCustom(ID, N);
+ return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+}
+
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+ const SDValue *Ops,unsigned NumOps,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return 0;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, NumOps);
+ AddNodeIDCustom(ID, N);
+ return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+}
+
+/// VerifyNode - Sanity check the given node. Aborts if it is invalid.
+void SelectionDAG::VerifyNode(SDNode *N) {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::BUILD_PAIR: {
+ MVT VT = N->getValueType(0);
+ assert(N->getNumValues() == 1 && "Too many results!");
+ assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
+ "Wrong return type!");
+ assert(N->getNumOperands() == 2 && "Wrong number of operands!");
+ assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+ "Mismatched operand types!");
+ assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
+ "Wrong operand type!");
+ assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
+ "Wrong return type size");
+ break;
+ }
+ case ISD::BUILD_VECTOR: {
+ assert(N->getNumValues() == 1 && "Too many results!");
+ assert(N->getValueType(0).isVector() && "Wrong return type!");
+ assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
+ "Wrong number of operands!");
+ MVT EltVT = N->getValueType(0).getVectorElementType();
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I)
+ assert((I->getValueType() == EltVT ||
+ (EltVT.isInteger() && I->getValueType().isInteger() &&
+ EltVT.bitsLE(I->getValueType()))) &&
+ "Wrong operand type!");
+ break;
+ }
+ }
+}
+
+/// getMVTAlignment - Compute the default alignment value for the
+/// given type.
+///
+unsigned SelectionDAG::getMVTAlignment(MVT VT) const {
+ const Type *Ty = VT == MVT::iPTR ?
+ PointerType::get(Type::Int8Ty, 0) :
+ VT.getTypeForMVT();
+
+ return TLI.getTargetData()->getABITypeAlignment(Ty);
+}
+
+// EntryNode could meaningfully have debug info if we can find it...
+SelectionDAG::SelectionDAG(TargetLowering &tli, FunctionLoweringInfo &fli)
+ : TLI(tli), FLI(fli), DW(0),
+ EntryNode(ISD::EntryToken, DebugLoc::getUnknownLoc(),
+ getVTList(MVT::Other)), Root(getEntryNode()) {
+ AllNodes.push_back(&EntryNode);
+}
+
+void SelectionDAG::init(MachineFunction &mf, MachineModuleInfo *mmi,
+ DwarfWriter *dw) {
+ MF = &mf;
+ MMI = mmi;
+ DW = dw;
+}
+
+SelectionDAG::~SelectionDAG() {
+ allnodes_clear();
+}
+
+void SelectionDAG::allnodes_clear() {
+ assert(&*AllNodes.begin() == &EntryNode);
+ AllNodes.remove(AllNodes.begin());
+ while (!AllNodes.empty())
+ DeallocateNode(AllNodes.begin());
+}
+
+void SelectionDAG::clear() {
+ allnodes_clear();
+ OperandAllocator.Reset();
+ CSEMap.clear();
+
+ ExtendedValueTypeNodes.clear();
+ ExternalSymbols.clear();
+ TargetExternalSymbols.clear();
+ std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
+ static_cast<CondCodeSDNode*>(0));
+ std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
+ static_cast<SDNode*>(0));
+
+ EntryNode.UseList = 0;
+ AllNodes.push_back(&EntryNode);
+ Root = getEntryNode();
+}
+
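+/// getZeroExtendInReg - Return Op with all bits above VT's width masked off,
+/// e.g. zero extending the low 8 bits of an i32 value in place yields
+/// (AND x, 255).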
+SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, DebugLoc DL, MVT VT) {
+ if (Op.getValueType() == VT) return Op;
+ APInt Imm = APInt::getLowBitsSet(Op.getValueSizeInBits(),
+ VT.getSizeInBits());
+ return getNode(ISD::AND, DL, Op.getValueType(), Op,
+ getConstant(Imm, Op.getValueType()));
+}
+
+/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
+///
+SDValue SelectionDAG::getNOT(DebugLoc DL, SDValue Val, MVT VT) {
+ MVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+ SDValue NegOne =
+ getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
+ return getNode(ISD::XOR, DL, VT, Val, NegOne);
+}
+
+SDValue SelectionDAG::getConstant(uint64_t Val, MVT VT, bool isT) {
+ MVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+ assert((EltVT.getSizeInBits() >= 64 ||
+ (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
+ "getConstant with a uint64_t value that doesn't fit in the type!");
+ return getConstant(APInt(EltVT.getSizeInBits(), Val), VT, isT);
+}
+
+SDValue SelectionDAG::getConstant(const APInt &Val, MVT VT, bool isT) {
+ return getConstant(*ConstantInt::get(Val), VT, isT);
+}
+
+SDValue SelectionDAG::getConstant(const ConstantInt &Val, MVT VT, bool isT) {
+ assert(VT.isInteger() && "Cannot create FP integer constant!");
+
+ MVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+ assert(Val.getBitWidth() == EltVT.getSizeInBits() &&
+ "APInt size does not match type size!");
+
+ unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0);
+ ID.AddPointer(&Val);
+ void *IP = 0;
+ SDNode *N = NULL;
+ if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+ if (!VT.isVector())
+ return SDValue(N, 0);
+ if (!N) {
+ N = NodeAllocator.Allocate<ConstantSDNode>();
+ new (N) ConstantSDNode(isT, &Val, EltVT);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ }
+
+ SDValue Result(N, 0);
+ if (VT.isVector()) {
+ SmallVector<SDValue, 8> Ops;
+ Ops.assign(VT.getVectorNumElements(), Result);
+ Result = getNode(ISD::BUILD_VECTOR, DebugLoc::getUnknownLoc(),
+ VT, &Ops[0], Ops.size());
+ }
+ return Result;
+}
+
+SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, bool isTarget) {
+ return getConstant(Val, TLI.getPointerTy(), isTarget);
+}
+
+
+SDValue SelectionDAG::getConstantFP(const APFloat& V, MVT VT, bool isTarget) {
+ return getConstantFP(*ConstantFP::get(V), VT, isTarget);
+}
+
+SDValue SelectionDAG::getConstantFP(const ConstantFP& V, MVT VT, bool isTarget){
+ assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");
+
+ MVT EltVT =
+ VT.isVector() ? VT.getVectorElementType() : VT;
+
+  // Do the map lookup using the actual bit pattern for the floating point
+  // value, so that we don't have problems with 0.0 comparing equal to -0.0,
+  // and so that we avoid issues with SNaNs.
+ unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0);
+ ID.AddPointer(&V);
+ void *IP = 0;
+ SDNode *N = NULL;
+ if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+ if (!VT.isVector())
+ return SDValue(N, 0);
+ if (!N) {
+ N = NodeAllocator.Allocate<ConstantFPSDNode>();
+ new (N) ConstantFPSDNode(isTarget, &V, EltVT);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ }
+
+ SDValue Result(N, 0);
+ if (VT.isVector()) {
+ SmallVector<SDValue, 8> Ops;
+ Ops.assign(VT.getVectorNumElements(), Result);
+ // FIXME DebugLoc info might be appropriate here
+ Result = getNode(ISD::BUILD_VECTOR, DebugLoc::getUnknownLoc(),
+ VT, &Ops[0], Ops.size());
+ }
+ return Result;
+}
+
+SDValue SelectionDAG::getConstantFP(double Val, MVT VT, bool isTarget) {
+ MVT EltVT =
+ VT.isVector() ? VT.getVectorElementType() : VT;
+ if (EltVT==MVT::f32)
+ return getConstantFP(APFloat((float)Val), VT, isTarget);
+ else
+ return getConstantFP(APFloat(Val), VT, isTarget);
+}
+
+SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV,
+ MVT VT, int64_t Offset,
+ bool isTargetGA) {
+ unsigned Opc;
+
+ // Truncate (with sign-extension) the offset value to the pointer size.
+ unsigned BitWidth = TLI.getPointerTy().getSizeInBits();
+ if (BitWidth < 64)
+ Offset = (Offset << (64 - BitWidth) >> (64 - BitWidth));
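+  // e.g. with 32-bit pointers an Offset of 0xFFFFFFFF becomes -1: the shift
+  // pair keeps the low 32 bits and restores the sign.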
+
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar) {
+ // If GV is an alias then use the aliasee for determining thread-localness.
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+ }
+
+ if (GVar && GVar->isThreadLocal())
+ Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
+ else
+ Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+ ID.AddPointer(GV);
+ ID.AddInteger(Offset);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<GlobalAddressSDNode>();
+ new (N) GlobalAddressSDNode(isTargetGA, GV, VT, Offset);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getFrameIndex(int FI, MVT VT, bool isTarget) {
+ unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+ ID.AddInteger(FI);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<FrameIndexSDNode>();
+ new (N) FrameIndexSDNode(FI, VT, isTarget);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getJumpTable(int JTI, MVT VT, bool isTarget){
+ unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+ ID.AddInteger(JTI);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<JumpTableSDNode>();
+ new (N) JumpTableSDNode(JTI, VT, isTarget);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getConstantPool(Constant *C, MVT VT,
+ unsigned Alignment, int Offset,
+ bool isTarget) {
+ if (Alignment == 0)
+ Alignment = TLI.getTargetData()->getPrefTypeAlignment(C->getType());
+ unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+ ID.AddInteger(Alignment);
+ ID.AddInteger(Offset);
+ ID.AddPointer(C);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<ConstantPoolSDNode>();
+ new (N) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+
+SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, MVT VT,
+ unsigned Alignment, int Offset,
+ bool isTarget) {
+ if (Alignment == 0)
+ Alignment = TLI.getTargetData()->getPrefTypeAlignment(C->getType());
+ unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+ ID.AddInteger(Alignment);
+ ID.AddInteger(Offset);
+ C->AddSelectionDAGCSEId(ID);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<ConstantPoolSDNode>();
+ new (N) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), 0, 0);
+ ID.AddPointer(MBB);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<BasicBlockSDNode>();
+ new (N) BasicBlockSDNode(MBB);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getArgFlags(ISD::ArgFlagsTy Flags) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::ARG_FLAGS, getVTList(MVT::Other), 0, 0);
+ ID.AddInteger(Flags.getRawBits());
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<ARG_FLAGSSDNode>();
+ new (N) ARG_FLAGSSDNode(Flags);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getValueType(MVT VT) {
+ if (VT.isSimple() && (unsigned)VT.getSimpleVT() >= ValueTypeNodes.size())
+ ValueTypeNodes.resize(VT.getSimpleVT()+1);
+
+ SDNode *&N = VT.isExtended() ?
+ ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT()];
+
+ if (N) return SDValue(N, 0);
+ N = NodeAllocator.Allocate<VTSDNode>();
+ new (N) VTSDNode(VT);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getExternalSymbol(const char *Sym, MVT VT) {
+ SDNode *&N = ExternalSymbols[Sym];
+ if (N) return SDValue(N, 0);
+ N = NodeAllocator.Allocate<ExternalSymbolSDNode>();
+ new (N) ExternalSymbolSDNode(false, Sym, VT);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, MVT VT) {
+ SDNode *&N = TargetExternalSymbols[Sym];
+ if (N) return SDValue(N, 0);
+ N = NodeAllocator.Allocate<ExternalSymbolSDNode>();
+ new (N) ExternalSymbolSDNode(true, Sym, VT);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
+ if ((unsigned)Cond >= CondCodeNodes.size())
+ CondCodeNodes.resize(Cond+1);
+
+ if (CondCodeNodes[Cond] == 0) {
+ CondCodeSDNode *N = NodeAllocator.Allocate<CondCodeSDNode>();
+ new (N) CondCodeSDNode(Cond);
+ CondCodeNodes[Cond] = N;
+ AllNodes.push_back(N);
+ }
+ return SDValue(CondCodeNodes[Cond], 0);
+}
+
+// commuteShuffle - Swap the values of N1 and N2, and adjust the shuffle mask
+// M so that indices that pointed at N1 now point at N2, and indices that
+// pointed at N2 now point at N1.
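+// For example, with 4-element vectors, commuting shuffle(A, B, <0,5,2,7>)
+// yields shuffle(B, A, <4,1,6,3>).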
+static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl<int> &M) {
+ std::swap(N1, N2);
+ int NElts = M.size();
+ for (int i = 0; i != NElts; ++i) {
+ if (M[i] >= NElts)
+ M[i] -= NElts;
+ else if (M[i] >= 0)
+ M[i] += NElts;
+ }
+}
+
+SDValue SelectionDAG::getVectorShuffle(MVT VT, DebugLoc dl, SDValue N1,
+ SDValue N2, const int *Mask) {
+ assert(N1.getValueType() == N2.getValueType() && "Invalid VECTOR_SHUFFLE");
+  assert(VT.isVector() && N1.getValueType().isVector() &&
+         "Vector Shuffle VTs must be vectors");
+ assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType()
+ && "Vector Shuffle VTs must have same element type");
+
+ // Canonicalize shuffle undef, undef -> undef
+ if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF)
+ return N1;
+
+ // Validate that all indices in Mask are within the range of the elements
+ // input to the shuffle.
+ unsigned NElts = VT.getVectorNumElements();
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i != NElts; ++i) {
+ assert(Mask[i] < (int)(NElts * 2) && "Index out of range");
+ MaskVec.push_back(Mask[i]);
+ }
+
+ // Canonicalize shuffle v, v -> v, undef
+ if (N1 == N2) {
+ N2 = getUNDEF(VT);
+ for (unsigned i = 0; i != NElts; ++i)
+ if (MaskVec[i] >= (int)NElts) MaskVec[i] -= NElts;
+ }
+
+ // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
+ if (N1.getOpcode() == ISD::UNDEF)
+ commuteShuffle(N1, N2, MaskVec);
+
+  // Canonicalize: if all indices select the lhs, -> shuffle lhs, undef;
+  // if all indices select the rhs, -> shuffle rhs, undef.
+ bool AllLHS = true, AllRHS = true;
+ bool N2Undef = N2.getOpcode() == ISD::UNDEF;
+ for (unsigned i = 0; i != NElts; ++i) {
+ if (MaskVec[i] >= (int)NElts) {
+ if (N2Undef)
+ MaskVec[i] = -1;
+ else
+ AllLHS = false;
+ } else if (MaskVec[i] >= 0) {
+ AllRHS = false;
+ }
+ }
+ if (AllLHS && AllRHS)
+ return getUNDEF(VT);
+ if (AllLHS && !N2Undef)
+ N2 = getUNDEF(VT);
+ if (AllRHS) {
+ N1 = getUNDEF(VT);
+ commuteShuffle(N1, N2, MaskVec);
+ }
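+  // At this point a shuffle like shuffle(undef, V, <4,5,6,7>) has become
+  // shuffle(V, undef, <0,1,2,3>), which the identity check below folds
+  // to V itself.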
+
+  // If this is an identity shuffle, or if every element is undef, fold to
+  // the appropriate node.
+ bool AllUndef = true;
+ bool Identity = true;
+ for (unsigned i = 0; i != NElts; ++i) {
+ if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
+ if (MaskVec[i] >= 0) AllUndef = false;
+ }
+ if (Identity)
+ return N1;
+ if (AllUndef)
+ return getUNDEF(VT);
+
+ FoldingSetNodeID ID;
+ SDValue Ops[2] = { N1, N2 };
+ AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops, 2);
+ for (unsigned i = 0; i != NElts; ++i)
+ ID.AddInteger(MaskVec[i]);
+
+ void* IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ // Allocate the mask array for the node out of the BumpPtrAllocator, since
+ // SDNode doesn't have access to it. This memory will be "leaked" when
+  // the node is deallocated, but recovered when OperandAllocator is reset.
+ int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
+ memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int));
+
+ ShuffleVectorSDNode *N = NodeAllocator.Allocate<ShuffleVectorSDNode>();
+ new (N) ShuffleVectorSDNode(VT, dl, N1, N2, MaskAlloc);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getConvertRndSat(MVT VT, DebugLoc dl,
+ SDValue Val, SDValue DTy,
+ SDValue STy, SDValue Rnd, SDValue Sat,
+ ISD::CvtCode Code) {
+ // If the src and dest types are the same and the conversion is between
+ // integer types of the same sign or two floats, no conversion is necessary.
+ if (DTy == STy &&
+ (Code == ISD::CVT_UU || Code == ISD::CVT_SS || Code == ISD::CVT_FF))
+ return Val;
+
+  SDValue Ops[] = { Val, DTy, STy, Rnd, Sat };
+  FoldingSetNodeID ID;
+  // Seed the ID with the opcode, value types and operands; without this the
+  // CSE lookup below could never match an existing CONVERT_RNDSAT node.
+  AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), &Ops[0], 5);
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+  CvtRndSatSDNode *N = NodeAllocator.Allocate<CvtRndSatSDNode>();
+  new (N) CvtRndSatSDNode(VT, dl, Ops, 5, Code);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getRegister(unsigned RegNo, MVT VT) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::Register, getVTList(VT), 0, 0);
+ ID.AddInteger(RegNo);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<RegisterSDNode>();
+ new (N) RegisterSDNode(RegNo, VT);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getDbgStopPoint(DebugLoc DL, SDValue Root,
+ unsigned Line, unsigned Col,
+ Value *CU) {
+ SDNode *N = NodeAllocator.Allocate<DbgStopPointSDNode>();
+ new (N) DbgStopPointSDNode(Root, Line, Col, CU);
+ N->setDebugLoc(DL);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getLabel(unsigned Opcode, DebugLoc dl,
+ SDValue Root,
+ unsigned LabelID) {
+ FoldingSetNodeID ID;
+ SDValue Ops[] = { Root };
+ AddNodeIDNode(ID, Opcode, getVTList(MVT::Other), &Ops[0], 1);
+ ID.AddInteger(LabelID);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<LabelSDNode>();
+ new (N) LabelSDNode(Opcode, dl, Root, LabelID);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getSrcValue(const Value *V) {
+ assert((!V || isa<PointerType>(V->getType())) &&
+ "SrcValue is not a pointer?");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), 0, 0);
+ ID.AddPointer(V);
+
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ SDNode *N = NodeAllocator.Allocate<SrcValueSDNode>();
+ new (N) SrcValueSDNode(V);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMemOperand(const MachineMemOperand &MO) {
+#ifndef NDEBUG
+ const Value *v = MO.getValue();
+ assert((!v || isa<PointerType>(v->getType())) &&
+ "SrcValue is not a pointer?");
+#endif
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MEMOPERAND, getVTList(MVT::Other), 0, 0);
+ MO.Profile(ID);
+
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ SDNode *N = NodeAllocator.Allocate<MemOperandSDNode>();
+ new (N) MemOperandSDNode(MO);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+/// getShiftAmountOperand - Return the specified value cast to the target's
+/// desired shift amount type.
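+/// For example, if the target's shift amount type is i8, an i32 shift count
+/// is truncated to i8 here, while an i1 count would be zero extended.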
+SDValue SelectionDAG::getShiftAmountOperand(SDValue Op) {
+ MVT OpTy = Op.getValueType();
+ MVT ShTy = TLI.getShiftAmountTy();
+ if (OpTy == ShTy || OpTy.isVector()) return Op;
+
+ ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ? ISD::TRUNCATE : ISD::ZERO_EXTEND;
+ return getNode(Opcode, Op.getDebugLoc(), ShTy, Op);
+}
+
+/// CreateStackTemporary - Create a stack temporary, suitable for holding the
+/// specified value type.
+SDValue SelectionDAG::CreateStackTemporary(MVT VT, unsigned minAlign) {
+ MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
+ unsigned ByteSize = VT.getStoreSizeInBits()/8;
+ const Type *Ty = VT.getTypeForMVT();
+ unsigned StackAlign =
+ std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty), minAlign);
+
+ int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign);
+ return getFrameIndex(FrameIdx, TLI.getPointerTy());
+}
+
+/// CreateStackTemporary - Create a stack temporary suitable for holding
+/// either of the specified value types.
+SDValue SelectionDAG::CreateStackTemporary(MVT VT1, MVT VT2) {
+ unsigned Bytes = std::max(VT1.getStoreSizeInBits(),
+ VT2.getStoreSizeInBits())/8;
+ const Type *Ty1 = VT1.getTypeForMVT();
+ const Type *Ty2 = VT2.getTypeForMVT();
+ const TargetData *TD = TLI.getTargetData();
+ unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1),
+ TD->getPrefTypeAlignment(Ty2));
+
+ MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
+ int FrameIdx = FrameInfo->CreateStackObject(Bytes, Align);
+ return getFrameIndex(FrameIdx, TLI.getPointerTy());
+}
+
+SDValue SelectionDAG::FoldSetCC(MVT VT, SDValue N1,
+ SDValue N2, ISD::CondCode Cond, DebugLoc dl) {
+ // These setcc operations always fold.
+ switch (Cond) {
+ default: break;
+ case ISD::SETFALSE:
+ case ISD::SETFALSE2: return getConstant(0, VT);
+ case ISD::SETTRUE:
+ case ISD::SETTRUE2: return getConstant(1, VT);
+
+ case ISD::SETOEQ:
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETONE:
+ case ISD::SETO:
+ case ISD::SETUO:
+ case ISD::SETUEQ:
+ case ISD::SETUNE:
+ assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!");
+ break;
+ }
+
+ if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode())) {
+ const APInt &C2 = N2C->getAPIntValue();
+ if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+ const APInt &C1 = N1C->getAPIntValue();
+
+ switch (Cond) {
+ default: assert(0 && "Unknown integer setcc!");
+ case ISD::SETEQ: return getConstant(C1 == C2, VT);
+ case ISD::SETNE: return getConstant(C1 != C2, VT);
+ case ISD::SETULT: return getConstant(C1.ult(C2), VT);
+ case ISD::SETUGT: return getConstant(C1.ugt(C2), VT);
+ case ISD::SETULE: return getConstant(C1.ule(C2), VT);
+ case ISD::SETUGE: return getConstant(C1.uge(C2), VT);
+ case ISD::SETLT: return getConstant(C1.slt(C2), VT);
+ case ISD::SETGT: return getConstant(C1.sgt(C2), VT);
+ case ISD::SETLE: return getConstant(C1.sle(C2), VT);
+ case ISD::SETGE: return getConstant(C1.sge(C2), VT);
+ }
+ }
+ }
+ if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
+ if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2.getNode())) {
+ // No compile time operations on this type yet.
+ if (N1C->getValueType(0) == MVT::ppcf128)
+ return SDValue();
+
+ APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
+ switch (Cond) {
+ default: break;
+ case ISD::SETEQ: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, VT);
+ case ISD::SETNE: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpLessThan, VT);
+ case ISD::SETLT: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, VT);
+ case ISD::SETGT: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, VT);
+ case ISD::SETLE: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan ||
+ R==APFloat::cmpEqual, VT);
+ case ISD::SETGE: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpEqual, VT);
+ case ISD::SETO: return getConstant(R!=APFloat::cmpUnordered, VT);
+ case ISD::SETUO: return getConstant(R==APFloat::cmpUnordered, VT);
+ case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered ||
+ R==APFloat::cmpEqual, VT);
+ case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, VT);
+ case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered ||
+ R==APFloat::cmpLessThan, VT);
+ case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpUnordered, VT);
+ case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, VT);
+ case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, VT);
+ }
+ } else {
+ // Ensure that the constant occurs on the RHS.
+ return getSetCC(dl, VT, N2, N1, ISD::getSetCCSwappedOperands(Cond));
+ }
+ }
+
+ // Could not fold it.
+ return SDValue();
+}
+
+/// SignBitIsZero - Return true if the sign bit of Op is known to be zero. We
+/// use this predicate to simplify operations downstream.
+bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
+ unsigned BitWidth = Op.getValueSizeInBits();
+ return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth);
+}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
+/// this predicate to simplify operations downstream. Mask is known to be zero
+/// for bits that V cannot have.
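+/// For example, if Op is (AND x, 0xFF), then
+/// MaskedValueIsZero(Op, 0xFFFFFF00) returns true, since ComputeMaskedBits
+/// proves the top 24 bits of the result are zero.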
+bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
+ unsigned Depth) const {
+ APInt KnownZero, KnownOne;
+ ComputeMaskedBits(Op, Mask, KnownZero, KnownOne, Depth);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ return (KnownZero & Mask) == Mask;
+}
+
+/// ComputeMaskedBits - Determine which of the bits specified in Mask are
+/// known to be either zero or one and return them in the KnownZero/KnownOne
+/// bitsets. This code only analyzes bits in Mask, in order to short-circuit
+/// processing.
+void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
+ APInt &KnownZero, APInt &KnownOne,
+ unsigned Depth) const {
+ unsigned BitWidth = Mask.getBitWidth();
+ assert(BitWidth == Op.getValueType().getSizeInBits() &&
+ "Mask size mismatches value type size!");
+
+ KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
+ if (Depth == 6 || Mask == 0)
+ return; // Limit search depth.
+
+ APInt KnownZero2, KnownOne2;
+
+ switch (Op.getOpcode()) {
+ case ISD::Constant:
+ // We know all of the bits for a constant!
+ KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue() & Mask;
+ KnownZero = ~KnownOne & Mask;
+ return;
+ case ISD::AND:
+ // If either the LHS or the RHS are Zero, the result is zero.
+ ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(Op.getOperand(0), Mask & ~KnownZero,
+ KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ KnownOne &= KnownOne2;
+    // Output known-0 bits are known to be clear if they are clear in either
+    // the LHS or the RHS.
+ KnownZero |= KnownZero2;
+ return;
+ case ISD::OR:
+ ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(Op.getOperand(0), Mask & ~KnownOne,
+ KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ KnownZero &= KnownZero2;
+    // Output known-1 bits are known to be set if they are set in either
+    // the LHS or the RHS.
+ KnownOne |= KnownOne2;
+ return;
+ case ISD::XOR: {
+ ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+    // Output known-1 bits are those known set on one side and known clear
+    // on the other.
+ KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+ KnownZero = KnownZeroOut;
+ return;
+ }
+ case ISD::MUL: {
+ APInt Mask2 = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(Op.getOperand(1), Mask2, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If low bits are zero in either operand, output low known-0 bits.
+    // Also compute a conservative estimate for high known-0 bits.
+ // More trickiness is possible, but this is sufficient for the
+ // interesting case of alignment computation.
+ KnownOne.clear();
+ unsigned TrailZ = KnownZero.countTrailingOnes() +
+ KnownZero2.countTrailingOnes();
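+    // e.g. if each operand is known to be a multiple of 4 (two trailing
+    // zero bits), the product is a multiple of 16 (four trailing zero bits).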
+ unsigned LeadZ = std::max(KnownZero.countLeadingOnes() +
+ KnownZero2.countLeadingOnes(),
+ BitWidth) - BitWidth;
+
+ TrailZ = std::min(TrailZ, BitWidth);
+ LeadZ = std::min(LeadZ, BitWidth);
+ KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
+ APInt::getHighBitsSet(BitWidth, LeadZ);
+ KnownZero &= Mask;
+ return;
+ }
+ case ISD::UDIV: {
+ // For the purposes of computing leading zeros we can conservatively
+ // treat a udiv as a logical right shift by the power of 2 known to
+ // be less than the denominator.
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(Op.getOperand(0),
+ AllOnes, KnownZero2, KnownOne2, Depth+1);
+ unsigned LeadZ = KnownZero2.countLeadingOnes();
+
+ KnownOne2.clear();
+ KnownZero2.clear();
+ ComputeMaskedBits(Op.getOperand(1),
+ AllOnes, KnownZero2, KnownOne2, Depth+1);
+ unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
+ if (RHSUnknownLeadingOnes != BitWidth)
+ LeadZ = std::min(BitWidth,
+ LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+
+ KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ) & Mask;
+ return;
+ }
+ case ISD::SELECT:
+ ComputeMaskedBits(Op.getOperand(2), Mask, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ return;
+ case ISD::SELECT_CC:
+ ComputeMaskedBits(Op.getOperand(3), Mask, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(Op.getOperand(2), Mask, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ return;
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ if (Op.getResNo() != 1)
+ return;
+ // The boolean result conforms to getBooleanContents. Fall through.
+ case ISD::SETCC:
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ return;
+ case ISD::SHL:
+ // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned ShAmt = SA->getZExtValue();
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ return;
+
+ ComputeMaskedBits(Op.getOperand(0), Mask.lshr(ShAmt),
+ KnownZero, KnownOne, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero <<= ShAmt;
+ KnownOne <<= ShAmt;
+ // low bits known zero.
+ KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt);
+ }
+ return;
+ case ISD::SRL:
+ // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned ShAmt = SA->getZExtValue();
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ return;
+
+ ComputeMaskedBits(Op.getOperand(0), (Mask << ShAmt),
+ KnownZero, KnownOne, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+
+ APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt) & Mask;
+ KnownZero |= HighBits; // High bits known zero.
+ }
+ return;
+ case ISD::SRA:
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned ShAmt = SA->getZExtValue();
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ return;
+
+ APInt InDemandedMask = (Mask << ShAmt);
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt) & Mask;
+ if (HighBits.getBoolValue())
+ InDemandedMask |= APInt::getSignBit(BitWidth);
+
+ ComputeMaskedBits(Op.getOperand(0), InDemandedMask, KnownZero, KnownOne,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+
+ // Handle the sign bits.
+ APInt SignBit = APInt::getSignBit(BitWidth);
+ SignBit = SignBit.lshr(ShAmt); // Adjust to where it is now in the mask.
+
+ if (KnownZero.intersects(SignBit)) {
+ KnownZero |= HighBits; // New bits are known zero.
+ } else if (KnownOne.intersects(SignBit)) {
+ KnownOne |= HighBits; // New bits are known one.
+ }
+ }
+ return;
+ case ISD::SIGN_EXTEND_INREG: {
+ MVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ unsigned EBits = EVT.getSizeInBits();
+
+ // Sign extension. Compute the demanded bits in the result that are not
+ // present in the input.
+ APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits) & Mask;
+
+ APInt InSignBit = APInt::getSignBit(EBits);
+ APInt InputDemandedBits = Mask & APInt::getLowBitsSet(BitWidth, EBits);
+
+ // If the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ InSignBit.zext(BitWidth);
+ if (NewBits.getBoolValue())
+ InputDemandedBits |= InSignBit;
+
+ ComputeMaskedBits(Op.getOperand(0), InputDemandedBits,
+ KnownZero, KnownOne, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+ if (KnownZero.intersects(InSignBit)) { // Input sign bit known clear
+ KnownZero |= NewBits;
+ KnownOne &= ~NewBits;
+ } else if (KnownOne.intersects(InSignBit)) { // Input sign bit known set
+ KnownOne |= NewBits;
+ KnownZero &= ~NewBits;
+ } else { // Input sign bit unknown
+ KnownZero &= ~NewBits;
+ KnownOne &= ~NewBits;
+ }
+ return;
+ }
+ case ISD::CTTZ:
+ case ISD::CTLZ:
+ case ISD::CTPOP: {
+ unsigned LowBits = Log2_32(BitWidth)+1;
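+    // The result of cttz/ctlz/ctpop on an N-bit value is at most N, which
+    // fits in Log2_32(N)+1 bits, so all higher result bits are known zero.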
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+ KnownOne.clear();
+ return;
+ }
+ case ISD::LOAD: {
+ if (ISD::isZEXTLoad(Op.getNode())) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ MVT VT = LD->getMemoryVT();
+ unsigned MemBits = VT.getSizeInBits();
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits) & Mask;
+ }
+ return;
+ }
+ case ISD::ZERO_EXTEND: {
+ MVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getSizeInBits();
+ APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask;
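+    // Recurse at the narrower input width, then widen the results back out;
+    // the bits created by the zero extension are known zero (NewBits).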
+ APInt InMask = Mask;
+ InMask.trunc(InBits);
+ KnownZero.trunc(InBits);
+ KnownOne.trunc(InBits);
+ ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1);
+ KnownZero.zext(BitWidth);
+ KnownOne.zext(BitWidth);
+ KnownZero |= NewBits;
+ return;
+ }
+ case ISD::SIGN_EXTEND: {
+ MVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getSizeInBits();
+ APInt InSignBit = APInt::getSignBit(InBits);
+ APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask;
+ APInt InMask = Mask;
+ InMask.trunc(InBits);
+
+ // If any of the sign extended bits are demanded, we know that the sign
+ // bit is demanded. Temporarily set this bit in the mask for our callee.
+ if (NewBits.getBoolValue())
+ InMask |= InSignBit;
+
+ KnownZero.trunc(InBits);
+ KnownOne.trunc(InBits);
+ ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1);
+
+ // Note if the sign bit is known to be zero or one.
+ bool SignBitKnownZero = KnownZero.isNegative();
+ bool SignBitKnownOne = KnownOne.isNegative();
+ assert(!(SignBitKnownZero && SignBitKnownOne) &&
+ "Sign bit can't be known to be both zero and one!");
+
+ // If the sign bit wasn't actually demanded by our caller, we don't
+ // want it set in the KnownZero and KnownOne result values. Reset the
+ // mask and reapply it to the result values.
+ InMask = Mask;
+ InMask.trunc(InBits);
+ KnownZero &= InMask;
+ KnownOne &= InMask;
+
+ KnownZero.zext(BitWidth);
+ KnownOne.zext(BitWidth);
+
+ // If the sign bit is known zero or one, the top bits match.
+ if (SignBitKnownZero)
+ KnownZero |= NewBits;
+ else if (SignBitKnownOne)
+ KnownOne |= NewBits;
+ return;
+ }
+ case ISD::ANY_EXTEND: {
+ MVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getSizeInBits();
+ APInt InMask = Mask;
+ InMask.trunc(InBits);
+ KnownZero.trunc(InBits);
+ KnownOne.trunc(InBits);
+ ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1);
+ KnownZero.zext(BitWidth);
+ KnownOne.zext(BitWidth);
+ return;
+ }
+ case ISD::TRUNCATE: {
+ MVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getSizeInBits();
+ APInt InMask = Mask;
+ InMask.zext(InBits);
+ KnownZero.zext(InBits);
+ KnownOne.zext(InBits);
+ ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero.trunc(BitWidth);
+ KnownOne.trunc(BitWidth);
+ break;
+ }
+ case ISD::AssertZext: {
+ MVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
+ ComputeMaskedBits(Op.getOperand(0), Mask & InMask, KnownZero,
+ KnownOne, Depth+1);
+ KnownZero |= (~InMask) & Mask;
+ return;
+ }
+ case ISD::FGETSIGN:
+ // All bits are zero except the low bit.
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ return;
+
+ case ISD::SUB: {
+ if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0))) {
+      // We know that the top bits of C-X are clear if X contains fewer bits
+ // than C (i.e. no wrap-around can happen). For example, 20-X is
+ // positive if we can prove that X is >= 0 and < 16.
+ if (CLHS->getAPIntValue().isNonNegative()) {
+ unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
+        // NLZ can't be BitWidth: C is non-negative, so C+1 is nonzero.
+ APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
+ ComputeMaskedBits(Op.getOperand(1), MaskV, KnownZero2, KnownOne2,
+ Depth+1);
+
+ // If all of the MaskV bits are known to be zero, then we know the
+ // output top bits are zero, because we now know that the output is
+ // from [0-C].
+ if ((KnownZero2 & MaskV) == MaskV) {
+ unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
+ // Top bits known zero.
+ KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask;
+ }
+ }
+ }
+ }
+ // fall through
+ case ISD::ADD: {
+    // Low output bits are known zero out to the shorter of the two operands'
+    // runs of low known-zero bits.  For example, 8+(X<<3) is known to have
+    // the low 3 bits clear.
+ APInt Mask2 = APInt::getLowBitsSet(BitWidth, Mask.countTrailingOnes());
+ ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ unsigned KnownZeroOut = KnownZero2.countTrailingOnes();
+
+ ComputeMaskedBits(Op.getOperand(1), Mask2, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ KnownZeroOut = std::min(KnownZeroOut,
+ KnownZero2.countTrailingOnes());
+
+ KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroOut);
+ return;
+ }
+ case ISD::SREM:
+ if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ const APInt &RA = Rem->getAPIntValue();
+ if (RA.isPowerOf2() || (-RA).isPowerOf2()) {
+ APInt LowBits = RA.isStrictlyPositive() ? (RA - 1) : ~RA;
+ APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
+ ComputeMaskedBits(Op.getOperand(0), Mask2,KnownZero2,KnownOne2,Depth+1);
+
+ // If the sign bit of the first operand is zero, the sign bit of
+ // the result is zero. If the first operand has no one bits below
+ // the second operand's single 1 bit, its sign will be zero.
+ if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
+ KnownZero2 |= ~LowBits;
+
+ KnownZero |= KnownZero2 & Mask;
+
+        assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ }
+ }
+ return;
+ case ISD::UREM: {
+ if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ const APInt &RA = Rem->getAPIntValue();
+ if (RA.isPowerOf2()) {
+ APInt LowBits = (RA - 1);
+ APInt Mask2 = LowBits & Mask;
+ KnownZero |= ~LowBits & Mask;
+ ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero, KnownOne,Depth+1);
+        assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ break;
+ }
+ }
+
+ // Since the result is less than or equal to either operand, any leading
+ // zero bits in either operand must also exist in the result.
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(Op.getOperand(0), AllOnes, KnownZero, KnownOne,
+ Depth+1);
+ ComputeMaskedBits(Op.getOperand(1), AllOnes, KnownZero2, KnownOne2,
+ Depth+1);
+
+ uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),
+ KnownZero2.countLeadingOnes());
+ KnownOne.clear();
+ KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & Mask;
+ return;
+ }
+ default:
+ // Allow the target to implement this method for its nodes.
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
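+    // Note: the case labels below sit inside this 'if' so that intrinsic
+    // opcodes reach the target hook even though they are < BUILTIN_OP_END.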
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this);
+ }
+ return;
+ }
+}
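+// Usage sketch for ComputeMaskedBits (illustrative; Val and VT stand for an
+// arbitrary SDValue and its integer type):
+//
+//   APInt KnownZero, KnownOne;
+//   APInt Mask = APInt::getAllOnesValue(VT.getSizeInBits());
+//   DAG.ComputeMaskedBits(Val, Mask, KnownZero, KnownOne);
+//   if (KnownZero.isNegative()) { /* the sign bit of Val is known zero */ }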
+
+/// ComputeNumSignBits - Return the number of times the sign bit of the
+/// register is replicated into the other bits. We know that at least 1 bit
+/// is always equal to the sign bit (itself), but other cases can give us
+/// information. For example, immediately after an "SRA X, 2", we know that
+/// the top 3 bits are all equal to each other, so we return 3.
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
+ MVT VT = Op.getValueType();
+ assert(VT.isInteger() && "Invalid VT!");
+ unsigned VTBits = VT.getSizeInBits();
+ unsigned Tmp, Tmp2;
+ unsigned FirstAnswer = 1;
+
+ if (Depth == 6)
+ return 1; // Limit search depth.
+
+ switch (Op.getOpcode()) {
+ default: break;
+ case ISD::AssertSext:
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+ return VTBits-Tmp+1;
+ case ISD::AssertZext:
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+ return VTBits-Tmp;
+
+ case ISD::Constant: {
+ const APInt &Val = cast<ConstantSDNode>(Op)->getAPIntValue();
+ // If negative, return # leading ones.
+ if (Val.isNegative())
+ return Val.countLeadingOnes();
+
+ // Return # leading zeros.
+ return Val.countLeadingZeros();
+ }
+
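+  // e.g. (sext i8 X to i32) has 24 + ComputeNumSignBits(X) sign bits:
+  // every bit added by the extension is a copy of X's sign bit.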
+ case ISD::SIGN_EXTEND:
+ Tmp = VTBits-Op.getOperand(0).getValueType().getSizeInBits();
+ return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
+
+ case ISD::SIGN_EXTEND_INREG:
+ // Max of the input and what this extends.
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+ Tmp = VTBits-Tmp+1;
+
+ Tmp2 = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ return std::max(Tmp, Tmp2);
+
+ case ISD::SRA:
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ // SRA X, C -> adds C sign bits.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Tmp += C->getZExtValue();
+ if (Tmp > VTBits) Tmp = VTBits;
+ }
+ return Tmp;
+ case ISD::SHL:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ // shl destroys sign bits.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (C->getZExtValue() >= VTBits || // Bad shift.
+ C->getZExtValue() >= Tmp) break; // Shifted all sign bits out.
+ return Tmp - C->getZExtValue();
+ }
+ break;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: // NOT is handled here.
+ // Logical binary ops preserve the number of sign bits at the worst.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp != 1) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ FirstAnswer = std::min(Tmp, Tmp2);
+ // We computed what we know about the sign bits as our first
+ // answer. Now proceed to the generic code that uses
+ // ComputeMaskedBits, and pick whichever answer is better.
+ }
+ break;
+
+ case ISD::SELECT:
+ Tmp = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1);
+ return std::min(Tmp, Tmp2);
+
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ if (Op.getResNo() != 1)
+ break;
+ // The boolean result conforms to getBooleanContents. Fall through.
+ case ISD::SETCC:
+ // If setcc returns 0/-1, all bits are sign bits.
+ if (TLI.getBooleanContents() ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return VTBits;
+ break;
+ case ISD::ROTL:
+ case ISD::ROTR:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned RotAmt = C->getZExtValue() & (VTBits-1);
+
+      // Handle rotate right by N like a rotate left by VTBits-N.
+ if (Op.getOpcode() == ISD::ROTR)
+ RotAmt = (VTBits-RotAmt) & (VTBits-1);
+
+ // If we aren't rotating out all of the known-in sign bits, return the
+ // number that are left. This handles rotl(sext(x), 1) for example.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp > RotAmt+1) return Tmp-RotAmt;
+ }
+ break;
+ case ISD::ADD:
+ // Add can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+
+ // Special case decrementing a value (ADD X, -1):
+ if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ if (CRHS->isAllOnesValue()) {
+ APInt KnownZero, KnownOne;
+ APInt Mask = APInt::getAllOnesValue(VTBits);
+ ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
+
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((KnownZero | APInt(VTBits, 1)) == Mask)
+ return VTBits;
+
+ // If we are subtracting one from a positive number, there is no carry
+ // out of the result.
+ if (KnownZero.isNegative())
+ return Tmp;
+ }
+
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ if (Tmp2 == 1) return 1;
+ return std::min(Tmp, Tmp2)-1;
+ break;
+
+ case ISD::SUB:
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ if (Tmp2 == 1) return 1;
+
+ // Handle NEG.
+ if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0)))
+ if (CLHS->isNullValue()) {
+ APInt KnownZero, KnownOne;
+ APInt Mask = APInt::getAllOnesValue(VTBits);
+ ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((KnownZero | APInt(VTBits, 1)) == Mask)
+ return VTBits;
+
+ // If the input is known to be positive (the sign bit is known clear),
+ // the output of the NEG has the same number of sign bits as the input.
+ if (KnownZero.isNegative())
+ return Tmp2;
+
+ // Otherwise, we treat this like a SUB.
+ }
+
+ // Sub can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ return std::min(Tmp, Tmp2)-1;
+ break;
+ case ISD::TRUNCATE:
+ // FIXME: it's tricky to do anything useful for this, but it is an important
+ // case for targets like X86.
+ break;
+ }
+
+ // Handle LOADX separately here. EXTLOAD case will fallthrough.
+ if (Op.getOpcode() == ISD::LOAD) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ unsigned ExtType = LD->getExtensionType();
+ switch (ExtType) {
+ default: break;
+    case ISD::SEXTLOAD:    // e.g. i16->i32 sextload: 17 sign bits known
+ Tmp = LD->getMemoryVT().getSizeInBits();
+ return VTBits-Tmp+1;
+    case ISD::ZEXTLOAD:    // e.g. i16->i32 zextload: 16 sign bits known
+ Tmp = LD->getMemoryVT().getSizeInBits();
+ return VTBits-Tmp;
+ }
+ }
+
+ // Allow the target to implement this method for its nodes.
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) {
+ unsigned NumBits = TLI.ComputeNumSignBitsForTargetNode(Op, Depth);
+ if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
+ }
+
+ // Finally, if we can prove that the top bits of the result are 0's or 1's,
+ // use this information.
+ APInt KnownZero, KnownOne;
+ APInt Mask = APInt::getAllOnesValue(VTBits);
+ ComputeMaskedBits(Op, Mask, KnownZero, KnownOne, Depth);
+
+ if (KnownZero.isNegative()) { // sign bit is 0
+ Mask = KnownZero;
+ } else if (KnownOne.isNegative()) { // sign bit is 1;
+ Mask = KnownOne;
+ } else {
+ // Nothing known.
+ return FirstAnswer;
+ }
+
+ // Okay, we know that the sign bit in Mask is set. Use CLZ to determine
+ // the number of identical bits in the top of the input value.
+ Mask = ~Mask;
+ Mask <<= Mask.getBitWidth()-VTBits;
+  // Return # leading zeros.  We use 'min' here in case Mask was zero before
+ // shifting. We don't want to return '64' as for an i32 "0".
+ return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
+}
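+// For example, ComputeNumSignBits of (sra i32 X, 27) is at least 28, enough
+// to prove that the value fits in a signed 5-bit integer.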
+
+
+bool SelectionDAG::isVerifiedDebugInfoDesc(SDValue Op) const {
+ GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+ if (!GA) return false;
+ if (GA->getOffset() != 0) return false;
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
+ if (!GV) return false;
+ MachineModuleInfo *MMI = getMachineModuleInfo();
+ return MMI && MMI->hasDebugInfo();
+}
+
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+SDValue SelectionDAG::getShuffleScalarElt(const ShuffleVectorSDNode *N,
+ unsigned i) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+ if (N->getMaskElt(i) < 0)
+ return getUNDEF(VT.getVectorElementType());
+ unsigned Index = N->getMaskElt(i);
+ unsigned NumElems = VT.getVectorNumElements();
+ SDValue V = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
+ Index %= NumElems;
+
+ if (V.getOpcode() == ISD::BIT_CONVERT) {
+ V = V.getOperand(0);
+ MVT VVT = V.getValueType();
+ if (!VVT.isVector() || VVT.getVectorNumElements() != (unsigned)NumElems)
+ return SDValue();
+ }
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return (Index == 0) ? V.getOperand(0)
+ : getUNDEF(VT.getVectorElementType());
+ if (V.getOpcode() == ISD::BUILD_VECTOR)
+ return V.getOperand(Index);
+ if (const ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V))
+ return getShuffleScalarElt(SVN, Index);
+ return SDValue();
+}
+
+
+/// getNode - Gets or creates the specified node.
+///
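+/// Nodes are uniqued through the CSE map, so requesting an identical node
+/// again returns the existing SDNode instead of allocating a new one.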
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, MVT VT) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, getVTList(VT), 0, 0);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<SDNode>();
+ new (N) SDNode(Opcode, DL, getVTList(VT));
+ CSEMap.InsertNode(N, IP);
+
+ AllNodes.push_back(N);
+#ifndef NDEBUG
+ VerifyNode(N);
+#endif
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+ MVT VT, SDValue Operand) {
+ // Constant fold unary operations with an integer constant operand.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand.getNode())) {
+ const APInt &Val = C->getAPIntValue();
+ unsigned BitWidth = VT.getSizeInBits();
+ switch (Opcode) {
+ default: break;
+ case ISD::SIGN_EXTEND:
+ return getConstant(APInt(Val).sextOrTrunc(BitWidth), VT);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::TRUNCATE:
+ return getConstant(APInt(Val).zextOrTrunc(BitWidth), VT);
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP: {
+ const uint64_t zero[] = {0, 0};
+ // No compile time operations on this type.
+ if (VT==MVT::ppcf128)
+ break;
+ APFloat apf = APFloat(APInt(BitWidth, 2, zero));
+ (void)apf.convertFromAPInt(Val,
+ Opcode==ISD::SINT_TO_FP,
+ APFloat::rmNearestTiesToEven);
+ return getConstantFP(apf, VT);
+ }
+ case ISD::BIT_CONVERT:
+ if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
+ return getConstantFP(Val.bitsToFloat(), VT);
+ else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
+ return getConstantFP(Val.bitsToDouble(), VT);
+ break;
+ case ISD::BSWAP:
+ return getConstant(Val.byteSwap(), VT);
+ case ISD::CTPOP:
+ return getConstant(Val.countPopulation(), VT);
+ case ISD::CTLZ:
+ return getConstant(Val.countLeadingZeros(), VT);
+ case ISD::CTTZ:
+ return getConstant(Val.countTrailingZeros(), VT);
+ }
+ }
+
+ // Constant fold unary operations with a floating point constant operand.
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand.getNode())) {
+ APFloat V = C->getValueAPF(); // make copy
+ if (VT != MVT::ppcf128 && Operand.getValueType() != MVT::ppcf128) {
+ switch (Opcode) {
+ case ISD::FNEG:
+ V.changeSign();
+ return getConstantFP(V, VT);
+ case ISD::FABS:
+ V.clearSign();
+ return getConstantFP(V, VT);
+ case ISD::FP_ROUND:
+ case ISD::FP_EXTEND: {
+ bool ignored;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)V.convert(*MVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven, &ignored);
+ return getConstantFP(V, VT);
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ integerPart x[2];
+ bool ignored;
+ assert(integerPartWidth >= 64);
+ // FIXME need to be more flexible about rounding mode.
+ APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
+ Opcode==ISD::FP_TO_SINT,
+ APFloat::rmTowardZero, &ignored);
+ if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual
+ break;
+ APInt api(VT.getSizeInBits(), 2, x);
+ return getConstant(api, VT);
+ }
+ case ISD::BIT_CONVERT:
+ if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+ return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT);
+ else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
+ return getConstant(V.bitcastToAPInt().getZExtValue(), VT);
+ break;
+ }
+ }
+ }
+
+ unsigned OpOpcode = Operand.getNode()->getOpcode();
+ switch (Opcode) {
+ case ISD::TokenFactor:
+ case ISD::MERGE_VALUES:
+ case ISD::CONCAT_VECTORS:
+ return Operand; // Factor, merge or concat of one node? No need.
+ case ISD::FP_ROUND: assert(0 && "Invalid method to make FP_ROUND node");
+ case ISD::FP_EXTEND:
+ assert(VT.isFloatingPoint() &&
+ Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
+ if (Operand.getValueType() == VT) return Operand; // noop conversion.
+ if (Operand.getOpcode() == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
+ case ISD::SIGN_EXTEND:
+ assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+ "Invalid SIGN_EXTEND!");
+ if (Operand.getValueType() == VT) return Operand; // noop extension
+ assert(Operand.getValueType().bitsLT(VT)
+ && "Invalid sext node, dst < src!");
+ if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
+ return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ break;
+ case ISD::ZERO_EXTEND:
+ assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+ "Invalid ZERO_EXTEND!");
+ if (Operand.getValueType() == VT) return Operand; // noop extension
+ assert(Operand.getValueType().bitsLT(VT)
+ && "Invalid zext node, dst < src!");
+ if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
+ return getNode(ISD::ZERO_EXTEND, DL, VT,
+ Operand.getNode()->getOperand(0));
+ break;
+ case ISD::ANY_EXTEND:
+ assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+ "Invalid ANY_EXTEND!");
+ if (Operand.getValueType() == VT) return Operand; // noop extension
+ assert(Operand.getValueType().bitsLT(VT)
+ && "Invalid anyext node, dst < src!");
+ if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND)
+ // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
+ return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ break;
+ case ISD::TRUNCATE:
+ assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+ "Invalid TRUNCATE!");
+ if (Operand.getValueType() == VT) return Operand; // noop truncate
+ assert(Operand.getValueType().bitsGT(VT)
+ && "Invalid truncate node, src < dst!");
+ if (OpOpcode == ISD::TRUNCATE)
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+ else if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+ OpOpcode == ISD::ANY_EXTEND) {
+ // If the source is smaller than the dest, we still need an extend.
+ if (Operand.getNode()->getOperand(0).getValueType().bitsLT(VT))
+ return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ else if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+ else
+ return Operand.getNode()->getOperand(0);
+ }
+ break;
+ case ISD::BIT_CONVERT:
+ // Basic sanity checking.
+ assert(VT.getSizeInBits() == Operand.getValueType().getSizeInBits()
+ && "Cannot BIT_CONVERT between types of different sizes!");
+ if (VT == Operand.getValueType()) return Operand; // noop conversion.
+ if (OpOpcode == ISD::BIT_CONVERT) // bitconv(bitconv(x)) -> bitconv(x)
+ return getNode(ISD::BIT_CONVERT, DL, VT, Operand.getOperand(0));
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
+ case ISD::SCALAR_TO_VECTOR:
+ assert(VT.isVector() && !Operand.getValueType().isVector() &&
+ (VT.getVectorElementType() == Operand.getValueType() ||
+ (VT.getVectorElementType().isInteger() &&
+ Operand.getValueType().isInteger() &&
+ VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
+ "Illegal SCALAR_TO_VECTOR node!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
+ if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Operand.getOperand(1)) &&
+ Operand.getConstantOperandVal(1) == 0 &&
+ Operand.getOperand(0).getValueType() == VT)
+ return Operand.getOperand(0);
+ break;
+ case ISD::FNEG:
+ // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
+ if (UnsafeFPMath && OpOpcode == ISD::FSUB)
+ return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1),
+ Operand.getNode()->getOperand(0));
+ if (OpOpcode == ISD::FNEG) // --X -> X
+ return Operand.getNode()->getOperand(0);
+ break;
+ case ISD::FABS:
+ if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
+ return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0));
+ break;
+ }
+
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ if (VT != MVT::Flag) { // Don't CSE flag producing nodes
+ FoldingSetNodeID ID;
+ SDValue Ops[1] = { Operand };
+ AddNodeIDNode(ID, Opcode, VTs, Ops, 1);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ N = NodeAllocator.Allocate<UnarySDNode>();
+ new (N) UnarySDNode(Opcode, DL, VTs, Operand);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = NodeAllocator.Allocate<UnarySDNode>();
+ new (N) UnarySDNode(Opcode, DL, VTs, Operand);
+ }
+
+ AllNodes.push_back(N);
+#ifndef NDEBUG
+ VerifyNode(N);
+#endif
+ return SDValue(N, 0);
+}
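+// For example, getNode(ISD::TRUNCATE, dl, MVT::i8, X) where X is
+// (zext i8 Y to i32) returns Y directly rather than building a new node.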
+
+SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode,
+ MVT VT,
+ ConstantSDNode *Cst1,
+ ConstantSDNode *Cst2) {
+ const APInt &C1 = Cst1->getAPIntValue(), &C2 = Cst2->getAPIntValue();
+
+ switch (Opcode) {
+ case ISD::ADD: return getConstant(C1 + C2, VT);
+ case ISD::SUB: return getConstant(C1 - C2, VT);
+ case ISD::MUL: return getConstant(C1 * C2, VT);
+ case ISD::UDIV:
+ if (C2.getBoolValue()) return getConstant(C1.udiv(C2), VT);
+ break;
+ case ISD::UREM:
+ if (C2.getBoolValue()) return getConstant(C1.urem(C2), VT);
+ break;
+ case ISD::SDIV:
+ if (C2.getBoolValue()) return getConstant(C1.sdiv(C2), VT);
+ break;
+ case ISD::SREM:
+ if (C2.getBoolValue()) return getConstant(C1.srem(C2), VT);
+ break;
+ case ISD::AND: return getConstant(C1 & C2, VT);
+ case ISD::OR: return getConstant(C1 | C2, VT);
+ case ISD::XOR: return getConstant(C1 ^ C2, VT);
+ case ISD::SHL: return getConstant(C1 << C2, VT);
+ case ISD::SRL: return getConstant(C1.lshr(C2), VT);
+ case ISD::SRA: return getConstant(C1.ashr(C2), VT);
+ case ISD::ROTL: return getConstant(C1.rotl(C2), VT);
+ case ISD::ROTR: return getConstant(C1.rotr(C2), VT);
+ default: break;
+ }
+
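+  // Division or remainder by a zero constant, and any opcode not handled
+  // above, deliberately folds to nothing; the caller keeps the original node.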
+ return SDValue();
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, MVT VT,
+ SDValue N1, SDValue N2) {
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+ switch (Opcode) {
+ default: break;
+ case ISD::TokenFactor:
+ assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
+ N2.getValueType() == MVT::Other && "Invalid token factor!");
+ // Fold trivial token factors.
+ if (N1.getOpcode() == ISD::EntryToken) return N2;
+ if (N2.getOpcode() == ISD::EntryToken) return N1;
+ if (N1 == N2) return N1;
+ break;
+ case ISD::CONCAT_VECTORS:
+    // A CONCAT_VECTORS with all BUILD_VECTOR operands can be simplified to
+ // one big BUILD_VECTOR.
+ if (N1.getOpcode() == ISD::BUILD_VECTOR &&
+ N2.getOpcode() == ISD::BUILD_VECTOR) {
+ SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), N1.getNode()->op_end());
+ Elts.insert(Elts.end(), N2.getNode()->op_begin(), N2.getNode()->op_end());
+ return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size());
+ }
+ break;
+ case ISD::AND:
+ assert(VT.isInteger() && N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ // (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
+ // worth handling here.
+ if (N2C && N2C->isNullValue())
+ return N2;
+ if (N2C && N2C->isAllOnesValue()) // X & -1 -> X
+ return N1;
+ break;
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::SUB:
+ assert(VT.isInteger() && N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ // (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so
+ // it's worth handling here.
+ if (N2C && N2C->isNullValue())
+ return N1;
+ break;
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::MULHU:
+ case ISD::MULHS:
+ case ISD::MUL:
+ case ISD::SDIV:
+ case ISD::SREM:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ // fall through
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ if (UnsafeFPMath) {
+ if (Opcode == ISD::FADD) {
+ // 0+x --> x
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1))
+ if (CFP->getValueAPF().isZero())
+ return N2;
+ // x+0 --> x
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2))
+ if (CFP->getValueAPF().isZero())
+ return N1;
+ } else if (Opcode == ISD::FSUB) {
+ // x-0 --> x
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2))
+ if (CFP->getValueAPF().isZero())
+ return N1;
+ }
+ }
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ break;
+ case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match.
+ assert(N1.getValueType() == VT &&
+ N1.getValueType().isFloatingPoint() &&
+ N2.getValueType().isFloatingPoint() &&
+ "Invalid FCOPYSIGN!");
+ break;
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ assert(VT == N1.getValueType() &&
+ "Shift operators return type must be the same as their first arg");
+ assert(VT.isInteger() && N2.getValueType().isInteger() &&
+ "Shifts only work on integers");
+
+ // Always fold shifts of i1 values so the code generator doesn't need to
+ // handle them. Since we know the size of the shift has to be less than the
+ // size of the value, the shift/rotate count is guaranteed to be zero.
+ if (VT == MVT::i1)
+ return N1;
+ break;
+ case ISD::FP_ROUND_INREG: {
+ MVT EVT = cast<VTSDNode>(N2)->getVT();
+ assert(VT == N1.getValueType() && "Not an inreg round!");
+ assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
+ "Cannot FP_ROUND_INREG integer types");
+ assert(EVT.bitsLE(VT) && "Not rounding down!");
+ if (cast<VTSDNode>(N2)->getVT() == VT) return N1; // Not actually rounding.
+ break;
+ }
+ case ISD::FP_ROUND:
+ assert(VT.isFloatingPoint() &&
+ N1.getValueType().isFloatingPoint() &&
+ VT.bitsLE(N1.getValueType()) &&
+ isa<ConstantSDNode>(N2) && "Invalid FP_ROUND!");
+ if (N1.getValueType() == VT) return N1; // noop conversion.
+ break;
+ case ISD::AssertSext:
+ case ISD::AssertZext: {
+ MVT EVT = cast<VTSDNode>(N2)->getVT();
+ assert(VT == N1.getValueType() && "Not an inreg extend!");
+ assert(VT.isInteger() && EVT.isInteger() &&
+ "Cannot *_EXTEND_INREG FP types");
+ assert(EVT.bitsLE(VT) && "Not extending!");
+ if (VT == EVT) return N1; // noop assertion.
+ break;
+ }
+ case ISD::SIGN_EXTEND_INREG: {
+ MVT EVT = cast<VTSDNode>(N2)->getVT();
+ assert(VT == N1.getValueType() && "Not an inreg extend!");
+ assert(VT.isInteger() && EVT.isInteger() &&
+ "Cannot *_EXTEND_INREG FP types");
+ assert(EVT.bitsLE(VT) && "Not extending!");
+ if (EVT == VT) return N1; // Not actually extending
+
+ if (N1C) {
+ APInt Val = N1C->getAPIntValue();
+ unsigned FromBits = cast<VTSDNode>(N2)->getVT().getSizeInBits();
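+      // Sign-extend the low FromBits bits in place: shift them up to the
+      // top of the value, then arithmetic-shift them back down.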
+ Val <<= Val.getBitWidth()-FromBits;
+ Val = Val.ashr(Val.getBitWidth()-FromBits);
+ return getConstant(Val, VT);
+ }
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT:
+ // EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
+ if (N1.getOpcode() == ISD::UNDEF)
+ return getUNDEF(VT);
+
+ // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
+ // expanding copies of large vectors from registers.
+ if (N2C &&
+ N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N1.getNumOperands() > 0) {
+ unsigned Factor =
+ N1.getOperand(0).getValueType().getVectorNumElements();
+ return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ N1.getOperand(N2C->getZExtValue() / Factor),
+ getConstant(N2C->getZExtValue() % Factor,
+ N2.getValueType()));
+ }
+
+ // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
+ // expanding large vector constants.
+ if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue Elt = N1.getOperand(N2C->getZExtValue());
+ if (Elt.getValueType() != VT) {
+ // If the vector element type is not legal, the BUILD_VECTOR operands
+ // are promoted and implicitly truncated. Make that explicit here.
+ assert(VT.isInteger() && Elt.getValueType().isInteger() &&
+ VT.bitsLE(Elt.getValueType()) &&
+ "Bad type for BUILD_VECTOR operand");
+ Elt = getNode(ISD::TRUNCATE, DL, VT, Elt);
+ }
+ return Elt;
+ }
+
+ // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
+ // operations are lowered to scalars.
+ if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ // If the indices are the same, return the inserted element.
+ if (N1.getOperand(2) == N2)
+ return N1.getOperand(1);
+ // If the indices are known different, extract the element from
+ // the original vector.
+ else if (isa<ConstantSDNode>(N1.getOperand(2)) &&
+ isa<ConstantSDNode>(N2))
+ return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
+ }
+ break;
+ case ISD::EXTRACT_ELEMENT:
+ assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
+ assert(!N1.getValueType().isVector() && !VT.isVector() &&
+ (N1.getValueType().isInteger() == VT.isInteger()) &&
+ "Wrong types for EXTRACT_ELEMENT!");
+
+ // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
+ // 64-bit integers into 32-bit parts. Instead of building the extract of
+ // the BUILD_PAIR, only to have legalize rip it apart, just do it now.
+ if (N1.getOpcode() == ISD::BUILD_PAIR)
+ return N1.getOperand(N2C->getZExtValue());
+
+ // EXTRACT_ELEMENT of a constant int is also very common.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+ unsigned ElementSize = VT.getSizeInBits();
+ unsigned Shift = ElementSize * N2C->getZExtValue();
+ APInt ShiftedVal = C->getAPIntValue().lshr(Shift);
+ return getConstant(ShiftedVal.trunc(ElementSize), VT);
+ }
+ break;
+ case ISD::EXTRACT_SUBVECTOR:
+ if (N1.getValueType() == VT) // Trivial extraction.
+ return N1;
+ break;
+ }
+
+ if (N1C) {
+ if (N2C) {
+ SDValue SV = FoldConstantArithmetic(Opcode, VT, N1C, N2C);
+ if (SV.getNode()) return SV;
+    } else {  // Canonicalize constant to RHS if commutative
+ if (isCommutativeBinOp(Opcode)) {
+ std::swap(N1C, N2C);
+ std::swap(N1, N2);
+ }
+ }
+ }
+
+ // Constant fold FP operations.
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode());
+ ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode());
+ if (N1CFP) {
+ if (!N2CFP && isCommutativeBinOp(Opcode)) {
+      // Canonicalize constant to RHS if commutative
+ std::swap(N1CFP, N2CFP);
+ std::swap(N1, N2);
+ } else if (N2CFP && VT != MVT::ppcf128) {
+ APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF();
+ APFloat::opStatus s;
+ switch (Opcode) {
+ case ISD::FADD:
+ s = V1.add(V2, APFloat::rmNearestTiesToEven);
+ if (s != APFloat::opInvalidOp)
+ return getConstantFP(V1, VT);
+ break;
+ case ISD::FSUB:
+ s = V1.subtract(V2, APFloat::rmNearestTiesToEven);
+ if (s!=APFloat::opInvalidOp)
+ return getConstantFP(V1, VT);
+ break;
+ case ISD::FMUL:
+ s = V1.multiply(V2, APFloat::rmNearestTiesToEven);
+ if (s!=APFloat::opInvalidOp)
+ return getConstantFP(V1, VT);
+ break;
+ case ISD::FDIV:
+ s = V1.divide(V2, APFloat::rmNearestTiesToEven);
+ if (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)
+ return getConstantFP(V1, VT);
+ break;
+ case ISD::FREM :
+ s = V1.mod(V2, APFloat::rmNearestTiesToEven);
+ if (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)
+ return getConstantFP(V1, VT);
+ break;
+ case ISD::FCOPYSIGN:
+ V1.copySign(V2);
+ return getConstantFP(V1, VT);
+ default: break;
+ }
+ }
+ }
+
+ // Canonicalize an UNDEF to the RHS, even over a constant.
+ if (N1.getOpcode() == ISD::UNDEF) {
+ if (isCommutativeBinOp(Opcode)) {
+ std::swap(N1, N2);
+ } else {
+ switch (Opcode) {
+ case ISD::FP_ROUND_INREG:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::SUB:
+ case ISD::FSUB:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::SRA:
+ return N1; // fold op(undef, arg2) -> undef
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ case ISD::SRL:
+ case ISD::SHL:
+ if (!VT.isVector())
+ return getConstant(0, VT); // fold op(undef, arg2) -> 0
+ // For vectors, we can't easily build an all zero vector, just return
+ // the LHS.
+ return N2;
+ }
+ }
+ }
+
+ // Fold a bunch of operators when the RHS is undef.
+ if (N2.getOpcode() == ISD::UNDEF) {
+ switch (Opcode) {
+ case ISD::XOR:
+ if (N1.getOpcode() == ISD::UNDEF)
+ // Handle undef ^ undef -> 0 special case. This is a common
+ // idiom (misuse).
+ return getConstant(0, VT);
+ // fallthrough
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUB:
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ return N2; // fold op(arg1, undef) -> undef
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::SRL:
+ case ISD::SHL:
+ if (!VT.isVector())
+ return getConstant(0, VT); // fold op(arg1, undef) -> 0
+ // For vectors, we can't easily build an all zero vector, just return
+ // the LHS.
+ return N1;
+ case ISD::OR:
+ if (!VT.isVector())
+ return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+ // For vectors, we can't easily build an all one vector, just return
+ // the LHS.
+ return N1;
+ case ISD::SRA:
+ return N1;
+ }
+ }
+
+ // Memoize this node if possible.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ if (VT != MVT::Flag) {
+ SDValue Ops[] = { N1, N2 };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ N = NodeAllocator.Allocate<BinarySDNode>();
+ new (N) BinarySDNode(Opcode, DL, VTs, N1, N2);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = NodeAllocator.Allocate<BinarySDNode>();
+ new (N) BinarySDNode(Opcode, DL, VTs, N1, N2);
+ }
+
+ AllNodes.push_back(N);
+#ifndef NDEBUG
+ VerifyNode(N);
+#endif
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, MVT VT,
+ SDValue N1, SDValue N2, SDValue N3) {
+ // Perform various simplifications.
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+ switch (Opcode) {
+ case ISD::CONCAT_VECTORS:
+    // A CONCAT_VECTORS with all BUILD_VECTOR operands can be simplified to
+ // one big BUILD_VECTOR.
+ if (N1.getOpcode() == ISD::BUILD_VECTOR &&
+ N2.getOpcode() == ISD::BUILD_VECTOR &&
+ N3.getOpcode() == ISD::BUILD_VECTOR) {
+ SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), N1.getNode()->op_end());
+ Elts.insert(Elts.end(), N2.getNode()->op_begin(), N2.getNode()->op_end());
+ Elts.insert(Elts.end(), N3.getNode()->op_begin(), N3.getNode()->op_end());
+ return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size());
+ }
+ break;
+ case ISD::SETCC: {
+ // Use FoldSetCC to simplify SETCC's.
+ SDValue Simp = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL);
+ if (Simp.getNode()) return Simp;
+ break;
+ }
+ case ISD::SELECT:
+ if (N1C) {
+ if (N1C->getZExtValue())
+ return N2; // select true, X, Y -> X
+ else
+ return N3; // select false, X, Y -> Y
+ }
+
+ if (N2 == N3) return N2; // select C, X, X -> X
+ break;
+ case ISD::BRCOND:
+ if (N2C) {
+ if (N2C->getZExtValue()) // Unconditional branch
+ return getNode(ISD::BR, DL, MVT::Other, N1, N3);
+ else
+ return N1; // Never-taken branch
+ }
+ break;
+ case ISD::VECTOR_SHUFFLE:
+ assert(0 && "should use getVectorShuffle constructor!");
+ break;
+ case ISD::BIT_CONVERT:
+ // Fold bit_convert nodes from a type to themselves.
+ if (N1.getValueType() == VT)
+ return N1;
+ break;
+ }
+
+ // Memoize node if it doesn't produce a flag.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ if (VT != MVT::Flag) {
+ SDValue Ops[] = { N1, N2, N3 };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ N = NodeAllocator.Allocate<TernarySDNode>();
+ new (N) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = NodeAllocator.Allocate<TernarySDNode>();
+ new (N) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
+ }
+ AllNodes.push_back(N);
+#ifndef NDEBUG
+ VerifyNode(N);
+#endif
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, MVT VT,
+ SDValue N1, SDValue N2, SDValue N3,
+ SDValue N4) {
+ SDValue Ops[] = { N1, N2, N3, N4 };
+ return getNode(Opcode, DL, VT, Ops, 4);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, MVT VT,
+ SDValue N1, SDValue N2, SDValue N3,
+ SDValue N4, SDValue N5) {
+ SDValue Ops[] = { N1, N2, N3, N4, N5 };
+ return getNode(Opcode, DL, VT, Ops, 5);
+}
+
+/// getMemsetValue - Build a value of type VT in which every byte equals the
+/// low byte of the memset value operand.
+static SDValue getMemsetValue(SDValue Value, MVT VT, SelectionDAG &DAG,
+ DebugLoc dl) {
+ unsigned NumBits = VT.isVector() ?
+ VT.getVectorElementType().getSizeInBits() : VT.getSizeInBits();
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
+ APInt Val = APInt(NumBits, C->getZExtValue() & 255);
+ unsigned Shift = 8;
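+    // Splat the low byte across the value by repeated doubling, e.g. for
+    // i32 and byte 0xAB: 0xAB -> 0xABAB -> 0xABABABAB.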
+ for (unsigned i = NumBits; i > 8; i >>= 1) {
+ Val = (Val << Shift) | Val;
+ Shift <<= 1;
+ }
+ if (VT.isInteger())
+ return DAG.getConstant(Val, VT);
+ return DAG.getConstantFP(APFloat(Val), VT);
+ }
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ Value = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Value);
+ unsigned Shift = 8;
+ for (unsigned i = NumBits; i > 8; i >>= 1) {
+ Value = DAG.getNode(ISD::OR, dl, VT,
+ DAG.getNode(ISD::SHL, dl, VT, Value,
+ DAG.getConstant(Shift,
+ TLI.getShiftAmountTy())),
+ Value);
+ Shift <<= 1;
+ }
+
+ return Value;
+}
+
+/// getMemsetStringVal - Similar to getMemsetValue, except this is only used
+/// when a memcpy is turned into a memset because the source is a constant
+/// string pointer.
+static SDValue getMemsetStringVal(MVT VT, DebugLoc dl, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ std::string &Str, unsigned Offset) {
+ // Handle vector with all elements zero.
+ if (Str.empty()) {
+ if (VT.isInteger())
+ return DAG.getConstant(0, VT);
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getConstant(0, MVT::getVectorVT(EltVT, NumElts)));
+ }
+
+ assert(!VT.isVector() && "Can't handle vector type here!");
+ unsigned NumBits = VT.getSizeInBits();
+ unsigned MSB = NumBits / 8;
+ uint64_t Val = 0;
+ if (TLI.isLittleEndian())
+ Offset = Offset + MSB - 1;
+ for (unsigned i = 0; i != MSB; ++i) {
+ Val = (Val << 8) | (unsigned char)Str[Offset];
+ Offset += TLI.isLittleEndian() ? -1 : 1;
+ }
+ return DAG.getConstant(Val, VT);
+}
+
+/// getMemBasePlusOffset - Returns a node that computes Base plus a constant
+/// byte Offset.
+///
+static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset,
+ SelectionDAG &DAG) {
+ MVT VT = Base.getValueType();
+ return DAG.getNode(ISD::ADD, Base.getDebugLoc(),
+ VT, Base, DAG.getConstant(Offset, VT));
+}
+
+/// isMemSrcFromString - Returns true if memcpy source is a string constant.
+///
+static bool isMemSrcFromString(SDValue Src, std::string &Str) {
+ unsigned SrcDelta = 0;
+ GlobalAddressSDNode *G = NULL;
+ if (Src.getOpcode() == ISD::GlobalAddress)
+ G = cast<GlobalAddressSDNode>(Src);
+ else if (Src.getOpcode() == ISD::ADD &&
+ Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
+ Src.getOperand(1).getOpcode() == ISD::Constant) {
+ G = cast<GlobalAddressSDNode>(Src.getOperand(0));
+ SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
+ }
+ if (!G)
+ return false;
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(G->getGlobal());
+ if (GV && GetConstantStringInfo(GV, Str, SrcDelta, false))
+ return true;
+
+ return false;
+}
+
+/// MeetsMaxMemopRequirement - Determines if the number of memory ops required
+/// to replace the memset / memcpy is below the threshold. It also returns the
+/// types of the sequence of memory ops to perform memset / memcpy.
+static
+bool MeetsMaxMemopRequirement(std::vector<MVT> &MemOps,
+ SDValue Dst, SDValue Src,
+ unsigned Limit, uint64_t Size, unsigned &Align,
+ std::string &Str, bool &isSrcStr,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ isSrcStr = isMemSrcFromString(Src, Str);
+ bool isSrcConst = isa<ConstantSDNode>(Src);
+ bool AllowUnalign = TLI.allowsUnalignedMemoryAccesses();
+ MVT VT = TLI.getOptimalMemOpType(Size, Align, isSrcConst, isSrcStr);
+ if (VT != MVT::iAny) {
+ unsigned NewAlign = (unsigned)
+ TLI.getTargetData()->getABITypeAlignment(VT.getTypeForMVT());
+ // If source is a string constant, this will require an unaligned load.
+ if (NewAlign > Align && (isSrcConst || AllowUnalign)) {
+ if (Dst.getOpcode() != ISD::FrameIndex) {
+        // Can't change destination alignment.  It would require an
+        // unaligned store.
+ if (AllowUnalign)
+ VT = MVT::iAny;
+ } else {
+ int FI = cast<FrameIndexSDNode>(Dst)->getIndex();
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ if (MFI->isFixedObjectIndex(FI)) {
+          // Can't change destination alignment.  It would require an
+          // unaligned store.
+ if (AllowUnalign)
+ VT = MVT::iAny;
+ } else {
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI->getObjectAlignment(FI) < NewAlign)
+ MFI->setObjectAlignment(FI, NewAlign);
+ Align = NewAlign;
+ }
+ }
+ }
+ }
+
+ if (VT == MVT::iAny) {
+ if (AllowUnalign) {
+ VT = MVT::i64;
+ } else {
+ switch (Align & 7) {
+ case 0: VT = MVT::i64; break;
+ case 4: VT = MVT::i32; break;
+ case 2: VT = MVT::i16; break;
+ default: VT = MVT::i8; break;
+ }
+ }
+
+ MVT LVT = MVT::i64;
+ while (!TLI.isTypeLegal(LVT))
+ LVT = (MVT::SimpleValueType)(LVT.getSimpleVT() - 1);
+ assert(LVT.isInteger());
+
+ if (VT.bitsGT(LVT))
+ VT = LVT;
+ }
+
+ unsigned NumMemOps = 0;
+ while (Size != 0) {
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ while (VTSize > Size) {
+      // For now, only use non-vector loads / stores for the left-over pieces.
+ if (VT.isVector()) {
+ VT = MVT::i64;
+ while (!TLI.isTypeLegal(VT))
+ VT = (MVT::SimpleValueType)(VT.getSimpleVT() - 1);
+ VTSize = VT.getSizeInBits() / 8;
+ } else {
+ VT = (MVT::SimpleValueType)(VT.getSimpleVT() - 1);
+ VTSize >>= 1;
+ }
+ }
+
+ if (++NumMemOps > Limit)
+ return false;
+ MemOps.push_back(VT);
+ Size -= VTSize;
+ }
+
+ return true;
+}
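+// Illustrative decomposition (assuming the target reports no preferred type
+// and i64/i32/i16/i8 are all legal): an 11-byte, 8-byte-aligned copy becomes
+// one i64, one i16 and one i8 operation, i.e. three memory ops.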
+
+static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst,
+ SDValue Src, uint64_t Size,
+ unsigned Align, bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff){
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Expand memcpy to a series of load and store ops if the size operand falls
+ // below a certain threshold.
+ std::vector<MVT> MemOps;
+ uint64_t Limit = -1ULL;
+ if (!AlwaysInline)
+ Limit = TLI.getMaxStoresPerMemcpy();
+ unsigned DstAlign = Align; // Destination alignment can change.
+ std::string Str;
+ bool CopyFromStr;
+ if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
+ Str, CopyFromStr, DAG, TLI))
+ return SDValue();
+
+ bool isZeroStr = CopyFromStr && Str.empty();
+ SmallVector<SDValue, 8> OutChains;
+ unsigned NumMemOps = MemOps.size();
+ uint64_t SrcOff = 0, DstOff = 0;
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ MVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Value, Store;
+
+ if (CopyFromStr && (isZeroStr || !VT.isVector())) {
+ // It's unlikely a store of a vector immediate can be done in a single
+      // instruction.  It would require a load from a constant pool first.
+      // We also handle the case of storing a vector of all zeros.
+      // FIXME: Handle other cases where a store of a vector immediate is
+      // done in a single instruction.
+ Value = getMemsetStringVal(VT, dl, DAG, TLI, Str, SrcOff);
+ Store = DAG.getStore(Chain, dl, Value,
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff, false, DstAlign);
+ } else {
+ Value = DAG.getLoad(VT, dl, Chain,
+ getMemBasePlusOffset(Src, SrcOff, DAG),
+ SrcSV, SrcSVOff + SrcOff, false, Align);
+ Store = DAG.getStore(Chain, dl, Value,
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff, false, DstAlign);
+ }
+ OutChains.push_back(Store);
+ SrcOff += VTSize;
+ DstOff += VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &OutChains[0], OutChains.size());
+}
+
+static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst,
+ SDValue Src, uint64_t Size,
+ unsigned Align, bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff){
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Expand memmove to a series of load and store ops if the size operand falls
+ // below a certain threshold.
+ std::vector<MVT> MemOps;
+ uint64_t Limit = -1ULL;
+ if (!AlwaysInline)
+ Limit = TLI.getMaxStoresPerMemmove();
+ unsigned DstAlign = Align; // Destination alignment can change.
+ std::string Str;
+ bool CopyFromStr;
+ if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
+ Str, CopyFromStr, DAG, TLI))
+ return SDValue();
+
+ uint64_t SrcOff = 0, DstOff = 0;
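+  // Unlike memcpy, issue every load before any store: the source and
+  // destination ranges may overlap, so each byte must be read before the
+  // stores can clobber it.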
+
+ SmallVector<SDValue, 8> LoadValues;
+ SmallVector<SDValue, 8> LoadChains;
+ SmallVector<SDValue, 8> OutChains;
+ unsigned NumMemOps = MemOps.size();
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ MVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Value, Store;
+
+ Value = DAG.getLoad(VT, dl, Chain,
+ getMemBasePlusOffset(Src, SrcOff, DAG),
+ SrcSV, SrcSVOff + SrcOff, false, Align);
+ LoadValues.push_back(Value);
+ LoadChains.push_back(Value.getValue(1));
+ SrcOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &LoadChains[0], LoadChains.size());
+ OutChains.clear();
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ MVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Value, Store;
+
+ Store = DAG.getStore(Chain, dl, LoadValues[i],
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff, false, DstAlign);
+ OutChains.push_back(Store);
+ DstOff += VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &OutChains[0], OutChains.size());
+}
+
+static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst,
+ SDValue Src, uint64_t Size,
+ unsigned Align,
+ const Value *DstSV, uint64_t DstSVOff) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Expand memset to a series of load/store ops if the size operand
+ // falls below a certain threshold.
+ std::vector<MVT> MemOps;
+ std::string Str;
+ bool CopyFromStr;
+ if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, TLI.getMaxStoresPerMemset(),
+ Size, Align, Str, CopyFromStr, DAG, TLI))
+ return SDValue();
+
+ SmallVector<SDValue, 8> OutChains;
+ uint64_t DstOff = 0;
+
+ unsigned NumMemOps = MemOps.size();
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ MVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Value = getMemsetValue(Src, VT, DAG, dl);
+ SDValue Store = DAG.getStore(Chain, dl, Value,
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff);
+ OutChains.push_back(Store);
+ DstOff += VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &OutChains[0], OutChains.size());
+}
+
+SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
+ SDValue Src, SDValue Size,
+ unsigned Align, bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff) {
+
+ // Check to see if we should lower the memcpy to loads and stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memcpy with size zero? Just return the original chain.
+ if (ConstantSize->isNullValue())
+ return Chain;
+
+ SDValue Result =
+ getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(),
+ Align, false, DstSV, DstSVOff, SrcSV, SrcSVOff);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memcpy with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ SDValue Result =
+ TLI.EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align,
+ AlwaysInline,
+ DstSV, DstSVOff, SrcSV, SrcSVOff);
+ if (Result.getNode())
+ return Result;
+
+ // If we really need inline code and the target declined to provide it,
+ // use a (potentially long) sequence of loads and stores.
+ if (AlwaysInline) {
+ assert(ConstantSize && "AlwaysInline requires a constant size!");
+ return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(), Align, true,
+ DstSV, DstSVOff, SrcSV, SrcSVOff);
+ }
+
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = TLI.getTargetData()->getIntPtrType();
+ Entry.Node = Dst; Args.push_back(Entry);
+ Entry.Node = Src; Args.push_back(Entry);
+ Entry.Node = Size; Args.push_back(Entry);
+ // FIXME: pass in DebugLoc
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(Chain, Type::VoidTy,
+ false, false, false, false, CallingConv::C, false,
+ getExternalSymbol("memcpy", TLI.getPointerTy()),
+ Args, *this, dl);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst,
+ SDValue Src, SDValue Size,
+ unsigned Align,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff) {
+
+ // Check to see if we should lower the memmove to loads and stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memmove with size zero? Just return the original chain.
+ if (ConstantSize->isNullValue())
+ return Chain;
+
+ SDValue Result =
+ getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(),
+ Align, false, DstSV, DstSVOff, SrcSV, SrcSVOff);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memmove with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ SDValue Result =
+ TLI.EmitTargetCodeForMemmove(*this, dl, Chain, Dst, Src, Size, Align,
+ DstSV, DstSVOff, SrcSV, SrcSVOff);
+ if (Result.getNode())
+ return Result;
+
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = TLI.getTargetData()->getIntPtrType();
+ Entry.Node = Dst; Args.push_back(Entry);
+ Entry.Node = Src; Args.push_back(Entry);
+ Entry.Node = Size; Args.push_back(Entry);
+ // FIXME: pass in DebugLoc
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(Chain, Type::VoidTy,
+ false, false, false, false, CallingConv::C, false,
+ getExternalSymbol("memmove", TLI.getPointerTy()),
+ Args, *this, dl);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
+ SDValue Src, SDValue Size,
+ unsigned Align,
+ const Value *DstSV, uint64_t DstSVOff) {
+
+ // Check to see if we should lower the memset to stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memset with size zero? Just return the original chain.
+ if (ConstantSize->isNullValue())
+ return Chain;
+
+ SDValue Result =
+ getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
+ Align, DstSV, DstSVOff);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memset with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ SDValue Result =
+ TLI.EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src, Size, Align,
+ DstSV, DstSVOff);
+ if (Result.getNode())
+ return Result;
+
+ // Emit a library call.
+ const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst; Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ // Extend or truncate the argument to be an i32 value for the call.
+ if (Src.getValueType().bitsGT(MVT::i32))
+ Src = getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+ else
+ Src = getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+ Entry.Node = Src; Entry.Ty = Type::Int32Ty; Entry.isSExt = true;
+ Args.push_back(Entry);
+ Entry.Node = Size; Entry.Ty = IntPtrTy; Entry.isSExt = false;
+ Args.push_back(Entry);
+ // FIXME: pass in DebugLoc
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(Chain, Type::VoidTy,
+ false, false, false, false, CallingConv::C, false,
+ getExternalSymbol("memset", TLI.getPointerTy()),
+ Args, *this, dl);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, MVT MemVT,
+ SDValue Chain,
+ SDValue Ptr, SDValue Cmp,
+ SDValue Swp, const Value* PtrVal,
+ unsigned Alignment) {
+ assert(Opcode == ISD::ATOMIC_CMP_SWAP && "Invalid Atomic Op");
+ assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+
+ MVT VT = Cmp.getValueType();
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getMVTAlignment(MemVT);
+
+ SDVTList VTs = getVTList(VT, MVT::Other);
+ FoldingSetNodeID ID;
+ ID.AddInteger(MemVT.getRawBits());
+ SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
+ AddNodeIDNode(ID, Opcode, VTs, Ops, 4);
+ void* IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode* N = NodeAllocator.Allocate<AtomicSDNode>();
+ new (N) AtomicSDNode(Opcode, dl, VTs, MemVT,
+ Chain, Ptr, Cmp, Swp, PtrVal, Alignment);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
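+
+// Illustrative sketch (assumed caller context): creating a 32-bit
+// compare-and-swap. Result 0 is the value loaded from memory, result 1 is
+// the output chain; alignment 0 requests the memory type's natural
+// alignment, as ensured above.
+//
+//   SDValue CAS = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, MVT::i32,
+//                               Chain, Ptr, Cmp, Swp, PtrVal, 0);
+//   SDValue Loaded   = CAS.getValue(0);
+//   SDValue OutChain = CAS.getValue(1);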
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, MVT MemVT,
+ SDValue Chain,
+ SDValue Ptr, SDValue Val,
+ const Value* PtrVal,
+ unsigned Alignment) {
+ assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
+ Opcode == ISD::ATOMIC_LOAD_SUB ||
+ Opcode == ISD::ATOMIC_LOAD_AND ||
+ Opcode == ISD::ATOMIC_LOAD_OR ||
+ Opcode == ISD::ATOMIC_LOAD_XOR ||
+ Opcode == ISD::ATOMIC_LOAD_NAND ||
+ Opcode == ISD::ATOMIC_LOAD_MIN ||
+ Opcode == ISD::ATOMIC_LOAD_MAX ||
+ Opcode == ISD::ATOMIC_LOAD_UMIN ||
+ Opcode == ISD::ATOMIC_LOAD_UMAX ||
+ Opcode == ISD::ATOMIC_SWAP) &&
+ "Invalid Atomic Op");
+
+ MVT VT = Val.getValueType();
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getMVTAlignment(MemVT);
+
+ SDVTList VTs = getVTList(VT, MVT::Other);
+ FoldingSetNodeID ID;
+ ID.AddInteger(MemVT.getRawBits());
+ SDValue Ops[] = {Chain, Ptr, Val};
+ AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
+ void* IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode* N = NodeAllocator.Allocate<AtomicSDNode>();
+ new (N) AtomicSDNode(Opcode, dl, VTs, MemVT,
+ Chain, Ptr, Val, PtrVal, Alignment);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+/// getMergeValues - Create a MERGE_VALUES node from the given operands.
+/// If only a single operand is given, that operand is returned directly.
+SDValue SelectionDAG::getMergeValues(const SDValue *Ops, unsigned NumOps,
+ DebugLoc dl) {
+ if (NumOps == 1)
+ return Ops[0];
+
+ SmallVector<MVT, 4> VTs;
+ VTs.reserve(NumOps);
+ for (unsigned i = 0; i < NumOps; ++i)
+ VTs.push_back(Ops[i].getValueType());
+ return getNode(ISD::MERGE_VALUES, dl, getVTList(&VTs[0], NumOps),
+ Ops, NumOps);
+}
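+
+// For instance (hypothetical caller code), packaging a value and a chain
+// into a single multi-result node:
+//
+//   SDValue Parts[] = { LoadedVal, OutChain };
+//   SDValue Merged = DAG.getMergeValues(Parts, 2, dl);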
+
+SDValue
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl,
+ const MVT *VTs, unsigned NumVTs,
+ const SDValue *Ops, unsigned NumOps,
+ MVT MemVT, const Value *srcValue, int SVOff,
+ unsigned Align, bool Vol,
+ bool ReadMem, bool WriteMem) {
+ return getMemIntrinsicNode(Opcode, dl, makeVTList(VTs, NumVTs), Ops, NumOps,
+ MemVT, srcValue, SVOff, Align, Vol,
+ ReadMem, WriteMem);
+}
+
+SDValue
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
+ const SDValue *Ops, unsigned NumOps,
+ MVT MemVT, const Value *srcValue, int SVOff,
+ unsigned Align, bool Vol,
+ bool ReadMem, bool WriteMem) {
+ // Memoize the node unless it returns a flag.
+ MemIntrinsicSDNode *N;
+ if (VTList.VTs[VTList.NumVTs-1] != MVT::Flag) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ N = NodeAllocator.Allocate<MemIntrinsicSDNode>();
+ new (N) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps, MemVT,
+ srcValue, SVOff, Align, Vol, ReadMem, WriteMem);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = NodeAllocator.Allocate<MemIntrinsicSDNode>();
+ new (N) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps, MemVT,
+ srcValue, SVOff, Align, Vol, ReadMem, WriteMem);
+ }
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue
+SelectionDAG::getCall(unsigned CallingConv, DebugLoc dl, bool IsVarArgs,
+ bool IsTailCall, bool IsInreg, SDVTList VTs,
+ const SDValue *Operands, unsigned NumOperands) {
+ // Do not include isTailCall in the folding set profile.
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::CALL, VTs, Operands, NumOperands);
+ ID.AddInteger(CallingConv);
+ ID.AddInteger(IsVarArgs);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+ // Instead of including isTailCall in the folding set, we just
+ // set the flag of the existing node.
+ if (!IsTailCall)
+ cast<CallSDNode>(E)->setNotTailCall();
+ return SDValue(E, 0);
+ }
+ SDNode *N = NodeAllocator.Allocate<CallSDNode>();
+ new (N) CallSDNode(CallingConv, dl, IsVarArgs, IsTailCall, IsInreg,
+ VTs, Operands, NumOperands);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue
+SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl,
+ ISD::LoadExtType ExtType, MVT VT, SDValue Chain,
+ SDValue Ptr, SDValue Offset,
+ const Value *SV, int SVOffset, MVT EVT,
+ bool isVolatile, unsigned Alignment) {
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getMVTAlignment(VT);
+
+ if (VT == EVT) {
+ ExtType = ISD::NON_EXTLOAD;
+ } else if (ExtType == ISD::NON_EXTLOAD) {
+ assert(VT == EVT && "Non-extending load from different memory type!");
+ } else {
+ // Extending load.
+ if (VT.isVector())
+ assert(EVT.getVectorNumElements() == VT.getVectorNumElements() &&
+ "Invalid vector extload!");
+ else
+ assert(EVT.bitsLT(VT) &&
+ "Should only be an extending load, not truncating!");
+ assert((ExtType == ISD::EXTLOAD || VT.isInteger()) &&
+ "Cannot sign/zero extend a FP/Vector load!");
+ assert(VT.isInteger() == EVT.isInteger() &&
+ "Cannot convert from FP to Int or Int -> FP!");
+ }
+
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.getOpcode() == ISD::UNDEF) &&
+ "Unindexed load with an offset!");
+
+ SDVTList VTs = Indexed ?
+ getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
+ SDValue Ops[] = { Chain, Ptr, Offset };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::LOAD, VTs, Ops, 3);
+ ID.AddInteger(EVT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, isVolatile, Alignment));
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<LoadSDNode>();
+ new (N) LoadSDNode(Ops, dl, VTs, AM, ExtType, EVT, SV, SVOffset,
+ Alignment, isVolatile);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getLoad(MVT VT, DebugLoc dl,
+ SDValue Chain, SDValue Ptr,
+ const Value *SV, int SVOffset,
+ bool isVolatile, unsigned Alignment) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, dl, ISD::NON_EXTLOAD, VT, Chain, Ptr, Undef,
+ SV, SVOffset, VT, isVolatile, Alignment);
+}
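+
+// Minimal usage sketch (illustrative only; Chain, Ptr and SV are assumed
+// from context): a plain, non-volatile i32 load, with alignment 0
+// requesting the type's natural alignment.
+//
+//   SDValue Ld = DAG.getLoad(MVT::i32, dl, Chain, Ptr, SV, 0, false, 0);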
+
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, DebugLoc dl, MVT VT,
+ SDValue Chain, SDValue Ptr,
+ const Value *SV,
+ int SVOffset, MVT EVT,
+ bool isVolatile, unsigned Alignment) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, dl, ExtType, VT, Chain, Ptr, Undef,
+ SV, SVOffset, EVT, isVolatile, Alignment);
+}
+
+SDValue
+SelectionDAG::getIndexedLoad(SDValue OrigLoad, DebugLoc dl, SDValue Base,
+ SDValue Offset, ISD::MemIndexedMode AM) {
+ LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
+ assert(LD->getOffset().getOpcode() == ISD::UNDEF &&
+ "Load is already a indexed load!");
+ return getLoad(AM, dl, LD->getExtensionType(), OrigLoad.getValueType(),
+ LD->getChain(), Base, Offset, LD->getSrcValue(),
+ LD->getSrcValueOffset(), LD->getMemoryVT(),
+ LD->isVolatile(), LD->getAlignment());
+}
+
+SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
+ SDValue Ptr, const Value *SV, int SVOffset,
+ bool isVolatile, unsigned Alignment) {
+ MVT VT = Val.getValueType();
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getMVTAlignment(VT);
+
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED,
+ isVolatile, Alignment));
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
+ new (N) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED, false,
+ VT, SV, SVOffset, Alignment, isVolatile);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
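+
+// Illustrative counterpart to the load sketch above (names assumed from
+// context); alignment 0 again selects the value type's natural alignment.
+//
+//   SDValue St = DAG.getStore(OutChain, dl, Val, Ptr, SV, 0, false, 0);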
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
+ SDValue Ptr, const Value *SV,
+ int SVOffset, MVT SVT,
+ bool isVolatile, unsigned Alignment) {
+ MVT VT = Val.getValueType();
+
+ if (VT == SVT)
+ return getStore(Chain, dl, Val, Ptr, SV, SVOffset, isVolatile, Alignment);
+
+ assert(VT.bitsGT(SVT) && "Not a truncation?");
+ assert(VT.isInteger() == SVT.isInteger() &&
+ "Can't do FP-INT conversion!");
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getMVTAlignment(VT);
+
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+ ID.AddInteger(SVT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED,
+ isVolatile, Alignment));
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
+ new (N) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED, true,
+ SVT, SV, SVOffset, Alignment, isVolatile);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue
+SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
+ SDValue Offset, ISD::MemIndexedMode AM) {
+ StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
+ assert(ST->getOffset().getOpcode() == ISD::UNDEF &&
+ "Store is already a indexed store!");
+ SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
+ SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+ ID.AddInteger(ST->getMemoryVT().getRawBits());
+ ID.AddInteger(ST->getRawSubclassData());
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
+ new (N) StoreSDNode(Ops, dl, VTs, AM,
+ ST->isTruncatingStore(), ST->getMemoryVT(),
+ ST->getSrcValue(), ST->getSrcValueOffset(),
+ ST->getAlignment(), ST->isVolatile());
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getVAArg(MVT VT, DebugLoc dl,
+ SDValue Chain, SDValue Ptr,
+ SDValue SV) {
+ SDValue Ops[] = { Chain, Ptr, SV };
+ return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 3);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, MVT VT,
+ const SDUse *Ops, unsigned NumOps) {
+ switch (NumOps) {
+ case 0: return getNode(Opcode, DL, VT);
+ case 1: return getNode(Opcode, DL, VT, Ops[0]);
+ case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+ case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+ default: break;
+ }
+
+ // Copy from an SDUse array into an SDValue array for use with
+ // the regular getNode logic.
+ SmallVector<SDValue, 8> NewOps(Ops, Ops + NumOps);
+ return getNode(Opcode, DL, VT, &NewOps[0], NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, MVT VT,
+ const SDValue *Ops, unsigned NumOps) {
+ switch (NumOps) {
+ case 0: return getNode(Opcode, DL, VT);
+ case 1: return getNode(Opcode, DL, VT, Ops[0]);
+ case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+ case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+ default: break;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case ISD::SELECT_CC: {
+ assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
+ assert(Ops[0].getValueType() == Ops[1].getValueType() &&
+ "LHS and RHS of condition must have same type!");
+ assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+ "True and False arms of SelectCC must have same type!");
+ assert(Ops[2].getValueType() == VT &&
+ "select_cc node must be of same type as true and false value!");
+ break;
+ }
+ case ISD::BR_CC: {
+ assert(NumOps == 5 && "BR_CC takes 5 operands!");
+ assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+ "LHS/RHS of comparison should match types!");
+ break;
+ }
+ }
+
+ // Memoize nodes.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+
+ if (VT != MVT::Flag) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops, NumOps);
+ void *IP = 0;
+
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ N = NodeAllocator.Allocate<SDNode>();
+ new (N) SDNode(Opcode, DL, VTs, Ops, NumOps);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = NodeAllocator.Allocate<SDNode>();
+ new (N) SDNode(Opcode, DL, VTs, Ops, NumOps);
+ }
+
+ AllNodes.push_back(N);
+#ifndef NDEBUG
+ VerifyNode(N);
+#endif
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+ const std::vector<MVT> &ResultTys,
+ const SDValue *Ops, unsigned NumOps) {
+ return getNode(Opcode, DL, getVTList(&ResultTys[0], ResultTys.size()),
+ Ops, NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+ const MVT *VTs, unsigned NumVTs,
+ const SDValue *Ops, unsigned NumOps) {
+ if (NumVTs == 1)
+ return getNode(Opcode, DL, VTs[0], Ops, NumOps);
+ return getNode(Opcode, DL, makeVTList(VTs, NumVTs), Ops, NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+ const SDValue *Ops, unsigned NumOps) {
+ if (VTList.NumVTs == 1)
+ return getNode(Opcode, DL, VTList.VTs[0], Ops, NumOps);
+
+ switch (Opcode) {
+ // FIXME: figure out how to safely handle things like
+ // int foo(int x) { return 1 << (x & 255); }
+ // int bar() { return foo(256); }
+#if 0
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS:
+ case ISD::SHL_PARTS:
+ if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
+ return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+ else if (N3.getOpcode() == ISD::AND)
+ if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
+        // If the and is only masking out bits that cannot affect the shift,
+ // eliminate the and.
+ unsigned NumBits = VT.getSizeInBits()*2;
+ if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
+ return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+ }
+ break;
+#endif
+ }
+
+ // Memoize the node unless it returns a flag.
+ SDNode *N;
+ if (VTList.VTs[VTList.NumVTs-1] != MVT::Flag) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+ if (NumOps == 1) {
+ N = NodeAllocator.Allocate<UnarySDNode>();
+ new (N) UnarySDNode(Opcode, DL, VTList, Ops[0]);
+ } else if (NumOps == 2) {
+ N = NodeAllocator.Allocate<BinarySDNode>();
+ new (N) BinarySDNode(Opcode, DL, VTList, Ops[0], Ops[1]);
+ } else if (NumOps == 3) {
+ N = NodeAllocator.Allocate<TernarySDNode>();
+ new (N) TernarySDNode(Opcode, DL, VTList, Ops[0], Ops[1], Ops[2]);
+ } else {
+ N = NodeAllocator.Allocate<SDNode>();
+ new (N) SDNode(Opcode, DL, VTList, Ops, NumOps);
+ }
+ CSEMap.InsertNode(N, IP);
+ } else {
+ if (NumOps == 1) {
+ N = NodeAllocator.Allocate<UnarySDNode>();
+ new (N) UnarySDNode(Opcode, DL, VTList, Ops[0]);
+ } else if (NumOps == 2) {
+ N = NodeAllocator.Allocate<BinarySDNode>();
+ new (N) BinarySDNode(Opcode, DL, VTList, Ops[0], Ops[1]);
+ } else if (NumOps == 3) {
+ N = NodeAllocator.Allocate<TernarySDNode>();
+ new (N) TernarySDNode(Opcode, DL, VTList, Ops[0], Ops[1], Ops[2]);
+ } else {
+ N = NodeAllocator.Allocate<SDNode>();
+ new (N) SDNode(Opcode, DL, VTList, Ops, NumOps);
+ }
+ }
+ AllNodes.push_back(N);
+#ifndef NDEBUG
+ VerifyNode(N);
+#endif
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList) {
+ return getNode(Opcode, DL, VTList, 0, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+ SDValue N1) {
+ SDValue Ops[] = { N1 };
+ return getNode(Opcode, DL, VTList, Ops, 1);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+ SDValue N1, SDValue N2) {
+ SDValue Ops[] = { N1, N2 };
+ return getNode(Opcode, DL, VTList, Ops, 2);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3) {
+ SDValue Ops[] = { N1, N2, N3 };
+ return getNode(Opcode, DL, VTList, Ops, 3);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3,
+ SDValue N4) {
+ SDValue Ops[] = { N1, N2, N3, N4 };
+ return getNode(Opcode, DL, VTList, Ops, 4);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3,
+ SDValue N4, SDValue N5) {
+ SDValue Ops[] = { N1, N2, N3, N4, N5 };
+ return getNode(Opcode, DL, VTList, Ops, 5);
+}
+
+SDVTList SelectionDAG::getVTList(MVT VT) {
+ return makeVTList(SDNode::getValueTypeList(VT), 1);
+}
+
+SDVTList SelectionDAG::getVTList(MVT VT1, MVT VT2) {
+ for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+ E = VTList.rend(); I != E; ++I)
+ if (I->NumVTs == 2 && I->VTs[0] == VT1 && I->VTs[1] == VT2)
+ return *I;
+
+ MVT *Array = Allocator.Allocate<MVT>(2);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ SDVTList Result = makeVTList(Array, 2);
+ VTList.push_back(Result);
+ return Result;
+}
+
+SDVTList SelectionDAG::getVTList(MVT VT1, MVT VT2, MVT VT3) {
+ for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+ E = VTList.rend(); I != E; ++I)
+ if (I->NumVTs == 3 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
+ I->VTs[2] == VT3)
+ return *I;
+
+ MVT *Array = Allocator.Allocate<MVT>(3);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Array[2] = VT3;
+ SDVTList Result = makeVTList(Array, 3);
+ VTList.push_back(Result);
+ return Result;
+}
+
+SDVTList SelectionDAG::getVTList(MVT VT1, MVT VT2, MVT VT3, MVT VT4) {
+ for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+ E = VTList.rend(); I != E; ++I)
+ if (I->NumVTs == 4 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
+ I->VTs[2] == VT3 && I->VTs[3] == VT4)
+ return *I;
+
+  MVT *Array = Allocator.Allocate<MVT>(4);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Array[2] = VT3;
+ Array[3] = VT4;
+ SDVTList Result = makeVTList(Array, 4);
+ VTList.push_back(Result);
+ return Result;
+}
+
+SDVTList SelectionDAG::getVTList(const MVT *VTs, unsigned NumVTs) {
+ switch (NumVTs) {
+ case 0: assert(0 && "Cannot have nodes without results!");
+ case 1: return getVTList(VTs[0]);
+ case 2: return getVTList(VTs[0], VTs[1]);
+ case 3: return getVTList(VTs[0], VTs[1], VTs[2]);
+ default: break;
+ }
+
+ for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+ E = VTList.rend(); I != E; ++I) {
+ if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1])
+ continue;
+
+ bool NoMatch = false;
+ for (unsigned i = 2; i != NumVTs; ++i)
+ if (VTs[i] != I->VTs[i]) {
+ NoMatch = true;
+ break;
+ }
+ if (!NoMatch)
+ return *I;
+ }
+
+ MVT *Array = Allocator.Allocate<MVT>(NumVTs);
+ std::copy(VTs, VTs+NumVTs, Array);
+ SDVTList Result = makeVTList(Array, NumVTs);
+ VTList.push_back(Result);
+ return Result;
+}
+
+
+/// UpdateNodeOperands - *Mutate* the specified node in-place to have the
+/// specified operands. If the resultant node already exists in the DAG,
+/// this does not modify the specified node; instead it returns the node that
+/// already exists. If the resultant node does not exist in the DAG, the
+/// input node is returned. As a degenerate case, if you specify the same
+/// input operands as the node already has, the input node is returned.
+SDValue SelectionDAG::UpdateNodeOperands(SDValue InN, SDValue Op) {
+ SDNode *N = InN.getNode();
+ assert(N->getNumOperands() == 1 && "Update with wrong number of operands");
+
+ // Check to see if there is no change.
+ if (Op == N->getOperand(0)) return InN;
+
+ // See if the modified node already exists.
+ void *InsertPos = 0;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos))
+ return SDValue(Existing, InN.getResNo());
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = 0;
+
+ // Now we update the operands.
+ N->OperandList[0].set(Op);
+
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return InN;
+}
+
+SDValue SelectionDAG::
+UpdateNodeOperands(SDValue InN, SDValue Op1, SDValue Op2) {
+ SDNode *N = InN.getNode();
+ assert(N->getNumOperands() == 2 && "Update with wrong number of operands");
+
+ // Check to see if there is no change.
+ if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
+ return InN; // No operands changed, just return the input node.
+
+ // See if the modified node already exists.
+ void *InsertPos = 0;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
+ return SDValue(Existing, InN.getResNo());
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = 0;
+
+ // Now we update the operands.
+ if (N->OperandList[0] != Op1)
+ N->OperandList[0].set(Op1);
+ if (N->OperandList[1] != Op2)
+ N->OperandList[1].set(Op2);
+
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return InN;
+}
+
+SDValue SelectionDAG::
+UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2, SDValue Op3) {
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return UpdateNodeOperands(N, Ops, 3);
+}
+
+SDValue SelectionDAG::
+UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2,
+ SDValue Op3, SDValue Op4) {
+ SDValue Ops[] = { Op1, Op2, Op3, Op4 };
+ return UpdateNodeOperands(N, Ops, 4);
+}
+
+SDValue SelectionDAG::
+UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2,
+ SDValue Op3, SDValue Op4, SDValue Op5) {
+ SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 };
+ return UpdateNodeOperands(N, Ops, 5);
+}
+
+SDValue SelectionDAG::
+UpdateNodeOperands(SDValue InN, const SDValue *Ops, unsigned NumOps) {
+ SDNode *N = InN.getNode();
+ assert(N->getNumOperands() == NumOps &&
+ "Update with wrong number of operands");
+
+ // Check to see if there is no change.
+ bool AnyChange = false;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ if (Ops[i] != N->getOperand(i)) {
+ AnyChange = true;
+ break;
+ }
+ }
+
+ // No operands changed, just return the input node.
+ if (!AnyChange) return InN;
+
+ // See if the modified node already exists.
+ void *InsertPos = 0;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, NumOps, InsertPos))
+ return SDValue(Existing, InN.getResNo());
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = 0;
+
+ // Now we update the operands.
+ for (unsigned i = 0; i != NumOps; ++i)
+ if (N->OperandList[i] != Ops[i])
+ N->OperandList[i].set(Ops[i]);
+
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return InN;
+}
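+
+// Hedged usage sketch: re-pointing a binary node at new operands. If an
+// identical node already exists, the CSE'd node is returned instead of
+// InN, so callers must always use the returned value (names hypothetical):
+//
+//   SDValue Upd = DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS);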
+
+/// DropOperands - Release the operands and set this node to have
+/// zero operands.
+void SDNode::DropOperands() {
+ // Unlike the code in MorphNodeTo that does this, we don't need to
+ // watch for dead nodes here.
+ for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ Use.set(SDValue());
+ }
+}
+
+/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
+/// machine opcode.
+///
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT) {
+ SDVTList VTs = getVTList(VT);
+ return SelectNodeTo(N, MachineOpc, VTs, 0, 0);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT, SDValue Op1) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, 1);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT, SDValue Op1,
+ SDValue Op2) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, 2);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT, SDValue Op1,
+ SDValue Op2, SDValue Op3) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT, const SDValue *Ops,
+ unsigned NumOps) {
+ SDVTList VTs = getVTList(VT);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT1, MVT VT2, const SDValue *Ops,
+ unsigned NumOps) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT1, MVT VT2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return SelectNodeTo(N, MachineOpc, VTs, (SDValue *)0, 0);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT1, MVT VT2, MVT VT3,
+ const SDValue *Ops, unsigned NumOps) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT1, MVT VT2, MVT VT3, MVT VT4,
+ const SDValue *Ops, unsigned NumOps) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT1, MVT VT2,
+ SDValue Op1) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, 1);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT1, MVT VT2,
+ SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, 2);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT1, MVT VT2,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ MVT VT1, MVT VT2, MVT VT3,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ SDVTList VTs, const SDValue *Ops,
+ unsigned NumOps) {
+ return MorphNodeTo(N, ~MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT) {
+ SDVTList VTs = getVTList(VT);
+ return MorphNodeTo(N, Opc, VTs, 0, 0);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT, SDValue Op1) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1 };
+ return MorphNodeTo(N, Opc, VTs, Ops, 1);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT, SDValue Op1,
+ SDValue Op2) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2 };
+ return MorphNodeTo(N, Opc, VTs, Ops, 2);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT, SDValue Op1,
+ SDValue Op2, SDValue Op3) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return MorphNodeTo(N, Opc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT, const SDValue *Ops,
+ unsigned NumOps) {
+ SDVTList VTs = getVTList(VT);
+ return MorphNodeTo(N, Opc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT1, MVT VT2, const SDValue *Ops,
+ unsigned NumOps) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return MorphNodeTo(N, Opc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT1, MVT VT2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return MorphNodeTo(N, Opc, VTs, (SDValue *)0, 0);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT1, MVT VT2, MVT VT3,
+ const SDValue *Ops, unsigned NumOps) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ return MorphNodeTo(N, Opc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT1, MVT VT2,
+ SDValue Op1) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1 };
+ return MorphNodeTo(N, Opc, VTs, Ops, 1);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT1, MVT VT2,
+ SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2 };
+ return MorphNodeTo(N, Opc, VTs, Ops, 2);
+}
+
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ MVT VT1, MVT VT2,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return MorphNodeTo(N, Opc, VTs, Ops, 3);
+}
+
+/// MorphNodeTo - These *mutate* the specified node to have the specified
+/// return type, opcode, and operands.
+///
+/// Note that MorphNodeTo returns the resultant node. If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one. Note that the DebugLoc need not be the same.
+///
+/// Using MorphNodeTo is faster than creating a new node and swapping it in
+/// with ReplaceAllUsesWith both because it often avoids allocating a new
+/// node, and because it doesn't require CSE recalculation for any of
+/// the node's users.
+///
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ SDVTList VTs, const SDValue *Ops,
+ unsigned NumOps) {
+ // If an identical node already exists, use it.
+ void *IP = 0;
+ if (VTs.VTs[VTs.NumVTs-1] != MVT::Flag) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, VTs, Ops, NumOps);
+ if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return ON;
+ }
+
+ if (!RemoveNodeFromCSEMaps(N))
+ IP = 0;
+
+ // Start the morphing.
+ N->NodeType = Opc;
+ N->ValueList = VTs.VTs;
+ N->NumValues = VTs.NumVTs;
+
+ // Clear the operands list, updating used nodes to remove this from their
+ // use list. Keep track of any operands that become dead as a result.
+ SmallPtrSet<SDNode*, 16> DeadNodeSet;
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ SDNode *Used = Use.getNode();
+ Use.set(SDValue());
+ if (Used->use_empty())
+ DeadNodeSet.insert(Used);
+ }
+
+ // If NumOps is larger than the # of operands we currently have, reallocate
+ // the operand list.
+ if (NumOps > N->NumOperands) {
+ if (N->OperandsNeedDelete)
+ delete[] N->OperandList;
+
+ if (N->isMachineOpcode()) {
+ // We're creating a final node that will live unmorphed for the
+ // remainder of the current SelectionDAG iteration, so we can allocate
+ // the operands directly out of a pool with no recycling metadata.
+ N->OperandList = OperandAllocator.Allocate<SDUse>(NumOps);
+ N->OperandsNeedDelete = false;
+ } else {
+ N->OperandList = new SDUse[NumOps];
+ N->OperandsNeedDelete = true;
+ }
+ }
+
+ // Assign the new operands.
+ N->NumOperands = NumOps;
+ for (unsigned i = 0, e = NumOps; i != e; ++i) {
+ N->OperandList[i].setUser(N);
+ N->OperandList[i].setInitial(Ops[i]);
+ }
+
+ // Delete any nodes that are still dead after adding the uses for the
+ // new operands.
+ SmallVector<SDNode *, 16> DeadNodes;
+ for (SmallPtrSet<SDNode *, 16>::iterator I = DeadNodeSet.begin(),
+ E = DeadNodeSet.end(); I != E; ++I)
+ if ((*I)->use_empty())
+ DeadNodes.push_back(*I);
+ RemoveDeadNodes(DeadNodes);
+
+ if (IP)
+ CSEMap.InsertNode(N, IP); // Memoize the new node.
+ return N;
+}
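+
+// Illustrative only: instruction selection normally reaches this routine
+// through the SelectNodeTo wrappers above, which bitwise-complement the
+// machine opcode before delegating, e.g. (hypothetical target opcode and
+// operands):
+//
+//   SDNode *New = CurDAG->SelectNodeTo(N, TargetOpc, MVT::i32, Op0, Op1);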
+
+
+/// getTargetNode - These are used for target selectors to create a new node
+/// with specified return type(s), target opcode, and operands.
+///
+/// Note that getTargetNode returns the resultant node. If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one.
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT) {
+ return getNode(~Opcode, dl, VT).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT,
+ SDValue Op1) {
+ return getNode(~Opcode, dl, VT, Op1).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT,
+ SDValue Op1, SDValue Op2) {
+ return getNode(~Opcode, dl, VT, Op1, Op2).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ return getNode(~Opcode, dl, VT, Op1, Op2, Op3).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT,
+ const SDValue *Ops, unsigned NumOps) {
+ return getNode(~Opcode, dl, VT, Ops, NumOps).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl,
+ MVT VT1, MVT VT2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+  return getNode(~Opcode, dl, VTs, (SDValue *)0, 0).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT1,
+ MVT VT2, SDValue Op1) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return getNode(~Opcode, dl, VTs, &Op1, 1).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT1,
+ MVT VT2, SDValue Op1,
+ SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2 };
+ return getNode(~Opcode, dl, VTs, Ops, 2).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT1,
+ MVT VT2, SDValue Op1,
+ SDValue Op2, SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return getNode(~Opcode, dl, VTs, Ops, 3).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl,
+ MVT VT1, MVT VT2,
+ const SDValue *Ops, unsigned NumOps) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return getNode(~Opcode, dl, VTs, Ops, NumOps).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl,
+ MVT VT1, MVT VT2, MVT VT3,
+ SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ SDValue Ops[] = { Op1, Op2 };
+ return getNode(~Opcode, dl, VTs, Ops, 2).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl,
+ MVT VT1, MVT VT2, MVT VT3,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return getNode(~Opcode, dl, VTs, Ops, 3).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl,
+ MVT VT1, MVT VT2, MVT VT3,
+ const SDValue *Ops, unsigned NumOps) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ return getNode(~Opcode, dl, VTs, Ops, NumOps).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl, MVT VT1,
+ MVT VT2, MVT VT3, MVT VT4,
+ const SDValue *Ops, unsigned NumOps) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
+ return getNode(~Opcode, dl, VTs, Ops, NumOps).getNode();
+}
+
+SDNode *SelectionDAG::getTargetNode(unsigned Opcode, DebugLoc dl,
+ const std::vector<MVT> &ResultTys,
+ const SDValue *Ops, unsigned NumOps) {
+ return getNode(~Opcode, dl, ResultTys, Ops, NumOps).getNode();
+}
+
+/// getNodeIfExists - Get the specified node if it's already available, or
+/// else return NULL.
+SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
+ const SDValue *Ops, unsigned NumOps) {
+ if (VTList.VTs[VTList.NumVTs-1] != MVT::Flag) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return E;
+ }
+ return NULL;
+}
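+
+// Sketch (assumed caller context): probing for an existing node without
+// creating one, e.g. in a combine that must not introduce new nodes:
+//
+//   SDValue Ops[] = { LHS, RHS };
+//   if (SDNode *E = DAG.getNodeIfExists(ISD::ADD, DAG.getVTList(VT),
+//                                       Ops, 2))
+//     /* reuse SDValue(E, 0) */;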
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes From has a single result value.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To,
+ DAGUpdateListener *UpdateListener) {
+ SDNode *From = FromN.getNode();
+ assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
+ "Cannot replace with this method!");
+ assert(From != To.getNode() && "Cannot replace uses of with self");
+
+ // Iterate over all the existing uses of From. New uses will be added
+ // to the beginning of the use list, which we avoid visiting.
+ // This specifically avoids visiting uses of From that arise while the
+ // replacement is happening, because any such uses would be the result
+ // of CSE: If an existing node looks like From after one of its operands
+  // is replaced by To, we don't want to replace all of its uses with To
+  // as well. See PR3018 for more info.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+ ++UI;
+ Use.set(To);
+ } while (UI != UE && *UI == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User, UpdateListener);
+ }
+}
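+
+// For example (hypothetical combine): replace every use of a node's only
+// result with a simplified value, passing no update listener:
+//
+//   DAG.ReplaceAllUsesWith(SDValue(N, 0), Simplified, 0);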
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes that for each value of From, there is a
+/// corresponding value in To in the same position with the same type.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To,
+ DAGUpdateListener *UpdateListener) {
+#ifndef NDEBUG
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+ assert((!From->hasAnyUseOfValue(i) ||
+ From->getValueType(i) == To->getValueType(i)) &&
+ "Cannot use this version of ReplaceAllUsesWith!");
+#endif
+
+ // Handle the trivial case.
+ if (From == To)
+ return;
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+ ++UI;
+ Use.setNode(To);
+ } while (UI != UE && *UI == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User, UpdateListener);
+ }
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version can replace From with any result values. To must match the
+/// number and types of values returned by From.
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From,
+ const SDValue *To,
+ DAGUpdateListener *UpdateListener) {
+ if (From->getNumValues() == 1) // Handle the simple case efficiently.
+ return ReplaceAllUsesWith(SDValue(From, 0), To[0], UpdateListener);
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+ const SDValue &ToOp = To[Use.getResNo()];
+ ++UI;
+ Use.set(ToOp);
+ } while (UI != UE && *UI == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User, UpdateListener);
+ }
+}
+
+/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone. Deleted and
+/// modified nodes are reported through the optional UpdateListener, as in
+/// ReplaceAllUsesWith.
+void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To,
+ DAGUpdateListener *UpdateListener){
+ // Handle the really simple, really trivial case efficiently.
+ if (From == To) return;
+
+ // Handle the simple, trivial, case efficiently.
+ if (From.getNode()->getNumValues() == 1) {
+ ReplaceAllUsesWith(From, To, UpdateListener);
+ return;
+ }
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From.getNode()->use_begin(),
+ UE = From.getNode()->use_end();
+ while (UI != UE) {
+ SDNode *User = *UI;
+ bool UserRemovedFromCSEMaps = false;
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+
+ // Skip uses of different values from the same node.
+ if (Use.getResNo() != From.getResNo()) {
+ ++UI;
+ continue;
+ }
+
+ // If this node hasn't been modified yet, it's still in the CSE maps,
+ // so remove its old self from the CSE maps.
+ if (!UserRemovedFromCSEMaps) {
+ RemoveNodeFromCSEMaps(User);
+ UserRemovedFromCSEMaps = true;
+ }
+
+ ++UI;
+ Use.set(To);
+ } while (UI != UE && *UI == User);
+
+ // We are iterating over all uses of the From node, so if a use
+ // doesn't use the specific value, no changes are made.
+ if (!UserRemovedFromCSEMaps)
+ continue;
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User, UpdateListener);
+ }
+}
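+
+// Sketch (names assumed): redirect only result 1 of a multi-result node,
+// e.g. its chain, leaving uses of its other results untouched:
+//
+//   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewChain, 0);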
+
+namespace {
+ /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith
+ /// to record information about a use.
+ struct UseMemo {
+ SDNode *User;
+ unsigned Index;
+ SDUse *Use;
+ };
+
+ /// operator< - Sort Memos by User.
+ bool operator<(const UseMemo &L, const UseMemo &R) {
+ return (intptr_t)L.User < (intptr_t)R.User;
+ }
+}
+
+/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone. The same value
+/// may appear in both the From and To list. Deleted and modified nodes
+/// are reported through the optional UpdateListener, as in
+/// ReplaceAllUsesWith.
+void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
+ const SDValue *To,
+ unsigned Num,
+ DAGUpdateListener *UpdateListener){
+ // Handle the simple, trivial case efficiently.
+ if (Num == 1)
+ return ReplaceAllUsesOfValueWith(*From, *To, UpdateListener);
+
+ // Read up all the uses and make records of them. This helps
+ // processing new uses that are introduced during the
+ // replacement process.
+ SmallVector<UseMemo, 4> Uses;
+ for (unsigned i = 0; i != Num; ++i) {
+ unsigned FromResNo = From[i].getResNo();
+ SDNode *FromNode = From[i].getNode();
+ for (SDNode::use_iterator UI = FromNode->use_begin(),
+ E = FromNode->use_end(); UI != E; ++UI) {
+ SDUse &Use = UI.getUse();
+ if (Use.getResNo() == FromResNo) {
+ UseMemo Memo = { *UI, i, &Use };
+ Uses.push_back(Memo);
+ }
+ }
+ }
+
+ // Sort the uses, so that all the uses from a given User are together.
+ std::sort(Uses.begin(), Uses.end());
+
+ for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
+ UseIndex != UseIndexEnd; ) {
+ // We know that this user uses some value of From. If it is the right
+ // value, update it.
+ SDNode *User = Uses[UseIndex].User;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // The Uses array is sorted, so all the uses for a given User
+ // are next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ unsigned i = Uses[UseIndex].Index;
+ SDUse &Use = *Uses[UseIndex].Use;
+ ++UseIndex;
+
+ Use.set(To[i]);
+ } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User, UpdateListener);
+ }
+}
+
+/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
+/// based on their topological order. It returns the number of nodes assigned
+/// (one past the maximum id) and leaves the AllNodes list itself sorted into
+/// topological order.
+unsigned SelectionDAG::AssignTopologicalOrder() {
+
+ unsigned DAGSize = 0;
+
+ // SortedPos tracks the progress of the algorithm. Nodes before it are
+ // sorted, nodes after it are unsorted. When the algorithm completes
+ // it is at the end of the list.
+ allnodes_iterator SortedPos = allnodes_begin();
+
+ // Visit all the nodes. Move nodes with no operands to the front of
+ // the list immediately. Annotate nodes that do have operands with their
+ // operand count. Before we do this, the Node Id fields of the nodes
+ // may contain arbitrary values. After, the Node Id fields for nodes
+ // before SortedPos will contain the topological sort index, and the
+  // Node Id fields for nodes at SortedPos and after will contain the
+ // count of outstanding operands.
+ for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
+ SDNode *N = I++;
+ unsigned Degree = N->getNumOperands();
+ if (Degree == 0) {
+      // A node with no operands is trivially sorted; move it to the front.
+ N->setNodeId(DAGSize++);
+ allnodes_iterator Q = N;
+ if (Q != SortedPos)
+ SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
+ ++SortedPos;
+ } else {
+ // Temporarily use the Node Id as scratch space for the degree count.
+ N->setNodeId(Degree);
+ }
+ }
+
+  // Visit all the nodes. As we iterate, move nodes into sorted order,
+  // such that by the time the end is reached all nodes will be sorted.
+ for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) {
+ SDNode *N = I;
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI) {
+ SDNode *P = *UI;
+ unsigned Degree = P->getNodeId();
+ --Degree;
+ if (Degree == 0) {
+        // All of P's operands are sorted, so P may be sorted now.
+ P->setNodeId(DAGSize++);
+ if (P != SortedPos)
+ SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
+ ++SortedPos;
+ } else {
+ // Update P's outstanding operand count.
+ P->setNodeId(Degree);
+ }
+ }
+ }
+
+ assert(SortedPos == AllNodes.end() &&
+ "Topological sort incomplete!");
+ assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
+ "First node in topological sort is not the entry token!");
+ assert(AllNodes.front().getNodeId() == 0 &&
+ "First node in topological sort has non-zero id!");
+ assert(AllNodes.front().getNumOperands() == 0 &&
+ "First node in topological sort has operands!");
+  assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
+         "Last node in topological sort has unexpected id!");
+  assert(AllNodes.back().use_empty() &&
+         "Last node in topological sort has users!");
+ assert(DAGSize == allnodes_size() && "Node count mismatch!");
+ return DAGSize;
+}
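+
+// After this runs, a forward walk over the node list visits every operand
+// before its users (illustrative; 'visit' is a hypothetical callback):
+//
+//   DAG.AssignTopologicalOrder();
+//   for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+//        E = DAG.allnodes_end(); I != E; ++I)
+//     visit(I);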
+
+
+
+//===----------------------------------------------------------------------===//
+// SDNode Class
+//===----------------------------------------------------------------------===//
+
+HandleSDNode::~HandleSDNode() {
+ DropOperands();
+}
+
+GlobalAddressSDNode::GlobalAddressSDNode(bool isTarget, const GlobalValue *GA,
+ MVT VT, int64_t o)
+ : SDNode(isa<GlobalVariable>(GA) &&
+ cast<GlobalVariable>(GA)->isThreadLocal() ?
+ // Thread Local
+ (isTarget ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress) :
+ // Non Thread Local
+ (isTarget ? ISD::TargetGlobalAddress : ISD::GlobalAddress),
+ DebugLoc::getUnknownLoc(), getSDVTList(VT)), Offset(o) {
+ TheGlobal = const_cast<GlobalValue*>(GA);
+}
+
+MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs, MVT memvt,
+ const Value *srcValue, int SVO,
+ unsigned alignment, bool vol)
+ : SDNode(Opc, dl, VTs), MemoryVT(memvt), SrcValue(srcValue), SVOffset(SVO) {
+ SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, vol, alignment);
+ assert(isPowerOf2_32(alignment) && "Alignment is not a power of 2!");
+ assert(getAlignment() == alignment && "Alignment representation error!");
+ assert(isVolatile() == vol && "Volatile representation error!");
+}
+
+MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs,
+ const SDValue *Ops,
+ unsigned NumOps, MVT memvt, const Value *srcValue,
+ int SVO, unsigned alignment, bool vol)
+ : SDNode(Opc, dl, VTs, Ops, NumOps),
+ MemoryVT(memvt), SrcValue(srcValue), SVOffset(SVO) {
+ SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, vol, alignment);
+ assert(isPowerOf2_32(alignment) && "Alignment is not a power of 2!");
+ assert(getAlignment() == alignment && "Alignment representation error!");
+ assert(isVolatile() == vol && "Volatile representation error!");
+}
+
+/// getMemOperand - Return a MachineMemOperand object describing the memory
+/// reference performed by this memory access.
+MachineMemOperand MemSDNode::getMemOperand() const {
+ int Flags = 0;
+ if (isa<LoadSDNode>(this))
+ Flags = MachineMemOperand::MOLoad;
+ else if (isa<StoreSDNode>(this))
+ Flags = MachineMemOperand::MOStore;
+ else if (isa<AtomicSDNode>(this)) {
+ Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ }
+ else {
+ const MemIntrinsicSDNode* MemIntrinNode = dyn_cast<MemIntrinsicSDNode>(this);
+ assert(MemIntrinNode && "Unknown MemSDNode opcode!");
+ if (MemIntrinNode->readMem()) Flags |= MachineMemOperand::MOLoad;
+ if (MemIntrinNode->writeMem()) Flags |= MachineMemOperand::MOStore;
+ }
+
+ int Size = (getMemoryVT().getSizeInBits() + 7) >> 3;
+ if (isVolatile()) Flags |= MachineMemOperand::MOVolatile;
+
+ // Check if the memory reference references a frame index
+ const FrameIndexSDNode *FI =
+ dyn_cast<const FrameIndexSDNode>(getBasePtr().getNode());
+ if (!getSrcValue() && FI)
+ return MachineMemOperand(PseudoSourceValue::getFixedStack(FI->getIndex()),
+ Flags, 0, Size, getAlignment());
+ else
+ return MachineMemOperand(getSrcValue(), Flags, getSrcValueOffset(),
+ Size, getAlignment());
+}
+
+/// Profile - Gather unique data for the node.
+///
+void SDNode::Profile(FoldingSetNodeID &ID) const {
+ AddNodeIDNode(ID, this);
+}
+
+/// getValueTypeList - Return a pointer to the specified value type.
+///
+const MVT *SDNode::getValueTypeList(MVT VT) {
+ if (VT.isExtended()) {
+ static std::set<MVT, MVT::compareRawBits> EVTs;
+ return &(*EVTs.insert(VT).first);
+ } else {
+ static MVT VTs[MVT::LAST_VALUETYPE];
+ VTs[VT.getSimpleVT()] = VT;
+ return &VTs[VT.getSimpleVT()];
+ }
+}
+
+/// hasNUsesOfValue - Return true if there are exactly NUSES uses of the
+/// indicated value. This method ignores uses of other values defined by this
+/// operation.
+bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
+ assert(Value < getNumValues() && "Bad value!");
+
+ // TODO: Only iterate over uses of a given value of the node
+ for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+ if (UI.getUse().getResNo() == Value) {
+ if (NUses == 0)
+ return false;
+ --NUses;
+ }
+ }
+
+ // Found exactly the right number of uses?
+ return NUses == 0;
+}
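+
+// E.g., checking that exactly one use consumes a load's value result
+// (result 0) while ignoring users of its chain result (illustrative):
+//
+//   if (Ld->hasNUsesOfValue(1, 0)) { /* ... */ }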
+
+
+/// hasAnyUseOfValue - Return true if there are any uses of the indicated
+/// value. This method ignores uses of other values defined by this operation.
+bool SDNode::hasAnyUseOfValue(unsigned Value) const {
+ assert(Value < getNumValues() && "Bad value!");
+
+ for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI)
+ if (UI.getUse().getResNo() == Value)
+ return true;
+
+ return false;
+}
+
+
+/// isOnlyUserOf - Return true if this node is the only use of N.
+///
+bool SDNode::isOnlyUserOf(SDNode *N) const {
+ bool Seen = false;
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDNode *User = *I;
+ if (User == this)
+ Seen = true;
+ else
+ return false;
+ }
+
+ return Seen;
+}
+
+/// isOperandOf - Return true if this value is an operand of N.
+///
+bool SDValue::isOperandOf(SDNode *N) const {
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ if (*this == N->getOperand(i))
+ return true;
+ return false;
+}
+
+bool SDNode::isOperandOf(SDNode *N) const {
+ for (unsigned i = 0, e = N->NumOperands; i != e; ++i)
+ if (this == N->OperandList[i].getNode())
+ return true;
+ return false;
+}
+
+/// reachesChainWithoutSideEffects - Return true if this operand (which must
+/// be a chain) reaches the specified operand without crossing any
+/// side-effecting instructions. In practice, this looks through token
+/// factors and non-volatile loads. In order to remain efficient, this only
+/// looks a couple of nodes in; it does not do an exhaustive search.
+bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
+ unsigned Depth) const {
+ if (*this == Dest) return true;
+
+ // Don't search too deeply, we just want to be able to see through
+ // TokenFactor's etc.
+ if (Depth == 0) return false;
+
+ // If this is a token factor, all inputs to the TF happen in parallel. If any
+ // of the operands of the TF reach dest, then we can do the xform.
+ if (getOpcode() == ISD::TokenFactor) {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1))
+ return true;
+ return false;
+ }
+
+ // Loads don't have side effects, look through them.
+ if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
+ if (!Ld->isVolatile())
+ return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
+ }
+ return false;
+}
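+
+// Illustrative check (assumed context): test whether one chain value
+// reaches another across only token factors and non-volatile loads, using
+// a small fixed search depth:
+//
+//   if (StoreChain.reachesChainWithoutSideEffects(LoadChain, 2))
+//     /* the two chains are separated only by side-effect-free nodes */;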
+
+
+static void findPredecessor(SDNode *N, const SDNode *P, bool &found,
+ SmallPtrSet<SDNode *, 32> &Visited) {
+ if (found || !Visited.insert(N))
+ return;
+
+ for (unsigned i = 0, e = N->getNumOperands(); !found && i != e; ++i) {
+ SDNode *Op = N->getOperand(i).getNode();
+ if (Op == P) {
+ found = true;
+ return;
+ }
+ findPredecessor(Op, P, found, Visited);
+ }
+}
+
+/// isPredecessorOf - Return true if this node is a predecessor of N. This node
+/// is either an operand of N or it can be reached by recursively traversing
+/// up the operands.
+/// NOTE: this is an expensive method. Use it carefully.
+bool SDNode::isPredecessorOf(SDNode *N) const {
+ SmallPtrSet<SDNode *, 32> Visited;
+ bool found = false;
+ findPredecessor(N, this, found, Visited);
+ return found;
+}
+
+uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
+ assert(Num < NumOperands && "Invalid child # of SDNode!");
+ return cast<ConstantSDNode>(OperandList[Num])->getZExtValue();
+}
+
+std::string SDNode::getOperationName(const SelectionDAG *G) const {
+ switch (getOpcode()) {
+ default:
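+    // Target-independent opcodes (below BUILTIN_OP_END) with no case below
+    // have no printable name.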
+ if (getOpcode() < ISD::BUILTIN_OP_END)
+ return "<<Unknown DAG Node>>";
+ if (isMachineOpcode()) {
+ if (G)
+ if (const TargetInstrInfo *TII = G->getTarget().getInstrInfo())
+ if (getMachineOpcode() < TII->getNumOpcodes())
+ return TII->get(getMachineOpcode()).getName();
+ return "<<Unknown Machine Node>>";
+ }
+ if (G) {
+ const TargetLowering &TLI = G->getTargetLoweringInfo();
+ const char *Name = TLI.getTargetNodeName(getOpcode());
+ if (Name) return Name;
+ return "<<Unknown Target Node>>";
+ }
+ return "<<Unknown Node>>";
+
+#ifndef NDEBUG
+ case ISD::DELETED_NODE:
+ return "<<Deleted Node!>>";
+#endif
+ case ISD::PREFETCH: return "Prefetch";
+ case ISD::MEMBARRIER: return "MemBarrier";
+ case ISD::ATOMIC_CMP_SWAP: return "AtomicCmpSwap";
+ case ISD::ATOMIC_SWAP: return "AtomicSwap";
+ case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd";
+ case ISD::ATOMIC_LOAD_SUB: return "AtomicLoadSub";
+ case ISD::ATOMIC_LOAD_AND: return "AtomicLoadAnd";
+ case ISD::ATOMIC_LOAD_OR: return "AtomicLoadOr";
+ case ISD::ATOMIC_LOAD_XOR: return "AtomicLoadXor";
+ case ISD::ATOMIC_LOAD_NAND: return "AtomicLoadNand";
+ case ISD::ATOMIC_LOAD_MIN: return "AtomicLoadMin";
+ case ISD::ATOMIC_LOAD_MAX: return "AtomicLoadMax";
+ case ISD::ATOMIC_LOAD_UMIN: return "AtomicLoadUMin";
+ case ISD::ATOMIC_LOAD_UMAX: return "AtomicLoadUMax";
+ case ISD::PCMARKER: return "PCMarker";
+ case ISD::READCYCLECOUNTER: return "ReadCycleCounter";
+ case ISD::SRCVALUE: return "SrcValue";
+ case ISD::MEMOPERAND: return "MemOperand";
+ case ISD::EntryToken: return "EntryToken";
+ case ISD::TokenFactor: return "TokenFactor";
+ case ISD::AssertSext: return "AssertSext";
+ case ISD::AssertZext: return "AssertZext";
+
+ case ISD::BasicBlock: return "BasicBlock";
+ case ISD::ARG_FLAGS: return "ArgFlags";
+ case ISD::VALUETYPE: return "ValueType";
+ case ISD::Register: return "Register";
+
+ case ISD::Constant: return "Constant";
+ case ISD::ConstantFP: return "ConstantFP";
+ case ISD::GlobalAddress: return "GlobalAddress";
+ case ISD::GlobalTLSAddress: return "GlobalTLSAddress";
+ case ISD::FrameIndex: return "FrameIndex";
+ case ISD::JumpTable: return "JumpTable";
+ case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE";
+ case ISD::RETURNADDR: return "RETURNADDR";
+ case ISD::FRAMEADDR: return "FRAMEADDR";
+ case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET";
+ case ISD::EXCEPTIONADDR: return "EXCEPTIONADDR";
+ case ISD::EHSELECTION: return "EHSELECTION";
+ case ISD::EH_RETURN: return "EH_RETURN";
+ case ISD::ConstantPool: return "ConstantPool";
+ case ISD::ExternalSymbol: return "ExternalSymbol";
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(getOperand(0))->getZExtValue();
+ return Intrinsic::getName((Intrinsic::ID)IID);
+ }
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(getOperand(1))->getZExtValue();
+ return Intrinsic::getName((Intrinsic::ID)IID);
+ }
+
+ case ISD::BUILD_VECTOR: return "BUILD_VECTOR";
+ case ISD::TargetConstant: return "TargetConstant";
+  case ISD::TargetConstantFP: return "TargetConstantFP";
+ case ISD::TargetGlobalAddress: return "TargetGlobalAddress";
+ case ISD::TargetGlobalTLSAddress: return "TargetGlobalTLSAddress";
+ case ISD::TargetFrameIndex: return "TargetFrameIndex";
+ case ISD::TargetJumpTable: return "TargetJumpTable";
+ case ISD::TargetConstantPool: return "TargetConstantPool";
+ case ISD::TargetExternalSymbol: return "TargetExternalSymbol";
+
+ case ISD::CopyToReg: return "CopyToReg";
+ case ISD::CopyFromReg: return "CopyFromReg";
+ case ISD::UNDEF: return "undef";
+ case ISD::MERGE_VALUES: return "merge_values";
+ case ISD::INLINEASM: return "inlineasm";
+ case ISD::DBG_LABEL: return "dbg_label";
+ case ISD::EH_LABEL: return "eh_label";
+ case ISD::DECLARE: return "declare";
+ case ISD::HANDLENODE: return "handlenode";
+ case ISD::FORMAL_ARGUMENTS: return "formal_arguments";
+ case ISD::CALL: return "call";
+
+ // Unary operators
+ case ISD::FABS: return "fabs";
+ case ISD::FNEG: return "fneg";
+ case ISD::FSQRT: return "fsqrt";
+ case ISD::FSIN: return "fsin";
+ case ISD::FCOS: return "fcos";
+ case ISD::FPOWI: return "fpowi";
+ case ISD::FPOW: return "fpow";
+ case ISD::FTRUNC: return "ftrunc";
+ case ISD::FFLOOR: return "ffloor";
+ case ISD::FCEIL: return "fceil";
+ case ISD::FRINT: return "frint";
+ case ISD::FNEARBYINT: return "fnearbyint";
+
+ // Binary operators
+ case ISD::ADD: return "add";
+ case ISD::SUB: return "sub";
+ case ISD::MUL: return "mul";
+ case ISD::MULHU: return "mulhu";
+ case ISD::MULHS: return "mulhs";
+ case ISD::SDIV: return "sdiv";
+ case ISD::UDIV: return "udiv";
+ case ISD::SREM: return "srem";
+ case ISD::UREM: return "urem";
+ case ISD::SMUL_LOHI: return "smul_lohi";
+ case ISD::UMUL_LOHI: return "umul_lohi";
+ case ISD::SDIVREM: return "sdivrem";
+ case ISD::UDIVREM: return "udivrem";
+ case ISD::AND: return "and";
+ case ISD::OR: return "or";
+ case ISD::XOR: return "xor";
+ case ISD::SHL: return "shl";
+ case ISD::SRA: return "sra";
+ case ISD::SRL: return "srl";
+ case ISD::ROTL: return "rotl";
+ case ISD::ROTR: return "rotr";
+ case ISD::FADD: return "fadd";
+ case ISD::FSUB: return "fsub";
+ case ISD::FMUL: return "fmul";
+ case ISD::FDIV: return "fdiv";
+ case ISD::FREM: return "frem";
+ case ISD::FCOPYSIGN: return "fcopysign";
+ case ISD::FGETSIGN: return "fgetsign";
+
+ case ISD::SETCC: return "setcc";
+ case ISD::VSETCC: return "vsetcc";
+ case ISD::SELECT: return "select";
+ case ISD::SELECT_CC: return "select_cc";
+ case ISD::INSERT_VECTOR_ELT: return "insert_vector_elt";
+ case ISD::EXTRACT_VECTOR_ELT: return "extract_vector_elt";
+ case ISD::CONCAT_VECTORS: return "concat_vectors";
+ case ISD::EXTRACT_SUBVECTOR: return "extract_subvector";
+ case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector";
+ case ISD::VECTOR_SHUFFLE: return "vector_shuffle";
+ case ISD::CARRY_FALSE: return "carry_false";
+ case ISD::ADDC: return "addc";
+ case ISD::ADDE: return "adde";
+ case ISD::SADDO: return "saddo";
+ case ISD::UADDO: return "uaddo";
+ case ISD::SSUBO: return "ssubo";
+ case ISD::USUBO: return "usubo";
+ case ISD::SMULO: return "smulo";
+ case ISD::UMULO: return "umulo";
+ case ISD::SUBC: return "subc";
+ case ISD::SUBE: return "sube";
+ case ISD::SHL_PARTS: return "shl_parts";
+ case ISD::SRA_PARTS: return "sra_parts";
+ case ISD::SRL_PARTS: return "srl_parts";
+
+ // Conversion operators.
+ case ISD::SIGN_EXTEND: return "sign_extend";
+ case ISD::ZERO_EXTEND: return "zero_extend";
+ case ISD::ANY_EXTEND: return "any_extend";
+ case ISD::SIGN_EXTEND_INREG: return "sign_extend_inreg";
+ case ISD::TRUNCATE: return "truncate";
+ case ISD::FP_ROUND: return "fp_round";
+ case ISD::FLT_ROUNDS_: return "flt_rounds";
+ case ISD::FP_ROUND_INREG: return "fp_round_inreg";
+ case ISD::FP_EXTEND: return "fp_extend";
+
+ case ISD::SINT_TO_FP: return "sint_to_fp";
+ case ISD::UINT_TO_FP: return "uint_to_fp";
+ case ISD::FP_TO_SINT: return "fp_to_sint";
+ case ISD::FP_TO_UINT: return "fp_to_uint";
+ case ISD::BIT_CONVERT: return "bit_convert";
+
+ case ISD::CONVERT_RNDSAT: {
+ switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) {
+ default: assert(0 && "Unknown cvt code!");
+ case ISD::CVT_FF: return "cvt_ff";
+ case ISD::CVT_FS: return "cvt_fs";
+ case ISD::CVT_FU: return "cvt_fu";
+ case ISD::CVT_SF: return "cvt_sf";
+ case ISD::CVT_UF: return "cvt_uf";
+ case ISD::CVT_SS: return "cvt_ss";
+ case ISD::CVT_SU: return "cvt_su";
+ case ISD::CVT_US: return "cvt_us";
+ case ISD::CVT_UU: return "cvt_uu";
+ }
+ }
+
+ // Control flow instructions
+ case ISD::BR: return "br";
+ case ISD::BRIND: return "brind";
+ case ISD::BR_JT: return "br_jt";
+ case ISD::BRCOND: return "brcond";
+ case ISD::BR_CC: return "br_cc";
+ case ISD::RET: return "ret";
+ case ISD::CALLSEQ_START: return "callseq_start";
+ case ISD::CALLSEQ_END: return "callseq_end";
+
+ // Other operators
+ case ISD::LOAD: return "load";
+ case ISD::STORE: return "store";
+ case ISD::VAARG: return "vaarg";
+ case ISD::VACOPY: return "vacopy";
+ case ISD::VAEND: return "vaend";
+ case ISD::VASTART: return "vastart";
+ case ISD::DYNAMIC_STACKALLOC: return "dynamic_stackalloc";
+ case ISD::EXTRACT_ELEMENT: return "extract_element";
+ case ISD::BUILD_PAIR: return "build_pair";
+ case ISD::STACKSAVE: return "stacksave";
+ case ISD::STACKRESTORE: return "stackrestore";
+ case ISD::TRAP: return "trap";
+
+ // Bit manipulation
+ case ISD::BSWAP: return "bswap";
+ case ISD::CTPOP: return "ctpop";
+ case ISD::CTTZ: return "cttz";
+ case ISD::CTLZ: return "ctlz";
+
+ // Debug info
+ case ISD::DBG_STOPPOINT: return "dbg_stoppoint";
+ case ISD::DEBUG_LOC: return "debug_loc";
+
+ // Trampolines
+ case ISD::TRAMPOLINE: return "trampoline";
+
+ case ISD::CONDCODE:
+ switch (cast<CondCodeSDNode>(this)->get()) {
+ default: assert(0 && "Unknown setcc condition!");
+ case ISD::SETOEQ: return "setoeq";
+ case ISD::SETOGT: return "setogt";
+ case ISD::SETOGE: return "setoge";
+ case ISD::SETOLT: return "setolt";
+ case ISD::SETOLE: return "setole";
+ case ISD::SETONE: return "setone";
+
+ case ISD::SETO: return "seto";
+ case ISD::SETUO: return "setuo";
+ case ISD::SETUEQ: return "setue";
+ case ISD::SETUGT: return "setugt";
+ case ISD::SETUGE: return "setuge";
+ case ISD::SETULT: return "setult";
+ case ISD::SETULE: return "setule";
+ case ISD::SETUNE: return "setune";
+
+ case ISD::SETEQ: return "seteq";
+ case ISD::SETGT: return "setgt";
+ case ISD::SETGE: return "setge";
+ case ISD::SETLT: return "setlt";
+ case ISD::SETLE: return "setle";
+ case ISD::SETNE: return "setne";
+ }
+ }
+}
+
+const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) {
+ switch (AM) {
+ default:
+ return "";
+ case ISD::PRE_INC:
+ return "<pre-inc>";
+ case ISD::PRE_DEC:
+ return "<pre-dec>";
+ case ISD::POST_INC:
+ return "<post-inc>";
+ case ISD::POST_DEC:
+ return "<post-dec>";
+ }
+}
+
+std::string ISD::ArgFlagsTy::getArgFlagsString() {
+ std::string S = "< ";
+
+ if (isZExt())
+ S += "zext ";
+ if (isSExt())
+ S += "sext ";
+ if (isInReg())
+ S += "inreg ";
+ if (isSRet())
+ S += "sret ";
+ if (isByVal())
+ S += "byval ";
+ if (isNest())
+ S += "nest ";
+ if (getByValAlign())
+ S += "byval-align:" + utostr(getByValAlign()) + " ";
+ if (getOrigAlign())
+ S += "orig-align:" + utostr(getOrigAlign()) + " ";
+ if (getByValSize())
+ S += "byval-size:" + utostr(getByValSize()) + " ";
+ return S + ">";
+}
+
+void SDNode::dump() const { dump(0); }
+void SDNode::dump(const SelectionDAG *G) const {
+ print(errs(), G);
+}
+
+void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
+ OS << (void*)this << ": ";
+
+ for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
+ if (i) OS << ",";
+ if (getValueType(i) == MVT::Other)
+ OS << "ch";
+ else
+ OS << getValueType(i).getMVTString();
+ }
+ OS << " = " << getOperationName(G);
+}
+
+void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
+ if (!isTargetOpcode() && getOpcode() == ISD::VECTOR_SHUFFLE) {
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(this);
+ OS << "<";
+ for (unsigned i = 0, e = ValueList[0].getVectorNumElements(); i != e; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (i) OS << ",";
+ if (Idx < 0)
+ OS << "u";
+ else
+ OS << Idx;
+ }
+ OS << ">";
+ }
+
+ if (const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(this)) {
+ OS << '<' << CSDN->getAPIntValue() << '>';
+ } else if (const ConstantFPSDNode *CSDN = dyn_cast<ConstantFPSDNode>(this)) {
+ if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEsingle)
+ OS << '<' << CSDN->getValueAPF().convertToFloat() << '>';
+ else if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEdouble)
+ OS << '<' << CSDN->getValueAPF().convertToDouble() << '>';
+ else {
+ OS << "<APFloat(";
+ CSDN->getValueAPF().bitcastToAPInt().dump();
+ OS << ")>";
+ }
+ } else if (const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(this)) {
+ int64_t offset = GADN->getOffset();
+ OS << '<';
+ WriteAsOperand(OS, GADN->getGlobal());
+ OS << '>';
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
+ } else if (const FrameIndexSDNode *FIDN = dyn_cast<FrameIndexSDNode>(this)) {
+ OS << "<" << FIDN->getIndex() << ">";
+ } else if (const JumpTableSDNode *JTDN = dyn_cast<JumpTableSDNode>(this)) {
+ OS << "<" << JTDN->getIndex() << ">";
+ } else if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(this)){
+ int offset = CP->getOffset();
+ if (CP->isMachineConstantPoolEntry())
+ OS << "<" << *CP->getMachineCPVal() << ">";
+ else
+ OS << "<" << *CP->getConstVal() << ">";
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
+ } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) {
+ OS << "<";
+ const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock();
+ if (LBB)
+ OS << LBB->getName() << " ";
+ OS << (const void*)BBDN->getBasicBlock() << ">";
+ } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) {
+ if (G && R->getReg() &&
+ TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+ OS << " " << G->getTarget().getRegisterInfo()->getName(R->getReg());
+ } else {
+ OS << " #" << R->getReg();
+ }
+ } else if (const ExternalSymbolSDNode *ES =
+ dyn_cast<ExternalSymbolSDNode>(this)) {
+ OS << "'" << ES->getSymbol() << "'";
+ } else if (const SrcValueSDNode *M = dyn_cast<SrcValueSDNode>(this)) {
+ if (M->getValue())
+ OS << "<" << M->getValue() << ">";
+ else
+ OS << "<null>";
+ } else if (const MemOperandSDNode *M = dyn_cast<MemOperandSDNode>(this)) {
+ if (M->MO.getValue())
+ OS << "<" << M->MO.getValue() << ":" << M->MO.getOffset() << ">";
+ else
+ OS << "<null:" << M->MO.getOffset() << ">";
+ } else if (const ARG_FLAGSSDNode *N = dyn_cast<ARG_FLAGSSDNode>(this)) {
+ OS << N->getArgFlags().getArgFlagsString();
+ } else if (const VTSDNode *N = dyn_cast<VTSDNode>(this)) {
+ OS << ":" << N->getVT().getMVTString();
+  } else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) {
+ const Value *SrcValue = LD->getSrcValue();
+ int SrcOffset = LD->getSrcValueOffset();
+ OS << " <";
+ if (SrcValue)
+ OS << SrcValue;
+ else
+ OS << "null";
+ OS << ":" << SrcOffset << ">";
+
+ bool doExt = true;
+ switch (LD->getExtensionType()) {
+ default: doExt = false; break;
+ case ISD::EXTLOAD: OS << " <anyext "; break;
+ case ISD::SEXTLOAD: OS << " <sext "; break;
+ case ISD::ZEXTLOAD: OS << " <zext "; break;
+ }
+ if (doExt)
+ OS << LD->getMemoryVT().getMVTString() << ">";
+
+ const char *AM = getIndexedModeName(LD->getAddressingMode());
+ if (*AM)
+ OS << " " << AM;
+ if (LD->isVolatile())
+ OS << " <volatile>";
+ OS << " alignment=" << LD->getAlignment();
+ } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(this)) {
+ const Value *SrcValue = ST->getSrcValue();
+ int SrcOffset = ST->getSrcValueOffset();
+ OS << " <";
+ if (SrcValue)
+ OS << SrcValue;
+ else
+ OS << "null";
+ OS << ":" << SrcOffset << ">";
+
+ if (ST->isTruncatingStore())
+ OS << " <trunc " << ST->getMemoryVT().getMVTString() << ">";
+
+ const char *AM = getIndexedModeName(ST->getAddressingMode());
+ if (*AM)
+ OS << " " << AM;
+ if (ST->isVolatile())
+ OS << " <volatile>";
+ OS << " alignment=" << ST->getAlignment();
+ } else if (const AtomicSDNode* AT = dyn_cast<AtomicSDNode>(this)) {
+ const Value *SrcValue = AT->getSrcValue();
+ int SrcOffset = AT->getSrcValueOffset();
+ OS << " <";
+ if (SrcValue)
+ OS << SrcValue;
+ else
+ OS << "null";
+ OS << ":" << SrcOffset << ">";
+ if (AT->isVolatile())
+ OS << " <volatile>";
+ OS << " alignment=" << AT->getAlignment();
+ }
+}
+
+void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
+ print_types(OS, G);
+ OS << " ";
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ if (i) OS << ", ";
+ OS << (void*)getOperand(i).getNode();
+ if (unsigned RN = getOperand(i).getResNo())
+ OS << ":" << RN;
+ }
+ print_details(OS, G);
+}
+
+static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ if (N->getOperand(i).getNode()->hasOneUse())
+ DumpNodes(N->getOperand(i).getNode(), indent+2, G);
+ else
+ cerr << "\n" << std::string(indent+2, ' ')
+ << (void*)N->getOperand(i).getNode() << ": <multiple use>";
+
+ cerr << "\n" << std::string(indent, ' ');
+ N->dump(G);
+}
+
+void SelectionDAG::dump() const {
+ cerr << "SelectionDAG has " << AllNodes.size() << " nodes:";
+
+ for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end();
+ I != E; ++I) {
+ const SDNode *N = I;
+ if (!N->hasOneUse() && N != getRoot().getNode())
+ DumpNodes(N, 2, this);
+ }
+
+ if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);
+
+ cerr << "\n\n";
+}
+
+void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
+ print_types(OS, G);
+ print_details(OS, G);
+}
+
+typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet;
+static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
+ const SelectionDAG *G, VisitedSDNodeSet &once) {
+ if (!once.insert(N)) // If we've been here before, return now.
+ return;
+ // Dump the current SDNode, but don't end the line yet.
+ OS << std::string(indent, ' ');
+ N->printr(OS, G);
+ // Having printed this SDNode, walk the children:
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ const SDNode *child = N->getOperand(i).getNode();
+ if (i) OS << ",";
+ OS << " ";
+ if (child->getNumOperands() == 0) {
+ // This child has no grandchildren; print it inline right here.
+ child->printr(OS, G);
+ once.insert(child);
+ } else { // Just the address. FIXME: also print the child's opcode
+ OS << (void*)child;
+ if (unsigned RN = N->getOperand(i).getResNo())
+ OS << ":" << RN;
+ }
+ }
+ OS << "\n";
+ // Dump children that have grandchildren on their own line(s).
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ const SDNode *child = N->getOperand(i).getNode();
+ DumpNodesr(OS, child, indent+2, G, once);
+ }
+}
+
+void SDNode::dumpr() const {
+ VisitedSDNodeSet once;
+ DumpNodesr(errs(), this, 0, 0, once);
+}
+
+
+/// getAddressSpace - Return the address space this GlobalAddress belongs to.
+unsigned GlobalAddressSDNode::getAddressSpace() const {
+ return getGlobal()->getType()->getAddressSpace();
+}
+
+
+const Type *ConstantPoolSDNode::getType() const {
+ if (isMachineConstantPoolEntry())
+ return Val.MachineCPVal->getType();
+ return Val.ConstVal->getType();
+}
+
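+/// isConstantSplat - Return true if this BUILD_VECTOR is a constant splat.
+/// On success, SplatValue holds the replicated bits, SplatUndef marks bits
+/// contributed by undef elements, HasAnyUndefs reports whether any element
+/// was undef, and SplatBitSize is the smallest element width (no smaller
+/// than MinSplatBits) at which the value repeats.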
+bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
+ APInt &SplatUndef,
+ unsigned &SplatBitSize,
+ bool &HasAnyUndefs,
+ unsigned MinSplatBits) {
+ MVT VT = getValueType(0);
+ assert(VT.isVector() && "Expected a vector type");
+ unsigned sz = VT.getSizeInBits();
+ if (MinSplatBits > sz)
+ return false;
+
+ SplatValue = APInt(sz, 0);
+ SplatUndef = APInt(sz, 0);
+
+ // Get the bits. Bits with undefined values (when the corresponding element
+ // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
+ // in SplatValue. If any of the values are not constant, give up and return
+ // false.
+ unsigned int nOps = getNumOperands();
+ assert(nOps > 0 && "isConstantSplat has 0-size build vector");
+ unsigned EltBitSize = VT.getVectorElementType().getSizeInBits();
+ for (unsigned i = 0; i < nOps; ++i) {
+ SDValue OpVal = getOperand(i);
+ unsigned BitPos = i * EltBitSize;
+
+ if (OpVal.getOpcode() == ISD::UNDEF)
+      SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize);
+ else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal))
+ SplatValue |= (APInt(CN->getAPIntValue()).zextOrTrunc(EltBitSize).
+ zextOrTrunc(sz) << BitPos);
+ else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal))
+      SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz)
+                    << BitPos;
+ else
+ return false;
+ }
+
+ // The build_vector is all constants or undefs. Find the smallest element
+ // size that splats the vector.
+
+ HasAnyUndefs = (SplatUndef != 0);
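+  // For example, a <4 x i8> build_vector of 0x55s starts with sz == 32 and
+  // SplatValue == 0x55555555; each pass matches the two halves (0x5555,
+  // then 0x55) until sz reaches the 8-bit floor, giving SplatBitSize == 8.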
+  while (sz > 8) {
+ unsigned HalfSize = sz / 2;
+ APInt HighValue = APInt(SplatValue).lshr(HalfSize).trunc(HalfSize);
+ APInt LowValue = APInt(SplatValue).trunc(HalfSize);
+ APInt HighUndef = APInt(SplatUndef).lshr(HalfSize).trunc(HalfSize);
+ APInt LowUndef = APInt(SplatUndef).trunc(HalfSize);
+
+ // If the two halves do not match (ignoring undef bits), stop here.
+ if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
+ MinSplatBits > HalfSize)
+ break;
+
+ SplatValue = HighValue | LowValue;
+ SplatUndef = HighUndef & LowUndef;
+
+ sz = HalfSize;
+ }
+
+ SplatBitSize = sz;
+ return true;
+}
+
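+/// isSplatMask - Return true if every defined element of Mask selects the
+/// same source element; e.g. the mask <2,u,2,2> is a splat of element 2.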
+bool ShuffleVectorSDNode::isSplatMask(const int *Mask, MVT VT) {
+ // Find the first non-undef value in the shuffle mask.
+ unsigned i, e;
+ for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i)
+ /* search */;
+
+ assert(i != e && "VECTOR_SHUFFLE node with all undef indices!");
+
+ // Make sure all remaining elements are either undef or the same as the first
+ // non-undef value.
+ for (int Idx = Mask[i]; i != e; ++i)
+ if (Mask[i] >= 0 && Mask[i] != Idx)
+ return false;
+ return true;
+}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp
new file mode 100644
index 0000000..889d7f5
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp
@@ -0,0 +1,6052 @@
+//===-- SelectionDAGBuild.cpp - Selection-DAG building --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "isel"
+#include "SelectionDAGBuild.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Constants.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+/// LimitFloatPrecision - Generate low-precision inline sequences for
+/// some float libcalls (6, 8 or 12 bits).
+static unsigned LimitFloatPrecision;
+
+static cl::opt<unsigned, true>
+LimitFPPrecision("limit-float-precision",
+ cl::desc("Generate low-precision inline sequences "
+ "for some float libcalls"),
+ cl::location(LimitFloatPrecision),
+ cl::init(0));
+
+/// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
+/// of insertvalue or extractvalue indices that identify a member, return
+/// the linearized index of the start of the member.
+///
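+/// For example, in { i32, { i32, i32 }, i32 }, the index path {1, 1} names
+/// the second element of the nested struct, which linearizes to index 2
+/// (slot 0 is the leading i32, slot 1 the nested struct's first element).
+///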
+static unsigned ComputeLinearIndex(const TargetLowering &TLI, const Type *Ty,
+ const unsigned *Indices,
+ const unsigned *IndicesEnd,
+ unsigned CurIndex = 0) {
+ // Base case: We're done.
+ if (Indices && Indices == IndicesEnd)
+ return CurIndex;
+
+ // Given a struct type, recursively traverse the elements.
+ if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (StructType::element_iterator EB = STy->element_begin(),
+ EI = EB,
+ EE = STy->element_end();
+ EI != EE; ++EI) {
+ if (Indices && *Indices == unsigned(EI - EB))
+ return ComputeLinearIndex(TLI, *EI, Indices+1, IndicesEnd, CurIndex);
+ CurIndex = ComputeLinearIndex(TLI, *EI, 0, 0, CurIndex);
+ }
+ return CurIndex;
+ }
+ // Given an array type, recursively traverse the elements.
+ else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ const Type *EltTy = ATy->getElementType();
+ for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
+ if (Indices && *Indices == i)
+ return ComputeLinearIndex(TLI, EltTy, Indices+1, IndicesEnd, CurIndex);
+ CurIndex = ComputeLinearIndex(TLI, EltTy, 0, 0, CurIndex);
+ }
+ return CurIndex;
+ }
+  // A non-aggregate leaf occupies one slot in the linearized layout, so
+  // count it and move on.
+ return CurIndex + 1;
+}
+
+/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
+/// MVTs that represent all the individual underlying
+/// non-aggregate types that comprise it.
+///
+/// If Offsets is non-null, it points to a vector to be filled in
+/// with the in-memory offsets of each of the individual values.
+///
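+/// For example, { i32, [2 x float] } yields ValueVTs = {i32, f32, f32} and,
+/// under a typical 32-bit layout, Offsets = {0, 4, 8}.
+///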
+static void ComputeValueVTs(const TargetLowering &TLI, const Type *Ty,
+ SmallVectorImpl<MVT> &ValueVTs,
+ SmallVectorImpl<uint64_t> *Offsets = 0,
+ uint64_t StartingOffset = 0) {
+ // Given a struct type, recursively traverse the elements.
+ if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = TLI.getTargetData()->getStructLayout(STy);
+ for (StructType::element_iterator EB = STy->element_begin(),
+ EI = EB,
+ EE = STy->element_end();
+ EI != EE; ++EI)
+ ComputeValueVTs(TLI, *EI, ValueVTs, Offsets,
+ StartingOffset + SL->getElementOffset(EI - EB));
+ return;
+ }
+ // Given an array type, recursively traverse the elements.
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ const Type *EltTy = ATy->getElementType();
+ uint64_t EltSize = TLI.getTargetData()->getTypeAllocSize(EltTy);
+ for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
+ ComputeValueVTs(TLI, EltTy, ValueVTs, Offsets,
+ StartingOffset + i * EltSize);
+ return;
+ }
+ // Interpret void as zero return values.
+ if (Ty == Type::VoidTy)
+ return;
+ // Base case: we can get an MVT for this LLVM IR type.
+ ValueVTs.push_back(TLI.getValueType(Ty));
+ if (Offsets)
+ Offsets->push_back(StartingOffset);
+}
+
+namespace llvm {
+ /// RegsForValue - This struct represents the registers (physical or virtual)
+ /// that a particular set of values is assigned, and the type information about
+ /// the value. The most common situation is to represent one value at a time,
+ /// but struct or array values are handled element-wise as multiple values.
+ /// The splitting of aggregates is performed recursively, so that we never
+ /// have aggregate-typed registers. The values at this point do not necessarily
+ /// have legal types, so each value may require one or more registers of some
+ /// legal type.
+ ///
+ struct VISIBILITY_HIDDEN RegsForValue {
+ /// TLI - The TargetLowering object.
+ ///
+ const TargetLowering *TLI;
+
+ /// ValueVTs - The value types of the values, which may not be legal, and
+    /// may need to be promoted or synthesized from one or more registers.
+ ///
+ SmallVector<MVT, 4> ValueVTs;
+
+ /// RegVTs - The value types of the registers. This is the same size as
+ /// ValueVTs and it records, for each value, what the type of the assigned
+ /// register or registers are. (Individual values are never synthesized
+ /// from more than one type of register.)
+ ///
+    /// With virtual registers, the contents of RegVTs are redundant with
+    /// TLI's getRegisterType member function; with physical registers,
+    /// however, it is necessary to keep a separate record of the types.
+ ///
+ SmallVector<MVT, 4> RegVTs;
+
+ /// Regs - This list holds the registers assigned to the values.
+ /// Each legal or promoted value requires one register, and each
+ /// expanded value requires multiple registers.
+ ///
+ SmallVector<unsigned, 4> Regs;
+
+ RegsForValue() : TLI(0) {}
+
+ RegsForValue(const TargetLowering &tli,
+ const SmallVector<unsigned, 4> &regs,
+ MVT regvt, MVT valuevt)
+ : TLI(&tli), ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
+ RegsForValue(const TargetLowering &tli,
+ const SmallVector<unsigned, 4> &regs,
+ const SmallVector<MVT, 4> &regvts,
+ const SmallVector<MVT, 4> &valuevts)
+ : TLI(&tli), ValueVTs(valuevts), RegVTs(regvts), Regs(regs) {}
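+    /// This constructor flattens Ty with ComputeValueVTs and assumes that
+    /// the registers for all of the resulting values were allocated
+    /// consecutively starting at Reg, mirroring CreateRegForValue.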
+ RegsForValue(const TargetLowering &tli,
+ unsigned Reg, const Type *Ty) : TLI(&tli) {
+ ComputeValueVTs(tli, Ty, ValueVTs);
+
+ for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ MVT ValueVT = ValueVTs[Value];
+ unsigned NumRegs = TLI->getNumRegisters(ValueVT);
+ MVT RegisterVT = TLI->getRegisterType(ValueVT);
+ for (unsigned i = 0; i != NumRegs; ++i)
+ Regs.push_back(Reg + i);
+ RegVTs.push_back(RegisterVT);
+ Reg += NumRegs;
+ }
+ }
+
+ /// append - Add the specified values to this one.
+ void append(const RegsForValue &RHS) {
+ TLI = RHS.TLI;
+ ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
+ RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
+ Regs.append(RHS.Regs.begin(), RHS.Regs.end());
+ }
+
+
+    /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copy from
+    /// this value and return the result as a ValueVTs value. This uses
+ /// Chain/Flag as the input and updates them for the output Chain/Flag.
+ /// If the Flag pointer is NULL, no flag is used.
+ SDValue getCopyFromRegs(SelectionDAG &DAG, DebugLoc dl,
+ SDValue &Chain, SDValue *Flag) const;
+
+    /// getCopyToRegs - Emit a series of CopyToReg nodes that copy the
+    /// specified value into the registers specified by this object. This uses
+ /// Chain/Flag as the input and updates them for the output Chain/Flag.
+ /// If the Flag pointer is NULL, no flag is used.
+ void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+ SDValue &Chain, SDValue *Flag) const;
+
+ /// AddInlineAsmOperands - Add this value to the specified inlineasm node
+ /// operand list. This adds the code marker, matching input operand index
+ /// (if applicable), and includes the number of values added into it.
+ void AddInlineAsmOperands(unsigned Code,
+ bool HasMatching, unsigned MatchingIdx,
+ SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
+ };
+}
+
+/// isUsedOutsideOfDefiningBlock - Return true if this instruction is a PHI
+/// node, is used by a PHI node, or is used outside of the basic block that
+/// defines it.
+static bool isUsedOutsideOfDefiningBlock(Instruction *I) {
+ if (isa<PHINode>(I)) return true;
+ BasicBlock *BB = I->getParent();
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; ++UI)
+ if (cast<Instruction>(*UI)->getParent() != BB || isa<PHINode>(*UI))
+ return true;
+ return false;
+}
+
+/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
+/// entry block, return true. This includes arguments used by switches, since
+/// the switch may expand into multiple basic blocks.
+static bool isOnlyUsedInEntryBlock(Argument *A, bool EnableFastISel) {
+ // With FastISel active, we may be splitting blocks, so force creation
+ // of virtual registers for all non-dead arguments.
+ // Don't force virtual registers for byval arguments though, because
+ // fast-isel can't handle those in all cases.
+ if (EnableFastISel && !A->hasByValAttr())
+ return A->use_empty();
+
+ BasicBlock *Entry = A->getParent()->begin();
+ for (Value::use_iterator UI = A->use_begin(), E = A->use_end(); UI != E; ++UI)
+ if (cast<Instruction>(*UI)->getParent() != Entry || isa<SwitchInst>(*UI))
+ return false; // Use not in entry block.
+ return true;
+}
+
+FunctionLoweringInfo::FunctionLoweringInfo(TargetLowering &tli)
+ : TLI(tli) {
+}
+
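+/// set - Initialize this object for the given function and MachineFunction:
+/// allocate virtual registers for values live across blocks, assign frame
+/// slots to static allocas, and create a MachineBasicBlock (with empty PHI
+/// instructions) for every LLVM basic block.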
+void FunctionLoweringInfo::set(Function &fn, MachineFunction &mf,
+ SelectionDAG &DAG,
+ bool EnableFastISel) {
+ Fn = &fn;
+ MF = &mf;
+ RegInfo = &MF->getRegInfo();
+
+ // Create a vreg for each argument register that is not dead and is used
+ // outside of the entry block for the function.
+ for (Function::arg_iterator AI = Fn->arg_begin(), E = Fn->arg_end();
+ AI != E; ++AI)
+ if (!isOnlyUsedInEntryBlock(AI, EnableFastISel))
+ InitializeRegForValue(AI);
+
+ // Initialize the mapping of values to registers. This is only set up for
+ // instruction values that are used outside of the block that defines
+ // them.
+ Function::iterator BB = Fn->begin(), EB = Fn->end();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ if (ConstantInt *CUI = dyn_cast<ConstantInt>(AI->getArraySize())) {
+ const Type *Ty = AI->getAllocatedType();
+ uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
+ unsigned Align =
+ std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
+ AI->getAlignment());
+
+ TySize *= CUI->getZExtValue(); // Get total allocated size.
+ if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects.
+ StaticAllocaMap[AI] =
+ MF->getFrameInfo()->CreateStackObject(TySize, Align);
+ }
+
+ for (; BB != EB; ++BB)
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (!I->use_empty() && isUsedOutsideOfDefiningBlock(I))
+ if (!isa<AllocaInst>(I) ||
+ !StaticAllocaMap.count(cast<AllocaInst>(I)))
+ InitializeRegForValue(I);
+
+ // Create an initial MachineBasicBlock for each LLVM BasicBlock in F. This
+ // also creates the initial PHI MachineInstrs, though none of the input
+ // operands are populated.
+ for (BB = Fn->begin(), EB = Fn->end(); BB != EB; ++BB) {
+ MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(BB);
+ MBBMap[BB] = MBB;
+ MF->push_back(MBB);
+
+ // Create Machine PHI nodes for LLVM PHI nodes, lowering them as
+ // appropriate.
+ PHINode *PN;
+ DebugLoc DL;
+ for (BasicBlock::iterator
+ I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (Function *F = CI->getCalledFunction()) {
+ switch (F->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::dbg_stoppoint: {
+ DbgStopPointInst *SPI = cast<DbgStopPointInst>(I);
+
+ if (DIDescriptor::ValidDebugInfo(SPI->getContext(),
+ CodeGenOpt::Default)) {
+ DICompileUnit CU(cast<GlobalVariable>(SPI->getContext()));
+ unsigned idx = MF->getOrCreateDebugLocID(CU.getGV(),
+ SPI->getLine(),
+ SPI->getColumn());
+ DL = DebugLoc::get(idx);
+ }
+
+ break;
+ }
+ case Intrinsic::dbg_func_start: {
+ DbgFuncStartInst *FSI = cast<DbgFuncStartInst>(I);
+ Value *SP = FSI->getSubprogram();
+
+ if (DIDescriptor::ValidDebugInfo(SP, CodeGenOpt::Default)) {
+ DISubprogram Subprogram(cast<GlobalVariable>(SP));
+ DICompileUnit CU(Subprogram.getCompileUnit());
+ unsigned Line = Subprogram.getLineNumber();
+ DL = DebugLoc::get(MF->getOrCreateDebugLocID(CU.getGV(),
+ Line, 0));
+ }
+
+ break;
+ }
+ }
+ }
+ }
+
+ PN = dyn_cast<PHINode>(I);
+ if (!PN || PN->use_empty()) continue;
+
+ unsigned PHIReg = ValueMap[PN];
+ assert(PHIReg && "PHI node does not have an assigned virtual register!");
+
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+ for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
+ MVT VT = ValueVTs[vti];
+ unsigned NumRegisters = TLI.getNumRegisters(VT);
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+ for (unsigned i = 0; i != NumRegisters; ++i)
+ BuildMI(MBB, DL, TII->get(TargetInstrInfo::PHI), PHIReg + i);
+ PHIReg += NumRegisters;
+ }
+ }
+ }
+}
+
+unsigned FunctionLoweringInfo::MakeReg(MVT VT) {
+ return RegInfo->createVirtualRegister(TLI.getRegClassFor(VT));
+}
+
+/// CreateRegForValue - Allocate the appropriate number of virtual registers of
+/// the correctly promoted or expanded types. Assign these registers
+/// consecutive vreg numbers and return the first assigned number.
+///
+/// In the case that the given value has struct or array type, this function
+/// will assign registers for each member or element.
+///
+unsigned FunctionLoweringInfo::CreateRegForValue(const Value *V) {
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, V->getType(), ValueVTs);
+
+ unsigned FirstReg = 0;
+ for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ MVT ValueVT = ValueVTs[Value];
+ MVT RegisterVT = TLI.getRegisterType(ValueVT);
+
+ unsigned NumRegs = TLI.getNumRegisters(ValueVT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ unsigned R = MakeReg(RegisterVT);
+ if (!FirstReg) FirstReg = R;
+ }
+ }
+ return FirstReg;
+}
+
+/// getCopyFromParts - Create a value that contains the specified legal parts
+/// combined into the value they represent. If the parts combine to a type
+/// larger than ValueVT, then AssertOp can be used to specify whether the extra
+/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
+/// (ISD::AssertSext).
+static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc dl,
+ const SDValue *Parts,
+ unsigned NumParts, MVT PartVT, MVT ValueVT,
+ ISD::NodeType AssertOp = ISD::DELETED_NODE) {
+ assert(NumParts > 0 && "No parts to assemble!");
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Val = Parts[0];
+
+ if (NumParts > 1) {
+ // Assemble the value from multiple parts.
+ if (!ValueVT.isVector() && ValueVT.isInteger()) {
+ unsigned PartBits = PartVT.getSizeInBits();
+ unsigned ValueBits = ValueVT.getSizeInBits();
+
+ // Assemble the power of 2 part.
+ unsigned RoundParts = NumParts & (NumParts - 1) ?
+ 1 << Log2_32(NumParts) : NumParts;
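+      // RoundParts is the largest power of two not exceeding NumParts; e.g.
+      // six parts round down to four, and the two leftover parts are
+      // combined in afterwards.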
+ unsigned RoundBits = PartBits * RoundParts;
+ MVT RoundVT = RoundBits == ValueBits ?
+ ValueVT : MVT::getIntegerVT(RoundBits);
+ SDValue Lo, Hi;
+
+ MVT HalfVT = MVT::getIntegerVT(RoundBits/2);
+
+ if (RoundParts > 2) {
+ Lo = getCopyFromParts(DAG, dl, Parts, RoundParts/2, PartVT, HalfVT);
+ Hi = getCopyFromParts(DAG, dl, Parts+RoundParts/2, RoundParts/2,
+ PartVT, HalfVT);
+ } else {
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, HalfVT, Parts[0]);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HalfVT, Parts[1]);
+ }
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+ Val = DAG.getNode(ISD::BUILD_PAIR, dl, RoundVT, Lo, Hi);
+
+ if (RoundParts < NumParts) {
+ // Assemble the trailing non-power-of-2 part.
+ unsigned OddParts = NumParts - RoundParts;
+ MVT OddVT = MVT::getIntegerVT(OddParts * PartBits);
+ Hi = getCopyFromParts(DAG, dl,
+ Parts+RoundParts, OddParts, PartVT, OddVT);
+
+ // Combine the round and odd parts.
+ Lo = Val;
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+ MVT TotalVT = MVT::getIntegerVT(NumParts * PartBits);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, dl, TotalVT, Hi);
+ Hi = DAG.getNode(ISD::SHL, dl, TotalVT, Hi,
+ DAG.getConstant(Lo.getValueType().getSizeInBits(),
+ TLI.getPointerTy()));
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, TotalVT, Lo);
+ Val = DAG.getNode(ISD::OR, dl, TotalVT, Lo, Hi);
+ }
+ } else if (ValueVT.isVector()) {
+ // Handle a multi-element vector.
+ MVT IntermediateVT, RegisterVT;
+ unsigned NumIntermediates;
+ unsigned NumRegs =
+ TLI.getVectorTypeBreakdown(ValueVT, IntermediateVT, NumIntermediates,
+ RegisterVT);
+ assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+ NumParts = NumRegs; // Silence a compiler warning.
+ assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+ assert(RegisterVT == Parts[0].getValueType() &&
+ "Part type doesn't match part!");
+
+ // Assemble the parts into intermediate operands.
+ SmallVector<SDValue, 8> Ops(NumIntermediates);
+ if (NumIntermediates == NumParts) {
+ // If the register was not expanded, truncate or copy the value,
+ // as appropriate.
+ for (unsigned i = 0; i != NumParts; ++i)
+ Ops[i] = getCopyFromParts(DAG, dl, &Parts[i], 1,
+ PartVT, IntermediateVT);
+ } else if (NumParts > 0) {
+ // If the intermediate type was expanded, build the intermediate operands
+ // from the parts.
+ assert(NumParts % NumIntermediates == 0 &&
+ "Must expand into a divisible number of parts!");
+ unsigned Factor = NumParts / NumIntermediates;
+ for (unsigned i = 0; i != NumIntermediates; ++i)
+ Ops[i] = getCopyFromParts(DAG, dl, &Parts[i * Factor], Factor,
+ PartVT, IntermediateVT);
+ }
+
+ // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the intermediate
+ // operands.
+ Val = DAG.getNode(IntermediateVT.isVector() ?
+ ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, dl,
+ ValueVT, &Ops[0], NumIntermediates);
+ } else if (PartVT.isFloatingPoint()) {
+ // FP split into multiple FP parts (for ppcf128)
+ assert(ValueVT == MVT(MVT::ppcf128) && PartVT == MVT(MVT::f64) &&
+ "Unexpected split");
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::BIT_CONVERT, dl, MVT(MVT::f64), Parts[0]);
+ Hi = DAG.getNode(ISD::BIT_CONVERT, dl, MVT(MVT::f64), Parts[1]);
+ if (TLI.isBigEndian())
+ std::swap(Lo, Hi);
+ Val = DAG.getNode(ISD::BUILD_PAIR, dl, ValueVT, Lo, Hi);
+ } else {
+ // FP split into integer parts (soft fp)
+ assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
+ !PartVT.isVector() && "Unexpected split");
+ MVT IntVT = MVT::getIntegerVT(ValueVT.getSizeInBits());
+ Val = getCopyFromParts(DAG, dl, Parts, NumParts, PartVT, IntVT);
+ }
+ }
+
+ // There is now one part, held in Val. Correct it to match ValueVT.
+ PartVT = Val.getValueType();
+
+ if (PartVT == ValueVT)
+ return Val;
+
+ if (PartVT.isVector()) {
+ assert(ValueVT.isVector() && "Unknown vector conversion!");
+ return DAG.getNode(ISD::BIT_CONVERT, dl, ValueVT, Val);
+ }
+
+ if (ValueVT.isVector()) {
+ assert(ValueVT.getVectorElementType() == PartVT &&
+ ValueVT.getVectorNumElements() == 1 &&
+ "Only trivial scalar-to-vector conversions should get here!");
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ValueVT, Val);
+ }
+
+ if (PartVT.isInteger() &&
+ ValueVT.isInteger()) {
+ if (ValueVT.bitsLT(PartVT)) {
+ // For a truncate, see if we have any information to
+ // indicate whether the truncated bits will always be
+ // zero or sign-extension.
+ if (AssertOp != ISD::DELETED_NODE)
+ Val = DAG.getNode(AssertOp, dl, PartVT, Val,
+ DAG.getValueType(ValueVT));
+ return DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val);
+ } else {
+ return DAG.getNode(ISD::ANY_EXTEND, dl, ValueVT, Val);
+ }
+ }
+
+ if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+ if (ValueVT.bitsLT(Val.getValueType()))
+ // FP_ROUND's are always exact here.
+ return DAG.getNode(ISD::FP_ROUND, dl, ValueVT, Val,
+ DAG.getIntPtrConstant(1));
+ return DAG.getNode(ISD::FP_EXTEND, dl, ValueVT, Val);
+ }
+
+ if (PartVT.getSizeInBits() == ValueVT.getSizeInBits())
+ return DAG.getNode(ISD::BIT_CONVERT, dl, ValueVT, Val);
+
+ assert(0 && "Unknown mismatch!");
+ return SDValue();
+}
+
+/// getCopyToParts - Create a series of nodes that contain the specified value
+/// split into legal parts. If the parts contain more bits than Val, then, for
+/// integers, ExtendKind can be used to specify how to generate the extra bits.
+static void getCopyToParts(SelectionDAG &DAG, DebugLoc dl, SDValue Val,
+ SDValue *Parts, unsigned NumParts, MVT PartVT,
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy();
+ MVT ValueVT = Val.getValueType();
+ unsigned PartBits = PartVT.getSizeInBits();
+ unsigned OrigNumParts = NumParts;
+ assert(TLI.isTypeLegal(PartVT) && "Copying to an illegal type!");
+
+ if (!NumParts)
+ return;
+
+ if (!ValueVT.isVector()) {
+ if (PartVT == ValueVT) {
+ assert(NumParts == 1 && "No-op copy with multiple parts!");
+ Parts[0] = Val;
+ return;
+ }
+
+ if (NumParts * PartBits > ValueVT.getSizeInBits()) {
+ // If the parts cover more bits than the value has, promote the value.
+ if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+ assert(NumParts == 1 && "Do not know what to promote to!");
+ Val = DAG.getNode(ISD::FP_EXTEND, dl, PartVT, Val);
+ } else if (PartVT.isInteger() && ValueVT.isInteger()) {
+ ValueVT = MVT::getIntegerVT(NumParts * PartBits);
+ Val = DAG.getNode(ExtendKind, dl, ValueVT, Val);
+ } else {
+ assert(0 && "Unknown mismatch!");
+ }
+ } else if (PartBits == ValueVT.getSizeInBits()) {
+ // Different types of the same size.
+ assert(NumParts == 1 && PartVT != ValueVT);
+ Val = DAG.getNode(ISD::BIT_CONVERT, dl, PartVT, Val);
+ } else if (NumParts * PartBits < ValueVT.getSizeInBits()) {
+      // If the parts cover fewer bits than the value has, truncate the value.
+ if (PartVT.isInteger() && ValueVT.isInteger()) {
+ ValueVT = MVT::getIntegerVT(NumParts * PartBits);
+ Val = DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val);
+ } else {
+ assert(0 && "Unknown mismatch!");
+ }
+ }
+
+ // The value may have changed - recompute ValueVT.
+ ValueVT = Val.getValueType();
+ assert(NumParts * PartBits == ValueVT.getSizeInBits() &&
+ "Failed to tile the value with PartVT!");
+
+ if (NumParts == 1) {
+ assert(PartVT == ValueVT && "Type conversion failed!");
+ Parts[0] = Val;
+ return;
+ }
+
+ // Expand the value into multiple parts.
+ if (NumParts & (NumParts - 1)) {
+ // The number of parts is not a power of 2. Split off and copy the tail.
+ assert(PartVT.isInteger() && ValueVT.isInteger() &&
+ "Do not know what to expand to!");
+ unsigned RoundParts = 1 << Log2_32(NumParts);
+ unsigned RoundBits = RoundParts * PartBits;
+ unsigned OddParts = NumParts - RoundParts;
+ SDValue OddVal = DAG.getNode(ISD::SRL, dl, ValueVT, Val,
+ DAG.getConstant(RoundBits,
+ TLI.getPointerTy()));
+ getCopyToParts(DAG, dl, OddVal, Parts + RoundParts, OddParts, PartVT);
+ if (TLI.isBigEndian())
+ // The odd parts were reversed by getCopyToParts - unreverse them.
+ std::reverse(Parts + RoundParts, Parts + NumParts);
+ NumParts = RoundParts;
+ ValueVT = MVT::getIntegerVT(NumParts * PartBits);
+ Val = DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val);
+ }
+
+ // The number of parts is a power of 2. Repeatedly bisect the value using
+ // EXTRACT_ELEMENT.
+ Parts[0] = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::getIntegerVT(ValueVT.getSizeInBits()),
+ Val);
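+    // For example, an i128 split into four i32 parts is first bisected into
+    // two i64 halves, and each half is bisected into two i32s on the next
+    // pass of the loop.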
+ for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) {
+ for (unsigned i = 0; i < NumParts; i += StepSize) {
+ unsigned ThisBits = StepSize * PartBits / 2;
+        MVT ThisVT = MVT::getIntegerVT(ThisBits);
+ SDValue &Part0 = Parts[i];
+ SDValue &Part1 = Parts[i+StepSize/2];
+
+ Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ ThisVT, Part0,
+ DAG.getConstant(1, PtrVT));
+ Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ ThisVT, Part0,
+ DAG.getConstant(0, PtrVT));
+
+ if (ThisBits == PartBits && ThisVT != PartVT) {
+ Part0 = DAG.getNode(ISD::BIT_CONVERT, dl,
+ PartVT, Part0);
+ Part1 = DAG.getNode(ISD::BIT_CONVERT, dl,
+ PartVT, Part1);
+ }
+ }
+ }
+
+ if (TLI.isBigEndian())
+ std::reverse(Parts, Parts + OrigNumParts);
+
+ return;
+ }
+
+ // Vector ValueVT.
+ if (NumParts == 1) {
+ if (PartVT != ValueVT) {
+ if (PartVT.isVector()) {
+ Val = DAG.getNode(ISD::BIT_CONVERT, dl, PartVT, Val);
+ } else {
+ assert(ValueVT.getVectorElementType() == PartVT &&
+ ValueVT.getVectorNumElements() == 1 &&
+ "Only trivial vector-to-scalar conversions should get here!");
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ PartVT, Val,
+ DAG.getConstant(0, PtrVT));
+ }
+ }
+
+ Parts[0] = Val;
+ return;
+ }
+
+ // Handle a multi-element vector.
+ MVT IntermediateVT, RegisterVT;
+ unsigned NumIntermediates;
+  unsigned NumRegs =
+    TLI.getVectorTypeBreakdown(ValueVT, IntermediateVT, NumIntermediates,
+                               RegisterVT);
+ unsigned NumElements = ValueVT.getVectorNumElements();
+
+ assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+ NumParts = NumRegs; // Silence a compiler warning.
+ assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+
+ // Split the vector into intermediate operands.
+ SmallVector<SDValue, 8> Ops(NumIntermediates);
+ for (unsigned i = 0; i != NumIntermediates; ++i)
+ if (IntermediateVT.isVector())
+ Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+ IntermediateVT, Val,
+ DAG.getConstant(i * (NumElements / NumIntermediates),
+ PtrVT));
+ else
+ Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ IntermediateVT, Val,
+ DAG.getConstant(i, PtrVT));
+
+ // Split the intermediate operands into legal parts.
+ if (NumParts == NumIntermediates) {
+ // If the register was not expanded, promote or copy the value,
+ // as appropriate.
+ for (unsigned i = 0; i != NumParts; ++i)
+ getCopyToParts(DAG, dl, Ops[i], &Parts[i], 1, PartVT);
+ } else if (NumParts > 0) {
+    // If the intermediate type was expanded, split each intermediate value
+    // into legal parts.
+ assert(NumParts % NumIntermediates == 0 &&
+ "Must expand into a divisible number of parts!");
+ unsigned Factor = NumParts / NumIntermediates;
+ for (unsigned i = 0; i != NumIntermediates; ++i)
+ getCopyToParts(DAG, dl, Ops[i], &Parts[i * Factor], Factor, PartVT);
+ }
+}
+
+
+void SelectionDAGLowering::init(GCFunctionInfo *gfi, AliasAnalysis &aa) {
+ AA = &aa;
+ GFI = gfi;
+ TD = DAG.getTarget().getTargetData();
+}
+
+/// clear - Clear out the current SelectionDAG and the associated
+/// state and prepare this SelectionDAGLowering object to be used
+/// for a new block. This doesn't clear out information about
+/// additional blocks that are needed to complete switch lowering
+/// or PHI node updating; that information is cleared out as it is
+/// consumed.
+void SelectionDAGLowering::clear() {
+ NodeMap.clear();
+ PendingLoads.clear();
+ PendingExports.clear();
+ DAG.clear();
+ CurDebugLoc = DebugLoc::getUnknownLoc();
+}
+
+/// getRoot - Return the current virtual root of the Selection DAG,
+/// flushing any PendingLoad items. This must be done before emitting
+/// a store or any other node that may need to be ordered after any
+/// prior load instructions.
+///
+SDValue SelectionDAGLowering::getRoot() {
+ if (PendingLoads.empty())
+ return DAG.getRoot();
+
+ if (PendingLoads.size() == 1) {
+ SDValue Root = PendingLoads[0];
+ DAG.setRoot(Root);
+ PendingLoads.clear();
+ return Root;
+ }
+
+ // Otherwise, we have to make a token factor node.
+ SDValue Root = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+ &PendingLoads[0], PendingLoads.size());
+ PendingLoads.clear();
+ DAG.setRoot(Root);
+ return Root;
+}
+
+/// getControlRoot - Similar to getRoot, but instead of flushing all the
+/// PendingLoad items, flush all the PendingExports items. It is necessary
+/// to do this before emitting a terminator instruction.
+///
+SDValue SelectionDAGLowering::getControlRoot() {
+ SDValue Root = DAG.getRoot();
+
+ if (PendingExports.empty())
+ return Root;
+
+ // Turn all of the CopyToReg chains into one factored node.
+ if (Root.getOpcode() != ISD::EntryToken) {
+ unsigned i = 0, e = PendingExports.size();
+ for (; i != e; ++i) {
+ assert(PendingExports[i].getNode()->getNumOperands() > 1);
+ if (PendingExports[i].getNode()->getOperand(0) == Root)
+ break; // Don't add the root if we already indirectly depend on it.
+ }
+
+ if (i == e)
+ PendingExports.push_back(Root);
+ }
+
+ Root = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+ &PendingExports[0],
+ PendingExports.size());
+ PendingExports.clear();
+ DAG.setRoot(Root);
+ return Root;
+}
+
+void SelectionDAGLowering::visit(Instruction &I) {
+ visit(I.getOpcode(), I);
+}
+
+void SelectionDAGLowering::visit(unsigned Opcode, User &I) {
+ // Note: this doesn't use InstVisitor, because it has to work with
+  // ConstantExprs in addition to instructions.
+ switch (Opcode) {
+ default: assert(0 && "Unknown instruction type encountered!");
+ abort();
+ // Build the switch statement using the Instruction.def file.
+#define HANDLE_INST(NUM, OPCODE, CLASS) \
+ case Instruction::OPCODE:return visit##OPCODE((CLASS&)I);
+#include "llvm/Instruction.def"
+ }
+}
+
+void SelectionDAGLowering::visitAdd(User &I) {
+ if (I.getType()->isFPOrFPVector())
+ visitBinary(I, ISD::FADD);
+ else
+ visitBinary(I, ISD::ADD);
+}
+
+void SelectionDAGLowering::visitMul(User &I) {
+ if (I.getType()->isFPOrFPVector())
+ visitBinary(I, ISD::FMUL);
+ else
+ visitBinary(I, ISD::MUL);
+}
+
+SDValue SelectionDAGLowering::getValue(const Value *V) {
+ SDValue &N = NodeMap[V];
+ if (N.getNode()) return N;
+
+ if (Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V))) {
+ MVT VT = TLI.getValueType(V->getType(), true);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
+ return N = DAG.getConstant(*CI, VT);
+
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return N = DAG.getGlobalAddress(GV, VT);
+
+ if (isa<ConstantPointerNull>(C))
+ return N = DAG.getConstant(0, TLI.getPointerTy());
+
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return N = DAG.getConstantFP(*CFP, VT);
+
+ if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
+ return N = DAG.getUNDEF(VT);
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ visit(CE->getOpcode(), *CE);
+ SDValue N1 = NodeMap[V];
+ assert(N1.getNode() && "visit didn't populate the ValueMap!");
+ return N1;
+ }
+
+ if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
+ SmallVector<SDValue, 4> Constants;
+ for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
+ OI != OE; ++OI) {
+ SDNode *Val = getValue(*OI).getNode();
+ for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
+ Constants.push_back(SDValue(Val, i));
+ }
+ return DAG.getMergeValues(&Constants[0], Constants.size(),
+ getCurDebugLoc());
+ }
+
+ if (isa<StructType>(C->getType()) || isa<ArrayType>(C->getType())) {
+ assert((isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) &&
+ "Unknown struct or array constant!");
+
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, C->getType(), ValueVTs);
+ unsigned NumElts = ValueVTs.size();
+ if (NumElts == 0)
+ return SDValue(); // empty struct
+ SmallVector<SDValue, 4> Constants(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ MVT EltVT = ValueVTs[i];
+ if (isa<UndefValue>(C))
+ Constants[i] = DAG.getUNDEF(EltVT);
+ else if (EltVT.isFloatingPoint())
+ Constants[i] = DAG.getConstantFP(0, EltVT);
+ else
+ Constants[i] = DAG.getConstant(0, EltVT);
+ }
+ return DAG.getMergeValues(&Constants[0], NumElts, getCurDebugLoc());
+ }
+
+ const VectorType *VecTy = cast<VectorType>(V->getType());
+ unsigned NumElements = VecTy->getNumElements();
+
+ // Now that we know the number and type of the elements, get that number of
+ // elements into the Ops array based on what kind of constant it is.
+ SmallVector<SDValue, 16> Ops;
+ if (ConstantVector *CP = dyn_cast<ConstantVector>(C)) {
+ for (unsigned i = 0; i != NumElements; ++i)
+ Ops.push_back(getValue(CP->getOperand(i)));
+ } else {
+ assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
+ MVT EltVT = TLI.getValueType(VecTy->getElementType());
+
+ SDValue Op;
+ if (EltVT.isFloatingPoint())
+ Op = DAG.getConstantFP(0, EltVT);
+ else
+ Op = DAG.getConstant(0, EltVT);
+ Ops.assign(NumElements, Op);
+ }
+
+ // Create a BUILD_VECTOR node.
+ return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+ VT, &Ops[0], Ops.size());
+ }
+
+ // If this is a static alloca, generate it as the frameindex instead of
+ // computation.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end())
+ return DAG.getFrameIndex(SI->second, TLI.getPointerTy());
+ }
+
+ unsigned InReg = FuncInfo.ValueMap[V];
+ assert(InReg && "Value not in map!");
+
+ RegsForValue RFV(TLI, InReg, V->getType());
+ SDValue Chain = DAG.getEntryNode();
+ return RFV.getCopyFromRegs(DAG, getCurDebugLoc(), Chain, NULL);
+}
+
+
+void SelectionDAGLowering::visitRet(ReturnInst &I) {
+ if (I.getNumOperands() == 0) {
+ DAG.setRoot(DAG.getNode(ISD::RET, getCurDebugLoc(),
+ MVT::Other, getControlRoot()));
+ return;
+ }
+
+ SmallVector<SDValue, 8> NewValues;
+ NewValues.push_back(getControlRoot());
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, I.getOperand(i)->getType(), ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0) continue;
+
+ SDValue RetOp = getValue(I.getOperand(i));
+ for (unsigned j = 0, f = NumValues; j != f; ++j) {
+ MVT VT = ValueVTs[j];
+
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+ const Function *F = I.getParent()->getParent();
+ if (F->paramHasAttr(0, Attribute::SExt))
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (F->paramHasAttr(0, Attribute::ZExt))
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ // FIXME: C calling convention requires the return type to be promoted to
+ // at least 32-bit. But this is not necessary for non-C calling
+ // conventions. The frontend should mark functions whose return values
+ // require promoting with signext or zeroext attributes.
+ if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) {
+ MVT MinVT = TLI.getRegisterType(MVT::i32);
+ if (VT.bitsLT(MinVT))
+ VT = MinVT;
+ }
+
+ unsigned NumParts = TLI.getNumRegisters(VT);
+ MVT PartVT = TLI.getRegisterType(VT);
+ SmallVector<SDValue, 4> Parts(NumParts);
+ getCopyToParts(DAG, getCurDebugLoc(),
+ SDValue(RetOp.getNode(), RetOp.getResNo() + j),
+ &Parts[0], NumParts, PartVT, ExtendKind);
+
+ // 'inreg' on function refers to return value
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ if (F->paramHasAttr(0, Attribute::InReg))
+ Flags.setInReg();
+ for (unsigned i = 0; i < NumParts; ++i) {
+ NewValues.push_back(Parts[i]);
+ NewValues.push_back(DAG.getArgFlags(Flags));
+ }
+ }
+ }
+ DAG.setRoot(DAG.getNode(ISD::RET, getCurDebugLoc(), MVT::Other,
+ &NewValues[0], NewValues.size()));
+}
+
+/// CopyToExportRegsIfNeeded - If the given value has virtual registers
+/// created for it, emit nodes to copy the value into the virtual
+/// registers.
+void SelectionDAGLowering::CopyToExportRegsIfNeeded(Value *V) {
+ if (!V->use_empty()) {
+ DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
+ if (VMI != FuncInfo.ValueMap.end())
+ CopyValueToVirtualRegister(V, VMI->second);
+ }
+}
+
+/// ExportFromCurrentBlock - If this condition isn't known to be exported from
+/// the current basic block, add it to ValueMap now so that we'll get a
+/// CopyTo/FromReg.
+void SelectionDAGLowering::ExportFromCurrentBlock(Value *V) {
+ // No need to export constants.
+ if (!isa<Instruction>(V) && !isa<Argument>(V)) return;
+
+ // Already exported?
+ if (FuncInfo.isExportedInst(V)) return;
+
+ unsigned Reg = FuncInfo.InitializeRegForValue(V);
+ CopyValueToVirtualRegister(V, Reg);
+}
+
+bool SelectionDAGLowering::isExportableFromCurrentBlock(Value *V,
+ const BasicBlock *FromBB) {
+ // The operands of the setcc have to be in this block. We don't know
+ // how to export them from some other block.
+ if (Instruction *VI = dyn_cast<Instruction>(V)) {
+ // Can export from current BB.
+ if (VI->getParent() == FromBB)
+ return true;
+
+ // Is already exported, noop.
+ return FuncInfo.isExportedInst(V);
+ }
+
+ // If this is an argument, we can export it if the BB is the entry block or
+ // if it is already exported.
+ if (isa<Argument>(V)) {
+ if (FromBB == &FromBB->getParent()->getEntryBlock())
+ return true;
+
+ // Otherwise, can only export this if it is already exported.
+ return FuncInfo.isExportedInst(V);
+ }
+
+ // Otherwise, constants can always be exported.
+ return true;
+}
+
+static bool InBlock(const Value *V, const BasicBlock *BB) {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() == BB;
+ return true;
+}
+
+/// getFCmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR floating-point condition code. This includes
+/// consideration of global floating-point math flags.
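+/// For example, FCMP_UEQ normally lowers to SETUEQ, but when finite-only
+/// FP math is enabled (no NaNs), unordered results cannot occur and the
+/// ordered form SETEQ is used instead.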
+///
+static ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred) {
+ ISD::CondCode FPC, FOC;
+ switch (Pred) {
+ case FCmpInst::FCMP_FALSE: FOC = FPC = ISD::SETFALSE; break;
+ case FCmpInst::FCMP_OEQ: FOC = ISD::SETEQ; FPC = ISD::SETOEQ; break;
+ case FCmpInst::FCMP_OGT: FOC = ISD::SETGT; FPC = ISD::SETOGT; break;
+ case FCmpInst::FCMP_OGE: FOC = ISD::SETGE; FPC = ISD::SETOGE; break;
+ case FCmpInst::FCMP_OLT: FOC = ISD::SETLT; FPC = ISD::SETOLT; break;
+ case FCmpInst::FCMP_OLE: FOC = ISD::SETLE; FPC = ISD::SETOLE; break;
+ case FCmpInst::FCMP_ONE: FOC = ISD::SETNE; FPC = ISD::SETONE; break;
+ case FCmpInst::FCMP_ORD: FOC = FPC = ISD::SETO; break;
+ case FCmpInst::FCMP_UNO: FOC = FPC = ISD::SETUO; break;
+ case FCmpInst::FCMP_UEQ: FOC = ISD::SETEQ; FPC = ISD::SETUEQ; break;
+ case FCmpInst::FCMP_UGT: FOC = ISD::SETGT; FPC = ISD::SETUGT; break;
+ case FCmpInst::FCMP_UGE: FOC = ISD::SETGE; FPC = ISD::SETUGE; break;
+ case FCmpInst::FCMP_ULT: FOC = ISD::SETLT; FPC = ISD::SETULT; break;
+ case FCmpInst::FCMP_ULE: FOC = ISD::SETLE; FPC = ISD::SETULE; break;
+ case FCmpInst::FCMP_UNE: FOC = ISD::SETNE; FPC = ISD::SETUNE; break;
+ case FCmpInst::FCMP_TRUE: FOC = FPC = ISD::SETTRUE; break;
+ default:
+ assert(0 && "Invalid FCmp predicate opcode!");
+ FOC = FPC = ISD::SETFALSE;
+ break;
+ }
+ if (FiniteOnlyFPMath())
+ return FOC;
+ else
+ return FPC;
+}
+
+/// getICmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR integer condition code.
+///
+static ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred) {
+ switch (Pred) {
+ case ICmpInst::ICMP_EQ: return ISD::SETEQ;
+ case ICmpInst::ICMP_NE: return ISD::SETNE;
+ case ICmpInst::ICMP_SLE: return ISD::SETLE;
+ case ICmpInst::ICMP_ULE: return ISD::SETULE;
+ case ICmpInst::ICMP_SGE: return ISD::SETGE;
+ case ICmpInst::ICMP_UGE: return ISD::SETUGE;
+ case ICmpInst::ICMP_SLT: return ISD::SETLT;
+ case ICmpInst::ICMP_ULT: return ISD::SETULT;
+ case ICmpInst::ICMP_SGT: return ISD::SETGT;
+ case ICmpInst::ICMP_UGT: return ISD::SETUGT;
+ default:
+ assert(0 && "Invalid ICmp predicate opcode!");
+ return ISD::SETNE;
+ }
+}
+
+/// EmitBranchForMergedCondition - Helper method for FindMergedConditions.
+/// This function emits a branch and is used at the leaves of an OR or an
+/// AND operator tree.
+///
+void
+SelectionDAGLowering::EmitBranchForMergedCondition(Value *Cond,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB) {
+ const BasicBlock *BB = CurBB->getBasicBlock();
+
+ // If the leaf of the tree is a comparison, merge the condition into
+ // the caseblock.
+ if (CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
+ // The operands of the cmp have to be in this block. We don't know
+ // how to export them from some other block. If this is the first block
+ // of the sequence, no exporting is needed.
+ if (CurBB == CurMBB ||
+ (isExportableFromCurrentBlock(BOp->getOperand(0), BB) &&
+ isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
+ ISD::CondCode Condition;
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
+ Condition = getICmpCondCode(IC->getPredicate());
+ } else if (FCmpInst *FC = dyn_cast<FCmpInst>(Cond)) {
+ Condition = getFCmpCondCode(FC->getPredicate());
+ } else {
+ Condition = ISD::SETEQ; // silence warning.
+ assert(0 && "Unknown compare instruction");
+ }
+
+ CaseBlock CB(Condition, BOp->getOperand(0),
+ BOp->getOperand(1), NULL, TBB, FBB, CurBB);
+ SwitchCases.push_back(CB);
+ return;
+ }
+ }
+
+ // Create a CaseBlock record representing this branch.
+ CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(),
+ NULL, TBB, FBB, CurBB);
+ SwitchCases.push_back(CB);
+}
+
+/// FindMergedConditions - If Cond is an expression like (X && Y) or (X || Y),
+/// recursively lower the two subexpressions, emitting a branch for each leaf
+/// of the and/or tree.
+void SelectionDAGLowering::FindMergedConditions(Value *Cond,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB,
+ unsigned Opc) {
+ // If this node is not part of the or/and tree, emit it as a branch.
+ Instruction *BOp = dyn_cast<Instruction>(Cond);
+ if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) ||
+ (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() ||
+ BOp->getParent() != CurBB->getBasicBlock() ||
+ !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) ||
+ !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
+ EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB);
+ return;
+ }
+
+ // Create TmpBB after CurBB.
+ MachineFunction::iterator BBI = CurBB;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock());
+ CurBB->getParent()->insert(++BBI, TmpBB);
+
+ if (Opc == Instruction::Or) {
+ // Codegen X | Y as:
+ // jmp_if_X TBB
+ // jmp TmpBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+
+ // Emit the LHS condition.
+ FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, Opc);
+
+ // Emit the RHS condition into TmpBB.
+ FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, Opc);
+ } else {
+ assert(Opc == Instruction::And && "Unknown merge op!");
+ // Codegen X & Y as:
+ // jmp_if_X TmpBB
+ // jmp FBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+ // This requires creation of TmpBB after CurBB.
+
+ // Emit the LHS condition.
+ FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, Opc);
+
+ // Emit the RHS condition into TmpBB.
+ FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, Opc);
+ }
+}
+
+/// If the set of cases should be emitted as a series of branches, return true.
+/// If we should emit this as a bunch of and/or'd together conditions, return
+/// false.
+bool
+SelectionDAGLowering::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases){
+ if (Cases.size() != 2) return true;
+
+ // If this is two comparisons of the same values or'd or and'd together, they
+ // will get folded into a single comparison, so don't emit two blocks.
+ if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
+ Cases[0].CmpRHS == Cases[1].CmpRHS) ||
+ (Cases[0].CmpRHS == Cases[1].CmpLHS &&
+ Cases[0].CmpLHS == Cases[1].CmpRHS)) {
+ return false;
+ }
+
+ return true;
+}
+
+void SelectionDAGLowering::visitBr(BranchInst &I) {
+ // Update machine-CFG edges.
+ MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)];
+
+ // Figure out which block is immediately after the current one.
+ MachineBasicBlock *NextBlock = 0;
+ MachineFunction::iterator BBI = CurMBB;
+ if (++BBI != CurMBB->getParent()->end())
+ NextBlock = BBI;
+
+ if (I.isUnconditional()) {
+ // Update machine-CFG edges.
+ CurMBB->addSuccessor(Succ0MBB);
+
+ // If this is not a fall-through branch, emit the branch.
+ if (Succ0MBB != NextBlock)
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+ MVT::Other, getControlRoot(),
+ DAG.getBasicBlock(Succ0MBB)));
+ return;
+ }
+
+ // If this condition is one of the special cases we handle, do special stuff
+ // now.
+ Value *CondVal = I.getCondition();
+ MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)];
+
+ // If this is a series of conditions that are or'd or and'd together, emit
+ // this as a sequence of branches instead of setcc's with and/or operations.
+ // For example, instead of something like:
+ // cmp A, B
+ // C = seteq
+ // cmp D, E
+ // F = setle
+ // or C, F
+ // jnz foo
+ // Emit:
+ // cmp A, B
+ // je foo
+ // cmp D, E
+ // jle foo
+ //
+ if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
+ if (BOp->hasOneUse() &&
+ (BOp->getOpcode() == Instruction::And ||
+ BOp->getOpcode() == Instruction::Or)) {
+ FindMergedConditions(BOp, Succ0MBB, Succ1MBB, CurMBB, BOp->getOpcode());
+ // If the compares in later blocks need to use values not currently
+ // exported from this block, export them now. This block should always
+ // be the first entry.
+ assert(SwitchCases[0].ThisBB == CurMBB && "Unexpected lowering!");
+
+ // Allow some cases to be rejected.
+ if (ShouldEmitAsBranches(SwitchCases)) {
+ for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) {
+ ExportFromCurrentBlock(SwitchCases[i].CmpLHS);
+ ExportFromCurrentBlock(SwitchCases[i].CmpRHS);
+ }
+
+ // Emit the branch for this block.
+ visitSwitchCase(SwitchCases[0]);
+ SwitchCases.erase(SwitchCases.begin());
+ return;
+ }
+
+ // Okay, we decided not to do this, remove any inserted MBB's and clear
+ // SwitchCases.
+ for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i)
+ CurMBB->getParent()->erase(SwitchCases[i].ThisBB);
+
+ SwitchCases.clear();
+ }
+ }
+
+ // Create a CaseBlock record representing this branch.
+ CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(),
+ NULL, Succ0MBB, Succ1MBB, CurMBB);
+ // Use visitSwitchCase to actually insert the fast branch sequence for this
+ // cond branch.
+ visitSwitchCase(CB);
+}
+
+/// visitSwitchCase - Emits the necessary code to represent a single node in
+/// the binary search tree resulting from lowering a switch instruction.
+void SelectionDAGLowering::visitSwitchCase(CaseBlock &CB) {
+ SDValue Cond;
+ SDValue CondLHS = getValue(CB.CmpLHS);
+ DebugLoc dl = getCurDebugLoc();
+
+ // Build the setcc now.
+ if (CB.CmpMHS == NULL) {
+ // Fold "(X == true)" to X and "(X == false)" to !X to
+ // handle common cases produced by branch lowering.
+ if (CB.CmpRHS == ConstantInt::getTrue() && CB.CC == ISD::SETEQ)
+ Cond = CondLHS;
+ else if (CB.CmpRHS == ConstantInt::getFalse() && CB.CC == ISD::SETEQ) {
+ SDValue True = DAG.getConstant(1, CondLHS.getValueType());
+ Cond = DAG.getNode(ISD::XOR, dl, CondLHS.getValueType(), CondLHS, True);
+ } else
+ Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC);
+ } else {
+ assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");
+
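+    // A case range [Low, High] is lowered as one unsigned comparison:
+    // (X - Low) <=u (High - Low). For example, [5, 10] becomes
+    // (X - 5) <=u 5, so a single branch covers the whole range. When Low is
+    // the minimum signed value the subtraction is skipped and a plain
+    // X <=s High suffices.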
+ const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
+ const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue();
+
+ SDValue CmpOp = getValue(CB.CmpMHS);
+ MVT VT = CmpOp.getValueType();
+
+ if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
+ Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT),
+ ISD::SETLE);
+ } else {
+ SDValue SUB = DAG.getNode(ISD::SUB, dl,
+ VT, CmpOp, DAG.getConstant(Low, VT));
+ Cond = DAG.getSetCC(dl, MVT::i1, SUB,
+ DAG.getConstant(High-Low, VT), ISD::SETULE);
+ }
+ }
+
+ // Update successor info
+ CurMBB->addSuccessor(CB.TrueBB);
+ CurMBB->addSuccessor(CB.FalseBB);
+
+ // Set NextBlock to be the MBB immediately after the current one, if any.
+ // This is used to avoid emitting unnecessary branches to the next block.
+ MachineBasicBlock *NextBlock = 0;
+ MachineFunction::iterator BBI = CurMBB;
+ if (++BBI != CurMBB->getParent()->end())
+ NextBlock = BBI;
+
+ // If the lhs block is the next block, invert the condition so that we can
+ // fall through to the lhs instead of the rhs block.
+ if (CB.TrueBB == NextBlock) {
+ std::swap(CB.TrueBB, CB.FalseBB);
+ SDValue True = DAG.getConstant(1, Cond.getValueType());
+ Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, True);
+ }
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, getControlRoot(), Cond,
+ DAG.getBasicBlock(CB.TrueBB));
+
+ // If the branch was constant folded, fix up the CFG.
+ if (BrCond.getOpcode() == ISD::BR) {
+ CurMBB->removeSuccessor(CB.FalseBB);
+ DAG.setRoot(BrCond);
+ } else {
+ // Otherwise, go ahead and insert the false branch.
+ if (BrCond == getControlRoot())
+ CurMBB->removeSuccessor(CB.TrueBB);
+
+ if (CB.FalseBB == NextBlock)
+ DAG.setRoot(BrCond);
+ else
+ DAG.setRoot(DAG.getNode(ISD::BR, dl, MVT::Other, BrCond,
+ DAG.getBasicBlock(CB.FalseBB)));
+ }
+}
+
+/// visitJumpTable - Emit JumpTable node in the current MBB
+void SelectionDAGLowering::visitJumpTable(JumpTable &JT) {
+ // Emit the code for the jump table
+ assert(JT.Reg != -1U && "Should lower JT Header first!");
+ MVT PTy = TLI.getPointerTy();
+ SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(),
+ JT.Reg, PTy);
+ SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
+ DAG.setRoot(DAG.getNode(ISD::BR_JT, getCurDebugLoc(),
+ MVT::Other, Index.getValue(1),
+ Table, Index));
+}
+
+/// visitJumpTableHeader - This function emits the necessary code to produce
+/// an index into the jump table from the value being switched on.
+void SelectionDAGLowering::visitJumpTableHeader(JumpTable &JT,
+ JumpTableHeader &JTH) {
+ // Subtract the lowest switch case value from the value being switched on and
+ // conditional branch to default mbb if the result is greater than the
+ // difference between smallest and largest cases.
+ SDValue SwitchOp = getValue(JTH.SValue);
+ MVT VT = SwitchOp.getValueType();
+ SDValue SUB = DAG.getNode(ISD::SUB, getCurDebugLoc(), VT, SwitchOp,
+ DAG.getConstant(JTH.First, VT));
+
+  // The SDNode we just created, which holds the value being switched on minus
+  // the smallest case value, needs to be copied to a virtual register so it
+  // can be used as an index into the jump table in a subsequent basic block.
+  // This value may be smaller or larger than the target's pointer type, and
+  // therefore may require extension or truncation.
+ if (VT.bitsGT(TLI.getPointerTy()))
+ SwitchOp = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+ TLI.getPointerTy(), SUB);
+ else
+ SwitchOp = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+ TLI.getPointerTy(), SUB);
+
+ unsigned JumpTableReg = FuncInfo.MakeReg(TLI.getPointerTy());
+ SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(),
+ JumpTableReg, SwitchOp);
+ JT.Reg = JumpTableReg;
+
+ // Emit the range check for the jump table, and branch to the default block
+ // for the switch statement if the value being switched on exceeds the largest
+ // case in the switch.
+ SDValue CMP = DAG.getSetCC(getCurDebugLoc(),
+ TLI.getSetCCResultType(SUB.getValueType()), SUB,
+ DAG.getConstant(JTH.Last-JTH.First,VT),
+ ISD::SETUGT);
+
+ // Set NextBlock to be the MBB immediately after the current one, if any.
+ // This is used to avoid emitting unnecessary branches to the next block.
+ MachineBasicBlock *NextBlock = 0;
+ MachineFunction::iterator BBI = CurMBB;
+ if (++BBI != CurMBB->getParent()->end())
+ NextBlock = BBI;
+
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+ MVT::Other, CopyTo, CMP,
+ DAG.getBasicBlock(JT.Default));
+
+ if (JT.MBB == NextBlock)
+ DAG.setRoot(BrCond);
+ else
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrCond,
+ DAG.getBasicBlock(JT.MBB)));
+}
+
+/// visitBitTestHeader - This function emits the necessary code to produce a
+/// value suitable for "bit tests"
+void SelectionDAGLowering::visitBitTestHeader(BitTestBlock &B) {
+ // Subtract the minimum value
+ SDValue SwitchOp = getValue(B.SValue);
+ MVT VT = SwitchOp.getValueType();
+ SDValue SUB = DAG.getNode(ISD::SUB, getCurDebugLoc(), VT, SwitchOp,
+ DAG.getConstant(B.First, VT));
+
+ // Check range
+ SDValue RangeCmp = DAG.getSetCC(getCurDebugLoc(),
+ TLI.getSetCCResultType(SUB.getValueType()),
+ SUB, DAG.getConstant(B.Range, VT),
+ ISD::SETUGT);
+
+ SDValue ShiftOp;
+ if (VT.bitsGT(TLI.getPointerTy()))
+ ShiftOp = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+ TLI.getPointerTy(), SUB);
+ else
+ ShiftOp = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+ TLI.getPointerTy(), SUB);
+
+ B.Reg = FuncInfo.MakeReg(TLI.getPointerTy());
+ SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(),
+ B.Reg, ShiftOp);
+
+ // Set NextBlock to be the MBB immediately after the current one, if any.
+ // This is used to avoid emitting unnecessary branches to the next block.
+ MachineBasicBlock *NextBlock = 0;
+ MachineFunction::iterator BBI = CurMBB;
+ if (++BBI != CurMBB->getParent()->end())
+ NextBlock = BBI;
+
+ MachineBasicBlock* MBB = B.Cases[0].ThisBB;
+
+ CurMBB->addSuccessor(B.Default);
+ CurMBB->addSuccessor(MBB);
+
+ SDValue BrRange = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+ MVT::Other, CopyTo, RangeCmp,
+ DAG.getBasicBlock(B.Default));
+
+ if (MBB == NextBlock)
+ DAG.setRoot(BrRange);
+ else
+    DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrRange,
+                            DAG.getBasicBlock(MBB)));
+}
+
+/// visitBitTestCase - This function produces one "bit test"
+void SelectionDAGLowering::visitBitTestCase(MachineBasicBlock* NextMBB,
+ unsigned Reg,
+ BitTestCase &B) {
+ // Make desired shift
+ SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(), Reg,
+ TLI.getPointerTy());
+ SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurDebugLoc(),
+ TLI.getPointerTy(),
+ DAG.getConstant(1, TLI.getPointerTy()),
+ ShiftOp);
+
+ // Emit bit tests and jumps
+ SDValue AndOp = DAG.getNode(ISD::AND, getCurDebugLoc(),
+ TLI.getPointerTy(), SwitchVal,
+ DAG.getConstant(B.Mask, TLI.getPointerTy()));
+ SDValue AndCmp = DAG.getSetCC(getCurDebugLoc(),
+ TLI.getSetCCResultType(AndOp.getValueType()),
+ AndOp, DAG.getConstant(0, TLI.getPointerTy()),
+ ISD::SETNE);
+
+ CurMBB->addSuccessor(B.TargetBB);
+ CurMBB->addSuccessor(NextMBB);
+
+ SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+ MVT::Other, getControlRoot(),
+ AndCmp, DAG.getBasicBlock(B.TargetBB));
+
+ // Set NextBlock to be the MBB immediately after the current one, if any.
+ // This is used to avoid emitting unnecessary branches to the next block.
+ MachineBasicBlock *NextBlock = 0;
+ MachineFunction::iterator BBI = CurMBB;
+ if (++BBI != CurMBB->getParent()->end())
+ NextBlock = BBI;
+
+ if (NextMBB == NextBlock)
+ DAG.setRoot(BrAnd);
+ else
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrAnd,
+ DAG.getBasicBlock(NextMBB)));
+}
+
+void SelectionDAGLowering::visitInvoke(InvokeInst &I) {
+ // Retrieve successors.
+ MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)];
+ MachineBasicBlock *LandingPad = FuncInfo.MBBMap[I.getSuccessor(1)];
+
+ const Value *Callee(I.getCalledValue());
+ if (isa<InlineAsm>(Callee))
+ visitInlineAsm(&I);
+ else
+ LowerCallTo(&I, getValue(Callee), false, LandingPad);
+
+ // If the value of the invoke is used outside of its defining block, make it
+ // available as a virtual register.
+ CopyToExportRegsIfNeeded(&I);
+
+ // Update successor info
+ CurMBB->addSuccessor(Return);
+ CurMBB->addSuccessor(LandingPad);
+
+ // Drop into normal successor.
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+ MVT::Other, getControlRoot(),
+ DAG.getBasicBlock(Return)));
+}
+
+void SelectionDAGLowering::visitUnwind(UnwindInst &I) {
+}
+
+/// handleSmallSwitchRange - Emit a series of specific tests (suitable for
+/// small case ranges).
+bool SelectionDAGLowering::handleSmallSwitchRange(CaseRec& CR,
+ CaseRecVector& WorkList,
+ Value* SV,
+ MachineBasicBlock* Default) {
+ Case& BackCase = *(CR.Range.second-1);
+
+ // Size is the number of Cases represented by this range.
+ size_t Size = CR.Range.second - CR.Range.first;
+ if (Size > 3)
+ return false;
+
+ // Get the MachineFunction which holds the current MBB. This is used when
+ // inserting any additional MBBs necessary to represent the switch.
+ MachineFunction *CurMF = CurMBB->getParent();
+
+ // Figure out which block is immediately after the current one.
+ MachineBasicBlock *NextBlock = 0;
+ MachineFunction::iterator BBI = CR.CaseBB;
+
+ if (++BBI != CurMBB->getParent()->end())
+ NextBlock = BBI;
+
+  // TODO: If any two of the cases have the same destination, and if one value
+ // is the same as the other, but has one bit unset that the other has set,
+ // use bit manipulation to do two compares at once. For example:
+ // "if (X == 6 || X == 4)" -> "if ((X|2) == 6)"
+
+ // Rearrange the case blocks so that the last one falls through if possible.
+ if (NextBlock && Default != NextBlock && BackCase.BB != NextBlock) {
+ // The last case block won't fall through into 'NextBlock' if we emit the
+ // branches in this order. See if rearranging a case value would help.
+ for (CaseItr I = CR.Range.first, E = CR.Range.second-1; I != E; ++I) {
+ if (I->BB == NextBlock) {
+ std::swap(*I, BackCase);
+ break;
+ }
+ }
+ }
+
+ // Create a CaseBlock record representing a conditional branch to
+ // the Case's target mbb if the value being switched on SV is equal
+ // to C.
+ MachineBasicBlock *CurBlock = CR.CaseBB;
+ for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
+ MachineBasicBlock *FallThrough;
+ if (I != E-1) {
+ FallThrough = CurMF->CreateMachineBasicBlock(CurBlock->getBasicBlock());
+ CurMF->insert(BBI, FallThrough);
+
+ // Put SV in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(SV);
+ } else {
+ // If the last case doesn't match, go to the default block.
+ FallThrough = Default;
+ }
+
+ Value *RHS, *LHS, *MHS;
+ ISD::CondCode CC;
+ if (I->High == I->Low) {
+      // This is just a small case range containing exactly one case.
+ CC = ISD::SETEQ;
+ LHS = SV; RHS = I->High; MHS = NULL;
+ } else {
+ CC = ISD::SETLE;
+ LHS = I->Low; MHS = SV; RHS = I->High;
+ }
+ CaseBlock CB(CC, LHS, RHS, MHS, I->BB, FallThrough, CurBlock);
+
+ // If emitting the first comparison, just call visitSwitchCase to emit the
+ // code into the current block. Otherwise, push the CaseBlock onto the
+ // vector to be later processed by SDISel, and insert the node's MBB
+ // before the next MBB.
+ if (CurBlock == CurMBB)
+ visitSwitchCase(CB);
+ else
+ SwitchCases.push_back(CB);
+
+ CurBlock = FallThrough;
+ }
+
+ return true;
+}
+
+static inline bool areJTsAllowed(const TargetLowering &TLI) {
+ return !DisableJumpTables &&
+ (TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+ TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
+}
+
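+/// ComputeRange - Return the number of values in [First, Last] as an APInt
+/// wide enough not to overflow. For example, with 4-bit First == -8 and
+/// Last == 7, both are sign-extended to 5 bits and the result is
+/// 7 - (-8) + 1 == 16.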
+static APInt ComputeRange(const APInt &First, const APInt &Last) {
+ APInt LastExt(Last), FirstExt(First);
+ uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1;
+ LastExt.sext(BitWidth); FirstExt.sext(BitWidth);
+ return (LastExt - FirstExt + 1ULL);
+}
+
+/// handleJTSwitchCase - Emit a jump table for the current switch case range
+bool SelectionDAGLowering::handleJTSwitchCase(CaseRec& CR,
+ CaseRecVector& WorkList,
+ Value* SV,
+ MachineBasicBlock* Default) {
+ Case& FrontCase = *CR.Range.first;
+ Case& BackCase = *(CR.Range.second-1);
+
+ const APInt& First = cast<ConstantInt>(FrontCase.Low)->getValue();
+ const APInt& Last = cast<ConstantInt>(BackCase.High)->getValue();
+
+ size_t TSize = 0;
+ for (CaseItr I = CR.Range.first, E = CR.Range.second;
+ I!=E; ++I)
+ TSize += I->size();
+
+ if (!areJTsAllowed(TLI) || TSize <= 3)
+ return false;
+
+ APInt Range = ComputeRange(First, Last);
+ double Density = (double)TSize / Range.roundToDouble();
+ if (Density < 0.4)
+ return false;
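+  // For example, four single cases {0, 1, 2, 100} give TSize == 4 over a
+  // range of 101 values (density ~0.04, rejected), while the clustered
+  // range [0, 4] gives TSize == 5 over 5 values (density 1.0, accepted).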
+
+  DEBUG(errs() << "Lowering jump table\n"
+               << "First entry: " << First << ". Last entry: " << Last << '\n'
+               << "Range: " << Range << ". "
+               << "Size: " << TSize << ". Density: " << Density << "\n\n");
+
+ // Get the MachineFunction which holds the current MBB. This is used when
+ // inserting any additional MBBs necessary to represent the switch.
+ MachineFunction *CurMF = CurMBB->getParent();
+
+ // Figure out which block is immediately after the current one.
+ MachineBasicBlock *NextBlock = 0;
+ MachineFunction::iterator BBI = CR.CaseBB;
+
+ if (++BBI != CurMBB->getParent()->end())
+ NextBlock = BBI;
+
+ const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+ // Create a new basic block to hold the code for loading the address
+ // of the jump table, and jumping to it. Update successor information;
+ // we will either branch to the default case for the switch, or the jump
+ // table.
+ MachineBasicBlock *JumpTableBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+ CurMF->insert(BBI, JumpTableBB);
+ CR.CaseBB->addSuccessor(Default);
+ CR.CaseBB->addSuccessor(JumpTableBB);
+
+ // Build a vector of destination BBs, corresponding to each target
+ // of the jump table. If the value of the jump table slot corresponds to
+ // a case statement, push the case's BB onto the vector, otherwise, push
+ // the default BB.
+ std::vector<MachineBasicBlock*> DestBBs;
+ APInt TEI = First;
+ for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++TEI) {
+ const APInt& Low = cast<ConstantInt>(I->Low)->getValue();
+ const APInt& High = cast<ConstantInt>(I->High)->getValue();
+
+ if (Low.sle(TEI) && TEI.sle(High)) {
+ DestBBs.push_back(I->BB);
+ if (TEI==High)
+ ++I;
+ } else {
+ DestBBs.push_back(Default);
+ }
+ }
+
+ // Update successor info. Add one edge to each unique successor.
+ BitVector SuccsHandled(CR.CaseBB->getParent()->getNumBlockIDs());
+ for (std::vector<MachineBasicBlock*>::iterator I = DestBBs.begin(),
+ E = DestBBs.end(); I != E; ++I) {
+ if (!SuccsHandled[(*I)->getNumber()]) {
+ SuccsHandled[(*I)->getNumber()] = true;
+ JumpTableBB->addSuccessor(*I);
+ }
+ }
+
+ // Create a jump table index for this jump table, or return an existing
+ // one.
+ unsigned JTI = CurMF->getJumpTableInfo()->getJumpTableIndex(DestBBs);
+
+ // Set the jump table information so that we can codegen it as a second
+ // MachineBasicBlock
+ JumpTable JT(-1U, JTI, JumpTableBB, Default);
+ JumpTableHeader JTH(First, Last, SV, CR.CaseBB, (CR.CaseBB == CurMBB));
+ if (CR.CaseBB == CurMBB)
+ visitJumpTableHeader(JT, JTH);
+
+ JTCases.push_back(JumpTableBlock(JTH, JT));
+
+ return true;
+}
+
+/// handleBTSplitSwitchCase - Emit a comparison and split the binary search
+/// tree into two subtrees.
+bool SelectionDAGLowering::handleBTSplitSwitchCase(CaseRec& CR,
+ CaseRecVector& WorkList,
+ Value* SV,
+ MachineBasicBlock* Default) {
+ // Get the MachineFunction which holds the current MBB. This is used when
+ // inserting any additional MBBs necessary to represent the switch.
+ MachineFunction *CurMF = CurMBB->getParent();
+
+ // Figure out which block is immediately after the current one.
+ MachineBasicBlock *NextBlock = 0;
+ MachineFunction::iterator BBI = CR.CaseBB;
+
+ if (++BBI != CurMBB->getParent()->end())
+ NextBlock = BBI;
+
+ Case& FrontCase = *CR.Range.first;
+ Case& BackCase = *(CR.Range.second-1);
+ const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+ // Size is the number of Cases represented by this range.
+ unsigned Size = CR.Range.second - CR.Range.first;
+
+ const APInt& First = cast<ConstantInt>(FrontCase.Low)->getValue();
+ const APInt& Last = cast<ConstantInt>(BackCase.High)->getValue();
+ double FMetric = 0;
+ CaseItr Pivot = CR.Range.first + Size/2;
+
+ // Select optimal pivot, maximizing sum density of LHS and RHS. This will
+  // (heuristically) allow us to emit JumpTables later.
+ size_t TSize = 0;
+ for (CaseItr I = CR.Range.first, E = CR.Range.second;
+ I!=E; ++I)
+ TSize += I->size();
+
+ size_t LSize = FrontCase.size();
+ size_t RSize = TSize-LSize;
+ DEBUG(errs() << "Selecting best pivot: \n"
+ << "First: " << First << ", Last: " << Last <<'\n'
+ << "LSize: " << LSize << ", RSize: " << RSize << '\n');
+ for (CaseItr I = CR.Range.first, J=I+1, E = CR.Range.second;
+ J!=E; ++I, ++J) {
+ const APInt& LEnd = cast<ConstantInt>(I->High)->getValue();
+ const APInt& RBegin = cast<ConstantInt>(J->Low)->getValue();
+ APInt Range = ComputeRange(LEnd, RBegin);
+ assert((Range - 2ULL).isNonNegative() &&
+ "Invalid case distance");
+ double LDensity = (double)LSize / (LEnd - First + 1ULL).roundToDouble();
+ double RDensity = (double)RSize / (Last - RBegin + 1ULL).roundToDouble();
+ double Metric = Range.logBase2()*(LDensity+RDensity);
+ // Should always split in some non-trivial place
+ DEBUG(errs() <<"=>Step\n"
+ << "LEnd: " << LEnd << ", RBegin: " << RBegin << '\n'
+ << "LDensity: " << LDensity
+ << ", RDensity: " << RDensity << '\n'
+ << "Metric: " << Metric << '\n');
+ if (FMetric < Metric) {
+ Pivot = J;
+ FMetric = Metric;
+ DEBUG(errs() << "Current metric set to: " << FMetric << '\n');
+ }
+
+ LSize += J->size();
+ RSize -= J->size();
+ }
+ if (areJTsAllowed(TLI)) {
+ // If our case is dense we *really* should handle it earlier!
+ assert((FMetric > 0) && "Should handle dense range earlier!");
+ } else {
+ Pivot = CR.Range.first + Size/2;
+ }
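+  // For example, given the single cases {0, 2, 4, 100, 102, 104}, the metric
+  // peaks at the split between 4 and 100: both halves stay fairly dense and
+  // Range.logBase2() rewards splitting across the large hole in the middle.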
+
+ CaseRange LHSR(CR.Range.first, Pivot);
+ CaseRange RHSR(Pivot, CR.Range.second);
+ Constant *C = Pivot->Low;
+ MachineBasicBlock *FalseBB = 0, *TrueBB = 0;
+
+ // We know that we branch to the LHS if the Value being switched on is
+ // less than the Pivot value, C. We use this to optimize our binary
+ // tree a bit, by recognizing that if SV is greater than or equal to the
+ // LHS's Case Value, and that Case Value is exactly one less than the
+ // Pivot's Value, then we can branch directly to the LHS's Target,
+ // rather than creating a leaf node for it.
+ if ((LHSR.second - LHSR.first) == 1 &&
+ LHSR.first->High == CR.GE &&
+ cast<ConstantInt>(C)->getValue() ==
+ (cast<ConstantInt>(CR.GE)->getValue() + 1LL)) {
+ TrueBB = LHSR.first->BB;
+ } else {
+ TrueBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+ CurMF->insert(BBI, TrueBB);
+ WorkList.push_back(CaseRec(TrueBB, C, CR.GE, LHSR));
+
+ // Put SV in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(SV);
+ }
+
+ // Similar to the optimization above, if the Value being switched on is
+ // known to be less than the Constant CR.LT, and the current Case Value
+ // is CR.LT - 1, then we can branch directly to the target block for
+ // the current Case Value, rather than emitting a RHS leaf node for it.
+ if ((RHSR.second - RHSR.first) == 1 && CR.LT &&
+ cast<ConstantInt>(RHSR.first->Low)->getValue() ==
+ (cast<ConstantInt>(CR.LT)->getValue() - 1LL)) {
+ FalseBB = RHSR.first->BB;
+ } else {
+ FalseBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+ CurMF->insert(BBI, FalseBB);
+ WorkList.push_back(CaseRec(FalseBB,CR.LT,C,RHSR));
+
+ // Put SV in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(SV);
+ }
+
+ // Create a CaseBlock record representing a conditional branch to
+ // the LHS node if the value being switched on SV is less than C.
+  // Otherwise, branch to the RHS node (FalseBB).
+ CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+
+ if (CR.CaseBB == CurMBB)
+ visitSwitchCase(CB);
+ else
+ SwitchCases.push_back(CB);
+
+ return true;
+}
+
+/// handleBitTestsSwitchCase - If the current case range has few destinations
+/// and spans less than the machine word bitwidth, encode the case range into
+/// a series of masks and emit bit tests with these masks.
+bool SelectionDAGLowering::handleBitTestsSwitchCase(CaseRec& CR,
+ CaseRecVector& WorkList,
+ Value* SV,
+ MachineBasicBlock* Default){
+ unsigned IntPtrBits = TLI.getPointerTy().getSizeInBits();
+
+ Case& FrontCase = *CR.Range.first;
+ Case& BackCase = *(CR.Range.second-1);
+
+ // Get the MachineFunction which holds the current MBB. This is used when
+ // inserting any additional MBBs necessary to represent the switch.
+ MachineFunction *CurMF = CurMBB->getParent();
+
+ // If target does not have legal shift left, do not emit bit tests at all.
+ if (!TLI.isOperationLegal(ISD::SHL, TLI.getPointerTy()))
+ return false;
+
+ size_t numCmps = 0;
+ for (CaseItr I = CR.Range.first, E = CR.Range.second;
+ I!=E; ++I) {
+    // A single case counts as one comparison, a case range as two.
+ numCmps += (I->Low == I->High ? 1 : 2);
+ }
+
+ // Count unique destinations
+ SmallSet<MachineBasicBlock*, 4> Dests;
+ for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) {
+ Dests.insert(I->BB);
+ if (Dests.size() > 3)
+      // Don't bother with the code below if there are too many unique
+      // destinations.
+ return false;
+ }
+ DEBUG(errs() << "Total number of unique destinations: " << Dests.size() << '\n'
+ << "Total number of comparisons: " << numCmps << '\n');
+
+ // Compute span of values.
+ const APInt& minValue = cast<ConstantInt>(FrontCase.Low)->getValue();
+ const APInt& maxValue = cast<ConstantInt>(BackCase.High)->getValue();
+ APInt cmpRange = maxValue - minValue;
+
+ DEBUG(errs() << "Compare range: " << cmpRange << '\n'
+ << "Low bound: " << minValue << '\n'
+ << "High bound: " << maxValue << '\n');
+
+ if (cmpRange.uge(APInt(cmpRange.getBitWidth(), IntPtrBits)) ||
+ (!(Dests.size() == 1 && numCmps >= 3) &&
+ !(Dests.size() == 2 && numCmps >= 5) &&
+ !(Dests.size() >= 3 && numCmps >= 6)))
+ return false;
+
+ DEBUG(errs() << "Emitting bit tests\n");
+ APInt lowBound = APInt::getNullValue(cmpRange.getBitWidth());
+
+  // Optimize the case where all the case values fit in a word without
+  // having to subtract minValue: then we can skip emitting the subtraction.
+ if (minValue.isNonNegative() &&
+ maxValue.slt(APInt(maxValue.getBitWidth(), IntPtrBits))) {
+ cmpRange = maxValue;
+ } else {
+ lowBound = minValue;
+ }
+
+ CaseBitsVector CasesBits;
+ unsigned i, count = 0;
+
+ for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) {
+ MachineBasicBlock* Dest = I->BB;
+ for (i = 0; i < count; ++i)
+ if (Dest == CasesBits[i].BB)
+ break;
+
+ if (i == count) {
+      assert((count < 3) && "Too many destinations to test!");
+ CasesBits.push_back(CaseBits(0, Dest, 0));
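+      // For example, an i8 return value marked signext is widened here to
+      // the 32-bit register type before being split into parts, matching
+      // what C callers expect.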
+ count++;
+ }
+
+ const APInt& lowValue = cast<ConstantInt>(I->Low)->getValue();
+ const APInt& highValue = cast<ConstantInt>(I->High)->getValue();
+
+ uint64_t lo = (lowValue - lowBound).getZExtValue();
+ uint64_t hi = (highValue - lowBound).getZExtValue();
+
+ for (uint64_t j = lo; j <= hi; j++) {
+ CasesBits[i].Mask |= 1ULL << j;
+ CasesBits[i].Bits++;
+ }
+
+ }
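+  // For example, with lowBound == 0 a case range [1, 3] sets mask bits 1
+  // through 3 for its destination, i.e. Mask |= 0xE and Bits becomes 3.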
+ std::sort(CasesBits.begin(), CasesBits.end(), CaseBitsCmp());
+
+ BitTestInfo BTC;
+
+ // Figure out which block is immediately after the current one.
+ MachineFunction::iterator BBI = CR.CaseBB;
+ ++BBI;
+
+ const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+ DEBUG(errs() << "Cases:\n");
+ for (unsigned i = 0, e = CasesBits.size(); i!=e; ++i) {
+ DEBUG(errs() << "Mask: " << CasesBits[i].Mask
+ << ", Bits: " << CasesBits[i].Bits
+ << ", BB: " << CasesBits[i].BB << '\n');
+
+ MachineBasicBlock *CaseBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+ CurMF->insert(BBI, CaseBB);
+ BTC.push_back(BitTestCase(CasesBits[i].Mask,
+ CaseBB,
+ CasesBits[i].BB));
+
+ // Put SV in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(SV);
+ }
+
+ BitTestBlock BTB(lowBound, cmpRange, SV,
+ -1U, (CR.CaseBB == CurMBB),
+ CR.CaseBB, Default, BTC);
+
+ if (CR.CaseBB == CurMBB)
+ visitBitTestHeader(BTB);
+
+ BitTestCases.push_back(BTB);
+
+ return true;
+}
+
+
+/// Clusterify - Transform a simple list of Cases into a list of CaseRanges
+size_t SelectionDAGLowering::Clusterify(CaseVector& Cases,
+ const SwitchInst& SI) {
+ size_t numCmps = 0;
+
+ // Start with "simple" cases
+ for (size_t i = 1; i < SI.getNumSuccessors(); ++i) {
+ MachineBasicBlock *SMBB = FuncInfo.MBBMap[SI.getSuccessor(i)];
+ Cases.push_back(Case(SI.getSuccessorValue(i),
+ SI.getSuccessorValue(i),
+ SMBB));
+ }
+ std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge cases into clusters
+ if (Cases.size() >= 2)
+ // Must recompute end() each iteration because it may be
+ // invalidated by erase if we hold on to it
+ for (CaseItr I = Cases.begin(), J = ++(Cases.begin()); J != Cases.end(); ) {
+ const APInt& nextValue = cast<ConstantInt>(J->Low)->getValue();
+ const APInt& currentValue = cast<ConstantInt>(I->High)->getValue();
+ MachineBasicBlock* nextBB = J->BB;
+ MachineBasicBlock* currentBB = I->BB;
+
+ // If the two neighboring cases go to the same destination, merge them
+ // into a single case.
+ if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
+ I->High = J->High;
+ J = Cases.erase(J);
+ } else {
+ I = J++;
+ }
+ }
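+  // For example, the neighboring cases 1 -> BB1 and 2 -> BB1 are merged into
+  // the single cluster [1, 2] -> BB1, which can later be lowered as a range.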
+
+ for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+ if (I->Low != I->High)
+ // A range counts double, since it requires two compares.
+ ++numCmps;
+ }
+
+ return numCmps;
+}
+
+void SelectionDAGLowering::visitSwitch(SwitchInst &SI) {
+ // Figure out which block is immediately after the current one.
+ MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = CurMBB;
+  if (++BBI != CurMBB->getParent()->end())
+    NextBlock = BBI;
+
+ MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()];
+
+ // If there is only the default destination, branch to it if it is not the
+ // next basic block. Otherwise, just fall through.
+ if (SI.getNumOperands() == 2) {
+ // Update machine-CFG edges.
+
+ // If this is not a fall-through branch, emit the branch.
+ CurMBB->addSuccessor(Default);
+ if (Default != NextBlock)
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+ MVT::Other, getControlRoot(),
+ DAG.getBasicBlock(Default)));
+ return;
+ }
+
+ // If there are any non-default case statements, create a vector of Cases
+ // representing each one, and sort the vector so that we can efficiently
+ // create a binary search tree from them.
+ CaseVector Cases;
+ size_t numCmps = Clusterify(Cases, SI);
+ DEBUG(errs() << "Clusterify finished. Total clusters: " << Cases.size()
+ << ". Total compares: " << numCmps << '\n');
+ numCmps = 0;
+
+ // Get the Value to be switched on and default basic blocks, which will be
+ // inserted into CaseBlock records, representing basic blocks in the binary
+ // search tree.
+ Value *SV = SI.getOperand(0);
+
+ // Push the initial CaseRec onto the worklist
+ CaseRecVector WorkList;
+ WorkList.push_back(CaseRec(CurMBB,0,0,CaseRange(Cases.begin(),Cases.end())));
+
+ while (!WorkList.empty()) {
+ // Grab a record representing a case range to process off the worklist
+ CaseRec CR = WorkList.back();
+ WorkList.pop_back();
+
+ if (handleBitTestsSwitchCase(CR, WorkList, SV, Default))
+ continue;
+
+    // If the range has few cases (three or fewer), emit a series of specific
+    // tests.
+ if (handleSmallSwitchRange(CR, WorkList, SV, Default))
+ continue;
+
+    // If the switch covers more than 3 case values, is at least 40% dense,
+    // and the target supports indirect branches, then emit a jump table
+    // rather than lowering the switch to a binary tree of conditional
+    // branches.
+ if (handleJTSwitchCase(CR, WorkList, SV, Default))
+ continue;
+
+ // Emit binary tree. We need to pick a pivot, and push left and right ranges
+  // onto the worklist. Leaves are handled via the handleSmallSwitchRange()
+  // call.
+ handleBTSplitSwitchCase(CR, WorkList, SV, Default);
+ }
+}
+
+
+void SelectionDAGLowering::visitSub(User &I) {
+ // -0.0 - X --> fneg
+ const Type *Ty = I.getType();
+ if (isa<VectorType>(Ty)) {
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(I.getOperand(0))) {
+ const VectorType *DestTy = cast<VectorType>(I.getType());
+ const Type *ElTy = DestTy->getElementType();
+ if (ElTy->isFloatingPoint()) {
+ unsigned VL = DestTy->getNumElements();
+ std::vector<Constant*> NZ(VL, ConstantFP::getNegativeZero(ElTy));
+ Constant *CNZ = ConstantVector::get(&NZ[0], NZ.size());
+ if (CV == CNZ) {
+ SDValue Op2 = getValue(I.getOperand(1));
+ setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(),
+ Op2.getValueType(), Op2));
+ return;
+ }
+ }
+ }
+ }
+ if (Ty->isFloatingPoint()) {
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(I.getOperand(0)))
+ if (CFP->isExactlyValue(ConstantFP::getNegativeZero(Ty)->getValueAPF())) {
+ SDValue Op2 = getValue(I.getOperand(1));
+ setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(),
+ Op2.getValueType(), Op2));
+ return;
+ }
+ }
+
+ visitBinary(I, Ty->isFPOrFPVector() ? ISD::FSUB : ISD::SUB);
+}
+
+void SelectionDAGLowering::visitBinary(User &I, unsigned OpCode) {
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+
+ setValue(&I, DAG.getNode(OpCode, getCurDebugLoc(),
+ Op1.getValueType(), Op1, Op2));
+}
+
+void SelectionDAGLowering::visitShift(User &I, unsigned Opcode) {
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ if (!isa<VectorType>(I.getType()) &&
+ Op2.getValueType() != TLI.getShiftAmountTy()) {
+ // If the operand is smaller than the shift count type, promote it.
+ if (TLI.getShiftAmountTy().bitsGT(Op2.getValueType()))
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, getCurDebugLoc(),
+ TLI.getShiftAmountTy(), Op2);
+ // If the operand is larger than the shift count type but the shift
+ // count type has enough bits to represent any shift value, truncate
+ // it now. This is a common case and it exposes the truncate to
+ // optimization early.
+ else if (TLI.getShiftAmountTy().getSizeInBits() >=
+ Log2_32_Ceil(Op2.getValueType().getSizeInBits()))
+ Op2 = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+ TLI.getShiftAmountTy(), Op2);
+ // Otherwise we'll need to temporarily settle for some other
+ // convenient type; type legalization will make adjustments as
+ // needed.
+ else if (TLI.getPointerTy().bitsLT(Op2.getValueType()))
+ Op2 = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+ TLI.getPointerTy(), Op2);
+ else if (TLI.getPointerTy().bitsGT(Op2.getValueType()))
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, getCurDebugLoc(),
+ TLI.getPointerTy(), Op2);
+ }
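+  // For example, shifting an i64 by an i64 amount on a target whose shift
+  // amount type is i8: legal shift amounts 0..63 need only 6 bits, and
+  // Log2_32_Ceil(64) == 6 <= 8, so the amount is safely truncated to i8.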
+
+ setValue(&I, DAG.getNode(Opcode, getCurDebugLoc(),
+ Op1.getValueType(), Op1, Op2));
+}
+
+void SelectionDAGLowering::visitICmp(User &I) {
+ ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(&I))
+ predicate = IC->getPredicate();
+ else if (ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
+ predicate = ICmpInst::Predicate(IC->getPredicate());
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ ISD::CondCode Opcode = getICmpCondCode(predicate);
+ setValue(&I, DAG.getSetCC(getCurDebugLoc(),MVT::i1, Op1, Op2, Opcode));
+}
+
+void SelectionDAGLowering::visitFCmp(User &I) {
+ FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
+ if (FCmpInst *FC = dyn_cast<FCmpInst>(&I))
+ predicate = FC->getPredicate();
+ else if (ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
+ predicate = FCmpInst::Predicate(FC->getPredicate());
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ ISD::CondCode Condition = getFCmpCondCode(predicate);
+ setValue(&I, DAG.getSetCC(getCurDebugLoc(), MVT::i1, Op1, Op2, Condition));
+}
+
+void SelectionDAGLowering::visitVICmp(User &I) {
+ ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
+ if (VICmpInst *IC = dyn_cast<VICmpInst>(&I))
+ predicate = IC->getPredicate();
+ else if (ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
+ predicate = ICmpInst::Predicate(IC->getPredicate());
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ ISD::CondCode Opcode = getICmpCondCode(predicate);
+ setValue(&I, DAG.getVSetCC(getCurDebugLoc(), Op1.getValueType(),
+ Op1, Op2, Opcode));
+}
+
+void SelectionDAGLowering::visitVFCmp(User &I) {
+ FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
+ if (VFCmpInst *FC = dyn_cast<VFCmpInst>(&I))
+ predicate = FC->getPredicate();
+ else if (ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
+ predicate = FCmpInst::Predicate(FC->getPredicate());
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ ISD::CondCode Condition = getFCmpCondCode(predicate);
+ MVT DestVT = TLI.getValueType(I.getType());
+
+ setValue(&I, DAG.getVSetCC(getCurDebugLoc(), DestVT, Op1, Op2, Condition));
+}
+
+void SelectionDAGLowering::visitSelect(User &I) {
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, I.getType(), ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues != 0) {
+ SmallVector<SDValue, 4> Values(NumValues);
+ SDValue Cond = getValue(I.getOperand(0));
+ SDValue TrueVal = getValue(I.getOperand(1));
+ SDValue FalseVal = getValue(I.getOperand(2));
+
+    for (unsigned i = 0; i != NumValues; ++i)
+      Values[i] = DAG.getNode(ISD::SELECT, getCurDebugLoc(),
+                       TrueVal.getNode()->getValueType(TrueVal.getResNo() + i),
+                       Cond,
+                       SDValue(TrueVal.getNode(), TrueVal.getResNo() + i),
+                       SDValue(FalseVal.getNode(), FalseVal.getResNo() + i));
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+ DAG.getVTList(&ValueVTs[0], NumValues),
+ &Values[0], NumValues));
+ }
+}
+
+
+void SelectionDAGLowering::visitTrunc(User &I) {
+ // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGLowering::visitZExt(User &I) {
+ // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+  // ZExt also can't be a cast to bool for the same reason, so there is
+  // nothing more to check.
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGLowering::visitSExt(User &I) {
+ // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+  // SExt also can't be a cast to bool for the same reason, so there is
+  // nothing more to check.
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGLowering::visitFPTrunc(User &I) {
+ // FPTrunc is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurDebugLoc(),
+ DestVT, N, DAG.getIntPtrConstant(0)));
+}
+
+void SelectionDAGLowering::visitFPExt(User &I){
+  // FPExt is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGLowering::visitFPToUI(User &I) {
+ // FPToUI is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGLowering::visitFPToSI(User &I) {
+ // FPToSI is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGLowering::visitUIToFP(User &I) {
+ // UIToFP is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGLowering::visitSIToFP(User &I){
+ // SIToFP is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+ setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGLowering::visitPtrToInt(User &I) {
+ // What to do depends on the size of the integer and the size of the pointer.
+ // We can either truncate, zero extend, or no-op, accordingly.
+ SDValue N = getValue(I.getOperand(0));
+ MVT SrcVT = N.getValueType();
+ MVT DestVT = TLI.getValueType(I.getType());
+ SDValue Result;
+ if (DestVT.bitsLT(SrcVT))
+ Result = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), DestVT, N);
+ else
+ // Note: ZERO_EXTEND can handle cases where the sizes are equal too
+ Result = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(), DestVT, N);
+ setValue(&I, Result);
+}
+
+void SelectionDAGLowering::visitIntToPtr(User &I) {
+ // What to do depends on the size of the integer and the size of the pointer.
+ // We can either truncate, zero extend, or no-op, accordingly.
+ SDValue N = getValue(I.getOperand(0));
+ MVT SrcVT = N.getValueType();
+ MVT DestVT = TLI.getValueType(I.getType());
+ if (DestVT.bitsLT(SrcVT))
+ setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), DestVT, N));
+ else
+ // Note: ZERO_EXTEND can handle cases where the sizes are equal too
+ setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+ DestVT, N));
+}
+
+void SelectionDAGLowering::visitBitCast(User &I) {
+ SDValue N = getValue(I.getOperand(0));
+ MVT DestVT = TLI.getValueType(I.getType());
+
+ // BitCast assures us that source and destination are the same size so this
+ // is either a BIT_CONVERT or a no-op.
+ if (DestVT != N.getValueType())
+ setValue(&I, DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(),
+ DestVT, N)); // convert types
+ else
+ setValue(&I, N); // noop cast.
+}
+
+void SelectionDAGLowering::visitInsertElement(User &I) {
+ SDValue InVec = getValue(I.getOperand(0));
+ SDValue InVal = getValue(I.getOperand(1));
+ SDValue InIdx = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+ TLI.getPointerTy(),
+ getValue(I.getOperand(2)));
+
+ setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurDebugLoc(),
+ TLI.getValueType(I.getType()),
+ InVec, InVal, InIdx));
+}
+
+void SelectionDAGLowering::visitExtractElement(User &I) {
+ SDValue InVec = getValue(I.getOperand(0));
+ SDValue InIdx = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+ TLI.getPointerTy(),
+ getValue(I.getOperand(1)));
+ setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
+ TLI.getValueType(I.getType()), InVec, InIdx));
+}
+
+
+// Utility for visitShuffleVector - Returns true if the mask is a sequential
+// mask starting from SIndx and increasing to the element length (undefs are
+// allowed).
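+// For example, Mask = <4, 5, -1, 7> is sequential from SIndx == 4, since the
+// undef (-1) entry is skipped.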
+static bool SequentialMask(SmallVectorImpl<int> &Mask, unsigned SIndx) {
+ unsigned MaskNumElts = Mask.size();
+ for (unsigned i = 0; i != MaskNumElts; ++i)
+ if ((Mask[i] >= 0) && (Mask[i] != (int)(i + SIndx)))
+ return false;
+ return true;
+}
+
+void SelectionDAGLowering::visitShuffleVector(User &I) {
+ SmallVector<int, 8> Mask;
+ SDValue Src1 = getValue(I.getOperand(0));
+ SDValue Src2 = getValue(I.getOperand(1));
+
+ // Convert the ConstantVector mask operand into an array of ints, with -1
+ // representing undef values.
+ SmallVector<Constant*, 8> MaskElts;
+ cast<Constant>(I.getOperand(2))->getVectorElements(MaskElts);
+ unsigned MaskNumElts = MaskElts.size();
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ if (isa<UndefValue>(MaskElts[i]))
+ Mask.push_back(-1);
+ else
+ Mask.push_back(cast<ConstantInt>(MaskElts[i])->getSExtValue());
+ }
+
+ MVT VT = TLI.getValueType(I.getType());
+ MVT SrcVT = Src1.getValueType();
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
+
+ if (SrcNumElts == MaskNumElts) {
+ setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+ &Mask[0]));
+ return;
+ }
+
+ // Normalize the shuffle vector since mask and vector length don't match.
+ if (SrcNumElts < MaskNumElts && MaskNumElts % SrcNumElts == 0) {
+    // The mask is longer than the source vectors, and its length is a
+    // multiple of the source vector length. We can use CONCAT_VECTORS to
+    // make the source vectors match the mask length.
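+    // For example, shuffling two <2 x i32> sources with the mask <0, 1, 2, 3>
+    // is exactly a concatenation of the two sources.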
+ if (SrcNumElts*2 == MaskNumElts && SequentialMask(Mask, 0)) {
+ // The shuffle is concatenating two vectors together.
+ setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurDebugLoc(),
+ VT, Src1, Src2));
+ return;
+ }
+
+ // Pad both vectors with undefs to make them the same length as the mask.
+ unsigned NumConcat = MaskNumElts / SrcNumElts;
+ bool Src1U = Src1.getOpcode() == ISD::UNDEF;
+ bool Src2U = Src2.getOpcode() == ISD::UNDEF;
+ SDValue UndefVal = DAG.getUNDEF(SrcVT);
+
+ SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
+ SmallVector<SDValue, 8> MOps2(NumConcat, UndefVal);
+ MOps1[0] = Src1;
+ MOps2[0] = Src2;
+
+ Src1 = Src1U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
+ getCurDebugLoc(), VT,
+ &MOps1[0], NumConcat);
+ Src2 = Src2U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
+ getCurDebugLoc(), VT,
+ &MOps2[0], NumConcat);
+
+ // Readjust mask for new input vector length.
+ SmallVector<int, 8> MappedOps;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ if (Idx < (int)SrcNumElts)
+ MappedOps.push_back(Idx);
+ else
+ MappedOps.push_back(Idx + MaskNumElts - SrcNumElts);
+ }
+ setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+ &MappedOps[0]));
+ return;
+ }
+
+ if (SrcNumElts > MaskNumElts) {
+ // Analyze the access pattern of the vector to see if we can extract
+ // two subvectors and do the shuffle. The analysis is done by calculating
+    // the range of elements the mask accesses in both vectors.
+ int MinRange[2] = { SrcNumElts+1, SrcNumElts+1};
+ int MaxRange[2] = {-1, -1};
+
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ int Input = 0;
+ if (Idx < 0)
+ continue;
+
+ if (Idx >= (int)SrcNumElts) {
+ Input = 1;
+ Idx -= SrcNumElts;
+ }
+ if (Idx > MaxRange[Input])
+ MaxRange[Input] = Idx;
+ if (Idx < MinRange[Input])
+ MinRange[Input] = Idx;
+ }
+
+    // Check if the accessed range is smaller than the vector size and whether
+    // we can find a reasonable extract index.
+    int RangeUse[2] = { 2, 2 }; // 0 = Unused, 1 = Extract, 2 = Cannot extract.
+ int StartIdx[2]; // StartIdx to extract from
+ for (int Input=0; Input < 2; ++Input) {
+ if (MinRange[Input] == (int)(SrcNumElts+1) && MaxRange[Input] == -1) {
+ RangeUse[Input] = 0; // Unused
+ StartIdx[Input] = 0;
+ } else if (MaxRange[Input] - MinRange[Input] < (int)MaskNumElts) {
+ // Fits within range but we should see if we can find a good
+ // start index that is a multiple of the mask length.
+ if (MaxRange[Input] < (int)MaskNumElts) {
+ RangeUse[Input] = 1; // Extract from beginning of the vector
+ StartIdx[Input] = 0;
+ } else {
+ StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
+ if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts &&
+ StartIdx[Input] + MaskNumElts < SrcNumElts)
+ RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+ }
+ }
+ }
+
+    if (RangeUse[0] == 0 && RangeUse[1] == 0) {
+ setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
+ return;
+ }
+ else if (RangeUse[0] < 2 && RangeUse[1] < 2) {
+ // Extract appropriate subvector and generate a vector shuffle
+ for (int Input=0; Input < 2; ++Input) {
+ SDValue& Src = Input == 0 ? Src1 : Src2;
+ if (RangeUse[Input] == 0) {
+ Src = DAG.getUNDEF(VT);
+ } else {
+ Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, getCurDebugLoc(), VT,
+ Src, DAG.getIntPtrConstant(StartIdx[Input]));
+ }
+ }
+ // Calculate new mask.
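+      // Rebase each index against the start of its extracted subvector;
+      // indices into Src2 also move into the second operand's range
+      // [MaskNumElts, 2*MaskNumElts).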
+ SmallVector<int, 8> MappedOps;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ if (Idx < 0)
+ MappedOps.push_back(Idx);
+ else if (Idx < (int)SrcNumElts)
+ MappedOps.push_back(Idx - StartIdx[0]);
+ else
+ MappedOps.push_back(Idx - SrcNumElts - StartIdx[1] + MaskNumElts);
+ }
+ setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+ &MappedOps[0]));
+ return;
+ }
+ }
+
+  // We can't use either concat vectors or extract subvectors, so fall back
+  // to replacing the shuffle with an extract-and-build-vector sequence.
+ MVT EltVT = VT.getVectorElementType();
+ MVT PtrVT = TLI.getPointerTy();
+ SmallVector<SDValue,8> Ops;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ if (Mask[i] < 0) {
+ Ops.push_back(DAG.getUNDEF(EltVT));
+ } else {
+ int Idx = Mask[i];
+ if (Idx < (int)SrcNumElts)
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
+ EltVT, Src1, DAG.getConstant(Idx, PtrVT)));
+ else
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
+ EltVT, Src2,
+ DAG.getConstant(Idx - SrcNumElts, PtrVT)));
+ }
+ }
+ setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+ VT, &Ops[0], Ops.size()));
+}
+
+void SelectionDAGLowering::visitInsertValue(InsertValueInst &I) {
+ const Value *Op0 = I.getOperand(0);
+ const Value *Op1 = I.getOperand(1);
+ const Type *AggTy = I.getType();
+ const Type *ValTy = Op1->getType();
+ bool IntoUndef = isa<UndefValue>(Op0);
+ bool FromUndef = isa<UndefValue>(Op1);
+
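+  // Compute the flattened position of the inserted value among the
+  // aggregate's scalar members.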
+ unsigned LinearIndex = ComputeLinearIndex(TLI, AggTy,
+ I.idx_begin(), I.idx_end());
+
+ SmallVector<MVT, 4> AggValueVTs;
+ ComputeValueVTs(TLI, AggTy, AggValueVTs);
+ SmallVector<MVT, 4> ValValueVTs;
+ ComputeValueVTs(TLI, ValTy, ValValueVTs);
+
+ unsigned NumAggValues = AggValueVTs.size();
+ unsigned NumValValues = ValValueVTs.size();
+ SmallVector<SDValue, 4> Values(NumAggValues);
+
+ SDValue Agg = getValue(Op0);
+ SDValue Val = getValue(Op1);
+ unsigned i = 0;
+ // Copy the beginning value(s) from the original aggregate.
+ for (; i != LinearIndex; ++i)
+ Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+ // Copy values from the inserted value(s).
+ for (; i != LinearIndex + NumValValues; ++i)
+ Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+ // Copy remaining value(s) from the original aggregate.
+ for (; i != NumAggValues; ++i)
+ Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+ DAG.getVTList(&AggValueVTs[0], NumAggValues),
+ &Values[0], NumAggValues));
+}
+
+void SelectionDAGLowering::visitExtractValue(ExtractValueInst &I) {
+ const Value *Op0 = I.getOperand(0);
+ const Type *AggTy = Op0->getType();
+ const Type *ValTy = I.getType();
+ bool OutOfUndef = isa<UndefValue>(Op0);
+
+ unsigned LinearIndex = ComputeLinearIndex(TLI, AggTy,
+ I.idx_begin(), I.idx_end());
+
+ SmallVector<MVT, 4> ValValueVTs;
+ ComputeValueVTs(TLI, ValTy, ValValueVTs);
+
+ unsigned NumValValues = ValValueVTs.size();
+ SmallVector<SDValue, 4> Values(NumValValues);
+
+ SDValue Agg = getValue(Op0);
+ // Copy out the selected value(s).
+ for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i)
+ Values[i - LinearIndex] =
+ OutOfUndef ?
+ DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+ DAG.getVTList(&ValValueVTs[0], NumValValues),
+ &Values[0], NumValValues));
+}
+
+
+void SelectionDAGLowering::visitGetElementPtr(User &I) {
+ SDValue N = getValue(I.getOperand(0));
+ const Type *Ty = I.getOperand(0)->getType();
+
+ for (GetElementPtrInst::op_iterator OI = I.op_begin()+1, E = I.op_end();
+ OI != E; ++OI) {
+ Value *Idx = *OI;
+ if (const StructType *StTy = dyn_cast<StructType>(Ty)) {
+ unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+ if (Field) {
+ // N = N + Offset
+ uint64_t Offset = TD->getStructLayout(StTy)->getElementOffset(Field);
+ N = DAG.getNode(ISD::ADD, getCurDebugLoc(), N.getValueType(), N,
+ DAG.getIntPtrConstant(Offset));
+ }
+ Ty = StTy->getElementType(Field);
+ } else {
+ Ty = cast<SequentialType>(Ty)->getElementType();
+
+ // If this is a constant subscript, handle it quickly.
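+        // e.g. a constant index of 3 into an array of i32 (4-byte alloc
+        // size on common targets) folds to a byte offset of 3 * 4 = 12.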
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
+ if (CI->getZExtValue() == 0) continue;
+ uint64_t Offs =
+ TD->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
+ SDValue OffsVal;
+ unsigned PtrBits = TLI.getPointerTy().getSizeInBits();
+ if (PtrBits < 64) {
+ OffsVal = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+ TLI.getPointerTy(),
+ DAG.getConstant(Offs, MVT::i64));
+ } else
+ OffsVal = DAG.getIntPtrConstant(Offs);
+ N = DAG.getNode(ISD::ADD, getCurDebugLoc(), N.getValueType(), N,
+ OffsVal);
+ continue;
+ }
+
+ // N = N + Idx * ElementSize;
+ uint64_t ElementSize = TD->getTypeAllocSize(Ty);
+ SDValue IdxN = getValue(Idx);
+
+ // If the index is smaller or larger than intptr_t, truncate or extend
+ // it.
+ if (IdxN.getValueType().bitsLT(N.getValueType()))
+ IdxN = DAG.getNode(ISD::SIGN_EXTEND, getCurDebugLoc(),
+ N.getValueType(), IdxN);
+ else if (IdxN.getValueType().bitsGT(N.getValueType()))
+ IdxN = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+ N.getValueType(), IdxN);
+
+ // If this is a multiply by a power of two, turn it into a shl
+ // immediately. This is a very common case.
+ if (ElementSize != 1) {
+ if (isPowerOf2_64(ElementSize)) {
+ unsigned Amt = Log2_64(ElementSize);
+ IdxN = DAG.getNode(ISD::SHL, getCurDebugLoc(),
+ N.getValueType(), IdxN,
+ DAG.getConstant(Amt, TLI.getPointerTy()));
+ } else {
+ SDValue Scale = DAG.getIntPtrConstant(ElementSize);
+ IdxN = DAG.getNode(ISD::MUL, getCurDebugLoc(),
+ N.getValueType(), IdxN, Scale);
+ }
+ }
+
+ N = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+ N.getValueType(), N, IdxN);
+ }
+ }
+ setValue(&I, N);
+}
+
+void SelectionDAGLowering::visitAlloca(AllocaInst &I) {
+  // If this is a fixed-size alloca in the entry block of the function,
+  // allocate it statically on the stack.
+ if (FuncInfo.StaticAllocaMap.count(&I))
+ return; // getValue will auto-populate this.
+
+ const Type *Ty = I.getAllocatedType();
+ uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
+ unsigned Align =
+ std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
+ I.getAlignment());
+
+ SDValue AllocSize = getValue(I.getArraySize());
+
+ AllocSize = DAG.getNode(ISD::MUL, getCurDebugLoc(), AllocSize.getValueType(),
+ AllocSize,
+ DAG.getConstant(TySize, AllocSize.getValueType()));
+
+ MVT IntPtr = TLI.getPointerTy();
+ if (IntPtr.bitsLT(AllocSize.getValueType()))
+ AllocSize = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+ IntPtr, AllocSize);
+ else if (IntPtr.bitsGT(AllocSize.getValueType()))
+ AllocSize = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+ IntPtr, AllocSize);
+
+  // Handle alignment.  If the requested alignment is less than or equal to
+  // the stack alignment, ignore it.  If it is greater, we note the requested
+  // alignment in the DYNAMIC_STACKALLOC node.
+ unsigned StackAlign =
+ TLI.getTargetMachine().getFrameInfo()->getStackAlignment();
+ if (Align <= StackAlign)
+ Align = 0;
+
+  // Round the size of the allocation up to the stack alignment size
+  // by adding StackAlign-1 to the size.
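+  // e.g. with StackAlign = 16, a size of 20 becomes (20 + 15) & ~15 = 32.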
+ AllocSize = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+ AllocSize.getValueType(), AllocSize,
+ DAG.getIntPtrConstant(StackAlign-1));
+ // Mask out the low bits for alignment purposes.
+ AllocSize = DAG.getNode(ISD::AND, getCurDebugLoc(),
+ AllocSize.getValueType(), AllocSize,
+ DAG.getIntPtrConstant(~(uint64_t)(StackAlign-1)));
+
+ SDValue Ops[] = { getRoot(), AllocSize, DAG.getIntPtrConstant(Align) };
+ SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
+ SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, getCurDebugLoc(),
+ VTs, Ops, 3);
+ setValue(&I, DSA);
+ DAG.setRoot(DSA.getValue(1));
+
+ // Inform the Frame Information that we have just allocated a variable-sized
+ // object.
+ CurMBB->getParent()->getFrameInfo()->CreateVariableSizedObject();
+}
+
+void SelectionDAGLowering::visitLoad(LoadInst &I) {
+ const Value *SV = I.getOperand(0);
+ SDValue Ptr = getValue(SV);
+
+ const Type *Ty = I.getType();
+ bool isVolatile = I.isVolatile();
+ unsigned Alignment = I.getAlignment();
+
+ SmallVector<MVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, Ty, ValueVTs, &Offsets);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ return;
+
+ SDValue Root;
+ bool ConstantMemory = false;
+ if (I.isVolatile())
+ // Serialize volatile loads with other side effects.
+ Root = getRoot();
+ else if (AA->pointsToConstantMemory(SV)) {
+ // Do not serialize (non-volatile) loads of constant memory with anything.
+ Root = DAG.getEntryNode();
+ ConstantMemory = true;
+ } else {
+ // Do not serialize non-volatile loads against each other.
+ Root = DAG.getRoot();
+ }
+
+ SmallVector<SDValue, 4> Values(NumValues);
+ SmallVector<SDValue, 4> Chains(NumValues);
+ MVT PtrVT = Ptr.getValueType();
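+  // Emit one load per member value at Ptr + Offsets[i]; the individual
+  // chains are merged with a TokenFactor below.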
+ for (unsigned i = 0; i != NumValues; ++i) {
+ SDValue L = DAG.getLoad(ValueVTs[i], getCurDebugLoc(), Root,
+ DAG.getNode(ISD::ADD, getCurDebugLoc(),
+ PtrVT, Ptr,
+ DAG.getConstant(Offsets[i], PtrVT)),
+ SV, Offsets[i],
+ isVolatile, Alignment);
+ Values[i] = L;
+ Chains[i] = L.getValue(1);
+ }
+
+ if (!ConstantMemory) {
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+ MVT::Other,
+ &Chains[0], NumValues);
+ if (isVolatile)
+ DAG.setRoot(Chain);
+ else
+ PendingLoads.push_back(Chain);
+ }
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+ DAG.getVTList(&ValueVTs[0], NumValues),
+ &Values[0], NumValues));
+}
+
+
+void SelectionDAGLowering::visitStore(StoreInst &I) {
+ Value *SrcV = I.getOperand(0);
+ Value *PtrV = I.getOperand(1);
+
+ SmallVector<MVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, SrcV->getType(), ValueVTs, &Offsets);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ return;
+
+  // Get the lowered operands.  Note that we do this after
+  // checking if NumValues is zero, because with zero values
+  // the operands won't have entries in the map.
+ SDValue Src = getValue(SrcV);
+ SDValue Ptr = getValue(PtrV);
+
+ SDValue Root = getRoot();
+ SmallVector<SDValue, 4> Chains(NumValues);
+ MVT PtrVT = Ptr.getValueType();
+ bool isVolatile = I.isVolatile();
+ unsigned Alignment = I.getAlignment();
+ for (unsigned i = 0; i != NumValues; ++i)
+ Chains[i] = DAG.getStore(Root, getCurDebugLoc(),
+ SDValue(Src.getNode(), Src.getResNo() + i),
+ DAG.getNode(ISD::ADD, getCurDebugLoc(),
+ PtrVT, Ptr,
+ DAG.getConstant(Offsets[i], PtrVT)),
+ PtrV, Offsets[i],
+ isVolatile, Alignment);
+
+ DAG.setRoot(DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+ MVT::Other, &Chains[0], NumValues));
+}
+
+/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
+/// node.
+void SelectionDAGLowering::visitTargetIntrinsic(CallInst &I,
+ unsigned Intrinsic) {
+ bool HasChain = !I.doesNotAccessMemory();
+ bool OnlyLoad = HasChain && I.onlyReadsMemory();
+
+ // Build the operand list.
+ SmallVector<SDValue, 8> Ops;
+ if (HasChain) { // If this intrinsic has side-effects, chainify it.
+ if (OnlyLoad) {
+ // We don't need to serialize loads against other loads.
+ Ops.push_back(DAG.getRoot());
+ } else {
+ Ops.push_back(getRoot());
+ }
+ }
+
+  // Info is set by getTgtMemIntrinsic.
+ TargetLowering::IntrinsicInfo Info;
+ bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic);
+
+ // Add the intrinsic ID as an integer operand if it's not a target intrinsic.
+ if (!IsTgtIntrinsic)
+ Ops.push_back(DAG.getConstant(Intrinsic, TLI.getPointerTy()));
+
+ // Add all operands of the call to the operand list.
+ for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i) {
+ SDValue Op = getValue(I.getOperand(i));
+ assert(TLI.isTypeLegal(Op.getValueType()) &&
+ "Intrinsic uses a non-legal type?");
+ Ops.push_back(Op);
+ }
+
+ std::vector<MVT> VTArray;
+ if (I.getType() != Type::VoidTy) {
+ MVT VT = TLI.getValueType(I.getType());
+ if (VT.isVector()) {
+ const VectorType *DestTy = cast<VectorType>(I.getType());
+ MVT EltVT = TLI.getValueType(DestTy->getElementType());
+
+ VT = MVT::getVectorVT(EltVT, DestTy->getNumElements());
+ assert(VT != MVT::Other && "Intrinsic uses a non-legal type?");
+ }
+
+ assert(TLI.isTypeLegal(VT) && "Intrinsic uses a non-legal type?");
+ VTArray.push_back(VT);
+ }
+ if (HasChain)
+ VTArray.push_back(MVT::Other);
+
+ SDVTList VTs = DAG.getVTList(&VTArray[0], VTArray.size());
+
+ // Create the node.
+ SDValue Result;
+ if (IsTgtIntrinsic) {
+    // This is a target intrinsic that touches memory.
+ Result = DAG.getMemIntrinsicNode(Info.opc, getCurDebugLoc(),
+ VTs, &Ops[0], Ops.size(),
+ Info.memVT, Info.ptrVal, Info.offset,
+ Info.align, Info.vol,
+ Info.readMem, Info.writeMem);
+ }
+ else if (!HasChain)
+ Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurDebugLoc(),
+ VTs, &Ops[0], Ops.size());
+ else if (I.getType() != Type::VoidTy)
+ Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurDebugLoc(),
+ VTs, &Ops[0], Ops.size());
+ else
+ Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurDebugLoc(),
+ VTs, &Ops[0], Ops.size());
+
+ if (HasChain) {
+ SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
+ if (OnlyLoad)
+ PendingLoads.push_back(Chain);
+ else
+ DAG.setRoot(Chain);
+ }
+ if (I.getType() != Type::VoidTy) {
+ if (const VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
+ MVT VT = TLI.getValueType(PTy);
+ Result = DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(), VT, Result);
+ }
+ setValue(&I, Result);
+ }
+}
+
+/// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V.
+static GlobalVariable *ExtractTypeInfo(Value *V) {
+ V = V->stripPointerCasts();
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(V);
+ assert ((GV || isa<ConstantPointerNull>(V)) &&
+ "TypeInfo must be a global variable or NULL");
+ return GV;
+}
+
+namespace llvm {
+
+/// AddCatchInfo - Extract the personality and type infos from an eh.selector
+/// call, and add them to the specified machine basic block.
+void AddCatchInfo(CallInst &I, MachineModuleInfo *MMI,
+ MachineBasicBlock *MBB) {
+ // Inform the MachineModuleInfo of the personality for this landing pad.
+ ConstantExpr *CE = cast<ConstantExpr>(I.getOperand(2));
+ assert(CE->getOpcode() == Instruction::BitCast &&
+ isa<Function>(CE->getOperand(0)) &&
+ "Personality should be a function");
+ MMI->addPersonality(MBB, cast<Function>(CE->getOperand(0)));
+
+ // Gather all the type infos for this landing pad and pass them along to
+ // MachineModuleInfo.
+ std::vector<GlobalVariable *> TyInfo;
+ unsigned N = I.getNumOperands();
+
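+  // Operands after the personality are typeinfos; a ConstantInt operand
+  // introduces a filter of that length, so scan backwards to delimit each
+  // filter and the trailing catch typeinfos.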
+ for (unsigned i = N - 1; i > 2; --i) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(i))) {
+ unsigned FilterLength = CI->getZExtValue();
+ unsigned FirstCatch = i + FilterLength + !FilterLength;
+ assert (FirstCatch <= N && "Invalid filter length");
+
+ if (FirstCatch < N) {
+ TyInfo.reserve(N - FirstCatch);
+ for (unsigned j = FirstCatch; j < N; ++j)
+ TyInfo.push_back(ExtractTypeInfo(I.getOperand(j)));
+ MMI->addCatchTypeInfo(MBB, TyInfo);
+ TyInfo.clear();
+ }
+
+ if (!FilterLength) {
+ // Cleanup.
+ MMI->addCleanup(MBB);
+ } else {
+ // Filter.
+ TyInfo.reserve(FilterLength - 1);
+ for (unsigned j = i + 1; j < FirstCatch; ++j)
+ TyInfo.push_back(ExtractTypeInfo(I.getOperand(j)));
+ MMI->addFilterTypeInfo(MBB, TyInfo);
+ TyInfo.clear();
+ }
+
+ N = i;
+ }
+ }
+
+ if (N > 3) {
+ TyInfo.reserve(N - 3);
+ for (unsigned j = 3; j < N; ++j)
+ TyInfo.push_back(ExtractTypeInfo(I.getOperand(j)));
+ MMI->addCatchTypeInfo(MBB, TyInfo);
+ }
+}
+
+}
+
+/// GetSignificand - Get the significand and build it into a floating-point
+/// number with exponent of 1:
+///
+/// Op = (Op & 0x007fffff) | 0x3f800000;
+///
+/// where Op is the i32 bit pattern of the floating-point value.
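+///
+/// e.g. for Op = 8.0f (bits 0x41000000) the result is 0x3f800000 = 1.0f.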
+static SDValue
+GetSignificand(SelectionDAG &DAG, SDValue Op, DebugLoc dl) {
+ SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+ DAG.getConstant(0x007fffff, MVT::i32));
+ SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1,
+ DAG.getConstant(0x3f800000, MVT::i32));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, t2);
+}
+
+/// GetExponent - Get the exponent:
+///
+/// (float)(int)(((Op & 0x7f800000) >> 23) - 127);
+///
+/// where Op is the i32 bit pattern of the floating-point value.
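+///
+/// e.g. for Op = 8.0f (bits 0x41000000) this computes
+/// (float)(0x82 - 127) = 3.0f.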
+static SDValue
+GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI,
+ DebugLoc dl) {
+ SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+ DAG.getConstant(0x7f800000, MVT::i32));
+ SDValue t1 = DAG.getNode(ISD::SRL, dl, MVT::i32, t0,
+ DAG.getConstant(23, TLI.getPointerTy()));
+ SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1,
+ DAG.getConstant(127, MVT::i32));
+ return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2);
+}
+
+/// getF32Constant - Get a 32-bit floating-point constant from its raw
+/// IEEE-754 bit pattern.
+static SDValue
+getF32Constant(SelectionDAG &DAG, unsigned Flt) {
+ return DAG.getConstantFP(APFloat(APInt(32, Flt)), MVT::f32);
+}
+
+/// Inlined utility function to implement binary input atomic intrinsics for
+/// visitIntrinsicCall: I is the call instruction and Op is the associated
+/// NodeType for I.
+const char *
+SelectionDAGLowering::implVisitBinaryAtomic(CallInst& I, ISD::NodeType Op) {
+ SDValue Root = getRoot();
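+  // Operand 1 is the pointer and operand 2 the value; the atomic node
+  // produces the loaded result plus an output chain.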
+ SDValue L =
+ DAG.getAtomic(Op, getCurDebugLoc(),
+ getValue(I.getOperand(2)).getValueType().getSimpleVT(),
+ Root,
+ getValue(I.getOperand(1)),
+ getValue(I.getOperand(2)),
+ I.getOperand(1));
+ setValue(&I, L);
+ DAG.setRoot(L.getValue(1));
+ return 0;
+}
+
+// implVisitAluOverflow - Lower arithmetic overflow intrinsics.
+const char *
+SelectionDAGLowering::implVisitAluOverflow(CallInst &I, ISD::NodeType Op) {
+ SDValue Op1 = getValue(I.getOperand(1));
+ SDValue Op2 = getValue(I.getOperand(2));
+
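+  // The node produces the arithmetic result plus an i1 overflow flag as a
+  // second result value.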
+ SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
+ SDValue Result = DAG.getNode(Op, getCurDebugLoc(), VTs, Op1, Op2);
+
+ setValue(&I, Result);
+ return 0;
+}
+
+/// visitExp - Lower an exp intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGLowering::visitExp(CallInst &I) {
+ SDValue result;
+ DebugLoc dl = getCurDebugLoc();
+
+ if (getValue(I.getOperand(1)).getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op = getValue(I.getOperand(1));
+
+ // Put the exponent in the right bit position for later addition to the
+ // final result:
+ //
+ // #define LOG2OFe 1.4426950f
+ // IntegerPartOfX = ((int32_t)(X * LOG2OFe));
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op,
+ getF32Constant(DAG, 0x3fb8aa3b));
+ SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);
+
+ // FractionalPartOfX = (X * LOG2OFe) - (float)IntegerPartOfX;
+ SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+ SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);
+
+ // IntegerPartOfX <<= 23;
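+    // Shifting left by 23 places IntegerPartOfX in the exponent field of an
+    // IEEE-754 single, so adding it to the result bits multiplies the final
+    // value by 2^IntegerPartOfX.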
+ IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+ DAG.getConstant(23, TLI.getPointerTy()));
+
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.997535578f +
+ // (0.735607626f + 0.252464424f * x) * x;
+ //
+ // error 0.0144103317, which is 6 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3e814304));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f3c50c8));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f7f5e7e));
+ SDValue TwoToFracPartOfX = DAG.getNode(ISD::BIT_CONVERT, dl,MVT::i32, t5);
+
+ // Add the exponent into the result in integer domain.
+ SDValue t6 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+ TwoToFracPartOfX, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, t6);
+ } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999892986f +
+ // (0.696457318f +
+ // (0.224338339f + 0.792043434e-1f * x) * x) * x;
+ //
+ // 0.000107046256 error, which is 13 to 14 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3da235e3));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3e65b8f3));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f324b07));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3f7ff8fd));
+ SDValue TwoToFracPartOfX = DAG.getNode(ISD::BIT_CONVERT, dl,MVT::i32, t7);
+
+ // Add the exponent into the result in integer domain.
+ SDValue t8 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+ TwoToFracPartOfX, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, t8);
+ } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999999982f +
+ // (0.693148872f +
+ // (0.240227044f +
+ // (0.554906021e-1f +
+ // (0.961591928e-2f +
+ // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+ //
+ // error 2.47208000*10^(-7), which is better than 18 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3924b03e));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3ab24b87));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3c1d8c17));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3d634a1d));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x3e75fe14));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x3f317234));
+ SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+ SDValue t13 = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+ getF32Constant(DAG, 0x3f800000));
+ SDValue TwoToFracPartOfX = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::i32, t13);
+
+ // Add the exponent into the result in integer domain.
+ SDValue t14 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+ TwoToFracPartOfX, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, t14);
+ }
+ } else {
+ // No special expansion.
+ result = DAG.getNode(ISD::FEXP, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1)));
+ }
+
+ setValue(&I, result);
+}
+
+/// visitLog - Lower a log intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGLowering::visitLog(CallInst &I) {
+ SDValue result;
+ DebugLoc dl = getCurDebugLoc();
+
+ if (getValue(I.getOperand(1)).getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op = getValue(I.getOperand(1));
+ SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+
+ // Scale the exponent by log(2) [0.69314718f].
+ SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+ SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+ getF32Constant(DAG, 0x3f317218));
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
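+    // log(x) = Exponent*log(2) + log(Mantissa), with Mantissa in [1,2); the
+    // polynomials below approximate log(Mantissa) on that interval.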
+
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // LogofMantissa =
+ // -1.1609546f +
+ // (1.4034025f - 0.23903021f * x) * x;
+ //
+ // error 0.0034276066, which is better than 8 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbe74c456));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3fb3a2b1));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f949a29));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, LogOfMantissa);
+ } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // LogOfMantissa =
+ // -1.7417939f +
+ // (2.8212026f +
+ // (-1.4699568f +
+ // (0.44717955f - 0.56570851e-1f * x) * x) * x) * x;
+ //
+ // error 0.000061011436, which is 14 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbd67b6d6));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3ee4f4b8));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fbc278b));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40348e95));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3fdef31a));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, LogOfMantissa);
+ } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // LogOfMantissa =
+ // -2.1072184f +
+ // (4.2372794f +
+ // (-3.7029485f +
+ // (2.2781945f +
+ // (-0.87823314f +
+ // (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x;
+ //
+ // error 0.0000023660568, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbc91e5ac));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e4350aa));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f60d3e3));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x4011cdf0));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x406cfd1c));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x408797cb));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ SDValue LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x4006dcab));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, LogOfMantissa);
+ }
+ } else {
+ // No special expansion.
+ result = DAG.getNode(ISD::FLOG, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1)));
+ }
+
+ setValue(&I, result);
+}
+
+/// visitLog2 - Lower a log2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGLowering::visitLog2(CallInst &I) {
+ SDValue result;
+ DebugLoc dl = getCurDebugLoc();
+
+ if (getValue(I.getOperand(1)).getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op = getValue(I.getOperand(1));
+ SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+
+ // Get the exponent.
+ SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl);
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
+
+ // Different possible minimax approximations of significand in
+ // floating-point for various degrees of accuracy over [1,2].
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x;
+ //
+ // error 0.0049451742, which is more than 7 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbeb08fe0));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x40019463));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fd6633d));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, Log2ofMantissa);
+ } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // Log2ofMantissa =
+ // -2.51285454f +
+ // (4.07009056f +
+ // (-2.12067489f +
+ // (.645142248f - 0.816157886e-1f * x) * x) * x) * x;
+ //
+ // error 0.0000876136000, which is better than 13 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbda7262e));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3f25280b));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x4007b923));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40823e2f));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x4020d29c));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, Log2ofMantissa);
+ } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // Log2ofMantissa =
+ // -3.0400495f +
+ // (6.1129976f +
+ // (-5.3420409f +
+ // (3.2865683f +
+ // (-1.2669343f +
+ // (0.27515199f -
+ // 0.25691327e-1f * x) * x) * x) * x) * x) * x;
+ //
+ // error 0.0000018516, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbcd2769e));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e8ce0b9));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fa22ae7));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40525723));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x40aaf200));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x40c39dad));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ SDValue Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x4042902c));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, Log2ofMantissa);
+ }
+ } else {
+ // No special expansion.
+ result = DAG.getNode(ISD::FLOG2, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1)));
+ }
+
+ setValue(&I, result);
+}
+
+/// visitLog10 - Lower a log10 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGLowering::visitLog10(CallInst &I) {
+ SDValue result;
+ DebugLoc dl = getCurDebugLoc();
+
+ if (getValue(I.getOperand(1)).getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op = getValue(I.getOperand(1));
+ SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+
+ // Scale the exponent by log10(2) [0.30102999f].
+ SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+ SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+ getF32Constant(DAG, 0x3e9a209a));
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
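+    // log10(x) = Exponent*log10(2) + log10(Mantissa), with Mantissa in [1,2).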
+
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // Log10ofMantissa =
+ // -0.50419619f +
+ // (0.60948995f - 0.10380950f * x) * x;
+ //
+ // error 0.0014886165, which is 6 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbdd49a13));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3f1c0789));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f011300));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, Log10ofMantissa);
+ } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // Log10ofMantissa =
+ // -0.64831180f +
+ // (0.91751397f +
+ // (-0.31664806f + 0.47637168e-1f * x) * x) * x;
+ //
+ // error 0.00019228036, which is better than 12 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3d431f31));
+ SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3ea21fb2));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f6ae232));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f25f7c3));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, Log10ofMantissa);
+ } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // Log10ofMantissa =
+ // -0.84299375f +
+ // (1.5327582f +
+ // (-1.0688956f +
+ // (0.49102474f +
+ // (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x;
+ //
+ // error 0.0000037995730, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3c5d51ce));
+ SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e00685a));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3efb6798));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f88d192));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3fc4316c));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x3f57ce70));
+
+ result = DAG.getNode(ISD::FADD, dl,
+ MVT::f32, LogOfExponent, Log10ofMantissa);
+ }
+ } else {
+ // No special expansion.
+ result = DAG.getNode(ISD::FLOG10, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1)));
+ }
+
+ setValue(&I, result);
+}
+
+/// visitExp2 - Lower an exp2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+void
+SelectionDAGLowering::visitExp2(CallInst &I) {
+ SDValue result;
+ DebugLoc dl = getCurDebugLoc();
+
+ if (getValue(I.getOperand(1)).getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op = getValue(I.getOperand(1));
+
+ SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Op);
+
+ // FractionalPartOfX = x - (float)IntegerPartOfX;
+ SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+ SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, Op, t1);
+
+ // IntegerPartOfX <<= 23;
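+    // 2^x = 2^IntegerPartOfX * 2^FractionalPartOfX; the shifted integer part
+    // lands in the exponent field, and the polynomials below approximate
+    // 2^FractionalPartOfX over [0,1).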
+ IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+ DAG.getConstant(23, TLI.getPointerTy()));
+
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.997535578f +
+ // (0.735607626f + 0.252464424f * x) * x;
+ //
+ // error 0.0144103317, which is 6 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3e814304));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f3c50c8));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f7f5e7e));
+ SDValue t6 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t5);
+ SDValue TwoToFractionalPartOfX =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, t6, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::f32, TwoToFractionalPartOfX);
+ } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999892986f +
+ // (0.696457318f +
+ // (0.224338339f + 0.792043434e-1f * x) * x) * x;
+ //
+ // error 0.000107046256, which is 13 to 14 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3da235e3));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3e65b8f3));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f324b07));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3f7ff8fd));
+ SDValue t8 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t7);
+ SDValue TwoToFractionalPartOfX =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, t8, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::f32, TwoToFractionalPartOfX);
+ } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999999982f +
+ // (0.693148872f +
+ // (0.240227044f +
+ // (0.554906021e-1f +
+ // (0.961591928e-2f +
+ // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+ // error 2.47208000*10^(-7), which is better than 18 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3924b03e));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3ab24b87));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3c1d8c17));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3d634a1d));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x3e75fe14));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x3f317234));
+ SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+ SDValue t13 = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+ getF32Constant(DAG, 0x3f800000));
+ SDValue t14 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t13);
+ SDValue TwoToFractionalPartOfX =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, t14, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::f32, TwoToFractionalPartOfX);
+ }
+ } else {
+ // No special expansion.
+ result = DAG.getNode(ISD::FEXP2, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1)));
+ }
+
+ setValue(&I, result);
+}
+
+/// visitPow - Lower a pow intrinsic.  Handles the special sequences for
+/// limited-precision mode when the base is 10.0f.
+void
+SelectionDAGLowering::visitPow(CallInst &I) {
+ SDValue result;
+ Value *Val = I.getOperand(1);
+ DebugLoc dl = getCurDebugLoc();
+ bool IsExp10 = false;
+
+ if (getValue(Val).getValueType() == MVT::f32 &&
+ getValue(I.getOperand(2)).getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    if (Constant *C = dyn_cast<Constant>(Val)) {
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+ APFloat Ten(10.0f);
+ IsExp10 = CFP->getValueAPF().bitwiseIsEqual(Ten);
+ }
+ }
+ }
+
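+  // When the base is exactly 10.0f, pow(10, x) is lowered as
+  // 2^(x * log2(10)) below.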
+ if (IsExp10 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op = getValue(I.getOperand(2));
+
+ // Put the exponent in the right bit position for later addition to the
+ // final result:
+ //
+ // #define LOG2OF10 3.3219281f
+ // IntegerPartOfX = (int32_t)(x * LOG2OF10);
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op,
+ getF32Constant(DAG, 0x40549a78));
+ SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);
+
+ // FractionalPartOfX = x - (float)IntegerPartOfX;
+ SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+ SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);
+
+ // IntegerPartOfX <<= 23;
+ IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+ DAG.getConstant(23, TLI.getPointerTy()));
+
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // twoToFractionalPartOfX =
+ // 0.997535578f +
+ // (0.735607626f + 0.252464424f * x) * x;
+ //
+ // error 0.0144103317, which is 6 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3e814304));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f3c50c8));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f7f5e7e));
+ SDValue t6 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t5);
+ SDValue TwoToFractionalPartOfX =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, t6, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::f32, TwoToFractionalPartOfX);
+ } else if (LimitFloatPrecision > 6 && LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999892986f +
+ // (0.696457318f +
+ // (0.224338339f + 0.792043434e-1f * x) * x) * x;
+ //
+ // error 0.000107046256, which is 13 to 14 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3da235e3));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3e65b8f3));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f324b07));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3f7ff8fd));
+ SDValue t8 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t7);
+ SDValue TwoToFractionalPartOfX =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, t8, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::f32, TwoToFractionalPartOfX);
+ } else { // LimitFloatPrecision > 12 && LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999999982f +
+ // (0.693148872f +
+ // (0.240227044f +
+ // (0.554906021e-1f +
+ // (0.961591928e-2f +
+ // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+ // error 2.47208000*10^(-7), which is better than 18 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3924b03e));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3ab24b87));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3c1d8c17));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3d634a1d));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x3e75fe14));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x3f317234));
+ SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+ SDValue t13 = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+ getF32Constant(DAG, 0x3f800000));
+ SDValue t14 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, t13);
+ SDValue TwoToFractionalPartOfX =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, t14, IntegerPartOfX);
+
+ result = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::f32, TwoToFractionalPartOfX);
+ }
+ } else {
+ // No special expansion.
+ result = DAG.getNode(ISD::FPOW, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1)),
+ getValue(I.getOperand(2)));
+ }
+
+ setValue(&I, result);
+}
+
+/// visitIntrinsicCall - Lower the call to the specified intrinsic function.
+/// If we want to emit this as a call to a named external function, return
+/// the name; otherwise lower it and return null.
+const char *
+SelectionDAGLowering::visitIntrinsicCall(CallInst &I, unsigned Intrinsic) {
+ DebugLoc dl = getCurDebugLoc();
+ switch (Intrinsic) {
+ default:
+ // By default, turn this into a target intrinsic node.
+ visitTargetIntrinsic(I, Intrinsic);
+ return 0;
+ case Intrinsic::vastart: visitVAStart(I); return 0;
+ case Intrinsic::vaend: visitVAEnd(I); return 0;
+ case Intrinsic::vacopy: visitVACopy(I); return 0;
+ case Intrinsic::returnaddress:
+ setValue(&I, DAG.getNode(ISD::RETURNADDR, dl, TLI.getPointerTy(),
+ getValue(I.getOperand(1))));
+ return 0;
+ case Intrinsic::frameaddress:
+ setValue(&I, DAG.getNode(ISD::FRAMEADDR, dl, TLI.getPointerTy(),
+ getValue(I.getOperand(1))));
+ return 0;
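+  // Note: "_setjmp"+1 points at "setjmp"; the +! below drops the leading
+  // underscore on targets that do not use the underscore-prefixed name.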
+  case Intrinsic::setjmp:
+    return "_setjmp"+!TLI.usesUnderscoreSetJmp();
+  case Intrinsic::longjmp:
+    return "_longjmp"+!TLI.usesUnderscoreLongJmp();
+ case Intrinsic::memcpy: {
+ SDValue Op1 = getValue(I.getOperand(1));
+ SDValue Op2 = getValue(I.getOperand(2));
+ SDValue Op3 = getValue(I.getOperand(3));
+ unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue();
+ DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, false,
+ I.getOperand(1), 0, I.getOperand(2), 0));
+ return 0;
+ }
+ case Intrinsic::memset: {
+ SDValue Op1 = getValue(I.getOperand(1));
+ SDValue Op2 = getValue(I.getOperand(2));
+ SDValue Op3 = getValue(I.getOperand(3));
+ unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue();
+ DAG.setRoot(DAG.getMemset(getRoot(), dl, Op1, Op2, Op3, Align,
+ I.getOperand(1), 0));
+ return 0;
+ }
+ case Intrinsic::memmove: {
+ SDValue Op1 = getValue(I.getOperand(1));
+ SDValue Op2 = getValue(I.getOperand(2));
+ SDValue Op3 = getValue(I.getOperand(3));
+ unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue();
+
+ // If the source and destination are known to not be aliases, we can
+ // lower memmove as memcpy.
+ uint64_t Size = -1ULL;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op3))
+ Size = C->getZExtValue();
+ if (AA->alias(I.getOperand(1), Size, I.getOperand(2), Size) ==
+ AliasAnalysis::NoAlias) {
+ DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, false,
+ I.getOperand(1), 0, I.getOperand(2), 0));
+ return 0;
+ }
+
+ DAG.setRoot(DAG.getMemmove(getRoot(), dl, Op1, Op2, Op3, Align,
+ I.getOperand(1), 0, I.getOperand(2), 0));
+ return 0;
+ }
+ case Intrinsic::dbg_stoppoint: {
+ DbgStopPointInst &SPI = cast<DbgStopPointInst>(I);
+ if (DIDescriptor::ValidDebugInfo(SPI.getContext(), OptLevel)) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DICompileUnit CU(cast<GlobalVariable>(SPI.getContext()));
+ DebugLoc Loc = DebugLoc::get(MF.getOrCreateDebugLocID(CU.getGV(),
+ SPI.getLine(), SPI.getColumn()));
+ setCurDebugLoc(Loc);
+
+ if (OptLevel == CodeGenOpt::None)
+ DAG.setRoot(DAG.getDbgStopPoint(Loc, getRoot(),
+ SPI.getLine(),
+ SPI.getColumn(),
+ SPI.getContext()));
+ }
+ return 0;
+ }
+ case Intrinsic::dbg_region_start: {
+ DwarfWriter *DW = DAG.getDwarfWriter();
+ DbgRegionStartInst &RSI = cast<DbgRegionStartInst>(I);
+
+ if (DIDescriptor::ValidDebugInfo(RSI.getContext(), OptLevel) &&
+ DW && DW->ShouldEmitDwarfDebug()) {
+ unsigned LabelID =
+ DW->RecordRegionStart(cast<GlobalVariable>(RSI.getContext()));
+ DAG.setRoot(DAG.getLabel(ISD::DBG_LABEL, getCurDebugLoc(),
+ getRoot(), LabelID));
+ }
+
+ return 0;
+ }
+ case Intrinsic::dbg_region_end: {
+ DwarfWriter *DW = DAG.getDwarfWriter();
+ DbgRegionEndInst &REI = cast<DbgRegionEndInst>(I);
+
+ if (DIDescriptor::ValidDebugInfo(REI.getContext(), OptLevel) &&
+ DW && DW->ShouldEmitDwarfDebug()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DISubprogram Subprogram(cast<GlobalVariable>(REI.getContext()));
+
+ if (Subprogram.isNull() || Subprogram.describes(MF.getFunction())) {
+ unsigned LabelID =
+ DW->RecordRegionEnd(cast<GlobalVariable>(REI.getContext()));
+ DAG.setRoot(DAG.getLabel(ISD::DBG_LABEL, getCurDebugLoc(),
+ getRoot(), LabelID));
+ } else {
+        // This is the end of an inlined function.  Debugging information for
+        // inlined functions is not handled yet (only supported by FastISel).
+ if (OptLevel == CodeGenOpt::None) {
+ unsigned ID = DW->RecordInlinedFnEnd(Subprogram);
+ if (ID != 0)
+          // The returned ID is 0 if this is an unbalanced "end of inlined
+          // scope".  This could happen if the optimizer eats dbg intrinsics
+          // or the "beginning of inlined scope" is not recognized due to
+          // missing location info.  In such cases, ignore this region.end.
+ DAG.setRoot(DAG.getLabel(ISD::DBG_LABEL, getCurDebugLoc(),
+ getRoot(), ID));
+ }
+ }
+ }
+
+ return 0;
+ }
+ case Intrinsic::dbg_func_start: {
+ DwarfWriter *DW = DAG.getDwarfWriter();
+ DbgFuncStartInst &FSI = cast<DbgFuncStartInst>(I);
+ Value *SP = FSI.getSubprogram();
+ if (!DIDescriptor::ValidDebugInfo(SP, OptLevel))
+ return 0;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (OptLevel == CodeGenOpt::None) {
+ // llvm.dbg.func.start implicitly defines a dbg_stoppoint which is what
+ // (most?) gdb expects.
+ DebugLoc PrevLoc = CurDebugLoc;
+ DISubprogram Subprogram(cast<GlobalVariable>(SP));
+ DICompileUnit CompileUnit = Subprogram.getCompileUnit();
+
+ if (!Subprogram.describes(MF.getFunction())) {
+        // This is the beginning of an inlined function.
+
+ // If llvm.dbg.func.start is seen in a new block before any
+        // llvm.dbg.stoppoint intrinsic, then the location info is unknown.
+        // FIXME: Why is DebugLoc reset at the beginning of each block?
+ if (PrevLoc.isUnknown())
+ return 0;
+
+ // Record the source line.
+ unsigned Line = Subprogram.getLineNumber();
+ setCurDebugLoc(DebugLoc::get(
+ MF.getOrCreateDebugLocID(CompileUnit.getGV(), Line, 0)));
+
+ if (DW && DW->ShouldEmitDwarfDebug()) {
+ DebugLocTuple PrevLocTpl = MF.getDebugLocTuple(PrevLoc);
+ unsigned LabelID = DW->RecordInlinedFnStart(Subprogram,
+ DICompileUnit(PrevLocTpl.CompileUnit),
+ PrevLocTpl.Line,
+ PrevLocTpl.Col);
+ DAG.setRoot(DAG.getLabel(ISD::DBG_LABEL, getCurDebugLoc(),
+ getRoot(), LabelID));
+ }
+ } else {
+ // Record the source line.
+ unsigned Line = Subprogram.getLineNumber();
+ MF.setDefaultDebugLoc(DebugLoc::get(
+ MF.getOrCreateDebugLocID(CompileUnit.getGV(), Line, 0)));
+ if (DW && DW->ShouldEmitDwarfDebug()) {
+ // llvm.dbg.func_start also defines beginning of function scope.
+ DW->RecordRegionStart(cast<GlobalVariable>(FSI.getSubprogram()));
+ }
+ }
+ } else {
+ DISubprogram Subprogram(cast<GlobalVariable>(SP));
+
+ std::string SPName;
+ Subprogram.getLinkageName(SPName);
+ if (!SPName.empty()
+ && strcmp(SPName.c_str(), MF.getFunction()->getNameStart())) {
+        // This is the beginning of an inlined function.  Debugging
+        // information for inlined functions is not handled yet (only
+        // supported by FastISel).
+ return 0;
+ }
+
+ // llvm.dbg.func.start implicitly defines a dbg_stoppoint which is
+ // what (most?) gdb expects.
+ DICompileUnit CompileUnit = Subprogram.getCompileUnit();
+
+      // Record the source line but do not create a label for the normal
+      // function start; it will be emitted at asm emission time.  However,
+      // create a label if this is the beginning of an inlined function.
+ unsigned Line = Subprogram.getLineNumber();
+ setCurDebugLoc(DebugLoc::get(
+ MF.getOrCreateDebugLocID(CompileUnit.getGV(), Line, 0)));
+ // FIXME - Start new region because llvm.dbg.func_start also defines
+ // beginning of function scope.
+ }
+
+ return 0;
+ }
+ case Intrinsic::dbg_declare: {
+ if (OptLevel == CodeGenOpt::None) {
+ DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
+ Value *Variable = DI.getVariable();
+ if (DIDescriptor::ValidDebugInfo(Variable, OptLevel))
+ DAG.setRoot(DAG.getNode(ISD::DECLARE, dl, MVT::Other, getRoot(),
+ getValue(DI.getAddress()), getValue(Variable)));
+ } else {
+ // FIXME: Do something sensible here when we support debug declare.
+ }
+ return 0;
+ }
+ case Intrinsic::eh_exception: {
+ // Insert the EXCEPTIONADDR instruction.
+ assert(CurMBB->isLandingPad() &&"Call to eh.exception not in landing pad!");
+ SDVTList VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other);
+ SDValue Ops[1];
+ Ops[0] = DAG.getRoot();
+ SDValue Op = DAG.getNode(ISD::EXCEPTIONADDR, dl, VTs, Ops, 1);
+ setValue(&I, Op);
+ DAG.setRoot(Op.getValue(1));
+ return 0;
+ }
+
+ case Intrinsic::eh_selector_i32:
+ case Intrinsic::eh_selector_i64: {
+ MachineModuleInfo *MMI = DAG.getMachineModuleInfo();
+ MVT VT = (Intrinsic == Intrinsic::eh_selector_i32 ?
+ MVT::i32 : MVT::i64);
+
+ if (MMI) {
+ if (CurMBB->isLandingPad())
+ AddCatchInfo(I, MMI, CurMBB);
+ else {
+#ifndef NDEBUG
+ FuncInfo.CatchInfoLost.insert(&I);
+#endif
+ // FIXME: Mark exception selector register as live in. Hack for PR1508.
+ unsigned Reg = TLI.getExceptionSelectorRegister();
+ if (Reg) CurMBB->addLiveIn(Reg);
+ }
+
+ // Insert the EHSELECTION instruction.
+ SDVTList VTs = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[2];
+ Ops[0] = getValue(I.getOperand(1));
+ Ops[1] = getRoot();
+ SDValue Op = DAG.getNode(ISD::EHSELECTION, dl, VTs, Ops, 2);
+ setValue(&I, Op);
+ DAG.setRoot(Op.getValue(1));
+ } else {
+ setValue(&I, DAG.getConstant(0, VT));
+ }
+
+ return 0;
+ }
+
+ case Intrinsic::eh_typeid_for_i32:
+ case Intrinsic::eh_typeid_for_i64: {
+ MachineModuleInfo *MMI = DAG.getMachineModuleInfo();
+ MVT VT = (Intrinsic == Intrinsic::eh_typeid_for_i32 ?
+ MVT::i32 : MVT::i64);
+
+ if (MMI) {
+ // Find the type id for the given typeinfo.
+ GlobalVariable *GV = ExtractTypeInfo(I.getOperand(1));
+
+ unsigned TypeID = MMI->getTypeIDFor(GV);
+ setValue(&I, DAG.getConstant(TypeID, VT));
+ } else {
+      // Return something different from what eh_selector returns.
+ setValue(&I, DAG.getConstant(1, VT));
+ }
+
+ return 0;
+ }
+
+ case Intrinsic::eh_return_i32:
+ case Intrinsic::eh_return_i64:
+ if (MachineModuleInfo *MMI = DAG.getMachineModuleInfo()) {
+ MMI->setCallsEHReturn(true);
+ DAG.setRoot(DAG.getNode(ISD::EH_RETURN, dl,
+ MVT::Other,
+ getControlRoot(),
+ getValue(I.getOperand(1)),
+ getValue(I.getOperand(2))));
+ } else {
+ setValue(&I, DAG.getConstant(0, TLI.getPointerTy()));
+ }
+
+ return 0;
+ case Intrinsic::eh_unwind_init:
+ if (MachineModuleInfo *MMI = DAG.getMachineModuleInfo()) {
+ MMI->setCallsUnwindInit(true);
+ }
+
+ return 0;
+
+ case Intrinsic::eh_dwarf_cfa: {
+ MVT VT = getValue(I.getOperand(1)).getValueType();
+ SDValue CfaArg;
+ if (VT.bitsGT(TLI.getPointerTy()))
+ CfaArg = DAG.getNode(ISD::TRUNCATE, dl,
+ TLI.getPointerTy(), getValue(I.getOperand(1)));
+ else
+ CfaArg = DAG.getNode(ISD::SIGN_EXTEND, dl,
+ TLI.getPointerTy(), getValue(I.getOperand(1)));
+
+ SDValue Offset = DAG.getNode(ISD::ADD, dl,
+ TLI.getPointerTy(),
+ DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, dl,
+ TLI.getPointerTy()),
+ CfaArg);
+ setValue(&I, DAG.getNode(ISD::ADD, dl,
+ TLI.getPointerTy(),
+ DAG.getNode(ISD::FRAMEADDR, dl,
+ TLI.getPointerTy(),
+ DAG.getConstant(0,
+ TLI.getPointerTy())),
+ Offset));
+ return 0;
+ }
+
+ case Intrinsic::convertff:
+ case Intrinsic::convertfsi:
+ case Intrinsic::convertfui:
+ case Intrinsic::convertsif:
+ case Intrinsic::convertuif:
+ case Intrinsic::convertss:
+ case Intrinsic::convertsu:
+ case Intrinsic::convertus:
+ case Intrinsic::convertuu: {
+ ISD::CvtCode Code = ISD::CVT_INVALID;
+ switch (Intrinsic) {
+ case Intrinsic::convertff: Code = ISD::CVT_FF; break;
+ case Intrinsic::convertfsi: Code = ISD::CVT_FS; break;
+ case Intrinsic::convertfui: Code = ISD::CVT_FU; break;
+ case Intrinsic::convertsif: Code = ISD::CVT_SF; break;
+ case Intrinsic::convertuif: Code = ISD::CVT_UF; break;
+ case Intrinsic::convertss: Code = ISD::CVT_SS; break;
+ case Intrinsic::convertsu: Code = ISD::CVT_SU; break;
+ case Intrinsic::convertus: Code = ISD::CVT_US; break;
+ case Intrinsic::convertuu: Code = ISD::CVT_UU; break;
+ }
+ MVT DestVT = TLI.getValueType(I.getType());
+ Value* Op1 = I.getOperand(1);
+ setValue(&I, DAG.getConvertRndSat(DestVT, getCurDebugLoc(), getValue(Op1),
+ DAG.getValueType(DestVT),
+ DAG.getValueType(getValue(Op1).getValueType()),
+ getValue(I.getOperand(2)),
+ getValue(I.getOperand(3)),
+ Code));
+ return 0;
+ }
+
+ case Intrinsic::sqrt:
+ setValue(&I, DAG.getNode(ISD::FSQRT, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1))));
+ return 0;
+ case Intrinsic::powi:
+ setValue(&I, DAG.getNode(ISD::FPOWI, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1)),
+ getValue(I.getOperand(2))));
+ return 0;
+ case Intrinsic::sin:
+ setValue(&I, DAG.getNode(ISD::FSIN, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1))));
+ return 0;
+ case Intrinsic::cos:
+ setValue(&I, DAG.getNode(ISD::FCOS, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1))));
+ return 0;
+ case Intrinsic::log:
+ visitLog(I);
+ return 0;
+ case Intrinsic::log2:
+ visitLog2(I);
+ return 0;
+ case Intrinsic::log10:
+ visitLog10(I);
+ return 0;
+ case Intrinsic::exp:
+ visitExp(I);
+ return 0;
+ case Intrinsic::exp2:
+ visitExp2(I);
+ return 0;
+ case Intrinsic::pow:
+ visitPow(I);
+ return 0;
+ case Intrinsic::pcmarker: {
+ SDValue Tmp = getValue(I.getOperand(1));
+ DAG.setRoot(DAG.getNode(ISD::PCMARKER, dl, MVT::Other, getRoot(), Tmp));
+ return 0;
+ }
+ case Intrinsic::readcyclecounter: {
+ SDValue Op = getRoot();
+ SDValue Tmp = DAG.getNode(ISD::READCYCLECOUNTER, dl,
+ DAG.getVTList(MVT::i64, MVT::Other),
+ &Op, 1);
+ setValue(&I, Tmp);
+ DAG.setRoot(Tmp.getValue(1));
+ return 0;
+ }
+ case Intrinsic::part_select: {
+ // Currently not implemented: just abort
+ assert(0 && "part_select intrinsic not implemented");
+ abort();
+ }
+ case Intrinsic::part_set: {
+ // Currently not implemented: just abort
+ assert(0 && "part_set intrinsic not implemented");
+ abort();
+ }
+ case Intrinsic::bswap:
+ setValue(&I, DAG.getNode(ISD::BSWAP, dl,
+ getValue(I.getOperand(1)).getValueType(),
+ getValue(I.getOperand(1))));
+ return 0;
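+  // The bit-counting intrinsics map 1-1 onto ISD nodes; e.g. (sketch)
+  //   %n = call i32 @llvm.ctpop.i32(i32 %x)
+  // becomes an ISD::CTPOP node typed to match its operand.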
+ case Intrinsic::cttz: {
+ SDValue Arg = getValue(I.getOperand(1));
+ MVT Ty = Arg.getValueType();
+ SDValue result = DAG.getNode(ISD::CTTZ, dl, Ty, Arg);
+ setValue(&I, result);
+ return 0;
+ }
+ case Intrinsic::ctlz: {
+ SDValue Arg = getValue(I.getOperand(1));
+ MVT Ty = Arg.getValueType();
+ SDValue result = DAG.getNode(ISD::CTLZ, dl, Ty, Arg);
+ setValue(&I, result);
+ return 0;
+ }
+ case Intrinsic::ctpop: {
+ SDValue Arg = getValue(I.getOperand(1));
+ MVT Ty = Arg.getValueType();
+ SDValue result = DAG.getNode(ISD::CTPOP, dl, Ty, Arg);
+ setValue(&I, result);
+ return 0;
+ }
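+  // llvm.stacksave/llvm.stackrestore bracket dynamic stack allocation; e.g.
+  //   %sp = call i8* @llvm.stacksave()
+  //   ...
+  //   call void @llvm.stackrestore(i8* %sp)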
+ case Intrinsic::stacksave: {
+ SDValue Op = getRoot();
+ SDValue Tmp = DAG.getNode(ISD::STACKSAVE, dl,
+ DAG.getVTList(TLI.getPointerTy(), MVT::Other), &Op, 1);
+ setValue(&I, Tmp);
+ DAG.setRoot(Tmp.getValue(1));
+ return 0;
+ }
+ case Intrinsic::stackrestore: {
+ SDValue Tmp = getValue(I.getOperand(1));
+ DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, dl, MVT::Other, getRoot(), Tmp));
+ return 0;
+ }
+ case Intrinsic::stackprotector: {
+ // Emit code into the DAG to store the stack guard onto the stack.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MVT PtrTy = TLI.getPointerTy();
+
+ SDValue Src = getValue(I.getOperand(1)); // The guard's value.
+ AllocaInst *Slot = cast<AllocaInst>(I.getOperand(2));
+
+ int FI = FuncInfo.StaticAllocaMap[Slot];
+ MFI->setStackProtectorIndex(FI);
+
+ SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+
+ // Store the stack protector onto the stack.
+ SDValue Result = DAG.getStore(getRoot(), getCurDebugLoc(), Src, FIN,
+ PseudoSourceValue::getFixedStack(FI),
+ 0, true);
+ setValue(&I, Result);
+ DAG.setRoot(Result);
+ return 0;
+ }
+ case Intrinsic::var_annotation:
+ // Discard annotate attributes
+ return 0;
+
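+  // Rough usage sketch:
+  //   %p = call i8* @llvm.init.trampoline(i8* %tramp, i8* %func, i8* %nval)
+  // where %func must strip down to a Function, as the cast<Function> below
+  // requires.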
+ case Intrinsic::init_trampoline: {
+ const Function *F = cast<Function>(I.getOperand(2)->stripPointerCasts());
+
+ SDValue Ops[6];
+ Ops[0] = getRoot();
+ Ops[1] = getValue(I.getOperand(1));
+ Ops[2] = getValue(I.getOperand(2));
+ Ops[3] = getValue(I.getOperand(3));
+ Ops[4] = DAG.getSrcValue(I.getOperand(1));
+ Ops[5] = DAG.getSrcValue(F);
+
+ SDValue Tmp = DAG.getNode(ISD::TRAMPOLINE, dl,
+ DAG.getVTList(TLI.getPointerTy(), MVT::Other),
+ Ops, 6);
+
+ setValue(&I, Tmp);
+ DAG.setRoot(Tmp.getValue(1));
+ return 0;
+ }
+
+ case Intrinsic::gcroot:
+ if (GFI) {
+ Value *Alloca = I.getOperand(1);
+ Constant *TypeMap = cast<Constant>(I.getOperand(2));
+
+ FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
+ GFI->addStackRoot(FI->getIndex(), TypeMap);
+ }
+ return 0;
+
+ case Intrinsic::gcread:
+ case Intrinsic::gcwrite:
+ assert(0 && "GC failed to lower gcread/gcwrite intrinsics!");
+ return 0;
+
+ case Intrinsic::flt_rounds: {
+ setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, dl, MVT::i32));
+ return 0;
+ }
+
+ case Intrinsic::trap: {
+    DAG.setRoot(DAG.getNode(ISD::TRAP, dl, MVT::Other, getRoot()));
+ return 0;
+ }
+
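+  // Each *.with.overflow intrinsic returns a {result, overflow} pair, e.g.
+  //   %s = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+  // and maps onto the corresponding ISD node below.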
+ case Intrinsic::uadd_with_overflow:
+ return implVisitAluOverflow(I, ISD::UADDO);
+ case Intrinsic::sadd_with_overflow:
+ return implVisitAluOverflow(I, ISD::SADDO);
+ case Intrinsic::usub_with_overflow:
+ return implVisitAluOverflow(I, ISD::USUBO);
+ case Intrinsic::ssub_with_overflow:
+ return implVisitAluOverflow(I, ISD::SSUBO);
+ case Intrinsic::umul_with_overflow:
+ return implVisitAluOverflow(I, ISD::UMULO);
+ case Intrinsic::smul_with_overflow:
+ return implVisitAluOverflow(I, ISD::SMULO);
+
+ case Intrinsic::prefetch: {
+ SDValue Ops[4];
+ Ops[0] = getRoot();
+ Ops[1] = getValue(I.getOperand(1));
+ Ops[2] = getValue(I.getOperand(2));
+ Ops[3] = getValue(I.getOperand(3));
+ DAG.setRoot(DAG.getNode(ISD::PREFETCH, dl, MVT::Other, &Ops[0], 4));
+ return 0;
+ }
+
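+  // In this revision llvm.memory.barrier takes five i1 flags (load-load,
+  // load-store, store-load, store-store, device); they become Ops[1..5].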
+ case Intrinsic::memory_barrier: {
+ SDValue Ops[6];
+ Ops[0] = getRoot();
+ for (int x = 1; x < 6; ++x)
+ Ops[x] = getValue(I.getOperand(x));
+
+ DAG.setRoot(DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, &Ops[0], 6));
+ return 0;
+ }
+ case Intrinsic::atomic_cmp_swap: {
+ SDValue Root = getRoot();
+ SDValue L =
+ DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, getCurDebugLoc(),
+ getValue(I.getOperand(2)).getValueType().getSimpleVT(),
+ Root,
+ getValue(I.getOperand(1)),
+ getValue(I.getOperand(2)),
+ getValue(I.getOperand(3)),
+ I.getOperand(1));
+ setValue(&I, L);
+ DAG.setRoot(L.getValue(1));
+ return 0;
+ }
+ case Intrinsic::atomic_load_add:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_ADD);
+ case Intrinsic::atomic_load_sub:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_SUB);
+ case Intrinsic::atomic_load_or:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_OR);
+ case Intrinsic::atomic_load_xor:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_XOR);
+ case Intrinsic::atomic_load_and:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_AND);
+ case Intrinsic::atomic_load_nand:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_NAND);
+ case Intrinsic::atomic_load_max:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_MAX);
+ case Intrinsic::atomic_load_min:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_MIN);
+ case Intrinsic::atomic_load_umin:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_UMIN);
+ case Intrinsic::atomic_load_umax:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_UMAX);
+ case Intrinsic::atomic_swap:
+ return implVisitBinaryAtomic(I, ISD::ATOMIC_SWAP);
+ }
+}
+
+
+void SelectionDAGLowering::LowerCallTo(CallSite CS, SDValue Callee,
+ bool IsTailCall,
+ MachineBasicBlock *LandingPad) {
+ const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ MachineModuleInfo *MMI = DAG.getMachineModuleInfo();
+ unsigned BeginLabel = 0, EndLabel = 0;
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Args.reserve(CS.arg_size());
+ for (CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ SDValue ArgNode = getValue(*i);
+ Entry.Node = ArgNode; Entry.Ty = (*i)->getType();
+
+ unsigned attrInd = i - CS.arg_begin() + 1;
+ Entry.isSExt = CS.paramHasAttr(attrInd, Attribute::SExt);
+ Entry.isZExt = CS.paramHasAttr(attrInd, Attribute::ZExt);
+ Entry.isInReg = CS.paramHasAttr(attrInd, Attribute::InReg);
+ Entry.isSRet = CS.paramHasAttr(attrInd, Attribute::StructRet);
+ Entry.isNest = CS.paramHasAttr(attrInd, Attribute::Nest);
+ Entry.isByVal = CS.paramHasAttr(attrInd, Attribute::ByVal);
+ Entry.Alignment = CS.getParamAlignment(attrInd);
+ Args.push_back(Entry);
+ }
+
+ if (LandingPad && MMI) {
+ // Insert a label before the invoke call to mark the try range. This can be
+ // used to detect deletion of the invoke via the MachineModuleInfo.
+ BeginLabel = MMI->NextLabelID();
+ // Both PendingLoads and PendingExports must be flushed here;
+ // this call might not return.
+ (void)getRoot();
+ DAG.setRoot(DAG.getLabel(ISD::EH_LABEL, getCurDebugLoc(),
+ getControlRoot(), BeginLabel));
+ }
+
+ std::pair<SDValue,SDValue> Result =
+ TLI.LowerCallTo(getRoot(), CS.getType(),
+ CS.paramHasAttr(0, Attribute::SExt),
+ CS.paramHasAttr(0, Attribute::ZExt), FTy->isVarArg(),
+ CS.paramHasAttr(0, Attribute::InReg),
+ CS.getCallingConv(),
+ IsTailCall && PerformTailCallOpt,
+ Callee, Args, DAG, getCurDebugLoc());
+ if (CS.getType() != Type::VoidTy)
+ setValue(CS.getInstruction(), Result.first);
+ DAG.setRoot(Result.second);
+
+ if (LandingPad && MMI) {
+ // Insert a label at the end of the invoke call to mark the try range. This
+ // can be used to detect deletion of the invoke via the MachineModuleInfo.
+ EndLabel = MMI->NextLabelID();
+ DAG.setRoot(DAG.getLabel(ISD::EH_LABEL, getCurDebugLoc(),
+ getRoot(), EndLabel));
+
+ // Inform MachineModuleInfo of range.
+ MMI->addInvoke(LandingPad, BeginLabel, EndLabel);
+ }
+}
+
+
+void SelectionDAGLowering::visitCall(CallInst &I) {
+ const char *RenameFn = 0;
+ if (Function *F = I.getCalledFunction()) {
+ if (F->isDeclaration()) {
+ const TargetIntrinsicInfo *II = TLI.getTargetMachine().getIntrinsicInfo();
+ if (II) {
+ if (unsigned IID = II->getIntrinsicID(F)) {
+ RenameFn = visitIntrinsicCall(I, IID);
+ if (!RenameFn)
+ return;
+ }
+ }
+ if (unsigned IID = F->getIntrinsicID()) {
+ RenameFn = visitIntrinsicCall(I, IID);
+ if (!RenameFn)
+ return;
+ }
+ }
+
+ // Check for well-known libc/libm calls. If the function is internal, it
+ // can't be a library call.
+ unsigned NameLen = F->getNameLen();
+ if (!F->hasLocalLinkage() && NameLen) {
+ const char *NameStr = F->getNameStart();
+ if (NameStr[0] == 'c' &&
+ ((NameLen == 8 && !strcmp(NameStr, "copysign")) ||
+ (NameLen == 9 && !strcmp(NameStr, "copysignf")))) {
+ if (I.getNumOperands() == 3 && // Basic sanity checks.
+ I.getOperand(1)->getType()->isFloatingPoint() &&
+ I.getType() == I.getOperand(1)->getType() &&
+ I.getType() == I.getOperand(2)->getType()) {
+ SDValue LHS = getValue(I.getOperand(1));
+ SDValue RHS = getValue(I.getOperand(2));
+ setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurDebugLoc(),
+ LHS.getValueType(), LHS, RHS));
+ return;
+ }
+ } else if (NameStr[0] == 'f' &&
+ ((NameLen == 4 && !strcmp(NameStr, "fabs")) ||
+ (NameLen == 5 && !strcmp(NameStr, "fabsf")) ||
+ (NameLen == 5 && !strcmp(NameStr, "fabsl")))) {
+ if (I.getNumOperands() == 2 && // Basic sanity checks.
+ I.getOperand(1)->getType()->isFloatingPoint() &&
+ I.getType() == I.getOperand(1)->getType()) {
+ SDValue Tmp = getValue(I.getOperand(1));
+ setValue(&I, DAG.getNode(ISD::FABS, getCurDebugLoc(),
+ Tmp.getValueType(), Tmp));
+ return;
+ }
+ } else if (NameStr[0] == 's' &&
+ ((NameLen == 3 && !strcmp(NameStr, "sin")) ||
+ (NameLen == 4 && !strcmp(NameStr, "sinf")) ||
+ (NameLen == 4 && !strcmp(NameStr, "sinl")))) {
+ if (I.getNumOperands() == 2 && // Basic sanity checks.
+ I.getOperand(1)->getType()->isFloatingPoint() &&
+ I.getType() == I.getOperand(1)->getType()) {
+ SDValue Tmp = getValue(I.getOperand(1));
+ setValue(&I, DAG.getNode(ISD::FSIN, getCurDebugLoc(),
+ Tmp.getValueType(), Tmp));
+ return;
+ }
+ } else if (NameStr[0] == 'c' &&
+ ((NameLen == 3 && !strcmp(NameStr, "cos")) ||
+ (NameLen == 4 && !strcmp(NameStr, "cosf")) ||
+ (NameLen == 4 && !strcmp(NameStr, "cosl")))) {
+ if (I.getNumOperands() == 2 && // Basic sanity checks.
+ I.getOperand(1)->getType()->isFloatingPoint() &&
+ I.getType() == I.getOperand(1)->getType()) {
+ SDValue Tmp = getValue(I.getOperand(1));
+ setValue(&I, DAG.getNode(ISD::FCOS, getCurDebugLoc(),
+ Tmp.getValueType(), Tmp));
+ return;
+ }
+ }
+ }
+ } else if (isa<InlineAsm>(I.getOperand(0))) {
+ visitInlineAsm(&I);
+ return;
+ }
+
+ SDValue Callee;
+ if (!RenameFn)
+ Callee = getValue(I.getOperand(0));
+ else
+ Callee = DAG.getExternalSymbol(RenameFn, TLI.getPointerTy());
+
+ LowerCallTo(&I, Callee, I.isTailCall());
+}
+
+
+/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
+/// this value and returns the result as a ValueVT value. This uses
+/// Chain/Flag as the input and updates them for the output Chain/Flag.
+/// If the Flag pointer is NULL, no flag is used.
+SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, DebugLoc dl,
+ SDValue &Chain,
+ SDValue *Flag) const {
+ // Assemble the legal parts into the final values.
+ SmallVector<SDValue, 4> Values(ValueVTs.size());
+ SmallVector<SDValue, 8> Parts;
+ for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ // Copy the legal parts from the registers.
+ MVT ValueVT = ValueVTs[Value];
+ unsigned NumRegs = TLI->getNumRegisters(ValueVT);
+ MVT RegisterVT = RegVTs[Value];
+
+ Parts.resize(NumRegs);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ SDValue P;
+ if (Flag == 0)
+ P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT);
+ else {
+ P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag);
+ *Flag = P.getValue(2);
+ }
+ Chain = P.getValue(1);
+
+ // If the source register was virtual and if we know something about it,
+ // add an assert node.
+ if (TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) &&
+ RegisterVT.isInteger() && !RegisterVT.isVector()) {
+ unsigned SlotNo = Regs[Part+i]-TargetRegisterInfo::FirstVirtualRegister;
+ FunctionLoweringInfo &FLI = DAG.getFunctionLoweringInfo();
+ if (FLI.LiveOutRegInfo.size() > SlotNo) {
+ FunctionLoweringInfo::LiveOutInfo &LOI = FLI.LiveOutRegInfo[SlotNo];
+
+ unsigned RegSize = RegisterVT.getSizeInBits();
+ unsigned NumSignBits = LOI.NumSignBits;
+ unsigned NumZeroBits = LOI.KnownZero.countLeadingOnes();
+
+ // FIXME: We capture more information than the dag can represent. For
+ // now, just use the tightest assertzext/assertsext possible.
+ bool isSExt = true;
+ MVT FromVT(MVT::Other);
+ if (NumSignBits == RegSize)
+ isSExt = true, FromVT = MVT::i1; // ASSERT SEXT 1
+ else if (NumZeroBits >= RegSize-1)
+ isSExt = false, FromVT = MVT::i1; // ASSERT ZEXT 1
+ else if (NumSignBits > RegSize-8)
+ isSExt = true, FromVT = MVT::i8; // ASSERT SEXT 8
+ else if (NumZeroBits >= RegSize-8)
+ isSExt = false, FromVT = MVT::i8; // ASSERT ZEXT 8
+ else if (NumSignBits > RegSize-16)
+ isSExt = true, FromVT = MVT::i16; // ASSERT SEXT 16
+ else if (NumZeroBits >= RegSize-16)
+ isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16
+ else if (NumSignBits > RegSize-32)
+ isSExt = true, FromVT = MVT::i32; // ASSERT SEXT 32
+ else if (NumZeroBits >= RegSize-32)
+ isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32
+
+ if (FromVT != MVT::Other) {
+ P = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
+ RegisterVT, P, DAG.getValueType(FromVT));
+ }
+ }
+ }
+
+ Parts[i] = P;
+ }
+
+ Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(),
+ NumRegs, RegisterVT, ValueVT);
+ Part += NumRegs;
+ Parts.clear();
+ }
+
+ return DAG.getNode(ISD::MERGE_VALUES, dl,
+ DAG.getVTList(&ValueVTs[0], ValueVTs.size()),
+ &Values[0], ValueVTs.size());
+}
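+
+// E.g. (sketch): an i64 value living in two i32 registers on a 32-bit target
+// yields two CopyFromReg nodes above, which getCopyFromParts glues back into
+// a single i64 before MERGE_VALUES combines the results.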
+
+/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
+/// specified value into the registers specified by this object. This uses
+/// Chain/Flag as the input and updates them for the output Chain/Flag.
+/// If the Flag pointer is NULL, no flag is used.
+void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+ SDValue &Chain, SDValue *Flag) const {
+  // Get the list of the value's legal parts.
+ unsigned NumRegs = Regs.size();
+ SmallVector<SDValue, 8> Parts(NumRegs);
+ for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ MVT ValueVT = ValueVTs[Value];
+ unsigned NumParts = TLI->getNumRegisters(ValueVT);
+ MVT RegisterVT = RegVTs[Value];
+
+ getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value),
+ &Parts[Part], NumParts, RegisterVT);
+ Part += NumParts;
+ }
+
+ // Copy the parts into the registers.
+ SmallVector<SDValue, 8> Chains(NumRegs);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ SDValue Part;
+ if (Flag == 0)
+ Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]);
+ else {
+ Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag);
+ *Flag = Part.getValue(1);
+ }
+ Chains[i] = Part.getValue(0);
+ }
+
+ if (NumRegs == 1 || Flag)
+    // If NumRegs > 1 and Flag is used, then the use of the last CopyToReg is
+    // flagged to it. That is, the CopyToReg nodes and the user are considered
+    // a single scheduling unit. If we create a TokenFactor and return it as
+    // the chain, then the TokenFactor is both a predecessor (operand) of the
+ // user as well as a successor (the TF operands are flagged to the user).
+ // c1, f1 = CopyToReg
+ // c2, f2 = CopyToReg
+ // c3 = TokenFactor c1, c2
+ // ...
+ // = op c3, ..., f2
+ Chain = Chains[NumRegs-1];
+ else
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], NumRegs);
+}
+
+/// AddInlineAsmOperands - Add this value to the specified inlineasm node
+/// operand list. This adds the code marker and includes the number of
+/// values added into it.
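+/// The flag word pushed below is encoded as (matching the shifts used):
+/// bits 0-2 hold the operand code, bits 3-15 the number of registers, and,
+/// when HasMatching is set, bit 31 plus the matching index in bits 16-30.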
+void RegsForValue::AddInlineAsmOperands(unsigned Code,
+                                        bool HasMatching, unsigned MatchingIdx,
+ SelectionDAG &DAG,
+ std::vector<SDValue> &Ops) const {
+ MVT IntPtrTy = DAG.getTargetLoweringInfo().getPointerTy();
+ assert(Regs.size() < (1 << 13) && "Too many inline asm outputs!");
+ unsigned Flag = Code | (Regs.size() << 3);
+ if (HasMatching)
+ Flag |= 0x80000000 | (MatchingIdx << 16);
+ Ops.push_back(DAG.getTargetConstant(Flag, IntPtrTy));
+ for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ unsigned NumRegs = TLI->getNumRegisters(ValueVTs[Value]);
+ MVT RegisterVT = RegVTs[Value];
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ assert(Reg < Regs.size() && "Mismatch in # registers expected");
+ Ops.push_back(DAG.getRegister(Regs[Reg++], RegisterVT));
+ }
+ }
+}
+
+/// isAllocatableRegister - If the specified register is safe to allocate,
+/// i.e. it isn't a stack pointer or some other special register, return the
+/// register class for the register. Otherwise, return null.
+static const TargetRegisterClass *
+isAllocatableRegister(unsigned Reg, MachineFunction &MF,
+ const TargetLowering &TLI,
+ const TargetRegisterInfo *TRI) {
+ MVT FoundVT = MVT::Other;
+ const TargetRegisterClass *FoundRC = 0;
+ for (TargetRegisterInfo::regclass_iterator RCI = TRI->regclass_begin(),
+ E = TRI->regclass_end(); RCI != E; ++RCI) {
+ MVT ThisVT = MVT::Other;
+
+ const TargetRegisterClass *RC = *RCI;
+    // If none of the value types for this register class are valid, we
+ // can't use it. For example, 64-bit reg classes on 32-bit targets.
+ for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
+ I != E; ++I) {
+ if (TLI.isTypeLegal(*I)) {
+ // If we have already found this register in a different register class,
+ // choose the one with the largest VT specified. For example, on
+ // PowerPC, we favor f64 register classes over f32.
+ if (FoundVT == MVT::Other || FoundVT.bitsLT(*I)) {
+ ThisVT = *I;
+ break;
+ }
+ }
+ }
+
+ if (ThisVT == MVT::Other) continue;
+
+    // NOTE: This isn't ideal. In particular, this might allocate the
+    // frame pointer in functions that need it (the frame pointer is only
+    // taken out of the allocation order once a variable sized allocation
+    // has been seen). This is a slight code pessimization, but should
+    // still work.
+ for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF),
+ E = RC->allocation_order_end(MF); I != E; ++I)
+ if (*I == Reg) {
+ // We found a matching register class. Keep looking at others in case
+ // we find one with larger registers that this physreg is also in.
+ FoundRC = RC;
+ FoundVT = ThisVT;
+ break;
+ }
+ }
+ return FoundRC;
+}
+
+
+namespace llvm {
+/// AsmOperandInfo - This contains information for each constraint that we are
+/// lowering.
+class VISIBILITY_HIDDEN SDISelAsmOperandInfo :
+ public TargetLowering::AsmOperandInfo {
+public:
+ /// CallOperand - If this is the result output operand or a clobber
+ /// this is null, otherwise it is the incoming operand to the CallInst.
+ /// This gets modified as the asm is processed.
+ SDValue CallOperand;
+
+ /// AssignedRegs - If this is a register or register class operand, this
+  /// contains the set of registers corresponding to the operand.
+ RegsForValue AssignedRegs;
+
+ explicit SDISelAsmOperandInfo(const InlineAsm::ConstraintInfo &info)
+ : TargetLowering::AsmOperandInfo(info), CallOperand(0,0) {
+ }
+
+ /// MarkAllocatedRegs - Once AssignedRegs is set, mark the assigned registers
+ /// busy in OutputRegs/InputRegs.
+ void MarkAllocatedRegs(bool isOutReg, bool isInReg,
+ std::set<unsigned> &OutputRegs,
+ std::set<unsigned> &InputRegs,
+ const TargetRegisterInfo &TRI) const {
+ if (isOutReg) {
+ for (unsigned i = 0, e = AssignedRegs.Regs.size(); i != e; ++i)
+ MarkRegAndAliases(AssignedRegs.Regs[i], OutputRegs, TRI);
+ }
+ if (isInReg) {
+ for (unsigned i = 0, e = AssignedRegs.Regs.size(); i != e; ++i)
+ MarkRegAndAliases(AssignedRegs.Regs[i], InputRegs, TRI);
+ }
+ }
+
+ /// getCallOperandValMVT - Return the MVT of the Value* that this operand
+ /// corresponds to. If there is no Value* for this operand, it returns
+ /// MVT::Other.
+ MVT getCallOperandValMVT(const TargetLowering &TLI,
+ const TargetData *TD) const {
+ if (CallOperandVal == 0) return MVT::Other;
+
+ if (isa<BasicBlock>(CallOperandVal))
+ return TLI.getPointerTy();
+
+ const llvm::Type *OpTy = CallOperandVal->getType();
+
+ // If this is an indirect operand, the operand is a pointer to the
+ // accessed type.
+ if (isIndirect)
+ OpTy = cast<PointerType>(OpTy)->getElementType();
+
+ // If OpTy is not a single value, it may be a struct/union that we
+ // can tile with integers.
+ if (!OpTy->isSingleValueType() && OpTy->isSized()) {
+ unsigned BitSize = TD->getTypeSizeInBits(OpTy);
+ switch (BitSize) {
+ default: break;
+ case 1:
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ case 128:
+ OpTy = IntegerType::get(BitSize);
+ break;
+ }
+ }
+
+ return TLI.getValueType(OpTy, true);
+ }
+
+private:
+ /// MarkRegAndAliases - Mark the specified register and all aliases in the
+ /// specified set.
+ static void MarkRegAndAliases(unsigned Reg, std::set<unsigned> &Regs,
+ const TargetRegisterInfo &TRI) {
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "Isn't a physreg");
+ Regs.insert(Reg);
+ if (const unsigned *Aliases = TRI.getAliasSet(Reg))
+ for (; *Aliases; ++Aliases)
+ Regs.insert(*Aliases);
+ }
+};
+} // end llvm namespace.
+
+
+/// GetRegistersForValue - Assign registers (virtual or physical) for the
+/// specified operand. We prefer to assign virtual registers, to allow the
+/// register allocator to handle the assignment process. However, if the asm uses
+/// features that we can't model on machineinstrs, we have SDISel do the
+/// allocation. This produces generally horrible, but correct, code.
+///
+/// OpInfo describes the operand.
+/// Input and OutputRegs are the set of already allocated physical registers.
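+/// For example, a "{ax}" constraint names one specific physreg, an "r"
+/// constraint picks any register in a class, and constraints with no
+/// corresponding LLVM register class fall through to the
+/// consecutive-allocation loop at the bottom.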
+///
+void SelectionDAGLowering::
+GetRegistersForValue(SDISelAsmOperandInfo &OpInfo,
+ std::set<unsigned> &OutputRegs,
+ std::set<unsigned> &InputRegs) {
+ // Compute whether this value requires an input register, an output register,
+ // or both.
+ bool isOutReg = false;
+ bool isInReg = false;
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ isOutReg = true;
+
+ // If there is an input constraint that matches this, we need to reserve
+ // the input register so no other inputs allocate to it.
+ isInReg = OpInfo.hasMatchingInput();
+ break;
+ case InlineAsm::isInput:
+ isInReg = true;
+ isOutReg = false;
+ break;
+ case InlineAsm::isClobber:
+ isOutReg = true;
+ isInReg = true;
+ break;
+ }
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ SmallVector<unsigned, 4> Regs;
+
+ // If this is a constraint for a single physreg, or a constraint for a
+ // register class, find it.
+ std::pair<unsigned, const TargetRegisterClass*> PhysReg =
+ TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
+
+ unsigned NumRegs = 1;
+ if (OpInfo.ConstraintVT != MVT::Other) {
+    // If this is an FP input in an integer register (or vice versa), insert a bit
+ // cast of the input value. More generally, handle any case where the input
+ // value disagrees with the register class we plan to stick this in.
+ if (OpInfo.Type == InlineAsm::isInput &&
+ PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) {
+ // Try to convert to the first MVT that the reg class contains. If the
+      // types are of identical size, use a bitcast to convert (e.g. two differing
+ // vector types).
+ MVT RegVT = *PhysReg.second->vt_begin();
+ if (RegVT.getSizeInBits() == OpInfo.ConstraintVT.getSizeInBits()) {
+ OpInfo.CallOperand = DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(),
+ RegVT, OpInfo.CallOperand);
+ OpInfo.ConstraintVT = RegVT;
+ } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
+ // If the input is a FP value and we want it in FP registers, do a
+ // bitcast to the corresponding integer type. This turns an f64 value
+ // into i64, which can be passed with two i32 values on a 32-bit
+ // machine.
+ RegVT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits());
+ OpInfo.CallOperand = DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(),
+ RegVT, OpInfo.CallOperand);
+ OpInfo.ConstraintVT = RegVT;
+ }
+ }
+
+ NumRegs = TLI.getNumRegisters(OpInfo.ConstraintVT);
+ }
+
+ MVT RegVT;
+ MVT ValueVT = OpInfo.ConstraintVT;
+
+ // If this is a constraint for a specific physical register, like {r17},
+ // assign it now.
+ if (unsigned AssignedReg = PhysReg.first) {
+ const TargetRegisterClass *RC = PhysReg.second;
+ if (OpInfo.ConstraintVT == MVT::Other)
+ ValueVT = *RC->vt_begin();
+
+ // Get the actual register value type. This is important, because the user
+ // may have asked for (e.g.) the AX register in i32 type. We need to
+ // remember that AX is actually i16 to get the right extension.
+ RegVT = *RC->vt_begin();
+
+    // This is an explicit reference to a physical register.
+ Regs.push_back(AssignedReg);
+
+ // If this is an expanded reference, add the rest of the regs to Regs.
+ if (NumRegs != 1) {
+ TargetRegisterClass::iterator I = RC->begin();
+ for (; *I != AssignedReg; ++I)
+ assert(I != RC->end() && "Didn't find reg!");
+
+ // Already added the first reg.
+ --NumRegs; ++I;
+ for (; NumRegs; --NumRegs, ++I) {
+ assert(I != RC->end() && "Ran out of registers to allocate!");
+ Regs.push_back(*I);
+ }
+ }
+ OpInfo.AssignedRegs = RegsForValue(TLI, Regs, RegVT, ValueVT);
+ const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+ OpInfo.MarkAllocatedRegs(isOutReg, isInReg, OutputRegs, InputRegs, *TRI);
+ return;
+ }
+
+ // Otherwise, if this was a reference to an LLVM register class, create vregs
+ // for this reference.
+ if (const TargetRegisterClass *RC = PhysReg.second) {
+ RegVT = *RC->vt_begin();
+ if (OpInfo.ConstraintVT == MVT::Other)
+ ValueVT = RegVT;
+
+ // Create the appropriate number of virtual registers.
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ for (; NumRegs; --NumRegs)
+ Regs.push_back(RegInfo.createVirtualRegister(RC));
+
+ OpInfo.AssignedRegs = RegsForValue(TLI, Regs, RegVT, ValueVT);
+ return;
+ }
+
+ // This is a reference to a register class that doesn't directly correspond
+  // to an LLVM register class. Allocate NumRegs consecutive, available
+ // registers from the class.
+ std::vector<unsigned> RegClassRegs
+ = TLI.getRegClassForInlineAsmConstraint(OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
+
+ const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+ unsigned NumAllocated = 0;
+ for (unsigned i = 0, e = RegClassRegs.size(); i != e; ++i) {
+ unsigned Reg = RegClassRegs[i];
+ // See if this register is available.
+ if ((isOutReg && OutputRegs.count(Reg)) || // Already used.
+ (isInReg && InputRegs.count(Reg))) { // Already used.
+ // Make sure we find consecutive registers.
+ NumAllocated = 0;
+ continue;
+ }
+
+ // Check to see if this register is allocatable (i.e. don't give out the
+ // stack pointer).
+ const TargetRegisterClass *RC = isAllocatableRegister(Reg, MF, TLI, TRI);
+ if (!RC) { // Couldn't allocate this register.
+ // Reset NumAllocated to make sure we return consecutive registers.
+ NumAllocated = 0;
+ continue;
+ }
+
+ // Okay, this register is good, we can use it.
+ ++NumAllocated;
+
+ // If we allocated enough consecutive registers, succeed.
+ if (NumAllocated == NumRegs) {
+ unsigned RegStart = (i-NumAllocated)+1;
+ unsigned RegEnd = i+1;
+ // Mark all of the allocated registers used.
+ for (unsigned i = RegStart; i != RegEnd; ++i)
+ Regs.push_back(RegClassRegs[i]);
+
+ OpInfo.AssignedRegs = RegsForValue(TLI, Regs, *RC->vt_begin(),
+ OpInfo.ConstraintVT);
+ OpInfo.MarkAllocatedRegs(isOutReg, isInReg, OutputRegs, InputRegs, *TRI);
+ return;
+ }
+ }
+
+ // Otherwise, we couldn't allocate enough registers for this.
+}
+
+/// hasInlineAsmMemConstraint - Return true if the inline asm instruction being
+/// processed uses a memory 'm' constraint.
+static bool
+hasInlineAsmMemConstraint(std::vector<InlineAsm::ConstraintInfo> &CInfos,
+ const TargetLowering &TLI) {
+ for (unsigned i = 0, e = CInfos.size(); i != e; ++i) {
+ InlineAsm::ConstraintInfo &CI = CInfos[i];
+ for (unsigned j = 0, ee = CI.Codes.size(); j != ee; ++j) {
+ TargetLowering::ConstraintType CType = TLI.getConstraintType(CI.Codes[j]);
+ if (CType == TargetLowering::C_Memory)
+ return true;
+ }
+
+    // Indirect operands access memory.
+ if (CI.isIndirect)
+ return true;
+ }
+
+ return false;
+}
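+
+// E.g. (sketch): call void asm "movl $$0, $0", "=*m"(i32* %p) uses an 'm'
+// constraint, and any indirect operand likewise counts as touching memory.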
+
+/// visitInlineAsm - Handle a call to an InlineAsm object.
+///
+void SelectionDAGLowering::visitInlineAsm(CallSite CS) {
+ InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+
+ /// ConstraintOperands - Information about all of the constraints.
+ std::vector<SDISelAsmOperandInfo> ConstraintOperands;
+
+ std::set<unsigned> OutputRegs, InputRegs;
+
+ // Do a prepass over the constraints, canonicalizing them, and building up the
+ // ConstraintOperands list.
+ std::vector<InlineAsm::ConstraintInfo>
+ ConstraintInfos = IA->ParseConstraints();
+
+ bool hasMemory = hasInlineAsmMemConstraint(ConstraintInfos, TLI);
+
+ SDValue Chain, Flag;
+
+ // We won't need to flush pending loads if this asm doesn't touch
+ // memory and is nonvolatile.
+ if (hasMemory || IA->hasSideEffects())
+ Chain = getRoot();
+ else
+ Chain = DAG.getRoot();
+
+ unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
+ unsigned ResNo = 0; // ResNo - The result number of the next output.
+ for (unsigned i = 0, e = ConstraintInfos.size(); i != e; ++i) {
+ ConstraintOperands.push_back(SDISelAsmOperandInfo(ConstraintInfos[i]));
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+ MVT OpVT = MVT::Other;
+
+ // Compute the value type for each operand.
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ // Indirect outputs just consume an argument.
+ if (OpInfo.isIndirect) {
+ OpInfo.CallOperandVal = CS.getArgument(ArgNo++);
+ break;
+ }
+
+ // The return value of the call is this value. As such, there is no
+ // corresponding argument.
+ assert(CS.getType() != Type::VoidTy && "Bad inline asm!");
+ if (const StructType *STy = dyn_cast<StructType>(CS.getType())) {
+ OpVT = TLI.getValueType(STy->getElementType(ResNo));
+ } else {
+ assert(ResNo == 0 && "Asm only has one result!");
+ OpVT = TLI.getValueType(CS.getType());
+ }
+ ++ResNo;
+ break;
+ case InlineAsm::isInput:
+ OpInfo.CallOperandVal = CS.getArgument(ArgNo++);
+ break;
+ case InlineAsm::isClobber:
+ // Nothing to do.
+ break;
+ }
+
+ // If this is an input or an indirect output, process the call argument.
+    // BasicBlocks are labels, currently appearing only in asms.
+ if (OpInfo.CallOperandVal) {
+ if (BasicBlock *BB = dyn_cast<BasicBlock>(OpInfo.CallOperandVal)) {
+ OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
+ } else {
+ OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
+ }
+
+ OpVT = OpInfo.getCallOperandValMVT(TLI, TD);
+ }
+
+ OpInfo.ConstraintVT = OpVT;
+ }
+
+ // Second pass over the constraints: compute which constraint option to use
+ // and assign registers to constraints that want a specific physreg.
+ for (unsigned i = 0, e = ConstraintInfos.size(); i != e; ++i) {
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+ // If this is an output operand with a matching input operand, look up the
+ // matching input. If their types mismatch, e.g. one is an integer, the
+ // other is floating point, or their sizes are different, flag it as an
+ // error.
+ if (OpInfo.hasMatchingInput()) {
+ SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+ if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+ if ((OpInfo.ConstraintVT.isInteger() !=
+ Input.ConstraintVT.isInteger()) ||
+ (OpInfo.ConstraintVT.getSizeInBits() !=
+ Input.ConstraintVT.getSizeInBits())) {
+ cerr << "llvm: error: Unsupported asm: input constraint with a "
+ << "matching output constraint of incompatible type!\n";
+ exit(1);
+ }
+ Input.ConstraintVT = OpInfo.ConstraintVT;
+ }
+ }
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, hasMemory, &DAG);
+
+ // If this is a memory input, and if the operand is not indirect, do what we
+    // need to provide an address for the memory input.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+ !OpInfo.isIndirect) {
+ assert(OpInfo.Type == InlineAsm::isInput &&
+ "Can only indirectify direct input operands!");
+
+ // Memory operands really want the address of the value. If we don't have
+      // an indirect input, put it in the constant pool if we can, otherwise spill
+ // it to a stack slot.
+
+ // If the operand is a float, integer, or vector constant, spill to a
+ // constant pool entry to get its address.
+ Value *OpVal = OpInfo.CallOperandVal;
+ if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
+ isa<ConstantVector>(OpVal)) {
+ OpInfo.CallOperand = DAG.getConstantPool(cast<Constant>(OpVal),
+ TLI.getPointerTy());
+ } else {
+ // Otherwise, create a stack slot and emit a store to it before the
+ // asm.
+ const Type *Ty = OpVal->getType();
+ uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
+ unsigned Align = TLI.getTargetData()->getPrefTypeAlignment(Ty);
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy());
+ Chain = DAG.getStore(Chain, getCurDebugLoc(),
+ OpInfo.CallOperand, StackSlot, NULL, 0);
+ OpInfo.CallOperand = StackSlot;
+ }
+
+ // There is no longer a Value* corresponding to this operand.
+ OpInfo.CallOperandVal = 0;
+ // It is now an indirect operand.
+ OpInfo.isIndirect = true;
+ }
+
+ // If this constraint is for a specific register, allocate it before
+ // anything else.
+ if (OpInfo.ConstraintType == TargetLowering::C_Register)
+ GetRegistersForValue(OpInfo, OutputRegs, InputRegs);
+ }
+ ConstraintInfos.clear();
+
+  // Third pass: loop over all of the operands, assigning virtual or physregs
+ // to register class operands.
+ for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+ // C_Register operands have already been allocated, Other/Memory don't need
+ // to be.
+ if (OpInfo.ConstraintType == TargetLowering::C_RegisterClass)
+ GetRegistersForValue(OpInfo, OutputRegs, InputRegs);
+ }
+
+ // AsmNodeOperands - The operands for the ISD::INLINEASM node.
+ std::vector<SDValue> AsmNodeOperands;
+ AsmNodeOperands.push_back(SDValue()); // reserve space for input chain
+ AsmNodeOperands.push_back(
+ DAG.getTargetExternalSymbol(IA->getAsmString().c_str(), MVT::Other));
+
+ // Loop over all of the inputs, copying the operand values into the
+ // appropriate registers and processing the output regs.
+ RegsForValue RetValRegs;
+
+ // IndirectStoresToEmit - The set of stores to emit after the inline asm node.
+ std::vector<std::pair<RegsForValue, Value*> > IndirectStoresToEmit;
+
+ for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput: {
+ if (OpInfo.ConstraintType != TargetLowering::C_RegisterClass &&
+ OpInfo.ConstraintType != TargetLowering::C_Register) {
+ // Memory output, or 'other' output (e.g. 'X' constraint).
+ assert(OpInfo.isIndirect && "Memory output must be indirect operand");
+
+ // Add information to the INLINEASM node to know about this output.
+ unsigned ResOpType = 4/*MEM*/ | (1<<3);
+ AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
+ TLI.getPointerTy()));
+ AsmNodeOperands.push_back(OpInfo.CallOperand);
+ break;
+ }
+
+ // Otherwise, this is a register or register class output.
+
+ // Copy the output from the appropriate register. Find a register that
+ // we can use.
+ if (OpInfo.AssignedRegs.Regs.empty()) {
+ cerr << "llvm: error: Couldn't allocate output reg for constraint '"
+ << OpInfo.ConstraintCode << "'!\n";
+ exit(1);
+ }
+
+ // If this is an indirect operand, store through the pointer after the
+ // asm.
+ if (OpInfo.isIndirect) {
+ IndirectStoresToEmit.push_back(std::make_pair(OpInfo.AssignedRegs,
+ OpInfo.CallOperandVal));
+ } else {
+ // This is the result value of the call.
+ assert(CS.getType() != Type::VoidTy && "Bad inline asm!");
+ // Concatenate this output onto the outputs list.
+ RetValRegs.append(OpInfo.AssignedRegs);
+ }
+
+ // Add information to the INLINEASM node to know that this register is
+ // set.
+ OpInfo.AssignedRegs.AddInlineAsmOperands(OpInfo.isEarlyClobber ?
+ 6 /* EARLYCLOBBER REGDEF */ :
+ 2 /* REGDEF */ ,
+ false,
+ 0,
+ DAG, AsmNodeOperands);
+ break;
+ }
+ case InlineAsm::isInput: {
+ SDValue InOperandVal = OpInfo.CallOperand;
+
+ if (OpInfo.isMatchingInputConstraint()) { // Matching constraint?
+ // If this is required to match an output register we have already set,
+ // just use its register.
+ unsigned OperandNo = OpInfo.getMatchedOperand();
+
+ // Scan until we find the definition we already emitted of this operand.
+ // When we find it, create a RegsForValue operand.
+ unsigned CurOp = 2; // The first operand.
+ for (; OperandNo; --OperandNo) {
+ // Advance to the next operand.
+ unsigned OpFlag =
+ cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
+ assert(((OpFlag & 7) == 2 /*REGDEF*/ ||
+ (OpFlag & 7) == 6 /*EARLYCLOBBER REGDEF*/ ||
+ (OpFlag & 7) == 4 /*MEM*/) &&
+ "Skipped past definitions?");
+ CurOp += InlineAsm::getNumOperandRegisters(OpFlag)+1;
+ }
+
+ unsigned OpFlag =
+ cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
+ if ((OpFlag & 7) == 2 /*REGDEF*/
+ || (OpFlag & 7) == 6 /* EARLYCLOBBER REGDEF */) {
+ // Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
+ assert(!OpInfo.isIndirect &&
+ "Don't know how to handle tied indirect register inputs yet!");
+ RegsForValue MatchedRegs;
+ MatchedRegs.TLI = &TLI;
+ MatchedRegs.ValueVTs.push_back(InOperandVal.getValueType());
+ MVT RegVT = AsmNodeOperands[CurOp+1].getValueType();
+ MatchedRegs.RegVTs.push_back(RegVT);
+ MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
+ for (unsigned i = 0, e = InlineAsm::getNumOperandRegisters(OpFlag);
+ i != e; ++i)
+ MatchedRegs.Regs.
+ push_back(RegInfo.createVirtualRegister(TLI.getRegClassFor(RegVT)));
+
+        // Use the produced MatchedRegs object to copy the input into the regs.
+ MatchedRegs.getCopyToRegs(InOperandVal, DAG, getCurDebugLoc(),
+ Chain, &Flag);
+ MatchedRegs.AddInlineAsmOperands(1 /*REGUSE*/,
+ true, OpInfo.getMatchedOperand(),
+ DAG, AsmNodeOperands);
+ break;
+ } else {
+ assert(((OpFlag & 7) == 4) && "Unknown matching constraint!");
+ assert((InlineAsm::getNumOperandRegisters(OpFlag)) == 1 &&
+ "Unexpected number of operands");
+ // Add information to the INLINEASM node to know about this input.
+ // See InlineAsm.h isUseOperandTiedToDef.
+ OpFlag |= 0x80000000 | (OpInfo.getMatchedOperand() << 16);
+ AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlag,
+ TLI.getPointerTy()));
+ AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
+ break;
+ }
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Other) {
+ assert(!OpInfo.isIndirect &&
+ "Don't know how to handle indirect other inputs yet!");
+
+ std::vector<SDValue> Ops;
+ TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode[0],
+ hasMemory, Ops, DAG);
+ if (Ops.empty()) {
+ cerr << "llvm: error: Invalid operand for inline asm constraint '"
+ << OpInfo.ConstraintCode << "'!\n";
+ exit(1);
+ }
+
+ // Add information to the INLINEASM node to know about this input.
+ unsigned ResOpType = 3 /*IMM*/ | (Ops.size() << 3);
+ AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
+ TLI.getPointerTy()));
+ AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
+ break;
+ } else if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+ assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
+ assert(InOperandVal.getValueType() == TLI.getPointerTy() &&
+ "Memory operands expect pointer values");
+
+ // Add information to the INLINEASM node to know about this input.
+ unsigned ResOpType = 4/*MEM*/ | (1<<3);
+ AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
+ TLI.getPointerTy()));
+ AsmNodeOperands.push_back(InOperandVal);
+ break;
+ }
+
+ assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
+ OpInfo.ConstraintType == TargetLowering::C_Register) &&
+ "Unknown constraint type!");
+ assert(!OpInfo.isIndirect &&
+ "Don't know how to handle indirect register inputs yet!");
+
+ // Copy the input into the appropriate registers.
+ if (OpInfo.AssignedRegs.Regs.empty()) {
+        cerr << "llvm: error: Couldn't allocate input reg for constraint '"
+ << OpInfo.ConstraintCode << "'!\n";
+ exit(1);
+ }
+
+ OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, getCurDebugLoc(),
+ Chain, &Flag);
+
+ OpInfo.AssignedRegs.AddInlineAsmOperands(1/*REGUSE*/, false, 0,
+ DAG, AsmNodeOperands);
+ break;
+ }
+ case InlineAsm::isClobber: {
+ // Add the clobbered value to the operand list, so that the register
+ // allocator is aware that the physreg got clobbered.
+ if (!OpInfo.AssignedRegs.Regs.empty())
+ OpInfo.AssignedRegs.AddInlineAsmOperands(6 /* EARLYCLOBBER REGDEF */,
+ false, 0, DAG,AsmNodeOperands);
+ break;
+ }
+ }
+ }
+
+ // Finish up input operands.
+ AsmNodeOperands[0] = Chain;
+ if (Flag.getNode()) AsmNodeOperands.push_back(Flag);
+
+ Chain = DAG.getNode(ISD::INLINEASM, getCurDebugLoc(),
+ DAG.getVTList(MVT::Other, MVT::Flag),
+ &AsmNodeOperands[0], AsmNodeOperands.size());
+ Flag = Chain.getValue(1);
+
+ // If this asm returns a register value, copy the result from that register
+ // and set it as the value of the call.
+ if (!RetValRegs.Regs.empty()) {
+ SDValue Val = RetValRegs.getCopyFromRegs(DAG, getCurDebugLoc(),
+ Chain, &Flag);
+
+ // FIXME: Why don't we do this for inline asms with MRVs?
+ if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
+ MVT ResultType = TLI.getValueType(CS.getType());
+
+ // If any of the results of the inline asm is a vector, it may have the
+ // wrong width/num elts. This can happen for register classes that can
+ // contain multiple different value types. The preg or vreg allocated may
+ // not have the same VT as was expected. Convert it to the right type
+ // with bit_convert.
+ if (ResultType != Val.getValueType() && Val.getValueType().isVector()) {
+ Val = DAG.getNode(ISD::BIT_CONVERT, getCurDebugLoc(),
+ ResultType, Val);
+ } else if (ResultType != Val.getValueType() &&
+ ResultType.isInteger() && Val.getValueType().isInteger()) {
+ // If a result value was tied to an input value, the computed result may
+ // have a wider width than the expected result. Extract the relevant
+ // portion.
+ Val = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), ResultType, Val);
+ }
+
+ assert(ResultType == Val.getValueType() && "Asm result value mismatch!");
+ }
+
+ setValue(CS.getInstruction(), Val);
+ // Don't need to use this as a chain in this case.
+ if (!IA->hasSideEffects() && !hasMemory && IndirectStoresToEmit.empty())
+ return;
+ }
+
+ std::vector<std::pair<SDValue, Value*> > StoresToEmit;
+
+ // Process indirect outputs, first output all of the flagged copies out of
+ // physregs.
+ for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) {
+ RegsForValue &OutRegs = IndirectStoresToEmit[i].first;
+ Value *Ptr = IndirectStoresToEmit[i].second;
+ SDValue OutVal = OutRegs.getCopyFromRegs(DAG, getCurDebugLoc(),
+ Chain, &Flag);
+ StoresToEmit.push_back(std::make_pair(OutVal, Ptr));
+ }
+
+ // Emit the non-flagged stores from the physregs.
+ SmallVector<SDValue, 8> OutChains;
+ for (unsigned i = 0, e = StoresToEmit.size(); i != e; ++i)
+ OutChains.push_back(DAG.getStore(Chain, getCurDebugLoc(),
+ StoresToEmit[i].first,
+ getValue(StoresToEmit[i].second),
+ StoresToEmit[i].second, 0));
+ if (!OutChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+ &OutChains[0], OutChains.size());
+ DAG.setRoot(Chain);
+}
+
+
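+// In this revision malloc is still an IR instruction (see the FIXME below);
+// e.g. (sketch) "%p = malloc i32, i32 %n" is lowered here to a libc call
+// equivalent to malloc(%n * 4).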
+void SelectionDAGLowering::visitMalloc(MallocInst &I) {
+ SDValue Src = getValue(I.getOperand(0));
+
+  // Scale up by the type size, in the original i32 type width. Various
+  // mid-level optimizers may make assumptions about demanded bits etc. from
+  // the i32-ness of the malloc size: we do not want to promote to i64 and
+  // then multiply on 64-bit targets.
+ // FIXME: Malloc inst should go away: PR715.
+ uint64_t ElementSize = TD->getTypeAllocSize(I.getType()->getElementType());
+ if (ElementSize != 1)
+ Src = DAG.getNode(ISD::MUL, getCurDebugLoc(), Src.getValueType(),
+ Src, DAG.getConstant(ElementSize, Src.getValueType()));
+
+ MVT IntPtr = TLI.getPointerTy();
+
+ if (IntPtr.bitsLT(Src.getValueType()))
+ Src = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), IntPtr, Src);
+ else if (IntPtr.bitsGT(Src.getValueType()))
+ Src = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(), IntPtr, Src);
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Src;
+ Entry.Ty = TLI.getTargetData()->getIntPtrType();
+ Args.push_back(Entry);
+
+ std::pair<SDValue,SDValue> Result =
+ TLI.LowerCallTo(getRoot(), I.getType(), false, false, false, false,
+ CallingConv::C, PerformTailCallOpt,
+ DAG.getExternalSymbol("malloc", IntPtr),
+ Args, DAG, getCurDebugLoc());
+ setValue(&I, Result.first); // Pointers always fit in registers
+ DAG.setRoot(Result.second);
+}
+
+void SelectionDAGLowering::visitFree(FreeInst &I) {
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = getValue(I.getOperand(0));
+ Entry.Ty = TLI.getTargetData()->getIntPtrType();
+ Args.push_back(Entry);
+ MVT IntPtr = TLI.getPointerTy();
+ std::pair<SDValue,SDValue> Result =
+ TLI.LowerCallTo(getRoot(), Type::VoidTy, false, false, false, false,
+ CallingConv::C, PerformTailCallOpt,
+ DAG.getExternalSymbol("free", IntPtr), Args, DAG,
+ getCurDebugLoc());
+ DAG.setRoot(Result.second);
+}
+
+void SelectionDAGLowering::visitVAStart(CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VASTART, getCurDebugLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getOperand(1)),
+ DAG.getSrcValue(I.getOperand(1))));
+}
+
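+// E.g. (sketch): "%v = va_arg i8** %ap, i32" becomes an ISD::VAARG node;
+// result 0 is the loaded value and result 1 is the updated chain, which is
+// installed as the new root below.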
+void SelectionDAGLowering::visitVAArg(VAArgInst &I) {
+ SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurDebugLoc(),
+ getRoot(), getValue(I.getOperand(0)),
+ DAG.getSrcValue(I.getOperand(0)));
+ setValue(&I, V);
+ DAG.setRoot(V.getValue(1));
+}
+
+void SelectionDAGLowering::visitVAEnd(CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VAEND, getCurDebugLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getOperand(1)),
+ DAG.getSrcValue(I.getOperand(1))));
+}
+
+void SelectionDAGLowering::visitVACopy(CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurDebugLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getOperand(1)),
+ getValue(I.getOperand(2)),
+ DAG.getSrcValue(I.getOperand(1)),
+ DAG.getSrcValue(I.getOperand(2))));
+}
+
+/// TargetLowering::LowerArguments - This is the default LowerArguments
+/// implementation, which just inserts a FORMAL_ARGUMENTS node. FIXME: When all
+/// targets are migrated to using FORMAL_ARGUMENTS, this hook should be
+/// integrated into SDISel.
+void TargetLowering::LowerArguments(Function &F, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &ArgValues,
+ DebugLoc dl) {
+ // Add CC# and isVararg as operands to the FORMAL_ARGUMENTS node.
+ SmallVector<SDValue, 3+16> Ops;
+ Ops.push_back(DAG.getRoot());
+ Ops.push_back(DAG.getConstant(F.getCallingConv(), getPointerTy()));
+ Ops.push_back(DAG.getConstant(F.isVarArg(), getPointerTy()));
+
+ // Add one result value for each formal argument.
+ SmallVector<MVT, 16> RetVals;
+ unsigned j = 1;
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+ I != E; ++I, ++j) {
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(*this, I->getType(), ValueVTs);
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ MVT VT = ValueVTs[Value];
+ const Type *ArgTy = VT.getTypeForMVT();
+ ISD::ArgFlagsTy Flags;
+ unsigned OriginalAlignment =
+ getTargetData()->getABITypeAlignment(ArgTy);
+
+ if (F.paramHasAttr(j, Attribute::ZExt))
+ Flags.setZExt();
+ if (F.paramHasAttr(j, Attribute::SExt))
+ Flags.setSExt();
+ if (F.paramHasAttr(j, Attribute::InReg))
+ Flags.setInReg();
+ if (F.paramHasAttr(j, Attribute::StructRet))
+ Flags.setSRet();
+ if (F.paramHasAttr(j, Attribute::ByVal)) {
+ Flags.setByVal();
+ const PointerType *Ty = cast<PointerType>(I->getType());
+ const Type *ElementTy = Ty->getElementType();
+ unsigned FrameAlign = getByValTypeAlignment(ElementTy);
+ unsigned FrameSize = getTargetData()->getTypeAllocSize(ElementTy);
+        // For ByVal, the alignment should be passed from the front end; the
+        // back end will guess if this info is not there, but there are cases
+        // it cannot get right.
+ if (F.getParamAlignment(j))
+ FrameAlign = F.getParamAlignment(j);
+ Flags.setByValAlign(FrameAlign);
+ Flags.setByValSize(FrameSize);
+ }
+ if (F.paramHasAttr(j, Attribute::Nest))
+ Flags.setNest();
+ Flags.setOrigAlign(OriginalAlignment);
+
+ MVT RegisterVT = getRegisterType(VT);
+ unsigned NumRegs = getNumRegisters(VT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ RetVals.push_back(RegisterVT);
+ ISD::ArgFlagsTy MyFlags = Flags;
+ if (NumRegs > 1 && i == 0)
+ MyFlags.setSplit();
+        // If it isn't the first piece, the alignment must be 1.
+ else if (i > 0)
+ MyFlags.setOrigAlign(1);
+ Ops.push_back(DAG.getArgFlags(MyFlags));
+ }
+ }
+ }
+
+ RetVals.push_back(MVT::Other);
+
+ // Create the node.
+ SDNode *Result = DAG.getNode(ISD::FORMAL_ARGUMENTS, dl,
+ DAG.getVTList(&RetVals[0], RetVals.size()),
+ &Ops[0], Ops.size()).getNode();
+
+ // Prelower FORMAL_ARGUMENTS. This isn't required for functionality, but
+ // allows exposing the loads that may be part of the argument access to the
+ // first DAGCombiner pass.
+ SDValue TmpRes = LowerOperation(SDValue(Result, 0), DAG);
+
+ // The number of results should match up, except that the lowered one may have
+ // an extra flag result.
+ assert((Result->getNumValues() == TmpRes.getNode()->getNumValues() ||
+ (Result->getNumValues()+1 == TmpRes.getNode()->getNumValues() &&
+ TmpRes.getValue(Result->getNumValues()).getValueType() == MVT::Flag))
+ && "Lowering produced unexpected number of results!");
+
+ // The FORMAL_ARGUMENTS node itself is likely no longer needed.
+ if (Result != TmpRes.getNode() && Result->use_empty()) {
+ HandleSDNode Dummy(DAG.getRoot());
+ DAG.RemoveDeadNode(Result);
+ }
+
+ Result = TmpRes.getNode();
+
+ unsigned NumArgRegs = Result->getNumValues() - 1;
+ DAG.setRoot(SDValue(Result, NumArgRegs));
+
+ // Set up the return result vector.
+ unsigned i = 0;
+ unsigned Idx = 1;
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
+ ++I, ++Idx) {
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(*this, I->getType(), ValueVTs);
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ MVT VT = ValueVTs[Value];
+ MVT PartVT = getRegisterType(VT);
+
+ unsigned NumParts = getNumRegisters(VT);
+ SmallVector<SDValue, 4> Parts(NumParts);
+ for (unsigned j = 0; j != NumParts; ++j)
+ Parts[j] = SDValue(Result, i++);
+
+ ISD::NodeType AssertOp = ISD::DELETED_NODE;
+ if (F.paramHasAttr(Idx, Attribute::SExt))
+ AssertOp = ISD::AssertSext;
+ else if (F.paramHasAttr(Idx, Attribute::ZExt))
+ AssertOp = ISD::AssertZext;
+
+ ArgValues.push_back(getCopyFromParts(DAG, dl, &Parts[0], NumParts,
+ PartVT, VT, AssertOp));
+ }
+ }
+ assert(i == NumArgRegs && "Argument register count mismatch!");
+}
+
+
+/// TargetLowering::LowerCallTo - This is the default LowerCallTo
+/// implementation, which just inserts an ISD::CALL node, which is later custom
+/// lowered by the target to something concrete. FIXME: When all targets are
+/// migrated to using ISD::CALL, this hook should be integrated into SDISel.
+std::pair<SDValue, SDValue>
+TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
+ bool RetSExt, bool RetZExt, bool isVarArg,
+ bool isInreg,
+ unsigned CallingConv, bool isTailCall,
+ SDValue Callee,
+ ArgListTy &Args, SelectionDAG &DAG, DebugLoc dl) {
+ assert((!isTailCall || PerformTailCallOpt) &&
+ "isTailCall set when tail-call optimizations are disabled!");
+
+ SmallVector<SDValue, 32> Ops;
+ Ops.push_back(Chain); // Op#0 - Chain
+ Ops.push_back(Callee);
+
+ // Handle all of the outgoing arguments.
+ for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(*this, Args[i].Ty, ValueVTs);
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ MVT VT = ValueVTs[Value];
+ const Type *ArgTy = VT.getTypeForMVT();
+ SDValue Op = SDValue(Args[i].Node.getNode(),
+ Args[i].Node.getResNo() + Value);
+ ISD::ArgFlagsTy Flags;
+ unsigned OriginalAlignment =
+ getTargetData()->getABITypeAlignment(ArgTy);
+
+ if (Args[i].isZExt)
+ Flags.setZExt();
+ if (Args[i].isSExt)
+ Flags.setSExt();
+ if (Args[i].isInReg)
+ Flags.setInReg();
+ if (Args[i].isSRet)
+ Flags.setSRet();
+ if (Args[i].isByVal) {
+ Flags.setByVal();
+ const PointerType *Ty = cast<PointerType>(Args[i].Ty);
+ const Type *ElementTy = Ty->getElementType();
+ unsigned FrameAlign = getByValTypeAlignment(ElementTy);
+ unsigned FrameSize = getTargetData()->getTypeAllocSize(ElementTy);
+        // For ByVal, the alignment should come from the front end; the back
+        // end will guess if this info is not there, but there are cases it
+        // cannot get right.
+ if (Args[i].Alignment)
+ FrameAlign = Args[i].Alignment;
+ Flags.setByValAlign(FrameAlign);
+ Flags.setByValSize(FrameSize);
+ }
+ if (Args[i].isNest)
+ Flags.setNest();
+ Flags.setOrigAlign(OriginalAlignment);
+
+ MVT PartVT = getRegisterType(VT);
+ unsigned NumParts = getNumRegisters(VT);
+ SmallVector<SDValue, 4> Parts(NumParts);
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+ if (Args[i].isSExt)
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (Args[i].isZExt)
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ getCopyToParts(DAG, dl, Op, &Parts[0], NumParts, PartVT, ExtendKind);
+
+ for (unsigned i = 0; i != NumParts; ++i) {
+        // If it isn't the first piece, the alignment must be 1.
+ ISD::ArgFlagsTy MyFlags = Flags;
+ if (NumParts > 1 && i == 0)
+ MyFlags.setSplit();
+ else if (i != 0)
+ MyFlags.setOrigAlign(1);
+
+ Ops.push_back(Parts[i]);
+ Ops.push_back(DAG.getArgFlags(MyFlags));
+ }
+ }
+ }
+
+ // Figure out the result value types. We start by making a list of
+ // the potentially illegal return value types.
+ SmallVector<MVT, 4> LoweredRetTys;
+ SmallVector<MVT, 4> RetTys;
+ ComputeValueVTs(*this, RetTy, RetTys);
+
+ // Then we translate that to a list of legal types.
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ MVT VT = RetTys[I];
+ MVT RegisterVT = getRegisterType(VT);
+ unsigned NumRegs = getNumRegisters(VT);
+ for (unsigned i = 0; i != NumRegs; ++i)
+ LoweredRetTys.push_back(RegisterVT);
+ }
+
+ LoweredRetTys.push_back(MVT::Other); // Always has a chain.
+
+ // Create the CALL node.
+ SDValue Res = DAG.getCall(CallingConv, dl,
+ isVarArg, isTailCall, isInreg,
+ DAG.getVTList(&LoweredRetTys[0],
+ LoweredRetTys.size()),
+ &Ops[0], Ops.size()
+ );
+ Chain = Res.getValue(LoweredRetTys.size() - 1);
+
+ // Gather up the call result into a single value.
+ if (RetTy != Type::VoidTy && !RetTys.empty()) {
+ ISD::NodeType AssertOp = ISD::DELETED_NODE;
+
+ if (RetSExt)
+ AssertOp = ISD::AssertSext;
+ else if (RetZExt)
+ AssertOp = ISD::AssertZext;
+
+ SmallVector<SDValue, 4> ReturnValues;
+ unsigned RegNo = 0;
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ MVT VT = RetTys[I];
+ MVT RegisterVT = getRegisterType(VT);
+ unsigned NumRegs = getNumRegisters(VT);
+ unsigned RegNoEnd = NumRegs + RegNo;
+ SmallVector<SDValue, 4> Results;
+ for (; RegNo != RegNoEnd; ++RegNo)
+ Results.push_back(Res.getValue(RegNo));
+ SDValue ReturnValue =
+ getCopyFromParts(DAG, dl, &Results[0], NumRegs, RegisterVT, VT,
+ AssertOp);
+ ReturnValues.push_back(ReturnValue);
+ }
+ Res = DAG.getNode(ISD::MERGE_VALUES, dl,
+ DAG.getVTList(&RetTys[0], RetTys.size()),
+ &ReturnValues[0], ReturnValues.size());
+ }
+
+ return std::make_pair(Res, Chain);
+}
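+
+// The argument loop above consumes TargetLowering::ArgListEntry records.
+// As a minimal sketch (illustrative only; 'ArgValue' and 'V' are
+// placeholders for an already-lowered SDValue and its IR value), a caller
+// would populate one entry per IR-level argument before invoking this
+// routine:
+//
+//   TargetLowering::ArgListTy Args;
+//   TargetLowering::ArgListEntry Entry;
+//   Entry.Node = ArgValue;
+//   Entry.Ty = V->getType();            // IR type of the argument
+//   Entry.isSExt = Entry.isZExt = false;
+//   Args.push_back(Entry);
+//
+// The remaining flags read above (isInReg, isSRet, isByVal, isNest,
+// Alignment) stay in their default zero state unless the call site
+// requires them.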
+
+void TargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+ if (Res.getNode())
+ Results.push_back(Res);
+}
+
+SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ assert(0 && "LowerOperation not implemented for this target!");
+ abort();
+ return SDValue();
+}
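+
+// Targets that set an operation action to Custom are expected to override
+// LowerOperation. A minimal sketch of such an override (hypothetical
+// target; LowerSELECT is an illustrative helper, not a real API):
+//
+//   SDValue MyTargetLowering::LowerOperation(SDValue Op,
+//                                            SelectionDAG &DAG) {
+//     switch (Op.getOpcode()) {
+//     case ISD::SELECT: return LowerSELECT(Op, DAG);
+//     default: assert(0 && "unexpected custom-lowered operation");
+//     }
+//     return SDValue();
+//   }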
+
+
+void SelectionDAGLowering::CopyValueToVirtualRegister(Value *V, unsigned Reg) {
+ SDValue Op = getValue(V);
+ assert((Op.getOpcode() != ISD::CopyFromReg ||
+ cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
+ "Copy from a reg to the same reg!");
+ assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
+
+ RegsForValue RFV(TLI, Reg, V->getType());
+ SDValue Chain = DAG.getEntryNode();
+ RFV.getCopyToRegs(Op, DAG, getCurDebugLoc(), Chain, 0);
+ PendingExports.push_back(Chain);
+}
+
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+void SelectionDAGISel::
+LowerArguments(BasicBlock *LLVMBB) {
+ // If this is the entry block, emit arguments.
+ Function &F = *LLVMBB->getParent();
+ SDValue OldRoot = SDL->DAG.getRoot();
+ SmallVector<SDValue, 16> Args;
+ TLI.LowerArguments(F, SDL->DAG, Args, SDL->getCurDebugLoc());
+
+ unsigned a = 0;
+ for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end();
+ AI != E; ++AI) {
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, AI->getType(), ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (!AI->use_empty()) {
+ SDL->setValue(AI, SDL->DAG.getMergeValues(&Args[a], NumValues,
+ SDL->getCurDebugLoc()));
+      // If this argument is live outside of the entry block, insert a copy
+      // from wherever we got it to the vreg that other BBs will reference
+      // it as.
+ SDL->CopyToExportRegsIfNeeded(AI);
+ }
+ a += NumValues;
+ }
+
+ // Finally, if the target has anything special to do, allow it to do so.
+ // FIXME: this should insert code into the DAG!
+ EmitFunctionEntryCode(F, SDL->DAG.getMachineFunction());
+}
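+
+// Note that 'a' advances by NumValues per IR argument, so aggregate
+// arguments consume several consecutive entries of Args. For example
+// (illustrative), a parameter of type {i32, float} makes ComputeValueVTs
+// return two value types; the getMergeValues call above then consumes
+// Args[a] and Args[a+1], and 'a' advances by 2.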
+
+/// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to
+/// ensure constants are generated when needed. Remember the virtual registers
+/// that need to be added to the Machine PHI nodes as input. We cannot just
+/// directly add them, because expansion might result in multiple MBB's for one
+/// BB. As such, the start of the BB might correspond to a different MBB than
+/// the end.
+///
+void
+SelectionDAGISel::HandlePHINodesInSuccessorBlocks(BasicBlock *LLVMBB) {
+ TerminatorInst *TI = LLVMBB->getTerminator();
+
+ SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+
+ // Check successor nodes' PHI nodes that expect a constant to be available
+ // from this block.
+ for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
+ BasicBlock *SuccBB = TI->getSuccessor(succ);
+ if (!isa<PHINode>(SuccBB->begin())) continue;
+ MachineBasicBlock *SuccMBB = FuncInfo->MBBMap[SuccBB];
+
+ // If this terminator has multiple identical successors (common for
+ // switches), only handle each succ once.
+ if (!SuccsHandled.insert(SuccMBB)) continue;
+
+ MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+ PHINode *PN;
+
+ // At this point we know that there is a 1-1 correspondence between LLVM PHI
+ // nodes and Machine PHI nodes, but the incoming operands have not been
+ // emitted yet.
+ for (BasicBlock::iterator I = SuccBB->begin();
+ (PN = dyn_cast<PHINode>(I)); ++I) {
+      // Ignore dead PHIs.
+ if (PN->use_empty()) continue;
+
+ unsigned Reg;
+ Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
+
+ if (Constant *C = dyn_cast<Constant>(PHIOp)) {
+ unsigned &RegOut = SDL->ConstantsOut[C];
+ if (RegOut == 0) {
+ RegOut = FuncInfo->CreateRegForValue(C);
+ SDL->CopyValueToVirtualRegister(C, RegOut);
+ }
+ Reg = RegOut;
+ } else {
+ Reg = FuncInfo->ValueMap[PHIOp];
+ if (Reg == 0) {
+ assert(isa<AllocaInst>(PHIOp) &&
+ FuncInfo->StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
+ "Didn't codegen value into a register!??");
+ Reg = FuncInfo->CreateRegForValue(PHIOp);
+ SDL->CopyValueToVirtualRegister(PHIOp, Reg);
+ }
+ }
+
+      // Remember that this register needs to be added to the machine PHI
+      // node as the input for this MBB.
+ SmallVector<MVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+ for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
+ MVT VT = ValueVTs[vti];
+ unsigned NumRegisters = TLI.getNumRegisters(VT);
+ for (unsigned i = 0, e = NumRegisters; i != e; ++i)
+ SDL->PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg+i));
+ Reg += NumRegisters;
+ }
+ }
+ }
+ SDL->ConstantsOut.clear();
+}
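+
+// Worked example (illustrative): on a target where i64 is legalized to two
+// i32 registers, one LLVM PHI of type i64 corresponds to two machine PHI
+// nodes. NumRegisters == 2, so the loop above records (MBBI, Reg) and
+// (MBBI+1, Reg+1), advancing MBBI past both machine PHIs.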
+
+/// This is the Fast-ISel version of HandlePHINodesInSuccessorBlocks. It only
+/// supports legal types, and it emits MachineInstrs directly instead of
+/// creating SelectionDAG nodes.
+///
+bool
+SelectionDAGISel::HandlePHINodesInSuccessorBlocksFast(BasicBlock *LLVMBB,
+ FastISel *F) {
+ TerminatorInst *TI = LLVMBB->getTerminator();
+
+ SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+ unsigned OrigNumPHINodesToUpdate = SDL->PHINodesToUpdate.size();
+
+ // Check successor nodes' PHI nodes that expect a constant to be available
+ // from this block.
+ for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
+ BasicBlock *SuccBB = TI->getSuccessor(succ);
+ if (!isa<PHINode>(SuccBB->begin())) continue;
+ MachineBasicBlock *SuccMBB = FuncInfo->MBBMap[SuccBB];
+
+ // If this terminator has multiple identical successors (common for
+ // switches), only handle each succ once.
+ if (!SuccsHandled.insert(SuccMBB)) continue;
+
+ MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+ PHINode *PN;
+
+ // At this point we know that there is a 1-1 correspondence between LLVM PHI
+ // nodes and Machine PHI nodes, but the incoming operands have not been
+ // emitted yet.
+ for (BasicBlock::iterator I = SuccBB->begin();
+ (PN = dyn_cast<PHINode>(I)); ++I) {
+      // Ignore dead PHIs.
+ if (PN->use_empty()) continue;
+
+ // Only handle legal types. Two interesting things to note here. First,
+ // by bailing out early, we may leave behind some dead instructions,
+ // since SelectionDAG's HandlePHINodesInSuccessorBlocks will insert its
+      // own moves. Second, this check is necessary because FastISel doesn't
+ // use CreateRegForValue to create registers, so it always creates
+ // exactly one register for each non-void instruction.
+ MVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true);
+ if (VT == MVT::Other || !TLI.isTypeLegal(VT)) {
+ // Promote MVT::i1.
+ if (VT == MVT::i1)
+ VT = TLI.getTypeToTransformTo(VT);
+ else {
+ SDL->PHINodesToUpdate.resize(OrigNumPHINodesToUpdate);
+ return false;
+ }
+ }
+
+ Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
+
+ unsigned Reg = F->getRegForValue(PHIOp);
+ if (Reg == 0) {
+ SDL->PHINodesToUpdate.resize(OrigNumPHINodesToUpdate);
+ return false;
+ }
+ SDL->PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg));
+ }
+ }
+
+ return true;
+}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h
new file mode 100644
index 0000000..578aa591
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h
@@ -0,0 +1,558 @@
+//===-- SelectionDAGBuild.h - Selection-DAG building ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SELECTIONDAGBUILD_H
+#define SELECTIONDAGBUILD_H
+
+#include "llvm/Constants.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#ifndef NDEBUG
+#include "llvm/ADT/SmallSet.h"
+#endif
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Target/TargetMachine.h"
+#include <vector>
+#include <set>
+
+namespace llvm {
+
+class AliasAnalysis;
+class AllocaInst;
+class BasicBlock;
+class BitCastInst;
+class BranchInst;
+class CallInst;
+class ExtractElementInst;
+class ExtractValueInst;
+class FCmpInst;
+class FPExtInst;
+class FPToSIInst;
+class FPToUIInst;
+class FPTruncInst;
+class FreeInst;
+class Function;
+class GetElementPtrInst;
+class GCFunctionInfo;
+class ICmpInst;
+class IntToPtrInst;
+class InvokeInst;
+class InsertElementInst;
+class InsertValueInst;
+class Instruction;
+class LoadInst;
+class MachineBasicBlock;
+class MachineFunction;
+class MachineInstr;
+class MachineModuleInfo;
+class MachineRegisterInfo;
+class MallocInst;
+class PHINode;
+class PtrToIntInst;
+class ReturnInst;
+class SDISelAsmOperandInfo;
+class SExtInst;
+class SelectInst;
+class ShuffleVectorInst;
+class SIToFPInst;
+class StoreInst;
+class SwitchInst;
+class TargetData;
+class TargetLowering;
+class TruncInst;
+class UIToFPInst;
+class UnreachableInst;
+class UnwindInst;
+class VICmpInst;
+class VFCmpInst;
+class VAArgInst;
+class ZExtInst;
+
+//===--------------------------------------------------------------------===//
+/// FunctionLoweringInfo - This contains information that is global to a
+/// function that is used when lowering a region of the function.
+///
+class FunctionLoweringInfo {
+public:
+ TargetLowering &TLI;
+ Function *Fn;
+ MachineFunction *MF;
+ MachineRegisterInfo *RegInfo;
+
+ explicit FunctionLoweringInfo(TargetLowering &TLI);
+
+ /// set - Initialize this FunctionLoweringInfo with the given Function
+ /// and its associated MachineFunction.
+ ///
+ void set(Function &Fn, MachineFunction &MF, SelectionDAG &DAG,
+ bool EnableFastISel);
+
+ /// MBBMap - A mapping from LLVM basic blocks to their machine code entry.
+ DenseMap<const BasicBlock*, MachineBasicBlock *> MBBMap;
+
+ /// ValueMap - Since we emit code for the function a basic block at a time,
+ /// we must remember which virtual registers hold the values for
+ /// cross-basic-block values.
+ DenseMap<const Value*, unsigned> ValueMap;
+
+ /// StaticAllocaMap - Keep track of frame indices for fixed sized allocas in
+ /// the entry block. This allows the allocas to be efficiently referenced
+ /// anywhere in the function.
+ DenseMap<const AllocaInst*, int> StaticAllocaMap;
+
+#ifndef NDEBUG
+ SmallSet<Instruction*, 8> CatchInfoLost;
+ SmallSet<Instruction*, 8> CatchInfoFound;
+#endif
+
+ unsigned MakeReg(MVT VT);
+
+ /// isExportedInst - Return true if the specified value is an instruction
+ /// exported from its block.
+ bool isExportedInst(const Value *V) {
+ return ValueMap.count(V);
+ }
+
+ unsigned CreateRegForValue(const Value *V);
+
+ unsigned InitializeRegForValue(const Value *V) {
+ unsigned &R = ValueMap[V];
+ assert(R == 0 && "Already initialized this value register!");
+ return R = CreateRegForValue(V);
+ }
+
+ struct LiveOutInfo {
+ unsigned NumSignBits;
+ APInt KnownOne, KnownZero;
+ LiveOutInfo() : NumSignBits(0), KnownOne(1, 0), KnownZero(1, 0) {}
+ };
+
+ /// LiveOutRegInfo - Information about live out vregs, indexed by their
+ /// register number offset by 'FirstVirtualRegister'.
+ std::vector<LiveOutInfo> LiveOutRegInfo;
+
+ /// clear - Clear out all the function-specific state. This returns this
+ /// FunctionLoweringInfo to an empty state, ready to be used for a
+ /// different function.
+ void clear() {
+ MBBMap.clear();
+ ValueMap.clear();
+ StaticAllocaMap.clear();
+#ifndef NDEBUG
+ CatchInfoLost.clear();
+ CatchInfoFound.clear();
+#endif
+ LiveOutRegInfo.clear();
+ }
+};
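+
+// Typical per-function lifetime of a FunctionLoweringInfo, as a sketch of
+// how SelectionDAGISel drives it (the exact call sites live in
+// SelectionDAGISel.cpp):
+//
+//   FunctionLoweringInfo FLI(TLI);
+//   FLI.set(Fn, MF, DAG, EnableFastISel);      // per-function setup
+//   unsigned R = FLI.InitializeRegForValue(V); // first definition of V
+//   ...
+//   FLI.clear();                               // ready for the next function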
+
+//===----------------------------------------------------------------------===//
+/// SelectionDAGLowering - This is the common target-independent lowering
+/// implementation that is parameterized by a TargetLowering object.
+/// Also, targets can override any lowering method.
+///
+class SelectionDAGLowering {
+ MachineBasicBlock *CurMBB;
+
+ /// CurDebugLoc - current file + line number. Changes as we build the DAG.
+ DebugLoc CurDebugLoc;
+
+ DenseMap<const Value*, SDValue> NodeMap;
+
+ /// PendingLoads - Loads are not emitted to the program immediately. We bunch
+ /// them up and then emit token factor nodes when possible. This allows us to
+ /// get simple disambiguation between loads without worrying about alias
+ /// analysis.
+ SmallVector<SDValue, 8> PendingLoads;
+
+ /// PendingExports - CopyToReg nodes that copy values to virtual registers
+ /// for export to other blocks need to be emitted before any terminator
+  /// instruction, but they have no other ordering requirements. We bunch
+  /// them up and then emit a single token factor for them just before
+  /// terminator instructions.
+ SmallVector<SDValue, 8> PendingExports;
+
+ /// Case - A struct to record the Value for a switch case, and the
+ /// case's target basic block.
+ struct Case {
+ Constant* Low;
+ Constant* High;
+ MachineBasicBlock* BB;
+
+ Case() : Low(0), High(0), BB(0) { }
+ Case(Constant* low, Constant* high, MachineBasicBlock* bb) :
+ Low(low), High(high), BB(bb) { }
+ uint64_t size() const {
+ uint64_t rHigh = cast<ConstantInt>(High)->getSExtValue();
+ uint64_t rLow = cast<ConstantInt>(Low)->getSExtValue();
+ return (rHigh - rLow + 1ULL);
+ }
+ };
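+
+  // For example, a Case with Low == 2 and High == 5 covers the values
+  // {2, 3, 4, 5}, so size() returns (5 - 2 + 1) == 4.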
+
+ struct CaseBits {
+ uint64_t Mask;
+ MachineBasicBlock* BB;
+ unsigned Bits;
+
+ CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits):
+ Mask(mask), BB(bb), Bits(bits) { }
+ };
+
+ typedef std::vector<Case> CaseVector;
+ typedef std::vector<CaseBits> CaseBitsVector;
+ typedef CaseVector::iterator CaseItr;
+ typedef std::pair<CaseItr, CaseItr> CaseRange;
+
+ /// CaseRec - A struct with ctor used in lowering switches to a binary tree
+ /// of conditional branches.
+ struct CaseRec {
+ CaseRec(MachineBasicBlock *bb, Constant *lt, Constant *ge, CaseRange r) :
+ CaseBB(bb), LT(lt), GE(ge), Range(r) {}
+
+ /// CaseBB - The MBB in which to emit the compare and branch
+ MachineBasicBlock *CaseBB;
+ /// LT, GE - If nonzero, we know the current case value must be less-than or
+ /// greater-than-or-equal-to these Constants.
+ Constant *LT;
+ Constant *GE;
+ /// Range - A pair of iterators representing the range of case values to be
+ /// processed at this point in the binary search tree.
+ CaseRange Range;
+ };
+
+ typedef std::vector<CaseRec> CaseRecVector;
+
+ /// The comparison function for sorting the switch case values in the vector.
+ /// WARNING: Case ranges should be disjoint!
+ struct CaseCmp {
+ bool operator () (const Case& C1, const Case& C2) {
+ assert(isa<ConstantInt>(C1.Low) && isa<ConstantInt>(C2.High));
+ const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+ const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+ return CI1->getValue().slt(CI2->getValue());
+ }
+ };
+
+ struct CaseBitsCmp {
+ bool operator () (const CaseBits& C1, const CaseBits& C2) {
+ return C1.Bits > C2.Bits;
+ }
+ };
+
+ size_t Clusterify(CaseVector& Cases, const SwitchInst &SI);
+
+ /// CaseBlock - This structure is used to communicate between SDLowering and
+ /// SDISel for the code generation of additional basic blocks needed by multi-
+ /// case switch statements.
+ struct CaseBlock {
+ CaseBlock(ISD::CondCode cc, Value *cmplhs, Value *cmprhs, Value *cmpmiddle,
+ MachineBasicBlock *truebb, MachineBasicBlock *falsebb,
+ MachineBasicBlock *me)
+ : CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs),
+ TrueBB(truebb), FalseBB(falsebb), ThisBB(me) {}
+ // CC - the condition code to use for the case block's setcc node
+ ISD::CondCode CC;
+ // CmpLHS/CmpRHS/CmpMHS - The LHS/MHS/RHS of the comparison to emit.
+ // Emit by default LHS op RHS. MHS is used for range comparisons:
+ // If MHS is not null: (LHS <= MHS) and (MHS <= RHS).
+ Value *CmpLHS, *CmpMHS, *CmpRHS;
+ // TrueBB/FalseBB - the block to branch to if the setcc is true/false.
+ MachineBasicBlock *TrueBB, *FalseBB;
+ // ThisBB - the block into which to emit the code for the setcc and branches
+ MachineBasicBlock *ThisBB;
+ };
+ struct JumpTable {
+ JumpTable(unsigned R, unsigned J, MachineBasicBlock *M,
+ MachineBasicBlock *D): Reg(R), JTI(J), MBB(M), Default(D) {}
+
+    /// Reg - the virtual register containing the index of the jump table
+    /// entry to jump to.
+ unsigned Reg;
+ /// JTI - the JumpTableIndex for this jump table in the function.
+ unsigned JTI;
+ /// MBB - the MBB into which to emit the code for the indirect jump.
+ MachineBasicBlock *MBB;
+    /// Default - the MBB of the default bb, which is a successor of the range
+    /// check MBB. This is used when updating PHI nodes in successors.
+ MachineBasicBlock *Default;
+ };
+ struct JumpTableHeader {
+ JumpTableHeader(APInt F, APInt L, Value* SV, MachineBasicBlock* H,
+ bool E = false):
+ First(F), Last(L), SValue(SV), HeaderBB(H), Emitted(E) {}
+ APInt First;
+ APInt Last;
+ Value *SValue;
+ MachineBasicBlock *HeaderBB;
+ bool Emitted;
+ };
+ typedef std::pair<JumpTableHeader, JumpTable> JumpTableBlock;
+
+ struct BitTestCase {
+ BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr):
+ Mask(M), ThisBB(T), TargetBB(Tr) { }
+ uint64_t Mask;
+ MachineBasicBlock* ThisBB;
+ MachineBasicBlock* TargetBB;
+ };
+
+ typedef SmallVector<BitTestCase, 3> BitTestInfo;
+
+ struct BitTestBlock {
+ BitTestBlock(APInt F, APInt R, Value* SV,
+ unsigned Rg, bool E,
+ MachineBasicBlock* P, MachineBasicBlock* D,
+ const BitTestInfo& C):
+ First(F), Range(R), SValue(SV), Reg(Rg), Emitted(E),
+ Parent(P), Default(D), Cases(C) { }
+ APInt First;
+ APInt Range;
+ Value *SValue;
+ unsigned Reg;
+ bool Emitted;
+ MachineBasicBlock *Parent;
+ MachineBasicBlock *Default;
+ BitTestInfo Cases;
+ };
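+
+  // Illustration: lowering "switch (x) { case 0: case 2: case 4: goto T; }"
+  // with bit tests could emit a single BitTestCase whose Mask is
+  // (1 << 0) | (1 << 2) | (1 << 4) == 0x15 and whose TargetBB is T's MBB;
+  // the header block emits the range check on SValue first.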
+
+public:
+ // TLI - This is information that describes the available target features we
+ // need for lowering. This indicates when operations are unavailable,
+ // implemented with a libcall, etc.
+ TargetLowering &TLI;
+ SelectionDAG &DAG;
+ const TargetData *TD;
+ AliasAnalysis *AA;
+
+ /// SwitchCases - Vector of CaseBlock structures used to communicate
+ /// SwitchInst code generation information.
+ std::vector<CaseBlock> SwitchCases;
+ /// JTCases - Vector of JumpTable structures used to communicate
+ /// SwitchInst code generation information.
+ std::vector<JumpTableBlock> JTCases;
+ /// BitTestCases - Vector of BitTestBlock structures used to communicate
+ /// SwitchInst code generation information.
+ std::vector<BitTestBlock> BitTestCases;
+
+ std::vector<std::pair<MachineInstr*, unsigned> > PHINodesToUpdate;
+
+ // Emit PHI-node-operand constants only once even if used by multiple
+ // PHI nodes.
+ DenseMap<Constant*, unsigned> ConstantsOut;
+
+ /// FuncInfo - Information about the function as a whole.
+ ///
+ FunctionLoweringInfo &FuncInfo;
+
+ /// OptLevel - What optimization level we're generating code for.
+ ///
+ CodeGenOpt::Level OptLevel;
+
+ /// GFI - Garbage collection metadata for the function.
+ GCFunctionInfo *GFI;
+
+ SelectionDAGLowering(SelectionDAG &dag, TargetLowering &tli,
+ FunctionLoweringInfo &funcinfo,
+ CodeGenOpt::Level ol)
+ : CurDebugLoc(DebugLoc::getUnknownLoc()),
+ TLI(tli), DAG(dag), FuncInfo(funcinfo), OptLevel(ol) {
+ }
+
+ void init(GCFunctionInfo *gfi, AliasAnalysis &aa);
+
+  /// clear - Clear out the current SelectionDAG and the associated
+ /// state and prepare this SelectionDAGLowering object to be used
+ /// for a new block. This doesn't clear out information about
+ /// additional blocks that are needed to complete switch lowering
+ /// or PHI node updating; that information is cleared out as it is
+ /// consumed.
+ void clear();
+
+ /// getRoot - Return the current virtual root of the Selection DAG,
+ /// flushing any PendingLoad items. This must be done before emitting
+ /// a store or any other node that may need to be ordered after any
+ /// prior load instructions.
+ ///
+ SDValue getRoot();
+
+ /// getControlRoot - Similar to getRoot, but instead of flushing all the
+ /// PendingLoad items, flush all the PendingExports items. It is necessary
+ /// to do this before emitting a terminator instruction.
+ ///
+ SDValue getControlRoot();
+
+ DebugLoc getCurDebugLoc() const { return CurDebugLoc; }
+ void setCurDebugLoc(DebugLoc dl) { CurDebugLoc = dl; }
+
+ void CopyValueToVirtualRegister(Value *V, unsigned Reg);
+
+ void visit(Instruction &I);
+
+ void visit(unsigned Opcode, User &I);
+
+ void setCurrentBasicBlock(MachineBasicBlock *MBB) { CurMBB = MBB; }
+
+ SDValue getValue(const Value *V);
+
+ void setValue(const Value *V, SDValue NewN) {
+ SDValue &N = NodeMap[V];
+ assert(N.getNode() == 0 && "Already set a value for this node!");
+ N = NewN;
+ }
+
+ void GetRegistersForValue(SDISelAsmOperandInfo &OpInfo,
+ std::set<unsigned> &OutputRegs,
+ std::set<unsigned> &InputRegs);
+
+ void FindMergedConditions(Value *Cond, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
+ unsigned Opc);
+ void EmitBranchForMergedCondition(Value *Cond, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB);
+ bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases);
+ bool isExportableFromCurrentBlock(Value *V, const BasicBlock *FromBB);
+ void CopyToExportRegsIfNeeded(Value *V);
+ void ExportFromCurrentBlock(Value *V);
+ void LowerCallTo(CallSite CS, SDValue Callee, bool IsTailCall,
+ MachineBasicBlock *LandingPad = NULL);
+
+private:
+ // Terminator instructions.
+ void visitRet(ReturnInst &I);
+ void visitBr(BranchInst &I);
+ void visitSwitch(SwitchInst &I);
+ void visitUnreachable(UnreachableInst &I) { /* noop */ }
+
+ // Helpers for visitSwitch
+ bool handleSmallSwitchRange(CaseRec& CR,
+ CaseRecVector& WorkList,
+ Value* SV,
+ MachineBasicBlock* Default);
+ bool handleJTSwitchCase(CaseRec& CR,
+ CaseRecVector& WorkList,
+ Value* SV,
+ MachineBasicBlock* Default);
+ bool handleBTSplitSwitchCase(CaseRec& CR,
+ CaseRecVector& WorkList,
+ Value* SV,
+ MachineBasicBlock* Default);
+ bool handleBitTestsSwitchCase(CaseRec& CR,
+ CaseRecVector& WorkList,
+ Value* SV,
+ MachineBasicBlock* Default);
+public:
+ void visitSwitchCase(CaseBlock &CB);
+ void visitBitTestHeader(BitTestBlock &B);
+ void visitBitTestCase(MachineBasicBlock* NextMBB,
+ unsigned Reg,
+ BitTestCase &B);
+ void visitJumpTable(JumpTable &JT);
+ void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH);
+
+private:
+ // These all get lowered before this pass.
+ void visitInvoke(InvokeInst &I);
+ void visitUnwind(UnwindInst &I);
+
+ void visitBinary(User &I, unsigned OpCode);
+ void visitShift(User &I, unsigned Opcode);
+ void visitAdd(User &I);
+ void visitSub(User &I);
+ void visitMul(User &I);
+ void visitURem(User &I) { visitBinary(I, ISD::UREM); }
+ void visitSRem(User &I) { visitBinary(I, ISD::SREM); }
+ void visitFRem(User &I) { visitBinary(I, ISD::FREM); }
+ void visitUDiv(User &I) { visitBinary(I, ISD::UDIV); }
+ void visitSDiv(User &I) { visitBinary(I, ISD::SDIV); }
+ void visitFDiv(User &I) { visitBinary(I, ISD::FDIV); }
+ void visitAnd (User &I) { visitBinary(I, ISD::AND); }
+ void visitOr (User &I) { visitBinary(I, ISD::OR); }
+ void visitXor (User &I) { visitBinary(I, ISD::XOR); }
+ void visitShl (User &I) { visitShift(I, ISD::SHL); }
+ void visitLShr(User &I) { visitShift(I, ISD::SRL); }
+ void visitAShr(User &I) { visitShift(I, ISD::SRA); }
+ void visitICmp(User &I);
+ void visitFCmp(User &I);
+ void visitVICmp(User &I);
+ void visitVFCmp(User &I);
+ // Visit the conversion instructions
+ void visitTrunc(User &I);
+ void visitZExt(User &I);
+ void visitSExt(User &I);
+ void visitFPTrunc(User &I);
+ void visitFPExt(User &I);
+ void visitFPToUI(User &I);
+ void visitFPToSI(User &I);
+ void visitUIToFP(User &I);
+ void visitSIToFP(User &I);
+ void visitPtrToInt(User &I);
+ void visitIntToPtr(User &I);
+ void visitBitCast(User &I);
+
+ void visitExtractElement(User &I);
+ void visitInsertElement(User &I);
+ void visitShuffleVector(User &I);
+
+ void visitExtractValue(ExtractValueInst &I);
+ void visitInsertValue(InsertValueInst &I);
+
+ void visitGetElementPtr(User &I);
+ void visitSelect(User &I);
+
+ void visitMalloc(MallocInst &I);
+ void visitFree(FreeInst &I);
+ void visitAlloca(AllocaInst &I);
+ void visitLoad(LoadInst &I);
+ void visitStore(StoreInst &I);
+ void visitPHI(PHINode &I) { } // PHI nodes are handled specially.
+ void visitCall(CallInst &I);
+ void visitInlineAsm(CallSite CS);
+ const char *visitIntrinsicCall(CallInst &I, unsigned Intrinsic);
+ void visitTargetIntrinsic(CallInst &I, unsigned Intrinsic);
+
+ void visitPow(CallInst &I);
+ void visitExp2(CallInst &I);
+ void visitExp(CallInst &I);
+ void visitLog(CallInst &I);
+ void visitLog2(CallInst &I);
+ void visitLog10(CallInst &I);
+
+ void visitVAStart(CallInst &I);
+ void visitVAArg(VAArgInst &I);
+ void visitVAEnd(CallInst &I);
+ void visitVACopy(CallInst &I);
+
+ void visitUserOp1(Instruction &I) {
+ assert(0 && "UserOp1 should not exist at instruction selection time!");
+ abort();
+ }
+ void visitUserOp2(Instruction &I) {
+ assert(0 && "UserOp2 should not exist at instruction selection time!");
+ abort();
+ }
+
+ const char *implVisitBinaryAtomic(CallInst& I, ISD::NodeType Op);
+ const char *implVisitAluOverflow(CallInst &I, ISD::NodeType Op);
+};
+
+/// AddCatchInfo - Extract the personality and type infos from an eh.selector
+/// call, and add them to the specified machine basic block.
+void AddCatchInfo(CallInst &I, MachineModuleInfo *MMI,
+ MachineBasicBlock *MBB);
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
new file mode 100644
index 0000000..9d72a12
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -0,0 +1,1347 @@
+//===-- SelectionDAGISel.cpp - Implement the SelectionDAGISel class -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAGISel class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "isel"
+#include "ScheduleDAGSDNodes.h"
+#include "SelectionDAGBuild.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Constants.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Timer.h"
+#include <algorithm>
+using namespace llvm;
+
+static cl::opt<bool>
+DisableLegalizeTypes("disable-legalize-types", cl::Hidden);
+static cl::opt<bool>
+EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
+ cl::desc("Enable verbose messages in the \"fast\" "
+ "instruction selector"));
+static cl::opt<bool>
+EnableFastISelAbort("fast-isel-abort", cl::Hidden,
+ cl::desc("Enable abort calls when \"fast\" instruction fails"));
+static cl::opt<bool>
+SchedLiveInCopies("schedule-livein-copies",
+ cl::desc("Schedule copies of livein registers"),
+ cl::init(false));
+
+#ifndef NDEBUG
+static cl::opt<bool>
+ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the first "
+ "dag combine pass"));
+static cl::opt<bool>
+ViewLegalizeTypesDAGs("view-legalize-types-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before legalize types"));
+static cl::opt<bool>
+ViewLegalizeDAGs("view-legalize-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before legalize"));
+static cl::opt<bool>
+ViewDAGCombine2("view-dag-combine2-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the second "
+ "dag combine pass"));
+static cl::opt<bool>
+ViewDAGCombineLT("view-dag-combine-lt-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the post legalize types"
+ " dag combine pass"));
+static cl::opt<bool>
+ViewISelDAGs("view-isel-dags", cl::Hidden,
+ cl::desc("Pop up a window to show isel dags as they are selected"));
+static cl::opt<bool>
+ViewSchedDAGs("view-sched-dags", cl::Hidden,
+ cl::desc("Pop up a window to show sched dags as they are processed"));
+static cl::opt<bool>
+ViewSUnitDAGs("view-sunit-dags", cl::Hidden,
+ cl::desc("Pop up a window to show SUnit dags after they are processed"));
+#else
+static const bool ViewDAGCombine1 = false,
+ ViewLegalizeTypesDAGs = false, ViewLegalizeDAGs = false,
+ ViewDAGCombine2 = false,
+ ViewDAGCombineLT = false,
+ ViewISelDAGs = false, ViewSchedDAGs = false,
+ ViewSUnitDAGs = false;
+#endif
+
+//===---------------------------------------------------------------------===//
+///
+/// RegisterScheduler class - Track the registration of instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+MachinePassRegistry RegisterScheduler::Registry;
+
+//===---------------------------------------------------------------------===//
+///
+/// ISHeuristic command line option for instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+static cl::opt<RegisterScheduler::FunctionPassCtor, false,
+ RegisterPassParser<RegisterScheduler> >
+ISHeuristic("pre-RA-sched",
+ cl::init(&createDefaultScheduler),
+ cl::desc("Instruction schedulers available (before register"
+ " allocation):"));
+
+static RegisterScheduler
+defaultListDAGScheduler("default", "Best scheduler for the target",
+ createDefaultScheduler);
+
+namespace llvm {
+ //===--------------------------------------------------------------------===//
+ /// createDefaultScheduler - This creates an instruction scheduler appropriate
+ /// for the target.
+ ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetLowering &TLI = IS->getTargetLowering();
+
+ if (OptLevel == CodeGenOpt::None)
+ return createFastDAGScheduler(IS, OptLevel);
+ if (TLI.getSchedulingPreference() == TargetLowering::SchedulingForLatency)
+ return createTDListDAGScheduler(IS, OptLevel);
+ assert(TLI.getSchedulingPreference() ==
+ TargetLowering::SchedulingForRegPressure && "Unknown sched type!");
+ return createBURRListDAGScheduler(IS, OptLevel);
+ }
+}
+
+// EmitInstrWithCustomInserter - This method should be implemented by targets
+// that mark instructions with the 'usesCustomDAGSchedInserter' flag. These
+// instructions are special in various ways, which require special support to
+// insert. The specified MachineInstr is created but not inserted into any
+// basic blocks, and the scheduler passes ownership of it to this method.
+MachineBasicBlock *TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ cerr << "If a target marks an instruction with "
+ << "'usesCustomDAGSchedInserter', it must implement "
+ << "TargetLowering::EmitInstrWithCustomInserter!\n";
+ abort();
+ return 0;
+}
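+
+// A target override typically expands the pseudo instruction into a small
+// control-flow diamond. Hedged sketch (hypothetical target; the body is
+// elided):
+//
+//   MachineBasicBlock *
+//   MyTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+//                                                 MachineBasicBlock *BB) const {
+//     // Split BB at MI, emit the compare-and-branch, add a PHI in the
+//     // join block, erase MI, and return the block where the scheduler
+//     // should resume emitting code.
+//     ...
+//   }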
+
+/// EmitLiveInCopy - Emit a copy for a live-in physical register. If the
+/// physical register has only a single copy use, then coalesce the copy
+/// if possible.
+static void EmitLiveInCopy(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &InsertPos,
+ unsigned VirtReg, unsigned PhysReg,
+ const TargetRegisterClass *RC,
+ DenseMap<MachineInstr*, unsigned> &CopyRegMap,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII) {
+ unsigned NumUses = 0;
+ MachineInstr *UseMI = NULL;
+ for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(VirtReg),
+ UE = MRI.use_end(); UI != UE; ++UI) {
+ UseMI = &*UI;
+ if (++NumUses > 1)
+ break;
+ }
+
+ // If the number of uses is not one, or the use is not a move instruction,
+ // don't coalesce. Also, only coalesce away a virtual register to virtual
+ // register copy.
+ bool Coalesced = false;
+ unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+ if (NumUses == 1 &&
+ TII.isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubReg, DstSubReg) &&
+ TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ VirtReg = DstReg;
+ Coalesced = true;
+ }
+
+ // Now find an ideal location to insert the copy.
+ MachineBasicBlock::iterator Pos = InsertPos;
+ while (Pos != MBB->begin()) {
+ MachineInstr *PrevMI = prior(Pos);
+ DenseMap<MachineInstr*, unsigned>::iterator RI = CopyRegMap.find(PrevMI);
+ // copyRegToReg might emit multiple instructions to do a copy.
+ unsigned CopyDstReg = (RI == CopyRegMap.end()) ? 0 : RI->second;
+ if (CopyDstReg && !TRI.regsOverlap(CopyDstReg, PhysReg))
+ // This is what the BB looks like right now:
+ // r1024 = mov r0
+ // ...
+ // r1 = mov r1024
+ //
+ // We want to insert "r1025 = mov r1". Inserting this copy below the
+ // move to r1024 makes it impossible for that move to be coalesced.
+ //
+ // r1025 = mov r1
+ // r1024 = mov r0
+ // ...
+      //   r1 = mov r1024
+      //   r2 = mov r1025
+ break; // Woot! Found a good location.
+ --Pos;
+ }
+
+ TII.copyRegToReg(*MBB, Pos, VirtReg, PhysReg, RC, RC);
+ CopyRegMap.insert(std::make_pair(prior(Pos), VirtReg));
+ if (Coalesced) {
+ if (&*InsertPos == UseMI) ++InsertPos;
+ MBB->erase(UseMI);
+ }
+}
+
+/// EmitLiveInCopies - If this is the first basic block in the function,
+/// and if it has live ins that need to be copied into vregs, emit the
+/// copies into the block.
+static void EmitLiveInCopies(MachineBasicBlock *EntryMBB,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII) {
+ if (SchedLiveInCopies) {
+ // Emit the copies at a heuristically-determined location in the block.
+ DenseMap<MachineInstr*, unsigned> CopyRegMap;
+ MachineBasicBlock::iterator InsertPos = EntryMBB->begin();
+ for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+ E = MRI.livein_end(); LI != E; ++LI)
+ if (LI->second) {
+ const TargetRegisterClass *RC = MRI.getRegClass(LI->second);
+ EmitLiveInCopy(EntryMBB, InsertPos, LI->second, LI->first,
+ RC, CopyRegMap, MRI, TRI, TII);
+ }
+ } else {
+ // Emit the copies into the top of the block.
+ for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+ E = MRI.livein_end(); LI != E; ++LI)
+ if (LI->second) {
+ const TargetRegisterClass *RC = MRI.getRegClass(LI->second);
+ TII.copyRegToReg(*EntryMBB, EntryMBB->begin(),
+ LI->second, LI->first, RC, RC);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAGISel code
+//===----------------------------------------------------------------------===//
+
+SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, CodeGenOpt::Level OL) :
+ FunctionPass(&ID), TM(tm), TLI(*tm.getTargetLowering()),
+ FuncInfo(new FunctionLoweringInfo(TLI)),
+ CurDAG(new SelectionDAG(TLI, *FuncInfo)),
+ SDL(new SelectionDAGLowering(*CurDAG, TLI, *FuncInfo, OL)),
+ GFI(),
+ OptLevel(OL),
+ DAGSize(0)
+{}
+
+SelectionDAGISel::~SelectionDAGISel() {
+ delete SDL;
+ delete CurDAG;
+ delete FuncInfo;
+}
+
+unsigned SelectionDAGISel::MakeReg(MVT VT) {
+ return RegInfo->createVirtualRegister(TLI.getRegClassFor(VT));
+}
+
+void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<GCModuleInfo>();
+ AU.addRequired<DwarfWriter>();
+ AU.setPreservesAll();
+}
+
+bool SelectionDAGISel::runOnFunction(Function &Fn) {
+ // Do some sanity-checking on the command-line options.
+ assert((!EnableFastISelVerbose || EnableFastISel) &&
+ "-fast-isel-verbose requires -fast-isel");
+ assert((!EnableFastISelAbort || EnableFastISel) &&
+ "-fast-isel-abort requires -fast-isel");
+
+  // Do not codegen any 'available_externally' functions at all; they have
+  // definitions outside the translation unit.
+ if (Fn.hasAvailableExternallyLinkage())
+ return false;
+
+ // Get alias analysis for load/store combining.
+ AA = &getAnalysis<AliasAnalysis>();
+
+ TargetMachine &TM = TLI.getTargetMachine();
+ MF = &MachineFunction::construct(&Fn, TM);
+ const TargetInstrInfo &TII = *TM.getInstrInfo();
+ const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+
+ if (MF->getFunction()->hasGC())
+ GFI = &getAnalysis<GCModuleInfo>().getFunctionInfo(*MF->getFunction());
+ else
+ GFI = 0;
+ RegInfo = &MF->getRegInfo();
+ DOUT << "\n\n\n=== " << Fn.getName() << "\n";
+
+ MachineModuleInfo *MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ DwarfWriter *DW = getAnalysisIfAvailable<DwarfWriter>();
+ CurDAG->init(*MF, MMI, DW);
+ FuncInfo->set(Fn, *MF, *CurDAG, EnableFastISel);
+ SDL->init(GFI, *AA);
+
+ for (Function::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I)
+ if (InvokeInst *Invoke = dyn_cast<InvokeInst>(I->getTerminator()))
+ // Mark landing pad.
+ FuncInfo->MBBMap[Invoke->getSuccessor(1)]->setIsLandingPad();
+
+ SelectAllBasicBlocks(Fn, *MF, MMI, DW, TII);
+
+ // If the first basic block in the function has live ins that need to be
+ // copied into vregs, emit the copies into the top of the block before
+ // emitting the code for the block.
+ EmitLiveInCopies(MF->begin(), *RegInfo, TRI, TII);
+
+ // Add function live-ins to entry block live-in set.
+ for (MachineRegisterInfo::livein_iterator I = RegInfo->livein_begin(),
+ E = RegInfo->livein_end(); I != E; ++I)
+ MF->begin()->addLiveIn(I->first);
+
+#ifndef NDEBUG
+ assert(FuncInfo->CatchInfoFound.size() == FuncInfo->CatchInfoLost.size() &&
+ "Not all catch info was assigned to a landing pad!");
+#endif
+
+ FuncInfo->clear();
+
+ return true;
+}
+
+static void copyCatchInfo(BasicBlock *SrcBB, BasicBlock *DestBB,
+ MachineModuleInfo *MMI, FunctionLoweringInfo &FLI) {
+ for (BasicBlock::iterator I = SrcBB->begin(), E = --SrcBB->end(); I != E; ++I)
+ if (EHSelectorInst *EHSel = dyn_cast<EHSelectorInst>(I)) {
+ // Apply the catch info to DestBB.
+ AddCatchInfo(*EHSel, MMI, FLI.MBBMap[DestBB]);
+#ifndef NDEBUG
+ if (!FLI.MBBMap[SrcBB]->isLandingPad())
+ FLI.CatchInfoFound.insert(EHSel);
+#endif
+ }
+}
+
+/// IsFixedFrameObjectWithPosOffset - Check whether the object is a fixed
+/// frame object with an offset >= 0.
+static bool
+IsFixedFrameObjectWithPosOffset(MachineFrameInfo *MFI, SDValue Op) {
+ if (!isa<FrameIndexSDNode>(Op)) return false;
+
+ FrameIndexSDNode * FrameIdxNode = dyn_cast<FrameIndexSDNode>(Op);
+ int FrameIdx = FrameIdxNode->getIndex();
+ return MFI->isFixedObjectIndex(FrameIdx) &&
+ MFI->getObjectOffset(FrameIdx) >= 0;
+}
+
+/// IsPossiblyOverwrittenArgumentOfTailCall - Check if the operand could
+/// possibly be overwritten when lowering the outgoing arguments in a tail
+/// call. Currently the implementation of this check is very conservative
+/// and assumes that all arguments sourced from FORMAL_ARGUMENTS or from a
+/// CopyFromReg of a virtual register would be overwritten by direct lowering.
+static bool IsPossiblyOverwrittenArgumentOfTailCall(SDValue Op,
+ MachineFrameInfo *MFI) {
+ RegisterSDNode * OpReg = NULL;
+ if (Op.getOpcode() == ISD::FORMAL_ARGUMENTS ||
+ (Op.getOpcode()== ISD::CopyFromReg &&
+ (OpReg = dyn_cast<RegisterSDNode>(Op.getOperand(1))) &&
+ (OpReg->getReg() >= TargetRegisterInfo::FirstVirtualRegister)) ||
+ (Op.getOpcode() == ISD::LOAD &&
+ IsFixedFrameObjectWithPosOffset(MFI, Op.getOperand(1))) ||
+ (Op.getOpcode() == ISD::MERGE_VALUES &&
+ Op.getOperand(Op.getResNo()).getOpcode() == ISD::LOAD &&
+ IsFixedFrameObjectWithPosOffset(MFI, Op.getOperand(Op.getResNo()).
+ getOperand(1))))
+ return true;
+ return false;
+}
+
+/// CheckDAGForTailCallsAndFixThem - This function looks for CALL nodes in
+/// the DAG and fixes their tail call attribute operands.
+static void CheckDAGForTailCallsAndFixThem(SelectionDAG &DAG,
+ const TargetLowering& TLI) {
+ SDNode * Ret = NULL;
+ SDValue Terminator = DAG.getRoot();
+
+ // Find RET node.
+ if (Terminator.getOpcode() == ISD::RET) {
+ Ret = Terminator.getNode();
+ }
+
+ // Fix tail call attribute of CALL nodes.
+ for (SelectionDAG::allnodes_iterator BE = DAG.allnodes_begin(),
+ BI = DAG.allnodes_end(); BI != BE; ) {
+ --BI;
+ if (CallSDNode *TheCall = dyn_cast<CallSDNode>(BI)) {
+ SDValue OpRet(Ret, 0);
+ SDValue OpCall(BI, 0);
+ bool isMarkedTailCall = TheCall->isTailCall();
+      // If the CALL node has its tail call attribute set to true but the
+      // call is not eligible (there is no RET, or the target rejects it),
+      // the attribute is fixed to false. The
+      // TargetLowering::IsEligibleForTailCallOptimization function must
+      // correctly identify tail call optimizable calls.
+ if (!isMarkedTailCall) continue;
+ if (Ret==NULL ||
+ !TLI.IsEligibleForTailCallOptimization(TheCall, OpRet, DAG)) {
+ // Not eligible. Mark CALL node as non tail call. Note that we
+ // can modify the call node in place since calls are not CSE'd.
+ TheCall->setNotTailCall();
+ } else {
+ // Look for tail call clobbered arguments. Emit a series of
+ // copyto/copyfrom virtual register nodes to protect them.
+ SmallVector<SDValue, 32> Ops;
+ SDValue Chain = TheCall->getChain(), InFlag;
+ Ops.push_back(Chain);
+ Ops.push_back(TheCall->getCallee());
+ for (unsigned i = 0, e = TheCall->getNumArgs(); i != e; ++i) {
+ SDValue Arg = TheCall->getArg(i);
+ bool isByVal = TheCall->getArgFlags(i).isByVal();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ if (!isByVal &&
+ IsPossiblyOverwrittenArgumentOfTailCall(Arg, MFI)) {
+ MVT VT = Arg.getValueType();
+ unsigned VReg = MF.getRegInfo().
+ createVirtualRegister(TLI.getRegClassFor(VT));
+ Chain = DAG.getCopyToReg(Chain, Arg.getDebugLoc(),
+ VReg, Arg, InFlag);
+ InFlag = Chain.getValue(1);
+ Arg = DAG.getCopyFromReg(Chain, Arg.getDebugLoc(),
+ VReg, VT, InFlag);
+ Chain = Arg.getValue(1);
+ InFlag = Arg.getValue(2);
+ }
+ Ops.push_back(Arg);
+ Ops.push_back(TheCall->getArgFlagsVal(i));
+ }
+ // Link in chain of CopyTo/CopyFromReg.
+ Ops[0] = Chain;
+ DAG.UpdateNodeOperands(OpCall, Ops.begin(), Ops.size());
+ }
+ }
+ }
+}
+
+void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB,
+ BasicBlock::iterator Begin,
+ BasicBlock::iterator End) {
+ SDL->setCurrentBasicBlock(BB);
+
+ // Lower all of the non-terminator instructions.
+ for (BasicBlock::iterator I = Begin; I != End; ++I)
+ if (!isa<TerminatorInst>(I))
+ SDL->visit(*I);
+
+ // Ensure that all instructions which are used outside of their defining
+ // blocks are available as virtual registers. Invoke is handled elsewhere.
+ for (BasicBlock::iterator I = Begin; I != End; ++I)
+ if (!isa<PHINode>(I) && !isa<InvokeInst>(I))
+ SDL->CopyToExportRegsIfNeeded(I);
+
+ // Handle PHI nodes in successor blocks.
+ if (End == LLVMBB->end()) {
+ HandlePHINodesInSuccessorBlocks(LLVMBB);
+
+ // Lower the terminator after the copies are emitted.
+ SDL->visit(*LLVMBB->getTerminator());
+ }
+
+ // Make sure the root of the DAG is up-to-date.
+ CurDAG->setRoot(SDL->getControlRoot());
+
+  // Check whether calls in this block are real tail calls. Fix up CALL
+  // nodes with the correct tailcall attribute so that the target can rely
+  // on that attribute to tell whether a call is really eligible for tail
+  // call optimization.
+ if (PerformTailCallOpt)
+ CheckDAGForTailCallsAndFixThem(*CurDAG, TLI);
+
+ // Final step, emit the lowered DAG as machine code.
+ CodeGenAndEmitDAG();
+ SDL->clear();
+}
+
+void SelectionDAGISel::ComputeLiveOutVRegInfo() {
+ SmallPtrSet<SDNode*, 128> VisitedNodes;
+ SmallVector<SDNode*, 128> Worklist;
+
+ Worklist.push_back(CurDAG->getRoot().getNode());
+
+ APInt Mask;
+ APInt KnownZero;
+ APInt KnownOne;
+
+ while (!Worklist.empty()) {
+ SDNode *N = Worklist.back();
+ Worklist.pop_back();
+
+ // If we've already seen this node, ignore it.
+ if (!VisitedNodes.insert(N))
+ continue;
+
+ // Otherwise, add all chain operands to the worklist.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ if (N->getOperand(i).getValueType() == MVT::Other)
+ Worklist.push_back(N->getOperand(i).getNode());
+
+ // If this is a CopyToReg with a vreg dest, process it.
+ if (N->getOpcode() != ISD::CopyToReg)
+ continue;
+
+ unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DestReg))
+ continue;
+
+ // Ignore non-scalar or non-integer values.
+ SDValue Src = N->getOperand(2);
+ MVT SrcVT = Src.getValueType();
+ if (!SrcVT.isInteger() || SrcVT.isVector())
+ continue;
+
+ unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src);
+ Mask = APInt::getAllOnesValue(SrcVT.getSizeInBits());
+ CurDAG->ComputeMaskedBits(Src, Mask, KnownZero, KnownOne);
+
+ // Only install this information if it tells us something.
+ if (NumSignBits != 1 || KnownZero != 0 || KnownOne != 0) {
+ DestReg -= TargetRegisterInfo::FirstVirtualRegister;
+ FunctionLoweringInfo &FLI = CurDAG->getFunctionLoweringInfo();
+ if (DestReg >= FLI.LiveOutRegInfo.size())
+ FLI.LiveOutRegInfo.resize(DestReg+1);
+ FunctionLoweringInfo::LiveOutInfo &LOI = FLI.LiveOutRegInfo[DestReg];
+ LOI.NumSignBits = NumSignBits;
+ LOI.KnownOne = KnownOne;
+ LOI.KnownZero = KnownZero;
+ }
+ }
+}
+
+void SelectionDAGISel::CodeGenAndEmitDAG() {
+ std::string GroupName;
+ if (TimePassesIsEnabled)
+ GroupName = "Instruction Selection and Scheduling";
+ std::string BlockName;
+ if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
+ ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs ||
+ ViewSUnitDAGs)
+ BlockName = CurDAG->getMachineFunction().getFunction()->getName() + ':' +
+ BB->getBasicBlock()->getName();
+
+ DOUT << "Initial selection DAG:\n";
+ DEBUG(CurDAG->dump());
+
+ if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName);
+
+ // Run the DAG combiner in pre-legalize mode.
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("DAG Combining 1", GroupName);
+ CurDAG->Combine(Unrestricted, *AA, OptLevel);
+ } else {
+ CurDAG->Combine(Unrestricted, *AA, OptLevel);
+ }
+
+ DOUT << "Optimized lowered selection DAG:\n";
+ DEBUG(CurDAG->dump());
+
+ // Second step, hack on the DAG until it only uses operations and types that
+ // the target supports.
+ if (!DisableLegalizeTypes) {
+ if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
+ BlockName);
+
+ bool Changed;
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("Type Legalization", GroupName);
+ Changed = CurDAG->LegalizeTypes();
+ } else {
+ Changed = CurDAG->LegalizeTypes();
+ }
+
+ DOUT << "Type-legalized selection DAG:\n";
+ DEBUG(CurDAG->dump());
+
+ if (Changed) {
+ if (ViewDAGCombineLT)
+ CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
+
+ // Run the DAG combiner in post-type-legalize mode.
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("DAG Combining after legalize types", GroupName);
+ CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
+ } else {
+ CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
+ }
+
+ DOUT << "Optimized type-legalized selection DAG:\n";
+ DEBUG(CurDAG->dump());
+ }
+
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("Vector Legalization", GroupName);
+ Changed = CurDAG->LegalizeVectors();
+ } else {
+ Changed = CurDAG->LegalizeVectors();
+ }
+
+ if (Changed) {
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("Type Legalization 2", GroupName);
+ Changed = CurDAG->LegalizeTypes();
+ } else {
+ Changed = CurDAG->LegalizeTypes();
+ }
+
+ if (ViewDAGCombineLT)
+ CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
+
+ // Run the DAG combiner in post-type-legalize mode.
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("DAG Combining after legalize vectors", GroupName);
+ CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
+ } else {
+ CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
+ }
+
+ DOUT << "Optimized vector-legalized selection DAG:\n";
+ DEBUG(CurDAG->dump());
+ }
+ }
+
+ if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName);
+
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("DAG Legalization", GroupName);
+ CurDAG->Legalize(DisableLegalizeTypes, OptLevel);
+ } else {
+ CurDAG->Legalize(DisableLegalizeTypes, OptLevel);
+ }
+
+ DOUT << "Legalized selection DAG:\n";
+ DEBUG(CurDAG->dump());
+
+ if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName);
+
+ // Run the DAG combiner in post-legalize mode.
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("DAG Combining 2", GroupName);
+ CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
+ } else {
+ CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
+ }
+
+ DOUT << "Optimized legalized selection DAG:\n";
+ DEBUG(CurDAG->dump());
+
+ if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName);
+
+ if (OptLevel != CodeGenOpt::None)
+ ComputeLiveOutVRegInfo();
+
+ // Third, instruction select all of the operations to machine code, adding the
+ // code to the MachineBasicBlock.
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("Instruction Selection", GroupName);
+ InstructionSelect();
+ } else {
+ InstructionSelect();
+ }
+
+ DOUT << "Selected selection DAG:\n";
+ DEBUG(CurDAG->dump());
+
+ if (ViewSchedDAGs) CurDAG->viewGraph("scheduler input for " + BlockName);
+
+ // Schedule machine code.
+ ScheduleDAGSDNodes *Scheduler = CreateScheduler();
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("Instruction Scheduling", GroupName);
+ Scheduler->Run(CurDAG, BB, BB->end());
+ } else {
+ Scheduler->Run(CurDAG, BB, BB->end());
+ }
+
+ if (ViewSUnitDAGs) Scheduler->viewGraph();
+
+ // Emit machine code to BB. This can change 'BB' to the last block being
+ // inserted into.
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("Instruction Creation", GroupName);
+ BB = Scheduler->EmitSchedule();
+ } else {
+ BB = Scheduler->EmitSchedule();
+ }
+
+ // Free the scheduler state.
+ if (TimePassesIsEnabled) {
+ NamedRegionTimer T("Instruction Scheduling Cleanup", GroupName);
+ delete Scheduler;
+ } else {
+ delete Scheduler;
+ }
+
+ DOUT << "Selected machine code:\n";
+ DEBUG(BB->dump());
+}
+
+void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
+ MachineFunction &MF,
+ MachineModuleInfo *MMI,
+ DwarfWriter *DW,
+ const TargetInstrInfo &TII) {
+ // Initialize the Fast-ISel state, if needed.
+ FastISel *FastIS = 0;
+ if (EnableFastISel)
+ FastIS = TLI.createFastISel(MF, MMI, DW,
+ FuncInfo->ValueMap,
+ FuncInfo->MBBMap,
+ FuncInfo->StaticAllocaMap
+#ifndef NDEBUG
+ , FuncInfo->CatchInfoLost
+#endif
+ );
+
+ // Iterate over all basic blocks in the function.
+ for (Function::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
+ BasicBlock *LLVMBB = &*I;
+ BB = FuncInfo->MBBMap[LLVMBB];
+
+ BasicBlock::iterator const Begin = LLVMBB->begin();
+ BasicBlock::iterator const End = LLVMBB->end();
+ BasicBlock::iterator BI = Begin;
+
+ // Lower any arguments needed in this block if this is the entry block.
+ bool SuppressFastISel = false;
+ if (LLVMBB == &Fn.getEntryBlock()) {
+ LowerArguments(LLVMBB);
+
+ // If any of the arguments has the byval attribute, forgo
+ // fast-isel in the entry block.
+ if (FastIS) {
+ unsigned j = 1;
+ for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end();
+ I != E; ++I, ++j)
+ if (Fn.paramHasAttr(j, Attribute::ByVal)) {
+ if (EnableFastISelVerbose || EnableFastISelAbort)
+ cerr << "FastISel skips entry block due to byval argument\n";
+ SuppressFastISel = true;
+ break;
+ }
+ }
+ }
+
+ if (MMI && BB->isLandingPad()) {
+ // Add a label to mark the beginning of the landing pad. Deletion of the
+ // landing pad can thus be detected via the MachineModuleInfo.
+ unsigned LabelID = MMI->addLandingPad(BB);
+
+ const TargetInstrDesc &II = TII.get(TargetInstrInfo::EH_LABEL);
+ BuildMI(BB, SDL->getCurDebugLoc(), II).addImm(LabelID);
+
+ // Mark exception register as live in.
+ unsigned Reg = TLI.getExceptionAddressRegister();
+ if (Reg) BB->addLiveIn(Reg);
+
+ // Mark exception selector register as live in.
+ Reg = TLI.getExceptionSelectorRegister();
+ if (Reg) BB->addLiveIn(Reg);
+
+ // FIXME: Hack around an exception handling flaw (PR1508): the personality
+ // function and list of typeids logically belong to the invoke (or, if you
+ // like, the basic block containing the invoke), and need to be associated
+ // with it in the dwarf exception handling tables. Currently however the
+ // information is provided by an intrinsic (eh.selector) that can be moved
+ // to unexpected places by the optimizers: if the unwind edge is critical,
+ // then breaking it can result in the intrinsics being in the successor of
+ // the landing pad, not the landing pad itself. This results in exceptions
+ // not being caught because no typeids are associated with the invoke.
+      // This may not be the only way things can go wrong, but it is the
+      // only one we try to work around for the moment.
+ BranchInst *Br = dyn_cast<BranchInst>(LLVMBB->getTerminator());
+
+ if (Br && Br->isUnconditional()) { // Critical edge?
+ BasicBlock::iterator I, E;
+ for (I = LLVMBB->begin(), E = --LLVMBB->end(); I != E; ++I)
+ if (isa<EHSelectorInst>(I))
+ break;
+
+ if (I == E)
+ // No catch info found - try to extract some from the successor.
+ copyCatchInfo(Br->getSuccessor(0), LLVMBB, MMI, *FuncInfo);
+ }
+ }
+
+ // Before doing SelectionDAG ISel, see if FastISel has been requested.
+ if (FastIS && !SuppressFastISel) {
+ // Emit code for any incoming arguments. This must happen before
+ // beginning FastISel on the entry block.
+ if (LLVMBB == &Fn.getEntryBlock()) {
+ CurDAG->setRoot(SDL->getControlRoot());
+ CodeGenAndEmitDAG();
+ SDL->clear();
+ }
+ FastIS->startNewBlock(BB);
+ // Do FastISel on as many instructions as possible.
+ for (; BI != End; ++BI) {
+ // Just before the terminator instruction, insert instructions to
+ // feed PHI nodes in successor blocks.
+ if (isa<TerminatorInst>(BI))
+ if (!HandlePHINodesInSuccessorBlocksFast(LLVMBB, FastIS)) {
+ if (EnableFastISelVerbose || EnableFastISelAbort) {
+ cerr << "FastISel miss: ";
+ BI->dump();
+ }
+ if (EnableFastISelAbort)
+ assert(0 && "FastISel didn't handle a PHI in a successor");
+ break;
+ }
+
+ // First try normal tablegen-generated "fast" selection.
+ if (FastIS->SelectInstruction(BI))
+ continue;
+
+ // Next, try calling the target to attempt to handle the instruction.
+ if (FastIS->TargetSelectInstruction(BI))
+ continue;
+
+ // Then handle certain instructions as single-LLVM-Instruction blocks.
+ if (isa<CallInst>(BI)) {
+ if (EnableFastISelVerbose || EnableFastISelAbort) {
+ cerr << "FastISel missed call: ";
+ BI->dump();
+ }
+
+ if (BI->getType() != Type::VoidTy) {
+ unsigned &R = FuncInfo->ValueMap[BI];
+ if (!R)
+ R = FuncInfo->CreateRegForValue(BI);
+ }
+
+ SDL->setCurDebugLoc(FastIS->getCurDebugLoc());
+ SelectBasicBlock(LLVMBB, BI, next(BI));
+ // If the instruction was codegen'd with multiple blocks,
+ // inform the FastISel object where to resume inserting.
+ FastIS->setCurrentBlock(BB);
+ continue;
+ }
+
+ // Otherwise, give up on FastISel for the rest of the block.
+ // For now, be a little lenient about non-branch terminators.
+ if (!isa<TerminatorInst>(BI) || isa<BranchInst>(BI)) {
+ if (EnableFastISelVerbose || EnableFastISelAbort) {
+ cerr << "FastISel miss: ";
+ BI->dump();
+ }
+ if (EnableFastISelAbort)
+ // The "fast" selector couldn't handle something and bailed.
+ // For the purpose of debugging, just abort.
+ assert(0 && "FastISel didn't select the entire block");
+ }
+ break;
+ }
+ }
+
+ // Run SelectionDAG instruction selection on the remainder of the block
+ // not handled by FastISel. If FastISel is not run, this is the entire
+ // block.
+ if (BI != End) {
+ // If FastISel is run and it has known DebugLoc then use it.
+ if (FastIS && !FastIS->getCurDebugLoc().isUnknown())
+ SDL->setCurDebugLoc(FastIS->getCurDebugLoc());
+ SelectBasicBlock(LLVMBB, BI, End);
+ }
+
+ FinishBasicBlock();
+ }
+
+ delete FastIS;
+}
+
+void
+SelectionDAGISel::FinishBasicBlock() {
+
+ DOUT << "Target-post-processed machine code:\n";
+ DEBUG(BB->dump());
+
+  DOUT << "Total number of PHI nodes to update: "
+ << SDL->PHINodesToUpdate.size() << "\n";
+ DEBUG(for (unsigned i = 0, e = SDL->PHINodesToUpdate.size(); i != e; ++i)
+ DOUT << "Node " << i << " : (" << SDL->PHINodesToUpdate[i].first
+ << ", " << SDL->PHINodesToUpdate[i].second << ")\n";);
+
+  // Next, now that we know which MBB the LLVM BB was last expanded into,
+  // update the PHI nodes in the successors.
+ if (SDL->SwitchCases.empty() &&
+ SDL->JTCases.empty() &&
+ SDL->BitTestCases.empty()) {
+ for (unsigned i = 0, e = SDL->PHINodesToUpdate.size(); i != e; ++i) {
+ MachineInstr *PHI = SDL->PHINodesToUpdate[i].first;
+ assert(PHI->getOpcode() == TargetInstrInfo::PHI &&
+ "This is not a machine PHI node that we are updating!");
+ PHI->addOperand(MachineOperand::CreateReg(SDL->PHINodesToUpdate[i].second,
+ false));
+ PHI->addOperand(MachineOperand::CreateMBB(BB));
+ }
+ SDL->PHINodesToUpdate.clear();
+ return;
+ }
+
+ for (unsigned i = 0, e = SDL->BitTestCases.size(); i != e; ++i) {
+    // Lower the header first, if it wasn't already lowered.
+ if (!SDL->BitTestCases[i].Emitted) {
+      // Set the current basic block to the MBB we wish to insert the code into.
+ BB = SDL->BitTestCases[i].Parent;
+ SDL->setCurrentBasicBlock(BB);
+ // Emit the code
+ SDL->visitBitTestHeader(SDL->BitTestCases[i]);
+ CurDAG->setRoot(SDL->getRoot());
+ CodeGenAndEmitDAG();
+ SDL->clear();
+ }
+
+ for (unsigned j = 0, ej = SDL->BitTestCases[i].Cases.size(); j != ej; ++j) {
+      // Set the current basic block to the MBB we wish to insert the code into.
+ BB = SDL->BitTestCases[i].Cases[j].ThisBB;
+ SDL->setCurrentBasicBlock(BB);
+ // Emit the code
+ if (j+1 != ej)
+ SDL->visitBitTestCase(SDL->BitTestCases[i].Cases[j+1].ThisBB,
+ SDL->BitTestCases[i].Reg,
+ SDL->BitTestCases[i].Cases[j]);
+ else
+ SDL->visitBitTestCase(SDL->BitTestCases[i].Default,
+ SDL->BitTestCases[i].Reg,
+ SDL->BitTestCases[i].Cases[j]);
+
+ CurDAG->setRoot(SDL->getRoot());
+ CodeGenAndEmitDAG();
+ SDL->clear();
+ }
+
+ // Update PHI Nodes
+ for (unsigned pi = 0, pe = SDL->PHINodesToUpdate.size(); pi != pe; ++pi) {
+ MachineInstr *PHI = SDL->PHINodesToUpdate[pi].first;
+ MachineBasicBlock *PHIBB = PHI->getParent();
+ assert(PHI->getOpcode() == TargetInstrInfo::PHI &&
+ "This is not a machine PHI node that we are updating!");
+      // This is the "default" BB. We have two jumps to it: from the "header"
+      // BB and from the last "case" BB.
+ if (PHIBB == SDL->BitTestCases[i].Default) {
+ PHI->addOperand(MachineOperand::CreateReg(SDL->PHINodesToUpdate[pi].second,
+ false));
+ PHI->addOperand(MachineOperand::CreateMBB(SDL->BitTestCases[i].Parent));
+ PHI->addOperand(MachineOperand::CreateReg(SDL->PHINodesToUpdate[pi].second,
+ false));
+ PHI->addOperand(MachineOperand::CreateMBB(SDL->BitTestCases[i].Cases.
+ back().ThisBB));
+ }
+      // One of the "cases" BBs.
+ for (unsigned j = 0, ej = SDL->BitTestCases[i].Cases.size();
+ j != ej; ++j) {
+ MachineBasicBlock* cBB = SDL->BitTestCases[i].Cases[j].ThisBB;
+ if (cBB->succ_end() !=
+ std::find(cBB->succ_begin(),cBB->succ_end(), PHIBB)) {
+ PHI->addOperand(MachineOperand::CreateReg(SDL->PHINodesToUpdate[pi].second,
+ false));
+ PHI->addOperand(MachineOperand::CreateMBB(cBB));
+ }
+ }
+ }
+ }
+ SDL->BitTestCases.clear();
+
+ // If the JumpTable record is filled in, then we need to emit a jump table.
+ // Updating the PHI nodes is tricky in this case, since we need to determine
+  // whether the PHI is a successor of the range check MBB or the jump table MBB.
+ for (unsigned i = 0, e = SDL->JTCases.size(); i != e; ++i) {
+    // Lower the header first, if it wasn't already lowered.
+ if (!SDL->JTCases[i].first.Emitted) {
+      // Set the current basic block to the MBB we wish to insert the code into.
+ BB = SDL->JTCases[i].first.HeaderBB;
+ SDL->setCurrentBasicBlock(BB);
+ // Emit the code
+ SDL->visitJumpTableHeader(SDL->JTCases[i].second, SDL->JTCases[i].first);
+ CurDAG->setRoot(SDL->getRoot());
+ CodeGenAndEmitDAG();
+ SDL->clear();
+ }
+
+    // Set the current basic block to the MBB we wish to insert the code into.
+ BB = SDL->JTCases[i].second.MBB;
+ SDL->setCurrentBasicBlock(BB);
+ // Emit the code
+ SDL->visitJumpTable(SDL->JTCases[i].second);
+ CurDAG->setRoot(SDL->getRoot());
+ CodeGenAndEmitDAG();
+ SDL->clear();
+
+ // Update PHI Nodes
+ for (unsigned pi = 0, pe = SDL->PHINodesToUpdate.size(); pi != pe; ++pi) {
+ MachineInstr *PHI = SDL->PHINodesToUpdate[pi].first;
+ MachineBasicBlock *PHIBB = PHI->getParent();
+ assert(PHI->getOpcode() == TargetInstrInfo::PHI &&
+ "This is not a machine PHI node that we are updating!");
+      // The "default" BB; we can only reach it from the header BB.
+ if (PHIBB == SDL->JTCases[i].second.Default) {
+ PHI->addOperand(MachineOperand::CreateReg(SDL->PHINodesToUpdate[pi].second,
+ false));
+ PHI->addOperand(MachineOperand::CreateMBB(SDL->JTCases[i].first.HeaderBB));
+ }
+      // The jump table BB. Just iterate over its successors here.
+ if (BB->succ_end() != std::find(BB->succ_begin(),BB->succ_end(), PHIBB)) {
+ PHI->addOperand(MachineOperand::CreateReg(SDL->PHINodesToUpdate[pi].second,
+ false));
+ PHI->addOperand(MachineOperand::CreateMBB(BB));
+ }
+ }
+ }
+ SDL->JTCases.clear();
+
+ // If the switch block involved a branch to one of the actual successors, we
+ // need to update PHI nodes in that block.
+ for (unsigned i = 0, e = SDL->PHINodesToUpdate.size(); i != e; ++i) {
+ MachineInstr *PHI = SDL->PHINodesToUpdate[i].first;
+ assert(PHI->getOpcode() == TargetInstrInfo::PHI &&
+ "This is not a machine PHI node that we are updating!");
+ if (BB->isSuccessor(PHI->getParent())) {
+ PHI->addOperand(MachineOperand::CreateReg(SDL->PHINodesToUpdate[i].second,
+ false));
+ PHI->addOperand(MachineOperand::CreateMBB(BB));
+ }
+ }
+
+ // If we generated any switch lowering information, build and codegen any
+ // additional DAGs necessary.
+ for (unsigned i = 0, e = SDL->SwitchCases.size(); i != e; ++i) {
+    // Set the current basic block to the MBB we wish to insert the code into.
+ BB = SDL->SwitchCases[i].ThisBB;
+ SDL->setCurrentBasicBlock(BB);
+
+ // Emit the code
+ SDL->visitSwitchCase(SDL->SwitchCases[i]);
+ CurDAG->setRoot(SDL->getRoot());
+ CodeGenAndEmitDAG();
+ SDL->clear();
+
+ // Handle any PHI nodes in successors of this chunk, as if we were coming
+ // from the original BB before switch expansion. Note that PHI nodes can
+ // occur multiple times in PHINodesToUpdate. We have to be very careful to
+ // handle them the right number of times.
+ while ((BB = SDL->SwitchCases[i].TrueBB)) { // Handle LHS and RHS.
+ for (MachineBasicBlock::iterator Phi = BB->begin();
+ Phi != BB->end() && Phi->getOpcode() == TargetInstrInfo::PHI; ++Phi){
+        // The value for this PHI node is recorded in PHINodesToUpdate; find it.
+ for (unsigned pn = 0; ; ++pn) {
+ assert(pn != SDL->PHINodesToUpdate.size() &&
+ "Didn't find PHI entry!");
+ if (SDL->PHINodesToUpdate[pn].first == Phi) {
+ Phi->addOperand(MachineOperand::CreateReg(SDL->PHINodesToUpdate[pn].
+ second, false));
+ Phi->addOperand(MachineOperand::CreateMBB(SDL->SwitchCases[i].ThisBB));
+ break;
+ }
+ }
+ }
+
+ // Don't process RHS if same block as LHS.
+ if (BB == SDL->SwitchCases[i].FalseBB)
+ SDL->SwitchCases[i].FalseBB = 0;
+
+ // If we haven't handled the RHS, do so now. Otherwise, we're done.
+ SDL->SwitchCases[i].TrueBB = SDL->SwitchCases[i].FalseBB;
+ SDL->SwitchCases[i].FalseBB = 0;
+ }
+ assert(SDL->SwitchCases[i].TrueBB == 0 && SDL->SwitchCases[i].FalseBB == 0);
+ }
+ SDL->SwitchCases.clear();
+
+ SDL->PHINodesToUpdate.clear();
+}
+
+
+/// Create the scheduler. If a specific scheduler was specified
+/// via the SchedulerRegistry, use it; otherwise select the
+/// one preferred by the target.
+///
+ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() {
+ RegisterScheduler::FunctionPassCtor Ctor = RegisterScheduler::getDefault();
+
+ if (!Ctor) {
+ Ctor = ISHeuristic;
+ RegisterScheduler::setDefault(Ctor);
+ }
+
+ return Ctor(this, OptLevel);
+}
+
+ScheduleHazardRecognizer *SelectionDAGISel::CreateTargetHazardRecognizer() {
+ return new ScheduleHazardRecognizer();
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions used by the generated instruction selector.
+//===----------------------------------------------------------------------===//
+// Calls to these methods are generated by tblgen.
+
+/// CheckAndMask - The isel is trying to match something like (and X, 255). If
+/// the dag combiner simplified the 255, we still want to match. RHS is the
+/// actual value in the DAG on the RHS of an AND, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
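+/// For example (an illustrative case, not taken from a particular target): if
+/// the pattern wants (and X, 255) but the DAG contains (and X, 254) because
+/// bit 0 of X is known zero, the pattern should still match, since the masked
+/// result is the same.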
+bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS,
+ int64_t DesiredMaskS) const {
+ const APInt &ActualMask = RHS->getAPIntValue();
+ const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+ // If the actual mask exactly matches, success!
+ if (ActualMask == DesiredMask)
+ return true;
+
+  // If the actual AND mask allows disallowed bits, this doesn't match.
+ if (ActualMask.intersects(~DesiredMask))
+ return false;
+
+ // Otherwise, the DAG Combiner may have proven that the value coming in is
+ // either already zero or is not demanded. Check for known zero input bits.
+ APInt NeededMask = DesiredMask & ~ActualMask;
+ if (CurDAG->MaskedValueIsZero(LHS, NeededMask))
+ return true;
+
+ // TODO: check to see if missing bits are just not demanded.
+
+ // Otherwise, this pattern doesn't match.
+ return false;
+}
+
+/// CheckOrMask - The isel is trying to match something like (or X, 255). If
+/// the dag combiner simplified the 255, we still want to match. RHS is the
+/// actual value in the DAG on the RHS of an OR, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
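+/// For example (an illustrative case): if the pattern wants (or X, 255) but
+/// the DAG contains (or X, 128) because bits 0-6 of X are already known to be
+/// one, the pattern should still match, since the resulting value is the same.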
+bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS,
+ int64_t DesiredMaskS) const {
+ const APInt &ActualMask = RHS->getAPIntValue();
+ const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+ // If the actual mask exactly matches, success!
+ if (ActualMask == DesiredMask)
+ return true;
+
+  // If the actual OR mask sets disallowed bits, this doesn't match.
+ if (ActualMask.intersects(~DesiredMask))
+ return false;
+
+ // Otherwise, the DAG Combiner may have proven that the value coming in is
+ // either already zero or is not demanded. Check for known zero input bits.
+ APInt NeededMask = DesiredMask & ~ActualMask;
+
+ APInt KnownZero, KnownOne;
+ CurDAG->ComputeMaskedBits(LHS, NeededMask, KnownZero, KnownOne);
+
+ // If all the missing bits in the or are already known to be set, match!
+ if ((NeededMask & KnownOne) == NeededMask)
+ return true;
+
+ // TODO: check to see if missing bits are just not demanded.
+
+ // Otherwise, this pattern doesn't match.
+ return false;
+}
+
+
+/// SelectInlineAsmMemoryOperands - Calls to this are automatically generated
+/// by tblgen. Others should not call it.
+void SelectionDAGISel::
+SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops) {
+ std::vector<SDValue> InOps;
+ std::swap(InOps, Ops);
+
+ Ops.push_back(InOps[0]); // input chain.
+ Ops.push_back(InOps[1]); // input asm string.
+
+ unsigned i = 2, e = InOps.size();
+ if (InOps[e-1].getValueType() == MVT::Flag)
+ --e; // Don't process a flag operand if it is here.
+
+ while (i != e) {
+ unsigned Flags = cast<ConstantSDNode>(InOps[i])->getZExtValue();
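+    // The low 3 bits of the flag word encode the operand kind (4 == MEM); the
+    // remaining bits hold the number of register operands that follow, which
+    // is how the non-memory case below knows how many operands to copy.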
+ if ((Flags & 7) != 4 /*MEM*/) {
+ // Just skip over this operand, copying the operands verbatim.
+ Ops.insert(Ops.end(), InOps.begin()+i,
+ InOps.begin()+i+InlineAsm::getNumOperandRegisters(Flags) + 1);
+ i += InlineAsm::getNumOperandRegisters(Flags) + 1;
+ } else {
+ assert(InlineAsm::getNumOperandRegisters(Flags) == 1 &&
+ "Memory operand with multiple values?");
+ // Otherwise, this is a memory operand. Ask the target to select it.
+ std::vector<SDValue> SelOps;
+ if (SelectInlineAsmMemoryOperand(InOps[i+1], 'm', SelOps)) {
+ cerr << "Could not match memory address. Inline asm failure!\n";
+ exit(1);
+ }
+
+ // Add this to the output node.
+ MVT IntPtrTy = CurDAG->getTargetLoweringInfo().getPointerTy();
+ Ops.push_back(CurDAG->getTargetConstant(4/*MEM*/ | (SelOps.size()<< 3),
+ IntPtrTy));
+ Ops.insert(Ops.end(), SelOps.begin(), SelOps.end());
+ i += 2;
+ }
+ }
+
+ // Add the flag input back if present.
+ if (e != InOps.size())
+ Ops.push_back(InOps.back());
+}
+
+/// findFlagUse - Return the user of the MVT::Flag value produced by the
+/// specified SDNode, or NULL if there is none.
+///
+static SDNode *findFlagUse(SDNode *N) {
+ unsigned FlagResNo = N->getNumValues()-1;
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDUse &Use = I.getUse();
+ if (Use.getResNo() == FlagResNo)
+ return Use.getUser();
+ }
+ return NULL;
+}
+
+/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
+/// This function recursively traverses up the operand chain, ignoring
+/// certain nodes.
+static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+ SDNode *Root,
+ SmallPtrSet<SDNode*, 16> &Visited) {
+ if (Use->getNodeId() < Def->getNodeId() ||
+ !Visited.insert(Use))
+ return false;
+
+ for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
+ SDNode *N = Use->getOperand(i).getNode();
+ if (N == Def) {
+ if (Use == ImmedUse || Use == Root)
+ continue; // We are not looking for immediate use.
+ assert(N != Root);
+ return true;
+ }
+
+ // Traverse up the operand chain.
+ if (findNonImmUse(N, Def, ImmedUse, Root, Visited))
+ return true;
+ }
+ return false;
+}
+
+/// isNonImmUse - Start searching from Root up the DAG to check if Def can
+/// be reached. Return true if that's the case. However, ignore direct uses
+/// by ImmedUse (which would be U in the example illustrated in
+/// IsLegalAndProfitableToFold) and by Root (which can happen in the store
+/// case).
+/// FIXME: to be really generic, we should allow direct use by any node
+/// that is being folded. But realistically, since we only fold loads which
+/// have one non-chain use, we only need to watch out for load/op/store
+/// and load/op/cmp case where the root (store / cmp) may reach the load via
+/// its chain operand.
+static inline bool isNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse) {
+ SmallPtrSet<SDNode*, 16> Visited;
+ return findNonImmUse(Root, Def, ImmedUse, Root, Visited);
+}
+
+/// IsLegalAndProfitableToFold - Returns true if the specific operand node N of
+/// U can be folded during the instruction selection that starts at Root, and
+/// folding N is profitable.
+bool SelectionDAGISel::IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
+ SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+  // If the Root use can somehow reach N through a path that doesn't contain
+  // U, then folding N would create a cycle. E.g., in the following
+  // diagram, Root can reach N through X. If N is folded into Root, then
+ // X is both a predecessor and a successor of U.
+ //
+ // [N*] //
+ // ^ ^ //
+ // / \ //
+ // [U*] [X]? //
+ // ^ ^ //
+ // \ / //
+ // \ / //
+ // [Root*] //
+ //
+ // * indicates nodes to be folded together.
+ //
+ // If Root produces a flag, then it gets (even more) interesting. Since it
+ // will be "glued" together with its flag use in the scheduler, we need to
+ // check if it might reach N.
+ //
+ // [N*] //
+ // ^ ^ //
+ // / \ //
+ // [U*] [X]? //
+ // ^ ^ //
+ // \ \ //
+ // \ | //
+ // [Root*] | //
+ // ^ | //
+ // f | //
+ // | / //
+ // [Y] / //
+ // ^ / //
+ // f / //
+ // | / //
+ // [FU] //
+ //
+ // If FU (flag use) indirectly reaches N (the load), and Root folds N
+ // (call it Fold), then X is a predecessor of FU and a successor of
+ // Fold. But since Fold and FU are flagged together, this will create
+ // a cycle in the scheduling graph.
+
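+  // Walk up the flag chain first: whenever Root produces a flag, its flag
+  // user will be glued to it by the scheduler, so treat that user as the
+  // effective Root for the reachability check below.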
+ MVT VT = Root->getValueType(Root->getNumValues()-1);
+ while (VT == MVT::Flag) {
+ SDNode *FU = findFlagUse(Root);
+ if (FU == NULL)
+ break;
+ Root = FU;
+ VT = Root->getValueType(Root->getNumValues()-1);
+ }
+
+ return !isNonImmUse(Root, N, U);
+}
+
+
+char SelectionDAGISel::ID = 0;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
new file mode 100644
index 0000000..3eec684
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -0,0 +1,416 @@
+//===-- SelectionDAGPrinter.cpp - Implement SelectionDAG::viewGraph() -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+#include <fstream>
+using namespace llvm;
+
+namespace llvm {
+ template<>
+ struct DOTGraphTraits<SelectionDAG*> : public DefaultDOTGraphTraits {
+ static bool hasEdgeDestLabels() {
+ return true;
+ }
+
+ static unsigned numEdgeDestLabels(const void *Node) {
+ return ((const SDNode *) Node)->getNumValues();
+ }
+
+ static std::string getEdgeDestLabel(const void *Node, unsigned i) {
+ return ((const SDNode *) Node)->getValueType(i).getMVTString();
+ }
+
+ /// edgeTargetsEdgeSource - This method returns true if this outgoing edge
+ /// should actually target another edge source, not a node. If this method is
+ /// implemented, getEdgeTarget should be implemented.
+ template<typename EdgeIter>
+ static bool edgeTargetsEdgeSource(const void *Node, EdgeIter I) {
+ return true;
+ }
+
+ /// getEdgeTarget - If edgeTargetsEdgeSource returns true, this method is
+ /// called to determine which outgoing edge of Node is the target of this
+ /// edge.
+ template<typename EdgeIter>
+ static EdgeIter getEdgeTarget(const void *Node, EdgeIter I) {
+ SDNode *TargetNode = *I;
+ SDNodeIterator NI = SDNodeIterator::begin(TargetNode);
+ std::advance(NI, I.getNode()->getOperand(I.getOperand()).getResNo());
+ return NI;
+ }
+
+ static std::string getGraphName(const SelectionDAG *G) {
+ return G->getMachineFunction().getFunction()->getName();
+ }
+
+ static bool renderGraphFromBottomUp() {
+ return true;
+ }
+
+ static bool hasNodeAddressLabel(const SDNode *Node,
+ const SelectionDAG *Graph) {
+ return true;
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ template<typename EdgeIter>
+ static std::string getEdgeAttributes(const void *Node, EdgeIter EI) {
+ SDValue Op = EI.getNode()->getOperand(EI.getOperand());
+ MVT VT = Op.getValueType();
+ if (VT == MVT::Flag)
+ return "color=red,style=bold";
+ else if (VT == MVT::Other)
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+
+ static std::string getNodeLabel(const SDNode *Node,
+ const SelectionDAG *Graph);
+ static std::string getNodeAttributes(const SDNode *N,
+ const SelectionDAG *Graph) {
+#ifndef NDEBUG
+ const std::string &Attrs = Graph->getGraphAttrs(N);
+ if (!Attrs.empty()) {
+ if (Attrs.find("shape=") == std::string::npos)
+ return std::string("shape=Mrecord,") + Attrs;
+ else
+ return Attrs;
+ }
+#endif
+ return "shape=Mrecord";
+ }
+
+ static void addCustomGraphFeatures(SelectionDAG *G,
+ GraphWriter<SelectionDAG*> &GW) {
+ GW.emitSimpleNode(0, "plaintext=circle", "GraphRoot");
+ if (G->getRoot().getNode())
+ GW.emitEdge(0, -1, G->getRoot().getNode(), G->getRoot().getResNo(),
+ "color=blue,style=dashed");
+ }
+ };
+}
+
+std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
+ const SelectionDAG *G) {
+ std::string Op = Node->getOperationName(G);
+
+ if (const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Node)) {
+ Op += ": " + utostr(CSDN->getZExtValue());
+ } else if (const ConstantFPSDNode *CSDN = dyn_cast<ConstantFPSDNode>(Node)) {
+ Op += ": " + ftostr(CSDN->getValueAPF());
+ } else if (const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(Node)) {
+ Op += ": " + GADN->getGlobal()->getName();
+ if (int64_t Offset = GADN->getOffset()) {
+ if (Offset > 0)
+ Op += "+" + itostr(Offset);
+ else
+ Op += itostr(Offset);
+ }
+ } else if (const FrameIndexSDNode *FIDN = dyn_cast<FrameIndexSDNode>(Node)) {
+ Op += " " + itostr(FIDN->getIndex());
+ } else if (const JumpTableSDNode *JTDN = dyn_cast<JumpTableSDNode>(Node)) {
+ Op += " " + itostr(JTDN->getIndex());
+ } else if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Node)){
+ if (CP->isMachineConstantPoolEntry()) {
+ Op += '<';
+ {
+ raw_string_ostream OSS(Op);
+ OSS << *CP->getMachineCPVal();
+ }
+ Op += '>';
+ } else {
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+ Op += "<" + ftostr(CFP->getValueAPF()) + ">";
+ else if (ConstantInt *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
+ Op += "<" + utostr(CI->getZExtValue()) + ">";
+ else {
+ Op += '<';
+ {
+ raw_string_ostream OSS(Op);
+ WriteAsOperand(OSS, CP->getConstVal(), false);
+ }
+ Op += '>';
+ }
+ }
+ Op += " A=" + itostr(CP->getAlignment());
+ } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(Node)) {
+ Op = "BB: ";
+ const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock();
+ if (LBB)
+ Op += LBB->getName();
+ //Op += " " + (const void*)BBDN->getBasicBlock();
+ } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node)) {
+ if (G && R->getReg() != 0 &&
+ TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+ Op = Op + " " +
+ G->getTarget().getRegisterInfo()->getName(R->getReg());
+ } else {
+ Op += " #" + utostr(R->getReg());
+ }
+ } else if (const DbgStopPointSDNode *D = dyn_cast<DbgStopPointSDNode>(Node)) {
+ DICompileUnit CU(cast<GlobalVariable>(D->getCompileUnit()));
+ std::string FN;
+ Op += ": " + CU.getFilename(FN);
+ Op += ":" + utostr(D->getLine());
+ if (D->getColumn() != 0)
+ Op += ":" + utostr(D->getColumn());
+ } else if (const LabelSDNode *L = dyn_cast<LabelSDNode>(Node)) {
+ Op += ": LabelID=" + utostr(L->getLabelID());
+ } else if (const CallSDNode *C = dyn_cast<CallSDNode>(Node)) {
+ Op += ": CallingConv=" + utostr(C->getCallingConv());
+ if (C->isVarArg())
+ Op += ", isVarArg";
+ if (C->isTailCall())
+ Op += ", isTailCall";
+ } else if (const ExternalSymbolSDNode *ES =
+ dyn_cast<ExternalSymbolSDNode>(Node)) {
+ Op += "'" + std::string(ES->getSymbol()) + "'";
+ } else if (const SrcValueSDNode *M = dyn_cast<SrcValueSDNode>(Node)) {
+ if (M->getValue())
+ Op += "<" + M->getValue()->getName() + ">";
+ else
+ Op += "<null>";
+ } else if (const MemOperandSDNode *M = dyn_cast<MemOperandSDNode>(Node)) {
+ const Value *V = M->MO.getValue();
+ Op += '<';
+ if (!V) {
+ Op += "(unknown)";
+ } else if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+ // PseudoSourceValues don't have names, so use their print method.
+ raw_string_ostream OSS(Op);
+ PSV->print(OSS);
+ } else {
+ Op += V->getName();
+ }
+ Op += '+' + itostr(M->MO.getOffset()) + '>';
+ } else if (const ARG_FLAGSSDNode *N = dyn_cast<ARG_FLAGSSDNode>(Node)) {
+ Op = Op + " AF=" + N->getArgFlags().getArgFlagsString();
+ } else if (const VTSDNode *N = dyn_cast<VTSDNode>(Node)) {
+ Op = Op + " VT=" + N->getVT().getMVTString();
+ } else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node)) {
+ bool doExt = true;
+ switch (LD->getExtensionType()) {
+ default: doExt = false; break;
+ case ISD::EXTLOAD:
+ Op = Op + "<anyext ";
+ break;
+ case ISD::SEXTLOAD:
+ Op = Op + " <sext ";
+ break;
+ case ISD::ZEXTLOAD:
+ Op = Op + " <zext ";
+ break;
+ }
+ if (doExt)
+ Op += LD->getMemoryVT().getMVTString() + ">";
+ if (LD->isVolatile())
+ Op += "<V>";
+ Op += LD->getIndexedModeName(LD->getAddressingMode());
+ if (LD->getAlignment() > 1)
+ Op += " A=" + utostr(LD->getAlignment());
+ } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(Node)) {
+ if (ST->isTruncatingStore())
+ Op += "<trunc " + ST->getMemoryVT().getMVTString() + ">";
+ if (ST->isVolatile())
+ Op += "<V>";
+ Op += ST->getIndexedModeName(ST->getAddressingMode());
+ if (ST->getAlignment() > 1)
+ Op += " A=" + utostr(ST->getAlignment());
+ }
+
+#if 0
+ Op += " Id=" + itostr(Node->getNodeId());
+#endif
+
+ return Op;
+}
+
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void SelectionDAG::viewGraph(const std::string &Title) {
+// This code is only for debugging!
+#ifndef NDEBUG
+ ViewGraph(this, "dag." + getMachineFunction().getFunction()->getName(),
+ Title);
+#else
+ cerr << "SelectionDAG::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+// This overload is defined out-of-line here instead of just using a
+// default parameter because this is easiest for gdb to call.
+void SelectionDAG::viewGraph() {
+ viewGraph("");
+}
+
+/// clearGraphAttrs - Clear all previously defined node graph attributes.
+/// Intended to be used from a debugging tool (e.g. gdb).
+void SelectionDAG::clearGraphAttrs() {
+#ifndef NDEBUG
+ NodeGraphAttrs.clear();
+#else
+ cerr << "SelectionDAG::clearGraphAttrs is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// setGraphAttrs - Set graph attributes for a node (e.g. "color=red").
+///
+void SelectionDAG::setGraphAttrs(const SDNode *N, const char *Attrs) {
+#ifndef NDEBUG
+ NodeGraphAttrs[N] = Attrs;
+#else
+ cerr << "SelectionDAG::setGraphAttrs is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// getGraphAttrs - Get graph attributes for a node (e.g. "color=red").
+/// Used from getNodeAttributes.
+const std::string SelectionDAG::getGraphAttrs(const SDNode *N) const {
+#ifndef NDEBUG
+ std::map<const SDNode *, std::string>::const_iterator I =
+ NodeGraphAttrs.find(N);
+
+ if (I != NodeGraphAttrs.end())
+ return I->second;
+ else
+ return "";
+#else
+ cerr << "SelectionDAG::getGraphAttrs is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+ return std::string("");
+#endif
+}
+
+/// setGraphColor - Convenience for setting node color attribute.
+///
+void SelectionDAG::setGraphColor(const SDNode *N, const char *Color) {
+#ifndef NDEBUG
+ NodeGraphAttrs[N] = std::string("color=") + Color;
+#else
+ cerr << "SelectionDAG::setGraphColor is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+/// setSubgraphColorHelper - Implement setSubgraphColor. Return
+/// whether we truncated the search.
+///
+bool SelectionDAG::setSubgraphColorHelper(SDNode *N, const char *Color,
+                                          DenseSet<SDNode *> &visited,
+                                          int level, bool &printed) {
+ bool hit_limit = false;
+
+#ifndef NDEBUG
+ if (level >= 20) {
+ if (!printed) {
+ printed = true;
+ DOUT << "setSubgraphColor hit max level\n";
+ }
+ return true;
+ }
+
+ unsigned oldSize = visited.size();
+ visited.insert(N);
+ if (visited.size() != oldSize) {
+ setGraphColor(N, Color);
+    for (SDNodeIterator i = SDNodeIterator::begin(N),
+                        iend = SDNodeIterator::end(N);
+         i != iend; ++i) {
+      hit_limit = setSubgraphColorHelper(*i, Color, visited, level+1,
+                                         printed) || hit_limit;
+ }
+ }
+#else
+ cerr << "SelectionDAG::setSubgraphColor is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+ return hit_limit;
+}
+
+/// setSubgraphColor - Convenience for setting subgraph color attribute.
+///
+void SelectionDAG::setSubgraphColor(SDNode *N, const char *Color) {
+#ifndef NDEBUG
+ DenseSet<SDNode *> visited;
+ bool printed = false;
+ if (setSubgraphColorHelper(N, Color, visited, 0, printed)) {
+ // Visually mark that we hit the limit
+ if (strcmp(Color, "red") == 0) {
+ setSubgraphColorHelper(N, "blue", visited, 0, printed);
+ }
+ else if (strcmp(Color, "yellow") == 0) {
+ setSubgraphColorHelper(N, "green", visited, 0, printed);
+ }
+ }
+
+#else
+ cerr << "SelectionDAG::setSubgraphColor is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+std::string ScheduleDAGSDNodes::getGraphNodeLabel(const SUnit *SU) const {
+ std::string s;
+ raw_string_ostream O(s);
+ O << "SU(" << SU->NodeNum << "): ";
+ if (SU->getNode()) {
+ SmallVector<SDNode *, 4> FlaggedNodes;
+ for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode())
+ FlaggedNodes.push_back(N);
+ while (!FlaggedNodes.empty()) {
+ O << DOTGraphTraits<SelectionDAG*>::getNodeLabel(FlaggedNodes.back(), DAG);
+ FlaggedNodes.pop_back();
+ if (!FlaggedNodes.empty())
+ O << "\n ";
+ }
+ } else {
+ O << "CROSS RC COPY";
+ }
+ return O.str();
+}
+
+void ScheduleDAGSDNodes::getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const {
+ if (DAG) {
+ // Draw a special "GraphRoot" node to indicate the root of the graph.
+ GW.emitSimpleNode(0, "plaintext=circle", "GraphRoot");
+ const SDNode *N = DAG->getRoot().getNode();
+ if (N && N->getNodeId() != -1)
+ GW.emitEdge(0, -1, &SUnits[N->getNodeId()], -1,
+ "color=blue,style=dashed");
+ }
+}
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
new file mode 100644
index 0000000..3334e53
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -0,0 +1,2592 @@
+//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+namespace llvm {
+TLSModel::Model getTLSModel(const GlobalValue *GV, Reloc::Model reloc) {
+ bool isLocal = GV->hasLocalLinkage();
+ bool isDeclaration = GV->isDeclaration();
+ // FIXME: what should we do for protected and internal visibility?
+ // For variables, is internal different from hidden?
+ bool isHidden = GV->hasHiddenVisibility();
+
+ if (reloc == Reloc::PIC_) {
+ if (isLocal || isHidden)
+ return TLSModel::LocalDynamic;
+ else
+ return TLSModel::GeneralDynamic;
+ } else {
+ if (!isDeclaration || isHidden)
+ return TLSModel::LocalExec;
+ else
+ return TLSModel::InitialExec;
+ }
+}
+}
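+
+// A compact summary of the mapping above, derived from the code: under PIC,
+// local or hidden values get LocalDynamic and everything else GeneralDynamic;
+// otherwise, defined or hidden values get LocalExec and everything else
+// InitialExec.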
+
+/// InitLibcallNames - Set default libcall names.
+///
+static void InitLibcallNames(const char **Names) {
+ Names[RTLIB::SHL_I16] = "__ashlhi3";
+ Names[RTLIB::SHL_I32] = "__ashlsi3";
+ Names[RTLIB::SHL_I64] = "__ashldi3";
+ Names[RTLIB::SHL_I128] = "__ashlti3";
+ Names[RTLIB::SRL_I16] = "__lshrhi3";
+ Names[RTLIB::SRL_I32] = "__lshrsi3";
+ Names[RTLIB::SRL_I64] = "__lshrdi3";
+ Names[RTLIB::SRL_I128] = "__lshrti3";
+ Names[RTLIB::SRA_I16] = "__ashrhi3";
+ Names[RTLIB::SRA_I32] = "__ashrsi3";
+ Names[RTLIB::SRA_I64] = "__ashrdi3";
+ Names[RTLIB::SRA_I128] = "__ashrti3";
+ Names[RTLIB::MUL_I16] = "__mulhi3";
+ Names[RTLIB::MUL_I32] = "__mulsi3";
+ Names[RTLIB::MUL_I64] = "__muldi3";
+ Names[RTLIB::MUL_I128] = "__multi3";
+ Names[RTLIB::SDIV_I16] = "__divhi3";
+ Names[RTLIB::SDIV_I32] = "__divsi3";
+ Names[RTLIB::SDIV_I64] = "__divdi3";
+ Names[RTLIB::SDIV_I128] = "__divti3";
+ Names[RTLIB::UDIV_I16] = "__udivhi3";
+ Names[RTLIB::UDIV_I32] = "__udivsi3";
+ Names[RTLIB::UDIV_I64] = "__udivdi3";
+ Names[RTLIB::UDIV_I128] = "__udivti3";
+ Names[RTLIB::SREM_I16] = "__modhi3";
+ Names[RTLIB::SREM_I32] = "__modsi3";
+ Names[RTLIB::SREM_I64] = "__moddi3";
+ Names[RTLIB::SREM_I128] = "__modti3";
+ Names[RTLIB::UREM_I16] = "__umodhi3";
+ Names[RTLIB::UREM_I32] = "__umodsi3";
+ Names[RTLIB::UREM_I64] = "__umoddi3";
+ Names[RTLIB::UREM_I128] = "__umodti3";
+ Names[RTLIB::NEG_I32] = "__negsi2";
+ Names[RTLIB::NEG_I64] = "__negdi2";
+ Names[RTLIB::ADD_F32] = "__addsf3";
+ Names[RTLIB::ADD_F64] = "__adddf3";
+ Names[RTLIB::ADD_F80] = "__addxf3";
+ Names[RTLIB::ADD_PPCF128] = "__gcc_qadd";
+ Names[RTLIB::SUB_F32] = "__subsf3";
+ Names[RTLIB::SUB_F64] = "__subdf3";
+ Names[RTLIB::SUB_F80] = "__subxf3";
+ Names[RTLIB::SUB_PPCF128] = "__gcc_qsub";
+ Names[RTLIB::MUL_F32] = "__mulsf3";
+ Names[RTLIB::MUL_F64] = "__muldf3";
+ Names[RTLIB::MUL_F80] = "__mulxf3";
+ Names[RTLIB::MUL_PPCF128] = "__gcc_qmul";
+ Names[RTLIB::DIV_F32] = "__divsf3";
+ Names[RTLIB::DIV_F64] = "__divdf3";
+ Names[RTLIB::DIV_F80] = "__divxf3";
+ Names[RTLIB::DIV_PPCF128] = "__gcc_qdiv";
+ Names[RTLIB::REM_F32] = "fmodf";
+ Names[RTLIB::REM_F64] = "fmod";
+ Names[RTLIB::REM_F80] = "fmodl";
+ Names[RTLIB::REM_PPCF128] = "fmodl";
+ Names[RTLIB::POWI_F32] = "__powisf2";
+ Names[RTLIB::POWI_F64] = "__powidf2";
+ Names[RTLIB::POWI_F80] = "__powixf2";
+ Names[RTLIB::POWI_PPCF128] = "__powitf2";
+ Names[RTLIB::SQRT_F32] = "sqrtf";
+ Names[RTLIB::SQRT_F64] = "sqrt";
+ Names[RTLIB::SQRT_F80] = "sqrtl";
+ Names[RTLIB::SQRT_PPCF128] = "sqrtl";
+ Names[RTLIB::LOG_F32] = "logf";
+ Names[RTLIB::LOG_F64] = "log";
+ Names[RTLIB::LOG_F80] = "logl";
+ Names[RTLIB::LOG_PPCF128] = "logl";
+ Names[RTLIB::LOG2_F32] = "log2f";
+ Names[RTLIB::LOG2_F64] = "log2";
+ Names[RTLIB::LOG2_F80] = "log2l";
+ Names[RTLIB::LOG2_PPCF128] = "log2l";
+ Names[RTLIB::LOG10_F32] = "log10f";
+ Names[RTLIB::LOG10_F64] = "log10";
+ Names[RTLIB::LOG10_F80] = "log10l";
+ Names[RTLIB::LOG10_PPCF128] = "log10l";
+ Names[RTLIB::EXP_F32] = "expf";
+ Names[RTLIB::EXP_F64] = "exp";
+ Names[RTLIB::EXP_F80] = "expl";
+ Names[RTLIB::EXP_PPCF128] = "expl";
+ Names[RTLIB::EXP2_F32] = "exp2f";
+ Names[RTLIB::EXP2_F64] = "exp2";
+ Names[RTLIB::EXP2_F80] = "exp2l";
+ Names[RTLIB::EXP2_PPCF128] = "exp2l";
+ Names[RTLIB::SIN_F32] = "sinf";
+ Names[RTLIB::SIN_F64] = "sin";
+ Names[RTLIB::SIN_F80] = "sinl";
+ Names[RTLIB::SIN_PPCF128] = "sinl";
+ Names[RTLIB::COS_F32] = "cosf";
+ Names[RTLIB::COS_F64] = "cos";
+ Names[RTLIB::COS_F80] = "cosl";
+ Names[RTLIB::COS_PPCF128] = "cosl";
+ Names[RTLIB::POW_F32] = "powf";
+ Names[RTLIB::POW_F64] = "pow";
+ Names[RTLIB::POW_F80] = "powl";
+ Names[RTLIB::POW_PPCF128] = "powl";
+ Names[RTLIB::CEIL_F32] = "ceilf";
+ Names[RTLIB::CEIL_F64] = "ceil";
+ Names[RTLIB::CEIL_F80] = "ceill";
+ Names[RTLIB::CEIL_PPCF128] = "ceill";
+ Names[RTLIB::TRUNC_F32] = "truncf";
+ Names[RTLIB::TRUNC_F64] = "trunc";
+ Names[RTLIB::TRUNC_F80] = "truncl";
+ Names[RTLIB::TRUNC_PPCF128] = "truncl";
+ Names[RTLIB::RINT_F32] = "rintf";
+ Names[RTLIB::RINT_F64] = "rint";
+ Names[RTLIB::RINT_F80] = "rintl";
+ Names[RTLIB::RINT_PPCF128] = "rintl";
+ Names[RTLIB::NEARBYINT_F32] = "nearbyintf";
+ Names[RTLIB::NEARBYINT_F64] = "nearbyint";
+ Names[RTLIB::NEARBYINT_F80] = "nearbyintl";
+ Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl";
+ Names[RTLIB::FLOOR_F32] = "floorf";
+ Names[RTLIB::FLOOR_F64] = "floor";
+ Names[RTLIB::FLOOR_F80] = "floorl";
+ Names[RTLIB::FLOOR_PPCF128] = "floorl";
+ Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2";
+ Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2";
+ Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2";
+ Names[RTLIB::FPROUND_PPCF128_F32] = "__trunctfsf2";
+ Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2";
+ Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2";
+ Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi";
+ Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi";
+ Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti";
+ Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi";
+ Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi";
+ Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti";
+ Names[RTLIB::FPTOSINT_F80_I32] = "__fixxfsi";
+ Names[RTLIB::FPTOSINT_F80_I64] = "__fixxfdi";
+ Names[RTLIB::FPTOSINT_F80_I128] = "__fixxfti";
+ Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi";
+ Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi";
+ Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti";
+ Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi";
+ Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi";
+ Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti";
+ Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi";
+ Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi";
+ Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti";
+ Names[RTLIB::FPTOUINT_F80_I32] = "__fixunsxfsi";
+ Names[RTLIB::FPTOUINT_F80_I64] = "__fixunsxfdi";
+ Names[RTLIB::FPTOUINT_F80_I128] = "__fixunsxfti";
+ Names[RTLIB::FPTOUINT_PPCF128_I32] = "__fixunstfsi";
+ Names[RTLIB::FPTOUINT_PPCF128_I64] = "__fixunstfdi";
+ Names[RTLIB::FPTOUINT_PPCF128_I128] = "__fixunstfti";
+ Names[RTLIB::SINTTOFP_I32_F32] = "__floatsisf";
+ Names[RTLIB::SINTTOFP_I32_F64] = "__floatsidf";
+ Names[RTLIB::SINTTOFP_I32_F80] = "__floatsixf";
+ Names[RTLIB::SINTTOFP_I32_PPCF128] = "__floatsitf";
+ Names[RTLIB::SINTTOFP_I64_F32] = "__floatdisf";
+ Names[RTLIB::SINTTOFP_I64_F64] = "__floatdidf";
+ Names[RTLIB::SINTTOFP_I64_F80] = "__floatdixf";
+ Names[RTLIB::SINTTOFP_I64_PPCF128] = "__floatditf";
+ Names[RTLIB::SINTTOFP_I128_F32] = "__floattisf";
+ Names[RTLIB::SINTTOFP_I128_F64] = "__floattidf";
+ Names[RTLIB::SINTTOFP_I128_F80] = "__floattixf";
+ Names[RTLIB::SINTTOFP_I128_PPCF128] = "__floattitf";
+ Names[RTLIB::UINTTOFP_I32_F32] = "__floatunsisf";
+ Names[RTLIB::UINTTOFP_I32_F64] = "__floatunsidf";
+ Names[RTLIB::UINTTOFP_I32_F80] = "__floatunsixf";
+ Names[RTLIB::UINTTOFP_I32_PPCF128] = "__floatunsitf";
+ Names[RTLIB::UINTTOFP_I64_F32] = "__floatundisf";
+ Names[RTLIB::UINTTOFP_I64_F64] = "__floatundidf";
+ Names[RTLIB::UINTTOFP_I64_F80] = "__floatundixf";
+ Names[RTLIB::UINTTOFP_I64_PPCF128] = "__floatunditf";
+ Names[RTLIB::UINTTOFP_I128_F32] = "__floatuntisf";
+ Names[RTLIB::UINTTOFP_I128_F64] = "__floatuntidf";
+ Names[RTLIB::UINTTOFP_I128_F80] = "__floatuntixf";
+ Names[RTLIB::UINTTOFP_I128_PPCF128] = "__floatuntitf";
+ Names[RTLIB::OEQ_F32] = "__eqsf2";
+ Names[RTLIB::OEQ_F64] = "__eqdf2";
+ Names[RTLIB::UNE_F32] = "__nesf2";
+ Names[RTLIB::UNE_F64] = "__nedf2";
+ Names[RTLIB::OGE_F32] = "__gesf2";
+ Names[RTLIB::OGE_F64] = "__gedf2";
+ Names[RTLIB::OLT_F32] = "__ltsf2";
+ Names[RTLIB::OLT_F64] = "__ltdf2";
+ Names[RTLIB::OLE_F32] = "__lesf2";
+ Names[RTLIB::OLE_F64] = "__ledf2";
+ Names[RTLIB::OGT_F32] = "__gtsf2";
+ Names[RTLIB::OGT_F64] = "__gtdf2";
+ Names[RTLIB::UO_F32] = "__unordsf2";
+ Names[RTLIB::UO_F64] = "__unorddf2";
+ Names[RTLIB::O_F32] = "__unordsf2";
+ Names[RTLIB::O_F64] = "__unorddf2";
+ Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
+}
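+
+// For example (illustrative): a 64-bit shift-left that the target cannot
+// lower natively is turned into a call to "__ashldi3" via the table above.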
+
+/// getFPEXT - Return the FPEXT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPEXT(MVT OpVT, MVT RetVT) {
+ if (OpVT == MVT::f32) {
+ if (RetVT == MVT::f64)
+ return FPEXT_F32_F64;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPROUND - Return the FPROUND_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPROUND(MVT OpVT, MVT RetVT) {
+ if (RetVT == MVT::f32) {
+ if (OpVT == MVT::f64)
+ return FPROUND_F64_F32;
+ if (OpVT == MVT::f80)
+ return FPROUND_F80_F32;
+ if (OpVT == MVT::ppcf128)
+ return FPROUND_PPCF128_F32;
+ } else if (RetVT == MVT::f64) {
+ if (OpVT == MVT::f80)
+ return FPROUND_F80_F64;
+ if (OpVT == MVT::ppcf128)
+ return FPROUND_PPCF128_F64;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOSINT(MVT OpVT, MVT RetVT) {
+ if (OpVT == MVT::f32) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F32_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F32_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F32_I128;
+ } else if (OpVT == MVT::f64) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F64_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F64_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F64_I128;
+ } else if (OpVT == MVT::f80) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F80_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F80_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F80_I128;
+ } else if (OpVT == MVT::ppcf128) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_PPCF128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_PPCF128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_PPCF128_I128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOUINT(MVT OpVT, MVT RetVT) {
+ if (OpVT == MVT::f32) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F32_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F32_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F32_I128;
+ } else if (OpVT == MVT::f64) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F64_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F64_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F64_I128;
+ } else if (OpVT == MVT::f80) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F80_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F80_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F80_I128;
+ } else if (OpVT == MVT::ppcf128) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_PPCF128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_PPCF128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_PPCF128_I128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getSINTTOFP(MVT OpVT, MVT RetVT) {
+ if (OpVT == MVT::i32) {
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I32_F32;
+ else if (RetVT == MVT::f64)
+ return SINTTOFP_I32_F64;
+ else if (RetVT == MVT::f80)
+ return SINTTOFP_I32_F80;
+ else if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I32_PPCF128;
+ } else if (OpVT == MVT::i64) {
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I64_F32;
+ else if (RetVT == MVT::f64)
+ return SINTTOFP_I64_F64;
+ else if (RetVT == MVT::f80)
+ return SINTTOFP_I64_F80;
+ else if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I64_PPCF128;
+ } else if (OpVT == MVT::i128) {
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I128_F32;
+ else if (RetVT == MVT::f64)
+ return SINTTOFP_I128_F64;
+ else if (RetVT == MVT::f80)
+ return SINTTOFP_I128_F80;
+ else if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I128_PPCF128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getUINTTOFP(MVT OpVT, MVT RetVT) {
+ if (OpVT == MVT::i32) {
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I32_F32;
+ else if (RetVT == MVT::f64)
+ return UINTTOFP_I32_F64;
+ else if (RetVT == MVT::f80)
+ return UINTTOFP_I32_F80;
+ else if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I32_PPCF128;
+ } else if (OpVT == MVT::i64) {
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I64_F32;
+ else if (RetVT == MVT::f64)
+ return UINTTOFP_I64_F64;
+ else if (RetVT == MVT::f80)
+ return UINTTOFP_I64_F80;
+ else if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I64_PPCF128;
+ } else if (OpVT == MVT::i128) {
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I128_F32;
+ else if (RetVT == MVT::f64)
+ return UINTTOFP_I128_F64;
+ else if (RetVT == MVT::f80)
+ return UINTTOFP_I128_F80;
+ else if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I128_PPCF128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// InitCmpLibcallCCs - Set default comparison libcall CC.
+///
+static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
+ memset(CCs, ISD::SETCC_INVALID, sizeof(ISD::CondCode)*RTLIB::UNKNOWN_LIBCALL);
+ CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
+ CCs[RTLIB::UNE_F32] = ISD::SETNE;
+ CCs[RTLIB::UNE_F64] = ISD::SETNE;
+ CCs[RTLIB::OGE_F32] = ISD::SETGE;
+ CCs[RTLIB::OGE_F64] = ISD::SETGE;
+ CCs[RTLIB::OLT_F32] = ISD::SETLT;
+ CCs[RTLIB::OLT_F64] = ISD::SETLT;
+ CCs[RTLIB::OLE_F32] = ISD::SETLE;
+ CCs[RTLIB::OLE_F64] = ISD::SETLE;
+ CCs[RTLIB::OGT_F32] = ISD::SETGT;
+ CCs[RTLIB::OGT_F64] = ISD::SETGT;
+ CCs[RTLIB::UO_F32] = ISD::SETNE;
+ CCs[RTLIB::UO_F64] = ISD::SETNE;
+ CCs[RTLIB::O_F32] = ISD::SETEQ;
+ CCs[RTLIB::O_F64] = ISD::SETEQ;
+}
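+
+// For example (illustrative): an ordered f32 equality comparison is lowered
+// to a call to __eqsf2, and the call's integer result is then compared
+// against zero using the SETEQ condition recorded above.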
+
+TargetLowering::TargetLowering(TargetMachine &tm)
+ : TM(tm), TD(TM.getTargetData()) {
+ // All operations default to being supported.
+ memset(OpActions, 0, sizeof(OpActions));
+ memset(LoadExtActions, 0, sizeof(LoadExtActions));
+ memset(TruncStoreActions, 0, sizeof(TruncStoreActions));
+ memset(IndexedModeActions, 0, sizeof(IndexedModeActions));
+ memset(ConvertActions, 0, sizeof(ConvertActions));
+ memset(CondCodeActions, 0, sizeof(CondCodeActions));
+
+ // Set default actions for various operations.
+ for (unsigned VT = 0; VT != (unsigned)MVT::LAST_VALUETYPE; ++VT) {
+ // Default all indexed load / store to expand.
+ for (unsigned IM = (unsigned)ISD::PRE_INC;
+ IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
+ setIndexedLoadAction(IM, (MVT::SimpleValueType)VT, Expand);
+ setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand);
+ }
+
+ // These operations default to expand.
+ setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
+ }
+
+ // Most targets ignore the @llvm.prefetch intrinsic.
+ setOperationAction(ISD::PREFETCH, MVT::Other, Expand);
+
+ // ConstantFP nodes default to expand. Targets can either change this to
+ // Legal, in which case all fp constants are legal, or use addLegalFPImmediate
+ // to optimize expansions for certain constants.
+ setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f80, Expand);
+
+ // These library functions default to expand.
+ setOperationAction(ISD::FLOG , MVT::f64, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f64, Expand);
+ setOperationAction(ISD::FLOG10,MVT::f64, Expand);
+ setOperationAction(ISD::FEXP , MVT::f64, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f64, Expand);
+ setOperationAction(ISD::FLOG , MVT::f32, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG10,MVT::f32, Expand);
+ setOperationAction(ISD::FEXP , MVT::f32, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f32, Expand);
+
+ // Default ISD::TRAP to expand (which turns it into abort).
+ setOperationAction(ISD::TRAP, MVT::Other, Expand);
+
+ IsLittleEndian = TD->isLittleEndian();
+ UsesGlobalOffsetTable = false;
+ ShiftAmountTy = PointerTy = getValueType(TD->getIntPtrType());
+ ShiftAmtHandling = Undefined;
+ memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*));
+ memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray));
+ maxStoresPerMemset = maxStoresPerMemcpy = maxStoresPerMemmove = 8;
+ allowUnalignedMemoryAccesses = false;
+ benefitFromCodePlacementOpt = false;
+ UseUnderscoreSetJmp = false;
+ UseUnderscoreLongJmp = false;
+ SelectIsExpensive = false;
+ IntDivIsCheap = false;
+ Pow2DivIsCheap = false;
+ StackPointerRegisterToSaveRestore = 0;
+ ExceptionPointerRegister = 0;
+ ExceptionSelectorRegister = 0;
+ BooleanContents = UndefinedBooleanContent;
+ SchedPreferenceInfo = SchedulingForLatency;
+ JumpBufSize = 0;
+ JumpBufAlignment = 0;
+ IfCvtBlockSizeLimit = 2;
+ IfCvtDupBlockSizeLimit = 0;
+ PrefLoopAlignment = 0;
+
+ InitLibcallNames(LibcallRoutineNames);
+ InitCmpLibcallCCs(CmpLibcallCCs);
+
+ // Tell Legalize whether the assembler supports DEBUG_LOC.
+ const TargetAsmInfo *TASM = TM.getTargetAsmInfo();
+ if (!TASM || !TASM->hasDotLocAndDotFile())
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+}
+
+TargetLowering::~TargetLowering() {}
+
+/// computeRegisterProperties - Once all of the register classes are added,
+/// this allows us to compute derived properties we expose.
+void TargetLowering::computeRegisterProperties() {
+ assert(MVT::LAST_VALUETYPE <= 32 &&
+ "Too many value types for ValueTypeActions to hold!");
+
+ // Everything defaults to needing one register.
+ for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
+ NumRegistersForVT[i] = 1;
+ RegisterTypeForVT[i] = TransformToType[i] = (MVT::SimpleValueType)i;
+ }
+ // ...except isVoid, which doesn't need any registers.
+ NumRegistersForVT[MVT::isVoid] = 0;
+
+ // Find the largest integer register class.
+ unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE;
+ for (; RegClassForVT[LargestIntReg] == 0; --LargestIntReg)
+ assert(LargestIntReg != MVT::i1 && "No integer registers defined!");
+
+ // Every integer value type larger than this largest register takes twice as
+ // many registers to represent as the previous ValueType.
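+  // For example (an illustrative case): if i32 is the largest type with a
+  // register class, i64 takes two registers and i128 takes four, with each
+  // type transforming to the next smaller one.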
+ for (unsigned ExpandedReg = LargestIntReg + 1; ; ++ExpandedReg) {
+ MVT EVT = (MVT::SimpleValueType)ExpandedReg;
+ if (!EVT.isInteger())
+ break;
+ NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1];
+ RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg;
+ TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1);
+ ValueTypeActions.setTypeAction(EVT, Expand);
+ }
+
+  // Inspect all of the ValueTypes smaller than the largest integer
+ // register to see which ones need promotion.
+ unsigned LegalIntReg = LargestIntReg;
+ for (unsigned IntReg = LargestIntReg - 1;
+ IntReg >= (unsigned)MVT::i1; --IntReg) {
+ MVT IVT = (MVT::SimpleValueType)IntReg;
+ if (isTypeLegal(IVT)) {
+ LegalIntReg = IntReg;
+ } else {
+ RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
+ (MVT::SimpleValueType)LegalIntReg;
+ ValueTypeActions.setTypeAction(IVT, Promote);
+ }
+ }
+
+ // ppcf128 type is really two f64's.
+ if (!isTypeLegal(MVT::ppcf128)) {
+ NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
+ RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
+ TransformToType[MVT::ppcf128] = MVT::f64;
+ ValueTypeActions.setTypeAction(MVT::ppcf128, Expand);
+ }
+
+  // Decide how to handle f64. If the target does not have native f64 support,
+  // expand it to i64, and we will generate soft-float library calls.
+ if (!isTypeLegal(MVT::f64)) {
+ NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64];
+ RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64];
+ TransformToType[MVT::f64] = MVT::i64;
+ ValueTypeActions.setTypeAction(MVT::f64, Expand);
+ }
+
+ // Decide how to handle f32. If the target does not have native support for
+ // f32, promote it to f64 if it is legal. Otherwise, expand it to i32.
+ if (!isTypeLegal(MVT::f32)) {
+ if (isTypeLegal(MVT::f64)) {
+ NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::f64];
+ RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::f64];
+ TransformToType[MVT::f32] = MVT::f64;
+ ValueTypeActions.setTypeAction(MVT::f32, Promote);
+ } else {
+ NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32];
+ RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32];
+ TransformToType[MVT::f32] = MVT::i32;
+ ValueTypeActions.setTypeAction(MVT::f32, Expand);
+ }
+ }
+
+ // Loop over all of the vector value types to see which need transformations.
+ for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+ if (!isTypeLegal(VT)) {
+ MVT IntermediateVT, RegisterVT;
+ unsigned NumIntermediates;
+ NumRegistersForVT[i] =
+ getVectorTypeBreakdown(VT,
+ IntermediateVT, NumIntermediates,
+ RegisterVT);
+ RegisterTypeForVT[i] = RegisterVT;
+
+ // Determine if there is a legal wider type.
+ bool IsLegalWiderType = false;
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NElts = VT.getVectorNumElements();
+ for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType)nVT;
+ if (isTypeLegal(SVT) && SVT.getVectorElementType() == EltVT &&
+ SVT.getVectorNumElements() > NElts) {
+ TransformToType[i] = SVT;
+ ValueTypeActions.setTypeAction(VT, Promote);
+ IsLegalWiderType = true;
+ break;
+ }
+ }
+ if (!IsLegalWiderType) {
+ MVT NVT = VT.getPow2VectorType();
+ if (NVT == VT) {
+ // Type is already a power of 2. The default action is to split.
+ TransformToType[i] = MVT::Other;
+ ValueTypeActions.setTypeAction(VT, Expand);
+ } else {
+ TransformToType[i] = NVT;
+ ValueTypeActions.setTypeAction(VT, Promote);
+ }
+ }
+ }
+ }
+}
+
+const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ return NULL;
+}
+
+
+MVT TargetLowering::getSetCCResultType(MVT VT) const {
+ return getValueType(TD->getIntPtrType());
+}
+
+
+/// getVectorTypeBreakdown - Vector types are broken down into some number of
+/// legal first class types. For example, MVT::v8f32 maps to 2 MVT::v4f32
+/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack.
+/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86.
+///
+/// This method returns the number of registers needed, and the VT for each
+/// register. It also returns the VT and quantity of the intermediate values
+/// before they are promoted/expanded.
+///
+unsigned TargetLowering::getVectorTypeBreakdown(MVT VT,
+ MVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT) const {
+ // Figure out the right, legal destination reg to copy into.
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType();
+
+ unsigned NumVectorRegs = 1;
+
+ // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we
+ // could break down into LHS/RHS like LegalizeDAG does.
+ if (!isPowerOf2_32(NumElts)) {
+ NumVectorRegs = NumElts;
+ NumElts = 1;
+ }
+
+ // Divide the input until we get to a supported size. This will always
+ // end with a scalar if the target doesn't support vectors.
+ while (NumElts > 1 && !isTypeLegal(MVT::getVectorVT(EltTy, NumElts))) {
+ NumElts >>= 1;
+ NumVectorRegs <<= 1;
+ }
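+  // For example (illustrative): breaking down v8f32 on a target whose widest
+  // legal vector type is v4f32 halves NumElts from 8 to 4 and doubles
+  // NumVectorRegs from 1 to 2.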
+
+ NumIntermediates = NumVectorRegs;
+
+ MVT NewVT = MVT::getVectorVT(EltTy, NumElts);
+ if (!isTypeLegal(NewVT))
+ NewVT = EltTy;
+ IntermediateVT = NewVT;
+
+ MVT DestVT = getRegisterType(NewVT);
+ RegisterVT = DestVT;
+ if (DestVT.bitsLT(NewVT)) {
+ // Value is expanded, e.g. i64 -> i16.
+ return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits());
+ } else {
+ // Otherwise, promotion or legal types use the same number of registers as
+ // the vector decimated to the appropriate level.
+ return NumVectorRegs;
+ }
+}
+
+/// getWidenVectorType: given a vector type, returns the type to widen to
+/// (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
+/// If there is no vector type that we want to widen to, returns MVT::Other.
+/// When and where to widen is target dependent, based on the cost of
+/// scalarizing vs. using the wider vector type.
+MVT TargetLowering::getWidenVectorType(MVT VT) const {
+ assert(VT.isVector());
+ if (isTypeLegal(VT))
+ return VT;
+
+  // The default is not to widen until this code is moved to LegalizeTypes.
+ return MVT::Other;
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. This is the actual
+/// alignment, not its logarithm.
+unsigned TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+ return TD->getCallFrameTypeAlignment(Ty);
+}
+
+SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (usesGlobalOffsetTable())
+ return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
+ return Table;
+}
+
+bool
+TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // Assume that everything is safe in static mode.
+ if (getTargetMachine().getRelocationModel() == Reloc::Static)
+ return true;
+
+ // In dynamic-no-pic mode, assume that known defined values are safe.
+ if (getTargetMachine().getRelocationModel() == Reloc::DynamicNoPIC &&
+ GA &&
+ !GA->getGlobal()->isDeclaration() &&
+ !GA->getGlobal()->isWeakForLinker())
+ return true;
+
+ // Otherwise assume nothing is safe.
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Optimization Methods
+//===----------------------------------------------------------------------===//
+
+/// ShrinkDemandedConstant - Check to see if the specified operand of the
+/// specified instruction is a constant integer. If so, check to see if there
+/// are any bits set in the constant that are not demanded. If so, shrink the
+/// constant and return true.
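+/// For example (an illustrative case): (or X, 0x1FF) where only the low 8
+/// bits are demanded can have its constant shrunk to 0xFF.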
+bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
+ const APInt &Demanded) {
+ DebugLoc dl = Op.getDebugLoc();
+
+ // FIXME: ISD::SELECT, ISD::SELECT_CC
+ switch (Op.getOpcode()) {
+ default: break;
+ case ISD::XOR:
+ case ISD::AND:
+ case ISD::OR: {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C) return false;
+
+ if (Op.getOpcode() == ISD::XOR &&
+ (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+ return false;
+
+ // If the constant has bits set that are not demanded, clear them.
+ if (C->getAPIntValue().intersects(~Demanded)) {
+ MVT VT = Op.getValueType();
+ SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0),
+ DAG.getConstant(Demanded &
+ C->getAPIntValue(),
+ VT));
+ return CombineTo(Op, New);
+ }
+
+ break;
+ }
+ }
+
+ return false;
+}
+
+/// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the
+/// casts are free. This uses isZExtFree and ZERO_EXTEND for the widening
+/// cast, but it could be generalized for targets with other types of
+/// implicit widening casts.
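+///
+/// For example (illustrative): if only the low 8 bits of an i32 add are
+/// demanded and the i8 casts are free, (i32 add x, y) can be rewritten as
+/// (zero_extend (i8 add (trunc x), (trunc y))).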
+bool
+TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
+ unsigned BitWidth,
+ const APInt &Demanded,
+ DebugLoc dl) {
+ assert(Op.getNumOperands() == 2 &&
+ "ShrinkDemandedOp only supports binary operators!");
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "ShrinkDemandedOp only supports nodes with one result!");
+
+ // Don't do this if the node has another user, which may require the
+ // full value.
+ if (!Op.getNode()->hasOneUse())
+ return false;
+
+ // Search for the smallest integer type with free casts to and from
+ // Op's type. For expedience, just check power-of-2 integer types.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned SmallVTBits = BitWidth - Demanded.countLeadingZeros();
+ if (!isPowerOf2_32(SmallVTBits))
+ SmallVTBits = NextPowerOf2(SmallVTBits);
+ for (; SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+ MVT SmallVT = MVT::getIntegerVT(SmallVTBits);
+ if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
+ TLI.isZExtFree(SmallVT, Op.getValueType())) {
+ // We found a type with free casts.
+ SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+ Op.getNode()->getOperand(0)),
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+ Op.getNode()->getOperand(1)));
+ SDValue Z = DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), X);
+ return CombineTo(Op, Z);
+ }
+ }
+ return false;
+}
+
+/// SimplifyDemandedBits - Look at Op. At this point, we know that only the
+/// DemandedMask bits of the result of Op are ever used downstream. If we can
+/// use this information to simplify Op, create a new simplified DAG node,
+/// record the replacement via TLO, and return true. Otherwise, analyze the
+/// expression and return a mask of KnownOne and KnownZero bits for the
+/// expression (used to simplify the caller). The KnownZero/One bits may only
+/// be accurate for those bits in the DemandedMask.
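+///
+/// For example (illustrative): if only bit 0 of (or (and x, 1), (shl y, 1))
+/// is demanded, the shifted term is known zero in that bit, so the whole
+/// expression simplifies to (and x, 1).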
+bool TargetLowering::SimplifyDemandedBits(SDValue Op,
+ const APInt &DemandedMask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+ unsigned BitWidth = DemandedMask.getBitWidth();
+ assert(Op.getValueSizeInBits() == BitWidth &&
+ "Mask size mismatches value type size!");
+ APInt NewMask = DemandedMask;
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Don't know anything.
+ KnownZero = KnownOne = APInt(BitWidth, 0);
+
+ // Other users may use these bits.
+ if (!Op.getNode()->hasOneUse()) {
+ if (Depth != 0) {
+ // If not at the root, just compute the KnownZero/KnownOne bits to
+ // simplify things downstream.
+ TLO.DAG.ComputeMaskedBits(Op, DemandedMask, KnownZero, KnownOne, Depth);
+ return false;
+ }
+ // If this is the root being simplified, allow it to have multiple uses,
+ // just set the NewMask to all bits.
+ NewMask = APInt::getAllOnesValue(BitWidth);
+ } else if (DemandedMask == 0) {
+ // Not demanding any bits from Op.
+ if (Op.getOpcode() != ISD::UNDEF)
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(Op.getValueType()));
+ return false;
+ } else if (Depth == 6) { // Limit search depth.
+ return false;
+ }
+
+ APInt KnownZero2, KnownOne2, KnownZeroOut, KnownOneOut;
+ switch (Op.getOpcode()) {
+ case ISD::Constant:
+ // We know all of the bits for a constant!
+ KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue() & NewMask;
+ KnownZero = ~KnownOne & NewMask;
+ return false; // Don't fall through, will infinitely loop.
+ case ISD::AND:
+ // If the RHS is a constant, check to see if the LHS would be zero without
+ // using the bits from the RHS. Below, we use knowledge about the RHS to
+ // simplify the LHS, here we're using information from the LHS to simplify
+ // the RHS.
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ APInt LHSZero, LHSOne;
+ TLO.DAG.ComputeMaskedBits(Op.getOperand(0), NewMask,
+ LHSZero, LHSOne, Depth+1);
+ // If the LHS already has zeros where RHSC does, this 'and' is dead.
+ if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ // If any of the set bits in the RHS are known zero on the LHS, shrink
+ // the constant.
+ if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask))
+ return true;
+ }
+
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), ~KnownZero & NewMask,
+ KnownZero2, KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known one on one side, return the other.
+ // These bits cannot contribute to the result of the 'and'.
+ if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // If all of the demanded bits in the inputs are known zeros, return zero.
+ if ((NewMask & (KnownZero|KnownZero2)) == NewMask)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, Op.getValueType()));
+ // If the RHS is a constant, see if we can simplify it.
+ if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
+ return true;
+ // If the operation can be done in a smaller type, do so.
+ if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ return true;
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ KnownOne &= KnownOne2;
+ // Output known-0 bits are known to be clear if zero in either the LHS | RHS.
+ KnownZero |= KnownZero2;
+ break;
+ case ISD::OR:
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), ~KnownOne & NewMask,
+ KnownZero2, KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'or'.
+ if ((NewMask & ~KnownOne2 & KnownZero) == (~KnownOne2 & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ if ((NewMask & ~KnownOne & KnownZero2) == (~KnownOne & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // If all of the potentially set bits on one side are known to be set on
+ // the other side, just use the 'other' side.
+ if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // If the RHS is a constant, see if we can simplify it.
+ if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ return true;
+ // If the operation can be done in a smaller type, do so.
+ if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ return true;
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ KnownZero &= KnownZero2;
+ // Output known-1 bits are known to be set if set in either the LHS | RHS.
+ KnownOne |= KnownOne2;
+ break;
+ case ISD::XOR:
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'xor'.
+ if ((KnownZero & NewMask) == NewMask)
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ if ((KnownZero2 & NewMask) == NewMask)
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // If the operation can be done in a smaller type, do so.
+ if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ return true;
+
+ // If all of the unknown bits are known to be zero on one side or the other
+ // (but not both) turn this into an *inclusive* or.
+ // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+ if ((NewMask & ~KnownZero & ~KnownZero2) == 0)
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, Op.getValueType(),
+ Op.getOperand(0),
+ Op.getOperand(1)));
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+ // Output known-1 bits are known to be set if set in only one of the LHS, RHS.
+ KnownOneOut = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+
+ // If all of the demanded bits on one side are known, and all of the set
+ // bits on that side are also known to be set on the other side, turn this
+ // into an AND, as we know the bits will be cleared.
+ // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+ if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known
+ if ((KnownOne & KnownOne2) == KnownOne) {
+ MVT VT = Op.getValueType();
+ SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT,
+ Op.getOperand(0), ANDC));
+ }
+ }
+
+ // If the RHS is a constant, see if we can simplify it.
+ // For XOR, we prefer to force bits to 1 if that will produce a -1.
+ // If we can't force bits, try to shrink the constant.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ APInt Expanded = C->getAPIntValue() | (~NewMask);
+ // If we can expand it to have all bits set, do it.
+ if (Expanded.isAllOnesValue()) {
+ if (Expanded != C->getAPIntValue()) {
+ MVT VT = Op.getValueType();
+ SDValue New = TLO.DAG.getNode(Op.getOpcode(), dl,VT, Op.getOperand(0),
+ TLO.DAG.getConstant(Expanded, VT));
+ return TLO.CombineTo(Op, New);
+ }
+ // If it already has all the bits set, there is nothing to change,
+ // but don't shrink the constant either!
+ } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) {
+ return true;
+ }
+ }
+
+ KnownZero = KnownZeroOut;
+ KnownOne = KnownOneOut;
+ break;
+ case ISD::SELECT:
+ if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ return true;
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ break;
+ case ISD::SELECT_CC:
+ if (SimplifyDemandedBits(Op.getOperand(3), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ return true;
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ break;
+ case ISD::SHL:
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned ShAmt = SA->getZExtValue();
+ SDValue InOp = Op.getOperand(0);
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
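+ // For example (illustrative): with the low five bits not demanded,
+ // ((x >>u 2) << 5) can be rewritten as the single shift (x << 3).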
+ if (InOp.getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(InOp.getOperand(1))) {
+ if (ShAmt && (NewMask & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
+ unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
+ unsigned Opc = ISD::SHL;
+ int Diff = ShAmt-C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ Opc = ISD::SRL;
+ }
+
+ SDValue NewSA =
+ TLO.DAG.getConstant(Diff, Op.getOperand(1).getValueType());
+ MVT VT = Op.getValueType();
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+ InOp.getOperand(0), NewSA));
+ }
+ }
+
+ if (SimplifyDemandedBits(Op.getOperand(0), NewMask.lshr(ShAmt),
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ KnownZero <<= SA->getZExtValue();
+ KnownOne <<= SA->getZExtValue();
+ // low bits known zero.
+ KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue());
+ }
+ break;
+ case ISD::SRL:
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ MVT VT = Op.getValueType();
+ unsigned ShAmt = SA->getZExtValue();
+ unsigned VTSize = VT.getSizeInBits();
+ SDValue InOp = Op.getOperand(0);
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the top bits (which are shifted out)
+ // are never demanded.
+ if (InOp.getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(InOp.getOperand(1))) {
+ if (ShAmt && (NewMask & APInt::getHighBitsSet(VTSize, ShAmt)) == 0) {
+ unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
+ unsigned Opc = ISD::SRL;
+ int Diff = ShAmt-C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ Opc = ISD::SHL;
+ }
+
+ SDValue NewSA =
+ TLO.DAG.getConstant(Diff, Op.getOperand(1).getValueType());
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+ InOp.getOperand(0), NewSA));
+ }
+ }
+
+ // Compute the new bits that are at the top now.
+ if (SimplifyDemandedBits(InOp, (NewMask << ShAmt),
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+
+ APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+ KnownZero |= HighBits; // High bits known zero.
+ }
+ break;
+ case ISD::SRA:
+ // If this is an arithmetic shift right and only the low-bit is set, we can
+ // always convert this into a logical shr, even if the shift amount is
+ // variable. The low bit of the shift cannot be an input sign bit unless
+ // the shift amount is >= the size of the datatype, which is undefined.
+ if (DemandedMask == 1)
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1)));
+
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ MVT VT = Op.getValueType();
+ unsigned ShAmt = SA->getZExtValue();
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ APInt InDemandedMask = (NewMask << ShAmt);
+
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+ if (HighBits.intersects(NewMask))
+ InDemandedMask |= APInt::getSignBit(VT.getSizeInBits());
+
+ if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+
+ // Handle the sign bit, adjusted to where it is now in the mask.
+ APInt SignBit = APInt::getSignBit(BitWidth).lshr(ShAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) {
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
+ Op.getOperand(0),
+ Op.getOperand(1)));
+ } else if (KnownOne.intersects(SignBit)) { // New bits are known one.
+ KnownOne |= HighBits;
+ }
+ }
+ break;
+ case ISD::SIGN_EXTEND_INREG: {
+ MVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+
+ // Sign extension. Compute the demanded bits in the result that are not
+ // present in the input.
+ APInt NewBits = APInt::getHighBitsSet(BitWidth,
+ BitWidth - EVT.getSizeInBits()) &
+ NewMask;
+
+ // If none of the extended bits are demanded, eliminate the sextinreg.
+ if (NewBits == 0)
+ return TLO.CombineTo(Op, Op.getOperand(0));
+
+ APInt InSignBit = APInt::getSignBit(EVT.getSizeInBits());
+ InSignBit.zext(BitWidth);
+ APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth,
+ EVT.getSizeInBits()) &
+ NewMask;
+
+ // Since the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ InputDemandedBits |= InSignBit;
+
+ if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+
+ // If the input sign bit is known zero, convert this into a zero extension.
+ if (KnownZero.intersects(InSignBit))
+ return TLO.CombineTo(Op,
+ TLO.DAG.getZeroExtendInReg(Op.getOperand(0),dl,EVT));
+
+ if (KnownOne.intersects(InSignBit)) { // Input sign bit known set
+ KnownOne |= NewBits;
+ KnownZero &= ~NewBits;
+ } else { // Input sign bit unknown
+ KnownZero &= ~NewBits;
+ KnownOne &= ~NewBits;
+ }
+ break;
+ }
+ case ISD::ZERO_EXTEND: {
+ unsigned OperandBitWidth = Op.getOperand(0).getValueSizeInBits();
+ APInt InMask = NewMask;
+ InMask.trunc(OperandBitWidth);
+
+ // If none of the top bits are demanded, convert this into an any_extend.
+ APInt NewBits =
+ APInt::getHighBitsSet(BitWidth, BitWidth - OperandBitWidth) & NewMask;
+ if (!NewBits.intersects(NewMask))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
+ Op.getValueType(),
+ Op.getOperand(0)));
+
+ if (SimplifyDemandedBits(Op.getOperand(0), InMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero.zext(BitWidth);
+ KnownOne.zext(BitWidth);
+ KnownZero |= NewBits;
+ break;
+ }
+ case ISD::SIGN_EXTEND: {
+ MVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getSizeInBits();
+ APInt InMask = APInt::getLowBitsSet(BitWidth, InBits);
+ APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits);
+ APInt NewBits = ~InMask & NewMask;
+
+ // If none of the top bits are demanded, convert this into an any_extend.
+ if (NewBits == 0)
+ return TLO.CombineTo(Op,TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
+ Op.getValueType(),
+ Op.getOperand(0)));
+
+ // Since some of the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ APInt InDemandedBits = InMask & NewMask;
+ InDemandedBits |= InSignBit;
+ InDemandedBits.trunc(InBits);
+
+ if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ KnownZero.zext(BitWidth);
+ KnownOne.zext(BitWidth);
+
+ // If the sign bit is known zero, convert this to a zero extend.
+ if (KnownZero.intersects(InSignBit))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl,
+ Op.getValueType(),
+ Op.getOperand(0)));
+
+ // If the sign bit is known one, the top bits match.
+ if (KnownOne.intersects(InSignBit)) {
+ KnownOne |= NewBits;
+ KnownZero &= ~NewBits;
+ } else { // Otherwise, top bits aren't known.
+ KnownOne &= ~NewBits;
+ KnownZero &= ~NewBits;
+ }
+ break;
+ }
+ case ISD::ANY_EXTEND: {
+ unsigned OperandBitWidth = Op.getOperand(0).getValueSizeInBits();
+ APInt InMask = NewMask;
+ InMask.trunc(OperandBitWidth);
+ if (SimplifyDemandedBits(Op.getOperand(0), InMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero.zext(BitWidth);
+ KnownOne.zext(BitWidth);
+ break;
+ }
+ case ISD::TRUNCATE: {
+ // Simplify the input, using demanded bit information, and compute the known
+ // zero/one bits live out.
+ APInt TruncMask = NewMask;
+ TruncMask.zext(Op.getOperand(0).getValueSizeInBits());
+ if (SimplifyDemandedBits(Op.getOperand(0), TruncMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ KnownZero.trunc(BitWidth);
+ KnownOne.trunc(BitWidth);
+
+ // If the input is only used by this truncate, see if we can shrink it based
+ // on the known demanded bits.
+ if (Op.getOperand(0).getNode()->hasOneUse()) {
+ SDValue In = Op.getOperand(0);
+ unsigned InBitWidth = In.getValueSizeInBits();
+ switch (In.getOpcode()) {
+ default: break;
+ case ISD::SRL:
+ // Shrink SRL by a constant if none of the high bits shifted in are
+ // demanded.
+ if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(In.getOperand(1))){
+ APInt HighBits = APInt::getHighBitsSet(InBitWidth,
+ InBitWidth - BitWidth);
+ HighBits = HighBits.lshr(ShAmt->getZExtValue());
+ HighBits.trunc(BitWidth);
+
+ if (ShAmt->getZExtValue() < BitWidth && !(HighBits & NewMask)) {
+ // None of the shifted in bits are needed. Add a truncate of the
+ // shift input, then shift it.
+ SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl,
+ Op.getValueType(),
+ In.getOperand(0));
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl,
+ Op.getValueType(),
+ NewTrunc,
+ In.getOperand(1)));
+ }
+ }
+ break;
+ }
+ }
+
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ break;
+ }
+ case ISD::AssertZext: {
+ MVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ APInt InMask = APInt::getLowBitsSet(BitWidth,
+ VT.getSizeInBits());
+ if (SimplifyDemandedBits(Op.getOperand(0), InMask & NewMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero |= ~InMask & NewMask;
+ break;
+ }
+ case ISD::BIT_CONVERT:
+#if 0
+ // If this is an FP->Int bitcast and if the sign bit is the only thing that
+ // is demanded, turn this into a FGETSIGN.
+ if (NewMask == MVT::getIntegerVTSignBit(Op.getValueType()) &&
+ MVT::isFloatingPoint(Op.getOperand(0).getValueType()) &&
+ !MVT::isVector(Op.getOperand(0).getValueType())) {
+ // Only do this xform if FGETSIGN is valid or if before legalize.
+ if (!TLO.AfterLegalize ||
+ isOperationLegal(ISD::FGETSIGN, Op.getValueType())) {
+ // Make a FGETSIGN + SHL to move the sign bit into the appropriate
+ // place. We expect the SHL to be eliminated by other optimizations.
+ SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, Op.getValueType(),
+ Op.getOperand(0));
+ unsigned ShVal = Op.getValueType().getSizeInBits()-1;
+ SDValue ShAmt = TLO.DAG.getConstant(ShVal, getShiftAmountTy());
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, Op.getValueType(),
+ Sign, ShAmt));
+ }
+ }
+#endif
+ break;
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::SUB: {
+ // Add, Sub, and Mul don't demand any bits in positions beyond that
+ // of the highest bit demanded of them.
+ APInt LoMask = APInt::getLowBitsSet(BitWidth,
+ BitWidth - NewMask.countLeadingZeros());
+ if (SimplifyDemandedBits(Op.getOperand(0), LoMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ // See if the operation should be performed at a smaller bit width.
+ if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ return true;
+ }
+ // FALL THROUGH
+ default:
+ // Just use ComputeMaskedBits to compute output bits.
+ TLO.DAG.ComputeMaskedBits(Op, NewMask, KnownZero, KnownOne, Depth);
+ break;
+ }
+
+ // If we know the value of all of the demanded bits, return this as a
+ // constant.
+ if ((NewMask & (KnownZero|KnownOne)) == NewMask)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(KnownOne, Op.getValueType()));
+
+ return false;
+}
+
+/// computeMaskedBitsForTargetNode - Determine which of the bits specified
+/// in Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+ KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+}
+
+/// ComputeNumSignBitsForTargetNode - This method can be implemented by
+/// targets that want to expose additional information about sign bits to the
+/// DAG Combiner.
+unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+ unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use ComputeNumSignBits if you don't know whether Op"
+ " is a target node!");
+ return 1;
+}
+
+/// ValueHasExactlyOneBitSet - Test if the given value is known to have exactly
+/// one bit set. This differs from ComputeMaskedBits in that it doesn't need to
+/// determine which bit is set.
+///
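+/// For example (illustrative): (shl 1, x) is recognized as having exactly
+/// one bit set for any x, as is (srl c, x) when c is a sign-bit constant.
+///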
+static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) {
+ // A left-shift of a constant one will have exactly one bit set, because
+ // shifting the bit off the end is undefined.
+ if (Val.getOpcode() == ISD::SHL)
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Val.getNode()->getOperand(0)))
+ if (C->getAPIntValue() == 1)
+ return true;
+
+ // Similarly, a right-shift of a constant sign-bit will have exactly
+ // one bit set.
+ if (Val.getOpcode() == ISD::SRL)
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Val.getNode()->getOperand(0)))
+ if (C->getAPIntValue().isSignBit())
+ return true;
+
+ // More could be done here, though the above checks are enough
+ // to handle some common cases.
+
+ // Fall back to ComputeMaskedBits to catch other known cases.
+ MVT OpVT = Val.getValueType();
+ unsigned BitWidth = OpVT.getSizeInBits();
+ APInt Mask = APInt::getAllOnesValue(BitWidth);
+ APInt KnownZero, KnownOne;
+ DAG.ComputeMaskedBits(Val, Mask, KnownZero, KnownOne);
+ return (KnownZero.countPopulation() == BitWidth - 1) &&
+ (KnownOne.countPopulation() == 1);
+}
+
+/// SimplifySetCC - Try to simplify a setcc built with the specified operands
+/// and cc. If it is unable to simplify it, return a null SDValue.
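+///
+/// For example (illustrative): with an i8 operand x, (setult (zext x), 256)
+/// folds to the constant 1, and an integer (seteq x, x) likewise folds to 1.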
+SDValue
+TargetLowering::SimplifySetCC(MVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, bool foldBooleans,
+ DAGCombinerInfo &DCI, DebugLoc dl) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // These setcc operations always fold.
+ switch (Cond) {
+ default: break;
+ case ISD::SETFALSE:
+ case ISD::SETFALSE2: return DAG.getConstant(0, VT);
+ case ISD::SETTRUE:
+ case ISD::SETTRUE2: return DAG.getConstant(1, VT);
+ }
+
+ if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+ const APInt &C1 = N1C->getAPIntValue();
+ if (isa<ConstantSDNode>(N0.getNode())) {
+ return DAG.FoldSetCC(VT, N0, N1, Cond, dl);
+ } else {
+ // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
+ // equality comparison, then we're just comparing whether X itself is
+ // zero.
+ if (N0.getOpcode() == ISD::SRL && (C1 == 0 || C1 == 1) &&
+ N0.getOperand(0).getOpcode() == ISD::CTLZ &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ unsigned ShAmt = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ ShAmt == Log2_32(N0.getValueType().getSizeInBits())) {
+ if ((C1 == 0) == (Cond == ISD::SETEQ)) {
+ // (srl (ctlz x), 5) == 0 -> X != 0
+ // (srl (ctlz x), 5) != 1 -> X != 0
+ Cond = ISD::SETNE;
+ } else {
+ // (srl (ctlz x), 5) != 0 -> X == 0
+ // (srl (ctlz x), 5) == 1 -> X == 0
+ Cond = ISD::SETEQ;
+ }
+ SDValue Zero = DAG.getConstant(0, N0.getValueType());
+ return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0),
+ Zero, Cond);
+ }
+ }
+
+ // If the LHS is '(and load, const)', the RHS is 0,
+ // the test is for equality or unsigned, and all 1 bits of the const are
+ // in the same partial word, see if we can shorten the load.
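+ // For example (illustrative): on a little-endian target, the test
+ // ((i32 load p) & 0xFF00) == 0 can be narrowed to an i8 load from
+ // p+1, masked and compared against 0.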
+ if (DCI.isBeforeLegalize() &&
+ N0.getOpcode() == ISD::AND && C1 == 0 &&
+ N0.getNode()->hasOneUse() &&
+ isa<LoadSDNode>(N0.getOperand(0)) &&
+ N0.getOperand(0).getNode()->hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
+ uint64_t bestMask = 0;
+ unsigned bestWidth = 0, bestOffset = 0;
+ if (!Lod->isVolatile() && Lod->isUnindexed() &&
+ // FIXME: This uses getZExtValue() below so it only works on i64 and
+ // below.
+ N0.getValueType().getSizeInBits() <= 64) {
+ unsigned origWidth = N0.getValueType().getSizeInBits();
+ // We can narrow (e.g.) 16-bit extending loads on 32-bit targets to
+ // 8 bits, but have to be careful...
+ if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
+ origWidth = Lod->getMemoryVT().getSizeInBits();
+ uint64_t Mask =cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+ for (unsigned width = origWidth / 2; width>=8; width /= 2) {
+ uint64_t newMask = (1ULL << width) - 1;
+ for (unsigned offset=0; offset<origWidth/width; offset++) {
+ if ((newMask & Mask) == Mask) {
+ if (!TD->isLittleEndian())
+ bestOffset = (origWidth/width - offset - 1) * (width/8);
+ else
+ bestOffset = (uint64_t)offset * (width/8);
+ bestMask = Mask >> (offset * (width/8) * 8);
+ bestWidth = width;
+ break;
+ }
+ newMask = newMask << width;
+ }
+ }
+ }
+ if (bestWidth) {
+ MVT newVT = MVT::getIntegerVT(bestWidth);
+ if (newVT.isRound()) {
+ MVT PtrType = Lod->getOperand(1).getValueType();
+ SDValue Ptr = Lod->getBasePtr();
+ if (bestOffset != 0)
+ Ptr = DAG.getNode(ISD::ADD, dl, PtrType, Lod->getBasePtr(),
+ DAG.getConstant(bestOffset, PtrType));
+ unsigned NewAlign = MinAlign(Lod->getAlignment(), bestOffset);
+ SDValue NewLoad = DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
+ Lod->getSrcValue(),
+ Lod->getSrcValueOffset() + bestOffset,
+ false, NewAlign);
+ return DAG.getSetCC(dl, VT,
+ DAG.getNode(ISD::AND, dl, newVT, NewLoad,
+ DAG.getConstant(bestMask, newVT)),
+ DAG.getConstant(0LL, newVT), Cond);
+ }
+ }
+ }
+
+ // If the LHS is a ZERO_EXTEND, perform the comparison on the input.
+ if (N0.getOpcode() == ISD::ZERO_EXTEND) {
+ unsigned InSize = N0.getOperand(0).getValueType().getSizeInBits();
+
+ // If the comparison constant has bits in the upper part, the
+ // zero-extended value could never match.
+ if (C1.intersects(APInt::getHighBitsSet(C1.getBitWidth(),
+ C1.getBitWidth() - InSize))) {
+ switch (Cond) {
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETEQ: return DAG.getConstant(0, VT);
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETNE: return DAG.getConstant(1, VT);
+ case ISD::SETGT:
+ case ISD::SETGE:
+ // True if the sign bit of C1 is set.
+ return DAG.getConstant(C1.isNegative(), VT);
+ case ISD::SETLT:
+ case ISD::SETLE:
+ // True if the sign bit of C1 isn't set.
+ return DAG.getConstant(C1.isNonNegative(), VT);
+ default:
+ break;
+ }
+ }
+
+ // Otherwise, we can perform the comparison with the low bits.
+ switch (Cond) {
+ case ISD::SETEQ:
+ case ISD::SETNE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ return DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(APInt(C1).trunc(InSize),
+ N0.getOperand(0).getValueType()),
+ Cond);
+ default:
+ break; // TODO: be more careful with signed comparisons.
+ }
+ } else if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+ MVT ExtSrcTy = cast<VTSDNode>(N0.getOperand(1))->getVT();
+ unsigned ExtSrcTyBits = ExtSrcTy.getSizeInBits();
+ MVT ExtDstTy = N0.getValueType();
+ unsigned ExtDstTyBits = ExtDstTy.getSizeInBits();
+
+ // If the extended part has any inconsistent bits, it cannot ever
+ // compare equal. In other words, they have to be all ones or all
+ // zeros.
+ APInt ExtBits =
+ APInt::getHighBitsSet(ExtDstTyBits, ExtDstTyBits - ExtSrcTyBits);
+ if ((C1 & ExtBits) != 0 && (C1 & ExtBits) != ExtBits)
+ return DAG.getConstant(Cond == ISD::SETNE, VT);
+
+ SDValue ZextOp;
+ MVT Op0Ty = N0.getOperand(0).getValueType();
+ if (Op0Ty == ExtSrcTy) {
+ ZextOp = N0.getOperand(0);
+ } else {
+ APInt Imm = APInt::getLowBitsSet(ExtDstTyBits, ExtSrcTyBits);
+ ZextOp = DAG.getNode(ISD::AND, dl, Op0Ty, N0.getOperand(0),
+ DAG.getConstant(Imm, Op0Ty));
+ }
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(ZextOp.getNode());
+ // Otherwise, make this a use of a zext.
+ return DAG.getSetCC(dl, VT, ZextOp,
+ DAG.getConstant(C1 & APInt::getLowBitsSet(
+ ExtDstTyBits,
+ ExtSrcTyBits),
+ ExtDstTy),
+ Cond);
+ } else if ((N1C->isNullValue() || N1C->getAPIntValue() == 1) &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+
+ // SETCC (SETCC), [0|1], [EQ|NE] -> SETCC
+ if (N0.getOpcode() == ISD::SETCC) {
+ bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (N1C->getZExtValue() != 1);
+ if (TrueWhenTrue)
+ return N0;
+
+ // Invert the condition.
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC,
+ N0.getOperand(0).getValueType().isInteger());
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+ }
+
+ if ((N0.getOpcode() == ISD::XOR ||
+ (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::XOR &&
+ N0.getOperand(1) == N0.getOperand(0).getOperand(1))) &&
+ isa<ConstantSDNode>(N0.getOperand(1)) &&
+ cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue() == 1) {
+ // If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We
+ // can only do this if the top bits are known zero.
+ unsigned BitWidth = N0.getValueSizeInBits();
+ if (DAG.MaskedValueIsZero(N0,
+ APInt::getHighBitsSet(BitWidth,
+ BitWidth-1))) {
+ // Okay, get the un-inverted input value.
+ SDValue Val;
+ if (N0.getOpcode() == ISD::XOR)
+ Val = N0.getOperand(0);
+ else {
+ assert(N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::XOR);
+ // ((X^1)&1)^1 -> X & 1
+ Val = DAG.getNode(ISD::AND, dl, N0.getValueType(),
+ N0.getOperand(0).getOperand(0),
+ N0.getOperand(1));
+ }
+ return DAG.getSetCC(dl, VT, Val, N1,
+ Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+ }
+ }
+ }
+
+ APInt MinVal, MaxVal;
+ unsigned OperandBitSize = N1C->getValueType(0).getSizeInBits();
+ if (ISD::isSignedIntSetCC(Cond)) {
+ MinVal = APInt::getSignedMinValue(OperandBitSize);
+ MaxVal = APInt::getSignedMaxValue(OperandBitSize);
+ } else {
+ MinVal = APInt::getMinValue(OperandBitSize);
+ MaxVal = APInt::getMaxValue(OperandBitSize);
+ }
+
+ // Canonicalize GE/LE comparisons to use GT/LT comparisons.
+ if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
+ if (C1 == MinVal) return DAG.getConstant(1, VT); // X >= MIN --> true
+ // X >= C0 --> X > (C0-1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(C1-1, N1.getValueType()),
+ (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT);
+ }
+
+ if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
+ if (C1 == MaxVal) return DAG.getConstant(1, VT); // X <= MAX --> true
+ // X <= C0 --> X < (C0+1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(C1+1, N1.getValueType()),
+ (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT);
+ }
+
+ if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal)
+ return DAG.getConstant(0, VT); // X < MIN --> false
+ if ((Cond == ISD::SETGE || Cond == ISD::SETUGE) && C1 == MinVal)
+ return DAG.getConstant(1, VT); // X >= MIN --> true
+ if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal)
+ return DAG.getConstant(0, VT); // X > MAX --> false
+ if ((Cond == ISD::SETLE || Cond == ISD::SETULE) && C1 == MaxVal)
+ return DAG.getConstant(1, VT); // X <= MAX --> true
+
+ // Canonicalize setgt X, Min --> setne X, Min
+ if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MinVal)
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+ // Canonicalize setlt X, Max --> setne X, Max
+ if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MaxVal)
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+
+ // If we have setult X, 1, turn it into seteq X, 0
+ if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal+1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(MinVal, N0.getValueType()),
+ ISD::SETEQ);
+ // If we have setugt X, Max-1, turn it into seteq X, Max
+ else if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal-1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(MaxVal, N0.getValueType()),
+ ISD::SETEQ);
+
+ // If we have "setcc X, C0", check to see if we can shrink the immediate
+ // by changing cc.
+
+ // SETUGT X, SINTMAX -> SETLT X, 0
+ if (Cond == ISD::SETUGT &&
+ C1 == APInt::getSignedMaxValue(OperandBitSize))
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(0, N1.getValueType()),
+ ISD::SETLT);
+
+ // SETULT X, SINTMIN -> SETGT X, -1
+ if (Cond == ISD::SETULT &&
+ C1 == APInt::getSignedMinValue(OperandBitSize)) {
+ SDValue ConstMinusOne =
+ DAG.getConstant(APInt::getAllOnesValue(OperandBitSize),
+ N1.getValueType());
+ return DAG.getSetCC(dl, VT, N0, ConstMinusOne, ISD::SETGT);
+ }
+
+ // Fold bit comparisons when we can.
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ VT == N0.getValueType() && N0.getOpcode() == ISD::AND)
+ if (ConstantSDNode *AndRHS =
+ dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ MVT ShiftTy = DCI.isBeforeLegalize() ?
+ getPointerTy() : getShiftAmountTy();
+ if (Cond == ISD::SETNE && C1 == 0) { // (X & 8) != 0 --> (X & 8) >> 3
+ // Perform the xform if the AND RHS is a single bit.
+ if (isPowerOf2_64(AndRHS->getZExtValue())) {
+ return DAG.getNode(ISD::SRL, dl, VT, N0,
+ DAG.getConstant(Log2_64(AndRHS->getZExtValue()),
+ ShiftTy));
+ }
+ } else if (Cond == ISD::SETEQ && C1 == AndRHS->getZExtValue()) {
+ // (X & 8) == 8 --> (X & 8) >> 3
+ // Perform the xform if C1 is a single bit.
+ if (C1.isPowerOf2()) {
+ return DAG.getNode(ISD::SRL, dl, VT, N0,
+ DAG.getConstant(C1.logBase2(), ShiftTy));
+ }
+ }
+ }
+ }
+ } else if (isa<ConstantSDNode>(N0.getNode())) {
+ // Ensure that the constant occurs on the RHS.
+ return DAG.getSetCC(dl, VT, N1, N0, ISD::getSetCCSwappedOperands(Cond));
+ }
+
+ if (isa<ConstantFPSDNode>(N0.getNode())) {
+ // Constant fold or commute setcc.
+ SDValue O = DAG.FoldSetCC(VT, N0, N1, Cond, dl);
+ if (O.getNode()) return O;
+ } else if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
+ // If the RHS of an FP comparison is a constant, simplify it away in
+ // some cases.
+ if (CFP->getValueAPF().isNaN()) {
+ // If an operand is known to be a nan, we can fold it.
+ switch (ISD::getUnorderedFlavor(Cond)) {
+ default: assert(0 && "Unknown flavor!");
+ case 0: // Known false.
+ return DAG.getConstant(0, VT);
+ case 1: // Known true.
+ return DAG.getConstant(1, VT);
+ case 2: // Undefined.
+ return DAG.getUNDEF(VT);
+ }
+ }
+
+ // Otherwise, we know the RHS is not a NaN. Simplify the node to drop the
+ // constant if knowing that the operand is non-nan is enough. We prefer to
+ // have SETO(x,x) instead of SETO(x, 0.0) because this avoids having to
+ // materialize 0.0.
+ if (Cond == ISD::SETO || Cond == ISD::SETUO)
+ return DAG.getSetCC(dl, VT, N0, N0, Cond);
+ }
+
+ if (N0 == N1) {
+ // We can always fold X == X for integer setcc's.
+ if (N0.getValueType().isInteger())
+ return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT);
+ unsigned UOF = ISD::getUnorderedFlavor(Cond);
+ if (UOF == 2) // FP operators that are undefined on NaNs.
+ return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT);
+ if (UOF == unsigned(ISD::isTrueWhenEqual(Cond)))
+ return DAG.getConstant(UOF, VT);
+ // Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO
+ // if it is not already.
+ ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
+ if (NewCond != Cond)
+ return DAG.getSetCC(dl, VT, N0, N1, NewCond);
+ }
+
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getValueType().isInteger()) {
+ if (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB ||
+ N0.getOpcode() == ISD::XOR) {
+ // Simplify (X+Y) == (X+Z) --> Y == Z
+ if (N0.getOpcode() == N1.getOpcode()) {
+ if (N0.getOperand(0) == N1.getOperand(0))
+ return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond);
+ if (N0.getOperand(1) == N1.getOperand(1))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond);
+ if (DAG.isCommutativeBinOp(N0.getOpcode())) {
+ // If X op Y == Y op X, try other combinations.
+ if (N0.getOperand(0) == N1.getOperand(1))
+ return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0),
+ Cond);
+ if (N0.getOperand(1) == N1.getOperand(0))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(1),
+ Cond);
+ }
+ }
+
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(N1)) {
+ if (ConstantSDNode *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ // Turn (X+C1) == C2 --> X == C2-C1
+ if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) {
+ return DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(RHSC->getAPIntValue()-
+ LHSR->getAPIntValue(),
+ N0.getValueType()), Cond);
+ }
+
+ // Turn (X^C1) == C2 into X == C1^C2 iff X&~C1 == 0.
+ if (N0.getOpcode() == ISD::XOR)
+ // If we know that all of the inverted bits are zero, don't bother
+ // performing the inversion.
+ if (DAG.MaskedValueIsZero(N0.getOperand(0), ~LHSR->getAPIntValue()))
+ return
+ DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(LHSR->getAPIntValue() ^
+ RHSC->getAPIntValue(),
+ N0.getValueType()),
+ Cond);
+ }
+
+ // Turn (C1-X) == C2 --> X == C1-C2
+ if (ConstantSDNode *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+ if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) {
+ return
+ DAG.getSetCC(dl, VT, N0.getOperand(1),
+ DAG.getConstant(SUBC->getAPIntValue() -
+ RHSC->getAPIntValue(),
+ N0.getValueType()),
+ Cond);
+ }
+ }
+ }
+
+ // Simplify (X+Z) == X --> Z == 0
+ if (N0.getOperand(0) == N1)
+ return DAG.getSetCC(dl, VT, N0.getOperand(1),
+ DAG.getConstant(0, N0.getValueType()), Cond);
+ if (N0.getOperand(1) == N1) {
+ if (DAG.isCommutativeBinOp(N0.getOpcode()))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(0, N0.getValueType()), Cond);
+ else if (N0.getNode()->hasOneUse()) {
+ assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!");
+ // (Z-X) == X --> Z == X<<1
+ SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(),
+ N1,
+ DAG.getConstant(1, getShiftAmountTy()));
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(SH.getNode());
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond);
+ }
+ }
+ }
+
+ if (N1.getOpcode() == ISD::ADD || N1.getOpcode() == ISD::SUB ||
+ N1.getOpcode() == ISD::XOR) {
+ // Simplify X == (X+Z) --> Z == 0
+ if (N1.getOperand(0) == N0) {
+ return DAG.getSetCC(dl, VT, N1.getOperand(1),
+ DAG.getConstant(0, N1.getValueType()), Cond);
+ } else if (N1.getOperand(1) == N0) {
+ if (DAG.isCommutativeBinOp(N1.getOpcode())) {
+ return DAG.getSetCC(dl, VT, N1.getOperand(0),
+ DAG.getConstant(0, N1.getValueType()), Cond);
+ } else if (N1.getNode()->hasOneUse()) {
+ assert(N1.getOpcode() == ISD::SUB && "Unexpected operation!");
+ // X == (Z-X) --> X<<1 == Z
+ SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), N0,
+ DAG.getConstant(1, getShiftAmountTy()));
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(SH.getNode());
+ return DAG.getSetCC(dl, VT, SH, N1.getOperand(0), Cond);
+ }
+ }
+ }
+
+ // Simplify x&y == y to x&y != 0 if y has exactly one bit set.
+ // Note that where y is variable and is known to have at most
+ // one bit set (for example, if it is z&1) we cannot do this;
+ // the expressions are not equivalent when y==0.
+ if (N0.getOpcode() == ISD::AND)
+ if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
+ if (ValueHasExactlyOneBitSet(N1, DAG)) {
+ Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+ SDValue Zero = DAG.getConstant(0, N1.getValueType());
+ return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+ }
+ }
+ if (N1.getOpcode() == ISD::AND)
+ if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
+ if (ValueHasExactlyOneBitSet(N0, DAG)) {
+ Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+ SDValue Zero = DAG.getConstant(0, N0.getValueType());
+ return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+ }
+ }
+ }
+
+ // Fold away ALL boolean setcc's.
+ SDValue Temp;
+ if (N0.getValueType() == MVT::i1 && foldBooleans) {
+ switch (Cond) {
+ default: assert(0 && "Unknown integer setcc!");
+ case ISD::SETEQ: // X == Y -> ~(X^Y)
+ Temp = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+ N0 = DAG.getNOT(dl, Temp, MVT::i1);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETNE: // X != Y --> (X^Y)
+ N0 = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+ break;
+ case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y
+ case ISD::SETULT: // X <u Y --> X == 0 & Y == 1 --> ~X & Y
+ Temp = DAG.getNOT(dl, N0, MVT::i1);
+ N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N1, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETLT: // X <s Y --> X == 1 & Y == 0 --> ~Y & X
+ case ISD::SETUGT: // X >u Y --> X == 1 & Y == 0 --> ~Y & X
+ Temp = DAG.getNOT(dl, N1, MVT::i1);
+ N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N0, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETULE: // X <=u Y --> X == 0 | Y == 1 --> ~X | Y
+ case ISD::SETGE: // X >=s Y --> X == 0 | Y == 1 --> ~X | Y
+ Temp = DAG.getNOT(dl, N0, MVT::i1);
+ N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N1, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETUGE: // X >=u Y --> X == 1 | Y == 0 --> ~Y | X
+ case ISD::SETLE: // X <=s Y --> X == 1 | Y == 0 --> ~Y | X
+ Temp = DAG.getNOT(dl, N1, MVT::i1);
+ N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N0, Temp);
+ break;
+ }
+ if (VT != MVT::i1) {
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(N0.getNode());
+ // FIXME: If running after legalize, we probably can't do this.
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0);
+ }
+ return N0;
+ }
+
+ // Could not fold it.
+ return SDValue();
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
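+///
+/// For example (illustrative): for the node (add (GlobalAddress @g, 8), 4),
+/// this returns true with GA set to @g and Offset increased by 12.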
+bool TargetLowering::isGAPlusOffset(SDNode *N, GlobalValue* &GA,
+ int64_t &Offset) const {
+ if (isa<GlobalAddressSDNode>(N)) {
+ GlobalAddressSDNode *GASD = cast<GlobalAddressSDNode>(N);
+ GA = GASD->getGlobal();
+ Offset += GASD->getOffset();
+ return true;
+ }
+
+ if (N->getOpcode() == ISD::ADD) {
+ SDValue N1 = N->getOperand(0);
+ SDValue N2 = N->getOperand(1);
+ if (isGAPlusOffset(N1.getNode(), GA, Offset)) {
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2);
+ if (V) {
+ Offset += V->getSExtValue();
+ return true;
+ }
+ } else if (isGAPlusOffset(N2.getNode(), GA, Offset)) {
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1);
+ if (V) {
+ Offset += V->getSExtValue();
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+/// isConsecutiveLoad - Return true if LD (which must be a LoadSDNode) is
+/// loading 'Bytes' bytes from a location that is 'Dist' units away from the
+/// location that the 'Base' load is loading from.
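+///
+/// For example (illustrative): two i32 loads of the same chain from fixed
+/// stack objects at offsets 16 and 20 are consecutive for Bytes == 4 and
+/// Dist == 1.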
+bool TargetLowering::isConsecutiveLoad(SDNode *LD, SDNode *Base,
+ unsigned Bytes, int Dist,
+ const MachineFrameInfo *MFI) const {
+ if (LD->getOperand(0).getNode() != Base->getOperand(0).getNode())
+ return false;
+ MVT VT = LD->getValueType(0);
+ if (VT.getSizeInBits() / 8 != Bytes)
+ return false;
+
+ SDValue Loc = LD->getOperand(1);
+ SDValue BaseLoc = Base->getOperand(1);
+ if (Loc.getOpcode() == ISD::FrameIndex) {
+ if (BaseLoc.getOpcode() != ISD::FrameIndex)
+ return false;
+ int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
+ int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+ int FS = MFI->getObjectSize(FI);
+ int BFS = MFI->getObjectSize(BFI);
+ if (FS != BFS || FS != (int)Bytes) return false;
+ return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+ }
+
+ GlobalValue *GV1 = NULL;
+ GlobalValue *GV2 = NULL;
+ int64_t Offset1 = 0;
+ int64_t Offset2 = 0;
+ bool isGA1 = isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+ bool isGA2 = isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+ if (isGA1 && isGA2 && GV1 == GV2)
+ return Offset1 == (Offset2 + Dist*Bytes);
+ return false;
+}
+
+
+SDValue TargetLowering::
+PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+ // Default implementation: no optimization.
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembler Implementation Methods
+//===----------------------------------------------------------------------===//
+
+
+TargetLowering::ConstraintType
+TargetLowering::getConstraintType(const std::string &Constraint) const {
+ // FIXME: lots more standard ones to handle.
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': return C_RegisterClass;
+ case 'm': // memory
+ case 'o': // offsetable
+ case 'V': // not offsetable
+ return C_Memory;
+ case 'i': // Simple Integer or Relocatable Constant
+ case 'n': // Simple Integer
+ case 's': // Relocatable Constant
+ case 'X': // Allow ANY value.
+ case 'I': // Target registers.
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ return C_Other;
+ }
+ }
+
+ if (Constraint.size() > 1 && Constraint[0] == '{' &&
+ Constraint[Constraint.size()-1] == '}')
+ return C_Register;
+ return C_Unknown;
+}
+
+/// LowerXConstraint - Try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *TargetLowering::LowerXConstraint(MVT ConstraintVT) const{
+ if (ConstraintVT.isInteger())
+ return "r";
+ if (ConstraintVT.isFloatingPoint())
+ return "f"; // works for many targets
+ return 0;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ char ConstraintLetter,
+ bool hasMemory,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ switch (ConstraintLetter) {
+ default: break;
+ case 'X': // Allows any operand; labels (basic block) use this.
+ if (Op.getOpcode() == ISD::BasicBlock) {
+ Ops.push_back(Op);
+ return;
+ }
+ // fall through
+ case 'i': // Simple Integer or Relocatable Constant
+ case 'n': // Simple Integer
+ case 's': { // Relocatable Constant
+ // These operands are interested in values of the form (GV+C), where C may
+ // be folded in as an offset of GV, or it may be explicitly added. Also, it
+ // is possible and fine if either GV or C is missing.
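+ // For example (illustrative): "@g + 4" satisfies 'i' and 's', while a
+ // bare constant such as 42 satisfies 'i' and 'n' but not 's'.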
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+
+ // If we have "(add GV, C)", pull out GV/C
+ if (Op.getOpcode() == ISD::ADD) {
+ C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
+ if (C == 0 || GA == 0) {
+ C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
+ }
+ if (C == 0 || GA == 0)
+ C = 0, GA = 0;
+ }
+
+ // If we find a valid operand, map to the TargetXXX version so that the
+ // value itself doesn't get selected.
+ if (GA) { // Either &GV or &GV+C
+ if (ConstraintLetter != 'n') {
+ int64_t Offs = GA->getOffset();
+ if (C) Offs += C->getZExtValue();
+ Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(),
+ Op.getValueType(), Offs));
+ return;
+ }
+ }
+ if (C) { // just C, no GV.
+ // Simple constants are not allowed for 's'.
+ if (ConstraintLetter != 's') {
+ // gcc prints these as sign extended. Sign extend value to 64 bits
+ // now; without this it would get ZExt'd later in
+ // ScheduleDAGSDNodes::EmitNode, which is very generic.
+ Ops.push_back(DAG.getTargetConstant(C->getAPIntValue().getSExtValue(),
+ MVT::i64));
+ return;
+ }
+ }
+ break;
+ }
+ }
+}
+
+std::vector<unsigned> TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ return std::vector<unsigned>();
+}
+
+
+std::pair<unsigned, const TargetRegisterClass*> TargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint[0] != '{')
+ return std::pair<unsigned, const TargetRegisterClass*>(0, 0);
+ assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
+
+ // Remove the braces from around the name.
+ std::string RegName(Constraint.begin()+1, Constraint.end()-1);
+
+ // Figure out which register class contains this reg.
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
+ E = RI->regclass_end(); RCI != E; ++RCI) {
+ const TargetRegisterClass *RC = *RCI;
+
+ // If none of the value types for this register class are valid, we
+ // can't use it. For example, 64-bit reg classes on 32-bit targets.
+ bool isLegal = false;
+ for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
+ I != E; ++I) {
+ if (isTypeLegal(*I)) {
+ isLegal = true;
+ break;
+ }
+ }
+
+ if (!isLegal) continue;
+
+ for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+ I != E; ++I) {
+ if (StringsEqualNoCase(RegName, RI->get(*I).AsmName))
+ return std::make_pair(*I, RC);
+ }
+ }
+
+ return std::pair<unsigned, const TargetRegisterClass*>(0, 0);
+}
+
+//===----------------------------------------------------------------------===//
+// Constraint Selection.
+
+/// isMatchingInputConstraint - Return true if this is an input operand that is
+/// a matching constraint like "4".
+bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const {
+ assert(!ConstraintCode.empty() && "No known constraint!");
+ return isdigit(ConstraintCode[0]);
+}
+
+/// getMatchedOperand - If this is an input matching constraint, this method
+/// returns the output operand it matches.
+unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
+ assert(!ConstraintCode.empty() && "No known constraint!");
+ return atoi(ConstraintCode.c_str());
+}
+
+
+/// getConstraintGenerality - Return an integer indicating how general CT
+/// is.
+static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
+ switch (CT) {
+ default: assert(0 && "Unknown constraint type!");
+ case TargetLowering::C_Other:
+ case TargetLowering::C_Unknown:
+ return 0;
+ case TargetLowering::C_Register:
+ return 1;
+ case TargetLowering::C_RegisterClass:
+ return 2;
+ case TargetLowering::C_Memory:
+ return 3;
+ }
+}
+
+/// ChooseConstraint - If there are multiple different constraints that we
+/// could pick for this operand (e.g. "imr") try to pick the 'best' one.
+/// This is somewhat tricky: constraints fall into four classes:
+/// Other -> immediates and magic values
+/// Register -> one specific register
+/// RegisterClass -> a group of regs
+/// Memory -> memory
+/// Ideally, we would pick the most specific constraint possible: if we have
+/// something that fits into a register, we would pick it. The problem here
+/// is that if we have something that could either be in a register or in
+/// memory, then using the register could cause selection of *other*
+/// operands to fail: they might only succeed if we pick memory. Because of
+/// this, the heuristic we use is:
+///
+/// 1) If there is an 'other' constraint, and if the operand is valid for
+/// that constraint, use it. This makes us take advantage of 'i'
+/// constraints when available.
+/// 2) Otherwise, pick the most general constraint present. This prefers
+/// 'm' over 'r', for example.
+///
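+///
+/// For example (illustrative): given the constraint string "imr", a constant
+/// operand such as 42 is matched by 'i' under rule 1, while a non-constant
+/// operand falls through to rule 2 and picks the more general 'm'.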
+static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
+ bool hasMemory, const TargetLowering &TLI,
+ SDValue Op, SelectionDAG *DAG) {
+ assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
+ unsigned BestIdx = 0;
+ TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
+ int BestGenerality = -1;
+
+ // Loop over the options, keeping track of the most general one.
+ for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
+ TargetLowering::ConstraintType CType =
+ TLI.getConstraintType(OpInfo.Codes[i]);
+
+ // If this is an 'other' constraint, see if the operand is valid for it.
+ // For example, on X86 we might have an 'rI' constraint. If the operand
+ // is an integer in the range [0..31] we want to use I (saving a load
+ // of a register), otherwise we must use 'r'.
+ if (CType == TargetLowering::C_Other && Op.getNode()) {
+ assert(OpInfo.Codes[i].size() == 1 &&
+ "Unhandled multi-letter 'other' constraint");
+ std::vector<SDValue> ResultOps;
+ TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0], hasMemory,
+ ResultOps, *DAG);
+ if (!ResultOps.empty()) {
+ BestType = CType;
+ BestIdx = i;
+ break;
+ }
+ }
+
+ // If this constraint letter is more general than the previous one, use it.
+ int Generality = getConstraintGenerality(CType);
+ if (Generality > BestGenerality) {
+ BestType = CType;
+ BestIdx = i;
+ BestGenerality = Generality;
+ }
+ }
+
+ OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
+ OpInfo.ConstraintType = BestType;
+}
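+
+// For example (an illustrative sketch): given the constraint string "imr"
+// on X86, an operand that is the constant 4 satisfies the 'i' ('other')
+// constraint and is chosen by rule 1 above; a non-constant operand falls
+// through to rule 2, which picks 'm' (Memory, generality 3) over 'r'
+// (RegisterClass, generality 2).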
+
+/// ComputeConstraintToUse - Determines the constraint code and constraint
+/// type to use for the specific AsmOperandInfo, setting
+/// OpInfo.ConstraintCode and OpInfo.ConstraintType.
+void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
+ SDValue Op,
+ bool hasMemory,
+ SelectionDAG *DAG) const {
+ assert(!OpInfo.Codes.empty() && "Must have at least one constraint");
+
+ // Single-letter constraints ('r') are very common.
+ if (OpInfo.Codes.size() == 1) {
+ OpInfo.ConstraintCode = OpInfo.Codes[0];
+ OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+ } else {
+ ChooseConstraint(OpInfo, hasMemory, *this, Op, DAG);
+ }
+
+ // 'X' matches anything.
+ if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) {
+ // Labels and constants are handled elsewhere ('X' is the only thing
+ // that matches labels).
+ if (isa<BasicBlock>(OpInfo.CallOperandVal) ||
+ isa<ConstantInt>(OpInfo.CallOperandVal))
+ return;
+
+ // Otherwise, try to resolve it to something we know about by looking at
+ // the actual operand type.
+ if (const char *Repl = LowerXConstraint(OpInfo.ConstraintVT)) {
+ OpInfo.ConstraintCode = Repl;
+ OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Loop Strength Reduction hooks
+//===----------------------------------------------------------------------===//
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const {
+ // The default implementation supports a conservative RISC-style r+r and
+ // r+i addressing mode.
+
+ // Allows a sign-extended 16-bit immediate field.
+ if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+ return false;
+
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // Only support r+r,
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
+ return false;
+ // Otherwise we have r+r or r+i.
+ break;
+ case 2:
+ if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
+ return false;
+ // Allow 2*r as r+r.
+ break;
+ default: // Conservatively reject any larger scale (n*r).
+ return false;
+ }
+
+ return true;
+}
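+
+// For example (an illustrative sketch of the default rules above):
+//   reg + 100       -> legal   (Scale = 0, small immediate)
+//   reg + reg       -> legal   (Scale = 1, BaseOffs = 0)
+//   reg + reg + 4   -> illegal (Scale = 1 with both base reg and offset)
+//   2*reg           -> legal   (treated as reg + reg)
+//   2*reg + reg     -> illegal
+//   reg + (1 << 20) -> illegal (offset outside the 16-bit field)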
+
+/// BuildSDIV - Given an ISD::SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number. See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
+ std::vector<SDNode*>* Created) const {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl= N->getDebugLoc();
+
+ // Check to see if we can do this.
+ // FIXME: We should be more aggressive here.
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ APInt d = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+ APInt::ms magics = d.magic();
+
+ // Multiply the numerator (operand 0) by the magic value
+ // FIXME: We should support doing a MUL in a wider type
+ SDValue Q;
+ if (isOperationLegalOrCustom(ISD::MULHS, VT))
+ Q = DAG.getNode(ISD::MULHS, dl, VT, N->getOperand(0),
+ DAG.getConstant(magics.m, VT));
+ else if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT))
+ Q = SDValue(DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT),
+ N->getOperand(0),
+ DAG.getConstant(magics.m, VT)).getNode(), 1);
+ else
+ return SDValue(); // No MULHS or equivalent.
+ // If d > 0 and m < 0, add the numerator
+ if (d.isStrictlyPositive() && magics.m.isNegative()) {
+ Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0));
+ if (Created)
+ Created->push_back(Q.getNode());
+ }
+ // If d < 0 and m > 0, subtract the numerator.
+ if (d.isNegative() && magics.m.isStrictlyPositive()) {
+ Q = DAG.getNode(ISD::SUB, dl, VT, Q, N->getOperand(0));
+ if (Created)
+ Created->push_back(Q.getNode());
+ }
+ // Shift right algebraic if shift value is nonzero
+ if (magics.s > 0) {
+ Q = DAG.getNode(ISD::SRA, dl, VT, Q,
+ DAG.getConstant(magics.s, getShiftAmountTy()));
+ if (Created)
+ Created->push_back(Q.getNode());
+ }
+ // Extract the sign bit and add it to the quotient
+ SDValue T =
+ DAG.getNode(ISD::SRL, dl, VT, Q, DAG.getConstant(VT.getSizeInBits()-1,
+ getShiftAmountTy()));
+ if (Created)
+ Created->push_back(T.getNode());
+ return DAG.getNode(ISD::ADD, dl, VT, Q, T);
+}
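+
+// For example (an illustrative sketch, using the well-known 32-bit magic
+// constants for d = 7, where d.magic() yields m = 0x92492493 and s = 2),
+// the expansion built above is:
+//   Q = MULHS(N0, 0x92492493)
+//   Q = ADD(Q, N0)  // d > 0 and m < 0, so add the numerator
+//   Q = SRA(Q, 2)   // shift right by s
+//   T = SRL(Q, 31)  // extract the sign bit
+//   Q = ADD(Q, T)   // e.g. 7/7 = 1 and -7/7 = -1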
+
+/// BuildUDIV - Given an ISD::UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number. See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
+ std::vector<SDNode*>* Created) const {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ // Check to see if we can do this.
+ // FIXME: We should be more aggressive here.
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ // FIXME: We should use a narrower constant when the upper
+ // bits are known to be zero.
+ ConstantSDNode *N1C = cast<ConstantSDNode>(N->getOperand(1));
+ APInt::mu magics = N1C->getAPIntValue().magicu();
+
+ // Multiply the numerator (operand 0) by the magic value
+ // FIXME: We should support doing a MUL in a wider type
+ SDValue Q;
+ if (isOperationLegalOrCustom(ISD::MULHU, VT))
+ Q = DAG.getNode(ISD::MULHU, dl, VT, N->getOperand(0),
+ DAG.getConstant(magics.m, VT));
+ else if (isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
+ Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT),
+ N->getOperand(0),
+ DAG.getConstant(magics.m, VT)).getNode(), 1);
+ else
+ return SDValue(); // No MULHU or equivalent.
+ if (Created)
+ Created->push_back(Q.getNode());
+
+ if (magics.a == 0) {
+ assert(magics.s < N1C->getAPIntValue().getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ return DAG.getNode(ISD::SRL, dl, VT, Q,
+ DAG.getConstant(magics.s, getShiftAmountTy()));
+ } else {
+ SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
+ if (Created)
+ Created->push_back(NPQ.getNode());
+ NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ,
+ DAG.getConstant(1, getShiftAmountTy()));
+ if (Created)
+ Created->push_back(NPQ.getNode());
+ NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
+ if (Created)
+ Created->push_back(NPQ.getNode());
+ return DAG.getNode(ISD::SRL, dl, VT, NPQ,
+ DAG.getConstant(magics.s-1, getShiftAmountTy()));
+ }
+}
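+
+// For example (an illustrative sketch, using the well-known 32-bit magic
+// constants for d = 7, where magicu() yields m = 0x24924925, a = 1, s = 3),
+// the expansion built above takes the magics.a != 0 path:
+//   Q   = MULHU(N0, 0x24924925)
+//   NPQ = SUB(N0, Q)
+//   NPQ = SRL(NPQ, 1)
+//   NPQ = ADD(NPQ, Q)
+//   Q   = SRL(NPQ, 2)  // shift by s-1; e.g. 7/7 = 1, 0xFFFFFFFF/7 = 0x24924924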
+
+/// IgnoreHarmlessInstructions - Ignore instructions between a CALL and RET
+/// node that don't prevent tail call optimization.
+static SDValue IgnoreHarmlessInstructions(SDValue node) {
+ // Found call return.
+ if (node.getOpcode() == ISD::CALL) return node;
+ // Ignore MERGE_VALUES. Will have at least one operand.
+ if (node.getOpcode() == ISD::MERGE_VALUES)
+ return IgnoreHarmlessInstructions(node.getOperand(0));
+ // Ignore ANY_EXTEND node.
+ if (node.getOpcode() == ISD::ANY_EXTEND)
+ return IgnoreHarmlessInstructions(node.getOperand(0));
+ if (node.getOpcode() == ISD::TRUNCATE)
+ return IgnoreHarmlessInstructions(node.getOperand(0));
+ // Any other node type.
+ return node;
+}
+
+bool TargetLowering::CheckTailCallReturnConstraints(CallSDNode *TheCall,
+ SDValue Ret) {
+ unsigned NumOps = Ret.getNumOperands();
+ // ISD::CALL results:(value0, ..., valuen, chain)
+ // ISD::RET operands:(chain, value0, flag0, ..., valuen, flagn)
+ // Value return:
+ // Check that the operand of the RET node sources from the CALL node. The RET node
+ // has at least two operands. Operand 0 holds the chain. Operand 1 holds the
+ // value.
+ if (NumOps > 1 &&
+ IgnoreHarmlessInstructions(Ret.getOperand(1)) == SDValue(TheCall,0))
+ return true;
+ // void return: The RET node has the chain result value of the CALL node as
+ // input.
+ if (NumOps == 1 &&
+ Ret.getOperand(0) == SDValue(TheCall, TheCall->getNumValues()-1))
+ return true;
+
+ return false;
+}
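+
+// For example (an illustrative sketch): for a function whose body is just
+// "return f(x);", the CALL node produces results (value0, chain) and the
+// RET node takes operands (chain, value0, flag). Ret.getOperand(1), traced
+// through any MERGE_VALUES/ANY_EXTEND/TRUNCATE nodes, is SDValue(TheCall, 0),
+// so the value-return check above succeeds.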
diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp
new file mode 100644
index 0000000..2402f81
--- /dev/null
+++ b/lib/CodeGen/ShadowStackGC.cpp
@@ -0,0 +1,439 @@
+//===-- ShadowStackGC.cpp - GC support for uncooperative targets ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements lowering for the llvm.gc* intrinsics for targets that do
+// not natively support them (which includes the C backend). Note that the code
+// generated is not quite as efficient as algorithms which generate stack maps
+// to identify roots.
+//
+// This pass implements the code transformation described in this paper:
+// "Accurate Garbage Collection in an Uncooperative Environment"
+// Fergus Henderson, ISMM, 2002
+//
+// runtime/GC/SemiSpace.cpp contains a prototype runtime which is compatible
+// with ShadowStackGC.
+//
+// In order to support this particular transformation, all stack roots are
+// co-allocated on the stack. This allows a fully target-independent stack map
+// while introducing only minor runtime overhead.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "shadowstackgc"
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/IRBuilder.h"
+
+using namespace llvm;
+
+namespace {
+
+ class VISIBILITY_HIDDEN ShadowStackGC : public GCStrategy {
+ /// Head - The head of the global linked list ("llvm_gc_root_chain") that
+ /// contains the chain of GC roots.
+ GlobalVariable *Head;
+
+ /// StackEntryTy - Abstract type of a link in the shadow stack.
+ ///
+ const StructType *StackEntryTy;
+
+ /// Roots - GC roots in the current function. Each is a pair of the
+ /// intrinsic call and its corresponding alloca.
+ std::vector<std::pair<CallInst*,AllocaInst*> > Roots;
+
+ public:
+ ShadowStackGC();
+
+ bool initializeCustomLowering(Module &M);
+ bool performCustomLowering(Function &F);
+
+ private:
+ bool IsNullValue(Value *V);
+ Constant *GetFrameMap(Function &F);
+ const Type* GetConcreteStackEntryType(Function &F);
+ void CollectRoots(Function &F);
+ static GetElementPtrInst *CreateGEP(IRBuilder<> &B, Value *BasePtr,
+ int Idx1, const char *Name);
+ static GetElementPtrInst *CreateGEP(IRBuilder<> &B, Value *BasePtr,
+ int Idx1, int Idx2, const char *Name);
+ };
+
+}
+
+static GCRegistry::Add<ShadowStackGC>
+X("shadow-stack", "Very portable GC for uncooperative code generators");
+
+namespace {
+ /// EscapeEnumerator - This is a little algorithm to find all escape points
+ /// from a function so that "finally"-style code can be inserted. In addition
+ /// to finding the existing return and unwind instructions, it also (if
+ /// necessary) transforms any call instructions into invokes and sends them to
+ /// a landing pad.
+ ///
+ /// It's wrapped up in a state machine using the same transform C# uses for
+ /// 'yield return' enumerators. This transform allows it to be non-allocating.
+ class VISIBILITY_HIDDEN EscapeEnumerator {
+ Function &F;
+ const char *CleanupBBName;
+
+ // State.
+ int State;
+ Function::iterator StateBB, StateE;
+ IRBuilder<> Builder;
+
+ public:
+ EscapeEnumerator(Function &F, const char *N = "cleanup")
+ : F(F), CleanupBBName(N), State(0) {}
+
+ IRBuilder<> *Next() {
+ switch (State) {
+ default:
+ return 0;
+
+ case 0:
+ StateBB = F.begin();
+ StateE = F.end();
+ State = 1;
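+ // Fall through into state 1.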
+
+ case 1:
+ // Find all 'return' and 'unwind' instructions.
+ while (StateBB != StateE) {
+ BasicBlock *CurBB = StateBB++;
+
+ // Branches and invokes do not escape, only unwind and return do.
+ TerminatorInst *TI = CurBB->getTerminator();
+ if (!isa<UnwindInst>(TI) && !isa<ReturnInst>(TI))
+ continue;
+
+ Builder.SetInsertPoint(TI->getParent(), TI);
+ return &Builder;
+ }
+
+ State = 2;
+
+ // Find all 'call' instructions.
+ SmallVector<Instruction*,16> Calls;
+ for (Function::iterator BB = F.begin(),
+ E = F.end(); BB != E; ++BB)
+ for (BasicBlock::iterator II = BB->begin(),
+ EE = BB->end(); II != EE; ++II)
+ if (CallInst *CI = dyn_cast<CallInst>(II))
+ if (!CI->getCalledFunction() ||
+ !CI->getCalledFunction()->getIntrinsicID())
+ Calls.push_back(CI);
+
+ if (Calls.empty())
+ return 0;
+
+ // Create a cleanup block.
+ BasicBlock *CleanupBB = BasicBlock::Create(CleanupBBName, &F);
+ UnwindInst *UI = new UnwindInst(CleanupBB);
+
+ // Transform the 'call' instructions into 'invoke's branching to the
+ // cleanup block. Go in reverse order to make prettier BB names.
+ SmallVector<Value*,16> Args;
+ for (unsigned I = Calls.size(); I != 0; ) {
+ CallInst *CI = cast<CallInst>(Calls[--I]);
+
+ // Split the basic block containing the function call.
+ BasicBlock *CallBB = CI->getParent();
+ BasicBlock *NewBB =
+ CallBB->splitBasicBlock(CI, CallBB->getName() + ".cont");
+
+ // Remove the unconditional branch inserted at the end of CallBB.
+ CallBB->getInstList().pop_back();
+ NewBB->getInstList().remove(CI);
+
+ // Create a new invoke instruction.
+ Args.clear();
+ Args.append(CI->op_begin() + 1, CI->op_end());
+
+ InvokeInst *II = InvokeInst::Create(CI->getOperand(0),
+ NewBB, CleanupBB,
+ Args.begin(), Args.end(),
+ CI->getName(), CallBB);
+ II->setCallingConv(CI->getCallingConv());
+ II->setAttributes(CI->getAttributes());
+ CI->replaceAllUsesWith(II);
+ delete CI;
+ }
+
+ Builder.SetInsertPoint(UI->getParent(), UI);
+ return &Builder;
+ }
+ }
+ };
+}
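+
+// Typical use of EscapeEnumerator (an illustrative sketch; the real use is
+// in performCustomLowering below): drain the enumerator and emit
+// "finally"-style code at every escape point:
+//   EscapeEnumerator EE(F, "cleanup");
+//   while (IRBuilder<> *AtExit = EE.Next())
+//     /* emit cleanup code through *AtExit */;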
+
+// -----------------------------------------------------------------------------
+
+void llvm::linkShadowStackGC() { }
+
+ShadowStackGC::ShadowStackGC() : Head(0), StackEntryTy(0) {
+ InitRoots = true;
+ CustomRoots = true;
+}
+
+Constant *ShadowStackGC::GetFrameMap(Function &F) {
+ // initializeCustomLowering creates the abstract type of this value.
+
+ Type *VoidPtr = PointerType::getUnqual(Type::Int8Ty);
+
+ // Truncate the ShadowStackDescriptor if some metadata is null.
+ unsigned NumMeta = 0;
+ SmallVector<Constant*,16> Metadata;
+ for (unsigned I = 0; I != Roots.size(); ++I) {
+ Constant *C = cast<Constant>(Roots[I].first->getOperand(2));
+ if (!C->isNullValue())
+ NumMeta = I + 1;
+ Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr));
+ }
+
+ Constant *BaseElts[] = {
+ ConstantInt::get(Type::Int32Ty, Roots.size(), false),
+ ConstantInt::get(Type::Int32Ty, NumMeta, false),
+ };
+
+ Constant *DescriptorElts[] = {
+ ConstantStruct::get(BaseElts, 2),
+ ConstantArray::get(ArrayType::get(VoidPtr, NumMeta),
+ Metadata.begin(), NumMeta)
+ };
+
+ Constant *FrameMap = ConstantStruct::get(DescriptorElts, 2);
+
+ std::string TypeName("gc_map.");
+ TypeName += utostr(NumMeta);
+ F.getParent()->addTypeName(TypeName, FrameMap->getType());
+
+ // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims? Seems
+ // that, short of multithreaded LLVM, it should be safe; all that is
+ // necessary is that a simple Module::iterator loop not be invalidated.
+ // Appending to the GlobalVariable list is safe in that sense.
+ //
+ // All of the output passes emit globals last. The ExecutionEngine
+ // explicitly supports adding globals to the module after
+ // initialization.
+ //
+ // Still, if it isn't deemed acceptable, then this transformation needs
+ // to be a ModulePass (which means it cannot be in the 'llc' pipeline
+ // (which uses a FunctionPassManager (which segfaults (not asserts) if
+ // provided a ModulePass))).
+ Constant *GV = new GlobalVariable(FrameMap->getType(), true,
+ GlobalVariable::InternalLinkage,
+ FrameMap, "__gc_" + F.getName(),
+ F.getParent());
+
+ Constant *GEPIndices[2] = { ConstantInt::get(Type::Int32Ty, 0),
+ ConstantInt::get(Type::Int32Ty, 0) };
+ return ConstantExpr::getGetElementPtr(GV, GEPIndices, 2);
+}
+
+const Type* ShadowStackGC::GetConcreteStackEntryType(Function &F) {
+ // initializeCustomLowering creates the generic version of this type.
+ std::vector<const Type*> EltTys;
+ EltTys.push_back(StackEntryTy);
+ for (size_t I = 0; I != Roots.size(); I++)
+ EltTys.push_back(Roots[I].second->getAllocatedType());
+ Type *Ty = StructType::get(EltTys);
+
+ std::string TypeName("gc_stackentry.");
+ TypeName += F.getName();
+ F.getParent()->addTypeName(TypeName, Ty);
+
+ return Ty;
+}
+
+/// initializeCustomLowering - Create the FrameMap and StackEntry types for
+/// this module and ensure the llvm_gc_root_chain global exists.
+bool ShadowStackGC::initializeCustomLowering(Module &M) {
+ // struct FrameMap {
+ // int32_t NumRoots; // Number of roots in stack frame.
+ // int32_t NumMeta; // Number of metadata descriptors. May be < NumRoots.
+ // void *Meta[]; // May be absent for roots without metadata.
+ // };
+ std::vector<const Type*> EltTys;
+ EltTys.push_back(Type::Int32Ty); // 32 bits is ok up to a 32GB stack frame. :)
+ EltTys.push_back(Type::Int32Ty); // Specifies length of variable length array.
+ StructType *FrameMapTy = StructType::get(EltTys);
+ M.addTypeName("gc_map", FrameMapTy);
+ PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
+
+ // struct StackEntry {
+ // ShadowStackEntry *Next; // Caller's stack entry.
+ // FrameMap *Map; // Pointer to constant FrameMap.
+ // void *Roots[]; // Stack roots (in-place array, so we pretend).
+ // };
+ OpaqueType *RecursiveTy = OpaqueType::get();
+
+ EltTys.clear();
+ EltTys.push_back(PointerType::getUnqual(RecursiveTy));
+ EltTys.push_back(FrameMapPtrTy);
+ PATypeHolder LinkTyH = StructType::get(EltTys);
+
+ RecursiveTy->refineAbstractTypeTo(LinkTyH.get());
+ StackEntryTy = cast<StructType>(LinkTyH.get());
+ const PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
+ M.addTypeName("gc_stackentry", LinkTyH.get()); // FIXME: Is this safe from
+ // a FunctionPass?
+
+ // Get the root chain if it already exists.
+ Head = M.getGlobalVariable("llvm_gc_root_chain");
+ if (!Head) {
+ // If the root chain does not exist, insert a new one with linkonce
+ // linkage!
+ Head = new GlobalVariable(StackEntryPtrTy, false,
+ GlobalValue::LinkOnceAnyLinkage,
+ Constant::getNullValue(StackEntryPtrTy),
+ "llvm_gc_root_chain", &M);
+ } else if (Head->hasExternalLinkage() && Head->isDeclaration()) {
+ Head->setInitializer(Constant::getNullValue(StackEntryPtrTy));
+ Head->setLinkage(GlobalValue::LinkOnceAnyLinkage);
+ }
+
+ return true;
+}
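+
+// An illustrative sketch (not part of this pass) of how a collector
+// runtime, such as the prototype in runtime/GC/SemiSpace.cpp, can walk the
+// layout built above (C-level view of gc_map/gc_stackentry):
+//   void visitGCRoots(void (*Visitor)(void **Root, const void *Meta)) {
+//     for (StackEntry *R = llvm_gc_root_chain; R; R = R->Next) {
+//       unsigned i = 0;
+//       // Roots with metadata come first (see CollectRoots below).
+//       for (unsigned e = R->Map->NumMeta; i != e; ++i)
+//         Visitor(&R->Roots[i], R->Map->Meta[i]);
+//       // The remaining roots have null metadata.
+//       for (unsigned e = R->Map->NumRoots; i != e; ++i)
+//         Visitor(&R->Roots[i], NULL);
+//     }
+//   }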
+
+bool ShadowStackGC::IsNullValue(Value *V) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ return C->isNullValue();
+ return false;
+}
+
+void ShadowStackGC::CollectRoots(Function &F) {
+ // FIXME: Account for original alignment. Could fragment the root array.
+ // Approach 1: Null initialize empty slots at runtime. Yuck.
+ // Approach 2: Emit a map of the array instead of just a count.
+
+ assert(Roots.empty() && "Not cleaned up?");
+
+ SmallVector<std::pair<CallInst*,AllocaInst*>,16> MetaRoots;
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
+ if (Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::gcroot) {
+ std::pair<CallInst*,AllocaInst*> Pair = std::make_pair(
+ CI, cast<AllocaInst>(CI->getOperand(1)->stripPointerCasts()));
+ if (IsNullValue(CI->getOperand(2)))
+ Roots.push_back(Pair);
+ else
+ MetaRoots.push_back(Pair);
+ }
+
+ // Number roots with metadata (usually empty) at the beginning, so that the
+ // FrameMap::Meta array can be elided.
+ Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end());
+}
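+
+// For reference, CollectRoots matches IR of the form (an illustrative
+// sketch):
+//   %x = alloca i8*
+//   call void @llvm.gcroot(i8** %x, i8* null)   ; no metadata -> Roots
+//   call void @llvm.gcroot(i8** %y, i8* %m)     ; metadata -> MetaRoots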
+
+GetElementPtrInst *
+ShadowStackGC::CreateGEP(IRBuilder<> &B, Value *BasePtr,
+ int Idx, int Idx2, const char *Name) {
+ Value *Indices[] = { ConstantInt::get(Type::Int32Ty, 0),
+ ConstantInt::get(Type::Int32Ty, Idx),
+ ConstantInt::get(Type::Int32Ty, Idx2) };
+ Value* Val = B.CreateGEP(BasePtr, Indices, Indices + 3, Name);
+
+ assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+ return dyn_cast<GetElementPtrInst>(Val);
+}
+
+GetElementPtrInst *
+ShadowStackGC::CreateGEP(IRBuilder<> &B, Value *BasePtr,
+ int Idx, const char *Name) {
+ Value *Indices[] = { ConstantInt::get(Type::Int32Ty, 0),
+ ConstantInt::get(Type::Int32Ty, Idx) };
+ Value *Val = B.CreateGEP(BasePtr, Indices, Indices + 2, Name);
+
+ assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+ return dyn_cast<GetElementPtrInst>(Val);
+}
+
+/// performCustomLowering - Insert code to maintain the shadow stack.
+bool ShadowStackGC::performCustomLowering(Function &F) {
+ // Find calls to llvm.gcroot.
+ CollectRoots(F);
+
+ // If there are no roots in this function, then there is no need to add a
+ // stack map entry for it.
+ if (Roots.empty())
+ return false;
+
+ // Build the constant map and figure the type of the shadow stack entry.
+ Value *FrameMap = GetFrameMap(F);
+ const Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
+
+ // Build the shadow stack entry at the very start of the function.
+ BasicBlock::iterator IP = F.getEntryBlock().begin();
+ IRBuilder<> AtEntry(IP->getParent(), IP);
+
+ Instruction *StackEntry = AtEntry.CreateAlloca(ConcreteStackEntryTy, 0,
+ "gc_frame");
+
+ while (isa<AllocaInst>(IP)) ++IP;
+ AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+ // Initialize the map pointer and load the current head of the shadow stack.
+ Instruction *CurrentHead = AtEntry.CreateLoad(Head, "gc_currhead");
+ Instruction *EntryMapPtr = CreateGEP(AtEntry, StackEntry,0,1,"gc_frame.map");
+ AtEntry.CreateStore(FrameMap, EntryMapPtr);
+
+ // After all the allocas...
+ for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+ // For each root, find the corresponding slot in the aggregate...
+ Value *SlotPtr = CreateGEP(AtEntry, StackEntry, 1 + I, "gc_root");
+
+ // And use it in lieu of the alloca.
+ AllocaInst *OriginalAlloca = Roots[I].second;
+ SlotPtr->takeName(OriginalAlloca);
+ OriginalAlloca->replaceAllUsesWith(SlotPtr);
+ }
+
+ // Move past the original stores inserted by GCStrategy::InitRoots. This isn't
+ // really necessary (the collector would never see the intermediate state at
+ // runtime), but it's nicer not to push the half-initialized entry onto the
+ // shadow stack.
+ while (isa<StoreInst>(IP)) ++IP;
+ AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+ // Push the entry onto the shadow stack.
+ Instruction *EntryNextPtr = CreateGEP(AtEntry,StackEntry,0,0,"gc_frame.next");
+ Instruction *NewHeadVal = CreateGEP(AtEntry,StackEntry, 0, "gc_newhead");
+ AtEntry.CreateStore(CurrentHead, EntryNextPtr);
+ AtEntry.CreateStore(NewHeadVal, Head);
+
+ // For each instruction that escapes...
+ EscapeEnumerator EE(F, "gc_cleanup");
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ // Pop the entry from the shadow stack. Don't reuse CurrentHead from
+ // AtEntry, since that would make the value live for the entire function.
+ Instruction *EntryNextPtr2 = CreateGEP(*AtExit, StackEntry, 0, 0,
+ "gc_frame.next");
+ Value *SavedHead = AtExit->CreateLoad(EntryNextPtr2, "gc_savedhead");
+ AtExit->CreateStore(SavedHead, Head);
+ }
+
+ // Delete the original allocas (which are no longer used) and the intrinsic
+ // calls (which are no longer valid). Doing this last avoids invalidating
+ // iterators.
+ for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+ Roots[I].first->eraseFromParent();
+ Roots[I].second->eraseFromParent();
+ }
+
+ Roots.clear();
+ return true;
+}
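+
+// The net effect of performCustomLowering, as pseudo-IR (an illustrative
+// sketch):
+//   entry:  %gc_frame = alloca { StackEntry, roots... }
+//           %gc_frame.map  <- FrameMap constant
+//           %gc_frame.next <- *llvm_gc_root_chain   ; link
+//           *llvm_gc_root_chain <- %gc_frame        ; push
+//   each return/unwind:
+//           *llvm_gc_root_chain <- %gc_frame.next   ; pop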
diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp
new file mode 100644
index 0000000..e44a138
--- /dev/null
+++ b/lib/CodeGen/ShrinkWrapping.cpp
@@ -0,0 +1,1141 @@
+//===-- ShrinkWrapping.cpp - Reduce spills/restores of callee-saved regs --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a shrink wrapping variant of prolog/epilog insertion:
+// - Spills and restores of callee-saved registers (CSRs) are placed in the
+// machine CFG to tightly surround their uses so that execution paths that
+// do not use CSRs do not pay the spill/restore penalty.
+//
+// - Avoiding placement of spills/restores in loops: if a CSR is used inside a
+// loop the spills are placed in the loop preheader, and restores are
+// placed in the loop exit nodes (the successors of loop _exiting_ nodes).
+//
+// - Covering paths without CSR uses:
+// If a region in a CFG uses CSRs and has multiple entry and/or exit points,
+// the use info for the CSRs inside the region is propagated outward in the
+// CFG to ensure validity of the spill/restore placements. This decreases
+// the effectiveness of shrink wrapping but does not require edge splitting
+// in the machine CFG.
+//
+// This shrink wrapping implementation uses an iterative analysis to determine
+// which basic blocks require spills and restores for CSRs.
+//
+// This pass uses MachineDominators and MachineLoopInfo. Loop information
+// is used to prevent placement of callee-saved register spills/restores
+// in the bodies of loops.
+//
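+// For example (an illustrative sketch): in the diamond CFG
+//
+//        Entry
+//        /   \
+//      B1     B2        (only B1 uses a CSR, say %r31)
+//        \   /
+//         Ret
+//
+// the spill of %r31 is placed at the top of B1 and its restore at the
+// bottom of B1, so the path Entry -> B2 -> Ret pays no save/restore cost.
+//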
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "shrink-wrap"
+
+#include "PrologEpilogInserter.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include <sstream>
+
+using namespace llvm;
+
+STATISTIC(numSRReduced, "Number of CSR spills+restores reduced.");
+
+// Shrink Wrapping:
+static cl::opt<bool>
+ShrinkWrapping("shrink-wrap",
+ cl::desc("Shrink wrap callee-saved register spills/restores"));
+
+// Shrink wrap only the specified function, a debugging aid.
+static cl::opt<std::string>
+ShrinkWrapFunc("shrink-wrap-func", cl::Hidden,
+ cl::desc("Shrink wrap the specified function"),
+ cl::value_desc("funcname"),
+ cl::init(""));
+
+// Debugging level for shrink wrapping.
+enum ShrinkWrapDebugLevel {
+ None, BasicInfo, Iterations, Details
+};
+
+static cl::opt<enum ShrinkWrapDebugLevel>
+ShrinkWrapDebugging("shrink-wrap-dbg", cl::Hidden,
+ cl::desc("Print shrink wrapping debugging information"),
+ cl::values(
+ clEnumVal(None , "disable debug output"),
+ clEnumVal(BasicInfo , "print basic DF sets"),
+ clEnumVal(Iterations, "print SR sets for each iteration"),
+ clEnumVal(Details , "print all DF sets"),
+ clEnumValEnd));
+
+
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ if (ShrinkWrapping || ShrinkWrapFunc != "") {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ }
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+//===----------------------------------------------------------------------===//
+// ShrinkWrapping implementation
+//===----------------------------------------------------------------------===//
+
+// Conveniences for dealing with machine loops.
+MachineBasicBlock* PEI::getTopLevelLoopPreheader(MachineLoop* LP) {
+ assert(LP && "Machine loop is NULL.");
+ MachineBasicBlock* PHDR = LP->getLoopPreheader();
+ MachineLoop* PLP = LP->getParentLoop();
+ while (PLP) {
+ PHDR = PLP->getLoopPreheader();
+ PLP = PLP->getParentLoop();
+ }
+ return PHDR;
+}
+
+MachineLoop* PEI::getTopLevelLoopParent(MachineLoop *LP) {
+ if (LP == 0)
+ return 0;
+ MachineLoop* PLP = LP->getParentLoop();
+ while (PLP) {
+ LP = PLP;
+ PLP = PLP->getParentLoop();
+ }
+ return LP;
+}
+
+bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
+ return (MBB && !MBB->empty() && MBB->back().getDesc().isReturn());
+}
+
+// Initialize shrink wrapping DFA sets, called before iterations.
+void PEI::clearAnticAvailSets() {
+ AnticIn.clear();
+ AnticOut.clear();
+ AvailIn.clear();
+ AvailOut.clear();
+}
+
+// Clear all sets constructed by shrink wrapping.
+void PEI::clearAllSets() {
+ ReturnBlocks.clear();
+ clearAnticAvailSets();
+ UsedCSRegs.clear();
+ CSRUsed.clear();
+ TLLoops.clear();
+ CSRSave.clear();
+ CSRRestore.clear();
+}
+
+// Initialize all shrink wrapping data.
+void PEI::initShrinkWrappingInfo() {
+ clearAllSets();
+ EntryBlock = 0;
+#ifndef NDEBUG
+ HasFastExitPath = false;
+#endif
+ ShrinkWrapThisFunction = ShrinkWrapping;
+ // DEBUG: enable or disable shrink wrapping for the current function
+ // via --shrink-wrap-func=<funcname>.
+#ifndef NDEBUG
+ if (ShrinkWrapFunc != "") {
+ std::string MFName = MF->getFunction()->getName();
+ ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc);
+ }
+#endif
+}
+
+
+/// placeCSRSpillsAndRestores - determine which MBBs of the function
+/// need save, restore code for callee-saved registers by doing a DF analysis
+/// similar to the one used in code motion (GVNPRE). This produces maps of MBBs
+/// to sets of registers (CSRs) for saves and restores. MachineLoopInfo
+/// is used to ensure that CSR save/restore code is not placed inside loops.
+/// This function computes the maps of MBBs -> CSRs to spill and restore
+/// in CSRSave, CSRRestore.
+///
+/// If shrink wrapping is not being performed, place all spills in
+/// the entry block, all restores in return blocks. In this case,
+/// CSRSave has a single mapping, CSRRestore has mappings for each
+/// return block.
+///
+void PEI::placeCSRSpillsAndRestores(MachineFunction &Fn) {
+
+ DEBUG(MF = &Fn);
+
+ initShrinkWrappingInfo();
+
+ DEBUG(if (ShrinkWrapThisFunction) {
+ DOUT << "Place CSR spills/restores for "
+ << MF->getFunction()->getName() << "\n";
+ });
+
+ if (calculateSets(Fn))
+ placeSpillsAndRestores(Fn);
+}
+
+/// calcAnticInOut - calculate the anticipated in/out reg sets
+/// for the given MBB by looking forward in the MCFG at MBB's
+/// successors.
+///
+bool PEI::calcAnticInOut(MachineBasicBlock* MBB) {
+ bool changed = false;
+
+ // AnticOut[MBB] = INTERSECT(AnticIn[S] for S in SUCCESSORS(MBB))
+ SmallVector<MachineBasicBlock*, 4> successors;
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock* SUCC = *SI;
+ if (SUCC != MBB)
+ successors.push_back(SUCC);
+ }
+
+ unsigned i = 0, e = successors.size();
+ if (i != e) {
+ CSRegSet prevAnticOut = AnticOut[MBB];
+ MachineBasicBlock* SUCC = successors[i];
+
+ AnticOut[MBB] = AnticIn[SUCC];
+ for (++i; i != e; ++i) {
+ SUCC = successors[i];
+ AnticOut[MBB] &= AnticIn[SUCC];
+ }
+ if (prevAnticOut != AnticOut[MBB])
+ changed = true;
+ }
+
+ // AnticIn[MBB] = UNION(CSRUsed[MBB], AnticOut[MBB]);
+ CSRegSet prevAnticIn = AnticIn[MBB];
+ AnticIn[MBB] = CSRUsed[MBB] | AnticOut[MBB];
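+ // N.B. SparseBitVector's operator|= returns true iff the left-hand set
+ // gained bits, i.e. iff AnticIn[MBB] grew since the previous iteration.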
+ if (prevAnticIn |= AnticIn[MBB])
+ changed = true;
+ return changed;
+}
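+
+// For example (an illustrative sketch): if MBB has successors S1 and S2
+// with AnticIn[S1] = {%r30,%r31} and AnticIn[S2] = {%r31}, then
+// AnticOut[MBB] = {%r31}; if CSRUsed[MBB] = {%r30}, the new AnticIn[MBB]
+// is {%r30,%r31}.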
+
+/// calcAvailInOut - calculate the available in/out reg sets
+/// for the given MBB by looking backward in the MCFG at MBB's
+/// predecessors.
+///
+bool PEI::calcAvailInOut(MachineBasicBlock* MBB) {
+ bool changed = false;
+
+ // AvailIn[MBB] = INTERSECT(AvailOut[P] for P in PREDECESSORS(MBB))
+ SmallVector<MachineBasicBlock*, 4> predecessors;
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock* PRED = *PI;
+ if (PRED != MBB)
+ predecessors.push_back(PRED);
+ }
+
+ unsigned i = 0, e = predecessors.size();
+ if (i != e) {
+ CSRegSet prevAvailIn = AvailIn[MBB];
+ MachineBasicBlock* PRED = predecessors[i];
+
+ AvailIn[MBB] = AvailOut[PRED];
+ for (++i; i != e; ++i) {
+ PRED = predecessors[i];
+ AvailIn[MBB] &= AvailOut[PRED];
+ }
+ if (prevAvailIn != AvailIn[MBB])
+ changed = true;
+ }
+
+ // AvailOut[MBB] = UNION(CSRUsed[MBB], AvailIn[MBB]);
+ CSRegSet prevAvailOut = AvailOut[MBB];
+ AvailOut[MBB] = CSRUsed[MBB] | AvailIn[MBB];
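+ // As above, operator|= returns true iff AvailOut[MBB] gained bits since
+ // the previous iteration.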
+ if (prevAvailOut |= AvailOut[MBB])
+ changed = true;
+ return changed;
+}
+
+/// calculateAnticAvail - build the sets anticipated and available
+/// registers in the MCFG of the current function iteratively,
+/// doing a combined forward and backward analysis.
+///
+void PEI::calculateAnticAvail(MachineFunction &Fn) {
+ // Initialize data flow sets.
+ clearAnticAvailSets();
+
+ // Calculate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG.
+ bool changed = true;
+ unsigned iterations = 0;
+ while (changed) {
+ changed = false;
+ ++iterations;
+ for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock* MBB = MBBI;
+
+ // Calculate anticipated in, out regs at MBB from
+ // anticipated at successors of MBB.
+ changed |= calcAnticInOut(MBB);
+
+ // Calculate available in, out regs at MBB from
+ // available at predecessors of MBB.
+ changed |= calcAvailInOut(MBB);
+ }
+ }
+
+ DEBUG(if (ShrinkWrapDebugging >= Details) {
+ DOUT << "-----------------------------------------------------------\n";
+ DOUT << " Antic/Avail Sets:\n";
+ DOUT << "-----------------------------------------------------------\n";
+ DOUT << "iterations = " << iterations << "\n";
+ DOUT << "-----------------------------------------------------------\n";
+ DOUT << "MBB | USED | ANTIC_IN | ANTIC_OUT | AVAIL_IN | AVAIL_OUT\n";
+ DOUT << "-----------------------------------------------------------\n";
+ for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock* MBB = MBBI;
+ dumpSets(MBB);
+ }
+ DOUT << "-----------------------------------------------------------\n";
+ });
+}
+
+/// propagateUsesAroundLoop - copy used register info from MBB to all blocks
+/// of the loop given by LP and its parent loops. This prevents spills/restores
+/// from being placed in the bodies of loops.
+///
+void PEI::propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP) {
+ if (! MBB || !LP)
+ return;
+
+ std::vector<MachineBasicBlock*> loopBlocks = LP->getBlocks();
+ for (unsigned i = 0, e = loopBlocks.size(); i != e; ++i) {
+ MachineBasicBlock* LBB = loopBlocks[i];
+ if (LBB == MBB)
+ continue;
+ if (CSRUsed[LBB].contains(CSRUsed[MBB]))
+ continue;
+ CSRUsed[LBB] |= CSRUsed[MBB];
+ }
+}
+
+/// calculateSets - collect the CSRs used in this function, compute
+/// the DF sets that describe the initial minimal regions in the
+/// Machine CFG around which CSR spills and restores must be placed.
+///
+/// Additionally, this function decides if shrink wrapping should
+/// be disabled for the current function, checking the following:
+/// 1. the current function has more than 500 MBBs: heuristic limit
+/// on function size to reduce compile time impact of the current
+/// iterative algorithm.
+/// 2. all CSRs are used in the entry block.
+/// 3. all CSRs are used in all immediate successors of the entry block.
+/// 4. all CSRs are used in a subset of blocks, each of which dominates
+/// all return blocks. These blocks, taken as a subgraph of the MCFG,
+/// are equivalent to the entry block since all execution paths pass
+/// through them.
+///
+bool PEI::calculateSets(MachineFunction &Fn) {
+ // Sets used to compute spill, restore placement sets.
+ const std::vector<CalleeSavedInfo> CSI =
+ Fn.getFrameInfo()->getCalleeSavedInfo();
+
+ // If no CSRs used, we are done.
+ if (CSI.empty()) {
+ DEBUG(if (ShrinkWrapThisFunction)
+ DOUT << "DISABLED: " << Fn.getFunction()->getName()
+ << ": uses no callee-saved registers\n");
+ return false;
+ }
+
+ // Save refs to entry and return blocks.
+ EntryBlock = Fn.begin();
+ for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
+ MBB != E; ++MBB)
+ if (isReturnBlock(MBB))
+ ReturnBlocks.push_back(MBB);
+
+ // Determine if this function has fast exit paths.
+ DEBUG(if (ShrinkWrapThisFunction)
+ findFastExitPath());
+
+ // Limit shrink wrapping via the current iterative bit vector
+ // implementation to functions with <= 500 MBBs.
+ if (Fn.size() > 500) {
+ DEBUG(if (ShrinkWrapThisFunction)
+ DOUT << "DISABLED: " << Fn.getFunction()->getName()
+ << ": too large (" << Fn.size() << " MBBs)\n");
+ ShrinkWrapThisFunction = false;
+ }
+
+ // Return now if not shrink wrapping.
+ if (! ShrinkWrapThisFunction)
+ return false;
+
+ // Collect set of used CSRs.
+ for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
+ UsedCSRegs.set(inx);
+ }
+
+ // Walk instructions in all MBBs, create CSRUsed[] sets, choose
+ // whether or not to shrink wrap this function.
+ MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
+ MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
+ const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+
+ bool allCSRUsesInEntryBlock = true;
+ for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock* MBB = MBBI;
+ for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end(); ++I) {
+ for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
+ unsigned Reg = CSI[inx].getReg();
+ // If instruction I reads or modifies Reg, add it to UsedCSRegs,
+ // CSRUsed map for the current block.
+ for (unsigned opInx = 0, opEnd = I->getNumOperands();
+ opInx != opEnd; ++opInx) {
+ const MachineOperand &MO = I->getOperand(opInx);
+ if (! (MO.isReg() && (MO.isUse() || MO.isDef())))
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MOReg == Reg ||
+ (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+ TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ TRI->isSubRegister(Reg, MOReg))) {
+ // CSR Reg is defined/used in block MBB.
+ CSRUsed[MBB].set(inx);
+ // Check for uses in EntryBlock.
+ if (MBB != EntryBlock)
+ allCSRUsesInEntryBlock = false;
+ }
+ }
+ }
+ }
+
+ if (CSRUsed[MBB].empty())
+ continue;
+
+ // Propagate CSRUsed[MBB] in loops
+ if (MachineLoop* LP = LI.getLoopFor(MBB)) {
+ // Add top level loop to work list.
+ MachineBasicBlock* HDR = getTopLevelLoopPreheader(LP);
+ MachineLoop* PLP = getTopLevelLoopParent(LP);
+
+ if (! HDR) {
+ HDR = PLP->getHeader();
+ assert(HDR->pred_size() > 0 && "Loop header has no predecessors?");
+ MachineBasicBlock::pred_iterator PI = HDR->pred_begin();
+ HDR = *PI;
+ }
+ TLLoops[HDR] = PLP;
+
+ // Push uses from inside loop to its parent loops,
+ // or to all other MBBs in its loop.
+ if (LP->getLoopDepth() > 1) {
+ for (MachineLoop* PLP = LP->getParentLoop(); PLP;
+ PLP = PLP->getParentLoop()) {
+ propagateUsesAroundLoop(MBB, PLP);
+ }
+ } else {
+ propagateUsesAroundLoop(MBB, LP);
+ }
+ }
+ }
+
+ if (allCSRUsesInEntryBlock) {
+ DEBUG(DOUT << "DISABLED: " << Fn.getFunction()->getName()
+ << ": all CSRs used in EntryBlock\n");
+ ShrinkWrapThisFunction = false;
+ } else {
+ bool allCSRsUsedInEntryFanout = true;
+ for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
+ SE = EntryBlock->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock* SUCC = *SI;
+ if (CSRUsed[SUCC] != UsedCSRegs)
+ allCSRsUsedInEntryFanout = false;
+ }
+ if (allCSRsUsedInEntryFanout) {
+ DEBUG(DOUT << "DISABLED: " << Fn.getFunction()->getName()
+ << ": all CSRs used in imm successors of EntryBlock\n");
+ ShrinkWrapThisFunction = false;
+ }
+ }
+
+ if (ShrinkWrapThisFunction) {
+ // Check if MBB uses CSRs and dominates all exit nodes.
+ // Such nodes are equiv. to the entry node w.r.t.
+ // CSR uses: every path through the function must
+ // pass through this node. If each CSR is used at least
+ // once by these nodes, shrink wrapping is disabled.
+ CSRegSet CSRUsedInChokePoints;
+ for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock* MBB = MBBI;
+ if (MBB == EntryBlock || CSRUsed[MBB].empty() || MBB->succ_size() < 1)
+ continue;
+ bool dominatesExitNodes = true;
+ for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
+ if (! DT.dominates(MBB, ReturnBlocks[ri])) {
+ dominatesExitNodes = false;
+ break;
+ }
+ if (dominatesExitNodes) {
+ CSRUsedInChokePoints |= CSRUsed[MBB];
+ if (CSRUsedInChokePoints == UsedCSRegs) {
+ DEBUG(DOUT << "DISABLED: " << Fn.getFunction()->getName()
+ << ": all CSRs used in choke point(s) at "
+ << getBasicBlockName(MBB) << "\n");
+ ShrinkWrapThisFunction = false;
+ break;
+ }
+ }
+ }
+ }
+
+ // Return now if we have decided not to apply shrink wrapping
+ // to the current function.
+ if (! ShrinkWrapThisFunction)
+ return false;
+
+ DEBUG({
+ DOUT << "ENABLED: " << Fn.getFunction()->getName();
+ if (HasFastExitPath)
+ DOUT << " (fast exit path)";
+ DOUT << "\n";
+ if (ShrinkWrapDebugging >= BasicInfo) {
+ DOUT << "------------------------------"
+ << "-----------------------------\n";
+ DOUT << "UsedCSRegs = " << stringifyCSRegSet(UsedCSRegs) << "\n";
+ if (ShrinkWrapDebugging >= Details) {
+ DOUT << "------------------------------"
+ << "-----------------------------\n";
+ dumpAllUsed();
+ }
+ }
+ });
+
+ // Build initial DF sets to determine minimal regions in the
+ // Machine CFG around which CSRs must be spilled and restored.
+ calculateAnticAvail(Fn);
+
+ return true;
+}
+
+/// addUsesForMEMERegion - add uses of CSRs spilled or restored in
+/// multi-entry, multi-exit (MEME) regions so spill and restore
+/// placement will not break code that enters or leaves a
+/// shrink-wrapped region by inducing spills with no matching
+/// restores or restores with no matching spills. A MEME region
+/// is a subgraph of the MCFG with multiple entry edges, multiple
+/// exit edges, or both. This code propagates use information
+/// through the MCFG until all paths requiring spills and restores
+/// _outside_ the computed minimal placement regions have been covered.
+///
+bool PEI::addUsesForMEMERegion(MachineBasicBlock* MBB,
+ SmallVector<MachineBasicBlock*, 4>& blks) {
+ if (MBB->succ_size() < 2 && MBB->pred_size() < 2) {
+ bool processThisBlock = false;
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock* SUCC = *SI;
+ if (SUCC->pred_size() > 1) {
+ processThisBlock = true;
+ break;
+ }
+ }
+ if (!CSRRestore[MBB].empty() && MBB->succ_size() > 0) {
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock* PRED = *PI;
+ if (PRED->succ_size() > 1) {
+ processThisBlock = true;
+ break;
+ }
+ }
+ }
+ if (! processThisBlock)
+ return false;
+ }
+
+ CSRegSet prop;
+ if (!CSRSave[MBB].empty())
+ prop = CSRSave[MBB];
+ else if (!CSRRestore[MBB].empty())
+ prop = CSRRestore[MBB];
+ else
+ prop = CSRUsed[MBB];
+ if (prop.empty())
+ return false;
+
+ // Propagate selected bits to successors, predecessors of MBB.
+ bool addedUses = false;
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock* SUCC = *SI;
+ // Self-loop
+ if (SUCC == MBB)
+ continue;
+ if (! CSRUsed[SUCC].contains(prop)) {
+ CSRUsed[SUCC] |= prop;
+ addedUses = true;
+ blks.push_back(SUCC);
+ DEBUG(if (ShrinkWrapDebugging >= Iterations)
+ DOUT << getBasicBlockName(MBB)
+ << "(" << stringifyCSRegSet(prop) << ")->"
+ << "successor " << getBasicBlockName(SUCC) << "\n");
+ }
+ }
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock* PRED = *PI;
+ // Self-loop
+ if (PRED == MBB)
+ continue;
+ if (! CSRUsed[PRED].contains(prop)) {
+ CSRUsed[PRED] |= prop;
+ addedUses = true;
+ blks.push_back(PRED);
+ DEBUG(if (ShrinkWrapDebugging >= Iterations)
+ DOUT << getBasicBlockName(MBB)
+ << "(" << stringifyCSRegSet(prop) << ")->"
+ << "predecessor " << getBasicBlockName(PRED) << "\n");
+ }
+ }
+ return addedUses;
+}
+
+/// addUsesForTopLevelLoops - add uses for CSRs used inside top
+/// level loops to the exit blocks of those loops.
+///
+bool PEI::addUsesForTopLevelLoops(SmallVector<MachineBasicBlock*, 4>& blks) {
+ bool addedUses = false;
+
+ // Place restores for top level loops where needed.
+ for (DenseMap<MachineBasicBlock*, MachineLoop*>::iterator
+ I = TLLoops.begin(), E = TLLoops.end(); I != E; ++I) {
+ MachineBasicBlock* MBB = I->first;
+ MachineLoop* LP = I->second;
+ MachineBasicBlock* HDR = LP->getHeader();
+ SmallVector<MachineBasicBlock*, 4> exitBlocks;
+ CSRegSet loopSpills;
+
+ loopSpills = CSRSave[MBB];
+ if (CSRSave[MBB].empty()) {
+ loopSpills = CSRUsed[HDR];
+ assert(!loopSpills.empty() && "No CSRs used in loop?");
+ } else if (CSRRestore[MBB].contains(CSRSave[MBB]))
+ continue;
+
+ LP->getExitBlocks(exitBlocks);
+ assert(exitBlocks.size() > 0 && "Loop has no top level exit blocks?");
+ for (unsigned i = 0, e = exitBlocks.size(); i != e; ++i) {
+ MachineBasicBlock* EXB = exitBlocks[i];
+ if (! CSRUsed[EXB].contains(loopSpills)) {
+ CSRUsed[EXB] |= loopSpills;
+ addedUses = true;
+ DEBUG(if (ShrinkWrapDebugging >= Iterations)
+ DOUT << "LOOP " << getBasicBlockName(MBB)
+ << "(" << stringifyCSRegSet(loopSpills) << ")->"
+ << getBasicBlockName(EXB) << "\n");
+ if (EXB->succ_size() > 1 || EXB->pred_size() > 1)
+ blks.push_back(EXB);
+ }
+ }
+ }
+ return addedUses;
+}
+
+/// calcSpillPlacements - determine which CSRs should be spilled
+/// in MBB using AnticIn sets of MBB's predecessors, keeping track
+/// of changes to spilled reg sets. Add MBB to the set of blocks
+/// that need to be processed for propagating use info to cover
+/// multi-entry/exit regions.
+///
+bool PEI::calcSpillPlacements(MachineBasicBlock* MBB,
+ SmallVector<MachineBasicBlock*, 4> &blks,
+ CSRegBlockMap &prevSpills) {
+ bool placedSpills = false;
+ // Intersect (CSRegs - AnticIn[P]) for P in Predecessors(MBB)
+ CSRegSet anticInPreds;
+ SmallVector<MachineBasicBlock*, 4> predecessors;
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock* PRED = *PI;
+ if (PRED != MBB)
+ predecessors.push_back(PRED);
+ }
+ unsigned i = 0, e = predecessors.size();
+ if (i != e) {
+ MachineBasicBlock* PRED = predecessors[i];
+ anticInPreds = UsedCSRegs - AnticIn[PRED];
+ for (++i; i != e; ++i) {
+ PRED = predecessors[i];
+ anticInPreds &= (UsedCSRegs - AnticIn[PRED]);
+ }
+ } else {
+ // Handle uses in entry blocks (which have no predecessors).
+ // This is necessary because the DFA formulation assumes the
+ // entry and (multiple) exit nodes cannot have CSR uses, which
+ // is not the case in the real world.
+ anticInPreds = UsedCSRegs;
+ }
+ // Compute spills required at MBB:
+ CSRSave[MBB] |= (AnticIn[MBB] - AvailIn[MBB]) & anticInPreds;
+
+ if (! CSRSave[MBB].empty()) {
+ if (MBB == EntryBlock) {
+ for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
+ CSRRestore[ReturnBlocks[ri]] |= CSRSave[MBB];
+ } else {
+ // Reset all regs spilled in MBB that are also spilled in EntryBlock.
+ if (CSRSave[EntryBlock].intersects(CSRSave[MBB])) {
+ CSRSave[MBB] = CSRSave[MBB] - CSRSave[EntryBlock];
+ }
+ }
+ }
+ placedSpills = (CSRSave[MBB] != prevSpills[MBB]);
+ prevSpills[MBB] = CSRSave[MBB];
+ // Remember this block for adding restores to successor
+ // blocks for multi-entry region.
+ if (placedSpills)
+ blks.push_back(MBB);
+
+ DEBUG(if (! CSRSave[MBB].empty() && ShrinkWrapDebugging >= Iterations)
+ DOUT << "SAVE[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(CSRSave[MBB]) << "\n");
+
+ return placedSpills;
+}
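+
+// For example (an illustrative sketch): with UsedCSRegs = {%r30,%r31} and
+// a block MBB where AnticIn[MBB] = {%r30}, AvailIn[MBB] = {}, and a single
+// predecessor P with AnticIn[P] = {}: anticInPreds = {%r30,%r31}, so
+// CSRSave[MBB] |= ({%r30} - {}) & {%r30,%r31} = {%r30}; %r30 is first
+// anticipated at MBB and not already available there, so it is saved here.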
+
+/// calcRestorePlacements - determine which CSRs should be restored
+/// in MBB using AvailOut sets of MBB's successors, keeping track
+/// of changes to restored reg sets. Add MBB to the set of blocks
+/// that need to be processed for propagating use info to cover
+/// multi-entry/exit regions.
+///
+bool PEI::calcRestorePlacements(MachineBasicBlock* MBB,
+ SmallVector<MachineBasicBlock*, 4> &blks,
+ CSRegBlockMap &prevRestores) {
+ bool placedRestores = false;
+ // Intersect (CSRegs - AvailOut[S]) for S in Successors(MBB)
+ CSRegSet availOutSucc;
+ SmallVector<MachineBasicBlock*, 4> successors;
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock* SUCC = *SI;
+ if (SUCC != MBB)
+ successors.push_back(SUCC);
+ }
+ unsigned i = 0, e = successors.size();
+ if (i != e) {
+ MachineBasicBlock* SUCC = successors[i];
+ availOutSucc = UsedCSRegs - AvailOut[SUCC];
+ for (++i; i != e; ++i) {
+ SUCC = successors[i];
+ availOutSucc &= (UsedCSRegs - AvailOut[SUCC]);
+ }
+ } else {
+ if (! CSRUsed[MBB].empty() || ! AvailOut[MBB].empty()) {
+ // Handle uses in return blocks (which have no successors).
+ // This is necessary because the DFA formulation assumes the
+ // entry and (multiple) exit nodes cannot have CSR uses, which
+ // is not the case in the real world.
+ availOutSucc = UsedCSRegs;
+ }
+ }
+ // Compute restores required at MBB:
+ CSRRestore[MBB] |= (AvailOut[MBB] - AnticOut[MBB]) & availOutSucc;
+
+ // Postprocess restore placements at MBB.
+ // Remove the CSRs that are restored in the return blocks.
+ // Lest this be confusing, note that:
+ // CSRSave[EntryBlock] == CSRRestore[B] for all B in ReturnBlocks.
+ if (MBB->succ_size() && ! CSRRestore[MBB].empty()) {
+ if (! CSRSave[EntryBlock].empty())
+ CSRRestore[MBB] = CSRRestore[MBB] - CSRSave[EntryBlock];
+ }
+ placedRestores = (CSRRestore[MBB] != prevRestores[MBB]);
+ prevRestores[MBB] = CSRRestore[MBB];
+ // Remember this block for adding saves to predecessor
+ // blocks for multi-entry region.
+ if (placedRestores)
+ blks.push_back(MBB);
+
+ DEBUG(if (! CSRRestore[MBB].empty() && ShrinkWrapDebugging >= Iterations)
+ DOUT << "RESTORE[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
+
+ return placedRestores;
+}
+
+/// placeSpillsAndRestores - place spills and restores of CSRs
+/// used in MBBs in minimal regions that contain the uses.
+///
+void PEI::placeSpillsAndRestores(MachineFunction &Fn) {
+ CSRegBlockMap prevCSRSave;
+ CSRegBlockMap prevCSRRestore;
+ SmallVector<MachineBasicBlock*, 4> cvBlocks, ncvBlocks;
+ bool changed = true;
+ unsigned iterations = 0;
+
+ // Iterate computation of spill and restore placements in the MCFG until:
+ // 1. CSR use info has been fully propagated around the MCFG, and
+ // 2. computation of CSRSave[], CSRRestore[] reach fixed points.
+ while (changed) {
+ changed = false;
+ ++iterations;
+
+ DEBUG(if (ShrinkWrapDebugging >= Iterations)
+ DOUT << "iter " << iterations
+ << " --------------------------------------------------\n");
+
+ // Calculate CSR{Save,Restore} sets using Antic, Avail on the MCFG,
+ // which determines the placements of spills and restores.
+ // Keep track of changes to spills, restores in each iteration to
+ // minimize the total iterations.
+ bool SRChanged = false;
+ for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock* MBB = MBBI;
+
+ // Place spills for CSRs in MBB.
+ SRChanged |= calcSpillPlacements(MBB, cvBlocks, prevCSRSave);
+
+ // Place restores for CSRs in MBB.
+ SRChanged |= calcRestorePlacements(MBB, cvBlocks, prevCSRRestore);
+ }
+
+ // Add uses of CSRs used inside loops where needed.
+ changed |= addUsesForTopLevelLoops(cvBlocks);
+
+ // Add uses for CSRs spilled or restored at branch, join points.
+ if (changed || SRChanged) {
+ while (! cvBlocks.empty()) {
+ MachineBasicBlock* MBB = cvBlocks.pop_back_val();
+ changed |= addUsesForMEMERegion(MBB, ncvBlocks);
+ }
+ if (! ncvBlocks.empty()) {
+ cvBlocks = ncvBlocks;
+ ncvBlocks.clear();
+ }
+ }
+
+ if (changed) {
+ calculateAnticAvail(Fn);
+ CSRSave.clear();
+ CSRRestore.clear();
+ }
+ }
+
+ // Check for effectiveness:
+ // SR0 = {r | r in CSRSave[EntryBlock], CSRRestore[RB], RB in ReturnBlocks}
+ // numSRReduced = |(UsedCSRegs - SR0)|, approx. SR0 by CSRSave[EntryBlock]
+ // Gives a measure of how many CSR spills have been moved from EntryBlock
+ // to minimal regions enclosing their uses.
+ CSRegSet notSpilledInEntryBlock = (UsedCSRegs - CSRSave[EntryBlock]);
+ unsigned numSRReducedThisFunc = notSpilledInEntryBlock.count();
+ numSRReduced += numSRReducedThisFunc;
+ DEBUG(if (ShrinkWrapDebugging >= BasicInfo) {
+ DOUT << "-----------------------------------------------------------\n";
+ DOUT << "total iterations = " << iterations << " ( "
+ << Fn.getFunction()->getName()
+ << " " << numSRReducedThisFunc
+ << " " << Fn.size()
+ << " )\n";
+ DOUT << "-----------------------------------------------------------\n";
+ dumpSRSets();
+ DOUT << "-----------------------------------------------------------\n";
+ if (numSRReducedThisFunc)
+ verifySpillRestorePlacement();
+ });
+}
+
+// Debugging methods.
+#ifndef NDEBUG
+/// findFastExitPath - debugging method used to detect functions
+/// with at least one path from the entry block directly to a
+/// return block, or with a path that has very few edges.
+///
+void PEI::findFastExitPath() {
+ if (! EntryBlock)
+ return;
+ // Find a path from EntryBlock to any return block that does not branch:
+ // Entry
+ // | ...
+ // v |
+ // B1<-----+
+ // |
+ // v
+ // Return
+ for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
+ SE = EntryBlock->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock* SUCC = *SI;
+
+ // Assume positive, disprove existence of fast path.
+ HasFastExitPath = true;
+
+ // Check the immediate successors.
+ if (isReturnBlock(SUCC)) {
+ if (ShrinkWrapDebugging >= BasicInfo)
+ DOUT << "Fast exit path: " << getBasicBlockName(EntryBlock)
+ << "->" << getBasicBlockName(SUCC) << "\n";
+ break;
+ }
+ // Traverse df from SUCC, look for a branch block.
+ std::string exitPath = getBasicBlockName(SUCC);
+ for (df_iterator<MachineBasicBlock*> BI = df_begin(SUCC),
+ BE = df_end(SUCC); BI != BE; ++BI) {
+ MachineBasicBlock* SBB = *BI;
+ // Reject paths with branch nodes.
+ if (SBB->succ_size() > 1) {
+ HasFastExitPath = false;
+ break;
+ }
+ exitPath += "->" + getBasicBlockName(SBB);
+ }
+ if (HasFastExitPath) {
+ if (ShrinkWrapDebugging >= BasicInfo)
+ DOUT << "Fast exit path: " << getBasicBlockName(EntryBlock)
+ << "->" << exitPath << "\n";
+ break;
+ }
+ }
+}
+
+/// verifySpillRestorePlacement - check the current spill/restore
+/// sets for safety. Attempt to find spills without restores or
+/// restores without spills.
+/// Spills: walk df from each MBB in spill set ensuring that
+/// all CSRs spilled at MBB are restored on all paths
+/// from MBB to all exit blocks.
+/// Restores: walk idf from each MBB in restore set ensuring that
+/// all CSRs restored at MBB are spilled on all paths
+/// reaching MBB.
+///
+void PEI::verifySpillRestorePlacement() {
+ unsigned numReturnBlocks = 0;
+ for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock* MBB = MBBI;
+ if (isReturnBlock(MBB) || MBB->succ_size() == 0)
+ ++numReturnBlocks;
+ }
+ for (CSRegBlockMap::iterator BI = CSRSave.begin(),
+ BE = CSRSave.end(); BI != BE; ++BI) {
+ MachineBasicBlock* MBB = BI->first;
+ CSRegSet spilled = BI->second;
+ CSRegSet restored;
+
+ if (spilled.empty())
+ continue;
+
+ DOUT << "SAVE[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(spilled)
+ << " RESTORE[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
+
+ if (CSRRestore[MBB].intersects(spilled)) {
+ restored |= (CSRRestore[MBB] & spilled);
+ }
+
+ // Walk depth first from MBB to find restores of all CSRs spilled at MBB:
+ // we must find restores for all spills w/no intervening spills on all
+ // paths from MBB to all return blocks.
+ for (df_iterator<MachineBasicBlock*> BI = df_begin(MBB),
+ BE = df_end(MBB); BI != BE; ++BI) {
+ MachineBasicBlock* SBB = *BI;
+ if (SBB == MBB)
+ continue;
+ // Stop when we encounter spills of any CSRs spilled at MBB that
+ // have not yet been seen to be restored.
+ if (CSRSave[SBB].intersects(spilled) &&
+ !restored.contains(CSRSave[SBB] & spilled))
+ break;
+ // Collect the CSRs spilled at MBB that are restored
+ // at this DF successor of MBB.
+ if (CSRRestore[SBB].intersects(spilled))
+ restored |= (CSRRestore[SBB] & spilled);
+ // If we are at a return block, check that the restores
+ // we have seen so far exhaust the spills at MBB, then
+ // reset the restores.
+ if (isReturnBlock(SBB) || SBB->succ_size() == 0) {
+ if (restored != spilled) {
+ CSRegSet notRestored = (spilled - restored);
+ DOUT << MF->getFunction()->getName() << ": "
+ << stringifyCSRegSet(notRestored)
+ << " spilled at " << getBasicBlockName(MBB)
+ << " are never restored on path to return "
+ << getBasicBlockName(SBB) << "\n";
+ }
+ restored.clear();
+ }
+ }
+ }
+
+ // Check restore placements.
+ for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
+ BE = CSRRestore.end(); BI != BE; ++BI) {
+ MachineBasicBlock* MBB = BI->first;
+ CSRegSet restored = BI->second;
+ CSRegSet spilled;
+
+ if (restored.empty())
+ continue;
+
+ DOUT << "SAVE[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(CSRSave[MBB])
+ << " RESTORE[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(restored) << "\n";
+
+ if (CSRSave[MBB].intersects(restored)) {
+ spilled |= (CSRSave[MBB] & restored);
+ }
+ // Walk inverse depth first from MBB to find spills of all
+ // CSRs restored at MBB:
+ for (idf_iterator<MachineBasicBlock*> BI = idf_begin(MBB),
+ BE = idf_end(MBB); BI != BE; ++BI) {
+ MachineBasicBlock* PBB = *BI;
+ if (PBB == MBB)
+ continue;
+ // Stop when we encounter restores of any CSRs restored at MBB that
+ // have not yet been seen to be spilled.
+ if (CSRRestore[PBB].intersects(restored) &&
+ !spilled.contains(CSRRestore[PBB] & restored))
+ break;
+ // Collect the CSRs restored at MBB that are spilled
+ // at this DF predecessor of MBB.
+ if (CSRSave[PBB].intersects(restored))
+ spilled |= (CSRSave[PBB] & restored);
+ }
+ if (spilled != restored) {
+ CSRegSet notSpilled = (restored - spilled);
+ DOUT << MF->getFunction()->getName() << ": "
+ << stringifyCSRegSet(notSpilled)
+ << " restored at " << getBasicBlockName(MBB)
+ << " are never spilled\n";
+ }
+ }
+}
+
+// Debugging print methods.
+std::string PEI::getBasicBlockName(const MachineBasicBlock* MBB) {
+ std::ostringstream name;
+ if (MBB) {
+ if (MBB->getBasicBlock())
+ name << MBB->getBasicBlock()->getName();
+ else
+ name << "_MBB_" << MBB->getNumber();
+ }
+ return name.str();
+}
+
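+/// stringifyCSRegSet - Render a CSRegSet using the target's register names,
+/// e.g. (hypothetically, on an x86 target) "[EBX,ESI,EDI]"; when the
+/// function has no callee-saved info, the result is "[]".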
+std::string PEI::stringifyCSRegSet(const CSRegSet& s) {
+ const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
+ const std::vector<CalleeSavedInfo> &CSI =
+ MF->getFrameInfo()->getCalleeSavedInfo();
+
+ std::ostringstream srep;
+ if (CSI.size() == 0) {
+ srep << "[]";
+ return srep.str();
+ }
+ srep << "[";
+ CSRegSet::iterator I = s.begin(), E = s.end();
+ if (I != E) {
+ unsigned reg = CSI[*I].getReg();
+ srep << TRI->getName(reg);
+ for (++I; I != E; ++I) {
+ reg = CSI[*I].getReg();
+ srep << ",";
+ srep << TRI->getName(reg);
+ }
+ }
+ srep << "]";
+ return srep.str();
+}
+
+void PEI::dumpSet(const CSRegSet& s) {
+ DOUT << stringifyCSRegSet(s) << "\n";
+}
+
+void PEI::dumpUsed(MachineBasicBlock* MBB) {
+ if (MBB) {
+ DOUT << "CSRUsed[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(CSRUsed[MBB]) << "\n";
+ }
+}
+
+void PEI::dumpAllUsed() {
+ for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock* MBB = MBBI;
+ dumpUsed(MBB);
+ }
+}
+
+void PEI::dumpSets(MachineBasicBlock* MBB) {
+ if (MBB) {
+ DOUT << getBasicBlockName(MBB) << " | "
+ << stringifyCSRegSet(CSRUsed[MBB]) << " | "
+ << stringifyCSRegSet(AnticIn[MBB]) << " | "
+ << stringifyCSRegSet(AnticOut[MBB]) << " | "
+ << stringifyCSRegSet(AvailIn[MBB]) << " | "
+ << stringifyCSRegSet(AvailOut[MBB]) << "\n";
+ }
+}
+
+void PEI::dumpSets1(MachineBasicBlock* MBB) {
+ if (MBB) {
+ DOUT << getBasicBlockName(MBB) << " | "
+ << stringifyCSRegSet(CSRUsed[MBB]) << " | "
+ << stringifyCSRegSet(AnticIn[MBB]) << " | "
+ << stringifyCSRegSet(AnticOut[MBB]) << " | "
+ << stringifyCSRegSet(AvailIn[MBB]) << " | "
+ << stringifyCSRegSet(AvailOut[MBB]) << " | "
+ << stringifyCSRegSet(CSRSave[MBB]) << " | "
+ << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
+ }
+}
+
+void PEI::dumpAllSets() {
+ for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock* MBB = MBBI;
+ dumpSets1(MBB);
+ }
+}
+
+void PEI::dumpSRSets() {
+ for (MachineFunction::iterator MBB = MF->begin(), E = MF->end();
+ MBB != E; ++MBB) {
+ if (! CSRSave[MBB].empty()) {
+ DOUT << "SAVE[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(CSRSave[MBB]);
+ if (CSRRestore[MBB].empty())
+ DOUT << "\n";
+ }
+ if (! CSRRestore[MBB].empty()) {
+ if (! CSRSave[MBB].empty())
+ DOUT << " ";
+ DOUT << "RESTORE[" << getBasicBlockName(MBB) << "] = "
+ << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
+ }
+ }
+}
+#endif
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp
new file mode 100644
index 0000000..2bc234f
--- /dev/null
+++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp
@@ -0,0 +1,2827 @@
+//===-- SimpleRegisterCoalescing.cpp - Register Coalescing ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple register coalescing pass that attempts to
+// aggressively coalesce every register copy that it can.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regcoalescing"
+#include "SimpleRegisterCoalescing.h"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/Value.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+STATISTIC(numJoins , "Number of interval joins performed");
+STATISTIC(numCrossRCs , "Number of cross class joins performed");
+STATISTIC(numCommutes , "Number of instructions commuted");
+STATISTIC(numExtends , "Number of copies extended");
+STATISTIC(NumReMats , "Number of instructions re-materialized");
+STATISTIC(numPeep , "Number of identity moves eliminated after coalescing");
+STATISTIC(numAborts , "Number of times interval joining aborted");
+STATISTIC(numDeadValNo, "Number of valno defs marked dead");
+
+char SimpleRegisterCoalescing::ID = 0;
+static cl::opt<bool>
+EnableJoining("join-liveintervals",
+ cl::desc("Coalesce copies (default=true)"),
+ cl::init(true));
+
+static cl::opt<bool>
+NewHeuristic("new-coalescer-heuristic",
+ cl::desc("Use new coalescer heuristic"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+CrossClassJoin("join-cross-class-copies",
+ cl::desc("Coalesce cross register class copies"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+PhysJoinTweak("tweak-phys-join-heuristics",
+ cl::desc("Tweak heuristics for joining phys reg with vr"),
+ cl::init(false), cl::Hidden);
+
+static RegisterPass<SimpleRegisterCoalescing>
+X("simple-register-coalescing", "Simple Register Coalescing");
+
+// Declare that we implement the RegisterCoalescer interface
+static RegisterAnalysisGroup<RegisterCoalescer, true/*The Default*/> V(X);
+
+const PassInfo *const llvm::SimpleRegisterCoalescingID = &X;
+
+void SimpleRegisterCoalescing::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ if (StrongPHIElim)
+ AU.addPreservedID(StrongPHIEliminationID);
+ else
+ AU.addPreservedID(PHIEliminationID);
+ AU.addPreservedID(TwoAddressInstructionPassID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA
+/// being the source and IntB being the dest, thus this defines a value number
+/// in IntB. If the source value number (in IntA) is defined by a copy from B,
+/// see if we can merge these two pieces of B into a single value number,
+/// eliminating a copy. For example:
+///
+/// A3 = B0
+/// ...
+/// B1 = A3 <- this copy
+///
+/// In this case, B0 can be extended to where the B1 copy lives, allowing the B1
+/// value number to be replaced with B0 (which simplifies the B liveinterval).
+///
+/// This returns true if an interval was modified.
+///
+bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA,
+ LiveInterval &IntB,
+ MachineInstr *CopyMI) {
+ unsigned CopyIdx = li_->getDefIndex(li_->getInstructionIndex(CopyMI));
+
+ // BValNo is a value number in B that is defined by a copy from A. 'B1' in
+ // the example above.
+ LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx);
+ assert(BLR != IntB.end() && "Live range not found!");
+ VNInfo *BValNo = BLR->valno;
+
+ // Get the location that B is defined at. Two options: either this value has
+ // an unknown definition point or it is defined at CopyIdx. If unknown, we
+ // can't process it.
+ if (!BValNo->copy) return false;
+ assert(BValNo->def == CopyIdx && "Copy doesn't define the value?");
+
+ // AValNo is the value number in A that defines the copy, A3 in the example.
+ LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyIdx-1);
+ assert(ALR != IntA.end() && "Live range not found!");
+ VNInfo *AValNo = ALR->valno;
+ // If it's re-defined by an early clobber somewhere in the live range, then
+ // it's not safe to eliminate the copy. FIXME: This is a temporary workaround.
+ // See PR3149:
+ // 172 %ECX<def> = MOV32rr %reg1039<kill>
+ // 180 INLINEASM <es:subl $5,$1
+ // sbbl $3,$0>, 10, %EAX<def>, 14, %ECX<earlyclobber,def>, 9, %EAX<kill>,
+ // 36, <fi#0>, 1, %reg0, 0, 9, %ECX<kill>, 36, <fi#1>, 1, %reg0, 0
+ // 188 %EAX<def> = MOV32rr %EAX<kill>
+ // 196 %ECX<def> = MOV32rr %ECX<kill>
+ // 204 %ECX<def> = MOV32rr %ECX<kill>
+ // 212 %EAX<def> = MOV32rr %EAX<kill>
+ // 220 %EAX<def> = MOV32rr %EAX
+ // 228 %reg1039<def> = MOV32rr %ECX<kill>
+ // The early clobber operand ties ECX input to the ECX def.
+ //
+ // The live interval of ECX is represented as this:
+ // %reg20,inf = [46,47:1)[174,230:0) 0@174-(230) 1@46-(47)
+ // The coalescer has no idea there was a def in the middle of [174,230].
+ if (AValNo->redefByEC)
+ return false;
+
+ // If AValNo is defined as a copy from IntB, we can potentially process this.
+ // Get the instruction that defines this value number.
+ unsigned SrcReg = li_->getVNInfoSourceReg(AValNo);
+ if (!SrcReg) return false; // Not defined by a copy.
+
+ // If the source register comes from an interval other than IntB, we can't
+ // handle this.
+ if (SrcReg != IntB.reg) return false;
+
+ // Get the LiveRange in IntB that this value number starts with.
+ LiveInterval::iterator ValLR = IntB.FindLiveRangeContaining(AValNo->def-1);
+ assert(ValLR != IntB.end() && "Live range not found!");
+
+ // Make sure that the end of the live range is inside the same block as
+ // CopyMI.
+ MachineInstr *ValLREndInst = li_->getInstructionFromIndex(ValLR->end-1);
+ if (!ValLREndInst ||
+ ValLREndInst->getParent() != CopyMI->getParent()) return false;
+
+ // Okay, we now know that ValLR ends in the same block in which the CopyMI
+ // live-range starts. If there are no intervening live ranges between them in
+ // IntB, we can merge them.
+ if (ValLR+1 != BLR) return false;
+
+ // If a live interval is a physical register, conservatively check if any
+ // of its sub-registers is overlapping the live interval of the virtual
+ // register. If so, do not coalesce.
+ if (TargetRegisterInfo::isPhysicalRegister(IntB.reg) &&
+ *tri_->getSubRegisters(IntB.reg)) {
+ for (const unsigned* SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR)
+ if (li_->hasInterval(*SR) && IntA.overlaps(li_->getInterval(*SR))) {
+ DOUT << "Interfere with sub-register ";
+ DEBUG(li_->getInterval(*SR).print(DOUT, tri_));
+ return false;
+ }
+ }
+
+ DOUT << "\nExtending: "; IntB.print(DOUT, tri_);
+
+ unsigned FillerStart = ValLR->end, FillerEnd = BLR->start;
+ // We are about to delete CopyMI, so we need to remove it as the 'instruction
+ // that defines this value #'. Update the valnum with the new defining
+ // instruction #.
+ BValNo->def = FillerStart;
+ BValNo->copy = NULL;
+
+ // Okay, we can merge them. We need to insert a new liverange:
+ // [ValLR.end, BLR.begin) of either value number, then we merge the
+ // two value numbers.
+ IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo));
+
+ // If the IntB live range is assigned to a physical register, and if that
+ // physreg has sub-registers, update their live intervals as well.
+ if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) {
+ for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) {
+ LiveInterval &SRLI = li_->getInterval(*SR);
+ SRLI.addRange(LiveRange(FillerStart, FillerEnd,
+ SRLI.getNextValue(FillerStart, 0, li_->getVNInfoAllocator())));
+ }
+ }
+
+ // Okay, merge "B1" into the same value number as "B0".
+ if (BValNo != ValLR->valno) {
+ IntB.addKills(ValLR->valno, BValNo->kills);
+ IntB.MergeValueNumberInto(BValNo, ValLR->valno);
+ }
+ DOUT << " result = "; IntB.print(DOUT, tri_);
+ DOUT << "\n";
+
+ // If the source instruction was killing the source register before the
+ // merge, unset the isKill marker given the live range has been extended.
+ int UIdx = ValLREndInst->findRegisterUseOperandIdx(IntB.reg, true);
+ if (UIdx != -1) {
+ ValLREndInst->getOperand(UIdx).setIsKill(false);
+ IntB.removeKill(ValLR->valno, FillerStart);
+ }
+
+ ++numExtends;
+ return true;
+}
+
+/// HasOtherReachingDefs - Return true if there are definitions of IntB
+/// other than BValNo val# that can reach uses of AValNo val# of IntA.
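+/// For illustration (hypothetical ranges): if AValNo covers [20,40) in IntA
+/// while IntB holds a range [10,30) under some val# other than BValNo, the
+/// two overlap at [20,30), so this returns true and the caller gives up.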
+bool SimpleRegisterCoalescing::HasOtherReachingDefs(LiveInterval &IntA,
+ LiveInterval &IntB,
+ VNInfo *AValNo,
+ VNInfo *BValNo) {
+ for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
+ AI != AE; ++AI) {
+ if (AI->valno != AValNo) continue;
+ LiveInterval::Ranges::iterator BI =
+ std::upper_bound(IntB.ranges.begin(), IntB.ranges.end(), AI->start);
+ if (BI != IntB.ranges.begin())
+ --BI;
+ for (; BI != IntB.ranges.end() && AI->end >= BI->start; ++BI) {
+ if (BI->valno == BValNo)
+ continue;
+ if (BI->start <= AI->start && BI->end > AI->start)
+ return true;
+ if (BI->start > AI->start && BI->start < AI->end)
+ return true;
+ }
+ }
+ return false;
+}
+
+/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with IntA
+/// being the source and IntB being the dest, thus this defines a value number
+/// in IntB. If the source value number (in IntA) is defined by a commutable
+/// instruction and its other operand is coalesced to the copy dest register,
+/// see if we can transform the copy into a noop by commuting the definition. For
+/// example,
+///
+/// A3 = op A2 B0<kill>
+/// ...
+/// B1 = A3 <- this copy
+/// ...
+/// = op A3 <- more uses
+///
+/// ==>
+///
+/// B2 = op B0 A2<kill>
+/// ...
+/// B1 = B2 <- now an identity copy
+/// ...
+/// = op B2 <- more uses
+///
+/// This returns true if an interval was modified.
+///
+bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA,
+ LiveInterval &IntB,
+ MachineInstr *CopyMI) {
+ unsigned CopyIdx = li_->getDefIndex(li_->getInstructionIndex(CopyMI));
+
+ // FIXME: For now, only eliminate the copy by commuting its def when the
+ // source register is a virtual register. We want to guard against cases
+ // where the copy is a back edge copy and commuting the def lengthens the
+ // live interval of the source register to the entire loop.
+ if (TargetRegisterInfo::isPhysicalRegister(IntA.reg))
+ return false;
+
+ // BValNo is a value number in B that is defined by a copy from A. 'B1' in
+ // the example above.
+ LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx);
+ assert(BLR != IntB.end() && "Live range not found!");
+ VNInfo *BValNo = BLR->valno;
+
+ // Get the location that B is defined at. Two options: either this value has
+ // an unknown definition point or it is defined at CopyIdx. If unknown, we
+ // can't process it.
+ if (!BValNo->copy) return false;
+ assert(BValNo->def == CopyIdx && "Copy doesn't define the value?");
+
+ // AValNo is the value number in A that defines the copy, A3 in the example.
+ LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyIdx-1);
+ assert(ALR != IntA.end() && "Live range not found!");
+ VNInfo *AValNo = ALR->valno;
+ // If other defs can reach uses of this def, then it's not safe to perform
+ // the optimization.
+ if (AValNo->def == ~0U || AValNo->def == ~1U || AValNo->hasPHIKill)
+ return false;
+ MachineInstr *DefMI = li_->getInstructionFromIndex(AValNo->def);
+ const TargetInstrDesc &TID = DefMI->getDesc();
+ unsigned NewDstIdx;
+ if (!TID.isCommutable() ||
+ !tii_->CommuteChangesDestination(DefMI, NewDstIdx))
+ return false;
+
+ MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
+ unsigned NewReg = NewDstMO.getReg();
+ if (NewReg != IntB.reg || !NewDstMO.isKill())
+ return false;
+
+ // Make sure there are no other definitions of IntB that would reach the
+ // uses which the new definition can reach.
+ if (HasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
+ return false;
+
+ // If some of the uses of IntA.reg are already coalesced away, return false;
+ // it's not possible to determine whether it's safe to perform the coalescing.
+ for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(IntA.reg),
+ UE = mri_->use_end(); UI != UE; ++UI) {
+ MachineInstr *UseMI = &*UI;
+ unsigned UseIdx = li_->getInstructionIndex(UseMI);
+ LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
+ if (ULR == IntA.end())
+ continue;
+ if (ULR->valno == AValNo && JoinedCopies.count(UseMI))
+ return false;
+ }
+
+ // At this point we have decided that it is legal to do this
+ // transformation. Start by commuting the instruction.
+ MachineBasicBlock *MBB = DefMI->getParent();
+ MachineInstr *NewMI = tii_->commuteInstruction(DefMI);
+ if (!NewMI)
+ return false;
+ if (NewMI != DefMI) {
+ li_->ReplaceMachineInstrInMaps(DefMI, NewMI);
+ MBB->insert(DefMI, NewMI);
+ MBB->erase(DefMI);
+ }
+ unsigned OpIdx = NewMI->findRegisterUseOperandIdx(IntA.reg, false);
+ NewMI->getOperand(OpIdx).setIsKill();
+
+ bool BHasPHIKill = BValNo->hasPHIKill;
+ SmallVector<VNInfo*, 4> BDeadValNos;
+ SmallVector<unsigned, 4> BKills;
+ std::map<unsigned, unsigned> BExtend;
+
+ // If ALR and BLR overlap and the end of BLR extends beyond the end of ALR, e.g.
+ // A = or A, B
+ // ...
+ // B = A
+ // ...
+ // C = A<kill>
+ // ...
+ // = B
+ //
+ // then do not add kills of A to the newly created B interval.
+ bool Extended = BLR->end > ALR->end && ALR->end != ALR->start;
+ if (Extended)
+ BExtend[ALR->end] = BLR->end;
+
+ // Update uses of IntA of the specific Val# with IntB.
+ bool BHasSubRegs = false;
+ if (TargetRegisterInfo::isPhysicalRegister(IntB.reg))
+ BHasSubRegs = *tri_->getSubRegisters(IntB.reg);
+ for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(IntA.reg),
+ UE = mri_->use_end(); UI != UE;) {
+ MachineOperand &UseMO = UI.getOperand();
+ MachineInstr *UseMI = &*UI;
+ ++UI;
+ if (JoinedCopies.count(UseMI))
+ continue;
+ unsigned UseIdx = li_->getInstructionIndex(UseMI);
+ LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
+ if (ULR == IntA.end() || ULR->valno != AValNo)
+ continue;
+ UseMO.setReg(NewReg);
+ if (UseMI == CopyMI)
+ continue;
+ if (UseMO.isKill()) {
+ if (Extended)
+ UseMO.setIsKill(false);
+ else
+ BKills.push_back(li_->getUseIndex(UseIdx)+1);
+ }
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (!tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx))
+ continue;
+ if (DstReg == IntB.reg) {
+ // This copy will become a noop. If it's defining a new val#,
+ // remove that val# as well. However this live range is being
+ // extended to the end of the existing live range defined by the copy.
+ unsigned DefIdx = li_->getDefIndex(UseIdx);
+ const LiveRange *DLR = IntB.getLiveRangeContaining(DefIdx);
+ BHasPHIKill |= DLR->valno->hasPHIKill;
+ assert(DLR->valno->def == DefIdx);
+ BDeadValNos.push_back(DLR->valno);
+ BExtend[DLR->start] = DLR->end;
+ JoinedCopies.insert(UseMI);
+ // If this is a kill but it's going to be removed, the last use
+ // of the same val# is the new kill.
+ if (UseMO.isKill())
+ BKills.pop_back();
+ }
+ }
+
+ // We need to insert a new liverange: [ALR.start, LastUse). We may be able to
+ // simply extend BLR if CopyMI doesn't end the range.
+ DOUT << "\nExtending: "; IntB.print(DOUT, tri_);
+
+ // Remove val#'s defined by copies that will be coalesced away.
+ for (unsigned i = 0, e = BDeadValNos.size(); i != e; ++i) {
+ VNInfo *DeadVNI = BDeadValNos[i];
+ if (BHasSubRegs) {
+ for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) {
+ LiveInterval &SRLI = li_->getInterval(*SR);
+ const LiveRange *SRLR = SRLI.getLiveRangeContaining(DeadVNI->def);
+ SRLI.removeValNo(SRLR->valno);
+ }
+ }
+ IntB.removeValNo(BDeadValNos[i]);
+ }
+
+ // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition
+ // is updated. Kills are also updated.
+ VNInfo *ValNo = BValNo;
+ ValNo->def = AValNo->def;
+ ValNo->copy = NULL;
+ for (unsigned j = 0, ee = ValNo->kills.size(); j != ee; ++j) {
+ unsigned Kill = ValNo->kills[j];
+ if (Kill != BLR->end)
+ BKills.push_back(Kill);
+ }
+ ValNo->kills.clear();
+ for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
+ AI != AE; ++AI) {
+ if (AI->valno != AValNo) continue;
+ unsigned End = AI->end;
+ std::map<unsigned, unsigned>::iterator EI = BExtend.find(End);
+ if (EI != BExtend.end())
+ End = EI->second;
+ IntB.addRange(LiveRange(AI->start, End, ValNo));
+
+ // If the IntB live range is assigned to a physical register, and if that
+ // physreg has sub-registers, update their live intervals as well.
+ if (BHasSubRegs) {
+ for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) {
+ LiveInterval &SRLI = li_->getInterval(*SR);
+ SRLI.MergeInClobberRange(AI->start, End, li_->getVNInfoAllocator());
+ }
+ }
+ }
+ IntB.addKills(ValNo, BKills);
+ ValNo->hasPHIKill = BHasPHIKill;
+
+ DOUT << " result = "; IntB.print(DOUT, tri_);
+ DOUT << "\n";
+
+ DOUT << "\nShortening: "; IntA.print(DOUT, tri_);
+ IntA.removeValNo(AValNo);
+ DOUT << " result = "; IntA.print(DOUT, tri_);
+ DOUT << "\n";
+
+ ++numCommutes;
+ return true;
+}
+
+/// isSameOrFallThroughBB - Return true if MBB == SuccMBB or MBB simply
+/// falls through to SuccMBB.
+static bool isSameOrFallThroughBB(MachineBasicBlock *MBB,
+ MachineBasicBlock *SuccMBB,
+ const TargetInstrInfo *tii_) {
+ if (MBB == SuccMBB)
+ return true;
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ SmallVector<MachineOperand, 4> Cond;
+ return !tii_->AnalyzeBranch(*MBB, TBB, FBB, Cond) && !TBB && !FBB &&
+ MBB->isSuccessor(SuccMBB);
+}
+
+/// removeRange - Wrapper for LiveInterval::removeRange. This removes a range
+/// from a physical register live interval as well as from the live intervals
+/// of its sub-registers.
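+/// For example (hypothetical): removing [Start,End) from %EAX's interval
+/// also trims whatever part of [Start,End) is covered by the intervals of
+/// %AX, %AH and %AL.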
+static void removeRange(LiveInterval &li, unsigned Start, unsigned End,
+ LiveIntervals *li_, const TargetRegisterInfo *tri_) {
+ li.removeRange(Start, End, true);
+ if (TargetRegisterInfo::isPhysicalRegister(li.reg)) {
+ for (const unsigned* SR = tri_->getSubRegisters(li.reg); *SR; ++SR) {
+ if (!li_->hasInterval(*SR))
+ continue;
+ LiveInterval &sli = li_->getInterval(*SR);
+ unsigned RemoveEnd = Start;
+ while (RemoveEnd != End) {
+ LiveInterval::iterator LR = sli.FindLiveRangeContaining(Start);
+ if (LR == sli.end())
+ break;
+ RemoveEnd = (LR->end < End) ? LR->end : End;
+ sli.removeRange(Start, RemoveEnd, true);
+ Start = RemoveEnd;
+ }
+ }
+ }
+}
+
+/// TrimLiveIntervalToLastUse - If there is a last use in the same basic block
+/// as the copy instruction, trim the live interval to the last use and return
+/// true.
+bool
+SimpleRegisterCoalescing::TrimLiveIntervalToLastUse(unsigned CopyIdx,
+ MachineBasicBlock *CopyMBB,
+ LiveInterval &li,
+ const LiveRange *LR) {
+ unsigned MBBStart = li_->getMBBStartIdx(CopyMBB);
+ unsigned LastUseIdx;
+ MachineOperand *LastUse = lastRegisterUse(LR->start, CopyIdx-1, li.reg,
+ LastUseIdx);
+ if (LastUse) {
+ MachineInstr *LastUseMI = LastUse->getParent();
+ if (!isSameOrFallThroughBB(LastUseMI->getParent(), CopyMBB, tii_)) {
+ // r1024 = op
+ // ...
+ // BB1:
+ // = r1024
+ //
+ // BB2:
+ // r1025<dead> = r1024<kill>
+ if (MBBStart < LR->end)
+ removeRange(li, MBBStart, LR->end, li_, tri_);
+ return true;
+ }
+
+ // There are uses before the copy, just shorten the live range to the end
+ // of the last use.
+ LastUse->setIsKill();
+ removeRange(li, li_->getDefIndex(LastUseIdx), LR->end, li_, tri_);
+ li.addKill(LR->valno, LastUseIdx+1);
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (tii_->isMoveInstr(*LastUseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) &&
+ DstReg == li.reg) {
+ // Last use is itself an identity copy.
+ int DeadIdx = LastUseMI->findRegisterDefOperandIdx(li.reg, false, tri_);
+ LastUseMI->getOperand(DeadIdx).setIsDead();
+ }
+ return true;
+ }
+
+ // Is it livein?
+ if (LR->start <= MBBStart && LR->end > MBBStart) {
+ if (LR->start == 0) {
+ assert(TargetRegisterInfo::isPhysicalRegister(li.reg));
+ // Live-in to the function but dead. Remove it from entry live-in set.
+ mf_->begin()->removeLiveIn(li.reg);
+ }
+ // FIXME: Shorten intervals in BBs that reach this BB.
+ }
+
+ return false;
+}
+
+/// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
+/// computation, replace the copy by rematerializing the definition.
+bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt,
+ unsigned DstReg,
+ MachineInstr *CopyMI) {
+ unsigned CopyIdx = li_->getUseIndex(li_->getInstructionIndex(CopyMI));
+ LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx);
+ assert(SrcLR != SrcInt.end() && "Live range not found!");
+ VNInfo *ValNo = SrcLR->valno;
+ // If other defs can reach uses of this def, then it's not safe to perform
+ // the optimization.
+ if (ValNo->def == ~0U || ValNo->def == ~1U || ValNo->hasPHIKill)
+ return false;
+ MachineInstr *DefMI = li_->getInstructionFromIndex(ValNo->def);
+ const TargetInstrDesc &TID = DefMI->getDesc();
+ if (!TID.isAsCheapAsAMove())
+ return false;
+ if (!DefMI->getDesc().isRematerializable() ||
+ !tii_->isTriviallyReMaterializable(DefMI))
+ return false;
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(tii_, SawStore))
+ return false;
+
+ unsigned DefIdx = li_->getDefIndex(CopyIdx);
+ const LiveRange *DLR= li_->getInterval(DstReg).getLiveRangeContaining(DefIdx);
+ DLR->valno->copy = NULL;
+ // Don't forget to update sub-register intervals.
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ for (const unsigned* SR = tri_->getSubRegisters(DstReg); *SR; ++SR) {
+ if (!li_->hasInterval(*SR))
+ continue;
+ DLR = li_->getInterval(*SR).getLiveRangeContaining(DefIdx);
+ if (DLR && DLR->valno->copy == CopyMI)
+ DLR->valno->copy = NULL;
+ }
+ }
+
+ // If the copy kills the source register, find the last use and propagate
+ // the kill.
+ bool checkForDeadDef = false;
+ MachineBasicBlock *MBB = CopyMI->getParent();
+ if (CopyMI->killsRegister(SrcInt.reg))
+ if (!TrimLiveIntervalToLastUse(CopyIdx, MBB, SrcInt, SrcLR)) {
+ checkForDeadDef = true;
+ }
+
+ MachineBasicBlock::iterator MII = next(MachineBasicBlock::iterator(CopyMI));
+ CopyMI->removeFromParent();
+ tii_->reMaterialize(*MBB, MII, DstReg, DefMI);
+ MachineInstr *NewMI = prior(MII);
+
+ if (checkForDeadDef) {
+ // PR4090 fix: Trim interval failed because there was no use of the
+ // source interval in this MBB. If the def is in this MBB too then we
+ // should mark it dead:
+ if (DefMI->getParent() == MBB) {
+ DefMI->addRegisterDead(SrcInt.reg, tri_);
+ SrcLR->end = SrcLR->start + 1;
+ }
+ }
+
+ // CopyMI may have implicit operands, transfer them over to the newly
+ // rematerialized instruction. And update implicit def interval valnos.
+ for (unsigned i = CopyMI->getDesc().getNumOperands(),
+ e = CopyMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = CopyMI->getOperand(i);
+ if (MO.isReg() && MO.isImplicit())
+ NewMI->addOperand(MO);
+ if (MO.isDef() && li_->hasInterval(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+ DLR = li_->getInterval(Reg).getLiveRangeContaining(DefIdx);
+ if (DLR && DLR->valno->copy == CopyMI)
+ DLR->valno->copy = NULL;
+ }
+ }
+
+ li_->ReplaceMachineInstrInMaps(CopyMI, NewMI);
+ MBB->getParent()->DeleteMachineInstr(CopyMI);
+ ReMatCopies.insert(CopyMI);
+ ReMatDefs.insert(DefMI);
+ ++NumReMats;
+ return true;
+}
+
+/// isBackEdgeCopy - Returns true if CopyMI is a back edge copy.
+///
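+/// For illustration (hypothetical loop): a copy in the loop latch whose
+/// dest val# has a single kill at the index just past the latch and which
+/// carries a PHI kill is treated as a back edge copy.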
+bool SimpleRegisterCoalescing::isBackEdgeCopy(MachineInstr *CopyMI,
+ unsigned DstReg) const {
+ MachineBasicBlock *MBB = CopyMI->getParent();
+ const MachineLoop *L = loopInfo->getLoopFor(MBB);
+ if (!L)
+ return false;
+ if (MBB != L->getLoopLatch())
+ return false;
+
+ LiveInterval &LI = li_->getInterval(DstReg);
+ unsigned DefIdx = li_->getInstructionIndex(CopyMI);
+ LiveInterval::const_iterator DstLR =
+ LI.FindLiveRangeContaining(li_->getDefIndex(DefIdx));
+ if (DstLR == LI.end())
+ return false;
+ unsigned KillIdx = li_->getMBBEndIdx(MBB) + 1;
+ if (DstLR->valno->kills.size() == 1 &&
+ DstLR->valno->kills[0] == KillIdx && DstLR->valno->hasPHIKill)
+ return true;
+ return false;
+}
+
+/// UpdateRegDefsUses - Replace all defs and uses of SrcReg with DstReg and
+/// update the subregister number if it is not zero. If DstReg is a
+/// physical register and the existing subregister number of the def / use
+/// being updated is not zero, make sure to set it to the correct physical
+/// subregister.
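+/// For example (hypothetical, following the x86 convention noted below): a
+/// use of %reg1024:1 rewritten to physical %EAX becomes a direct use of
+/// %AL, with the sub-register index cleared.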
+void
+SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg,
+ unsigned SubIdx) {
+ bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ if (DstIsPhys && SubIdx) {
+ // Figure out the real physical register we are updating with.
+ DstReg = tri_->getSubReg(DstReg, SubIdx);
+ SubIdx = 0;
+ }
+
+ for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(SrcReg),
+ E = mri_->reg_end(); I != E; ) {
+ MachineOperand &O = I.getOperand();
+ MachineInstr *UseMI = &*I;
+ ++I;
+ unsigned OldSubIdx = O.getSubReg();
+ if (DstIsPhys) {
+ unsigned UseDstReg = DstReg;
+ if (OldSubIdx)
+ UseDstReg = tri_->getSubReg(DstReg, OldSubIdx);
+
+ unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx;
+ if (tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg,
+ CopySrcSubIdx, CopyDstSubIdx) &&
+ CopySrcReg != CopyDstReg &&
+ CopySrcReg == SrcReg && CopyDstReg != UseDstReg) {
+ // If the use is a copy and it won't be coalesced away, and its source
+ // is defined by a trivial computation, try to rematerialize it instead.
+ if (ReMaterializeTrivialDef(li_->getInterval(SrcReg), CopyDstReg,UseMI))
+ continue;
+ }
+
+ O.setReg(UseDstReg);
+ O.setSubReg(0);
+ continue;
+ }
+
+ // Sub-register indexes go from small to large. e.g.
+ // RAX: 1 -> AL, 2 -> AX, 3 -> EAX
+ // EAX: 1 -> AL, 2 -> AX
+ // So RAX's sub-register 2 is AX, RAX's sub-register 3 is EAX, whose
+ // sub-register 2 is also AX.
+ if (SubIdx && OldSubIdx && SubIdx != OldSubIdx)
+ assert(OldSubIdx < SubIdx && "Conflicting sub-register index!");
+ else if (SubIdx)
+ O.setSubReg(SubIdx);
+ // Remove would-be duplicated kill marker.
+ if (O.isKill() && UseMI->killsRegister(DstReg))
+ O.setIsKill(false);
+ O.setReg(DstReg);
+
+ // After updating the operand, check if the machine instruction has
+ // become a copy. If so, update its val# information.
+ const TargetInstrDesc &TID = UseMI->getDesc();
+ unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx;
+ if (TID.getNumDefs() == 1 && TID.getNumOperands() > 2 &&
+ tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg,
+ CopySrcSubIdx, CopyDstSubIdx) &&
+ CopySrcReg != CopyDstReg &&
+ (TargetRegisterInfo::isVirtualRegister(CopyDstReg) ||
+ allocatableRegs_[CopyDstReg])) {
+ LiveInterval &LI = li_->getInterval(CopyDstReg);
+ unsigned DefIdx = li_->getDefIndex(li_->getInstructionIndex(UseMI));
+ const LiveRange *DLR = LI.getLiveRangeContaining(DefIdx);
+ if (DLR->valno->def == DefIdx)
+ DLR->valno->copy = UseMI;
+ }
+ }
+}
+
+/// RemoveDeadImpDef - Remove implicit_def instructions which are "re-defining"
+/// registers due to insert_subreg coalescing. e.g.
+/// r1024 = op
+/// r1025 = implicit_def
+/// r1025 = insert_subreg r1025, r1024
+/// = op r1025
+/// =>
+/// r1025 = op
+/// r1025 = implicit_def
+/// r1025 = insert_subreg r1025, r1025
+/// = op r1025
+void
+SimpleRegisterCoalescing::RemoveDeadImpDef(unsigned Reg, LiveInterval &LI) {
+ for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(Reg),
+ E = mri_->reg_end(); I != E; ) {
+ MachineOperand &O = I.getOperand();
+ MachineInstr *DefMI = &*I;
+ ++I;
+ if (!O.isDef())
+ continue;
+ if (DefMI->getOpcode() != TargetInstrInfo::IMPLICIT_DEF)
+ continue;
+ if (!LI.liveBeforeAndAt(li_->getInstructionIndex(DefMI)))
+ continue;
+ li_->RemoveMachineInstrFromMaps(DefMI);
+ DefMI->eraseFromParent();
+ }
+}
+
+/// RemoveUnnecessaryKills - Remove kill markers that are no longer accurate
+/// due to live range lengthening as the result of coalescing.
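+/// e.g. (hypothetical) a use of %reg1024 that was its final use before
+/// coalescing may no longer end the merged interval, so its kill marker
+/// must be cleared.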
+void SimpleRegisterCoalescing::RemoveUnnecessaryKills(unsigned Reg,
+ LiveInterval &LI) {
+ for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(Reg),
+ UE = mri_->use_end(); UI != UE; ++UI) {
+ MachineOperand &UseMO = UI.getOperand();
+ if (UseMO.isKill()) {
+ MachineInstr *UseMI = UseMO.getParent();
+ unsigned UseIdx = li_->getUseIndex(li_->getInstructionIndex(UseMI));
+ const LiveRange *LR = LI.getLiveRangeContaining(UseIdx);
+ if (!LR || !LI.isKill(LR->valno, UseIdx+1))
+ UseMO.setIsKill(false);
+ }
+ }
+}
+
+/// removeIntervalIfEmpty - Check if the live interval of a physical register
+/// is empty; if so, remove it and also remove the empty intervals of its
+/// sub-registers. Return true if the live interval is removed.
+static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *li_,
+ const TargetRegisterInfo *tri_) {
+ if (li.empty()) {
+ if (TargetRegisterInfo::isPhysicalRegister(li.reg))
+ for (const unsigned* SR = tri_->getSubRegisters(li.reg); *SR; ++SR) {
+ if (!li_->hasInterval(*SR))
+ continue;
+ LiveInterval &sli = li_->getInterval(*SR);
+ if (sli.empty())
+ li_->removeInterval(*SR);
+ }
+ li_->removeInterval(li.reg);
+ return true;
+ }
+ return false;
+}
+
+/// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy.
+/// Return true if live interval is removed.
+bool SimpleRegisterCoalescing::ShortenDeadCopyLiveRange(LiveInterval &li,
+ MachineInstr *CopyMI) {
+ unsigned CopyIdx = li_->getInstructionIndex(CopyMI);
+ LiveInterval::iterator MLR =
+ li.FindLiveRangeContaining(li_->getDefIndex(CopyIdx));
+ if (MLR == li.end())
+ return false; // Already removed by ShortenDeadCopySrcLiveRange.
+ unsigned RemoveStart = MLR->start;
+ unsigned RemoveEnd = MLR->end;
+ // Remove the liverange that's defined by this.
+ if (RemoveEnd == li_->getDefIndex(CopyIdx)+1) {
+ removeRange(li, RemoveStart, RemoveEnd, li_, tri_);
+ return removeIntervalIfEmpty(li, li_, tri_);
+ }
+ return false;
+}
+
+/// RemoveDeadDef - If a def of a live interval is now determined dead, remove
+/// the val# it defines. If the live interval becomes empty, remove it as well.
+bool SimpleRegisterCoalescing::RemoveDeadDef(LiveInterval &li,
+ MachineInstr *DefMI) {
+ unsigned DefIdx = li_->getDefIndex(li_->getInstructionIndex(DefMI));
+ LiveInterval::iterator MLR = li.FindLiveRangeContaining(DefIdx);
+ if (DefIdx != MLR->valno->def)
+ return false;
+ li.removeValNo(MLR->valno);
+ return removeIntervalIfEmpty(li, li_, tri_);
+}
+
+/// PropagateDeadness - Propagate the dead marker to the instruction which
+/// defines the val#.
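+/// e.g. (hypothetical) if "%reg1024 = op" was kept alive only by the dead
+/// copy, its def operand is marked <dead> and LRStart is bumped so the dead
+/// def keeps a single-cycle interval.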
+static void PropagateDeadness(LiveInterval &li, MachineInstr *CopyMI,
+ unsigned &LRStart, LiveIntervals *li_,
+ const TargetRegisterInfo* tri_) {
+ MachineInstr *DefMI =
+ li_->getInstructionFromIndex(li_->getDefIndex(LRStart));
+ if (DefMI && DefMI != CopyMI) {
+ int DeadIdx = DefMI->findRegisterDefOperandIdx(li.reg, false, tri_);
+ if (DeadIdx != -1) {
+ DefMI->getOperand(DeadIdx).setIsDead();
+ // A dead def should have a single cycle interval.
+ ++LRStart;
+ }
+ }
+}
+
+/// ShortenDeadCopySrcLiveRange - Shorten a live range as it's artificially
+/// extended by a dead copy. Mark the last use (if any) of the val# as a kill,
+/// as it ends the live range there. If there isn't another use, then this
+/// live range is dead. Return true if the live interval is removed.
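+/// For example (hypothetical): once "%reg1025 = %reg1024" is known to be
+/// dead, %reg1024's range is cut back to its last remaining use, or removed
+/// entirely if the copy was its only user.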
+bool
+SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li,
+ MachineInstr *CopyMI) {
+ unsigned CopyIdx = li_->getInstructionIndex(CopyMI);
+ if (CopyIdx == 0) {
+ // FIXME: special case: function live-in. This could become the general
+ // case if the first instruction index started at a value > 0.
+ assert(TargetRegisterInfo::isPhysicalRegister(li.reg));
+ // Live-in to the function but dead. Remove it from entry live-in set.
+ if (mf_->begin()->isLiveIn(li.reg))
+ mf_->begin()->removeLiveIn(li.reg);
+ const LiveRange *LR = li.getLiveRangeContaining(CopyIdx);
+ removeRange(li, LR->start, LR->end, li_, tri_);
+ return removeIntervalIfEmpty(li, li_, tri_);
+ }
+
+ LiveInterval::iterator LR = li.FindLiveRangeContaining(CopyIdx-1);
+ if (LR == li.end())
+ // Livein but defined by a phi.
+ return false;
+
+ unsigned RemoveStart = LR->start;
+ unsigned RemoveEnd = li_->getDefIndex(CopyIdx)+1;
+ if (LR->end > RemoveEnd)
+ // More uses past this copy? Nothing to do.
+ return false;
+
+ // If there is a last use in the same bb, we can't remove the live range.
+ // Shorten the live interval and return.
+ MachineBasicBlock *CopyMBB = CopyMI->getParent();
+ if (TrimLiveIntervalToLastUse(CopyIdx, CopyMBB, li, LR))
+ return false;
+
+ MachineBasicBlock *StartMBB = li_->getMBBFromIndex(RemoveStart);
+ if (!isSameOrFallThroughBB(StartMBB, CopyMBB, tii_))
+ // If the live range starts in another mbb and the copy mbb is not a fall
+ // through mbb, then we can only cut the range from the beginning of the
+ // copy mbb.
+ RemoveStart = li_->getMBBStartIdx(CopyMBB) + 1;
+
+ if (LR->valno->def == RemoveStart) {
+ // If the def MI defines the val# and this copy is the only kill of the
+ // val#, then propagate the dead marker.
+ if (li.isOnlyLROfValNo(LR)) {
+ PropagateDeadness(li, CopyMI, RemoveStart, li_, tri_);
+ ++numDeadValNo;
+ }
+ if (li.isKill(LR->valno, RemoveEnd))
+ li.removeKill(LR->valno, RemoveEnd);
+ }
+
+ removeRange(li, RemoveStart, RemoveEnd, li_, tri_);
+ return removeIntervalIfEmpty(li, li_, tri_);
+}
+
+/// CanCoalesceWithImpDef - Returns true if the specified copy instruction
+/// from an implicit def to another register can be coalesced away.
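+/// e.g. (hypothetical) with "%reg1025 = IMPLICIT_DEF" feeding
+/// "%reg1024 = %reg1025<kill>", the copy can be coalesced away provided
+/// every use of the copy's val# is itself a copy or a matching
+/// insert_subreg.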
+bool SimpleRegisterCoalescing::CanCoalesceWithImpDef(MachineInstr *CopyMI,
+ LiveInterval &li,
+ LiveInterval &ImpLi) const{
+ if (!CopyMI->killsRegister(ImpLi.reg))
+ return false;
+ unsigned CopyIdx = li_->getDefIndex(li_->getInstructionIndex(CopyMI));
+ LiveInterval::iterator LR = li.FindLiveRangeContaining(CopyIdx);
+ if (LR == li.end())
+ return false;
+ if (LR->valno->hasPHIKill)
+ return false;
+ if (LR->valno->def != CopyIdx)
+ return false;
+ // Make sure all of the val#'s uses are copies.
+ for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(li.reg),
+ UE = mri_->use_end(); UI != UE;) {
+ MachineInstr *UseMI = &*UI;
+ ++UI;
+ if (JoinedCopies.count(UseMI))
+ continue;
+ unsigned UseIdx = li_->getUseIndex(li_->getInstructionIndex(UseMI));
+ LiveInterval::iterator ULR = li.FindLiveRangeContaining(UseIdx);
+ if (ULR == li.end() || ULR->valno != LR->valno)
+ continue;
+ // If the use is not a copy, then it's not safe to coalesce the move.
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (!tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) {
+ if (UseMI->getOpcode() == TargetInstrInfo::INSERT_SUBREG &&
+ UseMI->getOperand(1).getReg() == li.reg)
+ continue;
+ return false;
+ }
+ }
+ return true;
+}
+
+
+/// RemoveCopiesFromValNo - The specified value# is defined by an implicit
+/// def and it is being removed. Turn all copies from this value# into
+/// identity copies so they will be removed.
+void SimpleRegisterCoalescing::RemoveCopiesFromValNo(LiveInterval &li,
+ VNInfo *VNI) {
+ SmallVector<MachineInstr*, 4> ImpDefs;
+ MachineOperand *LastUse = NULL;
+ unsigned LastUseIdx = li_->getUseIndex(VNI->def);
+ for (MachineRegisterInfo::reg_iterator RI = mri_->reg_begin(li.reg),
+ RE = mri_->reg_end(); RI != RE;) {
+ MachineOperand *MO = &RI.getOperand();
+ MachineInstr *MI = &*RI;
+ ++RI;
+ if (MO->isDef()) {
+ if (MI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF) {
+ ImpDefs.push_back(MI);
+ }
+ continue;
+ }
+ if (JoinedCopies.count(MI))
+ continue;
+ unsigned UseIdx = li_->getUseIndex(li_->getInstructionIndex(MI));
+ LiveInterval::iterator ULR = li.FindLiveRangeContaining(UseIdx);
+ if (ULR == li.end() || ULR->valno != VNI)
+ continue;
+ // If the use is a copy, turn it into an identity copy.
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) &&
+ SrcReg == li.reg) {
+ // Each use MI may have multiple uses of this register. Change them all.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == li.reg)
+ MO.setReg(DstReg);
+ }
+ JoinedCopies.insert(MI);
+ } else if (UseIdx > LastUseIdx) {
+ LastUseIdx = UseIdx;
+ LastUse = MO;
+ }
+ }
+ if (LastUse) {
+ LastUse->setIsKill();
+ li.addKill(VNI, LastUseIdx+1);
+ } else {
+ // Remove dead implicit_def's.
+ while (!ImpDefs.empty()) {
+ MachineInstr *ImpDef = ImpDefs.back();
+ ImpDefs.pop_back();
+ li_->RemoveMachineInstrFromMaps(ImpDef);
+ ImpDef->eraseFromParent();
+ }
+ }
+}
+
+/// isWinToJoinVRWithSrcPhysReg - Return true if it's worthwhile to join a
+/// virtual destination register with a physical source register.
+bool
+SimpleRegisterCoalescing::isWinToJoinVRWithSrcPhysReg(MachineInstr *CopyMI,
+ MachineBasicBlock *CopyMBB,
+ LiveInterval &DstInt,
+ LiveInterval &SrcInt) {
+ // If the virtual register live interval is long but has low use density,
+ // do not join them; instead mark the physical register as its allocation
+ // preference.
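+ // Worked example for the check below (hypothetical numbers): a class with
+ // 8 allocatable registers gives Threshold = 16; an interval spanning ~100
+ // instructions with only 3 uses has density 0.03 < 1/16, so we decline.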
+ const TargetRegisterClass *RC = mri_->getRegClass(DstInt.reg);
+ unsigned Threshold = allocatableRCRegs_[RC].count() * 2;
+ unsigned Length = li_->getApproximateInstructionCount(DstInt);
+ if (Length > Threshold &&
+ (((float)std::distance(mri_->use_begin(DstInt.reg),
+ mri_->use_end()) / Length) < (1.0 / Threshold)))
+ return false;
+
+ // If the virtual register live interval extends into a loop, turn down
+ // aggressiveness.
+ unsigned CopyIdx = li_->getDefIndex(li_->getInstructionIndex(CopyMI));
+ const MachineLoop *L = loopInfo->getLoopFor(CopyMBB);
+ if (!L) {
+ // Let's see if the virtual register live interval extends into the loop.
+ LiveInterval::iterator DLR = DstInt.FindLiveRangeContaining(CopyIdx);
+ assert(DLR != DstInt.end() && "Live range not found!");
+ DLR = DstInt.FindLiveRangeContaining(DLR->end+1);
+ if (DLR != DstInt.end()) {
+ CopyMBB = li_->getMBBFromIndex(DLR->start);
+ L = loopInfo->getLoopFor(CopyMBB);
+ }
+ }
+
+ if (!L || Length <= Threshold)
+ return true;
+
+ unsigned UseIdx = li_->getUseIndex(CopyIdx);
+ LiveInterval::iterator SLR = SrcInt.FindLiveRangeContaining(UseIdx);
+ MachineBasicBlock *SMBB = li_->getMBBFromIndex(SLR->start);
+ if (loopInfo->getLoopFor(SMBB) != L) {
+ if (!loopInfo->isLoopHeader(CopyMBB))
+ return false;
+ // If vr's live interval extends past the loop header, do not join.
+ for (MachineBasicBlock::succ_iterator SI = CopyMBB->succ_begin(),
+ SE = CopyMBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock *SuccMBB = *SI;
+ if (SuccMBB == CopyMBB)
+ continue;
+ if (DstInt.overlaps(li_->getMBBStartIdx(SuccMBB),
+ li_->getMBBEndIdx(SuccMBB)+1))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// isWinToJoinVRWithDstPhysReg - Return true if it's worthwhile to join a
+/// copy from a virtual source register to a physical destination register.
+bool
+SimpleRegisterCoalescing::isWinToJoinVRWithDstPhysReg(MachineInstr *CopyMI,
+ MachineBasicBlock *CopyMBB,
+ LiveInterval &DstInt,
+ LiveInterval &SrcInt) {
+ // If the virtual register live interval is long but has low use density,
+ // do not join them; instead mark the physical register as its allocation
+ // preference.
+ const TargetRegisterClass *RC = mri_->getRegClass(SrcInt.reg);
+ unsigned Threshold = allocatableRCRegs_[RC].count() * 2;
+ unsigned Length = li_->getApproximateInstructionCount(SrcInt);
+ if (Length > Threshold &&
+ (((float)std::distance(mri_->use_begin(SrcInt.reg),
+ mri_->use_end()) / Length) < (1.0 / Threshold)))
+ return false;
+
+ if (SrcInt.empty())
+ // Must be implicit_def.
+ return false;
+
+ // If the virtual register live interval is defined in or crosses a loop,
+ // turn down aggressiveness.
+ unsigned CopyIdx = li_->getDefIndex(li_->getInstructionIndex(CopyMI));
+ unsigned UseIdx = li_->getUseIndex(CopyIdx);
+ LiveInterval::iterator SLR = SrcInt.FindLiveRangeContaining(UseIdx);
+ assert(SLR != SrcInt.end() && "Live range not found!");
+ SLR = SrcInt.FindLiveRangeContaining(SLR->start-1);
+ if (SLR == SrcInt.end())
+ return true;
+ MachineBasicBlock *SMBB = li_->getMBBFromIndex(SLR->start);
+ const MachineLoop *L = loopInfo->getLoopFor(SMBB);
+
+ if (!L || Length <= Threshold)
+ return true;
+
+ if (loopInfo->getLoopFor(CopyMBB) != L) {
+ if (SMBB != L->getLoopLatch())
+ return false;
+ // If vr's live interval is extended from before the loop latch, do not
+ // join.
+ for (MachineBasicBlock::pred_iterator PI = SMBB->pred_begin(),
+ PE = SMBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock *PredMBB = *PI;
+ if (PredMBB == SMBB)
+ continue;
+ if (SrcInt.overlaps(li_->getMBBStartIdx(PredMBB),
+ li_->getMBBEndIdx(PredMBB)+1))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// isWinToJoinCrossClass - Return true if it's profitable to coalesce
+/// two virtual registers from different register classes.
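+/// For illustration (hypothetical counts): once either interval exceeds the
+/// threshold, a SmallReg with 2 uses over 40 instructions (density 0.05)
+/// against a LargeReg with 8 uses over 50 (density 0.16) is rejected.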
+bool
+SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned LargeReg,
+ unsigned SmallReg,
+ unsigned Threshold) {
+ // Make sure the intervals are *short*.
+ LiveInterval &LargeInt = li_->getInterval(LargeReg);
+ LiveInterval &SmallInt = li_->getInterval(SmallReg);
+ unsigned LargeSize = li_->getApproximateInstructionCount(LargeInt);
+ unsigned SmallSize = li_->getApproximateInstructionCount(SmallInt);
+ if (SmallSize > Threshold || LargeSize > Threshold)
+ if ((float)std::distance(mri_->use_begin(SmallReg),
+ mri_->use_end()) / SmallSize <
+ (float)std::distance(mri_->use_begin(LargeReg),
+ mri_->use_end()) / LargeSize)
+ return false;
+ return true;
+}
+
+/// HasIncompatibleSubRegDefUse - If we are trying to coalesce a virtual
+/// register with a physical register, check if any of the virtual register's
+/// operands is a sub-register use or def. If so, make sure it won't result
+/// in an illegal extract_subreg or insert_subreg instruction. e.g.
+/// vr1024 = extract_subreg vr1025, 1
+/// ...
+/// vr1024 = mov8rr AH
+/// If vr1024 is coalesced with AH, the extract_subreg is now illegal since
+/// AH does not have a super-reg whose sub-register 1 is AH.
+bool
+SimpleRegisterCoalescing::HasIncompatibleSubRegDefUse(MachineInstr *CopyMI,
+ unsigned VirtReg,
+ unsigned PhysReg) {
+ for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(VirtReg),
+ E = mri_->reg_end(); I != E; ++I) {
+ MachineOperand &O = I.getOperand();
+ MachineInstr *MI = &*I;
+ if (MI == CopyMI || JoinedCopies.count(MI))
+ continue;
+ unsigned SubIdx = O.getSubReg();
+ if (SubIdx && !tri_->getSubReg(PhysReg, SubIdx))
+ return true;
+ if (MI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
+ SubIdx = MI->getOperand(2).getImm();
+ if (O.isUse() && !tri_->getSubReg(PhysReg, SubIdx))
+ return true;
+ if (O.isDef()) {
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isPhysicalRegister(SrcReg)
+ ? tri_->getPhysicalRegisterRegClass(SrcReg)
+ : mri_->getRegClass(SrcReg);
+ if (!tri_->getMatchingSuperReg(PhysReg, SubIdx, RC))
+ return true;
+ }
+ }
+ if (MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+ MI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
+ SubIdx = MI->getOperand(3).getImm();
+ if (VirtReg == MI->getOperand(0).getReg()) {
+ if (!tri_->getSubReg(PhysReg, SubIdx))
+ return true;
+ } else {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isPhysicalRegister(DstReg)
+ ? tri_->getPhysicalRegisterRegClass(DstReg)
+ : mri_->getRegClass(DstReg);
+ if (!tri_->getMatchingSuperReg(PhysReg, SubIdx, RC))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+/// CanJoinExtractSubRegToPhysReg - Return true if it's possible to coalesce
+/// an extract_subreg where dst is a physical register, e.g.
+/// cl = EXTRACT_SUBREG reg1024, 1
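+/// (Hypothetically, joining the example requires RealDstReg = ECX, the
+/// matching super-register of CL at sub-index 1, and succeeds only if
+/// reg1024's interval overlaps neither ECX nor any of its sub-registers.)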
+bool
+SimpleRegisterCoalescing::CanJoinExtractSubRegToPhysReg(unsigned DstReg,
+ unsigned SrcReg, unsigned SubIdx,
+ unsigned &RealDstReg) {
+ const TargetRegisterClass *RC = mri_->getRegClass(SrcReg);
+ RealDstReg = tri_->getMatchingSuperReg(DstReg, SubIdx, RC);
+ assert(RealDstReg && "Invalid extract_subreg instruction!");
+
+ // For this type of EXTRACT_SUBREG, conservatively
+ // check if the live interval of the source register interferes with the
+ // actual super physical register we are trying to coalesce with.
+ LiveInterval &RHS = li_->getInterval(SrcReg);
+ if (li_->hasInterval(RealDstReg) &&
+ RHS.overlaps(li_->getInterval(RealDstReg))) {
+ DOUT << "Interfere with register ";
+ DEBUG(li_->getInterval(RealDstReg).print(DOUT, tri_));
+ return false; // Not coalescable
+ }
+ for (const unsigned* SR = tri_->getSubRegisters(RealDstReg); *SR; ++SR)
+ if (li_->hasInterval(*SR) && RHS.overlaps(li_->getInterval(*SR))) {
+ DOUT << "Interfere with sub-register ";
+ DEBUG(li_->getInterval(*SR).print(DOUT, tri_));
+ return false; // Not coalescable
+ }
+ return true;
+}
+
+/// CanJoinInsertSubRegToPhysReg - Return true if it's possible to coalesce
+/// an insert_subreg where src is a physical register, e.g.
+/// reg1024 = INSERT_SUBREG reg1024, c1, 0
+bool
+SimpleRegisterCoalescing::CanJoinInsertSubRegToPhysReg(unsigned DstReg,
+ unsigned SrcReg, unsigned SubIdx,
+ unsigned &RealSrcReg) {
+ const TargetRegisterClass *RC = mri_->getRegClass(DstReg);
+ RealSrcReg = tri_->getMatchingSuperReg(SrcReg, SubIdx, RC);
+ assert(RealSrcReg && "Invalid extract_subreg instruction!");
+
+ LiveInterval &RHS = li_->getInterval(DstReg);
+ if (li_->hasInterval(RealSrcReg) &&
+ RHS.overlaps(li_->getInterval(RealSrcReg))) {
+ DOUT << "Interfere with register ";
+ DEBUG(li_->getInterval(RealSrcReg).print(DOUT, tri_));
+ return false; // Not coalescable
+ }
+ for (const unsigned* SR = tri_->getSubRegisters(RealSrcReg); *SR; ++SR)
+ if (li_->hasInterval(*SR) && RHS.overlaps(li_->getInterval(*SR))) {
+ DOUT << "Interfere with sub-register ";
+ DEBUG(li_->getInterval(*SR).print(DOUT, tri_));
+ return false; // Not coalescable
+ }
+ return true;
+}
+
+/// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+/// which are the src/dst of the copy instruction CopyMI. This returns true
+/// if the copy was successfully coalesced away. If it is not currently
+/// possible to coalesce this interval, but it may be possible if other
+/// things get coalesced, then it returns true by reference in 'Again'.
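+/// For example (hypothetical): a cross-class join that looks unprofitable
+/// right now sets Again so the coalescer retries this copy on a later
+/// round over the work list.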
+bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
+ MachineInstr *CopyMI = TheCopy.MI;
+
+ Again = false;
+ if (JoinedCopies.count(CopyMI) || ReMatCopies.count(CopyMI))
+ return false; // Already done.
+
+ DOUT << li_->getInstructionIndex(CopyMI) << '\t' << *CopyMI;
+
+ unsigned SrcReg, DstReg, SrcSubIdx = 0, DstSubIdx = 0;
+ bool isExtSubReg = CopyMI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG;
+ bool isInsSubReg = CopyMI->getOpcode() == TargetInstrInfo::INSERT_SUBREG;
+ bool isSubRegToReg = CopyMI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG;
+ unsigned SubIdx = 0;
+ if (isExtSubReg) {
+ DstReg = CopyMI->getOperand(0).getReg();
+ DstSubIdx = CopyMI->getOperand(0).getSubReg();
+ SrcReg = CopyMI->getOperand(1).getReg();
+ SrcSubIdx = CopyMI->getOperand(2).getImm();
+ } else if (isInsSubReg || isSubRegToReg) {
+ if (CopyMI->getOperand(2).getSubReg()) {
+ DOUT << "\tSource of insert_subreg is already coalesced "
+ << "to another register.\n";
+ return false; // Not coalescable.
+ }
+ DstReg = CopyMI->getOperand(0).getReg();
+ DstSubIdx = CopyMI->getOperand(3).getImm();
+ SrcReg = CopyMI->getOperand(2).getReg();
+ } else if (!tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)){
+ assert(0 && "Unrecognized copy instruction!");
+ return false;
+ }
+
+ // If they are already joined, there is nothing more to do.
+ if (SrcReg == DstReg) {
+ DOUT << "\tCopy already coalesced.\n";
+ return false; // Not coalescable.
+ }
+
+ bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
+ bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+
+ // If they are both physical registers, we cannot join them.
+ if (SrcIsPhys && DstIsPhys) {
+ DOUT << "\tCan not coalesce physregs.\n";
+ return false; // Not coalescable.
+ }
+
+ // We only join virtual registers with allocatable physical registers.
+ if (SrcIsPhys && !allocatableRegs_[SrcReg]) {
+ DOUT << "\tSrc reg is unallocatable physreg.\n";
+ return false; // Not coalescable.
+ }
+ if (DstIsPhys && !allocatableRegs_[DstReg]) {
+ DOUT << "\tDst reg is unallocatable physreg.\n";
+ return false; // Not coalescable.
+ }
+
+ // Check that a physical source register is compatible with dst regclass
+ if (SrcIsPhys) {
+ unsigned SrcSubReg = SrcSubIdx ?
+ tri_->getSubReg(SrcReg, SrcSubIdx) : SrcReg;
+ const TargetRegisterClass *DstRC = mri_->getRegClass(DstReg);
+ const TargetRegisterClass *DstSubRC = DstRC;
+ if (DstSubIdx)
+ DstSubRC = DstRC->getSubRegisterRegClass(DstSubIdx);
+ assert(DstSubRC && "Illegal subregister index");
+ if (!DstSubRC->contains(SrcSubReg)) {
+ DOUT << "\tIncompatible destination regclass: "
+ << tri_->getName(SrcSubReg) << " not in " << DstSubRC->getName()
+ << ".\n";
+ return false; // Not coalescable.
+ }
+ }
+
+ // Check that a physical dst register is compatible with source regclass
+ if (DstIsPhys) {
+ unsigned DstSubReg = DstSubIdx ?
+ tri_->getSubReg(DstReg, DstSubIdx) : DstReg;
+ const TargetRegisterClass *SrcRC = mri_->getRegClass(SrcReg);
+ const TargetRegisterClass *SrcSubRC = SrcRC;
+ if (SrcSubIdx)
+ SrcSubRC = SrcRC->getSubRegisterRegClass(SrcSubIdx);
+ assert(SrcSubRC && "Illegal subregister index");
+ if (!SrcSubRC->contains(DstSubReg)) {
+ DOUT << "\tIncompatible source regclass: "
+ << tri_->getName(DstSubReg) << " not in " << SrcSubRC->getName()
+ << ".\n";
+ return false; // Not coalescable.
+ }
+ }
+
+ bool CrossRC = false;
+ // Should be non-null only when coalescing to a sub-register class.
+ const TargetRegisterClass *NewRC = NULL;
+ MachineBasicBlock *CopyMBB = CopyMI->getParent();
+ unsigned RealDstReg = 0;
+ unsigned RealSrcReg = 0;
+ if (isExtSubReg || isInsSubReg || isSubRegToReg) {
+ SubIdx = CopyMI->getOperand(isExtSubReg ? 2 : 3).getImm();
+ if (SrcIsPhys && isExtSubReg) {
+ // r1024 = EXTRACT_SUBREG EAX, 0 then r1024 is really going to be
+ // coalesced with AX.
+ unsigned DstSubIdx = CopyMI->getOperand(0).getSubReg();
+ if (DstSubIdx) {
+ // r1024<2> = EXTRACT_SUBREG EAX, 2. Then r1024 has already been
+ // coalesced to a larger register so the subreg indices cancel out.
+ if (DstSubIdx != SubIdx) {
+ DOUT << "\t Sub-register indices mismatch.\n";
+ return false; // Not coalescable.
+ }
+ } else
+ SrcReg = tri_->getSubReg(SrcReg, SubIdx);
+ SubIdx = 0;
+ } else if (DstIsPhys && (isInsSubReg || isSubRegToReg)) {
+ // EAX = INSERT_SUBREG EAX, r1024, 0
+ unsigned SrcSubIdx = CopyMI->getOperand(2).getSubReg();
+ if (SrcSubIdx) {
+      // EAX = INSERT_SUBREG EAX, r1024<2>, 2. Then r1024 has already been
+ // coalesced to a larger register so the subreg indices cancel out.
+ if (SrcSubIdx != SubIdx) {
+ DOUT << "\t Sub-register indices mismatch.\n";
+ return false; // Not coalescable.
+ }
+ } else
+ DstReg = tri_->getSubReg(DstReg, SubIdx);
+ SubIdx = 0;
+ } else if ((DstIsPhys && isExtSubReg) ||
+ (SrcIsPhys && (isInsSubReg || isSubRegToReg))) {
+ if (!isSubRegToReg && CopyMI->getOperand(1).getSubReg()) {
+ DOUT << "\tSrc of extract_subreg already coalesced with reg"
+ << " of a super-class.\n";
+ return false; // Not coalescable.
+ }
+
+ if (isExtSubReg) {
+ if (!CanJoinExtractSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealDstReg))
+ return false; // Not coalescable
+ } else {
+ if (!CanJoinInsertSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealSrcReg))
+ return false; // Not coalescable
+ }
+ SubIdx = 0;
+ } else {
+ unsigned OldSubIdx = isExtSubReg ? CopyMI->getOperand(0).getSubReg()
+ : CopyMI->getOperand(2).getSubReg();
+ if (OldSubIdx) {
+ if (OldSubIdx == SubIdx && !differingRegisterClasses(SrcReg, DstReg))
+ // r1024<2> = EXTRACT_SUBREG r1025, 2. Then r1024 has already been
+ // coalesced to a larger register so the subreg indices cancel out.
+ // Also check if the other larger register is of the same register
+ // class as the would be resulting register.
+ SubIdx = 0;
+ else {
+ DOUT << "\t Sub-register indices mismatch.\n";
+ return false; // Not coalescable.
+ }
+ }
+ if (SubIdx) {
+ unsigned LargeReg = isExtSubReg ? SrcReg : DstReg;
+ unsigned SmallReg = isExtSubReg ? DstReg : SrcReg;
+ unsigned Limit= allocatableRCRegs_[mri_->getRegClass(SmallReg)].count();
+ if (!isWinToJoinCrossClass(LargeReg, SmallReg, Limit)) {
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+ }
+ }
+ } else if (differingRegisterClasses(SrcReg, DstReg)) {
+ if (!CrossClassJoin)
+ return false;
+ CrossRC = true;
+
+    // FIXME: What if the result of an EXTRACT_SUBREG is then coalesced
+    // with another? If it's the resulting destination register, then
+    // the subidx must be propagated to uses (but only those defined
+    // by the EXTRACT_SUBREG). If it's being coalesced into another
+    // register, it should be safe because the register is assumed to
+    // have the register class of the super-register.
+
+    // Process moves where one of the registers has a sub-register index.
+ MachineOperand *DstMO = CopyMI->findRegisterDefOperand(DstReg);
+ MachineOperand *SrcMO = CopyMI->findRegisterUseOperand(SrcReg);
+ SubIdx = DstMO->getSubReg();
+ if (SubIdx) {
+ if (SrcMO->getSubReg())
+ // FIXME: can we handle this?
+ return false;
+ // This is not an insert_subreg but it looks like one.
+ // e.g. %reg1024:4 = MOV32rr %EAX
+ isInsSubReg = true;
+ if (SrcIsPhys) {
+ if (!CanJoinInsertSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealSrcReg))
+ return false; // Not coalescable
+ SubIdx = 0;
+ }
+ } else {
+ SubIdx = SrcMO->getSubReg();
+ if (SubIdx) {
+        // This is not an extract_subreg but it looks like one.
+ // e.g. %cl = MOV16rr %reg1024:1
+ isExtSubReg = true;
+ if (DstIsPhys) {
+ if (!CanJoinExtractSubRegToPhysReg(DstReg, SrcReg, SubIdx,RealDstReg))
+ return false; // Not coalescable
+ SubIdx = 0;
+ }
+ }
+ }
+
+ const TargetRegisterClass *SrcRC= SrcIsPhys ? 0 : mri_->getRegClass(SrcReg);
+ const TargetRegisterClass *DstRC= DstIsPhys ? 0 : mri_->getRegClass(DstReg);
+ unsigned LargeReg = SrcReg;
+ unsigned SmallReg = DstReg;
+ unsigned Limit = 0;
+
+ // Now determine the register class of the joined register.
+ if (isExtSubReg) {
+ if (SubIdx && DstRC && DstRC->isASubClass()) {
+ // This is a move to a sub-register class. However, the source is a
+        // sub-register of a larger register class. We don't know what the
+        // register class should be. FIXME.
+ Again = true;
+ return false;
+ }
+ Limit = allocatableRCRegs_[DstRC].count();
+ } else if (!SrcIsPhys && !DstIsPhys) {
+ NewRC = getCommonSubClass(SrcRC, DstRC);
+ if (!NewRC) {
+ DOUT << "\tDisjoint regclasses: "
+ << SrcRC->getName() << ", "
+ << DstRC->getName() << ".\n";
+ return false; // Not coalescable.
+ }
+ if (DstRC->getSize() > SrcRC->getSize())
+ std::swap(LargeReg, SmallReg);
+ }
+
+ // If we are joining two virtual registers and the resulting register
+    // class is more restrictive (fewer registers, smaller size), check if it's
+    // worth doing the merge.
+ if (!SrcIsPhys && !DstIsPhys &&
+ (isExtSubReg || DstRC->isASubClass()) &&
+ !isWinToJoinCrossClass(LargeReg, SmallReg,
+ allocatableRCRegs_[NewRC].count())) {
+ DOUT << "\tSrc/Dest are different register classes.\n";
+ // Allow the coalescer to try again in case either side gets coalesced to
+ // a physical register that's compatible with the other side. e.g.
+ // r1024 = MOV32to32_ r1025
+ // But later r1024 is assigned EAX then r1025 may be coalesced with EAX.
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+ }
+
+ // Will it create illegal extract_subreg / insert_subreg?
+ if (SrcIsPhys && HasIncompatibleSubRegDefUse(CopyMI, DstReg, SrcReg))
+ return false;
+ if (DstIsPhys && HasIncompatibleSubRegDefUse(CopyMI, SrcReg, DstReg))
+ return false;
+
+ LiveInterval &SrcInt = li_->getInterval(SrcReg);
+ LiveInterval &DstInt = li_->getInterval(DstReg);
+ assert(SrcInt.reg == SrcReg && DstInt.reg == DstReg &&
+ "Register mapping is horribly broken!");
+
+ DOUT << "\t\tInspecting "; SrcInt.print(DOUT, tri_);
+ DOUT << " and "; DstInt.print(DOUT, tri_);
+ DOUT << ": ";
+
+ // Save a copy of the virtual register live interval. We'll manually
+  // merge this into the "real" physical register live interval it is
+ // coalesced with.
+ LiveInterval *SavedLI = 0;
+ if (RealDstReg)
+ SavedLI = li_->dupInterval(&SrcInt);
+ else if (RealSrcReg)
+ SavedLI = li_->dupInterval(&DstInt);
+
+ // Check if it is necessary to propagate "isDead" property.
+ if (!isExtSubReg && !isInsSubReg && !isSubRegToReg) {
+ MachineOperand *mopd = CopyMI->findRegisterDefOperand(DstReg, false);
+ bool isDead = mopd->isDead();
+
+ // We need to be careful about coalescing a source physical register with a
+ // virtual register. Once the coalescing is done, it cannot be broken and
+ // these are not spillable! If the destination interval uses are far away,
+ // think twice about coalescing them!
+ if (!isDead && (SrcIsPhys || DstIsPhys)) {
+ // If the copy is in a loop, take care not to coalesce aggressively if the
+ // src is coming in from outside the loop (or the dst is out of the loop).
+      // If it's not in a loop, then determine whether to join them based
+      // purely on the length of the interval.
+ if (PhysJoinTweak) {
+ if (SrcIsPhys) {
+ if (!isWinToJoinVRWithSrcPhysReg(CopyMI, CopyMBB, DstInt, SrcInt)) {
+ DstInt.preference = SrcReg;
+ ++numAborts;
+ DOUT << "\tMay tie down a physical register, abort!\n";
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+ } else {
+ if (!isWinToJoinVRWithDstPhysReg(CopyMI, CopyMBB, DstInt, SrcInt)) {
+ SrcInt.preference = DstReg;
+ ++numAborts;
+ DOUT << "\tMay tie down a physical register, abort!\n";
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+ }
+ } else {
+        // If the virtual register live interval is long but it has low use
+        // density, do not join them; instead mark the physical register as its
+        // allocation preference.
+ LiveInterval &JoinVInt = SrcIsPhys ? DstInt : SrcInt;
+ unsigned JoinVReg = SrcIsPhys ? DstReg : SrcReg;
+ unsigned JoinPReg = SrcIsPhys ? SrcReg : DstReg;
+ const TargetRegisterClass *RC = mri_->getRegClass(JoinVReg);
+ unsigned Threshold = allocatableRCRegs_[RC].count() * 2;
+ if (TheCopy.isBackEdge)
+ Threshold *= 2; // Favors back edge copies.
+
+ unsigned Length = li_->getApproximateInstructionCount(JoinVInt);
+ float Ratio = 1.0 / Threshold;
+ if (Length > Threshold &&
+ (((float)std::distance(mri_->use_begin(JoinVReg),
+ mri_->use_end()) / Length) < Ratio)) {
+ JoinVInt.preference = JoinPReg;
+ ++numAborts;
+ DOUT << "\tMay tie down a physical register, abort!\n";
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+ }
+ }
+ }
+
+ // Okay, attempt to join these two intervals. On failure, this returns false.
+ // Otherwise, if one of the intervals being joined is a physreg, this method
+ // always canonicalizes DstInt to be it. The output "SrcInt" will not have
+ // been modified, so we can use this information below to update aliases.
+ bool Swapped = false;
+ // If SrcInt is implicitly defined, it's safe to coalesce.
+ bool isEmpty = SrcInt.empty();
+ if (isEmpty && !CanCoalesceWithImpDef(CopyMI, DstInt, SrcInt)) {
+ // Only coalesce an empty interval (defined by implicit_def) with
+ // another interval which has a valno defined by the CopyMI and the CopyMI
+ // is a kill of the implicit def.
+ DOUT << "Not profitable!\n";
+ return false;
+ }
+
+ if (!isEmpty && !JoinIntervals(DstInt, SrcInt, Swapped)) {
+ // Coalescing failed.
+
+ // If definition of source is defined by trivial computation, try
+ // rematerializing it.
+ if (!isExtSubReg && !isInsSubReg && !isSubRegToReg &&
+ ReMaterializeTrivialDef(SrcInt, DstInt.reg, CopyMI))
+ return true;
+
+ // If we can eliminate the copy without merging the live ranges, do so now.
+ if (!isExtSubReg && !isInsSubReg && !isSubRegToReg &&
+ (AdjustCopiesBackFrom(SrcInt, DstInt, CopyMI) ||
+ RemoveCopyByCommutingDef(SrcInt, DstInt, CopyMI))) {
+ JoinedCopies.insert(CopyMI);
+ return true;
+ }
+
+ // Otherwise, we are unable to join the intervals.
+ DOUT << "Interference!\n";
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+
+ LiveInterval *ResSrcInt = &SrcInt;
+ LiveInterval *ResDstInt = &DstInt;
+ if (Swapped) {
+ std::swap(SrcReg, DstReg);
+ std::swap(ResSrcInt, ResDstInt);
+ }
+ assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+ "LiveInterval::join didn't work right!");
+
+ // If we're about to merge live ranges into a physical register live interval,
+ // we have to update any aliased register's live ranges to indicate that they
+ // have clobbered values for this range.
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+    // If this is an extract_subreg where dst is a physical register, e.g.
+ // cl = EXTRACT_SUBREG reg1024, 1
+ // then create and update the actual physical register allocated to RHS.
+ if (RealDstReg || RealSrcReg) {
+ LiveInterval &RealInt =
+ li_->getOrCreateInterval(RealDstReg ? RealDstReg : RealSrcReg);
+ for (LiveInterval::const_vni_iterator I = SavedLI->vni_begin(),
+ E = SavedLI->vni_end(); I != E; ++I) {
+ const VNInfo *ValNo = *I;
+ VNInfo *NewValNo = RealInt.getNextValue(ValNo->def, ValNo->copy,
+ li_->getVNInfoAllocator());
+ NewValNo->hasPHIKill = ValNo->hasPHIKill;
+ NewValNo->redefByEC = ValNo->redefByEC;
+ RealInt.addKills(NewValNo, ValNo->kills);
+ RealInt.MergeValueInAsValue(*SavedLI, ValNo, NewValNo);
+ }
+ RealInt.weight += SavedLI->weight;
+ DstReg = RealDstReg ? RealDstReg : RealSrcReg;
+ }
+
+ // Update the liveintervals of sub-registers.
+ for (const unsigned *AS = tri_->getSubRegisters(DstReg); *AS; ++AS)
+ li_->getOrCreateInterval(*AS).MergeInClobberRanges(*ResSrcInt,
+ li_->getVNInfoAllocator());
+ }
+
+  // If this is an EXTRACT_SUBREG, make sure the result of coalescing is the
+ // larger super-register.
+ if ((isExtSubReg || isInsSubReg || isSubRegToReg) &&
+ !SrcIsPhys && !DstIsPhys) {
+ if ((isExtSubReg && !Swapped) ||
+ ((isInsSubReg || isSubRegToReg) && Swapped)) {
+ ResSrcInt->Copy(*ResDstInt, li_->getVNInfoAllocator());
+ std::swap(SrcReg, DstReg);
+ std::swap(ResSrcInt, ResDstInt);
+ }
+ }
+
+ // Coalescing to a virtual register that is of a sub-register class of the
+ // other. Make sure the resulting register is set to the right register class.
+ if (CrossRC) {
+ ++numCrossRCs;
+ if (NewRC)
+ mri_->setRegClass(DstReg, NewRC);
+ }
+
+ if (NewHeuristic) {
+ // Add all copies that define val# in the source interval into the queue.
+ for (LiveInterval::const_vni_iterator i = ResSrcInt->vni_begin(),
+ e = ResSrcInt->vni_end(); i != e; ++i) {
+ const VNInfo *vni = *i;
+ if (!vni->def || vni->def == ~1U || vni->def == ~0U)
+ continue;
+ MachineInstr *CopyMI = li_->getInstructionFromIndex(vni->def);
+ unsigned NewSrcReg, NewDstReg, NewSrcSubIdx, NewDstSubIdx;
+ if (CopyMI &&
+ JoinedCopies.count(CopyMI) == 0 &&
+ tii_->isMoveInstr(*CopyMI, NewSrcReg, NewDstReg,
+ NewSrcSubIdx, NewDstSubIdx)) {
+ unsigned LoopDepth = loopInfo->getLoopDepth(CopyMBB);
+ JoinQueue->push(CopyRec(CopyMI, LoopDepth,
+ isBackEdgeCopy(CopyMI, DstReg)));
+ }
+ }
+ }
+
+ // Remember to delete the copy instruction.
+ JoinedCopies.insert(CopyMI);
+
+  // Some live range has been lengthened due to coalescing, eliminate the
+ // unnecessary kills.
+ RemoveUnnecessaryKills(SrcReg, *ResDstInt);
+ if (TargetRegisterInfo::isVirtualRegister(DstReg))
+ RemoveUnnecessaryKills(DstReg, *ResDstInt);
+
+ if (isInsSubReg)
+ // Avoid:
+ // r1024 = op
+ // r1024 = implicit_def
+ // ...
+ // = r1024
+ RemoveDeadImpDef(DstReg, *ResDstInt);
+ UpdateRegDefsUses(SrcReg, DstReg, SubIdx);
+
+  // SrcReg is guaranteed to be the register whose live interval is being
+  // merged.
+ li_->removeInterval(SrcReg);
+
+  // Manually delete the live interval copy.
+ if (SavedLI) {
+ SavedLI->clear();
+ delete SavedLI;
+ }
+
+ if (isEmpty) {
+    // Now that the copy is being coalesced away, the val# previously defined
+ // by the copy is being defined by an IMPLICIT_DEF which defines a zero
+ // length interval. Remove the val#.
+ unsigned CopyIdx = li_->getDefIndex(li_->getInstructionIndex(CopyMI));
+ const LiveRange *LR = ResDstInt->getLiveRangeContaining(CopyIdx);
+ VNInfo *ImpVal = LR->valno;
+ assert(ImpVal->def == CopyIdx);
+ unsigned NextDef = LR->end;
+ RemoveCopiesFromValNo(*ResDstInt, ImpVal);
+ ResDstInt->removeValNo(ImpVal);
+ LR = ResDstInt->FindLiveRangeContaining(NextDef);
+ if (LR != ResDstInt->end() && LR->valno->def == NextDef) {
+ // Special case: vr1024 = implicit_def
+ // vr1024 = insert_subreg vr1024, vr1025, c
+ // The insert_subreg becomes a "copy" that defines a val# which can itself
+ // be coalesced away.
+ MachineInstr *DefMI = li_->getInstructionFromIndex(NextDef);
+ if (DefMI->getOpcode() == TargetInstrInfo::INSERT_SUBREG)
+ LR->valno->copy = DefMI;
+ }
+ }
+
+  // If the resulting interval has a preference that no longer fits because of
+  // subreg coalescing, just clear the preference.
+ if (ResDstInt->preference && (isExtSubReg || isInsSubReg || isSubRegToReg) &&
+ TargetRegisterInfo::isVirtualRegister(ResDstInt->reg)) {
+ const TargetRegisterClass *RC = mri_->getRegClass(ResDstInt->reg);
+ if (!RC->contains(ResDstInt->preference))
+ ResDstInt->preference = 0;
+ }
+
+ DOUT << "\n\t\tJoined. Result = "; ResDstInt->print(DOUT, tri_);
+ DOUT << "\n";
+
+ ++numJoins;
+ return true;
+}
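+
+// A worked sketch of the JoinCopy flow above (hypothetical vregs): given
+//   %reg1025 = MOV32rr %reg1024
+// with non-interfering intervals, JoinIntervals merges the two live
+// intervals, the MOV is recorded in JoinedCopies for later deletion,
+// UpdateRegDefsUses rewrites operands of the register being merged away, and
+// its interval is removed from LiveIntervals.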
+
+/// ComputeUltimateVN - Assuming we are going to join two live intervals,
+/// compute what the resultant value numbers for each value in the two input
+/// ranges will be. This is complicated by copies between the two which can
+/// and will commonly cause multiple value numbers to be merged into one.
+///
+/// VNI is the value number that we're trying to resolve. NewVNInfo keeps
+/// track of the new value numbers assigned to the result LiveInterval.
+/// ThisFromOther/OtherFromThis are maps that keep track of
+/// whether a value in this or other is a copy from the opposite set.
+/// ThisValNoAssignments/OtherValNoAssignments keep track of value #'s that have
+/// already been assigned.
+///
+/// ThisFromOther[x] - If x is defined as a copy from the other interval, this
+/// contains the value number the copy is from.
+///
+static unsigned ComputeUltimateVN(VNInfo *VNI,
+ SmallVector<VNInfo*, 16> &NewVNInfo,
+ DenseMap<VNInfo*, VNInfo*> &ThisFromOther,
+ DenseMap<VNInfo*, VNInfo*> &OtherFromThis,
+ SmallVector<int, 16> &ThisValNoAssignments,
+ SmallVector<int, 16> &OtherValNoAssignments) {
+ unsigned VN = VNI->id;
+
+ // If the VN has already been computed, just return it.
+ if (ThisValNoAssignments[VN] >= 0)
+ return ThisValNoAssignments[VN];
+// assert(ThisValNoAssignments[VN] != -2 && "Cyclic case?");
+
+ // If this val is not a copy from the other val, then it must be a new value
+ // number in the destination.
+ DenseMap<VNInfo*, VNInfo*>::iterator I = ThisFromOther.find(VNI);
+ if (I == ThisFromOther.end()) {
+ NewVNInfo.push_back(VNI);
+ return ThisValNoAssignments[VN] = NewVNInfo.size()-1;
+ }
+ VNInfo *OtherValNo = I->second;
+
+ // Otherwise, this *is* a copy from the RHS. If the other side has already
+ // been computed, return it.
+ if (OtherValNoAssignments[OtherValNo->id] >= 0)
+ return ThisValNoAssignments[VN] = OtherValNoAssignments[OtherValNo->id];
+
+ // Mark this value number as currently being computed, then ask what the
+ // ultimate value # of the other value is.
+ ThisValNoAssignments[VN] = -2;
+ unsigned UltimateVN =
+ ComputeUltimateVN(OtherValNo, NewVNInfo, OtherFromThis, ThisFromOther,
+ OtherValNoAssignments, ThisValNoAssignments);
+ return ThisValNoAssignments[VN] = UltimateVN;
+}
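+
+// A minimal trace of ComputeUltimateVN (hypothetical val#s): suppose LHS
+// val#0 is defined as a copy from RHS val#0, so ThisFromOther = {L0 -> R0}
+// and OtherFromThis is empty. Resolving L0 first marks
+// ThisValNoAssignments[0] = -2, recurses on R0 (not a copy, so it becomes
+// new value number 0), and finally maps both L0 and R0 to ultimate value
+// number 0.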
+
+static bool InVector(VNInfo *Val, const SmallVector<VNInfo*, 8> &V) {
+ return std::find(V.begin(), V.end(), Val) != V.end();
+}
+
+/// RangeIsDefinedByCopyFromReg - Return true if the specified live range of
+/// the specified live interval is defined by a copy from the specified
+/// register.
+bool SimpleRegisterCoalescing::RangeIsDefinedByCopyFromReg(LiveInterval &li,
+ LiveRange *LR,
+ unsigned Reg) {
+ unsigned SrcReg = li_->getVNInfoSourceReg(LR->valno);
+ if (SrcReg == Reg)
+ return true;
+ if (LR->valno->def == ~0U &&
+ TargetRegisterInfo::isPhysicalRegister(li.reg) &&
+ *tri_->getSuperRegisters(li.reg)) {
+ // It's a sub-register live interval, we may not have precise information.
+ // Re-compute it.
+ MachineInstr *DefMI = li_->getInstructionFromIndex(LR->start);
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (DefMI &&
+ tii_->isMoveInstr(*DefMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) &&
+ DstReg == li.reg && SrcReg == Reg) {
+ // Cache computed info.
+ LR->valno->def = LR->start;
+ LR->valno->copy = DefMI;
+ return true;
+ }
+ }
+ return false;
+}
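+
+// For example (hypothetical vregs): if LR->valno is defined by
+//   %reg1025 = MOV32rr %reg1024
+// then a query with Reg == %reg1024 returns true via the getVNInfoSourceReg
+// check; the re-computation path above only runs for imprecise physical
+// sub-register intervals.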
+
+/// SimpleJoin - Attempt to join the specified interval into this one. The
+/// caller of this method must guarantee that the RHS only contains a single
+/// value number and that the RHS is not defined by a copy from this
+/// interval. This returns false if the intervals are not joinable, or it
+/// joins them and returns true.
+bool SimpleRegisterCoalescing::SimpleJoin(LiveInterval &LHS, LiveInterval &RHS){
+ assert(RHS.containsOneValue());
+
+  // Some number (potentially more than one) of the value numbers in the
+  // current interval may be defined as copies from the RHS. Scan the
+  // overlapping portions of the LHS and RHS, keeping track of this and
+  // looking for overlapping live ranges that are NOT defined as copies. If
+  // these exist, we cannot coalesce.
+
+ LiveInterval::iterator LHSIt = LHS.begin(), LHSEnd = LHS.end();
+ LiveInterval::iterator RHSIt = RHS.begin(), RHSEnd = RHS.end();
+
+ if (LHSIt->start < RHSIt->start) {
+ LHSIt = std::upper_bound(LHSIt, LHSEnd, RHSIt->start);
+ if (LHSIt != LHS.begin()) --LHSIt;
+ } else if (RHSIt->start < LHSIt->start) {
+ RHSIt = std::upper_bound(RHSIt, RHSEnd, LHSIt->start);
+ if (RHSIt != RHS.begin()) --RHSIt;
+ }
+
+ SmallVector<VNInfo*, 8> EliminatedLHSVals;
+
+ while (1) {
+ // Determine if these live intervals overlap.
+ bool Overlaps = false;
+ if (LHSIt->start <= RHSIt->start)
+ Overlaps = LHSIt->end > RHSIt->start;
+ else
+ Overlaps = RHSIt->end > LHSIt->start;
+
+ // If the live intervals overlap, there are two interesting cases: if the
+ // LHS interval is defined by a copy from the RHS, it's ok and we record
+ // that the LHS value # is the same as the RHS. If it's not, then we cannot
+ // coalesce these live ranges and we bail out.
+ if (Overlaps) {
+ // If we haven't already recorded that this value # is safe, check it.
+ if (!InVector(LHSIt->valno, EliminatedLHSVals)) {
+ // Copy from the RHS?
+ if (!RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg))
+ return false; // Nope, bail out.
+
+ if (LHSIt->contains(RHSIt->valno->def))
+ // Here is an interesting situation:
+ // BB1:
+ // vr1025 = copy vr1024
+ // ..
+ // BB2:
+ // vr1024 = op
+ // = vr1025
+ // Even though vr1025 is copied from vr1024, it's not safe to
+ // coalesce them since the live range of vr1025 intersects the
+ // def of vr1024. This happens because vr1025 is assigned the
+ // value of the previous iteration of vr1024.
+ return false;
+ EliminatedLHSVals.push_back(LHSIt->valno);
+ }
+
+ // We know this entire LHS live range is okay, so skip it now.
+ if (++LHSIt == LHSEnd) break;
+ continue;
+ }
+
+ if (LHSIt->end < RHSIt->end) {
+ if (++LHSIt == LHSEnd) break;
+ } else {
+ // One interesting case to check here. It's possible that we have
+ // something like "X3 = Y" which defines a new value number in the LHS,
+      // and is the last use of this live range of the RHS. In this case, we
+ // want to notice this copy (so that it gets coalesced away) even though
+ // the live ranges don't actually overlap.
+ if (LHSIt->start == RHSIt->end) {
+ if (InVector(LHSIt->valno, EliminatedLHSVals)) {
+ // We already know that this value number is going to be merged in
+ // if coalescing succeeds. Just skip the liverange.
+ if (++LHSIt == LHSEnd) break;
+ } else {
+ // Otherwise, if this is a copy from the RHS, mark it as being merged
+ // in.
+ if (RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg)) {
+ if (LHSIt->contains(RHSIt->valno->def))
+ // Here is an interesting situation:
+ // BB1:
+ // vr1025 = copy vr1024
+ // ..
+ // BB2:
+ // vr1024 = op
+ // = vr1025
+ // Even though vr1025 is copied from vr1024, it's not safe to
+                // coalesce them since the live range of vr1025 intersects the
+ // def of vr1024. This happens because vr1025 is assigned the
+ // value of the previous iteration of vr1024.
+ return false;
+ EliminatedLHSVals.push_back(LHSIt->valno);
+
+ // We know this entire LHS live range is okay, so skip it now.
+ if (++LHSIt == LHSEnd) break;
+ }
+ }
+ }
+
+ if (++RHSIt == RHSEnd) break;
+ }
+ }
+
+ // If we got here, we know that the coalescing will be successful and that
+ // the value numbers in EliminatedLHSVals will all be merged together. Since
+ // the most common case is that EliminatedLHSVals has a single number, we
+ // optimize for it: if there is more than one value, we merge them all into
+ // the lowest numbered one, then handle the interval as if we were merging
+ // with one value number.
+ VNInfo *LHSValNo = NULL;
+ if (EliminatedLHSVals.size() > 1) {
+ // Loop through all the equal value numbers merging them into the smallest
+ // one.
+ VNInfo *Smallest = EliminatedLHSVals[0];
+ for (unsigned i = 1, e = EliminatedLHSVals.size(); i != e; ++i) {
+ if (EliminatedLHSVals[i]->id < Smallest->id) {
+ // Merge the current notion of the smallest into the smaller one.
+ LHS.MergeValueNumberInto(Smallest, EliminatedLHSVals[i]);
+ Smallest = EliminatedLHSVals[i];
+ } else {
+ // Merge into the smallest.
+ LHS.MergeValueNumberInto(EliminatedLHSVals[i], Smallest);
+ }
+ }
+ LHSValNo = Smallest;
+ } else if (EliminatedLHSVals.empty()) {
+ if (TargetRegisterInfo::isPhysicalRegister(LHS.reg) &&
+ *tri_->getSuperRegisters(LHS.reg))
+ // Imprecise sub-register information. Can't handle it.
+ return false;
+ assert(0 && "No copies from the RHS?");
+ } else {
+ LHSValNo = EliminatedLHSVals[0];
+ }
+
+ // Okay, now that there is a single LHS value number that we're merging the
+ // RHS into, update the value number info for the LHS to indicate that the
+ // value number is defined where the RHS value number was.
+ const VNInfo *VNI = RHS.getValNumInfo(0);
+ LHSValNo->def = VNI->def;
+ LHSValNo->copy = VNI->copy;
+
+ // Okay, the final step is to loop over the RHS live intervals, adding them to
+ // the LHS.
+ LHSValNo->hasPHIKill |= VNI->hasPHIKill;
+ LHS.addKills(LHSValNo, VNI->kills);
+ LHS.MergeRangesInAsValue(RHS, LHSValNo);
+ LHS.weight += RHS.weight;
+ if (RHS.preference && !LHS.preference)
+ LHS.preference = RHS.preference;
+
+ // Update the liveintervals of sub-registers.
+ if (TargetRegisterInfo::isPhysicalRegister(LHS.reg))
+ for (const unsigned *AS = tri_->getSubRegisters(LHS.reg); *AS; ++AS)
+ li_->getOrCreateInterval(*AS).MergeInClobberRanges(LHS,
+ li_->getVNInfoAllocator());
+
+ return true;
+}
+
+/// JoinIntervals - Attempt to join these two intervals. On failure, this
+/// returns false. Otherwise, if one of the intervals being joined is a
+/// physreg, this method always canonicalizes LHS to be it. The output
+/// "RHS" will not have been modified, so we can use this information
+/// below to update aliases.
+bool
+SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS,
+ bool &Swapped) {
+ // Compute the final value assignment, assuming that the live ranges can be
+ // coalesced.
+ SmallVector<int, 16> LHSValNoAssignments;
+ SmallVector<int, 16> RHSValNoAssignments;
+ DenseMap<VNInfo*, VNInfo*> LHSValsDefinedFromRHS;
+ DenseMap<VNInfo*, VNInfo*> RHSValsDefinedFromLHS;
+ SmallVector<VNInfo*, 16> NewVNInfo;
+
+ // If a live interval is a physical register, conservatively check if any
+ // of its sub-registers is overlapping the live interval of the virtual
+ // register. If so, do not coalesce.
+ if (TargetRegisterInfo::isPhysicalRegister(LHS.reg) &&
+ *tri_->getSubRegisters(LHS.reg)) {
+ // If it's coalescing a virtual register to a physical register, estimate
+ // its live interval length. This is the *cost* of scanning an entire live
+ // interval. If the cost is low, we'll do an exhaustive check instead.
+
+ // If this is something like this:
+ // BB1:
+ // v1024 = op
+ // ...
+ // BB2:
+ // ...
+ // RAX = v1024
+ //
+    // That is, the live interval of v1024 crosses a bb. Then we can't rely on
+    // the less conservative check. It's possible a sub-register is defined
+    // before v1024 (or live in) and live out of BB1.
+ if (RHS.containsOneValue() &&
+ li_->intervalIsInOneMBB(RHS) &&
+ li_->getApproximateInstructionCount(RHS) <= 10) {
+ // Perform a more exhaustive check for some common cases.
+ if (li_->conflictsWithPhysRegRef(RHS, LHS.reg, true, JoinedCopies))
+ return false;
+ } else {
+ for (const unsigned* SR = tri_->getSubRegisters(LHS.reg); *SR; ++SR)
+ if (li_->hasInterval(*SR) && RHS.overlaps(li_->getInterval(*SR))) {
+ DOUT << "Interfere with sub-register ";
+ DEBUG(li_->getInterval(*SR).print(DOUT, tri_));
+ return false;
+ }
+ }
+ } else if (TargetRegisterInfo::isPhysicalRegister(RHS.reg) &&
+ *tri_->getSubRegisters(RHS.reg)) {
+ if (LHS.containsOneValue() &&
+ li_->getApproximateInstructionCount(LHS) <= 10) {
+ // Perform a more exhaustive check for some common cases.
+ if (li_->conflictsWithPhysRegRef(LHS, RHS.reg, false, JoinedCopies))
+ return false;
+ } else {
+ for (const unsigned* SR = tri_->getSubRegisters(RHS.reg); *SR; ++SR)
+ if (li_->hasInterval(*SR) && LHS.overlaps(li_->getInterval(*SR))) {
+ DOUT << "Interfere with sub-register ";
+ DEBUG(li_->getInterval(*SR).print(DOUT, tri_));
+ return false;
+ }
+ }
+ }
+
+ // Compute ultimate value numbers for the LHS and RHS values.
+ if (RHS.containsOneValue()) {
+    // Copies from a live interval with a single value are simple to handle and
+ // very common, handle the special case here. This is important, because
+ // often RHS is small and LHS is large (e.g. a physreg).
+
+ // Find out if the RHS is defined as a copy from some value in the LHS.
+ int RHSVal0DefinedFromLHS = -1;
+ int RHSValID = -1;
+ VNInfo *RHSValNoInfo = NULL;
+ VNInfo *RHSValNoInfo0 = RHS.getValNumInfo(0);
+ unsigned RHSSrcReg = li_->getVNInfoSourceReg(RHSValNoInfo0);
+ if (RHSSrcReg == 0 || RHSSrcReg != LHS.reg) {
+ // If RHS is not defined as a copy from the LHS, we can use simpler and
+ // faster checks to see if the live ranges are coalescable. This joiner
+ // can't swap the LHS/RHS intervals though.
+ if (!TargetRegisterInfo::isPhysicalRegister(RHS.reg)) {
+ return SimpleJoin(LHS, RHS);
+ } else {
+ RHSValNoInfo = RHSValNoInfo0;
+ }
+ } else {
+ // It was defined as a copy from the LHS, find out what value # it is.
+ RHSValNoInfo = LHS.getLiveRangeContaining(RHSValNoInfo0->def-1)->valno;
+ RHSValID = RHSValNoInfo->id;
+ RHSVal0DefinedFromLHS = RHSValID;
+ }
+
+ LHSValNoAssignments.resize(LHS.getNumValNums(), -1);
+ RHSValNoAssignments.resize(RHS.getNumValNums(), -1);
+ NewVNInfo.resize(LHS.getNumValNums(), NULL);
+
+ // Okay, *all* of the values in LHS that are defined as a copy from RHS
+ // should now get updated.
+ for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end();
+ i != e; ++i) {
+ VNInfo *VNI = *i;
+ unsigned VN = VNI->id;
+ if (unsigned LHSSrcReg = li_->getVNInfoSourceReg(VNI)) {
+ if (LHSSrcReg != RHS.reg) {
+ // If this is not a copy from the RHS, its value number will be
+ // unmodified by the coalescing.
+ NewVNInfo[VN] = VNI;
+ LHSValNoAssignments[VN] = VN;
+ } else if (RHSValID == -1) {
+ // Otherwise, it is a copy from the RHS, and we don't already have a
+ // value# for it. Keep the current value number, but remember it.
+ LHSValNoAssignments[VN] = RHSValID = VN;
+ NewVNInfo[VN] = RHSValNoInfo;
+ LHSValsDefinedFromRHS[VNI] = RHSValNoInfo0;
+ } else {
+ // Otherwise, use the specified value #.
+ LHSValNoAssignments[VN] = RHSValID;
+ if (VN == (unsigned)RHSValID) { // Else this val# is dead.
+ NewVNInfo[VN] = RHSValNoInfo;
+ LHSValsDefinedFromRHS[VNI] = RHSValNoInfo0;
+ }
+ }
+ } else {
+ NewVNInfo[VN] = VNI;
+ LHSValNoAssignments[VN] = VN;
+ }
+ }
+
+ assert(RHSValID != -1 && "Didn't find value #?");
+ RHSValNoAssignments[0] = RHSValID;
+ if (RHSVal0DefinedFromLHS != -1) {
+ // This path doesn't go through ComputeUltimateVN so just set
+ // it to anything.
+ RHSValsDefinedFromLHS[RHSValNoInfo0] = (VNInfo*)1;
+ }
+ } else {
+ // Loop over the value numbers of the LHS, seeing if any are defined from
+ // the RHS.
+ for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end();
+ i != e; ++i) {
+ VNInfo *VNI = *i;
+ if (VNI->def == ~1U || VNI->copy == 0) // Src not defined by a copy?
+ continue;
+
+ // DstReg is known to be a register in the LHS interval. If the src is
+ // from the RHS interval, we can use its value #.
+ if (li_->getVNInfoSourceReg(VNI) != RHS.reg)
+ continue;
+
+ // Figure out the value # from the RHS.
+ LHSValsDefinedFromRHS[VNI]=RHS.getLiveRangeContaining(VNI->def-1)->valno;
+ }
+
+ // Loop over the value numbers of the RHS, seeing if any are defined from
+ // the LHS.
+ for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end();
+ i != e; ++i) {
+ VNInfo *VNI = *i;
+ if (VNI->def == ~1U || VNI->copy == 0) // Src not defined by a copy?
+ continue;
+
+ // DstReg is known to be a register in the RHS interval. If the src is
+ // from the LHS interval, we can use its value #.
+ if (li_->getVNInfoSourceReg(VNI) != LHS.reg)
+ continue;
+
+ // Figure out the value # from the LHS.
+ RHSValsDefinedFromLHS[VNI]=LHS.getLiveRangeContaining(VNI->def-1)->valno;
+ }
+
+ LHSValNoAssignments.resize(LHS.getNumValNums(), -1);
+ RHSValNoAssignments.resize(RHS.getNumValNums(), -1);
+ NewVNInfo.reserve(LHS.getNumValNums() + RHS.getNumValNums());
+
+ for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end();
+ i != e; ++i) {
+ VNInfo *VNI = *i;
+ unsigned VN = VNI->id;
+ if (LHSValNoAssignments[VN] >= 0 || VNI->def == ~1U)
+ continue;
+ ComputeUltimateVN(VNI, NewVNInfo,
+ LHSValsDefinedFromRHS, RHSValsDefinedFromLHS,
+ LHSValNoAssignments, RHSValNoAssignments);
+ }
+ for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end();
+ i != e; ++i) {
+ VNInfo *VNI = *i;
+ unsigned VN = VNI->id;
+ if (RHSValNoAssignments[VN] >= 0 || VNI->def == ~1U)
+ continue;
+ // If this value number isn't a copy from the LHS, it's a new number.
+ if (RHSValsDefinedFromLHS.find(VNI) == RHSValsDefinedFromLHS.end()) {
+ NewVNInfo.push_back(VNI);
+ RHSValNoAssignments[VN] = NewVNInfo.size()-1;
+ continue;
+ }
+
+ ComputeUltimateVN(VNI, NewVNInfo,
+ RHSValsDefinedFromLHS, LHSValsDefinedFromRHS,
+ RHSValNoAssignments, LHSValNoAssignments);
+ }
+ }
+
+ // Armed with the mappings of LHS/RHS values to ultimate values, walk the
+ // interval lists to see if these intervals are coalescable.
+ LiveInterval::const_iterator I = LHS.begin();
+ LiveInterval::const_iterator IE = LHS.end();
+ LiveInterval::const_iterator J = RHS.begin();
+ LiveInterval::const_iterator JE = RHS.end();
+
+ // Skip ahead until the first place of potential sharing.
+ if (I->start < J->start) {
+ I = std::upper_bound(I, IE, J->start);
+ if (I != LHS.begin()) --I;
+ } else if (J->start < I->start) {
+ J = std::upper_bound(J, JE, I->start);
+ if (J != RHS.begin()) --J;
+ }
+
+ while (1) {
+ // Determine if these two live ranges overlap.
+ bool Overlaps;
+ if (I->start < J->start) {
+ Overlaps = I->end > J->start;
+ } else {
+ Overlaps = J->end > I->start;
+ }
+
+ // If so, check value # info to determine if they are really different.
+ if (Overlaps) {
+ // If the live range overlap will map to the same value number in the
+ // result liverange, we can still coalesce them. If not, we can't.
+ if (LHSValNoAssignments[I->valno->id] !=
+ RHSValNoAssignments[J->valno->id])
+ return false;
+ }
+
+ if (I->end < J->end) {
+ ++I;
+ if (I == IE) break;
+ } else {
+ ++J;
+ if (J == JE) break;
+ }
+ }
+
+ // Update kill info. Some live ranges are extended due to copy coalescing.
+ for (DenseMap<VNInfo*, VNInfo*>::iterator I = LHSValsDefinedFromRHS.begin(),
+ E = LHSValsDefinedFromRHS.end(); I != E; ++I) {
+ VNInfo *VNI = I->first;
+ unsigned LHSValID = LHSValNoAssignments[VNI->id];
+ LiveInterval::removeKill(NewVNInfo[LHSValID], VNI->def);
+ NewVNInfo[LHSValID]->hasPHIKill |= VNI->hasPHIKill;
+ RHS.addKills(NewVNInfo[LHSValID], VNI->kills);
+ }
+
+ // Update kill info. Some live ranges are extended due to copy coalescing.
+ for (DenseMap<VNInfo*, VNInfo*>::iterator I = RHSValsDefinedFromLHS.begin(),
+ E = RHSValsDefinedFromLHS.end(); I != E; ++I) {
+ VNInfo *VNI = I->first;
+ unsigned RHSValID = RHSValNoAssignments[VNI->id];
+ LiveInterval::removeKill(NewVNInfo[RHSValID], VNI->def);
+ NewVNInfo[RHSValID]->hasPHIKill |= VNI->hasPHIKill;
+ LHS.addKills(NewVNInfo[RHSValID], VNI->kills);
+ }
+
+ // If we get here, we know that we can coalesce the live ranges. Ask the
+ // intervals to coalesce themselves now.
+ if ((RHS.ranges.size() > LHS.ranges.size() &&
+ TargetRegisterInfo::isVirtualRegister(LHS.reg)) ||
+ TargetRegisterInfo::isPhysicalRegister(RHS.reg)) {
+ RHS.join(LHS, &RHSValNoAssignments[0], &LHSValNoAssignments[0], NewVNInfo);
+ Swapped = true;
+ } else {
+ LHS.join(RHS, &LHSValNoAssignments[0], &RHSValNoAssignments[0], NewVNInfo);
+ Swapped = false;
+ }
+ return true;
+}
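+
+// Illustrative trace (hypothetical value numbers): if LHS has val#0 and
+// val#1 (neither a copy) and RHS has val#0 (not a copy) plus val#1 defined
+// as a copy from LHS val#1, the general path yields
+// LHSValNoAssignments = {0, 1} and RHSValNoAssignments = {2, 1}: the
+// copy-related numbers collapse onto ultimate value #1, and overlapping
+// ranges are accepted only where the assigned numbers agree.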
+
+namespace {
+  // DepthMBBCompare - Comparison predicate that sorts first based on the loop
+ // depth of the basic block (the unsigned), and then on the MBB number.
+ struct DepthMBBCompare {
+ typedef std::pair<unsigned, MachineBasicBlock*> DepthMBBPair;
+ bool operator()(const DepthMBBPair &LHS, const DepthMBBPair &RHS) const {
+ if (LHS.first > RHS.first) return true; // Deeper loops first
+ return LHS.first == RHS.first &&
+ LHS.second->getNumber() < RHS.second->getNumber();
+ }
+ };
+}
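+
+// For example, blocks (depth 2, MBB#5), (depth 1, MBB#0) and (depth 2,
+// MBB#3) sort as (2,#3), (2,#5), (1,#0): deeper loops first, ties broken by
+// lower block number.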
+
+/// getRepIntervalSize - Returns the size of the interval that represents the
+/// specified register.
+template<class SF>
+unsigned JoinPriorityQueue<SF>::getRepIntervalSize(unsigned Reg) {
+ return Rc->getRepIntervalSize(Reg);
+}
+
+/// CopyRecSort::operator() - Join priority queue sorting function.
+///
+bool CopyRecSort::operator()(CopyRec left, CopyRec right) const {
+ // Inner loops first.
+ if (left.LoopDepth > right.LoopDepth)
+ return false;
+ else if (left.LoopDepth == right.LoopDepth)
+ if (left.isBackEdge && !right.isBackEdge)
+ return false;
+ return true;
+}
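+
+// Note that std::priority_queue treats operator() above as less-than, so
+// pop() yields the copy at the greatest loop depth first, with back-edge
+// copies ahead of others at the same depth.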
+
+void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
+ std::vector<CopyRec> &TryAgain) {
+ DOUT << ((Value*)MBB->getBasicBlock())->getName() << ":\n";
+
+ std::vector<CopyRec> VirtCopies;
+ std::vector<CopyRec> PhysCopies;
+ std::vector<CopyRec> ImpDefCopies;
+ unsigned LoopDepth = loopInfo->getLoopDepth(MBB);
+ for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
+ MII != E;) {
+ MachineInstr *Inst = MII++;
+
+    // If this isn't a copy or an extract_subreg, we can't join intervals.
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (Inst->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
+ DstReg = Inst->getOperand(0).getReg();
+ SrcReg = Inst->getOperand(1).getReg();
+ } else if (Inst->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+ Inst->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
+ DstReg = Inst->getOperand(0).getReg();
+ SrcReg = Inst->getOperand(2).getReg();
+ } else if (!tii_->isMoveInstr(*Inst, SrcReg, DstReg, SrcSubIdx, DstSubIdx))
+ continue;
+
+ bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
+ bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ if (NewHeuristic) {
+ JoinQueue->push(CopyRec(Inst, LoopDepth, isBackEdgeCopy(Inst, DstReg)));
+ } else {
+ if (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty())
+ ImpDefCopies.push_back(CopyRec(Inst, 0, false));
+ else if (SrcIsPhys || DstIsPhys)
+ PhysCopies.push_back(CopyRec(Inst, 0, false));
+ else
+ VirtCopies.push_back(CopyRec(Inst, 0, false));
+ }
+ }
+
+ if (NewHeuristic)
+ return;
+
+ // Try coalescing implicit copies first, followed by copies to / from
+ // physical registers, then finally copies from virtual registers to
+ // virtual registers.
+ for (unsigned i = 0, e = ImpDefCopies.size(); i != e; ++i) {
+ CopyRec &TheCopy = ImpDefCopies[i];
+ bool Again = false;
+ if (!JoinCopy(TheCopy, Again))
+ if (Again)
+ TryAgain.push_back(TheCopy);
+ }
+ for (unsigned i = 0, e = PhysCopies.size(); i != e; ++i) {
+ CopyRec &TheCopy = PhysCopies[i];
+ bool Again = false;
+ if (!JoinCopy(TheCopy, Again))
+ if (Again)
+ TryAgain.push_back(TheCopy);
+ }
+ for (unsigned i = 0, e = VirtCopies.size(); i != e; ++i) {
+ CopyRec &TheCopy = VirtCopies[i];
+ bool Again = false;
+ if (!JoinCopy(TheCopy, Again))
+ if (Again)
+ TryAgain.push_back(TheCopy);
+ }
+}
+
+void SimpleRegisterCoalescing::joinIntervals() {
+ DOUT << "********** JOINING INTERVALS ***********\n";
+
+ if (NewHeuristic)
+ JoinQueue = new JoinPriorityQueue<CopyRecSort>(this);
+
+ std::vector<CopyRec> TryAgainList;
+ if (loopInfo->empty()) {
+ // If there are no loops in the function, join intervals in function order.
+ for (MachineFunction::iterator I = mf_->begin(), E = mf_->end();
+ I != E; ++I)
+ CopyCoalesceInMBB(I, TryAgainList);
+ } else {
+ // Otherwise, join intervals in inner loops before other intervals.
+    // Unfortunately we can't just iterate over the loop hierarchy here because
+ // there may be more MBB's than BB's. Collect MBB's for sorting.
+
+ // Join intervals in the function prolog first. We want to join physical
+    // registers with virtual registers before the intervals get too long.
+ std::vector<std::pair<unsigned, MachineBasicBlock*> > MBBs;
+ for (MachineFunction::iterator I = mf_->begin(), E = mf_->end();I != E;++I){
+ MachineBasicBlock *MBB = I;
+ MBBs.push_back(std::make_pair(loopInfo->getLoopDepth(MBB), I));
+ }
+
+ // Sort by loop depth.
+ std::sort(MBBs.begin(), MBBs.end(), DepthMBBCompare());
+
+ // Finally, join intervals in loop nest order.
+ for (unsigned i = 0, e = MBBs.size(); i != e; ++i)
+ CopyCoalesceInMBB(MBBs[i].second, TryAgainList);
+ }
+
+ // Joining intervals can allow other intervals to be joined. Iteratively join
+ // until we make no progress.
+ if (NewHeuristic) {
+ SmallVector<CopyRec, 16> TryAgain;
+ bool ProgressMade = true;
+ while (ProgressMade) {
+ ProgressMade = false;
+ while (!JoinQueue->empty()) {
+ CopyRec R = JoinQueue->pop();
+ bool Again = false;
+ bool Success = JoinCopy(R, Again);
+ if (Success)
+ ProgressMade = true;
+ else if (Again)
+ TryAgain.push_back(R);
+ }
+
+ if (ProgressMade) {
+ while (!TryAgain.empty()) {
+ JoinQueue->push(TryAgain.back());
+ TryAgain.pop_back();
+ }
+ }
+ }
+ } else {
+ bool ProgressMade = true;
+ while (ProgressMade) {
+ ProgressMade = false;
+
+ for (unsigned i = 0, e = TryAgainList.size(); i != e; ++i) {
+ CopyRec &TheCopy = TryAgainList[i];
+ if (TheCopy.MI) {
+ bool Again = false;
+ bool Success = JoinCopy(TheCopy, Again);
+ if (Success || !Again) {
+ TheCopy.MI = 0; // Mark this one as done.
+ ProgressMade = true;
+ }
+ }
+ }
+ }
+ }
+
+ if (NewHeuristic)
+ delete JoinQueue;
+}
+
+/// Return true if the two specified registers belong to different register
+/// classes. The registers may be either phys or virt regs.
+bool
+SimpleRegisterCoalescing::differingRegisterClasses(unsigned RegA,
+ unsigned RegB) const {
+ // Get the register classes for the first reg.
+ if (TargetRegisterInfo::isPhysicalRegister(RegA)) {
+ assert(TargetRegisterInfo::isVirtualRegister(RegB) &&
+ "Shouldn't consider two physregs!");
+ return !mri_->getRegClass(RegB)->contains(RegA);
+ }
+
+ // Compare against the regclass for the second reg.
+ const TargetRegisterClass *RegClassA = mri_->getRegClass(RegA);
+ if (TargetRegisterInfo::isVirtualRegister(RegB)) {
+ const TargetRegisterClass *RegClassB = mri_->getRegClass(RegB);
+ return RegClassA != RegClassB;
+ }
+ return !RegClassA->contains(RegB);
+}
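+
+// For example (x86, hypothetical vregs): a GR32 virtual register and the
+// physical register EAX do not differ, since GR32 contains EAX; a GR32
+// virtual register and a GR16 virtual register do.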
+
+/// lastRegisterUse - Returns the last use of the specified register between
+/// cycles Start and End or NULL if there are no uses.
+MachineOperand *
+SimpleRegisterCoalescing::lastRegisterUse(unsigned Start, unsigned End,
+ unsigned Reg, unsigned &UseIdx) const{
+ UseIdx = 0;
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ MachineOperand *LastUse = NULL;
+ for (MachineRegisterInfo::use_iterator I = mri_->use_begin(Reg),
+ E = mri_->use_end(); I != E; ++I) {
+ MachineOperand &Use = I.getOperand();
+ MachineInstr *UseMI = Use.getParent();
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) &&
+ SrcReg == DstReg)
+ // Ignore identity copies.
+ continue;
+ unsigned Idx = li_->getInstructionIndex(UseMI);
+ if (Idx >= Start && Idx < End && Idx >= UseIdx) {
+ LastUse = &Use;
+ UseIdx = li_->getUseIndex(Idx);
+ }
+ }
+ return LastUse;
+ }
+
+ int e = (End-1) / InstrSlots::NUM * InstrSlots::NUM;
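+  // e.g. with InstrSlots::NUM == 4 and End == 14, e starts at 12, the base
+  // index of the instruction containing slot 13; the loop below then scans
+  // backwards one instruction (NUM slots) at a time.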
+ int s = Start;
+ while (e >= s) {
+ // Skip deleted instructions
+ MachineInstr *MI = li_->getInstructionFromIndex(e);
+ while ((e - InstrSlots::NUM) >= s && !MI) {
+ e -= InstrSlots::NUM;
+ MI = li_->getInstructionFromIndex(e);
+ }
+ if (e < s || MI == NULL)
+ return NULL;
+
+ // Ignore identity copies.
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (!(tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) &&
+ SrcReg == DstReg))
+ for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) {
+ MachineOperand &Use = MI->getOperand(i);
+ if (Use.isReg() && Use.isUse() && Use.getReg() &&
+ tri_->regsOverlap(Use.getReg(), Reg)) {
+ UseIdx = li_->getUseIndex(e);
+ return &Use;
+ }
+ }
+
+ e -= InstrSlots::NUM;
+ }
+
+ return NULL;
+}
+
+
+void SimpleRegisterCoalescing::printRegName(unsigned reg) const {
+ if (TargetRegisterInfo::isPhysicalRegister(reg))
+ cerr << tri_->getName(reg);
+ else
+ cerr << "%reg" << reg;
+}
+
+void SimpleRegisterCoalescing::releaseMemory() {
+ JoinedCopies.clear();
+ ReMatCopies.clear();
+ ReMatDefs.clear();
+}
+
+static bool isZeroLengthInterval(LiveInterval *li) {
+ for (LiveInterval::Ranges::const_iterator
+ i = li->ranges.begin(), e = li->ranges.end(); i != e; ++i)
+ if (i->end - i->start > LiveInterval::InstrSlots::NUM)
+ return false;
+ return true;
+}
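+
+// Each instruction occupies InstrSlots::NUM (4) consecutive index slots, so
+// a range no longer than NUM, e.g. [4,8), lives across a single instruction:
+// the use immediately follows the def, which is what "zero length" means
+// here.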
+
+/// TurnCopyIntoImpDef - If the source of the specified copy is an implicit
+/// def, turn the copy into an implicit def.
+bool
+SimpleRegisterCoalescing::TurnCopyIntoImpDef(MachineBasicBlock::iterator &I,
+ MachineBasicBlock *MBB,
+ unsigned DstReg, unsigned SrcReg) {
+ MachineInstr *CopyMI = &*I;
+ unsigned CopyIdx = li_->getDefIndex(li_->getInstructionIndex(CopyMI));
+ if (!li_->hasInterval(SrcReg))
+ return false;
+ LiveInterval &SrcInt = li_->getInterval(SrcReg);
+ if (!SrcInt.empty())
+ return false;
+ if (!li_->hasInterval(DstReg))
+ return false;
+ LiveInterval &DstInt = li_->getInterval(DstReg);
+ const LiveRange *DstLR = DstInt.getLiveRangeContaining(CopyIdx);
+ DstInt.removeValNo(DstLR->valno);
+ CopyMI->setDesc(tii_->get(TargetInstrInfo::IMPLICIT_DEF));
+ for (int i = CopyMI->getNumOperands() - 1, e = 0; i > e; --i)
+ CopyMI->RemoveOperand(i);
+ bool NoUse = mri_->use_empty(SrcReg);
+ if (NoUse) {
+ for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(SrcReg),
+ E = mri_->reg_end(); I != E; ) {
+ assert(I.getOperand().isDef());
+ MachineInstr *DefMI = &*I;
+ ++I;
+ // The implicit_def source has no other uses, delete it.
+ assert(DefMI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF);
+ li_->RemoveMachineInstrFromMaps(DefMI);
+ DefMI->eraseFromParent();
+ }
+ }
+ ++I;
+ return true;
+}
+
+
+bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
+ mf_ = &fn;
+ mri_ = &fn.getRegInfo();
+ tm_ = &fn.getTarget();
+ tri_ = tm_->getRegisterInfo();
+ tii_ = tm_->getInstrInfo();
+ li_ = &getAnalysis<LiveIntervals>();
+ loopInfo = &getAnalysis<MachineLoopInfo>();
+
+ DOUT << "********** SIMPLE REGISTER COALESCING **********\n"
+ << "********** Function: "
+ << ((Value*)mf_->getFunction())->getName() << '\n';
+
+ allocatableRegs_ = tri_->getAllocatableSet(fn);
+ for (TargetRegisterInfo::regclass_iterator I = tri_->regclass_begin(),
+ E = tri_->regclass_end(); I != E; ++I)
+ allocatableRCRegs_.insert(std::make_pair(*I,
+ tri_->getAllocatableSet(fn, *I)));
+
+ // Join (coalesce) intervals if requested.
+ if (EnableJoining) {
+ joinIntervals();
+ DEBUG({
+ DOUT << "********** INTERVALS POST JOINING **********\n";
+ for (LiveIntervals::iterator I = li_->begin(), E = li_->end(); I != E; ++I){
+ I->second->print(DOUT, tri_);
+ DOUT << "\n";
+ }
+ });
+ }
+
+ // Perform a final pass over the instructions and compute spill weights
+ // and remove identity moves.
+ SmallVector<unsigned, 4> DeadDefs;
+ for (MachineFunction::iterator mbbi = mf_->begin(), mbbe = mf_->end();
+ mbbi != mbbe; ++mbbi) {
+ MachineBasicBlock* mbb = mbbi;
+ unsigned loopDepth = loopInfo->getLoopDepth(mbb);
+
+ for (MachineBasicBlock::iterator mii = mbb->begin(), mie = mbb->end();
+ mii != mie; ) {
+ MachineInstr *MI = mii;
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ if (JoinedCopies.count(MI)) {
+ // Delete all coalesced copies.
+ if (!tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) {
+ assert((MI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG ||
+ MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+ MI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) &&
+ "Unrecognized copy instruction");
+ DstReg = MI->getOperand(0).getReg();
+ }
+ if (MI->registerDefIsDead(DstReg)) {
+ LiveInterval &li = li_->getInterval(DstReg);
+ if (!ShortenDeadCopySrcLiveRange(li, MI))
+ ShortenDeadCopyLiveRange(li, MI);
+ }
+ li_->RemoveMachineInstrFromMaps(MI);
+ mii = mbbi->erase(mii);
+ ++numPeep;
+ continue;
+ }
+
+ // Now check if this is a remat'ed def instruction which is now dead.
+ if (ReMatDefs.count(MI)) {
+ bool isDead = true;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ DeadDefs.push_back(Reg);
+ if (MO.isDead())
+ continue;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ !mri_->use_empty(Reg)) {
+ isDead = false;
+ break;
+ }
+ }
+ if (isDead) {
+ while (!DeadDefs.empty()) {
+ unsigned DeadDef = DeadDefs.back();
+ DeadDefs.pop_back();
+ RemoveDeadDef(li_->getInterval(DeadDef), MI);
+ }
+ li_->RemoveMachineInstrFromMaps(mii);
+ mii = mbbi->erase(mii);
+ continue;
+ } else
+ DeadDefs.clear();
+ }
+
+      // If the move will be an identity move, delete it.
+ bool isMove= tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);
+ if (isMove && SrcReg == DstReg) {
+ if (li_->hasInterval(SrcReg)) {
+ LiveInterval &RegInt = li_->getInterval(SrcReg);
+          // If the def of this move instruction is dead, remove its live range
+          // from the destination register's live interval.
+ if (MI->registerDefIsDead(DstReg)) {
+ if (!ShortenDeadCopySrcLiveRange(RegInt, MI))
+ ShortenDeadCopyLiveRange(RegInt, MI);
+ }
+ }
+ li_->RemoveMachineInstrFromMaps(MI);
+ mii = mbbi->erase(mii);
+ ++numPeep;
+ } else if (!isMove || !TurnCopyIntoImpDef(mii, mbb, DstReg, SrcReg)) {
+ SmallSet<unsigned, 4> UniqueUses;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &mop = MI->getOperand(i);
+ if (mop.isReg() && mop.getReg() &&
+ TargetRegisterInfo::isVirtualRegister(mop.getReg())) {
+ unsigned reg = mop.getReg();
+ // Multiple uses of reg by the same instruction. It should not
+ // contribute to spill weight again.
+ if (UniqueUses.count(reg) != 0)
+ continue;
+ LiveInterval &RegInt = li_->getInterval(reg);
+ RegInt.weight +=
+ li_->getSpillWeight(mop.isDef(), mop.isUse(), loopDepth);
+ UniqueUses.insert(reg);
+ }
+ }
+ ++mii;
+ }
+ }
+ }
+
+ for (LiveIntervals::iterator I = li_->begin(), E = li_->end(); I != E; ++I) {
+ LiveInterval &LI = *I->second;
+ if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
+ // If the live interval length is essentially zero, i.e. in every live
+ // range the use follows def immediately, it doesn't make sense to spill
+ // it and hope it will be easier to allocate for this li.
+ if (isZeroLengthInterval(&LI))
+ LI.weight = HUGE_VALF;
+ else {
+ bool isLoad = false;
+ SmallVector<LiveInterval*, 4> SpillIs;
+ if (li_->isReMaterializable(LI, SpillIs, isLoad)) {
+ // If all of the definitions of the interval are re-materializable,
+            // it is a preferred candidate for spilling. If none of the defs are
+ // loads, then it's potentially very cheap to re-materialize.
+ // FIXME: this gets much more complicated once we support non-trivial
+ // re-materialization.
+ if (isLoad)
+ LI.weight *= 0.9F;
+ else
+ LI.weight *= 0.5F;
+ }
+ }
+
+ // Slightly prefer live interval that has been assigned a preferred reg.
+ if (LI.preference)
+ LI.weight *= 1.01F;
+
+ // Divide the weight of the interval by its size. This encourages
+ // spilling of intervals that are large and have few uses, and
+ // discourages spilling of small intervals with many uses.
+ LI.weight /= li_->getApproximateInstructionCount(LI) * InstrSlots::NUM;
+ }
+ }
+
+ DEBUG(dump());
+ return true;
+}
+
+/// print - Implement the dump method.
+void SimpleRegisterCoalescing::print(std::ostream &O, const Module* m) const {
+ li_->print(O, m);
+}
+
+RegisterCoalescer* llvm::createSimpleRegisterCoalescer() {
+ return new SimpleRegisterCoalescing();
+}
+
+// Make sure that anything that uses RegisterCoalescer pulls in this file...
+DEFINING_FILE_FOR(SimpleRegisterCoalescing)
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h
new file mode 100644
index 0000000..a495bfd
--- /dev/null
+++ b/lib/CodeGen/SimpleRegisterCoalescing.h
@@ -0,0 +1,313 @@
+//===-- SimpleRegisterCoalescing.h - Register Coalescing --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple register copy coalescing phase.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SIMPLE_REGISTER_COALESCING_H
+#define LLVM_CODEGEN_SIMPLE_REGISTER_COALESCING_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/ADT/BitVector.h"
+#include <queue>
+
+namespace llvm {
+ class SimpleRegisterCoalescing;
+ class LiveVariables;
+ class TargetRegisterInfo;
+ class TargetInstrInfo;
+ class VirtRegMap;
+ class MachineLoopInfo;
+
+ /// CopyRec - Representation for copy instructions in coalescer queue.
+ ///
+ struct CopyRec {
+ MachineInstr *MI;
+ unsigned LoopDepth;
+ bool isBackEdge;
+ CopyRec(MachineInstr *mi, unsigned depth, bool be)
+      : MI(mi), LoopDepth(depth), isBackEdge(be) {}
+ };
+
+ template<class SF> class JoinPriorityQueue;
+
+ /// CopyRecSort - Sorting function for coalescer queue.
+ ///
+ struct CopyRecSort : public std::binary_function<CopyRec,CopyRec,bool> {
+ JoinPriorityQueue<CopyRecSort> *JPQ;
+ explicit CopyRecSort(JoinPriorityQueue<CopyRecSort> *jpq) : JPQ(jpq) {}
+ CopyRecSort(const CopyRecSort &RHS) : JPQ(RHS.JPQ) {}
+ bool operator()(CopyRec left, CopyRec right) const;
+ };
+
+ /// JoinQueue - A priority queue of copy instructions the coalescer is
+ /// going to process.
+ template<class SF>
+ class JoinPriorityQueue {
+ SimpleRegisterCoalescing *Rc;
+ std::priority_queue<CopyRec, std::vector<CopyRec>, SF> Queue;
+
+ public:
+ explicit JoinPriorityQueue(SimpleRegisterCoalescing *rc)
+ : Rc(rc), Queue(SF(this)) {}
+
+ bool empty() const { return Queue.empty(); }
+ void push(CopyRec R) { Queue.push(R); }
+ CopyRec pop() {
+ if (empty()) return CopyRec(0, 0, false);
+ CopyRec R = Queue.top();
+ Queue.pop();
+ return R;
+ }
+
+ // Callbacks to SimpleRegisterCoalescing.
+ unsigned getRepIntervalSize(unsigned Reg);
+ };
+
+ class SimpleRegisterCoalescing : public MachineFunctionPass,
+ public RegisterCoalescer {
+ MachineFunction* mf_;
+ MachineRegisterInfo* mri_;
+ const TargetMachine* tm_;
+ const TargetRegisterInfo* tri_;
+ const TargetInstrInfo* tii_;
+ LiveIntervals *li_;
+ const MachineLoopInfo* loopInfo;
+
+ BitVector allocatableRegs_;
+ DenseMap<const TargetRegisterClass*, BitVector> allocatableRCRegs_;
+
+ /// JoinQueue - A priority queue of copy instructions the coalescer is
+ /// going to process.
+ JoinPriorityQueue<CopyRecSort> *JoinQueue;
+
+ /// JoinedCopies - Keep track of copies eliminated due to coalescing.
+ ///
+ SmallPtrSet<MachineInstr*, 32> JoinedCopies;
+
+ /// ReMatCopies - Keep track of copies eliminated due to remat.
+ ///
+ SmallPtrSet<MachineInstr*, 32> ReMatCopies;
+
+ /// ReMatDefs - Keep track of definition instructions which have
+ /// been remat'ed.
+ SmallPtrSet<MachineInstr*, 8> ReMatDefs;
+
+ public:
+    static char ID; // Pass identification, replacement for typeid
+ SimpleRegisterCoalescing() : MachineFunctionPass(&ID) {}
+
+ struct InstrSlots {
+ enum {
+ LOAD = 0,
+ USE = 1,
+ DEF = 2,
+ STORE = 3,
+ NUM = 4
+ };
+ };
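+    // Assuming this mirrors LiveIntervals' slot numbering: an instruction's
+    // base index is a multiple of NUM, and its load/use/def/store points sit
+    // at base+LOAD, base+USE, base+DEF and base+STORE respectively.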
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual void releaseMemory();
+
+ /// runOnMachineFunction - pass entry point
+ virtual bool runOnMachineFunction(MachineFunction&);
+
+ bool coalesceFunction(MachineFunction &mf, RegallocQuery &) {
+ // This runs as an independent pass, so don't do anything.
+ return false;
+    }
+
+ /// getRepIntervalSize - Called from join priority queue sorting function.
+    /// It returns the size of the interval that represents the given register.
+ unsigned getRepIntervalSize(unsigned Reg) {
+ if (!li_->hasInterval(Reg))
+ return 0;
+ return li_->getApproximateInstructionCount(li_->getInterval(Reg)) *
+ LiveInterval::InstrSlots::NUM;
+ }
+
+ /// print - Implement the dump method.
+ virtual void print(std::ostream &O, const Module* = 0) const;
+ void print(std::ostream *O, const Module* M = 0) const {
+ if (O) print(*O, M);
+ }
+
+ private:
+ /// joinIntervals - join compatible live intervals
+ void joinIntervals();
+
+ /// CopyCoalesceInMBB - Coalesce copies in the specified MBB, putting
+ /// copies that cannot yet be coalesced into the "TryAgain" list.
+ void CopyCoalesceInMBB(MachineBasicBlock *MBB,
+ std::vector<CopyRec> &TryAgain);
+
+ /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+ /// which are the src/dst of the copy instruction CopyMI. This returns true
+ /// if the copy was successfully coalesced away. If it is not currently
+ /// possible to coalesce this interval, but it may be possible if other
+ /// things get coalesced, then it returns true by reference in 'Again'.
+ bool JoinCopy(CopyRec &TheCopy, bool &Again);
+
+ /// JoinIntervals - Attempt to join these two intervals. On failure, this
+ /// returns false. Otherwise, if one of the intervals being joined is a
+    /// physreg, this method always canonicalizes LHS to be it. The output
+    /// "RHS" will not have been modified, so we can use this information
+ /// below to update aliases.
+ bool JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, bool &Swapped);
+
+ /// SimpleJoin - Attempt to join the specified interval into this one. The
+ /// caller of this method must guarantee that the RHS only contains a single
+ /// value number and that the RHS is not defined by a copy from this
+ /// interval. This returns false if the intervals are not joinable, or it
+ /// joins them and returns true.
+ bool SimpleJoin(LiveInterval &LHS, LiveInterval &RHS);
+
+ /// Return true if the two specified registers belong to different register
+ /// classes. The registers may be either phys or virt regs.
+ bool differingRegisterClasses(unsigned RegA, unsigned RegB) const;
+
+
+    /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
+    /// the source value number is defined by a copy from the destination reg,
+    /// see if we can merge these two destination reg value numbers into a
+    /// single value number, eliminating a copy.
+ bool AdjustCopiesBackFrom(LiveInterval &IntA, LiveInterval &IntB,
+ MachineInstr *CopyMI);
+
+ /// HasOtherReachingDefs - Return true if there are definitions of IntB
+ /// other than BValNo val# that can reach uses of AValno val# of IntA.
+ bool HasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
+ VNInfo *AValNo, VNInfo *BValNo);
+
+ /// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy.
+ /// If the source value number is defined by a commutable instruction and
+ /// its other operand is coalesced to the copy dest register, see if we
+ /// can transform the copy into a noop by commuting the definition.
+ bool RemoveCopyByCommutingDef(LiveInterval &IntA, LiveInterval &IntB,
+ MachineInstr *CopyMI);
+
+ /// TrimLiveIntervalToLastUse - If there is a last use in the same basic
+    /// block as the copy instruction, trim the live interval to the last use
+ /// and return true.
+ bool TrimLiveIntervalToLastUse(unsigned CopyIdx,
+ MachineBasicBlock *CopyMBB,
+ LiveInterval &li, const LiveRange *LR);
+
+ /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
+    /// computation, replace the copy by rematerializing the definition.
+ bool ReMaterializeTrivialDef(LiveInterval &SrcInt, unsigned DstReg,
+ MachineInstr *CopyMI);
+
+ /// TurnCopyIntoImpDef - If source of the specified copy is an implicit def,
+ /// turn the copy into an implicit def.
+ bool TurnCopyIntoImpDef(MachineBasicBlock::iterator &I,
+ MachineBasicBlock *MBB,
+ unsigned DstReg, unsigned SrcReg);
+
+ /// CanCoalesceWithImpDef - Returns true if the specified copy instruction
+ /// from an implicit def to another register can be coalesced away.
+ bool CanCoalesceWithImpDef(MachineInstr *CopyMI,
+ LiveInterval &li, LiveInterval &ImpLi) const;
+
+ /// RemoveCopiesFromValNo - The specified value# is defined by an implicit
+ /// def and it is being removed. Turn all copies from this value# into
+ /// identity copies so they will be removed.
+ void RemoveCopiesFromValNo(LiveInterval &li, VNInfo *VNI);
+
+    /// isWinToJoinVRWithSrcPhysReg - Return true if it's worthwhile to join
+    /// a virtual destination register with a physical source register.
+ bool isWinToJoinVRWithSrcPhysReg(MachineInstr *CopyMI,
+ MachineBasicBlock *CopyMBB,
+ LiveInterval &DstInt, LiveInterval &SrcInt);
+
+    /// isWinToJoinVRWithDstPhysReg - Return true if it's worthwhile to join a
+ /// copy from a virtual source register to a physical destination register.
+ bool isWinToJoinVRWithDstPhysReg(MachineInstr *CopyMI,
+ MachineBasicBlock *CopyMBB,
+ LiveInterval &DstInt, LiveInterval &SrcInt);
+
+ /// isWinToJoinCrossClass - Return true if it's profitable to coalesce
+ /// two virtual registers from different register classes.
+ bool isWinToJoinCrossClass(unsigned LargeReg, unsigned SmallReg,
+ unsigned Threshold);
+
+ /// HasIncompatibleSubRegDefUse - If we are trying to coalesce a virtual
+ /// register with a physical register, check if any of the virtual register
+    /// operands is a sub-register use or def. If so, make sure it won't result
+ /// in an illegal extract_subreg or insert_subreg instruction.
+ bool HasIncompatibleSubRegDefUse(MachineInstr *CopyMI,
+ unsigned VirtReg, unsigned PhysReg);
+
+ /// CanJoinExtractSubRegToPhysReg - Return true if it's possible to coalesce
+ /// an extract_subreg where dst is a physical register, e.g.
+ /// cl = EXTRACT_SUBREG reg1024, 1
+ bool CanJoinExtractSubRegToPhysReg(unsigned DstReg, unsigned SrcReg,
+ unsigned SubIdx, unsigned &RealDstReg);
+
+ /// CanJoinInsertSubRegToPhysReg - Return true if it's possible to coalesce
+ /// an insert_subreg where src is a physical register, e.g.
+ /// reg1024 = INSERT_SUBREG reg1024, c1, 0
+ bool CanJoinInsertSubRegToPhysReg(unsigned DstReg, unsigned SrcReg,
+ unsigned SubIdx, unsigned &RealDstReg);
+
+ /// RangeIsDefinedByCopyFromReg - Return true if the specified live range of
+ /// the specified live interval is defined by a copy from the specified
+ /// register.
+ bool RangeIsDefinedByCopyFromReg(LiveInterval &li, LiveRange *LR,
+ unsigned Reg);
+
+ /// isBackEdgeCopy - Return true if CopyMI is a back edge copy.
+ ///
+ bool isBackEdgeCopy(MachineInstr *CopyMI, unsigned DstReg) const;
+
+ /// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
+ /// update the subregister number if it is not zero. If DstReg is a
+ /// physical register and the existing subregister number of the def / use
+ /// being updated is not zero, make sure to set it to the correct physical
+ /// subregister.
+ void UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
+
+ /// RemoveDeadImpDef - Remove implicit_def instructions which are
+    /// "re-defining" registers due to insert_subreg coalescing.
+ void RemoveDeadImpDef(unsigned Reg, LiveInterval &LI);
+
+ /// RemoveUnnecessaryKills - Remove kill markers that are no longer accurate
+ /// due to live range lengthening as the result of coalescing.
+ void RemoveUnnecessaryKills(unsigned Reg, LiveInterval &LI);
+
+ /// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy.
+    /// Return true if the live interval is removed.
+ bool ShortenDeadCopyLiveRange(LiveInterval &li, MachineInstr *CopyMI);
+
+    /// ShortenDeadCopySrcLiveRange - Shorten a live range as it's artificially
+    /// extended by a dead copy. Mark the last use (if any) of the val# as kill
+    /// and end the live range there. If there isn't another use, then this
+    /// live range is dead. Return true if the live interval is removed.
+ bool ShortenDeadCopySrcLiveRange(LiveInterval &li, MachineInstr *CopyMI);
+
+ /// RemoveDeadDef - If a def of a live interval is now determined dead,
+ /// remove the val# it defines. If the live interval becomes empty, remove
+ /// it as well.
+ bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI);
+
+    /// lastRegisterUse - Returns the last use of the specified register between
+ /// cycles Start and End or NULL if there are no uses.
+ MachineOperand *lastRegisterUse(unsigned Start, unsigned End, unsigned Reg,
+ unsigned &LastUseIdx) const;
+
+ void printRegName(unsigned reg) const;
+ };
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
new file mode 100644
index 0000000..ce63121
--- /dev/null
+++ b/lib/CodeGen/Spiller.cpp
@@ -0,0 +1,229 @@
+//===-- llvm/CodeGen/Spiller.cpp - Spiller -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "spiller"
+
+#include "Spiller.h"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+Spiller::~Spiller() {}
+
+namespace {
+
+/// Utility class for spillers.
+class SpillerBase : public Spiller {
+protected:
+
+ MachineFunction *mf;
+ LiveIntervals *lis;
+ LiveStacks *ls;
+ MachineFrameInfo *mfi;
+ MachineRegisterInfo *mri;
+ const TargetInstrInfo *tii;
+ VirtRegMap *vrm;
+
+ /// Construct a spiller base.
+ SpillerBase(MachineFunction *mf, LiveIntervals *lis, LiveStacks *ls, VirtRegMap *vrm) :
+ mf(mf), lis(lis), ls(ls), vrm(vrm)
+ {
+ mfi = mf->getFrameInfo();
+ mri = &mf->getRegInfo();
+ tii = mf->getTarget().getInstrInfo();
+ }
+
+ /// Insert a store of the given vreg to the given stack slot immediately
+ /// after the given instruction. Returns the base index of the inserted
+ /// instruction. The caller is responsible for adding an appropriate
+ /// LiveInterval to the LiveIntervals analysis.
+ unsigned insertStoreFor(MachineInstr *mi, unsigned ss,
+ unsigned newVReg,
+ const TargetRegisterClass *trc) {
+ MachineBasicBlock::iterator nextInstItr(mi);
+ ++nextInstItr;
+
+ if (!lis->hasGapAfterInstr(lis->getInstructionIndex(mi))) {
+ lis->scaleNumbering(2);
+ ls->scaleNumbering(2);
+ }
+
+ unsigned miIdx = lis->getInstructionIndex(mi);
+
+ assert(lis->hasGapAfterInstr(miIdx));
+
+ tii->storeRegToStackSlot(*mi->getParent(), nextInstItr, newVReg,
+ true, ss, trc);
+ MachineBasicBlock::iterator storeInstItr(mi);
+ ++storeInstItr;
+ MachineInstr *storeInst = &*storeInstItr;
+ unsigned storeInstIdx = miIdx + LiveInterval::InstrSlots::NUM;
+
+ assert(lis->getInstructionFromIndex(storeInstIdx) == 0 &&
+ "Store inst index already in use.");
+
+ lis->InsertMachineInstrInMaps(storeInst, storeInstIdx);
+
+ return storeInstIdx;
+ }
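+
+  /// A sketch of the renumbering idea used above (illustrative, not part of
+  /// the imported code): scaling every index by 2 moves an instruction from
+  /// index i to 2*i, so the slot block at 2*i + InstrSlots::NUM between two
+  /// previously adjacent instructions is guaranteed free for the new store
+  /// (or, symmetrically, the new load in insertLoadFor below).
+  static unsigned scaledIndexSketch(unsigned oldIdx) {
+    return oldIdx * 2; // where an instruction lands after scaleNumbering(2)
+  }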
+
+  /// Insert a load of the given vreg from the given stack slot immediately
+ /// before the given instruction. Returns the base index of the inserted
+ /// instruction. The caller is responsible for adding an appropriate
+ /// LiveInterval to the LiveIntervals analysis.
+ unsigned insertLoadFor(MachineInstr *mi, unsigned ss,
+ unsigned newVReg,
+ const TargetRegisterClass *trc) {
+ MachineBasicBlock::iterator useInstItr(mi);
+
+ if (!lis->hasGapBeforeInstr(lis->getInstructionIndex(mi))) {
+ lis->scaleNumbering(2);
+ ls->scaleNumbering(2);
+ }
+
+ unsigned miIdx = lis->getInstructionIndex(mi);
+
+ assert(lis->hasGapBeforeInstr(miIdx));
+
+ tii->loadRegFromStackSlot(*mi->getParent(), useInstItr, newVReg, ss, trc);
+ MachineBasicBlock::iterator loadInstItr(mi);
+ --loadInstItr;
+ MachineInstr *loadInst = &*loadInstItr;
+ unsigned loadInstIdx = miIdx - LiveInterval::InstrSlots::NUM;
+
+ assert(lis->getInstructionFromIndex(loadInstIdx) == 0 &&
+ "Load inst index already in use.");
+
+ lis->InsertMachineInstrInMaps(loadInst, loadInstIdx);
+
+ return loadInstIdx;
+ }
+
+
+ /// Add spill ranges for every use/def of the live interval, inserting loads
+ /// immediately before each use, and stores after each def. No folding is
+ /// attempted.
+ std::vector<LiveInterval*> trivialSpillEverywhere(LiveInterval *li) {
+ DOUT << "Spilling everywhere " << *li << "\n";
+
+ assert(li->weight != HUGE_VALF &&
+ "Attempting to spill already spilled value.");
+
+ assert(!li->isStackSlot() &&
+ "Trying to spill a stack slot.");
+
+ std::vector<LiveInterval*> added;
+
+ const TargetRegisterClass *trc = mri->getRegClass(li->reg);
+ unsigned ss = vrm->assignVirt2StackSlot(li->reg);
+
+ for (MachineRegisterInfo::reg_iterator
+ regItr = mri->reg_begin(li->reg); regItr != mri->reg_end();) {
+
+ MachineInstr *mi = &*regItr;
+ do {
+ ++regItr;
+ } while (regItr != mri->reg_end() && (&*regItr == mi));
+
+ SmallVector<unsigned, 2> indices;
+ bool hasUse = false;
+ bool hasDef = false;
+
+ for (unsigned i = 0; i != mi->getNumOperands(); ++i) {
+ MachineOperand &op = mi->getOperand(i);
+
+ if (!op.isReg() || op.getReg() != li->reg)
+ continue;
+
+ hasUse |= mi->getOperand(i).isUse();
+ hasDef |= mi->getOperand(i).isDef();
+
+ indices.push_back(i);
+ }
+
+ unsigned newVReg = mri->createVirtualRegister(trc);
+ vrm->grow();
+ vrm->assignVirt2StackSlot(newVReg, ss);
+
+ LiveInterval *newLI = &lis->getOrCreateInterval(newVReg);
+ newLI->weight = HUGE_VALF;
+
+ for (unsigned i = 0; i < indices.size(); ++i) {
+ mi->getOperand(indices[i]).setReg(newVReg);
+
+ if (mi->getOperand(indices[i]).isUse()) {
+ mi->getOperand(indices[i]).setIsKill(true);
+ }
+ }
+
+ assert(hasUse || hasDef);
+
+ if (hasUse) {
+ unsigned loadInstIdx = insertLoadFor(mi, ss, newVReg, trc);
+ unsigned start = lis->getDefIndex(loadInstIdx),
+ end = lis->getUseIndex(lis->getInstructionIndex(mi));
+
+ VNInfo *vni =
+ newLI->getNextValue(loadInstIdx, 0, lis->getVNInfoAllocator());
+ vni->kills.push_back(lis->getInstructionIndex(mi));
+ LiveRange lr(start, end, vni);
+
+ newLI->addRange(lr);
+ }
+
+ if (hasDef) {
+ unsigned storeInstIdx = insertStoreFor(mi, ss, newVReg, trc);
+ unsigned start = lis->getDefIndex(lis->getInstructionIndex(mi)),
+ end = lis->getUseIndex(storeInstIdx);
+
+ VNInfo *vni =
+ newLI->getNextValue(storeInstIdx, 0, lis->getVNInfoAllocator());
+ vni->kills.push_back(storeInstIdx);
+ LiveRange lr(start, end, vni);
+
+ newLI->addRange(lr);
+ }
+
+ added.push_back(newLI);
+ }
+
+
+ return added;
+ }
+
+};
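+
+// An illustrative walkthrough (not part of the imported code) of what
+// trivialSpillEverywhere does to one instruction that both reads and writes
+// the spilled vreg %v, assigned stack slot SS:
+//
+//   before:  %v = ADD %v, 1
+//
+//   after:   %tmp = LOAD SS        ; inserted by insertLoadFor
+//            %tmp = ADD %tmp, 1    ; operands rewritten to the fresh vreg
+//            STORE %tmp, SS        ; inserted by insertStoreFor
+//
+// Every use/def site gets its own short-lived vreg, so each interval added
+// to 'added' spans little more than a load, the instruction, and/or a store.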
+
+
+/// Spills any live range using the spill-everywhere method with no attempt at
+/// folding.
+class TrivialSpiller : public SpillerBase {
+public:
+ TrivialSpiller(MachineFunction *mf, LiveIntervals *lis, LiveStacks *ls, VirtRegMap *vrm) :
+ SpillerBase(mf, lis, ls, vrm) {}
+
+ std::vector<LiveInterval*> spill(LiveInterval *li) {
+ return trivialSpillEverywhere(li);
+ }
+
+};
+
+}
+
+llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis,
+ LiveStacks *ls, VirtRegMap *vrm) {
+ return new TrivialSpiller(mf, lis, ls, vrm);
+}
diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h
new file mode 100644
index 0000000..cad054d
--- /dev/null
+++ b/lib/CodeGen/Spiller.h
@@ -0,0 +1,37 @@
+//===-- llvm/CodeGen/Spiller.h - Spiller -*- C++ -*------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SPILLER_H
+#define LLVM_CODEGEN_SPILLER_H
+
+#include <vector>
+
+namespace llvm {
+ class LiveInterval;
+ class LiveIntervals;
+ class LiveStacks;
+ class MachineFunction;
+ class VirtRegMap;
+
+ /// Spiller interface.
+ ///
+ /// Implementations are utility classes which insert spill or remat code on
+ /// demand.
+ class Spiller {
+ public:
+ virtual ~Spiller() = 0;
+ virtual std::vector<LiveInterval*> spill(LiveInterval *li) = 0;
+ };
+
+ /// Create and return a spiller object, as specified on the command line.
+ Spiller* createSpiller(MachineFunction *mf, LiveIntervals *li,
+ LiveStacks *ls, VirtRegMap *vrm);
+}
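+
+// A hedged usage sketch (not part of this header): a register allocator that
+// has decided to spill an interval might drive the interface like this,
+// assuming it already owns the analyses handed to createSpiller and that
+// enqueueForAllocation is its own hypothetical hook:
+//
+//   llvm::Spiller *spiller = llvm::createSpiller(mf, lis, ls, vrm);
+//   std::vector<llvm::LiveInterval*> added = spiller->spill(li);
+//   for (unsigned i = 0, e = added.size(); i != e; ++i)
+//     enqueueForAllocation(added[i]);
+//   delete spiller;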
+
+#endif
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
new file mode 100644
index 0000000..c179f1e
--- /dev/null
+++ b/lib/CodeGen/StackProtector.cpp
@@ -0,0 +1,224 @@
+//===-- StackProtector.cpp - Stack Protector Insertion --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass inserts stack protectors into functions which need them. A variable
+// with a random value in it is stored onto the stack before the local variables
+// are allocated. Upon exiting the block, the stored value is checked. If it's
+// changed, then there was some sort of violation and the program aborts.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stack-protector"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Attributes.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+// SSPBufferSize - The lower bound for a buffer to be considered for stack
+// smashing protection.
+static cl::opt<unsigned>
+SSPBufferSize("stack-protector-buffer-size", cl::init(8),
+ cl::desc("Lower bound for a buffer to be considered for "
+ "stack protection"));
+
+namespace {
+ class VISIBILITY_HIDDEN StackProtector : public FunctionPass {
+ /// TLI - Keep a pointer of a TargetLowering to consult for determining
+ /// target type sizes.
+ const TargetLowering *TLI;
+
+ Function *F;
+ Module *M;
+
+ /// InsertStackProtectors - Insert code into the prologue and epilogue of
+ /// the function.
+ ///
+ /// - The prologue code loads and stores the stack guard onto the stack.
+ /// - The epilogue checks the value stored in the prologue against the
+ /// original value. It calls __stack_chk_fail if they differ.
+ bool InsertStackProtectors();
+
+ /// CreateFailBB - Create a basic block to jump to when the stack protector
+ /// check fails.
+ BasicBlock *CreateFailBB();
+
+ /// RequiresStackProtector - Check whether or not this function needs a
+ /// stack protector based upon the stack protector level.
+ bool RequiresStackProtector() const;
+ public:
+ static char ID; // Pass identification, replacement for typeid.
+ StackProtector() : FunctionPass(&ID), TLI(0) {}
+ StackProtector(const TargetLowering *tli)
+ : FunctionPass(&ID), TLI(tli) {}
+
+ virtual bool runOnFunction(Function &Fn);
+ };
+} // end anonymous namespace
+
+char StackProtector::ID = 0;
+static RegisterPass<StackProtector>
+X("stack-protector", "Insert stack protectors");
+
+FunctionPass *llvm::createStackProtectorPass(const TargetLowering *tli) {
+ return new StackProtector(tli);
+}
+
+bool StackProtector::runOnFunction(Function &Fn) {
+ F = &Fn;
+ M = F->getParent();
+
+ if (!RequiresStackProtector()) return false;
+
+ return InsertStackProtectors();
+}
+
+/// RequiresStackProtector - Check whether or not this function needs a stack
+/// protector based upon the stack protector level. The heuristic we use is to
+/// add a guard variable to functions that call alloca, and functions with
+/// buffers larger than SSPBufferSize bytes.
+bool StackProtector::RequiresStackProtector() const {
+ if (F->hasFnAttr(Attribute::StackProtectReq))
+ return true;
+
+ if (!F->hasFnAttr(Attribute::StackProtect))
+ return false;
+
+ const TargetData *TD = TLI->getTargetData();
+
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+ BasicBlock *BB = I;
+
+ for (BasicBlock::iterator
+ II = BB->begin(), IE = BB->end(); II != IE; ++II)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (AI->isArrayAllocation())
+ // This is a call to alloca with a variable size. Emit stack
+ // protectors.
+ return true;
+
+ if (const ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType()))
+          // If an array has at least SSPBufferSize bytes of allocated space,
+          // then we emit stack protectors.
+ if (SSPBufferSize <= TD->getTypeAllocSize(AT))
+ return true;
+ }
+ }
+
+ return false;
+}
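+
+// Illustration only (assuming the default -stack-protector-buffer-size=8):
+// in a function carrying the ssp attribute,
+//   char buf[16];   // 16 >= 8 bytes          -> protector emitted
+//   char buf[4];    //  4 <  8 bytes          -> no protector on its own
+//   alloca(n);      // variable-sized alloca  -> protector emitted
+// while sspreq forces a protector before any of these checks run.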
+
+/// InsertStackProtectors - Insert code into the prologue and epilogue of the
+/// function.
+///
+/// - The prologue code loads and stores the stack guard onto the stack.
+/// - The epilogue checks the value stored in the prologue against the original
+/// value. It calls __stack_chk_fail if they differ.
+bool StackProtector::InsertStackProtectors() {
+ BasicBlock *FailBB = 0; // The basic block to jump to if check fails.
+ AllocaInst *AI = 0; // Place on stack that stores the stack guard.
+ Constant *StackGuardVar = 0; // The stack guard variable.
+
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ) {
+ BasicBlock *BB = I++;
+
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
+ if (!RI) continue;
+
+ if (!FailBB) {
+ // Insert code into the entry block that stores the __stack_chk_guard
+ // variable onto the stack:
+ //
+ // entry:
+ // StackGuardSlot = alloca i8*
+ // StackGuard = load __stack_chk_guard
+ // call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
+ //
+ PointerType *PtrTy = PointerType::getUnqual(Type::Int8Ty);
+ StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
+
+ BasicBlock &Entry = F->getEntryBlock();
+ Instruction *InsPt = &Entry.front();
+
+ AI = new AllocaInst(PtrTy, "StackGuardSlot", InsPt);
+ LoadInst *LI = new LoadInst(StackGuardVar, "StackGuard", false, InsPt);
+
+ Value *Args[] = { LI, AI };
+ CallInst::
+ Create(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
+ &Args[0], array_endof(Args), "", InsPt);
+
+ // Create the basic block to jump to when the guard check fails.
+ FailBB = CreateFailBB();
+ }
+
+ // For each block with a return instruction, convert this:
+ //
+ // return:
+ // ...
+ // ret ...
+ //
+ // into this:
+ //
+ // return:
+ // ...
+ // %1 = load __stack_chk_guard
+ // %2 = load StackGuardSlot
+ // %3 = cmp i1 %1, %2
+ // br i1 %3, label %SP_return, label %CallStackCheckFailBlk
+ //
+ // SP_return:
+ // ret ...
+ //
+ // CallStackCheckFailBlk:
+ // call void @__stack_chk_fail()
+ // unreachable
+
+ // Split the basic block before the return instruction.
+ BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return");
+
+ // Remove default branch instruction to the new BB.
+ BB->getTerminator()->eraseFromParent();
+
+ // Move the newly created basic block to the point right after the old basic
+ // block so that it's in the "fall through" position.
+ NewBB->moveAfter(BB);
+
+ // Generate the stack protector instructions in the old basic block.
+ LoadInst *LI1 = new LoadInst(StackGuardVar, "", false, BB);
+ LoadInst *LI2 = new LoadInst(AI, "", true, BB);
+ ICmpInst *Cmp = new ICmpInst(CmpInst::ICMP_EQ, LI1, LI2, "", BB);
+ BranchInst::Create(NewBB, FailBB, Cmp, BB);
+ }
+
+ // Return if we didn't modify any basic blocks. I.e., there are no return
+ // statements in the function.
+ if (!FailBB) return false;
+
+ return true;
+}
+
+/// CreateFailBB - Create a basic block to jump to when the stack protector
+/// check fails.
+BasicBlock *StackProtector::CreateFailBB() {
+ BasicBlock *FailBB = BasicBlock::Create("CallStackCheckFailBlk", F);
+ Constant *StackChkFail =
+ M->getOrInsertFunction("__stack_chk_fail", Type::VoidTy, NULL);
+ CallInst::Create(StackChkFail, "", FailBB);
+ new UnreachableInst(FailBB);
+ return FailBB;
+}
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
new file mode 100644
index 0000000..5824644
--- /dev/null
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -0,0 +1,733 @@
+//===-- StackSlotColoring.cpp - Stack slot coloring pass. -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the stack slot coloring pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackcoloring"
+#include "VirtRegMap.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include <vector>
+using namespace llvm;
+
+static cl::opt<bool>
+DisableSharing("no-stack-slot-sharing",
+ cl::init(false), cl::Hidden,
+ cl::desc("Suppress slot sharing during stack coloring"));
+
+static cl::opt<bool>
+ColorWithRegsOpt("color-ss-with-regs",
+ cl::init(false), cl::Hidden,
+ cl::desc("Color stack slots with free registers"));
+
+
+static cl::opt<int> DCELimit("ssc-dce-limit", cl::init(-1), cl::Hidden);
+
+STATISTIC(NumEliminated, "Number of stack slots eliminated due to coloring");
+STATISTIC(NumRegRepl, "Number of stack slot refs replaced with reg refs");
+STATISTIC(NumLoadElim, "Number of loads eliminated");
+STATISTIC(NumStoreElim, "Number of stores eliminated");
+STATISTIC(NumDead, "Number of trivially dead stack accesses eliminated");
+
+namespace {
+ class VISIBILITY_HIDDEN StackSlotColoring : public MachineFunctionPass {
+ bool ColorWithRegs;
+ LiveStacks* LS;
+ VirtRegMap* VRM;
+ MachineFrameInfo *MFI;
+ MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const MachineLoopInfo *loopInfo;
+
+ // SSIntervals - Spill slot intervals.
+ std::vector<LiveInterval*> SSIntervals;
+
+ // SSRefs - Keep a list of frame index references for each spill slot.
+ SmallVector<SmallVector<MachineInstr*, 8>, 16> SSRefs;
+
+ // OrigAlignments - Alignments of stack objects before coloring.
+ SmallVector<unsigned, 16> OrigAlignments;
+
+    // OrigSizes - Sizes of stack objects before coloring.
+ SmallVector<unsigned, 16> OrigSizes;
+
+ // AllColors - If index is set, it's a spill slot, i.e. color.
+    // FIXME: This assumes PEI locates spill slots with smaller indices
+    // closest to the stack pointer / frame pointer. Therefore, smaller
+ // index == better color.
+ BitVector AllColors;
+
+ // NextColor - Next "color" that's not yet used.
+ int NextColor;
+
+ // UsedColors - "Colors" that have been assigned.
+ BitVector UsedColors;
+
+ // Assignments - Color to intervals mapping.
+ SmallVector<SmallVector<LiveInterval*,4>, 16> Assignments;
+
+ public:
+ static char ID; // Pass identification
+ StackSlotColoring() :
+ MachineFunctionPass(&ID), ColorWithRegs(false), NextColor(-1) {}
+ StackSlotColoring(bool RegColor) :
+ MachineFunctionPass(&ID), ColorWithRegs(RegColor), NextColor(-1) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LiveStacks>();
+ AU.addRequired<VirtRegMap>();
+ AU.addPreserved<VirtRegMap>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual const char* getPassName() const {
+ return "Stack Slot Coloring";
+ }
+
+ private:
+ void InitializeSlots();
+ void ScanForSpillSlotRefs(MachineFunction &MF);
+ bool OverlapWithAssignments(LiveInterval *li, int Color) const;
+ int ColorSlot(LiveInterval *li);
+ bool ColorSlots(MachineFunction &MF);
+ bool ColorSlotsWithFreeRegs(SmallVector<int, 16> &SlotMapping,
+ SmallVector<SmallVector<int, 4>, 16> &RevMap,
+ BitVector &SlotIsReg);
+ void RewriteInstruction(MachineInstr *MI, int OldFI, int NewFI,
+ MachineFunction &MF);
+ bool PropagateBackward(MachineBasicBlock::iterator MII,
+ MachineBasicBlock *MBB,
+ unsigned OldReg, unsigned NewReg);
+ bool PropagateForward(MachineBasicBlock::iterator MII,
+ MachineBasicBlock *MBB,
+ unsigned OldReg, unsigned NewReg);
+ void UnfoldAndRewriteInstruction(MachineInstr *MI, int OldFI,
+ unsigned Reg, const TargetRegisterClass *RC,
+ SmallSet<unsigned, 4> &Defs,
+ MachineFunction &MF);
+ bool AllMemRefsCanBeUnfolded(int SS);
+ bool RemoveDeadStores(MachineBasicBlock* MBB);
+ };
+} // end anonymous namespace
+
+char StackSlotColoring::ID = 0;
+
+static RegisterPass<StackSlotColoring>
+X("stack-slot-coloring", "Stack Slot Coloring");
+
+FunctionPass *llvm::createStackSlotColoringPass(bool RegColor) {
+ return new StackSlotColoring(RegColor);
+}
+
+namespace {
+  // IntervalSorter - Comparison predicate that sorts live intervals by
+  // their weight.
+ struct IntervalSorter {
+ bool operator()(LiveInterval* LHS, LiveInterval* RHS) const {
+ return LHS->weight > RHS->weight;
+ }
+ };
+}
+
+/// ScanForSpillSlotRefs - Scan all the machine instructions for spill slot
+/// references and update spill slot weights.
+void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
+ SSRefs.resize(MFI->getObjectIndexEnd());
+
+ // FIXME: Need the equivalent of MachineRegisterInfo for frameindex operands.
+ for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = &*MBBI;
+ unsigned loopDepth = loopInfo->getLoopDepth(MBB);
+ for (MachineBasicBlock::iterator MII = MBB->begin(), EE = MBB->end();
+ MII != EE; ++MII) {
+ MachineInstr *MI = &*MII;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isFI())
+ continue;
+ int FI = MO.getIndex();
+ if (FI < 0)
+ continue;
+ if (!LS->hasInterval(FI))
+ continue;
+ LiveInterval &li = LS->getInterval(FI);
+ li.weight += LiveIntervals::getSpillWeight(false, true, loopDepth);
+ SSRefs[FI].push_back(MI);
+ }
+ }
+ }
+}
+
+/// InitializeSlots - Process all spill stack slot live intervals and add them
+/// to a sorted (by weight) list.
+void StackSlotColoring::InitializeSlots() {
+ int LastFI = MFI->getObjectIndexEnd();
+ OrigAlignments.resize(LastFI);
+ OrigSizes.resize(LastFI);
+ AllColors.resize(LastFI);
+ UsedColors.resize(LastFI);
+ Assignments.resize(LastFI);
+
+ // Gather all spill slots into a list.
+ DOUT << "Spill slot intervals:\n";
+ for (LiveStacks::iterator i = LS->begin(), e = LS->end(); i != e; ++i) {
+ LiveInterval &li = i->second;
+ DEBUG(li.dump());
+ int FI = li.getStackSlotIndex();
+ if (MFI->isDeadObjectIndex(FI))
+ continue;
+ SSIntervals.push_back(&li);
+ OrigAlignments[FI] = MFI->getObjectAlignment(FI);
+ OrigSizes[FI] = MFI->getObjectSize(FI);
+ AllColors.set(FI);
+ }
+ DOUT << '\n';
+
+ // Sort them by weight.
+ std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+
+ // Get first "color".
+ NextColor = AllColors.find_first();
+}
+
+/// OverlapWithAssignments - Return true if LiveInterval overlaps with any
+/// LiveIntervals that have already been assigned to the specified color.
+bool
+StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const {
+ const SmallVector<LiveInterval*,4> &OtherLIs = Assignments[Color];
+ for (unsigned i = 0, e = OtherLIs.size(); i != e; ++i) {
+ LiveInterval *OtherLI = OtherLIs[i];
+ if (OtherLI->overlaps(*li))
+ return true;
+ }
+ return false;
+}
+
+/// ColorSlotsWithFreeRegs - If there are any free registers available, try
+/// replacing spill slot references with registers instead.
+bool
+StackSlotColoring::ColorSlotsWithFreeRegs(SmallVector<int, 16> &SlotMapping,
+ SmallVector<SmallVector<int, 4>, 16> &RevMap,
+ BitVector &SlotIsReg) {
+ if (!(ColorWithRegs || ColorWithRegsOpt) || !VRM->HasUnusedRegisters())
+ return false;
+
+ bool Changed = false;
+ DOUT << "Assigning unused registers to spill slots:\n";
+ for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+ LiveInterval *li = SSIntervals[i];
+ int SS = li->getStackSlotIndex();
+ if (!UsedColors[SS] || li->weight < 20)
+ // If the weight is < 20, i.e. two references in a loop with depth 1,
+ // don't bother with it.
+ continue;
+
+    // These slots may share the same registers.
+ bool AllColored = true;
+ SmallVector<unsigned, 4> ColoredRegs;
+ for (unsigned j = 0, ee = RevMap[SS].size(); j != ee; ++j) {
+ int RSS = RevMap[SS][j];
+ const TargetRegisterClass *RC = LS->getIntervalRegClass(RSS);
+ // If it's not colored to another stack slot, try coloring it
+ // to a "free" register.
+ if (!RC) {
+ AllColored = false;
+ continue;
+ }
+ unsigned Reg = VRM->getFirstUnusedRegister(RC);
+ if (!Reg) {
+ AllColored = false;
+ continue;
+ }
+ if (!AllMemRefsCanBeUnfolded(RSS)) {
+ AllColored = false;
+ continue;
+ } else {
+ DOUT << "Assigning fi#" << RSS << " to " << TRI->getName(Reg) << '\n';
+ ColoredRegs.push_back(Reg);
+ SlotMapping[RSS] = Reg;
+ SlotIsReg.set(RSS);
+ Changed = true;
+ }
+ }
+
+ // Register and its sub-registers are no longer free.
+ while (!ColoredRegs.empty()) {
+ unsigned Reg = ColoredRegs.back();
+ ColoredRegs.pop_back();
+ VRM->setRegisterUsed(Reg);
+ // If reg is a callee-saved register, it will have to be spilled in
+ // the prologue.
+ MRI->setPhysRegUsed(Reg);
+ for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
+ VRM->setRegisterUsed(*AS);
+ MRI->setPhysRegUsed(*AS);
+ }
+ }
+ // This spill slot is dead after the rewrites
+ if (AllColored) {
+ MFI->RemoveStackObject(SS);
+ ++NumEliminated;
+ }
+ }
+ DOUT << '\n';
+
+ return Changed;
+}
+
+/// ColorSlot - Assign a "color" (stack slot) to the specified stack slot.
+///
+int StackSlotColoring::ColorSlot(LiveInterval *li) {
+ int Color = -1;
+ bool Share = false;
+ if (!DisableSharing) {
+ // Check if it's possible to reuse any of the used colors.
+ Color = UsedColors.find_first();
+ while (Color != -1) {
+ if (!OverlapWithAssignments(li, Color)) {
+ Share = true;
+ ++NumEliminated;
+ break;
+ }
+ Color = UsedColors.find_next(Color);
+ }
+ }
+
+ // Assign it to the first available color (assumed to be the best) if it's
+ // not possible to share a used color with other objects.
+ if (!Share) {
+ assert(NextColor != -1 && "No more spill slots?");
+ Color = NextColor;
+ UsedColors.set(Color);
+ NextColor = AllColors.find_next(NextColor);
+ }
+
+ // Record the assignment.
+ Assignments[Color].push_back(li);
+ int FI = li->getStackSlotIndex();
+ DOUT << "Assigning fi#" << FI << " to fi#" << Color << "\n";
+
+ // Change size and alignment of the allocated slot. If there are multiple
+ // objects sharing the same slot, then make sure the size and alignment
+ // are large enough for all.
+ unsigned Align = OrigAlignments[FI];
+ if (!Share || Align > MFI->getObjectAlignment(Color))
+ MFI->setObjectAlignment(Color, Align);
+ int64_t Size = OrigSizes[FI];
+ if (!Share || Size > MFI->getObjectSize(Color))
+ MFI->setObjectSize(Color, Size);
+ return Color;
+}
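+
+// A distilled sketch of the first-fit policy above (illustrative only, using
+// toy [start, end) intervals instead of LiveIntervals): reuse the first used
+// color whose occupants the new interval does not overlap, else signal that
+// a fresh color is needed.
+static int firstFitColorSketch(
+    const std::vector<std::vector<std::pair<int, int> > > &byColor,
+    std::pair<int, int> li) {
+  for (unsigned c = 0, e = byColor.size(); c != e; ++c) {
+    bool overlaps = false;
+    for (unsigned i = 0, ee = byColor[c].size(); i != ee; ++i)
+      if (li.first < byColor[c][i].second && byColor[c][i].first < li.second)
+        overlaps = true;  // shares a live point with an occupant
+    if (!overlaps)
+      return (int)c;      // share this color
+  }
+  return -1;              // caller opens a fresh color (NextColor)
+}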
+
+/// ColorSlots - Color all spill stack slots and rewrite all frameindex machine
+/// operands in the function.
+bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
+ unsigned NumObjs = MFI->getObjectIndexEnd();
+ SmallVector<int, 16> SlotMapping(NumObjs, -1);
+ SmallVector<float, 16> SlotWeights(NumObjs, 0.0);
+ SmallVector<SmallVector<int, 4>, 16> RevMap(NumObjs);
+ BitVector SlotIsReg(NumObjs);
+ BitVector UsedColors(NumObjs);
+
+ DOUT << "Color spill slot intervals:\n";
+ bool Changed = false;
+ for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+ LiveInterval *li = SSIntervals[i];
+ int SS = li->getStackSlotIndex();
+ int NewSS = ColorSlot(li);
+ assert(NewSS >= 0 && "Stack coloring failed?");
+ SlotMapping[SS] = NewSS;
+ RevMap[NewSS].push_back(SS);
+ SlotWeights[NewSS] += li->weight;
+ UsedColors.set(NewSS);
+ Changed |= (SS != NewSS);
+ }
+
+ DOUT << "\nSpill slots after coloring:\n";
+ for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+ LiveInterval *li = SSIntervals[i];
+ int SS = li->getStackSlotIndex();
+ li->weight = SlotWeights[SS];
+ }
+ // Sort them by new weight.
+ std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i)
+ DEBUG(SSIntervals[i]->dump());
+ DOUT << '\n';
+#endif
+
+  // Can we "color" a stack slot with an unused register?
+ Changed |= ColorSlotsWithFreeRegs(SlotMapping, RevMap, SlotIsReg);
+
+ if (!Changed)
+ return false;
+
+ // Rewrite all MO_FrameIndex operands.
+ SmallVector<SmallSet<unsigned, 4>, 4> NewDefs(MF.getNumBlockIDs());
+ for (unsigned SS = 0, SE = SSRefs.size(); SS != SE; ++SS) {
+ bool isReg = SlotIsReg[SS];
+ int NewFI = SlotMapping[SS];
+ if (NewFI == -1 || (NewFI == (int)SS && !isReg))
+ continue;
+
+ const TargetRegisterClass *RC = LS->getIntervalRegClass(SS);
+ SmallVector<MachineInstr*, 8> &RefMIs = SSRefs[SS];
+ for (unsigned i = 0, e = RefMIs.size(); i != e; ++i)
+ if (!isReg)
+ RewriteInstruction(RefMIs[i], SS, NewFI, MF);
+ else {
+ // Rewrite to use a register instead.
+ unsigned MBBId = RefMIs[i]->getParent()->getNumber();
+ SmallSet<unsigned, 4> &Defs = NewDefs[MBBId];
+ UnfoldAndRewriteInstruction(RefMIs[i], SS, NewFI, RC, Defs, MF);
+ }
+ }
+
+ // Delete unused stack slots.
+ while (NextColor != -1) {
+ DOUT << "Removing unused stack object fi#" << NextColor << "\n";
+ MFI->RemoveStackObject(NextColor);
+ NextColor = AllColors.find_next(NextColor);
+ }
+
+ return true;
+}
+
+/// AllMemRefsCanBeUnfolded - Return true if all references of the specified
+/// spill slot index can be unfolded.
+bool StackSlotColoring::AllMemRefsCanBeUnfolded(int SS) {
+ SmallVector<MachineInstr*, 8> &RefMIs = SSRefs[SS];
+ for (unsigned i = 0, e = RefMIs.size(); i != e; ++i) {
+ MachineInstr *MI = RefMIs[i];
+ if (TII->isLoadFromStackSlot(MI, SS) ||
+ TII->isStoreToStackSlot(MI, SS))
+ // Restore and spill will become copies.
+ return true;
+ if (!TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(), false, false))
+ return false;
+ for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
+ MachineOperand &MO = MI->getOperand(j);
+ if (MO.isFI() && MO.getIndex() != SS)
+        // If it uses another frameindex, we currently can't unfold it.
+ return false;
+ }
+ }
+ return true;
+}
+
+/// RewriteInstruction - Rewrite specified instruction by replacing references
+/// to old frame index with new one.
+void StackSlotColoring::RewriteInstruction(MachineInstr *MI, int OldFI,
+ int NewFI, MachineFunction &MF) {
+ for (unsigned i = 0, ee = MI->getNumOperands(); i != ee; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isFI())
+ continue;
+ int FI = MO.getIndex();
+ if (FI != OldFI)
+ continue;
+ MO.setIndex(NewFI);
+ }
+
+ // Update the MachineMemOperand for the new memory location.
+ // FIXME: We need a better method of managing these too.
+ SmallVector<MachineMemOperand, 2> MMOs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ MI->clearMemOperands(MF);
+ const Value *OldSV = PseudoSourceValue::getFixedStack(OldFI);
+ for (unsigned i = 0, ee = MMOs.size(); i != ee; ++i) {
+ if (MMOs[i].getValue() != OldSV)
+ MI->addMemOperand(MF, MMOs[i]);
+ else {
+ MachineMemOperand MMO(PseudoSourceValue::getFixedStack(NewFI),
+ MMOs[i].getFlags(), MMOs[i].getOffset(),
+ MMOs[i].getSize(), MMOs[i].getAlignment());
+ MI->addMemOperand(MF, MMO);
+ }
+ }
+}
+
+/// PropagateBackward - Traverse backward and look for the definition of
+/// OldReg. If it can successfully update all of the references with NewReg,
+/// do so and return true.
+bool StackSlotColoring::PropagateBackward(MachineBasicBlock::iterator MII,
+ MachineBasicBlock *MBB,
+ unsigned OldReg, unsigned NewReg) {
+ if (MII == MBB->begin())
+ return false;
+
+ SmallVector<MachineOperand*, 4> Uses;
+ SmallVector<MachineOperand*, 4> Refs;
+ while (--MII != MBB->begin()) {
+ bool FoundDef = false; // Not counting 2address def.
+
+ Uses.clear();
+ const TargetInstrDesc &TID = MII->getDesc();
+ for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MII->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+ if (Reg == OldReg) {
+ if (MO.isImplicit())
+ return false;
+ const TargetRegisterClass *RC = getInstrOperandRegClass(TRI, TID, i);
+ if (RC && !RC->contains(NewReg))
+ return false;
+
+ if (MO.isUse()) {
+ Uses.push_back(&MO);
+ } else {
+ Refs.push_back(&MO);
+ if (!MII->isRegTiedToUseOperand(i))
+ FoundDef = true;
+ }
+ } else if (TRI->regsOverlap(Reg, NewReg)) {
+ return false;
+ } else if (TRI->regsOverlap(Reg, OldReg)) {
+ if (!MO.isUse() || !MO.isKill())
+ return false;
+ }
+ }
+
+ if (FoundDef) {
+ // Found non-two-address def. Stop here.
+ for (unsigned i = 0, e = Refs.size(); i != e; ++i)
+ Refs[i]->setReg(NewReg);
+ return true;
+ }
+
+ // Two-address uses must be updated as well.
+ for (unsigned i = 0, e = Uses.size(); i != e; ++i)
+ Refs.push_back(Uses[i]);
+ }
+ return false;
+}
+
+/// PropagateForward - Traverse forward and look for the kill of OldReg. If
+/// it can successfully update all of the uses with NewReg, do so and
+/// return true.
+bool StackSlotColoring::PropagateForward(MachineBasicBlock::iterator MII,
+ MachineBasicBlock *MBB,
+ unsigned OldReg, unsigned NewReg) {
+ if (MII == MBB->end())
+ return false;
+
+ SmallVector<MachineOperand*, 4> Uses;
+ while (++MII != MBB->end()) {
+ bool FoundUse = false;
+ bool FoundKill = false;
+ const TargetInstrDesc &TID = MII->getDesc();
+ for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MII->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+ if (Reg == OldReg) {
+ if (MO.isDef() || MO.isImplicit())
+ return false;
+
+ const TargetRegisterClass *RC = getInstrOperandRegClass(TRI, TID, i);
+ if (RC && !RC->contains(NewReg))
+ return false;
+ FoundUse = true;
+ if (MO.isKill())
+ FoundKill = true;
+ Uses.push_back(&MO);
+ } else if (TRI->regsOverlap(Reg, NewReg) ||
+ TRI->regsOverlap(Reg, OldReg))
+ return false;
+ }
+ if (FoundKill) {
+ for (unsigned i = 0, e = Uses.size(); i != e; ++i)
+ Uses[i]->setReg(NewReg);
+ return true;
+ }
+ }
+ return false;
+}
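+
+// An illustrative case (not from the import) of the forward walk: given
+//   %r1 = LOAD <fi#2>        ; slot 2 is being colored into physreg %R
+//   ...
+//   USE %r1<kill>
+// if every use of %r1 up to and including its kill can legally take %R, the
+// uses are rewritten to %R and the reload becomes dead, letting
+// UnfoldAndRewriteInstruction below erase it (counted by NumLoadElim).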
+
+/// UnfoldAndRewriteInstruction - Rewrite specified instruction by unfolding
+/// folded memory references and replacing those references with register
+/// references instead.
+void
+StackSlotColoring::UnfoldAndRewriteInstruction(MachineInstr *MI, int OldFI,
+ unsigned Reg,
+ const TargetRegisterClass *RC,
+ SmallSet<unsigned, 4> &Defs,
+ MachineFunction &MF) {
+ MachineBasicBlock *MBB = MI->getParent();
+ if (unsigned DstReg = TII->isLoadFromStackSlot(MI, OldFI)) {
+ if (PropagateForward(MI, MBB, DstReg, Reg)) {
+ DOUT << "Eliminated load: ";
+ DEBUG(MI->dump());
+ ++NumLoadElim;
+ } else {
+ TII->copyRegToReg(*MBB, MI, DstReg, Reg, RC, RC);
+ ++NumRegRepl;
+ }
+
+ if (!Defs.count(Reg)) {
+ // If this is the first use of Reg in this MBB and it wasn't previously
+ // defined in MBB, add it to livein.
+ MBB->addLiveIn(Reg);
+ Defs.insert(Reg);
+ }
+ } else if (unsigned SrcReg = TII->isStoreToStackSlot(MI, OldFI)) {
+ if (MI->killsRegister(SrcReg) && PropagateBackward(MI, MBB, SrcReg, Reg)) {
+ DOUT << "Eliminated store: ";
+ DEBUG(MI->dump());
+ ++NumStoreElim;
+ } else {
+ TII->copyRegToReg(*MBB, MI, Reg, SrcReg, RC, RC);
+ ++NumRegRepl;
+ }
+
+ // Remember reg has been defined in MBB.
+ Defs.insert(Reg);
+ } else {
+ SmallVector<MachineInstr*, 4> NewMIs;
+ bool Success = TII->unfoldMemoryOperand(MF, MI, Reg, false, false, NewMIs);
+ Success = Success; // Silence compiler warning.
+ assert(Success && "Failed to unfold!");
+ MachineInstr *NewMI = NewMIs[0];
+ MBB->insert(MI, NewMI);
+ ++NumRegRepl;
+
+ if (NewMI->readsRegister(Reg)) {
+ if (!Defs.count(Reg))
+ // If this is the first use of Reg in this MBB and it wasn't previously
+ // defined in MBB, add it to livein.
+ MBB->addLiveIn(Reg);
+ Defs.insert(Reg);
+ }
+ }
+ MBB->erase(MI);
+}
+
+/// RemoveDeadStores - Scan through a basic block and look for loads followed
+/// by stores. If they're both using the same stack slot, then the store is
+/// definitely dead. This could obviously be much more aggressive (consider
+/// pairs with instructions between them), but such extensions might have a
+/// considerable compile time impact.
+bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
+ // FIXME: This could be much more aggressive, but we need to investigate
+ // the compile time impact of doing so.
+ bool changed = false;
+
+ SmallVector<MachineInstr*, 4> toErase;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ if (DCELimit != -1 && (int)NumDead >= DCELimit)
+ break;
+
+ MachineBasicBlock::iterator NextMI = next(I);
+ if (NextMI == MBB->end()) continue;
+
+ int FirstSS, SecondSS;
+ unsigned LoadReg = 0;
+ unsigned StoreReg = 0;
+ if (!(LoadReg = TII->isLoadFromStackSlot(I, FirstSS))) continue;
+ if (!(StoreReg = TII->isStoreToStackSlot(NextMI, SecondSS))) continue;
+ if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
+
+ ++NumDead;
+ changed = true;
+
+ if (NextMI->findRegisterUseOperandIdx(LoadReg, true, 0) != -1) {
+ ++NumDead;
+ toErase.push_back(I);
+ }
+
+ toErase.push_back(NextMI);
+ ++I;
+ }
+
+ for (SmallVector<MachineInstr*, 4>::iterator I = toErase.begin(),
+ E = toErase.end(); I != E; ++I)
+ (*I)->eraseFromParent();
+
+ return changed;
+}
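+
+// e.g. (illustration, not part of the import): the adjacent pair
+//   %r = MOV32rm <fi#3>   ; reload from slot 3
+//   MOV32mr <fi#3>, %r    ; immediately stores the same value back
+// makes the store trivially dead; and if that store was also the reload's
+// only (killing) use, the reload is erased along with it.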
+
+
+bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
+ DOUT << "********** Stack Slot Coloring **********\n";
+
+ MFI = MF.getFrameInfo();
+ MRI = &MF.getRegInfo();
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ LS = &getAnalysis<LiveStacks>();
+ VRM = &getAnalysis<VirtRegMap>();
+ loopInfo = &getAnalysis<MachineLoopInfo>();
+
+ bool Changed = false;
+
+ unsigned NumSlots = LS->getNumIntervals();
+ if (NumSlots < 2) {
+ if (NumSlots == 0 || !VRM->HasUnusedRegisters())
+ // Nothing to do!
+ return false;
+ }
+
+ // Gather spill slot references
+ ScanForSpillSlotRefs(MF);
+ InitializeSlots();
+ Changed = ColorSlots(MF);
+
+ NextColor = -1;
+ SSIntervals.clear();
+ for (unsigned i = 0, e = SSRefs.size(); i != e; ++i)
+ SSRefs[i].clear();
+ SSRefs.clear();
+ OrigAlignments.clear();
+ OrigSizes.clear();
+ AllColors.clear();
+ UsedColors.clear();
+ for (unsigned i = 0, e = Assignments.size(); i != e; ++i)
+ Assignments[i].clear();
+ Assignments.clear();
+
+ if (Changed) {
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+ Changed |= RemoveDeadStores(I);
+ }
+
+ return Changed;
+}
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
new file mode 100644
index 0000000..a2c1255
--- /dev/null
+++ b/lib/CodeGen/StrongPHIElimination.cpp
@@ -0,0 +1,1053 @@
+//===- StrongPHIElimination.cpp - Eliminate PHI nodes by inserting copies -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates machine instruction PHI nodes by inserting copy
+// instructions, using an intelligent copy-folding technique based on
+// dominator information. This technique is derived from:
+//
+// Budimlic, et al. Fast copy coalescing and live-range identification.
+// In Proceedings of the ACM SIGPLAN 2002 Conference on Programming Language
+// Design and Implementation (Berlin, Germany, June 17 - 19, 2002).
+// PLDI '02. ACM, New York, NY, 25-32.
+// DOI= http://doi.acm.org/10.1145/512529.512534
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "strongphielim"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterCoalescer.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+namespace {
+ struct VISIBILITY_HIDDEN StrongPHIElimination : public MachineFunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ StrongPHIElimination() : MachineFunctionPass(&ID) {}
+
+ // Waiting stores, for each MBB, the set of copies that need to
+ // be inserted into that MBB
+ DenseMap<MachineBasicBlock*,
+ std::multimap<unsigned, unsigned> > Waiting;
+
+ // Stacks holds the renaming stack for each register
+ std::map<unsigned, std::vector<unsigned> > Stacks;
+
+ // Registers in UsedByAnother are PHI nodes that are themselves
+    // used as operands to another PHI node
+ std::set<unsigned> UsedByAnother;
+
+    // RenameSets is a map from a PHI-defined register
+ // to the input registers to be coalesced along with the
+ // predecessor block for those input registers.
+ std::map<unsigned, std::map<unsigned, MachineBasicBlock*> > RenameSets;
+
+ // PhiValueNumber holds the ID numbers of the VNs for each phi that we're
+ // eliminating, indexed by the register defined by that phi.
+ std::map<unsigned, unsigned> PhiValueNumber;
+
+ // Store the DFS-in number of each block
+ DenseMap<MachineBasicBlock*, unsigned> preorder;
+
+ // Store the DFS-out number of each block
+ DenseMap<MachineBasicBlock*, unsigned> maxpreorder;
+
+ bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+
+ // TODO: Actually make this true.
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<RegisterCoalescer>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual void releaseMemory() {
+ preorder.clear();
+ maxpreorder.clear();
+
+ Waiting.clear();
+ Stacks.clear();
+ UsedByAnother.clear();
+ RenameSets.clear();
+ }
+
+ private:
+
+ /// DomForestNode - Represents a node in the "dominator forest". This is
+ /// a forest in which the nodes represent registers and the edges
+ /// represent a dominance relation in the block defining those registers.
+ struct DomForestNode {
+ private:
+ // Store references to our children
+ std::vector<DomForestNode*> children;
+ // The register we represent
+ unsigned reg;
+
+ // Add another node as our child
+ void addChild(DomForestNode* DFN) { children.push_back(DFN); }
+
+ public:
+ typedef std::vector<DomForestNode*>::iterator iterator;
+
+ // Create a DomForestNode by providing the register it represents, and
+ // the node to be its parent. The virtual root node has register 0
+ // and a null parent.
+ DomForestNode(unsigned r, DomForestNode* parent) : reg(r) {
+ if (parent)
+ parent->addChild(this);
+ }
+
+ ~DomForestNode() {
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ delete *I;
+ }
+
+      /// getReg - Return the register that this node represents
+ inline unsigned getReg() { return reg; }
+
+ // Provide iterator access to our children
+ inline DomForestNode::iterator begin() { return children.begin(); }
+ inline DomForestNode::iterator end() { return children.end(); }
+ };
+
+ void computeDFS(MachineFunction& MF);
+ void processBlock(MachineBasicBlock* MBB);
+
+ std::vector<DomForestNode*> computeDomForest(
+ std::map<unsigned, MachineBasicBlock*>& instrs,
+ MachineRegisterInfo& MRI);
+ void processPHIUnion(MachineInstr* Inst,
+ std::map<unsigned, MachineBasicBlock*>& PHIUnion,
+ std::vector<StrongPHIElimination::DomForestNode*>& DF,
+ std::vector<std::pair<unsigned, unsigned> >& locals);
+ void ScheduleCopies(MachineBasicBlock* MBB, std::set<unsigned>& pushed);
+ void InsertCopies(MachineDomTreeNode* MBB,
+ SmallPtrSet<MachineBasicBlock*, 16>& v);
+ bool mergeLiveIntervals(unsigned primary, unsigned secondary);
+ };
+}
+
+char StrongPHIElimination::ID = 0;
+static RegisterPass<StrongPHIElimination>
+X("strong-phi-node-elimination",
+ "Eliminate PHI nodes for register allocation, intelligently");
+
+const PassInfo *const llvm::StrongPHIEliminationID = &X;
+
+/// computeDFS - Computes the DFS-in and DFS-out numbers of the dominator tree
+/// of the given MachineFunction. These numbers are then used in other parts
+/// of the PHI elimination process.
+void StrongPHIElimination::computeDFS(MachineFunction& MF) {
+ SmallPtrSet<MachineDomTreeNode*, 8> frontier;
+ SmallPtrSet<MachineDomTreeNode*, 8> visited;
+
+ unsigned time = 0;
+
+ MachineDominatorTree& DT = getAnalysis<MachineDominatorTree>();
+
+ MachineDomTreeNode* node = DT.getRootNode();
+
+ std::vector<MachineDomTreeNode*> worklist;
+ worklist.push_back(node);
+
+ while (!worklist.empty()) {
+ MachineDomTreeNode* currNode = worklist.back();
+
+ if (!frontier.count(currNode)) {
+ frontier.insert(currNode);
+ ++time;
+ preorder.insert(std::make_pair(currNode->getBlock(), time));
+ }
+
+ bool inserted = false;
+ for (MachineDomTreeNode::iterator I = currNode->begin(), E = currNode->end();
+ I != E; ++I)
+ if (!frontier.count(*I) && !visited.count(*I)) {
+ worklist.push_back(*I);
+ inserted = true;
+ break;
+ }
+
+ if (!inserted) {
+ frontier.erase(currNode);
+ visited.insert(currNode);
+ maxpreorder.insert(std::make_pair(currNode->getBlock(), time));
+
+ worklist.pop_back();
+ }
+ }
+}
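+
+// A hedged note (added, not upstream): interval numbering of this kind
+// supports constant-time dominance queries, which is how the numbers are
+// consumed later. A sketch of the test, assuming both maps were filled in
+// by computeDFS above:
+static bool dominatesSketch(MachineBasicBlock *A, MachineBasicBlock *B,
+                            DenseMap<MachineBasicBlock*, unsigned> &pre,
+                            DenseMap<MachineBasicBlock*, unsigned> &maxpre) {
+  // A dominates B iff B's DFS window nests inside A's.
+  return pre[A] <= pre[B] && maxpre[B] <= maxpre[A];
+}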
+
+namespace {
+
+/// PreorderSorter - a helper class that is used to sort registers
+/// according to the preorder number of their defining blocks
+class PreorderSorter {
+private:
+ DenseMap<MachineBasicBlock*, unsigned>& preorder;
+ MachineRegisterInfo& MRI;
+
+public:
+ PreorderSorter(DenseMap<MachineBasicBlock*, unsigned>& p,
+ MachineRegisterInfo& M) : preorder(p), MRI(M) { }
+
+ bool operator()(unsigned A, unsigned B) {
+ if (A == B)
+ return false;
+
+ MachineBasicBlock* ABlock = MRI.getVRegDef(A)->getParent();
+ MachineBasicBlock* BBlock = MRI.getVRegDef(B)->getParent();
+
+ if (preorder[ABlock] < preorder[BBlock])
+ return true;
+ else if (preorder[ABlock] > preorder[BBlock])
+ return false;
+
+ return false;
+ }
+};
+
+}
+
+/// computeDomForest - compute the subforest of the DomTree corresponding
+/// to the defining blocks of the registers in question
+std::vector<StrongPHIElimination::DomForestNode*>
+StrongPHIElimination::computeDomForest(
+ std::map<unsigned, MachineBasicBlock*>& regs,
+ MachineRegisterInfo& MRI) {
+ // Begin by creating a virtual root node, since the actual results
+ // may well be a forest. Assume this node has maximum DFS-out number.
+ DomForestNode* VirtualRoot = new DomForestNode(0, 0);
+ maxpreorder.insert(std::make_pair((MachineBasicBlock*)0, ~0UL));
+
+ // Populate a worklist with the registers
+ std::vector<unsigned> worklist;
+ worklist.reserve(regs.size());
+ for (std::map<unsigned, MachineBasicBlock*>::iterator I = regs.begin(),
+ E = regs.end(); I != E; ++I)
+ worklist.push_back(I->first);
+
+ // Sort the registers by the DFS-in number of their defining block
+ PreorderSorter PS(preorder, MRI);
+ std::sort(worklist.begin(), worklist.end(), PS);
+
+ // Create a "current parent" stack, and put the virtual root on top of it
+ DomForestNode* CurrentParent = VirtualRoot;
+ std::vector<DomForestNode*> stack;
+ stack.push_back(VirtualRoot);
+
+ // Iterate over all the registers in the previously computed order
+ for (std::vector<unsigned>::iterator I = worklist.begin(), E = worklist.end();
+ I != E; ++I) {
+ unsigned pre = preorder[MRI.getVRegDef(*I)->getParent()];
+ MachineBasicBlock* parentBlock = CurrentParent->getReg() ?
+ MRI.getVRegDef(CurrentParent->getReg())->getParent() :
+ 0;
+
+ // If the DFS-in number of the register is greater than the DFS-out number
+ // of the current parent, repeatedly pop the parent stack until it isn't.
+ while (pre > maxpreorder[parentBlock]) {
+ stack.pop_back();
+ CurrentParent = stack.back();
+
+ parentBlock = CurrentParent->getReg() ?
+ MRI.getVRegDef(CurrentParent->getReg())->getParent() :
+ 0;
+ }
+
+ // Now that we've found the appropriate parent, create a DomForestNode for
+ // this register and attach it to the forest
+ DomForestNode* child = new DomForestNode(*I, CurrentParent);
+
+ // Push this new node on the "current parent" stack
+ stack.push_back(child);
+ CurrentParent = child;
+ }
+
+ // Return a vector containing the children of the virtual root node
+ std::vector<DomForestNode*> ret;
+ ret.insert(ret.end(), VirtualRoot->begin(), VirtualRoot->end());
+ return ret;
+}
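+
+// Worked illustration (not in the original): for registers r1, r2, r3 whose
+// defining blocks have (preorder, maxpreorder) = (1,6), (2,3) and (4,5), the
+// sort yields r1, r2, r3. r2 attaches under r1 (2 <= 3 <= 6); then r3's
+// preorder 4 exceeds r2's maxpreorder 3, so r2 is popped and r3 also
+// attaches under r1 -- a forest with r1 over the siblings {r2, r3}.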
+
+/// isLiveIn - helper method that determines, from a regno, if a register
+/// is live into a block
+static bool isLiveIn(unsigned r, MachineBasicBlock* MBB,
+ LiveIntervals& LI) {
+ LiveInterval& I = LI.getOrCreateInterval(r);
+ unsigned idx = LI.getMBBStartIdx(MBB);
+ return I.liveAt(idx);
+}
+
+/// isLiveOut - helper method that determines, from a regno, if a register is
+/// live out of a block.
+static bool isLiveOut(unsigned r, MachineBasicBlock* MBB,
+ LiveIntervals& LI) {
+ for (MachineBasicBlock::succ_iterator PI = MBB->succ_begin(),
+ E = MBB->succ_end(); PI != E; ++PI)
+ if (isLiveIn(r, *PI, LI))
+ return true;
+
+ return false;
+}
+
+/// interferes - checks for local interferences by scanning a block. The only
+/// tricky parameter is 'mode', which tells it the relationship of the two
+/// registers. 0 - defined in the same block, 1 - first properly dominates
+/// second, 2 - second properly dominates first
+static bool interferes(unsigned a, unsigned b, MachineBasicBlock* scan,
+ LiveIntervals& LV, unsigned mode) {
+ MachineInstr* def = 0;
+ MachineInstr* kill = 0;
+
+ // The code is still in SSA form at this point, so there is only one
+ // definition per VReg. Thus we can safely use MRI->getVRegDef().
+ const MachineRegisterInfo* MRI = &scan->getParent()->getRegInfo();
+
+ bool interference = false;
+
+  // Walk the block, checking for interferences
+ for (MachineBasicBlock::iterator MBI = scan->begin(), MBE = scan->end();
+ MBI != MBE; ++MBI) {
+ MachineInstr* curr = MBI;
+
+ // Same defining block...
+ if (mode == 0) {
+ if (curr == MRI->getVRegDef(a)) {
+ // If we find our first definition, save it
+ if (!def) {
+ def = curr;
+ // If there's already an unkilled definition, then
+ // this is an interference
+ } else if (!kill) {
+ interference = true;
+ break;
+ // If there's a definition followed by a KillInst, then
+ // they can't interfere
+ } else {
+ interference = false;
+ break;
+ }
+ // Symmetric with the above
+ } else if (curr == MRI->getVRegDef(b)) {
+ if (!def) {
+ def = curr;
+ } else if (!kill) {
+ interference = true;
+ break;
+ } else {
+ interference = false;
+ break;
+ }
+ // Store KillInsts if they match up with the definition
+      } else if (curr->killsRegister(a)) {
+        if (def == MRI->getVRegDef(a)) {
+          kill = curr;
+        }
+      } else if (curr->killsRegister(b)) {
+        if (def == MRI->getVRegDef(b)) {
+          kill = curr;
+        }
+      }
+ // First properly dominates second...
+ } else if (mode == 1) {
+ if (curr == MRI->getVRegDef(b)) {
+ // Definition of second without kill of first is an interference
+ if (!kill) {
+ interference = true;
+ break;
+ // Definition after a kill is a non-interference
+ } else {
+ interference = false;
+ break;
+ }
+ // Save KillInsts of First
+ } else if (curr->killsRegister(a)) {
+ kill = curr;
+ }
+ // Symmetric with the above
+ } else if (mode == 2) {
+ if (curr == MRI->getVRegDef(a)) {
+ if (!kill) {
+ interference = true;
+ break;
+ } else {
+ interference = false;
+ break;
+ }
+ } else if (curr->killsRegister(b)) {
+ kill = curr;
+ }
+ }
+ }
+
+ return interference;
+}
+
+/// processBlock - Determine how to break up PHIs in the current block. Each
+/// PHI is broken up by some combination of renaming its operands and inserting
+/// copies. This method is responsible for determining which operands receive
+/// which treatment.
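+///
+/// For illustration (hypothetical vregs): given %d = PHI %a<BB1>, %b<BB2>,
+/// an operand that passes the interference checks below joins the renaming
+/// set for %d, while a conflicting operand instead has a copy to %d
+/// scheduled at the end of its source block.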
+void StrongPHIElimination::processBlock(MachineBasicBlock* MBB) {
+ LiveIntervals& LI = getAnalysis<LiveIntervals>();
+ MachineRegisterInfo& MRI = MBB->getParent()->getRegInfo();
+
+ // Holds names that have been added to a set in any PHI within this block
+ // before the current one.
+ std::set<unsigned> ProcessedNames;
+
+ // Iterate over all the PHI nodes in this block
+ MachineBasicBlock::iterator P = MBB->begin();
+ while (P != MBB->end() && P->getOpcode() == TargetInstrInfo::PHI) {
+ unsigned DestReg = P->getOperand(0).getReg();
+
+    // Don't bother doing PHI elimination for dead PHIs.
+ if (P->registerDefIsDead(DestReg)) {
+ ++P;
+ continue;
+ }
+
+ LiveInterval& PI = LI.getOrCreateInterval(DestReg);
+ unsigned pIdx = LI.getDefIndex(LI.getInstructionIndex(P));
+ VNInfo* PVN = PI.getLiveRangeContaining(pIdx)->valno;
+ PhiValueNumber.insert(std::make_pair(DestReg, PVN->id));
+
+ // PHIUnion is the set of incoming registers to the PHI node that
+    // are going to be renamed rather than having copies inserted. This set
+    // is refined over the course of this function. UnionedBlocks is the set
+ // of corresponding MBBs.
+ std::map<unsigned, MachineBasicBlock*> PHIUnion;
+ SmallPtrSet<MachineBasicBlock*, 8> UnionedBlocks;
+
+ // Iterate over the operands of the PHI node
+ for (int i = P->getNumOperands() - 1; i >= 2; i-=2) {
+ unsigned SrcReg = P->getOperand(i-1).getReg();
+
+ // Don't need to try to coalesce a register with itself.
+ if (SrcReg == DestReg) {
+ ProcessedNames.insert(SrcReg);
+ continue;
+ }
+
+ // We don't need to insert copies for implicit_defs.
+ MachineInstr* DefMI = MRI.getVRegDef(SrcReg);
+ if (DefMI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF)
+ ProcessedNames.insert(SrcReg);
+
+ // Check for trivial interferences via liveness information, allowing us
+ // to avoid extra work later. Any registers that interfere cannot both
+ // be in the renaming set, so choose one and add copies for it instead.
+ // The conditions are:
+ // 1) if the operand is live into the PHI node's block OR
+ // 2) if the PHI node is live out of the operand's defining block OR
+ // 3) if the operand is itself a PHI node and the original PHI is
+ // live into the operand's defining block OR
+ // 4) if the operand is already being renamed for another PHI node
+ // in this block OR
+ // 5) if any two operands are defined in the same block, insert copies
+ // for one of them
+ if (isLiveIn(SrcReg, P->getParent(), LI) ||
+ isLiveOut(P->getOperand(0).getReg(),
+ MRI.getVRegDef(SrcReg)->getParent(), LI) ||
+ ( MRI.getVRegDef(SrcReg)->getOpcode() == TargetInstrInfo::PHI &&
+ isLiveIn(P->getOperand(0).getReg(),
+ MRI.getVRegDef(SrcReg)->getParent(), LI) ) ||
+ ProcessedNames.count(SrcReg) ||
+ UnionedBlocks.count(MRI.getVRegDef(SrcReg)->getParent())) {
+
+ // Add a copy for the selected register
+ MachineBasicBlock* From = P->getOperand(i).getMBB();
+ Waiting[From].insert(std::make_pair(SrcReg, DestReg));
+ UsedByAnother.insert(SrcReg);
+ } else {
+ // Otherwise, add it to the renaming set
+ PHIUnion.insert(std::make_pair(SrcReg,P->getOperand(i).getMBB()));
+ UnionedBlocks.insert(MRI.getVRegDef(SrcReg)->getParent());
+ }
+ }
+
+ // Compute the dominator forest for the renaming set. This is a forest
+ // where the nodes are the registers and the edges represent dominance
+ // relations between the defining blocks of the registers
+ std::vector<StrongPHIElimination::DomForestNode*> DF =
+ computeDomForest(PHIUnion, MRI);
+
+ // Walk DomForest to resolve interferences at an inter-block level. This
+ // will remove registers from the renaming set (and insert copies for them)
+ // if interferences are found.
+ std::vector<std::pair<unsigned, unsigned> > localInterferences;
+ processPHIUnion(P, PHIUnion, DF, localInterferences);
+
+ // If one of the inputs is defined in the same block as the current PHI
+ // then we need to check for a local interference between that input and
+ // the PHI.
+ for (std::map<unsigned, MachineBasicBlock*>::iterator I = PHIUnion.begin(),
+ E = PHIUnion.end(); I != E; ++I)
+ if (MRI.getVRegDef(I->first)->getParent() == P->getParent())
+ localInterferences.push_back(std::make_pair(I->first,
+ P->getOperand(0).getReg()));
+
+ // The dominator forest walk may have returned some register pairs whose
+ // interference cannot be determined from dominator analysis. We now
+ // examine these pairs for local interferences.
+ for (std::vector<std::pair<unsigned, unsigned> >::iterator I =
+ localInterferences.begin(), E = localInterferences.end(); I != E; ++I) {
+ std::pair<unsigned, unsigned> p = *I;
+
+ MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>();
+
+ // Determine the block we need to scan and the relationship between
+ // the two registers
+ MachineBasicBlock* scan = 0;
+ unsigned mode = 0;
+ if (MRI.getVRegDef(p.first)->getParent() ==
+ MRI.getVRegDef(p.second)->getParent()) {
+ scan = MRI.getVRegDef(p.first)->getParent();
+ mode = 0; // Same block
+ } else if (MDT.dominates(MRI.getVRegDef(p.first)->getParent(),
+ MRI.getVRegDef(p.second)->getParent())) {
+ scan = MRI.getVRegDef(p.second)->getParent();
+ mode = 1; // First dominates second
+ } else {
+ scan = MRI.getVRegDef(p.first)->getParent();
+ mode = 2; // Second dominates first
+ }
+
+ // If there's an interference, we need to insert copies
+ if (interferes(p.first, p.second, scan, LI, mode)) {
+ // Insert copies for First
+ for (int i = P->getNumOperands() - 1; i >= 2; i-=2) {
+ if (P->getOperand(i-1).getReg() == p.first) {
+ unsigned SrcReg = p.first;
+ MachineBasicBlock* From = P->getOperand(i).getMBB();
+
+ Waiting[From].insert(std::make_pair(SrcReg,
+ P->getOperand(0).getReg()));
+ UsedByAnother.insert(SrcReg);
+
+ PHIUnion.erase(SrcReg);
+ }
+ }
+ }
+ }
+
+ // Add the renaming set for this PHI node to our overall renaming information
+ for (std::map<unsigned, MachineBasicBlock*>::iterator QI = PHIUnion.begin(),
+ QE = PHIUnion.end(); QI != QE; ++QI) {
+ DOUT << "Adding Renaming: " << QI->first << " -> "
+ << P->getOperand(0).getReg() << "\n";
+ }
+
+ RenameSets.insert(std::make_pair(P->getOperand(0).getReg(), PHIUnion));
+
+ // Remember which registers are already renamed, so that we don't try to
+ // rename them for another PHI node in this block
+ for (std::map<unsigned, MachineBasicBlock*>::iterator I = PHIUnion.begin(),
+ E = PHIUnion.end(); I != E; ++I)
+ ProcessedNames.insert(I->first);
+
+ ++P;
+ }
+}
+
+/// processPHIUnion - Take a set of candidate registers to be coalesced when
+/// decomposing the PHI instruction. Use the DominanceForest to remove the ones
+/// that are known to interfere, and flag others that need to be checked for
+/// local interferences.
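+///
+/// A sketch of the rule applied below: if a parent register in the forest is
+/// live-out of a child's defining block, their ranges must overlap, so copies
+/// are inserted for the parent and it leaves the union; if it is merely
+/// live-in (or defined in the same block), the pair is only flagged for the
+/// local-interference check.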
+void StrongPHIElimination::processPHIUnion(MachineInstr* Inst,
+ std::map<unsigned, MachineBasicBlock*>& PHIUnion,
+ std::vector<StrongPHIElimination::DomForestNode*>& DF,
+ std::vector<std::pair<unsigned, unsigned> >& locals) {
+
+ std::vector<DomForestNode*> worklist(DF.begin(), DF.end());
+ SmallPtrSet<DomForestNode*, 4> visited;
+
+ // Code is still in SSA form, so we can use MRI::getVRegDef()
+ MachineRegisterInfo& MRI = Inst->getParent()->getParent()->getRegInfo();
+
+ LiveIntervals& LI = getAnalysis<LiveIntervals>();
+ unsigned DestReg = Inst->getOperand(0).getReg();
+
+  // Do a depth-first walk of the DomForest
+ while (!worklist.empty()) {
+ DomForestNode* DFNode = worklist.back();
+
+ visited.insert(DFNode);
+
+ bool inserted = false;
+ for (DomForestNode::iterator CI = DFNode->begin(), CE = DFNode->end();
+ CI != CE; ++CI) {
+ DomForestNode* child = *CI;
+
+ // If the current node is live-out of the defining block of one of its
+ // children, insert a copy for it. NOTE: The paper actually calls for
+ // a more elaborate heuristic for determining whether to insert copies
+ // for the child or the parent. In the interest of simplicity, we're
+ // just always choosing the parent.
+ if (isLiveOut(DFNode->getReg(),
+ MRI.getVRegDef(child->getReg())->getParent(), LI)) {
+ // Insert copies for parent
+ for (int i = Inst->getNumOperands() - 1; i >= 2; i-=2) {
+ if (Inst->getOperand(i-1).getReg() == DFNode->getReg()) {
+ unsigned SrcReg = DFNode->getReg();
+ MachineBasicBlock* From = Inst->getOperand(i).getMBB();
+
+ Waiting[From].insert(std::make_pair(SrcReg, DestReg));
+ UsedByAnother.insert(SrcReg);
+
+ PHIUnion.erase(SrcReg);
+ }
+ }
+
+ // If a node is live-in to the defining block of one of its children, but
+ // not live-out, then we need to scan that block for local interferences.
+ } else if (isLiveIn(DFNode->getReg(),
+ MRI.getVRegDef(child->getReg())->getParent(), LI) ||
+ MRI.getVRegDef(DFNode->getReg())->getParent() ==
+ MRI.getVRegDef(child->getReg())->getParent()) {
+ // Add (p, c) to possible local interferences
+ locals.push_back(std::make_pair(DFNode->getReg(), child->getReg()));
+ }
+
+ if (!visited.count(child)) {
+ worklist.push_back(child);
+ inserted = true;
+ }
+ }
+
+ if (!inserted) worklist.pop_back();
+ }
+}
+
+/// ScheduleCopies - Insert copies into predecessor blocks, scheduling
+/// them properly so as to avoid the 'lost copy' and the 'virtual swap'
+/// problems.
+///
+/// Based on "Practical Improvements to the Construction and Destruction
+/// of Static Single Assignment Form" by Briggs, et al.
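+///
+/// Roughly: the 'virtual swap' problem arises when the pending copies form a
+/// cycle (e.g. a = b together with b = a), and the 'lost copy' problem when
+/// the value currently in a copy's destination is still live out of the
+/// block. Both are handled below by first saving the endangered value in a
+/// fresh temporary.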
+void StrongPHIElimination::ScheduleCopies(MachineBasicBlock* MBB,
+ std::set<unsigned>& pushed) {
+ // FIXME: This function needs to update LiveIntervals
+  std::multimap<unsigned, unsigned>& copy_set = Waiting[MBB];
+
+ std::multimap<unsigned, unsigned> worklist;
+ std::map<unsigned, unsigned> map;
+
+ // Setup worklist of initial copies
+ for (std::multimap<unsigned, unsigned>::iterator I = copy_set.begin(),
+ E = copy_set.end(); I != E; ) {
+ map.insert(std::make_pair(I->first, I->first));
+ map.insert(std::make_pair(I->second, I->second));
+
+ if (!UsedByAnother.count(I->second)) {
+ worklist.insert(*I);
+
+ // Avoid iterator invalidation
+ std::multimap<unsigned, unsigned>::iterator OI = I;
+ ++I;
+ copy_set.erase(OI);
+ } else {
+ ++I;
+ }
+ }
+
+ LiveIntervals& LI = getAnalysis<LiveIntervals>();
+ MachineFunction* MF = MBB->getParent();
+ MachineRegisterInfo& MRI = MF->getRegInfo();
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+
+ SmallVector<std::pair<unsigned, MachineInstr*>, 4> InsertedPHIDests;
+
+ // Iterate over the worklist, inserting copies
+ while (!worklist.empty() || !copy_set.empty()) {
+ while (!worklist.empty()) {
+ std::multimap<unsigned, unsigned>::iterator WI = worklist.begin();
+ std::pair<unsigned, unsigned> curr = *WI;
+ worklist.erase(WI);
+
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(curr.first);
+
+ if (isLiveOut(curr.second, MBB, LI)) {
+ // Create a temporary
+ unsigned t = MF->getRegInfo().createVirtualRegister(RC);
+
+ // Insert copy from curr.second to a temporary at
+ // the Phi defining curr.second
+ MachineBasicBlock::iterator PI = MRI.getVRegDef(curr.second);
+ TII->copyRegToReg(*PI->getParent(), PI, t,
+ curr.second, RC, RC);
+
+ DOUT << "Inserted copy from " << curr.second << " to " << t << "\n";
+
+ // Push temporary on Stacks
+ Stacks[curr.second].push_back(t);
+
+ // Insert curr.second in pushed
+ pushed.insert(curr.second);
+
+ // Create a live interval for this temporary
+ InsertedPHIDests.push_back(std::make_pair(t, --PI));
+ }
+
+ // Insert copy from map[curr.first] to curr.second
+ TII->copyRegToReg(*MBB, MBB->getFirstTerminator(), curr.second,
+ map[curr.first], RC, RC);
+ map[curr.first] = curr.second;
+ DOUT << "Inserted copy from " << curr.first << " to "
+ << curr.second << "\n";
+
+      // Push this copy onto InsertedPHIDests so we can
+      // update LiveIntervals with it.
+ MachineBasicBlock::iterator MI = MBB->getFirstTerminator();
+ InsertedPHIDests.push_back(std::make_pair(curr.second, --MI));
+
+ // If curr.first is a destination in copy_set...
+ for (std::multimap<unsigned, unsigned>::iterator I = copy_set.begin(),
+ E = copy_set.end(); I != E; )
+ if (curr.first == I->second) {
+ std::pair<unsigned, unsigned> temp = *I;
+ worklist.insert(temp);
+
+ // Avoid iterator invalidation
+ std::multimap<unsigned, unsigned>::iterator OI = I;
+ ++I;
+ copy_set.erase(OI);
+
+ break;
+ } else {
+ ++I;
+ }
+ }
+
+ if (!copy_set.empty()) {
+ std::multimap<unsigned, unsigned>::iterator CI = copy_set.begin();
+ std::pair<unsigned, unsigned> curr = *CI;
+ worklist.insert(curr);
+ copy_set.erase(CI);
+
+ LiveInterval& I = LI.getInterval(curr.second);
+ MachineBasicBlock::iterator term = MBB->getFirstTerminator();
+ unsigned endIdx = 0;
+ if (term != MBB->end())
+ endIdx = LI.getInstructionIndex(term);
+ else
+ endIdx = LI.getMBBEndIdx(MBB);
+
+ if (I.liveAt(endIdx)) {
+ const TargetRegisterClass *RC =
+ MF->getRegInfo().getRegClass(curr.first);
+
+        // Insert a copy from dest to a new temporary t at the end of the block
+ unsigned t = MF->getRegInfo().createVirtualRegister(RC);
+ TII->copyRegToReg(*MBB, MBB->getFirstTerminator(), t,
+ curr.second, RC, RC);
+ map[curr.second] = t;
+
+ MachineBasicBlock::iterator TI = MBB->getFirstTerminator();
+ InsertedPHIDests.push_back(std::make_pair(t, --TI));
+ }
+ }
+ }
+
+ // Renumber the instructions so that we can perform the index computations
+ // needed to create new live intervals.
+ LI.computeNumbering();
+
+ // For copies that we inserted at the ends of predecessors, we construct
+ // live intervals. This is pretty easy, since we know that the destination
+  // register cannot have been live at that point previously. We just have
+ // to make sure that, for registers that serve as inputs to more than one
+ // PHI, we don't create multiple overlapping live intervals.
+ std::set<unsigned> RegHandled;
+ for (SmallVector<std::pair<unsigned, MachineInstr*>, 4>::iterator I =
+ InsertedPHIDests.begin(), E = InsertedPHIDests.end(); I != E; ++I) {
+ if (RegHandled.insert(I->first).second) {
+ LiveInterval& Int = LI.getOrCreateInterval(I->first);
+ unsigned instrIdx = LI.getInstructionIndex(I->second);
+ if (Int.liveAt(LiveIntervals::getDefIndex(instrIdx)))
+ Int.removeRange(LiveIntervals::getDefIndex(instrIdx),
+ LI.getMBBEndIdx(I->second->getParent())+1,
+ true);
+
+ LiveRange R = LI.addLiveRangeToEndOfBlock(I->first, I->second);
+ R.valno->copy = I->second;
+ R.valno->def =
+ LiveIntervals::getDefIndex(LI.getInstructionIndex(I->second));
+ }
+ }
+}
+
+/// InsertCopies - insert copies into MBB and recursively into all of its
+/// dominator tree children.
+void StrongPHIElimination::InsertCopies(MachineDomTreeNode* MDTN,
+ SmallPtrSet<MachineBasicBlock*, 16>& visited) {
+ MachineBasicBlock* MBB = MDTN->getBlock();
+ visited.insert(MBB);
+
+ std::set<unsigned> pushed;
+
+ LiveIntervals& LI = getAnalysis<LiveIntervals>();
+ // Rewrite register uses from Stacks
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ if (I->getOpcode() == TargetInstrInfo::PHI)
+ continue;
+
+ for (unsigned i = 0; i < I->getNumOperands(); ++i)
+ if (I->getOperand(i).isReg() &&
+ Stacks[I->getOperand(i).getReg()].size()) {
+ // Remove the live range for the old vreg.
+ LiveInterval& OldInt = LI.getInterval(I->getOperand(i).getReg());
+ LiveInterval::iterator OldLR = OldInt.FindLiveRangeContaining(
+ LiveIntervals::getUseIndex(LI.getInstructionIndex(I)));
+ if (OldLR != OldInt.end())
+ OldInt.removeRange(*OldLR, true);
+
+ // Change the register
+ I->getOperand(i).setReg(Stacks[I->getOperand(i).getReg()].back());
+
+ // Add a live range for the new vreg
+ LiveInterval& Int = LI.getInterval(I->getOperand(i).getReg());
+ VNInfo* FirstVN = *Int.vni_begin();
+ FirstVN->hasPHIKill = false;
+ if (I->getOperand(i).isKill())
+ FirstVN->kills.push_back(
+ LiveIntervals::getUseIndex(LI.getInstructionIndex(I)));
+
+ LiveRange LR (LI.getMBBStartIdx(I->getParent()),
+ LiveIntervals::getUseIndex(LI.getInstructionIndex(I))+1,
+ FirstVN);
+
+ Int.addRange(LR);
+ }
+ }
+
+ // Schedule the copies for this block
+ ScheduleCopies(MBB, pushed);
+
+ // Recur down the dominator tree.
+ for (MachineDomTreeNode::iterator I = MDTN->begin(),
+ E = MDTN->end(); I != E; ++I)
+ if (!visited.count((*I)->getBlock()))
+ InsertCopies(*I, visited);
+
+ // As we exit this block, pop the names we pushed while processing it
+ for (std::set<unsigned>::iterator I = pushed.begin(),
+ E = pushed.end(); I != E; ++I)
+ Stacks[*I].pop_back();
+}
+
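+/// mergeLiveIntervals - Try to merge the live interval of a secondary
+/// register into that of a primary register. Returns false, making no
+/// changes, if any live range of the secondary overlaps one of the primary.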
+bool StrongPHIElimination::mergeLiveIntervals(unsigned primary,
+ unsigned secondary) {
+
+ LiveIntervals& LI = getAnalysis<LiveIntervals>();
+ LiveInterval& LHS = LI.getOrCreateInterval(primary);
+ LiveInterval& RHS = LI.getOrCreateInterval(secondary);
+
+ LI.computeNumbering();
+
+ DenseMap<VNInfo*, VNInfo*> VNMap;
+ for (LiveInterval::iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
+ LiveRange R = *I;
+
+ unsigned Start = R.start;
+ unsigned End = R.end;
+ if (LHS.getLiveRangeContaining(Start))
+ return false;
+
+ if (LHS.getLiveRangeContaining(End))
+ return false;
+
+ LiveInterval::iterator RI = std::upper_bound(LHS.begin(), LHS.end(), R);
+ if (RI != LHS.end() && RI->start < End)
+ return false;
+ }
+
+ for (LiveInterval::iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
+ LiveRange R = *I;
+ VNInfo* OldVN = R.valno;
+ VNInfo*& NewVN = VNMap[OldVN];
+ if (!NewVN) {
+ NewVN = LHS.getNextValue(OldVN->def,
+ OldVN->copy,
+ LI.getVNInfoAllocator());
+ NewVN->kills = OldVN->kills;
+ }
+
+ LiveRange LR (R.start, R.end, NewVN);
+ LHS.addRange(LR);
+ }
+
+ LI.removeInterval(RHS.reg);
+
+ return true;
+}
+
+bool StrongPHIElimination::runOnMachineFunction(MachineFunction &Fn) {
+ LiveIntervals& LI = getAnalysis<LiveIntervals>();
+
+ // Compute DFS numbers of each block
+ computeDFS(Fn);
+
+ // Determine which phi node operands need copies
+ for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I)
+ if (!I->empty() &&
+ I->begin()->getOpcode() == TargetInstrInfo::PHI)
+ processBlock(I);
+
+ // Break interferences where two different phis want to coalesce
+ // in the same register.
+ std::set<unsigned> seen;
+ typedef std::map<unsigned, std::map<unsigned, MachineBasicBlock*> >
+ RenameSetType;
+ for (RenameSetType::iterator I = RenameSets.begin(), E = RenameSets.end();
+ I != E; ++I) {
+ for (std::map<unsigned, MachineBasicBlock*>::iterator
+ OI = I->second.begin(), OE = I->second.end(); OI != OE; ) {
+ if (!seen.count(OI->first)) {
+ seen.insert(OI->first);
+ ++OI;
+ } else {
+ Waiting[OI->second].insert(std::make_pair(OI->first, I->first));
+ unsigned reg = OI->first;
+ ++OI;
+ I->second.erase(reg);
+ DOUT << "Removing Renaming: " << reg << " -> " << I->first << "\n";
+ }
+ }
+ }
+
+ // Insert copies
+ // FIXME: This process should probably preserve LiveIntervals
+ SmallPtrSet<MachineBasicBlock*, 16> visited;
+ MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>();
+ InsertCopies(MDT.getRootNode(), visited);
+
+ // Perform renaming
+ for (RenameSetType::iterator I = RenameSets.begin(), E = RenameSets.end();
+ I != E; ++I)
+ while (I->second.size()) {
+ std::map<unsigned, MachineBasicBlock*>::iterator SI = I->second.begin();
+
+ DOUT << "Renaming: " << SI->first << " -> " << I->first << "\n";
+
+ if (SI->first != I->first) {
+ if (mergeLiveIntervals(I->first, SI->first)) {
+ Fn.getRegInfo().replaceRegWith(SI->first, I->first);
+
+ if (RenameSets.count(SI->first)) {
+ I->second.insert(RenameSets[SI->first].begin(),
+ RenameSets[SI->first].end());
+ RenameSets.erase(SI->first);
+ }
+ } else {
+ // Insert a last-minute copy if a conflict was detected.
+ const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+ const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(I->first);
+ TII->copyRegToReg(*SI->second, SI->second->getFirstTerminator(),
+ I->first, SI->first, RC, RC);
+
+ LI.computeNumbering();
+
+ LiveInterval& Int = LI.getOrCreateInterval(I->first);
+ unsigned instrIdx =
+ LI.getInstructionIndex(--SI->second->getFirstTerminator());
+ if (Int.liveAt(LiveIntervals::getDefIndex(instrIdx)))
+ Int.removeRange(LiveIntervals::getDefIndex(instrIdx),
+ LI.getMBBEndIdx(SI->second)+1, true);
+
+ LiveRange R = LI.addLiveRangeToEndOfBlock(I->first,
+ --SI->second->getFirstTerminator());
+ R.valno->copy = --SI->second->getFirstTerminator();
+ R.valno->def = LiveIntervals::getDefIndex(instrIdx);
+
+ DOUT << "Renaming failed: " << SI->first << " -> "
+ << I->first << "\n";
+ }
+ }
+
+ LiveInterval& Int = LI.getOrCreateInterval(I->first);
+ const LiveRange* LR =
+ Int.getLiveRangeContaining(LI.getMBBEndIdx(SI->second));
+ LR->valno->hasPHIKill = true;
+
+ I->second.erase(SI->first);
+ }
+
+ // Remove PHIs
+ std::vector<MachineInstr*> phis;
+ for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
+ for (MachineBasicBlock::iterator BI = I->begin(), BE = I->end();
+ BI != BE; ++BI)
+ if (BI->getOpcode() == TargetInstrInfo::PHI)
+ phis.push_back(BI);
+ }
+
+ for (std::vector<MachineInstr*>::iterator I = phis.begin(), E = phis.end();
+ I != E; ) {
+ MachineInstr* PInstr = *(I++);
+
+ // If this is a dead PHI node, then remove it from LiveIntervals.
+ unsigned DestReg = PInstr->getOperand(0).getReg();
+ LiveInterval& PI = LI.getInterval(DestReg);
+ if (PInstr->registerDefIsDead(DestReg)) {
+ if (PI.containsOneValue()) {
+ LI.removeInterval(DestReg);
+ } else {
+ unsigned idx = LI.getDefIndex(LI.getInstructionIndex(PInstr));
+ PI.removeRange(*PI.getLiveRangeContaining(idx), true);
+ }
+ } else {
+ // Trim live intervals of input registers. They are no longer live into
+ // this block if they died after the PHI. If they lived after it, don't
+ // trim them because they might have other legitimate uses.
+ for (unsigned i = 1; i < PInstr->getNumOperands(); i += 2) {
+ unsigned reg = PInstr->getOperand(i).getReg();
+
+ MachineBasicBlock* MBB = PInstr->getOperand(i+1).getMBB();
+ LiveInterval& InputI = LI.getInterval(reg);
+ if (MBB != PInstr->getParent() &&
+ InputI.liveAt(LI.getMBBStartIdx(PInstr->getParent())) &&
+ InputI.expiredAt(LI.getInstructionIndex(PInstr) +
+ LiveInterval::InstrSlots::NUM))
+ InputI.removeRange(LI.getMBBStartIdx(PInstr->getParent()),
+ LI.getInstructionIndex(PInstr),
+ true);
+ }
+
+ // If the PHI is not dead, then the valno defined by the PHI
+ // now has an unknown def.
+ unsigned idx = LI.getDefIndex(LI.getInstructionIndex(PInstr));
+ const LiveRange* PLR = PI.getLiveRangeContaining(idx);
+ PLR->valno->def = ~0U;
+ LiveRange R (LI.getMBBStartIdx(PInstr->getParent()),
+ PLR->start, PLR->valno);
+ PI.addRange(R);
+ }
+
+ LI.RemoveMachineInstrFromMaps(PInstr);
+ PInstr->eraseFromParent();
+ }
+
+ LI.computeNumbering();
+
+ return true;
+}
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
new file mode 100644
index 0000000..a5e1ee4
--- /dev/null
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -0,0 +1,194 @@
+//===-- TargetInstrInfoImpl.cpp - Target Instruction Information ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetInstrInfoImpl class; it just provides default
+// implementations of various methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+using namespace llvm;
+
+// commuteInstruction - The default implementation of this method just exchanges
+// operands 1 and 2.
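+//
+// For illustration (hypothetical opcode):
+//   %reg1026 = ADD %reg1024, %reg1025
+// becomes
+//   %reg1026 = ADD %reg1025, %reg1024
+// with the kill flags of the two source operands swapped along with them.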
+MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI,
+ bool NewMI) const {
+ assert(MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+ "This only knows how to commute register operands so far");
+ unsigned Reg1 = MI->getOperand(1).getReg();
+ unsigned Reg2 = MI->getOperand(2).getReg();
+ bool Reg1IsKill = MI->getOperand(1).isKill();
+ bool Reg2IsKill = MI->getOperand(2).isKill();
+ bool ChangeReg0 = false;
+ if (MI->getOperand(0).getReg() == Reg1) {
+ // Must be two address instruction!
+ assert(MI->getDesc().getOperandConstraint(0, TOI::TIED_TO) &&
+ "Expecting a two-address instruction!");
+ Reg2IsKill = false;
+ ChangeReg0 = true;
+ }
+
+ if (NewMI) {
+ // Create a new instruction.
+ unsigned Reg0 = ChangeReg0 ? Reg2 : MI->getOperand(0).getReg();
+ bool Reg0IsDead = MI->getOperand(0).isDead();
+ MachineFunction &MF = *MI->getParent()->getParent();
+ return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
+ .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
+ .addReg(Reg2, getKillRegState(Reg2IsKill))
+      .addReg(Reg1, getKillRegState(Reg1IsKill));
+ }
+
+ if (ChangeReg0)
+ MI->getOperand(0).setReg(Reg2);
+ MI->getOperand(2).setReg(Reg1);
+ MI->getOperand(1).setReg(Reg2);
+ MI->getOperand(2).setIsKill(Reg1IsKill);
+ MI->getOperand(1).setIsKill(Reg2IsKill);
+ return MI;
+}
+
+/// CommuteChangesDestination - Return true if commuting the specified
+/// instruction will also change the destination operand. Also return the
+/// current operand index of the would-be new destination register by
+/// reference. This can happen when the commutable instruction is also a
+/// two-address instruction.
+bool TargetInstrInfoImpl::CommuteChangesDestination(MachineInstr *MI,
+ unsigned &OpIdx) const{
+ assert(MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+ "This only knows how to commute register operands so far");
+ if (MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+ // Must be two address instruction!
+ assert(MI->getDesc().getOperandConstraint(0, TOI::TIED_TO) &&
+ "Expecting a two-address instruction!");
+ OpIdx = 2;
+ return true;
+ }
+ return false;
+}
+
+
+bool TargetInstrInfoImpl::PredicateInstruction(MachineInstr *MI,
+ const SmallVectorImpl<MachineOperand> &Pred) const {
+ bool MadeChange = false;
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.isPredicable())
+ return false;
+
+ for (unsigned j = 0, i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (TID.OpInfo[i].isPredicate()) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg()) {
+ MO.setReg(Pred[j].getReg());
+ MadeChange = true;
+ } else if (MO.isImm()) {
+ MO.setImm(Pred[j].getImm());
+ MadeChange = true;
+ } else if (MO.isMBB()) {
+ MO.setMBB(Pred[j].getMBB());
+ MadeChange = true;
+ }
+ ++j;
+ }
+ }
+ return MadeChange;
+}
+
+void TargetInstrInfoImpl::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg,
+ const MachineInstr *Orig) const {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MI->getOperand(0).setReg(DestReg);
+ MBB.insert(I, MI);
+}
+
+unsigned
+TargetInstrInfoImpl::GetFunctionSizeInBytes(const MachineFunction &MF) const {
+ unsigned FnSize = 0;
+ for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end();
+ MBBI != E; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+ for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end();
+ I != E; ++I)
+ FnSize += GetInstSizeInBytes(I);
+ }
+ return FnSize;
+}
+
+/// foldMemoryOperand - Attempt to fold a load or store of the specified stack
+/// slot into the specified machine instruction for the specified operand(s).
+/// If this is possible, a new instruction is returned with the specified
+/// operand folded, otherwise NULL is returned. The client is responsible for
+/// removing the old instruction and adding the new one in the instruction
+/// stream.
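+///
+/// For illustration (hypothetical target opcodes): folding a reload from
+/// stack slot FI into the second operand of
+///   %r1 = ADDrr %r1, %r2
+/// could yield
+///   %r1 = ADDrm %r1, <fi FI>
+/// with a MachineMemOperand describing the stack slot attached to the
+/// new instruction, as done below.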
+MachineInstr*
+TargetInstrInfo::foldMemoryOperand(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ unsigned Flags = 0;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (MI->getOperand(Ops[i]).isDef())
+ Flags |= MachineMemOperand::MOStore;
+ else
+ Flags |= MachineMemOperand::MOLoad;
+
+ // Ask the target to do the actual folding.
+ MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+ if (!NewMI) return 0;
+
+ assert((!(Flags & MachineMemOperand::MOStore) ||
+ NewMI->getDesc().mayStore()) &&
+ "Folded a def to a non-store!");
+ assert((!(Flags & MachineMemOperand::MOLoad) ||
+ NewMI->getDesc().mayLoad()) &&
+ "Folded a use to a non-load!");
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+ assert(MFI.getObjectOffset(FrameIndex) != -1);
+ MachineMemOperand MMO(PseudoSourceValue::getFixedStack(FrameIndex),
+ Flags,
+ MFI.getObjectOffset(FrameIndex),
+ MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
+ NewMI->addMemOperand(MF, MMO);
+
+ return NewMI;
+}
+
+/// foldMemoryOperand - Same as the previous version except it allows folding
+/// of any load and store from / to any address, not just from a specific
+/// stack slot.
+MachineInstr*
+TargetInstrInfo::foldMemoryOperand(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ assert(LoadMI->getDesc().canFoldAsLoad() && "LoadMI isn't foldable!");
+#ifndef NDEBUG
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ assert(MI->getOperand(Ops[i]).isUse() && "Folding load into def!");
+#endif
+
+ // Ask the target to do the actual folding.
+ MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, LoadMI);
+ if (!NewMI) return 0;
+
+ // Copy the memoperands from the load to the folded instruction.
+ for (std::list<MachineMemOperand>::iterator I = LoadMI->memoperands_begin(),
+ E = LoadMI->memoperands_end(); I != E; ++I)
+ NewMI->addMemOperand(MF, *I);
+
+ return NewMI;
+}
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
new file mode 100644
index 0000000..3c40404
--- /dev/null
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -0,0 +1,997 @@
+//===-- TwoAddressInstructionPass.cpp - Two-Address instruction pass ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TwoAddress instruction pass which is used
+// by most register allocators. Two-Address instructions are rewritten
+// from:
+//
+// A = B op C
+//
+// to:
+//
+// A = B
+// A op= C
+//
+// Note that if a register allocator chooses to use this pass, it has to be
+// capable of handling the non-SSA nature of these rewritten virtual
+// registers.
+//
+// It is also worth noting that the duplicate operand of the two
+// address instruction is removed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "twoaddrinstr"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions");
+STATISTIC(NumCommuted , "Number of instructions commuted to coalesce");
+STATISTIC(NumAggrCommuted , "Number of instructions aggressively commuted");
+STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address");
+STATISTIC(Num3AddrSunk, "Number of 3-address instructions sunk");
+STATISTIC(NumReMats, "Number of instructions re-materialized");
+STATISTIC(NumDeletes, "Number of dead instructions deleted");
+
+namespace {
+ class VISIBILITY_HIDDEN TwoAddressInstructionPass
+ : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ LiveVariables *LV;
+
+    // DistanceMap - Keep track of the distance of an MI from the start of the
+ // current basic block.
+ DenseMap<MachineInstr*, unsigned> DistanceMap;
+
+ // SrcRegMap - A map from virtual registers to physical registers which
+    // they are likely to be coalesced with, due to copies from physical
+ // registers to virtual registers. e.g. v1024 = move r0.
+ DenseMap<unsigned, unsigned> SrcRegMap;
+
+ // DstRegMap - A map from virtual registers to physical registers which
+    // they are likely to be coalesced with, due to copies to physical
+ // registers from virtual registers. e.g. r1 = move v1024.
+ DenseMap<unsigned, unsigned> DstRegMap;
+
+ bool Sink3AddrInstruction(MachineBasicBlock *MBB, MachineInstr *MI,
+ unsigned Reg,
+ MachineBasicBlock::iterator OldPos);
+
+ bool isProfitableToReMat(unsigned Reg, const TargetRegisterClass *RC,
+ MachineInstr *MI, MachineInstr *DefMI,
+ MachineBasicBlock *MBB, unsigned Loc);
+
+ bool NoUseAfterLastDef(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist,
+ unsigned &LastDef);
+
+ MachineInstr *FindLastUseInMBB(unsigned Reg, MachineBasicBlock *MBB,
+ unsigned Dist);
+
+ bool isProfitableToCommute(unsigned regB, unsigned regC,
+ MachineInstr *MI, MachineBasicBlock *MBB,
+ unsigned Dist);
+
+ bool CommuteInstruction(MachineBasicBlock::iterator &mi,
+ MachineFunction::iterator &mbbi,
+ unsigned RegB, unsigned RegC, unsigned Dist);
+
+ bool isProfitableToConv3Addr(unsigned RegA);
+
+ bool ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ MachineFunction::iterator &mbbi,
+ unsigned RegB, unsigned Dist);
+
+ void ProcessCopy(MachineInstr *MI, MachineBasicBlock *MBB,
+ SmallPtrSet<MachineInstr*, 8> &Processed);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ TwoAddressInstructionPass() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<LiveVariables>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ if (StrongPHIElim)
+ AU.addPreservedID(StrongPHIEliminationID);
+ else
+ AU.addPreservedID(PHIEliminationID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// runOnMachineFunction - Pass entry point.
+ bool runOnMachineFunction(MachineFunction&);
+ };
+}
+
+char TwoAddressInstructionPass::ID = 0;
+static RegisterPass<TwoAddressInstructionPass>
+X("twoaddressinstruction", "Two-Address instruction pass");
+
+const PassInfo *const llvm::TwoAddressInstructionPassID = &X;
+
+/// Sink3AddrInstruction - A two-address instruction has been converted to a
+/// three-address instruction to avoid clobbering a register. Try to sink it
+/// past the instruction that would kill the above-mentioned register to reduce
+/// register pressure.
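+///
+/// A sketch (hypothetical vregs): if the new instruction defines %reg1027
+/// from %reg1024, and %reg1024 is killed several instructions later, sinking
+/// the definition of %reg1027 down to that kill means %reg1024 and %reg1027
+/// are no longer both live across the intervening instructions.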
+bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
+ MachineInstr *MI, unsigned SavedReg,
+ MachineBasicBlock::iterator OldPos) {
+ // Check if it's safe to move this instruction.
+ bool SeenStore = true; // Be conservative.
+ if (!MI->isSafeToMove(TII, SeenStore))
+ return false;
+
+ unsigned DefReg = 0;
+ SmallSet<unsigned, 4> UseRegs;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MO.isUse() && MOReg != SavedReg)
+ UseRegs.insert(MO.getReg());
+ if (!MO.isDef())
+ continue;
+ if (MO.isImplicit())
+ // Don't try to move it if it implicitly defines a register.
+ return false;
+ if (DefReg)
+ // For now, don't move any instructions that define multiple registers.
+ return false;
+ DefReg = MO.getReg();
+ }
+
+ // Find the instruction that kills SavedReg.
+ MachineInstr *KillMI = NULL;
+ for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SavedReg),
+ UE = MRI->use_end(); UI != UE; ++UI) {
+ MachineOperand &UseMO = UI.getOperand();
+ if (!UseMO.isKill())
+ continue;
+ KillMI = UseMO.getParent();
+ break;
+ }
+
+ if (!KillMI || KillMI->getParent() != MBB || KillMI == MI)
+ return false;
+
+ // If any of the definitions are used by another instruction between the
+ // position and the kill use, then it's not safe to sink it.
+ //
+ // FIXME: This can be sped up if there is an easy way to query whether an
+ // instruction is before or after another instruction. Then we can use
+ // MachineRegisterInfo def / use instead.
+ MachineOperand *KillMO = NULL;
+ MachineBasicBlock::iterator KillPos = KillMI;
+ ++KillPos;
+
+ unsigned NumVisited = 0;
+ for (MachineBasicBlock::iterator I = next(OldPos); I != KillPos; ++I) {
+ MachineInstr *OtherMI = I;
+ if (NumVisited > 30) // FIXME: Arbitrary limit to reduce compile time cost.
+ return false;
+ ++NumVisited;
+ for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = OtherMI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (DefReg == MOReg)
+ return false;
+
+ if (MO.isKill()) {
+ if (OtherMI == KillMI && MOReg == SavedReg)
+ // Save the operand that kills the register. We want to unset the kill
+ // marker if we can sink MI past it.
+ KillMO = &MO;
+ else if (UseRegs.count(MOReg))
+ // One of the uses is killed before the destination.
+ return false;
+ }
+ }
+ }
+
+ // Update kill and LV information.
+ KillMO->setIsKill(false);
+ KillMO = MI->findRegisterUseOperand(SavedReg, false, TRI);
+ KillMO->setIsKill(true);
+
+ if (LV)
+ LV->replaceKillInstruction(SavedReg, KillMI, MI);
+
+ // Move instruction to its destination.
+ MBB->remove(MI);
+ MBB->insert(KillPos, MI);
+
+ ++Num3AddrSunk;
+ return true;
+}
+
+/// isTwoAddrUse - Return true if the specified MI is using the specified
+/// register as a two-address operand.
+static bool isTwoAddrUse(MachineInstr *UseMI, unsigned Reg) {
+ const TargetInstrDesc &TID = UseMI->getDesc();
+ for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = UseMI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == Reg &&
+ (MO.isDef() || UseMI->isRegTiedToDefOperand(i)))
+ // Earlier use is a two-address one.
+ return true;
+ }
+ return false;
+}
+
+/// isProfitableToReMat - Return true if the heuristic determines it is likely
+/// to be profitable to re-materialize the definition of Reg rather than copy
+/// the register.
+bool
+TwoAddressInstructionPass::isProfitableToReMat(unsigned Reg,
+ const TargetRegisterClass *RC,
+ MachineInstr *MI, MachineInstr *DefMI,
+ MachineBasicBlock *MBB, unsigned Loc) {
+ bool OtherUse = false;
+ for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
+ UE = MRI->use_end(); UI != UE; ++UI) {
+ MachineOperand &UseMO = UI.getOperand();
+ MachineInstr *UseMI = UseMO.getParent();
+ MachineBasicBlock *UseMBB = UseMI->getParent();
+ if (UseMBB == MBB) {
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
+ if (DI != DistanceMap.end() && DI->second == Loc)
+ continue; // Current use.
+ OtherUse = true;
+ // There is at least one other use in the MBB that will clobber the
+ // register.
+ if (isTwoAddrUse(UseMI, Reg))
+ return true;
+ }
+ }
+
+ // If other uses in MBB are not two-address uses, then don't remat.
+ if (OtherUse)
+ return false;
+
+ // No other uses in the same block, remat if it's defined in the same
+ // block so it does not unnecessarily extend the live range.
+ return MBB == DefMI->getParent();
+}
+
+/// NoUseAfterLastDef - Return true if there are no intervening uses between the
+/// last instruction in the MBB that defines the specified register and the
+/// two-address instruction which is being processed. It also returns the last
+/// def location by reference.
+bool TwoAddressInstructionPass::NoUseAfterLastDef(unsigned Reg,
+ MachineBasicBlock *MBB, unsigned Dist,
+ unsigned &LastDef) {
+ LastDef = 0;
+ unsigned LastUse = Dist;
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg),
+ E = MRI->reg_end(); I != E; ++I) {
+ MachineOperand &MO = I.getOperand();
+ MachineInstr *MI = MO.getParent();
+ if (MI->getParent() != MBB)
+ continue;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+ if (DI == DistanceMap.end())
+ continue;
+ if (MO.isUse() && DI->second < LastUse)
+ LastUse = DI->second;
+ if (MO.isDef() && DI->second > LastDef)
+ LastDef = DI->second;
+ }
+
+ return !(LastUse > LastDef && LastUse < Dist);
+}
+
+MachineInstr *TwoAddressInstructionPass::FindLastUseInMBB(unsigned Reg,
+ MachineBasicBlock *MBB,
+ unsigned Dist) {
+ unsigned LastUseDist = 0;
+ MachineInstr *LastUse = 0;
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg),
+ E = MRI->reg_end(); I != E; ++I) {
+ MachineOperand &MO = I.getOperand();
+ MachineInstr *MI = MO.getParent();
+ if (MI->getParent() != MBB)
+ continue;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+ if (DI == DistanceMap.end())
+ continue;
+ if (DI->second >= Dist)
+ continue;
+
+ if (MO.isUse() && DI->second > LastUseDist) {
+ LastUse = DI->first;
+ LastUseDist = DI->second;
+ }
+ }
+ return LastUse;
+}
+
+/// isCopyToReg - Return true if the specified MI is a copy instruction or an
+/// extract_subreg, insert_subreg, or subreg_to_reg instruction. It also
+/// returns the source and destination registers and whether they are
+/// physical registers by reference.
+static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
+ unsigned &SrcReg, unsigned &DstReg,
+ bool &IsSrcPhys, bool &IsDstPhys) {
+ SrcReg = 0;
+ DstReg = 0;
+ unsigned SrcSubIdx, DstSubIdx;
+ if (!TII->isMoveInstr(MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) {
+ if (MI.getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ } else if (MI.getOpcode() == TargetInstrInfo::INSERT_SUBREG) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(2).getReg();
+ } else if (MI.getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(2).getReg();
+ }
+ }
+
+ if (DstReg) {
+ IsSrcPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
+ IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ return true;
+ }
+ return false;
+}
+
+/// isKilled - Test if the given register value, which is used by the given
+/// instruction, is killed by the given instruction. This looks through
+/// coalescable copies to see if the original value is potentially not killed.
+///
+/// For example, in this code:
+///
+/// %reg1034 = copy %reg1024
+/// %reg1035 = copy %reg1025<kill>
+/// %reg1036 = add %reg1034<kill>, %reg1035<kill>
+///
+/// %reg1034 is not considered to be killed, since it is copied from a
+/// register which is not killed. Treating it as not killed lets the
+/// normal heuristics commute the (two-address) add, which lets
+/// coalescing eliminate the extra copy.
+///
+static bool isKilled(MachineInstr &MI, unsigned Reg,
+ const MachineRegisterInfo *MRI,
+ const TargetInstrInfo *TII) {
+ MachineInstr *DefMI = &MI;
+ for (;;) {
+ if (!DefMI->killsRegister(Reg))
+ return false;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return true;
+ MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
+ // If there are multiple defs, we can't do a simple analysis, so just
+ // go with what the kill flag says.
+ if (next(Begin) != MRI->def_end())
+ return true;
+ DefMI = &*Begin;
+ bool IsSrcPhys, IsDstPhys;
+ unsigned SrcReg, DstReg;
+ // If the def is something other than a copy, then it isn't going to
+ // be coalesced, so follow the kill flag.
+ if (!isCopyToReg(*DefMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+ return true;
+ Reg = SrcReg;
+ }
+}
+
+/// isTwoAddrUse - Return true if the specified MI uses the specified register
+/// as a two-address use. If so, return the destination register by reference.
+static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) {
+ const TargetInstrDesc &TID = MI.getDesc();
+ unsigned NumOps = (MI.getOpcode() == TargetInstrInfo::INLINEASM)
+ ? MI.getNumOperands() : TID.getNumOperands();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
+ continue;
+ unsigned ti;
+ if (MI.isRegTiedToDefOperand(i, &ti)) {
+ DstReg = MI.getOperand(ti).getReg();
+ return true;
+ }
+ }
+ return false;
+}
+
+/// findOnlyInterestingUse - Given a register, if it has a single use that
+/// lies within the given basic block, return the use instruction if it's a
+/// copy or a two-address use.
+static
+MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
+ MachineRegisterInfo *MRI,
+ const TargetInstrInfo *TII,
+ bool &IsCopy,
+ unsigned &DstReg, bool &IsDstPhys) {
+ MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg);
+ if (UI == MRI->use_end())
+ return 0;
+ MachineInstr &UseMI = *UI;
+ if (++UI != MRI->use_end())
+ // More than one use.
+ return 0;
+ if (UseMI.getParent() != MBB)
+ return 0;
+ unsigned SrcReg;
+ bool IsSrcPhys;
+ if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
+ IsCopy = true;
+ return &UseMI;
+ }
+ IsDstPhys = false;
+ if (isTwoAddrUse(UseMI, Reg, DstReg)) {
+ IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ return &UseMI;
+ }
+ return 0;
+}
+
+/// getMappedReg - Return the physical register the specified virtual register
+/// might be mapped to.
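+///
+/// For illustration (hypothetical registers): with v1025 -> v1024 and
+/// v1024 -> r0 in the map, getMappedReg(v1025, RegMap) follows the chain and
+/// returns r0; a chain that ends at an unmapped virtual register returns 0.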
+static unsigned
+getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) {
+ while (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DenseMap<unsigned, unsigned>::iterator SI = RegMap.find(Reg);
+ if (SI == RegMap.end())
+ return 0;
+ Reg = SI->second;
+ }
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return Reg;
+ return 0;
+}
+
+/// regsAreCompatible - Return true if the two registers are equal or aliased.
+///
+static bool
+regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
+ if (RegA == RegB)
+ return true;
+ if (!RegA || !RegB)
+ return false;
+ return TRI->regsOverlap(RegA, RegB);
+}
+
+
+/// isProfitableToCommute - Return true if it's potentially profitable to commute
+/// the two-address instruction that's being processed.
+bool
+TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC,
+ MachineInstr *MI, MachineBasicBlock *MBB,
+ unsigned Dist) {
+ // Determine if it's profitable to commute this two address instruction. In
+ // general, we want no uses between this instruction and the definition of
+ // the two-address register.
+ // e.g.
+ // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+ // %reg1029<def> = MOV8rr %reg1028
+ // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+ // insert => %reg1030<def> = MOV8rr %reg1028
+ // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+ // In this case, it might not be possible to coalesce the second MOV8rr
+ // instruction if the first one is coalesced. So it would be profitable to
+ // commute it:
+ // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+ // %reg1029<def> = MOV8rr %reg1028
+ // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+ // insert => %reg1030<def> = MOV8rr %reg1029
+ // %reg1030<def> = ADD8rr %reg1029<kill>, %reg1028<kill>, %EFLAGS<imp-def,dead>
+
+ if (!MI->killsRegister(regC))
+ return false;
+
+ // Ok, we have something like:
+ // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+ // let's see if it's worth commuting it.
+
+ // Look for situations like this:
+ // %reg1024<def> = MOV r1
+ // %reg1025<def> = MOV r0
+ // %reg1026<def> = ADD %reg1024, %reg1025
+ // r0 = MOV %reg1026
+ // Commute the ADD to hopefully eliminate an otherwise unavoidable copy.
+ unsigned FromRegB = getMappedReg(regB, SrcRegMap);
+ unsigned FromRegC = getMappedReg(regC, SrcRegMap);
+ unsigned ToRegB = getMappedReg(regB, DstRegMap);
+ unsigned ToRegC = getMappedReg(regC, DstRegMap);
+ if (!regsAreCompatible(FromRegB, ToRegB, TRI) &&
+ (regsAreCompatible(FromRegB, ToRegC, TRI) ||
+ regsAreCompatible(FromRegC, ToRegB, TRI)))
+ return true;
+
+ // If there is a use of regC between its last def (could be livein) and this
+ // instruction, then bail.
+ unsigned LastDefC = 0;
+ if (!NoUseAfterLastDef(regC, MBB, Dist, LastDefC))
+ return false;
+
+ // If there is a use of regB between its last def (could be livein) and this
+ // instruction, then go ahead and make this transformation.
+ unsigned LastDefB = 0;
+ if (!NoUseAfterLastDef(regB, MBB, Dist, LastDefB))
+ return true;
+
+  // Since there are no intervening uses of either register, commute if the
+  // def of regC is closer; its live interval is shorter.
+ return LastDefB && LastDefC && LastDefC > LastDefB;
+}
+
+/// CommuteInstruction - Commute a two-address instruction and update the basic
+/// block, distance map, and live variables if needed. Return true if it is
+/// successful.
+bool
+TwoAddressInstructionPass::CommuteInstruction(MachineBasicBlock::iterator &mi,
+ MachineFunction::iterator &mbbi,
+ unsigned RegB, unsigned RegC, unsigned Dist) {
+ MachineInstr *MI = mi;
+ DOUT << "2addr: COMMUTING : " << *MI;
+ MachineInstr *NewMI = TII->commuteInstruction(MI);
+
+ if (NewMI == 0) {
+ DOUT << "2addr: COMMUTING FAILED!\n";
+ return false;
+ }
+
+ DOUT << "2addr: COMMUTED TO: " << *NewMI;
+  // If commuting created a new instruction, update live variables.
+ if (NewMI != MI) {
+ if (LV)
+ // Update live variables
+ LV->replaceKillInstruction(RegC, MI, NewMI);
+
+ mbbi->insert(mi, NewMI); // Insert the new inst
+ mbbi->erase(mi); // Nuke the old inst.
+ mi = NewMI;
+ DistanceMap.insert(std::make_pair(NewMI, Dist));
+ }
+
+ // Update source register map.
+ unsigned FromRegC = getMappedReg(RegC, SrcRegMap);
+ if (FromRegC) {
+ unsigned RegA = MI->getOperand(0).getReg();
+ SrcRegMap[RegA] = FromRegC;
+ }
+
+ return true;
+}
+
+/// isProfitableToConv3Addr - Return true if it is profitable to convert the
+/// given 2-address instruction to a 3-address one.
+bool
+TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA) {
+ // Look for situations like this:
+ // %reg1024<def> = MOV r1
+ // %reg1025<def> = MOV r0
+ // %reg1026<def> = ADD %reg1024, %reg1025
+ // r2 = MOV %reg1026
+ // Turn ADD into a 3-address instruction to avoid a copy.
+ unsigned FromRegA = getMappedReg(RegA, SrcRegMap);
+ unsigned ToRegA = getMappedReg(RegA, DstRegMap);
+ return (FromRegA && ToRegA && !regsAreCompatible(FromRegA, ToRegA, TRI));
+}
+
+/// ConvertInstTo3Addr - Convert the specified two-address instruction into a
+/// three address one. Return true if this transformation was successful.
+bool
+TwoAddressInstructionPass::ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ MachineFunction::iterator &mbbi,
+ unsigned RegB, unsigned Dist) {
+ MachineInstr *NewMI = TII->convertToThreeAddress(mbbi, mi, LV);
+ if (NewMI) {
+ DOUT << "2addr: CONVERTING 2-ADDR: " << *mi;
+ DOUT << "2addr: TO 3-ADDR: " << *NewMI;
+ bool Sunk = false;
+
+ if (NewMI->findRegisterUseOperand(RegB, false, TRI))
+ // FIXME: Temporary workaround. If the new instruction doesn't
+      // use RegB, convertToThreeAddress must have created more
+      // than one instruction.
+ Sunk = Sink3AddrInstruction(mbbi, NewMI, RegB, mi);
+
+ mbbi->erase(mi); // Nuke the old inst.
+
+ if (!Sunk) {
+ DistanceMap.insert(std::make_pair(NewMI, Dist));
+ mi = NewMI;
+ nmi = next(mi);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/// ProcessCopy - If the specified instruction is not yet processed, process it
+/// if it's a copy. For a copy instruction, we find the physical registers the
+/// source and destination registers might be mapped to. These are kept in
+/// point-to maps used to determine future optimizations. e.g.
+/// v1024 = mov r0
+/// v1025 = mov r1
+/// v1026 = add v1024, v1025
+/// r1 = mov v1026
+/// If 'add' is a two-address instruction, v1024, v1026 are both potentially
+/// coalesced to r0 (from the input side). v1025 is mapped to r1. v1026 is
+/// potentially joined with r1 on the output side. It's worthwhile to commute
+/// 'add' to eliminate a copy.
+void TwoAddressInstructionPass::ProcessCopy(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ SmallPtrSet<MachineInstr*, 8> &Processed) {
+ if (Processed.count(MI))
+ return;
+
+ bool IsSrcPhys, IsDstPhys;
+ unsigned SrcReg, DstReg;
+ if (!isCopyToReg(*MI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+ return;
+
+ if (IsDstPhys && !IsSrcPhys)
+ DstRegMap.insert(std::make_pair(SrcReg, DstReg));
+ else if (!IsDstPhys && IsSrcPhys) {
+ bool isNew = SrcRegMap.insert(std::make_pair(DstReg, SrcReg)).second;
+ if (!isNew)
+ assert(SrcRegMap[DstReg] == SrcReg &&
+ "Can't map to two src physical registers!");
+
+ SmallVector<unsigned, 4> VirtRegPairs;
+ bool IsCopy = false;
+ unsigned NewReg = 0;
+ while (MachineInstr *UseMI = findOnlyInterestingUse(DstReg, MBB, MRI,TII,
+ IsCopy, NewReg, IsDstPhys)) {
+ if (IsCopy) {
+ if (!Processed.insert(UseMI))
+ break;
+ }
+
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
+ if (DI != DistanceMap.end())
+        // Earlier in the same MBB; reached via a back edge.
+ break;
+
+ if (IsDstPhys) {
+ VirtRegPairs.push_back(NewReg);
+ break;
+ }
+ bool isNew = SrcRegMap.insert(std::make_pair(NewReg, DstReg)).second;
+ if (!isNew)
+ assert(SrcRegMap[NewReg] == DstReg &&
+ "Can't map to two src physical registers!");
+ VirtRegPairs.push_back(NewReg);
+ DstReg = NewReg;
+ }
+
+ if (!VirtRegPairs.empty()) {
+ unsigned ToReg = VirtRegPairs.back();
+ VirtRegPairs.pop_back();
+ while (!VirtRegPairs.empty()) {
+ unsigned FromReg = VirtRegPairs.back();
+ VirtRegPairs.pop_back();
+ bool isNew = DstRegMap.insert(std::make_pair(FromReg, ToReg)).second;
+ if (!isNew)
+ assert(DstRegMap[FromReg] == ToReg &&
+ "Can't map to two dst physical registers!");
+ ToReg = FromReg;
+ }
+ }
+ }
+
+ Processed.insert(MI);
+}
+
+/// isSafeToDelete - If the specified instruction does not produce any side
+/// effects and all of its defs are dead, then it's safe to delete.
+static bool isSafeToDelete(MachineInstr *MI, unsigned Reg,
+ const TargetInstrInfo *TII,
+ SmallVector<unsigned, 4> &Kills) {
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (TID.mayStore() || TID.isCall())
+ return false;
+ if (TID.isTerminator() || TID.hasUnmodeledSideEffects())
+ return false;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef() && !MO.isDead())
+ return false;
+ if (MO.isUse() && MO.getReg() != Reg && MO.isKill())
+ Kills.push_back(MO.getReg());
+ }
+
+ return true;
+}
+
+/// runOnMachineFunction - Reduce two-address instructions to two operands.
+///
+bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
+ DOUT << "Machine Function\n";
+ const TargetMachine &TM = MF.getTarget();
+ MRI = &MF.getRegInfo();
+ TII = TM.getInstrInfo();
+ TRI = TM.getRegisterInfo();
+ LV = getAnalysisIfAvailable<LiveVariables>();
+
+ bool MadeChange = false;
+
+ DOUT << "********** REWRITING TWO-ADDR INSTRS **********\n";
+ DOUT << "********** Function: " << MF.getFunction()->getName() << '\n';
+
+ // ReMatRegs - Keep track of the registers whose defs are remat'ed.
+ BitVector ReMatRegs;
+ ReMatRegs.resize(MRI->getLastVirtReg()+1);
+
+ SmallPtrSet<MachineInstr*, 8> Processed;
+ for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end();
+ mbbi != mbbe; ++mbbi) {
+ unsigned Dist = 0;
+ DistanceMap.clear();
+ SrcRegMap.clear();
+ DstRegMap.clear();
+ Processed.clear();
+ for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
+ mi != me; ) {
+ MachineBasicBlock::iterator nmi = next(mi);
+ const TargetInstrDesc &TID = mi->getDesc();
+ bool FirstTied = true;
+
+ DistanceMap.insert(std::make_pair(mi, ++Dist));
+
+ ProcessCopy(&*mi, &*mbbi, Processed);
+
+ unsigned NumOps = (mi->getOpcode() == TargetInstrInfo::INLINEASM)
+ ? mi->getNumOperands() : TID.getNumOperands();
+ for (unsigned si = 0; si < NumOps; ++si) {
+ unsigned ti = 0;
+ if (!mi->isRegTiedToDefOperand(si, &ti))
+ continue;
+
+ if (FirstTied) {
+ ++NumTwoAddressInstrs;
+ DOUT << '\t'; DEBUG(mi->print(*cerr.stream(), &TM));
+ }
+
+ FirstTied = false;
+
+ assert(mi->getOperand(si).isReg() && mi->getOperand(si).getReg() &&
+ mi->getOperand(si).isUse() && "two address instruction invalid");
+
+ // If the two operands are the same we just remove the use
+ // and mark the def as def&use; otherwise we have to insert a copy.
+ if (mi->getOperand(ti).getReg() != mi->getOperand(si).getReg()) {
+ // Rewrite:
+ // a = b op c
+ // to:
+ // a = b
+ // a = a op c
+ unsigned regA = mi->getOperand(ti).getReg();
+ unsigned regB = mi->getOperand(si).getReg();
+
+ assert(TargetRegisterInfo::isVirtualRegister(regB) &&
+ "cannot update physical register live information");
+
+#ifndef NDEBUG
+ // First, verify that we don't have a use of a in the instruction (a =
+ // b + a for example) because our transformation will not work. This
+ // should never occur because we are in SSA form.
+ for (unsigned i = 0; i != mi->getNumOperands(); ++i)
+ assert(i == ti ||
+ !mi->getOperand(i).isReg() ||
+ mi->getOperand(i).getReg() != regA);
+#endif
+
+ // If this instruction is not the killing user of B, see if we can
+ // rearrange the code to make it so. Making it the killing user will
+ // allow us to coalesce A and B together, eliminating the copy we are
+ // about to insert.
+ if (!isKilled(*mi, regB, MRI, TII)) {
+ // If regA is dead and the instruction can be deleted, just delete
+ // it so it doesn't clobber regB.
+ SmallVector<unsigned, 4> Kills;
+ if (mi->getOperand(ti).isDead() &&
+ isSafeToDelete(mi, regB, TII, Kills)) {
+ SmallVector<std::pair<std::pair<unsigned, bool>,
+ MachineInstr*>, 4> NewKills;
+ bool ReallySafe = true;
+ // If this instruction kills some virtual registers, we need to
+ // update the kill information. If it's not possible to do so,
+ // then bail out.
+ while (!Kills.empty()) {
+ unsigned Kill = Kills.back();
+ Kills.pop_back();
+ if (TargetRegisterInfo::isPhysicalRegister(Kill)) {
+ ReallySafe = false;
+ break;
+ }
+ MachineInstr *LastKill = FindLastUseInMBB(Kill, &*mbbi, Dist);
+ if (LastKill) {
+ bool isModRef = LastKill->modifiesRegister(Kill);
+ NewKills.push_back(std::make_pair(std::make_pair(Kill,isModRef),
+ LastKill));
+ } else {
+ ReallySafe = false;
+ break;
+ }
+ }
+
+ if (ReallySafe) {
+ if (LV) {
+ while (!NewKills.empty()) {
+ MachineInstr *NewKill = NewKills.back().second;
+ unsigned Kill = NewKills.back().first.first;
+ bool isDead = NewKills.back().first.second;
+ NewKills.pop_back();
+ if (LV->removeVirtualRegisterKilled(Kill, mi)) {
+ if (isDead)
+ LV->addVirtualRegisterDead(Kill, NewKill);
+ else
+ LV->addVirtualRegisterKilled(Kill, NewKill);
+ }
+ }
+ }
+
+ // We're really going to nuke the old inst. If regB was marked
+ // as a kill we need to update its Kills list.
+ if (LV && mi->getOperand(si).isKill())
+ LV->removeVirtualRegisterKilled(regB, mi);
+
+ mbbi->erase(mi); // Nuke the old inst.
+ mi = nmi;
+ ++NumDeletes;
+ break; // Done with this instruction.
+ }
+ }
+
+ // If this instruction is commutative, check to see if C dies. If
+ // so, swap the B and C operands. This makes the live ranges of A
+ // and C joinable.
+ // FIXME: This code also works for A := B op C instructions.
+ if (TID.isCommutable() && mi->getNumOperands() >= 3) {
+ assert(mi->getOperand(3-si).isReg() &&
+ "Not a proper commutative instruction!");
+ unsigned regC = mi->getOperand(3-si).getReg();
+ if (isKilled(*mi, regC, MRI, TII)) {
+ if (CommuteInstruction(mi, mbbi, regB, regC, Dist)) {
+ ++NumCommuted;
+ regB = regC;
+ goto InstructionRearranged;
+ }
+ }
+ }
+
+ // If this instruction is potentially convertible to a true
+ // three-address instruction, try to convert it.
+ if (TID.isConvertibleTo3Addr()) {
+ // FIXME: This assumes there are no more operands which are tied
+ // to another register.
+#ifndef NDEBUG
+ for (unsigned i = si + 1, e = TID.getNumOperands(); i < e; ++i)
+ assert(TID.getOperandConstraint(i, TOI::TIED_TO) == -1);
+#endif
+
+ if (ConvertInstTo3Addr(mi, nmi, mbbi, regB, Dist)) {
+ ++NumConvertedTo3Addr;
+ break; // Done with this instruction.
+ }
+ }
+ }
+
+ // If it's profitable to commute the instruction, do so.
+ if (TID.isCommutable() && mi->getNumOperands() >= 3) {
+ unsigned regC = mi->getOperand(3-si).getReg();
+ if (isProfitableToCommute(regB, regC, mi, mbbi, Dist))
+ if (CommuteInstruction(mi, mbbi, regB, regC, Dist)) {
+ ++NumAggrCommuted;
+ ++NumCommuted;
+ regB = regC;
+ goto InstructionRearranged;
+ }
+ }
+
+ // If it's profitable to convert the 2-address instruction to a
+ // 3-address one, do so.
+ if (TID.isConvertibleTo3Addr() && isProfitableToConv3Addr(regA)) {
+ if (ConvertInstTo3Addr(mi, nmi, mbbi, regB, Dist)) {
+ ++NumConvertedTo3Addr;
+ break; // Done with this instruction.
+ }
+ }
+
+ InstructionRearranged:
+ const TargetRegisterClass* rc = MRI->getRegClass(regB);
+ MachineInstr *DefMI = MRI->getVRegDef(regB);
+ // If it's safe and profitable, remat the definition instead of
+ // copying it.
+ if (DefMI &&
+ DefMI->getDesc().isAsCheapAsAMove() &&
+ DefMI->isSafeToReMat(TII, regB) &&
+ isProfitableToReMat(regB, rc, mi, DefMI, mbbi, Dist)){
+ DEBUG(cerr << "2addr: REMATTING : " << *DefMI << "\n");
+ TII->reMaterialize(*mbbi, mi, regA, DefMI);
+ ReMatRegs.set(regB);
+ ++NumReMats;
+ } else {
+ bool Emitted = TII->copyRegToReg(*mbbi, mi, regA, regB, rc, rc);
+ (void)Emitted;
+ assert(Emitted && "Unable to issue a copy instruction!\n");
+ }
+
+ MachineBasicBlock::iterator prevMI = prior(mi);
+ // Update DistanceMap.
+ DistanceMap.insert(std::make_pair(prevMI, Dist));
+ DistanceMap[mi] = ++Dist;
+
+ // Update live variables for regB.
+ if (LV) {
+ if (LV->removeVirtualRegisterKilled(regB, mi))
+ LV->addVirtualRegisterKilled(regB, prevMI);
+
+ if (LV->removeVirtualRegisterDead(regB, mi))
+ LV->addVirtualRegisterDead(regB, prevMI);
+ }
+
+ DOUT << "\t\tprepend:\t"; DEBUG(prevMI->print(*cerr.stream(), &TM));
+
+ // Replace all occurrences of regB with regA.
+ for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
+ if (mi->getOperand(i).isReg() &&
+ mi->getOperand(i).getReg() == regB)
+ mi->getOperand(i).setReg(regA);
+ }
+ }
+
+ assert(mi->getOperand(ti).isDef() && mi->getOperand(si).isUse());
+ mi->getOperand(ti).setReg(mi->getOperand(si).getReg());
+ MadeChange = true;
+
+ DOUT << "\t\trewrite to:\t"; DEBUG(mi->print(*cerr.stream(), &TM));
+ }
+
+ mi = nmi;
+ }
+ }
+
+ // Some remat'ed instructions are dead.
+ int VReg = ReMatRegs.find_first();
+ while (VReg != -1) {
+ if (MRI->use_empty(VReg)) {
+ MachineInstr *DefMI = MRI->getVRegDef(VReg);
+ DefMI->eraseFromParent();
+ }
+ VReg = ReMatRegs.find_next(VReg);
+ }
+
+ return MadeChange;
+}
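+
+// A minimal before/after sketch of the rewrite performed above (registers and
+// opcodes are illustrative only):
+//   before:  v3 = ADD v1, v2   ; v3 tied to v1
+//   after:   v3 = MOV v1       ; copy issued by copyRegToReg()
+//            v3 = ADD v3, v2   ; tied use rewritten to the def
+// When v1's definition is as cheap as a move and safe to re-materialize, the
+// MOV is replaced by re-issuing the defining instruction via reMaterialize().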
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
new file mode 100644
index 0000000..c3b213c
--- /dev/null
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -0,0 +1,199 @@
+//===-- UnreachableBlockElim.cpp - Remove unreachable blocks for codegen --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is an extremely simple version of the SimplifyCFG pass. Its sole
+// job is to delete LLVM basic blocks that are not reachable from the entry
+// node. To do this, it performs a simple depth first traversal of the CFG,
+// then deletes any unvisited nodes.
+//
+// Note that this pass is really a hack. In particular, the instruction
+// selectors for various targets should just not generate code for unreachable
+// blocks. Until LLVM has a more systematic way of defining instruction
+// selectors, however, we cannot really expect them to handle additional
+// complexity.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Constant.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+using namespace llvm;
+
+namespace {
+ class VISIBILITY_HIDDEN UnreachableBlockElim : public FunctionPass {
+ virtual bool runOnFunction(Function &F);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ UnreachableBlockElim() : FunctionPass(&ID) {}
+ };
+}
+char UnreachableBlockElim::ID = 0;
+static RegisterPass<UnreachableBlockElim>
+X("unreachableblockelim", "Remove unreachable blocks from the CFG");
+
+FunctionPass *llvm::createUnreachableBlockEliminationPass() {
+ return new UnreachableBlockElim();
+}
+
+bool UnreachableBlockElim::runOnFunction(Function &F) {
+ SmallPtrSet<BasicBlock*, 8> Reachable;
+
+ // Mark all reachable blocks.
+ for (df_ext_iterator<Function*, SmallPtrSet<BasicBlock*, 8> > I =
+ df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable); I != E; ++I)
+ /* Mark all reachable blocks */;
+
+ // Loop over all dead blocks, remembering them and deleting all instructions
+ // in them.
+ std::vector<BasicBlock*> DeadBlocks;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ if (!Reachable.count(I)) {
+ BasicBlock *BB = I;
+ DeadBlocks.push_back(BB);
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ PN->replaceAllUsesWith(Constant::getNullValue(PN->getType()));
+ BB->getInstList().pop_front();
+ }
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+ (*SI)->removePredecessor(BB);
+ BB->dropAllReferences();
+ }
+
+ // Actually remove the blocks now.
+ for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i)
+ DeadBlocks[i]->eraseFromParent();
+
+ return DeadBlocks.size();
+}
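+
+// A sketch of the effect on hypothetical IR (not taken from a test case):
+//   entry:
+//     br label %live
+//   dead:                          ; never reached from entry, so unvisited
+//     br label %live
+//   live:
+//     %p = phi i32 [ 0, %entry ], [ 1, %dead ]
+// The pass deletes %dead, and removePredecessor() drops the [ 1, %dead ]
+// entry from the phi in %live.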
+
+
+namespace {
+ class VISIBILITY_HIDDEN UnreachableMachineBlockElim :
+ public MachineFunctionPass {
+ virtual bool runOnMachineFunction(MachineFunction &F);
+ MachineModuleInfo *MMI;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ UnreachableMachineBlockElim() : MachineFunctionPass(&ID) {}
+ };
+}
+char UnreachableMachineBlockElim::ID = 0;
+
+static RegisterPass<UnreachableMachineBlockElim>
+Y("unreachable-mbb-elimination",
+ "Remove unreachable machine basic blocks");
+
+const PassInfo *const llvm::UnreachableMachineBlockElimID = &Y;
+
+bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
+ SmallPtrSet<MachineBasicBlock*, 8> Reachable;
+
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+
+ // Mark all reachable blocks.
+ for (df_ext_iterator<MachineFunction*, SmallPtrSet<MachineBasicBlock*, 8> >
+ I = df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable);
+ I != E; ++I)
+ /* Mark all reachable blocks */;
+
+ // Loop over all dead blocks, remembering them and deleting all instructions
+ // in them.
+ std::vector<MachineBasicBlock*> DeadBlocks;
+ for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ MachineBasicBlock *BB = I;
+
+ // Test for deadness.
+ if (!Reachable.count(BB)) {
+ DeadBlocks.push_back(BB);
+
+ while (BB->succ_begin() != BB->succ_end()) {
+ MachineBasicBlock* succ = *BB->succ_begin();
+
+ MachineBasicBlock::iterator start = succ->begin();
+ while (start != succ->end() &&
+ start->getOpcode() == TargetInstrInfo::PHI) {
+ for (unsigned i = start->getNumOperands() - 1; i >= 2; i-=2)
+ if (start->getOperand(i).isMBB() &&
+ start->getOperand(i).getMBB() == BB) {
+ start->RemoveOperand(i);
+ start->RemoveOperand(i-1);
+ }
+
+ start++;
+ }
+
+ BB->removeSuccessor(BB->succ_begin());
+ }
+ }
+ }
+
+ // Actually remove the blocks now.
+ for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = DeadBlocks[i];
+ // If there are any labels in the basic block, unregister them from
+ // MachineModuleInfo.
+ if (MMI && !MBB->empty()) {
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->end(); I != E; ++I) {
+ if (I->isLabel())
+ // The label ID # is always operand #0, an immediate.
+ MMI->InvalidateLabel(I->getOperand(0).getImm());
+ }
+ }
+ MBB->eraseFromParent();
+ }
+
+ // Cleanup PHI nodes.
+ for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ MachineBasicBlock *BB = I;
+ // Prune unneeded PHI entries.
+ SmallPtrSet<MachineBasicBlock*, 8> preds(BB->pred_begin(),
+ BB->pred_end());
+ MachineBasicBlock::iterator phi = BB->begin();
+ while (phi != BB->end() &&
+ phi->getOpcode() == TargetInstrInfo::PHI) {
+ for (unsigned i = phi->getNumOperands() - 1; i >= 2; i-=2)
+ if (!preds.count(phi->getOperand(i).getMBB())) {
+ phi->RemoveOperand(i);
+ phi->RemoveOperand(i-1);
+ }
+
+ if (phi->getNumOperands() == 3) {
+ unsigned Input = phi->getOperand(1).getReg();
+ unsigned Output = phi->getOperand(0).getReg();
+
+ MachineInstr* temp = phi;
+ ++phi;
+ temp->eraseFromParent();
+
+ if (Input != Output)
+ F.getRegInfo().replaceRegWith(Output, Input);
+
+ continue;
+ }
+
+ ++phi;
+ }
+ }
+
+ F.RenumberBlocks();
+
+ return DeadBlocks.size();
+}
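+
+// Sketch of the phi cleanup above on made-up machine code: once dead
+// predecessors are pruned, a phi reduced to a single incoming value, e.g.
+//   %v1 = PHI %v2, <mbb0>
+// has exactly three operands left (def, value, block), so it is erased and
+// all uses of %v1 are rewritten to %v2 through replaceRegWith().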
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
new file mode 100644
index 0000000..29637b9
--- /dev/null
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -0,0 +1,269 @@
+//===-- llvm/CodeGen/VirtRegMap.cpp - Virtual Register Map ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VirtRegMap class.
+//
+// It also contains implementations of the Spiller interface, which, given a
+// virtual register map and a machine function, eliminates all virtual
+// references by replacing them with physical register references - adding spill
+// code as necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "virtregmap"
+#include "VirtRegMap.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumSpills , "Number of register spills");
+
+//===----------------------------------------------------------------------===//
+// VirtRegMap implementation
+//===----------------------------------------------------------------------===//
+
+char VirtRegMap::ID = 0;
+
+static RegisterPass<VirtRegMap>
+X("virtregmap", "Virtual Register Map");
+
+bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
+ TII = mf.getTarget().getInstrInfo();
+ TRI = mf.getTarget().getRegisterInfo();
+ MF = &mf;
+
+ ReMatId = MAX_STACK_SLOT+1;
+ LowSpillSlot = HighSpillSlot = NO_STACK_SLOT;
+
+ Virt2PhysMap.clear();
+ Virt2StackSlotMap.clear();
+ Virt2ReMatIdMap.clear();
+ Virt2SplitMap.clear();
+ Virt2SplitKillMap.clear();
+ ReMatMap.clear();
+ ImplicitDefed.clear();
+ SpillSlotToUsesMap.clear();
+ MI2VirtMap.clear();
+ SpillPt2VirtMap.clear();
+ RestorePt2VirtMap.clear();
+ EmergencySpillMap.clear();
+ EmergencySpillSlots.clear();
+
+ SpillSlotToUsesMap.resize(8);
+ ImplicitDefed.resize(MF->getRegInfo().getLastVirtReg()+1-
+ TargetRegisterInfo::FirstVirtualRegister);
+
+ allocatableRCRegs.clear();
+ for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+ E = TRI->regclass_end(); I != E; ++I)
+ allocatableRCRegs.insert(std::make_pair(*I,
+ TRI->getAllocatableSet(mf, *I)));
+
+ grow();
+
+ return false;
+}
+
+void VirtRegMap::grow() {
+ unsigned LastVirtReg = MF->getRegInfo().getLastVirtReg();
+ Virt2PhysMap.grow(LastVirtReg);
+ Virt2StackSlotMap.grow(LastVirtReg);
+ Virt2ReMatIdMap.grow(LastVirtReg);
+ Virt2SplitMap.grow(LastVirtReg);
+ Virt2SplitKillMap.grow(LastVirtReg);
+ ReMatMap.grow(LastVirtReg);
+ ImplicitDefed.resize(LastVirtReg-TargetRegisterInfo::FirstVirtualRegister+1);
+}
+
+int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+ "attempt to assign stack slot to already spilled register");
+ const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
+ int SS = MF->getFrameInfo()->CreateStackObject(RC->getSize(),
+ RC->getAlignment());
+ if (LowSpillSlot == NO_STACK_SLOT)
+ LowSpillSlot = SS;
+ if (HighSpillSlot == NO_STACK_SLOT || SS > HighSpillSlot)
+ HighSpillSlot = SS;
+ unsigned Idx = SS-LowSpillSlot;
+ while (Idx >= SpillSlotToUsesMap.size())
+ SpillSlotToUsesMap.resize(SpillSlotToUsesMap.size()*2);
+ Virt2StackSlotMap[virtReg] = SS;
+ ++NumSpills;
+ return SS;
+}
+
+void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+ "attempt to assign stack slot to already spilled register");
+ assert((SS >= 0 ||
+ (SS >= MF->getFrameInfo()->getObjectIndexBegin())) &&
+ "illegal fixed frame index");
+ Virt2StackSlotMap[virtReg] = SS;
+}
+
+int VirtRegMap::assignVirtReMatId(unsigned virtReg) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ assert(Virt2ReMatIdMap[virtReg] == NO_STACK_SLOT &&
+ "attempt to assign re-mat id to already spilled register");
+ Virt2ReMatIdMap[virtReg] = ReMatId;
+ return ReMatId++;
+}
+
+void VirtRegMap::assignVirtReMatId(unsigned virtReg, int id) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ assert(Virt2ReMatIdMap[virtReg] == NO_STACK_SLOT &&
+ "attempt to assign re-mat id to already spilled register");
+ Virt2ReMatIdMap[virtReg] = id;
+}
+
+int VirtRegMap::getEmergencySpillSlot(const TargetRegisterClass *RC) {
+ std::map<const TargetRegisterClass*, int>::iterator I =
+ EmergencySpillSlots.find(RC);
+ if (I != EmergencySpillSlots.end())
+ return I->second;
+ int SS = MF->getFrameInfo()->CreateStackObject(RC->getSize(),
+ RC->getAlignment());
+ if (LowSpillSlot == NO_STACK_SLOT)
+ LowSpillSlot = SS;
+ if (HighSpillSlot == NO_STACK_SLOT || SS > HighSpillSlot)
+ HighSpillSlot = SS;
+ EmergencySpillSlots[RC] = SS;
+ return SS;
+}
+
+void VirtRegMap::addSpillSlotUse(int FI, MachineInstr *MI) {
+ if (!MF->getFrameInfo()->isFixedObjectIndex(FI)) {
+ // If FI < LowSpillSlot, this stack reference was produced by
+ // instruction selection and is not a spill
+ if (FI >= LowSpillSlot) {
+ assert(FI >= 0 && "Spill slot index should not be negative!");
+ assert((unsigned)FI-LowSpillSlot < SpillSlotToUsesMap.size()
+ && "Invalid spill slot");
+ SpillSlotToUsesMap[FI-LowSpillSlot].insert(MI);
+ }
+ }
+}
+
+void VirtRegMap::virtFolded(unsigned VirtReg, MachineInstr *OldMI,
+ MachineInstr *NewMI, ModRef MRInfo) {
+ // Move previous memory references folded to new instruction.
+ MI2VirtMapTy::iterator IP = MI2VirtMap.lower_bound(NewMI);
+ for (MI2VirtMapTy::iterator I = MI2VirtMap.lower_bound(OldMI),
+ E = MI2VirtMap.end(); I != E && I->first == OldMI; ) {
+ MI2VirtMap.insert(IP, std::make_pair(NewMI, I->second));
+ MI2VirtMap.erase(I++);
+ }
+
+ // add new memory reference
+ MI2VirtMap.insert(IP, std::make_pair(NewMI, std::make_pair(VirtReg, MRInfo)));
+}
+
+void VirtRegMap::virtFolded(unsigned VirtReg, MachineInstr *MI, ModRef MRInfo) {
+ MI2VirtMapTy::iterator IP = MI2VirtMap.lower_bound(MI);
+ MI2VirtMap.insert(IP, std::make_pair(MI, std::make_pair(VirtReg, MRInfo)));
+}
+
+void VirtRegMap::RemoveMachineInstrFromMaps(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isFI())
+ continue;
+ int FI = MO.getIndex();
+ if (MF->getFrameInfo()->isFixedObjectIndex(FI))
+ continue;
+ // This stack reference was produced by instruction selection and
+ // is not a spill
+ if (FI < LowSpillSlot)
+ continue;
+ assert((unsigned)FI-LowSpillSlot < SpillSlotToUsesMap.size()
+ && "Invalid spill slot");
+ SpillSlotToUsesMap[FI-LowSpillSlot].erase(MI);
+ }
+ MI2VirtMap.erase(MI);
+ SpillPt2VirtMap.erase(MI);
+ RestorePt2VirtMap.erase(MI);
+ EmergencySpillMap.erase(MI);
+}
+
+/// FindUnusedRegisters - Gather a list of allocatable registers that
+/// have not been allocated to any virtual register.
+bool VirtRegMap::FindUnusedRegisters(const TargetRegisterInfo *TRI,
+ LiveIntervals* LIs) {
+ unsigned NumRegs = TRI->getNumRegs();
+ UnusedRegs.reset();
+ UnusedRegs.resize(NumRegs);
+
+ BitVector Used(NumRegs);
+ for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
+ e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i)
+ if (Virt2PhysMap[i] != (unsigned)VirtRegMap::NO_PHYS_REG)
+ Used.set(Virt2PhysMap[i]);
+
+ BitVector Allocatable = TRI->getAllocatableSet(*MF);
+ bool AnyUnused = false;
+ for (unsigned Reg = 1; Reg < NumRegs; ++Reg) {
+ if (Allocatable[Reg] && !Used[Reg] && !LIs->hasInterval(Reg)) {
+ bool ReallyUnused = true;
+ for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) {
+ if (Used[*AS] || LIs->hasInterval(*AS)) {
+ ReallyUnused = false;
+ break;
+ }
+ }
+ if (ReallyUnused) {
+ AnyUnused = true;
+ UnusedRegs.set(Reg);
+ }
+ }
+ }
+
+ return AnyUnused;
+}
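+
+// Example of the alias check above (x86 names used purely for illustration):
+// EAX counts as unused only if EAX itself and each alias (AX, AH, AL) is
+// neither mapped in Virt2PhysMap nor present in the live intervals.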
+
+void VirtRegMap::print(std::ostream &OS, const Module* M) const {
+ const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
+
+ OS << "********** REGISTER MAP **********\n";
+ for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
+ e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i) {
+ if (Virt2PhysMap[i] != (unsigned)VirtRegMap::NO_PHYS_REG)
+ OS << "[reg" << i << " -> " << TRI->getName(Virt2PhysMap[i])
+ << "]\n";
+ }
+
+ for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
+ e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i)
+ if (Virt2StackSlotMap[i] != VirtRegMap::NO_STACK_SLOT)
+ OS << "[reg" << i << " -> fi#" << Virt2StackSlotMap[i] << "]\n";
+ OS << '\n';
+}
+
+void VirtRegMap::dump() const {
+ print(cerr);
+}
diff --git a/lib/CodeGen/VirtRegMap.h b/lib/CodeGen/VirtRegMap.h
new file mode 100644
index 0000000..507557d
--- /dev/null
+++ b/lib/CodeGen/VirtRegMap.h
@@ -0,0 +1,495 @@
+//===-- llvm/CodeGen/VirtRegMap.h - Virtual Register Map -*- C++ -*--------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a virtual register map. This maps virtual registers to
+// physical registers and virtual registers to stack slots. It is created and
+// updated by a register allocator and then used by a machine code rewriter that
+// adds spill code and rewrites virtual into physical register references.
+//
+//===----------------------------------------------------------------------===//
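+
+// A hypothetical caller-side sketch (the allocator names are assumed; only
+// the VirtRegMap calls are declared in this header):
+//   VirtRegMap &VRM = getAnalysis<VirtRegMap>();
+//   if (PhysReg) // allocation succeeded
+//     VRM.assignVirt2Phys(VirtReg, PhysReg);
+//   else // spill
+//     int SS = VRM.assignVirt2StackSlot(VirtReg);
+// A rewriter then queries VRM.getPhys() / VRM.getStackSlot() while replacing
+// virtual register references.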
+
+#ifndef LLVM_CODEGEN_VIRTREGMAP_H
+#define LLVM_CODEGEN_VIRTREGMAP_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Streams.h"
+#include <map>
+
+namespace llvm {
+ class LiveIntervals;
+ class MachineInstr;
+ class MachineFunction;
+ class TargetInstrInfo;
+ class TargetRegisterInfo;
+
+ class VirtRegMap : public MachineFunctionPass {
+ public:
+ enum {
+ NO_PHYS_REG = 0,
+ NO_STACK_SLOT = (1L << 30)-1,
+ MAX_STACK_SLOT = (1L << 18)-1
+ };
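+
+ // The values above partition a single integer space: real stack slots fall
+ // in [0, MAX_STACK_SLOT] = [0, 262143], rematerialization ids start at
+ // MAX_STACK_SLOT+1 = 262144 (see ReMatId below), and NO_STACK_SLOT =
+ // (1L << 30)-1 is a sentinel safely above both ranges.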
+
+ enum ModRef { isRef = 1, isMod = 2, isModRef = 3 };
+ typedef std::multimap<MachineInstr*,
+ std::pair<unsigned, ModRef> > MI2VirtMapTy;
+
+ private:
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineFunction *MF;
+
+ DenseMap<const TargetRegisterClass*, BitVector> allocatableRCRegs;
+
+ /// Virt2PhysMap - This is a virtual to physical register
+ /// mapping. Each virtual register is required to have an entry in
+ /// it; even spilled virtual registers (the register mapped to a
+ /// spilled register is the temporary used to load it from the
+ /// stack).
+ IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2PhysMap;
+
+ /// Virt2StackSlotMap - This is virtual register to stack slot
+ /// mapping. Each spilled virtual register has an entry in it
+ /// which corresponds to the stack slot this register is spilled
+ /// at.
+ IndexedMap<int, VirtReg2IndexFunctor> Virt2StackSlotMap;
+
+ /// Virt2ReMatIdMap - This is virtual register to rematerialization id
+ /// mapping. Each spilled virtual register that should be remat'd has an
+ /// entry in it which corresponds to the remat id.
+ IndexedMap<int, VirtReg2IndexFunctor> Virt2ReMatIdMap;
+
+ /// Virt2SplitMap - This is virtual register to split virtual register
+ /// mapping.
+ IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2SplitMap;
+
+ /// Virt2SplitKillMap - This maps a split virtual register to its last
+ /// use (kill) index.
+ IndexedMap<unsigned> Virt2SplitKillMap;
+
+ /// ReMatMap - This is virtual register to re-materialized instruction
+ /// mapping. Each virtual register whose definition is going to be
+ /// re-materialized has an entry in it.
+ IndexedMap<MachineInstr*, VirtReg2IndexFunctor> ReMatMap;
+
+ /// MI2VirtMap - This is MachineInstr to virtual register
+ /// mapping. In the case of memory spill code being folded into
+ /// instructions, we need to know which virtual register was
+ /// read/written by this instruction.
+ MI2VirtMapTy MI2VirtMap;
+
+ /// SpillPt2VirtMap - This records the virtual registers which should
+ /// be spilled right after the MachineInstr due to live interval
+ /// splitting.
+ std::map<MachineInstr*, std::vector<std::pair<unsigned,bool> > >
+ SpillPt2VirtMap;
+
+ /// RestorePt2VirtMap - This records the virtual registers which should
+ /// be restored right before the MachineInstr due to live interval
+ /// splitting.
+ std::map<MachineInstr*, std::vector<unsigned> > RestorePt2VirtMap;
+
+ /// EmergencySpillMap - This records the physical registers that should
+ /// be spilled / restored around the MachineInstr since the register
+ /// allocator has run out of registers.
+ std::map<MachineInstr*, std::vector<unsigned> > EmergencySpillMap;
+
+ /// EmergencySpillSlots - This records emergency spill slots used to
+ /// spill physical registers when the register allocator runs out of
+ /// registers. Ideally only one stack slot is used per function per
+ /// register class.
+ std::map<const TargetRegisterClass*, int> EmergencySpillSlots;
+
+ /// ReMatId - Instead of assigning a stack slot to a virtual register that
+ /// is to be rematerialized, a unique id is assigned. This keeps track of
+ /// the highest id used so far. Note, this starts at (1<<18) to avoid
+ /// conflicts with stack slot numbers.
+ int ReMatId;
+
+ /// LowSpillSlot, HighSpillSlot - Lowest and highest spill slot indexes.
+ int LowSpillSlot, HighSpillSlot;
+
+ /// SpillSlotToUsesMap - Records uses for each register spill slot.
+ SmallVector<SmallPtrSet<MachineInstr*, 4>, 8> SpillSlotToUsesMap;
+
+ /// ImplicitDefed - One bit for each virtual register. If set it indicates
+ /// the register is implicitly defined.
+ BitVector ImplicitDefed;
+
+ /// UnusedRegs - A list of physical registers that have not been used.
+ BitVector UnusedRegs;
+
+ VirtRegMap(const VirtRegMap&); // DO NOT IMPLEMENT
+ void operator=(const VirtRegMap&); // DO NOT IMPLEMENT
+
+ public:
+ static char ID;
+ VirtRegMap() : MachineFunctionPass(&ID), Virt2PhysMap(NO_PHYS_REG),
+ Virt2StackSlotMap(NO_STACK_SLOT),
+ Virt2ReMatIdMap(NO_STACK_SLOT), Virt2SplitMap(0),
+ Virt2SplitKillMap(0), ReMatMap(NULL),
+ ReMatId(MAX_STACK_SLOT+1),
+ LowSpillSlot(NO_STACK_SLOT), HighSpillSlot(NO_STACK_SLOT) { }
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ void grow();
+
+ /// @brief returns true if the specified virtual register is
+ /// mapped to a physical register
+ bool hasPhys(unsigned virtReg) const {
+ return getPhys(virtReg) != NO_PHYS_REG;
+ }
+
+ /// @brief returns the physical register mapped to the specified
+ /// virtual register
+ unsigned getPhys(unsigned virtReg) const {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ return Virt2PhysMap[virtReg];
+ }
+
+ /// @brief creates a mapping for the specified virtual register to
+ /// the specified physical register
+ void assignVirt2Phys(unsigned virtReg, unsigned physReg) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg) &&
+ TargetRegisterInfo::isPhysicalRegister(physReg));
+ assert(Virt2PhysMap[virtReg] == NO_PHYS_REG &&
+ "attempt to assign physical register to already mapped "
+ "virtual register");
+ Virt2PhysMap[virtReg] = physReg;
+ }
+
+ /// @brief clears the specified virtual register's physical
+ /// register mapping
+ void clearVirt(unsigned virtReg) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ assert(Virt2PhysMap[virtReg] != NO_PHYS_REG &&
+ "attempt to clear a not assigned virtual register");
+ Virt2PhysMap[virtReg] = NO_PHYS_REG;
+ }
+
+ /// @brief clears all virtual to physical register mappings
+ void clearAllVirt() {
+ Virt2PhysMap.clear();
+ grow();
+ }
+
+ /// @brief records virtReg is a split live interval from SReg.
+ void setIsSplitFromReg(unsigned virtReg, unsigned SReg) {
+ Virt2SplitMap[virtReg] = SReg;
+ }
+
+ /// @brief returns the live interval virtReg is split from.
+ unsigned getPreSplitReg(unsigned virtReg) {
+ return Virt2SplitMap[virtReg];
+ }
+
+ /// @brief returns true if the specified virtual register is not
+ /// mapped to a stack slot or rematerialized.
+ bool isAssignedReg(unsigned virtReg) const {
+ if (getStackSlot(virtReg) == NO_STACK_SLOT &&
+ getReMatId(virtReg) == NO_STACK_SLOT)
+ return true;
+ // A split register can be assigned a physical register as well as a
+ // stack slot or remat id.
+ return (Virt2SplitMap[virtReg] && Virt2PhysMap[virtReg] != NO_PHYS_REG);
+ }
+
+ /// @brief returns the stack slot mapped to the specified virtual
+ /// register
+ int getStackSlot(unsigned virtReg) const {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ return Virt2StackSlotMap[virtReg];
+ }
+
+ /// @brief returns the rematerialization id mapped to the specified virtual
+ /// register
+ int getReMatId(unsigned virtReg) const {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ return Virt2ReMatIdMap[virtReg];
+ }
+
+ /// @brief create a mapping for the specified virtual register to
+ /// the next available stack slot
+ int assignVirt2StackSlot(unsigned virtReg);
+ /// @brief create a mapping for the specified virtual register to
+ /// the specified stack slot
+ void assignVirt2StackSlot(unsigned virtReg, int frameIndex);
+
+ /// @brief assign a unique re-materialization id to the specified
+ /// virtual register.
+ int assignVirtReMatId(unsigned virtReg);
+ /// @brief assign a unique re-materialization id to the specified
+ /// virtual register.
+ void assignVirtReMatId(unsigned virtReg, int id);
+
+ /// @brief returns true if the specified virtual register is being
+ /// re-materialized.
+ bool isReMaterialized(unsigned virtReg) const {
+ return ReMatMap[virtReg] != NULL;
+ }
+
+ /// @brief returns the original machine instruction being re-issued
+ /// to re-materialize the specified virtual register.
+ MachineInstr *getReMaterializedMI(unsigned virtReg) const {
+ return ReMatMap[virtReg];
+ }
+
+ /// @brief records that the specified virtual register will be
+ /// re-materialized and the original instruction which will be re-issued
+ /// for this purpose.
+ void setVirtIsReMaterialized(unsigned virtReg, MachineInstr *def) {
+ ReMatMap[virtReg] = def;
+ }
+
+ /// @brief record the last use (kill) of a split virtual register.
+ void addKillPoint(unsigned virtReg, unsigned index) {
+ Virt2SplitKillMap[virtReg] = index;
+ }
+
+ unsigned getKillPoint(unsigned virtReg) const {
+ return Virt2SplitKillMap[virtReg];
+ }
+
+ /// @brief remove the last use (kill) of a split virtual register.
+ void removeKillPoint(unsigned virtReg) {
+ Virt2SplitKillMap[virtReg] = 0;
+ }
+
+ /// @brief returns true if the specified MachineInstr is a spill point.
+ bool isSpillPt(MachineInstr *Pt) const {
+ return SpillPt2VirtMap.find(Pt) != SpillPt2VirtMap.end();
+ }
+
+ /// @brief returns the virtual registers that should be spilled due to
+ /// splitting right after the specified MachineInstr.
+ std::vector<std::pair<unsigned,bool> > &getSpillPtSpills(MachineInstr *Pt) {
+ return SpillPt2VirtMap[Pt];
+ }
+
+ /// @brief records the specified MachineInstr as a spill point for virtReg.
+ void addSpillPoint(unsigned virtReg, bool isKill, MachineInstr *Pt) {
+ std::map<MachineInstr*, std::vector<std::pair<unsigned,bool> > >::iterator
+ I = SpillPt2VirtMap.find(Pt);
+ if (I != SpillPt2VirtMap.end())
+ I->second.push_back(std::make_pair(virtReg, isKill));
+ else {
+ std::vector<std::pair<unsigned,bool> > Virts;
+ Virts.push_back(std::make_pair(virtReg, isKill));
+ SpillPt2VirtMap.insert(std::make_pair(Pt, Virts));
+ }
+ }
+
+ /// @brief transfer spill point information from one instruction to
+ /// another.
+ void transferSpillPts(MachineInstr *Old, MachineInstr *New) {
+ std::map<MachineInstr*, std::vector<std::pair<unsigned,bool> > >::iterator
+ I = SpillPt2VirtMap.find(Old);
+ if (I == SpillPt2VirtMap.end())
+ return;
+ while (!I->second.empty()) {
+ unsigned virtReg = I->second.back().first;
+ bool isKill = I->second.back().second;
+ I->second.pop_back();
+ addSpillPoint(virtReg, isKill, New);
+ }
+ SpillPt2VirtMap.erase(I);
+ }
+
+ /// @brief returns true if the specified MachineInstr is a restore point.
+ bool isRestorePt(MachineInstr *Pt) const {
+ return RestorePt2VirtMap.find(Pt) != RestorePt2VirtMap.end();
+ }
+
+ /// @brief returns the virtual registers that should be restored due to
+ /// splitting right before the specified MachineInstr.
+ std::vector<unsigned> &getRestorePtRestores(MachineInstr *Pt) {
+ return RestorePt2VirtMap[Pt];
+ }
+
+ /// @brief records the specified MachineInstr as a restore point for virtReg.
+ void addRestorePoint(unsigned virtReg, MachineInstr *Pt) {
+ std::map<MachineInstr*, std::vector<unsigned> >::iterator I =
+ RestorePt2VirtMap.find(Pt);
+ if (I != RestorePt2VirtMap.end())
+ I->second.push_back(virtReg);
+ else {
+ std::vector<unsigned> Virts;
+ Virts.push_back(virtReg);
+ RestorePt2VirtMap.insert(std::make_pair(Pt, Virts));
+ }
+ }
+
+ /// @brief transfer restore point information from one instruction to
+ /// another.
+ void transferRestorePts(MachineInstr *Old, MachineInstr *New) {
+ std::map<MachineInstr*, std::vector<unsigned> >::iterator I =
+ RestorePt2VirtMap.find(Old);
+ if (I == RestorePt2VirtMap.end())
+ return;
+ while (!I->second.empty()) {
+ unsigned virtReg = I->second.back();
+ I->second.pop_back();
+ addRestorePoint(virtReg, New);
+ }
+ RestorePt2VirtMap.erase(I);
+ }
+
+ /// @brief records that the specified physical register must be spilled
+ /// around the specified machine instr.
+ void addEmergencySpill(unsigned PhysReg, MachineInstr *MI) {
+ if (EmergencySpillMap.find(MI) != EmergencySpillMap.end())
+ EmergencySpillMap[MI].push_back(PhysReg);
+ else {
+ std::vector<unsigned> PhysRegs;
+ PhysRegs.push_back(PhysReg);
+ EmergencySpillMap.insert(std::make_pair(MI, PhysRegs));
+ }
+ }
+
+ /// @brief returns true if one or more physical registers must be spilled
+ /// around the specified instruction.
+ bool hasEmergencySpills(MachineInstr *MI) const {
+ return EmergencySpillMap.find(MI) != EmergencySpillMap.end();
+ }
+
+ /// @brief returns the physical registers to be spilled and restored around
+ /// the instruction.
+ std::vector<unsigned> &getEmergencySpills(MachineInstr *MI) {
+ return EmergencySpillMap[MI];
+ }
+
+ /// @brief transfer emergency spill information from one instruction to
+ /// another.
+ void transferEmergencySpills(MachineInstr *Old, MachineInstr *New) {
+ std::map<MachineInstr*,std::vector<unsigned> >::iterator I =
+ EmergencySpillMap.find(Old);
+ if (I == EmergencySpillMap.end())
+ return;
+ while (!I->second.empty()) {
+ unsigned virtReg = I->second.back();
+ I->second.pop_back();
+ addEmergencySpill(virtReg, New);
+ }
+ EmergencySpillMap.erase(I);
+ }
+
+ /// @brief return or create an emergency spill slot for the register class.
+ int getEmergencySpillSlot(const TargetRegisterClass *RC);
+
+ /// @brief Return lowest spill slot index.
+ int getLowSpillSlot() const {
+ return LowSpillSlot;
+ }
+
+ /// @brief Return highest spill slot index.
+ int getHighSpillSlot() const {
+ return HighSpillSlot;
+ }
+
+ /// @brief Records a spill slot use.
+ void addSpillSlotUse(int FrameIndex, MachineInstr *MI);
+
+ /// @brief Returns true if spill slot has been used.
+ bool isSpillSlotUsed(int FrameIndex) const {
+ assert(FrameIndex >= 0 && "Spill slot index should not be negative!");
+ return !SpillSlotToUsesMap[FrameIndex-LowSpillSlot].empty();
+ }
+
+ /// @brief Mark the specified register as being implicitly defined.
+ void setIsImplicitlyDefined(unsigned VirtReg) {
+ ImplicitDefed.set(VirtReg-TargetRegisterInfo::FirstVirtualRegister);
+ }
+
+ /// @brief Returns true if the virtual register is implicitly defined.
+ bool isImplicitlyDefined(unsigned VirtReg) const {
+ return ImplicitDefed[VirtReg-TargetRegisterInfo::FirstVirtualRegister];
+ }
+
+ /// @brief Updates information about the specified virtual register's value
+ /// folded into newMI machine instruction.
+ void virtFolded(unsigned VirtReg, MachineInstr *OldMI, MachineInstr *NewMI,
+ ModRef MRInfo);
+
+ /// @brief Updates information about the specified virtual register's value
+ /// folded into the specified machine instruction.
+ void virtFolded(unsigned VirtReg, MachineInstr *MI, ModRef MRInfo);
+
+ /// @brief returns the virtual registers' values folded in memory
+ /// operands of this instruction
+ std::pair<MI2VirtMapTy::const_iterator, MI2VirtMapTy::const_iterator>
+ getFoldedVirts(MachineInstr* MI) const {
+ return MI2VirtMap.equal_range(MI);
+ }
+
+ /// RemoveMachineInstrFromMaps - MI is being erased, remove it from the
+ /// folded instruction map and the spill, restore, and emergency spill maps.
+ void RemoveMachineInstrFromMaps(MachineInstr *MI);
+
+ /// FindUnusedRegisters - Gather a list of allocatable registers that
+ /// have not been allocated to any virtual register.
+ bool FindUnusedRegisters(const TargetRegisterInfo *TRI,
+ LiveIntervals* LIs);
+
+ /// HasUnusedRegisters - Return true if there are any allocatable registers
+ /// that have not been allocated to any virtual register.
+ bool HasUnusedRegisters() const {
+ return !UnusedRegs.none();
+ }
+
+ /// setRegisterUsed - Remember the physical register is now used.
+ void setRegisterUsed(unsigned Reg) {
+ UnusedRegs.reset(Reg);
+ }
+
+ /// isRegisterUnused - Return true if the physical register has not been
+ /// used.
+ bool isRegisterUnused(unsigned Reg) const {
+ return UnusedRegs[Reg];
+ }
+
+ /// getFirstUnusedRegister - Return the first physical register that has not
+ /// been used.
+ unsigned getFirstUnusedRegister(const TargetRegisterClass *RC) {
+ int Reg = UnusedRegs.find_first();
+ while (Reg != -1) {
+ if (allocatableRCRegs[RC][Reg])
+ return (unsigned)Reg;
+ Reg = UnusedRegs.find_next(Reg);
+ }
+ return 0;
+ }
+
+ void print(std::ostream &OS, const Module* M = 0) const;
+ void print(std::ostream *OS) const { if (OS) print(*OS); }
+ void dump() const;
+ };
+
+ inline std::ostream *operator<<(std::ostream *OS, const VirtRegMap &VRM) {
+ VRM.print(OS);
+ return OS;
+ }
+ inline std::ostream &operator<<(std::ostream &OS, const VirtRegMap &VRM) {
+ VRM.print(OS);
+ return OS;
+ }
+} // End llvm namespace
+
+#endif
diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp
new file mode 100644
index 0000000..b4c8bc1
--- /dev/null
+++ b/lib/CodeGen/VirtRegRewriter.cpp
@@ -0,0 +1,2225 @@
+//===-- llvm/CodeGen/VirtRegRewriter.cpp - Rewriter ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "virtregrewriter"
+#include "VirtRegRewriter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumDSE , "Number of dead stores elided");
+STATISTIC(NumDSS , "Number of dead spill slots removed");
+STATISTIC(NumCommutes, "Number of instructions commuted");
+STATISTIC(NumDRM , "Number of re-materializable defs elided");
+STATISTIC(NumStores , "Number of stores added");
+STATISTIC(NumPSpills , "Number of physical register spills");
+STATISTIC(NumOmitted , "Number of reloads omitted");
+STATISTIC(NumAvoided , "Number of reloads deemed unnecessary");
+STATISTIC(NumCopified, "Number of available reloads turned into copies");
+STATISTIC(NumReMats , "Number of re-materializations");
+STATISTIC(NumLoads , "Number of loads added");
+STATISTIC(NumReused , "Number of values reused");
+STATISTIC(NumDCE , "Number of copies elided");
+STATISTIC(NumSUnfold , "Number of stores unfolded");
+STATISTIC(NumModRefUnfold, "Number of modref unfolded");
+
+namespace {
+ enum RewriterName { simple, local, trivial };
+}
+
+static cl::opt<RewriterName>
+RewriterOpt("rewriter",
+ cl::desc("Rewriter to use: (default: local)"),
+ cl::Prefix,
+ cl::values(clEnumVal(simple, "simple rewriter"),
+ clEnumVal(local, "local rewriter"),
+ clEnumVal(trivial, "trivial rewriter"),
+ clEnumValEnd),
+ cl::init(local));
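+
+// Usage sketch (assumed llc invocation; the driver wiring lives outside this
+// file): the rewriter is selected on the command line, e.g.
+//   llc -rewriter=trivial foo.bc
+// and defaults to the local rewriter via cl::init(local) above.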
+
+VirtRegRewriter::~VirtRegRewriter() {}
+
+
+// ****************************** //
+// Simple Rewriter Implementation //
+// ****************************** //
+
+struct VISIBILITY_HIDDEN SimpleRewriter : public VirtRegRewriter {
+
+ bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM,
+ LiveIntervals* LIs) {
+ DOUT << "********** REWRITE MACHINE CODE **********\n";
+ DOUT << "********** Function: " << MF.getFunction()->getName() << '\n';
+ const TargetMachine &TM = MF.getTarget();
+ const TargetInstrInfo &TII = *TM.getInstrInfo();
+ const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+
+
+ // LoadedRegs - Keep track of which vregs are loaded, so that we only load
+ // each vreg once (in the case where a spilled vreg is used by multiple
+ // operands). This is always smaller than the number of operands to the
+ // current machine instr, so it should be small.
+ std::vector<unsigned> LoadedRegs;
+
+ for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+ MBBI != E; ++MBBI) {
+ DOUT << MBBI->getBasicBlock()->getName() << ":\n";
+ MachineBasicBlock &MBB = *MBBI;
+ for (MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
+ MII != E; ++MII) {
+ MachineInstr &MI = *MII;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.getReg()) {
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned VirtReg = MO.getReg();
+ unsigned SubIdx = MO.getSubReg();
+ unsigned PhysReg = VRM.getPhys(VirtReg);
+ unsigned RReg = SubIdx ? TRI.getSubReg(PhysReg, SubIdx) : PhysReg;
+ if (!VRM.isAssignedReg(VirtReg)) {
+ int StackSlot = VRM.getStackSlot(VirtReg);
+ const TargetRegisterClass* RC =
+ MF.getRegInfo().getRegClass(VirtReg);
+
+ if (MO.isUse() &&
+ std::find(LoadedRegs.begin(), LoadedRegs.end(), VirtReg)
+ == LoadedRegs.end()) {
+ TII.loadRegFromStackSlot(MBB, &MI, PhysReg, StackSlot, RC);
+ MachineInstr *LoadMI = prior(MII);
+ VRM.addSpillSlotUse(StackSlot, LoadMI);
+ LoadedRegs.push_back(VirtReg);
+ ++NumLoads;
+ DOUT << '\t' << *LoadMI;
+ }
+
+ if (MO.isDef()) {
+ TII.storeRegToStackSlot(MBB, next(MII), PhysReg, true,
+ StackSlot, RC);
+ MachineInstr *StoreMI = next(MII);
+ VRM.addSpillSlotUse(StackSlot, StoreMI);
+ ++NumStores;
+ }
+ }
+ MF.getRegInfo().setPhysRegUsed(RReg);
+ MI.getOperand(i).setReg(RReg);
+ MI.getOperand(i).setSubReg(0);
+ } else {
+ MF.getRegInfo().setPhysRegUsed(MO.getReg());
+ }
+ }
+ }
+
+ DOUT << '\t' << MI;
+ LoadedRegs.clear();
+ }
+ }
+ return true;
+ }
+
+};
+
+/// This class is intended for use with the new spilling framework only. It
+/// rewrites vreg def/uses to use the assigned preg, but does not insert any
+/// spill code.
+struct VISIBILITY_HIDDEN TrivialRewriter : public VirtRegRewriter {
+
+ bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM,
+ LiveIntervals* LIs) {
+ DOUT << "********** REWRITE MACHINE CODE **********\n";
+ DOUT << "********** Function: " << MF.getFunction()->getName() << '\n';
+ MachineRegisterInfo *mri = &MF.getRegInfo();
+
+ bool changed = false;
+
+ for (LiveIntervals::iterator liItr = LIs->begin(), liEnd = LIs->end();
+ liItr != liEnd; ++liItr) {
+
+ if (TargetRegisterInfo::isVirtualRegister(liItr->first)) {
+ if (VRM.hasPhys(liItr->first)) {
+ unsigned preg = VRM.getPhys(liItr->first);
+ mri->replaceRegWith(liItr->first, preg);
+ mri->setPhysRegUsed(preg);
+ changed = true;
+ }
+ }
+ else {
+ if (!liItr->second->empty()) {
+ mri->setPhysRegUsed(liItr->first);
+ }
+ }
+ }
+
+ return changed;
+ }
+
+};
+
+// ************************************************************************ //
+
+/// AvailableSpills - As the local rewriter is scanning and rewriting an MBB
+/// from top down, keep track of which spill slot or remat'ed values are
+/// available in each physical register.
+///
+/// Note that not all physregs are created equal here. In particular, some
+/// physregs are reloads that we are allowed to clobber or ignore at any time.
+/// Other physregs hold values that the register-allocated program is using
+/// and that we must not CHANGE, though we may read them. We keep track of
+/// this on a per-stack-slot / remat id basis as the low bit in the value of
+/// the SpillSlotsOrReMatsAvailable entries. The predicates canClobberPhysReg()
+/// and canClobberPhysRegForSS() check this bit; addAvailable() sets it when
+/// its CanClobber argument is true.
+class VISIBILITY_HIDDEN AvailableSpills {
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+
+ // SpillSlotsOrReMatsAvailable - This map keeps track of all of the spilled
+ // or remat'ed virtual register values that are still available, due to
+ // being loaded or stored to, but not invalidated yet.
+ std::map<int, unsigned> SpillSlotsOrReMatsAvailable;
+
+ // PhysRegsAvailable - This is the inverse of SpillSlotsOrReMatsAvailable,
+ // indicating which stack slot values are currently held by a physreg. This
+ // is used to invalidate entries in SpillSlotsOrReMatsAvailable when a
+ // physreg is modified.
+ std::multimap<unsigned, int> PhysRegsAvailable;
+
+ void disallowClobberPhysRegOnly(unsigned PhysReg);
+
+ void ClobberPhysRegOnly(unsigned PhysReg);
+public:
+ AvailableSpills(const TargetRegisterInfo *tri, const TargetInstrInfo *tii)
+ : TRI(tri), TII(tii) {
+ }
+
+ /// clear - Reset the state.
+ void clear() {
+ SpillSlotsOrReMatsAvailable.clear();
+ PhysRegsAvailable.clear();
+ }
+
+ const TargetRegisterInfo *getRegInfo() const { return TRI; }
+
+ /// getSpillSlotOrReMatPhysReg - If the specified stack slot or remat is
+ /// available in a physical register, return that PhysReg, otherwise
+ /// return 0.
+ unsigned getSpillSlotOrReMatPhysReg(int Slot) const {
+ std::map<int, unsigned>::const_iterator I =
+ SpillSlotsOrReMatsAvailable.find(Slot);
+ if (I != SpillSlotsOrReMatsAvailable.end()) {
+ return I->second >> 1; // Remove the CanClobber bit.
+ }
+ return 0;
+ }
+
+ /// addAvailable - Mark that the specified stack slot / remat is available
+ /// in the specified physreg. If CanClobber is true, the physreg can be
+ /// modified at any time without changing the semantics of the program.
+ void addAvailable(int SlotOrReMat, unsigned Reg, bool CanClobber = true) {
+ // If this stack slot is thought to be available in some other physreg,
+ // remove its record.
+ ModifyStackSlotOrReMat(SlotOrReMat);
+
+ PhysRegsAvailable.insert(std::make_pair(Reg, SlotOrReMat));
+ SpillSlotsOrReMatsAvailable[SlotOrReMat]= (Reg << 1) |
+ (unsigned)CanClobber;
+
+ if (SlotOrReMat > VirtRegMap::MAX_STACK_SLOT)
+ DOUT << "Remembering RM#" << SlotOrReMat-VirtRegMap::MAX_STACK_SLOT-1;
+ else
+ DOUT << "Remembering SS#" << SlotOrReMat;
+ DOUT << " in physreg " << TRI->getName(Reg) << "\n";
+ }
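+
+ // Worked example of the encoding (numbers assumed): addAvailable(3, 5, true)
+ // stores SpillSlotsOrReMatsAvailable[3] = (5 << 1) | 1 == 11;
+ // getSpillSlotOrReMatPhysReg(3) recovers the physreg as 11 >> 1 == 5, and
+ // canClobberPhysRegForSS(3) reads the low bit: 11 & 1 == 1.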
+
+ /// canClobberPhysRegForSS - Return true if the spiller is allowed to change
+ /// the value of the specified stackslot register if it desires. The
+ /// specified stack slot must be available in a physreg for this query to
+ /// make sense.
+ bool canClobberPhysRegForSS(int SlotOrReMat) const {
+ assert(SpillSlotsOrReMatsAvailable.count(SlotOrReMat) &&
+ "Value not available!");
+ return SpillSlotsOrReMatsAvailable.find(SlotOrReMat)->second & 1;
+ }
+
+ /// canClobberPhysReg - Return true if the spiller is allowed to clobber the
+ /// physical register where values for some stack slot(s) might be
+ /// available.
+ bool canClobberPhysReg(unsigned PhysReg) const {
+ std::multimap<unsigned, int>::const_iterator I =
+ PhysRegsAvailable.lower_bound(PhysReg);
+ while (I != PhysRegsAvailable.end() && I->first == PhysReg) {
+ int SlotOrReMat = I->second;
+ I++;
+ if (!canClobberPhysRegForSS(SlotOrReMat))
+ return false;
+ }
+ return true;
+ }
+
+ /// disallowClobberPhysReg - Unset the CanClobber bit of the specified
+ /// stackslot register. The register is still available but is no longer
+ /// allowed to be modified.
+ void disallowClobberPhysReg(unsigned PhysReg);
+
+ /// ClobberPhysReg - This is called when the specified physreg changes
+ /// value. We use this to invalidate any info about stuff that lives in
+ /// it and any of its aliases.
+ void ClobberPhysReg(unsigned PhysReg);
+
+ /// ModifyStackSlotOrReMat - This method is called when the value in a stack
+ /// slot changes. This removes information about which register the
+ /// previous value for this slot lives in (as the previous value is dead
+ /// now).
+ void ModifyStackSlotOrReMat(int SlotOrReMat);
+
+ /// AddAvailableRegsToLiveIn - Availability information is carried into the
+ /// specified MBB. Add available physical registers as potential live-ins.
+ /// If they are reused in the MBB, they will be added to the live-in set;
+ /// this keeps the register scavenger and the post-allocation scheduler
+ /// consistent.
+ void AddAvailableRegsToLiveIn(MachineBasicBlock &MBB, BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps);
+};
+
+// ************************************************************************ //
+
+// ReusedOp - For each reused operand, we keep track of a bit of information,
+// in case we need to roll back upon processing a new operand. See comments
+// below.
+struct ReusedOp {
+ // The MachineInstr operand that reused an available value.
+ unsigned Operand;
+
+ // StackSlotOrReMat - The spill slot or remat id of the value being reused.
+ unsigned StackSlotOrReMat;
+
+ // PhysRegReused - The physical register the value was available in.
+ unsigned PhysRegReused;
+
+ // AssignedPhysReg - The physreg that was assigned for use by the reload.
+ unsigned AssignedPhysReg;
+
+ // VirtReg - The virtual register itself.
+ unsigned VirtReg;
+
+ ReusedOp(unsigned o, unsigned ss, unsigned prr, unsigned apr,
+ unsigned vreg)
+ : Operand(o), StackSlotOrReMat(ss), PhysRegReused(prr),
+ AssignedPhysReg(apr), VirtReg(vreg) {}
+};
+
+/// ReuseInfo - This maintains a collection of ReusedOps for each operand that
+/// is reused instead of reloaded.
+class VISIBILITY_HIDDEN ReuseInfo {
+ MachineInstr &MI;
+ std::vector<ReusedOp> Reuses;
+ BitVector PhysRegsClobbered;
+public:
+ ReuseInfo(MachineInstr &mi, const TargetRegisterInfo *tri) : MI(mi) {
+ PhysRegsClobbered.resize(tri->getNumRegs());
+ }
+
+ bool hasReuses() const {
+ return !Reuses.empty();
+ }
+
+ /// addReuse - If we choose to reuse a virtual register that is already
+ /// available instead of reloading it, remember that we did so.
+ void addReuse(unsigned OpNo, unsigned StackSlotOrReMat,
+ unsigned PhysRegReused, unsigned AssignedPhysReg,
+ unsigned VirtReg) {
+ // If the reload is to the assigned register anyway, no undo will be
+ // required.
+ if (PhysRegReused == AssignedPhysReg) return;
+
+ // Otherwise, remember this.
+ Reuses.push_back(ReusedOp(OpNo, StackSlotOrReMat, PhysRegReused,
+ AssignedPhysReg, VirtReg));
+ }
+
+ void markClobbered(unsigned PhysReg) {
+ PhysRegsClobbered.set(PhysReg);
+ }
+
+ bool isClobbered(unsigned PhysReg) const {
+ return PhysRegsClobbered.test(PhysReg);
+ }
+
+ /// GetRegForReload - We are about to emit a reload into PhysReg. If there
+ /// is some other operand that is using the specified register, either pick
+ /// a new register to use, or evict the previous reload and use this reg.
+ unsigned GetRegForReload(unsigned PhysReg, MachineInstr *MI,
+ AvailableSpills &Spills,
+ std::vector<MachineInstr*> &MaybeDeadStores,
+ SmallSet<unsigned, 8> &Rejected,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM);
+
+ /// GetRegForReload - Helper for the above GetRegForReload(). Add a
+ /// 'Rejected' set to remember which registers have been considered and
+ /// rejected for the reload. This avoids infinite looping in cases like
+ /// this:
+ /// t1 := op t2, t3
+ /// t2 <- assigned r0 for use by the reload but ended up reuse r1
+ /// t3 <- assigned r1 for use by the reload but ended up reuse r0
+ /// t1 <- desires r1
+ /// sees r1 is taken by t2, tries t2's reload register r0
+ /// sees r0 is taken by t3, tries t3's reload register r1
+ /// sees r1 is taken by t2, tries t2's reload register r0 ...
+ unsigned GetRegForReload(unsigned PhysReg, MachineInstr *MI,
+ AvailableSpills &Spills,
+ std::vector<MachineInstr*> &MaybeDeadStores,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM) {
+ SmallSet<unsigned, 8> Rejected;
+ return GetRegForReload(PhysReg, MI, Spills, MaybeDeadStores, Rejected,
+ RegKills, KillOps, VRM);
+ }
+};
+
+
+// ****************** //
+// Utility Functions //
+// ****************** //
+
+/// findSinglePredSuccessor - Return via reference a vector of machine basic
+/// blocks each of which is a successor of the specified BB and has no other
+/// predecessor.
+static void findSinglePredSuccessor(MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineBasicBlock *> &Succs) {
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock *SuccMBB = *SI;
+ if (SuccMBB->pred_size() == 1)
+ Succs.push_back(SuccMBB);
+ }
+}
+
+/// InvalidateKill - Invalidate register kill information for a specific
+/// register. This also unsets the kill marker on the last kill operand.
+static void InvalidateKill(unsigned Reg,
+ const TargetRegisterInfo* TRI,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps) {
+ if (RegKills[Reg]) {
+ KillOps[Reg]->setIsKill(false);
+ KillOps[Reg] = NULL;
+ RegKills.reset(Reg);
+ for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) {
+ if (RegKills[*SR]) {
+ KillOps[*SR]->setIsKill(false);
+ KillOps[*SR] = NULL;
+ RegKills.reset(*SR);
+ }
+ }
+ }
+}
+
+/// InvalidateKills - MI is going to be deleted. If any of its operands are
+/// marked kill, then invalidate the information.
+static void InvalidateKills(MachineInstr &MI,
+ const TargetRegisterInfo* TRI,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ SmallVector<unsigned, 2> *KillRegs = NULL) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || !MO.isKill())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ if (KillRegs)
+ KillRegs->push_back(Reg);
+ assert(Reg < KillOps.size());
+ if (KillOps[Reg] == &MO) {
+ KillOps[Reg] = NULL;
+ RegKills.reset(Reg);
+ for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) {
+ if (RegKills[*SR]) {
+ KillOps[*SR] = NULL;
+ RegKills.reset(*SR);
+ }
+ }
+ }
+ }
+}
+
+/// InvalidateRegDef - If the def operand of the specified def MI is now dead
+/// (since its spill instruction is removed), mark it isDead. Also checks if
+/// the def MI has other definition operands that are not dead, returning that
+/// by reference in HasLiveDef.
+static bool InvalidateRegDef(MachineBasicBlock::iterator I,
+ MachineInstr &NewDef, unsigned Reg,
+ bool &HasLiveDef) {
+ // Due to remat, it's possible this reg isn't being reused. That is,
+ // the def of this reg (by prev MI) is now dead.
+ MachineInstr *DefMI = I;
+ MachineOperand *DefOp = NULL;
+ for (unsigned i = 0, e = DefMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = DefMI->getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ if (MO.getReg() == Reg)
+ DefOp = &MO;
+ else if (!MO.isDead())
+ HasLiveDef = true;
+ }
+ }
+ if (!DefOp)
+ return false;
+
+ bool FoundUse = false, Done = false;
+ MachineBasicBlock::iterator E = &NewDef;
+ ++I; ++E;
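+ // Scan the instructions after the old def up to and including NewDef for a
+ // use of Reg; stop at the first instruction that mentions it at all.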
+ for (; !Done && I != E; ++I) {
+ MachineInstr *NMI = I;
+ for (unsigned j = 0, ee = NMI->getNumOperands(); j != ee; ++j) {
+ MachineOperand &MO = NMI->getOperand(j);
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+ if (MO.isUse())
+ FoundUse = true;
+ Done = true; // Stop after scanning all the operands of this MI.
+ }
+ }
+ if (!FoundUse) {
+ // Def is dead!
+ DefOp->setIsDead();
+ return true;
+ }
+ return false;
+}
+
+/// UpdateKills - Track and update kill info. If an MI reads a register that
+/// is marked kill, then it must be due to register reuse. Transfer the kill
+/// info over.
+static void UpdateKills(MachineInstr &MI, const TargetRegisterInfo* TRI,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ if (RegKills[Reg] && KillOps[Reg]->getParent() != &MI) {
+ // That can't be right. Register is killed but not re-defined and it's
+ // being reused. Let's fix that.
+ KillOps[Reg]->setIsKill(false);
+ KillOps[Reg] = NULL;
+ RegKills.reset(Reg);
+ if (!MI.isRegTiedToDefOperand(i))
+ // Unless it's a two-address operand, this is the new kill.
+ MO.setIsKill();
+ }
+ if (MO.isKill()) {
+ RegKills.set(Reg);
+ KillOps[Reg] = &MO;
+ for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) {
+ RegKills.set(*SR);
+ KillOps[*SR] = &MO;
+ }
+ }
+ }
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ RegKills.reset(Reg);
+ KillOps[Reg] = NULL;
+ // It also defines (or partially defines) aliases.
+ for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) {
+ RegKills.reset(*SR);
+ KillOps[*SR] = NULL;
+ }
+ }
+}
+
+/// ReMaterialize - Re-materialize definition for Reg targeting DestReg.
+///
+static void ReMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MII,
+ unsigned DestReg, unsigned Reg,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ VirtRegMap &VRM) {
+ TII->reMaterialize(MBB, MII, DestReg, VRM.getReMaterializedMI(Reg));
+ MachineInstr *NewMI = prior(MII);
+ for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = NewMI->getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ unsigned VirtReg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(VirtReg))
+ continue;
+ assert(MO.isUse());
+ unsigned SubIdx = MO.getSubReg();
+ unsigned Phys = VRM.getPhys(VirtReg);
+ assert(Phys);
+ unsigned RReg = SubIdx ? TRI->getSubReg(Phys, SubIdx) : Phys;
+ MO.setReg(RReg);
+ MO.setSubReg(0);
+ }
+ ++NumReMats;
+}
+
+/// findSuperReg - Find SubReg's super-register in the given register class,
+/// i.e. the register whose SubIdx sub-register is SubReg.
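+/// For example (illustrative), on x86 asking for the GR32 super-register
+/// whose low-byte sub-register is AL would return EAX.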
+static unsigned findSuperReg(const TargetRegisterClass *RC, unsigned SubReg,
+ unsigned SubIdx, const TargetRegisterInfo *TRI) {
+ for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+ I != E; ++I) {
+ unsigned Reg = *I;
+ if (TRI->getSubReg(Reg, SubIdx) == SubReg)
+ return Reg;
+ }
+ return 0;
+}
+
+// ******************************** //
+// Available Spills Implementation //
+// ******************************** //
+
+/// disallowClobberPhysRegOnly - Unset the CanClobber bit of the specified
+/// stackslot register. The register is still available but is no longer
+/// allowed to be modified.
+void AvailableSpills::disallowClobberPhysRegOnly(unsigned PhysReg) {
+ std::multimap<unsigned, int>::iterator I =
+ PhysRegsAvailable.lower_bound(PhysReg);
+ while (I != PhysRegsAvailable.end() && I->first == PhysReg) {
+ int SlotOrReMat = I->second;
+ I++;
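+ // SpillSlotsOrReMatsAvailable maps a slot (or remat id) to
+ // (PhysReg << 1) | CanClobber; clearing bit 0 below revokes the clobber
+ // permission while keeping the value available.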
+ assert((SpillSlotsOrReMatsAvailable[SlotOrReMat] >> 1) == PhysReg &&
+ "Bidirectional map mismatch!");
+ SpillSlotsOrReMatsAvailable[SlotOrReMat] &= ~1;
+ DOUT << "PhysReg " << TRI->getName(PhysReg)
+ << " copied, it is available for use but can no longer be modified\n";
+ }
+}
+
+/// disallowClobberPhysReg - Unset the CanClobber bit of the specified
+/// stackslot register and its aliases. The register and its aliases may
+/// still be available but are no longer allowed to be modified.
+void AvailableSpills::disallowClobberPhysReg(unsigned PhysReg) {
+ for (const unsigned *AS = TRI->getAliasSet(PhysReg); *AS; ++AS)
+ disallowClobberPhysRegOnly(*AS);
+ disallowClobberPhysRegOnly(PhysReg);
+}
+
+/// ClobberPhysRegOnly - This is called when the specified physreg changes
+/// value. We use this to invalidate any info about stuff we think lives in it.
+void AvailableSpills::ClobberPhysRegOnly(unsigned PhysReg) {
+ std::multimap<unsigned, int>::iterator I =
+ PhysRegsAvailable.lower_bound(PhysReg);
+ while (I != PhysRegsAvailable.end() && I->first == PhysReg) {
+ int SlotOrReMat = I->second;
+ PhysRegsAvailable.erase(I++);
+ assert((SpillSlotsOrReMatsAvailable[SlotOrReMat] >> 1) == PhysReg &&
+ "Bidirectional map mismatch!");
+ SpillSlotsOrReMatsAvailable.erase(SlotOrReMat);
+ DOUT << "PhysReg " << TRI->getName(PhysReg)
+ << " clobbered, invalidating ";
+ if (SlotOrReMat > VirtRegMap::MAX_STACK_SLOT)
+ DOUT << "RM#" << SlotOrReMat-VirtRegMap::MAX_STACK_SLOT-1 << "\n";
+ else
+ DOUT << "SS#" << SlotOrReMat << "\n";
+ }
+}
+
+/// ClobberPhysReg - This is called when the specified physreg changes
+/// value. We use this to invalidate any info about stuff we think lives in
+/// it and any of its aliases.
+void AvailableSpills::ClobberPhysReg(unsigned PhysReg) {
+ for (const unsigned *AS = TRI->getAliasSet(PhysReg); *AS; ++AS)
+ ClobberPhysRegOnly(*AS);
+ ClobberPhysRegOnly(PhysReg);
+}
+
+/// AddAvailableRegsToLiveIn - Availability information is being kept as we
+/// enter the specified MBB. Add available physical registers as potential
+/// live-ins. If they are reused in the MBB, they will be added to the
+/// live-in set so the register scavenger and post-allocation scheduler see
+/// correct liveness information.
+void AvailableSpills::AddAvailableRegsToLiveIn(MachineBasicBlock &MBB,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps) {
+ std::set<unsigned> NotAvailable;
+ for (std::multimap<unsigned, int>::iterator
+ I = PhysRegsAvailable.begin(), E = PhysRegsAvailable.end();
+ I != E; ++I) {
+ unsigned Reg = I->first;
+ const TargetRegisterClass* RC = TRI->getPhysicalRegisterRegClass(Reg);
+ // FIXME: A temporary workaround. We can't reuse an available value if it's
+ // not safe to move defs of the virtual register's class, e.g. the
+ // X86::RFP* register classes. Do not add it as a live-in.
+ if (!TII->isSafeToMoveRegClassDefs(RC))
+ // This is no longer available.
+ NotAvailable.insert(Reg);
+ else {
+ MBB.addLiveIn(Reg);
+ InvalidateKill(Reg, TRI, RegKills, KillOps);
+ }
+
+ // Skip over the same register.
+ std::multimap<unsigned, int>::iterator NI = next(I);
+ while (NI != E && NI->first == Reg) {
+ ++I;
+ ++NI;
+ }
+ }
+
+ for (std::set<unsigned>::iterator I = NotAvailable.begin(),
+ E = NotAvailable.end(); I != E; ++I) {
+ ClobberPhysReg(*I);
+ for (const unsigned *SubRegs = TRI->getSubRegisters(*I);
+ *SubRegs; ++SubRegs)
+ ClobberPhysReg(*SubRegs);
+ }
+}
+
+/// ModifyStackSlotOrReMat - This method is called when the value in a stack
+/// slot changes. This removes information about which register the previous
+/// value for this slot lives in (as the previous value is dead now).
+void AvailableSpills::ModifyStackSlotOrReMat(int SlotOrReMat) {
+ std::map<int, unsigned>::iterator It =
+ SpillSlotsOrReMatsAvailable.find(SlotOrReMat);
+ if (It == SpillSlotsOrReMatsAvailable.end()) return;
+ unsigned Reg = It->second >> 1;
+ SpillSlotsOrReMatsAvailable.erase(It);
+
+ // This register may hold the value of multiple stack slots, only remove this
+ // stack slot from the set of values the register contains.
+ std::multimap<unsigned, int>::iterator I = PhysRegsAvailable.lower_bound(Reg);
+ for (; ; ++I) {
+ assert(I != PhysRegsAvailable.end() && I->first == Reg &&
+ "Map inverse broken!");
+ if (I->second == SlotOrReMat) break;
+ }
+ PhysRegsAvailable.erase(I);
+}
+
+// ************************** //
+// Reuse Info Implementation //
+// ************************** //
+
+/// GetRegForReload - We are about to emit a reload into PhysReg. If there
+/// is some other operand that is using the specified register, either pick
+/// a new register to use, or evict the previous reload and use this reg.
+unsigned ReuseInfo::GetRegForReload(unsigned PhysReg, MachineInstr *MI,
+ AvailableSpills &Spills,
+ std::vector<MachineInstr*> &MaybeDeadStores,
+ SmallSet<unsigned, 8> &Rejected,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM) {
+ const TargetInstrInfo* TII = MI->getParent()->getParent()->getTarget()
+ .getInstrInfo();
+
+ if (Reuses.empty()) return PhysReg; // This is most often empty.
+
+ for (unsigned ro = 0, e = Reuses.size(); ro != e; ++ro) {
+ ReusedOp &Op = Reuses[ro];
+ // If we find some other reuse that was supposed to use this register
+ // exactly for its reload, we can change this reload to use ITS reload
+ // register. That is, unless its reload register has already been
+ // considered and subsequently rejected because it has also been reused
+ // by another operand.
+ if (Op.PhysRegReused == PhysReg &&
+ Rejected.count(Op.AssignedPhysReg) == 0) {
+ // Yup, use the reload register that we didn't use before.
+ unsigned NewReg = Op.AssignedPhysReg;
+ Rejected.insert(PhysReg);
+ return GetRegForReload(NewReg, MI, Spills, MaybeDeadStores, Rejected,
+ RegKills, KillOps, VRM);
+ } else {
+ // Otherwise, we might also have a problem if a previously reused
+ // value aliases the new register. If so, codegen the previous reload
+ // and use this one.
+ unsigned PRRU = Op.PhysRegReused;
+ const TargetRegisterInfo *TRI = Spills.getRegInfo();
+ if (TRI->areAliases(PRRU, PhysReg)) {
+ // Okay, we found out that an alias of a reused register
+ // was used. This isn't good because it means we have
+ // to undo a previous reuse.
+ MachineBasicBlock *MBB = MI->getParent();
+ const TargetRegisterClass *AliasRC =
+ MBB->getParent()->getRegInfo().getRegClass(Op.VirtReg);
+
+ // Copy Op out of the vector and remove it, we're going to insert an
+ // explicit load for it.
+ ReusedOp NewOp = Op;
+ Reuses.erase(Reuses.begin()+ro);
+
+ // Ok, we're going to try to reload into the assigned physreg from the
+ // slot that we were supposed to use in the first place. However, that
+ // register could hold a reuse. Check to see if it conflicts or
+ // would prefer us to use a different register.
+ unsigned NewPhysReg = GetRegForReload(NewOp.AssignedPhysReg,
+ MI, Spills, MaybeDeadStores,
+ Rejected, RegKills, KillOps, VRM);
+
+ MachineBasicBlock::iterator MII = MI;
+ if (NewOp.StackSlotOrReMat > VirtRegMap::MAX_STACK_SLOT) {
+ ReMaterialize(*MBB, MII, NewPhysReg, NewOp.VirtReg, TII, TRI,VRM);
+ } else {
+ TII->loadRegFromStackSlot(*MBB, MII, NewPhysReg,
+ NewOp.StackSlotOrReMat, AliasRC);
+ MachineInstr *LoadMI = prior(MII);
+ VRM.addSpillSlotUse(NewOp.StackSlotOrReMat, LoadMI);
+ // Any stores to this stack slot are not dead anymore.
+ MaybeDeadStores[NewOp.StackSlotOrReMat] = NULL;
+ ++NumLoads;
+ }
+ Spills.ClobberPhysReg(NewPhysReg);
+ Spills.ClobberPhysReg(NewOp.PhysRegReused);
+
+ unsigned SubIdx = MI->getOperand(NewOp.Operand).getSubReg();
+ unsigned RReg = SubIdx ? TRI->getSubReg(NewPhysReg, SubIdx) : NewPhysReg;
+ MI->getOperand(NewOp.Operand).setReg(RReg);
+ MI->getOperand(NewOp.Operand).setSubReg(0);
+
+ Spills.addAvailable(NewOp.StackSlotOrReMat, NewPhysReg);
+ --MII;
+ UpdateKills(*MII, TRI, RegKills, KillOps);
+ DOUT << '\t' << *MII;
+
+ DOUT << "Reuse undone!\n";
+ --NumReused;
+
+ // Finally, PhysReg is now available, go ahead and use it.
+ return PhysReg;
+ }
+ }
+ }
+ return PhysReg;
+}
+
+// ************************************************************************ //
+
+/// FoldsStackSlotModRef - Return true if the specified MI folds the specified
+/// stack slot mod/ref. It also checks if it's possible to unfold the
+/// instruction by having it define a specified physical register instead.
+static bool FoldsStackSlotModRef(MachineInstr &MI, int SS, unsigned PhysReg,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ VirtRegMap &VRM) {
+ if (VRM.hasEmergencySpills(&MI) || VRM.isSpillPt(&MI))
+ return false;
+
+ bool Found = false;
+ VirtRegMap::MI2VirtMapTy::const_iterator I, End;
+ for (tie(I, End) = VRM.getFoldedVirts(&MI); I != End; ++I) {
+ unsigned VirtReg = I->second.first;
+ VirtRegMap::ModRef MR = I->second.second;
+ if (MR & VirtRegMap::isModRef)
+ if (VRM.getStackSlot(VirtReg) == SS) {
+ Found = TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(), true, true) != 0;
+ break;
+ }
+ }
+ if (!Found)
+ return false;
+
+ // Does the instruction use a register that overlaps the scratch register?
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!VRM.hasPhys(Reg))
+ continue;
+ Reg = VRM.getPhys(Reg);
+ }
+ if (TRI->regsOverlap(PhysReg, Reg))
+ return false;
+ }
+ return true;
+}
+
+/// FindFreeRegister - Find a free register of a given register class by looking
+/// at (at most) the last two machine instructions.
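+/// A register counts as free here if one of those instructions killed it and
+/// nothing has defined or used it since, e.g. (illustrative):
+///   xorq %r12<kill>, %r13   ; %r12 dies here
+///   addq %rax, -184(%rbp)   ; %r12 neither used nor redefined
+/// so %r12 may be handed out as a scratch register.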
+static unsigned FindFreeRegister(MachineBasicBlock::iterator MII,
+ MachineBasicBlock &MBB,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ BitVector &AllocatableRegs) {
+ BitVector Defs(TRI->getNumRegs());
+ BitVector Uses(TRI->getNumRegs());
+ SmallVector<unsigned, 4> LocalUses;
+ SmallVector<unsigned, 4> Kills;
+
+ // Take a look at 2 instructions at most.
+ for (unsigned Count = 0; Count < 2; ++Count) {
+ if (MII == MBB.begin())
+ break;
+ MachineInstr *PrevMI = prior(MII);
+ for (unsigned i = 0, e = PrevMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = PrevMI->getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ unsigned Reg = MO.getReg();
+ if (MO.isDef()) {
+ Defs.set(Reg);
+ for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+ Defs.set(*AS);
+ } else {
+ LocalUses.push_back(Reg);
+ if (MO.isKill() && AllocatableRegs[Reg])
+ Kills.push_back(Reg);
+ }
+ }
+
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
+ unsigned Kill = Kills[i];
+ if (!Defs[Kill] && !Uses[Kill] &&
+ TRI->getPhysicalRegisterRegClass(Kill) == RC)
+ return Kill;
+ }
+ for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
+ unsigned Reg = LocalUses[i];
+ Uses.set(Reg);
+ for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+ Uses.set(*AS);
+ }
+
+ MII = PrevMI;
+ }
+
+ return 0;
+}
+
+static
+void AssignPhysToVirtReg(MachineInstr *MI, unsigned VirtReg, unsigned PhysReg) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == VirtReg)
+ MO.setReg(PhysReg);
+ }
+}
+
+namespace {
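+ // Orders (instruction, distance) pairs by ascending distance; callers pop
+ // from the back of a sorted vector to visit the latest reference first.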
+ struct RefSorter {
+ bool operator()(const std::pair<MachineInstr*, int> &A,
+ const std::pair<MachineInstr*, int> &B) {
+ return A.second < B.second;
+ }
+ };
+}
+
+// ***************************** //
+// Local Spiller Implementation //
+// ***************************** //
+
+class VISIBILITY_HIDDEN LocalRewriter : public VirtRegRewriter {
+ MachineRegisterInfo *RegInfo;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ BitVector AllocatableRegs;
+ DenseMap<MachineInstr*, unsigned> DistanceMap;
+public:
+
+ bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM,
+ LiveIntervals* LIs) {
+ RegInfo = &MF.getRegInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ TII = MF.getTarget().getInstrInfo();
+ AllocatableRegs = TRI->getAllocatableSet(MF);
+ DOUT << "\n**** Local spiller rewriting function '"
+ << MF.getFunction()->getName() << "':\n";
+ DOUT << "**** Machine Instrs (NOTE! Does not include spills and reloads!)"
+ " ****\n";
+ DEBUG(MF.dump());
+
+ // Spills - Keep track of which spilled values are available in physregs
+ // so that we can choose to reuse the physregs instead of emitting
+ // reloads. This is usually refreshed per basic block.
+ AvailableSpills Spills(TRI, TII);
+
+ // Keep track of kill information.
+ BitVector RegKills(TRI->getNumRegs());
+ std::vector<MachineOperand*> KillOps;
+ KillOps.resize(TRI->getNumRegs(), NULL);
+
+ // SingleEntrySuccs - Successor blocks which have a single predecessor.
+ SmallVector<MachineBasicBlock*, 4> SinglePredSuccs;
+ SmallPtrSet<MachineBasicBlock*,16> EarlyVisited;
+
+ // Traverse the basic blocks depth first.
+ MachineBasicBlock *Entry = MF.begin();
+ SmallPtrSet<MachineBasicBlock*,16> Visited;
+ for (df_ext_iterator<MachineBasicBlock*,
+ SmallPtrSet<MachineBasicBlock*,16> >
+ DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
+ DFI != E; ++DFI) {
+ MachineBasicBlock *MBB = *DFI;
+ if (!EarlyVisited.count(MBB))
+ RewriteMBB(*MBB, VRM, LIs, Spills, RegKills, KillOps);
+
+ // If this MBB is the only predecessor of a successor, keep the
+ // availability information and visit it next.
+ do {
+ // Keep visiting single predecessor successor as long as possible.
+ SinglePredSuccs.clear();
+ findSinglePredSuccessor(MBB, SinglePredSuccs);
+ if (SinglePredSuccs.empty())
+ MBB = 0;
+ else {
+ // FIXME: There may be more than one successor, each of which has
+ // MBB as its only predecessor.
+ MBB = SinglePredSuccs[0];
+ if (!Visited.count(MBB) && EarlyVisited.insert(MBB)) {
+ Spills.AddAvailableRegsToLiveIn(*MBB, RegKills, KillOps);
+ RewriteMBB(*MBB, VRM, LIs, Spills, RegKills, KillOps);
+ }
+ }
+ } while (MBB);
+
+ // Clear the availability info.
+ Spills.clear();
+ }
+
+ DOUT << "**** Post Machine Instrs ****\n";
+ DEBUG(MF.dump());
+
+ // Mark unused spill slots.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ int SS = VRM.getLowSpillSlot();
+ if (SS != VirtRegMap::NO_STACK_SLOT)
+ for (int e = VRM.getHighSpillSlot(); SS <= e; ++SS)
+ if (!VRM.isSpillSlotUsed(SS)) {
+ MFI->RemoveStackObject(SS);
+ ++NumDSS;
+ }
+
+ return true;
+ }
+
+private:
+
+ /// OptimizeByUnfold2 - Unfold a series of load / store folding instructions if
+ /// a scratch register is available.
+ /// xorq %r12<kill>, %r13
+ /// addq %rax, -184(%rbp)
+ /// addq %r13, -184(%rbp)
+ /// ==>
+ /// xorq %r12<kill>, %r13
+ /// movq -184(%rbp), %r12
+ /// addq %rax, %r12
+ /// addq %r13, %r12
+ /// movq %r12, -184(%rbp)
+ bool OptimizeByUnfold2(unsigned VirtReg, int SS,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MII,
+ std::vector<MachineInstr*> &MaybeDeadStores,
+ AvailableSpills &Spills,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM) {
+
+ MachineBasicBlock::iterator NextMII = next(MII);
+ if (NextMII == MBB.end())
+ return false;
+
+ if (TII->getOpcodeAfterMemoryUnfold(MII->getOpcode(), true, true) == 0)
+ return false;
+
+ // Now let's see if the last couple of instructions happen to have freed
+ // up a register.
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+ unsigned PhysReg = FindFreeRegister(MII, MBB, RC, TRI, AllocatableRegs);
+ if (!PhysReg)
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ TRI = MF.getTarget().getRegisterInfo();
+ MachineInstr &MI = *MII;
+ if (!FoldsStackSlotModRef(MI, SS, PhysReg, TII, TRI, VRM))
+ return false;
+
+ // If the next instruction also folds the same SS modref and can be unfolded,
+ // then it's worthwhile to issue a load from SS into the free register and
+ // then unfold these instructions.
+ if (!FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, VRM))
+ return false;
+
+ // Load from SS to the spare physical register.
+ TII->loadRegFromStackSlot(MBB, MII, PhysReg, SS, RC);
+ // This invalidates PhysReg.
+ Spills.ClobberPhysReg(PhysReg);
+ // Remember it's available.
+ Spills.addAvailable(SS, PhysReg);
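+ // Any stores to this stack slot are not dead anymore.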
+ MaybeDeadStores[SS] = NULL;
+
+ // Unfold current MI.
+ SmallVector<MachineInstr*, 4> NewMIs;
+ if (!TII->unfoldMemoryOperand(MF, &MI, VirtReg, false, false, NewMIs))
+ assert(0 && "Unable unfold the load / store folding instruction!");
+ assert(NewMIs.size() == 1);
+ AssignPhysToVirtReg(NewMIs[0], VirtReg, PhysReg);
+ VRM.transferRestorePts(&MI, NewMIs[0]);
+ MII = MBB.insert(MII, NewMIs[0]);
+ InvalidateKills(MI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+ ++NumModRefUnfold;
+
+ // Unfold next instructions that fold the same SS.
+ do {
+ MachineInstr &NextMI = *NextMII;
+ NextMII = next(NextMII);
+ NewMIs.clear();
+ if (!TII->unfoldMemoryOperand(MF, &NextMI, VirtReg, false, false, NewMIs))
+ assert(0 && "Unable unfold the load / store folding instruction!");
+ assert(NewMIs.size() == 1);
+ AssignPhysToVirtReg(NewMIs[0], VirtReg, PhysReg);
+ VRM.transferRestorePts(&NextMI, NewMIs[0]);
+ MBB.insert(NextMII, NewMIs[0]);
+ InvalidateKills(NextMI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&NextMI);
+ MBB.erase(&NextMI);
+ ++NumModRefUnfold;
+ } while (FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, VRM));
+
+ // Store the value back into SS.
+ TII->storeRegToStackSlot(MBB, NextMII, PhysReg, true, SS, RC);
+ MachineInstr *StoreMI = prior(NextMII);
+ VRM.addSpillSlotUse(SS, StoreMI);
+ VRM.virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
+
+ return true;
+ }
+
+ /// OptimizeByUnfold - Turn a store folding instruction into a load folding
+ /// instruction. e.g.
+ /// xorl %edi, %eax
+ /// movl %eax, -32(%ebp)
+ /// movl -36(%ebp), %eax
+ /// orl %eax, -32(%ebp)
+ /// ==>
+ /// xorl %edi, %eax
+ /// orl -36(%ebp), %eax
+ /// mov %eax, -32(%ebp)
+ /// This enables unfolding optimization for a subsequent instruction which will
+ /// also eliminate the newly introduced store instruction.
+ bool OptimizeByUnfold(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MII,
+ std::vector<MachineInstr*> &MaybeDeadStores,
+ AvailableSpills &Spills,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM) {
+ MachineFunction &MF = *MBB.getParent();
+ MachineInstr &MI = *MII;
+ unsigned UnfoldedOpc = 0;
+ unsigned UnfoldPR = 0;
+ unsigned UnfoldVR = 0;
+ int FoldedSS = VirtRegMap::NO_STACK_SLOT;
+ VirtRegMap::MI2VirtMapTy::const_iterator I, End;
+ for (tie(I, End) = VRM.getFoldedVirts(&MI); I != End; ) {
+ // Only transform an MI that folds a single register.
+ if (UnfoldedOpc)
+ return false;
+ UnfoldVR = I->second.first;
+ VirtRegMap::ModRef MR = I->second.second;
+ // MI2VirtMap can be updated, which invalidates the iterator.
+ // Increment the iterator first.
+ ++I;
+ if (VRM.isAssignedReg(UnfoldVR))
+ continue;
+ // If this reference is not a use, any previous store is now dead.
+ // Otherwise, the store to this stack slot is not dead anymore.
+ FoldedSS = VRM.getStackSlot(UnfoldVR);
+ MachineInstr* DeadStore = MaybeDeadStores[FoldedSS];
+ if (DeadStore && (MR & VirtRegMap::isModRef)) {
+ unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(FoldedSS);
+ if (!PhysReg || !DeadStore->readsRegister(PhysReg))
+ continue;
+ UnfoldPR = PhysReg;
+ UnfoldedOpc = TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(),
+ false, true);
+ }
+ }
+
+ if (!UnfoldedOpc) {
+ if (!UnfoldVR)
+ return false;
+
+ // Look for other unfolding opportunities.
+ return OptimizeByUnfold2(UnfoldVR, FoldedSS, MBB, MII,
+ MaybeDeadStores, Spills, RegKills, KillOps, VRM);
+ }
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0 || !MO.isUse())
+ continue;
+ unsigned VirtReg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(VirtReg) || MO.getSubReg())
+ continue;
+ if (VRM.isAssignedReg(VirtReg)) {
+ unsigned PhysReg = VRM.getPhys(VirtReg);
+ if (PhysReg && TRI->regsOverlap(PhysReg, UnfoldPR))
+ return false;
+ } else if (VRM.isReMaterialized(VirtReg))
+ continue;
+ int SS = VRM.getStackSlot(VirtReg);
+ unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SS);
+ if (PhysReg) {
+ if (TRI->regsOverlap(PhysReg, UnfoldPR))
+ return false;
+ continue;
+ }
+ if (VRM.hasPhys(VirtReg)) {
+ PhysReg = VRM.getPhys(VirtReg);
+ if (!TRI->regsOverlap(PhysReg, UnfoldPR))
+ continue;
+ }
+
+ // Ok, we'll need to reload the value into a register which makes
+ // it impossible to perform the store unfolding optimization later.
+ // Let's see if it is possible to fold the load if the store is
+ // unfolded. This allows us to perform the store unfolding
+ // optimization.
+ SmallVector<MachineInstr*, 4> NewMIs;
+ if (TII->unfoldMemoryOperand(MF, &MI, UnfoldVR, false, false, NewMIs)) {
+ assert(NewMIs.size() == 1);
+ MachineInstr *NewMI = NewMIs.back();
+ NewMIs.clear();
+ int Idx = NewMI->findRegisterUseOperandIdx(VirtReg, false);
+ assert(Idx != -1);
+ SmallVector<unsigned, 1> Ops;
+ Ops.push_back(Idx);
+ MachineInstr *FoldedMI = TII->foldMemoryOperand(MF, NewMI, Ops, SS);
+ if (FoldedMI) {
+ VRM.addSpillSlotUse(SS, FoldedMI);
+ if (!VRM.hasPhys(UnfoldVR))
+ VRM.assignVirt2Phys(UnfoldVR, UnfoldPR);
+ VRM.virtFolded(VirtReg, FoldedMI, VirtRegMap::isRef);
+ MII = MBB.insert(MII, FoldedMI);
+ InvalidateKills(MI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+ MF.DeleteMachineInstr(NewMI);
+ return true;
+ }
+ MF.DeleteMachineInstr(NewMI);
+ }
+ }
+
+ return false;
+ }
+
+ /// CommuteToFoldReload -
+ /// Look for
+ /// r1 = load fi#1
+ /// r1 = op r1, r2<kill>
+ /// store r1, fi#1
+ ///
+ /// If op is commutable and r2 is killed, then we can xform these to
+ /// r2 = op r2, fi#1
+ /// store r2, fi#1
+ bool CommuteToFoldReload(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MII,
+ unsigned VirtReg, unsigned SrcReg, int SS,
+ AvailableSpills &Spills,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ const TargetRegisterInfo *TRI,
+ VirtRegMap &VRM) {
+ if (MII == MBB.begin() || !MII->killsRegister(SrcReg))
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ MachineInstr &MI = *MII;
+ MachineBasicBlock::iterator DefMII = prior(MII);
+ MachineInstr *DefMI = DefMII;
+ const TargetInstrDesc &TID = DefMI->getDesc();
+ unsigned NewDstIdx;
+ if (DefMII != MBB.begin() &&
+ TID.isCommutable() &&
+ TII->CommuteChangesDestination(DefMI, NewDstIdx)) {
+ MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
+ unsigned NewReg = NewDstMO.getReg();
+ if (!NewDstMO.isKill() || TRI->regsOverlap(NewReg, SrcReg))
+ return false;
+ MachineInstr *ReloadMI = prior(DefMII);
+ int FrameIdx;
+ unsigned DestReg = TII->isLoadFromStackSlot(ReloadMI, FrameIdx);
+ if (DestReg != SrcReg || FrameIdx != SS)
+ return false;
+ int UseIdx = DefMI->findRegisterUseOperandIdx(DestReg, false);
+ if (UseIdx == -1)
+ return false;
+ unsigned DefIdx;
+ if (!MI.isRegTiedToDefOperand(UseIdx, &DefIdx))
+ return false;
+ assert(DefMI->getOperand(DefIdx).isReg() &&
+ DefMI->getOperand(DefIdx).getReg() == SrcReg);
+
+ // Now commute def instruction.
+ MachineInstr *CommutedMI = TII->commuteInstruction(DefMI, true);
+ if (!CommutedMI)
+ return false;
+ SmallVector<unsigned, 1> Ops;
+ Ops.push_back(NewDstIdx);
+ MachineInstr *FoldedMI = TII->foldMemoryOperand(MF, CommutedMI, Ops, SS);
+ // Not needed since foldMemoryOperand returns a new MI.
+ MF.DeleteMachineInstr(CommutedMI);
+ if (!FoldedMI)
+ return false;
+
+ VRM.addSpillSlotUse(SS, FoldedMI);
+ VRM.virtFolded(VirtReg, FoldedMI, VirtRegMap::isRef);
+ // Insert new def MI and spill MI.
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+ TII->storeRegToStackSlot(MBB, &MI, NewReg, true, SS, RC);
+ MII = prior(MII);
+ MachineInstr *StoreMI = MII;
+ VRM.addSpillSlotUse(SS, StoreMI);
+ VRM.virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
+ MII = MBB.insert(MII, FoldedMI); // Update MII to backtrack.
+
+ // Delete all 3 old instructions.
+ InvalidateKills(*ReloadMI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(ReloadMI);
+ MBB.erase(ReloadMI);
+ InvalidateKills(*DefMI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(DefMI);
+ MBB.erase(DefMI);
+ InvalidateKills(MI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+
+ // If NewReg was previously holding value of some SS, it's now clobbered.
+ // This has to be done now because it's a physical register. When this
+ // instruction is re-visited, it's ignored.
+ Spills.ClobberPhysReg(NewReg);
+
+ ++NumCommutes;
+ return true;
+ }
+
+ return false;
+ }
+
+ /// SpillRegToStackSlot - Spill a register to a specified stack slot. Check if
+ /// the last store to the same slot is now dead. If so, remove the last store.
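+ /// e.g. (illustrative):
+ ///   store %eax, fi#1   ; never read before the next store
+ ///   ...
+ ///   store %ebx, fi#1   ; makes the first store dead, so it is removed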
+ void SpillRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MII,
+ int Idx, unsigned PhysReg, int StackSlot,
+ const TargetRegisterClass *RC,
+ bool isAvailable, MachineInstr *&LastStore,
+ AvailableSpills &Spills,
+ SmallSet<MachineInstr*, 4> &ReMatDefs,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM) {
+
+ TII->storeRegToStackSlot(MBB, next(MII), PhysReg, true, StackSlot, RC);
+ MachineInstr *StoreMI = next(MII);
+ VRM.addSpillSlotUse(StackSlot, StoreMI);
+ DOUT << "Store:\t" << *StoreMI;
+
+ // If there is a dead store to this stack slot, nuke it now.
+ if (LastStore) {
+ DOUT << "Removed dead store:\t" << *LastStore;
+ ++NumDSE;
+ SmallVector<unsigned, 2> KillRegs;
+ InvalidateKills(*LastStore, TRI, RegKills, KillOps, &KillRegs);
+ MachineBasicBlock::iterator PrevMII = LastStore;
+ bool CheckDef = PrevMII != MBB.begin();
+ if (CheckDef)
+ --PrevMII;
+ VRM.RemoveMachineInstrFromMaps(LastStore);
+ MBB.erase(LastStore);
+ if (CheckDef) {
+ // Look at defs of killed registers on the store. Mark the defs
+ // as dead since the store has been deleted and they aren't
+ // being reused.
+ for (unsigned j = 0, ee = KillRegs.size(); j != ee; ++j) {
+ bool HasOtherDef = false;
+ if (InvalidateRegDef(PrevMII, *MII, KillRegs[j], HasOtherDef)) {
+ MachineInstr *DeadDef = PrevMII;
+ if (ReMatDefs.count(DeadDef) && !HasOtherDef) {
+ // FIXME: This assumes a remat def does not have side
+ // effects.
+ VRM.RemoveMachineInstrFromMaps(DeadDef);
+ MBB.erase(DeadDef);
+ ++NumDRM;
+ }
+ }
+ }
+ }
+ }
+
+ LastStore = next(MII);
+
+ // If the stack slot value was previously available in some other
+ // register, change it now. Otherwise, make the register available,
+ // in PhysReg.
+ Spills.ModifyStackSlotOrReMat(StackSlot);
+ Spills.ClobberPhysReg(PhysReg);
+ Spills.addAvailable(StackSlot, PhysReg, isAvailable);
+ ++NumStores;
+ }
+
+ /// TransferDeadness - An identity copy definition is dead and is being
+ /// removed. Find the last def or use and mark it as dead / kill.
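+ /// e.g. (illustrative):
+ ///   %eax = op ...      ; last reference to %eax before the copy
+ ///   %eax = %eax        ; dead identity copy being deleted
+ /// => the def above is marked dead (or a later use is marked kill).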
+ void TransferDeadness(MachineBasicBlock *MBB, unsigned CurDist,
+ unsigned Reg, BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM) {
+ SmallPtrSet<MachineInstr*, 4> Seens;
+ SmallVector<std::pair<MachineInstr*, int>,8> Refs;
+ for (MachineRegisterInfo::reg_iterator RI = RegInfo->reg_begin(Reg),
+ RE = RegInfo->reg_end(); RI != RE; ++RI) {
+ MachineInstr *UDMI = &*RI;
+ if (UDMI->getParent() != MBB)
+ continue;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UDMI);
+ if (DI == DistanceMap.end() || DI->second > CurDist)
+ continue;
+ if (Seens.insert(UDMI))
+ Refs.push_back(std::make_pair(UDMI, DI->second));
+ }
+
+ if (Refs.empty())
+ return;
+ std::sort(Refs.begin(), Refs.end(), RefSorter());
+
+ while (!Refs.empty()) {
+ MachineInstr *LastUDMI = Refs.back().first;
+ Refs.pop_back();
+
+ MachineOperand *LastUD = NULL;
+ for (unsigned i = 0, e = LastUDMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = LastUDMI->getOperand(i);
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+ if (!LastUD || (LastUD->isUse() && MO.isDef()))
+ LastUD = &MO;
+ if (LastUDMI->isRegTiedToDefOperand(i))
+ break;
+ }
+ if (LastUD->isDef()) {
+ // If the instruction has no side effects, delete it and propagate
+ // backward further. Otherwise, mark it dead and we are done.
+ const TargetInstrDesc &TID = LastUDMI->getDesc();
+ if (TID.mayStore() || TID.isCall() || TID.isTerminator() ||
+ TID.hasUnmodeledSideEffects()) {
+ LastUD->setIsDead();
+ break;
+ }
+ VRM.RemoveMachineInstrFromMaps(LastUDMI);
+ MBB->erase(LastUDMI);
+ } else {
+ LastUD->setIsKill();
+ RegKills.set(Reg);
+ KillOps[Reg] = LastUD;
+ break;
+ }
+ }
+ }
+
+ /// RewriteMBB - Keep track of which spills are available even after the
+ /// register allocator is done with them. If possible, avoid reloading vregs.
+ void RewriteMBB(MachineBasicBlock &MBB, VirtRegMap &VRM,
+ LiveIntervals *LIs,
+ AvailableSpills &Spills, BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps) {
+
+ DOUT << "\n**** Local spiller rewriting MBB '"
+ << MBB.getBasicBlock()->getName() << "':\n";
+
+ MachineFunction &MF = *MBB.getParent();
+
+ // MaybeDeadStores - When we need to write a value back into a stack slot,
+ // keep track of the inserted store. If the stack slot value is never read
+ // (because the value was used from some available register, for example), and
+ // subsequently stored to, the original store is dead. This map keeps track
+ // of inserted stores that are not used. If we see a subsequent store to the
+ // same stack slot, the original store is deleted.
+ std::vector<MachineInstr*> MaybeDeadStores;
+ MaybeDeadStores.resize(MF.getFrameInfo()->getObjectIndexEnd(), NULL);
+
+ // ReMatDefs - These are rematerializable def MIs which are not deleted.
+ SmallSet<MachineInstr*, 4> ReMatDefs;
+
+ // Clear kill info.
+ SmallSet<unsigned, 2> KilledMIRegs;
+ RegKills.reset();
+ KillOps.clear();
+ KillOps.resize(TRI->getNumRegs(), NULL);
+
+ unsigned Dist = 0;
+ DistanceMap.clear();
+ for (MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
+ MII != E; ) {
+ MachineBasicBlock::iterator NextMII = next(MII);
+
+ VirtRegMap::MI2VirtMapTy::const_iterator I, End;
+ bool Erased = false;
+ bool BackTracked = false;
+ if (OptimizeByUnfold(MBB, MII,
+ MaybeDeadStores, Spills, RegKills, KillOps, VRM))
+ NextMII = next(MII);
+
+ MachineInstr &MI = *MII;
+
+ if (VRM.hasEmergencySpills(&MI)) {
+ // Spill physical register(s) in the rare case the allocator has run out
+ // of registers to allocate.
+ SmallSet<int, 4> UsedSS;
+ std::vector<unsigned> &EmSpills = VRM.getEmergencySpills(&MI);
+ for (unsigned i = 0, e = EmSpills.size(); i != e; ++i) {
+ unsigned PhysReg = EmSpills[i];
+ const TargetRegisterClass *RC =
+ TRI->getPhysicalRegisterRegClass(PhysReg);
+ assert(RC && "Unable to determine register class!");
+ int SS = VRM.getEmergencySpillSlot(RC);
+ if (UsedSS.count(SS))
+ assert(0 && "Need to spill more than one physical registers!");
+ UsedSS.insert(SS);
+ TII->storeRegToStackSlot(MBB, MII, PhysReg, true, SS, RC);
+ MachineInstr *StoreMI = prior(MII);
+ VRM.addSpillSlotUse(SS, StoreMI);
+ TII->loadRegFromStackSlot(MBB, next(MII), PhysReg, SS, RC);
+ MachineInstr *LoadMI = next(MII);
+ VRM.addSpillSlotUse(SS, LoadMI);
+ ++NumPSpills;
+ }
+ NextMII = next(MII);
+ }
+
+ // Insert restores here if asked to.
+ if (VRM.isRestorePt(&MI)) {
+ std::vector<unsigned> &RestoreRegs = VRM.getRestorePtRestores(&MI);
+ for (unsigned i = 0, e = RestoreRegs.size(); i != e; ++i) {
+ unsigned VirtReg = RestoreRegs[e-i-1]; // Reverse order.
+ if (!VRM.getPreSplitReg(VirtReg))
+ continue; // Split interval spilled again.
+ unsigned Phys = VRM.getPhys(VirtReg);
+ RegInfo->setPhysRegUsed(Phys);
+
+ // Check if the value being restored is available. If so, it must be
+ // from a predecessor BB that falls through into this BB. We do not
+ // expect:
+ // BB1:
+ // r1 = load fi#1
+ // ...
+ // = r1<kill>
+ // ... # r1 not clobbered
+ // ...
+ // = load fi#1
+ bool DoReMat = VRM.isReMaterialized(VirtReg);
+ int SSorRMId = DoReMat
+ ? VRM.getReMatId(VirtReg) : VRM.getStackSlot(VirtReg);
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+ unsigned InReg = Spills.getSpillSlotOrReMatPhysReg(SSorRMId);
+ if (InReg == Phys) {
+ // If the value is already available in the expected register, save
+ // a reload / remat.
+ if (SSorRMId > VirtRegMap::MAX_STACK_SLOT)
+ DOUT << "Reusing RM#" << SSorRMId-VirtRegMap::MAX_STACK_SLOT-1;
+ else
+ DOUT << "Reusing SS#" << SSorRMId;
+ DOUT << " from physreg "
+ << TRI->getName(InReg) << " for vreg"
+ << VirtReg <<" instead of reloading into physreg "
+ << TRI->getName(Phys) << "\n";
+ ++NumOmitted;
+ continue;
+ } else if (InReg && InReg != Phys) {
+ if (SSorRMId > VirtRegMap::MAX_STACK_SLOT)
+ DOUT << "Reusing RM#" << SSorRMId-VirtRegMap::MAX_STACK_SLOT-1;
+ else
+ DOUT << "Reusing SS#" << SSorRMId;
+ DOUT << " from physreg "
+ << TRI->getName(InReg) << " for vreg"
+ << VirtReg <<" by copying it into physreg "
+ << TRI->getName(Phys) << "\n";
+
+ // If the reloaded / remat value is available in another register,
+ // copy it to the desired register.
+ TII->copyRegToReg(MBB, &MI, Phys, InReg, RC, RC);
+
+ // This invalidates Phys.
+ Spills.ClobberPhysReg(Phys);
+ // Remember it's available.
+ Spills.addAvailable(SSorRMId, Phys);
+
+ // Mark it killed.
+ MachineInstr *CopyMI = prior(MII);
+ MachineOperand *KillOpnd = CopyMI->findRegisterUseOperand(InReg);
+ KillOpnd->setIsKill();
+ UpdateKills(*CopyMI, TRI, RegKills, KillOps);
+
+ DOUT << '\t' << *CopyMI;
+ ++NumCopified;
+ continue;
+ }
+
+ if (VRM.isReMaterialized(VirtReg)) {
+ ReMaterialize(MBB, MII, Phys, VirtReg, TII, TRI, VRM);
+ } else {
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+ TII->loadRegFromStackSlot(MBB, &MI, Phys, SSorRMId, RC);
+ MachineInstr *LoadMI = prior(MII);
+ VRM.addSpillSlotUse(SSorRMId, LoadMI);
+ ++NumLoads;
+ }
+
+ // This invalidates Phys.
+ Spills.ClobberPhysReg(Phys);
+ // Remember it's available.
+ Spills.addAvailable(SSorRMId, Phys);
+
+ UpdateKills(*prior(MII), TRI, RegKills, KillOps);
+ DOUT << '\t' << *prior(MII);
+ }
+ }
+
+ // Insert spills here if asked to.
+ if (VRM.isSpillPt(&MI)) {
+ std::vector<std::pair<unsigned,bool> > &SpillRegs =
+ VRM.getSpillPtSpills(&MI);
+ for (unsigned i = 0, e = SpillRegs.size(); i != e; ++i) {
+ unsigned VirtReg = SpillRegs[i].first;
+ bool isKill = SpillRegs[i].second;
+ if (!VRM.getPreSplitReg(VirtReg))
+ continue; // Split interval spilled again.
+ const TargetRegisterClass *RC = RegInfo->getRegClass(VirtReg);
+ unsigned Phys = VRM.getPhys(VirtReg);
+ int StackSlot = VRM.getStackSlot(VirtReg);
+ TII->storeRegToStackSlot(MBB, next(MII), Phys, isKill, StackSlot, RC);
+ MachineInstr *StoreMI = next(MII);
+ VRM.addSpillSlotUse(StackSlot, StoreMI);
+ DOUT << "Store:\t" << *StoreMI;
+ VRM.virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
+ }
+ NextMII = next(MII);
+ }
+
+ /// ReusedOperands - Keep track of operand reuse in case we need to undo
+ /// reuse.
+ ReuseInfo ReusedOperands(MI, TRI);
+ SmallVector<unsigned, 4> VirtUseOps;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue; // Ignore non-register operands.
+
+ unsigned VirtReg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(VirtReg)) {
+ // Ignore physregs for spilling, but remember that it is used by this
+ // function.
+ RegInfo->setPhysRegUsed(VirtReg);
+ continue;
+ }
+
+ // We want to process implicit virtual register uses first.
+ if (MO.isImplicit())
+ // If the virtual register is implicitly defined, emit an implicit_def
+ // before so the scavenger knows it's "defined".
+ VirtUseOps.insert(VirtUseOps.begin(), i);
+ else
+ VirtUseOps.push_back(i);
+ }
+
+ // Process all of the spilled uses and all non spilled reg references.
+ SmallVector<int, 2> PotentialDeadStoreSlots;
+ KilledMIRegs.clear();
+ for (unsigned j = 0, e = VirtUseOps.size(); j != e; ++j) {
+ unsigned i = VirtUseOps[j];
+ MachineOperand &MO = MI.getOperand(i);
+ unsigned VirtReg = MO.getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ "Not a virtual register?");
+
+ unsigned SubIdx = MO.getSubReg();
+ if (VRM.isAssignedReg(VirtReg)) {
+ // This virtual register was assigned a physreg!
+ unsigned Phys = VRM.getPhys(VirtReg);
+ RegInfo->setPhysRegUsed(Phys);
+ if (MO.isDef())
+ ReusedOperands.markClobbered(Phys);
+ unsigned RReg = SubIdx ? TRI->getSubReg(Phys, SubIdx) : Phys;
+ MI.getOperand(i).setReg(RReg);
+ MI.getOperand(i).setSubReg(0);
+ if (VRM.isImplicitlyDefined(VirtReg))
+ BuildMI(MBB, &MI, MI.getDebugLoc(),
+ TII->get(TargetInstrInfo::IMPLICIT_DEF), RReg);
+ continue;
+ }
+
+ // This virtual register is now known to be a spilled value.
+ if (!MO.isUse())
+ continue; // Handle defs in the loop below (handle use&def here though)
+
+ bool AvoidReload = false;
+ if (LIs->hasInterval(VirtReg)) {
+ LiveInterval &LI = LIs->getInterval(VirtReg);
+ if (!LI.liveAt(LIs->getUseIndex(LI.beginNumber())))
+ // Must be defined by an implicit def. It should not be spilled. Note,
+ // this is for correctness reasons. e.g.
+ // 8 %reg1024<def> = IMPLICIT_DEF
+ // 12 %reg1024<def> = INSERT_SUBREG %reg1024<kill>, %reg1025, 2
+ // The live range [12, 14) is not part of the r1024 live interval since
+ // it's defined by an implicit def. It will not conflict with the live
+ // interval of r1025. Now suppose both registers are spilled; you can
+ // easily see a situation where both registers are reloaded before
+ // the INSERT_SUBREG and both target registers would overlap.
+ AvoidReload = true;
+ }
+
+ bool DoReMat = VRM.isReMaterialized(VirtReg);
+ int SSorRMId = DoReMat
+ ? VRM.getReMatId(VirtReg) : VRM.getStackSlot(VirtReg);
+ int ReuseSlot = SSorRMId;
+
+ // Check to see if this stack slot is available.
+ unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SSorRMId);
+
+ // If this is a sub-register use, make sure the reuse register is in the
+ // right register class. For example, for x86 not all of the 32-bit
+ // registers have accessible sub-registers.
+ // Similarly so for EXTRACT_SUBREG. Consider this:
+ // EDI = op
+ // MOV32_mr fi#1, EDI
+ // ...
+ // = EXTRACT_SUBREG fi#1
+ // fi#1 is available in EDI, but it cannot be reused because it's not in
+ // the right register file.
+ if (PhysReg && !AvoidReload &&
+ (SubIdx || MI.getOpcode() == TargetInstrInfo::EXTRACT_SUBREG)) {
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+ if (!RC->contains(PhysReg))
+ PhysReg = 0;
+ }
+
+ if (PhysReg && !AvoidReload) {
+ // This spilled operand might be part of a two-address operand. If this
+ // is the case, then changing it will necessarily require changing the
+ // def part of the instruction as well. However, in some cases, we
+ // aren't allowed to modify the reused register. If none of these cases
+ // apply, reuse it.
+ bool CanReuse = true;
+ bool isTied = MI.isRegTiedToDefOperand(i);
+ if (isTied) {
+ // Okay, we have a two address operand. We can reuse this physreg as
+ // long as we are allowed to clobber the value and there isn't an
+ // earlier def that has already clobbered the physreg.
+ CanReuse = !ReusedOperands.isClobbered(PhysReg) &&
+ Spills.canClobberPhysReg(PhysReg);
+ }
+
+ if (CanReuse) {
+ // If this stack slot value is already available, reuse it!
+ if (ReuseSlot > VirtRegMap::MAX_STACK_SLOT)
+ DOUT << "Reusing RM#" << ReuseSlot-VirtRegMap::MAX_STACK_SLOT-1;
+ else
+ DOUT << "Reusing SS#" << ReuseSlot;
+ DOUT << " from physreg "
+ << TRI->getName(PhysReg) << " for vreg"
+ << VirtReg <<" instead of reloading into physreg "
+ << TRI->getName(VRM.getPhys(VirtReg)) << "\n";
+ unsigned RReg = SubIdx ? TRI->getSubReg(PhysReg, SubIdx) : PhysReg;
+ MI.getOperand(i).setReg(RReg);
+ MI.getOperand(i).setSubReg(0);
+
+ // The only technical detail we have is that we don't know that
+ // PhysReg won't be clobbered by a reloaded stack slot that occurs
+ // later in the instruction. In particular, consider 'op V1, V2'.
+ // If V1 is available in physreg R0, we would choose to reuse it
+ // here, instead of reloading it into the register the allocator
+ // indicated (say R1). However, V2 might have to be reloaded
+ // later, and it might indicate that it needs to live in R0. When
+ // this occurs, we need to have information available that
+ // indicates it is safe to use R1 for the reload instead of R0.
+ //
+ // To further complicate matters, we might conflict with an alias,
+ // or R0 and R1 might not be compatible with each other. In this
+ // case, we actually insert a reload for V1 in R1, ensuring that
+ // we can get at R0 or its alias.
+ ReusedOperands.addReuse(i, ReuseSlot, PhysReg,
+ VRM.getPhys(VirtReg), VirtReg);
+ if (isTied)
+ // Only mark it clobbered if this is a use&def operand.
+ ReusedOperands.markClobbered(PhysReg);
+ ++NumReused;
+
+ if (MI.getOperand(i).isKill() &&
+ ReuseSlot <= VirtRegMap::MAX_STACK_SLOT) {
+
+ // The store of this spilled value is potentially dead, but we
+ // won't know for certain until we've confirmed that the re-use
+ // above is valid, which means waiting until the other operands
+ // are processed. For now we just track the spill slot, we'll
+ // remove it after the other operands are processed if valid.
+
+ PotentialDeadStoreSlots.push_back(ReuseSlot);
+ }
+
+ // Mark it isKill if there are no other uses of the same virtual
+ // register and it's not a two-address operand. IsKill will be
+ // unset if the reg is reused.
+ if (!isTied && KilledMIRegs.count(VirtReg) == 0) {
+ MI.getOperand(i).setIsKill();
+ KilledMIRegs.insert(VirtReg);
+ }
+
+ continue;
+ } // CanReuse
+
+ // Otherwise we have a situation where we have a two-address instruction
+ // whose mod/ref operand needs to be reloaded. This reload is already
+ // available in some register "PhysReg", but if we used PhysReg as the
+ // operand to our 2-addr instruction, the instruction would modify
+ // PhysReg. This isn't cool if something later uses PhysReg and expects
+ // to get its initial value.
+ //
+ // To avoid this problem, and to avoid doing a load right after a store,
+ // we emit a copy from PhysReg into the designated register for this
+ // operand.
+ unsigned DesignatedReg = VRM.getPhys(VirtReg);
+ assert(DesignatedReg && "Must map virtreg to physreg!");
+
+ // Note that, if we reused a register for a previous operand, the
+ // register we want to reload into might not actually be
+ // available. If this occurs, use the register indicated by the
+ // reuser.
+ if (ReusedOperands.hasReuses())
+ DesignatedReg = ReusedOperands.GetRegForReload(DesignatedReg, &MI,
+ Spills, MaybeDeadStores, RegKills, KillOps, VRM);
+
+ // If the mapped designated register is actually the physreg we have
+ // incoming, we don't need to insert a dead copy.
+ if (DesignatedReg == PhysReg) {
+ // If this stack slot value is already available, reuse it!
+ if (ReuseSlot > VirtRegMap::MAX_STACK_SLOT)
+ DOUT << "Reusing RM#" << ReuseSlot-VirtRegMap::MAX_STACK_SLOT-1;
+ else
+ DOUT << "Reusing SS#" << ReuseSlot;
+ DOUT << " from physreg " << TRI->getName(PhysReg)
+ << " for vreg" << VirtReg
+ << " instead of reloading into same physreg.\n";
+ unsigned RReg = SubIdx ? TRI->getSubReg(PhysReg, SubIdx) : PhysReg;
+ MI.getOperand(i).setReg(RReg);
+ MI.getOperand(i).setSubReg(0);
+ ReusedOperands.markClobbered(RReg);
+ ++NumReused;
+ continue;
+ }
+
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+ RegInfo->setPhysRegUsed(DesignatedReg);
+ ReusedOperands.markClobbered(DesignatedReg);
+ TII->copyRegToReg(MBB, &MI, DesignatedReg, PhysReg, RC, RC);
+
+ MachineInstr *CopyMI = prior(MII);
+ UpdateKills(*CopyMI, TRI, RegKills, KillOps);
+
+ // This invalidates DesignatedReg.
+ Spills.ClobberPhysReg(DesignatedReg);
+
+ Spills.addAvailable(ReuseSlot, DesignatedReg);
+ unsigned RReg =
+ SubIdx ? TRI->getSubReg(DesignatedReg, SubIdx) : DesignatedReg;
+ MI.getOperand(i).setReg(RReg);
+ MI.getOperand(i).setSubReg(0);
+ DOUT << '\t' << *prior(MII);
+ ++NumReused;
+ continue;
+ } // if (PhysReg)
+
+ // Otherwise, reload it and remember that we have it.
+ PhysReg = VRM.getPhys(VirtReg);
+ assert(PhysReg && "Must map virtreg to physreg!");
+
+ // Note that, if we reused a register for a previous operand, the
+ // register we want to reload into might not actually be
+ // available. If this occurs, use the register indicated by the
+ // reuser.
+ if (ReusedOperands.hasReuses())
+ PhysReg = ReusedOperands.GetRegForReload(PhysReg, &MI,
+ Spills, MaybeDeadStores, RegKills, KillOps, VRM);
+
+ RegInfo->setPhysRegUsed(PhysReg);
+ ReusedOperands.markClobbered(PhysReg);
+ if (AvoidReload)
+ ++NumAvoided;
+ else {
+ if (DoReMat) {
+ ReMaterialize(MBB, MII, PhysReg, VirtReg, TII, TRI, VRM);
+ } else {
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+ TII->loadRegFromStackSlot(MBB, &MI, PhysReg, SSorRMId, RC);
+ MachineInstr *LoadMI = prior(MII);
+ VRM.addSpillSlotUse(SSorRMId, LoadMI);
+ ++NumLoads;
+ }
+ // This invalidates PhysReg.
+ Spills.ClobberPhysReg(PhysReg);
+
+ // Any stores to this stack slot are not dead anymore.
+ if (!DoReMat)
+ MaybeDeadStores[SSorRMId] = NULL;
+ Spills.addAvailable(SSorRMId, PhysReg);
+ // Assumes this is the last use. IsKill will be unset if reg is reused
+ // unless it's a two-address operand.
+ if (!MI.isRegTiedToDefOperand(i) &&
+ KilledMIRegs.count(VirtReg) == 0) {
+ MI.getOperand(i).setIsKill();
+ KilledMIRegs.insert(VirtReg);
+ }
+
+ UpdateKills(*prior(MII), TRI, RegKills, KillOps);
+ DOUT << '\t' << *prior(MII);
+ }
+ unsigned RReg = SubIdx ? TRI->getSubReg(PhysReg, SubIdx) : PhysReg;
+ MI.getOperand(i).setReg(RReg);
+ MI.getOperand(i).setSubReg(0);
+ }
+
+ // Ok - now we can remove stores that have been confirmed dead.
+ for (unsigned j = 0, e = PotentialDeadStoreSlots.size(); j != e; ++j) {
+ // This was the last use and the spilled value is still available
+ // for reuse. That means the spill was unnecessary!
+ int PDSSlot = PotentialDeadStoreSlots[j];
+ MachineInstr* DeadStore = MaybeDeadStores[PDSSlot];
+ if (DeadStore) {
+ DOUT << "Removed dead store:\t" << *DeadStore;
+ InvalidateKills(*DeadStore, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(DeadStore);
+ MBB.erase(DeadStore);
+ MaybeDeadStores[PDSSlot] = NULL;
+ ++NumDSE;
+ }
+ }
+
+
+ DOUT << '\t' << MI;
+
+
+ // If we have folded references to memory operands, make sure we clear all
+ // physical registers that may contain the value of the spilled virtual
+ // register.
+ SmallSet<int, 2> FoldedSS;
+ for (tie(I, End) = VRM.getFoldedVirts(&MI); I != End; ) {
+ unsigned VirtReg = I->second.first;
+ VirtRegMap::ModRef MR = I->second.second;
+ DOUT << "Folded vreg: " << VirtReg << " MR: " << MR;
+
+ // MI2VirtMap can be updated, which invalidates the iterator.
+ // Increment the iterator first.
+ ++I;
+ int SS = VRM.getStackSlot(VirtReg);
+ if (SS == VirtRegMap::NO_STACK_SLOT)
+ continue;
+ FoldedSS.insert(SS);
+ DOUT << " - StackSlot: " << SS << "\n";
+
+ // If this folded instruction is just a use, check to see if it's a
+ // straight load from the virt reg slot.
+ if ((MR & VirtRegMap::isRef) && !(MR & VirtRegMap::isMod)) {
+ int FrameIdx;
+ unsigned DestReg = TII->isLoadFromStackSlot(&MI, FrameIdx);
+ if (DestReg && FrameIdx == SS) {
+ // If this spill slot is available, turn it into a copy (or nothing)
+ // instead of leaving it as a load!
+ if (unsigned InReg = Spills.getSpillSlotOrReMatPhysReg(SS)) {
+ DOUT << "Promoted Load To Copy: " << MI;
+ if (DestReg != InReg) {
+ const TargetRegisterClass *RC = RegInfo->getRegClass(VirtReg);
+ TII->copyRegToReg(MBB, &MI, DestReg, InReg, RC, RC);
+ MachineOperand *DefMO = MI.findRegisterDefOperand(DestReg);
+ unsigned SubIdx = DefMO->getSubReg();
+ // Revisit the copy so we make sure to notice the effects of the
+ // operation on the destreg (either needing to RA it if it's
+ // virtual or needing to clobber any values if it's physical).
+ NextMII = &MI;
+ --NextMII; // backtrack to the copy.
+ // Propagate the sub-register index over.
+ if (SubIdx) {
+ DefMO = NextMII->findRegisterDefOperand(DestReg);
+ DefMO->setSubReg(SubIdx);
+ }
+
+ // Mark it killed.
+ MachineOperand *KillOpnd = NextMII->findRegisterUseOperand(InReg);
+ KillOpnd->setIsKill();
+
+ BackTracked = true;
+ } else {
+ DOUT << "Removing now-noop copy: " << MI;
+ // Unset last kill since it's being reused.
+ InvalidateKill(InReg, TRI, RegKills, KillOps);
+ Spills.disallowClobberPhysReg(InReg);
+ }
+
+ InvalidateKills(MI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+ Erased = true;
+ goto ProcessNextInst;
+ }
+ } else {
+ unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SS);
+ SmallVector<MachineInstr*, 4> NewMIs;
+ if (PhysReg &&
+ TII->unfoldMemoryOperand(MF, &MI, PhysReg, false, false, NewMIs)) {
+ MBB.insert(MII, NewMIs[0]);
+ InvalidateKills(MI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+ Erased = true;
+ --NextMII; // backtrack to the unfolded instruction.
+ BackTracked = true;
+ goto ProcessNextInst;
+ }
+ }
+ }
+
+ // If this reference is not a use, any previous store is now dead.
+ // Otherwise, the store to this stack slot is not dead anymore.
+ MachineInstr* DeadStore = MaybeDeadStores[SS];
+ if (DeadStore) {
+ bool isDead = !(MR & VirtRegMap::isRef);
+ MachineInstr *NewStore = NULL;
+ if (MR & VirtRegMap::isModRef) {
+ unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SS);
+ SmallVector<MachineInstr*, 4> NewMIs;
+ // We can reuse this physreg as long as we are allowed to clobber
+ // the value and there isn't an earlier def that has already clobbered
+ // the physreg.
+ if (PhysReg &&
+ !ReusedOperands.isClobbered(PhysReg) &&
+ Spills.canClobberPhysReg(PhysReg) &&
+ !TII->isStoreToStackSlot(&MI, SS)) { // Not profitable!
+ MachineOperand *KillOpnd =
+ DeadStore->findRegisterUseOperand(PhysReg, true);
+ // Note, if the store is storing a sub-register, it's possible the
+ // super-register is needed below.
+ if (KillOpnd && !KillOpnd->getSubReg() &&
+ TII->unfoldMemoryOperand(MF, &MI, PhysReg, false, true,NewMIs)){
+ MBB.insert(MII, NewMIs[0]);
+ NewStore = NewMIs[1];
+ MBB.insert(MII, NewStore);
+ VRM.addSpillSlotUse(SS, NewStore);
+ InvalidateKills(MI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+ Erased = true;
+ --NextMII;
+ --NextMII; // backtrack to the unfolded instruction.
+ BackTracked = true;
+ isDead = true;
+ ++NumSUnfold;
+ }
+ }
+ }
+
+ if (isDead) { // Previous store is dead.
+ // If we get here, the store is dead, nuke it now.
+ DOUT << "Removed dead store:\t" << *DeadStore;
+ InvalidateKills(*DeadStore, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(DeadStore);
+ MBB.erase(DeadStore);
+ if (!NewStore)
+ ++NumDSE;
+ }
+
+ MaybeDeadStores[SS] = NULL;
+ if (NewStore) {
+ // Treat this store as a spill merged into a copy. That makes the
+ // stack slot value available.
+ VRM.virtFolded(VirtReg, NewStore, VirtRegMap::isMod);
+ goto ProcessNextInst;
+ }
+ }
+
+ // If the spill slot value is available, and this is a new definition of
+ // the value, the value is not available anymore.
+ if (MR & VirtRegMap::isMod) {
+ // Notice that the value in this stack slot has been modified.
+ Spills.ModifyStackSlotOrReMat(SS);
+
+ // If this is *just* a mod of the value, check to see if this is just a
+ // store to the spill slot (i.e. the spill got merged into the copy). If
+ // so, realize that the vreg is available now, and add the store to the
+ // MaybeDeadStore info.
+ int StackSlot;
+ if (!(MR & VirtRegMap::isRef)) {
+ if (unsigned SrcReg = TII->isStoreToStackSlot(&MI, StackSlot)) {
+ assert(TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ "Src hasn't been allocated yet?");
+
+ if (CommuteToFoldReload(MBB, MII, VirtReg, SrcReg, StackSlot,
+ Spills, RegKills, KillOps, TRI, VRM)) {
+ NextMII = next(MII);
+ BackTracked = true;
+ goto ProcessNextInst;
+ }
+
+ // Okay, this is certainly a store of SrcReg to [StackSlot]. Mark
+ // this as a potentially dead store in case there is a subsequent
+ // store into the stack slot without a read from it.
+ MaybeDeadStores[StackSlot] = &MI;
+
+ // If the stack slot value was previously available in some other
+ // register, change it now. Otherwise, make the register
+ // available in PhysReg.
+ Spills.addAvailable(StackSlot, SrcReg, MI.killsRegister(SrcReg));
+ }
+ }
+ }
+ }
+
+ // Process all of the spilled defs.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!(MO.isReg() && MO.getReg() && MO.isDef()))
+ continue;
+
+ unsigned VirtReg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(VirtReg)) {
+ // Check to see if this is a noop copy. If so, eliminate the
+ // instruction before considering the dest reg to be changed.
+ unsigned Src, Dst, SrcSR, DstSR;
+ if (TII->isMoveInstr(MI, Src, Dst, SrcSR, DstSR) && Src == Dst) {
+ ++NumDCE;
+ DOUT << "Removing now-noop copy: " << MI;
+ SmallVector<unsigned, 2> KillRegs;
+ InvalidateKills(MI, TRI, RegKills, KillOps, &KillRegs);
+ if (MO.isDead() && !KillRegs.empty()) {
+ // Source register or an implicit super/sub-register use is killed.
+ assert(KillRegs[0] == Dst ||
+ TRI->isSubRegister(KillRegs[0], Dst) ||
+ TRI->isSuperRegister(KillRegs[0], Dst));
+ // Last def is now dead.
+ TransferDeadness(&MBB, Dist, Src, RegKills, KillOps, VRM);
+ }
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+ Erased = true;
+ Spills.disallowClobberPhysReg(VirtReg);
+ goto ProcessNextInst;
+ }
+
+ // If it's not a no-op copy, it clobbers the value in the destreg.
+ Spills.ClobberPhysReg(VirtReg);
+ ReusedOperands.markClobbered(VirtReg);
+
+ // Check to see if this instruction is a load from a stack slot into
+ // a register. If so, this provides the stack slot value in the reg.
+ int FrameIdx;
+ if (unsigned DestReg = TII->isLoadFromStackSlot(&MI, FrameIdx)) {
+ assert(DestReg == VirtReg && "Unknown load situation!");
+
+ // If it is a folded reference, then it's not safe to clobber.
+ bool Folded = FoldedSS.count(FrameIdx);
+ // Otherwise, if it wasn't available, remember that it is now!
+ Spills.addAvailable(FrameIdx, DestReg, !Folded);
+ goto ProcessNextInst;
+ }
+
+ continue;
+ }
+
+ unsigned SubIdx = MO.getSubReg();
+ bool DoReMat = VRM.isReMaterialized(VirtReg);
+ if (DoReMat)
+ ReMatDefs.insert(&MI);
+
+ // The only vregs left are stack slot definitions.
+ int StackSlot = VRM.getStackSlot(VirtReg);
+ const TargetRegisterClass *RC = RegInfo->getRegClass(VirtReg);
+
+ // If this def is part of a two-address operand, make sure to execute
+ // the store from the correct physical register.
+ unsigned PhysReg;
+ unsigned TiedOp;
+ if (MI.isRegTiedToUseOperand(i, &TiedOp)) {
+ PhysReg = MI.getOperand(TiedOp).getReg();
+ if (SubIdx) {
+ unsigned SuperReg = findSuperReg(RC, PhysReg, SubIdx, TRI);
+ assert(SuperReg && TRI->getSubReg(SuperReg, SubIdx) == PhysReg &&
+ "Can't find corresponding super-register!");
+ PhysReg = SuperReg;
+ }
+ } else {
+ PhysReg = VRM.getPhys(VirtReg);
+ if (ReusedOperands.isClobbered(PhysReg)) {
+ // Another def has taken the assigned physreg. It must have been a
+ // use&def which got it due to reuse. Undo the reuse!
+ PhysReg = ReusedOperands.GetRegForReload(PhysReg, &MI,
+ Spills, MaybeDeadStores, RegKills, KillOps, VRM);
+ }
+ }
+
+ assert(PhysReg && "VR not assigned a physical register?");
+ RegInfo->setPhysRegUsed(PhysReg);
+ unsigned RReg = SubIdx ? TRI->getSubReg(PhysReg, SubIdx) : PhysReg;
+ ReusedOperands.markClobbered(RReg);
+ MI.getOperand(i).setReg(RReg);
+ MI.getOperand(i).setSubReg(0);
+
+ if (!MO.isDead()) {
+ MachineInstr *&LastStore = MaybeDeadStores[StackSlot];
+ SpillRegToStackSlot(MBB, MII, -1, PhysReg, StackSlot, RC, true,
+ LastStore, Spills, ReMatDefs, RegKills, KillOps, VRM);
+ NextMII = next(MII);
+
+ // Check to see if this is a noop copy. If so, eliminate the
+ // instruction before considering the dest reg to be changed.
+ {
+ unsigned Src, Dst, SrcSR, DstSR;
+ if (TII->isMoveInstr(MI, Src, Dst, SrcSR, DstSR) && Src == Dst) {
+ ++NumDCE;
+ DOUT << "Removing now-noop copy: " << MI;
+ InvalidateKills(MI, TRI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+ Erased = true;
+ UpdateKills(*LastStore, TRI, RegKills, KillOps);
+ goto ProcessNextInst;
+ }
+ }
+ }
+ }
+ ProcessNextInst:
+ DistanceMap.insert(std::make_pair(&MI, Dist++));
+ if (!Erased && !BackTracked) {
+ for (MachineBasicBlock::iterator II = &MI; II != NextMII; ++II)
+ UpdateKills(*II, TRI, RegKills, KillOps);
+ }
+ MII = NextMII;
+ }
+
+ }
+
+};
+
+llvm::VirtRegRewriter* llvm::createVirtRegRewriter() {
+ switch (RewriterOpt) {
+ default: assert(0 && "Unreachable!");
+ case local:
+ return new LocalRewriter();
+ case simple:
+ return new SimpleRewriter();
+ case trivial:
+ return new TrivialRewriter();
+ }
+}
diff --git a/lib/CodeGen/VirtRegRewriter.h b/lib/CodeGen/VirtRegRewriter.h
new file mode 100644
index 0000000..bc830f7
--- /dev/null
+++ b/lib/CodeGen/VirtRegRewriter.h
@@ -0,0 +1,55 @@
+//===-- llvm/CodeGen/VirtRegRewriter.h - VirtRegRewriter -*- C++ -*--------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_VIRTREGREWRITER_H
+#define LLVM_CODEGEN_VIRTREGREWRITER_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "VirtRegMap.h"
+#include <map>
+
+// TODO:
+// - Finish renaming Spiller -> Rewriter
+// - SimpleSpiller
+// - LocalSpiller
+
+namespace llvm {
+
+ /// VirtRegRewriter interface: Implementations of this interface assign
+ /// spilled virtual registers to stack slots, rewriting the code.
+ struct VirtRegRewriter {
+ virtual ~VirtRegRewriter();
+ virtual bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM,
+ LiveIntervals* LIs) = 0;
+ };
+
+ /// createVirtRegRewriter - Create and return a rewriter object, as specified
+ /// on the command line.
+ VirtRegRewriter* createVirtRegRewriter();
+
+}
+
+#endif
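Usage sketch (annotation between patches, not applied by the diff): the factory above is the sole entry point to this interface. A minimal sketch of a caller, assuming it already owns a populated VirtRegMap and LiveIntervals; the helper name runRewriter is hypothetical:

    // Minimal sketch; ownership of the returned rewriter passes to the caller.
    #include "VirtRegRewriter.h"

    static bool runRewriter(llvm::MachineFunction &MF, llvm::VirtRegMap &VRM,
                            llvm::LiveIntervals *LIs) {
      // Picks the local/simple/trivial implementation selected on the
      // command line (the RewriterOpt option in VirtRegRewriter.cpp).
      llvm::VirtRegRewriter *RW = llvm::createVirtRegRewriter();
      bool Changed = RW->runOnMachineFunction(MF, VRM, LIs);
      delete RW;
      return Changed;
    }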
diff --git a/lib/CompilerDriver/Action.cpp b/lib/CompilerDriver/Action.cpp
new file mode 100644
index 0000000..c0a1b84
--- /dev/null
+++ b/lib/CompilerDriver/Action.cpp
@@ -0,0 +1,78 @@
+//===--- Action.cpp - The LLVM Compiler Driver ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open
+// Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Action class - implementation and auxiliary functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CompilerDriver/Action.h"
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/System/Program.h"
+
+#include <iostream>
+#include <stdexcept>
+
+using namespace llvm;
+using namespace llvmc;
+
+extern cl::opt<bool> DryRun;
+extern cl::opt<bool> VerboseMode;
+
+namespace {
+ int ExecuteProgram(const std::string& name,
+ const StrVector& args) {
+ sys::Path prog = sys::Program::FindProgramByName(name);
+
+ if (prog.isEmpty())
+ throw std::runtime_error("Can't find program '" + name + "'");
+ if (!prog.canExecute())
+ throw std::runtime_error("Program '" + name + "' is not executable.");
+
+ // Build the command line vector and the redirects array.
+ const sys::Path* redirects[3] = {0,0,0};
+ sys::Path stdout_redirect;
+
+ std::vector<const char*> argv;
+ argv.reserve(args.size() + 2);
+ argv.push_back(name.c_str());
+
+ for (StrVector::const_iterator B = args.begin(), E = args.end();
+ B!=E; ++B) {
+ if (*B == ">") {
+ ++B;
+ stdout_redirect.set(*B);
+ redirects[1] = &stdout_redirect;
+ }
+ else {
+ argv.push_back((*B).c_str());
+ }
+ }
+ argv.push_back(0); // null terminate list.
+
+ // Invoke the program.
+ return sys::Program::ExecuteAndWait(prog, &argv[0], 0, &redirects[0]);
+ }
+
+ void print_string (const std::string& str) {
+ std::cerr << str << ' ';
+ }
+}
+
+int llvmc::Action::Execute() const {
+ if (DryRun || VerboseMode) {
+ std::cerr << Command_ << " ";
+ std::for_each(Args_.begin(), Args_.end(), print_string);
+ std::cerr << '\n';
+ }
+ if (DryRun)
+ return 0;
+ else
+ return ExecuteProgram(Command_, Args_);
+}
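Usage sketch (annotation between patches, not applied by the diff): Execute() runs Command_ with Args_, honoring the '>' stdout-redirection convention implemented in ExecuteProgram() above. The Action constructor signature and the StrVector typedef are assumed from the members and parameter types used in this file; the tool and file names are illustrative:

    // Minimal sketch, assuming Action(command, args) fills Command_/Args_.
    #include "llvm/CompilerDriver/Action.h"

    int runExample() {
      llvmc::StrVector Args;
      Args.push_back("-c");
      Args.push_back("hello.c");
      Args.push_back(">");          // redirect the tool's stdout...
      Args.push_back("hello.log");  // ...into the following file
      llvmc::Action A("gcc", Args);
      return A.Execute();           // tool's exit code; 0 when DryRun is set
    }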
diff --git a/lib/CompilerDriver/CMakeLists.txt b/lib/CompilerDriver/CMakeLists.txt
new file mode 100644
index 0000000..153dd44
--- /dev/null
+++ b/lib/CompilerDriver/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS support system)
+set(LLVM_REQUIRES_EH 1)
+
+add_llvm_tool(llvmc
+ Action.cpp
+ CompilationGraph.cpp
+ llvmc.cpp
+ Plugin.cpp
+ Tool.cpp
+ )
diff --git a/lib/CompilerDriver/CompilationGraph.cpp b/lib/CompilerDriver/CompilationGraph.cpp
new file mode 100644
index 0000000..dece4e8
--- /dev/null
+++ b/lib/CompilerDriver/CompilationGraph.cpp
@@ -0,0 +1,536 @@
+//===--- CompilationGraph.cpp - The LLVM Compiler Driver --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open
+// Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Compilation graph - implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CompilerDriver/CompilationGraph.h"
+#include "llvm/CompilerDriver/Error.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/GraphWriter.h"
+
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <queue>
+#include <stdexcept>
+
+using namespace llvm;
+using namespace llvmc;
+
+extern cl::list<std::string> InputFilenames;
+extern cl::list<std::string> Languages;
+
+namespace llvmc {
+
+ const std::string& LanguageMap::GetLanguage(const sys::Path& File) const {
+ LanguageMap::const_iterator Lang = this->find(File.getSuffix());
+ if (Lang == this->end())
+ throw std::runtime_error("Unknown suffix: " + File.getSuffix());
+ return Lang->second;
+ }
+}
+
+namespace {
+
+ /// ChooseEdge - Return the edge with the maximum weight.
+ template <class C>
+ const Edge* ChooseEdge(const C& EdgesContainer,
+ const InputLanguagesSet& InLangs,
+ const std::string& NodeName = "root") {
+ const Edge* MaxEdge = 0;
+ unsigned MaxWeight = 0;
+ bool SingleMax = true;
+
+ for (typename C::const_iterator B = EdgesContainer.begin(),
+ E = EdgesContainer.end(); B != E; ++B) {
+ const Edge* e = B->getPtr();
+ unsigned EW = e->Weight(InLangs);
+ if (EW > MaxWeight) {
+ MaxEdge = e;
+ MaxWeight = EW;
+ SingleMax = true;
+ } else if (EW == MaxWeight) {
+ SingleMax = false;
+ }
+ }
+
+ if (!SingleMax)
+ throw std::runtime_error("Node " + NodeName +
+ ": multiple maximal outward edges found!"
+ " Most probably a specification error.");
+ if (!MaxEdge)
+ throw std::runtime_error("Node " + NodeName +
+ ": no maximal outward edge found!"
+ " Most probably a specification error.");
+ return MaxEdge;
+ }
+
+}
+
+void Node::AddEdge(Edge* Edg) {
+ // If there already was an edge between two nodes, modify it instead
+ // of adding a new edge.
+ const std::string& ToolName = Edg->ToolName();
+ for (container_type::iterator B = OutEdges.begin(), E = OutEdges.end();
+ B != E; ++B) {
+ if ((*B)->ToolName() == ToolName) {
+ llvm::IntrusiveRefCntPtr<Edge>(Edg).swap(*B);
+ return;
+ }
+ }
+ OutEdges.push_back(llvm::IntrusiveRefCntPtr<Edge>(Edg));
+}
+
+CompilationGraph::CompilationGraph() {
+ NodesMap["root"] = Node(this);
+}
+
+Node& CompilationGraph::getNode(const std::string& ToolName) {
+ nodes_map_type::iterator I = NodesMap.find(ToolName);
+ if (I == NodesMap.end())
+ throw std::runtime_error("Node " + ToolName + " is not in the graph");
+ return I->second;
+}
+
+const Node& CompilationGraph::getNode(const std::string& ToolName) const {
+ nodes_map_type::const_iterator I = NodesMap.find(ToolName);
+ if (I == NodesMap.end())
+ throw std::runtime_error("Node " + ToolName + " is not in the graph!");
+ return I->second;
+}
+
+// Find the tools list corresponding to the given language name.
+const CompilationGraph::tools_vector_type&
+CompilationGraph::getToolsVector(const std::string& LangName) const
+{
+ tools_map_type::const_iterator I = ToolsMap.find(LangName);
+ if (I == ToolsMap.end())
+ throw std::runtime_error("No tool corresponding to the language "
+ + LangName + " found");
+ return I->second;
+}
+
+void CompilationGraph::insertNode(Tool* V) {
+ if (NodesMap.count(V->Name()) == 0)
+ NodesMap[V->Name()] = Node(this, V);
+}
+
+void CompilationGraph::insertEdge(const std::string& A, Edge* Edg) {
+ Node& B = getNode(Edg->ToolName());
+ if (A == "root") {
+ const char** InLangs = B.ToolPtr->InputLanguages();
+ for (;*InLangs; ++InLangs)
+ ToolsMap[*InLangs].push_back(IntrusiveRefCntPtr<Edge>(Edg));
+ NodesMap["root"].AddEdge(Edg);
+ }
+ else {
+ Node& N = getNode(A);
+ N.AddEdge(Edg);
+ }
+ // Increase the inward edge counter.
+ B.IncrInEdges();
+}
+
+// Pass input file through the chain until we bump into a Join node or
+// a node that says that it is the last.
+void CompilationGraph::PassThroughGraph (const sys::Path& InFile,
+ const Node* StartNode,
+ const InputLanguagesSet& InLangs,
+ const sys::Path& TempDir,
+ const LanguageMap& LangMap) const {
+ sys::Path In = InFile;
+ const Node* CurNode = StartNode;
+
+ while (true) {
+ Tool* CurTool = CurNode->ToolPtr.getPtr();
+
+ if (CurTool->IsJoin()) {
+ JoinTool& JT = dynamic_cast<JoinTool&>(*CurTool);
+ JT.AddToJoinList(In);
+ break;
+ }
+
+ Action CurAction = CurTool->GenerateAction(In, CurNode->HasChildren(),
+ TempDir, InLangs, LangMap);
+
+ if (int ret = CurAction.Execute())
+ throw error_code(ret);
+
+ if (CurAction.StopCompilation())
+ return;
+
+ CurNode = &getNode(ChooseEdge(CurNode->OutEdges,
+ InLangs,
+ CurNode->Name())->ToolName());
+ In = CurAction.OutFile();
+ }
+}
+
+// Find the head of the toolchain corresponding to the given file.
+// Also, insert an input language into InLangs.
+const Node* CompilationGraph::
+FindToolChain(const sys::Path& In, const std::string* ForceLanguage,
+ InputLanguagesSet& InLangs, const LanguageMap& LangMap) const {
+
+ // Determine the input language.
+ const std::string& InLanguage =
+ ForceLanguage ? *ForceLanguage : LangMap.GetLanguage(In);
+
+ // Add the current input language to the input language set.
+ InLangs.insert(InLanguage);
+
+ // Find the toolchain for the input language.
+ const tools_vector_type& TV = getToolsVector(InLanguage);
+ if (TV.empty())
+ throw std::runtime_error("No toolchain corresponding to language "
+ + InLanguage + " found");
+ return &getNode(ChooseEdge(TV, InLangs)->ToolName());
+}
+
+// Helper function used by Build().
+// Traverses initial portions of the toolchains (up to the first Join node).
+// This function is also responsible for handling the -x option.
+void CompilationGraph::BuildInitial (InputLanguagesSet& InLangs,
+ const sys::Path& TempDir,
+ const LanguageMap& LangMap) {
+ // This is related to -x option handling.
+ cl::list<std::string>::const_iterator xIter = Languages.begin(),
+ xBegin = xIter, xEnd = Languages.end();
+ bool xEmpty = true;
+ const std::string* xLanguage = 0;
+ unsigned xPos = 0, xPosNext = 0, filePos = 0;
+
+ if (xIter != xEnd) {
+ xEmpty = false;
+ xPos = Languages.getPosition(xIter - xBegin);
+ cl::list<std::string>::const_iterator xNext = llvm::next(xIter);
+ xPosNext = (xNext == xEnd) ? std::numeric_limits<unsigned>::max()
+ : Languages.getPosition(xNext - xBegin);
+ xLanguage = (*xIter == "none") ? 0 : &(*xIter);
+ }
+
+ // For each input file:
+ for (cl::list<std::string>::const_iterator B = InputFilenames.begin(),
+ CB = B, E = InputFilenames.end(); B != E; ++B) {
+ sys::Path In = sys::Path(*B);
+
+ // Code for handling the -x option.
+ // Output: std::string* xLanguage (can be NULL).
+ if (!xEmpty) {
+ filePos = InputFilenames.getPosition(B - CB);
+
+ if (xPos < filePos) {
+ if (filePos < xPosNext) {
+ xLanguage = (*xIter == "none") ? 0 : &(*xIter);
+ }
+ else { // filePos >= xPosNext
+ // Skip xIters while filePos > xPosNext
+ while (filePos > xPosNext) {
+ ++xIter;
+ xPos = xPosNext;
+
+ cl::list<std::string>::const_iterator xNext = llvm::next(xIter);
+ if (xNext == xEnd)
+ xPosNext = std::numeric_limits<unsigned>::max();
+ else
+ xPosNext = Languages.getPosition(xNext - xBegin);
+ xLanguage = (*xIter == "none") ? 0 : &(*xIter);
+ }
+ }
+ }
+ }
+
+ // Find the toolchain corresponding to this file.
+ const Node* N = FindToolChain(In, xLanguage, InLangs, LangMap);
+ // Pass file through the chain starting at head.
+ PassThroughGraph(In, N, InLangs, TempDir, LangMap);
+ }
+}
+
+// Sort the nodes in topological order.
+void CompilationGraph::TopologicalSort(std::vector<const Node*>& Out) {
+ std::queue<const Node*> Q;
+ Q.push(&getNode("root"));
+
+ while (!Q.empty()) {
+ const Node* A = Q.front();
+ Q.pop();
+ Out.push_back(A);
+ for (Node::const_iterator EB = A->EdgesBegin(), EE = A->EdgesEnd();
+ EB != EE; ++EB) {
+ Node* B = &getNode((*EB)->ToolName());
+ B->DecrInEdges();
+ if (B->HasNoInEdges())
+ Q.push(B);
+ }
+ }
+}
+
+namespace {
+ bool NotJoinNode(const Node* N) {
+ return N->ToolPtr ? !N->ToolPtr->IsJoin() : true;
+ }
+}
+
+// Call TopologicalSort and filter the resulting list to include
+// only Join nodes.
+void CompilationGraph::
+TopologicalSortFilterJoinNodes(std::vector<const Node*>& Out) {
+ std::vector<const Node*> TopSorted;
+ TopologicalSort(TopSorted);
+ std::remove_copy_if(TopSorted.begin(), TopSorted.end(),
+ std::back_inserter(Out), NotJoinNode);
+}
+
+int CompilationGraph::Build (const sys::Path& TempDir,
+ const LanguageMap& LangMap) {
+
+ InputLanguagesSet InLangs;
+
+ // Traverse initial parts of the toolchains and fill in InLangs.
+ BuildInitial(InLangs, TempDir, LangMap);
+
+ std::vector<const Node*> JTV;
+ TopologicalSortFilterJoinNodes(JTV);
+
+ // For all join nodes in topological order:
+ for (std::vector<const Node*>::iterator B = JTV.begin(), E = JTV.end();
+ B != E; ++B) {
+
+ const Node* CurNode = *B;
+ JoinTool* JT = &dynamic_cast<JoinTool&>(*CurNode->ToolPtr.getPtr());
+
+ // Are there any files in the join list?
+ if (JT->JoinListEmpty())
+ continue;
+
+ Action CurAction = JT->GenerateAction(CurNode->HasChildren(),
+ TempDir, InLangs, LangMap);
+
+ if (int ret = CurAction.Execute())
+ throw error_code(ret);
+
+ if (CurAction.StopCompilation())
+ return 0;
+
+ const Node* NextNode = &getNode(ChooseEdge(CurNode->OutEdges, InLangs,
+ CurNode->Name())->ToolName());
+ PassThroughGraph(sys::Path(CurAction.OutFile()), NextNode,
+ InLangs, TempDir, LangMap);
+ }
+
+ return 0;
+}
+
+int CompilationGraph::CheckLanguageNames() const {
+ int ret = 0;
+ // Check that names for output and input languages on all edges do match.
+ for (const_nodes_iterator B = this->NodesMap.begin(),
+ E = this->NodesMap.end(); B != E; ++B) {
+
+ const Node & N1 = B->second;
+ if (N1.ToolPtr) {
+ for (Node::const_iterator EB = N1.EdgesBegin(), EE = N1.EdgesEnd();
+ EB != EE; ++EB) {
+ const Node& N2 = this->getNode((*EB)->ToolName());
+
+ if (!N2.ToolPtr) {
+ ++ret;
+ std::cerr << "Error: there is an edge from '" << N1.ToolPtr->Name()
+ << "' back to the root!\n\n";
+ continue;
+ }
+
+ const char* OutLang = N1.ToolPtr->OutputLanguage();
+ const char** InLangs = N2.ToolPtr->InputLanguages();
+ bool eq = false;
+ for (;*InLangs; ++InLangs) {
+ if (std::strcmp(OutLang, *InLangs) == 0) {
+ eq = true;
+ break;
+ }
+ }
+
+ if (!eq) {
+ ++ret;
+ std::cerr << "Error: Output->input language mismatch in the edge '" <<
+ N1.ToolPtr->Name() << "' -> '" << N2.ToolPtr->Name() << "'!\n";
+
+ std::cerr << "Expected one of { ";
+
+ InLangs = N2.ToolPtr->InputLanguages();
+ for (;*InLangs; ++InLangs) {
+ std::cerr << '\'' << *InLangs << (*(InLangs+1) ? "', " : "'");
+ }
+
+ std::cerr << " }, but got '" << OutLang << "'!\n\n";
+ }
+
+ }
+ }
+ }
+
+ return ret;
+}
+
+int CompilationGraph::CheckMultipleDefaultEdges() const {
+ int ret = 0;
+ InputLanguagesSet Dummy;
+
+ // For all nodes, just iterate over the outgoing edges and check if there is
+ // more than one edge with maximum weight.
+ for (const_nodes_iterator B = this->NodesMap.begin(),
+ E = this->NodesMap.end(); B != E; ++B) {
+ const Node& N = B->second;
+ unsigned MaxWeight = 0;
+
+ // Ignore the root node.
+ if (!N.ToolPtr)
+ continue;
+
+ for (Node::const_iterator EB = N.EdgesBegin(), EE = N.EdgesEnd();
+ EB != EE; ++EB) {
+ unsigned EdgeWeight = (*EB)->Weight(Dummy);
+ if (EdgeWeight > MaxWeight) {
+ MaxWeight = EdgeWeight;
+ }
+ else if (EdgeWeight == MaxWeight) {
+ ++ret;
+ std::cerr
+ << "Error: there are multiple maximal edges stemming from the '"
+ << N.ToolPtr->Name() << "' node!\n\n";
+ break;
+ }
+ }
+ }
+
+ return ret;
+}
+
+int CompilationGraph::CheckCycles() {
+ unsigned deleted = 0;
+ std::queue<Node*> Q;
+ Q.push(&getNode("root"));
+
+ // Try to delete all nodes that have no incoming edges, starting from the
+ // root. If there are any nodes left after this operation, then we have a
+ // cycle. This relies on '--check-graph' not performing the topological sort.
+ while (!Q.empty()) {
+ Node* A = Q.front();
+ Q.pop();
+ ++deleted;
+
+ for (Node::iterator EB = A->EdgesBegin(), EE = A->EdgesEnd();
+ EB != EE; ++EB) {
+ Node* B = &getNode((*EB)->ToolName());
+ B->DecrInEdges();
+ if (B->HasNoInEdges())
+ Q.push(B);
+ }
+ }
+
+ if (deleted != NodesMap.size()) {
+ std::cerr << "Error: there are cycles in the compilation graph!\n"
+ << "Try inspecting the diagram produced by "
+ "'llvmc --view-graph'.\n\n";
+ return 1;
+ }
+
+ return 0;
+}
+
+int CompilationGraph::Check () {
+ // We try to catch as many errors as we can in one go.
+ int ret = 0;
+
+ // Check that output/input language names match.
+ ret += this->CheckLanguageNames();
+
+ // Check for multiple default edges.
+ ret += this->CheckMultipleDefaultEdges();
+
+ // Check for cycles.
+ ret += this->CheckCycles();
+
+ return ret;
+}
+
+// Code related to graph visualization.
+
+namespace llvm {
+ template <>
+ struct DOTGraphTraits<llvmc::CompilationGraph*>
+ : public DefaultDOTGraphTraits
+ {
+
+ template<typename GraphType>
+ static std::string getNodeLabel(const Node* N, const GraphType&)
+ {
+ if (N->ToolPtr)
+ if (N->ToolPtr->IsJoin())
+ return N->Name() + "\n (join" +
+ (N->HasChildren() ? ")"
+ : std::string(": ") + N->ToolPtr->OutputLanguage() + ')');
+ else
+ return N->Name();
+ else
+ return "root";
+ }
+
+ template<typename EdgeIter>
+ static std::string getEdgeSourceLabel(const Node* N, EdgeIter I) {
+ if (N->ToolPtr) {
+ return N->ToolPtr->OutputLanguage();
+ }
+ else {
+ const char** InLangs = I->ToolPtr->InputLanguages();
+ std::string ret;
+
+ for (; *InLangs; ++InLangs) {
+ if (*(InLangs + 1)) {
+ ret += *InLangs;
+ ret += ", ";
+ }
+ else {
+ ret += *InLangs;
+ }
+ }
+
+ return ret;
+ }
+ }
+ };
+
+}
+
+void CompilationGraph::writeGraph(const std::string& OutputFilename) {
+ std::ofstream O(OutputFilename.c_str());
+
+ if (O.good()) {
+ std::cerr << "Writing '"<< OutputFilename << "' file...";
+ llvm::WriteGraph(O, this);
+ std::cerr << "done.\n";
+ O.close();
+ }
+ else {
+ throw std::runtime_error("Error opening file '" + OutputFilename
+ + "' for writing!");
+ }
+}
+
+void CompilationGraph::viewGraph() {
+ llvm::ViewGraph(this, "compilation-graph");
+}
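Usage sketch (annotation between patches, not applied by the diff): building and validating a graph by hand with the public pieces above. The Tool instances are supplied by the caller, and SimpleEdge is assumed to be the weight-1 Edge subclass declared in CompilationGraph.h:

    // Minimal sketch: wire root -> compiler -> linker, then run the same
    // checks '--check-graph' uses.
    #include "llvm/CompilerDriver/CompilationGraph.h"

    int checkToolchain(llvmc::Tool *Compiler, llvmc::Tool *Linker) {
      llvmc::CompilationGraph G;
      G.insertNode(Compiler);   // no-op if a node for this tool exists
      G.insertNode(Linker);
      // An edge from "root" also registers the target tool for each of
      // its input languages (see insertEdge() above).
      G.insertEdge("root", new llvmc::SimpleEdge(Compiler->Name()));
      G.insertEdge(Compiler->Name(), new llvmc::SimpleEdge(Linker->Name()));
      return G.Check();  // number of problems found; 0 means the graph is sane
    }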
diff --git a/lib/CompilerDriver/Makefile b/lib/CompilerDriver/Makefile
new file mode 100644
index 0000000..e5bf3e1
--- /dev/null
+++ b/lib/CompilerDriver/Makefile
@@ -0,0 +1,19 @@
+##===- lib/CompilerDriver/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open
+# Source License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+
+# We don't want this library to appear in `llvm-config --libs` output, so its
+# name doesn't start with "LLVM".
+
+LIBRARYNAME = CompilerDriver
+LINK_COMPONENTS = support system
+REQUIRES_EH := 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/CompilerDriver/Plugin.cpp b/lib/CompilerDriver/Plugin.cpp
new file mode 100644
index 0000000..75abbd0
--- /dev/null
+++ b/lib/CompilerDriver/Plugin.cpp
@@ -0,0 +1,73 @@
+//===--- Plugin.cpp - The LLVM Compiler Driver ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open
+// Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Plugin support.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CompilerDriver/Plugin.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace {
+
+ // Registry::Add<> does not do lifetime management (probably issues
+ // with static constructor/destructor ordering), so we have to
+ // implement it here.
+ //
+ // All this static registration/life-before-main model seems
+ // unnecessarily convoluted to me.
+
+ static bool pluginListInitialized = false;
+ typedef std::vector<const llvmc::BasePlugin*> PluginList;
+ static PluginList Plugins;
+
+ struct ByPriority {
+ bool operator()(const llvmc::BasePlugin* lhs,
+ const llvmc::BasePlugin* rhs) {
+ return lhs->Priority() < rhs->Priority();
+ }
+ };
+}
+
+namespace llvmc {
+
+ PluginLoader::PluginLoader() {
+ if (!pluginListInitialized) {
+ for (PluginRegistry::iterator B = PluginRegistry::begin(),
+ E = PluginRegistry::end(); B != E; ++B)
+ Plugins.push_back(B->instantiate());
+ std::sort(Plugins.begin(), Plugins.end(), ByPriority());
+ }
+ pluginListInitialized = true;
+ }
+
+ PluginLoader::~PluginLoader() {
+ if (pluginListInitialized) {
+ for (PluginList::iterator B = Plugins.begin(), E = Plugins.end();
+ B != E; ++B)
+ delete (*B);
+ }
+ pluginListInitialized = false;
+ }
+
+ void PluginLoader::PopulateLanguageMap(LanguageMap& langMap) {
+ for (PluginList::iterator B = Plugins.begin(), E = Plugins.end();
+ B != E; ++B)
+ (*B)->PopulateLanguageMap(langMap);
+ }
+
+ void PluginLoader::PopulateCompilationGraph(CompilationGraph& graph) {
+ for (PluginList::iterator B = Plugins.begin(), E = Plugins.end();
+ B != E; ++B)
+ (*B)->PopulateCompilationGraph(graph);
+ }
+
+}
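Usage sketch (annotation between patches, not applied by the diff): what a plugin looks like from the loader's point of view. The virtual methods are inferred from the calls PluginLoader makes above; RegisterPlugin<> is assumed to be the PluginRegistry::Add<> wrapper declared in Plugin.h:

    // Minimal sketch of a plugin. Priority() drives the std::sort above;
    // lower-priority plugins populate the map and graph first.
    #include "llvm/CompilerDriver/Plugin.h"

    namespace {
      struct HelloPlugin : public llvmc::BasePlugin {
        int Priority() const { return 10; }
        void PopulateLanguageMap(llvmc::LanguageMap &M) const {
          M["c"] = "c";  // map the ".c" suffix to the "c" language
        }
        void PopulateCompilationGraph(llvmc::CompilationGraph &) const {
          // insertNode()/insertEdge() calls would go here.
        }
      };

      llvmc::RegisterPlugin<HelloPlugin> X("Hello", "toy example plugin");
    }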
diff --git a/lib/CompilerDriver/Tool.cpp b/lib/CompilerDriver/Tool.cpp
new file mode 100644
index 0000000..886b26b
--- /dev/null
+++ b/lib/CompilerDriver/Tool.cpp
@@ -0,0 +1,74 @@
+//===--- Tool.cpp - The LLVM Compiler Driver --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open
+// Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tool base class - implementation details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CompilerDriver/Tool.h"
+
+#include "llvm/System/Path.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+using namespace llvmc;
+
+extern cl::opt<std::string> OutputFilename;
+
+namespace {
+ sys::Path MakeTempFile(const sys::Path& TempDir, const std::string& BaseName,
+ const std::string& Suffix) {
+ sys::Path Out;
+
+ // Make sure we don't end up with path names like '/file.o' if the
+ // TempDir is empty.
+ if (TempDir.empty()) {
+ Out.set(BaseName);
+ }
+ else {
+ Out = TempDir;
+ Out.appendComponent(BaseName);
+ }
+ Out.appendSuffix(Suffix);
+ // NOTE: makeUnique always *creates* a unique temporary file,
+ // which is good, since there will be no races. However, some
+ // tools do not like it when the output file already exists, so
+ // they have to be placated with -f or something like that.
+ Out.makeUnique(true, NULL);
+ return Out;
+ }
+}
+
+sys::Path Tool::OutFilename(const sys::Path& In,
+ const sys::Path& TempDir,
+ bool StopCompilation,
+ const char* OutputSuffix) const {
+ sys::Path Out;
+
+ if (StopCompilation) {
+ if (!OutputFilename.empty()) {
+ Out.set(OutputFilename);
+ }
+ else if (IsJoin()) {
+ Out.set("a");
+ Out.appendSuffix(OutputSuffix);
+ }
+ else {
+ Out.set(In.getBasename());
+ Out.appendSuffix(OutputSuffix);
+ }
+ }
+ else {
+ if (IsJoin())
+ Out = MakeTempFile(TempDir, "tmp", OutputSuffix);
+ else
+ Out = MakeTempFile(TempDir, In.getBasename(), OutputSuffix);
+ }
+ return Out;
+}
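For reference (annotation between patches, not applied by the diff), the four cases OutFilename() distinguishes, taken from the code above; the wrapper function and paths are illustrative only:

    // Minimal sketch of the OutFilename() cases.
    void demo(llvmc::Tool *SomeTool) {
      llvm::sys::Path In("foo.c"), TempDir("/tmp/llvmc");

      // Last step, -o given:       the OutputFilename value, verbatim.
      // Last step, join tool:      "a" + suffix (e.g. "a.out").
      // Last step, ordinary tool:  input basename + suffix ("foo.o").
      llvm::sys::Path FinalOut =
          SomeTool->OutFilename(In, TempDir, /*StopCompilation=*/true, "o");

      // Intermediate step: a unique temp file under TempDir, based on
      // "tmp" for join tools and on the input basename otherwise.
      llvm::sys::Path TempOut =
          SomeTool->OutFilename(In, TempDir, /*StopCompilation=*/false, "o");
    }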
diff --git a/lib/Debugger/CMakeLists.txt b/lib/Debugger/CMakeLists.txt
new file mode 100644
index 0000000..d2508cf
--- /dev/null
+++ b/lib/Debugger/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_llvm_library(LLVMDebugger
+ Debugger.cpp
+ ProgramInfo.cpp
+ RuntimeInfo.cpp
+ SourceFile.cpp
+ SourceLanguage-CFamily.cpp
+ SourceLanguage-CPlusPlus.cpp
+ SourceLanguage-Unknown.cpp
+ SourceLanguage.cpp
+ )
diff --git a/lib/Debugger/Debugger.cpp b/lib/Debugger/Debugger.cpp
new file mode 100644
index 0000000..b12d90a
--- /dev/null
+++ b/lib/Debugger/Debugger.cpp
@@ -0,0 +1,230 @@
+//===-- Debugger.cpp - LLVM debugger library implementation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the main implementation of the LLVM debugger library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Debugger/Debugger.h"
+#include "llvm/Module.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Debugger/InferiorProcess.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/ADT/StringExtras.h"
+#include <cstdlib>
+#include <memory>
+using namespace llvm;
+
+/// Debugger constructor - Initialize the debugger to its initial, empty, state.
+///
+Debugger::Debugger() : Environment(0), Program(0), Process(0) {
+}
+
+Debugger::~Debugger() {
+ // Killing the program could throw an exception. We don't want to propagate
+ // the exception out of our destructor though.
+ try {
+ killProgram();
+ } catch (const char *) {
+ } catch (const std::string &) {
+ }
+
+ unloadProgram();
+}
+
+/// getProgramPath - Get the path of the currently loaded program, or an
+/// empty string if none is loaded.
+std::string Debugger::getProgramPath() const {
+ return Program ? Program->getModuleIdentifier() : "";
+}
+
+static Module *
+getMaterializedModuleProvider(const std::string &Filename) {
+ std::auto_ptr<MemoryBuffer> Buffer;
+ Buffer.reset(MemoryBuffer::getFileOrSTDIN(Filename.c_str()));
+ if (Buffer.get())
+ return ParseBitcodeFile(Buffer.get());
+ return 0;
+}
+
+/// loadProgram - If a program is currently loaded, unload it. Then search
+/// the PATH for the specified program, loading it when found. If the
+/// specified program cannot be found, an exception is thrown to indicate the
+/// error.
+void Debugger::loadProgram(const std::string &Filename) {
+ if ((Program = getMaterializedModuleProvider(Filename)) ||
+ (Program = getMaterializedModuleProvider(Filename+".bc")))
+ return; // Successfully loaded the program.
+
+ // Search the program path for the file...
+ if (const char *PathS = getenv("PATH")) {
+ std::string Path = PathS;
+
+ std::string Directory = getToken(Path, ":");
+ while (!Directory.empty()) {
+ if ((Program = getMaterializedModuleProvider(Directory +"/"+ Filename)) ||
+ (Program = getMaterializedModuleProvider(Directory +"/"+ Filename
+ + ".bc")))
+ return; // Successfully loaded the program.
+
+ Directory = getToken(Path, ":");
+ }
+ }
+
+ throw "Could not find program '" + Filename + "'!";
+}
+
+/// unloadProgram - If a program is running, kill it, then unload all traces
+/// of the current program. If no program is loaded, this method silently
+/// succeeds.
+void Debugger::unloadProgram() {
+ if (!isProgramLoaded()) return;
+ killProgram();
+ delete Program;
+ Program = 0;
+}
+
+
+/// createProgram - Create an instance of the currently loaded program,
+/// killing off any existing one. This creates the program and stops it at
+/// the first possible moment. If there is no program loaded or if there is a
+/// problem starting the program, this method throws an exception.
+void Debugger::createProgram() {
+ if (!isProgramLoaded())
+ throw "Cannot start program: none is loaded.";
+
+ // Kill any existing program.
+ killProgram();
+
+ // Add argv[0] to the arguments vector.
+ std::vector<std::string> Args(ProgramArguments);
+ Args.insert(Args.begin(), getProgramPath());
+
+ // Start the new program... this could throw if the program cannot be started.
+ Process = InferiorProcess::create(Program, Args, Environment);
+}
+
+InferiorProcess *
+InferiorProcess::create(Module *M, const std::vector<std::string> &Arguments,
+ const char * const *envp) {
+ throw"No supported binding to inferior processes (debugger not implemented).";
+}
+
+/// killProgram - If the program is currently executing, kill off the
+/// process and free up any state related to the currently running program. If
+/// there is no program currently running, this just silently succeeds.
+void Debugger::killProgram() {
+ // The destructor takes care of the dirty work.
+ try {
+ delete Process;
+ } catch (...) {
+ Process = 0;
+ throw;
+ }
+ Process = 0;
+}
+
+/// stepProgram - Implement the 'step' command, continuing execution until
+/// the next possible stop point.
+void Debugger::stepProgram() {
+ assert(isProgramRunning() && "Cannot step if the program isn't running!");
+ try {
+ Process->stepProgram();
+ } catch (InferiorProcessDead &IPD) {
+ killProgram();
+ throw NonErrorException("The program stopped with exit code " +
+ itostr(IPD.getExitCode()));
+ } catch (...) {
+ killProgram();
+ throw;
+ }
+}
+
+/// nextProgram - Implement the 'next' command, continuing execution until
+/// the next possible stop point that is in the current function.
+void Debugger::nextProgram() {
+ assert(isProgramRunning() && "Cannot next if the program isn't running!");
+ try {
+ // This should step the process. If the process enters a function, then it
+ // should 'finish' it. However, figuring this out is tricky. In
+ // particular, the program can do any of:
+ // 0. Not change current frame.
+ // 1. Entering or exiting a region within the current function
+ // (which changes the frame ID, but which we shouldn't 'finish')
+ // 2. Exiting the current function (which changes the frame ID)
+ // 3. Entering a function (which should be 'finish'ed)
+ // For this reason, we have to be very careful about when we decide to do
+ // the 'finish'.
+
+ // Get the current frame, but don't trust it. It could change...
+ void *CurrentFrame = Process->getPreviousFrame(0);
+
+ // Don't trust the current frame: get the caller frame.
+ void *ParentFrame = Process->getPreviousFrame(CurrentFrame);
+
+ // Ok, we have some information, run the program one step.
+ Process->stepProgram();
+
+ // Where is the new frame? The most common case, by far is that it has not
+ // been modified (Case #0), in which case we don't need to do anything more.
+ void *NewFrame = Process->getPreviousFrame(0);
+ if (NewFrame != CurrentFrame) {
+ // Ok, the frame changed. If we are case #1, then the parent frame will
+ // be identical.
+ void *NewParentFrame = Process->getPreviousFrame(NewFrame);
+ if (ParentFrame != NewParentFrame) {
+ // Ok, now we know we aren't case #0 or #1. Check to see if we entered
+ // a new function. If so, the parent frame will be "CurrentFrame".
+ if (CurrentFrame == NewParentFrame)
+ Process->finishProgram(NewFrame);
+ }
+ }
+
+ } catch (InferiorProcessDead &IPD) {
+ killProgram();
+ throw NonErrorException("The program stopped with exit code " +
+ itostr(IPD.getExitCode()));
+ } catch (...) {
+ killProgram();
+ throw;
+ }
+}
+
+/// finishProgram - Implement the 'finish' command, continuing execution
+/// until the specified frame ID returns.
+void Debugger::finishProgram(void *Frame) {
+ assert(isProgramRunning() && "Cannot cont if the program isn't running!");
+ try {
+ Process->finishProgram(Frame);
+ } catch (InferiorProcessDead &IPD) {
+ killProgram();
+ throw NonErrorException("The program stopped with exit code " +
+ itostr(IPD.getExitCode()));
+ } catch (...) {
+ killProgram();
+ throw;
+ }
+}
+
+/// contProgram - Implement the 'cont' command, continuing execution until
+/// the next breakpoint is encountered.
+void Debugger::contProgram() {
+ assert(isProgramRunning() && "Cannot cont if the program isn't running!");
+ try {
+ Process->contProgram();
+ } catch (InferiorProcessDead &IPD) {
+ killProgram();
+ throw NonErrorException("The program stopped with exit code " +
+ itostr(IPD.getExitCode()));
+ } catch (...) {
+ killProgram();
+ throw;
+ }
+}
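Usage sketch (annotation between patches, not applied by the diff): a minimal driver over the API above. The catch clauses mirror what this file actually throws: std::string messages and plain string literals:

    // Minimal sketch: load a program, start it, run to the first stop.
    #include "llvm/Debugger/Debugger.h"
    #include <iostream>

    void runToFirstStop(llvm::Debugger &Dbg, const std::string &Prog) {
      try {
        Dbg.loadProgram(Prog);  // tries Prog, Prog.bc, then searches PATH
        Dbg.createProgram();    // kills any old inferior, stops at entry
        Dbg.contProgram();      // continue to the first breakpoint
      } catch (const std::string &Msg) {
        std::cerr << Msg << '\n';
      } catch (const char *Msg) {
        std::cerr << Msg << '\n';
      }
    }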
diff --git a/lib/Debugger/Makefile b/lib/Debugger/Makefile
new file mode 100644
index 0000000..8290e30
--- /dev/null
+++ b/lib/Debugger/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Debugger/Makefile -------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMDebugger
+EXTRA_DIST = README.txt
+REQUIRES_EH := 1
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Debugger/ProgramInfo.cpp b/lib/Debugger/ProgramInfo.cpp
new file mode 100644
index 0000000..125ff55
--- /dev/null
+++ b/lib/Debugger/ProgramInfo.cpp
@@ -0,0 +1,377 @@
+//===-- ProgramInfo.cpp - Compute and cache info about a program ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ProgramInfo and related classes, by sorting through
+// the loaded Module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Debugger/ProgramInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Debugger/SourceFile.h"
+#include "llvm/Debugger/SourceLanguage.h"
+#include "llvm/Support/SlowOperationInformer.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+/// getGlobalVariablesUsing - Return all of the global variables which have the
+/// specified value in their initializer somewhere.
+static void getGlobalVariablesUsing(Value *V,
+ std::vector<GlobalVariable*> &Found) {
+ for (Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(*I))
+ Found.push_back(GV);
+ else if (Constant *C = dyn_cast<Constant>(*I))
+ getGlobalVariablesUsing(C, Found);
+ }
+}
+
+/// getNextStopPoint - Follow the def-use chains of the specified LLVM value,
+/// traversing the use chains until we get to a stoppoint. When we do, return
+/// the source location of the stoppoint. If we don't find a stoppoint, return
+/// null.
+static const GlobalVariable *getNextStopPoint(const Value *V, unsigned &LineNo,
+ unsigned &ColNo) {
+ // The use-def chains can fork. As such, we pick the lowest numbered one we
+ // find.
+ const GlobalVariable *LastDesc = 0;
+ unsigned LastLineNo = ~0;
+ unsigned LastColNo = ~0;
+
+ for (Value::use_const_iterator UI = V->use_begin(), E = V->use_end();
+ UI != E; ++UI) {
+ bool ShouldRecurse = true;
+ if (cast<Instruction>(*UI)->getOpcode() == Instruction::PHI) {
+ // Infinite loops == bad, ignore PHI nodes.
+ ShouldRecurse = false;
+ } else if (const CallInst *CI = dyn_cast<CallInst>(*UI)) {
+
+ // If we found a stop point, check to see if it is earlier than what we
+ // already have. If so, remember it.
+ if (CI->getCalledFunction())
+ if (const DbgStopPointInst *SPI = dyn_cast<DbgStopPointInst>(CI)) {
+ unsigned CurLineNo = SPI->getLine();
+ unsigned CurColNo = SPI->getColumn();
+ const GlobalVariable *CurDesc = 0;
+ const Value *Op = SPI->getContext();
+
+ if ((CurDesc = dyn_cast<GlobalVariable>(Op)) &&
+ (CurLineNo < LastLineNo ||
+ (CurLineNo == LastLineNo && CurColNo < LastColNo))) {
+ LastDesc = CurDesc;
+ LastLineNo = CurLineNo;
+ LastColNo = CurColNo;
+ }
+ ShouldRecurse = false;
+ }
+ }
+
+ // If this is not a phi node or a stopping point, recursively scan the users
+ // of this instruction to skip over region.begin's and the like.
+ if (ShouldRecurse) {
+ unsigned CurLineNo, CurColNo;
+ if (const GlobalVariable *GV = getNextStopPoint(*UI, CurLineNo,CurColNo)){
+ if (CurLineNo < LastLineNo || (CurLineNo == LastLineNo && CurColNo < LastColNo)) {
+ LastDesc = GV;
+ LastLineNo = CurLineNo;
+ LastColNo = CurColNo;
+ }
+ }
+ }
+ }
+
+ if (LastDesc) {
+ LineNo = LastLineNo != ~0U ? LastLineNo : 0;
+ ColNo = LastColNo != ~0U ? LastColNo : 0;
+ }
+ return LastDesc;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SourceFileInfo implementation
+//
+
+SourceFileInfo::SourceFileInfo(const GlobalVariable *Desc,
+ const SourceLanguage &Lang)
+ : Language(&Lang), Descriptor(Desc) {
+ Version = 0;
+ SourceText = 0;
+
+ if (Desc && Desc->hasInitializer())
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Desc->getInitializer()))
+ if (CS->getNumOperands() > 4) {
+ if (ConstantInt *CUI = dyn_cast<ConstantInt>(CS->getOperand(1)))
+ Version = CUI->getZExtValue();
+
+ if (!GetConstantStringInfo(CS->getOperand(3), BaseName))
+ BaseName = "";
+ if (!GetConstantStringInfo(CS->getOperand(4), Directory))
+ Directory = "";
+ }
+}
+
+SourceFileInfo::~SourceFileInfo() {
+ delete SourceText;
+}
+
+SourceFile &SourceFileInfo::getSourceText() const {
+ // FIXME: this should take into account the source search directories!
+ if (SourceText == 0) { // Read the file in if we haven't already.
+ sys::Path tmpPath;
+ if (!Directory.empty())
+ tmpPath.set(Directory);
+ tmpPath.appendComponent(BaseName);
+ if (tmpPath.canRead())
+ SourceText = new SourceFile(tmpPath.toString(), Descriptor);
+ else
+ SourceText = new SourceFile(BaseName, Descriptor);
+ }
+ return *SourceText;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SourceFunctionInfo implementation
+//
+SourceFunctionInfo::SourceFunctionInfo(ProgramInfo &PI,
+ const GlobalVariable *Desc)
+ : Descriptor(Desc) {
+ LineNo = ColNo = 0;
+ if (Desc && Desc->hasInitializer())
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Desc->getInitializer()))
+ if (CS->getNumOperands() > 2) {
+ // Entry #1 is the file descriptor.
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(CS->getOperand(1)))
+ SourceFile = &PI.getSourceFile(GV);
+
+ // Entry #2 is the function name.
+ if (!GetConstantStringInfo(CS->getOperand(2), Name))
+ Name = "";
+ }
+}
+
+/// getSourceLocation - This method returns the location of the first stopping
+/// point in the function.
+void SourceFunctionInfo::getSourceLocation(unsigned &RetLineNo,
+ unsigned &RetColNo) const {
+ // If we haven't computed this yet...
+ if (!LineNo) {
+ // Look at all of the users of the function descriptor, looking for calls to
+ // %llvm.dbg.func.start.
+ for (Value::use_const_iterator UI = Descriptor->use_begin(),
+ E = Descriptor->use_end(); UI != E; ++UI)
+ if (const CallInst *CI = dyn_cast<CallInst>(*UI))
+ if (const Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::dbg_func_start) {
+ // We found the start of the function. Check to see if there are
+ // any stop points on the use-list of the function start.
+ const GlobalVariable *SD = getNextStopPoint(CI, LineNo, ColNo);
+ if (SD) { // We found the first stop point!
+ // This is just a sanity check.
+ if (getSourceFile().getDescriptor() != SD)
+ cout << "WARNING: first line of function is not in the"
+ << " file that the function descriptor claims it is in.\n";
+ break;
+ }
+ }
+ }
+ RetLineNo = LineNo; RetColNo = ColNo;
+}
+
+//===----------------------------------------------------------------------===//
+// ProgramInfo implementation
+//
+
+ProgramInfo::ProgramInfo(Module *m) : M(m), ProgramTimeStamp(0,0) {
+ assert(M && "Cannot create program information with a null module!");
+ sys::PathWithStatus ModPath(M->getModuleIdentifier());
+ const sys::FileStatus *Stat = ModPath.getFileStatus();
+ if (Stat)
+ ProgramTimeStamp = Stat->getTimestamp();
+
+ SourceFilesIsComplete = false;
+ SourceFunctionsIsComplete = false;
+}
+
+ProgramInfo::~ProgramInfo() {
+ // Delete cached information about source program objects...
+ for (std::map<const GlobalVariable*, SourceFileInfo*>::iterator
+ I = SourceFiles.begin(), E = SourceFiles.end(); I != E; ++I)
+ delete I->second;
+ for (std::map<const GlobalVariable*, SourceFunctionInfo*>::iterator
+ I = SourceFunctions.begin(), E = SourceFunctions.end(); I != E; ++I)
+ delete I->second;
+
+ // Delete the source language caches.
+ for (unsigned i = 0, e = LanguageCaches.size(); i != e; ++i)
+ delete LanguageCaches[i].second;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SourceFileInfo tracking...
+//
+
+/// getSourceFile - Return source file information for the specified source file
+/// descriptor object, adding it to the collection as needed. This method
+/// always succeeds (is unambiguous), and is always efficient.
+///
+const SourceFileInfo &
+ProgramInfo::getSourceFile(const GlobalVariable *Desc) {
+ SourceFileInfo *&Result = SourceFiles[Desc];
+ if (Result) return *Result;
+
+ // Figure out what language this source file comes from...
+ unsigned LangID = 0; // Zero is unknown language
+ if (Desc && Desc->hasInitializer())
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Desc->getInitializer()))
+ if (CS->getNumOperands() > 2)
+ if (ConstantInt *CUI = dyn_cast<ConstantInt>(CS->getOperand(2)))
+ LangID = CUI->getZExtValue();
+
+ const SourceLanguage &Lang = SourceLanguage::get(LangID);
+ SourceFileInfo *New = Lang.createSourceFileInfo(Desc, *this);
+
+ // FIXME: this should check to see if there is already a Filename/WorkingDir
+ // pair that matches this one. If so, we shouldn't create the duplicate!
+ //
+ SourceFileIndex.insert(std::make_pair(New->getBaseName(), New));
+ return *(Result = New);
+}
+
+
+/// getSourceFiles - Index all of the source files in the program and return
+/// a mapping of it. This information is lazily computed the first time
+/// that it is requested. Since this information can take a long time to
+/// compute, the user is given a chance to cancel it. If this occurs, an
+/// exception is thrown.
+const std::map<const GlobalVariable*, SourceFileInfo*> &
+ProgramInfo::getSourceFiles(bool RequiresCompleteMap) {
+ // If we have a fully populated map, or if the client doesn't need one, just
+ // return what we have.
+ if (SourceFilesIsComplete || !RequiresCompleteMap)
+ return SourceFiles;
+
+ // Ok, all of the source file descriptors (compile_unit in DWARF terms)
+ // should be on the use list of the llvm.dbg.translation_units global.
+ //
+ GlobalVariable *Units =
+ M->getGlobalVariable("llvm.dbg.translation_units",
+ StructType::get(std::vector<const Type*>()));
+ if (Units == 0)
+ throw "Program contains no debugging information!";
+
+ std::vector<GlobalVariable*> TranslationUnits;
+ getGlobalVariablesUsing(Units, TranslationUnits);
+
+ SlowOperationInformer SOI("building source files index");
+
+ // Loop over all of the translation units found, building the SourceFiles
+ // mapping.
+ for (unsigned i = 0, e = TranslationUnits.size(); i != e; ++i) {
+ getSourceFile(TranslationUnits[i]);
+ if (SOI.progress(i+1, e))
+ throw "While building source files index, operation cancelled.";
+ }
+
+ // Ok, if we got this far, then we indexed the whole program.
+ SourceFilesIsComplete = true;
+ return SourceFiles;
+}
+
+/// getSourceFile - Look up the file with the specified name. If there is
+/// more than one match for the specified filename, prompt the user to pick
+/// one. If there is no source file that matches the specified name, throw
+/// an exception indicating that we can't find the file. Otherwise, return
+/// the file information for that file.
+const SourceFileInfo &ProgramInfo::getSourceFile(const std::string &Filename) {
+ std::multimap<std::string, SourceFileInfo*>::const_iterator Start, End;
+ getSourceFiles();
+ tie(Start, End) = SourceFileIndex.equal_range(Filename);
+
+ if (Start == End) throw "Could not find source file '" + Filename + "'!";
+ const SourceFileInfo &SFI = *Start->second;
+ ++Start;
+ if (Start == End) return SFI;
+
+ throw "FIXME: Multiple source files with the same name not implemented!";
+}
+
+
+//===----------------------------------------------------------------------===//
+// SourceFunctionInfo tracking...
+//
+
+
+/// getFunction - Return function information for the specified function
+/// descriptor object, adding it to the collection as needed. This method
+/// always succeeds (is unambiguous), and is always efficient.
+///
+const SourceFunctionInfo &
+ProgramInfo::getFunction(const GlobalVariable *Desc) {
+ SourceFunctionInfo *&Result = SourceFunctions[Desc];
+ if (Result) return *Result;
+
+ // Figure out what language this function comes from...
+ const GlobalVariable *SourceFileDesc = 0;
+ if (Desc && Desc->hasInitializer())
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Desc->getInitializer()))
+ if (CS->getNumOperands() > 1)
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(CS->getOperand(1)))
+ SourceFileDesc = GV;
+
+ const SourceLanguage &Lang = getSourceFile(SourceFileDesc).getLanguage();
+ return *(Result = Lang.createSourceFunctionInfo(Desc, *this));
+}
+
+
+// getSourceFunctions - Index all of the functions in the program and return
+// them. This information is lazily computed the first time that it is
+// requested. Since this information can take a long time to compute, the user
+// is given a chance to cancel it. If this occurs, an exception is thrown.
+const std::map<const GlobalVariable*, SourceFunctionInfo*> &
+ProgramInfo::getSourceFunctions(bool RequiresCompleteMap) {
+ if (SourceFunctionsIsComplete || !RequiresCompleteMap)
+ return SourceFunctions;
+
+ // Ok, all of the source function descriptors (subprogram in DWARF terms)
+ // should be on the use list of the llvm.dbg.globals global.
+ //
+ GlobalVariable *Units =
+ M->getGlobalVariable("llvm.dbg.globals",
+ StructType::get(std::vector<const Type*>()));
+ if (Units == 0)
+ throw "Program contains no debugging information!";
+
+ std::vector<GlobalVariable*> Functions;
+ getGlobalVariablesUsing(Units, Functions);
+
+ SlowOperationInformer SOI("building functions index");
+
+ // Loop over all of the functions found, building the SourceFunctions mapping.
+ for (unsigned i = 0, e = Functions.size(); i != e; ++i) {
+ getFunction(Functions[i]);
+ if (SOI.progress(i+1, e))
+ throw "While functions index, operation cancelled.";
+ }
+
+ // Ok, if we got this far, then we indexed the whole program.
+ SourceFunctionsIsComplete = true;
+ return SourceFunctions;
+}
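Usage sketch (annotation between patches, not applied by the diff): enumerating the compile units of a module through the lazy index above. This file throws both plain string literals and std::string messages, so the sketch catches both:

    // Minimal sketch: list every source file named by the module's
    // debug information.
    #include "llvm/Debugger/ProgramInfo.h"
    #include <iostream>

    void listCompileUnits(llvm::Module *M) {
      llvm::ProgramInfo PI(M);
      try {
        typedef std::map<const llvm::GlobalVariable*,
                         llvm::SourceFileInfo*> FileMap;
        const FileMap &Files = PI.getSourceFiles();  // builds the index lazily
        for (FileMap::const_iterator I = Files.begin(), E = Files.end();
             I != E; ++I)
          std::cout << I->second->getBaseName() << '\n';
      } catch (const char *Msg) {         // no debug info, or cancelled
        std::cerr << Msg << '\n';
      } catch (const std::string &Msg) {
        std::cerr << Msg << '\n';
      }
    }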
diff --git a/lib/Debugger/README.txt b/lib/Debugger/README.txt
new file mode 100644
index 0000000..89935c5
--- /dev/null
+++ b/lib/Debugger/README.txt
@@ -0,0 +1,7 @@
+//===-- llvm/lib/Debugger/ - LLVM Debugger interfaces ---------------------===//
+
+This directory contains the implementation of the LLVM debugger backend. This
+directory builds into a library which can be used by various debugger
+front-ends to debug LLVM programs. The command-line LLVM debugger, llvm-db,
+is currently the only client of this library, but others could be built, for
+example to provide a GUI front-end.
diff --git a/lib/Debugger/RuntimeInfo.cpp b/lib/Debugger/RuntimeInfo.cpp
new file mode 100644
index 0000000..2f0ff72
--- /dev/null
+++ b/lib/Debugger/RuntimeInfo.cpp
@@ -0,0 +1,69 @@
+//===-- RuntimeInfo.cpp - Compute and cache info about running program ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RuntimeInfo and related classes, by querying and
+// caching information from the running inferior process.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Debugger/InferiorProcess.h"
+#include "llvm/Debugger/ProgramInfo.h"
+#include "llvm/Debugger/RuntimeInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// StackFrame class implementation
+
+StackFrame::StackFrame(RuntimeInfo &ri, void *ParentFrameID)
+ : RI(ri), SourceInfo(0) {
+ FrameID = RI.getInferiorProcess().getPreviousFrame(ParentFrameID);
+ if (FrameID == 0) throw "Stack frame does not exist!";
+
+ // Compute lazily as needed.
+ FunctionDesc = 0;
+}
+
+const GlobalVariable *StackFrame::getFunctionDesc() {
+ if (FunctionDesc == 0)
+ FunctionDesc = RI.getInferiorProcess().getSubprogramDesc(FrameID);
+ return FunctionDesc;
+}
+
+/// getSourceLocation - Return the source location that this stack frame is
+/// sitting at.
+void StackFrame::getSourceLocation(unsigned &lineNo, unsigned &colNo,
+ const SourceFileInfo *&sourceInfo) {
+ if (SourceInfo == 0) {
+ const GlobalVariable *SourceDesc = 0;
+ RI.getInferiorProcess().getFrameLocation(FrameID, LineNo,ColNo, SourceDesc);
+ SourceInfo = &RI.getProgramInfo().getSourceFile(SourceDesc);
+ }
+
+ lineNo = LineNo;
+ colNo = ColNo;
+ sourceInfo = SourceInfo;
+}
+
+//===----------------------------------------------------------------------===//
+// RuntimeInfo class implementation
+
+/// materializeFrame - Create and process all frames up to and including the
+/// specified frame number. This throws an exception if the specified frame
+/// ID is nonexistent.
+void RuntimeInfo::materializeFrame(unsigned ID) {
+ assert(ID >= CallStack.size() && "no need to materialize this frame!");
+ void *CurFrame = 0;
+ if (!CallStack.empty())
+ CurFrame = CallStack.back().getFrameID();
+
+ while (CallStack.size() <= ID) {
+ CallStack.push_back(StackFrame(*this, CurFrame));
+ CurFrame = CallStack.back().getFrameID();
+ }
+}
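Usage sketch (annotation between patches, not applied by the diff): reporting where a frame stopped. getStackFrame() is an assumed RuntimeInfo accessor (the header-side counterpart of materializeFrame() above); everything else comes from this file and ProgramInfo.cpp:

    // Minimal sketch: print "file:line:column" for one stack frame.
    #include "llvm/Debugger/ProgramInfo.h"
    #include "llvm/Debugger/RuntimeInfo.h"
    #include <iostream>

    void printFrameLocation(llvm::RuntimeInfo &RI, unsigned FrameNo) {
      llvm::StackFrame &F = RI.getStackFrame(FrameNo);  // assumed accessor
      unsigned Line, Col;
      const llvm::SourceFileInfo *SFI;
      F.getSourceLocation(Line, Col, SFI);
      std::cout << SFI->getBaseName() << ':' << Line << ':' << Col << '\n';
    }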
diff --git a/lib/Debugger/SourceFile.cpp b/lib/Debugger/SourceFile.cpp
new file mode 100644
index 0000000..03c60f8
--- /dev/null
+++ b/lib/Debugger/SourceFile.cpp
@@ -0,0 +1,82 @@
+//===-- SourceFile.cpp - SourceFile implementation for the debugger -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SourceFile class for the LLVM debugger.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Debugger/SourceFile.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cassert>
+using namespace llvm;
+
+static const char EmptyFile = 0;
+
+SourceFile::SourceFile(const std::string &fn, const GlobalVariable *Desc)
+ : Filename(fn), Descriptor(Desc) {
+ File.reset(MemoryBuffer::getFileOrSTDIN(fn));
+
+ // On error, return an empty buffer.
+ if (File.get() == 0)
+ File.reset(MemoryBuffer::getMemBuffer(&EmptyFile, &EmptyFile));
+}
+
+SourceFile::~SourceFile() {
+}
+
+
+/// calculateLineOffsets - Compute the LineOffset vector for the current file.
+///
+void SourceFile::calculateLineOffsets() const {
+ assert(LineOffset.empty() && "Line offsets already computed!");
+ const char *BufPtr = File->getBufferStart();
+ const char *FileStart = BufPtr;
+ const char *FileEnd = File->getBufferEnd();
+ do {
+ LineOffset.push_back(BufPtr-FileStart);
+
+ // Scan until we get to a newline.
+ while (BufPtr != FileEnd && *BufPtr != '\n' && *BufPtr != '\r')
+ ++BufPtr;
+
+ if (BufPtr != FileEnd) {
+ ++BufPtr; // Skip over the \n or \r
+ if (BufPtr[-1] == '\r' && BufPtr != FileEnd && BufPtr[0] == '\n')
+ ++BufPtr; // Skip over dos/windows style \r\n's
+ }
+ } while (BufPtr != FileEnd);
+}
+
+
+/// getSourceLine - Given a line number, return the start and end of the line
+/// in the file. If the line number is invalid, or if the file could not be
+/// loaded, null pointers are returned for the start and end of the line.  Note
+/// that line numbers start with 0, not 1.
+void SourceFile::getSourceLine(unsigned LineNo, const char *&LineStart,
+ const char *&LineEnd) const {
+ LineStart = LineEnd = 0;
+ if (LineOffset.empty()) calculateLineOffsets();
+
+ // Asking for an out-of-range line number?
+ if (LineNo >= LineOffset.size()) return;
+
+ // Otherwise, they are asking for a valid line, which we can fulfill.
+ LineStart = File->getBufferStart()+LineOffset[LineNo];
+
+ if (LineNo+1 < LineOffset.size())
+ LineEnd = File->getBufferStart()+LineOffset[LineNo+1];
+ else
+ LineEnd = File->getBufferEnd();
+
+ // If the line ended with a newline, strip it off.
+ while (LineEnd != LineStart && (LineEnd[-1] == '\n' || LineEnd[-1] == '\r'))
+ --LineEnd;
+
+ assert(LineEnd >= LineStart && "We somehow got our pointers swizzled!");
+}
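+
+// Illustrative only: a minimal sketch of pulling one line out with
+// getSourceLine; remember that LineNo is zero-based.
+#if 0
+#include <string>
+static std::string getLineAsString(const SourceFile &SF, unsigned LineNo) {
+  const char *Start, *End;
+  SF.getSourceLine(LineNo, Start, End);
+  if (!Start)
+    return std::string();         // invalid line or unreadable file
+  return std::string(Start, End); // trailing newline already stripped
+}
+#endif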
diff --git a/lib/Debugger/SourceLanguage-CFamily.cpp b/lib/Debugger/SourceLanguage-CFamily.cpp
new file mode 100644
index 0000000..f329db4
--- /dev/null
+++ b/lib/Debugger/SourceLanguage-CFamily.cpp
@@ -0,0 +1,28 @@
+//===-- SourceLanguage-CFamily.cpp - C family SourceLanguage impl ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SourceLanguage class for the C family of languages
+// (K&R C, C89, C99, etc).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Debugger/SourceLanguage.h"
+using namespace llvm;
+
+#if 0
+namespace {
+ struct CSL : public SourceLanguage {
+ } TheCSourceLanguageInstance;
+}
+#endif
+
+const SourceLanguage &SourceLanguage::getCFamilyInstance() {
+  return get(0); // We don't have an implementation for C yet; fall back on
+                 // the generic language.
+}
diff --git a/lib/Debugger/SourceLanguage-CPlusPlus.cpp b/lib/Debugger/SourceLanguage-CPlusPlus.cpp
new file mode 100644
index 0000000..ce94ff4
--- /dev/null
+++ b/lib/Debugger/SourceLanguage-CPlusPlus.cpp
@@ -0,0 +1,27 @@
+//===-- SourceLanguage-CPlusPlus.cpp - C++ SourceLanguage impl ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SourceLanguage class for the C++ language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Debugger/SourceLanguage.h"
+using namespace llvm;
+
+#if 0
+namespace {
+ struct CPPSL : public SourceLanguage {
+ } TheCPlusPlusLanguageInstance;
+}
+#endif
+
+const SourceLanguage &SourceLanguage::getCPlusPlusInstance() {
+  return get(0); // We don't have an implementation for C++ yet; fall back on
+                 // the generic language.
+}
diff --git a/lib/Debugger/SourceLanguage-Unknown.cpp b/lib/Debugger/SourceLanguage-Unknown.cpp
new file mode 100644
index 0000000..b806fc7
--- /dev/null
+++ b/lib/Debugger/SourceLanguage-Unknown.cpp
@@ -0,0 +1,138 @@
+//===-- SourceLanguage-Unknown.cpp - Implement itf for unknown languages --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// If the LLVM debugger does not have a module for a particular language, it
+// falls back on using this one to perform the source-language interface. This
+// interface is not wonderful, but it gets the job done.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Debugger/SourceLanguage.h"
+#include "llvm/Debugger/ProgramInfo.h"
+#include "llvm/Support/Streams.h"
+#include <cassert>
+#include <ostream>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Implement the SourceLanguage cache for the Unknown language.
+//
+
+namespace {
+ /// SLUCache - This cache allows for efficient lookup of source functions by
+ /// name.
+ ///
+ struct SLUCache : public SourceLanguageCache {
+ ProgramInfo &PI;
+ std::multimap<std::string, SourceFunctionInfo*> FunctionMap;
+ public:
+ SLUCache(ProgramInfo &pi);
+
+ typedef std::multimap<std::string, SourceFunctionInfo*>::const_iterator
+ fm_iterator;
+
+ std::pair<fm_iterator, fm_iterator>
+ getFunction(const std::string &Name) const {
+ return FunctionMap.equal_range(Name);
+ }
+
+ SourceFunctionInfo *addSourceFunction(SourceFunctionInfo *SF) {
+ FunctionMap.insert(std::make_pair(SF->getSymbolicName(), SF));
+ return SF;
+ }
+ };
+}
+
+SLUCache::SLUCache(ProgramInfo &pi) : PI(pi) {
+}
+
+
+//===----------------------------------------------------------------------===//
+// Implement SourceLanguageUnknown class, which is used to handle unrecognized
+// languages.
+//
+
+namespace {
+ static struct SLU : public SourceLanguage {
+ //===------------------------------------------------------------------===//
+ // Implement the miscellaneous methods...
+ //
+ virtual const char *getSourceLanguageName() const {
+ return "unknown";
+ }
+
+ /// lookupFunction - Given a textual function name, return the
+ /// SourceFunctionInfo descriptor for that function, or null if it cannot be
+ /// found. If the program is currently running, the RuntimeInfo object
+ /// provides information about the current evaluation context, otherwise it
+ /// will be null.
+ ///
+ virtual SourceFunctionInfo *lookupFunction(const std::string &FunctionName,
+ ProgramInfo &PI,
+ RuntimeInfo *RI = 0) const;
+
+ //===------------------------------------------------------------------===//
+ // We do use a cache for information...
+ //
+ typedef SLUCache CacheType;
+ SLUCache *createSourceLanguageCache(ProgramInfo &PI) const {
+ return new SLUCache(PI);
+ }
+
+ /// createSourceFunctionInfo - Create the new object and inform the cache of
+ /// the new function.
+ virtual SourceFunctionInfo *
+ createSourceFunctionInfo(const GlobalVariable *Desc, ProgramInfo &PI) const;
+
+ } TheUnknownSourceLanguageInstance;
+}
+
+const SourceLanguage &SourceLanguage::getUnknownLanguageInstance() {
+ return TheUnknownSourceLanguageInstance;
+}
+
+
+SourceFunctionInfo *
+SLU::createSourceFunctionInfo(const GlobalVariable *Desc,
+ ProgramInfo &PI) const {
+ SourceFunctionInfo *Result = new SourceFunctionInfo(PI, Desc);
+ return PI.getLanguageCache(this).addSourceFunction(Result);
+}
+
+
+/// lookupFunction - Given a textual function name, return the
+/// SourceFunctionInfo descriptor for that function, or null if it cannot be
+/// found. If the program is currently running, the RuntimeInfo object
+/// provides information about the current evaluation context, otherwise it will
+/// be null.
+///
+SourceFunctionInfo *SLU::lookupFunction(const std::string &FunctionName,
+ ProgramInfo &PI, RuntimeInfo *RI) const{
+ SLUCache &Cache = PI.getLanguageCache(this);
+ std::pair<SLUCache::fm_iterator, SLUCache::fm_iterator> IP
+ = Cache.getFunction(FunctionName);
+
+ if (IP.first == IP.second) {
+ if (PI.allSourceFunctionsRead())
+ return 0; // Nothing found
+
+ // Otherwise, we might be able to find the function if we read all of them
+ // in. Do so now.
+ PI.getSourceFunctions();
+ assert(PI.allSourceFunctionsRead() && "Didn't read in all functions?");
+ return lookupFunction(FunctionName, PI, RI);
+ }
+
+ SourceFunctionInfo *Found = IP.first->second;
+ ++IP.first;
+ if (IP.first != IP.second)
+ cout << "Whoa, found multiple functions with the same name. I should"
+ << " ask the user which one to use: FIXME!\n";
+ return Found;
+}
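+
+// Illustrative only: the equal_range pattern used above, shown on a plain
+// std::multimap so the cache lookup can be read in isolation.
+#if 0
+#include <map>
+#include <string>
+static int countEntries(const std::multimap<std::string, int> &M,
+                        const std::string &Key) {
+  typedef std::multimap<std::string, int>::const_iterator iter;
+  std::pair<iter, iter> IP = M.equal_range(Key);
+  int N = 0;
+  for (; IP.first != IP.second; ++IP.first)
+    ++N;  // every iterator in [first, second) shares the key
+  return N;
+}
+#endif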
diff --git a/lib/Debugger/SourceLanguage.cpp b/lib/Debugger/SourceLanguage.cpp
new file mode 100644
index 0000000..4fcc38b
--- /dev/null
+++ b/lib/Debugger/SourceLanguage.cpp
@@ -0,0 +1,54 @@
+//===-- SourceLanguage.cpp - Implement the SourceLanguage class -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SourceLanguage class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Debugger/SourceLanguage.h"
+#include "llvm/Debugger/ProgramInfo.h"
+using namespace llvm;
+
+const SourceLanguage &SourceLanguage::get(unsigned ID) {
+ switch (ID) {
+ case 1: // DW_LANG_C89
+ case 2: // DW_LANG_C
+ case 12: // DW_LANG_C99
+ return getCFamilyInstance();
+
+ case 4: // DW_LANG_C_plus_plus
+ return getCPlusPlusInstance();
+
+ case 3: // DW_LANG_Ada83
+ case 5: // DW_LANG_Cobol74
+ case 6: // DW_LANG_Cobol85
+ case 7: // DW_LANG_Fortran77
+ case 8: // DW_LANG_Fortran90
+ case 9: // DW_LANG_Pascal83
+ case 10: // DW_LANG_Modula2
+ case 11: // DW_LANG_Java
+ case 13: // DW_LANG_Ada95
+ case 14: // DW_LANG_Fortran95
+ default:
+ return getUnknownLanguageInstance();
+ }
+}
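+
+// Illustrative only: a debug info reader would dispatch on the DWARF language
+// code exactly as get() does above; getSourceLanguageName() is assumed to be
+// declared on SourceLanguage.
+#if 0
+static const char *languageName(unsigned DwarfLangCode) {
+  // e.g. 4 (DW_LANG_C_plus_plus) selects the C++ instance; unknown codes
+  // fall back to the generic implementation.
+  return SourceLanguage::get(DwarfLangCode).getSourceLanguageName();
+}
+#endif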
+
+
+SourceFileInfo *
+SourceLanguage::createSourceFileInfo(const GlobalVariable *Desc,
+ ProgramInfo &PI) const {
+ return new SourceFileInfo(Desc, *this);
+}
+
+SourceFunctionInfo *
+SourceLanguage::createSourceFunctionInfo(const GlobalVariable *Desc,
+ ProgramInfo &PI) const {
+ return new SourceFunctionInfo(PI, Desc);
+}
diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt
new file mode 100644
index 0000000..e26b98f
--- /dev/null
+++ b/lib/ExecutionEngine/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_partially_linked_object(LLVMExecutionEngine
+ ExecutionEngine.cpp
+ ExecutionEngineBindings.cpp
+ )
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
new file mode 100644
index 0000000..29a05bb
--- /dev/null
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -0,0 +1,1010 @@
+//===-- ExecutionEngine.cpp - Common Implementation shared by EEs ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common interface used by the various execution engine
+// subclasses.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Config/alloca.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/System/DynamicLibrary.h"
+#include "llvm/System/Host.h"
+#include "llvm/Target/TargetData.h"
+#include <cmath>
+#include <cstring>
+using namespace llvm;
+
+STATISTIC(NumInitBytes, "Number of bytes of global vars initialized");
+STATISTIC(NumGlobals , "Number of global vars initialized");
+
+ExecutionEngine::EECtorFn ExecutionEngine::JITCtor = 0;
+ExecutionEngine::EECtorFn ExecutionEngine::InterpCtor = 0;
+ExecutionEngine::EERegisterFn ExecutionEngine::ExceptionTableRegister = 0;
+
+
+ExecutionEngine::ExecutionEngine(ModuleProvider *P) : LazyFunctionCreator(0) {
+ LazyCompilationDisabled = false;
+ GVCompilationDisabled = false;
+ SymbolSearchingDisabled = false;
+ DlsymStubsEnabled = false;
+ Modules.push_back(P);
+ assert(P && "ModuleProvider is null?");
+}
+
+ExecutionEngine::~ExecutionEngine() {
+ clearAllGlobalMappings();
+ for (unsigned i = 0, e = Modules.size(); i != e; ++i)
+ delete Modules[i];
+}
+
+char* ExecutionEngine::getMemoryForGV(const GlobalVariable* GV) {
+ const Type *ElTy = GV->getType()->getElementType();
+ size_t GVSize = (size_t)getTargetData()->getTypeAllocSize(ElTy);
+ return new char[GVSize];
+}
+
+/// removeModuleProvider - Remove a ModuleProvider from the list of modules.
+/// Releases the Module from the ModuleProvider, materializing it in the
+/// process, and returns the materialized Module.
+Module* ExecutionEngine::removeModuleProvider(ModuleProvider *P,
+ std::string *ErrInfo) {
+  for (SmallVector<ModuleProvider *, 1>::iterator I = Modules.begin(),
+ E = Modules.end(); I != E; ++I) {
+ ModuleProvider *MP = *I;
+ if (MP == P) {
+ Modules.erase(I);
+ clearGlobalMappingsFromModule(MP->getModule());
+ return MP->releaseModule(ErrInfo);
+ }
+ }
+ return NULL;
+}
+
+/// deleteModuleProvider - Remove a ModuleProvider from the list of modules,
+/// and deletes the ModuleProvider and owned Module. Avoids materializing
+/// the underlying module.
+void ExecutionEngine::deleteModuleProvider(ModuleProvider *P,
+ std::string *ErrInfo) {
+  for (SmallVector<ModuleProvider *, 1>::iterator I = Modules.begin(),
+ E = Modules.end(); I != E; ++I) {
+ ModuleProvider *MP = *I;
+ if (MP == P) {
+ Modules.erase(I);
+ clearGlobalMappingsFromModule(MP->getModule());
+ delete MP;
+ return;
+ }
+ }
+}
+
+/// FindFunctionNamed - Search all of the active modules to find the one that
+/// defines FnName. This is a very slow operation and shouldn't be used for
+/// general code.
+Function *ExecutionEngine::FindFunctionNamed(const char *FnName) {
+ for (unsigned i = 0, e = Modules.size(); i != e; ++i) {
+ if (Function *F = Modules[i]->getModule()->getFunction(FnName))
+ return F;
+ }
+ return 0;
+}
+
+
+/// addGlobalMapping - Tell the execution engine that the specified global is
+/// at the specified location. This is used internally as functions are JIT'd
+/// and as global variables are laid out in memory. It can and should also be
+/// used by clients of the EE that want to map an LLVM global onto existing
+/// data in memory.
+void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) {
+ MutexGuard locked(lock);
+
+ DOUT << "JIT: Map \'" << GV->getNameStart() << "\' to [" << Addr << "]\n";
+ void *&CurVal = state.getGlobalAddressMap(locked)[GV];
+ assert((CurVal == 0 || Addr == 0) && "GlobalMapping already established!");
+ CurVal = Addr;
+
+ // If we are using the reverse mapping, add it too
+ if (!state.getGlobalAddressReverseMap(locked).empty()) {
+ const GlobalValue *&V = state.getGlobalAddressReverseMap(locked)[Addr];
+ assert((V == 0 || GV == 0) && "GlobalMapping already established!");
+ V = GV;
+ }
+}
+
+/// clearAllGlobalMappings - Clear all global mappings and start over again;
+/// used in dynamic compilation scenarios when you want to move globals.
+void ExecutionEngine::clearAllGlobalMappings() {
+ MutexGuard locked(lock);
+
+ state.getGlobalAddressMap(locked).clear();
+ state.getGlobalAddressReverseMap(locked).clear();
+}
+
+/// clearGlobalMappingsFromModule - Clear all global mappings that came from a
+/// particular module, because it has been removed from the JIT.
+void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) {
+ MutexGuard locked(lock);
+
+ for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI) {
+ state.getGlobalAddressMap(locked).erase(FI);
+ state.getGlobalAddressReverseMap(locked).erase(FI);
+ }
+ for (Module::global_iterator GI = M->global_begin(), GE = M->global_end();
+ GI != GE; ++GI) {
+ state.getGlobalAddressMap(locked).erase(GI);
+ state.getGlobalAddressReverseMap(locked).erase(GI);
+ }
+}
+
+/// updateGlobalMapping - Replace an existing mapping for GV with a new
+/// address. This updates both maps as required. If "Addr" is null, the
+/// entry for the global is removed from the mappings.
+void *ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, void *Addr) {
+ MutexGuard locked(lock);
+
+ std::map<const GlobalValue*, void *> &Map = state.getGlobalAddressMap(locked);
+
+ // Deleting from the mapping?
+ if (Addr == 0) {
+ std::map<const GlobalValue*, void *>::iterator I = Map.find(GV);
+ void *OldVal;
+ if (I == Map.end())
+ OldVal = 0;
+ else {
+ OldVal = I->second;
+ Map.erase(I);
+ }
+
+ if (!state.getGlobalAddressReverseMap(locked).empty())
+ state.getGlobalAddressReverseMap(locked).erase(Addr);
+ return OldVal;
+ }
+
+ void *&CurVal = Map[GV];
+ void *OldVal = CurVal;
+
+ if (CurVal && !state.getGlobalAddressReverseMap(locked).empty())
+ state.getGlobalAddressReverseMap(locked).erase(CurVal);
+ CurVal = Addr;
+
+ // If we are using the reverse mapping, add it too
+ if (!state.getGlobalAddressReverseMap(locked).empty()) {
+ const GlobalValue *&V = state.getGlobalAddressReverseMap(locked)[Addr];
+ assert((V == 0 || GV == 0) && "GlobalMapping already established!");
+ V = GV;
+ }
+ return OldVal;
+}
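+
+// Illustrative only: a sketch of relocating a global's storage with
+// updateGlobalMapping; EE and GV are assumed to come from elsewhere, and the
+// caller still owns copying the old contents to the new address if needed.
+#if 0
+static void moveGlobal(ExecutionEngine *EE, const GlobalValue *GV,
+                       void *NewAddr) {
+  void *Old = EE->updateGlobalMapping(GV, NewAddr); // returns prior address
+  (void)Old;
+}
+#endif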
+
+/// getPointerToGlobalIfAvailable - This returns the address of the specified
+/// global value if it has already been codegen'd, otherwise it returns null.
+///
+void *ExecutionEngine::getPointerToGlobalIfAvailable(const GlobalValue *GV) {
+ MutexGuard locked(lock);
+
+ std::map<const GlobalValue*, void*>::iterator I =
+ state.getGlobalAddressMap(locked).find(GV);
+ return I != state.getGlobalAddressMap(locked).end() ? I->second : 0;
+}
+
+/// getGlobalValueAtAddress - Return the LLVM global value object that starts
+/// at the specified address.
+///
+const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) {
+ MutexGuard locked(lock);
+
+ // If we haven't computed the reverse mapping yet, do so first.
+ if (state.getGlobalAddressReverseMap(locked).empty()) {
+ for (std::map<const GlobalValue*, void *>::iterator
+ I = state.getGlobalAddressMap(locked).begin(),
+ E = state.getGlobalAddressMap(locked).end(); I != E; ++I)
+ state.getGlobalAddressReverseMap(locked).insert(std::make_pair(I->second,
+ I->first));
+ }
+
+ std::map<void *, const GlobalValue*>::iterator I =
+ state.getGlobalAddressReverseMap(locked).find(Addr);
+ return I != state.getGlobalAddressReverseMap(locked).end() ? I->second : 0;
+}
+
+// CreateArgv - Turn a vector of strings into a nice argv-style array of
+// pointers to null-terminated strings.
+//
+static void *CreateArgv(ExecutionEngine *EE,
+ const std::vector<std::string> &InputArgv) {
+ unsigned PtrSize = EE->getTargetData()->getPointerSize();
+ char *Result = new char[(InputArgv.size()+1)*PtrSize];
+
+ DOUT << "JIT: ARGV = " << (void*)Result << "\n";
+ const Type *SBytePtr = PointerType::getUnqual(Type::Int8Ty);
+
+ for (unsigned i = 0; i != InputArgv.size(); ++i) {
+ unsigned Size = InputArgv[i].size()+1;
+ char *Dest = new char[Size];
+ DOUT << "JIT: ARGV[" << i << "] = " << (void*)Dest << "\n";
+
+ std::copy(InputArgv[i].begin(), InputArgv[i].end(), Dest);
+ Dest[Size-1] = 0;
+
+ // Endian safe: Result[i] = (PointerTy)Dest;
+ EE->StoreValueToMemory(PTOGV(Dest), (GenericValue*)(Result+i*PtrSize),
+ SBytePtr);
+ }
+
+ // Null terminate it
+ EE->StoreValueToMemory(PTOGV(0),
+ (GenericValue*)(Result+InputArgv.size()*PtrSize),
+ SBytePtr);
+ return Result;
+}
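+
+// Illustrative only: the same layout built for the *host*, without the
+// endian-safe StoreValueToMemory indirection the target version above needs.
+#if 0
+#include <cstring>
+#include <string>
+#include <vector>
+static char **createHostArgv(const std::vector<std::string> &Args) {
+  char **Result = new char*[Args.size() + 1];
+  for (unsigned i = 0; i != Args.size(); ++i) {
+    Result[i] = new char[Args[i].size() + 1];
+    std::memcpy(Result[i], Args[i].c_str(), Args[i].size() + 1);
+  }
+  Result[Args.size()] = 0;  // argv is null terminated
+  return Result;
+}
+#endif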
+
+
+/// runStaticConstructorsDestructors - This method is used to execute all of
+/// the static constructors or destructors for a module, depending on the
+/// value of isDtors.
+void ExecutionEngine::runStaticConstructorsDestructors(Module *module,
+                                                       bool isDtors) {
+ const char *Name = isDtors ? "llvm.global_dtors" : "llvm.global_ctors";
+
+ // Execute global ctors/dtors for each module in the program.
+
+ GlobalVariable *GV = module->getNamedGlobal(Name);
+
+ // If this global has internal linkage, or if it has a use, then it must be
+ // an old-style (llvmgcc3) static ctor with __main linked in and in use. If
+ // this is the case, don't execute any of the global ctors, __main will do
+ // it.
+ if (!GV || GV->isDeclaration() || GV->hasLocalLinkage()) return;
+
+ // Should be an array of '{ int, void ()* }' structs. The first value is
+ // the init priority, which we ignore.
+ ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!InitList) return;
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+ if (ConstantStruct *CS =
+ dyn_cast<ConstantStruct>(InitList->getOperand(i))) {
+ if (CS->getNumOperands() != 2) return; // Not array of 2-element structs.
+
+ Constant *FP = CS->getOperand(1);
+ if (FP->isNullValue())
+ break; // Found a null terminator, exit.
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP))
+ if (CE->isCast())
+ FP = CE->getOperand(0);
+ if (Function *F = dyn_cast<Function>(FP)) {
+ // Execute the ctor/dtor function!
+ runFunction(F, std::vector<GenericValue>());
+ }
+ }
+}
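+
+// Illustrative only: the entry shape the loop above expects, written as an
+// equivalent host-side struct (the real array lives in LLVM IR as the
+// '{ int, void ()* }' structs described above).
+#if 0
+struct CtorDtorEntry {
+  int Priority;    // init priority; ignored by the loop above
+  void (*Func)();  // ctor/dtor to run; a null Func terminates the array
+};
+#endif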
+
+/// runStaticConstructorsDestructors - This method is used to execute all of
+/// the static constructors or destructors for a program, depending on the
+/// value of isDtors.
+void ExecutionEngine::runStaticConstructorsDestructors(bool isDtors) {
+ // Execute global ctors/dtors for each module in the program.
+ for (unsigned m = 0, e = Modules.size(); m != e; ++m)
+ runStaticConstructorsDestructors(Modules[m]->getModule(), isDtors);
+}
+
+#ifndef NDEBUG
+/// isTargetNullPtr - Return whether the target pointer stored at Loc is null.
+static bool isTargetNullPtr(ExecutionEngine *EE, void *Loc) {
+ unsigned PtrSize = EE->getTargetData()->getPointerSize();
+ for (unsigned i = 0; i < PtrSize; ++i)
+ if (*(i + (uint8_t*)Loc))
+ return false;
+ return true;
+}
+#endif
+
+/// runFunctionAsMain - This is a helper function which wraps runFunction to
+/// handle the common task of starting up main with the specified argc, argv,
+/// and envp parameters.
+int ExecutionEngine::runFunctionAsMain(Function *Fn,
+ const std::vector<std::string> &argv,
+ const char * const * envp) {
+ std::vector<GenericValue> GVArgs;
+ GenericValue GVArgc;
+ GVArgc.IntVal = APInt(32, argv.size());
+
+ // Check main() type
+ unsigned NumArgs = Fn->getFunctionType()->getNumParams();
+ const FunctionType *FTy = Fn->getFunctionType();
+ const Type* PPInt8Ty =
+ PointerType::getUnqual(PointerType::getUnqual(Type::Int8Ty));
+ switch (NumArgs) {
+ case 3:
+ if (FTy->getParamType(2) != PPInt8Ty) {
+ cerr << "Invalid type for third argument of main() supplied\n";
+ abort();
+ }
+ // FALLS THROUGH
+ case 2:
+ if (FTy->getParamType(1) != PPInt8Ty) {
+ cerr << "Invalid type for second argument of main() supplied\n";
+ abort();
+ }
+ // FALLS THROUGH
+ case 1:
+ if (FTy->getParamType(0) != Type::Int32Ty) {
+ cerr << "Invalid type for first argument of main() supplied\n";
+ abort();
+ }
+ // FALLS THROUGH
+ case 0:
+ if (!isa<IntegerType>(FTy->getReturnType()) &&
+ FTy->getReturnType() != Type::VoidTy) {
+ cerr << "Invalid return type of main() supplied\n";
+ abort();
+ }
+ break;
+ default:
+ cerr << "Invalid number of arguments of main() supplied\n";
+ abort();
+ }
+
+ if (NumArgs) {
+ GVArgs.push_back(GVArgc); // Arg #0 = argc.
+ if (NumArgs > 1) {
+ GVArgs.push_back(PTOGV(CreateArgv(this, argv))); // Arg #1 = argv.
+ assert(!isTargetNullPtr(this, GVTOP(GVArgs[1])) &&
+ "argv[0] was null after CreateArgv");
+ if (NumArgs > 2) {
+ std::vector<std::string> EnvVars;
+ for (unsigned i = 0; envp[i]; ++i)
+ EnvVars.push_back(envp[i]);
+ GVArgs.push_back(PTOGV(CreateArgv(this, EnvVars))); // Arg #2 = envp.
+ }
+ }
+ }
+ return runFunction(Fn, GVArgs).IntVal.getZExtValue();
+}
+
+/// If possible, create a JIT, unless the caller specifically requests an
+/// Interpreter or there's an error. If even an Interpreter cannot be created,
+/// NULL is returned.
+///
+ExecutionEngine *ExecutionEngine::create(ModuleProvider *MP,
+ bool ForceInterpreter,
+ std::string *ErrorStr,
+ CodeGenOpt::Level OptLevel) {
+ ExecutionEngine *EE = 0;
+
+ // Make sure we can resolve symbols in the program as well. The zero arg
+ // to the function tells DynamicLibrary to load the program, not a library.
+ if (sys::DynamicLibrary::LoadLibraryPermanently(0, ErrorStr))
+ return 0;
+
+ // Unless the interpreter was explicitly selected, try making a JIT.
+ if (!ForceInterpreter && JITCtor)
+ EE = JITCtor(MP, ErrorStr, OptLevel);
+
+ // If we can't make a JIT, make an interpreter instead.
+ if (EE == 0 && InterpCtor)
+ EE = InterpCtor(MP, ErrorStr, OptLevel);
+
+ return EE;
+}
+
+ExecutionEngine *ExecutionEngine::create(Module *M) {
+ return create(new ExistingModuleProvider(M));
+}
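+
+// Illustrative only: a sketch of constructing an engine with the fallback
+// made explicit; M is assumed to be a Module built elsewhere.
+#if 0
+static ExecutionEngine *makeEngine(Module *M, std::string *Err) {
+  // ForceInterpreter=false: try the JIT first, fall back to the interpreter.
+  return ExecutionEngine::create(new ExistingModuleProvider(M), false, Err);
+}
+#endif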
+
+/// getPointerToGlobal - This returns the address of the specified global
+/// value. This may involve code generation if it's a function.
+///
+void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) {
+ if (Function *F = const_cast<Function*>(dyn_cast<Function>(GV)))
+ return getPointerToFunction(F);
+
+ MutexGuard locked(lock);
+ void *p = state.getGlobalAddressMap(locked)[GV];
+ if (p)
+ return p;
+
+ // Global variable might have been added since interpreter started.
+ if (GlobalVariable *GVar =
+ const_cast<GlobalVariable *>(dyn_cast<GlobalVariable>(GV)))
+ EmitGlobalVariable(GVar);
+ else
+ assert(0 && "Global hasn't had an address allocated yet!");
+ return state.getGlobalAddressMap(locked)[GV];
+}
+
+/// This function converts a Constant* into a GenericValue. The interesting
+/// part is if C is a ConstantExpr.
+/// @brief Get a GenericValue for a Constant*
+GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
+  // If it's undefined, return garbage.
+ if (isa<UndefValue>(C))
+ return GenericValue();
+
+ // If the value is a ConstantExpr
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ Constant *Op0 = CE->getOperand(0);
+ switch (CE->getOpcode()) {
+ case Instruction::GetElementPtr: {
+ // Compute the index
+ GenericValue Result = getConstantValue(Op0);
+ SmallVector<Value*, 8> Indices(CE->op_begin()+1, CE->op_end());
+ uint64_t Offset =
+ TD->getIndexedOffset(Op0->getType(), &Indices[0], Indices.size());
+
+ char* tmp = (char*) Result.PointerVal;
+ Result = PTOGV(tmp + Offset);
+ return Result;
+ }
+ case Instruction::Trunc: {
+ GenericValue GV = getConstantValue(Op0);
+ uint32_t BitWidth = cast<IntegerType>(CE->getType())->getBitWidth();
+ GV.IntVal = GV.IntVal.trunc(BitWidth);
+ return GV;
+ }
+ case Instruction::ZExt: {
+ GenericValue GV = getConstantValue(Op0);
+ uint32_t BitWidth = cast<IntegerType>(CE->getType())->getBitWidth();
+ GV.IntVal = GV.IntVal.zext(BitWidth);
+ return GV;
+ }
+ case Instruction::SExt: {
+ GenericValue GV = getConstantValue(Op0);
+ uint32_t BitWidth = cast<IntegerType>(CE->getType())->getBitWidth();
+ GV.IntVal = GV.IntVal.sext(BitWidth);
+ return GV;
+ }
+ case Instruction::FPTrunc: {
+ // FIXME long double
+ GenericValue GV = getConstantValue(Op0);
+ GV.FloatVal = float(GV.DoubleVal);
+ return GV;
+ }
+ case Instruction::FPExt:{
+ // FIXME long double
+ GenericValue GV = getConstantValue(Op0);
+ GV.DoubleVal = double(GV.FloatVal);
+ return GV;
+ }
+ case Instruction::UIToFP: {
+ GenericValue GV = getConstantValue(Op0);
+ if (CE->getType() == Type::FloatTy)
+ GV.FloatVal = float(GV.IntVal.roundToDouble());
+ else if (CE->getType() == Type::DoubleTy)
+ GV.DoubleVal = GV.IntVal.roundToDouble();
+ else if (CE->getType() == Type::X86_FP80Ty) {
+ const uint64_t zero[] = {0, 0};
+ APFloat apf = APFloat(APInt(80, 2, zero));
+ (void)apf.convertFromAPInt(GV.IntVal,
+ false,
+ APFloat::rmNearestTiesToEven);
+ GV.IntVal = apf.bitcastToAPInt();
+ }
+ return GV;
+ }
+ case Instruction::SIToFP: {
+ GenericValue GV = getConstantValue(Op0);
+ if (CE->getType() == Type::FloatTy)
+ GV.FloatVal = float(GV.IntVal.signedRoundToDouble());
+ else if (CE->getType() == Type::DoubleTy)
+ GV.DoubleVal = GV.IntVal.signedRoundToDouble();
+ else if (CE->getType() == Type::X86_FP80Ty) {
+ const uint64_t zero[] = { 0, 0};
+ APFloat apf = APFloat(APInt(80, 2, zero));
+ (void)apf.convertFromAPInt(GV.IntVal,
+ true,
+ APFloat::rmNearestTiesToEven);
+ GV.IntVal = apf.bitcastToAPInt();
+ }
+ return GV;
+ }
+ case Instruction::FPToUI: // double->APInt conversion handles sign
+ case Instruction::FPToSI: {
+ GenericValue GV = getConstantValue(Op0);
+ uint32_t BitWidth = cast<IntegerType>(CE->getType())->getBitWidth();
+ if (Op0->getType() == Type::FloatTy)
+ GV.IntVal = APIntOps::RoundFloatToAPInt(GV.FloatVal, BitWidth);
+ else if (Op0->getType() == Type::DoubleTy)
+ GV.IntVal = APIntOps::RoundDoubleToAPInt(GV.DoubleVal, BitWidth);
+ else if (Op0->getType() == Type::X86_FP80Ty) {
+ APFloat apf = APFloat(GV.IntVal);
+ uint64_t v;
+ bool ignored;
+ (void)apf.convertToInteger(&v, BitWidth,
+ CE->getOpcode()==Instruction::FPToSI,
+ APFloat::rmTowardZero, &ignored);
+ GV.IntVal = v; // endian?
+ }
+ return GV;
+ }
+ case Instruction::PtrToInt: {
+ GenericValue GV = getConstantValue(Op0);
+ uint32_t PtrWidth = TD->getPointerSizeInBits();
+ GV.IntVal = APInt(PtrWidth, uintptr_t(GV.PointerVal));
+ return GV;
+ }
+ case Instruction::IntToPtr: {
+ GenericValue GV = getConstantValue(Op0);
+ uint32_t PtrWidth = TD->getPointerSizeInBits();
+ if (PtrWidth != GV.IntVal.getBitWidth())
+ GV.IntVal = GV.IntVal.zextOrTrunc(PtrWidth);
+ assert(GV.IntVal.getBitWidth() <= 64 && "Bad pointer width");
+ GV.PointerVal = PointerTy(uintptr_t(GV.IntVal.getZExtValue()));
+ return GV;
+ }
+ case Instruction::BitCast: {
+ GenericValue GV = getConstantValue(Op0);
+ const Type* DestTy = CE->getType();
+ switch (Op0->getType()->getTypeID()) {
+ default: assert(0 && "Invalid bitcast operand");
+ case Type::IntegerTyID:
+ assert(DestTy->isFloatingPoint() && "invalid bitcast");
+ if (DestTy == Type::FloatTy)
+ GV.FloatVal = GV.IntVal.bitsToFloat();
+ else if (DestTy == Type::DoubleTy)
+ GV.DoubleVal = GV.IntVal.bitsToDouble();
+ break;
+ case Type::FloatTyID:
+ assert(DestTy == Type::Int32Ty && "Invalid bitcast");
+ GV.IntVal.floatToBits(GV.FloatVal);
+ break;
+ case Type::DoubleTyID:
+ assert(DestTy == Type::Int64Ty && "Invalid bitcast");
+ GV.IntVal.doubleToBits(GV.DoubleVal);
+ break;
+ case Type::PointerTyID:
+ assert(isa<PointerType>(DestTy) && "Invalid bitcast");
+ break; // getConstantValue(Op0) above already converted it
+ }
+ return GV;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ GenericValue LHS = getConstantValue(Op0);
+ GenericValue RHS = getConstantValue(CE->getOperand(1));
+ GenericValue GV;
+ switch (CE->getOperand(0)->getType()->getTypeID()) {
+ default: assert(0 && "Bad add type!"); abort();
+ case Type::IntegerTyID:
+ switch (CE->getOpcode()) {
+ default: assert(0 && "Invalid integer opcode");
+ case Instruction::Add: GV.IntVal = LHS.IntVal + RHS.IntVal; break;
+ case Instruction::Sub: GV.IntVal = LHS.IntVal - RHS.IntVal; break;
+ case Instruction::Mul: GV.IntVal = LHS.IntVal * RHS.IntVal; break;
+ case Instruction::UDiv:GV.IntVal = LHS.IntVal.udiv(RHS.IntVal); break;
+ case Instruction::SDiv:GV.IntVal = LHS.IntVal.sdiv(RHS.IntVal); break;
+ case Instruction::URem:GV.IntVal = LHS.IntVal.urem(RHS.IntVal); break;
+ case Instruction::SRem:GV.IntVal = LHS.IntVal.srem(RHS.IntVal); break;
+ case Instruction::And: GV.IntVal = LHS.IntVal & RHS.IntVal; break;
+ case Instruction::Or: GV.IntVal = LHS.IntVal | RHS.IntVal; break;
+ case Instruction::Xor: GV.IntVal = LHS.IntVal ^ RHS.IntVal; break;
+ }
+ break;
+ case Type::FloatTyID:
+ switch (CE->getOpcode()) {
+ default: assert(0 && "Invalid float opcode"); abort();
+ case Instruction::Add:
+ GV.FloatVal = LHS.FloatVal + RHS.FloatVal; break;
+ case Instruction::Sub:
+ GV.FloatVal = LHS.FloatVal - RHS.FloatVal; break;
+ case Instruction::Mul:
+ GV.FloatVal = LHS.FloatVal * RHS.FloatVal; break;
+ case Instruction::FDiv:
+ GV.FloatVal = LHS.FloatVal / RHS.FloatVal; break;
+ case Instruction::FRem:
+ GV.FloatVal = ::fmodf(LHS.FloatVal,RHS.FloatVal); break;
+ }
+ break;
+ case Type::DoubleTyID:
+ switch (CE->getOpcode()) {
+ default: assert(0 && "Invalid double opcode"); abort();
+ case Instruction::Add:
+ GV.DoubleVal = LHS.DoubleVal + RHS.DoubleVal; break;
+ case Instruction::Sub:
+ GV.DoubleVal = LHS.DoubleVal - RHS.DoubleVal; break;
+ case Instruction::Mul:
+ GV.DoubleVal = LHS.DoubleVal * RHS.DoubleVal; break;
+ case Instruction::FDiv:
+ GV.DoubleVal = LHS.DoubleVal / RHS.DoubleVal; break;
+ case Instruction::FRem:
+ GV.DoubleVal = ::fmod(LHS.DoubleVal,RHS.DoubleVal); break;
+ }
+ break;
+ case Type::X86_FP80TyID:
+ case Type::PPC_FP128TyID:
+ case Type::FP128TyID: {
+ APFloat apfLHS = APFloat(LHS.IntVal);
+ switch (CE->getOpcode()) {
+ default: assert(0 && "Invalid long double opcode"); abort();
+ case Instruction::Add:
+ apfLHS.add(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven);
+ GV.IntVal = apfLHS.bitcastToAPInt();
+ break;
+ case Instruction::Sub:
+ apfLHS.subtract(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven);
+ GV.IntVal = apfLHS.bitcastToAPInt();
+ break;
+ case Instruction::Mul:
+ apfLHS.multiply(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven);
+ GV.IntVal = apfLHS.bitcastToAPInt();
+ break;
+ case Instruction::FDiv:
+ apfLHS.divide(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven);
+ GV.IntVal = apfLHS.bitcastToAPInt();
+ break;
+ case Instruction::FRem:
+ apfLHS.mod(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven);
+ GV.IntVal = apfLHS.bitcastToAPInt();
+ break;
+ }
+ }
+ break;
+ }
+ return GV;
+ }
+ default:
+ break;
+ }
+ cerr << "ConstantExpr not handled: " << *CE << "\n";
+ abort();
+ }
+
+ GenericValue Result;
+ switch (C->getType()->getTypeID()) {
+ case Type::FloatTyID:
+ Result.FloatVal = cast<ConstantFP>(C)->getValueAPF().convertToFloat();
+ break;
+ case Type::DoubleTyID:
+ Result.DoubleVal = cast<ConstantFP>(C)->getValueAPF().convertToDouble();
+ break;
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ Result.IntVal = cast <ConstantFP>(C)->getValueAPF().bitcastToAPInt();
+ break;
+ case Type::IntegerTyID:
+ Result.IntVal = cast<ConstantInt>(C)->getValue();
+ break;
+ case Type::PointerTyID:
+ if (isa<ConstantPointerNull>(C))
+ Result.PointerVal = 0;
+ else if (const Function *F = dyn_cast<Function>(C))
+ Result = PTOGV(getPointerToFunctionOrStub(const_cast<Function*>(F)));
+ else if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(C))
+ Result = PTOGV(getOrEmitGlobalVariable(const_cast<GlobalVariable*>(GV)));
+ else
+ assert(0 && "Unknown constant pointer type!");
+ break;
+ default:
+ cerr << "ERROR: Constant unimplemented for type: " << *C->getType() << "\n";
+ abort();
+ }
+ return Result;
+}
+
+/// StoreIntToMemory - Fills the StoreBytes bytes of memory starting from Dst
+/// with the integer held in IntVal.
+static void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst,
+ unsigned StoreBytes) {
+ assert((IntVal.getBitWidth()+7)/8 >= StoreBytes && "Integer too small!");
+ uint8_t *Src = (uint8_t *)IntVal.getRawData();
+
+ if (sys::isLittleEndianHost())
+ // Little-endian host - the source is ordered from LSB to MSB. Order the
+ // destination from LSB to MSB: Do a straight copy.
+ memcpy(Dst, Src, StoreBytes);
+ else {
+ // Big-endian host - the source is an array of 64 bit words ordered from
+ // LSW to MSW. Each word is ordered from MSB to LSB. Order the destination
+ // from MSB to LSB: Reverse the word order, but not the bytes in a word.
+ while (StoreBytes > sizeof(uint64_t)) {
+ StoreBytes -= sizeof(uint64_t);
+ // May not be aligned so use memcpy.
+ memcpy(Dst + StoreBytes, Src, sizeof(uint64_t));
+ Src += sizeof(uint64_t);
+ }
+
+ memcpy(Dst, Src + sizeof(uint64_t) - StoreBytes, StoreBytes);
+ }
+}
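+
+// Illustrative only: the big-endian branch above, specialized to a 10-byte
+// store from two host words (e.g. an x86 long double image) so the word
+// reversal is easier to follow; assumes a big-endian host.
+#if 0
+#include <cstdint>
+#include <cstring>
+static void storeBE10(const uint64_t Words[2], uint8_t *Dst) {
+  unsigned StoreBytes = 10;
+  const uint8_t *Src = (const uint8_t *)Words;
+  StoreBytes -= sizeof(uint64_t);                       // 10 -> 2
+  std::memcpy(Dst + StoreBytes, Src, sizeof(uint64_t)); // low word at Dst+2
+  Src += sizeof(uint64_t);
+  std::memcpy(Dst, Src + sizeof(uint64_t) - StoreBytes, StoreBytes);
+  // The high word's two significant bytes land at Dst[0..1], MSB first.
+}
+#endif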
+
+/// StoreValueToMemory - Stores the data in Val of type Ty at address Ptr. Ptr
+/// is the address of the memory at which to store Val, cast to GenericValue *.
+/// It is not a pointer to a GenericValue containing the address at which to
+/// store Val.
+void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
+ GenericValue *Ptr, const Type *Ty) {
+ const unsigned StoreBytes = getTargetData()->getTypeStoreSize(Ty);
+
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID:
+ StoreIntToMemory(Val.IntVal, (uint8_t*)Ptr, StoreBytes);
+ break;
+ case Type::FloatTyID:
+ *((float*)Ptr) = Val.FloatVal;
+ break;
+ case Type::DoubleTyID:
+ *((double*)Ptr) = Val.DoubleVal;
+ break;
+ case Type::X86_FP80TyID:
+ memcpy(Ptr, Val.IntVal.getRawData(), 10);
+ break;
+ case Type::PointerTyID:
+ // Ensure 64 bit target pointers are fully initialized on 32 bit hosts.
+ if (StoreBytes != sizeof(PointerTy))
+ memset(Ptr, 0, StoreBytes);
+
+ *((PointerTy*)Ptr) = Val.PointerVal;
+ break;
+ default:
+ cerr << "Cannot store value of type " << *Ty << "!\n";
+ }
+
+ if (sys::isLittleEndianHost() != getTargetData()->isLittleEndian())
+ // Host and target are different endian - reverse the stored bytes.
+ std::reverse((uint8_t*)Ptr, StoreBytes + (uint8_t*)Ptr);
+}
+
+/// LoadIntFromMemory - Loads the integer stored in the LoadBytes bytes starting
+/// from Src into IntVal, which is assumed to be wide enough and to hold zero.
+static void LoadIntFromMemory(APInt &IntVal, uint8_t *Src, unsigned LoadBytes) {
+ assert((IntVal.getBitWidth()+7)/8 >= LoadBytes && "Integer too small!");
+ uint8_t *Dst = (uint8_t *)IntVal.getRawData();
+
+ if (sys::isLittleEndianHost())
+ // Little-endian host - the destination must be ordered from LSB to MSB.
+ // The source is ordered from LSB to MSB: Do a straight copy.
+ memcpy(Dst, Src, LoadBytes);
+ else {
+ // Big-endian - the destination is an array of 64 bit words ordered from
+ // LSW to MSW. Each word must be ordered from MSB to LSB. The source is
+ // ordered from MSB to LSB: Reverse the word order, but not the bytes in
+ // a word.
+ while (LoadBytes > sizeof(uint64_t)) {
+ LoadBytes -= sizeof(uint64_t);
+ // May not be aligned so use memcpy.
+ memcpy(Dst, Src + LoadBytes, sizeof(uint64_t));
+ Dst += sizeof(uint64_t);
+ }
+
+ memcpy(Dst + sizeof(uint64_t) - LoadBytes, Src, LoadBytes);
+ }
+}
+
+/// LoadValueFromMemory - Loads a value of type Ty from the memory at Ptr
+/// into Result, byte-reversing first if host and target endianness differ.
+///
+void ExecutionEngine::LoadValueFromMemory(GenericValue &Result,
+ GenericValue *Ptr,
+ const Type *Ty) {
+ const unsigned LoadBytes = getTargetData()->getTypeStoreSize(Ty);
+
+ if (sys::isLittleEndianHost() != getTargetData()->isLittleEndian()) {
+ // Host and target are different endian - reverse copy the stored
+ // bytes into a buffer, and load from that.
+ uint8_t *Src = (uint8_t*)Ptr;
+ uint8_t *Buf = (uint8_t*)alloca(LoadBytes);
+ std::reverse_copy(Src, Src + LoadBytes, Buf);
+ Ptr = (GenericValue*)Buf;
+ }
+
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID:
+ // An APInt with all words initially zero.
+ Result.IntVal = APInt(cast<IntegerType>(Ty)->getBitWidth(), 0);
+ LoadIntFromMemory(Result.IntVal, (uint8_t*)Ptr, LoadBytes);
+ break;
+ case Type::FloatTyID:
+ Result.FloatVal = *((float*)Ptr);
+ break;
+ case Type::DoubleTyID:
+ Result.DoubleVal = *((double*)Ptr);
+ break;
+ case Type::PointerTyID:
+ Result.PointerVal = *((PointerTy*)Ptr);
+ break;
+ case Type::X86_FP80TyID: {
+ // This is endian dependent, but it will only work on x86 anyway.
+ // FIXME: Will not trap if loading a signaling NaN.
+ uint64_t y[2];
+ memcpy(y, Ptr, 10);
+ Result.IntVal = APInt(80, 2, y);
+ break;
+ }
+ default:
+ cerr << "Cannot load value of type " << *Ty << "!\n";
+ abort();
+ }
+}
+
+// InitializeMemory - Recursive function to store a Constant value into the
+// specified memory location...
+//
+void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
+ DOUT << "JIT: Initializing " << Addr << " ";
+ DEBUG(Init->dump());
+ if (isa<UndefValue>(Init)) {
+ return;
+ } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(Init)) {
+ unsigned ElementSize =
+ getTargetData()->getTypeAllocSize(CP->getType()->getElementType());
+ for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
+ InitializeMemory(CP->getOperand(i), (char*)Addr+i*ElementSize);
+ return;
+ } else if (isa<ConstantAggregateZero>(Init)) {
+ memset(Addr, 0, (size_t)getTargetData()->getTypeAllocSize(Init->getType()));
+ return;
+ } else if (const ConstantArray *CPA = dyn_cast<ConstantArray>(Init)) {
+ unsigned ElementSize =
+ getTargetData()->getTypeAllocSize(CPA->getType()->getElementType());
+ for (unsigned i = 0, e = CPA->getNumOperands(); i != e; ++i)
+ InitializeMemory(CPA->getOperand(i), (char*)Addr+i*ElementSize);
+ return;
+ } else if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(Init)) {
+ const StructLayout *SL =
+ getTargetData()->getStructLayout(cast<StructType>(CPS->getType()));
+ for (unsigned i = 0, e = CPS->getNumOperands(); i != e; ++i)
+ InitializeMemory(CPS->getOperand(i), (char*)Addr+SL->getElementOffset(i));
+ return;
+ } else if (Init->getType()->isFirstClassType()) {
+ GenericValue Val = getConstantValue(Init);
+ StoreValueToMemory(Val, (GenericValue*)Addr, Init->getType());
+ return;
+ }
+
+ cerr << "Bad Type: " << *Init->getType() << "\n";
+ assert(0 && "Unknown constant type to initialize memory with!");
+}
+
+/// emitGlobals - Emit all of the global variables to memory, storing their
+/// addresses into GlobalAddress. This must make sure to copy the contents of
+/// their initializers into the memory.
+///
+void ExecutionEngine::emitGlobals() {
+
+ // Loop over all of the global variables in the program, allocating the memory
+ // to hold them. If there is more than one module, do a prepass over globals
+ // to figure out how the different modules should link together.
+ //
+ std::map<std::pair<std::string, const Type*>,
+ const GlobalValue*> LinkedGlobalsMap;
+
+ if (Modules.size() != 1) {
+ for (unsigned m = 0, e = Modules.size(); m != e; ++m) {
+ Module &M = *Modules[m]->getModule();
+ for (Module::const_global_iterator I = M.global_begin(),
+ E = M.global_end(); I != E; ++I) {
+ const GlobalValue *GV = I;
+ if (GV->hasLocalLinkage() || GV->isDeclaration() ||
+ GV->hasAppendingLinkage() || !GV->hasName())
+ continue;// Ignore external globals and globals with internal linkage.
+
+ const GlobalValue *&GVEntry =
+ LinkedGlobalsMap[std::make_pair(GV->getName(), GV->getType())];
+
+ // If this is the first time we've seen this global, it is the canonical
+ // version.
+ if (!GVEntry) {
+ GVEntry = GV;
+ continue;
+ }
+
+ // If the existing global is strong, never replace it.
+ if (GVEntry->hasExternalLinkage() ||
+ GVEntry->hasDLLImportLinkage() ||
+ GVEntry->hasDLLExportLinkage())
+ continue;
+
+ // Otherwise, we know it's linkonce/weak, replace it if this is a strong
+ // symbol. FIXME is this right for common?
+ if (GV->hasExternalLinkage() || GVEntry->hasExternalWeakLinkage())
+ GVEntry = GV;
+ }
+ }
+ }
+
+ std::vector<const GlobalValue*> NonCanonicalGlobals;
+ for (unsigned m = 0, e = Modules.size(); m != e; ++m) {
+ Module &M = *Modules[m]->getModule();
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ // In the multi-module case, see what this global maps to.
+ if (!LinkedGlobalsMap.empty()) {
+ if (const GlobalValue *GVEntry =
+ LinkedGlobalsMap[std::make_pair(I->getName(), I->getType())]) {
+ // If something else is the canonical global, ignore this one.
+ if (GVEntry != &*I) {
+ NonCanonicalGlobals.push_back(I);
+ continue;
+ }
+ }
+ }
+
+ if (!I->isDeclaration()) {
+ addGlobalMapping(I, getMemoryForGV(I));
+ } else {
+ // External variable reference. Try to use the dynamic loader to
+ // get a pointer to it.
+ if (void *SymAddr =
+ sys::DynamicLibrary::SearchForAddressOfSymbol(I->getName().c_str()))
+ addGlobalMapping(I, SymAddr);
+ else {
+ cerr << "Could not resolve external global address: "
+ << I->getName() << "\n";
+ abort();
+ }
+ }
+ }
+
+ // If there are multiple modules, map the non-canonical globals to their
+ // canonical location.
+ if (!NonCanonicalGlobals.empty()) {
+ for (unsigned i = 0, e = NonCanonicalGlobals.size(); i != e; ++i) {
+ const GlobalValue *GV = NonCanonicalGlobals[i];
+ const GlobalValue *CGV =
+ LinkedGlobalsMap[std::make_pair(GV->getName(), GV->getType())];
+ void *Ptr = getPointerToGlobalIfAvailable(CGV);
+ assert(Ptr && "Canonical global wasn't codegen'd!");
+ addGlobalMapping(GV, Ptr);
+ }
+ }
+
+ // Now that all of the globals are set up in memory, loop through them all
+ // and initialize their contents.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (!I->isDeclaration()) {
+ if (!LinkedGlobalsMap.empty()) {
+ if (const GlobalValue *GVEntry =
+ LinkedGlobalsMap[std::make_pair(I->getName(), I->getType())])
+ if (GVEntry != &*I) // Not the canonical variable.
+ continue;
+ }
+ EmitGlobalVariable(I);
+ }
+ }
+ }
+}
+
+// EmitGlobalVariable - This method emits the specified global variable to the
+// address specified in GlobalAddresses, or allocates new memory if it's not
+// already in the map.
+void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
+ void *GA = getPointerToGlobalIfAvailable(GV);
+
+ if (GA == 0) {
+ // If it's not already specified, allocate memory for the global.
+ GA = getMemoryForGV(GV);
+ addGlobalMapping(GV, GA);
+ }
+
+ // Don't initialize if it's thread local, let the client do it.
+ if (!GV->isThreadLocal())
+ InitializeMemory(GV->getInitializer(), GA);
+
+ const Type *ElTy = GV->getType()->getElementType();
+ size_t GVSize = (size_t)getTargetData()->getTypeAllocSize(ElTy);
+ NumInitBytes += (unsigned)GVSize;
+ ++NumGlobals;
+}
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
new file mode 100644
index 0000000..83397a58
--- /dev/null
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -0,0 +1,206 @@
+//===-- ExecutionEngineBindings.cpp - C bindings for EEs ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the C bindings for the ExecutionEngine library.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "llvm-c/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include <cstring>
+
+using namespace llvm;
+
+/*===-- Operations on generic values --------------------------------------===*/
+
+LLVMGenericValueRef LLVMCreateGenericValueOfInt(LLVMTypeRef Ty,
+ unsigned long long N,
+ int IsSigned) {
+ GenericValue *GenVal = new GenericValue();
+ GenVal->IntVal = APInt(unwrap<IntegerType>(Ty)->getBitWidth(), N, IsSigned);
+ return wrap(GenVal);
+}
+
+LLVMGenericValueRef LLVMCreateGenericValueOfPointer(void *P) {
+ GenericValue *GenVal = new GenericValue();
+ GenVal->PointerVal = P;
+ return wrap(GenVal);
+}
+
+LLVMGenericValueRef LLVMCreateGenericValueOfFloat(LLVMTypeRef TyRef, double N) {
+ GenericValue *GenVal = new GenericValue();
+ switch (unwrap(TyRef)->getTypeID()) {
+ case Type::FloatTyID:
+ GenVal->FloatVal = N;
+ break;
+ case Type::DoubleTyID:
+ GenVal->DoubleVal = N;
+ break;
+ default:
+ assert(0 && "LLVMGenericValueToFloat supports only float and double.");
+ break;
+ }
+ return wrap(GenVal);
+}
+
+unsigned LLVMGenericValueIntWidth(LLVMGenericValueRef GenValRef) {
+ return unwrap(GenValRef)->IntVal.getBitWidth();
+}
+
+unsigned long long LLVMGenericValueToInt(LLVMGenericValueRef GenValRef,
+ int IsSigned) {
+ GenericValue *GenVal = unwrap(GenValRef);
+ if (IsSigned)
+ return GenVal->IntVal.getSExtValue();
+ else
+ return GenVal->IntVal.getZExtValue();
+}
+
+void *LLVMGenericValueToPointer(LLVMGenericValueRef GenVal) {
+ return unwrap(GenVal)->PointerVal;
+}
+
+double LLVMGenericValueToFloat(LLVMTypeRef TyRef, LLVMGenericValueRef GenVal) {
+ switch (unwrap(TyRef)->getTypeID()) {
+ case Type::FloatTyID:
+ return unwrap(GenVal)->FloatVal;
+ case Type::DoubleTyID:
+ return unwrap(GenVal)->DoubleVal;
+ default:
+ assert(0 && "LLVMGenericValueToFloat supports only float and double.");
+ break;
+ }
+ return 0; // Not reached
+}
+
+void LLVMDisposeGenericValue(LLVMGenericValueRef GenVal) {
+ delete unwrap(GenVal);
+}
+
+/*===-- Operations on execution engines -----------------------------------===*/
+
+int LLVMCreateExecutionEngine(LLVMExecutionEngineRef *OutEE,
+ LLVMModuleProviderRef MP,
+ char **OutError) {
+ std::string Error;
+ if (ExecutionEngine *EE = ExecutionEngine::create(unwrap(MP), false, &Error)){
+ *OutEE = wrap(EE);
+ return 0;
+ }
+ *OutError = strdup(Error.c_str());
+ return 1;
+}
+
+int LLVMCreateInterpreter(LLVMExecutionEngineRef *OutInterp,
+ LLVMModuleProviderRef MP,
+ char **OutError) {
+ std::string Error;
+ if (ExecutionEngine *Interp =
+ ExecutionEngine::create(unwrap(MP), true, &Error)) {
+ *OutInterp = wrap(Interp);
+ return 0;
+ }
+ *OutError = strdup(Error.c_str());
+ return 1;
+}
+
+int LLVMCreateJITCompiler(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleProviderRef MP,
+ unsigned OptLevel,
+ char **OutError) {
+ std::string Error;
+ if (ExecutionEngine *JIT =
+ ExecutionEngine::createJIT(unwrap(MP), &Error, 0,
+ (CodeGenOpt::Level)OptLevel)) {
+ *OutJIT = wrap(JIT);
+ return 0;
+ }
+ *OutError = strdup(Error.c_str());
+ return 1;
+}
+
+void LLVMDisposeExecutionEngine(LLVMExecutionEngineRef EE) {
+ delete unwrap(EE);
+}
+
+void LLVMRunStaticConstructors(LLVMExecutionEngineRef EE) {
+ unwrap(EE)->runStaticConstructorsDestructors(false);
+}
+
+void LLVMRunStaticDestructors(LLVMExecutionEngineRef EE) {
+ unwrap(EE)->runStaticConstructorsDestructors(true);
+}
+
+int LLVMRunFunctionAsMain(LLVMExecutionEngineRef EE, LLVMValueRef F,
+ unsigned ArgC, const char * const *ArgV,
+ const char * const *EnvP) {
+ std::vector<std::string> ArgVec;
+ for (unsigned I = 0; I != ArgC; ++I)
+ ArgVec.push_back(ArgV[I]);
+
+ return unwrap(EE)->runFunctionAsMain(unwrap<Function>(F), ArgVec, EnvP);
+}
+
+LLVMGenericValueRef LLVMRunFunction(LLVMExecutionEngineRef EE, LLVMValueRef F,
+ unsigned NumArgs,
+ LLVMGenericValueRef *Args) {
+ std::vector<GenericValue> ArgVec;
+ ArgVec.reserve(NumArgs);
+ for (unsigned I = 0; I != NumArgs; ++I)
+ ArgVec.push_back(*unwrap(Args[I]));
+
+ GenericValue *Result = new GenericValue();
+ *Result = unwrap(EE)->runFunction(unwrap<Function>(F), ArgVec);
+ return wrap(Result);
+}
+
+void LLVMFreeMachineCodeForFunction(LLVMExecutionEngineRef EE, LLVMValueRef F) {
+ unwrap(EE)->freeMachineCodeForFunction(unwrap<Function>(F));
+}
+
+void LLVMAddModuleProvider(LLVMExecutionEngineRef EE, LLVMModuleProviderRef MP){
+ unwrap(EE)->addModuleProvider(unwrap(MP));
+}
+
+int LLVMRemoveModuleProvider(LLVMExecutionEngineRef EE,
+ LLVMModuleProviderRef MP,
+ LLVMModuleRef *OutMod, char **OutError) {
+ std::string Error;
+ if (Module *Gone = unwrap(EE)->removeModuleProvider(unwrap(MP), &Error)) {
+ *OutMod = wrap(Gone);
+ return 0;
+ }
+ if (OutError)
+ *OutError = strdup(Error.c_str());
+ return 1;
+}
+
+int LLVMFindFunction(LLVMExecutionEngineRef EE, const char *Name,
+ LLVMValueRef *OutFn) {
+ if (Function *F = unwrap(EE)->FindFunctionNamed(Name)) {
+ *OutFn = wrap(F);
+ return 0;
+ }
+ return 1;
+}
+
+LLVMTargetDataRef LLVMGetExecutionEngineTargetData(LLVMExecutionEngineRef EE) {
+ return wrap(unwrap(EE)->getTargetData());
+}
+
+void LLVMAddGlobalMapping(LLVMExecutionEngineRef EE, LLVMValueRef Global,
+ void* Addr) {
+ unwrap(EE)->addGlobalMapping(unwrap<GlobalValue>(Global), Addr);
+}
+
+void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global) {
+ return unwrap(EE)->getPointerToGlobal(unwrap<GlobalValue>(Global));
+}
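+
+// Illustrative only: a sketch of driving these bindings from C++. It assumes
+// MP wraps a module defining `int add(int, int)` (the name is hypothetical)
+// and that LLVMInt32Type() from llvm-c/Core.h is available.
+#if 0
+#include "llvm-c/Core.h"
+#include <cstdlib>
+static unsigned long long runAdd(LLVMModuleProviderRef MP) {
+  LLVMExecutionEngineRef EE;
+  char *Err = 0;
+  if (LLVMCreateExecutionEngine(&EE, MP, &Err)) {
+    free(Err);   // creation failed; Err held the reason
+    return 0;
+  }
+  LLVMValueRef Fn;
+  if (LLVMFindFunction(EE, "add", &Fn))
+    return 0;    // no such function in any registered module
+
+  LLVMGenericValueRef Args[2] = {
+    LLVMCreateGenericValueOfInt(LLVMInt32Type(), 2, 0),
+    LLVMCreateGenericValueOfInt(LLVMInt32Type(), 3, 0)
+  };
+  LLVMGenericValueRef Res = LLVMRunFunction(EE, Fn, 2, Args);
+  unsigned long long V = LLVMGenericValueToInt(Res, 0); // 5
+  LLVMDisposeGenericValue(Res);
+  LLVMDisposeGenericValue(Args[0]);
+  LLVMDisposeGenericValue(Args[1]);
+  return V;
+}
+#endif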
diff --git a/lib/ExecutionEngine/Interpreter/CMakeLists.txt b/lib/ExecutionEngine/Interpreter/CMakeLists.txt
new file mode 100644
index 0000000..626e804
--- /dev/null
+++ b/lib/ExecutionEngine/Interpreter/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_partially_linked_object(LLVMInterpreter
+ Execution.cpp
+ ExternalFunctions.cpp
+ Interpreter.cpp
+ )
diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp
new file mode 100644
index 0000000..765fed2
--- /dev/null
+++ b/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -0,0 +1,1382 @@
+//===-- Execution.cpp - Implement code to simulate the program ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the actual instruction interpreter.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "interpreter"
+#include "Interpreter.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+using namespace llvm;
+
+STATISTIC(NumDynamicInsts, "Number of dynamic instructions executed");
+static Interpreter *TheEE = 0;
+
+static cl::opt<bool> PrintVolatile("interpreter-print-volatile", cl::Hidden,
+ cl::desc("make the interpreter print every volatile load and store"));
+
+//===----------------------------------------------------------------------===//
+// Various Helper Functions
+//===----------------------------------------------------------------------===//
+
+static inline uint64_t doSignExtension(uint64_t Val, const IntegerType* ITy) {
+ // Determine if the value is signed or not
+ bool isSigned = (Val & (1 << (ITy->getBitWidth()-1))) != 0;
+ // If its signed, extend the sign bits
+ if (isSigned)
+ Val |= ~ITy->getBitMask();
+ return Val;
+}
+
+static void SetValue(Value *V, GenericValue Val, ExecutionContext &SF) {
+ SF.Values[V] = Val;
+}
+
+void Interpreter::initializeExecutionEngine() {
+ TheEE = this;
+}
+
+//===----------------------------------------------------------------------===//
+// Binary Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+#define IMPLEMENT_BINARY_OPERATOR(OP, TY) \
+ case Type::TY##TyID: \
+ Dest.TY##Val = Src1.TY##Val OP Src2.TY##Val; \
+ break
+
+#define IMPLEMENT_INTEGER_BINOP1(OP, TY) \
+ case Type::IntegerTyID: { \
+ Dest.IntVal = Src1.IntVal OP Src2.IntVal; \
+ break; \
+ }
+
+
+static void executeAddInst(GenericValue &Dest, GenericValue Src1,
+ GenericValue Src2, const Type *Ty) {
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_BINOP1(+, Ty);
+ IMPLEMENT_BINARY_OPERATOR(+, Float);
+ IMPLEMENT_BINARY_OPERATOR(+, Double);
+ default:
+ cerr << "Unhandled type for Add instruction: " << *Ty << "\n";
+ abort();
+ }
+}
+
+static void executeSubInst(GenericValue &Dest, GenericValue Src1,
+ GenericValue Src2, const Type *Ty) {
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_BINOP1(-, Ty);
+ IMPLEMENT_BINARY_OPERATOR(-, Float);
+ IMPLEMENT_BINARY_OPERATOR(-, Double);
+ default:
+ cerr << "Unhandled type for Sub instruction: " << *Ty << "\n";
+ abort();
+ }
+}
+
+static void executeMulInst(GenericValue &Dest, GenericValue Src1,
+ GenericValue Src2, const Type *Ty) {
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_BINOP1(*, Ty);
+ IMPLEMENT_BINARY_OPERATOR(*, Float);
+ IMPLEMENT_BINARY_OPERATOR(*, Double);
+ default:
+ cerr << "Unhandled type for Mul instruction: " << *Ty << "\n";
+ abort();
+ }
+}
+
+static void executeFDivInst(GenericValue &Dest, GenericValue Src1,
+ GenericValue Src2, const Type *Ty) {
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_BINARY_OPERATOR(/, Float);
+ IMPLEMENT_BINARY_OPERATOR(/, Double);
+ default:
+ cerr << "Unhandled type for FDiv instruction: " << *Ty << "\n";
+ abort();
+ }
+}
+
+static void executeFRemInst(GenericValue &Dest, GenericValue Src1,
+ GenericValue Src2, const Type *Ty) {
+ switch (Ty->getTypeID()) {
+ case Type::FloatTyID:
+    Dest.FloatVal = fmodf(Src1.FloatVal, Src2.FloatVal);
+ break;
+ case Type::DoubleTyID:
+ Dest.DoubleVal = fmod(Src1.DoubleVal, Src2.DoubleVal);
+ break;
+ default:
+ cerr << "Unhandled type for Rem instruction: " << *Ty << "\n";
+ abort();
+ }
+}
+
+#define IMPLEMENT_INTEGER_ICMP(OP, TY) \
+ case Type::IntegerTyID: \
+ Dest.IntVal = APInt(1,Src1.IntVal.OP(Src2.IntVal)); \
+ break;
+
+// Handle pointers specially because they must be compared with only as much
+// width as the host has. We _do not_ want to be comparing 64 bit values when
+// running on a 32-bit host, otherwise the upper 32 bits might mess up
+// comparisons if they contain garbage.
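+// For example, on a 32-bit host only the low 32 bits of PointerVal are
+// meaningful, so both operands are narrowed through intptr_t before the
+// comparison instead of being compared as full 64-bit values.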
+#define IMPLEMENT_POINTER_ICMP(OP) \
+ case Type::PointerTyID: \
+ Dest.IntVal = APInt(1,(void*)(intptr_t)Src1.PointerVal OP \
+ (void*)(intptr_t)Src2.PointerVal); \
+ break;
+
+static GenericValue executeICMP_EQ(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(eq,Ty);
+ IMPLEMENT_POINTER_ICMP(==);
+ default:
+ cerr << "Unhandled type for ICMP_EQ predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_NE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(ne,Ty);
+ IMPLEMENT_POINTER_ICMP(!=);
+ default:
+ cerr << "Unhandled type for ICMP_NE predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_ULT(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(ult,Ty);
+ IMPLEMENT_POINTER_ICMP(<);
+ default:
+ cerr << "Unhandled type for ICMP_ULT predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_SLT(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(slt,Ty);
+ IMPLEMENT_POINTER_ICMP(<);
+ default:
+ cerr << "Unhandled type for ICMP_SLT predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_UGT(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(ugt,Ty);
+ IMPLEMENT_POINTER_ICMP(>);
+ default:
+ cerr << "Unhandled type for ICMP_UGT predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_SGT(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(sgt,Ty);
+ IMPLEMENT_POINTER_ICMP(>);
+ default:
+ cerr << "Unhandled type for ICMP_SGT predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_ULE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(ule,Ty);
+ IMPLEMENT_POINTER_ICMP(<=);
+ default:
+ cerr << "Unhandled type for ICMP_ULE predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_SLE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(sle,Ty);
+ IMPLEMENT_POINTER_ICMP(<=);
+ default:
+ cerr << "Unhandled type for ICMP_SLE predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_UGE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(uge,Ty);
+ IMPLEMENT_POINTER_ICMP(>=);
+ default:
+ cerr << "Unhandled type for ICMP_UGE predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeICMP_SGE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_INTEGER_ICMP(sge,Ty);
+ IMPLEMENT_POINTER_ICMP(>=);
+ default:
+ cerr << "Unhandled type for ICMP_SGE predicate: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+void Interpreter::visitICmpInst(ICmpInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ const Type *Ty = I.getOperand(0)->getType();
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue R; // Result
+
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ: R = executeICMP_EQ(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_NE: R = executeICMP_NE(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_ULT: R = executeICMP_ULT(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_SLT: R = executeICMP_SLT(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_UGT: R = executeICMP_UGT(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_SGT: R = executeICMP_SGT(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_ULE: R = executeICMP_ULE(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_SLE: R = executeICMP_SLE(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_UGE: R = executeICMP_UGE(Src1, Src2, Ty); break;
+ case ICmpInst::ICMP_SGE: R = executeICMP_SGE(Src1, Src2, Ty); break;
+ default:
+ cerr << "Don't know how to handle this ICmp predicate!\n-->" << I;
+ abort();
+ }
+
+ SetValue(&I, R, SF);
+}
+
+#define IMPLEMENT_FCMP(OP, TY) \
+ case Type::TY##TyID: \
+ Dest.IntVal = APInt(1,Src1.TY##Val OP Src2.TY##Val); \
+ break
+
+static GenericValue executeFCMP_OEQ(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_FCMP(==, Float);
+ IMPLEMENT_FCMP(==, Double);
+ default:
+ cerr << "Unhandled type for FCmp EQ instruction: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeFCMP_ONE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_FCMP(!=, Float);
+ IMPLEMENT_FCMP(!=, Double);
+
+ default:
+ cerr << "Unhandled type for FCmp NE instruction: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeFCMP_OLE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_FCMP(<=, Float);
+ IMPLEMENT_FCMP(<=, Double);
+ default:
+ cerr << "Unhandled type for FCmp LE instruction: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeFCMP_OGE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_FCMP(>=, Float);
+ IMPLEMENT_FCMP(>=, Double);
+ default:
+ cerr << "Unhandled type for FCmp GE instruction: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeFCMP_OLT(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_FCMP(<, Float);
+ IMPLEMENT_FCMP(<, Double);
+ default:
+ cerr << "Unhandled type for FCmp LT instruction: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+static GenericValue executeFCMP_OGT(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ switch (Ty->getTypeID()) {
+ IMPLEMENT_FCMP(>, Float);
+ IMPLEMENT_FCMP(>, Double);
+ default:
+ cerr << "Unhandled type for FCmp GT instruction: " << *Ty << "\n";
+ abort();
+ }
+ return Dest;
+}
+
+#define IMPLEMENT_UNORDERED(TY, X,Y) \
+ if (TY == Type::FloatTy) { \
+ if (X.FloatVal != X.FloatVal || Y.FloatVal != Y.FloatVal) { \
+ Dest.IntVal = APInt(1,true); \
+ return Dest; \
+ } \
+ } else if (X.DoubleVal != X.DoubleVal || Y.DoubleVal != Y.DoubleVal) { \
+ Dest.IntVal = APInt(1,true); \
+ return Dest; \
+ }
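+// (X != X is true only when X is a NaN, so the macro above makes any
+// unordered comparison yield true as soon as either operand is NaN.)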
+
+
+static GenericValue executeFCMP_UEQ(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+ return executeFCMP_OEQ(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_UNE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+ return executeFCMP_ONE(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_ULE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+ return executeFCMP_OLE(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_UGE(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+ return executeFCMP_OGE(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_ULT(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+ return executeFCMP_OLT(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_UGT(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+ return executeFCMP_OGT(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_ORD(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ if (Ty == Type::FloatTy)
+ Dest.IntVal = APInt(1,(Src1.FloatVal == Src1.FloatVal &&
+ Src2.FloatVal == Src2.FloatVal));
+ else
+ Dest.IntVal = APInt(1,(Src1.DoubleVal == Src1.DoubleVal &&
+ Src2.DoubleVal == Src2.DoubleVal));
+ return Dest;
+}
+
+static GenericValue executeFCMP_UNO(GenericValue Src1, GenericValue Src2,
+ const Type *Ty) {
+ GenericValue Dest;
+ if (Ty == Type::FloatTy)
+ Dest.IntVal = APInt(1,(Src1.FloatVal != Src1.FloatVal ||
+ Src2.FloatVal != Src2.FloatVal));
+ else
+ Dest.IntVal = APInt(1,(Src1.DoubleVal != Src1.DoubleVal ||
+ Src2.DoubleVal != Src2.DoubleVal));
+ return Dest;
+}
+
+void Interpreter::visitFCmpInst(FCmpInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ const Type *Ty = I.getOperand(0)->getType();
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue R; // Result
+
+ switch (I.getPredicate()) {
+ case FCmpInst::FCMP_FALSE: R.IntVal = APInt(1,false); break;
+ case FCmpInst::FCMP_TRUE: R.IntVal = APInt(1,true); break;
+ case FCmpInst::FCMP_ORD: R = executeFCMP_ORD(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_UNO: R = executeFCMP_UNO(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_UEQ: R = executeFCMP_UEQ(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_OEQ: R = executeFCMP_OEQ(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_UNE: R = executeFCMP_UNE(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_ONE: R = executeFCMP_ONE(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_ULT: R = executeFCMP_ULT(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_OLT: R = executeFCMP_OLT(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_UGT: R = executeFCMP_UGT(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_OGT: R = executeFCMP_OGT(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_ULE: R = executeFCMP_ULE(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_OLE: R = executeFCMP_OLE(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_UGE: R = executeFCMP_UGE(Src1, Src2, Ty); break;
+ case FCmpInst::FCMP_OGE: R = executeFCMP_OGE(Src1, Src2, Ty); break;
+ default:
+ cerr << "Don't know how to handle this FCmp predicate!\n-->" << I;
+ abort();
+ }
+
+ SetValue(&I, R, SF);
+}
+
+static GenericValue executeCmpInst(unsigned predicate, GenericValue Src1,
+ GenericValue Src2, const Type *Ty) {
+ switch (predicate) {
+ case ICmpInst::ICMP_EQ: return executeICMP_EQ(Src1, Src2, Ty);
+ case ICmpInst::ICMP_NE: return executeICMP_NE(Src1, Src2, Ty);
+ case ICmpInst::ICMP_UGT: return executeICMP_UGT(Src1, Src2, Ty);
+ case ICmpInst::ICMP_SGT: return executeICMP_SGT(Src1, Src2, Ty);
+ case ICmpInst::ICMP_ULT: return executeICMP_ULT(Src1, Src2, Ty);
+ case ICmpInst::ICMP_SLT: return executeICMP_SLT(Src1, Src2, Ty);
+ case ICmpInst::ICMP_UGE: return executeICMP_UGE(Src1, Src2, Ty);
+ case ICmpInst::ICMP_SGE: return executeICMP_SGE(Src1, Src2, Ty);
+ case ICmpInst::ICMP_ULE: return executeICMP_ULE(Src1, Src2, Ty);
+ case ICmpInst::ICMP_SLE: return executeICMP_SLE(Src1, Src2, Ty);
+ case FCmpInst::FCMP_ORD: return executeFCMP_ORD(Src1, Src2, Ty);
+ case FCmpInst::FCMP_UNO: return executeFCMP_UNO(Src1, Src2, Ty);
+ case FCmpInst::FCMP_OEQ: return executeFCMP_OEQ(Src1, Src2, Ty);
+ case FCmpInst::FCMP_UEQ: return executeFCMP_UEQ(Src1, Src2, Ty);
+ case FCmpInst::FCMP_ONE: return executeFCMP_ONE(Src1, Src2, Ty);
+ case FCmpInst::FCMP_UNE: return executeFCMP_UNE(Src1, Src2, Ty);
+ case FCmpInst::FCMP_OLT: return executeFCMP_OLT(Src1, Src2, Ty);
+ case FCmpInst::FCMP_ULT: return executeFCMP_ULT(Src1, Src2, Ty);
+ case FCmpInst::FCMP_OGT: return executeFCMP_OGT(Src1, Src2, Ty);
+ case FCmpInst::FCMP_UGT: return executeFCMP_UGT(Src1, Src2, Ty);
+ case FCmpInst::FCMP_OLE: return executeFCMP_OLE(Src1, Src2, Ty);
+ case FCmpInst::FCMP_ULE: return executeFCMP_ULE(Src1, Src2, Ty);
+ case FCmpInst::FCMP_OGE: return executeFCMP_OGE(Src1, Src2, Ty);
+ case FCmpInst::FCMP_UGE: return executeFCMP_UGE(Src1, Src2, Ty);
+ case FCmpInst::FCMP_FALSE: {
+ GenericValue Result;
+ Result.IntVal = APInt(1, false);
+ return Result;
+ }
+ case FCmpInst::FCMP_TRUE: {
+ GenericValue Result;
+ Result.IntVal = APInt(1, true);
+ return Result;
+ }
+ default:
+ cerr << "Unhandled Cmp predicate\n";
+ abort();
+ }
+}
+
+void Interpreter::visitBinaryOperator(BinaryOperator &I) {
+ ExecutionContext &SF = ECStack.back();
+ const Type *Ty = I.getOperand(0)->getType();
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue R; // Result
+
+ switch (I.getOpcode()) {
+ case Instruction::Add: executeAddInst (R, Src1, Src2, Ty); break;
+ case Instruction::Sub: executeSubInst (R, Src1, Src2, Ty); break;
+ case Instruction::Mul: executeMulInst (R, Src1, Src2, Ty); break;
+ case Instruction::FDiv: executeFDivInst (R, Src1, Src2, Ty); break;
+ case Instruction::FRem: executeFRemInst (R, Src1, Src2, Ty); break;
+ case Instruction::UDiv: R.IntVal = Src1.IntVal.udiv(Src2.IntVal); break;
+ case Instruction::SDiv: R.IntVal = Src1.IntVal.sdiv(Src2.IntVal); break;
+ case Instruction::URem: R.IntVal = Src1.IntVal.urem(Src2.IntVal); break;
+ case Instruction::SRem: R.IntVal = Src1.IntVal.srem(Src2.IntVal); break;
+ case Instruction::And: R.IntVal = Src1.IntVal & Src2.IntVal; break;
+ case Instruction::Or: R.IntVal = Src1.IntVal | Src2.IntVal; break;
+ case Instruction::Xor: R.IntVal = Src1.IntVal ^ Src2.IntVal; break;
+ default:
+ cerr << "Don't know how to handle this binary operator!\n-->" << I;
+ abort();
+ }
+
+ SetValue(&I, R, SF);
+}
+
+static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2,
+ GenericValue Src3) {
+ return Src1.IntVal == 0 ? Src3 : Src2;
+}
+
+void Interpreter::visitSelectInst(SelectInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
+ GenericValue R = executeSelectInst(Src1, Src2, Src3);
+ SetValue(&I, R, SF);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Terminator Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+void Interpreter::exitCalled(GenericValue GV) {
+ // runAtExitHandlers() assumes there are no stack frames, but
+ // if exit() was called, then it had a stack frame. Blow away
+ // the stack before interpreting atexit handlers.
+ ECStack.clear ();
+ runAtExitHandlers ();
+ exit (GV.IntVal.zextOrTrunc(32).getZExtValue());
+}
+
+/// Pop the last stack frame off of ECStack and then copy the result
+/// back into the result variable if we are not returning void. The
+/// result variable may be the ExitValue, or the Value of the calling
+/// CallInst if there was a previous stack frame. This method may
+/// invalidate any ECStack iterators you have. This method also takes
+/// care of switching to the normal destination BB, if we are returning
+/// from an invoke.
+///
+void Interpreter::popStackAndReturnValueToCaller (const Type *RetTy,
+ GenericValue Result) {
+ // Pop the current stack frame.
+ ECStack.pop_back();
+
+ if (ECStack.empty()) { // Finished main. Put result into exit code...
+ if (RetTy && RetTy->isInteger()) { // Nonvoid return type?
+ ExitValue = Result; // Capture the exit value of the program
+ } else {
+ memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped));
+ }
+ } else {
+ // If we have a previous stack frame, and we have a previous call,
+ // fill in the return value...
+ ExecutionContext &CallingSF = ECStack.back();
+ if (Instruction *I = CallingSF.Caller.getInstruction()) {
+ if (CallingSF.Caller.getType() != Type::VoidTy) // Save result...
+ SetValue(I, Result, CallingSF);
+ if (InvokeInst *II = dyn_cast<InvokeInst> (I))
+ SwitchToNewBasicBlock (II->getNormalDest (), CallingSF);
+ CallingSF.Caller = CallSite(); // We returned from the call...
+ }
+ }
+}
+
+void Interpreter::visitReturnInst(ReturnInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ const Type *RetTy = Type::VoidTy;
+ GenericValue Result;
+
+ // Save away the return value... (if we are not 'ret void')
+ if (I.getNumOperands()) {
+ RetTy = I.getReturnValue()->getType();
+ Result = getOperandValue(I.getReturnValue(), SF);
+ }
+
+ popStackAndReturnValueToCaller(RetTy, Result);
+}
+
+void Interpreter::visitUnwindInst(UnwindInst &I) {
+ // Unwind stack
+ Instruction *Inst;
+ do {
+ ECStack.pop_back ();
+ if (ECStack.empty ())
+ abort ();
+ Inst = ECStack.back ().Caller.getInstruction ();
+ } while (!(Inst && isa<InvokeInst> (Inst)));
+
+ // Return from invoke
+ ExecutionContext &InvokingSF = ECStack.back ();
+ InvokingSF.Caller = CallSite ();
+
+ // Go to exceptional destination BB of invoke instruction
+ SwitchToNewBasicBlock(cast<InvokeInst>(Inst)->getUnwindDest(), InvokingSF);
+}
+
+void Interpreter::visitUnreachableInst(UnreachableInst &I) {
+ cerr << "ERROR: Program executed an 'unreachable' instruction!\n";
+ abort();
+}
+
+void Interpreter::visitBranchInst(BranchInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ BasicBlock *Dest;
+
+ Dest = I.getSuccessor(0); // Uncond branches have a fixed dest...
+ if (!I.isUnconditional()) {
+ Value *Cond = I.getCondition();
+ if (getOperandValue(Cond, SF).IntVal == 0) // If false cond...
+ Dest = I.getSuccessor(1);
+ }
+ SwitchToNewBasicBlock(Dest, SF);
+}
+
+void Interpreter::visitSwitchInst(SwitchInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ GenericValue CondVal = getOperandValue(I.getOperand(0), SF);
+ const Type *ElTy = I.getOperand(0)->getType();
+
+ // Check to see if any of the cases match...
+ BasicBlock *Dest = 0;
+ for (unsigned i = 2, e = I.getNumOperands(); i != e; i += 2)
+ if (executeICMP_EQ(CondVal, getOperandValue(I.getOperand(i), SF), ElTy)
+ .IntVal != 0) {
+ Dest = cast<BasicBlock>(I.getOperand(i+1));
+ break;
+ }
+
+ if (!Dest) Dest = I.getDefaultDest(); // No cases matched: use default
+ SwitchToNewBasicBlock(Dest, SF);
+}
+
+// SwitchToNewBasicBlock - This method is used to jump to a new basic block.
+// This function handles the actual updating of block and instruction iterators
+// as well as execution of all of the PHI nodes in the destination block.
+//
+// The PHI nodes are handled this way because they must all be executed
+// atomically, reading their inputs before any of the results are updated. Not
+// doing so can cause problems when PHI nodes depend on other PHI nodes for
+// their inputs: if an input PHI node were updated before it is read, incorrect
+// results would follow. Thus we use a two-phase approach.
+//
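+// As a hypothetical illustration (not taken from the surrounding code), both
+// PHI nodes below must read their incoming values before either result is
+// written, or the intended swap of %x and %y would be lost:
+//
+//   loop:
+//     %x = phi i32 [ 0, %entry ], [ %y, %loop ]
+//     %y = phi i32 [ 1, %entry ], [ %x, %loop ]
+//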
+void Interpreter::SwitchToNewBasicBlock(BasicBlock *Dest, ExecutionContext &SF){
+ BasicBlock *PrevBB = SF.CurBB; // Remember where we came from...
+ SF.CurBB = Dest; // Update CurBB to branch destination
+ SF.CurInst = SF.CurBB->begin(); // Update new instruction ptr...
+
+ if (!isa<PHINode>(SF.CurInst)) return; // Nothing fancy to do
+
+ // Loop over all of the PHI nodes in the current block, reading their inputs.
+ std::vector<GenericValue> ResultValues;
+
+ for (; PHINode *PN = dyn_cast<PHINode>(SF.CurInst); ++SF.CurInst) {
+ // Search for the value corresponding to this previous bb...
+ int i = PN->getBasicBlockIndex(PrevBB);
+ assert(i != -1 && "PHINode doesn't contain entry for predecessor??");
+ Value *IncomingValue = PN->getIncomingValue(i);
+
+ // Save the incoming value for this PHI node...
+ ResultValues.push_back(getOperandValue(IncomingValue, SF));
+ }
+
+ // Now loop over all of the PHI nodes setting their values...
+ SF.CurInst = SF.CurBB->begin();
+ for (unsigned i = 0; isa<PHINode>(SF.CurInst); ++SF.CurInst, ++i) {
+ PHINode *PN = cast<PHINode>(SF.CurInst);
+ SetValue(PN, ResultValues[i], SF);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+void Interpreter::visitAllocationInst(AllocationInst &I) {
+ ExecutionContext &SF = ECStack.back();
+
+ const Type *Ty = I.getType()->getElementType(); // Type to be allocated
+
+  // Get the number of array elements being allocated...
+ unsigned NumElements =
+ getOperandValue(I.getOperand(0), SF).IntVal.getZExtValue();
+
+ unsigned TypeSize = (size_t)TD.getTypeAllocSize(Ty);
+
+ // Avoid malloc-ing zero bytes, use max()...
+ unsigned MemToAlloc = std::max(1U, NumElements * TypeSize);
+
+ // Allocate enough memory to hold the type...
+ void *Memory = malloc(MemToAlloc);
+
+ DOUT << "Allocated Type: " << *Ty << " (" << TypeSize << " bytes) x "
+ << NumElements << " (Total: " << MemToAlloc << ") at "
+ << uintptr_t(Memory) << '\n';
+
+ GenericValue Result = PTOGV(Memory);
+ assert(Result.PointerVal != 0 && "Null pointer returned by malloc!");
+ SetValue(&I, Result, SF);
+
+ if (I.getOpcode() == Instruction::Alloca)
+ ECStack.back().Allocas.add(Memory);
+}
+
+void Interpreter::visitFreeInst(FreeInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ assert(isa<PointerType>(I.getOperand(0)->getType()) && "Freeing nonptr?");
+ GenericValue Value = getOperandValue(I.getOperand(0), SF);
+ // TODO: Check to make sure memory is allocated
+ free(GVTOP(Value)); // Free memory
+}
+
+// executeGEPOperation - The workhorse for getelementptr.
+//
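+// The byte offset is accumulated one index at a time. As a hypothetical
+// illustration, "getelementptr {i32, double}* %p, i64 1, i32 1" contributes
+// sizeof({i32, double}) * 1 for the pointer index plus the struct layout
+// offset of field 1, and the result is %p advanced by that many bytes.
+//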
+GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I,
+ gep_type_iterator E,
+ ExecutionContext &SF) {
+ assert(isa<PointerType>(Ptr->getType()) &&
+ "Cannot getElementOffset of a nonpointer type!");
+
+ uint64_t Total = 0;
+
+ for (; I != E; ++I) {
+ if (const StructType *STy = dyn_cast<StructType>(*I)) {
+ const StructLayout *SLO = TD.getStructLayout(STy);
+
+ const ConstantInt *CPU = cast<ConstantInt>(I.getOperand());
+ unsigned Index = unsigned(CPU->getZExtValue());
+
+ Total += SLO->getElementOffset(Index);
+ } else {
+ const SequentialType *ST = cast<SequentialType>(*I);
+      // Get the index number for the array... which must be an i32 or i64.
+ GenericValue IdxGV = getOperandValue(I.getOperand(), SF);
+
+ int64_t Idx;
+ unsigned BitWidth =
+ cast<IntegerType>(I.getOperand()->getType())->getBitWidth();
+ if (BitWidth == 32)
+ Idx = (int64_t)(int32_t)IdxGV.IntVal.getZExtValue();
+ else {
+ assert(BitWidth == 64 && "Invalid index type for getelementptr");
+ Idx = (int64_t)IdxGV.IntVal.getZExtValue();
+ }
+ Total += TD.getTypeAllocSize(ST->getElementType())*Idx;
+ }
+ }
+
+ GenericValue Result;
+ Result.PointerVal = ((char*)getOperandValue(Ptr, SF).PointerVal) + Total;
+ DOUT << "GEP Index " << Total << " bytes.\n";
+ return Result;
+}
+
+void Interpreter::visitGetElementPtrInst(GetElementPtrInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, TheEE->executeGEPOperation(I.getPointerOperand(),
+ gep_type_begin(I), gep_type_end(I), SF), SF);
+}
+
+void Interpreter::visitLoadInst(LoadInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ GenericValue SRC = getOperandValue(I.getPointerOperand(), SF);
+ GenericValue *Ptr = (GenericValue*)GVTOP(SRC);
+ GenericValue Result;
+ LoadValueFromMemory(Result, Ptr, I.getType());
+ SetValue(&I, Result, SF);
+ if (I.isVolatile() && PrintVolatile)
+ cerr << "Volatile load " << I;
+}
+
+void Interpreter::visitStoreInst(StoreInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ GenericValue Val = getOperandValue(I.getOperand(0), SF);
+ GenericValue SRC = getOperandValue(I.getPointerOperand(), SF);
+ StoreValueToMemory(Val, (GenericValue *)GVTOP(SRC),
+ I.getOperand(0)->getType());
+ if (I.isVolatile() && PrintVolatile)
+ cerr << "Volatile store: " << I;
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+void Interpreter::visitCallSite(CallSite CS) {
+ ExecutionContext &SF = ECStack.back();
+
+ // Check to see if this is an intrinsic function call...
+ Function *F = CS.getCalledFunction();
+ if (F && F->isDeclaration ())
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::not_intrinsic:
+ break;
+ case Intrinsic::vastart: { // va_start
+ GenericValue ArgIndex;
+ ArgIndex.UIntPairVal.first = ECStack.size() - 1;
+ ArgIndex.UIntPairVal.second = 0;
+ SetValue(CS.getInstruction(), ArgIndex, SF);
+ return;
+ }
+ case Intrinsic::vaend: // va_end is a noop for the interpreter
+ return;
+ case Intrinsic::vacopy: // va_copy: dest = src
+ SetValue(CS.getInstruction(), getOperandValue(*CS.arg_begin(), SF), SF);
+ return;
+ default:
+ // If it is an unknown intrinsic function, use the intrinsic lowering
+ // class to transform it into hopefully tasty LLVM code.
+ //
+ BasicBlock::iterator me(CS.getInstruction());
+ BasicBlock *Parent = CS.getInstruction()->getParent();
+ bool atBegin(Parent->begin() == me);
+ if (!atBegin)
+ --me;
+ IL->LowerIntrinsicCall(cast<CallInst>(CS.getInstruction()));
+
+ // Restore the CurInst pointer to the first instruction newly inserted, if
+ // any.
+ if (atBegin) {
+ SF.CurInst = Parent->begin();
+ } else {
+ SF.CurInst = me;
+ ++SF.CurInst;
+ }
+ return;
+ }
+
+
+ SF.Caller = CS;
+ std::vector<GenericValue> ArgVals;
+ const unsigned NumArgs = SF.Caller.arg_size();
+ ArgVals.reserve(NumArgs);
+ uint16_t pNum = 1;
+ for (CallSite::arg_iterator i = SF.Caller.arg_begin(),
+ e = SF.Caller.arg_end(); i != e; ++i, ++pNum) {
+ Value *V = *i;
+ ArgVals.push_back(getOperandValue(V, SF));
+    // Promote all integral types whose size is < sizeof(i32) into i32.
+    // We do this by zero- or sign-extending the value as appropriate
+    // according to the parameter attributes.
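+    // For example, an i8 argument holding 0xFF becomes the i32 value
+    // 0xFFFFFFFF (-1) under 'signext', but 0x000000FF under 'zeroext'.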
+ const Type *Ty = V->getType();
+ if (Ty->isInteger() && (ArgVals.back().IntVal.getBitWidth() < 32)) {
+ if (CS.paramHasAttr(pNum, Attribute::ZExt))
+ ArgVals.back().IntVal = ArgVals.back().IntVal.zext(32);
+ else if (CS.paramHasAttr(pNum, Attribute::SExt))
+ ArgVals.back().IntVal = ArgVals.back().IntVal.sext(32);
+ }
+ }
+
+ // To handle indirect calls, we must get the pointer value from the argument
+ // and treat it as a function pointer.
+ GenericValue SRC = getOperandValue(SF.Caller.getCalledValue(), SF);
+ callFunction((Function*)GVTOP(SRC), ArgVals);
+}
+
+void Interpreter::visitShl(BinaryOperator &I) {
+ ExecutionContext &SF = ECStack.back();
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue Dest;
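+  // Shifting by the bit width or more is undefined in LLVM IR, so guard the
+  // shift amount here and (arbitrarily) return the first operand unchanged
+  // for oversized shifts. visitLShr and visitAShr use the same guard.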
+ if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
+ Dest.IntVal = Src1.IntVal.shl(Src2.IntVal.getZExtValue());
+ else
+ Dest.IntVal = Src1.IntVal;
+
+ SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitLShr(BinaryOperator &I) {
+ ExecutionContext &SF = ECStack.back();
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue Dest;
+ if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
+ Dest.IntVal = Src1.IntVal.lshr(Src2.IntVal.getZExtValue());
+ else
+ Dest.IntVal = Src1.IntVal;
+
+ SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitAShr(BinaryOperator &I) {
+ ExecutionContext &SF = ECStack.back();
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue Dest;
+ if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
+ Dest.IntVal = Src1.IntVal.ashr(Src2.IntVal.getZExtValue());
+ else
+ Dest.IntVal = Src1.IntVal;
+
+ SetValue(&I, Dest, SF);
+}
+
+GenericValue Interpreter::executeTruncInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ const IntegerType *DITy = cast<IntegerType>(DstTy);
+ unsigned DBitWidth = DITy->getBitWidth();
+ Dest.IntVal = Src.IntVal.trunc(DBitWidth);
+ return Dest;
+}
+
+GenericValue Interpreter::executeSExtInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ const IntegerType *DITy = cast<IntegerType>(DstTy);
+ unsigned DBitWidth = DITy->getBitWidth();
+ Dest.IntVal = Src.IntVal.sext(DBitWidth);
+ return Dest;
+}
+
+GenericValue Interpreter::executeZExtInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ const IntegerType *DITy = cast<IntegerType>(DstTy);
+ unsigned DBitWidth = DITy->getBitWidth();
+ Dest.IntVal = Src.IntVal.zext(DBitWidth);
+ return Dest;
+}
+
+GenericValue Interpreter::executeFPTruncInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ assert(SrcVal->getType() == Type::DoubleTy && DstTy == Type::FloatTy &&
+ "Invalid FPTrunc instruction");
+ Dest.FloatVal = (float) Src.DoubleVal;
+ return Dest;
+}
+
+GenericValue Interpreter::executeFPExtInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ assert(SrcVal->getType() == Type::FloatTy && DstTy == Type::DoubleTy &&
+ "Invalid FPTrunc instruction");
+ Dest.DoubleVal = (double) Src.FloatVal;
+ return Dest;
+}
+
+GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ const Type *SrcTy = SrcVal->getType();
+ uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ assert(SrcTy->isFloatingPoint() && "Invalid FPToUI instruction");
+
+ if (SrcTy->getTypeID() == Type::FloatTyID)
+ Dest.IntVal = APIntOps::RoundFloatToAPInt(Src.FloatVal, DBitWidth);
+ else
+ Dest.IntVal = APIntOps::RoundDoubleToAPInt(Src.DoubleVal, DBitWidth);
+ return Dest;
+}
+
+GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ const Type *SrcTy = SrcVal->getType();
+ uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ assert(SrcTy->isFloatingPoint() && "Invalid FPToSI instruction");
+
+ if (SrcTy->getTypeID() == Type::FloatTyID)
+ Dest.IntVal = APIntOps::RoundFloatToAPInt(Src.FloatVal, DBitWidth);
+ else
+ Dest.IntVal = APIntOps::RoundDoubleToAPInt(Src.DoubleVal, DBitWidth);
+ return Dest;
+}
+
+GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ assert(DstTy->isFloatingPoint() && "Invalid UIToFP instruction");
+
+ if (DstTy->getTypeID() == Type::FloatTyID)
+ Dest.FloatVal = APIntOps::RoundAPIntToFloat(Src.IntVal);
+ else
+ Dest.DoubleVal = APIntOps::RoundAPIntToDouble(Src.IntVal);
+ return Dest;
+}
+
+GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ assert(DstTy->isFloatingPoint() && "Invalid SIToFP instruction");
+
+ if (DstTy->getTypeID() == Type::FloatTyID)
+ Dest.FloatVal = APIntOps::RoundSignedAPIntToFloat(Src.IntVal);
+ else
+ Dest.DoubleVal = APIntOps::RoundSignedAPIntToDouble(Src.IntVal);
+ return Dest;
+}
+
+GenericValue Interpreter::executePtrToIntInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ assert(isa<PointerType>(SrcVal->getType()) && "Invalid PtrToInt instruction");
+
+ Dest.IntVal = APInt(DBitWidth, (intptr_t) Src.PointerVal);
+ return Dest;
+}
+
+GenericValue Interpreter::executeIntToPtrInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(isa<PointerType>(DstTy) && "Invalid IntToPtr instruction");
+
+ uint32_t PtrSize = TD.getPointerSizeInBits();
+ if (PtrSize != Src.IntVal.getBitWidth())
+ Src.IntVal = Src.IntVal.zextOrTrunc(PtrSize);
+
+ Dest.PointerVal = PointerTy(intptr_t(Src.IntVal.getZExtValue()));
+ return Dest;
+}
+
+GenericValue Interpreter::executeBitCastInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF) {
+
+ const Type *SrcTy = SrcVal->getType();
+ GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+ if (isa<PointerType>(DstTy)) {
+ assert(isa<PointerType>(SrcTy) && "Invalid BitCast");
+ Dest.PointerVal = Src.PointerVal;
+ } else if (DstTy->isInteger()) {
+ if (SrcTy == Type::FloatTy) {
+ Dest.IntVal.zext(sizeof(Src.FloatVal) * CHAR_BIT);
+ Dest.IntVal.floatToBits(Src.FloatVal);
+ } else if (SrcTy == Type::DoubleTy) {
+ Dest.IntVal.zext(sizeof(Src.DoubleVal) * CHAR_BIT);
+ Dest.IntVal.doubleToBits(Src.DoubleVal);
+ } else if (SrcTy->isInteger()) {
+ Dest.IntVal = Src.IntVal;
+ } else
+ assert(0 && "Invalid BitCast");
+ } else if (DstTy == Type::FloatTy) {
+ if (SrcTy->isInteger())
+ Dest.FloatVal = Src.IntVal.bitsToFloat();
+ else
+ Dest.FloatVal = Src.FloatVal;
+ } else if (DstTy == Type::DoubleTy) {
+ if (SrcTy->isInteger())
+ Dest.DoubleVal = Src.IntVal.bitsToDouble();
+ else
+ Dest.DoubleVal = Src.DoubleVal;
+ } else
+ assert(0 && "Invalid Bitcast");
+
+ return Dest;
+}
+
+void Interpreter::visitTruncInst(TruncInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeTruncInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitSExtInst(SExtInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeSExtInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitZExtInst(ZExtInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeZExtInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitFPTruncInst(FPTruncInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeFPTruncInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitFPExtInst(FPExtInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeFPExtInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitUIToFPInst(UIToFPInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeUIToFPInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitSIToFPInst(SIToFPInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeSIToFPInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitFPToUIInst(FPToUIInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeFPToUIInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitFPToSIInst(FPToSIInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeFPToSIInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitPtrToIntInst(PtrToIntInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executePtrToIntInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitIntToPtrInst(IntToPtrInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeIntToPtrInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitBitCastInst(BitCastInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, executeBitCastInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+#define IMPLEMENT_VAARG(TY) \
+ case Type::TY##TyID: Dest.TY##Val = Src.TY##Val; break
+
+void Interpreter::visitVAArgInst(VAArgInst &I) {
+ ExecutionContext &SF = ECStack.back();
+
+ // Get the incoming valist parameter. LLI treats the valist as a
+  // (ec-stack-depth, var-arg-index) pair.
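+  // For example, a valist created by va_start in the frame at index 3 of the
+  // execution stack starts out as the pair (3, 0); the second element indexes
+  // into that frame's VarArgs vector.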
+ GenericValue VAList = getOperandValue(I.getOperand(0), SF);
+ GenericValue Dest;
+ GenericValue Src = ECStack[VAList.UIntPairVal.first]
+ .VarArgs[VAList.UIntPairVal.second];
+ const Type *Ty = I.getType();
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID: Dest.IntVal = Src.IntVal;
+ IMPLEMENT_VAARG(Pointer);
+ IMPLEMENT_VAARG(Float);
+ IMPLEMENT_VAARG(Double);
+ default:
+ cerr << "Unhandled dest type for vaarg instruction: " << *Ty << "\n";
+ abort();
+ }
+
+ // Set the Value of this Instruction.
+ SetValue(&I, Dest, SF);
+
+ // Move the pointer to the next vararg.
+ ++VAList.UIntPairVal.second;
+}
+
+GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
+ ExecutionContext &SF) {
+ switch (CE->getOpcode()) {
+ case Instruction::Trunc:
+ return executeTruncInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::ZExt:
+ return executeZExtInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::SExt:
+ return executeSExtInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::FPTrunc:
+ return executeFPTruncInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::FPExt:
+ return executeFPExtInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::UIToFP:
+ return executeUIToFPInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::SIToFP:
+ return executeSIToFPInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::FPToUI:
+ return executeFPToUIInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::FPToSI:
+ return executeFPToSIInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::PtrToInt:
+ return executePtrToIntInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::IntToPtr:
+ return executeIntToPtrInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::BitCast:
+ return executeBitCastInst(CE->getOperand(0), CE->getType(), SF);
+ case Instruction::GetElementPtr:
+ return executeGEPOperation(CE->getOperand(0), gep_type_begin(CE),
+ gep_type_end(CE), SF);
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ return executeCmpInst(CE->getPredicate(),
+ getOperandValue(CE->getOperand(0), SF),
+ getOperandValue(CE->getOperand(1), SF),
+ CE->getOperand(0)->getType());
+ case Instruction::Select:
+ return executeSelectInst(getOperandValue(CE->getOperand(0), SF),
+ getOperandValue(CE->getOperand(1), SF),
+ getOperandValue(CE->getOperand(2), SF));
+ default :
+ break;
+ }
+
+  // The remaining cases compute their result into a GenericValue, so we
+  // initialize one here, compute it, and then return it.
+ GenericValue Op0 = getOperandValue(CE->getOperand(0), SF);
+ GenericValue Op1 = getOperandValue(CE->getOperand(1), SF);
+ GenericValue Dest;
+ const Type * Ty = CE->getOperand(0)->getType();
+ switch (CE->getOpcode()) {
+ case Instruction::Add: executeAddInst (Dest, Op0, Op1, Ty); break;
+ case Instruction::Sub: executeSubInst (Dest, Op0, Op1, Ty); break;
+ case Instruction::Mul: executeMulInst (Dest, Op0, Op1, Ty); break;
+ case Instruction::FDiv: executeFDivInst(Dest, Op0, Op1, Ty); break;
+ case Instruction::FRem: executeFRemInst(Dest, Op0, Op1, Ty); break;
+ case Instruction::SDiv: Dest.IntVal = Op0.IntVal.sdiv(Op1.IntVal); break;
+ case Instruction::UDiv: Dest.IntVal = Op0.IntVal.udiv(Op1.IntVal); break;
+ case Instruction::URem: Dest.IntVal = Op0.IntVal.urem(Op1.IntVal); break;
+ case Instruction::SRem: Dest.IntVal = Op0.IntVal.srem(Op1.IntVal); break;
+ case Instruction::And: Dest.IntVal = Op0.IntVal.And(Op1.IntVal); break;
+ case Instruction::Or: Dest.IntVal = Op0.IntVal.Or(Op1.IntVal); break;
+ case Instruction::Xor: Dest.IntVal = Op0.IntVal.Xor(Op1.IntVal); break;
+ case Instruction::Shl:
+ Dest.IntVal = Op0.IntVal.shl(Op1.IntVal.getZExtValue());
+ break;
+ case Instruction::LShr:
+ Dest.IntVal = Op0.IntVal.lshr(Op1.IntVal.getZExtValue());
+ break;
+ case Instruction::AShr:
+ Dest.IntVal = Op0.IntVal.ashr(Op1.IntVal.getZExtValue());
+ break;
+ default:
+ cerr << "Unhandled ConstantExpr: " << *CE << "\n";
+ abort();
+ return GenericValue();
+ }
+ return Dest;
+}
+
+GenericValue Interpreter::getOperandValue(Value *V, ExecutionContext &SF) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ return getConstantExprValue(CE, SF);
+ } else if (Constant *CPV = dyn_cast<Constant>(V)) {
+ return getConstantValue(CPV);
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ return PTOGV(getPointerToGlobal(GV));
+ } else {
+ return SF.Values[V];
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Dispatch and Execution Code
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// callFunction - Execute the specified function...
+//
+void Interpreter::callFunction(Function *F,
+ const std::vector<GenericValue> &ArgVals) {
+ assert((ECStack.empty() || ECStack.back().Caller.getInstruction() == 0 ||
+ ECStack.back().Caller.arg_size() == ArgVals.size()) &&
+ "Incorrect number of arguments passed into function call!");
+ // Make a new stack frame... and fill it in.
+ ECStack.push_back(ExecutionContext());
+ ExecutionContext &StackFrame = ECStack.back();
+ StackFrame.CurFunction = F;
+
+ // Special handling for external functions.
+ if (F->isDeclaration()) {
+ GenericValue Result = callExternalFunction (F, ArgVals);
+ // Simulate a 'ret' instruction of the appropriate type.
+ popStackAndReturnValueToCaller (F->getReturnType (), Result);
+ return;
+ }
+
+ // Get pointers to first LLVM BB & Instruction in function.
+ StackFrame.CurBB = F->begin();
+ StackFrame.CurInst = StackFrame.CurBB->begin();
+
+ // Run through the function arguments and initialize their values...
+ assert((ArgVals.size() == F->arg_size() ||
+ (ArgVals.size() > F->arg_size() && F->getFunctionType()->isVarArg()))&&
+ "Invalid number of values passed to function invocation!");
+
+ // Handle non-varargs arguments...
+ unsigned i = 0;
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+ AI != E; ++AI, ++i)
+ SetValue(AI, ArgVals[i], StackFrame);
+
+ // Handle varargs arguments...
+ StackFrame.VarArgs.assign(ArgVals.begin()+i, ArgVals.end());
+}
+
+
+void Interpreter::run() {
+ while (!ECStack.empty()) {
+ // Interpret a single instruction & increment the "PC".
+ ExecutionContext &SF = ECStack.back(); // Current stack frame
+ Instruction &I = *SF.CurInst++; // Increment before execute
+
+ // Track the number of dynamic instructions executed.
+ ++NumDynamicInsts;
+
+ DOUT << "About to interpret: " << I;
+ visit(I); // Dispatch to one of the visit* methods...
+#if 0
+ // This is not safe, as visiting the instruction could lower it and free I.
+#ifndef NDEBUG
+ if (!isa<CallInst>(I) && !isa<InvokeInst>(I) &&
+ I.getType() != Type::VoidTy) {
+ DOUT << " --> ";
+ const GenericValue &Val = SF.Values[&I];
+ switch (I.getType()->getTypeID()) {
+ default: assert(0 && "Invalid GenericValue Type");
+ case Type::VoidTyID: DOUT << "void"; break;
+ case Type::FloatTyID: DOUT << "float " << Val.FloatVal; break;
+ case Type::DoubleTyID: DOUT << "double " << Val.DoubleVal; break;
+ case Type::PointerTyID: DOUT << "void* " << intptr_t(Val.PointerVal);
+ break;
+ case Type::IntegerTyID:
+ DOUT << "i" << Val.IntVal.getBitWidth() << " "
+ << Val.IntVal.toStringUnsigned(10)
+ << " (0x" << Val.IntVal.toStringUnsigned(16) << ")\n";
+ break;
+ }
+ }
+#endif
+#endif
+ }
+}
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
new file mode 100644
index 0000000..160f1ba
--- /dev/null
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -0,0 +1,542 @@
+//===-- ExternalFunctions.cpp - Implement External Functions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to deal with invoking "external" functions, as
+// well as code that implements "exported" external functions.
+//
+// There are currently two mechanisms for handling external functions in the
+// Interpreter. The first is to implement lle_* wrapper functions that are
+// specific to well-known library functions which manually translate the
+// arguments from GenericValues and make the call. If such a wrapper does
+// not exist, and libffi is available, then the Interpreter will attempt to
+// invoke the function using libffi, after finding its address.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Interpreter.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Config/config.h" // Detect libffi
+#include "llvm/Support/Streams.h"
+#include "llvm/System/DynamicLibrary.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/ManagedStatic.h"
+#include <csignal>
+#include <cstdio>
+#include <map>
+#include <cmath>
+#include <cstring>
+
+#ifdef HAVE_FFI_CALL
+#ifdef HAVE_FFI_H
+#include <ffi.h>
+#define USE_LIBFFI
+#elif HAVE_FFI_FFI_H
+#include <ffi/ffi.h>
+#define USE_LIBFFI
+#endif
+#endif
+
+using namespace llvm;
+
+typedef GenericValue (*ExFunc)(const FunctionType *,
+ const std::vector<GenericValue> &);
+static ManagedStatic<std::map<const Function *, ExFunc> > ExportedFunctions;
+static std::map<std::string, ExFunc> FuncNames;
+
+#ifdef USE_LIBFFI
+typedef void (*RawFunc)(void);
+static ManagedStatic<std::map<const Function *, RawFunc> > RawFunctions;
+#endif
+
+static Interpreter *TheInterpreter;
+
+static char getTypeID(const Type *Ty) {
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: return 'V';
+ case Type::IntegerTyID:
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ case 1: return 'o';
+ case 8: return 'B';
+ case 16: return 'S';
+ case 32: return 'I';
+ case 64: return 'L';
+ default: return 'N';
+ }
+ case Type::FloatTyID: return 'F';
+ case Type::DoubleTyID: return 'D';
+ case Type::PointerTyID: return 'P';
+ case Type::FunctionTyID:return 'M';
+ case Type::StructTyID: return 'T';
+ case Type::ArrayTyID: return 'A';
+ case Type::OpaqueTyID: return 'O';
+ default: return 'U';
+ }
+}
+
+// Try to find the address of an external function, given a Function object.
+// Please note that the interpreter doesn't know how to assemble a real call
+// in the general case (that is the JIT's job); that's why it assumes that
+// all external functions have the same (and fairly "general") signature.
+// The typical examples of such functions are the "lle_X_" ones.
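+// For example, a function "void foo(i32)" is first looked up under the name
+// "lle_VI_foo" ('V' encodes the void return type, 'I' the i32 parameter),
+// and then under the generic name "lle_X_foo".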
+static ExFunc lookupFunction(const Function *F) {
+ // Function not found, look it up... start by figuring out what the
+ // composite function name should be.
+ std::string ExtName = "lle_";
+ const FunctionType *FT = F->getFunctionType();
+ for (unsigned i = 0, e = FT->getNumContainedTypes(); i != e; ++i)
+ ExtName += getTypeID(FT->getContainedType(i));
+ ExtName += "_" + F->getName();
+
+ ExFunc FnPtr = FuncNames[ExtName];
+ if (FnPtr == 0)
+ FnPtr = FuncNames["lle_X_"+F->getName()];
+ if (FnPtr == 0) // Try calling a generic function... if it exists...
+ FnPtr = (ExFunc)(intptr_t)sys::DynamicLibrary::SearchForAddressOfSymbol(
+ ("lle_X_"+F->getName()).c_str());
+ if (FnPtr != 0)
+ ExportedFunctions->insert(std::make_pair(F, FnPtr)); // Cache for later
+ return FnPtr;
+}
+
+#ifdef USE_LIBFFI
+static ffi_type *ffiTypeFor(const Type *Ty) {
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: return &ffi_type_void;
+ case Type::IntegerTyID:
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ case 8: return &ffi_type_sint8;
+ case 16: return &ffi_type_sint16;
+ case 32: return &ffi_type_sint32;
+ case 64: return &ffi_type_sint64;
+    }
+    break; // Unusual integer widths reach the unmapped-type error below.
+ case Type::FloatTyID: return &ffi_type_float;
+ case Type::DoubleTyID: return &ffi_type_double;
+ case Type::PointerTyID: return &ffi_type_pointer;
+ default: break;
+ }
+ // TODO: Support other types such as StructTyID, ArrayTyID, OpaqueTyID, etc.
+ cerr << "Type could not be mapped for use with libffi.\n";
+ abort();
+ return NULL;
+}
+
+static void *ffiValueFor(const Type *Ty, const GenericValue &AV,
+ void *ArgDataPtr) {
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID:
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ case 8: {
+ int8_t *I8Ptr = (int8_t *) ArgDataPtr;
+ *I8Ptr = (int8_t) AV.IntVal.getZExtValue();
+ return ArgDataPtr;
+ }
+ case 16: {
+ int16_t *I16Ptr = (int16_t *) ArgDataPtr;
+ *I16Ptr = (int16_t) AV.IntVal.getZExtValue();
+ return ArgDataPtr;
+ }
+ case 32: {
+ int32_t *I32Ptr = (int32_t *) ArgDataPtr;
+ *I32Ptr = (int32_t) AV.IntVal.getZExtValue();
+ return ArgDataPtr;
+ }
+ case 64: {
+ int64_t *I64Ptr = (int64_t *) ArgDataPtr;
+ *I64Ptr = (int64_t) AV.IntVal.getZExtValue();
+ return ArgDataPtr;
+ }
+    }
+    break; // Unusual integer widths reach the unmapped-value error below.
+ case Type::FloatTyID: {
+ float *FloatPtr = (float *) ArgDataPtr;
+    *FloatPtr = AV.FloatVal;
+ return ArgDataPtr;
+ }
+ case Type::DoubleTyID: {
+ double *DoublePtr = (double *) ArgDataPtr;
+ *DoublePtr = AV.DoubleVal;
+ return ArgDataPtr;
+ }
+ case Type::PointerTyID: {
+ void **PtrPtr = (void **) ArgDataPtr;
+ *PtrPtr = GVTOP(AV);
+ return ArgDataPtr;
+ }
+ default: break;
+ }
+ // TODO: Support other types such as StructTyID, ArrayTyID, OpaqueTyID, etc.
+ cerr << "Type value could not be mapped for use with libffi.\n";
+ abort();
+ return NULL;
+}
+
+static bool ffiInvoke(RawFunc Fn, Function *F,
+ const std::vector<GenericValue> &ArgVals,
+ const TargetData *TD, GenericValue &Result) {
+ ffi_cif cif;
+ const FunctionType *FTy = F->getFunctionType();
+ const unsigned NumArgs = F->arg_size();
+
+ // TODO: We don't have type information about the remaining arguments, because
+ // this information is never passed into ExecutionEngine::runFunction().
+ if (ArgVals.size() > NumArgs && F->isVarArg()) {
+ cerr << "Calling external var arg function '" << F->getName()
+ << "' is not supported by the Interpreter.\n";
+ abort();
+ }
+
+ unsigned ArgBytes = 0;
+
+ std::vector<ffi_type*> args(NumArgs);
+ for (Function::const_arg_iterator A = F->arg_begin(), E = F->arg_end();
+ A != E; ++A) {
+ const unsigned ArgNo = A->getArgNo();
+ const Type *ArgTy = FTy->getParamType(ArgNo);
+ args[ArgNo] = ffiTypeFor(ArgTy);
+ ArgBytes += TD->getTypeStoreSize(ArgTy);
+ }
+
+ uint8_t *ArgData = (uint8_t*) alloca(ArgBytes);
+ uint8_t *ArgDataPtr = ArgData;
+ std::vector<void*> values(NumArgs);
+ for (Function::const_arg_iterator A = F->arg_begin(), E = F->arg_end();
+ A != E; ++A) {
+ const unsigned ArgNo = A->getArgNo();
+ const Type *ArgTy = FTy->getParamType(ArgNo);
+ values[ArgNo] = ffiValueFor(ArgTy, ArgVals[ArgNo], ArgDataPtr);
+ ArgDataPtr += TD->getTypeStoreSize(ArgTy);
+ }
+
+ const Type *RetTy = FTy->getReturnType();
+ ffi_type *rtype = ffiTypeFor(RetTy);
+
+ if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, &args[0]) == FFI_OK) {
+ void *ret = NULL;
+ if (RetTy->getTypeID() != Type::VoidTyID)
+ ret = alloca(TD->getTypeStoreSize(RetTy));
+ ffi_call(&cif, Fn, ret, &values[0]);
+ switch (RetTy->getTypeID()) {
+ case Type::IntegerTyID:
+ switch (cast<IntegerType>(RetTy)->getBitWidth()) {
+ case 8: Result.IntVal = APInt(8 , *(int8_t *) ret); break;
+ case 16: Result.IntVal = APInt(16, *(int16_t*) ret); break;
+ case 32: Result.IntVal = APInt(32, *(int32_t*) ret); break;
+ case 64: Result.IntVal = APInt(64, *(int64_t*) ret); break;
+ }
+ break;
+ case Type::FloatTyID: Result.FloatVal = *(float *) ret; break;
+ case Type::DoubleTyID: Result.DoubleVal = *(double*) ret; break;
+ case Type::PointerTyID: Result.PointerVal = *(void **) ret; break;
+ default: break;
+ }
+ return true;
+ }
+
+ return false;
+}
+#endif // USE_LIBFFI
+
+GenericValue Interpreter::callExternalFunction(Function *F,
+ const std::vector<GenericValue> &ArgVals) {
+ TheInterpreter = this;
+
+ // Do a lookup to see if the function is in our cache... this should just be a
+ // deferred annotation!
+ std::map<const Function *, ExFunc>::iterator FI = ExportedFunctions->find(F);
+ if (ExFunc Fn = (FI == ExportedFunctions->end()) ? lookupFunction(F)
+ : FI->second)
+ return Fn(F->getFunctionType(), ArgVals);
+
+#ifdef USE_LIBFFI
+ std::map<const Function *, RawFunc>::iterator RF = RawFunctions->find(F);
+ RawFunc RawFn;
+ if (RF == RawFunctions->end()) {
+ RawFn = (RawFunc)(intptr_t)
+ sys::DynamicLibrary::SearchForAddressOfSymbol(F->getName());
+ if (RawFn != 0)
+ RawFunctions->insert(std::make_pair(F, RawFn)); // Cache for later
+ } else {
+ RawFn = RF->second;
+ }
+
+ GenericValue Result;
+ if (RawFn != 0 && ffiInvoke(RawFn, F, ArgVals, getTargetData(), Result))
+ return Result;
+#endif // USE_LIBFFI
+
+ cerr << "Tried to execute an unknown external function: "
+ << F->getType()->getDescription() << " " << F->getName() << "\n";
+ if (F->getName() != "__main")
+ abort();
+ return GenericValue();
+}
+
+
+//===----------------------------------------------------------------------===//
+// Functions "exported" to the running application...
+//
+extern "C" { // Don't add C++ manglings to llvm mangling :)
+
+// void atexit(Function*)
+GenericValue lle_X_atexit(const FunctionType *FT,
+ const std::vector<GenericValue> &Args) {
+ assert(Args.size() == 1);
+ TheInterpreter->addAtExitHandler((Function*)GVTOP(Args[0]));
+ GenericValue GV;
+ GV.IntVal = 0;
+ return GV;
+}
+
+// void exit(int)
+GenericValue lle_X_exit(const FunctionType *FT,
+ const std::vector<GenericValue> &Args) {
+ TheInterpreter->exitCalled(Args[0]);
+ return GenericValue();
+}
+
+// void abort(void)
+GenericValue lle_X_abort(const FunctionType *FT,
+ const std::vector<GenericValue> &Args) {
+ raise (SIGABRT);
+ return GenericValue();
+}
+
+// int sprintf(char *, const char *, ...) - a very rough implementation to make
+// output useful.
+GenericValue lle_X_sprintf(const FunctionType *FT,
+ const std::vector<GenericValue> &Args) {
+ char *OutputBuffer = (char *)GVTOP(Args[0]);
+ const char *FmtStr = (const char *)GVTOP(Args[1]);
+ unsigned ArgNo = 2;
+
+  // sprintf should return the number of characters written. This is
+  // completely incorrect, but close enough for now.
+ GenericValue GV;
+ GV.IntVal = APInt(32, strlen(FmtStr));
+ while (1) {
+ switch (*FmtStr) {
+ case 0: return GV; // Null terminator...
+ default: // Normal nonspecial character
+ sprintf(OutputBuffer++, "%c", *FmtStr++);
+ break;
+ case '\\': { // Handle escape codes
+ sprintf(OutputBuffer, "%c%c", *FmtStr, *(FmtStr+1));
+ FmtStr += 2; OutputBuffer += 2;
+ break;
+ }
+ case '%': { // Handle format specifiers
+ char FmtBuf[100] = "", Buffer[1000] = "";
+ char *FB = FmtBuf;
+ *FB++ = *FmtStr++;
+ char Last = *FB++ = *FmtStr++;
+ unsigned HowLong = 0;
+ while (Last != 'c' && Last != 'd' && Last != 'i' && Last != 'u' &&
+ Last != 'o' && Last != 'x' && Last != 'X' && Last != 'e' &&
+ Last != 'E' && Last != 'g' && Last != 'G' && Last != 'f' &&
+ Last != 'p' && Last != 's' && Last != '%') {
+ if (Last == 'l' || Last == 'L') HowLong++; // Keep track of l's
+ Last = *FB++ = *FmtStr++;
+ }
+ *FB = 0;
+
+ switch (Last) {
+ case '%':
+ strcpy(Buffer, "%"); break;
+ case 'c':
+ sprintf(Buffer, FmtBuf, uint32_t(Args[ArgNo++].IntVal.getZExtValue()));
+ break;
+ case 'd': case 'i':
+ case 'u': case 'o':
+ case 'x': case 'X':
+ if (HowLong >= 1) {
+ if (HowLong == 1 &&
+ TheInterpreter->getTargetData()->getPointerSizeInBits() == 64 &&
+ sizeof(long) < sizeof(int64_t)) {
+ // Make sure we use %lld with a 64 bit argument because we might be
+ // compiling LLI on a 32 bit compiler.
+ unsigned Size = strlen(FmtBuf);
+ FmtBuf[Size] = FmtBuf[Size-1];
+ FmtBuf[Size+1] = 0;
+ FmtBuf[Size-1] = 'l';
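+              // e.g. "%ld" from the interpreted program becomes "%lld" here
+              // before being handed to the host's sprintf.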
+ }
+ sprintf(Buffer, FmtBuf, Args[ArgNo++].IntVal.getZExtValue());
+ } else
+ sprintf(Buffer, FmtBuf,uint32_t(Args[ArgNo++].IntVal.getZExtValue()));
+ break;
+ case 'e': case 'E': case 'g': case 'G': case 'f':
+ sprintf(Buffer, FmtBuf, Args[ArgNo++].DoubleVal); break;
+ case 'p':
+ sprintf(Buffer, FmtBuf, (void*)GVTOP(Args[ArgNo++])); break;
+ case 's':
+ sprintf(Buffer, FmtBuf, (char*)GVTOP(Args[ArgNo++])); break;
+ default: cerr << "<unknown printf code '" << *FmtStr << "'!>";
+ ArgNo++; break;
+ }
+ strcpy(OutputBuffer, Buffer);
+ OutputBuffer += strlen(Buffer);
+ }
+ break;
+ }
+ }
+ return GV;
+}
+
+// int printf(const char *, ...) - a very rough implementation to make output
+// useful.
+GenericValue lle_X_printf(const FunctionType *FT,
+ const std::vector<GenericValue> &Args) {
+ char Buffer[10000];
+ std::vector<GenericValue> NewArgs;
+ NewArgs.push_back(PTOGV((void*)&Buffer[0]));
+ NewArgs.insert(NewArgs.end(), Args.begin(), Args.end());
+ GenericValue GV = lle_X_sprintf(FT, NewArgs);
+ cout << Buffer;
+ return GV;
+}
+
+static void ByteswapSCANFResults(const char *Fmt, void *Arg0, void *Arg1,
+ void *Arg2, void *Arg3, void *Arg4, void *Arg5,
+ void *Arg6, void *Arg7, void *Arg8) {
+ void *Args[] = { Arg0, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, 0 };
+
+ // Loop over the format string, munging read values as appropriate (performs
+ // byteswaps as necessary).
+ unsigned ArgNo = 0;
+ while (*Fmt) {
+ if (*Fmt++ == '%') {
+ // Read any flag characters that may be present...
+ bool Suppress = false;
+ bool Half = false;
+ bool Long = false;
+ bool LongLong = false; // long long or long double
+
+ while (1) {
+ switch (*Fmt++) {
+ case '*': Suppress = true; break;
+ case 'a': /*Allocate = true;*/ break; // We don't need to track this
+ case 'h': Half = true; break;
+ case 'l': Long = true; break;
+ case 'q':
+ case 'L': LongLong = true; break;
+ default:
+ if (Fmt[-1] > '9' || Fmt[-1] < '0') // Ignore field width specs
+ goto Out;
+ }
+ }
+ Out:
+
+ // Read the conversion character
+ if (!Suppress && Fmt[-1] != '%') { // Nothing to do?
+ unsigned Size = 0;
+ const Type *Ty = 0;
+
+ switch (Fmt[-1]) {
+ case 'i': case 'o': case 'u': case 'x': case 'X': case 'n': case 'p':
+ case 'd':
+ if (Long || LongLong) {
+ Size = 8; Ty = Type::Int64Ty;
+ } else if (Half) {
+ Size = 4; Ty = Type::Int16Ty;
+ } else {
+ Size = 4; Ty = Type::Int32Ty;
+ }
+ break;
+
+ case 'e': case 'g': case 'E':
+ case 'f':
+ if (Long || LongLong) {
+ Size = 8; Ty = Type::DoubleTy;
+ } else {
+ Size = 4; Ty = Type::FloatTy;
+ }
+ break;
+
+ case 's': case 'c': case '[': // No byteswap needed
+ Size = 1;
+ Ty = Type::Int8Ty;
+ break;
+
+ default: break;
+ }
+
+ if (Size) {
+ GenericValue GV;
+ void *Arg = Args[ArgNo++];
+ memcpy(&GV, Arg, Size);
+ TheInterpreter->StoreValueToMemory(GV, (GenericValue*)Arg, Ty);
+ }
+ }
+ }
+ }
+}
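+
+// A worked example (illustrative, not part of the original source): after
+//   sscanf("42", "%d", Arg0)
+// the host writes a native int through Arg0. The loop above then copies
+// those four bytes back into a GenericValue and re-stores them with
+// StoreValueToMemory as an Int32Ty value, so the result ends up with the
+// interpreter's notion of byte order rather than the host's.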
+
+// int sscanf(const char *str, const char *format, ...);
+GenericValue lle_X_sscanf(const FunctionType *FT,
+ const std::vector<GenericValue> &args) {
+ assert(args.size() < 10 && "Only handle up to 10 args to sscanf right now!");
+
+ char *Args[10];
+ for (unsigned i = 0; i < args.size(); ++i)
+ Args[i] = (char*)GVTOP(args[i]);
+
+ GenericValue GV;
+ GV.IntVal = APInt(32, sscanf(Args[0], Args[1], Args[2], Args[3], Args[4],
+ Args[5], Args[6], Args[7], Args[8], Args[9]));
+ ByteswapSCANFResults(Args[1], Args[2], Args[3], Args[4],
+ Args[5], Args[6], Args[7], Args[8], Args[9], 0);
+ return GV;
+}
+
+// int scanf(const char *format, ...);
+GenericValue lle_X_scanf(const FunctionType *FT,
+ const std::vector<GenericValue> &args) {
+ assert(args.size() < 10 && "Only handle up to 10 args to scanf right now!");
+
+ char *Args[10];
+ for (unsigned i = 0; i < args.size(); ++i)
+ Args[i] = (char*)GVTOP(args[i]);
+
+ GenericValue GV;
+ GV.IntVal = APInt(32, scanf( Args[0], Args[1], Args[2], Args[3], Args[4],
+ Args[5], Args[6], Args[7], Args[8], Args[9]));
+ ByteswapSCANFResults(Args[0], Args[1], Args[2], Args[3], Args[4],
+ Args[5], Args[6], Args[7], Args[8], Args[9]);
+ return GV;
+}
+
+// int fprintf(FILE *, const char *, ...) - a very rough implementation to make
+// output useful.
+GenericValue lle_X_fprintf(const FunctionType *FT,
+ const std::vector<GenericValue> &Args) {
+ assert(Args.size() >= 2);
+ char Buffer[10000];
+ std::vector<GenericValue> NewArgs;
+ NewArgs.push_back(PTOGV(Buffer));
+ NewArgs.insert(NewArgs.end(), Args.begin()+1, Args.end());
+ GenericValue GV = lle_X_sprintf(FT, NewArgs);
+
+ fputs(Buffer, (FILE *) GVTOP(Args[0]));
+ return GV;
+}
+
+} // End extern "C"
+
+
+void Interpreter::initializeExternalFunctions() {
+ FuncNames["lle_X_atexit"] = lle_X_atexit;
+ FuncNames["lle_X_exit"] = lle_X_exit;
+ FuncNames["lle_X_abort"] = lle_X_abort;
+
+ FuncNames["lle_X_printf"] = lle_X_printf;
+ FuncNames["lle_X_sprintf"] = lle_X_sprintf;
+ FuncNames["lle_X_sscanf"] = lle_X_sscanf;
+ FuncNames["lle_X_scanf"] = lle_X_scanf;
+ FuncNames["lle_X_fprintf"] = lle_X_fprintf;
+}
+
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/lib/ExecutionEngine/Interpreter/Interpreter.cpp
new file mode 100644
index 0000000..ded65d5
--- /dev/null
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.cpp
@@ -0,0 +1,104 @@
+//===- Interpreter.cpp - Top-Level LLVM Interpreter Implementation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the top-level functionality for the LLVM interpreter.
+// This interpreter is designed to be a very simple, portable, inefficient
+// interpreter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Interpreter.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/ModuleProvider.h"
+#include <cstring>
+using namespace llvm;
+
+namespace {
+
+static struct RegisterInterp {
+ RegisterInterp() { Interpreter::Register(); }
+} InterpRegistrator;
+
+}
+
+namespace llvm {
+ void LinkInInterpreter() {
+ }
+}
+
+/// create - Create a new interpreter object. This fails (returning null)
+/// only if the module cannot be materialized.
+///
+ExecutionEngine *Interpreter::create(ModuleProvider *MP, std::string* ErrStr,
+ CodeGenOpt::Level OptLevel /*unused*/) {
+ // Tell this ModuleProvider to materialize the module.
+ if (!MP->materializeModule(ErrStr))
+ // We got an error, just return 0
+ return 0;
+
+ return new Interpreter(MP);
+}
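+
+// A minimal usage sketch (illustrative; assumes `MP` wraps an already parsed
+// module, and is not part of the original file):
+//
+//   std::string Err;
+//   ExecutionEngine *EE = Interpreter::create(MP, &Err);
+//   if (!EE) { /* materialization failed; Err describes why */ }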
+
+//===----------------------------------------------------------------------===//
+// Interpreter ctor - Initialize stuff
+//
+Interpreter::Interpreter(ModuleProvider *M)
+ : ExecutionEngine(M), TD(M->getModule()) {
+
+ memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped));
+ setTargetData(&TD);
+ // Initialize the "backend"
+ initializeExecutionEngine();
+ initializeExternalFunctions();
+ emitGlobals();
+
+ IL = new IntrinsicLowering(TD);
+}
+
+Interpreter::~Interpreter() {
+ delete IL;
+}
+
+void Interpreter::runAtExitHandlers () {
+ while (!AtExitHandlers.empty()) {
+ callFunction(AtExitHandlers.back(), std::vector<GenericValue>());
+ AtExitHandlers.pop_back();
+ run();
+ }
+}
+
+/// run - Start execution with the specified function and arguments.
+///
+GenericValue
+Interpreter::runFunction(Function *F,
+ const std::vector<GenericValue> &ArgValues) {
+ assert (F && "Function *F was null at entry to run()");
+
+ // Try extra hard not to pass extra args to a function that isn't
+ // expecting them. C programmers frequently bend the rules and
+ // declare main() with fewer parameters than it actually gets
+ // passed, and the interpreter barfs if you pass a function more
+ // parameters than it is declared to take. This does not attempt to
+ // take into account gratuitous differences in declared types,
+ // though.
+ std::vector<GenericValue> ActualArgs;
+ const unsigned ArgCount = F->getFunctionType()->getNumParams();
+ for (unsigned i = 0; i < ArgCount; ++i)
+ ActualArgs.push_back(ArgValues[i]);
+
+ // Set up the function call.
+ callFunction(F, ActualArgs);
+
+ // Start executing the function.
+ run();
+
+ return ExitValue;
+}
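+
+// Example of the trimming above (illustrative): if a program declares
+// `int main()` but the driver passes {argc, argv, envp}, ArgCount is 0, so
+// ActualArgs stays empty and callFunction never sees the extra values.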
+
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.h b/lib/ExecutionEngine/Interpreter/Interpreter.h
new file mode 100644
index 0000000..8a285ec
--- /dev/null
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -0,0 +1,241 @@
+//===-- Interpreter.h ------------------------------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines the interpreter structure
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLI_INTERPRETER_H
+#define LLI_INTERPRETER_H
+
+#include "llvm/Function.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class IntrinsicLowering;
+struct FunctionInfo;
+template<typename T> class generic_gep_type_iterator;
+class ConstantExpr;
+typedef generic_gep_type_iterator<User::const_op_iterator> gep_type_iterator;
+
+
+// AllocaHolder - Object to track all of the blocks of memory allocated by
+// alloca. When the function returns, this object is popped off the execution
+// stack; its destructor then runs and frees all of the alloca'd memory.
+//
+class AllocaHolder {
+ friend class AllocaHolderHandle;
+ std::vector<void*> Allocations;
+ unsigned RefCnt;
+public:
+ AllocaHolder() : RefCnt(0) {}
+ void add(void *mem) { Allocations.push_back(mem); }
+ ~AllocaHolder() {
+ for (unsigned i = 0; i < Allocations.size(); ++i)
+ free(Allocations[i]);
+ }
+};
+
+// AllocaHolderHandle gives AllocaHolder value semantics so we can stick it into
+// a vector...
+//
+class AllocaHolderHandle {
+ AllocaHolder *H;
+public:
+ AllocaHolderHandle() : H(new AllocaHolder()) { H->RefCnt++; }
+ AllocaHolderHandle(const AllocaHolderHandle &AH) : H(AH.H) { H->RefCnt++; }
+ ~AllocaHolderHandle() { if (--H->RefCnt == 0) delete H; }
+
+ void add(void *mem) { H->add(mem); }
+};
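+
+// Illustrative use of the handle's reference counting (not in the original
+// source):
+//
+//   AllocaHolderHandle A;          // underlying AllocaHolder, RefCnt == 1
+//   { AllocaHolderHandle B = A; }  // copy bumps RefCnt to 2; B's dtor drops it
+//   // when A is destroyed, RefCnt reaches 0 and the alloca'd blocks are freed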
+
+typedef std::vector<GenericValue> ValuePlaneTy;
+
+// ExecutionContext struct - This struct represents one stack frame currently
+// executing.
+//
+struct ExecutionContext {
+ Function *CurFunction;// The currently executing function
+ BasicBlock *CurBB; // The currently executing BB
+ BasicBlock::iterator CurInst; // The next instruction to execute
+ std::map<Value *, GenericValue> Values; // LLVM values used in this invocation
+ std::vector<GenericValue> VarArgs; // Values passed through an ellipsis
+ CallSite Caller; // Holds the call that called subframes.
+ // NULL if main func or debugger invoked fn
+ AllocaHolderHandle Allocas; // Track memory allocated by alloca
+};
+
+// Interpreter - This class represents the entirety of the interpreter.
+//
+class Interpreter : public ExecutionEngine, public InstVisitor<Interpreter> {
+ GenericValue ExitValue; // The return value of the called function
+ TargetData TD;
+ IntrinsicLowering *IL;
+
+ // The runtime stack of executing code. The top of the stack is the current
+ // function record.
+ std::vector<ExecutionContext> ECStack;
+
+ // AtExitHandlers - List of functions to call when the program exits,
+ // registered with the atexit() library function.
+ std::vector<Function*> AtExitHandlers;
+
+public:
+ explicit Interpreter(ModuleProvider *M);
+ ~Interpreter();
+
+ /// runAtExitHandlers - Run any functions registered by the program's calls to
+ /// atexit(3), which we intercept and store in AtExitHandlers.
+ ///
+ void runAtExitHandlers();
+
+ static void Register() {
+ InterpCtor = create;
+ }
+
+ /// create - Create an interpreter ExecutionEngine. Returns null only if
+ /// the module cannot be materialized.
+ ///
+ static ExecutionEngine *create(ModuleProvider *M, std::string *ErrorStr = 0,
+ CodeGenOpt::Level = CodeGenOpt::Default);
+
+ /// run - Start execution with the specified function and arguments.
+ ///
+ virtual GenericValue runFunction(Function *F,
+ const std::vector<GenericValue> &ArgValues);
+
+ /// recompileAndRelinkFunction - For the interpreter, functions are always
+ /// up-to-date.
+ ///
+ virtual void *recompileAndRelinkFunction(Function *F) {
+ return getPointerToFunction(F);
+ }
+
+ /// freeMachineCodeForFunction - The interpreter does not generate any code.
+ ///
+ void freeMachineCodeForFunction(Function *F) { }
+
+ // Methods used to execute code:
+ // Place a call on the stack
+ void callFunction(Function *F, const std::vector<GenericValue> &ArgVals);
+ void run(); // Execute instructions until nothing left to do
+
+ // Opcode Implementations
+ void visitReturnInst(ReturnInst &I);
+ void visitBranchInst(BranchInst &I);
+ void visitSwitchInst(SwitchInst &I);
+
+ void visitBinaryOperator(BinaryOperator &I);
+ void visitICmpInst(ICmpInst &I);
+ void visitFCmpInst(FCmpInst &I);
+ void visitAllocationInst(AllocationInst &I);
+ void visitFreeInst(FreeInst &I);
+ void visitLoadInst(LoadInst &I);
+ void visitStoreInst(StoreInst &I);
+ void visitGetElementPtrInst(GetElementPtrInst &I);
+ void visitPHINode(PHINode &PN) { assert(0 && "PHI nodes already handled!"); }
+ void visitTruncInst(TruncInst &I);
+ void visitZExtInst(ZExtInst &I);
+ void visitSExtInst(SExtInst &I);
+ void visitFPTruncInst(FPTruncInst &I);
+ void visitFPExtInst(FPExtInst &I);
+ void visitUIToFPInst(UIToFPInst &I);
+ void visitSIToFPInst(SIToFPInst &I);
+ void visitFPToUIInst(FPToUIInst &I);
+ void visitFPToSIInst(FPToSIInst &I);
+ void visitPtrToIntInst(PtrToIntInst &I);
+ void visitIntToPtrInst(IntToPtrInst &I);
+ void visitBitCastInst(BitCastInst &I);
+ void visitSelectInst(SelectInst &I);
+
+
+ void visitCallSite(CallSite CS);
+ void visitCallInst(CallInst &I) { visitCallSite (CallSite (&I)); }
+ void visitInvokeInst(InvokeInst &I) { visitCallSite (CallSite (&I)); }
+ void visitUnwindInst(UnwindInst &I);
+ void visitUnreachableInst(UnreachableInst &I);
+
+ void visitShl(BinaryOperator &I);
+ void visitLShr(BinaryOperator &I);
+ void visitAShr(BinaryOperator &I);
+
+ void visitVAArgInst(VAArgInst &I);
+ void visitInstruction(Instruction &I) {
+ cerr << I;
+ assert(0 && "Instruction not interpretable yet!");
+ }
+
+ GenericValue callExternalFunction(Function *F,
+ const std::vector<GenericValue> &ArgVals);
+ void exitCalled(GenericValue GV);
+
+ void addAtExitHandler(Function *F) {
+ AtExitHandlers.push_back(F);
+ }
+
+ GenericValue *getFirstVarArg () {
+ return &(ECStack.back ().VarArgs[0]);
+ }
+
+ //FIXME: private:
+public:
+ GenericValue executeGEPOperation(Value *Ptr, gep_type_iterator I,
+ gep_type_iterator E, ExecutionContext &SF);
+
+private: // Helper functions
+ // SwitchToNewBasicBlock - Start execution in a new basic block and run any
+ // PHI nodes in the top of the block. This is used for intraprocedural
+ // control flow.
+ //
+ void SwitchToNewBasicBlock(BasicBlock *Dest, ExecutionContext &SF);
+
+ void *getPointerToFunction(Function *F) { return (void*)F; }
+
+ void initializeExecutionEngine();
+ void initializeExternalFunctions();
+ GenericValue getConstantExprValue(ConstantExpr *CE, ExecutionContext &SF);
+ GenericValue getOperandValue(Value *V, ExecutionContext &SF);
+ GenericValue executeTruncInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeSExtInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeZExtInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeFPTruncInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeFPExtInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeFPToUIInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeFPToSIInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeUIToFPInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeSIToFPInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executePtrToIntInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeIntToPtrInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeBitCastInst(Value *SrcVal, const Type *DstTy,
+ ExecutionContext &SF);
+ GenericValue executeCastOperation(Instruction::CastOps opcode, Value *SrcVal,
+ const Type *Ty, ExecutionContext &SF);
+ void popStackAndReturnValueToCaller(const Type *RetTy, GenericValue Result);
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/ExecutionEngine/Interpreter/Makefile b/lib/ExecutionEngine/Interpreter/Makefile
new file mode 100644
index 0000000..5f937c3
--- /dev/null
+++ b/lib/ExecutionEngine/Interpreter/Makefile
@@ -0,0 +1,12 @@
+##===- lib/ExecutionEngine/Interpreter/Makefile ------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMInterpreter
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt
new file mode 100644
index 0000000..d7980d0
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/CMakeLists.txt
@@ -0,0 +1,11 @@
+# TODO: Support other architectures. See Makefile.
+add_definitions(-DENABLE_X86_JIT)
+
+add_partially_linked_object(LLVMJIT
+ Intercept.cpp
+ JIT.cpp
+ JITDwarfEmitter.cpp
+ JITEmitter.cpp
+ JITMemoryManager.cpp
+ TargetSelect.cpp
+ )
diff --git a/lib/ExecutionEngine/JIT/Intercept.cpp b/lib/ExecutionEngine/JIT/Intercept.cpp
new file mode 100644
index 0000000..3dcc462
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/Intercept.cpp
@@ -0,0 +1,148 @@
+//===-- Intercept.cpp - System function interception routines -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// If a function call occurs to an external function, the JIT is designed to use
+// the dynamic loader interface to find a function to call. This is useful for
+// calling system calls and library functions that are not available in LLVM.
+// Some system calls, however, need to be handled specially. For this reason,
+// we intercept some of them here and use our own stubs to handle them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "JIT.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/System/DynamicLibrary.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+// AtExitHandlers - List of functions to call when the program exits,
+// registered with the atexit() library function.
+static std::vector<void (*)()> AtExitHandlers;
+
+/// runAtExitHandlers - Run any functions registered by the program's
+/// calls to atexit(3), which we intercept and store in
+/// AtExitHandlers.
+///
+static void runAtExitHandlers() {
+ while (!AtExitHandlers.empty()) {
+ void (*Fn)() = AtExitHandlers.back();
+ AtExitHandlers.pop_back();
+ Fn();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Function stubs that are invoked instead of certain library calls
+//===----------------------------------------------------------------------===//
+
+// Force the following functions to be linked in to anything that uses the
+// JIT. This is a hack designed to work around the all-too-clever Glibc
+// strategy of making these functions work differently when inlined vs. when
+// not inlined, and hiding their real definitions in a separate archive file
+// that the dynamic linker can't see. For more info, search for
+// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
+#if defined(__linux__)
+#if defined(HAVE_SYS_STAT_H)
+#include <sys/stat.h>
+#endif
+#include <fcntl.h>
+/* stat functions are redirected to __xstat with a version number. On x86-64,
+ * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
+ * available as an exported symbol, so we have to add it explicitly.
+ */
+class StatSymbols {
+public:
+ StatSymbols() {
+ sys::DynamicLibrary::AddSymbol("stat", (void*)(intptr_t)stat);
+ sys::DynamicLibrary::AddSymbol("fstat", (void*)(intptr_t)fstat);
+ sys::DynamicLibrary::AddSymbol("lstat", (void*)(intptr_t)lstat);
+ sys::DynamicLibrary::AddSymbol("stat64", (void*)(intptr_t)stat64);
+ sys::DynamicLibrary::AddSymbol("\x1stat64", (void*)(intptr_t)stat64);
+ sys::DynamicLibrary::AddSymbol("\x1open64", (void*)(intptr_t)open64);
+ sys::DynamicLibrary::AddSymbol("\x1lseek64", (void*)(intptr_t)lseek64);
+ sys::DynamicLibrary::AddSymbol("fstat64", (void*)(intptr_t)fstat64);
+ sys::DynamicLibrary::AddSymbol("lstat64", (void*)(intptr_t)lstat64);
+ sys::DynamicLibrary::AddSymbol("atexit", (void*)(intptr_t)atexit);
+ sys::DynamicLibrary::AddSymbol("mknod", (void*)(intptr_t)mknod);
+ }
+};
+static StatSymbols initStatSymbols;
+#endif // __linux__
+
+// jit_exit - Used to intercept the "exit" library call.
+static void jit_exit(int Status) {
+ runAtExitHandlers(); // Run atexit handlers...
+ exit(Status);
+}
+
+// jit_atexit - Used to intercept the "atexit" library call.
+static int jit_atexit(void (*Fn)(void)) {
+ AtExitHandlers.push_back(Fn); // Take note of atexit handler...
+ return 0; // Always successful
+}
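+
+// Illustrative flow (not in the original source): JITed code that calls
+// atexit(&F) resolves to jit_atexit, which queues F; a later call to exit(0)
+// resolves to jit_exit, which pops and runs F via runAtExitHandlers before
+// terminating the process.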
+
+//===----------------------------------------------------------------------===//
+//
+/// getPointerToNamedFunction - This method returns the address of the specified
+/// function by using the dynamic loader interface. As such it is only useful
+/// for resolving library symbols, not code generated symbols.
+///
+void *JIT::getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure) {
+ if (!isSymbolSearchingDisabled()) {
+ // Check to see if this is one of the functions we want to intercept. Note,
+ // we cast to intptr_t here to silence a -pedantic warning that complains
+ // about casting a function pointer to a normal pointer.
+ if (Name == "exit") return (void*)(intptr_t)&jit_exit;
+ if (Name == "atexit") return (void*)(intptr_t)&jit_atexit;
+
+ const char *NameStr = Name.c_str();
+ // If this is an asm specifier, skip the sentinel.
+ if (NameStr[0] == 1) ++NameStr;
+
+ // If it's an external function, look it up in the process image...
+ void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
+ if (Ptr) return Ptr;
+
+ // If it wasn't found and if it starts with an underscore ('_') character,
+ // and has an asm specifier, try again without the underscore.
+ if (Name[0] == 1 && NameStr[0] == '_') {
+ Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
+ if (Ptr) return Ptr;
+ }
+
+ // Darwin/PPC adds $LDBLStub suffixes to various symbols like printf. These
+ // are references to hidden visibility symbols that dlsym cannot resolve.
+ // If we have one of these, strip off $LDBLStub and try again.
+#if defined(__APPLE__) && defined(__ppc__)
+ if (Name.size() > 9 && Name[Name.size()-9] == '$' &&
+ memcmp(&Name[Name.size()-8], "LDBLStub", 8) == 0) {
+ // First try turning $LDBLStub into $LDBL128. If that fails, strip it off.
+ // This mirrors logic in libSystemStubs.a.
+ std::string Prefix = std::string(Name.begin(), Name.end()-9);
+ if (void *Ptr = getPointerToNamedFunction(Prefix+"$LDBL128", false))
+ return Ptr;
+ if (void *Ptr = getPointerToNamedFunction(Prefix, false))
+ return Ptr;
+ }
+#endif
+ }
+
+ /// If a LazyFunctionCreator is installed, use it to get/create the function.
+ if (LazyFunctionCreator)
+ if (void *RP = LazyFunctionCreator(Name))
+ return RP;
+
+ if (AbortOnFailure) {
+ cerr << "ERROR: Program used external function '" << Name
+ << "' which could not be resolved!\n";
+ abort();
+ }
+ return 0;
+}
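+
+// Resolution order, illustrated (not in the original source): looking up a
+// Darwin-style name "\1_printf" skips the \1 sentinel, tries the dynamic
+// loader with "_printf", and if that fails retries with the leading
+// underscore stripped, i.e. "printf".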
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
new file mode 100644
index 0000000..f8ae884
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -0,0 +1,708 @@
+//===-- JIT.cpp - LLVM Just in Time Compiler ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This tool implements a just-in-time compiler for LLVM, allowing direct
+// execution of LLVM bitcode in an efficient manner.
+//
+//===----------------------------------------------------------------------===//
+
+#include "JIT.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/CodeGen/MachineCodeInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/System/DynamicLibrary.h"
+#include "llvm/Config/config.h"
+
+using namespace llvm;
+
+#ifdef __APPLE__
+// Apple gcc defaults to -fuse-cxa-atexit (i.e. calls __cxa_atexit instead
+// of atexit). It passes the address of linker generated symbol __dso_handle
+// to the function.
+// This configuration change happened at version 5330.
+# include <AvailabilityMacros.h>
+# if defined(MAC_OS_X_VERSION_10_4) && \
+ ((MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_4) || \
+ (MAC_OS_X_VERSION_MIN_REQUIRED == MAC_OS_X_VERSION_10_4 && \
+ __APPLE_CC__ >= 5330))
+# ifndef HAVE___DSO_HANDLE
+# define HAVE___DSO_HANDLE 1
+# endif
+# endif
+#endif
+
+#if HAVE___DSO_HANDLE
+extern void *__dso_handle __attribute__ ((__visibility__ ("hidden")));
+#endif
+
+namespace {
+
+static struct RegisterJIT {
+ RegisterJIT() { JIT::Register(); }
+} JITRegistrator;
+
+}
+
+namespace llvm {
+ void LinkInJIT() {
+ }
+}
+
+
+#if defined(__GNUC__) && !defined(__ARM_EABI__)
+
+// libgcc defines the __register_frame function to dynamically register new
+// dwarf frames for exception handling. This functionality is not portable
+// across compilers and is only provided by GCC. We use the __register_frame
+// function here so that code generated by the JIT cooperates with the unwinding
+// runtime of libgcc. When JITting with exception handling enabled, LLVM
+// generates dwarf frames and registers them with libgcc via __register_frame.
+//
+// The __register_frame function works on Linux.
+//
+// Unfortunately, this functionality seems to have been added to libgcc after
+// darwin's unwinding library was written. The darwin code overwrites the
+// value updated by __register_frame with a value fetched with "keymgr".
+// "keymgr" is obsolete functionality that should be rewritten some day.
+// In the meantime, since "keymgr" is in all libgccs shipped with apple-gcc, we
+// need a workaround in LLVM that uses "keymgr" to dynamically modify the
+// values of an opaque key used by libgcc to find dwarf tables.
+
+extern "C" void __register_frame(void*);
+
+#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED <= 1050
+# define USE_KEYMGR 1
+#else
+# define USE_KEYMGR 0
+#endif
+
+#if USE_KEYMGR
+
+namespace {
+
+// LibgccObject - This is the structure defined in libgcc. There is no #include
+// provided for this structure, so we also define it here. libgcc calls it
+// "struct object". The structure is undocumented in libgcc.
+struct LibgccObject {
+ void *unused1;
+ void *unused2;
+ void *unused3;
+
+ /// frame - Pointer to the exception table.
+ void *frame;
+
+ /// encoding - The encoding of the object?
+ union {
+ struct {
+ unsigned long sorted : 1;
+ unsigned long from_array : 1;
+ unsigned long mixed_encoding : 1;
+ unsigned long encoding : 8;
+ unsigned long count : 21;
+ } b;
+ size_t i;
+ } encoding;
+
+ /// fde_end - libgcc defines this field only if some macro is defined. We
+ /// include it even though it may not be there, to make libgcc happy.
+ char *fde_end;
+
+ /// next - At least we know it's a chained list!
+ struct LibgccObject *next;
+};
+
+// "kemgr" stuff. Apparently, all frame tables are stored there.
+extern "C" void _keymgr_set_and_unlock_processwide_ptr(int, void *);
+extern "C" void *_keymgr_get_and_lock_processwide_ptr(int);
+#define KEYMGR_GCC3_DW2_OBJ_LIST 302 /* Dwarf2 object list */
+
+/// LibgccObjectInfo - libgcc defines this struct as km_object_info. It
+/// probably contains all dwarf tables that are loaded.
+struct LibgccObjectInfo {
+
+ /// seenObjects - LibgccObjects already parsed by the unwinding runtime.
+ ///
+ struct LibgccObject* seenObjects;
+
+ /// unseenObjects - LibgccObjects not parsed yet by the unwinding runtime.
+ ///
+ struct LibgccObject* unseenObjects;
+
+ unsigned unused[2];
+};
+
+/// DarwinRegisterFrame - Since __register_frame does not work with darwin's
+/// libgcc, we provide our own function, which "tricks" libgcc by modifying
+/// the "Dwarf2 object list" key.
+void DarwinRegisterFrame(void* FrameBegin) {
+ // Get the key.
+ LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
+ _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
+ assert(LOI && "This should be preallocated by the runtime");
+
+ // Allocate a new LibgccObject to represent this frame. Deallocation of this
+ // object may be impossible: since darwin code in libgcc was written after
+ // the ability to dynamically register frames, things may crash if we
+ // deallocate it.
+ struct LibgccObject* ob = (struct LibgccObject*)
+ malloc(sizeof(struct LibgccObject));
+
+ // Follow libgcc's conventions for the field values.
+ ob->unused1 = (void *)-1;
+ ob->unused2 = 0;
+ ob->unused3 = 0;
+ ob->frame = FrameBegin;
+ ob->encoding.i = 0;
+ ob->encoding.b.encoding = llvm::dwarf::DW_EH_PE_omit;
+
+ // Put the info in both places, as libgcc uses either the first or the second
+ // field. Note that we rely on having two pointers here. If fde_end were a
+ // char, things would get complicated.
+ ob->fde_end = (char*)LOI->unseenObjects;
+ ob->next = LOI->unseenObjects;
+
+ // Update the key's unseenObjects list.
+ LOI->unseenObjects = ob;
+
+ // Finally update the "key". Apparently, libgcc requires it.
+ _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST,
+ LOI);
+
+}
+
+}
+#endif // USE_KEYMGR
+#endif // __GNUC__
+
+/// createJIT - This is the factory method for creating a JIT for the current
+/// machine; it does not fall back to the interpreter. It takes ownership
+/// of the ModuleProvider.
+ExecutionEngine *ExecutionEngine::createJIT(ModuleProvider *MP,
+ std::string *ErrorStr,
+ JITMemoryManager *JMM,
+ CodeGenOpt::Level OptLevel) {
+ ExecutionEngine *EE = JIT::createJIT(MP, ErrorStr, JMM, OptLevel);
+ if (!EE) return 0;
+
+ // Make sure we can resolve symbols in the program as well. The zero arg
+ // to the function tells DynamicLibrary to load the program, not a library.
+ sys::DynamicLibrary::LoadLibraryPermanently(0, ErrorStr);
+ return EE;
+}
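+
+// A minimal usage sketch (illustrative; `MP` is assumed to be a
+// ModuleProvider for a module the current target can JIT):
+//
+//   std::string Err;
+//   ExecutionEngine *EE =
+//     ExecutionEngine::createJIT(MP, &Err, 0, CodeGenOpt::Default);
+//   if (!EE) { /* no JIT available for this target; Err has details */ }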
+
+JIT::JIT(ModuleProvider *MP, TargetMachine &tm, TargetJITInfo &tji,
+ JITMemoryManager *JMM, CodeGenOpt::Level OptLevel)
+ : ExecutionEngine(MP), TM(tm), TJI(tji) {
+ setTargetData(TM.getTargetData());
+
+ jitstate = new JITState(MP);
+
+ // Initialize JCE
+ JCE = createEmitter(*this, JMM);
+
+ // Add target data
+ MutexGuard locked(lock);
+ FunctionPassManager &PM = jitstate->getPM(locked);
+ PM.add(new TargetData(*TM.getTargetData()));
+
+ // Turn the machine code intermediate representation into bytes in memory that
+ // may be executed.
+ if (TM.addPassesToEmitMachineCode(PM, *JCE, OptLevel)) {
+ cerr << "Target does not support machine code emission!\n";
+ abort();
+ }
+
+ // Register routine for informing unwinding runtime about new EH frames
+#if defined(__GNUC__) && !defined(__ARM_EABI__)
+#if USE_KEYMGR
+ struct LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
+ _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
+
+ // The key is created on demand, and libgcc creates it the first time an
+ // exception occurs. Since we need the key to register frames, we create
+ // it now.
+ if (!LOI)
+ LOI = (LibgccObjectInfo*)calloc(sizeof(struct LibgccObjectInfo), 1);
+ _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, LOI);
+ InstallExceptionTableRegister(DarwinRegisterFrame);
+#else
+ InstallExceptionTableRegister(__register_frame);
+#endif // USE_KEYMGR
+#endif // __GNUC__
+
+ // Initialize passes.
+ PM.doInitialization();
+}
+
+JIT::~JIT() {
+ delete jitstate;
+ delete JCE;
+ delete &TM;
+}
+
+/// addModuleProvider - Add a new ModuleProvider to the JIT. If we previously
+/// removed the last ModuleProvider, we need to re-initialize jitstate with a
+/// valid ModuleProvider.
+void JIT::addModuleProvider(ModuleProvider *MP) {
+ MutexGuard locked(lock);
+
+ if (Modules.empty()) {
+ assert(!jitstate && "jitstate should be NULL if Modules vector is empty!");
+
+ jitstate = new JITState(MP);
+
+ FunctionPassManager &PM = jitstate->getPM(locked);
+ PM.add(new TargetData(*TM.getTargetData()));
+
+ // Turn the machine code intermediate representation into bytes in memory
+ // that may be executed.
+ if (TM.addPassesToEmitMachineCode(PM, *JCE, CodeGenOpt::Default)) {
+ cerr << "Target does not support machine code emission!\n";
+ abort();
+ }
+
+ // Initialize passes.
+ PM.doInitialization();
+ }
+
+ ExecutionEngine::addModuleProvider(MP);
+}
+
+/// removeModuleProvider - If we are removing the last ModuleProvider,
+/// invalidate the jitstate since the PassManager it contains references a
+/// released ModuleProvider.
+Module *JIT::removeModuleProvider(ModuleProvider *MP, std::string *E) {
+ Module *result = ExecutionEngine::removeModuleProvider(MP, E);
+
+ MutexGuard locked(lock);
+
+ if (jitstate->getMP() == MP) {
+ delete jitstate;
+ jitstate = 0;
+ }
+
+ if (!jitstate && !Modules.empty()) {
+ jitstate = new JITState(Modules[0]);
+
+ FunctionPassManager &PM = jitstate->getPM(locked);
+ PM.add(new TargetData(*TM.getTargetData()));
+
+ // Turn the machine code intermediate representation into bytes in memory
+ // that may be executed.
+ if (TM.addPassesToEmitMachineCode(PM, *JCE, CodeGenOpt::Default)) {
+ cerr << "Target does not support machine code emission!\n";
+ abort();
+ }
+
+ // Initialize passes.
+ PM.doInitialization();
+ }
+ return result;
+}
+
+/// deleteModuleProvider - Remove a ModuleProvider from the list of modules,
+/// and delete the ModuleProvider and its owned Module. Avoids materializing
+/// the underlying module.
+void JIT::deleteModuleProvider(ModuleProvider *MP, std::string *E) {
+ ExecutionEngine::deleteModuleProvider(MP, E);
+
+ MutexGuard locked(lock);
+
+ if (jitstate->getMP() == MP) {
+ delete jitstate;
+ jitstate = 0;
+ }
+
+ if (!jitstate && !Modules.empty()) {
+ jitstate = new JITState(Modules[0]);
+
+ FunctionPassManager &PM = jitstate->getPM(locked);
+ PM.add(new TargetData(*TM.getTargetData()));
+
+ // Turn the machine code intermediate representation into bytes in memory
+ // that may be executed.
+ if (TM.addPassesToEmitMachineCode(PM, *JCE, CodeGenOpt::Default)) {
+ cerr << "Target does not support machine code emission!\n";
+ abort();
+ }
+
+ // Initialize passes.
+ PM.doInitialization();
+ }
+}
+
+/// run - Start execution with the specified function and arguments.
+///
+GenericValue JIT::runFunction(Function *F,
+ const std::vector<GenericValue> &ArgValues) {
+ assert(F && "Function *F was null at entry to run()");
+
+ void *FPtr = getPointerToFunction(F);
+ assert(FPtr && "Pointer to fn's code was null after getPointerToFunction");
+ const FunctionType *FTy = F->getFunctionType();
+ const Type *RetTy = FTy->getReturnType();
+
+ assert((FTy->getNumParams() == ArgValues.size() ||
+ (FTy->isVarArg() && FTy->getNumParams() <= ArgValues.size())) &&
+ "Wrong number of arguments passed into function!");
+ assert(FTy->getNumParams() == ArgValues.size() &&
+ "This doesn't support passing arguments through varargs (yet)!");
+
+ // Handle some common cases first. These cases correspond to common `main'
+ // prototypes.
+ if (RetTy == Type::Int32Ty || RetTy == Type::VoidTy) {
+ switch (ArgValues.size()) {
+ case 3:
+ if (FTy->getParamType(0) == Type::Int32Ty &&
+ isa<PointerType>(FTy->getParamType(1)) &&
+ isa<PointerType>(FTy->getParamType(2))) {
+ int (*PF)(int, char **, const char **) =
+ (int(*)(int, char **, const char **))(intptr_t)FPtr;
+
+ // Call the function.
+ GenericValue rv;
+ rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
+ (char **)GVTOP(ArgValues[1]),
+ (const char **)GVTOP(ArgValues[2])));
+ return rv;
+ }
+ break;
+ case 2:
+ if (FTy->getParamType(0) == Type::Int32Ty &&
+ isa<PointerType>(FTy->getParamType(1))) {
+ int (*PF)(int, char **) = (int(*)(int, char **))(intptr_t)FPtr;
+
+ // Call the function.
+ GenericValue rv;
+ rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
+ (char **)GVTOP(ArgValues[1])));
+ return rv;
+ }
+ break;
+ case 1:
+ if (FTy->getNumParams() == 1 &&
+ FTy->getParamType(0) == Type::Int32Ty) {
+ GenericValue rv;
+ int (*PF)(int) = (int(*)(int))(intptr_t)FPtr;
+ rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue()));
+ return rv;
+ }
+ break;
+ }
+ }
+
+ // Handle cases where no arguments are passed first.
+ if (ArgValues.empty()) {
+ GenericValue rv;
+ switch (RetTy->getTypeID()) {
+ default: assert(0 && "Unknown return type for function call!");
+ case Type::IntegerTyID: {
+ unsigned BitWidth = cast<IntegerType>(RetTy)->getBitWidth();
+ if (BitWidth == 1)
+ rv.IntVal = APInt(BitWidth, ((bool(*)())(intptr_t)FPtr)());
+ else if (BitWidth <= 8)
+ rv.IntVal = APInt(BitWidth, ((char(*)())(intptr_t)FPtr)());
+ else if (BitWidth <= 16)
+ rv.IntVal = APInt(BitWidth, ((short(*)())(intptr_t)FPtr)());
+ else if (BitWidth <= 32)
+ rv.IntVal = APInt(BitWidth, ((int(*)())(intptr_t)FPtr)());
+ else if (BitWidth <= 64)
+ rv.IntVal = APInt(BitWidth, ((int64_t(*)())(intptr_t)FPtr)());
+ else
+ assert(0 && "Integer types > 64 bits not supported");
+ return rv;
+ }
+ case Type::VoidTyID:
+ rv.IntVal = APInt(32, ((int(*)())(intptr_t)FPtr)());
+ return rv;
+ case Type::FloatTyID:
+ rv.FloatVal = ((float(*)())(intptr_t)FPtr)();
+ return rv;
+ case Type::DoubleTyID:
+ rv.DoubleVal = ((double(*)())(intptr_t)FPtr)();
+ return rv;
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ assert(0 && "long double not supported yet");
+ return rv;
+ case Type::PointerTyID:
+ return PTOGV(((void*(*)())(intptr_t)FPtr)());
+ }
+ }
+
+ // Okay, this is not one of our quick and easy cases. Because we don't have a
+ // full FFI, we have to codegen a nullary stub function that just calls the
+ // function we are interested in, passing in constants for all of the
+ // arguments. Make this function and return.
+
+ // First, create the function.
+ FunctionType *STy=FunctionType::get(RetTy, std::vector<const Type*>(), false);
+ Function *Stub = Function::Create(STy, Function::InternalLinkage, "",
+ F->getParent());
+
+ // Insert a basic block.
+ BasicBlock *StubBB = BasicBlock::Create("", Stub);
+
+ // Convert all of the GenericValue arguments over to constants. Note that we
+ // currently don't support varargs.
+ SmallVector<Value*, 8> Args;
+ for (unsigned i = 0, e = ArgValues.size(); i != e; ++i) {
+ Constant *C = 0;
+ const Type *ArgTy = FTy->getParamType(i);
+ const GenericValue &AV = ArgValues[i];
+ switch (ArgTy->getTypeID()) {
+ default: assert(0 && "Unknown argument type for function call!");
+ case Type::IntegerTyID:
+ C = ConstantInt::get(AV.IntVal);
+ break;
+ case Type::FloatTyID:
+ C = ConstantFP::get(APFloat(AV.FloatVal));
+ break;
+ case Type::DoubleTyID:
+ C = ConstantFP::get(APFloat(AV.DoubleVal));
+ break;
+ case Type::PPC_FP128TyID:
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ C = ConstantFP::get(APFloat(AV.IntVal));
+ break;
+ case Type::PointerTyID:
+ void *ArgPtr = GVTOP(AV);
+ if (sizeof(void*) == 4)
+ C = ConstantInt::get(Type::Int32Ty, (int)(intptr_t)ArgPtr);
+ else
+ C = ConstantInt::get(Type::Int64Ty, (intptr_t)ArgPtr);
+ C = ConstantExpr::getIntToPtr(C, ArgTy); // Cast the integer to pointer
+ break;
+ }
+ Args.push_back(C);
+ }
+
+ CallInst *TheCall = CallInst::Create(F, Args.begin(), Args.end(),
+ "", StubBB);
+ TheCall->setCallingConv(F->getCallingConv());
+ TheCall->setTailCall();
+ if (TheCall->getType() != Type::VoidTy)
+ ReturnInst::Create(TheCall, StubBB); // Return result of the call.
+ else
+ ReturnInst::Create(StubBB); // Just return void.
+
+ // Finally, return the value returned by our nullary stub function.
+ return runFunction(Stub, std::vector<GenericValue>());
+}
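+
+// Illustrative sketch (not in the original source) of the stub generated
+// above for a call like runFunction(F, {3.0f}) where F has type
+// `float (float)`:
+//
+//   define internal float @""() {
+//     %r = tail call float @F(float 3.000000e+00)
+//     ret float %r
+//   }
+//
+// Running the nullary stub then reduces to the no-argument cases handled
+// above.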
+
+/// runJITOnFunction - Run the FunctionPassManager full of
+/// just-in-time compilation passes on F, hopefully filling in
+/// GlobalAddress[F] with the address of F's machine code.
+///
+void JIT::runJITOnFunction(Function *F, MachineCodeInfo *MCI) {
+ MutexGuard locked(lock);
+
+ registerMachineCodeInfo(MCI);
+
+ runJITOnFunctionUnlocked(F, locked);
+
+ registerMachineCodeInfo(0);
+}
+
+void JIT::runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked) {
+ static bool isAlreadyCodeGenerating = false;
+ assert(!isAlreadyCodeGenerating && "Error: Recursive compilation detected!");
+
+ // JIT the function
+ isAlreadyCodeGenerating = true;
+ jitstate->getPM(locked).run(*F);
+ isAlreadyCodeGenerating = false;
+
+ // If the function referred to another function that had not yet been
+ // read from bitcode, but we are jitting non-lazily, emit it now.
+ while (!jitstate->getPendingFunctions(locked).empty()) {
+ Function *PF = jitstate->getPendingFunctions(locked).back();
+ jitstate->getPendingFunctions(locked).pop_back();
+
+ // JIT the function
+ isAlreadyCodeGenerating = true;
+ jitstate->getPM(locked).run(*PF);
+ isAlreadyCodeGenerating = false;
+
+ // Now that the function has been jitted, ask the JITEmitter to rewrite
+ // the stub with real address of the function.
+ updateFunctionStub(PF);
+ }
+
+ // If the JIT is configured to emit info so that dlsym can be used to
+ // rewrite stubs to external globals, do so now.
+ if (areDlsymStubsEnabled() && isLazyCompilationDisabled())
+ updateDlsymStubTable();
+}
+
+/// getPointerToFunction - This method is used to get the address of the
+/// specified function, compiling it if necessary.
+///
+void *JIT::getPointerToFunction(Function *F) {
+
+ if (void *Addr = getPointerToGlobalIfAvailable(F))
+ return Addr; // Check if function already code gen'd
+
+ MutexGuard locked(lock);
+
+ // Make sure we read in the function if it exists in this Module.
+ if (F->hasNotBeenReadFromBitcode()) {
+ // Determine the module provider this function is provided by.
+ Module *M = F->getParent();
+ ModuleProvider *MP = 0;
+ for (unsigned i = 0, e = Modules.size(); i != e; ++i) {
+ if (Modules[i]->getModule() == M) {
+ MP = Modules[i];
+ break;
+ }
+ }
+ assert(MP && "Function isn't in a module we know about!");
+
+ std::string ErrorMsg;
+ if (MP->materializeFunction(F, &ErrorMsg)) {
+ cerr << "Error reading function '" << F->getName()
+ << "' from bitcode file: " << ErrorMsg << "\n";
+ abort();
+ }
+
+ // Now retry to get the address.
+ if (void *Addr = getPointerToGlobalIfAvailable(F))
+ return Addr;
+ }
+
+ if (F->isDeclaration()) {
+ bool AbortOnFailure =
+ !areDlsymStubsEnabled() && !F->hasExternalWeakLinkage();
+ void *Addr = getPointerToNamedFunction(F->getName(), AbortOnFailure);
+ addGlobalMapping(F, Addr);
+ return Addr;
+ }
+
+ runJITOnFunctionUnlocked(F, locked);
+
+ void *Addr = getPointerToGlobalIfAvailable(F);
+ assert(Addr && "Code generation didn't add function to GlobalAddress table!");
+ return Addr;
+}
+
+/// getOrEmitGlobalVariable - Return the address of the specified global
+/// variable, possibly emitting it to memory if needed. This is used by the
+/// Emitter.
+void *JIT::getOrEmitGlobalVariable(const GlobalVariable *GV) {
+ MutexGuard locked(lock);
+
+ void *Ptr = getPointerToGlobalIfAvailable(GV);
+ if (Ptr) return Ptr;
+
+ // If the global is external, just remember the address.
+ if (GV->isDeclaration()) {
+#if HAVE___DSO_HANDLE
+ if (GV->getName() == "__dso_handle")
+ return (void*)&__dso_handle;
+#endif
+ Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(GV->getName().c_str());
+ if (Ptr == 0 && !areDlsymStubsEnabled()) {
+ cerr << "Could not resolve external global address: "
+ << GV->getName() << "\n";
+ abort();
+ }
+ addGlobalMapping(GV, Ptr);
+ } else {
+ // GlobalVariables that are not "constant" will cause trouble in a server
+ // situation, since they are returned in the same block of memory as code,
+ // which may not be writable.
+ if (isGVCompilationDisabled() && !GV->isConstant()) {
+ cerr << "Compilation of non-internal GlobalValue is disabled!\n";
+ abort();
+ }
+ // If the global hasn't been emitted to memory yet, allocate space and
+ // emit it into memory. It goes in the same array as the generated
+ // code, jump tables, etc.
+ const Type *GlobalType = GV->getType()->getElementType();
+ size_t S = getTargetData()->getTypeAllocSize(GlobalType);
+ size_t A = getTargetData()->getPreferredAlignment(GV);
+ if (GV->isThreadLocal()) {
+ MutexGuard locked(lock);
+ Ptr = TJI.allocateThreadLocalMemory(S);
+ } else if (TJI.allocateSeparateGVMemory()) {
+ if (A <= 8) {
+ Ptr = malloc(S);
+ } else {
+ // Allocate S+A bytes of memory, then use an aligned pointer within that
+ // space.
+ Ptr = malloc(S+A);
+ unsigned MisAligned = ((intptr_t)Ptr & (A-1));
+ Ptr = (char*)Ptr + (MisAligned ? (A-MisAligned) : 0);
+ }
+ } else {
+ Ptr = JCE->allocateSpace(S, A);
+ }
+ addGlobalMapping(GV, Ptr);
+ EmitGlobalVariable(GV);
+ }
+ return Ptr;
+}
+
+/// recompileAndRelinkFunction - This method is used to force a function
+/// which has already been compiled, to be compiled again, possibly
+/// after it has been modified. Then the entry to the old copy is overwritten
+/// with a branch to the new copy. If there was no old copy, this acts
+/// just like JIT::getPointerToFunction().
+///
+void *JIT::recompileAndRelinkFunction(Function *F) {
+ void *OldAddr = getPointerToGlobalIfAvailable(F);
+
+ // If it's not already compiled there is no reason to patch it up.
+ if (OldAddr == 0) { return getPointerToFunction(F); }
+
+ // Delete the old function mapping.
+ addGlobalMapping(F, 0);
+
+ // Recodegen the function
+ runJITOnFunction(F);
+
+ // Update state, forward the old function to the new function.
+ void *Addr = getPointerToGlobalIfAvailable(F);
+ assert(Addr && "Code generation didn't add function to GlobalAddress table!");
+ TJI.replaceMachineCodeForFunction(OldAddr, Addr);
+ return Addr;
+}
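+
+// Usage sketch (illustrative; `TheJIT` is a hypothetical JIT* used only for
+// this example): after mutating an already compiled function F,
+//
+//   void *NewAddr = TheJIT->recompileAndRelinkFunction(F);
+//   // calls through the old entry point now branch to NewAddr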
+
+/// getMemoryForGV - This method abstracts memory allocation of global
+/// variable so that the JIT can allocate thread local variables depending
+/// on the target.
+///
+char* JIT::getMemoryForGV(const GlobalVariable* GV) {
+ const Type *ElTy = GV->getType()->getElementType();
+ size_t GVSize = (size_t)getTargetData()->getTypeAllocSize(ElTy);
+ if (GV->isThreadLocal()) {
+ MutexGuard locked(lock);
+ return TJI.allocateThreadLocalMemory(GVSize);
+ } else {
+ return new char[GVSize];
+ }
+}
+
+void JIT::addPendingFunction(Function *F) {
+ MutexGuard locked(lock);
+ jitstate->getPendingFunctions(locked).push_back(F);
+}
diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h
new file mode 100644
index 0000000..3ccb2dd
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/JIT.h
@@ -0,0 +1,176 @@
+//===-- JIT.h - Class definition for the JIT --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the top-level JIT data structure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef JIT_H
+#define JIT_H
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/PassManager.h"
+
+namespace llvm {
+
+class Function;
+class TargetMachine;
+class TargetJITInfo;
+class MachineCodeEmitter;
+class MachineCodeInfo;
+
+class JITState {
+private:
+ FunctionPassManager PM; // Passes to compile a function
+ ModuleProvider *MP; // ModuleProvider used to create the PM
+
+ /// PendingFunctions - Functions which have not been code generated yet, but
+ /// were called from a function being code generated.
+ std::vector<Function*> PendingFunctions;
+
+public:
+ explicit JITState(ModuleProvider *MP) : PM(MP), MP(MP) {}
+
+ FunctionPassManager &getPM(const MutexGuard &L) {
+ return PM;
+ }
+
+ ModuleProvider *getMP() const { return MP; }
+ std::vector<Function*> &getPendingFunctions(const MutexGuard &L) {
+ return PendingFunctions;
+ }
+};
+
+
+class JIT : public ExecutionEngine {
+ TargetMachine &TM; // The current target we are compiling to
+ TargetJITInfo &TJI; // The JITInfo for the target we are compiling to
+ JITCodeEmitter *JCE; // JCE object
+
+ JITState *jitstate;
+
+ JIT(ModuleProvider *MP, TargetMachine &tm, TargetJITInfo &tji,
+ JITMemoryManager *JMM, CodeGenOpt::Level OptLevel);
+public:
+ ~JIT();
+
+ static void Register() {
+ JITCtor = create;
+ }
+
+ /// getJITInfo - Return the target JIT information structure.
+ ///
+ TargetJITInfo &getJITInfo() const { return TJI; }
+
+ /// create - Create and return a new JIT compiler if there is one available
+ /// for the current target. Otherwise, return null.
+ ///
+ static ExecutionEngine *create(ModuleProvider *MP, std::string *Err,
+ CodeGenOpt::Level OptLevel =
+ CodeGenOpt::Default) {
+ return createJIT(MP, Err, 0, OptLevel);
+ }
+
+ virtual void addModuleProvider(ModuleProvider *MP);
+
+ /// removeModuleProvider - Remove a ModuleProvider from the list of modules.
+ /// Releases the Module from the ModuleProvider, materializing it in the
+ /// process, and returns the materialized Module.
+ virtual Module *removeModuleProvider(ModuleProvider *MP,
+ std::string *ErrInfo = 0);
+
+ /// deleteModuleProvider - Remove a ModuleProvider from the list of modules,
+ /// and delete the ModuleProvider and its owned Module. Avoids materializing
+ /// the underlying module.
+ virtual void deleteModuleProvider(ModuleProvider *P,std::string *ErrInfo = 0);
+
+ /// runFunction - Start execution with the specified function and arguments.
+ ///
+ virtual GenericValue runFunction(Function *F,
+ const std::vector<GenericValue> &ArgValues);
+
+ /// getPointerToNamedFunction - This method returns the address of the
+ /// specified function by using the dlsym function call. As such it is only
+ /// useful for resolving library symbols, not code generated symbols.
+ ///
+ /// If AbortOnFailure is false and no function with the given name is
+ /// found, this function silently returns a null pointer. Otherwise,
+ /// it prints a message to stderr and aborts.
+ ///
+ void *getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure = true);
+
+ // CompilationCallback - Invoked the first time that a call site is found,
+ // which causes lazy compilation of the target function.
+ //
+ static void CompilationCallback();
+
+ /// getPointerToFunction - This returns the address of the specified function,
+ /// compiling it if necessary.
+ ///
+ void *getPointerToFunction(Function *F);
+
+ /// getOrEmitGlobalVariable - Return the address of the specified global
+ /// variable, possibly emitting it to memory if needed. This is used by the
+ /// Emitter.
+ void *getOrEmitGlobalVariable(const GlobalVariable *GV);
+
+ /// getPointerToFunctionOrStub - If the specified function has been
+ /// code-gen'd, return a pointer to the function. If not, compile it, or use
+ /// a stub to implement lazy compilation if available.
+ ///
+ void *getPointerToFunctionOrStub(Function *F);
+
+ /// recompileAndRelinkFunction - This method is used to force a function
+ /// which has already been compiled, to be compiled again, possibly
+ /// after it has been modified. Then the entry to the old copy is overwritten
+ /// with a branch to the new copy. If there was no old copy, this acts
+ /// just like JIT::getPointerToFunction().
+ ///
+ void *recompileAndRelinkFunction(Function *F);
+
+ /// freeMachineCodeForFunction - deallocate memory used to code-generate this
+ /// Function.
+ ///
+ void freeMachineCodeForFunction(Function *F);
+
+ /// addPendingFunction - while jitting non-lazily, a called but non-codegen'd
+ /// function was encountered. Add it to a pending list to be processed after
+ /// the current function.
+ ///
+ void addPendingFunction(Function *F);
+
+ /// getCodeEmitter - Return the code emitter this JIT is emitting into.
+ JITCodeEmitter *getCodeEmitter() const { return JCE; }
+
+ static ExecutionEngine *createJIT(ModuleProvider *MP, std::string *Err,
+ JITMemoryManager *JMM,
+ CodeGenOpt::Level OptLevel);
+
+
+ // Run the JIT on F and return information about the generated code
+ void runJITOnFunction(Function *F, MachineCodeInfo *MCI = 0);
+
+private:
+ static JITCodeEmitter *createEmitter(JIT &J, JITMemoryManager *JMM);
+ void registerMachineCodeInfo(MachineCodeInfo *MCI);
+ void runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked);
+ void updateFunctionStub(Function *F);
+ void updateDlsymStubTable();
+
+protected:
+
+ /// getMemoryForGV - Allocate memory for a global variable.
+ virtual char* getMemoryForGV(const GlobalVariable* GV);
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
new file mode 100644
index 0000000..e101ef3
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
@@ -0,0 +1,1056 @@
+//===----- JITDwarfEmitter.cpp - Write dwarf tables into memory -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a JITDwarfEmitter object that is used by the JIT to
+// write dwarf tables to memory.
+//
+//===----------------------------------------------------------------------===//
+
+#include "JIT.h"
+#include "JITDwarfEmitter.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+JITDwarfEmitter::JITDwarfEmitter(JIT& theJit) : Jit(theJit) {}
+
+
+unsigned char* JITDwarfEmitter::EmitDwarfTable(MachineFunction& F,
+ JITCodeEmitter& jce,
+ unsigned char* StartFunction,
+ unsigned char* EndFunction) {
+ const TargetMachine& TM = F.getTarget();
+ TD = TM.getTargetData();
+ needsIndirectEncoding = TM.getTargetAsmInfo()->getNeedsIndirectEncoding();
+ stackGrowthDirection = TM.getFrameInfo()->getStackGrowthDirection();
+ RI = TM.getRegisterInfo();
+ JCE = &jce;
+
+ unsigned char* ExceptionTable = EmitExceptionTable(&F, StartFunction,
+ EndFunction);
+
+ unsigned char* Result = 0;
+ unsigned char* EHFramePtr = 0;
+
+ const std::vector<Function *> Personalities = MMI->getPersonalities();
+ EHFramePtr = EmitCommonEHFrame(Personalities[MMI->getPersonalityIndex()]);
+
+ Result = EmitEHFrame(Personalities[MMI->getPersonalityIndex()], EHFramePtr,
+ StartFunction, EndFunction, ExceptionTable);
+
+ return Result;
+}
+
+
+void
+JITDwarfEmitter::EmitFrameMoves(intptr_t BaseLabelPtr,
+ const std::vector<MachineMove> &Moves) const {
+ unsigned PointerSize = TD->getPointerSize();
+ int stackGrowth = stackGrowthDirection == TargetFrameInfo::StackGrowsUp ?
+ PointerSize : -PointerSize;
+ bool IsLocal = false;
+ unsigned BaseLabelID = 0;
+
+ for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
+ const MachineMove &Move = Moves[i];
+ unsigned LabelID = Move.getLabelID();
+
+ if (LabelID) {
+ LabelID = MMI->MappedLabel(LabelID);
+
+ // Throw out move if the label is invalid.
+ if (!LabelID) continue;
+ }
+
+ intptr_t LabelPtr = 0;
+ if (LabelID) LabelPtr = JCE->getLabelAddress(LabelID);
+
+ const MachineLocation &Dst = Move.getDestination();
+ const MachineLocation &Src = Move.getSource();
+
+ // Advance row if new location.
+ if (BaseLabelPtr && LabelID && (BaseLabelID != LabelID || !IsLocal)) {
+ JCE->emitByte(dwarf::DW_CFA_advance_loc4);
+ JCE->emitInt32(LabelPtr - BaseLabelPtr);
+
+ BaseLabelID = LabelID;
+ BaseLabelPtr = LabelPtr;
+ IsLocal = true;
+ }
+
+ // If advancing cfa.
+ if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
+ if (!Src.isReg()) {
+ if (Src.getReg() == MachineLocation::VirtualFP) {
+ JCE->emitByte(dwarf::DW_CFA_def_cfa_offset);
+ } else {
+ JCE->emitByte(dwarf::DW_CFA_def_cfa);
+ JCE->emitULEB128Bytes(RI->getDwarfRegNum(Src.getReg(), true));
+ }
+
+ int Offset = -Src.getOffset();
+
+ JCE->emitULEB128Bytes(Offset);
+ } else {
+ assert(0 && "Machine move no supported yet.");
+ }
+ } else if (Src.isReg() &&
+ Src.getReg() == MachineLocation::VirtualFP) {
+ if (Dst.isReg()) {
+ JCE->emitByte(dwarf::DW_CFA_def_cfa_register);
+ JCE->emitULEB128Bytes(RI->getDwarfRegNum(Dst.getReg(), true));
+ } else {
+ assert(0 && "Machine move no supported yet.");
+ }
+ } else {
+ unsigned Reg = RI->getDwarfRegNum(Src.getReg(), true);
+ int Offset = Dst.getOffset() / stackGrowth;
+
+ if (Offset < 0) {
+ JCE->emitByte(dwarf::DW_CFA_offset_extended_sf);
+ JCE->emitULEB128Bytes(Reg);
+ JCE->emitSLEB128Bytes(Offset);
+ } else if (Reg < 64) {
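+        // DW_CFA_offset packs the register number into the low 6 bits of the
+        // opcode byte itself, so this compact form only fits registers 0-63.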
+ JCE->emitByte(dwarf::DW_CFA_offset + Reg);
+ JCE->emitULEB128Bytes(Offset);
+ } else {
+ JCE->emitByte(dwarf::DW_CFA_offset_extended);
+ JCE->emitULEB128Bytes(Reg);
+ JCE->emitULEB128Bytes(Offset);
+ }
+ }
+ }
+}
+
+/// SharedTypeIds - How many leading type ids two landing pads have in common.
+static unsigned SharedTypeIds(const LandingPadInfo *L,
+ const LandingPadInfo *R) {
+ const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+ unsigned LSize = LIds.size(), RSize = RIds.size();
+ unsigned MinSize = LSize < RSize ? LSize : RSize;
+ unsigned Count = 0;
+
+ for (; Count != MinSize; ++Count)
+ if (LIds[Count] != RIds[Count])
+ return Count;
+
+ return Count;
+}
+
+
+/// PadLT - Order landing pads lexicographically by type id.
+static bool PadLT(const LandingPadInfo *L, const LandingPadInfo *R) {
+ const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+ unsigned LSize = LIds.size(), RSize = RIds.size();
+ unsigned MinSize = LSize < RSize ? LSize : RSize;
+
+ for (unsigned i = 0; i != MinSize; ++i)
+ if (LIds[i] != RIds[i])
+ return LIds[i] < RIds[i];
+
+ return LSize < RSize;
+}
+
+namespace {
+
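+/// KeyInfo - DenseMap traits for unsigned keys; -1U and -2U are reserved as
+/// the empty and tombstone sentinels and must never be used as real label IDs.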
+struct KeyInfo {
+ static inline unsigned getEmptyKey() { return -1U; }
+ static inline unsigned getTombstoneKey() { return -2U; }
+ static unsigned getHashValue(const unsigned &Key) { return Key; }
+ static bool isEqual(unsigned LHS, unsigned RHS) { return LHS == RHS; }
+ static bool isPod() { return true; }
+};
+
+/// ActionEntry - Structure describing an entry in the actions table.
+struct ActionEntry {
+ int ValueForTypeID; // The value to write - may not be equal to the type id.
+ int NextAction;
+ struct ActionEntry *Previous;
+};
+
+/// PadRange - Structure holding a try-range and the associated landing pad.
+struct PadRange {
+ // The index of the landing pad.
+ unsigned PadIndex;
+ // The index of the begin and end labels in the landing pad's label lists.
+ unsigned RangeIndex;
+};
+
+typedef DenseMap<unsigned, PadRange, KeyInfo> RangeMapType;
+
+/// CallSiteEntry - Structure describing an entry in the call-site table.
+struct CallSiteEntry {
+ unsigned BeginLabel; // zero indicates the start of the function.
+ unsigned EndLabel; // zero indicates the end of the function.
+ unsigned PadLabel; // zero indicates that there is no landing pad.
+ unsigned Action;
+};
+
+}
+
+unsigned char* JITDwarfEmitter::EmitExceptionTable(MachineFunction* MF,
+ unsigned char* StartFunction,
+ unsigned char* EndFunction) const {
+ // Map all labels and get rid of any dead landing pads.
+ MMI->TidyLandingPads();
+
+ const std::vector<GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+ const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
+ if (PadInfos.empty()) return 0;
+
+ // Sort the landing pads in order of their type ids. This is used to fold
+ // duplicate actions.
+ SmallVector<const LandingPadInfo *, 64> LandingPads;
+ LandingPads.reserve(PadInfos.size());
+ for (unsigned i = 0, N = PadInfos.size(); i != N; ++i)
+ LandingPads.push_back(&PadInfos[i]);
+ std::sort(LandingPads.begin(), LandingPads.end(), PadLT);
+
+ // Negative type ids index into FilterIds, positive type ids index into
+ // TypeInfos. The value written for a positive type id is just the type
+ // id itself. For a negative type id, however, the value written is the
+ // (negative) byte offset of the corresponding FilterIds entry. The byte
+ // offset is usually equal to the type id, because the FilterIds entries
+ // are written using a variable width encoding which outputs one byte per
+ // entry as long as the value written is not too large, but can differ.
+ // This kind of complication does not occur for positive type ids because
+ // type infos are output using a fixed width encoding.
+ // FilterOffsets[i] holds the byte offset corresponding to FilterIds[i].
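+  // For example, FilterIds == {5, 300} yields FilterOffsets == {-1, -2}:
+  // each offset is the previous one minus the ULEB128 size of the previous
+  // entry (ULEB128(5) occupies a single byte).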
+ SmallVector<int, 16> FilterOffsets;
+ FilterOffsets.reserve(FilterIds.size());
+ int Offset = -1;
+ for(std::vector<unsigned>::const_iterator I = FilterIds.begin(),
+ E = FilterIds.end(); I != E; ++I) {
+ FilterOffsets.push_back(Offset);
+ Offset -= TargetAsmInfo::getULEB128Size(*I);
+ }
+
+ // Compute the actions table and gather the first action index for each
+ // landing pad site.
+ SmallVector<ActionEntry, 32> Actions;
+ SmallVector<unsigned, 64> FirstActions;
+ FirstActions.reserve(LandingPads.size());
+
+ int FirstAction = 0;
+ unsigned SizeActions = 0;
+ for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+ const LandingPadInfo *LP = LandingPads[i];
+ const std::vector<int> &TypeIds = LP->TypeIds;
+ const unsigned NumShared = i ? SharedTypeIds(LP, LandingPads[i-1]) : 0;
+ unsigned SizeSiteActions = 0;
+
+ if (NumShared < TypeIds.size()) {
+ unsigned SizeAction = 0;
+ ActionEntry *PrevAction = 0;
+
+ if (NumShared) {
+ const unsigned SizePrevIds = LandingPads[i-1]->TypeIds.size();
+ assert(Actions.size());
+ PrevAction = &Actions.back();
+ SizeAction = TargetAsmInfo::getSLEB128Size(PrevAction->NextAction) +
+ TargetAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
+ for (unsigned j = NumShared; j != SizePrevIds; ++j) {
+ SizeAction -= TargetAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
+ SizeAction += -PrevAction->NextAction;
+ PrevAction = PrevAction->Previous;
+ }
+ }
+
+ // Compute the actions.
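+      // E.g. a landing pad with TypeIds {1, 2} and nothing shared yields the
+      // records {1, 0} and {2, -3}; -3 is the self-relative offset from the
+      // second record's NextAction field back to the first record.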
+ for (unsigned I = NumShared, M = TypeIds.size(); I != M; ++I) {
+ int TypeID = TypeIds[I];
+ assert(-1-TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
+ int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID;
+ unsigned SizeTypeID = TargetAsmInfo::getSLEB128Size(ValueForTypeID);
+
+ int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
+ SizeAction = SizeTypeID + TargetAsmInfo::getSLEB128Size(NextAction);
+ SizeSiteActions += SizeAction;
+
+ ActionEntry Action = {ValueForTypeID, NextAction, PrevAction};
+ Actions.push_back(Action);
+
+ PrevAction = &Actions.back();
+ }
+
+ // Record the first action of the landing pad site.
+ FirstAction = SizeActions + SizeSiteActions - SizeAction + 1;
+ } // else identical - re-use previous FirstAction
+
+ FirstActions.push_back(FirstAction);
+
+    // Compute this site's contribution to size.
+ SizeActions += SizeSiteActions;
+ }
+
+ // Compute the call-site table. Entries must be ordered by address.
+ SmallVector<CallSiteEntry, 64> CallSites;
+
+ RangeMapType PadMap;
+ for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+ const LandingPadInfo *LandingPad = LandingPads[i];
+ for (unsigned j=0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+ unsigned BeginLabel = LandingPad->BeginLabels[j];
+ assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+ PadRange P = { i, j };
+ PadMap[BeginLabel] = P;
+ }
+ }
+
+ bool MayThrow = false;
+ unsigned LastLabel = 0;
+ for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+ I != E; ++I) {
+ for (MachineBasicBlock::const_iterator MI = I->begin(), E = I->end();
+ MI != E; ++MI) {
+ if (!MI->isLabel()) {
+ MayThrow |= MI->getDesc().isCall();
+ continue;
+ }
+
+ unsigned BeginLabel = MI->getOperand(0).getImm();
+ assert(BeginLabel && "Invalid label!");
+
+ if (BeginLabel == LastLabel)
+ MayThrow = false;
+
+ RangeMapType::iterator L = PadMap.find(BeginLabel);
+
+ if (L == PadMap.end())
+ continue;
+
+ PadRange P = L->second;
+ const LandingPadInfo *LandingPad = LandingPads[P.PadIndex];
+
+ assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] &&
+ "Inconsistent landing pad map!");
+
+ // If some instruction between the previous try-range and this one may
+ // throw, create a call-site entry with no landing pad for the region
+ // between the try-ranges.
+ if (MayThrow) {
+ CallSiteEntry Site = {LastLabel, BeginLabel, 0, 0};
+ CallSites.push_back(Site);
+ }
+
+ LastLabel = LandingPad->EndLabels[P.RangeIndex];
+ CallSiteEntry Site = {BeginLabel, LastLabel,
+ LandingPad->LandingPadLabel, FirstActions[P.PadIndex]};
+
+ assert(Site.BeginLabel && Site.EndLabel && Site.PadLabel &&
+ "Invalid landing pad!");
+
+ // Try to merge with the previous call-site.
+ if (CallSites.size()) {
+ CallSiteEntry &Prev = CallSites.back();
+ if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
+ // Extend the range of the previous entry.
+ Prev.EndLabel = Site.EndLabel;
+ continue;
+ }
+ }
+
+ // Otherwise, create a new call-site.
+ CallSites.push_back(Site);
+ }
+ }
+ // If some instruction between the previous try-range and the end of the
+ // function may throw, create a call-site entry with no landing pad for the
+ // region following the try-range.
+ if (MayThrow) {
+ CallSiteEntry Site = {LastLabel, 0, 0, 0};
+ CallSites.push_back(Site);
+ }
+
+ // Final tallies.
+ unsigned SizeSites = CallSites.size() * (sizeof(int32_t) + // Site start.
+ sizeof(int32_t) + // Site length.
+ sizeof(int32_t)); // Landing pad.
+ for (unsigned i = 0, e = CallSites.size(); i < e; ++i)
+ SizeSites += TargetAsmInfo::getULEB128Size(CallSites[i].Action);
+
+ unsigned SizeTypes = TypeInfos.size() * TD->getPointerSize();
+
+ unsigned TypeOffset = sizeof(int8_t) + // Call site format
+ // Call-site table length
+ TargetAsmInfo::getULEB128Size(SizeSites) +
+ SizeSites + SizeActions + SizeTypes;
+
+ unsigned TotalSize = sizeof(int8_t) + // LPStart format
+ sizeof(int8_t) + // TType format
+ TargetAsmInfo::getULEB128Size(TypeOffset) + // TType base offset
+ TypeOffset;
+
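+  // Compute the padding needed to round TotalSize up to a multiple of four;
+  // e.g. TotalSize == 10 gives SizeAlign == 2.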
+ unsigned SizeAlign = (4 - TotalSize) & 3;
+
+ // Begin the exception table.
+ JCE->emitAlignment(4);
+ for (unsigned i = 0; i != SizeAlign; ++i) {
+ JCE->emitByte(0);
+ // Asm->EOL("Padding");
+ }
+
+ unsigned char* DwarfExceptionTable = (unsigned char*)JCE->getCurrentPCValue();
+
+ // Emit the header.
+ JCE->emitByte(dwarf::DW_EH_PE_omit);
+ // Asm->EOL("LPStart format (DW_EH_PE_omit)");
+ JCE->emitByte(dwarf::DW_EH_PE_absptr);
+ // Asm->EOL("TType format (DW_EH_PE_absptr)");
+ JCE->emitULEB128Bytes(TypeOffset);
+ // Asm->EOL("TType base offset");
+ JCE->emitByte(dwarf::DW_EH_PE_udata4);
+ // Asm->EOL("Call site format (DW_EH_PE_udata4)");
+ JCE->emitULEB128Bytes(SizeSites);
+ // Asm->EOL("Call-site table length");
+
+ // Emit the landing pad site information.
+ for (unsigned i = 0; i < CallSites.size(); ++i) {
+ CallSiteEntry &S = CallSites[i];
+ intptr_t BeginLabelPtr = 0;
+ intptr_t EndLabelPtr = 0;
+
+ if (!S.BeginLabel) {
+ BeginLabelPtr = (intptr_t)StartFunction;
+ JCE->emitInt32(0);
+ } else {
+ BeginLabelPtr = JCE->getLabelAddress(S.BeginLabel);
+ JCE->emitInt32(BeginLabelPtr - (intptr_t)StartFunction);
+ }
+
+ // Asm->EOL("Region start");
+
+ if (!S.EndLabel) {
+ EndLabelPtr = (intptr_t)EndFunction;
+ JCE->emitInt32((intptr_t)EndFunction - BeginLabelPtr);
+ } else {
+ EndLabelPtr = JCE->getLabelAddress(S.EndLabel);
+ JCE->emitInt32(EndLabelPtr - BeginLabelPtr);
+ }
+ //Asm->EOL("Region length");
+
+ if (!S.PadLabel) {
+ JCE->emitInt32(0);
+ } else {
+ unsigned PadLabelPtr = JCE->getLabelAddress(S.PadLabel);
+ JCE->emitInt32(PadLabelPtr - (intptr_t)StartFunction);
+ }
+ // Asm->EOL("Landing pad");
+
+ JCE->emitULEB128Bytes(S.Action);
+ // Asm->EOL("Action");
+ }
+
+ // Emit the actions.
+ for (unsigned I = 0, N = Actions.size(); I != N; ++I) {
+ ActionEntry &Action = Actions[I];
+
+ JCE->emitSLEB128Bytes(Action.ValueForTypeID);
+ //Asm->EOL("TypeInfo index");
+ JCE->emitSLEB128Bytes(Action.NextAction);
+ //Asm->EOL("Next action");
+ }
+
+ // Emit the type ids.
+ for (unsigned M = TypeInfos.size(); M; --M) {
+ GlobalVariable *GV = TypeInfos[M - 1];
+
+ if (GV) {
+ if (TD->getPointerSize() == sizeof(int32_t)) {
+ JCE->emitInt32((intptr_t)Jit.getOrEmitGlobalVariable(GV));
+ } else {
+ JCE->emitInt64((intptr_t)Jit.getOrEmitGlobalVariable(GV));
+ }
+ } else {
+ if (TD->getPointerSize() == sizeof(int32_t))
+ JCE->emitInt32(0);
+ else
+ JCE->emitInt64(0);
+ }
+ // Asm->EOL("TypeInfo");
+ }
+
+ // Emit the filter typeids.
+ for (unsigned j = 0, M = FilterIds.size(); j < M; ++j) {
+ unsigned TypeID = FilterIds[j];
+ JCE->emitULEB128Bytes(TypeID);
+ //Asm->EOL("Filter TypeInfo index");
+ }
+
+ JCE->emitAlignment(4);
+
+ return DwarfExceptionTable;
+}
+
+unsigned char*
+JITDwarfEmitter::EmitCommonEHFrame(const Function* Personality) const {
+ unsigned PointerSize = TD->getPointerSize();
+ int stackGrowth = stackGrowthDirection == TargetFrameInfo::StackGrowsUp ?
+ PointerSize : -PointerSize;
+
+ unsigned char* StartCommonPtr = (unsigned char*)JCE->getCurrentPCValue();
+ // EH Common Frame header
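+  // Reserve the 4-byte length field; it is back-patched at the end of this
+  // function via emitInt32At.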
+ JCE->allocateSpace(4, 0);
+ unsigned char* FrameCommonBeginPtr = (unsigned char*)JCE->getCurrentPCValue();
+ JCE->emitInt32((int)0);
+ JCE->emitByte(dwarf::DW_CIE_VERSION);
+ JCE->emitString(Personality ? "zPLR" : "zR");
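+  // In the augmentation string, 'z' says augmentation data (with a leading
+  // size) is present, 'P' adds a personality routine, 'L' an LSDA encoding
+  // byte, and 'R' an FDE pointer encoding byte. The next three CIE fields are
+  // the code alignment factor (1), the data alignment factor (stackGrowth)
+  // and the return address register.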
+ JCE->emitULEB128Bytes(1);
+ JCE->emitSLEB128Bytes(stackGrowth);
+ JCE->emitByte(RI->getDwarfRegNum(RI->getRARegister(), true));
+
+ if (Personality) {
+ // Augmentation Size: 3 small ULEBs of one byte each, and the personality
+    // function pointer, whose size is PointerSize.
+ JCE->emitULEB128Bytes(3 + PointerSize);
+
+    // Use a direct (absolute) encoding for the personality because we emit
+    // its raw function pointer. A PC-relative encoding would not be safe
+    // here, since the current PC value may be bigger than the personality
+    // function pointer.
+ if (PointerSize == 4) {
+ JCE->emitByte(dwarf::DW_EH_PE_sdata4);
+ JCE->emitInt32(((intptr_t)Jit.getPointerToGlobal(Personality)));
+ } else {
+ JCE->emitByte(dwarf::DW_EH_PE_sdata8);
+ JCE->emitInt64(((intptr_t)Jit.getPointerToGlobal(Personality)));
+ }
+
+ JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+ JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+
+ } else {
+ JCE->emitULEB128Bytes(1);
+ JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+ }
+
+ std::vector<MachineMove> Moves;
+ RI->getInitialFrameState(Moves);
+ EmitFrameMoves(0, Moves);
+ JCE->emitAlignment(PointerSize);
+
+ JCE->emitInt32At((uintptr_t*)StartCommonPtr,
+ (uintptr_t)((unsigned char*)JCE->getCurrentPCValue() -
+ FrameCommonBeginPtr));
+
+ return StartCommonPtr;
+}
+
+
+unsigned char*
+JITDwarfEmitter::EmitEHFrame(const Function* Personality,
+ unsigned char* StartCommonPtr,
+ unsigned char* StartFunction,
+ unsigned char* EndFunction,
+ unsigned char* ExceptionTable) const {
+ unsigned PointerSize = TD->getPointerSize();
+
+ // EH frame header.
+ unsigned char* StartEHPtr = (unsigned char*)JCE->getCurrentPCValue();
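+  // Reserve the 4-byte FDE length field, back-patched at the end via
+  // emitInt32At.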
+ JCE->allocateSpace(4, 0);
+ unsigned char* FrameBeginPtr = (unsigned char*)JCE->getCurrentPCValue();
+ // FDE CIE Offset
+ JCE->emitInt32(FrameBeginPtr - StartCommonPtr);
+ JCE->emitInt32(StartFunction - (unsigned char*)JCE->getCurrentPCValue());
+ JCE->emitInt32(EndFunction - StartFunction);
+
+ // If there is a personality and landing pads then point to the language
+ // specific data area in the exception table.
+ if (MMI->getPersonalityIndex()) {
+ JCE->emitULEB128Bytes(4);
+
+ if (!MMI->getLandingPads().empty()) {
+ JCE->emitInt32(ExceptionTable - (unsigned char*)JCE->getCurrentPCValue());
+ } else {
+ JCE->emitInt32((int)0);
+ }
+ } else {
+ JCE->emitULEB128Bytes(0);
+ }
+
+ // Indicate locations of function specific callee saved registers in
+ // frame.
+ EmitFrameMoves((intptr_t)StartFunction, MMI->getFrameMoves());
+
+ JCE->emitAlignment(PointerSize);
+
+ // Indicate the size of the table
+ JCE->emitInt32At((uintptr_t*)StartEHPtr,
+ (uintptr_t)((unsigned char*)JCE->getCurrentPCValue() -
+ StartEHPtr));
+
+ // Double zeroes for the unwind runtime
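+  // (a zero length field terminates the FDE list)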
+ if (PointerSize == 8) {
+ JCE->emitInt64(0);
+ JCE->emitInt64(0);
+ } else {
+ JCE->emitInt32(0);
+ JCE->emitInt32(0);
+ }
+
+
+ return StartEHPtr;
+}
+
+unsigned JITDwarfEmitter::GetDwarfTableSizeInBytes(MachineFunction& F,
+ JITCodeEmitter& jce,
+ unsigned char* StartFunction,
+ unsigned char* EndFunction) {
+ const TargetMachine& TM = F.getTarget();
+ TD = TM.getTargetData();
+ needsIndirectEncoding = TM.getTargetAsmInfo()->getNeedsIndirectEncoding();
+ stackGrowthDirection = TM.getFrameInfo()->getStackGrowthDirection();
+ RI = TM.getRegisterInfo();
+ JCE = &jce;
+ unsigned FinalSize = 0;
+
+ FinalSize += GetExceptionTableSizeInBytes(&F);
+
+ const std::vector<Function *> Personalities = MMI->getPersonalities();
+ FinalSize +=
+ GetCommonEHFrameSizeInBytes(Personalities[MMI->getPersonalityIndex()]);
+
+ FinalSize += GetEHFrameSizeInBytes(Personalities[MMI->getPersonalityIndex()],
+ StartFunction);
+
+ return FinalSize;
+}
+
+/// RoundUpToAlign - Add the specified alignment to FinalSize and return
+/// the new value.
+static unsigned RoundUpToAlign(unsigned FinalSize, unsigned Alignment) {
+ if (Alignment == 0) Alignment = 1;
+ // Since we do not know where the buffer will be allocated, be pessimistic.
+ return FinalSize + Alignment;
+}
+
+unsigned
+JITDwarfEmitter::GetEHFrameSizeInBytes(const Function* Personality,
+ unsigned char* StartFunction) const {
+ unsigned PointerSize = TD->getPointerSize();
+ unsigned FinalSize = 0;
+ // EH frame header.
+ FinalSize += PointerSize;
+ // FDE CIE Offset
+ FinalSize += 3 * PointerSize;
+ // If there is a personality and landing pads then point to the language
+ // specific data area in the exception table.
+ if (MMI->getPersonalityIndex()) {
+ FinalSize += TargetAsmInfo::getULEB128Size(4);
+ FinalSize += PointerSize;
+ } else {
+ FinalSize += TargetAsmInfo::getULEB128Size(0);
+ }
+
+ // Indicate locations of function specific callee saved registers in
+ // frame.
+ FinalSize += GetFrameMovesSizeInBytes((intptr_t)StartFunction,
+ MMI->getFrameMoves());
+
+ FinalSize = RoundUpToAlign(FinalSize, 4);
+
+ // Double zeroes for the unwind runtime
+ FinalSize += 2 * PointerSize;
+
+ return FinalSize;
+}
+
+unsigned JITDwarfEmitter::GetCommonEHFrameSizeInBytes(const Function* Personality)
+ const {
+
+ unsigned PointerSize = TD->getPointerSize();
+ int stackGrowth = stackGrowthDirection == TargetFrameInfo::StackGrowsUp ?
+ PointerSize : -PointerSize;
+ unsigned FinalSize = 0;
+ // EH Common Frame header
+ FinalSize += PointerSize;
+ FinalSize += 4;
+ FinalSize += 1;
+ FinalSize += Personality ? 5 : 3; // "zPLR" or "zR"
+ FinalSize += TargetAsmInfo::getULEB128Size(1);
+ FinalSize += TargetAsmInfo::getSLEB128Size(stackGrowth);
+ FinalSize += 1;
+
+ if (Personality) {
+ FinalSize += TargetAsmInfo::getULEB128Size(7);
+
+ // Encoding
+ FinalSize+= 1;
+ //Personality
+ FinalSize += PointerSize;
+
+ FinalSize += TargetAsmInfo::getULEB128Size(dwarf::DW_EH_PE_pcrel);
+ FinalSize += TargetAsmInfo::getULEB128Size(dwarf::DW_EH_PE_pcrel);
+
+ } else {
+ FinalSize += TargetAsmInfo::getULEB128Size(1);
+ FinalSize += TargetAsmInfo::getULEB128Size(dwarf::DW_EH_PE_pcrel);
+ }
+
+ std::vector<MachineMove> Moves;
+ RI->getInitialFrameState(Moves);
+ FinalSize += GetFrameMovesSizeInBytes(0, Moves);
+ FinalSize = RoundUpToAlign(FinalSize, 4);
+ return FinalSize;
+}
+
+unsigned
+JITDwarfEmitter::GetFrameMovesSizeInBytes(intptr_t BaseLabelPtr,
+ const std::vector<MachineMove> &Moves) const {
+ unsigned PointerSize = TD->getPointerSize();
+ int stackGrowth = stackGrowthDirection == TargetFrameInfo::StackGrowsUp ?
+ PointerSize : -PointerSize;
+ bool IsLocal = BaseLabelPtr;
+ unsigned FinalSize = 0;
+
+ for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
+ const MachineMove &Move = Moves[i];
+ unsigned LabelID = Move.getLabelID();
+
+ if (LabelID) {
+ LabelID = MMI->MappedLabel(LabelID);
+
+ // Throw out move if the label is invalid.
+ if (!LabelID) continue;
+ }
+
+ intptr_t LabelPtr = 0;
+ if (LabelID) LabelPtr = JCE->getLabelAddress(LabelID);
+
+ const MachineLocation &Dst = Move.getDestination();
+ const MachineLocation &Src = Move.getSource();
+
+ // Advance row if new location.
+ if (BaseLabelPtr && LabelID && (BaseLabelPtr != LabelPtr || !IsLocal)) {
+ FinalSize++;
+ FinalSize += PointerSize;
+ BaseLabelPtr = LabelPtr;
+ IsLocal = true;
+ }
+
+ // If advancing cfa.
+ if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
+ if (!Src.isReg()) {
+ if (Src.getReg() == MachineLocation::VirtualFP) {
+ ++FinalSize;
+ } else {
+ ++FinalSize;
+ unsigned RegNum = RI->getDwarfRegNum(Src.getReg(), true);
+ FinalSize += TargetAsmInfo::getULEB128Size(RegNum);
+ }
+
+ int Offset = -Src.getOffset();
+
+ FinalSize += TargetAsmInfo::getULEB128Size(Offset);
+ } else {
+ assert(0 && "Machine move no supported yet.");
+ }
+ } else if (Src.isReg() &&
+ Src.getReg() == MachineLocation::VirtualFP) {
+ if (Dst.isReg()) {
+ ++FinalSize;
+ unsigned RegNum = RI->getDwarfRegNum(Dst.getReg(), true);
+ FinalSize += TargetAsmInfo::getULEB128Size(RegNum);
+ } else {
+ assert(0 && "Machine move no supported yet.");
+ }
+ } else {
+ unsigned Reg = RI->getDwarfRegNum(Src.getReg(), true);
+ int Offset = Dst.getOffset() / stackGrowth;
+
+ if (Offset < 0) {
+ ++FinalSize;
+ FinalSize += TargetAsmInfo::getULEB128Size(Reg);
+ FinalSize += TargetAsmInfo::getSLEB128Size(Offset);
+ } else if (Reg < 64) {
+ ++FinalSize;
+ FinalSize += TargetAsmInfo::getULEB128Size(Offset);
+ } else {
+ ++FinalSize;
+ FinalSize += TargetAsmInfo::getULEB128Size(Reg);
+ FinalSize += TargetAsmInfo::getULEB128Size(Offset);
+ }
+ }
+ }
+ return FinalSize;
+}
+
+unsigned
+JITDwarfEmitter::GetExceptionTableSizeInBytes(MachineFunction* MF) const {
+ unsigned FinalSize = 0;
+
+ // Map all labels and get rid of any dead landing pads.
+ MMI->TidyLandingPads();
+
+ const std::vector<GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+ const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
+ if (PadInfos.empty()) return 0;
+
+ // Sort the landing pads in order of their type ids. This is used to fold
+ // duplicate actions.
+ SmallVector<const LandingPadInfo *, 64> LandingPads;
+ LandingPads.reserve(PadInfos.size());
+ for (unsigned i = 0, N = PadInfos.size(); i != N; ++i)
+ LandingPads.push_back(&PadInfos[i]);
+ std::sort(LandingPads.begin(), LandingPads.end(), PadLT);
+
+ // Negative type ids index into FilterIds, positive type ids index into
+ // TypeInfos. The value written for a positive type id is just the type
+ // id itself. For a negative type id, however, the value written is the
+ // (negative) byte offset of the corresponding FilterIds entry. The byte
+ // offset is usually equal to the type id, because the FilterIds entries
+ // are written using a variable width encoding which outputs one byte per
+ // entry as long as the value written is not too large, but can differ.
+ // This kind of complication does not occur for positive type ids because
+ // type infos are output using a fixed width encoding.
+ // FilterOffsets[i] holds the byte offset corresponding to FilterIds[i].
+ SmallVector<int, 16> FilterOffsets;
+ FilterOffsets.reserve(FilterIds.size());
+ int Offset = -1;
+ for(std::vector<unsigned>::const_iterator I = FilterIds.begin(),
+ E = FilterIds.end(); I != E; ++I) {
+ FilterOffsets.push_back(Offset);
+ Offset -= TargetAsmInfo::getULEB128Size(*I);
+ }
+
+ // Compute the actions table and gather the first action index for each
+ // landing pad site.
+ SmallVector<ActionEntry, 32> Actions;
+ SmallVector<unsigned, 64> FirstActions;
+ FirstActions.reserve(LandingPads.size());
+
+ int FirstAction = 0;
+ unsigned SizeActions = 0;
+ for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+ const LandingPadInfo *LP = LandingPads[i];
+ const std::vector<int> &TypeIds = LP->TypeIds;
+ const unsigned NumShared = i ? SharedTypeIds(LP, LandingPads[i-1]) : 0;
+ unsigned SizeSiteActions = 0;
+
+ if (NumShared < TypeIds.size()) {
+ unsigned SizeAction = 0;
+ ActionEntry *PrevAction = 0;
+
+ if (NumShared) {
+ const unsigned SizePrevIds = LandingPads[i-1]->TypeIds.size();
+ assert(Actions.size());
+ PrevAction = &Actions.back();
+ SizeAction = TargetAsmInfo::getSLEB128Size(PrevAction->NextAction) +
+ TargetAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
+ for (unsigned j = NumShared; j != SizePrevIds; ++j) {
+ SizeAction -= TargetAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
+ SizeAction += -PrevAction->NextAction;
+ PrevAction = PrevAction->Previous;
+ }
+ }
+
+ // Compute the actions.
+ for (unsigned I = NumShared, M = TypeIds.size(); I != M; ++I) {
+ int TypeID = TypeIds[I];
+ assert(-1-TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
+ int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID;
+ unsigned SizeTypeID = TargetAsmInfo::getSLEB128Size(ValueForTypeID);
+
+ int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
+ SizeAction = SizeTypeID + TargetAsmInfo::getSLEB128Size(NextAction);
+ SizeSiteActions += SizeAction;
+
+ ActionEntry Action = {ValueForTypeID, NextAction, PrevAction};
+ Actions.push_back(Action);
+
+ PrevAction = &Actions.back();
+ }
+
+ // Record the first action of the landing pad site.
+ FirstAction = SizeActions + SizeSiteActions - SizeAction + 1;
+ } // else identical - re-use previous FirstAction
+
+ FirstActions.push_back(FirstAction);
+
+    // Compute this site's contribution to size.
+ SizeActions += SizeSiteActions;
+ }
+
+ // Compute the call-site table. Entries must be ordered by address.
+ SmallVector<CallSiteEntry, 64> CallSites;
+
+ RangeMapType PadMap;
+ for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+ const LandingPadInfo *LandingPad = LandingPads[i];
+ for (unsigned j=0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+ unsigned BeginLabel = LandingPad->BeginLabels[j];
+ assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+ PadRange P = { i, j };
+ PadMap[BeginLabel] = P;
+ }
+ }
+
+ bool MayThrow = false;
+ unsigned LastLabel = 0;
+ for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+ I != E; ++I) {
+ for (MachineBasicBlock::const_iterator MI = I->begin(), E = I->end();
+ MI != E; ++MI) {
+ if (!MI->isLabel()) {
+ MayThrow |= MI->getDesc().isCall();
+ continue;
+ }
+
+ unsigned BeginLabel = MI->getOperand(0).getImm();
+ assert(BeginLabel && "Invalid label!");
+
+ if (BeginLabel == LastLabel)
+ MayThrow = false;
+
+ RangeMapType::iterator L = PadMap.find(BeginLabel);
+
+ if (L == PadMap.end())
+ continue;
+
+ PadRange P = L->second;
+ const LandingPadInfo *LandingPad = LandingPads[P.PadIndex];
+
+ assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] &&
+ "Inconsistent landing pad map!");
+
+ // If some instruction between the previous try-range and this one may
+ // throw, create a call-site entry with no landing pad for the region
+ // between the try-ranges.
+ if (MayThrow) {
+ CallSiteEntry Site = {LastLabel, BeginLabel, 0, 0};
+ CallSites.push_back(Site);
+ }
+
+ LastLabel = LandingPad->EndLabels[P.RangeIndex];
+ CallSiteEntry Site = {BeginLabel, LastLabel,
+ LandingPad->LandingPadLabel, FirstActions[P.PadIndex]};
+
+ assert(Site.BeginLabel && Site.EndLabel && Site.PadLabel &&
+ "Invalid landing pad!");
+
+ // Try to merge with the previous call-site.
+ if (CallSites.size()) {
+ CallSiteEntry &Prev = CallSites.back();
+ if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
+ // Extend the range of the previous entry.
+ Prev.EndLabel = Site.EndLabel;
+ continue;
+ }
+ }
+
+ // Otherwise, create a new call-site.
+ CallSites.push_back(Site);
+ }
+ }
+ // If some instruction between the previous try-range and the end of the
+ // function may throw, create a call-site entry with no landing pad for the
+ // region following the try-range.
+ if (MayThrow) {
+ CallSiteEntry Site = {LastLabel, 0, 0, 0};
+ CallSites.push_back(Site);
+ }
+
+ // Final tallies.
+ unsigned SizeSites = CallSites.size() * (sizeof(int32_t) + // Site start.
+ sizeof(int32_t) + // Site length.
+ sizeof(int32_t)); // Landing pad.
+ for (unsigned i = 0, e = CallSites.size(); i < e; ++i)
+ SizeSites += TargetAsmInfo::getULEB128Size(CallSites[i].Action);
+
+ unsigned SizeTypes = TypeInfos.size() * TD->getPointerSize();
+
+ unsigned TypeOffset = sizeof(int8_t) + // Call site format
+ // Call-site table length
+ TargetAsmInfo::getULEB128Size(SizeSites) +
+ SizeSites + SizeActions + SizeTypes;
+
+ unsigned TotalSize = sizeof(int8_t) + // LPStart format
+ sizeof(int8_t) + // TType format
+ TargetAsmInfo::getULEB128Size(TypeOffset) + // TType base offset
+ TypeOffset;
+
+ unsigned SizeAlign = (4 - TotalSize) & 3;
+
+ // Begin the exception table.
+ FinalSize = RoundUpToAlign(FinalSize, 4);
+ for (unsigned i = 0; i != SizeAlign; ++i) {
+ ++FinalSize;
+ }
+
+ unsigned PointerSize = TD->getPointerSize();
+
+ // Emit the header.
+ ++FinalSize;
+ // Asm->EOL("LPStart format (DW_EH_PE_omit)");
+ ++FinalSize;
+ // Asm->EOL("TType format (DW_EH_PE_absptr)");
+ ++FinalSize;
+ // Asm->EOL("TType base offset");
+ ++FinalSize;
+ // Asm->EOL("Call site format (DW_EH_PE_udata4)");
+ ++FinalSize;
+ // Asm->EOL("Call-site table length");
+
+ // Emit the landing pad site information.
+ for (unsigned i = 0; i < CallSites.size(); ++i) {
+ CallSiteEntry &S = CallSites[i];
+
+ // Asm->EOL("Region start");
+ FinalSize += PointerSize;
+
+ //Asm->EOL("Region length");
+ FinalSize += PointerSize;
+
+ // Asm->EOL("Landing pad");
+ FinalSize += PointerSize;
+
+ FinalSize += TargetAsmInfo::getULEB128Size(S.Action);
+ // Asm->EOL("Action");
+ }
+
+ // Emit the actions.
+ for (unsigned I = 0, N = Actions.size(); I != N; ++I) {
+ ActionEntry &Action = Actions[I];
+
+ //Asm->EOL("TypeInfo index");
+ FinalSize += TargetAsmInfo::getSLEB128Size(Action.ValueForTypeID);
+ //Asm->EOL("Next action");
+ FinalSize += TargetAsmInfo::getSLEB128Size(Action.NextAction);
+ }
+
+ // Emit the type ids.
+ for (unsigned M = TypeInfos.size(); M; --M) {
+ // Asm->EOL("TypeInfo");
+ FinalSize += PointerSize;
+ }
+
+ // Emit the filter typeids.
+ for (unsigned j = 0, M = FilterIds.size(); j < M; ++j) {
+ unsigned TypeID = FilterIds[j];
+ FinalSize += TargetAsmInfo::getULEB128Size(TypeID);
+ //Asm->EOL("Filter TypeInfo index");
+ }
+
+ FinalSize = RoundUpToAlign(FinalSize, 4);
+
+ return FinalSize;
+}
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
new file mode 100644
index 0000000..9120ed4
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
@@ -0,0 +1,87 @@
+//===------ JITDwarfEmitter.h - Write dwarf tables into memory ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a JITDwarfEmitter object that is used by the JIT to
+// write dwarf tables to memory.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
+#define LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
+
+namespace llvm {
+
+class Function;
+class JITCodeEmitter;
+class MachineFunction;
+class MachineModuleInfo;
+class MachineMove;
+class TargetData;
+class TargetMachine;
+class TargetRegisterInfo;
+
+class JITDwarfEmitter {
+ const TargetData* TD;
+ JITCodeEmitter* JCE;
+ const TargetRegisterInfo* RI;
+ MachineModuleInfo* MMI;
+ JIT& Jit;
+ bool needsIndirectEncoding;
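+  // Cached result of TargetFrameInfo::getStackGrowthDirection().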
+ bool stackGrowthDirection;
+
+ unsigned char* EmitExceptionTable(MachineFunction* MF,
+ unsigned char* StartFunction,
+ unsigned char* EndFunction) const;
+
+ void EmitFrameMoves(intptr_t BaseLabelPtr,
+ const std::vector<MachineMove> &Moves) const;
+
+ unsigned char* EmitCommonEHFrame(const Function* Personality) const;
+
+ unsigned char* EmitEHFrame(const Function* Personality,
+ unsigned char* StartBufferPtr,
+ unsigned char* StartFunction,
+ unsigned char* EndFunction,
+ unsigned char* ExceptionTable) const;
+
+ unsigned GetExceptionTableSizeInBytes(MachineFunction* MF) const;
+
+ unsigned
+ GetFrameMovesSizeInBytes(intptr_t BaseLabelPtr,
+ const std::vector<MachineMove> &Moves) const;
+
+ unsigned GetCommonEHFrameSizeInBytes(const Function* Personality) const;
+
+ unsigned GetEHFrameSizeInBytes(const Function* Personality,
+ unsigned char* StartFunction) const;
+
+public:
+
+ JITDwarfEmitter(JIT& jit);
+
+ unsigned char* EmitDwarfTable(MachineFunction& F,
+ JITCodeEmitter& JCE,
+ unsigned char* StartFunction,
+ unsigned char* EndFunction);
+
+
+ unsigned GetDwarfTableSizeInBytes(MachineFunction& F,
+ JITCodeEmitter& JCE,
+ unsigned char* StartFunction,
+ unsigned char* EndFunction);
+
+ void setModuleInfo(MachineModuleInfo* Info) {
+ MMI = Info;
+ }
+};
+
+
+} // end namespace llvm
+
+#endif // LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
new file mode 100644
index 0000000..d3b0820
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -0,0 +1,1615 @@
+//===-- JITEmitter.cpp - Write machine code to executable memory ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a MachineCodeEmitter object that is used by the JIT to
+// write machine code to memory and remember where relocatable values are.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "JIT.h"
+#include "JITDwarfEmitter.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRelocation.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/CodeGen/MachineCodeInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/System/Disassembler.h"
+#include "llvm/System/Memory.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+using namespace llvm;
+
+STATISTIC(NumBytes, "Number of bytes of machine code compiled");
+STATISTIC(NumRelos, "Number of relocations applied");
+static JIT *TheJIT = 0;
+
+
+//===----------------------------------------------------------------------===//
+// JIT lazy compilation code.
+//
+namespace {
+ class JITResolverState {
+ public:
+ typedef std::map<AssertingVH<Function>, void*> FunctionToStubMapTy;
+ typedef std::map<void*, Function*> StubToFunctionMapTy;
+ typedef std::map<AssertingVH<GlobalValue>, void*> GlobalToIndirectSymMapTy;
+ private:
+ /// FunctionToStubMap - Keep track of the stub created for a particular
+    /// function so that we can reuse it if necessary.
+ FunctionToStubMapTy FunctionToStubMap;
+
+ /// StubToFunctionMap - Keep track of the function that each stub
+ /// corresponds to.
+ StubToFunctionMapTy StubToFunctionMap;
+
+ /// GlobalToIndirectSymMap - Keep track of the indirect symbol created for a
+    /// particular GlobalVariable so that we can reuse it if necessary.
+ GlobalToIndirectSymMapTy GlobalToIndirectSymMap;
+
+ public:
+ FunctionToStubMapTy& getFunctionToStubMap(const MutexGuard& locked) {
+ assert(locked.holds(TheJIT->lock));
+ return FunctionToStubMap;
+ }
+
+ StubToFunctionMapTy& getStubToFunctionMap(const MutexGuard& locked) {
+ assert(locked.holds(TheJIT->lock));
+ return StubToFunctionMap;
+ }
+
+ GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap(const MutexGuard& locked) {
+ assert(locked.holds(TheJIT->lock));
+ return GlobalToIndirectSymMap;
+ }
+ };
+
+ /// JITResolver - Keep track of, and resolve, call sites for functions that
+ /// have not yet been compiled.
+ class JITResolver {
+ typedef JITResolverState::FunctionToStubMapTy FunctionToStubMapTy;
+ typedef JITResolverState::StubToFunctionMapTy StubToFunctionMapTy;
+ typedef JITResolverState::GlobalToIndirectSymMapTy GlobalToIndirectSymMapTy;
+
+ /// LazyResolverFn - The target lazy resolver function that we actually
+ /// rewrite instructions to use.
+ TargetJITInfo::LazyResolverFn LazyResolverFn;
+
+ JITResolverState state;
+
+ /// ExternalFnToStubMap - This is the equivalent of FunctionToStubMap for
+ /// external functions.
+ std::map<void*, void*> ExternalFnToStubMap;
+
+ /// revGOTMap - map addresses to indexes in the GOT
+ std::map<void*, unsigned> revGOTMap;
+ unsigned nextGOTIndex;
+
+ static JITResolver *TheJITResolver;
+ public:
+ explicit JITResolver(JIT &jit) : nextGOTIndex(0) {
+ TheJIT = &jit;
+
+ LazyResolverFn = jit.getJITInfo().getLazyResolverFunction(JITCompilerFn);
+ assert(TheJITResolver == 0 && "Multiple JIT resolvers?");
+ TheJITResolver = this;
+ }
+
+ ~JITResolver() {
+ TheJITResolver = 0;
+ }
+
+ /// getFunctionStubIfAvailable - This returns a pointer to a function stub
+ /// if it has already been created.
+ void *getFunctionStubIfAvailable(Function *F);
+
+ /// getFunctionStub - This returns a pointer to a function stub, creating
+    /// one on demand as needed.
+ void *getFunctionStub(Function *F);
+
+ /// getExternalFunctionStub - Return a stub for the function at the
+ /// specified address, created lazily on demand.
+ void *getExternalFunctionStub(void *FnAddr);
+
+ /// getGlobalValueIndirectSym - Return an indirect symbol containing the
+ /// specified GV address.
+ void *getGlobalValueIndirectSym(GlobalValue *V, void *GVAddress);
+
+ /// AddCallbackAtLocation - If the target is capable of rewriting an
+ /// instruction without the use of a stub, record the location of the use so
+ /// we know which function is being used at the location.
+ void *AddCallbackAtLocation(Function *F, void *Location) {
+ MutexGuard locked(TheJIT->lock);
+ /// Get the target-specific JIT resolver function.
+ state.getStubToFunctionMap(locked)[Location] = F;
+ return (void*)(intptr_t)LazyResolverFn;
+ }
+
+ void getRelocatableGVs(SmallVectorImpl<GlobalValue*> &GVs,
+ SmallVectorImpl<void*> &Ptrs);
+
+ GlobalValue *invalidateStub(void *Stub);
+
+    /// getGOTIndexForAddr - Return a new or existing index in the GOT for
+ /// an address. This function only manages slots, it does not manage the
+ /// contents of the slots or the memory associated with the GOT.
+ unsigned getGOTIndexForAddr(void *addr);
+
+ /// JITCompilerFn - This function is called to resolve a stub to a compiled
+ /// address. If the LLVM Function corresponding to the stub has not yet
+ /// been compiled, this function compiles it first.
+ static void *JITCompilerFn(void *Stub);
+ };
+}
+
+JITResolver *JITResolver::TheJITResolver = 0;
+
+/// getFunctionStubIfAvailable - This returns a pointer to a function stub
+/// if it has already been created.
+void *JITResolver::getFunctionStubIfAvailable(Function *F) {
+ MutexGuard locked(TheJIT->lock);
+
+  // Return the stub for this function if one has already been created.
+ void *&Stub = state.getFunctionToStubMap(locked)[F];
+ return Stub;
+}
+
+/// getFunctionStub - This returns a pointer to a function stub, creating
+/// one on demand as needed.
+void *JITResolver::getFunctionStub(Function *F) {
+ MutexGuard locked(TheJIT->lock);
+
+ // If we already have a stub for this function, recycle it.
+ void *&Stub = state.getFunctionToStubMap(locked)[F];
+ if (Stub) return Stub;
+
+ // Call the lazy resolver function unless we are JIT'ing non-lazily, in which
+ // case we must resolve the symbol now.
+ void *Actual = TheJIT->isLazyCompilationDisabled()
+ ? (void *)0 : (void *)(intptr_t)LazyResolverFn;
+
+ // If this is an external declaration, attempt to resolve the address now
+ // to place in the stub.
+ if (F->isDeclaration() && !F->hasNotBeenReadFromBitcode()) {
+ Actual = TheJIT->getPointerToFunction(F);
+
+ // If we resolved the symbol to a null address (eg. a weak external)
+ // don't emit a stub. Return a null pointer to the application. If dlsym
+ // stubs are enabled, not being able to resolve the address is not
+ // meaningful.
+ if (!Actual && !TheJIT->areDlsymStubsEnabled()) return 0;
+ }
+
+ // Codegen a new stub, calling the lazy resolver or the actual address of the
+ // external function, if it was resolved.
+ Stub = TheJIT->getJITInfo().emitFunctionStub(F, Actual,
+ *TheJIT->getCodeEmitter());
+
+ if (Actual != (void*)(intptr_t)LazyResolverFn) {
+ // If we are getting the stub for an external function, we really want the
+ // address of the stub in the GlobalAddressMap for the JIT, not the address
+ // of the external function.
+ TheJIT->updateGlobalMapping(F, Stub);
+ }
+
+ DOUT << "JIT: Stub emitted at [" << Stub << "] for function '"
+ << F->getName() << "'\n";
+
+ // Finally, keep track of the stub-to-Function mapping so that the
+ // JITCompilerFn knows which function to compile!
+ state.getStubToFunctionMap(locked)[Stub] = F;
+
+ // If we are JIT'ing non-lazily but need to call a function that does not
+ // exist yet, add it to the JIT's work list so that we can fill in the stub
+ // address later.
+ if (!Actual && TheJIT->isLazyCompilationDisabled())
+ if (!F->isDeclaration() || F->hasNotBeenReadFromBitcode())
+ TheJIT->addPendingFunction(F);
+
+ return Stub;
+}
+
+/// getGlobalValueIndirectSym - Return a lazy pointer containing the specified
+/// GV address.
+void *JITResolver::getGlobalValueIndirectSym(GlobalValue *GV, void *GVAddress) {
+ MutexGuard locked(TheJIT->lock);
+
+ // If we already have a stub for this global variable, recycle it.
+ void *&IndirectSym = state.getGlobalToIndirectSymMap(locked)[GV];
+ if (IndirectSym) return IndirectSym;
+
+ // Otherwise, codegen a new indirect symbol.
+ IndirectSym = TheJIT->getJITInfo().emitGlobalValueIndirectSym(GV, GVAddress,
+ *TheJIT->getCodeEmitter());
+
+ DOUT << "JIT: Indirect symbol emitted at [" << IndirectSym << "] for GV '"
+ << GV->getName() << "'\n";
+
+ return IndirectSym;
+}
+
+/// getExternalFunctionStub - Return a stub for the function at the
+/// specified address, created lazily on demand.
+void *JITResolver::getExternalFunctionStub(void *FnAddr) {
+ // If we already have a stub for this function, recycle it.
+ void *&Stub = ExternalFnToStubMap[FnAddr];
+ if (Stub) return Stub;
+
+ Stub = TheJIT->getJITInfo().emitFunctionStub(0, FnAddr,
+ *TheJIT->getCodeEmitter());
+
+ DOUT << "JIT: Stub emitted at [" << Stub
+ << "] for external function at '" << FnAddr << "'\n";
+ return Stub;
+}
+
+unsigned JITResolver::getGOTIndexForAddr(void* addr) {
+ unsigned idx = revGOTMap[addr];
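+  // operator[] default-constructs a missing entry to 0, so 0 doubles as a
+  // "no entry yet" sentinel and valid GOT indices start at 1.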
+ if (!idx) {
+ idx = ++nextGOTIndex;
+ revGOTMap[addr] = idx;
+ DOUT << "JIT: Adding GOT entry " << idx << " for addr [" << addr << "]\n";
+ }
+ return idx;
+}
+
+void JITResolver::getRelocatableGVs(SmallVectorImpl<GlobalValue*> &GVs,
+ SmallVectorImpl<void*> &Ptrs) {
+ MutexGuard locked(TheJIT->lock);
+
+ FunctionToStubMapTy &FM = state.getFunctionToStubMap(locked);
+ GlobalToIndirectSymMapTy &GM = state.getGlobalToIndirectSymMap(locked);
+
+ for (FunctionToStubMapTy::iterator i = FM.begin(), e = FM.end(); i != e; ++i){
+ Function *F = i->first;
+ if (F->isDeclaration() && F->hasExternalLinkage()) {
+ GVs.push_back(i->first);
+ Ptrs.push_back(i->second);
+ }
+ }
+ for (GlobalToIndirectSymMapTy::iterator i = GM.begin(), e = GM.end();
+ i != e; ++i) {
+ GVs.push_back(i->first);
+ Ptrs.push_back(i->second);
+ }
+}
+
+GlobalValue *JITResolver::invalidateStub(void *Stub) {
+ MutexGuard locked(TheJIT->lock);
+
+ FunctionToStubMapTy &FM = state.getFunctionToStubMap(locked);
+ StubToFunctionMapTy &SM = state.getStubToFunctionMap(locked);
+ GlobalToIndirectSymMapTy &GM = state.getGlobalToIndirectSymMap(locked);
+
+ // Look up the cheap way first, to see if it's a function stub we are
+ // invalidating. If so, remove it from both the forward and reverse maps.
+ if (SM.find(Stub) != SM.end()) {
+ Function *F = SM[Stub];
+ SM.erase(Stub);
+ FM.erase(F);
+ return F;
+ }
+
+ // Otherwise, it might be an indirect symbol stub. Find it and remove it.
+ for (GlobalToIndirectSymMapTy::iterator i = GM.begin(), e = GM.end();
+ i != e; ++i) {
+ if (i->second != Stub)
+ continue;
+ GlobalValue *GV = i->first;
+ GM.erase(i);
+ return GV;
+ }
+
+ // Lastly, check to see if it's in the ExternalFnToStubMap.
+ for (std::map<void *, void *>::iterator i = ExternalFnToStubMap.begin(),
+ e = ExternalFnToStubMap.end(); i != e; ++i) {
+ if (i->second != Stub)
+ continue;
+ ExternalFnToStubMap.erase(i);
+ break;
+ }
+
+ return 0;
+}
+
+/// JITCompilerFn - This function is called when a lazy compilation stub has
+/// been entered. It looks up which function this stub corresponds to, compiles
+/// it if necessary, then returns the resultant function pointer.
+void *JITResolver::JITCompilerFn(void *Stub) {
+ JITResolver &JR = *TheJITResolver;
+
+ Function* F = 0;
+ void* ActualPtr = 0;
+
+ {
+ // Only lock for getting the Function. The call getPointerToFunction made
+ // in this function might trigger function materializing, which requires
+ // JIT lock to be unlocked.
+ MutexGuard locked(TheJIT->lock);
+
+    // The address given to us for the stub may not be exactly right; it might
+    // be a little bit after the stub. As such, use upper_bound to find it.
+ StubToFunctionMapTy::iterator I =
+ JR.state.getStubToFunctionMap(locked).upper_bound(Stub);
+ assert(I != JR.state.getStubToFunctionMap(locked).begin() &&
+ "This is not a known stub!");
+ F = (--I)->second;
+ ActualPtr = I->first;
+ }
+
+  // If we have already code-generated the function, just return the address.
+ void *Result = TheJIT->getPointerToGlobalIfAvailable(F);
+
+ if (!Result) {
+ // Otherwise we don't have it, do lazy compilation now.
+
+ // If lazy compilation is disabled, emit a useful error message and abort.
+ if (TheJIT->isLazyCompilationDisabled()) {
+ cerr << "LLVM JIT requested to do lazy compilation of function '"
+ << F->getName() << "' when lazy compiles are disabled!\n";
+ abort();
+ }
+
+ // We might like to remove the stub from the StubToFunction map.
+ // We can't do that! Multiple threads could be stuck, waiting to acquire the
+    // lock above. As soon as the first thread finishes compiling the function,
+ // the next one will be released, and needs to be able to find the function
+ // it needs to call.
+ //JR.state.getStubToFunctionMap(locked).erase(I);
+
+ DOUT << "JIT: Lazily resolving function '" << F->getName()
+ << "' In stub ptr = " << Stub << " actual ptr = "
+ << ActualPtr << "\n";
+
+ Result = TheJIT->getPointerToFunction(F);
+ }
+
+ // Reacquire the lock to erase the stub in the map.
+ MutexGuard locked(TheJIT->lock);
+
+ // We don't need to reuse this stub in the future, as F is now compiled.
+ JR.state.getFunctionToStubMap(locked).erase(F);
+
+ // FIXME: We could rewrite all references to this stub if we knew them.
+
+ // What we will do is set the compiled function address to map to the
+ // same GOT entry as the stub so that later clients may update the GOT
+ // if they see it still using the stub address.
+  // Note: this is done so the Resolver doesn't have to manage GOT memory.
+  // Do this without allocating map space if the target isn't using a GOT.
+ if(JR.revGOTMap.find(Stub) != JR.revGOTMap.end())
+ JR.revGOTMap[Result] = JR.revGOTMap[Stub];
+
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Function Index Support
+
+// On MacOS we generate an index of currently JIT'd functions so that
+// performance tools can determine a symbol name and accurate code range for a
+// PC value. Because performance tools are generally asynchronous, the code
+// below is written with the hope that it could be interrupted at any time and
+// have useful answers. However, we don't go crazy with atomic operations; we
+// just make a "reasonable effort".
+#ifdef __APPLE__
+#define ENABLE_JIT_SYMBOL_TABLE 0
+#endif
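+// Note that ENABLE_JIT_SYMBOL_TABLE is defined to 0 above, so the symbol
+// table support below is currently compiled out even on Darwin.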
+
+/// JitSymbolEntry - Each function that is JIT compiled results in one of these
+/// being added to an array of symbols. This indicates the name of the function
+/// as well as the address range it occupies. This allows the client to map
+/// from a PC value to the name of the function.
+struct JitSymbolEntry {
+ const char *FnName; // FnName - a strdup'd string.
+ void *FnStart;
+ intptr_t FnSize;
+};
+
+
+struct JitSymbolTable {
+ /// NextPtr - This forms a linked list of JitSymbolTable entries. This
+ /// pointer is not used right now, but might be used in the future. Consider
+ /// it reserved for future use.
+ JitSymbolTable *NextPtr;
+
+ /// Symbols - This is an array of JitSymbolEntry entries. Only the first
+ /// 'NumSymbols' symbols are valid.
+ JitSymbolEntry *Symbols;
+
+  /// NumSymbols - This indicates the number of entries in the Symbols array
+  /// that
+ /// are valid.
+ unsigned NumSymbols;
+
+ /// NumAllocated - This indicates the amount of space we have in the Symbols
+ /// array. This is a private field that should not be read by external tools.
+ unsigned NumAllocated;
+};
+
+#if ENABLE_JIT_SYMBOL_TABLE
+JitSymbolTable *__jitSymbolTable;
+#endif
+
+static void AddFunctionToSymbolTable(const char *FnName,
+ void *FnStart, intptr_t FnSize) {
+ assert(FnName != 0 && FnStart != 0 && "Bad symbol to add");
+ JitSymbolTable **SymTabPtrPtr = 0;
+#if !ENABLE_JIT_SYMBOL_TABLE
+ return;
+#else
+ SymTabPtrPtr = &__jitSymbolTable;
+#endif
+
+ // If this is the first entry in the symbol table, add the JitSymbolTable
+ // index.
+ if (*SymTabPtrPtr == 0) {
+ JitSymbolTable *New = new JitSymbolTable();
+ New->NextPtr = 0;
+ New->Symbols = 0;
+ New->NumSymbols = 0;
+ New->NumAllocated = 0;
+ *SymTabPtrPtr = New;
+ }
+
+ JitSymbolTable *SymTabPtr = *SymTabPtrPtr;
+
+  // If we don't have space in the table, reallocate it.
+  if (SymTabPtr->NumSymbols >= SymTabPtr->NumAllocated) {
+ unsigned NewSize = std::max(64U, SymTabPtr->NumAllocated*2);
+ JitSymbolEntry *NewSymbols = new JitSymbolEntry[NewSize];
+ JitSymbolEntry *OldSymbols = SymTabPtr->Symbols;
+
+ // Copy the old entries over.
+ memcpy(NewSymbols, OldSymbols, SymTabPtr->NumSymbols*sizeof(OldSymbols[0]));
+
+ // Swap the new symbols in, delete the old ones.
+ SymTabPtr->Symbols = NewSymbols;
+ SymTabPtr->NumAllocated = NewSize;
+ delete [] OldSymbols;
+ }
+
+  // We now have enough space; tack the new entry onto the end of the array.
+ JitSymbolEntry &Entry = SymTabPtr->Symbols[SymTabPtr->NumSymbols];
+ Entry.FnName = strdup(FnName);
+ Entry.FnStart = FnStart;
+ Entry.FnSize = FnSize;
+ ++SymTabPtr->NumSymbols;
+}
+
+static void RemoveFunctionFromSymbolTable(void *FnStart) {
+ assert(FnStart && "Invalid function pointer");
+ JitSymbolTable **SymTabPtrPtr = 0;
+#if !ENABLE_JIT_SYMBOL_TABLE
+ return;
+#else
+ SymTabPtrPtr = &__jitSymbolTable;
+#endif
+
+ JitSymbolTable *SymTabPtr = *SymTabPtrPtr;
+ JitSymbolEntry *Symbols = SymTabPtr->Symbols;
+
+ // Scan the table to find its index. The table is not sorted, so do a linear
+ // scan.
+ unsigned Index;
+ for (Index = 0; Symbols[Index].FnStart != FnStart; ++Index)
+ assert(Index != SymTabPtr->NumSymbols && "Didn't find function!");
+
+  // Once we have the index, remove this entry by overwriting it with the
+  // entry at the end of the array, which makes the last slot redundant.
+ const char *OldName = Symbols[Index].FnName;
+ Symbols[Index] = Symbols[SymTabPtr->NumSymbols-1];
+ free((void*)OldName);
+
+ // Drop the number of symbols in the table.
+ --SymTabPtr->NumSymbols;
+
+ // Finally, if we deleted the final symbol, deallocate the table itself.
+ if (SymTabPtr->NumSymbols != 0)
+ return;
+
+ *SymTabPtrPtr = 0;
+ delete [] Symbols;
+ delete SymTabPtr;
+}
+
+//===----------------------------------------------------------------------===//
+// JITEmitter code.
+//
+namespace {
+ /// JITEmitter - The JIT implementation of the MachineCodeEmitter, which is
+ /// used to output functions to memory for execution.
+ class JITEmitter : public JITCodeEmitter {
+ JITMemoryManager *MemMgr;
+
+ // When outputting a function stub in the context of some other function, we
+ // save BufferBegin/BufferEnd/CurBufferPtr here.
+ uint8_t *SavedBufferBegin, *SavedBufferEnd, *SavedCurBufferPtr;
+
+ /// Relocations - These are the relocations that the function needs, as
+ /// emitted.
+ std::vector<MachineRelocation> Relocations;
+
+ /// MBBLocations - This vector is a mapping from MBB ID's to their address.
+ /// It is filled in by the StartMachineBasicBlock callback and queried by
+ /// the getMachineBasicBlockAddress callback.
+ std::vector<uintptr_t> MBBLocations;
+
+ /// ConstantPool - The constant pool for the current function.
+ ///
+ MachineConstantPool *ConstantPool;
+
+ /// ConstantPoolBase - A pointer to the first entry in the constant pool.
+ ///
+ void *ConstantPoolBase;
+
+ /// ConstPoolAddresses - Addresses of individual constant pool entries.
+ ///
+ SmallVector<uintptr_t, 8> ConstPoolAddresses;
+
+ /// JumpTable - The jump tables for the current function.
+ ///
+ MachineJumpTableInfo *JumpTable;
+
+ /// JumpTableBase - A pointer to the first entry in the jump table.
+ ///
+ void *JumpTableBase;
+
+ /// Resolver - This contains info about the currently resolved functions.
+ JITResolver Resolver;
+
+ /// DE - The dwarf emitter for the jit.
+ JITDwarfEmitter *DE;
+
+ /// LabelLocations - This vector is a mapping from Label ID's to their
+ /// address.
+ std::vector<uintptr_t> LabelLocations;
+
+    /// MMI - Machine module info for exception information
+ MachineModuleInfo* MMI;
+
+ // GVSet - a set to keep track of which globals have been seen
+ SmallPtrSet<const GlobalVariable*, 8> GVSet;
+
+ // CurFn - The llvm function being emitted. Only valid during
+ // finishFunction().
+ const Function *CurFn;
+
+ // CurFnStubUses - For a given Function, a vector of stubs that it
+ // references. This facilitates the JIT detecting that a stub is no
+ // longer used, so that it may be deallocated.
+ DenseMap<const Function *, SmallVector<void*, 1> > CurFnStubUses;
+
+ // StubFnRefs - For a given pointer to a stub, a set of Functions which
+ // reference the stub. When the count of a stub's references drops to zero,
+ // the stub is unused.
+ DenseMap<void *, SmallPtrSet<const Function*, 1> > StubFnRefs;
+
+ // ExtFnStubs - A map of external function names to stubs which have entries
+ // in the JITResolver's ExternalFnToStubMap.
+ StringMap<void *> ExtFnStubs;
+
+ // MCI - A pointer to a MachineCodeInfo object to update with information.
+ MachineCodeInfo *MCI;
+
+ public:
+ JITEmitter(JIT &jit, JITMemoryManager *JMM) : Resolver(jit), CurFn(0), MCI(0) {
+ MemMgr = JMM ? JMM : JITMemoryManager::CreateDefaultMemManager();
+ if (jit.getJITInfo().needsGOT()) {
+ MemMgr->AllocateGOT();
+ DOUT << "JIT is managing a GOT\n";
+ }
+
+ if (ExceptionHandling) DE = new JITDwarfEmitter(jit);
+ }
+ ~JITEmitter() {
+ delete MemMgr;
+ if (ExceptionHandling) delete DE;
+ }
+
+ /// classof - Methods for support type inquiry through isa, cast, and
+ /// dyn_cast:
+ ///
+ static inline bool classof(const JITEmitter*) { return true; }
+ static inline bool classof(const MachineCodeEmitter*) { return true; }
+
+ JITResolver &getJITResolver() { return Resolver; }
+
+ virtual void startFunction(MachineFunction &F);
+ virtual bool finishFunction(MachineFunction &F);
+
+ void emitConstantPool(MachineConstantPool *MCP);
+ void initJumpTableInfo(MachineJumpTableInfo *MJTI);
+ void emitJumpTableInfo(MachineJumpTableInfo *MJTI);
+
+ virtual void startGVStub(const GlobalValue* GV, unsigned StubSize,
+ unsigned Alignment = 1);
+ virtual void startGVStub(const GlobalValue* GV, void *Buffer,
+ unsigned StubSize);
+ virtual void* finishGVStub(const GlobalValue *GV);
+
+ /// allocateSpace - Reserves space in the current block if any, or
+    /// allocates a new one of the given size.
+ virtual void *allocateSpace(uintptr_t Size, unsigned Alignment);
+
+ virtual void addRelocation(const MachineRelocation &MR) {
+ Relocations.push_back(MR);
+ }
+
+ virtual void StartMachineBasicBlock(MachineBasicBlock *MBB) {
+ if (MBBLocations.size() <= (unsigned)MBB->getNumber())
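+        // Grow to twice the needed size to amortize future resizes.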
+ MBBLocations.resize((MBB->getNumber()+1)*2);
+ MBBLocations[MBB->getNumber()] = getCurrentPCValue();
+ DOUT << "JIT: Emitting BB" << MBB->getNumber() << " at ["
+ << (void*) getCurrentPCValue() << "]\n";
+ }
+
+ virtual uintptr_t getConstantPoolEntryAddress(unsigned Entry) const;
+ virtual uintptr_t getJumpTableEntryAddress(unsigned Entry) const;
+
+ virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const {
+ assert(MBBLocations.size() > (unsigned)MBB->getNumber() &&
+ MBBLocations[MBB->getNumber()] && "MBB not emitted!");
+ return MBBLocations[MBB->getNumber()];
+ }
+
+ /// deallocateMemForFunction - Deallocate all memory for the specified
+ /// function body.
+ void deallocateMemForFunction(Function *F);
+
+ /// AddStubToCurrentFunction - Mark the current function being JIT'd as
+ /// using the stub at the specified address. Allows
+ /// deallocateMemForFunction to also remove stubs no longer referenced.
+ void AddStubToCurrentFunction(void *Stub);
+
+ /// getExternalFnStubs - Accessor for the JIT to find stubs emitted for
+ /// MachineRelocations that reference external functions by name.
+ const StringMap<void*> &getExternalFnStubs() const { return ExtFnStubs; }
+
+ virtual void emitLabel(uint64_t LabelID) {
+ if (LabelLocations.size() <= LabelID)
+ LabelLocations.resize((LabelID+1)*2);
+ LabelLocations[LabelID] = getCurrentPCValue();
+ }
+
+ virtual uintptr_t getLabelAddress(uint64_t LabelID) const {
+ assert(LabelLocations.size() > (unsigned)LabelID &&
+ LabelLocations[LabelID] && "Label not emitted!");
+ return LabelLocations[LabelID];
+ }
+
+ virtual void setModuleInfo(MachineModuleInfo* Info) {
+ MMI = Info;
+ if (ExceptionHandling) DE->setModuleInfo(Info);
+ }
+
+ void setMemoryExecutable(void) {
+ MemMgr->setMemoryExecutable();
+ }
+
+ JITMemoryManager *getMemMgr(void) const { return MemMgr; }
+
+ void setMachineCodeInfo(MachineCodeInfo *mci) {
+ MCI = mci;
+ }
+
+ private:
+    void *getPointerToGlobal(GlobalValue *V, void *Reference,
+                             bool DoesntNeedStub);
+ void *getPointerToGVIndirectSym(GlobalValue *V, void *Reference,
+ bool NoNeedStub);
+ unsigned addSizeOfGlobal(const GlobalVariable *GV, unsigned Size);
+ unsigned addSizeOfGlobalsInConstantVal(const Constant *C, unsigned Size);
+ unsigned addSizeOfGlobalsInInitializer(const Constant *Init, unsigned Size);
+ unsigned GetSizeOfGlobalsInBytes(MachineFunction &MF);
+ };
+}
+
+void *JITEmitter::getPointerToGlobal(GlobalValue *V, void *Reference,
+ bool DoesntNeedStub) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ return TheJIT->getOrEmitGlobalVariable(GV);
+
+ if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
+ return TheJIT->getPointerToGlobal(GA->resolveAliasedGlobal(false));
+
+ // If we have already compiled the function, return a pointer to its body.
+ Function *F = cast<Function>(V);
+ void *ResultPtr;
+ if (!DoesntNeedStub && !TheJIT->isLazyCompilationDisabled()) {
+ // Return the function stub if it's already created.
+ ResultPtr = Resolver.getFunctionStubIfAvailable(F);
+ if (ResultPtr)
+ AddStubToCurrentFunction(ResultPtr);
+ } else {
+ ResultPtr = TheJIT->getPointerToGlobalIfAvailable(F);
+ }
+ if (ResultPtr) return ResultPtr;
+
+ // If this is an external function pointer, we can force the JIT to
+ // 'compile' it, which really just adds it to the map. In dlsym mode,
+ // external functions are forced through a stub, regardless of reloc type.
+ if (F->isDeclaration() && !F->hasNotBeenReadFromBitcode() &&
+ DoesntNeedStub && !TheJIT->areDlsymStubsEnabled())
+ return TheJIT->getPointerToFunction(F);
+
+  // Okay, the function has not been compiled yet. If the target callback
+  // mechanism is capable of rewriting the instruction directly, prefer to do
+  // that instead of emitting a stub. This uses the lazy resolver, so it is
+  // not legal if lazy compilation is disabled.
+ if (DoesntNeedStub && !TheJIT->isLazyCompilationDisabled())
+ return Resolver.AddCallbackAtLocation(F, Reference);
+
+ // Otherwise, we have to emit a stub.
+ void *StubAddr = Resolver.getFunctionStub(F);
+
+ // Add the stub to the current function's list of referenced stubs, so we can
+ // deallocate them if the current function is ever freed. It's possible to
+ // return null from getFunctionStub in the case of a weak extern that fails
+ // to resolve.
+ if (StubAddr)
+ AddStubToCurrentFunction(StubAddr);
+
+ return StubAddr;
+}
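+
+// For illustration, the decision ladder above resolves a reference to a
+// function F roughly as follows:
+//   already JIT'd             -> pointer to F's emitted body
+//   lazy JIT, stub exists     -> the existing stub
+//   external decl, no stub    -> the real address (unless dlsym stubs are on)
+//   lazy JIT, target rewrite  -> a callback patched at the call site
+//   otherwise                 -> a freshly emitted stub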
+
+void *JITEmitter::getPointerToGVIndirectSym(GlobalValue *V, void *Reference,
+ bool NoNeedStub) {
+ // Make sure GV is emitted first, and create a stub containing the fully
+ // resolved address.
+ void *GVAddress = getPointerToGlobal(V, Reference, true);
+ void *StubAddr = Resolver.getGlobalValueIndirectSym(V, GVAddress);
+
+ // Add the stub to the current function's list of referenced stubs, so we can
+ // deallocate them if the current function is ever freed.
+ AddStubToCurrentFunction(StubAddr);
+
+ return StubAddr;
+}
+
+void JITEmitter::AddStubToCurrentFunction(void *StubAddr) {
+ if (!TheJIT->areDlsymStubsEnabled())
+ return;
+
+ assert(CurFn && "Stub added to current function, but current function is 0!");
+
+ SmallVectorImpl<void*> &StubsUsed = CurFnStubUses[CurFn];
+ StubsUsed.push_back(StubAddr);
+
+ SmallPtrSet<const Function *, 1> &FnRefs = StubFnRefs[StubAddr];
+ FnRefs.insert(CurFn);
+}
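+
+// For illustration, suppose (hypothetically) functions F1 and F2 both use
+// stub S1 and F2 also uses S2. The two maps above then hold:
+//   CurFnStubUses: { F1 -> [S1], F2 -> [S1, S2] }
+//   StubFnRefs:    { S1 -> {F1, F2}, S2 -> {F2} }
+// Freeing F1 only erases F1 from S1's set; S1 survives because F2 still
+// references it. Freeing F2 afterwards empties both sets, so both stubs can
+// be invalidated.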
+
+static unsigned GetConstantPoolSizeInBytes(MachineConstantPool *MCP,
+ const TargetData *TD) {
+ const std::vector<MachineConstantPoolEntry> &Constants = MCP->getConstants();
+ if (Constants.empty()) return 0;
+
+ unsigned Size = 0;
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ MachineConstantPoolEntry CPE = Constants[i];
+ unsigned AlignMask = CPE.getAlignment() - 1;
+ Size = (Size + AlignMask) & ~AlignMask;
+ const Type *Ty = CPE.getType();
+ Size += TD->getTypeAllocSize(Ty);
+ }
+ return Size;
+}
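+
+// For illustration of the mask arithmetic above: with a running Size of 5 and
+// an entry that needs 8-byte alignment, AlignMask == 7 and
+// Size = (5 + 7) & ~7 == 8; the entry's bytes are then added on top of that.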
+
+static unsigned GetJumpTableSizeInBytes(MachineJumpTableInfo *MJTI) {
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ if (JT.empty()) return 0;
+
+ unsigned NumEntries = 0;
+ for (unsigned i = 0, e = JT.size(); i != e; ++i)
+ NumEntries += JT[i].MBBs.size();
+
+ unsigned EntrySize = MJTI->getEntrySize();
+
+ return NumEntries * EntrySize;
+}
+
+static uintptr_t RoundUpToAlign(uintptr_t Size, unsigned Alignment) {
+ if (Alignment == 0) Alignment = 1;
+ // Since we do not know where the buffer will be allocated, be pessimistic.
+ return Size + Alignment;
+}
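+
+// For illustration: the final base address of the buffer is unknown here, so
+// the exact padding cannot be computed. Reserving a full Alignment's worth of
+// slack is always enough; at emission time emitAlignment() consumes only what
+// is actually needed.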
+
+/// addSizeOfGlobal - add the size of the global (plus any alignment padding)
+/// into the running total Size.
+
+unsigned JITEmitter::addSizeOfGlobal(const GlobalVariable *GV, unsigned Size) {
+ const Type *ElTy = GV->getType()->getElementType();
+ size_t GVSize = (size_t)TheJIT->getTargetData()->getTypeAllocSize(ElTy);
+ size_t GVAlign =
+ (size_t)TheJIT->getTargetData()->getPreferredAlignment(GV);
+ DOUT << "JIT: Adding in size " << GVSize << " alignment " << GVAlign;
+ DEBUG(GV->dump());
+  // Assume the code section ends with the worst possible alignment, so the
+  // first variable needs maximal padding.
+ if (Size==0)
+ Size = 1;
+ Size = ((Size+GVAlign-1)/GVAlign)*GVAlign;
+ Size += GVSize;
+ return Size;
+}
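+
+// For illustration, with a running Size of 13, GVAlign == 8 and GVSize == 4:
+//   Size = ((13 + 7) / 8) * 8 == 16, then Size += 4 gives 20.
+// The Size==0 -> Size=1 special case forces the first variable to receive a
+// full GVAlign of padding, matching the worst-case alignment assumption.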
+
+/// addSizeOfGlobalsInConstantVal - find any globals that we haven't seen yet
+/// but are referenced from the constant; put them in GVSet and add their
+/// size into the running total Size.
+
+unsigned JITEmitter::addSizeOfGlobalsInConstantVal(const Constant *C,
+ unsigned Size) {
+  // If it's undefined, there is nothing to add; return the running total.
+ if (isa<UndefValue>(C))
+ return Size;
+
+  // If the value is a ConstantExpr, look through it for global references.
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ Constant *Op0 = CE->getOperand(0);
+ switch (CE->getOpcode()) {
+ case Instruction::GetElementPtr:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast: {
+ Size = addSizeOfGlobalsInConstantVal(Op0, Size);
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Size = addSizeOfGlobalsInConstantVal(Op0, Size);
+ Size = addSizeOfGlobalsInConstantVal(CE->getOperand(1), Size);
+ break;
+ }
+ default: {
+ cerr << "ConstantExpr not handled: " << *CE << "\n";
+ abort();
+ }
+ }
+ }
+
+ if (C->getType()->getTypeID() == Type::PointerTyID)
+ if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(C))
+ if (GVSet.insert(GV))
+ Size = addSizeOfGlobal(GV, Size);
+
+ return Size;
+}
+
+/// addSizeOfGlobalsInInitializer - handle any globals that we haven't seen yet
+/// but are referenced from the given initializer.
+
+unsigned JITEmitter::addSizeOfGlobalsInInitializer(const Constant *Init,
+ unsigned Size) {
+ if (!isa<UndefValue>(Init) &&
+ !isa<ConstantVector>(Init) &&
+ !isa<ConstantAggregateZero>(Init) &&
+ !isa<ConstantArray>(Init) &&
+ !isa<ConstantStruct>(Init) &&
+ Init->getType()->isFirstClassType())
+ Size = addSizeOfGlobalsInConstantVal(Init, Size);
+ return Size;
+}
+
+/// GetSizeOfGlobalsInBytes - walk the code for the function, looking for
+/// globals; then walk the initializers of those globals looking for more.
+/// If their size has not been considered yet, add it into the running total
+/// Size.
+
+unsigned JITEmitter::GetSizeOfGlobalsInBytes(MachineFunction &MF) {
+ unsigned Size = 0;
+ GVSet.clear();
+
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ const TargetInstrDesc &Desc = I->getDesc();
+ const MachineInstr &MI = *I;
+ unsigned NumOps = Desc.getNumOperands();
+ for (unsigned CurOp = 0; CurOp < NumOps; CurOp++) {
+ const MachineOperand &MO = MI.getOperand(CurOp);
+ if (MO.isGlobal()) {
+ GlobalValue* V = MO.getGlobal();
+ const GlobalVariable *GV = dyn_cast<const GlobalVariable>(V);
+ if (!GV)
+ continue;
+          // If seen in a previous function, it will have an entry here.
+ if (TheJIT->getPointerToGlobalIfAvailable(GV))
+ continue;
+ // If seen earlier in this function, it will have an entry here.
+ // FIXME: it should be possible to combine these tables, by
+ // assuming the addresses of the new globals in this module
+ // start at 0 (or something) and adjusting them after codegen
+ // complete. Another possibility is to grab a marker bit in GV.
+ if (GVSet.insert(GV))
+ // A variable as yet unseen. Add in its size.
+ Size = addSizeOfGlobal(GV, Size);
+ }
+ }
+ }
+ }
+ DOUT << "JIT: About to look through initializers\n";
+ // Look for more globals that are referenced only from initializers.
+ // GVSet.end is computed each time because the set can grow as we go.
+ for (SmallPtrSet<const GlobalVariable *, 8>::iterator I = GVSet.begin();
+ I != GVSet.end(); I++) {
+ const GlobalVariable* GV = *I;
+ if (GV->hasInitializer())
+ Size = addSizeOfGlobalsInInitializer(GV->getInitializer(), Size);
+ }
+
+ return Size;
+}
+
+void JITEmitter::startFunction(MachineFunction &F) {
+ DOUT << "JIT: Starting CodeGen of Function "
+ << F.getFunction()->getName() << "\n";
+
+ uintptr_t ActualSize = 0;
+ // Set the memory writable, if it's not already
+ MemMgr->setMemoryWritable();
+ if (MemMgr->NeedsExactSize()) {
+ DOUT << "JIT: ExactSize\n";
+ const TargetInstrInfo* TII = F.getTarget().getInstrInfo();
+ MachineJumpTableInfo *MJTI = F.getJumpTableInfo();
+ MachineConstantPool *MCP = F.getConstantPool();
+
+    // Ensure the constant pool/jump table info is at least 16-byte aligned.
+ ActualSize = RoundUpToAlign(ActualSize, 16);
+
+ // Add the alignment of the constant pool
+ ActualSize = RoundUpToAlign(ActualSize, MCP->getConstantPoolAlignment());
+
+ // Add the constant pool size
+ ActualSize += GetConstantPoolSizeInBytes(MCP, TheJIT->getTargetData());
+
+    // Add the alignment of the jump table info
+ ActualSize = RoundUpToAlign(ActualSize, MJTI->getAlignment());
+
+ // Add the jump table size
+ ActualSize += GetJumpTableSizeInBytes(MJTI);
+
+ // Add the alignment for the function
+ ActualSize = RoundUpToAlign(ActualSize,
+ std::max(F.getFunction()->getAlignment(), 8U));
+
+ // Add the function size
+ ActualSize += TII->GetFunctionSizeInBytes(F);
+
+ DOUT << "JIT: ActualSize before globals " << ActualSize << "\n";
+ // Add the size of the globals that will be allocated after this function.
+ // These are all the ones referenced from this function that were not
+ // previously allocated.
+ ActualSize += GetSizeOfGlobalsInBytes(F);
+ DOUT << "JIT: ActualSize after globals " << ActualSize << "\n";
+ }
+
+ BufferBegin = CurBufferPtr = MemMgr->startFunctionBody(F.getFunction(),
+ ActualSize);
+ BufferEnd = BufferBegin+ActualSize;
+
+  // Ensure the constant pool/jump table info is at least 16-byte aligned.
+ emitAlignment(16);
+
+ emitConstantPool(F.getConstantPool());
+ initJumpTableInfo(F.getJumpTableInfo());
+
+ // About to start emitting the machine code for the function.
+ emitAlignment(std::max(F.getFunction()->getAlignment(), 8U));
+ TheJIT->updateGlobalMapping(F.getFunction(), CurBufferPtr);
+
+ MBBLocations.clear();
+}
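+
+// For illustration, when NeedsExactSize() is true the estimate above is a
+// conservative sum: 16 bytes of slack, constant pool alignment plus size,
+// jump table alignment plus size, function alignment, the instruction bytes,
+// and any not-yet-emitted globals the function references. Each
+// RoundUpToAlign adds a full alignment's worth, so the buffer can only be
+// over-sized, never under-sized.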
+
+bool JITEmitter::finishFunction(MachineFunction &F) {
+ if (CurBufferPtr == BufferEnd) {
+ // FIXME: Allocate more space, then try again.
+ cerr << "JIT: Ran out of space for generated machine code!\n";
+ abort();
+ }
+
+ emitJumpTableInfo(F.getJumpTableInfo());
+
+ // FnStart is the start of the text, not the start of the constant pool and
+ // other per-function data.
+ uint8_t *FnStart =
+ (uint8_t *)TheJIT->getPointerToGlobalIfAvailable(F.getFunction());
+
+ // FnEnd is the end of the function's machine code.
+ uint8_t *FnEnd = CurBufferPtr;
+
+ if (!Relocations.empty()) {
+ CurFn = F.getFunction();
+ NumRelos += Relocations.size();
+
+ // Resolve the relocations to concrete pointers.
+ for (unsigned i = 0, e = Relocations.size(); i != e; ++i) {
+ MachineRelocation &MR = Relocations[i];
+ void *ResultPtr = 0;
+ if (!MR.letTargetResolve()) {
+ if (MR.isExternalSymbol()) {
+ ResultPtr = TheJIT->getPointerToNamedFunction(MR.getExternalSymbol(),
+ false);
+ DOUT << "JIT: Map \'" << MR.getExternalSymbol() << "\' to ["
+ << ResultPtr << "]\n";
+
+ // If the target REALLY wants a stub for this function, emit it now.
+ if (!MR.doesntNeedStub()) {
+ if (!TheJIT->areDlsymStubsEnabled()) {
+ ResultPtr = Resolver.getExternalFunctionStub(ResultPtr);
+ } else {
+ void *&Stub = ExtFnStubs[MR.getExternalSymbol()];
+ if (!Stub) {
+ Stub = Resolver.getExternalFunctionStub((void *)&Stub);
+ AddStubToCurrentFunction(Stub);
+ }
+ ResultPtr = Stub;
+ }
+ }
+ } else if (MR.isGlobalValue()) {
+ ResultPtr = getPointerToGlobal(MR.getGlobalValue(),
+ BufferBegin+MR.getMachineCodeOffset(),
+ MR.doesntNeedStub());
+ } else if (MR.isIndirectSymbol()) {
+ ResultPtr = getPointerToGVIndirectSym(MR.getGlobalValue(),
+ BufferBegin+MR.getMachineCodeOffset(),
+ MR.doesntNeedStub());
+ } else if (MR.isBasicBlock()) {
+ ResultPtr = (void*)getMachineBasicBlockAddress(MR.getBasicBlock());
+ } else if (MR.isConstantPoolIndex()) {
+ ResultPtr = (void*)getConstantPoolEntryAddress(MR.getConstantPoolIndex());
+ } else {
+ assert(MR.isJumpTableIndex());
+          ResultPtr = (void*)getJumpTableEntryAddress(MR.getJumpTableIndex());
+ }
+
+ MR.setResultPointer(ResultPtr);
+ }
+
+      // If we are managing the GOT and the relocation wants an index,
+      // give it one.
+ if (MR.isGOTRelative() && MemMgr->isManagingGOT()) {
+ unsigned idx = Resolver.getGOTIndexForAddr(ResultPtr);
+ MR.setGOTIndex(idx);
+ if (((void**)MemMgr->getGOTBase())[idx] != ResultPtr) {
+ DOUT << "JIT: GOT was out of date for " << ResultPtr
+ << " pointing at " << ((void**)MemMgr->getGOTBase())[idx]
+ << "\n";
+ ((void**)MemMgr->getGOTBase())[idx] = ResultPtr;
+ }
+ }
+ }
+
+ CurFn = 0;
+ TheJIT->getJITInfo().relocate(BufferBegin, &Relocations[0],
+ Relocations.size(), MemMgr->getGOTBase());
+ }
+
+ // Update the GOT entry for F to point to the new code.
+ if (MemMgr->isManagingGOT()) {
+ unsigned idx = Resolver.getGOTIndexForAddr((void*)BufferBegin);
+ if (((void**)MemMgr->getGOTBase())[idx] != (void*)BufferBegin) {
+ DOUT << "JIT: GOT was out of date for " << (void*)BufferBegin
+ << " pointing at " << ((void**)MemMgr->getGOTBase())[idx] << "\n";
+ ((void**)MemMgr->getGOTBase())[idx] = (void*)BufferBegin;
+ }
+ }
+
+ // CurBufferPtr may have moved beyond FnEnd, due to memory allocation for
+ // global variables that were referenced in the relocations.
+ MemMgr->endFunctionBody(F.getFunction(), BufferBegin, CurBufferPtr);
+
+ if (CurBufferPtr == BufferEnd) {
+ // FIXME: Allocate more space, then try again.
+ cerr << "JIT: Ran out of space for generated machine code!\n";
+ abort();
+ }
+
+ BufferBegin = CurBufferPtr = 0;
+ NumBytes += FnEnd-FnStart;
+
+ // Invalidate the icache if necessary.
+ sys::Memory::InvalidateInstructionCache(FnStart, FnEnd-FnStart);
+
+ // Add it to the JIT symbol table if the host wants it.
+ AddFunctionToSymbolTable(F.getFunction()->getNameStart(),
+ FnStart, FnEnd-FnStart);
+
+ DOUT << "JIT: Finished CodeGen of [" << (void*)FnStart
+ << "] Function: " << F.getFunction()->getName()
+ << ": " << (FnEnd-FnStart) << " bytes of text, "
+ << Relocations.size() << " relocations\n";
+
+ if (MCI) {
+ MCI->setAddress(FnStart);
+ MCI->setSize(FnEnd-FnStart);
+ }
+
+ Relocations.clear();
+ ConstPoolAddresses.clear();
+
+ // Mark code region readable and executable if it's not so already.
+ MemMgr->setMemoryExecutable();
+
+#ifndef NDEBUG
+ {
+ if (sys::hasDisassembler()) {
+ DOUT << "JIT: Disassembled code:\n";
+ DOUT << sys::disassembleBuffer(FnStart, FnEnd-FnStart, (uintptr_t)FnStart);
+ } else {
+ DOUT << "JIT: Binary code:\n";
+ DOUT << std::hex;
+ uint8_t* q = FnStart;
+ for (int i = 0; q < FnEnd; q += 4, ++i) {
+ if (i == 4)
+ i = 0;
+ if (i == 0)
+ DOUT << "JIT: " << std::setw(8) << std::setfill('0')
+ << (long)(q - FnStart) << ": ";
+ bool Done = false;
+ for (int j = 3; j >= 0; --j) {
+ if (q + j >= FnEnd)
+ Done = true;
+ else
+ DOUT << std::setw(2) << std::setfill('0') << (unsigned short)q[j];
+ }
+ if (Done)
+ break;
+ DOUT << ' ';
+ if (i == 3)
+ DOUT << '\n';
+ }
+ DOUT << std::dec;
+ DOUT<< '\n';
+ }
+ }
+#endif
+ if (ExceptionHandling) {
+ uintptr_t ActualSize = 0;
+ SavedBufferBegin = BufferBegin;
+ SavedBufferEnd = BufferEnd;
+ SavedCurBufferPtr = CurBufferPtr;
+
+ if (MemMgr->NeedsExactSize()) {
+ ActualSize = DE->GetDwarfTableSizeInBytes(F, *this, FnStart, FnEnd);
+ }
+
+ BufferBegin = CurBufferPtr = MemMgr->startExceptionTable(F.getFunction(),
+ ActualSize);
+ BufferEnd = BufferBegin+ActualSize;
+ uint8_t* FrameRegister = DE->EmitDwarfTable(F, *this, FnStart, FnEnd);
+ MemMgr->endExceptionTable(F.getFunction(), BufferBegin, CurBufferPtr,
+ FrameRegister);
+ BufferBegin = SavedBufferBegin;
+ BufferEnd = SavedBufferEnd;
+ CurBufferPtr = SavedCurBufferPtr;
+
+ TheJIT->RegisterTable(FrameRegister);
+ }
+
+ if (MMI)
+ MMI->EndFunction();
+
+ return false;
+}
+
+/// deallocateMemForFunction - Deallocate all memory for the specified
+/// function body. Also drop any references the function has to stubs.
+void JITEmitter::deallocateMemForFunction(Function *F) {
+ MemMgr->deallocateMemForFunction(F);
+
+ // If the function did not reference any stubs, return.
+ if (CurFnStubUses.find(F) == CurFnStubUses.end())
+ return;
+
+ // For each referenced stub, erase the reference to this function, and then
+ // erase the list of referenced stubs.
+ SmallVectorImpl<void *> &StubList = CurFnStubUses[F];
+ for (unsigned i = 0, e = StubList.size(); i != e; ++i) {
+ void *Stub = StubList[i];
+
+ // If we already invalidated this stub for this function, continue.
+ if (StubFnRefs.count(Stub) == 0)
+ continue;
+
+ SmallPtrSet<const Function *, 1> &FnRefs = StubFnRefs[Stub];
+ FnRefs.erase(F);
+
+ // If this function was the last reference to the stub, invalidate the stub
+ // in the JITResolver. Were there a memory manager deallocateStub routine,
+ // we could call that at this point too.
+ if (FnRefs.empty()) {
+ DOUT << "\nJIT: Invalidated Stub at [" << Stub << "]\n";
+ StubFnRefs.erase(Stub);
+
+ // Invalidate the stub. If it is a GV stub, update the JIT's global
+ // mapping for that GV to zero, otherwise, search the string map of
+ // external function names to stubs and remove the entry for this stub.
+ GlobalValue *GV = Resolver.invalidateStub(Stub);
+ if (GV) {
+ TheJIT->updateGlobalMapping(GV, 0);
+ } else {
+ for (StringMapIterator<void*> i = ExtFnStubs.begin(),
+ e = ExtFnStubs.end(); i != e; ++i) {
+ if (i->second == Stub) {
+ ExtFnStubs.erase(i);
+ break;
+ }
+ }
+ }
+ }
+ }
+ CurFnStubUses.erase(F);
+}
+
+
+void* JITEmitter::allocateSpace(uintptr_t Size, unsigned Alignment) {
+ if (BufferBegin)
+ return JITCodeEmitter::allocateSpace(Size, Alignment);
+
+  // Create a new memory block if there is no active one. Care must be taken
+  // so that BufferBegin is invalidated when a block is trimmed.
+ BufferBegin = CurBufferPtr = MemMgr->allocateSpace(Size, Alignment);
+ BufferEnd = BufferBegin+Size;
+ return CurBufferPtr;
+}
+
+void JITEmitter::emitConstantPool(MachineConstantPool *MCP) {
+ if (TheJIT->getJITInfo().hasCustomConstantPool())
+ return;
+
+ const std::vector<MachineConstantPoolEntry> &Constants = MCP->getConstants();
+ if (Constants.empty()) return;
+
+ unsigned Size = GetConstantPoolSizeInBytes(MCP, TheJIT->getTargetData());
+ unsigned Align = MCP->getConstantPoolAlignment();
+ ConstantPoolBase = allocateSpace(Size, Align);
+ ConstantPool = MCP;
+
+ if (ConstantPoolBase == 0) return; // Buffer overflow.
+
+ DOUT << "JIT: Emitted constant pool at [" << ConstantPoolBase
+ << "] (size: " << Size << ", alignment: " << Align << ")\n";
+
+ // Initialize the memory for all of the constant pool entries.
+ unsigned Offset = 0;
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ MachineConstantPoolEntry CPE = Constants[i];
+ unsigned AlignMask = CPE.getAlignment() - 1;
+ Offset = (Offset + AlignMask) & ~AlignMask;
+
+ uintptr_t CAddr = (uintptr_t)ConstantPoolBase + Offset;
+ ConstPoolAddresses.push_back(CAddr);
+ if (CPE.isMachineConstantPoolEntry()) {
+ // FIXME: add support to lower machine constant pool values into bytes!
+ cerr << "Initialize memory with machine specific constant pool entry"
+ << " has not been implemented!\n";
+ abort();
+ }
+ TheJIT->InitializeMemory(CPE.Val.ConstVal, (void*)CAddr);
+ DOUT << "JIT: CP" << i << " at [0x"
+ << std::hex << CAddr << std::dec << "]\n";
+
+ const Type *Ty = CPE.Val.ConstVal->getType();
+ Offset += TheJIT->getTargetData()->getTypeAllocSize(Ty);
+ }
+}
+
+void JITEmitter::initJumpTableInfo(MachineJumpTableInfo *MJTI) {
+ if (TheJIT->getJITInfo().hasCustomJumpTables())
+ return;
+
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ if (JT.empty()) return;
+
+ unsigned NumEntries = 0;
+ for (unsigned i = 0, e = JT.size(); i != e; ++i)
+ NumEntries += JT[i].MBBs.size();
+
+ unsigned EntrySize = MJTI->getEntrySize();
+
+ // Just allocate space for all the jump tables now. We will fix up the actual
+ // MBB entries in the tables after we emit the code for each block, since then
+ // we will know the final locations of the MBBs in memory.
+ JumpTable = MJTI;
+ JumpTableBase = allocateSpace(NumEntries * EntrySize, MJTI->getAlignment());
+}
+
+void JITEmitter::emitJumpTableInfo(MachineJumpTableInfo *MJTI) {
+ if (TheJIT->getJITInfo().hasCustomJumpTables())
+ return;
+
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ if (JT.empty() || JumpTableBase == 0) return;
+
+ if (TargetMachine::getRelocationModel() == Reloc::PIC_) {
+ assert(MJTI->getEntrySize() == 4 && "Cross JIT'ing?");
+ // For each jump table, place the offset from the beginning of the table
+ // to the target address.
+ int *SlotPtr = (int*)JumpTableBase;
+
+ for (unsigned i = 0, e = JT.size(); i != e; ++i) {
+ const std::vector<MachineBasicBlock*> &MBBs = JT[i].MBBs;
+ // Store the offset of the basic block for this jump table slot in the
+ // memory we allocated for the jump table in 'initJumpTableInfo'
+ uintptr_t Base = (uintptr_t)SlotPtr;
+ for (unsigned mi = 0, me = MBBs.size(); mi != me; ++mi) {
+ uintptr_t MBBAddr = getMachineBasicBlockAddress(MBBs[mi]);
+ *SlotPtr++ = TheJIT->getJITInfo().getPICJumpTableEntry(MBBAddr, Base);
+ }
+ }
+ } else {
+ assert(MJTI->getEntrySize() == sizeof(void*) && "Cross JIT'ing?");
+
+ // For each jump table, map each target in the jump table to the address of
+ // an emitted MachineBasicBlock.
+ intptr_t *SlotPtr = (intptr_t*)JumpTableBase;
+
+ for (unsigned i = 0, e = JT.size(); i != e; ++i) {
+ const std::vector<MachineBasicBlock*> &MBBs = JT[i].MBBs;
+ // Store the address of the basic block for this jump table slot in the
+ // memory we allocated for the jump table in 'initJumpTableInfo'
+ for (unsigned mi = 0, me = MBBs.size(); mi != me; ++mi)
+ *SlotPtr++ = getMachineBasicBlockAddress(MBBs[mi]);
+ }
+ }
+}
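+
+// For illustration of the two encodings above, for a slot whose target MBB
+// was emitted at address A in a table based at B:
+//   PIC:      *SlotPtr = getPICJumpTableEntry(A, B)  (typically a value
+//             derived from A - B, so entries fit in 4 bytes)
+//   non-PIC:  *SlotPtr = A                           (an absolute pointer)
+// which matches the EntrySize assertions of 4 and sizeof(void*).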
+
+void JITEmitter::startGVStub(const GlobalValue* GV, unsigned StubSize,
+ unsigned Alignment) {
+ SavedBufferBegin = BufferBegin;
+ SavedBufferEnd = BufferEnd;
+ SavedCurBufferPtr = CurBufferPtr;
+
+ BufferBegin = CurBufferPtr = MemMgr->allocateStub(GV, StubSize, Alignment);
+ BufferEnd = BufferBegin+StubSize+1;
+}
+
+void JITEmitter::startGVStub(const GlobalValue* GV, void *Buffer,
+ unsigned StubSize) {
+ SavedBufferBegin = BufferBegin;
+ SavedBufferEnd = BufferEnd;
+ SavedCurBufferPtr = CurBufferPtr;
+
+ BufferBegin = CurBufferPtr = (uint8_t *)Buffer;
+ BufferEnd = BufferBegin+StubSize+1;
+}
+
+void *JITEmitter::finishGVStub(const GlobalValue* GV) {
+ NumBytes += getCurrentPCOffset();
+ std::swap(SavedBufferBegin, BufferBegin);
+ BufferEnd = SavedBufferEnd;
+ CurBufferPtr = SavedCurBufferPtr;
+ return SavedBufferBegin;
+}
+
+// getConstantPoolEntryAddress - Return the address of the 'ConstantNum' entry
+// in the constant pool that was last emitted with the 'emitConstantPool'
+// method.
+//
+uintptr_t JITEmitter::getConstantPoolEntryAddress(unsigned ConstantNum) const {
+ assert(ConstantNum < ConstantPool->getConstants().size() &&
+ "Invalid ConstantPoolIndex!");
+ return ConstPoolAddresses[ConstantNum];
+}
+
+// getJumpTableEntryAddress - Return the address of the jump table with index
+// 'Index' in the jump table info that was last initialized with
+// 'initJumpTableInfo'.
+//
+uintptr_t JITEmitter::getJumpTableEntryAddress(unsigned Index) const {
+ const std::vector<MachineJumpTableEntry> &JT = JumpTable->getJumpTables();
+ assert(Index < JT.size() && "Invalid jump table index!");
+
+ unsigned Offset = 0;
+ unsigned EntrySize = JumpTable->getEntrySize();
+
+ for (unsigned i = 0; i < Index; ++i)
+ Offset += JT[i].MBBs.size();
+
+ Offset *= EntrySize;
+
+ return (uintptr_t)((char *)JumpTableBase + Offset);
+}
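+
+// For illustration: if the jump table info holds tables of 3, 2 and 4 entries
+// with EntrySize == 4, then getJumpTableEntryAddress(2) skips the 3 + 2 == 5
+// entries of the first two tables and returns JumpTableBase + 5*4.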
+
+//===----------------------------------------------------------------------===//
+// Public interface to this file
+//===----------------------------------------------------------------------===//
+
+JITCodeEmitter *JIT::createEmitter(JIT &jit, JITMemoryManager *JMM) {
+ return new JITEmitter(jit, JMM);
+}
+
+// getPointerToNamedFunction - This function is used as a global wrapper to
+// JIT::getPointerToNamedFunction for the purpose of resolving symbols when
+// bugpoint is debugging the JIT. In that scenario, we are loading an .so and
+// need to resolve the addresses of mis-codegenerated functions at runtime,
+// and this is the hook that does it.
+extern "C" {
+ void *getPointerToNamedFunction(const char *Name) {
+ if (Function *F = TheJIT->FindFunctionNamed(Name))
+ return TheJIT->getPointerToFunction(F);
+ return TheJIT->getPointerToNamedFunction(Name);
+ }
+}
+
+// getPointerToFunctionOrStub - If the specified function has been
+// code-gen'd, return a pointer to the function. If not, compile it, or use
+// a stub to implement lazy compilation if available.
+//
+void *JIT::getPointerToFunctionOrStub(Function *F) {
+ // If we have already code generated the function, just return the address.
+ if (void *Addr = getPointerToGlobalIfAvailable(F))
+ return Addr;
+
+ // Get a stub if the target supports it.
+ assert(isa<JITEmitter>(JCE) && "Unexpected MCE?");
+ JITEmitter *JE = cast<JITEmitter>(getCodeEmitter());
+ return JE->getJITResolver().getFunctionStub(F);
+}
+
+void JIT::registerMachineCodeInfo(MachineCodeInfo *mc) {
+ assert(isa<JITEmitter>(JCE) && "Unexpected MCE?");
+ JITEmitter *JE = cast<JITEmitter>(getCodeEmitter());
+
+ JE->setMachineCodeInfo(mc);
+}
+
+void JIT::updateFunctionStub(Function *F) {
+ // Get the empty stub we generated earlier.
+ assert(isa<JITEmitter>(JCE) && "Unexpected MCE?");
+ JITEmitter *JE = cast<JITEmitter>(getCodeEmitter());
+ void *Stub = JE->getJITResolver().getFunctionStub(F);
+
+ // Tell the target jit info to rewrite the stub at the specified address,
+ // rather than creating a new one.
+ void *Addr = getPointerToGlobalIfAvailable(F);
+ getJITInfo().emitFunctionStubAtAddr(F, Addr, Stub, *getCodeEmitter());
+}
+
+/// updateDlsymStubTable - Emit the data necessary to relocate the stubs
+/// that were emitted during code generation.
+///
+void JIT::updateDlsymStubTable() {
+ assert(isa<JITEmitter>(JCE) && "Unexpected MCE?");
+ JITEmitter *JE = cast<JITEmitter>(getCodeEmitter());
+
+ SmallVector<GlobalValue*, 8> GVs;
+ SmallVector<void*, 8> Ptrs;
+ const StringMap<void *> &ExtFns = JE->getExternalFnStubs();
+
+ JE->getJITResolver().getRelocatableGVs(GVs, Ptrs);
+
+ unsigned nStubs = GVs.size() + ExtFns.size();
+
+ // If there are no relocatable stubs, return.
+ if (nStubs == 0)
+ return;
+
+ // If there are no new relocatable stubs, return.
+ void *CurTable = JE->getMemMgr()->getDlsymTable();
+ if (CurTable && (*(unsigned *)CurTable == nStubs))
+ return;
+
+ // Calculate the size of the stub info
+ unsigned offset = 4 + 4 * nStubs + sizeof(intptr_t) * nStubs;
+
+ SmallVector<unsigned, 8> Offsets;
+ for (unsigned i = 0; i != GVs.size(); ++i) {
+ Offsets.push_back(offset);
+ offset += GVs[i]->getName().length() + 1;
+ }
+ for (StringMapConstIterator<void*> i = ExtFns.begin(), e = ExtFns.end();
+ i != e; ++i) {
+ Offsets.push_back(offset);
+ offset += strlen(i->first()) + 1;
+ }
+
+ // Allocate space for the new "stub", which contains the dlsym table.
+ JE->startGVStub(0, offset, 4);
+
+ // Emit the number of records
+ JE->emitInt32(nStubs);
+
+ // Emit the string offsets
+ for (unsigned i = 0; i != nStubs; ++i)
+ JE->emitInt32(Offsets[i]);
+
+ // Emit the pointers. Verify that they are at least 2-byte aligned, and set
+ // the low bit to 0 == GV, 1 == Function, so that the client code doing the
+ // relocation can write the relocated pointer at the appropriate place in
+ // the stub.
+ for (unsigned i = 0; i != GVs.size(); ++i) {
+ intptr_t Ptr = (intptr_t)Ptrs[i];
+ assert((Ptr & 1) == 0 && "Stub pointers must be at least 2-byte aligned!");
+
+ if (isa<Function>(GVs[i]))
+ Ptr |= (intptr_t)1;
+
+ if (sizeof(Ptr) == 8)
+ JE->emitInt64(Ptr);
+ else
+ JE->emitInt32(Ptr);
+ }
+ for (StringMapConstIterator<void*> i = ExtFns.begin(), e = ExtFns.end();
+ i != e; ++i) {
+ intptr_t Ptr = (intptr_t)i->second | 1;
+
+ if (sizeof(Ptr) == 8)
+ JE->emitInt64(Ptr);
+ else
+ JE->emitInt32(Ptr);
+ }
+
+ // Emit the strings.
+ for (unsigned i = 0; i != GVs.size(); ++i)
+ JE->emitString(GVs[i]->getName());
+ for (StringMapConstIterator<void*> i = ExtFns.begin(), e = ExtFns.end();
+ i != e; ++i)
+ JE->emitString(i->first());
+
+ // Tell the JIT memory manager where it is. The JIT Memory Manager will
+ // deallocate space for the old one, if one existed.
+ JE->getMemMgr()->SetDlsymTable(JE->finishGVStub(0));
+}
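+
+// For illustration, the dlsym table emitted above is laid out as:
+//   [ uint32 nStubs                ]
+//   [ uint32 StrOffset x nStubs    ]  offsets relative to the table start
+//   [ intptr_t Ptr x nStubs        ]  low bit: 0 == GV, 1 == Function
+//   [ NUL-terminated names ...     ]
+// so a client can map every stub name to its tagged pointer with no other
+// metadata.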
+
+/// freeMachineCodeForFunction - Release machine code memory for the given
+/// Function.
+///
+void JIT::freeMachineCodeForFunction(Function *F) {
+
+ // Delete translation for this from the ExecutionEngine, so it will get
+ // retranslated next time it is used.
+ void *OldPtr = updateGlobalMapping(F, 0);
+
+ if (OldPtr)
+ RemoveFunctionFromSymbolTable(OldPtr);
+
+ // Free the actual memory for the function body and related stuff.
+ assert(isa<JITEmitter>(JCE) && "Unexpected MCE?");
+ cast<JITEmitter>(JCE)->deallocateMemForFunction(F);
+}
+
diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
new file mode 100644
index 0000000..70ccdcc
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
@@ -0,0 +1,541 @@
+//===-- JITMemoryManager.cpp - Memory Allocator for JIT'd code ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DefaultJITMemoryManager class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/GlobalValue.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/System/Memory.h"
+#include <map>
+#include <vector>
+#include <cassert>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+
+JITMemoryManager::~JITMemoryManager() {}
+
+//===----------------------------------------------------------------------===//
+// Memory Block Implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// MemoryRangeHeader - For a range of memory, this is the header that we put
+  /// on the block of memory. It is carefully crafted to be one word of memory.
+  /// Allocated blocks have just this header; freed blocks have a
+  /// FreeRangeHeader, which starts with this one.
+ struct FreeRangeHeader;
+ struct MemoryRangeHeader {
+ /// ThisAllocated - This is true if this block is currently allocated. If
+ /// not, this can be converted to a FreeRangeHeader.
+ unsigned ThisAllocated : 1;
+
+ /// PrevAllocated - Keep track of whether the block immediately before us is
+ /// allocated. If not, the word immediately before this header is the size
+ /// of the previous block.
+ unsigned PrevAllocated : 1;
+
+ /// BlockSize - This is the size in bytes of this memory block,
+ /// including this header.
+ uintptr_t BlockSize : (sizeof(intptr_t)*CHAR_BIT - 2);
+
+
+ /// getBlockAfter - Return the memory block immediately after this one.
+ ///
+ MemoryRangeHeader &getBlockAfter() const {
+ return *(MemoryRangeHeader*)((char*)this+BlockSize);
+ }
+
+ /// getFreeBlockBefore - If the block before this one is free, return it,
+ /// otherwise return null.
+ FreeRangeHeader *getFreeBlockBefore() const {
+ if (PrevAllocated) return 0;
+ intptr_t PrevSize = ((intptr_t *)this)[-1];
+ return (FreeRangeHeader*)((char*)this-PrevSize);
+ }
+
+ /// FreeBlock - Turn an allocated block into a free block, adjusting
+ /// bits in the object headers, and adding an end of region memory block.
+ FreeRangeHeader *FreeBlock(FreeRangeHeader *FreeList);
+
+ /// TrimAllocationToSize - If this allocated block is significantly larger
+ /// than NewSize, split it into two pieces (where the former is NewSize
+ /// bytes, including the header), and add the new block to the free list.
+ FreeRangeHeader *TrimAllocationToSize(FreeRangeHeader *FreeList,
+ uint64_t NewSize);
+ };
+
+ /// FreeRangeHeader - For a memory block that isn't already allocated, this
+ /// keeps track of the current block and has a pointer to the next free block.
+ /// Free blocks are kept on a circularly linked list.
+ struct FreeRangeHeader : public MemoryRangeHeader {
+ FreeRangeHeader *Prev;
+ FreeRangeHeader *Next;
+
+ /// getMinBlockSize - Get the minimum size for a memory block. Blocks
+ /// smaller than this size cannot be created.
+ static unsigned getMinBlockSize() {
+ return sizeof(FreeRangeHeader)+sizeof(intptr_t);
+ }
+
+ /// SetEndOfBlockSizeMarker - The word at the end of every free block is
+ /// known to be the size of the free block. Set it for this block.
+ void SetEndOfBlockSizeMarker() {
+ void *EndOfBlock = (char*)this + BlockSize;
+ ((intptr_t *)EndOfBlock)[-1] = BlockSize;
+ }
+
+ FreeRangeHeader *RemoveFromFreeList() {
+ assert(Next->Prev == this && Prev->Next == this && "Freelist broken!");
+ Next->Prev = Prev;
+ return Prev->Next = Next;
+ }
+
+ void AddToFreeList(FreeRangeHeader *FreeList) {
+ Next = FreeList;
+ Prev = FreeList->Prev;
+ Prev->Next = this;
+ Next->Prev = this;
+ }
+
+ /// GrowBlock - The block after this block just got deallocated. Merge it
+ /// into the current block.
+ void GrowBlock(uintptr_t NewSize);
+
+ /// AllocateBlock - Mark this entire block allocated, updating freelists
+ /// etc. This returns a pointer to the circular free-list.
+ FreeRangeHeader *AllocateBlock();
+ };
+}
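+
+// For illustration, an allocated block followed by a free block is laid out:
+//
+//   [ MemoryRangeHeader | data ................................. ]
+//   [ FreeRangeHeader (Prev/Next) | unused ......... | BlockSize ]
+//
+// The trailing BlockSize word (SetEndOfBlockSizeMarker) is what lets
+// getFreeBlockBefore() work: when PrevAllocated is 0, the word immediately
+// before a header is the size of the preceding free block.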
+
+
+/// AllocateBlock - Mark this entire block allocated, updating freelists
+/// etc. This returns a pointer to the circular free-list.
+FreeRangeHeader *FreeRangeHeader::AllocateBlock() {
+ assert(!ThisAllocated && !getBlockAfter().PrevAllocated &&
+ "Cannot allocate an allocated block!");
+ // Mark this block allocated.
+ ThisAllocated = 1;
+ getBlockAfter().PrevAllocated = 1;
+
+ // Remove it from the free list.
+ return RemoveFromFreeList();
+}
+
+/// FreeBlock - Turn an allocated block into a free block, adjusting
+/// bits in the object headers, and adding an end of region memory block.
+/// If possible, coalesce this block with neighboring blocks. Return the
+/// FreeRangeHeader to allocate from.
+FreeRangeHeader *MemoryRangeHeader::FreeBlock(FreeRangeHeader *FreeList) {
+ MemoryRangeHeader *FollowingBlock = &getBlockAfter();
+ assert(ThisAllocated && "This block is already allocated!");
+ assert(FollowingBlock->PrevAllocated && "Flags out of sync!");
+
+ FreeRangeHeader *FreeListToReturn = FreeList;
+
+ // If the block after this one is free, merge it into this block.
+ if (!FollowingBlock->ThisAllocated) {
+ FreeRangeHeader &FollowingFreeBlock = *(FreeRangeHeader *)FollowingBlock;
+ // "FreeList" always needs to be a valid free block. If we're about to
+ // coalesce with it, update our notion of what the free list is.
+ if (&FollowingFreeBlock == FreeList) {
+ FreeList = FollowingFreeBlock.Next;
+ FreeListToReturn = 0;
+ assert(&FollowingFreeBlock != FreeList && "No tombstone block?");
+ }
+ FollowingFreeBlock.RemoveFromFreeList();
+
+ // Include the following block into this one.
+ BlockSize += FollowingFreeBlock.BlockSize;
+ FollowingBlock = &FollowingFreeBlock.getBlockAfter();
+
+ // Tell the block after the block we are coalescing that this block is
+ // allocated.
+ FollowingBlock->PrevAllocated = 1;
+ }
+
+ assert(FollowingBlock->ThisAllocated && "Missed coalescing?");
+
+ if (FreeRangeHeader *PrevFreeBlock = getFreeBlockBefore()) {
+ PrevFreeBlock->GrowBlock(PrevFreeBlock->BlockSize + BlockSize);
+ return FreeListToReturn ? FreeListToReturn : PrevFreeBlock;
+ }
+
+ // Otherwise, mark this block free.
+ FreeRangeHeader &FreeBlock = *(FreeRangeHeader*)this;
+ FollowingBlock->PrevAllocated = 0;
+ FreeBlock.ThisAllocated = 0;
+
+ // Link this into the linked list of free blocks.
+ FreeBlock.AddToFreeList(FreeList);
+
+ // Add a marker at the end of the block, indicating the size of this free
+ // block.
+ FreeBlock.SetEndOfBlockSizeMarker();
+ return FreeListToReturn ? FreeListToReturn : &FreeBlock;
+}
+
+/// GrowBlock - The block after this block just got deallocated. Merge it
+/// into the current block.
+void FreeRangeHeader::GrowBlock(uintptr_t NewSize) {
+ assert(NewSize > BlockSize && "Not growing block?");
+ BlockSize = NewSize;
+ SetEndOfBlockSizeMarker();
+ getBlockAfter().PrevAllocated = 0;
+}
+
+/// TrimAllocationToSize - If this allocated block is significantly larger
+/// than NewSize, split it into two pieces (where the former is NewSize
+/// bytes, including the header), and add the new block to the free list.
+FreeRangeHeader *MemoryRangeHeader::
+TrimAllocationToSize(FreeRangeHeader *FreeList, uint64_t NewSize) {
+ assert(ThisAllocated && getBlockAfter().PrevAllocated &&
+ "Cannot deallocate part of an allocated block!");
+
+ // Don't allow blocks to be trimmed below minimum required size
+ NewSize = std::max<uint64_t>(FreeRangeHeader::getMinBlockSize(), NewSize);
+
+ // Round up size for alignment of header.
+ unsigned HeaderAlign = __alignof(FreeRangeHeader);
+  NewSize = (NewSize + (HeaderAlign-1)) & ~(HeaderAlign-1);
+
+ // Size is now the size of the block we will remove from the start of the
+ // current block.
+ assert(NewSize <= BlockSize &&
+ "Allocating more space from this block than exists!");
+
+ // If splitting this block will cause the remainder to be too small, do not
+ // split the block.
+ if (BlockSize <= NewSize+FreeRangeHeader::getMinBlockSize())
+ return FreeList;
+
+ // Otherwise, we splice the required number of bytes out of this block, form
+ // a new block immediately after it, then mark this block allocated.
+ MemoryRangeHeader &FormerNextBlock = getBlockAfter();
+
+ // Change the size of this block.
+ BlockSize = NewSize;
+
+ // Get the new block we just sliced out and turn it into a free block.
+ FreeRangeHeader &NewNextBlock = (FreeRangeHeader &)getBlockAfter();
+ NewNextBlock.BlockSize = (char*)&FormerNextBlock - (char*)&NewNextBlock;
+ NewNextBlock.ThisAllocated = 0;
+ NewNextBlock.PrevAllocated = 1;
+ NewNextBlock.SetEndOfBlockSizeMarker();
+ FormerNextBlock.PrevAllocated = 0;
+ NewNextBlock.AddToFreeList(FreeList);
+ return &NewNextBlock;
+}
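+
+// For illustration: trimming a 256-byte block to a header-aligned NewSize of
+// 64 rewrites it as a 64-byte allocated block immediately followed by a new
+// 192-byte free block, which is linked into the free list. If the remainder
+// would be smaller than getMinBlockSize(), the block is left intact instead.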
+
+//===----------------------------------------------------------------------===//
+// Memory Block Implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// DefaultJITMemoryManager - Manage memory for the JIT code generation.
+ /// This splits a large block of MAP_NORESERVE'd memory into two
+ /// sections, one for function stubs, one for the functions themselves. We
+ /// have to do this because we may need to emit a function stub while in the
+ /// middle of emitting a function, and we don't know how large the function we
+ /// are emitting is.
+ class VISIBILITY_HIDDEN DefaultJITMemoryManager : public JITMemoryManager {
+ std::vector<sys::MemoryBlock> Blocks; // Memory blocks allocated by the JIT
+ FreeRangeHeader *FreeMemoryList; // Circular list of free blocks.
+
+ // When emitting code into a memory block, this is the block.
+ MemoryRangeHeader *CurBlock;
+
+ uint8_t *CurStubPtr, *StubBase;
+ uint8_t *GOTBase; // Target Specific reserved memory
+ void *DlsymTable; // Stub external symbol information
+
+ // Centralize memory block allocation.
+ sys::MemoryBlock getNewMemoryBlock(unsigned size);
+
+ std::map<const Function*, MemoryRangeHeader*> FunctionBlocks;
+ std::map<const Function*, MemoryRangeHeader*> TableBlocks;
+ public:
+ DefaultJITMemoryManager();
+ ~DefaultJITMemoryManager();
+
+ void AllocateGOT();
+ void SetDlsymTable(void *);
+
+ uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment);
+
+ /// startFunctionBody - When a function starts, allocate a block of free
+ /// executable memory, returning a pointer to it and its actual size.
+ uint8_t *startFunctionBody(const Function *F, uintptr_t &ActualSize) {
+
+ FreeRangeHeader* candidateBlock = FreeMemoryList;
+ FreeRangeHeader* head = FreeMemoryList;
+ FreeRangeHeader* iter = head->Next;
+
+ uintptr_t largest = candidateBlock->BlockSize;
+
+ // Search for the largest free block
+ while (iter != head) {
+ if (iter->BlockSize > largest) {
+ largest = iter->BlockSize;
+ candidateBlock = iter;
+ }
+ iter = iter->Next;
+ }
+
+ // Select this candidate block for allocation
+ CurBlock = candidateBlock;
+
+ // Allocate the entire memory block.
+ FreeMemoryList = candidateBlock->AllocateBlock();
+ ActualSize = CurBlock->BlockSize-sizeof(MemoryRangeHeader);
+ return (uint8_t *)(CurBlock+1);
+ }
+
+ /// endFunctionBody - The function F is now allocated, and takes the memory
+ /// in the range [FunctionStart,FunctionEnd).
+ void endFunctionBody(const Function *F, uint8_t *FunctionStart,
+ uint8_t *FunctionEnd) {
+ assert(FunctionEnd > FunctionStart);
+ assert(FunctionStart == (uint8_t *)(CurBlock+1) &&
+ "Mismatched function start/end!");
+
+ uintptr_t BlockSize = FunctionEnd - (uint8_t *)CurBlock;
+ FunctionBlocks[F] = CurBlock;
+
+ // Release the memory at the end of this block that isn't needed.
+      FreeMemoryList = CurBlock->TrimAllocationToSize(FreeMemoryList,
+                                                      BlockSize);
+ }
+
+ /// allocateSpace - Allocate a memory block of the given size.
+ uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) {
+ CurBlock = FreeMemoryList;
+ FreeMemoryList = FreeMemoryList->AllocateBlock();
+
+      // Skip over the whole MemoryRangeHeader, not just its first byte.
+      uint8_t *result = (uint8_t *)(CurBlock + 1);
+
+ if (Alignment == 0) Alignment = 1;
+ result = (uint8_t*)(((intptr_t)result+Alignment-1) &
+ ~(intptr_t)(Alignment-1));
+
+ uintptr_t BlockSize = result + Size - (uint8_t *)CurBlock;
+      FreeMemoryList = CurBlock->TrimAllocationToSize(FreeMemoryList,
+                                                      BlockSize);
+
+ return result;
+ }
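+
+    // For illustration (on a 64-bit host, so the header is 8 bytes): with
+    // CurBlock at 0x1000 and Alignment == 16, result starts at 0x1008 and is
+    // rounded up to 0x1010. BlockSize then covers header, padding and Size,
+    // so TrimAllocationToSize returns everything past what was handed out.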
+
+ /// startExceptionTable - Use startFunctionBody to allocate memory for the
+ /// function's exception table.
+ uint8_t* startExceptionTable(const Function* F, uintptr_t &ActualSize) {
+ return startFunctionBody(F, ActualSize);
+ }
+
+ /// endExceptionTable - The exception table of F is now allocated,
+ /// and takes the memory in the range [TableStart,TableEnd).
+ void endExceptionTable(const Function *F, uint8_t *TableStart,
+ uint8_t *TableEnd, uint8_t* FrameRegister) {
+ assert(TableEnd > TableStart);
+ assert(TableStart == (uint8_t *)(CurBlock+1) &&
+ "Mismatched table start/end!");
+
+ uintptr_t BlockSize = TableEnd - (uint8_t *)CurBlock;
+ TableBlocks[F] = CurBlock;
+
+ // Release the memory at the end of this block that isn't needed.
+      FreeMemoryList = CurBlock->TrimAllocationToSize(FreeMemoryList,
+                                                      BlockSize);
+ }
+
+ uint8_t *getGOTBase() const {
+ return GOTBase;
+ }
+
+ void *getDlsymTable() const {
+ return DlsymTable;
+ }
+
+ /// deallocateMemForFunction - Deallocate all memory for the specified
+ /// function body.
+ void deallocateMemForFunction(const Function *F) {
+ std::map<const Function*, MemoryRangeHeader*>::iterator
+ I = FunctionBlocks.find(F);
+ if (I == FunctionBlocks.end()) return;
+
+ // Find the block that is allocated for this function.
+ MemoryRangeHeader *MemRange = I->second;
+ assert(MemRange->ThisAllocated && "Block isn't allocated!");
+
+ // Fill the buffer with garbage!
+#ifndef NDEBUG
+ memset(MemRange+1, 0xCD, MemRange->BlockSize-sizeof(*MemRange));
+#endif
+
+ // Free the memory.
+ FreeMemoryList = MemRange->FreeBlock(FreeMemoryList);
+
+ // Finally, remove this entry from FunctionBlocks.
+ FunctionBlocks.erase(I);
+
+ I = TableBlocks.find(F);
+ if (I == TableBlocks.end()) return;
+
+ // Find the block that is allocated for this function.
+ MemRange = I->second;
+ assert(MemRange->ThisAllocated && "Block isn't allocated!");
+
+ // Fill the buffer with garbage!
+#ifndef NDEBUG
+ memset(MemRange+1, 0xCD, MemRange->BlockSize-sizeof(*MemRange));
+#endif
+
+ // Free the memory.
+ FreeMemoryList = MemRange->FreeBlock(FreeMemoryList);
+
+ // Finally, remove this entry from TableBlocks.
+ TableBlocks.erase(I);
+ }
+
+ /// setMemoryWritable - When code generation is in progress,
+ /// the code pages may need permissions changed.
+ void setMemoryWritable(void)
+ {
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i)
+ sys::Memory::setWritable(Blocks[i]);
+ }
+ /// setMemoryExecutable - When code generation is done and we're ready to
+ /// start execution, the code pages may need permissions changed.
+ void setMemoryExecutable(void)
+ {
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i)
+ sys::Memory::setExecutable(Blocks[i]);
+ }
+ };
+}
+
+DefaultJITMemoryManager::DefaultJITMemoryManager() {
+ // Allocate a 16M block of memory for functions.
+#if defined(__APPLE__) && defined(__arm__)
+ sys::MemoryBlock MemBlock = getNewMemoryBlock(4 << 20);
+#else
+ sys::MemoryBlock MemBlock = getNewMemoryBlock(16 << 20);
+#endif
+
+ uint8_t *MemBase = static_cast<uint8_t*>(MemBlock.base());
+
+ // Allocate stubs backwards from the base, allocate functions forward
+ // from the base.
+ StubBase = MemBase;
+ CurStubPtr = MemBase + 512*1024; // Use 512k for stubs, working backwards.
+
+ // We set up the memory chunk with 4 mem regions, like this:
+ // [ START
+ // [ Free #0 ] -> Large space to allocate functions from.
+ // [ Allocated #1 ] -> Tiny space to separate regions.
+ // [ Free #2 ] -> Tiny space so there is always at least 1 free block.
+ // [ Allocated #3 ] -> Tiny space to prevent looking past end of block.
+ // END ]
+ //
+ // The last three blocks are never deallocated or touched.
+
+ // Add MemoryRangeHeader to the end of the memory region, indicating that
+ // the space after the block of memory is allocated. This is block #3.
+ MemoryRangeHeader *Mem3 = (MemoryRangeHeader*)(MemBase+MemBlock.size())-1;
+ Mem3->ThisAllocated = 1;
+ Mem3->PrevAllocated = 0;
+ Mem3->BlockSize = 0;
+
+ /// Add a tiny free region so that the free list always has one entry.
+ FreeRangeHeader *Mem2 =
+ (FreeRangeHeader *)(((char*)Mem3)-FreeRangeHeader::getMinBlockSize());
+ Mem2->ThisAllocated = 0;
+ Mem2->PrevAllocated = 1;
+ Mem2->BlockSize = FreeRangeHeader::getMinBlockSize();
+ Mem2->SetEndOfBlockSizeMarker();
+ Mem2->Prev = Mem2; // Mem2 *is* the free list for now.
+ Mem2->Next = Mem2;
+
+ /// Add a tiny allocated region so that Mem2 is never coalesced away.
+ MemoryRangeHeader *Mem1 = (MemoryRangeHeader*)Mem2-1;
+ Mem1->ThisAllocated = 1;
+ Mem1->PrevAllocated = 0;
+ Mem1->BlockSize = (char*)Mem2 - (char*)Mem1;
+
+ // Add a FreeRangeHeader to the start of the function body region, indicating
+ // that the space is free. Mark the previous block allocated so we never look
+ // at it.
+ FreeRangeHeader *Mem0 = (FreeRangeHeader*)CurStubPtr;
+ Mem0->ThisAllocated = 0;
+ Mem0->PrevAllocated = 1;
+ Mem0->BlockSize = (char*)Mem1-(char*)Mem0;
+ Mem0->SetEndOfBlockSizeMarker();
+ Mem0->AddToFreeList(Mem2);
+
+ // Start out with the freelist pointing to Mem0.
+ FreeMemoryList = Mem0;
+
+ GOTBase = NULL;
+ DlsymTable = NULL;
+}
+
+void DefaultJITMemoryManager::AllocateGOT() {
+ assert(GOTBase == 0 && "Cannot allocate the got multiple times");
+ GOTBase = new uint8_t[sizeof(void*) * 8192];
+ HasGOT = true;
+}
+
+void DefaultJITMemoryManager::SetDlsymTable(void *ptr) {
+ DlsymTable = ptr;
+}
+
+DefaultJITMemoryManager::~DefaultJITMemoryManager() {
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i)
+ sys::Memory::ReleaseRWX(Blocks[i]);
+
+ delete[] GOTBase;
+ Blocks.clear();
+}
+
+uint8_t *DefaultJITMemoryManager::allocateStub(const GlobalValue* F,
+ unsigned StubSize,
+ unsigned Alignment) {
+ CurStubPtr -= StubSize;
+ CurStubPtr = (uint8_t*)(((intptr_t)CurStubPtr) &
+ ~(intptr_t)(Alignment-1));
+ if (CurStubPtr < StubBase) {
+ // FIXME: allocate a new block
+ fprintf(stderr, "JIT ran out of memory for function stubs!\n");
+ abort();
+ }
+ return CurStubPtr;
+}
+
+sys::MemoryBlock DefaultJITMemoryManager::getNewMemoryBlock(unsigned size) {
+ // Allocate a new block close to the last one.
+ const sys::MemoryBlock *BOld = Blocks.empty() ? 0 : &Blocks.front();
+ std::string ErrMsg;
+ sys::MemoryBlock B = sys::Memory::AllocateRWX(size, BOld, &ErrMsg);
+ if (B.base() == 0) {
+ fprintf(stderr,
+ "Allocation failed when allocating new memory in the JIT\n%s\n",
+ ErrMsg.c_str());
+ abort();
+ }
+ Blocks.push_back(B);
+ return B;
+}
+
+
+JITMemoryManager *JITMemoryManager::CreateDefaultMemManager() {
+ return new DefaultJITMemoryManager();
+}
diff --git a/lib/ExecutionEngine/JIT/Makefile b/lib/ExecutionEngine/JIT/Makefile
new file mode 100644
index 0000000..e2c9c61
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/Makefile
@@ -0,0 +1,37 @@
+##===- lib/ExecutionEngine/JIT/Makefile --------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMJIT
+
+# Get the $(ARCH) setting
+include $(LEVEL)/Makefile.config
+
+# Enable the X86 JIT if compiling on X86
+ifeq ($(ARCH), x86)
+ ENABLE_X86_JIT = 1
+endif
+
+# This flag can also be used on the command line to force inclusion
+# of the X86 JIT on non-X86 hosts
+ifdef ENABLE_X86_JIT
+ CPPFLAGS += -DENABLE_X86_JIT
+endif
+
+# Enable the Sparc JIT if compiling on Sparc
+ifeq ($(ARCH), Sparc)
+ ENABLE_SPARC_JIT = 1
+endif
+
+# This flag can also be used on the command line to force inclusion
+# of the Sparc JIT on non-Sparc hosts
+ifdef ENABLE_SPARC_JIT
+ CPPFLAGS += -DENABLE_SPARC_JIT
+endif
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/ExecutionEngine/JIT/TargetSelect.cpp b/lib/ExecutionEngine/JIT/TargetSelect.cpp
new file mode 100644
index 0000000..0f20819
--- /dev/null
+++ b/lib/ExecutionEngine/JIT/TargetSelect.cpp
@@ -0,0 +1,83 @@
+//===-- TargetSelect.cpp - Target Chooser Code ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This just asks the TargetMachineRegistry for the appropriate JIT to use, and
+// allows the user to specify a specific one on the commandline with -march=x.
+//
+//===----------------------------------------------------------------------===//
+
+#include "JIT.h"
+#include "llvm/Module.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/Support/RegistryParser.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Target/SubtargetFeature.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+static cl::opt<const TargetMachineRegistry::entry*, false,
+ RegistryParser<TargetMachine> >
+MArch("march", cl::desc("Architecture to generate assembly for:"));
+
+static cl::opt<std::string>
+MCPU("mcpu",
+ cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+ cl::value_desc("cpu-name"),
+ cl::init(""));
+
+static cl::list<std::string>
+MAttrs("mattr",
+ cl::CommaSeparated,
+ cl::desc("Target specific attributes (-mattr=help for details)"),
+ cl::value_desc("a1,+a2,-a3,..."));
+
+/// createJIT - Create and return a new JIT compiler if there is one
+/// available for the current target. Otherwise, return null.
+///
+ExecutionEngine *JIT::createJIT(ModuleProvider *MP, std::string *ErrorStr,
+ JITMemoryManager *JMM,
+ CodeGenOpt::Level OptLevel) {
+ const TargetMachineRegistry::entry *TheArch = MArch;
+ if (TheArch == 0) {
+ std::string Error;
+ TheArch = TargetMachineRegistry::getClosestTargetForJIT(Error);
+ if (TheArch == 0) {
+ if (ErrorStr)
+ *ErrorStr = Error;
+ return 0;
+ }
+ } else if (TheArch->JITMatchQualityFn() == 0) {
+ cerr << "WARNING: This target JIT is not designed for the host you are"
+ << " running. If bad things happen, please choose a different "
+ << "-march switch.\n";
+ }
+
+ // Package up features to be passed to target/subtarget
+ std::string FeaturesStr;
+ if (!MCPU.empty() || !MAttrs.empty()) {
+ SubtargetFeatures Features;
+ Features.setCPU(MCPU);
+ for (unsigned i = 0; i != MAttrs.size(); ++i)
+ Features.AddFeature(MAttrs[i]);
+ FeaturesStr = Features.getString();
+ }
+
+ // Allocate a target...
+ TargetMachine *Target = TheArch->CtorFn(*MP->getModule(), FeaturesStr);
+ assert(Target && "Could not allocate target machine!");
+
+ // If the target supports JIT code generation, return a new JIT now.
+ if (TargetJITInfo *TJ = Target->getJITInfo())
+ return new JIT(MP, *Target, *TJ, JMM, OptLevel);
+
+ if (ErrorStr)
+ *ErrorStr = "target does not support JIT code generation";
+ return 0;
+}
diff --git a/lib/ExecutionEngine/Makefile b/lib/ExecutionEngine/Makefile
new file mode 100644
index 0000000..e0e050e
--- /dev/null
+++ b/lib/ExecutionEngine/Makefile
@@ -0,0 +1,13 @@
+##===- lib/ExecutionEngine/Makefile ------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../..
+LIBRARYNAME = LLVMExecutionEngine
+PARALLEL_DIRS = Interpreter JIT
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Linker/CMakeLists.txt b/lib/Linker/CMakeLists.txt
new file mode 100644
index 0000000..0b6d2f4
--- /dev/null
+++ b/lib/Linker/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_llvm_library(LLVMLinker
+ LinkArchives.cpp
+ LinkItems.cpp
+ LinkModules.cpp
+ Linker.cpp
+ )
diff --git a/lib/Linker/LinkArchives.cpp b/lib/Linker/LinkArchives.cpp
new file mode 100644
index 0000000..551cc8c
--- /dev/null
+++ b/lib/Linker/LinkArchives.cpp
@@ -0,0 +1,201 @@
+//===- lib/Linker/LinkArchives.cpp - Link LLVM objects and libraries ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains routines to handle linking together LLVM bitcode files,
+// and to handle annoying things like static libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Linker.h"
+#include "llvm/Module.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/Bitcode/Archive.h"
+#include "llvm/Config/config.h"
+#include <memory>
+#include <set>
+using namespace llvm;
+
+/// GetAllUndefinedSymbols - calculates the set of undefined symbols that still
+/// exist in an LLVM module. This is a bit tricky because there may be two
+/// symbols with the same name but different LLVM types that will be resolved to
+/// each other but aren't currently (thus we need to treat them as resolved).
+///
+/// Inputs:
+/// M - The module in which to find undefined symbols.
+///
+/// Outputs:
+/// UndefinedSymbols - A set of C++ strings containing the name of all
+/// undefined symbols.
+///
+static void
+GetAllUndefinedSymbols(Module *M, std::set<std::string> &UndefinedSymbols) {
+ std::set<std::string> DefinedSymbols;
+ UndefinedSymbols.clear();
+
+ // If the program doesn't define a main, try pulling one in from a .a file.
+ // This is needed for programs where the main function is defined in an
+ // archive, such as f2c'd programs.
+ Function *Main = M->getFunction("main");
+ if (Main == 0 || Main->isDeclaration())
+ UndefinedSymbols.insert("main");
+
+ for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
+ if (I->hasName()) {
+ if (I->isDeclaration())
+ UndefinedSymbols.insert(I->getName());
+ else if (!I->hasLocalLinkage()) {
+ assert(!I->hasDLLImportLinkage()
+ && "Found dllimported non-external symbol!");
+ DefinedSymbols.insert(I->getName());
+ }
+ }
+
+ for (Module::global_iterator I = M->global_begin(), E = M->global_end();
+ I != E; ++I)
+ if (I->hasName()) {
+ if (I->isDeclaration())
+ UndefinedSymbols.insert(I->getName());
+ else if (!I->hasLocalLinkage()) {
+ assert(!I->hasDLLImportLinkage()
+ && "Found dllimported non-external symbol!");
+ DefinedSymbols.insert(I->getName());
+ }
+ }
+
+ for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E; ++I)
+ if (I->hasName())
+ DefinedSymbols.insert(I->getName());
+
+ // Prune out any defined symbols from the undefined symbols set...
+ for (std::set<std::string>::iterator I = UndefinedSymbols.begin();
+ I != UndefinedSymbols.end(); )
+ if (DefinedSymbols.count(*I))
+ UndefinedSymbols.erase(I++); // This symbol really is defined!
+ else
+ ++I; // Keep this symbol in the undefined symbols list
+}
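+
+// Illustrative sketch (an editor's addition, guarded out): on a module that
+// defines 'main' but only declares 'printf', the set computed above would
+// contain "printf" and not "main".
+#if 0
+static void demoUndefinedSymbols(Module *M) {
+  std::set<std::string> Undefined;
+  GetAllUndefinedSymbols(M, Undefined);
+  for (std::set<std::string>::iterator I = Undefined.begin(),
+       E = Undefined.end(); I != E; ++I)
+    cerr << "undefined: " << *I << "\n";
+}
+#endif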
+
+/// LinkInArchive - opens an archive library and links in all objects which
+/// provide symbols that are currently undefined.
+///
+/// Inputs:
+/// Filename - The pathname of the archive.
+///
+/// Return Value:
+/// TRUE - An error occurred.
+/// FALSE - No errors.
+bool
+Linker::LinkInArchive(const sys::Path &Filename, bool &is_native) {
+ // Make sure this is an archive file we're dealing with
+ if (!Filename.isArchive())
+ return error("File '" + Filename.toString() + "' is not an archive.");
+
+ // Open the archive file
+ verbose("Linking archive file '" + Filename.toString() + "'");
+
+ // Find all of the symbols currently undefined in the bitcode program.
+ // If all the symbols are defined, the program is complete, and there is
+ // no reason to link in any archive files.
+ std::set<std::string> UndefinedSymbols;
+ GetAllUndefinedSymbols(Composite, UndefinedSymbols);
+
+ if (UndefinedSymbols.empty()) {
+ verbose("No symbols undefined, skipping library '" +
+ Filename.toString() + "'");
+ return false; // No need to link anything in!
+ }
+
+ std::string ErrMsg;
+ std::auto_ptr<Archive> AutoArch(
+ Archive::OpenAndLoadSymbols(Filename, &ErrMsg));
+
+ Archive* arch = AutoArch.get();
+
+ if (!arch)
+ return error("Cannot read archive '" + Filename.toString() +
+ "': " + ErrMsg);
+ if (!arch->isBitcodeArchive()) {
+ is_native = true;
+ return false;
+ }
+ is_native = false;
+
+ // Save a set of symbols that are not defined by the archive. Since we're
+ // entering a loop, there's no point searching for these multiple times. This
+ // variable is used to "set_subtract" from the set of undefined symbols.
+ std::set<std::string> NotDefinedByArchive;
+
+ // Save the current set of undefined symbols, because we may have to make
+ // multiple passes over the archive:
+ std::set<std::string> CurrentlyUndefinedSymbols;
+
+ do {
+ CurrentlyUndefinedSymbols = UndefinedSymbols;
+
+ // Find the modules we need to link into the target module
+ std::set<ModuleProvider*> Modules;
+ if (!arch->findModulesDefiningSymbols(UndefinedSymbols, Modules, &ErrMsg))
+ return error("Cannot find symbols in '" + Filename.toString() +
+ "': " + ErrMsg);
+
+ // If we didn't find any more modules to link this time, we are done
+ // searching this archive.
+ if (Modules.empty())
+ break;
+
+ // Any symbols remaining in UndefinedSymbols after
+ // findModulesDefiningSymbols are ones that the archive does not define. So
+ // we add them to the NotDefinedByArchive variable now.
+ NotDefinedByArchive.insert(UndefinedSymbols.begin(),
+ UndefinedSymbols.end());
+
+ // Loop over all the ModuleProviders that we got back from the archive
+ for (std::set<ModuleProvider*>::iterator I=Modules.begin(), E=Modules.end();
+ I != E; ++I) {
+
+ // Get the module we must link in.
+ std::string moduleErrorMsg;
+ std::auto_ptr<Module> AutoModule((*I)->releaseModule( &moduleErrorMsg ));
+ if (!moduleErrorMsg.empty())
+ return error("Could not load a module: " + moduleErrorMsg);
+
+ Module* aModule = AutoModule.get();
+
+ if (aModule != NULL) {
+ verbose(" Linking in module: " + aModule->getModuleIdentifier());
+
+ // Link it in
+ if (LinkInModule(aModule, &moduleErrorMsg)) {
+ return error("Cannot link in module '" +
+ aModule->getModuleIdentifier() + "': " + moduleErrorMsg);
+ }
+ }
+ }
+
+ // Get the undefined symbols from the aggregate module. This recomputes the
+ // symbols we still need after the new modules have been linked in.
+ GetAllUndefinedSymbols(Composite, UndefinedSymbols);
+
+ // At this point we have two sets of undefined symbols: UndefinedSymbols
+ // which holds the undefined symbols from all the modules, and
+ // NotDefinedByArchive which holds symbols we know the archive doesn't
+ // define. There's no point searching for symbols that we won't find in the
+ // archive so we subtract these sets.
+ set_subtract(UndefinedSymbols, NotDefinedByArchive);
+
+ // If there's no symbols left, no point in continuing to search the
+ // archive.
+ if (UndefinedSymbols.empty())
+ break;
+ } while (CurrentlyUndefinedSymbols != UndefinedSymbols);
+
+ return false;
+}
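+
+// Illustrative sketch (an editor's addition, guarded out): a driver-side view
+// of the routine above. The archive path is an assumption for the example;
+// the repeated passes over the archive happen inside LinkInArchive itself.
+#if 0
+static bool demoLinkArchive(Linker &TheLinker) {
+  bool isNative = false;
+  if (TheLinker.LinkInFile(sys::Path("libfoo.a"), isNative))
+    return true;      // error text is recorded inside the Linker
+  // isNative is set when the archive held native objects rather than bitcode;
+  // such archives must be handed to the system linker instead.
+  return isNative;
+}
+#endif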
diff --git a/lib/Linker/LinkItems.cpp b/lib/Linker/LinkItems.cpp
new file mode 100644
index 0000000..7c888aa
--- /dev/null
+++ b/lib/Linker/LinkItems.cpp
@@ -0,0 +1,238 @@
+//===- lib/Linker/LinkItems.cpp - Link LLVM objects and libraries ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains routines to handle linking together LLVM bitcode files,
+// and to handle annoying things like static libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Linker.h"
+#include "llvm/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+
+using namespace llvm;
+
+// LinkItems - This function is the main entry point into linking. It takes a
+// list of LinkItems which indicates the order in which the files should be
+// linked and how each file should be treated (plain file or with library
+// search). The function only links bitcode and produces a result list of
+// items that are native objects.
+bool
+Linker::LinkInItems(const ItemList& Items, ItemList& NativeItems) {
+ // Clear the NativeItems just in case
+ NativeItems.clear();
+
+ // For each linkage item ...
+ for (ItemList::const_iterator I = Items.begin(), E = Items.end();
+ I != E; ++I) {
+ if (I->second) {
+ // Link in the library suggested.
+ bool is_native = false;
+ if (LinkInLibrary(I->first, is_native))
+ return true;
+ if (is_native)
+ NativeItems.push_back(*I);
+ } else {
+ // Link in the file suggested
+ bool is_native = false;
+ if (LinkInFile(sys::Path(I->first), is_native))
+ return true;
+ if (is_native)
+ NativeItems.push_back(*I);
+ }
+ }
+
+ // At this point we have processed all the link items provided to us. Since
+ // we now have an aggregated module, the dependent libraries in
+ // that module should also be aggregated with duplicates eliminated. This is
+ // now the time to process the dependent libraries to resolve any remaining
+ // symbols.
+ bool is_native;
+ for (Module::lib_iterator I = Composite->lib_begin(),
+ E = Composite->lib_end(); I != E; ++I) {
+ if (LinkInLibrary(*I, is_native))
+ return true;
+ if (is_native)
+ NativeItems.push_back(std::make_pair(*I, true));
+ }
+
+ return false;
+}
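+
+// Illustrative sketch (an editor's addition, guarded out): building an item
+// list for the entry point above. The second member of each pair selects
+// library search (true) versus a plain file (false); the names are
+// assumptions for the example.
+#if 0
+static bool demoLinkItems(Linker &TheLinker) {
+  Linker::ItemList Items, NativeItems;
+  Items.push_back(std::make_pair(std::string("main.bc"), false)); // plain file
+  Items.push_back(std::make_pair(std::string("m"), true));        // like -lm
+  return TheLinker.LinkInItems(Items, NativeItems);
+}
+#endif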
+
+
+/// LinkInLibrary - links one library into the HeadModule.
+///
+bool Linker::LinkInLibrary(const std::string& Lib, bool& is_native) {
+ is_native = false;
+ // Determine where this library lives.
+ sys::Path Pathname = FindLib(Lib);
+ if (Pathname.isEmpty())
+ return error("Cannot find library '" + Lib + "'");
+
+ // If it's an archive, try to link it in
+ std::string Magic;
+ Pathname.getMagicNumber(Magic, 64);
+ switch (sys::IdentifyFileType(Magic.c_str(), 64)) {
+ default: assert(0 && "Bad file type identification");
+ case sys::Unknown_FileType:
+ return warning("Supposed library '" + Lib + "' isn't a library.");
+
+ case sys::Bitcode_FileType:
+ // LLVM ".so" file.
+ if (LinkInFile(Pathname, is_native))
+ return true;
+ break;
+
+ case sys::Archive_FileType:
+ if (LinkInArchive(Pathname, is_native))
+ return error("Cannot link archive '" + Pathname.toString() + "'");
+ break;
+
+ case sys::ELF_Relocatable_FileType:
+ case sys::ELF_SharedObject_FileType:
+ case sys::Mach_O_Object_FileType:
+ case sys::Mach_O_FixedVirtualMemorySharedLib_FileType:
+ case sys::Mach_O_DynamicallyLinkedSharedLib_FileType:
+ case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType:
+ case sys::COFF_FileType:
+ is_native = true;
+ break;
+ }
+ return false;
+}
+
+/// LinkInLibraries - takes the specified library files and links them into the
+/// main bitcode object file.
+///
+/// Inputs:
+/// Libraries - The list of libraries to link into the module.
+///
+/// Return value:
+/// FALSE - No error.
+/// TRUE - Error.
+///
+bool Linker::LinkInLibraries(const std::vector<std::string> &Libraries) {
+
+ // Process the set of libraries we've been provided.
+ bool is_native = false;
+ for (unsigned i = 0; i < Libraries.size(); ++i)
+ if (LinkInLibrary(Libraries[i], is_native))
+ return true;
+
+ // At this point we have processed all the libraries provided to us. Since
+ // we now have an aggregated module, the dependent libraries in
+ // that module should also be aggregated with duplicates eliminated. This is
+ // now the time to process the dependent libraries to resolve any remaining
+ // symbols.
+ const Module::LibraryListType& DepLibs = Composite->getLibraries();
+ for (Module::LibraryListType::const_iterator I = DepLibs.begin(),
+ E = DepLibs.end(); I != E; ++I)
+ if (LinkInLibrary(*I, is_native))
+ return true;
+
+ return false;
+}
+
+/// LinkInFile - opens the given file, which may contain bitcode or an
+/// archive, and links its contents into the composite module.
+///
+/// Inputs:
+/// File - The pathname of the bitcode file.
+///
+/// Outputs:
+/// ErrorMessage - A C++ string detailing what error occurred, if any.
+///
+/// Return Value:
+/// TRUE - An error occurred.
+/// FALSE - No errors.
+///
+bool Linker::LinkInFile(const sys::Path &File, bool &is_native) {
+ is_native = false;
+
+ // Check for a file of name "-", which means "read standard input"
+ if (File.toString() == "-") {
+ std::auto_ptr<Module> M;
+ if (MemoryBuffer *Buffer = MemoryBuffer::getSTDIN()) {
+ M.reset(ParseBitcodeFile(Buffer, &Error));
+ delete Buffer;
+ if (M.get())
+ if (!LinkInModule(M.get(), &Error))
+ return false;
+ } else
+ Error = "standard input is empty";
+ return error("Cannot link stdin: " + Error);
+ }
+
+ // Make sure we can at least read the file
+ if (!File.canRead())
+ return error("Cannot find linker input '" + File.toString() + "'");
+
+ // If it's an archive, try to link it in
+ std::string Magic;
+ File.getMagicNumber(Magic, 64);
+ switch (sys::IdentifyFileType(Magic.c_str(), 64)) {
+ default: assert(0 && "Bad file type identification");
+ case sys::Unknown_FileType:
+ return warning("Ignoring file '" + File.toString() +
+ "' because does not contain bitcode.");
+
+ case sys::Archive_FileType:
+ // A user may specify an ar archive without -l, perhaps because it
+ // is not installed as a library. Detect that and link the archive.
+ verbose("Linking archive file '" + File.toString() + "'");
+ if (LinkInArchive(File, is_native))
+ return true;
+ break;
+
+ case sys::Bitcode_FileType: {
+ verbose("Linking bitcode file '" + File.toString() + "'");
+ std::auto_ptr<Module> M(LoadObject(File));
+ if (M.get() == 0)
+ return error("Cannot load file '" + File.toString() + "': " + Error);
+ if (LinkInModule(M.get(), &Error))
+ return error("Cannot link file '" + File.toString() + "': " + Error);
+
+ verbose("Linked in file '" + File.toString() + "'");
+ break;
+ }
+
+ case sys::ELF_Relocatable_FileType:
+ case sys::ELF_SharedObject_FileType:
+ case sys::Mach_O_Object_FileType:
+ case sys::Mach_O_FixedVirtualMemorySharedLib_FileType:
+ case sys::Mach_O_DynamicallyLinkedSharedLib_FileType:
+ case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType:
+ case sys::COFF_FileType:
+ is_native = true;
+ break;
+ }
+ return false;
+}
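+
+// Editor's note on the "-" special case above: it lets a driver read a module
+// from a pipe, e.g. (hypothetical invocation)
+//   llvm-as < prog.ll | some-driver -
+// where the driver ends up calling LinkInFile(sys::Path("-"), isNative).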
+
+/// LinkInFiles - takes a module and a list of files and links them all together.
+/// It locates the file either in the current directory, as its absolute
+/// or relative pathname, or as a file somewhere in LLVM_LIB_SEARCH_PATH.
+///
+/// Inputs:
+/// Files - A vector of sys::Path indicating the LLVM bitcode filenames
+/// to be linked. The names can refer to a mixture of pure LLVM
+/// bitcode files and archive (ar) formatted files.
+///
+/// Return value:
+/// FALSE - No errors.
+/// TRUE - Some error occurred.
+///
+bool Linker::LinkInFiles(const std::vector<sys::Path> &Files) {
+ bool is_native;
+ for (unsigned i = 0; i < Files.size(); ++i)
+ if (LinkInFile(Files[i], is_native))
+ return true;
+ return false;
+}
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
new file mode 100644
index 0000000..4a15d88
--- /dev/null
+++ b/lib/Linker/LinkModules.cpp
@@ -0,0 +1,1328 @@
+//===- lib/Linker/LinkModules.cpp - Module Linker Implementation ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM module linker.
+//
+// Specifically, this:
+// * Merges global variables between the two modules
+// * Uninit + Uninit = Init, Init + Uninit = Init, Init + Init = Error if !=
+// * Merges functions between two modules
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Linker.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/System/Path.h"
+#include "llvm/ADT/DenseMap.h"
+#include <sstream>
+using namespace llvm;
+
+// Error - Simple wrapper function to conditionally assign to E and return true.
+// This just makes error return conditions a little bit simpler...
+static inline bool Error(std::string *E, const std::string &Message) {
+ if (E) *E = Message;
+ return true;
+}
+
+// Function: ResolveTypes()
+//
+// Description:
+// Attempt to link the two specified types together.
+//
+// Inputs:
+// DestTy - The type to which we wish to resolve.
+// SrcTy - The original type which we want to resolve.
+//
+// Return value:
+// true - There is an error and the types cannot yet be linked.
+// false - No errors.
+//
+static bool ResolveTypes(const Type *DestTy, const Type *SrcTy) {
+ if (DestTy == SrcTy) return false; // If already equal, noop
+ assert(DestTy && SrcTy && "Can't handle null types");
+
+ if (const OpaqueType *OT = dyn_cast<OpaqueType>(DestTy)) {
+ // Type _is_ in module, just opaque...
+ const_cast<OpaqueType*>(OT)->refineAbstractTypeTo(SrcTy);
+ } else if (const OpaqueType *OT = dyn_cast<OpaqueType>(SrcTy)) {
+ const_cast<OpaqueType*>(OT)->refineAbstractTypeTo(DestTy);
+ } else {
+ return true; // Cannot link types... not-equal and neither is opaque.
+ }
+ return false;
+}
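+
+// Illustrative sketch (an editor's addition, guarded out): ResolveTypes
+// succeeds when one side is opaque. Here the opaque type is refined to i32,
+// after which the holder names the concrete type.
+#if 0
+static void demoResolveTypes() {
+  PATypeHolder Opaque(OpaqueType::get());
+  const Type *Concrete = Type::Int32Ty;
+  bool Failed = ResolveTypes(Opaque.get(), Concrete); // refines the opaque side
+  assert(!Failed && Opaque.get() == Concrete);
+}
+#endif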
+
+/// LinkerTypeMap - This implements a map of types that is stable
+/// even if types are resolved/refined to other types. This is not a general
+/// purpose map, it is specific to the linker's use.
+namespace {
+class LinkerTypeMap : public AbstractTypeUser {
+ typedef DenseMap<const Type*, PATypeHolder> TheMapTy;
+ TheMapTy TheMap;
+
+ LinkerTypeMap(const LinkerTypeMap&); // DO NOT IMPLEMENT
+ void operator=(const LinkerTypeMap&); // DO NOT IMPLEMENT
+public:
+ LinkerTypeMap() {}
+ ~LinkerTypeMap() {
+ for (DenseMap<const Type*, PATypeHolder>::iterator I = TheMap.begin(),
+ E = TheMap.end(); I != E; ++I)
+ I->first->removeAbstractTypeUser(this);
+ }
+
+ /// lookup - Return the value for the specified type or null if it doesn't
+ /// exist.
+ const Type *lookup(const Type *Ty) const {
+ TheMapTy::const_iterator I = TheMap.find(Ty);
+ if (I != TheMap.end()) return I->second;
+ return 0;
+ }
+
+ /// erase - Remove the specified type, returning true if it was in the set.
+ bool erase(const Type *Ty) {
+ if (!TheMap.erase(Ty))
+ return false;
+ if (Ty->isAbstract())
+ Ty->removeAbstractTypeUser(this);
+ return true;
+ }
+
+ /// insert - This returns true if the pointer was new to the set, false if it
+ /// was already in the set.
+ bool insert(const Type *Src, const Type *Dst) {
+ if (!TheMap.insert(std::make_pair(Src, PATypeHolder(Dst))).second)
+ return false; // Already in map.
+ if (Src->isAbstract())
+ Src->addAbstractTypeUser(this);
+ return true;
+ }
+
+protected:
+ /// refineAbstractType - The callback method invoked when an abstract type is
+ /// resolved to another type. An object must override this method to update
+ /// its internal state to reference NewType instead of OldType.
+ ///
+ virtual void refineAbstractType(const DerivedType *OldTy,
+ const Type *NewTy) {
+ TheMapTy::iterator I = TheMap.find(OldTy);
+ const Type *DstTy = I->second;
+
+ TheMap.erase(I);
+ if (OldTy->isAbstract())
+ OldTy->removeAbstractTypeUser(this);
+
+ // Don't reinsert into the map if the key is concrete now.
+ if (NewTy->isAbstract())
+ insert(NewTy, DstTy);
+ }
+
+ /// The other case which AbstractTypeUsers must be aware of is when a type
+ /// makes the transition from being abstract (where it has clients on its
+ /// AbstractTypeUsers list) to concrete (where it does not). This method
+ /// notifies ATU's when this occurs for a type.
+ virtual void typeBecameConcrete(const DerivedType *AbsTy) {
+ TheMap.erase(AbsTy);
+ AbsTy->removeAbstractTypeUser(this);
+ }
+
+ // for debugging...
+ virtual void dump() const {
+ cerr << "AbstractTypeSet!\n";
+ }
+};
+}
+
+
+// RecursiveResolveTypes - This is just like ResolveTypes, except that it
+// recurses down into derived types, merging the used types if the parent types
+// are compatible.
+static bool RecursiveResolveTypesI(const Type *DstTy, const Type *SrcTy,
+ LinkerTypeMap &Pointers) {
+ if (DstTy == SrcTy) return false; // If already equal, noop
+
+ // If we found our opaque type, resolve it now!
+ if (isa<OpaqueType>(DstTy) || isa<OpaqueType>(SrcTy))
+ return ResolveTypes(DstTy, SrcTy);
+
+ // Two types cannot be resolved together if they are of different primitive
+ // type. For example, we cannot resolve an int to a float.
+ if (DstTy->getTypeID() != SrcTy->getTypeID()) return true;
+
+ // If neither type is abstract, then they really are just different types.
+ if (!DstTy->isAbstract() && !SrcTy->isAbstract())
+ return true;
+
+ // Otherwise, resolve the used type used by this derived type...
+ switch (DstTy->getTypeID()) {
+ default:
+ return true;
+ case Type::FunctionTyID: {
+ const FunctionType *DstFT = cast<FunctionType>(DstTy);
+ const FunctionType *SrcFT = cast<FunctionType>(SrcTy);
+ if (DstFT->isVarArg() != SrcFT->isVarArg() ||
+ DstFT->getNumContainedTypes() != SrcFT->getNumContainedTypes())
+ return true;
+
+ // Use TypeHolders so recursive resolution won't break us.
+ PATypeHolder ST(SrcFT), DT(DstFT);
+ for (unsigned i = 0, e = DstFT->getNumContainedTypes(); i != e; ++i) {
+ const Type *SE = ST->getContainedType(i), *DE = DT->getContainedType(i);
+ if (SE != DE && RecursiveResolveTypesI(DE, SE, Pointers))
+ return true;
+ }
+ return false;
+ }
+ case Type::StructTyID: {
+ const StructType *DstST = cast<StructType>(DstTy);
+ const StructType *SrcST = cast<StructType>(SrcTy);
+ if (DstST->getNumContainedTypes() != SrcST->getNumContainedTypes())
+ return true;
+
+ PATypeHolder ST(SrcST), DT(DstST);
+ for (unsigned i = 0, e = DstST->getNumContainedTypes(); i != e; ++i) {
+ const Type *SE = ST->getContainedType(i), *DE = DT->getContainedType(i);
+ if (SE != DE && RecursiveResolveTypesI(DE, SE, Pointers))
+ return true;
+ }
+ return false;
+ }
+ case Type::ArrayTyID: {
+ const ArrayType *DAT = cast<ArrayType>(DstTy);
+ const ArrayType *SAT = cast<ArrayType>(SrcTy);
+ if (DAT->getNumElements() != SAT->getNumElements()) return true;
+ return RecursiveResolveTypesI(DAT->getElementType(), SAT->getElementType(),
+ Pointers);
+ }
+ case Type::VectorTyID: {
+ const VectorType *DVT = cast<VectorType>(DstTy);
+ const VectorType *SVT = cast<VectorType>(SrcTy);
+ if (DVT->getNumElements() != SVT->getNumElements()) return true;
+ return RecursiveResolveTypesI(DVT->getElementType(), SVT->getElementType(),
+ Pointers);
+ }
+ case Type::PointerTyID: {
+ const PointerType *DstPT = cast<PointerType>(DstTy);
+ const PointerType *SrcPT = cast<PointerType>(SrcTy);
+
+ if (DstPT->getAddressSpace() != SrcPT->getAddressSpace())
+ return true;
+
+ // If this is a pointer type, check to see if we have already seen it. If
+ // so, we are in a recursive branch. Cut off the search now. We cannot use
+ // an associative container for this search, because the type pointers (keys
+ // in the container) change whenever types get resolved.
+ if (SrcPT->isAbstract())
+ if (const Type *ExistingDestTy = Pointers.lookup(SrcPT))
+ return ExistingDestTy != DstPT;
+
+ if (DstPT->isAbstract())
+ if (const Type *ExistingSrcTy = Pointers.lookup(DstPT))
+ return ExistingSrcTy != SrcPT;
+ // Otherwise, add the current pointers to the vector to stop recursion on
+ // this pair.
+ if (DstPT->isAbstract())
+ Pointers.insert(DstPT, SrcPT);
+ if (SrcPT->isAbstract())
+ Pointers.insert(SrcPT, DstPT);
+
+ return RecursiveResolveTypesI(DstPT->getElementType(),
+ SrcPT->getElementType(), Pointers);
+ }
+ }
+}
+
+static bool RecursiveResolveTypes(const Type *DestTy, const Type *SrcTy) {
+ LinkerTypeMap PointerTypes;
+ return RecursiveResolveTypesI(DestTy, SrcTy, PointerTypes);
+}
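+
+// Illustrative sketch (an editor's addition, guarded out): the subelement
+// case this handles. '{ i32* }' and '{ opaque* }' differ only in a pointee,
+// so recursive resolution refines the opaque pointee and the structs unify.
+#if 0
+static void demoRecursiveResolve() {
+  PATypeHolder Opq(OpaqueType::get());
+  std::vector<const Type*> SrcElts, DstElts;
+  SrcElts.push_back(PointerType::getUnqual(Opq.get()));     // { opaque* }
+  DstElts.push_back(PointerType::getUnqual(Type::Int32Ty)); // { i32* }
+  PATypeHolder Src(StructType::get(SrcElts));
+  PATypeHolder Dst(StructType::get(DstElts));
+  bool Failed = RecursiveResolveTypes(Dst.get(), Src.get());
+  assert(!Failed && Dst.get() == Src.get());
+}
+#endif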
+
+
+// LinkTypes - Go through the symbol table of the Src module and see if any
+// types are named in the src module that are not named in the Dst module.
+// Make sure there are no type name conflicts.
+static bool LinkTypes(Module *Dest, const Module *Src, std::string *Err) {
+ TypeSymbolTable *DestST = &Dest->getTypeSymbolTable();
+ const TypeSymbolTable *SrcST = &Src->getTypeSymbolTable();
+
+ // Look for a type plane for Types...
+ TypeSymbolTable::const_iterator TI = SrcST->begin();
+ TypeSymbolTable::const_iterator TE = SrcST->end();
+ if (TI == TE) return false; // No named types, do nothing.
+
+ // Some types cannot be resolved immediately because they depend on other
+ // types being resolved to each other first. This contains a list of types we
+ // are waiting to recheck.
+ std::vector<std::string> DelayedTypesToResolve;
+
+ for ( ; TI != TE; ++TI ) {
+ const std::string &Name = TI->first;
+ const Type *RHS = TI->second;
+
+ // Check to see if this type name is already in the dest module.
+ Type *Entry = DestST->lookup(Name);
+
+ // If the name is just in the source module, bring it over to the dest.
+ if (Entry == 0) {
+ if (!Name.empty())
+ DestST->insert(Name, const_cast<Type*>(RHS));
+ } else if (ResolveTypes(Entry, RHS)) {
+ // They look different, save the types till later to resolve.
+ DelayedTypesToResolve.push_back(Name);
+ }
+ }
+
+ // Iteratively resolve types while we can...
+ while (!DelayedTypesToResolve.empty()) {
+ // Loop over all of the types, attempting to resolve them if possible...
+ unsigned OldSize = DelayedTypesToResolve.size();
+
+ // Try direct resolution by name...
+ for (unsigned i = 0; i != DelayedTypesToResolve.size(); ++i) {
+ const std::string &Name = DelayedTypesToResolve[i];
+ Type *T1 = SrcST->lookup(Name);
+ Type *T2 = DestST->lookup(Name);
+ if (!ResolveTypes(T2, T1)) {
+ // We are making progress!
+ DelayedTypesToResolve.erase(DelayedTypesToResolve.begin()+i);
+ --i;
+ }
+ }
+
+ // Did we not eliminate any types?
+ if (DelayedTypesToResolve.size() == OldSize) {
+ // Attempt to resolve subelements of types. This allows us to merge these
+ // two types: { int* } and { opaque* }
+ for (unsigned i = 0, e = DelayedTypesToResolve.size(); i != e; ++i) {
+ const std::string &Name = DelayedTypesToResolve[i];
+ if (!RecursiveResolveTypes(SrcST->lookup(Name), DestST->lookup(Name))) {
+ // We are making progress!
+ DelayedTypesToResolve.erase(DelayedTypesToResolve.begin()+i);
+
+ // Go back to the main loop, perhaps we can resolve directly by name
+ // now...
+ break;
+ }
+ }
+
+ // If we STILL cannot resolve the types, then there is something wrong.
+ if (DelayedTypesToResolve.size() == OldSize) {
+ // Give up on the last delayed type so the loop makes progress.
+ DelayedTypesToResolve.pop_back();
+ }
+ }
+ }
+
+
+ return false;
+}
+
+#ifndef NDEBUG
+static void PrintMap(const std::map<const Value*, Value*> &M) {
+ for (std::map<const Value*, Value*>::const_iterator I = M.begin(), E =M.end();
+ I != E; ++I) {
+ cerr << " Fr: " << (void*)I->first << " ";
+ I->first->dump();
+ cerr << " To: " << (void*)I->second << " ";
+ I->second->dump();
+ cerr << "\n";
+ }
+}
+#endif
+
+
+// RemapOperand - Use ValueMap to convert constants from one module to another.
+static Value *RemapOperand(const Value *In,
+ std::map<const Value*, Value*> &ValueMap) {
+ std::map<const Value*,Value*>::const_iterator I = ValueMap.find(In);
+ if (I != ValueMap.end())
+ return I->second;
+
+ // Check to see if it's a constant that we are interested in transforming.
+ Value *Result = 0;
+ if (const Constant *CPV = dyn_cast<Constant>(In)) {
+ if ((!isa<DerivedType>(CPV->getType()) && !isa<ConstantExpr>(CPV)) ||
+ isa<ConstantInt>(CPV) || isa<ConstantAggregateZero>(CPV))
+ return const_cast<Constant*>(CPV); // Simple constants stay identical.
+
+ if (const ConstantArray *CPA = dyn_cast<ConstantArray>(CPV)) {
+ std::vector<Constant*> Operands(CPA->getNumOperands());
+ for (unsigned i = 0, e = CPA->getNumOperands(); i != e; ++i)
+ Operands[i] =cast<Constant>(RemapOperand(CPA->getOperand(i), ValueMap));
+ Result = ConstantArray::get(cast<ArrayType>(CPA->getType()), Operands);
+ } else if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(CPV)) {
+ std::vector<Constant*> Operands(CPS->getNumOperands());
+ for (unsigned i = 0, e = CPS->getNumOperands(); i != e; ++i)
+ Operands[i] =cast<Constant>(RemapOperand(CPS->getOperand(i), ValueMap));
+ Result = ConstantStruct::get(cast<StructType>(CPS->getType()), Operands);
+ } else if (isa<ConstantPointerNull>(CPV) || isa<UndefValue>(CPV)) {
+ Result = const_cast<Constant*>(CPV);
+ } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CPV)) {
+ std::vector<Constant*> Operands(CP->getNumOperands());
+ for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
+ Operands[i] = cast<Constant>(RemapOperand(CP->getOperand(i), ValueMap));
+ Result = ConstantVector::get(Operands);
+ } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
+ std::vector<Constant*> Ops;
+ for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i)
+ Ops.push_back(cast<Constant>(RemapOperand(CE->getOperand(i),ValueMap)));
+ Result = CE->getWithOperands(Ops);
+ } else {
+ assert(!isa<GlobalValue>(CPV) && "Unmapped global?");
+ assert(0 && "Unknown type of derived type constant value!");
+ }
+ } else if (isa<InlineAsm>(In)) {
+ Result = const_cast<Value*>(In);
+ }
+
+ // Cache the mapping in our local map structure
+ if (Result) {
+ ValueMap[In] = Result;
+ return Result;
+ }
+
+#ifndef NDEBUG
+ cerr << "LinkModules ValueMap: \n";
+ PrintMap(ValueMap);
+
+ cerr << "Couldn't remap value: " << (void*)In << " " << *In << "\n";
+ assert(0 && "Couldn't remap value!");
+#endif
+ return 0;
+}
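+
+// Illustrative sketch (an editor's addition, guarded out): remapping a
+// constant expression. If ValueMap sends @SrcG to @DstG, the
+// getWithOperands() path above rebuilds the expression over @DstG.
+#if 0
+static void demoRemap(GlobalVariable *SrcG, GlobalVariable *DstG) {
+  std::map<const Value*, Value*> ValueMap;
+  ValueMap[SrcG] = DstG;
+  Constant *CE =
+    ConstantExpr::getBitCast(SrcG, PointerType::getUnqual(Type::Int8Ty));
+  Value *Mapped = RemapOperand(CE, ValueMap);
+  // Mapped is now 'bitcast (@DstG to i8*)'.
+}
+#endif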
+
+/// ForceRenaming - The LLVM SymbolTable class autorenames globals that conflict
+/// in the symbol table. This is good for all clients except for us. Go
+/// through the trouble to force this back.
+static void ForceRenaming(GlobalValue *GV, const std::string &Name) {
+ assert(GV->getName() != Name && "Can't force rename to self");
+ ValueSymbolTable &ST = GV->getParent()->getValueSymbolTable();
+
+ // If there is a conflict, rename the conflict.
+ if (GlobalValue *ConflictGV = cast_or_null<GlobalValue>(ST.lookup(Name))) {
+ assert(ConflictGV->hasLocalLinkage() &&
+ "Not conflicting with a static global, should link instead!");
+ GV->takeName(ConflictGV);
+ ConflictGV->setName(Name); // This will cause ConflictGV to get renamed
+ assert(ConflictGV->getName() != Name && "ForceRenaming didn't work");
+ } else {
+ GV->setName(Name); // Force the name back
+ }
+}
+
+/// CopyGVAttributes - copy additional attributes (those not needed to construct
+/// a GlobalValue) from the SrcGV to the DestGV.
+static void CopyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) {
+ // Use the maximum alignment, rather than just copying the alignment of SrcGV.
+ unsigned Alignment = std::max(DestGV->getAlignment(), SrcGV->getAlignment());
+ DestGV->copyAttributesFrom(SrcGV);
+ DestGV->setAlignment(Alignment);
+}
+
+/// GetLinkageResult - This analyzes the two global values and determines what
+/// the result will look like in the destination module. In particular, it
+/// computes the resultant linkage type, computes whether the global in the
+/// source should be copied over to the destination (replacing the existing
+/// one), and computes whether this linkage is an error or not. It also performs
+/// visibility checks: we cannot link together two symbols with different
+/// visibilities.
+static bool GetLinkageResult(GlobalValue *Dest, const GlobalValue *Src,
+ GlobalValue::LinkageTypes &LT, bool &LinkFromSrc,
+ std::string *Err) {
+ assert((!Dest || !Src->hasLocalLinkage()) &&
+ "If Src has internal linkage, Dest shouldn't be set!");
+ if (!Dest) {
+ // Linking something to nothing.
+ LinkFromSrc = true;
+ LT = Src->getLinkage();
+ } else if (Src->isDeclaration()) {
+ // If Src is external or if both Src & Dest are external.. Just link the
+ // external globals, we aren't adding anything.
+ if (Src->hasDLLImportLinkage()) {
+ // If one of the GVs has DLLImport linkage, the result should be dllimport'ed.
+ if (Dest->isDeclaration()) {
+ LinkFromSrc = true;
+ LT = Src->getLinkage();
+ }
+ } else if (Dest->hasExternalWeakLinkage()) {
+ // If the Dest is weak, use the source linkage.
+ LinkFromSrc = true;
+ LT = Src->getLinkage();
+ } else {
+ LinkFromSrc = false;
+ LT = Dest->getLinkage();
+ }
+ } else if (Dest->isDeclaration() && !Dest->hasDLLImportLinkage()) {
+ // If Dest is external but Src is not:
+ LinkFromSrc = true;
+ LT = Src->getLinkage();
+ } else if (Src->hasAppendingLinkage() || Dest->hasAppendingLinkage()) {
+ if (Src->getLinkage() != Dest->getLinkage())
+ return Error(Err, "Linking globals named '" + Src->getName() +
+ "': can only link appending global with another appending global!");
+ LinkFromSrc = true; // Special cased.
+ LT = Src->getLinkage();
+ } else if (Src->isWeakForLinker()) {
+ // At this point we know that Dest has LinkOnce, External*, Weak, Common,
+ // or DLL* linkage.
+ if (Dest->hasExternalWeakLinkage() ||
+ Dest->hasAvailableExternallyLinkage() ||
+ (Dest->hasLinkOnceLinkage() &&
+ (Src->hasWeakLinkage() || Src->hasCommonLinkage()))) {
+ LinkFromSrc = true;
+ LT = Src->getLinkage();
+ } else {
+ LinkFromSrc = false;
+ LT = Dest->getLinkage();
+ }
+ } else if (Dest->isWeakForLinker()) {
+ // At this point we know that Src has External* or DLL* linkage.
+ if (Src->hasExternalWeakLinkage()) {
+ LinkFromSrc = false;
+ LT = Dest->getLinkage();
+ } else {
+ LinkFromSrc = true;
+ LT = GlobalValue::ExternalLinkage;
+ }
+ } else {
+ assert((Dest->hasExternalLinkage() ||
+ Dest->hasDLLImportLinkage() ||
+ Dest->hasDLLExportLinkage() ||
+ Dest->hasExternalWeakLinkage()) &&
+ (Src->hasExternalLinkage() ||
+ Src->hasDLLImportLinkage() ||
+ Src->hasDLLExportLinkage() ||
+ Src->hasExternalWeakLinkage()) &&
+ "Unexpected linkage type!");
+ return Error(Err, "Linking globals named '" + Src->getName() +
+ "': symbol multiply defined!");
+ }
+
+ // Check visibility
+ if (Dest && Src->getVisibility() != Dest->getVisibility())
+ if (!Src->isDeclaration() && !Dest->isDeclaration())
+ return Error(Err, "Linking globals named '" + Src->getName() +
+ "': symbols have different visibilities!");
+ return false;
+}
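+
+// Illustrative sketch (an editor's addition, guarded out): the common
+// "definition wins over declaration" case decided above, assuming the source
+// definition has plain external linkage.
+#if 0
+static void demoLinkageResult(GlobalValue *DeclInDest, GlobalValue *DefInSrc) {
+  GlobalValue::LinkageTypes LT;
+  bool LinkFromSrc;
+  std::string Err;
+  // Dest is only a declaration, so the source definition is copied over and
+  // its linkage kept.
+  if (!GetLinkageResult(DeclInDest, DefInSrc, LT, LinkFromSrc, &Err))
+    assert(LinkFromSrc && LT == DefInSrc->getLinkage());
+}
+#endif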
+
+// LinkGlobals - Loop through the global variables in the src module and merge
+// them into the dest module.
+static bool LinkGlobals(Module *Dest, const Module *Src,
+ std::map<const Value*, Value*> &ValueMap,
+ std::multimap<std::string, GlobalVariable *> &AppendingVars,
+ std::string *Err) {
+ ValueSymbolTable &DestSymTab = Dest->getValueSymbolTable();
+
+ // Loop over all of the globals in the src module, mapping them over as we go
+ for (Module::const_global_iterator I = Src->global_begin(),
+ E = Src->global_end(); I != E; ++I) {
+ const GlobalVariable *SGV = I;
+ GlobalValue *DGV = 0;
+
+ // Check to see if we may have to link the global against a global, alias or
+ // function.
+ if (SGV->hasName() && !SGV->hasLocalLinkage())
+ DGV = cast_or_null<GlobalValue>(DestSymTab.lookup(SGV->getNameStart(),
+ SGV->getNameEnd()));
+
+ // If we found a global with the same name in the dest module, but it has
+ // internal linkage, we are really not doing any linkage here.
+ if (DGV && DGV->hasLocalLinkage())
+ DGV = 0;
+
+ // If types don't agree due to opaque types, try to resolve them.
+ if (DGV && DGV->getType() != SGV->getType())
+ RecursiveResolveTypes(SGV->getType(), DGV->getType());
+
+ assert((SGV->hasInitializer() || SGV->hasExternalWeakLinkage() ||
+ SGV->hasExternalLinkage() || SGV->hasDLLImportLinkage()) &&
+ "Global must either be external or have an initializer!");
+
+ GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
+ bool LinkFromSrc = false;
+ if (GetLinkageResult(DGV, SGV, NewLinkage, LinkFromSrc, Err))
+ return true;
+
+ if (DGV == 0) {
+ // No linking to be performed, simply create an identical version of the
+ // symbol over in the dest module... the initializer will be filled in
+ // later by LinkGlobalInits.
+ GlobalVariable *NewDGV =
+ new GlobalVariable(SGV->getType()->getElementType(),
+ SGV->isConstant(), SGV->getLinkage(), /*init*/0,
+ SGV->getName(), Dest, false,
+ SGV->getType()->getAddressSpace());
+ // Propagate alignment, visibility and section info.
+ CopyGVAttributes(NewDGV, SGV);
+
+ // If the LLVM runtime renamed the global, but it is an externally visible
+ // symbol, DGV must be an existing global with internal linkage. Rename
+ // it.
+ if (!NewDGV->hasLocalLinkage() && NewDGV->getName() != SGV->getName())
+ ForceRenaming(NewDGV, SGV->getName());
+
+ // Make sure to remember this mapping.
+ ValueMap[SGV] = NewDGV;
+
+ // Keep track that this is an appending variable.
+ if (SGV->hasAppendingLinkage())
+ AppendingVars.insert(std::make_pair(SGV->getName(), NewDGV));
+ continue;
+ }
+
+ // If the visibilities of the symbols disagree and the destination is a
+ // prototype, take the visibility of its input.
+ if (DGV->isDeclaration())
+ DGV->setVisibility(SGV->getVisibility());
+
+ if (DGV->hasAppendingLinkage()) {
+ // No linking is performed yet. Just insert a new copy of the global, and
+ // keep track of the fact that it is an appending variable in the
+ // AppendingVars map. The name is cleared out so that no linkage is
+ // performed.
+ GlobalVariable *NewDGV =
+ new GlobalVariable(SGV->getType()->getElementType(),
+ SGV->isConstant(), SGV->getLinkage(), /*init*/0,
+ "", Dest, false,
+ SGV->getType()->getAddressSpace());
+
+ // Set the alignment so CopyGVAttributes can merge it with SGV's alignment.
+ NewDGV->setAlignment(DGV->getAlignment());
+ // Propagate alignment, section and visibility info.
+ CopyGVAttributes(NewDGV, SGV);
+
+ // Make sure to remember this mapping...
+ ValueMap[SGV] = NewDGV;
+
+ // Keep track that this is an appending variable...
+ AppendingVars.insert(std::make_pair(SGV->getName(), NewDGV));
+ continue;
+ }
+
+ if (LinkFromSrc) {
+ if (isa<GlobalAlias>(DGV))
+ return Error(Err, "Global-Alias Collision on '" + SGV->getName() +
+ "': symbol multiple defined");
+
+ // If the types don't match, and if we are to link from the source, nuke
+ // DGV and create a new one of the appropriate type. Note that the thing
+ // we are replacing may be a function (if a prototype, weak, etc) or a
+ // global variable.
+ GlobalVariable *NewDGV =
+ new GlobalVariable(SGV->getType()->getElementType(), SGV->isConstant(),
+ NewLinkage, /*init*/0, DGV->getName(), Dest, false,
+ SGV->getType()->getAddressSpace());
+
+ // Propagate alignment, section, and visibility info.
+ CopyGVAttributes(NewDGV, SGV);
+ DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType()));
+
+ // DGV will conflict with NewDGV because they both had the same
+ // name. We must erase this now so ForceRenaming doesn't assert
+ // because DGV might not have internal linkage.
+ if (GlobalVariable *Var = dyn_cast<GlobalVariable>(DGV))
+ Var->eraseFromParent();
+ else
+ cast<Function>(DGV)->eraseFromParent();
+ DGV = NewDGV;
+
+ // If the symbol table renamed the global, but it is an externally visible
+ // symbol, DGV must be an existing global with internal linkage. Rename.
+ if (NewDGV->getName() != SGV->getName() && !NewDGV->hasLocalLinkage())
+ ForceRenaming(NewDGV, SGV->getName());
+
+ // Inherit const as appropriate.
+ NewDGV->setConstant(SGV->isConstant());
+
+ // Make sure to remember this mapping.
+ ValueMap[SGV] = NewDGV;
+ continue;
+ }
+
+ // Not "link from source", keep the one in the DestModule and remap the
+ // input onto it.
+
+ // Special case for const propagation.
+ if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV))
+ if (DGVar->isDeclaration() && SGV->isConstant() && !DGVar->isConstant())
+ DGVar->setConstant(true);
+
+ // SGV is a global, but DGV is an alias.
+ if (isa<GlobalAlias>(DGV)) {
+ // The only valid mappings are:
+ // - SGV is external declaration, which is effectively a no-op.
+ // - SGV is weak, when we just need to throw SGV out.
+ if (!SGV->isDeclaration() && !SGV->isWeakForLinker())
+ return Error(Err, "Global-Alias Collision on '" + SGV->getName() +
+ "': symbol multiple defined");
+ }
+
+ // Set calculated linkage
+ DGV->setLinkage(NewLinkage);
+
+ // Make sure to remember this mapping...
+ ValueMap[SGV] = ConstantExpr::getBitCast(DGV, SGV->getType());
+ }
+ return false;
+}
+
+static GlobalValue::LinkageTypes
+CalculateAliasLinkage(const GlobalValue *SGV, const GlobalValue *DGV) {
+ GlobalValue::LinkageTypes SL = SGV->getLinkage();
+ GlobalValue::LinkageTypes DL = DGV->getLinkage();
+ if (SL == GlobalValue::ExternalLinkage || DL == GlobalValue::ExternalLinkage)
+ return GlobalValue::ExternalLinkage;
+ else if (SL == GlobalValue::WeakAnyLinkage ||
+ DL == GlobalValue::WeakAnyLinkage)
+ return GlobalValue::WeakAnyLinkage;
+ else if (SL == GlobalValue::WeakODRLinkage ||
+ DL == GlobalValue::WeakODRLinkage)
+ return GlobalValue::WeakODRLinkage;
+ else if (SL == GlobalValue::InternalLinkage &&
+ DL == GlobalValue::InternalLinkage)
+ return GlobalValue::InternalLinkage;
+ else {
+ assert (SL == GlobalValue::PrivateLinkage &&
+ DL == GlobalValue::PrivateLinkage && "Unexpected linkage type");
+ return GlobalValue::PrivateLinkage;
+ }
+}
+
+// LinkAlias - Loop through the aliases in the src module and link them into
+// the dest module. We're assuming that all functions/global variables were
+// already linked in.
+static bool LinkAlias(Module *Dest, const Module *Src,
+ std::map<const Value*, Value*> &ValueMap,
+ std::string *Err) {
+ // Loop over all aliases in the src module
+ for (Module::const_alias_iterator I = Src->alias_begin(),
+ E = Src->alias_end(); I != E; ++I) {
+ const GlobalAlias *SGA = I;
+ const GlobalValue *SAliasee = SGA->getAliasedGlobal();
+ GlobalAlias *NewGA = NULL;
+
+ // Globals were already linked, thus we can just query ValueMap for the
+ // variant of SAliasee in Dest.
+ std::map<const Value*,Value*>::const_iterator VMI = ValueMap.find(SAliasee);
+ assert(VMI != ValueMap.end() && "Aliasee not linked");
+ GlobalValue* DAliasee = cast<GlobalValue>(VMI->second);
+ GlobalValue* DGV = NULL;
+
+ // Try to find something 'similar' to SGA in destination module.
+ if (!DGV && !SGA->hasLocalLinkage()) {
+ DGV = Dest->getNamedAlias(SGA->getName());
+
+ // If types don't agree due to opaque types, try to resolve them.
+ if (DGV && DGV->getType() != SGA->getType())
+ RecursiveResolveTypes(SGA->getType(), DGV->getType());
+ }
+
+ if (!DGV && !SGA->hasLocalLinkage()) {
+ DGV = Dest->getGlobalVariable(SGA->getName());
+
+ // If types don't agree due to opaque types, try to resolve them.
+ if (DGV && DGV->getType() != SGA->getType())
+ RecursiveResolveTypes(SGA->getType(), DGV->getType());
+ }
+
+ if (!DGV && !SGA->hasLocalLinkage()) {
+ DGV = Dest->getFunction(SGA->getName());
+
+ // If types don't agree due to opaque types, try to resolve them.
+ if (DGV && DGV->getType() != SGA->getType())
+ RecursiveResolveTypes(SGA->getType(), DGV->getType());
+ }
+
+ // No linking to be performed on internal stuff.
+ if (DGV && DGV->hasLocalLinkage())
+ DGV = NULL;
+
+ if (GlobalAlias *DGA = dyn_cast_or_null<GlobalAlias>(DGV)) {
+ // Types are known to be the same, check whether the aliasees are equal. As
+ // globals are already linked we just need to query ValueMap to find the
+ // mapping.
+ if (DAliasee == DGA->getAliasedGlobal()) {
+ // This is just two copies of the same alias. Propagate linkage, if
+ // necessary.
+ DGA->setLinkage(CalculateAliasLinkage(SGA, DGA));
+
+ NewGA = DGA;
+ // Proceed to 'common' steps
+ } else
+ return Error(Err, "Alias Collision on '" + SGA->getName()+
+ "': aliases have different aliasees");
+ } else if (GlobalVariable *DGVar = dyn_cast_or_null<GlobalVariable>(DGV)) {
+ // The only allowed way is to link an alias with an external declaration or
+ // weak symbol...
+ if (DGVar->isDeclaration() || DGVar->isWeakForLinker()) {
+ // But only if aliasee is global too...
+ if (!isa<GlobalVariable>(DAliasee))
+ return Error(Err, "Global-Alias Collision on '" + SGA->getName() +
+ "': aliasee is not global variable");
+
+ NewGA = new GlobalAlias(SGA->getType(), SGA->getLinkage(),
+ SGA->getName(), DAliasee, Dest);
+ CopyGVAttributes(NewGA, SGA);
+
+ // Any uses of DGV need to change to NewGA, with cast, if needed.
+ if (SGA->getType() != DGVar->getType())
+ DGVar->replaceAllUsesWith(ConstantExpr::getBitCast(NewGA,
+ DGVar->getType()));
+ else
+ DGVar->replaceAllUsesWith(NewGA);
+
+ // DGVar will conflict with NewGA because they both had the same
+ // name. We must erase this now so ForceRenaming doesn't assert
+ // because DGV might not have internal linkage.
+ DGVar->eraseFromParent();
+
+ // Proceed to 'common' steps
+ } else
+ return Error(Err, "Global-Alias Collision on '" + SGA->getName() +
+ "': symbol multiple defined");
+ } else if (Function *DF = dyn_cast_or_null<Function>(DGV)) {
+ // The only allowed way is to link an alias with an external declaration or
+ // weak symbol...
+ if (DF->isDeclaration() || DF->isWeakForLinker()) {
+ // But only if aliasee is function too...
+ if (!isa<Function>(DAliasee))
+ return Error(Err, "Function-Alias Collision on '" + SGA->getName() +
+ "': aliasee is not function");
+
+ NewGA = new GlobalAlias(SGA->getType(), SGA->getLinkage(),
+ SGA->getName(), DAliasee, Dest);
+ CopyGVAttributes(NewGA, SGA);
+
+ // Any uses of DF need to change to NewGA, with cast, if needed.
+ if (SGA->getType() != DF->getType())
+ DF->replaceAllUsesWith(ConstantExpr::getBitCast(NewGA,
+ DF->getType()));
+ else
+ DF->replaceAllUsesWith(NewGA);
+
+ // DF will conflict with NewGA because they both had the same
+ // name. We must erase this now so ForceRenaming doesn't assert
+ // because DF might not have internal linkage.
+ DF->eraseFromParent();
+
+ // Proceed to 'common' steps
+ } else
+ return Error(Err, "Function-Alias Collision on '" + SGA->getName() +
+ "': symbol multiple defined");
+ } else {
+ // No linking to be performed, simply create an identical version of the
+ // alias over in the dest module...
+
+ NewGA = new GlobalAlias(SGA->getType(), SGA->getLinkage(),
+ SGA->getName(), DAliasee, Dest);
+ CopyGVAttributes(NewGA, SGA);
+
+ // Proceed to 'common' steps
+ }
+
+ assert(NewGA && "No alias was created in destination module!");
+
+ // If the symbol table renamed the alias, but it is an externally visible
+ // symbol, DGA must be a global value with internal linkage. Rename it.
+ if (NewGA->getName() != SGA->getName() &&
+ !NewGA->hasLocalLinkage())
+ ForceRenaming(NewGA, SGA->getName());
+
+ // Remember this mapping so uses in the source module get remapped
+ // later by RemapOperand.
+ ValueMap[SGA] = NewGA;
+ }
+
+ return false;
+}
+
+
+// LinkGlobalInits - Update the initializers in the Dest module now that all
+// globals that may be referenced are in Dest.
+static bool LinkGlobalInits(Module *Dest, const Module *Src,
+ std::map<const Value*, Value*> &ValueMap,
+ std::string *Err) {
+ // Loop over all of the globals in the src module, mapping them over as we go
+ for (Module::const_global_iterator I = Src->global_begin(),
+ E = Src->global_end(); I != E; ++I) {
+ const GlobalVariable *SGV = I;
+
+ if (SGV->hasInitializer()) { // Only process initialized GV's
+ // Figure out what the initializer looks like in the dest module...
+ Constant *SInit =
+ cast<Constant>(RemapOperand(SGV->getInitializer(), ValueMap));
+ // Grab destination global variable or alias.
+ GlobalValue *DGV = cast<GlobalValue>(ValueMap[SGV]->stripPointerCasts());
+
+ // If dest is a global variable, check that the initializers match.
+ if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV)) {
+ if (DGVar->hasInitializer()) {
+ if (SGV->hasExternalLinkage()) {
+ if (DGVar->getInitializer() != SInit)
+ return Error(Err, "Global Variable Collision on '" +
+ SGV->getName() +
+ "': global variables have different initializers");
+ } else if (DGVar->isWeakForLinker()) {
+ // Nothing is required, mapped values will take the new global
+ // automatically.
+ } else if (SGV->isWeakForLinker()) {
+ // Nothing is required, mapped values will take the new global
+ // automatically.
+ } else if (DGVar->hasAppendingLinkage()) {
+ assert(0 && "Appending linkage unimplemented!");
+ } else {
+ assert(0 && "Unknown linkage!");
+ }
+ } else {
+ // Copy the initializer over now...
+ DGVar->setInitializer(SInit);
+ }
+ } else {
+ // Destination is alias, the only valid situation is when source is
+ // weak. Also note that we already checked linkage in LinkGlobals(),
+ // thus we assert here.
+ // FIXME: Should we weaken this assumption, 'dereference' alias and
+ // check for initializer of aliasee?
+ assert(SGV->isWeakForLinker());
+ }
+ }
+ }
+ return false;
+}
+
+// LinkFunctionProtos - Link the functions together between the two modules,
+// without doing function bodies... this just adds external function prototypes
+// to the Dest module...
+//
+static bool LinkFunctionProtos(Module *Dest, const Module *Src,
+ std::map<const Value*, Value*> &ValueMap,
+ std::string *Err) {
+ ValueSymbolTable &DestSymTab = Dest->getValueSymbolTable();
+
+ // Loop over all of the functions in the src module, mapping them over
+ for (Module::const_iterator I = Src->begin(), E = Src->end(); I != E; ++I) {
+ const Function *SF = I; // SrcFunction
+ GlobalValue *DGV = 0;
+
+ // Check to see if we may have to link the function against a global, alias or
+ // function.
+ if (SF->hasName() && !SF->hasLocalLinkage())
+ DGV = cast_or_null<GlobalValue>(DestSymTab.lookup(SF->getNameStart(),
+ SF->getNameEnd()));
+
+ // If we found a global with the same name in the dest module, but it has
+ // internal linkage, we are really not doing any linkage here.
+ if (DGV && DGV->hasLocalLinkage())
+ DGV = 0;
+
+ // If types don't agree due to opaque types, try to resolve them.
+ if (DGV && DGV->getType() != SF->getType())
+ RecursiveResolveTypes(SF->getType(), DGV->getType());
+
+ GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
+ bool LinkFromSrc = false;
+ if (GetLinkageResult(DGV, SF, NewLinkage, LinkFromSrc, Err))
+ return true;
+
+ // If there is no linkage to be performed, just bring over SF without
+ // modifying it.
+ if (DGV == 0) {
+ // Function does not already exist, simply insert a function signature
+ // identical to SF into the dest module.
+ Function *NewDF = Function::Create(SF->getFunctionType(),
+ SF->getLinkage(),
+ SF->getName(), Dest);
+ CopyGVAttributes(NewDF, SF);
+
+ // If the LLVM runtime renamed the function, but it is an externally
+ // visible symbol, DF must be an existing function with internal linkage.
+ // Rename it.
+ if (!NewDF->hasLocalLinkage() && NewDF->getName() != SF->getName())
+ ForceRenaming(NewDF, SF->getName());
+
+ // ... and remember this mapping...
+ ValueMap[SF] = NewDF;
+ continue;
+ }
+
+ // If the visibilities of the symbols disagree and the destination is a
+ // prototype, take the visibility of its input.
+ if (DGV->isDeclaration())
+ DGV->setVisibility(SF->getVisibility());
+
+ if (LinkFromSrc) {
+ if (isa<GlobalAlias>(DGV))
+ return Error(Err, "Function-Alias Collision on '" + SF->getName() +
+ "': symbol multiple defined");
+
+ // We have a definition of the same name but different type in the
+ // source module. Copy the prototype to the destination and replace
+ // uses of the destination's prototype with the new prototype.
+ Function *NewDF = Function::Create(SF->getFunctionType(), NewLinkage,
+ SF->getName(), Dest);
+ CopyGVAttributes(NewDF, SF);
+
+ // Any uses of DF need to change to NewDF, with cast
+ DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDF, DGV->getType()));
+
+ // DF will conflict with NewDF because they both had the same name. We must
+ // erase this now so ForceRenaming doesn't assert because DF might
+ // not have internal linkage.
+ if (GlobalVariable *Var = dyn_cast<GlobalVariable>(DGV))
+ Var->eraseFromParent();
+ else
+ cast<Function>(DGV)->eraseFromParent();
+
+ // If the symbol table renamed the function, but it is an externally
+ // visible symbol, DF must be an existing function with internal
+ // linkage. Rename it.
+ if (NewDF->getName() != SF->getName() && !NewDF->hasLocalLinkage())
+ ForceRenaming(NewDF, SF->getName());
+
+ // Remember this mapping so uses in the source module get remapped
+ // later by RemapOperand.
+ ValueMap[SF] = NewDF;
+ continue;
+ }
+
+ // Not "link from source", keep the one in the DestModule and remap the
+ // input onto it.
+
+ if (isa<GlobalAlias>(DGV)) {
+ // The only valid mappings are:
+ // - SF is external declaration, which is effectively a no-op.
+ // - SF is weak, when we just need to throw SF out.
+ if (!SF->isDeclaration() && !SF->isWeakForLinker())
+ return Error(Err, "Function-Alias Collision on '" + SF->getName() +
+ "': symbol multiple defined");
+ }
+
+ // Set calculated linkage
+ DGV->setLinkage(NewLinkage);
+
+ // Make sure to remember this mapping.
+ ValueMap[SF] = ConstantExpr::getBitCast(DGV, SF->getType());
+ }
+ return false;
+}
+
+// LinkFunctionBody - Copy the source function over into the dest function and
+// fix up references to values. At this point we know that Dest is an external
+// function, and that Src is not.
+static bool LinkFunctionBody(Function *Dest, Function *Src,
+ std::map<const Value*, Value*> &ValueMap,
+ std::string *Err) {
+ assert(Src && Dest && Dest->isDeclaration() && !Src->isDeclaration());
+
+ // Go through and convert function arguments over, remembering the mapping.
+ Function::arg_iterator DI = Dest->arg_begin();
+ for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end();
+ I != E; ++I, ++DI) {
+ DI->setName(I->getName()); // Copy the name information over...
+
+ // Add a mapping to our local map
+ ValueMap[I] = DI;
+ }
+
+ // Splice the body of the source function into the dest function.
+ Dest->getBasicBlockList().splice(Dest->end(), Src->getBasicBlockList());
+
+ // At this point, all of the instructions and values of the function are now
+ // copied over. The only problem is that they are still referencing values in
+ // the Source function as operands. Loop through all of the operands of the
+ // functions and patch them up to point to the local versions...
+ //
+ for (Function::iterator BB = Dest->begin(), BE = Dest->end(); BB != BE; ++BB)
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ if (!isa<Instruction>(*OI) && !isa<BasicBlock>(*OI))
+ *OI = RemapOperand(*OI, ValueMap);
+
+ // There is no need to map the arguments anymore.
+ for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end();
+ I != E; ++I)
+ ValueMap.erase(I);
+
+ return false;
+}
+
+
+// LinkFunctionBodies - Link in the function bodies that are defined in the
+// source module into the DestModule. This consists basically of copying the
+// function over and fixing up references to values.
+static bool LinkFunctionBodies(Module *Dest, Module *Src,
+ std::map<const Value*, Value*> &ValueMap,
+ std::string *Err) {
+
+ // Loop over all of the functions in the src module, mapping them over as we
+ // go
+ for (Module::iterator SF = Src->begin(), E = Src->end(); SF != E; ++SF) {
+ if (!SF->isDeclaration()) { // No body if function is external
+ Function *DF = dyn_cast<Function>(ValueMap[SF]); // Destination function
+
+ // DF not external, SF external?
+ if (DF && DF->isDeclaration())
+ // Only provide the function body if there isn't one already.
+ if (LinkFunctionBody(DF, SF, ValueMap, Err))
+ return true;
+ }
+ }
+ return false;
+}
+
+// LinkAppendingVars - If there were any appending global variables, link them
+// together now. Return true on error.
+static bool LinkAppendingVars(Module *M,
+ std::multimap<std::string, GlobalVariable *> &AppendingVars,
+ std::string *ErrorMsg) {
+ if (AppendingVars.empty()) return false; // Nothing to do.
+
+ // Loop over the multimap of appending vars, processing any variables with the
+ // same name, forming a new appending global variable with both of the
+ // initializers merged together, then rewriting references to the old
+ // variables and deleting them.
+ std::vector<Constant*> Inits;
+ while (AppendingVars.size() > 1) {
+ // Get the first two elements in the map...
+ std::multimap<std::string,
+ GlobalVariable*>::iterator Second = AppendingVars.begin(), First=Second++;
+
+ // If the first two elements are for different names, there is no pair...
+ // Otherwise there is a pair, so link them together...
+ if (First->first == Second->first) {
+ GlobalVariable *G1 = First->second, *G2 = Second->second;
+ const ArrayType *T1 = cast<ArrayType>(G1->getType()->getElementType());
+ const ArrayType *T2 = cast<ArrayType>(G2->getType()->getElementType());
+
+ // Check to see that the two arrays agree on type...
+ if (T1->getElementType() != T2->getElementType())
+ return Error(ErrorMsg,
+ "Appending variables with different element types need to be linked!");
+ if (G1->isConstant() != G2->isConstant())
+ return Error(ErrorMsg,
+ "Appending variables linked with different const'ness!");
+
+ if (G1->getAlignment() != G2->getAlignment())
+ return Error(ErrorMsg,
+ "Appending variables with different alignment need to be linked!");
+
+ if (G1->getVisibility() != G2->getVisibility())
+ return Error(ErrorMsg,
+ "Appending variables with different visibility need to be linked!");
+
+ if (G1->getSection() != G2->getSection())
+ return Error(ErrorMsg,
+ "Appending variables with different section name need to be linked!");
+
+ unsigned NewSize = T1->getNumElements() + T2->getNumElements();
+ ArrayType *NewType = ArrayType::get(T1->getElementType(), NewSize);
+
+ G1->setName(""); // Clear G1's name in case of a conflict!
+
+ // Create the new global variable...
+ GlobalVariable *NG =
+ new GlobalVariable(NewType, G1->isConstant(), G1->getLinkage(),
+ /*init*/0, First->first, M, G1->isThreadLocal(),
+ G1->getType()->getAddressSpace());
+
+ // Propagate alignment, visibility and section info.
+ CopyGVAttributes(NG, G1);
+
+ // Merge the initializer...
+ Inits.reserve(NewSize);
+ if (ConstantArray *I = dyn_cast<ConstantArray>(G1->getInitializer())) {
+ for (unsigned i = 0, e = T1->getNumElements(); i != e; ++i)
+ Inits.push_back(I->getOperand(i));
+ } else {
+ assert(isa<ConstantAggregateZero>(G1->getInitializer()));
+ Constant *CV = Constant::getNullValue(T1->getElementType());
+ for (unsigned i = 0, e = T1->getNumElements(); i != e; ++i)
+ Inits.push_back(CV);
+ }
+ if (ConstantArray *I = dyn_cast<ConstantArray>(G2->getInitializer())) {
+ for (unsigned i = 0, e = T2->getNumElements(); i != e; ++i)
+ Inits.push_back(I->getOperand(i));
+ } else {
+ assert(isa<ConstantAggregateZero>(G2->getInitializer()));
+ Constant *CV = Constant::getNullValue(T2->getElementType());
+ for (unsigned i = 0, e = T2->getNumElements(); i != e; ++i)
+ Inits.push_back(CV);
+ }
+ NG->setInitializer(ConstantArray::get(NewType, Inits));
+ Inits.clear();
+
+ // Replace any uses of the two global variables with uses of the new
+ // global...
+
+ // FIXME: This should rewrite simple/straight-forward uses such as
+ // getelementptr instructions to not use the Cast!
+ G1->replaceAllUsesWith(ConstantExpr::getBitCast(NG, G1->getType()));
+ G2->replaceAllUsesWith(ConstantExpr::getBitCast(NG, G2->getType()));
+
+ // Remove the two globals from the module now...
+ M->getGlobalList().erase(G1);
+ M->getGlobalList().erase(G2);
+
+ // Put the new global into the AppendingVars map so that we can handle
+ // linking of more than two vars...
+ Second->second = NG;
+ }
+ AppendingVars.erase(First);
+ }
+
+ return false;
+}
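+
+// Illustrative sketch (hypothetical Elem type standing in for one
+// llvm::Constant* element): the merge above simply concatenates the two
+// initializer lists, expanding zero initializers element-by-element.
+#if 0
+#include <vector>
+typedef int Elem;
+static std::vector<Elem> mergeAppending(const std::vector<Elem> &I1,
+ const std::vector<Elem> &I2) {
+ std::vector<Elem> Merged;
+ Merged.reserve(I1.size() + I2.size());
+ Merged.insert(Merged.end(), I1.begin(), I1.end());
+ Merged.insert(Merged.end(), I2.begin(), I2.end());
+ return Merged; // New [N1+N2 x T] initializer.
+}
+#endif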
+
+static bool ResolveAliases(Module *Dest) {
+ for (Module::alias_iterator I = Dest->alias_begin(), E = Dest->alias_end();
+ I != E; ++I)
+ if (const GlobalValue *GV = I->resolveAliasedGlobal())
+ if (GV != I && !GV->isDeclaration())
+ I->replaceAllUsesWith(const_cast<GlobalValue*>(GV));
+
+ return false;
+}
+
+// LinkModules - This function links two modules together, with the resulting
+// left module modified to be the composite of the two input modules. If an
+// error occurs, true is returned and ErrorMsg (if not null) is set to indicate
+// the problem. Upon failure, the Dest module could be in a modified state, and
+// shouldn't be relied on to be consistent.
+bool
+Linker::LinkModules(Module *Dest, Module *Src, std::string *ErrorMsg) {
+ assert(Dest != 0 && "Invalid Destination module");
+ assert(Src != 0 && "Invalid Source Module");
+
+ if (Dest->getDataLayout().empty()) {
+ if (!Src->getDataLayout().empty()) {
+ Dest->setDataLayout(Src->getDataLayout());
+ } else {
+ std::string DataLayout;
+
+ if (Dest->getEndianness() == Module::AnyEndianness) {
+ if (Src->getEndianness() == Module::BigEndian)
+ DataLayout.append("E");
+ else if (Src->getEndianness() == Module::LittleEndian)
+ DataLayout.append("e");
+ }
+
+ if (Dest->getPointerSize() == Module::AnyPointerSize) {
+ if (Src->getPointerSize() == Module::Pointer64)
+ DataLayout.append(DataLayout.length() == 0 ? "p:64:64" : "-p:64:64");
+ else if (Src->getPointerSize() == Module::Pointer32)
+ DataLayout.append(DataLayout.length() == 0 ? "p:32:32" : "-p:32:32");
+ }
+ Dest->setDataLayout(DataLayout);
+ }
+ }
+
+ // Copy the target triple from the source to dest if the dest's is empty.
+ if (Dest->getTargetTriple().empty() && !Src->getTargetTriple().empty())
+ Dest->setTargetTriple(Src->getTargetTriple());
+
+ if (!Src->getDataLayout().empty() && !Dest->getDataLayout().empty() &&
+ Src->getDataLayout() != Dest->getDataLayout())
+ cerr << "WARNING: Linking two modules of different data layouts!\n";
+ if (!Src->getTargetTriple().empty() &&
+ Dest->getTargetTriple() != Src->getTargetTriple())
+ cerr << "WARNING: Linking two modules of different target triples!\n";
+
+ // Append the module inline asm string.
+ if (!Src->getModuleInlineAsm().empty()) {
+ if (Dest->getModuleInlineAsm().empty())
+ Dest->setModuleInlineAsm(Src->getModuleInlineAsm());
+ else
+ Dest->setModuleInlineAsm(Dest->getModuleInlineAsm()+"\n"+
+ Src->getModuleInlineAsm());
+ }
+
+ // Update the destination module's dependent libraries list with the libraries
+ // from the source module. There's no opportunity for duplicates here as the
+ // Module ensures that duplicate insertions are discarded.
+ for (Module::lib_iterator SI = Src->lib_begin(), SE = Src->lib_end();
+ SI != SE; ++SI)
+ Dest->addLibrary(*SI);
+
+ // LinkTypes - Go through the symbol table of the Src module and see if any
+ // types are named in the src module that are not named in the Dst module.
+ // Make sure there are no type name conflicts.
+ if (LinkTypes(Dest, Src, ErrorMsg))
+ return true;
+
+ // ValueMap - Mapping of values from what they used to be in Src, to what they
+ // are now in Dest.
+ std::map<const Value*, Value*> ValueMap;
+
+ // AppendingVars - Keep track of global variables in the destination module
+ // with appending linkage. After the module is linked together, they are
+ // appended and the module is rewritten.
+ std::multimap<std::string, GlobalVariable *> AppendingVars;
+ for (Module::global_iterator I = Dest->global_begin(), E = Dest->global_end();
+ I != E; ++I) {
+ // Add all of the appending globals already in the Dest module to
+ // AppendingVars.
+ if (I->hasAppendingLinkage())
+ AppendingVars.insert(std::make_pair(I->getName(), I));
+ }
+
+ // Insert all of the globals in src into the Dest module... without linking
+ // initializers (which could refer to functions not yet mapped over).
+ if (LinkGlobals(Dest, Src, ValueMap, AppendingVars, ErrorMsg))
+ return true;
+
+ // Link the functions together between the two modules, without doing function
+ // bodies... this just adds external function prototypes to the Dest
+ // module... We do this so that when we begin processing function bodies,
+ // all of the global values that may be referenced are available in our
+ // ValueMap.
+ if (LinkFunctionProtos(Dest, Src, ValueMap, ErrorMsg))
+ return true;
+
+ // If there were any aliases, link them now. We need to do this now because
+ // all of the aliases that may be referenced must be available in the
+ // ValueMap.
+ if (LinkAlias(Dest, Src, ValueMap, ErrorMsg)) return true;
+
+ // Update the initializers in the Dest module now that all globals that may
+ // be referenced are in Dest.
+ if (LinkGlobalInits(Dest, Src, ValueMap, ErrorMsg)) return true;
+
+ // Link in the function bodies that are defined in the source module into the
+ // DestModule. This consists basically of copying the function over and
+ // fixing up references to values.
+ if (LinkFunctionBodies(Dest, Src, ValueMap, ErrorMsg)) return true;
+
+ // If there were any appending global variables, link them together now.
+ if (LinkAppendingVars(Dest, AppendingVars, ErrorMsg)) return true;
+
+ // Resolve all uses of aliases with aliasees
+ if (ResolveAliases(Dest)) return true;
+
+ // If the source library's module id is in the dependent library list of the
+ // destination module, remove it since that module is now linked in.
+ sys::Path modId;
+ modId.set(Src->getModuleIdentifier());
+ if (!modId.isEmpty())
+ Dest->removeLibrary(modId.getBasename());
+
+ return false;
+}
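+
+// Minimal caller's-eye sketch of the entry point above; how Dest and Src
+// are obtained is elided/hypothetical.
+#if 0
+std::string Err;
+if (Linker::LinkModules(Dest, Src, &Err))
+ cerr << "link failed: " << Err << "\n"; // Dest may now be inconsistent.
+#endif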
+
+// vim: sw=2
diff --git a/lib/Linker/Linker.cpp b/lib/Linker/Linker.cpp
new file mode 100644
index 0000000..d673772
--- /dev/null
+++ b/lib/Linker/Linker.cpp
@@ -0,0 +1,178 @@
+//===- lib/Linker/Linker.cpp - Basic Linker functionality ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains basic Linker functionality that all usages will need.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Linker.h"
+#include "llvm/Module.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+
+Linker::Linker(const std::string& progname, const std::string& modname,
+ unsigned flags)
+ : Composite(0)
+ , LibPaths()
+ , Flags(flags)
+ , Error()
+ , ProgramName(progname)
+{
+ Composite = new Module(modname);
+}
+
+Linker::Linker(const std::string& progname, Module* aModule, unsigned flags)
+ : Composite(aModule)
+ , LibPaths()
+ , Flags(flags)
+ , Error()
+ , ProgramName(progname)
+{
+}
+
+Linker::~Linker() {
+ delete Composite;
+}
+
+bool
+Linker::error(const std::string& message) {
+ Error = message;
+ if (!(Flags&QuietErrors))
+ cerr << ProgramName << ": error: " << message << "\n";
+ return true;
+}
+
+bool
+Linker::warning(const std::string& message) {
+ Error = message;
+ if (!(Flags&QuietWarnings))
+ cerr << ProgramName << ": warning: " << message << "\n";
+ return false;
+}
+
+void
+Linker::verbose(const std::string& message) {
+ if (Flags&Verbose)
+ cerr << " " << message << "\n";
+}
+
+void
+Linker::addPath(const sys::Path& path) {
+ LibPaths.push_back(path);
+}
+
+void
+Linker::addPaths(const std::vector<std::string>& paths) {
+ for (unsigned i = 0; i != paths.size(); ++i) {
+ sys::Path aPath;
+ aPath.set(paths[i]);
+ LibPaths.push_back(aPath);
+ }
+}
+
+void
+Linker::addSystemPaths() {
+ sys::Path::GetBitcodeLibraryPaths(LibPaths);
+ LibPaths.insert(LibPaths.begin(),sys::Path("./"));
+}
+
+Module*
+Linker::releaseModule() {
+ Module* result = Composite;
+ LibPaths.clear();
+ Error.clear();
+ Composite = 0;
+ Flags = 0;
+ return result;
+}
+
+// LoadObject - Read in and parse the bitcode file named by FN and return the
+// module it contains (wrapped in an auto_ptr), or auto_ptr<Module>() and set
+// Error if an error occurs.
+std::auto_ptr<Module>
+Linker::LoadObject(const sys::Path &FN) {
+ std::string ParseErrorMessage;
+ Module *Result = 0;
+
+ const std::string &FNS = FN.toString();
+ std::auto_ptr<MemoryBuffer> Buffer(MemoryBuffer::getFileOrSTDIN(FNS.c_str()));
+ if (Buffer.get())
+ Result = ParseBitcodeFile(Buffer.get(), &ParseErrorMessage);
+ else
+ ParseErrorMessage = "Error reading file '" + FNS + "'";
+
+ if (Result)
+ return std::auto_ptr<Module>(Result);
+ Error = "Bitcode file '" + FN.toString() + "' could not be loaded";
+ if (ParseErrorMessage.size())
+ Error += ": " + ParseErrorMessage;
+ return std::auto_ptr<Module>();
+}
+
+// IsLibrary - Determine if "Name" is a library in "Directory". Return
+// a non-empty sys::Path if it's found, an empty one otherwise.
+static inline sys::Path IsLibrary(const std::string& Name,
+ const sys::Path& Directory) {
+
+ sys::Path FullPath(Directory);
+
+ // Try the libX.a form
+ FullPath.appendComponent("lib" + Name);
+ FullPath.appendSuffix("a");
+ if (FullPath.isArchive())
+ return FullPath;
+
+ // Try the libX.bca form
+ FullPath.eraseSuffix();
+ FullPath.appendSuffix("bca");
+ if (FullPath.isArchive())
+ return FullPath;
+
+ // Try the libX.so (or .dylib) form
+ FullPath.eraseSuffix();
+ FullPath.appendSuffix(&(LTDL_SHLIB_EXT[1]));
+ if (FullPath.isDynamicLibrary()) // Native shared library?
+ return FullPath;
+ if (FullPath.isBitcodeFile()) // .so file containing bitcode?
+ return FullPath;
+
+ // Not found... fall through.
+
+ // Indicate that the library was not found in the directory.
+ FullPath.clear();
+ return FullPath;
+}
+
+/// FindLib - Try to convert Filename into the name of a file that we can open,
+/// if it does not already name a file we can open, by first trying to open
+/// Filename, then libFilename.[suffix] for each of a set of several common
+/// library suffixes, in each of the directories in LibPaths. Returns an empty
+/// Path if no matching file can be found.
+///
+sys::Path
+Linker::FindLib(const std::string &Filename) {
+ // Determine if the pathname can be found as it stands.
+ sys::Path FilePath(Filename);
+ if (FilePath.canRead() &&
+ (FilePath.isArchive() || FilePath.isDynamicLibrary()))
+ return FilePath;
+
+ // Iterate over the directories in Paths to see if we can find the library
+ // there.
+ for (unsigned Index = 0; Index != LibPaths.size(); ++Index) {
+ sys::Path Directory(LibPaths[Index]);
+ sys::Path FullPath = IsLibrary(Filename,Directory);
+ if (!FullPath.isEmpty())
+ return FullPath;
+ }
+ return sys::Path();
+}
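+
+// Illustrative use of the search logic above (hypothetical program and
+// library names).
+#if 0
+Linker TheLinker("myprog", "composite", 0);
+TheLinker.addSystemPaths(); // "./" plus the bitcode library paths.
+sys::Path Lib = TheLinker.FindLib("m"); // Probes libm.a, libm.bca, libm.so.
+if (Lib.isEmpty())
+ cerr << "library not found\n";
+#endif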
diff --git a/lib/Linker/Makefile b/lib/Linker/Makefile
new file mode 100644
index 0000000..19e646b
--- /dev/null
+++ b/lib/Linker/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Linker/Makefile ---------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMLinker
+BUILD_ARCHIVE := 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Makefile b/lib/Makefile
new file mode 100644
index 0000000..8dd67d9
--- /dev/null
+++ b/lib/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Makefile ----------------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ..
+
+PARALLEL_DIRS = VMCore AsmParser Bitcode Archive Analysis Transforms CodeGen \
+ Target ExecutionEngine Debugger Linker CompilerDriver
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
new file mode 100644
index 0000000..3b03c54
--- /dev/null
+++ b/lib/Support/APFloat.cpp
@@ -0,0 +1,2950 @@
+//===-- APFloat.cpp - Implement APFloat class -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a class to represent arbitrary precision floating
+// point values and provide a variety of arithmetic operations on them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Support/MathExtras.h"
+#include <cstring>
+
+using namespace llvm;
+
+#define convolve(lhs, rhs) ((lhs) * 4 + (rhs))
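+/* With the four fltCategory values encoded as 0..3, this maps each
+ (lhs, rhs) category pair to a distinct value in 0..15 -- e.g.
+ convolve(fcZero, fcInfinity) != convolve(fcInfinity, fcZero) -- so the
+ special-case handlers below can switch over category pairs directly. */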
+
+/* Assumed in hexadecimal significand parsing, and conversion to
+ hexadecimal strings. */
+#define COMPILE_TIME_ASSERT(cond) extern int CTAssert[(cond) ? 1 : -1]
+COMPILE_TIME_ASSERT(integerPartWidth % 4 == 0);
+
+namespace llvm {
+
+ /* Represents floating point arithmetic semantics. */
+ struct fltSemantics {
+ /* The largest E such that 2^E is representable; this matches the
+ definition of IEEE 754. */
+ exponent_t maxExponent;
+
+ /* The smallest E such that 2^E is a normalized number; this
+ matches the definition of IEEE 754. */
+ exponent_t minExponent;
+
+ /* Number of bits in the significand. This includes the integer
+ bit. */
+ unsigned int precision;
+
+ /* True if arithmetic is supported. */
+ unsigned int arithmeticOK;
+ };
+
+ const fltSemantics APFloat::IEEEsingle = { 127, -126, 24, true };
+ const fltSemantics APFloat::IEEEdouble = { 1023, -1022, 53, true };
+ const fltSemantics APFloat::IEEEquad = { 16383, -16382, 113, true };
+ const fltSemantics APFloat::x87DoubleExtended = { 16383, -16382, 64, true };
+ const fltSemantics APFloat::Bogus = { 0, 0, 0, true };
+
+ // The PowerPC format consists of two doubles. It does not map cleanly
+ // onto the usual format above. For now only storage of constants of
+ // this type is supported, no arithmetic.
+ const fltSemantics APFloat::PPCDoubleDouble = { 1023, -1022, 106, false };
+
+ /* A tight upper bound on number of parts required to hold the value
+ pow(5, power) is
+
+ power * 815 / (351 * integerPartWidth) + 1
+
+ However, whilst the result may require only this many parts,
+ because we are multiplying two values to get it, the
+ multiplication may require an extra part with the excess part
+ being zero (consider the trivial case of 1 * 1, tcFullMultiply
+ requires two parts to hold the single-part result). So we add an
+ extra one to guarantee enough space whilst multiplying. */
+ const unsigned int maxExponent = 16383;
+ const unsigned int maxPrecision = 113;
+ const unsigned int maxPowerOfFiveExponent = maxExponent + maxPrecision - 1;
+ const unsigned int maxPowerOfFiveParts = 2 + ((maxPowerOfFiveExponent * 815)
+ / (351 * integerPartWidth));
+}
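+
+/* Worked instance of the bound above: with 64-bit integerParts,
+ maxPowerOfFiveExponent = 16383 + 113 - 1 = 16495, so
+ maxPowerOfFiveParts = 2 + (16495 * 815) / (351 * 64)
+ = 2 + 13443425 / 22464 = 2 + 598 = 600 parts. */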
+
+/* A bunch of private, handy routines. */
+
+static inline unsigned int
+partCountForBits(unsigned int bits)
+{
+ return ((bits) + integerPartWidth - 1) / integerPartWidth;
+}
+
+/* Returns 0U-9U. Return values >= 10U are not digits. */
+static inline unsigned int
+decDigitValue(unsigned int c)
+{
+ return c - '0';
+}
+
+static unsigned int
+hexDigitValue(unsigned int c)
+{
+ unsigned int r;
+
+ r = c - '0';
+ if(r <= 9)
+ return r;
+
+ r = c - 'A';
+ if(r <= 5)
+ return r + 10;
+
+ r = c - 'a';
+ if(r <= 5)
+ return r + 10;
+
+ return -1U;
+}
+
+static inline void
+assertArithmeticOK(const llvm::fltSemantics &semantics) {
+ assert(semantics.arithmeticOK
+ && "Compile-time arithmetic does not support these semantics");
+}
+
+/* Return the value of a decimal exponent of the form
+ [+-]ddddddd.
+
+ If the exponent overflows, returns a large exponent with the
+ appropriate sign. */
+static int
+readExponent(const char *p)
+{
+ bool isNegative;
+ unsigned int absExponent;
+ const unsigned int overlargeExponent = 24000; /* FIXME. */
+
+ isNegative = (*p == '-');
+ if (*p == '-' || *p == '+')
+ p++;
+
+ absExponent = decDigitValue(*p++);
+ assert (absExponent < 10U);
+
+ for (;;) {
+ unsigned int value;
+
+ value = decDigitValue(*p);
+ if (value >= 10U)
+ break;
+
+ p++;
+ value += absExponent * 10;
+ if (absExponent >= overlargeExponent) {
+ absExponent = overlargeExponent;
+ break;
+ }
+ absExponent = value;
+ }
+
+ if (isNegative)
+ return -(int) absExponent;
+ else
+ return (int) absExponent;
+}
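+
+/* Worked examples for the routine above: readExponent("+12") == 12 and
+ readExponent("-3") == -3, while a huge input such as "99999999" is
+ clamped to 24000 rather than overflowing. */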
+
+/* This is ugly and needs cleaning up, but I don't immediately see
+ how whilst remaining safe. */
+static int
+totalExponent(const char *p, int exponentAdjustment)
+{
+ int unsignedExponent;
+ bool negative, overflow;
+ int exponent;
+
+ /* Move past the exponent letter and sign to the digits. */
+ p++;
+ negative = *p == '-';
+ if(*p == '-' || *p == '+')
+ p++;
+
+ unsignedExponent = 0;
+ overflow = false;
+ for(;;) {
+ unsigned int value;
+
+ value = decDigitValue(*p);
+ if(value >= 10U)
+ break;
+
+ p++;
+ unsignedExponent = unsignedExponent * 10 + value;
+ if(unsignedExponent > 65535)
+ overflow = true;
+ }
+
+ if(exponentAdjustment > 65535 || exponentAdjustment < -65536)
+ overflow = true;
+
+ if(!overflow) {
+ exponent = unsignedExponent;
+ if(negative)
+ exponent = -exponent;
+ exponent += exponentAdjustment;
+ if(exponent > 65535 || exponent < -65536)
+ overflow = true;
+ }
+
+ if(overflow)
+ exponent = negative ? -65536: 65535;
+
+ return exponent;
+}
+
+static const char *
+skipLeadingZeroesAndAnyDot(const char *p, const char **dot)
+{
+ *dot = 0;
+ while(*p == '0')
+ p++;
+
+ if(*p == '.') {
+ *dot = p++;
+ while(*p == '0')
+ p++;
+ }
+
+ return p;
+}
+
+/* Given a normal decimal floating point number of the form
+
+ dddd.dddd[eE][+-]ddd
+
+ where the decimal point and exponent are optional, fill out the
+ structure D. Exponent is appropriate if the significand is
+ treated as an integer, and normalizedExponent if the significand
+ is taken to have the decimal point after a single leading
+ non-zero digit.
+
+ If the value is zero, D->firstSigDigit points to a non-digit, and
+ the return exponent is zero.
+*/
+struct decimalInfo {
+ const char *firstSigDigit;
+ const char *lastSigDigit;
+ int exponent;
+ int normalizedExponent;
+};
+
+static void
+interpretDecimal(const char *p, decimalInfo *D)
+{
+ const char *dot;
+
+ p = skipLeadingZeroesAndAnyDot (p, &dot);
+
+ D->firstSigDigit = p;
+ D->exponent = 0;
+ D->normalizedExponent = 0;
+
+ for (;;) {
+ if (*p == '.') {
+ assert(dot == 0);
+ dot = p++;
+ }
+ if (decDigitValue(*p) >= 10U)
+ break;
+ p++;
+ }
+
+ /* If the number is all zeroes, accept any exponent. */
+ if (p != D->firstSigDigit) {
+ if (*p == 'e' || *p == 'E')
+ D->exponent = readExponent(p + 1);
+
+ /* Implied decimal point? */
+ if (!dot)
+ dot = p;
+
+ /* Drop insignificant trailing zeroes. */
+ do
+ do
+ p--;
+ while (*p == '0');
+ while (*p == '.');
+
+ /* Adjust the exponents for any decimal point. */
+ D->exponent += static_cast<exponent_t>((dot - p) - (dot > p));
+ D->normalizedExponent = (D->exponent +
+ static_cast<exponent_t>((p - D->firstSigDigit)
+ - (dot > D->firstSigDigit && dot < p)));
+ }
+
+ D->lastSigDigit = p;
+}
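+
+/* Worked example for the routine above: for "0.0625e2" the significant
+ digits are "625" and the written exponent is 2; the decimal point lies
+ four digits left of lastSigDigit, so D->exponent = 2 - 4 = -2
+ (625e-2 == 6.25) and D->normalizedExponent = -2 + 2 = 0 (6.25e0). */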
+
+/* Return the trailing fraction of a hexadecimal number.
+ DIGITVALUE is the first hex digit of the fraction, P points to
+ the next digit. */
+static lostFraction
+trailingHexadecimalFraction(const char *p, unsigned int digitValue)
+{
+ unsigned int hexDigit;
+
+ /* If the first trailing digit isn't 0 or 8 we can work out the
+ fraction immediately. */
+ if(digitValue > 8)
+ return lfMoreThanHalf;
+ else if(digitValue < 8 && digitValue > 0)
+ return lfLessThanHalf;
+
+ /* Otherwise we need to find the first non-zero digit. */
+ while(*p == '0')
+ p++;
+
+ hexDigit = hexDigitValue(*p);
+
+ /* If we ran off the end it is exactly zero or one-half, otherwise
+ a little more. */
+ if(hexDigit == -1U)
+ return digitValue == 0 ? lfExactlyZero: lfExactlyHalf;
+ else
+ return digitValue == 0 ? lfLessThanHalf: lfMoreThanHalf;
+}
+
+/* Return the fraction lost were a bignum truncated losing the least
+ significant BITS bits. */
+static lostFraction
+lostFractionThroughTruncation(const integerPart *parts,
+ unsigned int partCount,
+ unsigned int bits)
+{
+ unsigned int lsb;
+
+ lsb = APInt::tcLSB(parts, partCount);
+
+ /* Note this is guaranteed true if bits == 0, or LSB == -1U. */
+ if(bits <= lsb)
+ return lfExactlyZero;
+ if(bits == lsb + 1)
+ return lfExactlyHalf;
+ if(bits <= partCount * integerPartWidth
+ && APInt::tcExtractBit(parts, bits - 1))
+ return lfMoreThanHalf;
+
+ return lfLessThanHalf;
+}
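+
+/* Worked example: for a one-part bignum holding 4 (binary 100, lsb == 2),
+ truncating 2 bits loses 00 -> lfExactlyZero; truncating 3 bits loses
+ 100 -> lfExactlyHalf (bits == lsb + 1); truncating 4 bits loses 0100,
+ a quarter -> lfLessThanHalf. */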
+
+/* Shift DST right BITS bits noting lost fraction. */
+static lostFraction
+shiftRight(integerPart *dst, unsigned int parts, unsigned int bits)
+{
+ lostFraction lost_fraction;
+
+ lost_fraction = lostFractionThroughTruncation(dst, parts, bits);
+
+ APInt::tcShiftRight(dst, parts, bits);
+
+ return lost_fraction;
+}
+
+/* Combine the effect of two lost fractions. */
+static lostFraction
+combineLostFractions(lostFraction moreSignificant,
+ lostFraction lessSignificant)
+{
+ if(lessSignificant != lfExactlyZero) {
+ if(moreSignificant == lfExactlyZero)
+ moreSignificant = lfLessThanHalf;
+ else if(moreSignificant == lfExactlyHalf)
+ moreSignificant = lfMoreThanHalf;
+ }
+
+ return moreSignificant;
+}
+
+/* The error from the true value, in half-ulps, on multiplying two
+ floating point numbers, which differ from the value they
+ approximate by at most HUE1 and HUE2 half-ulps, is strictly less
+ than the returned value.
+
+ See "How to Read Floating Point Numbers Accurately" by William D
+ Clinger. */
+static unsigned int
+HUerrBound(bool inexactMultiply, unsigned int HUerr1, unsigned int HUerr2)
+{
+ assert(HUerr1 < 2 || HUerr2 < 2 || (HUerr1 + HUerr2 < 8));
+
+ if (HUerr1 + HUerr2 == 0)
+ return inexactMultiply * 2; /* <= inexactMultiply half-ulps. */
+ else
+ return inexactMultiply + 2 * (HUerr1 + HUerr2);
+}
+
+/* The number of ulps from the boundary (zero, or half if ISNEAREST)
+ when the least significant BITS are truncated. BITS cannot be
+ zero. */
+static integerPart
+ulpsFromBoundary(const integerPart *parts, unsigned int bits, bool isNearest)
+{
+ unsigned int count, partBits;
+ integerPart part, boundary;
+
+ assert (bits != 0);
+
+ bits--;
+ count = bits / integerPartWidth;
+ partBits = bits % integerPartWidth + 1;
+
+ part = parts[count] & (~(integerPart) 0 >> (integerPartWidth - partBits));
+
+ if (isNearest)
+ boundary = (integerPart) 1 << (partBits - 1);
+ else
+ boundary = 0;
+
+ if (count == 0) {
+ if (part - boundary <= boundary - part)
+ return part - boundary;
+ else
+ return boundary - part;
+ }
+
+ if (part == boundary) {
+ while (--count)
+ if (parts[count])
+ return ~(integerPart) 0; /* A lot. */
+
+ return parts[0];
+ } else if (part == boundary - 1) {
+ while (--count)
+ if (~parts[count])
+ return ~(integerPart) 0; /* A lot. */
+
+ return -parts[0];
+ }
+
+ return ~(integerPart) 0; /* A lot. */
+}
+
+/* Place pow(5, power) in DST, and return the number of parts used.
+ DST must be at least one part larger than the size of the answer. */
+static unsigned int
+powerOf5(integerPart *dst, unsigned int power)
+{
+ static const integerPart firstEightPowers[] = { 1, 5, 25, 125, 625, 3125,
+ 15625, 78125 };
+ integerPart pow5s[maxPowerOfFiveParts * 2 + 5];
+ pow5s[0] = 78125 * 5;
+
+ unsigned int partsCount[16] = { 1 };
+ integerPart scratch[maxPowerOfFiveParts], *p1, *p2, *pow5;
+ unsigned int result;
+ assert(power <= maxExponent);
+
+ p1 = dst;
+ p2 = scratch;
+
+ *p1 = firstEightPowers[power & 7];
+ power >>= 3;
+
+ result = 1;
+ pow5 = pow5s;
+
+ for (unsigned int n = 0; power; power >>= 1, n++) {
+ unsigned int pc;
+
+ pc = partsCount[n];
+
+ /* Calculate pow(5,pow(2,n+3)) if we haven't yet. */
+ if (pc == 0) {
+ pc = partsCount[n - 1];
+ APInt::tcFullMultiply(pow5, pow5 - pc, pow5 - pc, pc, pc);
+ pc *= 2;
+ if (pow5[pc - 1] == 0)
+ pc--;
+ partsCount[n] = pc;
+ }
+
+ if (power & 1) {
+ integerPart *tmp;
+
+ APInt::tcFullMultiply(p2, p1, pow5, result, pc);
+ result += pc;
+ if (p2[result - 1] == 0)
+ result--;
+
+ /* After the swap below the result is in p1 with RESULT
+ parts and p2 is scratch space. */
+ tmp = p1, p1 = p2, p2 = tmp;
+ }
+
+ pow5 += pc;
+ }
+
+ if (p1 != dst)
+ APInt::tcAssign(dst, p1, result);
+
+ return result;
+}
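+
+/* Worked trace for power == 12: the low three bits select
+ firstEightPowers[4] == 625 == pow(5,4); the remaining set bit multiplies
+ in pow5s[0] == 390625 == pow(5,8), so dst ends up holding
+ pow(5,12) == 244140625 via binary exponentiation. */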
+
+/* Zero at the end to avoid modular arithmetic when adding one; used
+ when rounding up during hexadecimal output. */
+static const char hexDigitsLower[] = "0123456789abcdef0";
+static const char hexDigitsUpper[] = "0123456789ABCDEF0";
+static const char infinityL[] = "infinity";
+static const char infinityU[] = "INFINITY";
+static const char NaNL[] = "nan";
+static const char NaNU[] = "NAN";
+
+/* Write out an integerPart in hexadecimal, starting with the most
+ significant nibble. Write out exactly COUNT hex digits, return
+ COUNT. */
+static unsigned int
+partAsHex (char *dst, integerPart part, unsigned int count,
+ const char *hexDigitChars)
+{
+ unsigned int result = count;
+
+ assert (count != 0 && count <= integerPartWidth / 4);
+
+ part >>= (integerPartWidth - 4 * count);
+ while (count--) {
+ dst[count] = hexDigitChars[part & 0xf];
+ part >>= 4;
+ }
+
+ return result;
+}
+
+/* Write out an unsigned decimal integer. */
+static char *
+writeUnsignedDecimal (char *dst, unsigned int n)
+{
+ char buff[40], *p;
+
+ p = buff;
+ do
+ *p++ = '0' + n % 10;
+ while (n /= 10);
+
+ do
+ *dst++ = *--p;
+ while (p != buff);
+
+ return dst;
+}
+
+/* Write out a signed decimal integer. */
+static char *
+writeSignedDecimal (char *dst, int value)
+{
+ if (value < 0) {
+ *dst++ = '-';
+ dst = writeUnsignedDecimal(dst, -(unsigned) value);
+ } else
+ dst = writeUnsignedDecimal(dst, value);
+
+ return dst;
+}
+
+/* Constructors. */
+void
+APFloat::initialize(const fltSemantics *ourSemantics)
+{
+ unsigned int count;
+
+ semantics = ourSemantics;
+ count = partCount();
+ if(count > 1)
+ significand.parts = new integerPart[count];
+}
+
+void
+APFloat::freeSignificand()
+{
+ if(partCount() > 1)
+ delete [] significand.parts;
+}
+
+void
+APFloat::assign(const APFloat &rhs)
+{
+ assert(semantics == rhs.semantics);
+
+ sign = rhs.sign;
+ category = rhs.category;
+ exponent = rhs.exponent;
+ sign2 = rhs.sign2;
+ exponent2 = rhs.exponent2;
+ if(category == fcNormal || category == fcNaN)
+ copySignificand(rhs);
+}
+
+void
+APFloat::copySignificand(const APFloat &rhs)
+{
+ assert(category == fcNormal || category == fcNaN);
+ assert(rhs.partCount() >= partCount());
+
+ APInt::tcAssign(significandParts(), rhs.significandParts(),
+ partCount());
+}
+
+/* Make this number a NaN, with an arbitrary but deterministic value
+ for the significand. If double or longer, this is a signalling NaN,
+ which may not be ideal. If float, this is QNaN(0). */
+void
+APFloat::makeNaN(unsigned type)
+{
+ category = fcNaN;
+ // FIXME: Add double and long double support for QNaN(0).
+ if (semantics->precision == 24 && semantics->maxExponent == 127) {
+ type |= 0x7fc00000U;
+ type &= ~0x80000000U;
+ } else
+ type = ~0U;
+ APInt::tcSet(significandParts(), type, partCount());
+}
+
+APFloat &
+APFloat::operator=(const APFloat &rhs)
+{
+ if(this != &rhs) {
+ if(semantics != rhs.semantics) {
+ freeSignificand();
+ initialize(rhs.semantics);
+ }
+ assign(rhs);
+ }
+
+ return *this;
+}
+
+bool
+APFloat::bitwiseIsEqual(const APFloat &rhs) const {
+ if (this == &rhs)
+ return true;
+ if (semantics != rhs.semantics ||
+ category != rhs.category ||
+ sign != rhs.sign)
+ return false;
+ if (semantics==(const llvm::fltSemantics*)&PPCDoubleDouble &&
+ sign2 != rhs.sign2)
+ return false;
+ if (category==fcZero || category==fcInfinity)
+ return true;
+ else if (category==fcNormal && exponent!=rhs.exponent)
+ return false;
+ else if (semantics==(const llvm::fltSemantics*)&PPCDoubleDouble &&
+ exponent2!=rhs.exponent2)
+ return false;
+ else {
+ int i= partCount();
+ const integerPart* p=significandParts();
+ const integerPart* q=rhs.significandParts();
+ for (; i>0; i--, p++, q++) {
+ if (*p != *q)
+ return false;
+ }
+ return true;
+ }
+}
+
+APFloat::APFloat(const fltSemantics &ourSemantics, integerPart value)
+{
+ assertArithmeticOK(ourSemantics);
+ initialize(&ourSemantics);
+ sign = 0;
+ zeroSignificand();
+ exponent = ourSemantics.precision - 1;
+ significandParts()[0] = value;
+ normalize(rmNearestTiesToEven, lfExactlyZero);
+}
+
+APFloat::APFloat(const fltSemantics &ourSemantics,
+ fltCategory ourCategory, bool negative, unsigned type)
+{
+ assertArithmeticOK(ourSemantics);
+ initialize(&ourSemantics);
+ category = ourCategory;
+ sign = negative;
+ if (category == fcNormal)
+ category = fcZero;
+ else if (ourCategory == fcNaN)
+ makeNaN(type);
+}
+
+APFloat::APFloat(const fltSemantics &ourSemantics, const char *text)
+{
+ assertArithmeticOK(ourSemantics);
+ initialize(&ourSemantics);
+ convertFromString(text, rmNearestTiesToEven);
+}
+
+APFloat::APFloat(const APFloat &rhs)
+{
+ initialize(rhs.semantics);
+ assign(rhs);
+}
+
+APFloat::~APFloat()
+{
+ freeSignificand();
+}
+
+// Profile - This method 'profiles' an APFloat for use with FoldingSet.
+void APFloat::Profile(FoldingSetNodeID& ID) const {
+ ID.Add(bitcastToAPInt());
+}
+
+unsigned int
+APFloat::partCount() const
+{
+ return partCountForBits(semantics->precision + 1);
+}
+
+unsigned int
+APFloat::semanticsPrecision(const fltSemantics &semantics)
+{
+ return semantics.precision;
+}
+
+const integerPart *
+APFloat::significandParts() const
+{
+ return const_cast<APFloat *>(this)->significandParts();
+}
+
+integerPart *
+APFloat::significandParts()
+{
+ assert(category == fcNormal || category == fcNaN);
+
+ if(partCount() > 1)
+ return significand.parts;
+ else
+ return &significand.part;
+}
+
+void
+APFloat::zeroSignificand()
+{
+ category = fcNormal;
+ APInt::tcSet(significandParts(), 0, partCount());
+}
+
+/* Increment an fcNormal floating point number's significand. */
+void
+APFloat::incrementSignificand()
+{
+ integerPart carry;
+
+ carry = APInt::tcIncrement(significandParts(), partCount());
+
+ /* Our callers should never cause us to overflow. */
+ assert(carry == 0);
+}
+
+/* Add the significand of the RHS. Returns the carry flag. */
+integerPart
+APFloat::addSignificand(const APFloat &rhs)
+{
+ integerPart *parts;
+
+ parts = significandParts();
+
+ assert(semantics == rhs.semantics);
+ assert(exponent == rhs.exponent);
+
+ return APInt::tcAdd(parts, rhs.significandParts(), 0, partCount());
+}
+
+/* Subtract the significand of the RHS with a borrow flag. Returns
+ the borrow flag. */
+integerPart
+APFloat::subtractSignificand(const APFloat &rhs, integerPart borrow)
+{
+ integerPart *parts;
+
+ parts = significandParts();
+
+ assert(semantics == rhs.semantics);
+ assert(exponent == rhs.exponent);
+
+ return APInt::tcSubtract(parts, rhs.significandParts(), borrow,
+ partCount());
+}
+
+/* Multiply the significand of the RHS. If ADDEND is non-NULL, add it
+ on to the full-precision result of the multiplication. Returns the
+ lost fraction. */
+lostFraction
+APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend)
+{
+ unsigned int omsb; // One, not zero, based MSB.
+ unsigned int partsCount, newPartsCount, precision;
+ integerPart *lhsSignificand;
+ integerPart scratch[4];
+ integerPart *fullSignificand;
+ lostFraction lost_fraction;
+ bool ignored;
+
+ assert(semantics == rhs.semantics);
+
+ precision = semantics->precision;
+ newPartsCount = partCountForBits(precision * 2);
+
+ if(newPartsCount > 4)
+ fullSignificand = new integerPart[newPartsCount];
+ else
+ fullSignificand = scratch;
+
+ lhsSignificand = significandParts();
+ partsCount = partCount();
+
+ APInt::tcFullMultiply(fullSignificand, lhsSignificand,
+ rhs.significandParts(), partsCount, partsCount);
+
+ lost_fraction = lfExactlyZero;
+ omsb = APInt::tcMSB(fullSignificand, newPartsCount) + 1;
+ exponent += rhs.exponent;
+
+ if(addend) {
+ Significand savedSignificand = significand;
+ const fltSemantics *savedSemantics = semantics;
+ fltSemantics extendedSemantics;
+ opStatus status;
+ unsigned int extendedPrecision;
+
+ /* Normalize our MSB. */
+ extendedPrecision = precision + precision - 1;
+ if(omsb != extendedPrecision)
+ {
+ APInt::tcShiftLeft(fullSignificand, newPartsCount,
+ extendedPrecision - omsb);
+ exponent -= extendedPrecision - omsb;
+ }
+
+ /* Create new semantics. */
+ extendedSemantics = *semantics;
+ extendedSemantics.precision = extendedPrecision;
+
+ if(newPartsCount == 1)
+ significand.part = fullSignificand[0];
+ else
+ significand.parts = fullSignificand;
+ semantics = &extendedSemantics;
+
+ APFloat extendedAddend(*addend);
+ status = extendedAddend.convert(extendedSemantics, rmTowardZero, &ignored);
+ assert(status == opOK);
+ lost_fraction = addOrSubtractSignificand(extendedAddend, false);
+
+ /* Restore our state. */
+ if(newPartsCount == 1)
+ fullSignificand[0] = significand.part;
+ significand = savedSignificand;
+ semantics = savedSemantics;
+
+ omsb = APInt::tcMSB(fullSignificand, newPartsCount) + 1;
+ }
+
+ exponent -= (precision - 1);
+
+ if(omsb > precision) {
+ unsigned int bits, significantParts;
+ lostFraction lf;
+
+ bits = omsb - precision;
+ significantParts = partCountForBits(omsb);
+ lf = shiftRight(fullSignificand, significantParts, bits);
+ lost_fraction = combineLostFractions(lf, lost_fraction);
+ exponent += bits;
+ }
+
+ APInt::tcAssign(lhsSignificand, fullSignificand, partsCount);
+
+ if(newPartsCount > 4)
+ delete [] fullSignificand;
+
+ return lost_fraction;
+}
+
+/* Divide our significand by the significand of the RHS, leaving the
+ quotient in place. Returns the lost fraction. */
+lostFraction
+APFloat::divideSignificand(const APFloat &rhs)
+{
+ unsigned int bit, i, partsCount;
+ const integerPart *rhsSignificand;
+ integerPart *lhsSignificand, *dividend, *divisor;
+ integerPart scratch[4];
+ lostFraction lost_fraction;
+
+ assert(semantics == rhs.semantics);
+
+ lhsSignificand = significandParts();
+ rhsSignificand = rhs.significandParts();
+ partsCount = partCount();
+
+ if(partsCount > 2)
+ dividend = new integerPart[partsCount * 2];
+ else
+ dividend = scratch;
+
+ divisor = dividend + partsCount;
+
+ /* Copy the dividend and divisor as they will be modified in-place. */
+ for(i = 0; i < partsCount; i++) {
+ dividend[i] = lhsSignificand[i];
+ divisor[i] = rhsSignificand[i];
+ lhsSignificand[i] = 0;
+ }
+
+ exponent -= rhs.exponent;
+
+ unsigned int precision = semantics->precision;
+
+ /* Normalize the divisor. */
+ bit = precision - APInt::tcMSB(divisor, partsCount) - 1;
+ if(bit) {
+ exponent += bit;
+ APInt::tcShiftLeft(divisor, partsCount, bit);
+ }
+
+ /* Normalize the dividend. */
+ bit = precision - APInt::tcMSB(dividend, partsCount) - 1;
+ if(bit) {
+ exponent -= bit;
+ APInt::tcShiftLeft(dividend, partsCount, bit);
+ }
+
+ /* Ensure the dividend >= divisor initially for the loop below.
+ Incidentally, this means that the division loop below is
+ guaranteed to set the integer bit to one. */
+ if(APInt::tcCompare(dividend, divisor, partsCount) < 0) {
+ exponent--;
+ APInt::tcShiftLeft(dividend, partsCount, 1);
+ assert(APInt::tcCompare(dividend, divisor, partsCount) >= 0);
+ }
+
+ /* Long division. */
+ for(bit = precision; bit; bit -= 1) {
+ if(APInt::tcCompare(dividend, divisor, partsCount) >= 0) {
+ APInt::tcSubtract(dividend, divisor, 0, partsCount);
+ APInt::tcSetBit(lhsSignificand, bit - 1);
+ }
+
+ APInt::tcShiftLeft(dividend, partsCount, 1);
+ }
+
+ /* Figure out the lost fraction. */
+ int cmp = APInt::tcCompare(dividend, divisor, partsCount);
+
+ if(cmp > 0)
+ lost_fraction = lfMoreThanHalf;
+ else if(cmp == 0)
+ lost_fraction = lfExactlyHalf;
+ else if(APInt::tcIsZero(dividend, partsCount))
+ lost_fraction = lfExactlyZero;
+ else
+ lost_fraction = lfLessThanHalf;
+
+ if(partsCount > 2)
+ delete [] dividend;
+
+ return lost_fraction;
+}
+
+unsigned int
+APFloat::significandMSB() const
+{
+ return APInt::tcMSB(significandParts(), partCount());
+}
+
+unsigned int
+APFloat::significandLSB() const
+{
+ return APInt::tcLSB(significandParts(), partCount());
+}
+
+/* Note that a zero result is NOT normalized to fcZero. */
+lostFraction
+APFloat::shiftSignificandRight(unsigned int bits)
+{
+ /* Our exponent should not overflow. */
+ assert((exponent_t) (exponent + bits) >= exponent);
+
+ exponent += bits;
+
+ return shiftRight(significandParts(), partCount(), bits);
+}
+
+/* Shift the significand left BITS bits, subtract BITS from its exponent. */
+void
+APFloat::shiftSignificandLeft(unsigned int bits)
+{
+ assert(bits < semantics->precision);
+
+ if(bits) {
+ unsigned int partsCount = partCount();
+
+ APInt::tcShiftLeft(significandParts(), partsCount, bits);
+ exponent -= bits;
+
+ assert(!APInt::tcIsZero(significandParts(), partsCount));
+ }
+}
+
+APFloat::cmpResult
+APFloat::compareAbsoluteValue(const APFloat &rhs) const
+{
+ int compare;
+
+ assert(semantics == rhs.semantics);
+ assert(category == fcNormal);
+ assert(rhs.category == fcNormal);
+
+ compare = exponent - rhs.exponent;
+
+ /* If exponents are equal, do an unsigned bignum comparison of the
+ significands. */
+ if(compare == 0)
+ compare = APInt::tcCompare(significandParts(), rhs.significandParts(),
+ partCount());
+
+ if(compare > 0)
+ return cmpGreaterThan;
+ else if(compare < 0)
+ return cmpLessThan;
+ else
+ return cmpEqual;
+}
+
+/* Handle overflow. Sign is preserved. We either become infinity or
+ the largest finite number. */
+APFloat::opStatus
+APFloat::handleOverflow(roundingMode rounding_mode)
+{
+ /* Infinity? */
+ if(rounding_mode == rmNearestTiesToEven
+ || rounding_mode == rmNearestTiesToAway
+ || (rounding_mode == rmTowardPositive && !sign)
+ || (rounding_mode == rmTowardNegative && sign))
+ {
+ category = fcInfinity;
+ return (opStatus) (opOverflow | opInexact);
+ }
+
+ /* Otherwise we become the largest finite number. */
+ category = fcNormal;
+ exponent = semantics->maxExponent;
+ APInt::tcSetLeastSignificantBits(significandParts(), partCount(),
+ semantics->precision);
+
+ return opInexact;
+}
+
+/* Returns TRUE if, when truncating the current number, with BIT the
+ new LSB, with the given lost fraction and rounding mode, the result
+ would need to be rounded away from zero (i.e., by increasing the
+ significand). This routine must work for fcZero of both signs, and
+ fcNormal numbers. */
+bool
+APFloat::roundAwayFromZero(roundingMode rounding_mode,
+ lostFraction lost_fraction,
+ unsigned int bit) const
+{
+ /* NaNs and infinities should not have lost fractions. */
+ assert(category == fcNormal || category == fcZero);
+
+ /* Current callers never pass this so we don't handle it. */
+ assert(lost_fraction != lfExactlyZero);
+
+ switch (rounding_mode) {
+ default:
+ assert(0);
+
+ case rmNearestTiesToAway:
+ return lost_fraction == lfExactlyHalf || lost_fraction == lfMoreThanHalf;
+
+ case rmNearestTiesToEven:
+ if(lost_fraction == lfMoreThanHalf)
+ return true;
+
+ /* Our zeroes don't have a significand to test. */
+ if(lost_fraction == lfExactlyHalf && category != fcZero)
+ return APInt::tcExtractBit(significandParts(), bit);
+
+ return false;
+
+ case rmTowardZero:
+ return false;
+
+ case rmTowardPositive:
+ return sign == false;
+
+ case rmTowardNegative:
+ return sign == true;
+ }
+}
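+
+/* Example: under rmNearestTiesToEven with lfExactlyHalf, 2.5 rounds to 2
+ (the new LSB is already even, so the routine above returns false) while
+ 3.5 rounds to 4 (LSB odd, returns true); lfMoreThanHalf always rounds
+ away from zero. */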
+
+APFloat::opStatus
+APFloat::normalize(roundingMode rounding_mode,
+ lostFraction lost_fraction)
+{
+ unsigned int omsb; /* One, not zero, based MSB. */
+ int exponentChange;
+
+ if(category != fcNormal)
+ return opOK;
+
+ /* Before rounding normalize the exponent of fcNormal numbers. */
+ omsb = significandMSB() + 1;
+
+ if(omsb) {
+ /* OMSB is numbered from 1. We want to place it in the integer
+ bit numbered PRECISION if possible, with a compensating change in
+ the exponent. */
+ exponentChange = omsb - semantics->precision;
+
+ /* If the resulting exponent is too high, overflow according to
+ the rounding mode. */
+ if(exponent + exponentChange > semantics->maxExponent)
+ return handleOverflow(rounding_mode);
+
+ /* Subnormal numbers have exponent minExponent, and their MSB
+ is forced based on that. */
+ if(exponent + exponentChange < semantics->minExponent)
+ exponentChange = semantics->minExponent - exponent;
+
+ /* Shifting left is easy as we don't lose precision. */
+ if(exponentChange < 0) {
+ assert(lost_fraction == lfExactlyZero);
+
+ shiftSignificandLeft(-exponentChange);
+
+ return opOK;
+ }
+
+ if(exponentChange > 0) {
+ lostFraction lf;
+
+ /* Shift right and capture any new lost fraction. */
+ lf = shiftSignificandRight(exponentChange);
+
+ lost_fraction = combineLostFractions(lf, lost_fraction);
+
+ /* Keep OMSB up-to-date. */
+ if(omsb > (unsigned) exponentChange)
+ omsb -= exponentChange;
+ else
+ omsb = 0;
+ }
+ }
+
+ /* Now round the number according to rounding_mode given the lost
+ fraction. */
+
+ /* As specified in IEEE 754, since we do not trap we do not report
+ underflow for exact results. */
+ if(lost_fraction == lfExactlyZero) {
+ /* Canonicalize zeroes. */
+ if(omsb == 0)
+ category = fcZero;
+
+ return opOK;
+ }
+
+ /* Increment the significand if we're rounding away from zero. */
+ if(roundAwayFromZero(rounding_mode, lost_fraction, 0)) {
+ if(omsb == 0)
+ exponent = semantics->minExponent;
+
+ incrementSignificand();
+ omsb = significandMSB() + 1;
+
+ /* Did the significand increment overflow? */
+ if(omsb == (unsigned) semantics->precision + 1) {
+ /* Renormalize by incrementing the exponent and shifting our
+ significand right one. However if we already have the
+ maximum exponent we overflow to infinity. */
+ if(exponent == semantics->maxExponent) {
+ category = fcInfinity;
+
+ return (opStatus) (opOverflow | opInexact);
+ }
+
+ shiftSignificandRight(1);
+
+ return opInexact;
+ }
+ }
+
+ /* The normal case - we were and are not denormal, and any
+ significand increment above didn't overflow. */
+ if(omsb == semantics->precision)
+ return opInexact;
+
+ /* We have a non-zero denormal. */
+ assert(omsb < semantics->precision);
+
+ /* Canonicalize zeroes. */
+ if(omsb == 0)
+ category = fcZero;
+
+ /* The fcZero case is a denormal that underflowed to zero. */
+ return (opStatus) (opUnderflow | opInexact);
+}
+
+APFloat::opStatus
+APFloat::addOrSubtractSpecials(const APFloat &rhs, bool subtract)
+{
+ switch (convolve(category, rhs.category)) {
+ default:
+ assert(0);
+
+ case convolve(fcNaN, fcZero):
+ case convolve(fcNaN, fcNormal):
+ case convolve(fcNaN, fcInfinity):
+ case convolve(fcNaN, fcNaN):
+ case convolve(fcNormal, fcZero):
+ case convolve(fcInfinity, fcNormal):
+ case convolve(fcInfinity, fcZero):
+ return opOK;
+
+ case convolve(fcZero, fcNaN):
+ case convolve(fcNormal, fcNaN):
+ case convolve(fcInfinity, fcNaN):
+ category = fcNaN;
+ copySignificand(rhs);
+ return opOK;
+
+ case convolve(fcNormal, fcInfinity):
+ case convolve(fcZero, fcInfinity):
+ category = fcInfinity;
+ sign = rhs.sign ^ subtract;
+ return opOK;
+
+ case convolve(fcZero, fcNormal):
+ assign(rhs);
+ sign = rhs.sign ^ subtract;
+ return opOK;
+
+ case convolve(fcZero, fcZero):
+ /* Sign depends on rounding mode; handled by caller. */
+ return opOK;
+
+ case convolve(fcInfinity, fcInfinity):
+ /* Differently signed infinities can only be validly
+ subtracted. */
+ if(((sign ^ rhs.sign)!=0) != subtract) {
+ makeNaN();
+ return opInvalidOp;
+ }
+
+ return opOK;
+
+ case convolve(fcNormal, fcNormal):
+ return opDivByZero;
+ }
+}
+
+/* Add or subtract two normal numbers. */
+lostFraction
+APFloat::addOrSubtractSignificand(const APFloat &rhs, bool subtract)
+{
+ integerPart carry;
+ lostFraction lost_fraction;
+ int bits;
+
+ /* Determine if the operation on the absolute values is effectively
+ an addition or subtraction. */
+ subtract ^= (sign ^ rhs.sign) ? true : false;
+
+ /* Are we bigger exponent-wise than the RHS? */
+ bits = exponent - rhs.exponent;
+
+ /* Subtraction is more subtle than one might naively expect. */
+ if(subtract) {
+ APFloat temp_rhs(rhs);
+ bool reverse;
+
+ if (bits == 0) {
+ reverse = compareAbsoluteValue(temp_rhs) == cmpLessThan;
+ lost_fraction = lfExactlyZero;
+ } else if (bits > 0) {
+ lost_fraction = temp_rhs.shiftSignificandRight(bits - 1);
+ shiftSignificandLeft(1);
+ reverse = false;
+ } else {
+ lost_fraction = shiftSignificandRight(-bits - 1);
+ temp_rhs.shiftSignificandLeft(1);
+ reverse = true;
+ }
+
+ if (reverse) {
+ carry = temp_rhs.subtractSignificand
+ (*this, lost_fraction != lfExactlyZero);
+ copySignificand(temp_rhs);
+ sign = !sign;
+ } else {
+ carry = subtractSignificand
+ (temp_rhs, lost_fraction != lfExactlyZero);
+ }
+
+ /* Invert the lost fraction - it was on the RHS and
+ subtracted. */
+ if(lost_fraction == lfLessThanHalf)
+ lost_fraction = lfMoreThanHalf;
+ else if(lost_fraction == lfMoreThanHalf)
+ lost_fraction = lfLessThanHalf;
+
+ /* The code above is intended to ensure that no borrow is
+ necessary. */
+ assert(!carry);
+ } else {
+ if(bits > 0) {
+ APFloat temp_rhs(rhs);
+
+ lost_fraction = temp_rhs.shiftSignificandRight(bits);
+ carry = addSignificand(temp_rhs);
+ } else {
+ lost_fraction = shiftSignificandRight(-bits);
+ carry = addSignificand(rhs);
+ }
+
+ /* We have a guard bit; generating a carry cannot happen. */
+ assert(!carry);
+ }
+
+ return lost_fraction;
+}
+
+APFloat::opStatus
+APFloat::multiplySpecials(const APFloat &rhs)
+{
+ switch (convolve(category, rhs.category)) {
+ default:
+ assert(0);
+
+ case convolve(fcNaN, fcZero):
+ case convolve(fcNaN, fcNormal):
+ case convolve(fcNaN, fcInfinity):
+ case convolve(fcNaN, fcNaN):
+ return opOK;
+
+ case convolve(fcZero, fcNaN):
+ case convolve(fcNormal, fcNaN):
+ case convolve(fcInfinity, fcNaN):
+ category = fcNaN;
+ copySignificand(rhs);
+ return opOK;
+
+ case convolve(fcNormal, fcInfinity):
+ case convolve(fcInfinity, fcNormal):
+ case convolve(fcInfinity, fcInfinity):
+ category = fcInfinity;
+ return opOK;
+
+ case convolve(fcZero, fcNormal):
+ case convolve(fcNormal, fcZero):
+ case convolve(fcZero, fcZero):
+ category = fcZero;
+ return opOK;
+
+ case convolve(fcZero, fcInfinity):
+ case convolve(fcInfinity, fcZero):
+ makeNaN();
+ return opInvalidOp;
+
+ case convolve(fcNormal, fcNormal):
+ return opOK;
+ }
+}
+
+APFloat::opStatus
+APFloat::divideSpecials(const APFloat &rhs)
+{
+ switch (convolve(category, rhs.category)) {
+ default:
+ assert(0);
+
+ case convolve(fcNaN, fcZero):
+ case convolve(fcNaN, fcNormal):
+ case convolve(fcNaN, fcInfinity):
+ case convolve(fcNaN, fcNaN):
+ case convolve(fcInfinity, fcZero):
+ case convolve(fcInfinity, fcNormal):
+ case convolve(fcZero, fcInfinity):
+ case convolve(fcZero, fcNormal):
+ return opOK;
+
+ case convolve(fcZero, fcNaN):
+ case convolve(fcNormal, fcNaN):
+ case convolve(fcInfinity, fcNaN):
+ category = fcNaN;
+ copySignificand(rhs);
+ return opOK;
+
+ case convolve(fcNormal, fcInfinity):
+ category = fcZero;
+ return opOK;
+
+ case convolve(fcNormal, fcZero):
+ category = fcInfinity;
+ return opDivByZero;
+
+ case convolve(fcInfinity, fcInfinity):
+ case convolve(fcZero, fcZero):
+ makeNaN();
+ return opInvalidOp;
+
+ case convolve(fcNormal, fcNormal):
+ return opOK;
+ }
+}
+
+APFloat::opStatus
+APFloat::modSpecials(const APFloat &rhs)
+{
+ switch (convolve(category, rhs.category)) {
+ default:
+ assert(0);
+
+ case convolve(fcNaN, fcZero):
+ case convolve(fcNaN, fcNormal):
+ case convolve(fcNaN, fcInfinity):
+ case convolve(fcNaN, fcNaN):
+ case convolve(fcZero, fcInfinity):
+ case convolve(fcZero, fcNormal):
+ case convolve(fcNormal, fcInfinity):
+ return opOK;
+
+ case convolve(fcZero, fcNaN):
+ case convolve(fcNormal, fcNaN):
+ case convolve(fcInfinity, fcNaN):
+ category = fcNaN;
+ copySignificand(rhs);
+ return opOK;
+
+ case convolve(fcNormal, fcZero):
+ case convolve(fcInfinity, fcZero):
+ case convolve(fcInfinity, fcNormal):
+ case convolve(fcInfinity, fcInfinity):
+ case convolve(fcZero, fcZero):
+ makeNaN();
+ return opInvalidOp;
+
+ case convolve(fcNormal, fcNormal):
+ return opOK;
+ }
+}
+
+/* Change sign. */
+void
+APFloat::changeSign()
+{
+ /* Look mummy, this one's easy. */
+ sign = !sign;
+}
+
+void
+APFloat::clearSign()
+{
+ /* So is this one. */
+ sign = 0;
+}
+
+void
+APFloat::copySign(const APFloat &rhs)
+{
+ /* And this one. */
+ sign = rhs.sign;
+}
+
+/* Normalized addition or subtraction. */
+APFloat::opStatus
+APFloat::addOrSubtract(const APFloat &rhs, roundingMode rounding_mode,
+ bool subtract)
+{
+ opStatus fs;
+
+ assertArithmeticOK(*semantics);
+
+ fs = addOrSubtractSpecials(rhs, subtract);
+
+ /* This return code means it was not a simple case. */
+ if(fs == opDivByZero) {
+ lostFraction lost_fraction;
+
+ lost_fraction = addOrSubtractSignificand(rhs, subtract);
+ fs = normalize(rounding_mode, lost_fraction);
+
+ /* Can only be zero if we lost no fraction. */
+ assert(category != fcZero || lost_fraction == lfExactlyZero);
+ }
+
+ /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a
+ positive zero unless rounding to minus infinity, except that
+ adding two like-signed zeroes gives that zero. */
+ if(category == fcZero) {
+ if(rhs.category != fcZero || (sign == rhs.sign) == subtract)
+ sign = (rounding_mode == rmTowardNegative);
+ }
+
+ return fs;
+}
+
+/* Normalized addition. */
+APFloat::opStatus
+APFloat::add(const APFloat &rhs, roundingMode rounding_mode)
+{
+ return addOrSubtract(rhs, rounding_mode, false);
+}
+
+/* Normalized subtraction. */
+APFloat::opStatus
+APFloat::subtract(const APFloat &rhs, roundingMode rounding_mode)
+{
+ return addOrSubtract(rhs, rounding_mode, true);
+}
+
+/* Normalized multiply. */
+APFloat::opStatus
+APFloat::multiply(const APFloat &rhs, roundingMode rounding_mode)
+{
+ opStatus fs;
+
+ assertArithmeticOK(*semantics);
+ sign ^= rhs.sign;
+ fs = multiplySpecials(rhs);
+
+ if(category == fcNormal) {
+ lostFraction lost_fraction = multiplySignificand(rhs, 0);
+ fs = normalize(rounding_mode, lost_fraction);
+ if(lost_fraction != lfExactlyZero)
+ fs = (opStatus) (fs | opInexact);
+ }
+
+ return fs;
+}
+
+/* Normalized divide. */
+APFloat::opStatus
+APFloat::divide(const APFloat &rhs, roundingMode rounding_mode)
+{
+ opStatus fs;
+
+ assertArithmeticOK(*semantics);
+ sign ^= rhs.sign;
+ fs = divideSpecials(rhs);
+
+ if(category == fcNormal) {
+ lostFraction lost_fraction = divideSignificand(rhs);
+ fs = normalize(rounding_mode, lost_fraction);
+ if(lost_fraction != lfExactlyZero)
+ fs = (opStatus) (fs | opInexact);
+ }
+
+ return fs;
+}
+
+/* Normalized remainder. This is not currently correct in all cases. */
+APFloat::opStatus
+APFloat::remainder(const APFloat &rhs)
+{
+ opStatus fs;
+ APFloat V = *this;
+ unsigned int origSign = sign;
+
+ assertArithmeticOK(*semantics);
+ fs = V.divide(rhs, rmNearestTiesToEven);
+ if (fs == opDivByZero)
+ return fs;
+
+ int parts = partCount();
+ integerPart *x = new integerPart[parts];
+ bool ignored;
+ fs = V.convertToInteger(x, parts * integerPartWidth, true,
+ rmNearestTiesToEven, &ignored);
+ if (fs==opInvalidOp) {
+ delete[] x; // Don't leak the conversion buffer on early exit.
+ return fs;
+ }
+
+ fs = V.convertFromZeroExtendedInteger(x, parts * integerPartWidth, true,
+ rmNearestTiesToEven);
+ assert(fs==opOK); // should always work
+
+ fs = V.multiply(rhs, rmNearestTiesToEven);
+ assert(fs==opOK || fs==opInexact); // should not overflow or underflow
+
+ fs = subtract(V, rmNearestTiesToEven);
+ assert(fs==opOK || fs==opInexact); // likewise
+
+ if (isZero())
+ sign = origSign; // IEEE754 requires this
+ delete[] x;
+ return fs;
+}
+
+/* Normalized llvm frem (C fmod).
+ This is not currently correct in all cases. */
+APFloat::opStatus
+APFloat::mod(const APFloat &rhs, roundingMode rounding_mode)
+{
+ opStatus fs;
+ assertArithmeticOK(*semantics);
+ fs = modSpecials(rhs);
+
+ if (category == fcNormal && rhs.category == fcNormal) {
+ APFloat V = *this;
+ unsigned int origSign = sign;
+
+ fs = V.divide(rhs, rmNearestTiesToEven);
+ if (fs == opDivByZero)
+ return fs;
+
+ int parts = partCount();
+ integerPart *x = new integerPart[parts];
+ bool ignored;
+ fs = V.convertToInteger(x, parts * integerPartWidth, true,
+ rmTowardZero, &ignored);
+ if (fs==opInvalidOp) {
+ delete[] x; // Don't leak the conversion buffer on early exit.
+ return fs;
+ }
+
+ fs = V.convertFromZeroExtendedInteger(x, parts * integerPartWidth, true,
+ rmNearestTiesToEven);
+ assert(fs==opOK); // should always work
+
+ fs = V.multiply(rhs, rounding_mode);
+ assert(fs==opOK || fs==opInexact); // should not overflow or underflow
+
+ fs = subtract(V, rounding_mode);
+ assert(fs==opOK || fs==opInexact); // likewise
+
+ if (isZero())
+ sign = origSign; // IEEE754 requires this
+ delete[] x;
+ }
+ return fs;
+}
+
+/* Normalized fused-multiply-add. */
+APFloat::opStatus
+APFloat::fusedMultiplyAdd(const APFloat &multiplicand,
+ const APFloat &addend,
+ roundingMode rounding_mode)
+{
+ opStatus fs;
+
+ assertArithmeticOK(*semantics);
+
+ /* Post-multiplication sign, before addition. */
+ sign ^= multiplicand.sign;
+
+ /* If and only if all arguments are normal do we need to do an
+ extended-precision calculation. */
+ if(category == fcNormal
+ && multiplicand.category == fcNormal
+ && addend.category == fcNormal) {
+ lostFraction lost_fraction;
+
+ lost_fraction = multiplySignificand(multiplicand, &addend);
+ fs = normalize(rounding_mode, lost_fraction);
+ if(lost_fraction != lfExactlyZero)
+ fs = (opStatus) (fs | opInexact);
+
+ /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a
+ positive zero unless rounding to minus infinity, except that
+ adding two like-signed zeroes gives that zero. */
+ if(category == fcZero && sign != addend.sign)
+ sign = (rounding_mode == rmTowardNegative);
+ } else {
+ fs = multiplySpecials(multiplicand);
+
+ /* FS can only be opOK or opInvalidOp. There is no more work
+ to do in the latter case. The IEEE-754R standard says it is
+ implementation-defined in this case whether, if ADDEND is a
+ quiet NaN, we raise invalid op; this implementation does so.
+
+ If we need to do the addition we can do so with normal
+ precision. */
+ if(fs == opOK)
+ fs = addOrSubtract(addend, rounding_mode, false);
+ }
+
+ return fs;
+}
+
+/* Comparison requires normalized numbers. */
+APFloat::cmpResult
+APFloat::compare(const APFloat &rhs) const
+{
+ cmpResult result;
+
+ assertArithmeticOK(*semantics);
+ assert(semantics == rhs.semantics);
+
+ switch (convolve(category, rhs.category)) {
+ default:
+ assert(0);
+
+ case convolve(fcNaN, fcZero):
+ case convolve(fcNaN, fcNormal):
+ case convolve(fcNaN, fcInfinity):
+ case convolve(fcNaN, fcNaN):
+ case convolve(fcZero, fcNaN):
+ case convolve(fcNormal, fcNaN):
+ case convolve(fcInfinity, fcNaN):
+ return cmpUnordered;
+
+ case convolve(fcInfinity, fcNormal):
+ case convolve(fcInfinity, fcZero):
+ case convolve(fcNormal, fcZero):
+ if(sign)
+ return cmpLessThan;
+ else
+ return cmpGreaterThan;
+
+ case convolve(fcNormal, fcInfinity):
+ case convolve(fcZero, fcInfinity):
+ case convolve(fcZero, fcNormal):
+ if(rhs.sign)
+ return cmpGreaterThan;
+ else
+ return cmpLessThan;
+
+ case convolve(fcInfinity, fcInfinity):
+ if(sign == rhs.sign)
+ return cmpEqual;
+ else if(sign)
+ return cmpLessThan;
+ else
+ return cmpGreaterThan;
+
+ case convolve(fcZero, fcZero):
+ return cmpEqual;
+
+ case convolve(fcNormal, fcNormal):
+ break;
+ }
+
+ /* Two normal numbers. Do they have the same sign? */
+ if(sign != rhs.sign) {
+ if(sign)
+ result = cmpLessThan;
+ else
+ result = cmpGreaterThan;
+ } else {
+ /* Compare absolute values; invert result if negative. */
+ result = compareAbsoluteValue(rhs);
+
+ if(sign) {
+ if(result == cmpLessThan)
+ result = cmpGreaterThan;
+ else if(result == cmpGreaterThan)
+ result = cmpLessThan;
+ }
+ }
+
+ return result;
+}
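+
+/* Illustrative sketch of the semantics above: any NaN operand yields
+   cmpUnordered, and zeroes compare equal regardless of sign:
+
+     APFloat x(1.0), y(2.0);
+     assert(x.compare(y) == APFloat::cmpLessThan);
+     assert(APFloat(-0.0).compare(APFloat(0.0)) == APFloat::cmpEqual);
+*/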
+
+/// APFloat::convert - convert a value of one floating point type to another.
+/// The return value corresponds to the IEEE754 exceptions. *losesInfo
+/// records whether the transformation lost information, i.e. whether
+/// converting the result back to the original type will not produce the
+/// original value (this is almost the same as return value==opOK, but there
+/// are edge cases where this is not so).
+
+APFloat::opStatus
+APFloat::convert(const fltSemantics &toSemantics,
+ roundingMode rounding_mode, bool *losesInfo)
+{
+ lostFraction lostFraction;
+ unsigned int newPartCount, oldPartCount;
+ opStatus fs;
+
+ assertArithmeticOK(*semantics);
+ assertArithmeticOK(toSemantics);
+ lostFraction = lfExactlyZero;
+ newPartCount = partCountForBits(toSemantics.precision + 1);
+ oldPartCount = partCount();
+
+ /* Handle storage complications. If our new form is wider,
+ re-allocate our bit pattern into wider storage. If it is
+ narrower, we ignore the excess parts, but if narrowing to a
+ single part we need to free the old storage.
+ Be careful not to reference significandParts for zeroes
+ and infinities, since it aborts. */
+ if (newPartCount > oldPartCount) {
+ integerPart *newParts;
+ newParts = new integerPart[newPartCount];
+ APInt::tcSet(newParts, 0, newPartCount);
+ if (category==fcNormal || category==fcNaN)
+ APInt::tcAssign(newParts, significandParts(), oldPartCount);
+ freeSignificand();
+ significand.parts = newParts;
+ } else if (newPartCount < oldPartCount) {
+ /* Capture any lost fraction through truncation of parts so we get
+ correct rounding whilst normalizing. */
+ if (category==fcNormal)
+ lostFraction = lostFractionThroughTruncation
+ (significandParts(), oldPartCount, toSemantics.precision);
+ if (newPartCount == 1) {
+ integerPart newPart = 0;
+ if (category==fcNormal || category==fcNaN)
+ newPart = significandParts()[0];
+ freeSignificand();
+ significand.part = newPart;
+ }
+ }
+
+ if(category == fcNormal) {
+ /* Re-interpret our bit-pattern. */
+ exponent += toSemantics.precision - semantics->precision;
+ semantics = &toSemantics;
+ fs = normalize(rounding_mode, lostFraction);
+ *losesInfo = (fs != opOK);
+ } else if (category == fcNaN) {
+ int shift = toSemantics.precision - semantics->precision;
+ // Do this now so significandParts gets the right answer
+ const fltSemantics *oldSemantics = semantics;
+ semantics = &toSemantics;
+ *losesInfo = false;
+ // No normalization here, just truncate
+ if (shift>0)
+ APInt::tcShiftLeft(significandParts(), newPartCount, shift);
+ else if (shift < 0) {
+ unsigned ushift = -shift;
+ // Figure out if we are losing information. This happens
+      // if we are shifting out something other than 0s, or if the x87 long
+ // double input did not have its integer bit set (pseudo-NaN), or if the
+ // x87 long double input did not have its QNan bit set (because the x87
+ // hardware sets this bit when converting a lower-precision NaN to
+ // x87 long double).
+ if (APInt::tcLSB(significandParts(), newPartCount) < ushift)
+ *losesInfo = true;
+ if (oldSemantics == &APFloat::x87DoubleExtended &&
+ (!(*significandParts() & 0x8000000000000000ULL) ||
+ !(*significandParts() & 0x4000000000000000ULL)))
+ *losesInfo = true;
+ APInt::tcShiftRight(significandParts(), newPartCount, ushift);
+ }
+ // gcc forces the Quiet bit on, which means (float)(double)(float_sNan)
+ // does not give you back the same bits. This is dubious, and we
+ // don't currently do it. You're really supposed to get
+ // an invalid operation signal at runtime, but nobody does that.
+ fs = opOK;
+ } else {
+ semantics = &toSemantics;
+ fs = opOK;
+ *losesInfo = false;
+ }
+
+ return fs;
+}
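+
+/* Illustrative sketch: narrowing a double to single precision.  The
+   nearest float to 1.1 differs from the nearest double, so information
+   is lost:
+
+     APFloat d(1.1);
+     bool losesInfo;
+     APFloat::opStatus st = d.convert(APFloat::IEEEsingle,
+                                      APFloat::rmNearestTiesToEven,
+                                      &losesInfo);
+     // st == APFloat::opInexact and losesInfo == true.
+*/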
+
+/* Convert a floating point number to an integer according to the
+ rounding mode. If the rounded integer value is out of range this
+ returns an invalid operation exception and the contents of the
+ destination parts are unspecified. If the rounded value is in
+ range but the floating point number is not the exact integer, the C
+ standard doesn't require an inexact exception to be raised. IEEE
+ 854 does require it so we do that.
+
+ Note that for conversions to integer type the C standard requires
+ round-to-zero to always be used. */
+APFloat::opStatus
+APFloat::convertToSignExtendedInteger(integerPart *parts, unsigned int width,
+ bool isSigned,
+ roundingMode rounding_mode,
+ bool *isExact) const
+{
+ lostFraction lost_fraction;
+ const integerPart *src;
+ unsigned int dstPartsCount, truncatedBits;
+
+ assertArithmeticOK(*semantics);
+
+ *isExact = false;
+
+ /* Handle the three special cases first. */
+ if(category == fcInfinity || category == fcNaN)
+ return opInvalidOp;
+
+ dstPartsCount = partCountForBits(width);
+
+ if(category == fcZero) {
+ APInt::tcSet(parts, 0, dstPartsCount);
+ // Negative zero can't be represented as an int.
+ *isExact = !sign;
+ return opOK;
+ }
+
+ src = significandParts();
+
+ /* Step 1: place our absolute value, with any fraction truncated, in
+ the destination. */
+ if (exponent < 0) {
+ /* Our absolute value is less than one; truncate everything. */
+ APInt::tcSet(parts, 0, dstPartsCount);
+ /* For exponent -1 the integer bit represents .5, look at that.
+ For smaller exponents leftmost truncated bit is 0. */
+ truncatedBits = semantics->precision -1U - exponent;
+ } else {
+ /* We want the most significant (exponent + 1) bits; the rest are
+ truncated. */
+ unsigned int bits = exponent + 1U;
+
+ /* Hopelessly large in magnitude? */
+ if (bits > width)
+ return opInvalidOp;
+
+ if (bits < semantics->precision) {
+ /* We truncate (semantics->precision - bits) bits. */
+ truncatedBits = semantics->precision - bits;
+ APInt::tcExtract(parts, dstPartsCount, src, bits, truncatedBits);
+ } else {
+ /* We want at least as many bits as are available. */
+ APInt::tcExtract(parts, dstPartsCount, src, semantics->precision, 0);
+ APInt::tcShiftLeft(parts, dstPartsCount, bits - semantics->precision);
+ truncatedBits = 0;
+ }
+ }
+
+ /* Step 2: work out any lost fraction, and increment the absolute
+ value if we would round away from zero. */
+ if (truncatedBits) {
+ lost_fraction = lostFractionThroughTruncation(src, partCount(),
+ truncatedBits);
+ if (lost_fraction != lfExactlyZero
+ && roundAwayFromZero(rounding_mode, lost_fraction, truncatedBits)) {
+ if (APInt::tcIncrement(parts, dstPartsCount))
+ return opInvalidOp; /* Overflow. */
+ }
+ } else {
+ lost_fraction = lfExactlyZero;
+ }
+
+ /* Step 3: check if we fit in the destination. */
+ unsigned int omsb = APInt::tcMSB(parts, dstPartsCount) + 1;
+
+ if (sign) {
+ if (!isSigned) {
+ /* Negative numbers cannot be represented as unsigned. */
+ if (omsb != 0)
+ return opInvalidOp;
+ } else {
+ /* It takes omsb bits to represent the unsigned integer value.
+ We lose a bit for the sign, but care is needed as the
+ maximally negative integer is a special case. */
+ if (omsb == width && APInt::tcLSB(parts, dstPartsCount) + 1 != omsb)
+ return opInvalidOp;
+
+ /* This case can happen because of rounding. */
+ if (omsb > width)
+ return opInvalidOp;
+ }
+
+ APInt::tcNegate (parts, dstPartsCount);
+ } else {
+ if (omsb >= width + !isSigned)
+ return opInvalidOp;
+ }
+
+ if (lost_fraction == lfExactlyZero) {
+ *isExact = true;
+ return opOK;
+ } else
+ return opInexact;
+}
+
+/* Same as convertToSignExtendedInteger, except we provide
+ deterministic values in case of an invalid operation exception,
+ namely zero for NaNs and the minimal or maximal value respectively
+ for underflow or overflow.
+ The *isExact output tells whether the result is exact, in the sense
+ that converting it back to the original floating point type produces
+ the original value. This is almost equivalent to result==opOK,
+ except for negative zeroes.
+*/
+APFloat::opStatus
+APFloat::convertToInteger(integerPart *parts, unsigned int width,
+ bool isSigned,
+ roundingMode rounding_mode, bool *isExact) const
+{
+ opStatus fs;
+
+ fs = convertToSignExtendedInteger(parts, width, isSigned, rounding_mode,
+ isExact);
+
+ if (fs == opInvalidOp) {
+ unsigned int bits, dstPartsCount;
+
+ dstPartsCount = partCountForBits(width);
+
+ if (category == fcNaN)
+ bits = 0;
+ else if (sign)
+ bits = isSigned;
+ else
+ bits = width - isSigned;
+
+ APInt::tcSetLeastSignificantBits(parts, dstPartsCount, bits);
+ if (sign && isSigned)
+ APInt::tcShiftLeft(parts, dstPartsCount, width - 1);
+ }
+
+ return fs;
+}
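+
+/* Illustrative sketch: converting 2.5 to a 32-bit signed integer with
+   round-toward-zero truncates to 2 and reports the dropped fraction:
+
+     APFloat f(2.5);
+     integerPart part;
+     bool isExact;
+     APFloat::opStatus st = f.convertToInteger(&part, 32, true,
+                                               APFloat::rmTowardZero,
+                                               &isExact);
+     // st == APFloat::opInexact, isExact == false, part == 2.
+*/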
+
+/* Convert an unsigned integer SRC to a floating point number,
+ rounding according to ROUNDING_MODE. The sign of the floating
+ point number is not modified. */
+APFloat::opStatus
+APFloat::convertFromUnsignedParts(const integerPart *src,
+ unsigned int srcCount,
+ roundingMode rounding_mode)
+{
+ unsigned int omsb, precision, dstCount;
+ integerPart *dst;
+ lostFraction lost_fraction;
+
+ assertArithmeticOK(*semantics);
+ category = fcNormal;
+ omsb = APInt::tcMSB(src, srcCount) + 1;
+ dst = significandParts();
+ dstCount = partCount();
+ precision = semantics->precision;
+
+  /* We want the most significant PRECISION bits of SRC. There may not
+ be that many; extract what we can. */
+ if (precision <= omsb) {
+ exponent = omsb - 1;
+ lost_fraction = lostFractionThroughTruncation(src, srcCount,
+ omsb - precision);
+ APInt::tcExtract(dst, dstCount, src, precision, omsb - precision);
+ } else {
+ exponent = precision - 1;
+ lost_fraction = lfExactlyZero;
+ APInt::tcExtract(dst, dstCount, src, omsb, 0);
+ }
+
+ return normalize(rounding_mode, lost_fraction);
+}
+
+APFloat::opStatus
+APFloat::convertFromAPInt(const APInt &Val,
+ bool isSigned,
+ roundingMode rounding_mode)
+{
+ unsigned int partCount = Val.getNumWords();
+ APInt api = Val;
+
+ sign = false;
+ if (isSigned && api.isNegative()) {
+ sign = true;
+ api = -api;
+ }
+
+ return convertFromUnsignedParts(api.getRawData(), partCount, rounding_mode);
+}
+
+/* Convert a two's complement integer SRC to a floating point number,
+ rounding according to ROUNDING_MODE. ISSIGNED is true if the
+ integer is signed, in which case it must be sign-extended. */
+APFloat::opStatus
+APFloat::convertFromSignExtendedInteger(const integerPart *src,
+ unsigned int srcCount,
+ bool isSigned,
+ roundingMode rounding_mode)
+{
+ opStatus status;
+
+ assertArithmeticOK(*semantics);
+ if (isSigned
+ && APInt::tcExtractBit(src, srcCount * integerPartWidth - 1)) {
+ integerPart *copy;
+
+ /* If we're signed and negative negate a copy. */
+ sign = true;
+ copy = new integerPart[srcCount];
+ APInt::tcAssign(copy, src, srcCount);
+ APInt::tcNegate(copy, srcCount);
+ status = convertFromUnsignedParts(copy, srcCount, rounding_mode);
+ delete [] copy;
+ } else {
+ sign = false;
+ status = convertFromUnsignedParts(src, srcCount, rounding_mode);
+ }
+
+ return status;
+}
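+
+/* Illustrative sketch: a one-part two's complement -5 takes the
+   negate-a-copy path above.  Assuming IEEEdouble semantics:
+
+     integerPart src = (integerPart) -5;   // sign-extended
+     APFloat f(APFloat::IEEEdouble, APFloat::fcZero, false);
+     f.convertFromSignExtendedInteger(&src, 1, true,
+                                      APFloat::rmNearestTiesToEven);
+     // f.convertToDouble() == -5.0
+*/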
+
+/* FIXME: should this just take a const APInt reference? */
+APFloat::opStatus
+APFloat::convertFromZeroExtendedInteger(const integerPart *parts,
+ unsigned int width, bool isSigned,
+ roundingMode rounding_mode)
+{
+ unsigned int partCount = partCountForBits(width);
+ APInt api = APInt(width, partCount, parts);
+
+ sign = false;
+ if(isSigned && APInt::tcExtractBit(parts, width - 1)) {
+ sign = true;
+ api = -api;
+ }
+
+ return convertFromUnsignedParts(api.getRawData(), partCount, rounding_mode);
+}
+
+APFloat::opStatus
+APFloat::convertFromHexadecimalString(const char *p,
+ roundingMode rounding_mode)
+{
+ lostFraction lost_fraction;
+ integerPart *significand;
+ unsigned int bitPos, partsCount;
+ const char *dot, *firstSignificantDigit;
+
+ zeroSignificand();
+ exponent = 0;
+ category = fcNormal;
+
+ significand = significandParts();
+ partsCount = partCount();
+ bitPos = partsCount * integerPartWidth;
+
+ /* Skip leading zeroes and any (hexa)decimal point. */
+ p = skipLeadingZeroesAndAnyDot(p, &dot);
+ firstSignificantDigit = p;
+
+ for(;;) {
+ integerPart hex_value;
+
+ if(*p == '.') {
+ assert(dot == 0);
+ dot = p++;
+ }
+
+ hex_value = hexDigitValue(*p);
+ if(hex_value == -1U) {
+ lost_fraction = lfExactlyZero;
+ break;
+ }
+
+ p++;
+
+ /* Store the number whilst 4-bit nibbles remain. */
+ if(bitPos) {
+ bitPos -= 4;
+ hex_value <<= bitPos % integerPartWidth;
+ significand[bitPos / integerPartWidth] |= hex_value;
+ } else {
+ lost_fraction = trailingHexadecimalFraction(p, hex_value);
+ while(hexDigitValue(*p) != -1U)
+ p++;
+ break;
+ }
+ }
+
+ /* Hex floats require an exponent but not a hexadecimal point. */
+ assert(*p == 'p' || *p == 'P');
+
+ /* Ignore the exponent if we are zero. */
+ if(p != firstSignificantDigit) {
+ int expAdjustment;
+
+ /* Implicit hexadecimal point? */
+ if(!dot)
+ dot = p;
+
+ /* Calculate the exponent adjustment implicit in the number of
+ significant digits. */
+ expAdjustment = static_cast<int>(dot - firstSignificantDigit);
+ if(expAdjustment < 0)
+ expAdjustment++;
+ expAdjustment = expAdjustment * 4 - 1;
+
+ /* Adjust for writing the significand starting at the most
+ significant nibble. */
+ expAdjustment += semantics->precision;
+ expAdjustment -= partsCount * integerPartWidth;
+
+ /* Adjust for the given exponent. */
+ exponent = totalExponent(p, expAdjustment);
+ }
+
+ return normalize(rounding_mode, lost_fraction);
+}
+
+APFloat::opStatus
+APFloat::roundSignificandWithExponent(const integerPart *decSigParts,
+ unsigned sigPartCount, int exp,
+ roundingMode rounding_mode)
+{
+ unsigned int parts, pow5PartCount;
+ fltSemantics calcSemantics = { 32767, -32767, 0, true };
+ integerPart pow5Parts[maxPowerOfFiveParts];
+ bool isNearest;
+
+ isNearest = (rounding_mode == rmNearestTiesToEven
+ || rounding_mode == rmNearestTiesToAway);
+
+ parts = partCountForBits(semantics->precision + 11);
+
+ /* Calculate pow(5, abs(exp)). */
+ pow5PartCount = powerOf5(pow5Parts, exp >= 0 ? exp: -exp);
+
+ for (;; parts *= 2) {
+ opStatus sigStatus, powStatus;
+ unsigned int excessPrecision, truncatedBits;
+
+ calcSemantics.precision = parts * integerPartWidth - 1;
+ excessPrecision = calcSemantics.precision - semantics->precision;
+ truncatedBits = excessPrecision;
+
+ APFloat decSig(calcSemantics, fcZero, sign);
+ APFloat pow5(calcSemantics, fcZero, false);
+
+ sigStatus = decSig.convertFromUnsignedParts(decSigParts, sigPartCount,
+ rmNearestTiesToEven);
+ powStatus = pow5.convertFromUnsignedParts(pow5Parts, pow5PartCount,
+ rmNearestTiesToEven);
+ /* Add exp, as 10^n = 5^n * 2^n. */
+ decSig.exponent += exp;
+
+ lostFraction calcLostFraction;
+ integerPart HUerr, HUdistance;
+ unsigned int powHUerr;
+
+ if (exp >= 0) {
+ /* multiplySignificand leaves the precision-th bit set to 1. */
+ calcLostFraction = decSig.multiplySignificand(pow5, NULL);
+ powHUerr = powStatus != opOK;
+ } else {
+ calcLostFraction = decSig.divideSignificand(pow5);
+ /* Denormal numbers have less precision. */
+ if (decSig.exponent < semantics->minExponent) {
+ excessPrecision += (semantics->minExponent - decSig.exponent);
+ truncatedBits = excessPrecision;
+ if (excessPrecision > calcSemantics.precision)
+ excessPrecision = calcSemantics.precision;
+ }
+ /* Extra half-ulp lost in reciprocal of exponent. */
+ powHUerr = (powStatus == opOK && calcLostFraction == lfExactlyZero) ? 0:2;
+ }
+
+ /* Both multiplySignificand and divideSignificand return the
+ result with the integer bit set. */
+ assert (APInt::tcExtractBit
+ (decSig.significandParts(), calcSemantics.precision - 1) == 1);
+
+ HUerr = HUerrBound(calcLostFraction != lfExactlyZero, sigStatus != opOK,
+ powHUerr);
+ HUdistance = 2 * ulpsFromBoundary(decSig.significandParts(),
+ excessPrecision, isNearest);
+
+ /* Are we guaranteed to round correctly if we truncate? */
+ if (HUdistance >= HUerr) {
+ APInt::tcExtract(significandParts(), partCount(), decSig.significandParts(),
+ calcSemantics.precision - excessPrecision,
+ excessPrecision);
+      /* Take the exponent of decSig.  If we tcExtract-ed fewer bits
+ above we must adjust our exponent to compensate for the
+ implicit right shift. */
+ exponent = (decSig.exponent + semantics->precision
+ - (calcSemantics.precision - excessPrecision));
+ calcLostFraction = lostFractionThroughTruncation(decSig.significandParts(),
+ decSig.partCount(),
+ truncatedBits);
+ return normalize(rounding_mode, calcLostFraction);
+ }
+ }
+}
+
+APFloat::opStatus
+APFloat::convertFromDecimalString(const char *p, roundingMode rounding_mode)
+{
+ decimalInfo D;
+ opStatus fs;
+
+ /* Scan the text. */
+ interpretDecimal(p, &D);
+
+ /* Handle the quick cases. First the case of no significant digits,
+ i.e. zero, and then exponents that are obviously too large or too
+ small. Writing L for log 10 / log 2, a number d.ddddd*10^exp
+ definitely overflows if
+
+ (exp - 1) * L >= maxExponent
+
+ and definitely underflows to zero where
+
+ (exp + 1) * L <= minExponent - precision
+
+ With integer arithmetic the tightest bounds for L are
+
+ 93/28 < L < 196/59 [ numerator <= 256 ]
+ 42039/12655 < L < 28738/8651 [ numerator <= 65536 ]
+ */
+
+ if (decDigitValue(*D.firstSigDigit) >= 10U) {
+ category = fcZero;
+ fs = opOK;
+ } else if ((D.normalizedExponent + 1) * 28738
+ <= 8651 * (semantics->minExponent - (int) semantics->precision)) {
+ /* Underflow to zero and round. */
+ zeroSignificand();
+ fs = normalize(rounding_mode, lfLessThanHalf);
+ } else if ((D.normalizedExponent - 1) * 42039
+ >= 12655 * semantics->maxExponent) {
+ /* Overflow and round. */
+ fs = handleOverflow(rounding_mode);
+ } else {
+ integerPart *decSignificand;
+ unsigned int partCount;
+
+    /* A tight upper bound on the number of bits required to hold an
+ N-digit decimal integer is N * 196 / 59. Allocate enough space
+ to hold the full significand, and an extra part required by
+ tcMultiplyPart. */
+ partCount = static_cast<unsigned int>(D.lastSigDigit - D.firstSigDigit) + 1;
+ partCount = partCountForBits(1 + 196 * partCount / 59);
+ decSignificand = new integerPart[partCount + 1];
+ partCount = 0;
+
+ /* Convert to binary efficiently - we do almost all multiplication
+       in an integerPart. When this would overflow we do a single
+ bignum multiplication, and then revert again to multiplication
+ in an integerPart. */
+ do {
+ integerPart decValue, val, multiplier;
+
+ val = 0;
+ multiplier = 1;
+
+ do {
+ if (*p == '.')
+ p++;
+
+ decValue = decDigitValue(*p++);
+ multiplier *= 10;
+ val = val * 10 + decValue;
+ /* The maximum number that can be multiplied by ten with any
+ digit added without overflowing an integerPart. */
+ } while (p <= D.lastSigDigit && multiplier <= (~ (integerPart) 0 - 9) / 10);
+
+ /* Multiply out the current part. */
+ APInt::tcMultiplyPart(decSignificand, decSignificand, multiplier, val,
+ partCount, partCount + 1, false);
+
+ /* If we used another part (likely but not guaranteed), increase
+ the count. */
+ if (decSignificand[partCount])
+ partCount++;
+ } while (p <= D.lastSigDigit);
+
+ category = fcNormal;
+ fs = roundSignificandWithExponent(decSignificand, partCount,
+ D.exponent, rounding_mode);
+
+ delete [] decSignificand;
+ }
+
+ return fs;
+}
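+
+/* A worked instance of the bounds above (assuming IEEEdouble, where
+   maxExponent is 1023): a decimal exponent exp certainly overflows once
+   (exp - 1) * 42039 >= 12655 * 1023, i.e. exp >= 309, consistent with
+   DBL_MAX being a little under 1.8e308.  Anything below the bound falls
+   through to the full conversion path. */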
+
+APFloat::opStatus
+APFloat::convertFromString(const char *p, roundingMode rounding_mode)
+{
+ assertArithmeticOK(*semantics);
+
+ /* Handle a leading minus sign. */
+ if(*p == '-')
+ sign = 1, p++;
+ else
+ sign = 0;
+
+ if(p[0] == '0' && (p[1] == 'x' || p[1] == 'X'))
+ return convertFromHexadecimalString(p + 2, rounding_mode);
+
+ return convertFromDecimalString(p, rounding_mode);
+}
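+
+/* Illustrative sketch: both decimal and hexadecimal forms route through
+   convertFromString.  Assuming IEEEdouble semantics:
+
+     APFloat f(APFloat::IEEEdouble, APFloat::fcZero, false);
+     f.convertFromString("1.25e2", APFloat::rmNearestTiesToEven);  // 125.0
+     f.convertFromString("0x1.8p1", APFloat::rmNearestTiesToEven); // 3.0
+*/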
+
+/* Write out a hexadecimal representation of the floating point value
+ to DST, which must be of sufficient size, in the C99 form
+ [-]0xh.hhhhp[+-]d. Return the number of characters written,
+ excluding the terminating NUL.
+
+ If UPPERCASE, the output is in upper case, otherwise in lower case.
+
+ HEXDIGITS digits appear altogether, rounding the value if
+ necessary. If HEXDIGITS is 0, the minimal precision to display the
+ number precisely is used instead. If nothing would appear after
+ the decimal point it is suppressed.
+
+ The decimal exponent is always printed and has at least one digit.
+ Zero values display an exponent of zero. Infinities and NaNs
+ appear as "infinity" or "nan" respectively.
+
+ The above rules are as specified by C99. There is ambiguity about
+ what the leading hexadecimal digit should be. This implementation
+ uses whatever is necessary so that the exponent is displayed as
+ stored. This implies the exponent will fall within the IEEE format
+ range, and the leading hexadecimal digit will be 0 (for denormals),
+ 1 (normal numbers) or 2 (normal numbers rounded-away-from-zero with
+ any other digits zero).
+*/
+unsigned int
+APFloat::convertToHexString(char *dst, unsigned int hexDigits,
+ bool upperCase, roundingMode rounding_mode) const
+{
+ char *p;
+
+ assertArithmeticOK(*semantics);
+
+ p = dst;
+ if (sign)
+ *dst++ = '-';
+
+ switch (category) {
+ case fcInfinity:
+ memcpy (dst, upperCase ? infinityU: infinityL, sizeof infinityU - 1);
+ dst += sizeof infinityL - 1;
+ break;
+
+ case fcNaN:
+ memcpy (dst, upperCase ? NaNU: NaNL, sizeof NaNU - 1);
+ dst += sizeof NaNU - 1;
+ break;
+
+ case fcZero:
+ *dst++ = '0';
+ *dst++ = upperCase ? 'X': 'x';
+ *dst++ = '0';
+ if (hexDigits > 1) {
+ *dst++ = '.';
+ memset (dst, '0', hexDigits - 1);
+ dst += hexDigits - 1;
+ }
+ *dst++ = upperCase ? 'P': 'p';
+ *dst++ = '0';
+ break;
+
+ case fcNormal:
+ dst = convertNormalToHexString (dst, hexDigits, upperCase, rounding_mode);
+ break;
+ }
+
+ *dst = 0;
+
+ return static_cast<unsigned int>(dst - p);
+}
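+
+/* Illustrative sketch: with hexDigits == 0 the minimal exact form is
+   produced, e.g. for 1.0 under IEEEdouble:
+
+     char buf[32];
+     APFloat one(1.0);
+     one.convertToHexString(buf, 0, false, APFloat::rmNearestTiesToEven);
+     // buf now holds "0x1p0" (cf. C99 printf %a, which prints "0x1p+0").
+*/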
+
+/* Does the hard work of outputting the correctly rounded hexadecimal
+ form of a normal floating point number with the specified number of
+ hexadecimal digits. If HEXDIGITS is zero the minimum number of
+ digits necessary to print the value precisely is output. */
+char *
+APFloat::convertNormalToHexString(char *dst, unsigned int hexDigits,
+ bool upperCase,
+ roundingMode rounding_mode) const
+{
+ unsigned int count, valueBits, shift, partsCount, outputDigits;
+ const char *hexDigitChars;
+ const integerPart *significand;
+ char *p;
+ bool roundUp;
+
+ *dst++ = '0';
+ *dst++ = upperCase ? 'X': 'x';
+
+ roundUp = false;
+ hexDigitChars = upperCase ? hexDigitsUpper: hexDigitsLower;
+
+ significand = significandParts();
+ partsCount = partCount();
+
+ /* +3 because the first digit only uses the single integer bit, so
+ we have 3 virtual zero most-significant-bits. */
+ valueBits = semantics->precision + 3;
+ shift = integerPartWidth - valueBits % integerPartWidth;
+
+ /* The natural number of digits required ignoring trailing
+ insignificant zeroes. */
+ outputDigits = (valueBits - significandLSB () + 3) / 4;
+
+ /* hexDigits of zero means use the required number for the
+ precision. Otherwise, see if we are truncating. If we are,
+ find out if we need to round away from zero. */
+ if (hexDigits) {
+ if (hexDigits < outputDigits) {
+ /* We are dropping non-zero bits, so need to check how to round.
+ "bits" is the number of dropped bits. */
+ unsigned int bits;
+ lostFraction fraction;
+
+ bits = valueBits - hexDigits * 4;
+ fraction = lostFractionThroughTruncation (significand, partsCount, bits);
+ roundUp = roundAwayFromZero(rounding_mode, fraction, bits);
+ }
+ outputDigits = hexDigits;
+ }
+
+ /* Write the digits consecutively, and start writing in the location
+ of the hexadecimal point. We move the most significant digit
+ left and add the hexadecimal point later. */
+ p = ++dst;
+
+ count = (valueBits + integerPartWidth - 1) / integerPartWidth;
+
+ while (outputDigits && count) {
+ integerPart part;
+
+ /* Put the most significant integerPartWidth bits in "part". */
+ if (--count == partsCount)
+ part = 0; /* An imaginary higher zero part. */
+ else
+ part = significand[count] << shift;
+
+ if (count && shift)
+ part |= significand[count - 1] >> (integerPartWidth - shift);
+
+ /* Convert as much of "part" to hexdigits as we can. */
+ unsigned int curDigits = integerPartWidth / 4;
+
+ if (curDigits > outputDigits)
+ curDigits = outputDigits;
+ dst += partAsHex (dst, part, curDigits, hexDigitChars);
+ outputDigits -= curDigits;
+ }
+
+ if (roundUp) {
+ char *q = dst;
+
+ /* Note that hexDigitChars has a trailing '0'. */
+ do {
+ q--;
+ *q = hexDigitChars[hexDigitValue (*q) + 1];
+ } while (*q == '0');
+ assert (q >= p);
+ } else {
+ /* Add trailing zeroes. */
+ memset (dst, '0', outputDigits);
+ dst += outputDigits;
+ }
+
+ /* Move the most significant digit to before the point, and if there
+ is something after the decimal point add it. This must come
+ after rounding above. */
+ p[-1] = p[0];
+  if (dst - 1 == p)
+ dst--;
+ else
+ p[0] = '.';
+
+ /* Finally output the exponent. */
+ *dst++ = upperCase ? 'P': 'p';
+
+ return writeSignedDecimal (dst, exponent);
+}
+
+// For good performance it is desirable for different APFloats
+// to produce different integers.
+uint32_t
+APFloat::getHashValue() const
+{
+ if (category==fcZero) return sign<<8 | semantics->precision ;
+ else if (category==fcInfinity) return sign<<9 | semantics->precision;
+ else if (category==fcNaN) return 1<<10 | semantics->precision;
+ else {
+ uint32_t hash = sign<<11 | semantics->precision | exponent<<12;
+ const integerPart* p = significandParts();
+ for (int i=partCount(); i>0; i--, p++)
+ hash ^= ((uint32_t)*p) ^ (uint32_t)((*p)>>32);
+ return hash;
+ }
+}
+
+// Conversion from APFloat to/from host float/double. It may eventually be
+// possible to eliminate these and have everybody deal with APFloats, but that
+// will take a while. This approach will not easily extend to long double.
+// Current implementation requires integerPartWidth==64, which is correct at
+// the moment but could be made more general.
+
+// Denormals have exponent minExponent in APFloat, but minExponent-1 in
+// the actual IEEE representations. We compensate for that here.
+
+APInt
+APFloat::convertF80LongDoubleAPFloatToAPInt() const
+{
+ assert(semantics == (const llvm::fltSemantics*)&x87DoubleExtended);
+ assert (partCount()==2);
+
+ uint64_t myexponent, mysignificand;
+
+ if (category==fcNormal) {
+ myexponent = exponent+16383; //bias
+ mysignificand = significandParts()[0];
+ if (myexponent==1 && !(mysignificand & 0x8000000000000000ULL))
+ myexponent = 0; // denormal
+ } else if (category==fcZero) {
+ myexponent = 0;
+ mysignificand = 0;
+ } else if (category==fcInfinity) {
+ myexponent = 0x7fff;
+ mysignificand = 0x8000000000000000ULL;
+ } else {
+ assert(category == fcNaN && "Unknown category");
+ myexponent = 0x7fff;
+ mysignificand = significandParts()[0];
+ }
+
+ uint64_t words[2];
+ words[0] = mysignificand;
+ words[1] = ((uint64_t)(sign & 1) << 15) |
+ (myexponent & 0x7fffLL);
+ return APInt(80, 2, words);
+}
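+
+/* Illustrative check of the packing above: 1.0 has biased exponent
+   0x3fff and an explicit integer bit, so words[0] becomes
+   0x8000000000000000ULL and words[1] becomes 0x3fff. */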
+
+APInt
+APFloat::convertPPCDoubleDoubleAPFloatToAPInt() const
+{
+ assert(semantics == (const llvm::fltSemantics*)&PPCDoubleDouble);
+ assert (partCount()==2);
+
+ uint64_t myexponent, mysignificand, myexponent2, mysignificand2;
+
+ if (category==fcNormal) {
+ myexponent = exponent + 1023; //bias
+ myexponent2 = exponent2 + 1023;
+ mysignificand = significandParts()[0];
+ mysignificand2 = significandParts()[1];
+ if (myexponent==1 && !(mysignificand & 0x10000000000000LL))
+ myexponent = 0; // denormal
+ if (myexponent2==1 && !(mysignificand2 & 0x10000000000000LL))
+ myexponent2 = 0; // denormal
+ } else if (category==fcZero) {
+ myexponent = 0;
+ mysignificand = 0;
+ myexponent2 = 0;
+ mysignificand2 = 0;
+ } else if (category==fcInfinity) {
+ myexponent = 0x7ff;
+ myexponent2 = 0;
+ mysignificand = 0;
+ mysignificand2 = 0;
+ } else {
+ assert(category == fcNaN && "Unknown category");
+ myexponent = 0x7ff;
+ mysignificand = significandParts()[0];
+ myexponent2 = exponent2;
+ mysignificand2 = significandParts()[1];
+ }
+
+ uint64_t words[2];
+ words[0] = ((uint64_t)(sign & 1) << 63) |
+ ((myexponent & 0x7ff) << 52) |
+ (mysignificand & 0xfffffffffffffLL);
+ words[1] = ((uint64_t)(sign2 & 1) << 63) |
+ ((myexponent2 & 0x7ff) << 52) |
+ (mysignificand2 & 0xfffffffffffffLL);
+ return APInt(128, 2, words);
+}
+
+APInt
+APFloat::convertDoubleAPFloatToAPInt() const
+{
+ assert(semantics == (const llvm::fltSemantics*)&IEEEdouble);
+ assert (partCount()==1);
+
+ uint64_t myexponent, mysignificand;
+
+ if (category==fcNormal) {
+ myexponent = exponent+1023; //bias
+ mysignificand = *significandParts();
+ if (myexponent==1 && !(mysignificand & 0x10000000000000LL))
+ myexponent = 0; // denormal
+ } else if (category==fcZero) {
+ myexponent = 0;
+ mysignificand = 0;
+ } else if (category==fcInfinity) {
+ myexponent = 0x7ff;
+ mysignificand = 0;
+ } else {
+ assert(category == fcNaN && "Unknown category!");
+ myexponent = 0x7ff;
+ mysignificand = *significandParts();
+ }
+
+ return APInt(64, ((((uint64_t)(sign & 1) << 63) |
+ ((myexponent & 0x7ff) << 52) |
+ (mysignificand & 0xfffffffffffffLL))));
+}
+
+APInt
+APFloat::convertFloatAPFloatToAPInt() const
+{
+ assert(semantics == (const llvm::fltSemantics*)&IEEEsingle);
+ assert (partCount()==1);
+
+ uint32_t myexponent, mysignificand;
+
+ if (category==fcNormal) {
+ myexponent = exponent+127; //bias
+ mysignificand = (uint32_t)*significandParts();
+ if (myexponent == 1 && !(mysignificand & 0x800000))
+ myexponent = 0; // denormal
+ } else if (category==fcZero) {
+ myexponent = 0;
+ mysignificand = 0;
+ } else if (category==fcInfinity) {
+ myexponent = 0xff;
+ mysignificand = 0;
+ } else {
+ assert(category == fcNaN && "Unknown category!");
+ myexponent = 0xff;
+ mysignificand = (uint32_t)*significandParts();
+ }
+
+ return APInt(32, (((sign&1) << 31) | ((myexponent&0xff) << 23) |
+ (mysignificand & 0x7fffff)));
+}
+
+// This function creates an APInt that is just a bit map of the floating
+// point constant as it would appear in memory. It is not a conversion,
+// and treating the result as a normal integer is unlikely to be useful.
+
+APInt
+APFloat::bitcastToAPInt() const
+{
+ if (semantics == (const llvm::fltSemantics*)&IEEEsingle)
+ return convertFloatAPFloatToAPInt();
+
+ if (semantics == (const llvm::fltSemantics*)&IEEEdouble)
+ return convertDoubleAPFloatToAPInt();
+
+ if (semantics == (const llvm::fltSemantics*)&PPCDoubleDouble)
+ return convertPPCDoubleDoubleAPFloatToAPInt();
+
+ assert(semantics == (const llvm::fltSemantics*)&x87DoubleExtended &&
+ "unknown format!");
+ return convertF80LongDoubleAPFloatToAPInt();
+}
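+
+/* Illustrative sketch: the result is the raw memory image of the value,
+   so for the single-precision 1.0f:
+
+     APFloat f(1.0f);
+     APInt bits = f.bitcastToAPInt();
+     // bits.getBitWidth() == 32 and bits.getRawData()[0] == 0x3f800000.
+*/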
+
+float
+APFloat::convertToFloat() const
+{
+ assert(semantics == (const llvm::fltSemantics*)&IEEEsingle);
+ APInt api = bitcastToAPInt();
+ return api.bitsToFloat();
+}
+
+double
+APFloat::convertToDouble() const
+{
+ assert(semantics == (const llvm::fltSemantics*)&IEEEdouble);
+ APInt api = bitcastToAPInt();
+ return api.bitsToDouble();
+}
+
+/// Integer bit is explicit in this format. Intel hardware (387 and later)
+/// does not support these bit patterns:
+/// exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity")
+/// exponent = all 1's, integer bit 0, significand nonzero ("pseudoNaN")
+/// exponent = 0, integer bit 1 ("pseudodenormal")
+/// exponent!=0 nor all 1's, integer bit 0 ("unnormal")
+/// At the moment, the first two are treated as NaNs, the second two as Normal.
+void
+APFloat::initFromF80LongDoubleAPInt(const APInt &api)
+{
+ assert(api.getBitWidth()==80);
+ uint64_t i1 = api.getRawData()[0];
+ uint64_t i2 = api.getRawData()[1];
+ uint64_t myexponent = (i2 & 0x7fff);
+ uint64_t mysignificand = i1;
+
+ initialize(&APFloat::x87DoubleExtended);
+ assert(partCount()==2);
+
+ sign = static_cast<unsigned int>(i2>>15);
+ if (myexponent==0 && mysignificand==0) {
+ // exponent, significand meaningless
+ category = fcZero;
+ } else if (myexponent==0x7fff && mysignificand==0x8000000000000000ULL) {
+ // exponent, significand meaningless
+ category = fcInfinity;
+ } else if (myexponent==0x7fff && mysignificand!=0x8000000000000000ULL) {
+ // exponent meaningless
+ category = fcNaN;
+ significandParts()[0] = mysignificand;
+ significandParts()[1] = 0;
+ } else {
+ category = fcNormal;
+ exponent = myexponent - 16383;
+ significandParts()[0] = mysignificand;
+ significandParts()[1] = 0;
+ if (myexponent==0) // denormal
+ exponent = -16382;
+ }
+}
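+
+/* Illustrative check: the 80-bit pattern with exponent 0x3fff, integer
+   bit clear and significand 0x4000000000000000ULL is a 387 "unnormal";
+   the code above files it under fcNormal rather than rejecting it. */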
+
+void
+APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api)
+{
+ assert(api.getBitWidth()==128);
+ uint64_t i1 = api.getRawData()[0];
+ uint64_t i2 = api.getRawData()[1];
+ uint64_t myexponent = (i1 >> 52) & 0x7ff;
+ uint64_t mysignificand = i1 & 0xfffffffffffffLL;
+ uint64_t myexponent2 = (i2 >> 52) & 0x7ff;
+ uint64_t mysignificand2 = i2 & 0xfffffffffffffLL;
+
+ initialize(&APFloat::PPCDoubleDouble);
+ assert(partCount()==2);
+
+ sign = static_cast<unsigned int>(i1>>63);
+ sign2 = static_cast<unsigned int>(i2>>63);
+ if (myexponent==0 && mysignificand==0) {
+ // exponent, significand meaningless
+ // exponent2 and significand2 are required to be 0; we don't check
+ category = fcZero;
+ } else if (myexponent==0x7ff && mysignificand==0) {
+ // exponent, significand meaningless
+ // exponent2 and significand2 are required to be 0; we don't check
+ category = fcInfinity;
+ } else if (myexponent==0x7ff && mysignificand!=0) {
+ // exponent meaningless. So is the whole second word, but keep it
+ // for determinism.
+ category = fcNaN;
+ exponent2 = myexponent2;
+ significandParts()[0] = mysignificand;
+ significandParts()[1] = mysignificand2;
+ } else {
+ category = fcNormal;
+ // Note there is no category2; the second word is treated as if it is
+ // fcNormal, although it might be something else considered by itself.
+ exponent = myexponent - 1023;
+ exponent2 = myexponent2 - 1023;
+ significandParts()[0] = mysignificand;
+ significandParts()[1] = mysignificand2;
+ if (myexponent==0) // denormal
+ exponent = -1022;
+ else
+ significandParts()[0] |= 0x10000000000000LL; // integer bit
+ if (myexponent2==0)
+ exponent2 = -1022;
+ else
+ significandParts()[1] |= 0x10000000000000LL; // integer bit
+ }
+}
+
+void
+APFloat::initFromDoubleAPInt(const APInt &api)
+{
+ assert(api.getBitWidth()==64);
+ uint64_t i = *api.getRawData();
+ uint64_t myexponent = (i >> 52) & 0x7ff;
+ uint64_t mysignificand = i & 0xfffffffffffffLL;
+
+ initialize(&APFloat::IEEEdouble);
+ assert(partCount()==1);
+
+ sign = static_cast<unsigned int>(i>>63);
+ if (myexponent==0 && mysignificand==0) {
+ // exponent, significand meaningless
+ category = fcZero;
+ } else if (myexponent==0x7ff && mysignificand==0) {
+ // exponent, significand meaningless
+ category = fcInfinity;
+ } else if (myexponent==0x7ff && mysignificand!=0) {
+ // exponent meaningless
+ category = fcNaN;
+ *significandParts() = mysignificand;
+ } else {
+ category = fcNormal;
+ exponent = myexponent - 1023;
+ *significandParts() = mysignificand;
+ if (myexponent==0) // denormal
+ exponent = -1022;
+ else
+ *significandParts() |= 0x10000000000000LL; // integer bit
+ }
+}
+
+void
+APFloat::initFromFloatAPInt(const APInt & api)
+{
+ assert(api.getBitWidth()==32);
+ uint32_t i = (uint32_t)*api.getRawData();
+ uint32_t myexponent = (i >> 23) & 0xff;
+ uint32_t mysignificand = i & 0x7fffff;
+
+ initialize(&APFloat::IEEEsingle);
+ assert(partCount()==1);
+
+ sign = i >> 31;
+ if (myexponent==0 && mysignificand==0) {
+ // exponent, significand meaningless
+ category = fcZero;
+ } else if (myexponent==0xff && mysignificand==0) {
+ // exponent, significand meaningless
+ category = fcInfinity;
+ } else if (myexponent==0xff && mysignificand!=0) {
+ // sign, exponent, significand meaningless
+ category = fcNaN;
+ *significandParts() = mysignificand;
+ } else {
+ category = fcNormal;
+ exponent = myexponent - 127; //bias
+ *significandParts() = mysignificand;
+ if (myexponent==0) // denormal
+ exponent = -126;
+ else
+ *significandParts() |= 0x800000; // integer bit
+ }
+}
+
+/// Treat api as containing the bits of a floating point number. Currently
+/// we infer the floating point type from the size of the APInt. The
+/// isIEEE argument distinguishes between PPC128 and IEEE128 (not meaningful
+/// when the size is anything else).
+void
+APFloat::initFromAPInt(const APInt& api, bool isIEEE)
+{
+ if (api.getBitWidth() == 32)
+ return initFromFloatAPInt(api);
+ else if (api.getBitWidth()==64)
+ return initFromDoubleAPInt(api);
+ else if (api.getBitWidth()==80)
+ return initFromF80LongDoubleAPInt(api);
+ else if (api.getBitWidth()==128 && !isIEEE)
+ return initFromPPCDoubleDoubleAPInt(api);
+ else
+ assert(0);
+}
+
+APFloat::APFloat(const APInt& api, bool isIEEE)
+{
+ initFromAPInt(api, isIEEE);
+}
+
+APFloat::APFloat(float f)
+{
+ APInt api = APInt(32, 0);
+ initFromAPInt(api.floatToBits(f));
+}
+
+APFloat::APFloat(double d)
+{
+ APInt api = APInt(64, 0);
+ initFromAPInt(api.doubleToBits(d));
+}
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
new file mode 100644
index 0000000..73bf774
--- /dev/null
+++ b/lib/Support/APInt.cpp
@@ -0,0 +1,2816 @@
+//===-- APInt.cpp - Implement APInt class ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a class to represent arbitrary precision integer
+// constant values and provide a variety of arithmetic operations on them.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "apint"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cmath>
+#include <limits>
+#include <cstring>
+#include <cstdlib>
+using namespace llvm;
+
+/// A utility function for allocating memory, checking for allocation failures,
+/// and ensuring the contents are zeroed.
+inline static uint64_t* getClearedMemory(unsigned numWords) {
+ uint64_t * result = new uint64_t[numWords];
+ assert(result && "APInt memory allocation fails!");
+ memset(result, 0, numWords * sizeof(uint64_t));
+ return result;
+}
+
+/// A utility function for allocating memory and checking for allocation
+/// failure. The content is not zeroed.
+inline static uint64_t* getMemory(unsigned numWords) {
+ uint64_t * result = new uint64_t[numWords];
+ assert(result && "APInt memory allocation fails!");
+ return result;
+}
+
+void APInt::initSlowCase(unsigned numBits, uint64_t val, bool isSigned) {
+ pVal = getClearedMemory(getNumWords());
+ pVal[0] = val;
+ if (isSigned && int64_t(val) < 0)
+ for (unsigned i = 1; i < getNumWords(); ++i)
+ pVal[i] = -1ULL;
+}
+
+void APInt::initSlowCase(const APInt& that) {
+ pVal = getMemory(getNumWords());
+ memcpy(pVal, that.pVal, getNumWords() * APINT_WORD_SIZE);
+}
+
+
+APInt::APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[])
+ : BitWidth(numBits), VAL(0) {
+ assert(BitWidth && "bitwidth too small");
+ assert(bigVal && "Null pointer detected!");
+ if (isSingleWord())
+ VAL = bigVal[0];
+ else {
+ // Get memory, cleared to 0
+ pVal = getClearedMemory(getNumWords());
+ // Calculate the number of words to copy
+ unsigned words = std::min<unsigned>(numWords, getNumWords());
+ // Copy the words from bigVal to pVal
+ memcpy(pVal, bigVal, words * APINT_WORD_SIZE);
+ }
+ // Make sure unused high bits are cleared
+ clearUnusedBits();
+}
+
+APInt::APInt(unsigned numbits, const char StrStart[], unsigned slen,
+ uint8_t radix)
+ : BitWidth(numbits), VAL(0) {
+ assert(BitWidth && "bitwidth too small");
+ fromString(numbits, StrStart, slen, radix);
+}
+
+APInt& APInt::AssignSlowCase(const APInt& RHS) {
+ // Don't do anything for X = X
+ if (this == &RHS)
+ return *this;
+
+ if (BitWidth == RHS.getBitWidth()) {
+ // assume same bit-width single-word case is already handled
+ assert(!isSingleWord());
+ memcpy(pVal, RHS.pVal, getNumWords() * APINT_WORD_SIZE);
+ return *this;
+ }
+
+ if (isSingleWord()) {
+ // assume case where both are single words is already handled
+ assert(!RHS.isSingleWord());
+ VAL = 0;
+ pVal = getMemory(RHS.getNumWords());
+ memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
+ } else if (getNumWords() == RHS.getNumWords())
+ memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
+ else if (RHS.isSingleWord()) {
+ delete [] pVal;
+ VAL = RHS.VAL;
+ } else {
+ delete [] pVal;
+ pVal = getMemory(RHS.getNumWords());
+ memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
+ }
+ BitWidth = RHS.BitWidth;
+ return clearUnusedBits();
+}
+
+APInt& APInt::operator=(uint64_t RHS) {
+ if (isSingleWord())
+ VAL = RHS;
+ else {
+ pVal[0] = RHS;
+ memset(pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
+ }
+ return clearUnusedBits();
+}
+
+/// Profile - This method 'profiles' an APInt for use with FoldingSet.
+void APInt::Profile(FoldingSetNodeID& ID) const {
+ ID.AddInteger(BitWidth);
+
+ if (isSingleWord()) {
+ ID.AddInteger(VAL);
+ return;
+ }
+
+ unsigned NumWords = getNumWords();
+ for (unsigned i = 0; i < NumWords; ++i)
+ ID.AddInteger(pVal[i]);
+}
+
+/// add_1 - This function adds a single "digit" integer, y, to the multiple
+/// "digit" integer array, x[]. x[] is modified to reflect the addition and
+/// 1 is returned if there is a carry out, otherwise 0 is returned.
+/// @returns the carry of the addition.
+static bool add_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
+ for (unsigned i = 0; i < len; ++i) {
+ dest[i] = y + x[i];
+ if (dest[i] < y)
+ y = 1; // Carry one to next digit.
+ else {
+ y = 0; // No need to carry so exit early
+ break;
+ }
+ }
+ return y;
+}
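+
+/* Illustrative sketch of the carry propagation: adding 1 to the
+   two-word value { ~0ULL, 0 } ripples into the second word:
+
+     uint64_t x[2] = { ~0ULL, 0 };
+     bool carry = add_1(x, x, 2, 1);
+     // x == { 0, 1 } and carry == false (nothing out of the top word).
+*/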
+
+/// @brief Prefix increment operator. Increments the APInt by one.
+APInt& APInt::operator++() {
+ if (isSingleWord())
+ ++VAL;
+ else
+ add_1(pVal, pVal, getNumWords(), 1);
+ return clearUnusedBits();
+}
+
+/// sub_1 - This function subtracts a single "digit" (64-bit word), y, from
+/// the multi-digit integer array, x[], propagating the borrowed 1 value until
+/// no further borrowing is needed or it runs out of "digits" in x. The result
+/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted.
+/// In other words, if y > x then this function returns 1, otherwise 0.
+/// @returns the borrow out of the subtraction
+static bool sub_1(uint64_t x[], unsigned len, uint64_t y) {
+ for (unsigned i = 0; i < len; ++i) {
+ uint64_t X = x[i];
+ x[i] -= y;
+ if (y > X)
+ y = 1; // We have to "borrow 1" from next "digit"
+ else {
+ y = 0; // No need to borrow
+ break; // Remaining digits are unchanged so exit early
+ }
+ }
+ return bool(y);
+}
+
+/// @brief Prefix decrement operator. Decrements the APInt by one.
+APInt& APInt::operator--() {
+ if (isSingleWord())
+ --VAL;
+ else
+ sub_1(pVal, getNumWords(), 1);
+ return clearUnusedBits();
+}
+
+/// add - This function adds the integer array x to the integer array y and
+/// places the result in dest.
+/// @returns the carry out from the addition
+/// @brief General addition of 64-bit integer arrays
+static bool add(uint64_t *dest, const uint64_t *x, const uint64_t *y,
+ unsigned len) {
+ bool carry = false;
+ for (unsigned i = 0; i< len; ++i) {
+ uint64_t limit = std::min(x[i],y[i]); // must come first in case dest == x
+ dest[i] = x[i] + y[i] + carry;
+ carry = dest[i] < limit || (carry && dest[i] == limit);
+ }
+ return carry;
+}
+
+/// Adds the RHS APInt to this APInt.
+/// @returns this, after addition of RHS.
+/// @brief Addition assignment operator.
+APInt& APInt::operator+=(const APInt& RHS) {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord())
+ VAL += RHS.VAL;
+ else {
+ add(pVal, pVal, RHS.pVal, getNumWords());
+ }
+ return clearUnusedBits();
+}
+
+/// Subtracts the integer array y from the integer array x
+/// @returns the borrow out.
+/// @brief Generalized subtraction of 64-bit integer arrays.
+static bool sub(uint64_t *dest, const uint64_t *x, const uint64_t *y,
+ unsigned len) {
+ bool borrow = false;
+ for (unsigned i = 0; i < len; ++i) {
+ uint64_t x_tmp = borrow ? x[i] - 1 : x[i];
+ borrow = y[i] > x_tmp || (borrow && x[i] == 0);
+ dest[i] = x_tmp - y[i];
+ }
+ return borrow;
+}
+
+/// Subtracts the RHS APInt from this APInt
+/// @returns this, after subtraction
+/// @brief Subtraction assignment operator.
+APInt& APInt::operator-=(const APInt& RHS) {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord())
+ VAL -= RHS.VAL;
+ else
+ sub(pVal, pVal, RHS.pVal, getNumWords());
+ return clearUnusedBits();
+}
+
+/// Multiplies an integer array, x, by a uint64_t integer and places the result
+/// into dest.
+/// @returns the carry out of the multiplication.
+/// @brief Multiply a multi-digit APInt by a single digit (64-bit) integer.
+static uint64_t mul_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
+ // Split y into high 32-bit part (hy) and low 32-bit part (ly)
+ uint64_t ly = y & 0xffffffffULL, hy = y >> 32;
+ uint64_t carry = 0;
+
+ // For each digit of x.
+ for (unsigned i = 0; i < len; ++i) {
+ // Split x into high and low words
+ uint64_t lx = x[i] & 0xffffffffULL;
+ uint64_t hx = x[i] >> 32;
+ // hasCarry - A flag to indicate if there is a carry to the next digit.
+ // hasCarry == 0, no carry
+ // hasCarry == 1, has carry
+ // hasCarry == 2, no carry and the calculation result == 0.
+ uint8_t hasCarry = 0;
+ dest[i] = carry + lx * ly;
+ // Determine if the add above introduces carry.
+ hasCarry = (dest[i] < carry) ? 1 : 0;
+ carry = hx * ly + (dest[i] >> 32) + (hasCarry ? (1ULL << 32) : 0);
+ // The upper limit of carry can be (2^32 - 1)(2^32 - 1) +
+ // (2^32 - 1) + 2^32 = 2^64.
+ hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
+
+ carry += (lx * hy) & 0xffffffffULL;
+ dest[i] = (carry << 32) | (dest[i] & 0xffffffffULL);
+ carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) +
+ (carry >> 32) + ((lx * hy) >> 32) + hx * hy;
+ }
+ return carry;
+}
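+
+/* The decomposition used above, written out: with B = 2^32, each 64-bit
+   digit splits as x[i] = hx*B + lx and y = hy*B + ly, so
+
+     x[i] * y = hx*hy*B^2 + (hx*ly + lx*hy)*B + lx*ly
+
+   and the partial products are accumulated 32 bits at a time while the
+   carries flow into the next 64-bit digit. */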
+
+/// Multiplies integer array x by integer array y and stores the result into
+/// the integer array dest. Note that dest's size must be >= xlen + ylen.
+/// @brief Generalized multiplication of integer arrays.
+static void mul(uint64_t dest[], uint64_t x[], unsigned xlen, uint64_t y[],
+ unsigned ylen) {
+ dest[xlen] = mul_1(dest, x, xlen, y[0]);
+ for (unsigned i = 1; i < ylen; ++i) {
+ uint64_t ly = y[i] & 0xffffffffULL, hy = y[i] >> 32;
+ uint64_t carry = 0, lx = 0, hx = 0;
+ for (unsigned j = 0; j < xlen; ++j) {
+ lx = x[j] & 0xffffffffULL;
+ hx = x[j] >> 32;
+      // hasCarry - A flag to indicate if there is a carry to the next digit.
+ // hasCarry == 0, no carry
+ // hasCarry == 1, has carry
+ // hasCarry == 2, no carry and the calculation result == 0.
+ uint8_t hasCarry = 0;
+ uint64_t resul = carry + lx * ly;
+ hasCarry = (resul < carry) ? 1 : 0;
+ carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + (resul >> 32);
+ hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
+
+ carry += (lx * hy) & 0xffffffffULL;
+ resul = (carry << 32) | (resul & 0xffffffffULL);
+ dest[i+j] += resul;
+ carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0)+
+ (carry >> 32) + (dest[i+j] < resul ? 1 : 0) +
+ ((lx * hy) >> 32) + hx * hy;
+ }
+ dest[i+xlen] = carry;
+ }
+}
+
+APInt& APInt::operator*=(const APInt& RHS) {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord()) {
+ VAL *= RHS.VAL;
+ clearUnusedBits();
+ return *this;
+ }
+
+ // Get some bit facts about LHS and check for zero
+ unsigned lhsBits = getActiveBits();
+ unsigned lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1;
+ if (!lhsWords)
+ // 0 * X ===> 0
+ return *this;
+
+ // Get some bit facts about RHS and check for zero
+ unsigned rhsBits = RHS.getActiveBits();
+ unsigned rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1;
+ if (!rhsWords) {
+ // X * 0 ===> 0
+ clear();
+ return *this;
+ }
+
+ // Allocate space for the result
+ unsigned destWords = rhsWords + lhsWords;
+ uint64_t *dest = getMemory(destWords);
+
+ // Perform the long multiply
+ mul(dest, pVal, lhsWords, RHS.pVal, rhsWords);
+
+ // Copy result back into *this
+ clear();
+ unsigned wordsToCopy = destWords >= getNumWords() ? getNumWords() : destWords;
+ memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE);
+
+ // delete dest array and return
+ delete[] dest;
+ return *this;
+}
+
+APInt& APInt::operator&=(const APInt& RHS) {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord()) {
+ VAL &= RHS.VAL;
+ return *this;
+ }
+ unsigned numWords = getNumWords();
+ for (unsigned i = 0; i < numWords; ++i)
+ pVal[i] &= RHS.pVal[i];
+ return *this;
+}
+
+APInt& APInt::operator|=(const APInt& RHS) {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord()) {
+ VAL |= RHS.VAL;
+ return *this;
+ }
+ unsigned numWords = getNumWords();
+ for (unsigned i = 0; i < numWords; ++i)
+ pVal[i] |= RHS.pVal[i];
+ return *this;
+}
+
+APInt& APInt::operator^=(const APInt& RHS) {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord()) {
+ VAL ^= RHS.VAL;
+ this->clearUnusedBits();
+ return *this;
+ }
+ unsigned numWords = getNumWords();
+ for (unsigned i = 0; i < numWords; ++i)
+ pVal[i] ^= RHS.pVal[i];
+ return clearUnusedBits();
+}
+
+APInt APInt::AndSlowCase(const APInt& RHS) const {
+ unsigned numWords = getNumWords();
+ uint64_t* val = getMemory(numWords);
+ for (unsigned i = 0; i < numWords; ++i)
+ val[i] = pVal[i] & RHS.pVal[i];
+ return APInt(val, getBitWidth());
+}
+
+APInt APInt::OrSlowCase(const APInt& RHS) const {
+ unsigned numWords = getNumWords();
+ uint64_t *val = getMemory(numWords);
+ for (unsigned i = 0; i < numWords; ++i)
+ val[i] = pVal[i] | RHS.pVal[i];
+ return APInt(val, getBitWidth());
+}
+
+APInt APInt::XorSlowCase(const APInt& RHS) const {
+ unsigned numWords = getNumWords();
+ uint64_t *val = getMemory(numWords);
+ for (unsigned i = 0; i < numWords; ++i)
+ val[i] = pVal[i] ^ RHS.pVal[i];
+
+  // Clear the unused high bits in case they got set.
+ return APInt(val, getBitWidth()).clearUnusedBits();
+}
+
+bool APInt::operator !() const {
+ if (isSingleWord())
+ return !VAL;
+
+ for (unsigned i = 0; i < getNumWords(); ++i)
+ if (pVal[i])
+ return false;
+ return true;
+}
+
+APInt APInt::operator*(const APInt& RHS) const {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord())
+ return APInt(BitWidth, VAL * RHS.VAL);
+ APInt Result(*this);
+ Result *= RHS;
+ return Result.clearUnusedBits();
+}
+
+APInt APInt::operator+(const APInt& RHS) const {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord())
+ return APInt(BitWidth, VAL + RHS.VAL);
+ APInt Result(BitWidth, 0);
+ add(Result.pVal, this->pVal, RHS.pVal, getNumWords());
+ return Result.clearUnusedBits();
+}
+
+APInt APInt::operator-(const APInt& RHS) const {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord())
+ return APInt(BitWidth, VAL - RHS.VAL);
+ APInt Result(BitWidth, 0);
+ sub(Result.pVal, this->pVal, RHS.pVal, getNumWords());
+ return Result.clearUnusedBits();
+}
+
+bool APInt::operator[](unsigned bitPosition) const {
+ return (maskBit(bitPosition) &
+ (isSingleWord() ? VAL : pVal[whichWord(bitPosition)])) != 0;
+}
+
+bool APInt::EqualSlowCase(const APInt& RHS) const {
+ // Get some facts about the number of bits used in the two operands.
+ unsigned n1 = getActiveBits();
+ unsigned n2 = RHS.getActiveBits();
+
+ // If the number of bits isn't the same, they aren't equal
+ if (n1 != n2)
+ return false;
+
+ // If the number of bits fits in a word, we only need to compare the low word.
+ if (n1 <= APINT_BITS_PER_WORD)
+ return pVal[0] == RHS.pVal[0];
+
+ // Otherwise, compare everything
+ for (int i = whichWord(n1 - 1); i >= 0; --i)
+ if (pVal[i] != RHS.pVal[i])
+ return false;
+ return true;
+}
+
+bool APInt::EqualSlowCase(uint64_t Val) const {
+ unsigned n = getActiveBits();
+ if (n <= APINT_BITS_PER_WORD)
+ return pVal[0] == Val;
+ else
+ return false;
+}
+
+bool APInt::ult(const APInt& RHS) const {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison");
+ if (isSingleWord())
+ return VAL < RHS.VAL;
+
+ // Get active bit length of both operands
+ unsigned n1 = getActiveBits();
+ unsigned n2 = RHS.getActiveBits();
+
+ // If magnitude of LHS is less than RHS, return true.
+ if (n1 < n2)
+ return true;
+
+  // If magnitude of RHS is greater than LHS, return false.
+ if (n2 < n1)
+ return false;
+
+  // If they both fit in a word, just compare the low-order word
+ if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
+ return pVal[0] < RHS.pVal[0];
+
+ // Otherwise, compare all words
+ unsigned topWord = whichWord(std::max(n1,n2)-1);
+ for (int i = topWord; i >= 0; --i) {
+ if (pVal[i] > RHS.pVal[i])
+ return false;
+ if (pVal[i] < RHS.pVal[i])
+ return true;
+ }
+ return false;
+}
+
+bool APInt::slt(const APInt& RHS) const {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison");
+ if (isSingleWord()) {
+ int64_t lhsSext = (int64_t(VAL) << (64-BitWidth)) >> (64-BitWidth);
+ int64_t rhsSext = (int64_t(RHS.VAL) << (64-BitWidth)) >> (64-BitWidth);
+ return lhsSext < rhsSext;
+ }
+
+ APInt lhs(*this);
+ APInt rhs(RHS);
+ bool lhsNeg = isNegative();
+ bool rhsNeg = rhs.isNegative();
+ if (lhsNeg) {
+ // Sign bit is set so perform two's complement to make it positive
+ lhs.flip();
+ lhs++;
+ }
+ if (rhsNeg) {
+ // Sign bit is set so perform two's complement to make it positive
+ rhs.flip();
+ rhs++;
+ }
+
+ // Now we have unsigned values to compare so do the comparison if necessary
+ // based on the negativeness of the values.
+ if (lhsNeg)
+ if (rhsNeg)
+ return lhs.ugt(rhs);
+ else
+ return true;
+ else if (rhsNeg)
+ return false;
+ else
+ return lhs.ult(rhs);
+}
+
+APInt& APInt::set(unsigned bitPosition) {
+ if (isSingleWord())
+ VAL |= maskBit(bitPosition);
+ else
+ pVal[whichWord(bitPosition)] |= maskBit(bitPosition);
+ return *this;
+}
+
+/// Set the given bit to 0 whose position is given as "bitPosition".
+/// @brief Set a given bit to 0.
+APInt& APInt::clear(unsigned bitPosition) {
+ if (isSingleWord())
+ VAL &= ~maskBit(bitPosition);
+ else
+ pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition);
+ return *this;
+}
+
+/// @brief Toggle every bit to its opposite value.
+
+/// Toggle a given bit to its opposite value whose position is given
+/// as "bitPosition".
+/// @brief Toggles a given bit to its opposite value.
+APInt& APInt::flip(unsigned bitPosition) {
+ assert(bitPosition < BitWidth && "Out of the bit-width range!");
+ if ((*this)[bitPosition]) clear(bitPosition);
+ else set(bitPosition);
+ return *this;
+}
+
+unsigned APInt::getBitsNeeded(const char* str, unsigned slen, uint8_t radix) {
+ assert(str != 0 && "Invalid value string");
+ assert(slen > 0 && "Invalid string length");
+
+  // Each computation below needs to know if it's negative
+ unsigned isNegative = str[0] == '-';
+ if (isNegative) {
+ slen--;
+ str++;
+ }
+  // For power-of-two radixes, the number of bits required is accurately
+  // and easily computed
+ if (radix == 2)
+ return slen + isNegative;
+ if (radix == 8)
+ return slen * 3 + isNegative;
+ if (radix == 16)
+ return slen * 4 + isNegative;
+
+ // Otherwise it must be radix == 10, the hard case
+ assert(radix == 10 && "Invalid radix");
+
+ // This is grossly inefficient but accurate. We could probably do something
+ // with a computation of roughly slen*64/20 and then adjust by the value of
+ // the first few digits. But, I'm not sure how accurate that could be.
+
+ // Compute a sufficient number of bits that is always large enough but might
+ // be too large. This avoids the assertion in the constructor.
+ unsigned sufficient = slen*64/18;
+
+ // Convert to the actual binary value.
+ APInt tmp(sufficient, str, slen, radix);
+
+ // Compute how many bits are required.
+ return isNegative + tmp.logBase2() + 1;
+}
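+
+/* Illustrative sketch: for power-of-two radices the estimate comes
+   straight from the digit count, while radix 10 measures the converted
+   value:
+
+     APInt::getBitsNeeded("ff", 2, 16);   // 8 (2 hex digits * 4 bits)
+     APInt::getBitsNeeded("255", 3, 10);  // 8 (logBase2(255) == 7, + 1)
+*/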
+
+// From http://www.burtleburtle.net, by Bob Jenkins.
+// When targeting x86, both GCC and LLVM seem to recognize this as a
+// rotate instruction.
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+// From http://www.burtleburtle.net, by Bob Jenkins.
+#define mix(a,b,c) \
+ { \
+ a -= c; a ^= rot(c, 4); c += b; \
+ b -= a; b ^= rot(a, 6); a += c; \
+ c -= b; c ^= rot(b, 8); b += a; \
+ a -= c; a ^= rot(c,16); c += b; \
+ b -= a; b ^= rot(a,19); a += c; \
+ c -= b; c ^= rot(b, 4); b += a; \
+ }
+
+// From http://www.burtleburtle.net, by Bob Jenkins.
+#define final(a,b,c) \
+ { \
+ c ^= b; c -= rot(b,14); \
+ a ^= c; a -= rot(c,11); \
+ b ^= a; b -= rot(a,25); \
+ c ^= b; c -= rot(b,16); \
+ a ^= c; a -= rot(c,4); \
+ b ^= a; b -= rot(a,14); \
+ c ^= b; c -= rot(b,24); \
+ }
+
+// hashword() was adapted from http://www.burtleburtle.net, by Bob
+// Jenkins. k is a pointer to an array of uint32_t values; length is
+// the length of the key, in 32-bit chunks. This version only handles
+// keys that are a multiple of 32 bits in size.
+static inline uint32_t hashword(const uint64_t *k64, size_t length)
+{
+ const uint32_t *k = reinterpret_cast<const uint32_t *>(k64);
+ uint32_t a,b,c;
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + (((uint32_t)length)<<2);
+
+ /*------------------------------------------------- handle most of the key */
+ while (length > 3)
+ {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ mix(a,b,c);
+ length -= 3;
+ k += 3;
+ }
+
+ /*------------------------------------------- handle the last 3 uint32_t's */
+ switch (length) { /* all the case statements fall through */
+ case 3 : c+=k[2];
+ case 2 : b+=k[1];
+ case 1 : a+=k[0];
+ final(a,b,c);
+ case 0: /* case 0: nothing left to add */
+ break;
+ }
+ /*------------------------------------------------------ report the result */
+ return c;
+}
+
+// hashword8() was adapted from http://www.burtleburtle.net, by Bob
+// Jenkins. This computes a 32-bit hash from one 64-bit word. When
+// targeting x86 (32 or 64 bit), both LLVM and GCC compile this
+// function into about 35 instructions when inlined.
+static inline uint32_t hashword8(const uint64_t k64)
+{
+ uint32_t a,b,c;
+ a = b = c = 0xdeadbeef + 4;
+ b += k64 >> 32;
+ a += k64 & 0xffffffff;
+ final(a,b,c);
+ return c;
+}
+#undef final
+#undef mix
+#undef rot
+
+uint64_t APInt::getHashValue() const {
+ uint64_t hash;
+ if (isSingleWord())
+ hash = hashword8(VAL);
+ else
+ hash = hashword(pVal, getNumWords()*2);
+ return hash;
+}
+
+/// HiBits - This function returns the high "numBits" bits of this APInt.
+APInt APInt::getHiBits(unsigned numBits) const {
+ return APIntOps::lshr(*this, BitWidth - numBits);
+}
+
+/// LoBits - This function returns the low "numBits" bits of this APInt.
+APInt APInt::getLoBits(unsigned numBits) const {
+ return APIntOps::lshr(APIntOps::shl(*this, BitWidth - numBits),
+ BitWidth - numBits);
+}
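+
+// A hand-worked sketch of the two accessors above, on an 8-bit value:
+//   APInt(8, 0xAB).getHiBits(4);  // 0x0A, the high nibble shifted down
+//   APInt(8, 0xAB).getLoBits(4);  // 0x0B, the low nibble in place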
+
+bool APInt::isPowerOf2() const {
+ return (!!*this) && !(*this & (*this - APInt(BitWidth,1)));
+}
+
+unsigned APInt::countLeadingZerosSlowCase() const {
+ unsigned Count = 0;
+ for (unsigned i = getNumWords(); i > 0u; --i) {
+ if (pVal[i-1] == 0)
+ Count += APINT_BITS_PER_WORD;
+ else {
+ Count += CountLeadingZeros_64(pVal[i-1]);
+ break;
+ }
+ }
+ unsigned remainder = BitWidth % APINT_BITS_PER_WORD;
+ if (remainder)
+ Count -= APINT_BITS_PER_WORD - remainder;
+ return std::min(Count, BitWidth);
+}
+
+static unsigned countLeadingOnes_64(uint64_t V, unsigned skip) {
+ unsigned Count = 0;
+ if (skip)
+ V <<= skip;
+ while (V && (V & (1ULL << 63))) {
+ Count++;
+ V <<= 1;
+ }
+ return Count;
+}
+
+unsigned APInt::countLeadingOnes() const {
+ if (isSingleWord())
+ return countLeadingOnes_64(VAL, APINT_BITS_PER_WORD - BitWidth);
+
+ unsigned highWordBits = BitWidth % APINT_BITS_PER_WORD;
+ unsigned shift;
+ if (!highWordBits) {
+ highWordBits = APINT_BITS_PER_WORD;
+ shift = 0;
+ } else {
+ shift = APINT_BITS_PER_WORD - highWordBits;
+ }
+ int i = getNumWords() - 1;
+ unsigned Count = countLeadingOnes_64(pVal[i], shift);
+ if (Count == highWordBits) {
+ for (i--; i >= 0; --i) {
+ if (pVal[i] == -1ULL)
+ Count += APINT_BITS_PER_WORD;
+ else {
+ Count += countLeadingOnes_64(pVal[i], 0);
+ break;
+ }
+ }
+ }
+ return Count;
+}
+
+unsigned APInt::countTrailingZeros() const {
+ if (isSingleWord())
+ return std::min(unsigned(CountTrailingZeros_64(VAL)), BitWidth);
+ unsigned Count = 0;
+ unsigned i = 0;
+ for (; i < getNumWords() && pVal[i] == 0; ++i)
+ Count += APINT_BITS_PER_WORD;
+ if (i < getNumWords())
+ Count += CountTrailingZeros_64(pVal[i]);
+ return std::min(Count, BitWidth);
+}
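+
+// A usage sketch of the counting routines, with a hand-worked 16-bit value:
+//   APInt v(16, 0xFF00);
+//   v.countLeadingOnes();    // 8
+//   v.countTrailingZeros();  // 8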
+
+unsigned APInt::countTrailingOnesSlowCase() const {
+ unsigned Count = 0;
+ unsigned i = 0;
+ for (; i < getNumWords() && pVal[i] == -1ULL; ++i)
+ Count += APINT_BITS_PER_WORD;
+ if (i < getNumWords())
+ Count += CountTrailingOnes_64(pVal[i]);
+ return std::min(Count, BitWidth);
+}
+
+unsigned APInt::countPopulationSlowCase() const {
+ unsigned Count = 0;
+ for (unsigned i = 0; i < getNumWords(); ++i)
+ Count += CountPopulation_64(pVal[i]);
+ return Count;
+}
+
+APInt APInt::byteSwap() const {
+ assert(BitWidth >= 16 && BitWidth % 16 == 0 && "Cannot byteswap!");
+ if (BitWidth == 16)
+ return APInt(BitWidth, ByteSwap_16(uint16_t(VAL)));
+ else if (BitWidth == 32)
+ return APInt(BitWidth, ByteSwap_32(unsigned(VAL)));
+ else if (BitWidth == 48) {
+ unsigned Tmp1 = unsigned(VAL >> 16);
+ Tmp1 = ByteSwap_32(Tmp1);
+ uint16_t Tmp2 = uint16_t(VAL);
+ Tmp2 = ByteSwap_16(Tmp2);
+ return APInt(BitWidth, (uint64_t(Tmp2) << 32) | Tmp1);
+ } else if (BitWidth == 64)
+ return APInt(BitWidth, ByteSwap_64(VAL));
+ else {
+ APInt Result(BitWidth, 0);
+ char *pByte = (char*)Result.pVal;
+ for (unsigned i = 0; i < BitWidth / APINT_WORD_SIZE / 2; ++i) {
+ char Tmp = pByte[i];
+ pByte[i] = pByte[BitWidth / APINT_WORD_SIZE - 1 - i];
+ pByte[BitWidth / APINT_WORD_SIZE - i - 1] = Tmp;
+ }
+ return Result;
+ }
+}
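+
+// A hand-worked sketch of the 32-bit case above:
+//   APInt(32, 0x12345678).byteSwap();  // 0x78563412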
+
+APInt llvm::APIntOps::GreatestCommonDivisor(const APInt& API1,
+ const APInt& API2) {
+ APInt A = API1, B = API2;
+ while (!!B) {
+ APInt T = B;
+ B = APIntOps::urem(A, B);
+ A = T;
+ }
+ return A;
+}
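+
+// A quick sketch of the Euclidean loop above:
+//   GreatestCommonDivisor(APInt(32, 12), APInt(32, 18));  // 6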
+
+APInt llvm::APIntOps::RoundDoubleToAPInt(double Double, unsigned width) {
+ union {
+ double D;
+ uint64_t I;
+ } T;
+ T.D = Double;
+
+ // Get the sign bit from the highest order bit
+ bool isNeg = T.I >> 63;
+
+  // Get the 11-bit exponent and adjust for the 1023 exponent bias
+ int64_t exp = ((T.I >> 52) & 0x7ff) - 1023;
+
+  // If the exponent is negative, the value is < 1 so just return 0.
+ if (exp < 0)
+ return APInt(width, 0u);
+
+ // Extract the mantissa by clearing the top 12 bits (sign + exponent).
+ uint64_t mantissa = (T.I & (~0ULL >> 12)) | 1ULL << 52;
+
+ // If the exponent doesn't shift all bits out of the mantissa
+ if (exp < 52)
+ return isNeg ? -APInt(width, mantissa >> (52 - exp)) :
+ APInt(width, mantissa >> (52 - exp));
+
+  // If the client didn't provide enough bits for us to shift the mantissa
+  // into, then the result is undefined; just return 0.
+ if (width <= exp - 52)
+ return APInt(width, 0);
+
+ // Otherwise, we have to shift the mantissa bits up to the right location
+ APInt Tmp(width, mantissa);
+ Tmp = Tmp.shl((unsigned)exp - 52);
+ return isNeg ? -Tmp : Tmp;
+}
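+
+// Note that the mantissa shift above discards the fractional bits, so the
+// conversion truncates toward zero; a hand-worked sketch:
+//   RoundDoubleToAPInt(3.9, 32);   // 3
+//   RoundDoubleToAPInt(-3.9, 32);  // -3, i.e. 0xFFFFFFFD in 32 bits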
+
+/// RoundToDouble - This function converts this APInt to a double.
+/// The layout of a double is as follows (IEEE Standard 754):
+///   --------------------------------------
+///  | Sign    Exponent    Fraction    Bias |
+///  |--------------------------------------|
+///  | 1[63]   11[62-52]   52[51-00]   1023 |
+///   --------------------------------------
+double APInt::roundToDouble(bool isSigned) const {
+
+  // Handle the simple case where the value is contained in one uint64_t.
+  if (isSingleWord()) {
+    if (isSigned) {
+      int64_t sext = (int64_t(VAL) << (64-BitWidth)) >> (64-BitWidth);
+      return double(sext);
+    } else
+      return double(VAL);
+  }
+
+  // A multi-word value whose active bits fit in one word is necessarily
+  // nonnegative (its sign bit is a cleared high bit), so it converts the
+  // same way whether signed or not. Note that reading VAL here would be
+  // wrong; pVal is the active member for multi-word values.
+  if (getActiveBits() <= APINT_BITS_PER_WORD)
+    return double(pVal[0]);
+
+ // Determine if the value is negative.
+ bool isNeg = isSigned ? (*this)[BitWidth-1] : false;
+
+ // Construct the absolute value if we're negative.
+ APInt Tmp(isNeg ? -(*this) : (*this));
+
+ // Figure out how many bits we're using.
+ unsigned n = Tmp.getActiveBits();
+
+ // The exponent (without bias normalization) is just the number of bits
+ // we are using. Note that the sign bit is gone since we constructed the
+ // absolute value.
+ uint64_t exp = n;
+
+ // Return infinity for exponent overflow
+ if (exp > 1023) {
+ if (!isSigned || !isNeg)
+ return std::numeric_limits<double>::infinity();
+ else
+ return -std::numeric_limits<double>::infinity();
+ }
+ exp += 1023; // Increment for 1023 bias
+
+ // Number of bits in mantissa is 52. To obtain the mantissa value, we must
+ // extract the high 52 bits from the correct words in pVal.
+ uint64_t mantissa;
+ unsigned hiWord = whichWord(n-1);
+ if (hiWord == 0) {
+ mantissa = Tmp.pVal[0];
+ if (n > 52)
+ mantissa >>= n - 52; // shift down, we want the top 52 bits.
+ } else {
+ assert(hiWord > 0 && "huh?");
+ uint64_t hibits = Tmp.pVal[hiWord] << (52 - n % APINT_BITS_PER_WORD);
+ uint64_t lobits = Tmp.pVal[hiWord-1] >> (11 + n % APINT_BITS_PER_WORD);
+ mantissa = hibits | lobits;
+ }
+
+ // The leading bit of mantissa is implicit, so get rid of it.
+ uint64_t sign = isNeg ? (1ULL << (APINT_BITS_PER_WORD - 1)) : 0;
+ union {
+ double D;
+ uint64_t I;
+ } T;
+ T.I = sign | (exp << 52) | mantissa;
+ return T.D;
+}
+
+// Truncate to new width.
+APInt &APInt::trunc(unsigned width) {
+ assert(width < BitWidth && "Invalid APInt Truncate request");
+ assert(width && "Can't truncate to 0 bits");
+ unsigned wordsBefore = getNumWords();
+ BitWidth = width;
+ unsigned wordsAfter = getNumWords();
+ if (wordsBefore != wordsAfter) {
+ if (wordsAfter == 1) {
+ uint64_t *tmp = pVal;
+ VAL = pVal[0];
+ delete [] tmp;
+ } else {
+ uint64_t *newVal = getClearedMemory(wordsAfter);
+ for (unsigned i = 0; i < wordsAfter; ++i)
+ newVal[i] = pVal[i];
+ delete [] pVal;
+ pVal = newVal;
+ }
+ }
+ return clearUnusedBits();
+}
+
+// Sign extend to a new width.
+APInt &APInt::sext(unsigned width) {
+ assert(width > BitWidth && "Invalid APInt SignExtend request");
+ // If the sign bit isn't set, this is the same as zext.
+ if (!isNegative()) {
+ zext(width);
+ return *this;
+ }
+
+ // The sign bit is set. First, get some facts
+ unsigned wordsBefore = getNumWords();
+ unsigned wordBits = BitWidth % APINT_BITS_PER_WORD;
+ BitWidth = width;
+ unsigned wordsAfter = getNumWords();
+
+ // Mask the high order word appropriately
+ if (wordsBefore == wordsAfter) {
+ unsigned newWordBits = width % APINT_BITS_PER_WORD;
+ // The extension is contained to the wordsBefore-1th word.
+ uint64_t mask = ~0ULL;
+ if (newWordBits)
+ mask >>= APINT_BITS_PER_WORD - newWordBits;
+ mask <<= wordBits;
+ if (wordsBefore == 1)
+ VAL |= mask;
+ else
+ pVal[wordsBefore-1] |= mask;
+ return clearUnusedBits();
+ }
+
+ uint64_t mask = wordBits == 0 ? 0 : ~0ULL << wordBits;
+ uint64_t *newVal = getMemory(wordsAfter);
+ if (wordsBefore == 1)
+ newVal[0] = VAL | mask;
+ else {
+ for (unsigned i = 0; i < wordsBefore; ++i)
+ newVal[i] = pVal[i];
+ newVal[wordsBefore-1] |= mask;
+ }
+ for (unsigned i = wordsBefore; i < wordsAfter; i++)
+ newVal[i] = -1ULL;
+ if (wordsBefore != 1)
+ delete [] pVal;
+ pVal = newVal;
+ return clearUnusedBits();
+}
+
+// Zero extend to a new width.
+APInt &APInt::zext(unsigned width) {
+ assert(width > BitWidth && "Invalid APInt ZeroExtend request");
+ unsigned wordsBefore = getNumWords();
+ BitWidth = width;
+ unsigned wordsAfter = getNumWords();
+ if (wordsBefore != wordsAfter) {
+ uint64_t *newVal = getClearedMemory(wordsAfter);
+ if (wordsBefore == 1)
+ newVal[0] = VAL;
+ else
+ for (unsigned i = 0; i < wordsBefore; ++i)
+ newVal[i] = pVal[i];
+ if (wordsBefore != 1)
+ delete [] pVal;
+ pVal = newVal;
+ }
+ return *this;
+}
+
+APInt &APInt::zextOrTrunc(unsigned width) {
+ if (BitWidth < width)
+ return zext(width);
+ if (BitWidth > width)
+ return trunc(width);
+ return *this;
+}
+
+APInt &APInt::sextOrTrunc(unsigned width) {
+ if (BitWidth < width)
+ return sext(width);
+ if (BitWidth > width)
+ return trunc(width);
+ return *this;
+}
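+
+// A usage sketch of the resizing operations above, hand-worked on small
+// values:
+//   APInt(8, 0xF0).sextOrTrunc(16);    // 0xFFF0: widens, replicating the sign
+//   APInt(8, 0xF0).zextOrTrunc(16);    // 0x00F0: widens with zeros
+//   APInt(16, 0x1234).zextOrTrunc(8);  // 0x34: narrows, keeping the low bits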
+
+/// Arithmetic right-shift this APInt by shiftAmt.
+/// @brief Arithmetic right-shift function.
+APInt APInt::ashr(const APInt &shiftAmt) const {
+ return ashr((unsigned)shiftAmt.getLimitedValue(BitWidth));
+}
+
+/// Arithmetic right-shift this APInt by shiftAmt.
+/// @brief Arithmetic right-shift function.
+APInt APInt::ashr(unsigned shiftAmt) const {
+ assert(shiftAmt <= BitWidth && "Invalid shift amount");
+ // Handle a degenerate case
+ if (shiftAmt == 0)
+ return *this;
+
+ // Handle single word shifts with built-in ashr
+ if (isSingleWord()) {
+ if (shiftAmt == BitWidth)
+ return APInt(BitWidth, 0); // undefined
+ else {
+ unsigned SignBit = APINT_BITS_PER_WORD - BitWidth;
+ return APInt(BitWidth,
+ (((int64_t(VAL) << SignBit) >> SignBit) >> shiftAmt));
+ }
+ }
+
+ // If all the bits were shifted out, the result is, technically, undefined.
+ // We return -1 if it was negative, 0 otherwise. We check this early to avoid
+ // issues in the algorithm below.
+ if (shiftAmt == BitWidth) {
+ if (isNegative())
+ return APInt(BitWidth, -1ULL, true);
+ else
+ return APInt(BitWidth, 0);
+ }
+
+ // Create some space for the result.
+ uint64_t * val = new uint64_t[getNumWords()];
+
+ // Compute some values needed by the following shift algorithms
+ unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word
+ unsigned offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift
+ unsigned breakWord = getNumWords() - 1 - offset; // last word affected
+ unsigned bitsInWord = whichBit(BitWidth); // how many bits in last word?
+ if (bitsInWord == 0)
+ bitsInWord = APINT_BITS_PER_WORD;
+
+ // If we are shifting whole words, just move whole words
+ if (wordShift == 0) {
+ // Move the words containing significant bits
+ for (unsigned i = 0; i <= breakWord; ++i)
+ val[i] = pVal[i+offset]; // move whole word
+
+ // Adjust the top significant word for sign bit fill, if negative
+ if (isNegative())
+ if (bitsInWord < APINT_BITS_PER_WORD)
+ val[breakWord] |= ~0ULL << bitsInWord; // set high bits
+ } else {
+ // Shift the low order words
+ for (unsigned i = 0; i < breakWord; ++i) {
+ // This combines the shifted corresponding word with the low bits from
+ // the next word (shifted into this word's high bits).
+ val[i] = (pVal[i+offset] >> wordShift) |
+ (pVal[i+offset+1] << (APINT_BITS_PER_WORD - wordShift));
+ }
+
+ // Shift the break word. In this case there are no bits from the next word
+ // to include in this word.
+ val[breakWord] = pVal[breakWord+offset] >> wordShift;
+
+    // Deal with sign extension in the break word, and possibly the word
+    // before it.
+ if (isNegative()) {
+ if (wordShift > bitsInWord) {
+ if (breakWord > 0)
+ val[breakWord-1] |=
+ ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord));
+ val[breakWord] |= ~0ULL;
+ } else
+ val[breakWord] |= (~0ULL << (bitsInWord - wordShift));
+ }
+ }
+
+ // Remaining words are 0 or -1, just assign them.
+ uint64_t fillValue = (isNegative() ? -1ULL : 0);
+ for (unsigned i = breakWord+1; i < getNumWords(); ++i)
+ val[i] = fillValue;
+ return APInt(val, BitWidth).clearUnusedBits();
+}
+
+/// Logical right-shift this APInt by shiftAmt.
+/// @brief Logical right-shift function.
+APInt APInt::lshr(const APInt &shiftAmt) const {
+ return lshr((unsigned)shiftAmt.getLimitedValue(BitWidth));
+}
+
+/// Logical right-shift this APInt by shiftAmt.
+/// @brief Logical right-shift function.
+APInt APInt::lshr(unsigned shiftAmt) const {
+ if (isSingleWord()) {
+ if (shiftAmt == BitWidth)
+ return APInt(BitWidth, 0);
+ else
+ return APInt(BitWidth, this->VAL >> shiftAmt);
+ }
+
+ // If all the bits were shifted out, the result is 0. This avoids issues
+ // with shifting by the size of the integer type, which produces undefined
+ // results. We define these "undefined results" to always be 0.
+ if (shiftAmt == BitWidth)
+ return APInt(BitWidth, 0);
+
+ // If none of the bits are shifted out, the result is *this. This avoids
+ // issues with shifting by the size of the integer type, which produces
+ // undefined results in the code below. This is also an optimization.
+ if (shiftAmt == 0)
+ return *this;
+
+ // Create some space for the result.
+ uint64_t * val = new uint64_t[getNumWords()];
+
+ // If we are shifting less than a word, compute the shift with a simple carry
+ if (shiftAmt < APINT_BITS_PER_WORD) {
+ uint64_t carry = 0;
+ for (int i = getNumWords()-1; i >= 0; --i) {
+ val[i] = (pVal[i] >> shiftAmt) | carry;
+ carry = pVal[i] << (APINT_BITS_PER_WORD - shiftAmt);
+ }
+ return APInt(val, BitWidth).clearUnusedBits();
+ }
+
+ // Compute some values needed by the remaining shift algorithms
+ unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD;
+ unsigned offset = shiftAmt / APINT_BITS_PER_WORD;
+
+ // If we are shifting whole words, just move whole words
+ if (wordShift == 0) {
+ for (unsigned i = 0; i < getNumWords() - offset; ++i)
+ val[i] = pVal[i+offset];
+ for (unsigned i = getNumWords()-offset; i < getNumWords(); i++)
+ val[i] = 0;
+ return APInt(val,BitWidth).clearUnusedBits();
+ }
+
+ // Shift the low order words
+ unsigned breakWord = getNumWords() - offset -1;
+ for (unsigned i = 0; i < breakWord; ++i)
+ val[i] = (pVal[i+offset] >> wordShift) |
+ (pVal[i+offset+1] << (APINT_BITS_PER_WORD - wordShift));
+ // Shift the break word.
+ val[breakWord] = pVal[breakWord+offset] >> wordShift;
+
+ // Remaining words are 0
+ for (unsigned i = breakWord+1; i < getNumWords(); ++i)
+ val[i] = 0;
+ return APInt(val, BitWidth).clearUnusedBits();
+}
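+
+// A sketch contrasting the two right shifts, hand-worked on an 8-bit value:
+//   APInt v(8, 0x80);
+//   v.ashr(4);  // 0xF8: the sign bit is replicated
+//   v.lshr(4);  // 0x08: zeros are shifted in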
+
+/// Left-shift this APInt by shiftAmt.
+/// @brief Left-shift function.
+APInt APInt::shl(const APInt &shiftAmt) const {
+ // It's undefined behavior in C to shift by BitWidth or greater.
+ return shl((unsigned)shiftAmt.getLimitedValue(BitWidth));
+}
+
+APInt APInt::shlSlowCase(unsigned shiftAmt) const {
+ // If all the bits were shifted out, the result is 0. This avoids issues
+ // with shifting by the size of the integer type, which produces undefined
+ // results. We define these "undefined results" to always be 0.
+ if (shiftAmt == BitWidth)
+ return APInt(BitWidth, 0);
+
+ // If none of the bits are shifted out, the result is *this. This avoids a
+ // lshr by the words size in the loop below which can produce incorrect
+ // results. It also avoids the expensive computation below for a common case.
+ if (shiftAmt == 0)
+ return *this;
+
+ // Create some space for the result.
+ uint64_t * val = new uint64_t[getNumWords()];
+
+ // If we are shifting less than a word, do it the easy way
+ if (shiftAmt < APINT_BITS_PER_WORD) {
+ uint64_t carry = 0;
+ for (unsigned i = 0; i < getNumWords(); i++) {
+ val[i] = pVal[i] << shiftAmt | carry;
+ carry = pVal[i] >> (APINT_BITS_PER_WORD - shiftAmt);
+ }
+ return APInt(val, BitWidth).clearUnusedBits();
+ }
+
+ // Compute some values needed by the remaining shift algorithms
+ unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD;
+ unsigned offset = shiftAmt / APINT_BITS_PER_WORD;
+
+ // If we are shifting whole words, just move whole words
+ if (wordShift == 0) {
+ for (unsigned i = 0; i < offset; i++)
+ val[i] = 0;
+ for (unsigned i = offset; i < getNumWords(); i++)
+ val[i] = pVal[i-offset];
+ return APInt(val,BitWidth).clearUnusedBits();
+ }
+
+ // Copy whole words from this to Result.
+ unsigned i = getNumWords() - 1;
+ for (; i > offset; --i)
+ val[i] = pVal[i-offset] << wordShift |
+ pVal[i-offset-1] >> (APINT_BITS_PER_WORD - wordShift);
+ val[offset] = pVal[0] << wordShift;
+ for (i = 0; i < offset; ++i)
+ val[i] = 0;
+ return APInt(val, BitWidth).clearUnusedBits();
+}
+
+APInt APInt::rotl(const APInt &rotateAmt) const {
+ return rotl((unsigned)rotateAmt.getLimitedValue(BitWidth));
+}
+
+APInt APInt::rotl(unsigned rotateAmt) const {
+ if (rotateAmt == 0)
+ return *this;
+ // Don't get too fancy, just use existing shift/or facilities
+ APInt hi(*this);
+ APInt lo(*this);
+  // shl and lshr return the shifted value rather than shifting in place,
+  // so the results must be assigned back.
+  hi = hi.shl(rotateAmt);
+  lo = lo.lshr(BitWidth - rotateAmt);
+ return hi | lo;
+}
+
+APInt APInt::rotr(const APInt &rotateAmt) const {
+ return rotr((unsigned)rotateAmt.getLimitedValue(BitWidth));
+}
+
+APInt APInt::rotr(unsigned rotateAmt) const {
+ if (rotateAmt == 0)
+ return *this;
+ // Don't get too fancy, just use existing shift/or facilities
+ APInt hi(*this);
+ APInt lo(*this);
+  // As in rotl, shl and lshr return their result, so assign it back.
+  lo = lo.lshr(rotateAmt);
+  hi = hi.shl(BitWidth - rotateAmt);
+ return hi | lo;
+}
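+
+// A hand-worked sketch of both rotates on an 8-bit value:
+//   APInt(8, 0x81).rotl(1);  // 0x03
+//   APInt(8, 0x81).rotr(1);  // 0xC0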
+
+// Square Root - this method computes and returns the square root of "this".
+// Three mechanisms are used for computation. For small values (<= 5 bits),
+// a table lookup is done. This gets some performance for common cases. For
+// values using less than 52 bits, the value is converted to double and then
+// the libc sqrt function is called. The result is rounded and then converted
+// back to a uint64_t which is then used to construct the result. Finally,
+// the Babylonian method for computing square roots is used.
+APInt APInt::sqrt() const {
+
+ // Determine the magnitude of the value.
+ unsigned magnitude = getActiveBits();
+
+ // Use a fast table for some small values. This also gets rid of some
+ // rounding errors in libc sqrt for small values.
+ if (magnitude <= 5) {
+ static const uint8_t results[32] = {
+ /* 0 */ 0,
+ /* 1- 2 */ 1, 1,
+ /* 3- 6 */ 2, 2, 2, 2,
+ /* 7-12 */ 3, 3, 3, 3, 3, 3,
+ /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ /* 31 */ 6
+ };
+ return APInt(BitWidth, results[ (isSingleWord() ? VAL : pVal[0]) ]);
+ }
+
+ // If the magnitude of the value fits in less than 52 bits (the precision of
+ // an IEEE double precision floating point value), then we can use the
+ // libc sqrt function which will probably use a hardware sqrt computation.
+ // This should be faster than the algorithm below.
+ if (magnitude < 52) {
+#ifdef _MSC_VER
+ // Amazingly, VC++ doesn't have round().
+ return APInt(BitWidth,
+ uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0]))) + 0.5);
+#else
+ return APInt(BitWidth,
+ uint64_t(::round(::sqrt(double(isSingleWord()?VAL:pVal[0])))));
+#endif
+ }
+
+ // Okay, all the short cuts are exhausted. We must compute it. The following
+ // is a classical Babylonian method for computing the square root. This code
+  // was adapted to APInt from a wikipedia article on such computations.
+ // See http://www.wikipedia.org/ and go to the page named
+ // Calculate_an_integer_square_root.
+ unsigned nbits = BitWidth, i = 4;
+ APInt testy(BitWidth, 16);
+ APInt x_old(BitWidth, 1);
+ APInt x_new(BitWidth, 0);
+ APInt two(BitWidth, 2);
+
+ // Select a good starting value using binary logarithms.
+ for (;; i += 2, testy = testy.shl(2))
+ if (i >= nbits || this->ule(testy)) {
+ x_old = x_old.shl(i / 2);
+ break;
+ }
+
+ // Use the Babylonian method to arrive at the integer square root:
+ for (;;) {
+ x_new = (this->udiv(x_old) + x_old).udiv(two);
+ if (x_old.ule(x_new))
+ break;
+ x_old = x_new;
+ }
+
+ // Make sure we return the closest approximation
+ // NOTE: The rounding calculation below is correct. It will produce an
+ // off-by-one discrepancy with results from pari/gp. That discrepancy has been
+ // determined to be a rounding issue with pari/gp as it begins to use a
+ // floating point representation after 192 bits. There are no discrepancies
+ // between this algorithm and pari/gp for bit widths < 192 bits.
+ APInt square(x_old * x_old);
+ APInt nextSquare((x_old + 1) * (x_old +1));
+ if (this->ult(square))
+ return x_old;
+ else if (this->ule(nextSquare)) {
+ APInt midpoint((nextSquare - square).udiv(two));
+ APInt offset(*this - square);
+ if (offset.ult(midpoint))
+ return x_old;
+ else
+ return x_old + 1;
+ } else
+ assert(0 && "Error in APInt::sqrt computation");
+ return x_old + 1;
+}
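+
+// A usage sketch touching two of the three paths above:
+//   APInt(32, 15).sqrt();       // 4 via the table (rounded up from 3.87...)
+//   APInt(64, 1000000).sqrt();  // 1000 via libc sqrt (value fits in 52 bits)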
+
+/// Computes the multiplicative inverse of this APInt for a given modulo. The
+/// iterative extended Euclidean algorithm is used to solve for this value,
+/// however we simplify it to speed up calculating only the inverse, and take
+/// advantage of div+rem calculations. We also use some tricks to avoid copying
+/// (potentially large) APInts around.
+APInt APInt::multiplicativeInverse(const APInt& modulo) const {
+ assert(ult(modulo) && "This APInt must be smaller than the modulo");
+
+ // Using the properties listed at the following web page (accessed 06/21/08):
+ // http://www.numbertheory.org/php/euclid.html
+ // (especially the properties numbered 3, 4 and 9) it can be proved that
+ // BitWidth bits suffice for all the computations in the algorithm implemented
+ // below. More precisely, this number of bits suffice if the multiplicative
+ // inverse exists, but may not suffice for the general extended Euclidean
+ // algorithm.
+
+ APInt r[2] = { modulo, *this };
+ APInt t[2] = { APInt(BitWidth, 0), APInt(BitWidth, 1) };
+ APInt q(BitWidth, 0);
+
+ unsigned i;
+ for (i = 0; r[i^1] != 0; i ^= 1) {
+ // An overview of the math without the confusing bit-flipping:
+ // q = r[i-2] / r[i-1]
+ // r[i] = r[i-2] % r[i-1]
+ // t[i] = t[i-2] - t[i-1] * q
+ udivrem(r[i], r[i^1], q, r[i]);
+ t[i] -= t[i^1] * q;
+ }
+
+ // If this APInt and the modulo are not coprime, there is no multiplicative
+ // inverse, so return 0. We check this by looking at the next-to-last
+ // remainder, which is the gcd(*this,modulo) as calculated by the Euclidean
+ // algorithm.
+ if (r[i] != 1)
+ return APInt(BitWidth, 0);
+
+ // The next-to-last t is the multiplicative inverse. However, we are
+  // interested in a positive inverse. Calculate a positive one from a negative
+ // one if necessary. A simple addition of the modulo suffices because
+ // abs(t[i]) is known to be less than *this/2 (see the link above).
+ return t[i].isNegative() ? t[i] + modulo : t[i];
+}
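+
+// A hand-worked sketch: APInt(8, 3).multiplicativeInverse(APInt(8, 7))
+// yields 5, since 3 * 5 == 15 == 2 * 7 + 1. A non-coprime pair such as
+// 4 and 8 yields 0.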
+
+/// Calculate the magic numbers required to implement a signed integer division
+/// by a constant as a sequence of multiplies, adds and shifts. Requires that
+/// the divisor not be 0, 1, or -1. Taken from "Hacker's Delight", Henry S.
+/// Warren, Jr., chapter 10.
+APInt::ms APInt::magic() const {
+ const APInt& d = *this;
+ unsigned p;
+ APInt ad, anc, delta, q1, r1, q2, r2, t;
+ APInt allOnes = APInt::getAllOnesValue(d.getBitWidth());
+ APInt signedMin = APInt::getSignedMinValue(d.getBitWidth());
+ APInt signedMax = APInt::getSignedMaxValue(d.getBitWidth());
+ struct ms mag;
+
+ ad = d.abs();
+ t = signedMin + (d.lshr(d.getBitWidth() - 1));
+ anc = t - 1 - t.urem(ad); // absolute value of nc
+ p = d.getBitWidth() - 1; // initialize p
+ q1 = signedMin.udiv(anc); // initialize q1 = 2p/abs(nc)
+ r1 = signedMin - q1*anc; // initialize r1 = rem(2p,abs(nc))
+ q2 = signedMin.udiv(ad); // initialize q2 = 2p/abs(d)
+ r2 = signedMin - q2*ad; // initialize r2 = rem(2p,abs(d))
+ do {
+ p = p + 1;
+ q1 = q1<<1; // update q1 = 2p/abs(nc)
+ r1 = r1<<1; // update r1 = rem(2p/abs(nc))
+ if (r1.uge(anc)) { // must be unsigned comparison
+ q1 = q1 + 1;
+ r1 = r1 - anc;
+ }
+ q2 = q2<<1; // update q2 = 2p/abs(d)
+ r2 = r2<<1; // update r2 = rem(2p/abs(d))
+ if (r2.uge(ad)) { // must be unsigned comparison
+ q2 = q2 + 1;
+ r2 = r2 - ad;
+ }
+ delta = ad - r2;
+ } while (q1.ule(delta) || (q1 == delta && r1 == 0));
+
+ mag.m = q2 + 1;
+ if (d.isNegative()) mag.m = -mag.m; // resulting magic number
+ mag.s = p - d.getBitWidth(); // resulting shift
+ return mag;
+}
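+
+// For example, for a signed 32-bit divide by 7 this produces the familiar
+// constants from the Hacker's Delight tables: m == 0x92492493 and s == 2.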
+
+/// Calculate the magic numbers required to implement an unsigned integer
+/// division by a constant as a sequence of multiplies, adds and shifts.
+/// Requires that the divisor not be 0. Taken from "Hacker's Delight", Henry
+/// S. Warren, Jr., chapter 10.
+APInt::mu APInt::magicu() const {
+ const APInt& d = *this;
+ unsigned p;
+ APInt nc, delta, q1, r1, q2, r2;
+ struct mu magu;
+ magu.a = 0; // initialize "add" indicator
+ APInt allOnes = APInt::getAllOnesValue(d.getBitWidth());
+ APInt signedMin = APInt::getSignedMinValue(d.getBitWidth());
+ APInt signedMax = APInt::getSignedMaxValue(d.getBitWidth());
+
+ nc = allOnes - (-d).urem(d);
+ p = d.getBitWidth() - 1; // initialize p
+ q1 = signedMin.udiv(nc); // initialize q1 = 2p/nc
+ r1 = signedMin - q1*nc; // initialize r1 = rem(2p,nc)
+ q2 = signedMax.udiv(d); // initialize q2 = (2p-1)/d
+ r2 = signedMax - q2*d; // initialize r2 = rem((2p-1),d)
+ do {
+ p = p + 1;
+ if (r1.uge(nc - r1)) {
+ q1 = q1 + q1 + 1; // update q1
+ r1 = r1 + r1 - nc; // update r1
+ }
+ else {
+ q1 = q1+q1; // update q1
+ r1 = r1+r1; // update r1
+ }
+ if ((r2 + 1).uge(d - r2)) {
+ if (q2.uge(signedMax)) magu.a = 1;
+ q2 = q2+q2 + 1; // update q2
+ r2 = r2+r2 + 1 - d; // update r2
+ }
+ else {
+ if (q2.uge(signedMin)) magu.a = 1;
+ q2 = q2+q2; // update q2
+ r2 = r2+r2 + 1; // update r2
+ }
+ delta = d - 1 - r2;
+ } while (p < d.getBitWidth()*2 &&
+ (q1.ult(delta) || (q1 == delta && r1 == 0)));
+ magu.m = q2 + 1; // resulting magic number
+ magu.s = p - d.getBitWidth(); // resulting shift
+ return magu;
+}
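+
+// For example, for an unsigned 32-bit divide by 7 this produces the
+// constants tabulated in Hacker's Delight: m == 0x24924925, a == 1, s == 3.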
+
+/// Implementation of Knuth's Algorithm D (Division of nonnegative integers)
+/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
+/// variables here have the same names as in the algorithm. Comments explain
+/// the algorithm and any deviation from it.
+static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
+ unsigned m, unsigned n) {
+ assert(u && "Must provide dividend");
+ assert(v && "Must provide divisor");
+ assert(q && "Must provide quotient");
+  assert(u != v && u != q && v != q && "Must use different memory");
+ assert(n>1 && "n must be > 1");
+
+  // Knuth uses the value b as the base of the number system. In our case b
+  // is 2^32, so we set it with a 64-bit shift.
+ uint64_t b = uint64_t(1) << 32;
+
+#if 0
+ DEBUG(cerr << "KnuthDiv: m=" << m << " n=" << n << '\n');
+ DEBUG(cerr << "KnuthDiv: original:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << u[i]);
+ DEBUG(cerr << " by");
+ DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << v[i-1]);
+ DEBUG(cerr << '\n');
+#endif
+ // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of
+ // u and v by d. Note that we have taken Knuth's advice here to use a power
+ // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of
+ // 2 allows us to shift instead of multiply and it is easy to determine the
+ // shift amount from the leading zeros. We are basically normalizing the u
+ // and v so that its high bits are shifted to the top of v's range without
+ // overflow. Note that this can require an extra word in u so that u must
+ // be of length m+n+1.
+ unsigned shift = CountLeadingZeros_32(v[n-1]);
+ unsigned v_carry = 0;
+ unsigned u_carry = 0;
+ if (shift) {
+ for (unsigned i = 0; i < m+n; ++i) {
+ unsigned u_tmp = u[i] >> (32 - shift);
+ u[i] = (u[i] << shift) | u_carry;
+ u_carry = u_tmp;
+ }
+ for (unsigned i = 0; i < n; ++i) {
+ unsigned v_tmp = v[i] >> (32 - shift);
+ v[i] = (v[i] << shift) | v_carry;
+ v_carry = v_tmp;
+ }
+ }
+ u[m+n] = u_carry;
+#if 0
+ DEBUG(cerr << "KnuthDiv: normal:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << u[i]);
+ DEBUG(cerr << " by");
+ DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << v[i-1]);
+ DEBUG(cerr << '\n');
+#endif
+
+ // D2. [Initialize j.] Set j to m. This is the loop counter over the places.
+ int j = m;
+ do {
+ DEBUG(cerr << "KnuthDiv: quotient digit #" << j << '\n');
+ // D3. [Calculate q'.].
+ // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
+ // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
+ // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease
+  // qp by 1, increase rp by v[n-1], and repeat this test if rp < b. The test
+ // on v[n-2] determines at high speed most of the cases in which the trial
+ // value qp is one too large, and it eliminates all cases where qp is two
+ // too large.
+ uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]);
+ DEBUG(cerr << "KnuthDiv: dividend == " << dividend << '\n');
+ uint64_t qp = dividend / v[n-1];
+ uint64_t rp = dividend % v[n-1];
+ if (qp == b || qp*v[n-2] > b*rp + u[j+n-2]) {
+ qp--;
+ rp += v[n-1];
+ if (rp < b && (qp == b || qp*v[n-2] > b*rp + u[j+n-2]))
+ qp--;
+ }
+ DEBUG(cerr << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n');
+
+ // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with
+ // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation
+ // consists of a simple multiplication by a one-place number, combined with
+ // a subtraction.
+ bool isNeg = false;
+ for (unsigned i = 0; i < n; ++i) {
+ uint64_t u_tmp = uint64_t(u[j+i]) | (uint64_t(u[j+i+1]) << 32);
+ uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]);
+ bool borrow = subtrahend > u_tmp;
+ DEBUG(cerr << "KnuthDiv: u_tmp == " << u_tmp
+ << ", subtrahend == " << subtrahend
+ << ", borrow = " << borrow << '\n');
+
+ uint64_t result = u_tmp - subtrahend;
+ unsigned k = j + i;
+ u[k++] = (unsigned)(result & (b-1)); // subtract low word
+ u[k++] = (unsigned)(result >> 32); // subtract high word
+ while (borrow && k <= m+n) { // deal with borrow to the left
+ borrow = u[k] == 0;
+ u[k]--;
+ k++;
+ }
+ isNeg |= borrow;
+ DEBUG(cerr << "KnuthDiv: u[j+i] == " << u[j+i] << ", u[j+i+1] == " <<
+ u[j+i+1] << '\n');
+ }
+ DEBUG(cerr << "KnuthDiv: after subtraction:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]);
+ DEBUG(cerr << '\n');
+ // The digits (u[j+n]...u[j]) should be kept positive; if the result of
+ // this step is actually negative, (u[j+n]...u[j]) should be left as the
+ // true value plus b**(n+1), namely as the b's complement of
+ // the true value, and a "borrow" to the left should be remembered.
+ //
+ if (isNeg) {
+ bool carry = true; // true because b's complement is "complement + 1"
+ for (unsigned i = 0; i <= m+n; ++i) {
+ u[i] = ~u[i] + carry; // b's complement
+ carry = carry && u[i] == 0;
+ }
+ }
+ DEBUG(cerr << "KnuthDiv: after complement:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]);
+ DEBUG(cerr << '\n');
+
+ // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was
+ // negative, go to step D6; otherwise go on to step D7.
+ q[j] = (unsigned)qp;
+ if (isNeg) {
+ // D6. [Add back]. The probability that this step is necessary is very
+ // small, on the order of only 2/b. Make sure that test data accounts for
+ // this possibility. Decrease q[j] by 1
+ q[j]--;
+ // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]).
+ // A carry will occur to the left of u[j+n], and it should be ignored
+ // since it cancels with the borrow that occurred in D4.
+ bool carry = false;
+ for (unsigned i = 0; i < n; i++) {
+ unsigned limit = std::min(u[j+i],v[i]);
+ u[j+i] += v[i] + carry;
+ carry = u[j+i] < limit || (carry && u[j+i] == limit);
+ }
+ u[j+n] += carry;
+ }
+ DEBUG(cerr << "KnuthDiv: after correction:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr <<" " << u[i]);
+ DEBUG(cerr << "\nKnuthDiv: digit result = " << q[j] << '\n');
+
+ // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3.
+ } while (--j >= 0);
+
+ DEBUG(cerr << "KnuthDiv: quotient:");
+ DEBUG(for (int i = m; i >=0; i--) cerr <<" " << q[i]);
+ DEBUG(cerr << '\n');
+
+ // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired
+ // remainder may be obtained by dividing u[...] by d. If r is non-null we
+ // compute the remainder (urem uses this).
+ if (r) {
+    // The value d is expressed by the "shift" value above since we avoided
+    // multiplication by d by using a shift left. So, all we have to do is
+    // shift right here.
+ if (shift) {
+ unsigned carry = 0;
+ DEBUG(cerr << "KnuthDiv: remainder:");
+ for (int i = n-1; i >= 0; i--) {
+ r[i] = (u[i] >> shift) | carry;
+ carry = u[i] << (32 - shift);
+ DEBUG(cerr << " " << r[i]);
+ }
+ } else {
+ for (int i = n-1; i >= 0; i--) {
+ r[i] = u[i];
+ DEBUG(cerr << " " << r[i]);
+ }
+ }
+ DEBUG(cerr << '\n');
+ }
+#if 0
+ DEBUG(cerr << std::setbase(10) << '\n');
+#endif
+}
+
+void APInt::divide(const APInt LHS, unsigned lhsWords,
+ const APInt &RHS, unsigned rhsWords,
+ APInt *Quotient, APInt *Remainder)
+{
+ assert(lhsWords >= rhsWords && "Fractional result");
+
+  // First, compose the values into an array of 32-bit words instead of
+  // 64-bit words. This is a necessity of both the "short division" algorithm
+  // and the Knuth "classical algorithm" which requires there to be native
+  // operations for +, -, and * on an m bit value with an m*2 bit result. We
+  // can't use 64-bit operands here because we don't have native results of
+  // 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
+  // work on big-endian machines.
+ uint64_t mask = ~0ull >> (sizeof(unsigned)*CHAR_BIT);
+ unsigned n = rhsWords * 2;
+ unsigned m = (lhsWords * 2) - n;
+
+ // Allocate space for the temporary values we need either on the stack, if
+ // it will fit, or on the heap if it won't.
+ unsigned SPACE[128];
+ unsigned *U = 0;
+ unsigned *V = 0;
+ unsigned *Q = 0;
+ unsigned *R = 0;
+ if ((Remainder?4:3)*n+2*m+1 <= 128) {
+ U = &SPACE[0];
+ V = &SPACE[m+n+1];
+ Q = &SPACE[(m+n+1) + n];
+ if (Remainder)
+ R = &SPACE[(m+n+1) + n + (m+n)];
+ } else {
+ U = new unsigned[m + n + 1];
+ V = new unsigned[n];
+ Q = new unsigned[m+n];
+ if (Remainder)
+ R = new unsigned[n];
+ }
+
+ // Initialize the dividend
+ memset(U, 0, (m+n+1)*sizeof(unsigned));
+ for (unsigned i = 0; i < lhsWords; ++i) {
+ uint64_t tmp = (LHS.getNumWords() == 1 ? LHS.VAL : LHS.pVal[i]);
+ U[i * 2] = (unsigned)(tmp & mask);
+ U[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ }
+ U[m+n] = 0; // this extra word is for "spill" in the Knuth algorithm.
+
+ // Initialize the divisor
+ memset(V, 0, (n)*sizeof(unsigned));
+ for (unsigned i = 0; i < rhsWords; ++i) {
+ uint64_t tmp = (RHS.getNumWords() == 1 ? RHS.VAL : RHS.pVal[i]);
+ V[i * 2] = (unsigned)(tmp & mask);
+ V[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ }
+
+ // initialize the quotient and remainder
+ memset(Q, 0, (m+n) * sizeof(unsigned));
+ if (Remainder)
+ memset(R, 0, n * sizeof(unsigned));
+
+ // Now, adjust m and n for the Knuth division. n is the number of words in
+ // the divisor. m is the number of words by which the dividend exceeds the
+ // divisor (i.e. m+n is the length of the dividend). These sizes must not
+ // contain any zero words or the Knuth algorithm fails.
+ for (unsigned i = n; i > 0 && V[i-1] == 0; i--) {
+ n--;
+ m++;
+ }
+ for (unsigned i = m+n; i > 0 && U[i-1] == 0; i--)
+ m--;
+
+ // If we're left with only a single word for the divisor, Knuth doesn't work
+ // so we implement the short division algorithm here. This is much simpler
+ // and faster because we are certain that we can divide a 64-bit quantity
+ // by a 32-bit quantity at hardware speed and short division is simply a
+ // series of such operations. This is just like doing short division but we
+ // are using base 2^32 instead of base 10.
+ assert(n != 0 && "Divide by zero?");
+ if (n == 1) {
+ unsigned divisor = V[0];
+ unsigned remainder = 0;
+ for (int i = m+n-1; i >= 0; i--) {
+ uint64_t partial_dividend = uint64_t(remainder) << 32 | U[i];
+ if (partial_dividend == 0) {
+ Q[i] = 0;
+ remainder = 0;
+ } else if (partial_dividend < divisor) {
+ Q[i] = 0;
+ remainder = (unsigned)partial_dividend;
+ } else if (partial_dividend == divisor) {
+ Q[i] = 1;
+ remainder = 0;
+ } else {
+ Q[i] = (unsigned)(partial_dividend / divisor);
+ remainder = (unsigned)(partial_dividend - (Q[i] * divisor));
+ }
+ }
+ if (R)
+ R[0] = remainder;
+ } else {
+ // Now we're ready to invoke the Knuth classical divide algorithm. In this
+ // case n > 1.
+ KnuthDiv(U, V, Q, R, m, n);
+ }
+
+ // If the caller wants the quotient
+ if (Quotient) {
+ // Set up the Quotient value's memory.
+ if (Quotient->BitWidth != LHS.BitWidth) {
+ if (Quotient->isSingleWord())
+ Quotient->VAL = 0;
+ else
+ delete [] Quotient->pVal;
+ Quotient->BitWidth = LHS.BitWidth;
+ if (!Quotient->isSingleWord())
+ Quotient->pVal = getClearedMemory(Quotient->getNumWords());
+ } else
+ Quotient->clear();
+
+ // The quotient is in Q. Reconstitute the quotient into Quotient's low
+ // order words.
+ if (lhsWords == 1) {
+ uint64_t tmp =
+ uint64_t(Q[0]) | (uint64_t(Q[1]) << (APINT_BITS_PER_WORD / 2));
+ if (Quotient->isSingleWord())
+ Quotient->VAL = tmp;
+ else
+ Quotient->pVal[0] = tmp;
+ } else {
+ assert(!Quotient->isSingleWord() && "Quotient APInt not large enough");
+ for (unsigned i = 0; i < lhsWords; ++i)
+ Quotient->pVal[i] =
+ uint64_t(Q[i*2]) | (uint64_t(Q[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+ }
+ }
+
+ // If the caller wants the remainder
+ if (Remainder) {
+ // Set up the Remainder value's memory.
+ if (Remainder->BitWidth != RHS.BitWidth) {
+ if (Remainder->isSingleWord())
+ Remainder->VAL = 0;
+ else
+ delete [] Remainder->pVal;
+ Remainder->BitWidth = RHS.BitWidth;
+ if (!Remainder->isSingleWord())
+ Remainder->pVal = getClearedMemory(Remainder->getNumWords());
+ } else
+ Remainder->clear();
+
+ // The remainder is in R. Reconstitute the remainder into Remainder's low
+ // order words.
+ if (rhsWords == 1) {
+ uint64_t tmp =
+ uint64_t(R[0]) | (uint64_t(R[1]) << (APINT_BITS_PER_WORD / 2));
+ if (Remainder->isSingleWord())
+ Remainder->VAL = tmp;
+ else
+ Remainder->pVal[0] = tmp;
+ } else {
+ assert(!Remainder->isSingleWord() && "Remainder APInt not large enough");
+ for (unsigned i = 0; i < rhsWords; ++i)
+ Remainder->pVal[i] =
+ uint64_t(R[i*2]) | (uint64_t(R[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+ }
+ }
+
+ // Clean up the memory we allocated.
+ if (U != &SPACE[0]) {
+ delete [] U;
+ delete [] V;
+ delete [] Q;
+ delete [] R;
+ }
+}
+
+APInt APInt::udiv(const APInt& RHS) const {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+
+ // First, deal with the easy case
+ if (isSingleWord()) {
+ assert(RHS.VAL != 0 && "Divide by zero?");
+ return APInt(BitWidth, VAL / RHS.VAL);
+ }
+
+ // Get some facts about the LHS and RHS number of bits and words
+ unsigned rhsBits = RHS.getActiveBits();
+ unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ assert(rhsWords && "Divided by zero???");
+ unsigned lhsBits = this->getActiveBits();
+ unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+
+ // Deal with some degenerate cases
+ if (!lhsWords)
+ // 0 / X ===> 0
+ return APInt(BitWidth, 0);
+ else if (lhsWords < rhsWords || this->ult(RHS)) {
+ // X / Y ===> 0, iff X < Y
+ return APInt(BitWidth, 0);
+ } else if (*this == RHS) {
+ // X / X ===> 1
+ return APInt(BitWidth, 1);
+ } else if (lhsWords == 1 && rhsWords == 1) {
+ // All high words are zero, just use native divide
+ return APInt(BitWidth, this->pVal[0] / RHS.pVal[0]);
+ }
+
+ // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+ APInt Quotient(1,0); // to hold result.
+ divide(*this, lhsWords, RHS, rhsWords, &Quotient, 0);
+ return Quotient;
+}
+
+APInt APInt::urem(const APInt& RHS) const {
+ assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ if (isSingleWord()) {
+ assert(RHS.VAL != 0 && "Remainder by zero?");
+ return APInt(BitWidth, VAL % RHS.VAL);
+ }
+
+ // Get some facts about the LHS
+ unsigned lhsBits = getActiveBits();
+ unsigned lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
+
+ // Get some facts about the RHS
+ unsigned rhsBits = RHS.getActiveBits();
+ unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ assert(rhsWords && "Performing remainder operation by zero ???");
+
+ // Check the degenerate cases
+ if (lhsWords == 0) {
+ // 0 % Y ===> 0
+ return APInt(BitWidth, 0);
+ } else if (lhsWords < rhsWords || this->ult(RHS)) {
+ // X % Y ===> X, iff X < Y
+ return *this;
+ } else if (*this == RHS) {
+    // X % X ===> 0
+ return APInt(BitWidth, 0);
+ } else if (lhsWords == 1) {
+ // All high words are zero, just use native remainder
+ return APInt(BitWidth, pVal[0] % RHS.pVal[0]);
+ }
+
+ // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+ APInt Remainder(1,0);
+ divide(*this, lhsWords, RHS, rhsWords, 0, &Remainder);
+ return Remainder;
+}
+
+void APInt::udivrem(const APInt &LHS, const APInt &RHS,
+ APInt &Quotient, APInt &Remainder) {
+ // Get some size facts about the dividend and divisor
+ unsigned lhsBits = LHS.getActiveBits();
+ unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+ unsigned rhsBits = RHS.getActiveBits();
+ unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+
+ // Check the degenerate cases
+ if (lhsWords == 0) {
+ Quotient = 0; // 0 / Y ===> 0
+ Remainder = 0; // 0 % Y ===> 0
+ return;
+ }
+
+ if (lhsWords < rhsWords || LHS.ult(RHS)) {
+ Quotient = 0; // X / Y ===> 0, iff X < Y
+ Remainder = LHS; // X % Y ===> X, iff X < Y
+ return;
+ }
+
+ if (LHS == RHS) {
+ Quotient = 1; // X / X ===> 1
+ Remainder = 0; // X % X ===> 0;
+ return;
+ }
+
+ if (lhsWords == 1 && rhsWords == 1) {
+ // There is only one word to consider so use the native versions.
+ uint64_t lhsValue = LHS.isSingleWord() ? LHS.VAL : LHS.pVal[0];
+ uint64_t rhsValue = RHS.isSingleWord() ? RHS.VAL : RHS.pVal[0];
+ Quotient = APInt(LHS.getBitWidth(), lhsValue / rhsValue);
+ Remainder = APInt(LHS.getBitWidth(), lhsValue % rhsValue);
+ return;
+ }
+
+ // Okay, lets do it the long way
+ divide(LHS, lhsWords, RHS, rhsWords, &Quotient, &Remainder);
+}
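+
+// A usage sketch of the combined form (hand-worked, single-word case):
+//   APInt Q(64, 0), R(64, 0);
+//   APInt::udivrem(APInt(64, 100), APInt(64, 7), Q, R);  // Q == 14, R == 2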
+
+void APInt::fromString(unsigned numbits, const char *str, unsigned slen,
+ uint8_t radix) {
+ // Check our assumptions here
+ assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+ "Radix should be 2, 8, 10, or 16!");
+ assert(str && "String is null?");
+ bool isNeg = str[0] == '-';
+ if (isNeg)
+ str++, slen--;
+ assert((slen <= numbits || radix != 2) && "Insufficient bit width");
+ assert(((slen-1)*3 <= numbits || radix != 8) && "Insufficient bit width");
+ assert(((slen-1)*4 <= numbits || radix != 16) && "Insufficient bit width");
+ assert((((slen-1)*64)/22 <= numbits || radix != 10) && "Insufficient bit width");
+
+ // Allocate memory
+ if (!isSingleWord())
+ pVal = getClearedMemory(getNumWords());
+
+ // Figure out if we can shift instead of multiply
+ unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
+
+ // Set up an APInt for the digit to add outside the loop so we don't
+ // constantly construct/destruct it.
+ APInt apdigit(getBitWidth(), 0);
+ APInt apradix(getBitWidth(), radix);
+
+ // Enter digit traversal loop
+ for (unsigned i = 0; i < slen; i++) {
+ // Get a digit
+ unsigned digit = 0;
+ char cdigit = str[i];
+ if (radix == 16) {
+ if (!isxdigit(cdigit))
+ assert(0 && "Invalid hex digit in string");
+ if (isdigit(cdigit))
+ digit = cdigit - '0';
+ else if (cdigit >= 'a')
+ digit = cdigit - 'a' + 10;
+ else if (cdigit >= 'A')
+ digit = cdigit - 'A' + 10;
+ else
+ assert(0 && "huh? we shouldn't get here");
+ } else if (isdigit(cdigit)) {
+ digit = cdigit - '0';
+ assert((radix == 10 ||
+ (radix == 8 && digit != 8 && digit != 9) ||
+ (radix == 2 && (digit == 0 || digit == 1))) &&
+ "Invalid digit in string for given radix");
+ } else {
+ assert(0 && "Invalid character in digit string");
+ }
+
+ // Shift or multiply the value by the radix
+ if (slen > 1) {
+ if (shift)
+ *this <<= shift;
+ else
+ *this *= apradix;
+ }
+
+ // Add in the digit we just interpreted
+ if (apdigit.isSingleWord())
+ apdigit.VAL = digit;
+ else
+ apdigit.pVal[0] = digit;
+ *this += apdigit;
+ }
+  // If it's negative, put it in two's complement form
+ if (isNeg) {
+ (*this)--;
+ this->flip();
+ }
+}
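+
+// A hand-worked sketch, via the constructor that calls fromString:
+//   APInt(16, "-42", 3, 10);  // 0xFFD6, i.e. -42 in 16-bit two's complement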
+
+void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
+ bool Signed) const {
+ assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2) &&
+ "Radix should be 2, 8, 10, or 16!");
+
+ // First, check for a zero value and just short circuit the logic below.
+ if (*this == 0) {
+ Str.push_back('0');
+ return;
+ }
+
+ static const char Digits[] = "0123456789ABCDEF";
+
+ if (isSingleWord()) {
+ char Buffer[65];
+ char *BufPtr = Buffer+65;
+
+ uint64_t N;
+ if (Signed) {
+ int64_t I = getSExtValue();
+ if (I < 0) {
+ Str.push_back('-');
+ I = -I;
+ }
+ N = I;
+ } else {
+ N = getZExtValue();
+ }
+
+ while (N) {
+ *--BufPtr = Digits[N % Radix];
+ N /= Radix;
+ }
+ Str.append(BufPtr, Buffer+65);
+ return;
+ }
+
+ APInt Tmp(*this);
+
+ if (Signed && isNegative()) {
+ // They want to print the signed version and it is a negative value
+ // Flip the bits and add one to turn it into the equivalent positive
+ // value and put a '-' in the result.
+ Tmp.flip();
+ Tmp++;
+ Str.push_back('-');
+ }
+
+ // We insert the digits backward, then reverse them to get the right order.
+ unsigned StartDig = Str.size();
+
+  // For the 2, 8 and 16 bit cases, we can just shift instead of divide
+  // because each digit corresponds to a fixed number of bits (1, 3 and 4
+  // respectively). We just shift until the value is zero.
+ if (Radix != 10) {
+ // Just shift tmp right for each digit width until it becomes zero
+ unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
+ unsigned MaskAmt = Radix - 1;
+
+ while (Tmp != 0) {
+ unsigned Digit = unsigned(Tmp.getRawData()[0]) & MaskAmt;
+ Str.push_back(Digits[Digit]);
+ Tmp = Tmp.lshr(ShiftAmt);
+ }
+ } else {
+ APInt divisor(4, 10);
+ while (Tmp != 0) {
+ APInt APdigit(1, 0);
+ APInt tmp2(Tmp.getBitWidth(), 0);
+ divide(Tmp, Tmp.getNumWords(), divisor, divisor.getNumWords(), &tmp2,
+ &APdigit);
+ unsigned Digit = (unsigned)APdigit.getZExtValue();
+ assert(Digit < Radix && "divide failed");
+ Str.push_back(Digits[Digit]);
+ Tmp = tmp2;
+ }
+ }
+
+ // Reverse the digits before returning.
+ std::reverse(Str.begin()+StartDig, Str.end());
+}
+
+/// toString - This returns the APInt as a std::string. Note that this is an
+/// inefficient method. It is better to pass in a SmallVector/SmallString
+/// to the methods above.
+std::string APInt::toString(unsigned Radix, bool Signed) const {
+ SmallString<40> S;
+ toString(S, Radix, Signed);
+ return S.c_str();
+}
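+
+// A usage sketch of the conversions above, hand-worked on a 16-bit value:
+//   APInt(16, 0xFFD6).toString(10, true);   // "-42"
+//   APInt(16, 0xFFD6).toString(16, false);  // "FFD6"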
+
+
+void APInt::dump() const {
+ SmallString<40> S, U;
+ this->toStringUnsigned(U);
+ this->toStringSigned(S);
+ fprintf(stderr, "APInt(%db, %su %ss)", BitWidth, U.c_str(), S.c_str());
+}
+
+void APInt::print(raw_ostream &OS, bool isSigned) const {
+ SmallString<40> S;
+ this->toString(S, 10, isSigned);
+ OS << S.c_str();
+}
+
+// This implements a variety of operations on a representation of
+// arbitrary precision, two's-complement, bignum integer values.
+
+/* Assumed by lowHalf, highHalf, partMSB and partLSB. A fairly safe
+ and unrestricting assumption. */
+#define COMPILE_TIME_ASSERT(cond) extern int CTAssert[(cond) ? 1 : -1]
+COMPILE_TIME_ASSERT(integerPartWidth % 2 == 0);
+
+/* Some handy functions local to this file. */
+namespace {
+
+ /* Returns the integer part with the least significant BITS set.
+ BITS cannot be zero. */
+ static inline integerPart
+ lowBitMask(unsigned int bits)
+ {
+ assert (bits != 0 && bits <= integerPartWidth);
+
+ return ~(integerPart) 0 >> (integerPartWidth - bits);
+ }
+
+ /* Returns the value of the lower half of PART. */
+ static inline integerPart
+ lowHalf(integerPart part)
+ {
+ return part & lowBitMask(integerPartWidth / 2);
+ }
+
+ /* Returns the value of the upper half of PART. */
+ static inline integerPart
+ highHalf(integerPart part)
+ {
+ return part >> (integerPartWidth / 2);
+ }
+
+ /* Returns the bit number of the most significant set bit of a part.
+ If the input number has no bits set -1U is returned. */
+ static unsigned int
+ partMSB(integerPart value)
+ {
+ unsigned int n, msb;
+
+ if (value == 0)
+ return -1U;
+
+ n = integerPartWidth / 2;
+
+ msb = 0;
+ do {
+ if (value >> n) {
+ value >>= n;
+ msb += n;
+ }
+
+ n >>= 1;
+ } while (n);
+
+ return msb;
+ }
+
+ /* Returns the bit number of the least significant set bit of a
+ part. If the input number has no bits set -1U is returned. */
+ static unsigned int
+ partLSB(integerPart value)
+ {
+ unsigned int n, lsb;
+
+ if (value == 0)
+ return -1U;
+
+ lsb = integerPartWidth - 1;
+ n = integerPartWidth / 2;
+
+ do {
+ if (value << n) {
+ value <<= n;
+ lsb -= n;
+ }
+
+ n >>= 1;
+ } while (n);
+
+ return lsb;
+ }
+}
+
+/* Sets the least significant part of a bignum to the input value, and
+ zeroes out higher parts. */
+void
+APInt::tcSet(integerPart *dst, integerPart part, unsigned int parts)
+{
+ unsigned int i;
+
+ assert (parts > 0);
+
+ dst[0] = part;
+ for(i = 1; i < parts; i++)
+ dst[i] = 0;
+}
+
+/* Assign one bignum to another. */
+void
+APInt::tcAssign(integerPart *dst, const integerPart *src, unsigned int parts)
+{
+ unsigned int i;
+
+ for(i = 0; i < parts; i++)
+ dst[i] = src[i];
+}
+
+/* Returns true if a bignum is zero, false otherwise. */
+bool
+APInt::tcIsZero(const integerPart *src, unsigned int parts)
+{
+ unsigned int i;
+
+ for(i = 0; i < parts; i++)
+ if (src[i])
+ return false;
+
+ return true;
+}
+
+/* Extract the given bit of a bignum; returns 0 or 1. */
+int
+APInt::tcExtractBit(const integerPart *parts, unsigned int bit)
+{
+ return(parts[bit / integerPartWidth]
+ & ((integerPart) 1 << bit % integerPartWidth)) != 0;
+}
+
+/* Set the given bit of a bignum. */
+void
+APInt::tcSetBit(integerPart *parts, unsigned int bit)
+{
+ parts[bit / integerPartWidth] |= (integerPart) 1 << (bit % integerPartWidth);
+}
+
+/* Returns the bit number of the least significant set bit of a
+ number. If the input number has no bits set -1U is returned. */
+unsigned int
+APInt::tcLSB(const integerPart *parts, unsigned int n)
+{
+ unsigned int i, lsb;
+
+ for(i = 0; i < n; i++) {
+ if (parts[i] != 0) {
+ lsb = partLSB(parts[i]);
+
+ return lsb + i * integerPartWidth;
+ }
+ }
+
+ return -1U;
+}
+
+/* Returns the bit number of the most significant set bit of a number.
+ If the input number has no bits set -1U is returned. */
+unsigned int
+APInt::tcMSB(const integerPart *parts, unsigned int n)
+{
+ unsigned int msb;
+
+ do {
+ --n;
+
+ if (parts[n] != 0) {
+ msb = partMSB(parts[n]);
+
+ return msb + n * integerPartWidth;
+ }
+ } while (n);
+
+ return -1U;
+}
+
+/* Copy the bit vector of width srcBITS from SRC, starting at bit
+ srcLSB, to DST, of dstCOUNT parts, such that the bit srcLSB becomes
+ the least significant bit of DST. All high bits above srcBITS in
+ DST are zero-filled. */
+void
+APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src,
+ unsigned int srcBits, unsigned int srcLSB)
+{
+ unsigned int firstSrcPart, dstParts, shift, n;
+
+ dstParts = (srcBits + integerPartWidth - 1) / integerPartWidth;
+ assert (dstParts <= dstCount);
+
+ firstSrcPart = srcLSB / integerPartWidth;
+ tcAssign (dst, src + firstSrcPart, dstParts);
+
+ shift = srcLSB % integerPartWidth;
+ tcShiftRight (dst, dstParts, shift);
+
+ /* We now have (dstParts * integerPartWidth - shift) bits from SRC
+     in DST. If this is less than srcBits, append the rest, else
+ clear the high bits. */
+ n = dstParts * integerPartWidth - shift;
+ if (n < srcBits) {
+ integerPart mask = lowBitMask (srcBits - n);
+ dst[dstParts - 1] |= ((src[firstSrcPart + dstParts] & mask)
+ << n % integerPartWidth);
+ } else if (n > srcBits) {
+ if (srcBits % integerPartWidth)
+ dst[dstParts - 1] &= lowBitMask (srcBits % integerPartWidth);
+ }
+
+ /* Clear high parts. */
+ while (dstParts < dstCount)
+ dst[dstParts++] = 0;
+}
+
+/* DST += RHS + C where C is zero or one. Returns the carry flag. */
+integerPart
+APInt::tcAdd(integerPart *dst, const integerPart *rhs,
+ integerPart c, unsigned int parts)
+{
+ unsigned int i;
+
+ assert(c <= 1);
+
+ for(i = 0; i < parts; i++) {
+ integerPart l;
+
+ l = dst[i];
+ if (c) {
+ dst[i] += rhs[i] + 1;
+ c = (dst[i] <= l);
+ } else {
+ dst[i] += rhs[i];
+ c = (dst[i] < l);
+ }
+ }
+
+ return c;
+}
+
+/* DST -= RHS + C where C is zero or one. Returns the carry flag. */
+integerPart
+APInt::tcSubtract(integerPart *dst, const integerPart *rhs,
+ integerPart c, unsigned int parts)
+{
+ unsigned int i;
+
+ assert(c <= 1);
+
+ for(i = 0; i < parts; i++) {
+ integerPart l;
+
+ l = dst[i];
+ if (c) {
+ dst[i] -= rhs[i] + 1;
+ c = (dst[i] >= l);
+ } else {
+ dst[i] -= rhs[i];
+ c = (dst[i] > l);
+ }
+ }
+
+ return c;
+}
+
+/* Negate a bignum in-place. */
+void
+APInt::tcNegate(integerPart *dst, unsigned int parts)
+{
+ tcComplement(dst, parts);
+ tcIncrement(dst, parts);
+}
+
+/* DST += SRC * MULTIPLIER + CARRY if add is true
+ DST = SRC * MULTIPLIER + CARRY if add is false
+
+ Requires 0 <= DSTPARTS <= SRCPARTS + 1. If DST overlaps SRC
+ they must start at the same point, i.e. DST == SRC.
+
+ If DSTPARTS == SRCPARTS + 1 no overflow occurs and zero is
+ returned. Otherwise DST is filled with the least significant
+ DSTPARTS parts of the result, and if all of the omitted higher
+ parts were zero return zero, otherwise overflow occurred and
+ return one. */
+int
+APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
+ integerPart multiplier, integerPart carry,
+ unsigned int srcParts, unsigned int dstParts,
+ bool add)
+{
+ unsigned int i, n;
+
+  /* DST and SRC must not partially overlap; otherwise our writes of DST
+     would kill our later reads of SRC. */
+ assert(dst <= src || dst >= src + srcParts);
+ assert(dstParts <= srcParts + 1);
+
+ /* N loops; minimum of dstParts and srcParts. */
+ n = dstParts < srcParts ? dstParts: srcParts;
+
+ for(i = 0; i < n; i++) {
+ integerPart low, mid, high, srcPart;
+
+ /* [ LOW, HIGH ] = MULTIPLIER * SRC[i] + DST[i] + CARRY.
+
+ This cannot overflow, because
+
+ (n - 1) * (n - 1) + 2 (n - 1) = (n - 1) * (n + 1)
+
+ which is less than n^2. */
+
+ srcPart = src[i];
+
+ if (multiplier == 0 || srcPart == 0) {
+ low = carry;
+ high = 0;
+ } else {
+ low = lowHalf(srcPart) * lowHalf(multiplier);
+ high = highHalf(srcPart) * highHalf(multiplier);
+
+ mid = lowHalf(srcPart) * highHalf(multiplier);
+ high += highHalf(mid);
+ mid <<= integerPartWidth / 2;
+ if (low + mid < low)
+ high++;
+ low += mid;
+
+ mid = highHalf(srcPart) * lowHalf(multiplier);
+ high += highHalf(mid);
+ mid <<= integerPartWidth / 2;
+ if (low + mid < low)
+ high++;
+ low += mid;
+
+ /* Now add carry. */
+ if (low + carry < low)
+ high++;
+ low += carry;
+ }
+
+ if (add) {
+ /* And now DST[i], and store the new low part there. */
+ if (low + dst[i] < low)
+ high++;
+ dst[i] += low;
+ } else
+ dst[i] = low;
+
+ carry = high;
+ }
+
+ if (i < dstParts) {
+ /* Full multiplication, there is no overflow. */
+ assert(i + 1 == dstParts);
+ dst[i] = carry;
+ return 0;
+ } else {
+ /* We overflowed if there is carry. */
+ if (carry)
+ return 1;
+
+ /* We would overflow if any significant unwritten parts would be
+ non-zero. This is true if any remaining src parts are non-zero
+ and the multiplier is non-zero. */
+ if (multiplier)
+ for(; i < srcParts; i++)
+ if (src[i])
+ return 1;
+
+ /* We fitted in the narrow destination. */
+ return 0;
+ }
+}
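+
+/* Illustrative sketch, excluded from the build: the no-overflow bound from
+   the comment above, checked exhaustively with 8-bit "parts" standing in
+   for integerPart.  a*b + c + d never exceeds the double-width accumulator:
+   255*255 + 255 + 255 == 65535 exactly.  */
+#if 0
+static void tcMultiplyPartBoundCheck() {
+  for (unsigned a = 0; a != 256; a++)
+    for (unsigned b = 0; b != 256; b++)
+      assert(a * b + 255 + 255 <= 0xFFFF);
+}
+#endif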
+
+/* DST = LHS * RHS, where DST has the same width as the operands and
+ is filled with the least significant parts of the result. Returns
+ one if overflow occurred, otherwise zero. DST must be disjoint
+ from both operands. */
+int
+APInt::tcMultiply(integerPart *dst, const integerPart *lhs,
+ const integerPart *rhs, unsigned int parts)
+{
+ unsigned int i;
+ int overflow;
+
+ assert(dst != lhs && dst != rhs);
+
+ overflow = 0;
+ tcSet(dst, 0, parts);
+
+ for(i = 0; i < parts; i++)
+ overflow |= tcMultiplyPart(&dst[i], lhs, rhs[i], 0, parts,
+ parts - i, true);
+
+ return overflow;
+}
+
+/* DST = LHS * RHS, where DST has width the sum of the widths of the
+ operands. No overflow occurs. DST must be disjoint from both
+ operands. Returns the number of parts required to hold the
+ result. */
+unsigned int
+APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs,
+ const integerPart *rhs, unsigned int lhsParts,
+ unsigned int rhsParts)
+{
+  /* Put the narrower number on the LHS for fewer loops below. */
+ if (lhsParts > rhsParts) {
+ return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts);
+ } else {
+ unsigned int n;
+
+ assert(dst != lhs && dst != rhs);
+
+ tcSet(dst, 0, rhsParts);
+
+ for(n = 0; n < lhsParts; n++)
+ tcMultiplyPart(&dst[n], rhs, lhs[n], 0, rhsParts, rhsParts + 1, true);
+
+ n = lhsParts + rhsParts;
+
+ return n - (dst[n - 1] == 0);
+ }
+}
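+
+/* Illustrative sketch, excluded from the build: a 1-by-1 part full multiply
+   yields a 2-part product.  (2^W - 1)^2 = 2^2W - 2^(W+1) + 1, so the low
+   part is 1 and the high part is all ones except bit 0.  */
+#if 0
+static void tcFullMultiplyExample() {
+  integerPart lhs = ~(integerPart) 0;
+  integerPart rhs = ~(integerPart) 0;
+  integerPart dst[2];
+  unsigned n = APInt::tcFullMultiply(dst, &lhs, &rhs, 1, 1);
+  assert(n == 2 && dst[0] == 1 && dst[1] == ~(integerPart) 1);
+}
+#endif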
+
+/* If RHS is zero LHS and REMAINDER are left unchanged, return one.
+ Otherwise set LHS to LHS / RHS with the fractional part discarded,
+ set REMAINDER to the remainder, return zero. i.e.
+
+ OLD_LHS = RHS * LHS + REMAINDER
+
+ SCRATCH is a bignum of the same size as the operands and result for
+ use by the routine; its contents need not be initialized and are
+ destroyed. LHS, REMAINDER and SCRATCH must be distinct.
+*/
+int
+APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
+ integerPart *remainder, integerPart *srhs,
+ unsigned int parts)
+{
+ unsigned int n, shiftCount;
+ integerPart mask;
+
+ assert(lhs != remainder && lhs != srhs && remainder != srhs);
+
+ shiftCount = tcMSB(rhs, parts) + 1;
+ if (shiftCount == 0)
+ return true;
+
+ shiftCount = parts * integerPartWidth - shiftCount;
+ n = shiftCount / integerPartWidth;
+ mask = (integerPart) 1 << (shiftCount % integerPartWidth);
+
+ tcAssign(srhs, rhs, parts);
+ tcShiftLeft(srhs, parts, shiftCount);
+ tcAssign(remainder, lhs, parts);
+ tcSet(lhs, 0, parts);
+
+  /* Loop, subtracting SRHS if REMAINDER is at least as large, and
+     recording the corresponding bit of the quotient in LHS. */
+ for(;;) {
+ int compare;
+
+ compare = tcCompare(remainder, srhs, parts);
+ if (compare >= 0) {
+ tcSubtract(remainder, srhs, 0, parts);
+ lhs[n] |= mask;
+ }
+
+ if (shiftCount == 0)
+ break;
+ shiftCount--;
+ tcShiftRight(srhs, parts, 1);
+ if ((mask >>= 1) == 0)
+ mask = (integerPart) 1 << (integerPartWidth - 1), n--;
+ }
+
+ return false;
+}
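+
+/* Illustrative sketch, excluded from the build: single-part long division.
+   The quotient replaces LHS and the remainder lands in REMAINDER, so
+   100 == 7 * 14 + 2 holds afterwards.  */
+#if 0
+static void tcDivideExample() {
+  integerPart lhs = 100, rhs = 7, remainder, scratch;
+  int rc = APInt::tcDivide(&lhs, &rhs, &remainder, &scratch, 1);
+  assert(rc == 0 && lhs == 14 && remainder == 2);
+}
+#endif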
+
+/* Shift a bignum left COUNT bits in-place. Shifted in bits are zero.
+ There are no restrictions on COUNT. */
+void
+APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
+{
+ if (count) {
+ unsigned int jump, shift;
+
+    /* Jump is the inter-part jump; shift is the intra-part shift. */
+ jump = count / integerPartWidth;
+ shift = count % integerPartWidth;
+
+ while (parts > jump) {
+ integerPart part;
+
+ parts--;
+
+ /* dst[i] comes from the two parts src[i - jump] and, if we have
+ an intra-part shift, src[i - jump - 1]. */
+ part = dst[parts - jump];
+ if (shift) {
+ part <<= shift;
+ if (parts >= jump + 1)
+ part |= dst[parts - jump - 1] >> (integerPartWidth - shift);
+ }
+
+ dst[parts] = part;
+ }
+
+ while (parts > 0)
+ dst[--parts] = 0;
+ }
+}
+
+/* Shift a bignum right COUNT bits in-place. Shifted in bits are
+ zero. There are no restrictions on COUNT. */
+void
+APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
+{
+ if (count) {
+ unsigned int i, jump, shift;
+
+    /* Jump is the inter-part jump; shift is the intra-part shift. */
+ jump = count / integerPartWidth;
+ shift = count % integerPartWidth;
+
+ /* Perform the shift. This leaves the most significant COUNT bits
+ of the result at zero. */
+ for(i = 0; i < parts; i++) {
+ integerPart part;
+
+ if (i + jump >= parts) {
+ part = 0;
+ } else {
+ part = dst[i + jump];
+ if (shift) {
+ part >>= shift;
+ if (i + jump + 1 < parts)
+ part |= dst[i + jump + 1] << (integerPartWidth - shift);
+ }
+ }
+
+ dst[i] = part;
+ }
+ }
+}
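+
+/* Illustrative sketch, excluded from the build: a shift by exactly one
+   part width is a pure inter-part move (jump == 1, shift == 0), and the
+   two directions are inverses while no set bits fall off the end.  */
+#if 0
+static void tcShiftExample() {
+  integerPart v[2] = { 0xABCD, 0 };
+  APInt::tcShiftLeft(v, 2, integerPartWidth);
+  assert(v[0] == 0 && v[1] == 0xABCD);
+  APInt::tcShiftRight(v, 2, integerPartWidth);
+  assert(v[0] == 0xABCD && v[1] == 0);
+}
+#endif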
+
+/* Bitwise and of two bignums. */
+void
+APInt::tcAnd(integerPart *dst, const integerPart *rhs, unsigned int parts)
+{
+ unsigned int i;
+
+ for(i = 0; i < parts; i++)
+ dst[i] &= rhs[i];
+}
+
+/* Bitwise inclusive or of two bignums. */
+void
+APInt::tcOr(integerPart *dst, const integerPart *rhs, unsigned int parts)
+{
+ unsigned int i;
+
+ for(i = 0; i < parts; i++)
+ dst[i] |= rhs[i];
+}
+
+/* Bitwise exclusive or of two bignums. */
+void
+APInt::tcXor(integerPart *dst, const integerPart *rhs, unsigned int parts)
+{
+ unsigned int i;
+
+ for(i = 0; i < parts; i++)
+ dst[i] ^= rhs[i];
+}
+
+/* Complement a bignum in-place. */
+void
+APInt::tcComplement(integerPart *dst, unsigned int parts)
+{
+ unsigned int i;
+
+ for(i = 0; i < parts; i++)
+ dst[i] = ~dst[i];
+}
+
+/* Comparison (unsigned) of two bignums. */
+int
+APInt::tcCompare(const integerPart *lhs, const integerPart *rhs,
+ unsigned int parts)
+{
+ while (parts) {
+ parts--;
+ if (lhs[parts] == rhs[parts])
+ continue;
+
+ if (lhs[parts] > rhs[parts])
+ return 1;
+ else
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Increment a bignum in-place, return the carry flag. */
+integerPart
+APInt::tcIncrement(integerPart *dst, unsigned int parts)
+{
+ unsigned int i;
+
+ for(i = 0; i < parts; i++)
+ if (++dst[i] != 0)
+ break;
+
+ return i == parts;
+}
+
+/* Set the least significant BITS bits of a bignum, clear the
+ rest. */
+void
+APInt::tcSetLeastSignificantBits(integerPart *dst, unsigned int parts,
+ unsigned int bits)
+{
+ unsigned int i;
+
+ i = 0;
+ while (bits > integerPartWidth) {
+ dst[i++] = ~(integerPart) 0;
+ bits -= integerPartWidth;
+ }
+
+ if (bits)
+ dst[i++] = ~(integerPart) 0 >> (integerPartWidth - bits);
+
+ while (i < parts)
+ dst[i++] = 0;
+}
diff --git a/lib/Support/APSInt.cpp b/lib/Support/APSInt.cpp
new file mode 100644
index 0000000..73acafa
--- /dev/null
+++ b/lib/Support/APSInt.cpp
@@ -0,0 +1,23 @@
+//===-- llvm/ADT/APSInt.cpp - Arbitrary Precision Signed Int ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the APSInt class, which is a simple class that
+// represents an arbitrary-sized integer that knows its signedness.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/FoldingSet.h"
+
+using namespace llvm;
+
+void APSInt::Profile(FoldingSetNodeID& ID) const {
+ ID.AddInteger((unsigned) (IsUnsigned ? 1 : 0));
+ APInt::Profile(ID);
+}
diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp
new file mode 100644
index 0000000..db0d8f3
--- /dev/null
+++ b/lib/Support/Allocator.cpp
@@ -0,0 +1,141 @@
+//===--- Allocator.cpp - Simple memory allocation abstraction -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BumpPtrAllocator interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Recycler.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Streams.h"
+#include <ostream>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MemRegion class implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// MemRegion - This is one chunk of the BumpPtrAllocator.
+class MemRegion {
+ unsigned RegionSize;
+ MemRegion *Next;
+ char *NextPtr;
+public:
+ void Init(unsigned size, unsigned Alignment, MemRegion *next) {
+ RegionSize = size;
+ Next = next;
+ NextPtr = (char*)(this+1);
+
+ // Align NextPtr.
+ NextPtr = (char*)((intptr_t)(NextPtr+Alignment-1) &
+ ~(intptr_t)(Alignment-1));
+ }
+
+ const MemRegion *getNext() const { return Next; }
+ unsigned getNumBytesAllocated() const {
+ return NextPtr-(const char*)this;
+ }
+
+  /// Allocate - Allocate and return a pointer to at least the specified
+  /// number of bytes.
+  ///
+ void *Allocate(size_t AllocSize, size_t Alignment, MemRegion **RegPtr) {
+
+ char* Result = (char*) (((uintptr_t) (NextPtr+Alignment-1))
+ & ~((uintptr_t) Alignment-1));
+
+ // Speculate the new value of NextPtr.
+ char* NextPtrTmp = Result + AllocSize;
+
+ // If we are still within the current region, return Result.
+ if (unsigned (NextPtrTmp - (char*) this) <= RegionSize) {
+ NextPtr = NextPtrTmp;
+ return Result;
+ }
+
+ // Otherwise, we have to allocate a new chunk. Create one twice as big as
+ // this one.
+ MemRegion *NewRegion = (MemRegion *)malloc(RegionSize*2);
+ NewRegion->Init(RegionSize*2, Alignment, this);
+
+ // Update the current "first region" pointer to point to the new region.
+ *RegPtr = NewRegion;
+
+ // Try allocating from it now.
+ return NewRegion->Allocate(AllocSize, Alignment, RegPtr);
+ }
+
+ /// Deallocate - Recursively release all memory for this and its next regions
+ /// to the system.
+ void Deallocate() {
+ MemRegion *next = Next;
+ free(this);
+ if (next)
+ next->Deallocate();
+ }
+
+ /// DeallocateAllButLast - Recursively release all memory for this and its
+  /// next regions to the system, stopping at the last region in the list.
+ /// Returns the pointer to the last region.
+ MemRegion *DeallocateAllButLast() {
+ MemRegion *next = Next;
+ if (!next)
+ return this;
+ free(this);
+ return next->DeallocateAllButLast();
+ }
+};
+}
+
+//===----------------------------------------------------------------------===//
+// BumpPtrAllocator class implementation
+//===----------------------------------------------------------------------===//
+
+BumpPtrAllocator::BumpPtrAllocator() {
+ TheMemory = malloc(4096);
+ ((MemRegion*)TheMemory)->Init(4096, 1, 0);
+}
+
+BumpPtrAllocator::~BumpPtrAllocator() {
+ ((MemRegion*)TheMemory)->Deallocate();
+}
+
+void BumpPtrAllocator::Reset() {
+ MemRegion *MRP = (MemRegion*)TheMemory;
+ MRP = MRP->DeallocateAllButLast();
+ MRP->Init(4096, 1, 0);
+ TheMemory = MRP;
+}
+
+void *BumpPtrAllocator::Allocate(size_t Size, size_t Align) {
+ MemRegion *MRP = (MemRegion*)TheMemory;
+ void *Ptr = MRP->Allocate(Size, Align, &MRP);
+ TheMemory = MRP;
+ return Ptr;
+}
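+
+// Illustrative sketch, excluded from the build: typical bump-pointer usage.
+// Objects are carved out of the current region with no per-object
+// bookkeeping; nothing is freed individually, and Reset() reclaims
+// everything except the initial 4096-byte region in one shot.
+#if 0
+static void bumpPtrExample() {
+  BumpPtrAllocator Alloc;
+  for (unsigned i = 0; i != 1000; ++i) {
+    int *P = (int*)Alloc.Allocate(sizeof(int), 8 /* alignment */);
+    *P = (int)i;                  // storage stays valid until Reset()
+  }
+  Alloc.Reset();                  // all 1000 ints released at once
+}
+#endif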
+
+void BumpPtrAllocator::PrintStats() const {
+ unsigned BytesUsed = 0;
+ unsigned NumRegions = 0;
+ const MemRegion *R = (MemRegion*)TheMemory;
+ for (; R; R = R->getNext(), ++NumRegions)
+ BytesUsed += R->getNumBytesAllocated();
+
+ cerr << "\nNumber of memory regions: " << NumRegions << "\n";
+ cerr << "Bytes allocated: " << BytesUsed << "\n";
+}
+
+void llvm::PrintRecyclerStats(size_t Size,
+ size_t Align,
+ size_t FreeListSize) {
+ cerr << "Recycler element size: " << Size << '\n';
+ cerr << "Recycler element alignment: " << Align << '\n';
+ cerr << "Number of elements free for recycling: " << FreeListSize << '\n';
+}
diff --git a/lib/Support/Annotation.cpp b/lib/Support/Annotation.cpp
new file mode 100644
index 0000000..9764b5e
--- /dev/null
+++ b/lib/Support/Annotation.cpp
@@ -0,0 +1,115 @@
+//===-- Annotation.cpp - Implement the Annotation Classes -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AnnotationManager class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Annotation.h"
+#include "llvm/Support/ManagedStatic.h"
+#include <map>
+#include <cstring>
+using namespace llvm;
+
+Annotation::~Annotation() {} // Designed to be subclassed
+
+Annotable::~Annotable() { // Virtual because it's designed to be subclassed...
+ Annotation *A = AnnotationList;
+ while (A) {
+ Annotation *Next = A->getNext();
+ delete A;
+ A = Next;
+ }
+}
+
+namespace {
+ class StrCmp {
+ public:
+ bool operator()(const char *a, const char *b) const {
+ return strcmp(a, b) < 0;
+ }
+ };
+}
+
+typedef std::map<const char*, unsigned, StrCmp> IDMapType;
+static unsigned IDCounter = 0; // Unique ID counter
+
+// Static member to ensure initialization on demand.
+static ManagedStatic<IDMapType> IDMap;
+
+// On demand annotation creation support...
+typedef Annotation *(*AnnFactory)(AnnotationID, const Annotable *, void *);
+typedef std::map<unsigned, std::pair<AnnFactory,void*> > FactMapType;
+
+static FactMapType *TheFactMap = 0;
+static FactMapType &getFactMap() {
+ if (TheFactMap == 0)
+ TheFactMap = new FactMapType();
+ return *TheFactMap;
+}
+
+static void eraseFromFactMap(unsigned ID) {
+ assert(TheFactMap && "No entries found!");
+ TheFactMap->erase(ID);
+ if (TheFactMap->empty()) { // Delete when empty
+ delete TheFactMap;
+ TheFactMap = 0;
+ }
+}
+
+AnnotationID AnnotationManager::getID(const char *Name) { // Name -> ID
+ IDMapType::iterator I = IDMap->find(Name);
+ if (I == IDMap->end()) {
+ (*IDMap)[Name] = IDCounter++; // Add a new element
+ return AnnotationID(IDCounter-1);
+ }
+ return AnnotationID(I->second);
+}
+
+// getID - Name -> ID + registration of a factory function for demand driven
+// annotation support.
+AnnotationID AnnotationManager::getID(const char *Name, Factory Fact,
+ void *Data) {
+ AnnotationID Result(getID(Name));
+ registerAnnotationFactory(Result, Fact, Data);
+ return Result;
+}
+
+// getName - This function is especially slow, but that's okay because it should
+// only be used for debugging.
+//
+const char *AnnotationManager::getName(AnnotationID ID) { // ID -> Name
+ IDMapType &TheMap = *IDMap;
+ for (IDMapType::iterator I = TheMap.begin(); ; ++I) {
+ assert(I != TheMap.end() && "Annotation ID is unknown!");
+ if (I->second == ID.ID) return I->first;
+ }
+}
+
+// registerAnnotationFactory - This method is used to register a callback
+// function used to create an annotation on demand if it is needed by the
+// Annotable::findOrCreateAnnotation method.
+//
+void AnnotationManager::registerAnnotationFactory(AnnotationID ID, AnnFactory F,
+ void *ExtraData) {
+ if (F)
+ getFactMap()[ID.ID] = std::make_pair(F, ExtraData);
+ else
+ eraseFromFactMap(ID.ID);
+}
+
+// createAnnotation - Create an annotation of the specified ID for the
+// specified object, using a register annotation creation function.
+//
+Annotation *AnnotationManager::createAnnotation(AnnotationID ID,
+ const Annotable *Obj) {
+ FactMapType::iterator I = getFactMap().find(ID.ID);
+ if (I == getFactMap().end()) return 0;
+ return I->second.first(ID, Obj, I->second.second);
+}
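+
+// Illustrative sketch, excluded from the build: wiring up a hypothetical
+// annotation class (MyAnnotation and "my-ann" are invented for this
+// example) so instances can be created on demand by ID.
+#if 0
+namespace {
+  struct MyAnnotation : public Annotation {
+    explicit MyAnnotation(AnnotationID ID) : Annotation(ID) {}
+  };
+
+  Annotation *makeMyAnnotation(AnnotationID ID, const Annotable *, void *) {
+    return new MyAnnotation(ID);
+  }
+}
+
+static void annotationExample() {
+  // getID interns the name and registers the factory in one call.
+  AnnotationID ID = AnnotationManager::getID("my-ann", makeMyAnnotation, 0);
+  assert(strcmp(AnnotationManager::getName(ID), "my-ann") == 0);
+}
+#endif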
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
new file mode 100644
index 0000000..7c8ce70
--- /dev/null
+++ b/lib/Support/CMakeLists.txt
@@ -0,0 +1,31 @@
+add_llvm_library(LLVMSupport
+ APFloat.cpp
+ APInt.cpp
+ APSInt.cpp
+ Allocator.cpp
+ Annotation.cpp
+ CommandLine.cpp
+ ConstantRange.cpp
+ Debug.cpp
+ Dwarf.cpp
+ FileUtilities.cpp
+ FoldingSet.cpp
+ GraphWriter.cpp
+ IsInf.cpp
+ IsNAN.cpp
+ ManagedStatic.cpp
+ MemoryBuffer.cpp
+ PluginLoader.cpp
+ PrettyStackTrace.cpp
+ SlowOperationInformer.cpp
+ SmallPtrSet.cpp
+ Statistic.cpp
+ Streams.cpp
+ StringExtras.cpp
+ StringMap.cpp
+ StringPool.cpp
+ SystemUtils.cpp
+ Timer.cpp
+ Triple.cpp
+ raw_ostream.cpp
+ )
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
new file mode 100644
index 0000000..4922560
--- /dev/null
+++ b/lib/Support/CommandLine.cpp
@@ -0,0 +1,1184 @@
+//===-- CommandLine.cpp - Command line parser implementation --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements a command line argument processor that is useful when
+// creating a tool. It provides a simple, minimalistic interface that is easily
+// extensible and supports nonlocal (library) command line options.
+//
+// Note that rather than trying to figure out what this code does, you could try
+// reading the library documentation located in docs/CommandLine.html
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/System/Path.h"
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <ostream>
+#include <set>
+#include <cstdlib>
+#include <cerrno>
+#include <cstring>
+#include <climits>
+using namespace llvm;
+using namespace cl;
+
+//===----------------------------------------------------------------------===//
+// Template instantiations and anchors.
+//
+TEMPLATE_INSTANTIATION(class basic_parser<bool>);
+TEMPLATE_INSTANTIATION(class basic_parser<boolOrDefault>);
+TEMPLATE_INSTANTIATION(class basic_parser<int>);
+TEMPLATE_INSTANTIATION(class basic_parser<unsigned>);
+TEMPLATE_INSTANTIATION(class basic_parser<double>);
+TEMPLATE_INSTANTIATION(class basic_parser<float>);
+TEMPLATE_INSTANTIATION(class basic_parser<std::string>);
+TEMPLATE_INSTANTIATION(class basic_parser<char>);
+
+TEMPLATE_INSTANTIATION(class opt<unsigned>);
+TEMPLATE_INSTANTIATION(class opt<int>);
+TEMPLATE_INSTANTIATION(class opt<std::string>);
+TEMPLATE_INSTANTIATION(class opt<char>);
+TEMPLATE_INSTANTIATION(class opt<bool>);
+
+void Option::anchor() {}
+void basic_parser_impl::anchor() {}
+void parser<bool>::anchor() {}
+void parser<boolOrDefault>::anchor() {}
+void parser<int>::anchor() {}
+void parser<unsigned>::anchor() {}
+void parser<double>::anchor() {}
+void parser<float>::anchor() {}
+void parser<std::string>::anchor() {}
+void parser<char>::anchor() {}
+
+//===----------------------------------------------------------------------===//
+
+// Globals for name and overview of program. Program name is not a string to
+// avoid static ctor/dtor issues.
+static char ProgramName[80] = "<premain>";
+static const char *ProgramOverview = 0;
+
+// This collects additional help to be printed.
+static ManagedStatic<std::vector<const char*> > MoreHelp;
+
+extrahelp::extrahelp(const char *Help)
+ : morehelp(Help) {
+ MoreHelp->push_back(Help);
+}
+
+static bool OptionListChanged = false;
+
+// MarkOptionsChanged - Internal helper function.
+void cl::MarkOptionsChanged() {
+ OptionListChanged = true;
+}
+
+/// RegisteredOptionList - This is the list of the command line options that
+/// have statically constructed themselves.
+static Option *RegisteredOptionList = 0;
+
+void Option::addArgument() {
+ assert(NextRegistered == 0 && "argument multiply registered!");
+
+ NextRegistered = RegisteredOptionList;
+ RegisteredOptionList = this;
+ MarkOptionsChanged();
+}
+
+
+//===----------------------------------------------------------------------===//
+// Basic, shared command line option processing machinery.
+//
+
+/// GetOptionInfo - Scan the list of registered options, turning them into data
+/// structures that are easier to handle.
+static void GetOptionInfo(std::vector<Option*> &PositionalOpts,
+ std::vector<Option*> &SinkOpts,
+ std::map<std::string, Option*> &OptionsMap) {
+ std::vector<const char*> OptionNames;
+ Option *CAOpt = 0; // The ConsumeAfter option if it exists.
+ for (Option *O = RegisteredOptionList; O; O = O->getNextRegisteredOption()) {
+ // If this option wants to handle multiple option names, get the full set.
+ // This handles enum options like "-O1 -O2" etc.
+ O->getExtraOptionNames(OptionNames);
+ if (O->ArgStr[0])
+ OptionNames.push_back(O->ArgStr);
+
+ // Handle named options.
+ for (size_t i = 0, e = OptionNames.size(); i != e; ++i) {
+ // Add argument to the argument map!
+ if (!OptionsMap.insert(std::pair<std::string,Option*>(OptionNames[i],
+ O)).second) {
+ cerr << ProgramName << ": CommandLine Error: Argument '"
+ << OptionNames[i] << "' defined more than once!\n";
+ }
+ }
+
+ OptionNames.clear();
+
+ // Remember information about positional options.
+ if (O->getFormattingFlag() == cl::Positional)
+ PositionalOpts.push_back(O);
+ else if (O->getMiscFlags() & cl::Sink) // Remember sink options
+ SinkOpts.push_back(O);
+ else if (O->getNumOccurrencesFlag() == cl::ConsumeAfter) {
+ if (CAOpt)
+ O->error("Cannot specify more than one option with cl::ConsumeAfter!");
+ CAOpt = O;
+ }
+ }
+
+ if (CAOpt)
+ PositionalOpts.push_back(CAOpt);
+
+  // Make sure that they are in order of registration, not backwards.
+ std::reverse(PositionalOpts.begin(), PositionalOpts.end());
+}
+
+
+/// LookupOption - Look up the option specified by the argument string on the
+/// command line. If there is a value specified (after an equal sign) return
+/// that as well.
+static Option *LookupOption(const char *&Arg, const char *&Value,
+ std::map<std::string, Option*> &OptionsMap) {
+ while (*Arg == '-') ++Arg; // Eat leading dashes
+
+ const char *ArgEnd = Arg;
+ while (*ArgEnd && *ArgEnd != '=')
+ ++ArgEnd; // Scan till end of argument name.
+
+ if (*ArgEnd == '=') // If we have an equals sign...
+ Value = ArgEnd+1; // Get the value, not the equals
+
+
+ if (*Arg == 0) return 0;
+
+ // Look up the option.
+ std::map<std::string, Option*>::iterator I =
+ OptionsMap.find(std::string(Arg, ArgEnd));
+ return I != OptionsMap.end() ? I->second : 0;
+}
+
+static inline bool ProvideOption(Option *Handler, const char *ArgName,
+ const char *Value, int argc, char **argv,
+ int &i) {
+ // Is this a multi-argument option?
+ unsigned NumAdditionalVals = Handler->getNumAdditionalVals();
+
+ // Enforce value requirements
+ switch (Handler->getValueExpectedFlag()) {
+ case ValueRequired:
+ if (Value == 0) { // No value specified?
+ if (i+1 < argc) { // Steal the next argument, like for '-o filename'
+ Value = argv[++i];
+ } else {
+ return Handler->error(" requires a value!");
+ }
+ }
+ break;
+ case ValueDisallowed:
+ if (NumAdditionalVals > 0)
+ return Handler->error(": multi-valued option specified"
+ " with ValueDisallowed modifier!");
+
+ if (Value)
+ return Handler->error(" does not allow a value! '" +
+ std::string(Value) + "' specified.");
+ break;
+ case ValueOptional:
+ break;
+ default:
+ cerr << ProgramName
+ << ": Bad ValueMask flag! CommandLine usage error:"
+ << Handler->getValueExpectedFlag() << "\n";
+ abort();
+ break;
+ }
+
+ // If this isn't a multi-arg option, just run the handler.
+ if (NumAdditionalVals == 0) {
+ return Handler->addOccurrence(i, ArgName, Value ? Value : "");
+ }
+  // If it is, run the handler several times.
+ else {
+ bool MultiArg = false;
+
+ if (Value) {
+ if (Handler->addOccurrence(i, ArgName, Value, MultiArg))
+ return true;
+ --NumAdditionalVals;
+ MultiArg = true;
+ }
+
+ while (NumAdditionalVals > 0) {
+
+ if (i+1 < argc) {
+ Value = argv[++i];
+ } else {
+ return Handler->error(": not enough values!");
+ }
+ if (Handler->addOccurrence(i, ArgName, Value, MultiArg))
+ return true;
+ MultiArg = true;
+ --NumAdditionalVals;
+ }
+ return false;
+ }
+}
+
+static bool ProvidePositionalOption(Option *Handler, const std::string &Arg,
+ int i) {
+ int Dummy = i;
+ return ProvideOption(Handler, Handler->ArgStr, Arg.c_str(), 0, 0, Dummy);
+}
+
+
+// Option predicates...
+static inline bool isGrouping(const Option *O) {
+ return O->getFormattingFlag() == cl::Grouping;
+}
+static inline bool isPrefixedOrGrouping(const Option *O) {
+ return isGrouping(O) || O->getFormattingFlag() == cl::Prefix;
+}
+
+// getOptionPred - Check to see if there are any options that satisfy the
+// specified predicate with names that are the prefixes in Name. This is
+// checked by progressively stripping characters off of the name, checking to
+// see if there are options that satisfy the predicate. If we find one, return
+// it, otherwise return null.
+//
+static Option *getOptionPred(std::string Name, size_t &Length,
+ bool (*Pred)(const Option*),
+ std::map<std::string, Option*> &OptionsMap) {
+
+ std::map<std::string, Option*>::iterator OMI = OptionsMap.find(Name);
+ if (OMI != OptionsMap.end() && Pred(OMI->second)) {
+ Length = Name.length();
+ return OMI->second;
+ }
+
+ if (Name.size() == 1) return 0;
+ do {
+ Name.erase(Name.end()-1, Name.end()); // Chop off the last character...
+ OMI = OptionsMap.find(Name);
+
+ // Loop while we haven't found an option and Name still has at least two
+ // characters in it (so that the next iteration will not be the empty
+    // string).
+ } while ((OMI == OptionsMap.end() || !Pred(OMI->second)) && Name.size() > 1);
+
+ if (OMI != OptionsMap.end() && Pred(OMI->second)) {
+ Length = Name.length();
+ return OMI->second; // Found one!
+ }
+ return 0; // No option found!
+}
+
+static bool RequiresValue(const Option *O) {
+ return O->getNumOccurrencesFlag() == cl::Required ||
+ O->getNumOccurrencesFlag() == cl::OneOrMore;
+}
+
+static bool EatsUnboundedNumberOfValues(const Option *O) {
+ return O->getNumOccurrencesFlag() == cl::ZeroOrMore ||
+ O->getNumOccurrencesFlag() == cl::OneOrMore;
+}
+
+/// ParseCStringVector - Break INPUT up wherever one or more
+/// whitespace characters are found, and store the resulting tokens in
+/// OUTPUT. The tokens stored in OUTPUT are dynamically allocated
+/// using strdup (), so it is the caller's responsibility to free ()
+/// them later.
+///
+static void ParseCStringVector(std::vector<char *> &output,
+ const char *input) {
+ // Characters which will be treated as token separators:
+ static const char *const delims = " \v\f\t\r\n";
+
+ std::string work (input);
+ // Skip past any delims at head of input string.
+ size_t pos = work.find_first_not_of (delims);
+ // If the string consists entirely of delims, then exit early.
+ if (pos == std::string::npos) return;
+ // Otherwise, jump forward to beginning of first word.
+ work = work.substr (pos);
+ // Find position of first delimiter.
+ pos = work.find_first_of (delims);
+
+ while (!work.empty() && pos != std::string::npos) {
+ // Everything from 0 to POS is the next word to copy.
+ output.push_back (strdup (work.substr (0,pos).c_str ()));
+ // Is there another word in the string?
+ size_t nextpos = work.find_first_not_of (delims, pos + 1);
+ if (nextpos != std::string::npos) {
+ // Yes? Then remove delims from beginning ...
+ work = work.substr (work.find_first_not_of (delims, pos + 1));
+ // and find the end of the word.
+ pos = work.find_first_of (delims);
+ } else {
+ // No? (Remainder of string is delims.) End the loop.
+ work = "";
+ pos = std::string::npos;
+ }
+ }
+
+ // If `input' ended with non-delim char, then we'll get here with
+ // the last word of `input' in `work'; copy it now.
+ if (!work.empty ()) {
+ output.push_back (strdup (work.c_str ()));
+ }
+}
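+
+/// Illustrative sketch, excluded from the build: the tokenizer splits on
+/// any run of whitespace, and every token is strdup()ed, so the caller
+/// must free each one.
+#if 0
+static void parseCStringVectorExample() {
+  std::vector<char *> toks;
+  ParseCStringVector(toks, "  -O2\t-o out.bc  ");
+  assert(toks.size() == 3 && strcmp(toks[1], "-o") == 0);
+  for (size_t i = 0, e = toks.size(); i != e; ++i)
+    free(toks[i]);
+}
+#endif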
+
+/// ParseEnvironmentOptions - An alternative entry point to the
+/// CommandLine library, which allows you to read the program's name
+/// from the caller (as PROGNAME) and its command-line arguments from
+/// an environment variable (whose name is given in ENVVAR).
+///
+void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
+ const char *Overview, bool ReadResponseFiles) {
+ // Check args.
+ assert(progName && "Program name not specified");
+ assert(envVar && "Environment variable name missing");
+
+ // Get the environment variable they want us to parse options out of.
+ const char *envValue = getenv(envVar);
+ if (!envValue)
+ return;
+
+ // Get program's "name", which we wouldn't know without the caller
+ // telling us.
+ std::vector<char*> newArgv;
+ newArgv.push_back(strdup(progName));
+
+ // Parse the value of the environment variable into a "command line"
+ // and hand it off to ParseCommandLineOptions().
+ ParseCStringVector(newArgv, envValue);
+ int newArgc = static_cast<int>(newArgv.size());
+ ParseCommandLineOptions(newArgc, &newArgv[0], Overview, ReadResponseFiles);
+
+ // Free all the strdup()ed strings.
+ for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
+ i != e; ++i)
+ free (*i);
+}
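+
+// Illustrative sketch, excluded from the build: a tool that takes its
+// options from a hypothetical MYTOOL_OPTIONS environment variable, e.g.
+// MYTOOL_OPTIONS="-stats -time-passes".
+#if 0
+static void environmentOptionsExample() {
+  cl::ParseEnvironmentOptions("mytool", "MYTOOL_OPTIONS",
+                              "my tool overview\n");
+}
+#endif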
+
+
+/// ExpandResponseFiles - Copy the contents of argv into newArgv,
+/// substituting the contents of the response files for the arguments
+/// of type @file.
+static void ExpandResponseFiles(int argc, char** argv,
+ std::vector<char*>& newArgv) {
+ for (int i = 1; i != argc; ++i) {
+ char* arg = argv[i];
+
+ if (arg[0] == '@') {
+
+ sys::PathWithStatus respFile(++arg);
+
+ // Check that the response file is not empty (mmap'ing empty
+ // files can be problematic).
+ const sys::FileStatus *FileStat = respFile.getFileStatus();
+ if (FileStat && FileStat->getSize() != 0) {
+
+ // Mmap the response file into memory.
+ OwningPtr<MemoryBuffer>
+ respFilePtr(MemoryBuffer::getFile(respFile.c_str()));
+
+ // If we could open the file, parse its contents, otherwise
+ // pass the @file option verbatim.
+
+ // TODO: we should also support recursive loading of response files,
+ // since this is how gcc behaves. (From their man page: "The file may
+ // itself contain additional @file options; any such options will be
+ // processed recursively.")
+
+ if (respFilePtr != 0) {
+ ParseCStringVector(newArgv, respFilePtr->getBufferStart());
+ continue;
+ }
+ }
+ }
+ newArgv.push_back(strdup(arg));
+ }
+}
+
+void cl::ParseCommandLineOptions(int argc, char **argv,
+ const char *Overview, bool ReadResponseFiles) {
+ // Process all registered options.
+ std::vector<Option*> PositionalOpts;
+ std::vector<Option*> SinkOpts;
+ std::map<std::string, Option*> Opts;
+ GetOptionInfo(PositionalOpts, SinkOpts, Opts);
+
+ assert((!Opts.empty() || !PositionalOpts.empty()) &&
+ "No options specified!");
+
+ // Expand response files.
+ std::vector<char*> newArgv;
+ if (ReadResponseFiles) {
+ newArgv.push_back(strdup(argv[0]));
+ ExpandResponseFiles(argc, argv, newArgv);
+ argv = &newArgv[0];
+ argc = static_cast<int>(newArgv.size());
+ }
+
+ // Copy the program name into ProgName, making sure not to overflow it.
+ std::string ProgName = sys::Path(argv[0]).getLast();
+ if (ProgName.size() > 79) ProgName.resize(79);
+ strcpy(ProgramName, ProgName.c_str());
+
+ ProgramOverview = Overview;
+ bool ErrorParsing = false;
+
+ // Check out the positional arguments to collect information about them.
+ unsigned NumPositionalRequired = 0;
+
+ // Determine whether or not there are an unlimited number of positionals
+ bool HasUnlimitedPositionals = false;
+
+ Option *ConsumeAfterOpt = 0;
+ if (!PositionalOpts.empty()) {
+ if (PositionalOpts[0]->getNumOccurrencesFlag() == cl::ConsumeAfter) {
+ assert(PositionalOpts.size() > 1 &&
+ "Cannot specify cl::ConsumeAfter without a positional argument!");
+ ConsumeAfterOpt = PositionalOpts[0];
+ }
+
+ // Calculate how many positional values are _required_.
+ bool UnboundedFound = false;
+ for (size_t i = ConsumeAfterOpt != 0, e = PositionalOpts.size();
+ i != e; ++i) {
+ Option *Opt = PositionalOpts[i];
+ if (RequiresValue(Opt))
+ ++NumPositionalRequired;
+ else if (ConsumeAfterOpt) {
+ // ConsumeAfter cannot be combined with "optional" positional options
+ // unless there is only one positional argument...
+ if (PositionalOpts.size() > 2)
+ ErrorParsing |=
+ Opt->error(" error - this positional option will never be matched, "
+ "because it does not Require a value, and a "
+ "cl::ConsumeAfter option is active!");
+ } else if (UnboundedFound && !Opt->ArgStr[0]) {
+ // This option does not "require" a value... Make sure this option is
+ // not specified after an option that eats all extra arguments, or this
+ // one will never get any!
+ //
+ ErrorParsing |= Opt->error(" error - option can never match, because "
+ "another positional argument will match an "
+ "unbounded number of values, and this option"
+ " does not require a value!");
+ }
+ UnboundedFound |= EatsUnboundedNumberOfValues(Opt);
+ }
+ HasUnlimitedPositionals = UnboundedFound || ConsumeAfterOpt;
+ }
+
+ // PositionalVals - A vector of "positional" arguments we accumulate into
+ // the process at the end...
+ //
+ std::vector<std::pair<std::string,unsigned> > PositionalVals;
+
+  // If the program has named positional arguments, and such a name has been
+  // encountered, keep track of which positional argument was named. Otherwise
+  // put the positional args into the PositionalVals list...
+ Option *ActivePositionalArg = 0;
+
+ // Loop over all of the arguments... processing them.
+ bool DashDashFound = false; // Have we read '--'?
+ for (int i = 1; i < argc; ++i) {
+ Option *Handler = 0;
+ const char *Value = 0;
+ const char *ArgName = "";
+
+ // If the option list changed, this means that some command line
+ // option has just been registered or deregistered. This can occur in
+ // response to things like -load, etc. If this happens, rescan the options.
+ if (OptionListChanged) {
+ PositionalOpts.clear();
+ SinkOpts.clear();
+ Opts.clear();
+ GetOptionInfo(PositionalOpts, SinkOpts, Opts);
+ OptionListChanged = false;
+ }
+
+ // Check to see if this is a positional argument. This argument is
+ // considered to be positional if it doesn't start with '-', if it is "-"
+ // itself, or if we have seen "--" already.
+ //
+ if (argv[i][0] != '-' || argv[i][1] == 0 || DashDashFound) {
+ // Positional argument!
+ if (ActivePositionalArg) {
+ ProvidePositionalOption(ActivePositionalArg, argv[i], i);
+ continue; // We are done!
+ } else if (!PositionalOpts.empty()) {
+ PositionalVals.push_back(std::make_pair(argv[i],i));
+
+        // All of the positional arguments have been fulfilled, give the rest to
+ // the consume after option... if it's specified...
+ //
+ if (PositionalVals.size() >= NumPositionalRequired &&
+ ConsumeAfterOpt != 0) {
+ for (++i; i < argc; ++i)
+ PositionalVals.push_back(std::make_pair(argv[i],i));
+ break; // Handle outside of the argument processing loop...
+ }
+
+ // Delay processing positional arguments until the end...
+ continue;
+ }
+ } else if (argv[i][0] == '-' && argv[i][1] == '-' && argv[i][2] == 0 &&
+ !DashDashFound) {
+ DashDashFound = true; // This is the mythical "--"?
+ continue; // Don't try to process it as an argument itself.
+ } else if (ActivePositionalArg &&
+ (ActivePositionalArg->getMiscFlags() & PositionalEatsArgs)) {
+ // If there is a positional argument eating options, check to see if this
+ // option is another positional argument. If so, treat it as an argument,
+ // otherwise feed it to the eating positional.
+ ArgName = argv[i]+1;
+ Handler = LookupOption(ArgName, Value, Opts);
+ if (!Handler || Handler->getFormattingFlag() != cl::Positional) {
+ ProvidePositionalOption(ActivePositionalArg, argv[i], i);
+ continue; // We are done!
+ }
+
+ } else { // We start with a '-', must be an argument...
+ ArgName = argv[i]+1;
+ Handler = LookupOption(ArgName, Value, Opts);
+
+ // Check to see if this "option" is really a prefixed or grouped argument.
+ if (Handler == 0) {
+ std::string RealName(ArgName);
+ if (RealName.size() > 1) {
+ size_t Length = 0;
+ Option *PGOpt = getOptionPred(RealName, Length, isPrefixedOrGrouping,
+ Opts);
+
+ // If the option is a prefixed option, then the value is simply the
+ // rest of the name... so fall through to later processing, by
+ // setting up the argument name flags and value fields.
+ //
+ if (PGOpt && PGOpt->getFormattingFlag() == cl::Prefix) {
+ Value = ArgName+Length;
+ assert(Opts.find(std::string(ArgName, Value)) != Opts.end() &&
+ Opts.find(std::string(ArgName, Value))->second == PGOpt);
+ Handler = PGOpt;
+ } else if (PGOpt) {
+ // This must be a grouped option... handle them now.
+ assert(isGrouping(PGOpt) && "Broken getOptionPred!");
+
+ do {
+ // Move current arg name out of RealName into RealArgName...
+ std::string RealArgName(RealName.begin(),
+ RealName.begin() + Length);
+ RealName.erase(RealName.begin(), RealName.begin() + Length);
+
+ // Because ValueRequired is an invalid flag for grouped arguments,
+ // we don't need to pass argc/argv in...
+ //
+ assert(PGOpt->getValueExpectedFlag() != cl::ValueRequired &&
+ "Option can not be cl::Grouping AND cl::ValueRequired!");
+ int Dummy;
+ ErrorParsing |= ProvideOption(PGOpt, RealArgName.c_str(),
+ 0, 0, 0, Dummy);
+
+ // Get the next grouping option...
+ PGOpt = getOptionPred(RealName, Length, isGrouping, Opts);
+ } while (PGOpt && Length != RealName.size());
+
+ Handler = PGOpt; // Ate all of the options.
+ }
+ }
+ }
+ }
+
+ if (Handler == 0) {
+ if (SinkOpts.empty()) {
+ cerr << ProgramName << ": Unknown command line argument '"
+ << argv[i] << "'. Try: '" << argv[0] << " --help'\n";
+ ErrorParsing = true;
+ } else {
+ for (std::vector<Option*>::iterator I = SinkOpts.begin(),
+ E = SinkOpts.end(); I != E ; ++I)
+ (*I)->addOccurrence(i, "", argv[i]);
+ }
+ continue;
+ }
+
+ // Check to see if this option accepts a comma separated list of values. If
+ // it does, we have to split up the value into multiple values...
+ if (Value && Handler->getMiscFlags() & CommaSeparated) {
+ std::string Val(Value);
+ std::string::size_type Pos = Val.find(',');
+
+ while (Pos != std::string::npos) {
+ // Process the portion before the comma...
+ ErrorParsing |= ProvideOption(Handler, ArgName,
+ std::string(Val.begin(),
+ Val.begin()+Pos).c_str(),
+ argc, argv, i);
+ // Erase the portion before the comma, AND the comma...
+ Val.erase(Val.begin(), Val.begin()+Pos+1);
+ Value += Pos+1; // Increment the original value pointer as well...
+
+ // Check for another comma...
+ Pos = Val.find(',');
+ }
+ }
+
+ // If this is a named positional argument, just remember that it is the
+ // active one...
+ if (Handler->getFormattingFlag() == cl::Positional)
+ ActivePositionalArg = Handler;
+ else
+ ErrorParsing |= ProvideOption(Handler, ArgName, Value, argc, argv, i);
+ }
+
+ // Check and handle positional arguments now...
+ if (NumPositionalRequired > PositionalVals.size()) {
+ cerr << ProgramName
+ << ": Not enough positional command line arguments specified!\n"
+ << "Must specify at least " << NumPositionalRequired
+ << " positional arguments: See: " << argv[0] << " --help\n";
+
+ ErrorParsing = true;
+ } else if (!HasUnlimitedPositionals
+ && PositionalVals.size() > PositionalOpts.size()) {
+ cerr << ProgramName
+ << ": Too many positional arguments specified!\n"
+ << "Can specify at most " << PositionalOpts.size()
+ << " positional arguments: See: " << argv[0] << " --help\n";
+ ErrorParsing = true;
+
+ } else if (ConsumeAfterOpt == 0) {
+ // Positional args have already been handled if ConsumeAfter is specified...
+ unsigned ValNo = 0, NumVals = static_cast<unsigned>(PositionalVals.size());
+ for (size_t i = 0, e = PositionalOpts.size(); i != e; ++i) {
+ if (RequiresValue(PositionalOpts[i])) {
+ ProvidePositionalOption(PositionalOpts[i], PositionalVals[ValNo].first,
+ PositionalVals[ValNo].second);
+ ValNo++;
+ --NumPositionalRequired; // We fulfilled our duty...
+ }
+
+ // If we _can_ give this option more arguments, do so now, as long as we
+ // do not give it values that others need. 'Done' controls whether the
+ // option even _WANTS_ any more.
+ //
+ bool Done = PositionalOpts[i]->getNumOccurrencesFlag() == cl::Required;
+ while (NumVals-ValNo > NumPositionalRequired && !Done) {
+ switch (PositionalOpts[i]->getNumOccurrencesFlag()) {
+ case cl::Optional:
+ Done = true; // Optional arguments want _at most_ one value
+ // FALL THROUGH
+ case cl::ZeroOrMore: // Zero or more will take all they can get...
+ case cl::OneOrMore: // One or more will take all they can get...
+ ProvidePositionalOption(PositionalOpts[i],
+ PositionalVals[ValNo].first,
+ PositionalVals[ValNo].second);
+ ValNo++;
+ break;
+ default:
+ assert(0 && "Internal error, unexpected NumOccurrences flag in "
+ "positional argument processing!");
+ }
+ }
+ }
+ } else {
+ assert(ConsumeAfterOpt && NumPositionalRequired <= PositionalVals.size());
+ unsigned ValNo = 0;
+ for (size_t j = 1, e = PositionalOpts.size(); j != e; ++j)
+ if (RequiresValue(PositionalOpts[j])) {
+ ErrorParsing |= ProvidePositionalOption(PositionalOpts[j],
+ PositionalVals[ValNo].first,
+ PositionalVals[ValNo].second);
+ ValNo++;
+ }
+
+ // Handle the case where there is just one positional option, and it's
+ // optional. In this case, we want to give JUST THE FIRST option to the
+ // positional option and keep the rest for the consume after. The above
+ // loop would have assigned no values to positional options in this case.
+ //
+ if (PositionalOpts.size() == 2 && ValNo == 0 && !PositionalVals.empty()) {
+ ErrorParsing |= ProvidePositionalOption(PositionalOpts[1],
+ PositionalVals[ValNo].first,
+ PositionalVals[ValNo].second);
+ ValNo++;
+ }
+
+    // Hand all of the rest of the arguments over to the
+    // cl::ConsumeAfter command line option...
+ for (; ValNo != PositionalVals.size(); ++ValNo)
+ ErrorParsing |= ProvidePositionalOption(ConsumeAfterOpt,
+ PositionalVals[ValNo].first,
+ PositionalVals[ValNo].second);
+ }
+
+ // Loop over args and make sure all required args are specified!
+ for (std::map<std::string, Option*>::iterator I = Opts.begin(),
+ E = Opts.end(); I != E; ++I) {
+ switch (I->second->getNumOccurrencesFlag()) {
+ case Required:
+ case OneOrMore:
+ if (I->second->getNumOccurrences() == 0) {
+ I->second->error(" must be specified at least once!");
+ ErrorParsing = true;
+ }
+ // Fall through
+ default:
+ break;
+ }
+ }
+
+ // Free all of the memory allocated to the map. Command line options may only
+ // be processed once!
+ Opts.clear();
+ PositionalOpts.clear();
+ MoreHelp->clear();
+
+ // Free the memory allocated by ExpandResponseFiles.
+ if (ReadResponseFiles) {
+ // Free all the strdup()ed strings.
+ for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
+ i != e; ++i)
+ free (*i);
+ }
+
+ // If we had an error processing our arguments, don't let the program execute
+ if (ErrorParsing) exit(1);
+}
+
+//===----------------------------------------------------------------------===//
+// Option Base class implementation
+//
+
+bool Option::error(std::string Message, const char *ArgName) {
+ if (ArgName == 0) ArgName = ArgStr;
+ if (ArgName[0] == 0)
+ cerr << HelpStr; // Be nice for positional arguments
+ else
+ cerr << ProgramName << ": for the -" << ArgName;
+
+ cerr << " option: " << Message << "\n";
+ return true;
+}
+
+bool Option::addOccurrence(unsigned pos, const char *ArgName,
+ const std::string &Value,
+ bool MultiArg) {
+ if (!MultiArg)
+ NumOccurrences++; // Increment the number of times we have been seen
+
+ switch (getNumOccurrencesFlag()) {
+ case Optional:
+ if (NumOccurrences > 1)
+ return error(": may only occur zero or one times!", ArgName);
+ break;
+ case Required:
+ if (NumOccurrences > 1)
+ return error(": must occur exactly one time!", ArgName);
+ // Fall through
+ case OneOrMore:
+ case ZeroOrMore:
+ case ConsumeAfter: break;
+ default: return error(": bad num occurrences flag value!");
+ }
+
+ return handleOccurrence(pos, ArgName, Value);
+}
+
+
+// getValueStr - Get the value description string, using "DefaultMsg" if nothing
+// has been specified yet.
+//
+static const char *getValueStr(const Option &O, const char *DefaultMsg) {
+ if (O.ValueStr[0] == 0) return DefaultMsg;
+ return O.ValueStr;
+}
+
+//===----------------------------------------------------------------------===//
+// cl::alias class implementation
+//
+
+// Return the width of the option tag for printing...
+size_t alias::getOptionWidth() const {
+ return std::strlen(ArgStr)+6;
+}
+
+// Print out the option for the alias.
+void alias::printOptionInfo(size_t GlobalWidth) const {
+ size_t L = std::strlen(ArgStr);
+ cout << " -" << ArgStr << std::string(GlobalWidth-L-6, ' ') << " - "
+ << HelpStr << "\n";
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Parser Implementation code...
+//
+
+// basic_parser implementation
+//
+
+// Return the width of the option tag for printing...
+size_t basic_parser_impl::getOptionWidth(const Option &O) const {
+ size_t Len = std::strlen(O.ArgStr);
+ if (const char *ValName = getValueName())
+ Len += std::strlen(getValueStr(O, ValName))+3;
+
+ return Len + 6;
+}
+
+// printOptionInfo - Print out information about this option. The
+// to-be-maintained width is specified.
+//
+void basic_parser_impl::printOptionInfo(const Option &O,
+ size_t GlobalWidth) const {
+ cout << " -" << O.ArgStr;
+
+ if (const char *ValName = getValueName())
+ cout << "=<" << getValueStr(O, ValName) << ">";
+
+ cout << std::string(GlobalWidth-getOptionWidth(O), ' ') << " - "
+ << O.HelpStr << "\n";
+}
+
+
+
+
+// parser<bool> implementation
+//
+bool parser<bool>::parse(Option &O, const char *ArgName,
+ const std::string &Arg, bool &Value) {
+ if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
+ Arg == "1") {
+ Value = true;
+ } else if (Arg == "false" || Arg == "FALSE" || Arg == "False" || Arg == "0") {
+ Value = false;
+ } else {
+ return O.error(": '" + Arg +
+ "' is invalid value for boolean argument! Try 0 or 1");
+ }
+ return false;
+}
+
+// parser<boolOrDefault> implementation
+//
+bool parser<boolOrDefault>::parse(Option &O, const char *ArgName,
+ const std::string &Arg, boolOrDefault &Value) {
+ if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
+ Arg == "1") {
+ Value = BOU_TRUE;
+ } else if (Arg == "false" || Arg == "FALSE"
+ || Arg == "False" || Arg == "0") {
+ Value = BOU_FALSE;
+ } else {
+ return O.error(": '" + Arg +
+ "' is invalid value for boolean argument! Try 0 or 1");
+ }
+ return false;
+}
+
+// parser<int> implementation
+//
+bool parser<int>::parse(Option &O, const char *ArgName,
+ const std::string &Arg, int &Value) {
+ char *End;
+ Value = (int)strtol(Arg.c_str(), &End, 0);
+ if (*End != 0)
+ return O.error(": '" + Arg + "' value invalid for integer argument!");
+ return false;
+}
+
+// parser<unsigned> implementation
+//
+bool parser<unsigned>::parse(Option &O, const char *ArgName,
+ const std::string &Arg, unsigned &Value) {
+ char *End;
+ errno = 0;
+ unsigned long V = strtoul(Arg.c_str(), &End, 0);
+ Value = (unsigned)V;
+ if (((V == ULONG_MAX) && (errno == ERANGE))
+ || (*End != 0)
+ || (Value != V))
+ return O.error(": '" + Arg + "' value invalid for uint argument!");
+ return false;
+}
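+
+// Illustrative sketch, excluded from the build: the strtoul() idiom above
+// in isolation.  strtoul saturates to ULONG_MAX and sets errno to ERANGE
+// on overflow, *End != 0 catches trailing junk, and the round-trip
+// comparison catches values that fit unsigned long but not unsigned.
+#if 0
+static bool parseUnsignedStrict(const char *S, unsigned &Out) {
+  char *End;
+  errno = 0;
+  unsigned long V = strtoul(S, &End, 0);
+  Out = (unsigned)V;
+  if ((V == ULONG_MAX && errno == ERANGE) || *End != 0 || Out != V)
+    return false;
+  return true;
+}
+#endif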
+
+// parser<double>/parser<float> implementation
+//
+static bool parseDouble(Option &O, const std::string &Arg, double &Value) {
+ const char *ArgStart = Arg.c_str();
+ char *End;
+ Value = strtod(ArgStart, &End);
+ if (*End != 0)
+ return O.error(": '" +Arg+ "' value invalid for floating point argument!");
+ return false;
+}
+
+bool parser<double>::parse(Option &O, const char *AN,
+ const std::string &Arg, double &Val) {
+ return parseDouble(O, Arg, Val);
+}
+
+bool parser<float>::parse(Option &O, const char *AN,
+ const std::string &Arg, float &Val) {
+ double dVal;
+ if (parseDouble(O, Arg, dVal))
+ return true;
+ Val = (float)dVal;
+ return false;
+}
+
+
+
+// generic_parser_base implementation
+//
+
+// findOption - Return the option number corresponding to the specified
+// argument string. If the option is not found, getNumOptions() is returned.
+//
+unsigned generic_parser_base::findOption(const char *Name) {
+ unsigned i = 0, e = getNumOptions();
+ std::string N(Name);
+
+ while (i != e)
+ if (getOption(i) == N)
+ return i;
+ else
+ ++i;
+ return e;
+}
+
+
+// Return the width of the option tag for printing...
+size_t generic_parser_base::getOptionWidth(const Option &O) const {
+ if (O.hasArgStr()) {
+ size_t Size = std::strlen(O.ArgStr)+6;
+ for (unsigned i = 0, e = getNumOptions(); i != e; ++i)
+ Size = std::max(Size, std::strlen(getOption(i))+8);
+ return Size;
+ } else {
+ size_t BaseSize = 0;
+ for (unsigned i = 0, e = getNumOptions(); i != e; ++i)
+ BaseSize = std::max(BaseSize, std::strlen(getOption(i))+8);
+ return BaseSize;
+ }
+}
+
+// printOptionInfo - Print out information about this option. The
+// to-be-maintained width is specified.
+//
+void generic_parser_base::printOptionInfo(const Option &O,
+ size_t GlobalWidth) const {
+ if (O.hasArgStr()) {
+ size_t L = std::strlen(O.ArgStr);
+ cout << " -" << O.ArgStr << std::string(GlobalWidth-L-6, ' ')
+ << " - " << O.HelpStr << "\n";
+
+ for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
+ size_t NumSpaces = GlobalWidth-strlen(getOption(i))-8;
+ cout << " =" << getOption(i) << std::string(NumSpaces, ' ')
+ << " - " << getDescription(i) << "\n";
+ }
+ } else {
+ if (O.HelpStr[0])
+ cout << " " << O.HelpStr << "\n";
+ for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
+ size_t L = std::strlen(getOption(i));
+ cout << " -" << getOption(i) << std::string(GlobalWidth-L-8, ' ')
+ << " - " << getDescription(i) << "\n";
+ }
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// --help and --help-hidden option implementation
+//
+
+namespace {
+
+class HelpPrinter {
+ size_t MaxArgLen;
+ const Option *EmptyArg;
+ const bool ShowHidden;
+
+ // isHidden/isReallyHidden - Predicates to be used to filter down arg lists.
+ inline static bool isHidden(std::pair<std::string, Option *> &OptPair) {
+ return OptPair.second->getOptionHiddenFlag() >= Hidden;
+ }
+ inline static bool isReallyHidden(std::pair<std::string, Option *> &OptPair) {
+ return OptPair.second->getOptionHiddenFlag() == ReallyHidden;
+ }
+
+public:
+ explicit HelpPrinter(bool showHidden) : ShowHidden(showHidden) {
+ EmptyArg = 0;
+ }
+
+ void operator=(bool Value) {
+ if (Value == false) return;
+
+ // Get all the options.
+ std::vector<Option*> PositionalOpts;
+ std::vector<Option*> SinkOpts;
+ std::map<std::string, Option*> OptMap;
+ GetOptionInfo(PositionalOpts, SinkOpts, OptMap);
+
+ // Copy Options into a vector so we can sort them as we like...
+ std::vector<std::pair<std::string, Option*> > Opts;
+ copy(OptMap.begin(), OptMap.end(), std::back_inserter(Opts));
+
+ // Eliminate Hidden or ReallyHidden arguments, depending on ShowHidden
+ Opts.erase(std::remove_if(Opts.begin(), Opts.end(),
+ std::ptr_fun(ShowHidden ? isReallyHidden : isHidden)),
+ Opts.end());
+
+ // Eliminate duplicate entries in table (from enum flags options, f.e.)
+ { // Give OptionSet a scope
+ std::set<Option*> OptionSet;
+ for (unsigned i = 0; i != Opts.size(); ++i)
+ if (OptionSet.count(Opts[i].second) == 0)
+ OptionSet.insert(Opts[i].second); // Add new entry to set
+ else
+ Opts.erase(Opts.begin()+i--); // Erase duplicate
+ }
+
+ if (ProgramOverview)
+ cout << "OVERVIEW: " << ProgramOverview << "\n";
+
+ cout << "USAGE: " << ProgramName << " [options]";
+
+ // Print out the positional options.
+ Option *CAOpt = 0; // The cl::ConsumeAfter option, if it exists...
+ if (!PositionalOpts.empty() &&
+ PositionalOpts[0]->getNumOccurrencesFlag() == ConsumeAfter)
+ CAOpt = PositionalOpts[0];
+
+ for (size_t i = CAOpt != 0, e = PositionalOpts.size(); i != e; ++i) {
+ if (PositionalOpts[i]->ArgStr[0])
+ cout << " --" << PositionalOpts[i]->ArgStr;
+ cout << " " << PositionalOpts[i]->HelpStr;
+ }
+
+ // Print the consume after option info if it exists...
+ if (CAOpt) cout << " " << CAOpt->HelpStr;
+
+ cout << "\n\n";
+
+ // Compute the maximum argument length...
+ MaxArgLen = 0;
+ for (size_t i = 0, e = Opts.size(); i != e; ++i)
+ MaxArgLen = std::max(MaxArgLen, Opts[i].second->getOptionWidth());
+
+ cout << "OPTIONS:\n";
+ for (size_t i = 0, e = Opts.size(); i != e; ++i)
+ Opts[i].second->printOptionInfo(MaxArgLen);
+
+ // Print any extra help the user has declared.
+ for (std::vector<const char *>::iterator I = MoreHelp->begin(),
+ E = MoreHelp->end(); I != E; ++I)
+ cout << *I;
+ MoreHelp->clear();
+
+ // Halt the program since help information was printed
+ exit(1);
+ }
+};
+} // End anonymous namespace
+
+// Define the two HelpPrinter instances that are used to print out help, or
+// help-hidden...
+//
+static HelpPrinter NormalPrinter(false);
+static HelpPrinter HiddenPrinter(true);
+
+static cl::opt<HelpPrinter, true, parser<bool> >
+HOp("help", cl::desc("Display available options (--help-hidden for more)"),
+ cl::location(NormalPrinter), cl::ValueDisallowed);
+
+static cl::opt<HelpPrinter, true, parser<bool> >
+HHOp("help-hidden", cl::desc("Display all available options"),
+ cl::location(HiddenPrinter), cl::Hidden, cl::ValueDisallowed);
+
+static void (*OverrideVersionPrinter)() = 0;
+
+namespace {
+class VersionPrinter {
+public:
+ void print() {
+ cout << "Low Level Virtual Machine (http://llvm.org/):\n";
+ cout << " " << PACKAGE_NAME << " version " << PACKAGE_VERSION;
+#ifdef LLVM_VERSION_INFO
+ cout << LLVM_VERSION_INFO;
+#endif
+ cout << "\n ";
+#ifndef __OPTIMIZE__
+ cout << "DEBUG build";
+#else
+ cout << "Optimized build";
+#endif
+#ifndef NDEBUG
+ cout << " with assertions";
+#endif
+ cout << ".\n";
+    cout << " Built " << __DATE__ << " (" << __TIME__ << ").\n";
+ }
+ void operator=(bool OptionWasSpecified) {
+ if (OptionWasSpecified) {
+ if (OverrideVersionPrinter == 0) {
+ print();
+ exit(1);
+ } else {
+ (*OverrideVersionPrinter)();
+ exit(1);
+ }
+ }
+ }
+};
+} // End anonymous namespace
+
+
+// Define the --version option that prints out the LLVM version for the tool
+static VersionPrinter VersionPrinterInstance;
+
+static cl::opt<VersionPrinter, true, parser<bool> >
+VersOp("version", cl::desc("Display the version of this program"),
+ cl::location(VersionPrinterInstance), cl::ValueDisallowed);
+
+// Utility function for printing the help message.
+void cl::PrintHelpMessage() {
+ // This looks weird, but it actually prints the help message. The
+ // NormalPrinter variable is a HelpPrinter and the help gets printed when
+ // its operator= is invoked. That's because the "normal" usages of the
+  // its operator= is invoked. That's because the "normal" usage of the
+ // --help option was given or not. Since we're circumventing that we have
+ // to make it look like --help was given, so we assign true.
+ NormalPrinter = true;
+}
+
+/// Utility function for printing version number.
+void cl::PrintVersionMessage() {
+ VersionPrinterInstance.print();
+}
+
+void cl::SetVersionPrinter(void (*func)()) {
+ OverrideVersionPrinter = func;
+}
diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp
new file mode 100644
index 0000000..cb8c4b0
--- /dev/null
+++ b/lib/Support/ConstantRange.cpp
@@ -0,0 +1,472 @@
+//===-- ConstantRange.cpp - ConstantRange implementation ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Represent a range of possible values that may occur when the program is run
+// for an integral value. This keeps track of a lower and upper bound for the
+// constant, which MAY wrap around the end of the numeric range. To do this, it
+// keeps track of a [lower, upper) bound, which specifies an interval just like
+// STL iterators. When used with boolean values, the following are important
+// ranges (other integral ranges use min/max values for special range values):
+//
+// [F, F) = {} = Empty set
+// [T, F) = {T}
+// [F, T) = {F}
+// [T, T) = {F, T} = Full set
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// Initialize a full (the default) or empty set for the specified type.
+///
+ConstantRange::ConstantRange(uint32_t BitWidth, bool Full) :
+ Lower(BitWidth, 0), Upper(BitWidth, 0) {
+ if (Full)
+ Lower = Upper = APInt::getMaxValue(BitWidth);
+ else
+ Lower = Upper = APInt::getMinValue(BitWidth);
+}
+
+/// Initialize a range to hold the single specified value.
+///
+ConstantRange::ConstantRange(const APInt &V) : Lower(V), Upper(V + 1) { }
+
+ConstantRange::ConstantRange(const APInt &L, const APInt &U) :
+ Lower(L), Upper(U) {
+ assert(L.getBitWidth() == U.getBitWidth() &&
+ "ConstantRange with unequal bit widths");
+ assert((L != U || (L.isMaxValue() || L.isMinValue())) &&
+ "Lower == Upper, but they aren't min or max value!");
+}
+
+/// isFullSet - Return true if this set contains all of the elements possible
+/// for this data-type
+bool ConstantRange::isFullSet() const {
+ return Lower == Upper && Lower.isMaxValue();
+}
+
+/// isEmptySet - Return true if this set contains no members.
+///
+bool ConstantRange::isEmptySet() const {
+ return Lower == Upper && Lower.isMinValue();
+}
+
+/// isWrappedSet - Return true if this set wraps around the top of the range,
+/// for example: [100, 8)
+///
+bool ConstantRange::isWrappedSet() const {
+ return Lower.ugt(Upper);
+}
+
+/// getSetSize - Return the number of elements in this set.
+///
+APInt ConstantRange::getSetSize() const {
+ if (isEmptySet())
+ return APInt(getBitWidth(), 0);
+ if (getBitWidth() == 1) {
+ if (Lower != Upper) // One of T or F in the set...
+ return APInt(2, 1);
+ return APInt(2, 2); // Must be full set...
+ }
+
+ // Simply subtract the bounds...
+ return Upper - Lower;
+}
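+
+// Worked example (illustrative): for the wrapped 8-bit range [100, 8) the
+// subtraction below is modulo 2^8, so the size is 8 - 100 = 164: the 156
+// values 100..255 plus the 8 values 0..7.
+//
+//   ConstantRange CR(APInt(8, 100), APInt(8, 8)); // wrapped range [100, 8)
+//   CR.getSetSize();                              // APInt(8, 164)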
+
+/// getUnsignedMax - Return the largest unsigned value contained in the
+/// ConstantRange.
+///
+APInt ConstantRange::getUnsignedMax() const {
+ if (isFullSet() || isWrappedSet())
+ return APInt::getMaxValue(getBitWidth());
+ else
+ return getUpper() - 1;
+}
+
+/// getUnsignedMin - Return the smallest unsigned value contained in the
+/// ConstantRange.
+///
+APInt ConstantRange::getUnsignedMin() const {
+ if (isFullSet() || (isWrappedSet() && getUpper() != 0))
+ return APInt::getMinValue(getBitWidth());
+ else
+ return getLower();
+}
+
+/// getSignedMax - Return the largest signed value contained in the
+/// ConstantRange.
+///
+APInt ConstantRange::getSignedMax() const {
+ APInt SignedMax(APInt::getSignedMaxValue(getBitWidth()));
+ if (!isWrappedSet()) {
+ if (getLower().sle(getUpper() - 1))
+ return getUpper() - 1;
+ else
+ return SignedMax;
+ } else {
+ if ((getUpper() - 1).slt(getLower())) {
+ if (getLower() != SignedMax)
+ return SignedMax;
+ else
+ return getUpper() - 1;
+ } else {
+ return getUpper() - 1;
+ }
+ }
+}
+
+/// getSignedMin - Return the smallest signed value contained in the
+/// ConstantRange.
+///
+APInt ConstantRange::getSignedMin() const {
+ APInt SignedMin(APInt::getSignedMinValue(getBitWidth()));
+ if (!isWrappedSet()) {
+ if (getLower().sle(getUpper() - 1))
+ return getLower();
+ else
+ return SignedMin;
+ } else {
+ if ((getUpper() - 1).slt(getLower())) {
+ if (getUpper() != SignedMin)
+ return SignedMin;
+ else
+ return getLower();
+ } else {
+ return getLower();
+ }
+ }
+}
+
+/// contains - Return true if the specified value is in the set.
+///
+bool ConstantRange::contains(const APInt &V) const {
+ if (Lower == Upper)
+ return isFullSet();
+
+ if (!isWrappedSet())
+ return Lower.ule(V) && V.ult(Upper);
+ else
+ return Lower.ule(V) || V.ult(Upper);
+}
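+
+// Worked example (illustrative): for the wrapped 8-bit range [100, 8) the
+// disjunction above tests membership against either arm of the wrap.
+//
+//   ConstantRange CR(APInt(8, 100), APInt(8, 8));
+//   CR.contains(APInt(8, 250)); // true:  100 <= 250
+//   CR.contains(APInt(8, 5));   // true:  5 < 8
+//   CR.contains(APInt(8, 50));  // false: neither bound test holds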
+
+/// subtract - Subtract the specified constant from the endpoints of this
+/// constant range.
+ConstantRange ConstantRange::subtract(const APInt &Val) const {
+ assert(Val.getBitWidth() == getBitWidth() && "Wrong bit width");
+ // If the set is empty or full, don't modify the endpoints.
+ if (Lower == Upper)
+ return *this;
+ return ConstantRange(Lower - Val, Upper - Val);
+}
+
+
+// intersect1Wrapped - This helper function is used to intersect two ranges when
+// it is known that LHS is wrapped and RHS isn't.
+//
+ConstantRange
+ConstantRange::intersect1Wrapped(const ConstantRange &LHS,
+ const ConstantRange &RHS) {
+ assert(LHS.isWrappedSet() && !RHS.isWrappedSet());
+
+ // Check to see if we overlap on the Left side of RHS...
+ //
+ if (RHS.Lower.ult(LHS.Upper)) {
+ // We do overlap on the left side of RHS, see if we overlap on the right of
+ // RHS...
+ if (RHS.Upper.ugt(LHS.Lower)) {
+ // Ok, the result overlaps on both the left and right sides. See if the
+ // resultant interval will be smaller if we wrap or not...
+ //
+ if (LHS.getSetSize().ult(RHS.getSetSize()))
+ return LHS;
+ else
+ return RHS;
+
+ } else {
+ // No overlap on the right, just on the left.
+ return ConstantRange(RHS.Lower, LHS.Upper);
+ }
+ } else {
+ // We don't overlap on the left side of RHS, see if we overlap on the right
+ // of RHS...
+ if (RHS.Upper.ugt(LHS.Lower)) {
+ // Simple overlap...
+ return ConstantRange(LHS.Lower, RHS.Upper);
+ } else {
+ // No overlap...
+ return ConstantRange(LHS.getBitWidth(), false);
+ }
+ }
+}
+
+/// intersectWith - Return the range that results from the intersection of this
+/// range with another range.
+///
+ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
+ assert(getBitWidth() == CR.getBitWidth() &&
+ "ConstantRange types don't agree!");
+ // Handle common special cases
+ if (isEmptySet() || CR.isFullSet())
+ return *this;
+ if (isFullSet() || CR.isEmptySet())
+ return CR;
+
+ if (!isWrappedSet()) {
+ if (!CR.isWrappedSet()) {
+ using namespace APIntOps;
+ APInt L = umax(Lower, CR.Lower);
+ APInt U = umin(Upper, CR.Upper);
+
+ if (L.ult(U)) // If range isn't empty...
+ return ConstantRange(L, U);
+ else
+ return ConstantRange(getBitWidth(), false);// Otherwise, empty set
+ } else
+ return intersect1Wrapped(CR, *this);
+ } else { // We know "this" is wrapped...
+ if (!CR.isWrappedSet())
+ return intersect1Wrapped(*this, CR);
+ else {
+ // Both ranges are wrapped...
+ using namespace APIntOps;
+ APInt L = umax(Lower, CR.Lower);
+ APInt U = umin(Upper, CR.Upper);
+ return ConstantRange(L, U);
+ }
+ }
+ return *this;
+}
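+
+// Worked example (illustrative): intersecting two unwrapped 8-bit ranges
+// takes the max of the lower bounds and the min of the upper bounds, per
+// the unwrapped/unwrapped arm above.
+//
+//   ConstantRange A(APInt(8, 4), APInt(8, 10));
+//   ConstantRange B(APInt(8, 7), APInt(8, 13));
+//   A.intersectWith(B); // [7, 10)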
+
+/// maximalIntersectWith - Return the range that results from the intersection
+/// of this range with another range. The resultant range is guaranteed to
+/// include all elements contained in both input ranges, and to have the
+/// smallest possible set size that does so. Because there may be two
+/// intersections with the same set size, A.maximalIntersectWith(B) might not
+/// be equal to B.maximalIntersectWith(A).
+ConstantRange ConstantRange::maximalIntersectWith(const ConstantRange &CR) const {
+ assert(getBitWidth() == CR.getBitWidth() &&
+ "ConstantRange types don't agree!");
+
+ // Handle common cases.
+ if ( isEmptySet() || CR.isFullSet()) return *this;
+ if (CR.isEmptySet() || isFullSet()) return CR;
+
+ if (!isWrappedSet() && CR.isWrappedSet())
+ return CR.maximalIntersectWith(*this);
+
+ if (!isWrappedSet() && !CR.isWrappedSet()) {
+ if (Lower.ult(CR.Lower)) {
+ if (Upper.ule(CR.Lower))
+ return ConstantRange(getBitWidth(), false);
+
+ if (Upper.ult(CR.Upper))
+ return ConstantRange(CR.Lower, Upper);
+
+ return CR;
+ } else {
+ if (Upper.ult(CR.Upper))
+ return *this;
+
+ if (Lower.ult(CR.Upper))
+ return ConstantRange(Lower, CR.Upper);
+
+ return ConstantRange(getBitWidth(), false);
+ }
+ }
+
+ if (isWrappedSet() && !CR.isWrappedSet()) {
+ if (CR.Lower.ult(Upper)) {
+ if (CR.Upper.ult(Upper))
+ return CR;
+
+ if (CR.Upper.ult(Lower))
+ return ConstantRange(CR.Lower, Upper);
+
+ if (getSetSize().ult(CR.getSetSize()))
+ return *this;
+ else
+ return CR;
+ } else if (CR.Lower.ult(Lower)) {
+ if (CR.Upper.ule(Lower))
+ return ConstantRange(getBitWidth(), false);
+
+ return ConstantRange(Lower, CR.Upper);
+ }
+ return CR;
+ }
+
+ if (CR.Upper.ult(Upper)) {
+ if (CR.Lower.ult(Upper)) {
+ if (getSetSize().ult(CR.getSetSize()))
+ return *this;
+ else
+ return CR;
+ }
+
+ if (CR.Lower.ult(Lower))
+ return ConstantRange(Lower, CR.Upper);
+
+ return CR;
+ } else if (CR.Upper.ult(Lower)) {
+ if (CR.Lower.ult(Lower))
+ return *this;
+
+ return ConstantRange(CR.Lower, Upper);
+ }
+ if (getSetSize().ult(CR.getSetSize()))
+ return *this;
+ else
+ return CR;
+}
+
+
+/// unionWith - Return the range that results from the union of this range with
+/// another range. The resultant range is guaranteed to include the elements of
+/// both sets, but may contain more. For example, [3, 9) union [12,15) is
+/// [3, 15), which includes 9, 10, and 11, which were not included in either
+/// set before.
+///
+ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
+ assert(getBitWidth() == CR.getBitWidth() &&
+ "ConstantRange types don't agree!");
+
+ if ( isFullSet() || CR.isEmptySet()) return *this;
+ if (CR.isFullSet() || isEmptySet()) return CR;
+
+ if (!isWrappedSet() && CR.isWrappedSet()) return CR.unionWith(*this);
+
+ APInt L = Lower, U = Upper;
+
+ if (!isWrappedSet() && !CR.isWrappedSet()) {
+ if (CR.Lower.ult(L))
+ L = CR.Lower;
+
+ if (CR.Upper.ugt(U))
+ U = CR.Upper;
+ }
+
+ if (isWrappedSet() && !CR.isWrappedSet()) {
+ if ((CR.Lower.ult(Upper) && CR.Upper.ult(Upper)) ||
+ (CR.Lower.ugt(Lower) && CR.Upper.ugt(Lower))) {
+ return *this;
+ }
+
+ if (CR.Lower.ule(Upper) && Lower.ule(CR.Upper)) {
+ return ConstantRange(getBitWidth());
+ }
+
+ if (CR.Lower.ule(Upper) && CR.Upper.ule(Lower)) {
+ APInt d1 = CR.Upper - Upper, d2 = Lower - CR.Upper;
+ if (d1.ult(d2)) {
+ U = CR.Upper;
+ } else {
+ L = CR.Upper;
+ }
+ }
+
+ if (Upper.ult(CR.Lower) && CR.Upper.ult(Lower)) {
+ APInt d1 = CR.Lower - Upper, d2 = Lower - CR.Upper;
+ if (d1.ult(d2)) {
+ U = CR.Lower + 1;
+ } else {
+ L = CR.Upper - 1;
+ }
+ }
+
+ if (Upper.ult(CR.Lower) && Lower.ult(CR.Upper)) {
+ APInt d1 = CR.Lower - Upper, d2 = Lower - CR.Lower;
+
+ if (d1.ult(d2)) {
+ U = CR.Lower + 1;
+ } else {
+ L = CR.Lower;
+ }
+ }
+ }
+
+ if (isWrappedSet() && CR.isWrappedSet()) {
+ if (Lower.ult(CR.Upper) || CR.Lower.ult(Upper))
+ return ConstantRange(getBitWidth());
+
+ if (CR.Upper.ugt(U)) {
+ U = CR.Upper;
+ }
+
+ if (CR.Lower.ult(L)) {
+ L = CR.Lower;
+ }
+
+ if (L == U) return ConstantRange(getBitWidth());
+ }
+
+ return ConstantRange(L, U);
+}
+
+/// zeroExtend - Return a new range in the specified integer type, which must
+/// be strictly larger than the current type. The returned range will
+/// correspond to the possible range of values as if the source range had been
+/// zero extended.
+ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const {
+ unsigned SrcTySize = getBitWidth();
+ assert(SrcTySize < DstTySize && "Not a value extension");
+ if (isFullSet())
+    // Change a source full set into [0, 1 << SrcTySize)
+ return ConstantRange(APInt(DstTySize,0), APInt(DstTySize,1).shl(SrcTySize));
+
+ APInt L = Lower; L.zext(DstTySize);
+ APInt U = Upper; U.zext(DstTySize);
+ return ConstantRange(L, U);
+}
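+
+// Worked example (illustrative): zero extending i8 ranges to i16.
+//
+//   ConstantRange(APInt(8, 200), APInt(8, 250)).zeroExtend(16); // [200, 250)
+//   ConstantRange(8).zeroExtend(16); // full i8 set becomes [0, 256)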
+
+/// signExtend - Return a new range in the specified integer type, which must
+/// be strictly larger than the current type. The returned range will
+/// correspond to the possible range of values as if the source range had been
+/// sign extended.
+ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
+ unsigned SrcTySize = getBitWidth();
+ assert(SrcTySize < DstTySize && "Not a value extension");
+ if (isFullSet()) {
+    return ConstantRange(APInt::getHighBitsSet(DstTySize,DstTySize-SrcTySize+1),
+                         APInt::getLowBitsSet(DstTySize, SrcTySize-1) + 1);
+ }
+
+ APInt L = Lower; L.sext(DstTySize);
+ APInt U = Upper; U.sext(DstTySize);
+ return ConstantRange(L, U);
+}
+
+/// truncate - Return a new range in the specified integer type, which must be
+/// strictly smaller than the current type. The returned range will
+/// correspond to the possible range of values as if the source range had been
+/// truncated to the specified type.
+ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
+ unsigned SrcTySize = getBitWidth();
+ assert(SrcTySize > DstTySize && "Not a value truncation");
+ APInt Size(APInt::getLowBitsSet(SrcTySize, DstTySize));
+ if (isFullSet() || getSetSize().ugt(Size))
+ return ConstantRange(DstTySize);
+
+ APInt L = Lower; L.trunc(DstTySize);
+ APInt U = Upper; U.trunc(DstTySize);
+ return ConstantRange(L, U);
+}
+
+/// print - Print out the bounds to a stream...
+///
+void ConstantRange::print(raw_ostream &OS) const {
+ OS << "[" << Lower << "," << Upper << ")";
+}
+
+/// dump - Allow printing from a debugger easily...
+///
+void ConstantRange::dump() const {
+ print(errs());
+}
diff --git a/lib/Support/Debug.cpp b/lib/Support/Debug.cpp
new file mode 100644
index 0000000..a09cddf
--- /dev/null
+++ b/lib/Support/Debug.cpp
@@ -0,0 +1,77 @@
+//===-- Debug.cpp - An easy way to add debug output to your code ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a handy way of adding debugging information to your
+// code, without it being enabled all of the time, and without having to add
+// command line options to enable it.
+//
+// In particular, just wrap your code with the DEBUG() macro, and it will be
+// enabled automatically if you specify '-debug' on the command-line.
+// Alternatively, you can also use the SET_DEBUG_TYPE("foo") macro to specify
+// that your debug code belongs to class "foo". Then, on the command line, you
+// can specify '-debug-only=foo' to enable JUST the debug information for the
+// foo class.
+//
+// When compiling in release mode, the -debug-* options and all code in DEBUG()
+// statements disappear, so they do not affect the runtime of the code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+bool llvm::DebugFlag; // DebugFlag - Exported boolean set by the -debug option
+
+namespace {
+#ifndef NDEBUG
+ // -debug - Command line option to enable the DEBUG statements in the passes.
+ // This flag may only be enabled in debug builds.
+ static cl::opt<bool, true>
+ Debug("debug", cl::desc("Enable debug output"), cl::Hidden,
+ cl::location(DebugFlag));
+
+ static std::string CurrentDebugType;
+ static struct DebugOnlyOpt {
+ void operator=(const std::string &Val) const {
+ DebugFlag |= !Val.empty();
+ CurrentDebugType = Val;
+ }
+ } DebugOnlyOptLoc;
+
+ static cl::opt<DebugOnlyOpt, true, cl::parser<std::string> >
+ DebugOnly("debug-only", cl::desc("Enable a specific type of debug output"),
+ cl::Hidden, cl::value_desc("debug string"),
+ cl::location(DebugOnlyOptLoc), cl::ValueRequired);
+#endif
+}
+
+// isCurrentDebugType - Return true if the specified string is the debug type
+// specified on the command line, or if none was specified on the command line
+// with the -debug-only=X option.
+//
+bool llvm::isCurrentDebugType(const char *DebugType) {
+#ifndef NDEBUG
+ return CurrentDebugType.empty() || DebugType == CurrentDebugType;
+#else
+ return false;
+#endif
+}
+
+// getErrorOutputStream - Returns the error output stream (std::cerr). This
+// places the std::c* I/O streams into one .cpp file and relieves the whole
+// program from having to have hundreds of static c'tor/d'tors for them.
+//
+OStream &llvm::getErrorOutputStream(const char *DebugType) {
+ static OStream cnoout(0);
+ if (DebugFlag && isCurrentDebugType(DebugType))
+ return cerr;
+ else
+ return cnoout;
+}
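+
+// Usage sketch (illustrative; DEBUG and DOUT are the wrappers from
+// llvm/Support/Debug.h, and BB is a hypothetical variable):
+//
+//   DEBUG(DOUT << "visiting " << BB->getName() << "\n");
+//
+// With assertions enabled the message appears only under -debug (or a
+// matching -debug-only=<type>); in optimized builds the whole statement
+// compiles away.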
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
new file mode 100644
index 0000000..fa99035
--- /dev/null
+++ b/lib/Support/Dwarf.cpp
@@ -0,0 +1,589 @@
+//===-- llvm/Support/Dwarf.cpp - Dwarf Framework ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for generic dwarf information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Dwarf.h"
+
+#include <cassert>
+
+namespace llvm {
+
+namespace dwarf {
+
+/// TagString - Return the string for the specified tag.
+///
+const char *TagString(unsigned Tag) {
+ switch (Tag) {
+ case DW_TAG_array_type: return "DW_TAG_array_type";
+ case DW_TAG_class_type: return "DW_TAG_class_type";
+ case DW_TAG_entry_point: return "DW_TAG_entry_point";
+ case DW_TAG_enumeration_type: return "DW_TAG_enumeration_type";
+ case DW_TAG_formal_parameter: return "DW_TAG_formal_parameter";
+ case DW_TAG_imported_declaration: return "DW_TAG_imported_declaration";
+ case DW_TAG_label: return "DW_TAG_label";
+ case DW_TAG_lexical_block: return "DW_TAG_lexical_block";
+ case DW_TAG_member: return "DW_TAG_member";
+ case DW_TAG_pointer_type: return "DW_TAG_pointer_type";
+ case DW_TAG_reference_type: return "DW_TAG_reference_type";
+ case DW_TAG_compile_unit: return "DW_TAG_compile_unit";
+ case DW_TAG_string_type: return "DW_TAG_string_type";
+ case DW_TAG_structure_type: return "DW_TAG_structure_type";
+ case DW_TAG_subroutine_type: return "DW_TAG_subroutine_type";
+ case DW_TAG_typedef: return "DW_TAG_typedef";
+ case DW_TAG_union_type: return "DW_TAG_union_type";
+ case DW_TAG_unspecified_parameters: return "DW_TAG_unspecified_parameters";
+ case DW_TAG_variant: return "DW_TAG_variant";
+ case DW_TAG_common_block: return "DW_TAG_common_block";
+ case DW_TAG_common_inclusion: return "DW_TAG_common_inclusion";
+ case DW_TAG_inheritance: return "DW_TAG_inheritance";
+ case DW_TAG_inlined_subroutine: return "DW_TAG_inlined_subroutine";
+ case DW_TAG_module: return "DW_TAG_module";
+ case DW_TAG_ptr_to_member_type: return "DW_TAG_ptr_to_member_type";
+ case DW_TAG_set_type: return "DW_TAG_set_type";
+ case DW_TAG_subrange_type: return "DW_TAG_subrange_type";
+ case DW_TAG_with_stmt: return "DW_TAG_with_stmt";
+ case DW_TAG_access_declaration: return "DW_TAG_access_declaration";
+ case DW_TAG_base_type: return "DW_TAG_base_type";
+ case DW_TAG_catch_block: return "DW_TAG_catch_block";
+ case DW_TAG_const_type: return "DW_TAG_const_type";
+ case DW_TAG_constant: return "DW_TAG_constant";
+ case DW_TAG_enumerator: return "DW_TAG_enumerator";
+ case DW_TAG_file_type: return "DW_TAG_file_type";
+ case DW_TAG_friend: return "DW_TAG_friend";
+ case DW_TAG_namelist: return "DW_TAG_namelist";
+ case DW_TAG_namelist_item: return "DW_TAG_namelist_item";
+ case DW_TAG_packed_type: return "DW_TAG_packed_type";
+ case DW_TAG_subprogram: return "DW_TAG_subprogram";
+ case DW_TAG_template_type_parameter: return "DW_TAG_template_type_parameter";
+ case DW_TAG_template_value_parameter: return "DW_TAG_template_value_parameter";
+ case DW_TAG_thrown_type: return "DW_TAG_thrown_type";
+ case DW_TAG_try_block: return "DW_TAG_try_block";
+ case DW_TAG_variant_part: return "DW_TAG_variant_part";
+ case DW_TAG_variable: return "DW_TAG_variable";
+ case DW_TAG_volatile_type: return "DW_TAG_volatile_type";
+ case DW_TAG_dwarf_procedure: return "DW_TAG_dwarf_procedure";
+ case DW_TAG_restrict_type: return "DW_TAG_restrict_type";
+ case DW_TAG_interface_type: return "DW_TAG_interface_type";
+ case DW_TAG_namespace: return "DW_TAG_namespace";
+ case DW_TAG_imported_module: return "DW_TAG_imported_module";
+ case DW_TAG_unspecified_type: return "DW_TAG_unspecified_type";
+ case DW_TAG_partial_unit: return "DW_TAG_partial_unit";
+ case DW_TAG_imported_unit: return "DW_TAG_imported_unit";
+ case DW_TAG_condition: return "DW_TAG_condition";
+ case DW_TAG_shared_type: return "DW_TAG_shared_type";
+ case DW_TAG_lo_user: return "DW_TAG_lo_user";
+ case DW_TAG_hi_user: return "DW_TAG_hi_user";
+ }
+ assert(0 && "Unknown Dwarf Tag");
+ return "";
+}
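+
+// Usage sketch (illustrative): every *String helper in this file follows
+// this same pattern, mapping an enumerated DWARF encoding to its name.
+//
+//   const char *S = dwarf::TagString(dwarf::DW_TAG_member); // "DW_TAG_member"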
+
+/// ChildrenString - Return the string for the specified children flag.
+///
+const char *ChildrenString(unsigned Children) {
+ switch (Children) {
+ case DW_CHILDREN_no: return "CHILDREN_no";
+ case DW_CHILDREN_yes: return "CHILDREN_yes";
+ }
+ assert(0 && "Unknown Dwarf ChildrenFlag");
+ return "";
+}
+
+/// AttributeString - Return the string for the specified attribute.
+///
+const char *AttributeString(unsigned Attribute) {
+ switch (Attribute) {
+ case DW_AT_sibling: return "DW_AT_sibling";
+ case DW_AT_location: return "DW_AT_location";
+ case DW_AT_name: return "DW_AT_name";
+ case DW_AT_ordering: return "DW_AT_ordering";
+ case DW_AT_byte_size: return "DW_AT_byte_size";
+ case DW_AT_bit_offset: return "DW_AT_bit_offset";
+ case DW_AT_bit_size: return "DW_AT_bit_size";
+ case DW_AT_stmt_list: return "DW_AT_stmt_list";
+ case DW_AT_low_pc: return "DW_AT_low_pc";
+ case DW_AT_high_pc: return "DW_AT_high_pc";
+ case DW_AT_language: return "DW_AT_language";
+ case DW_AT_discr: return "DW_AT_discr";
+ case DW_AT_discr_value: return "DW_AT_discr_value";
+ case DW_AT_visibility: return "DW_AT_visibility";
+ case DW_AT_import: return "DW_AT_import";
+ case DW_AT_string_length: return "DW_AT_string_length";
+ case DW_AT_common_reference: return "DW_AT_common_reference";
+ case DW_AT_comp_dir: return "DW_AT_comp_dir";
+ case DW_AT_const_value: return "DW_AT_const_value";
+ case DW_AT_containing_type: return "DW_AT_containing_type";
+ case DW_AT_default_value: return "DW_AT_default_value";
+ case DW_AT_inline: return "DW_AT_inline";
+ case DW_AT_is_optional: return "DW_AT_is_optional";
+ case DW_AT_lower_bound: return "DW_AT_lower_bound";
+ case DW_AT_producer: return "DW_AT_producer";
+ case DW_AT_prototyped: return "DW_AT_prototyped";
+ case DW_AT_return_addr: return "DW_AT_return_addr";
+ case DW_AT_start_scope: return "DW_AT_start_scope";
+ case DW_AT_bit_stride: return "DW_AT_bit_stride";
+ case DW_AT_upper_bound: return "DW_AT_upper_bound";
+ case DW_AT_abstract_origin: return "DW_AT_abstract_origin";
+ case DW_AT_accessibility: return "DW_AT_accessibility";
+ case DW_AT_address_class: return "DW_AT_address_class";
+ case DW_AT_artificial: return "DW_AT_artificial";
+ case DW_AT_base_types: return "DW_AT_base_types";
+ case DW_AT_calling_convention: return "DW_AT_calling_convention";
+ case DW_AT_count: return "DW_AT_count";
+ case DW_AT_data_member_location: return "DW_AT_data_member_location";
+ case DW_AT_decl_column: return "DW_AT_decl_column";
+ case DW_AT_decl_file: return "DW_AT_decl_file";
+ case DW_AT_decl_line: return "DW_AT_decl_line";
+ case DW_AT_declaration: return "DW_AT_declaration";
+ case DW_AT_discr_list: return "DW_AT_discr_list";
+ case DW_AT_encoding: return "DW_AT_encoding";
+ case DW_AT_external: return "DW_AT_external";
+ case DW_AT_frame_base: return "DW_AT_frame_base";
+ case DW_AT_friend: return "DW_AT_friend";
+ case DW_AT_identifier_case: return "DW_AT_identifier_case";
+ case DW_AT_macro_info: return "DW_AT_macro_info";
+ case DW_AT_namelist_item: return "DW_AT_namelist_item";
+ case DW_AT_priority: return "DW_AT_priority";
+ case DW_AT_segment: return "DW_AT_segment";
+ case DW_AT_specification: return "DW_AT_specification";
+ case DW_AT_static_link: return "DW_AT_static_link";
+ case DW_AT_type: return "DW_AT_type";
+ case DW_AT_use_location: return "DW_AT_use_location";
+ case DW_AT_variable_parameter: return "DW_AT_variable_parameter";
+ case DW_AT_virtuality: return "DW_AT_virtuality";
+ case DW_AT_vtable_elem_location: return "DW_AT_vtable_elem_location";
+ case DW_AT_allocated: return "DW_AT_allocated";
+ case DW_AT_associated: return "DW_AT_associated";
+ case DW_AT_data_location: return "DW_AT_data_location";
+ case DW_AT_byte_stride: return "DW_AT_byte_stride";
+ case DW_AT_entry_pc: return "DW_AT_entry_pc";
+ case DW_AT_use_UTF8: return "DW_AT_use_UTF8";
+ case DW_AT_extension: return "DW_AT_extension";
+ case DW_AT_ranges: return "DW_AT_ranges";
+ case DW_AT_trampoline: return "DW_AT_trampoline";
+ case DW_AT_call_column: return "DW_AT_call_column";
+ case DW_AT_call_file: return "DW_AT_call_file";
+ case DW_AT_call_line: return "DW_AT_call_line";
+ case DW_AT_description: return "DW_AT_description";
+ case DW_AT_binary_scale: return "DW_AT_binary_scale";
+ case DW_AT_decimal_scale: return "DW_AT_decimal_scale";
+ case DW_AT_small: return "DW_AT_small";
+ case DW_AT_decimal_sign: return "DW_AT_decimal_sign";
+ case DW_AT_digit_count: return "DW_AT_digit_count";
+ case DW_AT_picture_string: return "DW_AT_picture_string";
+ case DW_AT_mutable: return "DW_AT_mutable";
+ case DW_AT_threads_scaled: return "DW_AT_threads_scaled";
+ case DW_AT_explicit: return "DW_AT_explicit";
+ case DW_AT_object_pointer: return "DW_AT_object_pointer";
+ case DW_AT_endianity: return "DW_AT_endianity";
+ case DW_AT_elemental: return "DW_AT_elemental";
+ case DW_AT_pure: return "DW_AT_pure";
+ case DW_AT_recursive: return "DW_AT_recursive";
+ case DW_AT_MIPS_linkage_name: return "DW_AT_MIPS_linkage_name";
+ case DW_AT_sf_names: return "DW_AT_sf_names";
+ case DW_AT_src_info: return "DW_AT_src_info";
+ case DW_AT_mac_info: return "DW_AT_mac_info";
+ case DW_AT_src_coords: return "DW_AT_src_coords";
+ case DW_AT_body_begin: return "DW_AT_body_begin";
+ case DW_AT_body_end: return "DW_AT_body_end";
+ case DW_AT_GNU_vector: return "DW_AT_GNU_vector";
+ case DW_AT_lo_user: return "DW_AT_lo_user";
+ case DW_AT_hi_user: return "DW_AT_hi_user";
+ case DW_AT_APPLE_optimized: return "DW_AT_APPLE_optimized";
+ case DW_AT_APPLE_flags: return "DW_AT_APPLE_flags";
+ case DW_AT_APPLE_isa: return "DW_AT_APPLE_isa";
+ case DW_AT_APPLE_block: return "DW_AT_APPLE_block";
+ case DW_AT_APPLE_major_runtime_vers: return "DW_AT_APPLE_major_runtime_vers";
+ case DW_AT_APPLE_runtime_class: return "DW_AT_APPLE_runtime_class";
+ }
+ assert(0 && "Unknown Dwarf Attribute");
+ return "";
+}
+
+/// FormEncodingString - Return the string for the specified form encoding.
+///
+const char *FormEncodingString(unsigned Encoding) {
+ switch (Encoding) {
+ case DW_FORM_addr: return "FORM_addr";
+ case DW_FORM_block2: return "FORM_block2";
+ case DW_FORM_block4: return "FORM_block4";
+ case DW_FORM_data2: return "FORM_data2";
+ case DW_FORM_data4: return "FORM_data4";
+ case DW_FORM_data8: return "FORM_data8";
+ case DW_FORM_string: return "FORM_string";
+ case DW_FORM_block: return "FORM_block";
+ case DW_FORM_block1: return "FORM_block1";
+ case DW_FORM_data1: return "FORM_data1";
+ case DW_FORM_flag: return "FORM_flag";
+ case DW_FORM_sdata: return "FORM_sdata";
+ case DW_FORM_strp: return "FORM_strp";
+ case DW_FORM_udata: return "FORM_udata";
+ case DW_FORM_ref_addr: return "FORM_ref_addr";
+ case DW_FORM_ref1: return "FORM_ref1";
+ case DW_FORM_ref2: return "FORM_ref2";
+ case DW_FORM_ref4: return "FORM_ref4";
+ case DW_FORM_ref8: return "FORM_ref8";
+ case DW_FORM_ref_udata: return "FORM_ref_udata";
+ case DW_FORM_indirect: return "FORM_indirect";
+ }
+ assert(0 && "Unknown Dwarf Form Encoding");
+ return "";
+}
+
+/// OperationEncodingString - Return the string for the specified operation
+/// encoding.
+const char *OperationEncodingString(unsigned Encoding) {
+ switch (Encoding) {
+ case DW_OP_addr: return "OP_addr";
+ case DW_OP_deref: return "OP_deref";
+ case DW_OP_const1u: return "OP_const1u";
+ case DW_OP_const1s: return "OP_const1s";
+ case DW_OP_const2u: return "OP_const2u";
+ case DW_OP_const2s: return "OP_const2s";
+ case DW_OP_const4u: return "OP_const4u";
+ case DW_OP_const4s: return "OP_const4s";
+ case DW_OP_const8u: return "OP_const8u";
+ case DW_OP_const8s: return "OP_const8s";
+ case DW_OP_constu: return "OP_constu";
+ case DW_OP_consts: return "OP_consts";
+ case DW_OP_dup: return "OP_dup";
+ case DW_OP_drop: return "OP_drop";
+ case DW_OP_over: return "OP_over";
+ case DW_OP_pick: return "OP_pick";
+ case DW_OP_swap: return "OP_swap";
+ case DW_OP_rot: return "OP_rot";
+ case DW_OP_xderef: return "OP_xderef";
+ case DW_OP_abs: return "OP_abs";
+ case DW_OP_and: return "OP_and";
+ case DW_OP_div: return "OP_div";
+ case DW_OP_minus: return "OP_minus";
+ case DW_OP_mod: return "OP_mod";
+ case DW_OP_mul: return "OP_mul";
+ case DW_OP_neg: return "OP_neg";
+ case DW_OP_not: return "OP_not";
+ case DW_OP_or: return "OP_or";
+ case DW_OP_plus: return "OP_plus";
+ case DW_OP_plus_uconst: return "OP_plus_uconst";
+ case DW_OP_shl: return "OP_shl";
+ case DW_OP_shr: return "OP_shr";
+ case DW_OP_shra: return "OP_shra";
+ case DW_OP_xor: return "OP_xor";
+ case DW_OP_skip: return "OP_skip";
+ case DW_OP_bra: return "OP_bra";
+ case DW_OP_eq: return "OP_eq";
+ case DW_OP_ge: return "OP_ge";
+ case DW_OP_gt: return "OP_gt";
+ case DW_OP_le: return "OP_le";
+ case DW_OP_lt: return "OP_lt";
+ case DW_OP_ne: return "OP_ne";
+ case DW_OP_lit0: return "OP_lit0";
+ case DW_OP_lit1: return "OP_lit1";
+ case DW_OP_lit31: return "OP_lit31";
+ case DW_OP_reg0: return "OP_reg0";
+ case DW_OP_reg1: return "OP_reg1";
+ case DW_OP_reg31: return "OP_reg31";
+ case DW_OP_breg0: return "OP_breg0";
+ case DW_OP_breg1: return "OP_breg1";
+ case DW_OP_breg31: return "OP_breg31";
+ case DW_OP_regx: return "OP_regx";
+ case DW_OP_fbreg: return "OP_fbreg";
+ case DW_OP_bregx: return "OP_bregx";
+ case DW_OP_piece: return "OP_piece";
+ case DW_OP_deref_size: return "OP_deref_size";
+ case DW_OP_xderef_size: return "OP_xderef_size";
+ case DW_OP_nop: return "OP_nop";
+ case DW_OP_push_object_address: return "OP_push_object_address";
+ case DW_OP_call2: return "OP_call2";
+ case DW_OP_call4: return "OP_call4";
+ case DW_OP_call_ref: return "OP_call_ref";
+ case DW_OP_form_tls_address: return "OP_form_tls_address";
+ case DW_OP_call_frame_cfa: return "OP_call_frame_cfa";
+ case DW_OP_lo_user: return "OP_lo_user";
+ case DW_OP_hi_user: return "OP_hi_user";
+ }
+ assert(0 && "Unknown Dwarf Operation Encoding");
+ return "";
+}
+
+/// AttributeEncodingString - Return the string for the specified attribute
+/// encoding.
+const char *AttributeEncodingString(unsigned Encoding) {
+ switch (Encoding) {
+ case DW_ATE_address: return "ATE_address";
+ case DW_ATE_boolean: return "ATE_boolean";
+ case DW_ATE_complex_float: return "ATE_complex_float";
+ case DW_ATE_float: return "ATE_float";
+ case DW_ATE_signed: return "ATE_signed";
+ case DW_ATE_signed_char: return "ATE_signed_char";
+ case DW_ATE_unsigned: return "ATE_unsigned";
+ case DW_ATE_unsigned_char: return "ATE_unsigned_char";
+ case DW_ATE_imaginary_float: return "ATE_imaginary_float";
+ case DW_ATE_packed_decimal: return "ATE_packed_decimal";
+ case DW_ATE_numeric_string: return "ATE_numeric_string";
+ case DW_ATE_edited: return "ATE_edited";
+ case DW_ATE_signed_fixed: return "ATE_signed_fixed";
+ case DW_ATE_unsigned_fixed: return "ATE_unsigned_fixed";
+ case DW_ATE_decimal_float: return "ATE_decimal_float";
+ case DW_ATE_lo_user: return "ATE_lo_user";
+ case DW_ATE_hi_user: return "ATE_hi_user";
+ }
+ assert(0 && "Unknown Dwarf Attribute Encoding");
+ return "";
+}
+
+/// DecimalSignString - Return the string for the specified decimal sign
+/// attribute.
+const char *DecimalSignString(unsigned Sign) {
+ switch (Sign) {
+ case DW_DS_unsigned: return "DS_unsigned";
+ case DW_DS_leading_overpunch: return "DS_leading_overpunch";
+ case DW_DS_trailing_overpunch: return "DS_trailing_overpunch";
+ case DW_DS_leading_separate: return "DS_leading_separate";
+ case DW_DS_trailing_separate: return "DS_trailing_separate";
+ }
+ assert(0 && "Unknown Dwarf Decimal Sign Attribute");
+ return "";
+}
+
+/// EndianityString - Return the string for the specified endianity.
+///
+const char *EndianityString(unsigned Endian) {
+ switch (Endian) {
+ case DW_END_default: return "END_default";
+ case DW_END_big: return "END_big";
+ case DW_END_little: return "END_little";
+ case DW_END_lo_user: return "END_lo_user";
+ case DW_END_hi_user: return "END_hi_user";
+ }
+ assert(0 && "Unknown Dwarf Endianity");
+ return "";
+}
+
+/// AccessibilityString - Return the string for the specified accessibility.
+///
+const char *AccessibilityString(unsigned Access) {
+ switch (Access) {
+ // Accessibility codes
+ case DW_ACCESS_public: return "ACCESS_public";
+ case DW_ACCESS_protected: return "ACCESS_protected";
+ case DW_ACCESS_private: return "ACCESS_private";
+ }
+ assert(0 && "Unknown Dwarf Accessibility");
+ return "";
+}
+
+/// VisibilityString - Return the string for the specified visibility.
+///
+const char *VisibilityString(unsigned Visibility) {
+ switch (Visibility) {
+ case DW_VIS_local: return "VIS_local";
+ case DW_VIS_exported: return "VIS_exported";
+ case DW_VIS_qualified: return "VIS_qualified";
+ }
+ assert(0 && "Unknown Dwarf Visibility");
+ return "";
+}
+
+/// VirtualityString - Return the string for the specified virtuality.
+///
+const char *VirtualityString(unsigned Virtuality) {
+ switch (Virtuality) {
+ case DW_VIRTUALITY_none: return "VIRTUALITY_none";
+ case DW_VIRTUALITY_virtual: return "VIRTUALITY_virtual";
+ case DW_VIRTUALITY_pure_virtual: return "VIRTUALITY_pure_virtual";
+ }
+ assert(0 && "Unknown Dwarf Virtuality");
+ return "";
+}
+
+/// LanguageString - Return the string for the specified language.
+///
+const char *LanguageString(unsigned Language) {
+ switch (Language) {
+ case DW_LANG_C89: return "LANG_C89";
+ case DW_LANG_C: return "LANG_C";
+ case DW_LANG_Ada83: return "LANG_Ada83";
+ case DW_LANG_C_plus_plus: return "LANG_C_plus_plus";
+ case DW_LANG_Cobol74: return "LANG_Cobol74";
+ case DW_LANG_Cobol85: return "LANG_Cobol85";
+ case DW_LANG_Fortran77: return "LANG_Fortran77";
+ case DW_LANG_Fortran90: return "LANG_Fortran90";
+ case DW_LANG_Pascal83: return "LANG_Pascal83";
+ case DW_LANG_Modula2: return "LANG_Modula2";
+ case DW_LANG_Java: return "LANG_Java";
+ case DW_LANG_C99: return "LANG_C99";
+ case DW_LANG_Ada95: return "LANG_Ada95";
+ case DW_LANG_Fortran95: return "LANG_Fortran95";
+ case DW_LANG_PLI: return "LANG_PLI";
+ case DW_LANG_ObjC: return "LANG_ObjC";
+ case DW_LANG_ObjC_plus_plus: return "LANG_ObjC_plus_plus";
+ case DW_LANG_UPC: return "LANG_UPC";
+ case DW_LANG_D: return "LANG_D";
+ case DW_LANG_lo_user: return "LANG_lo_user";
+ case DW_LANG_hi_user: return "LANG_hi_user";
+ }
+ assert(0 && "Unknown Dwarf Language");
+ return "";
+}
+
+/// CaseString - Return the string for the specified identifier case.
+///
+const char *CaseString(unsigned Case) {
+ switch (Case) {
+ case DW_ID_case_sensitive: return "ID_case_sensitive";
+ case DW_ID_up_case: return "ID_up_case";
+ case DW_ID_down_case: return "ID_down_case";
+ case DW_ID_case_insensitive: return "ID_case_insensitive";
+ }
+ assert(0 && "Unknown Dwarf Identifier Case");
+ return "";
+}
+
+/// ConventionString - Return the string for the specified calling convention.
+///
+const char *ConventionString(unsigned Convention) {
+ switch (Convention) {
+ case DW_CC_normal: return "CC_normal";
+ case DW_CC_program: return "CC_program";
+ case DW_CC_nocall: return "CC_nocall";
+ case DW_CC_lo_user: return "CC_lo_user";
+ case DW_CC_hi_user: return "CC_hi_user";
+ }
+ assert(0 && "Unknown Dwarf Calling Convention");
+ return "";
+}
+
+/// InlineCodeString - Return the string for the specified inline code.
+///
+const char *InlineCodeString(unsigned Code) {
+ switch (Code) {
+ case DW_INL_not_inlined: return "INL_not_inlined";
+ case DW_INL_inlined: return "INL_inlined";
+ case DW_INL_declared_not_inlined: return "INL_declared_not_inlined";
+ case DW_INL_declared_inlined: return "INL_declared_inlined";
+ }
+ assert(0 && "Unknown Dwarf Inline Code");
+ return "";
+}
+
+/// ArrayOrderString - Return the string for the specified array order.
+///
+const char *ArrayOrderString(unsigned Order) {
+ switch (Order) {
+ case DW_ORD_row_major: return "ORD_row_major";
+ case DW_ORD_col_major: return "ORD_col_major";
+ }
+ assert(0 && "Unknown Dwarf Array Order");
+ return "";
+}
+
+/// DiscriminantString - Return the string for the specified discriminant
+/// descriptor.
+const char *DiscriminantString(unsigned Discriminant) {
+ switch (Discriminant) {
+ case DW_DSC_label: return "DSC_label";
+ case DW_DSC_range: return "DSC_range";
+ }
+ assert(0 && "Unknown Dwarf Discriminant Descriptor");
+ return "";
+}
+
+/// LNStandardString - Return the string for the specified line number standard.
+///
+const char *LNStandardString(unsigned Standard) {
+ switch (Standard) {
+ case DW_LNS_copy: return "LNS_copy";
+ case DW_LNS_advance_pc: return "LNS_advance_pc";
+ case DW_LNS_advance_line: return "LNS_advance_line";
+ case DW_LNS_set_file: return "LNS_set_file";
+ case DW_LNS_set_column: return "LNS_set_column";
+ case DW_LNS_negate_stmt: return "LNS_negate_stmt";
+ case DW_LNS_set_basic_block: return "LNS_set_basic_block";
+ case DW_LNS_const_add_pc: return "LNS_const_add_pc";
+ case DW_LNS_fixed_advance_pc: return "LNS_fixed_advance_pc";
+ case DW_LNS_set_prologue_end: return "LNS_set_prologue_end";
+ case DW_LNS_set_epilogue_begin: return "LNS_set_epilogue_begin";
+ case DW_LNS_set_isa: return "LNS_set_isa";
+ }
+ assert(0 && "Unknown Dwarf Line Number Standard");
+ return "";
+}
+
+/// LNExtendedString - Return the string for the specified line number extended
+/// opcode encodings.
+const char *LNExtendedString(unsigned Encoding) {
+ switch (Encoding) {
+ // Line Number Extended Opcode Encodings
+ case DW_LNE_end_sequence: return "LNE_end_sequence";
+ case DW_LNE_set_address: return "LNE_set_address";
+ case DW_LNE_define_file: return "LNE_define_file";
+ case DW_LNE_lo_user: return "LNE_lo_user";
+ case DW_LNE_hi_user: return "LNE_hi_user";
+ }
+ assert(0 && "Unknown Dwarf Line Number Extended Opcode Encoding");
+ return "";
+}
+
+/// MacinfoString - Return the string for the specified macinfo type encodings.
+///
+const char *MacinfoString(unsigned Encoding) {
+ switch (Encoding) {
+ // Macinfo Type Encodings
+ case DW_MACINFO_define: return "MACINFO_define";
+ case DW_MACINFO_undef: return "MACINFO_undef";
+ case DW_MACINFO_start_file: return "MACINFO_start_file";
+ case DW_MACINFO_end_file: return "MACINFO_end_file";
+ case DW_MACINFO_vendor_ext: return "MACINFO_vendor_ext";
+ }
+ assert(0 && "Unknown Dwarf Macinfo Type Encodings");
+ return "";
+}
+
+/// CallFrameString - Return the string for the specified call frame instruction
+/// encodings.
+const char *CallFrameString(unsigned Encoding) {
+ switch (Encoding) {
+ case DW_CFA_advance_loc: return "CFA_advance_loc";
+ case DW_CFA_offset: return "CFA_offset";
+ case DW_CFA_restore: return "CFA_restore";
+ case DW_CFA_set_loc: return "CFA_set_loc";
+ case DW_CFA_advance_loc1: return "CFA_advance_loc1";
+ case DW_CFA_advance_loc2: return "CFA_advance_loc2";
+ case DW_CFA_advance_loc4: return "CFA_advance_loc4";
+ case DW_CFA_offset_extended: return "CFA_offset_extended";
+ case DW_CFA_restore_extended: return "CFA_restore_extended";
+ case DW_CFA_undefined: return "CFA_undefined";
+ case DW_CFA_same_value: return "CFA_same_value";
+ case DW_CFA_register: return "CFA_register";
+ case DW_CFA_remember_state: return "CFA_remember_state";
+ case DW_CFA_restore_state: return "CFA_restore_state";
+ case DW_CFA_def_cfa: return "CFA_def_cfa";
+ case DW_CFA_def_cfa_register: return "CFA_def_cfa_register";
+ case DW_CFA_def_cfa_offset: return "CFA_def_cfa_offset";
+ case DW_CFA_def_cfa_expression: return "CFA_def_cfa_expression";
+ case DW_CFA_expression: return "CFA_expression";
+ case DW_CFA_offset_extended_sf: return "CFA_offset_extended_sf";
+ case DW_CFA_def_cfa_sf: return "CFA_def_cfa_sf";
+ case DW_CFA_def_cfa_offset_sf: return "CFA_def_cfa_offset_sf";
+ case DW_CFA_val_offset: return "CFA_val_offset";
+ case DW_CFA_val_offset_sf: return "CFA_val_offset_sf";
+ case DW_CFA_val_expression: return "CFA_val_expression";
+ case DW_CFA_lo_user: return "CFA_lo_user";
+ case DW_CFA_hi_user: return "CFA_hi_user";
+ }
+ assert(0 && "Unknown Dwarf Call Frame Instruction Encodings");
+ return "";
+}
+
+} // End of namespace dwarf.
+
+} // End of namespace llvm.
diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp
new file mode 100644
index 0000000..21080b6
--- /dev/null
+++ b/lib/Support/FileUtilities.cpp
@@ -0,0 +1,263 @@
+//===- Support/FileUtilities.cpp - File System Utilities ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a family of utility functions which are useful for doing
+// various things with files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/System/Path.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include <cstdlib>
+#include <cstring>
+#include <cctype>
+using namespace llvm;
+
+static bool isSignedChar(char C) {
+ return (C == '+' || C == '-');
+}
+
+static bool isExponentChar(char C) {
+ switch (C) {
+ case 'D': // Strange exponential notation.
+ case 'd': // Strange exponential notation.
+ case 'e':
+ case 'E': return true;
+ default: return false;
+ }
+}
+
+static bool isNumberChar(char C) {
+ switch (C) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ case '.': return true;
+ default: return isSignedChar(C) || isExponentChar(C);
+ }
+}
+
+static const char *BackupNumber(const char *Pos, const char *FirstChar) {
+ // If we didn't stop in the middle of a number, don't backup.
+ if (!isNumberChar(*Pos)) return Pos;
+
+ // Otherwise, return to the start of the number.
+ while (Pos > FirstChar && isNumberChar(Pos[-1])) {
+ --Pos;
+ if (Pos > FirstChar && isSignedChar(Pos[0]) && !isExponentChar(Pos[-1]))
+ break;
+ }
+ return Pos;
+}
+
+/// EndOfNumber - Return the first character that is not part of the specified
+/// number. This assumes that the buffer is null terminated, so it won't fall
+/// off the end.
+static const char *EndOfNumber(const char *Pos) {
+ while (isNumberChar(*Pos))
+ ++Pos;
+ return Pos;
+}
+
+/// CompareNumbers - compare two numbers, returning true if they are different.
+static bool CompareNumbers(const char *&F1P, const char *&F2P,
+ const char *F1End, const char *F2End,
+ double AbsTolerance, double RelTolerance,
+ std::string *ErrorMsg) {
+ const char *F1NumEnd, *F2NumEnd;
+ double V1 = 0.0, V2 = 0.0;
+
+ // If one of the positions is at a space and the other isn't, chomp up 'til
+ // the end of the space.
+ while (isspace(*F1P) && F1P != F1End)
+ ++F1P;
+ while (isspace(*F2P) && F2P != F2End)
+ ++F2P;
+
+ // If we stop on numbers, compare their difference.
+ if (!isNumberChar(*F1P) || !isNumberChar(*F2P)) {
+ // The diff failed.
+ F1NumEnd = F1P;
+ F2NumEnd = F2P;
+ } else {
+ // Note that some ugliness is built into this to permit support for numbers
+ // that use "D" or "d" as their exponential marker, e.g. "1.234D45". This
+ // occurs in 200.sixtrack in spec2k.
+ V1 = strtod(F1P, const_cast<char**>(&F1NumEnd));
+ V2 = strtod(F2P, const_cast<char**>(&F2NumEnd));
+
+ if (*F1NumEnd == 'D' || *F1NumEnd == 'd') {
+ // Copy string into tmp buffer to replace the 'D' with an 'e'.
+ SmallString<200> StrTmp(F1P, EndOfNumber(F1NumEnd)+1);
+ // Strange exponential notation!
+ StrTmp[static_cast<unsigned>(F1NumEnd-F1P)] = 'e';
+
+ V1 = strtod(&StrTmp[0], const_cast<char**>(&F1NumEnd));
+ F1NumEnd = F1P + (F1NumEnd-&StrTmp[0]);
+ }
+
+ if (*F2NumEnd == 'D' || *F2NumEnd == 'd') {
+ // Copy string into tmp buffer to replace the 'D' with an 'e'.
+ SmallString<200> StrTmp(F2P, EndOfNumber(F2NumEnd)+1);
+ // Strange exponential notation!
+ StrTmp[static_cast<unsigned>(F2NumEnd-F2P)] = 'e';
+
+ V2 = strtod(&StrTmp[0], const_cast<char**>(&F2NumEnd));
+ F2NumEnd = F2P + (F2NumEnd-&StrTmp[0]);
+ }
+ }
+
+ if (F1NumEnd == F1P || F2NumEnd == F2P) {
+ if (ErrorMsg) {
+ *ErrorMsg = "FP Comparison failed, not a numeric difference between '";
+ *ErrorMsg += F1P[0];
+ *ErrorMsg += "' and '";
+ *ErrorMsg += F2P[0];
+ *ErrorMsg += "'";
+ }
+ return true;
+ }
+
+ // Check to see if these are inside the absolute tolerance
+ if (AbsTolerance < std::abs(V1-V2)) {
+ // Nope, check the relative tolerance...
+ double Diff;
+ if (V2)
+ Diff = std::abs(V1/V2 - 1.0);
+ else if (V1)
+ Diff = std::abs(V2/V1 - 1.0);
+ else
+ Diff = 0; // Both zero.
+ if (Diff > RelTolerance) {
+ if (ErrorMsg) {
+ *ErrorMsg = "Compared: " + ftostr(V1) + " and " + ftostr(V2) + "\n";
+ *ErrorMsg += "abs. diff = " + ftostr(std::abs(V1-V2)) +
+ " rel.diff = " + ftostr(Diff) + "\n";
+ *ErrorMsg += "Out of tolerance: rel/abs: " + ftostr(RelTolerance) +
+ "/" + ftostr(AbsTolerance);
+ }
+ return true;
+ }
+ }
+
+ // Otherwise, advance our read pointers to the end of the numbers.
+ F1P = F1NumEnd; F2P = F2NumEnd;
+ return false;
+}
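+
+// Worked example (illustrative): comparing "1.234D45" against "1.234e45".
+// strtod stops at the 'D', so the code above copies the token into a small
+// buffer, rewrites the 'D' to an 'e', and reparses; both sides then yield
+// 1.234e45 and compare equal.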
+
+/// DiffFilesWithTolerance - Compare the two files specified, returning 0 if the
+/// files match, 1 if they are different, and 2 if there is a file error. This
+/// function differs from DiffFiles in that you can specify an absolute and
+/// relative FP error that is allowed to exist. If you specify a string to fill
+/// in for the error option, it will set the string to an error message if an
+/// error occurs, allowing the caller to distinguish between a failed diff and a
+/// file system error.
+///
+int llvm::DiffFilesWithTolerance(const sys::PathWithStatus &FileA,
+ const sys::PathWithStatus &FileB,
+ double AbsTol, double RelTol,
+ std::string *Error) {
+ const sys::FileStatus *FileAStat = FileA.getFileStatus(false, Error);
+ if (!FileAStat)
+ return 2;
+ const sys::FileStatus *FileBStat = FileB.getFileStatus(false, Error);
+ if (!FileBStat)
+ return 2;
+
+ // Check for zero length files because some systems croak when you try to
+ // mmap an empty file.
+ size_t A_size = FileAStat->getSize();
+ size_t B_size = FileBStat->getSize();
+
+ // If they are both zero sized then they're the same
+ if (A_size == 0 && B_size == 0)
+ return 0;
+
+ // If only one of them is zero sized then they can't be the same
+  if (A_size == 0 || B_size == 0) {
+ if (Error)
+ *Error = "Files differ: one is zero-sized, the other isn't";
+ return 1;
+ }
+
+  // Now it's safe to mmap the files into memory because both files
+ // have a non-zero size.
+ OwningPtr<MemoryBuffer> F1(MemoryBuffer::getFile(FileA.c_str(), Error));
+ OwningPtr<MemoryBuffer> F2(MemoryBuffer::getFile(FileB.c_str(), Error));
+ if (F1 == 0 || F2 == 0)
+ return 2;
+
+ // Okay, now that we opened the files, scan them for the first difference.
+ const char *File1Start = F1->getBufferStart();
+ const char *File2Start = F2->getBufferStart();
+ const char *File1End = F1->getBufferEnd();
+ const char *File2End = F2->getBufferEnd();
+ const char *F1P = File1Start;
+ const char *F2P = File2Start;
+
+ if (A_size == B_size) {
+ // Are the buffers identical? Common case: Handle this efficiently.
+ if (std::memcmp(File1Start, File2Start, A_size) == 0)
+ return 0;
+
+ if (AbsTol == 0 && RelTol == 0) {
+ if (Error)
+ *Error = "Files differ without tolerance allowance";
+ return 1; // Files different!
+ }
+ }
+
+ bool CompareFailed = false;
+ while (1) {
+ // Scan for the end of file or next difference.
+ while (F1P < File1End && F2P < File2End && *F1P == *F2P)
+ ++F1P, ++F2P;
+
+ if (F1P >= File1End || F2P >= File2End) break;
+
+ // Okay, we must have found a difference. Backup to the start of the
+ // current number each stream is at so that we can compare from the
+ // beginning.
+ F1P = BackupNumber(F1P, File1Start);
+ F2P = BackupNumber(F2P, File2Start);
+
+ // Now that we are at the start of the numbers, compare them, exiting if
+ // they don't match.
+ if (CompareNumbers(F1P, F2P, File1End, File2End, AbsTol, RelTol, Error)) {
+ CompareFailed = true;
+ break;
+ }
+ }
+
+ // Okay, we reached the end of file. If both files are at the end, we
+ // succeeded.
+ bool F1AtEnd = F1P >= File1End;
+ bool F2AtEnd = F2P >= File2End;
+ if (!CompareFailed && (!F1AtEnd || !F2AtEnd)) {
+ // Else, we might have run off the end due to a number: backup and retry.
+ if (F1AtEnd && isNumberChar(F1P[-1])) --F1P;
+ if (F2AtEnd && isNumberChar(F2P[-1])) --F2P;
+ F1P = BackupNumber(F1P, File1Start);
+ F2P = BackupNumber(F2P, File2Start);
+
+ // Now that we are at the start of the numbers, compare them, exiting if
+ // they don't match.
+ if (CompareNumbers(F1P, F2P, File1End, File2End, AbsTol, RelTol, Error))
+ CompareFailed = true;
+
+ // If we found the end, we succeeded.
+ if (F1P < File1End || F2P < File2End)
+ CompareFailed = true;
+ }
+
+ return CompareFailed;
+}
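+
+// Usage sketch (illustrative; the file names are hypothetical):
+//
+//   std::string Err;
+//   int Res = DiffFilesWithTolerance(sys::PathWithStatus("expected.out"),
+//                                    sys::PathWithStatus("actual.out"),
+//                                    /*AbsTol=*/0.0, /*RelTol=*/1e-6, &Err);
+//   // Res == 0: files match; 1: they differ (Err says why); 2: file error.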
diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp
new file mode 100644
index 0000000..41c730e
--- /dev/null
+++ b/lib/Support/FoldingSet.cpp
@@ -0,0 +1,378 @@
+//===-- Support/FoldingSet.cpp - Uniquing Hash Set --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a hash set that can be used to remove duplication of
+// nodes in a graph. This code was originally created by Chris Lattner for use
+// with SelectionDAGCSEMap, but was isolated to provide use across the llvm code
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstring>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// FoldingSetNodeID Implementation
+
+/// Add* - Add various data types to Bit data.
+///
+void FoldingSetNodeID::AddPointer(const void *Ptr) {
+ // Note: this adds pointers to the hash using sizes and endianness that
+ // depend on the host. It doesn't matter however, because hashing on
+  // pointer values is inherently unstable. Nothing should depend on the
+ // ordering of nodes in the folding set.
+ intptr_t PtrI = (intptr_t)Ptr;
+ Bits.push_back(unsigned(PtrI));
+ if (sizeof(intptr_t) > sizeof(unsigned))
+ Bits.push_back(unsigned(uint64_t(PtrI) >> 32));
+}
+void FoldingSetNodeID::AddInteger(signed I) {
+ Bits.push_back(I);
+}
+void FoldingSetNodeID::AddInteger(unsigned I) {
+ Bits.push_back(I);
+}
+void FoldingSetNodeID::AddInteger(long I) {
+ AddInteger((unsigned long)I);
+}
+void FoldingSetNodeID::AddInteger(unsigned long I) {
+ if (sizeof(long) == sizeof(int))
+ AddInteger(unsigned(I));
+ else if (sizeof(long) == sizeof(long long)) {
+ AddInteger((unsigned long long)I);
+ } else {
+ assert(0 && "unexpected sizeof(long)");
+ }
+}
+void FoldingSetNodeID::AddInteger(long long I) {
+ AddInteger((unsigned long long)I);
+}
+void FoldingSetNodeID::AddInteger(unsigned long long I) {
+ AddInteger(unsigned(I));
+ if ((uint64_t)(int)I != I)
+ Bits.push_back(unsigned(I >> 32));
+}
+
+void FoldingSetNodeID::AddString(const char *String, const char *End) {
+ unsigned Size = static_cast<unsigned>(End - String);
+ Bits.push_back(Size);
+ if (!Size) return;
+
+ unsigned Units = Size / 4;
+ unsigned Pos = 0;
+ const unsigned *Base = (const unsigned *)String;
+
+ // If the string is aligned do a bulk transfer.
+ if (!((intptr_t)Base & 3)) {
+ Bits.append(Base, Base + Units);
+ Pos = (Units + 1) * 4;
+ } else {
+ // Otherwise do it the hard way.
+ for (Pos += 4; Pos <= Size; Pos += 4) {
+ unsigned V = ((unsigned char)String[Pos - 4] << 24) |
+ ((unsigned char)String[Pos - 3] << 16) |
+ ((unsigned char)String[Pos - 2] << 8) |
+ (unsigned char)String[Pos - 1];
+ Bits.push_back(V);
+ }
+ }
+
+ // With the leftover bits.
+ unsigned V = 0;
+ // Pos will have overshot size by 4 - #bytes left over.
+ switch (Pos - Size) {
+ case 1: V = (V << 8) | (unsigned char)String[Size - 3]; // Fall thru.
+ case 2: V = (V << 8) | (unsigned char)String[Size - 2]; // Fall thru.
+ case 3: V = (V << 8) | (unsigned char)String[Size - 1]; break;
+ default: return; // Nothing left.
+ }
+
+ Bits.push_back(V);
+}
+
+void FoldingSetNodeID::AddString(const char *String) {
+ AddString(String, String + strlen(String));
+}
+
+void FoldingSetNodeID::AddString(const std::string &String) {
+ AddString(&*String.begin(), &*String.end());
+}
+
+/// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used to
+/// lookup the node in the FoldingSetImpl.
+unsigned FoldingSetNodeID::ComputeHash() const {
+ // This is adapted from SuperFastHash by Paul Hsieh.
+ unsigned Hash = static_cast<unsigned>(Bits.size());
+ for (const unsigned *BP = &Bits[0], *E = BP+Bits.size(); BP != E; ++BP) {
+ unsigned Data = *BP;
+ Hash += Data & 0xFFFF;
+ unsigned Tmp = ((Data >> 16) << 11) ^ Hash;
+ Hash = (Hash << 16) ^ Tmp;
+ Hash += Hash >> 11;
+ }
+
+ // Force "avalanching" of final 127 bits.
+ Hash ^= Hash << 3;
+ Hash += Hash >> 5;
+ Hash ^= Hash << 4;
+ Hash += Hash >> 17;
+ Hash ^= Hash << 25;
+ Hash += Hash >> 6;
+ return Hash;
+}
+
+/// operator== - Used to compare two nodes to each other.
+///
+bool FoldingSetNodeID::operator==(const FoldingSetNodeID &RHS) const {
+ if (Bits.size() != RHS.Bits.size()) return false;
+ return memcmp(&Bits[0], &RHS.Bits[0], Bits.size()*sizeof(Bits[0])) == 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+/// Helper functions for FoldingSetImpl.
+
+/// GetNextPtr - In order to save space, each bucket is a
+/// singly-linked-list. In order to make deletion more efficient, we make
+/// the list circular, so we can delete a node without computing its hash.
+/// The problem with this is that the start of the hash buckets are not
+/// Nodes. If NextInBucketPtr is a bucket pointer, this method returns null:
+/// use GetBucketPtr when this happens.
+static FoldingSetImpl::Node *GetNextPtr(void *NextInBucketPtr) {
+ // The low bit is set if this is the pointer back to the bucket.
+ if (reinterpret_cast<intptr_t>(NextInBucketPtr) & 1)
+ return 0;
+
+ return static_cast<FoldingSetImpl::Node*>(NextInBucketPtr);
+}
+
+
+/// GetBucketPtr - Provides a casting of a bucket pointer for isNode
+/// testing.
+static void **GetBucketPtr(void *NextInBucketPtr) {
+ intptr_t Ptr = reinterpret_cast<intptr_t>(NextInBucketPtr);
+ assert((Ptr & 1) && "Not a bucket pointer");
+ return reinterpret_cast<void**>(Ptr & ~intptr_t(1));
+}
+
+/// GetBucketFor - Hash the specified node ID and return the hash bucket for
+/// the specified ID.
+static void **GetBucketFor(const FoldingSetNodeID &ID,
+ void **Buckets, unsigned NumBuckets) {
+ // NumBuckets is always a power of 2.
+ unsigned BucketNum = ID.ComputeHash() & (NumBuckets-1);
+ return Buckets + BucketNum;
+}
+
+//===----------------------------------------------------------------------===//
+// FoldingSetImpl Implementation
+
+FoldingSetImpl::FoldingSetImpl(unsigned Log2InitSize) {
+ assert(5 < Log2InitSize && Log2InitSize < 32 &&
+ "Initial hash table size out of range");
+ NumBuckets = 1 << Log2InitSize;
+ Buckets = new void*[NumBuckets+1];
+ clear();
+}
+FoldingSetImpl::~FoldingSetImpl() {
+ delete [] Buckets;
+}
+void FoldingSetImpl::clear() {
+ // Set all but the last bucket to null pointers.
+ memset(Buckets, 0, NumBuckets*sizeof(void*));
+
+ // Set the very last bucket to be a non-null "pointer".
+ Buckets[NumBuckets] = reinterpret_cast<void*>(-1);
+
+ // Reset the node count to zero.
+ NumNodes = 0;
+}
+
+/// GrowHashTable - Double the size of the hash table and rehash everything.
+///
+void FoldingSetImpl::GrowHashTable() {
+ void **OldBuckets = Buckets;
+ unsigned OldNumBuckets = NumBuckets;
+ NumBuckets <<= 1;
+
+ // Clear out new buckets.
+ Buckets = new void*[NumBuckets+1];
+ clear();
+
+ // Walk the old buckets, rehashing nodes into their new place.
+ FoldingSetNodeID ID;
+ for (unsigned i = 0; i != OldNumBuckets; ++i) {
+ void *Probe = OldBuckets[i];
+ if (!Probe) continue;
+ while (Node *NodeInBucket = GetNextPtr(Probe)) {
+ // Figure out the next link, remove NodeInBucket from the old link.
+ Probe = NodeInBucket->getNextInBucket();
+ NodeInBucket->SetNextInBucket(0);
+
+ // Insert the node into the new bucket, after recomputing the hash.
+ GetNodeProfile(ID, NodeInBucket);
+ InsertNode(NodeInBucket, GetBucketFor(ID, Buckets, NumBuckets));
+ ID.clear();
+ }
+ }
+
+ delete[] OldBuckets;
+}
+
+/// FindNodeOrInsertPos - Look up the node specified by ID. If it exists,
+/// return it. If not, return the insertion token that will make insertion
+/// faster.
+FoldingSetImpl::Node *
+FoldingSetImpl::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+                                    void *&InsertPos) {
+
+ void **Bucket = GetBucketFor(ID, Buckets, NumBuckets);
+ void *Probe = *Bucket;
+
+ InsertPos = 0;
+
+ FoldingSetNodeID OtherID;
+ while (Node *NodeInBucket = GetNextPtr(Probe)) {
+ GetNodeProfile(OtherID, NodeInBucket);
+ if (OtherID == ID)
+ return NodeInBucket;
+
+ Probe = NodeInBucket->getNextInBucket();
+ OtherID.clear();
+ }
+
+ // Didn't find the node, return null with the bucket as the InsertPos.
+ InsertPos = Bucket;
+ return 0;
+}
+
+/// InsertNode - Insert the specified node into the folding set, knowing that it
+/// is not already in the map. InsertPos must be obtained from
+/// FindNodeOrInsertPos.
+void FoldingSetImpl::InsertNode(Node *N, void *InsertPos) {
+ assert(N->getNextInBucket() == 0 && "Node already inserted!");
+ // Do we need to grow the hashtable?
+ if (NumNodes+1 > NumBuckets*2) {
+ GrowHashTable();
+ FoldingSetNodeID ID;
+ GetNodeProfile(ID, N);
+ InsertPos = GetBucketFor(ID, Buckets, NumBuckets);
+ }
+
+ ++NumNodes;
+
+ // The insert position is actually a bucket pointer.
+ void **Bucket = static_cast<void**>(InsertPos);
+
+ void *Next = *Bucket;
+
+ // If this is the first insertion into this bucket, its next pointer will be
+ // null. Pretend as if it pointed to itself, setting the low bit to indicate
+ // that it is a pointer to the bucket.
+ if (Next == 0)
+ Next = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(Bucket)|1);
+
+ // Set the node's next pointer, and make the bucket point to the node.
+ N->SetNextInBucket(Next);
+ *Bucket = N;
+}
+
+/// RemoveNode - Remove a node from the folding set, returning true if one was
+/// removed or false if the node was not in the folding set.
+bool FoldingSetImpl::RemoveNode(Node *N) {
+ // Because each bucket is a circular list, we don't need to compute N's hash
+ // to remove it.
+ void *Ptr = N->getNextInBucket();
+ if (Ptr == 0) return false; // Not in folding set.
+
+ --NumNodes;
+ N->SetNextInBucket(0);
+
+ // Remember what N originally pointed to, either a bucket or another node.
+ void *NodeNextPtr = Ptr;
+
+ // Chase around the list until we find the node (or bucket) which points to N.
+ while (true) {
+ if (Node *NodeInBucket = GetNextPtr(Ptr)) {
+ // Advance pointer.
+ Ptr = NodeInBucket->getNextInBucket();
+
+ // We found a node that points to N, change it to point to N's next node,
+ // removing N from the list.
+ if (Ptr == N) {
+ NodeInBucket->SetNextInBucket(NodeNextPtr);
+ return true;
+ }
+ } else {
+ void **Bucket = GetBucketPtr(Ptr);
+ Ptr = *Bucket;
+
+ // If we found that the bucket points to N, update the bucket to point to
+ // whatever is next.
+ if (Ptr == N) {
+ *Bucket = NodeNextPtr;
+ return true;
+ }
+ }
+ }
+}
+
+/// GetOrInsertNode - If there is an existing simple Node exactly
+/// equal to the specified node, return it. Otherwise, insert 'N' and
+/// return it instead.
+FoldingSetImpl::Node *FoldingSetImpl::GetOrInsertNode(FoldingSetImpl::Node *N) {
+ FoldingSetNodeID ID;
+ GetNodeProfile(ID, N);
+ void *IP;
+ if (Node *E = FindNodeOrInsertPos(ID, IP))
+ return E;
+ InsertNode(N, IP);
+ return N;
+}
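+// The templated FoldingSet<T> wrapper is normally driven with the
+// profile/find/insert pattern sketched here (MyNode, Set, and the operands
+// are illustrative names, not part of this file):
+//
+//   FoldingSetNodeID ID;
+//   MyNode::Profile(ID, Op0, Op1);     // must match MyNode's Profile method
+//   void *IP;
+//   if (MyNode *Existing = Set.FindNodeOrInsertPos(ID, IP))
+//     return Existing;                 // a structurally equal node exists
+//   MyNode *New = new MyNode(Op0, Op1);
+//   Set.InsertNode(New, IP);           // IP recalls the bucket probed above
+//   return New;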
+
+//===----------------------------------------------------------------------===//
+// FoldingSetIteratorImpl Implementation
+
+FoldingSetIteratorImpl::FoldingSetIteratorImpl(void **Bucket) {
+ // Skip to the first non-null non-self-cycle bucket.
+ while (*Bucket != reinterpret_cast<void*>(-1) &&
+ (*Bucket == 0 || GetNextPtr(*Bucket) == 0))
+ ++Bucket;
+
+ NodePtr = static_cast<FoldingSetNode*>(*Bucket);
+}
+
+void FoldingSetIteratorImpl::advance() {
+ // If there is another link within this bucket, go to it.
+ void *Probe = NodePtr->getNextInBucket();
+
+ if (FoldingSetNode *NextNodeInBucket = GetNextPtr(Probe))
+ NodePtr = NextNodeInBucket;
+ else {
+ // Otherwise, this is the last link in this bucket.
+ void **Bucket = GetBucketPtr(Probe);
+
+ // Skip to the next non-null non-self-cycle bucket.
+ do {
+ ++Bucket;
+ } while (*Bucket != reinterpret_cast<void*>(-1) &&
+ (*Bucket == 0 || GetNextPtr(*Bucket) == 0));
+
+ NodePtr = static_cast<FoldingSetNode*>(*Bucket);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// FoldingSetBucketIteratorImpl Implementation
+
+FoldingSetBucketIteratorImpl::FoldingSetBucketIteratorImpl(void **Bucket) {
+ Ptr = (*Bucket == 0 || GetNextPtr(*Bucket) == 0) ? (void*) Bucket : *Bucket;
+}
diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp
new file mode 100644
index 0000000..c359dfb
--- /dev/null
+++ b/lib/Support/GraphWriter.cpp
@@ -0,0 +1,89 @@
+//===-- GraphWriter.cpp - Implements GraphWriter support routines ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements misc. GraphWriter support routines.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/System/Path.h"
+#include "llvm/System/Program.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+void llvm::DisplayGraph(const sys::Path &Filename) {
+ std::string ErrMsg;
+#if HAVE_GRAPHVIZ
+ sys::Path Graphviz(LLVM_PATH_GRAPHVIZ);
+
+ std::vector<const char*> args;
+ args.push_back(Graphviz.c_str());
+ args.push_back(Filename.c_str());
+ args.push_back(0);
+
+ cerr << "Running 'Graphviz' program... " << std::flush;
+ if (sys::Program::ExecuteAndWait(Graphviz, &args[0],0,0,0,0,&ErrMsg)) {
+ cerr << "Error viewing graph: " << ErrMsg << "\n";
+ }
+#elif (HAVE_GV && HAVE_DOT)
+ sys::Path PSFilename = Filename;
+ PSFilename.appendSuffix("ps");
+
+ sys::Path dot(LLVM_PATH_DOT);
+
+ std::vector<const char*> args;
+ args.push_back(dot.c_str());
+ args.push_back("-Tps");
+ args.push_back("-Nfontname=Courier");
+ args.push_back("-Gsize=7.5,10");
+ args.push_back(Filename.c_str());
+ args.push_back("-o");
+ args.push_back(PSFilename.c_str());
+ args.push_back(0);
+
+ cerr << "Running 'dot' program... " << std::flush;
+ if (sys::Program::ExecuteAndWait(dot, &args[0],0,0,0,0,&ErrMsg)) {
+ cerr << "Error viewing graph: '" << ErrMsg << "\n";
+ } else {
+ cerr << " done. \n";
+
+ sys::Path gv(LLVM_PATH_GV);
+ args.clear();
+ args.push_back(gv.c_str());
+ args.push_back(PSFilename.c_str());
+ args.push_back("-spartan");
+ args.push_back(0);
+
+ ErrMsg.clear();
+ if (sys::Program::ExecuteAndWait(gv, &args[0],0,0,0,0,&ErrMsg)) {
+ cerr << "Error viewing graph: " << ErrMsg << "\n";
+ }
+ }
+ PSFilename.eraseFromDisk();
+#elif HAVE_DOTTY
+ sys::Path dotty(LLVM_PATH_DOTTY);
+
+ std::vector<const char*> args;
+ args.push_back(dotty.c_str());
+ args.push_back(Filename.c_str());
+ args.push_back(0);
+
+ cerr << "Running 'dotty' program... " << std::flush;
+ if (sys::Program::ExecuteAndWait(dotty, &args[0],0,0,0,0,&ErrMsg)) {
+ cerr << "Error viewing graph: " << ErrMsg << "\n";
+ } else {
+#ifdef __MINGW32__ // Dotty spawns another app and doesn't wait until it returns
+ return;
+#endif
+ }
+#endif
+
+ Filename.eraseFromDisk();
+}
diff --git a/lib/Support/IsInf.cpp b/lib/Support/IsInf.cpp
new file mode 100644
index 0000000..d6da0c9
--- /dev/null
+++ b/lib/Support/IsInf.cpp
@@ -0,0 +1,49 @@
+//===-- IsInf.cpp - Platform-independent wrapper around C99 isinf() -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Platform-independent wrapper around C99 isinf()
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+
+#if HAVE_ISINF_IN_MATH_H
+# include <math.h>
+#elif HAVE_ISINF_IN_CMATH
+# include <cmath>
+#elif HAVE_STD_ISINF_IN_CMATH
+# include <cmath>
+using std::isinf;
+#elif HAVE_FINITE_IN_IEEEFP_H
+// A handy workaround I found at http://www.unixguide.net/sun/faq ...
+// apparently this has been a problem with Solaris for years.
+# include <ieeefp.h>
+static int isinf(double x) { return !finite(x) && x==x; }
+#elif defined(_MSC_VER)
+#include <float.h>
+#define isinf(X) (!_finite(X))
+#elif defined(_AIX) && defined(__GNUC__)
+// GCC's fixincludes seems to be removing the isinf() declaration from the
+// system header /usr/include/math.h
+# include <math.h>
+static int isinf(double x) { return !finite(x) && x==x; }
+#elif defined(__hpux)
+// HP-UX is "special"
+#include <math.h>
+static int isinf(double x) { return ((x) == INFINITY) || ((x) == -INFINITY); }
+#else
+# error "Don't know how to get isinf()"
+#endif
+
+namespace llvm {
+
+int IsInf(float f) { return isinf(f); }
+int IsInf(double d) { return isinf(d); }
+
+} // end namespace llvm
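+// A minimal sanity check for the wrappers above (illustrative; assumes
+// IEEE-754 arithmetic so that 1.0/0.0 produces an infinity):
+//
+//   double Inf = 1.0 / 0.0;
+//   assert(llvm::IsInf(Inf) && !llvm::IsInf(0.0));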
diff --git a/lib/Support/IsNAN.cpp b/lib/Support/IsNAN.cpp
new file mode 100644
index 0000000..bdfdfbf
--- /dev/null
+++ b/lib/Support/IsNAN.cpp
@@ -0,0 +1,33 @@
+//===-- IsNAN.cpp ---------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Platform-independent wrapper around C99 isnan().
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+
+#if HAVE_ISNAN_IN_MATH_H
+# include <math.h>
+#elif HAVE_ISNAN_IN_CMATH
+# include <cmath>
+#elif HAVE_STD_ISNAN_IN_CMATH
+# include <cmath>
+using std::isnan;
+#elif defined(_MSC_VER)
+#include <float.h>
+#define isnan _isnan
+#else
+# error "Don't know how to get isnan()"
+#endif
+
+namespace llvm {
+ int IsNAN(float f) { return isnan(f); }
+ int IsNAN(double d) { return isnan(d); }
+} // end namespace llvm
diff --git a/lib/Support/Makefile b/lib/Support/Makefile
new file mode 100644
index 0000000..48c21f4
--- /dev/null
+++ b/lib/Support/Makefile
@@ -0,0 +1,17 @@
+##===- lib/Support/Makefile --------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMSupport
+BUILD_ARCHIVE = 1
+
+## FIXME: This only requires RTTI because tblgen uses it. Fix that.
+REQUIRES_RTTI = 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp
new file mode 100644
index 0000000..6de6575
--- /dev/null
+++ b/lib/Support/ManagedStatic.cpp
@@ -0,0 +1,91 @@
+//===-- ManagedStatic.cpp - Static Global wrapper -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ManagedStatic class and llvm_shutdown().
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Config/config.h"
+#include "llvm/System/Atomic.h"
+#include "llvm/System/Mutex.h"
+#include <cassert>
+using namespace llvm;
+
+static const ManagedStaticBase *StaticList = 0;
+
+static sys::Mutex* ManagedStaticMutex = 0;
+
+void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(),
+ void (*Deleter)(void*)) const {
+ if (ManagedStaticMutex) {
+ ManagedStaticMutex->acquire();
+
+ if (Ptr == 0) {
+ void* tmp = Creator ? Creator() : 0;
+
+ sys::MemoryFence();
+ Ptr = tmp;
+ DeleterFn = Deleter;
+
+ // Add to list of managed statics.
+ Next = StaticList;
+ StaticList = this;
+ }
+
+ ManagedStaticMutex->release();
+ } else {
+ assert(Ptr == 0 && DeleterFn == 0 && Next == 0 &&
+ "Partially initialized ManagedStatic!?");
+ Ptr = Creator ? Creator() : 0;
+ DeleterFn = Deleter;
+
+ // Add to list of managed statics.
+ Next = StaticList;
+ StaticList = this;
+ }
+}
+
+void ManagedStaticBase::destroy() const {
+ assert(DeleterFn && "ManagedStatic not initialized correctly!");
+ assert(StaticList == this &&
+ "Not destroyed in reverse order of construction?");
+ // Unlink from list.
+ StaticList = Next;
+ Next = 0;
+
+ // Destroy memory.
+ DeleterFn(Ptr);
+
+ // Cleanup.
+ Ptr = 0;
+ DeleterFn = 0;
+}
+
+bool llvm::llvm_start_multithreaded() {
+#if LLVM_MULTITHREADED
+ assert(ManagedStaticMutex == 0 && "Multithreaded LLVM already initialized!");
+ ManagedStaticMutex = new sys::Mutex(true);
+ return true;
+#else
+ return false;
+#endif
+}
+
+/// llvm_shutdown - Deallocate and destroy all ManagedStatic variables.
+void llvm::llvm_shutdown() {
+ while (StaticList)
+ StaticList->destroy();
+
+ if (ManagedStaticMutex) {
+ delete ManagedStaticMutex;
+ ManagedStaticMutex = 0;
+ }
+}
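+// A ManagedStatic is typically declared at file scope and torn down once at
+// process exit (Registry is an illustrative name):
+//
+//   static ManagedStatic<std::vector<std::string> > Registry;
+//   ...
+//   Registry->push_back("entry");  // first use runs the Creator lazily
+//   ...
+//   llvm_shutdown();               // destroys all ManagedStatics, LIFO order
+//
+// (See lib/Support/PluginLoader.cpp for a real client of this pattern.)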
+
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
new file mode 100644
index 0000000..e35c626
--- /dev/null
+++ b/lib/Support/MemoryBuffer.cpp
@@ -0,0 +1,279 @@
+//===--- MemoryBuffer.cpp - Memory Buffer implementation ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MemoryBuffer interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/System/Path.h"
+#include "llvm/System/Process.h"
+#include "llvm/System/Program.h"
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <cerrno>
+#include <sys/types.h>
+#include <sys/stat.h>
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#include <sys/uio.h>
+#else
+#include <io.h>
+#endif
+#include <fcntl.h>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MemoryBuffer implementation itself.
+//===----------------------------------------------------------------------===//
+
+MemoryBuffer::~MemoryBuffer() {
+ if (MustDeleteBuffer)
+ free((void*)BufferStart);
+}
+
+/// initCopyOf - Initialize this source buffer with a copy of the specified
+/// memory range. We make the copy so that we can null terminate it
+/// successfully.
+void MemoryBuffer::initCopyOf(const char *BufStart, const char *BufEnd) {
+ size_t Size = BufEnd-BufStart;
+ BufferStart = (char *)malloc((Size+1) * sizeof(char));
+ BufferEnd = BufferStart+Size;
+ memcpy(const_cast<char*>(BufferStart), BufStart, Size);
+ *const_cast<char*>(BufferEnd) = 0; // Null terminate buffer.
+ MustDeleteBuffer = true;
+}
+
+/// init - Initialize this MemoryBuffer as a reference to externally allocated
+/// memory, memory that we know is already null terminated.
+void MemoryBuffer::init(const char *BufStart, const char *BufEnd) {
+ assert(BufEnd[0] == 0 && "Buffer is not null terminated!");
+ BufferStart = BufStart;
+ BufferEnd = BufEnd;
+ MustDeleteBuffer = false;
+}
+
+//===----------------------------------------------------------------------===//
+// MemoryBufferMem implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class MemoryBufferMem : public MemoryBuffer {
+ std::string FileID;
+public:
+ MemoryBufferMem(const char *Start, const char *End, const char *FID,
+ bool Copy = false)
+ : FileID(FID) {
+ if (!Copy)
+ init(Start, End);
+ else
+ initCopyOf(Start, End);
+ }
+
+ virtual const char *getBufferIdentifier() const {
+ return FileID.c_str();
+ }
+};
+}
+
+/// getMemBuffer - Open the specified memory range as a MemoryBuffer. Note
+/// that EndPtr[0] must be a null byte and be accessible!
+MemoryBuffer *MemoryBuffer::getMemBuffer(const char *StartPtr,
+ const char *EndPtr,
+ const char *BufferName) {
+ return new MemoryBufferMem(StartPtr, EndPtr, BufferName);
+}
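+// Example: wrapping externally owned, already null-terminated memory. The
+// bytes are referenced, not copied (Text and the buffer name are
+// illustrative):
+//
+//   static const char Text[] = "hello";
+//   MemoryBuffer *MB = MemoryBuffer::getMemBuffer(Text, Text+5, "<test>");
+//   // Text[5] is the literal's NUL, satisfying the EndPtr[0] requirement.
+//   delete MB;  // does not free Text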
+
+/// getMemBufferCopy - Open the specified memory range as a MemoryBuffer,
+/// copying the contents and taking ownership of it. This has no requirements
+/// on EndPtr[0].
+MemoryBuffer *MemoryBuffer::getMemBufferCopy(const char *StartPtr,
+ const char *EndPtr,
+ const char *BufferName) {
+ return new MemoryBufferMem(StartPtr, EndPtr, BufferName, true);
+}
+
+/// getNewUninitMemBuffer - Allocate a new MemoryBuffer of the specified size
+/// whose contents are left uninitialized (only the terminating null byte is
+/// written). The caller is expected to fill in the memory; it is owned by
+/// the MemoryBuffer object.
+MemoryBuffer *MemoryBuffer::getNewUninitMemBuffer(size_t Size,
+ const char *BufferName) {
+ char *Buf = (char *)malloc((Size+1) * sizeof(char));
+ if (!Buf) return 0;
+ Buf[Size] = 0;
+ MemoryBufferMem *SB = new MemoryBufferMem(Buf, Buf+Size, BufferName);
+ // The memory for this buffer is owned by the MemoryBuffer.
+ SB->MustDeleteBuffer = true;
+ return SB;
+}
+
+/// getNewMemBuffer - Allocate a new MemoryBuffer of the specified size that
+/// is completely initialized to zeros. Note that the caller should
+/// initialize the memory allocated by this method. The memory is owned by
+/// the MemoryBuffer object.
+MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size,
+ const char *BufferName) {
+ MemoryBuffer *SB = getNewUninitMemBuffer(Size, BufferName);
+ if (!SB) return 0;
+ memset(const_cast<char*>(SB->getBufferStart()), 0, Size+1);
+ return SB;
+}
+
+
+/// getFileOrSTDIN - Open the specified file as a MemoryBuffer, or open stdin
+/// if the Filename is "-". If an error occurs, this returns null and fills
+/// in *ErrStr with a reason. If stdin is empty, this API (unlike getSTDIN)
+/// returns an empty buffer.
+MemoryBuffer *MemoryBuffer::getFileOrSTDIN(const char *Filename,
+ std::string *ErrStr,
+ int64_t FileSize) {
+ if (Filename[0] != '-' || Filename[1] != 0)
+ return getFile(Filename, ErrStr, FileSize);
+ MemoryBuffer *M = getSTDIN();
+ if (M) return M;
+
+ // If stdin was empty, M is null. Cons up an empty memory buffer now.
+ const char *EmptyStr = "";
+ return MemoryBuffer::getMemBuffer(EmptyStr, EmptyStr, "<stdin>");
+}
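+// A typical caller funnels both regular files and "-" through this entry
+// point (InputFilename and ParseInput are illustrative):
+//
+//   std::string Err;
+//   MemoryBuffer *MB = MemoryBuffer::getFileOrSTDIN(InputFilename, &Err);
+//   if (!MB) {
+//     cerr << "error: " << Err << "\n";
+//     return 1;
+//   }
+//   ParseInput(MB->getBufferStart(), MB->getBufferEnd());
+//   delete MB;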
+
+//===----------------------------------------------------------------------===//
+// MemoryBuffer::getFile implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// MemoryBufferMMapFile - This represents a file that was mapped in with the
+/// sys::Path::MapInFilePages method. When destroyed, it calls the
+/// sys::Path::UnMapFilePages method.
+class MemoryBufferMMapFile : public MemoryBuffer {
+ std::string Filename;
+public:
+ MemoryBufferMMapFile(const char *filename, const char *Pages, uint64_t Size)
+ : Filename(filename) {
+ init(Pages, Pages+Size);
+ }
+
+ virtual const char *getBufferIdentifier() const {
+ return Filename.c_str();
+ }
+
+ ~MemoryBufferMMapFile() {
+ sys::Path::UnMapFilePages(getBufferStart(), getBufferSize());
+ }
+};
+}
+
+MemoryBuffer *MemoryBuffer::getFile(const char *Filename, std::string *ErrStr,
+ int64_t FileSize) {
+ int OpenFlags = 0;
+#ifdef O_BINARY
+ OpenFlags |= O_BINARY; // Open input file in binary mode on win32.
+#endif
+ int FD = ::open(Filename, O_RDONLY|OpenFlags);
+ if (FD == -1) {
+ if (ErrStr) *ErrStr = "could not open file";
+ return 0;
+ }
+
+ // If we don't know the file size, use fstat to find out. fstat on an open
+ // file descriptor is cheaper than stat on a random path.
+ if (FileSize == -1) {
+ struct stat FileInfo;
+ // TODO: This should use fstat64 when available.
+ if (fstat(FD, &FileInfo) == -1) {
+ if (ErrStr) *ErrStr = "could not get file length";
+ ::close(FD);
+ return 0;
+ }
+ FileSize = FileInfo.st_size;
+ }
+
+
+ // If the file is large, try to use mmap to read it in. We don't use mmap
+ // for small files, because this can severely fragment our address space. Also
+ // don't try to map files that are exactly a multiple of the system page size,
+ // as the file would not have the required null terminator.
+ if (FileSize >= 4096*4 &&
+ (FileSize & (sys::Process::GetPageSize()-1)) != 0) {
+ if (const char *Pages = sys::Path::MapInFilePages(FD, FileSize)) {
+ // Close the file descriptor, now that the whole file is in memory.
+ ::close(FD);
+ return new MemoryBufferMMapFile(Filename, Pages, FileSize);
+ }
+ }
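+ // Worked example of the check above (assuming 4096-byte pages): a
+ // 100000-byte file gives 100000 & 4095 == 1696 != 0, so the byte just past
+ // the mapping falls on a mapped page and can hold the null terminator; a
+ // 131072-byte file is an exact multiple of the page size, so it falls
+ // through to the read path below, which allocates FileSize+1 bytes.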
+
+ MemoryBuffer *Buf = MemoryBuffer::getNewUninitMemBuffer(FileSize, Filename);
+ if (!Buf) {
+ // Failed to create a buffer.
+ if (ErrStr) *ErrStr = "could not allocate buffer";
+ ::close(FD);
+ return 0;
+ }
+
+ OwningPtr<MemoryBuffer> SB(Buf);
+ char *BufPtr = const_cast<char*>(SB->getBufferStart());
+
+ size_t BytesLeft = FileSize;
+ while (BytesLeft) {
+ ssize_t NumRead = ::read(FD, BufPtr, BytesLeft);
+ if (NumRead != -1) {
+ BytesLeft -= NumRead;
+ BufPtr += NumRead;
+ } else if (errno == EINTR) {
+ // try again
+ } else {
+ // error reading.
+ close(FD);
+ if (ErrStr) *ErrStr = "error reading file data";
+ return 0;
+ }
+ }
+ close(FD);
+
+ return SB.take();
+}
+
+//===----------------------------------------------------------------------===//
+// MemoryBuffer::getSTDIN implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class STDINBufferFile : public MemoryBuffer {
+public:
+ virtual const char *getBufferIdentifier() const {
+ return "<stdin>";
+ }
+};
+}
+
+MemoryBuffer *MemoryBuffer::getSTDIN() {
+ char Buffer[4096*4];
+
+ std::vector<char> FileData;
+
+ // Read in all of the data from stdin; we cannot mmap stdin.
+ sys::Program::ChangeStdinToBinary();
+ size_t ReadBytes;
+ do {
+ ReadBytes = fread(Buffer, sizeof(char), sizeof(Buffer), stdin);
+ FileData.insert(FileData.end(), Buffer, Buffer+ReadBytes);
+ } while (ReadBytes == sizeof(Buffer));
+
+ FileData.push_back(0); // &FileData[Size] is invalid. So is &*FileData.end().
+ size_t Size = FileData.size();
+ if (Size <= 1)
+ return 0;
+ MemoryBuffer *B = new STDINBufferFile();
+ B->initCopyOf(&FileData[0], &FileData[Size-1]);
+ return B;
+}
diff --git a/lib/Support/PluginLoader.cpp b/lib/Support/PluginLoader.cpp
new file mode 100644
index 0000000..5acf1d1
--- /dev/null
+++ b/lib/Support/PluginLoader.cpp
@@ -0,0 +1,43 @@
+//===-- PluginLoader.cpp - Implement -load command line option ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the -load <plugin> command line option handler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DONT_GET_PLUGIN_LOADER_OPTION
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/PluginLoader.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/System/DynamicLibrary.h"
+#include <ostream>
+#include <vector>
+using namespace llvm;
+
+static ManagedStatic<std::vector<std::string> > Plugins;
+
+void PluginLoader::operator=(const std::string &Filename) {
+ std::string Error;
+ if (sys::DynamicLibrary::LoadLibraryPermanently(Filename.c_str(), &Error)) {
+ cerr << "Error opening '" << Filename << "': " << Error
+ << "\n -load request ignored.\n";
+ } else {
+ Plugins->push_back(Filename);
+ }
+}
+
+unsigned PluginLoader::getNumPlugins() {
+ return Plugins.isConstructed() ? Plugins->size() : 0;
+}
+
+std::string &PluginLoader::getPlugin(unsigned num) {
+ assert(Plugins.isConstructed() && num < Plugins->size() &&
+ "Asking for an out of bounds plugin");
+ return (*Plugins)[num];
+}
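+// The cl::opt machinery assigns each -load argument to a PluginLoader
+// instance via the operator= above, so a tool invocation like the
+// illustrative
+//
+//   opt -load ./MyPass.so -mypass input.bc
+//
+// permanently dlopens the shared object and records its name for later
+// retrieval through getNumPlugins() and getPlugin().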
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
new file mode 100644
index 0000000..c111c5e
--- /dev/null
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -0,0 +1,108 @@
+//===- PrettyStackTrace.cpp - Pretty Crash Handling -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// Unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Signals.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+// FIXME: This should be thread local when llvm supports threads.
+static const PrettyStackTraceEntry *PrettyStackTraceHead = 0;
+
+static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
+ unsigned NextID = 0;
+ if (Entry->getNextEntry())
+ NextID = PrintStack(Entry->getNextEntry(), OS);
+ OS << NextID << ".\t";
+ Entry->print(OS);
+
+ return NextID+1;
+}
+
+/// PrintCurStackTrace - Print the current stack trace to the specified stream.
+static void PrintCurStackTrace(raw_ostream &OS) {
+ // Don't print an empty trace.
+ if (PrettyStackTraceHead == 0) return;
+
+ // If there are pretty stack frames registered, walk and emit them.
+ OS << "Stack dump:\n";
+
+ PrintStack(PrettyStackTraceHead, OS);
+ OS.flush();
+}
+
+// Integrate with crash reporter.
+#ifdef __APPLE__
+extern "C" const char *__crashreporter_info__;
+const char *__crashreporter_info__ = 0;
+#endif
+
+
+/// CrashHandler - This callback is run if a fatal signal is delivered to the
+/// process, it prints the pretty stack trace.
+static void CrashHandler(void *Cookie) {
+#ifndef __APPLE__
+ // On non-apple systems, just emit the crash stack trace to stderr.
+ PrintCurStackTrace(errs());
+#else
+ // Otherwise, emit to a smallvector of chars, send *that* to stderr, but also
+ // put it into __crashreporter_info__.
+ SmallString<2048> TmpStr;
+ {
+ raw_svector_ostream Stream(TmpStr);
+ PrintCurStackTrace(Stream);
+ }
+
+ if (!TmpStr.empty()) {
+ __crashreporter_info__ = strdup(TmpStr.c_str());
+ errs() << __crashreporter_info__;
+ }
+
+#endif
+}
+
+static bool RegisterCrashPrinter() {
+ sys::AddSignalHandler(CrashHandler, 0);
+ return false;
+}
+
+PrettyStackTraceEntry::PrettyStackTraceEntry() {
+ // The first time this is called, we register the crash printer.
+ static bool HandlerRegistered = RegisterCrashPrinter();
+ (void)HandlerRegistered; // Referenced only to silence unused-variable warnings.
+
+ // Link ourselves.
+ NextEntry = PrettyStackTraceHead;
+ PrettyStackTraceHead = this;
+}
+
+PrettyStackTraceEntry::~PrettyStackTraceEntry() {
+ assert(PrettyStackTraceHead == this &&
+ "Pretty stack trace entry destruction is out of order");
+ PrettyStackTraceHead = getNextEntry();
+}
+
+void PrettyStackTraceString::print(raw_ostream &OS) const {
+ OS << Str << "\n";
+}
+
+void PrettyStackTraceProgram::print(raw_ostream &OS) const {
+ OS << "Program arguments: ";
+ // Print the argument list.
+ for (unsigned i = 0, e = ArgC; i != e; ++i)
+ OS << ArgV[i] << ' ';
+ OS << '\n';
+}
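+// Entries are stacked RAII-style; a tool usually installs one for the whole
+// program and more around interesting scopes (names illustrative):
+//
+//   int main(int argc, char **argv) {
+//     llvm::PrettyStackTraceProgram X(argc, argv);
+//     ...
+//     llvm::PrettyStackTraceString Phase("parsing the input file");
+//     ...
+//   }
+//
+// On a fatal signal, CrashHandler prints "Stack dump:" followed by one
+// numbered line per entry still alive, outermost first.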
+
diff --git a/lib/Support/SlowOperationInformer.cpp b/lib/Support/SlowOperationInformer.cpp
new file mode 100644
index 0000000..d5ffff9
--- /dev/null
+++ b/lib/Support/SlowOperationInformer.cpp
@@ -0,0 +1,66 @@
+//===-- SlowOperationInformer.cpp - Keep the user informed ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SlowOperationInformer class for the LLVM debugger.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SlowOperationInformer.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/System/Alarm.h"
+#include <sstream>
+#include <cassert>
+using namespace llvm;
+
+SlowOperationInformer::SlowOperationInformer(const std::string &Name)
+ : OperationName(Name), LastPrintAmount(0) {
+ sys::SetupAlarm(1);
+}
+
+SlowOperationInformer::~SlowOperationInformer() {
+ sys::TerminateAlarm();
+ if (LastPrintAmount) {
+ // If we have printed something, make _sure_ we print the 100% amount, and
+ // also print a newline.
+ cout << std::string(LastPrintAmount, '\b') << "Progress "
+ << OperationName << ": 100% \n";
+ }
+}
+
+/// progress - Clients should periodically call this method when they are in
+/// an exception-safe state. The Amount variable should indicate how far
+/// along the operation is, given in 1/10ths of a percent (in other words,
+/// Amount should range from 0 to 1000).
+bool SlowOperationInformer::progress(unsigned Amount) {
+ int status = sys::AlarmStatus();
+ if (status == -1) {
+ cout << "\n";
+ LastPrintAmount = 0;
+ return true;
+ }
+
+ // If we haven't spent enough time in this operation to warrant displaying the
+ // progress bar, don't do so yet.
+ if (status == 0)
+ return false;
+
+ // Delete whatever we printed last time.
+ std::string ToPrint = std::string(LastPrintAmount, '\b');
+
+ std::ostringstream OS;
+ OS << "Progress " << OperationName << ": " << Amount/10;
+ if (unsigned Rem = Amount % 10)
+ OS << "." << Rem << "%";
+ else
+ OS << "% ";
+
+ LastPrintAmount = OS.str().size();
+ cout << ToPrint+OS.str() << std::flush;
+ return false;
+}
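+// A client performing N steps reports progress in tenths of a percent and
+// aborts if the user interrupts (NumItems and the loop body are
+// illustrative):
+//
+//   SlowOperationInformer Informer("bitcode loading");
+//   for (unsigned i = 0; i != NumItems; ++i) {
+//     if (Informer.progress(i * 1000 / NumItems))
+//       return true;  // interrupted; caller unwinds the operation
+//     ... do one unit of work ...
+//   }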
diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp
new file mode 100644
index 0000000..68938fa
--- /dev/null
+++ b/lib/Support/SmallPtrSet.cpp
@@ -0,0 +1,223 @@
+//===- llvm/ADT/SmallPtrSet.cpp - 'Normally small' pointer set ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SmallPtrSet class. See SmallPtrSet.h for an
+// overview of the algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/MathExtras.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+void SmallPtrSetImpl::shrink_and_clear() {
+ assert(!isSmall() && "Can't shrink a small set!");
+ free(CurArray);
+
+ // Reduce the number of buckets.
+ CurArraySize = NumElements > 16 ? 1 << (Log2_32_Ceil(NumElements) + 1) : 32;
+ NumElements = NumTombstones = 0;
+
+ // Install the new array. Clear all the buckets to empty.
+ CurArray = (const void**)malloc(sizeof(void*) * (CurArraySize+1));
+ assert(CurArray && "Failed to allocate memory?");
+ memset(CurArray, -1, CurArraySize*sizeof(void*));
+
+ // The end pointer, always valid, is set to a valid element to help the
+ // iterator.
+ CurArray[CurArraySize] = 0;
+}
+
+bool SmallPtrSetImpl::insert_imp(const void * Ptr) {
+ if (isSmall()) {
+ // Check to see if it is already in the set.
+ for (const void **APtr = SmallArray, **E = SmallArray+NumElements;
+ APtr != E; ++APtr)
+ if (*APtr == Ptr)
+ return false;
+
+ // Nope, there isn't. If we stay small, just 'pushback' now.
+ if (NumElements < CurArraySize-1) {
+ SmallArray[NumElements++] = Ptr;
+ return true;
+ }
+ // Otherwise, hit the big set case, which will call grow.
+ }
+
+ // Grow if more than 3/4 of the buckets are in use, or if fewer than 1/8
+ // would remain free after accounting for tombstones.
+ if (NumElements*4 >= CurArraySize*3 ||
+ CurArraySize-(NumElements+NumTombstones) < CurArraySize/8)
+ Grow();
+
+ // Okay, we know we have space. Find a hash bucket.
+ const void **Bucket = const_cast<const void**>(FindBucketFor(Ptr));
+ if (*Bucket == Ptr) return false; // Already inserted, good.
+
+ // Otherwise, insert it!
+ if (*Bucket == getTombstoneMarker())
+ --NumTombstones;
+ *Bucket = Ptr;
+ ++NumElements; // Track density.
+ return true;
+}
+
+bool SmallPtrSetImpl::erase_imp(const void * Ptr) {
+ if (isSmall()) {
+ // Check to see if it is in the set.
+ for (const void **APtr = SmallArray, **E = SmallArray+NumElements;
+ APtr != E; ++APtr)
+ if (*APtr == Ptr) {
+ // If it is in the set, replace this element.
+ *APtr = E[-1];
+ E[-1] = getEmptyMarker();
+ --NumElements;
+ return true;
+ }
+
+ return false;
+ }
+
+ // Okay, we know we have space. Find a hash bucket.
+ void **Bucket = const_cast<void**>(FindBucketFor(Ptr));
+ if (*Bucket != Ptr) return false; // Not in the set?
+
+ // Set this as a tombstone.
+ *Bucket = getTombstoneMarker();
+ --NumElements;
+ ++NumTombstones;
+ return true;
+}
+
+const void * const *SmallPtrSetImpl::FindBucketFor(const void *Ptr) const {
+ unsigned Bucket = Hash(Ptr);
+ unsigned ArraySize = CurArraySize;
+ unsigned ProbeAmt = 1;
+ const void *const *Array = CurArray;
+ const void *const *Tombstone = 0;
+ while (1) {
+ // Found Ptr's bucket?
+ if (Array[Bucket] == Ptr)
+ return Array+Bucket;
+
+ // If we found an empty bucket, the pointer doesn't exist in the set.
+ // Return a tombstone if we've seen one so far, or the empty bucket if
+ // not.
+ if (Array[Bucket] == getEmptyMarker())
+ return Tombstone ? Tombstone : Array+Bucket;
+
+ // If this is a tombstone, remember it. If Ptr ends up not in the set, we
+ // prefer to return the tombstone rather than an empty bucket that would
+ // take more probing to reach.
+ if (Array[Bucket] == getTombstoneMarker() && !Tombstone)
+ Tombstone = Array+Bucket; // Remember the first tombstone found.
+
+ // It's a hash collision or a tombstone. Reprobe.
+ Bucket = (Bucket + ProbeAmt++) & (ArraySize-1);
+ }
+}
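+// Since ProbeAmt grows by one after every collision, the offsets from the
+// home bucket are the triangular numbers 1, 3, 6, 10, ... For a power-of-two
+// ArraySize this sequence visits every bucket before repeating, so the loop
+// above terminates whenever an empty or matching slot exists.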
+
+/// Grow - Allocate a larger backing store for the buckets and move it over.
+///
+void SmallPtrSetImpl::Grow() {
+ // Allocate at twice as many buckets, but at least 128.
+ unsigned OldSize = CurArraySize;
+ unsigned NewSize = OldSize < 64 ? 128 : OldSize*2;
+
+ const void **OldBuckets = CurArray;
+ bool WasSmall = isSmall();
+
+ // Install the new array. Clear all the buckets to empty.
+ CurArray = (const void**)malloc(sizeof(void*) * (NewSize+1));
+ assert(CurArray && "Failed to allocate memory?");
+ CurArraySize = NewSize;
+ memset(CurArray, -1, NewSize*sizeof(void*));
+
+ // The end pointer, always valid, is set to a valid element to help the
+ // iterator.
+ CurArray[NewSize] = 0;
+
+ // Copy over all the elements.
+ if (WasSmall) {
+ // Small sets store their elements in order.
+ for (const void **BucketPtr = OldBuckets, **E = OldBuckets+NumElements;
+ BucketPtr != E; ++BucketPtr) {
+ const void *Elt = *BucketPtr;
+ *const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt);
+ }
+ } else {
+ // Copy over all valid entries.
+ for (const void **BucketPtr = OldBuckets, **E = OldBuckets+OldSize;
+ BucketPtr != E; ++BucketPtr) {
+ // Copy over the element if it is valid.
+ const void *Elt = *BucketPtr;
+ if (Elt != getTombstoneMarker() && Elt != getEmptyMarker())
+ *const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt);
+ }
+
+ free(OldBuckets);
+ NumTombstones = 0;
+ }
+}
+
+SmallPtrSetImpl::SmallPtrSetImpl(const SmallPtrSetImpl& that) {
+ // If we're becoming small, prepare to insert into our stack space
+ if (that.isSmall()) {
+ CurArray = &SmallArray[0];
+ // Otherwise, allocate new heap space to hold the copy
+ } else {
+ CurArray = (const void**)malloc(sizeof(void*) * (that.CurArraySize+1));
+ assert(CurArray && "Failed to allocate memory?");
+ }
+
+ // Copy over the new array size
+ CurArraySize = that.CurArraySize;
+
+ // Copy over the contents from the other set
+ memcpy(CurArray, that.CurArray, sizeof(void*)*(CurArraySize+1));
+
+ NumElements = that.NumElements;
+ NumTombstones = that.NumTombstones;
+}
+
+/// CopyFrom - implement operator= from a smallptrset that has the same pointer
+/// type, but may have a different small size.
+void SmallPtrSetImpl::CopyFrom(const SmallPtrSetImpl &RHS) {
+ if (isSmall() && RHS.isSmall())
+ assert(CurArraySize == RHS.CurArraySize &&
+ "Cannot assign sets with different small sizes");
+
+ // If we're becoming small, prepare to insert into our stack space
+ if (RHS.isSmall()) {
+ if (!isSmall())
+ free(CurArray);
+ CurArray = &SmallArray[0];
+ // Otherwise, allocate new heap space (unless we were the same size)
+ } else if (CurArraySize != RHS.CurArraySize) {
+ if (isSmall())
+ CurArray = (const void**)malloc(sizeof(void*) * (RHS.CurArraySize+1));
+ else
+ CurArray = (const void**)realloc(CurArray, sizeof(void*)*(RHS.CurArraySize+1));
+ assert(CurArray && "Failed to allocate memory?");
+ }
+
+ // Copy over the new array size
+ CurArraySize = RHS.CurArraySize;
+
+ // Copy over the contents from the other set
+ memcpy(CurArray, RHS.CurArray, sizeof(void*)*(CurArraySize+1));
+
+ NumElements = RHS.NumElements;
+ NumTombstones = RHS.NumTombstones;
+}
+
+SmallPtrSetImpl::~SmallPtrSetImpl() {
+ if (!isSmall())
+ free(CurArray);
+}
diff --git a/lib/Support/Statistic.cpp b/lib/Support/Statistic.cpp
new file mode 100644
index 0000000..13acc1b
--- /dev/null
+++ b/lib/Support/Statistic.cpp
@@ -0,0 +1,126 @@
+//===-- Statistic.cpp - Easy way to expose stats information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the 'Statistic' class, which is designed to be an easy
+// way to expose various success metrics from passes. These statistics are
+// printed at the end of a run, when the -stats command line option is enabled
+// on the command line.
+//
+// This is useful for reporting information like the number of instructions
+// simplified, optimized or removed by various transformations, like this:
+//
+// static Statistic NumInstEliminated("GCSE", "Number of instructions killed");
+//
+// Later, in the code: ++NumInstEliminated;
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <ostream>
+#include <cstring>
+using namespace llvm;
+
+// GetLibSupportInfoOutputFile - Return a file stream to print our output on.
+namespace llvm { extern std::ostream *GetLibSupportInfoOutputFile(); }
+
+/// -stats - Command line option to cause transformations to emit stats about
+/// what they did.
+///
+static cl::opt<bool>
+Enabled("stats", cl::desc("Enable statistics output from program"));
+
+
+namespace {
+/// StatisticInfo - This class is used in a ManagedStatic so that it is created
+/// on demand (when the first statistic is bumped) and destroyed only when
+/// llvm_shutdown is called. We print statistics from the destructor.
+class StatisticInfo {
+ std::vector<const Statistic*> Stats;
+public:
+ ~StatisticInfo();
+
+ void addStatistic(const Statistic *S) {
+ Stats.push_back(S);
+ }
+};
+}
+
+static ManagedStatic<StatisticInfo> StatInfo;
+
+
+/// RegisterStatistic - The first time a statistic is bumped, this method is
+/// called.
+void Statistic::RegisterStatistic() {
+ // If stats are enabled, inform StatInfo that this statistic should be
+ // printed.
+ if (Enabled)
+ StatInfo->addStatistic(this);
+ // Remember we have been registered.
+ Initialized = true;
+}
+
+namespace {
+
+struct NameCompare {
+ bool operator()(const Statistic *LHS, const Statistic *RHS) const {
+ int Cmp = std::strcmp(LHS->getName(), RHS->getName());
+ if (Cmp != 0) return Cmp < 0;
+
+ // Secondary key is the description.
+ return std::strcmp(LHS->getDesc(), RHS->getDesc()) < 0;
+ }
+};
+
+}
+
+// Print information when destroyed, iff command line option is specified.
+StatisticInfo::~StatisticInfo() {
+ // Statistics not enabled?
+ if (Stats.empty()) return;
+
+ // Get the stream to write to.
+ std::ostream &OutStream = *GetLibSupportInfoOutputFile();
+
+ // Figure out how long the biggest Value and Name fields are.
+ unsigned MaxNameLen = 0, MaxValLen = 0;
+ for (size_t i = 0, e = Stats.size(); i != e; ++i) {
+ MaxValLen = std::max(MaxValLen,
+ (unsigned)utostr(Stats[i]->getValue()).size());
+ MaxNameLen = std::max(MaxNameLen,
+ (unsigned)std::strlen(Stats[i]->getName()));
+ }
+
+ // Sort the fields by name.
+ std::stable_sort(Stats.begin(), Stats.end(), NameCompare());
+
+ // Print out the statistics header...
+ OutStream << "===" << std::string(73, '-') << "===\n"
+ << " ... Statistics Collected ...\n"
+ << "===" << std::string(73, '-') << "===\n\n";
+
+ // Print all of the statistics.
+ for (size_t i = 0, e = Stats.size(); i != e; ++i) {
+ std::string CountStr = utostr(Stats[i]->getValue());
+ OutStream << std::string(MaxValLen-CountStr.size(), ' ')
+ << CountStr << " " << Stats[i]->getName()
+ << std::string(MaxNameLen-std::strlen(Stats[i]->getName()), ' ')
+ << " - " << Stats[i]->getDesc() << "\n";
+
+ }
+
+ OutStream << std::endl; // Flush the output stream...
+
+ if (&OutStream != cerr.stream() && &OutStream != cout.stream())
+ delete &OutStream; // Close the file.
+}
diff --git a/lib/Support/Streams.cpp b/lib/Support/Streams.cpp
new file mode 100644
index 0000000..cf6cfeb
--- /dev/null
+++ b/lib/Support/Streams.cpp
@@ -0,0 +1,30 @@
+//===-- Streams.cpp - Wrappers for iostreams ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a wrapper for the std::cout and std::cerr I/O streams.
+// It prevents the need to include <iostream> to each file just to get I/O.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Streams.h"
+#include <iostream>
+using namespace llvm;
+
+OStream llvm::cout(std::cout);
+OStream llvm::cerr(std::cerr);
+IStream llvm::cin(std::cin);
+
+namespace llvm {
+
+/// FlushStream - Function called by BaseStream to flush an ostream.
+void FlushStream(std::ostream &S) {
+ S << std::flush;
+}
+
+} // end namespace llvm
diff --git a/lib/Support/StringExtras.cpp b/lib/Support/StringExtras.cpp
new file mode 100644
index 0000000..1618086
--- /dev/null
+++ b/lib/Support/StringExtras.cpp
@@ -0,0 +1,114 @@
+//===-- StringExtras.cpp - Implement the StringExtras header --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StringExtras.h header
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringExtras.h"
+#include <cstring>
+using namespace llvm;
+
+/// getToken - This function extracts one token from source, ignoring any
+/// leading characters that appear in the Delimiters string, and ending the
+/// token at any of the characters that appear in the Delimiters string. If
+/// there are no tokens in the source string, an empty string is returned.
+/// The Source source string is updated in place to remove the returned string
+/// and any delimiter prefix from it.
+std::string llvm::getToken(std::string &Source, const char *Delimiters) {
+ size_t NumDelimiters = std::strlen(Delimiters);
+
+ // Figure out where the token starts.
+ std::string::size_type Start =
+ Source.find_first_not_of(Delimiters, 0, NumDelimiters);
+ if (Start == std::string::npos) Start = Source.size();
+
+ // Find the next occurrence of the delimiter.
+ std::string::size_type End =
+ Source.find_first_of(Delimiters, Start, NumDelimiters);
+ if (End == std::string::npos) End = Source.size();
+
+ // Create the return token.
+ std::string Result = std::string(Source.begin()+Start, Source.begin()+End);
+
+ // Erase the token that we read in.
+ Source.erase(Source.begin(), Source.begin()+End);
+
+ return Result;
+}
+
+/// SplitString - Split up the specified string according to the specified
+/// delimiters, appending the result fragments to the output list.
+void llvm::SplitString(const std::string &Source,
+ std::vector<std::string> &OutFragments,
+ const char *Delimiters) {
+ std::string S = Source;
+
+ std::string S2 = getToken(S, Delimiters);
+ while (!S2.empty()) {
+ OutFragments.push_back(S2);
+ S2 = getToken(S, Delimiters);
+ }
+}
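+// For example (illustrative values):
+//
+//   std::string S = "  foo, bar,baz";
+//   std::vector<std::string> Parts;
+//   SplitString(S, Parts, " ,");
+//   // Parts now holds "foo", "bar", "baz".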
+
+
+
+/// UnescapeString - Modify the argument string, turning two-character escape
+/// sequences into the characters they denote. This handles:
+/// @verbatim
+/// \a \b \e \f \n \r \t \v \' \" and \\
+/// @endverbatim
+/// Any other character following a backslash is left untouched.
+void llvm::UnescapeString(std::string &Str) {
+ for (unsigned i = 0; i != Str.size(); ++i) {
+ if (Str[i] == '\\' && i != Str.size()-1) {
+ switch (Str[i+1]) {
+ default: continue; // Don't execute the code after the switch.
+ case 'a': Str[i] = '\a'; break;
+ case 'b': Str[i] = '\b'; break;
+ case 'e': Str[i] = 27; break;
+ case 'f': Str[i] = '\f'; break;
+ case 'n': Str[i] = '\n'; break;
+ case 'r': Str[i] = '\r'; break;
+ case 't': Str[i] = '\t'; break;
+ case 'v': Str[i] = '\v'; break;
+ case '"': Str[i] = '\"'; break;
+ case '\'': Str[i] = '\''; break;
+ case '\\': Str[i] = '\\'; break;
+ }
+ // Nuke the second character.
+ Str.erase(Str.begin()+i+1);
+ }
+ }
+}
+
+/// EscapeString - Modify the argument string, escaping '\\', '\t', '"', '\n',
+/// and any character that does not satisfy std::isprint (the latter as a
+/// 3-digit octal escape).
+void llvm::EscapeString(std::string &Str) {
+ for (unsigned i = 0; i != Str.size(); ++i) {
+ if (Str[i] == '\\') {
+ ++i;
+ Str.insert(Str.begin()+i, '\\');
+ } else if (Str[i] == '\t') {
+ Str[i++] = '\\';
+ Str.insert(Str.begin()+i, 't');
+ } else if (Str[i] == '"') {
+ Str.insert(Str.begin()+i++, '\\');
+ } else if (Str[i] == '\n') {
+ Str[i++] = '\\';
+ Str.insert(Str.begin()+i, 'n');
+ } else if (!std::isprint(static_cast<unsigned char>(Str[i]))) {
+ // Always expand to a 3-digit octal escape.
+ unsigned Char = Str[i];
+ Str[i++] = '\\';
+ Str.insert(Str.begin()+i++, '0'+((Char/64) & 7));
+ Str.insert(Str.begin()+i++, '0'+((Char/8) & 7));
+ Str.insert(Str.begin()+i , '0'+( Char & 7));
+ }
+ }
+}
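+// The two routines invert each other for the escapes both understand
+// (illustrative):
+//
+//   std::string S = "a\tb\n";
+//   EscapeString(S);    // S == "a\\tb\\n"
+//   UnescapeString(S);  // S == "a\tb\n" again
+//
+// Note the asymmetry for non-printable bytes: EscapeString emits 3-digit
+// octal escapes, which UnescapeString above does not decode.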
diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp
new file mode 100644
index 0000000..0c61732
--- /dev/null
+++ b/lib/Support/StringMap.cpp
@@ -0,0 +1,234 @@
+//===--- StringMap.cpp - String Hash table map implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StringMap class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringMap.h"
+#include <cassert>
+using namespace llvm;
+
+StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) {
+ ItemSize = itemSize;
+
+ // If a size is specified, initialize the table with that many buckets.
+ if (InitSize) {
+ init(InitSize);
+ return;
+ }
+
+ // Otherwise, initialize it with zero buckets to avoid the allocation.
+ TheTable = 0;
+ NumBuckets = 0;
+ NumItems = 0;
+ NumTombstones = 0;
+}
+
+void StringMapImpl::init(unsigned InitSize) {
+ assert((InitSize & (InitSize-1)) == 0 &&
+ "Init Size must be a power of 2 or zero!");
+ NumBuckets = InitSize ? InitSize : 16;
+ NumItems = 0;
+ NumTombstones = 0;
+
+ TheTable = (ItemBucket*)calloc(NumBuckets+1, sizeof(ItemBucket));
+
+ // Allocate one extra bucket, set it to look filled so the iterators stop at
+ // end.
+ TheTable[NumBuckets].Item = (StringMapEntryBase*)2;
+}
+
+
+/// HashString - Compute a hash code for the specified string.
+///
+static unsigned HashString(const char *Start, const char *End) {
+ // Bernstein hash function.
+ unsigned int Result = 0;
+ // TODO: investigate whether a modified bernstein hash function performs
+ // better: http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
+ // X*33+c -> X*33^c
+ while (Start != End)
+ Result = Result * 33 + *Start++;
+ Result = Result + (Result >> 5);
+ return Result;
+}
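+// Worked example over the two-byte string "hi":
+//
+//   Result = 0*33   + 'h' (104) = 104
+//   Result = 104*33 + 'i' (105) = 3537
+//
+// followed by the final fold: 3537 + (3537 >> 5) = 3537 + 110 = 3647.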
+
+/// LookupBucketFor - Look up the bucket that the specified string should end
+/// up in. If it already exists as a key in the map, the Item pointer for the
+/// specified bucket will be non-null. Otherwise, it will be null. In either
+/// case, the FullHashValue field of the bucket will be set to the hash value
+/// of the string.
+unsigned StringMapImpl::LookupBucketFor(const char *NameStart,
+ const char *NameEnd) {
+ unsigned HTSize = NumBuckets;
+ if (HTSize == 0) { // Hash table unallocated so far?
+ init(16);
+ HTSize = NumBuckets;
+ }
+ unsigned FullHashValue = HashString(NameStart, NameEnd);
+ unsigned BucketNo = FullHashValue & (HTSize-1);
+
+ unsigned ProbeAmt = 1;
+ int FirstTombstone = -1;
+ while (1) {
+ ItemBucket &Bucket = TheTable[BucketNo];
+ StringMapEntryBase *BucketItem = Bucket.Item;
+ // If we found an empty bucket, this key isn't in the table yet, return it.
+ if (BucketItem == 0) {
+ // If we found a tombstone, we want to reuse the tombstone instead of an
+ // empty bucket. This reduces probing.
+ if (FirstTombstone != -1) {
+ TheTable[FirstTombstone].FullHashValue = FullHashValue;
+ return FirstTombstone;
+ }
+
+ Bucket.FullHashValue = FullHashValue;
+ return BucketNo;
+ }
+
+ if (BucketItem == getTombstoneVal()) {
+ // Skip over tombstones. However, remember the first one we see.
+ if (FirstTombstone == -1) FirstTombstone = BucketNo;
+ } else if (Bucket.FullHashValue == FullHashValue) {
+ // If the full hash value matches, check deeply for a match. The common
+ // case here is that we are only looking at the buckets (for item info
+ // being non-null and for the full hash value) not at the items. This
+ // is important for cache locality.
+
+ // Do the comparison like this because NameStart isn't necessarily
+ // null-terminated!
+ char *ItemStr = (char*)BucketItem+ItemSize;
+ unsigned ItemStrLen = BucketItem->getKeyLength();
+ if (unsigned(NameEnd-NameStart) == ItemStrLen &&
+ memcmp(ItemStr, NameStart, ItemStrLen) == 0) {
+ // We found a match!
+ return BucketNo;
+ }
+ }
+
+ // Okay, we didn't find the item. Probe to the next bucket.
+ BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
+
+ // Use quadratic probing; it has fewer clumping artifacts than linear
+ // probing and has good cache behavior in the common case.
+ ++ProbeAmt;
+ }
+}
+
+
+/// FindKey - Look up the bucket that contains the specified key. If it exists
+/// in the map, return the bucket number of the key. Otherwise return -1.
+/// This does not modify the map.
+int StringMapImpl::FindKey(const char *KeyStart, const char *KeyEnd) const {
+ unsigned HTSize = NumBuckets;
+ if (HTSize == 0) return -1; // Really empty table?
+ unsigned FullHashValue = HashString(KeyStart, KeyEnd);
+ unsigned BucketNo = FullHashValue & (HTSize-1);
+
+ unsigned ProbeAmt = 1;
+ while (1) {
+ ItemBucket &Bucket = TheTable[BucketNo];
+ StringMapEntryBase *BucketItem = Bucket.Item;
+ // If we found an empty bucket, this key isn't in the table yet, return.
+ if (BucketItem == 0)
+ return -1;
+
+ if (BucketItem == getTombstoneVal()) {
+ // Ignore tombstones.
+ } else if (Bucket.FullHashValue == FullHashValue) {
+ // If the full hash value matches, check deeply for a match. The common
+ // case here is that we are only looking at the buckets (for item info
+ // being non-null and for the full hash value) not at the items. This
+ // is important for cache locality.
+
+ // Do the comparison like this because KeyStart isn't necessarily
+ // null-terminated!
+ char *ItemStr = (char*)BucketItem+ItemSize;
+ unsigned ItemStrLen = BucketItem->getKeyLength();
+ if (unsigned(KeyEnd-KeyStart) == ItemStrLen &&
+ memcmp(ItemStr, KeyStart, ItemStrLen) == 0) {
+ // We found a match!
+ return BucketNo;
+ }
+ }
+
+ // Okay, we didn't find the item. Probe to the next bucket.
+ BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
+
+ // Use quadratic probing; it has fewer clumping artifacts than linear
+ // probing and has good cache behavior in the common case.
+ ++ProbeAmt;
+ }
+}
+
+/// RemoveKey - Remove the specified StringMapEntry from the table, but do not
+/// delete it. This aborts if the value isn't in the table.
+void StringMapImpl::RemoveKey(StringMapEntryBase *V) {
+ const char *VStr = (char*)V + ItemSize;
+ StringMapEntryBase *V2 = RemoveKey(VStr, VStr+V->getKeyLength());
+ (void)V2; // Referenced only by the assert below; avoids an NDEBUG warning.
+ assert(V == V2 && "Didn't find key?");
+}
+
+/// RemoveKey - Remove the StringMapEntry for the specified key from the
+/// table, returning it. If the key is not in the table, this returns null.
+StringMapEntryBase *StringMapImpl::RemoveKey(const char *KeyStart,
+ const char *KeyEnd) {
+ int Bucket = FindKey(KeyStart, KeyEnd);
+ if (Bucket == -1) return 0;
+
+ StringMapEntryBase *Result = TheTable[Bucket].Item;
+ TheTable[Bucket].Item = getTombstoneVal();
+ --NumItems;
+ ++NumTombstones;
+ return Result;
+}
+
+
+
+/// RehashTable - Grow the table, redistributing values into the buckets with
+/// the appropriate mod-of-hashtable-size.
+void StringMapImpl::RehashTable() {
+ unsigned NewSize = NumBuckets*2;
+ // Allocate one extra bucket which will always be non-empty. This allows the
+ // iterators to stop at end.
+ ItemBucket *NewTableArray =(ItemBucket*)calloc(NewSize+1, sizeof(ItemBucket));
+ NewTableArray[NewSize].Item = (StringMapEntryBase*)2;
+
+ // Rehash all the items into their new buckets. Luckily :) we already have
+ // the hash values available, so we don't have to rehash any strings.
+ for (ItemBucket *IB = TheTable, *E = TheTable+NumBuckets; IB != E; ++IB) {
+ if (IB->Item && IB->Item != getTombstoneVal()) {
+ // Fast case, bucket available.
+ unsigned FullHash = IB->FullHashValue;
+ unsigned NewBucket = FullHash & (NewSize-1);
+ if (NewTableArray[NewBucket].Item == 0) {
+ NewTableArray[NewBucket].Item = IB->Item;
+ NewTableArray[NewBucket].FullHashValue = FullHash;
+ continue;
+ }
+
+ // Otherwise probe for a spot.
+ unsigned ProbeSize = 1;
+ do {
+ NewBucket = (NewBucket + ProbeSize++) & (NewSize-1);
+ } while (NewTableArray[NewBucket].Item);
+
+ // Finally found a slot. Fill it in.
+ NewTableArray[NewBucket].Item = IB->Item;
+ NewTableArray[NewBucket].FullHashValue = FullHash;
+ }
+ }
+
+ free(TheTable);
+
+ TheTable = NewTableArray;
+ NumBuckets = NewSize;
+}
diff --git a/lib/Support/StringPool.cpp b/lib/Support/StringPool.cpp
new file mode 100644
index 0000000..b9c1fd0
--- /dev/null
+++ b/lib/Support/StringPool.cpp
@@ -0,0 +1,35 @@
+//===-- StringPool.cpp - Interned string pool -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StringPool class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/StringPool.h"
+#include "llvm/Support/Streams.h"
+
+using namespace llvm;
+
+StringPool::StringPool() {}
+
+StringPool::~StringPool() {
+ assert(InternTable.empty() && "PooledStringPtr leaked!");
+}
+
+PooledStringPtr StringPool::intern(const char *Begin, const char *End) {
+ table_t::iterator I = InternTable.find(Begin, End);
+ if (I != InternTable.end())
+ return PooledStringPtr(&*I);
+
+ entry_t *S = entry_t::Create(Begin, End);
+ S->getValue().Pool = this;
+ InternTable.insert(S);
+
+ return PooledStringPtr(S);
+}
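+// Interned strings are shared and reference counted (illustrative):
+//
+//   StringPool Pool;
+//   const char *Str = "foo";
+//   PooledStringPtr A = Pool.intern(Str, Str+3);
+//   PooledStringPtr B = Pool.intern(Str, Str+3);
+//   // A and B refer to the same table entry; it is freed when the last
+//   // PooledStringPtr goes away, and ~StringPool asserts none have leaked.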
diff --git a/lib/Support/SystemUtils.cpp b/lib/Support/SystemUtils.cpp
new file mode 100644
index 0000000..80d6e4c
--- /dev/null
+++ b/lib/Support/SystemUtils.cpp
@@ -0,0 +1,58 @@
+//===- SystemUtils.cpp - Utilities for low-level system tasks -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions used to do a variety of low-level, often
+// system-specific, tasks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/SystemUtils.h"
+#include "llvm/System/Process.h"
+#include "llvm/System/Program.h"
+#include <ostream>
+using namespace llvm;
+
+bool llvm::CheckBitcodeOutputToConsole(std::ostream* stream_to_check,
+ bool print_warning) {
+ if (stream_to_check == cout.stream() &&
+ sys::Process::StandardOutIsDisplayed()) {
+ if (print_warning) {
+ cerr << "WARNING: You're attempting to print out a bitcode file.\n"
+ << "This is inadvisable as it may cause display problems. If\n"
+ << "you REALLY want to taste LLVM bitcode first-hand, you\n"
+ << "can force output with the `-f' option.\n\n";
+ }
+ return true;
+ }
+ return false;
+}
+
+/// FindExecutable - Find a named executable, giving the argv[0] of program
+/// being executed. This allows us to find another LLVM tool if it is built
+/// into the same directory, but that directory is neither the current
+/// directory, nor in the PATH. If the executable cannot be found, return an
+/// empty string.
+///
+#undef FindExecutable // needed on windows :(
+sys::Path llvm::FindExecutable(const std::string &ExeName,
+ const std::string &ProgramPath) {
+ // First check the directory that the calling program is in. We can do this
+ // if ProgramPath contains at least one / character, indicating that it is a
+ // relative path to the program itself.
+ sys::Path Result(ProgramPath);
+ Result.eraseComponent();
+ if (!Result.isEmpty()) {
+ Result.appendComponent(ExeName);
+ if (Result.canExecute())
+ return Result;
+ }
+
+ return sys::Program::FindProgramByName(ExeName);
+}
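A usage sketch for FindExecutable; the tool name is illustrative, and the sys::Path accessors follow the declarations used elsewhere in this import:

    #include "llvm/Support/SystemUtils.h"
    #include <iostream>

    int main(int, char **argv) {
      // Look for llvm-as next to this binary first, then fall back to PATH.
      llvm::sys::Path P = llvm::FindExecutable("llvm-as", argv[0]);
      if (P.isEmpty())
        std::cerr << "llvm-as not found\n";
      else
        std::cout << P.toString() << "\n";
      return 0;
    }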
diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
new file mode 100644
index 0000000..3c8879b
--- /dev/null
+++ b/lib/Support/Timer.cpp
@@ -0,0 +1,387 @@
+//===-- Timer.cpp - Interval Timing Support -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interval Timing implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/System/Process.h"
+#include <algorithm>
+#include <fstream>
+#include <functional>
+#include <map>
+using namespace llvm;
+
+// GetLibSupportInfoOutputFile - Return a file stream to print our output on.
+namespace llvm { extern std::ostream *GetLibSupportInfoOutputFile(); }
+
+// getLibSupportInfoOutputFilename - This ugly hack is brought to you courtesy
+// of constructor/destructor ordering being unspecified by C++. Basically the
+// problem is that a Statistic object gets destroyed, which ends up calling
+// 'GetLibSupportInfoOutputFile()' (below), which calls this function.
+// LibSupportInfoOutputFilename used to be a global variable, but sometimes it
+// would get destroyed before the Statistic, causing havoc to ensue. We "fix"
+// this by creating the string the first time it is needed and never destroying
+// it.
+static ManagedStatic<std::string> LibSupportInfoOutputFilename;
+static std::string &getLibSupportInfoOutputFilename() {
+ return *LibSupportInfoOutputFilename;
+}
+
+namespace {
+ static cl::opt<bool>
+ TrackSpace("track-memory", cl::desc("Enable -time-passes memory "
+ "tracking (this may be slow)"),
+ cl::Hidden);
+
+ static cl::opt<std::string, true>
+ InfoOutputFilename("info-output-file", cl::value_desc("filename"),
+ cl::desc("File to append -stats and -timer output to"),
+ cl::Hidden, cl::location(getLibSupportInfoOutputFilename()));
+}
+
+static TimerGroup *DefaultTimerGroup = 0;
+static TimerGroup *getDefaultTimerGroup() {
+ if (DefaultTimerGroup) return DefaultTimerGroup;
+ return DefaultTimerGroup = new TimerGroup("Miscellaneous Ungrouped Timers");
+}
+
+Timer::Timer(const std::string &N)
+ : Elapsed(0), UserTime(0), SystemTime(0), MemUsed(0), PeakMem(0), Name(N),
+ Started(false), TG(getDefaultTimerGroup()) {
+ TG->addTimer();
+}
+
+Timer::Timer(const std::string &N, TimerGroup &tg)
+ : Elapsed(0), UserTime(0), SystemTime(0), MemUsed(0), PeakMem(0), Name(N),
+ Started(false), TG(&tg) {
+ TG->addTimer();
+}
+
+Timer::Timer(const Timer &T) {
+ TG = T.TG;
+ if (TG) TG->addTimer();
+ operator=(T);
+}
+
+
+// Copy ctor, initialize with no TG member.
+Timer::Timer(bool, const Timer &T) {
+ TG = T.TG; // Avoid assertion in operator=
+ operator=(T); // Copy contents
+ TG = 0;
+}
+
+
+Timer::~Timer() {
+ if (TG) {
+ if (Started) {
+ Started = false;
+ TG->addTimerToPrint(*this);
+ }
+ TG->removeTimer();
+ }
+}
+
+static inline size_t getMemUsage() {
+ if (TrackSpace)
+ return sys::Process::GetMallocUsage();
+ return 0;
+}
+
+struct TimeRecord {
+ double Elapsed, UserTime, SystemTime;
+ ssize_t MemUsed;
+};
+
+static TimeRecord getTimeRecord(bool Start) {
+ TimeRecord Result;
+
+ sys::TimeValue now(0,0);
+ sys::TimeValue user(0,0);
+ sys::TimeValue sys(0,0);
+
+ ssize_t MemUsed = 0;
+ if (Start) {
+ MemUsed = getMemUsage();
+ sys::Process::GetTimeUsage(now,user,sys);
+ } else {
+ sys::Process::GetTimeUsage(now,user,sys);
+ MemUsed = getMemUsage();
+ }
+
+ Result.Elapsed = now.seconds() + now.microseconds() / 1000000.0;
+ Result.UserTime = user.seconds() + user.microseconds() / 1000000.0;
+ Result.SystemTime = sys.seconds() + sys.microseconds() / 1000000.0;
+ Result.MemUsed = MemUsed;
+
+ return Result;
+}
+
+static ManagedStatic<std::vector<Timer*> > ActiveTimers;
+
+void Timer::startTimer() {
+ Started = true;
+ ActiveTimers->push_back(this);
+ TimeRecord TR = getTimeRecord(true);
+ Elapsed -= TR.Elapsed;
+ UserTime -= TR.UserTime;
+ SystemTime -= TR.SystemTime;
+ MemUsed -= TR.MemUsed;
+ PeakMemBase = TR.MemUsed;
+}
+
+void Timer::stopTimer() {
+ TimeRecord TR = getTimeRecord(false);
+ Elapsed += TR.Elapsed;
+ UserTime += TR.UserTime;
+ SystemTime += TR.SystemTime;
+ MemUsed += TR.MemUsed;
+
+ if (ActiveTimers->back() == this) {
+ ActiveTimers->pop_back();
+ } else {
+ std::vector<Timer*>::iterator I =
+ std::find(ActiveTimers->begin(), ActiveTimers->end(), this);
+ assert(I != ActiveTimers->end() && "stop but no startTimer?");
+ ActiveTimers->erase(I);
+ }
+}
+
+void Timer::sum(const Timer &T) {
+ Elapsed += T.Elapsed;
+ UserTime += T.UserTime;
+ SystemTime += T.SystemTime;
+ MemUsed += T.MemUsed;
+ PeakMem += T.PeakMem;
+}
+
+/// addPeakMemoryMeasurement - This method should be called whenever memory
+/// usage needs to be checked. It adds a peak memory measurement to the
+/// currently active timers, which will be printed when the timer group prints
+///
+void Timer::addPeakMemoryMeasurement() {
+ size_t MemUsed = getMemUsage();
+
+ for (std::vector<Timer*>::iterator I = ActiveTimers->begin(),
+ E = ActiveTimers->end(); I != E; ++I)
+ (*I)->PeakMem = std::max((*I)->PeakMem, MemUsed-(*I)->PeakMemBase);
+}
+
+//===----------------------------------------------------------------------===//
+// NamedRegionTimer Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+typedef std::map<std::string, Timer> Name2Timer;
+typedef std::map<std::string, std::pair<TimerGroup, Name2Timer> > Name2Pair;
+
+}
+
+static ManagedStatic<Name2Timer> NamedTimers;
+
+static ManagedStatic<Name2Pair> NamedGroupedTimers;
+
+static Timer &getNamedRegionTimer(const std::string &Name) {
+ Name2Timer::iterator I = NamedTimers->find(Name);
+ if (I != NamedTimers->end())
+ return I->second;
+
+ return NamedTimers->insert(I, std::make_pair(Name, Timer(Name)))->second;
+}
+
+static Timer &getNamedRegionTimer(const std::string &Name,
+ const std::string &GroupName) {
+
+ Name2Pair::iterator I = NamedGroupedTimers->find(GroupName);
+ if (I == NamedGroupedTimers->end()) {
+ TimerGroup TG(GroupName);
+ std::pair<TimerGroup, Name2Timer> Pair(TG, Name2Timer());
+ I = NamedGroupedTimers->insert(I, std::make_pair(GroupName, Pair));
+ }
+
+ Name2Timer::iterator J = I->second.second.find(Name);
+ if (J == I->second.second.end())
+ J = I->second.second.insert(J,
+ std::make_pair(Name,
+ Timer(Name,
+ I->second.first)));
+
+ return J->second;
+}
+
+NamedRegionTimer::NamedRegionTimer(const std::string &Name)
+ : TimeRegion(getNamedRegionTimer(Name)) {}
+
+NamedRegionTimer::NamedRegionTimer(const std::string &Name,
+ const std::string &GroupName)
+ : TimeRegion(getNamedRegionTimer(Name, GroupName)) {}
+
+//===----------------------------------------------------------------------===//
+// TimerGroup Implementation
+//===----------------------------------------------------------------------===//
+
+// printAlignedFP - Simulate the printf "%A.Bf" format, where A is the
+// TotalWidth size, and B is the AfterDec size.
+//
+static void printAlignedFP(double Val, unsigned AfterDec, unsigned TotalWidth,
+ std::ostream &OS) {
+ assert(TotalWidth >= AfterDec+1 && "Bad FP Format!");
+ OS.width(TotalWidth-AfterDec-1);
+ char OldFill = OS.fill();
+ OS.fill(' ');
+ OS << (int)Val; // Integer part;
+ OS << ".";
+ OS.width(AfterDec);
+ OS.fill('0');
+ unsigned ResultFieldSize = 1;
+ while (AfterDec--) ResultFieldSize *= 10;
+ OS << (int)(Val*ResultFieldSize) % ResultFieldSize;
+ OS.fill(OldFill);
+}
+
+static void printVal(double Val, double Total, std::ostream &OS) {
+ if (Total < 1e-7) // Avoid dividing by zero...
+ OS << " ----- ";
+ else {
+ OS << " ";
+ printAlignedFP(Val, 4, 7, OS);
+ OS << " (";
+ printAlignedFP(Val*100/Total, 1, 5, OS);
+ OS << "%)";
+ }
+}
+
+void Timer::print(const Timer &Total, std::ostream &OS) {
+ if (Total.UserTime)
+ printVal(UserTime, Total.UserTime, OS);
+ if (Total.SystemTime)
+ printVal(SystemTime, Total.SystemTime, OS);
+ if (Total.getProcessTime())
+ printVal(getProcessTime(), Total.getProcessTime(), OS);
+ printVal(Elapsed, Total.Elapsed, OS);
+
+ OS << " ";
+
+ if (Total.MemUsed) {
+ OS.width(9);
+ OS << MemUsed << " ";
+ }
+ if (Total.PeakMem) {
+ if (PeakMem) {
+ OS.width(9);
+ OS << PeakMem << " ";
+ } else
+ OS << " ";
+ }
+ OS << Name << "\n";
+
+ Started = false; // Once printed, don't print again
+}
+
+// GetLibSupportInfoOutputFile - Return a file stream to print our output on...
+std::ostream *
+llvm::GetLibSupportInfoOutputFile() {
+ std::string &LibSupportInfoOutputFilename = getLibSupportInfoOutputFilename();
+ if (LibSupportInfoOutputFilename.empty())
+ return cerr.stream();
+ if (LibSupportInfoOutputFilename == "-")
+ return cout.stream();
+
+ std::ostream *Result = new std::ofstream(LibSupportInfoOutputFilename.c_str(),
+ std::ios::app);
+ if (!Result->good()) {
+ cerr << "Error opening info-output-file '"
+ << LibSupportInfoOutputFilename << " for appending!\n";
+ delete Result;
+ return cerr.stream();
+ }
+ return Result;
+}
+
+
+void TimerGroup::removeTimer() {
+ if (--NumTimers == 0 && !TimersToPrint.empty()) { // Print timing report...
+ // Sort the timers in descending order by amount of time taken...
+ std::sort(TimersToPrint.begin(), TimersToPrint.end(),
+ std::greater<Timer>());
+
+ // Figure out how many spaces to indent TimerGroup name...
+ unsigned Padding = (80-Name.length())/2;
+ if (Padding > 80) Padding = 0; // Don't allow "negative" numbers
+
+ std::ostream *OutStream = GetLibSupportInfoOutputFile();
+
+ ++NumTimers;
+ { // Scope to contain Total timer... don't allow total timer to drop us to
+ // zero timers...
+ Timer Total("TOTAL");
+
+ for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i)
+ Total.sum(TimersToPrint[i]);
+
+ // Print out timing header...
+ *OutStream << "===" << std::string(73, '-') << "===\n"
+ << std::string(Padding, ' ') << Name << "\n"
+ << "===" << std::string(73, '-')
+ << "===\n";
+
+ // If this is not a collection of ungrouped timers, print the total time.
+ // Ungrouped timers don't really make sense to add up. We still print the
+ // TOTAL line to make the percentages make sense.
+ if (this != DefaultTimerGroup) {
+ *OutStream << " Total Execution Time: ";
+
+ printAlignedFP(Total.getProcessTime(), 4, 5, *OutStream);
+ *OutStream << " seconds (";
+ printAlignedFP(Total.getWallTime(), 4, 5, *OutStream);
+ *OutStream << " wall clock)\n";
+ }
+ *OutStream << "\n";
+
+ if (Total.UserTime)
+ *OutStream << " ---User Time---";
+ if (Total.SystemTime)
+ *OutStream << " --System Time--";
+ if (Total.getProcessTime())
+ *OutStream << " --User+System--";
+ *OutStream << " ---Wall Time---";
+ if (Total.getMemUsed())
+ *OutStream << " ---Mem---";
+ if (Total.getPeakMem())
+ *OutStream << " -PeakMem-";
+ *OutStream << " --- Name ---\n";
+
+ // Loop through all of the timing data, printing it out...
+ for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i)
+ TimersToPrint[i].print(Total, *OutStream);
+
+ Total.print(Total, *OutStream);
+ *OutStream << std::endl; // Flush output
+ }
+ --NumTimers;
+
+ TimersToPrint.clear();
+
+ if (OutStream != cerr.stream() && OutStream != cout.stream())
+ delete OutStream; // Close the file...
+ }
+
+ // Delete default timer group!
+ if (NumTimers == 0 && this == DefaultTimerGroup) {
+ delete DefaultTimerGroup;
+ DefaultTimerGroup = 0;
+ }
+}
+
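A short sketch of the intended use of the named-timer API above; the region and group names are illustrative only:

    #include "llvm/Support/Timer.h"

    void runSelectionPass() {
      // RAII: the timer starts here and stops when T goes out of scope.
      // Re-entering with the same names accumulates into one Timer, and
      // the whole "codegen" group is printed when its last timer dies.
      llvm::NamedRegionTimer T("instruction selection", "codegen");
      // ... timed work ...
    }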
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
new file mode 100644
index 0000000..e8cf69d
--- /dev/null
+++ b/lib/Support/Triple.cpp
@@ -0,0 +1,187 @@
+//===--- Triple.cpp - Target triple helper class --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include <cassert>
+#include <cstring>
+using namespace llvm;
+
+//
+
+const char *Triple::getArchTypeName(ArchType Kind) {
+ switch (Kind) {
+ case InvalidArch: return "<invalid>";
+ case UnknownArch: return "unknown";
+
+ case x86: return "i386";
+ case x86_64: return "x86_64";
+ case ppc: return "powerpc";
+ case ppc64: return "powerpc64";
+ }
+
+ return "<invalid>";
+}
+
+const char *Triple::getVendorTypeName(VendorType Kind) {
+ switch (Kind) {
+ case UnknownVendor: return "unknown";
+
+ case Apple: return "apple";
+ case PC: return "PC";
+ }
+
+ return "<invalid>";
+}
+
+const char *Triple::getOSTypeName(OSType Kind) {
+ switch (Kind) {
+ case UnknownOS: return "unknown";
+
+ case Darwin: return "darwin";
+ case DragonFly: return "dragonfly";
+ case FreeBSD: return "freebsd";
+ case Linux: return "linux";
+ }
+
+ return "<invalid>";
+}
+
+//
+
+void Triple::Parse() const {
+ assert(!isInitialized() && "Invalid parse call.");
+
+ std::string ArchName = getArchName();
+ if (ArchName.size() == 4 && ArchName[0] == 'i' &&
+ ArchName[2] == '8' && ArchName[3] == '6')
+ Arch = x86;
+ else if (ArchName == "amd64" || ArchName == "x86_64")
+ Arch = x86_64;
+ else if (ArchName == "powerpc")
+ Arch = ppc;
+ else if (ArchName == "powerpc64")
+ Arch = ppc64;
+ else
+ Arch = UnknownArch;
+
+ std::string VendorName = getVendorName();
+ if (VendorName == "apple")
+ Vendor = Apple;
+ else if (VendorName == "pc")
+ Vendor = PC;
+ else
+ Vendor = UnknownVendor;
+
+ std::string OSName = getOSName();
+ if (memcmp(&OSName[0], "darwin", 6) == 0)
+ OS = Darwin;
+ else if (memcmp(&OSName[0], "dragonfly", 9) == 0)
+ OS = DragonFly;
+ else if (memcmp(&OSName[0], "freebsd", 7) == 0)
+ OS = FreeBSD;
+ else if (memcmp(&OSName[0], "linux", 5) == 0)
+ OS = Linux;
+ else
+ OS = UnknownOS;
+
+ assert(isInitialized() && "Failed to initialize!");
+}
+
+static std::string extract(const std::string &A,
+ std::string::size_type begin,
+ std::string::size_type end) {
+ if (begin == std::string::npos)
+ return "";
+ if (end == std::string::npos)
+ return A.substr(begin);
+ return A.substr(begin, end - begin);
+}
+
+static std::string extract1(const std::string &A,
+ std::string::size_type begin,
+ std::string::size_type end) {
+ if (begin == std::string::npos || begin == end)
+ return "";
+ return extract(A, begin + 1, end);
+}
+
+std::string Triple::getArchName() const {
+ std::string Tmp = Data;
+ return extract(Tmp, 0, Tmp.find('-'));
+}
+
+std::string Triple::getVendorName() const {
+ std::string Tmp = Data;
+ Tmp = extract1(Tmp, Tmp.find('-'), std::string::npos);
+ return extract(Tmp, 0, Tmp.find('-'));
+}
+
+std::string Triple::getOSName() const {
+ std::string Tmp = Data;
+ Tmp = extract1(Tmp, Tmp.find('-'), std::string::npos);
+ Tmp = extract1(Tmp, Tmp.find('-'), std::string::npos);
+ return extract(Tmp, 0, Tmp.find('-'));
+}
+
+std::string Triple::getEnvironmentName() const {
+ std::string Tmp = Data;
+ Tmp = extract1(Tmp, Tmp.find('-'), std::string::npos);
+ Tmp = extract1(Tmp, Tmp.find('-'), std::string::npos);
+ Tmp = extract1(Tmp, Tmp.find('-'), std::string::npos);
+ return extract(Tmp, 0, std::string::npos);
+}
+
+std::string Triple::getOSAndEnvironmentName() const {
+ std::string Tmp = Data;
+ Tmp = extract1(Tmp, Tmp.find('-'), std::string::npos);
+ Tmp = extract1(Tmp, Tmp.find('-'), std::string::npos);
+ return extract(Tmp, 0, std::string::npos);
+}
+
+void Triple::setTriple(const std::string &Str) {
+ Data = Str;
+ Arch = InvalidArch;
+}
+
+void Triple::setArch(ArchType Kind) {
+ setArchName(getArchTypeName(Kind));
+}
+
+void Triple::setVendor(VendorType Kind) {
+ setVendorName(getVendorTypeName(Kind));
+}
+
+void Triple::setOS(OSType Kind) {
+ setOSName(getOSTypeName(Kind));
+}
+
+void Triple::setArchName(const std::string &Str) {
+ setTriple(Str + "-" + getVendorName() + "-" + getOSAndEnvironmentName());
+}
+
+void Triple::setVendorName(const std::string &Str) {
+ setTriple(getArchName() + "-" + Str + "-" + getOSAndEnvironmentName());
+}
+
+void Triple::setOSName(const std::string &Str) {
+ if (hasEnvironment())
+ setTriple(getArchName() + "-" + getVendorName() + "-" + Str +
+ "-" + getEnvironmentName());
+ else
+ setTriple(getArchName() + "-" + getVendorName() + "-" + Str);
+}
+
+void Triple::setEnvironmentName(const std::string &Str) {
+ setTriple(getArchName() + "-" + getVendorName() + "-" + getOSName() +
+ "-" + Str);
+}
+
+void Triple::setOSAndEnvironmentName(const std::string &Str) {
+ setTriple(getArchName() + "-" + getVendorName() + "-" + Str);
+}
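A usage sketch for the lazy parser above, assuming the string constructor and accessors declared in llvm/ADT/Triple.h at this revision:

    #include "llvm/ADT/Triple.h"
    #include <cassert>

    void example() {
      llvm::Triple T("x86_64-apple-darwin9");
      // Parse() runs lazily on the first component query.
      assert(T.getArch() == llvm::Triple::x86_64);
      assert(T.getOS() == llvm::Triple::Darwin);

      // Setters splice one component and rebuild the triple string.
      T.setOSName("freebsd7");
      assert(T.getOSName() == "freebsd7");
    }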
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
new file mode 100644
index 0000000..6ac37bc
--- /dev/null
+++ b/lib/Support/raw_ostream.cpp
@@ -0,0 +1,376 @@
+//===--- raw_ostream.cpp - Implement the raw_ostream classes --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements support for bulk buffered stream output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Format.h"
+#include "llvm/System/Program.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Compiler.h"
+#include <cstring>
+#include <ostream>
+
+#if defined(HAVE_UNISTD_H)
+# include <unistd.h>
+#endif
+#if defined(HAVE_FCNTL_H)
+# include <fcntl.h>
+#endif
+
+#if defined(_MSC_VER)
+#include <io.h>
+#include <fcntl.h>
+#ifndef STDIN_FILENO
+# define STDIN_FILENO 0
+#endif
+#ifndef STDOUT_FILENO
+# define STDOUT_FILENO 1
+#endif
+#ifndef STDERR_FILENO
+# define STDERR_FILENO 2
+#endif
+#endif
+
+using namespace llvm;
+
+
+// An out of line virtual method to provide a home for the class vtable.
+void raw_ostream::handle() {}
+
+raw_ostream &raw_ostream::operator<<(unsigned long N) {
+ // Zero is a special case.
+ if (N == 0)
+ return *this << '0';
+
+ char NumberBuffer[20];
+ char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char *CurPtr = EndPtr;
+
+ while (N) {
+ *--CurPtr = '0' + char(N % 10);
+ N /= 10;
+ }
+ return write(CurPtr, EndPtr-CurPtr);
+}
+
+raw_ostream &raw_ostream::operator<<(long N) {
+ if (N < 0) {
+ *this << '-';
+ N = -N;
+ }
+
+ return this->operator<<(static_cast<unsigned long>(N));
+}
+
+raw_ostream &raw_ostream::operator<<(unsigned long long N) {
+ // Zero is a special case.
+ if (N == 0)
+ return *this << '0';
+
+ char NumberBuffer[20];
+ char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char *CurPtr = EndPtr;
+
+ while (N) {
+ *--CurPtr = '0' + char(N % 10);
+ N /= 10;
+ }
+ return write(CurPtr, EndPtr-CurPtr);
+}
+
+raw_ostream &raw_ostream::operator<<(long long N) {
+ if (N < 0) {
+ *this << '-';
+ N = -N;
+ }
+
+ return this->operator<<(static_cast<unsigned long long>(N));
+}
+
+raw_ostream &raw_ostream::operator<<(const void *P) {
+ uintptr_t N = (uintptr_t) P;
+ *this << '0' << 'x';
+
+ // Zero is a special case.
+ if (N == 0)
+ return *this << '0';
+
+ char NumberBuffer[20];
+ char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char *CurPtr = EndPtr;
+
+ while (N) {
+ unsigned x = N % 16;
+ *--CurPtr = (x < 10 ? '0' + x : 'a' + x - 10);
+ N /= 16;
+ }
+
+ return write(CurPtr, EndPtr-CurPtr);
+}
+
+void raw_ostream::flush_nonempty() {
+ assert(OutBufCur > OutBufStart && "Invalid call to flush_nonempty.");
+ write_impl(OutBufStart, OutBufCur - OutBufStart);
+ OutBufCur = OutBufStart;
+}
+
+raw_ostream &raw_ostream::write(unsigned char C) {
+ // Group exceptional cases into a single branch.
+ if (OutBufCur >= OutBufEnd) {
+ if (Unbuffered) {
+ write_impl(reinterpret_cast<char*>(&C), 1);
+ return *this;
+ }
+
+ if (!OutBufStart)
+ SetBufferSize();
+ else
+ flush_nonempty();
+ }
+
+ *OutBufCur++ = C;
+ return *this;
+}
+
+raw_ostream &raw_ostream::write(const char *Ptr, unsigned Size) {
+ // Group exceptional cases into a single branch.
+ if (BUILTIN_EXPECT(OutBufCur+Size > OutBufEnd, false)) {
+ if (Unbuffered) {
+ write_impl(Ptr, Size);
+ return *this;
+ }
+
+ if (!OutBufStart)
+ SetBufferSize();
+ else
+ flush_nonempty();
+ }
+
+ // Handle short strings specially, memcpy isn't very good at very short
+ // strings.
+ switch (Size) {
+ case 4: OutBufCur[3] = Ptr[3]; // FALL THROUGH
+ case 3: OutBufCur[2] = Ptr[2]; // FALL THROUGH
+ case 2: OutBufCur[1] = Ptr[1]; // FALL THROUGH
+ case 1: OutBufCur[0] = Ptr[0]; // FALL THROUGH
+ case 0: break;
+ default:
+ // Normally the string to emit is shorter than the buffer.
+ if (Size <= unsigned(OutBufEnd-OutBufStart)) {
+ memcpy(OutBufCur, Ptr, Size);
+ break;
+ }
+
+ // Otherwise we are emitting a string larger than our buffer. We
+ // know we already flushed, so just write it out directly.
+ write_impl(Ptr, Size);
+ Size = 0;
+ break;
+ }
+ OutBufCur += Size;
+
+ return *this;
+}
+
+// Formatted output.
+raw_ostream &raw_ostream::operator<<(const format_object_base &Fmt) {
+ // If we have more than a few bytes left in our output buffer, try
+ // formatting directly onto its end.
+ //
+ // FIXME: This test is a bit silly, since if we don't have enough
+ // space in the buffer we will have to flush the formatted output
+ // anyway. We should just flush upfront in such cases, and use the
+ // whole buffer as our scratch pad. Note, however, that this case is
+ // also necessary for correctness on unbuffered streams.
+ unsigned NextBufferSize = 127;
+ if (OutBufEnd-OutBufCur > 3) {
+ unsigned BufferBytesLeft = OutBufEnd-OutBufCur;
+ unsigned BytesUsed = Fmt.print(OutBufCur, BufferBytesLeft);
+
+ // Common case is that we have plenty of space.
+ if (BytesUsed < BufferBytesLeft) {
+ OutBufCur += BytesUsed;
+ return *this;
+ }
+
+ // Otherwise, we overflowed and the return value tells us the size to try
+ // again with.
+ NextBufferSize = BytesUsed;
+ }
+
+ // If we got here, we didn't have enough space in the output buffer for the
+ // string. Try printing into a SmallVector that is resized to have enough
+ // space. Iterate until we win.
+ SmallVector<char, 128> V;
+
+ while (1) {
+ V.resize(NextBufferSize);
+
+ // Try formatting into the SmallVector.
+ unsigned BytesUsed = Fmt.print(&V[0], NextBufferSize);
+
+ // If BytesUsed fit into the vector, we win.
+ if (BytesUsed <= NextBufferSize)
+ return write(&V[0], BytesUsed);
+
+ // Otherwise, try again with a new size.
+ assert(BytesUsed > NextBufferSize && "Didn't grow buffer!?");
+ NextBufferSize = BytesUsed;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Formatted Output
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+void format_object_base::home() {
+}
+
+//===----------------------------------------------------------------------===//
+// raw_fd_ostream
+//===----------------------------------------------------------------------===//
+
+/// raw_fd_ostream - Open the specified file for writing. If an error
+/// occurs, information about the error is put into ErrorInfo, and the
+/// stream should be immediately destroyed; the string will be empty
+/// if no error occurred.
+raw_fd_ostream::raw_fd_ostream(const char *Filename, bool Binary,
+ std::string &ErrorInfo) : pos(0) {
+ ErrorInfo.clear();
+
+ // Handle "-" as stdout.
+ if (Filename[0] == '-' && Filename[1] == 0) {
+ FD = STDOUT_FILENO;
+ // If user requested binary then put stdout into binary mode if
+ // possible.
+ if (Binary)
+ sys::Program::ChangeStdoutToBinary();
+ ShouldClose = false;
+ return;
+ }
+
+ int Flags = O_WRONLY|O_CREAT|O_TRUNC;
+#ifdef O_BINARY
+ if (Binary)
+ Flags |= O_BINARY;
+#endif
+ FD = open(Filename, Flags, 0644);
+ if (FD < 0) {
+ ErrorInfo = "Error opening output file '" + std::string(Filename) + "'";
+ ShouldClose = false;
+ } else {
+ ShouldClose = true;
+ }
+}
+
+raw_fd_ostream::~raw_fd_ostream() {
+ if (FD >= 0) {
+ flush();
+ if (ShouldClose)
+ ::close(FD);
+ }
+}
+
+void raw_fd_ostream::write_impl(const char *Ptr, unsigned Size) {
+ assert (FD >= 0 && "File already closed.");
+ pos += Size;
+ ::write(FD, Ptr, Size);
+}
+
+void raw_fd_ostream::close() {
+ assert (ShouldClose);
+ ShouldClose = false;
+ flush();
+ ::close(FD);
+ FD = -1;
+}
+
+uint64_t raw_fd_ostream::seek(uint64_t off) {
+ flush();
+ pos = lseek(FD, off, SEEK_SET);
+ return pos;
+}
+
+//===----------------------------------------------------------------------===//
+// raw_stdout/err_ostream
+//===----------------------------------------------------------------------===//
+
+raw_stdout_ostream::raw_stdout_ostream():raw_fd_ostream(STDOUT_FILENO, false) {}
+raw_stderr_ostream::raw_stderr_ostream():raw_fd_ostream(STDERR_FILENO, false,
+ true) {}
+
+// An out of line virtual method to provide a home for the class vtable.
+void raw_stdout_ostream::handle() {}
+void raw_stderr_ostream::handle() {}
+
+/// outs() - This returns a reference to a raw_ostream for standard output.
+/// Use it like: outs() << "foo" << "bar";
+raw_ostream &llvm::outs() {
+ static raw_stdout_ostream S;
+ return S;
+}
+
+/// errs() - This returns a reference to a raw_ostream for standard error.
+/// Use it like: errs() << "foo" << "bar";
+raw_ostream &llvm::errs() {
+ static raw_stderr_ostream S;
+ return S;
+}
+
+//===----------------------------------------------------------------------===//
+// raw_os_ostream
+//===----------------------------------------------------------------------===//
+
+raw_os_ostream::~raw_os_ostream() {
+ flush();
+}
+
+void raw_os_ostream::write_impl(const char *Ptr, unsigned Size) {
+ OS.write(Ptr, Size);
+}
+
+uint64_t raw_os_ostream::current_pos() { return OS.tellp(); }
+
+uint64_t raw_os_ostream::tell() {
+ return (uint64_t)OS.tellp() + GetNumBytesInBuffer();
+}
+
+//===----------------------------------------------------------------------===//
+// raw_string_ostream
+//===----------------------------------------------------------------------===//
+
+raw_string_ostream::~raw_string_ostream() {
+ flush();
+}
+
+void raw_string_ostream::write_impl(const char *Ptr, unsigned Size) {
+ OS.append(Ptr, Size);
+}
+
+//===----------------------------------------------------------------------===//
+// raw_svector_ostream
+//===----------------------------------------------------------------------===//
+
+raw_svector_ostream::~raw_svector_ostream() {
+ flush();
+}
+
+void raw_svector_ostream::write_impl(const char *Ptr, unsigned Size) {
+ OS.append(Ptr, Ptr + Size);
+}
+
+uint64_t raw_svector_ostream::current_pos() { return OS.size(); }
+
+uint64_t raw_svector_ostream::tell() {
+ return OS.size() + GetNumBytesInBuffer();
+}
diff --git a/lib/System/Alarm.cpp b/lib/System/Alarm.cpp
new file mode 100644
index 0000000..0014ca7
--- /dev/null
+++ b/lib/System/Alarm.cpp
@@ -0,0 +1,33 @@
+//===- Alarm.cpp - Alarm Generation Support ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Alarm functionality
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/Alarm.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
+}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Alarm.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Win32/Alarm.inc"
+#endif
diff --git a/lib/System/Atomic.cpp b/lib/System/Atomic.cpp
new file mode 100644
index 0000000..cefd0bb
--- /dev/null
+++ b/lib/System/Atomic.cpp
@@ -0,0 +1,53 @@
+//===-- Atomic.cpp - Atomic Operations --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements atomic operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/Atomic.h"
+#include "llvm/Config/config.h"
+
+using namespace llvm;
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#undef MemoryFence
+#endif
+
+void sys::MemoryFence() {
+#if LLVM_MULTITHREADED==0
+ return;
+#else
+# if defined(__GNUC__)
+ __sync_synchronize();
+# elif defined(_MSC_VER)
+ MemoryBarrier();
+# else
+# error No memory fence implementation for your platform!
+# endif
+#endif
+}
+
+sys::cas_flag sys::CompareAndSwap(volatile sys::cas_flag* ptr,
+ sys::cas_flag new_value,
+ sys::cas_flag old_value) {
+#if LLVM_MULTITHREADED==0
+ sys::cas_flag result = *ptr;
+ if (result == old_value)
+ *ptr = new_value;
+ return result;
+#elif defined(__GNUC__)
+ return __sync_val_compare_and_swap(ptr, old_value, new_value);
+#elif defined(_MSC_VER)
+ return InterlockedCompareExchange(ptr, new_value, old_value);
+#else
+# error No compare-and-swap implementation for your platform!
+#endif
+}
\ No newline at end of file
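A sketch of the contract above: CompareAndSwap returns the value it observed at *ptr, and the store happens only if that value equals old_value.

    #include "llvm/System/Atomic.h"
    #include <cassert>

    void example() {
      volatile llvm::sys::cas_flag Flag = 0;
      // Succeeds: *ptr was 0, so it becomes 1 and 0 is returned.
      llvm::sys::cas_flag Seen = llvm::sys::CompareAndSwap(&Flag, 1, 0);
      assert(Seen == 0 && Flag == 1);
      // Fails: *ptr is now 1, not the expected 0; no store, returns 1.
      Seen = llvm::sys::CompareAndSwap(&Flag, 2, 0);
      assert(Seen == 1 && Flag == 1);
    }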
diff --git a/lib/System/CMakeLists.txt b/lib/System/CMakeLists.txt
new file mode 100644
index 0000000..5415dd6
--- /dev/null
+++ b/lib/System/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_llvm_library(LLVMSystem
+ Alarm.cpp
+ Atomic.cpp
+ Disassembler.cpp
+ DynamicLibrary.cpp
+ Host.cpp
+ IncludeFile.cpp
+ Memory.cpp
+ Mutex.cpp
+ Path.cpp
+ Process.cpp
+ Program.cpp
+ Signals.cpp
+ TimeValue.cpp
+ )
+
+if( BUILD_SHARED_LIBS AND NOT WIN32 )
+ target_link_libraries(LLVMSystem dl)
+endif()
diff --git a/lib/System/Disassembler.cpp b/lib/System/Disassembler.cpp
new file mode 100644
index 0000000..378fe26
--- /dev/null
+++ b/lib/System/Disassembler.cpp
@@ -0,0 +1,79 @@
+//===- lib/System/Disassembler.cpp ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the necessary glue to call external disassembler
+// libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/System/Disassembler.h"
+
+#include <cassert>
+#include <iomanip>
+#include <string>
+#include <sstream>
+
+#if USE_UDIS86
+#include <udis86.h>
+#endif
+
+using namespace llvm;
+
+bool llvm::sys::hasDisassembler(void)
+{
+#if defined (__i386__) || defined (__amd64__) || defined (__x86_64__)
+ // We have option to enable udis86 library.
+# if USE_UDIS86
+ return true;
+# else
+ return false;
+# endif
+#else
+ return false;
+#endif
+}
+
+std::string llvm::sys::disassembleBuffer(uint8_t* start, size_t length,
+ uint64_t pc) {
+ std::stringstream res;
+
+#if defined (__i386__) || defined (__amd64__) || defined (__x86_64__)
+ unsigned bits;
+# if defined(__i386__)
+ bits = 32;
+# else
+ bits = 64;
+# endif
+
+# if USE_UDIS86
+ ud_t ud_obj;
+
+ ud_init(&ud_obj);
+ ud_set_input_buffer(&ud_obj, start, length);
+ ud_set_mode(&ud_obj, bits);
+ ud_set_pc(&ud_obj, pc);
+ ud_set_syntax(&ud_obj, UD_SYN_ATT);
+
+ res << std::setbase(16)
+ << std::setw(bits/4);
+
+ while (ud_disassemble(&ud_obj)) {
+ res << ud_insn_off(&ud_obj) << ":\t" << ud_insn_asm(&ud_obj) << "\n";
+ }
+# else
+ res << "No disassembler available. See configure help for options.\n";
+# endif
+
+#else
+ res << "No disassembler available. See configure help for options.\n";
+#endif
+
+ return res.str();
+}
diff --git a/lib/System/DynamicLibrary.cpp b/lib/System/DynamicLibrary.cpp
new file mode 100644
index 0000000..3bf172c
--- /dev/null
+++ b/lib/System/DynamicLibrary.cpp
@@ -0,0 +1,165 @@
+//===-- DynamicLibrary.cpp - Runtime link/load libraries --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the operating system DynamicLibrary concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/DynamicLibrary.h"
+#include "llvm/Config/config.h"
+#include <cstdio>
+#include <cstring>
+#include <map>
+
+// Collection of symbol name/value pairs to be searched prior to any libraries.
+std::map<std::string, void *> &g_symbols() {
+ static std::map<std::string, void *> symbols;
+ return symbols;
+}
+
+void llvm::sys::DynamicLibrary::AddSymbol(const char* symbolName,
+ void *symbolValue) {
+ g_symbols()[symbolName] = symbolValue;
+}
+
+// It is not possible to use ltdl.c on VC++ builds as the terms of its LGPL
+// license and special exception would cause all of LLVM to be placed under
+// the LGPL. This is because the exception applies only when libtool is
+// used, and obviously libtool is not used with Visual Studio. An entirely
+// separate implementation is provided in win32/DynamicLibrary.cpp.
+
+#ifdef LLVM_ON_WIN32
+
+#include "Win32/DynamicLibrary.inc"
+
+#else
+
+//#include "ltdl.h"
+#include <dlfcn.h>
+#include <cassert>
+using namespace llvm;
+using namespace llvm::sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
+//static std::vector<lt_dlhandle> OpenedHandles;
+static std::vector<void *> OpenedHandles;
+
+DynamicLibrary::DynamicLibrary() {}
+
+DynamicLibrary::~DynamicLibrary() {
+ while(!OpenedHandles.empty()) {
+ void *H = OpenedHandles.back(); OpenedHandles.pop_back();
+ dlclose(H);
+ }
+}
+
+bool DynamicLibrary::LoadLibraryPermanently(const char *Filename,
+ std::string *ErrMsg) {
+ void *H = dlopen(Filename, RTLD_LAZY|RTLD_GLOBAL);
+ if (H == 0) {
+ if (ErrMsg)
+ *ErrMsg = dlerror();
+ return true;
+ }
+ OpenedHandles.push_back(H);
+ return false;
+}
+
+void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
+ // check_ltdl_initialization();
+
+ // First check symbols added via AddSymbol().
+ std::map<std::string, void *>::iterator I = g_symbols().find(symbolName);
+ if (I != g_symbols().end())
+ return I->second;
+
+ // Now search the libraries.
+ for (std::vector<void *>::iterator I = OpenedHandles.begin(),
+ E = OpenedHandles.end(); I != E; ++I) {
+ //lt_ptr ptr = lt_dlsym(*I, symbolName);
+ void *ptr = dlsym(*I, symbolName);
+ if (ptr)
+ return ptr;
+ }
+
+#define EXPLICIT_SYMBOL(SYM) \
+ extern void *SYM; if (!strcmp(symbolName, #SYM)) return &SYM
+
+ // If this is darwin, it has some funky issues, try to solve them here. Some
+ // important symbols are marked 'private external' which doesn't allow
+ // SearchForAddressOfSymbol to find them. As such, we special case them here,
+ // there is only a small handful of them.
+
+#ifdef __APPLE__
+ {
+ EXPLICIT_SYMBOL(__ashldi3);
+ EXPLICIT_SYMBOL(__ashrdi3);
+ EXPLICIT_SYMBOL(__cmpdi2);
+ EXPLICIT_SYMBOL(__divdi3);
+ EXPLICIT_SYMBOL(__eprintf);
+ EXPLICIT_SYMBOL(__fixdfdi);
+ EXPLICIT_SYMBOL(__fixsfdi);
+ EXPLICIT_SYMBOL(__fixunsdfdi);
+ EXPLICIT_SYMBOL(__fixunssfdi);
+ EXPLICIT_SYMBOL(__floatdidf);
+ EXPLICIT_SYMBOL(__floatdisf);
+ EXPLICIT_SYMBOL(__lshrdi3);
+ EXPLICIT_SYMBOL(__moddi3);
+ EXPLICIT_SYMBOL(__udivdi3);
+ EXPLICIT_SYMBOL(__umoddi3);
+ }
+#endif
+
+#ifdef __CYGWIN__
+ {
+ EXPLICIT_SYMBOL(_alloca);
+ EXPLICIT_SYMBOL(__main);
+ }
+#endif
+
+#undef EXPLICIT_SYMBOL
+
+// This macro returns the address of a well-known, explicit symbol
+#define EXPLICIT_SYMBOL(SYM) \
+ if (!strcmp(symbolName, #SYM)) return &SYM
+
+// On linux we have a weird situation. The stderr/out/in symbols are both
+// macros and global variables because of standards requirements. So, we
+// boldly use the EXPLICIT_SYMBOL macro without checking for a #define first.
+#if defined(__linux__)
+ {
+ EXPLICIT_SYMBOL(stderr);
+ EXPLICIT_SYMBOL(stdout);
+ EXPLICIT_SYMBOL(stdin);
+ }
+#else
+ // For everything else, we want to check to make sure the symbol isn't defined
+ // as a macro before using EXPLICIT_SYMBOL.
+ {
+#ifndef stdin
+ EXPLICIT_SYMBOL(stdin);
+#endif
+#ifndef stdout
+ EXPLICIT_SYMBOL(stdout);
+#endif
+#ifndef stderr
+ EXPLICIT_SYMBOL(stderr);
+#endif
+ }
+#endif
+#undef EXPLICIT_SYMBOL
+
+ return 0;
+}
+
+#endif // LLVM_ON_WIN32
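A usage sketch for the resolver above; the library path and hook name are illustrative, not part of the import:

    #include "llvm/System/DynamicLibrary.h"
    #include <cstdio>
    #include <string>

    static int MyHook(int X) { return X + 1; }

    void example() {
      using llvm::sys::DynamicLibrary;
      // AddSymbol entries are consulted before any opened library.
      DynamicLibrary::AddSymbol("MyHook", (void *)&MyHook);

      std::string Err;
      if (DynamicLibrary::LoadLibraryPermanently("libm.so", &Err))
        std::fprintf(stderr, "load failed: %s\n", Err.c_str());

      // Finds &MyHook via the AddSymbol table, not dlsym.
      void *P = DynamicLibrary::SearchForAddressOfSymbol("MyHook");
      (void)P;
    }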
diff --git a/lib/System/Host.cpp b/lib/System/Host.cpp
new file mode 100644
index 0000000..fd2d952
--- /dev/null
+++ b/lib/System/Host.cpp
@@ -0,0 +1,24 @@
+//===-- Host.cpp - Implement OS Host Concept --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the operating system Host concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/Host.h"
+#include "llvm/Config/config.h"
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Host.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Win32/Host.inc"
+#endif
+
diff --git a/lib/System/IncludeFile.cpp b/lib/System/IncludeFile.cpp
new file mode 100644
index 0000000..8258d40
--- /dev/null
+++ b/lib/System/IncludeFile.cpp
@@ -0,0 +1,20 @@
+//===- lib/System/IncludeFile.cpp - Ensure Linking Of Implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IncludeFile constructor.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/IncludeFile.h"
+
+using namespace llvm;
+
+// This constructor is used to ensure linking of other modules. See the
+// llvm/System/IncludeFile.h header for details.
+IncludeFile::IncludeFile(const void*) {}
diff --git a/lib/System/LICENSE.TXT b/lib/System/LICENSE.TXT
new file mode 100644
index 0000000..f569da2
--- /dev/null
+++ b/lib/System/LICENSE.TXT
@@ -0,0 +1,6 @@
+LLVM System Interface Library
+-------------------------------------------------------------------------------
+The LLVM System Interface Library is licensed under the Illinois Open Source
+License and has the following additional copyright:
+
+Copyright (C) 2004 eXtensible Systems, Inc.
diff --git a/lib/System/Makefile b/lib/System/Makefile
new file mode 100644
index 0000000..49704c3
--- /dev/null
+++ b/lib/System/Makefile
@@ -0,0 +1,19 @@
+##===- lib/System/Makefile ---------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMSystem
+BUILD_ARCHIVE = 1
+
+EXTRA_DIST = Unix Win32 README.txt
+
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts := $(filter-out -pedantic,$(CompileCommonOpts))
+CompileCommonOpts := $(filter-out -Wno-long-long,$(CompileCommonOpts))
diff --git a/lib/System/Memory.cpp b/lib/System/Memory.cpp
new file mode 100644
index 0000000..375c73c
--- /dev/null
+++ b/lib/System/Memory.cpp
@@ -0,0 +1,62 @@
+//===- Memory.cpp - Memory Handling Support ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for allocating memory and dealing
+// with memory-mapped files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/Memory.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+using namespace sys;
+}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Memory.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Win32/Memory.inc"
+#endif
+
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+
+/// InvalidateInstructionCache - Before the JIT can run a block of code
+/// that has been emitted it must invalidate the instruction cache on some
+/// platforms.
+void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr,
+ size_t Len) {
+
+// icache invalidation for PPC and ARM.
+#if defined(__APPLE__)
+#if (defined(__POWERPC__) || defined (__ppc__) || \
+ defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
+ sys_icache_invalidate(Addr, Len);
+#endif
+#else
+#if (defined(__POWERPC__) || defined (__ppc__) || \
+ defined(_POWER) || defined(_ARCH_PPC)) && defined(__GNUC__)
+ const size_t LineSize = 32;
+
+ const intptr_t Mask = ~(LineSize - 1);
+ const intptr_t StartLine = ((intptr_t) Addr) & Mask;
+ const intptr_t EndLine = ((intptr_t) Addr + Len + LineSize - 1) & Mask;
+
+ for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+ asm volatile("dcbf 0, %0" : : "r"(Line));
+ asm volatile("sync");
+
+ for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+ asm volatile("icbi 0, %0" : : "r"(Line));
+ asm volatile("isync");
+#endif
+#endif // end apple
+}
diff --git a/lib/System/Mutex.cpp b/lib/System/Mutex.cpp
new file mode 100644
index 0000000..d95c25b
--- /dev/null
+++ b/lib/System/Mutex.cpp
@@ -0,0 +1,160 @@
+//===- Mutex.cpp - Mutual Exclusion Lock ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the llvm::sys::Mutex class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/System/Mutex.h"
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
+#if !defined(ENABLE_THREADS) || ENABLE_THREADS == 0
+// Define all methods as no-ops if threading is explicitly disabled
+namespace llvm {
+using namespace sys;
+Mutex::Mutex( bool recursive) { }
+Mutex::~Mutex() { }
+bool Mutex::acquire() { return true; }
+bool Mutex::release() { return true; }
+bool Mutex::tryacquire() { return true; }
+}
+#else
+
+#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_MUTEX_LOCK)
+
+#include <cassert>
+#include <pthread.h>
+#include <stdlib.h>
+
+namespace llvm {
+using namespace sys;
+
+
+// This variable is useful for situations where the pthread library has been
+// compiled with weak linkage for its interface symbols. This allows the
+// threading support to be turned off by simply not linking against -lpthread.
+// In that situation, the value of pthread_mutex_init will be 0 and
+// consequently pthread_enabled will be false. In such situations, all the
+// pthread operations become no-ops and the functions all return false. If
+// pthread_mutex_init does have an address, then mutex support is enabled.
+// Note: all LLVM tools will link against -lpthread if it's available since it
+// is configured into the LIBS variable.
+// Note: this line of code generates a warning if pthread_mutex_init is not
+// declared with weak linkage. It's safe to ignore the warning.
+static const bool pthread_enabled = true;
+
+// Construct a Mutex using pthread calls
+Mutex::Mutex( bool recursive)
+ : data_(0)
+{
+ if (pthread_enabled)
+ {
+ // Declare the pthread_mutex data structures
+ pthread_mutex_t* mutex =
+ static_cast<pthread_mutex_t*>(malloc(sizeof(pthread_mutex_t)));
+ pthread_mutexattr_t attr;
+
+ // Initialize the mutex attributes
+ int errorcode = pthread_mutexattr_init(&attr);
+ assert(errorcode == 0);
+
+ // Initialize the mutex as a recursive mutex, if requested, or normal
+ // otherwise.
+ int kind = ( recursive ? PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_NORMAL );
+ errorcode = pthread_mutexattr_settype(&attr, kind);
+ assert(errorcode == 0);
+
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) && !defined(__DragonFly__)
+ // Make it a process local mutex
+ errorcode = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_PRIVATE);
+#endif
+
+ // Initialize the mutex
+ errorcode = pthread_mutex_init(mutex, &attr);
+ assert(errorcode == 0);
+
+ // Destroy the attributes
+ errorcode = pthread_mutexattr_destroy(&attr);
+ assert(errorcode == 0);
+
+ // Assign the data member
+ data_ = mutex;
+ }
+}
+
+// Destruct a Mutex
+Mutex::~Mutex()
+{
+ if (pthread_enabled)
+ {
+ pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+ assert(mutex != 0);
+ pthread_mutex_destroy(mutex);
+ free(mutex);
+ }
+}
+
+bool
+Mutex::acquire()
+{
+ if (pthread_enabled)
+ {
+ pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+ assert(mutex != 0);
+
+ int errorcode = pthread_mutex_lock(mutex);
+ return errorcode == 0;
+ }
+ return false;
+}
+
+bool
+Mutex::release()
+{
+ if (pthread_enabled)
+ {
+ pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+ assert(mutex != 0);
+
+ int errorcode = pthread_mutex_unlock(mutex);
+ return errorcode == 0;
+ }
+ return false;
+}
+
+bool
+Mutex::tryacquire()
+{
+ if (pthread_enabled)
+ {
+ pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+ assert(mutex != 0);
+
+ int errorcode = pthread_mutex_trylock(mutex);
+ return errorcode == 0;
+ }
+ return false;
+}
+
+}
+
+#elif defined(LLVM_ON_UNIX)
+#include "Unix/Mutex.inc"
+#elif defined( LLVM_ON_WIN32)
+#include "Win32/Mutex.inc"
+#else
+#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in System/Mutex.cpp
+#endif
+#endif
+
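A minimal sketch of the locking API implemented above; acquire()/release() return true on success (and are always-true no-ops when threading is disabled):

    #include "llvm/System/Mutex.h"

    static llvm::sys::Mutex CounterLock(/*recursive=*/true);
    static int Counter;

    void increment() {
      CounterLock.acquire();
      ++Counter;                 // critical section
      CounterLock.release();
    }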
diff --git a/lib/System/Path.cpp b/lib/System/Path.cpp
new file mode 100644
index 0000000..72bd7ad
--- /dev/null
+++ b/lib/System/Path.cpp
@@ -0,0 +1,287 @@
+//===-- Path.cpp - Implement OS Path Concept --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the operating system Path concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/Path.h"
+#include "llvm/Config/config.h"
+#include <cassert>
+#include <cstring>
+#include <ostream>
+using namespace llvm;
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
+bool Path::operator==(const Path &that) const {
+ return path == that.path;
+}
+
+bool Path::operator!=(const Path &that) const {
+ return path != that.path;
+}
+
+bool Path::operator<(const Path& that) const {
+ return path < that.path;
+}
+
+std::ostream& llvm::operator<<(std::ostream &strm, const sys::Path &aPath) {
+ strm << aPath.toString();
+ return strm;
+}
+
+Path
+Path::GetLLVMConfigDir() {
+ Path result;
+#ifdef LLVM_ETCDIR
+ if (result.set(LLVM_ETCDIR))
+ return result;
+#endif
+ return GetLLVMDefaultConfigDir();
+}
+
+LLVMFileType
+sys::IdentifyFileType(const char *magic, unsigned length) {
+ assert(magic && "Invalid magic number string");
+ assert(length >=4 && "Invalid magic number length");
+ switch ((unsigned char)magic[0]) {
+ case 0xDE: // 0x0B17C0DE = BC wrapper
+ if (magic[1] == (char)0xC0 && magic[2] == (char)0x17 &&
+ magic[3] == (char)0x0B)
+ return Bitcode_FileType;
+ break;
+ case 'B':
+ if (magic[1] == 'C' && magic[2] == (char)0xC0 && magic[3] == (char)0xDE)
+ return Bitcode_FileType;
+ break;
+ case '!':
+ if (length >= 8)
+ if (memcmp(magic,"!<arch>\n",8) == 0)
+ return Archive_FileType;
+ break;
+
+ case '\177':
+ if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') {
+ if (length >= 18 && magic[17] == 0)
+ switch (magic[16]) {
+ default: break;
+ case 1: return ELF_Relocatable_FileType;
+ case 2: return ELF_Executable_FileType;
+ case 3: return ELF_SharedObject_FileType;
+ case 4: return ELF_Core_FileType;
+ }
+ }
+ break;
+
+ case 0xCA:
+ if (magic[1] == char(0xFE) && magic[2] == char(0xBA) &&
+ magic[3] == char(0xBE)) {
+ // This is complicated by an overlap with Java class files.
+ // See the Mach-O section in /usr/share/file/magic for details.
+ if (length >= 8 && magic[7] < 43)
+ // FIXME: Universal Binary of any type.
+ return Mach_O_DynamicallyLinkedSharedLib_FileType;
+ }
+ break;
+
+ case 0xFE:
+ case 0xCE: {
+ uint16_t type = 0;
+ if (magic[0] == char(0xFE) && magic[1] == char(0xED) &&
+ magic[2] == char(0xFA) && magic[3] == char(0xCE)) {
+ /* Native endian */
+ if (length >= 16) type = magic[14] << 8 | magic[15];
+ } else if (magic[0] == char(0xCE) && magic[1] == char(0xFA) &&
+ magic[2] == char(0xED) && magic[3] == char(0xFE)) {
+ /* Reverse endian */
+ if (length >= 14) type = magic[13] << 8 | magic[12];
+ }
+ switch (type) {
+ default: break;
+ case 1: return Mach_O_Object_FileType;
+ case 2: return Mach_O_Executable_FileType;
+ case 3: return Mach_O_FixedVirtualMemorySharedLib_FileType;
+ case 4: return Mach_O_Core_FileType;
+ case 5: return Mach_O_PreloadExectuable_FileType;
+ case 6: return Mach_O_DynamicallyLinkedSharedLib_FileType;
+ case 7: return Mach_O_DynamicLinker_FileType;
+ case 8: return Mach_O_Bundle_FileType;
+ case 9: return Mach_O_DynamicallyLinkedSharedLibStub_FileType;
+ case 10: break; // FIXME: MH_DSYM companion file with only debug.
+ }
+ break;
+ }
+ case 0xF0: // PowerPC Windows
+ case 0x83: // Alpha 32-bit
+ case 0x84: // Alpha 64-bit
+ case 0x66: // MIPS R4000 Windows
+ case 0x50: // mc68K
+ case 0x4c: // 80386 Windows
+ if (magic[1] == 0x01)
+ return COFF_FileType;
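+ // Note: control falls through to the 0x02 second-byte check below.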
+
+ case 0x90: // PA-RISC Windows
+ case 0x68: // mc68K Windows
+ if (magic[1] == 0x02)
+ return COFF_FileType;
+ break;
+
+ default:
+ break;
+ }
+ return Unknown_FileType;
+}
+
+bool
+Path::isArchive() const {
+ if (canRead())
+ return hasMagicNumber("!<arch>\012");
+ return false;
+}
+
+bool
+Path::isDynamicLibrary() const {
+ if (canRead()) {
+ std::string Magic;
+ if (getMagicNumber(Magic, 64))
+ switch (IdentifyFileType(Magic.c_str(),
+ static_cast<unsigned>(Magic.length()))) {
+ default: return false;
+ case Mach_O_FixedVirtualMemorySharedLib_FileType:
+ case Mach_O_DynamicallyLinkedSharedLib_FileType:
+ case Mach_O_DynamicallyLinkedSharedLibStub_FileType:
+ case ELF_SharedObject_FileType:
+ case COFF_FileType: return true;
+ }
+ }
+ return false;
+}
+
+Path
+Path::FindLibrary(std::string& name) {
+ std::vector<sys::Path> LibPaths;
+ GetSystemLibraryPaths(LibPaths);
+ for (unsigned i = 0; i < LibPaths.size(); ++i) {
+ sys::Path FullPath(LibPaths[i]);
+ FullPath.appendComponent("lib" + name + LTDL_SHLIB_EXT);
+ if (FullPath.isDynamicLibrary())
+ return FullPath;
+ FullPath.eraseSuffix();
+ FullPath.appendSuffix("a");
+ if (FullPath.isArchive())
+ return FullPath;
+ }
+ return sys::Path();
+}
+
+std::string Path::GetDLLSuffix() {
+ return LTDL_SHLIB_EXT;
+}
+
+bool
+Path::isBitcodeFile() const {
+ std::string actualMagic;
+ if (!getMagicNumber(actualMagic, 4))
+ return false;
+ LLVMFileType FT =
+ IdentifyFileType(actualMagic.c_str(),
+ static_cast<unsigned>(actualMagic.length()));
+ return FT == Bitcode_FileType;
+}
+
+bool Path::hasMagicNumber(const std::string &Magic) const {
+ std::string actualMagic;
+ if (getMagicNumber(actualMagic, static_cast<unsigned>(Magic.size())))
+ return Magic == actualMagic;
+ return false;
+}
+
+void Path::makeAbsolute() {
+ if (isAbsolute())
+ return;
+
+ Path CWD = Path::GetCurrentDirectory();
+ assert(CWD.isAbsolute() && "GetCurrentDirectory returned relative path!");
+
+ CWD.appendComponent(path);
+
+ path = CWD.toString();
+}
+
+static void getPathList(const char*path, std::vector<Path>& Paths) {
+ const char* at = path;
+ const char* delim = strchr(at, PathSeparator);
+ Path tmpPath;
+ while (delim != 0) {
+ std::string tmp(at, size_t(delim-at));
+ if (tmpPath.set(tmp))
+ if (tmpPath.canRead())
+ Paths.push_back(tmpPath);
+ at = delim + 1;
+ delim = strchr(at, PathSeparator);
+ }
+
+ if (*at != 0)
+ if (tmpPath.set(std::string(at)))
+ if (tmpPath.canRead())
+ Paths.push_back(tmpPath);
+}
+
+static std::string getDirnameCharSep(const std::string& path, char Sep) {
+
+ if (path.empty())
+ return ".";
+
+ // If the path is all slashes, return a single slash.
+ // Otherwise, remove all trailing slashes.
+
+ signed pos = static_cast<signed>(path.size()) - 1;
+
+ while (pos >= 0 && path[pos] == Sep)
+ --pos;
+
+ if (pos < 0)
+ return path[0] == Sep ? std::string(1, Sep) : std::string(".");
+
+ // Any slashes left?
+ signed i = 0;
+
+ while (i < pos && path[i] != Sep)
+ ++i;
+
+ if (i == pos) // No slashes? Return "."
+ return ".";
+
+ // There is at least one slash left. Remove all trailing non-slashes.
+ while (pos >= 0 && path[pos] != Sep)
+ --pos;
+
+ // Remove any trailing slashes.
+ while (pos >= 0 && path[pos] == Sep)
+ --pos;
+
+ if (pos < 0)
+ return path[0] == Sep ? std::string(1, Sep) : std::string(".");
+
+ return path.substr(0, pos+1);
+}
+
+// Include the truly platform-specific parts of this class.
+#if defined(LLVM_ON_UNIX)
+#include "Unix/Path.inc"
+#endif
+#if defined(LLVM_ON_WIN32)
+#include "Win32/Path.inc"
+#endif
+
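A usage sketch tying the helpers above together, assuming Path's string constructor; the file names are illustrative:

    #include "llvm/System/Path.h"
    #include <cassert>

    void example() {
      // IdentifyFileType inspects only the magic bytes it is handed and
      // asserts length >= 4 (see above).
      const char Magic[] = { 'B', 'C', (char)0xC0, (char)0xDE };
      assert(llvm::sys::IdentifyFileType(Magic, 4) ==
             llvm::sys::Bitcode_FileType);

      llvm::sys::Path P("docs/notes.txt");
      P.makeAbsolute();              // prefix the current directory
      P.eraseComponent();            // drop "notes.txt"
      P.appendComponent("todo.txt"); // .../docs/todo.txt
    }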
diff --git a/lib/System/Process.cpp b/lib/System/Process.cpp
new file mode 100644
index 0000000..e93b2af
--- /dev/null
+++ b/lib/System/Process.cpp
@@ -0,0 +1,33 @@
+//===-- Process.cpp - Implement OS Process Concept --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the operating system Process concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/Process.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
+}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Process.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Win32/Process.inc"
+#endif
diff --git a/lib/System/Program.cpp b/lib/System/Program.cpp
new file mode 100644
index 0000000..eb289d8
--- /dev/null
+++ b/lib/System/Program.cpp
@@ -0,0 +1,33 @@
+//===-- Program.cpp - Implement OS Program Concept --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the operating system Program concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/Program.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
+}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Program.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Win32/Program.inc"
+#endif
diff --git a/lib/System/README.txt b/lib/System/README.txt
new file mode 100644
index 0000000..eacb200
--- /dev/null
+++ b/lib/System/README.txt
@@ -0,0 +1,43 @@
+Design Of lib/System
+====================
+
+The software in this directory is designed to completely shield LLVM from any
+and all operating system specific functionality. It is not intended to be a
+complete operating system wrapper (such as ACE), but only to provide the
+functionality necessary to support LLVM.
+
+The software located here, of necessity, has very specific and stringent design
+rules. Violation of these rules means that cracks in the shield could form and
+the primary goal of the library would be defeated. By consistently using this
+library, LLVM becomes more easily ported to new platforms, since the only thing
+requiring porting is this library.
+
+Complete documentation for the library can be found in the file:
+ llvm/docs/SystemLibrary.html
+or at this URL:
+ http://llvm.org/docs/SystemLibrary.html
+
+While we recommend that you read the more detailed documentation, for the
+impatient, here's a high-level summary of the library's requirements.
+
+ 1. No system header files are to be exposed through the interface.
+ 2. Std C++ and Std C header files are okay to be exposed through the interface.
+ 3. No exposed system-specific functions.
+ 4. No exposed system-specific data.
+ 5. Data in lib/System classes must use only simple C++ intrinsic types.
+ 6. Errors are handled by returning "true" and setting an optional std::string
+ (error convention sketched below).
+ 7. Library must not throw any exceptions, period.
+ 8. Interface functions must not have throw() specifications.
+ 9. No duplicate function implementations are permitted within an operating
+ system class.
+
+To accomplish these requirements, the library has numerous design criteria that
+must be satisfied. Here's a high-level summary of the library's design criteria:
+
+ 1. No unused functionality (only what LLVM needs)
+ 2. High-Level Interfaces
+ 3. Use Opaque Classes
+ 4. Common Implementations
+ 5. Multiple Implementations
+ 6. Minimize Memory Allocation
+ 7. No Virtual Methods
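
To make rule 6 of the requirements concrete, here is a minimal sketch of the
error-reporting convention, assuming a hypothetical doSomething() helper (the
name and file path are illustrative, not part of the library):

    #include <cstdio>
    #include <string>

    // Returns true on failure; on failure, describes the error in *ErrMsg
    // when the caller supplied one. No exceptions are thrown (rule 7).
    static bool doSomething(std::string *ErrMsg) {
      std::FILE *F = std::fopen("/nonexistent", "r");
      if (!F) {
        if (ErrMsg)
          *ErrMsg = "can't open /nonexistent";
        return true;   // failure
      }
      std::fclose(F);
      return false;    // success
    }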
diff --git a/lib/System/Signals.cpp b/lib/System/Signals.cpp
new file mode 100644
index 0000000..d345b0a
--- /dev/null
+++ b/lib/System/Signals.cpp
@@ -0,0 +1,34 @@
+//===- Signals.cpp - Signal Handling support --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// Unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/Signals.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
+}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Signals.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Win32/Signals.inc"
+#endif
diff --git a/lib/System/TimeValue.cpp b/lib/System/TimeValue.cpp
new file mode 100644
index 0000000..cf4984c
--- /dev/null
+++ b/lib/System/TimeValue.cpp
@@ -0,0 +1,58 @@
+//===-- TimeValue.cpp - Implement OS TimeValue Concept ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the operating system TimeValue concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/System/TimeValue.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+using namespace sys;
+
+const TimeValue TimeValue::MinTime = TimeValue ( INT64_MIN,0 );
+const TimeValue TimeValue::MaxTime = TimeValue ( INT64_MAX,0 );
+const TimeValue TimeValue::ZeroTime = TimeValue ( 0,0 );
+const TimeValue TimeValue::PosixZeroTime = TimeValue ( -946684800,0 );
+const TimeValue TimeValue::Win32ZeroTime = TimeValue ( -12591158400ULL,0 );
+
+void
+TimeValue::normalize( void ) {
+ if ( nanos_ >= NANOSECONDS_PER_SECOND ) {
+ do {
+ seconds_++;
+ nanos_ -= NANOSECONDS_PER_SECOND;
+ } while ( nanos_ >= NANOSECONDS_PER_SECOND );
+ } else if (nanos_ <= -NANOSECONDS_PER_SECOND ) {
+ do {
+ seconds_--;
+ nanos_ += NANOSECONDS_PER_SECOND;
+ } while (nanos_ <= -NANOSECONDS_PER_SECOND);
+ }
+
+ if (seconds_ >= 1 && nanos_ < 0) {
+ seconds_--;
+ nanos_ += NANOSECONDS_PER_SECOND;
+ } else if (seconds_ < 0 && nanos_ > 0) {
+ seconds_++;
+ nanos_ -= NANOSECONDS_PER_SECOND;
+ }
+}
+
+}
+
+/// Include the platform specific portion of TimeValue class
+#ifdef LLVM_ON_UNIX
+#include "Unix/TimeValue.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Win32/TimeValue.inc"
+#endif
+
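As a worked example of normalize() above, here is a stand-alone restatement
under the assumption that NANOSECONDS_PER_SECOND == 1000000000 (the real
constant lives in llvm/System/TimeValue.h):

    #include <cassert>
    #include <stdint.h>

    static const int64_t NS_PER_SEC = 1000000000;

    // Same fixups as TimeValue::normalize(): shed whole seconds, then make
    // the signs of the two fields agree.
    static void normalize(int64_t &secs, int64_t &nanos) {
      while (nanos >= NS_PER_SEC)  { ++secs; nanos -= NS_PER_SEC; }
      while (nanos <= -NS_PER_SEC) { --secs; nanos += NS_PER_SEC; }
      if (secs >= 1 && nanos < 0)     { --secs; nanos += NS_PER_SEC; }
      else if (secs < 0 && nanos > 0) { ++secs; nanos -= NS_PER_SEC; }
    }

    int main() {
      int64_t s = 1, n = -200000000;
      normalize(s, n);
      assert(s == 0 && n == 800000000);   // borrowed one second
      s = 0; n = 2500000000LL;
      normalize(s, n);
      assert(s == 2 && n == 500000000);   // carried two whole seconds
      return 0;
    }
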
diff --git a/lib/System/Unix/Alarm.inc b/lib/System/Unix/Alarm.inc
new file mode 100644
index 0000000..28ff1b8
--- /dev/null
+++ b/lib/System/Unix/Alarm.inc
@@ -0,0 +1,72 @@
+//===-- Alarm.inc - Implement Unix Alarm Support ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the UNIX Alarm support.
+//
+//===----------------------------------------------------------------------===//
+
+#include <signal.h>
+#include <unistd.h>
+#include <cassert>
+using namespace llvm;
+
+/// AlarmCancelled - This flag is set by the SIGINT signal handler if the
+/// user presses CTRL-C.
+static volatile bool AlarmCancelled = false;
+
+/// AlarmTriggered - This flag is set by the SIGALRM signal handler if the
+/// alarm was triggered.
+static volatile bool AlarmTriggered = false;
+
+/// NestedSOI - Sanity check. Alarms cannot be nested or run in parallel.
+/// This flag ensures that they never are.
+static bool NestedSOI = false;
+
+static RETSIGTYPE SigIntHandler(int Sig) {
+ AlarmCancelled = true;
+ signal(SIGINT, SigIntHandler);
+}
+
+static RETSIGTYPE SigAlarmHandler(int Sig) {
+ AlarmTriggered = true;
+}
+
+static void (*OldSigIntHandler) (int);
+
+void sys::SetupAlarm(unsigned seconds) {
+ assert(!NestedSOI && "sys::SetupAlarm calls cannot be nested!");
+ NestedSOI = true;
+ AlarmCancelled = false;
+ AlarmTriggered = false;
+ ::signal(SIGALRM, SigAlarmHandler);
+ OldSigIntHandler = ::signal(SIGINT, SigIntHandler);
+ ::alarm(seconds);
+}
+
+void sys::TerminateAlarm() {
+ assert(NestedSOI && "sys::TerminateAlarm called without sys::SetupAlarm!");
+ ::alarm(0);
+ ::signal(SIGALRM, SIG_DFL);
+ ::signal(SIGINT, OldSigIntHandler);
+ AlarmCancelled = false;
+ AlarmTriggered = false;
+ NestedSOI = false;
+}
+
+int sys::AlarmStatus() {
+ if (AlarmCancelled)
+ return -1;
+ if (AlarmTriggered)
+ return 1;
+ return 0;
+}
+
+void Sleep(unsigned n) {
+ ::sleep(n);
+}
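
A hypothetical use of the alarm API implemented above, polling AlarmStatus()
from a long-running loop (SetupAlarm and friends are assumed to be declared in
llvm/System/Alarm.h):

    #include "llvm/System/Alarm.h"
    using namespace llvm;

    // Run units of work for at most 30 seconds, or until the user hits
    // CTRL-C. AlarmStatus() returns 0 while neither has happened.
    void runWithTimeout() {
      sys::SetupAlarm(30);
      while (sys::AlarmStatus() == 0) {
        // ... do one bounded unit of work ...
      }
      // Status 1 means the alarm fired; -1 means the user pressed CTRL-C.
      sys::TerminateAlarm();
    }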
diff --git a/lib/System/Unix/Host.inc b/lib/System/Unix/Host.inc
new file mode 100644
index 0000000..fb319fd
--- /dev/null
+++ b/lib/System/Unix/Host.inc
@@ -0,0 +1,58 @@
+//===- llvm/System/Unix/Host.inc --------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the UNIX Host support.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include <llvm/Config/config.h>
+#include "Unix.h"
+#include <sys/utsname.h>
+#include <string>
+
+using namespace llvm;
+
+static std::string getOSVersion() {
+ struct utsname info;
+
+ if (uname(&info))
+ return "";
+
+ return info.release;
+}
+
+std::string sys::getHostTriple() {
+ // FIXME: Derive more directly instead of relying on the autoconf
+ // generated variable.
+
+ std::string Triple = LLVM_HOSTTRIPLE;
+
+ // Force i<N>86 to i386.
+ if (Triple[0] == 'i' && isdigit(Triple[1]) &&
+ Triple[2] == '8' && Triple[3] == '6')
+ Triple[1] = '3';
+
+ // On darwin, we want to update the version to match that of the
+ // host.
+ std::string::size_type DarwinDashIdx = Triple.find("-darwin");
+ if (DarwinDashIdx != std::string::npos) {
+ Triple.resize(DarwinDashIdx + strlen("-darwin"));
+
+ // Only add the major part of the os version.
+ std::string Version = getOSVersion();
+ Triple += Version.substr(0, Version.find('.'));
+ }
+
+ return Triple;
+}
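
Tracing getHostTriple() above: an autoconf triple such as i686-pc-linux-gnu
comes back as i386-pc-linux-gnu, and on Darwin a LLVM_HOSTTRIPLE like
x86_64-apple-darwin9.6.0 is cut at "-darwin" and re-suffixed with the major
release reported by uname, giving x86_64-apple-darwin9. A stand-alone
restatement of the i<N>86 rewrite (with an added length guard that the
macro-supplied triple makes unnecessary above):

    #include <cctype>
    #include <string>

    // Rewrite i486/i586/i686/... to the canonical i386.
    static void canonicalizeX86(std::string &Triple) {
      if (Triple.size() >= 4 && Triple[0] == 'i' && isdigit(Triple[1]) &&
          Triple[2] == '8' && Triple[3] == '6')
        Triple[1] = '3';
    }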
diff --git a/lib/System/Unix/Memory.inc b/lib/System/Unix/Memory.inc
new file mode 100644
index 0000000..b7a7013
--- /dev/null
+++ b/lib/System/Unix/Memory.inc
@@ -0,0 +1,150 @@
+//===- Unix/Memory.cpp - Generic UNIX System Configuration ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some functions for various memory management utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include "llvm/System/Process.h"
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef __APPLE__
+#include <mach/mach.h>
+#endif
+
+/// AllocateRWX - Allocate a slab of memory with read/write/execute
+/// permissions. This is typically used for JIT applications where we want
+/// to emit code to the memory then jump to it. Getting this type of memory
+/// is very OS specific.
+///
+llvm::sys::MemoryBlock
+llvm::sys::Memory::AllocateRWX(unsigned NumBytes, const MemoryBlock* NearBlock,
+ std::string *ErrMsg) {
+ if (NumBytes == 0) return MemoryBlock();
+
+ unsigned pageSize = Process::GetPageSize();
+ unsigned NumPages = (NumBytes+pageSize-1)/pageSize;
+
+ int fd = -1;
+#ifdef NEED_DEV_ZERO_FOR_MMAP
+ static int zero_fd = open("/dev/zero", O_RDWR);
+ if (zero_fd == -1) {
+ MakeErrMsg(ErrMsg, "Can't open /dev/zero device");
+ return MemoryBlock();
+ }
+ fd = zero_fd;
+#endif
+
+ int flags = MAP_PRIVATE |
+#ifdef HAVE_MMAP_ANONYMOUS
+ MAP_ANONYMOUS
+#else
+ MAP_ANON
+#endif
+ ;
+
+ void* start = NearBlock ? (unsigned char*)NearBlock->base() +
+ NearBlock->size() : 0;
+
+#if defined(__APPLE__) && defined(__arm__)
+ void *pa = ::mmap(start, pageSize*NumPages, PROT_READ|PROT_EXEC,
+ flags, fd, 0);
+#else
+ void *pa = ::mmap(start, pageSize*NumPages, PROT_READ|PROT_WRITE|PROT_EXEC,
+ flags, fd, 0);
+#endif
+ if (pa == MAP_FAILED) {
+ if (NearBlock) //Try again without a near hint
+ return AllocateRWX(NumBytes, 0);
+
+ MakeErrMsg(ErrMsg, "Can't allocate RWX Memory");
+ return MemoryBlock();
+ }
+
+#if defined(__APPLE__) && defined(__arm__)
+ kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)pa,
+ (vm_size_t)(pageSize*NumPages), 0,
+ VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
+ if (KERN_SUCCESS != kr) {
+ MakeErrMsg(ErrMsg, "vm_protect max RX failed");
+ return sys::MemoryBlock();
+ }
+
+ kr = vm_protect(mach_task_self(), (vm_address_t)pa,
+ (vm_size_t)(pageSize*NumPages), 0,
+ VM_PROT_READ | VM_PROT_WRITE);
+ if (KERN_SUCCESS != kr) {
+ MakeErrMsg(ErrMsg, "vm_protect RW failed");
+ return sys::MemoryBlock();
+ }
+#endif
+
+ MemoryBlock result;
+ result.Address = pa;
+ result.Size = NumPages*pageSize;
+
+ return result;
+}
+
+bool llvm::sys::Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
+ if (M.Address == 0 || M.Size == 0) return false;
+ if (0 != ::munmap(M.Address, M.Size))
+ return MakeErrMsg(ErrMsg, "Can't release RWX Memory");
+ return false;
+}
+
+bool llvm::sys::Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
+#if defined(__APPLE__) && defined(__arm__)
+ if (M.Address == 0 || M.Size == 0) return false;
+ sys::Memory::InvalidateInstructionCache(M.Address, M.Size);
+ kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
+ (vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_WRITE);
+ return KERN_SUCCESS == kr;
+#else
+ return true;
+#endif
+}
+
+bool llvm::sys::Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
+#if defined(__APPLE__) && defined(__arm__)
+ if (M.Address == 0 || M.Size == 0) return false;
+ sys::Memory::InvalidateInstructionCache(M.Address, M.Size);
+ kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
+ (vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
+ return KERN_SUCCESS == kr;
+#else
+ return false;
+#endif
+}
+
+bool llvm::sys::Memory::setRangeWritable(const void *Addr, size_t Size) {
+#if defined(__APPLE__) && defined(__arm__)
+ kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
+ (vm_size_t)Size, 0,
+ VM_PROT_READ | VM_PROT_WRITE);
+ return KERN_SUCCESS == kr;
+#else
+ return true;
+#endif
+}
+
+bool llvm::sys::Memory::setRangeExecutable(const void *Addr, size_t Size) {
+#if defined(__APPLE__) && defined(__arm__)
+ kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
+ (vm_size_t)Size, 0,
+ VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
+ return KERN_SUCCESS == kr;
+#else
+ return true;
+#endif
+}
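
A hypothetical JIT-style client of the API above: allocate an RWX block, copy
pre-encoded machine code into it, call it, and release it. Error handling is
kept minimal, and on ARM/Darwin the setWritable/setExecutable dance shown
above would also be required around the copy:

    #include "llvm/System/Memory.h"
    #include <cstring>
    using namespace llvm;

    typedef int (*EntryFn)();

    // Returns the emitted code's result, or -1 if the allocation failed.
    int runBlob(const unsigned char *Code, unsigned Size) {
      std::string Err;
      sys::MemoryBlock MB = sys::Memory::AllocateRWX(Size, 0, &Err);
      if (MB.base() == 0)
        return -1;                        // Err holds the reason
      std::memcpy(MB.base(), Code, Size); // emit the code
      int Result = ((EntryFn)MB.base())();
      sys::Memory::ReleaseRWX(MB, &Err);
      return Result;
    }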
diff --git a/lib/System/Unix/Mutex.inc b/lib/System/Unix/Mutex.inc
new file mode 100644
index 0000000..4a015a6
--- /dev/null
+++ b/lib/System/Unix/Mutex.inc
@@ -0,0 +1,49 @@
+//===- llvm/System/Unix/Mutex.inc - Unix Mutex Implementation ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific (non-pthread) Mutex class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+namespace llvm
+{
+using namespace sys;
+
+Mutex::Mutex( bool recursive)
+{
+}
+
+Mutex::~Mutex()
+{
+}
+
+bool
+Mutex::acquire()
+{
+ return true;
+}
+
+bool
+Mutex::release()
+{
+ return true;
+}
+
+bool
+Mutex::tryacquire( void )
+{
+ return true;
+}
+
+}
diff --git a/lib/System/Unix/Path.inc b/lib/System/Unix/Path.inc
new file mode 100644
index 0000000..d5edee1
--- /dev/null
+++ b/lib/System/Unix/Path.inc
@@ -0,0 +1,876 @@
+//===- llvm/System/Unix/Path.cpp - Unix Path Implementation -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific portion of the Path class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/alloca.h"
+#include "Unix.h"
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_UTIME_H
+#include <utime.h>
+#endif
+#if HAVE_TIME_H
+#include <time.h>
+#endif
+#if HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+# include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+# include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+# include <ndir.h>
+# endif
+#endif
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+// Put in a hack for Cygwin which falsely reports that the mkdtemp function
+// is available when it is not.
+#ifdef __CYGWIN__
+# undef HAVE_MKDTEMP
+#endif
+
+namespace {
+inline bool lastIsSlash(const std::string& path) {
+ return !path.empty() && path[path.length() - 1] == '/';
+}
+
+}
+
+namespace llvm {
+using namespace sys;
+
+extern const char sys::PathSeparator = ':';
+
+Path::Path(const std::string& p)
+ : path(p) {}
+
+Path::Path(const char *StrStart, unsigned StrLen)
+ : path(StrStart, StrLen) {}
+
+Path&
+Path::operator=(const std::string &that) {
+ path = that;
+ return *this;
+}
+
+bool
+Path::isValid() const {
+ // Check some obvious things
+ if (path.empty())
+ return false;
+ else if (path.length() >= MAXPATHLEN)
+ return false;
+
+ // Check that the characters are ascii chars
+ size_t len = path.length();
+ unsigned i = 0;
+ while (i < len && isascii(path[i]))
+ ++i;
+ return i >= len;
+}
+
+bool
+Path::isAbsolute() const {
+ if (path.empty())
+ return false;
+ return path[0] == '/';
+}
+Path
+Path::GetRootDirectory() {
+ Path result;
+ result.set("/");
+ return result;
+}
+
+Path
+Path::GetTemporaryDirectory(std::string *ErrMsg) {
+#if defined(HAVE_MKDTEMP)
+ // The best way is with mkdtemp, but that's not available on many systems;
+ // Linux and FreeBSD have it. Others probably won't.
+ char pathname[MAXPATHLEN];
+ strcpy(pathname,"/tmp/llvm_XXXXXX");
+ if (0 == mkdtemp(pathname)) {
+ MakeErrMsg(ErrMsg,
+ std::string(pathname) + ": can't create temporary directory");
+ return Path();
+ }
+ Path result;
+ result.set(pathname);
+ assert(result.isValid() && "mkdtemp didn't create a valid pathname!");
+ return result;
+#elif defined(HAVE_MKSTEMP)
+ // If no mkdtemp is available, mkstemp can be used to create a temporary file
+ // which is then removed and created as a directory. We prefer this over
+ // mktemp because of mktemp's inherent security and threading risks. We still
+ // have a slight race condition from the time the temporary file is created to
+ // the time it is re-created as a directory.
+ char pathname[MAXPATHLEN];
+ strcpy(pathname, "/tmp/llvm_XXXXXX");
+ int fd = 0;
+ if (-1 == (fd = mkstemp(pathname))) {
+ MakeErrMsg(ErrMsg,
+ std::string(pathname) + ": can't create temporary directory");
+ return Path();
+ }
+ ::close(fd);
+ ::unlink(pathname); // start race condition, ignore errors
+ if (-1 == ::mkdir(pathname, S_IRWXU)) { // end race condition
+ MakeErrMsg(ErrMsg,
+ std::string(pathname) + ": can't create temporary directory");
+ return Path();
+ }
+ Path result;
+ result.set(pathname);
+ assert(result.isValid() && "mkstemp didn't create a valid pathname!");
+ return result;
+#elif defined(HAVE_MKTEMP)
+ // If a system doesn't have mkdtemp(3) or mkstemp(3) but it does have
+ // mktemp(3) then we'll assume that system (e.g. AIX) has a reasonable
+ // implementation of mktemp(3) and doesn't follow BSD 4.3's lead of replacing
+ // the XXXXXX with the pid of the process and a letter. That leads to only
+ // twenty-six temporary files that can be generated.
+ char pathname[MAXPATHLEN];
+ strcpy(pathname, "/tmp/llvm_XXXXXX");
+ char *TmpName = ::mktemp(pathname);
+ if (TmpName == 0) {
+ MakeErrMsg(ErrMsg,
+ std::string(TmpName) + ": can't create unique directory name");
+ return Path();
+ }
+ if (-1 == ::mkdir(TmpName, S_IRWXU)) {
+ MakeErrMsg(ErrMsg,
+ std::string(TmpName) + ": can't create temporary directory");
+ return Path();
+ }
+ Path result;
+ result.set(TmpName);
+ assert(result.isValid() && "mktemp didn't create a valid pathname!");
+ return result;
+#else
+ // This is the worst-case implementation. tempnam(3) leaks memory unless it's
+ // on an SVID2 (or later) system. On BSD 4.3 it leaks. tmpnam(3) has thread
+ // issues. The mktemp(3) function doesn't have enough variability in the
+ // temporary name generated. So, we provide our own implementation that
+ // increments an integer starting from the current time. This
+ // should be sufficiently unique that we don't have many collisions between
+ // processes. Generally LLVM processes don't run very long and don't use very
+ // many temporary files so this shouldn't be a big issue for LLVM.
+ static time_t num = ::time(0);
+ char pathname[MAXPATHLEN];
+ do {
+ num++;
+ sprintf(pathname, "/tmp/llvm_%010u", unsigned(num));
+ } while ( 0 == access(pathname, F_OK ) );
+ if (-1 == ::mkdir(pathname, S_IRWXU)) {
+ MakeErrMsg(ErrMsg,
+ std::string(pathname) + ": can't create temporary directory");
+ return Path();
+ }
+ Path result;
+ result.set(pathname);
+ assert(result.isValid() && "temp directory name isn't a valid pathname!");
+ return result;
+#endif
+}
+
+void
+Path::GetSystemLibraryPaths(std::vector<sys::Path>& Paths) {
+#ifdef LTDL_SHLIBPATH_VAR
+ char* env_var = getenv(LTDL_SHLIBPATH_VAR);
+ if (env_var != 0) {
+ getPathList(env_var,Paths);
+ }
+#endif
+ // FIXME: Should this look at LD_LIBRARY_PATH too?
+ Paths.push_back(sys::Path("/usr/local/lib/"));
+ Paths.push_back(sys::Path("/usr/X11R6/lib/"));
+ Paths.push_back(sys::Path("/usr/lib/"));
+ Paths.push_back(sys::Path("/lib/"));
+}
+
+void
+Path::GetBitcodeLibraryPaths(std::vector<sys::Path>& Paths) {
+ char * env_var = getenv("LLVM_LIB_SEARCH_PATH");
+ if (env_var != 0) {
+ getPathList(env_var,Paths);
+ }
+#ifdef LLVM_LIBDIR
+ {
+ Path tmpPath;
+ if (tmpPath.set(LLVM_LIBDIR))
+ if (tmpPath.canRead())
+ Paths.push_back(tmpPath);
+ }
+#endif
+ GetSystemLibraryPaths(Paths);
+}
+
+Path
+Path::GetLLVMDefaultConfigDir() {
+ return Path("/etc/llvm/");
+}
+
+Path
+Path::GetUserHomeDirectory() {
+ const char* home = getenv("HOME");
+ if (home) {
+ Path result;
+ if (result.set(home))
+ return result;
+ }
+ return GetRootDirectory();
+}
+
+Path
+Path::GetCurrentDirectory() {
+ char pathname[MAXPATHLEN];
+ if (!getcwd(pathname,MAXPATHLEN)) {
+ assert (false && "Could not query current working directory.");
+ return Path("");
+ }
+
+ return Path(pathname);
+}
+
+#ifdef __FreeBSD__
+static int
+test_dir(char buf[PATH_MAX], char ret[PATH_MAX],
+ const char *dir, const char *bin)
+{
+ struct stat sb;
+
+ snprintf(buf, PATH_MAX, "%s//%s", dir, bin);
+ if (realpath(buf, ret) == NULL)
+ return (1);
+ if (stat(buf, &sb) != 0)
+ return (1);
+
+ return (0);
+}
+
+static char *
+getprogpath(char ret[PATH_MAX], const char *bin)
+{
+ char *pv, *s, *t, buf[PATH_MAX];
+
+ /* First approach: absolute path. */
+ if (bin[0] == '/') {
+ if (test_dir(buf, ret, "/", bin) == 0)
+ return (ret);
+ return (NULL);
+ }
+
+ /* Second approach: relative path. */
+ if (strchr(bin, '/') != NULL) {
+ if (getcwd(buf, PATH_MAX) == NULL)
+ return (NULL);
+ if (test_dir(buf, ret, buf, bin) == 0)
+ return (ret);
+ return (NULL);
+ }
+
+ /* Third approach: $PATH */
+ if ((pv = getenv("PATH")) == NULL)
+ return (NULL);
+ s = pv = strdup(pv);
+ if (pv == NULL)
+ return (NULL);
+ while ((t = strsep(&s, ":")) != NULL) {
+ if (test_dir(buf, ret, t, bin) == 0) {
+ free(pv);
+ return (ret);
+ }
+ }
+ free(pv);
+ return (NULL);
+}
+#endif
+
+/// GetMainExecutable - Return the path to the main executable, given the
+/// value of argv[0] from program startup.
+Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
+#if defined(__FreeBSD__)
+ char exe_path[PATH_MAX];
+
+ if (getprogpath(exe_path, argv0) != NULL)
+ return Path(std::string(exe_path));
+#elif defined(__linux__) || defined(__CYGWIN__)
+ char exe_path[MAXPATHLEN];
+ ssize_t len = readlink("/proc/self/exe", exe_path, sizeof(exe_path));
+ if (len > 0 && len < MAXPATHLEN - 1) {
+ exe_path[len] = '\0';
+ return Path(std::string(exe_path));
+ }
+#elif defined(HAVE_DLFCN_H)
+ // Use dladdr to get executable path if available.
+ Dl_info DLInfo;
+ int err = dladdr(MainAddr, &DLInfo);
+ if (err == 0)
+ return Path();
+
+ // If the filename is a symlink, we need to resolve and return the location of
+ // the actual executable.
+ char link_path[MAXPATHLEN];
+ return Path(std::string(realpath(DLInfo.dli_fname, link_path)));
+#endif
+ return Path();
+}
+
+
+std::string Path::getDirname() const {
+ return getDirnameCharSep(path, '/');
+}
+
+std::string
+Path::getBasename() const {
+ // Find the last slash
+ std::string::size_type slash = path.rfind('/');
+ if (slash == std::string::npos)
+ slash = 0;
+ else
+ slash++;
+
+ std::string::size_type dot = path.rfind('.');
+ if (dot == std::string::npos || dot < slash)
+ return path.substr(slash);
+ else
+ return path.substr(slash, dot - slash);
+}
+
+std::string
+Path::getSuffix() const {
+ // Find the last slash
+ std::string::size_type slash = path.rfind('/');
+ if (slash == std::string::npos)
+ slash = 0;
+ else
+ slash++;
+
+ std::string::size_type dot = path.rfind('.');
+ if (dot == std::string::npos || dot < slash)
+ return std::string();
+ else
+ return path.substr(dot + 1);
+}
+
+bool Path::getMagicNumber(std::string& Magic, unsigned len) const {
+ assert(len < 1024 && "Request for magic string too long");
+ char* buf = (char*) alloca(1 + len);
+ int fd = ::open(path.c_str(), O_RDONLY);
+ if (fd < 0)
+ return false;
+ ssize_t bytes_read = ::read(fd, buf, len);
+ ::close(fd);
+ if (ssize_t(len) != bytes_read) {
+ Magic.clear();
+ return false;
+ }
+ Magic.assign(buf,len);
+ return true;
+}
+
+bool
+Path::exists() const {
+ return 0 == access(path.c_str(), F_OK );
+}
+
+bool
+Path::isDirectory() const {
+ struct stat buf;
+ if (0 != stat(path.c_str(), &buf))
+ return false;
+ return buf.st_mode & S_IFDIR ? true : false;
+}
+
+bool
+Path::canRead() const {
+ return 0 == access(path.c_str(), F_OK | R_OK );
+}
+
+bool
+Path::canWrite() const {
+ return 0 == access(path.c_str(), F_OK | W_OK );
+}
+
+bool
+Path::canExecute() const {
+ if (0 != access(path.c_str(), R_OK | X_OK ))
+ return false;
+ struct stat buf;
+ if (0 != stat(path.c_str(), &buf))
+ return false;
+ if (!S_ISREG(buf.st_mode))
+ return false;
+ return true;
+}
+
+std::string
+Path::getLast() const {
+ // Find the last slash
+ size_t pos = path.rfind('/');
+
+ // Handle the corner cases
+ if (pos == std::string::npos)
+ return path;
+
+ // If the last character is a slash
+ if (pos == path.length()-1) {
+ // Find the second to last slash
+ size_t pos2 = path.rfind('/', pos-1);
+ if (pos2 == std::string::npos)
+ return path.substr(0,pos);
+ else
+ return path.substr(pos2+1,pos-pos2-1);
+ }
+ // Return everything after the last slash
+ return path.substr(pos+1);
+}
+
+const FileStatus *
+PathWithStatus::getFileStatus(bool update, std::string *ErrStr) const {
+ if (!fsIsValid || update) {
+ struct stat buf;
+ if (0 != stat(path.c_str(), &buf)) {
+ MakeErrMsg(ErrStr, path + ": can't get status of file");
+ return 0;
+ }
+ status.fileSize = buf.st_size;
+ status.modTime.fromEpochTime(buf.st_mtime);
+ status.mode = buf.st_mode;
+ status.user = buf.st_uid;
+ status.group = buf.st_gid;
+ status.uniqueID = uint64_t(buf.st_ino);
+ status.isDir = S_ISDIR(buf.st_mode);
+ status.isFile = S_ISREG(buf.st_mode);
+ fsIsValid = true;
+ }
+ return &status;
+}
+
+static bool AddPermissionBits(const Path &File, int bits) {
+ // Get the umask value from the operating system. We want to use it
+ // when changing the file's permissions. Since calling umask() sets
+ // the umask and returns its old value, we must call it a second
+ // time to reset it to the user's preference.
+ int mask = umask(0777); // The arg. to umask is arbitrary.
+ umask(mask); // Restore the umask.
+
+ // Get the file's current mode.
+ struct stat buf;
+ if (0 != stat(File.toString().c_str(), &buf))
+ return false;
+ // Change the file to have whichever permissions bits from 'bits'
+ // that the umask would not disable.
+ if ((chmod(File.c_str(), (buf.st_mode | (bits & ~mask)))) == -1)
+ return false;
+ return true;
+}
+
+bool Path::makeReadableOnDisk(std::string* ErrMsg) {
+ if (!AddPermissionBits(*this, 0444))
+ return MakeErrMsg(ErrMsg, path + ": can't make file readable");
+ return false;
+}
+
+bool Path::makeWriteableOnDisk(std::string* ErrMsg) {
+ if (!AddPermissionBits(*this, 0222))
+ return MakeErrMsg(ErrMsg, path + ": can't make file writable");
+ return false;
+}
+
+bool Path::makeExecutableOnDisk(std::string* ErrMsg) {
+ if (!AddPermissionBits(*this, 0111))
+ return MakeErrMsg(ErrMsg, path + ": can't make file executable");
+ return false;
+}
+
+bool
+Path::getDirectoryContents(std::set<Path>& result, std::string* ErrMsg) const {
+ DIR* direntries = ::opendir(path.c_str());
+ if (direntries == 0)
+ return MakeErrMsg(ErrMsg, path + ": can't open directory");
+
+ std::string dirPath = path;
+ if (!lastIsSlash(dirPath))
+ dirPath += '/';
+
+ result.clear();
+ struct dirent* de = ::readdir(direntries);
+ for ( ; de != 0; de = ::readdir(direntries)) {
+ if (de->d_name[0] != '.') {
+ Path aPath(dirPath + (const char*)de->d_name);
+ struct stat st;
+ if (0 != lstat(aPath.path.c_str(), &st)) {
+ if (S_ISLNK(st.st_mode))
+ continue; // dangling symlink -- ignore
+ return MakeErrMsg(ErrMsg,
+ aPath.path + ": can't determine file object type");
+ }
+ result.insert(aPath);
+ }
+ }
+
+ closedir(direntries);
+ return false;
+}
+
+bool
+Path::set(const std::string& a_path) {
+ if (a_path.empty())
+ return false;
+ std::string save(path);
+ path = a_path;
+ if (!isValid()) {
+ path = save;
+ return false;
+ }
+ return true;
+}
+
+bool
+Path::appendComponent(const std::string& name) {
+ if (name.empty())
+ return false;
+ std::string save(path);
+ if (!lastIsSlash(path))
+ path += '/';
+ path += name;
+ if (!isValid()) {
+ path = save;
+ return false;
+ }
+ return true;
+}
+
+bool
+Path::eraseComponent() {
+ size_t slashpos = path.rfind('/',path.size());
+ if (slashpos == 0 || slashpos == std::string::npos) {
+ path.erase();
+ return true;
+ }
+ if (slashpos == path.size() - 1)
+ slashpos = path.rfind('/',slashpos-1);
+ if (slashpos == std::string::npos) {
+ path.erase();
+ return true;
+ }
+ path.erase(slashpos);
+ return true;
+}
+
+bool
+Path::appendSuffix(const std::string& suffix) {
+ std::string save(path);
+ path.append(".");
+ path.append(suffix);
+ if (!isValid()) {
+ path = save;
+ return false;
+ }
+ return true;
+}
+
+bool
+Path::eraseSuffix() {
+ std::string save = path;
+ size_t dotpos = path.rfind('.',path.size());
+ size_t slashpos = path.rfind('/',path.size());
+ if (dotpos != std::string::npos) {
+ if (slashpos == std::string::npos || dotpos > slashpos+1) {
+ path.erase(dotpos, path.size()-dotpos);
+ return true;
+ }
+ }
+ if (!isValid())
+ path = save;
+ return false;
+}
+
+static bool createDirectoryHelper(char* beg, char* end, bool create_parents) {
+
+ if (access(beg, F_OK | R_OK | W_OK) == 0)
+ return false;
+
+ if (create_parents) {
+
+ char* c = end;
+
+ for (; c != beg; --c)
+ if (*c == '/') {
+
+ // Recurse to handling the parent directory.
+ *c = '\0';
+ bool x = createDirectoryHelper(beg, c, create_parents);
+ *c = '/';
+
+ // Return if we encountered an error.
+ if (x)
+ return true;
+
+ break;
+ }
+ }
+
+ return mkdir(beg, S_IRWXU | S_IRWXG) != 0;
+}
+
+bool
+Path::createDirectoryOnDisk( bool create_parents, std::string* ErrMsg ) {
+ // Get a writeable copy of the path name
+ char pathname[MAXPATHLEN];
+ path.copy(pathname,MAXPATHLEN);
+
+ // Null-terminate the last component
+ size_t lastchar = path.length() - 1 ;
+
+ if (pathname[lastchar] != '/')
+ ++lastchar;
+
+ pathname[lastchar] = 0;
+
+ if (createDirectoryHelper(pathname, pathname+lastchar, create_parents))
+ return MakeErrMsg(ErrMsg,
+ std::string(pathname) + ": can't create directory");
+
+ return false;
+}
+
+bool
+Path::createFileOnDisk(std::string* ErrMsg) {
+ // Create the file
+ int fd = ::creat(path.c_str(), S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ return MakeErrMsg(ErrMsg, path + ": can't create file");
+ ::close(fd);
+ return false;
+}
+
+bool
+Path::createTemporaryFileOnDisk(bool reuse_current, std::string* ErrMsg) {
+ // Make this into a unique file name
+ if (makeUnique( reuse_current, ErrMsg ))
+ return true;
+
+ // create the file
+ int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
+ if (fd < 0)
+ return MakeErrMsg(ErrMsg, path + ": can't create temporary file");
+ ::close(fd);
+ return false;
+}
+
+bool
+Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const {
+ // Get the status so we can determine if it's a file or directory.
+ struct stat buf;
+ if (0 != stat(path.c_str(), &buf)) {
+ MakeErrMsg(ErrStr, path + ": can't get status of file");
+ return true;
+ }
+
+ // Note: this check catches strange situations. In all cases, LLVM should
+ // only be involved in the creation and deletion of regular files. This
+ // check ensures that what we're trying to erase is a regular file. It
+ // effectively prevents LLVM from erasing things like /dev/null, any block
+ // special file, or other things that aren't "regular" files.
+ if (S_ISREG(buf.st_mode)) {
+ if (unlink(path.c_str()) != 0)
+ return MakeErrMsg(ErrStr, path + ": can't destroy file");
+ return false;
+ }
+
+ if (!S_ISDIR(buf.st_mode)) {
+ if (ErrStr) *ErrStr = "not a file or directory";
+ return true;
+ }
+
+ if (remove_contents) {
+ // Recursively descend the directory to remove its contents.
+ std::string cmd = "/bin/rm -rf " + path;
+ if (system(cmd.c_str()) != 0) {
+ MakeErrMsg(ErrStr, path + ": failed to recursively remove directory.");
+ return true;
+ }
+ return false;
+ }
+
+ // Otherwise, try to just remove the one directory.
+ char pathname[MAXPATHLEN];
+ path.copy(pathname, MAXPATHLEN);
+ size_t lastchar = path.length() - 1;
+ if (pathname[lastchar] == '/')
+ pathname[lastchar] = 0;
+ else
+ pathname[lastchar+1] = 0;
+
+ if (rmdir(pathname) != 0)
+ return MakeErrMsg(ErrStr,
+ std::string(pathname) + ": can't erase directory");
+ return false;
+}
+
+bool
+Path::renamePathOnDisk(const Path& newName, std::string* ErrMsg) {
+ if (0 != ::rename(path.c_str(), newName.c_str()))
+ return MakeErrMsg(ErrMsg, std::string("can't rename '") + path + "' as '" +
+ newName.toString() + "'");
+ return false;
+}
+
+bool
+Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrStr) const {
+ struct utimbuf utb;
+ utb.actime = si.modTime.toPosixTime();
+ utb.modtime = utb.actime;
+ if (0 != ::utime(path.c_str(),&utb))
+ return MakeErrMsg(ErrStr, path + ": can't set file modification time");
+ if (0 != ::chmod(path.c_str(),si.mode))
+ return MakeErrMsg(ErrStr, path + ": can't set mode");
+ return false;
+}
+
+bool
+sys::CopyFile(const sys::Path &Dest, const sys::Path &Src, std::string* ErrMsg){
+ int inFile = -1;
+ int outFile = -1;
+ inFile = ::open(Src.c_str(), O_RDONLY);
+ if (inFile == -1)
+ return MakeErrMsg(ErrMsg, Src.toString() +
+ ": can't open source file to copy");
+
+ outFile = ::open(Dest.c_str(), O_WRONLY|O_CREAT, 0666);
+ if (outFile == -1) {
+ ::close(inFile);
+ return MakeErrMsg(ErrMsg, Dest.toString() +
+ ": can't create destination file for copy");
+ }
+
+ char Buffer[16*1024];
+ while (ssize_t Amt = ::read(inFile, Buffer, 16*1024)) {
+ if (Amt == -1) {
+ if (errno != EINTR && errno != EAGAIN) {
+ ::close(inFile);
+ ::close(outFile);
+ return MakeErrMsg(ErrMsg, Src.toString()+": can't read source file");
+ }
+ } else {
+ char *BufPtr = Buffer;
+ while (Amt) {
+ ssize_t AmtWritten = ::write(outFile, BufPtr, Amt);
+ if (AmtWritten == -1) {
+ if (errno != EINTR && errno != EAGAIN) {
+ ::close(inFile);
+ ::close(outFile);
+ return MakeErrMsg(ErrMsg, Dest.toString() +
+ ": can't write destination file");
+ }
+ } else {
+ Amt -= AmtWritten;
+ BufPtr += AmtWritten;
+ }
+ }
+ }
+ }
+ ::close(inFile);
+ ::close(outFile);
+ return false;
+}
+
+bool
+Path::makeUnique(bool reuse_current, std::string* ErrMsg) {
+ if (reuse_current && !exists())
+ return false; // File doesn't exist already, just use it!
+
+ // Append an XXXXXX pattern to the end of the file for use with mkstemp,
+ // mktemp or our own implementation.
+ char *FNBuffer = (char*) alloca(path.size()+8);
+ path.copy(FNBuffer,path.size());
+ if (isDirectory())
+ strcpy(FNBuffer+path.size(), "/XXXXXX");
+ else
+ strcpy(FNBuffer+path.size(), "-XXXXXX");
+
+#if defined(HAVE_MKSTEMP)
+ int TempFD;
+ if ((TempFD = mkstemp(FNBuffer)) == -1)
+ return MakeErrMsg(ErrMsg, path + ": can't make unique filename");
+
+ // We don't need to hold the temp file descriptor... we will trust that no one
+ // will overwrite/delete the file before we can open it again.
+ close(TempFD);
+
+ // Save the name
+ path = FNBuffer;
+#elif defined(HAVE_MKTEMP)
+ // If we don't have mkstemp, use the old and obsolete mktemp function.
+ if (mktemp(FNBuffer) == 0)
+ return MakeErrMsg(ErrMsg, path + ": can't make unique filename");
+
+ // Save the name
+ path = FNBuffer;
+#else
+ // Okay, looks like we have to do it all by our lonesome.
+ static unsigned FCounter = 0;
+ unsigned offset = path.size() + 1;
+ while ( FCounter < 999999 && exists()) {
+ sprintf(FNBuffer+offset,"%06u",++FCounter);
+ path = FNBuffer;
+ }
+ if (FCounter > 999999)
+ return MakeErrMsg(ErrMsg,
+ path + ": can't make unique filename: too many files");
+#endif
+ return false;
+}
+
+const char *Path::MapInFilePages(int FD, uint64_t FileSize) {
+ int Flags = MAP_PRIVATE;
+#ifdef MAP_FILE
+ Flags |= MAP_FILE;
+#endif
+ void *BasePtr = ::mmap(0, FileSize, PROT_READ, Flags, FD, 0);
+ if (BasePtr == MAP_FAILED)
+ return 0;
+ return (const char*)BasePtr;
+}
+
+void Path::UnMapFilePages(const char *BasePtr, uint64_t FileSize) {
+ ::munmap((void*)BasePtr, FileSize);
+}
+
+} // end llvm namespace
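
Some traced expectations for the name-manipulation helpers above (illustrative
only, derived by hand from the code):

    // For sys::Path P("/tmp/archive.tar.gz"):
    //   P.getLast()     == "archive.tar.gz"
    //   P.getBasename() == "archive.tar"   // up to the last dot
    //   P.getSuffix()   == "gz"            // after the last dot
    //   P.getDirname()  == "/tmp"
    //
    // For sys::Path P("/tmp/dir/"):
    //   P.getLast()     == "dir"           // trailing slash is skipped
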
diff --git a/lib/System/Unix/Process.inc b/lib/System/Unix/Process.inc
new file mode 100644
index 0000000..74b9bb8
--- /dev/null
+++ b/lib/System/Unix/Process.inc
@@ -0,0 +1,237 @@
+//===- Unix/Process.cpp - Unix Process Implementation ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the generic Unix implementation of the Process class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_SYS_IOCTL_H
+# include <sys/ioctl.h>
+#endif
+#ifdef HAVE_TERMIOS_H
+# include <termios.h>
+#endif
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+using namespace llvm;
+using namespace sys;
+
+unsigned
+Process::GetPageSize()
+{
+#if defined(__CYGWIN__)
+ // On Cygwin, getpagesize() returns 64k but the page size for the purposes of
+ // memory protection and mmap() is 4k.
+ // See http://www.cygwin.com/ml/cygwin/2009-01/threads.html#00492
+ static const int page_size = 0x1000;
+#elif defined(HAVE_GETPAGESIZE)
+ static const int page_size = ::getpagesize();
+#elif defined(HAVE_SYSCONF)
+ static long page_size = ::sysconf(_SC_PAGE_SIZE);
+#else
+#warning Cannot get the page size on this machine
+#endif
+ return static_cast<unsigned>(page_size);
+}
+
+size_t Process::GetMallocUsage() {
+#if defined(HAVE_MALLINFO)
+ struct mallinfo mi;
+ mi = ::mallinfo();
+ return mi.uordblks;
+#elif defined(HAVE_MALLOC_ZONE_STATISTICS) && defined(HAVE_MALLOC_MALLOC_H)
+ malloc_statistics_t Stats;
+ malloc_zone_statistics(malloc_default_zone(), &Stats);
+ return Stats.size_in_use; // darwin
+#elif defined(HAVE_SBRK)
+ // Note this is only an approximation and more closely resembles
+ // the value returned by mallinfo in the arena field.
+ static char *StartOfMemory = reinterpret_cast<char*>(::sbrk(0));
+ char *EndOfMemory = (char*)sbrk(0);
+ if (EndOfMemory != ((char*)-1) && StartOfMemory != ((char*)-1))
+ return EndOfMemory - StartOfMemory;
+ else
+ return 0;
+#else
+#warning Cannot get malloc info on this platform
+ return 0;
+#endif
+}
+
+size_t
+Process::GetTotalMemoryUsage()
+{
+#if defined(HAVE_MALLINFO)
+ struct mallinfo mi = ::mallinfo();
+ return mi.uordblks + mi.hblkhd;
+#elif defined(HAVE_MALLOC_ZONE_STATISTICS) && defined(HAVE_MALLOC_MALLOC_H)
+ malloc_statistics_t Stats;
+ malloc_zone_statistics(malloc_default_zone(), &Stats);
+ return Stats.size_allocated; // darwin
+#elif defined(HAVE_GETRUSAGE)
+ struct rusage usage;
+ ::getrusage(RUSAGE_SELF, &usage);
+ return usage.ru_maxrss;
+#else
+#warning Cannot get total memory size on this platform
+ return 0;
+#endif
+}
+
+void
+Process::GetTimeUsage(TimeValue& elapsed, TimeValue& user_time,
+ TimeValue& sys_time)
+{
+ elapsed = TimeValue::now();
+#if defined(HAVE_GETRUSAGE)
+ struct rusage usage;
+ ::getrusage(RUSAGE_SELF, &usage);
+ user_time = TimeValue(
+ static_cast<TimeValue::SecondsType>( usage.ru_utime.tv_sec ),
+ static_cast<TimeValue::NanoSecondsType>( usage.ru_utime.tv_usec *
+ TimeValue::NANOSECONDS_PER_MICROSECOND ) );
+ sys_time = TimeValue(
+ static_cast<TimeValue::SecondsType>( usage.ru_stime.tv_sec ),
+ static_cast<TimeValue::NanoSecondsType>( usage.ru_stime.tv_usec *
+ TimeValue::NANOSECONDS_PER_MICROSECOND ) );
+#else
+#warning Cannot get usage times on this platform
+ user_time.seconds(0);
+ user_time.microseconds(0);
+ sys_time.seconds(0);
+ sys_time.microseconds(0);
+#endif
+}
+
+int Process::GetCurrentUserId() {
+ return getuid();
+}
+
+int Process::GetCurrentGroupId() {
+ return getgid();
+}
+
+#ifdef HAVE_MACH_MACH_H
+#include <mach/mach.h>
+#endif
+
+// Some LLVM programs such as bugpoint produce core files as a normal part of
+// their operation. To prevent the disk from filling up, this function
+// does what's necessary to prevent their generation.
+void Process::PreventCoreFiles() {
+#if HAVE_SETRLIMIT
+ struct rlimit rlim;
+ rlim.rlim_cur = rlim.rlim_max = 0;
+ setrlimit(RLIMIT_CORE, &rlim);
+#endif
+
+#ifdef HAVE_MACH_MACH_H
+ // Disable crash reporting on Mac OS X 10.0-10.4
+
+ // get information about the original set of exception ports for the task
+ mach_msg_type_number_t Count = 0;
+ exception_mask_t OriginalMasks[EXC_TYPES_COUNT];
+ exception_port_t OriginalPorts[EXC_TYPES_COUNT];
+ exception_behavior_t OriginalBehaviors[EXC_TYPES_COUNT];
+ thread_state_flavor_t OriginalFlavors[EXC_TYPES_COUNT];
+ kern_return_t err =
+ task_get_exception_ports(mach_task_self(), EXC_MASK_ALL, OriginalMasks,
+ &Count, OriginalPorts, OriginalBehaviors,
+ OriginalFlavors);
+ if (err == KERN_SUCCESS) {
+ // replace each with MACH_PORT_NULL.
+ for (unsigned i = 0; i != Count; ++i)
+ task_set_exception_ports(mach_task_self(), OriginalMasks[i],
+ MACH_PORT_NULL, OriginalBehaviors[i],
+ OriginalFlavors[i]);
+ }
+
+ // Disable crash reporting on Mac OS X 10.5
+ signal(SIGABRT, _exit);
+ signal(SIGILL, _exit);
+ signal(SIGFPE, _exit);
+ signal(SIGSEGV, _exit);
+ signal(SIGBUS, _exit);
+#endif
+}
+
+bool Process::StandardInIsUserInput() {
+#if HAVE_ISATTY
+ return isatty(0);
+#endif
+ // If we don't have isatty, just return false.
+ return false;
+}
+
+bool Process::StandardOutIsDisplayed() {
+#if HAVE_ISATTY
+ return isatty(1);
+#endif
+ // If we don't have isatty, just return false.
+ return false;
+}
+
+bool Process::StandardErrIsDisplayed() {
+#if HAVE_ISATTY
+ return isatty(2);
+#endif
+ // If we don't have isatty, just return false.
+ return false;
+}
+
+static unsigned getColumns(int FileID) {
+ // If COLUMNS is defined in the environment, wrap to that many columns.
+ if (const char *ColumnsStr = std::getenv("COLUMNS")) {
+ int Columns = std::atoi(ColumnsStr);
+ if (Columns > 0)
+ return Columns;
+ }
+
+ unsigned Columns = 0;
+
+#if defined(HAVE_SYS_IOCTL_H) && defined(HAVE_TERMIOS_H)
+ // Try to determine the width of the terminal.
+ struct winsize ws;
+ if (ioctl(FileID, TIOCGWINSZ, &ws) == 0)
+ Columns = ws.ws_col;
+#endif
+
+ return Columns;
+}
+
+unsigned Process::StandardOutColumns() {
+ if (!StandardOutIsDisplayed())
+ return 0;
+
+ return getColumns(1);
+}
+
+unsigned Process::StandardErrColumns() {
+ if (!StandardErrIsDisplayed())
+ return 0;
+
+ return getColumns(2);
+}
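
A small hypothetical consumer of the terminal-width queries above; a zero
return means the width could not be determined, so callers typically fall
back to a fixed width:

    #include "llvm/System/Process.h"
    using namespace llvm;

    // Width to wrap diagnostics at: the real terminal width when stdout is
    // a terminal, otherwise a conventional 80 columns.
    unsigned wrapWidth() {
      unsigned Cols = sys::Process::StandardOutColumns();
      return Cols ? Cols : 80;
    }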
diff --git a/lib/System/Unix/Program.inc b/lib/System/Unix/Program.inc
new file mode 100644
index 0000000..cdc6fee
--- /dev/null
+++ b/lib/System/Unix/Program.inc
@@ -0,0 +1,287 @@
+//===- llvm/System/Unix/Program.cpp -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific portion of the Program class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include <llvm/Config/config.h>
+#include "Unix.h"
+#include <iostream>
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+#if HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+#if HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+
+namespace llvm {
+using namespace sys;
+
+// This function just uses the PATH environment variable to find the program.
+Path
+Program::FindProgramByName(const std::string& progName) {
+
+ // Check some degenerate cases
+ if (progName.length() == 0) // no program
+ return Path();
+ Path temp;
+ if (!temp.set(progName)) // invalid name
+ return Path();
+ // FIXME: have to check for absolute filename - we cannot assume anything
+ // about "." being in $PATH
+ if (temp.canExecute()) // already executable as is
+ return temp;
+
+ // At this point, the file name is valid and it's not executable.
+
+ // Get the path. If it's empty, we can't do anything to find it.
+ const char *PathStr = getenv("PATH");
+ if (PathStr == 0)
+ return Path();
+
+ // Now we have a colon separated list of directories to search; try them.
+ size_t PathLen = strlen(PathStr);
+ while (PathLen) {
+ // Find the first colon...
+ const char *Colon = std::find(PathStr, PathStr+PathLen, ':');
+
+ // Check to see if this first directory contains the executable...
+ Path FilePath;
+ if (FilePath.set(std::string(PathStr,Colon))) {
+ FilePath.appendComponent(progName);
+ if (FilePath.canExecute())
+ return FilePath; // Found the executable!
+ }
+
+ // Nope, it wasn't in this directory; check the next path in the list!
+ PathLen -= Colon-PathStr;
+ PathStr = Colon;
+
+ // Advance past duplicate colons
+ while (*PathStr == ':') {
+ PathStr++;
+ PathLen--;
+ }
+ }
+ return Path();
+}
+
+static bool RedirectIO(const Path *Path, int FD, std::string* ErrMsg) {
+ if (Path == 0)
+ // Noop
+ return false;
+ std::string File;
+ if (Path->isEmpty())
+ // Redirect empty paths to /dev/null
+ File = "/dev/null";
+ else
+ File = Path->toString();
+
+ // Open the file
+ int InFD = open(File.c_str(), FD == 0 ? O_RDONLY : O_WRONLY|O_CREAT, 0666);
+ if (InFD == -1) {
+ MakeErrMsg(ErrMsg, "Cannot open file '" + File + "' for "
+ + (FD == 0 ? "input" : "output"));
+ return true;
+ }
+
+ // Install it as the requested FD
+ if (-1 == dup2(InFD, FD)) {
+ MakeErrMsg(ErrMsg, "Cannot dup2");
+ return true;
+ }
+ close(InFD); // Close the original FD
+ return false;
+}
+
+static bool Timeout = false;
+static void TimeOutHandler(int Sig) {
+ Timeout = true;
+}
+
+static void SetMemoryLimits (unsigned size)
+{
+#if HAVE_SYS_RESOURCE_H
+ struct rlimit r;
+ __typeof__ (r.rlim_cur) limit = (__typeof__ (r.rlim_cur)) (size) * 1048576;
+
+ // Heap size
+ getrlimit (RLIMIT_DATA, &r);
+ r.rlim_cur = limit;
+ setrlimit (RLIMIT_DATA, &r);
+#ifdef RLIMIT_RSS
+ // Resident set size.
+ getrlimit (RLIMIT_RSS, &r);
+ r.rlim_cur = limit;
+ setrlimit (RLIMIT_RSS, &r);
+#endif
+#ifdef RLIMIT_AS // e.g. NetBSD doesn't have it.
+ // Virtual memory.
+ getrlimit (RLIMIT_AS, &r);
+ r.rlim_cur = limit;
+ setrlimit (RLIMIT_AS, &r);
+#endif
+#endif
+}
+
+int
+Program::ExecuteAndWait(const Path& path,
+ const char** args,
+ const char** envp,
+ const Path** redirects,
+ unsigned secondsToWait,
+ unsigned memoryLimit,
+ std::string* ErrMsg)
+{
+ if (!path.canExecute()) {
+ if (ErrMsg)
+ *ErrMsg = path.toString() + " is not executable";
+ return -1;
+ }
+
+#ifdef HAVE_SYS_WAIT_H
+ // Create a child process.
+ int child = fork();
+ switch (child) {
+ // An error occurred: return to the caller.
+ case -1:
+ MakeErrMsg(ErrMsg, "Couldn't fork");
+ return -1;
+
+ // Child process: Execute the program.
+ case 0: {
+ // Redirect file descriptors...
+ if (redirects) {
+ // Redirect stdin
+ if (RedirectIO(redirects[0], 0, ErrMsg)) { return -1; }
+ // Redirect stdout
+ if (RedirectIO(redirects[1], 1, ErrMsg)) { return -1; }
+ if (redirects[1] && redirects[2] &&
+ *(redirects[1]) == *(redirects[2])) {
+ // If stdout and stderr should go to the same place, redirect stderr
+ // to the FD already open for stdout.
+ if (-1 == dup2(1,2)) {
+ MakeErrMsg(ErrMsg, "Can't redirect stderr to stdout");
+ return -1;
+ }
+ } else {
+ // Just redirect stderr
+ if (RedirectIO(redirects[2], 2, ErrMsg)) { return -1; }
+ }
+ }
+
+ // Set memory limits
+ if (memoryLimit!=0) {
+ SetMemoryLimits(memoryLimit);
+ }
+
+ // Execute!
+ if (envp != 0)
+ execve (path.c_str(), (char**)args, (char**)envp);
+ else
+ execv (path.c_str(), (char**)args);
+ // If the execve() failed, we should exit and let the parent pick up
+ // our non-zero exit status.
+ exit (errno);
+ }
+
+ // Parent process: Break out of the switch to do our processing.
+ default:
+ break;
+ }
+
+ // Make sure stderr and stdout have been flushed
+ std::cerr << std::flush;
+ std::cout << std::flush;
+ fsync(1);
+ fsync(2);
+
+ struct sigaction Act, Old;
+
+ // Install a timeout handler.
+ if (secondsToWait) {
+ Timeout = false;
+ Act.sa_sigaction = 0;
+ Act.sa_handler = TimeOutHandler;
+ sigemptyset(&Act.sa_mask);
+ Act.sa_flags = 0;
+ sigaction(SIGALRM, &Act, &Old);
+ alarm(secondsToWait);
+ }
+
+ // Parent process: Wait for the child process to terminate.
+ int status;
+ while (wait(&status) != child)
+ if (secondsToWait && errno == EINTR) {
+ // Kill the child.
+ kill(child, SIGKILL);
+
+ // Turn off the alarm and restore the signal handler
+ alarm(0);
+ sigaction(SIGALRM, &Old, 0);
+
+ // Wait for child to die
+ if (wait(&status) != child)
+ MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
+ else
+ MakeErrMsg(ErrMsg, "Child timed out", 0);
+
+ return -1; // Timeout detected
+ } else if (errno != EINTR) {
+ MakeErrMsg(ErrMsg, "Error waiting for child process");
+ return -1;
+ }
+
+ // We exited normally without timeout, so turn off the timer.
+ if (secondsToWait) {
+ alarm(0);
+ sigaction(SIGALRM, &Old, 0);
+ }
+
+ // Return the proper exit status. 0 means success, >0 is the program's exit
+ // status, <0 means it died from a signal, and bit 0x01000000 is set if it
+ // dumped core.
+ int result = 0;
+ if (WIFEXITED(status))
+ result = WEXITSTATUS(status);
+ else if (WIFSIGNALED(status))
+ result = 0 - WTERMSIG(status);
+#ifdef WCOREDUMP
+ else if (WCOREDUMP(status))
+ result |= 0x01000000;
+#endif
+ return result;
+#else
+ return -99;
+#endif
+
+}
+
+bool Program::ChangeStdinToBinary(){
+ // Do nothing, as Unix doesn't differentiate between text and binary.
+ return false;
+}
+
+bool Program::ChangeStdoutToBinary(){
+ // Do nothing, as Unix doesn't differentiate between text and binary.
+ return false;
+}
+
+}
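
A hypothetical caller of the API implemented above: run /bin/ls with stdout
and stderr collected into one file and a ten-second timeout (the file names
are illustrative):

    #include "llvm/System/Program.h"
    #include "llvm/System/Path.h"
    #include <string>
    using namespace llvm;

    int runLs(std::string *ErrMsg) {
      sys::Path Ls("/bin/ls");
      const char *Args[] = { "ls", "-l", 0 };      // argv-style, NULL-ended
      sys::Path Out("ls.out");
      // {stdin, stdout, stderr}; a null entry leaves that stream alone.
      // Passing the same Path for stdout and stderr triggers the dup2(1,2)
      // case above instead of opening the file twice.
      const sys::Path *Redirects[] = { 0, &Out, &Out };
      return sys::Program::ExecuteAndWait(Ls, Args, 0 /*inherit environ*/,
                                          Redirects, 10 /*secondsToWait*/,
                                          0 /*no memory limit*/, ErrMsg);
    }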
diff --git a/lib/System/Unix/README.txt b/lib/System/Unix/README.txt
new file mode 100644
index 0000000..b3bace4
--- /dev/null
+++ b/lib/System/Unix/README.txt
@@ -0,0 +1,16 @@
+llvm/lib/System/Unix README
+===========================
+
+This directory provides implementations of the lib/System classes that
+are common to two or more variants of UNIX. For example, the directory
+structure underneath this directory could look like this:
+
+Unix - only code that is truly generic to all UNIX platforms
+ Posix - code that is specific to Posix variants of UNIX
+ SUS - code that is specific to the Single Unix Specification
+ SysV - code that is specific to System V variants of UNIX
+
+As a rule, only those directories actually needing to be created should be
+created. Also, further subdirectories could be created to reflect versions of
+the various standards. For example, under SUS there could be v1, v2, and v3
+subdirectories to reflect the three major versions of SUS.
diff --git a/lib/System/Unix/Signals.inc b/lib/System/Unix/Signals.inc
new file mode 100644
index 0000000..e385e0c
--- /dev/null
+++ b/lib/System/Unix/Signals.inc
@@ -0,0 +1,230 @@
+//===- Signals.cpp - Generic Unix Signals Implementation -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// Unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include "llvm/ADT/STLExtras.h"
+#include <vector>
+#include <algorithm>
+#if HAVE_EXECINFO_H
+# include <execinfo.h> // For backtrace().
+#endif
+#if HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_DLFCN_H && __GNUG__
+#include <dlfcn.h>
+#include <cxxabi.h>
+#endif
+using namespace llvm;
+
+static RETSIGTYPE SignalHandler(int Sig); // defined below.
+
+/// InterruptFunction - The function to call if ctrl-c is pressed.
+static void (*InterruptFunction)() = 0;
+
+static std::vector<sys::Path> *FilesToRemove = 0;
+static std::vector<std::pair<void(*)(void*), void*> > *CallBacksToRun = 0;
+
+// IntSigs - Signals that may interrupt the program at any time.
+static const int IntSigs[] = {
+ SIGHUP, SIGINT, SIGQUIT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2
+};
+static const int *const IntSigsEnd =
+ IntSigs + sizeof(IntSigs) / sizeof(IntSigs[0]);
+
+// KillSigs - Signals that are synchronous with the program that will cause it
+// to die.
+static const int KillSigs[] = {
+ SIGILL, SIGTRAP, SIGABRT, SIGFPE, SIGBUS, SIGSEGV, SIGSYS, SIGXCPU, SIGXFSZ
+#ifdef SIGEMT
+ , SIGEMT
+#endif
+};
+static const int *const KillSigsEnd =
+ KillSigs + sizeof(KillSigs) / sizeof(KillSigs[0]);
+
+static unsigned NumRegisteredSignals = 0;
+static struct {
+ struct sigaction SA;
+ int SigNo;
+} RegisteredSignalInfo[(sizeof(IntSigs)+sizeof(KillSigs))/sizeof(KillSigs[0])];
+
+
+static void RegisterHandler(int Signal) {
+ assert(NumRegisteredSignals <
+ sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) &&
+ "Out of space for signal handlers!");
+
+ struct sigaction NewHandler;
+
+ NewHandler.sa_handler = SignalHandler;
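+  // SA_NODEFER leaves the signal unblocked while the handler runs, and
+  // SA_RESETHAND restores the default disposition on entry to the handler,
+  // so a crash inside the handler kills the process instead of recursing.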
+ NewHandler.sa_flags = SA_NODEFER|SA_RESETHAND;
+ sigemptyset(&NewHandler.sa_mask);
+
+ // Install the new handler, save the old one in RegisteredSignalInfo.
+ sigaction(Signal, &NewHandler,
+ &RegisteredSignalInfo[NumRegisteredSignals].SA);
+ RegisteredSignalInfo[NumRegisteredSignals].SigNo = Signal;
+ ++NumRegisteredSignals;
+}
+
+static void RegisterHandlers() {
+ // If the handlers are already registered, we're done.
+ if (NumRegisteredSignals != 0) return;
+
+ std::for_each(IntSigs, IntSigsEnd, RegisterHandler);
+ std::for_each(KillSigs, KillSigsEnd, RegisterHandler);
+}
+
+static void UnregisterHandlers() {
+ // Restore all of the signal handlers to how they were before we showed up.
+ for (unsigned i = 0, e = NumRegisteredSignals; i != e; ++i)
+ sigaction(RegisteredSignalInfo[i].SigNo,
+ &RegisteredSignalInfo[i].SA, 0);
+ NumRegisteredSignals = 0;
+}
+
+
+
+// SignalHandler - The signal handler that runs.
+static RETSIGTYPE SignalHandler(int Sig) {
+  // Restore the signal behavior to default, so that the program actually
+  // crashes when we return and the signal reissues.  This also ensures that
+  // if we crash in our signal handler, the program terminates immediately
+  // instead of recursing in the signal handler.
+ UnregisterHandlers();
+
+ // Unmask all potentially blocked kill signals.
+ sigset_t SigMask;
+ sigfillset(&SigMask);
+ sigprocmask(SIG_UNBLOCK, &SigMask, 0);
+
+ if (FilesToRemove != 0)
+ while (!FilesToRemove->empty()) {
+ FilesToRemove->back().eraseFromDisk(true);
+ FilesToRemove->pop_back();
+ }
+
+ if (std::find(IntSigs, IntSigsEnd, Sig) != IntSigsEnd) {
+ if (InterruptFunction) {
+ void (*IF)() = InterruptFunction;
+ InterruptFunction = 0;
+ IF(); // run the interrupt function.
+ return;
+ }
+ raise(Sig); // Execute the default handler.
+ return;
+ }
+
+ // Otherwise if it is a fault (like SEGV) run any handler.
+ if (CallBacksToRun)
+ for (unsigned i = 0, e = CallBacksToRun->size(); i != e; ++i)
+ (*CallBacksToRun)[i].first((*CallBacksToRun)[i].second);
+}
+
+
+
+void llvm::sys::SetInterruptFunction(void (*IF)()) {
+ InterruptFunction = IF;
+ RegisterHandlers();
+}
+
+// RemoveFileOnSignal - Register a file to be removed when a fatal signal is
+// delivered. Always returns false (success).
+bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename,
+ std::string* ErrMsg) {
+ if (FilesToRemove == 0)
+ FilesToRemove = new std::vector<sys::Path>();
+
+ FilesToRemove->push_back(Filename);
+
+ RegisterHandlers();
+ return false;
+}
+
+/// AddSignalHandler - Add a function to be called when a signal is delivered
+/// to the process. The handler can have a cookie passed to it to identify
+/// what instance of the handler it is.
+void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
+ if (CallBacksToRun == 0)
+ CallBacksToRun = new std::vector<std::pair<void(*)(void*), void*> >();
+ CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie));
+ RegisterHandlers();
+}
+
+
+// PrintStackTrace - In the case of a program crash or fault, print out a stack
+// trace so that the user has an indication of why and where we died.
+//
+// On glibc systems we have the 'backtrace' function, which works nicely, but
+// doesn't demangle symbols.
+static void PrintStackTrace(void *) {
+#ifdef HAVE_BACKTRACE
+ static void* StackTrace[256];
+ // Use backtrace() to output a backtrace on Linux systems with glibc.
+ int depth = backtrace(StackTrace,
+ static_cast<int>(array_lengthof(StackTrace)));
+#if HAVE_DLFCN_H && __GNUG__
+ int width = 0;
+ for (int i = 0; i < depth; ++i) {
+ Dl_info dlinfo;
+ dladdr(StackTrace[i], &dlinfo);
+ const char* name = strrchr(dlinfo.dli_fname, '/');
+
+ int nwidth;
+ if (name == NULL) nwidth = strlen(dlinfo.dli_fname);
+ else nwidth = strlen(name) - 1;
+
+ if (nwidth > width) width = nwidth;
+ }
+
+ for (int i = 0; i < depth; ++i) {
+ Dl_info dlinfo;
+ dladdr(StackTrace[i], &dlinfo);
+
+ fprintf(stderr, "%-3d", i);
+
+ const char* name = strrchr(dlinfo.dli_fname, '/');
+ if (name == NULL) fprintf(stderr, " %-*s", width, dlinfo.dli_fname);
+ else fprintf(stderr, " %-*s", width, name+1);
+
+ fprintf(stderr, " %#0*lx",
+ (int)(sizeof(void*) * 2) + 2, (unsigned long)StackTrace[i]);
+
+ if (dlinfo.dli_sname != NULL) {
+ int res;
+ fputc(' ', stderr);
+ char* d = abi::__cxa_demangle(dlinfo.dli_sname, NULL, NULL, &res);
+ if (d == NULL) fputs(dlinfo.dli_sname, stderr);
+ else fputs(d, stderr);
+ free(d);
+
+ fprintf(stderr, " + %tu",(char*)StackTrace[i]-(char*)dlinfo.dli_saddr);
+ }
+ fputc('\n', stderr);
+ }
+#else
+ backtrace_symbols_fd(StackTrace, depth, STDERR_FILENO);
+#endif
+#endif
+}
+
+/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
+/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
+void llvm::sys::PrintStackTraceOnErrorSignal() {
+ AddSignalHandler(PrintStackTrace, 0);
+}
+
diff --git a/lib/System/Unix/TimeValue.inc b/lib/System/Unix/TimeValue.inc
new file mode 100644
index 0000000..8dd30b9
--- /dev/null
+++ b/lib/System/Unix/TimeValue.inc
@@ -0,0 +1,56 @@
+//===- Unix/TimeValue.cpp - Unix TimeValue Implementation -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific portion of the TimeValue class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+
+namespace llvm {
+ using namespace sys;
+
+std::string TimeValue::toString() const {
+ char buffer[32];
+
+ time_t ourTime = time_t(this->toEpochTime());
+#ifdef __hpux
+// note that the following line needs -D_REENTRANT on HP-UX to be picked up
+ asctime_r(localtime(&ourTime), buffer);
+#else
+ ::asctime_r(::localtime(&ourTime), buffer);
+#endif
+
+ std::string result(buffer);
+ return result.substr(0,24);
+}
+
+TimeValue TimeValue::now() {
+ struct timeval the_time;
+ timerclear(&the_time);
+ if (0 != ::gettimeofday(&the_time,0)) {
+ // This is *really* unlikely to occur because the only gettimeofday
+ // errors concern the timezone parameter which we're passing in as 0.
+ // In the unlikely case it does happen, just return MinTime, no error
+ // message needed.
+ return MinTime;
+ }
+
+ return TimeValue(
+ static_cast<TimeValue::SecondsType>( the_time.tv_sec ),
+ static_cast<TimeValue::NanoSecondsType>( the_time.tv_usec *
+ NANOSECONDS_PER_MICROSECOND ) );
+}
+
+}
diff --git a/lib/System/Unix/Unix.h b/lib/System/Unix/Unix.h
new file mode 100644
index 0000000..452226f
--- /dev/null
+++ b/lib/System/Unix/Unix.h
@@ -0,0 +1,104 @@
+//===- llvm/System/Unix/Unix.h - Common Unix Include File -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines things specific to Unix implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEM_UNIX_UNIX_H
+#define LLVM_SYSTEM_UNIX_UNIX_H
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on all UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h" // Get autoconf configuration settings
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cerrno>
+#include <string>
+#include <algorithm>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_ASSERT_H
+#include <assert.h>
+#endif
+
+#ifdef TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# ifdef HAVE_SYS_TIME_H
+# include <sys/time.h>
+# else
+# include <time.h>
+# endif
+#endif
+
+#ifdef HAVE_SYS_WAIT_H
+# include <sys/wait.h>
+#endif
+
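+// Provide fallback definitions of the POSIX wait(2) status macros for
+// systems whose headers do not supply them.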
+#ifndef WEXITSTATUS
+# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
+#endif
+
+#ifndef WIFEXITED
+# define WIFEXITED(stat_val) (((stat_val) & 255) == 0)
+#endif
+
+/// This function builds an error message into \p ErrMsg using the \p prefix
+/// string and the Unix error number given by \p errnum. If \p errnum is -1
+/// (the default), the value of errno is used.
+/// @brief Make an error message
+///
+/// If the error number can be converted to a string, it will be
+/// separated from prefix by ": ".
+static inline bool MakeErrMsg(
+ std::string* ErrMsg, const std::string& prefix, int errnum = -1) {
+ if (!ErrMsg)
+ return true;
+ char buffer[MAXPATHLEN];
+ buffer[0] = 0;
+ if (errnum == -1)
+ errnum = errno;
+#ifdef HAVE_STRERROR_R
+ // strerror_r is thread-safe.
+ if (errnum)
+ strerror_r(errnum,buffer,MAXPATHLEN-1);
+#elif HAVE_STRERROR
+ // Copy the thread un-safe result of strerror into
+ // the buffer as fast as possible to minimize impact
+ // of collision of strerror in multiple threads.
+ if (errnum)
+ strncpy(buffer,strerror(errnum),MAXPATHLEN-1);
+ buffer[MAXPATHLEN-1] = 0;
+#else
+ // Strange that this system doesn't even have strerror
+ // but, oh well, just use a generic message
+ sprintf(buffer, "Error #%d", errnum);
+#endif
+ *ErrMsg = prefix + ": " + buffer;
+ return true;
+}
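+// Illustrative (hypothetical) call site, showing the usual pattern in these
+// files: on failure, format the message and return true in one step.
+//
+//   if (::chdir(path.c_str()) != 0)
+//     return MakeErrMsg(ErrMsg, path + ": can't change directory");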
+
+#endif
diff --git a/lib/System/Win32/Alarm.inc b/lib/System/Win32/Alarm.inc
new file mode 100644
index 0000000..e0d00a0
--- /dev/null
+++ b/lib/System/Win32/Alarm.inc
@@ -0,0 +1,43 @@
+//===-- Alarm.inc - Implement Win32 Alarm Support ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Win32 Alarm support.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+using namespace llvm;
+
+/// NestedSOI - Sanity check. Alarms cannot be nested or run in parallel;
+/// this flag ensures that they never are.
+static bool NestedSOI = false;
+
+void sys::SetupAlarm(unsigned seconds) {
+ assert(!NestedSOI && "sys::SetupAlarm calls cannot be nested!");
+ NestedSOI = true;
+ // FIXME: Implement for Win32
+}
+
+void sys::TerminateAlarm() {
+ assert(NestedSOI && "sys::TerminateAlarm called without sys::SetupAlarm!");
+ // FIXME: Implement for Win32
+ NestedSOI = false;
+}
+
+int sys::AlarmStatus() {
+ // FIXME: Implement for Win32
+ return 0;
+}
+
+// Don't pull in all of the Windows headers.
+extern "C" void __stdcall Sleep(unsigned long);
+
+void sys::Sleep(unsigned n) {
+ ::Sleep(n*1000);
+}
diff --git a/lib/System/Win32/DynamicLibrary.inc b/lib/System/Win32/DynamicLibrary.inc
new file mode 100644
index 0000000..1ddf6ce
--- /dev/null
+++ b/lib/System/Win32/DynamicLibrary.inc
@@ -0,0 +1,219 @@
+//===- Win32/DynamicLibrary.cpp - Win32 DL Implementation -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of DynamicLibrary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+
+#ifdef __MINGW32__
+ #include <imagehlp.h>
+#else
+ #include <dbghelp.h>
+#endif
+
+#ifdef _MSC_VER
+ #include <ntverp.h>
+#endif
+
+#ifdef __MINGW32__
+ #if (HAVE_LIBIMAGEHLP != 1)
+ #error "libimagehlp.a should be present"
+ #endif
+#else
+ #pragma comment(lib, "dbghelp.lib")
+#endif
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//=== and must not be UNIX code.
+//===----------------------------------------------------------------------===//
+
+static std::vector<HMODULE> OpenedHandles;
+
+#ifdef _WIN64
+ typedef DWORD64 ModuleBaseType;
+#else
+ typedef ULONG ModuleBaseType;
+#endif
+
+extern "C" {
+// Use old callback if:
+// - Not using Visual Studio
+// - Visual Studio 2005 or earlier but only if we are not using the Windows SDK
+// or Windows SDK version is older than 6.0
+// Use new callback if:
+// - Newer Visual Studio (comes with newer SDK).
+// - Visual Studio 2005 with Windows SDK 6.0+
+#if !defined(_MSC_VER) || _MSC_VER < 1500 && (!defined(VER_PRODUCTBUILD) || VER_PRODUCTBUILD < 6000)
+ static BOOL CALLBACK ELM_Callback(PSTR ModuleName,
+ ModuleBaseType ModuleBase,
+ ULONG ModuleSize,
+ PVOID UserContext)
+#else
+ static BOOL CALLBACK ELM_Callback(PCSTR ModuleName,
+ ModuleBaseType ModuleBase,
+ ULONG ModuleSize,
+ PVOID UserContext)
+#endif
+ {
+ // Ignore VC++ runtimes prior to 7.1. Somehow some of them get loaded
+ // into the process.
+ if (stricmp(ModuleName, "msvci70") != 0 &&
+ stricmp(ModuleName, "msvcirt") != 0 &&
+ stricmp(ModuleName, "msvcp50") != 0 &&
+ stricmp(ModuleName, "msvcp60") != 0 &&
+ stricmp(ModuleName, "msvcp70") != 0 &&
+ stricmp(ModuleName, "msvcr70") != 0 &&
+#ifndef __MINGW32__
+ // Mingw32 uses msvcrt.dll by default. Don't ignore it.
+          // Otherwise, the user should know what they are doing.
+ stricmp(ModuleName, "msvcrt") != 0 &&
+#endif
+ stricmp(ModuleName, "msvcrt20") != 0 &&
+ stricmp(ModuleName, "msvcrt40") != 0) {
+ OpenedHandles.push_back((HMODULE)ModuleBase);
+ }
+ return TRUE;
+ }
+}
+
+DynamicLibrary::DynamicLibrary() : handle(0) {
+ handle = GetModuleHandle(NULL);
+ OpenedHandles.push_back((HMODULE)handle);
+}
+
+DynamicLibrary::~DynamicLibrary() {
+  if (handle == 0)
+    return;
+
+  // GetModuleHandle() does not increment the ref count, so we must not free
+  // the handle to the executable.
+  if (handle != GetModuleHandle(NULL))
+    FreeLibrary((HMODULE)handle);
+
+  for (std::vector<HMODULE>::iterator I = OpenedHandles.begin(),
+       E = OpenedHandles.end(); I != E; ++I) {
+    if (*I == handle) {
+      // Note: don't use the swap/pop_back trick here. Order is important.
+      OpenedHandles.erase(I);
+      break; // erase() invalidated the iterators; there is at most one match.
+    }
+  }
+
+  handle = 0;
+}
+
+bool DynamicLibrary::LoadLibraryPermanently(const char *filename,
+ std::string *ErrMsg) {
+ if (filename) {
+ HMODULE a_handle = LoadLibrary(filename);
+
+ if (a_handle == 0)
+ return MakeErrMsg(ErrMsg, std::string(filename) + ": Can't open : ");
+
+ OpenedHandles.push_back(a_handle);
+ } else {
+ // When no file is specified, enumerate all DLLs and EXEs in the
+ // process.
+ EnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0);
+ }
+
+ // Because we don't remember the handle, we will never free it; hence,
+ // it is loaded permanently.
+ return false;
+}
+
+// Stack probing routines live in statically linked support libraries (e.g.
+// libgcc), so they cannot be found by searching loaded modules on Windows.
+// Provide an explicit hook for them.
+#if defined(__MINGW32__) || defined (_MSC_VER)
+ #define EXPLICIT_SYMBOL(SYM) \
+ if (!strcmp(symbolName, #SYM)) return (void*)&SYM
+ #define EXPLICIT_SYMBOL2(SYMFROM, SYMTO) \
+ if (!strcmp(symbolName, #SYMFROM)) return (void*)&SYMTO
+ #define EXPLICIT_SYMBOL_DEF(SYM) \
+ extern "C" { extern void *SYM; }
+
+ #if defined(__MINGW32__)
+ EXPLICIT_SYMBOL_DEF(_alloca);
+ EXPLICIT_SYMBOL_DEF(__main);
+ EXPLICIT_SYMBOL_DEF(__ashldi3);
+ EXPLICIT_SYMBOL_DEF(__ashrdi3);
+ EXPLICIT_SYMBOL_DEF(__cmpdi2);
+ EXPLICIT_SYMBOL_DEF(__divdi3);
+ EXPLICIT_SYMBOL_DEF(__fixdfdi);
+ EXPLICIT_SYMBOL_DEF(__fixsfdi);
+ EXPLICIT_SYMBOL_DEF(__fixunsdfdi);
+ EXPLICIT_SYMBOL_DEF(__fixunssfdi);
+ EXPLICIT_SYMBOL_DEF(__floatdidf);
+ EXPLICIT_SYMBOL_DEF(__floatdisf);
+ EXPLICIT_SYMBOL_DEF(__lshrdi3);
+ EXPLICIT_SYMBOL_DEF(__moddi3);
+ EXPLICIT_SYMBOL_DEF(__udivdi3);
+ EXPLICIT_SYMBOL_DEF(__umoddi3);
+ #elif defined(_MSC_VER)
+ EXPLICIT_SYMBOL_DEF(_alloca_probe);
+ #endif
+#endif
+
+void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
+ // First check symbols added via AddSymbol().
+ std::map<std::string, void *>::iterator I = g_symbols().find(symbolName);
+ if (I != g_symbols().end())
+ return I->second;
+
+ // Now search the libraries.
+ for (std::vector<HMODULE>::iterator I = OpenedHandles.begin(),
+ E = OpenedHandles.end(); I != E; ++I) {
+ FARPROC ptr = GetProcAddress((HMODULE)*I, symbolName);
+ if (ptr)
+ return (void *) ptr;
+ }
+
+#if defined(__MINGW32__)
+ {
+ EXPLICIT_SYMBOL(_alloca);
+ EXPLICIT_SYMBOL(__main);
+ EXPLICIT_SYMBOL(__ashldi3);
+ EXPLICIT_SYMBOL(__ashrdi3);
+ EXPLICIT_SYMBOL(__cmpdi2);
+ EXPLICIT_SYMBOL(__divdi3);
+ EXPLICIT_SYMBOL(__fixdfdi);
+ EXPLICIT_SYMBOL(__fixsfdi);
+ EXPLICIT_SYMBOL(__fixunsdfdi);
+ EXPLICIT_SYMBOL(__fixunssfdi);
+ EXPLICIT_SYMBOL(__floatdidf);
+ EXPLICIT_SYMBOL(__floatdisf);
+ EXPLICIT_SYMBOL(__lshrdi3);
+ EXPLICIT_SYMBOL(__moddi3);
+ EXPLICIT_SYMBOL(__udivdi3);
+ EXPLICIT_SYMBOL(__umoddi3);
+
+ EXPLICIT_SYMBOL2(alloca, _alloca);
+#undef EXPLICIT_SYMBOL
+#undef EXPLICIT_SYMBOL2
+#undef EXPLICIT_SYMBOL_DEF
+ }
+#elif defined(_MSC_VER)
+ {
+ EXPLICIT_SYMBOL2(alloca, _alloca_probe);
+ EXPLICIT_SYMBOL2(_alloca, _alloca_probe);
+#undef EXPLICIT_SYMBOL
+#undef EXPLICIT_SYMBOL2
+#undef EXPLICIT_SYMBOL_DEF
+ }
+#endif
+
+ return 0;
+}
+
+}
+
diff --git a/lib/System/Win32/Host.inc b/lib/System/Win32/Host.inc
new file mode 100644
index 0000000..18f00f8
--- /dev/null
+++ b/lib/System/Win32/Host.inc
@@ -0,0 +1,23 @@
+//===- llvm/System/Win32/Host.inc -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Win32 Host support.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+#include <cstdio>
+#include <string>
+
+using namespace llvm;
+
+std::string sys::getHostTriple() {
+ // FIXME: Adapt to running version.
+ return LLVM_HOSTTRIPLE;
+}
diff --git a/lib/System/Win32/Memory.inc b/lib/System/Win32/Memory.inc
new file mode 100644
index 0000000..5e5cf7a
--- /dev/null
+++ b/lib/System/Win32/Memory.inc
@@ -0,0 +1,72 @@
+//===- Win32/Memory.cpp - Win32 Memory Implementation -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of various Memory
+// management utilities
+//
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+#include "llvm/System/Process.h"
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//=== and must not be UNIX code
+//===----------------------------------------------------------------------===//
+
+MemoryBlock Memory::AllocateRWX(unsigned NumBytes,
+ const MemoryBlock *NearBlock,
+ std::string *ErrMsg) {
+ if (NumBytes == 0) return MemoryBlock();
+
+ static const long pageSize = Process::GetPageSize();
+ unsigned NumPages = (NumBytes+pageSize-1)/pageSize;
+
+ //FIXME: support NearBlock if ever needed on Win64.
+
+ void *pa = VirtualAlloc(NULL, NumPages*pageSize, MEM_COMMIT,
+ PAGE_EXECUTE_READWRITE);
+ if (pa == NULL) {
+ MakeErrMsg(ErrMsg, "Can't allocate RWX Memory: ");
+ return MemoryBlock();
+ }
+
+ MemoryBlock result;
+ result.Address = pa;
+ result.Size = NumPages*pageSize;
+ return result;
+}
+
+bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
+ if (M.Address == 0 || M.Size == 0) return false;
+ if (!VirtualFree(M.Address, 0, MEM_RELEASE))
+ return MakeErrMsg(ErrMsg, "Can't release RWX Memory: ");
+ return false;
+}
+
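+// Memory returned by AllocateRWX is already mapped PAGE_EXECUTE_READWRITE,
+// so the permission setters below have no work to do.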
+bool Memory::setWritable(MemoryBlock &M, std::string *ErrMsg) {
+ return true;
+}
+
+bool Memory::setExecutable(MemoryBlock &M, std::string *ErrMsg) {
+ return false;
+}
+
+bool Memory::setRangeWritable(const void *Addr, size_t Size) {
+ return true;
+}
+
+bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
+ return false;
+}
+
+}
diff --git a/lib/System/Win32/Mutex.inc b/lib/System/Win32/Mutex.inc
new file mode 100644
index 0000000..7c1723b
--- /dev/null
+++ b/lib/System/Win32/Mutex.inc
@@ -0,0 +1,58 @@
+//===- llvm/System/Win32/Mutex.inc - Win32 Mutex Implementation -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Win32 specific (non-pthread) Mutex class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Win32 code that
+//=== is guaranteed to work on *all* Win32 variants.
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+#include "llvm/System/Mutex.h"
+
+namespace llvm {
+using namespace sys;
+
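+// A Win32 CRITICAL_SECTION is always recursive, so the 'recursive' flag is
+// accepted but ignored.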
+Mutex::Mutex(bool /*recursive*/)
+{
+ data_ = new CRITICAL_SECTION;
+ InitializeCriticalSection((LPCRITICAL_SECTION)data_);
+}
+
+Mutex::~Mutex()
+{
+ DeleteCriticalSection((LPCRITICAL_SECTION)data_);
+ delete (LPCRITICAL_SECTION)data_;
+ data_ = 0;
+}
+
+bool
+Mutex::acquire()
+{
+ EnterCriticalSection((LPCRITICAL_SECTION)data_);
+ return true;
+}
+
+bool
+Mutex::release()
+{
+ LeaveCriticalSection((LPCRITICAL_SECTION)data_);
+ return true;
+}
+
+bool
+Mutex::tryacquire()
+{
+ return TryEnterCriticalSection((LPCRITICAL_SECTION)data_);
+}
+
+}
diff --git a/lib/System/Win32/Path.inc b/lib/System/Win32/Path.inc
new file mode 100644
index 0000000..fbf8f66
--- /dev/null
+++ b/lib/System/Win32/Path.inc
@@ -0,0 +1,825 @@
+//===- llvm/System/Win32/Path.cpp - Win32 Path Implementation ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// Modified by Henrik Bach to comply with at least MinGW.
+// Ported to Win32 by Jeff Cohen.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of the Path class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Win32 code that
+//=== is guaranteed to work on *all* Win32 variants.
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+#include <malloc.h>
+#include <cstdio>
+
+// We need to undo a macro defined in Windows.h, otherwise we won't compile:
+#undef CopyFile
+#undef GetCurrentDirectory
+
+// Windows happily accepts either forward or backward slashes, though any path
+// returned by a Win32 API will have backward slashes. As LLVM code basically
+// assumes forward slashes are used, backward slashes are converted where they
+// can be introduced into a path.
+//
+// Another invariant is that a path ends with a slash if and only if the path
+// is a root directory. Any other use of a trailing slash is stripped. Unlike
+// in Unix, Windows has a rather complicated notion of a root path and this
+// invariant helps simplify the code.
+
+static void FlipBackSlashes(std::string& s) {
+ for (size_t i = 0; i < s.size(); i++)
+ if (s[i] == '\\')
+ s[i] = '/';
+}
+
+namespace llvm {
+namespace sys {
+const char PathSeparator = ';';
+
+Path::Path(const std::string& p)
+ : path(p) {
+ FlipBackSlashes(path);
+}
+
+Path::Path(const char *StrStart, unsigned StrLen)
+ : path(StrStart, StrLen) {
+ FlipBackSlashes(path);
+}
+
+Path&
+Path::operator=(const std::string &that) {
+ path = that;
+ FlipBackSlashes(path);
+ return *this;
+}
+
+bool
+Path::isValid() const {
+ if (path.empty())
+ return false;
+
+ // If there is a colon, it must be the second character, preceded by a letter
+ // and followed by something.
+ size_t len = path.size();
+ size_t pos = path.rfind(':',len);
+ size_t rootslash = 0;
+ if (pos != std::string::npos) {
+ if (pos != 1 || !isalpha(path[0]) || len < 3)
+ return false;
+ rootslash = 2;
+ }
+
+ // Look for a UNC path, and if found adjust our notion of the root slash.
+ if (len > 3 && path[0] == '/' && path[1] == '/') {
+ rootslash = path.find('/', 2);
+ if (rootslash == std::string::npos)
+ rootslash = 0;
+ }
+
+ // Check for illegal characters.
+ if (path.find_first_of("\\<>\"|\001\002\003\004\005\006\007\010\011\012"
+ "\013\014\015\016\017\020\021\022\023\024\025\026"
+ "\027\030\031\032\033\034\035\036\037")
+ != std::string::npos)
+ return false;
+
+ // Remove trailing slash, unless it's a root slash.
+ if (len > rootslash+1 && path[len-1] == '/')
+ path.erase(--len);
+
+ // Check each component for legality.
+ for (pos = 0; pos < len; ++pos) {
+ // A component may not end in a space.
+ if (path[pos] == ' ') {
+ if (path[pos+1] == '/' || path[pos+1] == '\0')
+ return false;
+ }
+
+ // A component may not end in a period.
+ if (path[pos] == '.') {
+ if (path[pos+1] == '/' || path[pos+1] == '\0') {
+ // Unless it is the pseudo-directory "."...
+ if (pos == 0 || path[pos-1] == '/' || path[pos-1] == ':')
+ return true;
+ // or "..".
+ if (pos > 0 && path[pos-1] == '.') {
+ if (pos == 1 || path[pos-2] == '/' || path[pos-2] == ':')
+ return true;
+ }
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+bool
+Path::isAbsolute() const {
+ switch (path.length()) {
+ case 0:
+ return false;
+ case 1:
+ case 2:
+ return path[0] == '/';
+ default:
+ return path[0] == '/' || (path[1] == ':' && path[2] == '/');
+ }
+}
+
+static Path *TempDirectory = NULL;
+
+Path
+Path::GetTemporaryDirectory(std::string* ErrMsg) {
+ if (TempDirectory)
+ return *TempDirectory;
+
+ char pathname[MAX_PATH];
+ if (!GetTempPath(MAX_PATH, pathname)) {
+ if (ErrMsg)
+ *ErrMsg = "Can't determine temporary directory";
+ return Path();
+ }
+
+ Path result;
+ result.set(pathname);
+
+  // Append a subdirectory based on our process id so multiple LLVMs don't
+  // step on each other's toes.
+#ifdef __MINGW32__
+ // Mingw's Win32 header files are broken.
+ sprintf(pathname, "LLVM_%u", unsigned(GetCurrentProcessId()));
+#else
+ sprintf(pathname, "LLVM_%u", GetCurrentProcessId());
+#endif
+ result.appendComponent(pathname);
+
+ // If there's a directory left over from a previous LLVM execution that
+ // happened to have the same process id, get rid of it.
+ result.eraseFromDisk(true);
+
+ // And finally (re-)create the empty directory.
+ result.createDirectoryOnDisk(false);
+ TempDirectory = new Path(result);
+ return *TempDirectory;
+}
+
+// FIXME: the following set of functions doesn't map to Windows very well.
+Path
+Path::GetRootDirectory() {
+ Path result;
+ result.set("C:/");
+ return result;
+}
+
+void
+Path::GetSystemLibraryPaths(std::vector<sys::Path>& Paths) {
+ Paths.push_back(sys::Path("C:/WINDOWS/SYSTEM32"));
+ Paths.push_back(sys::Path("C:/WINDOWS"));
+}
+
+void
+Path::GetBitcodeLibraryPaths(std::vector<sys::Path>& Paths) {
+ char * env_var = getenv("LLVM_LIB_SEARCH_PATH");
+ if (env_var != 0) {
+ getPathList(env_var,Paths);
+ }
+#ifdef LLVM_LIBDIR
+ {
+ Path tmpPath;
+ if (tmpPath.set(LLVM_LIBDIR))
+ if (tmpPath.canRead())
+ Paths.push_back(tmpPath);
+ }
+#endif
+ GetSystemLibraryPaths(Paths);
+}
+
+Path
+Path::GetLLVMDefaultConfigDir() {
+ // TODO: this isn't going to fly on Windows
+ return Path("/etc/llvm");
+}
+
+Path
+Path::GetUserHomeDirectory() {
+ // TODO: Typical Windows setup doesn't define HOME.
+ const char* home = getenv("HOME");
+ if (home) {
+ Path result;
+ if (result.set(home))
+ return result;
+ }
+ return GetRootDirectory();
+}
+
+Path
+Path::GetCurrentDirectory() {
+ char pathname[MAX_PATH];
+ ::GetCurrentDirectoryA(MAX_PATH,pathname);
+ return Path(pathname);
+}
+
+/// GetMainExecutable - Return the path to the main executable, given the
+/// value of argv[0] from program startup.
+Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
+ return Path();
+}
+
+
+// FIXME: the above set of functions doesn't map to Windows very well.
+
+
+bool
+Path::isRootDirectory() const {
+ size_t len = path.size();
+ return len > 0 && path[len-1] == '/';
+}
+
+std::string Path::getDirname() const {
+ return getDirnameCharSep(path, '/');
+}
+
+std::string
+Path::getBasename() const {
+ // Find the last slash
+ size_t slash = path.rfind('/');
+ if (slash == std::string::npos)
+ slash = 0;
+ else
+ slash++;
+
+ size_t dot = path.rfind('.');
+ if (dot == std::string::npos || dot < slash)
+ return path.substr(slash);
+ else
+ return path.substr(slash, dot - slash);
+}
+
+std::string
+Path::getSuffix() const {
+ // Find the last slash
+ size_t slash = path.rfind('/');
+ if (slash == std::string::npos)
+ slash = 0;
+ else
+ slash++;
+
+ size_t dot = path.rfind('.');
+ if (dot == std::string::npos || dot < slash)
+ return std::string();
+ else
+ return path.substr(dot + 1);
+}
+
+bool
+Path::exists() const {
+ DWORD attr = GetFileAttributes(path.c_str());
+ return attr != INVALID_FILE_ATTRIBUTES;
+}
+
+bool
+Path::isDirectory() const {
+ DWORD attr = GetFileAttributes(path.c_str());
+ return (attr != INVALID_FILE_ATTRIBUTES) &&
+ (attr & FILE_ATTRIBUTE_DIRECTORY);
+}
+
+bool
+Path::canRead() const {
+ // FIXME: take security attributes into account.
+ DWORD attr = GetFileAttributes(path.c_str());
+ return attr != INVALID_FILE_ATTRIBUTES;
+}
+
+bool
+Path::canWrite() const {
+ // FIXME: take security attributes into account.
+ DWORD attr = GetFileAttributes(path.c_str());
+ return (attr != INVALID_FILE_ATTRIBUTES) && !(attr & FILE_ATTRIBUTE_READONLY);
+}
+
+bool
+Path::canExecute() const {
+ // FIXME: take security attributes into account.
+ DWORD attr = GetFileAttributes(path.c_str());
+ return attr != INVALID_FILE_ATTRIBUTES;
+}
+
+std::string
+Path::getLast() const {
+ // Find the last slash
+ size_t pos = path.rfind('/');
+
+ // Handle the corner cases
+ if (pos == std::string::npos)
+ return path;
+
+ // If the last character is a slash, we have a root directory
+ if (pos == path.length()-1)
+ return path;
+
+ // Return everything after the last slash
+ return path.substr(pos+1);
+}
+
+const FileStatus *
+PathWithStatus::getFileStatus(bool update, std::string *ErrStr) const {
+ if (!fsIsValid || update) {
+ WIN32_FILE_ATTRIBUTE_DATA fi;
+ if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi)) {
+ MakeErrMsg(ErrStr, "getStatusInfo():" + std::string(path) +
+ ": Can't get status: ");
+ return 0;
+ }
+
+ status.fileSize = fi.nFileSizeHigh;
+ status.fileSize <<= sizeof(fi.nFileSizeHigh)*8;
+ status.fileSize += fi.nFileSizeLow;
+
+ status.mode = fi.dwFileAttributes & FILE_ATTRIBUTE_READONLY ? 0555 : 0777;
+ status.user = 9999; // Not applicable to Windows, so...
+ status.group = 9999; // Not applicable to Windows, so...
+
+ // FIXME: this is only unique if the file is accessed by the same file path.
+ // How do we do this for C:\dir\file and ..\dir\file ? Unix has inode
+ // numbers, but the concept doesn't exist in Windows.
+ status.uniqueID = 0;
+ for (unsigned i = 0; i < path.length(); ++i)
+ status.uniqueID += path[i];
+
+ __int64 ft = *reinterpret_cast<__int64*>(&fi.ftLastWriteTime);
+ status.modTime.fromWin32Time(ft);
+
+ status.isDir = fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY;
+ fsIsValid = true;
+ }
+ return &status;
+}
+
+bool Path::makeReadableOnDisk(std::string* ErrMsg) {
+ // All files are readable on Windows (ignoring security attributes).
+ return false;
+}
+
+bool Path::makeWriteableOnDisk(std::string* ErrMsg) {
+ DWORD attr = GetFileAttributes(path.c_str());
+
+ // If it doesn't exist, we're done.
+ if (attr == INVALID_FILE_ATTRIBUTES)
+ return false;
+
+ if (attr & FILE_ATTRIBUTE_READONLY) {
+ if (!SetFileAttributes(path.c_str(), attr & ~FILE_ATTRIBUTE_READONLY)) {
+ MakeErrMsg(ErrMsg, std::string(path) + ": Can't make file writable: ");
+ return true;
+ }
+ }
+ return false;
+}
+
+bool Path::makeExecutableOnDisk(std::string* ErrMsg) {
+ // All files are executable on Windows (ignoring security attributes).
+ return false;
+}
+
+bool
+Path::getDirectoryContents(std::set<Path>& result, std::string* ErrMsg) const {
+ WIN32_FILE_ATTRIBUTE_DATA fi;
+ if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi)) {
+ MakeErrMsg(ErrMsg, path + ": can't get status of file");
+ return true;
+ }
+
+ if (!(fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ if (ErrMsg)
+ *ErrMsg = path + ": not a directory";
+ return true;
+ }
+
+ result.clear();
+ WIN32_FIND_DATA fd;
+ std::string searchpath = path;
+ if (path.size() == 0 || searchpath[path.size()-1] == '/')
+ searchpath += "*";
+ else
+ searchpath += "/*";
+
+ HANDLE h = FindFirstFile(searchpath.c_str(), &fd);
+ if (h == INVALID_HANDLE_VALUE) {
+ if (GetLastError() == ERROR_FILE_NOT_FOUND)
+ return true; // not really an error, now is it?
+ MakeErrMsg(ErrMsg, path + ": Can't read directory: ");
+ return true;
+ }
+
+ do {
+ if (fd.cFileName[0] == '.')
+ continue;
+ Path aPath(path);
+ aPath.appendComponent(&fd.cFileName[0]);
+ result.insert(aPath);
+ } while (FindNextFile(h, &fd));
+
+ DWORD err = GetLastError();
+ FindClose(h);
+ if (err != ERROR_NO_MORE_FILES) {
+ SetLastError(err);
+ MakeErrMsg(ErrMsg, path + ": Can't read directory: ");
+ return true;
+ }
+ return false;
+}
+
+bool
+Path::set(const std::string& a_path) {
+ if (a_path.empty())
+ return false;
+ std::string save(path);
+ path = a_path;
+ FlipBackSlashes(path);
+ if (!isValid()) {
+ path = save;
+ return false;
+ }
+ return true;
+}
+
+bool
+Path::appendComponent(const std::string& name) {
+ if (name.empty())
+ return false;
+ std::string save(path);
+ if (!path.empty()) {
+ size_t last = path.size() - 1;
+ if (path[last] != '/')
+ path += '/';
+ }
+ path += name;
+ if (!isValid()) {
+ path = save;
+ return false;
+ }
+ return true;
+}
+
+bool
+Path::eraseComponent() {
+ size_t slashpos = path.rfind('/',path.size());
+ if (slashpos == path.size() - 1 || slashpos == std::string::npos)
+ return false;
+ std::string save(path);
+ path.erase(slashpos);
+ if (!isValid()) {
+ path = save;
+ return false;
+ }
+ return true;
+}
+
+bool
+Path::appendSuffix(const std::string& suffix) {
+ std::string save(path);
+ path.append(".");
+ path.append(suffix);
+ if (!isValid()) {
+ path = save;
+ return false;
+ }
+ return true;
+}
+
+bool
+Path::eraseSuffix() {
+ size_t dotpos = path.rfind('.',path.size());
+ size_t slashpos = path.rfind('/',path.size());
+ if (dotpos != std::string::npos) {
+ if (slashpos == std::string::npos || dotpos > slashpos+1) {
+ std::string save(path);
+ path.erase(dotpos, path.size()-dotpos);
+ if (!isValid()) {
+ path = save;
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+inline bool PathMsg(std::string* ErrMsg, const char* pathname, const char* msg) {
+ if (ErrMsg)
+ *ErrMsg = std::string(pathname) + ": " + std::string(msg);
+ return true;
+}
+
+bool
+Path::createDirectoryOnDisk(bool create_parents, std::string* ErrMsg) {
+ // Get a writeable copy of the path name
+ size_t len = path.length();
+ char *pathname = reinterpret_cast<char *>(_alloca(len+2));
+ path.copy(pathname, len);
+ pathname[len] = 0;
+
+ // Make sure it ends with a slash.
+ if (len == 0 || pathname[len - 1] != '/') {
+ pathname[len] = '/';
+ pathname[++len] = 0;
+ }
+
+ // Determine starting point for initial / search.
+ char *next = pathname;
+ if (pathname[0] == '/' && pathname[1] == '/') {
+ // Skip host name.
+ next = strchr(pathname+2, '/');
+ if (next == NULL)
+ return PathMsg(ErrMsg, pathname, "badly formed remote directory");
+
+ // Skip share name.
+ next = strchr(next+1, '/');
+ if (next == NULL)
+ return PathMsg(ErrMsg, pathname,"badly formed remote directory");
+
+ next++;
+ if (*next == 0)
+ return PathMsg(ErrMsg, pathname, "badly formed remote directory");
+
+ } else {
+ if (pathname[1] == ':')
+ next += 2; // skip drive letter
+ if (*next == '/')
+ next++; // skip root directory
+ }
+
+ // If we're supposed to create intermediate directories
+ if (create_parents) {
+ // Loop through the directory components until we're done
+ while (*next) {
+ next = strchr(next, '/');
+ *next = 0;
+ if (!CreateDirectory(pathname, NULL))
+ return MakeErrMsg(ErrMsg,
+ std::string(pathname) + ": Can't create directory: ");
+ *next++ = '/';
+ }
+ } else {
+ // Drop trailing slash.
+ pathname[len-1] = 0;
+ if (!CreateDirectory(pathname, NULL)) {
+ return MakeErrMsg(ErrMsg, std::string(pathname) + ": Can't create directory: ");
+ }
+ }
+ return false;
+}
+
+bool
+Path::createFileOnDisk(std::string* ErrMsg) {
+ // Create the file
+ HANDLE h = CreateFile(path.c_str(), GENERIC_WRITE, 0, NULL, CREATE_NEW,
+ FILE_ATTRIBUTE_NORMAL, NULL);
+ if (h == INVALID_HANDLE_VALUE)
+ return MakeErrMsg(ErrMsg, path + ": Can't create file: ");
+
+ CloseHandle(h);
+ return false;
+}
+
+bool
+Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const {
+ WIN32_FILE_ATTRIBUTE_DATA fi;
+ if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi))
+ return true;
+
+ if (fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
+ // If it doesn't exist, we're done.
+ if (!exists())
+ return false;
+
+ char *pathname = reinterpret_cast<char *>(_alloca(path.length()+3));
+    int lastchar = path.length() - 1;
+ path.copy(pathname, lastchar+1);
+
+ // Make path end with '/*'.
+ if (pathname[lastchar] != '/')
+ pathname[++lastchar] = '/';
+ pathname[lastchar+1] = '*';
+ pathname[lastchar+2] = 0;
+
+ if (remove_contents) {
+ WIN32_FIND_DATA fd;
+ HANDLE h = FindFirstFile(pathname, &fd);
+
+ // It's a bad idea to alter the contents of a directory while enumerating
+ // its contents. So build a list of its contents first, then destroy them.
+
+ if (h != INVALID_HANDLE_VALUE) {
+ std::vector<Path> list;
+
+ do {
+ if (strcmp(fd.cFileName, ".") == 0)
+ continue;
+ if (strcmp(fd.cFileName, "..") == 0)
+ continue;
+
+ Path aPath(path);
+ aPath.appendComponent(&fd.cFileName[0]);
+ list.push_back(aPath);
+ } while (FindNextFile(h, &fd));
+
+ DWORD err = GetLastError();
+ FindClose(h);
+ if (err != ERROR_NO_MORE_FILES) {
+ SetLastError(err);
+ return MakeErrMsg(ErrStr, path + ": Can't read directory: ");
+ }
+
+ for (std::vector<Path>::iterator I = list.begin(); I != list.end();
+ ++I) {
+ Path &aPath = *I;
+ aPath.eraseFromDisk(true);
+ }
+ } else {
+ if (GetLastError() != ERROR_FILE_NOT_FOUND)
+ return MakeErrMsg(ErrStr, path + ": Can't read directory: ");
+ }
+ }
+
+ pathname[lastchar] = 0;
+ if (!RemoveDirectory(pathname))
+ return MakeErrMsg(ErrStr,
+ std::string(pathname) + ": Can't destroy directory: ");
+ return false;
+ } else {
+ // Read-only files cannot be deleted on Windows. Must remove the read-only
+ // attribute first.
+ if (fi.dwFileAttributes & FILE_ATTRIBUTE_READONLY) {
+ if (!SetFileAttributes(path.c_str(),
+ fi.dwFileAttributes & ~FILE_ATTRIBUTE_READONLY))
+ return MakeErrMsg(ErrStr, path + ": Can't destroy file: ");
+ }
+
+ if (!DeleteFile(path.c_str()))
+ return MakeErrMsg(ErrStr, path + ": Can't destroy file: ");
+ return false;
+ }
+}
+
+bool Path::getMagicNumber(std::string& Magic, unsigned len) const {
+ assert(len < 1024 && "Request for magic string too long");
+ char* buf = (char*) alloca(1 + len);
+
+ HANDLE h = CreateFile(path.c_str(),
+ GENERIC_READ,
+ FILE_SHARE_READ,
+ NULL,
+ OPEN_EXISTING,
+ FILE_ATTRIBUTE_NORMAL,
+ NULL);
+ if (h == INVALID_HANDLE_VALUE)
+ return false;
+
+ DWORD nRead = 0;
+ BOOL ret = ReadFile(h, buf, len, &nRead, NULL);
+ CloseHandle(h);
+
+ if (!ret || nRead != len)
+ return false;
+
+ buf[len] = '\0';
+ Magic = buf;
+ return true;
+}
+
+bool
+Path::renamePathOnDisk(const Path& newName, std::string* ErrMsg) {
+ if (!MoveFileEx(path.c_str(), newName.c_str(), MOVEFILE_REPLACE_EXISTING))
+ return MakeErrMsg(ErrMsg, "Can't move '" + path + "' to '" + newName.path
+ + "': ");
+ return false;
+}
+
+bool
+Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrMsg) const {
+ // FIXME: should work on directories also.
+ if (!si.isFile) {
+ return true;
+ }
+
+ HANDLE h = CreateFile(path.c_str(),
+ FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL,
+ OPEN_EXISTING,
+ FILE_ATTRIBUTE_NORMAL,
+ NULL);
+ if (h == INVALID_HANDLE_VALUE)
+ return true;
+
+ BY_HANDLE_FILE_INFORMATION bhfi;
+ if (!GetFileInformationByHandle(h, &bhfi)) {
+ DWORD err = GetLastError();
+ CloseHandle(h);
+ SetLastError(err);
+ return MakeErrMsg(ErrMsg, path + ": GetFileInformationByHandle: ");
+ }
+
+ FILETIME ft;
+ (uint64_t&)ft = si.modTime.toWin32Time();
+ BOOL ret = SetFileTime(h, NULL, &ft, &ft);
+ DWORD err = GetLastError();
+ CloseHandle(h);
+ if (!ret) {
+ SetLastError(err);
+ return MakeErrMsg(ErrMsg, path + ": SetFileTime: ");
+ }
+
+ // Best we can do with Unix permission bits is to interpret the owner
+ // writable bit.
+ if (si.mode & 0200) {
+ if (bhfi.dwFileAttributes & FILE_ATTRIBUTE_READONLY) {
+ if (!SetFileAttributes(path.c_str(),
+ bhfi.dwFileAttributes & ~FILE_ATTRIBUTE_READONLY))
+ return MakeErrMsg(ErrMsg, path + ": SetFileAttributes: ");
+ }
+ } else {
+ if (!(bhfi.dwFileAttributes & FILE_ATTRIBUTE_READONLY)) {
+ if (!SetFileAttributes(path.c_str(),
+ bhfi.dwFileAttributes | FILE_ATTRIBUTE_READONLY))
+ return MakeErrMsg(ErrMsg, path + ": SetFileAttributes: ");
+ }
+ }
+
+ return false;
+}
+
+bool
+CopyFile(const sys::Path &Dest, const sys::Path &Src, std::string* ErrMsg) {
+ // Can't use CopyFile macro defined in Windows.h because it would mess up the
+ // above line. We use the expansion it would have in a non-UNICODE build.
+ if (!::CopyFileA(Src.c_str(), Dest.c_str(), false))
+ return MakeErrMsg(ErrMsg, "Can't copy '" + Src.toString() +
+ "' to '" + Dest.toString() + "': ");
+ return false;
+}
+
+bool
+Path::makeUnique(bool reuse_current, std::string* ErrMsg) {
+ if (reuse_current && !exists())
+ return false; // File doesn't exist already, just use it!
+
+ // Reserve space for -XXXXXX at the end.
+ char *FNBuffer = (char*) alloca(path.size()+8);
+ unsigned offset = path.size();
+ path.copy(FNBuffer, offset);
+
+ // Find a numeric suffix that isn't used by an existing file. Assume there
+ // won't be more than 1 million files with the same prefix. Probably a safe
+ // bet.
+ static unsigned FCounter = 0;
+ do {
+ sprintf(FNBuffer+offset, "-%06u", FCounter);
+ if (++FCounter > 999999)
+ FCounter = 0;
+ path = FNBuffer;
+ } while (exists());
+ return false;
+}
+
+bool
+Path::createTemporaryFileOnDisk(bool reuse_current, std::string* ErrMsg) {
+ // Make this into a unique file name
+ makeUnique(reuse_current, ErrMsg);
+
+ // Now go and create it
+ HANDLE h = CreateFile(path.c_str(), GENERIC_WRITE, 0, NULL, CREATE_NEW,
+ FILE_ATTRIBUTE_NORMAL, NULL);
+ if (h == INVALID_HANDLE_VALUE)
+ return MakeErrMsg(ErrMsg, path + ": can't create file");
+
+ CloseHandle(h);
+ return false;
+}
+
+/// MapInFilePages - Not yet implemented on win32.
+const char *Path::MapInFilePages(int FD, uint64_t FileSize) {
+ return 0;
+}
+
+/// MapInFilePages - Not yet implemented on win32.
+void Path::UnMapFilePages(const char *Base, uint64_t FileSize) {
+ assert(0 && "NOT IMPLEMENTED");
+}
+
+}
+}
diff --git a/lib/System/Win32/Process.inc b/lib/System/Win32/Process.inc
new file mode 100644
index 0000000..e1d7a92
--- /dev/null
+++ b/lib/System/Win32/Process.inc
@@ -0,0 +1,150 @@
+//===- Win32/Process.cpp - Win32 Process Implementation --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of the Process class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+#include <psapi.h>
+#include <malloc.h>
+#include <io.h>
+
+#ifdef __MINGW32__
+ #if (HAVE_LIBPSAPI != 1)
+ #error "libpsapi.a should be present"
+ #endif
+#else
+ #pragma comment(lib, "psapi.lib")
+#endif
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//=== and must not be UNIX code
+//===----------------------------------------------------------------------===//
+
+#ifdef __MINGW32__
+// This definition should be removed once MinGW 1.0+ defines this value itself.
+# define _HEAPOK (-2)
+#endif
+
+namespace llvm {
+using namespace sys;
+
+// This function retrieves the page size using GetSystemInfo and is present
+// solely so it can be called once in Process::GetPageSize to initialize the
+// static variable PageSize.
+inline unsigned GetPageSizeOnce() {
+ // NOTE: A 32-bit application running under WOW64 is supposed to use
+ // GetNativeSystemInfo. However, this interface is not present prior
+ // to Windows XP so to use it requires dynamic linking. It is not clear
+ // how this affects the reported page size, if at all. One could argue
+ // that LLVM ought to run as 64-bits on a 64-bit system, anyway.
+ SYSTEM_INFO info;
+ GetSystemInfo(&info);
+ return static_cast<unsigned>(info.dwPageSize);
+}
+
+unsigned
+Process::GetPageSize() {
+ static const unsigned PageSize = GetPageSizeOnce();
+ return PageSize;
+}
+
+size_t
+Process::GetMallocUsage()
+{
+ _HEAPINFO hinfo;
+ hinfo._pentry = NULL;
+
+ size_t size = 0;
+
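+  // Walk every block in the CRT heap, summing the size of each entry.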
+ while (_heapwalk(&hinfo) == _HEAPOK)
+ size += hinfo._size;
+
+ return size;
+}
+
+size_t
+Process::GetTotalMemoryUsage()
+{
+ PROCESS_MEMORY_COUNTERS pmc;
+ GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc));
+ return pmc.PagefileUsage;
+}
+
+void
+Process::GetTimeUsage(
+ TimeValue& elapsed, TimeValue& user_time, TimeValue& sys_time)
+{
+ elapsed = TimeValue::now();
+
+ uint64_t ProcCreate, ProcExit, KernelTime, UserTime;
+ GetProcessTimes(GetCurrentProcess(), (FILETIME*)&ProcCreate,
+ (FILETIME*)&ProcExit, (FILETIME*)&KernelTime,
+ (FILETIME*)&UserTime);
+
+ // FILETIME's are # of 100 nanosecond ticks (1/10th of a microsecond)
+ user_time.seconds( UserTime / 10000000 );
+ user_time.nanoseconds( unsigned(UserTime % 10000000) * 100 );
+ sys_time.seconds( KernelTime / 10000000 );
+ sys_time.nanoseconds( unsigned(KernelTime % 10000000) * 100 );
+}
+
+int Process::GetCurrentUserId()
+{
+ return 65536;
+}
+
+int Process::GetCurrentGroupId()
+{
+ return 65536;
+}
+
+// Some LLVM programs such as bugpoint produce core files as a normal part of
+// their operation. To prevent the disk from filling up, this function does
+// what's necessary to prevent their generation.
+void Process::PreventCoreFiles() {
+ // Windows doesn't do core files, but it does do modal pop-up message
+ // boxes. As this method is used by bugpoint, preventing these pop-ups
+ // is the moral equivalent of suppressing core files.
+ SetErrorMode(SEM_FAILCRITICALERRORS |
+ SEM_NOGPFAULTERRORBOX |
+ SEM_NOOPENFILEERRORBOX);
+}
+
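+// A handle attached to a console reports FILE_TYPE_CHAR; files and pipes do
+// not, which is how the three predicates below detect interactive streams.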
+bool Process::StandardInIsUserInput() {
+ return GetFileType((HANDLE)_get_osfhandle(0)) == FILE_TYPE_CHAR;
+}
+
+bool Process::StandardOutIsDisplayed() {
+ return GetFileType((HANDLE)_get_osfhandle(1)) == FILE_TYPE_CHAR;
+}
+
+bool Process::StandardErrIsDisplayed() {
+ return GetFileType((HANDLE)_get_osfhandle(2)) == FILE_TYPE_CHAR;
+}
+
+unsigned Process::StandardOutColumns() {
+ unsigned Columns = 0;
+ CONSOLE_SCREEN_BUFFER_INFO csbi;
+ if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi))
+ Columns = csbi.dwSize.X;
+ return Columns;
+}
+
+unsigned Process::StandardErrColumns() {
+ unsigned Columns = 0;
+ CONSOLE_SCREEN_BUFFER_INFO csbi;
+ if (GetConsoleScreenBufferInfo(GetStdHandle(STD_ERROR_HANDLE), &csbi))
+ Columns = csbi.dwSize.X;
+ return Columns;
+}
+
+}
diff --git a/lib/System/Win32/Program.inc b/lib/System/Win32/Program.inc
new file mode 100644
index 0000000..49086b8
--- /dev/null
+++ b/lib/System/Win32/Program.inc
@@ -0,0 +1,316 @@
+//===- Win32/Program.cpp - Win32 Program Implementation --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of the Program class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+#include <cstdio>
+#include <malloc.h>
+#include <io.h>
+#include <fcntl.h>
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//=== and must not be UNIX code
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+using namespace sys;
+
+// This function just uses the PATH environment variable to find the program.
+Path
+Program::FindProgramByName(const std::string& progName) {
+
+ // Check some degenerate cases
+ if (progName.length() == 0) // no program
+ return Path();
+ Path temp;
+ if (!temp.set(progName)) // invalid name
+ return Path();
+ if (temp.canExecute()) // already executable as is
+ return temp;
+
+  // At this point, the file name is valid and it's not executable.
+ // Let Windows search for it.
+ char buffer[MAX_PATH];
+ char *dummy = NULL;
+ DWORD len = SearchPath(NULL, progName.c_str(), ".exe", MAX_PATH,
+ buffer, &dummy);
+
+ // See if it wasn't found.
+ if (len == 0)
+ return Path();
+
+ // See if we got the entire path.
+ if (len < MAX_PATH)
+ return Path(buffer);
+
+ // Buffer was too small; grow and retry.
+ while (true) {
+ char *b = reinterpret_cast<char *>(_alloca(len+1));
+ DWORD len2 = SearchPath(NULL, progName.c_str(), ".exe", len+1, b, &dummy);
+
+ // It is unlikely the search failed, but it's always possible some file
+ // was added or removed since the last search, so be paranoid...
+ if (len2 == 0)
+ return Path();
+ else if (len2 <= len)
+ return Path(b);
+
+ len = len2;
+ }
+}
+
+static HANDLE RedirectIO(const Path *path, int fd, std::string* ErrMsg) {
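+  // A null path means "inherit": duplicate the current standard handle for
+  // the given fd. Otherwise fd 0 is opened for reading and fds 1 and 2 for
+  // writing, with an empty path mapped to the NUL device.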
+ HANDLE h;
+ if (path == 0) {
+ DuplicateHandle(GetCurrentProcess(), (HANDLE)_get_osfhandle(fd),
+ GetCurrentProcess(), &h,
+ 0, TRUE, DUPLICATE_SAME_ACCESS);
+ return h;
+ }
+
+ const char *fname;
+ if (path->isEmpty())
+ fname = "NUL";
+ else
+ fname = path->toString().c_str();
+
+ SECURITY_ATTRIBUTES sa;
+ sa.nLength = sizeof(sa);
+ sa.lpSecurityDescriptor = 0;
+ sa.bInheritHandle = TRUE;
+
+ h = CreateFile(fname, fd ? GENERIC_WRITE : GENERIC_READ, FILE_SHARE_READ,
+ &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
+ FILE_ATTRIBUTE_NORMAL, NULL);
+ if (h == INVALID_HANDLE_VALUE) {
+ MakeErrMsg(ErrMsg, std::string(fname) + ": Can't open file for " +
+ (fd ? "input: " : "output: "));
+ }
+
+ return h;
+}
+
+#ifdef __MINGW32__
+  // For some unknown reason, mingw32's w32api doesn't have this declaration.
+ extern "C"
+ BOOL WINAPI SetInformationJobObject(HANDLE hJob,
+ JOBOBJECTINFOCLASS JobObjectInfoClass,
+ LPVOID lpJobObjectInfo,
+ DWORD cbJobObjectInfoLength);
+#endif
+
+int
+Program::ExecuteAndWait(const Path& path,
+ const char** args,
+ const char** envp,
+ const Path** redirects,
+ unsigned secondsToWait,
+ unsigned memoryLimit,
+ std::string* ErrMsg) {
+ if (!path.canExecute()) {
+ if (ErrMsg)
+ *ErrMsg = "program not executable";
+ return -1;
+ }
+
+ // Windows wants a command line, not an array of args, to pass to the new
+ // process. We have to concatenate them all, while quoting the args that
+ // have embedded spaces.
+
+ // First, determine the length of the command line.
+ unsigned len = 0;
+ for (unsigned i = 0; args[i]; i++) {
+ len += strlen(args[i]) + 1;
+ if (strchr(args[i], ' '))
+ len += 2;
+ }
+
+ // Now build the command line.
+ char *command = reinterpret_cast<char *>(_alloca(len+1));
+ char *p = command;
+
+ for (unsigned i = 0; args[i]; i++) {
+ const char *arg = args[i];
+ size_t len = strlen(arg);
+ bool needsQuoting = strchr(arg, ' ') != 0;
+ if (needsQuoting)
+ *p++ = '"';
+ memcpy(p, arg, len);
+ p += len;
+ if (needsQuoting)
+ *p++ = '"';
+ *p++ = ' ';
+ }
+
+ *p = 0;
+
+ // The pointer to the environment block for the new process.
+ char *envblock = 0;
+
+ if (envp) {
+ // An environment block consists of a null-terminated block of
+ // null-terminated strings. Convert the array of environment variables to
+ // an environment block by concatenating them.
+
+ // First, determine the length of the environment block.
+ len = 0;
+ for (unsigned i = 0; envp[i]; i++)
+ len += strlen(envp[i]) + 1;
+
+ // Now build the environment block.
+ envblock = reinterpret_cast<char *>(_alloca(len+1));
+ p = envblock;
+
+ for (unsigned i = 0; envp[i]; i++) {
+ const char *ev = envp[i];
+ size_t len = strlen(ev) + 1;
+ memcpy(p, ev, len);
+ p += len;
+ }
+
+ *p = 0;
+ }
+
+ // Create a child process.
+ STARTUPINFO si;
+ memset(&si, 0, sizeof(si));
+ si.cb = sizeof(si);
+ si.hStdInput = INVALID_HANDLE_VALUE;
+ si.hStdOutput = INVALID_HANDLE_VALUE;
+ si.hStdError = INVALID_HANDLE_VALUE;
+
+ if (redirects) {
+ si.dwFlags = STARTF_USESTDHANDLES;
+
+ si.hStdInput = RedirectIO(redirects[0], 0, ErrMsg);
+ if (si.hStdInput == INVALID_HANDLE_VALUE) {
+ MakeErrMsg(ErrMsg, "can't redirect stdin");
+ return -1;
+ }
+ si.hStdOutput = RedirectIO(redirects[1], 1, ErrMsg);
+ if (si.hStdOutput == INVALID_HANDLE_VALUE) {
+ CloseHandle(si.hStdInput);
+ MakeErrMsg(ErrMsg, "can't redirect stdout");
+ return -1;
+ }
+ if (redirects[1] && redirects[2] && *(redirects[1]) == *(redirects[2])) {
+ // If stdout and stderr should go to the same place, redirect stderr
+ // to the handle already open for stdout.
+ DuplicateHandle(GetCurrentProcess(), si.hStdOutput,
+ GetCurrentProcess(), &si.hStdError,
+ 0, TRUE, DUPLICATE_SAME_ACCESS);
+ } else {
+ // Just redirect stderr
+ si.hStdError = RedirectIO(redirects[2], 2, ErrMsg);
+ if (si.hStdError == INVALID_HANDLE_VALUE) {
+ CloseHandle(si.hStdInput);
+ CloseHandle(si.hStdOutput);
+ MakeErrMsg(ErrMsg, "can't redirect stderr");
+ return -1;
+ }
+ }
+ }
+
+ PROCESS_INFORMATION pi;
+ memset(&pi, 0, sizeof(pi));
+
+ fflush(stdout);
+ fflush(stderr);
+ BOOL rc = CreateProcess(path.c_str(), command, NULL, NULL, TRUE, 0,
+ envblock, NULL, &si, &pi);
+ DWORD err = GetLastError();
+
+ // Regardless of whether the process got created or not, we are done with
+ // the handles we created for it to inherit.
+ CloseHandle(si.hStdInput);
+ CloseHandle(si.hStdOutput);
+ CloseHandle(si.hStdError);
+
+ // Now return an error if the process didn't get created.
+ if (!rc)
+ {
+ SetLastError(err);
+ MakeErrMsg(ErrMsg, std::string("Couldn't execute program '") +
+ path.toString() + "'");
+ return -1;
+ }
+
+ // Make sure these get closed no matter what.
+ AutoHandle hProcess(pi.hProcess);
+ AutoHandle hThread(pi.hThread);
+
+ // Assign the process to a job if a memory limit is defined.
+ AutoHandle hJob(0);
+ if (memoryLimit != 0) {
+ hJob = CreateJobObject(0, 0);
+ bool success = false;
+ if (hJob != 0) {
+ JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli;
+ memset(&jeli, 0, sizeof(jeli));
+ jeli.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_PROCESS_MEMORY;
+ jeli.ProcessMemoryLimit = uintptr_t(memoryLimit) * 1048576;
+ if (SetInformationJobObject(hJob, JobObjectExtendedLimitInformation,
+ &jeli, sizeof(jeli))) {
+ if (AssignProcessToJobObject(hJob, pi.hProcess))
+ success = true;
+ }
+ }
+ if (!success) {
+ SetLastError(GetLastError());
+ MakeErrMsg(ErrMsg, std::string("Unable to set memory limit"));
+ TerminateProcess(pi.hProcess, 1);
+ WaitForSingleObject(pi.hProcess, INFINITE);
+ return -1;
+ }
+ }
+
+ // Wait for it to terminate.
+ DWORD millisecondsToWait = INFINITE;
+ if (secondsToWait > 0)
+ millisecondsToWait = secondsToWait * 1000;
+
+ if (WaitForSingleObject(pi.hProcess, millisecondsToWait) == WAIT_TIMEOUT) {
+ if (!TerminateProcess(pi.hProcess, 1)) {
+ MakeErrMsg(ErrMsg, std::string("Failed to terminate timed-out program '")
+ + path.toString() + "'");
+ return -1;
+ }
+ WaitForSingleObject(pi.hProcess, INFINITE);
+ }
+
+ // Get its exit status.
+ DWORD status;
+ rc = GetExitCodeProcess(pi.hProcess, &status);
+ err = GetLastError();
+
+ if (!rc) {
+ SetLastError(err);
+ MakeErrMsg(ErrMsg, std::string("Failed getting status for program '") +
+ path.toString() + "'");
+ return -1;
+ }
+
+ return status;
+}
+
+bool Program::ChangeStdinToBinary(){
+ int result = _setmode( _fileno(stdin), _O_BINARY );
+ return result == -1;
+}
+
+bool Program::ChangeStdoutToBinary(){
+ int result = _setmode( _fileno(stdout), _O_BINARY );
+ return result == -1;
+}
+
+}
diff --git a/lib/System/Win32/Signals.inc b/lib/System/Win32/Signals.inc
new file mode 100644
index 0000000..3a8f77e
--- /dev/null
+++ b/lib/System/Win32/Signals.inc
@@ -0,0 +1,270 @@
+//===- Win32/Signals.inc - Win32 Signals Implementation ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of the Signals class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+#include <stdio.h>
+#include <vector>
+#include <algorithm>
+
+#ifdef __MINGW32__
+ #include <imagehlp.h>
+#else
+ #include <dbghelp.h>
+#endif
+#include <psapi.h>
+
+#ifdef __MINGW32__
+ #if ((HAVE_LIBIMAGEHLP != 1) || (HAVE_LIBPSAPI != 1))
+ #error "libimagehlp.a & libpsapi.a should be present"
+ #endif
+#else
+ #pragma comment(lib, "psapi.lib")
+ #pragma comment(lib, "dbghelp.lib")
+#endif
+
+// Forward declare.
+static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep);
+static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType);
+
+// InterruptFunction - The function to call if ctrl-c is pressed.
+static void (*InterruptFunction)() = 0;
+
+static std::vector<llvm::sys::Path> *FilesToRemove = NULL;
+static std::vector<std::pair<void(*)(void*), void*> > *CallBacksToRun = 0;
+static bool RegisteredUnhandledExceptionFilter = false;
+static bool CleanupExecuted = false;
+static PTOP_LEVEL_EXCEPTION_FILTER OldFilter = NULL;
+
+// Windows creates a new thread to execute the console handler when an event
+// (such as CTRL/C) occurs. This causes concurrency issues with the globals
+// above, which this critical section addresses.
+static CRITICAL_SECTION CriticalSection;
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//=== and must not be UNIX code
+//===----------------------------------------------------------------------===//
+
+
+static void RegisterHandler() {
+ if (RegisteredUnhandledExceptionFilter) {
+ EnterCriticalSection(&CriticalSection);
+ return;
+ }
+
+ // Now's the time to create the critical section. This is the first time
+ // through here, and there's only one thread.
+ InitializeCriticalSection(&CriticalSection);
+
+ // Enter it immediately. Now if someone hits CTRL/C, the console handler
+ // can't proceed until the globals are updated.
+ EnterCriticalSection(&CriticalSection);
+
+ RegisteredUnhandledExceptionFilter = true;
+ OldFilter = SetUnhandledExceptionFilter(LLVMUnhandledExceptionFilter);
+ SetConsoleCtrlHandler(LLVMConsoleCtrlHandler, TRUE);
+
+ // IMPORTANT NOTE: Caller must call LeaveCriticalSection(&CriticalSection) or
+ // else multi-threading problems will ensue.
+}
+
+// RemoveFileOnSignal - The public API
+bool sys::RemoveFileOnSignal(const sys::Path &Filename, std::string* ErrMsg) {
+ RegisterHandler();
+
+ if (CleanupExecuted) {
+ if (ErrMsg)
+ *ErrMsg = "Process terminating -- cannot register for removal";
+ return true;
+ }
+
+ if (FilesToRemove == NULL)
+ FilesToRemove = new std::vector<sys::Path>;
+
+ FilesToRemove->push_back(Filename);
+
+ LeaveCriticalSection(&CriticalSection);
+ return false;
+}
+
+/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
+/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
+void sys::PrintStackTraceOnErrorSignal() {
+ RegisterHandler();
+ LeaveCriticalSection(&CriticalSection);
+}
+
+
+void sys::SetInterruptFunction(void (*IF)()) {
+ RegisterHandler();
+ InterruptFunction = IF;
+ LeaveCriticalSection(&CriticalSection);
+}
+
+
+/// AddSignalHandler - Add a function to be called when a signal is delivered
+/// to the process. The handler can have a cookie passed to it to identify
+/// what instance of the handler it is.
+void sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
+ if (CallBacksToRun == 0)
+ CallBacksToRun = new std::vector<std::pair<void(*)(void*), void*> >();
+ CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie));
+ RegisterHandler();
+}
+}
+
+static void Cleanup() {
+ EnterCriticalSection(&CriticalSection);
+
+  // Prevent other threads from registering new files and directories for
+ // removal, should we be executing because of the console handler callback.
+ CleanupExecuted = true;
+
+ // FIXME: open files cannot be deleted.
+
+ if (FilesToRemove != NULL)
+ while (!FilesToRemove->empty()) {
+ try {
+ FilesToRemove->back().eraseFromDisk();
+ } catch (...) {
+ }
+ FilesToRemove->pop_back();
+ }
+
+ if (CallBacksToRun)
+ for (unsigned i = 0, e = CallBacksToRun->size(); i != e; ++i)
+ (*CallBacksToRun)[i].first((*CallBacksToRun)[i].second);
+
+ LeaveCriticalSection(&CriticalSection);
+}
+
+static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
+ try {
+ Cleanup();
+
+#ifdef _WIN64
+  // TODO: provide an x64-friendly version of the following
+#else
+
+ // Initialize the STACKFRAME structure.
+ STACKFRAME StackFrame;
+ memset(&StackFrame, 0, sizeof(StackFrame));
+
+ StackFrame.AddrPC.Offset = ep->ContextRecord->Eip;
+ StackFrame.AddrPC.Mode = AddrModeFlat;
+ StackFrame.AddrStack.Offset = ep->ContextRecord->Esp;
+ StackFrame.AddrStack.Mode = AddrModeFlat;
+ StackFrame.AddrFrame.Offset = ep->ContextRecord->Ebp;
+ StackFrame.AddrFrame.Mode = AddrModeFlat;
+
+ HANDLE hProcess = GetCurrentProcess();
+ HANDLE hThread = GetCurrentThread();
+
+ // Initialize the symbol handler.
+ SymSetOptions(SYMOPT_DEFERRED_LOADS|SYMOPT_LOAD_LINES);
+ SymInitialize(hProcess, NULL, TRUE);
+
+ while (true) {
+ if (!StackWalk(IMAGE_FILE_MACHINE_I386, hProcess, hThread, &StackFrame,
+ ep->ContextRecord, NULL, SymFunctionTableAccess,
+ SymGetModuleBase, NULL)) {
+ break;
+ }
+
+ if (StackFrame.AddrFrame.Offset == 0)
+ break;
+
+ // Print the PC in hexadecimal.
+ DWORD PC = StackFrame.AddrPC.Offset;
+ fprintf(stderr, "%08lX", PC);
+
+ // Print the parameters. Assume there are four.
+ fprintf(stderr, " (0x%08lX 0x%08lX 0x%08lX 0x%08lX)", StackFrame.Params[0],
+ StackFrame.Params[1], StackFrame.Params[2], StackFrame.Params[3]);
+
+ // Verify the PC belongs to a module in this process.
+ if (!SymGetModuleBase(hProcess, PC)) {
+ fputs(" <unknown module>\n", stderr);
+ continue;
+ }
+
+ // Print the symbol name.
+ char buffer[512];
+ IMAGEHLP_SYMBOL *symbol = reinterpret_cast<IMAGEHLP_SYMBOL *>(buffer);
+ memset(symbol, 0, sizeof(IMAGEHLP_SYMBOL));
+ symbol->SizeOfStruct = sizeof(IMAGEHLP_SYMBOL);
+ symbol->MaxNameLength = 512 - sizeof(IMAGEHLP_SYMBOL);
+
+ DWORD dwDisp;
+ if (!SymGetSymFromAddr(hProcess, PC, &dwDisp, symbol)) {
+ fputc('\n', stderr);
+ continue;
+ }
+
+ buffer[511] = 0;
+ if (dwDisp > 0)
+ fprintf(stderr, ", %s()+%04lu bytes(s)", symbol->Name, dwDisp);
+ else
+ fprintf(stderr, ", %s", symbol->Name);
+
+ // Print the source file and line number information.
+ IMAGEHLP_LINE line;
+ memset(&line, 0, sizeof(line));
+ line.SizeOfStruct = sizeof(line);
+ if (SymGetLineFromAddr(hProcess, PC, &dwDisp, &line)) {
+ fprintf(stderr, ", %s, line %lu", line.FileName, line.LineNumber);
+ if (dwDisp > 0)
+ fprintf(stderr, "+%04lu byte(s)", dwDisp);
+ }
+
+ fputc('\n', stderr);
+ }
+
+#endif
+
+ } catch (...) {
+ assert(0 && "Crashed in LLVMUnhandledExceptionFilter");
+ }
+
+ // Allow dialog box to pop up allowing choice to start debugger.
+ if (OldFilter)
+ return (*OldFilter)(ep);
+ else
+ return EXCEPTION_CONTINUE_SEARCH;
+}
+
+static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType) {
+ // We are running in our very own thread, courtesy of Windows.
+ EnterCriticalSection(&CriticalSection);
+ Cleanup();
+
+  // If an interrupt function has been set, go and run it; otherwise the
+  // process dies.
+ void (*IF)() = InterruptFunction;
+ InterruptFunction = 0; // Don't run it on another CTRL-C.
+
+ if (IF) {
+ // Note: if the interrupt function throws an exception, there is nothing
+ // to catch it in this thread so it will kill the process.
+ IF(); // Run it now.
+ LeaveCriticalSection(&CriticalSection);
+ return TRUE; // Don't kill the process.
+ }
+
+ // Allow normal processing to take place; i.e., the process dies.
+ LeaveCriticalSection(&CriticalSection);
+ return FALSE;
+}
+
diff --git a/lib/System/Win32/TimeValue.inc b/lib/System/Win32/TimeValue.inc
new file mode 100644
index 0000000..0ca87d4
--- /dev/null
+++ b/lib/System/Win32/TimeValue.inc
@@ -0,0 +1,51 @@
+//===- Win32/TimeValue.inc - Win32 TimeValue Implementation -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 implementation of the TimeValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Win32.h"
+#include <time.h>
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code.
+//===----------------------------------------------------------------------===//
+
+TimeValue TimeValue::now() {
+ uint64_t ft;
+ GetSystemTimeAsFileTime(reinterpret_cast<FILETIME *>(&ft));
+
+ TimeValue t(0, 0);
+ t.fromWin32Time(ft);
+ return t;
+}
+
+std::string TimeValue::toString() const {
+#ifdef __MINGW32__
+ // This ban may be lifted by either:
+  // (i) a future MinGW version other than 1.0 inheriting the __time64_t type, or
+  // (ii) a configure test for either the time_t or __time64_t type.
+ time_t ourTime = time_t(this->toEpochTime());
+ struct tm *lt = ::localtime(&ourTime);
+#else
+ __time64_t ourTime = this->toEpochTime();
+ struct tm *lt = ::_localtime64(&ourTime);
+#endif
+
+ char buffer[25];
+ strftime(buffer, 25, "%a %b %d %H:%M:%S %Y", lt);
+ return std::string(buffer);
+}
+
+
+}
diff --git a/lib/System/Win32/Win32.h b/lib/System/Win32/Win32.h
new file mode 100644
index 0000000..8f505b1
--- /dev/null
+++ b/lib/System/Win32/Win32.h
@@ -0,0 +1,57 @@
+//===- Win32/Win32.h - Common Win32 Include File ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines things specific to Win32 implementations.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Win32 code that
+//=== is guaranteed to work on *all* Win32 variants.
+//===----------------------------------------------------------------------===//
+
+// Require at least Windows 2000 API.
+#define _WIN32_WINNT 0x0500
+
+#include "llvm/Config/config.h" // Get autoconf configuration settings
+#include "windows.h"
+#include <cassert>
+#include <string>
+
+inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) {
+ if (!ErrMsg)
+ return true;
+ char *buffer = NULL;
+ FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL);
+ *ErrMsg = prefix + buffer;
+ LocalFree(buffer);
+ return true;
+}
+
+class AutoHandle {
+ HANDLE handle;
+
+public:
+ AutoHandle(HANDLE h) : handle(h) {}
+
+ ~AutoHandle() {
+ if (handle)
+ CloseHandle(handle);
+ }
+
+ operator HANDLE() {
+ return handle;
+ }
+
+ AutoHandle &operator=(HANDLE h) {
+ handle = h;
+ return *this;
+ }
+};
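+
+// Editor's sketch (not in the original source): AutoHandle is a minimal RAII
+// guard; a typical use mirrors the Program code above:
+//
+//   PROCESS_INFORMATION pi = ...;
+//   AutoHandle hProcess(pi.hProcess); // CloseHandle runs on scope exit.
+//
+// Note that operator= does not close a previously held handle, so it should
+// only be used to fill an AutoHandle that currently holds 0, as Program.inc
+// does with hJob.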
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
new file mode 100644
index 0000000..ac7de91
--- /dev/null
+++ b/lib/Target/ARM/ARM.h
@@ -0,0 +1,121 @@
+//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// ARM back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_H
+#define TARGET_ARM_H
+
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
+namespace llvm {
+
+class ARMTargetMachine;
+class FunctionPass;
+class MachineCodeEmitter;
+class JITCodeEmitter;
+class raw_ostream;
+
+// Enums corresponding to ARM condition codes
+namespace ARMCC {
+ // The CondCodes constants map directly to the 4-bit encoding of the
+ // condition field for predicated instructions.
+ enum CondCodes {
+ EQ,
+ NE,
+ HS,
+ LO,
+ MI,
+ PL,
+ VS,
+ VC,
+ HI,
+ LS,
+ GE,
+ LT,
+ GT,
+ LE,
+ AL
+ };
+
+ inline static CondCodes getOppositeCondition(CondCodes CC){
+ switch (CC) {
+ default: assert(0 && "Unknown condition code");
+ case EQ: return NE;
+ case NE: return EQ;
+ case HS: return LO;
+ case LO: return HS;
+ case MI: return PL;
+ case PL: return MI;
+ case VS: return VC;
+ case VC: return VS;
+ case HI: return LS;
+ case LS: return HI;
+ case GE: return LT;
+ case LT: return GE;
+ case GT: return LE;
+ case LE: return GT;
+ }
+ }
+}
+
+inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown condition code");
+ case ARMCC::EQ: return "eq";
+ case ARMCC::NE: return "ne";
+ case ARMCC::HS: return "hs";
+ case ARMCC::LO: return "lo";
+ case ARMCC::MI: return "mi";
+ case ARMCC::PL: return "pl";
+ case ARMCC::VS: return "vs";
+ case ARMCC::VC: return "vc";
+ case ARMCC::HI: return "hi";
+ case ARMCC::LS: return "ls";
+ case ARMCC::GE: return "ge";
+ case ARMCC::LT: return "lt";
+ case ARMCC::GT: return "gt";
+ case ARMCC::LE: return "le";
+ case ARMCC::AL: return "al";
+ }
+}
+
+FunctionPass *createARMISelDag(ARMTargetMachine &TM);
+FunctionPass *createARMCodePrinterPass(raw_ostream &O,
+ ARMTargetMachine &TM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose);
+FunctionPass *createARMCodeEmitterPass(ARMTargetMachine &TM,
+                                       MachineCodeEmitter &MCE);
+FunctionPass *createARMJITCodeEmitterPass(ARMTargetMachine &TM,
+                                          JITCodeEmitter &JCE);
+
+FunctionPass *createARMLoadStoreOptimizationPass();
+FunctionPass *createARMConstantIslandPass();
+
+} // end namespace llvm;
+
+// Defines symbolic names for ARM registers. This defines a mapping from
+// register name to register number.
+//
+#include "ARMGenRegisterNames.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#include "ARMGenInstrNames.inc"
+
+
+#endif
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
new file mode 100644
index 0000000..4ac6857
--- /dev/null
+++ b/lib/Target/ARM/ARM.td
@@ -0,0 +1,136 @@
+//===- ARM.td - Describe the ARM Target Machine -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget features.
+//
+
+def ArchV4T : SubtargetFeature<"v4t", "ARMArchVersion", "V4T",
+ "ARM v4T">;
+def ArchV5T : SubtargetFeature<"v5t", "ARMArchVersion", "V5T",
+ "ARM v5T">;
+def ArchV5TE : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE",
+ "ARM v5TE, v5TEj, v5TExp">;
+def ArchV6 : SubtargetFeature<"v6", "ARMArchVersion", "V6",
+ "ARM v6">;
+def ArchV7A : SubtargetFeature<"v7a", "ARMArchVersion", "V7A",
+ "ARM v7A">;
+def FeatureVFP2 : SubtargetFeature<"vfp2", "ARMFPUType", "VFPv2",
+ "Enable VFP2 instructions">;
+def FeatureVFP3 : SubtargetFeature<"vfp3", "ARMFPUType", "VFPv3",
+ "Enable VFP3 instructions">;
+def FeatureNEON : SubtargetFeature<"neon", "ARMFPUType", "NEON",
+ "Enable NEON instructions">;
+def FeatureThumb2 : SubtargetFeature<"thumb2", "ThumbMode", "Thumb2",
+ "Enable Thumb2 instructions">;
+
+//===----------------------------------------------------------------------===//
+// ARM Processors supported.
+//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+// V4 Processors.
+def : Proc<"generic", []>;
+def : Proc<"arm8", []>;
+def : Proc<"arm810", []>;
+def : Proc<"strongarm", []>;
+def : Proc<"strongarm110", []>;
+def : Proc<"strongarm1100", []>;
+def : Proc<"strongarm1110", []>;
+
+// V4T Processors.
+def : Proc<"arm7tdmi", [ArchV4T]>;
+def : Proc<"arm7tdmi-s", [ArchV4T]>;
+def : Proc<"arm710t", [ArchV4T]>;
+def : Proc<"arm720t", [ArchV4T]>;
+def : Proc<"arm9", [ArchV4T]>;
+def : Proc<"arm9tdmi", [ArchV4T]>;
+def : Proc<"arm920", [ArchV4T]>;
+def : Proc<"arm920t", [ArchV4T]>;
+def : Proc<"arm922t", [ArchV4T]>;
+def : Proc<"arm940t", [ArchV4T]>;
+def : Proc<"ep9312", [ArchV4T]>;
+
+// V5T Processors.
+def : Proc<"arm10tdmi", [ArchV5T]>;
+def : Proc<"arm1020t", [ArchV5T]>;
+
+// V5TE Processors.
+def : Proc<"arm9e", [ArchV5TE]>;
+def : Proc<"arm926ej-s", [ArchV5TE]>;
+def : Proc<"arm946e-s", [ArchV5TE]>;
+def : Proc<"arm966e-s", [ArchV5TE]>;
+def : Proc<"arm968e-s", [ArchV5TE]>;
+def : Proc<"arm10e", [ArchV5TE]>;
+def : Proc<"arm1020e", [ArchV5TE]>;
+def : Proc<"arm1022e", [ArchV5TE]>;
+def : Proc<"xscale", [ArchV5TE]>;
+def : Proc<"iwmmxt", [ArchV5TE]>;
+
+// V6 Processors.
+def : Proc<"arm1136j-s", [ArchV6]>;
+def : Proc<"arm1136jf-s", [ArchV6, FeatureVFP2]>;
+def : Proc<"arm1176jz-s", [ArchV6]>;
+def : Proc<"arm1176jzf-s", [ArchV6, FeatureVFP2]>;
+def : Proc<"mpcorenovfp", [ArchV6]>;
+def : Proc<"mpcore", [ArchV6, FeatureVFP2]>;
+
+def : Proc<"arm1156t2-s", [ArchV6, FeatureThumb2]>;
+def : Proc<"arm1156t2f-s", [ArchV6, FeatureThumb2, FeatureVFP2]>;
+
+def : Proc<"cortex-a8", [ArchV7A, FeatureThumb2, FeatureNEON]>;
+def : Proc<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARMRegisterInfo.td"
+
+include "ARMCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrInfo.td"
+
+def ARMInstrInfo : InstrInfo {
+  // Define how we want to lay out our target-specific information field.
+ let TSFlagsFields = ["AddrModeBits",
+ "SizeFlag",
+ "IndexModeBits",
+ "isUnaryDataProc",
+ "Form"];
+ let TSFlagsShifts = [0,
+ 4,
+ 7,
+ 9,
+ 10];
+}
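+
+// Editor's note (worked layout, not in the original source): with the shifts
+// above, the TSFlags word is packed as
+//   bits 0-3  AddrModeBits
+//   bits 4-6  SizeFlag
+//   bits 7-8  IndexModeBits
+//   bit  9    isUnaryDataProc
+//   bits 10+  Form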
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def ARM : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = ARMInstrInfo;
+}
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
new file mode 100644
index 0000000..6d9b9ee
--- /dev/null
+++ b/lib/Target/ARM/ARMAddressingModes.h
@@ -0,0 +1,394 @@
+//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// ARM_AM - ARM Addressing Mode Stuff
+namespace ARM_AM {
+ enum ShiftOpc {
+ no_shift = 0,
+ asr,
+ lsl,
+ lsr,
+ ror,
+ rrx
+ };
+
+ enum AddrOpc {
+ add = '+', sub = '-'
+ };
+
+ static inline const char *getShiftOpcStr(ShiftOpc Op) {
+ switch (Op) {
+ default: assert(0 && "Unknown shift opc!");
+ case ARM_AM::asr: return "asr";
+ case ARM_AM::lsl: return "lsl";
+ case ARM_AM::lsr: return "lsr";
+ case ARM_AM::ror: return "ror";
+ case ARM_AM::rrx: return "rrx";
+ }
+ }
+
+ static inline ShiftOpc getShiftOpcForNode(SDValue N) {
+ switch (N.getOpcode()) {
+ default: return ARM_AM::no_shift;
+ case ISD::SHL: return ARM_AM::lsl;
+ case ISD::SRL: return ARM_AM::lsr;
+ case ISD::SRA: return ARM_AM::asr;
+ case ISD::ROTR: return ARM_AM::ror;
+ //case ISD::ROTL: // Only if imm -> turn into ROTR.
+ // Can't handle RRX here, because it would require folding a flag into
+ // the addressing mode. :( This causes us to miss certain things.
+ //case ARMISD::RRX: return ARM_AM::rrx;
+ }
+ }
+
+ enum AMSubMode {
+ bad_am_submode = 0,
+ ia,
+ ib,
+ da,
+ db
+ };
+
+ static inline const char *getAMSubModeStr(AMSubMode Mode) {
+ switch (Mode) {
+ default: assert(0 && "Unknown addressing sub-mode!");
+ case ARM_AM::ia: return "ia";
+ case ARM_AM::ib: return "ib";
+ case ARM_AM::da: return "da";
+ case ARM_AM::db: return "db";
+ }
+ }
+
+ static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) {
+ switch (Mode) {
+ default: assert(0 && "Unknown addressing sub-mode!");
+ case ARM_AM::ia: return isLD ? "fd" : "ea";
+ case ARM_AM::ib: return isLD ? "ed" : "fa";
+ case ARM_AM::da: return isLD ? "fa" : "ed";
+ case ARM_AM::db: return isLD ? "ea" : "fd";
+ }
+ }
+
+ /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
+ ///
+ static inline unsigned rotr32(unsigned Val, unsigned Amt) {
+ assert(Amt < 32 && "Invalid rotate amount");
+ return (Val >> Amt) | (Val << ((32-Amt)&31));
+ }
+
+ /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
+ ///
+ static inline unsigned rotl32(unsigned Val, unsigned Amt) {
+ assert(Amt < 32 && "Invalid rotate amount");
+ return (Val << Amt) | (Val >> ((32-Amt)&31));
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #1: shift_operand with registers
+ //===--------------------------------------------------------------------===//
+ //
+ // This 'addressing mode' is used for arithmetic instructions. It can
+ // represent things like:
+ // reg
+ // reg [asr|lsl|lsr|ror|rrx] reg
+ // reg [asr|lsl|lsr|ror|rrx] imm
+ //
+  // This is stored as three operands [rega, regb, opc]. The first is the base
+ // reg, the second is the shift amount (or reg0 if not present or imm). The
+ // third operand encodes the shift opcode and the imm if a reg isn't present.
+ //
+ static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
+ return ShOp | (Imm << 3);
+ }
+ static inline unsigned getSORegOffset(unsigned Op) {
+ return Op >> 3;
+ }
+ static inline ShiftOpc getSORegShOp(unsigned Op) {
+ return (ShiftOpc)(Op & 7);
+ }
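+
+  // Editor's worked example (not in the original source): encoding an
+  // immediate shift of "lsl #5" (ShiftOpc lsl == 2):
+  //   getSORegOpc(lsl, 5) == 2 | (5 << 3) == 42
+  //   getSORegShOp(42)    == (ShiftOpc)(42 & 7) == lsl
+  //   getSORegOffset(42)  == 42 >> 3 == 5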
+
+ /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
+ /// the 8-bit imm value.
+ static inline unsigned getSOImmValImm(unsigned Imm) {
+ return Imm & 0xFF;
+ }
+ /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
+ /// the rotate amount.
+ static inline unsigned getSOImmValRot(unsigned Imm) {
+ return (Imm >> 8) * 2;
+ }
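+
+  // Editor's worked example (not in the original source): decoding the
+  // 12-bit encoding 0xC0A:
+  //   getSOImmValImm(0xC0A) == 0x0A           (the 8-bit immediate)
+  //   getSOImmValRot(0xC0A) == 0xC * 2 == 24  (the rotate-right amount)
+  // so the materialized value is rotr32(0x0A, 24) == 0x00000A00.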
+
+ /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
+ /// computing the rotate amount to use. If this immediate value cannot be
+ /// handled with a single shifter-op, determine a good rotate amount that will
+ /// take a maximal chunk of bits out of the immediate.
+ static inline unsigned getSOImmValRotate(unsigned Imm) {
+ // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+ // of zero.
+ if ((Imm & ~255U) == 0) return 0;
+
+ // Use CTZ to compute the rotate amount.
+ unsigned TZ = CountTrailingZeros_32(Imm);
+
+ // Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
+ // not 9.
+ unsigned RotAmt = TZ & ~1;
+
+ // If we can handle this spread, return it.
+ if ((rotr32(Imm, RotAmt) & ~255U) == 0)
+ return (32-RotAmt)&31; // HW rotates right, not left.
+
+ // For values like 0xF000000F, we should skip the first run of ones, then
+ // retry the hunt.
+ if (Imm & 1) {
+ unsigned TrailingOnes = CountTrailingZeros_32(~Imm);
+ if (TrailingOnes != 32) { // Avoid overflow on 0xFFFFFFFF
+        // Restart the search for a high-order bit after the initial run of
+        // ones.
+ unsigned TZ2 = CountTrailingZeros_32(Imm & ~((1 << TrailingOnes)-1));
+
+ // Rotate amount must be even.
+ unsigned RotAmt2 = TZ2 & ~1;
+
+ // If this fits, use it.
+ if (RotAmt2 != 32 && (rotr32(Imm, RotAmt2) & ~255U) == 0)
+ return (32-RotAmt2)&31; // HW rotates right, not left.
+ }
+ }
+
+ // Otherwise, we have no way to cover this span of bits with a single
+ // shifter_op immediate. Return a chunk of bits that will be useful to
+ // handle.
+ return (32-RotAmt)&31; // HW rotates right, not left.
+ }
+
+ /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into a shifter_operand immediate operand, return the 12-bit encoding for
+ /// it. If not, return -1.
+ static inline int getSOImmVal(unsigned Arg) {
+ // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+ // of zero.
+ if ((Arg & ~255U) == 0) return Arg;
+
+ unsigned RotAmt = getSOImmValRotate(Arg);
+
+ // If this cannot be handled with a single shifter_op, bail out.
+ if (rotr32(~255U, RotAmt) & Arg)
+ return -1;
+
+ // Encode this correctly.
+ return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
+ }
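+
+  // Editor's worked example (not in the original source): for Arg == 0xA00,
+  // getSOImmValRotate returns 24 (TZ == 9, RotAmt == 8, and
+  // rotr32(0xA00, 8) == 0x0A fits in 8 bits), so
+  //   getSOImmVal(0xA00) == rotl32(0xA00, 24) | ((24 >> 1) << 8)
+  //                      == 0x0A | 0xC00 == 0xC0A
+  // which round-trips with the decode example above.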
+
+ /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
+ /// or'ing together two SOImmVal's.
+ static inline bool isSOImmTwoPartVal(unsigned V) {
+ // If this can be handled with a single shifter_op, bail out.
+ V = rotr32(~255U, getSOImmValRotate(V)) & V;
+ if (V == 0)
+ return false;
+
+ // If this can be handled with two shifter_op's, accept.
+ V = rotr32(~255U, getSOImmValRotate(V)) & V;
+ return V == 0;
+ }
+
+ /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
+ /// return the first chunk of it.
+ static inline unsigned getSOImmTwoPartFirst(unsigned V) {
+ return rotr32(255U, getSOImmValRotate(V)) & V;
+ }
+
+ /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
+ /// return the second chunk of it.
+ static inline unsigned getSOImmTwoPartSecond(unsigned V) {
+ // Mask out the first hunk.
+ V = rotr32(~255U, getSOImmValRotate(V)) & V;
+
+ // Take what's left.
+ assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
+ return V;
+ }
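+
+  // Editor's worked example (not in the original source): V == 0x00FF00FF is
+  // not a single shifter_op immediate, but it is a two-part value:
+  //   getSOImmTwoPartFirst(0x00FF00FF)  == 0x000000FF
+  //   getSOImmTwoPartSecond(0x00FF00FF) == 0x00FF0000
+  // so it can be materialized as a mov of the first chunk followed by an orr
+  // of the second (see emitMOVi2piecesInstruction in ARMCodeEmitter.cpp).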
+
+  /// getThumbImmValShift - Try to handle Imm with an 8-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImmValShift(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially immediate operands with a shift
+    // of zero.
+ if ((Imm & ~255U) == 0) return 0;
+
+ // Use CTZ to compute the shift amount.
+ return CountTrailingZeros_32(Imm);
+ }
+
+  /// isThumbImmShiftedVal - Return true if the specified value can be obtained
+  /// by left shifting an 8-bit immediate.
+  static inline bool isThumbImmShiftedVal(unsigned V) {
+    // If V can be obtained by left shifting an 8-bit immediate, masking off
+    // that shifted immediate leaves nothing behind.
+    V = (~255U << getThumbImmValShift(V)) & V;
+ return V == 0;
+ }
+
+ /// getThumbImmNonShiftedVal - If V is a value that satisfies
+  /// isThumbImmShiftedVal, return the non-shifted value.
+ static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
+ return V >> getThumbImmValShift(V);
+ }
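+
+  // Editor's worked example (not in the original source): V == 0x2A00 is
+  // 0x15 << 9, so
+  //   getThumbImmValShift(0x2A00)      == 9
+  //   isThumbImmShiftedVal(0x2A00)     == true
+  //   getThumbImmNonShiftedVal(0x2A00) == 0x15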
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #2
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for most simple load/store instructions.
+ //
+ // addrmode2 := reg +/- reg shop imm
+ // addrmode2 := reg +/- imm12
+ //
+ // The first operand is always a Reg. The second operand is a reg if in
+ // reg/reg form, otherwise it's reg#0. The third field encodes the operation
+ // in bit 12, the immediate in bits 0-11, and the shift op in 13-15.
+ //
+ // If this addressing mode is a frame index (before prolog/epilog insertion
+ // and code rewriting), this operand will have the form: FI#, reg0, <offs>
+ // with no shift amount for the frame offset.
+ //
+ static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) {
+ assert(Imm12 < (1 << 12) && "Imm too large!");
+ bool isSub = Opc == sub;
+ return Imm12 | ((int)isSub << 12) | (SO << 13);
+ }
+ static inline unsigned getAM2Offset(unsigned AM2Opc) {
+ return AM2Opc & ((1 << 12)-1);
+ }
+ static inline AddrOpc getAM2Op(unsigned AM2Opc) {
+ return ((AM2Opc >> 12) & 1) ? sub : add;
+ }
+ static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
+ return (ShiftOpc)(AM2Opc >> 13);
+ }
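+
+  // Editor's worked example (not in the original source): the "reg - imm12"
+  // form with an offset of 100:
+  //   getAM2Opc(sub, 100, no_shift) == 100 | (1 << 12) == 0x1064
+  //   getAM2Offset(0x1064)   == 100
+  //   getAM2Op(0x1064)       == sub
+  //   getAM2ShiftOpc(0x1064) == no_shift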
+
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #3
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for sign-extending loads, and load/store-pair instructions.
+ //
+ // addrmode3 := reg +/- reg
+ // addrmode3 := reg +/- imm8
+ //
+ // The first operand is always a Reg. The second operand is a reg if in
+ // reg/reg form, otherwise it's reg#0. The third field encodes the operation
+ // in bit 8, the immediate in bits 0-7.
+
+ /// getAM3Opc - This function encodes the addrmode3 opc field.
+ static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) {
+ bool isSub = Opc == sub;
+ return ((int)isSub << 8) | Offset;
+ }
+ static inline unsigned char getAM3Offset(unsigned AM3Opc) {
+ return AM3Opc & 0xFF;
+ }
+ static inline AddrOpc getAM3Op(unsigned AM3Opc) {
+ return ((AM3Opc >> 8) & 1) ? sub : add;
+ }
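+
+  // Editor's worked example (not in the original source): the "reg - imm8"
+  // form with an offset of 12:
+  //   getAM3Opc(sub, 12)  == (1 << 8) | 12 == 0x10C
+  //   getAM3Offset(0x10C) == 12
+  //   getAM3Op(0x10C)     == sub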
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #4
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for load / store multiple instructions.
+ //
+ // addrmode4 := reg, <mode>
+ //
+ // The four modes are:
+ // IA - Increment after
+ // IB - Increment before
+ // DA - Decrement after
+ // DB - Decrement before
+ //
+  // If the 4th bit (writeback) is set, then the base register is updated after
+ // the memory transfer.
+
+ static inline AMSubMode getAM4SubMode(unsigned Mode) {
+ return (AMSubMode)(Mode & 0x7);
+ }
+
+ static inline unsigned getAM4ModeImm(AMSubMode SubMode, bool WB = false) {
+ return (int)SubMode | ((int)WB << 3);
+ }
+
+ static inline bool getAM4WBFlag(unsigned Mode) {
+ return (Mode >> 3) & 1;
+ }
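+
+  // Editor's worked example (not in the original source): the "db" sub-mode
+  // with writeback (AMSubMode db == 4):
+  //   getAM4ModeImm(db, true) == 4 | (1 << 3) == 0xC
+  //   getAM4SubMode(0xC)      == db
+  //   getAM4WBFlag(0xC)       == true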
+
+ //===--------------------------------------------------------------------===//
+ // Addressing Mode #5
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for coprocessor instructions, such as FP load/stores.
+ //
+ // addrmode5 := reg +/- imm8*4
+ //
+ // The first operand is always a Reg. The third field encodes the operation
+ // in bit 8, the immediate in bits 0-7.
+ //
+ // This can also be used for FP load/store multiple ops. The third field encodes
+ // writeback mode in bit 8, the number of registers (or 2 times the number of
+  // registers for DPR ops) in bits 0-7. In addition, bits 9-11 encode one of the
+ // following two sub-modes:
+ //
+ // IA - Increment after
+ // DB - Decrement before
+
+ /// getAM5Opc - This function encodes the addrmode5 opc field.
+ static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
+ bool isSub = Opc == sub;
+ return ((int)isSub << 8) | Offset;
+ }
+ static inline unsigned char getAM5Offset(unsigned AM5Opc) {
+ return AM5Opc & 0xFF;
+ }
+ static inline AddrOpc getAM5Op(unsigned AM5Opc) {
+ return ((AM5Opc >> 8) & 1) ? sub : add;
+ }
+
+ /// getAM5Opc - This function encodes the addrmode5 opc field for FLDM and
+ /// FSTM instructions.
+ static inline unsigned getAM5Opc(AMSubMode SubMode, bool WB,
+ unsigned char Offset) {
+ assert((SubMode == ia || SubMode == db) &&
+ "Illegal addressing mode 5 sub-mode!");
+ return ((int)SubMode << 9) | ((int)WB << 8) | Offset;
+ }
+ static inline AMSubMode getAM5SubMode(unsigned AM5Opc) {
+ return (AMSubMode)((AM5Opc >> 9) & 0x7);
+ }
+ static inline bool getAM5WBFlag(unsigned AM5Opc) {
+ return ((AM5Opc >> 8) & 1);
+ }
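+
+  // Editor's worked example (not in the original source): an FLDM-style "ia"
+  // sub-mode with writeback and a register count of 4:
+  //   getAM5Opc(ia, true, 4) == (1 << 9) | (1 << 8) | 4 == 0x304
+  //   getAM5SubMode(0x304)   == ia
+  //   getAM5WBFlag(0x304)    == true
+  //   getAM5Offset(0x304)    == 4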
+
+} // end namespace ARM_AM
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/Target/ARM/ARMBuildAttrs.h b/lib/Target/ARM/ARMBuildAttrs.h
new file mode 100644
index 0000000..3b38375
--- /dev/null
+++ b/lib/Target/ARM/ARMBuildAttrs.h
@@ -0,0 +1,64 @@
+//===-------- ARMBuildAttrs.h - ARM Build Attributes ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains enumerations and support routines for ARM build attributes
+// as defined in ARM ABI addenda document (ABI release 2.07).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __TARGET_ARMBUILDATTRS_H__
+#define __TARGET_ARMBUILDATTRS_H__
+
+namespace ARMBuildAttrs {
+ enum {
+ File = 1,
+ Section = 2,
+ Symbol = 3,
+ CPU_raw_name = 4,
+ CPU_name = 5,
+ CPU_arch = 6,
+ CPU_arch_profile = 7,
+ ARM_ISA_use = 8,
+ THUMB_ISA_use = 9,
+ VFP_arch = 10,
+ WMMX_arch = 11,
+ Advanced_SIMD_arch = 12,
+ PCS_config = 13,
+ ABI_PCS_R9_use = 14,
+ ABI_PCS_RW_data = 15,
+ ABI_PCS_RO_data = 16,
+ ABI_PCS_GOT_use = 17,
+ ABI_PCS_wchar_t = 18,
+ ABI_FP_rounding = 19,
+ ABI_FP_denormal = 20,
+ ABI_FP_exceptions = 21,
+ ABI_FP_user_exceptions = 22,
+ ABI_FP_number_model = 23,
+ ABI_align8_needed = 24,
+ ABI_align8_preserved = 25,
+ ABI_enum_size = 26,
+ ABI_HardFP_use = 27,
+ ABI_VFP_args = 28,
+ ABI_WMMX_args = 29,
+ ABI_optimization_goals = 30,
+ ABI_FP_optimization_goals = 31,
+ compatibility = 32,
+ CPU_unaligned_access = 34,
+ VFP_HP_extension = 36,
+ ABI_FP_16bit_format = 38,
+ nodefaults = 64,
+ also_compatible_with = 65,
+ T2EE_use = 66,
+ conformance = 67,
+ Virtualization_use = 68,
+ MPextension_use = 70
+ };
+}
+
+#endif // __TARGET_ARMBUILDATTRS_H__
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
new file mode 100644
index 0000000..6cd786e
--- /dev/null
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -0,0 +1,87 @@
+//===- ARMCallingConv.td - Calling Conventions for ARM ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for ARM architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>:
+ CCIf<!strconcat("State.getTarget().getSubtarget<ARMSubtarget>().", F), A>;
+
+/// CCIfAlign - Match if the original alignment of the arg is Align.
+class CCIfAlign<string Align, CCAction A>:
+ CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_ARM_APCS : CallingConv<[
+
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // f64 is passed in pairs of GPRs, possibly split onto the stack
+ CCIfType<[f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+ CCIfType<[f64], CCAssignToStack<8, 4>>
+]>;
+
+def RetCC_ARM_APCS : CallingConv<[
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
+
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_ARM_AAPCS : CallingConv<[
+
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // i64/f64 is passed in even pairs of GPRs
+ // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register
+ // (and the same is true for f64 if VFP is not enabled)
+ CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>,
+ CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
+
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[i32], CCIf<"State.getNextStackOffset() == 0 &&"
+ "ArgFlags.getOrigAlign() != 8",
+ CCAssignToReg<[R0, R1, R2, R3]>>>,
+
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+ CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_ARM_AAPCS : CallingConv<[
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
+
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Calling Convention Dispatch
+//===----------------------------------------------------------------------===//
+
+def CC_ARM : CallingConv<[
+ CCIfSubtarget<"isAAPCS_ABI()", CCDelegateTo<CC_ARM_AAPCS>>,
+ CCDelegateTo<CC_ARM_APCS>
+]>;
+
+def RetCC_ARM : CallingConv<[
+ CCIfSubtarget<"isAAPCS_ABI()", CCDelegateTo<RetCC_ARM_AAPCS>>,
+ CCDelegateTo<RetCC_ARM_APCS>
+]>;
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
new file mode 100644
index 0000000..44fac12
--- /dev/null
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -0,0 +1,1411 @@
+//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the ARM machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMInstrInfo.h"
+#include "ARMRelocations.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+ class ARMCodeEmitter {
+ public:
+ /// getBinaryCodeForInstr - This function, generated by the
+ /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+ /// machine instructions.
+ unsigned getBinaryCodeForInstr(const MachineInstr &MI);
+ };
+
+ template<class CodeEmitter>
+ class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass,
+ public ARMCodeEmitter {
+ ARMJITInfo *JTI;
+ const ARMInstrInfo *II;
+ const TargetData *TD;
+ TargetMachine &TM;
+ CodeEmitter &MCE;
+ const std::vector<MachineConstantPoolEntry> *MCPEs;
+ const std::vector<MachineJumpTableEntry> *MJTEs;
+ bool IsPIC;
+
+ public:
+ static char ID;
+ explicit Emitter(TargetMachine &tm, CodeEmitter &mce)
+ : MachineFunctionPass(&ID), JTI(0), II(0), TD(0), TM(tm),
+ MCE(mce), MCPEs(0), MJTEs(0),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+ Emitter(TargetMachine &tm, CodeEmitter &mce,
+ const ARMInstrInfo &ii, const TargetData &td)
+ : MachineFunctionPass(&ID), JTI(0), II(&ii), TD(&td), TM(tm),
+ MCE(mce), MCPEs(0), MJTEs(0),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "ARM Machine Code Emitter";
+ }
+
+ void emitInstruction(const MachineInstr &MI);
+
+ private:
+
+ void emitWordLE(unsigned Binary);
+
+ void emitDWordLE(uint64_t Binary);
+
+ void emitConstPoolInstruction(const MachineInstr &MI);
+
+ void emitMOVi2piecesInstruction(const MachineInstr &MI);
+
+ void emitLEApcrelJTInstruction(const MachineInstr &MI);
+
+ void emitPseudoMoveInstruction(const MachineInstr &MI);
+
+ void addPCLabel(unsigned LabelID);
+
+ void emitPseudoInstruction(const MachineInstr &MI);
+
+ unsigned getMachineSoRegOpValue(const MachineInstr &MI,
+ const TargetInstrDesc &TID,
+ const MachineOperand &MO,
+ unsigned OpIdx);
+
+ unsigned getMachineSoImmOpValue(unsigned SoImm);
+
+ unsigned getAddrModeSBit(const MachineInstr &MI,
+ const TargetInstrDesc &TID) const;
+
+ void emitDataProcessingInstruction(const MachineInstr &MI,
+ unsigned ImplicitRd = 0,
+ unsigned ImplicitRn = 0);
+
+ void emitLoadStoreInstruction(const MachineInstr &MI,
+ unsigned ImplicitRd = 0,
+ unsigned ImplicitRn = 0);
+
+ void emitMiscLoadStoreInstruction(const MachineInstr &MI,
+ unsigned ImplicitRn = 0);
+
+ void emitLoadStoreMultipleInstruction(const MachineInstr &MI);
+
+ void emitMulFrmInstruction(const MachineInstr &MI);
+
+ void emitExtendInstruction(const MachineInstr &MI);
+
+ void emitMiscArithInstruction(const MachineInstr &MI);
+
+ void emitBranchInstruction(const MachineInstr &MI);
+
+ void emitInlineJumpTable(unsigned JTIndex);
+
+ void emitMiscBranchInstruction(const MachineInstr &MI);
+
+ void emitVFPArithInstruction(const MachineInstr &MI);
+
+ void emitVFPConversionInstruction(const MachineInstr &MI);
+
+ void emitVFPLoadStoreInstruction(const MachineInstr &MI);
+
+ void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI);
+
+ void emitMiscInstruction(const MachineInstr &MI);
+
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MachineInstr &MI,const MachineOperand &MO);
+ unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) {
+ return getMachineOpValue(MI, MI.getOperand(OpIdx));
+ }
+
+ /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+ ///
+    unsigned getShiftOp(unsigned Imm) const;
+
+ /// Routines that handle operands which add machine relocations which are
+ /// fixed up by the relocation stage.
+ void emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+ bool NeedStub, intptr_t ACPV = 0);
+ void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+ void emitConstPoolAddress(unsigned CPI, unsigned Reloc);
+ void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc);
+ void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc,
+ intptr_t JTBase = 0);
+ };
+ template <class CodeEmitter>
+ char Emitter<CodeEmitter>::ID = 0;
+}
+
+/// createARMCodeEmitterPass - Return a pass that emits the collected ARM code
+/// to the specified MCE object.
+
+namespace llvm {
+
+FunctionPass *createARMCodeEmitterPass(ARMTargetMachine &TM,
+ MachineCodeEmitter &MCE) {
+ return new Emitter<MachineCodeEmitter>(TM, MCE);
+}
+FunctionPass *createARMJITCodeEmitterPass(ARMTargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+} // end namespace llvm
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() == Reloc::Default ||
+          MF.getTarget().getRelocationModel() == Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+ II = ((ARMTargetMachine&)MF.getTarget()).getInstrInfo();
+ TD = ((ARMTargetMachine&)MF.getTarget()).getTargetData();
+ JTI = ((ARMTargetMachine&)MF.getTarget()).getJITInfo();
+ MCPEs = &MF.getConstantPool()->getConstants();
+ MJTEs = &MF.getJumpTableInfo()->getJumpTables();
+ IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+ JTI->Initialize(MF, IsPIC);
+
+ do {
+ DOUT << "JITTing function '" << MF.getFunction()->getName() << "'\n";
+ MCE.startFunction(MF);
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ MCE.StartMachineBasicBlock(MBB);
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I)
+ emitInstruction(*I);
+ }
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+///
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getShiftOp(unsigned Imm) const {
+ switch (ARM_AM::getAM2ShiftOpc(Imm)) {
+ default: assert(0 && "Unknown shift opc!");
+ case ARM_AM::asr: return 2;
+ case ARM_AM::lsl: return 0;
+ case ARM_AM::lsr: return 1;
+ case ARM_AM::ror:
+ case ARM_AM::rrx: return 3;
+ }
+ return 0;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO) {
+ if (MO.isReg())
+ return ARMRegisterInfo::getRegisterNumbering(MO.getReg());
+ else if (MO.isImm())
+ return static_cast<unsigned>(MO.getImm());
+ else if (MO.isGlobal())
+ emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true);
+ else if (MO.isSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch);
+ else if (MO.isCPI()) {
+ const TargetInstrDesc &TID = MI.getDesc();
+ // For VFP load, the immediate offset is multiplied by 4.
+ unsigned Reloc = ((TID.TSFlags & ARMII::FormMask) == ARMII::VFPLdStFrm)
+ ? ARM::reloc_arm_vfp_cp_entry : ARM::reloc_arm_cp_entry;
+ emitConstPoolAddress(MO.getIndex(), Reloc);
+ } else if (MO.isJTI())
+ emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative);
+ else if (MO.isMBB())
+ emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch);
+ else {
+ cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+ abort();
+ }
+ return 0;
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+ bool NeedStub, intptr_t ACPV) {
+ MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ GV, ACPV, NeedStub));
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+ unsigned Reloc) {
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES));
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI,
+ unsigned Reloc) {
+ // Tell JIT emitter we'll resolve the address.
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, 0, true));
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTIndex,
+ unsigned Reloc) {
+ MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, JTIndex, 0, true));
+}
+
+/// emitMachineBasicBlock - Emit the specified address basic block.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMachineBasicBlock(MachineBasicBlock *BB,
+ unsigned Reloc, intptr_t JTBase) {
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ Reloc, BB, JTBase));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitWordLE(unsigned Binary) {
+#ifndef NDEBUG
+ DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
+ << Binary << std::dec << "\n";
+#endif
+ MCE.emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDWordLE(uint64_t Binary) {
+#ifndef NDEBUG
+ DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
+ << (unsigned)Binary << std::dec << "\n";
+ DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
+ << (unsigned)(Binary >> 32) << std::dec << "\n";
+#endif
+ MCE.emitDWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI) {
+ DOUT << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI;
+
+ NumEmitted++; // Keep track of the # of mi's emitted
+ switch (MI.getDesc().TSFlags & ARMII::FormMask) {
+ default: {
+ assert(0 && "Unhandled instruction encoding format!");
+ break;
+ }
+ case ARMII::Pseudo:
+ emitPseudoInstruction(MI);
+ break;
+ case ARMII::DPFrm:
+ case ARMII::DPSoRegFrm:
+ emitDataProcessingInstruction(MI);
+ break;
+ case ARMII::LdFrm:
+ case ARMII::StFrm:
+ emitLoadStoreInstruction(MI);
+ break;
+ case ARMII::LdMiscFrm:
+ case ARMII::StMiscFrm:
+ emitMiscLoadStoreInstruction(MI);
+ break;
+ case ARMII::LdStMulFrm:
+ emitLoadStoreMultipleInstruction(MI);
+ break;
+ case ARMII::MulFrm:
+ emitMulFrmInstruction(MI);
+ break;
+ case ARMII::ExtFrm:
+ emitExtendInstruction(MI);
+ break;
+ case ARMII::ArithMiscFrm:
+ emitMiscArithInstruction(MI);
+ break;
+ case ARMII::BrFrm:
+ emitBranchInstruction(MI);
+ break;
+ case ARMII::BrMiscFrm:
+ emitMiscBranchInstruction(MI);
+ break;
+ // VFP instructions.
+ case ARMII::VFPUnaryFrm:
+ case ARMII::VFPBinaryFrm:
+ emitVFPArithInstruction(MI);
+ break;
+ case ARMII::VFPConv1Frm:
+ case ARMII::VFPConv2Frm:
+ case ARMII::VFPConv3Frm:
+ case ARMII::VFPConv4Frm:
+ case ARMII::VFPConv5Frm:
+ emitVFPConversionInstruction(MI);
+ break;
+ case ARMII::VFPLdStFrm:
+ emitVFPLoadStoreInstruction(MI);
+ break;
+ case ARMII::VFPLdStMulFrm:
+ emitVFPLoadStoreMultipleInstruction(MI);
+ break;
+ case ARMII::VFPMiscFrm:
+ emitMiscInstruction(MI);
+ break;
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolInstruction(const MachineInstr &MI) {
+ unsigned CPI = MI.getOperand(0).getImm(); // CP instruction index.
+ unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index.
+ const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex];
+
+ // Remember the CONSTPOOL_ENTRY address for later relocation.
+ JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue());
+
+ // Emit constpool island entry. In most cases, the actual values will be
+ // resolved and relocated after code emission.
+ if (MCPE.isMachineConstantPoolEntry()) {
+ ARMConstantPoolValue *ACPV =
+ static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
+
+ DOUT << " ** ARM constant pool #" << CPI << " @ "
+ << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n';
+
+ GlobalValue *GV = ACPV->getGV();
+ if (GV) {
+ assert(!ACPV->isStub() && "Don't know how to deal this yet!");
+ if (ACPV->isNonLazyPointer())
+ MCE.addRelocation(MachineRelocation::getIndirectSymbol(
+ MCE.getCurrentPCOffset(), ARM::reloc_arm_machine_cp_entry, GV,
+ (intptr_t)ACPV, false));
+ else
+ emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
+ ACPV->isStub() || isa<Function>(GV), (intptr_t)ACPV);
+ } else {
+ assert(!ACPV->isNonLazyPointer() && "Don't know how to deal this yet!");
+ emitExternalSymbolAddress(ACPV->getSymbol(), ARM::reloc_arm_absolute);
+ }
+ emitWordLE(0);
+ } else {
+ Constant *CV = MCPE.Val.ConstVal;
+
+#ifndef NDEBUG
+ DOUT << " ** Constant pool #" << CPI << " @ "
+ << (void*)MCE.getCurrentPCValue() << " ";
+ if (const Function *F = dyn_cast<Function>(CV))
+ DOUT << F->getName();
+ else
+ DOUT << *CV;
+ DOUT << '\n';
+#endif
+
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+ emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV));
+ emitWordLE(0);
+ } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ uint32_t Val = *(uint32_t*)CI->getValue().getRawData();
+ emitWordLE(Val);
+ } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+ if (CFP->getType() == Type::FloatTy)
+ emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+ else if (CFP->getType() == Type::DoubleTy)
+ emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+ else {
+ assert(0 && "Unable to handle this constantpool entry!");
+ abort();
+ }
+ } else {
+ assert(0 && "Unable to handle this constantpool entry!");
+ abort();
+ }
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) {
+ const MachineOperand &MO0 = MI.getOperand(0);
+ const MachineOperand &MO1 = MI.getOperand(1);
+ assert(MO1.isImm() && "Not a valid so_imm value!");
+ unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm());
+ unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm());
+
+ // Emit the 'mov' instruction.
+ unsigned Binary = 0xd << 21; // mov: Insts{24-21} = 0b1101
+
+ // Set the conditional execution predicate.
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Encode Rd.
+ Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+ // Encode so_imm.
+ // Set bit I(25) to identify this is the immediate form of <shifter_op>
+ Binary |= 1 << ARMII::I_BitShift;
+ Binary |= getMachineSoImmOpValue(ARM_AM::getSOImmVal(V1));
+ emitWordLE(Binary);
+
+ // Now the 'orr' instruction.
+ Binary = 0xc << 21; // orr: Insts{24-21} = 0b1100
+
+ // Set the conditional execution predicate.
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Encode Rd.
+ Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+ // Encode Rn.
+ Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRnShift;
+
+ // Encode so_imm.
+ // Set bit I(25) to identify this is the immediate form of <shifter_op>
+ Binary |= 1 << ARMII::I_BitShift;
+ Binary |= getMachineSoImmOpValue(ARM_AM::getSOImmVal(V2));
+ emitWordLE(Binary);
+}
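+
+// Editor's worked example (not in the original source): for the two-part
+// immediate 0x00FF00FF (see isSOImmTwoPartVal), the pair emitted above is
+//   mov rd, #0x000000FF
+//   orr rd, rd, #0x00FF0000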
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitLEApcrelJTInstruction(const MachineInstr &MI) {
+ // It's basically add r, pc, (LJTI - $+8)
+
+ const TargetInstrDesc &TID = MI.getDesc();
+
+ // Emit the 'add' instruction.
+ unsigned Binary = 0x4 << 21; // add: Insts{24-31} = 0b0100
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Encode S bit if MI modifies CPSR.
+ Binary |= getAddrModeSBit(MI, TID);
+
+ // Encode Rd.
+ Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+ // Encode Rn which is PC.
+ Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
+
+ // Encode the displacement.
+ // Set bit I(25) to identify this is the immediate form of <shifter_op>.
+ Binary |= 1 << ARMII::I_BitShift;
+ emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base);
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPseudoMoveInstruction(const MachineInstr &MI) {
+ unsigned Opcode = MI.getDesc().Opcode;
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Encode S bit if MI modifies CPSR.
+ if (Opcode == ARM::MOVsrl_flag || Opcode == ARM::MOVsra_flag)
+ Binary |= 1 << ARMII::S_BitShift;
+
+ // Encode register def if there is one.
+ Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+ // Encode the shift operation.
+ switch (Opcode) {
+ default: break;
+ case ARM::MOVrx:
+ // rrx
+ Binary |= 0x6 << 4;
+ break;
+ case ARM::MOVsrl_flag:
+ // lsr #1
+ Binary |= (0x2 << 4) | (1 << 7);
+ break;
+ case ARM::MOVsra_flag:
+ // asr #1
+ Binary |= (0x4 << 4) | (1 << 7);
+ break;
+ }
+
+ // Encode register Rm.
+ Binary |= getMachineOpValue(MI, 1);
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::addPCLabel(unsigned LabelID) {
+ DOUT << " ** LPC" << LabelID << " @ "
+ << (void*)MCE.getCurrentPCValue() << '\n';
+ JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue());
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPseudoInstruction(const MachineInstr &MI) {
+ unsigned Opcode = MI.getDesc().Opcode;
+ switch (Opcode) {
+ default:
+ abort(); // FIXME:
+ case TargetInstrInfo::INLINEASM: {
+ // We allow inline assembler nodes with empty bodies - they can
+ // implicitly define registers, which is ok for JIT.
+ if (MI.getOperand(0).getSymbolName()[0]) {
+ assert(0 && "JIT does not support inline asm!\n");
+ abort();
+ }
+ break;
+ }
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ MCE.emitLabel(MI.getOperand(0).getImm());
+ break;
+ case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::DECLARE:
+ case ARM::DWARF_LOC:
+ // Do nothing.
+ break;
+ case ARM::CONSTPOOL_ENTRY:
+ emitConstPoolInstruction(MI);
+ break;
+ case ARM::PICADD: {
+    // Remember the address of the PC label for relocation later.
+ addPCLabel(MI.getOperand(2).getImm());
+    // PICADD is just an add instruction that implicitly reads pc.
+ emitDataProcessingInstruction(MI, 0, ARM::PC);
+ break;
+ }
+ case ARM::PICLDR:
+ case ARM::PICLDRB:
+ case ARM::PICSTR:
+ case ARM::PICSTRB: {
+    // Remember the address of the PC label for relocation later.
+ addPCLabel(MI.getOperand(2).getImm());
+ // These are just load / store instructions that implicitly read pc.
+ emitLoadStoreInstruction(MI, 0, ARM::PC);
+ break;
+ }
+ case ARM::PICLDRH:
+ case ARM::PICLDRSH:
+ case ARM::PICLDRSB:
+ case ARM::PICSTRH: {
+    // Remember the address of the PC label for relocation later.
+ addPCLabel(MI.getOperand(2).getImm());
+ // These are just load / store instructions that implicitly read pc.
+ emitMiscLoadStoreInstruction(MI, ARM::PC);
+ break;
+ }
+ case ARM::MOVi2pieces:
+ // Two instructions to materialize a constant.
+ emitMOVi2piecesInstruction(MI);
+ break;
+ case ARM::LEApcrelJT:
+ // Materialize jumptable address.
+ emitLEApcrelJTInstruction(MI);
+ break;
+ case ARM::MOVrx:
+ case ARM::MOVsrl_flag:
+ case ARM::MOVsra_flag:
+ emitPseudoMoveInstruction(MI);
+ break;
+ }
+}
+
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue(
+ const MachineInstr &MI,
+ const TargetInstrDesc &TID,
+ const MachineOperand &MO,
+ unsigned OpIdx) {
+ unsigned Binary = getMachineOpValue(MI, MO);
+
+ const MachineOperand &MO1 = MI.getOperand(OpIdx + 1);
+ const MachineOperand &MO2 = MI.getOperand(OpIdx + 2);
+ ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+ // Encode the shift opcode.
+ unsigned SBits = 0;
+ unsigned Rs = MO1.getReg();
+ if (Rs) {
+ // Set shift operand (bit[7:4]).
+ // LSL - 0001
+ // LSR - 0011
+ // ASR - 0101
+ // ROR - 0111
+ // RRX - 0110 and bit[11:8] clear.
+ switch (SOpc) {
+ default: assert(0 && "Unknown shift opc!");
+ case ARM_AM::lsl: SBits = 0x1; break;
+ case ARM_AM::lsr: SBits = 0x3; break;
+ case ARM_AM::asr: SBits = 0x5; break;
+ case ARM_AM::ror: SBits = 0x7; break;
+ case ARM_AM::rrx: SBits = 0x6; break;
+ }
+ } else {
+ // Set shift operand (bit[6:4]).
+ // LSL - 000
+ // LSR - 010
+ // ASR - 100
+ // ROR - 110
+ switch (SOpc) {
+ default: assert(0 && "Unknown shift opc!");
+ case ARM_AM::lsl: SBits = 0x0; break;
+ case ARM_AM::lsr: SBits = 0x2; break;
+ case ARM_AM::asr: SBits = 0x4; break;
+ case ARM_AM::ror: SBits = 0x6; break;
+ }
+ }
+ Binary |= SBits << 4;
+ if (SOpc == ARM_AM::rrx)
+ return Binary;
+
+ // Encode the shift operation Rs or shift_imm (except rrx).
+ if (Rs) {
+ // Encode Rs bit[11:8].
+ assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+ return Binary |
+ (ARMRegisterInfo::getRegisterNumbering(Rs) << ARMII::RegRsShift);
+ }
+
+ // Encode shift_imm bit[11:7].
+ return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
+}
+
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getMachineSoImmOpValue(unsigned SoImm) {
+ // Encode rotate_imm.
+ unsigned Binary = (ARM_AM::getSOImmValRot(SoImm) >> 1)
+ << ARMII::SoRotImmShift;
+
+ // Encode immed_8.
+ Binary |= ARM_AM::getSOImmValImm(SoImm);
+ return Binary;
+}
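+
+// Worked example (editorial): for the value 0x00FF0000, immed_8 = 0xFF
+// rotated right by 16, so rotate_imm = 16/2 = 8 and the routine above packs
+// bits[11:0] as 0x8FF (rotate_imm in [11:8], immed_8 in [7:0]).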
+
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getAddrModeSBit(const MachineInstr &MI,
+ const TargetInstrDesc &TID) const {
+ for (unsigned i = MI.getNumOperands(), e = TID.getNumOperands(); i != e; --i){
+ const MachineOperand &MO = MI.getOperand(i-1);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)
+ return 1 << ARMII::S_BitShift;
+ }
+ return 0;
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDataProcessingInstruction(
+ const MachineInstr &MI,
+ unsigned ImplicitRd,
+ unsigned ImplicitRn) {
+ const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Encode S bit if MI modifies CPSR.
+ Binary |= getAddrModeSBit(MI, TID);
+
+ // Encode register def if there is one.
+ unsigned NumDefs = TID.getNumDefs();
+ unsigned OpIdx = 0;
+ if (NumDefs)
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+ else if (ImplicitRd)
+ // Special handling for implicit use (e.g. PC).
+ Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd)
+ << ARMII::RegRdShift);
+
+  // If this is a two-address operand, skip it, e.g. MOVCCr operand 1.
+ if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)
+ ++OpIdx;
+
+ // Encode first non-shifter register operand if there is one.
+ bool isUnary = TID.TSFlags & ARMII::UnaryDP;
+ if (!isUnary) {
+ if (ImplicitRn)
+ // Special handling for implicit use (e.g. PC).
+ Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
+ << ARMII::RegRnShift);
+ else {
+ Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift;
+ ++OpIdx;
+ }
+ }
+
+ // Encode shifter operand.
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if ((TID.TSFlags & ARMII::FormMask) == ARMII::DPSoRegFrm) {
+ // Encode SoReg.
+ emitWordLE(Binary | getMachineSoRegOpValue(MI, TID, MO, OpIdx));
+ return;
+ }
+
+ if (MO.isReg()) {
+ // Encode register Rm.
+ emitWordLE(Binary | ARMRegisterInfo::getRegisterNumbering(MO.getReg()));
+ return;
+ }
+
+ // Encode so_imm.
+ // Set bit I(25) to identify this is the immediate form of <shifter_op>.
+ Binary |= 1 << ARMII::I_BitShift;
+ Binary |= getMachineSoImmOpValue(MO.getImm());
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitLoadStoreInstruction(
+ const MachineInstr &MI,
+ unsigned ImplicitRd,
+ unsigned ImplicitRn) {
+ const TargetInstrDesc &TID = MI.getDesc();
+ unsigned Form = TID.TSFlags & ARMII::FormMask;
+ bool IsPrePost = (TID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ unsigned OpIdx = 0;
+
+ // Operand 0 of a pre- and post-indexed store is the address base
+ // writeback. Skip it.
+ bool Skipped = false;
+ if (IsPrePost && Form == ARMII::StFrm) {
+ ++OpIdx;
+ Skipped = true;
+ }
+
+ // Set first operand
+ if (ImplicitRd)
+ // Special handling for implicit use (e.g. PC).
+ Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd)
+ << ARMII::RegRdShift);
+ else
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+ // Set second operand
+ if (ImplicitRn)
+ // Special handling for implicit use (e.g. PC).
+ Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
+ << ARMII::RegRnShift);
+ else
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // If this is a two-address operand, skip it, e.g. LDR_PRE.
+ if (!Skipped && TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)
+ ++OpIdx;
+
+ const MachineOperand &MO2 = MI.getOperand(OpIdx);
+ unsigned AM2Opc = (ImplicitRn == ARM::PC)
+ ? 0 : MI.getOperand(OpIdx+1).getImm();
+
+ // Set bit U(23) according to sign of immed value (positive or negative).
+ Binary |= ((ARM_AM::getAM2Op(AM2Opc) == ARM_AM::add ? 1 : 0) <<
+ ARMII::U_BitShift);
+ if (!MO2.getReg()) { // is immediate
+ if (ARM_AM::getAM2Offset(AM2Opc))
+ // Set the value of offset_12 field
+ Binary |= ARM_AM::getAM2Offset(AM2Opc);
+ emitWordLE(Binary);
+ return;
+ }
+
+  // Set bit I(25), because this is not an immediate encoding.
+ Binary |= 1 << ARMII::I_BitShift;
+ assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
+ // Set bit[3:0] to the corresponding Rm register
+ Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg());
+
+  // If this instruction uses a scaled register offset/index, set the
+  // shift_immed (bit[11:7]) and shift (bit[6:5]) fields.
+ if (unsigned ShImm = ARM_AM::getAM2Offset(AM2Opc)) {
+ Binary |= getShiftOp(AM2Opc) << ARMII::ShiftImmShift; // shift
+ Binary |= ShImm << ARMII::ShiftShift; // shift_immed
+ }
+
+ emitWordLE(Binary);
+}
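+
+// Worked example (editorial): "ldr r0, [r1, #4]" has a positive offset, so
+// U(23) = 1 and offset_12 = 4; "ldr r0, [r1, #-4]" clears U(23) but keeps
+// offset_12 = 4, since the sign lives in U rather than in the offset field.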
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMiscLoadStoreInstruction(const MachineInstr &MI,
+ unsigned ImplicitRn) {
+ const TargetInstrDesc &TID = MI.getDesc();
+ unsigned Form = TID.TSFlags & ARMII::FormMask;
+ bool IsPrePost = (TID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ unsigned OpIdx = 0;
+
+ // Operand 0 of a pre- and post-indexed store is the address base
+ // writeback. Skip it.
+ bool Skipped = false;
+ if (IsPrePost && Form == ARMII::StMiscFrm) {
+ ++OpIdx;
+ Skipped = true;
+ }
+
+ // Set first operand
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+ // Set second operand
+ if (ImplicitRn)
+ // Special handling for implicit use (e.g. PC).
+ Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
+ << ARMII::RegRnShift);
+ else
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // If this is a two-address operand, skip it, e.g. LDRH_POST.
+ if (!Skipped && TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)
+ ++OpIdx;
+
+ const MachineOperand &MO2 = MI.getOperand(OpIdx);
+ unsigned AM3Opc = (ImplicitRn == ARM::PC)
+ ? 0 : MI.getOperand(OpIdx+1).getImm();
+
+ // Set bit U(23) according to sign of immed value (positive or negative)
+ Binary |= ((ARM_AM::getAM3Op(AM3Opc) == ARM_AM::add ? 1 : 0) <<
+ ARMII::U_BitShift);
+
+  // If this instruction uses the register offset/index encoding, set bit[3:0]
+ // to the corresponding Rm register.
+ if (MO2.getReg()) {
+ Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg());
+ emitWordLE(Binary);
+ return;
+ }
+
+  // This instruction uses the immediate offset/index encoding; set bit 22 to 1.
+ Binary |= 1 << ARMII::AM3_I_BitShift;
+ if (unsigned ImmOffs = ARM_AM::getAM3Offset(AM3Opc)) {
+ // Set operands
+ Binary |= (ImmOffs >> 4) << ARMII::ImmHiShift; // immedH
+ Binary |= (ImmOffs & 0xF); // immedL
+ }
+
+ emitWordLE(Binary);
+}
+
+static unsigned getAddrModeUPBits(unsigned Mode) {
+ unsigned Binary = 0;
+
+ // Set addressing mode by modifying bits U(23) and P(24)
+ // IA - Increment after - bit U = 1 and bit P = 0
+ // IB - Increment before - bit U = 1 and bit P = 1
+ // DA - Decrement after - bit U = 0 and bit P = 0
+ // DB - Decrement before - bit U = 0 and bit P = 1
+ switch (Mode) {
+ default: assert(0 && "Unknown addressing sub-mode!");
+ case ARM_AM::da: break;
+ case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break;
+ case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break;
+ case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break;
+ }
+
+ return Binary;
+}
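+
+// Note (editorial): the 0x3 << U_BitShift in the 'ib' case relies on P(24)
+// sitting directly above U(23), setting both bits at once; e.g. an LDMIA
+// encodes as U = 1, P = 0 while an STMDB encodes as U = 0, P = 1.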
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitLoadStoreMultipleInstruction(
+ const MachineInstr &MI) {
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Set base address operand
+ Binary |= getMachineOpValue(MI, 0) << ARMII::RegRnShift;
+
+ // Set addressing mode by modifying bits U(23) and P(24)
+ const MachineOperand &MO = MI.getOperand(1);
+ Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(MO.getImm()));
+
+ // Set bit W(21)
+ if (ARM_AM::getAM4WBFlag(MO.getImm()))
+ Binary |= 0x1 << ARMII::W_BitShift;
+
+ // Set registers
+ for (unsigned i = 4, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isImplicit())
+ break;
+ unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(MO.getReg());
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ RegNum < 16);
+ Binary |= 0x1 << RegNum;
+ }
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMulFrmInstruction(const MachineInstr &MI) {
+ const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Encode S bit if MI modifies CPSR.
+ Binary |= getAddrModeSBit(MI, TID);
+
+ // 32x32->64bit operations have two destination registers. The number
+ // of register definitions will tell us if that's what we're dealing with.
+ unsigned OpIdx = 0;
+ if (TID.getNumDefs() == 2)
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdLoShift;
+
+ // Encode Rd
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdHiShift;
+
+ // Encode Rm
+ Binary |= getMachineOpValue(MI, OpIdx++);
+
+ // Encode Rs
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRsShift;
+
+  // Many multiply instructions (e.g. MLA) have a third src operand. Encode
+  // it as Rn (for multiply, that's at the same offset as RdLo).
+ if (TID.getNumOperands() > OpIdx &&
+ !TID.OpInfo[OpIdx].isPredicate() &&
+ !TID.OpInfo[OpIdx].isOptionalDef())
+ Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRdLoShift;
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExtendInstruction(const MachineInstr &MI) {
+ const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ unsigned OpIdx = 0;
+
+ // Encode Rd
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+ const MachineOperand &MO1 = MI.getOperand(OpIdx++);
+ const MachineOperand &MO2 = MI.getOperand(OpIdx);
+ if (MO2.isReg()) {
+ // Two register operand form.
+ // Encode Rn.
+ Binary |= getMachineOpValue(MI, MO1) << ARMII::RegRnShift;
+
+ // Encode Rm.
+ Binary |= getMachineOpValue(MI, MO2);
+ ++OpIdx;
+ } else {
+ Binary |= getMachineOpValue(MI, MO1);
+ }
+
+ // Encode rot imm (0, 8, 16, or 24) if it has a rotate immediate operand.
+ if (MI.getOperand(OpIdx).isImm() &&
+ !TID.OpInfo[OpIdx].isPredicate() &&
+ !TID.OpInfo[OpIdx].isOptionalDef())
+ Binary |= (getMachineOpValue(MI, OpIdx) / 8) << ARMII::ExtRotImmShift;
+
+ emitWordLE(Binary);
+}
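+
+// Worked example (editorial): for "sxtb r0, r1, ror #16" the rotate operand
+// is 16, and the code above encodes 16/8 = 2 in the two-bit rotate field;
+// only rotations of 0, 8, 16, and 24 bits are representable.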
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMiscArithInstruction(const MachineInstr &MI) {
+ const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ unsigned OpIdx = 0;
+
+ // Encode Rd
+ Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+ const MachineOperand &MO = MI.getOperand(OpIdx++);
+ if (OpIdx == TID.getNumOperands() ||
+ TID.OpInfo[OpIdx].isPredicate() ||
+ TID.OpInfo[OpIdx].isOptionalDef()) {
+ // Encode Rm and it's done.
+ Binary |= getMachineOpValue(MI, MO);
+ emitWordLE(Binary);
+ return;
+ }
+
+ // Encode Rn.
+ Binary |= getMachineOpValue(MI, MO) << ARMII::RegRnShift;
+
+ // Encode Rm.
+ Binary |= getMachineOpValue(MI, OpIdx++);
+
+ // Encode shift_imm.
+ unsigned ShiftAmt = MI.getOperand(OpIdx).getImm();
+ assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
+ Binary |= ShiftAmt << ARMII::ShiftShift;
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitBranchInstruction(const MachineInstr &MI) {
+ const TargetInstrDesc &TID = MI.getDesc();
+
+ if (TID.Opcode == ARM::TPsoft)
+ abort(); // FIXME
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Set signed_immed_24 field
+ Binary |= getMachineOpValue(MI, 0);
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInlineJumpTable(unsigned JTIndex) {
+ // Remember the base address of the inline jump table.
+ uintptr_t JTBase = MCE.getCurrentPCValue();
+ JTI->addJumpTableBaseAddr(JTIndex, JTBase);
+ DOUT << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase << '\n';
+
+ // Now emit the jump table entries.
+ const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs;
+ for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
+ if (IsPIC)
+ // DestBB address - JT base.
+ emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_pic_jt, JTBase);
+ else
+ // Absolute DestBB address.
+ emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_absolute);
+ emitWordLE(0);
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMiscBranchInstruction(const MachineInstr &MI) {
+ const TargetInstrDesc &TID = MI.getDesc();
+
+ // Handle jump tables.
+ if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd) {
+ // First emit a ldr pc, [] instruction.
+ emitDataProcessingInstruction(MI, ARM::PC);
+
+ // Then emit the inline jump table.
+ unsigned JTIndex = (TID.Opcode == ARM::BR_JTr)
+ ? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex();
+ emitInlineJumpTable(JTIndex);
+ return;
+ } else if (TID.Opcode == ARM::BR_JTm) {
+ // First emit a ldr pc, [] instruction.
+ emitLoadStoreInstruction(MI, ARM::PC);
+
+ // Then emit the inline jump table.
+ emitInlineJumpTable(MI.getOperand(3).getIndex());
+ return;
+ }
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ if (TID.Opcode == ARM::BX_RET)
+ // The return register is LR.
+ Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::LR);
+ else
+ // otherwise, set the return register
+ Binary |= getMachineOpValue(MI, 0);
+
+ emitWordLE(Binary);
+}
+
+static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) {
+ unsigned RegD = MI.getOperand(OpIdx).getReg();
+ unsigned Binary = 0;
+ bool isSPVFP = false;
+ RegD = ARMRegisterInfo::getRegisterNumbering(RegD, isSPVFP);
+ if (!isSPVFP)
+ Binary |= RegD << ARMII::RegRdShift;
+ else {
+ Binary |= ((RegD & 0x1E) >> 1) << ARMII::RegRdShift;
+ Binary |= (RegD & 0x01) << ARMII::D_BitShift;
+ }
+ return Binary;
+}
+
+static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) {
+ unsigned RegN = MI.getOperand(OpIdx).getReg();
+ unsigned Binary = 0;
+ bool isSPVFP = false;
+ RegN = ARMRegisterInfo::getRegisterNumbering(RegN, isSPVFP);
+ if (!isSPVFP)
+ Binary |= RegN << ARMII::RegRnShift;
+ else {
+ Binary |= ((RegN & 0x1E) >> 1) << ARMII::RegRnShift;
+ Binary |= (RegN & 0x01) << ARMII::N_BitShift;
+ }
+ return Binary;
+}
+
+static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) {
+ unsigned RegM = MI.getOperand(OpIdx).getReg();
+ unsigned Binary = 0;
+ bool isSPVFP = false;
+ RegM = ARMRegisterInfo::getRegisterNumbering(RegM, isSPVFP);
+ if (!isSPVFP)
+ Binary |= RegM;
+ else {
+ Binary |= ((RegM & 0x1E) >> 1);
+ Binary |= (RegM & 0x01) << ARMII::M_BitShift;
+ }
+ return Binary;
+}
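+
+// Worked example (editorial): a single-precision register such as S5 (number
+// 0b0101) is split by the helpers above into a four-bit field (5 >> 1 =
+// 0b0010) plus its low bit (1) in the D/N/M position; a double-precision D
+// register fills the four-bit field directly.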
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVFPArithInstruction(const MachineInstr &MI) {
+ const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ unsigned OpIdx = 0;
+  assert((Binary & (1 << ARMII::D_BitShift)) == 0 &&
+         (Binary & (1 << ARMII::N_BitShift)) == 0 &&
+         (Binary & (1 << ARMII::M_BitShift)) == 0 && "VFP encoding bug!");
+
+ // Encode Dd / Sd.
+ Binary |= encodeVFPRd(MI, OpIdx++);
+
+ // If this is a two-address operand, skip it, e.g. FMACD.
+ if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)
+ ++OpIdx;
+
+ // Encode Dn / Sn.
+ if ((TID.TSFlags & ARMII::FormMask) == ARMII::VFPBinaryFrm)
+ Binary |= encodeVFPRn(MI, OpIdx++);
+
+ if (OpIdx == TID.getNumOperands() ||
+ TID.OpInfo[OpIdx].isPredicate() ||
+ TID.OpInfo[OpIdx].isOptionalDef()) {
+ // FCMPEZD etc. has only one operand.
+ emitWordLE(Binary);
+ return;
+ }
+
+ // Encode Dm / Sm.
+ Binary |= encodeVFPRm(MI, OpIdx);
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVFPConversionInstruction(
+ const MachineInstr &MI) {
+ const TargetInstrDesc &TID = MI.getDesc();
+ unsigned Form = TID.TSFlags & ARMII::FormMask;
+
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ switch (Form) {
+ default: break;
+ case ARMII::VFPConv1Frm:
+ case ARMII::VFPConv2Frm:
+ case ARMII::VFPConv3Frm:
+ // Encode Dd / Sd.
+ Binary |= encodeVFPRd(MI, 0);
+ break;
+ case ARMII::VFPConv4Frm:
+ // Encode Dn / Sn.
+ Binary |= encodeVFPRn(MI, 0);
+ break;
+ case ARMII::VFPConv5Frm:
+ // Encode Dm / Sm.
+ Binary |= encodeVFPRm(MI, 0);
+ break;
+ }
+
+ switch (Form) {
+ default: break;
+ case ARMII::VFPConv1Frm:
+ // Encode Dm / Sm.
+ Binary |= encodeVFPRm(MI, 1);
+ break;
+ case ARMII::VFPConv2Frm:
+ case ARMII::VFPConv3Frm:
+ // Encode Dn / Sn.
+ Binary |= encodeVFPRn(MI, 1);
+ break;
+ case ARMII::VFPConv4Frm:
+ case ARMII::VFPConv5Frm:
+ // Encode Dd / Sd.
+ Binary |= encodeVFPRd(MI, 1);
+ break;
+ }
+
+ if (Form == ARMII::VFPConv5Frm)
+ // Encode Dn / Sn.
+ Binary |= encodeVFPRn(MI, 2);
+ else if (Form == ARMII::VFPConv3Frm)
+ // Encode Dm / Sm.
+ Binary |= encodeVFPRm(MI, 2);
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVFPLoadStoreInstruction(const MachineInstr &MI) {
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ unsigned OpIdx = 0;
+
+ // Encode Dd / Sd.
+ Binary |= encodeVFPRd(MI, OpIdx++);
+
+ // Encode address base.
+ const MachineOperand &Base = MI.getOperand(OpIdx++);
+ Binary |= getMachineOpValue(MI, Base) << ARMII::RegRnShift;
+
+ // If there is a non-zero immediate offset, encode it.
+ if (Base.isReg()) {
+ const MachineOperand &Offset = MI.getOperand(OpIdx);
+ if (unsigned ImmOffs = ARM_AM::getAM5Offset(Offset.getImm())) {
+ if (ARM_AM::getAM5Op(Offset.getImm()) == ARM_AM::add)
+ Binary |= 1 << ARMII::U_BitShift;
+ Binary |= ImmOffs;
+ emitWordLE(Binary);
+ return;
+ }
+ }
+
+ // If immediate offset is omitted, default to +0.
+ Binary |= 1 << ARMII::U_BitShift;
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVFPLoadStoreMultipleInstruction(
+ const MachineInstr &MI) {
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ // Set base address operand
+ Binary |= getMachineOpValue(MI, 0) << ARMII::RegRnShift;
+
+ // Set addressing mode by modifying bits U(23) and P(24)
+ const MachineOperand &MO = MI.getOperand(1);
+ Binary |= getAddrModeUPBits(ARM_AM::getAM5SubMode(MO.getImm()));
+
+ // Set bit W(21)
+ if (ARM_AM::getAM5WBFlag(MO.getImm()))
+ Binary |= 0x1 << ARMII::W_BitShift;
+
+ // First register is encoded in Dd.
+ Binary |= encodeVFPRd(MI, 4);
+
+  // The number of registers is encoded in the offset field.
+ unsigned NumRegs = 1;
+ for (unsigned i = 5, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isImplicit())
+ break;
+ ++NumRegs;
+ }
+ Binary |= NumRegs * 2;
+
+ emitWordLE(Binary);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMiscInstruction(const MachineInstr &MI) {
+  // Part of the binary is determined by TableGen.
+ unsigned Binary = getBinaryCodeForInstr(MI);
+
+ // Set the conditional execution predicate
+ Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+ emitWordLE(Binary);
+}
+
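+// Note (editorial): ARMGenCodeEmitter.inc is generated by TableGen from the
+// target's .td instruction definitions; it supplies the
+// getBinaryCodeForInstr() used throughout this file.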
+#include "ARMGenCodeEmitter.inc"
+
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
new file mode 100644
index 0000000..db723fe
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -0,0 +1,1285 @@
+//===-- ARMConstantIslandPass.cpp - ARM constant islands --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that splits the constant pool up into 'islands'
+// which are scattered throughout the function. This is required due to the
+// limited pc-relative displacements that ARM has.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-cp-islands"
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMInstrInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumCPEs, "Number of constpool entries");
+STATISTIC(NumSplit, "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed, "Number of cond branches fixed");
+STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+
+namespace {
+ /// ARMConstantIslands - Due to limited PC-relative displacements, ARM
+ /// requires constant pool entries to be scattered among the instructions
+ /// inside a function. To do this, it completely ignores the normal LLVM
+ /// constant pool; instead, it places constants wherever it feels like with
+ /// special instructions.
+ ///
+ /// The terminology used in this pass includes:
+ /// Islands - Clumps of constants placed in the function.
+ /// Water - Potential places where an island could be formed.
+ /// CPE - A constant pool entry that has been placed somewhere, which
+ /// tracks a list of users.
+ class VISIBILITY_HIDDEN ARMConstantIslands : public MachineFunctionPass {
+ /// BBSizes - The size of each MachineBasicBlock in bytes of code, indexed
+ /// by MBB Number. The two-byte pads required for Thumb alignment are
+ /// counted as part of the following block (i.e., the offset and size for
+ /// a padded block will both be ==2 mod 4).
+ std::vector<unsigned> BBSizes;
+
+ /// BBOffsets - the offset of each MBB in bytes, starting from 0.
+ /// The two-byte pads required for Thumb alignment are counted as part of
+ /// the following block.
+ std::vector<unsigned> BBOffsets;
+
+ /// WaterList - A sorted list of basic blocks where islands could be placed
+ /// (i.e. blocks that don't fall through to the following block, due
+ /// to a return, unreachable, or unconditional branch).
+ std::vector<MachineBasicBlock*> WaterList;
+
+ /// CPUser - One user of a constant pool, keeping the machine instruction
+ /// pointer, the constant pool being referenced, and the max displacement
+ /// allowed from the instruction to the CP.
+ struct CPUser {
+ MachineInstr *MI;
+ MachineInstr *CPEMI;
+ unsigned MaxDisp;
+ CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp)
+ : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp) {}
+ };
+
+ /// CPUsers - Keep track of all of the machine instructions that use various
+ /// constant pools and their max displacement.
+ std::vector<CPUser> CPUsers;
+
+ /// CPEntry - One per constant pool entry, keeping the machine instruction
+ /// pointer, the constpool index, and the number of CPUser's which
+ /// reference this entry.
+ struct CPEntry {
+ MachineInstr *CPEMI;
+ unsigned CPI;
+ unsigned RefCount;
+ CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+ : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+ };
+
+ /// CPEntries - Keep track of all of the constant pool entry machine
+ /// instructions. For each original constpool index (i.e. those that
+ /// existed upon entry to this pass), it keeps a vector of entries.
+ /// Original elements are cloned as we go along; the clones are
+ /// put in the vector of the original element, but have distinct CPIs.
+ std::vector<std::vector<CPEntry> > CPEntries;
+
+ /// ImmBranch - One per immediate branch, keeping the machine instruction
+ /// pointer, conditional or unconditional, the max displacement,
+ /// and (if isCond is true) the corresponding unconditional branch
+ /// opcode.
+ struct ImmBranch {
+ MachineInstr *MI;
+ unsigned MaxDisp : 31;
+ bool isCond : 1;
+ int UncondBr;
+ ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+ : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+ };
+
+ /// ImmBranches - Keep track of all the immediate branch instructions.
+ ///
+ std::vector<ImmBranch> ImmBranches;
+
+ /// PushPopMIs - Keep track of all the Thumb push / pop instructions.
+ ///
+ SmallVector<MachineInstr*, 4> PushPopMIs;
+
+ /// HasFarJump - True if any far jump instruction has been emitted during
+ /// the branch fix up pass.
+ bool HasFarJump;
+
+ const TargetInstrInfo *TII;
+ ARMFunctionInfo *AFI;
+ bool isThumb;
+ public:
+ static char ID;
+ ARMConstantIslands() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "ARM constant island placement and branch shortening pass";
+ }
+
+ private:
+ void DoInitialPlacement(MachineFunction &Fn,
+ std::vector<MachineInstr*> &CPEMIs);
+ CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+ void InitialFunctionScan(MachineFunction &Fn,
+ const std::vector<MachineInstr*> &CPEMIs);
+ MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI);
+ void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+ void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta);
+ bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI);
+ int LookForExistingCPEntry(CPUser& U, unsigned UserOffset);
+    bool LookForWater(CPUser &U, unsigned UserOffset,
+ MachineBasicBlock** NewMBB);
+ MachineBasicBlock* AcceptWater(MachineBasicBlock *WaterBB,
+ std::vector<MachineBasicBlock*>::iterator IP);
+ void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset,
+ MachineBasicBlock** NewMBB);
+ bool HandleConstantPoolUser(MachineFunction &Fn, unsigned CPUserIndex);
+ void RemoveDeadCPEMI(MachineInstr *CPEMI);
+ bool RemoveUnusedCPEntries();
+ bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
+ MachineInstr *CPEMI, unsigned Disp,
+ bool DoDump);
+ bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water,
+ CPUser &U);
+ bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset,
+ unsigned Disp, bool NegativeOK);
+ bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+ bool FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br);
+ bool FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br);
+ bool FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br);
+ bool UndoLRSpillRestore();
+
+ unsigned GetOffsetOf(MachineInstr *MI) const;
+ void dumpBBs();
+ void verify(MachineFunction &Fn);
+ };
+ char ARMConstantIslands::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void ARMConstantIslands::verify(MachineFunction &Fn) {
+ assert(BBOffsets.size() == BBSizes.size());
+ for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i)
+ assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]);
+ if (isThumb) {
+ for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+ if (!MBB->empty() &&
+ MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY)
+ assert((BBOffsets[MBB->getNumber()]%4 == 0 &&
+ BBSizes[MBB->getNumber()]%4 == 0) ||
+ (BBOffsets[MBB->getNumber()]%4 != 0 &&
+ BBSizes[MBB->getNumber()]%4 != 0));
+ }
+ }
+}
+
+/// print block size and offset information - debugging
+void ARMConstantIslands::dumpBBs() {
+ for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) {
+ DOUT << "block " << J << " offset " << BBOffsets[J] <<
+ " size " << BBSizes[J] << "\n";
+ }
+}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+ return new ARMConstantIslands();
+}
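+
+// A hedged sketch (editorial): the target wires this pass in after register
+// allocation and just before emission, roughly as below; the hook shown is
+// assumed, not quoted from this import:
+//
+//   bool ARMTargetMachine::addPreEmitPass(PassManagerBase &PM,
+//                                         CodeGenOpt::Level OptLevel) {
+//     PM.add(createARMConstantIslandPass());
+//     return true;
+//   }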
+
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
+ MachineConstantPool &MCP = *Fn.getConstantPool();
+
+ TII = Fn.getTarget().getInstrInfo();
+ AFI = Fn.getInfo<ARMFunctionInfo>();
+ isThumb = AFI->isThumbFunction();
+
+ HasFarJump = false;
+
+ // Renumber all of the machine basic blocks in the function, guaranteeing that
+ // the numbers agree with the position of the block in the function.
+ Fn.RenumberBlocks();
+
+ /// Thumb functions containing constant pools get 2-byte alignment.
+ /// This is so we can keep exact track of where the alignment padding goes.
+ /// Set default.
+ AFI->setAlign(isThumb ? 1U : 2U);
+
+ // Perform the initial placement of the constant pool entries. To start with,
+ // we put them all at the end of the function.
+ std::vector<MachineInstr*> CPEMIs;
+ if (!MCP.isEmpty()) {
+ DoInitialPlacement(Fn, CPEMIs);
+ if (isThumb)
+ AFI->setAlign(2U);
+ }
+
+ /// The next UID to take is the first unused one.
+ AFI->initConstPoolEntryUId(CPEMIs.size());
+
+ // Do the initial scan of the function, building up information about the
+ // sizes of each block, the location of all the water, and finding all of the
+ // constant pool users.
+ InitialFunctionScan(Fn, CPEMIs);
+ CPEMIs.clear();
+
+ /// Remove dead constant pool entries.
+ RemoveUnusedCPEntries();
+
+ // Iteratively place constant pool entries and fix up branches until there
+ // is no change.
+ bool MadeChange = false;
+ while (true) {
+ bool Change = false;
+ for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+ Change |= HandleConstantPoolUser(Fn, i);
+ DEBUG(dumpBBs());
+ for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+ Change |= FixUpImmediateBr(Fn, ImmBranches[i]);
+ DEBUG(dumpBBs());
+ if (!Change)
+ break;
+ MadeChange = true;
+ }
+
+ // After a while, this might be made debug-only, but it is not expensive.
+ verify(Fn);
+
+  // If LR has been force-spilled and no far jump (i.e. BL) has been issued,
+  // undo the spill / restore of LR if possible.
+ if (!HasFarJump && AFI->isLRSpilledForFarJump() && isThumb)
+ MadeChange |= UndoLRSpillRestore();
+
+ BBSizes.clear();
+ BBOffsets.clear();
+ WaterList.clear();
+ CPUsers.clear();
+ CPEntries.clear();
+ ImmBranches.clear();
+ PushPopMIs.clear();
+
+ return MadeChange;
+}
+
+/// DoInitialPlacement - Perform the initial placement of the constant pool
+/// entries. To start with, we put them all at the end of the function.
+void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn,
+ std::vector<MachineInstr*> &CPEMIs) {
+ // Create the basic block to hold the CPE's.
+ MachineBasicBlock *BB = Fn.CreateMachineBasicBlock();
+ Fn.push_back(BB);
+
+  // Add all of the constants from the constant pool to the end block, using
+  // an identity mapping of CPI's to CPE's.
+ const std::vector<MachineConstantPoolEntry> &CPs =
+ Fn.getConstantPool()->getConstants();
+
+ const TargetData &TD = *Fn.getTarget().getTargetData();
+ for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+ unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+ // Verify that all constant pool entries are a multiple of 4 bytes. If not,
+ // we would have to pad them out or something so that instructions stay
+ // aligned.
+ assert((Size & 3) == 0 && "CP Entry not multiple of 4 bytes!");
+ MachineInstr *CPEMI =
+ BuildMI(BB, DebugLoc::getUnknownLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
+ .addImm(i).addConstantPoolIndex(i).addImm(Size);
+ CPEMIs.push_back(CPEMI);
+
+ // Add a new CPEntry, but no corresponding CPUser yet.
+ std::vector<CPEntry> CPEs;
+ CPEs.push_back(CPEntry(CPEMI, i));
+ CPEntries.push_back(CPEs);
+ NumCPEs++;
+ DOUT << "Moved CPI#" << i << " to end of function as #" << i << "\n";
+ }
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fall
+/// through into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+ // Get the next machine basic block in the function.
+ MachineFunction::iterator MBBI = MBB;
+ if (next(MBBI) == MBB->getParent()->end()) // Can't fall off end of function.
+ return false;
+
+ MachineBasicBlock *NextBB = next(MBBI);
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+ E = MBB->succ_end(); I != E; ++I)
+ if (*I == NextBB)
+ return true;
+
+ return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+ARMConstantIslands::CPEntry
+*ARMConstantIslands::findConstPoolEntry(unsigned CPI,
+ const MachineInstr *CPEMI) {
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ // Number of entries per constpool index should be small, just do a
+ // linear search.
+ for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ if (CPEs[i].CPEMI == CPEMI)
+ return &CPEs[i];
+ }
+ return NULL;
+}
+
+/// InitialFunctionScan - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
+ const std::vector<MachineInstr*> &CPEMIs) {
+ unsigned Offset = 0;
+ for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock &MBB = *MBBI;
+
+ // If this block doesn't fall through into the next MBB, then this is
+    // 'water' where a constant pool island could be placed.
+ if (!BBHasFallthrough(&MBB))
+ WaterList.push_back(&MBB);
+
+ unsigned MBBSize = 0;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ // Add instruction size to MBBSize.
+ MBBSize += TII->GetInstSizeInBytes(I);
+
+ int Opc = I->getOpcode();
+ if (I->getDesc().isBranch()) {
+ bool isCond = false;
+ unsigned Bits = 0;
+ unsigned Scale = 1;
+ int UOpc = Opc;
+ switch (Opc) {
+ case ARM::tBR_JTr:
+ // A Thumb table jump may involve padding; for the offsets to
+ // be right, functions containing these must be 4-byte aligned.
+ AFI->setAlign(2U);
+ if ((Offset+MBBSize)%4 != 0)
+ MBBSize += 2; // padding
+ continue; // Does not get an entry in ImmBranches
+ default:
+ continue; // Ignore other JT branches
+ case ARM::Bcc:
+ isCond = true;
+ UOpc = ARM::B;
+ // Fallthrough
+ case ARM::B:
+ Bits = 24;
+ Scale = 4;
+ break;
+ case ARM::tBcc:
+ isCond = true;
+ UOpc = ARM::tB;
+ Bits = 8;
+ Scale = 2;
+ break;
+ case ARM::tB:
+ Bits = 11;
+ Scale = 2;
+ break;
+ }
+
+ // Record this immediate branch.
+ unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+ ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+ }
+
+ if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
+ PushPopMIs.push_back(I);
+
+ // Scan the instructions for constant pool operands.
+ for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+ if (I->getOperand(op).isCPI()) {
+ // We found one. The addressing mode tells us the max displacement
+ // from the PC that this instruction permits.
+
+ // Basic size info comes from the TSFlags field.
+ unsigned Bits = 0;
+ unsigned Scale = 1;
+ unsigned TSFlags = I->getDesc().TSFlags;
+ switch (TSFlags & ARMII::AddrModeMask) {
+ default:
+ // Constant pool entries can reach anything.
+ if (I->getOpcode() == ARM::CONSTPOOL_ENTRY)
+ continue;
+ if (I->getOpcode() == ARM::tLEApcrel) {
+ Bits = 8; // Taking the address of a CP entry.
+ break;
+ }
+ assert(0 && "Unknown addressing mode for CP reference!");
+ case ARMII::AddrMode1: // AM1: 8 bits << 2
+ Bits = 8;
+ Scale = 4; // Taking the address of a CP entry.
+ break;
+ case ARMII::AddrMode2:
+ Bits = 12; // +-offset_12
+ break;
+ case ARMII::AddrMode3:
+ Bits = 8; // +-offset_8
+ break;
+ // addrmode4 has no immediate offset.
+ case ARMII::AddrMode5:
+ Bits = 8;
+ Scale = 4; // +-(offset_8*4)
+ break;
+ case ARMII::AddrModeT1:
+ Bits = 5; // +offset_5
+ break;
+ case ARMII::AddrModeT2:
+ Bits = 5;
+ Scale = 2; // +(offset_5*2)
+ break;
+ case ARMII::AddrModeT4:
+ Bits = 5;
+ Scale = 4; // +(offset_5*4)
+ break;
+ case ARMII::AddrModeTs:
+ Bits = 8;
+ Scale = 4; // +(offset_8*4)
+ break;
+ }
+
+ // Remember that this is a user of a CP entry.
+ unsigned CPI = I->getOperand(op).getIndex();
+ MachineInstr *CPEMI = CPEMIs[CPI];
+ unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+ CPUsers.push_back(CPUser(I, CPEMI, MaxOffs));
+
+ // Increment corresponding CPEntry reference count.
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+ assert(CPE && "Cannot find a corresponding CPEntry!");
+ CPE->RefCount++;
+
+ // Instructions can only use one CP entry, don't bother scanning the
+ // rest of the operands.
+ break;
+ }
+ }
+
+    // In Thumb mode, if this block is a constpool island, we may need padding
+    // so it's aligned on a 4-byte boundary.
+ if (isThumb &&
+ !MBB.empty() &&
+ MBB.begin()->getOpcode() == ARM::CONSTPOOL_ENTRY &&
+ (Offset%4) != 0)
+ MBBSize += 2;
+
+ BBSizes.push_back(MBBSize);
+ BBOffsets.push_back(Offset);
+ Offset += MBBSize;
+ }
+}
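+
+// Worked example (editorial): for tBcc the scan above records Bits = 8 and
+// Scale = 2, so MaxOffs = ((1 << 7) - 1) * 2 = 254 bytes; an ARM B (Bits =
+// 24, Scale = 4) gets ((1 << 23) - 1) * 4 = 33554428 bytes of reach.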
+
+/// GetOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function. This offset changes as stuff is moved
+/// around inside the function.
+unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // The offset is composed of two things: the sum of the sizes of all MBB's
+ // before this instruction's block, and the offset from the start of the block
+ // it is in.
+ unsigned Offset = BBOffsets[MBB->getNumber()];
+
+ // If we're looking for a CONSTPOOL_ENTRY in Thumb, see if this block has
+ // alignment padding, and compensate if so.
+ if (isThumb &&
+ MI->getOpcode() == ARM::CONSTPOOL_ENTRY &&
+ Offset%4 != 0)
+ Offset += 2;
+
+ // Sum instructions before MI in MBB.
+ for (MachineBasicBlock::iterator I = MBB->begin(); ; ++I) {
+ assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+ if (&*I == MI) return Offset;
+ Offset += TII->GetInstSizeInBytes(I);
+ }
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+ const MachineBasicBlock *RHS) {
+ return LHS->getNumber() < RHS->getNumber();
+}
+
+/// UpdateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers. Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+ NewBB->getParent()->RenumberBlocks(NewBB);
+
+ // Insert a size into BBSizes to align it properly with the (newly
+ // renumbered) block numbers.
+ BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);
+
+ // Likewise for BBOffsets.
+ BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0);
+
+  // Next, update WaterList. Specifically, we need to add NewBB as having
+ // available water after it.
+ std::vector<MachineBasicBlock*>::iterator IP =
+ std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+ CompareMBBNumbers);
+ WaterList.insert(IP, NewBB);
+}
+
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch. Update data structures and renumber blocks to
+/// account for this change, and return the newly created block.
+MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
+ MachineBasicBlock *OrigBB = MI->getParent();
+ MachineFunction &MF = *OrigBB->getParent();
+
+ // Create a new MBB for the code after the OrigBB.
+ MachineBasicBlock *NewBB =
+ MF.CreateMachineBasicBlock(OrigBB->getBasicBlock());
+ MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+ MF.insert(MBBI, NewBB);
+
+ // Splice the instructions starting with MI over to NewBB.
+ NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+ // Add an unconditional branch from OrigBB to NewBB.
+ // Note the new unconditional branch is not being recorded.
+ // There doesn't seem to be meaningful DebugInfo available; this doesn't
+ // correspond to anything in the source.
+ BuildMI(OrigBB, DebugLoc::getUnknownLoc(),
+ TII->get(isThumb ? ARM::tB : ARM::B)).addMBB(NewBB);
+ NumSplit++;
+
+ // Update the CFG. All succs of OrigBB are now succs of NewBB.
+ while (!OrigBB->succ_empty()) {
+ MachineBasicBlock *Succ = *OrigBB->succ_begin();
+ OrigBB->removeSuccessor(Succ);
+ NewBB->addSuccessor(Succ);
+
+ // This pass should be run after register allocation, so there should be no
+ // PHI nodes to update.
+ assert((Succ->empty() || Succ->begin()->getOpcode() != TargetInstrInfo::PHI)
+ && "PHI nodes should be eliminated by now!");
+ }
+
+ // OrigBB branches to NewBB.
+ OrigBB->addSuccessor(NewBB);
+
+ // Update internal data structures to account for the newly inserted MBB.
+ // This is almost the same as UpdateForInsertedWaterBlock, except that
+ // the Water goes after OrigBB, not NewBB.
+ MF.RenumberBlocks(NewBB);
+
+ // Insert a size into BBSizes to align it properly with the (newly
+ // renumbered) block numbers.
+ BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);
+
+ // Likewise for BBOffsets.
+ BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0);
+
+  // Next, update WaterList. Specifically, we need to add OrigBB as having
+ // available water after it (but not if it's already there, which happens
+ // when splitting before a conditional branch that is followed by an
+ // unconditional branch - in that case we want to insert NewBB).
+ std::vector<MachineBasicBlock*>::iterator IP =
+ std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+ CompareMBBNumbers);
+ MachineBasicBlock* WaterBB = *IP;
+ if (WaterBB == OrigBB)
+ WaterList.insert(next(IP), NewBB);
+ else
+ WaterList.insert(IP, OrigBB);
+
+  // Figure out how large NewBB is. (It cannot
+ // contain a constpool_entry or tablejump.)
+ unsigned NewBBSize = 0;
+ for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end();
+ I != E; ++I)
+ NewBBSize += TII->GetInstSizeInBytes(I);
+
+ unsigned OrigBBI = OrigBB->getNumber();
+ unsigned NewBBI = NewBB->getNumber();
+ // Set the size of NewBB in BBSizes.
+ BBSizes[NewBBI] = NewBBSize;
+
+  // We removed instructions from OrigBB; subtract that from its size.
+ // Add 2 or 4 to the block to count the unconditional branch we added to it.
+ unsigned delta = isThumb ? 2 : 4;
+ BBSizes[OrigBBI] -= NewBBSize - delta;
+
+ // ...and adjust BBOffsets for NewBB accordingly.
+ BBOffsets[NewBBI] = BBOffsets[OrigBBI] + BBSizes[OrigBBI];
+
+ // All BBOffsets following these blocks must be modified.
+ AdjustBBOffsetsAfter(NewBB, delta);
+
+ return NewBB;
+}
+
+/// OffsetIsInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset,
+ unsigned TrialOffset, unsigned MaxDisp, bool NegativeOK) {
+ // On Thumb offsets==2 mod 4 are rounded down by the hardware for
+ // purposes of the displacement computation; compensate for that here.
+ // Effectively, the valid range of displacements is 2 bytes smaller for such
+ // references.
+  if (isThumb && UserOffset%4 != 0)
+ UserOffset -= 2;
+ // CPEs will be rounded up to a multiple of 4.
+ if (isThumb && TrialOffset%4 != 0)
+ TrialOffset += 2;
+
+ if (UserOffset <= TrialOffset) {
+ // User before the Trial.
+ if (TrialOffset-UserOffset <= MaxDisp)
+ return true;
+ } else if (NegativeOK) {
+ if (UserOffset-TrialOffset <= MaxDisp)
+ return true;
+ }
+ return false;
+}
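+
+// Worked example (editorial): with a Thumb user at offset 10 (== 2 mod 4)
+// and a trial CPE offset of 14, the code above rounds the user down to 8 and
+// the trial up to 16, so the displacement checked against MaxDisp is 8 bytes
+// rather than the raw 4.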
+
+/// WaterIsInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specified MI.
+
+bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset,
+ MachineBasicBlock* Water, CPUser &U)
+{
+ unsigned MaxDisp = U.MaxDisp;
+ MachineFunction::iterator I = next(MachineFunction::iterator(Water));
+ unsigned CPEOffset = BBOffsets[Water->getNumber()] +
+ BBSizes[Water->getNumber()];
+
+ // If the CPE is to be inserted before the instruction, that will raise
+ // the offset of the instruction. (Currently applies only to ARM, so
+ // no alignment compensation attempted here.)
+ if (CPEOffset < UserOffset)
+ UserOffset += U.CPEMI->getOperand(2).getImm();
+
+  return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, !isThumb);
+}
+
+/// CPEIsInRange - Returns true if the distance between the specified MI and
+/// the specified constant pool entry instruction fits in MI's displacement
+/// field.
+bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
+ MachineInstr *CPEMI,
+ unsigned MaxDisp, bool DoDump) {
+ unsigned CPEOffset = GetOffsetOf(CPEMI);
+ assert(CPEOffset%4 == 0 && "Misaligned CPE");
+
+ if (DoDump) {
+ DOUT << "User of CPE#" << CPEMI->getOperand(0).getImm()
+ << " max delta=" << MaxDisp
+ << " insn address=" << UserOffset
+ << " CPE address=" << CPEOffset
+ << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI;
+ }
+
+ return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, !isThumb);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true if the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+ if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+ return false;
+
+ MachineBasicBlock *Succ = *MBB->succ_begin();
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ MachineInstr *PredMI = &Pred->back();
+ if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB)
+ return PredMI->getOperand(0).getMBB() == Succ;
+ return false;
+}
+#endif // NDEBUG
+
+void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB,
+ int delta) {
+ MachineFunction::iterator MBBI = BB; MBBI = next(MBBI);
+  for (unsigned i = BB->getNumber()+1;
+       i < BB->getParent()->getNumBlockIDs(); i++) {
+ BBOffsets[i] += delta;
+    // If some existing blocks have padding, adjust the padding as needed; a
+    // bit tricky. delta can be negative, so don't use % on it.
+ if (isThumb) {
+ MachineBasicBlock *MBB = MBBI;
+ if (!MBB->empty()) {
+ // Constant pool entries require padding.
+ if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
+ unsigned oldOffset = BBOffsets[i] - delta;
+ if (oldOffset%4==0 && BBOffsets[i]%4!=0) {
+ // add new padding
+ BBSizes[i] += 2;
+ delta += 2;
+ } else if (oldOffset%4!=0 && BBOffsets[i]%4==0) {
+ // remove existing padding
+            BBSizes[i] -= 2;
+ delta -= 2;
+ }
+ }
+ // Thumb jump tables require padding. They should be at the end;
+ // following unconditional branches are removed by AnalyzeBranch.
+ MachineInstr *ThumbJTMI = NULL;
+ if (prior(MBB->end())->getOpcode() == ARM::tBR_JTr)
+ ThumbJTMI = prior(MBB->end());
+ if (ThumbJTMI) {
+ unsigned newMIOffset = GetOffsetOf(ThumbJTMI);
+ unsigned oldMIOffset = newMIOffset - delta;
+ if (oldMIOffset%4 == 0 && newMIOffset%4 != 0) {
+ // remove existing padding
+ BBSizes[i] -= 2;
+ delta -= 2;
+ } else if (oldMIOffset%4 != 0 && newMIOffset%4 == 0) {
+ // add new padding
+ BBSizes[i] += 2;
+ delta += 2;
+ }
+ }
+ if (delta==0)
+ return;
+ }
+ MBBI = next(MBBI);
+ }
+ }
+}
+
+/// DecrementOldEntry - Find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount. If the refcount
+/// becomes 0 remove the entry and instruction. Returns true if we removed
+/// the entry, false if we didn't.
+
+bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) {
+ // Find the old entry. Eliminate it if it is no longer used.
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+ assert(CPE && "Unexpected!");
+ if (--CPE->RefCount == 0) {
+ RemoveDeadCPEMI(CPEMI);
+ CPE->CPEMI = NULL;
+ NumCPEs--;
+ return true;
+ }
+ return false;
+}
+
+/// LookForExistingCPEntry - See if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone. Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset)
+{
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+
+ // Check to see if the CPE is already in-range.
+ if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, true)) {
+ DOUT << "In range\n";
+ return 1;
+ }
+
+ // No. Look for previously created clones of the CPE that are in range.
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ // We already tried this one
+ if (CPEs[i].CPEMI == CPEMI)
+ continue;
+    // Removing CPEs can leave empty entries; skip them.
+ if (CPEs[i].CPEMI == NULL)
+ continue;
+ if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, false)) {
+ DOUT << "Replacing CPE#" << CPI << " with CPE#" << CPEs[i].CPI << "\n";
+ // Point the CPUser node to the replacement
+ U.CPEMI = CPEs[i].CPEMI;
+ // Change the CPI in the instruction operand to refer to the clone.
+ for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+ if (UserMI->getOperand(j).isCPI()) {
+ UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+ break;
+ }
+ // Adjust the refcount of the clone...
+ CPEs[i].RefCount++;
+ // ...and the original. If we didn't remove the old entry, none of the
+ // addresses changed, so we don't need another pass.
+ return DecrementOldEntry(CPI, CPEMI) ? 2 : 1;
+ }
+ }
+ return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specified unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+ return (Opc == ARM::tB) ? ((1<<10)-1)*2 : ((1<<23)-1)*4;
+}
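+
+// Worked example (editorial): tB carries an 11-bit signed halfword offset,
+// giving ((1 << 10) - 1) * 2 = 2046 bytes of forward reach; ARM B carries a
+// 24-bit signed word offset, ((1 << 23) - 1) * 4 = 33554428 bytes.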
+
+/// AcceptWater - Small amount of common code factored out of the following.
+
+MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB,
+ std::vector<MachineBasicBlock*>::iterator IP) {
+ DOUT << "found water in range\n";
+ // Remove the original WaterList entry; we want subsequent
+ // insertions in this vicinity to go after the one we're
+ // about to insert. This considerably reduces the number
+ // of times we have to move the same CPE more than once.
+ WaterList.erase(IP);
+ // CPE goes before following block (NewMBB).
+ return next(MachineFunction::iterator(WaterBB));
+}
+
+/// LookForWater - look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not. If it returns true, *NewMBB
+/// is set to the WaterList entry.
+/// For ARM, we prefer the water that's farthest away.  For Thumb, we prefer
+/// water that will not introduce padding over water that will; within each
+/// group, we prefer the water that's farthest away.
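+/// (Preferring distant water leaves nearer water free for later users, which
+/// tends to reduce how many new islands have to be created.)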
+
+bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
+ MachineBasicBlock** NewMBB) {
+ std::vector<MachineBasicBlock*>::iterator IPThatWouldPad;
+ MachineBasicBlock* WaterBBThatWouldPad = NULL;
+ if (!WaterList.empty()) {
+ for (std::vector<MachineBasicBlock*>::iterator IP = prior(WaterList.end()),
+ B = WaterList.begin();; --IP) {
+ MachineBasicBlock* WaterBB = *IP;
+ if (WaterIsInRange(UserOffset, WaterBB, U)) {
+ if (isThumb &&
+ (BBOffsets[WaterBB->getNumber()] +
+ BBSizes[WaterBB->getNumber()])%4 != 0) {
+ // This is valid Water, but would introduce padding. Remember
+ // it in case we don't find any Water that doesn't do this.
+ if (!WaterBBThatWouldPad) {
+ WaterBBThatWouldPad = WaterBB;
+ IPThatWouldPad = IP;
+ }
+ } else {
+ *NewMBB = AcceptWater(WaterBB, IP);
+ return true;
+ }
+ }
+ if (IP == B)
+ break;
+ }
+ }
+ if (isThumb && WaterBBThatWouldPad) {
+ *NewMBB = AcceptWater(WaterBBThatWouldPad, IPThatWouldPad);
+ return true;
+ }
+ return false;
+}
+
+/// CreateNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct. Otherwise the block is split to create a hole with an
+/// unconditional branch around it. In either case *NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+
+void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
+ unsigned UserOffset, MachineBasicBlock** NewMBB) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ MachineBasicBlock *UserMBB = UserMI->getParent();
+ unsigned OffsetOfNextBlock = BBOffsets[UserMBB->getNumber()] +
+ BBSizes[UserMBB->getNumber()];
+ assert(OffsetOfNextBlock == BBOffsets[UserMBB->getNumber()+1]);
+
+ // If the use is at the end of the block, or the end of the block
+ // is within range, make new water there. (The addition below is
+ // for the unconditional branch we will be adding: 4 bytes on ARM,
+ // 2 on Thumb.  Possible Thumb alignment padding is accounted for
+ // inside OffsetIsInRange.
+ // If the block ends in an unconditional branch already, it is water,
+ // and is known to be out of range, so we'll always be adding a branch.)
+ if (&UserMBB->back() == UserMI ||
+ OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb ? 2 : 4),
+ U.MaxDisp, !isThumb)) {
+ DOUT << "Split at end of block\n";
+ if (&UserMBB->back() == UserMI)
+ assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!");
+ *NewMBB = next(MachineFunction::iterator(UserMBB));
+ // Add an unconditional branch from UserMBB to fallthrough block.
+ // Record it for branch lengthening; this new branch will not get out of
+ // range, but if the preceding conditional branch is out of range, the
+ // targets will be exchanged, and the altered branch may be out of
+ // range, so the machinery has to know about it.
+ int UncondBr = isThumb ? ARM::tB : ARM::B;
+ BuildMI(UserMBB, DebugLoc::getUnknownLoc(),
+ TII->get(UncondBr)).addMBB(*NewMBB);
+ unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+ ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+ MaxDisp, false, UncondBr));
+ int delta = isThumb ? 2 : 4;
+ BBSizes[UserMBB->getNumber()] += delta;
+ AdjustBBOffsetsAfter(UserMBB, delta);
+ } else {
+ // What a big block. Find a place within the block to split it.
+ // This is a little tricky on Thumb since instructions are 2 bytes
+ // and constant pool entries are 4 bytes: if instruction I references
+ // island CPE, and instruction I+1 references CPE', it will
+ // not work well to put CPE as far forward as possible, since then
+ // CPE' cannot immediately follow it (that location is 2 bytes
+ // farther away from I+1 than CPE was from I) and we'd need to create
+ // a new island. So, we make a first guess, then walk through the
+ // instructions between the one currently being looked at and the
+ // possible insertion point, and make sure any other instructions
+ // that reference CPEs will be able to use the same island area;
+ // if not, we back up the insertion point.
+
+ // The 4 in the following is for the unconditional branch we'll be
+ // inserting (allows for long branch on Thumb). Alignment of the
+ // island is handled inside OffsetIsInRange.
+ unsigned BaseInsertOffset = UserOffset + U.MaxDisp - 4;
+ // This could point off the end of the block if we've already got
+ // constant pool entries following this block; only the last one is
+ // in the water list. Back past any possible branches (allow for a
+ // conditional and a maximally long unconditional).
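+ // (Thumb: a 2-byte conditional branch plus a 4-byte tBfar is 6 bytes;
+ // ARM: a 4-byte conditional plus a 4-byte unconditional is 8 bytes.)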
+ if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1])
+ BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] -
+ (isThumb ? 6 : 8);
+ unsigned EndInsertOffset = BaseInsertOffset +
+ CPEMI->getOperand(2).getImm();
+ MachineBasicBlock::iterator MI = UserMI;
+ ++MI;
+ unsigned CPUIndex = CPUserIndex+1;
+ for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+ Offset < BaseInsertOffset;
+ Offset += TII->GetInstSizeInBytes(MI),
+ MI = next(MI)) {
+ if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) {
+ if (!OffsetIsInRange(Offset, EndInsertOffset,
+ CPUsers[CPUIndex].MaxDisp, !isThumb)) {
+ BaseInsertOffset -= (isThumb ? 2 : 4);
+ EndInsertOffset -= (isThumb ? 2 : 4);
+ }
+ // This is overly conservative, as we don't account for CPEMIs
+ // being reused within the block, but it doesn't matter much.
+ EndInsertOffset += CPUsers[CPUIndex].CPEMI->getOperand(2).getImm();
+ CPUIndex++;
+ }
+ }
+ DOUT << "Split in middle of big block\n";
+ *NewMBB = SplitBlockBeforeInstr(prior(MI));
+ }
+}
+
+/// HandleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range. If so, pick up the constant pool value and move it some
+/// place in-range. Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
+ unsigned CPUserIndex) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ MachineBasicBlock *NewMBB;
+ // Compute this only once, it's expensive. The 4 or 8 is the value the
+ // hardware keeps in the PC (2 insns ahead of the reference).
+ unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8);
+
+ // Special case: tLEApcrel is a two-instruction sequence; the actual user
+ // of the constant pool entry is the second instruction.
+ if (UserMI->getOpcode() == ARM::tLEApcrel)
+ UserOffset += 2;
+
+ // See if the current entry is within range, or there is a clone of it
+ // in range.
+ int result = LookForExistingCPEntry(U, UserOffset);
+ if (result == 1) return false;
+ else if (result == 2) return true;
+
+ // No existing clone of this CPE is within range.
+ // We will be generating a new clone. Get a UID for it.
+ unsigned ID = AFI->createConstPoolEntryUId();
+
+ // Look for water where we can place this CPE.  We look for the farthest
+ // water that will work.  Forward references only for now (although later
+ // we might find some that are backwards).
+
+ if (!LookForWater(U, UserOffset, &NewMBB)) {
+ // No water found.
+ DOUT << "No water found\n";
+ CreateNewWater(CPUserIndex, UserOffset, &NewMBB);
+ }
+
+ // Okay, we know we can put an island before NewMBB now, do it!
+ MachineBasicBlock *NewIsland = Fn.CreateMachineBasicBlock();
+ Fn.insert(NewMBB, NewIsland);
+
+ // Update internal data structures to account for the newly inserted MBB.
+ UpdateForInsertedWaterBlock(NewIsland);
+
+ // Decrement the old entry, and remove it if refcount becomes 0.
+ DecrementOldEntry(CPI, CPEMI);
+
+ // Now that we have an island to add the CPE to, clone the original CPE and
+ // add it to the island.
+ U.CPEMI = BuildMI(NewIsland, DebugLoc::getUnknownLoc(),
+ TII->get(ARM::CONSTPOOL_ENTRY))
+ .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+ CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+ NumCPEs++;
+
+ BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()];
+ // Compensate for .align 2 in thumb mode.
+ if (isThumb && BBOffsets[NewIsland->getNumber()]%4 != 0)
+ Size += 2;
+ // Increase the size of the island block to account for the new entry.
+ BBSizes[NewIsland->getNumber()] += Size;
+ AdjustBBOffsetsAfter(NewIsland, Size);
+
+ // Finally, change the CPI in the instruction operand to be ID.
+ for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+ if (UserMI->getOperand(i).isCPI()) {
+ UserMI->getOperand(i).setIndex(ID);
+ break;
+ }
+
+ DOUT << " Moved CPE to #" << ID << " CPI=" << CPI << "\t" << *UserMI;
+
+ return true;
+}
+
+/// RemoveDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) {
+ MachineBasicBlock *CPEBB = CPEMI->getParent();
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ CPEMI->eraseFromParent();
+ BBSizes[CPEBB->getNumber()] -= Size;
+ // All succeeding offsets have the old size of this block added in; they are
+ // corrected by AdjustBBOffsetsAfter below.
+ if (CPEBB->empty()) {
+ // In Thumb mode, the size of the island may have been padded by two to
+ // satisfy the alignment requirement, so an empty block can still have
+ // size 2.  Zero it and fold the padding into the offset adjustment.
+ if (BBSizes[CPEBB->getNumber()] != 0) {
+ Size += BBSizes[CPEBB->getNumber()];
+ BBSizes[CPEBB->getNumber()] = 0;
+ }
+ }
+ AdjustBBOffsetsAfter(CPEBB, -Size);
+ // An island has only one predecessor BB and one successor BB. Check if
+ // this BB's predecessor jumps directly to this BB's successor. This
+ // shouldn't happen currently.
+ assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+ // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// RemoveUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool ARMConstantIslands::RemoveUnusedCPEntries() {
+ bool MadeChange = false;
+ for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+ std::vector<CPEntry> &CPEs = CPEntries[i];
+ for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+ if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+ RemoveDeadCPEMI(CPEs[j].CPEMI);
+ CPEs[j].CPEMI = NULL;
+ MadeChange = true;
+ }
+ }
+ }
+ return MadeChange;
+}
+
+/// BBIsInRange - Returns true if the distance between the specified MI and
+/// the specified BB can fit in MI's displacement field.
+bool ARMConstantIslands::BBIsInRange(MachineInstr *MI, MachineBasicBlock *DestBB,
+ unsigned MaxDisp) {
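+ // The hardware PC reads two instructions ahead of the branch:
+ // 4 bytes in Thumb mode, 8 in ARM mode.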
+ unsigned PCAdj = isThumb ? 4 : 8;
+ unsigned BrOffset = GetOffsetOf(MI) + PCAdj;
+ unsigned DestOffset = BBOffsets[DestBB->getNumber()];
+
+ DOUT << "Branch of destination BB#" << DestBB->getNumber()
+ << " from BB#" << MI->getParent()->getNumber()
+ << " max delta=" << MaxDisp
+ << " from " << GetOffsetOf(MI) << " to " << DestOffset
+ << " offset " << int(DestOffset-BrOffset) << "\t" << *MI;
+
+ if (BrOffset <= DestOffset) {
+ // Branch before the Dest.
+ if (DestOffset-BrOffset <= MaxDisp)
+ return true;
+ } else {
+ if (BrOffset-DestOffset <= MaxDisp)
+ return true;
+ }
+ return false;
+}
+
+/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+ // Check to see if the DestBB is already in-range.
+ if (BBIsInRange(MI, DestBB, Br.MaxDisp))
+ return false;
+
+ if (!Br.isCond)
+ return FixUpUnconditionalBr(Fn, Br);
+ return FixUpConditionalBr(Fn, Br);
+}
+
+/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, an intermediate branch is needed to reach the destination.
+bool
+ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *MBB = MI->getParent();
+ assert(isThumb && "Expected a Thumb function!");
+
+ // Use BL to implement far jump.
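+ // tBfar is a BL, whose 22-bit signed halfword offset gives a +/-4MB range.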
+ Br.MaxDisp = (1 << 21) * 2;
+ MI->setDesc(TII->get(ARM::tBfar));
+ BBSizes[MBB->getNumber()] += 2;
+ AdjustBBOffsetsAfter(MBB, 2);
+ HasFarJump = true;
+ NumUBrFixed++;
+
+ DOUT << " Changed B to long jump " << *MI;
+
+ return true;
+}
+
+/// FixUpConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+ // Add an unconditional branch to the destination and invert the branch
+ // condition to jump over it:
+ // blt L1
+ // =>
+ // bge L2
+ // b L1
+ // L2:
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
+ CC = ARMCC::getOppositeCondition(CC);
+ unsigned CCReg = MI->getOperand(2).getReg();
+
+ // If the branch is at the end of its MBB and that has a fall-through block,
+ // direct the updated conditional branch to the fall-through block. Otherwise,
+ // split the MBB before the next instruction.
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *BMI = &MBB->back();
+ bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+ NumCBrFixed++;
+ if (BMI != MI) {
+ if (next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+ BMI->getOpcode() == Br.UncondBr) {
+ // Last MI in the BB is an unconditional branch. Can we simply invert the
+ // condition and swap destinations:
+ // beq L1
+ // b L2
+ // =>
+ // bne L2
+ // b L1
+ MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+ if (BBIsInRange(MI, NewDest, Br.MaxDisp)) {
+ DOUT << " Invert Bcc condition and swap its destination with " << *BMI;
+ BMI->getOperand(0).setMBB(DestBB);
+ MI->getOperand(0).setMBB(NewDest);
+ MI->getOperand(1).setImm(CC);
+ return true;
+ }
+ }
+ }
+
+ if (NeedSplit) {
+ SplitBlockBeforeInstr(MI);
+ // No need for the branch to the next block. We're adding an unconditional
+ // branch to the destination.
+ int delta = TII->GetInstSizeInBytes(&MBB->back());
+ BBSizes[MBB->getNumber()] -= delta;
+ MachineBasicBlock* SplitBB = next(MachineFunction::iterator(MBB));
+ AdjustBBOffsetsAfter(SplitBB, -delta);
+ MBB->back().eraseFromParent();
+ // BBOffsets[SplitBB] is wrong temporarily, fixed below
+ }
+ MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB));
+
+ DOUT << " Insert B to BB#" << DestBB->getNumber()
+ << " also invert condition and change dest. to BB#"
+ << NextBB->getNumber() << "\n";
+
+ // Insert a new conditional branch and a new unconditional branch.
+ // Also update the ImmBranch as well as adding a new entry for the new branch.
+ BuildMI(MBB, DebugLoc::getUnknownLoc(),
+ TII->get(MI->getOpcode()))
+ .addMBB(NextBB).addImm(CC).addReg(CCReg);
+ Br.MI = &MBB->back();
+ BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back());
+ BuildMI(MBB, DebugLoc::getUnknownLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+ BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back());
+ unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+ ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+ // Remove the old conditional branch. It may or may not still be in MBB.
+ BBSizes[MI->getParent()->getNumber()] -= TII->GetInstSizeInBytes(MI);
+ MI->eraseFromParent();
+
+ // The net size change is an addition of one unconditional branch.
+ int delta = TII->GetInstSizeInBytes(&MBB->back());
+ AdjustBBOffsetsAfter(MBB, delta);
+ return true;
+}
+
+/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spill
+/// LR / restore LR to pc.
+bool ARMConstantIslands::UndoLRSpillRestore() {
+ bool MadeChange = false;
+ for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
+ MachineInstr *MI = PushPopMIs[i];
+ if (MI->getOpcode() == ARM::tPOP_RET &&
+ MI->getOperand(0).getReg() == ARM::PC &&
+ MI->getNumExplicitOperands() == 1) {
+ BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET));
+ MI->eraseFromParent();
+ MadeChange = true;
+ }
+ }
+ return MadeChange;
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
new file mode 100644
index 0000000..3a038c9
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -0,0 +1,100 @@
+//===- ARMConstantPoolValue.cpp - ARM constantpool value --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Type.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/raw_ostream.h"
+#include <ostream>
+using namespace llvm;
+
+ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, unsigned id,
+ ARMCP::ARMCPKind k,
+ unsigned char PCAdj,
+ const char *Modif,
+ bool AddCA)
+ : MachineConstantPoolValue((const Type*)gv->getType()),
+ GV(gv), S(NULL), LabelId(id), Kind(k), PCAdjust(PCAdj),
+ Modifier(Modif), AddCurrentAddress(AddCA) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(const char *s, unsigned id,
+ ARMCP::ARMCPKind k,
+ unsigned char PCAdj,
+ const char *Modif,
+ bool AddCA)
+ : MachineConstantPoolValue((const Type*)Type::Int32Ty),
+ GV(NULL), S(s), LabelId(id), Kind(k), PCAdjust(PCAdj),
+ Modifier(Modif), AddCurrentAddress(AddCA) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv,
+ ARMCP::ARMCPKind k,
+ const char *Modif)
+ : MachineConstantPoolValue((const Type*)Type::Int32Ty),
+ GV(gv), S(NULL), LabelId(0), Kind(k), PCAdjust(0),
+ Modifier(Modif) {}
+
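+/// getExistingMachineCPValue - Search the constant pool for an ARM
+/// constantpool value identical to this one (same GV / symbol, label id,
+/// kind, and PC adjustment) with sufficient alignment; return its index,
+/// or -1 if there is none.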
+int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) {
+ unsigned AlignMask = Alignment - 1;
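+ // An existing entry is only reusable if it is at least as aligned as
+ // requested; the mask check below rejects entries with weaker alignment.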
+ const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ if (Constants[i].isMachineConstantPoolEntry() &&
+ (Constants[i].getAlignment() & AlignMask) == 0) {
+ ARMConstantPoolValue *CPV =
+ (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+ if (CPV->GV == GV &&
+ CPV->S == S &&
+ CPV->LabelId == LabelId &&
+ CPV->Kind == Kind &&
+ CPV->PCAdjust == PCAdjust)
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+void
+ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddPointer(GV);
+ ID.AddPointer(S);
+ ID.AddInteger(LabelId);
+ ID.AddInteger((unsigned)Kind);
+ ID.AddInteger(PCAdjust);
+}
+
+void ARMConstantPoolValue::dump() const {
+ cerr << " " << *this;
+}
+
+void ARMConstantPoolValue::print(std::ostream &O) const {
+ raw_os_ostream RawOS(O);
+ print(RawOS);
+}
+
+void ARMConstantPoolValue::print(raw_ostream &O) const {
+ if (GV)
+ O << GV->getName();
+ else
+ O << S;
+ if (isNonLazyPointer()) O << "$non_lazy_ptr";
+ else if (isStub()) O << "$stub";
+ if (Modifier) O << "(" << Modifier << ")";
+ if (PCAdjust != 0) {
+ O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
+ if (AddCurrentAddress) O << "-.";
+ O << ")";
+ }
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
new file mode 100644
index 0000000..d2b9066
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -0,0 +1,92 @@
+//===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include <iosfwd>
+
+namespace llvm {
+
+class GlobalValue;
+
+namespace ARMCP {
+ enum ARMCPKind {
+ CPValue,
+ CPNonLazyPtr,
+ CPStub
+ };
+}
+
+/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
+/// represent PC relative displacement between the address of the load
+/// instruction and the global value being loaded, i.e. (&GV-(LPIC+8)).
+class ARMConstantPoolValue : public MachineConstantPoolValue {
+ GlobalValue *GV; // GlobalValue being loaded.
+ const char *S; // ExtSymbol being loaded.
+ unsigned LabelId; // Label id of the load.
+ ARMCP::ARMCPKind Kind; // non_lazy_ptr or stub?
+ unsigned char PCAdjust; // Extra adjustment if constantpool is pc relative.
+ // 8 for ARM, 4 for Thumb.
+ const char *Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+ bool AddCurrentAddress;
+
+public:
+ ARMConstantPoolValue(GlobalValue *gv, unsigned id,
+ ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+ unsigned char PCAdj = 0, const char *Modifier = NULL,
+ bool AddCurrentAddress = false);
+ ARMConstantPoolValue(const char *s, unsigned id,
+ ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+ unsigned char PCAdj = 0, const char *Modifier = NULL,
+ bool AddCurrentAddress = false);
+ ARMConstantPoolValue(GlobalValue *GV, ARMCP::ARMCPKind Kind,
+ const char *Modifier);
+
+ GlobalValue *getGV() const { return GV; }
+ const char *getSymbol() const { return S; }
+ const char *getModifier() const { return Modifier; }
+ bool hasModifier() const { return Modifier != NULL; }
+ bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+ unsigned getLabelId() const { return LabelId; }
+ bool isNonLazyPointer() const { return Kind == ARMCP::CPNonLazyPtr; }
+ bool isStub() const { return Kind == ARMCP::CPStub; }
+ unsigned char getPCAdjustment() const { return PCAdjust; }
+
+ virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment);
+
+ virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+ void print(std::ostream *O) const { if (O) print(*O); }
+ void print(std::ostream &O) const;
+ void print(raw_ostream *O) const { if (O) print(*O); }
+ void print(raw_ostream &O) const;
+ void dump() const;
+};
+
+inline std::ostream &operator<<(std::ostream &O,
+                                const ARMConstantPoolValue &V) {
+ V.print(O);
+ return O;
+}
+
+inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
+ V.print(O);
+ return O;
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMFrameInfo.h b/lib/Target/ARM/ARMFrameInfo.h
new file mode 100644
index 0000000..405b8f2
--- /dev/null
+++ b/lib/Target/ARM/ARMFrameInfo.h
@@ -0,0 +1,32 @@
+//===-- ARMFrameInfo.h - Define TargetFrameInfo for ARM ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM-specific subclass of TargetFrameInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_FRAMEINFO_H
+#define ARM_FRAMEINFO_H
+
+#include "ARM.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "ARMSubtarget.h"
+
+namespace llvm {
+
+class ARMFrameInfo : public TargetFrameInfo {
+public:
+ explicit ARMFrameInfo(const ARMSubtarget &ST)
+ : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0) {
+ }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
new file mode 100644
index 0000000..ca3a9cb
--- /dev/null
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -0,0 +1,911 @@
+//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
+#include "ARMTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===--------------------------------------------------------------------===//
+/// ARMDAGToDAGISel - ARM specific code to select ARM machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class ARMDAGToDAGISel : public SelectionDAGISel {
+ ARMTargetMachine &TM;
+
+ /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARMSubtarget *Subtarget;
+
+public:
+ explicit ARMDAGToDAGISel(ARMTargetMachine &tm)
+ : SelectionDAGISel(tm), TM(tm),
+ Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
+ }
+
+ virtual const char *getPassName() const {
+ return "ARM Instruction Selection";
+ }
+
+ SDNode *Select(SDValue Op);
+ virtual void InstructionSelect();
+ bool SelectAddrMode2(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode2Offset(SDValue Op, SDValue N,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode3(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode3Offset(SDValue Op, SDValue N,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode5(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Offset);
+
+ bool SelectAddrModePC(SDValue Op, SDValue N, SDValue &Offset,
+ SDValue &Label);
+
+ bool SelectThumbAddrModeRR(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Offset);
+ bool SelectThumbAddrModeRI5(SDValue Op, SDValue N, unsigned Scale,
+ SDValue &Base, SDValue &OffImm,
+ SDValue &Offset);
+ bool SelectThumbAddrModeS1(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &OffImm, SDValue &Offset);
+ bool SelectThumbAddrModeS2(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &OffImm, SDValue &Offset);
+ bool SelectThumbAddrModeS4(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &OffImm, SDValue &Offset);
+ bool SelectThumbAddrModeSP(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &OffImm);
+
+ bool SelectShifterOperandReg(SDValue Op, SDValue N, SDValue &A,
+ SDValue &B, SDValue &C);
+
+ // Include the pieces autogenerated from the target description.
+#include "ARMGenDAGISel.inc"
+
+private:
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+};
+}
+
+void ARMDAGToDAGISel::InstructionSelect() {
+ DEBUG(BB->dump());
+
+ SelectRoot(*CurDAG);
+ CurDAG->RemoveDeadNodes();
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &Offset,
+ SDValue &Opc) {
+ if (N.getOpcode() == ISD::MUL) {
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ // X * [3,5,9] -> X + X * [2,4,8] etc.
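+ // An odd multiplier 2^n+1 becomes reg + (reg << n); a negative odd
+ // multiplier such as -7 becomes reg - (reg << 3) via the sub form.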
+ int RHSC = (int)RHS->getZExtValue();
+ if (RHSC & 1) {
+ RHSC = RHSC & ~1;
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) {
+ AddSub = ARM_AM::sub;
+ RHSC = - RHSC;
+ }
+ if (isPowerOf2_32(RHSC)) {
+ unsigned ShAmt = Log2_32(RHSC);
+ Base = Offset = N.getOperand(0);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+ ARM_AM::lsl),
+ MVT::i32);
+ return true;
+ }
+ }
+ }
+ }
+
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
+ Base = N;
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ } else if (N.getOpcode() == ARMISD::Wrapper) {
+ Base = N.getOperand(0);
+ }
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+ ARM_AM::no_shift),
+ MVT::i32);
+ return true;
+ }
+
+ // Match simple R +/- imm12 operands.
+ if (N.getOpcode() == ISD::ADD)
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if ((RHSC >= 0 && RHSC < 0x1000) ||
+ (RHSC < 0 && RHSC > -0x1000)) { // 12 bits.
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
+ Offset = CurDAG->getRegister(0, MVT::i32);
+
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) {
+ AddSub = ARM_AM::sub;
+ RHSC = - RHSC;
+ }
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
+ ARM_AM::no_shift),
+ MVT::i32);
+ return true;
+ }
+ }
+
+ // Otherwise this is R +/- [possibly shifted] R
+ ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub;
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
+ unsigned ShAmt = 0;
+
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+
+ if (ShOpcVal != ARM_AM::no_shift) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't fold
+ // it.
+ if (ConstantSDNode *Sh =
+ dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ Offset = N.getOperand(1).getOperand(0);
+ } else {
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ }
+
+ // Try matching (R shl C) + (R).
+ if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) {
+ ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
+ if (ShOpcVal != ARM_AM::no_shift) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't
+ // fold it.
+ if (ConstantSDNode *Sh =
+ dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ Offset = N.getOperand(0).getOperand(0);
+ Base = N.getOperand(1);
+ } else {
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ }
+ }
+
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+ MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDValue Op, SDValue N,
+ SDValue &Offset, SDValue &Opc) {
+ unsigned Opcode = Op.getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+ ? ARM_AM::add : ARM_AM::sub;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ int Val = (int)C->getZExtValue();
+ if (Val >= 0 && Val < 0x1000) { // 12 bits.
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+ ARM_AM::no_shift),
+ MVT::i32);
+ return true;
+ }
+ }
+
+ Offset = N;
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+ unsigned ShAmt = 0;
+ if (ShOpcVal != ARM_AM::no_shift) {
+ // Check to see if the RHS of the shift is a constant, if not, we can't fold
+ // it.
+ if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ ShAmt = Sh->getZExtValue();
+ Offset = N.getOperand(0);
+ } else {
+ ShOpcVal = ARM_AM::no_shift;
+ }
+ }
+
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+ MVT::i32);
+ return true;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &Offset,
+ SDValue &Opc) {
+ if (N.getOpcode() == ISD::SUB) {
+ // X - C is canonicalized to X + -C; no need to handle it here.
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32);
+ return true;
+ }
+
+ if (N.getOpcode() != ISD::ADD) {
+ Base = N;
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
+ return true;
+ }
+
+ // If the RHS is +/- imm8, fold into addr mode.
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if ((RHSC >= 0 && RHSC < 256) ||
+ (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
+ Offset = CurDAG->getRegister(0, MVT::i32);
+
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) {
+ AddSub = ARM_AM::sub;
+ RHSC = - RHSC;
+ }
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32);
+ return true;
+ }
+ }
+
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDValue Op, SDValue N,
+ SDValue &Offset, SDValue &Opc) {
+ unsigned Opcode = Op.getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+ ? ARM_AM::add : ARM_AM::sub;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ int Val = (int)C->getZExtValue();
+ if (Val >= 0 && Val < 256) {
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32);
+ return true;
+ }
+ }
+
+ Offset = N;
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), MVT::i32);
+ return true;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &Offset) {
+ if (N.getOpcode() != ISD::ADD) {
+ Base = N;
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ } else if (N.getOpcode() == ARMISD::Wrapper) {
+ Base = N.getOperand(0);
+ }
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+ MVT::i32);
+ return true;
+ }
+
+ // If the RHS is +/- imm8, fold into addr mode.
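+ // (Addressing mode 5 stores the offset in words, so the byte offset must be
+ // a multiple of 4 and fit in 8 bits after scaling, i.e. at most +/-1020.)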
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if ((RHSC & 3) == 0) { // The constant is implicitly multiplied by 4.
+ RHSC >>= 2;
+ if ((RHSC >= 0 && RHSC < 256) ||
+ (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
+
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (RHSC < 0) {
+ AddSub = ARM_AM::sub;
+ RHSC = - RHSC;
+ }
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+ MVT::i32);
+ return true;
+ }
+ }
+ }
+
+ Base = N;
+ Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+ MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrModePC(SDValue Op, SDValue N,
+ SDValue &Offset, SDValue &Label) {
+ if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
+ Offset = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
+ MVT::i32);
+ return true;
+ }
+ return false;
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &Offset){
+ // FIXME dl should come from the parent load or store, not the address
+ DebugLoc dl = Op.getDebugLoc();
+ if (N.getOpcode() != ISD::ADD) {
+ Base = N;
+ // We must materialize a zero in a reg! Returning a constant here
+ // wouldn't work without additional code to position the node within
+ // ISel's topological ordering in a place where ISel will process it
+ // normally. Instead, just explicitly issue a tMOVi8 node!
+ Offset = SDValue(CurDAG->getTargetNode(ARM::tMOVi8, dl, MVT::i32,
+ CurDAG->getTargetConstant(0, MVT::i32)), 0);
+ return true;
+ }
+
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+ return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDValue Op, SDValue N,
+ unsigned Scale, SDValue &Base,
+ SDValue &OffImm, SDValue &Offset) {
+ if (Scale == 4) {
+ SDValue TmpBase, TmpOffImm;
+ if (SelectThumbAddrModeSP(Op, N, TmpBase, TmpOffImm))
+ return false; // We want to select tLDRspi / tSTRspi instead.
+ if (N.getOpcode() == ARMISD::Wrapper &&
+ N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+ return false; // We want to select tLDRpci instead.
+ }
+
+ if (N.getOpcode() != ISD::ADD) {
+ Base = (N.getOpcode() == ARMISD::Wrapper) ? N.getOperand(0) : N;
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+
+ // Thumb does not have [sp, r] address mode.
+ RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+ RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
+ if ((LHSR && LHSR->getReg() == ARM::SP) ||
+ (RHSR && RHSR->getReg() == ARM::SP)) {
+ Base = N;
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+
+ // If the RHS is + imm5 * scale, fold into addr mode.
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if ((RHSC & (Scale-1)) == 0) { // The constant is implicitly multiplied.
+ RHSC /= Scale;
+ if (RHSC >= 0 && RHSC < 32) {
+ Base = N.getOperand(0);
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+ return true;
+ }
+ }
+ }
+
+ Base = N.getOperand(0);
+ Offset = N.getOperand(1);
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeS1(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &OffImm,
+ SDValue &Offset) {
+ return SelectThumbAddrModeRI5(Op, N, 1, Base, OffImm, Offset);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeS2(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &OffImm,
+ SDValue &Offset) {
+ return SelectThumbAddrModeRI5(Op, N, 2, Base, OffImm, Offset);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeS4(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &OffImm,
+ SDValue &Offset) {
+ return SelectThumbAddrModeRI5(Op, N, 4, Base, OffImm, Offset);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &OffImm) {
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+ if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
+ (LHSR && LHSR->getReg() == ARM::SP)) {
+ // If the RHS is + imm8 * scale, fold into addr mode.
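+ // tLDRspi / tSTRspi take an 8-bit word offset from SP, i.e. up to 1020
+ // bytes in multiples of 4.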
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if ((RHSC & 3) == 0) { // The constant is implicitly multiplied.
+ RHSC >>= 2;
+ if (RHSC >= 0 && RHSC < 256) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue Op,
+ SDValue N,
+ SDValue &BaseReg,
+ SDValue &ShReg,
+ SDValue &Opc) {
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+
+ // Don't match base register only case. That is matched to a separate
+ // lower complexity pattern with explicit register operand.
+ if (ShOpcVal == ARM_AM::no_shift) return false;
+
+ BaseReg = N.getOperand(0);
+ unsigned ShImmVal = 0;
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ ShReg = CurDAG->getRegister(0, MVT::i32);
+ ShImmVal = RHS->getZExtValue() & 31;
+ } else {
+ ShReg = N.getOperand(1);
+ }
+ Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+ MVT::i32);
+ return true;
+}
+
+/// getAL - Returns an ARMCC::AL immediate node.
+static inline SDValue getAL(SelectionDAG *CurDAG) {
+ return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, MVT::i32);
+}
+
+
+SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+
+ if (N->isMachineOpcode())
+ return NULL; // Already selected.
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+ bool UseCP = true;
+ if (Subtarget->isThumb())
+ UseCP = (Val > 255 && // MOV
+ ~Val > 255 && // MOV + MVN
+ !ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL
+ else
+ UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
+ ARM_AM::getSOImmVal(~Val) == -1 && // MVN
+ !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs.
+ if (UseCP) {
+ SDValue CPIdx =
+ CurDAG->getTargetConstantPool(ConstantInt::get(Type::Int32Ty, Val),
+ TLI.getPointerTy());
+
+ SDNode *ResNode;
+ if (Subtarget->isThumb())
+ ResNode = CurDAG->getTargetNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other,
+ CPIdx, CurDAG->getEntryNode());
+ else {
+ SDValue Ops[] = {
+ CPIdx,
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getTargetConstant(0, MVT::i32),
+ getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getEntryNode()
+ };
+ ResNode=CurDAG->getTargetNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+ Ops, 6);
+ }
+ ReplaceUses(Op, SDValue(ResNode, 0));
+ return NULL;
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ case ISD::FrameIndex: {
+ // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ if (Subtarget->isThumb()) {
+ return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI,
+ CurDAG->getTargetConstant(0, MVT::i32));
+ } else {
+ SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->SelectNodeTo(N, ARM::ADDri, MVT::i32, Ops, 5);
+ }
+ }
+ case ISD::ADD: {
+ if (!Subtarget->isThumb())
+ break;
+ // Select add sp, c to tADDhirr.
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(Op.getOperand(0));
+ RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(Op.getOperand(1));
+ if (LHSR && LHSR->getReg() == ARM::SP) {
+ std::swap(N0, N1);
+ std::swap(LHSR, RHSR);
+ }
+ if (RHSR && RHSR->getReg() == ARM::SP) {
+ SDValue Val = SDValue(CurDAG->getTargetNode(ARM::tMOVlor2hir, dl,
+ Op.getValueType(), N0, N0), 0);
+ return CurDAG->SelectNodeTo(N, ARM::tADDhirr, Op.getValueType(), Val, N1);
+ }
+ break;
+ }
+ case ISD::MUL:
+ if (Subtarget->isThumb())
+ break;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned RHSV = C->getZExtValue();
+ if (!RHSV) break;
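+ // A multiplier of the form 2^n+1 selects to ADDrs (V + (V << n));
+ // 2^n-1 selects to RSBrs ((V << n) - V).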
+ if (isPowerOf2_32(RHSV-1)) { // 2^n+1?
+ SDValue V = Op.getOperand(0);
+ unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV-1));
+ SDValue Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getTargetConstant(ShImm, MVT::i32),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+ }
+ if (isPowerOf2_32(RHSV+1)) { // 2^n-1?
+ SDValue V = Op.getOperand(0);
+ unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV+1));
+ SDValue Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getTargetConstant(ShImm, MVT::i32),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+ }
+ }
+ break;
+ case ARMISD::FMRRD:
+ return CurDAG->getTargetNode(ARM::FMRRD, dl, MVT::i32, MVT::i32,
+ Op.getOperand(0), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32));
+ case ISD::UMUL_LOHI: {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getTargetNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ }
+ case ISD::SMUL_LOHI: {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getTargetNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ }
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ MVT LoadedVT = LD->getMemoryVT();
+ if (AM != ISD::UNINDEXED) {
+ SDValue Offset, AMOpc;
+ bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ unsigned Opcode = 0;
+ bool Match = false;
+ if (LoadedVT == MVT::i32 &&
+ SelectAddrMode2Offset(Op, LD->getOffset(), Offset, AMOpc)) {
+ Opcode = isPre ? ARM::LDR_PRE : ARM::LDR_POST;
+ Match = true;
+ } else if (LoadedVT == MVT::i16 &&
+ SelectAddrMode3Offset(Op, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = (LD->getExtensionType() == ISD::SEXTLOAD)
+ ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST)
+ : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST);
+ } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) {
+ if (LD->getExtensionType() == ISD::SEXTLOAD) {
+ if (SelectAddrMode3Offset(Op, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
+ }
+ } else {
+ if (SelectAddrMode2Offset(Op, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = isPre ? ARM::LDRB_PRE : ARM::LDRB_POST;
+ }
+ }
+ }
+
+ if (Match) {
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ return CurDAG->getTargetNode(Opcode, dl, MVT::i32, MVT::i32,
+ MVT::Other, Ops, 6);
+ }
+ }
+ // Other cases are autogenerated.
+ break;
+ }
+ case ARMISD::BRCOND: {
+ // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+ // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+ // Pattern complexity = 6 cost = 1 size = 0
+
+ // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+ // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc)
+ // Pattern complexity = 6 cost = 1 size = 0
+
+ unsigned Opc = Subtarget->isThumb() ? ARM::tBcc : ARM::Bcc;
+ SDValue Chain = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ SDValue N3 = Op.getOperand(3);
+ SDValue InFlag = Op.getOperand(4);
+ assert(N1.getOpcode() == ISD::BasicBlock);
+ assert(N2.getOpcode() == ISD::Constant);
+ assert(N3.getOpcode() == ISD::Register);
+
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
+ SDNode *ResNode = CurDAG->getTargetNode(Opc, dl, MVT::Other,
+ MVT::Flag, Ops, 5);
+ Chain = SDValue(ResNode, 0);
+ if (Op.getNode()->getNumValues() == 2) {
+ InFlag = SDValue(ResNode, 1);
+ ReplaceUses(SDValue(Op.getNode(), 1), InFlag);
+ }
+ ReplaceUses(SDValue(Op.getNode(), 0), SDValue(Chain.getNode(), Chain.getResNo()));
+ return NULL;
+ }
+ case ARMISD::CMOV: {
+ bool isThumb = Subtarget->isThumb();
+ MVT VT = Op.getValueType();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ SDValue N3 = Op.getOperand(3);
+ SDValue InFlag = Op.getOperand(4);
+ assert(N2.getOpcode() == ISD::Constant);
+ assert(N3.getOpcode() == ISD::Register);
+
+ // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+ // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+ // Pattern complexity = 18 cost = 1 size = 0
+ SDValue CPTmp0;
+ SDValue CPTmp1;
+ SDValue CPTmp2;
+ if (!isThumb && VT == MVT::i32 &&
+ SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) {
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(), ARM::MOVCCs, MVT::i32, Ops, 7);
+ }
+
+ // Pattern: (ARMcmov:i32 GPR:i32:$false,
+ // (imm:i32)<<P:Predicate_so_imm>><<X:so_imm_XFORM>>:$true,
+ // (imm:i32):$cc)
+ // Emits: (MOVCCi:i32 GPR:i32:$false,
+ // (so_imm_XFORM:i32 (imm:i32):$true), (imm:i32):$cc)
+ // Pattern complexity = 10 cost = 1 size = 0
+ if (VT == MVT::i32 &&
+ N3.getOpcode() == ISD::Constant &&
+ Predicate_so_imm(N3.getNode())) {
+ SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N1)->getZExtValue()),
+ MVT::i32);
+ Tmp1 = Transform_so_imm_XFORM(Tmp1.getNode());
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(), ARM::MOVCCi, MVT::i32, Ops, 5);
+ }
+
+ // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+ // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+ // Pattern complexity = 6 cost = 1 size = 0
+ //
+ // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+ // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+ // Pattern complexity = 6 cost = 11 size = 0
+ //
+ // Also FCPYScc and FCPYDcc.
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag };
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT()) {
+ default: assert(false && "Illegal conditional move type!");
+ break;
+ case MVT::i32:
+ Opc = isThumb ? ARM::tMOVCCr : ARM::MOVCCr;
+ break;
+ case MVT::f32:
+ Opc = ARM::FCPYScc;
+ break;
+ case MVT::f64:
+ Opc = ARM::FCPYDcc;
+ break;
+ }
+ return CurDAG->SelectNodeTo(Op.getNode(), Opc, VT, Ops, 5);
+ }
+ case ARMISD::CNEG: {
+ MVT VT = Op.getValueType();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ SDValue N3 = Op.getOperand(3);
+ SDValue InFlag = Op.getOperand(4);
+ assert(N2.getOpcode() == ISD::Constant);
+ assert(N3.getOpcode() == ISD::Register);
+
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag };
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT()) {
+ default: assert(false && "Illegal conditional move type!");
+ break;
+ case MVT::f32:
+ Opc = ARM::FNEGScc;
+ break;
+ case MVT::f64:
+ Opc = ARM::FNEGDcc;
+ break;
+ }
+ return CurDAG->SelectNodeTo(Op.getNode(), Opc, VT, Ops, 5);
+ }
+
+ case ISD::DECLARE: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1);
+ // FIXME: handle VLAs.
+ if (!FINode) {
+ ReplaceUses(Op.getValue(0), Chain);
+ return NULL;
+ }
+ if (N2.getOpcode() == ARMISD::PIC_ADD && isa<LoadSDNode>(N2.getOperand(0)))
+ N2 = N2.getOperand(0);
+ LoadSDNode *Ld = dyn_cast<LoadSDNode>(N2);
+ if (!Ld) {
+ ReplaceUses(Op.getValue(0), Chain);
+ return NULL;
+ }
+ SDValue BasePtr = Ld->getBasePtr();
+ assert(BasePtr.getOpcode() == ARMISD::Wrapper &&
+ isa<ConstantPoolSDNode>(BasePtr.getOperand(0)) &&
+ "llvm.dbg.variable should be a constantpool node");
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(BasePtr.getOperand(0));
+ GlobalValue *GV = 0;
+ if (CP->isMachineConstantPoolEntry()) {
+ ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)CP->getMachineCPVal();
+ GV = ACPV->getGV();
+ } else
+ GV = dyn_cast<GlobalValue>(CP->getConstVal());
+ if (!GV) {
+ ReplaceUses(Op.getValue(0), Chain);
+ return NULL;
+ }
+
+ SDValue Tmp1 = CurDAG->getTargetFrameIndex(FINode->getIndex(),
+ TLI.getPointerTy());
+ SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GV, TLI.getPointerTy());
+ SDValue Ops[] = { Tmp1, Tmp2, Chain };
+ return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
+ MVT::Other, Ops, 3);
+ }
+ }
+
+ return SelectCode(Op);
+}
+
+bool ARMDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+
+ SDValue Base, Offset, Opc;
+ if (!SelectAddrMode2(Op, Op, Base, Offset, Opc))
+ return true;
+
+ OutOps.push_back(Base);
+ OutOps.push_back(Offset);
+ OutOps.push_back(Opc);
+ return false;
+}
+
+/// createARMISelDag - This pass converts a legalized DAG into a
+/// ARM-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createARMISelDag(ARMTargetMachine &TM) {
+ return new ARMDAGToDAGISel(TM);
+}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
new file mode 100644
index 0000000..c0fd9dc
--- /dev/null
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -0,0 +1,2346 @@
+//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instruction.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+
+ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
+ : TargetLowering(TM), ARMPCLabelIndex(0) {
+ Subtarget = &TM.getSubtarget<ARMSubtarget>();
+
+ if (Subtarget->isTargetDarwin()) {
+ // Use VFP for Thumb libfuncs if available.
+ if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
+ // Single-precision floating-point arithmetic.
+ setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
+ setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
+ setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
+ setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
+
+ // Double-precision floating-point arithmetic.
+ setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
+ setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
+ setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
+ setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
+
+ // Single-precision comparisons.
+ setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
+ setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
+ setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
+ setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
+ setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
+ setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
+ setLibcallName(RTLIB::UO_F32, "__unordsf2vfp");
+ setLibcallName(RTLIB::O_F32, "__unordsf2vfp");
+
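+ // The *vfp comparison helpers return a nonzero value when the tested
+ // relation holds, so e.g. (a == b) becomes (__eqsf2vfp(a, b) != 0);
+ // hence SETNE against zero for the ordered comparisons below.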
+ setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ);
+
+ // Double-precision comparisons.
+ setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
+ setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
+ setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
+ setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
+ setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
+ setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
+ setLibcallName(RTLIB::UO_F64, "__unorddf2vfp");
+ setLibcallName(RTLIB::O_F64, "__unorddf2vfp");
+
+ setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE);
+ setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ);
+
+ // Floating-point to integer conversions.
+ // i64 conversions are done via library routines even when generating VFP
+ // instructions, so use the same ones.
+ setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
+ setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
+
+ // Conversions between floating types.
+ setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
+ setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp");
+
+ // Integer to floating-point conversions.
+ // i64 conversions are done via library routines even when generating VFP
+ // instructions, so use the same ones.
+ // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+ // e.g., __floatunsidf vs. __floatunssidfvfp.
+ setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
+ setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
+ setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
+ setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
+ }
+ }
+
+ // These libcalls are not available in 32-bit mode.
+ setLibcallName(RTLIB::SHL_I128, 0);
+ setLibcallName(RTLIB::SRL_I128, 0);
+ setLibcallName(RTLIB::SRA_I128, 0);
+
+ if (Subtarget->isThumb())
+ addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
+ else
+ addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
+ addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ }
+ computeRegisterProperties();
+
+ // ARM does not have f32 extending load.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+ // ARM does not have i1 sign extending load.
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+ // ARM supports all four flavors (pre/post-increment/decrement) of integer
+ // indexed load / store.
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, MVT::i1, Legal);
+ setIndexedLoadAction(im, MVT::i8, Legal);
+ setIndexedLoadAction(im, MVT::i16, Legal);
+ setIndexedLoadAction(im, MVT::i32, Legal);
+ setIndexedStoreAction(im, MVT::i1, Legal);
+ setIndexedStoreAction(im, MVT::i8, Legal);
+ setIndexedStoreAction(im, MVT::i16, Legal);
+ setIndexedStoreAction(im, MVT::i32, Legal);
+ }
+
+ // i64 operation support.
+ if (Subtarget->isThumb()) {
+ setOperationAction(ISD::MUL, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ } else {
+ setOperationAction(ISD::MUL, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ if (!Subtarget->hasV6Ops())
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+ }
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL, MVT::i64, Custom);
+ setOperationAction(ISD::SRA, MVT::i64, Custom);
+
+ // ARM does not have ROTL.
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ if (!Subtarget->hasV5TOps() || Subtarget->isThumb())
+ setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+
+ // Only ARMv6 has BSWAP.
+ if (!Subtarget->hasV6Ops())
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+ // These are expanded into libcalls.
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+ // Support label-based line numbers.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+
+ setOperationAction(ISD::RET, MVT::Other, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+
+ // VASTART is custom lowered; the rest use the default implementation.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+
+ if (!Subtarget->hasV6Ops()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ }
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb())
+ // Turn f64->i64 into FMRRD and i64->f64 into FMDRR iff the target supports VFP2.
+ setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+ setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+ // We have no native sin/cos/fmod/pow; fcopysign is custom lowered below
+ // when VFP2 is available.
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+ }
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+
+ // int <-> fp are custom expanded into bit_convert + ARMISD ops.
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ // ARMISD::FMRRD (no setTargetDAGCombine call is needed; target-specific
+ // nodes are always passed to PerformDAGCombine)
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
+
+ setStackPointerRegisterToSaveRestore(ARM::SP);
+ setSchedulingPreference(SchedulingForRegPressure);
+ setIfCvtBlockSizeLimit(Subtarget->isThumb() ? 0 : 10);
+ setIfCvtDupBlockSizeLimit(Subtarget->isThumb() ? 0 : 2);
+
+ maxStoresPerMemcpy = 1; // temporary - rewrite interface to use type
+ // Do not enable CodePlacementOpt for now: it currently runs after the
+ // ARMConstantIslandPass and messes up branch relaxation and placement
+ // of constant islands.
+ // benefitFromCodePlacementOpt = true;
+}
+
+const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return 0;
+ case ARMISD::Wrapper: return "ARMISD::Wrapper";
+ case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
+ case ARMISD::CALL: return "ARMISD::CALL";
+ case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
+ case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
+ case ARMISD::tCALL: return "ARMISD::tCALL";
+ case ARMISD::BRCOND: return "ARMISD::BRCOND";
+ case ARMISD::BR_JT: return "ARMISD::BR_JT";
+ case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
+ case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
+ case ARMISD::CMP: return "ARMISD::CMP";
+ case ARMISD::CMPNZ: return "ARMISD::CMPNZ";
+ case ARMISD::CMPFP: return "ARMISD::CMPFP";
+ case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
+ case ARMISD::FMSTAT: return "ARMISD::FMSTAT";
+ case ARMISD::CMOV: return "ARMISD::CMOV";
+ case ARMISD::CNEG: return "ARMISD::CNEG";
+
+ case ARMISD::FTOSI: return "ARMISD::FTOSI";
+ case ARMISD::FTOUI: return "ARMISD::FTOUI";
+ case ARMISD::SITOF: return "ARMISD::SITOF";
+ case ARMISD::UITOF: return "ARMISD::UITOF";
+
+ case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
+ case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
+ case ARMISD::RRX: return "ARMISD::RRX";
+
+ case ARMISD::FMRRD: return "ARMISD::FMRRD";
+ case ARMISD::FMDRR: return "ARMISD::FMDRR";
+
+ case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
+static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown condition code!");
+ case ISD::SETNE: return ARMCC::NE;
+ case ISD::SETEQ: return ARMCC::EQ;
+ case ISD::SETGT: return ARMCC::GT;
+ case ISD::SETGE: return ARMCC::GE;
+ case ISD::SETLT: return ARMCC::LT;
+ case ISD::SETLE: return ARMCC::LE;
+ case ISD::SETUGT: return ARMCC::HI;
+ case ISD::SETUGE: return ARMCC::HS;
+ case ISD::SETULT: return ARMCC::LO;
+ case ISD::SETULE: return ARMCC::LS;
+ }
+}
+
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. It
+/// returns true if the operands should be inverted to form the proper
+/// comparison.
+static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+ ARMCC::CondCodes &CondCode2) {
+ bool Invert = false;
+ CondCode2 = ARMCC::AL;
+ switch (CC) {
+ default: assert(0 && "Unknown FP condition!");
+ case ISD::SETEQ:
+ case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+ case ISD::SETGT:
+ case ISD::SETOGT: CondCode = ARMCC::GT; break;
+ case ISD::SETGE:
+ case ISD::SETOGE: CondCode = ARMCC::GE; break;
+ case ISD::SETOLT: CondCode = ARMCC::MI; break;
+ case ISD::SETOLE: CondCode = ARMCC::GT; Invert = true; break;
+ case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+ case ISD::SETO: CondCode = ARMCC::VC; break;
+ case ISD::SETUO: CondCode = ARMCC::VS; break;
+ case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+ case ISD::SETUGT: CondCode = ARMCC::HI; break;
+ case ISD::SETUGE: CondCode = ARMCC::PL; break;
+ case ISD::SETLT:
+ case ISD::SETULT: CondCode = ARMCC::LT; break;
+ case ISD::SETLE:
+ case ISD::SETULE: CondCode = ARMCC::LE; break;
+ case ISD::SETNE:
+ case ISD::SETUNE: CondCode = ARMCC::NE; break;
+ }
+ return Invert;
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//
+// The calling convention lowering operations are performed in this order:
+// LowerCALL (virt regs --> phys regs, virt regs --> stack)
+// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs)
+// LowerRET (virt regs --> phys regs)
+// LowerCALL (phys regs --> virt regs)
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMGenCallingConv.inc"
+
+// APCS f64 is in register pairs, possibly split to stack
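+// (e.g., if only R3 is left, the high word goes in R3 and the low word is
+// written to a 4-byte stack slot via the ARM::NoRegister entry below)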
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ static const unsigned HiRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+ static const unsigned LoRegList[] = { ARM::R1,
+ ARM::R2,
+ ARM::R3,
+ ARM::NoRegister };
+
+ unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 4);
+ if (Reg == 0)
+ return false; // we didn't handle it
+
+ unsigned i;
+ for (i = 0; i < 4; ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+ if (LoRegList[i] != ARM::NoRegister)
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ MVT::i32, LocInfo));
+ else
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(4, 4),
+ MVT::i32, LocInfo));
+ return true; // we handled it
+}
+
+// AAPCS f64 is in aligned register pairs
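+// (i.e., only R0:R1 or R2:R3, so a pair never starts at an odd register)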
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
+ static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
+
+ unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
+ if (Reg == 0)
+ return false; // we didn't handle it
+
+ unsigned i;
+ for (i = 0; i < 2; ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ MVT::i32, LocInfo));
+ return true; // we handled it
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
+ static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
+
+ unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
+ if (Reg == 0)
+ return false; // we didn't handle it
+
+ unsigned i;
+ for (i = 0; i < 2; ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ MVT::i32, LocInfo));
+ return true; // we handled it
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+ State);
+}
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers. This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. It returns an SDNode with the same number of values as the
+/// ISD::CALL.
+SDNode *ARMTargetLowering::
+LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG) {
+
+ DebugLoc dl = TheCall->getDebugLoc();
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ bool isVarArg = TheCall->isVarArg();
+ CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+ CCInfo.AnalyzeCallResult(TheCall, RetCC_ARM);
+
+ SmallVector<SDValue, 8> ResultVals;
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign VA = RVLocs[i];
+
+ SDValue Val;
+ if (VA.needsCustom()) {
+ // Handle f64 as custom.
+ SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+ InFlag);
+ Chain = Lo.getValue(1);
+ InFlag = Lo.getValue(2);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+ InFlag);
+ Chain = Hi.getValue(1);
+ InFlag = Hi.getValue(2);
+ Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi);
+ } else {
+ Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
+ InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ }
+
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val);
+ break;
+ }
+
+ ResultVals.push_back(Val);
+ }
+
+ // Merge everything together with a MERGE_VALUES node.
+ ResultVals.push_back(Chain);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
+ &ResultVals[0], ResultVals.size()).getNode();
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate from the address
+/// "Src" to the address "Dst", of size "Size". Alignment information is
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.
+/// Sometimes what we are copying is the end of a larger object, the part that
+/// does not fit in registers.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+ /*AlwaysInline=*/false, NULL, 0, NULL, 0);
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack.
+SDValue
+ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
+ const SDValue &StackPtr,
+ const CCValAssign &VA, SDValue Chain,
+ SDValue Arg, ISD::ArgFlagsTy Flags) {
+ DebugLoc dl = TheCall->getDebugLoc();
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ if (Flags.isByVal()) {
+ return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+ }
+ return DAG.getStore(Chain, dl, Arg, PtrOff,
+ PseudoSourceValue::getStack(), LocMemOffset);
+}
+
+/// LowerCALL - Lower an ISD::CALL node into a callseq_start <-
+/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
+/// nodes.
+SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ MVT RetVT = TheCall->getRetValType(0);
+ SDValue Chain = TheCall->getChain();
+ unsigned CC = TheCall->getCallingConv();
+ assert((CC == CallingConv::C ||
+ CC == CallingConv::Fast) && "unknown calling convention");
+ bool isVarArg = TheCall->isVarArg();
+ SDValue Callee = TheCall->getCallee();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeCallOperands(TheCall, CC_ARM);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+ SDValue StackPtr = DAG.getRegister(ARM::SP, MVT::i32);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+ // of tail call optimization, arguments are handled later.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+ i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = TheCall->getArg(realArgIdx);
+ ISD::ArgFlagsTy Flags = TheCall->getArgFlags(realArgIdx);
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ // f64 is passed in i32 pairs and must be combined
+ if (VA.needsCustom()) {
+ SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ if (VA.isRegLoc())
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(1)));
+ else {
+ assert(VA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
+ Chain, fmrrd.getValue(1),
+ Flags));
+ }
+ } else if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
+ Chain, Arg, Flags));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common; every
+ // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ bool isDirect = false;
+ bool isARMFunc = false;
+ bool isLocalARMFunc = false;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ GlobalValue *GV = G->getGlobal();
+ isDirect = true;
+ bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
+ GV->hasLinkOnceLinkage());
+ bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
+ getTargetMachine().getRelocationModel() != Reloc::Static;
+ isARMFunc = !Subtarget->isThumb() || isStub;
+ // ARM call to a local ARM function is predicable.
+ isLocalARMFunc = !Subtarget->isThumb() && !isExt;
+ // tBX takes a register source operand.
+ if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
+ ARMCP::CPStub, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ Callee = DAG.getLoad(getPointerTy(), dl,
+ DAG.getEntryNode(), CPAddr, NULL, 0);
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+ Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
+ getPointerTy(), Callee, PICLabel);
+ } else
+ Callee = DAG.getTargetGlobalAddress(GV, getPointerTy());
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ isDirect = true;
+ bool isStub = Subtarget->isTargetDarwin() &&
+ getTargetMachine().getRelocationModel() != Reloc::Static;
+ isARMFunc = !Subtarget->isThumb() || isStub;
+ // tBX takes a register source operand.
+ const char *Sym = S->getSymbol();
+ if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(Sym, ARMPCLabelIndex,
+ ARMCP::CPStub, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ Callee = DAG.getLoad(getPointerTy(), dl,
+ DAG.getEntryNode(), CPAddr, NULL, 0);
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+ Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
+ getPointerTy(), Callee, PICLabel);
+ } else
+ Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+ }
+
+ // FIXME: handle tail calls differently.
+ unsigned CallOpc;
+ if (Subtarget->isThumb()) {
+ if (!Subtarget->hasV5TOps() && (!isDirect || isARMFunc))
+ CallOpc = ARMISD::CALL_NOLINK;
+ else
+ CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
+ } else {
+ CallOpc = (isDirect || Subtarget->hasV5TOps())
+ ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL)
+ : ARMISD::CALL_NOLINK;
+ }
+ if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb()) {
+ // implicit def LR - LR mustn't be allocated as GPR:$dst of CALL_NOLINK
+ Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+ // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag),
+ &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ if (RetVT != MVT::Other)
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
+ Op.getResNo());
+}
+
+SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
+ // The chain is always operand #0
+ SDValue Chain = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CCValAssign - represents the assignment of the return value to a location.
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+
+ // CCState - Info about the registers and stack slots.
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+
+ // Analyze return values of ISD::RET.
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_ARM);
+
+ // If this is the first return lowered for this function, add
+ // the regs to the liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0, realRVLocIdx = 0;
+ i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // ISD::RET => ret chain, (regnum1,val1), ...
+ // so realRVLocIdx*2+1 indexes only the regnums.
+ SDValue Arg = Op.getOperand(realRVLocIdx*2+1);
+
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
+ // available.
+ if (VA.needsCustom()) {
+ SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
+ Flag = Chain.getValue(1);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
+ Flag);
+ } else
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+
+ // Guarantee that all emitted copies are stuck together by the flag
+ // operand, so nothing can be scheduled in between them.
+ Flag = Chain.getValue(1);
+ }
+
+ SDValue result;
+ if (Flag.getNode())
+ result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+ else // Return Void
+ result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain);
+
+ return result;
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing modes. These wrapped nodes will be selected
+// into MOVi.
+static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+ MVT PtrVT = Op.getValueType();
+ // FIXME there is no actual debug info here
+ DebugLoc dl = Op.getDebugLoc();
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ SDValue Res;
+ if (CP->isMachineConstantPoolEntry())
+ Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+ CP->getAlignment());
+ else
+ Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+ CP->getAlignment());
+ return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+SDValue
+ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) {
+ DebugLoc dl = GA->getDebugLoc();
+ MVT PtrVT = getPointerTy();
+ unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+ ARMConstantPoolValue *CPV =
+ new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
+ PCAdj, "tlsgd", true);
+ SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
+ Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, NULL, 0);
+ SDValue Chain = Argument.getValue(1);
+
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+ Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
+
+ // call __tls_get_addr.
+ ArgListTy Args;
+ ArgListEntry Entry;
+ Entry.Node = Argument;
+ Entry.Ty = (const Type *) Type::Int32Ty;
+ Args.push_back(Entry);
+ // FIXME: is there useful debug info available here?
+ std::pair<SDValue, SDValue> CallResult =
+ LowerCallTo(Chain, (const Type *) Type::Int32Ty, false, false, false, false,
+ CallingConv::C, false,
+ DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
+ return CallResult.first;
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or
+// "local exec" model.
+SDValue
+ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) {
+ GlobalValue *GV = GA->getGlobal();
+ DebugLoc dl = GA->getDebugLoc();
+ SDValue Offset;
+ SDValue Chain = DAG.getEntryNode();
+ MVT PtrVT = getPointerTy();
+ // Get the Thread Pointer
+ SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+
+ if (GV->isDeclaration()) {
+ // initial exec model
+ unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+ ARMConstantPoolValue *CPV =
+ new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
+ PCAdj, "gottpoff", true);
+ Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+ Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
+ Chain = Offset.getValue(1);
+
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+ Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
+
+ Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
+ } else {
+ // local exec model
+ ARMConstantPoolValue *CPV =
+ new ARMConstantPoolValue(GV, ARMCP::CPValue, "tpoff");
+ Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+ Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
+ }
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
+ // TODO: implement the "local dynamic" model
+ assert(Subtarget->isTargetELF() &&
+ "TLS not implemented for non-ELF targets");
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ // If the relocation model is PIC, use the "general dynamic" TLS model;
+ // otherwise LowerToTLSExecModels selects "initial exec" or "local exec".
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
+ return LowerToTLSGeneralDynamicModel(GA, DAG);
+ else
+ return LowerToTLSExecModels(GA, DAG);
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
+ SelectionDAG &DAG) {
+ MVT PtrVT = getPointerTy();
+ DebugLoc dl = Op.getDebugLoc();
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+ if (RelocM == Reloc::PIC_) {
+ bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
+ ARMConstantPoolValue *CPV =
+ new ARMConstantPoolValue(GV, ARMCP::CPValue, UseGOTOFF ? "GOTOFF":"GOT");
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ CPAddr, NULL, 0);
+ SDValue Chain = Result.getValue(1);
+ SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
+ if (!UseGOTOFF)
+ Result = DAG.getLoad(PtrVT, dl, Chain, Result, NULL, 0);
+ return Result;
+ } else {
+ SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ }
+}
+
+/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol
+/// even in non-static mode.
+static bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) {
+ // If symbol visibility is hidden, the extra load is not needed if
+ // the symbol is definitely defined in the current translation unit.
+ bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
+ if (GV->hasHiddenVisibility() && (!isDecl && !GV->hasCommonLinkage()))
+ return false;
+ return RelocM != Reloc::Static && (isDecl || GV->isWeakForLinker());
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
+ SelectionDAG &DAG) {
+ MVT PtrVT = getPointerTy();
+ DebugLoc dl = Op.getDebugLoc();
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+ bool IsIndirect = GVIsIndirectSymbol(GV, RelocM);
+ SDValue CPAddr;
+ if (RelocM == Reloc::Static)
+ CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+ else {
+ unsigned PCAdj = (RelocM != Reloc::PIC_)
+ ? 0 : (Subtarget->isThumb() ? 4 : 8);
+ ARMCP::ARMCPKind Kind = IsIndirect ? ARMCP::CPNonLazyPtr
+ : ARMCP::CPValue;
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
+ Kind, PCAdj);
+ CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ }
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+
+ SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ SDValue Chain = Result.getValue(1);
+
+ if (RelocM == Reloc::PIC_) {
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+ Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+ }
+ if (IsIndirect)
+ Result = DAG.getLoad(PtrVT, dl, Chain, Result, NULL, 0);
+
+ return Result;
+}
+
+SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
+ SelectionDAG &DAG){
+ assert(Subtarget->isTargetELF() &&
+ "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
+ MVT PtrVT = getPointerTy();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue("_GLOBAL_OFFSET_TABLE_",
+ ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+ return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+}
+
+SDValue
+ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ DebugLoc dl = Op.getDebugLoc();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::arm_thread_pointer:
+ return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+ case Intrinsic::eh_sjlj_setjmp:
+ SDValue Res = DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32,
+ Op.getOperand(1));
+ return Res;
+ }
+}
+
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+ unsigned VarArgsFrameIndex) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ DebugLoc dl = Op.getDebugLoc();
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
+}
+
+SDValue
+ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ SDValue Root = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ unsigned CC = MF.getFunction()->getCallingConv();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_ARM);
+
+ SmallVector<SDValue, 16> ArgValues;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ // Arguments stored in registers.
+ if (VA.isRegLoc()) {
+ MVT RegVT = VA.getLocVT();
+ TargetRegisterClass *RC;
+ if (AFI->isThumbFunction())
+ RC = ARM::tGPRRegisterClass;
+ else
+ RC = ARM::GPRRegisterClass;
+
+ if (RegVT == MVT::f64) {
+ // f64 is passed in pairs of GPRs and must be combined.
+ RegVT = MVT::i32;
+ } else if (!((RegVT == MVT::i32) || (RegVT == MVT::f32)))
+ assert(0 && "RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+ // Transform the arguments stored in physical registers into virtual ones.
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+
+ // f64 is passed in i32 pairs and must be combined.
+ if (VA.needsCustom()) {
+ SDValue ArgValue2;
+
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ if (VA.isMemLoc()) {
+ // Must be APCS to be split like this.
+ unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
+ int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset());
+
+ // Create load node to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0);
+ } else {
+ Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+ }
+
+ ArgValue = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64,
+ ArgValue, ArgValue2);
+ }
+
+ // If this is an 8 or 16-bit value, it is really passed promoted
+ // to 32 bits. Insert an assert[sz]ext to capture this, then
+ // truncate to the right size.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::BCvt:
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+ break;
+ case CCValAssign::SExt:
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ break;
+ case CCValAssign::ZExt:
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ break;
+ }
+
+ ArgValues.push_back(ArgValue);
+
+ } else { // VA.isRegLoc()
+
+ // sanity check
+ assert(VA.isMemLoc());
+ assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
+
+ unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
+ int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset());
+
+ // Create load nodes to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ ArgValues.push_back(DAG.getLoad(VA.getValVT(), dl, Root, FIN, NULL, 0));
+ }
+ }
+
+ // varargs
+ if (isVarArg) {
+ static const unsigned GPRArgRegs[] = {
+ ARM::R0, ARM::R1, ARM::R2, ARM::R3
+ };
+
+ unsigned NumGPRs = CCInfo.getFirstUnallocated
+ (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
+
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ unsigned VARegSize = (4 - NumGPRs) * 4;
+ unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
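+ // e.g., with one GPR already used for fixed arguments, R1-R3 (12 bytes)
+ // are saved, rounded up to the stack alignment.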
+ unsigned ArgOffset = 0;
+ if (VARegSaveSize) {
+ // If this function is vararg, store any remaining integer argument regs
+ // to their spots on the stack so that they may be loaded by dereferencing
+ // the result of va_next.
+ AFI->setVarArgsRegSaveSize(VARegSaveSize);
+ ArgOffset = CCInfo.getNextStackOffset();
+ VarArgsFrameIndex = MFI->CreateFixedObject(VARegSaveSize, ArgOffset +
+ VARegSaveSize - VARegSize);
+ SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+
+ SmallVector<SDValue, 4> MemOps;
+ for (; NumGPRs < 4; ++NumGPRs) {
+ TargetRegisterClass *RC;
+ if (AFI->isThumbFunction())
+ RC = ARM::tGPRRegisterClass;
+ else
+ RC = ARM::GPRRegisterClass;
+
+ unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+ DAG.getConstant(4, getPointerTy()));
+ }
+ if (!MemOps.empty())
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+ } else
+ // This will point to the next argument passed via stack.
+ VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset);
+ }
+
+ ArgValues.push_back(Root);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+}
+
+/// isFloatingPointZero - Return true if this is +0.0.
+static bool isFloatingPointZero(SDValue Op) {
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+ return CFP->getValueAPF().isPosZero();
+ else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+ // Maybe this has already been legalized into the constant pool?
+ if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
+ SDValue WrapperOp = Op.getOperand(1).getOperand(0);
+ if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+ return CFP->getValueAPF().isPosZero();
+ }
+ }
+ return false;
+}
+
+static bool isLegalCmpImmediate(unsigned C, bool isThumb) {
+ return ( isThumb && (C & ~255U) == 0) ||
+ (!isThumb && ARM_AM::getSOImmVal(C) != -1);
+}
+
+/// Returns an appropriate ARM CMP (cmp) and corresponding condition code for
+/// the given operands.
+static SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDValue &ARMCC, SelectionDAG &DAG, bool isThumb,
+ DebugLoc dl) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+ unsigned C = RHSC->getZExtValue();
+ if (!isLegalCmpImmediate(C, isThumb)) {
+ // Constant does not fit, try adjusting it by one?
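+ // e.g., 0x101 is not a valid rotated 8-bit immediate in ARM mode, but
+ // (x < 0x101) can be rewritten as (x <= 0x100), which is encodable.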
+ switch (CC) {
+ default: break;
+ case ISD::SETLT:
+ case ISD::SETGE:
+ if (isLegalCmpImmediate(C-1, isThumb)) {
+ CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+ RHS = DAG.getConstant(C-1, MVT::i32);
+ }
+ break;
+ case ISD::SETULT:
+ case ISD::SETUGE:
+ if (C > 0 && isLegalCmpImmediate(C-1, isThumb)) {
+ CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+ RHS = DAG.getConstant(C-1, MVT::i32);
+ }
+ break;
+ case ISD::SETLE:
+ case ISD::SETGT:
+ if (isLegalCmpImmediate(C+1, isThumb)) {
+ CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+ RHS = DAG.getConstant(C+1, MVT::i32);
+ }
+ break;
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb)) {
+ CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+ RHS = DAG.getConstant(C+1, MVT::i32);
+ }
+ break;
+ }
+ }
+ }
+
+ ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+ ARMISD::NodeType CompareType;
+ switch (CondCode) {
+ default:
+ CompareType = ARMISD::CMP;
+ break;
+ case ARMCC::EQ:
+ case ARMCC::NE:
+ case ARMCC::MI:
+ case ARMCC::PL:
+ // Uses only the N and Z flags.
+ CompareType = ARMISD::CMPNZ;
+ break;
+ }
+ ARMCC = DAG.getConstant(CondCode, MVT::i32);
+ return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS);
+}
+
+/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
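+/// FMSTAT copies the VFP status flags into CPSR so that ordinary
+/// predicated ARM instructions can test the comparison result.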
+static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue Cmp;
+ if (!isFloatingPointZero(RHS))
+ Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS);
+ else
+ Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Flag, LHS);
+ return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Flag, Cmp);
+}
+
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ MVT VT = Op.getValueType();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (LHS.getValueType() == MVT::i32) {
+ SDValue ARMCC;
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb(), dl);
+ return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp);
+ }
+
+ ARMCC::CondCodes CondCode, CondCode2;
+ if (FPCCToARMCC(CC, CondCode, CondCode2))
+ std::swap(TrueVal, FalseVal);
+
+ SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+ ARMCC, CCR, Cmp);
+ if (CondCode2 != ARMCC::AL) {
+ SDValue ARMCC2 = DAG.getConstant(CondCode2, MVT::i32);
+ // FIXME: Needs another CMP because flag can have but one use.
+ SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+ Result = DAG.getNode(ARMISD::CMOV, dl, VT,
+ Result, TrueVal, ARMCC2, CCR, Cmp2);
+ }
+ return Result;
+}
+
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (LHS.getValueType() == MVT::i32) {
+ SDValue ARMCC;
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb(), dl);
+ return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+ Chain, Dest, ARMCC, CCR,Cmp);
+ }
+
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+ ARMCC::CondCodes CondCode, CondCode2;
+ if (FPCCToARMCC(CC, CondCode, CondCode2))
+ // Swap the LHS/RHS of the comparison if needed.
+ std::swap(LHS, RHS);
+
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Ops[] = { Chain, Dest, ARMCC, CCR, Cmp };
+ SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
+ if (CondCode2 != ARMCC::AL) {
+ ARMCC = DAG.getConstant(CondCode2, MVT::i32);
+ SDValue Ops[] = { Res, Dest, ARMCC, CCR, Res.getValue(1) };
+ Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
+ }
+ return Res;
+}
+
+SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Table = Op.getOperand(1);
+ SDValue Index = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+
+ MVT PTy = getPointerTy();
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+ ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+ SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
+ SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
+ Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
+ Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
+ SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+ bool isPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+ Addr = DAG.getLoad(isPIC ? (MVT)MVT::i32 : PTy, dl,
+ Chain, Addr, NULL, 0);
+ Chain = Addr.getValue(1);
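+ // In PIC mode the table entries are offsets from the table base, so the
+ // loaded value must be added back to the table address.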
+ if (isPIC)
+ Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
+ return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+}
+
+static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Opc =
+ Op.getOpcode() == ISD::FP_TO_SINT ? ARMISD::FTOSI : ARMISD::FTOUI;
+ Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+}
+
+static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Opc =
+ Op.getOpcode() == ISD::SINT_TO_FP ? ARMISD::SITOF : ARMISD::UITOF;
+
+ Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Op.getOperand(0));
+ return DAG.getNode(Opc, dl, VT, Op);
+}
+
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+ // Implement fcopysign with a fabs and a conditional fneg.
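+ // i.e., copysign(x, y) = (y < 0.0) ? -fabs(x) : fabs(x); the CNEG below
+ // negates the absolute value when the compare sets the LT flag.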
+ SDValue Tmp0 = Op.getOperand(0);
+ SDValue Tmp1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+ MVT SrcVT = Tmp1.getValueType();
+ SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
+ SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl);
+ SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp);
+}
+
+SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned FrameReg = (Subtarget->isThumb() || Subtarget->useThumbBacktraces())
+ ? ARM::R7 : ARM::R11;
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
+ return FrameAddr;
+}
+
+SDValue
+ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff){
+ // Do repeated 4-byte loads and stores. To be improved.
+ // This requires 4-byte alignment.
+ if ((Align & 3) != 0)
+ return SDValue();
+ // This requires the copy size to be a constant, preferably
+ // within a subtarget-specific limit.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
+ return SDValue();
+
+ unsigned BytesLeft = SizeVal & 3;
+ unsigned NumMemOps = SizeVal >> 2;
+ unsigned EmittedNumMemOps = 0;
+ MVT VT = MVT::i32;
+ unsigned VTSize = 4;
+ unsigned i = 0;
+ const unsigned MAX_LOADS_IN_LDM = 6;
+ SDValue TFOps[MAX_LOADS_IN_LDM];
+ SDValue Loads[MAX_LOADS_IN_LDM];
+ uint64_t SrcOff = 0, DstOff = 0;
+
+ // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+ // same number of stores. The loads and stores will get combined into
+ // ldm/stm later on.
+ while (EmittedNumMemOps < NumMemOps) {
+ for (i = 0;
+ i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+ Loads[i] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+ DAG.getConstant(SrcOff, MVT::i32)),
+ SrcSV, SrcSVOff + SrcOff);
+ TFOps[i] = Loads[i].getValue(1);
+ SrcOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+ for (i = 0;
+ i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+ TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+ DAG.getConstant(DstOff, MVT::i32)),
+ DstSV, DstSVOff + DstOff);
+ DstOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+ EmittedNumMemOps += i;
+ }
+
+ if (BytesLeft == 0)
+ return Chain;
+
+ // Issue loads / stores for the trailing (1 - 3) bytes.
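+ // (e.g., 3 trailing bytes become one i16 copy followed by one i8 copy)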
+ unsigned BytesLeftSave = BytesLeft;
+ i = 0;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ Loads[i] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+ DAG.getConstant(SrcOff, MVT::i32)),
+ SrcSV, SrcSVOff + SrcOff);
+ TFOps[i] = Loads[i].getValue(1);
+ ++i;
+ SrcOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+ i = 0;
+ BytesLeft = BytesLeftSave;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+ DAG.getConstant(DstOff, MVT::i32)),
+ DstSV, DstSVOff + DstOff);
+ ++i;
+ DstOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+}
+
+static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op = N->getOperand(0);
+ DebugLoc dl = N->getDebugLoc();
+ if (N->getValueType(0) == MVT::f64) {
+ // Turn i64->f64 into FMDRR.
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+ DAG.getConstant(0, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+ DAG.getConstant(1, MVT::i32));
+ return DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi);
+ }
+
+ // Turn f64->i64 into FMRRD.
+ SDValue Cvt = DAG.getNode(ARMISD::FMRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
+
+ // Merge the pieces into a single i64 value.
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
+}
+
+static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
+ assert(N->getValueType(0) == MVT::i64 &&
+ (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ "Unknown shift to lower!");
+
+ // We only lower SRA/SRL by 1 here; all others use generic lowering.
+ if (!isa<ConstantSDNode>(N->getOperand(1)) ||
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
+ return SDValue();
+
+ // If we are in thumb mode, we don't have RRX.
+ if (ST->isThumb()) return SDValue();
+
+ // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
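+ // e.g., for x >> 1: the high word is shifted right by one with its low
+ // bit captured in the carry flag, then RRX rotates that carry into the
+ // top bit of the low word.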
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+ DAG.getConstant(0, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+ DAG.getConstant(1, MVT::i32));
+
+ // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
+ // captures the result into a carry flag.
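+ // For example, for (srl i64 %x, 1): Hi' = %x.hi >> 1 with the shifted-out
+ // bit saved in the carry, and Lo' = RRX(%x.lo) = (carry << 31) | (%x.lo >> 1).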
+ unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
+ Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Flag), &Hi, 1);
+
+ // The low part is an ARMISD::RRX operand, which shifts the carry in.
+ Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
+
+ // Merge the pieces into a single i64 value.
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Don't know how to custom lower this!"); abort();
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress:
+ return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
+ LowerGlobalAddressELF(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::CALL: return LowerCALL(Op, DAG);
+ case ISD::RET: return LowerRET(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG, Subtarget);
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG, Subtarget);
+ case ISD::BR_JT: return LowerBR_JT(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG, VarArgsFrameIndex);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
+ case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
+ case ISD::RETURNADDR: break;
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG);
+ case ISD::SRL:
+ case ISD::SRA: return ExpandSRx(Op.getNode(), DAG, Subtarget);
+ }
+ return SDValue();
+}
+
+/// ReplaceNodeResults - Replace the results of a node whose result type is
+/// illegal with new values built out of custom code.
+void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) {
+ switch (N->getOpcode()) {
+ default:
+ assert(0 && "Don't know how to custom expand this!");
+ return;
+ case ISD::BIT_CONVERT:
+ Results.push_back(ExpandBIT_CONVERT(N, DAG));
+ return;
+ case ISD::SRL:
+ case ISD::SRA: {
+ SDValue Res = ExpandSRx(N, DAG, Subtarget);
+ if (Res.getNode())
+ Results.push_back(Res);
+ return;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// ARM Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc dl = MI->getDebugLoc();
+ switch (MI->getOpcode()) {
+ default: assert(false && "Unexpected instr type to insert");
+ case ARM::tMOVCCr: {
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
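+ // The resulting CFG is a diamond:
+ //
+ //     thisMBB
+ //      |    \
+ //      |  copy0MBB
+ //      |    /
+ //     sinkMBB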
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
+ .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by first adding all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ for (MachineBasicBlock::succ_iterator i = BB->succ_begin(),
+ e = BB->succ_end(); i != e; ++i)
+ sinkMBB->addSuccessor(*i);
+ // Next, remove all successors of the current block, and add the true
+ // and fallthrough blocks as its successors.
+ while (!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, dl, TII->get(ARM::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// ARM Optimization Hooks
+//===----------------------------------------------------------------------===//
+
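+/// combineSelectAndUse - Try to fold the node N (an ADD, or a SUB whose
+/// second operand is the select) with a single-use (select cc, 0, c) operand
+/// Slct into (select cc, OtherOp, (binop OtherOp, c)), inverting the
+/// condition when the zero is on the false side.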
+static
+SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT VT = N->getValueType(0);
+ unsigned Opc = N->getOpcode();
+ bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC;
+ SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1);
+ SDValue RHS = isSlctCC ? Slct.getOperand(3) : Slct.getOperand(2);
+ ISD::CondCode CC = ISD::SETCC_INVALID;
+
+ if (isSlctCC) {
+ CC = cast<CondCodeSDNode>(Slct.getOperand(4))->get();
+ } else {
+ SDValue CCOp = Slct.getOperand(0);
+ if (CCOp.getOpcode() == ISD::SETCC)
+ CC = cast<CondCodeSDNode>(CCOp.getOperand(2))->get();
+ }
+
+ bool DoXform = false;
+ bool InvCC = false;
+ assert ((Opc == ISD::ADD || (Opc == ISD::SUB && Slct == N->getOperand(1))) &&
+ "Bad input!");
+
+ if (LHS.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(LHS)->isNullValue()) {
+ DoXform = true;
+ } else if (CC != ISD::SETCC_INVALID &&
+ RHS.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(RHS)->isNullValue()) {
+ std::swap(LHS, RHS);
+ SDValue Op0 = Slct.getOperand(0);
+ MVT OpVT = isSlctCC ? Op0.getValueType() :
+ Op0.getOperand(0).getValueType();
+ bool isInt = OpVT.isInteger();
+ CC = ISD::getSetCCInverse(CC, isInt);
+
+ if (!TLI.isCondCodeLegal(CC, OpVT))
+ return SDValue(); // Inverse operator isn't legal.
+
+ DoXform = true;
+ InvCC = true;
+ }
+
+ if (DoXform) {
+ SDValue Result = DAG.getNode(Opc, RHS.getDebugLoc(), VT, OtherOp, RHS);
+ if (isSlctCC)
+ return DAG.getSelectCC(N->getDebugLoc(), OtherOp, Result,
+ Slct.getOperand(0), Slct.getOperand(1), CC);
+ SDValue CCOp = Slct.getOperand(0);
+ if (InvCC)
+ CCOp = DAG.getSetCC(Slct.getDebugLoc(), CCOp.getValueType(),
+ CCOp.getOperand(0), CCOp.getOperand(1), CC);
+ return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
+ CCOp, OtherOp, Result);
+ }
+ return SDValue();
+}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+static SDValue PerformADDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // added by evan in r37685 with no testcase.
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+
+ // fold (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
+ if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
+ SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
+ if (Result.getNode()) return Result;
+ }
+ if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) {
+ SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
+ if (Result.getNode()) return Result;
+ }
+
+ return SDValue();
+}
+
+/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
+static SDValue PerformSUBCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // added by evan in r37685 with no testcase.
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+
+ // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
+ if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) {
+ SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
+ if (Result.getNode()) return Result;
+ }
+
+ return SDValue();
+}
+
+
+/// PerformFMRRDCombine - Target-specific dag combine xforms for ARMISD::FMRRD.
+static SDValue PerformFMRRDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // fmrrd(fmdrr x, y) -> x,y
+ SDValue InDouble = N->getOperand(0);
+ if (InDouble.getOpcode() == ARMISD::FMDRR)
+ return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+ return SDValue();
+}
+
+SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD: return PerformADDCombine(N, DCI);
+ case ISD::SUB: return PerformSUBCombine(N, DCI);
+ case ARMISD::FMRRD: return PerformFMRRDCombine(N, DCI);
+ }
+
+ return SDValue();
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+static bool isLegalAddressImmediate(int64_t V, MVT VT,
+ const ARMSubtarget *Subtarget) {
+ if (V == 0)
+ return true;
+
+ if (!VT.isSimple())
+ return false;
+
+ if (Subtarget->isThumb()) {
+ if (V < 0)
+ return false;
+
+ unsigned Scale = 1;
+ switch (VT.getSimpleVT()) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ // Scale == 1.
+ break;
+ case MVT::i16:
+ Scale = 2;
+ break;
+ case MVT::i32:
+ Scale = 4;
+ break;
+ }
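+ // Thumb load / store offsets are an unsigned 5-bit immediate scaled by the
+ // access size, e.g. 0, 4, ..., 124 for i32.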
+
+ if ((V & (Scale - 1)) != 0)
+ return false;
+ V /= Scale;
+ return V == (V & ((1LL << 5) - 1));
+ }
+
+ if (V < 0)
+ V = - V;
+ switch (VT.getSimpleVT()) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i32:
+ // +- imm12
+ return V == (V & ((1LL << 12) - 1));
+ case MVT::i16:
+ // +- imm8
+ return V == (V & ((1LL << 8) - 1));
+ case MVT::f32:
+ case MVT::f64:
+ if (!Subtarget->hasVFP2())
+ return false;
+ if ((V & 3) != 0)
+ return false;
+ V >>= 2;
+ return V == (V & ((1LL << 8) - 1));
+ }
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const {
+ MVT VT = getValueType(Ty, true);
+ if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
+ return false;
+
+ // Can never fold addr of global into load/store.
+ if (AM.BaseGV)
+ return false;
+
+ switch (AM.Scale) {
+ case 0: // no scale reg, must be "r+i" or "r", or "i".
+ break;
+ case 1:
+ if (Subtarget->isThumb())
+ return false;
+ // FALL THROUGH.
+ default:
+ // ARM doesn't support any R+R*scale+imm addr modes.
+ if (AM.BaseOffs)
+ return false;
+
+ if (!VT.isSimple())
+ return false;
+
+ int Scale = AM.Scale;
+ switch (VT.getSimpleVT()) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i32:
+ case MVT::i64:
+ // This assumes i64 is legalized to a pair of i32. If not (i.e. if
+ // ldrd / strd are used), then its addressing mode is the same as for i16.
+ // r + r
+ if (Scale < 0) Scale = -Scale;
+ if (Scale == 1)
+ return true;
+ // r + r << imm
+ return isPowerOf2_32(Scale & ~1);
+ case MVT::i16:
+ // r + r
+ if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+ return true;
+ return false;
+
+ case MVT::isVoid:
+ // Note that we allow "void" uses (basically, uses that aren't loads or
+ // stores), because ARM allows folding a scale into many arithmetic
+ // operations. This should be made more precise and revisited later.
+
+ // Allow r << imm, but the imm has to be a multiple of two.
+ if (AM.Scale & 1) return false;
+ return isPowerOf2_32(AM.Scale);
+ }
+ break;
+ }
+ return true;
+}
+
+static bool getIndexedAddressParts(SDNode *Ptr, MVT VT,
+ bool isSEXTLoad, SDValue &Base,
+ SDValue &Offset, bool &isInc,
+ SelectionDAG &DAG) {
+ if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+ return false;
+
+ if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
+ // AddressingMode 3
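+ // (base register +/- an 8-bit immediate, or +/- a register offset)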
+ Base = Ptr->getOperand(0);
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if (RHSC < 0 && RHSC > -256) {
+ isInc = false;
+ Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+ return true;
+ }
+ }
+ isInc = (Ptr->getOpcode() == ISD::ADD);
+ Offset = Ptr->getOperand(1);
+ return true;
+ } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
+ // AddressingMode 2
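+ // (base register +/- a 12-bit immediate, or +/- a possibly shifted
+ // register offset)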
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if (RHSC < 0 && RHSC > -0x1000) {
+ isInc = false;
+ Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+ Base = Ptr->getOperand(0);
+ return true;
+ }
+ }
+
+ if (Ptr->getOpcode() == ISD::ADD) {
+ isInc = true;
+ ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0));
+ if (ShOpcVal != ARM_AM::no_shift) {
+ Base = Ptr->getOperand(1);
+ Offset = Ptr->getOperand(0);
+ } else {
+ Base = Ptr->getOperand(0);
+ Offset = Ptr->getOperand(1);
+ }
+ return true;
+ }
+
+ isInc = (Ptr->getOpcode() == ISD::ADD);
+ Base = Ptr->getOperand(0);
+ Offset = Ptr->getOperand(1);
+ return true;
+ }
+
+ // FIXME: Use FLDM / FSTM to emulate indexed FP load / store.
+ return false;
+}
+
+/// getPreIndexedAddressParts - Returns true, and sets the base pointer,
+/// offset, and addressing mode by reference, if the node's address can be
+/// legally represented as a pre-indexed load / store address.
+bool
+ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ if (Subtarget->isThumb())
+ return false;
+
+ MVT VT;
+ SDValue Ptr;
+ bool isSEXTLoad = false;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ Ptr = LD->getBasePtr();
+ VT = LD->getMemoryVT();
+ isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ Ptr = ST->getBasePtr();
+ VT = ST->getMemoryVT();
+ } else
+ return false;
+
+ bool isInc;
+ bool isLegal = getIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, Offset,
+ isInc, DAG);
+ if (isLegal) {
+ AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
+ return true;
+ }
+ return false;
+}
+
+/// getPostIndexedAddressParts - Returns true, and sets the base pointer,
+/// offset, and addressing mode by reference, if this node can be combined
+/// with a load / store to form a post-indexed load / store.
+bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+ SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ if (Subtarget->isThumb())
+ return false;
+
+ MVT VT;
+ SDValue Ptr;
+ bool isSEXTLoad = false;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ } else
+ return false;
+
+ bool isInc;
+ bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+ isInc, DAG);
+ if (isLegal) {
+ AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+ return true;
+ }
+ return false;
+}
+
+void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+ switch (Op.getOpcode()) {
+ default: break;
+ case ARMISD::CMOV: {
+ // Bits are known zero/one if known on the LHS and RHS.
+ DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
+ if (KnownZero == 0 && KnownOne == 0) return;
+
+ APInt KnownZeroRHS, KnownOneRHS;
+ DAG.ComputeMaskedBits(Op.getOperand(1), Mask,
+ KnownZeroRHS, KnownOneRHS, Depth+1);
+ KnownZero &= KnownZeroRHS;
+ KnownOne &= KnownOneRHS;
+ return;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// ARM Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+ARMTargetLowering::ConstraintType
+ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'l': return C_RegisterClass;
+ case 'w': return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ // GCC ARM Constraint Letters
+ switch (Constraint[0]) {
+ case 'l':
+ if (Subtarget->isThumb())
+ return std::make_pair(0U, ARM::tGPRRegisterClass);
+ else
+ return std::make_pair(0U, ARM::GPRRegisterClass);
+ case 'r':
+ return std::make_pair(0U, ARM::GPRRegisterClass);
+ case 'w':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, ARM::SPRRegisterClass);
+ if (VT == MVT::f64)
+ return std::make_pair(0U, ARM::DPRRegisterClass);
+ break;
+ }
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+std::vector<unsigned> ARMTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() != 1)
+ return std::vector<unsigned>();
+
+ switch (Constraint[0]) { // GCC ARM Constraint Letters
+ default: break;
+ case 'l':
+ return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+ ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+ 0);
+ case 'r':
+ return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+ ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+ ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+ ARM::R12, ARM::LR, 0);
+ case 'w':
+ if (VT == MVT::f32)
+ return make_vector<unsigned>(ARM::S0, ARM::S1, ARM::S2, ARM::S3,
+ ARM::S4, ARM::S5, ARM::S6, ARM::S7,
+ ARM::S8, ARM::S9, ARM::S10, ARM::S11,
+ ARM::S12,ARM::S13,ARM::S14,ARM::S15,
+ ARM::S16,ARM::S17,ARM::S18,ARM::S19,
+ ARM::S20,ARM::S21,ARM::S22,ARM::S23,
+ ARM::S24,ARM::S25,ARM::S26,ARM::S27,
+ ARM::S28,ARM::S29,ARM::S30,ARM::S31, 0);
+ if (VT == MVT::f64)
+ return make_vector<unsigned>(ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10,ARM::D11,
+ ARM::D12,ARM::D13,ARM::D14,ARM::D15, 0);
+ break;
+ }
+
+ return std::vector<unsigned>();
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ char Constraint,
+ bool hasMemory,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0, 0);
+
+ switch (Constraint) {
+ default: break;
+ case 'I': case 'J': case 'K': case 'L':
+ case 'M': case 'N': case 'O':
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C)
+ return;
+
+ int64_t CVal64 = C->getSExtValue();
+ int CVal = (int) CVal64;
+ // None of these constraints allow values larger than 32 bits. Check
+ // that the value fits in an int.
+ if (CVal != CVal64)
+ return;
+
+ switch (Constraint) {
+ case 'I':
+ if (Subtarget->isThumb()) {
+ // This must be a constant between 0 and 255, for ADD immediates.
+ if (CVal >= 0 && CVal <= 255)
+ break;
+ } else {
+ // A constant that can be used as an immediate value in a
+ // data-processing instruction.
+ if (ARM_AM::getSOImmVal(CVal) != -1)
+ break;
+ }
+ return;
+
+ case 'J':
+ if (Subtarget->isThumb()) {
+ // This must be a constant between -255 and -1, for negated ADD
+ // immediates. This can be used in GCC with an "n" modifier that
+ // prints the negated value, for use with SUB instructions. It is
+ // not useful otherwise but is implemented for compatibility.
+ if (CVal >= -255 && CVal <= -1)
+ break;
+ } else {
+ // This must be a constant between -4095 and 4095. It is not clear
+ // what this constraint is intended for. Implemented for
+ // compatibility with GCC.
+ if (CVal >= -4095 && CVal <= 4095)
+ break;
+ }
+ return;
+
+ case 'K':
+ if (Subtarget->isThumb()) {
+ // A 32-bit value where only one byte has a nonzero value. Exclude
+ // zero to match GCC. This constraint is used by GCC internally for
+ // constants that can be loaded with a move/shift combination.
+ // It is not useful otherwise but is implemented for compatibility.
+ if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
+ break;
+ } else {
+ // A constant whose bitwise inverse can be used as an immediate
+ // value in a data-processing instruction. This can be used in GCC
+ // with a "B" modifier that prints the inverted value, for use with
+ // BIC and MVN instructions. It is not useful otherwise but is
+ // implemented for compatibility.
+ if (ARM_AM::getSOImmVal(~CVal) != -1)
+ break;
+ }
+ return;
+
+ case 'L':
+ if (Subtarget->isThumb()) {
+ // This must be a constant between -7 and 7,
+ // for 3-operand ADD/SUB immediate instructions.
+ if (CVal >= -7 && CVal <= 7)
+ break;
+ } else {
+ // A constant whose negation can be used as an immediate value in a
+ // data-processing instruction. This can be used in GCC with an "n"
+ // modifier that prints the negated value, for use with SUB
+ // instructions. It is not useful otherwise but is implemented for
+ // compatibility.
+ if (ARM_AM::getSOImmVal(-CVal) != -1)
+ break;
+ }
+ return;
+
+ case 'M':
+ if (Subtarget->isThumb()) {
+ // This must be a multiple of 4 between 0 and 1020, for
+ // ADD sp + immediate.
+ if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
+ break;
+ } else {
+ // A power of two or a constant between 0 and 32. This is used in
+ // GCC for the shift amount on shifted register operands, but it is
+ // useful in general for any shift amounts.
+ if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
+ break;
+ }
+ return;
+
+ case 'N':
+ if (Subtarget->isThumb()) {
+ // This must be a constant between 0 and 31, for shift amounts.
+ if (CVal >= 0 && CVal <= 31)
+ break;
+ }
+ return;
+
+ case 'O':
+ if (Subtarget->isThumb()) {
+ // This must be a multiple of 4 between -508 and 508, for
+ // ADD/SUB sp = sp + immediate.
+ if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
+ break;
+ }
+ return;
+ }
+ Result = DAG.getTargetConstant(CVal, Op.getValueType());
+ break;
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
+ Ops, DAG);
+}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
new file mode 100644
index 0000000..2dab2db
--- /dev/null
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -0,0 +1,184 @@
+//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMISELLOWERING_H
+#define ARMISELLOWERING_H
+
+#include "ARMSubtarget.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include <vector>
+
+namespace llvm {
+ class ARMConstantPoolValue;
+
+ namespace ARMISD {
+ // ARM Specific DAG Nodes
+ enum NodeType {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ Wrapper, // Wrapper - A wrapper node for TargetConstantPool,
+ // TargetExternalSymbol, and TargetGlobalAddress.
+ WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
+
+ CALL, // Function call.
+ CALL_PRED, // Function call that's predicable.
+ CALL_NOLINK, // Function call with branch not branch-and-link.
+ tCALL, // Thumb function call.
+ BRCOND, // Conditional branch.
+ BR_JT, // Jumptable branch.
+ RET_FLAG, // Return with a flag operand.
+
+ PIC_ADD, // Add with a PC operand and a PIC label.
+
+ CMP, // ARM compare instructions.
+ CMPNZ, // ARM compare that uses only N or Z flags.
+ CMPFP, // ARM VFP compare instruction, sets FPSCR.
+ CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR.
+ FMSTAT, // ARM fmstat instruction.
+ CMOV, // ARM conditional move instructions.
+ CNEG, // ARM conditional negate instructions.
+
+ FTOSI, // FP to sint within a FP register.
+ FTOUI, // FP to uint within a FP register.
+ SITOF, // sint to FP within a FP register.
+ UITOF, // uint to FP within a FP register.
+
+ SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
+ SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out.
+ RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag.
+
+ FMRRD, // double to two gprs.
+ FMDRR, // Two gprs to double.
+
+ EH_SJLJ_SETJMP, // SjLj exception handling setjmp
+ EH_SJLJ_LONGJMP, // SjLj exception handling longjmp
+
+ THREAD_POINTER
+ };
+ }
+
+ //===--------------------------------------------------------------------===//
+ // ARMTargetLowering - ARM Implementation of the TargetLowering interface
+
+ class ARMTargetLowering : public TargetLowering {
+ int VarArgsFrameIndex; // FrameIndex for start of varargs area.
+ public:
+ explicit ARMTargetLowering(TargetMachine &TM);
+
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ /// ReplaceNodeResults - Replace the results of a node whose result type is
+ /// illegal with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG);
+
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+ /// getPreIndexedAddressParts - Returns true, and sets the base pointer,
+ /// offset, and addressing mode by reference, if the node's address can be
+ /// legally represented as a pre-indexed load / store address.
+ virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const;
+
+ /// getPostIndexedAddressParts - Returns true, and sets the base pointer,
+ /// offset, and addressing mode by reference, if this node can be combined
+ /// with a load / store to form a post-indexed load / store.
+ virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+ SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const;
+
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const;
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
+ /// true it means one of the asm constraints of the inline asm instruction
+ /// being processed is 'm'.
+ virtual void LowerAsmOperandForConstraint(SDValue Op,
+ char ConstraintLetter,
+ bool hasMemory,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ virtual const ARMSubtarget* getSubtarget() {
+ return Subtarget;
+ }
+
+ private:
+ /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARMSubtarget *Subtarget;
+
+ /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
+ ///
+ unsigned ARMPCLabelIndex;
+
+ SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
+ const SDValue &StackPtr, const CCValAssign &VA,
+ SDValue Chain, SDValue Arg, ISD::ArgFlagsTy Flags);
+ SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG);
+ SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG);
+ SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG);
+ SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff);
+ };
+}
+
+#endif // ARMISELLOWERING_H
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
new file mode 100644
index 0000000..9a1e1c2
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -0,0 +1,868 @@
+//===-- ARMInstrFormats.td - ARM Instruction Formats -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// ARM Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<5> val> {
+ bits<5> Value = val;
+}
+
+def Pseudo : Format<0>;
+def MulFrm : Format<1>;
+def BrFrm : Format<2>;
+def BrMiscFrm : Format<3>;
+
+def DPFrm : Format<4>;
+def DPSoRegFrm : Format<5>;
+
+def LdFrm : Format<6>;
+def StFrm : Format<7>;
+def LdMiscFrm : Format<8>;
+def StMiscFrm : Format<9>;
+def LdStMulFrm : Format<10>;
+
+def ArithMiscFrm : Format<11>;
+def ExtFrm : Format<12>;
+
+def VFPUnaryFrm : Format<13>;
+def VFPBinaryFrm : Format<14>;
+def VFPConv1Frm : Format<15>;
+def VFPConv2Frm : Format<16>;
+def VFPConv3Frm : Format<17>;
+def VFPConv4Frm : Format<18>;
+def VFPConv5Frm : Format<19>;
+def VFPLdStFrm : Format<20>;
+def VFPLdStMulFrm : Format<21>;
+def VFPMiscFrm : Format<22>;
+
+def ThumbFrm : Format<23>;
+
+// Misc flag for data processing instructions that indicates whether
+// the instruction has an Rn register operand.
+class UnaryDP { bit isUnaryDataProc = 1; }
+
+//===----------------------------------------------------------------------===//
+
+// ARM Instruction templates.
+//
+
+class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im,
+ Format f, string cstr>
+ : Instruction {
+ field bits<32> Inst;
+
+ let Namespace = "ARM";
+
+ // TSFlagsFields
+ AddrMode AM = am;
+ bits<4> AddrModeBits = AM.Value;
+
+ SizeFlagVal SZ = sz;
+ bits<3> SizeFlag = SZ.Value;
+
+ IndexMode IM = im;
+ bits<2> IndexModeBits = IM.Value;
+
+ Format F = f;
+ bits<5> Form = F.Value;
+
+ //
+ // Attributes specific to ARM instructions...
+ //
+ bit isUnaryDataProc = 0;
+
+ let Constraints = cstr;
+}
+
+class PseudoInst<dag oops, dag iops, string asm, list<dag> pattern>
+ : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, ""> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let AsmString = asm;
+ let Pattern = pattern;
+}
+
+// Almost all ARM instructions are predicable.
+class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ IndexMode im, Format f, string opc, string asm, string cstr,
+ list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ops pred:$p));
+ let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsARM];
+}
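+// For example, with opc = "add" and asm = " $dst, $a, $b" the AsmString
+// becomes "add${p} $dst, $a, $b", so a predicated instance prints as, e.g.,
+// "addeq r0, r1, r2".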
+
+// Same as I, except it can optionally modify CPSR. Note that the CPSR
+// operand is modeled as an input since by default it is the zero register;
+// it becomes an implicit def once it is "flipped" to actually set the flags.
+class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ IndexMode im, Format f, string opc, string asm, string cstr,
+ list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ops pred:$p, cc_out:$s));
+ let AsmString = !strconcat(opc, !strconcat("${p}${s}", asm));
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsARM];
+}
+
+// Special cases
+class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ IndexMode im, Format f, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let AsmString = asm;
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsARM];
+}
+
+class AI<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern>;
+class AsI<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern>;
+class AXI<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, asm,
+ "", pattern>;
+
+// Ctrl flow instructions
+class ABI<bits<4> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, opc,
+ asm, "", pattern> {
+ let Inst{27-24} = opcod;
+}
+class ABXI<bits<4> opcod, dag oops, dag iops, string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, asm,
+ "", pattern> {
+ let Inst{27-24} = opcod;
+}
+class ABXIx2<dag oops, dag iops, string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, asm,
+ "", pattern>;
+
+// BR_JT instructions
+class JTI<dag oops, dag iops, string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm,
+ asm, "", pattern>;
+
+// addrmode1 instructions
+class AI1<bits<4> opcod, dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{24-21} = opcod;
+ let Inst{27-26} = {0,0};
+}
+class AsI1<bits<4> opcod, dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{24-21} = opcod;
+ let Inst{27-26} = {0,0};
+}
+class AXI1<bits<4> opcod, dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, asm,
+ "", pattern> {
+ let Inst{24-21} = opcod;
+ let Inst{27-26} = {0,0};
+}
+class AI1x2<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, opc,
+ asm, "", pattern>;
+
+
+// addrmode2 loads and stores
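+// In the encodings below: L = load (1) / store (0), W = writeback,
+// B = byte (1) / word (0), P = pre-indexed (1) / post-indexed (0).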
+class AI2<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{27-26} = {0,1};
+}
+
+// loads
+class AI2ldw<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AXI2ldw<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+ asm, "", pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AI2ldb<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 1; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AXI2ldb<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+ asm, "", pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 1; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+
+// stores
+class AI2stw<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AXI2stw<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+ asm, "", pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AI2stb<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 1; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AXI2stb<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+ asm, "", pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 1; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+
+// Pre-indexed loads
+class AI2ldwpr<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
+ asm, cstr, pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AI2ldbpr<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
+ asm, cstr, pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{22} = 1; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+
+// Pre-indexed stores
+class AI2stwpr<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
+ asm, cstr, pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AI2stbpr<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
+ asm, cstr, pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{22} = 1; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+}
+
+// Post-indexed loads
+class AI2ldwpo<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
+ asm, cstr,pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 0; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AI2ldbpo<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
+ asm, cstr,pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 1; // B bit
+ let Inst{24} = 0; // P bit
+ let Inst{27-26} = {0,1};
+}
+
+// Post-indexed stores
+class AI2stwpo<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
+ asm, cstr,pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 0; // P bit
+ let Inst{27-26} = {0,1};
+}
+class AI2stbpo<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
+ asm, cstr,pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 1; // B bit
+ let Inst{24} = 0; // P bit
+ let Inst{27-26} = {0,1};
+}
+
+// addrmode3 instructions
+class AI3<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern>;
+class AXI3<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, asm,
+ "", pattern>;
+
+// loads
+class AI3ldh<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 0; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AXI3ldh<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 0; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AI3ldsh<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AXI3ldsh<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AI3ldsb<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 0; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AXI3ldsb<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 0; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AI3ldd<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 0; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+
+// stores
+class AI3sth<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 0; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AXI3sth<dag oops, dag iops, Format f, string asm,
+ list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 0; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AI3std<dag oops, dag iops, Format f, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
+ asm, "", pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{24} = 1; // P bit
+}
+
+// Pre-indexed loads
+class AI3ldhpr<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
+ asm, cstr, pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 0; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AI3ldshpr<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
+ asm, cstr, pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{24} = 1; // P bit
+}
+class AI3ldsbpr<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
+ asm, cstr, pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 0; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{24} = 1; // P bit
+}
+
+// Pre-indexed stores
+class AI3sthpr<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
+ asm, cstr, pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 0; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{24} = 1; // P bit
+}
+
+// Post-indexed loads
+class AI3ldhpo<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
+ asm, cstr,pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 0; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{24} = 0; // P bit
+}
+class AI3ldshpo<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
+ asm, cstr,pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{24} = 0; // P bit
+}
+class AI3ldsbpo<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
+ asm, cstr,pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 0; // H bit
+ let Inst{6} = 1; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{24} = 0; // P bit
+}
+
+// Post-indexed stores
+class AI3sthpo<dag oops, dag iops, Format f, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
+ asm, cstr,pattern> {
+ let Inst{4} = 1;
+ let Inst{5} = 1; // H bit
+ let Inst{6} = 0; // S bit
+ let Inst{7} = 1;
+ let Inst{20} = 0; // L bit
+ let Inst{21} = 1; // W bit
+ let Inst{24} = 0; // P bit
+}
+
+
+// addrmode4 instructions
+class AXI4ld<dag oops, dag iops, Format f, string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, asm,
+ "", pattern> {
+ let Inst{20} = 1; // L bit
+ let Inst{22} = 0; // S bit
+ let Inst{27-25} = 0b100;
+}
+class AXI4st<dag oops, dag iops, Format f, string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, asm,
+ "", pattern> {
+ let Inst{20} = 0; // L bit
+ let Inst{22} = 0; // S bit
+ let Inst{27-25} = 0b100;
+}
+
+// Unsigned multiply, multiply-accumulate instructions.
+class AMul1I<bits<7> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
+ asm, "", pattern> {
+ let Inst{7-4} = 0b1001;
+ let Inst{20} = 0; // S bit
+ let Inst{27-21} = opcod;
+}
+class AsMul1I<bits<7> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
+ asm, "", pattern> {
+ let Inst{7-4} = 0b1001;
+ let Inst{27-21} = opcod;
+}
+
+// Most significant word multiply
+class AMul2I<bits<7> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
+ asm, "", pattern> {
+ let Inst{7-4} = 0b1001;
+ let Inst{20} = 1;
+ let Inst{27-21} = opcod;
+}
+
+// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
+class AMulxyI<bits<7> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
+ asm, "", pattern> {
+ let Inst{4} = 0;
+ let Inst{7} = 1;
+ let Inst{20} = 0;
+ let Inst{27-21} = opcod;
+}
+
+// Extend instructions.
+class AExtI<bits<8> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, opc,
+ asm, "", pattern> {
+ let Inst{7-4} = 0b0111;
+ let Inst{27-20} = opcod;
+}
+
+// Misc Arithmetic instructions.
+class AMiscA1I<bits<8> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, opc,
+ asm, "", pattern> {
+ let Inst{27-20} = opcod;
+}
+
+//===----------------------------------------------------------------------===//
+
+// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode.
+class ARMPat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsARM];
+}
+class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsARM, HasV5TE];
+}
+class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsARM, HasV6];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Thumb Instruction Format Definitions.
+//
+
+
+// TI - Thumb instruction.
+
+class ThumbI<dag outs, dag ins, AddrMode am, SizeFlagVal sz,
+ string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
+ let OutOperandList = outs;
+ let InOperandList = ins;
+ let AsmString = asm;
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsThumb];
+}
+
+class TI<dag outs, dag ins, string asm, list<dag> pattern>
+ : ThumbI<outs, ins, AddrModeNone, Size2Bytes, asm, "", pattern>;
+class TI1<dag outs, dag ins, string asm, list<dag> pattern>
+ : ThumbI<outs, ins, AddrModeT1, Size2Bytes, asm, "", pattern>;
+class TI2<dag outs, dag ins, string asm, list<dag> pattern>
+ : ThumbI<outs, ins, AddrModeT2, Size2Bytes, asm, "", pattern>;
+class TI4<dag outs, dag ins, string asm, list<dag> pattern>
+ : ThumbI<outs, ins, AddrModeT4, Size2Bytes, asm, "", pattern>;
+class TIs<dag outs, dag ins, string asm, list<dag> pattern>
+ : ThumbI<outs, ins, AddrModeTs, Size2Bytes, asm, "", pattern>;
+
+// Two-address instructions
+class TIt<dag outs, dag ins, string asm, list<dag> pattern>
+ : ThumbI<outs, ins, AddrModeNone, Size2Bytes, asm, "$lhs = $dst", pattern>;
+
+// BL, BLX(1) are translated by the assembler into two instructions.
+class TIx2<dag outs, dag ins, string asm, list<dag> pattern>
+ : ThumbI<outs, ins, AddrModeNone, Size4Bytes, asm, "", pattern>;
+
+// BR_JT instructions
+class TJTI<dag outs, dag ins, string asm, list<dag> pattern>
+ : ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
+
+
+//===----------------------------------------------------------------------===//
+// ARM VFP Instruction templates.
+//
+
+// ARM VFP addrmode5 loads and stores
+class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStFrm, opc, asm, "", pattern> {
+ // TODO: Mark the instructions with the appropriate subtarget info.
+ let Inst{27-24} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1011;
+}
+
+class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStFrm, opc, asm, "", pattern> {
+ // TODO: Mark the instructions with the appropriate subtarget info.
+ let Inst{27-24} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1010;
+}
+
+// Load / store multiple
+class AXSI5<dag oops, dag iops, string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStMulFrm, asm, "", pattern> {
+ // TODO: Mark the instructions with the appropriate subtarget info.
+ let Inst{27-25} = 0b110;
+ let Inst{11-8} = 0b1011;
+}
+
+class AXDI5<dag oops, dag iops, string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStMulFrm, asm, "", pattern> {
+ // TODO: Mark the instructions with the appropriate subtarget info.
+ let Inst{27-25} = 0b110;
+ let Inst{11-8} = 0b1010;
+}
+
+
+// Double precision, unary
+class ADuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
+ string opc, string asm, list<dag> pattern>
+ : AI<oops, iops, VFPUnaryFrm, opc, asm, pattern> {
+ let Inst{27-20} = opcod1;
+ let Inst{19-16} = opcod2;
+ let Inst{11-8} = 0b1011;
+ let Inst{7-4} = opcod3;
+}
+
+// Double precision, binary
+class ADbI<bits<8> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : AI<oops, iops, VFPBinaryFrm, opc, asm, pattern> {
+ let Inst{27-20} = opcod;
+ let Inst{11-8} = 0b1011;
+}
+
+// Single precision, unary
+class ASuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
+ string opc, string asm, list<dag> pattern>
+ : AI<oops, iops, VFPUnaryFrm, opc, asm, pattern> {
+ // Bits 22 (D bit) and 5 (M bit) will be changed during instruction encoding.
+ let Inst{27-20} = opcod1;
+ let Inst{19-16} = opcod2;
+ let Inst{11-8} = 0b1010;
+ let Inst{7-4} = opcod3;
+}
+
+// Single precision, binary
+class ASbI<bits<8> opcod, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : AI<oops, iops, VFPBinaryFrm, opc, asm, pattern> {
+ // Bit 22 (D bit) can be changed during instruction encoding.
+ let Inst{27-20} = opcod;
+ let Inst{11-8} = 0b1010;
+}
+
+// VFP conversion instructions
+class AVConv1I<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3,
+ dag oops, dag iops, string opc, string asm, list<dag> pattern>
+ : AI<oops, iops, VFPConv1Frm, opc, asm, pattern> {
+ let Inst{27-20} = opcod1;
+ let Inst{19-16} = opcod2;
+ let Inst{11-8} = opcod3;
+ let Inst{6} = 1;
+}
+
+class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
+ string opc, string asm, list<dag> pattern>
+ : AI<oops, iops, f, opc, asm, pattern> {
+ let Inst{27-20} = opcod1;
+ let Inst{11-8} = opcod2;
+ let Inst{4} = 1;
+}
+
+class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, opc, asm, pattern>;
+
+class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, opc, asm, pattern>;
+
+class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, opc, asm, pattern>;
+
+class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
+ string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, opc, asm, pattern>;
+
+//===----------------------------------------------------------------------===//
+
+
+// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
+class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsThumb];
+}
+
+class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsThumb, HasV5T];
+}
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
new file mode 100644
index 0000000..4b0dbb5
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -0,0 +1,1025 @@
+//===- ARMInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstrInfo.h"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMGenInstrInfo.inc"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+ cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+ return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+ return MIB.addReg(0);
+}
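+
+// These helpers append the standard trailing operands most ARM instructions
+// expect: AddDefaultPred adds an "always execute" predicate (ARMCC::AL plus a
+// zero condition register), and AddDefaultCC adds a zero cc_out register,
+// leaving the 's' bit clear. A typical use, as in copyRegToReg below:
+//   AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
+//                                 .addReg(SrcReg)));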
+
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
+ : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)),
+ RI(*this, STI) {
+}
+
+
+/// Return true if the instruction is a register to register move, and
+/// return the source and dest registers in the passed parameters.
+///
+bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+ SrcSubIdx = DstSubIdx = 0; // No sub-registers.
+
+ unsigned oc = MI.getOpcode();
+ switch (oc) {
+ default:
+ return false;
+ case ARM::FCPYS:
+ case ARM::FCPYD:
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ return true;
+ case ARM::MOVr:
+ case ARM::tMOVr:
+ case ARM::tMOVhir2lor:
+ case ARM::tMOVlor2hir:
+ case ARM::tMOVhir2hir:
+ assert(MI.getDesc().getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "Invalid ARM MOV instruction");
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ return true;
+ }
+}
+
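+/// isLoadFromStackSlot - If the instruction is a direct load from a stack
+/// slot (a plain LDR / FLDD / FLDS / tRestore addressing a frame index with a
+/// zero offset), set FrameIndex and return the destination register;
+/// otherwise return 0.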
+unsigned ARMInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case ARM::LDR:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isReg() &&
+ MI->getOperand(3).isImm() &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::FLDD:
+ case ARM::FLDS:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::tRestore:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
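+/// isStoreToStackSlot - The store counterpart of the above: if the
+/// instruction is a direct store to a stack slot, set FrameIndex and return
+/// the source register; otherwise return 0.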
+unsigned ARMInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case ARM::STR:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isReg() &&
+ MI->getOperand(3).isImm() &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::FSTD:
+ case ARM::FSTS:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::tSpill:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+void ARMInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg,
+ const MachineInstr *Orig) const {
+ DebugLoc dl = Orig->getDebugLoc();
+ if (Orig->getOpcode() == ARM::MOVi2pieces) {
+ RI.emitLoadConstPool(MBB, I, DestReg, Orig->getOperand(1).getImm(),
+ Orig->getOperand(2).getImm(),
+ Orig->getOperand(3).getReg(), this, false, dl);
+ return;
+ }
+
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MI->getOperand(0).setReg(DestReg);
+ MBB.insert(I, MI);
+}
+
+static unsigned getUnindexedOpcode(unsigned Opc) {
+ switch (Opc) {
+ default: break;
+ case ARM::LDR_PRE:
+ case ARM::LDR_POST:
+ return ARM::LDR;
+ case ARM::LDRH_PRE:
+ case ARM::LDRH_POST:
+ return ARM::LDRH;
+ case ARM::LDRB_PRE:
+ case ARM::LDRB_POST:
+ return ARM::LDRB;
+ case ARM::LDRSH_PRE:
+ case ARM::LDRSH_POST:
+ return ARM::LDRSH;
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSB_POST:
+ return ARM::LDRSB;
+ case ARM::STR_PRE:
+ case ARM::STR_POST:
+ return ARM::STR;
+ case ARM::STRH_PRE:
+ case ARM::STRH_POST:
+ return ARM::STRH;
+ case ARM::STRB_PRE:
+ case ARM::STRB_POST:
+ return ARM::STRB;
+ }
+ return 0;
+}
+
+MachineInstr *
+ARMInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ if (!EnableARM3Addr)
+ return NULL;
+
+ MachineInstr *MI = MBBI;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ unsigned TSFlags = MI->getDesc().TSFlags;
+ bool isPre = false;
+ switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+ default: return NULL;
+ case ARMII::IndexModePre:
+ isPre = true;
+ break;
+ case ARMII::IndexModePost:
+ break;
+ }
+
+ // Try splitting an indexed load/store to an un-indexed one plus an add/sub
+ // operation.
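+ //
+ // For example, a post-indexed "ldr r0, [r1], #4" is split into
+ //   "ldr r0, [r1]" followed by "add r1, r1, #4",
+ // while a pre-indexed "ldr r0, [r1, #4]!" becomes
+ //   "add r1, r1, #4" followed by "ldr r0, [r1]".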
+ unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+ if (MemOpc == 0)
+ return NULL;
+
+ MachineInstr *UpdateMI = NULL;
+ MachineInstr *MemMI = NULL;
+ unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned NumOps = TID.getNumOperands();
+ bool isLoad = !TID.mayStore();
+ const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
+ const MachineOperand &Base = MI->getOperand(2);
+ const MachineOperand &Offset = MI->getOperand(NumOps-3);
+ unsigned WBReg = WB.getReg();
+ unsigned BaseReg = Base.getReg();
+ unsigned OffReg = Offset.getReg();
+ unsigned OffImm = MI->getOperand(NumOps-2).getImm();
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+ switch (AddrMode) {
+ default:
+ assert(false && "Unknown indexed op!");
+ return NULL;
+ case ARMII::AddrMode2: {
+ bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+ unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+ if (OffReg == 0) {
+ int SOImmVal = ARM_AM::getSOImmVal(Amt);
+ if (SOImmVal == -1)
+ // Can't encode it in a so_imm operand. This transformation will
+ // add more than 1 instruction. Abandon!
+ return NULL;
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+ .addReg(BaseReg).addImm(SOImmVal)
+ .addImm(Pred).addReg(0).addReg(0);
+ } else if (Amt != 0) {
+ ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+ unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
+ .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
+ .addImm(Pred).addReg(0).addReg(0);
+ } else
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+ .addReg(BaseReg).addReg(OffReg)
+ .addImm(Pred).addReg(0).addReg(0);
+ break;
+ }
+ case ARMII::AddrMode3 : {
+ bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+ unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+ if (OffReg == 0)
+ // The immediate is 8 bits, so it's guaranteed to fit in a so_imm operand.
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+ .addReg(BaseReg).addImm(Amt)
+ .addImm(Pred).addReg(0).addReg(0);
+ else
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+ .addReg(BaseReg).addReg(OffReg)
+ .addImm(Pred).addReg(0).addReg(0);
+ break;
+ }
+ }
+
+ std::vector<MachineInstr*> NewMIs;
+ if (isPre) {
+ if (isLoad)
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc), MI->getOperand(0).getReg())
+ .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+ else
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc)).addReg(MI->getOperand(1).getReg())
+ .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+ NewMIs.push_back(MemMI);
+ NewMIs.push_back(UpdateMI);
+ } else {
+ if (isLoad)
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc), MI->getOperand(0).getReg())
+ .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+ else
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc)).addReg(MI->getOperand(1).getReg())
+ .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+ if (WB.isDead())
+ UpdateMI->getOperand(0).setIsDead();
+ NewMIs.push_back(UpdateMI);
+ NewMIs.push_back(MemMI);
+ }
+
+ // Transfer LiveVariables states, kill / dead info.
+ if (LV) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+
+ LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+ if (MO.isDef()) {
+ MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+ if (MO.isDead())
+ LV->addVirtualRegisterDead(Reg, NewMI);
+ }
+ if (MO.isUse() && MO.isKill()) {
+ for (unsigned j = 0; j < 2; ++j) {
+ // Look at the two new MI's in reverse order.
+ MachineInstr *NewMI = NewMIs[j];
+ if (!NewMI->readsRegister(Reg))
+ continue;
+ LV->addVirtualRegisterKilled(Reg, NewMI);
+ if (VI.removeKill(MI))
+ VI.Kills.push_back(NewMI);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ MFI->insert(MBBI, NewMIs[1]);
+ MFI->insert(MBBI, NewMIs[0]);
+ return NewMIs[0];
+}
+
+// Branch analysis.
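+// AnalyzeBranch follows the usual TargetInstrInfo convention: return false on
+// success, filling in TBB (the taken block), FBB (the false block, null for a
+// fall-through) and Cond (the condition operands, empty for an unconditional
+// branch); return true if the terminators cannot be understood, e.g. an
+// indirect branch.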
+bool ARMInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ unsigned LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (LastOpc == ARM::B || LastOpc == ARM::tB) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ if (LastOpc == ARM::Bcc || LastOpc == ARM::tBcc) {
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(0).getMBB();
+ Cond.push_back(LastInst->getOperand(1));
+ Cond.push_back(LastInst->getOperand(2));
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ return true;
+
+ // If the block ends with ARM::B/ARM::tB and an ARM::Bcc/ARM::tBcc, handle it.
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+ if ((SecondLastOpc == ARM::Bcc && LastOpc == ARM::B) ||
+ (SecondLastOpc == ARM::tBcc && LastOpc == ARM::tB)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ Cond.push_back(SecondLastInst->getOperand(1));
+ Cond.push_back(SecondLastInst->getOperand(2));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed, so remove it.
+ if ((SecondLastOpc == ARM::B || SecondLastOpc==ARM::tB) &&
+ (LastOpc == ARM::B || LastOpc == ARM::tB)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // ...likewise if it ends with a branch table followed by an unconditional
+ // branch. The branch folder can create these, and we must get rid of them for
+ // correctness of Thumb constant islands.
+ if ((SecondLastOpc == ARM::BR_JTr || SecondLastOpc==ARM::BR_JTm ||
+ SecondLastOpc == ARM::BR_JTadd || SecondLastOpc==ARM::tBR_JTr) &&
+ (LastOpc == ARM::B || LastOpc == ARM::tB)) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
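+// RemoveBranch erases the trailing unconditional branch and, if one precedes
+// it, the conditional branch as well, returning how many were removed
+// (0, 1 or 2).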
+unsigned ARMInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ int BOpc = AFI->isThumbFunction() ? ARM::tB : ARM::B;
+ int BccOpc = AFI->isThumbFunction() ? ARM::tBcc : ARM::Bcc;
+
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc)
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (I->getOpcode() != BccOpc)
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned
+ARMInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc argument
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ int BOpc = AFI->isThumbFunction() ? ARM::tB : ARM::B;
+ int BccOpc = AFI->isThumbFunction() ? ARM::tBcc : ARM::Bcc;
+
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "ARM branch conditions have two components!");
+
+ if (FBB == 0) {
+ if (Cond.empty()) // Unconditional branch?
+ BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB);
+ else
+ BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+ return 1;
+ }
+
+ // Two-way conditional branch.
+ BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+ BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB);
+ return 2;
+}
+
+bool ARMInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (!AFI->isThumbFunction()) {
+ if (DestRC == ARM::GPRRegisterClass) {
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
+ .addReg(SrcReg)));
+ return true;
+ }
+ } else {
+ if (DestRC == ARM::GPRRegisterClass) {
+ if (SrcRC == ARM::GPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVhir2hir), DestReg).addReg(SrcReg);
+ return true;
+ } else if (SrcRC == ARM::tGPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVlor2hir), DestReg).addReg(SrcReg);
+ return true;
+ }
+ } else if (DestRC == ARM::tGPRRegisterClass) {
+ if (SrcRC == ARM::GPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVhir2lor), DestReg).addReg(SrcReg);
+ return true;
+ } else if (SrcRC == ARM::tGPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg);
+ return true;
+ }
+ }
+ }
+ if (DestRC != SrcRC) {
+ // Not yet supported!
+ return false;
+ }
+
+ if (DestRC == ARM::SPRRegisterClass)
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg)
+ .addReg(SrcReg));
+ else if (DestRC == ARM::DPRRegisterClass)
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
+ .addReg(SrcReg));
+ else
+ return false;
+
+ return true;
+}
+
+void ARMInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (RC == ARM::GPRRegisterClass) {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert (!AFI->isThumbFunction());
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addReg(0).addImm(0));
+ } else if (RC == ARM::tGPRRegisterClass) {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert (AFI->isThumbFunction());
+ BuildMI(MBB, I, DL, get(ARM::tSpill))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0);
+ } else if (RC == ARM::DPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0));
+ } else {
+ assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTS))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0));
+ }
+}
+
+void ARMInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ unsigned Opc = 0;
+ if (RC == ARM::GPRRegisterClass) {
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (AFI->isThumbFunction()) {
+ Opc = Addr[0].isFI() ? ARM::tSpill : ARM::tSTR;
+ MachineInstrBuilder MIB =
+ BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ return;
+ }
+ Opc = ARM::STR;
+ } else if (RC == ARM::DPRRegisterClass) {
+ Opc = ARM::FSTD;
+ } else {
+ assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
+ Opc = ARM::FSTS;
+ }
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ AddDefaultPred(MIB);
+ NewMIs.push_back(MIB);
+ return;
+}
+
+void ARMInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (RC == ARM::GPRRegisterClass) {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert (!AFI->isThumbFunction());
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
+ .addFrameIndex(FI).addReg(0).addImm(0));
+ } else if (RC == ARM::tGPRRegisterClass) {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert (AFI->isThumbFunction());
+ BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
+ .addFrameIndex(FI).addImm(0);
+ } else if (RC == ARM::DPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg)
+ .addFrameIndex(FI).addImm(0));
+ } else {
+ assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDS), DestReg)
+ .addFrameIndex(FI).addImm(0));
+ }
+}
+
+void ARMInstrInfo::
+loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ unsigned Opc = 0;
+ if (RC == ARM::GPRRegisterClass) {
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (AFI->isThumbFunction()) {
+ Opc = Addr[0].isFI() ? ARM::tRestore : ARM::tLDR;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ return;
+ }
+ Opc = ARM::LDR;
+ } else if (RC == ARM::DPRRegisterClass) {
+ Opc = ARM::FLDD;
+ } else {
+ assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
+ Opc = ARM::FLDS;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ AddDefaultPred(MIB);
+ NewMIs.push_back(MIB);
+ return;
+}
+
+bool ARMInstrInfo::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (!AFI->isThumbFunction() || CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH));
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ MIB.addReg(Reg, RegState::Kill);
+ }
+ return true;
+}
+
+bool ARMInstrInfo::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (!AFI->isThumbFunction() || CSI.empty())
+ return false;
+
+ bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
+ MachineInstr *PopMI = MF.CreateMachineInstr(get(ARM::tPOP),MI->getDebugLoc());
+ MBB.insert(MI, PopMI);
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ if (Reg == ARM::LR) {
+ // Special epilogue for vararg functions. See emitEpilogue.
+ if (isVarArg)
+ continue;
+ Reg = ARM::PC;
+ PopMI->setDesc(get(ARM::tPOP_RET));
+ MBB.erase(MI);
+ }
+ PopMI->addOperand(MachineOperand::CreateReg(Reg, true));
+ }
+ return true;
+}
+
+MachineInstr *ARMInstrInfo::
+foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops, int FI) const {
+ if (Ops.size() != 1) return NULL;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ MachineInstr *NewMI = NULL;
+ switch (Opc) {
+ default: break;
+ case ARM::MOVr: {
+ if (MI->getOperand(4).getReg() == ARM::CPSR)
+ // If it is updating CPSR, then it cannot be folded.
+ break;
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR))
+ .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
+ .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ break;
+ }
+ case ARM::tMOVr:
+ case ARM::tMOVlor2hir:
+ case ARM::tMOVhir2lor:
+ case ARM::tMOVhir2hir: {
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ if (RI.isPhysicalRegister(SrcReg) && !RI.isLowRegister(SrcReg))
+ // tSpill cannot take a high register operand.
+ break;
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ if (RI.isPhysicalRegister(DstReg) && !RI.isLowRegister(DstReg))
+ // tRestore cannot target a high register operand.
+ break;
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore))
+ .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
+ .addFrameIndex(FI).addImm(0);
+ }
+ break;
+ }
+ case ARM::FCPYS: {
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTS))
+ .addReg(SrcReg).addFrameIndex(FI)
+ .addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDS), DstReg)
+ .addFrameIndex(FI)
+ .addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ break;
+ }
+ case ARM::FCPYD: {
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTD))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDD))
+ .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ break;
+ }
+ }
+
+ return NewMI;
+}
+
+bool ARMInstrInfo::
+canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ if (Ops.size() != 1) return false;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default: break;
+ case ARM::MOVr:
+ // If it is updating CPSR, then it cannot be folded.
+ return MI->getOperand(4).getReg() != ARM::CPSR;
+ case ARM::tMOVr:
+ case ARM::tMOVlor2hir:
+ case ARM::tMOVhir2lor:
+ case ARM::tMOVhir2hir: {
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (RI.isPhysicalRegister(SrcReg) && !RI.isLowRegister(SrcReg))
+ // tSpill cannot take a high register operand.
+ return false;
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ if (RI.isPhysicalRegister(DstReg) && !RI.isLowRegister(DstReg))
+ // tRestore cannot target a high register operand.
+ return false;
+ }
+ return true;
+ }
+ case ARM::FCPYS:
+ case ARM::FCPYD:
+ return true;
+ }
+
+ return false;
+}
+
+bool ARMInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case ARM::BX_RET: // Return.
+ case ARM::LDM_RET:
+ case ARM::tBX_RET:
+ case ARM::tBX_RET_vararg:
+ case ARM::tPOP_RET:
+ case ARM::B:
+ case ARM::tB: // Uncond branch.
+ case ARM::tBR_JTr:
+ case ARM::BR_JTr: // Jumptable branch.
+ case ARM::BR_JTm: // Jumptable branch through mem.
+ case ARM::BR_JTadd: // Jumptable branch add to pc.
+ return true;
+ default: return false;
+ }
+}
+
+bool ARMInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+ Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+ return false;
+}
+
+bool ARMInstrInfo::isPredicated(const MachineInstr *MI) const {
+ int PIdx = MI->findFirstPredOperandIdx();
+ return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+}
+
+bool ARMInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+ const SmallVectorImpl<MachineOperand> &Pred) const {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == ARM::B || Opc == ARM::tB) {
+ MI->setDesc(get(Opc == ARM::B ? ARM::Bcc : ARM::tBcc));
+ MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm()));
+ MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false));
+ return true;
+ }
+
+ int PIdx = MI->findFirstPredOperandIdx();
+ if (PIdx != -1) {
+ MachineOperand &PMO = MI->getOperand(PIdx);
+ PMO.setImm(Pred[0].getImm());
+ MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+ return true;
+ }
+ return false;
+}
+
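+// SubsumesPredicate - Return true if the first predicate is true whenever the
+// second one is, e.g. AL subsumes everything, and HS (unsigned >=) subsumes
+// HI (unsigned >) since HS holds any time HI does.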
+bool ARMInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+ const SmallVectorImpl<MachineOperand> &Pred2) const {
+ if (Pred1.size() > 2 || Pred2.size() > 2)
+ return false;
+
+ ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
+ ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
+ if (CC1 == CC2)
+ return true;
+
+ switch (CC1) {
+ default:
+ return false;
+ case ARMCC::AL:
+ return true;
+ case ARMCC::HS:
+ return CC2 == ARMCC::HI;
+ case ARMCC::LS:
+ return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
+ case ARMCC::GE:
+ return CC2 == ARMCC::GT;
+ case ARMCC::LE:
+ return CC2 == ARMCC::LT;
+ }
+}
+
+bool ARMInstrInfo::DefinesPredicate(MachineInstr *MI,
+ std::vector<MachineOperand> &Pred) const {
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.getImplicitDefs() && !TID.hasOptionalDef())
+ return false;
+
+ bool Found = false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == ARM::CPSR) {
+ Pred.push_back(MO);
+ Found = true;
+ }
+ }
+
+ return Found;
+}
+
+/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+ unsigned JTI) DISABLE_INLINE;
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+ unsigned JTI) {
+ return JT[JTI].MBBs.size();
+}
+
+/// GetInstSize - Return the size of the specified MachineInstr.
+///
+unsigned ARMInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ const MachineBasicBlock &MBB = *MI->getParent();
+ const MachineFunction *MF = MBB.getParent();
+ const TargetAsmInfo *TAI = MF->getTarget().getTargetAsmInfo();
+
+ // Basic size info comes from the TSFlags field.
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned TSFlags = TID.TSFlags;
+
+ switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
+ default: {
+ // If this machine instr is an inline asm, measure it.
+ if (MI->getOpcode() == ARM::INLINEASM)
+ return TAI->getInlineAsmLength(MI->getOperand(0).getSymbolName());
+ if (MI->isLabel())
+ return 0;
+ switch (MI->getOpcode()) {
+ default:
+ assert(0 && "Unknown or unset size field for instr!");
+ break;
+ case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::DECLARE:
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ return 0;
+ }
+ break;
+ }
+ case ARMII::Size8Bytes: return 8; // Arm instruction x 2.
+ case ARMII::Size4Bytes: return 4; // Arm instruction.
+ case ARMII::Size2Bytes: return 2; // Thumb instruction.
+ case ARMII::SizeSpecial: {
+ switch (MI->getOpcode()) {
+ case ARM::CONSTPOOL_ENTRY:
+ // If this machine instr is a constant pool entry, its size is recorded as
+ // operand #2.
+ return MI->getOperand(2).getImm();
+ case ARM::Int_eh_sjlj_setjmp: return 12;
+ case ARM::BR_JTr:
+ case ARM::BR_JTm:
+ case ARM::BR_JTadd:
+ case ARM::tBR_JTr: {
+ // These are jumptable branches, i.e. a branch followed by an inlined
+ // jumptable. The size is the branch itself (2 bytes for Thumb, 4 bytes
+ // for ARM) plus 4 bytes per jumptable entry.
+ unsigned NumOps = TID.getNumOperands();
+ MachineOperand JTOP =
+ MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2));
+ unsigned JTI = JTOP.getIndex();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ assert(JTI < JT.size());
+ // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
+ // aligned. The assembler / linker may add 2 byte padding just before
+ // the JT entries. The size does not include this padding; the
+ // constant islands pass does separate bookkeeping for it.
+ // FIXME: If we know the size of the function is less than (1 << 16) * 2
+ // bytes, we can use 16-bit entries instead. Then there won't be an
+ // alignment issue.
+ return getNumJTEntries(JT, JTI) * 4 +
+ (MI->getOpcode()==ARM::tBR_JTr ? 2 : 4);
+ }
+ default:
+ // Otherwise, pseudo-instruction sizes are zero.
+ return 0;
+ }
+ }
+ }
+ return 0; // Not reached
+}
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
new file mode 100644
index 0000000..13ff3fe
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -0,0 +1,258 @@
+//===- ARMInstrInfo.h - ARM Instruction Information -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTRUCTIONINFO_H
+#define ARMINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARM.h"
+
+namespace llvm {
+ class ARMSubtarget;
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction Flags.
+
+ //===------------------------------------------------------------------===//
+ // This four-bit field describes the addressing mode used.
+
+ AddrModeMask = 0xf,
+ AddrModeNone = 0,
+ AddrMode1 = 1,
+ AddrMode2 = 2,
+ AddrMode3 = 3,
+ AddrMode4 = 4,
+ AddrMode5 = 5,
+ AddrModeT1 = 6,
+ AddrModeT2 = 7,
+ AddrModeT4 = 8,
+ AddrModeTs = 9, // i8 * 4 for pc and sp relative data
+
+ // Size* - Flags to keep track of the size of an instruction.
+ SizeShift = 4,
+ SizeMask = 7 << SizeShift,
+ SizeSpecial = 1, // 0 byte pseudo or special case.
+ Size8Bytes = 2,
+ Size4Bytes = 3,
+ Size2Bytes = 4,
+
+ // IndexMode - Unindexed, pre-indexed, or post-indexed. Only valid for
+ // load and store ops.
+ IndexModeShift = 7,
+ IndexModeMask = 3 << IndexModeShift,
+ IndexModePre = 1,
+ IndexModePost = 2,
+
+ //===------------------------------------------------------------------===//
+ // Misc flags.
+
+ // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+ // it doesn't have a Rn operand.
+ UnaryDP = 1 << 9,
+
+ //===------------------------------------------------------------------===//
+ // Instruction encoding formats.
+ //
+ FormShift = 10,
+ FormMask = 0x1f << FormShift,
+
+ // Pseudo instructions
+ Pseudo = 0 << FormShift,
+
+ // Multiply instructions
+ MulFrm = 1 << FormShift,
+
+ // Branch instructions
+ BrFrm = 2 << FormShift,
+ BrMiscFrm = 3 << FormShift,
+
+ // Data Processing instructions
+ DPFrm = 4 << FormShift,
+ DPSoRegFrm = 5 << FormShift,
+
+ // Load and Store
+ LdFrm = 6 << FormShift,
+ StFrm = 7 << FormShift,
+ LdMiscFrm = 8 << FormShift,
+ StMiscFrm = 9 << FormShift,
+ LdStMulFrm = 10 << FormShift,
+
+ // Miscellaneous arithmetic instructions
+ ArithMiscFrm = 11 << FormShift,
+
+ // Extend instructions
+ ExtFrm = 12 << FormShift,
+
+ // VFP formats
+ VFPUnaryFrm = 13 << FormShift,
+ VFPBinaryFrm = 14 << FormShift,
+ VFPConv1Frm = 15 << FormShift,
+ VFPConv2Frm = 16 << FormShift,
+ VFPConv3Frm = 17 << FormShift,
+ VFPConv4Frm = 18 << FormShift,
+ VFPConv5Frm = 19 << FormShift,
+ VFPLdStFrm = 20 << FormShift,
+ VFPLdStMulFrm = 21 << FormShift,
+ VFPMiscFrm = 22 << FormShift,
+
+ // Thumb format
+ ThumbFrm = 23 << FormShift,
+
+ //===------------------------------------------------------------------===//
+ // Field shifts - such shifts are used to set fields while generating
+ // machine instructions.
+ M_BitShift = 5,
+ ShiftImmShift = 5,
+ ShiftShift = 7,
+ N_BitShift = 7,
+ ImmHiShift = 8,
+ SoRotImmShift = 8,
+ RegRsShift = 8,
+ ExtRotImmShift = 10,
+ RegRdLoShift = 12,
+ RegRdShift = 12,
+ RegRdHiShift = 16,
+ RegRnShift = 16,
+ S_BitShift = 20,
+ W_BitShift = 21,
+ AM3_I_BitShift = 22,
+ D_BitShift = 22,
+ U_BitShift = 23,
+ P_BitShift = 24,
+ I_BitShift = 25,
+ CondShift = 28
+ };
+}
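+
+// The flags above are packed into TargetInstrDesc::TSFlags and decoded with
+// the masks / shifts they define, e.g.:
+//   unsigned AddrMode = TSFlags & ARMII::AddrModeMask;
+//   unsigned Size = (TSFlags & ARMII::SizeMask) >> ARMII::SizeShift;
+//   unsigned IdxMode = (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
+// as done in GetInstSizeInBytes and convertToThreeAddress.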
+
+class ARMInstrInfo : public TargetInstrInfoImpl {
+ const ARMRegisterInfo RI;
+public:
+ explicit ARMInstrInfo(const ARMSubtarget &STI);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const ARMRegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, const MachineInstr *Orig) const;
+
+ virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ // Branch analysis.
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+ virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ return 0;
+ }
+
+ virtual bool canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const;
+
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+ // Predication support.
+ virtual bool isPredicated(const MachineInstr *MI) const;
+
+ ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
+ int PIdx = MI->findFirstPredOperandIdx();
+ return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+ : ARMCC::AL;
+ }
+
+ virtual
+ bool PredicateInstruction(MachineInstr *MI,
+ const SmallVectorImpl<MachineOperand> &Pred) const;
+
+ virtual
+ bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+ const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+ virtual bool DefinesPredicate(MachineInstr *MI,
+ std::vector<MachineOperand> &Pred) const;
+
+ /// GetInstSize - Returns the size of the specified MachineInstr.
+ ///
+ virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
new file mode 100644
index 0000000..680e772
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -0,0 +1,1390 @@
+//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM specific DAG Nodes.
+//
+
+// Type profiles.
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
+
+def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
+
+def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+
+def SDT_ARMCMov : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisVT<3, i32>]>;
+
+def SDT_ARMBrcond : SDTypeProfile<0, 2,
+ [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBrJT : SDTypeProfile<0, 3,
+ [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+
+def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+
+def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
+
+// Node definitions.
+def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
+def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>;
+
+def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
+ [SDNPInFlag]>;
+def ARMcneg : SDNode<"ARMISD::CNEG", SDT_ARMCMov,
+ [SDNPInFlag]>;
+
+def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
+ [SDNPHasChain]>;
+
+def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp,
+ [SDNPOutFlag]>;
+
+def ARMcmpNZ : SDNode<"ARMISD::CMPNZ", SDT_ARMCmp,
+ [SDNPOutFlag]>;
+
+def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
+
+def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>;
+def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>;
+def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInFlag ]>;
+
+def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
+def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
+def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
+def HasV6 : Predicate<"Subtarget->hasV6Ops()">;
+def IsThumb : Predicate<"Subtarget->isThumb()">;
+def IsThumb2 : Predicate<"Subtarget->isThumb2()">;
+def IsARM : Predicate<"!Subtarget->isThumb()">;
+
+//===----------------------------------------------------------------------===//
+// ARM Flag Definitions.
+
+class RegConstraint<string C> {
+ string Constraints = C;
+}
+
+//===----------------------------------------------------------------------===//
+// ARM specific transformation functions and pattern fragments.
+//
+
+// so_imm_XFORM - Return a so_imm value packed into the format described for
+// so_imm def below.
+def so_imm_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(N->getZExtValue()),
+ MVT::i32);
+}]>;
+
+// so_imm_neg_XFORM - Return a so_imm value packed into the format described for
+// so_imm_neg def below.
+def so_imm_neg_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(-(int)N->getZExtValue()),
+ MVT::i32);
+}]>;
+
+// so_imm_not_XFORM - Return a so_imm value packed into the format described for
+// so_imm_not def below.
+def so_imm_not_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(~(int)N->getZExtValue()),
+ MVT::i32);
+}]>;
+
+// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24.
+def rot_imm : PatLeaf<(i32 imm), [{
+ int32_t v = (int32_t)N->getZExtValue();
+ return v == 8 || v == 16 || v == 24;
+}]>;
+
+/// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15].
+def imm1_15 : PatLeaf<(i32 imm), [{
+ return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16;
+}]>;
+
+/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
+def imm16_31 : PatLeaf<(i32 imm), [{
+ return (int32_t)N->getZExtValue() >= 16 && (int32_t)N->getZExtValue() < 32;
+}]>;
+
+def so_imm_neg :
+ PatLeaf<(imm), [{
+ return ARM_AM::getSOImmVal(-(int)N->getZExtValue()) != -1;
+ }], so_imm_neg_XFORM>;
+
+def so_imm_not :
+ PatLeaf<(imm), [{
+ return ARM_AM::getSOImmVal(~(int)N->getZExtValue()) != -1;
+ }], so_imm_not_XFORM>;
+
+// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
+def sext_16_node : PatLeaf<(i32 GPR:$a), [{
+ return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
+}]>;
+
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// Branch target.
+def brtarget : Operand<OtherVT>;
+
+// A list of registers separated by commas. Used by load/store multiple.
+def reglist : Operand<i32> {
+ let PrintMethod = "printRegisterList";
+}
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+ let PrintMethod = "printCPInstOperand";
+}
+
+def jtblock_operand : Operand<i32> {
+ let PrintMethod = "printJTBlockOperand";
+}
+
+// Local PC labels.
+def pclabel : Operand<i32> {
+ let PrintMethod = "printPCLabel";
+}
+
+// shifter_operand operands: so_reg and so_imm.
+def so_reg : Operand<i32>, // reg reg imm
+ ComplexPattern<i32, 3, "SelectShifterOperandReg",
+ [shl,srl,sra,rotr]> {
+ let PrintMethod = "printSORegOperand";
+ let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+
+// so_imm - Match a 32-bit shifter_operand immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits. so_imm values are
+// represented in the imm field in the same 12-bit form that they are encoded
+// into so_imm instructions: the 8-bit immediate is the least significant bits
+// [bits 0-7], and the 4-bit rotate amount is the next 4 bits [bits 8-11].
+def so_imm : Operand<i32>,
+ PatLeaf<(imm),
+ [{ return ARM_AM::getSOImmVal(N->getZExtValue()) != -1; }],
+ so_imm_XFORM> {
+ let PrintMethod = "printSOImmOperand";
+}
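+
+// For example, 0xFF000000 is a valid so_imm: it is 0xFF rotated right by 8,
+// encoded as imm8 = 0xFF with rotate field 4 (the field holds half the
+// rotation, so only even rotations are representable). Something like
+// 0x00FFFF00 is not representable and must be materialized another way,
+// e.g. via so_imm2part below.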
+
+// Break so_imm's up into two pieces. This handles immediates with up to 16
+// bits set in them. This uses so_imm2part to match and so_imm2part_[12] to
+// get the first/second pieces.
+def so_imm2part : Operand<i32>,
+ PatLeaf<(imm), [{
+ return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+ }]> {
+ let PrintMethod = "printSOImm2PartOperand";
+}
+
+def so_imm2part_1 : SDNodeXForm<imm, [{
+ unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getZExtValue());
+ return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+}]>;
+
+def so_imm2part_2 : SDNodeXForm<imm, [{
+ unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getZExtValue());
+ return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+}]>;
+
+
+// Define ARM specific addressing modes.
+
+// addrmode2 := reg +/- reg shop imm
+// addrmode2 := reg +/- imm12
+//
+def addrmode2 : Operand<i32>,
+ ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+ let PrintMethod = "printAddrMode2Operand";
+ let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
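+
+// E.g. "ldr r0, [r1, #-8]" uses the reg +/- imm12 form, while
+// "ldr r0, [r1, r2, lsl #2]" uses the reg +/- reg shop imm form.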
+
+def am2offset : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrMode2Offset", []> {
+ let PrintMethod = "printAddrMode2OffsetOperand";
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode3 := reg +/- reg
+// addrmode3 := reg +/- imm8
+//
+def addrmode3 : Operand<i32>,
+ ComplexPattern<i32, 3, "SelectAddrMode3", []> {
+ let PrintMethod = "printAddrMode3Operand";
+ let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def am3offset : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrMode3Offset", []> {
+ let PrintMethod = "printAddrMode3OffsetOperand";
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode4 := reg, <mode|W>
+//
+def addrmode4 : Operand<i32>,
+ ComplexPattern<i32, 2, "", []> {
+ let PrintMethod = "printAddrMode4Operand";
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode5 := reg +/- imm8*4
+//
+def addrmode5 : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrMode5", []> {
+ let PrintMethod = "printAddrMode5Operand";
+ let MIOperandInfo = (ops GPR, i32imm);
+}
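+
+// The imm8*4 form gives word-aligned offsets in the range [-1020, 1020],
+// matching the VFP load / store instructions (FLDS, FLDD, FSTS, FSTD).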
+
+// addrmodepc := pc + reg
+//
+def addrmodepc : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrModePC", []> {
+ let PrintMethod = "printAddrModePCOperand";
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// ARM Predicate operand. Defaults to 14 = always (AL). The second part is
+// the CC register, whose default is 0 (no register).
+def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
+ (ops (i32 14), (i32 zero_reg))> {
+ let PrintMethod = "printPredicateOperand";
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+//
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+ let PrintMethod = "printSBitModifierOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction flags. These need to match ARMInstrInfo.h.
+//
+
+// Addressing mode.
+class AddrMode<bits<4> val> {
+ bits<4> Value = val;
+}
+def AddrModeNone : AddrMode<0>;
+def AddrMode1 : AddrMode<1>;
+def AddrMode2 : AddrMode<2>;
+def AddrMode3 : AddrMode<3>;
+def AddrMode4 : AddrMode<4>;
+def AddrMode5 : AddrMode<5>;
+def AddrModeT1 : AddrMode<6>;
+def AddrModeT2 : AddrMode<7>;
+def AddrModeT4 : AddrMode<8>;
+def AddrModeTs : AddrMode<9>;
+
+// Instruction size.
+class SizeFlagVal<bits<3> val> {
+ bits<3> Value = val;
+}
+def SizeInvalid : SizeFlagVal<0>; // Unset.
+def SizeSpecial : SizeFlagVal<1>; // Pseudo or special.
+def Size8Bytes : SizeFlagVal<2>;
+def Size4Bytes : SizeFlagVal<3>;
+def Size2Bytes : SizeFlagVal<4>;
+
+// Load / store index mode.
+class IndexMode<bits<2> val> {
+ bits<2> Value = val;
+}
+def IndexModeNone : IndexMode<0>;
+def IndexModePre : IndexMode<1>;
+def IndexModePost : IndexMode<2>;
+
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
+/// binop that produces a value.
+multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode> {
+ def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
+ opc, " $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>;
+ def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
+ opc, " $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>;
+ def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+ opc, " $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>;
+}
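+
+// A typical instantiation (AND's data-processing opcode is 0b0000) would be:
+//   defm AND : AsI1_bin_irs<0b0000, "and",
+//                           BinOpFrag<(and node:$LHS, node:$RHS)>>;
+// yielding ANDri, ANDrr and ANDrs instructions.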
+
+/// ASI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the
+/// instruction modifies the CPSR register.
+let Defs = [CPSR] in {
+multiclass ASI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode> {
+ def ri : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
+ opc, "s $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>;
+ def rr : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
+ opc, "s $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>;
+ def rs : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+ opc, "s $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>;
+}
+}
+
+/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to AsI1_bin_irs except the instruction does not produce
+/// an explicit result; it only implicitly sets CPSR.
+let Defs = [CPSR] in {
+multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> {
+ def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm,
+ opc, " $a, $b",
+ [(opnode GPR:$a, so_imm:$b)]>;
+ def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm,
+ opc, " $a, $b",
+ [(opnode GPR:$a, GPR:$b)]>;
+ def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+ opc, " $a, $b",
+ [(opnode GPR:$a, so_reg:$b)]>;
+}
+}
+
+/// AI_unary_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
+multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> {
+ def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$Src),
+ opc, " $dst, $Src",
+ [(set GPR:$dst, (opnode GPR:$Src))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{19-16} = 0b1111;
+ }
+ def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$Src, i32imm:$rot),
+ opc, " $dst, $Src, ror $rot",
+ [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{19-16} = 0b1111;
+ }
+}
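+
+// Setting the Rn field (Inst{19-16}) to 0b1111 selects the pure extend form
+// (e.g. sxtb) rather than the extend-and-add form (e.g. sxtab); AI_bin_rrot
+// below leaves Rn free for the addend register.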
+
+/// AI_bin_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> {
+ def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS),
+ opc, " $dst, $LHS, $RHS",
+ [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>,
+ Requires<[IsARM, HasV6]>;
+ def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot),
+ opc, " $dst, $LHS, $RHS, ror $rot",
+ [(set GPR:$dst, (opnode GPR:$LHS,
+ (rotr GPR:$RHS, rot_imm:$rot)))]>,
+ Requires<[IsARM, HasV6]>;
+}
+
+/// AsXI1_bin_c_irs - Same as AsI1_bin_irs, but without the predicate operand
+/// and with the carry bit as an input. It can optionally set CPSR.
+let Uses = [CPSR] in {
+multiclass AsXI1_bin_c_irs<bits<4> opcod, string opc, PatFrag opnode> {
+ def ri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b, cc_out:$s),
+ DPFrm, !strconcat(opc, "${s} $dst, $a, $b"),
+ [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>;
+ def rr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b, cc_out:$s),
+ DPFrm, !strconcat(opc, "${s} $dst, $a, $b"),
+ [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>;
+ def rs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b, cc_out:$s),
+ DPSoRegFrm, !strconcat(opc, "${s} $dst, $a, $b"),
+ [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
+/// entry in the function. The first operand is the ID# for this instruction,
+/// the second is the index into the MachineConstantPool for this entry, and
+/// the third is the size in bytes of this constant pool entry.
+let isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+ i32imm:$size),
+ "${instid:label} ${cpidx:cpentry}", []>;
+
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p),
+ "@ ADJCALLSTACKUP $amt1",
+ [(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
+
+def ADJCALLSTACKDOWN :
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p),
+ "@ ADJCALLSTACKDOWN $amt",
+ [(ARMcallseq_start timm:$amt)]>;
+}
+
+def DWARF_LOC :
+PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file),
+ ".loc $file, $line, $col",
+ [(dwarf_loc (i32 imm:$line), (i32 imm:$col), (i32 imm:$file))]>;
+
+
+// Address computation and loads and stores in PIC mode.
+let isNotDuplicable = 1 in {
+def PICADD : AXI1<0b0100, (outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
+ Pseudo, "$cp:\n\tadd$p $dst, pc, $a",
+ [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
+
+let AddedComplexity = 10 in {
+let canFoldAsLoad = 1 in
+def PICLDR : AXI2ldw<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+ Pseudo, "${addr:label}:\n\tldr$p $dst, $addr",
+ [(set GPR:$dst, (load addrmodepc:$addr))]>;
+
+def PICLDRH : AXI3ldh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+ Pseudo, "${addr:label}:\n\tldr${p}h $dst, $addr",
+ [(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRB : AXI2ldb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+ Pseudo, "${addr:label}:\n\tldr${p}b $dst, $addr",
+ [(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>;
+
+def PICLDRSH : AXI3ldsh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+ Pseudo, "${addr:label}:\n\tldr${p}sh $dst, $addr",
+ [(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRSB : AXI3ldsb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+ Pseudo, "${addr:label}:\n\tldr${p}sb $dst, $addr",
+ [(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>;
+}
+let AddedComplexity = 10 in {
+def PICSTR : AXI2stw<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+ Pseudo, "${addr:label}:\n\tstr$p $src, $addr",
+ [(store GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRH : AXI3sth<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+ Pseudo, "${addr:label}:\n\tstr${p}h $src, $addr",
+ [(truncstorei16 GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+ Pseudo, "${addr:label}:\n\tstr${p}b $src, $addr",
+ [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
+}
+} // isNotDuplicable = 1
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1 in
+ def BX_RET : AI<(outs), (ins), BrMiscFrm, "bx", " lr", [(ARMretflag)]> {
+ let Inst{7-4} = 0b0001;
+ let Inst{19-8} = 0b111111111111;
+ let Inst{27-20} = 0b00010010;
+}
+
+// FIXME: remove when we have a way to mark an MI with these properties.
+// FIXME: $dst1 should be a def. But the extra ops must be in the end of the
+// operand list.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1 in
+ def LDM_RET : AXI4ld<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
+ LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1",
+ []>;
+
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
+ def BL : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
+ "bl ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>;
+
+ def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
+ "bl", " ${func:call}",
+ [(ARMcall_pred tglobaladdr:$func)]>;
+
+ // ARMv5T and above
+ def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
+ "blx $func",
+ [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T]> {
+ let Inst{7-4} = 0b0011;
+ let Inst{19-8} = 0b111111111111;
+ let Inst{27-20} = 0b00010010;
+ }
+
+ let Uses = [LR] in {
+ // ARMv4T
+ def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops),
+ "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink GPR:$func)]>;
+ }
+}
+
+let isBranch = 1, isTerminator = 1 in {
+ // B is "predicable" since it can be xformed into a Bcc.
+ let isBarrier = 1 in {
+ let isPredicable = 1 in
+ def B : ABXI<0b1010, (outs), (ins brtarget:$target), "b $target",
+ [(br bb:$target)]>;
+
+ let isNotDuplicable = 1, isIndirectBranch = 1 in {
+ def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
+ "mov pc, $target \n$jt",
+ [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> {
+ let Inst{20} = 0; // S Bit
+ let Inst{24-21} = 0b1101;
+ let Inst{27-26} = {0,0};
+ }
+ def BR_JTm : JTI<(outs),
+ (ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id),
+ "ldr pc, $target \n$jt",
+ [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
+ imm:$id)]> {
+ let Inst{20} = 1; // L bit
+ let Inst{21} = 0; // W bit
+ let Inst{22} = 0; // B bit
+ let Inst{24} = 1; // P bit
+ let Inst{27-26} = {0,1};
+ }
+ def BR_JTadd : JTI<(outs),
+ (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id),
+ "add pc, $target, $idx \n$jt",
+ [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
+ imm:$id)]> {
+ let Inst{20} = 0; // S bit
+ let Inst{24-21} = 0b0100;
+ let Inst{27-26} = {0,0};
+ }
+ } // isNotDuplicable = 1, isIndirectBranch = 1
+ } // isBarrier = 1
+
+ // FIXME: should be able to write a pattern for ARMBrcond, but can't use
+ // a two-value operand where a dag node expects two operands. :(
+ def Bcc : ABI<0b1010, (outs), (ins brtarget:$target),
+ "b", " $target",
+ [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Load / store Instructions.
+//
+
+// Load
+let canFoldAsLoad = 1 in
+def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+ "ldr", " $dst, $addr",
+ [(set GPR:$dst, (load addrmode2:$addr))]>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in
+def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+ "ldr", " $dst, $addr", []>;
+
+// Loads with zero extension
+def LDRH : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
+ "ldr", "h $dst, $addr",
+ [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>;
+
+def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+ "ldr", "b $dst, $addr",
+ [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>;
+
+// Loads with sign extension
+def LDRSH : AI3ldsh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
+ "ldr", "sh $dst, $addr",
+ [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>;
+
+def LDRSB : AI3ldsb<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
+ "ldr", "sb $dst, $addr",
+ [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>;
+
+let mayLoad = 1 in {
+// Load doubleword
+def LDRD : AI3ldd<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
+ "ldr", "d $dst, $addr",
+ []>, Requires<[IsARM, HasV5T]>;
+
+// Indexed loads
+def LDR_PRE : AI2ldwpr<(outs GPR:$dst, GPR:$base_wb),
+ (ins addrmode2:$addr), LdFrm,
+ "ldr", " $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDR_POST : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb),
+ (ins GPR:$base, am2offset:$offset), LdFrm,
+ "ldr", " $dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRH_PRE : AI3ldhpr<(outs GPR:$dst, GPR:$base_wb),
+ (ins addrmode3:$addr), LdMiscFrm,
+ "ldr", "h $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRH_POST : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb),
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ "ldr", "h $dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRB_PRE : AI2ldbpr<(outs GPR:$dst, GPR:$base_wb),
+ (ins addrmode2:$addr), LdFrm,
+ "ldr", "b $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRB_POST : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb),
+ (ins GPR:$base,am2offset:$offset), LdFrm,
+ "ldr", "b $dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRSH_PRE : AI3ldshpr<(outs GPR:$dst, GPR:$base_wb),
+ (ins addrmode3:$addr), LdMiscFrm,
+ "ldr", "sh $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRSH_POST: AI3ldshpo<(outs GPR:$dst, GPR:$base_wb),
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ "ldr", "sh $dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb),
+ (ins addrmode3:$addr), LdMiscFrm,
+ "ldr", "sb $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb),
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ "ldr", "sb $dst, [$base], $offset", "$base = $base_wb", []>;
+}
+
+// Store
+def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm,
+ "str", " $src, $addr",
+ [(store GPR:$src, addrmode2:$addr)]>;
+
+// Stores with truncate
+def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm,
+ "str", "h $src, $addr",
+ [(truncstorei16 GPR:$src, addrmode3:$addr)]>;
+
+def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm,
+ "str", "b $src, $addr",
+ [(truncstorei8 GPR:$src, addrmode2:$addr)]>;
+
+// Store doubleword
+let mayStore = 1 in
+def STRD : AI3std<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm,
+ "str", "d $src, $addr",
+ []>, Requires<[IsARM, HasV5T]>;
+
+// Indexed stores
+def STR_PRE : AI2stwpr<(outs GPR:$base_wb),
+ (ins GPR:$src, GPR:$base, am2offset:$offset), StFrm,
+ "str", " $src, [$base, $offset]!", "$base = $base_wb",
+ [(set GPR:$base_wb,
+ (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>;
+
+def STR_POST : AI2stwpo<(outs GPR:$base_wb),
+ (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ "str", " $src, [$base], $offset", "$base = $base_wb",
+ [(set GPR:$base_wb,
+ (post_store GPR:$src, GPR:$base, am2offset:$offset))]>;
+
+def STRH_PRE : AI3sthpr<(outs GPR:$base_wb),
+ (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm,
+ "str", "h $src, [$base, $offset]!", "$base = $base_wb",
+ [(set GPR:$base_wb,
+ (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>;
+
+def STRH_POST: AI3sthpo<(outs GPR:$base_wb),
+ (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm,
+ "str", "h $src, [$base], $offset", "$base = $base_wb",
+ [(set GPR:$base_wb, (post_truncsti16 GPR:$src,
+ GPR:$base, am3offset:$offset))]>;
+
+def STRB_PRE : AI2stbpr<(outs GPR:$base_wb),
+ (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ "str", "b $src, [$base, $offset]!", "$base = $base_wb",
+ [(set GPR:$base_wb, (pre_truncsti8 GPR:$src,
+ GPR:$base, am2offset:$offset))]>;
+
+def STRB_POST: AI2stbpo<(outs GPR:$base_wb),
+ (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ "str", "b $src, [$base], $offset", "$base = $base_wb",
+ [(set GPR:$base_wb, (post_truncsti8 GPR:$src,
+ GPR:$base, am2offset:$offset))]>;
+
+//===----------------------------------------------------------------------===//
+// Load / store multiple Instructions.
+//
+
+// FIXME: $dst1 should be a def.
+let mayLoad = 1 in
+def LDM : AXI4ld<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
+ LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1",
+ []>;
+
+let mayStore = 1 in
+def STM : AXI4st<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$src1, variable_ops),
+ LdStMulFrm, "stm${p}${addr:submode} $addr, $src1",
+ []>;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+
+def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm,
+ "mov", " $dst, $src", []>, UnaryDP;
+def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm,
+ "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm,
+ "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP;
+
+def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
+ "mov", " $dst, $src, rrx",
+ [(set GPR:$dst, (ARMrrx GPR:$src))]>, UnaryDP;
+
+// These aren't really mov instructions, but we have to define them this way
+// due to flag operands.
+
+let Defs = [CPSR] in {
+def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
+ "mov", "s $dst, $src, lsr #1",
+ [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP;
+def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
+ "mov", "s $dst, $src, asr #1",
+ [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP;
+}
+
+//===----------------------------------------------------------------------===//
+// Extend Instructions.
+//
+
+// Sign extenders
+
+defm SXTB : AI_unary_rrot<0b01101010,
+ "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
+defm SXTH : AI_unary_rrot<0b01101011,
+ "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
+
+defm SXTAB : AI_bin_rrot<0b01101010,
+ "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+defm SXTAH : AI_bin_rrot<0b01101011,
+ "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+
+// TODO: SXT(A){B|H}16
+
+// Zero extenders
+
+let AddedComplexity = 16 in {
+defm UXTB : AI_unary_rrot<0b01101110,
+ "uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>;
+defm UXTH : AI_unary_rrot<0b01101111,
+ "uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+defm UXTB16 : AI_unary_rrot<0b01101100,
+ "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+def : ARMV6Pat<(and (shl GPR:$Src, 8), 0xFF00FF),
+ (UXTB16r_rot GPR:$Src, 24)>;
+def : ARMV6Pat<(and (srl GPR:$Src, 8), 0xFF00FF),
+ (UXTB16r_rot GPR:$Src, 8)>;
+
+defm UXTAB : AI_bin_rrot<0b01101110, "uxtab",
+ BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+defm UXTAH : AI_bin_rrot<0b01101111, "uxtah",
+ BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+}
+
+// This isn't safe in general: the add is two 16-bit adds, not one 32-bit add.
+//defm UXTAB16 : xxx<"uxtab16", 0xff00ff>;
+
+// TODO: UXT(A){B|H}16
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions.
+//
+
+defm ADD : AsI1_bin_irs<0b0100, "add",
+ BinOpFrag<(add node:$LHS, node:$RHS)>>;
+defm SUB : AsI1_bin_irs<0b0010, "sub",
+ BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+
+// ADD and SUB with 's' bit set.
+defm ADDS : ASI1_bin_s_irs<0b0100, "add",
+ BinOpFrag<(addc node:$LHS, node:$RHS)>>;
+defm SUBS : ASI1_bin_s_irs<0b0010, "sub",
+ BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+
+// FIXME: Do not allow ADC / SBC to be predicated for now.
+defm ADC : AsXI1_bin_c_irs<0b0101, "adc",
+ BinOpFrag<(adde node:$LHS, node:$RHS)>>;
+defm SBC : AsXI1_bin_c_irs<0b0110, "sbc",
+ BinOpFrag<(sube node:$LHS, node:$RHS)>>;
+
+// These don't define reg/reg forms, because they are handled above.
+def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
+ "rsb", " $dst, $a, $b",
+ [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]>;
+
+def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+ "rsb", " $dst, $a, $b",
+ [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]>;
+
+// RSB with 's' bit set.
+let Defs = [CPSR] in {
+def RSBSri : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
+ "rsb", "s $dst, $a, $b",
+ [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]>;
+def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+ "rsb", "s $dst, $a, $b",
+ [(set GPR:$dst, (subc so_reg:$b, GPR:$a))]>;
+}
+
+// FIXME: Do not allow RSC to be predicated for now. But they can set CPSR.
+let Uses = [CPSR] in {
+def RSCri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b, cc_out:$s),
+ DPFrm, "rsc${s} $dst, $a, $b",
+ [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>;
+def RSCrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b, cc_out:$s),
+ DPSoRegFrm, "rsc${s} $dst, $a, $b",
+ [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>;
+}
+
+// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
+def : ARMPat<(add GPR:$src, so_imm_neg:$imm),
+ (SUBri GPR:$src, so_imm_neg:$imm)>;
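+// For example, (sub GPR:$src, 1) reaches instruction selection as
+// (add GPR:$src, -1); so_imm_neg matches the -1 and the pattern above
+// selects "sub $dst, $src, #1" via SUBri.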
+
+//def : ARMPat<(addc GPR:$src, so_imm_neg:$imm),
+// (SUBSri GPR:$src, so_imm_neg:$imm)>;
+//def : ARMPat<(adde GPR:$src, so_imm_neg:$imm),
+// (SBCri GPR:$src, so_imm_neg:$imm)>;
+
+// Note: These are implemented in C++ code, because they have to generate
+// ADD/SUBrs instructions, which use a complex pattern that a xform function
+// cannot produce.
+// (mul X, 2^n+1) -> (add (X << n), X)
+// (mul X, 2^n-1) -> (rsb X, (X << n))
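+// For example, with n = 3: (mul X, 9) becomes (add (shl X, 3), X), emitted
+// as "add dst, X, X, lsl #3", and (mul X, 7) becomes (rsb X, (shl X, 3)).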
+
+
+//===----------------------------------------------------------------------===//
+// Bitwise Instructions.
+//
+
+defm AND : AsI1_bin_irs<0b0000, "and",
+ BinOpFrag<(and node:$LHS, node:$RHS)>>;
+defm ORR : AsI1_bin_irs<0b1100, "orr",
+ BinOpFrag<(or node:$LHS, node:$RHS)>>;
+defm EOR : AsI1_bin_irs<0b0001, "eor",
+ BinOpFrag<(xor node:$LHS, node:$RHS)>>;
+defm BIC : AsI1_bin_irs<0b1110, "bic",
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm,
+ "mvn", " $dst, $src",
+ [(set GPR:$dst, (not GPR:$src))]>, UnaryDP;
+def MVNs : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm,
+ "mvn", " $dst, $src",
+ [(set GPR:$dst, (not so_reg:$src))]>, UnaryDP;
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm,
+ "mvn", " $dst, $imm",
+ [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP;
+
+def : ARMPat<(and GPR:$src, so_imm_not:$imm),
+ (BICri GPR:$src, so_imm_not:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Multiply Instructions.
+//
+
+def MUL : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+ "mul", " $dst, $a, $b",
+ [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
+
+def MLA : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+ "mla", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
+
+// Extra precision multiplies with low / high results
+def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst),
+ (ins GPR:$a, GPR:$b),
+ "smull", " $ldst, $hdst, $a, $b", []>;
+
+def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst),
+ (ins GPR:$a, GPR:$b),
+ "umull", " $ldst, $hdst, $a, $b", []>;
+
+// Multiply + accumulate
+def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst),
+ (ins GPR:$a, GPR:$b),
+ "smlal", " $ldst, $hdst, $a, $b", []>;
+
+def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst),
+ (ins GPR:$a, GPR:$b),
+ "umlal", " $ldst, $hdst, $a, $b", []>;
+
+def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst),
+ (ins GPR:$a, GPR:$b),
+ "umaal", " $ldst, $hdst, $a, $b", []>,
+ Requires<[IsARM, HasV6]>;
+
+// Most significant word multiply
+def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+ "smmul", " $dst, $a, $b",
+ [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{7-4} = 0b0001;
+ let Inst{15-12} = 0b1111;
+}
+
+def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+ "smmla", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{7-4} = 0b0001;
+}
+
+
+def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+ "smmls", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{7-4} = 0b1101;
+}
+
+multiclass AI_smul<string opc, PatFrag opnode> {
+ def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+ !strconcat(opc, "bb"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+ (sext_inreg GPR:$b, i16)))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 0;
+ let Inst{6} = 0;
+ }
+
+ def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+ !strconcat(opc, "bt"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+ (sra GPR:$b, 16)))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 0;
+ let Inst{6} = 1;
+ }
+
+ def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+ !strconcat(opc, "tb"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sra GPR:$a, 16),
+ (sext_inreg GPR:$b, i16)))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 1;
+ let Inst{6} = 0;
+ }
+
+ def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+ !strconcat(opc, "tt"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sra GPR:$a, 16),
+ (sra GPR:$b, 16)))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 1;
+ let Inst{6} = 1;
+ }
+
+ def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+ !strconcat(opc, "wb"), " $dst, $a, $b",
+ [(set GPR:$dst, (sra (opnode GPR:$a,
+ (sext_inreg GPR:$b, i16)), 16))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 1;
+ let Inst{6} = 0;
+ }
+
+ def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+ !strconcat(opc, "wt"), " $dst, $a, $b",
+ [(set GPR:$dst, (sra (opnode GPR:$a,
+ (sra GPR:$b, 16)), 16))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 1;
+ let Inst{6} = 1;
+ }
+}
+
+
+multiclass AI_smla<string opc, PatFrag opnode> {
+ def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+ !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc,
+ (opnode (sext_inreg GPR:$a, i16),
+ (sext_inreg GPR:$b, i16))))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 0;
+ let Inst{6} = 0;
+ }
+
+ def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+ !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
+ (sra GPR:$b, 16))))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 0;
+ let Inst{6} = 1;
+ }
+
+ def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+ !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16),
+ (sext_inreg GPR:$b, i16))))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 1;
+ let Inst{6} = 0;
+ }
+
+ def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+ !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16),
+ (sra GPR:$b, 16))))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 1;
+ let Inst{6} = 1;
+ }
+
+ def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+ !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+ (sext_inreg GPR:$b, i16)), 16)))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 0;
+ let Inst{6} = 0;
+ }
+
+ def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+ !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+ (sra GPR:$b, 16)), 16)))]>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{5} = 0;
+ let Inst{6} = 1;
+ }
+}
+
+defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// TODO: Halfword multiply accumulate long: SMLAL<x><y>
+// TODO: Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+
+//===----------------------------------------------------------------------===//
+// Misc. Arithmetic Instructions.
+//
+
+def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src),
+ "clz", " $dst, $src",
+ [(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]> {
+ let Inst{7-4} = 0b0001;
+ let Inst{11-8} = 0b1111;
+ let Inst{19-16} = 0b1111;
+}
+
+def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
+ "rev", " $dst, $src",
+ [(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]> {
+ let Inst{7-4} = 0b0011;
+ let Inst{11-8} = 0b1111;
+ let Inst{19-16} = 0b1111;
+}
+
+def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
+ "rev16", " $dst, $src",
+ [(set GPR:$dst,
+ (or (and (srl GPR:$src, 8), 0xFF),
+ (or (and (shl GPR:$src, 8), 0xFF00),
+ (or (and (srl GPR:$src, 8), 0xFF0000),
+ (and (shl GPR:$src, 8), 0xFF000000)))))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{7-4} = 0b1011;
+ let Inst{11-8} = 0b1111;
+ let Inst{19-16} = 0b1111;
+}
+
+def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src),
+ "revsh", " $dst, $src",
+ [(set GPR:$dst,
+ (sext_inreg
+ (or (srl (and GPR:$src, 0xFF00), 8),
+ (shl GPR:$src, 8)), i16))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{7-4} = 0b1011;
+ let Inst{11-8} = 0b1111;
+ let Inst{19-16} = 0b1111;
+}
+
+def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst),
+ (ins GPR:$src1, GPR:$src2, i32imm:$shamt),
+ "pkhbt", " $dst, $src1, $src2, LSL $shamt",
+ [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
+ (and (shl GPR:$src2, (i32 imm:$shamt)),
+ 0xFFFF0000)))]>,
+ Requires<[IsARM, HasV6]> {
+ let Inst{6-4} = 0b001;
+}
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)),
+ (PKHBT GPR:$src1, GPR:$src2, 0)>;
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)),
+ (PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>;
+
+
+def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst),
+ (ins GPR:$src1, GPR:$src2, i32imm:$shamt),
+ "pkhtb", " $dst, $src1, $src2, ASR $shamt",
+ [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
+ (and (sra GPR:$src2, imm16_31:$shamt),
+ 0xFFFF)))]>, Requires<[IsARM, HasV6]> {
+ let Inst{6-4} = 0b101;
+}
+
+// Alternate cases for PKHTB where identities eliminate some nodes. Note that
+// a shift amount of 0 is *not legal* here; that encoding is PKHBT instead.
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, 16)),
+ (PKHTB GPR:$src1, GPR:$src2, 16)>;
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000),
+ (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)),
+ (PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>;
+
+//===----------------------------------------------------------------------===//
+// Comparison Instructions...
+//
+
+defm CMP : AI1_cmp_irs<0b1010, "cmp",
+ BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+defm CMN : AI1_cmp_irs<0b1011, "cmn",
+ BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
+
+// Note that TST/TEQ don't set all the same flags that CMP does!
+defm TST : AI1_cmp_irs<0b1000, "tst",
+ BinOpFrag<(ARMcmpNZ (and node:$LHS, node:$RHS), 0)>>;
+defm TEQ : AI1_cmp_irs<0b1001, "teq",
+ BinOpFrag<(ARMcmpNZ (xor node:$LHS, node:$RHS), 0)>>;
+
+defm CMPnz : AI1_cmp_irs<0b1010, "cmp",
+ BinOpFrag<(ARMcmpNZ node:$LHS, node:$RHS)>>;
+defm CMNnz : AI1_cmp_irs<0b1011, "cmn",
+ BinOpFrag<(ARMcmpNZ node:$LHS,(ineg node:$RHS))>>;
+
+def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm),
+ (CMNri GPR:$src, so_imm_neg:$imm)>;
+
+def : ARMPat<(ARMcmpNZ GPR:$src, so_imm_neg:$imm),
+ (CMNri GPR:$src, so_imm_neg:$imm)>;
+
+
+// Conditional moves
+// FIXME: should be able to write a pattern for ARMcmov, but can't use
+// a two-value operand where a dag node expects two operands. :(
+def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm,
+ "mov", " $dst, $true",
+ [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
+ RegConstraint<"$false = $dst">, UnaryDP;
+
+def MOVCCs : AI1<0b1101, (outs GPR:$dst),
+ (ins GPR:$false, so_reg:$true), DPSoRegFrm,
+ "mov", " $dst, $true",
+ [/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>,
+ RegConstraint<"$false = $dst">, UnaryDP;
+
+def MOVCCi : AI1<0b1101, (outs GPR:$dst),
+ (ins GPR:$false, so_imm:$true), DPFrm,
+ "mov", " $dst, $true",
+ [/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
+ RegConstraint<"$false = $dst">, UnaryDP;
+
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo,
+ !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
+ "${:private}PCRELL${:uid}+8))\n"),
+ !strconcat("${:private}PCRELL${:uid}:\n\t",
+ "add$p $dst, pc, #PCRELV${:uid}")),
+ []>;
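+
+// With a label L, a uid of 0, no predicate, and ignoring the target's
+// private-label prefix, LEApcrel expands to assembly along the lines of:
+//   .set PCRELV0, (L-(PCRELL0+8))
+// PCRELL0:
+//   add $dst, pc, #PCRELV0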
+
+def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, i32imm:$id, pred:$p),
+ Pseudo,
+ !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
+ "${:private}PCRELL${:uid}+8))\n"),
+ !strconcat("${:private}PCRELL${:uid}:\n\t",
+ "add$p $dst, pc, #PCRELV${:uid}")),
+ []>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+let isCall = 1,
+ Defs = [R0, R12, LR, CPSR] in {
+ def TPsoft : ABXI<0b1011, (outs), (ins),
+ "bl __aeabi_read_tp",
+ [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+// eh_sjlj_setjmp() is a three-instruction sequence that stores the return
+// address and saves #0 in R0 for the non-longjmp case.
+// Since by its nature we may be coming from some other function to get
+// here, and we're using the stack frame for the containing function to
+// save/restore registers, we can't keep anything live in regs across
+// the eh_sjlj_setjmp(), else it will almost certainly have been clobbered
+// when we get here from a longjmp(). We force everything out of registers
+// except for our own input by listing the relevant registers in Defs. By
+// doing so, we also cause the prologue/epilogue code to actively preserve
+// all of the callee-saved registers, which is exactly what we want.
+let Defs =
+ [ R0, R1, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR,
+ D0, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15 ] in {
+ def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src),
+ AddrModeNone, SizeSpecial, IndexModeNone, Pseudo,
+ "add r0, pc, #4\n\t"
+ "str r0, [$src, #+4]\n\t"
+ "mov r0, #0 @ eh_setjmp", "",
+ [(set R0, (ARMeh_sjlj_setjmp GPR:$src))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>;
+def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>;
+def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+ (LEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Large immediate handling.
+
+// Two piece so_imms.
+let isReMaterializable = 1 in
+def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), Pseudo,
+ "mov", " $dst, $src",
+ [(set GPR:$dst, so_imm2part:$src)]>;
+
+def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS),
+ (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+ (so_imm2part_2 imm:$RHS))>;
+def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
+ (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+ (so_imm2part_2 imm:$RHS))>;
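+// For example, 0x00FF00FF is not a valid so_imm but splits into the two
+// so_imms 0xFF0000 and 0xFF, so (or GPR:$LHS, 0x00FF00FF) is selected as
+// two ORRri instructions, one per piece.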
+
+// TODO: add,sub,and, 3-instr forms?
+
+
+// Direct calls
+def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
+
+// zextload i1 -> zextload i8
+def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>;
+
+// extload -> zextload
+def : ARMPat<(extloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>;
+def : ARMPat<(extloadi8 addrmode2:$addr), (LDRB addrmode2:$addr)>;
+def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>;
+
+def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
+def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
+
+// smul* and smla*
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra (shl GPR:$b, 16), 16)),
+ (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
+ (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16)),
+ (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, 16)),
+ (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16)),
+ (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, 16), sext_16_node:$b),
+ (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16),
+ (SMULWB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), 16),
+ (SMULWB GPR:$a, GPR:$b)>;
+
+def : ARMV5TEPat<(add GPR:$acc,
+ (mul (sra (shl GPR:$a, 16), 16),
+ (sra (shl GPR:$b, 16), 16))),
+ (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+ (mul sext_16_node:$a, sext_16_node:$b)),
+ (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+ (mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16))),
+ (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+ (mul sext_16_node:$a, (sra GPR:$b, 16))),
+ (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+ (mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16))),
+ (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+ (mul (sra GPR:$a, 16), sext_16_node:$b)),
+ (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+ (sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16)),
+ (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+ (sra (mul GPR:$a, sext_16_node:$b), 16)),
+ (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+
+//===----------------------------------------------------------------------===//
+// Thumb Support
+//
+
+include "ARMInstrThumb.td"
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//
+
+include "ARMInstrVFP.td"
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
new file mode 100644
index 0000000..ffb83a8
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -0,0 +1,562 @@
+//===- ARMInstrThumb.td - Thumb support for ARM ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Thumb specific DAG Nodes.
+//
+
+def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def imm_neg_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
+}]>;
+def imm_comp_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
+}]>;
+
+
+/// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7].
+def imm0_7 : PatLeaf<(i32 imm), [{
+ return (uint32_t)N->getZExtValue() < 8;
+}]>;
+def imm0_7_neg : PatLeaf<(i32 imm), [{
+ return (uint32_t)-N->getZExtValue() < 8;
+}], imm_neg_XFORM>;
+
+def imm0_255 : PatLeaf<(i32 imm), [{
+ return (uint32_t)N->getZExtValue() < 256;
+}]>;
+def imm0_255_comp : PatLeaf<(i32 imm), [{
+ return ~((uint32_t)N->getZExtValue()) < 256;
+}]>;
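+
+// imm0_255_comp matches constants whose bitwise complement fits in 8 bits,
+// e.g. 0xFFFFFF00; the ThumbPat at the end of this file materializes such a
+// constant with a "mov" of the complement followed by "mvn".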
+
+def imm8_255 : PatLeaf<(i32 imm), [{
+ return (uint32_t)N->getZExtValue() >= 8 && (uint32_t)N->getZExtValue() < 256;
+}]>;
+def imm8_255_neg : PatLeaf<(i32 imm), [{
+ unsigned Val = -N->getZExtValue();
+ return Val >= 8 && Val < 256;
+}], imm_neg_XFORM>;
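+
+// For example, (add tGPR:$lhs, -5) matches imm0_7_neg; imm_neg_XFORM rewrites
+// the operand to 5 and the tSUBi3 pattern below selects
+// "sub $dst, $lhs, #5". imm8_255_neg and tSUBi8 handle -8 through -255 the
+// same way.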
+
+// Break immediates up into two pieces: an immediate plus a left shift. This
+// uses thumb_immshifted to match and thumb_immshifted_val and
+// thumb_immshifted_shamt to get the val/shift pieces.
+def thumb_immshifted : PatLeaf<(imm), [{
+ return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
+}]>;
+
+def thumb_immshifted_val : SDNodeXForm<imm, [{
+ unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
+ return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def thumb_immshifted_shamt : SDNodeXForm<imm, [{
+ unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
+ return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
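+
+// For example, 0x9000 = 9 << 12, so the ThumbPat at the end of this file
+// materializes it as "mov $dst, #9" followed by "lsl $dst, $dst, #12".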
+
+// Define Thumb specific addressing modes.
+
+// t_addrmode_rr := reg + reg
+//
+def t_addrmode_rr : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
+ let PrintMethod = "printThumbAddrModeRROperand";
+ let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_s4 := reg + reg
+// reg + imm5 * 4
+//
+def t_addrmode_s4 : Operand<i32>,
+ ComplexPattern<i32, 3, "SelectThumbAddrModeS4", []> {
+ let PrintMethod = "printThumbAddrModeS4Operand";
+ let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+}
+
+// t_addrmode_s2 := reg + reg
+// reg + imm5 * 2
+//
+def t_addrmode_s2 : Operand<i32>,
+ ComplexPattern<i32, 3, "SelectThumbAddrModeS2", []> {
+ let PrintMethod = "printThumbAddrModeS2Operand";
+ let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+}
+
+// t_addrmode_s1 := reg + reg
+// reg + imm5
+//
+def t_addrmode_s1 : Operand<i32>,
+ ComplexPattern<i32, 3, "SelectThumbAddrModeS1", []> {
+ let PrintMethod = "printThumbAddrModeS1Operand";
+ let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+}
+
+// t_addrmode_sp := sp + imm8 * 4
+//
+def t_addrmode_sp : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
+ let PrintMethod = "printThumbAddrModeSPOperand";
+ let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+let Defs = [SP], Uses = [SP] in {
+def tADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "@ tADJCALLSTACKUP $amt1",
+ [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb]>;
+
+def tADJCALLSTACKDOWN :
+PseudoInst<(outs), (ins i32imm:$amt),
+ "@ tADJCALLSTACKDOWN $amt",
+ [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb]>;
+}
+
+let isNotDuplicable = 1 in
+def tPICADD : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, pclabel:$cp),
+ "$cp:\n\tadd $dst, pc",
+ [(set tGPR:$dst, (ARMpic_add tGPR:$lhs, imm:$cp))]>;
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1 in {
+ def tBX_RET : TI<(outs), (ins), "bx lr", [(ARMretflag)]>;
+ // Alternative return instruction used by vararg functions.
+ def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), "bx $target", []>;
+}
+
+// FIXME: remove when we have a way to mark an MI with these properties.
+let isReturn = 1, isTerminator = 1 in
+def tPOP_RET : TI<(outs reglist:$dst1, variable_ops), (ins),
+ "pop $dst1", []>;
+
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7] in {
+ def tBL : TIx2<(outs), (ins i32imm:$func, variable_ops),
+ "bl ${func:call}",
+ [(ARMtcall tglobaladdr:$func)]>;
+ // ARMv5T and above
+ def tBLXi : TIx2<(outs), (ins i32imm:$func, variable_ops),
+ "blx ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>, Requires<[HasV5T]>;
+ def tBLXr : TI<(outs), (ins tGPR:$func, variable_ops),
+ "blx $func",
+ [(ARMtcall tGPR:$func)]>, Requires<[HasV5T]>;
+ // ARMv4T
+ def tBX : TIx2<(outs), (ins tGPR:$func, variable_ops),
+ "cpy lr, pc\n\tbx $func",
+ [(ARMcall_nolink tGPR:$func)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+ let isBarrier = 1 in {
+ let isPredicable = 1 in
+ def tB : TI<(outs), (ins brtarget:$target), "b $target",
+ [(br bb:$target)]>;
+
+ // Far jump
+ def tBfar : TIx2<(outs), (ins brtarget:$target), "bl $target\t@ far jump",[]>;
+
+ def tBR_JTr : TJTI<(outs),
+ (ins tGPR:$target, jtblock_operand:$jt, i32imm:$id),
+ "cpy pc, $target \n\t.align\t2\n$jt",
+ [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>;
+ }
+}
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let isBranch = 1, isTerminator = 1 in
+ def tBcc : TI<(outs), (ins brtarget:$target, pred:$cc), "b$cc $target",
+ [/*(ARMbrcond bb:$target, imm:$cc)*/]>;
+
+//===----------------------------------------------------------------------===//
+// Load Store Instructions.
+//
+
+let canFoldAsLoad = 1 in
+def tLDR : TI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr),
+ "ldr $dst, $addr",
+ [(set tGPR:$dst, (load t_addrmode_s4:$addr))]>;
+
+def tLDRB : TI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr),
+ "ldrb $dst, $addr",
+ [(set tGPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>;
+
+def tLDRH : TI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr),
+ "ldrh $dst, $addr",
+ [(set tGPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>;
+
+def tLDRSB : TI1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr),
+ "ldrsb $dst, $addr",
+ [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>;
+
+def tLDRSH : TI2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr),
+ "ldrsh $dst, $addr",
+ [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>;
+
+let canFoldAsLoad = 1 in
+def tLDRspi : TIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr),
+ "ldr $dst, $addr",
+ [(set tGPR:$dst, (load t_addrmode_sp:$addr))]>;
+
+// Special instruction for restore. It must not clobber the condition register
+// when it is expanded by eliminateCallFramePseudoInstr().
+let canFoldAsLoad = 1, mayLoad = 1 in
+def tRestore : TIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr),
+ "ldr $dst, $addr", []>;
+
+// Load tconstpool
+let canFoldAsLoad = 1 in
+def tLDRpci : TIs<(outs tGPR:$dst), (ins i32imm:$addr),
+ "ldr $dst, $addr",
+ [(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in
+def tLDRcp : TIs<(outs tGPR:$dst), (ins i32imm:$addr),
+ "ldr $dst, $addr", []>;
+
+def tSTR : TI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr),
+ "str $src, $addr",
+ [(store tGPR:$src, t_addrmode_s4:$addr)]>;
+
+def tSTRB : TI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr),
+ "strb $src, $addr",
+ [(truncstorei8 tGPR:$src, t_addrmode_s1:$addr)]>;
+
+def tSTRH : TI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr),
+ "strh $src, $addr",
+ [(truncstorei16 tGPR:$src, t_addrmode_s2:$addr)]>;
+
+def tSTRspi : TIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr),
+ "str $src, $addr",
+ [(store tGPR:$src, t_addrmode_sp:$addr)]>;
+
+let mayStore = 1 in {
+// Special instruction for spill. It must not clobber the condition register
+// when it is expanded by eliminateCallFramePseudoInstr().
+def tSpill : TIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr),
+ "str $src, $addr", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Load / store multiple Instructions.
+//
+
+// TODO: A7-44: LDMIA - load multiple
+
+let mayLoad = 1 in
+def tPOP : TI<(outs reglist:$dst1, variable_ops), (ins),
+ "pop $dst1", []>;
+
+let mayStore = 1 in
+def tPUSH : TI<(outs), (ins reglist:$src1, variable_ops),
+ "push $src1", []>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions.
+//
+
+// Add with carry
+def tADC : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "adc $dst, $rhs",
+ [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>;
+
+def tADDS : TI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "add $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (addc tGPR:$lhs, tGPR:$rhs))]>;
+
+
+def tADDi3 : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
+ "add $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>;
+
+def tADDi8 : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
+ "add $dst, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>;
+
+def tADDrr : TI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "add $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>;
+
+def tADDhirr : TIt<(outs tGPR:$dst), (ins GPR:$lhs, GPR:$rhs),
+ "add $dst, $rhs @ addhirr", []>;
+
+def tADDrPCi : TI<(outs tGPR:$dst), (ins i32imm:$rhs),
+ "add $dst, pc, $rhs * 4", []>;
+
+def tADDrSPi : TI<(outs tGPR:$dst), (ins GPR:$sp, i32imm:$rhs),
+ "add $dst, $sp, $rhs * 4 @ addrspi", []>;
+
+def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+ "add $dst, $rhs * 4", []>;
+
+def tAND : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "and $dst, $rhs",
+ [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>;
+
+def tASRri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
+ "asr $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (sra tGPR:$lhs, imm:$rhs))]>;
+
+def tASRrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "asr $dst, $rhs",
+ [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>;
+
+def tBIC : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "bic $dst, $rhs",
+ [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>;
+
+
+def tCMN : TI<(outs), (ins tGPR:$lhs, tGPR:$rhs),
+ "cmn $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
+
+def tCMPi8 : TI<(outs), (ins tGPR:$lhs, i32imm:$rhs),
+ "cmp $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>;
+
+def tCMPr : TI<(outs), (ins tGPR:$lhs, tGPR:$rhs),
+ "cmp $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>;
+
+def tTST : TI<(outs), (ins tGPR:$lhs, tGPR:$rhs),
+ "tst $lhs, $rhs",
+ [(ARMcmpNZ (and tGPR:$lhs, tGPR:$rhs), 0)]>;
+
+def tCMNNZ : TI<(outs), (ins tGPR:$lhs, tGPR:$rhs),
+ "cmn $lhs, $rhs",
+ [(ARMcmpNZ tGPR:$lhs, (ineg tGPR:$rhs))]>;
+
+def tCMPNZi8 : TI<(outs), (ins tGPR:$lhs, i32imm:$rhs),
+ "cmp $lhs, $rhs",
+ [(ARMcmpNZ tGPR:$lhs, imm0_255:$rhs)]>;
+
+def tCMPNZr : TI<(outs), (ins tGPR:$lhs, tGPR:$rhs),
+ "cmp $lhs, $rhs",
+ [(ARMcmpNZ tGPR:$lhs, tGPR:$rhs)]>;
+
+// TODO: A7-37: CMP(3) - cmp hi regs
+
+def tEOR : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "eor $dst, $rhs",
+ [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>;
+
+def tLSLri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
+ "lsl $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (shl tGPR:$lhs, imm:$rhs))]>;
+
+def tLSLrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "lsl $dst, $rhs",
+ [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>;
+
+def tLSRri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
+ "lsr $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (srl tGPR:$lhs, imm:$rhs))]>;
+
+def tLSRrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "lsr $dst, $rhs",
+ [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>;
+
+// FIXME: This is not rematerializable because mov changes the condition code.
+def tMOVi8 : TI<(outs tGPR:$dst), (ins i32imm:$src),
+ "mov $dst, $src",
+ [(set tGPR:$dst, imm0_255:$src)]>;
+
+// TODO: A7-73: MOV(2) - mov setting flag.
+
+
+// Note: MOV(2) of two low regs updates the flags, so we emit this as 'cpy',
+// which is MOV(3). This also supports high registers.
+def tMOVr : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "cpy $dst, $src", []>;
+def tMOVhir2lor : TI<(outs tGPR:$dst), (ins GPR:$src),
+ "cpy $dst, $src\t@ hir2lor", []>;
+def tMOVlor2hir : TI<(outs GPR:$dst), (ins tGPR:$src),
+ "cpy $dst, $src\t@ lor2hir", []>;
+def tMOVhir2hir : TI<(outs GPR:$dst), (ins GPR:$src),
+ "cpy $dst, $src\t@ hir2hir", []>;
+
+def tMUL : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "mul $dst, $rhs",
+ [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>;
+
+def tMVN : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "mvn $dst, $src",
+ [(set tGPR:$dst, (not tGPR:$src))]>;
+
+def tNEG : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "neg $dst, $src",
+ [(set tGPR:$dst, (ineg tGPR:$src))]>;
+
+def tORR : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "orr $dst, $rhs",
+ [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>;
+
+
+def tREV : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "rev $dst, $src",
+ [(set tGPR:$dst, (bswap tGPR:$src))]>,
+ Requires<[IsThumb, HasV6]>;
+
+def tREV16 : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "rev16 $dst, $src",
+ [(set tGPR:$dst,
+ (or (and (srl tGPR:$src, 8), 0xFF),
+ (or (and (shl tGPR:$src, 8), 0xFF00),
+ (or (and (srl tGPR:$src, 8), 0xFF0000),
+ (and (shl tGPR:$src, 8), 0xFF000000)))))]>,
+ Requires<[IsThumb, HasV6]>;
+
+def tREVSH : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "revsh $dst, $src",
+ [(set tGPR:$dst,
+ (sext_inreg
+ (or (srl (and tGPR:$src, 0xFFFF), 8),
+ (shl tGPR:$src, 8)), i16))]>,
+ Requires<[IsThumb, HasV6]>;
+
+def tROR : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "ror $dst, $rhs",
+ [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>;
+
+
+// Subtract with carry
+def tSBC : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "sbc $dst, $rhs",
+ [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>;
+
+def tSUBS : TI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "sub $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (subc tGPR:$lhs, tGPR:$rhs))]>;
+
+
+// TODO: A7-96: STMIA - store multiple.
+
+def tSUBi3 : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
+ "sub $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>;
+
+def tSUBi8 : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
+ "sub $dst, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>;
+
+def tSUBrr : TI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ "sub $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>;
+
+def tSUBspi : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
+ "sub $dst, $rhs * 4", []>;
+
+def tSXTB : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "sxtb $dst, $src",
+ [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>,
+ Requires<[IsThumb, HasV6]>;
+def tSXTH : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "sxth $dst, $src",
+ [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>,
+ Requires<[IsThumb, HasV6]>;
+
+
+def tUXTB : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "uxtb $dst, $src",
+ [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>,
+ Requires<[IsThumb, HasV6]>;
+def tUXTH : TI<(outs tGPR:$dst), (ins tGPR:$src),
+ "uxth $dst, $src",
+ [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>,
+ Requires<[IsThumb, HasV6]>;
+
+
+// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC DAG operation.
+// Expanded by the scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1 in // Expanded by the scheduler.
+ def tMOVCCr :
+ PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
+ "@ tMOVCCr $cc",
+ [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+
+// tLEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def tLEApcrel : TIx2<(outs tGPR:$dst), (ins i32imm:$label),
+ !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
+ "${:private}PCRELL${:uid}+4))\n"),
+ !strconcat("\tmov $dst, #PCRELV${:uid}\n",
+ "${:private}PCRELL${:uid}:\n\tadd $dst, pc")),
+ []>;
+
+def tLEApcrelJT : TIx2<(outs tGPR:$dst), (ins i32imm:$label, i32imm:$id),
+ !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
+ "${:private}PCRELL${:uid}+4))\n"),
+ !strconcat("\tmov $dst, #PCRELV${:uid}\n",
+ "${:private}PCRELL${:uid}:\n\tadd $dst, pc")),
+ []>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+let isCall = 1,
+ Defs = [R0, LR] in {
+ def tTPsoft : TIx2<(outs), (ins),
+ "bl __aeabi_read_tp",
+ [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ConstantPool, GlobalAddress
+def : ThumbPat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
+def : ThumbPat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
+
+// JumpTable
+def : ThumbPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+ (tLEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Direct calls
+def : ThumbPat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>;
+def : ThumbV5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>;
+
+// Indirect calls to ARM routines
+def : ThumbV5Pat<(ARMcall tGPR:$dst), (tBLXr tGPR:$dst)>;
+
+// zextload i1 -> zextload i8
+def : ThumbPat<(zextloadi1 t_addrmode_s1:$addr),
+ (tLDRB t_addrmode_s1:$addr)>;
+
+// extload -> zextload
+def : ThumbPat<(extloadi1 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>;
+def : ThumbPat<(extloadi8 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>;
+def : ThumbPat<(extloadi16 t_addrmode_s2:$addr), (tLDRH t_addrmode_s2:$addr)>;
+
+// Large immediate handling.
+
+// Two piece imms.
+def : ThumbPat<(i32 thumb_immshifted:$src),
+ (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
+ (thumb_immshifted_shamt imm:$src))>;
+
+def : ThumbPat<(i32 imm0_255_comp:$src),
+ (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
new file mode 100644
index 0000000..168fb45
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -0,0 +1,12 @@
+//===- ARMInstrThumb2.td - Thumb2 support for ARM -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb2 instruction set.
+//
+//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
new file mode 100644
index 0000000..0247daf
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -0,0 +1,398 @@
+//===- ARMInstrVFP.td - VFP support for ARM -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM VFP instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_FTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDT_ITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDT_CMPFP0 :
+SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_FMDRR :
+SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>]>;
+
+def arm_ftoui : SDNode<"ARMISD::FTOUI", SDT_FTOI>;
+def arm_ftosi : SDNode<"ARMISD::FTOSI", SDT_FTOI>;
+def arm_sitof : SDNode<"ARMISD::SITOF", SDT_ITOF>;
+def arm_uitof : SDNode<"ARMISD::UITOF", SDT_ITOF>;
+def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInFlag,SDNPOutFlag]>;
+def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutFlag]>;
+def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0",SDT_CMPFP0, [SDNPOutFlag]>;
+def arm_fmdrr : SDNode<"ARMISD::FMDRR", SDT_FMDRR>;
+
+//===----------------------------------------------------------------------===//
+// Load / store Instructions.
+//
+
+let canFoldAsLoad = 1 in {
+def FLDD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr),
+ "fldd", " $dst, $addr",
+ [(set DPR:$dst, (load addrmode5:$addr))]>;
+
+def FLDS : ASI5<0b1101, 0b01, (outs SPR:$dst), (ins addrmode5:$addr),
+ "flds", " $dst, $addr",
+ [(set SPR:$dst, (load addrmode5:$addr))]>;
+} // canFoldAsLoad
+
+def FSTD : ADI5<0b1101, 0b00, (outs), (ins DPR:$src, addrmode5:$addr),
+ "fstd", " $src, $addr",
+ [(store DPR:$src, addrmode5:$addr)]>;
+
+def FSTS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr),
+ "fsts", " $src, $addr",
+ [(store SPR:$src, addrmode5:$addr)]>;
+
+//===----------------------------------------------------------------------===//
+// Load / store multiple Instructions.
+//
+
+let mayLoad = 1 in {
+def FLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dst1,
+ variable_ops),
+ "fldm${addr:submode}d${p} ${addr:base}, $dst1",
+ []> {
+ let Inst{20} = 1;
+}
+
+def FLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dst1,
+ variable_ops),
+ "fldm${addr:submode}s${p} ${addr:base}, $dst1",
+ []> {
+ let Inst{20} = 1;
+}
+}
+
+let mayStore = 1 in {
+def FSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1,
+ variable_ops),
+ "fstm${addr:submode}d${p} ${addr:base}, $src1",
+ []> {
+ let Inst{20} = 0;
+}
+
+def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1,
+ variable_ops),
+ "fstm${addr:submode}s${p} ${addr:base}, $src1",
+ []> {
+ let Inst{20} = 0;
+}
+} // mayStore
+
+// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
+
+//===----------------------------------------------------------------------===//
+// FP Binary Operations.
+//
+
+def FADDD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+ "faddd", " $dst, $a, $b",
+ [(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>;
+
+def FADDS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ "fadds", " $dst, $a, $b",
+ [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
+
+// These are encoded as unary instructions.
+def FCMPED : ADuI<0b11101011, 0b0100, 0b1100, (outs), (ins DPR:$a, DPR:$b),
+ "fcmped", " $a, $b",
+ [(arm_cmpfp DPR:$a, DPR:$b)]>;
+
+def FCMPES : ASuI<0b11101011, 0b0100, 0b1100, (outs), (ins SPR:$a, SPR:$b),
+ "fcmpes", " $a, $b",
+ [(arm_cmpfp SPR:$a, SPR:$b)]>;
+
+def FDIVD : ADbI<0b11101000, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+ "fdivd", " $dst, $a, $b",
+ [(set DPR:$dst, (fdiv DPR:$a, DPR:$b))]>;
+
+def FDIVS : ASbI<0b11101000, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ "fdivs", " $dst, $a, $b",
+ [(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>;
+
+def FMULD : ADbI<0b11100010, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+ "fmuld", " $dst, $a, $b",
+ [(set DPR:$dst, (fmul DPR:$a, DPR:$b))]>;
+
+def FMULS : ASbI<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ "fmuls", " $dst, $a, $b",
+ [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
+
+def FNMULD : ADbI<0b11100010, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+ "fnmuld", " $dst, $a, $b",
+ [(set DPR:$dst, (fneg (fmul DPR:$a, DPR:$b)))]> {
+ let Inst{6} = 1;
+}
+
+def FNMULS : ASbI<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ "fnmuls", " $dst, $a, $b",
+ [(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]> {
+ let Inst{6} = 1;
+}
+
+// Match reassociated forms only when sign-dependent rounding is not honored.
+def : Pat<(fmul (fneg DPR:$a), DPR:$b),
+ (FNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+def : Pat<(fmul (fneg SPR:$a), SPR:$b),
+ (FNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
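+// Rationale: FNMUL computes -(a * b), i.e. it negates the already-rounded
+// product, whereas (-a) * b rounds the exact negated product. The two agree
+// except under sign-dependent (directed) rounding modes, hence the predicate.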
+
+
+def FSUBD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+ "fsubd", " $dst, $a, $b",
+ [(set DPR:$dst, (fsub DPR:$a, DPR:$b))]> {
+ let Inst{6} = 1;
+}
+
+def FSUBS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ "fsubs", " $dst, $a, $b",
+ [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]> {
+ let Inst{6} = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// FP Unary Operations.
+//
+
+def FABSD : ADuI<0b11101011, 0b0000, 0b1100, (outs DPR:$dst), (ins DPR:$a),
+ "fabsd", " $dst, $a",
+ [(set DPR:$dst, (fabs DPR:$a))]>;
+
+def FABSS : ASuI<0b11101011, 0b0000, 0b1100, (outs SPR:$dst), (ins SPR:$a),
+ "fabss", " $dst, $a",
+ [(set SPR:$dst, (fabs SPR:$a))]>;
+
+def FCMPEZD : ADuI<0b11101011, 0b0101, 0b1100, (outs), (ins DPR:$a),
+ "fcmpezd", " $a",
+ [(arm_cmpfp0 DPR:$a)]>;
+
+def FCMPEZS : ASuI<0b11101011, 0b0101, 0b1100, (outs), (ins SPR:$a),
+ "fcmpezs", " $a",
+ [(arm_cmpfp0 SPR:$a)]>;
+
+def FCVTDS : ASuI<0b11101011, 0b0111, 0b1100, (outs DPR:$dst), (ins SPR:$a),
+ "fcvtds", " $dst, $a",
+ [(set DPR:$dst, (fextend SPR:$a))]>;
+
+// Special case encoding: bits 11-8 are 0b1011.
+def FCVTSD : AI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
+ "fcvtsd", " $dst, $a",
+ [(set SPR:$dst, (fround DPR:$a))]> {
+ let Inst{27-23} = 0b11101;
+ let Inst{21-16} = 0b110111;
+ let Inst{11-8} = 0b1011;
+ let Inst{7-4} = 0b1100;
+}
+
+def FCPYD : ADuI<0b11101011, 0b0000, 0b0100, (outs DPR:$dst), (ins DPR:$a),
+ "fcpyd", " $dst, $a", []>;
+
+def FCPYS : ASuI<0b11101011, 0b0000, 0b0100, (outs SPR:$dst), (ins SPR:$a),
+ "fcpys", " $dst, $a", []>;
+
+def FNEGD : ADuI<0b11101011, 0b0001, 0b0100, (outs DPR:$dst), (ins DPR:$a),
+ "fnegd", " $dst, $a",
+ [(set DPR:$dst, (fneg DPR:$a))]>;
+
+def FNEGS : ASuI<0b11101011, 0b0001, 0b0100, (outs SPR:$dst), (ins SPR:$a),
+ "fnegs", " $dst, $a",
+ [(set SPR:$dst, (fneg SPR:$a))]>;
+
+def FSQRTD : ADuI<0b11101011, 0b0001, 0b1100, (outs DPR:$dst), (ins DPR:$a),
+ "fsqrtd", " $dst, $a",
+ [(set DPR:$dst, (fsqrt DPR:$a))]>;
+
+def FSQRTS : ASuI<0b11101011, 0b0001, 0b1100, (outs SPR:$dst), (ins SPR:$a),
+ "fsqrts", " $dst, $a",
+ [(set SPR:$dst, (fsqrt SPR:$a))]>;
+
+//===----------------------------------------------------------------------===//
+// FP <-> GPR Copies. Int <-> FP Conversions.
+//
+
+def FMRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$dst), (ins SPR:$src),
+ "fmrs", " $dst, $src",
+ [(set GPR:$dst, (bitconvert SPR:$src))]>;
+
+def FMSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$dst), (ins GPR:$src),
+ "fmsr", " $dst, $src",
+ [(set SPR:$dst, (bitconvert GPR:$src))]>;
+
+def FMRRD : AVConv3I<0b11000101, 0b1011,
+ (outs GPR:$dst1, GPR:$dst2), (ins DPR:$src),
+ "fmrrd", " $dst1, $dst2, $src",
+ [/* FIXME: Can't write pattern for multiple result instr*/]>;
+
+// FMDHR: GPR -> SPR
+// FMDLR: GPR -> SPR
+
+def FMDRR : AVConv5I<0b11000100, 0b1011,
+ (outs DPR:$dst), (ins GPR:$src1, GPR:$src2),
+ "fmdrr", " $dst, $src1, $src2",
+ [(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]>;
+
+// FMRDH: SPR -> GPR
+// FMRDL: SPR -> GPR
+// FMRRS: SPR -> GPR
+// FMRX : SPR system reg -> GPR
+
+// FMSRR: GPR -> SPR
+
+// FMXR : GPR -> VFP system reg
+
+
+// Int to FP:
+
+def FSITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a),
+ "fsitod", " $dst, $a",
+ [(set DPR:$dst, (arm_sitof SPR:$a))]> {
+ let Inst{7} = 1;
+}
+
+def FSITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
+ "fsitos", " $dst, $a",
+ [(set SPR:$dst, (arm_sitof SPR:$a))]> {
+ let Inst{7} = 1;
+}
+
+def FUITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a),
+ "fuitod", " $dst, $a",
+ [(set DPR:$dst, (arm_uitof SPR:$a))]>;
+
+def FUITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
+ "fuitos", " $dst, $a",
+ [(set SPR:$dst, (arm_uitof SPR:$a))]>;
+
+// FP to Int:
+// The Z bit is always set, i.e. these are the "round towards zero" variants.
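+// With the Z bit set, e.g. "ftosizd s0, d0" truncates toward zero regardless
+// of the FPSCR rounding mode, matching C's float-to-integer cast semantics.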
+
+def FTOSIZD : AVConv1I<0b11101011, 0b1101, 0b1011,
+ (outs SPR:$dst), (ins DPR:$a),
+ "ftosizd", " $dst, $a",
+ [(set SPR:$dst, (arm_ftosi DPR:$a))]> {
+ let Inst{7} = 1; // Z bit
+}
+
+def FTOSIZS : AVConv1I<0b11101011, 0b1101, 0b1010,
+ (outs SPR:$dst), (ins SPR:$a),
+ "ftosizs", " $dst, $a",
+ [(set SPR:$dst, (arm_ftosi SPR:$a))]> {
+ let Inst{7} = 1; // Z bit
+}
+
+def FTOUIZD : AVConv1I<0b11101011, 0b1100, 0b1011,
+ (outs SPR:$dst), (ins DPR:$a),
+ "ftouizd", " $dst, $a",
+ [(set SPR:$dst, (arm_ftoui DPR:$a))]> {
+ let Inst{7} = 1; // Z bit
+}
+
+def FTOUIZS : AVConv1I<0b11101011, 0b1100, 0b1010,
+ (outs SPR:$dst), (ins SPR:$a),
+ "ftouizs", " $dst, $a",
+ [(set SPR:$dst, (arm_ftoui SPR:$a))]> {
+ let Inst{7} = 1; // Z bit
+}
+
+//===----------------------------------------------------------------------===//
+// FP FMA Operations.
+//
+
+def FMACD : ADbI<0b11100000, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
+ "fmacd", " $dst, $a, $b",
+ [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst">;
+
+def FMACS : ASbI<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+ "fmacs", " $dst, $a, $b",
+ [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst">;
+
+def FMSCD : ADbI<0b11100001, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
+ "fmscd", " $dst, $a, $b",
+ [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst">;
+
+def FMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+ "fmscs", " $dst, $a, $b",
+ [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst">;
+
+def FNMACD : ADbI<0b11100000, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
+ "fnmacd", " $dst, $a, $b",
+ [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst"> {
+ let Inst{6} = 1;
+}
+
+def FNMACS : ASbI<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+ "fnmacs", " $dst, $a, $b",
+ [(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst"> {
+ let Inst{6} = 1;
+}
+
+def FNMSCD : ADbI<0b11100001, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
+ "fnmscd", " $dst, $a, $b",
+ [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst"> {
+ let Inst{6} = 1;
+}
+
+def FNMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+ "fnmscs", " $dst, $a, $b",
+ [(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst"> {
+ let Inst{6} = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// FP Conditional moves.
+//
+
+def FCPYDcc : ADuI<0b11101011, 0b0000, 0b0100,
+ (outs DPR:$dst), (ins DPR:$false, DPR:$true),
+ "fcpyd", " $dst, $true",
+ [/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>,
+ RegConstraint<"$false = $dst">;
+
+def FCPYScc : ASuI<0b11101011, 0b0000, 0b0100,
+ (outs SPR:$dst), (ins SPR:$false, SPR:$true),
+ "fcpys", " $dst, $true",
+ [/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>,
+ RegConstraint<"$false = $dst">;
+
+def FNEGDcc : ADuI<0b11101011, 0b0001, 0b0100,
+ (outs DPR:$dst), (ins DPR:$false, DPR:$true),
+ "fnegd", " $dst, $true",
+ [/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>,
+ RegConstraint<"$false = $dst">;
+
+def FNEGScc : ASuI<0b11101011, 0b0001, 0b0100,
+ (outs SPR:$dst), (ins SPR:$false, SPR:$true),
+ "fnegs", " $dst, $true",
+ [/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>,
+ RegConstraint<"$false = $dst">;
+
+
+//===----------------------------------------------------------------------===//
+// Misc.
+//
+
+let Defs = [CPSR] in
+def FMSTAT : AI<(outs), (ins), VFPMiscFrm, "fmstat", "", [(arm_fmstat)]> {
+ let Inst{27-20} = 0b11101111;
+ let Inst{19-16} = 0b0001;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-8} = 0b1010;
+ let Inst{7} = 0;
+ let Inst{4} = 1;
+}
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
new file mode 100644
index 0000000..e551c41
--- /dev/null
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -0,0 +1,298 @@
+//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "ARMJITInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMRelocations.h"
+#include "ARMSubtarget.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/System/Memory.h"
+#include <cstdlib>
+using namespace llvm;
+
+void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ abort();
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host. This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us (we need
+// to preserve more context and manipulate the stack directly).  Instead,
+// we write our own wrapper, which does things our way, so we have complete
+// control over register saving and restoring.
+extern "C" {
+#if defined(__arm__)
+ void ARMCompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 2\n"
+ ".globl " ASMPREFIX "ARMCompilationCallback\n"
+ ASMPREFIX "ARMCompilationCallback:\n"
+    // Save caller-saved registers, since they may contain stuff
+ // for the real target function right now. We have to act as if this
+ // whole compilation callback doesn't exist as far as the caller is
+ // concerned, so we can't just preserve the callee saved regs.
+ "stmdb sp!, {r0, r1, r2, r3, lr}\n"
+#ifndef __SOFTFP__
+ "fstmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
+#endif
+ // The LR contains the address of the stub function on entry.
+ // pass it as the argument to the C part of the callback
+ "mov r0, lr\n"
+ "sub sp, sp, #4\n"
+ // Call the C portion of the callback
+ "bl " ASMPREFIX "ARMCompilationCallbackC\n"
+ "add sp, sp, #4\n"
+ // Restoring the LR to the return address of the function that invoked
+ // the stub and de-allocating the stack space for it requires us to
+ // swap the two saved LR values on the stack, as they're backwards
+ // for what we need since the pop instruction has a pre-determined
+ // order for the registers.
+ // +--------+
+ // 0 | LR | Original return address
+ // +--------+
+ // 1 | LR | Stub address (start of stub)
+ // 2-5 | R3..R0 | Saved registers (we need to preserve all regs)
+    // 6-21 | D0..D7 | Saved VFP registers (8 doubles = 16 words)
+ // +--------+
+ //
+#ifndef __SOFTFP__
+ // Restore VFP caller-saved registers.
+ "fldmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
+#endif
+ //
+ // We need to exchange the values in slots 0 and 1 so we can
+ // return to the address in slot 1 with the address in slot 0
+ // restored to the LR.
+ "ldr r0, [sp,#20]\n"
+ "ldr r1, [sp,#16]\n"
+ "str r1, [sp,#20]\n"
+ "str r0, [sp,#16]\n"
+ // Return to the (newly modified) stub to invoke the real function.
+ // The above twiddling of the saved return addresses allows us to
+ // deallocate everything, including the LR the stub saved, all in one
+ // pop instruction.
+ "ldmia sp!, {r0, r1, r2, r3, lr, pc}\n"
+ );
+#else // Not an ARM host
+ void ARMCompilationCallback() {
+ assert(0 && "Cannot call ARMCompilationCallback() on a non-ARM arch!\n");
+ abort();
+ }
+#endif
+}
+
+/// ARMCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
+/// it into the JIT compiler function.
+extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
+ // Get the address of the compiled code for this function.
+ intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)StubAddr);
+
+ // Rewrite the call target... so that we don't end up here every time we
+ // execute the call. We're replacing the first two instructions of the
+ // stub with:
+ // ldr pc, [pc,#-4]
+ // <addr>
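+  // (On ARM, reading the PC yields the current instruction's address plus 8,
+  // so "ldr pc, [pc,#-4]" loads the word immediately following it.)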
+ if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) {
+ cerr << "ERROR: Unable to mark stub writable\n";
+ abort();
+ }
+ *(intptr_t *)StubAddr = 0xe51ff004; // ldr pc, [pc, #-4]
+ *(intptr_t *)(StubAddr+4) = NewVal;
+ if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) {
+ cerr << "ERROR: Unable to mark stub executable\n";
+ abort();
+ }
+}
+
+TargetJITInfo::LazyResolverFn
+ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+ JITCompilerFunction = F;
+ return ARMCompilationCallback;
+}
+
+void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr,
+ JITCodeEmitter &JCE) {
+ JCE.startGVStub(GV, 4, 4);
+ JCE.emitWordLE((intptr_t)Ptr);
+ void *PtrAddr = JCE.finishGVStub(GV);
+ addIndirectSymAddr(Ptr, (intptr_t)PtrAddr);
+ return PtrAddr;
+}
+
+void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE) {
+ // If this is just a call to an external function, emit a branch instead of a
+ // call. The code is the same except for one bit of the last instruction.
+ if (Fn != (void*)(intptr_t)ARMCompilationCallback) {
+ // Branch to the corresponding function addr.
+ if (IsPIC) {
+      // The PIC stub is 16 bytes and 4-byte aligned.
+ intptr_t LazyPtr = getIndirectSymAddr(Fn);
+ if (!LazyPtr) {
+ // In PIC mode, the function stub is loading a lazy-ptr.
+ LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE);
+ if (F)
+ DOUT << "JIT: Indirect symbol emitted at [" << LazyPtr << "] for GV '"
+ << F->getName() << "'\n";
+ else
+ DOUT << "JIT: Stub emitted at [" << LazyPtr
+ << "] for external function at '" << Fn << "'\n";
+ }
+ JCE.startGVStub(F, 16, 4);
+ intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+      JCE.emitWordLE(0xe59fc004);            // ldr ip, [pc, #+4]
+ JCE.emitWordLE(0xe08fc00c); // L_func$scv: add ip, pc, ip
+ JCE.emitWordLE(0xe59cf000); // ldr pc, [ip]
+ JCE.emitWordLE(LazyPtr - (Addr+4+8)); // func - (L_func$scv+8)
+ sys::Memory::InvalidateInstructionCache((void*)Addr, 16);
+ } else {
+      // The stub is 8 bytes and 4-byte aligned.
+ JCE.startGVStub(F, 8, 4);
+ intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4]
+ JCE.emitWordLE((intptr_t)Fn); // addr of function
+ sys::Memory::InvalidateInstructionCache((void*)Addr, 8);
+ }
+ } else {
+ // The compilation callback will overwrite the first two words of this
+ // stub with indirect branch instructions targeting the compiled code.
+ // This stub sets the return address to restart the stub, so that
+ // the new branch will be invoked when we come back.
+ //
+ // Branch and link to the compilation callback.
+    // The stub is 16 bytes and 4-byte aligned.
+ JCE.startGVStub(F, 16, 4);
+ intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ // Save LR so the callback can determine which stub called it.
+ // The compilation callback is responsible for popping this prior
+ // to returning.
+ JCE.emitWordLE(0xe92d4000); // push {lr}
+ // Set the return address to go back to the start of this stub.
+ JCE.emitWordLE(0xe24fe00c); // sub lr, pc, #12
+ // Invoke the compilation callback.
+ JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4]
+ // The address of the compilation callback.
+ JCE.emitWordLE((intptr_t)ARMCompilationCallback);
+ sys::Memory::InvalidateInstructionCache((void*)Addr, 16);
+ }
+
+ return JCE.finishGVStub(F);
+}
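+
+// Stub layouts emitted above, for reference (a sketch; "A" stands for the
+// illustrative start address of each stub):
+//
+//   Non-PIC:                        PIC:
+//   A+0: ldr pc, [pc, #-4]          A+0:  ldr ip, [pc, #4]  ; ip = LazyPtr-A-12
+//   A+4: <addr of Fn>               A+4:  add ip, pc, ip    ; ip = LazyPtr
+//                                   A+8:  ldr pc, [ip]
+//                                   A+12: <LazyPtr - (A+12)>
+//
+//   Lazy-compilation stub:
+//   A+0:  push {lr}
+//   A+4:  sub  lr, pc, #12         ; lr = A, so the stub restarts on return
+//   A+8:  ldr  pc, [pc, #-4]
+//   A+12: <addr of ARMCompilationCallback>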
+
+intptr_t ARMJITInfo::resolveRelocDestAddr(MachineRelocation *MR) const {
+ ARM::RelocationType RT = (ARM::RelocationType)MR->getRelocationType();
+ switch (RT) {
+ default:
+ return (intptr_t)(MR->getResultPointer());
+ case ARM::reloc_arm_pic_jt:
+ // Destination address - jump table base.
+ return (intptr_t)(MR->getResultPointer()) - MR->getConstantVal();
+ case ARM::reloc_arm_jt_base:
+ // Jump table base address.
+ return getJumpTableBaseAddr(MR->getJumpTableIndex());
+ case ARM::reloc_arm_cp_entry:
+ case ARM::reloc_arm_vfp_cp_entry:
+ // Constant pool entry address.
+ return getConstantPoolEntryAddr(MR->getConstantPoolIndex());
+ case ARM::reloc_arm_machine_cp_entry: {
+ ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MR->getConstantVal();
+ assert((!ACPV->hasModifier() && !ACPV->mustAddCurrentAddress()) &&
+ "Can't handle this machine constant pool entry yet!");
+ intptr_t Addr = (intptr_t)(MR->getResultPointer());
+ Addr -= getPCLabelAddr(ACPV->getLabelId()) + ACPV->getPCAdjustment();
+ return Addr;
+ }
+ }
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+ intptr_t ResultPtr = resolveRelocDestAddr(MR);
+ switch ((ARM::RelocationType)MR->getRelocationType()) {
+ case ARM::reloc_arm_cp_entry:
+ case ARM::reloc_arm_vfp_cp_entry:
+ case ARM::reloc_arm_relative: {
+ // It is necessary to calculate the correct PC relative value. We
+ // subtract the base addr from the target addr to form a byte offset.
+ ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+ // If the result is positive, set bit U(23) to 1.
+ if (ResultPtr >= 0)
+ *((intptr_t*)RelocPos) |= 1 << ARMII::U_BitShift;
+ else {
+ // Otherwise, obtain the absolute value and set bit U(23) to 0.
+ *((intptr_t*)RelocPos) &= ~(1 << ARMII::U_BitShift);
+ ResultPtr = - ResultPtr;
+ }
+ // Set the immed value calculated.
+ // VFP immediate offset is multiplied by 4.
+ if (MR->getRelocationType() == ARM::reloc_arm_vfp_cp_entry)
+ ResultPtr = ResultPtr >> 2;
+ *((intptr_t*)RelocPos) |= ResultPtr;
+ // Set register Rn to PC.
+ *((intptr_t*)RelocPos) |=
+ ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
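+      // Worked example (hypothetical addresses): an FLDD at 0x1000 that
+      // references a constant pool entry at 0x1010 gives
+      // 0x1010 - 0x1000 - 8 = 8 bytes; the encoded VFP immediate is
+      // 8 >> 2 = 2, with the U bit set since the offset is non-negative.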
+ break;
+ }
+ case ARM::reloc_arm_pic_jt:
+ case ARM::reloc_arm_machine_cp_entry:
+ case ARM::reloc_arm_absolute: {
+ // These addresses have already been resolved.
+ *((intptr_t*)RelocPos) |= (intptr_t)ResultPtr;
+ break;
+ }
+ case ARM::reloc_arm_branch: {
+      // It is necessary to calculate the correct value of the signed_immed_24
+      // field. We subtract the base addr from the target addr to form a
+      // byte offset, which must be in the range -33554432 to +33554428.
+      // Then, we set the signed_immed_24 field of the instruction to bits
+      // [25:2] of the byte offset. For more details see ARM-ARM p. A4-11.
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      // Check the range before truncating to 24 bits, while the offset is
+      // still a signed byte displacement.
+      assert(ResultPtr >= -33554432 && ResultPtr <= 33554428 &&
+             "branch displacement out of range");
+      ResultPtr = (ResultPtr & 0x03FFFFFC) >> 2;
+      *((intptr_t*)RelocPos) |= ResultPtr;
+ break;
+ }
+ case ARM::reloc_arm_jt_base: {
+ // JT base - (instruction addr + 8)
+ ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+ *((intptr_t*)RelocPos) |= ResultPtr;
+ break;
+ }
+ }
+ }
+}
diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h
new file mode 100644
index 0000000..7dfeed8
--- /dev/null
+++ b/lib/Target/ARM/ARMJITInfo.h
@@ -0,0 +1,178 @@
+//===- ARMJITInfo.h - ARM implementation of the JIT interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMJITINFO_H
+#define ARMJITINFO_H
+
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+ class ARMTargetMachine;
+
+ class ARMJITInfo : public TargetJITInfo {
+ // ConstPoolId2AddrMap - A map from constant pool ids to the corresponding
+ // CONSTPOOL_ENTRY addresses.
+ SmallVector<intptr_t, 16> ConstPoolId2AddrMap;
+
+ // JumpTableId2AddrMap - A map from inline jumptable ids to the
+ // corresponding inline jump table bases.
+ SmallVector<intptr_t, 16> JumpTableId2AddrMap;
+
+ // PCLabelMap - A map from PC labels to addresses.
+ DenseMap<unsigned, intptr_t> PCLabelMap;
+
+ // Sym2IndirectSymMap - A map from symbol (GlobalValue and ExternalSymbol)
+ // addresses to their indirect symbol addresses.
+ DenseMap<void*, intptr_t> Sym2IndirectSymMap;
+
+ // IsPIC - True if the relocation model is PIC. This is used to determine
+ // how to codegen function stubs.
+ bool IsPIC;
+
+ public:
+ explicit ARMJITInfo() : IsPIC(false) { useGOT = false; }
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+ /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+ /// to emit an indirect symbol which contains the address of the specified
+ /// ptr.
+ virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE);
+
+ /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+ /// small native function that simply calls the function at the specified
+ /// address.
+ virtual void *emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE);
+
+ /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+ /// relocate - Before the JIT can run a block of code that has been emitted,
+ /// it must rewrite the code to contain the actual addresses of any
+ /// referenced global symbols.
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+
+ /// hasCustomConstantPool - Allows a target to specify that constant
+ /// pool address resolution is handled by the target.
+ virtual bool hasCustomConstantPool() const { return true; }
+
+ /// hasCustomJumpTables - Allows a target to specify that jumptables
+ /// are emitted by the target.
+ virtual bool hasCustomJumpTables() const { return true; }
+
+ /// allocateSeparateGVMemory - If true, globals should be placed in
+ /// separately allocated heap memory rather than in the same
+ /// code memory allocated by JITCodeEmitter.
+ virtual bool allocateSeparateGVMemory() const {
+#ifdef __APPLE__
+ return true;
+#else
+ return false;
+#endif
+ }
+
+    /// Initialize - Initialize internal state for the function being JITted.
+    /// Resize the constant pool id to CONSTPOOL_ENTRY address map; resize the
+    /// jump table id to jump table base map; remember whether the codegen
+    /// relocation model is PIC.
+ void Initialize(const MachineFunction &MF, bool isPIC) {
+ const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ ConstPoolId2AddrMap.resize(AFI->getNumConstPoolEntries());
+ JumpTableId2AddrMap.resize(AFI->getNumJumpTables());
+ IsPIC = isPIC;
+ }
+
+ /// getConstantPoolEntryAddr - The ARM target puts all constant
+ /// pool entries into constant islands. This returns the address of the
+ /// constant pool entry of the specified index.
+ intptr_t getConstantPoolEntryAddr(unsigned CPI) const {
+ assert(CPI < ConstPoolId2AddrMap.size());
+ return ConstPoolId2AddrMap[CPI];
+ }
+
+ /// addConstantPoolEntryAddr - Map a Constant Pool Index to the address
+ /// where its associated value is stored. When relocations are processed,
+ /// this value will be used to resolve references to the constant.
+ void addConstantPoolEntryAddr(unsigned CPI, intptr_t Addr) {
+ assert(CPI < ConstPoolId2AddrMap.size());
+ ConstPoolId2AddrMap[CPI] = Addr;
+ }
+
+    /// getJumpTableBaseAddr - The ARM target inlines all jump tables within
+    /// the text section of the function. This returns the address of the
+    /// base of the jump table of the specified index.
+ intptr_t getJumpTableBaseAddr(unsigned JTI) const {
+ assert(JTI < JumpTableId2AddrMap.size());
+ return JumpTableId2AddrMap[JTI];
+ }
+
+ /// addJumpTableBaseAddr - Map a jump table index to the address where
+ /// the corresponding inline jump table is emitted. When relocations are
+ /// processed, this value will be used to resolve references to the
+ /// jump table.
+ void addJumpTableBaseAddr(unsigned JTI, intptr_t Addr) {
+ assert(JTI < JumpTableId2AddrMap.size());
+ JumpTableId2AddrMap[JTI] = Addr;
+ }
+
+ /// getPCLabelAddr - Retrieve the address of the PC label of the specified id.
+ intptr_t getPCLabelAddr(unsigned Id) const {
+ DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id);
+ assert(I != PCLabelMap.end());
+ return I->second;
+ }
+
+ /// addPCLabelAddr - Remember the address of the specified PC label.
+ void addPCLabelAddr(unsigned Id, intptr_t Addr) {
+ PCLabelMap.insert(std::make_pair(Id, Addr));
+ }
+
+    /// getIndirectSymAddr - Retrieve the address of the indirect symbol of
+    /// the specified symbol located at the given address. Returns 0 if the
+    /// indirect symbol has not been emitted.
+ intptr_t getIndirectSymAddr(void *Addr) const {
+ DenseMap<void*,intptr_t>::const_iterator I= Sym2IndirectSymMap.find(Addr);
+ if (I != Sym2IndirectSymMap.end())
+ return I->second;
+ return 0;
+ }
+
+ /// addIndirectSymAddr - Add a mapping from address of an emitted symbol to
+ /// its indirect symbol address.
+ void addIndirectSymAddr(void *SymAddr, intptr_t IndSymAddr) {
+ Sym2IndirectSymMap.insert(std::make_pair(SymAddr, IndSymAddr));
+ }
+
+ private:
+    /// resolveRelocDestAddr - Resolve the resulting address of the relocation
+    /// if it's not already resolved. Constant pool entries must be resolved
+    /// by the ARM target.
+ intptr_t resolveRelocDestAddr(MachineRelocation *MR) const;
+ };
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
new file mode 100644
index 0000000..047552f
--- /dev/null
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -0,0 +1,778 @@
+//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-ldst-opt"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumFLDMGened, "Number of fldm instructions generated");
+STATISTIC(NumFSTMGened, "Number of fstm instructions generated");
+
+namespace {
+ struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass {
+ static char ID;
+ ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}
+
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ ARMFunctionInfo *AFI;
+ RegScavenger *RS;
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "ARM load / store optimization pass";
+ }
+
+ private:
+ struct MemOpQueueEntry {
+ int Offset;
+ unsigned Position;
+ MachineBasicBlock::iterator MBBI;
+ bool Merged;
+ MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
+        : Offset(o), Position(p), MBBI(i), Merged(false) {}
+ };
+ typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
+ typedef MemOpQueue::iterator MemOpQueueIter;
+
+ SmallVector<MachineBasicBlock::iterator, 4>
+ MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
+ int Opcode, unsigned Size,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ unsigned Scratch, MemOpQueue &MemOps);
+
+ void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+ bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+ bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+ };
+ char ARMLoadStoreOpt::ID = 0;
+}
+
+/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
+/// optimization pass.
+FunctionPass *llvm::createARMLoadStoreOptimizationPass() {
+ return new ARMLoadStoreOpt();
+}
+
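+// getLoadStoreMultipleOpcode - Map a single load / store opcode to its
+// load / store multiple counterpart. Note that it also bumps the matching
+// statistic as a side effect, so it should be called once per merged op.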
+static int getLoadStoreMultipleOpcode(int Opcode) {
+ switch (Opcode) {
+ case ARM::LDR:
+ NumLDMGened++;
+ return ARM::LDM;
+ case ARM::STR:
+ NumSTMGened++;
+ return ARM::STM;
+ case ARM::FLDS:
+ NumFLDMGened++;
+ return ARM::FLDMS;
+ case ARM::FSTS:
+ NumFSTMGened++;
+ return ARM::FSTMS;
+ case ARM::FLDD:
+ NumFLDMGened++;
+ return ARM::FLDMD;
+ case ARM::FSTD:
+ NumFSTMGened++;
+ return ARM::FSTMD;
+ default: abort();
+ }
+ return 0;
+}
+
+/// mergeOps - Create and insert an LDM or STM with Base as base register and
+/// registers in Regs as the register operands that would be loaded / stored.
+/// It returns true if the transformation is done.
+static bool mergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ int Offset, unsigned Base, bool BaseKill, int Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
+ SmallVector<std::pair<unsigned, bool>, 8> &Regs,
+ const TargetInstrInfo *TII) {
+ // FIXME would it be better to take a DL from one of the loads arbitrarily?
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Only a single register to load / store. Don't bother.
+ unsigned NumRegs = Regs.size();
+ if (NumRegs <= 1)
+ return false;
+
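+  // Pick the LDM / STM sub-mode from the starting offset. E.g. for three
+  // registers: offset 0 -> ia, 4 -> ib, -8 -> da, -12 -> db; anything else
+  // requires materializing a new base register below.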
+ ARM_AM::AMSubMode Mode = ARM_AM::ia;
+ bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+ if (isAM4 && Offset == 4)
+ Mode = ARM_AM::ib;
+ else if (isAM4 && Offset == -4 * (int)NumRegs + 4)
+ Mode = ARM_AM::da;
+ else if (isAM4 && Offset == -4 * (int)NumRegs)
+ Mode = ARM_AM::db;
+ else if (Offset != 0) {
+    // If the starting offset isn't zero, insert an instruction to materialize
+    // a new base, but only if it is cost effective, i.e. when merging more
+    // than two loads / stores.
+ if (NumRegs <= 2)
+ return false;
+
+ unsigned NewBase;
+ if (Opcode == ARM::LDR)
+      // If it is a load, just use one of the destination registers as the
+      // new base.
+ NewBase = Regs[NumRegs-1].first;
+ else {
+      // Otherwise use the scratch register as the new base.
+ NewBase = Scratch;
+ if (NewBase == 0)
+ return false;
+ }
+ int BaseOpc = ARM::ADDri;
+ if (Offset < 0) {
+ BaseOpc = ARM::SUBri;
+ Offset = - Offset;
+ }
+ int ImmedOffset = ARM_AM::getSOImmVal(Offset);
+ if (ImmedOffset == -1)
+ return false; // Probably not worth it then.
+
+ BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
+ .addReg(Base, getKillRegState(BaseKill)).addImm(ImmedOffset)
+ .addImm(Pred).addReg(PredReg).addReg(0);
+ Base = NewBase;
+    BaseKill = true; // New base is always killed right after its use.
+ }
+
+ bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
+ bool isDef = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+ Opcode = getLoadStoreMultipleOpcode(Opcode);
+ MachineInstrBuilder MIB = (isAM4)
+ ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
+ .addReg(Base, getKillRegState(BaseKill))
+ .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
+ : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
+ .addReg(Base, getKillRegState(BaseKill))
+ .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
+ .addImm(Pred).addReg(PredReg);
+ for (unsigned i = 0; i != NumRegs; ++i)
+ MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
+ | getKillRegState(Regs[i].second));
+
+ return true;
+}
+
+/// MergeLDR_STR - Merge a number of load / store instructions into one or more
+/// load / store multiple instructions.
+SmallVector<MachineBasicBlock::iterator, 4>
+ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
+ unsigned Base, int Opcode, unsigned Size,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ unsigned Scratch, MemOpQueue &MemOps) {
+ SmallVector<MachineBasicBlock::iterator, 4> Merges;
+ bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+ int Offset = MemOps[SIndex].Offset;
+ int SOffset = Offset;
+ unsigned Pos = MemOps[SIndex].Position;
+ MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
+ unsigned PReg = MemOps[SIndex].MBBI->getOperand(0).getReg();
+ unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);
+ bool isKill = MemOps[SIndex].MBBI->getOperand(0).isKill();
+
+ SmallVector<std::pair<unsigned,bool>, 8> Regs;
+ Regs.push_back(std::make_pair(PReg, isKill));
+ for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
+ int NewOffset = MemOps[i].Offset;
+ unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
+ unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+ isKill = MemOps[i].MBBI->getOperand(0).isKill();
+ // AM4 - register numbers in ascending order.
+ // AM5 - consecutive register numbers in ascending order.
+ if (NewOffset == Offset + (int)Size &&
+ ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
+ Offset += Size;
+ Regs.push_back(std::make_pair(Reg, isKill));
+ PRegNum = RegNum;
+ } else {
+      // Can't merge this one in. Try to merge the earlier ones first.
+ if (mergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
+ Scratch, Regs, TII)) {
+ Merges.push_back(prior(Loc));
+ for (unsigned j = SIndex; j < i; ++j) {
+ MBB.erase(MemOps[j].MBBI);
+ MemOps[j].Merged = true;
+ }
+ }
+ SmallVector<MachineBasicBlock::iterator, 4> Merges2 =
+ MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,MemOps);
+ Merges.append(Merges2.begin(), Merges2.end());
+ return Merges;
+ }
+
+ if (MemOps[i].Position > Pos) {
+ Pos = MemOps[i].Position;
+ Loc = MemOps[i].MBBI;
+ }
+ }
+
+ bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
+ if (mergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
+ Scratch, Regs, TII)) {
+ Merges.push_back(prior(Loc));
+ for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) {
+ MBB.erase(MemOps[i].MBBI);
+ MemOps[i].Merged = true;
+ }
+ }
+
+ return Merges;
+}
+
+/// getInstrPredicate - If the instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+static ARMCC::CondCodes getInstrPredicate(MachineInstr *MI, unsigned &PredReg) {
+ int PIdx = MI->findFirstPredOperandIdx();
+ if (PIdx == -1) {
+ PredReg = 0;
+ return ARMCC::AL;
+ }
+
+ PredReg = MI->getOperand(PIdx+1).getReg();
+ return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
+}
+
+static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
+ unsigned Bytes, ARMCC::CondCodes Pred,
+ unsigned PredReg) {
+ unsigned MyPredReg = 0;
+ return (MI && MI->getOpcode() == ARM::SUBri &&
+ MI->getOperand(0).getReg() == Base &&
+ MI->getOperand(1).getReg() == Base &&
+ ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
+ getInstrPredicate(MI, MyPredReg) == Pred &&
+ MyPredReg == PredReg);
+}
+
+static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
+ unsigned Bytes, ARMCC::CondCodes Pred,
+ unsigned PredReg) {
+ unsigned MyPredReg = 0;
+ return (MI && MI->getOpcode() == ARM::ADDri &&
+ MI->getOperand(0).getReg() == Base &&
+ MI->getOperand(1).getReg() == Base &&
+ ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
+ getInstrPredicate(MI, MyPredReg) == Pred &&
+ MyPredReg == PredReg);
+}
+
+static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: return 0;
+ case ARM::LDR:
+ case ARM::STR:
+ case ARM::FLDS:
+ case ARM::FSTS:
+ return 4;
+ case ARM::FLDD:
+ case ARM::FSTD:
+ return 8;
+ case ARM::LDM:
+ case ARM::STM:
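+    // Four fixed operands (base, mode immediate, predicate, predicate reg)
+    // precede the register list, hence the subtraction below.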
+ return (MI->getNumOperands() - 4) * 4;
+ case ARM::FLDMS:
+ case ARM::FSTMS:
+ case ARM::FLDMD:
+ case ARM::FSTMD:
+ return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
+ }
+}
+
+/// mergeBaseUpdateLSMultiple - Fold a preceding/trailing inc/dec of the base
+/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool &Advance,
+ MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = MBBI;
+ unsigned Base = MI->getOperand(0).getReg();
+ unsigned Bytes = getLSMultipleTransferSize(MI);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ int Opcode = MI->getOpcode();
+ bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::STM;
+
+ if (isAM4) {
+ if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
+ return false;
+
+ // Can't use the updating AM4 sub-mode if the base register is also a dest
+ // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+ for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
+ if (MI->getOperand(i).getReg() == Base)
+ return false;
+ }
+
+ ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
+ if (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+ if (Mode == ARM_AM::ia &&
+ isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
+ MBB.erase(PrevMBBI);
+ return true;
+ } else if (Mode == ARM_AM::ib &&
+ isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
+ MBB.erase(PrevMBBI);
+ return true;
+ }
+ }
+
+ if (MBBI != MBB.end()) {
+ MachineBasicBlock::iterator NextMBBI = next(MBBI);
+ if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
+ isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+ if (NextMBBI == I) {
+ Advance = true;
+ ++I;
+ }
+ MBB.erase(NextMBBI);
+ return true;
+ } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
+ isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+ if (NextMBBI == I) {
+ Advance = true;
+ ++I;
+ }
+ MBB.erase(NextMBBI);
+ return true;
+ }
+ }
+ } else {
+ // FLDM{D|S}, FSTM{D|S} addressing mode 5 ops.
+ if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
+ return false;
+
+ ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
+ unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
+ if (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+ if (Mode == ARM_AM::ia &&
+ isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
+ MBB.erase(PrevMBBI);
+ return true;
+ }
+ }
+
+    if (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator NextMBBI = next(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
+        if (NextMBBI == I) {
+          Advance = true;
+          ++I;
+        }
+        MBB.erase(NextMBBI);
+        // Report success only when a trailing increment was actually folded.
+        return true;
+      }
+    }
+ }
+
+ return false;
+}
+
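+// VFP has no true pre/post-indexed load / store forms, so the FLDS / FLDD /
+// FSTS / FSTD cases below map to a writeback FLDM / FSTM transferring a
+// single register.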
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
+ switch (Opc) {
+ case ARM::LDR: return ARM::LDR_PRE;
+ case ARM::STR: return ARM::STR_PRE;
+ case ARM::FLDS: return ARM::FLDMS;
+ case ARM::FLDD: return ARM::FLDMD;
+ case ARM::FSTS: return ARM::FSTMS;
+ case ARM::FSTD: return ARM::FSTMD;
+ default: abort();
+ }
+ return 0;
+}
+
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
+ switch (Opc) {
+ case ARM::LDR: return ARM::LDR_POST;
+ case ARM::STR: return ARM::STR_POST;
+ case ARM::FLDS: return ARM::FLDMS;
+ case ARM::FLDD: return ARM::FLDMD;
+ case ARM::FSTS: return ARM::FSTMS;
+ case ARM::FSTD: return ARM::FSTMD;
+ default: abort();
+ }
+ return 0;
+}
+
+/// mergeBaseUpdateLoadStore - Fold a preceding/trailing inc/dec of the base
+/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
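+///
+/// ldr rd, [rn]
+/// rn := rn + 4;
+/// =>
+/// ldr rd, [rn], #4  (LDR_POST)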
+static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const TargetInstrInfo *TII,
+ bool &Advance,
+ MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = MBBI;
+ unsigned Base = MI->getOperand(1).getReg();
+ bool BaseKill = MI->getOperand(1).isKill();
+ unsigned Bytes = getLSMultipleTransferSize(MI);
+ int Opcode = MI->getOpcode();
+ DebugLoc dl = MI->getDebugLoc();
+ bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
+ if ((isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) ||
+ (!isAM2 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0))
+ return false;
+
+ bool isLd = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+ // Can't do the merge if the destination register is the same as the would-be
+ // writeback register.
+ if (isLd && MI->getOperand(0).getReg() == Base)
+ return false;
+
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ bool DoMerge = false;
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ unsigned NewOpc = 0;
+ if (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+ if (isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ DoMerge = true;
+ AddSub = ARM_AM::sub;
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
+ } else if (isAM2 && isMatchingIncrement(PrevMBBI, Base, Bytes,
+ Pred, PredReg)) {
+ DoMerge = true;
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
+ }
+ if (DoMerge)
+ MBB.erase(PrevMBBI);
+ }
+
+ if (!DoMerge && MBBI != MBB.end()) {
+ MachineBasicBlock::iterator NextMBBI = next(MBBI);
+ if (isAM2 && isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ DoMerge = true;
+ AddSub = ARM_AM::sub;
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
+ } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ DoMerge = true;
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
+ }
+ if (DoMerge) {
+ if (NextMBBI == I) {
+ Advance = true;
+ ++I;
+ }
+ MBB.erase(NextMBBI);
+ }
+ }
+
+ if (!DoMerge)
+ return false;
+
+ bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
+ unsigned Offset = isAM2 ? ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift)
+ : ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) ? ARM_AM::db : ARM_AM::ia,
+ true, isDPR ? 2 : 1);
+ if (isLd) {
+ if (isAM2)
+ // LDR_PRE, LDR_POST;
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ else
+ // FLDMS, FLDMD
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+ .addReg(Base, getKillRegState(BaseKill))
+ .addImm(Offset).addImm(Pred).addReg(PredReg)
+ .addReg(MI->getOperand(0).getReg(), RegState::Define);
+ } else {
+ MachineOperand &MO = MI->getOperand(0);
+ if (isAM2)
+ // STR_PRE, STR_POST;
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(BaseKill))
+ .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ else
+ // FSTMS, FSTMD
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
+ .addImm(Pred).addReg(PredReg)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()));
+ }
+ MBB.erase(MBBI);
+
+ return true;
+}
+
+/// isMemoryOp - Returns true if the instruction is a memory operation that
+/// this pass is capable of operating on.
+static bool isMemoryOp(MachineInstr *MI) {
+ int Opcode = MI->getOpcode();
+ switch (Opcode) {
+ default: break;
+ case ARM::LDR:
+ case ARM::STR:
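+    // Only immediate-offset forms qualify; operand 2, the offset register,
+    // must be zero.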
+ return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
+ case ARM::FLDS:
+ case ARM::FSTS:
+ return MI->getOperand(1).isReg();
+ case ARM::FLDD:
+ case ARM::FSTD:
+ return MI->getOperand(1).isReg();
+ }
+ return false;
+}
+
+/// AdvanceRS - Advance the register scavenger to just before the earliest
+/// memory op that is being merged.
+void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
+ MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
+ unsigned Position = MemOps[0].Position;
+ for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
+ if (MemOps[i].Position < Position) {
+ Position = MemOps[i].Position;
+ Loc = MemOps[i].MBBI;
+ }
+ }
+
+ if (Loc != MBB.begin())
+ RS->forward(prior(Loc));
+}
+
+/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
+/// ops with the same base and incrementing offsets into LDM / STM ops.
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
+ unsigned NumMerges = 0;
+ unsigned NumMemOps = 0;
+ MemOpQueue MemOps;
+ unsigned CurrBase = 0;
+ int CurrOpc = -1;
+ unsigned CurrSize = 0;
+ ARMCC::CondCodes CurrPred = ARMCC::AL;
+ unsigned CurrPredReg = 0;
+ unsigned Position = 0;
+
+ RS->enterBasicBlock(&MBB);
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ bool Advance = false;
+ bool TryMerge = false;
+ bool Clobber = false;
+
+ bool isMemOp = isMemoryOp(MBBI);
+ if (isMemOp) {
+ int Opcode = MBBI->getOpcode();
+ bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
+ unsigned Size = getLSMultipleTransferSize(MBBI);
+ unsigned Base = MBBI->getOperand(1).getReg();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+ unsigned NumOperands = MBBI->getDesc().getNumOperands();
+ unsigned OffField = MBBI->getOperand(NumOperands-3).getImm();
+ int Offset = isAM2
+ ? ARM_AM::getAM2Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4;
+ if (isAM2) {
+ if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
+ Offset = -Offset;
+ } else {
+ if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
+ Offset = -Offset;
+ }
+ // Watch out for:
+ // r4 := ldr [r5]
+ // r5 := ldr [r5, #4]
+ // r6 := ldr [r5, #8]
+ //
+ // The second ldr has effectively broken the chain even though it
+ // looks like the later ldr(s) use the same base register. Try to
+ // merge the ldr's so far, including this one. But don't try to
+ // combine the following ldr(s).
+ Clobber = (Opcode == ARM::LDR && Base == MBBI->getOperand(0).getReg());
+ if (CurrBase == 0 && !Clobber) {
+ // Start of a new chain.
+ CurrBase = Base;
+ CurrOpc = Opcode;
+ CurrSize = Size;
+ CurrPred = Pred;
+ CurrPredReg = PredReg;
+ MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
+ NumMemOps++;
+ Advance = true;
+ } else {
+ if (Clobber) {
+ TryMerge = true;
+ Advance = true;
+ }
+
+ if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+ // No need to match PredReg.
+ // Continue adding to the queue.
+ if (Offset > MemOps.back().Offset) {
+ MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
+ NumMemOps++;
+ Advance = true;
+ } else {
+ for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
+ I != E; ++I) {
+ if (Offset < I->Offset) {
+ MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
+ NumMemOps++;
+ Advance = true;
+ break;
+ } else if (Offset == I->Offset) {
+ // Collision! This can't be merged!
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (Advance) {
+ ++Position;
+ ++MBBI;
+ } else
+ TryMerge = true;
+
+ if (TryMerge) {
+ if (NumMemOps > 1) {
+ // Try to find a free register to use as a new base in case it's needed.
+ // First advance to the instruction just before the start of the chain.
+ AdvanceRS(MBB, MemOps);
+        // Find a scratch register. Make sure it's a call-clobbered register
+        // or a spilled callee-saved register.
+ unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass, true);
+ if (!Scratch)
+ Scratch = RS->FindUnusedReg(&ARM::GPRRegClass,
+ AFI->getSpilledCSRegisters());
+ // Process the load / store instructions.
+ RS->forward(prior(MBBI));
+
+ // Merge ops.
+ SmallVector<MachineBasicBlock::iterator,4> MBBII =
+ MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
+ CurrPred, CurrPredReg, Scratch, MemOps);
+
+        // Try folding a preceding/trailing base inc/dec into the generated
+        // LDM/STM ops.
+ for (unsigned i = 0, e = MBBII.size(); i < e; ++i)
+ if (mergeBaseUpdateLSMultiple(MBB, MBBII[i], Advance, MBBI))
+ NumMerges++;
+ NumMerges += MBBII.size();
+
+        // Try folding a preceding/trailing base inc/dec into those loads /
+        // stores that were not merged to form LDM/STM ops.
+ for (unsigned i = 0; i != NumMemOps; ++i)
+ if (!MemOps[i].Merged)
+ if (mergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
+ NumMerges++;
+
+ // RS may be pointing to an instruction that's deleted.
+ RS->skipTo(prior(MBBI));
+ }
+
+ CurrBase = 0;
+ CurrOpc = -1;
+ CurrSize = 0;
+ CurrPred = ARMCC::AL;
+ CurrPredReg = 0;
+ if (NumMemOps) {
+ MemOps.clear();
+ NumMemOps = 0;
+ }
+
+      // If the iterator hasn't been advanced and this is not a memory op, skip it.
+ // It can't start a new chain anyway.
+ if (!Advance && !isMemOp && MBBI != E) {
+ ++Position;
+ ++MBBI;
+ }
+ }
+ }
+ return NumMerges > 0;
+}
+
+/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
+/// (bx lr) into the preceding stack restore so that it directly restores the
+/// value of LR into the pc.
+/// ldmfd sp!, {r7, lr}
+/// bx lr
+/// =>
+/// ldmfd sp!, {r7, pc}
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+ if (MBB.empty()) return false;
+
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ if (MBBI->getOpcode() == ARM::BX_RET && MBBI != MBB.begin()) {
+ MachineInstr *PrevMI = prior(MBBI);
+ if (PrevMI->getOpcode() == ARM::LDM) {
+ MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+ if (MO.getReg() == ARM::LR) {
+ PrevMI->setDesc(TII->get(ARM::LDM_RET));
+ MO.setReg(ARM::PC);
+ MBB.erase(MBBI);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetMachine &TM = Fn.getTarget();
+ AFI = Fn.getInfo<ARMFunctionInfo>();
+ TII = TM.getInstrInfo();
+ TRI = TM.getRegisterInfo();
+ RS = new RegScavenger();
+
+ bool Modified = false;
+ for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ Modified |= LoadStoreMultipleOpti(MBB);
+ Modified |= MergeReturnIntoLDM(MBB);
+ }
+
+ delete RS;
+ return Modified;
+}
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 0000000..6662be1
--- /dev/null
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,238 @@
+//===-- ARMMachineFunctionInfo.h - ARM machine function info ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMACHINEFUNCTIONINFO_H
+#define ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM target-specific information for each MachineFunction.
+class ARMFunctionInfo : public MachineFunctionInfo {
+
+  /// isThumb - True if this function is compiled under Thumb mode.
+  /// Used to initialize Align, so it must precede it.
+ bool isThumb;
+
+  /// Align - required alignment, stored as log2 of the byte alignment.  ARM
+  /// functions and Thumb functions with constant pools require 4-byte
+  /// alignment (2); other Thumb functions require only 2-byte alignment (1).
+ unsigned Align;
+
+ /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+ ///
+ unsigned VarArgsRegSaveSize;
+
+ /// HasStackFrame - True if this function has a stack frame. Set by
+ /// processFunctionBeforeCalleeSavedScan().
+ bool HasStackFrame;
+
+  /// LRSpilledForFarJump - True if the LR register has been spilled to
+  /// enable a far jump.
+ bool LRSpilledForFarJump;
+
+ /// R3IsLiveIn - True if R3 is live in to this function.
+ /// FIXME: Remove when register scavenger for Thumb is done.
+ bool R3IsLiveIn;
+
+ /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+ /// spill stack offset.
+ unsigned FramePtrSpillOffset;
+
+ /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+ /// register spills areas. For Mac OS X:
+ ///
+ /// GPR callee-saved (1) : r4, r5, r6, r7, lr
+ /// --------------------------------------------
+ /// GPR callee-saved (2) : r8, r10, r11
+ /// --------------------------------------------
+ /// DPR callee-saved : d8 - d15
+ unsigned GPRCS1Offset;
+ unsigned GPRCS2Offset;
+ unsigned DPRCSOffset;
+
+ /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
+ /// areas.
+ unsigned GPRCS1Size;
+ unsigned GPRCS2Size;
+ unsigned DPRCSSize;
+
+ /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
+ /// which belong to these spill areas.
+ BitVector GPRCS1Frames;
+ BitVector GPRCS2Frames;
+ BitVector DPRCSFrames;
+
+ /// SpilledCSRegs - A BitVector mask of all spilled callee-saved registers.
+ ///
+ BitVector SpilledCSRegs;
+
+ /// JumpTableUId - Unique id for jumptables.
+ ///
+ unsigned JumpTableUId;
+
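+  /// ConstPoolEntryUId - Unique id for constant pool entries; doubles as the
+  /// running count of entries created so far.
+  ///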
+ unsigned ConstPoolEntryUId;
+
+public:
+ ARMFunctionInfo() :
+ isThumb(false),
+ Align(2U),
+ VarArgsRegSaveSize(0), HasStackFrame(false),
+ LRSpilledForFarJump(false), R3IsLiveIn(false),
+ FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+ GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+ GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
+ JumpTableUId(0), ConstPoolEntryUId(0) {}
+
+ ARMFunctionInfo(MachineFunction &MF) :
+ isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
+ Align(isThumb ? 1U : 2U),
+ VarArgsRegSaveSize(0), HasStackFrame(false),
+ LRSpilledForFarJump(false), R3IsLiveIn(false),
+ FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+ GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+ GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
+ SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()),
+ JumpTableUId(0), ConstPoolEntryUId(0) {}
+
+ bool isThumbFunction() const { return isThumb; }
+
+ unsigned getAlign() const { return Align; }
+ void setAlign(unsigned a) { Align = a; }
+
+ unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; }
+ void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; }
+
+ bool hasStackFrame() const { return HasStackFrame; }
+ void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+ bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
+ void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
+
+ // FIXME: Remove when register scavenger for Thumb is done.
+ bool isR3LiveIn() const { return R3IsLiveIn; }
+ void setR3IsLiveIn(bool l) { R3IsLiveIn = l; }
+
+ unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
+ void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
+
+ unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+ unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+ unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; }
+
+ void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+ void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+ void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; }
+
+ unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+ unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+ unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; }
+
+ void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+ void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+ void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; }
+
+ bool isGPRCalleeSavedArea1Frame(int fi) const {
+ if (fi < 0 || fi >= (int)GPRCS1Frames.size())
+ return false;
+ return GPRCS1Frames[fi];
+ }
+ bool isGPRCalleeSavedArea2Frame(int fi) const {
+ if (fi < 0 || fi >= (int)GPRCS2Frames.size())
+ return false;
+ return GPRCS2Frames[fi];
+ }
+ bool isDPRCalleeSavedAreaFrame(int fi) const {
+ if (fi < 0 || fi >= (int)DPRCSFrames.size())
+ return false;
+ return DPRCSFrames[fi];
+ }
+
+ void addGPRCalleeSavedArea1Frame(int fi) {
+ if (fi >= 0) {
+ int Size = GPRCS1Frames.size();
+ if (fi >= Size) {
+ Size *= 2;
+ if (fi >= Size)
+ Size = fi+1;
+ GPRCS1Frames.resize(Size);
+ }
+ GPRCS1Frames[fi] = true;
+ }
+ }
+ void addGPRCalleeSavedArea2Frame(int fi) {
+ if (fi >= 0) {
+ int Size = GPRCS2Frames.size();
+ if (fi >= Size) {
+ Size *= 2;
+ if (fi >= Size)
+ Size = fi+1;
+ GPRCS2Frames.resize(Size);
+ }
+ GPRCS2Frames[fi] = true;
+ }
+ }
+ void addDPRCalleeSavedAreaFrame(int fi) {
+ if (fi >= 0) {
+ int Size = DPRCSFrames.size();
+ if (fi >= Size) {
+ Size *= 2;
+ if (fi >= Size)
+ Size = fi+1;
+ DPRCSFrames.resize(Size);
+ }
+ DPRCSFrames[fi] = true;
+ }
+ }
+
+ void setCSRegisterIsSpilled(unsigned Reg) {
+ SpilledCSRegs.set(Reg);
+ }
+
+ bool isCSRegisterSpilled(unsigned Reg) const {
+ return SpilledCSRegs[Reg];
+ }
+
+ const BitVector &getSpilledCSRegisters() const {
+ return SpilledCSRegs;
+ }
+
+ unsigned createJumpTableUId() {
+ return JumpTableUId++;
+ }
+
+ unsigned getNumJumpTables() const {
+ return JumpTableUId;
+ }
+
+ void initConstPoolEntryUId(unsigned UId) {
+ ConstPoolEntryUId = UId;
+ }
+
+ unsigned getNumConstPoolEntries() const {
+ return ConstPoolEntryUId;
+ }
+
+ unsigned createConstPoolEntryUId() {
+ return ConstPoolEntryUId++;
+ }
+};
+} // End llvm namespace
+
+#endif // ARMMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
new file mode 100644
index 0000000..199858f
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -0,0 +1,1528 @@
+//===- ARMRegisterInfo.cpp - ARM Register Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
+using namespace llvm;
+
+static cl::opt<bool> ThumbRegScavenging("enable-thumb-reg-scavenging",
+ cl::Hidden,
+ cl::desc("Enable register scavenging on Thumb"));
+
+unsigned ARMRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
+ using namespace ARM;
+ switch (RegEnum) {
+ case R0: case S0: case D0: return 0;
+ case R1: case S1: case D1: return 1;
+ case R2: case S2: case D2: return 2;
+ case R3: case S3: case D3: return 3;
+ case R4: case S4: case D4: return 4;
+ case R5: case S5: case D5: return 5;
+ case R6: case S6: case D6: return 6;
+ case R7: case S7: case D7: return 7;
+ case R8: case S8: case D8: return 8;
+ case R9: case S9: case D9: return 9;
+ case R10: case S10: case D10: return 10;
+ case R11: case S11: case D11: return 11;
+ case R12: case S12: case D12: return 12;
+ case SP: case S13: case D13: return 13;
+ case LR: case S14: case D14: return 14;
+ case PC: case S15: case D15: return 15;
+ case S16: return 16;
+ case S17: return 17;
+ case S18: return 18;
+ case S19: return 19;
+ case S20: return 20;
+ case S21: return 21;
+ case S22: return 22;
+ case S23: return 23;
+ case S24: return 24;
+ case S25: return 25;
+ case S26: return 26;
+ case S27: return 27;
+ case S28: return 28;
+ case S29: return 29;
+ case S30: return 30;
+ case S31: return 31;
+ default:
+ assert(0 && "Unknown ARM register!");
+ abort();
+ }
+}
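+
+// Note: in the mapping above, S13-S15 share their numbers with SP, LR and
+// PC. The two-argument overload below sets isSPVFP so callers can tell
+// single-precision VFP registers apart from the core registers.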
+
+unsigned ARMRegisterInfo::getRegisterNumbering(unsigned RegEnum,
+ bool &isSPVFP) {
+ isSPVFP = false;
+
+ using namespace ARM;
+ switch (RegEnum) {
+ default:
+ assert(0 && "Unknown ARM register!");
+ abort();
+ case R0: case D0: return 0;
+ case R1: case D1: return 1;
+ case R2: case D2: return 2;
+ case R3: case D3: return 3;
+ case R4: case D4: return 4;
+ case R5: case D5: return 5;
+ case R6: case D6: return 6;
+ case R7: case D7: return 7;
+ case R8: case D8: return 8;
+ case R9: case D9: return 9;
+ case R10: case D10: return 10;
+ case R11: case D11: return 11;
+ case R12: case D12: return 12;
+ case SP: case D13: return 13;
+ case LR: case D14: return 14;
+ case PC: case D15: return 15;
+
+ case S0: case S1: case S2: case S3:
+ case S4: case S5: case S6: case S7:
+ case S8: case S9: case S10: case S11:
+ case S12: case S13: case S14: case S15:
+ case S16: case S17: case S18: case S19:
+ case S20: case S21: case S22: case S23:
+ case S24: case S25: case S26: case S27:
+ case S28: case S29: case S30: case S31: {
+ isSPVFP = true;
+ switch (RegEnum) {
+ default: return 0; // Avoid compile time warning.
+ case S0: return 0;
+ case S1: return 1;
+ case S2: return 2;
+ case S3: return 3;
+ case S4: return 4;
+ case S5: return 5;
+ case S6: return 6;
+ case S7: return 7;
+ case S8: return 8;
+ case S9: return 9;
+ case S10: return 10;
+ case S11: return 11;
+ case S12: return 12;
+ case S13: return 13;
+ case S14: return 14;
+ case S15: return 15;
+ case S16: return 16;
+ case S17: return 17;
+ case S18: return 18;
+ case S19: return 19;
+ case S20: return 20;
+ case S21: return 21;
+ case S22: return 22;
+ case S23: return 23;
+ case S24: return 24;
+ case S25: return 25;
+ case S26: return 26;
+ case S27: return 27;
+ case S28: return 28;
+ case S29: return 29;
+ case S30: return 30;
+ case S31: return 31;
+ }
+ }
+ }
+}
+
+ARMRegisterInfo::ARMRegisterInfo(const TargetInstrInfo &tii,
+ const ARMSubtarget &sti)
+ : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+ TII(tii), STI(sti),
+ FramePtr((STI.useThumbBacktraces() || STI.isThumb()) ? ARM::R7 : ARM::R11) {
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+ return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+ return MIB.addReg(0);
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void ARMRegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned DestReg, int Val,
+ unsigned Pred, unsigned PredReg,
+ const TargetInstrInfo *TII,
+ bool isThumb,
+ DebugLoc dl) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineConstantPool *ConstantPool = MF.getConstantPool();
+ Constant *C = ConstantInt::get(Type::Int32Ty, Val);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ if (isThumb)
+ BuildMI(MBB, MBBI, dl,
+ TII->get(ARM::tLDRcp),DestReg).addConstantPoolIndex(Idx);
+ else
+ BuildMI(MBB, MBBI, dl, TII->get(ARM::LDRcp), DestReg)
+ .addConstantPoolIndex(Idx)
+ .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+}
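+
+// Usage sketch (illustrative): a call such as
+//   emitLoadConstPool(MBB, MBBI, ARM::R2, 0x12345678, ARMCC::AL, 0, &TII,
+//     false, dl);
+// creates a 4-byte constant-pool entry holding 0x12345678 and loads it into
+// r2 with a pc-relative LDRcp.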
+
+const TargetRegisterClass *ARMRegisterInfo::getPointerRegClass() const {
+ return &ARM::GPRRegClass;
+}
+
+/// isLowRegister - Returns true if the register is a low register (r0-r7).
+///
+bool ARMRegisterInfo::isLowRegister(unsigned Reg) const {
+ using namespace ARM;
+ switch (Reg) {
+ case R0: case R1: case R2: case R3:
+ case R4: case R5: case R6: case R7:
+ return true;
+ default:
+ return false;
+ }
+}
+
+const TargetRegisterClass*
+ARMRegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const {
+ if (STI.isThumb()) {
+ if (isLowRegister(Reg))
+ return ARM::tGPRRegisterClass;
+ switch (Reg) {
+ default:
+ break;
+ case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
+ case ARM::R12: case ARM::SP: case ARM::LR: case ARM::PC:
+ return ARM::GPRRegisterClass;
+ }
+ }
+ return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT);
+}
+
+const unsigned*
+ARMRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const unsigned CalleeSavedRegs[] = {
+ ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8,
+ ARM::R7, ARM::R6, ARM::R5, ARM::R4,
+
+ ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+ ARM::D11, ARM::D10, ARM::D9, ARM::D8,
+ 0
+ };
+
+ static const unsigned DarwinCalleeSavedRegs[] = {
+ ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4,
+ ARM::R11, ARM::R10, ARM::R9, ARM::R8,
+
+ ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+ ARM::D11, ARM::D10, ARM::D9, ARM::D8,
+ 0
+ };
+ return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const *
+ARMRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+ static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass,
+ &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+ return STI.isThumb() ? ThumbCalleeSavedRegClasses : CalleeSavedRegClasses;
+}
+
+BitVector ARMRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ // FIXME: avoid re-calculating this every time.
+ BitVector Reserved(getNumRegs());
+ Reserved.set(ARM::SP);
+ Reserved.set(ARM::PC);
+ if (STI.isTargetDarwin() || hasFP(MF))
+ Reserved.set(FramePtr);
+ // Some targets reserve R9.
+ if (STI.isR9Reserved())
+ Reserved.set(ARM::R9);
+ return Reserved;
+}
+
+bool
+ARMRegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const {
+ switch (Reg) {
+ default: break;
+ case ARM::SP:
+ case ARM::PC:
+ return true;
+ case ARM::R7:
+ case ARM::R11:
+ if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF)))
+ return true;
+ break;
+ case ARM::R9:
+ return STI.isR9Reserved();
+ }
+
+ return false;
+}
+
+bool
+ARMRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+ const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ return ThumbRegScavenging || !AFI->isThumbFunction();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+///
+bool ARMRegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+// not required, we reserve argument space for call sites in the function
+// immediately on entry to the current function. This eliminates the need for
+// add/sub sp brackets around call sites. Returns true if the call frame is
+// included as part of the stack frame.
+bool ARMRegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+ const MachineFrameInfo *FFI = MF.getFrameInfo();
+ unsigned CFSize = FFI->getMaxCallFrameSize();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ // It's not always a good idea to include the call frame as part of the
+ // stack frame. ARM (especially Thumb) has only small immediate offsets for
+ // addressing the stack frame, so a large call frame can cause poor codegen
+ // and may even make it impossible to scavenge a register.
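+ // For illustration, the cutoffs below work out to ((1 << 8) - 1) * 4 / 2
+ // == 510 bytes for Thumb (half of the scaled imm8 range) and
+ // ((1 << 12) - 1) / 2 == 2047 bytes for ARM (half of the imm12 range).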
+ if (AFI->isThumbFunction()) {
+ if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
+ return false;
+ } else {
+ if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12
+ return false;
+ }
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// emitARMRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in ARM code.
+static
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const TargetInstrInfo &TII,
+ DebugLoc dl) {
+ bool isSub = NumBytes < 0;
+ if (isSub) NumBytes = -NumBytes;
+
+ while (NumBytes) {
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+ unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+ assert(ThisVal && "Didn't extract field correctly");
+
+ // We will handle these bits from offset, clear them.
+ NumBytes &= ~ThisVal;
+
+ // Get the properly encoded SOImmVal field.
+ int SOImmVal = ARM_AM::getSOImmVal(ThisVal);
+ assert(SOImmVal != -1 && "Bit extraction didn't work?");
+
+ // Build the new ADD / SUB.
+ BuildMI(MBB, MBBI, dl, TII.get(isSub ? ARM::SUBri : ARM::ADDri), DestReg)
+ .addReg(BaseReg, RegState::Kill).addImm(SOImmVal)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ BaseReg = DestReg;
+ }
+}
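+
+// Worked example (illustrative): NumBytes == 0x10004 has set bits too far
+// apart to encode as one 8-bit rotated SO immediate, so the loop above emits
+// two instructions, e.g. add rD, rB, #0x4 followed by add rD, rD, #0x10000.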
+
+/// calcNumMI - Returns the number of instructions required to materialize
+/// the specific add / sub r, c instruction.
+static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
+ unsigned NumBits, unsigned Scale) {
+ unsigned NumMIs = 0;
+ unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+
+ if (Opc == ARM::tADDrSPi) {
+ unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+ Bytes -= ThisVal;
+ NumMIs++;
+ NumBits = 8;
+ Scale = 1; // Followed by a number of tADDi8.
+ Chunk = ((1 << NumBits) - 1) * Scale;
+ }
+
+ NumMIs += Bytes / Chunk;
+ if ((Bytes % Chunk) != 0)
+ NumMIs++;
+ if (ExtraOpc)
+ NumMIs++;
+ return NumMIs;
+}
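+
+// Worked example (illustrative): for tADDrSPi with Bytes == 1024 (NumBits ==
+// 8, Scale == 4), the first instruction covers 1020 bytes and the remaining
+// 4 bytes take one follow-on tADDi8, so calcNumMI returns 2.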
+
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
+/// in a register using mov / mvn sequences or load the immediate from a
+/// constpool entry.
+static
+void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned DestReg, unsigned BaseReg,
+ int NumBytes, bool CanChangeCC,
+ const TargetInstrInfo &TII,
+ const ARMRegisterInfo& MRI,
+ DebugLoc dl) {
+ bool isHigh = !MRI.isLowRegister(DestReg) ||
+ (BaseReg != 0 && !MRI.isLowRegister(BaseReg));
+ bool isSub = false;
+ // Subtract doesn't have a high register version. Load the negative value
+ // if either the base or dest register is a high register. Also, do not
+ // issue sub as part of the sequence if the condition register is to be
+ // preserved.
+ if (NumBytes < 0 && !isHigh && CanChangeCC) {
+ isSub = true;
+ NumBytes = -NumBytes;
+ }
+ unsigned LdReg = DestReg;
+ if (DestReg == ARM::SP) {
+ assert(BaseReg == ARM::SP && "Unexpected!");
+ LdReg = ARM::R3;
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
+ .addReg(ARM::R3, RegState::Kill);
+ }
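+ // SP itself cannot hold the materialized immediate, so R3 is borrowed as
+ // the load register here; its previous value is parked in R12 above and
+ // restored with tMOVhir2lor once the add / sub below has executed.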
+
+ if (NumBytes <= 255 && NumBytes >= 0)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
+ else if (NumBytes < 0 && NumBytes >= -255) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), LdReg)
+ .addReg(LdReg, RegState::Kill);
+ } else
+ MRI.emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, ARMCC::AL, 0, &TII,
+ true, dl);
+
+ // Emit add / sub.
+ int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
+ const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl,
+ TII.get(Opc), DestReg);
+ if (DestReg == ARM::SP || isSub)
+ MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+ else
+ MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+ if (DestReg == ARM::SP)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
+ .addReg(ARM::R12, RegState::Kill);
+}
+
+/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code.
+static
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned DestReg, unsigned BaseReg,
+ int NumBytes, const TargetInstrInfo &TII,
+ const ARMRegisterInfo& MRI,
+ DebugLoc dl) {
+ bool isSub = NumBytes < 0;
+ unsigned Bytes = (unsigned)NumBytes;
+ if (isSub) Bytes = -NumBytes;
+ bool isMul4 = (Bytes & 3) == 0;
+ bool isTwoAddr = false;
+ bool DstNotEqBase = false;
+ unsigned NumBits = 1;
+ unsigned Scale = 1;
+ int Opc = 0;
+ int ExtraOpc = 0;
+
+ if (DestReg == BaseReg && BaseReg == ARM::SP) {
+ assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+ NumBits = 7;
+ Scale = 4;
+ Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+ isTwoAddr = true;
+ } else if (!isSub && BaseReg == ARM::SP) {
+ // r1 = add sp, 403
+ // =>
+ // r1 = add sp, 100 * 4
+ // r1 = add r1, 3
+ if (!isMul4) {
+ Bytes &= ~3;
+ ExtraOpc = ARM::tADDi3;
+ }
+ NumBits = 8;
+ Scale = 4;
+ Opc = ARM::tADDrSPi;
+ } else {
+ // sp = sub sp, c
+ // r1 = sub sp, c
+ // r8 = sub sp, c
+ if (DestReg != BaseReg)
+ DstNotEqBase = true;
+ NumBits = 8;
+ Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+ isTwoAddr = true;
+ }
+
+ unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
+ unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
+ if (NumMIs > Threshold) {
+ // This will expand into too many instructions. Load the immediate from a
+ // constpool entry.
+ emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII,
+ MRI, dl);
+ return;
+ }
+
+ if (DstNotEqBase) {
+ if (MRI.isLowRegister(DestReg) && MRI.isLowRegister(BaseReg)) {
+ // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7)
+ unsigned Chunk = (1 << 3) - 1;
+ unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+ Bytes -= ThisVal;
+ BuildMI(MBB, MBBI, dl,TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg)
+ .addReg(BaseReg, RegState::Kill).addImm(ThisVal);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+ .addReg(BaseReg, RegState::Kill);
+ }
+ BaseReg = DestReg;
+ }
+
+ unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+ while (Bytes) {
+ unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+ Bytes -= ThisVal;
+ ThisVal /= Scale;
+ // Build the new tADD / tSUB.
+ if (isTwoAddr)
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(DestReg).addImm(ThisVal);
+ else {
+ bool isKill = BaseReg != ARM::SP;
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
+ BaseReg = DestReg;
+
+ if (Opc == ARM::tADDrSPi) {
+ // r4 = add sp, imm
+ // r4 = add r4, imm
+ // ...
+ NumBits = 8;
+ Scale = 1;
+ Chunk = ((1 << NumBits) - 1) * Scale;
+ Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+ isTwoAddr = true;
+ }
+ }
+ }
+
+ if (ExtraOpc)
+ BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg)
+ .addReg(DestReg, RegState::Kill)
+ .addImm(((unsigned)NumBytes) & 3);
+}
+
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg,
+ bool isThumb, const TargetInstrInfo &TII,
+ const ARMRegisterInfo& MRI,
+ DebugLoc dl) {
+ if (isThumb)
+ emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII,
+ MRI, dl);
+ else
+ emitARMRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes,
+ Pred, PredReg, TII, dl);
+}
+
+void ARMRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+ // If we have alloca, convert as follows:
+ // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+ // ADJCALLSTACKUP -> add, sp, sp, amount
+ MachineInstr *Old = I;
+ DebugLoc dl = Old->getDebugLoc();
+ unsigned Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ Amount = (Amount+Align-1)/Align*Align;
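+ // e.g. (illustrative): with a stack alignment of 8, Amount == 20 rounds
+ // up to (20 + 8 - 1) / 8 * 8 == 24.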
+
+ // Replace the pseudo instruction with a new instruction...
+ unsigned Opc = Old->getOpcode();
+ bool isThumb = AFI->isThumbFunction();
+ ARMCC::CondCodes Pred = isThumb
+ ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(1).getImm();
+ if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+ // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+ unsigned PredReg = isThumb ? 0 : Old->getOperand(2).getReg();
+ emitSPUpdate(MBB, I, -Amount, Pred, PredReg, isThumb, TII, *this, dl);
+ } else {
+ // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+ unsigned PredReg = isThumb ? 0 : Old->getOperand(3).getReg();
+ assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+ emitSPUpdate(MBB, I, Amount, Pred, PredReg, isThumb, TII, *this, dl);
+ }
+ }
+ }
+ MBB.erase(I);
+}
+
+/// emitThumbConstant - Emit a series of instructions to materialize a
+/// constant.
+static void emitThumbConstant(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned DestReg, int Imm,
+ const TargetInstrInfo &TII,
+ const ARMRegisterInfo& MRI,
+ DebugLoc dl) {
+ bool isSub = Imm < 0;
+ if (isSub) Imm = -Imm;
+
+ int Chunk = (1 << 8) - 1;
+ int ThisVal = (Imm > Chunk) ? Chunk : Imm;
+ Imm -= ThisVal;
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal);
+ if (Imm > 0)
+ emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl);
+ if (isSub)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), DestReg)
+ .addReg(DestReg, RegState::Kill);
+}
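+
+// Worked example (illustrative): emitThumbConstant with Imm == 300 emits
+// tMOVi8 DestReg, #255 and then adds the remaining 45; a negative Imm is
+// materialized as its magnitude and negated with a trailing tNEG.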
+
+/// findScratchRegister - Find a 'free' ARM register. If the register
+/// scavenger is not being used, R12 is available. Otherwise, try for a
+/// call-clobbered register first and then a spilled callee-saved register if
+/// that fails.
+static
+unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC,
+ ARMFunctionInfo *AFI) {
+ unsigned Reg = RS ? RS->FindUnusedReg(RC, true) : (unsigned) ARM::R12;
+ assert (!AFI->isThumbFunction());
+ if (Reg == 0)
+ // Try an already spilled CS register.
+ Reg = RS->FindUnusedReg(RC, AFI->getSpilledCSRegisters());
+
+ return Reg;
+}
+
+void ARMRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const{
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ bool isThumb = AFI->isThumbFunction();
+ DebugLoc dl = MI.getDebugLoc();
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ unsigned FrameReg = ARM::SP;
+ int FrameIndex = MI.getOperand(i).getIndex();
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+ MF.getFrameInfo()->getStackSize() + SPAdj;
+
+ if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
+ Offset -= AFI->getGPRCalleeSavedArea1Offset();
+ else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
+ Offset -= AFI->getGPRCalleeSavedArea2Offset();
+ else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex))
+ Offset -= AFI->getDPRCalleeSavedAreaOffset();
+ else if (hasFP(MF)) {
+ assert(SPAdj == 0 && "Unexpected");
+ // There are alloca()'s in this function; we must reference off the frame
+ // pointer instead.
+ FrameReg = getFrameRegister(MF);
+ Offset -= AFI->getFramePtrSpillOffset();
+ }
+
+ unsigned Opcode = MI.getOpcode();
+ const TargetInstrDesc &Desc = MI.getDesc();
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ bool isSub = false;
+
+ // Memory operands in inline assembly always use AddrMode2.
+ if (Opcode == ARM::INLINEASM)
+ AddrMode = ARMII::AddrMode2;
+
+ if (Opcode == ARM::ADDri) {
+ Offset += MI.getOperand(i+1).getImm();
+ if (Offset == 0) {
+ // Turn it into a move.
+ MI.setDesc(TII.get(ARM::MOVr));
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.RemoveOperand(i+1);
+ return;
+ } else if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ MI.setDesc(TII.get(ARM::SUBri));
+ }
+
+ // Common case: small offset, fits into instruction.
+ int ImmedOffset = ARM_AM::getSOImmVal(Offset);
+ if (ImmedOffset != -1) {
+ // Replace the FrameIndex with sp / fp
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.getOperand(i+1).ChangeToImmediate(ImmedOffset);
+ return;
+ }
+
+ // Otherwise, we fall back to common code below to form the imm offset with
+ // a sequence of ADDri instructions. First though, pull as much of the imm
+ // into this ADDri as possible.
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+ unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+ // We will handle these bits from offset, clear them.
+ Offset &= ~ThisImmVal;
+
+ // Get the properly encoded SOImmVal field.
+ int ThisSOImmVal = ARM_AM::getSOImmVal(ThisImmVal);
+ assert(ThisSOImmVal != -1 && "Bit extraction didn't work?");
+ MI.getOperand(i+1).ChangeToImmediate(ThisSOImmVal);
+ } else if (Opcode == ARM::tADDrSPi) {
+ Offset += MI.getOperand(i+1).getImm();
+
+ // Can't use tADDrSPi if it's based off the frame pointer.
+ unsigned NumBits = 0;
+ unsigned Scale = 1;
+ if (FrameReg != ARM::SP) {
+ Opcode = ARM::tADDi3;
+ MI.setDesc(TII.get(ARM::tADDi3));
+ NumBits = 3;
+ } else {
+ NumBits = 8;
+ Scale = 4;
+ assert((Offset & 3) == 0 &&
+ "Thumb add/sub sp, #imm immediate must be multiple of 4!");
+ }
+
+ if (Offset == 0) {
+ // Turn it into a move.
+ MI.setDesc(TII.get(ARM::tMOVhir2lor));
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.RemoveOperand(i+1);
+ return;
+ }
+
+ // Common case: small offset, fits into instruction.
+ unsigned Mask = (1 << NumBits) - 1;
+ if (((Offset / Scale) & ~Mask) == 0) {
+ // Replace the FrameIndex with sp / fp
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
+ return;
+ }
+
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned Bytes = (Offset > 0) ? Offset : -Offset;
+ unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
+ // MI would expand into a large number of instructions. Don't try to
+ // simplify the immediate.
+ if (NumMIs > 2) {
+ emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII,
+ *this, dl);
+ MBB.erase(II);
+ return;
+ }
+
+ if (Offset > 0) {
+ // Translate r0 = add sp, imm to
+ // r0 = add sp, 255*4
+ // r0 = add r0, (imm - 255*4)
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.getOperand(i+1).ChangeToImmediate(Mask);
+ Offset = (Offset - Mask * Scale);
+ MachineBasicBlock::iterator NII = next(II);
+ emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
+ *this, dl);
+ } else {
+ // Translate r0 = add sp, -imm to
+ // r0 = -imm (this is then translated into a series of instructions)
+ // r0 = add r0, sp
+ emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
+ MI.setDesc(TII.get(ARM::tADDhirr));
+ MI.getOperand(i).ChangeToRegister(DestReg, false, false, true);
+ MI.getOperand(i+1).ChangeToRegister(FrameReg, false);
+ }
+ return;
+ } else {
+ unsigned ImmIdx = 0;
+ int InstrOffs = 0;
+ unsigned NumBits = 0;
+ unsigned Scale = 1;
+ switch (AddrMode) {
+ case ARMII::AddrMode2: {
+ ImmIdx = i+2;
+ InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 12;
+ break;
+ }
+ case ARMII::AddrMode3: {
+ ImmIdx = i+2;
+ InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ break;
+ }
+ case ARMII::AddrMode5: {
+ ImmIdx = i+1;
+ InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 4;
+ break;
+ }
+ case ARMII::AddrModeTs: {
+ ImmIdx = i+1;
+ InstrOffs = MI.getOperand(ImmIdx).getImm();
+ NumBits = (FrameReg == ARM::SP) ? 8 : 5;
+ Scale = 4;
+ break;
+ }
+ default:
+ assert(0 && "Unsupported addressing mode!");
+ abort();
+ break;
+ }
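+ // Summary of the ranges handled above: AddrMode2 takes a 12-bit byte
+ // offset, AddrMode3 an 8-bit byte offset, AddrMode5 an 8-bit offset scaled
+ // by 4 (up to 1020 bytes), and AddrModeTs an 8-bit offset scaled by 4 when
+ // based off SP (5-bit otherwise).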
+
+ Offset += InstrOffs * Scale;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0 && !isThumb) {
+ Offset = -Offset;
+ isSub = true;
+ }
+
+ // Common case: small offset, fits into instruction.
+ MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with sp
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ if (isSub)
+ ImmedOffset |= 1 << NumBits;
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ return;
+ }
+
+ bool isThumbSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
+ if (AddrMode == ARMII::AddrModeTs) {
+ // Thumb tLDRspi, tSTRspi. These will change to instructions that use
+ // a different base register.
+ NumBits = 5;
+ Mask = (1 << NumBits) - 1;
+ }
+ // If this is a thumb spill / restore, we will be using a constpool load to
+ // materialize the offset.
+ if (AddrMode == ARMII::AddrModeTs && isThumbSpillRestore)
+ ImmOp.ChangeToImmediate(0);
+ else {
+ // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+ ImmedOffset = ImmedOffset & Mask;
+ if (isSub)
+ ImmedOffset |= 1 << NumBits;
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask*Scale);
+ }
+ }
+
+ // If we get here, the immediate doesn't fit into the instruction. We folded
+ // as much as possible above, handle the rest, providing a register that is
+ // SP+LargeImm.
+ assert(Offset && "This code isn't needed if offset already handled!");
+
+ if (isThumb) {
+ if (Desc.mayLoad()) {
+ // Use the destination register to materialize sp + offset.
+ unsigned TmpReg = MI.getOperand(0).getReg();
+ bool UseRR = false;
+ if (Opcode == ARM::tRestore) {
+ if (FrameReg == ARM::SP)
+ emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
+ Offset, false, TII, *this, dl);
+ else {
+ emitLoadConstPool(MBB, II, TmpReg, Offset, ARMCC::AL, 0, &TII,
+ true, dl);
+ UseRR = true;
+ }
+ } else
+ emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
+ *this, dl);
+ MI.setDesc(TII.get(ARM::tLDR));
+ MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
+ if (UseRR)
+ // Use [reg, reg] addrmode.
+ MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
+ else // tLDR has an extra register operand.
+ MI.addOperand(MachineOperand::CreateReg(0, false));
+ } else if (Desc.mayStore()) {
+ // FIXME! This is horrific!!! We need register scavenging.
+ // Our temporary workaround has marked r3 unavailable. Of course, r3 is
+ // also an ABI register, so it's possible that it is the register being
+ // stored here. If that's the case, we do the following:
+ // r12 = r2
+ // Use r2 to materialize sp + offset
+ // str r3, r2
+ // r2 = r12
+ unsigned ValReg = MI.getOperand(0).getReg();
+ unsigned TmpReg = ARM::R3;
+ bool UseRR = false;
+ if (ValReg == ARM::R3) {
+ BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
+ .addReg(ARM::R2, RegState::Kill);
+ TmpReg = ARM::R2;
+ }
+ if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
+ BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
+ .addReg(ARM::R3, RegState::Kill);
+ if (Opcode == ARM::tSpill) {
+ if (FrameReg == ARM::SP)
+ emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
+ Offset, false, TII, *this, dl);
+ else {
+ emitLoadConstPool(MBB, II, TmpReg, Offset, ARMCC::AL, 0, &TII,
+ true, dl);
+ UseRR = true;
+ }
+ } else
+ emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
+ *this, dl);
+ MI.setDesc(TII.get(ARM::tSTR));
+ MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
+ if (UseRR) // Use [reg, reg] addrmode.
+ MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
+ else // tSTR has an extra register operand.
+ MI.addOperand(MachineOperand::CreateReg(0, false));
+
+ MachineBasicBlock::iterator NII = next(II);
+ if (ValReg == ARM::R3)
+ BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R2)
+ .addReg(ARM::R12, RegState::Kill);
+ if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
+ BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
+ .addReg(ARM::R12, RegState::Kill);
+ } else
+ assert(false && "Unexpected opcode!");
+ } else {
+ // Insert a set of r12 with the full address: r12 = sp + offset
+ // If the offset we have is too large to fit into the instruction, we need
+ // to form it with a series of ADDri's. Do this by taking 8-bit chunks
+ // out of 'Offset'.
+ unsigned ScratchReg = findScratchRegister(RS, &ARM::GPRRegClass, AFI);
+ if (ScratchReg == 0)
+ // No register is "free". Scavenge a register.
+ ScratchReg = RS->scavengeRegister(&ARM::GPRRegClass, II, SPAdj);
+ int PIdx = MI.findFirstPredOperandIdx();
+ ARMCC::CondCodes Pred = (PIdx == -1)
+ ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+ unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+ emitARMRegPlusImmediate(MBB, II, ScratchReg, FrameReg,
+ isSub ? -Offset : Offset, Pred, PredReg, TII, dl);
+ MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+ }
+}
+
+static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) {
+ const MachineFrameInfo *FFI = MF.getFrameInfo();
+ int Offset = 0;
+ for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+ int FixedOff = -FFI->getObjectOffset(i);
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+ for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+ if (FFI->isDeadObjectIndex(i))
+ continue;
+ Offset += FFI->getObjectSize(i);
+ unsigned Align = FFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = (Offset+Align-1)/Align*Align;
+ }
+ return (unsigned)Offset;
+}
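+
+// Worked example (illustrative): two 4-byte objects that each require 8-byte
+// alignment estimate to 16 bytes: 0 + 4 rounds up to 8, then 8 + 4 rounds up
+// to 16.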
+
+void
+ARMRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ // This tells PEI to spill the FP as if it were any other callee-save
+ // register, to take advantage of the eliminateFrameIndex machinery. This
+ // also ensures it is spilled in the order specified by getCalleeSavedRegs()
+ // to make it easier to combine multiple loads / stores.
+ bool CanEliminateFrame = true;
+ bool CS1Spilled = false;
+ bool LRSpilled = false;
+ unsigned NumGPRSpills = 0;
+ SmallVector<unsigned, 4> UnspilledCS1GPRs;
+ SmallVector<unsigned, 4> UnspilledCS2GPRs;
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ // Don't spill FP if the frame can be eliminated. This is determined
+ // by scanning the callee-save registers to see if any is used.
+ const unsigned *CSRegs = getCalleeSavedRegs();
+ const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ bool Spilled = false;
+ if (MF.getRegInfo().isPhysRegUsed(Reg)) {
+ AFI->setCSRegisterIsSpilled(Reg);
+ Spilled = true;
+ CanEliminateFrame = false;
+ } else {
+ // Check alias registers too.
+ for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
+ if (MF.getRegInfo().isPhysRegUsed(*Aliases)) {
+ Spilled = true;
+ CanEliminateFrame = false;
+ }
+ }
+ }
+
+ if (CSRegClasses[i] == &ARM::GPRRegClass) {
+ if (Spilled) {
+ NumGPRSpills++;
+
+ if (!STI.isTargetDarwin()) {
+ if (Reg == ARM::LR)
+ LRSpilled = true;
+ CS1Spilled = true;
+ continue;
+ }
+
+ // Keep track of whether LR and any of R4, R5, R6, and R7 are spilled.
+ switch (Reg) {
+ case ARM::LR:
+ LRSpilled = true;
+ // Fallthrough
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ CS1Spilled = true;
+ break;
+ default:
+ break;
+ }
+ } else {
+ if (!STI.isTargetDarwin()) {
+ UnspilledCS1GPRs.push_back(Reg);
+ continue;
+ }
+
+ switch (Reg) {
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ UnspilledCS1GPRs.push_back(Reg);
+ break;
+ default:
+ UnspilledCS2GPRs.push_back(Reg);
+ break;
+ }
+ }
+ }
+ }
+
+ bool ForceLRSpill = false;
+ if (!LRSpilled && AFI->isThumbFunction()) {
+ unsigned FnSize = TII.GetFunctionSizeInBytes(MF);
+ // Force LR to be spilled if the Thumb function size is >= 2048. This
+ // enables the use of BL to implement a far jump. If it turns out that it's
+ // not needed, the branch fix-up path will undo it.
+ if (FnSize >= (1 << 11)) {
+ CanEliminateFrame = false;
+ ForceLRSpill = true;
+ }
+ }
+
+ bool ExtraCSSpill = false;
+ if (!CanEliminateFrame || hasFP(MF)) {
+ AFI->setHasStackFrame(true);
+
+ // If LR is not spilled but at least one of R4, R5, R6, and R7 is, spill LR
+ // as well so we can fold BX_RET into the register restore (LDM).
+ if (!LRSpilled && CS1Spilled) {
+ MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ AFI->setCSRegisterIsSpilled(ARM::LR);
+ NumGPRSpills++;
+ UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
+ UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+ ForceLRSpill = false;
+ ExtraCSSpill = true;
+ }
+
+ // Darwin ABI requires FP to point to the stack slot that contains the
+ // previous FP.
+ if (STI.isTargetDarwin() || hasFP(MF)) {
+ MF.getRegInfo().setPhysRegUsed(FramePtr);
+ NumGPRSpills++;
+ }
+
+ // If the stack and doubles are 8-byte aligned and we are spilling an odd
+ // number of GPRs, spill one extra callee-save GPR so we won't have to pad
+ // between the integer and double callee-save areas.
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+ if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+ if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+ for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+ unsigned Reg = UnspilledCS1GPRs[i];
+ // Don't spill a high register if the function is Thumb.
+ if (!AFI->isThumbFunction() || isLowRegister(Reg) || Reg == ARM::LR) {
+ MF.getRegInfo().setPhysRegUsed(Reg);
+ AFI->setCSRegisterIsSpilled(Reg);
+ if (!isReservedReg(MF, Reg))
+ ExtraCSSpill = true;
+ break;
+ }
+ }
+ } else if (!UnspilledCS2GPRs.empty() &&
+ !AFI->isThumbFunction()) {
+ unsigned Reg = UnspilledCS2GPRs.front();
+ MF.getRegInfo().setPhysRegUsed(Reg);
+ AFI->setCSRegisterIsSpilled(Reg);
+ if (!isReservedReg(MF, Reg))
+ ExtraCSSpill = true;
+ }
+ }
+
+ // Estimate if we might need to scavenge a register at some point in order
+ // to materialize a stack offset. If so, either spill one additional
+ // callee-saved register or reserve a special spill slot to facilitate
+ // register scavenging.
+ if (RS && !ExtraCSSpill && !AFI->isThumbFunction()) {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned Size = estimateStackSize(MF, MFI);
+ unsigned Limit = (1 << 12) - 1;
+ for (MachineFunction::iterator BB = MF.begin(),E = MF.end();BB != E; ++BB)
+ for (MachineBasicBlock::iterator I= BB->begin(); I != BB->end(); ++I) {
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (I->getOperand(i).isFI()) {
+ unsigned Opcode = I->getOpcode();
+ const TargetInstrDesc &Desc = TII.get(Opcode);
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ if (AddrMode == ARMII::AddrMode3) {
+ Limit = (1 << 8) - 1;
+ goto DoneEstimating;
+ } else if (AddrMode == ARMII::AddrMode5) {
+ unsigned ThisLimit = ((1 << 8) - 1) * 4;
+ if (ThisLimit < Limit)
+ Limit = ThisLimit;
+ }
+ }
+ }
+ DoneEstimating:
+ if (Size >= Limit) {
+ // If any non-reserved CS register isn't spilled, just spill one or two
+ // extra. That should take care of it!
+ unsigned NumExtras = TargetAlign / 4;
+ SmallVector<unsigned, 2> Extras;
+ while (NumExtras && !UnspilledCS1GPRs.empty()) {
+ unsigned Reg = UnspilledCS1GPRs.back();
+ UnspilledCS1GPRs.pop_back();
+ if (!isReservedReg(MF, Reg)) {
+ Extras.push_back(Reg);
+ NumExtras--;
+ }
+ }
+ while (NumExtras && !UnspilledCS2GPRs.empty()) {
+ unsigned Reg = UnspilledCS2GPRs.back();
+ UnspilledCS2GPRs.pop_back();
+ if (!isReservedReg(MF, Reg)) {
+ Extras.push_back(Reg);
+ NumExtras--;
+ }
+ }
+ if (Extras.size() && NumExtras == 0) {
+ for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+ MF.getRegInfo().setPhysRegUsed(Extras[i]);
+ AFI->setCSRegisterIsSpilled(Extras[i]);
+ }
+ } else {
+ // Reserve a slot closest to SP or frame pointer.
+ const TargetRegisterClass *RC = &ARM::GPRRegClass;
+ RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+ RC->getAlignment()));
+ }
+ }
+ }
+ }
+
+ if (ForceLRSpill) {
+ MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ AFI->setCSRegisterIsSpilled(ARM::LR);
+ AFI->setLRIsSpilledForFarJump(true);
+ }
+}
+
+/// Move the iterator past the next bunch of callee-save load / store ops for
+/// the particular spill area (1: integer area 1, 2: integer area 2,
+/// 3: fp area, 0: don't care).
+static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ int Opc, unsigned Area,
+ const ARMSubtarget &STI) {
+ while (MBBI != MBB.end() &&
+ MBBI->getOpcode() == Opc && MBBI->getOperand(1).isFI()) {
+ if (Area != 0) {
+ bool Done = false;
+ unsigned Category = 0;
+ switch (MBBI->getOperand(0).getReg()) {
+ case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7:
+ case ARM::LR:
+ Category = 1;
+ break;
+ case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
+ Category = STI.isTargetDarwin() ? 2 : 1;
+ break;
+ case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11:
+ case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
+ Category = 3;
+ break;
+ default:
+ Done = true;
+ break;
+ }
+ if (Done || Category != Area)
+ break;
+ }
+
+ ++MBBI;
+ }
+}
+
+void ARMRegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ bool isThumb = AFI->isThumbFunction();
+ unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+ unsigned NumBytes = MFI->getStackSize();
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+ if (isThumb) {
+ // Check if R3 is live in. It might have to be used as a scratch register.
+ for (MachineRegisterInfo::livein_iterator I =MF.getRegInfo().livein_begin(),
+ E = MF.getRegInfo().livein_end(); I != E; ++I) {
+ if (I->first == ARM::R3) {
+ AFI->setR3IsLiveIn(true);
+ break;
+ }
+ }
+
+ // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
+ NumBytes = (NumBytes + 3) & ~3;
+ MFI->setStackSize(NumBytes);
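+ // e.g. (illustrative): the rounding above turns NumBytes == 13 into
+ // (13 + 3) & ~3 == 16, keeping sp adjustments a multiple of 4.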
+ }
+
+ // Determine the size of each callee-save spill area and record which frame
+ // index belongs to which area.
+ unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+ int FramePtrSpillFI = 0;
+
+ if (VARegSaveSize)
+ emitSPUpdate(MBB, MBBI, -VARegSaveSize, ARMCC::AL, 0, isThumb, TII,
+ *this, dl);
+
+ if (!AFI->hasStackFrame()) {
+ if (NumBytes != 0)
+ emitSPUpdate(MBB, MBBI, -NumBytes, ARMCC::AL, 0, isThumb, TII, *this, dl);
+ return;
+ }
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ int FI = CSI[i].getFrameIdx();
+ switch (Reg) {
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ if (Reg == FramePtr)
+ FramePtrSpillFI = FI;
+ AFI->addGPRCalleeSavedArea1Frame(FI);
+ GPRCS1Size += 4;
+ break;
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ if (Reg == FramePtr)
+ FramePtrSpillFI = FI;
+ if (STI.isTargetDarwin()) {
+ AFI->addGPRCalleeSavedArea2Frame(FI);
+ GPRCS2Size += 4;
+ } else {
+ AFI->addGPRCalleeSavedArea1Frame(FI);
+ GPRCS1Size += 4;
+ }
+ break;
+ default:
+ AFI->addDPRCalleeSavedAreaFrame(FI);
+ DPRCSSize += 8;
+ }
+ }
+
+ if (!isThumb) {
+ // Build the new SUBri to adjust SP for integer callee-save spill area 1.
+ emitSPUpdate(MBB, MBBI, -GPRCS1Size, ARMCC::AL, 0, isThumb, TII, *this, dl);
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 1, STI);
+ } else if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+ ++MBBI;
+ if (MBBI != MBB.end())
+ dl = MBBI->getDebugLoc();
+ }
+
+ // Darwin ABI requires FP to point to the stack slot that contains the
+ // previous FP.
+ if (STI.isTargetDarwin() || hasFP(MF)) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(isThumb ? ARM::tADDrSPi : ARM::ADDri),
+ FramePtr)
+ .addFrameIndex(FramePtrSpillFI).addImm(0);
+ if (!isThumb) AddDefaultCC(AddDefaultPred(MIB));
+ }
+
+ if (!isThumb) {
+ // Build the new SUBri to adjust SP for integer callee-save spill area 2.
+ emitSPUpdate(MBB, MBBI, -GPRCS2Size, ARMCC::AL, 0, false, TII, *this, dl);
+
+ // Build the new SUBri to adjust SP for FP callee-save spill area.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 2, STI);
+ emitSPUpdate(MBB, MBBI, -DPRCSSize, ARMCC::AL, 0, false, TII, *this, dl);
+ }
+
+ // Determine starting offsets of spill areas.
+ unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+ unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+ unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
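+ // Worked example (illustrative): with NumBytes == 64, GPRCS1Size == 20,
+ // GPRCS2Size == 8 and DPRCSSize == 16, the areas start at DPRCSOffset ==
+ // 20, GPRCS2Offset == 36 and GPRCS1Offset == 44.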
+ AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
+ AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+ AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+ AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+ NumBytes = DPRCSOffset;
+ if (NumBytes) {
+ // Insert it after all the callee-save spills.
+ if (!isThumb)
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 3, STI);
+ emitSPUpdate(MBB, MBBI, -NumBytes, ARMCC::AL, 0, isThumb, TII, *this, dl);
+ }
+
+ if(STI.isTargetELF() && hasFP(MF)) {
+ MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+ AFI->getFramePtrSpillOffset());
+ }
+
+ AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+ AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+ AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ if (Reg == CSRegs[i])
+ return true;
+ return false;
+}
+
+static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
+ return ((MI->getOpcode() == ARM::FLDD ||
+ MI->getOpcode() == ARM::LDR ||
+ MI->getOpcode() == ARM::tRestore) &&
+ MI->getOperand(1).isFI() &&
+ isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
+}
+
+void ARMRegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ assert((MBBI->getOpcode() == ARM::BX_RET ||
+ MBBI->getOpcode() == ARM::tBX_RET ||
+ MBBI->getOpcode() == ARM::tPOP_RET) &&
+ "Can only insert epilog into returning blocks");
+ DebugLoc dl = MBBI->getDebugLoc();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ bool isThumb = AFI->isThumbFunction();
+ unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+ int NumBytes = (int)MFI->getStackSize();
+
+ if (!AFI->hasStackFrame()) {
+ if (NumBytes != 0)
+ emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, isThumb, TII, *this, dl);
+ } else {
+ // Unwind MBBI to point to first LDR / FLDD.
+ const unsigned *CSRegs = getCalleeSavedRegs();
+ if (MBBI != MBB.begin()) {
+ do
+ --MBBI;
+ while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
+ if (!isCSRestore(MBBI, CSRegs))
+ ++MBBI;
+ }
+
+ // Move SP to start of FP callee save spill area.
+ NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+ AFI->getGPRCalleeSavedArea2Size() +
+ AFI->getDPRCalleeSavedAreaSize());
+ if (isThumb) {
+ if (hasFP(MF)) {
+ NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+ // Reset SP based on the frame pointer only if the stack frame extends
+ // beyond the frame pointer stack slot, or the target is ELF and the
+ // function has an FP.
+ if (NumBytes)
+ emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes,
+ TII, *this, dl);
+ else
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::SP)
+ .addReg(FramePtr);
+ } else {
+ if (MBBI->getOpcode() == ARM::tBX_RET &&
+ &MBB.front() != MBBI &&
+ prior(MBBI)->getOpcode() == ARM::tPOP) {
+ MachineBasicBlock::iterator PMBBI = prior(MBBI);
+ emitSPUpdate(MBB, PMBBI, NumBytes, ARMCC::AL, 0, isThumb, TII,
+ *this, dl);
+ } else
+ emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, isThumb, TII,
+ *this, dl);
+ }
+ } else {
+ // Darwin ABI requires FP to point to the stack slot that contains the
+ // previous FP.
+ if ((STI.isTargetDarwin() && NumBytes) || hasFP(MF)) {
+ NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+ // Reset SP based on the frame pointer only if the stack frame extends
+ // beyond the frame pointer stack slot, or the target is ELF and the
+ // function has an FP.
+ if (AFI->getGPRCalleeSavedArea2Size() ||
+ AFI->getDPRCalleeSavedAreaSize() ||
+ AFI->getDPRCalleeSavedAreaOffset()||
+ hasFP(MF)) {
+ if (NumBytes)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::SUBri), ARM::SP).addReg(FramePtr)
+ .addImm(NumBytes)
+ .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ else
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP).addReg(FramePtr)
+ .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ }
+ } else if (NumBytes) {
+ emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, false, TII, *this, dl);
+ }
+
+ // Move SP to start of integer callee save spill area 2.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 3, STI);
+ emitSPUpdate(MBB, MBBI, AFI->getDPRCalleeSavedAreaSize(), ARMCC::AL, 0,
+ false, TII, *this, dl);
+
+ // Move SP to start of integer callee save spill area 1.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 2, STI);
+ emitSPUpdate(MBB, MBBI, AFI->getGPRCalleeSavedArea2Size(), ARMCC::AL, 0,
+ false, TII, *this, dl);
+
+ // Move SP back to its value upon entry to the function.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 1, STI);
+ emitSPUpdate(MBB, MBBI, AFI->getGPRCalleeSavedArea1Size(), ARMCC::AL, 0,
+ false, TII, *this, dl);
+ }
+ }
+
+ if (VARegSaveSize) {
+ if (isThumb)
+ // Epilogue for vararg functions: pop LR to R3 and branch off it.
+ // FIXME: Verify this is still ok when R3 is no longer being reserved.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)).addReg(ARM::R3);
+
+ emitSPUpdate(MBB, MBBI, VARegSaveSize, ARMCC::AL, 0, isThumb, TII,
+ *this, dl);
+
+ if (isThumb) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3);
+ MBB.erase(MBBI);
+ }
+ }
+}
+
+unsigned ARMRegisterInfo::getRARegister() const {
+ return ARM::LR;
+}
+
+unsigned ARMRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ if (STI.isTargetDarwin() || hasFP(MF))
+ return (STI.useThumbBacktraces() || STI.isThumb()) ? ARM::R7 : ARM::R11;
+ else
+ return ARM::SP;
+}
+
+unsigned ARMRegisterInfo::getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned ARMRegisterInfo::getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+int ARMRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+#include "ARMGenRegisterInfo.inc"
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
new file mode 100644
index 0000000..e1d9efb
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -0,0 +1,102 @@
+//===- ARMRegisterInfo.h - ARM Register Information Impl --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMREGISTERINFO_H
+#define ARMREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "ARMGenRegisterInfo.h.inc"
+
+namespace llvm {
+ class ARMSubtarget;
+ class TargetInstrInfo;
+ class Type;
+
+struct ARMRegisterInfo : public ARMGenRegisterInfo {
+ const TargetInstrInfo &TII;
+ const ARMSubtarget &STI;
+private:
+ /// FramePtr - ARM physical register used as frame ptr.
+ unsigned FramePtr;
+
+public:
+ ARMRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
+
+ /// emitLoadConstPool - Emits a load from constpool to materialize the
+ /// specified immediate.
+ void emitLoadConstPool(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned DestReg, int Val,
+ unsigned Pred, unsigned PredReg,
+ const TargetInstrInfo *TII, bool isThumb,
+ DebugLoc dl) const;
+
+ /// getRegisterNumbering - Given the enum value for some register, e.g.
+ /// ARM::LR, return the number that it corresponds to (e.g. 14).
+ static unsigned getRegisterNumbering(unsigned RegEnum);
+
+ /// Same as previous getRegisterNumbering except it returns true in isSPVFP
+ /// if the register is a single precision VFP register.
+ static unsigned getRegisterNumbering(unsigned RegEnum, bool &isSPVFP);
+
+ /// getPointerRegClass - Return the register class to use to hold pointers.
+ /// This is used for addressing modes.
+ const TargetRegisterClass *getPointerRegClass() const;
+
+ /// Code Generation virtual methods...
+ const TargetRegisterClass *
+ getPhysicalRegisterRegClass(unsigned Reg, MVT VT = MVT::Other) const;
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ bool hasReservedCallFrame(MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+ bool isLowRegister(unsigned Reg) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
new file mode 100644
index 0000000..e8daf74
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -0,0 +1,221 @@
+//===- ARMRegisterInfo.td - ARM Register defs -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the ARM register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> {
+ field bits<4> Num;
+ let Namespace = "ARM";
+ let SubRegs = subregs;
+}
+
+class ARMFReg<bits<5> num, string n> : Register<n> {
+ field bits<5> Num;
+ let Namespace = "ARM";
+}
+
+// Integer registers
+def R0 : ARMReg< 0, "r0">, DwarfRegNum<[0]>;
+def R1 : ARMReg< 1, "r1">, DwarfRegNum<[1]>;
+def R2 : ARMReg< 2, "r2">, DwarfRegNum<[2]>;
+def R3 : ARMReg< 3, "r3">, DwarfRegNum<[3]>;
+def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>;
+def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>;
+def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>;
+def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>;
+def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>;
+def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>;
+def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
+def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>;
+def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>;
+def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>;
+
+// Float registers
+def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">;
+def S2 : ARMFReg< 2, "s2">; def S3 : ARMFReg< 3, "s3">;
+def S4 : ARMFReg< 4, "s4">; def S5 : ARMFReg< 5, "s5">;
+def S6 : ARMFReg< 6, "s6">; def S7 : ARMFReg< 7, "s7">;
+def S8 : ARMFReg< 8, "s8">; def S9 : ARMFReg< 9, "s9">;
+def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">;
+def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">;
+def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">;
+def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">;
+def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">;
+def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">;
+def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">;
+def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
+def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
+def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
+def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+
+// Aliases of the S* registers used to hold 64-bit fp values (doubles)
+def D0 : ARMReg< 0, "d0", [S0, S1]>;
+def D1 : ARMReg< 1, "d1", [S2, S3]>;
+def D2 : ARMReg< 2, "d2", [S4, S5]>;
+def D3 : ARMReg< 3, "d3", [S6, S7]>;
+def D4 : ARMReg< 4, "d4", [S8, S9]>;
+def D5 : ARMReg< 5, "d5", [S10, S11]>;
+def D6 : ARMReg< 6, "d6", [S12, S13]>;
+def D7 : ARMReg< 7, "d7", [S14, S15]>;
+def D8 : ARMReg< 8, "d8", [S16, S17]>;
+def D9 : ARMReg< 9, "d9", [S18, S19]>;
+def D10 : ARMReg<10, "d10", [S20, S21]>;
+def D11 : ARMReg<11, "d11", [S22, S23]>;
+def D12 : ARMReg<12, "d12", [S24, S25]>;
+def D13 : ARMReg<13, "d13", [S26, S27]>;
+def D14 : ARMReg<14, "d14", [S28, S29]>;
+def D15 : ARMReg<15, "d15", [S30, S31]>;
+
+// Current Program Status Register.
+def CPSR : ARMReg<0, "cpsr">;
+
+// Register classes.
+//
+// pc == Program Counter
+// lr == Link Register
+// sp == Stack Pointer
+// r12 == ip (scratch)
+// r7 == Frame Pointer (thumb-style backtraces)
+// r11 == Frame Pointer (arm-style backtraces)
+// r10 == Stack Limit
+//
+def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
+ R7, R8, R9, R10, R12, R11,
+ LR, SP, PC]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ // FIXME: We are reserving r12 in case the PEI needs to use it to
+ // generate large stack offset. Make it available once we have register
+ // scavenging. Similarly r3 is reserved in Thumb mode for now.
+ let MethodBodies = [{
+ // FP is R11, R9 is available.
+ static const unsigned ARM_GPR_AO_1[] = {
+ ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+ ARM::R12,ARM::LR,
+ ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+ ARM::R8, ARM::R9, ARM::R10,
+ ARM::R11 };
+ // FP is R11, R9 is not available.
+ static const unsigned ARM_GPR_AO_2[] = {
+ ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+ ARM::R12,ARM::LR,
+ ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+ ARM::R8, ARM::R10,
+ ARM::R11 };
+ // FP is R7, R9 is available.
+ static const unsigned ARM_GPR_AO_3[] = {
+ ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+ ARM::R12,ARM::LR,
+ ARM::R4, ARM::R5, ARM::R6,
+ ARM::R8, ARM::R9, ARM::R10,ARM::R11,
+ ARM::R7 };
+ // FP is R7, R9 is not available.
+ static const unsigned ARM_GPR_AO_4[] = {
+ ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+ ARM::R12,ARM::LR,
+ ARM::R4, ARM::R5, ARM::R6,
+ ARM::R8, ARM::R10,ARM::R11,
+ ARM::R7 };
+
+ GPRClass::iterator
+ GPRClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+ if (Subtarget.useThumbBacktraces()) {
+ if (Subtarget.isR9Reserved())
+ return ARM_GPR_AO_4;
+ else
+ return ARM_GPR_AO_3;
+ } else {
+ if (Subtarget.isR9Reserved())
+ return ARM_GPR_AO_2;
+ else
+ return ARM_GPR_AO_1;
+ }
+ }
+
+ GPRClass::iterator
+ GPRClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+ GPRClass::iterator I;
+
+ if (Subtarget.useThumbBacktraces()) {
+ if (Subtarget.isR9Reserved()) {
+ I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned));
+ } else {
+ I = ARM_GPR_AO_3 + (sizeof(ARM_GPR_AO_3)/sizeof(unsigned));
+ }
+ } else {
+ if (Subtarget.isR9Reserved()) {
+ I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned));
+ } else {
+ I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned));
+ }
+ }
+
+ // Mac OS X requires FP not to be clobbered for backtracing purposes.
+ return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I;
+ }
+ }];
+}
+
+// Thumb registers are normally R0-R7. Some instructions can still use
+// the general GPR register class above (e.g. MOV).
+def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ // FIXME: We are reserving r3 in Thumb mode in case the PEI needs to use it
+ // to generate large stack offset. Make it available once we have register
+ // scavenging.
+ let MethodBodies = [{
+ static const unsigned THUMB_tGPR_AO[] = {
+ ARM::R2, ARM::R1, ARM::R0,
+ ARM::R4, ARM::R5, ARM::R6, ARM::R7 };
+
+ // FP is R7, only low registers available.
+ tGPRClass::iterator
+ tGPRClass::allocation_order_begin(const MachineFunction &MF) const {
+ return THUMB_tGPR_AO;
+ }
+
+ tGPRClass::iterator
+ tGPRClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+ tGPRClass::iterator I =
+ THUMB_tGPR_AO + (sizeof(THUMB_tGPR_AO)/sizeof(unsigned));
+ // Mac OS X requires FP not to be clobbered for backtracing purposes.
+ return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I;
+ }
+ }];
+}
+
+def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
+ S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
+ S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
+
+// ARM requires only word alignment for doubles, though double-word
+// alignment performs better.
+def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8,
+ D9, D10, D11, D12, D13, D14, D15]>;
+
+// Condition code registers.
+def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
diff --git a/lib/Target/ARM/ARMRelocations.h b/lib/Target/ARM/ARMRelocations.h
new file mode 100644
index 0000000..2cc2950
--- /dev/null
+++ b/lib/Target/ARM/ARMRelocations.h
@@ -0,0 +1,56 @@
+//===- ARMRelocations.h - ARM Code Relocations ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMRELOCATIONS_H
+#define ARMRELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace ARM {
+ enum RelocationType {
+ // reloc_arm_absolute - Absolute relocation, just add the relocated value
+ // to the value already in memory.
+ reloc_arm_absolute,
+
+ // reloc_arm_relative - PC relative relocation, add the relocated value to
+ // the value already in memory, after we adjust it for where the PC is.
+ reloc_arm_relative,
+
+ // reloc_arm_cp_entry - PC relative relocation for constpool_entry's whose
+ // addresses are kept locally in a map.
+ reloc_arm_cp_entry,
+
+ // reloc_arm_vfp_cp_entry - Same as reloc_arm_cp_entry except the offset
+ // should be divided by 4.
+ reloc_arm_vfp_cp_entry,
+
+ // reloc_arm_machine_cp_entry - Relocation of an ARM machine constantpool
+ // entry.
+ reloc_arm_machine_cp_entry,
+
+ // reloc_arm_jt_base - PC relative relocation for jump tables whose
+ // addresses are kept locally in a map.
+ reloc_arm_jt_base,
+
+ // reloc_arm_pic_jt - PIC jump table entry relocation: dest bb - jt base.
+ reloc_arm_pic_jt,
+
+ // reloc_arm_branch - Branch address relocation.
+ reloc_arm_branch
+ };
+ }
+}
+
+#endif
+
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
new file mode 100644
index 0000000..ef78cd5
--- /dev/null
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -0,0 +1,84 @@
+//===-- ARMSubtarget.cpp - ARM Subtarget Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "ARMGenSubtarget.inc"
+#include "llvm/Module.h"
+using namespace llvm;
+
+ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
+ bool isThumb)
+ : ARMArchVersion(V4T)
+ , ARMFPUType(None)
+ , IsThumb(isThumb)
+ , ThumbMode(Thumb1)
+ , UseThumbBacktraces(false)
+ , IsR9Reserved(false)
+ , stackAlignment(4)
+ , CPUString("generic")
+ , TargetType(isELF) // Default to ELF unless otherwise specified.
+ , TargetABI(ARM_ABI_APCS) {
+ // Determine default and user specified characteristics
+
+ // Parse features string.
+ CPUString = ParseSubtargetFeatures(FS, CPUString);
+
+ // Determine the architecture version, Thumb mode, and OS/ABI defaults
+ // from the module's target triple.
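+ // For example (illustrative), "armv6-apple-darwin" selects V6 and the
+ // Darwin target type, while a "thumbv5te" prefix sets IsThumb and V5TE.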
+ const std::string& TT = M.getTargetTriple();
+ unsigned Len = TT.length();
+ unsigned Idx = 0;
+
+ if (Len >= 5 && TT.substr(0, 4) == "armv")
+ Idx = 4;
+ else if (Len >= 6 && TT.substr(0, 5) == "thumb") {
+ IsThumb = true;
+ if (Len >= 7 && TT[5] == 'v')
+ Idx = 6;
+ }
+ if (Idx) {
+ unsigned SubVer = TT[Idx];
+ if (SubVer > '4' && SubVer <= '9') {
+ if (SubVer >= '7')
+ ARMArchVersion = V7A;
+ else if (SubVer == '6')
+ ARMArchVersion = V6;
+ else if (SubVer == '5') {
+ ARMArchVersion = V5T;
+ if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e')
+ ARMArchVersion = V5TE;
+ }
+ }
+ }
+
+ if (Len >= 10) {
+ if (TT.find("-darwin") != std::string::npos)
+ // arm-darwin
+ TargetType = isDarwin;
+ } else if (TT.empty()) {
+#if defined(__APPLE__)
+ TargetType = isDarwin;
+#endif
+ }
+
+ if (TT.find("eabi") != std::string::npos)
+ TargetABI = ARM_ABI_AAPCS;
+
+ if (isAAPCS_ABI())
+ stackAlignment = 8;
+
+ if (isTargetDarwin()) {
+ UseThumbBacktraces = true;
+ IsR9Reserved = true;
+ }
+}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 0000000..8b469cf
--- /dev/null
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,122 @@
+//=====---- ARMSubtarget.h - Define Subtarget for the ARM -----*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSUBTARGET_H
+#define ARMSUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+class Module;
+
+class ARMSubtarget : public TargetSubtarget {
+protected:
+ enum ARMArchEnum {
+ V4T, V5T, V5TE, V6, V7A
+ };
+
+ enum ARMFPEnum {
+ None, VFPv2, VFPv3, NEON
+ };
+
+ enum ThumbTypeEnum {
+ Thumb1,
+ Thumb2
+ };
+
+ /// ARMArchVersion - ARM architecture version: V4T (base), V5T, V5TE,
+ /// V6, V7A.
+ ARMArchEnum ARMArchVersion;
+
+ /// ARMFPUType - Floating Point Unit type.
+ ARMFPEnum ARMFPUType;
+
+ /// IsThumb - True if we are in thumb mode, false if in ARM mode.
+ bool IsThumb;
+
+ /// ThumbMode - Indicates supported Thumb version.
+ ThumbTypeEnum ThumbMode;
+
+ /// UseThumbBacktraces - True if we use thumb style backtraces.
+ bool UseThumbBacktraces;
+
+ /// IsR9Reserved - True if R9 is not available as a general purpose register.
+ bool IsR9Reserved;
+
+ /// stackAlignment - The minimum alignment of the stack frame that is known
+ /// to hold on entry to the function and that must be maintained by every
+ /// function.
+ unsigned stackAlignment;
+
+ /// CPUString - String name of used CPU.
+ std::string CPUString;
+
+ public:
+ enum {
+ isELF, isDarwin
+ } TargetType;
+
+ enum {
+ ARM_ABI_APCS,
+ ARM_ABI_AAPCS // ARM EABI
+ } TargetABI;
+
+ /// This constructor initializes the data members to match those
+ /// of the specified module.
+ ///
+ ARMSubtarget(const Module &M, const std::string &FS, bool isThumb);
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const {
+ // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb.
+ // Change this once Thumb ldmia / stmia support is added.
+ return isThumb() ? 0 : 64;
+ }
+ /// ParseSubtargetFeatures - Parses the features string, setting the
+ /// specified subtarget options. The definition is auto-generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+ bool hasV4TOps() const { return ARMArchVersion >= V4T; }
+ bool hasV5TOps() const { return ARMArchVersion >= V5T; }
+ bool hasV5TEOps() const { return ARMArchVersion >= V5TE; }
+ bool hasV6Ops() const { return ARMArchVersion >= V6; }
+ bool hasV7Ops() const { return ARMArchVersion >= V7A; }
+
+ bool hasVFP2() const { return ARMFPUType >= VFPv2; }
+ bool hasVFP3() const { return ARMFPUType >= VFPv3; }
+ bool hasNEON() const { return ARMFPUType >= NEON; }
+
+ bool isTargetDarwin() const { return TargetType == isDarwin; }
+ bool isTargetELF() const { return TargetType == isELF; }
+
+ bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
+ bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; }
+
+ bool isThumb() const { return IsThumb; }
+ bool isThumb2() const { return IsThumb && (ThumbMode >= Thumb2); }
+
+ bool useThumbBacktraces() const { return UseThumbBacktraces; }
+ bool isR9Reserved() const { return IsR9Reserved; }
+
+ const std::string & getCPUString() const { return CPUString; }
+
+ /// getStackAlignment - Returns the minimum alignment of the stack frame
+ /// that is known to hold on entry to the function and that must be
+ /// maintained by every function for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+};
+} // End llvm namespace
+
+#endif // ARMSUBTARGET_H
diff --git a/lib/Target/ARM/ARMTargetAsmInfo.cpp b/lib/Target/ARM/ARMTargetAsmInfo.cpp
new file mode 100644
index 0000000..4107dcc
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetAsmInfo.cpp
@@ -0,0 +1,291 @@
+//===-- ARMTargetAsmInfo.cpp - ARM asm properties ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetAsmInfo.h"
+#include "ARMTargetMachine.h"
+#include <cstring>
+#include <cctype>
+using namespace llvm;
+
+
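+// arm_asm_table translates GCC inline asm register constraint names to the
+// names the assembler expects, as alternating {constraint, name} pairs
+// terminated by a null pair; it is installed as AsmTransCBE in
+// ARMTargetAsmInfo.h. For example (illustrative), "{r0}" maps to "r0".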
+const char *const llvm::arm_asm_table[] = {
+ "{r0}", "r0",
+ "{r1}", "r1",
+ "{r2}", "r2",
+ "{r3}", "r3",
+ "{r4}", "r4",
+ "{r5}", "r5",
+ "{r6}", "r6",
+ "{r7}", "r7",
+ "{r8}", "r8",
+ "{r9}", "r9",
+ "{r10}", "r10",
+ "{r11}", "r11",
+ "{r12}", "r12",
+ "{r13}", "r13",
+ "{r14}", "r14",
+ "{lr}", "lr",
+ "{sp}", "sp",
+ "{ip}", "ip",
+ "{fp}", "fp",
+ "{sl}", "sl",
+ "{memory}", "memory",
+ "{cc}", "cc",
+ 0,0};
+
+ARMDarwinTargetAsmInfo::ARMDarwinTargetAsmInfo(const ARMTargetMachine &TM):
+ ARMTargetAsmInfo<DarwinTargetAsmInfo>(TM) {
+ Subtarget = &TM.getSubtarget<ARMSubtarget>();
+
+ GlobalPrefix = "_";
+ PrivateGlobalPrefix = "L";
+ LessPrivateGlobalPrefix = "l";
+ StringConstantPrefix = "\1LC";
+ BSSSection = 0; // no BSS section
+ ZeroDirective = "\t.space\t";
+ ZeroFillDirective = "\t.zerofill\t"; // Uses .zerofill
+ SetDirective = "\t.set\t";
+ WeakRefDirective = "\t.weak_reference\t";
+ WeakDefDirective = "\t.weak_definition ";
+ HiddenDirective = "\t.private_extern\t";
+ ProtectedDirective = NULL;
+ JumpTableDataSection = ".const";
+ CStringSection = "\t.cstring";
+ HasDotTypeDotSizeDirective = false;
+ HasSingleParameterDotFile = false;
+ NeedsIndirectEncoding = true;
+ if (TM.getRelocationModel() == Reloc::Static) {
+ StaticCtorsSection = ".constructor";
+ StaticDtorsSection = ".destructor";
+ } else {
+ StaticCtorsSection = ".mod_init_func";
+ StaticDtorsSection = ".mod_term_func";
+ }
+
+ // In non-PIC modes, emit a special label before jump tables so that the
+ // linker can perform more accurate dead code stripping.
+ if (TM.getRelocationModel() != Reloc::PIC_) {
+ // Emit a local label that is preserved until the linker runs.
+ JumpTableSpecialLabelPrefix = "l";
+ }
+
+ NeedsSet = true;
+ DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+ DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+ DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+ DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+ DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+ DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+ DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+ DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+ DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+ DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+ DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+}
+
+ARMELFTargetAsmInfo::ARMELFTargetAsmInfo(const ARMTargetMachine &TM):
+ ARMTargetAsmInfo<ELFTargetAsmInfo>(TM) {
+ Subtarget = &TM.getSubtarget<ARMSubtarget>();
+
+ NeedsSet = false;
+ HasLEB128 = true;
+ AbsoluteDebugSectionOffsets = true;
+ CStringSection = ".rodata.str";
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+ DwarfRequiresFrameSection = false;
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"\",%progbits";
+ DwarfInfoSection = "\t.section\t.debug_info,\"\",%progbits";
+ DwarfLineSection = "\t.section\t.debug_line,\"\",%progbits";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"\",%progbits";
+ DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",%progbits";
+ DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",%progbits";
+ DwarfStrSection = "\t.section\t.debug_str,\"\",%progbits";
+ DwarfLocSection = "\t.section\t.debug_loc,\"\",%progbits";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"\",%progbits";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"\",%progbits";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",%progbits";
+
+ if (Subtarget->isAAPCS_ABI()) {
+ StaticCtorsSection = "\t.section .init_array,\"aw\",%init_array";
+ StaticDtorsSection = "\t.section .fini_array,\"aw\",%fini_array";
+ } else {
+ StaticCtorsSection = "\t.section .ctors,\"aw\",%progbits";
+ StaticDtorsSection = "\t.section .dtors,\"aw\",%progbits";
+ }
+}
+
+/// Count the number of comma-separated arguments.
+/// Do not try to detect errors.
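+/// For example (illustrative), countArguments(" 1, 2, 3") returns 3.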
+template <class BaseTAI>
+unsigned ARMTargetAsmInfo<BaseTAI>::countArguments(const char* p) const {
+ unsigned count = 0;
+ while (*p && isspace(*p) && *p != '\n')
+ p++;
+ count++;
+ while (*p && *p!='\n' &&
+ strncmp(p, BaseTAI::CommentString,
+ strlen(BaseTAI::CommentString))!=0) {
+ if (*p==',')
+ count++;
+ p++;
+ }
+ return count;
+}
+
+/// Count the length of a string enclosed in quote characters.
+/// Do not try to detect errors.
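+/// For example (illustrative), countString(" \"abc\"") returns 3.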
+template <class BaseTAI>
+unsigned ARMTargetAsmInfo<BaseTAI>::countString(const char* p) const {
+ unsigned count = 0;
+ while (*p && isspace(*p) && *p!='\n')
+ p++;
+ if (!*p || *p != '\"')
+ return count;
+ while (*++p && *p != '\"')
+ count++;
+ return count;
+}
+
+/// ARM-specific version of TargetAsmInfo::getInlineAsmLength.
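+/// Returns a conservative byte count for the asm string; e.g. in Thumb mode
+/// "bl foo\n add r0, r1, r2" counts as 4 + 2 = 6 bytes (illustrative).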
+template <class BaseTAI>
+unsigned ARMTargetAsmInfo<BaseTAI>::getInlineAsmLength(const char *s) const {
+ // Make a lowercase-folded version of s for counting purposes.
+ char *q, *s_copy = (char *)malloc(strlen(s) + 1);
+ strcpy(s_copy, s);
+ for (q=s_copy; *q; q++)
+ *q = tolower(*q);
+ const char *Str = s_copy;
+
+ // Count the number of bytes in the asm.
+ bool atInsnStart = true;
+ bool inTextSection = true;
+ unsigned Length = 0;
+ for (; *Str; ++Str) {
+ if (atInsnStart) {
+ // Skip whitespace
+ while (*Str && isspace(*Str) && *Str != '\n')
+ Str++;
+ // Skip label
+ for (const char* p = Str; *p && !isspace(*p); p++)
+ if (*p == ':') {
+ Str = p+1;
+ while (*Str && isspace(*Str) && *Str != '\n')
+ Str++;
+ break;
+ }
+
+ if (*Str == 0) break;
+
+ // Ignore everything from comment char(s) to EOL
+ if (strncmp(Str, BaseTAI::CommentString,
+ strlen(BaseTAI::CommentString)) == 0)
+ atInsnStart = false;
+ // FIXME do something like the following for non-Darwin
+ else if (*Str == '.' && Subtarget->isTargetDarwin()) {
+ // Directive.
+ atInsnStart = false;
+
+ // Some change the section, but don't generate code.
+ if (strncmp(Str, ".literal4", strlen(".literal4"))==0 ||
+ strncmp(Str, ".literal8", strlen(".literal8"))==0 ||
+ strncmp(Str, ".const", strlen(".const"))==0 ||
+ strncmp(Str, ".constructor", strlen(".constructor"))==0 ||
+ strncmp(Str, ".cstring", strlen(".cstring"))==0 ||
+ strncmp(Str, ".data", strlen(".data"))==0 ||
+ strncmp(Str, ".destructor", strlen(".destructor"))==0 ||
+ strncmp(Str, ".fvmlib_init0", strlen(".fvmlib_init0"))==0 ||
+ strncmp(Str, ".fvmlib_init1", strlen(".fvmlib_init1"))==0 ||
+ strncmp(Str, ".mod_init_func", strlen(".mod_init_func"))==0 ||
+ strncmp(Str, ".mod_term_func", strlen(".mod_term_func"))==0 ||
+ strncmp(Str, ".picsymbol_stub", strlen(".picsymbol_stub"))==0 ||
+ strncmp(Str, ".symbol_stub", strlen(".symbol_stub"))==0 ||
+ strncmp(Str, ".static_data", strlen(".static_data"))==0 ||
+ strncmp(Str, ".section", strlen(".section"))==0 ||
+ strncmp(Str, ".lazy_symbol_pointer", strlen(".lazy_symbol_pointer"))==0 ||
+ strncmp(Str, ".non_lazy_symbol_pointer", strlen(".non_lazy_symbol_pointer"))==0 ||
+ strncmp(Str, ".dyld", strlen(".dyld"))==0 ||
+ strncmp(Str, ".const_data", strlen(".const_data"))==0 ||
+ strncmp(Str, ".objc", strlen(".objc"))==0 || //// many directives
+ strncmp(Str, ".static_const", strlen(".static_const"))==0)
+ inTextSection=false;
+ else if (strncmp(Str, ".text", strlen(".text"))==0)
+ inTextSection = true;
+ // Some can't really be handled without implementing significant pieces
+ // of an assembler. Others require dynamic adjustment of block sizes in
+ // AdjustBBOffsetsAfter; it's a big compile-time speed hit to check every
+ // instruction in there, and none of these are currently used in the kernel.
+ else if (strncmp(Str, ".macro", strlen(".macro"))==0 ||
+ strncmp(Str, ".if", strlen(".if"))==0 ||
+ strncmp(Str, ".align", strlen(".align"))==0 ||
+ strncmp(Str, ".fill", strlen(".fill"))==0 ||
+ strncmp(Str, ".space", strlen(".space"))==0 ||
+ strncmp(Str, ".zerofill", strlen(".zerofill"))==0 ||
+ strncmp(Str, ".p2align", strlen(".p2align"))==0 ||
+ strncmp(Str, ".p2alignw", strlen(".p2alignw"))==0 ||
+ strncmp(Str, ".p2alignl", strlen(".p2alignl"))==0 ||
+ strncmp(Str, ".align32", strlen(".p2align32"))==0 ||
+ strncmp(Str, ".include", strlen(".include"))==0)
+ cerr << "Directive " << Str << " in asm may lead to invalid offsets for" <<
+ " constant pools (the assembler will tell you if this happens).\n";
+ // Some generate code, but this is only interesting in the text section.
+ else if (inTextSection) {
+ if (strncmp(Str, ".long", strlen(".long"))==0)
+ Length += 4*countArguments(Str+strlen(".long"));
+ else if (strncmp(Str, ".short", strlen(".short"))==0)
+ Length += 2*countArguments(Str+strlen(".short"));
+ else if (strncmp(Str, ".byte", strlen(".byte"))==0)
+ Length += 1*countArguments(Str+strlen(".byte"));
+ else if (strncmp(Str, ".single", strlen(".single"))==0)
+ Length += 4*countArguments(Str+strlen(".single"));
+ else if (strncmp(Str, ".double", strlen(".double"))==0)
+ Length += 8*countArguments(Str+strlen(".double"));
+ else if (strncmp(Str, ".quad", strlen(".quad"))==0)
+ Length += 16*countArguments(Str+strlen(".quad"));
+ else if (strncmp(Str, ".ascii", strlen(".ascii"))==0)
+ Length += countString(Str+strlen(".ascii"));
+ else if (strncmp(Str, ".asciz", strlen(".asciz"))==0)
+ Length += countString(Str+strlen(".asciz"))+1;
+ }
+ } else if (inTextSection) {
+ // An instruction
+ atInsnStart = false;
+ if (Subtarget->isThumb()) {
+ // BL and BLX <non-reg> are 4 bytes, all others 2.
+ if (strncmp(Str, "blx", strlen("blx"))==0) {
+ const char* p = Str+3;
+ while (*p && isspace(*p))
+ p++;
+ if (*p == 'r' || *p=='R')
+ Length += 2; // BLX reg
+ else
+ Length += 4; // BLX non-reg
+ } else if (strncmp(Str, "bl", strlen("bl"))==0)
+ Length += 4; // BL
+ else
+ Length += 2; // Thumb anything else
+ }
+ else
+ Length += 4; // ARM
+ }
+ }
+ if (*Str == '\n' || *Str == BaseTAI::SeparatorChar)
+ atInsnStart = true;
+ }
+ free(s_copy);
+ return Length;
+}
+
+// Instantiate default implementation.
+TEMPLATE_INSTANTIATION(class ARMTargetAsmInfo<TargetAsmInfo>);
diff --git a/lib/Target/ARM/ARMTargetAsmInfo.h b/lib/Target/ARM/ARMTargetAsmInfo.h
new file mode 100644
index 0000000..9e6f856
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetAsmInfo.h
@@ -0,0 +1,64 @@
+//=====-- ARMTargetAsmInfo.h - ARM asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMTARGETASMINFO_H
+#define ARMTARGETASMINFO_H
+
+#include "ARMTargetMachine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+#include "llvm/Target/DarwinTargetAsmInfo.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+ extern const char *const arm_asm_table[];
+
+ template <class BaseTAI>
+ struct ARMTargetAsmInfo : public BaseTAI {
+ explicit ARMTargetAsmInfo(const ARMTargetMachine &TM):
+ BaseTAI(TM) {
+ BaseTAI::AsmTransCBE = arm_asm_table;
+
+ BaseTAI::AlignmentIsInBytes = false;
+ BaseTAI::Data64bitsDirective = 0;
+ BaseTAI::CommentString = "@";
+ BaseTAI::ConstantPoolSection = "\t.text\n";
+ BaseTAI::COMMDirectiveTakesAlignment = false;
+ BaseTAI::InlineAsmStart = "@ InlineAsm Start";
+ BaseTAI::InlineAsmEnd = "@ InlineAsm End";
+ BaseTAI::LCOMMDirective = "\t.lcomm\t";
+ }
+
+ const ARMSubtarget *Subtarget;
+
+ virtual unsigned getInlineAsmLength(const char *Str) const;
+ unsigned countArguments(const char *p) const;
+ unsigned countString(const char *p) const;
+ };
+
+ typedef ARMTargetAsmInfo<TargetAsmInfo> ARMGenericTargetAsmInfo;
+
+ EXTERN_TEMPLATE_INSTANTIATION(class ARMTargetAsmInfo<TargetAsmInfo>);
+
+ struct ARMDarwinTargetAsmInfo : public ARMTargetAsmInfo<DarwinTargetAsmInfo> {
+ explicit ARMDarwinTargetAsmInfo(const ARMTargetMachine &TM);
+ };
+
+ struct ARMELFTargetAsmInfo : public ARMTargetAsmInfo<ELFTargetAsmInfo> {
+ explicit ARMELFTargetAsmInfo(const ARMTargetMachine &TM);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
new file mode 100644
index 0000000..1dc7d19
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -0,0 +1,242 @@
+//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "ARMTargetAsmInfo.h"
+#include "ARMFrameInfo.h"
+#include "ARM.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden,
+ cl::desc("Disable load store optimization pass"));
+static cl::opt<bool> DisableIfConversion("disable-arm-if-conversion",cl::Hidden,
+ cl::desc("Disable if-conversion pass"));
+
+/// ARMTargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library. In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this. Though it is unused, do not remove it.
+extern "C" int ARMTargetMachineModule;
+int ARMTargetMachineModule = 0;
+
+// Register the target.
+static RegisterTarget<ARMTargetMachine> X("arm", "ARM");
+static RegisterTarget<ThumbTargetMachine> Y("thumb", "Thumb");
+
+// No assembler printer by default
+ARMTargetMachine::AsmPrinterCtorFn ARMTargetMachine::AsmPrinterCtor = 0;
+
+/// ThumbTargetMachine - Create a Thumb architecture model.
+///
+unsigned ThumbTargetMachine::getJITMatchQuality() {
+#if defined(__thumb__)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned ThumbTargetMachine::getModuleMatchQuality(const Module &M) {
+ std::string TT = M.getTargetTriple();
+ // Match thumb-foo-bar, as well as things like thumbv5blah-*
+ if (TT.size() >= 6 &&
+ (TT.substr(0, 6) == "thumb-" || TT.substr(0, 6) == "thumbv"))
+ return 20;
+
+ // If the target triple is something non-thumb, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer32)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+ThumbTargetMachine::ThumbTargetMachine(const Module &M, const std::string &FS)
+ : ARMTargetMachine(M, FS, true) {
+}
+
+/// TargetMachine ctor - Create an ARM architecture model.
+///
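+/// The TargetData strings below are LLVM data layout descriptors, e.g.
+/// "e-p:32:32-f64:64:64-i64:64:64": little-endian, 32-bit pointers, 64-bit
+/// alignment for f64/i64 (AAPCS). APCS keeps f64/i64 at 32-bit alignment,
+/// and the Thumb variants add explicit small-integer and aggregate entries.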
+ARMTargetMachine::ARMTargetMachine(const Module &M, const std::string &FS,
+ bool isThumb)
+ : Subtarget(M, FS, isThumb),
+ DataLayout(Subtarget.isAPCS_ABI() ?
+ // APCS ABI
+ (isThumb ?
+ std::string("e-p:32:32-f64:32:32-i64:32:32-"
+ "i16:16:32-i8:8:32-i1:8:32-a:0:32") :
+ std::string("e-p:32:32-f64:32:32-i64:32:32")) :
+ // AAPCS ABI
+ (isThumb ?
+ std::string("e-p:32:32-f64:64:64-i64:64:64-"
+ "i16:16:32-i8:8:32-i1:8:32-a:0:32") :
+ std::string("e-p:32:32-f64:64:64-i64:64:64"))),
+ InstrInfo(Subtarget),
+ FrameInfo(Subtarget),
+ JITInfo(),
+ TLInfo(*this) {
+ DefRelocModel = getRelocationModel();
+}
+
+unsigned ARMTargetMachine::getJITMatchQuality() {
+#if defined(__arm__)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned ARMTargetMachine::getModuleMatchQuality(const Module &M) {
+ std::string TT = M.getTargetTriple();
+ // Match arm-foo-bar, as well as things like armv5blah-*
+ if (TT.size() >= 4 &&
+ (TT.substr(0, 4) == "arm-" || TT.substr(0, 4) == "armv"))
+ return 20;
+ // If the target triple is something non-arm, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer32)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+
+const TargetAsmInfo *ARMTargetMachine::createTargetAsmInfo() const {
+ switch (Subtarget.TargetType) {
+ case ARMSubtarget::isDarwin:
+ return new ARMDarwinTargetAsmInfo(*this);
+ case ARMSubtarget::isELF:
+ return new ARMELFTargetAsmInfo(*this);
+ default:
+ return new ARMGenericTargetAsmInfo(*this);
+ }
+}
+
+
+// Pass Pipeline Configuration
+bool ARMTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createARMISelDag(*this));
+ return false;
+}
+
+bool ARMTargetMachine::addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // FIXME: temporarily disabling load / store optimization pass for Thumb mode.
+ if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb())
+ PM.add(createARMLoadStoreOptimizationPass());
+
+ if (OptLevel != CodeGenOpt::None &&
+ !DisableIfConversion && !Subtarget.isThumb())
+ PM.add(createIfConverterPass());
+
+ PM.add(createARMConstantIslandPass());
+ return true;
+}
+
+bool ARMTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ // Output assembly language.
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(Out, *this, OptLevel, Verbose));
+
+ return false;
+}
+
+
+bool ARMTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE) {
+ // FIXME: Move this to TargetJITInfo!
+ if (DefRelocModel == Reloc::Default)
+ setRelocationModel(Reloc::Static);
+
+ // Machine code emitter pass for ARM.
+ PM.add(createARMCodeEmitterPass(*this, MCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool ARMTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE) {
+ // FIXME: Move this to TargetJITInfo!
+ if (DefRelocModel == Reloc::Default)
+ setRelocationModel(Reloc::Static);
+
+ // Machine code emitter pass for ARM.
+ PM.add(createARMJITCodeEmitterPass(*this, JCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool ARMTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE) {
+ // Machine code emitter pass for ARM.
+ PM.add(createARMCodeEmitterPass(*this, MCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool ARMTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE) {
+ // Machine code emitter pass for ARM.
+ PM.add(createARMJITCodeEmitterPass(*this, JCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
new file mode 100644
index 0000000..916a8aa
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -0,0 +1,104 @@
+//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMTARGETMACHINE_H
+#define ARMTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMFrameInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMISelLowering.h"
+
+namespace llvm {
+
+class Module;
+
+class ARMTargetMachine : public LLVMTargetMachine {
+ ARMSubtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ ARMInstrInfo InstrInfo;
+ ARMFrameInfo FrameInfo;
+ ARMJITInfo JITInfo;
+ ARMTargetLowering TLInfo;
+ Reloc::Model DefRelocModel; // Reloc model before it's overridden.
+
+protected:
+ // To avoid having the target depend on the asmprinter libraries, the
+ // asmprinter sets this function pointer to its ctor at startup time if it
+ // is linked in.
+ typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
+ ARMTargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose);
+ static AsmPrinterCtorFn AsmPrinterCtor;
+
+public:
+ ARMTargetMachine(const Module &M, const std::string &FS, bool isThumb = false);
+
+ virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const ARMFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual ARMJITInfo *getJITInfo() { return &JITInfo; }
+ virtual const ARMRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ virtual ARMTargetLowering *getTargetLowering() const {
+ return const_cast<ARMTargetLowering*>(&TLInfo);
+ }
+
+ static void registerAsmPrinter(AsmPrinterCtorFn F) {
+ AsmPrinterCtor = F;
+ }
+
+ static unsigned getModuleMatchQuality(const Module &M);
+ static unsigned getJITMatchQuality();
+
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+ // Pass Pipeline Configuration
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &MCE);
+};
+
+/// ThumbTargetMachine - Thumb target machine.
+///
+class ThumbTargetMachine : public ARMTargetMachine {
+public:
+ ThumbTargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
new file mode 100644
index 0000000..d908cf4
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -0,0 +1,1117 @@
+//===-- ARMAsmPrinter.cpp - ARM LLVM assembly writer ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format ARM assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARM.h"
+#include "ARMBuildAttrs.h"
+#include "ARMTargetMachine.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class VISIBILITY_HIDDEN ARMAsmPrinter : public AsmPrinter {
+ DwarfWriter *DW;
+ MachineModuleInfo *MMI;
+
+ /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+ /// make the right decision when printing asm code for different targets.
+ const ARMSubtarget *Subtarget;
+
+ /// AFI - Keep a pointer to ARMFunctionInfo for the current
+ /// MachineFunction.
+ ARMFunctionInfo *AFI;
+
+ /// MCP - Keep a pointer to constantpool entries of the current
+ /// MachineFunction.
+ const MachineConstantPool *MCP;
+
+ /// We name each basic block in a Function with a unique number, so
+ /// that we can consistently refer to them later. This is cleared
+ /// at the beginning of each call to runOnMachineFunction().
+ ///
+ typedef std::map<const Value *, unsigned> ValueMapTy;
+ ValueMapTy NumberForBB;
+
+ /// GVNonLazyPtrs - Keeps the set of GlobalValues that require
+ /// non-lazy-pointers for indirect access.
+ StringSet<> GVNonLazyPtrs;
+
+ /// HiddenGVNonLazyPtrs - Keeps the set of GlobalValues with hidden
+ /// visibility that require non-lazy-pointers for indirect access.
+ StringSet<> HiddenGVNonLazyPtrs;
+
+ /// FnStubs - Keeps the set of external function GlobalAddresses that the
+ /// asm printer should generate stubs for.
+ StringSet<> FnStubs;
+
+ /// True if the asm printer is printing a series of CONSTPOOL_ENTRY
+ /// instructions.
+ bool InCPMode;
+ public:
+ explicit ARMAsmPrinter(raw_ostream &O, TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V), DW(0), MMI(NULL), AFI(NULL), MCP(NULL),
+ InCPMode(false) {
+ Subtarget = &TM.getSubtarget<ARMSubtarget>();
+ }
+
+ virtual const char *getPassName() const {
+ return "ARM Assembly Printer";
+ }
+
+ void printOperand(const MachineInstr *MI, int opNum,
+ const char *Modifier = 0);
+ void printSOImmOperand(const MachineInstr *MI, int opNum);
+ void printSOImm2PartOperand(const MachineInstr *MI, int opNum);
+ void printSORegOperand(const MachineInstr *MI, int opNum);
+ void printAddrMode2Operand(const MachineInstr *MI, int OpNo);
+ void printAddrMode2OffsetOperand(const MachineInstr *MI, int OpNo);
+ void printAddrMode3Operand(const MachineInstr *MI, int OpNo);
+ void printAddrMode3OffsetOperand(const MachineInstr *MI, int OpNo);
+ void printAddrMode4Operand(const MachineInstr *MI, int OpNo,
+ const char *Modifier = 0);
+ void printAddrMode5Operand(const MachineInstr *MI, int OpNo,
+ const char *Modifier = 0);
+ void printAddrModePCOperand(const MachineInstr *MI, int OpNo,
+ const char *Modifier = 0);
+ void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNo);
+ void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNo,
+ unsigned Scale);
+ void printThumbAddrModeS1Operand(const MachineInstr *MI, int OpNo);
+ void printThumbAddrModeS2Operand(const MachineInstr *MI, int OpNo);
+ void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNo);
+ void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNo);
+ void printPredicateOperand(const MachineInstr *MI, int opNum);
+ void printSBitModifierOperand(const MachineInstr *MI, int opNum);
+ void printPCLabel(const MachineInstr *MI, int opNum);
+ void printRegisterList(const MachineInstr *MI, int opNum);
+ void printCPInstOperand(const MachineInstr *MI, int opNum,
+ const char *Modifier);
+ void printJTBlockOperand(const MachineInstr *MI, int opNum);
+
+ virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode);
+
+ void printModuleLevelGV(const GlobalVariable* GVar);
+ bool printInstruction(const MachineInstr *MI); // autogenerated.
+ void printMachineInstruction(const MachineInstr *MI);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ /// EmitMachineConstantPoolValue - Print a machine constantpool value to
+ /// the .s file.
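+ /// For example (illustrative), a non-lazy-pointer entry on Darwin prints
+ /// as "_foo$non_lazy_ptr-(LPC7+8)".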
+ virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+ printDataDirective(MCPV->getType());
+
+ ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
+ GlobalValue *GV = ACPV->getGV();
+ std::string Name = GV ? Mang->getValueName(GV) : TAI->getGlobalPrefix();
+ if (!GV)
+ Name += ACPV->getSymbol();
+ if (ACPV->isNonLazyPointer()) {
+ if (GV->hasHiddenVisibility())
+ HiddenGVNonLazyPtrs.insert(Name);
+ else
+ GVNonLazyPtrs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ } else if (ACPV->isStub()) {
+ FnStubs.insert(Name);
+ printSuffixedName(Name, "$stub");
+ } else
+ O << Name;
+ if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")";
+ if (ACPV->getPCAdjustment() != 0) {
+ O << "-(" << TAI->getPrivateGlobalPrefix() << "PC"
+ << utostr(ACPV->getLabelId())
+ << "+" << (unsigned)ACPV->getPCAdjustment();
+ if (ACPV->mustAddCurrentAddress())
+ O << "-.";
+ O << ")";
+ }
+ O << "\n";
+
+ // If the constant pool value is an extern weak symbol, remember to emit
+ // the weak reference.
+ if (GV && GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AsmPrinter::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<DwarfWriter>();
+ }
+ };
+} // end of anonymous namespace
+
+#include "ARMGenAsmWriter.inc"
+
+/// runOnMachineFunction - This uses the printInstruction()
+/// method to print assembly for each instruction.
+///
+bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+
+ AFI = MF.getInfo<ARMFunctionInfo>();
+ MCP = MF.getConstantPool();
+
+ SetupMachineFunction(MF);
+ O << "\n";
+
+ // NOTE: we don't print out constant pools here, they are handled as
+ // instructions.
+
+ O << "\n";
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::PrivateLinkage:
+ case Function::InternalLinkage:
+ SwitchToTextSection("\t.text", F);
+ break;
+ case Function::ExternalLinkage:
+ SwitchToTextSection("\t.text", F);
+ O << "\t.globl\t" << CurrentFnName << "\n";
+ break;
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ if (Subtarget->isTargetDarwin()) {
+ SwitchToTextSection(
+ ".section __TEXT,__textcoal_nt,coalesced,pure_instructions", F);
+ O << "\t.globl\t" << CurrentFnName << "\n";
+ O << "\t.weak_definition\t" << CurrentFnName << "\n";
+ } else {
+ O << TAI->getWeakRefDirective() << CurrentFnName << "\n";
+ }
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ if (AFI->isThumbFunction()) {
+ EmitAlignment(1, F, AFI->getAlign());
+ O << "\t.code\t16\n";
+ O << "\t.thumb_func";
+ if (Subtarget->isTargetDarwin())
+ O << "\t" << CurrentFnName;
+ O << "\n";
+ InCPMode = false;
+ } else
+ EmitAlignment(2, F);
+
+ O << CurrentFnName << ":\n";
+ // Emit pre-function debug information.
+ DW->BeginFunction(&MF);
+
+ if (Subtarget->isTargetDarwin()) {
+ // If the function is empty, then we need to emit *something*. Otherwise,
+ // the function's label might be associated with something that it wasn't
+ // meant to be associated with. We emit a noop in this situation.
+ MachineFunction::iterator I = MF.begin();
+
+ if (++I == MF.end() && MF.front().empty())
+ O << "\tnop\n";
+ }
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ printBasicBlockLabel(I, true, true, VerboseAsm);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n";
+
+ // Emit post-function debug information.
+ DW->EndFunction(&MF);
+
+ O.flush();
+
+ return false;
+}
+
+void ARMAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+ const char *Modifier) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ else
+ assert(0 && "not implemented");
+ break;
+ case MachineOperand::MO_Immediate: {
+ if (!Modifier || strcmp(Modifier, "no_hash") != 0)
+ O << "#";
+
+ O << MO.getImm();
+ break;
+ }
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+ case MachineOperand::MO_GlobalAddress: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ GlobalValue *GV = MO.getGlobal();
+ std::string Name = Mang->getValueName(GV);
+ bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
+ GV->hasLinkOnceLinkage());
+ if (isExt && isCallOp && Subtarget->isTargetDarwin() &&
+ TM.getRelocationModel() != Reloc::Static) {
+ printSuffixedName(Name, "$stub");
+ FnStubs.insert(Name);
+ } else
+ O << Name;
+
+ printOffset(MO.getOffset());
+
+ if (isCallOp && Subtarget->isTargetELF() &&
+ TM.getRelocationModel() == Reloc::PIC_)
+ O << "(PLT)";
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ std::string Name(TAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+ if (isCallOp && Subtarget->isTargetDarwin() &&
+ TM.getRelocationModel() != Reloc::Static) {
+ printSuffixedName(Name, "$stub");
+ FnStubs.insert(Name);
+ } else
+ O << Name;
+ if (isCallOp && Subtarget->isTargetELF() &&
+ TM.getRelocationModel() == Reloc::PIC_)
+ O << "(PLT)";
+ break;
+ }
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ break;
+ default:
+ O << "<unknown operand type>"; abort (); break;
+ }
+}
+
+static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
+ const TargetAsmInfo *TAI) {
+ assert(V < (1 << 12) && "Not a valid so_imm value!");
+ unsigned Imm = ARM_AM::getSOImmValImm(V);
+ unsigned Rot = ARM_AM::getSOImmValRot(V);
+
+ // Print low-level immediate formation info, per
+ // A5.1.3: "Data-processing operands - Immediate".
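+ // For example (illustrative), a materialized value of 0x01000000 has
+ // Imm=1 and Rot=8, and prints as "#1, 8" ("@ 16777216" appended in
+ // verbose mode).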
+ if (Rot) {
+ O << "#" << Imm << ", " << Rot;
+ // Pretty printed version.
+ if (VerboseAsm)
+ O << ' ' << TAI->getCommentString()
+ << ' ' << (int)ARM_AM::rotr32(Imm, Rot);
+ } else {
+ O << "#" << Imm;
+ }
+}
+
+/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit
+/// immediate in bits 0-7.
+void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ assert(MO.isImm() && "Not a valid so_imm value!");
+ printSOImm(O, MO.getImm(), VerboseAsm, TAI);
+}
+
+/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
+/// followed by an 'orr' to materialize.
+void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ assert(MO.isImm() && "Not a valid so_imm value!");
+ unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImm());
+ unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImm());
+ printSOImm(O, ARM_AM::getSOImmVal(V1), VerboseAsm, TAI);
+ O << "\n\torr";
+ printPredicateOperand(MI, 2);
+ O << " ";
+ printOperand(MI, 0);
+ O << ", ";
+ printOperand(MI, 0);
+ O << ", ";
+ printSOImm(O, ARM_AM::getSOImmVal(V2), VerboseAsm, TAI);
+}
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms. This includes:
+// REG 0 0 - e.g. R5
+// REG REG 0,SH_OPC - e.g. R5, ROR R3
+// REG 0 IMM,SH_OPC - e.g. R5, LSL #3
+void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) {
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+ const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+ assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+ O << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+
+ // Print the shift opc.
+ O << ", "
+ << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()))
+ << " ";
+
+ if (MO2.getReg()) {
+ assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
+ O << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
+ assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+ } else {
+ O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
+ }
+}
+
+void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) {
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+ const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+ if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+ printOperand(MI, Op);
+ return;
+ }
+
+ O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+
+ if (!MO2.getReg()) {
+ if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0.
+ O << ", #"
+ << (char)ARM_AM::getAM2Op(MO3.getImm())
+ << ARM_AM::getAM2Offset(MO3.getImm());
+ O << "]";
+ return;
+ }
+
+ O << ", "
+ << (char)ARM_AM::getAM2Op(MO3.getImm())
+ << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
+
+ if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
+ O << ", "
+ << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
+ << " #" << ShImm;
+ O << "]";
+}
+
+void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op){
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+ if (!MO1.getReg()) {
+ unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+ assert(ImmOffs && "Malformed indexed load / store!");
+ O << "#"
+ << (char)ARM_AM::getAM2Op(MO2.getImm())
+ << ImmOffs;
+ return;
+ }
+
+ O << (char)ARM_AM::getAM2Op(MO2.getImm())
+ << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+
+ if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
+ O << ", "
+ << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
+ << " #" << ShImm;
+}
+
+void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op) {
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+ const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+ assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+ O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+
+ if (MO2.getReg()) {
+ O << ", "
+ << (char)ARM_AM::getAM3Op(MO3.getImm())
+ << TM.getRegisterInfo()->get(MO2.getReg()).AsmName
+ << "]";
+ return;
+ }
+
+ if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
+ O << ", #"
+ << (char)ARM_AM::getAM3Op(MO3.getImm())
+ << ImmOffs;
+ O << "]";
+}
+
+void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+ if (MO1.getReg()) {
+ O << (char)ARM_AM::getAM3Op(MO2.getImm())
+ << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ return;
+ }
+
+ unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+ assert(ImmOffs && "Malformed indexed load / store!");
+ O << "#"
+ << (char)ARM_AM::getAM3Op(MO2.getImm())
+ << ImmOffs;
+}
+
+void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op,
+ const char *Modifier) {
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+ ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+ if (Modifier && strcmp(Modifier, "submode") == 0) {
+ if (MO1.getReg() == ARM::SP) {
+ bool isLDM = (MI->getOpcode() == ARM::LDM ||
+ MI->getOpcode() == ARM::LDM_RET);
+ O << ARM_AM::getAMSubModeAltStr(Mode, isLDM);
+ } else
+ O << ARM_AM::getAMSubModeStr(Mode);
+ } else {
+ printOperand(MI, Op);
+ if (ARM_AM::getAM4WBFlag(MO2.getImm()))
+ O << "!";
+ }
+}
+
+void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op,
+ const char *Modifier) {
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+ if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+ printOperand(MI, Op);
+ return;
+ }
+
+ assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+
+ if (Modifier && strcmp(Modifier, "submode") == 0) {
+ ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm());
+ if (MO1.getReg() == ARM::SP) {
+ bool isFLDM = (MI->getOpcode() == ARM::FLDMD ||
+ MI->getOpcode() == ARM::FLDMS);
+ O << ARM_AM::getAMSubModeAltStr(Mode, isFLDM);
+ } else
+ O << ARM_AM::getAMSubModeStr(Mode);
+ return;
+ } else if (Modifier && strcmp(Modifier, "base") == 0) {
+ // Used for FSTM{D|S} and FLDM{D|S} operations.
+ O << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ if (ARM_AM::getAM5WBFlag(MO2.getImm()))
+ O << "!";
+ return;
+ }
+
+ O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+
+ if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
+ O << ", #"
+ << (char)ARM_AM::getAM5Op(MO2.getImm())
+ << ImmOffs*4;
+ }
+ O << "]";
+}
+
+void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op,
+ const char *Modifier) {
+ if (Modifier && strcmp(Modifier, "label") == 0) {
+ printPCLabel(MI, Op+1);
+ return;
+ }
+
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+ O << "[pc, +" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName << "]";
+}
+
+void
+ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op) {
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+ O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).AsmName << "]";
+}
+
+void
+ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op,
+ unsigned Scale) {
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+ const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+ if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+ printOperand(MI, Op);
+ return;
+ }
+
+ O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ if (MO3.getReg())
+ O << ", " << TM.getRegisterInfo()->get(MO3.getReg()).AsmName;
+ else if (unsigned ImmOffs = MO2.getImm()) {
+ O << ", #" << ImmOffs;
+ if (Scale > 1)
+ O << " * " << Scale;
+ }
+ O << "]";
+}
+
+void
+ARMAsmPrinter::printThumbAddrModeS1Operand(const MachineInstr *MI, int Op) {
+ printThumbAddrModeRI5Operand(MI, Op, 1);
+}
+void
+ARMAsmPrinter::printThumbAddrModeS2Operand(const MachineInstr *MI, int Op) {
+ printThumbAddrModeRI5Operand(MI, Op, 2);
+}
+void
+ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op) {
+ printThumbAddrModeRI5Operand(MI, Op, 4);
+}
+
+void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) {
+ const MachineOperand &MO1 = MI->getOperand(Op);
+ const MachineOperand &MO2 = MI->getOperand(Op+1);
+ O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ if (unsigned ImmOffs = MO2.getImm())
+ O << ", #" << ImmOffs << " * 4";
+ O << "]";
+}
+
+void ARMAsmPrinter::printPredicateOperand(const MachineInstr *MI, int opNum) {
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(opNum).getImm();
+ if (CC != ARMCC::AL)
+ O << ARMCondCodeToString(CC);
+}
+
+void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int opNum){
+ unsigned Reg = MI->getOperand(opNum).getReg();
+ if (Reg) {
+ assert(Reg == ARM::CPSR && "Expect ARM CPSR register!");
+ O << 's';
+ }
+}
+
+void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int opNum) {
+ int Id = (int)MI->getOperand(opNum).getImm();
+ O << TAI->getPrivateGlobalPrefix() << "PC" << Id;
+}
+
+void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int opNum) {
+ O << "{";
+ for (unsigned i = opNum, e = MI->getNumOperands(); i != e; ++i) {
+ printOperand(MI, i);
+ if (i != e-1) O << ", ";
+ }
+ O << "}";
+}
+
+void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNo,
+ const char *Modifier) {
+ assert(Modifier && "This operand only works with a modifier!");
+ // There are two aspects to a CONSTANTPOOL_ENTRY operand, the label and the
+ // data itself.
+ if (!strcmp(Modifier, "label")) {
+ unsigned ID = MI->getOperand(OpNo).getImm();
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ << '_' << ID << ":\n";
+ } else {
+ assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE");
+ unsigned CPI = MI->getOperand(OpNo).getIndex();
+
+ const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
+
+ if (MCPE.isMachineConstantPoolEntry()) {
+ EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+ } else {
+ EmitGlobalConstant(MCPE.Val.ConstVal);
+ // remember to emit the weak reference
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(MCPE.Val.ConstVal))
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ }
+ }
+}
+
+void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNo) {
+ const MachineOperand &MO1 = MI->getOperand(OpNo);
+ const MachineOperand &MO2 = MI->getOperand(OpNo+1); // Unique Id
+ unsigned JTI = MO1.getIndex();
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << JTI << '_' << MO2.getImm() << ":\n";
+
+ const char *JTEntryDirective = TAI->getJumpTableDirective();
+ if (!JTEntryDirective)
+ JTEntryDirective = TAI->getData32bitsDirective();
+
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
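+ // When the assembler provides a .set directive and we are generating PIC,
+ // emit each entry as a label-difference .set so the table holds
+ // position-independent offsets rather than absolute block addresses.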
+ bool UseSet = TAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_;
+ std::set<MachineBasicBlock*> JTSets;
+ for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = JTBBs[i];
+ if (UseSet && JTSets.insert(MBB).second)
+ printPICJumpTableSetLabel(JTI, MO2.getImm(), MBB);
+
+ O << JTEntryDirective << ' ';
+ if (UseSet)
+ O << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << '_' << JTI << '_' << MO2.getImm()
+ << "_set_" << MBB->getNumber();
+ else if (TM.getRelocationModel() == Reloc::PIC_) {
+ printBasicBlockLabel(MBB, false, false, false);
+ // If the arch uses custom Jump Table directives, don't calc relative to JT
+ if (!TAI->getJumpTableDirective())
+ O << '-' << TAI->getPrivateGlobalPrefix() << "JTI"
+ << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm();
+ } else
+ printBasicBlockLabel(MBB, false, false, false);
+ if (i != e-1)
+ O << '\n';
+ }
+}
+
+
+bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode){
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'a': // Don't print "#" before a global var name or constant.
+ case 'c': // Don't print "$" before a global var name or constant.
+ printOperand(MI, OpNo, "no_hash");
+ return false;
+ case 'P': // Print a VFP double precision register.
+ printOperand(MI, OpNo);
+ return false;
+ case 'Q':
+ if (TM.getTargetData()->isLittleEndian())
+ break;
+ // Fallthrough
+ case 'R':
+ if (TM.getTargetData()->isBigEndian())
+ break;
+ // Fallthrough
+ case 'H': // Write second word of DI / DF reference.
+ // Verify that this operand has two consecutive registers.
+ if (!MI->getOperand(OpNo).isReg() ||
+ OpNo+1 == MI->getNumOperands() ||
+ !MI->getOperand(OpNo+1).isReg())
+ return true;
+ ++OpNo; // Return the high-part.
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+ printAddrMode2Operand(MI, OpNo);
+ return false;
+}
+
+void ARMAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
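+ // Thumb functions keep constant pool entries in-line as data islands;
+ // InCPMode tracks transitions into an island so the 4-byte alignment is
+ // emitted only once, at its first CONSTPOOL_ENTRY.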
+ int Opc = MI->getOpcode();
+ switch (Opc) {
+ case ARM::CONSTPOOL_ENTRY:
+ if (!InCPMode && AFI->isThumbFunction()) {
+ EmitAlignment(2);
+ InCPMode = true;
+ }
+ break;
+ default: {
+ if (InCPMode && AFI->isThumbFunction())
+ InCPMode = false;
+ }}
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+}
+
+bool ARMAsmPrinter::doInitialization(Module &M) {
+
+ bool Result = AsmPrinter::doInitialization(M);
+
+ // Emit initial debug information.
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ assert(MMI);
+ DW = getAnalysisIfAvailable<DwarfWriter>();
+ assert(DW && "Dwarf Writer is not available");
+ DW->BeginModule(&M, MMI, O, this, TAI);
+
+ // Darwin wants symbols to be quoted if they have complex names.
+ if (Subtarget->isTargetDarwin())
+ Mang->setUseQuotes(true);
+
+ // Emit ARM Build Attributes
+ if (Subtarget->isTargetELF()) {
+ // CPU Type
+ std::string CPUString = Subtarget->getCPUString();
+ if (CPUString != "generic")
+ O << "\t.cpu " << CPUString << '\n';
+
+ // FIXME: Emit FPU type
+ if (Subtarget->hasVFP2())
+ O << "\t.eabi_attribute " << ARMBuildAttrs::VFP_arch << ", 2\n";
+
+ // Signal various FP modes.
+ if (!UnsafeFPMath)
+ O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_FP_denormal << ", 1\n"
+ << "\t.eabi_attribute " << ARMBuildAttrs::ABI_FP_exceptions << ", 1\n";
+
+ if (FiniteOnlyFPMath())
+ O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_FP_number_model << ", 1\n";
+ else
+ O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_FP_number_model << ", 3\n";
+
+ // 8-byte alignment stuff.
+ O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_needed << ", 1\n"
+ << "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_preserved << ", 1\n";
+
+ // FIXME: Should we signal R9 usage?
+ }
+
+ return Result;
+}
+
+/// PrintUnmangledNameSafely - Print out the printable characters in the name.
+/// Don't print things like \\n or \\0.
+static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
+ for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
+ Name != E; ++Name)
+ if (isprint(*Name))
+ OS << *Name;
+}
+
+void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())   // External globals require no code.
+ return;
+
+ // Check to see if this is a special global used by LLVM; if so, emit it.
+
+ if (EmitSpecialLLVMGlobal(GVar)) {
+ if (Subtarget->isTargetDarwin() &&
+ TM.getRelocationModel() == Reloc::Static) {
+ if (GVar->getName() == "llvm.global_ctors")
+ O << ".reference .constructors_used\n";
+ else if (GVar->getName() == "llvm.global_dtors")
+ O << ".reference .destructors_used\n";
+ }
+ return;
+ }
+
+ std::string name = Mang->getValueName(GVar);
+ Constant *C = GVar->getInitializer();
+ const Type *Type = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Type);
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+ bool isDarwin = Subtarget->isTargetDarwin();
+
+ printVisibility(name, GVar->getVisibility());
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type " << name << ",%object\n";
+
+ if (C->isNullValue() && !GVar->hasSection() && !GVar->isThreadLocal() &&
+ !(isDarwin &&
+ TAI->SectionKindForGlobal(GVar) == SectionKind::RODataMergeStr)) {
+ // FIXME: This seems to be pretty darwin-specific
+
+ if (GVar->hasExternalLinkage()) {
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+ if (const char *Directive = TAI->getZeroFillDirective()) {
+ O << "\t.globl\t" << name << "\n";
+ O << Directive << "__DATA, __common, " << name << ", "
+ << Size << ", " << Align << "\n";
+ return;
+ }
+ }
+
+ if (GVar->hasLocalLinkage() || GVar->isWeakForLinker()) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (isDarwin) {
+ if (GVar->hasLocalLinkage()) {
+ O << TAI->getLCOMMDirective() << name << "," << Size
+ << ',' << Align;
+ } else if (GVar->hasCommonLinkage()) {
+ O << TAI->getCOMMDirective() << name << "," << Size
+ << ',' << Align;
+ } else {
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+ O << "\t.globl " << name << '\n'
+ << TAI->getWeakDefDirective() << name << '\n';
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm) {
+ O << "\t\t\t\t" << TAI->getCommentString() << ' ';
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << '\n';
+ EmitGlobalConstant(C);
+ return;
+ }
+ } else if (TAI->getLCOMMDirective() != NULL) {
+ if (GVar->hasLocalLinkage()) {
+ O << TAI->getLCOMMDirective() << name << "," << Size;
+ } else {
+ O << TAI->getCOMMDirective() << name << "," << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ } else {
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+ if (GVar->hasLocalLinkage())
+ O << "\t.local\t" << name << "\n";
+ O << TAI->getCOMMDirective() << name << "," << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ if (VerboseAsm) {
+ O << "\t\t" << TAI->getCommentString() << " ";
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << "\n";
+ return;
+ }
+ }
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+ switch (GVar->getLinkage()) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ if (isDarwin) {
+ O << "\t.globl " << name << "\n"
+ << "\t.weak_definition " << name << "\n";
+ } else {
+ O << "\t.weak " << name << "\n";
+ }
+ break;
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their name or something. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ O << "\t.globl " << name << "\n";
+ // FALL THROUGH
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ break;
+ default:
+ assert(0 && "Unknown linkage type!");
+ break;
+ }
+
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm) {
+ O << "\t\t\t\t" << TAI->getCommentString() << " ";
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << "\n";
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size " << name << ", " << Size << "\n";
+
+ // If the initializer is an extern weak symbol, remember to emit the weak
+ // reference!
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ EmitGlobalConstant(C);
+ O << '\n';
+}
+
+
+bool ARMAsmPrinter::doFinalization(Module &M) {
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ printModuleLevelGV(I);
+
+ if (Subtarget->isTargetDarwin()) {
+ SwitchToDataSection("");
+
+ // Output stubs for dynamically-linked functions
+ for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i) {
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ SwitchToTextSection(".section __TEXT,__picsymbolstub4,symbol_stubs,"
+ "none,16", 0);
+ else
+ SwitchToTextSection(".section __TEXT,__symbol_stub4,symbol_stubs,"
+ "none,12", 0);
+
+ EmitAlignment(2);
+ O << "\t.code\t32\n";
+
+ const char *p = i->getKeyData();
+ printSuffixedName(p, "$stub");
+ O << ":\n";
+ O << "\t.indirect_symbol " << p << "\n";
+ O << "\tldr ip, ";
+ printSuffixedName(p, "$slp");
+ O << "\n";
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ printSuffixedName(p, "$scv");
+ O << ":\n";
+ O << "\tadd ip, pc, ip\n";
+ }
+ O << "\tldr pc, [ip, #0]\n";
+ printSuffixedName(p, "$slp");
+ O << ":\n";
+ O << "\t.long\t";
+ printSuffixedName(p, "$lazy_ptr");
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ O << "-(";
+ printSuffixedName(p, "$scv");
+ O << "+8)\n";
+ } else
+ O << "\n";
+ SwitchToDataSection(".lazy_symbol_pointer", 0);
+ printSuffixedName(p, "$lazy_ptr");
+ O << ":\n";
+ O << "\t.indirect_symbol " << p << "\n";
+ O << "\t.long\tdyld_stub_binding_helper\n";
+ }
+ O << "\n";
+
+ // Output non-lazy-pointers for external and common global variables.
+ if (!GVNonLazyPtrs.empty()) {
+ SwitchToDataSection("\t.non_lazy_symbol_pointer", 0);
+ for (StringSet<>::iterator i = GVNonLazyPtrs.begin(),
+ e = GVNonLazyPtrs.end(); i != e; ++i) {
+ const char *p = i->getKeyData();
+ printSuffixedName(p, "$non_lazy_ptr");
+ O << ":\n";
+ O << "\t.indirect_symbol " << p << "\n";
+ O << "\t.long\t0\n";
+ }
+ }
+
+ if (!HiddenGVNonLazyPtrs.empty()) {
+ SwitchToSection(TAI->getDataSection());
+ for (StringSet<>::iterator i = HiddenGVNonLazyPtrs.begin(),
+ e = HiddenGVNonLazyPtrs.end(); i != e; ++i) {
+ const char *p = i->getKeyData();
+ EmitAlignment(2);
+ printSuffixedName(p, "$non_lazy_ptr");
+ O << ":\n";
+ O << "\t.long " << p << "\n";
+ }
+ }
+
+
+ // Emit final debug information.
+ DW->EndModule();
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ O << "\t.subsections_via_symbols\n";
+ } else {
+ // Emit final debug information for ELF.
+ DW->EndModule();
+ }
+
+ return AsmPrinter::doFinalization(M);
+}
+
+/// createARMCodePrinterPass - Returns a pass that prints the ARM
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createARMCodePrinterPass(raw_ostream &o,
+ ARMTargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new ARMAsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
+
+namespace {
+ static struct Register {
+ Register() {
+ ARMTargetMachine::registerAsmPrinter(createARMCodePrinterPass);
+ }
+ } Registrator;
+}
diff --git a/lib/Target/ARM/AsmPrinter/CMakeLists.txt b/lib/Target/ARM/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..524a748
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_partially_linked_object(LLVMARMAsmPrinter
+ ARMAsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMARMCodeGen n)
+
+add_dependencies(LLVMARMAsmPrinter ${n})
diff --git a/lib/Target/ARM/AsmPrinter/Makefile b/lib/Target/ARM/AsmPrinter/Makefile
new file mode 100644
index 0000000..ce36cec
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMAsmPrinter
+
+# Hack: we need to include the 'main' ARM target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
new file mode 100644
index 0000000..2ac40f5
--- /dev/null
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -0,0 +1,27 @@
+set(LLVM_TARGET_DEFINITIONS ARM.td)
+
+tablegen(ARMGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(ARMGenRegisterNames.inc -gen-register-enums)
+tablegen(ARMGenRegisterInfo.inc -gen-register-desc)
+tablegen(ARMGenInstrNames.inc -gen-instr-enums)
+tablegen(ARMGenInstrInfo.inc -gen-instr-desc)
+tablegen(ARMGenCodeEmitter.inc -gen-emitter)
+tablegen(ARMGenAsmWriter.inc -gen-asm-writer)
+tablegen(ARMGenDAGISel.inc -gen-dag-isel)
+tablegen(ARMGenCallingConv.inc -gen-callingconv)
+tablegen(ARMGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(ARMCodeGen
+ ARMCodeEmitter.cpp
+ ARMConstantIslandPass.cpp
+ ARMConstantPoolValue.cpp
+ ARMInstrInfo.cpp
+ ARMISelDAGToDAG.cpp
+ ARMISelLowering.cpp
+ ARMJITInfo.cpp
+ ARMLoadStoreOptimizer.cpp
+ ARMRegisterInfo.cpp
+ ARMSubtarget.cpp
+ ARMTargetAsmInfo.cpp
+ ARMTargetMachine.cpp
+ )
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
new file mode 100644
index 0000000..9a3b9be
--- /dev/null
+++ b/lib/Target/ARM/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMARMCodeGen
+TARGET = ARM
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \
+ ARMGenRegisterInfo.inc ARMGenInstrNames.inc \
+ ARMGenInstrInfo.inc ARMGenAsmWriter.inc \
+ ARMGenDAGISel.inc ARMGenSubtarget.inc \
+ ARMGenCodeEmitter.inc ARMGenCallingConv.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt
new file mode 100644
index 0000000..4d3200b
--- /dev/null
+++ b/lib/Target/ARM/README-Thumb.txt
@@ -0,0 +1,228 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend (Thumb specific).
+//===---------------------------------------------------------------------===//
+
+* Add support for compiling functions in both ARM and Thumb mode, then taking
+ the smallest.
+
+* Add support for compiling individual basic blocks in thumb mode, when in a
+ larger ARM function. This can be used for presumed cold code, like paths
+ to abort (failure path of asserts), EH handling code, etc.
+
+* Thumb doesn't have normal pre/post increment addressing modes, but you can
+ load/store 32-bit integers with pre/postinc by using load/store multiple
+ instrs with a single register; see the sketch after this list.
+
+* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
+ and cmp instructions can use high registers. Also, we can use them as
+ temporaries to spill values into.
+
+* In thumb mode, short, byte, and bool preferred alignments are currently set
+ to 4 to accommodate an ISA restriction (i.e. for add sp, #imm, the immediate
+ must be a multiple of 4).
+
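+A single-register load / store multiple behaves like the post-increment form
+mentioned in the list above (untested sketch):
+
+ ldmia r0!, {r1} @ r1 = *r0; r0 += 4
+ stmia r2!, {r1} @ *r2 = r1; r2 += 4
+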
+//===---------------------------------------------------------------------===//
+
+Potential jumptable improvements:
+
+* If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
+ jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
+ function is even smaller. This also applies to ARM.
+
+* Thumb jumptable codegen could be improved with some help from the assembler.
+ This is what we generate right now:
+
+ .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
+LPCRELL0:
+ mov r1, #PCRELV0
+ add r1, pc
+ ldr r0, [r0, r1]
+ cpy pc, r0
+ .align 2
+LJTI1_0_0:
+ .long LBB1_3
+ ...
+
+Note there is another pc relative add that we can take advantage of.
+ add r1, pc, #imm_8 * 4
+
+We should be able to generate:
+
+LPCRELL0:
+ add r1, LJTI1_0_0
+ ldr r0, [r0, r1]
+ cpy pc, r0
+ .align 2
+LJTI1_0_0:
+ .long LBB1_3
+
+if the assembler can translate the add to:
+ add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
+
+Note the assembler also does something similar for constpool loads:
+LPCRELL0:
+ ldr r0, LCPI1_0
+=>
+ ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
+
+
+//===---------------------------------------------------------------------===//
+
+We compile the following:
+
+define i16 @func_entry_2E_ce(i32 %i) {
+ switch i32 %i, label %bb12.exitStub [
+ i32 0, label %bb4.exitStub
+ i32 1, label %bb9.exitStub
+ i32 2, label %bb4.exitStub
+ i32 3, label %bb4.exitStub
+ i32 7, label %bb9.exitStub
+ i32 8, label %bb.exitStub
+ i32 9, label %bb9.exitStub
+ ]
+
+bb12.exitStub:
+ ret i16 0
+
+bb4.exitStub:
+ ret i16 1
+
+bb9.exitStub:
+ ret i16 2
+
+bb.exitStub:
+ ret i16 3
+}
+
+into:
+
+_func_entry_2E_ce:
+ mov r2, #1
+ lsl r2, r0
+ cmp r0, #9
+ bhi LBB1_4 @bb12.exitStub
+LBB1_1: @newFuncRoot
+ mov r1, #13
+ tst r2, r1
+ bne LBB1_5 @bb4.exitStub
+LBB1_2: @newFuncRoot
+ ldr r1, LCPI1_0
+ tst r2, r1
+ bne LBB1_6 @bb9.exitStub
+LBB1_3: @newFuncRoot
+ mov r1, #1
+ lsl r1, r1, #8
+ tst r2, r1
+ bne LBB1_7 @bb.exitStub
+LBB1_4: @bb12.exitStub
+ mov r0, #0
+ bx lr
+LBB1_5: @bb4.exitStub
+ mov r0, #1
+ bx lr
+LBB1_6: @bb9.exitStub
+ mov r0, #2
+ bx lr
+LBB1_7: @bb.exitStub
+ mov r0, #3
+ bx lr
+LBB1_8:
+ .align 2
+LCPI1_0:
+ .long 642
+
+
+gcc compiles to:
+
+ cmp r0, #9
+ @ lr needed for prologue
+ bhi L2
+ ldr r3, L11
+ mov r2, #1
+ mov r1, r2, asl r0
+ ands r0, r3, r2, asl r0
+ movne r0, #2
+ bxne lr
+ tst r1, #13
+ beq L9
+L3:
+ mov r0, r2
+ bx lr
+L9:
+ tst r1, #256
+ movne r0, #3
+ bxne lr
+L2:
+ mov r0, #0
+ bx lr
+L12:
+ .align 2
+L11:
+ .long 642
+
+
+GCC is doing a few clever things here:
+ 1. It is predicating one of the returns. This isn't a clear win though: in
+ cases where that return isn't taken, it is replacing one condbranch with
+ two 'ne' predicated instructions.
+ 2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
+ tst. This will probably require whole function isel.
+ 3. GCC emits:
+ tst r1, #256
+ we emit:
+ mov r1, #1
+ lsl r1, r1, #8
+ tst r2, r1
+
+
+//===---------------------------------------------------------------------===//
+
+When spilling in thumb mode and the sp offset is too large to fit in the ldr /
+str offset field, we load the offset from a constpool entry and add it to sp:
+
+ldr r2, LCPI
+add r2, sp
+ldr r2, [r2]
+
+These instructions preserve the condition codes, which is important if the
+spill is between a cmp and a bcc instruction. However, we can use the
+(potentially) cheaper sequence if we know it's ok to clobber the condition
+register.
+
+add r2, sp, #255 * 4
+add r2, #132
+ldr r2, [r2, #7 * 4]
+
+This is especially bad when dynamic alloca is used: all fixed size stack
+objects are then referenced off the frame pointer with negative offsets. See
+oggenc for an example.
+
+//===---------------------------------------------------------------------===//
+
+We are reserving R3 as a scratch register under thumb mode. So if it is live
+into the function, we save / restore R3 to / from R12. Until register
+scavenging is done, we should save R3 to a high callee-saved reg at
+emitPrologue time (when hasFP is true or the stack size is large) and restore
+R3 from that register instead. This allows us to at least get rid of the save
+to r12 every time it is used.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen on test/CodeGen/ARM/select.ll, function f7:
+
+ ldr r5, LCPI1_0
+LPC0:
+ add r5, pc
+ ldr r6, LCPI1_1
+ ldr r2, LCPI1_2
+ cpy r3, r6
+ cpy lr, pc
+ bx r5
+
+//===---------------------------------------------------------------------===//
+
+Make the register allocator / spiller smarter so we can re-materialize
+"mov r, imm", etc. Almost all Thumb instructions clobber the condition codes.
+
+//===---------------------------------------------------------------------===//
+
+Add ldmia, stmia support.
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
new file mode 100644
index 0000000..068c441e
--- /dev/null
+++ b/lib/Target/ARM/README.txt
@@ -0,0 +1,554 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend.
+//===---------------------------------------------------------------------===//
+
+Reimplement 'select' in terms of 'SEL'.
+
+* We would really like to support UXTAB16, but we need to prove that the
+ add doesn't need to overflow between the two 16-bit chunks.
+
+* Implement pre/post increment support. (e.g. PR935)
+* Coalesce stack slots!
+* Implement smarter constant generation for binops with large immediates.
+
+* Consider materializing FP constants like 0.0f and 1.0f using integer
+ immediate instructions, then copying to the FPU. Would that be slower than a
+ load into the FPU?
+
+//===---------------------------------------------------------------------===//
+
+Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the
+time regalloc happens, these values are now in a 32-bit register, usually with
+the top-bits known to be sign or zero extended. If spilled, we should be able
+to spill these to an 8-bit or 16-bit stack slot, zero or sign extending as part
+of the reload.
+
+Doing this reduces the size of the stack frame (important for thumb etc), and
+also increases the likelihood that we will be able to reload multiple values
+from the stack with a single load.
+
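+For an i8 value known to be zero extended, the narrowed spill / reload could
+look something like this (untested sketch):
+
+ strb r0, [sp, #4] @ spill only the live low byte
+ ...
+ ldrb r0, [sp, #4] @ the reload zero extends back to 32 bits for free
+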
+//===---------------------------------------------------------------------===//
+
+The constant island pass is in good shape. Some cleanups might be desirable,
+but there is unlikely to be much improvement in the generated code.
+
+1. There may be some advantage to trying to be smarter about the initial
+placement, rather than putting everything at the end.
+
+2. There might be some compile-time efficiency to be had by representing
+consecutive islands as a single block rather than multiple blocks.
+
+3. Use a priority queue to sort constant pool users in inverse order of
+ position so we always process the one closest to the end of the function
+ first. This may simplify CreateNewWater.
+
+//===---------------------------------------------------------------------===//
+
+Eliminate copysign custom expansion. We are still generating crappy code with
+default expansion + if-conversion.
+
+//===---------------------------------------------------------------------===//
+
+Eliminate one instruction from:
+
+define i32 @_Z6slow4bii(i32 %x, i32 %y) {
+ %tmp = icmp sgt i32 %x, %y
+ %retval = select i1 %tmp, i32 %x, i32 %y
+ ret i32 %retval
+}
+
+__Z6slow4bii:
+ cmp r0, r1
+ movgt r1, r0
+ mov r0, r1
+ bx lr
+=>
+
+__Z6slow4bii:
+ cmp r0, r1
+ movle r0, r1
+ bx lr
+
+//===---------------------------------------------------------------------===//
+
+Implement long long "X-3" with instructions that fold the immediate in. These
+were disabled due to badness with the ARM carry flag on subtracts.
+
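+For X in r0 (low) / r1 (high), the folded form would presumably look like the
+untested sketch below; note that ARM sets C to NOT-borrow on subtracts, which
+sbc then consumes, and that inverted convention is exactly the badness
+referred to above:
+
+ subs r0, r0, #3 @ low word
+ sbc r1, r1, #0 @ high word, minus the borrow
+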
+//===---------------------------------------------------------------------===//
+
+We currently compile abs:
+int foo(int p) { return p < 0 ? -p : p; }
+
+into:
+
+_foo:
+ rsb r1, r0, #0
+ cmn r0, #1
+ movgt r1, r0
+ mov r0, r1
+ bx lr
+
+This is very, uh, literal. This could be a 3 operation sequence:
+ t = (p sra 31);
+ res = (p xor t)-t
+
+Which would be better. This occurs in png decode.
+
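+In ARM assembly that sequence might look like (untested sketch, value in r0):
+
+ mov r1, r0, asr #31 @ t = p sra 31
+ eor r0, r0, r1 @ p xor t
+ sub r0, r0, r1 @ (p xor t) - t
+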
+//===---------------------------------------------------------------------===//
+
+More load / store optimizations:
+1) Look past instructions without side-effects (not load, store, branch, etc.)
+ when forming the list of loads / stores to optimize.
+
+2) Smarter register allocation?
+We are probably missing some opportunities to use ldm / stm. Consider:
+
+ldr r5, [r0]
+ldr r4, [r0, #4]
+
+This cannot be merged into an ldm. Perhaps we will need to do the transformation
+before register allocation. Then teach the register allocator to allocate a
+chunk of consecutive registers.
+
+3) Better representation for block transfer? This is from Olden/power:
+
+ fldd d0, [r4]
+ fstd d0, [r4, #+32]
+ fldd d0, [r4, #+8]
+ fstd d0, [r4, #+40]
+ fldd d0, [r4, #+16]
+ fstd d0, [r4, #+48]
+ fldd d0, [r4, #+24]
+ fstd d0, [r4, #+56]
+
+If we can spare the registers, it would be better to use fldm and fstm here.
+Need major register allocator enhancement though.
+
+4) Can we recognize the relative position of constantpool entries? i.e. Treat
+
+ ldr r0, LCPI17_3
+ ldr r1, LCPI17_4
+ ldr r2, LCPI17_5
+
+ as
+ ldr r0, LCPI17
+ ldr r1, LCPI17+4
+ ldr r2, LCPI17+8
+
+ Then the ldr's can be combined into a single ldm. See Olden/power.
+
+Note for ARM v4 gcc uses ldmia to load a pair of 32-bit values to represent a
+double 64-bit FP constant:
+
+ adr r0, L6
+ ldmia r0, {r0-r1}
+
+ .align 2
+L6:
+ .long -858993459
+ .long 1074318540
+
+5) Can we make use of ldrd and strd? Instead of generating ldm / stm, use
+ldrd/strd if there are only two destination registers that form an odd/even
+pair. However, we would probably pay a penalty if the address is not aligned
+on an 8-byte boundary. This requires more information on load / store nodes
+(and MI's?) than we currently carry.
+
+6) Struct copies appear to be done field by field
+instead of by words, at least sometimes:
+
+struct foo { int x; short s; char c1; char c2; };
+void cpy(struct foo*a, struct foo*b) { *a = *b; }
+
+llvm code (-O2)
+ ldrb r3, [r1, #+6]
+ ldr r2, [r1]
+ ldrb r12, [r1, #+7]
+ ldrh r1, [r1, #+4]
+ str r2, [r0]
+ strh r1, [r0, #+4]
+ strb r3, [r0, #+6]
+ strb r12, [r0, #+7]
+gcc code (-O2)
+ ldmia r1, {r1-r2}
+ stmia r0, {r1-r2}
+
+In this benchmark poor handling of aggregate copies has shown up as
+having a large effect on size, and possibly speed as well (we don't have
+a good way to measure on ARM).
+
+//===---------------------------------------------------------------------===//
+
+* Consider this silly example:
+
+double bar(double x) {
+ double r = foo(3.1);
+ return x+r;
+}
+
+_bar:
+ stmfd sp!, {r4, r5, r7, lr}
+ add r7, sp, #8
+ mov r4, r0
+ mov r5, r1
+ fldd d0, LCPI1_0
+ fmrrd r0, r1, d0
+ bl _foo
+ fmdrr d0, r4, r5
+ fmsr s2, r0
+ fsitod d1, s2
+ faddd d0, d1, d0
+ fmrrd r0, r1, d0
+ ldmfd sp!, {r4, r5, r7, pc}
+
+Ignore the prologue and epilogue stuff for a second. Note
+ mov r4, r0
+ mov r5, r1
+the copies to callee-save registers and the fact that they are only being used
+by the fmdrr instruction. It would have been better had the fmdrr been
+scheduled before the call, placing the result in a callee-save DPR register.
+The two mov ops would not have been necessary.
+
+//===---------------------------------------------------------------------===//
+
+Calling convention related stuff:
+
+* gcc's parameter passing implementation is terrible and we suffer as a result:
+
+e.g.
+struct s {
+ double d1;
+ int s1;
+};
+
+void foo(struct s S) {
+ printf("%g, %d\n", S.d1, S.s1);
+}
+
+'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
+then reloads them to r1, r2, and r3 before issuing the call (r0 contains the
+address of the format string):
+
+ stmfd sp!, {r7, lr}
+ add r7, sp, #0
+ sub sp, sp, #12
+ stmia sp, {r0, r1, r2}
+ ldmia sp, {r1-r2}
+ ldr r0, L5
+ ldr r3, [sp, #8]
+L2:
+ add r0, pc, r0
+ bl L_printf$stub
+
+Instead of an stmia, an ldmia, and an ldr, wouldn't it be better to do three moves?
+
+* Returning an aggregate type is even worse:
+
+e.g.
+struct s foo(void) {
+ struct s S = {1.1, 2};
+ return S;
+}
+
+ mov ip, r0
+ ldr r0, L5
+ sub sp, sp, #12
+L2:
+ add r0, pc, r0
+ @ lr needed for prologue
+ ldmia r0, {r0, r1, r2}
+ stmia sp, {r0, r1, r2}
+ stmia ip, {r0, r1, r2}
+ mov r0, ip
+ add sp, sp, #12
+ bx lr
+
+r0 (and later ip) is the hidden parameter from the caller in which to store the
+value. The first ldmia loads the constants into r0, r1, r2. The last stmia
+stores r0, r1, r2 to the address passed in. However, there is one additional
+stmia that stores r0, r1, and r2 to some stack location. That store is dead.
+
+The llvm-gcc generated code looks like this:
+
+csretcc void %foo(%struct.s* %agg.result) {
+entry:
+ %S = alloca %struct.s, align 4 ; <%struct.s*> [#uses=1]
+ %memtmp = alloca %struct.s ; <%struct.s*> [#uses=1]
+ cast %struct.s* %S to sbyte* ; <sbyte*>:0 [#uses=2]
+ call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
+ cast %struct.s* %agg.result to sbyte* ; <sbyte*>:1 [#uses=2]
+ call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
+ cast %struct.s* %memtmp to sbyte* ; <sbyte*>:2 [#uses=1]
+ call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
+ ret void
+}
+
+llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from
+constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
+into a number of loads and stores, or 2) custom lower memcpy (of small size) to
+be ldmia / stmia. I think option 2 is better but the current register
+allocator cannot allocate a chunk of registers at a time.
+
+A feasible temporary solution is to use specific physical registers at
+lowering time for small (<= 4 words?) transfer sizes.
+
+* ARM CSRet calling convention requires the hidden argument to be returned by
+the callee.
+
+//===---------------------------------------------------------------------===//
+
+We can definitely do a better job on BB placement to eliminate some branches.
+It's very common to see llvm-generated assembly code that looks like this:
+
+LBB3:
+ ...
+LBB4:
+...
+ beq LBB3
+ b LBB2
+
+If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
+then eliminate the beq and turn the unconditional branch to LBB2 into a bne.
+
+See McCat/18-imp/ComputeBoundingBoxes for an example.
+
+//===---------------------------------------------------------------------===//
+
+Register scavenging is now implemented. The example in the previous version
+of this document produces optimal code at -O2.
+
+//===---------------------------------------------------------------------===//
+
+Pre-/post- indexed load / stores:
+
+1) We should not make the pre/post- indexed load/store transform if the base ptr
+is guaranteed to be live beyond the load/store. This can happen if the base
+ptr is live out of the block in which we are performing the optimization, e.g.
+
+mov r1, r2
+ldr r3, [r1], #4
+...
+
+vs.
+
+ldr r3, [r2]
+add r1, r2, #4
+...
+
+In most cases, this is just a wasted optimization. However, sometimes it can
+negatively impact the performance because two-address code is more restrictive
+when it comes to scheduling.
+
+Unfortunately, liveout information is currently unavailable during DAG combine
+time.
+
+2) Consider splitting an indexed load / store into a pair of add/sub + load/store
+ to solve #1 (in TwoAddressInstructionPass.cpp).
+
+3) Enhance LSR to generate more opportunities for indexed ops.
+
+4) Once we add support for multiple result patterns, write indexed load
+ patterns instead of C++ instruction selection code.
+
+5) Use FLDM / FSTM to emulate indexed FP load / store.
+
+//===---------------------------------------------------------------------===//
+
+We should add i64 support to take advantage of 64-bit loads / stores.
+We can add a pseudo i64 register class containing pseudo registers that are
+register pairs. All other ops (e.g. add, sub) would be expanded as usual.
+
+We need to add pseudo instructions (i.e. gethi / getlo) to extract i32 registers
+from the i64 register. These are single moves which can be eliminated if the
+destination register is a sub-register of the source. We should implement proper
+subreg support in the register allocator to coalesce these away.
+
+There are other minor issues such as multiple instructions for a spill / restore
+/ move.
+
+//===---------------------------------------------------------------------===//
+
+Implement support for some more tricky ways to materialize immediates. For
+example, to get 0xffff8000, we can use:
+
+mov r9, #&3f8000
+sub r9, r9, #&400000
+
+//===---------------------------------------------------------------------===//
+
+We sometimes generate multiple add / sub instructions to update sp in prologue
+and epilogue if the inc / dec value is too large to fit in a single immediate
+operand. In some cases, perhaps it might be better to load the value from a
+constantpool instead.
+
+//===---------------------------------------------------------------------===//
+
+GCC generates significantly better code for this function.
+
+int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
+ int i = 0;
+
+ if (StackPtr != 0) {
+ while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
+ Line[i++] = Stack[--StackPtr];
+ if (LineLen > 32768)
+ {
+ while (StackPtr != 0 && i < LineLen)
+ {
+ i++;
+ --StackPtr;
+ }
+ }
+ }
+ return StackPtr;
+}
+
+//===---------------------------------------------------------------------===//
+
+This should compile to the mlas instruction:
+int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
+
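+Presumably something along these lines (untested sketch; r0 = x, r1 = y,
+r2 = z in the usual argument registers):
+
+ mlas r3, r0, r1, r2 @ r3 = x * y + z, setting N and Z
+ mov r0, #13
+ movmi r0, #7 @ a negative result selects 7
+ bx lr
+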
+//===---------------------------------------------------------------------===//
+
+At some point, we should triage these to see if they still apply to us:
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
+
+http://www.inf.u-szeged.hu/gcc-arm/
+http://citeseer.ist.psu.edu/debus04linktime.html
+
+//===---------------------------------------------------------------------===//
+
+gcc generates smaller code for this function at -O2 or -Os:
+
+void foo(signed char* p) {
+ if (*p == 3)
+ bar();
+ else if (*p == 4)
+ baz();
+ else if (*p == 5)
+ quux();
+}
+
+llvm decides it's a good idea to turn the repeated if...else into a
+binary tree, as if it were a switch; the resulting code requires one fewer
+compare-and-branch when *p<=2 or *p==5, the same number when *p==4
+or *p>6, and one more when *p==3. So it should be a speed win
+(on balance). However, the revised code is larger, with 4 conditional
+branches instead of 3.
+
+More seriously, there is a byte->word extend before
+each comparison, where there should be only one, and the condition codes
+are not remembered when the same two values are compared twice.
+
+//===---------------------------------------------------------------------===//
+
+More register scavenging work:
+
+1. Use the register scavenger to track frame indices materialized into registers
+ (those that do not fit in addressing modes) to allow reuse in the same BB.
+2. Finish scavenging for Thumb.
+3. We know some spills and restores are unnecessary. The issue is once live
+ intervals are merged, they are never split. So every def is spilled
+ and every use requires a restore if the register allocator decides the
+ resulting live interval is not assigned a physical register. It may be
+ possible (with the help of the scavenger) to turn some spill / restore
+ pairs into register copies.
+
+//===---------------------------------------------------------------------===//
+
+More LSR enhancements possible:
+
+1. Teach LSR about pre- and post- indexed ops to allow the iv increment to be
+ merged into a load / store.
+2. Allow iv reuse even when a type conversion is required. For example, i8
+ and i32 load / store addressing modes are identical.
+
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+int foo(int a, int b, int c, int d) {
+ long long acc = (long long)a * (long long)b;
+ acc += (long long)c * (long long)d;
+ return (int)(acc >> 32);
+}
+
+Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
+two signed 32-bit values to produce a 64-bit value, and accumulates this with
+a 64-bit value.
+
+We currently get this with both v4 and v6:
+
+_foo:
+ smull r1, r0, r1, r0
+ smull r3, r2, r3, r2
+ adds r3, r3, r1
+ adc r0, r2, r0
+ bx lr
+
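+With SMLAL, a plausible sequence is one instruction shorter (untested sketch;
+a, b, c, d in r0-r3, ip used as a scratch register):
+
+ smull r0, ip, r1, r0 @ {ip,r0} = (long long)a * b
+ smlal r0, ip, r2, r3 @ {ip,r0} += (long long)c * d
+ mov r0, ip @ return the high 32 bits
+ bx lr
+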
+//===---------------------------------------------------------------------===//
+
+This:
+ #include <algorithm>
+ std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+ { return std::make_pair(a + b, a + b < a); }
+ bool no_overflow(unsigned a, unsigned b)
+ { return !full_add(a, b).second; }
+
+Should compile to:
+
+_Z8full_addjj:
+ adds r2, r1, r2
+ movcc r1, #0
+ movcs r1, #1
+ str r2, [r0, #0]
+ strb r1, [r0, #4]
+ mov pc, lr
+
+_Z11no_overflowjj:
+ cmn r0, r1
+ movcs r0, #0
+ movcc r0, #1
+ mov pc, lr
+
+not:
+
+__Z8full_addjj:
+ add r3, r2, r1
+ str r3, [r0]
+ mov r2, #1
+ mov r12, #0
+ cmp r3, r1
+ movlo r12, r2
+ str r12, [r0, #+4]
+ bx lr
+__Z11no_overflowjj:
+ add r3, r1, r0
+ mov r2, #1
+ mov r1, #0
+ cmp r3, r0
+ movhs r1, r2
+ mov r0, r1
+ bx lr
+
+//===---------------------------------------------------------------------===//
+
diff --git a/lib/Target/Alpha/Alpha.h b/lib/Target/Alpha/Alpha.h
new file mode 100644
index 0000000..2815176
--- /dev/null
+++ b/lib/Target/Alpha/Alpha.h
@@ -0,0 +1,51 @@
+//===-- Alpha.h - Top-level interface for Alpha representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Alpha back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ALPHA_H
+#define TARGET_ALPHA_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+ class AlphaTargetMachine;
+ class FunctionPass;
+ class JITCodeEmitter;
+ class MachineCodeEmitter;
+ class raw_ostream;
+
+ FunctionPass *createAlphaISelDag(AlphaTargetMachine &TM);
+ FunctionPass *createAlphaCodePrinterPass(raw_ostream &OS,
+ TargetMachine &TM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose);
+ FunctionPass *createAlphaPatternInstructionSelector(TargetMachine &TM);
+ FunctionPass *createAlphaCodeEmitterPass(AlphaTargetMachine &TM,
+ MachineCodeEmitter &MCE);
+ FunctionPass *createAlphaJITCodeEmitterPass(AlphaTargetMachine &TM,
+ JITCodeEmitter &JCE);
+ FunctionPass *createAlphaLLRPPass(AlphaTargetMachine &tm);
+ FunctionPass *createAlphaBranchSelectionPass();
+
+} // end namespace llvm;
+
+// Defines symbolic names for Alpha registers. This defines a mapping from
+// register name to register number.
+//
+#include "AlphaGenRegisterNames.inc"
+
+// Defines symbolic names for the Alpha instructions.
+//
+#include "AlphaGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td
new file mode 100644
index 0000000..e3748c6
--- /dev/null
+++ b/lib/Target/Alpha/Alpha.td
@@ -0,0 +1,66 @@
+//===- Alpha.td - Describe the Alpha Target Machine --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+// Alpha is little endian.
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features
+//===----------------------------------------------------------------------===//
+
+def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true",
+ "Enable CIX extentions">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "AlphaRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Schedule Description
+//===----------------------------------------------------------------------===//
+
+include "AlphaSchedule.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "AlphaInstrInfo.td"
+
+def AlphaInstrInfo : InstrInfo {
+ // Define how we want to layout our target-specific information field.
+ // let TSFlagsFields = [];
+ // let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// Alpha Processor Definitions
+//===----------------------------------------------------------------------===//
+
+def : Processor<"generic", Alpha21264Itineraries, []>;
+def : Processor<"ev6" , Alpha21264Itineraries, []>;
+def : Processor<"ev67" , Alpha21264Itineraries, [FeatureCIX]>;
+
+//===----------------------------------------------------------------------===//
+// The Alpha Target
+//===----------------------------------------------------------------------===//
+
+
+def Alpha : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = AlphaInstrInfo;
+}
diff --git a/lib/Target/Alpha/AlphaBranchSelector.cpp b/lib/Target/Alpha/AlphaBranchSelector.cpp
new file mode 100644
index 0000000..aca8ca7
--- /dev/null
+++ b/lib/Target/Alpha/AlphaBranchSelector.cpp
@@ -0,0 +1,67 @@
+//===-- AlphaBranchSelector.cpp - Convert Pseudo branches --------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Replace pseudo COND_BRANCH_* instructions with the appropriate real
+// branches. This is a simplified version of the PPC branch selector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+using namespace llvm;
+
+namespace {
+ struct VISIBILITY_HIDDEN AlphaBSel : public MachineFunctionPass {
+ static char ID;
+ AlphaBSel() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "Alpha Branch Selection";
+ }
+ };
+ char AlphaBSel::ID = 0;
+}
+
+/// createAlphaBranchSelectionPass - returns an instance of the Branch Selection
+/// Pass
+///
+FunctionPass *llvm::createAlphaBranchSelectionPass() {
+ return new AlphaBSel();
+}
+
+bool AlphaBSel::runOnMachineFunction(MachineFunction &Fn) {
+
+ for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock *MBB = MFI;
+
+ for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+ MBBI != EE; ++MBBI) {
+ if (MBBI->getOpcode() == Alpha::COND_BRANCH_I ||
+ MBBI->getOpcode() == Alpha::COND_BRANCH_F) {
+
+ // condbranch operands:
+ // 0. bc opcode
+ // 1. reg
+ // 2. target MBB
+ const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+ MBBI->setDesc(TII->get(MBBI->getOperand(0).getImm()));
+ }
+ }
+ }
+
+ return true;
+}
+
diff --git a/lib/Target/Alpha/AlphaCodeEmitter.cpp b/lib/Target/Alpha/AlphaCodeEmitter.cpp
new file mode 100644
index 0000000..f50f007
--- /dev/null
+++ b/lib/Target/Alpha/AlphaCodeEmitter.cpp
@@ -0,0 +1,242 @@
+//===-- Alpha/AlphaCodeEmitter.cpp - Convert Alpha code to machine code ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Alpha machine instructions
+// into relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "alpha-emitter"
+#include "AlphaTargetMachine.h"
+#include "AlphaRelocations.h"
+#include "Alpha.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+namespace {
+
+ class AlphaCodeEmitter {
+ MachineCodeEmitter &MCE;
+ public:
+ AlphaCodeEmitter(MachineCodeEmitter &mce) : MCE(mce) {}
+
+ /// getBinaryCodeForInstr - This function, generated by the
+ /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+ /// machine instructions.
+
+ unsigned getBinaryCodeForInstr(const MachineInstr &MI);
+
+ /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
+
+ unsigned getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO);
+ };
+
+ template <class CodeEmitter>
+ class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass,
+ public AlphaCodeEmitter
+ {
+ const AlphaInstrInfo *II;
+ TargetMachine &TM;
+ CodeEmitter &MCE;
+
+ public:
+ static char ID;
+ explicit Emitter(TargetMachine &tm, CodeEmitter &mce)
+ : MachineFunctionPass(&ID), AlphaCodeEmitter(mce),
+ II(0), TM(tm), MCE(mce) {}
+ Emitter(TargetMachine &tm, CodeEmitter &mce, const AlphaInstrInfo& ii)
+ : MachineFunctionPass(&ID), AlphaCodeEmitter(mce),
+ II(&ii), TM(tm), MCE(mce) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "Alpha Machine Code Emitter";
+ }
+
+ void emitInstruction(const MachineInstr &MI);
+
+ private:
+ void emitBasicBlock(MachineBasicBlock &MBB);
+ };
+
+ template <class CodeEmitter>
+ char Emitter<CodeEmitter>::ID = 0;
+}
+
+/// createAlphaCodeEmitterPass - Return a pass that emits the collected Alpha
+/// code to the specified MCE object.
+
+FunctionPass *llvm::createAlphaCodeEmitterPass(AlphaTargetMachine &TM,
+ MachineCodeEmitter &MCE) {
+ return new Emitter<MachineCodeEmitter>(TM, MCE);
+}
+
+FunctionPass *llvm::createAlphaJITCodeEmitterPass(AlphaTargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+template <class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+ II = ((AlphaTargetMachine&)MF.getTarget()).getInstrInfo();
+
+ do {
+ MCE.startFunction(MF);
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+ emitBasicBlock(*I);
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+template <class CodeEmitter>
+void Emitter<CodeEmitter>::emitBasicBlock(MachineBasicBlock &MBB) {
+ MCE.StartMachineBasicBlock(&MBB);
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ const MachineInstr &MI = *I;
+ switch(MI.getOpcode()) {
+ default:
+ MCE.emitWordLE(getBinaryCodeForInstr(*I));
+ break;
+ case Alpha::ALTENT:
+ case Alpha::PCLABEL:
+ case Alpha::MEMLABEL:
+ case TargetInstrInfo::IMPLICIT_DEF:
+ break; // Skip these; they emit no machine code.
+ }
+ }
+}
+
+static unsigned getAlphaRegNumber(unsigned Reg) {
+ switch (Reg) {
+ case Alpha::R0 : case Alpha::F0 : return 0;
+ case Alpha::R1 : case Alpha::F1 : return 1;
+ case Alpha::R2 : case Alpha::F2 : return 2;
+ case Alpha::R3 : case Alpha::F3 : return 3;
+ case Alpha::R4 : case Alpha::F4 : return 4;
+ case Alpha::R5 : case Alpha::F5 : return 5;
+ case Alpha::R6 : case Alpha::F6 : return 6;
+ case Alpha::R7 : case Alpha::F7 : return 7;
+ case Alpha::R8 : case Alpha::F8 : return 8;
+ case Alpha::R9 : case Alpha::F9 : return 9;
+ case Alpha::R10 : case Alpha::F10 : return 10;
+ case Alpha::R11 : case Alpha::F11 : return 11;
+ case Alpha::R12 : case Alpha::F12 : return 12;
+ case Alpha::R13 : case Alpha::F13 : return 13;
+ case Alpha::R14 : case Alpha::F14 : return 14;
+ case Alpha::R15 : case Alpha::F15 : return 15;
+ case Alpha::R16 : case Alpha::F16 : return 16;
+ case Alpha::R17 : case Alpha::F17 : return 17;
+ case Alpha::R18 : case Alpha::F18 : return 18;
+ case Alpha::R19 : case Alpha::F19 : return 19;
+ case Alpha::R20 : case Alpha::F20 : return 20;
+ case Alpha::R21 : case Alpha::F21 : return 21;
+ case Alpha::R22 : case Alpha::F22 : return 22;
+ case Alpha::R23 : case Alpha::F23 : return 23;
+ case Alpha::R24 : case Alpha::F24 : return 24;
+ case Alpha::R25 : case Alpha::F25 : return 25;
+ case Alpha::R26 : case Alpha::F26 : return 26;
+ case Alpha::R27 : case Alpha::F27 : return 27;
+ case Alpha::R28 : case Alpha::F28 : return 28;
+ case Alpha::R29 : case Alpha::F29 : return 29;
+ case Alpha::R30 : case Alpha::F30 : return 30;
+ case Alpha::R31 : case Alpha::F31 : return 31;
+ default:
+ assert(0 && "Unhandled reg");
+ abort();
+ }
+}
+
+unsigned AlphaCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO) {
+
+ unsigned rv = 0; // Return value; defaults to 0 for unhandled cases
+ // or things that get fixed up later by the JIT.
+
+ if (MO.isReg()) {
+ rv = getAlphaRegNumber(MO.getReg());
+ } else if (MO.isImm()) {
+ rv = MO.getImm();
+ } else if (MO.isGlobal() || MO.isSymbol() || MO.isCPI()) {
+ DOUT << MO << " is a relocated op for " << MI << "\n";
+ unsigned Reloc = 0;
+ int Offset = 0;
+ bool useGOT = false;
+ switch (MI.getOpcode()) {
+ case Alpha::BSR:
+ Reloc = Alpha::reloc_bsr;
+ break;
+ case Alpha::LDLr:
+ case Alpha::LDQr:
+ case Alpha::LDBUr:
+ case Alpha::LDWUr:
+ case Alpha::LDSr:
+ case Alpha::LDTr:
+ case Alpha::LDAr:
+ case Alpha::STQr:
+ case Alpha::STLr:
+ case Alpha::STWr:
+ case Alpha::STBr:
+ case Alpha::STSr:
+ case Alpha::STTr:
+ Reloc = Alpha::reloc_gprellow;
+ break;
+ case Alpha::LDAHr:
+ Reloc = Alpha::reloc_gprelhigh;
+ break;
+ case Alpha::LDQl:
+ Reloc = Alpha::reloc_literal;
+ useGOT = true;
+ break;
+ case Alpha::LDAg:
+ case Alpha::LDAHg:
+ Reloc = Alpha::reloc_gpdist;
+ Offset = MI.getOperand(3).getImm();
+ break;
+ default:
+ assert(0 && "unknown relocatable instruction");
+ abort();
+ }
+ if (MO.isGlobal())
+ MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
+ Reloc, MO.getGlobal(), Offset,
+ isa<Function>(MO.getGlobal()),
+ useGOT));
+ else if (MO.isSymbol())
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, MO.getSymbolName(),
+ Offset, true));
+ else
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, MO.getIndex(), Offset));
+ } else if (MO.isMBB()) {
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ Alpha::reloc_bsr, MO.getMBB()));
+  } else {
+ cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+ abort();
+ }
+
+ return rv;
+}
+
+#include "AlphaGenCodeEmitter.inc"
+
+
diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
new file mode 100644
index 0000000..affcd3e
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
@@ -0,0 +1,553 @@
+//===-- AlphaISelDAGToDAG.cpp - Alpha pattern matching inst selector ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for Alpha,
+// converting from a legalized dag to an Alpha dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaTargetMachine.h"
+#include "AlphaISelLowering.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+
+ //===--------------------------------------------------------------------===//
+ /// AlphaDAGToDAGISel - Alpha specific code to select Alpha machine
+ /// instructions for SelectionDAG operations.
+ class AlphaDAGToDAGISel : public SelectionDAGISel {
+ static const int64_t IMM_LOW = -32768;
+ static const int64_t IMM_HIGH = 32767;
+ static const int64_t IMM_MULT = 65536;
+ static const int64_t IMM_FULLHIGH = IMM_HIGH + IMM_HIGH * IMM_MULT;
+ static const int64_t IMM_FULLLOW = IMM_LOW + IMM_LOW * IMM_MULT;
+
+ static int64_t get_ldah16(int64_t x) {
+ int64_t y = x / IMM_MULT;
+ if (x % IMM_MULT > IMM_HIGH)
+ ++y;
+ return y;
+ }
+
+ static int64_t get_lda16(int64_t x) {
+ return x - get_ldah16(x) * IMM_MULT;
+ }
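+
+    // For example, x = 0x18000 splits as get_ldah16(x) = 2 and
+    // get_lda16(x) = -32768, since 2*65536 - 32768 = 0x18000; both halves
+    // fit the signed 16-bit immediates of LDAH and LDA.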
+
+ /// get_zapImm - Return a zap mask if X is a valid immediate for a zapnot
+ /// instruction (if not, return 0). Note that this code accepts partial
+    /// zap masks.  For example (and LHS, 1) is a valid zap, as long as we
+    /// know that bits 1-7 of LHS are already zero.  If LHS is non-null, we
+    /// are in checking mode.  If LHS is null, we assume that the mask has
+    /// already been validated.
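+    ///
+    /// For example, for Constant = 0x00000000FFFF00FF, bytes 0, 2, and 3 are
+    /// fully set, so the zap mask is 0b1101 = 0xD: "zapnot r, 0xD" keeps
+    /// those bytes and clears the rest.  For a partial byte such as the 0xEF
+    /// in X & 0xEF00, byte 1 may only be kept whole if its missing bit
+    /// (0x1000) is already known to be zero in LHS.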
+ uint64_t get_zapImm(SDValue LHS, uint64_t Constant) {
+ uint64_t BitsToCheck = 0;
+ unsigned Result = 0;
+ for (unsigned i = 0; i != 8; ++i) {
+ if (((Constant >> 8*i) & 0xFF) == 0) {
+ // nothing to do.
+ } else {
+ Result |= 1 << i;
+ if (((Constant >> 8*i) & 0xFF) == 0xFF) {
+ // If the entire byte is set, zapnot the byte.
+ } else if (LHS.getNode() == 0) {
+            // Otherwise, if the mask was previously validated, we know it's
+            // okay to zapnot this entire byte even though not all the bits
+            // are set.
+ } else {
+            // Otherwise we don't know that it's okay to zapnot this entire
+            // byte.  Only do this if we can prove that the missing bits are
+            // already zero, so the bytezap doesn't actually need to clear
+            // them.
+ BitsToCheck |= ~Constant & (0xFF << 8*i);
+ }
+ }
+ }
+
+      // If there are missing bits in a byte (for example, X & 0xEF00), check
+      // whether the missing bits (0x1000) are already known to be zero.  If
+      // not, the zap isn't okay to do, as it won't clear all the required
+      // bits.
+ if (BitsToCheck &&
+ !CurDAG->MaskedValueIsZero(LHS,
+ APInt(LHS.getValueSizeInBits(),
+ BitsToCheck)))
+ return 0;
+
+ return Result;
+ }
+
+ static uint64_t get_zapImm(uint64_t x) {
+ unsigned build = 0;
+ for(int i = 0; i != 8; ++i) {
+ if ((x & 0x00FF) == 0x00FF)
+ build |= 1 << i;
+ else if ((x & 0x00FF) != 0)
+ return 0;
+ x >>= 8;
+ }
+ return build;
+ }
+
+
+ static uint64_t getNearPower2(uint64_t x) {
+ if (!x) return 0;
+ unsigned at = CountLeadingZeros_64(x);
+      // Shift as 64-bit values; a plain "1 <<" would shift a 32-bit int and
+      // misbehave once the result needs more than 31 bits.
+      uint64_t complow = 1ULL << (63 - at);
+      uint64_t comphigh = complow << 1;
+      //cerr << x << ":" << complow << ":" << comphigh << "\n";
+      // complow <= x <= comphigh, so both differences are non-negative and
+      // the unsigned comparison picks the nearer power of two.
+      if (x - complow <= comphigh - x)
+        return complow;
+      else
+        return comphigh;
+ }
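+
+    // For example, getNearPower2(100) considers 64 and 128 and returns 128,
+    // the closer of the two; exact ties go to the lower power (96 -> 64).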
+
+ static bool chkRemNearPower2(uint64_t x, uint64_t r, bool swap) {
+ uint64_t y = getNearPower2(x);
+ if (swap)
+ return (y - x) == r;
+ else
+ return (x - y) == r;
+ }
+
+ static bool isFPZ(SDValue N) {
+ ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+ return (CN && (CN->getValueAPF().isZero()));
+ }
+ static bool isFPZn(SDValue N) {
+ ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+ return (CN && CN->getValueAPF().isNegZero());
+ }
+ static bool isFPZp(SDValue N) {
+ ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+ return (CN && CN->getValueAPF().isPosZero());
+ }
+
+ public:
+ explicit AlphaDAGToDAGISel(AlphaTargetMachine &TM)
+ : SelectionDAGISel(TM)
+ {}
+
+ /// getI64Imm - Return a target constant with the specified value, of type
+ /// i64.
+ inline SDValue getI64Imm(int64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i64);
+ }
+
+ // Select - Convert the specified operand from a target-independent to a
+ // target-specific node if it hasn't already been changed.
+ SDNode *Select(SDValue Op);
+
+ /// InstructionSelect - This callback is invoked by
+ /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+ virtual void InstructionSelect();
+
+ virtual const char *getPassName() const {
+ return "Alpha DAG->DAG Pattern Instruction Selection";
+ }
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0;
+ switch (ConstraintCode) {
+ default: return true;
+ case 'm': // memory
+ Op0 = Op;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ return false;
+ }
+
+// Include the pieces autogenerated from the target description.
+#include "AlphaGenDAGISel.inc"
+
+private:
+ SDValue getGlobalBaseReg();
+ SDValue getGlobalRetAddr();
+ void SelectCALL(SDValue Op);
+
+ };
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.
+///
+SDValue AlphaDAGToDAGISel::getGlobalBaseReg() {
+ unsigned GP = 0;
+ for(MachineRegisterInfo::livein_iterator ii = RegInfo->livein_begin(),
+ ee = RegInfo->livein_end(); ii != ee; ++ii)
+ if (ii->first == Alpha::R29) {
+ GP = ii->second;
+ break;
+ }
+ assert(GP && "GOT PTR not in liveins");
+ // FIXME is there anywhere sensible to get a DebugLoc here?
+ return CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ DebugLoc::getUnknownLoc(), GP, MVT::i64);
+}
+
+/// getGlobalRetAddr - Grab the return address
+///
+SDValue AlphaDAGToDAGISel::getGlobalRetAddr() {
+ unsigned RA = 0;
+ for(MachineRegisterInfo::livein_iterator ii = RegInfo->livein_begin(),
+ ee = RegInfo->livein_end(); ii != ee; ++ii)
+ if (ii->first == Alpha::R26) {
+ RA = ii->second;
+ break;
+ }
+ assert(RA && "RA PTR not in liveins");
+ // FIXME is there anywhere sensible to get a DebugLoc here?
+ return CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ DebugLoc::getUnknownLoc(), RA, MVT::i64);
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void AlphaDAGToDAGISel::InstructionSelect() {
+ DEBUG(BB->dump());
+
+ // Select target instructions for the DAG.
+ SelectRoot(*CurDAG);
+ CurDAG->RemoveDeadNodes();
+}
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
+ SDNode *N = Op.getNode();
+ if (N->isMachineOpcode()) {
+ return NULL; // Already selected.
+ }
+ DebugLoc dl = N->getDebugLoc();
+
+ switch (N->getOpcode()) {
+ default: break;
+ case AlphaISD::CALL:
+ SelectCALL(Op);
+ return NULL;
+
+ case ISD::FrameIndex: {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ return CurDAG->SelectNodeTo(N, Alpha::LDA, MVT::i64,
+ CurDAG->getTargetFrameIndex(FI, MVT::i32),
+ getI64Imm(0));
+ }
+ case ISD::GLOBAL_OFFSET_TABLE: {
+ SDValue Result = getGlobalBaseReg();
+ ReplaceUses(Op, Result);
+ return NULL;
+ }
+ case AlphaISD::GlobalRetAddr: {
+ SDValue Result = getGlobalRetAddr();
+ ReplaceUses(Op, Result);
+ return NULL;
+ }
+
+ case AlphaISD::DivCall: {
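+    // Per the copies below, the div/rem helpers (__divq and friends, see
+    // LowerOperation in AlphaISelLowering.cpp) take their operands in $24
+    // and $25, are called through $27, and return their result in $27.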
+ SDValue Chain = CurDAG->getEntryNode();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R24, N1,
+ SDValue(0,0));
+ Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R25, N2,
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R27, N0,
+ Chain.getValue(1));
+ SDNode *CNode =
+ CurDAG->getTargetNode(Alpha::JSRs, dl, MVT::Other, MVT::Flag,
+ Chain, Chain.getValue(1));
+ Chain = CurDAG->getCopyFromReg(Chain, dl, Alpha::R27, MVT::i64,
+ SDValue(CNode, 1));
+ return CurDAG->SelectNodeTo(N, Alpha::BISr, MVT::i64, Chain, Chain);
+ }
+
+ case ISD::READCYCLECOUNTER: {
+ SDValue Chain = N->getOperand(0);
+ return CurDAG->getTargetNode(Alpha::RPCC, dl, MVT::i64, MVT::Other,
+ Chain);
+ }
+
+ case ISD::Constant: {
+ uint64_t uval = cast<ConstantSDNode>(N)->getZExtValue();
+
+ if (uval == 0) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ Alpha::R31, MVT::i64);
+ ReplaceUses(Op, Result);
+ return NULL;
+ }
+
+ int64_t val = (int64_t)uval;
+ int32_t val32 = (int32_t)val;
+ if (val <= IMM_HIGH + IMM_HIGH * IMM_MULT &&
+ val >= IMM_LOW + IMM_LOW * IMM_MULT)
+ break; //(LDAH (LDA))
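+    // The (LDAH (LDA)) case above handles e.g. 0x12345678 roughly as
+    //   lda  $r, 0x5678($31)   ; $r = 0x00005678
+    //   ldah $r, 0x1234($r)    ; $r = 0x12345678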
+ if ((uval >> 32) == 0 && //empty upper bits
+ val32 <= IMM_HIGH + IMM_HIGH * IMM_MULT)
+ // val32 >= IMM_LOW + IMM_LOW * IMM_MULT) //always true
+ break; //(zext (LDAH (LDA)))
+ //Else use the constant pool
+ ConstantInt *C = ConstantInt::get(Type::Int64Ty, uval);
+ SDValue CPI = CurDAG->getTargetConstantPool(C, MVT::i64);
+ SDNode *Tmp = CurDAG->getTargetNode(Alpha::LDAHr, dl, MVT::i64, CPI,
+ getGlobalBaseReg());
+ return CurDAG->SelectNodeTo(N, Alpha::LDQr, MVT::i64, MVT::Other,
+ CPI, SDValue(Tmp, 0), CurDAG->getEntryNode());
+ }
+ case ISD::TargetConstantFP:
+ case ISD::ConstantFP: {
+ ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+ bool isDouble = N->getValueType(0) == MVT::f64;
+ MVT T = isDouble ? MVT::f64 : MVT::f32;
+ if (CN->getValueAPF().isPosZero()) {
+ return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYST : Alpha::CPYSS,
+ T, CurDAG->getRegister(Alpha::F31, T),
+ CurDAG->getRegister(Alpha::F31, T));
+ } else if (CN->getValueAPF().isNegZero()) {
+ return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYSNT : Alpha::CPYSNS,
+ T, CurDAG->getRegister(Alpha::F31, T),
+ CurDAG->getRegister(Alpha::F31, T));
+ } else {
+ abort();
+ }
+ break;
+ }
+
+ case ISD::SETCC:
+ if (N->getOperand(0).getNode()->getValueType(0).isFloatingPoint()) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+ unsigned Opc = Alpha::WTF;
+ bool rev = false;
+ bool inv = false;
+ switch(CC) {
+ default: DEBUG(N->dump(CurDAG)); assert(0 && "Unknown FP comparison!");
+ case ISD::SETEQ: case ISD::SETOEQ: case ISD::SETUEQ:
+ Opc = Alpha::CMPTEQ; break;
+ case ISD::SETLT: case ISD::SETOLT: case ISD::SETULT:
+ Opc = Alpha::CMPTLT; break;
+ case ISD::SETLE: case ISD::SETOLE: case ISD::SETULE:
+ Opc = Alpha::CMPTLE; break;
+ case ISD::SETGT: case ISD::SETOGT: case ISD::SETUGT:
+ Opc = Alpha::CMPTLT; rev = true; break;
+ case ISD::SETGE: case ISD::SETOGE: case ISD::SETUGE:
+ Opc = Alpha::CMPTLE; rev = true; break;
+ case ISD::SETNE: case ISD::SETONE: case ISD::SETUNE:
+ Opc = Alpha::CMPTEQ; inv = true; break;
+ case ISD::SETO:
+ Opc = Alpha::CMPTUN; inv = true; break;
+ case ISD::SETUO:
+ Opc = Alpha::CMPTUN; break;
+ };
+ SDValue tmp1 = N->getOperand(rev?1:0);
+ SDValue tmp2 = N->getOperand(rev?0:1);
+ SDNode *cmp = CurDAG->getTargetNode(Opc, dl, MVT::f64, tmp1, tmp2);
+ if (inv)
+ cmp = CurDAG->getTargetNode(Alpha::CMPTEQ, dl,
+ MVT::f64, SDValue(cmp, 0),
+ CurDAG->getRegister(Alpha::F31, MVT::f64));
+ switch(CC) {
+ case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE:
+ case ISD::SETUNE: case ISD::SETUGT: case ISD::SETUGE:
+ {
+ SDNode* cmp2 = CurDAG->getTargetNode(Alpha::CMPTUN, dl, MVT::f64,
+ tmp1, tmp2);
+ cmp = CurDAG->getTargetNode(Alpha::ADDT, dl, MVT::f64,
+ SDValue(cmp2, 0), SDValue(cmp, 0));
+ break;
+ }
+ default: break;
+ }
+
+ SDNode* LD = CurDAG->getTargetNode(Alpha::FTOIT, dl,
+ MVT::i64, SDValue(cmp, 0));
+ return CurDAG->getTargetNode(Alpha::CMPULT, dl, MVT::i64,
+ CurDAG->getRegister(Alpha::R31, MVT::i64),
+ SDValue(LD,0));
+ }
+ break;
+
+ case ISD::SELECT:
+ if (N->getValueType(0).isFloatingPoint() &&
+ (N->getOperand(0).getOpcode() != ISD::SETCC ||
+ !N->getOperand(0).getOperand(1).getValueType().isFloatingPoint())) {
+      //This should be the only case not covered by the Patterns
+      //FIXME: Don't have SelectCode die, but rather return something testable
+      // so that things like this can be caught in fall-through code
+ //move int to fp
+ bool isDouble = N->getValueType(0) == MVT::f64;
+ SDValue cond = N->getOperand(0);
+ SDValue TV = N->getOperand(1);
+ SDValue FV = N->getOperand(2);
+
+ SDNode* LD = CurDAG->getTargetNode(Alpha::ITOFT, dl, MVT::f64, cond);
+ return CurDAG->getTargetNode(isDouble?Alpha::FCMOVNET:Alpha::FCMOVNES,
+ dl, MVT::f64, FV, TV, SDValue(LD,0));
+ }
+ break;
+
+ case ISD::AND: {
+ ConstantSDNode* SC = NULL;
+ ConstantSDNode* MC = NULL;
+ if (N->getOperand(0).getOpcode() == ISD::SRL &&
+ (MC = dyn_cast<ConstantSDNode>(N->getOperand(1))) &&
+ (SC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)))) {
+ uint64_t sval = SC->getZExtValue();
+ uint64_t mval = MC->getZExtValue();
+      // If the result is a zap, let the autogenerated patterns handle it.
+ if (get_zapImm(N->getOperand(0), mval))
+ break;
+      // Given mask X and shift S, we want to see if there is any zap in the
+      // mask if we play around with the bottom S bits.
+ uint64_t dontcare = (~0ULL) >> (64 - sval);
+ uint64_t mask = mval << sval;
+
+ if (get_zapImm(mask | dontcare))
+ mask = mask | dontcare;
+
+ if (get_zapImm(mask)) {
+ SDValue Z =
+ SDValue(CurDAG->getTargetNode(Alpha::ZAPNOTi, dl, MVT::i64,
+ N->getOperand(0).getOperand(0),
+ getI64Imm(get_zapImm(mask))), 0);
+ return CurDAG->getTargetNode(Alpha::SRLr, dl, MVT::i64, Z,
+ getI64Imm(sval));
+ }
+ }
+ break;
+ }
+
+ }
+
+ return SelectCode(Op);
+}
+
+void AlphaDAGToDAGISel::SelectCALL(SDValue Op) {
+  //TODO: add flag stuff to prevent nondeterministic breakage!
+
+ SDNode *N = Op.getNode();
+ SDValue Chain = N->getOperand(0);
+ SDValue Addr = N->getOperand(1);
+ SDValue InFlag(0,0); // Null incoming flag value.
+ DebugLoc dl = N->getDebugLoc();
+
+ std::vector<SDValue> CallOperands;
+ std::vector<MVT> TypeOperands;
+
+ //grab the arguments
+ for(int i = 2, e = N->getNumOperands(); i < e; ++i) {
+ TypeOperands.push_back(N->getOperand(i).getValueType());
+ CallOperands.push_back(N->getOperand(i));
+ }
+ int count = N->getNumOperands() - 2;
+
+ static const unsigned args_int[] = {Alpha::R16, Alpha::R17, Alpha::R18,
+ Alpha::R19, Alpha::R20, Alpha::R21};
+ static const unsigned args_float[] = {Alpha::F16, Alpha::F17, Alpha::F18,
+ Alpha::F19, Alpha::F20, Alpha::F21};
+
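+  // The first six arguments ride in $16-$21 / $f16-$f21; anything beyond
+  // that is stored below to the outgoing argument area at (i-6)*8(SP),
+  // following the calling standard described in AlphaISelLowering.cpp.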
+ for (int i = 6; i < count; ++i) {
+ unsigned Opc = Alpha::WTF;
+ if (TypeOperands[i].isInteger()) {
+ Opc = Alpha::STQ;
+ } else if (TypeOperands[i] == MVT::f32) {
+ Opc = Alpha::STS;
+ } else if (TypeOperands[i] == MVT::f64) {
+ Opc = Alpha::STT;
+ } else
+ assert(0 && "Unknown operand");
+
+ SDValue Ops[] = { CallOperands[i], getI64Imm((i - 6) * 8),
+ CurDAG->getCopyFromReg(Chain, dl, Alpha::R30, MVT::i64),
+ Chain };
+ Chain = SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 4), 0);
+ }
+ for (int i = 0; i < std::min(6, count); ++i) {
+ if (TypeOperands[i].isInteger()) {
+ Chain = CurDAG->getCopyToReg(Chain, dl, args_int[i],
+ CallOperands[i], InFlag);
+ InFlag = Chain.getValue(1);
+ } else if (TypeOperands[i] == MVT::f32 || TypeOperands[i] == MVT::f64) {
+ Chain = CurDAG->getCopyToReg(Chain, dl, args_float[i],
+ CallOperands[i], InFlag);
+ InFlag = Chain.getValue(1);
+ } else
+ assert(0 && "Unknown operand");
+ }
+
+ // Finally, once everything is in registers to pass to the call, emit the
+ // call itself.
+ if (Addr.getOpcode() == AlphaISD::GPRelLo) {
+ SDValue GOT = getGlobalBaseReg();
+ Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R29, GOT, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = SDValue(CurDAG->getTargetNode(Alpha::BSR, dl, MVT::Other,
+ MVT::Flag, Addr.getOperand(0),
+ Chain, InFlag), 0);
+ } else {
+ Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R27, Addr, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = SDValue(CurDAG->getTargetNode(Alpha::JSR, dl, MVT::Other,
+ MVT::Flag, Chain, InFlag), 0);
+ }
+ InFlag = Chain.getValue(1);
+
+ std::vector<SDValue> CallResults;
+
+ switch (N->getValueType(0).getSimpleVT()) {
+ default: assert(0 && "Unexpected ret value!");
+ case MVT::Other: break;
+ case MVT::i64:
+ Chain = CurDAG->getCopyFromReg(Chain, dl,
+ Alpha::R0, MVT::i64, InFlag).getValue(1);
+ CallResults.push_back(Chain.getValue(0));
+ break;
+ case MVT::f32:
+ Chain = CurDAG->getCopyFromReg(Chain, dl,
+ Alpha::F0, MVT::f32, InFlag).getValue(1);
+ CallResults.push_back(Chain.getValue(0));
+ break;
+ case MVT::f64:
+ Chain = CurDAG->getCopyFromReg(Chain, dl,
+ Alpha::F0, MVT::f64, InFlag).getValue(1);
+ CallResults.push_back(Chain.getValue(0));
+ break;
+ }
+
+ CallResults.push_back(Chain);
+ for (unsigned i = 0, e = CallResults.size(); i != e; ++i)
+ ReplaceUses(Op.getValue(i), CallResults[i]);
+}
+
+
+/// createAlphaISelDag - This pass converts a legalized DAG into an
+/// Alpha-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createAlphaISelDag(AlphaTargetMachine &TM) {
+ return new AlphaDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
new file mode 100644
index 0000000..1001112
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -0,0 +1,798 @@
+//===-- AlphaISelLowering.cpp - Alpha DAG Lowering Implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AlphaISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaISelLowering.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+/// AddLiveIn - This helper function adds the specified physical register to the
+/// MachineFunction as a live in value. It also creates a corresponding virtual
+/// register for it.
+static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
+ TargetRegisterClass *RC) {
+ assert(RC->contains(PReg) && "Not the correct regclass!");
+ unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
+ MF.getRegInfo().addLiveIn(PReg, VReg);
+ return VReg;
+}
+
+AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM) {
+ // Set up the TargetLowering object.
+ //I am having problems with shr n ubyte 1
+ setShiftAmountType(MVT::i64);
+ setBooleanContents(ZeroOrOneBooleanContent);
+
+ setUsesGlobalOffsetTable(true);
+
+ addRegisterClass(MVT::i64, Alpha::GPRCRegisterClass);
+ addRegisterClass(MVT::f64, Alpha::F8RCRegisterClass);
+ addRegisterClass(MVT::f32, Alpha::F4RCRegisterClass);
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+
+ // setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+
+ if (!TM.getSubtarget<AlphaSubtarget>().hasCT()) {
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
+ setOperationAction(ISD::CTLZ , MVT::i64 , Expand);
+ }
+ setOperationAction(ISD::BSWAP , MVT::i64, Expand);
+ setOperationAction(ISD::ROTL , MVT::i64, Expand);
+ setOperationAction(ISD::ROTR , MVT::i64, Expand);
+
+ setOperationAction(ISD::SREM , MVT::i64, Custom);
+ setOperationAction(ISD::UREM , MVT::i64, Custom);
+ setOperationAction(ISD::SDIV , MVT::i64, Custom);
+ setOperationAction(ISD::UDIV , MVT::i64, Custom);
+
+ setOperationAction(ISD::ADDC , MVT::i64, Expand);
+ setOperationAction(ISD::ADDE , MVT::i64, Expand);
+ setOperationAction(ISD::SUBC , MVT::i64, Expand);
+ setOperationAction(ISD::SUBE , MVT::i64, Expand);
+
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+
+ // We don't support sin/cos/sqrt/pow
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+ setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+ setOperationAction(ISD::FPOW , MVT::f32, Expand);
+ setOperationAction(ISD::FPOW , MVT::f64, Expand);
+
+ setOperationAction(ISD::SETCC, MVT::f32, Promote);
+
+ setOperationAction(ISD::BIT_CONVERT, MVT::f32, Promote);
+
+ // We don't have line number support yet.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+ setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+ // Not implemented yet.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+ // We want to legalize GlobalAddress and ConstantPool and
+ // ExternalSymbols nodes into the appropriate instructions to
+ // materialize the address.
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::i32, Custom);
+
+ setOperationAction(ISD::RET, MVT::Other, Custom);
+
+ setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+
+ setStackPointerRegisterToSaveRestore(Alpha::R30);
+
+ addLegalFPImmediate(APFloat(+0.0)); //F31
+ addLegalFPImmediate(APFloat(+0.0f)); //F31
+ addLegalFPImmediate(APFloat(-0.0)); //-F31
+ addLegalFPImmediate(APFloat(-0.0f)); //-F31
+
+ setJumpBufSize(272);
+ setJumpBufAlignment(16);
+
+ computeRegisterProperties();
+}
+
+MVT AlphaTargetLowering::getSetCCResultType(MVT VT) const {
+ return MVT::i64;
+}
+
+const char *AlphaTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return 0;
+ case AlphaISD::CVTQT_: return "Alpha::CVTQT_";
+ case AlphaISD::CVTQS_: return "Alpha::CVTQS_";
+ case AlphaISD::CVTTQ_: return "Alpha::CVTTQ_";
+ case AlphaISD::GPRelHi: return "Alpha::GPRelHi";
+ case AlphaISD::GPRelLo: return "Alpha::GPRelLo";
+ case AlphaISD::RelLit: return "Alpha::RelLit";
+ case AlphaISD::GlobalRetAddr: return "Alpha::GlobalRetAddr";
+ case AlphaISD::CALL: return "Alpha::CALL";
+ case AlphaISD::DivCall: return "Alpha::DivCall";
+ case AlphaISD::RET_FLAG: return "Alpha::RET_FLAG";
+ case AlphaISD::COND_BRANCH_I: return "Alpha::COND_BRANCH_I";
+ case AlphaISD::COND_BRANCH_F: return "Alpha::COND_BRANCH_F";
+ }
+}
+
+static SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+ MVT PtrVT = Op.getValueType();
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ // FIXME there isn't really any debug info here
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue Hi = DAG.getNode(AlphaISD::GPRelHi, dl, MVT::i64, JTI,
+ DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+ SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, JTI, Hi);
+ return Lo;
+}
+
+//http://www.cs.arizona.edu/computer.help/policy/DIGITAL_unix/
+//AA-PY8AC-TET1_html/callCH3.html#BLOCK21
+
+//For now, just use variable size stack frame format
+
+//In a standard call, the first six items are passed in registers $16
+//- $21 and/or registers $f16 - $f21. (See Section 4.1.2 for details
+//of argument-to-register correspondence.) The remaining items are
+//collected in a memory argument list that is a naturally aligned
+//array of quadwords. In a standard call, this list, if present, must
+//be passed at 0(SP).
+//7 ... n 0(SP) ... (n-7)*8(SP)
+
+// //#define FP $15
+// //#define RA $26
+// //#define PV $27
+// //#define GP $29
+// //#define SP $30
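+
+// Worked example: for an integer call f(a1, ..., a8), a1-a6 go in $16-$21
+// and the two overflow arguments land at 0(SP) and 8(SP), i.e. argument i
+// (i > 6) lives at (i-7)*8(SP).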
+
+static SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG,
+ int &VarArgsBase,
+ int &VarArgsOffset) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ std::vector<SDValue> ArgValues;
+ SDValue Root = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+ AddLiveIn(MF, Alpha::R29, &Alpha::GPRCRegClass); //GP
+ AddLiveIn(MF, Alpha::R26, &Alpha::GPRCRegClass); //RA
+
+ unsigned args_int[] = {
+ Alpha::R16, Alpha::R17, Alpha::R18, Alpha::R19, Alpha::R20, Alpha::R21};
+ unsigned args_float[] = {
+ Alpha::F16, Alpha::F17, Alpha::F18, Alpha::F19, Alpha::F20, Alpha::F21};
+
+ for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues()-1; ArgNo != e; ++ArgNo) {
+ SDValue argt;
+ MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+ SDValue ArgVal;
+
+ if (ArgNo < 6) {
+ switch (ObjectVT.getSimpleVT()) {
+ default:
+ assert(false && "Invalid value type!");
+ case MVT::f64:
+ args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
+ &Alpha::F8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Root, dl, args_float[ArgNo], ObjectVT);
+ break;
+ case MVT::f32:
+ args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
+ &Alpha::F4RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Root, dl, args_float[ArgNo], ObjectVT);
+ break;
+ case MVT::i64:
+ args_int[ArgNo] = AddLiveIn(MF, args_int[ArgNo],
+ &Alpha::GPRCRegClass);
+ ArgVal = DAG.getCopyFromReg(Root, dl, args_int[ArgNo], MVT::i64);
+ break;
+ }
+ } else { //more args
+ // Create the frame index object for this incoming parameter...
+ int FI = MFI->CreateFixedObject(8, 8 * (ArgNo - 6));
+
+ // Create the SelectionDAG nodes corresponding to a load
+ //from this parameter
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i64);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Root, FIN, NULL, 0);
+ }
+ ArgValues.push_back(ArgVal);
+ }
+
+  // If the function takes a variable number of arguments, copy all regs to stack
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ if (isVarArg) {
+ VarArgsOffset = (Op.getNode()->getNumValues()-1) * 8;
+ std::vector<SDValue> LS;
+ for (int i = 0; i < 6; ++i) {
+ if (TargetRegisterInfo::isPhysicalRegister(args_int[i]))
+ args_int[i] = AddLiveIn(MF, args_int[i], &Alpha::GPRCRegClass);
+ SDValue argt = DAG.getCopyFromReg(Root, dl, args_int[i], MVT::i64);
+ int FI = MFI->CreateFixedObject(8, -8 * (6 - i));
+ if (i == 0) VarArgsBase = FI;
+ SDValue SDFI = DAG.getFrameIndex(FI, MVT::i64);
+ LS.push_back(DAG.getStore(Root, dl, argt, SDFI, NULL, 0));
+
+ if (TargetRegisterInfo::isPhysicalRegister(args_float[i]))
+ args_float[i] = AddLiveIn(MF, args_float[i], &Alpha::F8RCRegClass);
+ argt = DAG.getCopyFromReg(Root, dl, args_float[i], MVT::f64);
+ FI = MFI->CreateFixedObject(8, - 8 * (12 - i));
+ SDFI = DAG.getFrameIndex(FI, MVT::i64);
+ LS.push_back(DAG.getStore(Root, dl, argt, SDFI, NULL, 0));
+ }
+
+ //Set up a token factor with all the stack traffic
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &LS[0], LS.size());
+ }
+
+ ArgValues.push_back(Root);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size());
+}
+
+static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
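+  // The RET node carries the chain plus two operands per returned value
+  // (the value and its signness), so the 1, 3, or 5 operands below mean
+  // zero, one, or two results respectively.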
+ SDValue Copy = DAG.getCopyToReg(Op.getOperand(0), dl, Alpha::R26,
+ DAG.getNode(AlphaISD::GlobalRetAddr,
+ DebugLoc::getUnknownLoc(),
+ MVT::i64),
+ SDValue());
+ switch (Op.getNumOperands()) {
+ default:
+ assert(0 && "Do not know how to return this many arguments!");
+ abort();
+ case 1:
+ break;
+ //return SDValue(); // ret void is legal
+ case 3: {
+ MVT ArgVT = Op.getOperand(1).getValueType();
+ unsigned ArgReg;
+ if (ArgVT.isInteger())
+ ArgReg = Alpha::R0;
+ else {
+ assert(ArgVT.isFloatingPoint());
+ ArgReg = Alpha::F0;
+ }
+ Copy = DAG.getCopyToReg(Copy, dl, ArgReg,
+ Op.getOperand(1), Copy.getValue(1));
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg);
+ break;
+ }
+ case 5: {
+ MVT ArgVT = Op.getOperand(1).getValueType();
+ unsigned ArgReg1, ArgReg2;
+ if (ArgVT.isInteger()) {
+ ArgReg1 = Alpha::R0;
+ ArgReg2 = Alpha::R1;
+ } else {
+ assert(ArgVT.isFloatingPoint());
+ ArgReg1 = Alpha::F0;
+ ArgReg2 = Alpha::F1;
+ }
+ Copy = DAG.getCopyToReg(Copy, dl, ArgReg1,
+ Op.getOperand(1), Copy.getValue(1));
+ if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(),
+ DAG.getMachineFunction().getRegInfo().liveout_end(), ArgReg1)
+ == DAG.getMachineFunction().getRegInfo().liveout_end())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg1);
+ Copy = DAG.getCopyToReg(Copy, dl, ArgReg2,
+ Op.getOperand(3), Copy.getValue(1));
+ if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(),
+ DAG.getMachineFunction().getRegInfo().liveout_end(), ArgReg2)
+ == DAG.getMachineFunction().getRegInfo().liveout_end())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg2);
+ break;
+ }
+ }
+ return DAG.getNode(AlphaISD::RET_FLAG, dl,
+ MVT::Other, Copy, Copy.getValue(1));
+}
+
+std::pair<SDValue, SDValue>
+AlphaTargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
+ bool RetSExt, bool RetZExt, bool isVarArg,
+ bool isInreg, unsigned CallingConv,
+ bool isTailCall, SDValue Callee,
+ ArgListTy &Args, SelectionDAG &DAG,
+ DebugLoc dl) {
+ int NumBytes = 0;
+ if (Args.size() > 6)
+ NumBytes = (Args.size() - 6) * 8;
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ std::vector<SDValue> args_to_use;
+ for (unsigned i = 0, e = Args.size(); i != e; ++i)
+ {
+ switch (getValueType(Args[i].Ty).getSimpleVT()) {
+ default: assert(0 && "Unexpected ValueType for argument!");
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ // Promote the integer to 64 bits. If the input type is signed use a
+ // sign extend, otherwise use a zero extend.
+ if (Args[i].isSExt)
+ Args[i].Node = DAG.getNode(ISD::SIGN_EXTEND, dl,
+ MVT::i64, Args[i].Node);
+ else if (Args[i].isZExt)
+ Args[i].Node = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::i64, Args[i].Node);
+ else
+ Args[i].Node = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Args[i].Node);
+ break;
+ case MVT::i64:
+ case MVT::f64:
+ case MVT::f32:
+ break;
+ }
+ args_to_use.push_back(Args[i].Node);
+ }
+
+ std::vector<MVT> RetVals;
+ MVT RetTyVT = getValueType(RetTy);
+ MVT ActualRetTyVT = RetTyVT;
+ if (RetTyVT.getSimpleVT() >= MVT::i1 && RetTyVT.getSimpleVT() <= MVT::i32)
+ ActualRetTyVT = MVT::i64;
+
+ if (RetTyVT != MVT::isVoid)
+ RetVals.push_back(ActualRetTyVT);
+ RetVals.push_back(MVT::Other);
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+ Ops.insert(Ops.end(), args_to_use.begin(), args_to_use.end());
+ SDValue TheCall = DAG.getNode(AlphaISD::CALL, dl,
+ RetVals, &Ops[0], Ops.size());
+ Chain = TheCall.getValue(RetTyVT != MVT::isVoid);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), SDValue());
+ SDValue RetVal = TheCall;
+
+ if (RetTyVT != ActualRetTyVT) {
+ ISD::NodeType AssertKind = ISD::DELETED_NODE;
+ if (RetSExt)
+ AssertKind = ISD::AssertSext;
+ else if (RetZExt)
+ AssertKind = ISD::AssertZext;
+
+ if (AssertKind != ISD::DELETED_NODE)
+ RetVal = DAG.getNode(AssertKind, dl, MVT::i64, RetVal,
+ DAG.getValueType(RetTyVT));
+
+ RetVal = DAG.getNode(ISD::TRUNCATE, dl, RetTyVT, RetVal);
+ }
+
+ return std::make_pair(RetVal, Chain);
+}
+
+void AlphaTargetLowering::LowerVAARG(SDNode *N, SDValue &Chain,
+ SDValue &DataPtr, SelectionDAG &DAG) {
+ Chain = N->getOperand(0);
+ SDValue VAListP = N->getOperand(1);
+ const Value *VAListS = cast<SrcValueSDNode>(N->getOperand(2))->getValue();
+ DebugLoc dl = N->getDebugLoc();
+
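+  // The va_list is treated as { i8 *base; i32 offset } with the offset at
+  // +8: the next argument lives at base+offset, and (per
+  // LowerFORMAL_ARGUMENTS) the FP copies of the first six arguments sit
+  // 6*8 bytes below their integer counterparts.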
+ SDValue Base = DAG.getLoad(MVT::i64, dl, Chain, VAListP, VAListS, 0);
+ SDValue Tmp = DAG.getNode(ISD::ADD, dl, MVT::i64, VAListP,
+ DAG.getConstant(8, MVT::i64));
+ SDValue Offset = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Base.getValue(1),
+ Tmp, NULL, 0, MVT::i32);
+ DataPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Base, Offset);
+ if (N->getValueType(0).isFloatingPoint())
+ {
+ //if fp && Offset < 6*8, then subtract 6*8 from DataPtr
+ SDValue FPDataPtr = DAG.getNode(ISD::SUB, dl, MVT::i64, DataPtr,
+ DAG.getConstant(8*6, MVT::i64));
+ SDValue CC = DAG.getSetCC(dl, MVT::i64, Offset,
+ DAG.getConstant(8*6, MVT::i64), ISD::SETLT);
+ DataPtr = DAG.getNode(ISD::SELECT, dl, MVT::i64, CC, FPDataPtr, DataPtr);
+ }
+
+ SDValue NewOffset = DAG.getNode(ISD::ADD, dl, MVT::i64, Offset,
+ DAG.getConstant(8, MVT::i64));
+ Chain = DAG.getTruncStore(Offset.getValue(1), dl, NewOffset, Tmp, NULL, 0,
+ MVT::i32);
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Wasn't expecting to be able to lower this!");
+ case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG,
+ VarArgsBase,
+ VarArgsOffset);
+
+ case ISD::RET: return LowerRET(Op,DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default: break; // Don't custom lower most intrinsics.
+ case Intrinsic::alpha_umulh:
+ return DAG.getNode(ISD::MULHU, dl, MVT::i64,
+ Op.getOperand(1), Op.getOperand(2));
+ }
+ }
+
+ case ISD::SINT_TO_FP: {
+ assert(Op.getOperand(0).getValueType() == MVT::i64 &&
+ "Unhandled SINT_TO_FP type in custom expander!");
+ SDValue LD;
+ bool isDouble = Op.getValueType() == MVT::f64;
+ LD = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op.getOperand(0));
+ SDValue FP = DAG.getNode(isDouble?AlphaISD::CVTQT_:AlphaISD::CVTQS_, dl,
+ isDouble?MVT::f64:MVT::f32, LD);
+ return FP;
+ }
+ case ISD::FP_TO_SINT: {
+ bool isDouble = Op.getOperand(0).getValueType() == MVT::f64;
+ SDValue src = Op.getOperand(0);
+
+ if (!isDouble) //Promote
+ src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, src);
+
+ src = DAG.getNode(AlphaISD::CVTTQ_, dl, MVT::f64, src);
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, src);
+ }
+ case ISD::ConstantPool: {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ Constant *C = CP->getConstVal();
+ SDValue CPI = DAG.getTargetConstantPool(C, MVT::i64, CP->getAlignment());
+ // FIXME there isn't really any debug info here
+
+ SDValue Hi = DAG.getNode(AlphaISD::GPRelHi, dl, MVT::i64, CPI,
+ DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+ SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, CPI, Hi);
+ return Lo;
+ }
+ case ISD::GlobalTLSAddress:
+ assert(0 && "TLS not implemented for Alpha.");
+ case ISD::GlobalAddress: {
+ GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+ GlobalValue *GV = GSDN->getGlobal();
+ SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i64, GSDN->getOffset());
+ // FIXME there isn't really any debug info here
+
+ // if (!GV->hasWeakLinkage() && !GV->isDeclaration() && !GV->hasLinkOnceLinkage()) {
+ if (GV->hasLocalLinkage()) {
+ SDValue Hi = DAG.getNode(AlphaISD::GPRelHi, dl, MVT::i64, GA,
+ DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+ SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, GA, Hi);
+ return Lo;
+ } else
+ return DAG.getNode(AlphaISD::RelLit, dl, MVT::i64, GA,
+ DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+ }
+ case ISD::ExternalSymbol: {
+ return DAG.getNode(AlphaISD::RelLit, dl, MVT::i64,
+ DAG.getTargetExternalSymbol(cast<ExternalSymbolSDNode>(Op)
+ ->getSymbol(), MVT::i64),
+ DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+ }
+
+ case ISD::UREM:
+ case ISD::SREM:
+    //Expand only in the constant case
+ if (Op.getOperand(1).getOpcode() == ISD::Constant) {
+ MVT VT = Op.getNode()->getValueType(0);
+ SDValue Tmp1 = Op.getNode()->getOpcode() == ISD::UREM ?
+ BuildUDIV(Op.getNode(), DAG, NULL) :
+ BuildSDIV(Op.getNode(), DAG, NULL);
+ Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Op.getOperand(1));
+ Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Op.getOperand(0), Tmp1);
+ return Tmp1;
+ }
+ //fall through
+ case ISD::SDIV:
+ case ISD::UDIV:
+ if (Op.getValueType().isInteger()) {
+ if (Op.getOperand(1).getOpcode() == ISD::Constant)
+ return Op.getOpcode() == ISD::SDIV ? BuildSDIV(Op.getNode(), DAG, NULL)
+ : BuildUDIV(Op.getNode(), DAG, NULL);
+ const char* opstr = 0;
+ switch (Op.getOpcode()) {
+ case ISD::UREM: opstr = "__remqu"; break;
+ case ISD::SREM: opstr = "__remq"; break;
+ case ISD::UDIV: opstr = "__divqu"; break;
+ case ISD::SDIV: opstr = "__divq"; break;
+ }
+ SDValue Tmp1 = Op.getOperand(0),
+ Tmp2 = Op.getOperand(1),
+ Addr = DAG.getExternalSymbol(opstr, MVT::i64);
+ return DAG.getNode(AlphaISD::DivCall, dl, MVT::i64, Addr, Tmp1, Tmp2);
+ }
+ break;
+
+ case ISD::VAARG: {
+ SDValue Chain, DataPtr;
+ LowerVAARG(Op.getNode(), Chain, DataPtr, DAG);
+
+ SDValue Result;
+ if (Op.getValueType() == MVT::i32)
+ Result = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Chain, DataPtr,
+ NULL, 0, MVT::i32);
+ else
+ Result = DAG.getLoad(Op.getValueType(), dl, Chain, DataPtr, NULL, 0);
+ return Result;
+ }
+ case ISD::VACOPY: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue DestP = Op.getOperand(1);
+ SDValue SrcP = Op.getOperand(2);
+ const Value *DestS = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcS = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+ SDValue Val = DAG.getLoad(getPointerTy(), dl, Chain, SrcP, SrcS, 0);
+ SDValue Result = DAG.getStore(Val.getValue(1), dl, Val, DestP, DestS, 0);
+ SDValue NP = DAG.getNode(ISD::ADD, dl, MVT::i64, SrcP,
+ DAG.getConstant(8, MVT::i64));
+ Val = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Result,
+ NP, NULL,0, MVT::i32);
+ SDValue NPD = DAG.getNode(ISD::ADD, dl, MVT::i64, DestP,
+ DAG.getConstant(8, MVT::i64));
+ return DAG.getTruncStore(Val.getValue(1), dl, Val, NPD, NULL, 0, MVT::i32);
+ }
+ case ISD::VASTART: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue VAListP = Op.getOperand(1);
+ const Value *VAListS = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+ // vastart stores the address of the VarArgsBase and VarArgsOffset
+ SDValue FR = DAG.getFrameIndex(VarArgsBase, MVT::i64);
+ SDValue S1 = DAG.getStore(Chain, dl, FR, VAListP, VAListS, 0);
+ SDValue SA2 = DAG.getNode(ISD::ADD, dl, MVT::i64, VAListP,
+ DAG.getConstant(8, MVT::i64));
+ return DAG.getTruncStore(S1, dl, DAG.getConstant(VarArgsOffset, MVT::i64),
+ SA2, NULL, 0, MVT::i32);
+ }
+ case ISD::RETURNADDR:
+ return DAG.getNode(AlphaISD::GlobalRetAddr, DebugLoc::getUnknownLoc(),
+ MVT::i64);
+ //FIXME: implement
+ case ISD::FRAMEADDR: break;
+ }
+
+ return SDValue();
+}
+
+void AlphaTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = N->getDebugLoc();
+ assert(N->getValueType(0) == MVT::i32 &&
+ N->getOpcode() == ISD::VAARG &&
+ "Unknown node to custom promote!");
+
+ SDValue Chain, DataPtr;
+ LowerVAARG(N, Chain, DataPtr, DAG);
+ SDValue Res = DAG.getLoad(N->getValueType(0), dl, Chain, DataPtr, NULL, 0);
+ Results.push_back(Res);
+ Results.push_back(SDValue(Res.getNode(), 1));
+}
+
+
+//Inline Asm
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+AlphaTargetLowering::ConstraintType
+AlphaTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'f':
+ case 'r':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::vector<unsigned> AlphaTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+    default: break;     // Unknown constraint letter
+ case 'f':
+ return make_vector<unsigned>(Alpha::F0 , Alpha::F1 , Alpha::F2 ,
+ Alpha::F3 , Alpha::F4 , Alpha::F5 ,
+ Alpha::F6 , Alpha::F7 , Alpha::F8 ,
+ Alpha::F9 , Alpha::F10, Alpha::F11,
+ Alpha::F12, Alpha::F13, Alpha::F14,
+ Alpha::F15, Alpha::F16, Alpha::F17,
+ Alpha::F18, Alpha::F19, Alpha::F20,
+ Alpha::F21, Alpha::F22, Alpha::F23,
+ Alpha::F24, Alpha::F25, Alpha::F26,
+ Alpha::F27, Alpha::F28, Alpha::F29,
+ Alpha::F30, Alpha::F31, 0);
+ case 'r':
+ return make_vector<unsigned>(Alpha::R0 , Alpha::R1 , Alpha::R2 ,
+ Alpha::R3 , Alpha::R4 , Alpha::R5 ,
+ Alpha::R6 , Alpha::R7 , Alpha::R8 ,
+ Alpha::R9 , Alpha::R10, Alpha::R11,
+ Alpha::R12, Alpha::R13, Alpha::R14,
+ Alpha::R15, Alpha::R16, Alpha::R17,
+ Alpha::R18, Alpha::R19, Alpha::R20,
+ Alpha::R21, Alpha::R22, Alpha::R23,
+ Alpha::R24, Alpha::R25, Alpha::R26,
+ Alpha::R27, Alpha::R28, Alpha::R29,
+ Alpha::R30, Alpha::R31, 0);
+ }
+ }
+
+ return std::vector<unsigned>();
+}
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+AlphaTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ assert((MI->getOpcode() == Alpha::CAS32 ||
+ MI->getOpcode() == Alpha::CAS64 ||
+ MI->getOpcode() == Alpha::LAS32 ||
+ MI->getOpcode() == Alpha::LAS64 ||
+ MI->getOpcode() == Alpha::SWAP32 ||
+ MI->getOpcode() == Alpha::SWAP64) &&
+ "Unexpected instr type to insert");
+
+ bool is32 = MI->getOpcode() == Alpha::CAS32 ||
+ MI->getOpcode() == Alpha::LAS32 ||
+ MI->getOpcode() == Alpha::SWAP32;
+
+  //Load-locked / store-conditional atomic ops all take the same form:
+  //start:
+  //ll
+  //do stuff (maybe branch to exit)
+  //sc
+  //test sc and maybe branch to start
+  //exit:
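+  //
+  // For example, LAS64 (atomic load-add-store) expands roughly to:
+  //   llsc: ldq_l  res, 0(ptr)
+  //         addq   res, v2, store
+  //         stq_c  store, 0(ptr)   ; store = 1 on success, 0 on failure
+  //         beq    store, llsc
+  //   sink: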
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ DebugLoc dl = MI->getDebugLoc();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *llscMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+ sinkMBB->transferSuccessors(thisMBB);
+
+ F->insert(It, llscMBB);
+ F->insert(It, sinkMBB);
+
+ BuildMI(thisMBB, dl, TII->get(Alpha::BR)).addMBB(llscMBB);
+
+ unsigned reg_res = MI->getOperand(0).getReg(),
+ reg_ptr = MI->getOperand(1).getReg(),
+ reg_v2 = MI->getOperand(2).getReg(),
+ reg_store = F->getRegInfo().createVirtualRegister(&Alpha::GPRCRegClass);
+
+ BuildMI(llscMBB, dl, TII->get(is32 ? Alpha::LDL_L : Alpha::LDQ_L),
+ reg_res).addImm(0).addReg(reg_ptr);
+ switch (MI->getOpcode()) {
+ case Alpha::CAS32:
+ case Alpha::CAS64: {
+ unsigned reg_cmp
+ = F->getRegInfo().createVirtualRegister(&Alpha::GPRCRegClass);
+ BuildMI(llscMBB, dl, TII->get(Alpha::CMPEQ), reg_cmp)
+ .addReg(reg_v2).addReg(reg_res);
+ BuildMI(llscMBB, dl, TII->get(Alpha::BEQ))
+ .addImm(0).addReg(reg_cmp).addMBB(sinkMBB);
+ BuildMI(llscMBB, dl, TII->get(Alpha::BISr), reg_store)
+ .addReg(Alpha::R31).addReg(MI->getOperand(3).getReg());
+ break;
+ }
+ case Alpha::LAS32:
+ case Alpha::LAS64: {
+ BuildMI(llscMBB, dl,TII->get(is32 ? Alpha::ADDLr : Alpha::ADDQr), reg_store)
+ .addReg(reg_res).addReg(reg_v2);
+ break;
+ }
+ case Alpha::SWAP32:
+ case Alpha::SWAP64: {
+ BuildMI(llscMBB, dl, TII->get(Alpha::BISr), reg_store)
+ .addReg(reg_v2).addReg(reg_v2);
+ break;
+ }
+ }
+ BuildMI(llscMBB, dl, TII->get(is32 ? Alpha::STL_C : Alpha::STQ_C), reg_store)
+ .addReg(reg_store).addImm(0).addReg(reg_ptr);
+ BuildMI(llscMBB, dl, TII->get(Alpha::BEQ))
+ .addImm(0).addReg(reg_store).addMBB(llscMBB);
+ BuildMI(llscMBB, dl, TII->get(Alpha::BR)).addMBB(sinkMBB);
+
+ thisMBB->addSuccessor(llscMBB);
+ llscMBB->addSuccessor(llscMBB);
+ llscMBB->addSuccessor(sinkMBB);
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+
+ return sinkMBB;
+}
+
+bool
+AlphaTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The Alpha target isn't yet aware of offsets.
+ return false;
+}
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
new file mode 100644
index 0000000..fdd817c
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -0,0 +1,114 @@
+//===-- AlphaISelLowering.h - Alpha DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Alpha uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
+#define LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
+
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "Alpha.h"
+
+namespace llvm {
+
+ namespace AlphaISD {
+ enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      //These correspond to the identically named instructions
+ CVTQT_, CVTQS_, CVTTQ_,
+
+ /// GPRelHi/GPRelLo - These represent the high and low 16-bit
+ /// parts of a global address respectively.
+ GPRelHi, GPRelLo,
+
+      /// RelLit - Literal relocation of a global
+ RelLit,
+
+ /// GlobalRetAddr - used to restore the return address
+ GlobalRetAddr,
+
+ /// CALL - Normal call.
+ CALL,
+
+ /// DIVCALL - used for special library calls for div and rem
+ DivCall,
+
+ /// return flag operand
+ RET_FLAG,
+
+ /// CHAIN = COND_BRANCH CHAIN, OPC, (G|F)PRC, DESTBB [, INFLAG] - This
+ /// corresponds to the COND_BRANCH pseudo instruction.
+ /// *PRC is the input register to compare to zero,
+ /// OPC is the branch opcode to use (e.g. Alpha::BEQ),
+ /// DESTBB is the destination block to branch to, and INFLAG is
+ /// an optional input flag argument.
+ COND_BRANCH_I, COND_BRANCH_F
+
+ };
+ }
+
+ class AlphaTargetLowering : public TargetLowering {
+ int VarArgsOffset; // What is the offset to the first vaarg
+ int VarArgsBase; // What is the base FrameIndex
+ bool useITOF;
+ public:
+ explicit AlphaTargetLowering(TargetMachine &TM);
+
+ /// getSetCCResultType - Get the SETCC result ValueType
+ virtual MVT getSetCCResultType(MVT VT) const;
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG);
+
+ // Friendly names for dumps
+ const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// LowerCallTo - This hook lowers an abstract call to a function into an
+ /// actual call.
+ virtual std::pair<SDValue, SDValue>
+ LowerCallTo(SDValue Chain, const Type *RetTy, bool RetSExt, bool RetZExt,
+ bool isVarArg, bool isInreg, unsigned CC, bool isTailCall,
+ SDValue Callee, ArgListTy &Args, SelectionDAG &DAG,
+ DebugLoc dl);
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ bool hasITOF() { return useITOF; }
+
+ MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+ private:
+ // Helpers for custom lowering.
+ void LowerVAARG(SDNode *N, SDValue &Chain, SDValue &DataPtr,
+ SelectionDAG &DAG);
+
+ };
+}
+
+#endif // LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
diff --git a/lib/Target/Alpha/AlphaInstrFormats.td b/lib/Target/Alpha/AlphaInstrFormats.td
new file mode 100644
index 0000000..6d82875
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrFormats.td
@@ -0,0 +1,268 @@
+//===- AlphaInstrFormats.td - Alpha Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//Section 3.3 of the Alpha Architecture Handbook defines the instruction
+//formats modeled below:
+//Memory
+//Branch
+//Operate
+//Floating-point
+//PALcode
+
+def u8imm : Operand<i64>;
+def s14imm : Operand<i64>;
+def s16imm : Operand<i64>;
+def s21imm : Operand<i64>;
+def s64imm : Operand<i64>;
+def u64imm : Operand<i64>;
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+// Alpha instruction baseline
+class InstAlpha<bits<6> op, string asmstr, InstrItinClass itin> : Instruction {
+ field bits<32> Inst;
+ let Namespace = "Alpha";
+ let AsmString = asmstr;
+ let Inst{31-26} = op;
+ let Itinerary = itin;
+}
+
+
+//3.3.1
+class MForm<bits<6> opcode, bit load, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern = pattern;
+ let canFoldAsLoad = load;
+ let Defs = [R28]; //We may use this for frame index calculations, so reserve it here
+
+ bits<5> Ra;
+ bits<16> disp;
+ bits<5> Rb;
+
+ let Inst{25-21} = Ra;
+ let Inst{20-16} = Rb;
+ let Inst{15-0} = disp;
+}
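+
+// MForm example: "ldq $3, 16($4)" would fill in op=0x29 (LDQ), Ra=3, Rb=4,
+// disp=16 in the fields above.
+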
+class MfcForm<bits<6> opcode, bits<16> fc, string asmstr, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ bits<5> Ra;
+
+ let OutOperandList = (ops GPRC:$RA);
+ let InOperandList = (ops);
+ let Inst{25-21} = Ra;
+ let Inst{20-16} = 0;
+ let Inst{15-0} = fc;
+}
+class MfcPForm<bits<6> opcode, bits<16> fc, string asmstr, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let OutOperandList = (ops);
+ let InOperandList = (ops);
+ let Inst{25-21} = 0;
+ let Inst{20-16} = 0;
+ let Inst{15-0} = fc;
+}
+
+class MbrForm<bits<6> opcode, bits<2> TB, dag OL, string asmstr, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ bits<5> Ra;
+ bits<5> Rb;
+ bits<14> disp;
+
+ let OutOperandList = (ops);
+ let InOperandList = OL;
+
+ let Inst{25-21} = Ra;
+ let Inst{20-16} = Rb;
+ let Inst{15-14} = TB;
+ let Inst{13-0} = disp;
+}
+class MbrpForm<bits<6> opcode, bits<2> TB, dag OL, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern=pattern;
+ bits<5> Ra;
+ bits<5> Rb;
+ bits<14> disp;
+
+ let OutOperandList = (ops);
+ let InOperandList = OL;
+
+ let Inst{25-21} = Ra;
+ let Inst{20-16} = Rb;
+ let Inst{15-14} = TB;
+ let Inst{13-0} = disp;
+}
+
+//3.3.2
+def target : Operand<OtherVT> {}
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+class BFormN<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let OutOperandList = (ops);
+ let InOperandList = OL;
+ bits<64> Opc; //dummy
+ bits<5> Ra;
+ bits<21> disp;
+
+ let Inst{25-21} = Ra;
+ let Inst{20-0} = disp;
+}
+}
+
+let isBranch = 1, isTerminator = 1 in
+class BFormD<bits<6> opcode, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern = pattern;
+ let OutOperandList = (ops);
+ let InOperandList = (ops target:$DISP);
+ bits<5> Ra;
+ bits<21> disp;
+
+ let Inst{25-21} = Ra;
+ let Inst{20-0} = disp;
+}
+
+//3.3.3
+class OForm<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern = pattern;
+ let OutOperandList = (outs GPRC:$RC);
+ let InOperandList = (ins GPRC:$RA, GPRC:$RB);
+
+ bits<5> Rc;
+ bits<5> Ra;
+ bits<5> Rb;
+ bits<7> Function = fun;
+
+ let Inst{25-21} = Ra;
+ let Inst{20-16} = Rb;
+ let Inst{15-13} = 0;
+ let Inst{12} = 0;
+ let Inst{11-5} = Function;
+ let Inst{4-0} = Rc;
+}
+
+class OForm2<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern = pattern;
+ let OutOperandList = (outs GPRC:$RC);
+ let InOperandList = (ins GPRC:$RB);
+
+ bits<5> Rc;
+ bits<5> Rb;
+ bits<7> Function = fun;
+
+ let Inst{25-21} = 31;
+ let Inst{20-16} = Rb;
+ let Inst{15-13} = 0;
+ let Inst{12} = 0;
+ let Inst{11-5} = Function;
+ let Inst{4-0} = Rc;
+}
+
+class OForm4<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern = pattern;
+ let OutOperandList = (outs GPRC:$RDEST);
+ let InOperandList = (ins GPRC:$RCOND, GPRC:$RTRUE, GPRC:$RFALSE);
+ let Constraints = "$RFALSE = $RDEST";
+ let DisableEncoding = "$RFALSE";
+
+ bits<5> Rc;
+ bits<5> Ra;
+ bits<5> Rb;
+ bits<7> Function = fun;
+
+// let isTwoAddress = 1;
+ let Inst{25-21} = Ra;
+ let Inst{20-16} = Rb;
+ let Inst{15-13} = 0;
+ let Inst{12} = 0;
+ let Inst{11-5} = Function;
+ let Inst{4-0} = Rc;
+}
+
+
+class OFormL<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern = pattern;
+ let OutOperandList = (outs GPRC:$RC);
+ let InOperandList = (ins GPRC:$RA, u8imm:$L);
+
+ bits<5> Rc;
+ bits<5> Ra;
+ bits<8> LIT;
+ bits<7> Function = fun;
+
+ let Inst{25-21} = Ra;
+ let Inst{20-13} = LIT;
+ let Inst{12} = 1;
+ let Inst{11-5} = Function;
+ let Inst{4-0} = Rc;
+}
+
+class OForm4L<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern = pattern;
+ let OutOperandList = (outs GPRC:$RDEST);
+ let InOperandList = (ins GPRC:$RCOND, s64imm:$RTRUE, GPRC:$RFALSE);
+ let Constraints = "$RFALSE = $RDEST";
+ let DisableEncoding = "$RFALSE";
+
+ bits<5> Rc;
+ bits<5> Ra;
+ bits<8> LIT;
+ bits<7> Function = fun;
+
+// let isTwoAddress = 1;
+ let Inst{25-21} = Ra;
+ let Inst{20-13} = LIT;
+ let Inst{12} = 1;
+ let Inst{11-5} = Function;
+ let Inst{4-0} = Rc;
+}
+
+//3.3.4
+class FPForm<bits<6> opcode, bits<11> fun, string asmstr, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let Pattern = pattern;
+
+ bits<5> Fc;
+ bits<5> Fa;
+ bits<5> Fb;
+ bits<11> Function = fun;
+
+ let Inst{25-21} = Fa;
+ let Inst{20-16} = Fb;
+ let Inst{15-5} = Function;
+ let Inst{4-0} = Fc;
+}
+
+//3.3.5
+class PALForm<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+ : InstAlpha<opcode, asmstr, itin> {
+ let OutOperandList = (ops);
+ let InOperandList = OL;
+ bits<26> Function;
+
+ let Inst{25-0} = Function;
+}
+
+
+// Pseudo instructions.
+class PseudoInstAlpha<dag OOL, dag IOL, string nm, list<dag> pattern, InstrItinClass itin>
+ : InstAlpha<0, nm, itin> {
+ let OutOperandList = OOL;
+ let InOperandList = IOL;
+ let Pattern = pattern;
+
+}
diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp
new file mode 100644
index 0000000..a54d97d
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.cpp
@@ -0,0 +1,450 @@
+//===- AlphaInstrInfo.cpp - Alpha Instruction Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaGenInstrInfo.inc"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+using namespace llvm;
+
+AlphaInstrInfo::AlphaInstrInfo()
+ : TargetInstrInfoImpl(AlphaInsts, array_lengthof(AlphaInsts)),
+ RI(*this) { }
+
+
+bool AlphaInstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned& sourceReg, unsigned& destReg,
+ unsigned& SrcSR, unsigned& DstSR) const {
+ unsigned oc = MI.getOpcode();
+ if (oc == Alpha::BISr ||
+ oc == Alpha::CPYSS ||
+ oc == Alpha::CPYST ||
+ oc == Alpha::CPYSSt ||
+ oc == Alpha::CPYSTs) {
+    // A register-to-register move has both source operands equal:
+    //   bis rX,rX,rY  or  cpys(s|t) fX,fX,fY
+ assert(MI.getNumOperands() >= 3 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isReg() &&
+ "invalid Alpha BIS instruction!");
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ SrcSR = DstSR = 0;
+ return true;
+ }
+ }
+ return false;
+}
+
+unsigned
+AlphaInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ case Alpha::LDL:
+ case Alpha::LDQ:
+ case Alpha::LDBU:
+ case Alpha::LDWU:
+ case Alpha::LDS:
+ case Alpha::LDT:
+ if (MI->getOperand(1).isFI()) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+unsigned
+AlphaInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ case Alpha::STL:
+ case Alpha::STQ:
+ case Alpha::STB:
+ case Alpha::STW:
+ case Alpha::STS:
+ case Alpha::STT:
+ if (MI->getOperand(1).isFI()) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+static bool isAlphaIntCondCode(unsigned Opcode) {
+ switch (Opcode) {
+ case Alpha::BEQ:
+ case Alpha::BNE:
+ case Alpha::BGE:
+ case Alpha::BGT:
+ case Alpha::BLE:
+ case Alpha::BLT:
+ case Alpha::BLBC:
+ case Alpha::BLBS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+unsigned AlphaInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc argument
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "Alpha branch conditions have two components!");
+
+ // One-way branch.
+ if (FBB == 0) {
+ if (Cond.empty()) // Unconditional branch
+ BuildMI(&MBB, dl, get(Alpha::BR)).addMBB(TBB);
+ else // Conditional branch
+ if (isAlphaIntCondCode(Cond[0].getImm()))
+ BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_I))
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ else
+ BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_F))
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ return 1;
+ }
+
+ // Two-way Conditional Branch.
+ if (isAlphaIntCondCode(Cond[0].getImm()))
+ BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_I))
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ else
+ BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_F))
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ BuildMI(&MBB, dl, get(Alpha::BR)).addMBB(FBB);
+ return 2;
+}
+
+bool AlphaInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ //cerr << "copyRegToReg " << DestReg << " <- " << SrcReg << "\n";
+ if (DestRC != SrcRC) {
+ // Not yet supported!
+ return false;
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (DestRC == Alpha::GPRCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(Alpha::BISr), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg);
+ } else if (DestRC == Alpha::F4RCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(Alpha::CPYSS), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg);
+ } else if (DestRC == Alpha::F8RCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(Alpha::CPYST), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg);
+ } else {
+    // Attempted to copy a register that is neither a GPR nor an FPR.
+ return false;
+ }
+
+ return true;
+}
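+
+// Alpha has no dedicated move instruction: BIS (logical OR) of a register
+// with itself is the canonical integer move, and CPYS (copy sign) of a
+// register with itself is the floating-point equivalent.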
+
+void
+AlphaInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ //cerr << "Trying to store " << getPrettyName(SrcReg) << " to "
+ // << FrameIdx << "\n";
+ //BuildMI(MBB, MI, Alpha::WTF, 0).addReg(SrcReg);
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (RC == Alpha::F4RCRegisterClass)
+ BuildMI(MBB, MI, DL, get(Alpha::STS))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+ else if (RC == Alpha::F8RCRegisterClass)
+ BuildMI(MBB, MI, DL, get(Alpha::STT))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+ else if (RC == Alpha::GPRCRegisterClass)
+ BuildMI(MBB, MI, DL, get(Alpha::STQ))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+ else
+ abort();
+}
+
+void AlphaInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Opc = 0;
+ if (RC == Alpha::F4RCRegisterClass)
+ Opc = Alpha::STS;
+ else if (RC == Alpha::F8RCRegisterClass)
+ Opc = Alpha::STT;
+ else if (RC == Alpha::GPRCRegisterClass)
+ Opc = Alpha::STQ;
+ else
+ abort();
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB =
+ BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+}
+
+void
+AlphaInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ //cerr << "Trying to load " << getPrettyName(DestReg) << " to "
+ // << FrameIdx << "\n";
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (RC == Alpha::F4RCRegisterClass)
+ BuildMI(MBB, MI, DL, get(Alpha::LDS), DestReg)
+ .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+ else if (RC == Alpha::F8RCRegisterClass)
+ BuildMI(MBB, MI, DL, get(Alpha::LDT), DestReg)
+ .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+ else if (RC == Alpha::GPRCRegisterClass)
+ BuildMI(MBB, MI, DL, get(Alpha::LDQ), DestReg)
+ .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+ else
+ abort();
+}
+
+void AlphaInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Opc = 0;
+ if (RC == Alpha::F4RCRegisterClass)
+ Opc = Alpha::LDS;
+ else if (RC == Alpha::F8RCRegisterClass)
+ Opc = Alpha::LDT;
+ else if (RC == Alpha::GPRCRegisterClass)
+ Opc = Alpha::LDQ;
+ else
+ abort();
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB =
+ BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+}
+
+MachineInstr *AlphaInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ if (Ops.size() != 1) return NULL;
+
+ // Make sure this is a reg-reg copy.
+ unsigned Opc = MI->getOpcode();
+
+ MachineInstr *NewMI = NULL;
+ switch(Opc) {
+ default:
+ break;
+ case Alpha::BISr:
+ case Alpha::CPYSS:
+ case Alpha::CPYST:
+ if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+ if (Ops[0] == 0) { // move -> store
+ unsigned InReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ Opc = (Opc == Alpha::BISr) ? Alpha::STQ :
+ ((Opc == Alpha::CPYSS) ? Alpha::STS : Alpha::STT);
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(InReg, getKillRegState(isKill))
+ .addFrameIndex(FrameIndex)
+ .addReg(Alpha::F31);
+ } else { // load -> move
+ unsigned OutReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ Opc = (Opc == Alpha::BISr) ? Alpha::LDQ :
+ ((Opc == Alpha::CPYSS) ? Alpha::LDS : Alpha::LDT);
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(OutReg, RegState::Define | getDeadRegState(isDead))
+ .addFrameIndex(FrameIndex)
+ .addReg(Alpha::F31);
+ }
+ }
+ break;
+ }
+ return NewMI;
+}
+
+static unsigned AlphaRevCondCode(unsigned Opcode) {
+ switch (Opcode) {
+ case Alpha::BEQ: return Alpha::BNE;
+ case Alpha::BNE: return Alpha::BEQ;
+ case Alpha::BGE: return Alpha::BLT;
+ case Alpha::BGT: return Alpha::BLE;
+ case Alpha::BLE: return Alpha::BGT;
+ case Alpha::BLT: return Alpha::BGE;
+ case Alpha::BLBC: return Alpha::BLBS;
+ case Alpha::BLBS: return Alpha::BLBC;
+ case Alpha::FBEQ: return Alpha::FBNE;
+ case Alpha::FBNE: return Alpha::FBEQ;
+ case Alpha::FBGE: return Alpha::FBLT;
+ case Alpha::FBGT: return Alpha::FBLE;
+ case Alpha::FBLE: return Alpha::FBGT;
+ case Alpha::FBLT: return Alpha::FBGE;
+ default:
+ assert(0 && "Unknown opcode");
+ }
+ return 0; // Not reached
+}
+
+// Branch analysis.  Returns false (filling in TBB, FBB and Cond) when the
+// block's terminators are understood: fall-through, a lone BR, a lone
+// conditional branch, or a conditional branch followed by a BR; returns
+// true when the terminator sequence cannot be analyzed.
+bool AlphaInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (LastInst->getOpcode() == Alpha::BR) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else if (LastInst->getOpcode() == Alpha::COND_BRANCH_I ||
+ LastInst->getOpcode() == Alpha::COND_BRANCH_F) {
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(2).getMBB();
+ Cond.push_back(LastInst->getOperand(0));
+ Cond.push_back(LastInst->getOperand(1));
+ return false;
+ }
+ // Otherwise, don't know what this is.
+ return true;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() &&
+ isUnpredicatedTerminator(--I))
+ return true;
+
+ // If the block ends with Alpha::BR and Alpha::COND_BRANCH_*, handle it.
+ if ((SecondLastInst->getOpcode() == Alpha::COND_BRANCH_I ||
+ SecondLastInst->getOpcode() == Alpha::COND_BRANCH_F) &&
+ LastInst->getOpcode() == Alpha::BR) {
+ TBB = SecondLastInst->getOperand(2).getMBB();
+ Cond.push_back(SecondLastInst->getOperand(0));
+ Cond.push_back(SecondLastInst->getOperand(1));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two Alpha::BRs, handle it. The second one is not
+ // executed, so remove it.
+ if (SecondLastInst->getOpcode() == Alpha::BR &&
+ LastInst->getOpcode() == Alpha::BR) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned AlphaInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (I->getOpcode() != Alpha::BR &&
+ I->getOpcode() != Alpha::COND_BRANCH_I &&
+ I->getOpcode() != Alpha::COND_BRANCH_F)
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (I->getOpcode() != Alpha::COND_BRANCH_I &&
+ I->getOpcode() != Alpha::COND_BRANCH_F)
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+void AlphaInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ BuildMI(MBB, MI, DL, get(Alpha::BISr), Alpha::R31)
+ .addReg(Alpha::R31)
+ .addReg(Alpha::R31);
+}
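+
+// BIS $31,$31,$31 is the architected Alpha integer nop: R31 always reads
+// as zero and writes to it are discarded.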
+
+bool AlphaInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case Alpha::RETDAG: // Return.
+ case Alpha::RETDAGp:
+ case Alpha::BR: // Uncond branch.
+ case Alpha::JMP: // Indirect branch.
+ return true;
+ default: return false;
+ }
+}
+bool AlphaInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 2 && "Invalid Alpha branch opcode!");
+ Cond[0].setImm(AlphaRevCondCode(Cond[0].getImm()));
+ return false;
+}
+
diff --git a/lib/Target/Alpha/AlphaInstrInfo.h b/lib/Target/Alpha/AlphaInstrInfo.h
new file mode 100644
index 0000000..182aa32
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.h
@@ -0,0 +1,97 @@
+//===- AlphaInstrInfo.h - Alpha Instruction Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHAINSTRUCTIONINFO_H
+#define ALPHAINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "AlphaRegisterInfo.h"
+
+namespace llvm {
+
+class AlphaInstrInfo : public TargetInstrInfoImpl {
+ const AlphaRegisterInfo RI;
+public:
+ AlphaInstrInfo();
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const AlphaRegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ return 0;
+ }
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+ bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
new file mode 100644
index 0000000..e73bdf9
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -0,0 +1,1137 @@
+//===- AlphaInstrInfo.td - The Alpha Instruction Set -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+include "AlphaInstrFormats.td"
+
+//********************
+//Custom DAG Nodes
+//********************
+
+def SDTFPUnaryOpUnC : SDTypeProfile<1, 1, [
+ SDTCisFP<1>, SDTCisFP<0>
+]>;
+def Alpha_cvtqt : SDNode<"AlphaISD::CVTQT_", SDTFPUnaryOpUnC, []>;
+def Alpha_cvtqs : SDNode<"AlphaISD::CVTQS_", SDTFPUnaryOpUnC, []>;
+def Alpha_cvttq : SDNode<"AlphaISD::CVTTQ_" , SDTFPUnaryOp, []>;
+def Alpha_gprello : SDNode<"AlphaISD::GPRelLo", SDTIntBinOp, []>;
+def Alpha_gprelhi : SDNode<"AlphaISD::GPRelHi", SDTIntBinOp, []>;
+def Alpha_rellit : SDNode<"AlphaISD::RelLit", SDTIntBinOp, [SDNPMayLoad]>;
+
+def retflag : SDNode<"AlphaISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_AlphaCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i64> ]>;
+def SDT_AlphaCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i64>,
+ SDTCisVT<1, i64> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AlphaCallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AlphaCallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+//********************
+//Patterns for matching
+//********************
+def invX : SDNodeXForm<imm, [{ //invert
+ return getI64Imm(~N->getZExtValue());
+}]>;
+def negX : SDNodeXForm<imm, [{ //negate
+ return getI64Imm(~N->getZExtValue() + 1);
+}]>;
+def SExt32 : SDNodeXForm<imm, [{ //sign extend int to long
+ return getI64Imm(((int64_t)N->getZExtValue() << 32) >> 32);
+}]>;
+def SExt16 : SDNodeXForm<imm, [{ //sign extend short to long
+ return getI64Imm(((int64_t)N->getZExtValue() << 48) >> 48);
+}]>;
+def LL16 : SDNodeXForm<imm, [{ //lda part of constant
+ return getI64Imm(get_lda16(N->getZExtValue()));
+}]>;
+def LH16 : SDNodeXForm<imm, [{ //ldah part of constant (or more if too big)
+ return getI64Imm(get_ldah16(N->getZExtValue()));
+}]>;
+def iZAPX : SDNodeXForm<and, [{ // get imm to ZAPi
+ ConstantSDNode *RHS = cast<ConstantSDNode>(N->getOperand(1));
+ return getI64Imm(get_zapImm(SDValue(), RHS->getZExtValue()));
+}]>;
+def nearP2X : SDNodeXForm<imm, [{
+ return getI64Imm(Log2_64(getNearPower2((uint64_t)N->getZExtValue())));
+}]>;
+def nearP2RemX : SDNodeXForm<imm, [{
+ uint64_t x =
+ abs64(N->getZExtValue() - getNearPower2((uint64_t)N->getZExtValue()));
+ return getI64Imm(Log2_64(x));
+}]>;
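+
+// Worked examples: negX maps C to ~C + 1 == -C (two's-complement
+// negation), and SExt16 maps 0xFFFF to ((int64_t)0xFFFF << 48) >> 48
+// == -1, i.e. the low 16 bits reinterpreted as a signed value and
+// widened to 64 bits.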
+
+def immUExt8 : PatLeaf<(imm), [{ //imm fits in 8 bit zero extended field
+ return (uint64_t)N->getZExtValue() == (uint8_t)N->getZExtValue();
+}]>;
+def immUExt8inv : PatLeaf<(imm), [{ //inverted imm fits in 8 bit zero extended field
+ return (uint64_t)~N->getZExtValue() == (uint8_t)~N->getZExtValue();
+}], invX>;
+def immUExt8neg : PatLeaf<(imm), [{ //negated imm fits in 8 bit zero extended field
+ return ((uint64_t)~N->getZExtValue() + 1) ==
+ (uint8_t)((uint64_t)~N->getZExtValue() + 1);
+}], negX>;
+def immSExt16 : PatLeaf<(imm), [{ //imm fits in 16 bit sign extended field
+ return ((int64_t)N->getZExtValue() << 48) >> 48 ==
+ (int64_t)N->getZExtValue();
+}]>;
+def immSExt16int : PatLeaf<(imm), [{ //(int)imm fits in a 16 bit sign extended field
+ return ((int64_t)N->getZExtValue() << 48) >> 48 ==
+ ((int64_t)N->getZExtValue() << 32) >> 32;
+}], SExt16>;
+
+def zappat : PatFrag<(ops node:$LHS), (and node:$LHS, imm:$L), [{
+ ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS) return 0;
+ uint64_t build = get_zapImm(N->getOperand(0), (uint64_t)RHS->getZExtValue());
+ return build != 0;
+}]>;
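+
+// zapnot keeps the bytes of $RA selected by an 8-bit mask and zeroes the
+// rest, so an AND whose 64-bit mask consists of whole 0x00/0xFF bytes can
+// be matched here; get_zapImm (a helper in the backend's C++ ISel code)
+// returns that byte mask, or 0 when the constant is not expressible.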
+
+def immFPZ : PatLeaf<(fpimm), [{ //the only fpconstant nodes are +/- 0.0
+ (void)N; // silence warning.
+ return true;
+}]>;
+
+def immRem1 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),1,0);}]>;
+def immRem2 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),2,0);}]>;
+def immRem3 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),3,0);}]>;
+def immRem4 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),4,0);}]>;
+def immRem5 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),5,0);}]>;
+def immRem1n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),1,1);}]>;
+def immRem2n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),2,1);}]>;
+def immRem3n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),3,1);}]>;
+def immRem4n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),4,1);}]>;
+def immRem5n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),5,1);}]>;
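+
+// These feed the multiply-by-constant strength reduction below.
+// chkRemNearPower2 is a helper in the backend's C++ ISel code; it is
+// assumed here to test whether the immediate lies the given distance from
+// the nearest power of two, with the final argument selecting the negated
+// direction.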
+
+def immRemP2n : PatLeaf<(imm), [{
+ return isPowerOf2_64(getNearPower2((uint64_t)N->getZExtValue()) -
+ N->getZExtValue());
+}]>;
+def immRemP2 : PatLeaf<(imm), [{
+ return isPowerOf2_64(N->getZExtValue() -
+ getNearPower2((uint64_t)N->getZExtValue()));
+}]>;
+def immUExt8ME : PatLeaf<(imm), [{ //use this imm for mulqi
+ int64_t d = abs64((int64_t)N->getZExtValue() -
+ (int64_t)getNearPower2((uint64_t)N->getZExtValue()));
+ if (isPowerOf2_64(d)) return false;
+ switch (d) {
+ case 1: case 3: case 5: return false;
+ default: return (uint64_t)N->getZExtValue() == (uint8_t)N->getZExtValue();
+ };
+}]>;
+
+def intop : PatFrag<(ops node:$op), (sext_inreg node:$op, i32)>;
+def add4 : PatFrag<(ops node:$op1, node:$op2),
+ (add (shl node:$op1, 2), node:$op2)>;
+def sub4 : PatFrag<(ops node:$op1, node:$op2),
+ (sub (shl node:$op1, 2), node:$op2)>;
+def add8 : PatFrag<(ops node:$op1, node:$op2),
+ (add (shl node:$op1, 3), node:$op2)>;
+def sub8 : PatFrag<(ops node:$op1, node:$op2),
+ (sub (shl node:$op1, 3), node:$op2)>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class CmpOpFrag<dag res> : PatFrag<(ops node:$R), res>;
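+
+// BinOpFrag and CmpOpFrag are shorthand for pattern fragments over the
+// fixed operand names $LHS/$RHS and $R used by the multiclasses below.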
+
+//Pseudo ops for selection
+
+def WTF : PseudoInstAlpha<(outs), (ins variable_ops), "#wtf", [], s_pseudo>;
+
+let hasCtrlDep = 1, Defs = [R30], Uses = [R30] in {
+def ADJUSTSTACKUP : PseudoInstAlpha<(outs), (ins s64imm:$amt),
+ "; ADJUP $amt",
+ [(callseq_start timm:$amt)], s_pseudo>;
+def ADJUSTSTACKDOWN : PseudoInstAlpha<(outs), (ins s64imm:$amt1, s64imm:$amt2),
+ "; ADJDOWN $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)], s_pseudo>;
+}
+
+def ALTENT : PseudoInstAlpha<(outs), (ins s64imm:$TARGET), "$$$TARGET..ng:\n", [], s_pseudo>;
+def PCLABEL : PseudoInstAlpha<(outs), (ins s64imm:$num), "PCMARKER_$num:\n",[], s_pseudo>;
+def MEMLABEL : PseudoInstAlpha<(outs), (ins s64imm:$i, s64imm:$j, s64imm:$k, s64imm:$m),
+ "LSMARKER$$$i$$$j$$$k$$$m:", [], s_pseudo>;
+
+
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+def CAS32 : PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$cmp, GPRC:$swp), "",
+ [(set GPRC:$dst, (atomic_cmp_swap_32 GPRC:$ptr, GPRC:$cmp, GPRC:$swp))], s_pseudo>;
+def CAS64 : PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$cmp, GPRC:$swp), "",
+ [(set GPRC:$dst, (atomic_cmp_swap_64 GPRC:$ptr, GPRC:$cmp, GPRC:$swp))], s_pseudo>;
+
+def LAS32 : PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$swp), "",
+ [(set GPRC:$dst, (atomic_load_add_32 GPRC:$ptr, GPRC:$swp))], s_pseudo>;
+def LAS64 :PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$swp), "",
+ [(set GPRC:$dst, (atomic_load_add_64 GPRC:$ptr, GPRC:$swp))], s_pseudo>;
+
+def SWAP32 : PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$swp), "",
+ [(set GPRC:$dst, (atomic_swap_32 GPRC:$ptr, GPRC:$swp))], s_pseudo>;
+def SWAP64 :PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$swp), "",
+ [(set GPRC:$dst, (atomic_swap_64 GPRC:$ptr, GPRC:$swp))], s_pseudo>;
+}
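+
+// These atomic pseudos are expanded after selection, by the target's
+// custom inserter, into load-locked/store-conditional retry loops built
+// from ldl_l/ldq_l and stl_c/stq_c.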
+
+//***********************
+//Real instructions
+//***********************
+
+//Operation Form:
+
+//conditional moves, int
+
+multiclass cmov_inst<bits<7> fun, string asmstr, PatFrag OpNode> {
+def r : OForm4<0x11, fun, !strconcat(asmstr, " $RCOND,$RTRUE,$RDEST"),
+ [(set GPRC:$RDEST, (select (OpNode GPRC:$RCOND), GPRC:$RTRUE, GPRC:$RFALSE))], s_cmov>;
+def i : OForm4L<0x11, fun, !strconcat(asmstr, " $RCOND,$RTRUE,$RDEST"),
+ [(set GPRC:$RDEST, (select (OpNode GPRC:$RCOND), immUExt8:$RTRUE, GPRC:$RFALSE))], s_cmov>;
+}
+
+defm CMOVEQ : cmov_inst<0x24, "cmoveq", CmpOpFrag<(seteq node:$R, 0)>>;
+defm CMOVNE : cmov_inst<0x26, "cmovne", CmpOpFrag<(setne node:$R, 0)>>;
+defm CMOVLT : cmov_inst<0x44, "cmovlt", CmpOpFrag<(setlt node:$R, 0)>>;
+defm CMOVLE : cmov_inst<0x64, "cmovle", CmpOpFrag<(setle node:$R, 0)>>;
+defm CMOVGT : cmov_inst<0x66, "cmovgt", CmpOpFrag<(setgt node:$R, 0)>>;
+defm CMOVGE : cmov_inst<0x46, "cmovge", CmpOpFrag<(setge node:$R, 0)>>;
+defm CMOVLBC : cmov_inst<0x16, "cmovlbc", CmpOpFrag<(xor node:$R, 1)>>;
+defm CMOVLBS : cmov_inst<0x14, "cmovlbs", CmpOpFrag<(and node:$R, 1)>>;
+
+//General pattern for cmov
+def : Pat<(select GPRC:$which, GPRC:$src1, GPRC:$src2),
+ (CMOVNEr GPRC:$src2, GPRC:$src1, GPRC:$which)>;
+def : Pat<(select GPRC:$which, GPRC:$src1, immUExt8:$src2),
+ (CMOVEQi GPRC:$src1, immUExt8:$src2, GPRC:$which)>;
+
+//Invert sense when we can for constants:
+def : Pat<(select (setne GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+ (CMOVEQi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setgt GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+ (CMOVLEi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setge GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+ (CMOVLTi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setlt GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+ (CMOVGEi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setle GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+ (CMOVGTi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+
+multiclass all_inst<bits<6> opc, bits<7> funl, bits<7> funq,
+ string asmstr, PatFrag OpNode, InstrItinClass itin> {
+ def Lr : OForm< opc, funl, !strconcat(asmstr, "l $RA,$RB,$RC"),
+ [(set GPRC:$RC, (intop (OpNode GPRC:$RA, GPRC:$RB)))], itin>;
+ def Li : OFormL<opc, funl, !strconcat(asmstr, "l $RA,$L,$RC"),
+ [(set GPRC:$RC, (intop (OpNode GPRC:$RA, immUExt8:$L)))], itin>;
+ def Qr : OForm< opc, funq, !strconcat(asmstr, "q $RA,$RB,$RC"),
+ [(set GPRC:$RC, (OpNode GPRC:$RA, GPRC:$RB))], itin>;
+ def Qi : OFormL<opc, funq, !strconcat(asmstr, "q $RA,$L,$RC"),
+ [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8:$L))], itin>;
+}
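+
+// Each all_inst expands to four instructions: 32-bit ("l") and 64-bit
+// ("q") forms, each in register (r) and 8-bit-literal (i) variants; e.g.
+// defm ADD below yields ADDLr, ADDLi, ADDQr and ADDQi.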
+
+defm MUL : all_inst<0x13, 0x00, 0x20, "mul", BinOpFrag<(mul node:$LHS, node:$RHS)>, s_imul>;
+defm ADD : all_inst<0x10, 0x00, 0x20, "add", BinOpFrag<(add node:$LHS, node:$RHS)>, s_iadd>;
+defm S4ADD : all_inst<0x10, 0x02, 0x22, "s4add", add4, s_iadd>;
+defm S8ADD : all_inst<0x10, 0x12, 0x32, "s8add", add8, s_iadd>;
+defm S4SUB : all_inst<0x10, 0x0B, 0x2B, "s4sub", sub4, s_iadd>;
+defm S8SUB : all_inst<0x10, 0x1B, 0x3B, "s8sub", sub8, s_iadd>;
+defm SUB : all_inst<0x10, 0x09, 0x29, "sub", BinOpFrag<(sub node:$LHS, node:$RHS)>, s_iadd>;
+//Constant cases: legalize rewrites (sub x, C) into (add x, ~C + 1), so
+//match the negated-immediate adds back to the SUB forms.
+def : Pat<(intop (add GPRC:$RA, immUExt8neg:$L)), (SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add GPRC:$RA, immUExt8neg:$L), (SUBQi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(intop (add4 GPRC:$RA, immUExt8neg:$L)), (S4SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add4 GPRC:$RA, immUExt8neg:$L), (S4SUBQi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(intop (add8 GPRC:$RA, immUExt8neg:$L)), (S8SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add8 GPRC:$RA, immUExt8neg:$L), (S8SUBQi GPRC:$RA, immUExt8neg:$L)>;
+
+multiclass log_inst<bits<6> opc, bits<7> fun, string asmstr, SDNode OpNode, InstrItinClass itin> {
+def r : OForm<opc, fun, !strconcat(asmstr, " $RA,$RB,$RC"),
+ [(set GPRC:$RC, (OpNode GPRC:$RA, GPRC:$RB))], itin>;
+def i : OFormL<opc, fun, !strconcat(asmstr, " $RA,$L,$RC"),
+ [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8:$L))], itin>;
+}
+multiclass inv_inst<bits<6> opc, bits<7> fun, string asmstr, SDNode OpNode, InstrItinClass itin> {
+def r : OForm<opc, fun, !strconcat(asmstr, " $RA,$RB,$RC"),
+ [(set GPRC:$RC, (OpNode GPRC:$RA, (not GPRC:$RB)))], itin>;
+def i : OFormL<opc, fun, !strconcat(asmstr, " $RA,$L,$RC"),
+ [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8inv:$L))], itin>;
+}
+
+defm AND : log_inst<0x11, 0x00, "and", and, s_ilog>;
+defm BIC : inv_inst<0x11, 0x08, "bic", and, s_ilog>;
+defm BIS : log_inst<0x11, 0x20, "bis", or, s_ilog>;
+defm ORNOT : inv_inst<0x11, 0x28, "ornot", or, s_ilog>;
+defm XOR : log_inst<0x11, 0x40, "xor", xor, s_ilog>;
+defm EQV : inv_inst<0x11, 0x48, "eqv", xor, s_ilog>;
+
+defm SL : log_inst<0x12, 0x39, "sll", shl, s_ishf>;
+defm SRA : log_inst<0x12, 0x3c, "sra", sra, s_ishf>;
+defm SRL : log_inst<0x12, 0x34, "srl", srl, s_ishf>;
+defm UMULH : log_inst<0x13, 0x30, "umulh", mulhu, s_imul>;
+
+def CTLZ : OForm2<0x1C, 0x32, "CTLZ $RB,$RC",
+ [(set GPRC:$RC, (ctlz GPRC:$RB))], s_imisc>;
+def CTPOP : OForm2<0x1C, 0x30, "CTPOP $RB,$RC",
+ [(set GPRC:$RC, (ctpop GPRC:$RB))], s_imisc>;
+def CTTZ : OForm2<0x1C, 0x33, "CTTZ $RB,$RC",
+ [(set GPRC:$RC, (cttz GPRC:$RB))], s_imisc>;
+def EXTBL : OForm< 0x12, 0x06, "EXTBL $RA,$RB,$RC",
+ [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 255))], s_ishf>;
+def EXTWL : OForm< 0x12, 0x16, "EXTWL $RA,$RB,$RC",
+ [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 65535))], s_ishf>;
+def EXTLL : OForm< 0x12, 0x26, "EXTLL $RA,$RB,$RC",
+ [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 4294967295))], s_ishf>;
+def SEXTB : OForm2<0x1C, 0x00, "sextb $RB,$RC",
+ [(set GPRC:$RC, (sext_inreg GPRC:$RB, i8))], s_ishf>;
+def SEXTW : OForm2<0x1C, 0x01, "sextw $RB,$RC",
+ [(set GPRC:$RC, (sext_inreg GPRC:$RB, i16))], s_ishf>;
+
+//def EXTBLi : OFormL<0x12, 0x06, "EXTBL $RA,$L,$RC", []>; //Extract byte low
+//def EXTLH : OForm< 0x12, 0x6A, "EXTLH $RA,$RB,$RC", []>; //Extract longword high
+//def EXTLHi : OFormL<0x12, 0x6A, "EXTLH $RA,$L,$RC", []>; //Extract longword high
+//def EXTLLi : OFormL<0x12, 0x26, "EXTLL $RA,$L,$RC", []>; //Extract longword low
+//def EXTQH : OForm< 0x12, 0x7A, "EXTQH $RA,$RB,$RC", []>; //Extract quadword high
+//def EXTQHi : OFormL<0x12, 0x7A, "EXTQH $RA,$L,$RC", []>; //Extract quadword high
+//def EXTQ : OForm< 0x12, 0x36, "EXTQ $RA,$RB,$RC", []>; //Extract quadword low
+//def EXTQi : OFormL<0x12, 0x36, "EXTQ $RA,$L,$RC", []>; //Extract quadword low
+//def EXTWH : OForm< 0x12, 0x5A, "EXTWH $RA,$RB,$RC", []>; //Extract word high
+//def EXTWHi : OFormL<0x12, 0x5A, "EXTWH $RA,$L,$RC", []>; //Extract word high
+//def EXTWLi : OFormL<0x12, 0x16, "EXTWL $RA,$L,$RC", []>; //Extract word low
+
+//def INSBL : OForm< 0x12, 0x0B, "INSBL $RA,$RB,$RC", []>; //Insert byte low
+//def INSBLi : OFormL<0x12, 0x0B, "INSBL $RA,$L,$RC", []>; //Insert byte low
+//def INSLH : OForm< 0x12, 0x67, "INSLH $RA,$RB,$RC", []>; //Insert longword high
+//def INSLHi : OFormL<0x12, 0x67, "INSLH $RA,$L,$RC", []>; //Insert longword high
+//def INSLL : OForm< 0x12, 0x2B, "INSLL $RA,$RB,$RC", []>; //Insert longword low
+//def INSLLi : OFormL<0x12, 0x2B, "INSLL $RA,$L,$RC", []>; //Insert longword low
+//def INSQH : OForm< 0x12, 0x77, "INSQH $RA,$RB,$RC", []>; //Insert quadword high
+//def INSQHi : OFormL<0x12, 0x77, "INSQH $RA,$L,$RC", []>; //Insert quadword high
+//def INSQL : OForm< 0x12, 0x3B, "INSQL $RA,$RB,$RC", []>; //Insert quadword low
+//def INSQLi : OFormL<0x12, 0x3B, "INSQL $RA,$L,$RC", []>; //Insert quadword low
+//def INSWH : OForm< 0x12, 0x57, "INSWH $RA,$RB,$RC", []>; //Insert word high
+//def INSWHi : OFormL<0x12, 0x57, "INSWH $RA,$L,$RC", []>; //Insert word high
+//def INSWL : OForm< 0x12, 0x1B, "INSWL $RA,$RB,$RC", []>; //Insert word low
+//def INSWLi : OFormL<0x12, 0x1B, "INSWL $RA,$L,$RC", []>; //Insert word low
+
+//def MSKBL : OForm< 0x12, 0x02, "MSKBL $RA,$RB,$RC", []>; //Mask byte low
+//def MSKBLi : OFormL<0x12, 0x02, "MSKBL $RA,$L,$RC", []>; //Mask byte low
+//def MSKLH : OForm< 0x12, 0x62, "MSKLH $RA,$RB,$RC", []>; //Mask longword high
+//def MSKLHi : OFormL<0x12, 0x62, "MSKLH $RA,$L,$RC", []>; //Mask longword high
+//def MSKLL : OForm< 0x12, 0x22, "MSKLL $RA,$RB,$RC", []>; //Mask longword low
+//def MSKLLi : OFormL<0x12, 0x22, "MSKLL $RA,$L,$RC", []>; //Mask longword low
+//def MSKQH : OForm< 0x12, 0x72, "MSKQH $RA,$RB,$RC", []>; //Mask quadword high
+//def MSKQHi : OFormL<0x12, 0x72, "MSKQH $RA,$L,$RC", []>; //Mask quadword high
+//def MSKQL : OForm< 0x12, 0x32, "MSKQL $RA,$RB,$RC", []>; //Mask quadword low
+//def MSKQLi : OFormL<0x12, 0x32, "MSKQL $RA,$L,$RC", []>; //Mask quadword low
+//def MSKWH : OForm< 0x12, 0x52, "MSKWH $RA,$RB,$RC", []>; //Mask word high
+//def MSKWHi : OFormL<0x12, 0x52, "MSKWH $RA,$L,$RC", []>; //Mask word high
+//def MSKWL : OForm< 0x12, 0x12, "MSKWL $RA,$RB,$RC", []>; //Mask word low
+//def MSKWLi : OFormL<0x12, 0x12, "MSKWL $RA,$L,$RC", []>; //Mask word low
+
+def ZAPNOTi : OFormL<0x12, 0x31, "zapnot $RA,$L,$RC", [], s_ishf>;
+
+// Define the pattern that produces ZAPNOTi.
+def : Pat<(zappat:$imm GPRC:$RA),
+ (ZAPNOTi GPRC:$RA, (iZAPX GPRC:$imm))>;
+
+
+//Comparison, int
+//cmpbge compares all eight byte lanes in parallel, so using it for a
+//single low-byte comparison wastes most of its power, but it still beats
+//masking and comparing by hand.
+def CMPBGE : OForm< 0x10, 0x0F, "cmpbge $RA,$RB,$RC",
+ [(set GPRC:$RC, (setuge (and GPRC:$RA, 255), (and GPRC:$RB, 255)))], s_ilog>;
+def CMPBGEi : OFormL<0x10, 0x0F, "cmpbge $RA,$L,$RC",
+ [(set GPRC:$RC, (setuge (and GPRC:$RA, 255), immUExt8:$L))], s_ilog>;
+def CMPEQ : OForm< 0x10, 0x2D, "cmpeq $RA,$RB,$RC",
+ [(set GPRC:$RC, (seteq GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPEQi : OFormL<0x10, 0x2D, "cmpeq $RA,$L,$RC",
+ [(set GPRC:$RC, (seteq GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPLE : OForm< 0x10, 0x6D, "cmple $RA,$RB,$RC",
+ [(set GPRC:$RC, (setle GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPLEi : OFormL<0x10, 0x6D, "cmple $RA,$L,$RC",
+ [(set GPRC:$RC, (setle GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPLT : OForm< 0x10, 0x4D, "cmplt $RA,$RB,$RC",
+ [(set GPRC:$RC, (setlt GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPLTi : OFormL<0x10, 0x4D, "cmplt $RA,$L,$RC",
+ [(set GPRC:$RC, (setlt GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPULE : OForm< 0x10, 0x3D, "cmpule $RA,$RB,$RC",
+ [(set GPRC:$RC, (setule GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPULEi : OFormL<0x10, 0x3D, "cmpule $RA,$L,$RC",
+ [(set GPRC:$RC, (setule GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPULT : OForm< 0x10, 0x1D, "cmpult $RA,$RB,$RC",
+ [(set GPRC:$RC, (setult GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPULTi : OFormL<0x10, 0x1D, "cmpult $RA,$L,$RC",
+ [(set GPRC:$RC, (setult GPRC:$RA, immUExt8:$L))], s_iadd>;
+
+//Patterns for unsupported int comparisons
+def : Pat<(setueq GPRC:$X, GPRC:$Y), (CMPEQ GPRC:$X, GPRC:$Y)>;
+def : Pat<(setueq GPRC:$X, immUExt8:$Y), (CMPEQi GPRC:$X, immUExt8:$Y)>;
+
+def : Pat<(setugt GPRC:$X, GPRC:$Y), (CMPULT GPRC:$Y, GPRC:$X)>;
+def : Pat<(setugt immUExt8:$X, GPRC:$Y), (CMPULTi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setuge GPRC:$X, GPRC:$Y), (CMPULE GPRC:$Y, GPRC:$X)>;
+def : Pat<(setuge immUExt8:$X, GPRC:$Y), (CMPULEi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setgt GPRC:$X, GPRC:$Y), (CMPLT GPRC:$Y, GPRC:$X)>;
+def : Pat<(setgt immUExt8:$X, GPRC:$Y), (CMPLTi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setge GPRC:$X, GPRC:$Y), (CMPLE GPRC:$Y, GPRC:$X)>;
+def : Pat<(setge immUExt8:$X, GPRC:$Y), (CMPLEi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setne GPRC:$X, GPRC:$Y), (CMPEQi (CMPEQ GPRC:$X, GPRC:$Y), 0)>;
+def : Pat<(setne GPRC:$X, immUExt8:$Y), (CMPEQi (CMPEQi GPRC:$X, immUExt8:$Y), 0)>;
+
+def : Pat<(setune GPRC:$X, GPRC:$Y), (CMPEQi (CMPEQ GPRC:$X, GPRC:$Y), 0)>;
+def : Pat<(setune GPRC:$X, immUExt8:$Y), (CMPEQi (CMPEQi GPRC:$X, immUExt8:$Y), 0)>;
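+
+// Alpha has no cmpne: inequality is computed by testing the cmpeq result
+// against zero, hence the nested CMPEQ/CMPEQi patterns above.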
+
+
+let isReturn = 1, isTerminator = 1, Ra = 31, Rb = 26, disp = 1, Uses = [R26] in {
+ def RETDAG : MbrForm< 0x1A, 0x02, (ops), "ret $$31,($$26),1", s_jsr>; //Return from subroutine
+ def RETDAGp : MbrpForm< 0x1A, 0x02, (ops), "ret $$31,($$26),1", [(retflag)], s_jsr>; //Return from subroutine
+}
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1, Ra = 31, disp = 0 in
+def JMP : MbrpForm< 0x1A, 0x00, (ops GPRC:$RS), "jmp $$31,($RS),0",
+ [(brind GPRC:$RS)], s_jsr>; //Jump
+
+let isCall = 1, Ra = 26,
+ Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19,
+ R20, R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ F0, F1,
+ F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+ F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30], Uses = [R29] in {
+ def BSR : BFormD<0x34, "bsr $$26,$$$DISP..ng", [], s_jsr>; //Branch to subroutine
+}
+let isCall = 1, Ra = 26, Rb = 27, disp = 0,
+ Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19,
+ R20, R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ F0, F1,
+ F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+ F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30], Uses = [R27, R29] in {
+ def JSR : MbrForm< 0x1A, 0x01, (ops ), "jsr $$26,($$27),0", s_jsr>; //Jump to subroutine
+}
+
+let isCall = 1, Ra = 23, Rb = 27, disp = 0,
+ Defs = [R23, R24, R25, R27, R28], Uses = [R24, R25, R27] in
+ def JSRs : MbrForm< 0x1A, 0x01, (ops ), "jsr $$23,($$27),0", s_jsr>; //Jump to div or rem
+
+
+def JSR_COROUTINE : MbrForm< 0x1A, 0x03, (ops GPRC:$RD, GPRC:$RS, s14imm:$DISP), "jsr_coroutine $RD,($RS),$DISP", s_jsr>; //Jump to subroutine return
+
+
+let OutOperandList = (ops GPRC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in {
+def LDQ : MForm<0x29, 1, "ldq $RA,$DISP($RB)",
+ [(set GPRC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDQr : MForm<0x29, 1, "ldq $RA,$DISP($RB)\t\t!gprellow",
+ [(set GPRC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDL : MForm<0x28, 1, "ldl $RA,$DISP($RB)",
+ [(set GPRC:$RA, (sextloadi32 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDLr : MForm<0x28, 1, "ldl $RA,$DISP($RB)\t\t!gprellow",
+ [(set GPRC:$RA, (sextloadi32 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDBU : MForm<0x0A, 1, "ldbu $RA,$DISP($RB)",
+ [(set GPRC:$RA, (zextloadi8 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDBUr : MForm<0x0A, 1, "ldbu $RA,$DISP($RB)\t\t!gprellow",
+ [(set GPRC:$RA, (zextloadi8 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDWU : MForm<0x0C, 1, "ldwu $RA,$DISP($RB)",
+ [(set GPRC:$RA, (zextloadi16 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDWUr : MForm<0x0C, 1, "ldwu $RA,$DISP($RB)\t\t!gprellow",
+ [(set GPRC:$RA, (zextloadi16 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+}
+
+
+let OutOperandList = (ops), InOperandList = (ops GPRC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def STB : MForm<0x0E, 0, "stb $RA,$DISP($RB)",
+ [(truncstorei8 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STBr : MForm<0x0E, 0, "stb $RA,$DISP($RB)\t\t!gprellow",
+ [(truncstorei8 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STW : MForm<0x0D, 0, "stw $RA,$DISP($RB)",
+ [(truncstorei16 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STWr : MForm<0x0D, 0, "stw $RA,$DISP($RB)\t\t!gprellow",
+ [(truncstorei16 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STL : MForm<0x2C, 0, "stl $RA,$DISP($RB)",
+ [(truncstorei32 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STLr : MForm<0x2C, 0, "stl $RA,$DISP($RB)\t\t!gprellow",
+ [(truncstorei32 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STQ : MForm<0x2D, 0, "stq $RA,$DISP($RB)",
+ [(store GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STQr : MForm<0x2D, 0, "stq $RA,$DISP($RB)\t\t!gprellow",
+ [(store GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+}
+
+//Load address
+let OutOperandList = (ops GPRC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in {
+def LDA : MForm<0x08, 0, "lda $RA,$DISP($RB)",
+ [(set GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_lda>;
+def LDAr : MForm<0x08, 0, "lda $RA,$DISP($RB)\t\t!gprellow",
+ [(set GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_lda>; //Load address
+def LDAH : MForm<0x09, 0, "ldah $RA,$DISP($RB)",
+ [], s_lda>; //Load address high
+def LDAHr : MForm<0x09, 0, "ldah $RA,$DISP($RB)\t\t!gprelhigh",
+ [(set GPRC:$RA, (Alpha_gprelhi tglobaladdr:$DISP, GPRC:$RB))], s_lda>; //Load address high
+}
+
+let OutOperandList = (ops), InOperandList = (ops F4RC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def STS : MForm<0x26, 0, "sts $RA,$DISP($RB)",
+ [(store F4RC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_fst>;
+def STSr : MForm<0x26, 0, "sts $RA,$DISP($RB)\t\t!gprellow",
+ [(store F4RC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_fst>;
+}
+let OutOperandList = (ops F4RC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in {
+def LDS : MForm<0x22, 1, "lds $RA,$DISP($RB)",
+ [(set F4RC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_fld>;
+def LDSr : MForm<0x22, 1, "lds $RA,$DISP($RB)\t\t!gprellow",
+ [(set F4RC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_fld>;
+}
+let OutOperandList = (ops), InOperandList = (ops F8RC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def STT : MForm<0x27, 0, "stt $RA,$DISP($RB)",
+ [(store F8RC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_fst>;
+def STTr : MForm<0x27, 0, "stt $RA,$DISP($RB)\t\t!gprellow",
+ [(store F8RC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_fst>;
+}
+let OutOperandList = (ops F8RC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in {
+def LDT : MForm<0x23, 1, "ldt $RA,$DISP($RB)",
+ [(set F8RC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_fld>;
+def LDTr : MForm<0x23, 1, "ldt $RA,$DISP($RB)\t\t!gprellow",
+ [(set F8RC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_fld>;
+}
+
+
+//Constant-pool relocations
+def : Pat<(i64 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+ (LDQr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (sextloadi32 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+ (LDLr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (zextloadi8 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+ (LDBUr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (zextloadi16 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+ (LDWUr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprello tconstpool:$DISP, GPRC:$RB)),
+ (LDAr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprelhi tconstpool:$DISP, GPRC:$RB)),
+ (LDAHr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(f32 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+ (LDSr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(f64 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+ (LDTr tconstpool:$DISP, GPRC:$RB)>;
+
+//Jump-table relocations
+def : Pat<(i64 (Alpha_gprelhi tjumptable:$DISP, GPRC:$RB)),
+ (LDAHr tjumptable:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprello tjumptable:$DISP, GPRC:$RB)),
+ (LDAr tjumptable:$DISP, GPRC:$RB)>;
+
+
+//Any-extending load patterns: extload leaves the upper bits unspecified,
+//so the existing zero/sign-extending loads serve.
+def : Pat<(i64 (extloadi8 (add GPRC:$RB, immSExt16:$DISP))),
+ (LDBU immSExt16:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (extloadi16 (add GPRC:$RB, immSExt16:$DISP))),
+ (LDWU immSExt16:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (extloadi32 (add GPRC:$RB, immSExt16:$DISP))),
+ (LDL immSExt16:$DISP, GPRC:$RB)>;
+
+//Zero-displacement patterns
+def : Pat<(i64 (load GPRC:$addr)),
+ (LDQ 0, GPRC:$addr)>;
+def : Pat<(f64 (load GPRC:$addr)),
+ (LDT 0, GPRC:$addr)>;
+def : Pat<(f32 (load GPRC:$addr)),
+ (LDS 0, GPRC:$addr)>;
+def : Pat<(i64 (sextloadi32 GPRC:$addr)),
+ (LDL 0, GPRC:$addr)>;
+def : Pat<(i64 (zextloadi16 GPRC:$addr)),
+ (LDWU 0, GPRC:$addr)>;
+def : Pat<(i64 (zextloadi8 GPRC:$addr)),
+ (LDBU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi8 GPRC:$addr)),
+ (LDBU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi16 GPRC:$addr)),
+ (LDWU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi32 GPRC:$addr)),
+ (LDL 0, GPRC:$addr)>;
+
+def : Pat<(store GPRC:$DATA, GPRC:$addr),
+ (STQ GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(store F8RC:$DATA, GPRC:$addr),
+ (STT F8RC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(store F4RC:$DATA, GPRC:$addr),
+ (STS F4RC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei32 GPRC:$DATA, GPRC:$addr),
+ (STL GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei16 GPRC:$DATA, GPRC:$addr),
+ (STW GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei8 GPRC:$DATA, GPRC:$addr),
+ (STB GPRC:$DATA, 0, GPRC:$addr)>;
+
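+//A bare register address is just the displacement form with disp == 0, so
+//the patterns above reuse the displacement instructions rather than
+//needing separate register-indirect loads and stores.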
+
+//Load address, relocated gpdisp form
+let OutOperandList = (ops GPRC:$RA),
+ InOperandList = (ops s16imm:$DISP, GPRC:$RB, s16imm:$NUM),
+ mayLoad = 1 in {
+def LDAg : MForm<0x08, 1, "lda $RA,0($RB)\t\t!gpdisp!$NUM", [], s_lda>; //Load address
+def LDAHg : MForm<0x09, 1, "ldah $RA,0($RB)\t\t!gpdisp!$NUM", [], s_lda>; //Load address
+}
+
+//Load quad, relocated literal form
+let OutOperandList = (ops GPRC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in
+def LDQl : MForm<0x29, 1, "ldq $RA,$DISP($RB)\t\t!literal",
+ [(set GPRC:$RA, (Alpha_rellit tglobaladdr:$DISP, GPRC:$RB))], s_ild>;
+def : Pat<(Alpha_rellit texternalsym:$ext, GPRC:$RB),
+ (LDQl texternalsym:$ext, GPRC:$RB)>;
+
+let OutOperandList = (outs GPRC:$RR),
+ InOperandList = (ins GPRC:$RA, s64imm:$DISP, GPRC:$RB),
+ Constraints = "$RA = $RR",
+ DisableEncoding = "$RR" in {
+def STQ_C : MForm<0x2F, 0, "stq_c $RA,$DISP($RB)", [], s_ist>;
+def STL_C : MForm<0x2E, 0, "stl_c $RA,$DISP($RB)", [], s_ist>;
+}
+let OutOperandList = (ops GPRC:$RA),
+ InOperandList = (ops s64imm:$DISP, GPRC:$RB),
+ mayLoad = 1 in {
+def LDQ_L : MForm<0x2B, 1, "ldq_l $RA,$DISP($RB)", [], s_ild>;
+def LDL_L : MForm<0x2A, 1, "ldl_l $RA,$DISP($RB)", [], s_ild>;
+}
+
+def RPCC : MfcForm<0x18, 0xC000, "rpcc $RA", s_rpcc>; //Read process cycle counter
+def MB : MfcPForm<0x18, 0x4000, "mb", s_imisc>; //memory barrier
+def WMB : MfcPForm<0x18, 0x4400, "wmb", s_imisc>; //write memory barrier
+
+def : Pat<(membarrier (i64 imm:$ll), (i64 imm:$ls), (i64 imm:$sl), (i64 1), (i64 imm:$dev)),
+ (WMB)>;
+def : Pat<(membarrier (i64 imm:$ll), (i64 imm:$ls), (i64 imm:$sl), (i64 imm:$ss), (i64 imm:$dev)),
+ (MB)>;
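+
+//The first pattern is more specific (its store-store operand must be the
+//constant 1), so it is preferred and emits the cheaper write barrier; any
+//other membarrier falls through to a full mb.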
+
+//Basic Floating point ops
+
+//Floats
+
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F4RC:$RB), Fa = 31 in
+def SQRTS : FPForm<0x14, 0x58B, "sqrts/su $RB,$RC",
+ [(set F4RC:$RC, (fsqrt F4RC:$RB))], s_fsqrts>;
+
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F4RC:$RA, F4RC:$RB) in {
+def ADDS : FPForm<0x16, 0x580, "adds/su $RA,$RB,$RC",
+ [(set F4RC:$RC, (fadd F4RC:$RA, F4RC:$RB))], s_fadd>;
+def SUBS : FPForm<0x16, 0x581, "subs/su $RA,$RB,$RC",
+ [(set F4RC:$RC, (fsub F4RC:$RA, F4RC:$RB))], s_fadd>;
+def DIVS : FPForm<0x16, 0x583, "divs/su $RA,$RB,$RC",
+ [(set F4RC:$RC, (fdiv F4RC:$RA, F4RC:$RB))], s_fdivs>;
+def MULS : FPForm<0x16, 0x582, "muls/su $RA,$RB,$RC",
+ [(set F4RC:$RC, (fmul F4RC:$RA, F4RC:$RB))], s_fmul>;
+
+def CPYSS : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+ [(set F4RC:$RC, (fcopysign F4RC:$RB, F4RC:$RA))], s_fadd>;
+def CPYSES : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNS : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+ [(set F4RC:$RC, (fneg (fcopysign F4RC:$RB, F4RC:$RA)))], s_fadd>;
+}
+
+//Doubles
+
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in
+def SQRTT : FPForm<0x14, 0x5AB, "sqrtt/su $RB,$RC",
+ [(set F8RC:$RC, (fsqrt F8RC:$RB))], s_fsqrtt>;
+
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F8RC:$RA, F8RC:$RB) in {
+def ADDT : FPForm<0x16, 0x5A0, "addt/su $RA,$RB,$RC",
+ [(set F8RC:$RC, (fadd F8RC:$RA, F8RC:$RB))], s_fadd>;
+def SUBT : FPForm<0x16, 0x5A1, "subt/su $RA,$RB,$RC",
+ [(set F8RC:$RC, (fsub F8RC:$RA, F8RC:$RB))], s_fadd>;
+def DIVT : FPForm<0x16, 0x5A3, "divt/su $RA,$RB,$RC",
+ [(set F8RC:$RC, (fdiv F8RC:$RA, F8RC:$RB))], s_fdivt>;
+def MULT : FPForm<0x16, 0x5A2, "mult/su $RA,$RB,$RC",
+ [(set F8RC:$RC, (fmul F8RC:$RA, F8RC:$RB))], s_fmul>;
+
+def CPYST : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+ [(set F8RC:$RC, (fcopysign F8RC:$RB, F8RC:$RA))], s_fadd>;
+def CPYSET : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNT : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+ [(set F8RC:$RC, (fneg (fcopysign F8RC:$RB, F8RC:$RA)))], s_fadd>;
+
+def CMPTEQ : FPForm<0x16, 0x5A5, "cmpteq/su $RA,$RB,$RC", [], s_fadd>;
+// [(set F8RC:$RC, (seteq F8RC:$RA, F8RC:$RB))]>;
+def CMPTLE : FPForm<0x16, 0x5A7, "cmptle/su $RA,$RB,$RC", [], s_fadd>;
+// [(set F8RC:$RC, (setle F8RC:$RA, F8RC:$RB))]>;
+def CMPTLT : FPForm<0x16, 0x5A6, "cmptlt/su $RA,$RB,$RC", [], s_fadd>;
+// [(set F8RC:$RC, (setlt F8RC:$RA, F8RC:$RB))]>;
+def CMPTUN : FPForm<0x16, 0x5A4, "cmptun/su $RA,$RB,$RC", [], s_fadd>;
+// [(set F8RC:$RC, (setuo F8RC:$RA, F8RC:$RB))]>;
+}
+
+//More CPYS forms:
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F4RC:$RA, F8RC:$RB) in {
+def CPYSTs : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+ [(set F8RC:$RC, (fcopysign F8RC:$RB, F4RC:$RA))], s_fadd>;
+def CPYSNTs : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+ [(set F8RC:$RC, (fneg (fcopysign F8RC:$RB, F4RC:$RA)))], s_fadd>;
+}
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F8RC:$RA, F4RC:$RB) in {
+def CPYSSt : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+ [(set F4RC:$RC, (fcopysign F4RC:$RB, F8RC:$RA))], s_fadd>;
+def CPYSESt : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNSt : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+ [(set F4RC:$RC, (fneg (fcopysign F4RC:$RB, F8RC:$RA)))], s_fadd>;
+}
+
+//conditional moves, floats
+let OutOperandList = (ops F4RC:$RDEST), InOperandList = (ops F4RC:$RFALSE, F4RC:$RTRUE, F8RC:$RCOND),
+ isTwoAddress = 1 in {
+def FCMOVEQS : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if = zero
+def FCMOVGES : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if >= zero
+def FCMOVGTS : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if > zero
+def FCMOVLES : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if <= zero
+def FCMOVLTS : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; // FCMOVE if < zero
+def FCMOVNES : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if != zero
+}
+//conditional moves, doubles
+let OutOperandList = (ops F8RC:$RDEST), InOperandList = (ops F8RC:$RFALSE, F8RC:$RTRUE, F8RC:$RCOND),
+ isTwoAddress = 1 in {
+def FCMOVEQT : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVGET : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVGTT : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVLET : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVLTT : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVNET : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+}
+
+//misc FP selects
+//Select double
+
+def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setueq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setne F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setone F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setune F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setgt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setogt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setugt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setoge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setuge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setlt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setolt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setult F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setle F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setole F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+ (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+
+//Select single
+def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setueq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setne F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setone F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setune F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setgt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setogt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setugt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setoge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setuge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setlt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setolt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setult F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setle F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setole F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+ (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+
+
+
+let OutOperandList = (ops GPRC:$RC), InOperandList = (ops F4RC:$RA), Fb = 31 in
+def FTOIS : FPForm<0x1C, 0x078, "ftois $RA,$RC",[], s_ftoi>; //Floating to integer move, S_floating
+let OutOperandList = (ops GPRC:$RC), InOperandList = (ops F8RC:$RA), Fb = 31 in
+def FTOIT : FPForm<0x1C, 0x070, "ftoit $RA,$RC",
+ [(set GPRC:$RC, (bitconvert F8RC:$RA))], s_ftoi>; //Floating to integer move
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops GPRC:$RA), Fb = 31 in
+def ITOFS : FPForm<0x14, 0x004, "itofs $RA,$RC",[], s_itof>; //Integer to floating move, S_floating
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops GPRC:$RA), Fb = 31 in
+def ITOFT : FPForm<0x14, 0x024, "itoft $RA,$RC",
+ [(set F8RC:$RC, (bitconvert GPRC:$RA))], s_itof>; //Integer to floating move
+
+
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in
+def CVTQS : FPForm<0x16, 0x7BC, "cvtqs/sui $RB,$RC",
+ [(set F4RC:$RC, (Alpha_cvtqs F8RC:$RB))], s_fadd>;
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in
+def CVTQT : FPForm<0x16, 0x7BE, "cvtqt/sui $RB,$RC",
+ [(set F8RC:$RC, (Alpha_cvtqt F8RC:$RB))], s_fadd>;
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in
+def CVTTQ : FPForm<0x16, 0x52F, "cvttq/svc $RB,$RC",
+ [(set F8RC:$RC, (Alpha_cvttq F8RC:$RB))], s_fadd>;
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F4RC:$RB), Fa = 31 in
+def CVTST : FPForm<0x16, 0x6AC, "cvtst/s $RB,$RC",
+ [(set F8RC:$RC, (fextend F4RC:$RB))], s_fadd>;
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in
+def CVTTS : FPForm<0x16, 0x7AC, "cvtts/sui $RB,$RC",
+ [(set F4RC:$RC, (fround F8RC:$RB))], s_fadd>;
+
+
+/////////////////////////////////////////////////////////
+//Branching
+/////////////////////////////////////////////////////////
+class br_icc<bits<6> opc, string asmstr>
+ : BFormN<opc, (ops u64imm:$opc, GPRC:$R, target:$dst),
+ !strconcat(asmstr, " $R,$dst"), s_icbr>;
+class br_fcc<bits<6> opc, string asmstr>
+ : BFormN<opc, (ops u64imm:$opc, F8RC:$R, target:$dst),
+ !strconcat(asmstr, " $R,$dst"), s_fbr>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+let Ra = 31 in
+def BR : BFormD<0x30, "br $$31,$DISP", [(br bb:$DISP)], s_ubr>;
+
+def COND_BRANCH_I : BFormN<0, (ops u64imm:$opc, GPRC:$R, target:$dst),
+ "{:comment} COND_BRANCH imm:$opc, GPRC:$R, bb:$dst",
+ s_icbr>;
+def COND_BRANCH_F : BFormN<0, (ops u64imm:$opc, F8RC:$R, target:$dst),
+ "{:comment} COND_BRANCH imm:$opc, F8RC:$R, bb:$dst",
+ s_fbr>;
+//Branches, int
+def BEQ : br_icc<0x39, "beq">;
+def BGE : br_icc<0x3E, "bge">;
+def BGT : br_icc<0x3F, "bgt">;
+def BLBC : br_icc<0x38, "blbc">;
+def BLBS : br_icc<0x3C, "blbs">;
+def BLE : br_icc<0x3B, "ble">;
+def BLT : br_icc<0x3A, "blt">;
+def BNE : br_icc<0x3D, "bne">;
+
+//Branches, float
+def FBEQ : br_fcc<0x31, "fbeq">;
+def FBGE : br_fcc<0x36, "fbge">;
+def FBGT : br_fcc<0x37, "fbgt">;
+def FBLE : br_fcc<0x33, "fble">;
+def FBLT : br_fcc<0x32, "fblt">;
+def FBNE : br_fcc<0x35, "fbne">;
+}
+
+//An ugly trick: expose the branch opcode as an immediate that the patterns
+//below can use
+def immBRCond : SDNodeXForm<imm, [{
+ switch((uint64_t)N->getZExtValue()) {
+ default: assert(0 && "Unknown branch type");
+ case 0: return getI64Imm(Alpha::BEQ);
+ case 1: return getI64Imm(Alpha::BNE);
+ case 2: return getI64Imm(Alpha::BGE);
+ case 3: return getI64Imm(Alpha::BGT);
+ case 4: return getI64Imm(Alpha::BLE);
+ case 5: return getI64Imm(Alpha::BLT);
+ case 6: return getI64Imm(Alpha::BLBS);
+ case 7: return getI64Imm(Alpha::BLBC);
+ case 20: return getI64Imm(Alpha::FBEQ);
+ case 21: return getI64Imm(Alpha::FBNE);
+ case 22: return getI64Imm(Alpha::FBGE);
+ case 23: return getI64Imm(Alpha::FBGT);
+ case 24: return getI64Imm(Alpha::FBLE);
+ case 25: return getI64Imm(Alpha::FBLT);
+ }
+}]>;
+
+//Int cond patterns
+def : Pat<(brcond (seteq GPRC:$RA, 0), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 0), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setge GPRC:$RA, 0), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 2), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setgt GPRC:$RA, 0), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 3), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (and GPRC:$RA, 1), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 6), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setle GPRC:$RA, 0), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 4), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setlt GPRC:$RA, 0), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 5), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, 0), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 1), GPRC:$RA, bb:$DISP)>;
+
+def : Pat<(brcond GPRC:$RA, bb:$DISP),
+ (COND_BRANCH_I (immBRCond 1), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, GPRC:$RB), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 0), (CMPEQ GPRC:$RA, GPRC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, immUExt8:$L), bb:$DISP),
+ (COND_BRANCH_I (immBRCond 0), (CMPEQi GPRC:$RA, immUExt8:$L), bb:$DISP)>;
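+//Note the inversion trick above: CMPEQ produces 1 exactly when its operands
+//are equal, so a setne branch is emitted as BEQ (immBRCond 0) on the CMPEQ
+//result.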
+
+//FP cond patterns
+def : Pat<(brcond (seteq F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 20), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setge F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 22), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setgt F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 23), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setle F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 24), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setlt F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 25), F8RC:$RA, bb:$DISP)>;
+
+
+def : Pat<(brcond (seteq F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setoeq F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setueq F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setlt F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setolt F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setult F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setle F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setole F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setule F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setgt F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setogt F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setugt F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+
+def : Pat<(brcond (setge F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setoge F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setuge F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+
+def : Pat<(brcond (setne F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setone F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setune F8RC:$RA, F8RC:$RB), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+
+def : Pat<(brcond (setoeq F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 20), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setueq F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 20), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setoge F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 22), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setuge F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 22), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setogt F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 23), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setugt F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 23), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setole F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 24), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setule F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 24), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setolt F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 25), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setult F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 25), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setone F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setune F8RC:$RA, immFPZ), bb:$DISP),
+ (COND_BRANCH_F (immBRCond 21), F8RC:$RA,bb:$DISP)>;
+
+//End Branches
+
+//S_floating : IEEE Single
+//T_floating : IEEE Double
+
+//Unused instructions
+//Mnemonic Format Opcode Description
+//CALL_PAL Pcd 00 Trap to PALcode
+//ECB Mfc 18.E800 Evict cache block
+//EXCB Mfc 18.0400 Exception barrier
+//FETCH Mfc 18.8000 Prefetch data
+//FETCH_M Mfc 18.A000 Prefetch data, modify intent
+//LDQ_U Mem 0B Load unaligned quadword
+//MB Mfc 18.4000 Memory barrier
+//STQ_U Mem 0F Store unaligned quadword
+//TRAPB Mfc 18.0000 Trap barrier
+//WH64 Mfc 18.F800 Write hint - 64 bytes
+//WMB Mfc 18.4400 Write memory barrier
+//MF_FPCR F-P 17.025 Move from FPCR
+//MT_FPCR F-P 17.024 Move to FPCR
+//These are in the Multimedia extensions, so let's not use them yet
+//def MAXSB8 : OForm<0x1C, 0x3E, "MAXSB8 $RA,$RB,$RC">; //Vector signed byte maximum
+//def MAXSW4 : OForm< 0x1C, 0x3F, "MAXSW4 $RA,$RB,$RC">; //Vector signed word maximum
+//def MAXUB8 : OForm<0x1C, 0x3C, "MAXUB8 $RA,$RB,$RC">; //Vector unsigned byte maximum
+//def MAXUW4 : OForm< 0x1C, 0x3D, "MAXUW4 $RA,$RB,$RC">; //Vector unsigned word maximum
+//def MINSB8 : OForm< 0x1C, 0x38, "MINSB8 $RA,$RB,$RC">; //Vector signed byte minimum
+//def MINSW4 : OForm< 0x1C, 0x39, "MINSW4 $RA,$RB,$RC">; //Vector signed word minimum
+//def MINUB8 : OForm< 0x1C, 0x3A, "MINUB8 $RA,$RB,$RC">; //Vector unsigned byte minimum
+//def MINUW4 : OForm< 0x1C, 0x3B, "MINUW4 $RA,$RB,$RC">; //Vector unsigned word minimum
+//def PERR : OForm< 0x1C, 0x31, "PERR $RA,$RB,$RC">; //Pixel error
+//def PKLB : OForm< 0x1C, 0x37, "PKLB $RA,$RB,$RC">; //Pack longwords to bytes
+//def PKWB : OForm<0x1C, 0x36, "PKWB $RA,$RB,$RC">; //Pack words to bytes
+//def UNPKBL : OForm< 0x1C, 0x35, "UNPKBL $RA,$RB,$RC">; //Unpack bytes to longwords
+//def UNPKBW : OForm< 0x1C, 0x34, "UNPKBW $RA,$RB,$RC">; //Unpack bytes to words
+//CVTLQ F-P 17.010 Convert longword to quadword
+//CVTQL F-P 17.030 Convert quadword to longword
+
+
+//Constant handling
+
+def immConst2Part : PatLeaf<(imm), [{
+ //true if imm fits in an LDAH/LDA pair
+ int64_t val = (int64_t)N->getZExtValue();
+ return (val <= IMM_FULLHIGH && val >= IMM_FULLLOW);
+}]>;
+def immConst2PartInt : PatLeaf<(imm), [{
+ //true if imm fits in an LDAH/LDA pair with zero extension
+ uint64_t uval = N->getZExtValue();
+ int32_t val32 = (int32_t)uval;
+ return ((uval >> 32) == 0 && //empty upper bits
+ val32 <= IMM_FULLHIGH);
+// val32 >= IMM_FULLLOW + IMM_LOW * IMM_MULT); //Always True
+}], SExt32>;
+
+def : Pat<(i64 immConst2Part:$imm),
+ (LDA (LL16 immConst2Part:$imm), (LDAH (LH16 immConst2Part:$imm), R31))>;
+
+def : Pat<(i64 immSExt16:$imm),
+ (LDA immSExt16:$imm, R31)>;
+
+def : Pat<(i64 immSExt16int:$imm),
+ (ZAPNOTi (LDA (SExt16 immSExt16int:$imm), R31), 15)>;
+def : Pat<(i64 immConst2PartInt:$imm),
+ (ZAPNOTi (LDA (LL16 (SExt32 immConst2PartInt:$imm)),
+ (LDAH (LH16 (SExt32 immConst2PartInt:$imm)), R31)), 15)>;
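+//Worked example (illustrative): the LL16/LH16 transforms (defined earlier in
+//this file) split the constant into LDA/LDAH displacements with the usual
+//carry fixup, so 0x12348000 is materialized roughly as
+// ldah $t, 0x1235($31)
+// lda $rc, -32768($t)
+//since 0x1235 * 65536 - 32768 == 0x12348000 ($t/$rc are placeholder names).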
+
+
+//TODO: I want to just define these like this!
+//def : Pat<(i64 0),
+// (R31)>;
+//def : Pat<(f64 0.0),
+// (F31)>;
+//def : Pat<(f64 -0.0),
+// (CPYSNT F31, F31)>;
+//def : Pat<(f32 0.0),
+// (F31)>;
+//def : Pat<(f32 -0.0),
+// (CPYSNS F31, F31)>;
+
+//Misc Patterns:
+
+def : Pat<(sext_inreg GPRC:$RB, i32),
+ (ADDLi GPRC:$RB, 0)>;
+
+def : Pat<(fabs F8RC:$RB),
+ (CPYST F31, F8RC:$RB)>;
+def : Pat<(fabs F4RC:$RB),
+ (CPYSS F31, F4RC:$RB)>;
+def : Pat<(fneg F8RC:$RB),
+ (CPYSNT F8RC:$RB, F8RC:$RB)>;
+def : Pat<(fneg F4RC:$RB),
+ (CPYSNS F4RC:$RB, F4RC:$RB)>;
+
+def : Pat<(fcopysign F4RC:$A, (fneg F4RC:$B)),
+ (CPYSNS F4RC:$B, F4RC:$A)>;
+def : Pat<(fcopysign F8RC:$A, (fneg F8RC:$B)),
+ (CPYSNT F8RC:$B, F8RC:$A)>;
+def : Pat<(fcopysign F4RC:$A, (fneg F8RC:$B)),
+ (CPYSNSt F8RC:$B, F4RC:$A)>;
+def : Pat<(fcopysign F8RC:$A, (fneg F4RC:$B)),
+ (CPYSNTs F4RC:$B, F8RC:$A)>;
+
+//Yes, signed multiply high is ugly
+def : Pat<(mulhs GPRC:$RA, GPRC:$RB),
+ (SUBQr (UMULHr GPRC:$RA, GPRC:$RB), (ADDQr (CMOVGEr GPRC:$RB, R31, GPRC:$RA),
+ (CMOVGEr GPRC:$RA, R31, GPRC:$RB)))>;
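+//(this implements mulhs(a,b) = mulhu(a,b) - (a<0 ? b : 0) - (b<0 ? a : 0);
+//each CMOVGE zeroes one correction term when its operand is non-negative)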
+
+//Strength reduction for multiplication by small constants:
+let AddedComplexity = 1 in {
+def : Pat<(mul GPRC:$RA, 5), (S4ADDQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 9), (S8ADDQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 3), (S4SUBQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 7), (S8SUBQr GPRC:$RA, GPRC:$RA)>;
+
+//slight tree expansion if we are multiplying near a power of 2
+//n is just above a power of 2 (worked example after this block)
+def : Pat<(mul GPRC:$RA, immRem1:$imm),
+ (ADDQr (SLr GPRC:$RA, (nearP2X immRem1:$imm)), GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, immRem2:$imm),
+ (ADDQr (SLr GPRC:$RA, (nearP2X immRem2:$imm)), (ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem3:$imm),
+ (ADDQr (SLr GPRC:$RA, (nearP2X immRem3:$imm)), (S4SUBQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem4:$imm),
+ (S4ADDQr GPRC:$RA, (SLr GPRC:$RA, (nearP2X immRem4:$imm)))>;
+def : Pat<(mul GPRC:$RA, immRem5:$imm),
+ (ADDQr (SLr GPRC:$RA, (nearP2X immRem5:$imm)), (S4ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRemP2:$imm),
+ (ADDQr (SLr GPRC:$RA, (nearP2X immRemP2:$imm)), (SLi GPRC:$RA, (nearP2RemX immRemP2:$imm)))>;
+
+//n is below a power of 2
+//FIXME: figure out why something is truncating the imm to 32 bits
+// this will fix 2007-11-27-mulneg3
+//def : Pat<(mul GPRC:$RA, immRem1n:$imm),
+// (SUBQr (SLr GPRC:$RA, (nearP2X immRem1n:$imm)), GPRC:$RA)>;
+//def : Pat<(mul GPRC:$RA, immRem2n:$imm),
+// (SUBQr (SLr GPRC:$RA, (nearP2X immRem2n:$imm)), (ADDQr GPRC:$RA, GPRC:$RA))>;
+//def : Pat<(mul GPRC:$RA, immRem3n:$imm),
+// (SUBQr (SLr GPRC:$RA, (nearP2X immRem3n:$imm)), (S4SUBQr GPRC:$RA, GPRC:$RA))>;
+//def : Pat<(mul GPRC:$RA, immRem4n:$imm),
+// (SUBQr (SLr GPRC:$RA, (nearP2X immRem4n:$imm)), (SLi GPRC:$RA, 2))>;
+//def : Pat<(mul GPRC:$RA, immRem5n:$imm),
+// (SUBQr (SLr GPRC:$RA, (nearP2X immRem5n:$imm)), (S4ADDQr GPRC:$RA, GPRC:$RA))>;
+//def : Pat<(mul GPRC:$RA, immRemP2n:$imm),
+// (SUBQr (SLr GPRC:$RA, (nearP2X immRemP2n:$imm)), (SLi GPRC:$RA, (nearP2RemX immRemP2n:$imm)))>;
+} //Added complexity
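+//Worked example for the expansion above (assuming nearP2X yields the log2 of
+//the nearby power of 2, as the names and comments suggest): mul $RA, 10
+//matches immRem2 since 10 = 8 + 2, giving
+//(ADDQr (SLr $RA, 3), (ADDQr $RA, $RA)), i.e.
+//(x << 3) + (x + x) == 8x + 2x == 10x.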
diff --git a/lib/Target/Alpha/AlphaJITInfo.cpp b/lib/Target/Alpha/AlphaJITInfo.cpp
new file mode 100644
index 0000000..3fecb19
--- /dev/null
+++ b/lib/Target/Alpha/AlphaJITInfo.cpp
@@ -0,0 +1,307 @@
+//===-- AlphaJITInfo.cpp - Implement the JIT interfaces for the Alpha ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Alpha target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "AlphaJITInfo.h"
+#include "AlphaRelocations.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include "llvm/Support/Debug.h"
+#include <cstdlib>
+#include <map>
+using namespace llvm;
+
+#define BUILD_OFormatI(Op, RA, LIT, FUN, RC) \
+ ((Op << 26) | (RA << 21) | (LIT << 13) | (1 << 12) | (FUN << 5) | (RC))
+#define BUILD_OFormat(Op, RA, RB, FUN, RC) \
+ ((Op << 26) | (RA << 21) | (RB << 16) | (FUN << 5) | (RC))
+
+#define BUILD_LDA(RD, RS, IMM16) \
+ ((0x08 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_LDAH(RD, RS, IMM16) \
+ ((0x09 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+
+#define BUILD_LDQ(RD, RS, IMM16) \
+ ((0x29 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 0xFFFF))
+
+#define BUILD_JMP(RD, RS, IMM16) \
+ ((0x1A << 26) | ((RD) << 21) | ((RS) << 16) | (0x00 << 14) | ((IMM16) & 0x3FFF))
+#define BUILD_JSR(RD, RS, IMM16) \
+ ((0x1A << 26) | ((RD) << 21) | ((RS) << 16) | (0x01 << 14) | ((IMM16) & 0x3FFF))
+
+#define BUILD_SLLi(RD, RS, IMM8) \
+ (BUILD_OFormatI(0x12, RS, IMM8, 0x39, RD))
+
+#define BUILD_ORi(RD, RS, IMM8) \
+ (BUILD_OFormatI(0x11, RS, IMM8, 0x20, RD))
+
+#define BUILD_OR(RD, RS, RT) \
+ (BUILD_OFormat(0x11, RS, RT, 0x20, RD))
+
+
+
+static void EmitBranchToAt(void *At, void *To) {
+ unsigned long Fn = (unsigned long)To;
+
+ unsigned *AtI = (unsigned*)At;
+
+ AtI[0] = BUILD_OR(0, 27, 27);
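+ //the "or $27,$27,$0" above saves the stub's own address (pv on entry) into
+ //$0 so the compilation callback can tell which stub it was entered through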
+
+ DOUT << "Stub targeting " << To << "\n";
+
+ for (int x = 1; x <= 8; ++x) {
+ AtI[2*x - 1] = BUILD_SLLi(27,27,8);
+ unsigned d = (Fn >> (64 - 8 * x)) & 0x00FF;
+ //DOUT << "outputting " << hex << d << dec << "\n";
+ AtI[2*x] = BUILD_ORi(27, 27, d);
+ }
+ AtI[17] = BUILD_JMP(31,27,0); //jump, preserving ra, and setting pv
+ AtI[18] = 0x00FFFFFF; //mark this as a stub
+}
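+//(19 words in all: the or, eight sll/or pairs, the jmp, and the marker;
+//this matches the 19*4 bytes reserved in emitFunctionStub below)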
+
+void AlphaJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ //FIXME
+ assert(0);
+}
+
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+//static AlphaJITInfo* AlphaJTI;
+
+extern "C" {
+#ifdef __alpha
+
+ void AlphaCompilationCallbackC(long* oldpv, void* CameFromStub)
+ {
+ void* Target = JITCompilerFunction(CameFromStub);
+
+ //rewrite the stub to an unconditional branch
+ if (((unsigned*)CameFromStub)[18] == 0x00FFFFFF) {
+ DOUT << "Came from a stub, rewriting\n";
+ EmitBranchToAt(CameFromStub, Target);
+ } else {
+ DOUT << "confused, didn't come from stub at " << CameFromStub
+ << " old jump vector " << oldpv
+ << " new jump vector " << Target << "\n";
+ }
+
+ //Change pv to new Target
+ *oldpv = (long)Target;
+ }
+
+ void AlphaCompilationCallback(void);
+
+ asm(
+ ".text\n"
+ ".globl AlphaComilationCallbackC\n"
+ ".align 4\n"
+ ".globl AlphaCompilationCallback\n"
+ ".ent AlphaCompilationCallback\n"
+"AlphaCompilationCallback:\n"
+ //get the JIT's GOT
+ "ldgp $29, 0($27)\n"
+ //Save args, callee saved, and perhaps others?
+ //args: $16-$21 $f16-$f21 (12)
+ //callee: $9-$14 $f2-$f9 (14)
+ //others: fp:$15 ra:$26 pv:$27 (3)
+ "lda $30, -232($30)\n"
+ "stq $16, 0($30)\n"
+ "stq $17, 8($30)\n"
+ "stq $18, 16($30)\n"
+ "stq $19, 24($30)\n"
+ "stq $20, 32($30)\n"
+ "stq $21, 40($30)\n"
+ "stt $f16, 48($30)\n"
+ "stt $f17, 56($30)\n"
+ "stt $f18, 64($30)\n"
+ "stt $f19, 72($30)\n"
+ "stt $f20, 80($30)\n"
+ "stt $f21, 88($30)\n"
+ "stq $9, 96($30)\n"
+ "stq $10, 104($30)\n"
+ "stq $11, 112($30)\n"
+ "stq $12, 120($30)\n"
+ "stq $13, 128($30)\n"
+ "stq $14, 136($30)\n"
+ "stt $f2, 144($30)\n"
+ "stt $f3, 152($30)\n"
+ "stt $f4, 160($30)\n"
+ "stt $f5, 168($30)\n"
+ "stt $f6, 176($30)\n"
+ "stt $f7, 184($30)\n"
+ "stt $f8, 192($30)\n"
+ "stt $f9, 200($30)\n"
+ "stq $15, 208($30)\n"
+ "stq $26, 216($30)\n"
+ "stq $27, 224($30)\n"
+
+ "addq $30, 224, $16\n" //pass the addr of saved pv as the first arg
+ "bis $0, $0, $17\n" //pass the roughly stub addr in second arg
+ "jsr $26, AlphaCompilationCallbackC\n" //call without saving ra
+
+ "ldq $16, 0($30)\n"
+ "ldq $17, 8($30)\n"
+ "ldq $18, 16($30)\n"
+ "ldq $19, 24($30)\n"
+ "ldq $20, 32($30)\n"
+ "ldq $21, 40($30)\n"
+ "ldt $f16, 48($30)\n"
+ "ldt $f17, 56($30)\n"
+ "ldt $f18, 64($30)\n"
+ "ldt $f19, 72($30)\n"
+ "ldt $f20, 80($30)\n"
+ "ldt $f21, 88($30)\n"
+ "ldq $9, 96($30)\n"
+ "ldq $10, 104($30)\n"
+ "ldq $11, 112($30)\n"
+ "ldq $12, 120($30)\n"
+ "ldq $13, 128($30)\n"
+ "ldq $14, 136($30)\n"
+ "ldt $f2, 144($30)\n"
+ "ldt $f3, 152($30)\n"
+ "ldt $f4, 160($30)\n"
+ "ldt $f5, 168($30)\n"
+ "ldt $f6, 176($30)\n"
+ "ldt $f7, 184($30)\n"
+ "ldt $f8, 192($30)\n"
+ "ldt $f9, 200($30)\n"
+ "ldq $15, 208($30)\n"
+ "ldq $26, 216($30)\n"
+ "ldq $27, 224($30)\n" //this was updated in the callback with the target
+
+ "lda $30, 232($30)\n" //restore sp
+ "jmp $31, ($27)\n" //jump to the new function
+ ".end AlphaCompilationCallback\n"
+ );
+#else
+ void AlphaCompilationCallback() {
+ cerr << "Cannot call AlphaCompilationCallback() on a non-Alpha arch!\n";
+ abort();
+ }
+#endif
+}
+
+void *AlphaJITInfo::emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE) {
+ //assert(Fn == AlphaCompilationCallback && "Where are you going?\n");
+ //Do things in a stupid slow way!
+ JCE.startGVStub(F, 19*4);
+ void* Addr = (void*)(intptr_t)JCE.getCurrentPCValue();
+ for (int x = 0; x < 19; ++ x)
+ JCE.emitWordLE(0);
+ EmitBranchToAt(Addr, Fn);
+ DOUT << "Emitting Stub to " << Fn << " at [" << Addr << "]\n";
+ return JCE.finishGVStub(F);
+}
+
+TargetJITInfo::LazyResolverFn
+AlphaJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+ JITCompilerFunction = F;
+ // setZerothGOTEntry((void*)AlphaCompilationCallback);
+ return AlphaCompilationCallback;
+}
+
+//These describe LDAx
+static const int IMM_LOW = -32768;
+static const int IMM_HIGH = 32767;
+static const int IMM_MULT = 65536;
+
+static long getUpper16(long l)
+{
+ long y = l / IMM_MULT;
+ if (l % IMM_MULT > IMM_HIGH)
+ ++y;
+ if (l % IMM_MULT < IMM_LOW)
+ --y;
+ assert((short)y == y && "displacement out of range");
+ return y;
+}
+
+static long getLower16(long l)
+{
+ long h = getUpper16(l);
+ long y = l - h * IMM_MULT;
+ assert(y == (short)y && "Displacement out of range");
+ return y;
+}
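+//Sanity check (illustrative): getUpper16(l)*IMM_MULT + getLower16(l) == l.
+//For l = 0x1234A678 the low half 0xA678 exceeds IMM_HIGH, so the carry bumps
+//the upper part: getUpper16 = 0x1235, getLower16 = -0x5988, and
+//0x1235 * 0x10000 - 0x5988 == 0x1234A678.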
+
+void AlphaJITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ //because gpdist relocations are paired and relative to the pc of the first
+ //instruction, we need to keep some state
+
+ static std::map<std::pair<void*, int>, void*> gpdistmap;
+
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
+ long idx = 0;
+ bool doCommon = true;
+ switch ((Alpha::RelocationType)MR->getRelocationType()) {
+ default: assert(0 && "Unknown relocation type!");
+ case Alpha::reloc_literal:
+ //This is a LDQl
+ idx = MR->getGOTIndex();
+ DOUT << "Literal relocation to slot " << idx;
+ idx = (idx - GOToffset) * 8;
+ DOUT << " offset " << idx << "\n";
+ break;
+ case Alpha::reloc_gprellow:
+ idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
+ idx = getLower16(idx);
+ DOUT << "gprellow relocation offset " << idx << "\n";
+ DOUT << " Pointer is " << (void*)MR->getResultPointer()
+ << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n";
+ break;
+ case Alpha::reloc_gprelhigh:
+ idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
+ idx = getUpper16(idx);
+ DOUT << "gprelhigh relocation offset " << idx << "\n";
+ DOUT << " Pointer is " << (void*)MR->getResultPointer()
+ << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n";
+ break;
+ case Alpha::reloc_gpdist:
+ switch (*RelocPos >> 26) {
+ case 0x09: //LDAH
+ idx = &GOTBase[GOToffset * 8] - (unsigned char*)RelocPos;
+ idx = getUpper16(idx);
+ DOUT << "LDAH: " << idx << "\n";
+ //add the relocation to the map
+ gpdistmap[std::make_pair(Function, MR->getConstantVal())] = RelocPos;
+ break;
+ case 0x08: //LDA
+ assert(gpdistmap[std::make_pair(Function, MR->getConstantVal())] &&
+ "LDAg without seeing LDAHg");
+ idx = &GOTBase[GOToffset * 8] -
+ (unsigned char*)gpdistmap[std::make_pair(Function, MR->getConstantVal())];
+ idx = getLower16(idx);
+ DOUT << "LDA: " << idx << "\n";
+ break;
+ default:
+ assert(0 && "Cannot handle gpdist yet");
+ }
+ break;
+ case Alpha::reloc_bsr: {
+ idx = (((unsigned char*)MR->getResultPointer() -
+ (unsigned char*)RelocPos) >> 2) + 1; //skip first 2 inst of fun
+ *RelocPos |= (idx & ((1 << 21)-1));
+ doCommon = false;
+ break;
+ }
+ }
+ if (doCommon) {
+ short x = (short)idx;
+ assert(x == idx);
+ *(short*)RelocPos = x;
+ }
+ }
+}
diff --git a/lib/Target/Alpha/AlphaJITInfo.h b/lib/Target/Alpha/AlphaJITInfo.h
new file mode 100644
index 0000000..edff990
--- /dev/null
+++ b/lib/Target/Alpha/AlphaJITInfo.h
@@ -0,0 +1,47 @@
+//===- AlphaJITInfo.h - Alpha impl. of the JIT interface ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHA_JITINFO_H
+#define ALPHA_JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+ class TargetMachine;
+
+ class AlphaJITInfo : public TargetJITInfo {
+ protected:
+ TargetMachine &TM;
+ public:
+ explicit AlphaJITInfo(TargetMachine &tm) : TM(tm)
+ { useGOT = true; }
+
+ virtual void *emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE);
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+ private:
+ static const unsigned GOToffset = 4096;
+
+ };
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaLLRP.cpp b/lib/Target/Alpha/AlphaLLRP.cpp
new file mode 100644
index 0000000..0c51bc5
--- /dev/null
+++ b/lib/Target/Alpha/AlphaLLRP.cpp
@@ -0,0 +1,158 @@
+//===-- AlphaLLRP.cpp - Alpha Load Load Replay Trap elimination pass ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Here we check for potential replay traps introduced by the spiller.
+// We also align some branch targets if we can do so for free.
+//
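+// (The pass tracks the last three stack-relative memory ops; when two hit the
+// same base register and offset within one 4-instruction fetch block, it pads
+// with "bis $31,$31,$31" nops until the later access falls into the next
+// fetch block, avoiding the replay trap.)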
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "alpha-nops"
+#include "Alpha.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+STATISTIC(nopintro, "Number of nops inserted");
+STATISTIC(nopalign, "Number of nops inserted for alignment");
+
+namespace {
+ cl::opt<bool>
+ AlignAll("alpha-align-all", cl::Hidden,
+ cl::desc("Align all blocks"));
+
+ struct AlphaLLRPPass : public MachineFunctionPass {
+ /// Target machine description which we query for reg. names, data
+ /// layout, etc.
+ ///
+ AlphaTargetMachine &TM;
+
+ static char ID;
+ AlphaLLRPPass(AlphaTargetMachine &tm)
+ : MachineFunctionPass(&ID), TM(tm) { }
+
+ virtual const char *getPassName() const {
+ return "Alpha NOP inserter";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) {
+ const TargetInstrInfo *TII = F.getTarget().getInstrInfo();
+ bool Changed = false;
+ MachineInstr* prev[3] = {0,0,0};
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ unsigned count = 0;
+ for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+ FI != FE; ++FI) {
+ MachineBasicBlock& MBB = *FI;
+ bool ub = false;
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+ if (count%4 == 0)
+ prev[0] = prev[1] = prev[2] = 0; //Slots cleared at fetch boundary
+ ++count;
+ MachineInstr *MI = I++;
+ switch (MI->getOpcode()) {
+ case Alpha::LDQ: case Alpha::LDL:
+ case Alpha::LDWU: case Alpha::LDBU:
+ case Alpha::LDT: case Alpha::LDS:
+ case Alpha::STQ: case Alpha::STL:
+ case Alpha::STW: case Alpha::STB:
+ case Alpha::STT: case Alpha::STS:
+ if (MI->getOperand(2).getReg() == Alpha::R30) {
+ if (prev[0] &&
+ prev[0]->getOperand(2).getReg() == MI->getOperand(2).getReg()&&
+ prev[0]->getOperand(1).getImm() == MI->getOperand(1).getImm()){
+ prev[0] = prev[1];
+ prev[1] = prev[2];
+ prev[2] = 0;
+ BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
+ .addReg(Alpha::R31)
+ .addReg(Alpha::R31);
+ Changed = true; nopintro += 1;
+ count += 1;
+ } else if (prev[1]
+ && prev[1]->getOperand(2).getReg() ==
+ MI->getOperand(2).getReg()
+ && prev[1]->getOperand(1).getImm() ==
+ MI->getOperand(1).getImm()) {
+ prev[0] = prev[2];
+ prev[1] = prev[2] = 0;
+ BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
+ .addReg(Alpha::R31)
+ .addReg(Alpha::R31);
+ BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
+ .addReg(Alpha::R31)
+ .addReg(Alpha::R31);
+ Changed = true; nopintro += 2;
+ count += 2;
+ } else if (prev[2]
+ && prev[2]->getOperand(2).getReg() ==
+ MI->getOperand(2).getReg()
+ && prev[2]->getOperand(1).getImm() ==
+ MI->getOperand(1).getImm()) {
+ prev[0] = prev[1] = prev[2] = 0;
+ BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
+ .addReg(Alpha::R31).addReg(Alpha::R31);
+ BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
+ .addReg(Alpha::R31).addReg(Alpha::R31);
+ BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
+ .addReg(Alpha::R31).addReg(Alpha::R31);
+ Changed = true; nopintro += 3;
+ count += 3;
+ }
+ prev[0] = prev[1];
+ prev[1] = prev[2];
+ prev[2] = MI;
+ break;
+ }
+ prev[0] = prev[1];
+ prev[1] = prev[2];
+ prev[2] = 0;
+ break;
+ case Alpha::ALTENT:
+ case Alpha::MEMLABEL:
+ case Alpha::PCLABEL:
+ --count;
+ break;
+ case Alpha::BR:
+ case Alpha::JMP:
+ ub = true;
+ //fall through
+ default:
+ prev[0] = prev[1];
+ prev[1] = prev[2];
+ prev[2] = 0;
+ break;
+ }
+ }
+ if (ub || AlignAll) {
+ //we can align stuff for free at this point
+ while (count % 4) {
+ BuildMI(MBB, MBB.end(), dl, TII->get(Alpha::BISr), Alpha::R31)
+ .addReg(Alpha::R31).addReg(Alpha::R31);
+ ++count;
+ ++nopalign;
+ prev[0] = prev[1];
+ prev[1] = prev[2];
+ prev[2] = 0;
+ }
+ }
+ }
+ return Changed;
+ }
+ };
+ char AlphaLLRPPass::ID = 0;
+} // end of anonymous namespace
+
+FunctionPass *llvm::createAlphaLLRPPass(AlphaTargetMachine &tm) {
+ return new AlphaLLRPPass(tm);
+}
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp
new file mode 100644
index 0000000..feee6e4
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp
@@ -0,0 +1,335 @@
+//===- AlphaRegisterInfo.cpp - Alpha Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "Alpha.h"
+#include "AlphaRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cstdlib>
+using namespace llvm;
+
+//These describe LDAx
+static const int IMM_LOW = -32768;
+static const int IMM_HIGH = 32767;
+static const int IMM_MULT = 65536;
+
+static long getUpper16(long l)
+{
+ long y = l / IMM_MULT;
+ if (l % IMM_MULT > IMM_HIGH)
+ ++y;
+ if (l % IMM_MULT < IMM_LOW) //mirror the AlphaJITInfo version so very
+ --y; //negative displacements split correctly too
+ return y;
+}
+
+static long getLower16(long l)
+{
+ long h = getUpper16(l);
+ return l - h * IMM_MULT;
+}
+
+AlphaRegisterInfo::AlphaRegisterInfo(const TargetInstrInfo &tii)
+ : AlphaGenRegisterInfo(Alpha::ADJUSTSTACKDOWN, Alpha::ADJUSTSTACKUP),
+ TII(tii)
+{
+}
+
+const unsigned* AlphaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+ const {
+ static const unsigned CalleeSavedRegs[] = {
+ Alpha::R9, Alpha::R10,
+ Alpha::R11, Alpha::R12,
+ Alpha::R13, Alpha::R14,
+ Alpha::F2, Alpha::F3,
+ Alpha::F4, Alpha::F5,
+ Alpha::F6, Alpha::F7,
+ Alpha::F8, Alpha::F9, 0
+ };
+ return CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+AlphaRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
+ &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
+ &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
+ &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
+ &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
+ &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
+ &Alpha::F8RCRegClass, &Alpha::F8RCRegClass, 0
+ };
+ return CalleeSavedRegClasses;
+}
+
+BitVector AlphaRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ Reserved.set(Alpha::R15);
+ Reserved.set(Alpha::R30);
+ Reserved.set(Alpha::R31);
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+bool AlphaRegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return MFI->hasVarSizedObjects();
+}
+
+void AlphaRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (hasFP(MF)) {
+ // If we have a frame pointer, turn the adjcallstackup instruction into a
+ // 'sub ESP, <amt>' and the adjcallstackdown instruction into 'add ESP,
+ // <amt>'
+ MachineInstr *Old = I;
+ uint64_t Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ Amount = (Amount+Align-1)/Align*Align;
+
+ MachineInstr *New;
+ if (Old->getOpcode() == Alpha::ADJUSTSTACKDOWN) {
+ New=BuildMI(MF, Old->getDebugLoc(), TII.get(Alpha::LDA), Alpha::R30)
+ .addImm(-Amount).addReg(Alpha::R30);
+ } else {
+ assert(Old->getOpcode() == Alpha::ADJUSTSTACKUP);
+ New=BuildMI(MF, Old->getDebugLoc(), TII.get(Alpha::LDA), Alpha::R30)
+ .addImm(Amount).addReg(Alpha::R30);
+ }
+
+ // Replace the pseudo instruction with a new instruction...
+ MBB.insert(I, New);
+ }
+ }
+
+ MBB.erase(I);
+}
+
+//Alpha has a slightly funny stack:
+//Args
+//<- incoming SP
+//fixed locals (and spills, callee saved, etc)
+//<- FP
+//variable locals
+//<- SP
+
+void AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ bool FP = hasFP(MF);
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+
+ // Add the base register of R30 (SP) or R15 (FP).
+ MI.getOperand(i + 1).ChangeToRegister(FP ? Alpha::R15 : Alpha::R30, false);
+
+ // Now add the frame object offset to the offset from the virtual frame index.
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+ DOUT << "FI: " << FrameIndex << " Offset: " << Offset << "\n";
+
+ Offset += MF.getFrameInfo()->getStackSize();
+
+ DOUT << "Corrected Offset " << Offset
+ << " for stack size: " << MF.getFrameInfo()->getStackSize() << "\n";
+
+ if (Offset > IMM_HIGH || Offset < IMM_LOW) {
+ DOUT << "Unconditionally using R28 for evil purposes Offset: "
+ << Offset << "\n";
+ //so in this case, we need to use a temporary register, and move the
+ //original inst off the SP/FP
+ //fix up the old:
+ MI.getOperand(i + 1).ChangeToRegister(Alpha::R28, false);
+ MI.getOperand(i).ChangeToImmediate(getLower16(Offset));
+ //insert the new
+ MachineInstr* nMI=BuildMI(MF, MI.getDebugLoc(),
+ TII.get(Alpha::LDAH), Alpha::R28)
+ .addImm(getUpper16(Offset)).addReg(FP ? Alpha::R15 : Alpha::R30);
+ MBB.insert(II, nMI);
+ } else {
+ MI.getOperand(i).ChangeToImmediate(Offset);
+ }
+}
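+//e.g. for Offset = 100000: getUpper16 = 2 (the remainder 34464 exceeds
+//IMM_HIGH, bumping the quotient), getLower16 = 100000 - 2*65536 = -31072,
+//so the slot is reached via "ldah $28, 2($30)" followed by the original
+//instruction rewritten to use -31072($28) (illustrative values).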
+
+
+void AlphaRegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+ bool FP = hasFP(MF);
+
+ static int curgpdist = 0;
+
+ //handle GOT offset
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDAHg), Alpha::R29)
+ .addGlobalAddress(const_cast<Function*>(MF.getFunction()))
+ .addReg(Alpha::R27).addImm(++curgpdist);
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDAg), Alpha::R29)
+ .addGlobalAddress(const_cast<Function*>(MF.getFunction()))
+ .addReg(Alpha::R29).addImm(curgpdist);
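+ //the shared immediate (curgpdist) tags the LDAH/LDA pair so the gpdist
+ //handler in AlphaJITInfo::relocate can match the two halves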
+
+ //evil const_cast until the MachineOperand code is set up to handle const
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::ALTENT))
+ .addGlobalAddress(const_cast<Function*>(MF.getFunction()));
+
+ // Get the number of bytes to allocate from the FrameInfo
+ long NumBytes = MFI->getStackSize();
+
+ if (FP)
+ NumBytes += 8; //reserve space for the old FP
+
+ // Do we need to allocate space on the stack?
+ if (NumBytes == 0) return;
+
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ NumBytes = (NumBytes+Align-1)/Align*Align;
+
+ // Update frame info to pretend that this is part of the stack...
+ MFI->setStackSize(NumBytes);
+
+ // adjust stack pointer: r30 -= numbytes
+ NumBytes = -NumBytes;
+ if (NumBytes >= IMM_LOW) {
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30).addImm(NumBytes)
+ .addReg(Alpha::R30);
+ } else if (getUpper16(NumBytes) >= IMM_LOW) {
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDAH), Alpha::R30)
+ .addImm(getUpper16(NumBytes)).addReg(Alpha::R30);
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30)
+ .addImm(getLower16(NumBytes)).addReg(Alpha::R30);
+ } else {
+ cerr << "Too big a stack frame at " << NumBytes << "\n";
+ abort();
+ }
+
+ //now if we need to, save the old FP and set the new
+ if (FP)
+ {
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::STQ))
+ .addReg(Alpha::R15).addImm(0).addReg(Alpha::R30);
+ //this must be the last instr in the prolog
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::BISr), Alpha::R15)
+ .addReg(Alpha::R30).addReg(Alpha::R30);
+ }
+
+}
+
+void AlphaRegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ assert((MBBI->getOpcode() == Alpha::RETDAG ||
+ MBBI->getOpcode() == Alpha::RETDAGp)
+ && "Can only insert epilog into returning blocks");
+ DebugLoc dl = MBBI->getDebugLoc();
+
+ bool FP = hasFP(MF);
+
+ // Get the number of bytes allocated from the FrameInfo...
+ long NumBytes = MFI->getStackSize();
+
+ //now if we need to, restore the old FP
+ if (FP) {
+ //copy the FP into the SP (discards allocas)
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::BISr), Alpha::R30).addReg(Alpha::R15)
+ .addReg(Alpha::R15);
+ //restore the FP
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDQ), Alpha::R15)
+ .addImm(0).addReg(Alpha::R15);
+ }
+
+ if (NumBytes != 0) {
+ if (NumBytes <= IMM_HIGH) {
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30).addImm(NumBytes)
+ .addReg(Alpha::R30);
+ } else if (getUpper16(NumBytes) <= IMM_HIGH) {
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDAH), Alpha::R30)
+ .addImm(getUpper16(NumBytes)).addReg(Alpha::R30);
+ BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30)
+ .addImm(getLower16(NumBytes)).addReg(Alpha::R30);
+ } else {
+ cerr << "Too big a stack frame at " << NumBytes << "\n";
+ abort();
+ }
+ }
+}
+
+unsigned AlphaRegisterInfo::getRARegister() const {
+ assert(0 && "What is the return address register");
+ return 0;
+}
+
+unsigned AlphaRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ return hasFP(MF) ? Alpha::R15 : Alpha::R30;
+}
+
+unsigned AlphaRegisterInfo::getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned AlphaRegisterInfo::getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+int AlphaRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ assert(0 && "What is the dwarf register number");
+ return -1;
+}
+
+#include "AlphaGenRegisterInfo.inc"
+
+std::string AlphaRegisterInfo::getPrettyName(unsigned reg)
+{
+ std::string s(RegisterDescriptors[reg].Name);
+ return s;
+}
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
new file mode 100644
index 0000000..c4f5f7b
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -0,0 +1,67 @@
+//===- AlphaRegisterInfo.h - Alpha Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHAREGISTERINFO_H
+#define ALPHAREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "AlphaGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class Type;
+
+struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
+ const TargetInstrInfo &TII;
+
+ AlphaRegisterInfo(const TargetInstrInfo &tii);
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const* getCalleeSavedRegClasses(
+ const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ //void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+ static std::string getPrettyName(unsigned reg);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.td b/lib/Target/Alpha/AlphaRegisterInfo.td
new file mode 100644
index 0000000..35e6804
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.td
@@ -0,0 +1,171 @@
+//===- AlphaRegisterInfo.td - The Alpha Register File ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Alpha register set.
+//
+//===----------------------------------------------------------------------===//
+
+class AlphaReg<string n> : Register<n> {
+ field bits<5> Num;
+ let Namespace = "Alpha";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 64-bit general-purpose registers
+class GPR<bits<5> num, string n> : AlphaReg<n> {
+ let Num = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : AlphaReg<n> {
+ let Num = num;
+}
+
+//#define FP $15
+//#define RA $26
+//#define PV $27
+//#define GP $29
+//#define SP $30
+
+// General-purpose registers
+def R0 : GPR< 0, "$0">, DwarfRegNum<[0]>;
+def R1 : GPR< 1, "$1">, DwarfRegNum<[1]>;
+def R2 : GPR< 2, "$2">, DwarfRegNum<[2]>;
+def R3 : GPR< 3, "$3">, DwarfRegNum<[3]>;
+def R4 : GPR< 4, "$4">, DwarfRegNum<[4]>;
+def R5 : GPR< 5, "$5">, DwarfRegNum<[5]>;
+def R6 : GPR< 6, "$6">, DwarfRegNum<[6]>;
+def R7 : GPR< 7, "$7">, DwarfRegNum<[7]>;
+def R8 : GPR< 8, "$8">, DwarfRegNum<[8]>;
+def R9 : GPR< 9, "$9">, DwarfRegNum<[9]>;
+def R10 : GPR<10, "$10">, DwarfRegNum<[10]>;
+def R11 : GPR<11, "$11">, DwarfRegNum<[11]>;
+def R12 : GPR<12, "$12">, DwarfRegNum<[12]>;
+def R13 : GPR<13, "$13">, DwarfRegNum<[13]>;
+def R14 : GPR<14, "$14">, DwarfRegNum<[14]>;
+def R15 : GPR<15, "$15">, DwarfRegNum<[15]>;
+def R16 : GPR<16, "$16">, DwarfRegNum<[16]>;
+def R17 : GPR<17, "$17">, DwarfRegNum<[17]>;
+def R18 : GPR<18, "$18">, DwarfRegNum<[18]>;
+def R19 : GPR<19, "$19">, DwarfRegNum<[19]>;
+def R20 : GPR<20, "$20">, DwarfRegNum<[20]>;
+def R21 : GPR<21, "$21">, DwarfRegNum<[21]>;
+def R22 : GPR<22, "$22">, DwarfRegNum<[22]>;
+def R23 : GPR<23, "$23">, DwarfRegNum<[23]>;
+def R24 : GPR<24, "$24">, DwarfRegNum<[24]>;
+def R25 : GPR<25, "$25">, DwarfRegNum<[25]>;
+def R26 : GPR<26, "$26">, DwarfRegNum<[26]>;
+def R27 : GPR<27, "$27">, DwarfRegNum<[27]>;
+def R28 : GPR<28, "$28">, DwarfRegNum<[28]>;
+def R29 : GPR<29, "$29">, DwarfRegNum<[29]>;
+def R30 : GPR<30, "$30">, DwarfRegNum<[30]>;
+def R31 : GPR<31, "$31">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0 : FPR< 0, "$f0">, DwarfRegNum<[33]>;
+def F1 : FPR< 1, "$f1">, DwarfRegNum<[34]>;
+def F2 : FPR< 2, "$f2">, DwarfRegNum<[35]>;
+def F3 : FPR< 3, "$f3">, DwarfRegNum<[36]>;
+def F4 : FPR< 4, "$f4">, DwarfRegNum<[37]>;
+def F5 : FPR< 5, "$f5">, DwarfRegNum<[38]>;
+def F6 : FPR< 6, "$f6">, DwarfRegNum<[39]>;
+def F7 : FPR< 7, "$f7">, DwarfRegNum<[40]>;
+def F8 : FPR< 8, "$f8">, DwarfRegNum<[41]>;
+def F9 : FPR< 9, "$f9">, DwarfRegNum<[42]>;
+def F10 : FPR<10, "$f10">, DwarfRegNum<[43]>;
+def F11 : FPR<11, "$f11">, DwarfRegNum<[44]>;
+def F12 : FPR<12, "$f12">, DwarfRegNum<[45]>;
+def F13 : FPR<13, "$f13">, DwarfRegNum<[46]>;
+def F14 : FPR<14, "$f14">, DwarfRegNum<[47]>;
+def F15 : FPR<15, "$f15">, DwarfRegNum<[48]>;
+def F16 : FPR<16, "$f16">, DwarfRegNum<[49]>;
+def F17 : FPR<17, "$f17">, DwarfRegNum<[50]>;
+def F18 : FPR<18, "$f18">, DwarfRegNum<[51]>;
+def F19 : FPR<19, "$f19">, DwarfRegNum<[52]>;
+def F20 : FPR<20, "$f20">, DwarfRegNum<[53]>;
+def F21 : FPR<21, "$f21">, DwarfRegNum<[54]>;
+def F22 : FPR<22, "$f22">, DwarfRegNum<[55]>;
+def F23 : FPR<23, "$f23">, DwarfRegNum<[56]>;
+def F24 : FPR<24, "$f24">, DwarfRegNum<[57]>;
+def F25 : FPR<25, "$f25">, DwarfRegNum<[58]>;
+def F26 : FPR<26, "$f26">, DwarfRegNum<[59]>;
+def F27 : FPR<27, "$f27">, DwarfRegNum<[60]>;
+def F28 : FPR<28, "$f28">, DwarfRegNum<[61]>;
+def F29 : FPR<29, "$f29">, DwarfRegNum<[62]>;
+def F30 : FPR<30, "$f30">, DwarfRegNum<[63]>;
+def F31 : FPR<31, "$f31">, DwarfRegNum<[64]>;
+
+ //#define FP $15
+ //#define RA $26
+ //#define PV $27
+ //#define GP $29
+ //#define SP $30
+ // $28 is undefined after any and all calls
+
+/// Register classes
+def GPRC : RegisterClass<"Alpha", [i64], 64,
+ // Volatile
+ [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19, R20, R21, R22,
+ R23, R24, R25, R28,
+ //Special meaning, but volatile
+ R27, //procedure address
+ R26, //return address
+ R29, //global offset table address
+ // Non-volatile
+ R9, R10, R11, R12, R13, R14,
+// Don't allocate 15, 30, 31
+ R15, R30, R31 ]> //zero
+{
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GPRCClass::iterator
+ GPRCClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3;
+ }
+ }];
+}
+
+def F4RC : RegisterClass<"Alpha", [f32], 64, [F0, F1,
+ F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+ F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
+ // Saved:
+ F2, F3, F4, F5, F6, F7, F8, F9,
+ F31 ]> //zero
+{
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ F4RCClass::iterator
+ F4RCClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-1;
+ }
+ }];
+}
+
+def F8RC : RegisterClass<"Alpha", [f64], 64, [F0, F1,
+ F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+ F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
+ // Saved:
+ F2, F3, F4, F5, F6, F7, F8, F9,
+ F31 ]> //zero
+{
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ F8RCClass::iterator
+ F8RCClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-1;
+ }
+ }];
+}
diff --git a/lib/Target/Alpha/AlphaRelocations.h b/lib/Target/Alpha/AlphaRelocations.h
new file mode 100644
index 0000000..4c92045
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRelocations.h
@@ -0,0 +1,31 @@
+//===- AlphaRelocations.h - Alpha Code Relocations --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Alpha target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHARELOCATIONS_H
+#define ALPHARELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace Alpha {
+ enum RelocationType {
+ reloc_literal,
+ reloc_gprellow,
+ reloc_gprelhigh,
+ reloc_gpdist,
+ reloc_bsr
+ };
+ }
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaSchedule.td b/lib/Target/Alpha/AlphaSchedule.td
new file mode 100644
index 0000000..b7b4560
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSchedule.td
@@ -0,0 +1,84 @@
+//===- AlphaSchedule.td - Alpha Scheduling Definitions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//This is Table 2-2 from the 21264 Compiler Writer's Guide,
+//modified somewhat
+
+//Pipelines
+
+def L0 : FuncUnit;
+def L1 : FuncUnit;
+def FST0 : FuncUnit;
+def FST1 : FuncUnit;
+def U0 : FuncUnit;
+def U1 : FuncUnit;
+def FA : FuncUnit;
+def FM : FuncUnit;
+
+def s_ild : InstrItinClass;
+def s_fld : InstrItinClass;
+def s_ist : InstrItinClass;
+def s_fst : InstrItinClass;
+def s_lda : InstrItinClass;
+def s_rpcc : InstrItinClass;
+def s_rx : InstrItinClass;
+def s_mxpr : InstrItinClass;
+def s_icbr : InstrItinClass;
+def s_ubr : InstrItinClass;
+def s_jsr : InstrItinClass;
+def s_iadd : InstrItinClass;
+def s_ilog : InstrItinClass;
+def s_ishf : InstrItinClass;
+def s_cmov : InstrItinClass;
+def s_imul : InstrItinClass;
+def s_imisc : InstrItinClass;
+def s_fbr : InstrItinClass;
+def s_fadd : InstrItinClass;
+def s_fmul : InstrItinClass;
+def s_fcmov : InstrItinClass;
+def s_fdivt : InstrItinClass;
+def s_fdivs : InstrItinClass;
+def s_fsqrts: InstrItinClass;
+def s_fsqrtt: InstrItinClass;
+def s_ftoi : InstrItinClass;
+def s_itof : InstrItinClass;
+def s_pseudo : InstrItinClass;
+
+//Table 2-4: Instruction Class Latency in Cycles
+//modified somewhat
+
+def Alpha21264Itineraries : ProcessorItineraries<[
+ InstrItinData<s_ild , [InstrStage<3, [L0, L1]>]>,
+ InstrItinData<s_fld , [InstrStage<4, [L0, L1]>]>,
+ InstrItinData<s_ist , [InstrStage<0, [L0, L1]>]>,
+ InstrItinData<s_fst , [InstrStage<0, [FST0, FST1, L0, L1]>]>,
+ InstrItinData<s_lda , [InstrStage<1, [L0, L1, U0, U1]>]>,
+ InstrItinData<s_rpcc , [InstrStage<1, [L1]>]>,
+ InstrItinData<s_rx , [InstrStage<1, [L1]>]>,
+ InstrItinData<s_mxpr , [InstrStage<1, [L0, L1]>]>,
+ InstrItinData<s_icbr , [InstrStage<0, [U0, U1]>]>,
+ InstrItinData<s_ubr , [InstrStage<3, [U0, U1]>]>,
+ InstrItinData<s_jsr , [InstrStage<3, [L0]>]>,
+ InstrItinData<s_iadd , [InstrStage<1, [L0, U0, L1, U1]>]>,
+ InstrItinData<s_ilog , [InstrStage<1, [L0, U0, L1, U1]>]>,
+ InstrItinData<s_ishf , [InstrStage<1, [U0, U1]>]>,
+ InstrItinData<s_cmov , [InstrStage<1, [L0, U0, L1, U1]>]>,
+ InstrItinData<s_imul , [InstrStage<7, [U1]>]>,
+ InstrItinData<s_imisc , [InstrStage<3, [U0]>]>,
+ InstrItinData<s_fbr , [InstrStage<0, [FA]>]>,
+ InstrItinData<s_fadd , [InstrStage<6, [FA]>]>,
+ InstrItinData<s_fmul , [InstrStage<6, [FM]>]>,
+ InstrItinData<s_fcmov , [InstrStage<6, [FA]>]>,
+ InstrItinData<s_fdivs , [InstrStage<12, [FA]>]>,
+ InstrItinData<s_fdivt , [InstrStage<15, [FA]>]>,
+ InstrItinData<s_fsqrts , [InstrStage<18, [FA]>]>,
+ InstrItinData<s_fsqrtt , [InstrStage<33, [FA]>]>,
+ InstrItinData<s_ftoi , [InstrStage<3, [FST0, FST1, L0, L1]>]>,
+ InstrItinData<s_itof , [InstrStage<4, [L0, L1]>]>
+]>;
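+
+// Reading an entry: InstrItinData<s_imul, [InstrStage<7, [U1]>]> says an
+// integer multiply issues only to pipe U1 and its result is available
+// after 7 cycles, per the 21264 table cited above.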
diff --git a/lib/Target/Alpha/AlphaSubtarget.cpp b/lib/Target/Alpha/AlphaSubtarget.cpp
new file mode 100644
index 0000000..d5a9365
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSubtarget.cpp
@@ -0,0 +1,25 @@
+//===- AlphaSubtarget.cpp - Alpha Subtarget Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Alpha specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaSubtarget.h"
+#include "Alpha.h"
+#include "AlphaGenSubtarget.inc"
+using namespace llvm;
+
+AlphaSubtarget::AlphaSubtarget(const Module &M, const std::string &FS)
+ : HasCT(false) {
+ std::string CPU = "generic";
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/Alpha/AlphaSubtarget.h b/lib/Target/Alpha/AlphaSubtarget.h
new file mode 100644
index 0000000..0a944cb
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSubtarget.h
@@ -0,0 +1,47 @@
+//=====-- AlphaSubtarget.h - Define Subtarget for the Alpha --*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Alpha specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHASUBTARGET_H
+#define ALPHASUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+class AlphaSubtarget : public TargetSubtarget {
+protected:
+
+ bool HasCT;
+
+ InstrItineraryData InstrItins;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ ///
+ AlphaSubtarget(const Module &M, const std::string &FS);
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+ bool hasCT() const { return HasCT; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Alpha/AlphaTargetAsmInfo.cpp b/lib/Target/Alpha/AlphaTargetAsmInfo.cpp
new file mode 100644
index 0000000..6092ab6
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetAsmInfo.cpp
@@ -0,0 +1,31 @@
+//===-- AlphaTargetAsmInfo.cpp - Alpha asm properties -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AlphaTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaTargetMachine.h"
+#include "AlphaTargetAsmInfo.h"
+
+using namespace llvm;
+
+AlphaTargetAsmInfo::AlphaTargetAsmInfo(const AlphaTargetMachine &TM)
+ : TargetAsmInfo(TM) {
+ AlignmentIsInBytes = false;
+ PrivateGlobalPrefix = "$";
+ JumpTableDirective = ".gprel32";
+ JumpTableDataSection = "\t.section .rodata\n";
+ WeakRefDirective = "\t.weak\t";
+}
+
+unsigned AlphaTargetAsmInfo::RelocBehaviour() const {
+ return (TM.getRelocationModel() != Reloc::Static ?
+ Reloc::LocalOrGlobal : Reloc::Global);
+}
diff --git a/lib/Target/Alpha/AlphaTargetAsmInfo.h b/lib/Target/Alpha/AlphaTargetAsmInfo.h
new file mode 100644
index 0000000..7675b26
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetAsmInfo.h
@@ -0,0 +1,32 @@
+//=====-- AlphaTargetAsmInfo.h - Alpha asm properties ---------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AlphaTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHATARGETASMINFO_H
+#define ALPHATARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+ // Forward declaration.
+ class AlphaTargetMachine;
+
+ struct AlphaTargetAsmInfo : public TargetAsmInfo {
+ explicit AlphaTargetAsmInfo(const AlphaTargetMachine &TM);
+
+ virtual unsigned RelocBehaviour() const;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AlphaTargetMachine.cpp b/lib/Target/Alpha/AlphaTargetMachine.cpp
new file mode 100644
index 0000000..4c83054
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetMachine.cpp
@@ -0,0 +1,126 @@
+//===-- AlphaTargetMachine.cpp - Define TargetMachine for Alpha -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaJITInfo.h"
+#include "AlphaTargetAsmInfo.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// AlphaTargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library. In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this. Though it is unused, do not remove it.
+extern "C" int AlphaTargetMachineModule;
+int AlphaTargetMachineModule = 0;
+
+// Register the targets
+static RegisterTarget<AlphaTargetMachine> X("alpha", "Alpha [experimental]");
+
+const TargetAsmInfo *AlphaTargetMachine::createTargetAsmInfo() const {
+ return new AlphaTargetAsmInfo(*this);
+}
+
+unsigned AlphaTargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "alpha*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 5 && TT[0] == 'a' && TT[1] == 'l' && TT[2] == 'p' &&
+ TT[3] == 'h' && TT[4] == 'a')
+ return 20;
+ // If the target triple is something non-alpha, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer64)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+unsigned AlphaTargetMachine::getJITMatchQuality() {
+#ifdef __alpha
+ return 10;
+#else
+ return 0;
+#endif
+}
+
+AlphaTargetMachine::AlphaTargetMachine(const Module &M, const std::string &FS)
+ : DataLayout("e-f128:128:128"),
+ FrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
+ JITInfo(*this),
+ Subtarget(M, FS),
+ TLInfo(*this) {
+ setRelocationModel(Reloc::PIC_);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool AlphaTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createAlphaISelDag(*this));
+ return false;
+}
+bool AlphaTargetMachine::addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Must run branch selection immediately preceding the asm printer
+ PM.add(createAlphaBranchSelectionPass());
+ return false;
+}
+bool AlphaTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ PM.add(createAlphaLLRPPass(*this));
+ PM.add(createAlphaCodePrinterPass(Out, *this, OptLevel, Verbose));
+ return false;
+}
+bool AlphaTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE) {
+ PM.add(createAlphaCodeEmitterPass(*this, MCE));
+ if (DumpAsm)
+ PM.add(createAlphaCodePrinterPass(errs(), *this, OptLevel, true));
+ return false;
+}
+bool AlphaTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE) {
+ PM.add(createAlphaJITCodeEmitterPass(*this, JCE));
+ if (DumpAsm)
+ PM.add(createAlphaCodePrinterPass(errs(), *this, OptLevel, true));
+ return false;
+}
+bool AlphaTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE) {
+ return addCodeEmitter(PM, OptLevel, DumpAsm, MCE);
+}
+bool AlphaTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE) {
+ return addCodeEmitter(PM, OptLevel, DumpAsm, JCE);
+}
+
diff --git a/lib/Target/Alpha/AlphaTargetMachine.h b/lib/Target/Alpha/AlphaTargetMachine.h
new file mode 100644
index 0000000..51224e8
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetMachine.h
@@ -0,0 +1,82 @@
+//===-- AlphaTargetMachine.h - Define TargetMachine for Alpha ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Alpha-specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHA_TARGETMACHINE_H
+#define ALPHA_TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaJITInfo.h"
+#include "AlphaISelLowering.h"
+#include "AlphaSubtarget.h"
+
+namespace llvm {
+
+class GlobalValue;
+
+class AlphaTargetMachine : public LLVMTargetMachine {
+ const TargetData DataLayout; // Calculates type size & alignment
+ AlphaInstrInfo InstrInfo;
+ TargetFrameInfo FrameInfo;
+ AlphaJITInfo JITInfo;
+ AlphaSubtarget Subtarget;
+ AlphaTargetLowering TLInfo;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+ AlphaTargetMachine(const Module &M, const std::string &FS);
+
+ virtual const AlphaInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual const AlphaSubtarget *getSubtargetImpl() const{ return &Subtarget; }
+ virtual const AlphaRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual AlphaTargetLowering* getTargetLowering() const {
+ return const_cast<AlphaTargetLowering*>(&TLInfo);
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual AlphaJITInfo* getJITInfo() {
+ return &JITInfo;
+ }
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+
+ // Pass Pipeline Configuration
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp b/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp
new file mode 100644
index 0000000..74b48ee6
--- /dev/null
+++ b/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp
@@ -0,0 +1,305 @@
+//===-- AlphaAsmPrinter.cpp - Alpha LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format Alpha assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ struct VISIBILITY_HIDDEN AlphaAsmPrinter : public AsmPrinter {
+ /// Unique incrementer for label values for referencing Global values.
+ ///
+
+ explicit AlphaAsmPrinter(raw_ostream &o, TargetMachine &tm,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(o, tm, T, OL, V) {}
+
+ virtual const char *getPassName() const {
+ return "Alpha Assembly Printer";
+ }
+ bool printInstruction(const MachineInstr *MI);
+ void printOp(const MachineOperand &MO, bool IsCallOp = false);
+ void printOperand(const MachineInstr *MI, int opNum);
+ void printBaseOffsetPair (const MachineInstr *MI, int i, bool brackets=true);
+ void printModuleLevelGV(const GlobalVariable* GVar);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode);
+ };
+} // end of anonymous namespace
+
+/// createAlphaCodePrinterPass - Returns a pass that prints the Alpha
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createAlphaCodePrinterPass(raw_ostream &o,
+ TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new AlphaAsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
+
+#include "AlphaGenAsmWriter.inc"
+
+void AlphaAsmPrinter::printOperand(const MachineInstr *MI, int opNum)
+{
+ const MachineOperand &MO = MI->getOperand(opNum);
+ if (MO.getType() == MachineOperand::MO_Register) {
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Not physreg??");
+ O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ } else if (MO.isImm()) {
+ O << MO.getImm();
+ assert(MO.getImm() < (1 << 30));
+ } else {
+ printOp(MO);
+ }
+}
+
+
+void AlphaAsmPrinter::printOp(const MachineOperand &MO, bool IsCallOp) {
+ const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << RI.get(MO.getReg()).AsmName;
+ return;
+
+ case MachineOperand::MO_Immediate:
+ cerr << "printOp() does not handle immediate values\n";
+ abort();
+ return;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ << MO.getIndex();
+ return;
+
+ case MachineOperand::MO_ExternalSymbol:
+ O << MO.getSymbolName();
+ return;
+
+ case MachineOperand::MO_GlobalAddress: {
+ GlobalValue *GV = MO.getGlobal();
+ O << Mang->getValueName(GV);
+ if (GV->isDeclaration() && GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ return;
+ }
+
+ case MachineOperand::MO_JumpTableIndex:
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ return;
+
+ default:
+ O << "<unknown operand type: " << MO.getType() << ">";
+ return;
+ }
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool AlphaAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out jump tables referenced by the function
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ EmitAlignment(4, F);
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ break;
+ case Function::ExternalLinkage:
+ O << "\t.globl " << CurrentFnName << "\n";
+ break;
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ O << TAI->getWeakRefDirective() << CurrentFnName << "\n";
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ O << "\t.ent " << CurrentFnName << "\n";
+
+ O << CurrentFnName << ":\n";
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ if (I != MF.begin()) {
+ printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ ++EmittedInsts;
+ if (!printInstruction(II)) {
+ assert(0 && "Unhandled instruction in asm writer!");
+ abort();
+ }
+ }
+ }
+
+ O << "\t.end " << CurrentFnName << "\n";
+
+ // We didn't modify anything.
+ return false;
+}
+
+bool AlphaAsmPrinter::doInitialization(Module &M)
+{
+ if(TM.getSubtarget<AlphaSubtarget>().hasCT())
+ O << "\t.arch ev6\n"; //This might need to be ev67, so leave this test here
+ else
+ O << "\t.arch ev6\n";
+ O << "\t.set noat\n";
+ return AsmPrinter::doInitialization(M);
+}
+
+void AlphaAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer()) return; // External globals require no code.
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar))
+ return;
+
+ std::string name = Mang->getValueName(GVar);
+ Constant *C = GVar->getInitializer();
+ unsigned Size = TD->getTypeAllocSize(C->getType());
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+
+ // 0: Switch to section
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ // 1: Check visibility
+ printVisibility(name, GVar->getVisibility());
+
+ // 2: Kind
+ switch (GVar->getLinkage()) {
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::CommonLinkage:
+ O << TAI->getWeakRefDirective() << name << '\n';
+ break;
+ case GlobalValue::AppendingLinkage:
+ case GlobalValue::ExternalLinkage:
+ O << TAI->getGlobalDirective() << name << "\n";
+ break;
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ break;
+ default:
+ assert(0 && "Unknown linkage type!");
+ cerr << "Unknown linkage type!\n";
+ abort();
+ }
+
+ // 3: Type, Size, Align
+ if (TAI->hasDotTypeDotSizeDirective()) {
+ O << "\t.type\t" << name << ", @object\n";
+ O << "\t.size\t" << name << ", " << Size << "\n";
+ }
+
+ EmitAlignment(Align, GVar);
+
+ O << name << ":\n";
+
+ // If the initializer is an extern weak symbol, remember to emit the weak
+ // reference!
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ EmitGlobalConstant(C);
+ O << '\n';
+}
+
+bool AlphaAsmPrinter::doFinalization(Module &M) {
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ printModuleLevelGV(I);
+
+ return AsmPrinter::doFinalization(M);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool AlphaAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool AlphaAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+ O << "0(";
+ printOperand(MI, OpNo);
+ O << ")";
+ return false;
+}
diff --git a/lib/Target/Alpha/AsmPrinter/CMakeLists.txt b/lib/Target/Alpha/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..b62a7f6
--- /dev/null
+++ b/lib/Target/Alpha/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_partially_linked_object(LLVMAlphaAsmPrinter
+ AlphaAsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMAlphaCodeGen n)
+
+add_dependencies(LLVMAlphaAsmPrinter ${n})
diff --git a/lib/Target/Alpha/AsmPrinter/Makefile b/lib/Target/Alpha/AsmPrinter/Makefile
new file mode 100644
index 0000000..c5b3e94
--- /dev/null
+++ b/lib/Target/Alpha/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Alpha/AsmPrinter/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMAlphaAsmPrinter
+
+# Hack: we need to include the 'main' Alpha target directory to grab private headers.
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Alpha/CMakeLists.txt b/lib/Target/Alpha/CMakeLists.txt
new file mode 100644
index 0000000..1e535f7
--- /dev/null
+++ b/lib/Target/Alpha/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(LLVM_TARGET_DEFINITIONS Alpha.td)
+
+tablegen(AlphaGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(AlphaGenRegisterNames.inc -gen-register-enums)
+tablegen(AlphaGenRegisterInfo.inc -gen-register-desc)
+tablegen(AlphaGenInstrNames.inc -gen-instr-enums)
+tablegen(AlphaGenInstrInfo.inc -gen-instr-desc)
+tablegen(AlphaGenCodeEmitter.inc -gen-emitter)
+tablegen(AlphaGenAsmWriter.inc -gen-asm-writer)
+tablegen(AlphaGenDAGISel.inc -gen-dag-isel)
+tablegen(AlphaGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(AlphaCodeGen
+ AlphaBranchSelector.cpp
+ AlphaCodeEmitter.cpp
+ AlphaInstrInfo.cpp
+ AlphaISelDAGToDAG.cpp
+ AlphaISelLowering.cpp
+ AlphaJITInfo.cpp
+ AlphaLLRP.cpp
+ AlphaRegisterInfo.cpp
+ AlphaSubtarget.cpp
+ AlphaTargetAsmInfo.cpp
+ AlphaTargetMachine.cpp
+ )
diff --git a/lib/Target/Alpha/Makefile b/lib/Target/Alpha/Makefile
new file mode 100644
index 0000000..d6c82c7
--- /dev/null
+++ b/lib/Target/Alpha/Makefile
@@ -0,0 +1,22 @@
+##===- lib/Target/Alpha/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMAlphaCodeGen
+TARGET = Alpha
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = AlphaGenRegisterInfo.h.inc AlphaGenRegisterNames.inc \
+ AlphaGenRegisterInfo.inc AlphaGenInstrNames.inc \
+ AlphaGenInstrInfo.inc AlphaGenCodeEmitter.inc \
+ AlphaGenAsmWriter.inc AlphaGenDAGISel.inc \
+ AlphaGenSubtarget.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Alpha/README.txt b/lib/Target/Alpha/README.txt
new file mode 100644
index 0000000..9ae1517
--- /dev/null
+++ b/lib/Target/Alpha/README.txt
@@ -0,0 +1,42 @@
+***
+
+add gcc builtins for alpha instructions
+
+
+***
+
+custom expand byteswap into nifty
+extract/insert/mask byte/word/longword/quadword low/high
+sequences
+
+***
+
+see if any of the extract/insert/mask operations can be added
+
+***
+
+match more interesting things for cmovlbc cmovlbs (move if low bit clear/set)
+
+***
+
+lower srem and urem
+
+remq(i,j): i - (j * divq(i,j)) if j != 0
+remqu(i,j): i - (j * divqu(i,j)) if j != 0
+reml(i,j): i - (j * divl(i,j)) if j != 0
+remlu(i,j): i - (j * divlu(i,j)) if j != 0
+
+***
+
+add crazy vector instructions (MVI):
+
+(MIN|MAX)(U|S)(B8|W4) min and max, signed and unsigned, byte and word
+PKWB, UNPKBW pack/unpack word to byte
+PKLB UNPKBL pack/unpack long to byte
+PERR pixel error (sum across bytes of bytewise abs(i8v8 a - i8v8 b))
+
+cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extensions)
+
+This page has some good examples of other operations (such as saturating add)
+that can be synthesized well from these rather meager vector ops:
+http://www.alphalinux.org/docs/MVI-full.html
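+
+A minimal C sketch of the rem-via-div identity above, assuming j != 0
+(illustrative only; divq/divqu correspond to signed/unsigned 64-bit divides):
+
+  #include <stdint.h>
+
+  /* remq: signed 64-bit remainder synthesized from divide and multiply */
+  static int64_t remq(int64_t i, int64_t j) { return i - j * (i / j); }
+
+  /* remqu: unsigned variant, same identity */
+  static uint64_t remqu(uint64_t i, uint64_t j) { return i - j * (i / j); }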
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
new file mode 100644
index 0000000..4d7b545
--- /dev/null
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -0,0 +1,3601 @@
+//===-- CBackend.cpp - Library for converting LLVM code to C --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library converts LLVM code to C code, compilable by GCC and other C
+// compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Config/config.h"
+#include <algorithm>
+#include <sstream>
+using namespace llvm;
+
+/// CBackendTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int CBackendTargetMachineModule;
+int CBackendTargetMachineModule = 0;
+
+// Register the target.
+static RegisterTarget<CTargetMachine> X("c", "C backend");
+
+namespace {
+ /// CBackendNameAllUsedStructsAndMergeFunctions - This pass inserts names for
+ /// any unnamed structure types that are used by the program, and merges
+ /// external functions with the same name.
+ ///
+ class CBackendNameAllUsedStructsAndMergeFunctions : public ModulePass {
+ public:
+ static char ID;
+ CBackendNameAllUsedStructsAndMergeFunctions()
+ : ModulePass(&ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<FindUsedTypes>();
+ }
+
+ virtual const char *getPassName() const {
+ return "C backend type canonicalizer";
+ }
+
+ virtual bool runOnModule(Module &M);
+ };
+
+ char CBackendNameAllUsedStructsAndMergeFunctions::ID = 0;
+
+ /// CWriter - This class is the main chunk of code that converts an LLVM
+ /// module to a C translation unit.
+ class CWriter : public FunctionPass, public InstVisitor<CWriter> {
+ raw_ostream &Out;
+ IntrinsicLowering *IL;
+ Mangler *Mang;
+ LoopInfo *LI;
+ const Module *TheModule;
+ const TargetAsmInfo* TAsm;
+ const TargetData* TD;
+ std::map<const Type *, std::string> TypeNames;
+ std::map<const ConstantFP *, unsigned> FPConstantMap;
+ std::set<Function*> intrinsicPrototypesAlreadyGenerated;
+ std::set<const Argument*> ByValParams;
+ unsigned FPCounter;
+
+ public:
+ static char ID;
+ explicit CWriter(raw_ostream &o)
+ : FunctionPass(&ID), Out(o), IL(0), Mang(0), LI(0),
+ TheModule(0), TAsm(0), TD(0) {
+ FPCounter = 0;
+ }
+
+ virtual const char *getPassName() const { return "C backend"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LoopInfo>();
+ AU.setPreservesAll();
+ }
+
+ virtual bool doInitialization(Module &M);
+
+ bool runOnFunction(Function &F) {
+ // Do not codegen any 'available_externally' functions at all, they have
+ // definitions outside the translation unit.
+ if (F.hasAvailableExternallyLinkage())
+ return false;
+
+ LI = &getAnalysis<LoopInfo>();
+
+ // Get rid of intrinsics we can't handle.
+ lowerIntrinsics(F);
+
+ // Output all floating point constants that cannot be printed accurately.
+ printFloatingPointConstants(F);
+
+ printFunction(F);
+ return false;
+ }
+
+ virtual bool doFinalization(Module &M) {
+ // Free memory...
+ delete IL;
+ delete TD;
+ delete Mang;
+ FPConstantMap.clear();
+ TypeNames.clear();
+ ByValParams.clear();
+ intrinsicPrototypesAlreadyGenerated.clear();
+ return false;
+ }
+
+ raw_ostream &printType(raw_ostream &Out, const Type *Ty,
+ bool isSigned = false,
+ const std::string &VariableName = "",
+ bool IgnoreName = false,
+ const AttrListPtr &PAL = AttrListPtr());
+ std::ostream &printType(std::ostream &Out, const Type *Ty,
+ bool isSigned = false,
+ const std::string &VariableName = "",
+ bool IgnoreName = false,
+ const AttrListPtr &PAL = AttrListPtr());
+ raw_ostream &printSimpleType(raw_ostream &Out, const Type *Ty,
+ bool isSigned,
+ const std::string &NameSoFar = "");
+ std::ostream &printSimpleType(std::ostream &Out, const Type *Ty,
+ bool isSigned,
+ const std::string &NameSoFar = "");
+
+ void printStructReturnPointerFunctionType(raw_ostream &Out,
+ const AttrListPtr &PAL,
+ const PointerType *Ty);
+
+ /// writeOperandDeref - Print the result of dereferencing the specified
+ /// operand with '*'. This is equivalent to printing '*' then using
+ /// writeOperand, but avoids excess syntax in some cases.
+ void writeOperandDeref(Value *Operand) {
+ if (isAddressExposed(Operand)) {
+ // Already something with an address exposed.
+ writeOperandInternal(Operand);
+ } else {
+ Out << "*(";
+ writeOperand(Operand);
+ Out << ")";
+ }
+ }
+
+ void writeOperand(Value *Operand, bool Static = false);
+ void writeInstComputationInline(Instruction &I);
+ void writeOperandInternal(Value *Operand, bool Static = false);
+ void writeOperandWithCast(Value* Operand, unsigned Opcode);
+ void writeOperandWithCast(Value* Operand, const ICmpInst &I);
+ bool writeInstructionCast(const Instruction &I);
+
+ void writeMemoryAccess(Value *Operand, const Type *OperandType,
+ bool IsVolatile, unsigned Alignment);
+
+ private:
+ std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c);
+
+ void lowerIntrinsics(Function &F);
+
+ void printModule(Module *M);
+ void printModuleTypes(const TypeSymbolTable &ST);
+ void printContainedStructs(const Type *Ty, std::set<const Type *> &);
+ void printFloatingPointConstants(Function &F);
+ void printFloatingPointConstants(const Constant *C);
+ void printFunctionSignature(const Function *F, bool Prototype);
+
+ void printFunction(Function &);
+ void printBasicBlock(BasicBlock *BB);
+ void printLoop(Loop *L);
+
+ void printCast(unsigned opcode, const Type *SrcTy, const Type *DstTy);
+ void printConstant(Constant *CPV, bool Static);
+ void printConstantWithCast(Constant *CPV, unsigned Opcode);
+ bool printConstExprCast(const ConstantExpr *CE, bool Static);
+ void printConstantArray(ConstantArray *CPA, bool Static);
+ void printConstantVector(ConstantVector *CV, bool Static);
+
+ /// isAddressExposed - Return true if the specified value's name needs to
+ /// have its address taken in order to get a C value of the correct type.
+ /// This happens for global variables, byval parameters, and direct allocas.
+ bool isAddressExposed(const Value *V) const {
+ if (const Argument *A = dyn_cast<Argument>(V))
+ return ByValParams.count(A);
+ return isa<GlobalVariable>(V) || isDirectAlloca(V);
+ }
+
+ // isInlinableInst - Attempt to inline instructions into their uses to build
+ // trees as much as possible. To do this, we have to consistently decide
+ // what is acceptable to inline, so that variable declarations don't get
+ // printed and an extra copy of the expr is not emitted.
+ //
+ static bool isInlinableInst(const Instruction &I) {
+ // Always inline cmp instructions, even if they are shared by multiple
+ // expressions. GCC generates horrible code if we don't.
+ if (isa<CmpInst>(I))
+ return true;
+
+ // Must be an expression, must be used exactly once. If it is dead, we
+ // emit it inline where it would go.
+ if (I.getType() == Type::VoidTy || !I.hasOneUse() ||
+ isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) ||
+ isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) ||
+ isa<InsertValueInst>(I))
+ // Don't inline a load across a store or other bad things!
+ return false;
+
+ // Must not be used in inline asm, extractelement, or shufflevector.
+ if (I.hasOneUse()) {
+ const Instruction &User = cast<Instruction>(*I.use_back());
+ if (isInlineAsm(User) || isa<ExtractElementInst>(User) ||
+ isa<ShuffleVectorInst>(User))
+ return false;
+ }
+
+ // Only inline an instruction if its use is in the same BB as the inst.
+ return I.getParent() == cast<Instruction>(I.use_back())->getParent();
+ }
+
+ // isDirectAlloca - Define fixed sized allocas in the entry block as direct
+ // variables which are accessed with the & operator. This causes GCC to
+ // generate significantly better code than to emit alloca calls directly.
+ //
+ static const AllocaInst *isDirectAlloca(const Value *V) {
+ const AllocaInst *AI = dyn_cast<AllocaInst>(V);
+ if (!AI) return 0;
+ if (AI->isArrayAllocation())
+ return 0; // FIXME: we can also inline fixed size array allocas!
+ if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock())
+ return 0;
+ return AI;
+ }
+
+ // isInlineAsm - Check if the instruction is a call to an inline asm chunk
+ static bool isInlineAsm(const Instruction& I) {
+ if (isa<CallInst>(&I) && isa<InlineAsm>(I.getOperand(0)))
+ return true;
+ return false;
+ }
+
+ // Instruction visitation functions
+ friend class InstVisitor<CWriter>;
+
+ void visitReturnInst(ReturnInst &I);
+ void visitBranchInst(BranchInst &I);
+ void visitSwitchInst(SwitchInst &I);
+ void visitInvokeInst(InvokeInst &I) {
+ assert(0 && "Lowerinvoke pass didn't work!");
+ }
+
+ void visitUnwindInst(UnwindInst &I) {
+ assert(0 && "Lowerinvoke pass didn't work!");
+ }
+ void visitUnreachableInst(UnreachableInst &I);
+
+ void visitPHINode(PHINode &I);
+ void visitBinaryOperator(Instruction &I);
+ void visitICmpInst(ICmpInst &I);
+ void visitFCmpInst(FCmpInst &I);
+
+ void visitCastInst (CastInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitCallInst (CallInst &I);
+ void visitInlineAsm(CallInst &I);
+ bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID, bool &WroteCallee);
+
+ void visitMallocInst(MallocInst &I);
+ void visitAllocaInst(AllocaInst &I);
+ void visitFreeInst (FreeInst &I);
+ void visitLoadInst (LoadInst &I);
+ void visitStoreInst (StoreInst &I);
+ void visitGetElementPtrInst(GetElementPtrInst &I);
+ void visitVAArgInst (VAArgInst &I);
+
+ void visitInsertElementInst(InsertElementInst &I);
+ void visitExtractElementInst(ExtractElementInst &I);
+ void visitShuffleVectorInst(ShuffleVectorInst &SVI);
+
+ void visitInsertValueInst(InsertValueInst &I);
+ void visitExtractValueInst(ExtractValueInst &I);
+
+ void visitInstruction(Instruction &I) {
+ cerr << "C Writer does not know about " << I;
+ abort();
+ }
+
+ void outputLValue(Instruction *I) {
+ Out << " " << GetValueName(I) << " = ";
+ }
+
+ bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To);
+ void printPHICopiesForSuccessor(BasicBlock *CurBlock,
+ BasicBlock *Successor, unsigned Indent);
+ void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock,
+ unsigned Indent);
+ void printGEPExpression(Value *Ptr, gep_type_iterator I,
+ gep_type_iterator E, bool Static);
+
+ std::string GetValueName(const Value *Operand);
+ };
+}
+
+char CWriter::ID = 0;
+
+/// This method inserts names for any unnamed structure types that are used by
+/// the program, and removes names from structure types that are not used by the
+/// program.
+///
+bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
+ // Get a set of types that are used by the program...
+ std::set<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes();
+
+ // Loop over the module symbol table, removing types from UT that are
+ // already named, and removing names for types that are not used.
+ //
+ TypeSymbolTable &TST = M.getTypeSymbolTable();
+ for (TypeSymbolTable::iterator TI = TST.begin(), TE = TST.end();
+ TI != TE; ) {
+ TypeSymbolTable::iterator I = TI++;
+
+ // If this isn't a struct or array type, remove it from our set of types
+ // to name. This simplifies emission later.
+ if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second) &&
+ !isa<ArrayType>(I->second)) {
+ TST.remove(I);
+ } else {
+ // If this is not used, remove it from the symbol table.
+ std::set<const Type *>::iterator UTI = UT.find(I->second);
+ if (UTI == UT.end())
+ TST.remove(I);
+ else
+ UT.erase(UTI); // Only keep one name for this type.
+ }
+ }
+
+ // UT now contains types that are not named. Loop over it, naming
+ // structure types.
+ //
+ bool Changed = false;
+ unsigned RenameCounter = 0;
+ for (std::set<const Type *>::const_iterator I = UT.begin(), E = UT.end();
+ I != E; ++I)
+ if (isa<StructType>(*I) || isa<ArrayType>(*I)) {
+ while (M.addTypeName("unnamed"+utostr(RenameCounter), *I))
+ ++RenameCounter;
+ Changed = true;
+ }
+
+
+ // Loop over all external functions and globals. If we have two with
+ // identical names, merge them.
+ // FIXME: This code should disappear when we don't allow values with the same
+ // names when they have different types!
+ std::map<std::string, GlobalValue*> ExtSymbols;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E;) {
+ Function *GV = I++;
+ if (GV->isDeclaration() && GV->hasName()) {
+ std::pair<std::map<std::string, GlobalValue*>::iterator, bool> X
+ = ExtSymbols.insert(std::make_pair(GV->getName(), GV));
+ if (!X.second) {
+ // Found a conflict, replace this global with the previous one.
+ GlobalValue *OldGV = X.first->second;
+ GV->replaceAllUsesWith(ConstantExpr::getBitCast(OldGV, GV->getType()));
+ GV->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+ // Do the same for globals.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E;) {
+ GlobalVariable *GV = I++;
+ if (GV->isDeclaration() && GV->hasName()) {
+ std::pair<std::map<std::string, GlobalValue*>::iterator, bool> X
+ = ExtSymbols.insert(std::make_pair(GV->getName(), GV));
+ if (!X.second) {
+ // Found a conflict, replace this global with the previous one.
+ GlobalValue *OldGV = X.first->second;
+ GV->replaceAllUsesWith(ConstantExpr::getBitCast(OldGV, GV->getType()));
+ GV->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+/// printStructReturnPointerFunctionType - This is like printType for a struct
+/// return type, except, instead of printing the type as void (*)(Struct*, ...)
+/// print it as "Struct (*)(...)", for struct return functions.
+void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out,
+ const AttrListPtr &PAL,
+ const PointerType *TheTy) {
+ const FunctionType *FTy = cast<FunctionType>(TheTy->getElementType());
+ std::stringstream FunctionInnards;
+ FunctionInnards << " (*) (";
+ bool PrintedType = false;
+
+ FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end();
+ const Type *RetTy = cast<PointerType>(I->get())->getElementType();
+ unsigned Idx = 1;
+ for (++I, ++Idx; I != E; ++I, ++Idx) {
+ if (PrintedType)
+ FunctionInnards << ", ";
+ const Type *ArgTy = *I;
+ if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
+ assert(isa<PointerType>(ArgTy));
+ ArgTy = cast<PointerType>(ArgTy)->getElementType();
+ }
+ printType(FunctionInnards, ArgTy,
+ /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), "");
+ PrintedType = true;
+ }
+ if (FTy->isVarArg()) {
+ if (PrintedType)
+ FunctionInnards << ", ...";
+ } else if (!PrintedType) {
+ FunctionInnards << "void";
+ }
+ FunctionInnards << ')';
+ std::string tstr = FunctionInnards.str();
+ printType(Out, RetTy,
+ /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr);
+}
+
+raw_ostream &
+CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
+ const std::string &NameSoFar) {
+ assert((Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) &&
+ "Invalid type for printSimpleType");
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: return Out << "void " << NameSoFar;
+ case Type::IntegerTyID: {
+ unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+ if (NumBits == 1)
+ return Out << "bool " << NameSoFar;
+ else if (NumBits <= 8)
+ return Out << (isSigned?"signed":"unsigned") << " char " << NameSoFar;
+ else if (NumBits <= 16)
+ return Out << (isSigned?"signed":"unsigned") << " short " << NameSoFar;
+ else if (NumBits <= 32)
+ return Out << (isSigned?"signed":"unsigned") << " int " << NameSoFar;
+ else if (NumBits <= 64)
+ return Out << (isSigned?"signed":"unsigned") << " long long "<< NameSoFar;
+ else {
+ assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
+ return Out << (isSigned?"llvmInt128":"llvmUInt128") << " " << NameSoFar;
+ }
+ }
+ case Type::FloatTyID: return Out << "float " << NameSoFar;
+ case Type::DoubleTyID: return Out << "double " << NameSoFar;
+ // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
+ // present matches host 'long double'.
+ case Type::X86_FP80TyID:
+ case Type::PPC_FP128TyID:
+ case Type::FP128TyID: return Out << "long double " << NameSoFar;
+
+ case Type::VectorTyID: {
+ const VectorType *VTy = cast<VectorType>(Ty);
+ return printSimpleType(Out, VTy->getElementType(), isSigned,
+ " __attribute__((vector_size(" +
+ utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar);
+ }
+
+ default:
+ cerr << "Unknown primitive type: " << *Ty << "\n";
+ abort();
+ }
+}
+
+std::ostream &
+CWriter::printSimpleType(std::ostream &Out, const Type *Ty, bool isSigned,
+ const std::string &NameSoFar) {
+ assert((Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) &&
+ "Invalid type for printSimpleType");
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: return Out << "void " << NameSoFar;
+ case Type::IntegerTyID: {
+ unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+ if (NumBits == 1)
+ return Out << "bool " << NameSoFar;
+ else if (NumBits <= 8)
+ return Out << (isSigned?"signed":"unsigned") << " char " << NameSoFar;
+ else if (NumBits <= 16)
+ return Out << (isSigned?"signed":"unsigned") << " short " << NameSoFar;
+ else if (NumBits <= 32)
+ return Out << (isSigned?"signed":"unsigned") << " int " << NameSoFar;
+ else if (NumBits <= 64)
+ return Out << (isSigned?"signed":"unsigned") << " long long "<< NameSoFar;
+ else {
+ assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
+ return Out << (isSigned?"llvmInt128":"llvmUInt128") << " " << NameSoFar;
+ }
+ }
+ case Type::FloatTyID: return Out << "float " << NameSoFar;
+ case Type::DoubleTyID: return Out << "double " << NameSoFar;
+ // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
+ // present matches host 'long double'.
+ case Type::X86_FP80TyID:
+ case Type::PPC_FP128TyID:
+ case Type::FP128TyID: return Out << "long double " << NameSoFar;
+
+ case Type::VectorTyID: {
+ const VectorType *VTy = cast<VectorType>(Ty);
+ return printSimpleType(Out, VTy->getElementType(), isSigned,
+ " __attribute__((vector_size(" +
+ utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar);
+ }
+
+ default:
+ cerr << "Unknown primitive type: " << *Ty << "\n";
+ abort();
+ }
+}
+
+// Pass the Type* and the variable name and this prints out the variable
+// declaration.
+//
+raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
+ bool isSigned, const std::string &NameSoFar,
+ bool IgnoreName, const AttrListPtr &PAL) {
+ if (Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) {
+ printSimpleType(Out, Ty, isSigned, NameSoFar);
+ return Out;
+ }
+
+ // Check to see if the type is named.
+ if (!IgnoreName || isa<OpaqueType>(Ty)) {
+ std::map<const Type *, std::string>::iterator I = TypeNames.find(Ty);
+ if (I != TypeNames.end()) return Out << I->second << ' ' << NameSoFar;
+ }
+
+ switch (Ty->getTypeID()) {
+ case Type::FunctionTyID: {
+ const FunctionType *FTy = cast<FunctionType>(Ty);
+ std::stringstream FunctionInnards;
+ FunctionInnards << " (" << NameSoFar << ") (";
+ unsigned Idx = 1;
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I) {
+ const Type *ArgTy = *I;
+ if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
+ assert(isa<PointerType>(ArgTy));
+ ArgTy = cast<PointerType>(ArgTy)->getElementType();
+ }
+ if (I != FTy->param_begin())
+ FunctionInnards << ", ";
+ printType(FunctionInnards, ArgTy,
+ /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), "");
+ ++Idx;
+ }
+ if (FTy->isVarArg()) {
+ if (FTy->getNumParams())
+ FunctionInnards << ", ...";
+ } else if (!FTy->getNumParams()) {
+ FunctionInnards << "void";
+ }
+ FunctionInnards << ')';
+ std::string tstr = FunctionInnards.str();
+ printType(Out, FTy->getReturnType(),
+ /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr);
+ return Out;
+ }
+ case Type::StructTyID: {
+ const StructType *STy = cast<StructType>(Ty);
+ Out << NameSoFar + " {\n";
+ unsigned Idx = 0;
+ for (StructType::element_iterator I = STy->element_begin(),
+ E = STy->element_end(); I != E; ++I) {
+ Out << " ";
+ printType(Out, *I, false, "field" + utostr(Idx++));
+ Out << ";\n";
+ }
+ Out << '}';
+ if (STy->isPacked())
+ Out << " __attribute__ ((packed))";
+ return Out;
+ }
+
+ case Type::PointerTyID: {
+ const PointerType *PTy = cast<PointerType>(Ty);
+ std::string ptrName = "*" + NameSoFar;
+
+ if (isa<ArrayType>(PTy->getElementType()) ||
+ isa<VectorType>(PTy->getElementType()))
+ ptrName = "(" + ptrName + ")";
+
+ if (!PAL.isEmpty())
+ // Must be a function ptr cast!
+ return printType(Out, PTy->getElementType(), false, ptrName, true, PAL);
+ return printType(Out, PTy->getElementType(), false, ptrName);
+ }
+
+ case Type::ArrayTyID: {
+ const ArrayType *ATy = cast<ArrayType>(Ty);
+ unsigned NumElements = ATy->getNumElements();
+ if (NumElements == 0) NumElements = 1;
+ // Arrays are wrapped in structs to allow them to have normal
+ // value semantics (avoiding the array "decay").
+ Out << NameSoFar << " { ";
+ printType(Out, ATy->getElementType(), false,
+ "array[" + utostr(NumElements) + "]");
+ return Out << "; }";
+ }
+
+ case Type::OpaqueTyID: {
+ static int Count = 0;
+ std::string TyName = "struct opaque_" + itostr(Count++);
+ assert(TypeNames.find(Ty) == TypeNames.end());
+ TypeNames[Ty] = TyName;
+ return Out << TyName << ' ' << NameSoFar;
+ }
+ default:
+ assert(0 && "Unhandled case in getTypeProps!");
+ abort();
+ }
+
+ return Out;
+}
+
+// Pass the Type* and the variable name and this prints out the variable
+// declaration.
+//
+std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty,
+ bool isSigned, const std::string &NameSoFar,
+ bool IgnoreName, const AttrListPtr &PAL) {
+ if (Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) {
+ printSimpleType(Out, Ty, isSigned, NameSoFar);
+ return Out;
+ }
+
+ // Check to see if the type is named.
+ if (!IgnoreName || isa<OpaqueType>(Ty)) {
+ std::map<const Type *, std::string>::iterator I = TypeNames.find(Ty);
+ if (I != TypeNames.end()) return Out << I->second << ' ' << NameSoFar;
+ }
+
+ switch (Ty->getTypeID()) {
+ case Type::FunctionTyID: {
+ const FunctionType *FTy = cast<FunctionType>(Ty);
+ std::stringstream FunctionInnards;
+ FunctionInnards << " (" << NameSoFar << ") (";
+ unsigned Idx = 1;
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I) {
+ const Type *ArgTy = *I;
+ if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
+ assert(isa<PointerType>(ArgTy));
+ ArgTy = cast<PointerType>(ArgTy)->getElementType();
+ }
+ if (I != FTy->param_begin())
+ FunctionInnards << ", ";
+ printType(FunctionInnards, ArgTy,
+ /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), "");
+ ++Idx;
+ }
+ if (FTy->isVarArg()) {
+ if (FTy->getNumParams())
+ FunctionInnards << ", ...";
+ } else if (!FTy->getNumParams()) {
+ FunctionInnards << "void";
+ }
+ FunctionInnards << ')';
+ std::string tstr = FunctionInnards.str();
+ printType(Out, FTy->getReturnType(),
+ /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr);
+ return Out;
+ }
+ case Type::StructTyID: {
+ const StructType *STy = cast<StructType>(Ty);
+ Out << NameSoFar + " {\n";
+ unsigned Idx = 0;
+ for (StructType::element_iterator I = STy->element_begin(),
+ E = STy->element_end(); I != E; ++I) {
+ Out << " ";
+ printType(Out, *I, false, "field" + utostr(Idx++));
+ Out << ";\n";
+ }
+ Out << '}';
+ if (STy->isPacked())
+ Out << " __attribute__ ((packed))";
+ return Out;
+ }
+
+ case Type::PointerTyID: {
+ const PointerType *PTy = cast<PointerType>(Ty);
+ std::string ptrName = "*" + NameSoFar;
+
+ if (isa<ArrayType>(PTy->getElementType()) ||
+ isa<VectorType>(PTy->getElementType()))
+ ptrName = "(" + ptrName + ")";
+
+ if (!PAL.isEmpty())
+ // Must be a function ptr cast!
+ return printType(Out, PTy->getElementType(), false, ptrName, true, PAL);
+ return printType(Out, PTy->getElementType(), false, ptrName);
+ }
+
+ case Type::ArrayTyID: {
+ const ArrayType *ATy = cast<ArrayType>(Ty);
+ unsigned NumElements = ATy->getNumElements();
+ if (NumElements == 0) NumElements = 1;
+ // Arrays are wrapped in structs to allow them to have normal
+ // value semantics (avoiding the array "decay").
+ Out << NameSoFar << " { ";
+ printType(Out, ATy->getElementType(), false,
+ "array[" + utostr(NumElements) + "]");
+ return Out << "; }";
+ }
+
+ case Type::OpaqueTyID: {
+ static int Count = 0;
+ std::string TyName = "struct opaque_" + itostr(Count++);
+ assert(TypeNames.find(Ty) == TypeNames.end());
+ TypeNames[Ty] = TyName;
+ return Out << TyName << ' ' << NameSoFar;
+ }
+ default:
+ assert(0 && "Unhandled case in getTypeProps!");
+ abort();
+ }
+
+ return Out;
+}
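+
+// As the ArrayTyID case above notes, an LLVM array type such as [4 x i32]
+// is emitted wrapped in a struct so that assignment copies the whole array
+// (a bare C array decays to a pointer and cannot be assigned). Roughly
+// (illustrative sketch; the backend derives its own struct names):
+//
+//   struct l_array_example { int array[4]; };  /* hypothetical name */
+//   struct l_array_example a, b;
+//   a = b;   /* copies all four elements, i.e. value semantics */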
+
+void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
+
+ // As a special case, print the array as a string if it is an array of
+ // ubytes or an array of sbytes with positive values.
+ //
+ const Type *ETy = CPA->getType()->getElementType();
+ bool isString = (ETy == Type::Int8Ty); // i8 covers both the old ubyte/sbyte cases
+
+ // Make sure the last character is a null char, as automatically added by C
+ if (isString && (CPA->getNumOperands() == 0 ||
+ !cast<Constant>(*(CPA->op_end()-1))->isNullValue()))
+ isString = false;
+
+ if (isString) {
+ Out << '\"';
+ // Keep track of whether the last number was a hexadecimal escape
+ bool LastWasHex = false;
+
+ // Do not include the last character, which we know is null
+ for (unsigned i = 0, e = CPA->getNumOperands()-1; i != e; ++i) {
+ unsigned char C = cast<ConstantInt>(CPA->getOperand(i))->getZExtValue();
+
+ // Print it out literally if it is a printable character. The only thing
+ // to be careful about is when the last letter output was a hex escape
+ // code, in which case we have to be careful not to print out hex digits
+ // explicitly (the C compiler thinks it is a continuation of the previous
+ // character, sheesh...)
+ //
+ if (isprint(C) && (!LastWasHex || !isxdigit(C))) {
+ LastWasHex = false;
+ if (C == '"' || C == '\\')
+ Out << "\\" << (char)C;
+ else
+ Out << (char)C;
+ } else {
+ LastWasHex = false;
+ switch (C) {
+ case '\n': Out << "\\n"; break;
+ case '\t': Out << "\\t"; break;
+ case '\r': Out << "\\r"; break;
+ case '\v': Out << "\\v"; break;
+ case '\a': Out << "\\a"; break;
+ case '\"': Out << "\\\""; break;
+ case '\'': Out << "\\\'"; break;
+ default:
+ Out << "\\x";
+ Out << (char)(( C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A'));
+ Out << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
+ LastWasHex = true;
+ break;
+ }
+ }
+ }
+ Out << '\"';
+ } else {
+ Out << '{';
+ if (CPA->getNumOperands()) {
+ Out << ' ';
+ printConstant(cast<Constant>(CPA->getOperand(0)), Static);
+ for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) {
+ Out << ", ";
+ printConstant(cast<Constant>(CPA->getOperand(i)), Static);
+ }
+ }
+ Out << " }";
+ }
+}
+
+void CWriter::printConstantVector(ConstantVector *CP, bool Static) {
+ Out << '{';
+ if (CP->getNumOperands()) {
+ Out << ' ';
+ printConstant(cast<Constant>(CP->getOperand(0)), Static);
+ for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) {
+ Out << ", ";
+ printConstant(cast<Constant>(CP->getOperand(i)), Static);
+ }
+ }
+ Out << " }";
+}
+
+// isFPCSafeToPrint - Returns true if we may assume that CFP may be written out
+// textually as a double (rather than as a reference to a stack-allocated
+// variable). We decide this by converting CFP to a string and back into a
+// double, and then checking whether the conversion results in a bit-equal
+// double to the original value of CFP. This depends on us and the target C
+// compiler agreeing on the conversion process (which is pretty likely since we
+// only deal in IEEE FP).
+//
+static bool isFPCSafeToPrint(const ConstantFP *CFP) {
+ bool ignored;
+ // Do long doubles in hex for now.
+ if (CFP->getType() != Type::FloatTy && CFP->getType() != Type::DoubleTy)
+ return false;
+ APFloat APF = APFloat(CFP->getValueAPF()); // copy
+ if (CFP->getType() == Type::FloatTy)
+ APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
+#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
+ char Buffer[100];
+ sprintf(Buffer, "%a", APF.convertToDouble());
+ if (!strncmp(Buffer, "0x", 2) ||
+ !strncmp(Buffer, "-0x", 3) ||
+ !strncmp(Buffer, "+0x", 3))
+ return APF.bitwiseIsEqual(APFloat(atof(Buffer)));
+ return false;
+#else
+ std::string StrVal = ftostr(APF);
+
+ while (StrVal[0] == ' ')
+ StrVal.erase(StrVal.begin());
+
+ // Check to make sure that the stringized number is not some string like "Inf"
+ // or NaN. Check that the string matches the "[-+]?[0-9]" regex.
+ if ((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+ ((StrVal[0] == '-' || StrVal[0] == '+') &&
+ (StrVal[1] >= '0' && StrVal[1] <= '9')))
+ // Reparse stringized version!
+ return APF.bitwiseIsEqual(APFloat(atof(StrVal.c_str())));
+ return false;
+#endif
+}
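+
+// The round-trip test above, reduced to its essence (illustrative sketch
+// using plain stdio; the real code goes through APFloat):
+//
+//   char Buf[64];
+//   sprintf(Buf, "%a", D);                       // hex float output
+//   double Back = atof(Buf);                     // reparse the string
+//   bool Safe = !memcmp(&D, &Back, sizeof(D));   // bit-equal after reparse?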
+
+/// Print out the casting for a cast operation.  This emits the double cast
+/// needed to convert to the destination type, when required.
+/// @brief Print a cast
+void CWriter::printCast(unsigned opc, const Type *SrcTy, const Type *DstTy) {
+ // Print the destination type cast
+ switch (opc) {
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::IntToPtr:
+ case Instruction::Trunc:
+ case Instruction::BitCast:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc: // For these the DstTy sign doesn't matter
+ Out << '(';
+ printType(Out, DstTy);
+ Out << ')';
+ break;
+ case Instruction::ZExt:
+ case Instruction::PtrToInt:
+ case Instruction::FPToUI: // For these, make sure we get an unsigned dest
+ Out << '(';
+ printSimpleType(Out, DstTy, false);
+ Out << ')';
+ break;
+ case Instruction::SExt:
+ case Instruction::FPToSI: // For these, make sure we get a signed dest
+ Out << '(';
+ printSimpleType(Out, DstTy, true);
+ Out << ')';
+ break;
+ default:
+ assert(0 && "Invalid cast opcode");
+ }
+
+ // Print the source type cast
+ switch (opc) {
+ case Instruction::UIToFP:
+ case Instruction::ZExt:
+ Out << '(';
+ printSimpleType(Out, SrcTy, false);
+ Out << ')';
+ break;
+ case Instruction::SIToFP:
+ case Instruction::SExt:
+ Out << '(';
+ printSimpleType(Out, SrcTy, true);
+ Out << ')';
+ break;
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ // Avoid "cast to pointer from integer of different size" warnings
+ Out << "(unsigned long)";
+ break;
+ case Instruction::Trunc:
+ case Instruction::BitCast:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ break; // These don't need a source cast.
+ default:
+ assert(0 && "Invalid cast opcode");
+ break;
+ }
+}
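+
+// A sketch of the output (operand names are hypothetical): 'zext i8 %x to
+// i32' yields the cast pair "(unsigned int)(unsigned char)", and 'sitofp
+// i32 %x to double' yields "(double)(int)".  IntToPtr and PtrToInt also
+// route the source operand through "(unsigned long)" to silence
+// pointer/integer size warnings.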
+
+// printConstant - The LLVM Constant to C Constant converter.
+void CWriter::printConstant(Constant *CPV, bool Static) {
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
+ switch (CE->getOpcode()) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ Out << "(";
+ printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType());
+ if (CE->getOpcode() == Instruction::SExt &&
+ CE->getOperand(0)->getType() == Type::Int1Ty) {
+ // Make sure we really sext from bool here by subtracting from 0
+ Out << "0-";
+ }
+ printConstant(CE->getOperand(0), Static);
+ if (CE->getType() == Type::Int1Ty &&
+ (CE->getOpcode() == Instruction::Trunc ||
+ CE->getOpcode() == Instruction::FPToUI ||
+ CE->getOpcode() == Instruction::FPToSI ||
+ CE->getOpcode() == Instruction::PtrToInt)) {
+ // Make sure we really truncate to bool here by anding with 1
+ Out << "&1u";
+ }
+ Out << ')';
+ return;
+
+ case Instruction::GetElementPtr:
+ Out << "(";
+ printGEPExpression(CE->getOperand(0), gep_type_begin(CPV),
+ gep_type_end(CPV), Static);
+ Out << ")";
+ return;
+ case Instruction::Select:
+ Out << '(';
+ printConstant(CE->getOperand(0), Static);
+ Out << '?';
+ printConstant(CE->getOperand(1), Static);
+ Out << ':';
+ printConstant(CE->getOperand(2), Static);
+ Out << ')';
+ return;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ {
+ Out << '(';
+ bool NeedsClosingParens = printConstExprCast(CE, Static);
+ printConstantWithCast(CE->getOperand(0), CE->getOpcode());
+ switch (CE->getOpcode()) {
+ case Instruction::Add: Out << " + "; break;
+ case Instruction::Sub: Out << " - "; break;
+ case Instruction::Mul: Out << " * "; break;
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem: Out << " % "; break;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv: Out << " / "; break;
+ case Instruction::And: Out << " & "; break;
+ case Instruction::Or: Out << " | "; break;
+ case Instruction::Xor: Out << " ^ "; break;
+ case Instruction::Shl: Out << " << "; break;
+ case Instruction::LShr:
+ case Instruction::AShr: Out << " >> "; break;
+ case Instruction::ICmp:
+ switch (CE->getPredicate()) {
+ case ICmpInst::ICMP_EQ: Out << " == "; break;
+ case ICmpInst::ICMP_NE: Out << " != "; break;
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_ULT: Out << " < "; break;
+ case ICmpInst::ICMP_SLE:
+ case ICmpInst::ICMP_ULE: Out << " <= "; break;
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_UGT: Out << " > "; break;
+ case ICmpInst::ICMP_SGE:
+ case ICmpInst::ICMP_UGE: Out << " >= "; break;
+ default: assert(0 && "Illegal ICmp predicate");
+ }
+ break;
+ default: assert(0 && "Illegal opcode here!");
+ }
+ printConstantWithCast(CE->getOperand(1), CE->getOpcode());
+ if (NeedsClosingParens)
+ Out << "))";
+ Out << ')';
+ return;
+ }
+ case Instruction::FCmp: {
+ Out << '(';
+ bool NeedsClosingParens = printConstExprCast(CE, Static);
+ if (CE->getPredicate() == FCmpInst::FCMP_FALSE)
+ Out << "0";
+ else if (CE->getPredicate() == FCmpInst::FCMP_TRUE)
+ Out << "1";
+ else {
+ const char* op = 0;
+ switch (CE->getPredicate()) {
+ default: assert(0 && "Illegal FCmp predicate");
+ case FCmpInst::FCMP_ORD: op = "ord"; break;
+ case FCmpInst::FCMP_UNO: op = "uno"; break;
+ case FCmpInst::FCMP_UEQ: op = "ueq"; break;
+ case FCmpInst::FCMP_UNE: op = "une"; break;
+ case FCmpInst::FCMP_ULT: op = "ult"; break;
+ case FCmpInst::FCMP_ULE: op = "ule"; break;
+ case FCmpInst::FCMP_UGT: op = "ugt"; break;
+ case FCmpInst::FCMP_UGE: op = "uge"; break;
+ case FCmpInst::FCMP_OEQ: op = "oeq"; break;
+ case FCmpInst::FCMP_ONE: op = "one"; break;
+ case FCmpInst::FCMP_OLT: op = "olt"; break;
+ case FCmpInst::FCMP_OLE: op = "ole"; break;
+ case FCmpInst::FCMP_OGT: op = "ogt"; break;
+ case FCmpInst::FCMP_OGE: op = "oge"; break;
+ }
+ Out << "llvm_fcmp_" << op << "(";
+ printConstantWithCast(CE->getOperand(0), CE->getOpcode());
+ Out << ", ";
+ printConstantWithCast(CE->getOperand(1), CE->getOpcode());
+ Out << ")";
+ }
+ if (NeedsClosingParens)
+ Out << "))";
+ Out << ')';
+ return;
+ }
+ default:
+ cerr << "CWriter Error: Unhandled constant expression: "
+ << *CE << "\n";
+ abort();
+ }
+ } else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType()) {
+ Out << "((";
+ printType(Out, CPV->getType()); // sign doesn't matter
+ Out << ")/*UNDEF*/";
+ if (!isa<VectorType>(CPV->getType())) {
+ Out << "0)";
+ } else {
+ Out << "{})";
+ }
+ return;
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+ const Type* Ty = CI->getType();
+ if (Ty == Type::Int1Ty)
+ Out << (CI->getZExtValue() ? '1' : '0');
+ else if (Ty == Type::Int32Ty)
+ Out << CI->getZExtValue() << 'u';
+ else if (Ty->getPrimitiveSizeInBits() > 32)
+ Out << CI->getZExtValue() << "ull";
+ else {
+ Out << "((";
+ printSimpleType(Out, Ty, false) << ')';
+ if (CI->isMinValue(true))
+ Out << CI->getZExtValue() << 'u';
+ else
+ Out << CI->getSExtValue();
+ Out << ')';
+ }
+ return;
+ }
+
+ switch (CPV->getType()->getTypeID()) {
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::X86_FP80TyID:
+ case Type::PPC_FP128TyID:
+ case Type::FP128TyID: {
+ ConstantFP *FPC = cast<ConstantFP>(CPV);
+ std::map<const ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC);
+ if (I != FPConstantMap.end()) {
+ // Because of FP precision problems we must load from a stack allocated
+ // value that holds the value in hex.
+ Out << "(*(" << (FPC->getType() == Type::FloatTy ? "float" :
+ FPC->getType() == Type::DoubleTy ? "double" :
+ "long double")
+ << "*)&FPConstant" << I->second << ')';
+ } else {
+ double V;
+ if (FPC->getType() == Type::FloatTy)
+ V = FPC->getValueAPF().convertToFloat();
+ else if (FPC->getType() == Type::DoubleTy)
+ V = FPC->getValueAPF().convertToDouble();
+ else {
+ // Long double. Convert the number to double, discarding precision.
+ // This is not awesome, but it at least makes the CBE output somewhat
+ // useful.
+ APFloat Tmp = FPC->getValueAPF();
+ bool LosesInfo;
+ Tmp.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &LosesInfo);
+ V = Tmp.convertToDouble();
+ }
+
+ if (IsNAN(V)) {
+ // The value is NaN
+
+ // FIXME the actual NaN bits should be emitted.
+ // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN,
+ // it's 0x7ff4.
+ const unsigned long QuietNaN = 0x7ff8UL;
+ //const unsigned long SignalNaN = 0x7ff4UL;
+
+ // We need to grab the first part of the FP #
+ char Buffer[100];
+
+ uint64_t ll = DoubleToBits(V);
+ sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
+
+ std::string Num(&Buffer[0], &Buffer[6]);
+ unsigned long Val = strtoul(Num.c_str(), 0, 16);
+
+ if (FPC->getType() == Type::FloatTy)
+ Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\""
+ << Buffer << "\") /*nan*/ ";
+ else
+ Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\""
+ << Buffer << "\") /*nan*/ ";
+ } else if (IsInf(V)) {
+ // The value is Inf
+ if (V < 0) Out << '-';
+ Out << "LLVM_INF" << (FPC->getType() == Type::FloatTy ? "F" : "")
+ << " /*inf*/ ";
+ } else {
+ std::string Num;
+#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
+ // Print out the constant as a floating point number.
+ char Buffer[100];
+ sprintf(Buffer, "%a", V);
+ Num = Buffer;
+#else
+ Num = ftostr(FPC->getValueAPF());
+#endif
+ Out << Num;
+ }
+ }
+ break;
+ }
+
+ case Type::ArrayTyID:
+    // Use C99 compound literal syntax for the initializer.
+ if (!Static) {
+ Out << "(";
+ printType(Out, CPV->getType());
+ Out << ")";
+ }
+ Out << "{ "; // Arrays are wrapped in struct types.
+ if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) {
+ printConstantArray(CA, Static);
+ } else {
+ assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
+ const ArrayType *AT = cast<ArrayType>(CPV->getType());
+ Out << '{';
+ if (AT->getNumElements()) {
+ Out << ' ';
+ Constant *CZ = Constant::getNullValue(AT->getElementType());
+ printConstant(CZ, Static);
+ for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) {
+ Out << ", ";
+ printConstant(CZ, Static);
+ }
+ }
+ Out << " }";
+ }
+ Out << " }"; // Arrays are wrapped in struct types.
+ break;
+
+ case Type::VectorTyID:
+    // Use C99 compound literal syntax for the initializer.
+ if (!Static) {
+ Out << "(";
+ printType(Out, CPV->getType());
+ Out << ")";
+ }
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
+ printConstantVector(CV, Static);
+ } else {
+ assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
+ const VectorType *VT = cast<VectorType>(CPV->getType());
+ Out << "{ ";
+ Constant *CZ = Constant::getNullValue(VT->getElementType());
+ printConstant(CZ, Static);
+ for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
+ Out << ", ";
+ printConstant(CZ, Static);
+ }
+ Out << " }";
+ }
+ break;
+
+ case Type::StructTyID:
+    // Use C99 compound literal syntax for the initializer.
+ if (!Static) {
+ Out << "(";
+ printType(Out, CPV->getType());
+ Out << ")";
+ }
+ if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
+ const StructType *ST = cast<StructType>(CPV->getType());
+ Out << '{';
+ if (ST->getNumElements()) {
+ Out << ' ';
+ printConstant(Constant::getNullValue(ST->getElementType(0)), Static);
+ for (unsigned i = 1, e = ST->getNumElements(); i != e; ++i) {
+ Out << ", ";
+ printConstant(Constant::getNullValue(ST->getElementType(i)), Static);
+ }
+ }
+ Out << " }";
+ } else {
+ Out << '{';
+ if (CPV->getNumOperands()) {
+ Out << ' ';
+ printConstant(cast<Constant>(CPV->getOperand(0)), Static);
+ for (unsigned i = 1, e = CPV->getNumOperands(); i != e; ++i) {
+ Out << ", ";
+ printConstant(cast<Constant>(CPV->getOperand(i)), Static);
+ }
+ }
+ Out << " }";
+ }
+ break;
+
+ case Type::PointerTyID:
+ if (isa<ConstantPointerNull>(CPV)) {
+ Out << "((";
+ printType(Out, CPV->getType()); // sign doesn't matter
+ Out << ")/*NULL*/0)";
+ break;
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) {
+ writeOperand(GV, Static);
+ break;
+ }
+ // FALL THROUGH
+ default:
+ cerr << "Unknown constant type: " << *CPV << "\n";
+ abort();
+ }
+}
+
+// Some constant expressions need to be cast back to their original types
+// because their operands were cast to the expected type. This function takes
+// care of detecting that case and printing the cast for the ConstantExpr.
+bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) {
+ bool NeedsExplicitCast = false;
+ const Type *Ty = CE->getOperand(0)->getType();
+ bool TypeIsSigned = false;
+ switch (CE->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ // We need to cast integer arithmetic so that it is always performed
+ // as unsigned, to avoid undefined behavior on overflow.
+ if (!Ty->isIntOrIntVector()) break;
+ // FALL THROUGH
+ case Instruction::LShr:
+ case Instruction::URem:
+ case Instruction::UDiv: NeedsExplicitCast = true; break;
+ case Instruction::AShr:
+ case Instruction::SRem:
+ case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break;
+ case Instruction::SExt:
+ Ty = CE->getType();
+ NeedsExplicitCast = true;
+ TypeIsSigned = true;
+ break;
+ case Instruction::ZExt:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ Ty = CE->getType();
+ NeedsExplicitCast = true;
+ break;
+ default: break;
+ }
+ if (NeedsExplicitCast) {
+ Out << "((";
+ if (Ty->isInteger() && Ty != Type::Int1Ty)
+ printSimpleType(Out, Ty, TypeIsSigned);
+ else
+ printType(Out, Ty); // not integer, sign doesn't matter
+ Out << ")(";
+ }
+ return NeedsExplicitCast;
+}
+
+// Print a constant assuming that it is the operand for a given Opcode. The
+// opcodes that care about sign need to cast their operands to the expected
+// type before the operation proceeds. This function does the casting.
+void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
+
+ // Extract the operand's type, we'll need it.
+ const Type* OpTy = CPV->getType();
+
+ // Indicate whether to do the cast or not.
+ bool shouldCast = false;
+ bool typeIsSigned = false;
+
+ // Based on the Opcode for which this Constant is being written, determine
+  // the new type to which the operand should be cast by setting the value
+  // of OpTy. If we change OpTy, also set shouldCast to true so the cast
+  // gets printed below.
+ switch (Opcode) {
+ default:
+ // for most instructions, it doesn't matter
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ // We need to cast integer arithmetic so that it is always performed
+ // as unsigned, to avoid undefined behavior on overflow.
+ if (!OpTy->isIntOrIntVector()) break;
+ // FALL THROUGH
+ case Instruction::LShr:
+ case Instruction::UDiv:
+ case Instruction::URem:
+ shouldCast = true;
+ break;
+ case Instruction::AShr:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ shouldCast = true;
+ typeIsSigned = true;
+ break;
+ }
+
+  // Write out the constant, wrapped in a cast when one is needed.
+ if (shouldCast) {
+ Out << "((";
+ printSimpleType(Out, OpTy, typeIsSigned);
+ Out << ")";
+ printConstant(CPV, false);
+ Out << ")";
+ } else
+ printConstant(CPV, false);
+}
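+
+// Combined with printConstExprCast above, a hypothetical constant
+// 'sdiv i32 A, B' therefore comes out roughly as
+//   (((int)(((int)A) / ((int)B))))
+// with both operands and the result pinned to the signed type.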
+
+std::string CWriter::GetValueName(const Value *Operand) {
+ std::string Name;
+
+ if (!isa<GlobalValue>(Operand) && Operand->getName() != "") {
+ std::string VarName;
+
+ Name = Operand->getName();
+ VarName.reserve(Name.capacity());
+
+ for (std::string::iterator I = Name.begin(), E = Name.end();
+ I != E; ++I) {
+ char ch = *I;
+
+ if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+ (ch >= '0' && ch <= '9') || ch == '_')) {
+ char buffer[5];
+ sprintf(buffer, "_%x_", ch);
+ VarName += buffer;
+ } else
+ VarName += ch;
+ }
+
+ Name = "llvm_cbe_" + VarName;
+ } else {
+ Name = Mang->getValueName(Operand);
+ }
+
+ return Name;
+}
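+
+// For example, a hypothetical local named "foo.bar" becomes
+// "llvm_cbe_foo_2e_bar" ('.' is 0x2e); globals and unnamed values take
+// their names from the Mangler instead.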
+
+/// writeInstComputationInline - Emit the computation for the specified
+/// instruction inline, with no destination provided.
+void CWriter::writeInstComputationInline(Instruction &I) {
+ // If this is a non-trivial bool computation, make sure to truncate down to
+ // a 1 bit value. This is important because we want "add i1 x, y" to return
+ // "0" when x and y are true, not "2" for example.
+ bool NeedBoolTrunc = false;
+ if (I.getType() == Type::Int1Ty && !isa<ICmpInst>(I) && !isa<FCmpInst>(I))
+ NeedBoolTrunc = true;
+
+ if (NeedBoolTrunc)
+ Out << "((";
+
+ visit(I);
+
+ if (NeedBoolTrunc)
+ Out << ")&1)";
+}
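+
+// For example, a non-comparison i1 computation such as a hypothetical
+// 'add i1 %x, %y' is wrapped as ((...)&1) so that true + true reads back
+// as 1 rather than 2 in the wider C type.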
+
+
+void CWriter::writeOperandInternal(Value *Operand, bool Static) {
+ if (Instruction *I = dyn_cast<Instruction>(Operand))
+ // Should we inline this instruction to build a tree?
+ if (isInlinableInst(*I) && !isDirectAlloca(I)) {
+ Out << '(';
+ writeInstComputationInline(*I);
+ Out << ')';
+ return;
+ }
+
+ Constant* CPV = dyn_cast<Constant>(Operand);
+
+ if (CPV && !isa<GlobalValue>(CPV))
+ printConstant(CPV, Static);
+ else
+ Out << GetValueName(Operand);
+}
+
+void CWriter::writeOperand(Value *Operand, bool Static) {
+ bool isAddressImplicit = isAddressExposed(Operand);
+ if (isAddressImplicit)
+ Out << "(&"; // Global variables are referenced as their addresses by llvm
+
+ writeOperandInternal(Operand, Static);
+
+ if (isAddressImplicit)
+ Out << ')';
+}
+
+// Some instructions need to have their result value cast back to the
+// original type because their operands were cast to the expected type.
+// This function takes care of detecting that case and printing the cast
+// for the Instruction.
+bool CWriter::writeInstructionCast(const Instruction &I) {
+ const Type *Ty = I.getOperand(0)->getType();
+ switch (I.getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ // We need to cast integer arithmetic so that it is always performed
+ // as unsigned, to avoid undefined behavior on overflow.
+ if (!Ty->isIntOrIntVector()) break;
+ // FALL THROUGH
+ case Instruction::LShr:
+ case Instruction::URem:
+ case Instruction::UDiv:
+ Out << "((";
+ printSimpleType(Out, Ty, false);
+ Out << ")(";
+ return true;
+ case Instruction::AShr:
+ case Instruction::SRem:
+ case Instruction::SDiv:
+ Out << "((";
+ printSimpleType(Out, Ty, true);
+ Out << ")(";
+ return true;
+ default: break;
+ }
+ return false;
+}
+
+// Write the operand with a cast to another type based on the Opcode being used.
+// This will be used in cases where an instruction has specific type
+// requirements (usually signedness) for its operands.
+void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
+
+ // Extract the operand's type, we'll need it.
+ const Type* OpTy = Operand->getType();
+
+ // Indicate whether to do the cast or not.
+ bool shouldCast = false;
+
+ // Indicate whether the cast should be to a signed type or not.
+ bool castIsSigned = false;
+
+ // Based on the Opcode for which this Operand is being written, determine
+  // the new type to which the operand should be cast by setting the value
+ // of OpTy. If we change OpTy, also set shouldCast to true.
+ switch (Opcode) {
+ default:
+ // for most instructions, it doesn't matter
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ // We need to cast integer arithmetic so that it is always performed
+ // as unsigned, to avoid undefined behavior on overflow.
+ if (!OpTy->isIntOrIntVector()) break;
+ // FALL THROUGH
+ case Instruction::LShr:
+ case Instruction::UDiv:
+ case Instruction::URem: // Cast to unsigned first
+ shouldCast = true;
+ castIsSigned = false;
+ break;
+ case Instruction::GetElementPtr:
+ case Instruction::AShr:
+ case Instruction::SDiv:
+ case Instruction::SRem: // Cast to signed first
+ shouldCast = true;
+ castIsSigned = true;
+ break;
+ }
+
+  // Write out the operand, wrapped in a cast when one is needed.
+ if (shouldCast) {
+ Out << "((";
+ printSimpleType(Out, OpTy, castIsSigned);
+ Out << ")";
+ writeOperand(Operand);
+ Out << ")";
+ } else
+ writeOperand(Operand);
+}
+
+// Write the operand with a cast to another type based on the icmp predicate
+// being used.
+void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
+  // This has to do a cast to ensure the operand has the right signedness.
+  // Also, if the operand is a pointer, we cast it to an integer for the
+  // comparison, both for signedness and so that the C compiler doesn't
+  // optimize something like "p < NULL" to false (p may, for example,
+  // actually hold an integer value).
+ bool shouldCast = Cmp.isRelational();
+
+  // Write out the operand directly when no cast is needed.
+ if (!shouldCast) {
+ writeOperand(Operand);
+ return;
+ }
+
+ // Should this be a signed comparison? If so, convert to signed.
+ bool castIsSigned = Cmp.isSignedPredicate();
+
+ // If the operand was a pointer, convert to a large integer type.
+ const Type* OpTy = Operand->getType();
+ if (isa<PointerType>(OpTy))
+ OpTy = TD->getIntPtrType();
+
+ Out << "((";
+ printSimpleType(Out, OpTy, castIsSigned);
+ Out << ")";
+ writeOperand(Operand);
+ Out << ")";
+}
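+
+// For example, each operand of a hypothetical 'icmp ult i8* %p, %q' would
+// be printed as ((unsigned long long)llvm_cbe_p) on a target whose intptr
+// type is 64 bits wide.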
+
+// generateCompilerSpecificCode - This is where we add conditional compilation
+// directives to cater to specific compilers as need be.
+//
+static void generateCompilerSpecificCode(raw_ostream& Out,
+ const TargetData *TD) {
+ // Alloca is hard to get, and we don't want to include stdlib.h here.
+ Out << "/* get a declaration for alloca */\n"
+ << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n"
+ << "#define alloca(x) __builtin_alloca((x))\n"
+ << "#define _alloca(x) __builtin_alloca((x))\n"
+ << "#elif defined(__APPLE__)\n"
+ << "extern void *__builtin_alloca(unsigned long);\n"
+ << "#define alloca(x) __builtin_alloca(x)\n"
+ << "#define longjmp _longjmp\n"
+ << "#define setjmp _setjmp\n"
+ << "#elif defined(__sun__)\n"
+ << "#if defined(__sparcv9)\n"
+ << "extern void *__builtin_alloca(unsigned long);\n"
+ << "#else\n"
+ << "extern void *__builtin_alloca(unsigned int);\n"
+ << "#endif\n"
+ << "#define alloca(x) __builtin_alloca(x)\n"
+ << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)\n"
+ << "#define alloca(x) __builtin_alloca(x)\n"
+ << "#elif defined(_MSC_VER)\n"
+ << "#define inline _inline\n"
+ << "#define alloca(x) _alloca(x)\n"
+ << "#else\n"
+ << "#include <alloca.h>\n"
+ << "#endif\n\n";
+
+ // We output GCC specific attributes to preserve 'linkonce'ness on globals.
+ // If we aren't being compiled with GCC, just drop these attributes.
+ Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n"
+ << "#define __attribute__(X)\n"
+ << "#endif\n\n";
+
+ // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))".
+ Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
+ << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n"
+ << "#elif defined(__GNUC__)\n"
+ << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n"
+ << "#else\n"
+ << "#define __EXTERNAL_WEAK__\n"
+ << "#endif\n\n";
+
+ // For now, turn off the weak linkage attribute on Mac OS X. (See above.)
+ Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
+ << "#define __ATTRIBUTE_WEAK__\n"
+ << "#elif defined(__GNUC__)\n"
+ << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n"
+ << "#else\n"
+ << "#define __ATTRIBUTE_WEAK__\n"
+ << "#endif\n\n";
+
+ // Add hidden visibility support. FIXME: APPLE_CC?
+ Out << "#if defined(__GNUC__)\n"
+ << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n"
+ << "#endif\n\n";
+
+ // Define NaN and Inf as GCC builtins if using GCC, as 0 otherwise
+ // From the GCC documentation:
+ //
+ // double __builtin_nan (const char *str)
+ //
+ // This is an implementation of the ISO C99 function nan.
+ //
+ // Since ISO C99 defines this function in terms of strtod, which we do
+ // not implement, a description of the parsing is in order. The string is
+ // parsed as by strtol; that is, the base is recognized by leading 0 or
+ // 0x prefixes. The number parsed is placed in the significand such that
+ // the least significant bit of the number is at the least significant
+ // bit of the significand. The number is truncated to fit the significand
+ // field provided. The significand is forced to be a quiet NaN.
+ //
+ // This function, if given a string literal, is evaluated early enough
+ // that it is considered a compile-time constant.
+ //
+ // float __builtin_nanf (const char *str)
+ //
+ // Similar to __builtin_nan, except the return type is float.
+ //
+ // double __builtin_inf (void)
+ //
+ // Similar to __builtin_huge_val, except a warning is generated if the
+ // target floating-point format does not support infinities. This
+ // function is suitable for implementing the ISO C99 macro INFINITY.
+ //
+ // float __builtin_inff (void)
+ //
+ // Similar to __builtin_inf, except the return type is float.
+ Out << "#ifdef __GNUC__\n"
+ << "#define LLVM_NAN(NanStr) __builtin_nan(NanStr) /* Double */\n"
+ << "#define LLVM_NANF(NanStr) __builtin_nanf(NanStr) /* Float */\n"
+ << "#define LLVM_NANS(NanStr) __builtin_nans(NanStr) /* Double */\n"
+ << "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
+ << "#define LLVM_INF __builtin_inf() /* Double */\n"
+ << "#define LLVM_INFF __builtin_inff() /* Float */\n"
+ << "#define LLVM_PREFETCH(addr,rw,locality) "
+ "__builtin_prefetch(addr,rw,locality)\n"
+ << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
+ << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
+ << "#define LLVM_ASM __asm__\n"
+ << "#else\n"
+ << "#define LLVM_NAN(NanStr) ((double)0.0) /* Double */\n"
+ << "#define LLVM_NANF(NanStr) 0.0F /* Float */\n"
+ << "#define LLVM_NANS(NanStr) ((double)0.0) /* Double */\n"
+ << "#define LLVM_NANSF(NanStr) 0.0F /* Float */\n"
+ << "#define LLVM_INF ((double)0.0) /* Double */\n"
+ << "#define LLVM_INFF 0.0F /* Float */\n"
+ << "#define LLVM_PREFETCH(addr,rw,locality) /* PREFETCH */\n"
+ << "#define __ATTRIBUTE_CTOR__\n"
+ << "#define __ATTRIBUTE_DTOR__\n"
+ << "#define LLVM_ASM(X)\n"
+ << "#endif\n\n";
+
+  Out << "#if __GNUC__ < 4 /* Old GCCs, or compilers that are not GCC */\n"
+ << "#define __builtin_stack_save() 0 /* not implemented */\n"
+ << "#define __builtin_stack_restore(X) /* noop */\n"
+ << "#endif\n\n";
+
+ // Output typedefs for 128-bit integers. If these are needed with a
+ // 32-bit target or with a C compiler that doesn't support mode(TI),
+ // more drastic measures will be needed.
+ Out << "#if __GNUC__ && __LP64__ /* 128-bit integer types */\n"
+ << "typedef int __attribute__((mode(TI))) llvmInt128;\n"
+ << "typedef unsigned __attribute__((mode(TI))) llvmUInt128;\n"
+ << "#endif\n\n";
+
+ // Output target-specific code that should be inserted into main.
+ Out << "#define CODE_FOR_MAIN() /* Any target-specific code for main()*/\n";
+}
+
+/// FindStaticTors - Given a static ctor/dtor list, unpack its contents into
+/// the StaticTors set.
+static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){
+ ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!InitList) return;
+
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){
+ if (CS->getNumOperands() != 2) return; // Not array of 2-element structs.
+
+ if (CS->getOperand(1)->isNullValue())
+ return; // Found a null terminator, exit printing.
+ Constant *FP = CS->getOperand(1);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP))
+ if (CE->isCast())
+ FP = CE->getOperand(0);
+ if (Function *F = dyn_cast<Function>(FP))
+ StaticTors.insert(F);
+ }
+}
+
+enum SpecialGlobalClass {
+ NotSpecial = 0,
+ GlobalCtors, GlobalDtors,
+ NotPrinted
+};
+
+/// getGlobalVariableClass - If this is a global that is specially recognized
+/// by LLVM, return a code that indicates how we should handle it.
+static SpecialGlobalClass getGlobalVariableClass(const GlobalVariable *GV) {
+ // If this is a global ctors/dtors list, handle it now.
+ if (GV->hasAppendingLinkage() && GV->use_empty()) {
+ if (GV->getName() == "llvm.global_ctors")
+ return GlobalCtors;
+ else if (GV->getName() == "llvm.global_dtors")
+ return GlobalDtors;
+ }
+
+  // Otherwise, if it is other metadata, don't print it.  This catches things
+ // like debug information.
+ if (GV->getSection() == "llvm.metadata")
+ return NotPrinted;
+
+ return NotSpecial;
+}
+
+
+bool CWriter::doInitialization(Module &M) {
+ // Initialize
+ TheModule = &M;
+
+ TD = new TargetData(&M);
+ IL = new IntrinsicLowering(*TD);
+ IL->AddPrototypes(M);
+
+ // Ensure that all structure types have names...
+ Mang = new Mangler(M);
+ Mang->markCharUnacceptable('.');
+
+ // Keep track of which functions are static ctors/dtors so they can have
+ // an attribute added to their prototypes.
+ std::set<Function*> StaticCtors, StaticDtors;
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ switch (getGlobalVariableClass(I)) {
+ default: break;
+ case GlobalCtors:
+ FindStaticTors(I, StaticCtors);
+ break;
+ case GlobalDtors:
+ FindStaticTors(I, StaticDtors);
+ break;
+ }
+ }
+
+ // get declaration for alloca
+ Out << "/* Provide Declarations */\n";
+ Out << "#include <stdarg.h>\n"; // Varargs support
+ Out << "#include <setjmp.h>\n"; // Unwind support
+ generateCompilerSpecificCode(Out, TD);
+
+ // Provide a definition for `bool' if not compiling with a C++ compiler.
+ Out << "\n"
+ << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n"
+
+ << "\n\n/* Support for floating point constants */\n"
+ << "typedef unsigned long long ConstantDoubleTy;\n"
+ << "typedef unsigned int ConstantFloatTy;\n"
+ << "typedef struct { unsigned long long f1; unsigned short f2; "
+ "unsigned short pad[3]; } ConstantFP80Ty;\n"
+ // This is used for both kinds of 128-bit long double; meaning differs.
+ << "typedef struct { unsigned long long f1; unsigned long long f2; }"
+ " ConstantFP128Ty;\n"
+ << "\n\n/* Global Declarations */\n";
+
+  // First output all the declarations for the program, because C requires
+  // functions and globals to be declared before they are used.
+ //
+
+  // Loop over the type symbol table, emitting all named types.
+ printModuleTypes(M.getTypeSymbolTable());
+
+ // Global variable declarations...
+ if (!M.global_empty()) {
+ Out << "\n/* External Global Variable Declarations */\n";
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+
+ if (I->hasExternalLinkage() || I->hasExternalWeakLinkage() ||
+ I->hasCommonLinkage())
+ Out << "extern ";
+ else if (I->hasDLLImportLinkage())
+ Out << "__declspec(dllimport) ";
+ else
+ continue; // Internal Global
+
+ // Thread Local Storage
+ if (I->isThreadLocal())
+ Out << "__thread ";
+
+ printType(Out, I->getType()->getElementType(), false, GetValueName(I));
+
+ if (I->hasExternalWeakLinkage())
+ Out << " __EXTERNAL_WEAK__";
+ Out << ";\n";
+ }
+ }
+
+ // Function declarations
+ Out << "\n/* Function Declarations */\n";
+ Out << "double fmod(double, double);\n"; // Support for FP rem
+ Out << "float fmodf(float, float);\n";
+ Out << "long double fmodl(long double, long double);\n";
+
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ // Don't print declarations for intrinsic functions.
+ if (!I->isIntrinsic() && I->getName() != "setjmp" &&
+ I->getName() != "longjmp" && I->getName() != "_setjmp") {
+ if (I->hasExternalWeakLinkage())
+ Out << "extern ";
+ printFunctionSignature(I, true);
+ if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
+ Out << " __ATTRIBUTE_WEAK__";
+ if (I->hasExternalWeakLinkage())
+ Out << " __EXTERNAL_WEAK__";
+ if (StaticCtors.count(I))
+ Out << " __ATTRIBUTE_CTOR__";
+ if (StaticDtors.count(I))
+ Out << " __ATTRIBUTE_DTOR__";
+ if (I->hasHiddenVisibility())
+ Out << " __HIDDEN__";
+
+ if (I->hasName() && I->getName()[0] == 1)
+ Out << " LLVM_ASM(\"" << I->getName().c_str()+1 << "\")";
+
+ Out << ";\n";
+ }
+ }
+
+ // Output the global variable declarations
+ if (!M.global_empty()) {
+ Out << "\n\n/* Global Variable Declarations */\n";
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ if (!I->isDeclaration()) {
+ // Ignore special globals, such as debug info.
+ if (getGlobalVariableClass(I))
+ continue;
+
+ if (I->hasLocalLinkage())
+ Out << "static ";
+ else
+ Out << "extern ";
+
+ // Thread Local Storage
+ if (I->isThreadLocal())
+ Out << "__thread ";
+
+ printType(Out, I->getType()->getElementType(), false,
+ GetValueName(I));
+
+ if (I->hasLinkOnceLinkage())
+ Out << " __attribute__((common))";
+ else if (I->hasCommonLinkage()) // FIXME is this right?
+ Out << " __ATTRIBUTE_WEAK__";
+ else if (I->hasWeakLinkage())
+ Out << " __ATTRIBUTE_WEAK__";
+ else if (I->hasExternalWeakLinkage())
+ Out << " __EXTERNAL_WEAK__";
+ if (I->hasHiddenVisibility())
+ Out << " __HIDDEN__";
+ Out << ";\n";
+ }
+ }
+
+ // Output the global variable definitions and contents...
+ if (!M.global_empty()) {
+ Out << "\n\n/* Global Variable Definitions and Initialization */\n";
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ if (!I->isDeclaration()) {
+ // Ignore special globals, such as debug info.
+ if (getGlobalVariableClass(I))
+ continue;
+
+ if (I->hasLocalLinkage())
+ Out << "static ";
+ else if (I->hasDLLImportLinkage())
+ Out << "__declspec(dllimport) ";
+ else if (I->hasDLLExportLinkage())
+ Out << "__declspec(dllexport) ";
+
+ // Thread Local Storage
+ if (I->isThreadLocal())
+ Out << "__thread ";
+
+ printType(Out, I->getType()->getElementType(), false,
+ GetValueName(I));
+ if (I->hasLinkOnceLinkage())
+ Out << " __attribute__((common))";
+ else if (I->hasWeakLinkage())
+ Out << " __ATTRIBUTE_WEAK__";
+ else if (I->hasCommonLinkage())
+ Out << " __ATTRIBUTE_WEAK__";
+
+ if (I->hasHiddenVisibility())
+ Out << " __HIDDEN__";
+
+ // If the initializer is not null, emit the initializer. If it is null,
+ // we try to avoid emitting large amounts of zeros. The problem with
+ // this, however, occurs when the variable has weak linkage. In this
+ // case, the assembler will complain about the variable being both weak
+ // and common, so we disable this optimization.
+ // FIXME common linkage should avoid this problem.
+ if (!I->getInitializer()->isNullValue()) {
+ Out << " = " ;
+ writeOperand(I->getInitializer(), true);
+ } else if (I->hasWeakLinkage()) {
+ // We have to specify an initializer, but it doesn't have to be
+ // complete. If the value is an aggregate, print out { 0 }, and let
+ // the compiler figure out the rest of the zeros.
+ Out << " = " ;
+ if (isa<StructType>(I->getInitializer()->getType()) ||
+ isa<VectorType>(I->getInitializer()->getType())) {
+ Out << "{ 0 }";
+ } else if (isa<ArrayType>(I->getInitializer()->getType())) {
+ // As with structs and vectors, but with an extra set of braces
+ // because arrays are wrapped in structs.
+ Out << "{ { 0 } }";
+ } else {
+ // Just print it out normally.
+ writeOperand(I->getInitializer(), true);
+ }
+ }
+ Out << ";\n";
+ }
+ }
+
+ if (!M.empty())
+ Out << "\n\n/* Function Bodies */\n";
+
+  // Emit some helper functions for dealing with the FCmp instruction's
+  // predicates.
+ Out << "static inline int llvm_fcmp_ord(double X, double Y) { ";
+ Out << "return X == X && Y == Y; }\n";
+ Out << "static inline int llvm_fcmp_uno(double X, double Y) { ";
+ Out << "return X != X || Y != Y; }\n";
+ Out << "static inline int llvm_fcmp_ueq(double X, double Y) { ";
+ Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n";
+ Out << "static inline int llvm_fcmp_une(double X, double Y) { ";
+ Out << "return X != Y; }\n";
+ Out << "static inline int llvm_fcmp_ult(double X, double Y) { ";
+ Out << "return X < Y || llvm_fcmp_uno(X, Y); }\n";
+ Out << "static inline int llvm_fcmp_ugt(double X, double Y) { ";
+ Out << "return X > Y || llvm_fcmp_uno(X, Y); }\n";
+ Out << "static inline int llvm_fcmp_ule(double X, double Y) { ";
+ Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n";
+ Out << "static inline int llvm_fcmp_uge(double X, double Y) { ";
+ Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n";
+ Out << "static inline int llvm_fcmp_oeq(double X, double Y) { ";
+ Out << "return X == Y ; }\n";
+ Out << "static inline int llvm_fcmp_one(double X, double Y) { ";
+ Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n";
+ Out << "static inline int llvm_fcmp_olt(double X, double Y) { ";
+ Out << "return X < Y ; }\n";
+ Out << "static inline int llvm_fcmp_ogt(double X, double Y) { ";
+ Out << "return X > Y ; }\n";
+ Out << "static inline int llvm_fcmp_ole(double X, double Y) { ";
+ Out << "return X <= Y ; }\n";
+ Out << "static inline int llvm_fcmp_oge(double X, double Y) { ";
+ Out << "return X >= Y ; }\n";
+ return false;
+}
+
+
+/// Output all floating point constants that cannot be printed accurately...
+void CWriter::printFloatingPointConstants(Function &F) {
+  // Scan the function for floating point constants.  If any FP constant is used
+ // in the function, we want to redirect it here so that we do not depend on
+ // the precision of the printed form, unless the printed form preserves
+ // precision.
+ //
+ for (constant_iterator I = constant_begin(&F), E = constant_end(&F);
+ I != E; ++I)
+ printFloatingPointConstants(*I);
+
+ Out << '\n';
+}
+
+void CWriter::printFloatingPointConstants(const Constant *C) {
+ // If this is a constant expression, recursively check for constant fp values.
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i)
+ printFloatingPointConstants(CE->getOperand(i));
+ return;
+ }
+
+ // Otherwise, check for a FP constant that we need to print.
+ const ConstantFP *FPC = dyn_cast<ConstantFP>(C);
+ if (FPC == 0 ||
+ // Do not put in FPConstantMap if safe.
+ isFPCSafeToPrint(FPC) ||
+ // Already printed this constant?
+ FPConstantMap.count(FPC))
+ return;
+
+ FPConstantMap[FPC] = FPCounter; // Number the FP constants
+
+ if (FPC->getType() == Type::DoubleTy) {
+ double Val = FPC->getValueAPF().convertToDouble();
+ uint64_t i = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+ Out << "static const ConstantDoubleTy FPConstant" << FPCounter++
+ << " = 0x" << utohexstr(i)
+ << "ULL; /* " << Val << " */\n";
+ } else if (FPC->getType() == Type::FloatTy) {
+ float Val = FPC->getValueAPF().convertToFloat();
+ uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt().
+ getZExtValue();
+ Out << "static const ConstantFloatTy FPConstant" << FPCounter++
+ << " = 0x" << utohexstr(i)
+ << "U; /* " << Val << " */\n";
+ } else if (FPC->getType() == Type::X86_FP80Ty) {
+ // api needed to prevent premature destruction
+ APInt api = FPC->getValueAPF().bitcastToAPInt();
+ const uint64_t *p = api.getRawData();
+ Out << "static const ConstantFP80Ty FPConstant" << FPCounter++
+ << " = { 0x" << utohexstr(p[0])
+ << "ULL, 0x" << utohexstr((uint16_t)p[1]) << ",{0,0,0}"
+ << "}; /* Long double constant */\n";
+ } else if (FPC->getType() == Type::PPC_FP128Ty) {
+ APInt api = FPC->getValueAPF().bitcastToAPInt();
+ const uint64_t *p = api.getRawData();
+ Out << "static const ConstantFP128Ty FPConstant" << FPCounter++
+ << " = { 0x"
+ << utohexstr(p[0]) << ", 0x" << utohexstr(p[1])
+ << "}; /* Long double constant */\n";
+
+ } else {
+ assert(0 && "Unknown float type!");
+ }
+}
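+
+// As a sketch of the output, a double with bit pattern 0x400921FB54442D18
+// (pi) would be emitted roughly as
+//   static const ConstantDoubleTy FPConstant0 = 0x400921FB54442D18ULL;
+// followed by a comment holding its decimal value; printConstant then
+// references it as (*(double*)&FPConstant0).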
+
+
+
+/// printModuleTypes - Run through the type symbol table looking for named
+/// types; whenever one is found, emit its declaration...
+///
+void CWriter::printModuleTypes(const TypeSymbolTable &TST) {
+ Out << "/* Helper union for bitcasts */\n";
+ Out << "typedef union {\n";
+ Out << " unsigned int Int32;\n";
+ Out << " unsigned long long Int64;\n";
+ Out << " float Float;\n";
+ Out << " double Double;\n";
+ Out << "} llvmBitCastUnion;\n";
+
+ // We are only interested in the type plane of the symbol table.
+ TypeSymbolTable::const_iterator I = TST.begin();
+ TypeSymbolTable::const_iterator End = TST.end();
+
+ // If there are no type names, exit early.
+ if (I == End) return;
+
+ // Print out forward declarations for structure types before anything else!
+ Out << "/* Structure forward decls */\n";
+ for (; I != End; ++I) {
+ std::string Name = "struct l_" + Mang->makeNameProper(I->first);
+ Out << Name << ";\n";
+ TypeNames.insert(std::make_pair(I->second, Name));
+ }
+
+ Out << '\n';
+
+ // Now we can print out typedefs. Above, we guaranteed that this can only be
+ // for struct or opaque types.
+ Out << "/* Typedefs */\n";
+ for (I = TST.begin(); I != End; ++I) {
+ std::string Name = "l_" + Mang->makeNameProper(I->first);
+ Out << "typedef ";
+ printType(Out, I->second, false, Name);
+ Out << ";\n";
+ }
+
+ Out << '\n';
+
+ // Keep track of which structures have been printed so far...
+ std::set<const Type *> StructPrinted;
+
+  // Walk all structures, recursing into contained types first so that each
+  // is printed in dependency order.
+ //
+ Out << "/* Structure contents */\n";
+ for (I = TST.begin(); I != End; ++I)
+ if (isa<StructType>(I->second) || isa<ArrayType>(I->second))
+ // Only print out used types!
+ printContainedStructs(I->second, StructPrinted);
+}
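+
+// For a hypothetical named type '%foo = type { i32 }' the passes above
+// produce, in order, roughly:
+//   struct l_foo;
+//   typedef struct l_foo l_foo;
+//   struct l_foo { unsigned int field0; };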
+
+// Print the struct, first recursively printing all of the structs that
+// this one depends on.
+//
+// TODO: Make this work properly with vector types
+//
+void CWriter::printContainedStructs(const Type *Ty,
+ std::set<const Type*> &StructPrinted) {
+ // Don't walk through pointers.
+ if (isa<PointerType>(Ty) || Ty->isPrimitiveType() || Ty->isInteger()) return;
+
+ // Print all contained types first.
+ for (Type::subtype_iterator I = Ty->subtype_begin(),
+ E = Ty->subtype_end(); I != E; ++I)
+ printContainedStructs(*I, StructPrinted);
+
+ if (isa<StructType>(Ty) || isa<ArrayType>(Ty)) {
+ // Check to see if we have already printed this struct.
+ if (StructPrinted.insert(Ty).second) {
+ // Print structure type out.
+ std::string Name = TypeNames[Ty];
+ printType(Out, Ty, false, Name, true);
+ Out << ";\n\n";
+ }
+ }
+}
+
+void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
+ /// isStructReturn - Should this function actually return a struct by-value?
+ bool isStructReturn = F->hasStructRetAttr();
+
+ if (F->hasLocalLinkage()) Out << "static ";
+ if (F->hasDLLImportLinkage()) Out << "__declspec(dllimport) ";
+ if (F->hasDLLExportLinkage()) Out << "__declspec(dllexport) ";
+ switch (F->getCallingConv()) {
+ case CallingConv::X86_StdCall:
+ Out << "__attribute__((stdcall)) ";
+ break;
+ case CallingConv::X86_FastCall:
+ Out << "__attribute__((fastcall)) ";
+ break;
+ }
+
+ // Loop over the arguments, printing them...
+ const FunctionType *FT = cast<FunctionType>(F->getFunctionType());
+ const AttrListPtr &PAL = F->getAttributes();
+
+ std::stringstream FunctionInnards;
+
+ // Print out the name...
+ FunctionInnards << GetValueName(F) << '(';
+
+ bool PrintedArg = false;
+ if (!F->isDeclaration()) {
+ if (!F->arg_empty()) {
+ Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ unsigned Idx = 1;
+
+ // If this is a struct-return function, don't print the hidden
+ // struct-return argument.
+ if (isStructReturn) {
+ assert(I != E && "Invalid struct return function!");
+ ++I;
+ ++Idx;
+ }
+
+ std::string ArgName;
+ for (; I != E; ++I) {
+ if (PrintedArg) FunctionInnards << ", ";
+ if (I->hasName() || !Prototype)
+ ArgName = GetValueName(I);
+ else
+ ArgName = "";
+ const Type *ArgTy = I->getType();
+ if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
+ ArgTy = cast<PointerType>(ArgTy)->getElementType();
+ ByValParams.insert(I);
+ }
+ printType(FunctionInnards, ArgTy,
+ /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt),
+ ArgName);
+ PrintedArg = true;
+ ++Idx;
+ }
+ }
+ } else {
+ // Loop over the arguments, printing them.
+ FunctionType::param_iterator I = FT->param_begin(), E = FT->param_end();
+ unsigned Idx = 1;
+
+ // If this is a struct-return function, don't print the hidden
+ // struct-return argument.
+ if (isStructReturn) {
+ assert(I != E && "Invalid struct return function!");
+ ++I;
+ ++Idx;
+ }
+
+ for (; I != E; ++I) {
+ if (PrintedArg) FunctionInnards << ", ";
+ const Type *ArgTy = *I;
+ if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
+ assert(isa<PointerType>(ArgTy));
+ ArgTy = cast<PointerType>(ArgTy)->getElementType();
+ }
+ printType(FunctionInnards, ArgTy,
+ /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt));
+ PrintedArg = true;
+ ++Idx;
+ }
+ }
+
+  // Finish printing the arguments... if this is a vararg function, print the
+  // ellipsis; if no argument types are known at all, just emit ().
+ //
+ if (FT->isVarArg() && PrintedArg) {
+    FunctionInnards << ", ";
+ FunctionInnards << "..."; // Output varargs portion of signature!
+ } else if (!FT->isVarArg() && !PrintedArg) {
+ FunctionInnards << "void"; // ret() -> ret(void) in C.
+ }
+ FunctionInnards << ')';
+
+  // Get the return type for the function.
+ const Type *RetTy;
+ if (!isStructReturn)
+ RetTy = F->getReturnType();
+ else {
+ // If this is a struct-return function, print the struct-return type.
+ RetTy = cast<PointerType>(FT->getParamType(0))->getElementType();
+ }
+
+ // Print out the return type and the signature built above.
+ printType(Out, RetTy,
+ /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt),
+ FunctionInnards.str());
+}
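+
+// For example, a hypothetical 'void @f(%S* sret %out, i32 %x)' is printed
+// with the hidden sret argument dropped and the struct as the C return
+// type, roughly 'struct l_S f(unsigned int llvm_cbe_x)' (the exact struct
+// name comes from the Mangler).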
+
+static inline bool isFPIntBitCast(const Instruction &I) {
+ if (!isa<BitCastInst>(I))
+ return false;
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DstTy = I.getType();
+ return (SrcTy->isFloatingPoint() && DstTy->isInteger()) ||
+ (DstTy->isFloatingPoint() && SrcTy->isInteger());
+}
+
+void CWriter::printFunction(Function &F) {
+ /// isStructReturn - Should this function actually return a struct by-value?
+ bool isStructReturn = F.hasStructRetAttr();
+
+ printFunctionSignature(&F, false);
+ Out << " {\n";
+
+  // If this is a struct-return function, materialize the result in a local
+  // temporary that the hidden struct-return argument points to.
+ if (isStructReturn) {
+ const Type *StructTy =
+ cast<PointerType>(F.arg_begin()->getType())->getElementType();
+ Out << " ";
+ printType(Out, StructTy, false, "StructReturn");
+ Out << "; /* Struct return temporary */\n";
+
+ Out << " ";
+ printType(Out, F.arg_begin()->getType(), false,
+ GetValueName(F.arg_begin()));
+ Out << " = &StructReturn;\n";
+ }
+
+ bool PrintedVar = false;
+
+ // print local variable information for the function
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+ if (const AllocaInst *AI = isDirectAlloca(&*I)) {
+ Out << " ";
+ printType(Out, AI->getAllocatedType(), false, GetValueName(AI));
+ Out << "; /* Address-exposed local */\n";
+ PrintedVar = true;
+ } else if (I->getType() != Type::VoidTy && !isInlinableInst(*I)) {
+ Out << " ";
+ printType(Out, I->getType(), false, GetValueName(&*I));
+ Out << ";\n";
+
+ if (isa<PHINode>(*I)) { // Print out PHI node temporaries as well...
+ Out << " ";
+ printType(Out, I->getType(), false,
+ GetValueName(&*I)+"__PHI_TEMPORARY");
+ Out << ";\n";
+ }
+ PrintedVar = true;
+ }
+ // We need a temporary for the BitCast to use so it can pluck a value out
+ // of a union to do the BitCast. This is separate from the need for a
+ // variable to hold the result of the BitCast.
+ if (isFPIntBitCast(*I)) {
+ Out << " llvmBitCastUnion " << GetValueName(&*I)
+ << "__BITCAST_TEMPORARY;\n";
+ PrintedVar = true;
+ }
+ }
+
+ if (PrintedVar)
+ Out << '\n';
+
+ if (F.hasExternalLinkage() && F.getName() == "main")
+ Out << " CODE_FOR_MAIN();\n";
+
+ // print the basic blocks
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (Loop *L = LI->getLoopFor(BB)) {
+ if (L->getHeader() == BB && L->getParentLoop() == 0)
+ printLoop(L);
+ } else {
+ printBasicBlock(BB);
+ }
+ }
+
+ Out << "}\n\n";
+}
+
+void CWriter::printLoop(Loop *L) {
+ Out << " do { /* Syntactic loop '" << L->getHeader()->getName()
+ << "' to make GCC happy */\n";
+ for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) {
+ BasicBlock *BB = L->getBlocks()[i];
+ Loop *BBLoop = LI->getLoopFor(BB);
+ if (BBLoop == L)
+ printBasicBlock(BB);
+ else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L)
+ printLoop(BBLoop);
+ }
+ Out << " } while (1); /* end of syntactic loop '"
+ << L->getHeader()->getName() << "' */\n";
+}
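+
+// The emitted shape for a loop whose header block is named bb1 is roughly:
+//   do {          /* Syntactic loop 'bb1' to make GCC happy */
+//     ...member blocks, nested loops emitted recursively...
+//   } while (1);  /* end of syntactic loop 'bb1' */
+// Actual loop control (back edges and exits) is carried by the gotos
+// emitted for the branches inside the body.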
+
+void CWriter::printBasicBlock(BasicBlock *BB) {
+
+  // Don't print the label for the basic block unless some predecessor needs
+  // a goto to reach it.  We check the predecessors rather than the raw use
+  // list because PHI nodes also use basic blocks without requiring a label
+  // to be generated.
+  //
+ bool NeedsLabel = false;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ if (isGotoCodeNecessary(*PI, BB)) {
+ NeedsLabel = true;
+ break;
+ }
+
+ if (NeedsLabel) Out << GetValueName(BB) << ":\n";
+
+ // Output all of the instructions in the basic block...
+ for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E;
+ ++II) {
+ if (!isInlinableInst(*II) && !isDirectAlloca(II)) {
+ if (II->getType() != Type::VoidTy && !isInlineAsm(*II))
+ outputLValue(II);
+ else
+ Out << " ";
+ writeInstComputationInline(*II);
+ Out << ";\n";
+ }
+ }
+
+ // Don't emit prefix or suffix for the terminator.
+ visit(*BB->getTerminator());
+}
+
+
+// Specific Instruction type classes... note that all of the casts are
+// necessary because we use the instruction classes as opaque types...
+//
+void CWriter::visitReturnInst(ReturnInst &I) {
+ // If this is a struct return function, return the temporary struct.
+ bool isStructReturn = I.getParent()->getParent()->hasStructRetAttr();
+
+ if (isStructReturn) {
+ Out << " return StructReturn;\n";
+ return;
+ }
+
+  // Don't output a void return if this is the last basic block in the
+  // function, unless the block holds nothing but the return (a label that
+  // precedes it would otherwise be left with no statement).
+  if (I.getNumOperands() == 0 &&
+      &*--I.getParent()->getParent()->end() == I.getParent() &&
+      I.getParent()->size() != 1) {
+ return;
+ }
+
+ if (I.getNumOperands() > 1) {
+ Out << " {\n";
+ Out << " ";
+ printType(Out, I.getParent()->getParent()->getReturnType());
+ Out << " llvm_cbe_mrv_temp = {\n";
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ Out << " ";
+ writeOperand(I.getOperand(i));
+ if (i != e - 1)
+ Out << ",";
+ Out << "\n";
+ }
+ Out << " };\n";
+ Out << " return llvm_cbe_mrv_temp;\n";
+ Out << " }\n";
+ return;
+ }
+
+ Out << " return";
+ if (I.getNumOperands()) {
+ Out << ' ';
+ writeOperand(I.getOperand(0));
+ }
+ Out << ";\n";
+}
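+
+// For example, a hypothetical two-result 'ret i32 %a, i32 %b' (multiple
+// return values) expands to a braced temporary:
+//   {  <return struct type> llvm_cbe_mrv_temp = { llvm_cbe_a, llvm_cbe_b };
+//      return llvm_cbe_mrv_temp; }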
+
+void CWriter::visitSwitchInst(SwitchInst &SI) {
+
+ Out << " switch (";
+ writeOperand(SI.getOperand(0));
+ Out << ") {\n default:\n";
+ printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
+ printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
+ Out << ";\n";
+ for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2) {
+ Out << " case ";
+ writeOperand(SI.getOperand(i));
+ Out << ":\n";
+ BasicBlock *Succ = cast<BasicBlock>(SI.getOperand(i+1));
+ printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
+ printBranchToBlock(SI.getParent(), Succ, 2);
+ if (Function::iterator(Succ) == next(Function::iterator(SI.getParent())))
+ Out << " break;\n";
+ }
+ Out << " }\n";
+}
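+
+// For example, a hypothetical 'switch i32 %v, label %def [ i32 1, label
+// %one ]' is printed roughly as:
+//   switch (llvm_cbe_v) {
+//   default: goto llvm_cbe_def;
+//   case 1u: goto llvm_cbe_one;
+//   }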
+
+void CWriter::visitUnreachableInst(UnreachableInst &I) {
+ Out << " /*UNREACHABLE*/;\n";
+}
+
+bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) {
+  /// FIXME: The checks below should be re-enabled once they are safe with
+  /// respect to loop reordering; until then, we conservatively emit a goto.
+ return true;
+
+ if (next(Function::iterator(From)) != Function::iterator(To))
+ return true; // Not the direct successor, we need a goto.
+
+ //isa<SwitchInst>(From->getTerminator())
+
+ if (LI->getLoopFor(From) != LI->getLoopFor(To))
+ return true;
+ return false;
+}
+
+void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock,
+ BasicBlock *Successor,
+ unsigned Indent) {
+ for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ // Now we have to do the printing.
+ Value *IV = PN->getIncomingValueForBlock(CurBlock);
+ if (!isa<UndefValue>(IV)) {
+ Out << std::string(Indent, ' ');
+ Out << " " << GetValueName(I) << "__PHI_TEMPORARY = ";
+ writeOperand(IV);
+ Out << "; /* for PHI node */\n";
+ }
+ }
+}
+
+void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ,
+ unsigned Indent) {
+ if (isGotoCodeNecessary(CurBB, Succ)) {
+ Out << std::string(Indent, ' ') << " goto ";
+ writeOperand(Succ);
+ Out << ";\n";
+ }
+}
+
+// Branch instruction printing - Avoid printing out a branch to a basic block
+// that immediately succeeds the current one.
+//
+void CWriter::visitBranchInst(BranchInst &I) {
+
+ if (I.isConditional()) {
+ if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(0))) {
+ Out << " if (";
+ writeOperand(I.getCondition());
+ Out << ") {\n";
+
+ printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 2);
+ printBranchToBlock(I.getParent(), I.getSuccessor(0), 2);
+
+ if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(1))) {
+ Out << " } else {\n";
+ printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
+ printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
+ }
+ } else {
+ // First goto not necessary, assume second one is...
+ Out << " if (!";
+ writeOperand(I.getCondition());
+ Out << ") {\n";
+
+ printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
+ printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
+ }
+
+ Out << " }\n";
+ } else {
+ printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 0);
+ printBranchToBlock(I.getParent(), I.getSuccessor(0), 0);
+ }
+ Out << "\n";
+}
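+
+// For example, a hypothetical conditional 'br i1 %c, label %then, label
+// %else' is currently emitted with explicit gotos:
+//   if (llvm_cbe_c) { goto llvm_cbe_then; } else { goto llvm_cbe_else; }
+// with any PHI temporary copies printed just before each goto.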
+
+// PHI nodes get copied into temporary values at the end of predecessor basic
+// blocks. We now need to copy these temporary values into the REAL value for
+// the PHI.
+void CWriter::visitPHINode(PHINode &I) {
+ writeOperand(&I);
+ Out << "__PHI_TEMPORARY";
+}
+
+
+void CWriter::visitBinaryOperator(Instruction &I) {
+  // Binary instructions: arithmetic, logical, and shift operations.
+ assert(!isa<PointerType>(I.getType()));
+
+ // We must cast the results of binary operations which might be promoted.
+ bool needsCast = false;
+ if ((I.getType() == Type::Int8Ty) || (I.getType() == Type::Int16Ty)
+ || (I.getType() == Type::FloatTy)) {
+ needsCast = true;
+ Out << "((";
+ printType(Out, I.getType(), false);
+ Out << ")(";
+ }
+
+ // If this is a negation operation, print it out as such. For FP, we don't
+ // want to print "-0.0 - X".
+ if (BinaryOperator::isNeg(&I)) {
+ Out << "-(";
+ writeOperand(BinaryOperator::getNegArgument(cast<BinaryOperator>(&I)));
+ Out << ")";
+ } else if (I.getOpcode() == Instruction::FRem) {
+ // Output a call to fmod/fmodf instead of emitting a%b
+ if (I.getType() == Type::FloatTy)
+ Out << "fmodf(";
+ else if (I.getType() == Type::DoubleTy)
+ Out << "fmod(";
+ else // all 3 flavors of long double
+ Out << "fmodl(";
+ writeOperand(I.getOperand(0));
+ Out << ", ";
+ writeOperand(I.getOperand(1));
+ Out << ")";
+ } else {
+
+ // Write out the cast of the instruction's value back to the proper type
+ // if necessary.
+ bool NeedsClosingParens = writeInstructionCast(I);
+
+ // Certain instructions require the operand to be forced to a specific type
+ // so we use writeOperandWithCast here instead of writeOperand. Similarly
+ // below for operand 1
+ writeOperandWithCast(I.getOperand(0), I.getOpcode());
+
+ switch (I.getOpcode()) {
+ case Instruction::Add: Out << " + "; break;
+ case Instruction::Sub: Out << " - "; break;
+ case Instruction::Mul: Out << " * "; break;
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem: Out << " % "; break;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv: Out << " / "; break;
+ case Instruction::And: Out << " & "; break;
+ case Instruction::Or: Out << " | "; break;
+ case Instruction::Xor: Out << " ^ "; break;
+ case Instruction::Shl : Out << " << "; break;
+ case Instruction::LShr:
+ case Instruction::AShr: Out << " >> "; break;
+ default: cerr << "Invalid operator type!" << I; abort();
+ }
+
+ writeOperandWithCast(I.getOperand(1), I.getOpcode());
+ if (NeedsClosingParens)
+ Out << "))";
+ }
+
+ if (needsCast) {
+ Out << "))";
+ }
+}
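+
+// For example (operand names hypothetical), 'frem float %a, %b' becomes the
+// libm call fmodf(llvm_cbe_a, llvm_cbe_b), while an i8 add is re-narrowed
+// as ((unsigned char)(...)) by the needsCast wrapper above.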
+
+void CWriter::visitICmpInst(ICmpInst &I) {
+ // Write out the cast of the instruction's value back to the proper type
+ // if necessary.
+ bool NeedsClosingParens = writeInstructionCast(I);
+
+  // Certain icmp predicates require the operand to be forced to a specific type
+ // so we use writeOperandWithCast here instead of writeOperand. Similarly
+ // below for operand 1
+ writeOperandWithCast(I.getOperand(0), I);
+
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ: Out << " == "; break;
+ case ICmpInst::ICMP_NE: Out << " != "; break;
+ case ICmpInst::ICMP_ULE:
+ case ICmpInst::ICMP_SLE: Out << " <= "; break;
+ case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_SGE: Out << " >= "; break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: Out << " < "; break;
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT: Out << " > "; break;
+ default: cerr << "Invalid icmp predicate!" << I; abort();
+ }
+
+ writeOperandWithCast(I.getOperand(1), I);
+ if (NeedsClosingParens)
+ Out << "))";
+}
+
+void CWriter::visitFCmpInst(FCmpInst &I) {
+ if (I.getPredicate() == FCmpInst::FCMP_FALSE) {
+ Out << "0";
+ return;
+ }
+ if (I.getPredicate() == FCmpInst::FCMP_TRUE) {
+ Out << "1";
+ return;
+ }
+
+ const char* op = 0;
+ switch (I.getPredicate()) {
+ default: assert(0 && "Illegal FCmp predicate");
+ case FCmpInst::FCMP_ORD: op = "ord"; break;
+ case FCmpInst::FCMP_UNO: op = "uno"; break;
+ case FCmpInst::FCMP_UEQ: op = "ueq"; break;
+ case FCmpInst::FCMP_UNE: op = "une"; break;
+ case FCmpInst::FCMP_ULT: op = "ult"; break;
+ case FCmpInst::FCMP_ULE: op = "ule"; break;
+ case FCmpInst::FCMP_UGT: op = "ugt"; break;
+ case FCmpInst::FCMP_UGE: op = "uge"; break;
+ case FCmpInst::FCMP_OEQ: op = "oeq"; break;
+ case FCmpInst::FCMP_ONE: op = "one"; break;
+ case FCmpInst::FCMP_OLT: op = "olt"; break;
+ case FCmpInst::FCMP_OLE: op = "ole"; break;
+ case FCmpInst::FCMP_OGT: op = "ogt"; break;
+ case FCmpInst::FCMP_OGE: op = "oge"; break;
+ }
+
+ Out << "llvm_fcmp_" << op << "(";
+ // Write the first operand
+ writeOperand(I.getOperand(0));
+ Out << ", ";
+ // Write the second operand
+ writeOperand(I.getOperand(1));
+ Out << ")";
+}
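+// The llvm_fcmp_* helpers are defined elsewhere in the emitted C. As a rough
+// sketch (not the exact definitions), the ordered variants look like
+//   static int llvm_fcmp_oeq(double X, double Y) { return X == Y; }
+// while the unordered variants also succeed on NaN operands, e.g.
+//   static int llvm_fcmp_ueq(double X, double Y) {
+//     return X != X || Y != Y || X == Y;
+//   }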
+
+static const char * getFloatBitCastField(const Type *Ty) {
+ switch (Ty->getTypeID()) {
+ default: assert(0 && "Invalid Type");
+ case Type::FloatTyID: return "Float";
+ case Type::DoubleTyID: return "Double";
+ case Type::IntegerTyID: {
+ unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+ if (NumBits <= 32)
+ return "Int32";
+ else
+ return "Int64";
+ }
+ }
+}
+
+void CWriter::visitCastInst(CastInst &I) {
+ const Type *DstTy = I.getType();
+ const Type *SrcTy = I.getOperand(0)->getType();
+ if (isFPIntBitCast(I)) {
+ Out << '(';
+ // These int<->float and long<->double casts need to be handled specially
+ Out << GetValueName(&I) << "__BITCAST_TEMPORARY."
+ << getFloatBitCastField(I.getOperand(0)->getType()) << " = ";
+ writeOperand(I.getOperand(0));
+ Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY."
+ << getFloatBitCastField(I.getType());
+ Out << ')';
+ return;
+ }
+
+ Out << '(';
+ printCast(I.getOpcode(), SrcTy, DstTy);
+
+ // Make a sext from i1 work by subtracting the i1 from 0 (an int).
+ if (SrcTy == Type::Int1Ty && I.getOpcode() == Instruction::SExt)
+ Out << "0-";
+
+ writeOperand(I.getOperand(0));
+
+ if (DstTy == Type::Int1Ty &&
+ (I.getOpcode() == Instruction::Trunc ||
+ I.getOpcode() == Instruction::FPToUI ||
+ I.getOpcode() == Instruction::FPToSI ||
+ I.getOpcode() == Instruction::PtrToInt)) {
+ // Make sure we really get a trunc to bool by anding the operand with 1
+ Out << "&1u";
+ }
+ Out << ')';
+}
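+// Illustrative sketch (hypothetical value names): "sext i1 %b to i32" prints
+// roughly as "((int)(0-llvm_cbe_b))", smearing the single bit across the
+// word, while "trunc i32 %v to i1" prints roughly as
+// "((bool)llvm_cbe_v&1u)", forcing a clean 0/1 value.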
+
+void CWriter::visitSelectInst(SelectInst &I) {
+ Out << "((";
+ writeOperand(I.getCondition());
+ Out << ") ? (";
+ writeOperand(I.getTrueValue());
+ Out << ") : (";
+ writeOperand(I.getFalseValue());
+ Out << "))";
+}
+
+
+void CWriter::lowerIntrinsics(Function &F) {
+ // This is used to keep track of intrinsics that get generated to a lowered
+ // function. We must generate the prototypes before the function body which
+ // will only be expanded on first use (by the loop below).
+ std::vector<Function*> prototypesToGen;
+
+ // Examine all the instructions in this function to find the intrinsics that
+ // need to be lowered.
+ for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB)
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+ if (CallInst *CI = dyn_cast<CallInst>(I++))
+ if (Function *F = CI->getCalledFunction())
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::not_intrinsic:
+ case Intrinsic::memory_barrier:
+ case Intrinsic::vastart:
+ case Intrinsic::vacopy:
+ case Intrinsic::vaend:
+ case Intrinsic::returnaddress:
+ case Intrinsic::frameaddress:
+ case Intrinsic::setjmp:
+ case Intrinsic::longjmp:
+ case Intrinsic::prefetch:
+ case Intrinsic::dbg_stoppoint:
+ case Intrinsic::powi:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse_cmp_ps:
+ case Intrinsic::x86_sse2_cmp_sd:
+ case Intrinsic::x86_sse2_cmp_pd:
+ case Intrinsic::ppc_altivec_lvsl:
+ // We directly implement these intrinsics
+ break;
+ default:
+ // If this is an intrinsic that directly corresponds to a GCC
+ // builtin, we handle it.
+ const char *BuiltinName = "";
+#define GET_GCC_BUILTIN_NAME
+#include "llvm/Intrinsics.gen"
+#undef GET_GCC_BUILTIN_NAME
+ // If we handle it, don't lower it.
+ if (BuiltinName[0]) break;
+
+ // All other intrinsic calls we must lower.
+ Instruction *Before = 0;
+ if (CI != &BB->front())
+ Before = prior(BasicBlock::iterator(CI));
+
+ IL->LowerIntrinsicCall(CI);
+ if (Before) { // Move iterator to instruction after call
+ I = Before; ++I;
+ } else {
+ I = BB->begin();
+ }
+ // If the intrinsic got lowered to another call, and that call has
+ // a definition then we need to make sure its prototype is emitted
+ // before any calls to it.
+ if (CallInst *Call = dyn_cast<CallInst>(I))
+ if (Function *NewF = Call->getCalledFunction())
+ if (!NewF->isDeclaration())
+ prototypesToGen.push_back(NewF);
+
+ break;
+ }
+
+ // We may have collected some prototypes to emit in the loop above.
+ // Emit them now, before the function that uses them is emitted. But,
+ // be careful not to emit them twice.
+ std::vector<Function*>::iterator I = prototypesToGen.begin();
+ std::vector<Function*>::iterator E = prototypesToGen.end();
+ for ( ; I != E; ++I) {
+ if (intrinsicPrototypesAlreadyGenerated.insert(*I).second) {
+ Out << '\n';
+ printFunctionSignature(*I, true);
+ Out << ";\n";
+ }
+ }
+}
+
+void CWriter::visitCallInst(CallInst &I) {
+ if (isa<InlineAsm>(I.getOperand(0)))
+ return visitInlineAsm(I);
+
+ bool WroteCallee = false;
+
+ // Handle intrinsic function calls first...
+ if (Function *F = I.getCalledFunction())
+ if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
+ if (visitBuiltinCall(I, ID, WroteCallee))
+ return;
+
+ Value *Callee = I.getCalledValue();
+
+ const PointerType *PTy = cast<PointerType>(Callee->getType());
+ const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+
+ // If this is a call to a struct-return function, assign to the first
+ // parameter instead of passing it to the call.
+ const AttrListPtr &PAL = I.getAttributes();
+ bool hasByVal = I.hasByValArgument();
+ bool isStructRet = I.hasStructRetAttr();
+ if (isStructRet) {
+ writeOperandDeref(I.getOperand(1));
+ Out << " = ";
+ }
+
+ if (I.isTailCall()) Out << " /*tail*/ ";
+
+ if (!WroteCallee) {
+ // If this is an indirect call to a struct return function, we need to cast
+ // the pointer. Ditto for indirect calls with byval arguments.
+ bool NeedsCast = (hasByVal || isStructRet) && !isa<Function>(Callee);
+
+ // GCC is a real PITA. It does not permit codegening casts of functions to
+ // function pointers if they are in a call (it generates a trap instruction
+ // instead!). We work around this by inserting a cast to void* in between
+ // the function and the function pointer cast. Unfortunately, we can't just
+ // form the constant expression here, because the folder will immediately
+ // nuke it.
+ //
+ // Note finally, that this is completely unsafe. ANSI C does not guarantee
+ // that void* and function pointers have the same size. :( To deal with this
+ // in the common case, we handle casts where the number of arguments passed
+ // match exactly.
+ //
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Callee))
+ if (CE->isCast())
+ if (Function *RF = dyn_cast<Function>(CE->getOperand(0))) {
+ NeedsCast = true;
+ Callee = RF;
+ }
+
+ if (NeedsCast) {
+ // Ok, just cast the pointer type.
+ Out << "((";
+ if (isStructRet)
+ printStructReturnPointerFunctionType(Out, PAL,
+ cast<PointerType>(I.getCalledValue()->getType()));
+ else if (hasByVal)
+ printType(Out, I.getCalledValue()->getType(), false, "", true, PAL);
+ else
+ printType(Out, I.getCalledValue()->getType());
+ Out << ")(void*)";
+ }
+ writeOperand(Callee);
+ if (NeedsCast) Out << ')';
+ }
+
+ Out << '(';
+
+ unsigned NumDeclaredParams = FTy->getNumParams();
+
+ CallSite::arg_iterator AI = I.op_begin()+1, AE = I.op_end();
+ unsigned ArgNo = 0;
+ if (isStructRet) { // Skip struct return argument.
+ ++AI;
+ ++ArgNo;
+ }
+
+ bool PrintedArg = false;
+ for (; AI != AE; ++AI, ++ArgNo) {
+ if (PrintedArg) Out << ", ";
+ if (ArgNo < NumDeclaredParams &&
+ (*AI)->getType() != FTy->getParamType(ArgNo)) {
+ Out << '(';
+ printType(Out, FTy->getParamType(ArgNo),
+ /*isSigned=*/PAL.paramHasAttr(ArgNo+1, Attribute::SExt));
+ Out << ')';
+ }
+ // Check if the argument is expected to be passed by value.
+ if (I.paramHasAttr(ArgNo+1, Attribute::ByVal))
+ writeOperandDeref(*AI);
+ else
+ writeOperand(*AI);
+ PrintedArg = true;
+ }
+ Out << ')';
+}
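+// Illustrative sketch (hypothetical names): an indirect call to an sret
+// function comes out roughly as
+//   *llvm_cbe_ret = ((struct l_S (*)(unsigned int))(void*)llvm_cbe_fp)(...);
+// with the intermediate (void*) cast working around the GCC trap described
+// above.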
+
+/// visitBuiltinCall - Handle the call to the specified builtin. Returns true
+/// if the entire call is handled and false if it wasn't; optionally sets
+/// 'WroteCallee' if the callee has already been printed out.
+bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
+ bool &WroteCallee) {
+ switch (ID) {
+ default: {
+ // If this is an intrinsic that directly corresponds to a GCC
+ // builtin, we emit it here.
+ const char *BuiltinName = "";
+ Function *F = I.getCalledFunction();
+#define GET_GCC_BUILTIN_NAME
+#include "llvm/Intrinsics.gen"
+#undef GET_GCC_BUILTIN_NAME
+ assert(BuiltinName[0] && "Unknown LLVM intrinsic!");
+
+ Out << BuiltinName;
+ WroteCallee = true;
+ return false;
+ }
+ case Intrinsic::memory_barrier:
+ Out << "__sync_synchronize()";
+ return true;
+ case Intrinsic::vastart:
+ Out << "0; ";
+
+ Out << "va_start(*(va_list*)";
+ writeOperand(I.getOperand(1));
+ Out << ", ";
+ // Output the last argument to the enclosing function.
+ if (I.getParent()->getParent()->arg_empty()) {
+ cerr << "The C backend does not currently support zero "
+ << "argument varargs functions, such as '"
+ << I.getParent()->getParent()->getName() << "'!\n";
+ abort();
+ }
+ writeOperand(--I.getParent()->getParent()->arg_end());
+ Out << ')';
+ return true;
+ case Intrinsic::vaend:
+ if (!isa<ConstantPointerNull>(I.getOperand(1))) {
+ Out << "0; va_end(*(va_list*)";
+ writeOperand(I.getOperand(1));
+ Out << ')';
+ } else {
+ Out << "va_end(*(va_list*)0)";
+ }
+ return true;
+ case Intrinsic::vacopy:
+ Out << "0; ";
+ Out << "va_copy(*(va_list*)";
+ writeOperand(I.getOperand(1));
+ Out << ", *(va_list*)";
+ writeOperand(I.getOperand(2));
+ Out << ')';
+ return true;
+ case Intrinsic::returnaddress:
+ Out << "__builtin_return_address(";
+ writeOperand(I.getOperand(1));
+ Out << ')';
+ return true;
+ case Intrinsic::frameaddress:
+ Out << "__builtin_frame_address(";
+ writeOperand(I.getOperand(1));
+ Out << ')';
+ return true;
+ case Intrinsic::powi:
+ Out << "__builtin_powi(";
+ writeOperand(I.getOperand(1));
+ Out << ", ";
+ writeOperand(I.getOperand(2));
+ Out << ')';
+ return true;
+ case Intrinsic::setjmp:
+ Out << "setjmp(*(jmp_buf*)";
+ writeOperand(I.getOperand(1));
+ Out << ')';
+ return true;
+ case Intrinsic::longjmp:
+ Out << "longjmp(*(jmp_buf*)";
+ writeOperand(I.getOperand(1));
+ Out << ", ";
+ writeOperand(I.getOperand(2));
+ Out << ')';
+ return true;
+ case Intrinsic::prefetch:
+ Out << "LLVM_PREFETCH((const void *)";
+ writeOperand(I.getOperand(1));
+ Out << ", ";
+ writeOperand(I.getOperand(2));
+ Out << ", ";
+ writeOperand(I.getOperand(3));
+ Out << ")";
+ return true;
+ case Intrinsic::stacksave:
+ // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save()
+ // to work around GCC bugs (see PR1809).
+ Out << "0; *((void**)&" << GetValueName(&I)
+ << ") = __builtin_stack_save()";
+ return true;
+ case Intrinsic::dbg_stoppoint: {
+ // If we use writeOperand directly we get a "u" suffix which is rejected
+ // by gcc.
+ std::stringstream SPIStr;
+ DbgStopPointInst &SPI = cast<DbgStopPointInst>(I);
+ SPI.getDirectory()->print(SPIStr);
+ Out << "\n#line "
+ << SPI.getLine()
+ << " \"";
+ Out << SPIStr.str();
+    SPIStr.str("");  // clear() only resets error flags; str("") empties it.
+ SPI.getFileName()->print(SPIStr);
+ Out << SPIStr.str() << "\"\n";
+ return true;
+ }
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse_cmp_ps:
+ case Intrinsic::x86_sse2_cmp_sd:
+ case Intrinsic::x86_sse2_cmp_pd:
+ Out << '(';
+ printType(Out, I.getType());
+ Out << ')';
+ // Multiple GCC builtins multiplex onto this intrinsic.
+ switch (cast<ConstantInt>(I.getOperand(3))->getZExtValue()) {
+ default: assert(0 && "Invalid llvm.x86.sse.cmp!");
+ case 0: Out << "__builtin_ia32_cmpeq"; break;
+ case 1: Out << "__builtin_ia32_cmplt"; break;
+ case 2: Out << "__builtin_ia32_cmple"; break;
+ case 3: Out << "__builtin_ia32_cmpunord"; break;
+ case 4: Out << "__builtin_ia32_cmpneq"; break;
+ case 5: Out << "__builtin_ia32_cmpnlt"; break;
+ case 6: Out << "__builtin_ia32_cmpnle"; break;
+ case 7: Out << "__builtin_ia32_cmpord"; break;
+ }
+ if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd)
+ Out << 'p';
+ else
+ Out << 's';
+ if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps)
+ Out << 's';
+ else
+ Out << 'd';
+
+ Out << "(";
+ writeOperand(I.getOperand(1));
+ Out << ", ";
+ writeOperand(I.getOperand(2));
+ Out << ")";
+ return true;
+ case Intrinsic::ppc_altivec_lvsl:
+ Out << '(';
+ printType(Out, I.getType());
+ Out << ')';
+ Out << "__builtin_altivec_lvsl(0, (void*)";
+ writeOperand(I.getOperand(1));
+ Out << ")";
+ return true;
+ }
+}
+
+// This converts the LLVM constraint string to something gcc expects.
+// TODO: work out platform-independent constraints and factor those out
+//       of the per-target tables
+// TODO: handle multiple constraint codes
+std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
+
+ assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle");
+
+ const char *const *table = 0;
+
+  // Grab the translation table from TargetAsmInfo if it exists.
+ if (!TAsm) {
+ std::string E;
+ const TargetMachineRegistry::entry* Match =
+ TargetMachineRegistry::getClosestStaticTargetForModule(*TheModule, E);
+ if (Match) {
+      // A per-platform TargetMachine doesn't exist yet, so create one;
+      // this must be done only once.
+ const TargetMachine* TM = Match->CtorFn(*TheModule, "");
+ TAsm = TM->getTargetAsmInfo();
+ }
+ }
+ if (TAsm)
+ table = TAsm->getAsmCBE();
+
+  // Search the translation table if it exists.
+ for (int i = 0; table && table[i]; i += 2)
+ if (c.Codes[0] == table[i])
+ return table[i+1];
+
+  // Default is identity.
+ return c.Codes[0];
+}
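+// Illustrative sketch (hypothetical entries): the table is a flat list of
+// pairs terminated by a null, e.g. { "r", "r", "m", "m", 0 }, so a
+// constraint code in an even slot maps to the GCC spelling beside it.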
+
+//TODO: import logic from AsmPrinter.cpp
+static std::string gccifyAsm(std::string asmstr) {
+ for (std::string::size_type i = 0; i != asmstr.size(); ++i)
+ if (asmstr[i] == '\n')
+ asmstr.replace(i, 1, "\\n");
+ else if (asmstr[i] == '\t')
+ asmstr.replace(i, 1, "\\t");
+ else if (asmstr[i] == '$') {
+ if (asmstr[i + 1] == '{') {
+ std::string::size_type a = asmstr.find_first_of(':', i + 1);
+ std::string::size_type b = asmstr.find_first_of('}', i + 1);
+ std::string n = "%" +
+ asmstr.substr(a + 1, b - a - 1) +
+ asmstr.substr(i + 2, a - i - 2);
+ asmstr.replace(i, b - i + 1, n);
+ i += n.size() - 1;
+ } else
+ asmstr.replace(i, 1, "%");
+ }
+    else if (asmstr[i] == '%') { // Escape a literal '%' for GCC.
+      asmstr.replace(i, 1, "%%");
+      ++i;
+    }
+
+ return asmstr;
+}
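+// Illustrative sketch: gccifyAsm rewrites LLVM asm syntax into GCC's, e.g.
+// "${0:x}" becomes "%x0", a bare "$1" becomes "%1", and a literal "%" is
+// doubled to "%%".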
+
+// TODO: The assumptions about what consumes arguments from the call are
+//       likely wrong.
+// TODO: handle commutativity.
+void CWriter::visitInlineAsm(CallInst &CI) {
+ InlineAsm* as = cast<InlineAsm>(CI.getOperand(0));
+ std::vector<InlineAsm::ConstraintInfo> Constraints = as->ParseConstraints();
+
+ std::vector<std::pair<Value*, int> > ResultVals;
+  if (CI.getType() != Type::VoidTy) {
+    if (const StructType *ST = dyn_cast<StructType>(CI.getType())) {
+      for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
+        ResultVals.push_back(std::make_pair(&CI, (int)i));
+    } else {
+      ResultVals.push_back(std::make_pair(&CI, -1));
+    }
+  }
+
+ // Fix up the asm string for gcc and emit it.
+ Out << "__asm__ volatile (\"" << gccifyAsm(as->getAsmString()) << "\"\n";
+ Out << " :";
+
+ unsigned ValueCount = 0;
+ bool IsFirst = true;
+
+ // Convert over all the output constraints.
+ for (std::vector<InlineAsm::ConstraintInfo>::iterator I = Constraints.begin(),
+ E = Constraints.end(); I != E; ++I) {
+
+ if (I->Type != InlineAsm::isOutput) {
+ ++ValueCount;
+ continue; // Ignore non-output constraints.
+ }
+
+ assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+ std::string C = InterpretASMConstraint(*I);
+ if (C.empty()) continue;
+
+    if (!IsFirst)
+      Out << ", ";
+    IsFirst = false;
+
+ // Unpack the dest.
+ Value *DestVal;
+ int DestValNo = -1;
+
+ if (ValueCount < ResultVals.size()) {
+ DestVal = ResultVals[ValueCount].first;
+ DestValNo = ResultVals[ValueCount].second;
+ } else
+ DestVal = CI.getOperand(ValueCount-ResultVals.size()+1);
+
+ if (I->isEarlyClobber)
+ C = "&"+C;
+
+ Out << "\"=" << C << "\"(" << GetValueName(DestVal);
+ if (DestValNo != -1)
+ Out << ".field" << DestValNo; // Multiple retvals.
+ Out << ")";
+ ++ValueCount;
+ }
+
+
+ // Convert over all the input constraints.
+ Out << "\n :";
+ IsFirst = true;
+ ValueCount = 0;
+ for (std::vector<InlineAsm::ConstraintInfo>::iterator I = Constraints.begin(),
+ E = Constraints.end(); I != E; ++I) {
+ if (I->Type != InlineAsm::isInput) {
+ ++ValueCount;
+ continue; // Ignore non-input constraints.
+ }
+
+ assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+ std::string C = InterpretASMConstraint(*I);
+ if (C.empty()) continue;
+
+    if (!IsFirst)
+      Out << ", ";
+    IsFirst = false;
+
+ assert(ValueCount >= ResultVals.size() && "Input can't refer to result");
+ Value *SrcVal = CI.getOperand(ValueCount-ResultVals.size()+1);
+
+ Out << "\"" << C << "\"(";
+ if (!I->isIndirect)
+ writeOperand(SrcVal);
+ else
+ writeOperandDeref(SrcVal);
+ Out << ")";
+ }
+
+ // Convert over the clobber constraints.
+ IsFirst = true;
+ ValueCount = 0;
+ for (std::vector<InlineAsm::ConstraintInfo>::iterator I = Constraints.begin(),
+ E = Constraints.end(); I != E; ++I) {
+ if (I->Type != InlineAsm::isClobber)
+      continue; // Ignore non-clobber constraints.
+
+ assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+ std::string C = InterpretASMConstraint(*I);
+ if (C.empty()) continue;
+
+    if (!IsFirst)
+      Out << ", ";
+    IsFirst = false;
+
+ Out << '\"' << C << '"';
+ }
+
+ Out << ")";
+}
+
+void CWriter::visitMallocInst(MallocInst &I) {
+ assert(0 && "lowerallocations pass didn't work!");
+}
+
+void CWriter::visitAllocaInst(AllocaInst &I) {
+ Out << '(';
+ printType(Out, I.getType());
+ Out << ") alloca(sizeof(";
+ printType(Out, I.getType()->getElementType());
+ Out << ')';
+ if (I.isArrayAllocation()) {
+ Out << " * " ;
+ writeOperand(I.getOperand(0));
+ }
+ Out << ')';
+}
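+// Illustrative sketch (hypothetical names): "alloca i32, i32 %n" is emitted
+// roughly as "(unsigned int*) alloca(sizeof(unsigned int) * llvm_cbe_n)".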
+
+void CWriter::visitFreeInst(FreeInst &I) {
+ assert(0 && "lowerallocations pass didn't work!");
+}
+
+void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
+ gep_type_iterator E, bool Static) {
+
+ // If there are no indices, just print out the pointer.
+ if (I == E) {
+ writeOperand(Ptr);
+ return;
+ }
+
+ // Find out if the last index is into a vector. If so, we have to print this
+ // specially. Since vectors can't have elements of indexable type, only the
+ // last index could possibly be of a vector element.
+ const VectorType *LastIndexIsVector = 0;
+ {
+ for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
+ LastIndexIsVector = dyn_cast<VectorType>(*TmpI);
+ }
+
+ Out << "(";
+
+ // If the last index is into a vector, we can't print it as &a[i][j] because
+ // we can't index into a vector with j in GCC. Instead, emit this as
+ // (((float*)&a[i])+j)
+ if (LastIndexIsVector) {
+ Out << "((";
+ printType(Out, PointerType::getUnqual(LastIndexIsVector->getElementType()));
+ Out << ")(";
+ }
+
+ Out << '&';
+
+ // If the first index is 0 (very typical) we can do a number of
+ // simplifications to clean up the code.
+ Value *FirstOp = I.getOperand();
+ if (!isa<Constant>(FirstOp) || !cast<Constant>(FirstOp)->isNullValue()) {
+ // First index isn't simple, print it the hard way.
+ writeOperand(Ptr);
+ } else {
+ ++I; // Skip the zero index.
+
+ // Okay, emit the first operand. If Ptr is something that is already address
+ // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead.
+ if (isAddressExposed(Ptr)) {
+ writeOperandInternal(Ptr, Static);
+ } else if (I != E && isa<StructType>(*I)) {
+ // If we didn't already emit the first operand, see if we can print it as
+ // P->f instead of "P[0].f"
+ writeOperand(Ptr);
+ Out << "->field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
+ ++I; // eat the struct index as well.
+ } else {
+ // Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic.
+ Out << "(*";
+ writeOperand(Ptr);
+ Out << ")";
+ }
+ }
+
+ for (; I != E; ++I) {
+ if (isa<StructType>(*I)) {
+ Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
+ } else if (isa<ArrayType>(*I)) {
+ Out << ".array[";
+ writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+ Out << ']';
+ } else if (!isa<VectorType>(*I)) {
+ Out << '[';
+ writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+ Out << ']';
+ } else {
+ // If the last index is into a vector, then print it out as "+j)". This
+ // works with the 'LastIndexIsVector' code above.
+ if (isa<Constant>(I.getOperand()) &&
+ cast<Constant>(I.getOperand())->isNullValue()) {
+ Out << "))"; // avoid "+0".
+ } else {
+ Out << ")+(";
+ writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+ Out << "))";
+ }
+ }
+ }
+ Out << ")";
+}
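+// Illustrative sketch (hypothetical names): with a leading zero index into a
+// struct field, "getelementptr %struct.S* %P, i32 0, i32 1" prints as
+// "(&llvm_cbe_P->field1)", while a trailing vector index falls into the
+// "(((float*)&...)+j)" form described above.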
+
+void CWriter::writeMemoryAccess(Value *Operand, const Type *OperandType,
+ bool IsVolatile, unsigned Alignment) {
+
+ bool IsUnaligned = Alignment &&
+ Alignment < TD->getABITypeAlignment(OperandType);
+
+ if (!IsUnaligned)
+ Out << '*';
+ if (IsVolatile || IsUnaligned) {
+ Out << "((";
+ if (IsUnaligned)
+ Out << "struct __attribute__ ((packed, aligned(" << Alignment << "))) {";
+ printType(Out, OperandType, false, IsUnaligned ? "data" : "volatile*");
+ if (IsUnaligned) {
+ Out << "; } ";
+ if (IsVolatile) Out << "volatile ";
+ Out << "*";
+ }
+ Out << ")";
+ }
+
+ writeOperand(Operand);
+
+ if (IsVolatile || IsUnaligned) {
+ Out << ')';
+ if (IsUnaligned)
+ Out << "->data";
+ }
+}
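+// Illustrative sketch (hypothetical names): an i32 load with alignment 1 is
+// emitted roughly as
+//   ((struct __attribute__ ((packed, aligned(1))) { unsigned int data; }*)
+//       llvm_cbe_p)->data
+// whereas a plain volatile load is just "*((unsigned int volatile*)...)".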
+
+void CWriter::visitLoadInst(LoadInst &I) {
+ writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(),
+ I.getAlignment());
+
+}
+
+void CWriter::visitStoreInst(StoreInst &I) {
+ writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(),
+ I.isVolatile(), I.getAlignment());
+ Out << " = ";
+ Value *Operand = I.getOperand(0);
+ Constant *BitMask = 0;
+ if (const IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
+ if (!ITy->isPowerOf2ByteWidth())
+ // We have a bit width that doesn't match an even power-of-2 byte
+ // size. Consequently we must & the value with the type's bit mask
+ BitMask = ConstantInt::get(ITy, ITy->getBitMask());
+ if (BitMask)
+ Out << "((";
+ writeOperand(Operand);
+ if (BitMask) {
+ Out << ") & ";
+ printConstant(BitMask, false);
+ Out << ")";
+ }
+}
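+// Illustrative sketch (hypothetical names): storing an i17 masks the value
+// back to its declared width, e.g. "*llvm_cbe_p = ((llvm_cbe_v) & 131071u)"
+// (131071 == 2^17 - 1), since the value lives in a wider C integer type.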
+
+void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) {
+ printGEPExpression(I.getPointerOperand(), gep_type_begin(I),
+ gep_type_end(I), false);
+}
+
+void CWriter::visitVAArgInst(VAArgInst &I) {
+ Out << "va_arg(*(va_list*)";
+ writeOperand(I.getOperand(0));
+ Out << ", ";
+ printType(Out, I.getType());
+ Out << ");\n ";
+}
+
+void CWriter::visitInsertElementInst(InsertElementInst &I) {
+ const Type *EltTy = I.getType()->getElementType();
+ writeOperand(I.getOperand(0));
+ Out << ";\n ";
+ Out << "((";
+ printType(Out, PointerType::getUnqual(EltTy));
+ Out << ")(&" << GetValueName(&I) << "))[";
+ writeOperand(I.getOperand(2));
+ Out << "] = (";
+ writeOperand(I.getOperand(1));
+ Out << ")";
+}
+
+void CWriter::visitExtractElementInst(ExtractElementInst &I) {
+ // We know that our operand is not inlined.
+ Out << "((";
+ const Type *EltTy =
+ cast<VectorType>(I.getOperand(0)->getType())->getElementType();
+ printType(Out, PointerType::getUnqual(EltTy));
+ Out << ")(&" << GetValueName(I.getOperand(0)) << "))[";
+ writeOperand(I.getOperand(1));
+ Out << "]";
+}
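+// Illustrative sketch (hypothetical names): "extractelement <4 x float> %v,
+// i32 2" is emitted as "((float*)(&llvm_cbe_v))[2]"-style pointer indexing,
+// since GCC does not allow subscripting a vector directly.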
+
+void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+ Out << "(";
+ printType(Out, SVI.getType());
+ Out << "){ ";
+ const VectorType *VT = SVI.getType();
+ unsigned NumElts = VT->getNumElements();
+ const Type *EltTy = VT->getElementType();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (i) Out << ", ";
+ int SrcVal = SVI.getMaskValue(i);
+ if ((unsigned)SrcVal >= NumElts*2) {
+ Out << " 0/*undef*/ ";
+ } else {
+ Value *Op = SVI.getOperand((unsigned)SrcVal >= NumElts);
+ if (isa<Instruction>(Op)) {
+ // Do an extractelement of this value from the appropriate input.
+ Out << "((";
+ printType(Out, PointerType::getUnqual(EltTy));
+ Out << ")(&" << GetValueName(Op)
+ << "))[" << (SrcVal & (NumElts-1)) << "]";
+ } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) {
+ Out << "0";
+ } else {
+ printConstant(cast<ConstantVector>(Op)->getOperand(SrcVal &
+ (NumElts-1)),
+ false);
+ }
+ }
+ }
+ Out << "}";
+}
+
+void CWriter::visitInsertValueInst(InsertValueInst &IVI) {
+ // Start by copying the entire aggregate value into the result variable.
+ writeOperand(IVI.getOperand(0));
+ Out << ";\n ";
+
+ // Then do the insert to update the field.
+ Out << GetValueName(&IVI);
+ for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end();
+ i != e; ++i) {
+ const Type *IndexedTy =
+ ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(), b, i+1);
+ if (isa<ArrayType>(IndexedTy))
+ Out << ".array[" << *i << "]";
+ else
+ Out << ".field" << *i;
+ }
+ Out << " = ";
+ writeOperand(IVI.getOperand(1));
+}
+
+void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
+ Out << "(";
+ if (isa<UndefValue>(EVI.getOperand(0))) {
+ Out << "(";
+ printType(Out, EVI.getType());
+ Out << ") 0/*UNDEF*/";
+ } else {
+ Out << GetValueName(EVI.getOperand(0));
+ for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
+ i != e; ++i) {
+ const Type *IndexedTy =
+ ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(), b, i+1);
+ if (isa<ArrayType>(IndexedTy))
+ Out << ".array[" << *i << "]";
+ else
+ Out << ".field" << *i;
+ }
+ }
+ Out << ")";
+}
+
+//===----------------------------------------------------------------------===//
+// External Interface declaration
+//===----------------------------------------------------------------------===//
+
+bool CTargetMachine::addPassesToEmitWholeFile(PassManager &PM,
+ raw_ostream &o,
+ CodeGenFileType FileType,
+ CodeGenOpt::Level OptLevel) {
+ if (FileType != TargetMachine::AssemblyFile) return true;
+
+ PM.add(createGCLoweringPass());
+ PM.add(createLowerAllocationsPass(true));
+ PM.add(createLowerInvokePass());
+ PM.add(createCFGSimplificationPass()); // clean up after lower invoke.
+ PM.add(new CBackendNameAllUsedStructsAndMergeFunctions());
+ PM.add(new CWriter(o));
+ PM.add(createGCInfoDeleter());
+ return false;
+}
diff --git a/lib/Target/CBackend/CMakeLists.txt b/lib/Target/CBackend/CMakeLists.txt
new file mode 100644
index 0000000..be24336
--- /dev/null
+++ b/lib/Target/CBackend/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_target(CBackend
+ CBackend.cpp
+ )
diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h
new file mode 100644
index 0000000..8b26245
--- /dev/null
+++ b/lib/Target/CBackend/CTargetMachine.h
@@ -0,0 +1,43 @@
+//===-- CTargetMachine.h - TargetMachine for the C backend ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetMachine that is used by the C backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CTARGETMACHINE_H
+#define CTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+struct CTargetMachine : public TargetMachine {
+ const TargetData DataLayout; // Calculates type size & alignment
+
+ CTargetMachine(const Module &M, const std::string &FS)
+ : DataLayout(&M) {}
+
+ virtual bool WantsWholeFile() const { return true; }
+ virtual bool addPassesToEmitWholeFile(PassManager &PM, raw_ostream &Out,
+ CodeGenFileType FileType,
+ CodeGenOpt::Level OptLevel);
+
+  // This class always works, but must be requested explicitly on the
+  // llc command line.
+ static unsigned getModuleMatchQuality(const Module &M) { return 0; }
+
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+};
+
+} // End llvm namespace
+
+
+#endif
diff --git a/lib/Target/CBackend/Makefile b/lib/Target/CBackend/Makefile
new file mode 100644
index 0000000..336de0c
--- /dev/null
+++ b/lib/Target/CBackend/Makefile
@@ -0,0 +1,14 @@
+##===- lib/Target/CBackend/Makefile ------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMCBackend
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts += -Wno-format
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
new file mode 100644
index 0000000..1cf0a91
--- /dev/null
+++ b/lib/Target/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_llvm_library(LLVMTarget
+ DarwinTargetAsmInfo.cpp
+ ELFTargetAsmInfo.cpp
+ SubtargetFeature.cpp
+ Target.cpp
+ TargetAsmInfo.cpp
+ TargetData.cpp
+ TargetFrameInfo.cpp
+ TargetInstrInfo.cpp
+ TargetMachOWriterInfo.cpp
+ TargetMachine.cpp
+ TargetMachineRegistry.cpp
+ TargetRegisterInfo.cpp
+ TargetSubtarget.cpp
+ )
+
+# TODO: Support other targets besides X86. See Makefile.
\ No newline at end of file
diff --git a/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt b/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..4336b05
--- /dev/null
+++ b/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,12 @@
+include_directories(
+ ${CMAKE_CURRENT_BINARY_DIR}/..
+ ${CMAKE_CURRENT_SOURCE_DIR}/..
+ )
+
+add_partially_linked_object(LLVMCellSPUAsmPrinter
+ SPUAsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMCellSPUCodeGen n)
+
+add_dependencies(LLVMCellSPUAsmPrinter ${n})
diff --git a/lib/Target/CellSPU/AsmPrinter/Makefile b/lib/Target/CellSPU/AsmPrinter/Makefile
new file mode 100644
index 0000000..dd56df7
--- /dev/null
+++ b/lib/Target/CellSPU/AsmPrinter/Makefile
@@ -0,0 +1,17 @@
+##===- lib/Target/CellSPU/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMCellSPUAsmPrinter
+
+# Hack: we need to include 'main' CellSPU target directory to grab
+# private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp b/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp
new file mode 100644
index 0000000..da1bf07
--- /dev/null
+++ b/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp
@@ -0,0 +1,623 @@
+//===-- SPUAsmPrinter.cpp - Print machine instrs to Cell SPU assembly ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Cell SPU assembly language. This printer
+// is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asmprinter"
+#include "SPU.h"
+#include "SPUTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include <set>
+using namespace llvm;
+
+namespace {
+ STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+ const std::string bss_section(".bss");
+
+ class VISIBILITY_HIDDEN SPUAsmPrinter : public AsmPrinter {
+ std::set<std::string> FnStubs, GVStubs;
+ public:
+ explicit SPUAsmPrinter(raw_ostream &O, TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V) :
+ AsmPrinter(O, TM, T, OL, V) {}
+
+ virtual const char *getPassName() const {
+ return "STI CBEA SPU Assembly Printer";
+ }
+
+ SPUTargetMachine &getTM() {
+ return static_cast<SPUTargetMachine&>(TM);
+ }
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ void printMachineInstruction(const MachineInstr *MI);
+ void printOp(const MachineOperand &MO);
+
+ /// printRegister - Print register according to target requirements.
+ ///
+ void printRegister(const MachineOperand &MO, bool R0AsZero) {
+ unsigned RegNo = MO.getReg();
+ assert(TargetRegisterInfo::isPhysicalRegister(RegNo) &&
+ "Not physreg??");
+ O << TM.getRegisterInfo()->get(RegNo).AsmName;
+ }
+
+ void printOperand(const MachineInstr *MI, unsigned OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (MO.isReg()) {
+        assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+               "Not physreg??");
+ O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ } else if (MO.isImm()) {
+ O << MO.getImm();
+ } else {
+ printOp(MO);
+ }
+ }
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+
+
+ void
+ printS7ImmOperand(const MachineInstr *MI, unsigned OpNo)
+ {
+ int value = MI->getOperand(OpNo).getImm();
+ value = (value << (32 - 7)) >> (32 - 7);
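+      // Sketch: this shift pair sign-extends the low 7 bits, so an encoded
+      // immediate of 0x7f prints as -1 while 0x3f prints as 63.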
+
+      assert((value >= -(1 << 6) && value <= (1 << 6) - 1)
+             && "Invalid s7 argument");
+ O << value;
+ }
+
+ void
+ printU7ImmOperand(const MachineInstr *MI, unsigned OpNo)
+ {
+ unsigned int value = MI->getOperand(OpNo).getImm();
+      assert(value < (1 << 7) && "Invalid u7 argument");
+ O << value;
+ }
+
+ void
+ printShufAddr(const MachineInstr *MI, unsigned OpNo)
+ {
+ char value = MI->getOperand(OpNo).getImm();
+ O << (int) value;
+ O << "(";
+ printOperand(MI, OpNo+1);
+ O << ")";
+ }
+
+ void
+ printS16ImmOperand(const MachineInstr *MI, unsigned OpNo)
+ {
+ O << (short) MI->getOperand(OpNo).getImm();
+ }
+
+ void
+ printU16ImmOperand(const MachineInstr *MI, unsigned OpNo)
+ {
+ O << (unsigned short)MI->getOperand(OpNo).getImm();
+ }
+
+ void
+ printU32ImmOperand(const MachineInstr *MI, unsigned OpNo)
+ {
+ O << (unsigned)MI->getOperand(OpNo).getImm();
+ }
+
+ void
+ printMemRegReg(const MachineInstr *MI, unsigned OpNo) {
+ // When used as the base register, r0 reads constant zero rather than
+ // the value contained in the register. For this reason, the darwin
+ // assembler requires that we print r0 as 0 (no r) when used as the base.
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ O << ", ";
+ printOperand(MI, OpNo+1);
+ }
+
+ void
+ printU18ImmOperand(const MachineInstr *MI, unsigned OpNo)
+ {
+ unsigned int value = MI->getOperand(OpNo).getImm();
+      assert(value <= (1 << 18) - 1 && "Invalid u18 argument");
+ O << value;
+ }
+
+ void
+ printS10ImmOperand(const MachineInstr *MI, unsigned OpNo)
+ {
+ short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
+ >> 16);
+ assert((value >= -(1 << 9) && value <= (1 << 9) - 1)
+ && "Invalid s10 argument");
+ O << value;
+ }
+
+ void
+ printU10ImmOperand(const MachineInstr *MI, unsigned OpNo)
+ {
+ short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
+ >> 16);
+ assert((value <= (1 << 10) - 1) && "Invalid u10 argument");
+ O << value;
+ }
+
+ void
+ printDFormAddr(const MachineInstr *MI, unsigned OpNo)
+ {
+ assert(MI->getOperand(OpNo).isImm() &&
+ "printDFormAddr first operand is not immediate");
+ int64_t value = int64_t(MI->getOperand(OpNo).getImm());
+ int16_t value16 = int16_t(value);
+ assert((value16 >= -(1 << (9+4)) && value16 <= (1 << (9+4)) - 1)
+ && "Invalid dform s10 offset argument");
+ O << (value16 & ~0xf) << "(";
+ printOperand(MI, OpNo+1);
+ O << ")";
+ }
+
+ void
+ printAddr256K(const MachineInstr *MI, unsigned OpNo)
+ {
+ /* Note: operand 1 is an offset or symbol name. */
+ if (MI->getOperand(OpNo).isImm()) {
+ printS16ImmOperand(MI, OpNo);
+ } else {
+ printOp(MI->getOperand(OpNo));
+ if (MI->getOperand(OpNo+1).isImm()) {
+ int displ = int(MI->getOperand(OpNo+1).getImm());
+ if (displ > 0)
+ O << "+" << displ;
+ else if (displ < 0)
+ O << displ;
+ }
+ }
+ }
+
+ void printCallOperand(const MachineInstr *MI, unsigned OpNo) {
+ printOp(MI->getOperand(OpNo));
+ }
+
+ void printPCRelativeOperand(const MachineInstr *MI, unsigned OpNo) {
+ // Used to generate a ".-<target>", but it turns out that the assembler
+ // really wants the target.
+ //
+ // N.B.: This operand is used for call targets. Branch hints are another
+ // animal entirely.
+ printOp(MI->getOperand(OpNo));
+ }
+
+ void printHBROperand(const MachineInstr *MI, unsigned OpNo) {
+ // HBR operands are generated in front of branches, hence, the
+ // program counter plus the target.
+ O << ".+";
+ printOp(MI->getOperand(OpNo));
+ }
+
+ void printSymbolHi(const MachineInstr *MI, unsigned OpNo) {
+ if (MI->getOperand(OpNo).isImm()) {
+ printS16ImmOperand(MI, OpNo);
+ } else {
+ printOp(MI->getOperand(OpNo));
+ O << "@h";
+ }
+ }
+
+ void printSymbolLo(const MachineInstr *MI, unsigned OpNo) {
+ if (MI->getOperand(OpNo).isImm()) {
+ printS16ImmOperand(MI, OpNo);
+ } else {
+ printOp(MI->getOperand(OpNo));
+ O << "@l";
+ }
+ }
+
+ /// Print local store address
+ void printSymbolLSA(const MachineInstr *MI, unsigned OpNo) {
+ printOp(MI->getOperand(OpNo));
+ }
+
+ void printROTHNeg7Imm(const MachineInstr *MI, unsigned OpNo) {
+ if (MI->getOperand(OpNo).isImm()) {
+ int value = (int) MI->getOperand(OpNo).getImm();
+ assert((value >= 0 && value < 16)
+ && "Invalid negated immediate rotate 7-bit argument");
+ O << -value;
+ } else {
+ assert(0 &&"Invalid/non-immediate rotate amount in printRotateNeg7Imm");
+ }
+ }
+
+ void printROTNeg7Imm(const MachineInstr *MI, unsigned OpNo) {
+ if (MI->getOperand(OpNo).isImm()) {
+ int value = (int) MI->getOperand(OpNo).getImm();
+ assert((value >= 0 && value <= 32)
+ && "Invalid negated immediate rotate 7-bit argument");
+ O << -value;
+ } else {
+ assert(0 &&"Invalid/non-immediate rotate amount in printRotateNeg7Imm");
+ }
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &F) = 0;
+ //! Assembly printer cleanup after function has been emitted
+ virtual bool doFinalization(Module &M) = 0;
+ };
+
+ /// LinuxAsmPrinter - SPU assembly printer, customized for Linux
+ class VISIBILITY_HIDDEN LinuxAsmPrinter : public SPUAsmPrinter {
+ DwarfWriter *DW;
+ MachineModuleInfo *MMI;
+ public:
+ explicit LinuxAsmPrinter(raw_ostream &O, SPUTargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level F,
+ bool V)
+ : SPUAsmPrinter(O, TM, T, F, V), DW(0), MMI(0) {}
+
+ virtual const char *getPassName() const {
+ return "STI CBEA SPU Assembly Printer";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ //! Dump globals, perform cleanup after function emission
+ bool doFinalization(Module &M);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<DwarfWriter>();
+ SPUAsmPrinter::getAnalysisUsage(AU);
+ }
+
+ //! Emit a global variable according to its section and type
+ void printModuleLevelGV(const GlobalVariable* GVar);
+ };
+} // end of anonymous namespace
+
+// Include the auto-generated portion of the assembly writer
+#include "SPUGenAsmWriter.inc"
+
+void SPUAsmPrinter::printOp(const MachineOperand &MO) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Immediate:
+ cerr << "printOp() does not handle immediate values\n";
+ abort();
+ return;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+ case MachineOperand::MO_JumpTableIndex:
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ return;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ return;
+ case MachineOperand::MO_ExternalSymbol:
+ // Computing the address of an external symbol, not calling it.
+ if (TM.getRelocationModel() != Reloc::Static) {
+ std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName();
+ GVStubs.insert(Name);
+ O << "L" << Name << "$non_lazy_ptr";
+ return;
+ }
+ O << TAI->getGlobalPrefix() << MO.getSymbolName();
+ return;
+ case MachineOperand::MO_GlobalAddress: {
+ // Computing the address of a global symbol, not calling it.
+ GlobalValue *GV = MO.getGlobal();
+ std::string Name = Mang->getValueName(GV);
+
+ // External or weakly linked global variables need non-lazily-resolved
+ // stubs
+ if (TM.getRelocationModel() != Reloc::Static) {
+ if (((GV->isDeclaration() || GV->hasWeakLinkage() ||
+ GV->hasLinkOnceLinkage() || GV->hasCommonLinkage()))) {
+ GVStubs.insert(Name);
+ O << "L" << Name << "$non_lazy_ptr";
+ return;
+ }
+ }
+ O << Name;
+
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ return;
+ }
+
+ default:
+ O << "<unknown operand type: " << MO.getType() << ">";
+ return;
+ }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool SPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'L': // Write second word of DImode reference.
+ // Verify that this operand has two consecutive registers.
+ if (!MI->getOperand(OpNo).isReg() ||
+ OpNo+1 == MI->getNumOperands() ||
+ !MI->getOperand(OpNo+1).isReg())
+ return true;
+ ++OpNo; // Return the high-part.
+ break;
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool SPUAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+ printMemRegReg(MI, OpNo);
+ return false;
+}
+
+/// printMachineInstruction -- Print out a single Cell SPU MI to the current
+/// output stream.
+///
+void SPUAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+ printInstruction(MI);
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool
+LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF)
+{
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+
+ SwitchToSection(TAI->SectionForGlobal(F));
+ EmitAlignment(3, F);
+
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::PrivateLinkage:
+ case Function::InternalLinkage: // Symbols default to internal.
+ break;
+ case Function::ExternalLinkage:
+ O << "\t.global\t" << CurrentFnName << "\n"
+ << "\t.type\t" << CurrentFnName << ", @function\n";
+ break;
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ O << "\t.global\t" << CurrentFnName << "\n";
+ O << "\t.weak_definition\t" << CurrentFnName << "\n";
+ break;
+ }
+ O << CurrentFnName << ":\n";
+
+ // Emit pre-function debug information.
+ DW->BeginFunction(&MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ O << "\t.size\t" << CurrentFnName << ",.-" << CurrentFnName << "\n";
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ // Emit post-function debug information.
+ DW->EndFunction(&MF);
+
+ // We didn't modify anything.
+ return false;
+}
+
+
+bool LinuxAsmPrinter::doInitialization(Module &M) {
+ bool Result = AsmPrinter::doInitialization(M);
+ SwitchToTextSection("\t.text");
+ // Emit initial debug information.
+ DW = getAnalysisIfAvailable<DwarfWriter>();
+ assert(DW && "Dwarf Writer is not available");
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ DW->BeginModule(&M, MMI, O, this, TAI);
+ return Result;
+}
+
+/// PrintUnmangledNameSafely - Print out the printable characters in the name.
+/// Don't print things like \\n or \\0.
+static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
+ for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
+ Name != E; ++Name)
+ if (isprint(*Name))
+ OS << *Name;
+}
+
+/*!
+ Emit a global variable according to its section, alignment, etc.
+
+  \note This code was shamelessly copied from the PowerPC assembly printer,
+  which sort of screams for some kind of refactoring of the common code.
+ */
+void LinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+ return;
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar))
+ return;
+
+ std::string name = Mang->getValueName(GVar);
+
+ printVisibility(name, GVar->getVisibility());
+
+ Constant *C = GVar->getInitializer();
+ const Type *Type = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Type);
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ if (C->isNullValue() && /* FIXME: Verify correct */
+ !GVar->hasSection() &&
+ (GVar->hasLocalLinkage() || GVar->hasExternalLinkage() ||
+ GVar->isWeakForLinker())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (GVar->hasExternalLinkage()) {
+ O << "\t.global " << name << '\n';
+ O << "\t.type " << name << ", @object\n";
+ O << name << ":\n";
+ O << "\t.zero " << Size << '\n';
+ } else if (GVar->hasLocalLinkage()) {
+ O << TAI->getLCOMMDirective() << name << ',' << Size;
+ } else {
+ O << ".comm " << name << ',' << Size;
+ }
+ O << "\t\t" << TAI->getCommentString() << " '";
+ PrintUnmangledNameSafely(GVar, O);
+ O << "'\n";
+ return;
+ }
+
+ switch (GVar->getLinkage()) {
+ // Should never be seen for the CellSPU platform...
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::CommonLinkage:
+ O << "\t.global " << name << '\n'
+ << "\t.type " << name << ", @object\n"
+ << "\t.weak " << name << '\n';
+ break;
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their name or something. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << "\t.global " << name << '\n'
+ << "\t.type " << name << ", @object\n";
+ // FALL THROUGH
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ break;
+ default:
+ cerr << "Unknown linkage type!";
+ abort();
+ }
+
+ EmitAlignment(Align, GVar);
+ O << name << ":\t\t\t\t" << TAI->getCommentString() << " '";
+ PrintUnmangledNameSafely(GVar, O);
+ O << "'\n";
+
+ // If the initializer is a extern weak symbol, remember to emit the weak
+ // reference!
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ EmitGlobalConstant(C);
+ O << '\n';
+}
+
+bool LinuxAsmPrinter::doFinalization(Module &M) {
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ printModuleLevelGV(I);
+
+  // Emit final debug information.
+ DW->EndModule();
+
+ return AsmPrinter::doFinalization(M);
+}
+
+/// createSPUCodePrinterPass - Returns a pass that prints the Cell SPU
+/// assembly code for a MachineFunction to the given output stream, in a format
+/// that the Linux SPU assembler can deal with.
+///
+FunctionPass *llvm::createSPUAsmPrinterPass(raw_ostream &o,
+ SPUTargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new LinuxAsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt
new file mode 100644
index 0000000..e3e12ac
--- /dev/null
+++ b/lib/Target/CellSPU/CMakeLists.txt
@@ -0,0 +1,24 @@
+set(LLVM_TARGET_DEFINITIONS SPU.td)
+
+tablegen(SPUGenInstrNames.inc -gen-instr-enums)
+tablegen(SPUGenRegisterNames.inc -gen-register-enums)
+tablegen(SPUGenAsmWriter.inc -gen-asm-writer)
+tablegen(SPUGenCodeEmitter.inc -gen-emitter)
+tablegen(SPUGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(SPUGenRegisterInfo.inc -gen-register-desc)
+tablegen(SPUGenInstrInfo.inc -gen-instr-desc)
+tablegen(SPUGenDAGISel.inc -gen-dag-isel)
+tablegen(SPUGenSubtarget.inc -gen-subtarget)
+tablegen(SPUGenCallingConv.inc -gen-callingconv)
+
+add_llvm_target(CellSPUCodeGen
+ SPUFrameInfo.cpp
+ SPUHazardRecognizers.cpp
+ SPUInstrInfo.cpp
+ SPUISelDAGToDAG.cpp
+ SPUISelLowering.cpp
+ SPURegisterInfo.cpp
+ SPUSubtarget.cpp
+ SPUTargetAsmInfo.cpp
+ SPUTargetMachine.cpp
+ )
diff --git a/lib/Target/CellSPU/CellSDKIntrinsics.td b/lib/Target/CellSPU/CellSDKIntrinsics.td
new file mode 100644
index 0000000..5d759a4
--- /dev/null
+++ b/lib/Target/CellSPU/CellSDKIntrinsics.td
@@ -0,0 +1,448 @@
+//===-- CellSDKIntrinsics.td - Cell SDK Intrinsics ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+///--==-- Arithmetic ops intrinsics --==--
+def CellSDKah:
+ RR_Int_v8i16<0b00010011000, "ah", IntegerOp, int_spu_si_ah>;
+def CellSDKahi:
+ RI10_Int_v8i16<0b00010011000, "ahi", IntegerOp, int_spu_si_ahi>;
+def CellSDKa:
+ RR_Int_v4i32<0b00000011000, "a", IntegerOp, int_spu_si_a>;
+def CellSDKai:
+ RI10_Int_v4i32<0b00111000, "ai", IntegerOp, int_spu_si_ai>;
+def CellSDKsfh:
+ RR_Int_v8i16<0b00010010000, "sfh", IntegerOp, int_spu_si_sfh>;
+def CellSDKsfhi:
+ RI10_Int_v8i16<0b10110000, "sfhi", IntegerOp, int_spu_si_sfhi>;
+def CellSDKsf:
+ RR_Int_v4i32<0b00000010000, "sf", IntegerOp, int_spu_si_sf>;
+def CellSDKsfi:
+ RI10_Int_v4i32<0b00110000, "sfi", IntegerOp, int_spu_si_sfi>;
+def CellSDKaddx:
+ RR_Int_v4i32<0b00000010110, "addx", IntegerOp, int_spu_si_addx>;
+def CellSDKcg:
+ RR_Int_v4i32<0b0100001100, "cg", IntegerOp, int_spu_si_cg>;
+def CellSDKcgx:
+ RR_Int_v4i32<0b01000010110, "cgx", IntegerOp, int_spu_si_cgx>;
+def CellSDKsfx:
+ RR_Int_v4i32<0b10000010110, "sfx", IntegerOp, int_spu_si_sfx>;
+def CellSDKbg:
+ RR_Int_v4i32<0b01000010000, "bg", IntegerOp, int_spu_si_bg>;
+def CellSDKbgx:
+ RR_Int_v4i32<0b11000010110, "bgx", IntegerOp, int_spu_si_bgx>;
+
+def CellSDKmpy:
+ RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpy $rT, $rA, $rB", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpy (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpyu:
+ RRForm<0b00110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpyu $rT, $rA, $rB", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpyu (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))] >;
+
+def CellSDKmpyi:
+ RI10Form<0b00101110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "mpyi $rT, $rA, $val", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpyi (v8i16 VECREG:$rA),
+ i16ImmSExt10:$val))]>;
+
+def CellSDKmpyui:
+ RI10Form<0b10101110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "mpyui $rT, $rA, $val", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpyui (v8i16 VECREG:$rA),
+ i16ImmSExt10:$val))]>;
+
+def CellSDKmpya:
+ RRRForm<0b0011, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "mpya $rT, $rA, $rB, $rC", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpya (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB),
+ (v8i16 VECREG:$rC)))]>;
+
+def CellSDKmpyh:
+ RRForm<0b10100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpyh $rT, $rA, $rB", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpyh (v4i32 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpys:
+ RRForm<0b11100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpys $rT, $rA, $rB", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpys (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpyhh:
+ RRForm<0b01100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpyhh $rT, $rA, $rB", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhh (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpyhha:
+ RRForm<0b01100010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpyhha $rT, $rA, $rB", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhha (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+// Not sure how to match a (set $rT, (add $rT (mpyhh $rA, $rB)))... so leave
+// as an intrinsic for the time being
+def CellSDKmpyhhu:
+ RRForm<0b01110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpyhhu $rT, $rA, $rB", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhhu (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpyhhau:
+ RRForm<0b01110010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpyhhau $rT, $rA, $rB", IntegerMulDiv,
+ [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhhau (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+def CellSDKand:
+ RRForm<0b1000011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "and\t $rT, $rA, $rB", IntegerOp,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_and (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKandc:
+ RRForm<0b10000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "andc\t $rT, $rA, $rB", IntegerOp,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_andc (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKandbi:
+ RI10Form<0b01101000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+ "andbi\t $rT, $rA, $val", BranchResolv,
+ [(set (v16i8 VECREG:$rT),
+ (int_spu_si_andbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKandhi:
+ RI10Form<0b10101000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "andhi\t $rT, $rA, $val", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_andhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKandi:
+ RI10Form<0b00101000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "andi\t $rT, $rA, $val", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_andi (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKor:
+ RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "or\t $rT, $rA, $rB", IntegerOp,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_or (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKorc:
+  RRForm<0b10010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "orc\t $rT, $rA, $rB", IntegerOp,
+    [(set (v4i32 VECREG:$rT),
+      (int_spu_si_orc (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKorbi:
+ RI10Form<0b01100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+ "orbi\t $rT, $rA, $val", BranchResolv,
+ [(set (v16i8 VECREG:$rT),
+ (int_spu_si_orbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKorhi:
+ RI10Form<0b10100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "orhi\t $rT, $rA, $val", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_orhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKori:
+ RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "ori\t $rT, $rA, $val", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_ori (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKxor:
+ RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "xor\t $rT, $rA, $rB", IntegerOp,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_xor (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKxorbi:
+ RI10Form<0b01100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+ "xorbi\t $rT, $rA, $val", BranchResolv,
+ [(set (v16i8 VECREG:$rT), (int_spu_si_xorbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKxorhi:
+ RI10Form<0b10100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "xorhi\t $rT, $rA, $val", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_xorhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKxori:
+ RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "xori\t $rT, $rA, $val", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_xori (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKnor:
+ RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "nor\t $rT, $rA, $rB", IntegerOp,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_nor (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKnand:
+ RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "nand\t $rT, $rA, $rB", IntegerOp,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_nand (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+//===----------------------------------------------------------------------===//
+// Shift/rotate intrinsics:
+//===----------------------------------------------------------------------===//
+
+def CellSDKshli:
+ Pat<(int_spu_si_shli (v4i32 VECREG:$rA), uimm7:$val),
+ (SHLIv4i32 VECREG:$rA, uimm7:$val)>;
+
+def CellSDKshlqbi:
+ Pat<(int_spu_si_shlqbi VECREG:$rA, R32C:$rB),
+ (SHLQBIv16i8 VECREG:$rA, R32C:$rB)>;
+
+def CellSDKshlqbii:
+ Pat<(int_spu_si_shlqbii VECREG:$rA, uimm7:$val),
+ (SHLQBIIv16i8 VECREG:$rA, uimm7:$val)>;
+
+def CellSDKshlqby:
+ Pat<(int_spu_si_shlqby VECREG:$rA, R32C:$rB),
+ (SHLQBYv16i8 VECREG:$rA, R32C:$rB)>;
+
+def CellSDKshlqbyi:
+ Pat<(int_spu_si_shlqbyi VECREG:$rA, uimm7:$val),
+ (SHLQBYIv16i8 VECREG:$rA, uimm7:$val)>;
+
+//===----------------------------------------------------------------------===//
+// Branch/compare intrinsics:
+//===----------------------------------------------------------------------===//
+
+def CellSDKceq:
+ RRForm<0b00000011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "ceq\t $rT, $rA, $rB", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_ceq (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKceqi:
+ RI10Form<0b00111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "ceqi\t $rT, $rA, $val", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_ceqi (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKceqb:
+ RRForm<0b00001011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "ceqb\t $rT, $rA, $rB", BranchResolv,
+ [(set (v16i8 VECREG:$rT),
+ (int_spu_si_ceqb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>;
+
+def CellSDKceqbi:
+ RI10Form<0b01111110, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+ "ceqbi\t $rT, $rA, $val", BranchResolv,
+ [(set (v16i8 VECREG:$rT), (int_spu_si_ceqbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKceqh:
+ RRForm<0b00010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "ceqh\t $rT, $rA, $rB", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_ceqh (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>;
+
+def CellSDKceqhi:
+ RI10Form<0b10111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "ceqhi\t $rT, $rA, $val", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_ceqhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+def CellSDKcgth:
+ RRForm<0b00010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "cgth\t $rT, $rA, $rB", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_cgth (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>;
+
+def CellSDKcgthi:
+ RI10Form<0b10111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "cgthi\t $rT, $rA, $val", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_cgthi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKcgt:
+ RRForm<0b00000010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "cgt\t $rT, $rA, $rB", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_cgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKcgti:
+ RI10Form<0b00110010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "cgti\t $rT, $rA, $val", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_cgti (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKcgtb:
+ RRForm<0b00001010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "cgtb\t $rT, $rA, $rB", BranchResolv,
+ [(set (v16i8 VECREG:$rT),
+ (int_spu_si_cgtb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>;
+
+def CellSDKcgtbi:
+ RI10Form<0b01110010, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+ "cgtbi\t $rT, $rA, $val", BranchResolv,
+ [(set (v16i8 VECREG:$rT), (int_spu_si_cgtbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKclgth:
+ RRForm<0b00010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "clgth\t $rT, $rA, $rB", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_clgth (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>;
+
+def CellSDKclgthi:
+ RI10Form<0b10111010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "clgthi\t $rT, $rA, $val", BranchResolv,
+ [(set (v8i16 VECREG:$rT),
+ (int_spu_si_clgthi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKclgt:
+ RRForm<0b00000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "clgt\t $rT, $rA, $rB", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_clgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKclgti:
+ RI10Form<0b00111010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "clgti\t $rT, $rA, $val", BranchResolv,
+ [(set (v4i32 VECREG:$rT),
+ (int_spu_si_clgti (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKclgtb:
+ RRForm<0b00001011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "clgtb\t $rT, $rA, $rB", BranchResolv,
+ [(set (v16i8 VECREG:$rT),
+ (int_spu_si_clgtb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>;
+
+def CellSDKclgtbi:
+ RI10Form<0b01111010, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+ "clgtbi\t $rT, $rA, $val", BranchResolv,
+ [(set (v16i8 VECREG:$rT),
+ (int_spu_si_clgtbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point intrinsics:
+//===----------------------------------------------------------------------===//
+
+def CellSDKfa:
+ RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fa\t $rT, $rA, $rB", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fa (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfs:
+ RRForm<0b10100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fs\t $rT, $rA, $rB", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fs (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfm:
+ RRForm<0b01100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fm\t $rT, $rA, $rB", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fm (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfceq:
+ RRForm<0b01000011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fceq\t $rT, $rA, $rB", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fceq (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfcgt:
+ RRForm<0b01000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fcgt\t $rT, $rA, $rB", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fcgt (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfcmeq:
+ RRForm<0b01010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fcmeq\t $rT, $rA, $rB", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fcmeq (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfcmgt:
+ RRForm<0b01010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fcmgt\t $rT, $rA, $rB", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fcmgt (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfma:
+ RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "fma\t $rT, $rA, $rB, $rC", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fma (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB),
+ (v4f32 VECREG:$rC)))]>;
+
+def CellSDKfnms:
+ RRRForm<0b1011, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "fnms\t $rT, $rA, $rB, $rC", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fnms (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB),
+ (v4f32 VECREG:$rC)))]>;
+
+def CellSDKfms:
+ RRRForm<0b1111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "fms\t $rT, $rA, $rB, $rC", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (int_spu_si_fms (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB),
+ (v4f32 VECREG:$rC)))]>;
+
+//===----------------------------------------------------------------------===//
+// Double precision floating-point intrinsics:
+//===----------------------------------------------------------------------===//
+
+def CellSDKdfa:
+ RRForm<0b00110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfa\t $rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT), (int_spu_si_dfa (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfs:
+ RRForm<0b10110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfs\t $rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT), (int_spu_si_dfs (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfm:
+ RRForm<0b01110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfm\t $rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT), (int_spu_si_dfm (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfma:
+ RRForm<0b00111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfma\t $rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT), (int_spu_si_dfma (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfnma:
+ RRForm<0b11111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfnma\t $rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT), (int_spu_si_dfnma (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfnms:
+ RRForm<0b01111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfnms\t $rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT), (int_spu_si_dfnms (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfms:
+ RRForm<0b10111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfms\t $rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT), (int_spu_si_dfms (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB)))]>;
diff --git a/lib/Target/CellSPU/Makefile b/lib/Target/CellSPU/Makefile
new file mode 100644
index 0000000..a460db3
--- /dev/null
+++ b/lib/Target/CellSPU/Makefile
@@ -0,0 +1,22 @@
+##===- lib/Target/CellSPU/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMCellSPUCodeGen
+TARGET = SPU
+
+BUILT_SOURCES = SPUGenInstrNames.inc SPUGenRegisterNames.inc \
+ SPUGenAsmWriter.inc SPUGenCodeEmitter.inc \
+ SPUGenRegisterInfo.h.inc SPUGenRegisterInfo.inc \
+ SPUGenInstrInfo.inc SPUGenDAGISel.inc \
+ SPUGenSubtarget.inc SPUGenCallingConv.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CellSPU/README.txt b/lib/Target/CellSPU/README.txt
new file mode 100644
index 0000000..4783dd5
--- /dev/null
+++ b/lib/Target/CellSPU/README.txt
@@ -0,0 +1,90 @@
+//===- README.txt - Notes for improving CellSPU-specific code gen ---------===//
+
+This code was contributed by a team from the Computer Systems Research
+Department in The Aerospace Corporation:
+
+- Scott Michel (head bottle washer and much of the non-floating point
+ instructions)
+- Mark Thomas (floating point instructions)
+- Michael AuYeung (intrinsics)
+- Chandler Carruth (LLVM expertise)
+- Nehal Desai (debugging, i32 operations, RoadRunner SPU expertise)
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR
+OTHERWISE. IN NO EVENT SHALL THE AEROSPACE CORPORATION BE LIABLE FOR DAMAGES
+OF ANY KIND OR NATURE WHETHER BASED IN CONTRACT, TORT, OR OTHERWISE ARISING
+OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE INCLUDING, WITHOUT
+LIMITATION, DAMAGES RESULTING FROM LOST OR CONTAMINATED DATA, LOST PROFITS OR
+REVENUE, COMPUTER MALFUNCTION, OR FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL,
+OR PUNITIVE DAMAGES, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES OR
+SUCH DAMAGES ARE FORESEEABLE.
+
+---------------------------------------------------------------------------
+--WARNING--:
+--WARNING--: The CellSPU work is a work in progress and "alpha" quality code.
+--WARNING--:
+
+If you are brave enough to try this code or help to hack on it, be sure
+to add 'spu' to configure's --enable-targets option, e.g.:
+
+ ./configure <your_configure_flags_here> \
+ --enable-targets=x86,x86_64,powerpc,spu
+
+---------------------------------------------------------------------------
+
+TODO:
+* Create a machine pass for performing dual-pipeline scheduling specifically
+ for CellSPU, and insert branch prediction instructions as needed.
+
+* i32 instructions:
+
+ * i32 division (work-in-progress)
+
+* i64 support (see i64operations.c test harness):
+
+ * shifts and comparison operators: done
+ * sign and zero extension: done
+ * addition: done
+ * subtraction: needed
+ * multiplication: done
+
+* i128 support:
+
+ * zero extension, any extension: done
+ * sign extension: needed
+ * arithmetic operators (add, sub, mul, div): needed
+ * logical operations (and, or, shl, srl, sra, xor, nor, nand): needed
+   (of these, only or is done)
+
+* f64 support
+
+ * Comparison operators:
+ SETOEQ unimplemented
+ SETOGT unimplemented
+ SETOGE unimplemented
+ SETOLT unimplemented
+ SETOLE unimplemented
+ SETONE unimplemented
+ SETO done (lowered)
+ SETUO done (lowered)
+ SETUEQ unimplemented
+ SETUGT unimplemented
+ SETUGE unimplemented
+ SETULT unimplemented
+ SETULE unimplemented
+ SETUNE unimplemented
+
+* LLVM vector support
+
+ * VSETCC needs to be implemented. It's pretty straightforward to code, but
+   has not been done yet.
+
+* Intrinsics
+
+ * spu.h intrinsics added but not tested. Need to have an operational
+   llvm-spu-gcc in order to write a unit test harness.
+
+===-------------------------------------------------------------------------===
diff --git a/lib/Target/CellSPU/SPU.h b/lib/Target/CellSPU/SPU.h
new file mode 100644
index 0000000..77a062e
--- /dev/null
+++ b/lib/Target/CellSPU/SPU.h
@@ -0,0 +1,102 @@
+//===-- SPU.h - Top-level interface for Cell SPU Target ----------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Cell SPU back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_IBMCELLSPU_H
+#define LLVM_TARGET_IBMCELLSPU_H
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class SPUTargetMachine;
+ class FunctionPass;
+ class raw_ostream;
+
+ FunctionPass *createSPUISelDag(SPUTargetMachine &TM);
+ FunctionPass *createSPUAsmPrinterPass(raw_ostream &o,
+ SPUTargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose);
+
+ /*--== Utility functions/predicates/etc used all over the place: --==*/
+ //! Predicate test for a signed 10-bit value
+ /*!
+ \param Value The input value to be tested
+
+ This predicate returns true if \a Value can be represented as a signed
+ 10-bit quantity, i.e., if it lies in the range [-512, 511].
+ */
+ template<typename T>
+ inline bool isS10Constant(T Value);
+
+ template<>
+ inline bool isS10Constant<short>(short Value) {
+   // Check the signed 10-bit range directly, matching the int/int64_t
+   // specializations below (this also correctly accepts Value == 0).
+   return (Value >= -(1 << 9) && Value <= (1 << 9) - 1);
+ }
+
+ template<>
+ inline bool isS10Constant<int>(int Value) {
+ return (Value >= -(1 << 9) && Value <= (1 << 9) - 1);
+ }
+
+ template<>
+ inline bool isS10Constant<uint32_t>(uint32_t Value) {
+ return (Value <= ((1 << 9) - 1));
+ }
+
+ template<>
+ inline bool isS10Constant<int64_t>(int64_t Value) {
+ return (Value >= -(1 << 9) && Value <= (1 << 9) - 1);
+ }
+
+ template<>
+ inline bool isS10Constant<uint64_t>(uint64_t Value) {
+ return (Value <= ((1 << 9) - 1));
+ }
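+
+ // For illustration, given the range checks above: isS10Constant(511) and
+ // isS10Constant(-512) are true, while isS10Constant(512) is false.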
+
+ //! Predicate test for an unsigned 10-bit value
+ /*!
+ \param Value The input value to be tested
+
+ This predicate returns true if \a Value can be represented as an unsigned
+ 10-bit quantity, i.e., if it lies in the range [0, 1023].
+ */
+ inline bool isU10Constant(short Value) {
+ return (Value == (Value & 0x3ff));
+ }
+
+ inline bool isU10Constant(int Value) {
+ return (Value == (Value & 0x3ff));
+ }
+
+ inline bool isU10Constant(uint32_t Value) {
+ return (Value == (Value & 0x3ff));
+ }
+
+ inline bool isU10Constant(int64_t Value) {
+ return (Value == (Value & 0x3ff));
+ }
+
+ inline bool isU10Constant(uint64_t Value) {
+ return (Value == (Value & 0x3ff));
+ }
+}
+
+// Defines symbolic names for the SPU instructions.
+//
+#include "SPUGenInstrNames.inc"
+
+#endif /* LLVM_TARGET_IBMCELLSPU_H */
diff --git a/lib/Target/CellSPU/SPU.td b/lib/Target/CellSPU/SPU.td
new file mode 100644
index 0000000..8327fe0
--- /dev/null
+++ b/lib/Target/CellSPU/SPU.td
@@ -0,0 +1,66 @@
+//===- SPU.td - Describe the STI Cell SPU Target Machine ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the STI Cell SPU target machine.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "llvm/Target/Target.td"
+
+// Holder of code fragments (you'd think this'd already be in
+// a td file somewhere... :-)
+
+class CodeFrag<dag frag> {
+ dag Fragment = frag;
+}
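+
+// A CodeFrag is referenced later as <name>.Fragment inside other dags (see,
+// e.g., SPU64InstrInfo.td), which lets multi-instruction selection sequences
+// be composed without defining extra pseudo instructions.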
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "SPURegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction formats, instructions
+//===----------------------------------------------------------------------===//
+
+include "SPUNodes.td"
+include "SPUOperands.td"
+include "SPUSchedule.td"
+include "SPUInstrFormats.td"
+include "SPUInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget features:
+//===----------------------------------------------------------------------===//
+
+def DefaultProc: SubtargetFeature<"", "ProcDirective", "SPU::DEFAULT_PROC", "">;
+def LargeMemFeature:
+ SubtargetFeature<"large_mem","UseLargeMem", "true",
+ "Use large (>256) LSA memory addressing [default = false]">;
+
+def SPURev0 : Processor<"v0", SPUItineraries, [DefaultProc]>;
+
+//===----------------------------------------------------------------------===//
+// Calling convention:
+//===----------------------------------------------------------------------===//
+
+include "SPUCallingConv.td"
+
+// Target:
+
+def SPUInstrInfo : InstrInfo {
+ let isLittleEndianEncoding = 1;
+}
+
+def SPU : Target {
+ let InstructionSet = SPUInstrInfo;
+}
diff --git a/lib/Target/CellSPU/SPU128InstrInfo.td b/lib/Target/CellSPU/SPU128InstrInfo.td
new file mode 100644
index 0000000..3031fda
--- /dev/null
+++ b/lib/Target/CellSPU/SPU128InstrInfo.td
@@ -0,0 +1,41 @@
+//===--- SPU128InstrInfo.td - Cell SPU 128-bit operations -*- tablegen -*--===//
+//
+// Cell SPU 128-bit operations
+//
+//===----------------------------------------------------------------------===//
+
+// zext 32->128: Zero extend 32-bit to 128-bit
+def : Pat<(i128 (zext R32C:$rSrc)),
+ (ROTQMBYIr128_zext_r32 R32C:$rSrc, 12)>;
+
+// zext 64->128: Zero extend 64-bit to 128-bit
+def : Pat<(i128 (zext R64C:$rSrc)),
+ (ROTQMBYIr128_zext_r64 R64C:$rSrc, 8)>;
+
+// zext 16->128: Zero extend 16-bit to 128-bit
+def : Pat<(i128 (zext R16C:$rSrc)),
+ (ROTQMBYIr128_zext_r32 (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff)), 12)>;
+
+// zext 8->128: Zero extend 8-bit to 128-bit
+def : Pat<(i128 (zext R8C:$rSrc)),
+ (ROTQMBYIr128_zext_r32 (ANDIi8i32 R8C:$rSrc, 0xf), 12)>;
+
+// anyext 32->128: Zero extend 32-bit to 128-bit
+def : Pat<(i128 (anyext R32C:$rSrc)),
+ (ROTQMBYIr128_zext_r32 R32C:$rSrc, 12)>;
+
+// anyext 64->128: Zero extend 64-bit to 128-bit
+def : Pat<(i128 (anyext R64C:$rSrc)),
+ (ROTQMBYIr128_zext_r64 R64C:$rSrc, 8)>;
+
+// anyext 16->128: Zero extend 16-bit to 128-bit
+def : Pat<(i128 (anyext R16C:$rSrc)),
+ (ROTQMBYIr128_zext_r32 (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff)), 12)>;
+
+// anyext 8->128: Zero extend 8-bit to 128-bit
+def : Pat<(i128 (anyext R8C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 (ANDIi8i32 R8C:$rSrc, 0xff), 12)>;
+
+// Shift left
+def : Pat<(shl GPRC:$rA, R32C:$rB),
+ (SHLQBYBIr128 (SHLQBIr128 GPRC:$rA, R32C:$rB), R32C:$rB)>;
diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td
new file mode 100644
index 0000000..06eb149
--- /dev/null
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@@ -0,0 +1,394 @@
+//====--- SPU64InstrInfo.td - Cell SPU 64-bit operations -*- tablegen -*--====//
+//
+// Cell SPU 64-bit operations
+//
+//===----------------------------------------------------------------------===//
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// 64-bit comparisons:
+//
+// 1. The instruction sequences for vector versus scalar differ by a
+// constant. In the scalar case, we're only interested in the
+// top two 32-bit slots, whereas we're interested in an exact
+// all-four-slot match in the vector case.
+//
+// 2. There are no "immediate" forms, since loading 64-bit constants
+// could be a constant pool load.
+//
+// 3. i64 setcc results are i32, which are subsequently converted to a FSM
+// mask when used in a select pattern.
+//
+// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask (TODO)
+// [Note: this may be moot, since gb produces v4i32 or r32.]
+//
+// 5. The code sequences for r64 and v2i64 are probably overly conservative,
+// compared to the code that gcc produces.
+//
+// M00$E B!tes Kan be Pretty N@sTi!!!!! (apologies to Monty!)
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// selb instruction definition for i64. Note that the selection mask is
+// a vector, produced by various forms of FSM:
+def SELBr64_cond:
+ SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC),
+ [/* no pattern */]>;
+
+// The generic i64 select pattern, which assumes that the comparison result
+// is in a 32-bit register that contains a select mask pattern (i.e., gather
+// bits result):
+
+def : Pat<(select R32C:$rCond, R64C:$rFalse, R64C:$rTrue),
+ (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 R32C:$rCond))>;
+
+// select the negative condition:
+class I64SELECTNegCond<PatFrag cond, CodeFrag compare>:
+ Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse),
+ (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 compare.Fragment))>;
+
+// setcc the negative condition:
+class I64SETCCNegCond<PatFrag cond, CodeFrag compare>:
+ Pat<(cond R64C:$rA, R64C:$rB),
+ (XORIr32 compare.Fragment, -1)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// The i64 seteq fragment that does the scalar->vector conversion and
+// comparison:
+def CEQr64compare:
+ CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 (ORv2i64_i64 R64C:$rA),
+ (ORv2i64_i64 R64C:$rB))), 0xb)>;
+
+// The i64 seteq fragment that does the vector comparison
+def CEQv2i64compare:
+ CodeFrag<(CEQIv4i32 (GBv4i32 (CEQv4i32 VECREG:$rA, VECREG:$rB)), 0xf)>;
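+
+// A sketch of how the constants above are chosen: CEQv4i32 leaves an
+// all-ones or all-zeros mask in each 32-bit slot, and GBv4i32 gathers the
+// low bit of each slot into a 4-bit nibble. An exact v2i64 match therefore
+// gathers to 0xf, while the scalar i64 case only cares about the two high
+// slots, i.e., a gathered value of the form 0b11xx, hence "greater than 0xb".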
+
+// i64 seteq (equality): the setcc result is i32, which is converted to a
+// vector FSM mask when used in a select pattern.
+//
+// v2i64 seteq (equality): the setcc result is v4i32
+multiclass CompareEqual64 {
+ // Plain old comparison, converts back to i32 scalar
+ def r64: CodeFrag<(ORi32_v4i32 CEQr64compare.Fragment)>;
+ def v2i64: CodeFrag<(ORi32_v4i32 CEQv2i64compare.Fragment)>;
+
+ // SELB mask from FSM:
+ def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQr64compare.Fragment))>;
+ def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQv2i64compare.Fragment))>;
+}
+
+defm I64EQ: CompareEqual64;
+
+def : Pat<(seteq R64C:$rA, R64C:$rB), I64EQr64.Fragment>;
+def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), I64EQv2i64.Fragment>;
+
+// i64 setne:
+def : I64SETCCNegCond<setne, I64EQr64>;
+def : I64SELECTNegCond<setne, I64EQr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// i64 setugt/setule:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def CLGTr64ugt:
+ CodeFrag<(CLGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
+
+def CLGTr64eq:
+ CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
+
+def CLGTr64compare:
+ CodeFrag<(SELBv2i64 CLGTr64ugt.Fragment,
+ (XSWDv2i64 CLGTr64ugt.Fragment),
+ CLGTr64eq.Fragment)>;
+
+def CLGTv2i64ugt:
+ CodeFrag<(CLGTv4i32 VECREG:$rA, VECREG:$rB)>;
+
+def CLGTv2i64eq:
+ CodeFrag<(CEQv4i32 VECREG:$rA, VECREG:$rB)>;
+
+def CLGTv2i64compare:
+  CodeFrag<(SELBv2i64 CLGTv2i64ugt.Fragment,
+                      (XSWDv2i64 CLGTv2i64ugt.Fragment),
+                      CLGTv2i64eq.Fragment)>;
+
+multiclass CompareLogicalGreaterThan64 {
+ // Plain old comparison, converts back to i32 scalar
+ def r64: CodeFrag<(ORi32_v4i32 CLGTr64compare.Fragment)>;
+ def v2i64: CodeFrag<CLGTv2i64compare.Fragment>;
+
+ // SELB mask from FSM:
+ def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTr64compare.Fragment))>;
+ def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTv2i64compare.Fragment))>;
+}
+
+defm I64LGT: CompareLogicalGreaterThan64;
+
+def : Pat<(setugt R64C:$rA, R64C:$rB), I64LGTr64.Fragment>;
+def : Pat<(setugt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
+ I64LGTv2i64.Fragment>;
+
+// i64 setule:
+def : I64SETCCNegCond<setule, I64LGTr64>;
+def : I64SELECTNegCond<setule, I64LGTr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// i64 setuge/setult:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def CLGEr64compare:
+ CodeFrag<(CGTIv4i32 (GBv4i32 (ORv4i32 CLGTr64ugt.Fragment,
+ CLGTr64eq.Fragment)), 0xb)>;
+
+def CLGEv2i64compare:
+ CodeFrag<(CEQIv4i32 (GBv4i32 (ORv4i32 CLGTv2i64ugt.Fragment,
+ CLGTv2i64eq.Fragment)), 0xf)>;
+
+multiclass CompareLogicalGreaterEqual64 {
+ // Plain old comparison, converts back to i32 scalar
+ def r64: CodeFrag<(ORi32_v4i32 CLGEr64compare.Fragment)>;
+ def v2i64: CodeFrag<CLGEv2i64compare.Fragment>;
+
+ // SELB mask from FSM:
+ def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEr64compare.Fragment))>;
+ def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEv2i64compare.Fragment))>;
+}
+
+defm I64LGE: CompareLogicalGreaterEqual64;
+
+def : Pat<(setuge R64C:$rA, R64C:$rB), I64LGEr64.Fragment>;
+def : Pat<(setuge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
+ I64LGEv2i64.Fragment>;
+
+// i64 setult:
+def : I64SETCCNegCond<setult, I64LGEr64>;
+def : I64SELECTNegCond<setult, I64LGEr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// i64 setgt/setle:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def CGTr64sgt:
+ CodeFrag<(CGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
+
+def CGTr64eq:
+ CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
+
+def CGTr64compare:
+ CodeFrag<(SELBv2i64 CGTr64sgt.Fragment,
+ (XSWDv2i64 CGTr64sgt.Fragment),
+ CGTr64eq.Fragment)>;
+
+def CGTv2i64sgt:
+ CodeFrag<(CGTv4i32 VECREG:$rA, VECREG:$rB)>;
+
+def CGTv2i64eq:
+ CodeFrag<(CEQv4i32 VECREG:$rA, VECREG:$rB)>;
+
+def CGTv2i64compare:
+  CodeFrag<(SELBv2i64 CGTv2i64sgt.Fragment,
+                      (XSWDv2i64 CGTv2i64sgt.Fragment),
+                      CGTv2i64eq.Fragment)>;
+
+multiclass CompareGreaterThan64 {
+ // Plain old comparison, converts back to i32 scalar
+ def r64: CodeFrag<(ORi32_v4i32 CGTr64compare.Fragment)>;
+ def v2i64: CodeFrag<CGTv2i64compare.Fragment>;
+
+ // SELB mask from FSM:
+ def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTr64compare.Fragment))>;
+ def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTv2i64compare.Fragment))>;
+}
+
+defm I64GT: CompareGreaterThan64;
+
+def : Pat<(setgt R64C:$rA, R64C:$rB), I64GTr64.Fragment>;
+def : Pat<(setgt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
+ I64GTv2i64.Fragment>;
+
+// i64 setle:
+def : I64SETCCNegCond<setle, I64GTr64>;
+def : I64SELECTNegCond<setle, I64GTr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// i64 setge/setlt:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def CGEr64compare:
+ CodeFrag<(CGTIv4i32 (GBv4i32 (ORv4i32 CGTr64sgt.Fragment,
+ CGTr64eq.Fragment)), 0xb)>;
+
+def CGEv2i64compare:
+ CodeFrag<(CEQIv4i32 (GBv4i32 (ORv4i32 CGTv2i64sgt.Fragment,
+ CGTv2i64eq.Fragment)), 0xf)>;
+
+multiclass CompareGreaterEqual64 {
+ // Plain old comparison, converts back to i32 scalar
+ def r64: CodeFrag<(ORi32_v4i32 CGEr64compare.Fragment)>;
+ def v2i64: CodeFrag<CGEv2i64compare.Fragment>;
+
+ // SELB mask from FSM:
+ def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEr64compare.Fragment))>;
+ def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEv2i64compare.Fragment))>;
+}
+
+defm I64GE: CompareGreaterEqual64;
+
+def : Pat<(setge R64C:$rA, R64C:$rB), I64GEr64.Fragment>;
+def : Pat<(setge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
+ I64GEv2i64.Fragment>;
+
+// i64 setlt:
+def : I64SETCCNegCond<setlt, I64GEr64>;
+def : I64SELECTNegCond<setlt, I64GEr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 add
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class v2i64_add_cg<dag lhs, dag rhs>:
+ CodeFrag<(CGv4i32 lhs, rhs)>;
+
+class v2i64_add_1<dag lhs, dag rhs, dag cg, dag cg_mask>:
+ CodeFrag<(ADDXv4i32 lhs, rhs, (SHUFBv4i32 cg, cg, cg_mask))>;
+
+class v2i64_add<dag lhs, dag rhs, dag cg_mask>:
+ v2i64_add_1<lhs, rhs, v2i64_add_cg<lhs, rhs>.Fragment, cg_mask>;
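+
+// A sketch of the sequence above: CG produces the word-wise carry-outs, the
+// SHUFB (driven by the caller-supplied carry-generate mask) rotates each
+// carry up into the next-higher word slot, and ADDX performs the word-wise
+// add with those carries as carry-ins, together forming a full 64-bit add
+// per doubleword slot.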
+
+def : Pat<(SPUadd64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+ (ORi64_v2i64 v2i64_add<(ORv2i64_i64 R64C:$rA),
+ (ORv2i64_i64 R64C:$rB),
+ (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUadd64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+ (v4i32 VECREG:$rCGmask)),
+ v2i64_add<(v2i64 VECREG:$rA),
+ (v2i64 VECREG:$rB),
+ (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 subtraction
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class v2i64_sub_bg<dag lhs, dag rhs>: CodeFrag<(BGv4i32 lhs, rhs)>;
+
+class v2i64_sub<dag lhs, dag rhs, dag bg, dag bg_mask>:
+ CodeFrag<(SFXv4i32 lhs, rhs, (SHUFBv4i32 bg, bg, bg_mask))>;
+
+def : Pat<(SPUsub64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+ (ORi64_v2i64 v2i64_sub<(ORv2i64_i64 R64C:$rA),
+ (ORv2i64_i64 R64C:$rB),
+ v2i64_sub_bg<(ORv2i64_i64 R64C:$rA),
+ (ORv2i64_i64 R64C:$rB)>.Fragment,
+ (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUsub64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+ (v4i32 VECREG:$rCGmask)),
+ v2i64_sub<(v2i64 VECREG:$rA),
+ (v2i64 VECREG:$rB),
+ v2i64_sub_bg<(v2i64 VECREG:$rA),
+ (v2i64 VECREG:$rB)>.Fragment,
+ (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 multiply
+//
+// Note: i64 multiply is simply the vector->scalar conversion of the
+// full-on v2i64 multiply, since the entire vector has to be manipulated
+// anyway.
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
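+// A rough sketch of the decomposition the fragments below implement (using
+// the SPU's 16 x 16 -> 32 bit multiplies): writing each doubleword slot as
+// a = a0*2^48 + a1*2^32 + a2*2^16 + a3 (and b likewise), the low 64 bits of
+// a*b are
+//
+//   a3*b3 + (a2*b3 + a3*b2)*2^16 + (higher partial products)*2^32
+//
+// v2i64_mul_a3_b3, v2i64_mul_a2_b3 and v2i64_mul_a3_b2 form the first terms,
+// v2i64_highprod the higher partial products, and v2i64_lowsum/v2i64_mul
+// combine them using the 64-bit adds defined earlier.
+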
+class v2i64_mul_ahi64<dag rA> :
+ CodeFrag<(SELBv4i32 rA, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>;
+
+class v2i64_mul_bhi64<dag rB> :
+ CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>;
+
+class v2i64_mul_alo64<dag rB> :
+ CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>;
+
+class v2i64_mul_blo64<dag rB> :
+ CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>;
+
+class v2i64_mul_ashlq2<dag rA>:
+ CodeFrag<(SHLQBYIv4i32 rA, 0x2)>;
+
+class v2i64_mul_ashlq4<dag rA>:
+ CodeFrag<(SHLQBYIv4i32 rA, 0x4)>;
+
+class v2i64_mul_bshlq2<dag rB> :
+ CodeFrag<(SHLQBYIv4i32 rB, 0x2)>;
+
+class v2i64_mul_bshlq4<dag rB> :
+ CodeFrag<(SHLQBYIv4i32 rB, 0x4)>;
+
+class v2i64_highprod<dag rA, dag rB>:
+ CodeFrag<(Av4i32
+ (Av4i32
+ (MPYUv4i32 v2i64_mul_bshlq4<rB>.Fragment, // a1 x b3
+ v2i64_mul_ahi64<rA>.Fragment),
+ (MPYHv4i32 v2i64_mul_ahi64<rA>.Fragment, // a0 x b3
+ v2i64_mul_bshlq4<rB>.Fragment)),
+ (Av4i32
+ (MPYHv4i32 v2i64_mul_bhi64<rB>.Fragment,
+ v2i64_mul_ashlq4<rA>.Fragment),
+ (Av4i32
+ (MPYHv4i32 v2i64_mul_ashlq4<rA>.Fragment,
+ v2i64_mul_bhi64<rB>.Fragment),
+ (Av4i32
+ (MPYUv4i32 v2i64_mul_ashlq4<rA>.Fragment,
+ v2i64_mul_bhi64<rB>.Fragment),
+ (Av4i32
+ (MPYHv4i32 v2i64_mul_ashlq2<rA>.Fragment,
+ v2i64_mul_bshlq2<rB>.Fragment),
+ (MPYUv4i32 v2i64_mul_ashlq2<rA>.Fragment,
+ v2i64_mul_bshlq2<rB>.Fragment))))))>;
+
+class v2i64_mul_a3_b3<dag rA, dag rB>:
+ CodeFrag<(MPYUv4i32 v2i64_mul_alo64<rA>.Fragment,
+ v2i64_mul_blo64<rB>.Fragment)>;
+
+class v2i64_mul_a2_b3<dag rA, dag rB>:
+ CodeFrag<(SELBv4i32 (SHLQBYIv4i32
+ (MPYHHUv4i32 v2i64_mul_alo64<rA>.Fragment,
+ v2i64_mul_bshlq2<rB>.Fragment), 0x2),
+ (ILv4i32 0),
+ (FSMBIv4i32 0xc3c3))>;
+
+class v2i64_mul_a3_b2<dag rA, dag rB>:
+ CodeFrag<(SELBv4i32 (SHLQBYIv4i32
+ (MPYHHUv4i32 v2i64_mul_blo64<rB>.Fragment,
+ v2i64_mul_ashlq2<rA>.Fragment), 0x2),
+ (ILv4i32 0),
+ (FSMBIv4i32 0xc3c3))>;
+
+class v2i64_lowsum<dag rA, dag rB, dag rCGmask>:
+ v2i64_add<v2i64_add<v2i64_mul_a3_b3<rA, rB>.Fragment,
+ v2i64_mul_a2_b3<rA, rB>.Fragment, rCGmask>.Fragment,
+ v2i64_mul_a3_b2<rA, rB>.Fragment, rCGmask>;
+
+class v2i64_mul<dag rA, dag rB, dag rCGmask>:
+ v2i64_add<v2i64_lowsum<rA, rB, rCGmask>.Fragment,
+ (SELBv4i32 v2i64_highprod<rA, rB>.Fragment,
+ (ILv4i32 0),
+ (FSMBIv4i32 0x0f0f)),
+ rCGmask>;
+
+def : Pat<(SPUmul64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+ (ORi64_v2i64 v2i64_mul<(ORv2i64_i64 R64C:$rA),
+ (ORv2i64_i64 R64C:$rB),
+ (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUmul64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+ (v4i32 VECREG:$rCGmask)),
+ v2i64_mul<(v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+ (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// f64 comparisons
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// selb instruction definition for i64. Note that the selection mask is
+// a vector, produced by various forms of FSM:
+def SELBf64_cond:
+ SELBInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB, R32C:$rC),
+ [(set R64FP:$rT,
+ (select R32C:$rC, R64FP:$rB, R64FP:$rA))]>;
diff --git a/lib/Target/CellSPU/SPUCallingConv.td b/lib/Target/CellSPU/SPUCallingConv.td
new file mode 100644
index 0000000..10dc837
--- /dev/null
+++ b/lib/Target/CellSPU/SPUCallingConv.td
@@ -0,0 +1,115 @@
+//===- SPUCallingConv.td - Calling Conventions for CellSPU -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the STI Cell SPU architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<SPUSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Return-value convention for Cell SPU: Everything can be passed back via $3:
+def RetCC_SPU : CallingConv<[
+ CCIfType<[i8], CCAssignToReg<[R3]>>,
+ CCIfType<[i16], CCAssignToReg<[R3]>>,
+ CCIfType<[i32], CCAssignToReg<[R3]>>,
+ CCIfType<[i64], CCAssignToReg<[R3]>>,
+ CCIfType<[i128], CCAssignToReg<[R3]>>,
+ CCIfType<[f32, f64], CCAssignToReg<[R3]>>,
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[R3]>>,
+ CCIfType<[v2i32], CCAssignToReg<[R3]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// CellSPU Argument Calling Conventions
+// (note: this isn't used yet, but presumably should be at some point, as it
+// is on other targets.)
+//===----------------------------------------------------------------------===//
+/*
+def CC_SPU : CallingConv<[
+ CCIfType<[i8], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11,
+ R12, R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ R30, R31, R32, R33, R34, R35, R36, R37, R38,
+ R39, R40, R41, R42, R43, R44, R45, R46, R47,
+ R48, R49, R50, R51, R52, R53, R54, R55, R56,
+ R57, R58, R59, R60, R61, R62, R63, R64, R65,
+ R66, R67, R68, R69, R70, R71, R72, R73, R74,
+ R75, R76, R77, R78, R79]>>,
+ CCIfType<[i16], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11,
+ R12, R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ R30, R31, R32, R33, R34, R35, R36, R37, R38,
+ R39, R40, R41, R42, R43, R44, R45, R46, R47,
+ R48, R49, R50, R51, R52, R53, R54, R55, R56,
+ R57, R58, R59, R60, R61, R62, R63, R64, R65,
+ R66, R67, R68, R69, R70, R71, R72, R73, R74,
+ R75, R76, R77, R78, R79]>>,
+ CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11,
+ R12, R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ R30, R31, R32, R33, R34, R35, R36, R37, R38,
+ R39, R40, R41, R42, R43, R44, R45, R46, R47,
+ R48, R49, R50, R51, R52, R53, R54, R55, R56,
+ R57, R58, R59, R60, R61, R62, R63, R64, R65,
+ R66, R67, R68, R69, R70, R71, R72, R73, R74,
+ R75, R76, R77, R78, R79]>>,
+ CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11,
+ R12, R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ R30, R31, R32, R33, R34, R35, R36, R37, R38,
+ R39, R40, R41, R42, R43, R44, R45, R46, R47,
+ R48, R49, R50, R51, R52, R53, R54, R55, R56,
+ R57, R58, R59, R60, R61, R62, R63, R64, R65,
+ R66, R67, R68, R69, R70, R71, R72, R73, R74,
+ R75, R76, R77, R78, R79]>>,
+ CCIfType<[i64], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11,
+ R12, R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ R30, R31, R32, R33, R34, R35, R36, R37, R38,
+ R39, R40, R41, R42, R43, R44, R45, R46, R47,
+ R48, R49, R50, R51, R52, R53, R54, R55, R56,
+ R57, R58, R59, R60, R61, R62, R63, R64, R65,
+ R66, R67, R68, R69, R70, R71, R72, R73, R74,
+ R75, R76, R77, R78, R79]>>,
+ CCIfType<[f64], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11,
+ R12, R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ R30, R31, R32, R33, R34, R35, R36, R37, R38,
+ R39, R40, R41, R42, R43, R44, R45, R46, R47,
+ R48, R49, R50, R51, R52, R53, R54, R55, R56,
+ R57, R58, R59, R60, R61, R62, R63, R64, R65,
+ R66, R67, R68, R69, R70, R71, R72, R73, R74,
+ R75, R76, R77, R78, R79]>>,
+ CCIfType<[v16i8, v8i16, v4i32, v4f32, v2i64, v2f64],
+ CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11,
+ R12, R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28, R29,
+ R30, R31, R32, R33, R34, R35, R36, R37, R38,
+ R39, R40, R41, R42, R43, R44, R45, R46, R47,
+ R48, R49, R50, R51, R52, R53, R54, R55, R56,
+ R57, R58, R59, R60, R61, R62, R63, R64, R65,
+ R66, R67, R68, R69, R70, R71, R72, R73, R74,
+ R75, R76, R77, R78, R79]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToStack<16, 16>>
+]>;
+*/
diff --git a/lib/Target/CellSPU/SPUFrameInfo.cpp b/lib/Target/CellSPU/SPUFrameInfo.cpp
new file mode 100644
index 0000000..60d7ba7
--- /dev/null
+++ b/lib/Target/CellSPU/SPUFrameInfo.cpp
@@ -0,0 +1,29 @@
+//===-- SPUFrameInfo.cpp - Frame info for the Cell SPU Target -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Frame information implementation for the Cell SPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPUFrameInfo.h"
+#include "SPURegisterNames.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// SPUFrameInfo:
+//===----------------------------------------------------------------------===//
+
+SPUFrameInfo::SPUFrameInfo(const TargetMachine &tm):
+ TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
+ TM(tm)
+{
+ LR[0].first = SPU::R0;
+ LR[0].second = 16;
+}
diff --git a/lib/Target/CellSPU/SPUFrameInfo.h b/lib/Target/CellSPU/SPUFrameInfo.h
new file mode 100644
index 0000000..e8ca333
--- /dev/null
+++ b/lib/Target/CellSPU/SPUFrameInfo.h
@@ -0,0 +1,79 @@
+//===-- SPUFrameInfo.h - Top-level interface for Cell SPU Target -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains CellSPU frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPUFRAMEINFO_H
+#define SPUFRAMEINFO_H
+
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "SPURegisterInfo.h"
+
+namespace llvm {
+ class SPUFrameInfo: public TargetFrameInfo {
+ const TargetMachine &TM;
+ std::pair<unsigned, int> LR[1];
+
+ public:
+ SPUFrameInfo(const TargetMachine &tm);
+
+ //! Return a function's saved spill slots
+ /*!
+ For CellSPU, a function's saved spill slots is just the link register.
+ */
+ const std::pair<unsigned, int> *
+ getCalleeSaveSpillSlots(unsigned &NumEntries) const;
+
+ //! Stack slot size (16 bytes)
+ static int stackSlotSize() {
+ return 16;
+ }
+ //! Maximum frame offset representable by a signed 10-bit integer
+ /*!
+ This is the maximum frame offset that can be expressed as a 10-bit
+ integer, used in D-form addresses.
+ */
+ static int maxFrameOffset() {
+ return ((1 << 9) - 1) * stackSlotSize();
+ }
+ //! Minimum frame offset representable by a signed 10-bit integer
+ static int minFrameOffset() {
+ return -(1 << 9) * stackSlotSize();
+ }
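+ // With the 16-byte slots above, these work out to 511 * 16 = 8176 and
+ // -512 * 16 = -8192, respectively.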
+ //! Minimum frame size (enough to spill LR + SP)
+ static int minStackSize() {
+ return (2 * stackSlotSize());
+ }
+ //! Frame size required to spill all registers plus frame info
+ static int fullSpillSize() {
+ return (SPURegisterInfo::getNumArgRegs() * stackSlotSize());
+ }
+ //! Convert frame index to stack offset
+ static int FItoStackOffset(int frame_index) {
+ return frame_index * stackSlotSize();
+ }
+ //! Number of instructions required to overcome hint-for-branch latency
+ /*!
+ HBR (hint-for-branch) instructions can be inserted when, for example,
+ we know that a given function is going to be called, such as printf(),
+ in the control flow graph. HBRs are only inserted if a sufficient number
+ of instructions occurs between the HBR and the target. Currently, HBRs
+ take 6 cycles, ergo, the magic number 6.
+ */
+ static int branchHintPenalty() {
+ return 6;
+ }
+ };
+}
+
+#endif
diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/lib/Target/CellSPU/SPUHazardRecognizers.cpp
new file mode 100644
index 0000000..caaa71a
--- /dev/null
+++ b/lib/Target/CellSPU/SPUHazardRecognizers.cpp
@@ -0,0 +1,138 @@
+//===-- SPUHazardRecognizers.cpp - Cell Hazard Recognizer Impls -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on Cell SPU
+// processors.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sched"
+
+#include "SPUHazardRecognizers.h"
+#include "SPU.h"
+#include "SPUInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Cell SPU hazard recognizer
+//
+// This is the pipeline hazard recognizer for the Cell SPU processor. It does
+// very little right now.
+//===----------------------------------------------------------------------===//
+
+SPUHazardRecognizer::SPUHazardRecognizer(const TargetInstrInfo &tii) :
+ TII(tii),
+ EvenOdd(0)
+{
+}
+
+/// Return the pipeline hazard type encountered or generated by this
+/// instruction. Currently returns NoHazard.
+///
+/// \return NoHazard
+ScheduleHazardRecognizer::HazardType
+SPUHazardRecognizer::getHazardType(SUnit *SU)
+{
+ // Initial thoughts on how to do this, but this code cannot work unless the
+ // function's prolog and epilog code are also being scheduled so that we can
+ // accurately determine which pipeline is being scheduled.
+#if 0
+ const SDNode *Node = SU->getNode()->getFlaggedMachineNode();
+ ScheduleHazardRecognizer::HazardType retval = NoHazard;
+ bool mustBeOdd = false;
+
+ switch (Node->getOpcode()) {
+ case SPU::LQDv16i8:
+ case SPU::LQDv8i16:
+ case SPU::LQDv4i32:
+ case SPU::LQDv4f32:
+ case SPU::LQDv2f64:
+ case SPU::LQDr128:
+ case SPU::LQDr64:
+ case SPU::LQDr32:
+ case SPU::LQDr16:
+ case SPU::LQAv16i8:
+ case SPU::LQAv8i16:
+ case SPU::LQAv4i32:
+ case SPU::LQAv4f32:
+ case SPU::LQAv2f64:
+ case SPU::LQAr128:
+ case SPU::LQAr64:
+ case SPU::LQAr32:
+ case SPU::LQXv4i32:
+ case SPU::LQXr128:
+ case SPU::LQXr64:
+ case SPU::LQXr32:
+ case SPU::LQXr16:
+ case SPU::STQDv16i8:
+ case SPU::STQDv8i16:
+ case SPU::STQDv4i32:
+ case SPU::STQDv4f32:
+ case SPU::STQDv2f64:
+ case SPU::STQDr128:
+ case SPU::STQDr64:
+ case SPU::STQDr32:
+ case SPU::STQDr16:
+ case SPU::STQDr8:
+ case SPU::STQAv16i8:
+ case SPU::STQAv8i16:
+ case SPU::STQAv4i32:
+ case SPU::STQAv4f32:
+ case SPU::STQAv2f64:
+ case SPU::STQAr128:
+ case SPU::STQAr64:
+ case SPU::STQAr32:
+ case SPU::STQAr16:
+ case SPU::STQAr8:
+ case SPU::STQXv16i8:
+ case SPU::STQXv8i16:
+ case SPU::STQXv4i32:
+ case SPU::STQXv4f32:
+ case SPU::STQXv2f64:
+ case SPU::STQXr128:
+ case SPU::STQXr64:
+ case SPU::STQXr32:
+ case SPU::STQXr16:
+ case SPU::STQXr8:
+ case SPU::RET:
+ mustBeOdd = true;
+ break;
+ default:
+ // Assume that this instruction can be on the even pipe
+ break;
+ }
+
+ if (mustBeOdd && !EvenOdd)
+ retval = Hazard;
+
+ DOUT << "SPUHazardRecognizer EvenOdd " << EvenOdd << " Hazard " << retval << "\n";
+ EvenOdd ^= 1;
+ return retval;
+#else
+ return NoHazard;
+#endif
+}
+
+void SPUHazardRecognizer::EmitInstruction(SUnit *SU)
+{
+}
+
+void SPUHazardRecognizer::AdvanceCycle()
+{
+ DOUT << "SPUHazardRecognizer::AdvanceCycle\n";
+}
+
+void SPUHazardRecognizer::EmitNoop()
+{
+ AdvanceCycle();
+}
diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.h b/lib/Target/CellSPU/SPUHazardRecognizers.h
new file mode 100644
index 0000000..d0ae2d8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUHazardRecognizers.h
@@ -0,0 +1,41 @@
+//===-- SPUHazardRecognizers.h - Cell SPU Hazard Recognizer -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on the Cell SPU
+// processor.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPUHAZRECS_H
+#define SPUHAZRECS_H
+
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+/// SPUHazardRecognizer
+class SPUHazardRecognizer : public ScheduleHazardRecognizer
+{
+private:
+ const TargetInstrInfo &TII;
+ int EvenOdd;
+
+public:
+ SPUHazardRecognizer(const TargetInstrInfo &TII);
+ virtual HazardType getHazardType(SUnit *SU);
+ virtual void EmitInstruction(SUnit *SU);
+ virtual void AdvanceCycle();
+ virtual void EmitNoop();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
new file mode 100644
index 0000000..779d75d
--- /dev/null
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -0,0 +1,1244 @@
+//===-- SPUISelDAGToDAG.cpp - CellSPU pattern matching inst selector ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for the Cell SPU,
+// converting from a legalized dag to a SPU-target dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPUTargetMachine.h"
+#include "SPUISelLowering.h"
+#include "SPUHazardRecognizers.h"
+#include "SPUFrameInfo.h"
+#include "SPURegisterNames.h"
+#include "SPUTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Compiler.h"
+
+using namespace llvm;
+
+namespace {
+ //! ConstantSDNode predicate for i64 sign-extended, 10-bit immediates
+ bool
+ isI64IntS10Immediate(ConstantSDNode *CN)
+ {
+ return isS10Constant(CN->getSExtValue());
+ }
+
+ //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates
+ bool
+ isI32IntS10Immediate(ConstantSDNode *CN)
+ {
+ return isS10Constant(CN->getSExtValue());
+ }
+
+ //! ConstantSDNode predicate for i32 unsigned 10-bit immediate values
+ bool
+ isI32IntU10Immediate(ConstantSDNode *CN)
+ {
+ return isU10Constant(CN->getSExtValue());
+ }
+
+ //! ConstantSDNode predicate for i16 sign-extended, 10-bit immediate values
+ bool
+ isI16IntS10Immediate(ConstantSDNode *CN)
+ {
+ return isS10Constant(CN->getSExtValue());
+ }
+
+ //! SDNode predicate for i16 sign-extended, 10-bit immediate values
+ bool
+ isI16IntS10Immediate(SDNode *N)
+ {
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+ return (CN != 0 && isI16IntS10Immediate(CN));
+ }
+
+ //! ConstantSDNode predicate for i16 unsigned 10-bit immediate values
+ bool
+ isI16IntU10Immediate(ConstantSDNode *CN)
+ {
+ return isU10Constant((short) CN->getZExtValue());
+ }
+
+ //! SDNode predicate for i16 unsigned 10-bit immediate values
+ bool
+ isI16IntU10Immediate(SDNode *N)
+ {
+ return (N->getOpcode() == ISD::Constant
+ && isI16IntU10Immediate(cast<ConstantSDNode>(N)));
+ }
+
+ //! ConstantSDNode predicate for signed 16-bit values
+ /*!
+ \arg CN The constant SelectionDAG node holding the value
+ \arg Imm The returned 16-bit value, if returning true
+
+ This predicate tests the value in \a CN to see whether it can be
+ represented as a 16-bit, sign-extended quantity. Returns true if
+ this is the case.
+ */
+ bool
+ isIntS16Immediate(ConstantSDNode *CN, short &Imm)
+ {
+ MVT vt = CN->getValueType(0);
+ Imm = (short) CN->getZExtValue();
+ if (vt.getSimpleVT() >= MVT::i1 && vt.getSimpleVT() <= MVT::i16) {
+ return true;
+ } else if (vt == MVT::i32) {
+ int32_t i_val = (int32_t) CN->getZExtValue();
+ short s_val = (short) i_val;
+ return i_val == s_val;
+ } else {
+ int64_t i_val = (int64_t) CN->getZExtValue();
+ short s_val = (short) i_val;
+ return i_val == s_val;
+ }
+ }
+
+ //! SDNode predicate for signed 16-bit values.
+ bool
+ isIntS16Immediate(SDNode *N, short &Imm)
+ {
+ return (N->getOpcode() == ISD::Constant
+ && isIntS16Immediate(cast<ConstantSDNode>(N), Imm));
+ }
+
+ //! ConstantFPSDNode predicate for representing floats as 16-bit sign ext.
+ static bool
+ isFPS16Immediate(ConstantFPSDNode *FPN, short &Imm)
+ {
+ MVT vt = FPN->getValueType(0);
+ if (vt == MVT::f32) {
+ int val = FloatToBits(FPN->getValueAPF().convertToFloat());
+ int sval = (int) ((val << 16) >> 16);
+ Imm = (short) val;
+ return val == sval;
+ }
+
+ return false;
+ }
+
+ bool
+ isHighLow(const SDValue &Op)
+ {
+ return (Op.getOpcode() == SPUISD::IndirectAddr
+ && ((Op.getOperand(0).getOpcode() == SPUISD::Hi
+ && Op.getOperand(1).getOpcode() == SPUISD::Lo)
+ || (Op.getOperand(0).getOpcode() == SPUISD::Lo
+ && Op.getOperand(1).getOpcode() == SPUISD::Hi)));
+ }
+
+ //===------------------------------------------------------------------===//
+ //! MVT to "useful stuff" mapping structure:
+
+ struct valtype_map_s {
+ MVT VT;
+ unsigned ldresult_ins; /// LDRESULT instruction (0 = undefined)
+ bool ldresult_imm; /// LDRESULT instruction requires immediate?
+ unsigned lrinst; /// LR instruction
+ };
+
+ const valtype_map_s valtype_map[] = {
+ { MVT::i8, SPU::ORBIr8, true, SPU::LRr8 },
+ { MVT::i16, SPU::ORHIr16, true, SPU::LRr16 },
+ { MVT::i32, SPU::ORIr32, true, SPU::LRr32 },
+ { MVT::i64, SPU::ORr64, false, SPU::LRr64 },
+ { MVT::f32, SPU::ORf32, false, SPU::LRf32 },
+ { MVT::f64, SPU::ORf64, false, SPU::LRf64 },
+ // vector types... (sigh!)
+ { MVT::v16i8, 0, false, SPU::LRv16i8 },
+ { MVT::v8i16, 0, false, SPU::LRv8i16 },
+ { MVT::v4i32, 0, false, SPU::LRv4i32 },
+ { MVT::v2i64, 0, false, SPU::LRv2i64 },
+ { MVT::v4f32, 0, false, SPU::LRv4f32 },
+ { MVT::v2f64, 0, false, SPU::LRv2f64 }
+ };
+
+ const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
+
+ const valtype_map_s *getValueTypeMapEntry(MVT VT)
+ {
+ const valtype_map_s *retval = 0;
+ for (size_t i = 0; i < n_valtype_map; ++i) {
+ if (valtype_map[i].VT == VT) {
+ retval = valtype_map + i;
+ break;
+ }
+ }
+
+#ifndef NDEBUG
+ if (retval == 0) {
+ cerr << "SPUISelDAGToDAG.cpp: getValueTypeMapEntry returns NULL for "
+ << VT.getMVTString()
+ << "\n";
+ abort();
+ }
+#endif
+
+ return retval;
+ }
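+
+ // Usage sketch: the LDRESULT selection below consults this table to pick
+ // its opcode, e.g. MVT::i32 maps to ORIr32 with ldresult_imm set, so the
+ // result is formed as (ORIr32 arg, 0); the vector rows carry only an LR
+ // opcode and cannot be used for LDRESULT.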
+
+ //! Generate the carry-generate shuffle mask.
+ SDValue getCarryGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) {
+ SmallVector<SDValue, 16 > ShufBytes;
+
+ // Create the shuffle mask for "rotating" the carry up one register slot
+ // once the carry is generated.
+ ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+ ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+ ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+ ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ &ShufBytes[0], ShufBytes.size());
+ }
+
+ //! Generate the borrow-generate shuffle mask
+ SDValue getBorrowGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) {
+ SmallVector<SDValue, 16 > ShufBytes;
+
+ // Create the shuffle mask for "rotating" the borrow up one register slot
+ // once the borrow is generated.
+ ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+ ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+ ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+ ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ &ShufBytes[0], ShufBytes.size());
+ }
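+
+ // How these masks behave under SHUFB (per the SPU ISA): a mask byte of
+ // the form 0b10xxxxxx (e.g. 0x80) produces 0x00 in the result and
+ // 0b110xxxxx (e.g. 0xc0) produces 0xff, while small byte values select
+ // from the concatenated inputs. Thus 0x04050607/0x0c0d0e0f move each
+ // carry/borrow word up one slot, zero-filled in the carry mask and
+ // ones-filled in the borrow mask.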
+
+ //===------------------------------------------------------------------===//
+ /// SPUDAGToDAGISel - Cell SPU-specific code to select SPU machine
+ /// instructions for SelectionDAG operations.
+ ///
+ class SPUDAGToDAGISel :
+ public SelectionDAGISel
+ {
+ SPUTargetMachine &TM;
+ SPUTargetLowering &SPUtli;
+ unsigned GlobalBaseReg;
+
+ public:
+ explicit SPUDAGToDAGISel(SPUTargetMachine &tm) :
+ SelectionDAGISel(tm),
+ TM(tm),
+ SPUtli(*tm.getTargetLowering())
+ { }
+
+ virtual bool runOnFunction(Function &Fn) {
+ // Make sure we re-emit a set of the global base reg if necessary
+ GlobalBaseReg = 0;
+ SelectionDAGISel::runOnFunction(Fn);
+ return true;
+ }
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(uint32_t Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ /// getI64Imm - Return a target constant with the specified value, of type
+ /// i64.
+ inline SDValue getI64Imm(uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i64);
+ }
+
+ /// getSmallIPtrImm - Return a target constant of pointer type.
+ inline SDValue getSmallIPtrImm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy());
+ }
+
+ SDNode *emitBuildVector(SDValue build_vec) {
+ MVT vecVT = build_vec.getValueType();
+ MVT eltVT = vecVT.getVectorElementType();
+ SDNode *bvNode = build_vec.getNode();
+ DebugLoc dl = bvNode->getDebugLoc();
+
+ // Check to see if this vector can be represented as a CellSPU immediate
+ // constant by invoking all of the instruction selection predicates:
+ if (((vecVT == MVT::v8i16) &&
+ (SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i16).getNode() != 0)) ||
+ ((vecVT == MVT::v4i32) &&
+ ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+ (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+ (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+ (SPU::get_v4i32_imm(bvNode, *CurDAG).getNode() != 0))) ||
+ ((vecVT == MVT::v2i64) &&
+ ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) ||
+ (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) ||
+ (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i64).getNode() != 0))))
+ return Select(build_vec);
+
+ // No, need to emit a constant pool spill:
+ std::vector<Constant*> CV;
+
+ for (size_t i = 0; i < build_vec.getNumOperands(); ++i) {
+ ConstantSDNode *V = dyn_cast<ConstantSDNode > (build_vec.getOperand(i));
+ CV.push_back(const_cast<ConstantInt *> (V->getConstantIntValue()));
+ }
+
+ Constant *CP = ConstantVector::get(CV);
+ SDValue CPIdx = CurDAG->getConstantPool(CP, SPUtli.getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ SDValue CGPoolOffset =
+ SPU::LowerConstantPool(CPIdx, *CurDAG,
+ SPUtli.getSPUTargetMachine());
+ return SelectCode(CurDAG->getLoad(build_vec.getValueType(), dl,
+ CurDAG->getEntryNode(), CGPoolOffset,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, Alignment));
+ }
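+
+ // N.B.: emitBuildVector is the common fallback used below whenever a
+ // BUILD_VECTOR (e.g. one of the shuffle masks above) must be
+ // materialized; if no immediate form matches, the vector is spilled to
+ // the constant pool and re-loaded with a 16-byte aligned load.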
+
+ /// Select - Convert the specified operand from a target-independent to a
+ /// target-specific node if it hasn't already been changed.
+ SDNode *Select(SDValue Op);
+
+ //! Emit the instruction sequence for i64 shl
+ SDNode *SelectSHLi64(SDValue &Op, MVT OpVT);
+
+ //! Emit the instruction sequence for i64 srl
+ SDNode *SelectSRLi64(SDValue &Op, MVT OpVT);
+
+ //! Emit the instruction sequence for i64 sra
+ SDNode *SelectSRAi64(SDValue &Op, MVT OpVT);
+
+ //! Emit the necessary sequence for loading i64 constants:
+ SDNode *SelectI64Constant(SDValue &Op, MVT OpVT, DebugLoc dl);
+
+ //! Alternate instruction emit sequence for loading i64 constants
+ SDNode *SelectI64Constant(uint64_t i64const, MVT OpVT, DebugLoc dl);
+
+ //! Returns true if the address N is an A-form (local store) address
+ bool SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index);
+
+ //! D-form address predicate
+ bool SelectDFormAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index);
+
+ /// Alternate D-form address using i7 offset predicate
+ bool SelectDForm2Addr(SDValue Op, SDValue N, SDValue &Disp,
+ SDValue &Base);
+
+ /// D-form address selection workhorse
+ bool DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Disp,
+ SDValue &Base, int minOffset, int maxOffset);
+
+ //! Address predicate if N can be expressed as an indexed [r+r] operation.
+ bool SelectXFormAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index);
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1;
+ switch (ConstraintCode) {
+ default: return true;
+ case 'm': // memory
+ if (!SelectDFormAddr(Op, Op, Op0, Op1)
+ && !SelectAFormAddr(Op, Op, Op0, Op1))
+ SelectXFormAddr(Op, Op, Op0, Op1);
+ break;
+ case 'o': // offsetable
+ if (!SelectDFormAddr(Op, Op, Op0, Op1)
+ && !SelectAFormAddr(Op, Op, Op0, Op1)) {
+ Op0 = Op;
+ Op1 = getSmallIPtrImm(0);
+ }
+ break;
+ case 'v': // not offsetable
+#if 1
+ assert(0 && "InlineAsmMemoryOperand 'v' constraint not handled.");
+#else
+ SelectAddrIdxOnly(Op, Op, Op0, Op1);
+#endif
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ return false;
+ }
+
+ /// InstructionSelect - This callback is invoked by
+ /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+ virtual void InstructionSelect();
+
+ virtual const char *getPassName() const {
+ return "Cell SPU DAG->DAG Pattern Instruction Selection";
+ }
+
+ /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+ /// this target when scheduling the DAG.
+ virtual ScheduleHazardRecognizer *CreateTargetHazardRecognizer() {
+ const TargetInstrInfo *II = TM.getInstrInfo();
+ assert(II && "No InstrInfo?");
+ return new SPUHazardRecognizer(*II);
+ }
+
+ // Include the pieces autogenerated from the target description.
+#include "SPUGenDAGISel.inc"
+ };
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void
+SPUDAGToDAGISel::InstructionSelect()
+{
+ DEBUG(BB->dump());
+
+ // Select target instructions for the DAG.
+ SelectRoot(*CurDAG);
+ CurDAG->RemoveDeadNodes();
+}
+
+/*!
+ \arg Op The ISD instruction operand
+ \arg N The address to be tested
+ \arg Base The base address
+ \arg Index The base address index
+ */
+bool
+SPUDAGToDAGISel::SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index) {
+ // These match the addr256k operand type:
+ MVT OffsVT = MVT::i16;
+ SDValue Zero = CurDAG->getTargetConstant(0, OffsVT);
+
+ switch (N.getOpcode()) {
+ case ISD::Constant:
+ case ISD::ConstantPool:
+ case ISD::GlobalAddress:
+ cerr << "SPU SelectAFormAddr: Constant/Pool/Global not lowered.\n";
+ abort();
+ /*NOTREACHED*/
+
+ case ISD::TargetConstant:
+ case ISD::TargetGlobalAddress:
+ case ISD::TargetJumpTable:
+ cerr << "SPUSelectAFormAddr: Target Constant/Pool/Global not wrapped as "
+ << "A-form address.\n";
+ abort();
+ /*NOTREACHED*/
+
+ case SPUISD::AFormAddr:
+ // Just load from memory if there's only a single use of the location,
+ // otherwise, this will get handled below with D-form offset addresses
+ if (N.hasOneUse()) {
+ SDValue Op0 = N.getOperand(0);
+ switch (Op0.getOpcode()) {
+ case ISD::TargetConstantPool:
+ case ISD::TargetJumpTable:
+ Base = Op0;
+ Index = Zero;
+ return true;
+
+ case ISD::TargetGlobalAddress: {
+ GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op0);
+ GlobalValue *GV = GSDN->getGlobal();
+ if (GV->getAlignment() == 16) {
+ Base = Op0;
+ Index = Zero;
+ return true;
+ }
+ break;
+ }
+ }
+ }
+ break;
+ }
+ return false;
+}
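+
+// A-form addresses are absolute local-store addresses (the 18-bit
+// "addr256k" operand mentioned above): on success, Base holds the wrapped
+// target constant-pool/jump-table/global node and Index is simply 0,
+// which the target patterns can then match as lqa/stqa-style accesses.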
+
+bool
+SPUDAGToDAGISel::SelectDForm2Addr(SDValue Op, SDValue N, SDValue &Disp,
+ SDValue &Base) {
+ const int minDForm2Offset = -(1 << 7);
+ const int maxDForm2Offset = (1 << 7) - 1;
+ return DFormAddressPredicate(Op, N, Disp, Base, minDForm2Offset,
+ maxDForm2Offset);
+}
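+
+// Here the [-128, 127] window corresponds to the signed 7-bit (i7)
+// displacement variant mentioned in the class declaration above.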
+
+/*!
+ \arg Op The ISD instruction (ignored)
+ \arg N The address to be tested
+ \arg Base Base address register/pointer
+ \arg Index Base address index
+
+ Examine the input address by a base register plus a signed 10-bit
+ displacement, [r+I10] (D-form address).
+
+ \return true if \a N is a D-form address with \a Base and \a Index set
+ to non-empty SDValue instances.
+*/
+bool
+SPUDAGToDAGISel::SelectDFormAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index) {
+ return DFormAddressPredicate(Op, N, Base, Index,
+ SPUFrameInfo::minFrameOffset(),
+ SPUFrameInfo::maxFrameOffset());
+}
+
+bool
+SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index, int minOffset,
+ int maxOffset) {
+ unsigned Opc = N.getOpcode();
+ MVT PtrTy = SPUtli.getPointerTy();
+
+ if (Opc == ISD::FrameIndex) {
+ // The frame index, converted to a stack offset, must be within D-form range:
+ FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N);
+ int FI = int(FIN->getIndex());
+ DEBUG(cerr << "SelectDFormAddr: ISD::FrameIndex = "
+ << FI << "\n");
+ if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
+ Base = CurDAG->getTargetConstant(0, PtrTy);
+ Index = CurDAG->getTargetFrameIndex(FI, PtrTy);
+ return true;
+ }
+ } else if (Opc == ISD::ADD) {
+ // Generated by getelementptr
+ const SDValue Op0 = N.getOperand(0);
+ const SDValue Op1 = N.getOperand(1);
+
+ if ((Op0.getOpcode() == SPUISD::Hi && Op1.getOpcode() == SPUISD::Lo)
+ || (Op1.getOpcode() == SPUISD::Hi && Op0.getOpcode() == SPUISD::Lo)) {
+ Base = CurDAG->getTargetConstant(0, PtrTy);
+ Index = N;
+ return true;
+ } else if (Op1.getOpcode() == ISD::Constant
+ || Op1.getOpcode() == ISD::TargetConstant) {
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1);
+ int32_t offset = int32_t(CN->getSExtValue());
+
+ if (Op0.getOpcode() == ISD::FrameIndex) {
+ FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op0);
+ int FI = int(FIN->getIndex());
+ DEBUG(cerr << "SelectDFormAddr: ISD::ADD offset = " << offset
+ << " frame index = " << FI << "\n");
+
+ if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
+ Base = CurDAG->getTargetConstant(offset, PtrTy);
+ Index = CurDAG->getTargetFrameIndex(FI, PtrTy);
+ return true;
+ }
+ } else if (offset > minOffset && offset < maxOffset) {
+ Base = CurDAG->getTargetConstant(offset, PtrTy);
+ Index = Op0;
+ return true;
+ }
+ } else if (Op0.getOpcode() == ISD::Constant
+ || Op0.getOpcode() == ISD::TargetConstant) {
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op0);
+ int32_t offset = int32_t(CN->getSExtValue());
+
+ if (Op1.getOpcode() == ISD::FrameIndex) {
+ FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op1);
+ int FI = int(FIN->getIndex());
+ DEBUG(cerr << "SelectDFormAddr: ISD::ADD offset = " << offset
+ << " frame index = " << FI << "\n");
+
+ if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
+ Base = CurDAG->getTargetConstant(offset, PtrTy);
+ Index = CurDAG->getTargetFrameIndex(FI, PtrTy);
+ return true;
+ }
+ } else if (offset > minOffset && offset < maxOffset) {
+ Base = CurDAG->getTargetConstant(offset, PtrTy);
+ Index = Op1;
+ return true;
+ }
+ }
+ } else if (Opc == SPUISD::IndirectAddr) {
+ // Indirect with constant offset -> D-Form address
+ const SDValue Op0 = N.getOperand(0);
+ const SDValue Op1 = N.getOperand(1);
+
+ if (Op0.getOpcode() == SPUISD::Hi
+ && Op1.getOpcode() == SPUISD::Lo) {
+ // (SPUindirect (SPUhi <arg>, 0), (SPUlo <arg>, 0))
+ Base = CurDAG->getTargetConstant(0, PtrTy);
+ Index = N;
+ return true;
+ } else if (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1)) {
+ int32_t offset = 0;
+ SDValue idxOp;
+
+ if (isa<ConstantSDNode>(Op1)) {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op1);
+ offset = int32_t(CN->getSExtValue());
+ idxOp = Op0;
+ } else if (isa<ConstantSDNode>(Op0)) {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op0);
+ offset = int32_t(CN->getSExtValue());
+ idxOp = Op1;
+ }
+
+ if (offset >= minOffset && offset <= maxOffset) {
+ Base = CurDAG->getTargetConstant(offset, PtrTy);
+ Index = idxOp;
+ return true;
+ }
+ }
+ } else if (Opc == SPUISD::AFormAddr) {
+ Base = CurDAG->getTargetConstant(0, N.getValueType());
+ Index = N;
+ return true;
+ } else if (Opc == SPUISD::LDRESULT) {
+ Base = CurDAG->getTargetConstant(0, N.getValueType());
+ Index = N;
+ return true;
+ } else if (Opc == ISD::Register || Opc == ISD::CopyFromReg) {
+ unsigned OpOpc = Op.getOpcode();
+
+ if (OpOpc == ISD::STORE || OpOpc == ISD::LOAD) {
+ // Direct load/store without getelementptr
+ SDValue Addr, Offs;
+
+ // Get the register from CopyFromReg
+ if (Opc == ISD::CopyFromReg)
+ Addr = N.getOperand(1);
+ else
+ Addr = N; // Register
+
+ Offs = ((OpOpc == ISD::STORE) ? Op.getOperand(3) : Op.getOperand(2));
+
+ if (Offs.getOpcode() == ISD::Constant || Offs.getOpcode() == ISD::UNDEF) {
+ if (Offs.getOpcode() == ISD::UNDEF)
+ Offs = CurDAG->getTargetConstant(0, Offs.getValueType());
+
+ Base = Offs;
+ Index = Addr;
+ return true;
+ }
+ } else {
+ /* If otherwise unadorned, default to D-form address with 0 offset: */
+ if (Opc == ISD::CopyFromReg) {
+ Index = N.getOperand(1);
+ } else {
+ Index = N;
+ }
+
+ Base = CurDAG->getTargetConstant(0, Index.getValueType());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*!
+ \arg Op The ISD instruction operand
+ \arg N The address operand
+ \arg Base The base pointer operand
+ \arg Index The offset/index operand
+
+ If the address \a N can be expressed as an A-form or D-form address, returns
+ false. Otherwise, creates two operands, \a Base and \a Index, that will
+ become the (r)(r) X-form address.
+*/
+bool
+SPUDAGToDAGISel::SelectXFormAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index) {
+ if (!SelectAFormAddr(Op, N, Base, Index)
+ && !SelectDFormAddr(Op, N, Base, Index)) {
+ // If the address is neither A-form nor D-form, punt and use an X-form
+ // address:
+ Base = N.getOperand(1);
+ Index = N.getOperand(0);
+ return true;
+ }
+
+ return false;
+}
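+
+// The order of preference encoded above is A-form (absolute), then D-form
+// (base + 10-bit displacement), then X-form (reg + reg). For example, an
+// (add %a, %b) address with no constant operand fails both earlier
+// predicates and lands here with Base = operand 1 and Index = operand 0.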
+
+//! Convert the operand from a target-independent to a target-specific node
+/*!
+ */
+SDNode *
+SPUDAGToDAGISel::Select(SDValue Op) {
+ SDNode *N = Op.getNode();
+ unsigned Opc = N->getOpcode();
+ int n_ops = -1;
+ unsigned NewOpc;
+ MVT OpVT = Op.getValueType();
+ SDValue Ops[8];
+ DebugLoc dl = N->getDebugLoc();
+
+ if (N->isMachineOpcode()) {
+ return NULL; // Already selected.
+ }
+
+ if (Opc == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, Op.getValueType());
+ SDValue Imm0 = CurDAG->getTargetConstant(0, Op.getValueType());
+
+ if (FI < 128) {
+ NewOpc = SPU::AIr32;
+ Ops[0] = TFI;
+ Ops[1] = Imm0;
+ n_ops = 2;
+ } else {
+ NewOpc = SPU::Ar32;
+ Ops[0] = CurDAG->getRegister(SPU::R1, Op.getValueType());
+ Ops[1] = SDValue(CurDAG->getTargetNode(SPU::ILAr32, dl, Op.getValueType(),
+ TFI, Imm0), 0);
+ n_ops = 2;
+ }
+ } else if (Opc == ISD::Constant && OpVT == MVT::i64) {
+ // Catch the i64 constants that end up here. Note: The backend doesn't
+ // attempt to legalize the constant (it's useless because DAGCombiner
+ // will insert 64-bit constants and we can't stop it).
+ return SelectI64Constant(Op, OpVT, Op.getDebugLoc());
+ } else if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND)
+ && OpVT == MVT::i64) {
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = Op0.getValueType();
+ MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits()));
+ MVT OpVecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+ SDValue shufMask;
+
+ switch (Op0VT.getSimpleVT()) {
+ default:
+ cerr << "CellSPU Select: Unhandled zero/any extend MVT\n";
+ abort();
+ /*NOTREACHED*/
+ break;
+ case MVT::i32:
+ shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ CurDAG->getConstant(0x80808080, MVT::i32),
+ CurDAG->getConstant(0x00010203, MVT::i32),
+ CurDAG->getConstant(0x80808080, MVT::i32),
+ CurDAG->getConstant(0x08090a0b, MVT::i32));
+ break;
+
+ case MVT::i16:
+ shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ CurDAG->getConstant(0x80808080, MVT::i32),
+ CurDAG->getConstant(0x80800203, MVT::i32),
+ CurDAG->getConstant(0x80808080, MVT::i32),
+ CurDAG->getConstant(0x80800a0b, MVT::i32));
+ break;
+
+ case MVT::i8:
+ shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ CurDAG->getConstant(0x80808080, MVT::i32),
+ CurDAG->getConstant(0x80808003, MVT::i32),
+ CurDAG->getConstant(0x80808080, MVT::i32),
+ CurDAG->getConstant(0x8080800b, MVT::i32));
+ break;
+ }
+
+ SDNode *shufMaskLoad = emitBuildVector(shufMask);
+ SDNode *PromoteScalar =
+ SelectCode(CurDAG->getNode(SPUISD::PREFSLOT2VEC, dl, Op0VecVT, Op0));
+
+ SDValue zextShuffle =
+ CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT,
+ SDValue(PromoteScalar, 0),
+ SDValue(PromoteScalar, 0),
+ SDValue(shufMaskLoad, 0));
+
+ // N.B.: BIT_CONVERT replaces and updates the zextShuffle node, so we
+ // re-use it in the VEC2PREFSLOT selection without needing to explicitly
+ // call SelectCode (it's already done for us).
+ SelectCode(CurDAG->getNode(ISD::BIT_CONVERT, dl, OpVecVT, zextShuffle));
+ return SelectCode(CurDAG->getNode(SPUISD::VEC2PREFSLOT, dl, OpVT,
+ zextShuffle));
+ } else if (Opc == ISD::ADD && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+ SDNode *CGLoad =
+ emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl));
+
+ return SelectCode(CurDAG->getNode(SPUISD::ADD64_MARKER, dl, OpVT,
+ Op.getOperand(0), Op.getOperand(1),
+ SDValue(CGLoad, 0)));
+ } else if (Opc == ISD::SUB && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+ SDNode *CGLoad =
+ emitBuildVector(getBorrowGenerateShufMask(*CurDAG, dl));
+
+ return SelectCode(CurDAG->getNode(SPUISD::SUB64_MARKER, dl, OpVT,
+ Op.getOperand(0), Op.getOperand(1),
+ SDValue(CGLoad, 0)));
+ } else if (Opc == ISD::MUL && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+ SDNode *CGLoad =
+ emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl));
+
+ return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, dl, OpVT,
+ Op.getOperand(0), Op.getOperand(1),
+ SDValue(CGLoad, 0)));
+ } else if (Opc == ISD::TRUNCATE) {
+ SDValue Op0 = Op.getOperand(0);
+ if ((Op0.getOpcode() == ISD::SRA || Op0.getOpcode() == ISD::SRL)
+ && OpVT == MVT::i32
+ && Op0.getValueType() == MVT::i64) {
+ // Catch (truncate:i32 ([sra|srl]:i64 arg, c)), where c >= 32
+ //
+ // Take advantage of the fact that the upper 32 bits are in the
+ // i32 preferred slot and avoid shuffle gymnastics:
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+ if (CN != 0) {
+ unsigned shift_amt = unsigned(CN->getZExtValue());
+
+ if (shift_amt >= 32) {
+ SDNode *hi32 =
+ CurDAG->getTargetNode(SPU::ORr32_r64, dl, OpVT,
+ Op0.getOperand(0));
+
+ shift_amt -= 32;
+ if (shift_amt > 0) {
+ // Take care of the additional shift, if present:
+ SDValue shift = CurDAG->getTargetConstant(shift_amt, MVT::i32);
+ unsigned Opc = SPU::ROTMAIr32_i32;
+
+ if (Op0.getOpcode() == ISD::SRL)
+ Opc = SPU::ROTMr32;
+
+ hi32 = CurDAG->getTargetNode(Opc, dl, OpVT, SDValue(hi32, 0),
+ shift);
+ }
+
+ return hi32;
+ }
+ }
+ }
+ } else if (Opc == ISD::SHL) {
+ if (OpVT == MVT::i64) {
+ return SelectSHLi64(Op, OpVT);
+ }
+ } else if (Opc == ISD::SRL) {
+ if (OpVT == MVT::i64) {
+ return SelectSRLi64(Op, OpVT);
+ }
+ } else if (Opc == ISD::SRA) {
+ if (OpVT == MVT::i64) {
+ return SelectSRAi64(Op, OpVT);
+ }
+ } else if (Opc == ISD::FNEG
+ && (OpVT == MVT::f64 || OpVT == MVT::v2f64)) {
+ DebugLoc dl = Op.getDebugLoc();
+ // Check if the pattern is a special form of DFNMS:
+ // (fneg (fsub (fmul R64FP:$rA, R64FP:$rB), R64FP:$rC))
+ SDValue Op0 = Op.getOperand(0);
+ if (Op0.getOpcode() == ISD::FSUB) {
+ SDValue Op00 = Op0.getOperand(0);
+ if (Op00.getOpcode() == ISD::FMUL) {
+ unsigned Opc = SPU::DFNMSf64;
+ if (OpVT == MVT::v2f64)
+ Opc = SPU::DFNMSv2f64;
+
+ return CurDAG->getTargetNode(Opc, dl, OpVT,
+ Op00.getOperand(0),
+ Op00.getOperand(1),
+ Op0.getOperand(1));
+ }
+ }
+
+ SDValue negConst = CurDAG->getConstant(0x8000000000000000ULL, MVT::i64);
+ SDNode *signMask = 0;
+ unsigned Opc = SPU::XORfneg64;
+
+ if (OpVT == MVT::f64) {
+ signMask = SelectI64Constant(negConst, MVT::i64, dl);
+ } else if (OpVT == MVT::v2f64) {
+ Opc = SPU::XORfnegvec;
+ signMask = emitBuildVector(CurDAG->getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v2i64,
+ negConst, negConst));
+ }
+
+ return CurDAG->getTargetNode(Opc, dl, OpVT,
+ Op.getOperand(0), SDValue(signMask, 0));
+ } else if (Opc == ISD::FABS) {
+ if (OpVT == MVT::f64) {
+ SDNode *signMask = SelectI64Constant(0x7fffffffffffffffULL, MVT::i64, dl);
+ return CurDAG->getTargetNode(SPU::ANDfabs64, dl, OpVT,
+ Op.getOperand(0), SDValue(signMask, 0));
+ } else if (OpVT == MVT::v2f64) {
+ SDValue absConst = CurDAG->getConstant(0x7fffffffffffffffULL, MVT::i64);
+ SDValue absVec = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
+ absConst, absConst);
+ SDNode *signMask = emitBuildVector(absVec);
+ return CurDAG->getTargetNode(SPU::ANDfabsvec, dl, OpVT,
+ Op.getOperand(0), SDValue(signMask, 0));
+ }
+ } else if (Opc == SPUISD::LDRESULT) {
+ // Custom select instructions for LDRESULT
+ MVT VT = N->getValueType(0);
+ SDValue Arg = N->getOperand(0);
+ SDValue Chain = N->getOperand(1);
+ SDNode *Result;
+ const valtype_map_s *vtm = getValueTypeMapEntry(VT);
+
+ if (vtm->ldresult_ins == 0) {
+ cerr << "LDRESULT for unsupported type: "
+ << VT.getMVTString()
+ << "\n";
+ abort();
+ }
+
+ Opc = vtm->ldresult_ins;
+ if (vtm->ldresult_imm) {
+ SDValue Zero = CurDAG->getTargetConstant(0, VT);
+
+ Result = CurDAG->getTargetNode(Opc, dl, VT, MVT::Other, Arg, Zero, Chain);
+ } else {
+ Result = CurDAG->getTargetNode(Opc, dl, VT, MVT::Other, Arg, Arg, Chain);
+ }
+
+ return Result;
+ } else if (Opc == SPUISD::IndirectAddr) {
+ // Look at the operands: SelectCode() will catch the cases that aren't
+ // specifically handled here.
+ //
+ // SPUInstrInfo catches the following patterns:
+ // (SPUindirect (SPUhi ...), (SPUlo ...))
+ // (SPUindirect $sp, imm)
+ MVT VT = Op.getValueType();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ RegisterSDNode *RN;
+
+ if ((Op0.getOpcode() != SPUISD::Hi && Op1.getOpcode() != SPUISD::Lo)
+ || (Op0.getOpcode() == ISD::Register
+ && ((RN = dyn_cast<RegisterSDNode>(Op0.getNode())) != 0
+ && RN->getReg() != SPU::R1))) {
+ NewOpc = SPU::Ar32;
+ if (Op1.getOpcode() == ISD::Constant) {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op1);
+ Op1 = CurDAG->getTargetConstant(CN->getSExtValue(), VT);
+ NewOpc = (isI32IntS10Immediate(CN) ? SPU::AIr32 : SPU::Ar32);
+ }
+ Ops[0] = Op0;
+ Ops[1] = Op1;
+ n_ops = 2;
+ }
+ }
+
+ if (n_ops > 0) {
+ if (N->hasOneUse())
+ return CurDAG->SelectNodeTo(N, NewOpc, OpVT, Ops, n_ops);
+ else
+ return CurDAG->getTargetNode(NewOpc, dl, OpVT, Ops, n_ops);
+ } else
+ return SelectCode(Op);
+}
+
+/*!
+ * Emit the instruction sequence for i64 left shifts. The basic algorithm
+ * is to fill the bottom two word slots with zeros so that zeros are shifted
+ * in as the entire quadword is shifted left.
+ *
+ * \note This code could also be used to implement v2i64 shl.
+ *
+ * @param Op The shl operand
+ * @param OpVT Op's machine value type (doesn't need to be passed, but
+ * makes life easier.)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSHLi64(SDValue &Op, MVT OpVT) {
+ SDValue Op0 = Op.getOperand(0);
+ MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+ SDValue ShiftAmt = Op.getOperand(1);
+ MVT ShiftAmtVT = ShiftAmt.getValueType();
+ SDNode *VecOp0, *SelMask, *ZeroFill, *Shift = 0;
+ SDValue SelMaskVal;
+ DebugLoc dl = Op.getDebugLoc();
+
+ VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, dl, VecVT, Op0);
+ SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16);
+ SelMask = CurDAG->getTargetNode(SPU::FSMBIv2i64, dl, VecVT, SelMaskVal);
+ ZeroFill = CurDAG->getTargetNode(SPU::ILv2i64, dl, VecVT,
+ CurDAG->getTargetConstant(0, OpVT));
+ VecOp0 = CurDAG->getTargetNode(SPU::SELBv2i64, dl, VecVT,
+ SDValue(ZeroFill, 0),
+ SDValue(VecOp0, 0),
+ SDValue(SelMask, 0));
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+ unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
+ unsigned bits = unsigned(CN->getZExtValue()) & 7;
+
+ if (bytes > 0) {
+ Shift =
+ CurDAG->getTargetNode(SPU::SHLQBYIv2i64, dl, VecVT,
+ SDValue(VecOp0, 0),
+ CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+ }
+
+ if (bits > 0) {
+ Shift =
+ CurDAG->getTargetNode(SPU::SHLQBIIv2i64, dl, VecVT,
+ SDValue((Shift != 0 ? Shift : VecOp0), 0),
+ CurDAG->getTargetConstant(bits, ShiftAmtVT));
+ }
+ } else {
+ SDNode *Bytes =
+ CurDAG->getTargetNode(SPU::ROTMIr32, dl, ShiftAmtVT,
+ ShiftAmt,
+ CurDAG->getTargetConstant(3, ShiftAmtVT));
+ SDNode *Bits =
+ CurDAG->getTargetNode(SPU::ANDIr32, dl, ShiftAmtVT,
+ ShiftAmt,
+ CurDAG->getTargetConstant(7, ShiftAmtVT));
+ Shift =
+ CurDAG->getTargetNode(SPU::SHLQBYv2i64, dl, VecVT,
+ SDValue(VecOp0, 0), SDValue(Bytes, 0));
+ Shift =
+ CurDAG->getTargetNode(SPU::SHLQBIv2i64, dl, VecVT,
+ SDValue(Shift, 0), SDValue(Bits, 0));
+ }
+
+ return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+}
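+
+// Worked example for the constant-shift path above: an amount of 35
+// splits into bytes = 35 >> 3 = 4 and bits = 35 & 7 = 3, so after the
+// SELB zero-fill the emitted sequence is SHLQBYIv2i64 by 4 bytes followed
+// by SHLQBIIv2i64 by the remaining 3 bits.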
+
+/*!
+ * Emit the instruction sequence for i64 logical right shifts.
+ *
+ * @param Op The srl operand
+ * @param OpVT Op's machine value type (doesn't need to be passed, but
+ * makes life easier.)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSRLi64(SDValue &Op, MVT OpVT) {
+ SDValue Op0 = Op.getOperand(0);
+ MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+ SDValue ShiftAmt = Op.getOperand(1);
+ MVT ShiftAmtVT = ShiftAmt.getValueType();
+ SDNode *VecOp0, *Shift = 0;
+ DebugLoc dl = Op.getDebugLoc();
+
+ VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, dl, VecVT, Op0);
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+ unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
+ unsigned bits = unsigned(CN->getZExtValue()) & 7;
+
+ if (bytes > 0) {
+ Shift =
+ CurDAG->getTargetNode(SPU::ROTQMBYIv2i64, dl, VecVT,
+ SDValue(VecOp0, 0),
+ CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+ }
+
+ if (bits > 0) {
+ Shift =
+ CurDAG->getTargetNode(SPU::ROTQMBIIv2i64, dl, VecVT,
+ SDValue((Shift != 0 ? Shift : VecOp0), 0),
+ CurDAG->getTargetConstant(bits, ShiftAmtVT));
+ }
+ } else {
+ SDNode *Bytes =
+ CurDAG->getTargetNode(SPU::ROTMIr32, dl, ShiftAmtVT,
+ ShiftAmt,
+ CurDAG->getTargetConstant(3, ShiftAmtVT));
+ SDNode *Bits =
+ CurDAG->getTargetNode(SPU::ANDIr32, dl, ShiftAmtVT,
+ ShiftAmt,
+ CurDAG->getTargetConstant(7, ShiftAmtVT));
+
+ // Ensure that the shift amounts are negated!
+ Bytes = CurDAG->getTargetNode(SPU::SFIr32, dl, ShiftAmtVT,
+ SDValue(Bytes, 0),
+ CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+ Bits = CurDAG->getTargetNode(SPU::SFIr32, dl, ShiftAmtVT,
+ SDValue(Bits, 0),
+ CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+ Shift =
+ CurDAG->getTargetNode(SPU::ROTQMBYv2i64, dl, VecVT,
+ SDValue(VecOp0, 0), SDValue(Bytes, 0));
+ Shift =
+ CurDAG->getTargetNode(SPU::ROTQMBIv2i64, dl, VecVT,
+ SDValue(Shift, 0), SDValue(Bits, 0));
+ }
+
+ return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+}
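+
+// Note the SFIr32 nodes above: SFI computes (0 - rT), so the byte and bit
+// counts are negated before feeding ROTQMBY/ROTQMBI, which interpret
+// their operand as a negative (rightward) shift count; a runtime amount
+// of 35 therefore becomes -4 bytes and -3 bits.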
+
+/*!
+ * Emit the instruction sequence for i64 arithmetic right shifts.
+ *
+ * @param Op The sra operand
+ * @param OpVT Op's machine value type (doesn't need to be passed, but
+ * makes life easier.)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, MVT OpVT) {
+ // Promote Op0 to vector
+ MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+ SDValue ShiftAmt = Op.getOperand(1);
+ MVT ShiftAmtVT = ShiftAmt.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDNode *VecOp0 =
+ CurDAG->getTargetNode(SPU::ORv2i64_i64, dl, VecVT, Op.getOperand(0));
+
+ SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT);
+ SDNode *SignRot =
+ CurDAG->getTargetNode(SPU::ROTMAIv2i64_i32, dl, MVT::v2i64,
+ SDValue(VecOp0, 0), SignRotAmt);
+ SDNode *UpperHalfSign =
+ CurDAG->getTargetNode(SPU::ORi32_v4i32, dl, MVT::i32, SDValue(SignRot, 0));
+
+ SDNode *UpperHalfSignMask =
+ CurDAG->getTargetNode(SPU::FSM64r32, dl, VecVT, SDValue(UpperHalfSign, 0));
+ SDNode *UpperLowerMask =
+ CurDAG->getTargetNode(SPU::FSMBIv2i64, dl, VecVT,
+ CurDAG->getTargetConstant(0xff00ULL, MVT::i16));
+ SDNode *UpperLowerSelect =
+ CurDAG->getTargetNode(SPU::SELBv2i64, dl, VecVT,
+ SDValue(UpperHalfSignMask, 0),
+ SDValue(VecOp0, 0),
+ SDValue(UpperLowerMask, 0));
+
+ SDNode *Shift = 0;
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+ unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
+ unsigned bits = unsigned(CN->getZExtValue()) & 7;
+
+ if (bytes > 0) {
+ bytes = 31 - bytes;
+ Shift =
+ CurDAG->getTargetNode(SPU::ROTQBYIv2i64, dl, VecVT,
+ SDValue(UpperLowerSelect, 0),
+ CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+ }
+
+ if (bits > 0) {
+ bits = 8 - bits;
+ Shift =
+ CurDAG->getTargetNode(SPU::ROTQBIIv2i64, dl, VecVT,
+ SDValue((Shift != 0 ? Shift : UpperLowerSelect), 0),
+ CurDAG->getTargetConstant(bits, ShiftAmtVT));
+ }
+ } else {
+ SDNode *NegShift =
+ CurDAG->getTargetNode(SPU::SFIr32, dl, ShiftAmtVT,
+ ShiftAmt, CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+ Shift =
+ CurDAG->getTargetNode(SPU::ROTQBYBIv2i64_r32, dl, VecVT,
+ SDValue(UpperLowerSelect, 0), SDValue(NegShift, 0));
+ Shift =
+ CurDAG->getTargetNode(SPU::ROTQBIv2i64, dl, VecVT,
+ SDValue(Shift, 0), SDValue(NegShift, 0));
+ }
+
+ return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+}
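+
+// Shape of the sequence above: ROTMAI by 31 exposes the sign word, FSM
+// splats it into a byte mask, SELB merges that sign-fill with the
+// original value's upper half, and the final rotates move the merged
+// quadword into place (with the byte/bit counts adjusted in the constant
+// case because the ROTQBY/ROTQBI family rotates left).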
+
+/*!
+ Do the magic necessary to load an i64 constant
+ */
+SDNode *SPUDAGToDAGISel::SelectI64Constant(SDValue& Op, MVT OpVT,
+ DebugLoc dl) {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op.getNode());
+ return SelectI64Constant(CN->getZExtValue(), OpVT, dl);
+}
+
+SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, MVT OpVT,
+ DebugLoc dl) {
+ MVT OpVecVT = MVT::getVectorVT(OpVT, 2);
+ SDValue i64vec =
+ SPU::LowerV2I64Splat(OpVecVT, *CurDAG, Value64, dl);
+
+ // Here's where it gets interesting, because we have to parse out the
+ // subtree handed back in i64vec:
+
+ if (i64vec.getOpcode() == ISD::BIT_CONVERT) {
+ // The degenerate case, where the upper and lower halves of the splat
+ // are identical:
+ SDValue Op0 = i64vec.getOperand(0);
+
+ ReplaceUses(i64vec, Op0);
+ return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT,
+ SDValue(emitBuildVector(Op0), 0));
+ } else if (i64vec.getOpcode() == SPUISD::SHUFB) {
+ SDValue lhs = i64vec.getOperand(0);
+ SDValue rhs = i64vec.getOperand(1);
+ SDValue shufmask = i64vec.getOperand(2);
+
+ if (lhs.getOpcode() == ISD::BIT_CONVERT) {
+ ReplaceUses(lhs, lhs.getOperand(0));
+ lhs = lhs.getOperand(0);
+ }
+
+ SDNode *lhsNode = (lhs.getNode()->isMachineOpcode()
+ ? lhs.getNode()
+ : emitBuildVector(lhs));
+
+ if (rhs.getOpcode() == ISD::BIT_CONVERT) {
+ ReplaceUses(rhs, rhs.getOperand(0));
+ rhs = rhs.getOperand(0);
+ }
+
+ SDNode *rhsNode = (rhs.getNode()->isMachineOpcode()
+ ? rhs.getNode()
+ : emitBuildVector(rhs));
+
+ if (shufmask.getOpcode() == ISD::BIT_CONVERT) {
+ ReplaceUses(shufmask, shufmask.getOperand(0));
+ shufmask = shufmask.getOperand(0);
+ }
+
+ SDNode *shufMaskNode = (shufmask.getNode()->isMachineOpcode()
+ ? shufmask.getNode()
+ : emitBuildVector(shufmask));
+
+ SDNode *shufNode =
+ Select(CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT,
+ SDValue(lhsNode, 0), SDValue(rhsNode, 0),
+ SDValue(shufMaskNode, 0)));
+
+ return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT,
+ SDValue(shufNode, 0));
+ } else if (i64vec.getOpcode() == ISD::BUILD_VECTOR) {
+ return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT,
+ SDValue(emitBuildVector(i64vec), 0));
+ } else {
+ cerr << "SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec condition\n";
+ abort();
+ }
+}
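+
+// LowerV2I64Splat hands back one of three shapes, each handled above: a
+// BIT_CONVERT of a uniform splat, a SHUFB of two buildable halves plus a
+// shuffle mask, or a plain BUILD_VECTOR; anything else aborts. In every
+// case the selected v2i64 is moved back to a scalar i64 via ORi64_v2i64.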
+
+/// createSPUISelDag - This pass converts a legalized DAG into a
+/// SPU-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createSPUISelDag(SPUTargetMachine &TM) {
+ return new SPUDAGToDAGISel(TM);
+}
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
new file mode 100644
index 0000000..864a914
--- /dev/null
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -0,0 +1,2980 @@
+//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPUTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPURegisterNames.h"
+#include "SPUISelLowering.h"
+#include "SPUTargetMachine.h"
+#include "SPUFrameInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+
+#include <map>
+
+using namespace llvm;
+
+// Used in getTargetNodeName() below
+namespace {
+ std::map<unsigned, const char *> node_names;
+
+ //! MVT mapping to useful data for Cell SPU
+ struct valtype_map_s {
+ const MVT valtype;
+ const int prefslot_byte;
+ };
+
+ const valtype_map_s valtype_map[] = {
+ { MVT::i1, 3 },
+ { MVT::i8, 3 },
+ { MVT::i16, 2 },
+ { MVT::i32, 0 },
+ { MVT::f32, 0 },
+ { MVT::i64, 0 },
+ { MVT::f64, 0 },
+ { MVT::i128, 0 }
+ };
+
+ const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
+
+ const valtype_map_s *getValueTypeMapEntry(MVT VT) {
+ const valtype_map_s *retval = 0;
+
+ for (size_t i = 0; i < n_valtype_map; ++i) {
+ if (valtype_map[i].valtype == VT) {
+ retval = valtype_map + i;
+ break;
+ }
+ }
+
+#ifndef NDEBUG
+ if (retval == 0) {
+ cerr << "getValueTypeMapEntry returns NULL for "
+ << VT.getMVTString()
+ << "\n";
+ abort();
+ }
+#endif
+
+ return retval;
+ }
+
+ //! Expand a library call into an actual call DAG node
+ /*!
+ \note
+ This code is taken from SelectionDAGLegalize, since it is not exposed as
+ part of the LLVM SelectionDAG API.
+ */
+
+ SDValue
+ ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
+ bool isSigned, SDValue &Hi, SPUTargetLowering &TLI) {
+ // The input chain to this libcall is the entry node of the function.
+ // Legalizing the call will automatically add the previous call to the
+ // dependence.
+ SDValue InChain = DAG.getEntryNode();
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ MVT ArgVT = Op.getOperand(i).getValueType();
+ const Type *ArgTy = ArgVT.getTypeForMVT();
+ Entry.Node = Op.getOperand(i);
+ Entry.Ty = ArgTy;
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy());
+
+ // Splice the libcall in wherever FindInputOutputChains tells us to.
+ const Type *RetTy = Op.getNode()->getValueType(0).getTypeForMVT();
+ std::pair<SDValue, SDValue> CallInfo =
+ TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+ CallingConv::C, false, Callee, Args, DAG,
+ Op.getDebugLoc());
+
+ return CallInfo.first;
+ }
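+
+ // A local helper: the custom lowerings later in this file use this to
+ // turn an operation the SPU lacks in hardware into an explicit call,
+ // presumably to one of the RTLIB entries configured in the constructor
+ // below.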
+}
+
+SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
+ : TargetLowering(TM),
+ SPUTM(TM)
+{
+ // Fold away setcc operations if possible.
+ setPow2DivIsCheap();
+
+ // Use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+
+ // Set RTLIB libcall names as used by SPU:
+ setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
+
+ // Set up the SPU's register classes:
+ addRegisterClass(MVT::i8, SPU::R8CRegisterClass);
+ addRegisterClass(MVT::i16, SPU::R16CRegisterClass);
+ addRegisterClass(MVT::i32, SPU::R32CRegisterClass);
+ addRegisterClass(MVT::i64, SPU::R64CRegisterClass);
+ addRegisterClass(MVT::f32, SPU::R32FPRegisterClass);
+ addRegisterClass(MVT::f64, SPU::R64FPRegisterClass);
+ addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
+
+ // SPU has no sign or zero extended loads for i1, i8, i16:
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+
+ // SPU constant load actions are custom lowered:
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
+
+ // SPU's loads and stores have to be custom lowered:
+ for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
+ ++sctype) {
+ MVT VT = (MVT::SimpleValueType)sctype;
+
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
+
+ for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
+ MVT StoreVT = (MVT::SimpleValueType) stype;
+ setTruncStoreAction(VT, StoreVT, Expand);
+ }
+ }
+
+ for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
+ ++sctype) {
+ MVT VT = (MVT::SimpleValueType) sctype;
+
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+
+ for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
+ MVT StoreVT = (MVT::SimpleValueType) stype;
+ setTruncStoreAction(VT, StoreVT, Expand);
+ }
+ }
+
+ // Expand the jumptable branches
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+
+ // Custom lower SELECT_CC for most cases, but expand by default
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+ // SPU has no intrinsics for these particular operations:
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+
+ // SPU has no SREM/UREM instructions
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+ // We don't support sin/cos/sqrt/fmod
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FREM , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FREM , MVT::f32, Expand);
+
+ // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
+ // for f32!)
+ setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ // SPU can do rotate right and left, so legalize it... but customize for i8
+ // because instructions don't exist.
+
+ // FIXME: Change from "expand" to appropriate type once ROTR is supported in
+ // .td files.
+ setOperationAction(ISD::ROTR, MVT::i32, Expand /*Legal*/);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand /*Legal*/);
+ setOperationAction(ISD::ROTR, MVT::i8, Expand /*Custom*/);
+
+ setOperationAction(ISD::ROTL, MVT::i32, Legal);
+ setOperationAction(ISD::ROTL, MVT::i16, Legal);
+ setOperationAction(ISD::ROTL, MVT::i8, Custom);
+
+ // SPU has no native version of shift left/right for i8
+ setOperationAction(ISD::SHL, MVT::i8, Custom);
+ setOperationAction(ISD::SRL, MVT::i8, Custom);
+ setOperationAction(ISD::SRA, MVT::i8, Custom);
+
+ // Make these operations legal and handle them during instruction selection:
+ setOperationAction(ISD::SHL, MVT::i64, Legal);
+ setOperationAction(ISD::SRL, MVT::i64, Legal);
+ setOperationAction(ISD::SRA, MVT::i64, Legal);
+
+ // Custom lower i8 multiplications; the i32 and i64 multiplies are marked
+ // legal (the i64 case is caught during instruction selection):
+ setOperationAction(ISD::MUL, MVT::i8, Custom);
+ setOperationAction(ISD::MUL, MVT::i32, Legal);
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+
+ // Need to custom handle (some) common i8, i64 math ops
+ setOperationAction(ISD::ADD, MVT::i8, Custom);
+ setOperationAction(ISD::ADD, MVT::i64, Legal);
+ setOperationAction(ISD::SUB, MVT::i8, Custom);
+ setOperationAction(ISD::SUB, MVT::i64, Legal);
+
+ // SPU does not have BSWAP. It does support CTLZ for i32.
+ // CTPOP has to be custom lowered.
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+ setOperationAction(ISD::CTPOP, MVT::i8, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+
+ setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+
+ setOperationAction(ISD::CTLZ , MVT::i32, Legal);
+
+ // SPU has a version of select that implements (a&~c)|(b&c), just like
+ // select ought to work:
+ setOperationAction(ISD::SELECT, MVT::i8, Legal);
+ setOperationAction(ISD::SELECT, MVT::i16, Legal);
+ setOperationAction(ISD::SELECT, MVT::i32, Legal);
+ setOperationAction(ISD::SELECT, MVT::i64, Legal);
+
+ setOperationAction(ISD::SETCC, MVT::i8, Legal);
+ setOperationAction(ISD::SETCC, MVT::i16, Legal);
+ setOperationAction(ISD::SETCC, MVT::i32, Legal);
+ setOperationAction(ISD::SETCC, MVT::i64, Legal);
+ setOperationAction(ISD::SETCC, MVT::f64, Custom);
+
+ // Custom lower i128 -> i64 truncates
+ setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
+
+ // SPU has a legal FP -> signed INT instruction for f32, but f64 needs to
+ // be expanded into a libcall, hence the custom lowering:
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+
+ // FDIV on SPU requires custom lowering
+ setOperationAction(ISD::FDIV, MVT::f64, Expand); // to libcall
+
+ // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
+ setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal);
+ setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal);
+ setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal);
+ setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal);
+
+ // We cannot sextinreg(i1). Expand to shifts.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ // Support label based line numbers.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+
+ // We want to legalize GlobalAddress and ConstantPool nodes into the
+ // appropriate instructions to materialize the address.
+ for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
+ ++sctype) {
+ MVT VT = (MVT::SimpleValueType)sctype;
+
+ setOperationAction(ISD::GlobalAddress, VT, Custom);
+ setOperationAction(ISD::ConstantPool, VT, Custom);
+ setOperationAction(ISD::JumpTable, VT, Custom);
+ }
+
+ // RET must be custom lowered, to meet ABI requirements
+ setOperationAction(ISD::RET, MVT::Other, Custom);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+
+ // Use the default implementation.
+ setOperationAction(ISD::VAARG , MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Expand);
+
+ // Cell SPU has instructions for converting between i64 and fp.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+
+ // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
+
+ // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+ setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+
+ // First set operation action for all vector types to expand. Then we
+ // will selectively turn on ones that can be effectively codegen'd.
+ addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
+ addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
+ addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
+ addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
+ addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
+ addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
+
+ // "Odd size" vector classes that we're willing to support:
+ addRegisterClass(MVT::v2i32, SPU::VECREGRegisterClass);
+
+ for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ // add/sub are legal for all supported vector VT's.
+ setOperationAction(ISD::ADD, VT, Legal);
+ setOperationAction(ISD::SUB, VT, Legal);
+ // mul is also marked legal; the i64/v2i64 cases are handled during
+ // instruction selection (see MUL64_MARKER).
+ setOperationAction(ISD::MUL, VT, Legal);
+
+ setOperationAction(ISD::AND, VT, Legal);
+ setOperationAction(ISD::OR, VT, Legal);
+ setOperationAction(ISD::XOR, VT, Legal);
+ setOperationAction(ISD::LOAD, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Legal);
+ setOperationAction(ISD::STORE, VT, Legal);
+
+ // These operations need to be expanded:
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+
+ // Custom lower build_vector, constant pool spills, insert and
+ // extract vector elements:
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::ConstantPool, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ }
+
+ setOperationAction(ISD::AND, MVT::v16i8, Custom);
+ setOperationAction(ISD::OR, MVT::v16i8, Custom);
+ setOperationAction(ISD::XOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+
+ setShiftAmountType(MVT::i32);
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+
+ setStackPointerRegisterToSaveRestore(SPU::R1);
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+
+ computeRegisterProperties();
+
+ // Set pre-RA register scheduler default to BURR, which produces slightly
+ // better code than the default (could also be TDRR, but TargetLowering.h
+ // needs a mod to support that model):
+ setSchedulingPreference(SchedulingForRegPressure);
+}
+
+const char *
+SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
+{
+ if (node_names.empty()) {
+ node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
+ node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
+ node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
+ node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
+ node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
+ node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
+ node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
+ node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
+ node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
+ node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
+ node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
+ node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
+ node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
+ node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
+ node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
+ node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL";
+ node_names[(unsigned) SPUISD::VEC_SRL] = "SPUISD::VEC_SRL";
+ node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA";
+ node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
+ node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
+ node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
+ node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
+ "SPUISD::ROTBYTES_LEFT_BITS";
+ node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
+ node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
+ node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
+ node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
+ node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
+ }
+
+ std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
+
+ return ((i != node_names.end()) ? i->second : 0);
+}
+
+//===----------------------------------------------------------------------===//
+// Return the Cell SPU's SETCC result type
+//===----------------------------------------------------------------------===//
+
+MVT SPUTargetLowering::getSetCCResultType(MVT VT) const {
+ // i8, i16 and i32 are valid SETCC result types
+ return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? VT : MVT::i32);
+}
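+
+// For example, a setcc over f64 or any vector type gets an MVT::i32
+// result here, while i8/i16/i32 comparisons keep their own width.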
+
+//===----------------------------------------------------------------------===//
+// Calling convention code:
+//===----------------------------------------------------------------------===//
+
+#include "SPUGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
+/// Custom lower loads for CellSPU
+/*!
+ All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
+ within a 16-byte block, we have to rotate to extract the requested element.
+
+ For extending loads, we also want to ensure that the following sequence is
+ emitted, e.g. for MVT::f32 extending load to MVT::f64:
+
+\verbatim
+%1 v16i8,ch = load
+%2 v16i8,ch = rotate %1
+%3 v4f32,ch = bitconvert %2
+%4 f32 = vec2prefslot %3
+%5 f64 = fp_extend %4
+\endverbatim
+*/
+static SDValue
+LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+ LoadSDNode *LN = cast<LoadSDNode>(Op);
+ SDValue the_chain = LN->getChain();
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ MVT InVT = LN->getMemoryVT();
+ MVT OutVT = Op.getValueType();
+ ISD::LoadExtType ExtType = LN->getExtensionType();
+ unsigned alignment = LN->getAlignment();
+ const valtype_map_s *vtm = getValueTypeMapEntry(InVT);
+ DebugLoc dl = Op.getDebugLoc();
+
+ switch (LN->getAddressingMode()) {
+ case ISD::UNINDEXED: {
+ SDValue result;
+ SDValue basePtr = LN->getBasePtr();
+ SDValue rotate;
+
+ if (alignment == 16) {
+ ConstantSDNode *CN;
+
+ // Special cases for a known aligned load to simplify the base pointer
+ // and the rotation amount:
+ if (basePtr.getOpcode() == ISD::ADD
+ && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
+ // Known offset into basePtr
+ int64_t offset = CN->getSExtValue();
+ int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte);
+
+ if (rotamt < 0)
+ rotamt += 16;
+
+ rotate = DAG.getConstant(rotamt, MVT::i16);
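+
+ // For example, an i16 load (preferred-slot byte 2 in valtype_map) at a
+ // known quadword offset of 6 yields rotamt = 6 - 2 = 4, i.e. rotate the
+ // loaded v16i8 left by 4 bytes.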
+
+ // Simplify the base pointer for this case:
+ basePtr = basePtr.getOperand(0);
+ if ((offset & ~0xf) > 0) {
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+ basePtr,
+ DAG.getConstant((offset & ~0xf), PtrVT));
+ }
+ } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
+ || (basePtr.getOpcode() == SPUISD::IndirectAddr
+ && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
+ && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
+ // Plain aligned a-form address: rotate into preferred slot
+ // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
+ int64_t rotamt = -vtm->prefslot_byte;
+ if (rotamt < 0)
+ rotamt += 16;
+ rotate = DAG.getConstant(rotamt, MVT::i16);
+ } else {
+ // Offset the rotate amount by the basePtr and the preferred slot
+ // byte offset
+ int64_t rotamt = -vtm->prefslot_byte;
+ if (rotamt < 0)
+ rotamt += 16;
+ rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
+ basePtr,
+ DAG.getConstant(rotamt, PtrVT));
+ }
+ } else {
+ // Unaligned load: must be more pessimistic about addressing modes:
+ if (basePtr.getOpcode() == ISD::ADD) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+ SDValue Flag;
+
+ SDValue Op0 = basePtr.getOperand(0);
+ SDValue Op1 = basePtr.getOperand(1);
+
+ if (isa<ConstantSDNode>(Op1)) {
+ // Convert the (add <ptr>, <const>) to an indirect address contained
+ // in a register. Note that this is done because we need to avoid
+ // creating a 0(reg) d-form address due to the SPU's block loads.
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+ the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
+ basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
+ } else {
+ // Convert the (add <arg1>, <arg2>) to an indirect address, which
+ // will likely be lowered as a reg(reg) x-form address.
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+ }
+ } else {
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+ basePtr,
+ DAG.getConstant(0, PtrVT));
+ }
+
+ // Offset the rotate amount by the basePtr and the preferred slot
+ // byte offset
+ rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
+ basePtr,
+ DAG.getConstant(-vtm->prefslot_byte, PtrVT));
+ }
+
+ // Re-emit as a v16i8 vector load
+ result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
+ LN->getSrcValue(), LN->getSrcValueOffset(),
+ LN->isVolatile(), 16);
+
+ // Update the chain
+ the_chain = result.getValue(1);
+
+ // Rotate into the preferred slot:
+ result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8,
+ result.getValue(0), rotate);
+
+ // Convert the loaded v16i8 vector to the appropriate vector type
+ // specified by the operand:
+ MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits()));
+ result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
+ DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result));
+
+ // Handle extending loads by extending the scalar result:
+ if (ExtType == ISD::SEXTLOAD) {
+ result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
+ } else if (ExtType == ISD::ZEXTLOAD) {
+ result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
+ } else if (ExtType == ISD::EXTLOAD) {
+ unsigned NewOpc = ISD::ANY_EXTEND;
+
+ if (OutVT.isFloatingPoint())
+ NewOpc = ISD::FP_EXTEND;
+
+ result = DAG.getNode(NewOpc, dl, OutVT, result);
+ }
+
+ SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
+ SDValue retops[2] = {
+ result,
+ the_chain
+ };
+
+ result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
+ retops, sizeof(retops) / sizeof(retops[0]));
+ return result;
+ }
+ case ISD::PRE_INC:
+ case ISD::PRE_DEC:
+ case ISD::POST_INC:
+ case ISD::POST_DEC:
+ case ISD::LAST_INDEXED_MODE:
+ cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
+ "UNINDEXED\n";
+ cerr << (unsigned) LN->getAddressingMode() << "\n";
+ abort();
+ /*NOTREACHED*/
+ }
+
+ return SDValue();
+}
+
+/// Custom lower stores for CellSPU
+/*!
+ All CellSPU stores are aligned to 16-byte boundaries, so for elements
+ within a 16-byte block, we have to generate a shuffle to insert the
+ requested element into its place, then store the resulting block.
+ */
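+//
+// A sketch of the resulting read-modify-write sequence (simplified, for a
+// scalar store of value X whose containing quadword starts at base):
+//   chunk  = (load v16i8 base)
+//   mask   = (SPUISD::SHUFFLE_MASK insertEltOffs)   ; insertion control word
+//   merged = (SPUISD::SHUFB (scalar_to_vector X), chunk, mask)
+//   (store merged base)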
+static SDValue
+LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+ StoreSDNode *SN = cast<StoreSDNode>(Op);
+ SDValue Value = SN->getValue();
+ MVT VT = Value.getValueType();
+ MVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned alignment = SN->getAlignment();
+
+ switch (SN->getAddressingMode()) {
+ case ISD::UNINDEXED: {
+ // The vector types we really want to use for the 16-byte chunk:
+ MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())),
+ stVecVT = MVT::getVectorVT(StVT, (128 / StVT.getSizeInBits()));
+
+ SDValue alignLoadVec;
+ SDValue basePtr = SN->getBasePtr();
+ SDValue the_chain = SN->getChain();
+ SDValue insertEltOffs;
+
+ if (alignment == 16) {
+ ConstantSDNode *CN;
+
+ // Special cases for a known aligned load to simplify the base pointer
+ // and insertion byte:
+ if (basePtr.getOpcode() == ISD::ADD
+ && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
+ // Known offset into basePtr
+ int64_t offset = CN->getSExtValue();
+
+ // Simplify the base pointer for this case:
+ basePtr = basePtr.getOperand(0);
+ insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+ basePtr,
+ DAG.getConstant((offset & 0xf), PtrVT));
+
+ if ((offset & ~0xf) > 0) {
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+ basePtr,
+ DAG.getConstant((offset & ~0xf), PtrVT));
+ }
+ } else {
+ // Otherwise, assume it's at byte 0 of basePtr
+ insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+ basePtr,
+ DAG.getConstant(0, PtrVT));
+ }
+ } else {
+ // Unaligned store: must be more pessimistic about addressing modes:
+ if (basePtr.getOpcode() == ISD::ADD) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+ SDValue Flag;
+
+ SDValue Op0 = basePtr.getOperand(0);
+ SDValue Op1 = basePtr.getOperand(1);
+
+ if (isa<ConstantSDNode>(Op1)) {
+ // Convert the (add <ptr>, <const>) to an indirect address contained
+ // in a register. Note that this is done because we need to avoid
+ // creating a 0(reg) d-form address due to the SPU's block loads.
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+ the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
+ basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
+ } else {
+ // Convert the (add <arg1>, <arg2>) to an indirect address, which
+ // will likely be lowered as a reg(reg) x-form address.
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+ }
+ } else {
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+ basePtr,
+ DAG.getConstant(0, PtrVT));
+ }
+
+ // Insertion point is solely determined by basePtr's contents
+ insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
+ basePtr,
+ DAG.getConstant(0, PtrVT));
+ }
+
+ // Re-emit as a v16i8 vector load
+ alignLoadVec = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
+ SN->getSrcValue(), SN->getSrcValueOffset(),
+ SN->isVolatile(), 16);
+
+ // Update the chain
+ the_chain = alignLoadVec.getValue(1);
+
+ LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec);
+ SDValue theValue = SN->getValue();
+ SDValue result;
+
+ if (StVT != VT
+ && (theValue.getOpcode() == ISD::AssertZext
+ || theValue.getOpcode() == ISD::AssertSext)) {
+ // Drill down and get the value for zero- and sign-extended
+ // quantities
+ theValue = theValue.getOperand(0);
+ }
+
+ // If the base pointer is already a D-form address, then just create
+ // a new D-form address with a slot offset and the original base pointer.
+ // Otherwise generate a D-form address with the slot offset relative
+ // to the stack pointer, which is always aligned.
+#if !defined(NDEBUG)
+ if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+ cerr << "CellSPU LowerSTORE: basePtr = ";
+ basePtr.getNode()->dump(&DAG);
+ cerr << "\n";
+ }
+#endif
+
+ SDValue insertEltOp =
+ DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs);
+ SDValue vectorizeOp =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue);
+
+ result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
+ vectorizeOp, alignLoadVec,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::v4i32, insertEltOp));
+
+ result = DAG.getStore(the_chain, dl, result, basePtr,
+ LN->getSrcValue(), LN->getSrcValueOffset(),
+ LN->isVolatile(), LN->getAlignment());
+
+#if 0 && !defined(NDEBUG)
+ if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+ const SDValue &currentRoot = DAG.getRoot();
+
+ DAG.setRoot(result);
+ cerr << "------- CellSPU:LowerStore result:\n";
+ DAG.dump();
+ cerr << "-------\n";
+ DAG.setRoot(currentRoot);
+ }
+#endif
+
+ return result;
+ /*NOTREACHED*/
+ }
+ case ISD::PRE_INC:
+ case ISD::PRE_DEC:
+ case ISD::POST_INC:
+ case ISD::POST_DEC:
+ case ISD::LAST_INDEXED_MODE:
+ cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
+ "UNINDEXED\n";
+ cerr << (unsigned) SN->getAddressingMode() << "\n";
+ abort();
+ /*NOTREACHED*/
+ }
+
+ return SDValue();
+}
+
+//! Generate the address of a constant pool entry.
+SDValue
+LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+ MVT PtrVT = Op.getValueType();
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ Constant *C = CP->getConstVal();
+ SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ const TargetMachine &TM = DAG.getTarget();
+ // FIXME there is no actual debug info here
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (TM.getRelocationModel() == Reloc::Static) {
+ if (!ST->usingLargeMem()) {
+ // Just return the SDValue with the constant pool address in it.
+ return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
+ } else {
+ SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
+ SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
+ return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
+ }
+ }
+
+ assert(0 &&
+ "LowerConstantPool: Relocation model other than static"
+ " not supported.");
+ return SDValue();
+}
+
+//! Alternate entry point for generating the address of a constant pool entry
+SDValue
+SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
+ return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
+}
+
+static SDValue
+LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+ MVT PtrVT = Op.getValueType();
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ const TargetMachine &TM = DAG.getTarget();
+ // FIXME there is no actual debug info here
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (TM.getRelocationModel() == Reloc::Static) {
+ if (!ST->usingLargeMem()) {
+ return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
+ } else {
+ SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
+ SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
+ return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
+ }
+ }
+
+ assert(0 &&
+ "LowerJumpTable: Relocation model other than static not supported.");
+ return SDValue();
+}
+
+static SDValue
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+ MVT PtrVT = Op.getValueType();
+ GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+ GlobalValue *GV = GSDN->getGlobal();
+ SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
+ const TargetMachine &TM = DAG.getTarget();
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ // FIXME there is no actual debug info here
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (TM.getRelocationModel() == Reloc::Static) {
+ if (!ST->usingLargeMem()) {
+ return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
+ } else {
+ SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
+ SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
+ return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
+ }
+ } else {
+ cerr << "LowerGlobalAddress: Relocation model other than static not "
+ << "supported.\n";
+ abort();
+ /*NOTREACHED*/
+ }
+
+ return SDValue();
+}
+
+//! Custom lower double precision floating point constants
+static SDValue
+LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ // FIXME there is no actual debug info here
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (VT == MVT::f64) {
+ ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
+
+ assert((FP != 0) &&
+ "LowerConstantFP: Node is not ConstantFPSDNode");
+
+ uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
+ SDValue T = DAG.getConstant(dbits, MVT::i64);
+ SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
+ return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Tvec));
+ }
+
+ return SDValue();
+}
+
+static SDValue
+LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
+{
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ SmallVector<SDValue, 48> ArgValues;
+ SDValue Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ DebugLoc dl = Op.getDebugLoc();
+
+ const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
+ const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
+
+ unsigned ArgOffset = SPUFrameInfo::minStackSize();
+ unsigned ArgRegIdx = 0;
+ unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
+
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ // Add DAG nodes to load the arguments or copy them out of registers.
+ for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues() - 1;
+ ArgNo != e; ++ArgNo) {
+ MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+ unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+ SDValue ArgVal;
+
+ if (ArgRegIdx < NumArgRegs) {
+ const TargetRegisterClass *ArgRegClass;
+
+ switch (ObjectVT.getSimpleVT()) {
+ default: {
+ cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
+ << ObjectVT.getMVTString()
+ << "\n";
+ abort();
+ }
+ case MVT::i8:
+ ArgRegClass = &SPU::R8CRegClass;
+ break;
+ case MVT::i16:
+ ArgRegClass = &SPU::R16CRegClass;
+ break;
+ case MVT::i32:
+ ArgRegClass = &SPU::R32CRegClass;
+ break;
+ case MVT::i64:
+ ArgRegClass = &SPU::R64CRegClass;
+ break;
+ case MVT::i128:
+ ArgRegClass = &SPU::GPRCRegClass;
+ break;
+ case MVT::f32:
+ ArgRegClass = &SPU::R32FPRegClass;
+ break;
+ case MVT::f64:
+ ArgRegClass = &SPU::R64FPRegClass;
+ break;
+ case MVT::v2f64:
+ case MVT::v4f32:
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ ArgRegClass = &SPU::VECREGRegClass;
+ break;
+ }
+
+ unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
+ RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg);
+ ArgVal = DAG.getCopyFromReg(Root, dl, VReg, ObjectVT);
+ ++ArgRegIdx;
+ } else {
+ // We need to load the argument from the stack if we determined above
+ // that we ran out of physical registers of the appropriate type, or if
+ // we are forced onto the stack by varargs.
+ int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Root, FIN, NULL, 0);
+ ArgOffset += StackSlotSize;
+ }
+
+ ArgValues.push_back(ArgVal);
+ // Update the chain
+ Root = ArgVal.getOperand(0);
+ }
+
+ // vararg handling:
+ if (isVarArg) {
+ // unsigned int ptr_size = PtrVT.getSizeInBits() / 8;
+ // We will spill at most (79-3)+1 = 77 remaining argument registers to the stack
+ SmallVector<SDValue, 79-3+1> MemOps;
+
+ // Create a frame slot for each remaining argument register and spill it:
+ for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
+ VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset);
+ SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+ SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8);
+ SDValue Store = DAG.getStore(Root, dl, ArgVal, FIN, NULL, 0);
+ Root = Store.getOperand(0);
+ MemOps.push_back(Store);
+
+ // Increment address by stack slot size for the next stored argument
+ ArgOffset += StackSlotSize;
+ }
+ if (!MemOps.empty())
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+ }
+
+ ArgValues.push_back(Root);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size());
+}
+
+/// isLSAAddress - Return the immediate to use if the specified
+/// value is representable as an LSA address.
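+/// For example, Addr = 0x1fffc (low two bits zero, sign-extends from 18
+/// bits) yields the immediate 0x7fff, while Addr = 0x40000 is rejected.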
+static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C) return 0;
+
+ int Addr = C->getZExtValue();
+ if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
+ (Addr << 14 >> 14) != Addr)
+ return 0; // Top 14 bits have to be sext of immediate.
+
+ return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
+}
+
+static SDValue
+LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ SDValue Chain = TheCall->getChain();
+ SDValue Callee = TheCall->getCallee();
+ unsigned NumOps = TheCall->getNumArgs();
+ unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
+ const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
+ const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ // Handy pointer type
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ // Accumulate how many bytes are to be pushed on the stack, including the
+ // linkage area, and parameter passing area. According to the SPU ABI,
+ // we minimally need space for [LR] and [SP]
+ unsigned NumStackBytes = SPUFrameInfo::minStackSize();
+
+ // Set up a copy of the stack pointer for use loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);
+
+ // Figure out which arguments are going to go in registers, and which in
+ // memory.
+ unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR]
+ unsigned ArgRegIdx = 0;
+
+ // Keep track of registers passing arguments
+ std::vector<std::pair<unsigned, SDValue> > RegsToPass;
+ // And the arguments passed on the stack
+ SmallVector<SDValue, 8> MemOpChains;
+
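+ // Argument assignment is purely positional (a sketch of the loop below):
+ // each argument, scalar or vector, takes the next free register in
+ // ArgRegs; once ArgRegIdx reaches NumArgRegs, the remaining arguments are
+ // stored to the stack at successive StackSlotSize offsets.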
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = TheCall->getArg(i);
+
+ // PtrOff will be used to store the current argument to the stack if a
+ // register cannot be found for it.
+ SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+ switch (Arg.getValueType().getSimpleVT()) {
+ default: assert(0 && "Unexpected ValueType for argument!");
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::i128:
+ if (ArgRegIdx != NumArgRegs) {
+ RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
+ } else {
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+ ArgOffset += StackSlotSize;
+ }
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ if (ArgRegIdx != NumArgRegs) {
+ RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
+ } else {
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+ ArgOffset += StackSlotSize;
+ }
+ break;
+ case MVT::v2i64:
+ case MVT::v2f64:
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ if (ArgRegIdx != NumArgRegs) {
+ RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
+ } else {
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+ ArgOffset += StackSlotSize;
+ }
+ break;
+ }
+ }
+
+ // Update number of stack bytes actually used, insert a call sequence start
+ NumStackBytes = (ArgOffset - SPUFrameInfo::minStackSize());
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
+ true));
+
+ if (!MemOpChains.empty()) {
+ // Adjust the stack pointer for the stack arguments.
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ SmallVector<SDValue, 8> Ops;
+ unsigned CallOpc = SPUISD::CALL;
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ GlobalValue *GV = G->getGlobal();
+ MVT CalleeVT = Callee.getValueType();
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, CalleeVT);
+
+ if (!ST->usingLargeMem()) {
+ // Turn calls to targets that are defined (i.e., have bodies) into
+ // BRSL-style calls; otherwise, external symbols become BRASL calls.
+ // This assumes
+ // that declared/defined symbols are in the same compilation unit and can
+ // be reached through PC-relative jumps.
+ //
+ // NOTE:
+ // This may be an unsafe assumption for JIT and really large compilation
+ // units.
+ if (GV->isDeclaration()) {
+ Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
+ } else {
+ Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
+ }
+ } else {
+ // "Large memory" mode: Turn all calls into indirect calls with a X-form
+ // address pairs:
+ Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ MVT CalleeVT = Callee.getValueType();
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
+ Callee.getValueType());
+
+ if (!ST->usingLargeMem()) {
+ Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
+ } else {
+ Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
+ }
+ } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
+ // If this is an absolute destination address that appears to be a legal
+ // local store address, use the munged value.
+ Callee = SDValue(Dest, 0);
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+ // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag),
+ &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ if (TheCall->getValueType(0) != MVT::Other)
+ InFlag = Chain.getValue(1);
+
+ SDValue ResultVals[3];
+ unsigned NumResults = 0;
+
+ // If the call has results, copy the values out of the ret val registers.
+ switch (TheCall->getValueType(0).getSimpleVT()) {
+ default: assert(0 && "Unexpected ret value!");
+ case MVT::Other: break;
+ case MVT::i32:
+ if (TheCall->getValueType(1) == MVT::i32) {
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R4,
+ MVT::i32, InFlag).getValue(1);
+ ResultVals[0] = Chain.getValue(0);
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
+ Chain.getValue(2)).getValue(1);
+ ResultVals[1] = Chain.getValue(0);
+ NumResults = 2;
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
+ InFlag).getValue(1);
+ ResultVals[0] = Chain.getValue(0);
+ NumResults = 1;
+ }
+ break;
+ case MVT::i64:
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i64,
+ InFlag).getValue(1);
+ ResultVals[0] = Chain.getValue(0);
+ NumResults = 1;
+ break;
+ case MVT::i128:
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i128,
+ InFlag).getValue(1);
+ ResultVals[0] = Chain.getValue(0);
+ NumResults = 1;
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, TheCall->getValueType(0),
+ InFlag).getValue(1);
+ ResultVals[0] = Chain.getValue(0);
+ NumResults = 1;
+ break;
+ case MVT::v2f64:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, TheCall->getValueType(0),
+ InFlag).getValue(1);
+ ResultVals[0] = Chain.getValue(0);
+ NumResults = 1;
+ break;
+ }
+
+ // If the function returns void, just return the chain.
+ if (NumResults == 0)
+ return Chain;
+
+ // Otherwise, merge everything together with a MERGE_VALUES node.
+ ResultVals[NumResults++] = Chain;
+ SDValue Res = DAG.getMergeValues(ResultVals, NumResults, dl);
+ return Res.getValue(Op.getResNo());
+}
+
+static SDValue
+LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ DebugLoc dl = Op.getDebugLoc();
+ CCState CCInfo(CC, isVarArg, TM, RVLocs);
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_SPU);
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ Op.getOperand(i*2+1), Flag);
+ Flag = Chain.getValue(1);
+ }
+
+ if (Flag.getNode())
+ return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+ else
+ return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering:
+//===----------------------------------------------------------------------===//
+
+static ConstantSDNode *
+getVecImm(SDNode *N) {
+ SDValue OpVal(0, 0);
+
+ // Check to see if this buildvec has a single non-undef value in its elements.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (OpVal.getNode() == 0)
+ OpVal = N->getOperand(i);
+ else if (OpVal != N->getOperand(i))
+ return 0;
+ }
+
+ if (OpVal.getNode() != 0) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+ return CN;
+ }
+ }
+
+ return 0;
+}
+
+/// get_vec_u18imm - Test if this vector is a vector filled with the same value
+/// and the value fits into an unsigned 18-bit constant, and if so, return the
+/// constant
+SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType) {
+ if (ConstantSDNode *CN = getVecImm(N)) {
+ uint64_t Value = CN->getZExtValue();
+ if (ValueType == MVT::i64) {
+ uint64_t UValue = CN->getZExtValue();
+ uint32_t upper = uint32_t(UValue >> 32);
+ uint32_t lower = uint32_t(UValue);
+ if (upper != lower)
+ return SDValue();
+ Value = Value >> 32;
+ }
+ if (Value <= 0x3ffff)
+ return DAG.getTargetConstant(Value, ValueType);
+ }
+
+ return SDValue();
+}
+
+/// get_vec_i16imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 16-bit constant, and if so, return the
+/// constant
+SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType) {
+ if (ConstantSDNode *CN = getVecImm(N)) {
+ int64_t Value = CN->getSExtValue();
+ if (ValueType == MVT::i64) {
+ uint64_t UValue = CN->getZExtValue();
+ uint32_t upper = uint32_t(UValue >> 32);
+ uint32_t lower = uint32_t(UValue);
+ if (upper != lower)
+ return SDValue();
+ Value = Value >> 32;
+ }
+ if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
+ return DAG.getTargetConstant(Value, ValueType);
+ }
+ }
+
+ return SDValue();
+}
+
+/// get_vec_i10imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 10-bit constant, and if so, return the
+/// constant
+SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType) {
+ if (ConstantSDNode *CN = getVecImm(N)) {
+ int64_t Value = CN->getSExtValue();
+ if (ValueType == MVT::i64) {
+ uint64_t UValue = CN->getZExtValue();
+ uint32_t upper = uint32_t(UValue >> 32);
+ uint32_t lower = uint32_t(UValue);
+ if (upper != lower)
+ return SDValue();
+ Value = Value >> 32;
+ }
+ if (isS10Constant(Value))
+ return DAG.getTargetConstant(Value, ValueType);
+ }
+
+ return SDValue();
+}
+
+/// get_vec_i8imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 8-bit constant, and if so, return the
+/// constant.
+///
+/// @note: The incoming vector is v16i8 because that's the only way we can load
+/// constant vectors. Thus, we test to see if the upper and lower bytes are the
+/// same value.
+SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType) {
+ if (ConstantSDNode *CN = getVecImm(N)) {
+ int Value = (int) CN->getZExtValue();
+ if (ValueType == MVT::i16
+ && Value <= 0xffff /* truncated from uint64_t */
+ && ((short) Value >> 8) == ((short) Value & 0xff))
+ return DAG.getTargetConstant(Value & 0xff, ValueType);
+ else if (ValueType == MVT::i8
+ && (Value & 0xff) == Value)
+ return DAG.getTargetConstant(Value, ValueType);
+ }
+
+ return SDValue();
+}
+
+/// get_ILHUvec_imm - Test if this vector is a vector filled with the same
+/// value whose lower 16 bits are zero (i.e., an ILHU upper-halfword
+/// immediate), and if so, return the constant shifted right by 16 bits
+SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType) {
+ if (ConstantSDNode *CN = getVecImm(N)) {
+ uint64_t Value = CN->getZExtValue();
+ if ((ValueType == MVT::i32
+ && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
+ || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
+ return DAG.getTargetConstant(Value >> 16, ValueType);
+ }
+
+ return SDValue();
+}
+
+/// get_v4i32_imm - Catch-all for general 32-bit constant vectors
+SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
+ if (ConstantSDNode *CN = getVecImm(N)) {
+ return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
+ }
+
+ return SDValue();
+}
+
+/// get_v2i64_imm - Catch-all for general 64-bit constant vectors
+SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
+ if (ConstantSDNode *CN = getVecImm(N)) {
+ return DAG.getTargetConstant(CN->getZExtValue(), MVT::i64);
+ }
+
+ return SDValue();
+}
+
+//! Lower a BUILD_VECTOR instruction creatively:
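+//
+// A sketch of the strategy: the splat value is recovered with
+// isConstantSplat(); v16i8 splats are widened to v8i16 (8-bit constants have
+// to be expanded to 16 bits), f32/f64 splats are built as the equivalent
+// integer splat and bitcast back, and v2i64 splats are delegated to
+// SPU::LowerV2I64Splat below.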
+SDValue
+LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ MVT EltVT = VT.getVectorElementType();
+ DebugLoc dl = Op.getDebugLoc();
+ BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
+ unsigned minSplatBits = EltVT.getSizeInBits();
+
+ if (minSplatBits < 16)
+ minSplatBits = 16;
+
+ APInt APSplatBits, APSplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+ HasAnyUndefs, minSplatBits)
+ || minSplatBits < SplatBitSize)
+ return SDValue(); // Wasn't a constant vector or splat exceeded min
+
+ uint64_t SplatBits = APSplatBits.getZExtValue();
+
+ switch (VT.getSimpleVT()) {
+ default:
+ cerr << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = "
+ << VT.getMVTString()
+ << "\n";
+ abort();
+ /*NOTREACHED*/
+ case MVT::v4f32: {
+ uint32_t Value32 = uint32_t(SplatBits);
+ assert(SplatBitSize == 32
+ && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
+ // NOTE: pretend the constant is an integer. LLVM won't load FP constants
+ SDValue T = DAG.getConstant(Value32, MVT::i32);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
+ break;
+ }
+ case MVT::v2f64: {
+ uint64_t f64val = uint64_t(SplatBits);
+ assert(SplatBitSize == 64
+ && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
+ // NOTE: pretend the constant is an integer. LLVM won't load FP constants
+ SDValue T = DAG.getConstant(f64val, MVT::i64);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
+ break;
+ }
+ case MVT::v16i8: {
+ // 8-bit constants have to be expanded to 16-bits
+ unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
+ SmallVector<SDValue, 8> Ops;
+
+ Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
+ }
+ case MVT::v8i16: {
+ unsigned short Value16 = SplatBits;
+ SDValue T = DAG.getConstant(Value16, EltVT);
+ SmallVector<SDValue, 8> Ops;
+
+ Ops.assign(8, T);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
+ }
+ case MVT::v4i32: {
+ SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
+ }
+ case MVT::v2i32: {
+ SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T);
+ }
+ case MVT::v2i64: {
+ return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
+ }
+ }
+
+ return SDValue();
+}
+
+/*!
+ Lower a splatted v2i64 constant. If the upper and lower 32-bit halves are
+ equal, emit a v4i32 splat that IL/ILA and friends can match. If both halves
+ are "special" byte patterns (0x0, 0xffffffff, or 0x80000000), lower to a
+ constant pool load. Otherwise, synthesize the value with a SHUFB
+ byte-shuffle of the two replicated halves.
+ */
+SDValue
+SPU::LowerV2I64Splat(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
+ DebugLoc dl) {
+ uint32_t upper = uint32_t(SplatVal >> 32);
+ uint32_t lower = uint32_t(SplatVal);
+
+ if (upper == lower) {
+ // Magic constant that can be matched by IL, ILA, et al.
+ SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ Val, Val, Val, Val));
+ } else {
+ bool upper_special, lower_special;
+
+ // NOTE: This code creates common-case shuffle masks that can be easily
+ // detected as common expressions. It is not attempting to create highly
+ // specialized masks to replace any and all 0's, 0xff's and 0x80's.
+
+ // Detect if the upper or lower half is a special shuffle mask pattern:
+ upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
+ lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
+
+ // Both upper and lower are special, lower to a constant pool load:
+ if (lower_special && upper_special) {
+ SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
+ SplatValCN, SplatValCN);
+ }
+
+ SDValue LO32;
+ SDValue HI32;
+ SmallVector<SDValue, 16> ShufBytes;
+ SDValue Result;
+
+ // Create lower vector if not a special pattern
+ if (!lower_special) {
+ SDValue LO32C = DAG.getConstant(lower, MVT::i32);
+ LO32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ LO32C, LO32C, LO32C, LO32C));
+ }
+
+ // Create upper vector if not a special pattern
+ if (!upper_special) {
+ SDValue HI32C = DAG.getConstant(upper, MVT::i32);
+ HI32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ HI32C, HI32C, HI32C, HI32C));
+ }
+
+ // If either upper or lower are special, then the two input operands are
+ // the same (basically, one of them is a "don't care")
+ if (lower_special)
+ LO32 = HI32;
+ if (upper_special)
+ HI32 = LO32;
+
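+ // Build the 16-byte SHUFB mask one 32-bit word at a time (a sketch of
+ // the encoding): even words take their bytes from the first operand
+ // (HI32), odd words from the second (LO32, byte indices 16 and up);
+ // where a half is "special", SHUFB literal codes are substituted:
+ // 0x80 produces 0x00, 0xc0 produces 0xff, and 0xe0 produces 0x80 in the
+ // leading byte.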
+ for (int i = 0; i < 4; ++i) {
+ uint64_t val = 0;
+ for (int j = 0; j < 4; ++j) {
+ SDValue V;
+ bool process_upper, process_lower;
+ val <<= 8;
+ process_upper = (upper_special && (i & 1) == 0);
+ process_lower = (lower_special && (i & 1) == 1);
+
+ if (process_upper || process_lower) {
+ if ((process_upper && upper == 0)
+ || (process_lower && lower == 0))
+ val |= 0x80;
+ else if ((process_upper && upper == 0xffffffff)
+ || (process_lower && lower == 0xffffffff))
+ val |= 0xc0;
+ else if ((process_upper && upper == 0x80000000)
+ || (process_lower && lower == 0x80000000))
+ val |= (j == 0 ? 0xe0 : 0x80);
+ } else
+ val |= i * 4 + j + ((i & 1) * 16);
+ }
+
+ ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
+ }
+
+ return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ &ShufBytes[0], ShufBytes.size()));
+ }
+}
+
+/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
+/// which the Cell can operate. The code inspects V3 to ascertain whether the
+/// permutation vector, V3, is monotonically increasing with one "exception"
+/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
+/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
+/// In either case, the net result is going to eventually invoke SHUFB to
+/// permute/shuffle the bytes from V1 and V2.
+/// \note
+/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, which
+/// generate a control word for byte/halfword/word insertion. This takes care
+/// of a single element move from V2 into V1.
+/// \note
+/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instruction.
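+///
+/// For example (a sketch for v4i32): mask (0, 1, 6, 3) is monotonic with a
+/// single element taken from V2 (6 maps to V2[2]), so it uses the
+/// SHUFFLE_MASK path; mask (1, 2, 3, 0) is a pure rotation and lowers to
+/// ROTBYTES_LEFT; any other mask falls back to a full 16-byte permute.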
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+
+ // If we have a single element being moved from V1 to V2, this can be handled
+ // using the C*[DX] compute mask instructions, but the vector elements have
+ // to be monotonically increasing with one exception element.
+ MVT VecVT = V1.getValueType();
+ MVT EltVT = VecVT.getVectorElementType();
+ unsigned EltsFromV2 = 0;
+ unsigned V2Elt = 0;
+ unsigned V2EltIdx0 = 0;
+ unsigned CurrElt = 0;
+ unsigned MaxElts = VecVT.getVectorNumElements();
+ unsigned PrevElt = 0;
+ unsigned V0Elt = 0;
+ bool monotonic = true;
+ bool rotate = true;
+
+ if (EltVT == MVT::i8) {
+ V2EltIdx0 = 16;
+ } else if (EltVT == MVT::i16) {
+ V2EltIdx0 = 8;
+ } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
+ V2EltIdx0 = 4;
+ } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
+ V2EltIdx0 = 2;
+ } else
+ assert(0 && "Unhandled vector type in LowerVECTOR_SHUFFLE");
+
+ for (unsigned i = 0; i != MaxElts; ++i) {
+ if (SVN->getMaskElt(i) < 0)
+ continue;
+
+ unsigned SrcElt = SVN->getMaskElt(i);
+
+ if (monotonic) {
+ if (SrcElt >= V2EltIdx0) {
+ if (1 >= (++EltsFromV2)) {
+ V2Elt = (V2EltIdx0 - SrcElt) << 2;
+ }
+ } else if (CurrElt != SrcElt) {
+ monotonic = false;
+ }
+
+ ++CurrElt;
+ }
+
+ if (rotate) {
+ if (PrevElt > 0 && SrcElt < MaxElts) {
+ if ((PrevElt == SrcElt - 1)
+ || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
+ PrevElt = SrcElt;
+ if (SrcElt == 0)
+ V0Elt = i;
+ } else {
+ rotate = false;
+ }
+ } else if (PrevElt == 0) {
+ // First time through, need to keep track of previous element
+ PrevElt = SrcElt;
+ } else {
+ // This isn't a rotation, takes elements from vector 2
+ rotate = false;
+ }
+ }
+ }
+
+ if (EltsFromV2 == 1 && monotonic) {
+ // Compute mask and shuffle
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ // Initialize temporary register to 0
+ SDValue InitTempReg =
+ DAG.getCopyToReg(DAG.getEntryNode(), dl, VReg, DAG.getConstant(0, PtrVT));
+ // Copy register's contents as index in SHUFFLE_MASK:
+ SDValue ShufMaskOp =
+ DAG.getNode(SPUISD::SHUFFLE_MASK, dl, MVT::v4i32,
+ DAG.getTargetConstant(V2Elt, MVT::i32),
+ DAG.getCopyFromReg(InitTempReg, dl, VReg, PtrVT));
+ // Use shuffle mask in SHUFB synthetic instruction:
+ return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
+ ShufMaskOp);
+ } else if (rotate) {
+ int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8;
+
+ return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
+ V1, DAG.getConstant(rotamt, MVT::i16));
+ } else {
+ // Convert the SHUFFLE_VECTOR mask's input element units to the
+ // actual bytes.
+ unsigned BytesPerElement = EltVT.getSizeInBits()/8;
+
+ SmallVector<SDValue, 16> ResultMask;
+ for (unsigned i = 0, e = MaxElts; i != e; ++i) {
+ unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);
+
+ for (unsigned j = 0; j < BytesPerElement; ++j)
+ ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
+ }
+
+ SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
+ &ResultMask[0], ResultMask.size());
+ return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
+ }
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0); // Op0 = the scalar
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Op0.getNode()->getOpcode() == ISD::Constant) {
+ // For a constant, build the appropriate constant vector, which will
+ // eventually simplify to a vector register load.
+
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
+ SmallVector<SDValue, 16> ConstVecValues;
+ MVT VT;
+ size_t n_copies;
+
+ // Create a constant vector:
+ switch (Op.getValueType().getSimpleVT()) {
+ default: assert(0 && "Unexpected constant value type in "
+ "LowerSCALAR_TO_VECTOR");
+ case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
+ case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
+ case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
+ case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
+ case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
+ case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
+ }
+
+ SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
+ for (size_t j = 0; j < n_copies; ++j)
+ ConstVecValues.push_back(CValue);
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
+ &ConstVecValues[0], ConstVecValues.size());
+ } else {
+ // Otherwise, copy the value from one register to another:
+ switch (Op0.getValueType().getSimpleVT()) {
+ default: assert(0 && "Unexpected value type in LowerSCALAR_TO_VECTOR");
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ SDValue N = Op.getOperand(0);
+ SDValue Elt = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue retval;
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+ // Constant argument:
+ int EltNo = (int) C->getZExtValue();
+
+ // sanity checks:
+ if (VT == MVT::i8 && EltNo >= 16)
+ assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
+ else if (VT == MVT::i16 && EltNo >= 8)
+ assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
+ else if (VT == MVT::i32 && EltNo >= 4)
+ assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
+ else if (VT == MVT::i64 && EltNo >= 2)
+ assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
+
+ if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
+ // i32 and i64: Element 0 is the preferred slot
+ return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
+ }
+
+ // Need to generate shuffle mask and extract:
+ int prefslot_begin = -1, prefslot_end = -1;
+ int elt_byte = EltNo * VT.getSizeInBits() / 8;
+
+ switch (VT.getSimpleVT()) {
+ default:
+ assert(false && "Invalid value type!");
+ case MVT::i8: {
+ prefslot_begin = prefslot_end = 3;
+ break;
+ }
+ case MVT::i16: {
+ prefslot_begin = 2; prefslot_end = 3;
+ break;
+ }
+ case MVT::i32:
+ case MVT::f32: {
+ prefslot_begin = 0; prefslot_end = 3;
+ break;
+ }
+ case MVT::i64:
+ case MVT::f64: {
+ prefslot_begin = 0; prefslot_end = 7;
+ break;
+ }
+ }
+
+ assert(prefslot_begin != -1 && prefslot_end != -1 &&
+ "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
+
+ unsigned int ShufBytes[16];
+ for (int i = 0; i < 16; ++i) {
+ // zero-fill the upper part of the preferred slot; don't care about the
+ // other slots:
+ unsigned int mask_val;
+ if (i <= prefslot_end) {
+ mask_val =
+ ((i < prefslot_begin)
+ ? 0x80
+ : elt_byte + (i - prefslot_begin));
+
+ ShufBytes[i] = mask_val;
+ } else
+ ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
+ }
+
+ SDValue ShufMask[4];
+ for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
+ unsigned bidx = i * 4;
+ unsigned int bits = ((ShufBytes[bidx] << 24) |
+ (ShufBytes[bidx+1] << 16) |
+ (ShufBytes[bidx+2] << 8) |
+ ShufBytes[bidx+3]);
+ ShufMask[i] = DAG.getConstant(bits, MVT::i32);
+ }
+
+ SDValue ShufMaskVec =
+ DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
+
+ retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
+ DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
+ N, N, ShufMaskVec));
+ } else {
+ // Variable index: Rotate the requested element into slot 0, then replicate
+ // slot 0 across the vector
+ MVT VecVT = N.getValueType();
+ if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) {
+ cerr << "LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit vector type!\n";
+ abort();
+ }
+
+ // Make life easier by making sure the index is zero-extended to i32
+ if (Elt.getValueType() != MVT::i32)
+ Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
+
+ // Scale the index to a bit/byte shift quantity
+ APInt scaleFactor =
+ APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
+ unsigned scaleShift = scaleFactor.logBase2();
+ SDValue vecShift;
+
+ if (scaleShift > 0) {
+ // Scale the shift factor:
+ Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
+ DAG.getConstant(scaleShift, MVT::i32));
+ }
+
+ vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt);
+
+ // Replicate the bytes starting at byte 0 across the entire vector (for
+ // consistency with the notion of a unified register set)
+ SDValue replicate;
+
+ switch (VT.getSimpleVT()) {
+ default:
+ cerr << "LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector type\n";
+ abort();
+ /*NOTREACHED*/
+ case MVT::i8: {
+ SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
+ replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ factor, factor, factor, factor);
+ break;
+ }
+ case MVT::i16: {
+ SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
+ replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ factor, factor, factor, factor);
+ break;
+ }
+ case MVT::i32:
+ case MVT::f32: {
+ SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
+ replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ factor, factor, factor, factor);
+ break;
+ }
+ case MVT::i64:
+ case MVT::f64: {
+ SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
+ SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
+ replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ loFactor, hiFactor, loFactor, hiFactor);
+ break;
+ }
+ }
+
+ retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
+ DAG.getNode(SPUISD::SHUFB, dl, VecVT,
+ vecShift, vecShift, replicate));
+ }
+
+ return retval;
+}
+
+static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+ SDValue VecOp = Op.getOperand(0);
+ SDValue ValOp = Op.getOperand(1);
+ SDValue IdxOp = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(IdxOp);
+ assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
+
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ // Use $sp ($1) because it's always 16-byte aligned and it's available:
+ SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+ DAG.getRegister(SPU::R1, PtrVT),
+ DAG.getConstant(CN->getSExtValue(), PtrVT));
+ SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, VT, Pointer);
+
+ SDValue result =
+ DAG.getNode(SPUISD::SHUFB, dl, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
+ VecOp,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, ShufMask));
+
+ return result;
+}
+
+static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
+ const TargetLowering &TLI)
+{
+ SDValue N0 = Op.getOperand(0); // Everything has at least one operand
+ DebugLoc dl = Op.getDebugLoc();
+ MVT ShiftVT = TLI.getShiftAmountTy();
+
+ assert(Op.getValueType() == MVT::i8);
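+
+ // Common approach (a sketch): i8 lacks direct support for these operators,
+ // so each case widens the operands to i16 (sign- or zero-extending as the
+ // operator requires), performs the operation at i16, and truncates back to
+ // i8. Rotates additionally replicate the low byte into the high byte so
+ // the bits rotated in are the right ones.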
+ switch (Opc) {
+ default:
+ assert(0 && "Unhandled i8 math operator");
+ /*NOTREACHED*/
+ break;
+ case ISD::ADD: {
+ // 8-bit addition: Promote the arguments up to 16-bits and truncate
+ // the result:
+ SDValue N1 = Op.getOperand(1);
+ N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+ N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+
+ }
+
+ case ISD::SUB: {
+ // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
+ // the result:
+ SDValue N1 = Op.getOperand(1);
+ N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+ N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+ }
+ case ISD::ROTR:
+ case ISD::ROTL: {
+ SDValue N1 = Op.getOperand(1);
+ MVT N1VT = N1.getValueType();
+
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
+ if (!N1VT.bitsEq(ShiftVT)) {
+ unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
+ ? ISD::ZERO_EXTEND
+ : ISD::TRUNCATE;
+ N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+ }
+
+ // Replicate lower 8-bits into upper 8:
+ SDValue ExpandArg =
+ DAG.getNode(ISD::OR, dl, MVT::i16, N0,
+ DAG.getNode(ISD::SHL, dl, MVT::i16,
+ N0, DAG.getConstant(8, MVT::i32)));
+
+ // Truncate back down to i8
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
+ }
+ case ISD::SRL:
+ case ISD::SHL: {
+ SDValue N1 = Op.getOperand(1);
+ MVT N1VT = N1.getValueType();
+
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
+ if (!N1VT.bitsEq(ShiftVT)) {
+ unsigned N1Opc = ISD::ZERO_EXTEND;
+
+ if (N1.getValueType().bitsGT(ShiftVT))
+ N1Opc = ISD::TRUNCATE;
+
+ N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+ }
+ case ISD::SRA: {
+ SDValue N1 = Op.getOperand(1);
+ MVT N1VT = N1.getValueType();
+
+ N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+ if (!N1VT.bitsEq(ShiftVT)) {
+ unsigned N1Opc = ISD::SIGN_EXTEND;
+
+ if (N1VT.bitsGT(ShiftVT))
+ N1Opc = ISD::TRUNCATE;
+ N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+ }
+ case ISD::MUL: {
+ SDValue N1 = Op.getOperand(1);
+
+ N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+ N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
+//! Lower byte immediate operations for v16i8 vectors:
+static SDValue
+LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
+ SDValue ConstVec;
+ SDValue Arg;
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ ConstVec = Op.getOperand(0);
+ Arg = Op.getOperand(1);
+ if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
+ if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
+ ConstVec = ConstVec.getOperand(0);
+ } else {
+ ConstVec = Op.getOperand(1);
+ Arg = Op.getOperand(0);
+ if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
+ ConstVec = ConstVec.getOperand(0);
+ }
+ }
+ }
+
+ if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
+ BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
+ assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");
+
+ APInt APSplatBits, APSplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
+
+ if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+ HasAnyUndefs, minSplatBits)
+ && minSplatBits <= SplatBitSize) {
+ uint64_t SplatBits = APSplatBits.getZExtValue();
+ SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
+
+ SmallVector<SDValue, 16> tcVec;
+ tcVec.assign(16, tc);
+ return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
+ }
+ }
+
+ // These operations (AND, OR, XOR) are legal; they just couldn't be custom
+ // lowered. Return the operation, rather than a null SDValue.
+ return Op;
+}
+
+//! Custom lowering for CTPOP (count population)
+/*!
+ Custom lowering code that counts the number of ones in the input
+ operand. SPU has such an instruction, but it counts the number of
+ ones per byte, which then have to be accumulated.
+*/
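+//
+// Worked example (a sketch for the i32 path below): for input 0x12345678,
+// CNTB yields the per-byte counts 0x02030404; adding the 16-bit-shifted copy
+// gives 0x02030607, adding the 8-bit-shifted copy gives 0x0205090d, and
+// masking with 0xff leaves 0x0d = 13, the population count.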
+static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+ DebugLoc dl = Op.getDebugLoc();
+
+ switch (VT.getSimpleVT()) {
+ default:
+ assert(false && "Invalid value type!");
+ case MVT::i8: {
+ SDValue N = Op.getOperand(0);
+ SDValue Elt0 = DAG.getConstant(0, MVT::i32);
+
+ SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
+ SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
+ }
+
+ case MVT::i16: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
+
+ SDValue N = Op.getOperand(0);
+ SDValue Elt0 = DAG.getConstant(0, MVT::i16);
+ SDValue Mask0 = DAG.getConstant(0x1f, MVT::i16);
+ SDValue Shift1 = DAG.getConstant(8, MVT::i32);
+
+ SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
+ SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
+
+ // CNTB_result becomes the chain to which all of the virtual registers
+ // CNTB_reg, SUM1_reg become associated:
+ SDValue CNTB_result =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
+
+ SDValue CNTB_rescopy =
+ DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
+
+ SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
+
+ return DAG.getNode(ISD::AND, dl, MVT::i16,
+ DAG.getNode(ISD::ADD, dl, MVT::i16,
+ DAG.getNode(ISD::SRL, dl, MVT::i16,
+ Tmp1, Shift1),
+ Tmp1),
+ Mask0);
+ }
+
+ case MVT::i32: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+ unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+
+ SDValue N = Op.getOperand(0);
+ SDValue Elt0 = DAG.getConstant(0, MVT::i32);
+ SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
+ SDValue Shift1 = DAG.getConstant(16, MVT::i32);
+ SDValue Shift2 = DAG.getConstant(8, MVT::i32);
+
+ SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
+ SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
+
+ // CNTB_result becomes the chain to which all of the virtual registers
+ // CNTB_reg, SUM1_reg become associated:
+ SDValue CNTB_result =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
+
+ SDValue CNTB_rescopy =
+ DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
+
+ SDValue Comp1 =
+ DAG.getNode(ISD::SRL, dl, MVT::i32,
+ DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
+ Shift1);
+
+ SDValue Sum1 =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
+ DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
+
+ SDValue Sum1_rescopy =
+ DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
+
+ SDValue Comp2 =
+ DAG.getNode(ISD::SRL, dl, MVT::i32,
+ DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
+ Shift2);
+ SDValue Sum2 =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
+ DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
+
+ return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
+ }
+
+ case MVT::i64:
+ break;
+ }
+
+ return SDValue();
+}
+
+//! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
+/*!
+ f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
+ All conversions to i64 are expanded to a libcall.
+ */
+static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+ SPUTargetLowering &TLI) {
+ MVT OpVT = Op.getValueType();
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = Op0.getValueType();
+
+ if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
+ || OpVT == MVT::i64) {
+ // Convert f32 / f64 to i32 / i64 via libcall.
+ RTLIB::Libcall LC =
+ (Op.getOpcode() == ISD::FP_TO_SINT)
+ ? RTLIB::getFPTOSINT(Op0VT, OpVT)
+ : RTLIB::getFPTOUINT(Op0VT, OpVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-int conversion!");
+ SDValue Dummy;
+ return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
+ }
+
+ return Op;
+}
+
+//! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
+/*!
+ i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
+ All conversions from i64 are expanded to a libcall.
+ */
+static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+ SPUTargetLowering &TLI) {
+ MVT OpVT = Op.getValueType();
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = Op0.getValueType();
+
+ if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
+ || Op0VT == MVT::i64) {
+ // Convert i32, i64 to f64 via libcall:
+ RTLIB::Libcall LC =
+ (Op.getOpcode() == ISD::SINT_TO_FP)
+ ? RTLIB::getSINTTOFP(Op0VT, OpVT)
+ : RTLIB::getUINTTOFP(Op0VT, OpVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected int-to-fp conversion!");
+ SDValue Dummy;
+ return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
+ }
+
+ return Op;
+}
+
+//! Lower ISD::SETCC
+/*!
+ This handles MVT::f64 (double floating point) condition lowering
+ */
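+//
+// The key trick (a sketch): a double whose sign bit is set is rewritten as
+// 0x8000000000000000 - bits(x), which turns IEEE sign-magnitude ordering
+// into ordinary two's-complement ordering, so a plain integer compare
+// implements the floating-point relation. For example, -2.0
+// (0xc000000000000000) maps to (i64)-0x4000000000000000 and -1.0
+// (0xbff0000000000000) to (i64)-0x3ff0000000000000, preserving -2.0 < -1.0.
+// NaNs are handled separately via the explicit SETO/SETUO tests.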
+static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
+ DebugLoc dl = Op.getDebugLoc();
+ assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
+
+ SDValue lhs = Op.getOperand(0);
+ SDValue rhs = Op.getOperand(1);
+ MVT lhsVT = lhs.getValueType();
+ assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::f64\n");
+
+ MVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
+ APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
+ MVT IntVT(MVT::i64);
+
+ // Take advantage of the fact that (truncate (srl arg, 32)) is efficiently
+ // selected to a NOP:
+ SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, lhs);
+ SDValue lhsHi32 =
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRL, dl, IntVT,
+ i64lhs, DAG.getConstant(32, MVT::i32)));
+ SDValue lhsHi32abs =
+ DAG.getNode(ISD::AND, dl, MVT::i32,
+ lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
+ SDValue lhsLo32 =
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
+
+ // SETO and SETUO only use the lhs operand:
+ if (CC->get() == ISD::SETO) {
+ // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
+ // SETUO
+ APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
+ return DAG.getNode(ISD::XOR, dl, ccResultVT,
+ DAG.getSetCC(dl, ccResultVT,
+ lhs, DAG.getConstantFP(0.0, lhsVT),
+ ISD::SETUO),
+ DAG.getConstant(ccResultAllOnes, ccResultVT));
+ } else if (CC->get() == ISD::SETUO) {
+ // Evaluates to true if Op0 is [SQ]NaN
+ return DAG.getNode(ISD::AND, dl, ccResultVT,
+ DAG.getSetCC(dl, ccResultVT,
+ lhsHi32abs,
+ DAG.getConstant(0x7ff00000, MVT::i32),
+ ISD::SETGE),
+ DAG.getSetCC(dl, ccResultVT,
+ lhsLo32,
+ DAG.getConstant(0, MVT::i32),
+ ISD::SETGT));
+ }
+
+ SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, rhs);
+ SDValue rhsHi32 =
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRL, dl, IntVT,
+ i64rhs, DAG.getConstant(32, MVT::i32)));
+
+ // If a value is negative, subtract from the sign magnitude constant:
+ SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
+
+ // Convert the sign-magnitude representation into 2's complement:
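+  // Sketch of why this works: for a negative double, bits(x) is
+  // 0x8000000000000000 + magnitude, so 0x8000000000000000 - bits(x) yields
+  // the two's complement encoding of -magnitude. Positive values are kept
+  // as-is by the selects below, so ordinary signed integer compares then
+  // reproduce the floating-point ordering.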
+ SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
+ lhsHi32, DAG.getConstant(31, MVT::i32));
+ SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
+ SDValue lhsSelect =
+ DAG.getNode(ISD::SELECT, dl, IntVT,
+ lhsSelectMask, lhsSignMag2TC, i64lhs);
+
+ SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
+ rhsHi32, DAG.getConstant(31, MVT::i32));
+ SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
+ SDValue rhsSelect =
+ DAG.getNode(ISD::SELECT, dl, IntVT,
+ rhsSelectMask, rhsSignMag2TC, i64rhs);
+
+ unsigned compareOp;
+
+ switch (CC->get()) {
+ case ISD::SETOEQ:
+ case ISD::SETUEQ:
+ compareOp = ISD::SETEQ; break;
+ case ISD::SETOGT:
+ case ISD::SETUGT:
+ compareOp = ISD::SETGT; break;
+ case ISD::SETOGE:
+ case ISD::SETUGE:
+ compareOp = ISD::SETGE; break;
+ case ISD::SETOLT:
+ case ISD::SETULT:
+ compareOp = ISD::SETLT; break;
+ case ISD::SETOLE:
+ case ISD::SETULE:
+ compareOp = ISD::SETLE; break;
+ case ISD::SETUNE:
+ case ISD::SETONE:
+ compareOp = ISD::SETNE; break;
+ default:
+ cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
+ abort();
+ break;
+ }
+
+ SDValue result =
+ DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
+ (ISD::CondCode) compareOp);
+
+ if ((CC->get() & 0x8) == 0) {
+    // Ordered comparison (the unordered bit, 0x8, of the condition code is
+    // clear): additionally require that neither operand is NaN.
+ SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
+ lhs, DAG.getConstantFP(0.0, MVT::f64),
+ ISD::SETO);
+ SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
+ rhs, DAG.getConstantFP(0.0, MVT::f64),
+ ISD::SETO);
+ SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
+
+ result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
+ }
+
+ return result;
+}
+
+//! Lower ISD::SELECT_CC
+/*!
+ ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
+ SELB instruction.
+
+  \note Need to revisit this in the future: if the code path through the true
+  and false value computations is longer than the latency of a branch (6
+  cycles), then it would be more advantageous to insert a new basic block and
+  branch on the condition. However, this code does not make that assumption,
+  given the simplistic uses so far.
+ */
+
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ MVT VT = Op.getValueType();
+ SDValue lhs = Op.getOperand(0);
+ SDValue rhs = Op.getOperand(1);
+ SDValue trueval = Op.getOperand(2);
+ SDValue falseval = Op.getOperand(3);
+ SDValue condition = Op.getOperand(4);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // NOTE: SELB's arguments: $rA, $rB, $mask
+ //
+ // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
+ // where bits in $mask are 1. CCond will be inverted, having 1s where the
+ // condition was true and 0s where the condition was false. Hence, the
+ // arguments to SELB get reversed.
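+  //
+  // Illustrative pattern (a sketch of what this function emits):
+  //   (select_cc lhs, rhs, trueval, falseval, cc)
+  //     -> (SPUISD::SELB falseval, trueval, (setcc lhs, rhs, cc))
+  // where the all-ones setcc result selects trueval from the $rB position.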
+
+ // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
+ // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
+ // with another "cannot select select_cc" assert:
+
+ SDValue compare = DAG.getNode(ISD::SETCC, dl,
+ TLI.getSetCCResultType(Op.getValueType()),
+ lhs, rhs, condition);
+ return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
+}
+
+//! Custom lower ISD::TRUNCATE
+static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
+{
+ // Type to truncate to
+ MVT VT = Op.getValueType();
+ MVT::SimpleValueType simpleVT = VT.getSimpleVT();
+ MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Type to truncate from
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = Op0.getValueType();
+
+ if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) {
+ // Create shuffle mask, least significant doubleword of quadword
+ unsigned maskHigh = 0x08090a0b;
+ unsigned maskLow = 0x0c0d0e0f;
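+    // A sketch of the byte routing: with SPU's big-endian register layout,
+    // bytes 8..15 of the i128 hold the least significant doubleword, and the
+    // 0x08090a0b/0x0c0d0e0f mask bytes copy them into the preferred slot.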
+ // Use a shuffle to perform the truncation
+ SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ DAG.getConstant(maskHigh, MVT::i32),
+ DAG.getConstant(maskLow, MVT::i32),
+ DAG.getConstant(maskHigh, MVT::i32),
+ DAG.getConstant(maskLow, MVT::i32));
+
+ SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
+ Op0, Op0, shufMask);
+
+ return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
+ }
+
+ return SDValue(); // Leave the truncate unmolested
+}
+
+//! Custom (target-specific) lowering entry point
+/*!
+ This is where LLVM's DAG selection process calls to do target-specific
+ lowering of nodes.
+ */
+SDValue
+SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
+{
+ unsigned Opc = (unsigned) Op.getOpcode();
+ MVT VT = Op.getValueType();
+
+ switch (Opc) {
+ default: {
+ cerr << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
+ cerr << "Op.getOpcode() = " << Opc << "\n";
+ cerr << "*Op.getNode():\n";
+ Op.getNode()->dump();
+ abort();
+ }
+ case ISD::LOAD:
+ case ISD::EXTLOAD:
+ case ISD::SEXTLOAD:
+ case ISD::ZEXTLOAD:
+ return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
+ case ISD::STORE:
+ return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
+ case ISD::ConstantPool:
+ return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
+ case ISD::ConstantFP:
+ return LowerConstantFP(Op, DAG);
+ case ISD::FORMAL_ARGUMENTS:
+ return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex);
+ case ISD::CALL:
+ return LowerCALL(Op, DAG, SPUTM.getSubtargetImpl());
+ case ISD::RET:
+ return LowerRET(Op, DAG, getTargetMachine());
+
+ // i8, i64 math ops:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::ROTR:
+ case ISD::ROTL:
+ case ISD::SRL:
+ case ISD::SHL:
+ case ISD::SRA: {
+ if (VT == MVT::i8)
+ return LowerI8Math(Op, DAG, Opc, *this);
+ break;
+ }
+
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG, *this);
+
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return LowerINT_TO_FP(Op, DAG, *this);
+
+ // Vector-related lowering.
+ case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR:
+ return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE:
+ return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT:
+ return LowerINSERT_VECTOR_ELT(Op, DAG);
+
+ // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return LowerByteImmed(Op, DAG);
+
+ // Vector and i8 multiply:
+ case ISD::MUL:
+ if (VT == MVT::i8)
+      return LowerI8Math(Op, DAG, Opc, *this);
+    break;
+
+ case ISD::CTPOP:
+ return LowerCTPOP(Op, DAG);
+
+ case ISD::SELECT_CC:
+ return LowerSELECT_CC(Op, DAG, *this);
+
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG, *this);
+
+ case ISD::TRUNCATE:
+ return LowerTRUNCATE(Op, DAG);
+ }
+
+ return SDValue();
+}
+
+void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG)
+{
+#if 0
+ unsigned Opc = (unsigned) N->getOpcode();
+ MVT OpVT = N->getValueType(0);
+
+ switch (Opc) {
+ default: {
+ cerr << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
+ cerr << "Op.getOpcode() = " << Opc << "\n";
+ cerr << "*Op.getNode():\n";
+ N->dump();
+ abort();
+ /*NOTREACHED*/
+ }
+ }
+#endif
+
+ /* Otherwise, return unchanged */
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue
+SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
+{
+#if 0
+ TargetMachine &TM = getTargetMachine();
+#endif
+ const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Op0 = N->getOperand(0); // everything has at least one operand
+ MVT NodeVT = N->getValueType(0); // The node's value type
+ MVT Op0VT = Op0.getValueType(); // The first operand's result
+ SDValue Result; // Initially, empty result
+ DebugLoc dl = N->getDebugLoc();
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD: {
+ SDValue Op1 = N->getOperand(1);
+
+ if (Op0.getOpcode() == SPUISD::IndirectAddr
+ || Op1.getOpcode() == SPUISD::IndirectAddr) {
+ // Normalize the operands to reduce repeated code
+ SDValue IndirectArg = Op0, AddArg = Op1;
+
+ if (Op1.getOpcode() == SPUISD::IndirectAddr) {
+ IndirectArg = Op1;
+ AddArg = Op0;
+ }
+
+ if (isa<ConstantSDNode>(AddArg)) {
+ ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
+ SDValue IndOp1 = IndirectArg.getOperand(1);
+
+ if (CN0->isNullValue()) {
+ // (add (SPUindirect <arg>, <arg>), 0) ->
+ // (SPUindirect <arg>, <arg>)
+
+#if !defined(NDEBUG)
+ if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+ cerr << "\n"
+ << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
+ << "With: (SPUindirect <arg>, <arg>)\n";
+ }
+#endif
+
+ return IndirectArg;
+ } else if (isa<ConstantSDNode>(IndOp1)) {
+ // (add (SPUindirect <arg>, <const>), <const>) ->
+ // (SPUindirect <arg>, <const + const>)
+ ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
+ int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
+ SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
+
+#if !defined(NDEBUG)
+ if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+ cerr << "\n"
+ << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
+ << "), " << CN0->getSExtValue() << ")\n"
+ << "With: (SPUindirect <arg>, "
+ << combinedConst << ")\n";
+ }
+#endif
+
+ return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
+ IndirectArg, combinedValue);
+ }
+ }
+ }
+ break;
+ }
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND: {
+ if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
+ // (any_extend (SPUextract_elt0 <arg>)) ->
+ // (SPUextract_elt0 <arg>)
+ // Types must match, however...
+#if !defined(NDEBUG)
+ if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+ cerr << "\nReplace: ";
+ N->dump(&DAG);
+ cerr << "\nWith: ";
+ Op0.getNode()->dump(&DAG);
+ cerr << "\n";
+ }
+#endif
+
+ return Op0;
+ }
+ break;
+ }
+ case SPUISD::IndirectAddr: {
+ if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (CN != 0 && CN->getZExtValue() == 0) {
+ // (SPUindirect (SPUaform <addr>, 0), 0) ->
+ // (SPUaform <addr>, 0)
+
+ DEBUG(cerr << "Replace: ");
+ DEBUG(N->dump(&DAG));
+ DEBUG(cerr << "\nWith: ");
+ DEBUG(Op0.getNode()->dump(&DAG));
+ DEBUG(cerr << "\n");
+
+ return Op0;
+ }
+ } else if (Op0.getOpcode() == ISD::ADD) {
+ SDValue Op1 = N->getOperand(1);
+ if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
+ // (SPUindirect (add <arg>, <arg>), 0) ->
+ // (SPUindirect <arg>, <arg>)
+ if (CN1->isNullValue()) {
+
+#if !defined(NDEBUG)
+ if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+ cerr << "\n"
+ << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
+ << "With: (SPUindirect <arg>, <arg>)\n";
+ }
+#endif
+
+ return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
+ Op0.getOperand(0), Op0.getOperand(1));
+ }
+ }
+ }
+ break;
+ }
+ case SPUISD::SHLQUAD_L_BITS:
+ case SPUISD::SHLQUAD_L_BYTES:
+ case SPUISD::VEC_SHL:
+ case SPUISD::VEC_SRL:
+ case SPUISD::VEC_SRA:
+ case SPUISD::ROTBYTES_LEFT: {
+ SDValue Op1 = N->getOperand(1);
+
+ // Kill degenerate vector shifts:
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
+ if (CN->isNullValue()) {
+ Result = Op0;
+ }
+ }
+ break;
+ }
+ case SPUISD::PREFSLOT2VEC: {
+ switch (Op0.getOpcode()) {
+ default:
+ break;
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND: {
+ // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
+ // <arg>
+ // but only if the SPUprefslot2vec and <arg> types match.
+ SDValue Op00 = Op0.getOperand(0);
+ if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
+ SDValue Op000 = Op00.getOperand(0);
+ if (Op000.getValueType() == NodeVT) {
+ Result = Op000;
+ }
+ }
+ break;
+ }
+ case SPUISD::VEC2PREFSLOT: {
+ // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
+ // <arg>
+ Result = Op0.getOperand(0);
+ break;
+ }
+ }
+ break;
+ }
+ }
+
+ // Otherwise, return unchanged.
+#ifndef NDEBUG
+ if (Result.getNode()) {
+ DEBUG(cerr << "\nReplace.SPU: ");
+ DEBUG(N->dump(&DAG));
+ DEBUG(cerr << "\nWith: ");
+ DEBUG(Result.getNode()->dump(&DAG));
+ DEBUG(cerr << "\n");
+ }
+#endif
+
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+SPUTargetLowering::ConstraintType
+SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
+ if (ConstraintLetter.size() == 1) {
+ switch (ConstraintLetter[0]) {
+ default: break;
+ case 'b':
+ case 'r':
+ case 'f':
+ case 'v':
+ case 'y':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(ConstraintLetter);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const
+{
+ if (Constraint.size() == 1) {
+    // Constraint letters (borrowed from the GCC RS6000 backend)
+ switch (Constraint[0]) {
+ case 'b': // R1-R31
+ case 'r': // R0-R31
+ if (VT == MVT::i64)
+ return std::make_pair(0U, SPU::R64CRegisterClass);
+ return std::make_pair(0U, SPU::R32CRegisterClass);
+ case 'f':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, SPU::R32FPRegisterClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, SPU::R64FPRegisterClass);
+ break;
+ case 'v':
+ return std::make_pair(0U, SPU::GPRCRegisterClass);
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+//! Compute used/known bits for a SPU operand
+void
+SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth ) const {
+#if 0
+ const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
+
+ switch (Op.getOpcode()) {
+ default:
+ // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+ break;
+ case CALL:
+ case SHUFB:
+ case SHUFFLE_MASK:
+ case CNTB:
+ case SPUISD::PREFSLOT2VEC:
+ case SPUISD::LDRESULT:
+ case SPUISD::VEC2PREFSLOT:
+ case SPUISD::SHLQUAD_L_BITS:
+ case SPUISD::SHLQUAD_L_BYTES:
+ case SPUISD::VEC_SHL:
+ case SPUISD::VEC_SRL:
+ case SPUISD::VEC_SRA:
+ case SPUISD::VEC_ROTL:
+ case SPUISD::VEC_ROTR:
+ case SPUISD::ROTBYTES_LEFT:
+ case SPUISD::SELECT_MASK:
+ case SPUISD::SELB:
+ }
+#endif
+}
+
+unsigned
+SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+ unsigned Depth) const {
+ switch (Op.getOpcode()) {
+ default:
+ return 1;
+
+ case ISD::SETCC: {
+ MVT VT = Op.getValueType();
+
+ if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
+ VT = MVT::i32;
+ }
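+    // A SETCC result on SPU is all-zeros or all-ones, so every bit of the
+    // result replicates the sign bit.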
+ return VT.getSizeInBits();
+ }
+ }
+}
+
+// LowerAsmOperandForConstraint
+void
+SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ char ConstraintLetter,
+ bool hasMemory,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ // Default, for the time being, to the base class handler
+ TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, hasMemory,
+ Ops, DAG);
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode.
+bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
+ const Type *Ty) const {
+  // SPU's local store is 256K, so valid offsets must lie within
+  // (-2^18, 2^18 - 1):
+ return (V > -(1 << 18) && V < (1 << 18) - 1);
+}
+
+bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
+ return false;
+}
+
+bool
+SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The SPU target isn't yet aware of offsets.
+ return false;
+}
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
new file mode 100644
index 0000000..866c632
--- /dev/null
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -0,0 +1,154 @@
+//===-- SPUISelLowering.h - Cell SPU DAG Lowering Interface -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Cell SPU uses to lower LLVM code into
+// a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_ISELLOWERING_H
+#define SPU_ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "SPU.h"
+
+namespace llvm {
+ namespace SPUISD {
+ enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ // Pseudo instructions:
+ RET_FLAG, ///< Return with flag, matched by bi instruction
+
+ Hi, ///< High address component (upper 16)
+ Lo, ///< Low address component (lower 16)
+ PCRelAddr, ///< Program counter relative address
+ AFormAddr, ///< A-form address (local store)
+ IndirectAddr, ///< D-Form "imm($r)" and X-form "$r($r)"
+
+ LDRESULT, ///< Load result (value, chain)
+ CALL, ///< CALL instruction
+ SHUFB, ///< Vector shuffle (permute)
+ SHUFFLE_MASK, ///< Shuffle mask
+      CNTB,             ///< Count ones in bytes (per-byte population count)
+ PREFSLOT2VEC, ///< Promote scalar->vector
+ VEC2PREFSLOT, ///< Extract element 0
+ SHLQUAD_L_BITS, ///< Rotate quad left, by bits
+ SHLQUAD_L_BYTES, ///< Rotate quad left, by bytes
+ VEC_SHL, ///< Vector shift left
+ VEC_SRL, ///< Vector shift right (logical)
+ VEC_SRA, ///< Vector shift right (arithmetic)
+ VEC_ROTL, ///< Vector rotate left
+ VEC_ROTR, ///< Vector rotate right
+ ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI)
+ ROTBYTES_LEFT_BITS, ///< Rotate bytes left by bit shift count
+ SELECT_MASK, ///< Select Mask (FSM, FSMB, FSMH, FSMBI)
+ SELB, ///< Select bits -> (b & mask) | (a & ~mask)
+ // Markers: These aren't used to generate target-dependent nodes, but
+ // are used during instruction selection.
+ ADD64_MARKER, ///< i64 addition marker
+ SUB64_MARKER, ///< i64 subtraction marker
+ MUL64_MARKER, ///< i64 multiply marker
+ LAST_SPUISD ///< Last user-defined instruction
+ };
+ }
+
+ //! Utility functions specific to CellSPU:
+ namespace SPU {
+ SDValue get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType);
+ SDValue get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType);
+ SDValue get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType);
+ SDValue get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType);
+ SDValue get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
+ MVT ValueType);
+ SDValue get_v4i32_imm(SDNode *N, SelectionDAG &DAG);
+ SDValue get_v2i64_imm(SDNode *N, SelectionDAG &DAG);
+
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG,
+ const SPUTargetMachine &TM);
+ //! Simplify a MVT::v2i64 constant splat to CellSPU-ready form
+ SDValue LowerV2I64Splat(MVT OpVT, SelectionDAG &DAG, uint64_t splat,
+ DebugLoc dl);
+ }
+
+ class SPUTargetMachine; // forward dec'l.
+
+ class SPUTargetLowering :
+ public TargetLowering
+ {
+ int VarArgsFrameIndex; // FrameIndex for start of varargs area.
+ int ReturnAddrIndex; // FrameIndex for return slot.
+ SPUTargetMachine &SPUTM;
+
+ public:
+ //! The venerable constructor
+ /*!
+ This is where the CellSPU backend sets operation handling (i.e., legal,
+ custom, expand or promote.)
+ */
+ SPUTargetLowering(SPUTargetMachine &TM);
+
+ //! Get the target machine
+ SPUTargetMachine &getSPUTargetMachine() {
+ return SPUTM;
+ }
+
+ /// getTargetNodeName() - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getSetCCResultType - Return the ValueType for ISD::SETCC
+ virtual MVT getSetCCResultType(MVT VT) const;
+
+ //! Custom lowering hooks
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ //! Custom lowering hook for nodes with illegal result types.
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG);
+
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ unsigned Depth = 0) const;
+
+ ConstraintType getConstraintType(const std::string &ConstraintLetter) const;
+
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
+ bool hasMemory,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ /// isLegalAddressImmediate - Return true if the integer value can be used
+ /// as the offset of the target addressing mode.
+ virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
+ virtual bool isLegalAddressImmediate(GlobalValue *) const;
+
+ virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+ };
+}
+
+#endif
diff --git a/lib/Target/CellSPU/SPUInstrBuilder.h b/lib/Target/CellSPU/SPUInstrBuilder.h
new file mode 100644
index 0000000..5e268f8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrBuilder.h
@@ -0,0 +1,43 @@
+//==-- SPUInstrBuilder.h - Aides for building Cell SPU insts -----*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_INSTRBUILDER_H
+#define SPU_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. This
+/// reference initially uses the FrameIndex as a placeholder for the base
+/// register until it is resolved. This allows a constant offset to be
+/// specified as well.
+///
+inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+ bool mem = true) {
+ if (mem)
+ return MIB.addImm(Offset).addFrameIndex(FI);
+ else
+ return MIB.addFrameIndex(FI).addImm(Offset);
+}
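+
+// Illustrative use (a sketch; the opcode and registers are placeholders):
+//
+//   addFrameReference(BuildMI(MBB, MI, DL, TII.get(SPU::STQDr32))
+//                       .addReg(SrcReg), FrameIdx);
+//
+// appends the operands Imm(0), FrameIndex(FI) to the store, with the frame
+// index standing in for the base register until it is resolved.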
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/CellSPU/SPUInstrFormats.td b/lib/Target/CellSPU/SPUInstrFormats.td
new file mode 100644
index 0000000..21bc275
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrFormats.td
@@ -0,0 +1,298 @@
+//==== SPUInstrFormats.td - Cell SPU Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Cell SPU instruction formats. Note that these are notationally similar to
+// PowerPC, like "A-Form". But the sizes of operands and fields differ.
+
+// This was kiped from the PPC instruction formats (seemed like a good idea...)
+
+class SPUInstr<dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+ : Instruction {
+ field bits<32> Inst;
+
+ let Namespace = "SPU";
+ let OutOperandList = OOL;
+ let InOperandList = IOL;
+ let AsmString = asmstr;
+ let Itinerary = itin;
+}
+
+// RR Format
+class RRForm<bits<11> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : SPUInstr<OOL, IOL, asmstr, itin> {
+ bits<7> RA;
+ bits<7> RB;
+ bits<7> RT;
+
+ let Pattern = pattern;
+
+ let Inst{0-10} = opcode;
+ let Inst{11-17} = RB;
+ let Inst{18-24} = RA;
+ let Inst{25-31} = RT;
+}
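+
+// Illustrative (hypothetical) use of RRForm. The opcode bits, itinerary
+// class and pattern below are placeholders, not a real SPU instruction:
+//
+//   def EXAMPLE : RRForm<0b00000000000, (outs R32C:$rT),
+//                        (ins R32C:$rA, R32C:$rB),
+//                        "example\t$rT, $rA, $rB", IntegerOp,
+//                        [(set R32C:$rT, (add R32C:$rA, R32C:$rB))]>;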
+
+let RB = 0 in {
+  // RR Format, where RB is zeroed (don't care):
+ class RRForm_1<bits<11> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : RRForm<opcode, OOL, IOL, asmstr, itin, pattern>
+ { }
+
+ let RA = 0 in {
+    // RR Format, where RA and RB are zeroed (don't care):
+ // Used for reads from status control registers (see FPSCRRr32)
+ class RRForm_2<bits<11> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : RRForm<opcode, OOL, IOL, asmstr, itin, pattern>
+ { }
+ }
+}
+
+let RT = 0 in {
+ // RR Format, where RT is zeroed (don't care), or as the instruction handbook
+ // says, "RT is a false target." Used in "Halt if" instructions
+ class RRForm_3<bits<11> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : RRForm<opcode, OOL, IOL, asmstr, itin, pattern>
+ { }
+}
+
+// RRR Format
+class RRRForm<bits<4> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : SPUInstr<OOL, IOL, asmstr, itin>
+{
+ bits<7> RA;
+ bits<7> RB;
+ bits<7> RC;
+ bits<7> RT;
+
+ let Pattern = pattern;
+
+ let Inst{0-3} = opcode;
+ let Inst{4-10} = RT;
+ let Inst{11-17} = RB;
+ let Inst{18-24} = RA;
+ let Inst{25-31} = RC;
+}
+
+// RI7 Format
+class RI7Form<bits<11> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : SPUInstr<OOL, IOL, asmstr, itin>
+{
+ bits<7> i7;
+ bits<7> RA;
+ bits<7> RT;
+
+ let Pattern = pattern;
+
+ let Inst{0-10} = opcode;
+ let Inst{11-17} = i7;
+ let Inst{18-24} = RA;
+ let Inst{25-31} = RT;
+}
+
+// CVTIntFp Format
+class CVTIntFPForm<bits<10> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : SPUInstr<OOL, IOL, asmstr, itin>
+{
+ bits<7> RA;
+ bits<7> RT;
+
+ let Pattern = pattern;
+
+ let Inst{0-9} = opcode;
+ let Inst{10-17} = 0;
+ let Inst{18-24} = RA;
+ let Inst{25-31} = RT;
+}
+
+let RA = 0 in {
+ class BICondForm<bits<11> opcode, dag OOL, dag IOL, string asmstr, list<dag> pattern>
+ : RRForm<opcode, OOL, IOL, asmstr, BranchResolv, pattern>
+ { }
+
+ let RT = 0 in {
+ // Branch instruction format (without D/E flag settings)
+ class BRForm<bits<11> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : RRForm<opcode, OOL, IOL, asmstr, itin, pattern>
+ { }
+
+ class BIForm<bits<11> opcode, string asmstr, list<dag> pattern>
+ : RRForm<opcode, (outs), (ins R32C:$func), asmstr, BranchResolv,
+ pattern>
+ { }
+
+ let RB = 0 in {
+ // Return instruction (bi, branch indirect), RA is zero (LR):
+ class RETForm<string asmstr, list<dag> pattern>
+ : BRForm<0b00010101100, (outs), (ins), asmstr, BranchResolv,
+ pattern>
+ { }
+ }
+ }
+}
+
+// Branch indirect external data forms:
+class BISLEDForm<bits<2> DE_flag, string asmstr, list<dag> pattern>
+ : SPUInstr<(outs), (ins indcalltarget:$func), asmstr, BranchResolv>
+{
+ bits<7> Rcalldest;
+
+ let Pattern = pattern;
+
+ let Inst{0-10} = 0b11010101100;
+ let Inst{11} = 0;
+ let Inst{12-13} = DE_flag;
+ let Inst{14-17} = 0b0000;
+ let Inst{18-24} = Rcalldest;
+ let Inst{25-31} = 0b0000000;
+}
+
+// RI10 Format
+class RI10Form<bits<8> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : SPUInstr<OOL, IOL, asmstr, itin>
+{
+ bits<10> i10;
+ bits<7> RA;
+ bits<7> RT;
+
+ let Pattern = pattern;
+
+ let Inst{0-7} = opcode;
+ let Inst{8-17} = i10;
+ let Inst{18-24} = RA;
+ let Inst{25-31} = RT;
+}
+
+// RI10 Format, where the constant is zero (or effectively ignored by the
+// SPU)
+let i10 = 0 in {
+ class RI10Form_1<bits<8> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : RI10Form<opcode, OOL, IOL, asmstr, itin, pattern>
+ { }
+}
+
+// RI10 Format, where RT is ignored.
+// This format is used primarily by the Halt If ... Immediate set of
+// instructions
+let RT = 0 in {
+ class RI10Form_2<bits<8> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : RI10Form<opcode, OOL, IOL, asmstr, itin, pattern>
+ { }
+}
+
+// RI16 Format
+class RI16Form<bits<9> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : SPUInstr<OOL, IOL, asmstr, itin>
+{
+ bits<16> i16;
+ bits<7> RT;
+
+ let Pattern = pattern;
+
+ let Inst{0-8} = opcode;
+ let Inst{9-24} = i16;
+ let Inst{25-31} = RT;
+}
+
+// Specialized version of the RI16 Format for unconditional branch relative and
+// branch absolute, branch and set link. Note that for branch and set link, the
+// link register doesn't have to be $lr, but this is actually hard coded into
+// the instruction pattern.
+
+let RT = 0 in {
+ class UncondBranch<bits<9> opcode, dag OOL, dag IOL, string asmstr,
+ list<dag> pattern>
+ : RI16Form<opcode, OOL, IOL, asmstr, BranchResolv, pattern>
+ { }
+
+ class BranchSetLink<bits<9> opcode, dag OOL, dag IOL, string asmstr,
+ list<dag> pattern>
+ : RI16Form<opcode, OOL, IOL, asmstr, BranchResolv, pattern>
+ { }
+}
+
+//===----------------------------------------------------------------------===//
+// Specialized versions of RI16:
+//===----------------------------------------------------------------------===//
+
+// RI18 Format
+class RI18Form<bits<7> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : SPUInstr<OOL, IOL, asmstr, itin>
+{
+ bits<18> i18;
+ bits<7> RT;
+
+ let Pattern = pattern;
+
+ let Inst{0-6} = opcode;
+ let Inst{7-24} = i18;
+ let Inst{25-31} = RT;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats for intrinsics:
+//===----------------------------------------------------------------------===//
+
+// RI10 Format for v8i16 intrinsics
+class RI10_Int_v8i16<bits<8> opcode, string opc, InstrItinClass itin,
+ Intrinsic IntID> :
+ RI10Form<opcode, (outs VECREG:$rT), (ins s10imm:$val, VECREG:$rA),
+ !strconcat(opc, " $rT, $rA, $val"), itin,
+ [(set (v8i16 VECREG:$rT), (IntID (v8i16 VECREG:$rA),
+ i16ImmSExt10:$val))] >;
+
+class RI10_Int_v4i32<bits<8> opcode, string opc, InstrItinClass itin,
+ Intrinsic IntID> :
+ RI10Form<opcode, (outs VECREG:$rT), (ins s10imm:$val, VECREG:$rA),
+ !strconcat(opc, " $rT, $rA, $val"), itin,
+ [(set (v4i32 VECREG:$rT), (IntID (v4i32 VECREG:$rA),
+ i32ImmSExt10:$val))] >;
+
+// RR Format for v8i16 intrinsics
+class RR_Int_v8i16<bits<11> opcode, string opc, InstrItinClass itin,
+ Intrinsic IntID> :
+ RRForm<opcode, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ !strconcat(opc, " $rT, $rA, $rB"), itin,
+ [(set (v8i16 VECREG:$rT), (IntID (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))] >;
+
+// RR Format for v4i32 intrinsics
+class RR_Int_v4i32<bits<11> opcode, string opc, InstrItinClass itin,
+ Intrinsic IntID> :
+ RRForm<opcode, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ !strconcat(opc, " $rT, $rA, $rB"), itin,
+ [(set (v4i32 VECREG:$rT), (IntID (v4i32 VECREG:$rA),
+ (v4i32 VECREG:$rB)))] >;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions, like call frames:
+//===----------------------------------------------------------------------===//
+
+class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+ : SPUInstr<OOL, IOL, asmstr, NoItinerary> {
+ let OutOperandList = OOL;
+ let InOperandList = IOL;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Inst{31-0} = 0;
+}
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
new file mode 100644
index 0000000..4af995a
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -0,0 +1,693 @@
+//===- SPUInstrInfo.cpp - Cell SPU Instruction Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Cell SPU implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPURegisterNames.h"
+#include "SPUInstrInfo.h"
+#include "SPUInstrBuilder.h"
+#include "SPUTargetMachine.h"
+#include "SPUGenInstrInfo.inc"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace {
+ //! Predicate for an unconditional branch instruction
+ inline bool isUncondBranch(const MachineInstr *I) {
+ unsigned opc = I->getOpcode();
+
+ return (opc == SPU::BR
+ || opc == SPU::BRA
+ || opc == SPU::BI);
+ }
+
+ //! Predicate for a conditional branch instruction
+ inline bool isCondBranch(const MachineInstr *I) {
+ unsigned opc = I->getOpcode();
+
+ return (opc == SPU::BRNZr32
+ || opc == SPU::BRNZv4i32
+ || opc == SPU::BRZr32
+ || opc == SPU::BRZv4i32
+ || opc == SPU::BRHNZr16
+ || opc == SPU::BRHNZv8i16
+ || opc == SPU::BRHZr16
+ || opc == SPU::BRHZv8i16);
+ }
+}
+
+SPUInstrInfo::SPUInstrInfo(SPUTargetMachine &tm)
+ : TargetInstrInfoImpl(SPUInsts, sizeof(SPUInsts)/sizeof(SPUInsts[0])),
+ TM(tm),
+ RI(*TM.getSubtargetImpl(), *this)
+{ /* NOP */ }
+
+bool
+SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned& sourceReg,
+ unsigned& destReg,
+ unsigned& SrcSR, unsigned& DstSR) const {
+ SrcSR = DstSR = 0; // No sub-registers.
+
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case SPU::ORIv4i32:
+ case SPU::ORIr32:
+ case SPU::ORHIv8i16:
+ case SPU::ORHIr16:
+ case SPU::ORHIi8i16:
+ case SPU::ORBIv16i8:
+ case SPU::ORBIr8:
+ case SPU::ORIi16i32:
+ case SPU::ORIi8i32:
+ case SPU::AHIvec:
+ case SPU::AHIr16:
+ case SPU::AIv4i32:
+ assert(MI.getNumOperands() == 3 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isImm() &&
+ "invalid SPU ORI/ORHI/ORBI/AHI/AI/SFI/SFHI instruction!");
+ if (MI.getOperand(2).getImm() == 0) {
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ break;
+ case SPU::AIr32:
+ assert(MI.getNumOperands() == 3 &&
+ "wrong number of operands to AIr32");
+ if (MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ (MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0)) {
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ break;
+ case SPU::LRr8:
+ case SPU::LRr16:
+ case SPU::LRr32:
+ case SPU::LRf32:
+ case SPU::LRr64:
+ case SPU::LRf64:
+ case SPU::LRr128:
+ case SPU::LRv16i8:
+ case SPU::LRv8i16:
+ case SPU::LRv4i32:
+ case SPU::LRv4f32:
+ case SPU::LRv2i64:
+ case SPU::LRv2f64:
+ case SPU::ORv16i8_i8:
+ case SPU::ORv8i16_i16:
+ case SPU::ORv4i32_i32:
+ case SPU::ORv2i64_i64:
+ case SPU::ORv4f32_f32:
+ case SPU::ORv2f64_f64:
+ case SPU::ORi8_v16i8:
+ case SPU::ORi16_v8i16:
+ case SPU::ORi32_v4i32:
+ case SPU::ORi64_v2i64:
+ case SPU::ORf32_v4f32:
+ case SPU::ORf64_v2f64:
+/*
+ case SPU::ORi128_r64:
+ case SPU::ORi128_f64:
+ case SPU::ORi128_r32:
+ case SPU::ORi128_f32:
+ case SPU::ORi128_r16:
+ case SPU::ORi128_r8:
+*/
+ case SPU::ORi128_vec:
+/*
+ case SPU::ORr64_i128:
+ case SPU::ORf64_i128:
+ case SPU::ORr32_i128:
+ case SPU::ORf32_i128:
+ case SPU::ORr16_i128:
+ case SPU::ORr8_i128:
+*/
+ case SPU::ORvec_i128:
+/*
+ case SPU::ORr16_r32:
+ case SPU::ORr8_r32:
+ case SPU::ORf32_r32:
+ case SPU::ORr32_f32:
+ case SPU::ORr32_r16:
+ case SPU::ORr32_r8:
+ case SPU::ORr16_r64:
+ case SPU::ORr8_r64:
+ case SPU::ORr64_r16:
+ case SPU::ORr64_r8:
+*/
+ case SPU::ORr64_r32:
+ case SPU::ORr32_r64:
+ case SPU::ORf32_r32:
+ case SPU::ORr32_f32:
+ case SPU::ORf64_r64:
+ case SPU::ORr64_f64: {
+ assert(MI.getNumOperands() == 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "invalid SPU OR<type>_<vec> or LR instruction!");
+ if (MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ break;
+ }
+ case SPU::ORv16i8:
+ case SPU::ORv8i16:
+ case SPU::ORv4i32:
+ case SPU::ORv2i64:
+ case SPU::ORr8:
+ case SPU::ORr16:
+ case SPU::ORr32:
+ case SPU::ORr64:
+ case SPU::ORr128:
+ case SPU::ORf32:
+ case SPU::ORf64:
+ assert(MI.getNumOperands() == 3 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isReg() &&
+ "invalid SPU OR(vec|r32|r64|gprc) instruction!");
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+unsigned
+SPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case SPU::LQDv16i8:
+ case SPU::LQDv8i16:
+ case SPU::LQDv4i32:
+ case SPU::LQDv4f32:
+ case SPU::LQDv2f64:
+ case SPU::LQDr128:
+ case SPU::LQDr64:
+ case SPU::LQDr32:
+ case SPU::LQDr16: {
+ const MachineOperand MOp1 = MI->getOperand(1);
+ const MachineOperand MOp2 = MI->getOperand(2);
+ if (MOp1.isImm() && MOp2.isFI()) {
+ FrameIndex = MOp2.getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ }
+ return 0;
+}
+
+unsigned
+SPUInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case SPU::STQDv16i8:
+ case SPU::STQDv8i16:
+ case SPU::STQDv4i32:
+ case SPU::STQDv4f32:
+ case SPU::STQDv2f64:
+ case SPU::STQDr128:
+ case SPU::STQDr64:
+ case SPU::STQDr32:
+ case SPU::STQDr16:
+ case SPU::STQDr8: {
+ const MachineOperand MOp1 = MI->getOperand(1);
+ const MachineOperand MOp2 = MI->getOperand(2);
+ if (MOp1.isImm() && MOp2.isFI()) {
+ FrameIndex = MOp2.getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ }
+ return 0;
+}
+
+bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const
+{
+ // We support cross register class moves for our aliases, such as R3 in any
+ // reg class to any other reg class containing R3. This is required because
+ // we instruction select bitconvert i64 -> f64 as a noop for example, so our
+ // types have no specific meaning.
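+  //
+  // A sketch of the consequence: the move opcode is chosen from DestRC
+  // alone, so e.g. any copy into an R64FP register emits LRf64, whatever
+  // register class SrcReg came from.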
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (DestRC == SPU::R8CRegisterClass) {
+ BuildMI(MBB, MI, DL, get(SPU::LRr8), DestReg).addReg(SrcReg);
+ } else if (DestRC == SPU::R16CRegisterClass) {
+ BuildMI(MBB, MI, DL, get(SPU::LRr16), DestReg).addReg(SrcReg);
+ } else if (DestRC == SPU::R32CRegisterClass) {
+ BuildMI(MBB, MI, DL, get(SPU::LRr32), DestReg).addReg(SrcReg);
+ } else if (DestRC == SPU::R32FPRegisterClass) {
+ BuildMI(MBB, MI, DL, get(SPU::LRf32), DestReg).addReg(SrcReg);
+ } else if (DestRC == SPU::R64CRegisterClass) {
+ BuildMI(MBB, MI, DL, get(SPU::LRr64), DestReg).addReg(SrcReg);
+ } else if (DestRC == SPU::R64FPRegisterClass) {
+ BuildMI(MBB, MI, DL, get(SPU::LRf64), DestReg).addReg(SrcReg);
+ } else if (DestRC == SPU::GPRCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(SPU::LRr128), DestReg).addReg(SrcReg);
+ } else if (DestRC == SPU::VECREGRegisterClass) {
+ BuildMI(MBB, MI, DL, get(SPU::LRv16i8), DestReg).addReg(SrcReg);
+ } else {
+ // Attempt to copy unknown/unsupported register class!
+ return false;
+ }
+
+ return true;
+}
+
+void
+SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC) const
+{
+ unsigned opc;
+ bool isValidFrameIdx = (FrameIdx < SPUFrameInfo::maxFrameOffset());
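+  // D-form stores (STQD) carry a signed 10-bit, quadword-scaled offset;
+  // frame indices beyond that range fall back to the register-indexed
+  // X-form stores (STQX).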
+ if (RC == SPU::GPRCRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128);
+ } else if (RC == SPU::R64CRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64);
+ } else if (RC == SPU::R64FPRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64);
+ } else if (RC == SPU::R32CRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32);
+ } else if (RC == SPU::R32FPRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32);
+ } else if (RC == SPU::R16CRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16);
+ } else if (RC == SPU::R8CRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8);
+ } else if (RC == SPU::VECREGRegisterClass) {
+ opc = (isValidFrameIdx) ? SPU::STQDv16i8 : SPU::STQXv16i8;
+ } else {
+ assert(0 && "Unknown regclass!");
+ abort();
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ addFrameReference(BuildMI(MBB, MI, DL, get(opc))
+ .addReg(SrcReg, getKillRegState(isKill)), FrameIdx);
+}
+
+void SPUInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ cerr << "storeRegToAddr() invoked!\n";
+ abort();
+
+ if (Addr[0].isFI()) {
+ /* do what storeRegToStackSlot does here */
+ } else {
+ unsigned Opc = 0;
+ if (RC == SPU::GPRCRegisterClass) {
+ /* Opc = PPC::STW; */
+ } else if (RC == SPU::R16CRegisterClass) {
+ /* Opc = PPC::STD; */
+ } else if (RC == SPU::R32CRegisterClass) {
+ /* Opc = PPC::STFD; */
+ } else if (RC == SPU::R32FPRegisterClass) {
+ /* Opc = PPC::STFD; */
+ } else if (RC == SPU::R64FPRegisterClass) {
+ /* Opc = PPC::STFS; */
+ } else if (RC == SPU::VECREGRegisterClass) {
+ /* Opc = PPC::STVX; */
+ } else {
+ assert(0 && "Unknown regclass!");
+ abort();
+ }
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc))
+ .addReg(SrcReg, getKillRegState(isKill));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ }
+}
+
+void
+SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const
+{
+ unsigned opc;
+ bool isValidFrameIdx = (FrameIdx < SPUFrameInfo::maxFrameOffset());
+ if (RC == SPU::GPRCRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::LQDr128 : SPU::LQXr128);
+ } else if (RC == SPU::R64CRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64);
+ } else if (RC == SPU::R64FPRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64);
+ } else if (RC == SPU::R32CRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32);
+ } else if (RC == SPU::R32FPRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32);
+ } else if (RC == SPU::R16CRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16);
+ } else if (RC == SPU::R8CRegisterClass) {
+ opc = (isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8);
+ } else if (RC == SPU::VECREGRegisterClass) {
+ opc = (isValidFrameIdx) ? SPU::LQDv16i8 : SPU::LQXv16i8;
+ } else {
+ assert(0 && "Unknown regclass in loadRegFromStackSlot!");
+ abort();
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ addFrameReference(BuildMI(MBB, MI, DL, get(opc), DestReg), FrameIdx);
+}
+
+/*!
+ \note We are really pessimistic here about what kind of a load we're doing.
+ */
+void SPUInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs)
+ const {
+ cerr << "loadRegToAddr() invoked!\n";
+ abort();
+
+ if (Addr[0].isFI()) {
+ /* do what loadRegFromStackSlot does here... */
+ } else {
+ unsigned Opc = 0;
+ if (RC == SPU::R8CRegisterClass) {
+ /* do brilliance here */
+ } else if (RC == SPU::R16CRegisterClass) {
+ /* Opc = PPC::LWZ; */
+ } else if (RC == SPU::R32CRegisterClass) {
+ /* Opc = PPC::LD; */
+ } else if (RC == SPU::R32FPRegisterClass) {
+ /* Opc = PPC::LFD; */
+ } else if (RC == SPU::R64FPRegisterClass) {
+ /* Opc = PPC::LFS; */
+ } else if (RC == SPU::VECREGRegisterClass) {
+ /* Opc = PPC::LVX; */
+ } else if (RC == SPU::GPRCRegisterClass) {
+ /* Opc = something else! */
+ } else {
+ assert(0 && "Unknown regclass!");
+ abort();
+ }
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ }
+}
+
+//! Return true if the specified load or store can be folded
+bool
+SPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ if (Ops.size() != 1) return false;
+
+ // Make sure this is a reg-reg copy.
+ unsigned Opc = MI->getOpcode();
+
+ switch (Opc) {
+ case SPU::ORv16i8:
+ case SPU::ORv8i16:
+ case SPU::ORv4i32:
+ case SPU::ORv2i64:
+ case SPU::ORr8:
+ case SPU::ORr16:
+ case SPU::ORr32:
+ case SPU::ORr64:
+ case SPU::ORf32:
+ case SPU::ORf64:
+ if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+/// foldMemoryOperand - SPU, like PPC, can only fold spills into
+/// copy instructions, turning them into load/store instructions.
+MachineInstr *
+SPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const
+{
+ if (Ops.size() != 1) return 0;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ MachineInstr *NewMI = 0;
+
+ switch (Opc) {
+ case SPU::ORv16i8:
+ case SPU::ORv8i16:
+ case SPU::ORv4i32:
+ case SPU::ORv2i64:
+ case SPU::ORr8:
+ case SPU::ORr16:
+ case SPU::ORr32:
+ case SPU::ORr64:
+ case SPU::ORf32:
+ case SPU::ORf64:
+ if (OpNum == 0) { // move -> store
+ unsigned InReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ if (FrameIndex < SPUFrameInfo::maxFrameOffset()) {
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(),
+ get(SPU::STQDr32));
+
+ MIB.addReg(InReg, getKillRegState(isKill));
+ NewMI = addFrameReference(MIB, FrameIndex);
+ }
+    } else {           // move -> load
+      unsigned OutReg = MI->getOperand(0).getReg();
+      bool isDead = MI->getOperand(0).isDead();
+      if (FrameIndex < SPUFrameInfo::maxFrameOffset()) {
+        MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(),
+                                          get(SPU::LQDr32));
+
+        MIB.addReg(OutReg, RegState::Define | getDeadRegState(isDead));
+        NewMI = addFrameReference(MIB, FrameIndex);
+      }
+      break;
+    }
+ }
+
+ return NewMI;
+}
+
+//! Branch analysis
+/*!
+ \note This code was kiped from PPC. There may be more branch analysis for
+ CellSPU than what's currently done here.
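+
+  A sketch of the block shapes this recognizes (condition opcodes per
+  isCondBranch/isUncondBranch above):
+    <no terminator>              -- falls through; returns false
+    br/bi <TBB>                  -- unconditional; returns false
+    brnz $r, <TBB>               -- conditional fall-through; returns false
+    brnz $r, <TBB>; br <FBB>     -- two-way branch; returns false
+    anything else                -- returns true (not analyzable)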
+ */
+bool
+SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (isUncondBranch(LastInst)) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else if (isCondBranch(LastInst)) {
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(1).getMBB();
+ DEBUG(cerr << "Pushing LastInst: ");
+ DEBUG(LastInst->dump());
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+ Cond.push_back(LastInst->getOperand(0));
+ return false;
+ }
+ // Otherwise, don't know what this is.
+ return true;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() &&
+ isUnpredicatedTerminator(--I))
+ return true;
+
+ // If the block ends with a conditional and unconditional branch, handle it.
+ if (isCondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
+ TBB = SecondLastInst->getOperand(1).getMBB();
+ DEBUG(cerr << "Pushing SecondLastInst: ");
+ DEBUG(SecondLastInst->dump());
+ Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+ Cond.push_back(SecondLastInst->getOperand(0));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed, so remove it.
+ if (isUncondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned
+SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin())
+ return 0;
+ --I;
+ if (!isCondBranch(I) && !isUncondBranch(I))
+ return 0;
+
+ // Remove the first branch.
+ DEBUG(cerr << "Removing branch: ");
+ DEBUG(I->dump());
+ I->eraseFromParent();
+ I = MBB.end();
+ if (I == MBB.begin())
+ return 1;
+
+ --I;
+ if (!(isCondBranch(I) || isUncondBranch(I)))
+ return 1;
+
+ // Remove the second branch.
+ DEBUG(cerr << "Removing second branch: ");
+ DEBUG(I->dump());
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned
+SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc argument
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "SPU branch conditions have two components!");
+
+ // One-way branch.
+ if (FBB == 0) {
+ if (Cond.empty()) {
+ // Unconditional branch
+ MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(SPU::BR));
+ MIB.addMBB(TBB);
+
+ DEBUG(cerr << "Inserted one-way uncond branch: ");
+ DEBUG((*MIB).dump());
+ } else {
+ // Conditional branch
+ MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(Cond[0].getImm()));
+ MIB.addReg(Cond[1].getReg()).addMBB(TBB);
+
+ DEBUG(cerr << "Inserted one-way cond branch: ");
+ DEBUG((*MIB).dump());
+ }
+ return 1;
+ } else {
+ MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(Cond[0].getImm()));
+ MachineInstrBuilder MIB2 = BuildMI(&MBB, dl, get(SPU::BR));
+
+ // Two-way Conditional Branch.
+ MIB.addReg(Cond[1].getReg()).addMBB(TBB);
+ MIB2.addMBB(FBB);
+
+ DEBUG(cerr << "Inserted conditional branch: ");
+ DEBUG((*MIB).dump());
+ DEBUG(cerr << "part 2: ");
+ DEBUG((*MIB2).dump());
+ return 2;
+ }
+}
+
+bool
+SPUInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ return (!MBB.empty() && isUncondBranch(&MBB.back()));
+}
+//! Reverses a branch's condition, returning false on success.
+bool
+SPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+ const {
+ // Pretty brainless way of inverting the condition, but it works, considering
+ // there are only two conditions...
+ static struct {
+ unsigned Opc; //! The incoming opcode
+ unsigned RevCondOpc; //! The reversed condition opcode
+ } revconds[] = {
+ { SPU::BRNZr32, SPU::BRZr32 },
+ { SPU::BRNZv4i32, SPU::BRZv4i32 },
+ { SPU::BRZr32, SPU::BRNZr32 },
+ { SPU::BRZv4i32, SPU::BRNZv4i32 },
+ { SPU::BRHNZr16, SPU::BRHZr16 },
+ { SPU::BRHNZv8i16, SPU::BRHZv8i16 },
+ { SPU::BRHZr16, SPU::BRHNZr16 },
+ { SPU::BRHZv8i16, SPU::BRHNZv8i16 }
+ };
+
+ unsigned Opc = unsigned(Cond[0].getImm());
+ // Pretty dull mapping between the two conditions that SPU can generate:
+ for (int i = sizeof(revconds)/sizeof(revconds[0]) - 1; i >= 0; --i) {
+ if (revconds[i].Opc == Opc) {
+ Cond[0].setImm(revconds[i].RevCondOpc);
+ return false;
+ }
+ }
+
+ return true;
+}
diff --git a/lib/Target/CellSPU/SPUInstrInfo.h b/lib/Target/CellSPU/SPUInstrInfo.h
new file mode 100644
index 0000000..ffb4087
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrInfo.h
@@ -0,0 +1,114 @@
+//===- SPUInstrInfo.h - Cell SPU Instruction Information --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the CellSPU implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_INSTRUCTIONINFO_H
+#define SPU_INSTRUCTIONINFO_H
+
+#include "SPU.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "SPURegisterInfo.h"
+
+namespace llvm {
+ //! Cell SPU instruction information class
+ class SPUInstrInfo : public TargetInstrInfoImpl {
+ SPUTargetMachine &TM;
+ const SPURegisterInfo RI;
+ protected:
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ return 0;
+ }
+
+ public:
+ explicit SPUInstrInfo(SPUTargetMachine &tm);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const SPURegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+ unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ //! Store a register to a stack slot, based on its register class.
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ //! Store a register to an address, based on its register class
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ //! Load a register from a stack slot, based on its register class.
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+    //! Load a register from an address, based on its register class
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ //! Return true if the specified load or store can be folded
+ virtual
+ bool canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const;
+
+ //! Return true if the specified block does not fall through
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+
+ //! Reverses a branch's condition, returning false on success.
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ };
+}
+
+#endif
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
new file mode 100644
index 0000000..63eb85a
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -0,0 +1,4614 @@
+//==- SPUInstrInfo.td - Describe the Cell SPU Instructions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Cell SPU Instructions:
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO Items (not urgent today, but would be nice, low priority)
+//
+// ANDBI, ORBI: SPU constructs a 4-byte constant for these instructions by
+// concatenating the byte argument b as "bbbb". Could recognize this bit pattern
+// in 16-bit and 32-bit constants and reduce instruction count.
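+//
+// e.g. (illustrative): an AND against the 32-bit constant 0x0F0F0F0F could
+// then be emitted as "andbi $3, $4, 0x0F", since that constant is just the
+// byte 0x0F repeated four times.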
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions:
+//===----------------------------------------------------------------------===//
+
+let hasCtrlDep = 1, Defs = [R1], Uses = [R1] in {
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm_i32:$amt),
+ "${:comment} ADJCALLSTACKDOWN",
+ [(callseq_start timm:$amt)]>;
+ def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm_i32:$amt),
+ "${:comment} ADJCALLSTACKUP",
+ [(callseq_end timm:$amt)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// DWARF debugging Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+def DWARF_LOC : Pseudo<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file),
+ ".loc $file, $line, $col",
+ [(dwarf_loc (i32 imm:$line), (i32 imm:$col), (i32 imm:$file))]>;
+
+//===----------------------------------------------------------------------===//
+// Loads:
+// NB: The ordering is actually important, since the instruction selection
+// will try each of the instructions in sequence, i.e., the D-form first with
+// the 10-bit displacement, then the A-form with the 16-bit displacement, and
+// finally the X-form with register-register addressing.
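+//
+// Illustratively (syntax approximate, not taken from a manual):
+//   lqd $3, 16($4)    ; D-form: 10-bit displacement from a base register
+//   lqa $3, sym       ; A-form: 16-bit absolute address
+//   lqx $3, $4, $5    ; X-form: register + register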
+//===----------------------------------------------------------------------===//
+
+let canFoldAsLoad = 1 in {
+ class LoadDFormVec<ValueType vectype>
+ : RI10Form<0b00101100, (outs VECREG:$rT), (ins dformaddr:$src),
+ "lqd\t$rT, $src",
+ LoadStore,
+ [(set (vectype VECREG:$rT), (load dform_addr:$src))]>
+ { }
+
+ class LoadDForm<RegisterClass rclass>
+ : RI10Form<0b00101100, (outs rclass:$rT), (ins dformaddr:$src),
+ "lqd\t$rT, $src",
+ LoadStore,
+ [(set rclass:$rT, (load dform_addr:$src))]>
+ { }
+
+ multiclass LoadDForms
+ {
+ def v16i8: LoadDFormVec<v16i8>;
+ def v8i16: LoadDFormVec<v8i16>;
+ def v4i32: LoadDFormVec<v4i32>;
+ def v2i64: LoadDFormVec<v2i64>;
+ def v4f32: LoadDFormVec<v4f32>;
+ def v2f64: LoadDFormVec<v2f64>;
+
+ def v2i32: LoadDFormVec<v2i32>;
+
+ def r128: LoadDForm<GPRC>;
+ def r64: LoadDForm<R64C>;
+ def r32: LoadDForm<R32C>;
+ def f32: LoadDForm<R32FP>;
+ def f64: LoadDForm<R64FP>;
+ def r16: LoadDForm<R16C>;
+ def r8: LoadDForm<R8C>;
+ }
+
+ class LoadAFormVec<ValueType vectype>
+ : RI16Form<0b100001100, (outs VECREG:$rT), (ins addr256k:$src),
+ "lqa\t$rT, $src",
+ LoadStore,
+ [(set (vectype VECREG:$rT), (load aform_addr:$src))]>
+ { }
+
+ class LoadAForm<RegisterClass rclass>
+ : RI16Form<0b100001100, (outs rclass:$rT), (ins addr256k:$src),
+ "lqa\t$rT, $src",
+ LoadStore,
+ [(set rclass:$rT, (load aform_addr:$src))]>
+ { }
+
+ multiclass LoadAForms
+ {
+ def v16i8: LoadAFormVec<v16i8>;
+ def v8i16: LoadAFormVec<v8i16>;
+ def v4i32: LoadAFormVec<v4i32>;
+ def v2i64: LoadAFormVec<v2i64>;
+ def v4f32: LoadAFormVec<v4f32>;
+ def v2f64: LoadAFormVec<v2f64>;
+
+ def v2i32: LoadAFormVec<v2i32>;
+
+ def r128: LoadAForm<GPRC>;
+ def r64: LoadAForm<R64C>;
+ def r32: LoadAForm<R32C>;
+ def f32: LoadAForm<R32FP>;
+ def f64: LoadAForm<R64FP>;
+ def r16: LoadAForm<R16C>;
+ def r8: LoadAForm<R8C>;
+ }
+
+ class LoadXFormVec<ValueType vectype>
+ : RRForm<0b00100011100, (outs VECREG:$rT), (ins memrr:$src),
+ "lqx\t$rT, $src",
+ LoadStore,
+ [(set (vectype VECREG:$rT), (load xform_addr:$src))]>
+ { }
+
+ class LoadXForm<RegisterClass rclass>
+ : RRForm<0b00100011100, (outs rclass:$rT), (ins memrr:$src),
+ "lqx\t$rT, $src",
+ LoadStore,
+ [(set rclass:$rT, (load xform_addr:$src))]>
+ { }
+
+ multiclass LoadXForms
+ {
+ def v16i8: LoadXFormVec<v16i8>;
+ def v8i16: LoadXFormVec<v8i16>;
+ def v4i32: LoadXFormVec<v4i32>;
+ def v2i64: LoadXFormVec<v2i64>;
+ def v4f32: LoadXFormVec<v4f32>;
+ def v2f64: LoadXFormVec<v2f64>;
+
+ def v2i32: LoadXFormVec<v2i32>;
+
+ def r128: LoadXForm<GPRC>;
+ def r64: LoadXForm<R64C>;
+ def r32: LoadXForm<R32C>;
+ def f32: LoadXForm<R32FP>;
+ def f64: LoadXForm<R64FP>;
+ def r16: LoadXForm<R16C>;
+ def r8: LoadXForm<R8C>;
+ }
+
+ defm LQA : LoadAForms;
+ defm LQD : LoadDForms;
+ defm LQX : LoadXForms;
+
+/* Load quadword, PC relative: Not much use at this point in time.
+ Might be of use later for relocatable code. It's effectively the
+ same as LQA, but uses PC-relative addressing.
+ def LQR : RI16Form<0b111001100, (outs VECREG:$rT), (ins s16imm:$disp),
+ "lqr\t$rT, $disp", LoadStore,
+ [(set VECREG:$rT, (load iaddr:$disp))]>;
+ */
+}
+
+//===----------------------------------------------------------------------===//
+// Stores:
+//===----------------------------------------------------------------------===//
+class StoreDFormVec<ValueType vectype>
+ : RI10Form<0b00100100, (outs), (ins VECREG:$rT, dformaddr:$src),
+ "stqd\t$rT, $src",
+ LoadStore,
+ [(store (vectype VECREG:$rT), dform_addr:$src)]>
+{ }
+
+class StoreDForm<RegisterClass rclass>
+ : RI10Form<0b00100100, (outs), (ins rclass:$rT, dformaddr:$src),
+ "stqd\t$rT, $src",
+ LoadStore,
+ [(store rclass:$rT, dform_addr:$src)]>
+{ }
+
+multiclass StoreDForms
+{
+ def v16i8: StoreDFormVec<v16i8>;
+ def v8i16: StoreDFormVec<v8i16>;
+ def v4i32: StoreDFormVec<v4i32>;
+ def v2i64: StoreDFormVec<v2i64>;
+ def v4f32: StoreDFormVec<v4f32>;
+ def v2f64: StoreDFormVec<v2f64>;
+
+ def v2i32: StoreDFormVec<v2i32>;
+
+ def r128: StoreDForm<GPRC>;
+ def r64: StoreDForm<R64C>;
+ def r32: StoreDForm<R32C>;
+ def f32: StoreDForm<R32FP>;
+ def f64: StoreDForm<R64FP>;
+ def r16: StoreDForm<R16C>;
+ def r8: StoreDForm<R8C>;
+}
+
+class StoreAFormVec<ValueType vectype>
+ : RI16Form<0b0010010, (outs), (ins VECREG:$rT, addr256k:$src),
+ "stqa\t$rT, $src",
+ LoadStore,
+ [(store (vectype VECREG:$rT), aform_addr:$src)]>;
+
+class StoreAForm<RegisterClass rclass>
+ : RI16Form<0b001001, (outs), (ins rclass:$rT, addr256k:$src),
+ "stqa\t$rT, $src",
+ LoadStore,
+ [(store rclass:$rT, aform_addr:$src)]>;
+
+multiclass StoreAForms
+{
+ def v16i8: StoreAFormVec<v16i8>;
+ def v8i16: StoreAFormVec<v8i16>;
+ def v4i32: StoreAFormVec<v4i32>;
+ def v2i64: StoreAFormVec<v2i64>;
+ def v4f32: StoreAFormVec<v4f32>;
+ def v2f64: StoreAFormVec<v2f64>;
+
+ def v2i32: StoreAFormVec<v2i32>;
+
+ def r128: StoreAForm<GPRC>;
+ def r64: StoreAForm<R64C>;
+ def r32: StoreAForm<R32C>;
+ def f32: StoreAForm<R32FP>;
+ def f64: StoreAForm<R64FP>;
+ def r16: StoreAForm<R16C>;
+ def r8: StoreAForm<R8C>;
+}
+
+class StoreXFormVec<ValueType vectype>
+ : RRForm<0b00100100, (outs), (ins VECREG:$rT, memrr:$src),
+ "stqx\t$rT, $src",
+ LoadStore,
+ [(store (vectype VECREG:$rT), xform_addr:$src)]>
+{ }
+
+class StoreXForm<RegisterClass rclass>
+ : RRForm<0b00100100, (outs), (ins rclass:$rT, memrr:$src),
+ "stqx\t$rT, $src",
+ LoadStore,
+ [(store rclass:$rT, xform_addr:$src)]>
+{ }
+
+multiclass StoreXForms
+{
+ def v16i8: StoreXFormVec<v16i8>;
+ def v8i16: StoreXFormVec<v8i16>;
+ def v4i32: StoreXFormVec<v4i32>;
+ def v2i64: StoreXFormVec<v2i64>;
+ def v4f32: StoreXFormVec<v4f32>;
+ def v2f64: StoreXFormVec<v2f64>;
+
+ def v2i32: StoreXFormVec<v2i32>;
+
+ def r128: StoreXForm<GPRC>;
+ def r64: StoreXForm<R64C>;
+ def r32: StoreXForm<R32C>;
+ def f32: StoreXForm<R32FP>;
+ def f64: StoreXForm<R64FP>;
+ def r16: StoreXForm<R16C>;
+ def r8: StoreXForm<R8C>;
+}
+
+defm STQD : StoreDForms;
+defm STQA : StoreAForms;
+defm STQX : StoreXForms;
+
+/* Store quadword, PC relative: Not much use at this point in time. Might
+ be useful for relocatable code.
+def STQR : RI16Form<0b111000100, (outs), (ins VECREG:$rT, s16imm:$disp),
+ "stqr\t$rT, $disp", LoadStore,
+ [(store VECREG:$rT, iaddr:$disp)]>;
+*/
+
+//===----------------------------------------------------------------------===//
+// Generate Controls for Insertion:
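+//
+// These generate shuffle-control masks for use with shufb. Illustratively
+// (syntax approximate, assuming the usual insertion idiom): "cwd $3, 4($1)"
+// builds a control word that, fed to shufb, inserts a 32-bit value at byte
+// offset 4 of a quadword.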
+//===----------------------------------------------------------------------===//
+
+def CBD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+ "cbd\t$rT, $src", ShuffleOp,
+ [(set (v16i8 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CBX: RRForm<0b00101011100, (outs VECREG:$rT), (ins memrr:$src),
+ "cbx\t$rT, $src", ShuffleOp,
+ [(set (v16i8 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CHD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+ "chd\t$rT, $src", ShuffleOp,
+ [(set (v8i16 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CHX: RRForm<0b10101011100, (outs VECREG:$rT), (ins memrr:$src),
+ "chx\t$rT, $src", ShuffleOp,
+ [(set (v8i16 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CWD: RI7Form<0b01101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+ "cwd\t$rT, $src", ShuffleOp,
+ [(set (v4i32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CWX: RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src),
+ "cwx\t$rT, $src", ShuffleOp,
+ [(set (v4i32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CWDf32: RI7Form<0b01101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+ "cwd\t$rT, $src", ShuffleOp,
+ [(set (v4f32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CWXf32: RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src),
+ "cwx\t$rT, $src", ShuffleOp,
+ [(set (v4f32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CDD: RI7Form<0b11101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+ "cdd\t$rT, $src", ShuffleOp,
+ [(set (v2i64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CDX: RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src),
+ "cdx\t$rT, $src", ShuffleOp,
+ [(set (v2i64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CDDf64: RI7Form<0b11101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+ "cdd\t$rT, $src", ShuffleOp,
+ [(set (v2f64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CDXf64: RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src),
+ "cdx\t$rT, $src", ShuffleOp,
+ [(set (v2f64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// Constant formation:
+//===----------------------------------------------------------------------===//
+
+def ILHv8i16:
+ RI16Form<0b110000010, (outs VECREG:$rT), (ins s16imm:$val),
+ "ilh\t$rT, $val", ImmLoad,
+ [(set (v8i16 VECREG:$rT), (v8i16 v8i16SExt16Imm:$val))]>;
+
+def ILHr16:
+ RI16Form<0b110000010, (outs R16C:$rT), (ins s16imm:$val),
+ "ilh\t$rT, $val", ImmLoad,
+ [(set R16C:$rT, immSExt16:$val)]>;
+
+// Cell SPU doesn't have a native 8-bit immediate load, but ILH works when
+// given the right constant:
+def ILHr8:
+ RI16Form<0b110000010, (outs R8C:$rT), (ins s16imm_i8:$val),
+ "ilh\t$rT, $val", ImmLoad,
+ [(set R8C:$rT, immSExt8:$val)]>;
+
+// IL does sign extension!
+
+class ILInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI16Form<0b100000010, OOL, IOL, "il\t$rT, $val",
+ ImmLoad, pattern>;
+
+class ILVecInst<ValueType vectype, Operand immtype, PatLeaf xform>:
+ ILInst<(outs VECREG:$rT), (ins immtype:$val),
+ [(set (vectype VECREG:$rT), (vectype xform:$val))]>;
+
+class ILRegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>:
+ ILInst<(outs rclass:$rT), (ins immtype:$val),
+ [(set rclass:$rT, xform:$val)]>;
+
+multiclass ImmediateLoad
+{
+ def v2i64: ILVecInst<v2i64, s16imm_i64, v2i64SExt16Imm>;
+ def v4i32: ILVecInst<v4i32, s16imm_i32, v4i32SExt16Imm>;
+
+ // TODO: Need v2f64, v4f32
+
+ def r64: ILRegInst<R64C, s16imm_i64, immSExt16>;
+ def r32: ILRegInst<R32C, s16imm_i32, immSExt16>;
+ def f32: ILRegInst<R32FP, s16imm_f32, fpimmSExt16>;
+ def f64: ILRegInst<R64FP, s16imm_f64, fpimmSExt16>;
+}
+
+defm IL : ImmediateLoad;
+
+class ILHUInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI16Form<0b010000010, OOL, IOL, "ilhu\t$rT, $val",
+ ImmLoad, pattern>;
+
+class ILHUVecInst<ValueType vectype, Operand immtype, PatLeaf xform>:
+ ILHUInst<(outs VECREG:$rT), (ins immtype:$val),
+ [(set (vectype VECREG:$rT), (vectype xform:$val))]>;
+
+class ILHURegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>:
+ ILHUInst<(outs rclass:$rT), (ins immtype:$val),
+ [(set rclass:$rT, xform:$val)]>;
+
+multiclass ImmLoadHalfwordUpper
+{
+ def v2i64: ILHUVecInst<v2i64, u16imm_i64, immILHUvec_i64>;
+ def v4i32: ILHUVecInst<v4i32, u16imm_i32, immILHUvec>;
+
+ def r64: ILHURegInst<R64C, u16imm_i64, hi16>;
+ def r32: ILHURegInst<R32C, u16imm_i32, hi16>;
+
+ // Loads the high portion of an address
+ def hi: ILHURegInst<R32C, symbolHi, hi16>;
+
+ // Used in custom lowering constant SFP loads:
+ def f32: ILHURegInst<R32FP, f16imm, hi16_f32>;
+}
+
+defm ILHU : ImmLoadHalfwordUpper;
+
+// Immediate load address (can also be used to load 18-bit unsigned constants,
+// see the zext 16->32 pattern)
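+//
+// e.g. (illustrative): "ila $3, 0x3ffff" materializes the largest 18-bit
+// unsigned constant in a single instruction, with no ilhu/iohl pair needed.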
+
+class ILAInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI18Form<0b1000010, OOL, IOL, "ila\t$rT, $val",
+ LoadNOP, pattern>;
+
+class ILAVecInst<ValueType vectype, Operand immtype, PatLeaf xform>:
+ ILAInst<(outs VECREG:$rT), (ins immtype:$val),
+ [(set (vectype VECREG:$rT), (vectype xform:$val))]>;
+
+class ILARegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>:
+ ILAInst<(outs rclass:$rT), (ins immtype:$val),
+ [(set rclass:$rT, xform:$val)]>;
+
+multiclass ImmLoadAddress
+{
+ def v2i64: ILAVecInst<v2i64, u18imm, v2i64Uns18Imm>;
+ def v4i32: ILAVecInst<v4i32, u18imm, v4i32Uns18Imm>;
+
+ def r64: ILARegInst<R64C, u18imm_i64, imm18>;
+ def r32: ILARegInst<R32C, u18imm, imm18>;
+ def f32: ILARegInst<R32FP, f18imm, fpimm18>;
+ def f64: ILARegInst<R64FP, f18imm_f64, fpimm18>;
+
+ def hi: ILARegInst<R32C, symbolHi, imm18>;
+ def lo: ILARegInst<R32C, symbolLo, imm18>;
+
+ def lsa: ILAInst<(outs R32C:$rT), (ins symbolLSA:$val),
+ [/* no pattern */]>;
+}
+
+defm ILA : ImmLoadAddress;
+
+// Immediate OR, Halfword Lower: The "other" part of loading large constants
+// into 32-bit registers. See the anonymous pattern Pat<(i32 imm:$imm), ...>
+// Note that these are really two-operand instructions, but they're encoded
+// as three operands with the first two arguments tied to each other.
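+//
+// Illustrative use (the canonical two-instruction 32-bit constant load):
+//   ilhu $3, 0x1234    ; $3 = 0x12340000
+//   iohl $3, 0x5678    ; $3 = 0x12345678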
+
+class IOHLInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI16Form<0b100000110, OOL, IOL, "iohl\t$rT, $val",
+ ImmLoad, pattern>,
+ RegConstraint<"$rS = $rT">,
+ NoEncode<"$rS">;
+
+class IOHLVecInst<ValueType vectype, Operand immtype /* , PatLeaf xform */>:
+ IOHLInst<(outs VECREG:$rT), (ins VECREG:$rS, immtype:$val),
+ [/* no pattern */]>;
+
+class IOHLRegInst<RegisterClass rclass, Operand immtype /* , PatLeaf xform */>:
+ IOHLInst<(outs rclass:$rT), (ins rclass:$rS, immtype:$val),
+ [/* no pattern */]>;
+
+multiclass ImmOrHalfwordLower
+{
+ def v2i64: IOHLVecInst<v2i64, u16imm_i64>;
+ def v4i32: IOHLVecInst<v4i32, u16imm_i32>;
+
+ def r32: IOHLRegInst<R32C, i32imm>;
+ def f32: IOHLRegInst<R32FP, f32imm>;
+
+ def lo: IOHLRegInst<R32C, symbolLo>;
+}
+
+defm IOHL: ImmOrHalfwordLower;
+
+// Form select mask for bytes using immediate, used in conjunction with the
+// SELB instruction:
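+//
+// Illustratively (assuming the usual bit-to-byte expansion): each of the 16
+// immediate bits selects one byte of the mask, so "fsmbi $3, 0xff00" would
+// yield 0xFF in bytes 0-7 and 0x00 in bytes 8-15, ready for use with selb.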
+
+class FSMBIVec<ValueType vectype>:
+ RI16Form<0b101001100, (outs VECREG:$rT), (ins u16imm:$val),
+ "fsmbi\t$rT, $val",
+ SelectOp,
+ [(set (vectype VECREG:$rT), (SPUselmask (i16 immU16:$val)))]>;
+
+multiclass FormSelectMaskBytesImm
+{
+ def v16i8: FSMBIVec<v16i8>;
+ def v8i16: FSMBIVec<v8i16>;
+ def v4i32: FSMBIVec<v4i32>;
+ def v2i64: FSMBIVec<v2i64>;
+}
+
+defm FSMBI : FormSelectMaskBytesImm;
+
+// fsmb: Form select mask for bytes. N.B.: the input operand, $rA, is 16 bits wide
+class FSMBInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b01101101100, OOL, IOL, "fsmb\t$rT, $rA", SelectOp,
+ pattern>;
+
+class FSMBRegInst<RegisterClass rclass, ValueType vectype>:
+ FSMBInst<(outs VECREG:$rT), (ins rclass:$rA),
+ [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>;
+
+class FSMBVecInst<ValueType vectype>:
+ FSMBInst<(outs VECREG:$rT), (ins VECREG:$rA),
+ [(set (vectype VECREG:$rT),
+ (SPUselmask (vectype VECREG:$rA)))]>;
+
+multiclass FormSelectMaskBits {
+ def v16i8_r16: FSMBRegInst<R16C, v16i8>;
+ def v16i8: FSMBVecInst<v16i8>;
+}
+
+defm FSMB: FormSelectMaskBits;
+
+// fsmh: Form select mask for halfwords. N.B.: the input operand, $rA, is
+// only 8 bits wide (even though it's supplied as 16 bits here)
+
+class FSMHInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b10101101100, OOL, IOL, "fsmh\t$rT, $rA", SelectOp,
+ pattern>;
+
+class FSMHRegInst<RegisterClass rclass, ValueType vectype>:
+ FSMHInst<(outs VECREG:$rT), (ins rclass:$rA),
+ [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>;
+
+class FSMHVecInst<ValueType vectype>:
+ FSMHInst<(outs VECREG:$rT), (ins VECREG:$rA),
+ [(set (vectype VECREG:$rT),
+ (SPUselmask (vectype VECREG:$rA)))]>;
+
+multiclass FormSelectMaskHalfword {
+ def v8i16_r16: FSMHRegInst<R16C, v8i16>;
+ def v8i16: FSMHVecInst<v8i16>;
+}
+
+defm FSMH: FormSelectMaskHalfword;
+
+// fsm: Form select mask for words. Like the other fsm* instructions,
+// only the lower 4 bits of $rA are significant.
+
+class FSMInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b00101101100, OOL, IOL, "fsm\t$rT, $rA", SelectOp,
+ pattern>;
+
+class FSMRegInst<ValueType vectype, RegisterClass rclass>:
+ FSMInst<(outs VECREG:$rT), (ins rclass:$rA),
+ [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>;
+
+class FSMVecInst<ValueType vectype>:
+ FSMInst<(outs VECREG:$rT), (ins VECREG:$rA),
+ [(set (vectype VECREG:$rT), (SPUselmask (vectype VECREG:$rA)))]>;
+
+multiclass FormSelectMaskWord {
+ def v4i32: FSMVecInst<v4i32>;
+
+ def r32 : FSMRegInst<v4i32, R32C>;
+ def r16 : FSMRegInst<v4i32, R16C>;
+}
+
+defm FSM : FormSelectMaskWord;
+
+// Special case when used for i64 math operations
+multiclass FormSelectMaskWord64 {
+ def r32 : FSMRegInst<v2i64, R32C>;
+ def r16 : FSMRegInst<v2i64, R16C>;
+}
+
+defm FSM64 : FormSelectMaskWord64;
+
+//===----------------------------------------------------------------------===//
+// Integer and Logical Operations:
+//===----------------------------------------------------------------------===//
+
+def AHv8i16:
+ RRForm<0b00010011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "ah\t$rT, $rA, $rB", IntegerOp,
+ [(set (v8i16 VECREG:$rT), (int_spu_si_ah VECREG:$rA, VECREG:$rB))]>;
+
+def : Pat<(add (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
+ (AHv8i16 VECREG:$rA, VECREG:$rB)>;
+
+def AHr16:
+ RRForm<0b00010011000, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+ "ah\t$rT, $rA, $rB", IntegerOp,
+ [(set R16C:$rT, (add R16C:$rA, R16C:$rB))]>;
+
+def AHIvec:
+ RI10Form<0b10111000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "ahi\t$rT, $rA, $val", IntegerOp,
+ [(set (v8i16 VECREG:$rT), (add (v8i16 VECREG:$rA),
+ v8i16SExt10Imm:$val))]>;
+
+def AHIr16:
+ RI10Form<0b10111000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+ "ahi\t$rT, $rA, $val", IntegerOp,
+ [(set R16C:$rT, (add R16C:$rA, i16ImmSExt10:$val))]>;
+
+// v4i32, i32 add instruction:
+
+class AInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b00000011000, OOL, IOL,
+ "a\t$rT, $rA, $rB", IntegerOp,
+ pattern>;
+
+class AVecInst<ValueType vectype>:
+ AInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT), (add (vectype VECREG:$rA),
+ (vectype VECREG:$rB)))]>;
+
+class ARegInst<RegisterClass rclass>:
+ AInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (add rclass:$rA, rclass:$rB))]>;
+
+multiclass AddInstruction {
+ def v4i32: AVecInst<v4i32>;
+ def v16i8: AVecInst<v16i8>;
+
+ def r32: ARegInst<R32C>;
+}
+
+defm A : AddInstruction;
+
+class AIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI10Form<0b00111000, OOL, IOL,
+ "ai\t$rT, $rA, $val", IntegerOp,
+ pattern>;
+
+class AIVecInst<ValueType vectype, PatLeaf immpred>:
+ AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (vectype VECREG:$rT), (add (vectype VECREG:$rA), immpred:$val))]>;
+
+class AIFPVecInst<ValueType vectype, PatLeaf immpred>:
+ AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [/* no pattern */]>;
+
+class AIRegInst<RegisterClass rclass, PatLeaf immpred>:
+ AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val),
+ [(set rclass:$rT, (add rclass:$rA, immpred:$val))]>;
+
+// This is used to add epsilons to floating point numbers in the f32 fdiv code:
+class AIFPInst<RegisterClass rclass, PatLeaf immpred>:
+ AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val),
+ [/* no pattern */]>;
+
+multiclass AddImmediate {
+ def v4i32: AIVecInst<v4i32, v4i32SExt10Imm>;
+
+ def r32: AIRegInst<R32C, i32ImmSExt10>;
+
+ def v4f32: AIFPVecInst<v4f32, v4i32SExt10Imm>;
+ def f32: AIFPInst<R32FP, i32ImmSExt10>;
+}
+
+defm AI : AddImmediate;
+
+def SFHvec:
+ RRForm<0b00010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "sfh\t$rT, $rA, $rB", IntegerOp,
+ [(set (v8i16 VECREG:$rT), (sub (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+def SFHr16:
+ RRForm<0b00010010000, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+ "sfh\t$rT, $rA, $rB", IntegerOp,
+ [(set R16C:$rT, (sub R16C:$rA, R16C:$rB))]>;
+
+def SFHIvec:
+ RI10Form<0b10110000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "sfhi\t$rT, $rA, $val", IntegerOp,
+ [(set (v8i16 VECREG:$rT), (sub v8i16SExt10Imm:$val,
+ (v8i16 VECREG:$rA)))]>;
+
+def SFHIr16 : RI10Form<0b10110000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+ "sfhi\t$rT, $rA, $val", IntegerOp,
+ [(set R16C:$rT, (sub i16ImmSExt10:$val, R16C:$rA))]>;
+
+def SFvec : RRForm<0b00000010000, (outs VECREG:$rT),
+ (ins VECREG:$rA, VECREG:$rB),
+ "sf\t$rT, $rA, $rB", IntegerOp,
+ [(set (v4i32 VECREG:$rT), (sub (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def SFr32 : RRForm<0b00000010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ "sf\t$rT, $rA, $rB", IntegerOp,
+ [(set R32C:$rT, (sub R32C:$rA, R32C:$rB))]>;
+
+def SFIvec:
+ RI10Form<0b00110000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ "sfi\t$rT, $rA, $val", IntegerOp,
+ [(set (v4i32 VECREG:$rT), (sub v4i32SExt10Imm:$val,
+ (v4i32 VECREG:$rA)))]>;
+
+def SFIr32 : RI10Form<0b00110000, (outs R32C:$rT),
+ (ins R32C:$rA, s10imm_i32:$val),
+ "sfi\t$rT, $rA, $val", IntegerOp,
+ [(set R32C:$rT, (sub i32ImmSExt10:$val, R32C:$rA))]>;
+
+// ADDX: add extended. In hardware this is a vector operation; the register
+// forms below simply reuse it on the preferred slot. Doesn't match a pattern.
+class ADDXInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b00000010110, OOL, IOL,
+ "addx\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class ADDXVecInst<ValueType vectype>:
+ ADDXInst<(outs VECREG:$rT),
+ (ins VECREG:$rA, VECREG:$rB, VECREG:$rCarry),
+ [/* no pattern */]>,
+ RegConstraint<"$rCarry = $rT">,
+ NoEncode<"$rCarry">;
+
+class ADDXRegInst<RegisterClass rclass>:
+ ADDXInst<(outs rclass:$rT),
+ (ins rclass:$rA, rclass:$rB, rclass:$rCarry),
+ [/* no pattern */]>,
+ RegConstraint<"$rCarry = $rT">,
+ NoEncode<"$rCarry">;
+
+multiclass AddExtended {
+ def v2i64 : ADDXVecInst<v2i64>;
+ def v4i32 : ADDXVecInst<v4i32>;
+ def r64 : ADDXRegInst<R64C>;
+ def r32 : ADDXRegInst<R32C>;
+}
+
+defm ADDX : AddExtended;
+
+// CG: Generate carry for add
+class CGInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b01000011000, OOL, IOL,
+ "cg\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class CGVecInst<ValueType vectype>:
+ CGInst<(outs VECREG:$rT),
+ (ins VECREG:$rA, VECREG:$rB),
+ [/* no pattern */]>;
+
+class CGRegInst<RegisterClass rclass>:
+ CGInst<(outs rclass:$rT),
+ (ins rclass:$rA, rclass:$rB),
+ [/* no pattern */]>;
+
+multiclass CarryGenerate {
+ def v2i64 : CGVecInst<v2i64>;
+ def v4i32 : CGVecInst<v4i32>;
+ def r64 : CGRegInst<R64C>;
+ def r32 : CGRegInst<R32C>;
+}
+
+defm CG : CarryGenerate;
+
+// SFX: Subtract from, extended. This is used in conjunction with BG to
+// subtract with carry (borrow, in this case)
+class SFXInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10000010110, OOL, IOL,
+ "sfx\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class SFXVecInst<ValueType vectype>:
+ SFXInst<(outs VECREG:$rT),
+ (ins VECREG:$rA, VECREG:$rB, VECREG:$rCarry),
+ [/* no pattern */]>,
+ RegConstraint<"$rCarry = $rT">,
+ NoEncode<"$rCarry">;
+
+class SFXRegInst<RegisterClass rclass>:
+ SFXInst<(outs rclass:$rT),
+ (ins rclass:$rA, rclass:$rB, rclass:$rCarry),
+ [/* no pattern */]>,
+ RegConstraint<"$rCarry = $rT">,
+ NoEncode<"$rCarry">;
+
+multiclass SubtractExtended {
+ def v2i64 : SFXVecInst<v2i64>;
+ def v4i32 : SFXVecInst<v4i32>;
+ def r64 : SFXRegInst<R64C>;
+ def r32 : SFXRegInst<R32C>;
+}
+
+defm SFX : SubtractExtended;
+
+// BG: borrow generate. In hardware this is a vector operation; the register
+// forms below simply reuse it on the preferred slot. Doesn't match a pattern.
+class BGInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b01000010000, OOL, IOL,
+ "bg\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class BGVecInst<ValueType vectype>:
+ BGInst<(outs VECREG:$rT),
+ (ins VECREG:$rA, VECREG:$rB),
+ [/* no pattern */]>;
+
+class BGRegInst<RegisterClass rclass>:
+ BGInst<(outs rclass:$rT),
+ (ins rclass:$rA, rclass:$rB),
+ [/* no pattern */]>;
+
+multiclass BorrowGenerate {
+ def v4i32 : BGVecInst<v4i32>;
+ def v2i64 : BGVecInst<v2i64>;
+ def r64 : BGRegInst<R64C>;
+ def r32 : BGRegInst<R32C>;
+}
+
+defm BG : BorrowGenerate;
+
+// BGX: Borrow generate, extended.
+def BGXvec:
+ RRForm<0b11000010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB,
+ VECREG:$rCarry),
+ "bgx\t$rT, $rA, $rB", IntegerOp,
+ []>,
+ RegConstraint<"$rCarry = $rT">,
+ NoEncode<"$rCarry">;
+
+// Halfword multiply variants:
+// N.B: These can be used to build up larger quantities (16x16 -> 32)
+
+def MPYv8i16:
+ RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "mpy\t$rT, $rA, $rB", IntegerMulDiv,
+ [/* no pattern */]>;
+
+def MPYr16:
+ RRForm<0b00100011110, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+ "mpy\t$rT, $rA, $rB", IntegerMulDiv,
+ [(set R16C:$rT, (mul R16C:$rA, R16C:$rB))]>;
+
+// Unsigned 16-bit multiply:
+
+class MPYUInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b00110011110, OOL, IOL,
+ "mpyu\t$rT, $rA, $rB", IntegerMulDiv,
+ pattern>;
+
+def MPYUv4i32:
+ MPYUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [/* no pattern */]>;
+
+def MPYUr16:
+ MPYUInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB),
+ [(set R32C:$rT, (mul (zext R16C:$rA), (zext R16C:$rB)))]>;
+
+def MPYUr32:
+ MPYUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
+// mpyi: multiply 16 x s10imm -> 32 result.
+
+class MPYIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI10Form<0b00101110, OOL, IOL,
+ "mpyi\t$rT, $rA, $val", IntegerMulDiv,
+ pattern>;
+
+def MPYIvec:
+ MPYIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v8i16 VECREG:$rT),
+ (mul (v8i16 VECREG:$rA), v8i16SExt10Imm:$val))]>;
+
+def MPYIr16:
+ MPYIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+ [(set R16C:$rT, (mul R16C:$rA, i16ImmSExt10:$val))]>;
+
+// mpyui: same issues as the other multiplies; additionally, this doesn't
+// match a pattern, but may be used during target DAG selection or lowering
+
+class MPYUIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI10Form<0b10101110, OOL, IOL,
+ "mpyui\t$rT, $rA, $val", IntegerMulDiv,
+ pattern>;
+
+def MPYUIvec:
+ MPYUIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ []>;
+
+def MPYUIr16:
+ MPYUIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+ []>;
+
+// mpya: 16 x 16 + 16 -> 32 bit result
+class MPYAInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRRForm<0b0011, OOL, IOL,
+ "mpya\t$rT, $rA, $rB, $rC", IntegerMulDiv,
+ pattern>;
+
+def MPYAv4i32:
+ MPYAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ [(set (v4i32 VECREG:$rT),
+ (add (v4i32 (bitconvert (mul (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))),
+ (v4i32 VECREG:$rC)))]>;
+
+def MPYAr32:
+ MPYAInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB, R32C:$rC),
+ [(set R32C:$rT, (add (sext (mul R16C:$rA, R16C:$rB)),
+ R32C:$rC))]>;
+
+def MPYAr32_sext:
+ MPYAInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB, R32C:$rC),
+ [(set R32C:$rT, (add (mul (sext R16C:$rA), (sext R16C:$rB)),
+ R32C:$rC))]>;
+
+def MPYAr32_sextinreg:
+ MPYAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB, R32C:$rC),
+ [(set R32C:$rT, (add (mul (sext_inreg R32C:$rA, i16),
+ (sext_inreg R32C:$rB, i16)),
+ R32C:$rC))]>;
+
+// mpyh: multiply high, used to synthesize 32-bit multiplies
+class MPYHInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10100011110, OOL, IOL,
+ "mpyh\t$rT, $rA, $rB", IntegerMulDiv,
+ pattern>;
+
+def MPYHv4i32:
+ MPYHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [/* no pattern */]>;
+
+def MPYHr32:
+ MPYHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
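+// A full 32-bit multiply is typically synthesized from these 16-bit parts,
+// e.g. (an illustrative sketch of the usual SPU idiom, not a pattern defined
+// here):
+//   mpyh $t0, $a, $b    ; (a_hi * b_lo) << 16
+//   mpyh $t1, $b, $a    ; (b_hi * a_lo) << 16
+//   mpyu $t2, $a, $b    ; a_lo * b_lo, unsigned
+//   a    $t0, $t0, $t1
+//   a    $rT, $t0, $t2
+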
+// mpys: multiply high and shift right (returns the top half of
+// a 16-bit multiply, sign extended to 32 bits)
+
+class MPYSInst<dag OOL, dag IOL>:
+ RRForm<0b11100011110, OOL, IOL,
+ "mpys\t$rT, $rA, $rB", IntegerMulDiv,
+ [/* no pattern */]>;
+
+def MPYSv4i32:
+ MPYSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYSr16:
+ MPYSInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB)>;
+
+// mpyhh: multiply high-high (returns the 32-bit result from multiplying
+// the top 16 bits of $rA and $rB)
+
+class MPYHHInst<dag OOL, dag IOL>:
+ RRForm<0b01100011110, OOL, IOL,
+ "mpyhh\t$rT, $rA, $rB", IntegerMulDiv,
+ [/* no pattern */]>;
+
+def MPYHHv8i16:
+ MPYHHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYHHr32:
+ MPYHHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
+
+// mpyhha: Multiply high-high, add to $rT:
+
+class MPYHHAInst<dag OOL, dag IOL>:
+ RRForm<0b01100010110, OOL, IOL,
+ "mpyhha\t$rT, $rA, $rB", IntegerMulDiv,
+ [/* no pattern */]>;
+
+def MPYHHAvec:
+ MPYHHAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYHHAr32:
+ MPYHHAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
+
+// mpyhhu: Multiply high-high, unsigned, e.g.:
+//
+// +-------+-------+ +-------+-------+ +---------+
+// | a0 . a1 | x | b0 . b1 | = | a0 x b0 |
+// +-------+-------+ +-------+-------+ +---------+
+//
+// where a0, b0 are the upper 16 bits of the 32-bit word
+
+class MPYHHUInst<dag OOL, dag IOL>:
+ RRForm<0b01110011110, OOL, IOL,
+ "mpyhhu\t$rT, $rA, $rB", IntegerMulDiv,
+ [/* no pattern */]>;
+
+def MPYHHUv4i32:
+ MPYHHUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYHHUr32:
+ MPYHHUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
+
+// mpyhhau: Multiply high-high, unsigned, add to $rT:
+
+class MPYHHAUInst<dag OOL, dag IOL>:
+ RRForm<0b01110010110, OOL, IOL,
+ "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv,
+ [/* no pattern */]>;
+
+def MPYHHAUvec:
+ MPYHHAUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYHHAUr32:
+ MPYHHAUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// clz: Count leading zeroes
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+class CLZInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b10100101010, OOL, IOL, "clz\t$rT, $rA",
+ IntegerOp, pattern>;
+
+class CLZRegInst<RegisterClass rclass>:
+ CLZInst<(outs rclass:$rT), (ins rclass:$rA),
+ [(set rclass:$rT, (ctlz rclass:$rA))]>;
+
+class CLZVecInst<ValueType vectype>:
+ CLZInst<(outs VECREG:$rT), (ins VECREG:$rA),
+ [(set (vectype VECREG:$rT), (ctlz (vectype VECREG:$rA)))]>;
+
+multiclass CountLeadingZeroes {
+ def v4i32 : CLZVecInst<v4i32>;
+ def r32 : CLZRegInst<R32C>;
+}
+
+defm CLZ : CountLeadingZeroes;
+
+// cntb: Count ones in bytes (aka "population count")
+//
+// NOTE: This instruction is really a vector instruction, but the custom
+// lowering code uses it in unorthodox ways to support CTPOP for other
+// data types!
+
+def CNTBv16i8:
+ RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA),
+ "cntb\t$rT, $rA", IntegerOp,
+ [(set (v16i8 VECREG:$rT), (SPUcntb (v16i8 VECREG:$rA)))]>;
+
+def CNTBv8i16 :
+ RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA),
+ "cntb\t$rT, $rA", IntegerOp,
+ [(set (v8i16 VECREG:$rT), (SPUcntb (v8i16 VECREG:$rA)))]>;
+
+def CNTBv4i32 :
+ RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA),
+ "cntb\t$rT, $rA", IntegerOp,
+ [(set (v4i32 VECREG:$rT), (SPUcntb (v4i32 VECREG:$rA)))]>;
+
+// gbb: Gather the low order bits from each byte in $rA into a single 16-bit
+// quantity stored into $rT's slot 0; the upper 16 bits are zeroed, as are
+// slots 1-3.
+//
+// Note: This instruction "pairs" with the fsmb instruction for all of the
+// various types defined here.
+//
+// Note 2: The "VecInst" and "RegInst" forms refer to the result being either
+// a vector or register.
+
+class GBBInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b01001101100, OOL, IOL, "gbb\t$rT, $rA", GatherOp, pattern>;
+
+class GBBRegInst<RegisterClass rclass, ValueType vectype>:
+ GBBInst<(outs rclass:$rT), (ins VECREG:$rA),
+ [/* no pattern */]>;
+
+class GBBVecInst<ValueType vectype>:
+ GBBInst<(outs VECREG:$rT), (ins VECREG:$rA),
+ [/* no pattern */]>;
+
+multiclass GatherBitsFromBytes {
+ def v16i8_r32: GBBRegInst<R32C, v16i8>;
+ def v16i8_r16: GBBRegInst<R16C, v16i8>;
+ def v16i8: GBBVecInst<v16i8>;
+}
+
+defm GBB: GatherBitsFromBytes;
+
+// gbh: Gather all low order bits from each halfword in $rA into a single
+// 8-bit quantity stored in $rT's slot 0, with the upper bits of $rT set to 0
+// and slots 1-3 also set to 0.
+//
+// See notes for GBBInst, above.
+
+class GBHInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b10001101100, OOL, IOL, "gbh\t$rT, $rA", GatherOp,
+ pattern>;
+
+class GBHRegInst<RegisterClass rclass, ValueType vectype>:
+ GBHInst<(outs rclass:$rT), (ins VECREG:$rA),
+ [/* no pattern */]>;
+
+class GBHVecInst<ValueType vectype>:
+ GBHInst<(outs VECREG:$rT), (ins VECREG:$rA),
+ [/* no pattern */]>;
+
+multiclass GatherBitsHalfword {
+ def v8i16_r32: GBHRegInst<R32C, v8i16>;
+ def v8i16_r16: GBHRegInst<R16C, v8i16>;
+ def v8i16: GBHVecInst<v8i16>;
+}
+
+defm GBH: GatherBitsHalfword;
+
+// gb: Gather all low order bits from each word in $rA into a single
+// 4-bit quantity stored in $rT's slot 0; the upper bits of $rT and
+// slots 1-3 are set to 0.
+//
+// See notes for gbb, above.
+
+class GBInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b00001101100, OOL, IOL, "gb\t$rT, $rA", GatherOp,
+ pattern>;
+
+class GBRegInst<RegisterClass rclass, ValueType vectype>:
+ GBInst<(outs rclass:$rT), (ins VECREG:$rA),
+ [/* no pattern */]>;
+
+class GBVecInst<ValueType vectype>:
+ GBInst<(outs VECREG:$rT), (ins VECREG:$rA),
+ [/* no pattern */]>;
+
+multiclass GatherBitsWord {
+ def v4i32_r32: GBRegInst<R32C, v4i32>;
+ def v4i32_r16: GBRegInst<R16C, v4i32>;
+ def v4i32: GBVecInst<v4i32>;
+}
+
+defm GB: GatherBitsWord;
+
+// avgb: average bytes
+def AVGB:
+ RRForm<0b11001011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "avgb\t$rT, $rA, $rB", ByteOp,
+ []>;
+
+// absdb: absolute difference of bytes
+def ABSDB:
+ RRForm<0b11001010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "absdb\t$rT, $rA, $rB", ByteOp,
+ []>;
+
+// sumb: sum bytes into halfwords
+def SUMB:
+ RRForm<0b11001010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "sumb\t$rT, $rA, $rB", ByteOp,
+ []>;
+
+// Sign extension operations:
+class XSBHInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b01101101010, OOL, IOL,
+ "xsbh\t$rDst, $rSrc",
+ IntegerOp, pattern>;
+
+class XSBHVecInst<ValueType vectype>:
+ XSBHInst<(outs VECREG:$rDst), (ins VECREG:$rSrc),
+ [(set (v8i16 VECREG:$rDst), (sext (vectype VECREG:$rSrc)))]>;
+
+class XSBHInRegInst<RegisterClass rclass, list<dag> pattern>:
+ XSBHInst<(outs rclass:$rDst), (ins rclass:$rSrc),
+ pattern>;
+
+multiclass ExtendByteHalfword {
+ def v16i8: XSBHVecInst<v8i16>;
+ def r8: XSBHInst<(outs R16C:$rDst), (ins R8C:$rSrc),
+ [(set R16C:$rDst, (sext R8C:$rSrc))]>;
+ def r16: XSBHInRegInst<R16C,
+ [(set R16C:$rDst, (sext_inreg R16C:$rSrc, i8))]>;
+
+  // 32-bit form of XSBH: used when sign extending 8-bit quantities all the
+  // way to 32-bit quantities via a 32-bit register (see the sext 8->32
+  // pattern below). Intentionally doesn't match a pattern, because we want
+  // the sext 8->32 pattern to do the work for us, namely because we need the
+  // extra XSHWr32.
+ def r32: XSBHInRegInst<R32C, [/* no pattern */]>;
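+
+  // Illustrative lowering (a sketch, not a pattern defined here): a
+  // "sext i8 -> i32" is expected to select as an "xsbh" followed by an
+  // "xshw", both operating within a 32-bit register.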
+
+ // Same as the 32-bit version, but for i64
+ def r64: XSBHInRegInst<R64C, [/* no pattern */]>;
+}
+
+defm XSBH : ExtendByteHalfword;
+
+// Sign extend halfwords to words:
+
+class XSHWInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b01101101010, OOL, IOL, "xshw\t$rDest, $rSrc",
+ IntegerOp, pattern>;
+
+class XSHWVecInst<ValueType in_vectype, ValueType out_vectype>:
+ XSHWInst<(outs VECREG:$rDest), (ins VECREG:$rSrc),
+ [(set (out_vectype VECREG:$rDest),
+ (sext (in_vectype VECREG:$rSrc)))]>;
+
+class XSHWInRegInst<RegisterClass rclass, list<dag> pattern>:
+ XSHWInst<(outs rclass:$rDest), (ins rclass:$rSrc),
+ pattern>;
+
+class XSHWRegInst<RegisterClass rclass>:
+ XSHWInst<(outs rclass:$rDest), (ins R16C:$rSrc),
+ [(set rclass:$rDest, (sext R16C:$rSrc))]>;
+
+multiclass ExtendHalfwordWord {
+ def v4i32: XSHWVecInst<v4i32, v8i16>;
+
+ def r16: XSHWRegInst<R32C>;
+
+ def r32: XSHWInRegInst<R32C,
+ [(set R32C:$rDest, (sext_inreg R32C:$rSrc, i16))]>;
+ def r64: XSHWInRegInst<R64C, [/* no pattern */]>;
+}
+
+defm XSHW : ExtendHalfwordWord;
+
+// Sign-extend words to doublewords (32->64 bits)
+
+class XSWDInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm_1<0b01100101010, OOL, IOL, "xswd\t$rDst, $rSrc",
+ IntegerOp, pattern>;
+
+class XSWDVecInst<ValueType in_vectype, ValueType out_vectype>:
+    XSWDInst<(outs VECREG:$rDst), (ins VECREG:$rSrc),
+             [(set (out_vectype VECREG:$rDst),
+                   (sext (in_vectype VECREG:$rSrc)))]>;
+
+class XSWDRegInst<RegisterClass in_rclass, RegisterClass out_rclass>:
+ XSWDInst<(outs out_rclass:$rDst), (ins in_rclass:$rSrc),
+ [(set out_rclass:$rDst, (sext in_rclass:$rSrc))]>;
+
+multiclass ExtendWordToDoubleWord {
+ def v2i64: XSWDVecInst<v4i32, v2i64>;
+ def r64: XSWDRegInst<R32C, R64C>;
+
+ def r64_inreg: XSWDInst<(outs R64C:$rDst), (ins R64C:$rSrc),
+ [(set R64C:$rDst, (sext_inreg R64C:$rSrc, i32))]>;
+}
+
+defm XSWD : ExtendWordToDoubleWord;
+
+// AND operations
+
+class ANDInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b10000011000, OOL, IOL, "and\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class ANDVecInst<ValueType vectype>:
+ ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT), (and (vectype VECREG:$rA),
+ (vectype VECREG:$rB)))]>;
+
+class ANDRegInst<RegisterClass rclass>:
+ ANDInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (and rclass:$rA, rclass:$rB))]>;
+
+multiclass BitwiseAnd
+{
+ def v16i8: ANDVecInst<v16i8>;
+ def v8i16: ANDVecInst<v8i16>;
+ def v4i32: ANDVecInst<v4i32>;
+ def v2i64: ANDVecInst<v2i64>;
+
+ def r128: ANDRegInst<GPRC>;
+ def r64: ANDRegInst<R64C>;
+ def r32: ANDRegInst<R32C>;
+ def r16: ANDRegInst<R16C>;
+ def r8: ANDRegInst<R8C>;
+
+ //===---------------------------------------------
+ // Special instructions to perform the fabs instruction
+ def fabs32: ANDInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
+ [/* Intentionally does not match a pattern */]>;
+
+ def fabs64: ANDInst<(outs R64FP:$rT), (ins R64FP:$rA, R64C:$rB),
+ [/* Intentionally does not match a pattern */]>;
+
+ def fabsvec: ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [/* Intentionally does not match a pattern */]>;
+
+ //===---------------------------------------------
+
+  // Hacked form of AND to zero-extend 16-bit quantities to 32-bit
+  // quantities -- see the 16->32 zext pattern.
+  //
+  // This pattern is somewhat artificial: the compiler might conceivably
+  // generate something that matches it, but it is unlikely to do so.
+
+ def i16i32: ANDInst<(outs R32C:$rT), (ins R16C:$rA, R32C:$rB),
+ [(set R32C:$rT, (and (zext R16C:$rA), R32C:$rB))]>;
+}
+
+defm AND : BitwiseAnd;
+
+// N.B.: vnot_conv is one of those special target selection pattern fragments,
+// in which we expect there to be a bit_convert on the constant. Bear in mind
+// that llvm translates "not <reg>" to "xor <reg>, -1" (or in this case, a
+// constant -1 vector.)
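+//
+// e.g. (illustrative LLVM IR): "%n = xor <4 x i32> %v, <i32 -1, i32 -1,
+// i32 -1, i32 -1>" is what a vector "not" looks like by the time it reaches
+// instruction selection.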
+
+class ANDCInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10000011010, OOL, IOL, "andc\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class ANDCVecInst<ValueType vectype, PatFrag vnot_frag = vnot>:
+ ANDCInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT),
+ (and (vectype VECREG:$rA),
+ (vnot_frag (vectype VECREG:$rB))))]>;
+
+class ANDCRegInst<RegisterClass rclass>:
+ ANDCInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (and rclass:$rA, (not rclass:$rB)))]>;
+
+multiclass AndComplement
+{
+ def v16i8: ANDCVecInst<v16i8>;
+ def v8i16: ANDCVecInst<v8i16>;
+ def v4i32: ANDCVecInst<v4i32>;
+ def v2i64: ANDCVecInst<v2i64>;
+
+ def r128: ANDCRegInst<GPRC>;
+ def r64: ANDCRegInst<R64C>;
+ def r32: ANDCRegInst<R32C>;
+ def r16: ANDCRegInst<R16C>;
+ def r8: ANDCRegInst<R8C>;
+
+ // Sometimes, the xor pattern has a bitcast constant:
+ def v16i8_conv: ANDCVecInst<v16i8, vnot_conv>;
+}
+
+defm ANDC : AndComplement;
+
+class ANDBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI10Form<0b01101000, OOL, IOL, "andbi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass AndByteImm
+{
+ def v16i8: ANDBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+ [(set (v16i8 VECREG:$rT),
+ (and (v16i8 VECREG:$rA),
+ (v16i8 v16i8U8Imm:$val)))]>;
+
+ def r8: ANDBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val),
+ [(set R8C:$rT, (and R8C:$rA, immU8:$val))]>;
+}
+
+defm ANDBI : AndByteImm;
+
+class ANDHIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b10101000, OOL, IOL, "andhi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass AndHalfwordImm
+{
+ def v8i16: ANDHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v8i16 VECREG:$rT),
+ (and (v8i16 VECREG:$rA), v8i16SExt10Imm:$val))]>;
+
+ def r16: ANDHIInst<(outs R16C:$rT), (ins R16C:$rA, u10imm:$val),
+ [(set R16C:$rT, (and R16C:$rA, i16ImmUns10:$val))]>;
+
+ // Zero-extend i8 to i16:
+ def i8i16: ANDHIInst<(outs R16C:$rT), (ins R8C:$rA, u10imm:$val),
+ [(set R16C:$rT, (and (zext R8C:$rA), i16ImmUns10:$val))]>;
+}
+
+defm ANDHI : AndHalfwordImm;
+
+class ANDIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b00101000, OOL, IOL, "andi\t$rT, $rA, $val",
+ IntegerOp, pattern>;
+
+multiclass AndWordImm
+{
+ def v4i32: ANDIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v4i32 VECREG:$rT),
+ (and (v4i32 VECREG:$rA), v4i32SExt10Imm:$val))]>;
+
+ def r32: ANDIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT, (and R32C:$rA, i32ImmSExt10:$val))]>;
+
+ // Hacked form of ANDI to zero-extend i8 quantities to i32. See the zext 8->32
+ // pattern below.
+ def i8i32: ANDIInst<(outs R32C:$rT), (ins R8C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT,
+ (and (zext R8C:$rA), i32ImmSExt10:$val))]>;
+
+ // Hacked form of ANDI to zero-extend i16 quantities to i32. See the
+ // zext 16->32 pattern below.
+ //
+ // Note that this pattern is somewhat artificial, since it might match
+ // something the compiler generates but is unlikely to occur in practice.
+ def i16i32: ANDIInst<(outs R32C:$rT), (ins R16C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT,
+ (and (zext R16C:$rA), i32ImmSExt10:$val))]>;
+}
+
+defm ANDI : AndWordImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Bitwise OR group:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// Bitwise "or" (N.B.: These are also register-register copy instructions...)
+class ORInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10000010000, OOL, IOL, "or\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class ORVecInst<ValueType vectype>:
+ ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+ (vectype VECREG:$rB)))]>;
+
+class ORRegInst<RegisterClass rclass>:
+ ORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (or rclass:$rA, rclass:$rB))]>;
+
+// ORCvtForm: OR conversion form
+//
+// This is used to "convert" the preferred slot to its vector equivalent, as
+// well as convert a vector back to its preferred slot.
+//
+// These are effectively no-ops, but need to exist for proper type conversion
+// and type coercion.
+
+class ORCvtForm<dag OOL, dag IOL, list<dag> pattern = [/* no pattern */]>
+ : SPUInstr<OOL, IOL, "or\t$rT, $rA, $rA", IntegerOp> {
+ bits<7> RA;
+ bits<7> RT;
+
+ let Pattern = pattern;
+
+ let Inst{0-10} = 0b10000010000;
+ let Inst{11-17} = RA;
+ let Inst{18-24} = RA;
+ let Inst{25-31} = RT;
+}
+
+class ORPromoteScalar<RegisterClass rclass>:
+ ORCvtForm<(outs VECREG:$rT), (ins rclass:$rA)>;
+
+class ORExtractElt<RegisterClass rclass>:
+ ORCvtForm<(outs rclass:$rT), (ins VECREG:$rA)>;
+
+/* class ORCvtRegGPRC<RegisterClass rclass>:
+ ORCvtForm<(outs GPRC:$rT), (ins rclass:$rA)>; */
+
+/* class ORCvtGPRCReg<RegisterClass rclass>:
+ ORCvtForm<(outs rclass:$rT), (ins GPRC:$rA)>; */
+
+class ORCvtFormR32Reg<RegisterClass rclass, list<dag> pattern = [ ]>:
+ ORCvtForm<(outs rclass:$rT), (ins R32C:$rA), pattern>;
+
+class ORCvtFormRegR32<RegisterClass rclass, list<dag> pattern = [ ]>:
+ ORCvtForm<(outs R32C:$rT), (ins rclass:$rA), pattern>;
+
+class ORCvtFormR64Reg<RegisterClass rclass, list<dag> pattern = [ ]>:
+ ORCvtForm<(outs rclass:$rT), (ins R64C:$rA), pattern>;
+
+class ORCvtFormRegR64<RegisterClass rclass, list<dag> pattern = [ ]>:
+ ORCvtForm<(outs R64C:$rT), (ins rclass:$rA), pattern>;
+
+class ORCvtGPRCVec:
+ ORCvtForm<(outs VECREG:$rT), (ins GPRC:$rA)>;
+
+class ORCvtVecGPRC:
+ ORCvtForm<(outs GPRC:$rT), (ins VECREG:$rA)>;
+
+multiclass BitwiseOr
+{
+ def v16i8: ORVecInst<v16i8>;
+ def v8i16: ORVecInst<v8i16>;
+ def v4i32: ORVecInst<v4i32>;
+ def v2i64: ORVecInst<v2i64>;
+
+ def v4f32: ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (v4f32 VECREG:$rT),
+ (v4f32 (bitconvert (or (v4i32 VECREG:$rA),
+ (v4i32 VECREG:$rB)))))]>;
+
+ def v2f64: ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (v2f64 VECREG:$rT),
+ (v2f64 (bitconvert (or (v2i64 VECREG:$rA),
+ (v2i64 VECREG:$rB)))))]>;
+
+ def r128: ORRegInst<GPRC>;
+ def r64: ORRegInst<R64C>;
+ def r32: ORRegInst<R32C>;
+ def r16: ORRegInst<R16C>;
+ def r8: ORRegInst<R8C>;
+
+ // OR instructions used to copy f32 and f64 registers.
+ def f32: ORInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+ [/* no pattern */]>;
+
+ def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
+ [/* no pattern */]>;
+
+ // scalar->vector promotion, prefslot2vec:
+ def v16i8_i8: ORPromoteScalar<R8C>;
+ def v8i16_i16: ORPromoteScalar<R16C>;
+ def v4i32_i32: ORPromoteScalar<R32C>;
+ def v2i64_i64: ORPromoteScalar<R64C>;
+ def v4f32_f32: ORPromoteScalar<R32FP>;
+ def v2f64_f64: ORPromoteScalar<R64FP>;
+
+ // vector->scalar demotion, vec2prefslot:
+ def i8_v16i8: ORExtractElt<R8C>;
+ def i16_v8i16: ORExtractElt<R16C>;
+ def i32_v4i32: ORExtractElt<R32C>;
+ def i64_v2i64: ORExtractElt<R64C>;
+ def f32_v4f32: ORExtractElt<R32FP>;
+ def f64_v2f64: ORExtractElt<R64FP>;
+
+ // Conversion from vector to GPRC
+ def i128_vec: ORCvtVecGPRC;
+
+ // Conversion from GPRC to vector
+ def vec_i128: ORCvtGPRCVec;
+
+/*
+ // Conversion from register to GPRC
+ def i128_r64: ORCvtRegGPRC<R64C>;
+ def i128_f64: ORCvtRegGPRC<R64FP>;
+ def i128_r32: ORCvtRegGPRC<R32C>;
+ def i128_f32: ORCvtRegGPRC<R32FP>;
+ def i128_r16: ORCvtRegGPRC<R16C>;
+ def i128_r8: ORCvtRegGPRC<R8C>;
+
+ // Conversion from GPRC to register
+ def r64_i128: ORCvtGPRCReg<R64C>;
+ def f64_i128: ORCvtGPRCReg<R64FP>;
+ def r32_i128: ORCvtGPRCReg<R32C>;
+ def f32_i128: ORCvtGPRCReg<R32FP>;
+ def r16_i128: ORCvtGPRCReg<R16C>;
+ def r8_i128: ORCvtGPRCReg<R8C>;
+*/
+/*
+ // Conversion from register to R32C:
+ def r32_r16: ORCvtFormRegR32<R16C>;
+ def r32_r8: ORCvtFormRegR32<R8C>;
+
+ // Conversion from R32C to register
+ def r32_r16: ORCvtFormR32Reg<R16C>;
+ def r32_r8: ORCvtFormR32Reg<R8C>;
+*/
+
+ // Conversion from R64C to register:
+ def r32_r64: ORCvtFormR64Reg<R32C>;
+ // def r16_r64: ORCvtFormR64Reg<R16C>;
+ // def r8_r64: ORCvtFormR64Reg<R8C>;
+
+ // Conversion to R64C from register:
+ def r64_r32: ORCvtFormRegR64<R32C>;
+ // def r64_r16: ORCvtFormRegR64<R16C>;
+ // def r64_r8: ORCvtFormRegR64<R8C>;
+
+ // bitconvert patterns:
+ def r32_f32: ORCvtFormR32Reg<R32FP,
+ [(set R32FP:$rT, (bitconvert R32C:$rA))]>;
+ def f32_r32: ORCvtFormRegR32<R32FP,
+ [(set R32C:$rT, (bitconvert R32FP:$rA))]>;
+
+ def r64_f64: ORCvtFormR64Reg<R64FP,
+ [(set R64FP:$rT, (bitconvert R64C:$rA))]>;
+ def f64_r64: ORCvtFormRegR64<R64FP,
+ [(set R64C:$rT, (bitconvert R64FP:$rA))]>;
+}
+
+defm OR : BitwiseOr;
+
+// scalar->vector promotion patterns (preferred slot to vector):
+def : Pat<(v16i8 (SPUprefslot2vec R8C:$rA)),
+ (ORv16i8_i8 R8C:$rA)>;
+
+def : Pat<(v8i16 (SPUprefslot2vec R16C:$rA)),
+ (ORv8i16_i16 R16C:$rA)>;
+
+def : Pat<(v4i32 (SPUprefslot2vec R32C:$rA)),
+ (ORv4i32_i32 R32C:$rA)>;
+
+def : Pat<(v2i64 (SPUprefslot2vec R64C:$rA)),
+ (ORv2i64_i64 R64C:$rA)>;
+
+def : Pat<(v4f32 (SPUprefslot2vec R32FP:$rA)),
+ (ORv4f32_f32 R32FP:$rA)>;
+
+def : Pat<(v2f64 (SPUprefslot2vec R64FP:$rA)),
+ (ORv2f64_f64 R64FP:$rA)>;
+
+// ORi*_v*: Used to extract vector element 0 (the preferred slot), otherwise
+// known as converting the vector back to its preferred slot
+
+def : Pat<(SPUvec2prefslot (v16i8 VECREG:$rA)),
+ (ORi8_v16i8 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v8i16 VECREG:$rA)),
+ (ORi16_v8i16 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v4i32 VECREG:$rA)),
+ (ORi32_v4i32 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v2i64 VECREG:$rA)),
+ (ORi64_v2i64 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v4f32 VECREG:$rA)),
+ (ORf32_v4f32 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v2f64 VECREG:$rA)),
+ (ORf64_v2f64 VECREG:$rA)>;
+
+// Load Register: This is an assembler alias for a bitwise OR of a register
+// against itself. It's here because it brings some clarity to assembly
+// language output.
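+//
+// e.g. (illustrative): "lr $3, $4" assembles to exactly the same bits as
+// "or $3, $4, $4" (note the identical Inst{} encoding below).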
+
+let hasCtrlDep = 1 in {
+ class LRInst<dag OOL, dag IOL>
+ : SPUInstr<OOL, IOL, "lr\t$rT, $rA", IntegerOp> {
+ bits<7> RA;
+ bits<7> RT;
+
+ let Pattern = [/*no pattern*/];
+
+ let Inst{0-10} = 0b10000010000; /* It's an OR operation */
+ let Inst{11-17} = RA;
+ let Inst{18-24} = RA;
+ let Inst{25-31} = RT;
+ }
+
+ class LRVecInst<ValueType vectype>:
+ LRInst<(outs VECREG:$rT), (ins VECREG:$rA)>;
+
+ class LRRegInst<RegisterClass rclass>:
+ LRInst<(outs rclass:$rT), (ins rclass:$rA)>;
+
+ multiclass LoadRegister {
+ def v2i64: LRVecInst<v2i64>;
+ def v2f64: LRVecInst<v2f64>;
+ def v4i32: LRVecInst<v4i32>;
+ def v4f32: LRVecInst<v4f32>;
+ def v8i16: LRVecInst<v8i16>;
+ def v16i8: LRVecInst<v16i8>;
+
+ def r128: LRRegInst<GPRC>;
+ def r64: LRRegInst<R64C>;
+ def f64: LRRegInst<R64FP>;
+ def r32: LRRegInst<R32C>;
+ def f32: LRRegInst<R32FP>;
+ def r16: LRRegInst<R16C>;
+ def r8: LRRegInst<R8C>;
+ }
+
+ defm LR: LoadRegister;
+}
+
+// ORC: Bitwise "or" with complement (c = a | ~b)
+
+class ORCInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10010010000, OOL, IOL, "orc\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class ORCVecInst<ValueType vectype>:
+ ORCInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+ (vnot (vectype VECREG:$rB))))]>;
+
+class ORCRegInst<RegisterClass rclass>:
+ ORCInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (or rclass:$rA, (not rclass:$rB)))]>;
+
+multiclass BitwiseOrComplement
+{
+ def v16i8: ORCVecInst<v16i8>;
+ def v8i16: ORCVecInst<v8i16>;
+ def v4i32: ORCVecInst<v4i32>;
+ def v2i64: ORCVecInst<v2i64>;
+
+ def r128: ORCRegInst<GPRC>;
+ def r64: ORCRegInst<R64C>;
+ def r32: ORCRegInst<R32C>;
+ def r16: ORCRegInst<R16C>;
+ def r8: ORCRegInst<R8C>;
+}
+
+defm ORC : BitwiseOrComplement;
+
+// OR byte immediate
+class ORBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI10Form<0b01100000, OOL, IOL, "orbi\t$rT, $rA, $val",
+ IntegerOp, pattern>;
+
+class ORBIVecInst<ValueType vectype, PatLeaf immpred>:
+ ORBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+ [(set (v16i8 VECREG:$rT), (or (vectype VECREG:$rA),
+ (vectype immpred:$val)))]>;
+
+multiclass BitwiseOrByteImm
+{
+ def v16i8: ORBIVecInst<v16i8, v16i8U8Imm>;
+
+ def r8: ORBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val),
+ [(set R8C:$rT, (or R8C:$rA, immU8:$val))]>;
+}
+
+defm ORBI : BitwiseOrByteImm;
+
+// OR halfword immediate
+class ORHIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI10Form<0b10100000, OOL, IOL, "orhi\t$rT, $rA, $val",
+ IntegerOp, pattern>;
+
+class ORHIVecInst<ValueType vectype, PatLeaf immpred>:
+ ORHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+ [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+ immpred:$val))]>;
+
+multiclass BitwiseOrHalfwordImm
+{
+ def v8i16: ORHIVecInst<v8i16, v8i16Uns10Imm>;
+
+ def r16: ORHIInst<(outs R16C:$rT), (ins R16C:$rA, u10imm:$val),
+ [(set R16C:$rT, (or R16C:$rA, i16ImmUns10:$val))]>;
+
+ // Specialized ORHI form used to promote 8-bit registers to 16-bit
+ def i8i16: ORHIInst<(outs R16C:$rT), (ins R8C:$rA, s10imm:$val),
+ [(set R16C:$rT, (or (anyext R8C:$rA),
+ i16ImmSExt10:$val))]>;
+}
+
+defm ORHI : BitwiseOrHalfwordImm;
+
+class ORIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI10Form<0b00100000, OOL, IOL, "ori\t$rT, $rA, $val",
+ IntegerOp, pattern>;
+
+class ORIVecInst<ValueType vectype, PatLeaf immpred>:
+ ORIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+ [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+ immpred:$val))]>;
+
+// Bitwise "or" with immediate
+multiclass BitwiseOrImm
+{
+ def v4i32: ORIVecInst<v4i32, v4i32Uns10Imm>;
+
+ def r32: ORIInst<(outs R32C:$rT), (ins R32C:$rA, u10imm_i32:$val),
+ [(set R32C:$rT, (or R32C:$rA, i32ImmUns10:$val))]>;
+
+  // i16i32: hacked version of the ori instruction to extend 16-bit quantities
+  // to 32-bit quantities. Used exclusively to match "anyext" conversions (see
+  // the "anyext 16->32" pattern below.)
+ def i16i32: ORIInst<(outs R32C:$rT), (ins R16C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT, (or (anyext R16C:$rA),
+ i32ImmSExt10:$val))]>;
+
+  // i8i32: hacked version of the ORI instruction to extend 8-bit quantities
+  // to 32-bit quantities. Used exclusively to match "anyext" conversions (see
+  // the corresponding anyext pattern below.)
+ def i8i32: ORIInst<(outs R32C:$rT), (ins R8C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT, (or (anyext R8C:$rA),
+ i32ImmSExt10:$val))]>;
+}
+
+defm ORI : BitwiseOrImm;
+
+// ORX: "or" across the vector: ORs $rA's word slots together, leaving the
+// result in $rT[0]; slots 1-3 are zeroed.
+//
+// FIXME: Needs to match an intrinsic pattern.
+def ORXv4i32:
+ RRForm<0b10010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "orx\t$rT, $rA, $rB", IntegerOp,
+ []>;
+
+// XOR:
+
+class XORInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b10010010000, OOL, IOL, "xor\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class XORVecInst<ValueType vectype>:
+ XORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT), (xor (vectype VECREG:$rA),
+ (vectype VECREG:$rB)))]>;
+
+class XORRegInst<RegisterClass rclass>:
+ XORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (xor rclass:$rA, rclass:$rB))]>;
+
+multiclass BitwiseExclusiveOr
+{
+ def v16i8: XORVecInst<v16i8>;
+ def v8i16: XORVecInst<v8i16>;
+ def v4i32: XORVecInst<v4i32>;
+ def v2i64: XORVecInst<v2i64>;
+
+ def r128: XORRegInst<GPRC>;
+ def r64: XORRegInst<R64C>;
+ def r32: XORRegInst<R32C>;
+ def r16: XORRegInst<R16C>;
+ def r8: XORRegInst<R8C>;
+
+ // XOR instructions used to negate f32 and f64 quantities.
+
+ def fneg32: XORInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
+ def fneg64: XORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64C:$rB),
+ [/* no pattern */]>;
+
+ def fnegvec: XORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [/* no pattern, see fneg{32,64} */]>;
+}
+
+defm XOR : BitwiseExclusiveOr;
+
+//==----------------------------------------------------------
+
+class XORBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI10Form<0b01100000, OOL, IOL, "xorbi\t$rT, $rA, $val",
+ IntegerOp, pattern>;
+
+multiclass XorByteImm
+{
+ def v16i8:
+ XORBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+ [(set (v16i8 VECREG:$rT), (xor (v16i8 VECREG:$rA), v16i8U8Imm:$val))]>;
+
+ def r8:
+ XORBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val),
+ [(set R8C:$rT, (xor R8C:$rA, immU8:$val))]>;
+}
+
+defm XORBI : XorByteImm;
+
+def XORHIv8i16:
+ RI10Form<0b10100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+ "xorhi\t$rT, $rA, $val", IntegerOp,
+ [(set (v8i16 VECREG:$rT), (xor (v8i16 VECREG:$rA),
+ v8i16SExt10Imm:$val))]>;
+
+def XORHIr16:
+ RI10Form<0b10100000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+ "xorhi\t$rT, $rA, $val", IntegerOp,
+ [(set R16C:$rT, (xor R16C:$rA, i16ImmSExt10:$val))]>;
+
+def XORIv4i32:
+ RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm_i32:$val),
+ "xori\t$rT, $rA, $val", IntegerOp,
+ [(set (v4i32 VECREG:$rT), (xor (v4i32 VECREG:$rA),
+ v4i32SExt10Imm:$val))]>;
+
+def XORIr32:
+ RI10Form<0b00100000, (outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+ "xori\t$rT, $rA, $val", IntegerOp,
+ [(set R32C:$rT, (xor R32C:$rA, i32ImmSExt10:$val))]>;
+
+// NAND:
+
+class NANDInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10010011000, OOL, IOL, "nand\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class NANDVecInst<ValueType vectype>:
+ NANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT), (vnot (and (vectype VECREG:$rA),
+ (vectype VECREG:$rB))))]>;
+class NANDRegInst<RegisterClass rclass>:
+ NANDInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (not (and rclass:$rA, rclass:$rB)))]>;
+
+multiclass BitwiseNand
+{
+ def v16i8: NANDVecInst<v16i8>;
+ def v8i16: NANDVecInst<v8i16>;
+ def v4i32: NANDVecInst<v4i32>;
+ def v2i64: NANDVecInst<v2i64>;
+
+ def r128: NANDRegInst<GPRC>;
+ def r64: NANDRegInst<R64C>;
+ def r32: NANDRegInst<R32C>;
+ def r16: NANDRegInst<R16C>;
+ def r8: NANDRegInst<R8C>;
+}
+
+defm NAND : BitwiseNand;
+
+// NOR:
+
+class NORInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10010010000, OOL, IOL, "nor\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class NORVecInst<ValueType vectype>:
+ NORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT), (vnot (or (vectype VECREG:$rA),
+ (vectype VECREG:$rB))))]>;
+class NORRegInst<RegisterClass rclass>:
+ NORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (not (or rclass:$rA, rclass:$rB)))]>;
+
+multiclass BitwiseNor
+{
+ def v16i8: NORVecInst<v16i8>;
+ def v8i16: NORVecInst<v8i16>;
+ def v4i32: NORVecInst<v4i32>;
+ def v2i64: NORVecInst<v2i64>;
+
+ def r128: NORRegInst<GPRC>;
+ def r64: NORRegInst<R64C>;
+ def r32: NORRegInst<R32C>;
+ def r16: NORRegInst<R16C>;
+ def r8: NORRegInst<R8C>;
+}
+
+defm NOR : BitwiseNor;
+
+// Select bits:
+class SELBInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRRForm<0b1000, OOL, IOL, "selb\t$rT, $rA, $rB, $rC",
+ IntegerOp, pattern>;
+
+class SELBVecInst<ValueType vectype, PatFrag vnot_frag = vnot>:
+ SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ [(set (vectype VECREG:$rT),
+ (or (and (vectype VECREG:$rC), (vectype VECREG:$rB)),
+ (and (vnot_frag (vectype VECREG:$rC)),
+ (vectype VECREG:$rA))))]>;
+
+class SELBVecVCondInst<ValueType vectype>:
+ SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ [(set (vectype VECREG:$rT),
+ (select (vectype VECREG:$rC),
+ (vectype VECREG:$rB),
+ (vectype VECREG:$rA)))]>;
+
+class SELBVecCondInst<ValueType vectype>:
+ SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, R32C:$rC),
+ [(set (vectype VECREG:$rT),
+ (select R32C:$rC,
+ (vectype VECREG:$rB),
+ (vectype VECREG:$rA)))]>;
+
+class SELBRegInst<RegisterClass rclass>:
+ SELBInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB, rclass:$rC),
+ [(set rclass:$rT,
+ (or (and rclass:$rB, rclass:$rC),
+ (and rclass:$rA, (not rclass:$rC))))]>;
+
+class SELBRegCondInst<RegisterClass rcond, RegisterClass rclass>:
+ SELBInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB, rcond:$rC),
+ [(set rclass:$rT,
+ (select rcond:$rC, rclass:$rB, rclass:$rA))]>;
+
+multiclass SelectBits
+{
+ def v16i8: SELBVecInst<v16i8>;
+ def v8i16: SELBVecInst<v8i16>;
+ def v4i32: SELBVecInst<v4i32>;
+ def v2i64: SELBVecInst<v2i64, vnot_conv>;
+
+ def r128: SELBRegInst<GPRC>;
+ def r64: SELBRegInst<R64C>;
+ def r32: SELBRegInst<R32C>;
+ def r16: SELBRegInst<R16C>;
+ def r8: SELBRegInst<R8C>;
+
+ def v16i8_cond: SELBVecCondInst<v16i8>;
+ def v8i16_cond: SELBVecCondInst<v8i16>;
+ def v4i32_cond: SELBVecCondInst<v4i32>;
+ def v2i64_cond: SELBVecCondInst<v2i64>;
+
+  def v16i8_vcond: SELBVecVCondInst<v16i8>;
+  def v8i16_vcond: SELBVecVCondInst<v8i16>;
+  def v4i32_vcond: SELBVecVCondInst<v4i32>;
+  def v2i64_vcond: SELBVecVCondInst<v2i64>;
+
+ def v4f32_cond:
+ SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ [(set (v4f32 VECREG:$rT),
+ (select (v4i32 VECREG:$rC),
+ (v4f32 VECREG:$rB),
+ (v4f32 VECREG:$rA)))]>;
+
+ // SELBr64_cond is defined in SPU64InstrInfo.td
+ def r32_cond: SELBRegCondInst<R32C, R32C>;
+ def f32_cond: SELBRegCondInst<R32C, R32FP>;
+ def r16_cond: SELBRegCondInst<R16C, R16C>;
+ def r8_cond: SELBRegCondInst<R8C, R8C>;
+}
+
+defm SELB : SelectBits;
+
+class SPUselbPatVec<ValueType vectype, SPUInstr inst>:
+ Pat<(SPUselb (vectype VECREG:$rA), (vectype VECREG:$rB), (vectype VECREG:$rC)),
+ (inst VECREG:$rA, VECREG:$rB, VECREG:$rC)>;
+
+def : SPUselbPatVec<v16i8, SELBv16i8>;
+def : SPUselbPatVec<v8i16, SELBv8i16>;
+def : SPUselbPatVec<v4i32, SELBv4i32>;
+def : SPUselbPatVec<v2i64, SELBv2i64>;
+
+class SPUselbPatReg<RegisterClass rclass, SPUInstr inst>:
+ Pat<(SPUselb rclass:$rA, rclass:$rB, rclass:$rC),
+ (inst rclass:$rA, rclass:$rB, rclass:$rC)>;
+
+def : SPUselbPatReg<R8C, SELBr8>;
+def : SPUselbPatReg<R16C, SELBr16>;
+def : SPUselbPatReg<R32C, SELBr32>;
+def : SPUselbPatReg<R64C, SELBr64>;
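+
+// SELB computes a bitwise select, $rT = ($rB & $rC) | ($rA & ~$rC): wherever
+// a $rC bit is 1 the result bit comes from $rB, otherwise from $rA. The
+// select conditional patterns further below rely on this identity.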
+
+// EQV: Equivalence (each result bit is 1 where the corresponding bits of
+// $rA and $rB are equal, otherwise 0)
+//
+// Note: There are a lot of ways to match this bit operator and these patterns
+// attempt to be as exhaustive as possible.
+
+class EQVInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10010010000, OOL, IOL, "eqv\t$rT, $rA, $rB",
+ IntegerOp, pattern>;
+
+class EQVVecInst<ValueType vectype>:
+ EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT),
+ (or (and (vectype VECREG:$rA), (vectype VECREG:$rB)),
+ (and (vnot (vectype VECREG:$rA)),
+ (vnot (vectype VECREG:$rB)))))]>;
+
+class EQVRegInst<RegisterClass rclass>:
+ EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (or (and rclass:$rA, rclass:$rB),
+ (and (not rclass:$rA), (not rclass:$rB))))]>;
+
+class EQVVecPattern1<ValueType vectype>:
+ EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT),
+ (xor (vectype VECREG:$rA), (vnot (vectype VECREG:$rB))))]>;
+
+class EQVRegPattern1<RegisterClass rclass>:
+ EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (xor rclass:$rA, (not rclass:$rB)))]>;
+
+class EQVVecPattern2<ValueType vectype>:
+ EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT),
+ (or (and (vectype VECREG:$rA), (vectype VECREG:$rB)),
+ (vnot (or (vectype VECREG:$rA), (vectype VECREG:$rB)))))]>;
+
+class EQVRegPattern2<RegisterClass rclass>:
+ EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT,
+ (or (and rclass:$rA, rclass:$rB),
+ (not (or rclass:$rA, rclass:$rB))))]>;
+
+class EQVVecPattern3<ValueType vectype>:
+ EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT),
+ (not (xor (vectype VECREG:$rA), (vectype VECREG:$rB))))]>;
+
+class EQVRegPattern3<RegisterClass rclass>:
+ EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (not (xor rclass:$rA, rclass:$rB)))]>;
+
+multiclass BitEquivalence
+{
+ def v16i8: EQVVecInst<v16i8>;
+ def v8i16: EQVVecInst<v8i16>;
+ def v4i32: EQVVecInst<v4i32>;
+ def v2i64: EQVVecInst<v2i64>;
+
+ def v16i8_1: EQVVecPattern1<v16i8>;
+ def v8i16_1: EQVVecPattern1<v8i16>;
+ def v4i32_1: EQVVecPattern1<v4i32>;
+ def v2i64_1: EQVVecPattern1<v2i64>;
+
+ def v16i8_2: EQVVecPattern2<v16i8>;
+ def v8i16_2: EQVVecPattern2<v8i16>;
+ def v4i32_2: EQVVecPattern2<v4i32>;
+ def v2i64_2: EQVVecPattern2<v2i64>;
+
+ def v16i8_3: EQVVecPattern3<v16i8>;
+ def v8i16_3: EQVVecPattern3<v8i16>;
+ def v4i32_3: EQVVecPattern3<v4i32>;
+ def v2i64_3: EQVVecPattern3<v2i64>;
+
+ def r128: EQVRegInst<GPRC>;
+ def r64: EQVRegInst<R64C>;
+ def r32: EQVRegInst<R32C>;
+ def r16: EQVRegInst<R16C>;
+ def r8: EQVRegInst<R8C>;
+
+ def r128_1: EQVRegPattern1<GPRC>;
+ def r64_1: EQVRegPattern1<R64C>;
+ def r32_1: EQVRegPattern1<R32C>;
+ def r16_1: EQVRegPattern1<R16C>;
+ def r8_1: EQVRegPattern1<R8C>;
+
+ def r128_2: EQVRegPattern2<GPRC>;
+ def r64_2: EQVRegPattern2<R64C>;
+ def r32_2: EQVRegPattern2<R32C>;
+ def r16_2: EQVRegPattern2<R16C>;
+ def r8_2: EQVRegPattern2<R8C>;
+
+ def r128_3: EQVRegPattern3<GPRC>;
+ def r64_3: EQVRegPattern3<R64C>;
+ def r32_3: EQVRegPattern3<R32C>;
+ def r16_3: EQVRegPattern3<R16C>;
+ def r8_3: EQVRegPattern3<R8C>;
+}
+
+defm EQV: BitEquivalence;
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle...
+//===----------------------------------------------------------------------===//
+// SPUshuffle is generated in LowerVECTOR_SHUFFLE and gets replaced with SHUFB.
+// See the SPUshuffle SDNode operand above, which sets up the DAG pattern
+// matcher to emit something when the LowerVECTOR_SHUFFLE generates a node with
+// the SPUISD::SHUFB opcode.
+//===----------------------------------------------------------------------===//
+
+class SHUFBInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRRForm<0b1000, OOL, IOL, "shufb\t$rT, $rA, $rB, $rC",
+ IntegerOp, pattern>;
+
+class SHUFBVecInst<ValueType resultvec, ValueType maskvec>:
+ SHUFBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ [(set (resultvec VECREG:$rT),
+ (SPUshuffle (resultvec VECREG:$rA),
+ (resultvec VECREG:$rB),
+ (maskvec VECREG:$rC)))]>;
+
+class SHUFBGPRCInst:
+ SHUFBInst<(outs VECREG:$rT), (ins GPRC:$rA, GPRC:$rB, VECREG:$rC),
+ [/* no pattern */]>;
+
+multiclass ShuffleBytes
+{
+ def v16i8 : SHUFBVecInst<v16i8, v16i8>;
+ def v16i8_m32 : SHUFBVecInst<v16i8, v4i32>;
+ def v8i16 : SHUFBVecInst<v8i16, v16i8>;
+ def v8i16_m32 : SHUFBVecInst<v8i16, v4i32>;
+ def v4i32 : SHUFBVecInst<v4i32, v16i8>;
+ def v4i32_m32 : SHUFBVecInst<v4i32, v4i32>;
+ def v2i64 : SHUFBVecInst<v2i64, v16i8>;
+ def v2i64_m32 : SHUFBVecInst<v2i64, v4i32>;
+
+ def v4f32 : SHUFBVecInst<v4f32, v16i8>;
+ def v4f32_m32 : SHUFBVecInst<v4f32, v4i32>;
+
+ def v2f64 : SHUFBVecInst<v2f64, v16i8>;
+ def v2f64_m32 : SHUFBVecInst<v2f64, v4i32>;
+
+ def gprc : SHUFBGPRCInst;
+}
+
+defm SHUFB : ShuffleBytes;
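+
+// Sketch of SHUFB's semantics (assuming the usual SPU behavior): each byte of
+// the control vector $rC selects one byte of the 32-byte concatenation
+// $rA:$rB (e.g. control byte 0x13 picks byte 3 of $rB); certain high
+// control-byte values instead produce the constant bytes 0x00, 0xff or 0x80.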
+
+//===----------------------------------------------------------------------===//
+// Shift and rotate group:
+//===----------------------------------------------------------------------===//
+
+class SHLHInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b11111010000, OOL, IOL, "shlh\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class SHLHVecInst<ValueType vectype>:
+ SHLHInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB),
+ [(set (vectype VECREG:$rT),
+ (SPUvec_shl (vectype VECREG:$rA), R16C:$rB))]>;
+
+multiclass ShiftLeftHalfword
+{
+ def v8i16: SHLHVecInst<v8i16>;
+ def r16: SHLHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+ [(set R16C:$rT, (shl R16C:$rA, R16C:$rB))]>;
+ def r16_r32: SHLHInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB),
+ [(set R16C:$rT, (shl R16C:$rA, R32C:$rB))]>;
+}
+
+defm SHLH : ShiftLeftHalfword;
+
+//===----------------------------------------------------------------------===//
+
+class SHLHIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b11111010000, OOL, IOL, "shlhi\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class SHLHIVecInst<ValueType vectype>:
+ SHLHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val),
+ [(set (vectype VECREG:$rT),
+ (SPUvec_shl (vectype VECREG:$rA), (i16 uimm7:$val)))]>;
+
+multiclass ShiftLeftHalfwordImm
+{
+ def v8i16: SHLHIVecInst<v8i16>;
+ def r16: SHLHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm:$val),
+ [(set R16C:$rT, (shl R16C:$rA, (i16 uimm7:$val)))]>;
+}
+
+defm SHLHI : ShiftLeftHalfwordImm;
+
+def : Pat<(SPUvec_shl (v8i16 VECREG:$rA), (i32 uimm7:$val)),
+ (SHLHIv8i16 VECREG:$rA, uimm7:$val)>;
+
+def : Pat<(shl R16C:$rA, (i32 uimm7:$val)),
+ (SHLHIr16 R16C:$rA, uimm7:$val)>;
+
+//===----------------------------------------------------------------------===//
+
+class SHLInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b11111010000, OOL, IOL, "shl\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+multiclass ShiftLeftWord
+{
+ def v4i32:
+ SHLInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB),
+ [(set (v4i32 VECREG:$rT),
+ (SPUvec_shl (v4i32 VECREG:$rA), R16C:$rB))]>;
+ def r32:
+ SHLInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ [(set R32C:$rT, (shl R32C:$rA, R32C:$rB))]>;
+}
+
+defm SHL: ShiftLeftWord;
+
+//===----------------------------------------------------------------------===//
+
+class SHLIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b11111010000, OOL, IOL, "shli\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+multiclass ShiftLeftWordImm
+{
+ def v4i32:
+ SHLIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val),
+ [(set (v4i32 VECREG:$rT),
+ (SPUvec_shl (v4i32 VECREG:$rA), (i32 uimm7:$val)))]>;
+
+ def r32:
+ SHLIInst<(outs R32C:$rT), (ins R32C:$rA, u7imm_i32:$val),
+ [(set R32C:$rT, (shl R32C:$rA, (i32 uimm7:$val)))]>;
+}
+
+defm SHLI : ShiftLeftWordImm;
+
+//===----------------------------------------------------------------------===//
+// SHLQBI vec form: Note that this will shift the entire vector (the 128-bit
+// register) to the left. Vector form is here to ensure type correctness.
+//
+// The shift count is in the lowest 3 bits (bits 29-31) of $rB, so only bit
+// shifts of 0-7 are actually possible.
+//
+// Note also that SHLQBI/SHLQBII are used in conjunction with SHLQBY/SHLQBYI
+// to shift i64 and i128: SHLQBI handles the residual bit count left over
+// after shifting by bytes with SHLQBY.
+
+class SHLQBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b11011011100, OOL, IOL, "shlqbi\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class SHLQBIVecInst<ValueType vectype>:
+ SHLQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [(set (vectype VECREG:$rT),
+ (SPUshlquad_l_bits (vectype VECREG:$rA), R32C:$rB))]>;
+
+class SHLQBIRegInst<RegisterClass rclass>:
+ SHLQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
+multiclass ShiftLeftQuadByBits
+{
+ def v16i8: SHLQBIVecInst<v16i8>;
+ def v8i16: SHLQBIVecInst<v8i16>;
+ def v4i32: SHLQBIVecInst<v4i32>;
+ def v4f32: SHLQBIVecInst<v4f32>;
+ def v2i64: SHLQBIVecInst<v2i64>;
+ def v2f64: SHLQBIVecInst<v2f64>;
+
+ def r128: SHLQBIRegInst<GPRC>;
+}
+
+defm SHLQBI : ShiftLeftQuadByBits;
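+
+// Worked sketch of the i64/i128 decomposition described above: a variable
+// 128-bit left shift by $n presumably splits into
+//   shlqby  $t,  $rA, ($n >> 3)    ; shift by $n / 8 bytes
+//   shlqbi  $rT, $t,  ($n & 7)     ; then by the residual 0-7 bits
+// which is why SHLQBI only needs the low three bits of its count.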
+
+// See note above on SHLQBI. In this case, the predicate actually does the
+// enforcement, whereas with SHLQBI, we have to "take it on faith."
+class SHLQBIIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b11011111100, OOL, IOL, "shlqbii\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class SHLQBIIVecInst<ValueType vectype>:
+ SHLQBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val),
+ [(set (vectype VECREG:$rT),
+ (SPUshlquad_l_bits (vectype VECREG:$rA), (i32 bitshift:$val)))]>;
+
+multiclass ShiftLeftQuadByBitsImm
+{
+ def v16i8 : SHLQBIIVecInst<v16i8>;
+ def v8i16 : SHLQBIIVecInst<v8i16>;
+ def v4i32 : SHLQBIIVecInst<v4i32>;
+ def v4f32 : SHLQBIIVecInst<v4f32>;
+ def v2i64 : SHLQBIIVecInst<v2i64>;
+ def v2f64 : SHLQBIIVecInst<v2f64>;
+}
+
+defm SHLQBII : ShiftLeftQuadByBitsImm;
+
+// SHLQBY, SHLQBYI vector forms: Shift the entire vector to the left by bytes,
+// not by bits. See notes above on SHLQBI.
+
+class SHLQBYInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b11111011100, OOL, IOL, "shlqby\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class SHLQBYVecInst<ValueType vectype>:
+ SHLQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [(set (vectype VECREG:$rT),
+ (SPUshlquad_l_bytes (vectype VECREG:$rA), R32C:$rB))]>;
+
+multiclass ShiftLeftQuadBytes
+{
+ def v16i8: SHLQBYVecInst<v16i8>;
+ def v8i16: SHLQBYVecInst<v8i16>;
+ def v4i32: SHLQBYVecInst<v4i32>;
+ def v4f32: SHLQBYVecInst<v4f32>;
+ def v2i64: SHLQBYVecInst<v2i64>;
+ def v2f64: SHLQBYVecInst<v2f64>;
+ def r128: SHLQBYInst<(outs GPRC:$rT), (ins GPRC:$rA, R32C:$rB),
+ [(set GPRC:$rT, (SPUshlquad_l_bytes GPRC:$rA, R32C:$rB))]>;
+}
+
+defm SHLQBY: ShiftLeftQuadBytes;
+
+class SHLQBYIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b11111111100, OOL, IOL, "shlqbyi\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class SHLQBYIVecInst<ValueType vectype>:
+ SHLQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val),
+ [(set (vectype VECREG:$rT),
+ (SPUshlquad_l_bytes (vectype VECREG:$rA), (i32 uimm7:$val)))]>;
+
+multiclass ShiftLeftQuadBytesImm
+{
+ def v16i8: SHLQBYIVecInst<v16i8>;
+ def v8i16: SHLQBYIVecInst<v8i16>;
+ def v4i32: SHLQBYIVecInst<v4i32>;
+ def v4f32: SHLQBYIVecInst<v4f32>;
+ def v2i64: SHLQBYIVecInst<v2i64>;
+ def v2f64: SHLQBYIVecInst<v2f64>;
+ def r128: SHLQBYIInst<(outs GPRC:$rT), (ins GPRC:$rA, u7imm_i32:$val),
+ [(set GPRC:$rT,
+ (SPUshlquad_l_bytes GPRC:$rA, (i32 uimm7:$val)))]>;
+}
+
+defm SHLQBYI : ShiftLeftQuadBytesImm;
+
+class SHLQBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b00111001111, OOL, IOL, "shlqbybi\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class SHLQBYBIVecInst<ValueType vectype>:
+ SHLQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
+class SHLQBYBIRegInst<RegisterClass rclass>:
+ SHLQBYBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
+multiclass ShiftLeftQuadBytesBitCount
+{
+ def v16i8: SHLQBYBIVecInst<v16i8>;
+ def v8i16: SHLQBYBIVecInst<v8i16>;
+ def v4i32: SHLQBYBIVecInst<v4i32>;
+ def v4f32: SHLQBYBIVecInst<v4f32>;
+ def v2i64: SHLQBYBIVecInst<v2i64>;
+ def v2f64: SHLQBYBIVecInst<v2f64>;
+
+ def r128: SHLQBYBIRegInst<GPRC>;
+}
+
+defm SHLQBYBI : ShiftLeftQuadBytesBitCount;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate halfword:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+class ROTHInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b00111010000, OOL, IOL, "roth\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class ROTHVecInst<ValueType vectype>:
+ ROTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT),
+ (SPUvec_rotl VECREG:$rA, VECREG:$rB))]>;
+
+class ROTHRegInst<RegisterClass rclass>:
+ ROTHInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+ [(set rclass:$rT, (rotl rclass:$rA, rclass:$rB))]>;
+
+multiclass RotateLeftHalfword
+{
+ def v8i16: ROTHVecInst<v8i16>;
+ def r16: ROTHRegInst<R16C>;
+}
+
+defm ROTH: RotateLeftHalfword;
+
+def ROTHr16_r32: ROTHInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB),
+ [(set R16C:$rT, (rotl R16C:$rA, R32C:$rB))]>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate halfword, immediate:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+class ROTHIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b00111110000, OOL, IOL, "rothi\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class ROTHIVecInst<ValueType vectype>:
+ ROTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val),
+ [(set (vectype VECREG:$rT),
+ (SPUvec_rotl VECREG:$rA, (i16 uimm7:$val)))]>;
+
+multiclass RotateLeftHalfwordImm
+{
+ def v8i16: ROTHIVecInst<v8i16>;
+ def r16: ROTHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm:$val),
+ [(set R16C:$rT, (rotl R16C:$rA, (i16 uimm7:$val)))]>;
+ def r16_r32: ROTHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm_i32:$val),
+ [(set R16C:$rT, (rotl R16C:$rA, (i32 uimm7:$val)))]>;
+}
+
+defm ROTHI: RotateLeftHalfwordImm;
+
+def : Pat<(SPUvec_rotl VECREG:$rA, (i32 uimm7:$val)),
+ (ROTHIv8i16 VECREG:$rA, imm:$val)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate word:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b00011010000, OOL, IOL, "rot\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class ROTVecInst<ValueType vectype>:
+ ROTInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [(set (vectype VECREG:$rT),
+ (SPUvec_rotl (vectype VECREG:$rA), R32C:$rB))]>;
+
+class ROTRegInst<RegisterClass rclass>:
+ ROTInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+ [(set rclass:$rT,
+ (rotl rclass:$rA, R32C:$rB))]>;
+
+multiclass RotateLeftWord
+{
+ def v4i32: ROTVecInst<v4i32>;
+ def r32: ROTRegInst<R32C>;
+}
+
+defm ROT: RotateLeftWord;
+
+// The rotate amount occupies the same bits whether we've got an 8-bit, 16-bit
+// or 32-bit register.
+def ROTr32_r16_anyext:
+ ROTInst<(outs R32C:$rT), (ins R32C:$rA, R16C:$rB),
+ [(set R32C:$rT, (rotl R32C:$rA, (i32 (anyext R16C:$rB))))]>;
+
+def : Pat<(rotl R32C:$rA, (i32 (zext R16C:$rB))),
+ (ROTr32_r16_anyext R32C:$rA, R16C:$rB)>;
+
+def : Pat<(rotl R32C:$rA, (i32 (sext R16C:$rB))),
+ (ROTr32_r16_anyext R32C:$rA, R16C:$rB)>;
+
+def ROTr32_r8_anyext:
+ ROTInst<(outs R32C:$rT), (ins R32C:$rA, R8C:$rB),
+ [(set R32C:$rT, (rotl R32C:$rA, (i32 (anyext R8C:$rB))))]>;
+
+def : Pat<(rotl R32C:$rA, (i32 (zext R8C:$rB))),
+ (ROTr32_r8_anyext R32C:$rA, R8C:$rB)>;
+
+def : Pat<(rotl R32C:$rA, (i32 (sext R8C:$rB))),
+ (ROTr32_r8_anyext R32C:$rA, R8C:$rB)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate word, immediate
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b00011110000, OOL, IOL, "roti\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class ROTIVecInst<ValueType vectype, Operand optype, ValueType inttype, PatLeaf pred>:
+ ROTIInst<(outs VECREG:$rT), (ins VECREG:$rA, optype:$val),
+ [(set (vectype VECREG:$rT),
+ (SPUvec_rotl (vectype VECREG:$rA), (inttype pred:$val)))]>;
+
+class ROTIRegInst<RegisterClass rclass, Operand optype, ValueType inttype, PatLeaf pred>:
+ ROTIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val),
+ [(set rclass:$rT, (rotl rclass:$rA, (inttype pred:$val)))]>;
+
+multiclass RotateLeftWordImm
+{
+ def v4i32: ROTIVecInst<v4i32, u7imm_i32, i32, uimm7>;
+ def v4i32_i16: ROTIVecInst<v4i32, u7imm, i16, uimm7>;
+ def v4i32_i8: ROTIVecInst<v4i32, u7imm_i8, i8, uimm7>;
+
+ def r32: ROTIRegInst<R32C, u7imm_i32, i32, uimm7>;
+ def r32_i16: ROTIRegInst<R32C, u7imm, i16, uimm7>;
+ def r32_i8: ROTIRegInst<R32C, u7imm_i8, i8, uimm7>;
+}
+
+defm ROTI : RotateLeftWordImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad by byte (count)
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQBYInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b00111011100, OOL, IOL, "rotqby\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class ROTQBYVecInst<ValueType vectype>:
+ ROTQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [(set (vectype VECREG:$rT),
+ (SPUrotbytes_left (vectype VECREG:$rA), R32C:$rB))]>;
+
+multiclass RotateQuadLeftByBytes
+{
+ def v16i8: ROTQBYVecInst<v16i8>;
+ def v8i16: ROTQBYVecInst<v8i16>;
+ def v4i32: ROTQBYVecInst<v4i32>;
+ def v4f32: ROTQBYVecInst<v4f32>;
+ def v2i64: ROTQBYVecInst<v2i64>;
+ def v2f64: ROTQBYVecInst<v2f64>;
+}
+
+defm ROTQBY: RotateQuadLeftByBytes;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad by byte (count), immediate
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQBYIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b00111111100, OOL, IOL, "rotqbyi\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class ROTQBYIVecInst<ValueType vectype>:
+ ROTQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val),
+ [(set (vectype VECREG:$rT),
+ (SPUrotbytes_left (vectype VECREG:$rA), (i16 uimm7:$val)))]>;
+
+multiclass RotateQuadByBytesImm
+{
+ def v16i8: ROTQBYIVecInst<v16i8>;
+ def v8i16: ROTQBYIVecInst<v8i16>;
+ def v4i32: ROTQBYIVecInst<v4i32>;
+ def v4f32: ROTQBYIVecInst<v4f32>;
+ def v2i64: ROTQBYIVecInst<v2i64>;
+  def v2f64: ROTQBYIVecInst<v2f64>;
+}
+
+defm ROTQBYI: RotateQuadByBytesImm;
+
+// See ROTQBY note above.
+class ROTQBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b00110011100, OOL, IOL,
+ "rotqbybi\t$rT, $rA, $shift",
+ RotateShift, pattern>;
+
+class ROTQBYBIVecInst<ValueType vectype, RegisterClass rclass>:
+ ROTQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, rclass:$shift),
+ [(set (vectype VECREG:$rT),
+ (SPUrotbytes_left_bits (vectype VECREG:$rA), rclass:$shift))]>;
+
+multiclass RotateQuadByBytesByBitshift {
+ def v16i8_r32: ROTQBYBIVecInst<v16i8, R32C>;
+ def v8i16_r32: ROTQBYBIVecInst<v8i16, R32C>;
+ def v4i32_r32: ROTQBYBIVecInst<v4i32, R32C>;
+ def v2i64_r32: ROTQBYBIVecInst<v2i64, R32C>;
+}
+
+defm ROTQBYBI : RotateQuadByBytesByBitshift;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// See ROTQBY note above.
+//
+// Assume that the user of this instruction knows to shift the rotate count
+// into bit 29
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b00011011100, OOL, IOL, "rotqbi\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class ROTQBIVecInst<ValueType vectype>:
+ ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [/* no pattern yet */]>;
+
+class ROTQBIRegInst<RegisterClass rclass>:
+ ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+ [/* no pattern yet */]>;
+
+multiclass RotateQuadByBitCount
+{
+ def v16i8: ROTQBIVecInst<v16i8>;
+ def v8i16: ROTQBIVecInst<v8i16>;
+ def v4i32: ROTQBIVecInst<v4i32>;
+ def v2i64: ROTQBIVecInst<v2i64>;
+
+ def r128: ROTQBIRegInst<GPRC>;
+ def r64: ROTQBIRegInst<R64C>;
+}
+
+defm ROTQBI: RotateQuadByBitCount;
+
+class ROTQBIIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b00011111100, OOL, IOL, "rotqbii\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class ROTQBIIVecInst<ValueType vectype, Operand optype, ValueType inttype,
+ PatLeaf pred>:
+ ROTQBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, optype:$val),
+ [/* no pattern yet */]>;
+
+class ROTQBIIRegInst<RegisterClass rclass, Operand optype, ValueType inttype,
+ PatLeaf pred>:
+ ROTQBIIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val),
+ [/* no pattern yet */]>;
+
+multiclass RotateQuadByBitCountImm
+{
+ def v16i8: ROTQBIIVecInst<v16i8, u7imm_i32, i32, uimm7>;
+ def v8i16: ROTQBIIVecInst<v8i16, u7imm_i32, i32, uimm7>;
+ def v4i32: ROTQBIIVecInst<v4i32, u7imm_i32, i32, uimm7>;
+ def v2i64: ROTQBIIVecInst<v2i64, u7imm_i32, i32, uimm7>;
+
+ def r128: ROTQBIIRegInst<GPRC, u7imm_i32, i32, uimm7>;
+ def r64: ROTQBIIRegInst<R64C, u7imm_i32, i32, uimm7>;
+}
+
+defm ROTQBII : RotateQuadByBitCountImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// ROTHM v8i16 form:
+// NOTE(1): No vector rotate is generated by the C/C++ frontend (today),
+// so this only matches a synthetically generated/lowered code
+// fragment.
+// NOTE(2): $rB must be negated before the right rotate!
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTHMInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10111010000, OOL, IOL, "rothm\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+def ROTHMv8i16:
+ ROTHMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R32C:$rB),
+ (ROTHMv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R16C:$rB),
+ (ROTHMv8i16 VECREG:$rA,
+ (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R8C:$rB),
+ (ROTHMv8i16 VECREG:$rA,
+ (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB) ), 0))>;
+
+// ROTHM r16 form: Rotate a 16-bit quantity to the right, zero-filling at the
+// left.
+// Note: This instruction doesn't match a pattern because $rB must be negated
+// for the instruction to work. Thus, the patterns below the instruction!
+
+def ROTHMr16:
+ ROTHMInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB),
+ [/* see patterns below - $rB must be negated! */]>;
+
+def : Pat<(srl R16C:$rA, R32C:$rB),
+ (ROTHMr16 R16C:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(srl R16C:$rA, R16C:$rB),
+ (ROTHMr16 R16C:$rA,
+ (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(srl R16C:$rA, R8C:$rB),
+ (ROTHMr16 R16C:$rA,
+ (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB) ), 0))>;
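+
+// In the patterns above, (SFIr32 $rB, 0) supplies the negated shift count
+// (0 - $rB, assuming SFI is "subtract from immediate"), turning the logical
+// shift right into the right-rotate-and-mask the hardware implements.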
+
+// ROTHMI v8i16 form: See the comment for ROTHM v8i16. The difference here is
+// that the immediate can be complemented, so that the user doesn't have to
+// worry about it.
+
+class ROTHMIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b10111110000, OOL, IOL, "rothmi\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+def ROTHMIv8i16:
+ ROTHMIInst<(outs VECREG:$rT), (ins VECREG:$rA, rothNeg7imm:$val),
+ [/* no pattern */]>;
+
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i32 imm:$val)),
+ (ROTHMIv8i16 VECREG:$rA, imm:$val)>;
+
+def: Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i16 imm:$val)),
+ (ROTHMIv8i16 VECREG:$rA, imm:$val)>;
+
+def: Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i8 imm:$val)),
+ (ROTHMIv8i16 VECREG:$rA, imm:$val)>;
+
+def ROTHMIr16:
+ ROTHMIInst<(outs R16C:$rT), (ins R16C:$rA, rothNeg7imm:$val),
+ [/* no pattern */]>;
+
+def: Pat<(srl R16C:$rA, (i32 uimm7:$val)),
+ (ROTHMIr16 R16C:$rA, uimm7:$val)>;
+
+def: Pat<(srl R16C:$rA, (i16 uimm7:$val)),
+ (ROTHMIr16 R16C:$rA, uimm7:$val)>;
+
+def: Pat<(srl R16C:$rA, (i8 uimm7:$val)),
+ (ROTHMIr16 R16C:$rA, uimm7:$val)>;
+
+// ROTM v4i32 form: See the ROTHM v8i16 comments.
+class ROTMInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10011010000, OOL, IOL, "rotm\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+def ROTMv4i32:
+ ROTMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, R32C:$rB),
+ (ROTMv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, R16C:$rB),
+ (ROTMv4i32 VECREG:$rA,
+ (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, R8C:$rB),
+ (ROTMv4i32 VECREG:$rA,
+ (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+def ROTMr32:
+ ROTMInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(srl R32C:$rA, R32C:$rB),
+ (ROTMr32 R32C:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(srl R32C:$rA, R16C:$rB),
+ (ROTMr32 R32C:$rA,
+ (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(srl R32C:$rA, R8C:$rB),
+ (ROTMr32 R32C:$rA,
+ (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+// ROTMI v4i32 form: See the comment for ROTHM v8i16.
+def ROTMIv4i32:
+ RI7Form<0b10011110000, (outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
+ "rotmi\t$rT, $rA, $val", RotateShift,
+ [(set (v4i32 VECREG:$rT),
+ (SPUvec_srl VECREG:$rA, (i32 uimm7:$val)))]>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, (i16 uimm7:$val)),
+ (ROTMIv4i32 VECREG:$rA, uimm7:$val)>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, (i8 uimm7:$val)),
+ (ROTMIv4i32 VECREG:$rA, uimm7:$val)>;
+
+// ROTMI r32 form: knows how to complement the immediate value.
+def ROTMIr32:
+ RI7Form<0b10011110000, (outs R32C:$rT), (ins R32C:$rA, rotNeg7imm:$val),
+ "rotmi\t$rT, $rA, $val", RotateShift,
+ [(set R32C:$rT, (srl R32C:$rA, (i32 uimm7:$val)))]>;
+
+def : Pat<(srl R32C:$rA, (i16 imm:$val)),
+ (ROTMIr32 R32C:$rA, uimm7:$val)>;
+
+def : Pat<(srl R32C:$rA, (i8 imm:$val)),
+ (ROTMIr32 R32C:$rA, uimm7:$val)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// ROTQMBY: The vector form exists merely so that type checking succeeds when
+// it is used in an instruction pattern. This instruction assumes that the
+// user knew to negate $rB.
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQMBYInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10111011100, OOL, IOL, "rotqmby\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class ROTQMBYVecInst<ValueType vectype>:
+ ROTQMBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [/* no pattern, $rB must be negated */]>;
+
+class ROTQMBYRegInst<RegisterClass rclass>:
+ ROTQMBYInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
+multiclass RotateQuadBytes
+{
+ def v16i8: ROTQMBYVecInst<v16i8>;
+ def v8i16: ROTQMBYVecInst<v8i16>;
+ def v4i32: ROTQMBYVecInst<v4i32>;
+ def v2i64: ROTQMBYVecInst<v2i64>;
+
+ def r128: ROTQMBYRegInst<GPRC>;
+ def r64: ROTQMBYRegInst<R64C>;
+}
+
+defm ROTQMBY : RotateQuadBytes;
+
+class ROTQMBYIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class ROTQMBYIVecInst<ValueType vectype>:
+ ROTQMBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
+ [/* no pattern */]>;
+
+class ROTQMBYIRegInst<RegisterClass rclass, Operand optype, ValueType inttype,
+ PatLeaf pred>:
+ ROTQMBYIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val),
+ [/* no pattern */]>;
+
+// 128-bit zero extension form:
+class ROTQMBYIZExtInst<RegisterClass rclass, Operand optype, PatLeaf pred>:
+ ROTQMBYIInst<(outs GPRC:$rT), (ins rclass:$rA, optype:$val),
+ [/* no pattern */]>;
+
+multiclass RotateQuadBytesImm
+{
+ def v16i8: ROTQMBYIVecInst<v16i8>;
+ def v8i16: ROTQMBYIVecInst<v8i16>;
+ def v4i32: ROTQMBYIVecInst<v4i32>;
+ def v2i64: ROTQMBYIVecInst<v2i64>;
+
+ def r128: ROTQMBYIRegInst<GPRC, rotNeg7imm, i32, uimm7>;
+ def r64: ROTQMBYIRegInst<R64C, rotNeg7imm, i32, uimm7>;
+
+ def r128_zext_r8: ROTQMBYIZExtInst<R8C, rotNeg7imm, uimm7>;
+ def r128_zext_r16: ROTQMBYIZExtInst<R16C, rotNeg7imm, uimm7>;
+ def r128_zext_r32: ROTQMBYIZExtInst<R32C, rotNeg7imm, uimm7>;
+ def r128_zext_r64: ROTQMBYIZExtInst<R64C, rotNeg7imm, uimm7>;
+}
+
+defm ROTQMBYI : RotateQuadBytesImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad right and mask by bytes from bit count
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQMBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10110011100, OOL, IOL, "rotqmbybi\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class ROTQMBYBIVecInst<ValueType vectype>:
+ ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+                 [/* no pattern */]>;
+
+multiclass RotateMaskQuadByBitCount
+{
+ def v16i8: ROTQMBYBIVecInst<v16i8>;
+ def v8i16: ROTQMBYBIVecInst<v8i16>;
+ def v4i32: ROTQMBYBIVecInst<v4i32>;
+ def v2i64: ROTQMBYBIVecInst<v2i64>;
+}
+
+defm ROTQMBYBI: RotateMaskQuadByBitCount;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad and mask by bits
+// Note that the rotate amount has to be negated
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQMBIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b10011011100, OOL, IOL, "rotqmbi\t$rT, $rA, $rB",
+ RotateShift, pattern>;
+
+class ROTQMBIVecInst<ValueType vectype>:
+ ROTQMBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
+class ROTQMBIRegInst<RegisterClass rclass>:
+ ROTQMBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+ [/* no pattern */]>;
+
+multiclass RotateMaskQuadByBits
+{
+ def v16i8: ROTQMBIVecInst<v16i8>;
+ def v8i16: ROTQMBIVecInst<v8i16>;
+ def v4i32: ROTQMBIVecInst<v4i32>;
+ def v2i64: ROTQMBIVecInst<v2i64>;
+
+ def r128: ROTQMBIRegInst<GPRC>;
+ def r64: ROTQMBIRegInst<R64C>;
+}
+
+defm ROTQMBI: RotateMaskQuadByBits;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad and mask by bits, immediate
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQMBIIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RI7Form<0b10011111100, OOL, IOL, "rotqmbii\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class ROTQMBIIVecInst<ValueType vectype>:
+ ROTQMBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
+ [/* no pattern */]>;
+
+class ROTQMBIIRegInst<RegisterClass rclass>:
+ ROTQMBIIInst<(outs rclass:$rT), (ins rclass:$rA, rotNeg7imm:$val),
+ [/* no pattern */]>;
+
+multiclass RotateMaskQuadByBitsImm
+{
+ def v16i8: ROTQMBIIVecInst<v16i8>;
+ def v8i16: ROTQMBIIVecInst<v8i16>;
+ def v4i32: ROTQMBIIVecInst<v4i32>;
+ def v2i64: ROTQMBIIVecInst<v2i64>;
+
+ def r128: ROTQMBIIRegInst<GPRC>;
+ def r64: ROTQMBIIRegInst<R64C>;
+}
+
+defm ROTQMBII: RotateMaskQuadByBitsImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def ROTMAHv8i16:
+ RRForm<0b01111010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ "rotmah\t$rT, $rA, $rB", RotateShift,
+ [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R32C:$rB),
+ (ROTMAHv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R16C:$rB),
+ (ROTMAHv8i16 VECREG:$rA,
+ (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R8C:$rB),
+ (ROTMAHv8i16 VECREG:$rA,
+ (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+def ROTMAHr16:
+ RRForm<0b01111010000, (outs R16C:$rT), (ins R16C:$rA, R32C:$rB),
+ "rotmah\t$rT, $rA, $rB", RotateShift,
+ [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(sra R16C:$rA, R32C:$rB),
+ (ROTMAHr16 R16C:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(sra R16C:$rA, R16C:$rB),
+ (ROTMAHr16 R16C:$rA,
+ (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(sra R16C:$rA, R8C:$rB),
+ (ROTMAHr16 R16C:$rA,
+ (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+def ROTMAHIv8i16:
+ RRForm<0b01111110000, (outs VECREG:$rT), (ins VECREG:$rA, rothNeg7imm:$val),
+ "rotmahi\t$rT, $rA, $val", RotateShift,
+ [(set (v8i16 VECREG:$rT),
+ (SPUvec_sra (v8i16 VECREG:$rA), (i32 uimm7:$val)))]>;
+
+def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (i16 uimm7:$val)),
+ (ROTMAHIv8i16 (v8i16 VECREG:$rA), (i32 uimm7:$val))>;
+
+def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (i8 uimm7:$val)),
+ (ROTMAHIv8i16 (v8i16 VECREG:$rA), (i32 uimm7:$val))>;
+
+def ROTMAHIr16:
+ RRForm<0b01111110000, (outs R16C:$rT), (ins R16C:$rA, rothNeg7imm_i16:$val),
+ "rotmahi\t$rT, $rA, $val", RotateShift,
+ [(set R16C:$rT, (sra R16C:$rA, (i16 uimm7:$val)))]>;
+
+def : Pat<(sra R16C:$rA, (i32 imm:$val)),
+ (ROTMAHIr16 R16C:$rA, uimm7:$val)>;
+
+def : Pat<(sra R16C:$rA, (i8 imm:$val)),
+ (ROTMAHIr16 R16C:$rA, uimm7:$val)>;
+
+def ROTMAv4i32:
+ RRForm<0b01011010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+ "rotma\t$rT, $rA, $rB", RotateShift,
+ [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R32C:$rB),
+ (ROTMAv4i32 (v4i32 VECREG:$rA), (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R16C:$rB),
+ (ROTMAv4i32 (v4i32 VECREG:$rA),
+ (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R8C:$rB),
+ (ROTMAv4i32 (v4i32 VECREG:$rA),
+ (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+def ROTMAr32:
+ RRForm<0b01011010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ "rotma\t$rT, $rA, $rB", RotateShift,
+ [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(sra R32C:$rA, R32C:$rB),
+ (ROTMAr32 R32C:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(sra R32C:$rA, R16C:$rB),
+ (ROTMAr32 R32C:$rA,
+ (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(sra R32C:$rA, R8C:$rB),
+ (ROTMAr32 R32C:$rA,
+ (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+class ROTMAIInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b01011110000, OOL, IOL,
+ "rotmai\t$rT, $rA, $val",
+ RotateShift, pattern>;
+
+class ROTMAIVecInst<ValueType vectype, Operand intop, ValueType inttype>:
+ ROTMAIInst<(outs VECREG:$rT), (ins VECREG:$rA, intop:$val),
+ [(set (vectype VECREG:$rT),
+ (SPUvec_sra VECREG:$rA, (inttype uimm7:$val)))]>;
+
+class ROTMAIRegInst<RegisterClass rclass, Operand intop, ValueType inttype>:
+ ROTMAIInst<(outs rclass:$rT), (ins rclass:$rA, intop:$val),
+ [(set rclass:$rT, (sra rclass:$rA, (inttype uimm7:$val)))]>;
+
+multiclass RotateMaskAlgebraicImm {
+ def v2i64_i32 : ROTMAIVecInst<v2i64, rotNeg7imm, i32>;
+ def v4i32_i32 : ROTMAIVecInst<v4i32, rotNeg7imm, i32>;
+ def r64_i32 : ROTMAIRegInst<R64C, rotNeg7imm, i32>;
+ def r32_i32 : ROTMAIRegInst<R32C, rotNeg7imm, i32>;
+}
+
+defm ROTMAI : RotateMaskAlgebraicImm;
+
+//===----------------------------------------------------------------------===//
+// Branch and conditionals:
+//===----------------------------------------------------------------------===//
+
+let isTerminator = 1, isBarrier = 1 in {
+ // Halt If Equal (r32 preferred slot only, no vector form)
+ def HEQr32:
+ RRForm_3<0b00011011110, (outs), (ins R32C:$rA, R32C:$rB),
+ "heq\t$rA, $rB", BranchResolv,
+ [/* no pattern to match */]>;
+
+ def HEQIr32 :
+ RI10Form_2<0b11111110, (outs), (ins R32C:$rA, s10imm:$val),
+ "heqi\t$rA, $val", BranchResolv,
+ [/* no pattern to match */]>;
+
+ // HGT/HGTI: These instructions use signed arithmetic for the comparison,
+ // contrasting with HLGT/HLGTI, which use unsigned comparison:
+ def HGTr32:
+ RRForm_3<0b00011010010, (outs), (ins R32C:$rA, R32C:$rB),
+ "hgt\t$rA, $rB", BranchResolv,
+ [/* no pattern to match */]>;
+
+ def HGTIr32:
+ RI10Form_2<0b11110010, (outs), (ins R32C:$rA, s10imm:$val),
+ "hgti\t$rA, $val", BranchResolv,
+ [/* no pattern to match */]>;
+
+ def HLGTr32:
+ RRForm_3<0b00011011010, (outs), (ins R32C:$rA, R32C:$rB),
+ "hlgt\t$rA, $rB", BranchResolv,
+ [/* no pattern to match */]>;
+
+ def HLGTIr32:
+ RI10Form_2<0b11111010, (outs), (ins R32C:$rA, s10imm:$val),
+ "hlgti\t$rA, $val", BranchResolv,
+ [/* no pattern to match */]>;
+}
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Comparison operators for i8, i16 and i32:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class CEQBInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00001011110, OOL, IOL, "ceqb\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpEqualByte
+{
+ def v16i8 :
+ CEQBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      [(set (v16i8 VECREG:$rT), (seteq (v16i8 VECREG:$rA),
+                                       (v16i8 VECREG:$rB)))]>;
+
+ def r8 :
+ CEQBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB),
+ [(set R8C:$rT, (seteq R8C:$rA, R8C:$rB))]>;
+}
+
+class CEQBIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b01111110, OOL, IOL, "ceqbi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpEqualByteImm
+{
+ def v16i8 :
+ CEQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val),
+ [(set (v16i8 VECREG:$rT), (seteq (v16i8 VECREG:$rA),
+ v16i8SExt8Imm:$val))]>;
+ def r8:
+ CEQBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val),
+ [(set R8C:$rT, (seteq R8C:$rA, immSExt8:$val))]>;
+}
+
+class CEQHInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00010011110, OOL, IOL, "ceqh\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpEqualHalfword
+{
+ def v8i16 : CEQHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (v8i16 VECREG:$rT), (seteq (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+ def r16 : CEQHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+ [(set R16C:$rT, (seteq R16C:$rA, R16C:$rB))]>;
+}
+
+class CEQHIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b10111110, OOL, IOL, "ceqhi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpEqualHalfwordImm
+{
+ def v8i16 : CEQHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v8i16 VECREG:$rT),
+ (seteq (v8i16 VECREG:$rA),
+ (v8i16 v8i16SExt10Imm:$val)))]>;
+ def r16 : CEQHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+ [(set R16C:$rT, (seteq R16C:$rA, i16ImmSExt10:$val))]>;
+}
+
+class CEQInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00000011110, OOL, IOL, "ceq\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpEqualWord
+{
+ def v4i32 : CEQInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (v4i32 VECREG:$rT),
+ (seteq (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+ def r32 : CEQInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ [(set R32C:$rT, (seteq R32C:$rA, R32C:$rB))]>;
+}
+
+class CEQIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b00111110, OOL, IOL, "ceqi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpEqualWordImm
+{
+ def v4i32 : CEQIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v4i32 VECREG:$rT),
+ (seteq (v4i32 VECREG:$rA),
+ (v4i32 v4i32SExt16Imm:$val)))]>;
+
+ def r32: CEQIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT, (seteq R32C:$rA, i32ImmSExt10:$val))]>;
+}
+
+class CGTBInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00001010010, OOL, IOL, "cgtb\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpGtrByte
+{
+ def v16i8 :
+ CGTBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      [(set (v16i8 VECREG:$rT), (setgt (v16i8 VECREG:$rA),
+                                       (v16i8 VECREG:$rB)))]>;
+
+ def r8 :
+ CGTBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB),
+ [(set R8C:$rT, (setgt R8C:$rA, R8C:$rB))]>;
+}
+
+class CGTBIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b01110010, OOL, IOL, "cgtbi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpGtrByteImm
+{
+ def v16i8 :
+ CGTBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val),
+ [(set (v16i8 VECREG:$rT), (setgt (v16i8 VECREG:$rA),
+ v16i8SExt8Imm:$val))]>;
+ def r8:
+ CGTBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val),
+ [(set R8C:$rT, (setgt R8C:$rA, immSExt8:$val))]>;
+}
+
+class CGTHInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00010010010, OOL, IOL, "cgth\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpGtrHalfword
+{
+ def v8i16 : CGTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (v8i16 VECREG:$rT), (setgt (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+ def r16 : CGTHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+ [(set R16C:$rT, (setgt R16C:$rA, R16C:$rB))]>;
+}
+
+class CGTHIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b10110010, OOL, IOL, "cgthi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpGtrHalfwordImm
+{
+ def v8i16 : CGTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v8i16 VECREG:$rT),
+ (setgt (v8i16 VECREG:$rA),
+ (v8i16 v8i16SExt10Imm:$val)))]>;
+ def r16 : CGTHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+ [(set R16C:$rT, (setgt R16C:$rA, i16ImmSExt10:$val))]>;
+}
+
+class CGTInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00000010010, OOL, IOL, "cgt\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpGtrWord
+{
+ def v4i32 : CGTInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (v4i32 VECREG:$rT),
+ (setgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+ def r32 : CGTInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ [(set R32C:$rT, (setgt R32C:$rA, R32C:$rB))]>;
+}
+
+class CGTIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b00110010, OOL, IOL, "cgti\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpGtrWordImm
+{
+ def v4i32 : CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v4i32 VECREG:$rT),
+ (setgt (v4i32 VECREG:$rA),
+ (v4i32 v4i32SExt16Imm:$val)))]>;
+
+ def r32: CGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT, (setgt R32C:$rA, i32ImmSExt10:$val))]>;
+
+ // CGTIv4f32, CGTIf32: These are used in the f32 fdiv instruction sequence:
+ def v4f32: CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v4i32 VECREG:$rT),
+ (setgt (v4i32 (bitconvert (v4f32 VECREG:$rA))),
+ (v4i32 v4i32SExt16Imm:$val)))]>;
+
+ def f32: CGTIInst<(outs R32C:$rT), (ins R32FP:$rA, s10imm_i32:$val),
+ [/* no pattern */]>;
+}
+
+class CLGTBInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00001011010, OOL, IOL, "clgtb\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpLGtrByte
+{
+ def v16i8 :
+ CLGTBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      [(set (v16i8 VECREG:$rT), (setugt (v16i8 VECREG:$rA),
+                                        (v16i8 VECREG:$rB)))]>;
+
+ def r8 :
+ CLGTBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB),
+ [(set R8C:$rT, (setugt R8C:$rA, R8C:$rB))]>;
+}
+
+class CLGTBIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b01111010, OOL, IOL, "clgtbi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpLGtrByteImm
+{
+ def v16i8 :
+ CLGTBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val),
+ [(set (v16i8 VECREG:$rT), (setugt (v16i8 VECREG:$rA),
+ v16i8SExt8Imm:$val))]>;
+ def r8:
+ CLGTBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val),
+ [(set R8C:$rT, (setugt R8C:$rA, immSExt8:$val))]>;
+}
+
+class CLGTHInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00010011010, OOL, IOL, "clgth\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpLGtrHalfword
+{
+ def v8i16 : CLGTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (v8i16 VECREG:$rT), (setugt (v8i16 VECREG:$rA),
+ (v8i16 VECREG:$rB)))]>;
+
+ def r16 : CLGTHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+ [(set R16C:$rT, (setugt R16C:$rA, R16C:$rB))]>;
+}
+
+class CLGTHIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b10111010, OOL, IOL, "clgthi\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpLGtrHalfwordImm
+{
+ def v8i16 : CLGTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v8i16 VECREG:$rT),
+ (setugt (v8i16 VECREG:$rA),
+ (v8i16 v8i16SExt10Imm:$val)))]>;
+ def r16 : CLGTHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+ [(set R16C:$rT, (setugt R16C:$rA, i16ImmSExt10:$val))]>;
+}
+
+class CLGTInst<dag OOL, dag IOL, list<dag> pattern> :
+ RRForm<0b00000011010, OOL, IOL, "clgt\t$rT, $rA, $rB",
+ ByteOp, pattern>;
+
+multiclass CmpLGtrWord
+{
+ def v4i32 : CLGTInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (v4i32 VECREG:$rT),
+ (setugt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+ def r32 : CLGTInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+ [(set R32C:$rT, (setugt R32C:$rA, R32C:$rB))]>;
+}
+
+class CLGTIInst<dag OOL, dag IOL, list<dag> pattern> :
+ RI10Form<0b00111010, OOL, IOL, "clgti\t$rT, $rA, $val",
+ ByteOp, pattern>;
+
+multiclass CmpLGtrWordImm
+{
+ def v4i32 : CLGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+ [(set (v4i32 VECREG:$rT),
+ (setugt (v4i32 VECREG:$rA),
+ (v4i32 v4i32SExt16Imm:$val)))]>;
+
+ def r32: CLGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT, (setugt R32C:$rA, i32ImmSExt10:$val))]>;
+}
+
+defm CEQB : CmpEqualByte;
+defm CEQBI : CmpEqualByteImm;
+defm CEQH : CmpEqualHalfword;
+defm CEQHI : CmpEqualHalfwordImm;
+defm CEQ : CmpEqualWord;
+defm CEQI : CmpEqualWordImm;
+defm CGTB : CmpGtrByte;
+defm CGTBI : CmpGtrByteImm;
+defm CGTH : CmpGtrHalfword;
+defm CGTHI : CmpGtrHalfwordImm;
+defm CGT : CmpGtrWord;
+defm CGTI : CmpGtrWordImm;
+defm CLGTB : CmpLGtrByte;
+defm CLGTBI : CmpLGtrByteImm;
+defm CLGTH : CmpLGtrHalfword;
+defm CLGTHI : CmpLGtrHalfwordImm;
+defm CLGT : CmpLGtrWord;
+defm CLGTI : CmpLGtrWordImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// For SETCC primitives not supported above (setlt, setle, setge, etc.)
+// define a pattern to generate the right code, as a binary operator
+// (in a manner of speaking).
+//
+// Notes:
+// 1. This only matches the setcc set of conditionals. Special pattern
+// matching is used for select conditionals.
+//
+// 2. The "DAG" versions of these classes are almost exclusively used for
+// i64 comparisons. See the tblgen fundamentals documentation for what
+// ".ResultInstrs[0]" means; see TargetSelectionDAG.td and the Pattern
+// class for where ResultInstrs originates.
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class SETCCNegCondReg<PatFrag cond, RegisterClass rclass, ValueType inttype,
+ SPUInstr xorinst, SPUInstr cmpare>:
+ Pat<(cond rclass:$rA, rclass:$rB),
+ (xorinst (cmpare rclass:$rA, rclass:$rB), (inttype -1))>;
+
+class SETCCNegCondImm<PatFrag cond, RegisterClass rclass, ValueType inttype,
+ PatLeaf immpred, SPUInstr xorinst, SPUInstr cmpare>:
+ Pat<(cond rclass:$rA, (inttype immpred:$imm)),
+ (xorinst (cmpare rclass:$rA, (inttype immpred:$imm)), (inttype -1))>;
+
+def : SETCCNegCondReg<setne, R8C, i8, XORBIr8, CEQBr8>;
+def : SETCCNegCondImm<setne, R8C, i8, immSExt8, XORBIr8, CEQBIr8>;
+
+def : SETCCNegCondReg<setne, R16C, i16, XORHIr16, CEQHr16>;
+def : SETCCNegCondImm<setne, R16C, i16, i16ImmSExt10, XORHIr16, CEQHIr16>;
+
+def : SETCCNegCondReg<setne, R32C, i32, XORIr32, CEQr32>;
+def : SETCCNegCondImm<setne, R32C, i32, i32ImmSExt10, XORIr32, CEQIr32>;
+
+class SETCCBinOpReg<PatFrag cond, RegisterClass rclass,
+ SPUInstr binop, SPUInstr cmpOp1, SPUInstr cmpOp2>:
+ Pat<(cond rclass:$rA, rclass:$rB),
+ (binop (cmpOp1 rclass:$rA, rclass:$rB),
+ (cmpOp2 rclass:$rA, rclass:$rB))>;
+
+class SETCCBinOpImm<PatFrag cond, RegisterClass rclass, PatLeaf immpred,
+ ValueType immtype,
+ SPUInstr binop, SPUInstr cmpOp1, SPUInstr cmpOp2>:
+ Pat<(cond rclass:$rA, (immtype immpred:$imm)),
+ (binop (cmpOp1 rclass:$rA, (immtype immpred:$imm)),
+ (cmpOp2 rclass:$rA, (immtype immpred:$imm)))>;
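+
+// The instantiations below expand to standard boolean identities; for i32,
+// roughly:
+//   setge a, b  ->  (a > b) | (a == b)        (OR of CGT and CEQ)
+//   setlt a, b  ->  ~((a > b) | (a == b))     (NOR of the same pair)
+//   setle a, b  ->  (a > b) ^ ~0              (XOR with all ones)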
+
+def : SETCCBinOpReg<setge, R8C, ORr8, CGTBr8, CEQBr8>;
+def : SETCCBinOpImm<setge, R8C, immSExt8, i8, ORr8, CGTBIr8, CEQBIr8>;
+def : SETCCBinOpReg<setlt, R8C, NORr8, CGTBr8, CEQBr8>;
+def : SETCCBinOpImm<setlt, R8C, immSExt8, i8, NORr8, CGTBIr8, CEQBIr8>;
+def : Pat<(setle R8C:$rA, R8C:$rB),
+ (XORBIr8 (CGTBr8 R8C:$rA, R8C:$rB), 0xff)>;
+def : Pat<(setle R8C:$rA, immU8:$imm),
+ (XORBIr8 (CGTBIr8 R8C:$rA, immU8:$imm), 0xff)>;
+
+def : SETCCBinOpReg<setge, R16C, ORr16, CGTHr16, CEQHr16>;
+def : SETCCBinOpImm<setge, R16C, i16ImmSExt10, i16,
+ ORr16, CGTHIr16, CEQHIr16>;
+def : SETCCBinOpReg<setlt, R16C, NORr16, CGTHr16, CEQHr16>;
+def : SETCCBinOpImm<setlt, R16C, i16ImmSExt10, i16, NORr16, CGTHIr16, CEQHIr16>;
+def : Pat<(setle R16C:$rA, R16C:$rB),
+ (XORHIr16 (CGTHr16 R16C:$rA, R16C:$rB), 0xffff)>;
+def : Pat<(setle R16C:$rA, i16ImmSExt10:$imm),
+ (XORHIr16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$imm), 0xffff)>;
+
+def : SETCCBinOpReg<setge, R32C, ORr32, CGTr32, CEQr32>;
+def : SETCCBinOpImm<setge, R32C, i32ImmSExt10, i32,
+ ORr32, CGTIr32, CEQIr32>;
+def : SETCCBinOpReg<setlt, R32C, NORr32, CGTr32, CEQr32>;
+def : SETCCBinOpImm<setlt, R32C, i32ImmSExt10, i32, NORr32, CGTIr32, CEQIr32>;
+def : Pat<(setle R32C:$rA, R32C:$rB),
+ (XORIr32 (CGTr32 R32C:$rA, R32C:$rB), 0xffffffff)>;
+def : Pat<(setle R32C:$rA, i32ImmSExt10:$imm),
+ (XORIr32 (CGTIr32 R32C:$rA, i32ImmSExt10:$imm), 0xffffffff)>;
+
+def : SETCCBinOpReg<setuge, R8C, ORr8, CLGTBr8, CEQBr8>;
+def : SETCCBinOpImm<setuge, R8C, immSExt8, i8, ORr8, CLGTBIr8, CEQBIr8>;
+def : SETCCBinOpReg<setult, R8C, NORr8, CLGTBr8, CEQBr8>;
+def : SETCCBinOpImm<setult, R8C, immSExt8, i8, NORr8, CLGTBIr8, CEQBIr8>;
+def : Pat<(setule R8C:$rA, R8C:$rB),
+ (XORBIr8 (CLGTBr8 R8C:$rA, R8C:$rB), 0xff)>;
+def : Pat<(setule R8C:$rA, immU8:$imm),
+ (XORBIr8 (CLGTBIr8 R8C:$rA, immU8:$imm), 0xff)>;
+
+def : SETCCBinOpReg<setuge, R16C, ORr16, CLGTHr16, CEQHr16>;
+def : SETCCBinOpImm<setuge, R16C, i16ImmSExt10, i16,
+ ORr16, CLGTHIr16, CEQHIr16>;
+def : SETCCBinOpReg<setult, R16C, NORr16, CLGTHr16, CEQHr16>;
+def : SETCCBinOpImm<setult, R16C, i16ImmSExt10, i16, NORr16,
+ CLGTHIr16, CEQHIr16>;
+def : Pat<(setule R16C:$rA, R16C:$rB),
+ (XORHIr16 (CLGTHr16 R16C:$rA, R16C:$rB), 0xffff)>;
+def : Pat<(setule R16C:$rA, i16ImmSExt10:$imm),
+ (XORHIr16 (CLGTHIr16 R16C:$rA, i16ImmSExt10:$imm), 0xffff)>;
+
+def : SETCCBinOpReg<setuge, R32C, ORr32, CLGTr32, CEQr32>;
+def : SETCCBinOpImm<setuge, R32C, i32ImmSExt10, i32,
+ ORr32, CLGTIr32, CEQIr32>;
+def : SETCCBinOpReg<setult, R32C, NORr32, CLGTr32, CEQr32>;
+def : SETCCBinOpImm<setult, R32C, i32ImmSExt10, i32, NORr32, CLGTIr32, CEQIr32>;
+def : Pat<(setule R32C:$rA, R32C:$rB),
+ (XORIr32 (CLGTr32 R32C:$rA, R32C:$rB), 0xffffffff)>;
+def : Pat<(setule R32C:$rA, i32ImmSExt10:$imm),
+ (XORIr32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$imm), 0xffffffff)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// select conditional patterns:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class SELECTNegCondReg<PatFrag cond, RegisterClass rclass, ValueType inttype,
+ SPUInstr selinstr, SPUInstr cmpare>:
+ Pat<(select (inttype (cond rclass:$rA, rclass:$rB)),
+ rclass:$rTrue, rclass:$rFalse),
+ (selinstr rclass:$rTrue, rclass:$rFalse,
+ (cmpare rclass:$rA, rclass:$rB))>;
+
+class SELECTNegCondImm<PatFrag cond, RegisterClass rclass, ValueType inttype,
+ PatLeaf immpred, SPUInstr selinstr, SPUInstr cmpare>:
+ Pat<(select (inttype (cond rclass:$rA, immpred:$imm)),
+ rclass:$rTrue, rclass:$rFalse),
+ (selinstr rclass:$rTrue, rclass:$rFalse,
+ (cmpare rclass:$rA, immpred:$imm))>;
+
+def : SELECTNegCondReg<setne, R8C, i8, SELBr8, CEQBr8>;
+def : SELECTNegCondImm<setne, R8C, i8, immSExt8, SELBr8, CEQBIr8>;
+def : SELECTNegCondReg<setle, R8C, i8, SELBr8, CGTBr8>;
+def : SELECTNegCondImm<setle, R8C, i8, immSExt8, SELBr8, CGTBIr8>;
+def : SELECTNegCondReg<setule, R8C, i8, SELBr8, CLGTBr8>;
+def : SELECTNegCondImm<setule, R8C, i8, immU8, SELBr8, CLGTBIr8>;
+
+def : SELECTNegCondReg<setne, R16C, i16, SELBr16, CEQHr16>;
+def : SELECTNegCondImm<setne, R16C, i16, i16ImmSExt10, SELBr16, CEQHIr16>;
+def : SELECTNegCondReg<setle, R16C, i16, SELBr16, CGTHr16>;
+def : SELECTNegCondImm<setle, R16C, i16, i16ImmSExt10, SELBr16, CGTHIr16>;
+def : SELECTNegCondReg<setule, R16C, i16, SELBr16, CLGTHr16>;
+def : SELECTNegCondImm<setule, R16C, i16, i16ImmSExt10, SELBr16, CLGTHIr16>;
+
+def : SELECTNegCondReg<setne, R32C, i32, SELBr32, CEQr32>;
+def : SELECTNegCondImm<setne, R32C, i32, i32ImmSExt10, SELBr32, CEQIr32>;
+def : SELECTNegCondReg<setle, R32C, i32, SELBr32, CGTr32>;
+def : SELECTNegCondImm<setle, R32C, i32, i32ImmSExt10, SELBr32, CGTIr32>;
+def : SELECTNegCondReg<setule, R32C, i32, SELBr32, CLGTr32>;
+def : SELECTNegCondImm<setule, R32C, i32, i32ImmSExt10, SELBr32, CLGTIr32>;
+
+class SELECTBinOpReg<PatFrag cond, RegisterClass rclass, ValueType inttype,
+ SPUInstr selinstr, SPUInstr binop, SPUInstr cmpOp1,
+ SPUInstr cmpOp2>:
+ Pat<(select (inttype (cond rclass:$rA, rclass:$rB)),
+ rclass:$rTrue, rclass:$rFalse),
+ (selinstr rclass:$rFalse, rclass:$rTrue,
+ (binop (cmpOp1 rclass:$rA, rclass:$rB),
+ (cmpOp2 rclass:$rA, rclass:$rB)))>;
+
+class SELECTBinOpImm<PatFrag cond, RegisterClass rclass, PatLeaf immpred,
+ ValueType inttype,
+ SPUInstr selinstr, SPUInstr binop, SPUInstr cmpOp1,
+ SPUInstr cmpOp2>:
+ Pat<(select (inttype (cond rclass:$rA, (inttype immpred:$imm))),
+ rclass:$rTrue, rclass:$rFalse),
+ (selinstr rclass:$rFalse, rclass:$rTrue,
+ (binop (cmpOp1 rclass:$rA, (inttype immpred:$imm)),
+ (cmpOp2 rclass:$rA, (inttype immpred:$imm))))>;
+
+def : SELECTBinOpReg<setge, R8C, i8, SELBr8, ORr8, CGTBr8, CEQBr8>;
+def : SELECTBinOpImm<setge, R8C, immSExt8, i8,
+ SELBr8, ORr8, CGTBIr8, CEQBIr8>;
+
+def : SELECTBinOpReg<setge, R16C, i16, SELBr16, ORr16, CGTHr16, CEQHr16>;
+def : SELECTBinOpImm<setge, R16C, i16ImmSExt10, i16,
+ SELBr16, ORr16, CGTHIr16, CEQHIr16>;
+
+def : SELECTBinOpReg<setge, R32C, i32, SELBr32, ORr32, CGTr32, CEQr32>;
+def : SELECTBinOpImm<setge, R32C, i32ImmSExt10, i32,
+ SELBr32, ORr32, CGTIr32, CEQIr32>;
+
+def : SELECTBinOpReg<setuge, R8C, i8, SELBr8, ORr8, CLGTBr8, CEQBr8>;
+def : SELECTBinOpImm<setuge, R8C, immSExt8, i8,
+ SELBr8, ORr8, CLGTBIr8, CEQBIr8>;
+
+def : SELECTBinOpReg<setuge, R16C, i16, SELBr16, ORr16, CLGTHr16, CEQHr16>;
+def : SELECTBinOpImm<setuge, R16C, i16ImmUns10, i16,
+ SELBr16, ORr16, CLGTHIr16, CEQHIr16>;
+
+def : SELECTBinOpReg<setuge, R32C, i32, SELBr32, ORr32, CLGTr32, CEQr32>;
+def : SELECTBinOpImm<setuge, R32C, i32ImmUns10, i32,
+ SELBr32, ORr32, CLGTIr32, CEQIr32>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+let isCall = 1,
+ // All calls clobber the non-callee-saved registers:
+ Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9,
+ R10,R11,R12,R13,R14,R15,R16,R17,R18,R19,
+ R20,R21,R22,R23,R24,R25,R26,R27,R28,R29,
+ R30,R31,R32,R33,R34,R35,R36,R37,R38,R39,
+ R40,R41,R42,R43,R44,R45,R46,R47,R48,R49,
+ R50,R51,R52,R53,R54,R55,R56,R57,R58,R59,
+ R60,R61,R62,R63,R64,R65,R66,R67,R68,R69,
+ R70,R71,R72,R73,R74,R75,R76,R77,R78,R79],
+ // All of these instructions use $lr (aka $0)
+ Uses = [R0] in {
+ // Branch relative and set link: used when we know that the target
+ // is within [-32768, 32767] bytes of the branch instruction itself
+ def BRSL:
+ BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func, variable_ops),
+ "brsl\t$$lr, $func",
+ [(SPUcall (SPUpcrel tglobaladdr:$func, 0))]>;
+
+ // Branch absolute and set link: used when we know the target's
+ // absolute address
+ def BRASL:
+ BranchSetLink<0b011001100, (outs), (ins calltarget:$func, variable_ops),
+ "brasl\t$$lr, $func",
+ [(SPUcall (SPUaform tglobaladdr:$func, 0))]>;
+
+ // Branch indirect and set link if external data. These instructions are not
+ // generated directly; they are matched via an intrinsic:
+ def BISLED_00: BISLEDForm<0b11, "bisled\t$$lr, $func", [/* empty pattern */]>;
+ def BISLED_E0: BISLEDForm<0b10, "bisled\t$$lr, $func", [/* empty pattern */]>;
+ def BISLED_0D: BISLEDForm<0b01, "bisled\t$$lr, $func", [/* empty pattern */]>;
+ def BISLED_ED: BISLEDForm<0b00, "bisled\t$$lr, $func", [/* empty pattern */]>;
+
+ // Branch indirect and set link. This is the "X-form" address version of a
+ // function call
+ def BISL:
+ BIForm<0b10010101100, "bisl\t$$lr, $func", [(SPUcall R32C:$func)]>;
+}
+
+// Support calls to external symbols:
+def : Pat<(SPUcall (SPUpcrel texternalsym:$func, 0)),
+ (BRSL texternalsym:$func)>;
+
+def : Pat<(SPUcall (SPUaform texternalsym:$func, 0)),
+ (BRASL texternalsym:$func)>;
+
+// Unconditional branches:
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+ def BR :
+ UncondBranch<0b001001100, (outs), (ins brtarget:$dest),
+ "br\t$dest",
+ [(br bb:$dest)]>;
+
+ // Unconditional, absolute address branch
+ def BRA:
+ UncondBranch<0b001100000, (outs), (ins brtarget:$dest),
+ "bra\t$dest",
+ [/* no pattern */]>;
+
+ // Indirect branch
+ def BI:
+ BIForm<0b00010101100, "bi\t$func", [(brind R32C:$func)]>;
+
+ // Conditional branches:
+ class BRNZInst<dag IOL, list<dag> pattern>:
+ RI16Form<0b010000100, (outs), IOL, "brnz\t$rCond,$dest",
+ BranchResolv, pattern>;
+
+ class BRNZRegInst<RegisterClass rclass>:
+ BRNZInst<(ins rclass:$rCond, brtarget:$dest),
+ [(brcond rclass:$rCond, bb:$dest)]>;
+
+ class BRNZVecInst<ValueType vectype>:
+ BRNZInst<(ins VECREG:$rCond, brtarget:$dest),
+ [(brcond (vectype VECREG:$rCond), bb:$dest)]>;
+
+ multiclass BranchNotZero {
+ def v4i32 : BRNZVecInst<v4i32>;
+ def r32 : BRNZRegInst<R32C>;
+ }
+
+ defm BRNZ : BranchNotZero;
+
+ class BRZInst<dag IOL, list<dag> pattern>:
+ RI16Form<0b000000100, (outs), IOL, "brz\t$rT,$dest",
+ BranchResolv, pattern>;
+
+ class BRZRegInst<RegisterClass rclass>:
+ BRZInst<(ins rclass:$rT, brtarget:$dest), [/* no pattern */]>;
+
+ class BRZVecInst<ValueType vectype>:
+ BRZInst<(ins VECREG:$rT, brtarget:$dest), [/* no pattern */]>;
+
+ multiclass BranchZero {
+ def v4i32: BRZVecInst<v4i32>;
+ def r32: BRZRegInst<R32C>;
+ }
+
+ defm BRZ: BranchZero;
+
+ // Note: LLVM doesn't generate conditional indirect branches; otherwise
+ // these would be useful:
+ /*
+ class BINZInst<dag IOL, list<dag> pattern>:
+ BICondForm<0b10010100100, (outs), IOL, "binz\t$rA, $dest", pattern>;
+
+ class BINZRegInst<RegisterClass rclass>:
+ BINZInst<(ins rclass:$rA, brtarget:$dest),
+ [(brcond rclass:$rA, R32C:$dest)]>;
+
+ class BINZVecInst<ValueType vectype>:
+ BINZInst<(ins VECREG:$rA, R32C:$dest),
+ [(brcond (vectype VECREG:$rA), R32C:$dest)]>;
+
+ multiclass BranchNotZeroIndirect {
+ def v4i32: BINZVecInst<v4i32>;
+ def r32: BINZRegInst<R32C>;
+ }
+
+ defm BINZ: BranchNotZeroIndirect;
+
+ class BIZInst<dag IOL, list<dag> pattern>:
+ BICondForm<0b00010100100, (outs), IOL, "biz\t$rA, $func", pattern>;
+
+ class BIZRegInst<RegisterClass rclass>:
+ BIZInst<(ins rclass:$rA, R32C:$func), [/* no pattern */]>;
+
+ class BIZVecInst<ValueType vectype>:
+ BIZInst<(ins VECREG:$rA, R32C:$func), [/* no pattern */]>;
+
+ multiclass BranchZeroIndirect {
+ def v4i32: BIZVecInst<v4i32>;
+ def r32: BIZRegInst<R32C>;
+ }
+
+ defm BIZ: BranchZeroIndirect;
+ */
+
+ class BRHNZInst<dag IOL, list<dag> pattern>:
+ RI16Form<0b011000100, (outs), IOL, "brhnz\t$rCond,$dest", BranchResolv,
+ pattern>;
+
+ class BRHNZRegInst<RegisterClass rclass>:
+ BRHNZInst<(ins rclass:$rCond, brtarget:$dest),
+ [(brcond rclass:$rCond, bb:$dest)]>;
+
+ class BRHNZVecInst<ValueType vectype>:
+ BRHNZInst<(ins VECREG:$rCond, brtarget:$dest), [/* no pattern */]>;
+
+ multiclass BranchNotZeroHalfword {
+ def v8i16: BRHNZVecInst<v8i16>;
+ def r16: BRHNZRegInst<R16C>;
+ }
+
+ defm BRHNZ: BranchNotZeroHalfword;
+
+ class BRHZInst<dag IOL, list<dag> pattern>:
+ RI16Form<0b001000100, (outs), IOL, "brhz\t$rT,$dest", BranchResolv,
+ pattern>;
+
+ class BRHZRegInst<RegisterClass rclass>:
+ BRHZInst<(ins rclass:$rT, brtarget:$dest), [/* no pattern */]>;
+
+ class BRHZVecInst<ValueType vectype>:
+ BRHZInst<(ins VECREG:$rT, brtarget:$dest), [/* no pattern */]>;
+
+ multiclass BranchZeroHalfword {
+ def v8i16: BRHZVecInst<v8i16>;
+ def r16: BRHZRegInst<R16C>;
+ }
+
+ defm BRHZ: BranchZeroHalfword;
+}
+
+//===----------------------------------------------------------------------===//
+// setcc and brcond patterns:
+//===----------------------------------------------------------------------===//
+
+def : Pat<(brcond (i16 (seteq R16C:$rA, 0)), bb:$dest),
+ (BRHZr16 R16C:$rA, bb:$dest)>;
+def : Pat<(brcond (i16 (setne R16C:$rA, 0)), bb:$dest),
+ (BRHNZr16 R16C:$rA, bb:$dest)>;
+
+def : Pat<(brcond (i32 (seteq R32C:$rA, 0)), bb:$dest),
+ (BRZr32 R32C:$rA, bb:$dest)>;
+def : Pat<(brcond (i32 (setne R32C:$rA, 0)), bb:$dest),
+ (BRNZr32 R32C:$rA, bb:$dest)>;
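+
+// Illustrative only (not an additional pattern): a source-level
+// 'if (x != 0) goto L' with x held in R32C selects BRNZr32 above and
+// prints as something like 'brnz $3, L'.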
+
+multiclass BranchCondEQ<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32>
+{
+ def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+ (brinst16 (CEQHIr16 R16C:$rA, i16ImmSExt10:$val), bb:$dest)>;
+
+ def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+ (brinst16 (CEQHr16 R16C:$rA, R16C:$rB), bb:$dest)>;
+
+ def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+ (brinst32 (CEQIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>;
+
+ def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+ (brinst32 (CEQr32 R32C:$rA, R32C:$rB), bb:$dest)>;
+}
+
+defm BRCONDeq : BranchCondEQ<seteq, BRHNZr16, BRNZr32>;
+defm BRCONDne : BranchCondEQ<setne, BRHZr16, BRZr32>;
+
+multiclass BranchCondLGT<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32>
+{
+ def r16imm : Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+ (brinst16 (CLGTHIr16 R16C:$rA, i16ImmSExt10:$val), bb:$dest)>;
+
+ def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+ (brinst16 (CLGTHr16 R16C:$rA, R16C:$rB), bb:$dest)>;
+
+ def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+ (brinst32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>;
+
+ def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+ (brinst32 (CLGTr32 R32C:$rA, R32C:$rB), bb:$dest)>;
+}
+
+defm BRCONDugt : BranchCondLGT<setugt, BRHNZr16, BRNZr32>;
+defm BRCONDule : BranchCondLGT<setule, BRHZr16, BRZr32>;
+
+multiclass BranchCondLGTEQ<PatFrag cond, SPUInstr orinst16, SPUInstr brinst16,
+ SPUInstr orinst32, SPUInstr brinst32>
+{
+ def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+ (brinst16 (orinst16 (CLGTHIr16 R16C:$rA, i16ImmSExt10:$val),
+ (CEQHIr16 R16C:$rA, i16ImmSExt10:$val)),
+ bb:$dest)>;
+
+ def r16: Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+ (brinst16 (orinst16 (CLGTHr16 R16C:$rA, R16C:$rB),
+ (CEQHr16 R16C:$rA, R16C:$rB)),
+ bb:$dest)>;
+
+ def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+ (brinst32 (orinst32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$val),
+ (CEQIr32 R32C:$rA, i32ImmSExt10:$val)),
+ bb:$dest)>;
+
+ def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+ (brinst32 (orinst32 (CLGTr32 R32C:$rA, R32C:$rB),
+ (CEQr32 R32C:$rA, R32C:$rB)),
+ bb:$dest)>;
+}
+
+defm BRCONDuge : BranchCondLGTEQ<setuge, ORr16, BRHNZr16, ORr32, BRNZr32>;
+defm BRCONDult : BranchCondLGTEQ<setult, ORr16, BRHZr16, ORr32, BRZr32>;
+
+multiclass BranchCondGT<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32>
+{
+ def r16imm : Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+ (brinst16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$val), bb:$dest)>;
+
+ def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+ (brinst16 (CGTHr16 R16C:$rA, R16C:$rB), bb:$dest)>;
+
+ def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+ (brinst32 (CGTIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>;
+
+ def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+ (brinst32 (CGTr32 R32C:$rA, R32C:$rB), bb:$dest)>;
+}
+
+defm BRCONDgt : BranchCondGT<setgt, BRHNZr16, BRNZr32>;
+defm BRCONDle : BranchCondGT<setle, BRHZr16, BRZr32>;
+
+multiclass BranchCondGTEQ<PatFrag cond, SPUInstr orinst16, SPUInstr brinst16,
+ SPUInstr orinst32, SPUInstr brinst32>
+{
+ def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+ (brinst16 (orinst16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$val),
+ (CEQHIr16 R16C:$rA, i16ImmSExt10:$val)),
+ bb:$dest)>;
+
+ def r16: Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+ (brinst16 (orinst16 (CGTHr16 R16C:$rA, R16C:$rB),
+ (CEQHr16 R16C:$rA, R16C:$rB)),
+ bb:$dest)>;
+
+ def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+ (brinst32 (orinst32 (CGTIr32 R32C:$rA, i32ImmSExt10:$val),
+ (CEQIr32 R32C:$rA, i32ImmSExt10:$val)),
+ bb:$dest)>;
+
+ def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+ (brinst32 (orinst32 (CGTr32 R32C:$rA, R32C:$rB),
+ (CEQr32 R32C:$rA, R32C:$rB)),
+ bb:$dest)>;
+}
+
+defm BRCONDge : BranchCondGTEQ<setge, ORr16, BRHNZr16, ORr32, BRNZr32>;
+defm BRCONDlt : BranchCondGTEQ<setlt, ORr16, BRHZr16, ORr32, BRZr32>;
+
+let isTerminator = 1, isBarrier = 1 in {
+ let isReturn = 1 in {
+ def RET:
+ RETForm<"bi\t$$lr", [(retflag)]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Single precision floating point instructions
+//===----------------------------------------------------------------------===//
+
+class FAInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b01011000100, OOL, IOL, "fa\t$rT, $rA, $rB",
+ SPrecFP, pattern>;
+
+class FAVecInst<ValueType vectype>:
+ FAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT),
+ (fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
+
+multiclass SFPAdd
+{
+ def v4f32: FAVecInst<v4f32>;
+ def f32: FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+ [(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>;
+}
+
+defm FA : SFPAdd;
+
+class FSInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b01011000100, OOL, IOL, "fs\t$rT, $rA, $rB",
+ SPrecFP, pattern>;
+
+class FSVecInst<ValueType vectype>:
+ FSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ [(set (vectype VECREG:$rT),
+ (fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
+
+multiclass SFPSub
+{
+ def v4f32: FSVecInst<v4f32>;
+ def f32: FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+ [(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>;
+}
+
+defm FS : SFPSub;
+
+// Floating point reciprocal estimate
+
+class FRESTInst<dag OOL, dag IOL>:
+ RRForm_1<0b00110111000, OOL, IOL,
+ "frest\t$rT, $rA", SPrecFP,
+ [/* no pattern */]>;
+
+def FRESTv4f32 :
+ FRESTInst<(outs VECREG:$rT), (ins VECREG:$rA)>;
+
+def FRESTf32 :
+ FRESTInst<(outs R32FP:$rT), (ins R32FP:$rA)>;
+
+// Floating point interpolate (used in conjunction with reciprocal estimate)
+def FIv4f32 :
+ RRForm<0b00101011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fi\t$rT, $rA, $rB", SPrecFP,
+ [/* no pattern */]>;
+
+def FIf32 :
+ RRForm<0b00101011110, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+ "fi\t$rT, $rA, $rB", SPrecFP,
+ [/* no pattern */]>;
+
+//--------------------------------------------------------------------------
+// Basic single precision floating point comparisons:
+//
+// Note: There is no support on SPU for single precision NaN. Consequently,
+// ordered and unordered comparisons are the same.
+//--------------------------------------------------------------------------
+
+def FCEQf32 :
+ RRForm<0b01000011110, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
+ "fceq\t$rT, $rA, $rB", SPrecFP,
+ [(set R32C:$rT, (setueq R32FP:$rA, R32FP:$rB))]>;
+
+def : Pat<(setoeq R32FP:$rA, R32FP:$rB),
+ (FCEQf32 R32FP:$rA, R32FP:$rB)>;
+
+def FCMEQf32 :
+ RRForm<0b01010011110, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
+ "fcmeq\t$rT, $rA, $rB", SPrecFP,
+ [(set R32C:$rT, (setueq (fabs R32FP:$rA), (fabs R32FP:$rB)))]>;
+
+def : Pat<(setoeq (fabs R32FP:$rA), (fabs R32FP:$rB)),
+ (FCMEQf32 R32FP:$rA, R32FP:$rB)>;
+
+def FCGTf32 :
+ RRForm<0b01000011010, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
+ "fcgt\t$rT, $rA, $rB", SPrecFP,
+ [(set R32C:$rT, (setugt R32FP:$rA, R32FP:$rB))]>;
+
+def : Pat<(setugt R32FP:$rA, R32FP:$rB),
+ (FCGTf32 R32FP:$rA, R32FP:$rB)>;
+
+def FCMGTf32 :
+ RRForm<0b01010011010, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
+ "fcmgt\t$rT, $rA, $rB", SPrecFP,
+ [(set R32C:$rT, (setugt (fabs R32FP:$rA), (fabs R32FP:$rB)))]>;
+
+def : Pat<(setugt (fabs R32FP:$rA), (fabs R32FP:$rB)),
+ (FCMGTf32 R32FP:$rA, R32FP:$rB)>;
+
+//--------------------------------------------------------------------------
+// Single precision floating point comparisons and SETCC equivalents:
+//--------------------------------------------------------------------------
+
+def : SETCCNegCondReg<setune, R32FP, i32, XORIr32, FCEQf32>;
+def : SETCCNegCondReg<setone, R32FP, i32, XORIr32, FCEQf32>;
+
+def : SETCCBinOpReg<setuge, R32FP, ORr32, FCGTf32, FCEQf32>;
+def : SETCCBinOpReg<setoge, R32FP, ORr32, FCGTf32, FCEQf32>;
+
+def : SETCCBinOpReg<setult, R32FP, NORr32, FCGTf32, FCEQf32>;
+def : SETCCBinOpReg<setolt, R32FP, NORr32, FCGTf32, FCEQf32>;
+
+def : Pat<(setule R32FP:$rA, R32FP:$rB),
+ (XORIr32 (FCGTf32 R32FP:$rA, R32FP:$rB), 0xffffffff)>;
+def : Pat<(setole R32FP:$rA, R32FP:$rB),
+ (XORIr32 (FCGTf32 R32FP:$rA, R32FP:$rB), 0xffffffff)>;
+
+// FP Status and Control Register Write
+// Why isn't rT a don't care in the ISA?
+// Should we create a special RRForm_3 for this guy and zero out the rT?
+def FSCRWf32 :
+ RRForm_1<0b01011101110, (outs R32FP:$rT), (ins R32FP:$rA),
+ "fscrwr\t$rA", SPrecFP,
+ [/* This instruction requires an intrinsic. Note: rT is unused. */]>;
+
+// FP Status and Control Register Read
+def FSCRRf32 :
+ RRForm_2<0b01011101110, (outs R32FP:$rT), (ins),
+ "fscrrd\t$rT", SPrecFP,
+ [/* This instruction requires an intrinsic */]>;
+
+// LLVM instruction space
+// How do these map onto Cell instructions?
+// fdiv rA rB
+// frest rC rB # c = 1/b (both lines)
+// fi rC rB rC
+// fm rD rA rC # d = a * 1/b
+// fnms rB rD rB rA # b = -(d * b - a) -- should == 0 in a perfect world
+// fma rB rB rC rD # b = b * c + d
+// = -(d * b - a) * c + d
+// = a * c - c * (a * b * c - a)
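+//
+// Illustrative walk-through (a sketch, not generated code): for a = 1.0,
+// b = 3.0, frest/fi produce c ~= 1/3 and fm produces d = a*c ~= 0.333; the
+// fnms/fma pair then applies one Newton-Raphson refinement,
+// d' = d + c*(a - d*b), pushing the quotient toward full single precision.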
+
+// fcopysign (???)
+
+// Library calls:
+// These LLVM instructions will actually map to library calls.
+// All that's needed, then, is to check that the appropriate library is
+// imported and do a brsl to the proper function name.
+// frem # fmod(x, y): x - (x/y) * y
+// (Note: fmod(double, double), fmodf(float, float))
+// fsqrt?
+// fsin?
+// fcos?
+// Unimplemented SPU instruction space
+// floating reciprocal absolute square root estimate (frsqest)
+
+// The following are probably just intrinsics
+// status and control register write
+// status and control register read
+
+//--------------------------------------
+// Floating point multiply instructions
+//--------------------------------------
+
+def FMv4f32:
+ RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "fm\t$rT, $rA, $rB", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (fmul (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB)))]>;
+
+def FMf32 :
+ RRForm<0b01100011010, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+ "fm\t$rT, $rA, $rB", SPrecFP,
+ [(set R32FP:$rT, (fmul R32FP:$rA, R32FP:$rB))]>;
+
+// Floating point multiply and add
+// e.g. d = c + (a * b)
+def FMAv4f32:
+ RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "fma\t$rT, $rA, $rB, $rC", SPrecFP,
+ [(set (v4f32 VECREG:$rT),
+ (fadd (v4f32 VECREG:$rC),
+ (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB))))]>;
+
+def FMAf32:
+ RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC),
+ "fma\t$rT, $rA, $rB, $rC", SPrecFP,
+ [(set R32FP:$rT, (fadd R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>;
+
+// FP multiply and subtract
+// Subtracts value in rC from product
+// res = a * b - c
+def FMSv4f32 :
+ RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "fms\t$rT, $rA, $rB, $rC", SPrecFP,
+ [(set (v4f32 VECREG:$rT),
+ (fsub (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)),
+ (v4f32 VECREG:$rC)))]>;
+
+def FMSf32 :
+ RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC),
+ "fms\t$rT, $rA, $rB, $rC", SPrecFP,
+ [(set R32FP:$rT,
+ (fsub (fmul R32FP:$rA, R32FP:$rB), R32FP:$rC))]>;
+
+// Floating Negative Multiply and Subtract
+// Subtracts product from value in rC
+// res = fneg(fms a b c)
+// = - (a * b - c)
+// = c - a * b
+// NOTE: subtraction order
+// fsub a b = a - b
+// fs a b = b - a?
+def FNMSf32 :
+ RRRForm<0b1101, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC),
+ "fnms\t$rT, $rA, $rB, $rC", SPrecFP,
+ [(set R32FP:$rT, (fsub R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>;
+
+def FNMSv4f32 :
+ RRRForm<0b1101, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "fnms\t$rT, $rA, $rB, $rC", SPrecFP,
+ [(set (v4f32 VECREG:$rT),
+ (fsub (v4f32 VECREG:$rC),
+ (fmul (v4f32 VECREG:$rA),
+ (v4f32 VECREG:$rB))))]>;
+
+//--------------------------------------
+// Floating Point Conversions
+// Signed conversions:
+def CSiFv4f32:
+ CVTIntFPForm<0b0101101110, (outs VECREG:$rT), (ins VECREG:$rA),
+ "csflt\t$rT, $rA, 0", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (sint_to_fp (v4i32 VECREG:$rA)))]>;
+
+// Convert signed integer to floating point
+def CSiFf32 :
+ CVTIntFPForm<0b0101101110, (outs R32FP:$rT), (ins R32C:$rA),
+ "csflt\t$rT, $rA, 0", SPrecFP,
+ [(set R32FP:$rT, (sint_to_fp R32C:$rA))]>;
+
+// Convert unsigned integer to float
+def CUiFv4f32 :
+ CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA),
+ "cuflt\t$rT, $rA, 0", SPrecFP,
+ [(set (v4f32 VECREG:$rT), (uint_to_fp (v4i32 VECREG:$rA)))]>;
+
+def CUiFf32 :
+ CVTIntFPForm<0b1101101110, (outs R32FP:$rT), (ins R32C:$rA),
+ "cuflt\t$rT, $rA, 0", SPrecFP,
+ [(set R32FP:$rT, (uint_to_fp R32C:$rA))]>;
+
+// Convert float to unsigned int
+// Assume that scale = 0
+
+def CFUiv4f32 :
+ CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA),
+ "cfltu\t$rT, $rA, 0", SPrecFP,
+ [(set (v4i32 VECREG:$rT), (fp_to_uint (v4f32 VECREG:$rA)))]>;
+
+def CFUif32 :
+ CVTIntFPForm<0b1101101110, (outs R32C:$rT), (ins R32FP:$rA),
+ "cfltu\t$rT, $rA, 0", SPrecFP,
+ [(set R32C:$rT, (fp_to_uint R32FP:$rA))]>;
+
+// Convert float to signed int
+// Assume that scale = 0
+
+def CFSiv4f32 :
+ CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA),
+ "cflts\t$rT, $rA, 0", SPrecFP,
+ [(set (v4i32 VECREG:$rT), (fp_to_sint (v4f32 VECREG:$rA)))]>;
+
+def CFSif32 :
+ CVTIntFPForm<0b1101101110, (outs R32C:$rT), (ins R32FP:$rA),
+ "cflts\t$rT, $rA, 0", SPrecFP,
+ [(set R32C:$rT, (fp_to_sint R32FP:$rA))]>;
+
+//===----------------------------------------------------------------------==//
+// Single<->Double precision conversions
+//===----------------------------------------------------------------------==//
+
+// NOTE: We use "vec" name suffix here to avoid confusion (e.g. input is a
+// v4f32, output is v2f64--which goes in the name?)
+
+// Floating point extend single to double
+// NOTE: Not sure if passing in v4f32 to FESDvec is correct since it
+// operates on two double-word slots (i.e. 1st and 3rd fp numbers
+// are ignored).
+def FESDvec :
+ RRForm_1<0b00011101110, (outs VECREG:$rT), (ins VECREG:$rA),
+ "fesd\t$rT, $rA", SPrecFP,
+ [(set (v2f64 VECREG:$rT), (fextend (v4f32 VECREG:$rA)))]>;
+
+def FESDf32 :
+ RRForm_1<0b00011101110, (outs R64FP:$rT), (ins R32FP:$rA),
+ "fesd\t$rT, $rA", SPrecFP,
+ [(set R64FP:$rT, (fextend R32FP:$rA))]>;
+
+// Floating point round double to single
+//def FRDSvec :
+// RRForm_1<0b10011101110, (outs VECREG:$rT), (ins VECREG:$rA),
+// "frds\t$rT, $rA,", SPrecFP,
+// [(set (v4f32 R32FP:$rT), (fround (v2f64 R64FP:$rA)))]>;
+
+def FRDSf64 :
+ RRForm_1<0b10011101110, (outs R32FP:$rT), (ins R64FP:$rA),
+ "frds\t$rT, $rA", SPrecFP,
+ [(set R32FP:$rT, (fround R64FP:$rA))]>;
+
+// TODO: include anyextend?
+
+//===----------------------------------------------------------------------==//
+// Double precision floating point instructions
+//===----------------------------------------------------------------------==//
+def FAf64 :
+ RRForm<0b00110011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
+ "dfa\t$rT, $rA, $rB", DPrecFP,
+ [(set R64FP:$rT, (fadd R64FP:$rA, R64FP:$rB))]>;
+
+def FAv2f64 :
+ RRForm<0b00110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfa\t$rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT), (fadd (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>;
+
+def FSf64 :
+ RRForm<0b10100011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
+ "dfs\t$rT, $rA, $rB", DPrecFP,
+ [(set R64FP:$rT, (fsub R64FP:$rA, R64FP:$rB))]>;
+
+def FSv2f64 :
+ RRForm<0b10100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfs\t$rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT),
+ (fsub (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>;
+
+def FMf64 :
+ RRForm<0b01100011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
+ "dfm\t$rT, $rA, $rB", DPrecFP,
+ [(set R64FP:$rT, (fmul R64FP:$rA, R64FP:$rB))]>;
+
+def FMv2f64:
+ RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+ "dfm\t$rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT),
+ (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>;
+
+def FMAf64:
+ RRForm<0b00111010110, (outs R64FP:$rT),
+ (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+ "dfma\t$rT, $rA, $rB", DPrecFP,
+ [(set R64FP:$rT, (fadd R64FP:$rC, (fmul R64FP:$rA, R64FP:$rB)))]>,
+ RegConstraint<"$rC = $rT">,
+ NoEncode<"$rC">;
+
+def FMAv2f64:
+ RRForm<0b00111010110, (outs VECREG:$rT),
+ (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "dfma\t$rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT),
+ (fadd (v2f64 VECREG:$rC),
+ (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB))))]>,
+ RegConstraint<"$rC = $rT">,
+ NoEncode<"$rC">;
+
+def FMSf64 :
+ RRForm<0b10111010110, (outs R64FP:$rT),
+ (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+ "dfms\t$rT, $rA, $rB", DPrecFP,
+ [(set R64FP:$rT, (fsub (fmul R64FP:$rA, R64FP:$rB), R64FP:$rC))]>,
+ RegConstraint<"$rC = $rT">,
+ NoEncode<"$rC">;
+
+def FMSv2f64 :
+ RRForm<0b10111010110, (outs VECREG:$rT),
+ (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "dfms\t$rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT),
+ (fsub (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)),
+ (v2f64 VECREG:$rC)))]>;
+
+// DFNMS: -(a * b - c)
+//      = -(a * b) + c = c - (a * b)
+
+class DFNMSInst<dag OOL, dag IOL, list<dag> pattern>:
+ RRForm<0b01111010110, OOL, IOL, "dfnms\t$rT, $rA, $rB",
+ DPrecFP, pattern>,
+ RegConstraint<"$rC = $rT">,
+ NoEncode<"$rC">;
+
+class DFNMSVecInst<list<dag> pattern>:
+ DFNMSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ pattern>;
+
+class DFNMSRegInst<list<dag> pattern>:
+ DFNMSInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+ pattern>;
+
+multiclass DFMultiplySubtract
+{
+ def v2f64 : DFNMSVecInst<[(set (v2f64 VECREG:$rT),
+ (fsub (v2f64 VECREG:$rC),
+ (fmul (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB))))]>;
+
+ def f64 : DFNMSRegInst<[(set R64FP:$rT,
+ (fsub R64FP:$rC,
+ (fmul R64FP:$rA, R64FP:$rB)))]>;
+}
+
+defm DFNMS : DFMultiplySubtract;
+
+// DFNMA: -(a * b + c)
+//      = -(a * b) - c
+def FNMAf64 :
+ RRForm<0b11111010110, (outs R64FP:$rT),
+ (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+ "dfnma\t$rT, $rA, $rB", DPrecFP,
+ [(set R64FP:$rT, (fneg (fadd R64FP:$rC, (fmul R64FP:$rA, R64FP:$rB))))]>,
+ RegConstraint<"$rC = $rT">,
+ NoEncode<"$rC">;
+
+def FNMAv2f64 :
+ RRForm<0b11111010110, (outs VECREG:$rT),
+ (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+ "dfnma\t$rT, $rA, $rB", DPrecFP,
+ [(set (v2f64 VECREG:$rT),
+ (fneg (fadd (v2f64 VECREG:$rC),
+ (fmul (v2f64 VECREG:$rA),
+ (v2f64 VECREG:$rB)))))]>,
+ RegConstraint<"$rC = $rT">,
+ NoEncode<"$rC">;
+
+//===----------------------------------------------------------------------==//
+// Floating point negation and absolute value
+//===----------------------------------------------------------------------==//
+
+def : Pat<(fneg (v4f32 VECREG:$rA)),
+ (XORfnegvec (v4f32 VECREG:$rA),
+ (v4f32 (ILHUv4i32 0x8000)))>;
+
+def : Pat<(fneg R32FP:$rA),
+ (XORfneg32 R32FP:$rA, (ILHUr32 0x8000))>;
+
+// Floating point absolute value
+// Note: f64 fabs is custom-selected.
+
+def : Pat<(fabs R32FP:$rA),
+ (ANDfabs32 R32FP:$rA, (IOHLr32 (ILHUr32 0x7fff), 0xffff))>;
+
+def : Pat<(fabs (v4f32 VECREG:$rA)),
+ (ANDfabsvec (v4f32 VECREG:$rA),
+ (IOHLv4i32 (ILHUv4i32 0x7fff), 0xffff))>;
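+
+// (Summary of the two groups above: fneg XORs the IEEE-754 sign bit, using
+// the 0x80000000 word built by ILHU; fabs ANDs the sign bit away, using the
+// 0x7fffffff mask built by the ILHU/IOHL pair.)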
+
+//===----------------------------------------------------------------------===//
+// Hint for branch instructions:
+//===----------------------------------------------------------------------===//
+
+/* def HBR : SPUInstr<(outs), (ins), "hbr\t" */
+
+//===----------------------------------------------------------------------===//
+// Execution NOP and Load NOP (execute NOPs belong in the even pipeline,
+// load NOPs in the odd pipeline)
+//===----------------------------------------------------------------------===//
+
+def ENOP : SPUInstr<(outs), (ins), "enop", ExecNOP> {
+ let Pattern = [];
+
+ let Inst{0-10} = 0b10000000010;
+ let Inst{11-17} = 0;
+ let Inst{18-24} = 0;
+ let Inst{25-31} = 0;
+}
+
+def LNOP : SPUInstr<(outs), (ins), "lnop", LoadNOP> {
+ let Pattern = [];
+
+ let Inst{0-10} = 0b10000000000;
+ let Inst{11-17} = 0;
+ let Inst{18-24} = 0;
+ let Inst{25-31} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Bit conversions (type conversions between vector/packed types)
+// NOTE: Promotions are handled using the XS* instructions.
+//===----------------------------------------------------------------------===//
+def : Pat<(v16i8 (bitconvert (v8i16 VECREG:$src))), (v16i8 VECREG:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VECREG:$src))), (v16i8 VECREG:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VECREG:$src))), (v16i8 VECREG:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VECREG:$src))), (v16i8 VECREG:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VECREG:$src))), (v16i8 VECREG:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VECREG:$src))), (v8i16 VECREG:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VECREG:$src))), (v8i16 VECREG:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VECREG:$src))), (v8i16 VECREG:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VECREG:$src))), (v8i16 VECREG:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VECREG:$src))), (v8i16 VECREG:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VECREG:$src))), (v4i32 VECREG:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VECREG:$src))), (v4i32 VECREG:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VECREG:$src))), (v4i32 VECREG:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VECREG:$src))), (v4i32 VECREG:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VECREG:$src))), (v4i32 VECREG:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v16i8 VECREG:$src))), (v2i64 VECREG:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VECREG:$src))), (v2i64 VECREG:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VECREG:$src))), (v2i64 VECREG:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VECREG:$src))), (v2i64 VECREG:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 VECREG:$src))), (v2i64 VECREG:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VECREG:$src))), (v4f32 VECREG:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VECREG:$src))), (v4f32 VECREG:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VECREG:$src))), (v4f32 VECREG:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VECREG:$src))), (v4f32 VECREG:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VECREG:$src))), (v4f32 VECREG:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v16i8 VECREG:$src))), (v2f64 VECREG:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 VECREG:$src))), (v2f64 VECREG:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 VECREG:$src))), (v2f64 VECREG:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 VECREG:$src))), (v2f64 VECREG:$src)>;
+def : Pat<(v2f64 (bitconvert (v2f64 VECREG:$src))), (v2f64 VECREG:$src)>;
+
+def : Pat<(i128 (bitconvert (v16i8 VECREG:$src))),
+ (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v8i16 VECREG:$src))),
+ (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v4i32 VECREG:$src))),
+ (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v2i64 VECREG:$src))),
+ (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v4f32 VECREG:$src))),
+ (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v2f64 VECREG:$src))),
+ (ORi128_vec VECREG:$src)>;
+
+def : Pat<(v16i8 (bitconvert (i128 GPRC:$src))),
+ (v16i8 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v8i16 (bitconvert (i128 GPRC:$src))),
+ (v8i16 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v4i32 (bitconvert (i128 GPRC:$src))),
+ (v4i32 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v2i64 (bitconvert (i128 GPRC:$src))),
+ (v2i64 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v4f32 (bitconvert (i128 GPRC:$src))),
+ (v4f32 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v2f64 (bitconvert (i128 GPRC:$src))),
+ (v2f64 (ORvec_i128 GPRC:$src))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction patterns:
+//===----------------------------------------------------------------------===//
+
+// General 32-bit constants:
+def : Pat<(i32 imm:$imm),
+ (IOHLr32 (ILHUr32 (HI16 imm:$imm)), (LO16 imm:$imm))>;
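+// e.g. (illustrative, not emitted verbatim): materializing 0x12345678 becomes
+// 'ilhu $rT, 0x1234' (rT = 0x12340000) followed by 'iohl $rT, 0x5678', which
+// ORs the low halfword in.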
+
+// Single precision float constants:
+def : Pat<(f32 fpimm:$imm),
+ (IOHLf32 (ILHUf32 (HI16_f32 fpimm:$imm)), (LO16_f32 fpimm:$imm))>;
+
+// General constant 32-bit vectors
+def : Pat<(v4i32 v4i32Imm:$imm),
+ (IOHLv4i32 (v4i32 (ILHUv4i32 (HI16_vec v4i32Imm:$imm))),
+ (LO16_vec v4i32Imm:$imm))>;
+
+// 8-bit constants
+def : Pat<(i8 imm:$imm),
+ (ILHr8 imm:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Call instruction patterns:
+//===----------------------------------------------------------------------===//
+// Return void
+def : Pat<(ret),
+ (RET)>;
+
+//===----------------------------------------------------------------------===//
+// Zero/Any/Sign extensions
+//===----------------------------------------------------------------------===//
+
+// sext 8->32: Sign extend bytes to words
+def : Pat<(sext_inreg R32C:$rSrc, i8),
+ (XSHWr32 (XSBHr32 R32C:$rSrc))>;
+
+def : Pat<(i32 (sext R8C:$rSrc)),
+ (XSHWr16 (XSBHr8 R8C:$rSrc))>;
+
+// sext 8->64: Sign extend bytes to double word
+def : Pat<(sext_inreg R64C:$rSrc, i8),
+ (XSWDr64_inreg (XSHWr64 (XSBHr64 R64C:$rSrc)))>;
+
+def : Pat<(i64 (sext R8C:$rSrc)),
+ (XSWDr64 (XSHWr16 (XSBHr8 R8C:$rSrc)))>;
+
+// zext 8->16: Zero extend bytes to halfwords
+def : Pat<(i16 (zext R8C:$rSrc)),
+ (ANDHIi8i16 R8C:$rSrc, 0xff)>;
+
+// zext 8->32: Zero extend bytes to words
+def : Pat<(i32 (zext R8C:$rSrc)),
+ (ANDIi8i32 R8C:$rSrc, 0xff)>;
+
+// zext 8->64: Zero extend bytes to double words
+def : Pat<(i64 (zext R8C:$rSrc)),
+ (ORi64_v2i64 (SELBv4i32 (ROTQMBYv4i32
+ (ORv4i32_i32 (ANDIi8i32 R8C:$rSrc, 0xff)),
+ 0x4),
+ (ILv4i32 0x0),
+ (FSMBIv4i32 0x0f0f)))>;
+
+// anyext 8->16: Extend 8->16 bits, irrespective of sign, preserves high bits
+def : Pat<(i16 (anyext R8C:$rSrc)),
+ (ORHIi8i16 R8C:$rSrc, 0)>;
+
+// anyext 8->32: Extend 8->32 bits, irrespective of sign, preserves high bits
+def : Pat<(i32 (anyext R8C:$rSrc)),
+ (ORIi8i32 R8C:$rSrc, 0)>;
+
+// sext 16->64: Sign extend halfword to double word
+def : Pat<(sext_inreg R64C:$rSrc, i16),
+ (XSWDr64_inreg (XSHWr64 R64C:$rSrc))>;
+
+def : Pat<(sext R16C:$rSrc),
+ (XSWDr64 (XSHWr16 R16C:$rSrc))>;
+
+// zext 16->32: Zero extend halfwords to words
+def : Pat<(i32 (zext R16C:$rSrc)),
+ (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff))>;
+
+def : Pat<(i32 (zext (and R16C:$rSrc, 0xf))),
+ (ANDIi16i32 R16C:$rSrc, 0xf)>;
+
+def : Pat<(i32 (zext (and R16C:$rSrc, 0xff))),
+ (ANDIi16i32 R16C:$rSrc, 0xff)>;
+
+def : Pat<(i32 (zext (and R16C:$rSrc, 0xfff))),
+ (ANDIi16i32 R16C:$rSrc, 0xfff)>;
+
+// anyext 16->32: Extend 16->32 bits, irrespective of sign
+def : Pat<(i32 (anyext R16C:$rSrc)),
+ (ORIi16i32 R16C:$rSrc, 0)>;
+
+//===----------------------------------------------------------------------===//
+// Truncates:
+// These truncates are for the SPU's supported types (i8, i16, i32). i64 and
+// above are custom lowered.
+//===----------------------------------------------------------------------===//
+
+def : Pat<(i8 (trunc GPRC:$src)),
+ (ORi8_v16i8
+ (SHUFBgprc GPRC:$src, GPRC:$src,
+ (IOHLv4i32 (ILHUv4i32 0x0f0f), 0x0f0f)))>;
+
+def : Pat<(i8 (trunc R64C:$src)),
+ (ORi8_v16i8
+ (SHUFBv2i64_m32
+ (ORv2i64_i64 R64C:$src),
+ (ORv2i64_i64 R64C:$src),
+ (IOHLv4i32 (ILHUv4i32 0x0707), 0x0707)))>;
+
+def : Pat<(i8 (trunc R32C:$src)),
+ (ORi8_v16i8
+ (SHUFBv4i32_m32
+ (ORv4i32_i32 R32C:$src),
+ (ORv4i32_i32 R32C:$src),
+ (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>;
+
+def : Pat<(i8 (trunc R16C:$src)),
+ (ORi8_v16i8
+ (SHUFBv4i32_m32
+ (ORv8i16_i16 R16C:$src),
+ (ORv8i16_i16 R16C:$src),
+ (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>;
+
+def : Pat<(i16 (trunc GPRC:$src)),
+ (ORi16_v8i16
+ (SHUFBgprc GPRC:$src, GPRC:$src,
+ (IOHLv4i32 (ILHUv4i32 0x0e0f), 0x0e0f)))>;
+
+def : Pat<(i16 (trunc R64C:$src)),
+ (ORi16_v8i16
+ (SHUFBv2i64_m32
+ (ORv2i64_i64 R64C:$src),
+ (ORv2i64_i64 R64C:$src),
+ (IOHLv4i32 (ILHUv4i32 0x0607), 0x0607)))>;
+
+def : Pat<(i16 (trunc R32C:$src)),
+ (ORi16_v8i16
+ (SHUFBv4i32_m32
+ (ORv4i32_i32 R32C:$src),
+ (ORv4i32_i32 R32C:$src),
+ (IOHLv4i32 (ILHUv4i32 0x0203), 0x0203)))>;
+
+def : Pat<(i32 (trunc GPRC:$src)),
+ (ORi32_v4i32
+ (SHUFBgprc GPRC:$src, GPRC:$src,
+ (IOHLv4i32 (ILHUv4i32 0x0c0d), 0x0e0f)))>;
+
+def : Pat<(i32 (trunc R64C:$src)),
+ (ORi32_v4i32
+ (SHUFBv2i64_m32
+ (ORv2i64_i64 R64C:$src),
+ (ORv2i64_i64 R64C:$src),
+ (IOHLv4i32 (ILHUv4i32 0x0405), 0x0607)))>;
+
+//===----------------------------------------------------------------------===//
+// Address generation: SPU, like PPC, has to split addresses into high and
+// low parts in order to load them into a register.
+//===----------------------------------------------------------------------===//
+
+def : Pat<(SPUaform tglobaladdr:$in, 0), (ILAlsa tglobaladdr:$in)>;
+def : Pat<(SPUaform texternalsym:$in, 0), (ILAlsa texternalsym:$in)>;
+def : Pat<(SPUaform tjumptable:$in, 0), (ILAlsa tjumptable:$in)>;
+def : Pat<(SPUaform tconstpool:$in, 0), (ILAlsa tconstpool:$in)>;
+
+def : Pat<(SPUindirect (SPUhi tglobaladdr:$in, 0),
+ (SPUlo tglobaladdr:$in, 0)),
+ (IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>;
+
+def : Pat<(SPUindirect (SPUhi texternalsym:$in, 0),
+ (SPUlo texternalsym:$in, 0)),
+ (IOHLlo (ILHUhi texternalsym:$in), texternalsym:$in)>;
+
+def : Pat<(SPUindirect (SPUhi tjumptable:$in, 0),
+ (SPUlo tjumptable:$in, 0)),
+ (IOHLlo (ILHUhi tjumptable:$in), tjumptable:$in)>;
+
+def : Pat<(SPUindirect (SPUhi tconstpool:$in, 0),
+ (SPUlo tconstpool:$in, 0)),
+ (IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>;
+
+def : Pat<(add (SPUhi tglobaladdr:$in, 0), (SPUlo tglobaladdr:$in, 0)),
+ (IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>;
+
+def : Pat<(add (SPUhi texternalsym:$in, 0), (SPUlo texternalsym:$in, 0)),
+ (IOHLlo (ILHUhi texternalsym:$in), texternalsym:$in)>;
+
+def : Pat<(add (SPUhi tjumptable:$in, 0), (SPUlo tjumptable:$in, 0)),
+ (IOHLlo (ILHUhi tjumptable:$in), tjumptable:$in)>;
+
+def : Pat<(add (SPUhi tconstpool:$in, 0), (SPUlo tconstpool:$in, 0)),
+ (IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>;
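+
+// e.g. (illustrative): taking a global's address this way expands to an ILHU
+// of the high 16 bits of the address followed by an IOHL of the low 16 bits,
+// mirroring the 32-bit constant patterns above.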
+
+// Intrinsics:
+include "CellSDKIntrinsics.td"
+// Various math operator instruction sequences
+include "SPUMathInstr.td"
+// 64-bit "instructions"/support
+include "SPU64InstrInfo.td"
+// 128-bit "instructions"/support
+include "SPU128InstrInfo.td"
diff --git a/lib/Target/CellSPU/SPUMachineFunction.h b/lib/Target/CellSPU/SPUMachineFunction.h
new file mode 100644
index 0000000..6a66967
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMachineFunction.h
@@ -0,0 +1,43 @@
+//===-- SPUMachineFunction.h - Private data used for CellSPU ------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the IBM Cell SPU specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_MACHINE_FUNCTION_INFO_H
+#define SPU_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// SPUFunctionInfo - Cell SPU target-specific information for each
+/// MachineFunction
+class SPUFunctionInfo : public MachineFunctionInfo {
+private:
+ /// UsesLR - Indicates whether LR is used in the current function.
+ ///
+ bool UsesLR;
+
+public:
+ SPUFunctionInfo(MachineFunction& MF)
+ : UsesLR(false)
+ {}
+
+ void setUsesLR(bool U) { UsesLR = U; }
+ bool usesLR() const { return UsesLR; }
+
+};
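+
+// Usage sketch (illustrative; the getInfo<> accessor is the standard
+// MachineFunctionInfo idiom, not code added in this patch):
+//
+//   SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
+//   FuncInfo->setUsesLR(true);          // record that $lr is used
+//   if (FuncInfo->usesLR()) { /* save/restore $lr in prologue/epilogue */ }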
+
+} // end of namespace llvm
+
+
+#endif
+
diff --git a/lib/Target/CellSPU/SPUMathInstr.td b/lib/Target/CellSPU/SPUMathInstr.td
new file mode 100644
index 0000000..80ebde3
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMathInstr.td
@@ -0,0 +1,97 @@
+//======--- SPUMathInstr.td - Cell SPU math operations -*- tablegen -*--======//
+//
+// Cell SPU math operations
+//
+// This target description file contains instruction sequences for various
+// math operations, such as vector multiplies, i32 multiply, etc., for the
+// SPU's i32, i16, and i8 types and their corresponding vector types.
+//
+// Any resemblance to libsimdmath or the Cell SDK simdmath library is
+// purely and completely coincidental.
+//===----------------------------------------------------------------------===//
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v16i8 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def : Pat<(mul (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)),
+ (ORv4i32
+ (ANDv4i32
+ (SELBv4i32 (MPYv8i16 VECREG:$rA, VECREG:$rB),
+ (SHLHIv8i16 (MPYv8i16 (ROTMAHIv8i16 VECREG:$rA, 8),
+ (ROTMAHIv8i16 VECREG:$rB, 8)), 8),
+ (FSMBIv8i16 0x2222)),
+ (ILAv4i32 0x0000ffff)),
+ (SHLIv4i32
+ (SELBv4i32 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 16),
+ (ROTMAIv4i32_i32 VECREG:$rB, 16)),
+ (SHLHIv8i16 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 8),
+ (ROTMAIv4i32_i32 VECREG:$rB, 8)), 8),
+ (FSMBIv8i16 0x2222)), 16))>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v8i16 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def : Pat<(mul (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
+ (SELBv8i16 (MPYv8i16 VECREG:$rA, VECREG:$rB),
+ (SHLIv4i32 (MPYHHv8i16 VECREG:$rA, VECREG:$rB), 16),
+ (FSMBIv8i16 0xcccc))>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v4i32, i32 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def MPYv4i32:
+ Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)),
+ (Av4i32
+ (Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB),
+ (MPYHv4i32 VECREG:$rB, VECREG:$rA)),
+ (MPYUv4i32 VECREG:$rA, VECREG:$rB))>;
+
+def MPYi32:
+ Pat<(mul R32C:$rA, R32C:$rB),
+ (Ar32
+ (Ar32 (MPYHr32 R32C:$rA, R32C:$rB),
+ (MPYHr32 R32C:$rB, R32C:$rA)),
+ (MPYUr32 R32C:$rA, R32C:$rB))>;
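+
+// (Why three multiplies: the SPU multiplier is 16x16 -> 32 bits, so the low
+// 32 bits of a full product are assembled as
+//   mpyh(a,b) + mpyh(b,a) + mpyu(a,b)
+// -- the two cross products involving one high halfword each, plus the
+// unsigned product of the two low halfwords; the high*high term only affects
+// bits above 32 and is dropped.)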
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// f32, v4f32 divide instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// Reciprocal estimate and interpolation
+def Interpf32: CodeFrag<(FIf32 R32FP:$rB, (FRESTf32 R32FP:$rB))>;
+// Division estimate
+def DivEstf32: CodeFrag<(FMf32 R32FP:$rA, Interpf32.Fragment)>;
+// Newton-Raphson iteration
+def NRaphf32: CodeFrag<(FMAf32 (FNMSf32 DivEstf32.Fragment, R32FP:$rB, R32FP:$rA),
+ Interpf32.Fragment,
+ DivEstf32.Fragment)>;
+// Epsilon addition
+def Epsilonf32: CodeFrag<(AIf32 NRaphf32.Fragment, 1)>;
+
+def : Pat<(fdiv R32FP:$rA, R32FP:$rB),
+ (SELBf32_cond NRaphf32.Fragment,
+ Epsilonf32.Fragment,
+ (CGTIf32 (FNMSf32 R32FP:$rB, Epsilonf32.Fragment, R32FP:$rA), -1))>;
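+
+// (Illustrative reading, an interpretation rather than documented semantics:
+// the FNMSf32 computes a - b*(q + 1ulp); the CGTIf32 ... -1 test passes when
+// that residual is non-negative, i.e. the bumped quotient still does not
+// overshoot, in which case the epsilon-adjusted value is selected, otherwise
+// the unadjusted Newton-Raphson result.)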
+
+// Reciprocal estimate and interpolation
+def Interpv4f32: CodeFrag<(FIv4f32 (v4f32 VECREG:$rB), (FRESTv4f32 (v4f32 VECREG:$rB)))>;
+// Division estimate
+def DivEstv4f32: CodeFrag<(FMv4f32 (v4f32 VECREG:$rA), Interpv4f32.Fragment)>;
+// Newton-Raphson iteration
+def NRaphv4f32: CodeFrag<(FMAv4f32 (FNMSv4f32 DivEstv4f32.Fragment,
+ (v4f32 VECREG:$rB),
+ (v4f32 VECREG:$rA)),
+ Interpv4f32.Fragment,
+ DivEstv4f32.Fragment)>;
+// Epsilon addition
+def Epsilonv4f32: CodeFrag<(AIv4f32 NRaphv4f32.Fragment, 1)>;
+
+def : Pat<(fdiv (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)),
+ (SELBv4f32_cond NRaphv4f32.Fragment,
+ Epsilonv4f32.Fragment,
+ (CGTIv4f32 (FNMSv4f32 (v4f32 VECREG:$rB),
+ Epsilonv4f32.Fragment,
+ (v4f32 VECREG:$rA)), -1))>;
diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td
new file mode 100644
index 0000000..87c4115
--- /dev/null
+++ b/lib/Target/CellSPU/SPUNodes.td
@@ -0,0 +1,156 @@
+//===- SPUNodes.td - Specialized SelectionDAG nodes used for CellSPU ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Type profiles and SelectionDAG nodes used by CellSPU
+//
+//===----------------------------------------------------------------------===//
+
+// Type profile for a call sequence
+def SDT_SPUCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
+
+// SPU_GenControl: Type profile for generating control words for insertions
+def SPU_GenControl : SDTypeProfile<1, 1, []>;
+def SPUshufmask : SDNode<"SPUISD::SHUFFLE_MASK", SPU_GenControl, []>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPUCallSeq,
+ [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPUCallSeq,
+ [SDNPHasChain, SDNPOutFlag]>;
+//===----------------------------------------------------------------------===//
+// Operand constraints:
+//===----------------------------------------------------------------------===//
+
+def SDT_SPUCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def SPUcall : SDNode<"SPUISD::CALL", SDT_SPUCall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Operand type constraints for vector shuffle/permute operations
+def SDT_SPUshuffle : SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
+]>;
+
+// Vector binary operator type constraints (needs a further constraint to
+// ensure that operand 0 is a vector...):
+
+def SPUVecBinop: SDTypeProfile<1, 2, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
+]>;
+
+// Trinary operators, e.g., addx, carry generate
+def SPUIntTrinaryOp : SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>
+]>;
+
+// SELECT_MASK type constraints: There are several variations for the various
+// vector types (this avoids having to bit_convert all over the place.)
+def SPUselmask_type: SDTypeProfile<1, 1, [
+ SDTCisInt<1>
+]>;
+
+// SELB type constraints:
+def SPUselb_type: SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<0, 3> ]>;
+
+// SPU Vector shift pseudo-instruction type constraints
+def SPUvecshift_type: SDTypeProfile<1, 2, [
+ SDTCisSameAs<0, 1>, SDTCisInt<2>]>;
+
+// "marker" type for i64 operators that need a shuffle mask
+// (i.e., uses cg or bg or another instruction that needs to
+// use shufb to get things in the right place.)
+// Op0: The result
+// Op1, 2: LHS, RHS
+// Op3: Carry-generate shuffle mask
+
+def SPUmarker_type : SDTypeProfile<1, 3, [
+ SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ]>;
+
+//===----------------------------------------------------------------------===//
+// Synthetic/pseudo-instructions
+//===----------------------------------------------------------------------===//
+
+// SPU CNTB:
+def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>;
+
+// SPU vector shuffle node, matched by the SPUISD::SHUFB enum (see
+// SPUISelLowering.h):
+def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>;
+
+// Shift left quadword by bits and bytes
+def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>;
+def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>;
+
+// Vector shifts (ISD::SHL,SRL,SRA are for _integers_ only):
+def SPUvec_shl: SDNode<"SPUISD::VEC_SHL", SPUvecshift_type, []>;
+def SPUvec_srl: SDNode<"SPUISD::VEC_SRL", SPUvecshift_type, []>;
+def SPUvec_sra: SDNode<"SPUISD::VEC_SRA", SPUvecshift_type, []>;
+
+def SPUvec_rotl: SDNode<"SPUISD::VEC_ROTL", SPUvecshift_type, []>;
+def SPUvec_rotr: SDNode<"SPUISD::VEC_ROTR", SPUvecshift_type, []>;
+
+// Vector rotate left; bits shifted out on the left are rotated in on the right
+def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT",
+ SPUvecshift_type, []>;
+
+// Vector rotate left by bytes, but the count is given in bits and the SPU
+// internally converts it to bytes (saving an instruction that would otherwise
+// be needed to mask off the lower three bits)
+def SPUrotbytes_left_bits : SDNode<"SPUISD::ROTBYTES_LEFT_BITS",
+ SPUvecshift_type>;
+
+// SPU form select mask for bytes, immediate
+def SPUselmask: SDNode<"SPUISD::SELECT_MASK", SPUselmask_type, []>;
+
+// SPU select bits instruction
+def SPUselb: SDNode<"SPUISD::SELB", SPUselb_type, []>;
+
+def SDTprefslot2vec: SDTypeProfile<1, 1, []>;
+def SPUprefslot2vec: SDNode<"SPUISD::PREFSLOT2VEC", SDTprefslot2vec, []>;
+
+def SPU_vec_demote : SDTypeProfile<1, 1, []>;
+def SPUvec2prefslot: SDNode<"SPUISD::VEC2PREFSLOT", SPU_vec_demote, []>;
+
+// Address high and low components, used for [r+r] type addressing
+def SPUhi : SDNode<"SPUISD::Hi", SDTIntBinOp, []>;
+def SPUlo : SDNode<"SPUISD::Lo", SDTIntBinOp, []>;
+
+// PC-relative address
+def SPUpcrel : SDNode<"SPUISD::PCRelAddr", SDTIntBinOp, []>;
+
+// A-Form local store addresses
+def SPUaform : SDNode<"SPUISD::AFormAddr", SDTIntBinOp, []>;
+
+// Indirect [D-Form "imm($reg)" and X-Form "$reg($reg)"] addresses
+def SPUindirect : SDNode<"SPUISD::IndirectAddr", SDTIntBinOp, []>;
+
+// i64 markers: supplies extra operands used to generate the i64 operator
+// instruction sequences
+def SPUadd64 : SDNode<"SPUISD::ADD64_MARKER", SPUmarker_type, []>;
+def SPUsub64 : SDNode<"SPUISD::SUB64_MARKER", SPUmarker_type, []>;
+def SPUmul64 : SDNode<"SPUISD::MUL64_MARKER", SPUmarker_type, []>;
+
+//===----------------------------------------------------------------------===//
+// Constraints: (taken from PPCInstrInfo.td)
+//===----------------------------------------------------------------------===//
+
+class RegConstraint<string C> {
+ string Constraints = C;
+}
+
+class NoEncode<string E> {
+ string DisableEncoding = E;
+}
+
+//===----------------------------------------------------------------------===//
+// Return (flag isn't quite what it means: the operations are flagged so that
+// instruction scheduling doesn't disassociate them.)
+//===----------------------------------------------------------------------===//
+
+def retflag : SDNode<"SPUISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
diff --git a/lib/Target/CellSPU/SPUOperands.td b/lib/Target/CellSPU/SPUOperands.td
new file mode 100644
index 0000000..802628f
--- /dev/null
+++ b/lib/Target/CellSPU/SPUOperands.td
@@ -0,0 +1,655 @@
+//===- SPUOperands.td - Cell SPU Instruction Operands ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Cell SPU Instruction Operands:
+//===----------------------------------------------------------------------===//
+
+def LO16 : SDNodeXForm<imm, [{
+ unsigned val = N->getZExtValue();
+ // Transformation function: get the low 16 bits.
+ return getI32Imm(val & 0xffff);
+}]>;
+
+def LO16_vec : SDNodeXForm<scalar_to_vector, [{
+ SDValue OpVal(0, 0);
+
+ // Transformation function: get the low 16 bit immediate from a build_vector
+ // node.
+ assert(N->getOpcode() == ISD::BUILD_VECTOR
+ && "LO16_vec got something other than a BUILD_VECTOR");
+
+ // Get the first defined (non-undef) operand, assumed to be a constant...
+ for (unsigned i = 0, e = N->getNumOperands();
+ OpVal.getNode() == 0 && i != e; ++i) {
+ if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (OpVal.getNode() == 0)
+ OpVal = N->getOperand(i);
+ }
+
+ assert(OpVal.getNode() != 0 && "LO16_vec did not locate a <defined> node");
+ ConstantSDNode *CN = cast<ConstantSDNode>(OpVal);
+ return getI32Imm((unsigned)CN->getZExtValue() & 0xffff);
+}]>;
+
+// Transform an immediate, returning the high 16 bits shifted down:
+def HI16 : SDNodeXForm<imm, [{
+ return getI32Imm((unsigned)N->getZExtValue() >> 16);
+}]>;
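+
+// e.g. (illustrative): for the immediate 0x12345678, HI16 yields 0x1234 and
+// LO16 yields 0x5678.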
+
+// Transformation function: shift the high 16 bit immediate from a build_vector
+// node into the low 16 bits, and return a 16-bit constant.
+def HI16_vec : SDNodeXForm<scalar_to_vector, [{
+ SDValue OpVal(0, 0);
+
+ assert(N->getOpcode() == ISD::BUILD_VECTOR
+ && "HI16_vec got something other than a BUILD_VECTOR");
+
+ // Get the first defined (non-undef) operand, assumed to be a constant...
+ for (unsigned i = 0, e = N->getNumOperands();
+ OpVal.getNode() == 0 && i != e; ++i) {
+ if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (OpVal.getNode() == 0)
+ OpVal = N->getOperand(i);
+ }
+
+ assert(OpVal.getNode() != 0 && "HI16_vec did not locate a <defined> node");
+ ConstantSDNode *CN = cast<ConstantSDNode>(OpVal);
+ return getI32Imm((unsigned)CN->getZExtValue() >> 16);
+}]>;
+
+// simm7 predicate - True if the immediate fits in a 7-bit signed
+// field.
+def simm7: PatLeaf<(imm), [{
+ int sextVal = int(N->getSExtValue());
+ return (sextVal >= -64 && sextVal <= 63);
+}]>;
+
+// uimm7 predicate - True if the immediate fits in a 7-bit unsigned
+// field.
+def uimm7: PatLeaf<(imm), [{
+ return (N->getZExtValue() <= 0x7f);
+}]>;
+
+// immSExt8 predicate - True if the immediate fits in an 8-bit sign extended
+// field.
+def immSExt8 : PatLeaf<(imm), [{
+ int Value = int(N->getSExtValue());
+ return (Value >= -(1 << 7) && Value <= (1 << 7) - 1);
+}]>;
+
+// immU8: immediate, unsigned 8-bit quantity
+def immU8 : PatLeaf<(imm), [{
+ return (N->getZExtValue() <= 0xff);
+}]>;
+
+// i64ImmSExt10 predicate - True if the i64 immediate fits in a 10-bit sign
+// extended field. Used by RI10Form instructions like 'ldq'.
+def i64ImmSExt10 : PatLeaf<(imm), [{
+ return isI64IntS10Immediate(N);
+}]>;
+
+// i32ImmSExt10 predicate - True if the i32 immediate fits in a 10-bit sign
+// extended field. Used by RI10Form instructions like 'ldq'.
+def i32ImmSExt10 : PatLeaf<(imm), [{
+ return isI32IntS10Immediate(N);
+}]>;
+
+// i32ImmUns10 predicate - True if the i32 immediate fits in a 10-bit unsigned
+// field. Used by RI10Form instructions like 'ldq'.
+def i32ImmUns10 : PatLeaf<(imm), [{
+ return isI32IntU10Immediate(N);
+}]>;
+
+// i16ImmSExt10 predicate - True if the i16 immediate fits in a 10-bit sign
+// extended field. Used by RI10Form instructions like 'ldq'.
+def i16ImmSExt10 : PatLeaf<(imm), [{
+ return isI16IntS10Immediate(N);
+}]>;
+
+// i16ImmUns10 predicate - True if the i16 immediate fits into a 10-bit unsigned
+// value. Used by RI10Form instructions.
+def i16ImmUns10 : PatLeaf<(imm), [{
+ return isI16IntU10Immediate(N);
+}]>;
+
+def immSExt16 : PatLeaf<(imm), [{
+ // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended
+ // field.
+ short Ignored;
+ return isIntS16Immediate(N, Ignored);
+}]>;
+
+def immZExt16 : PatLeaf<(imm), [{
+ // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+ // field.
+ return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+def immU16 : PatLeaf<(imm), [{
+ // immU16 predicate- True if the immediate fits into a 16-bit unsigned field.
+ return (uint64_t)N->getZExtValue() == (N->getZExtValue() & 0xffff);
+}]>;
+
+def imm18 : PatLeaf<(imm), [{
+ // imm18 predicate: True if the immediate fits into an 18-bit unsigned field.
+ int Value = (int) N->getZExtValue();
+ return ((Value & ((1 << 18) - 1)) == Value);
+}]>;
+
+def lo16 : PatLeaf<(imm), [{
+ // lo16 predicate - returns true if the immediate fits in the low order
+ // 16 bits (i.e., the high order bits are all zero) and is a 32-bit constant:
+ if (N->getValueType(0) == MVT::i32) {
+ uint32_t val = N->getZExtValue();
+ return ((val & 0x0000ffff) == val);
+ }
+
+ return false;
+}], LO16>;
+
+def hi16 : PatLeaf<(imm), [{
+ // hi16 predicate - returns true if the immediate has all zeros in the
+ // low order bits and is a 32-bit constant:
+ if (N->getValueType(0) == MVT::i32) {
+ uint32_t val = uint32_t(N->getZExtValue());
+ return ((val & 0xffff0000) == val);
+ } else if (N->getValueType(0) == MVT::i64) {
+ uint64_t val = N->getZExtValue();
+ return ((val & 0xffff0000ULL) == val);
+ }
+
+ return false;
+}], HI16>;
+
+def bitshift : PatLeaf<(imm), [{
+ // bitshift predicate - returns true if 0 < imm <= 7 for SHLQBII
+ // (shift left quadword by bits immediate)
+ int64_t Val = N->getZExtValue();
+ return (Val > 0 && Val <= 7);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Floating point operands:
+//===----------------------------------------------------------------------===//
+
+// Transform a float, returning the high 16 bits shifted down, as if
+// the float was really an unsigned integer:
+def HI16_f32 : SDNodeXForm<fpimm, [{
+ float fval = N->getValueAPF().convertToFloat();
+ return getI32Imm(FloatToBits(fval) >> 16);
+}]>;
+
+// Transformation function on floats: get the low 16 bits as if the float was
+// an unsigned integer.
+def LO16_f32 : SDNodeXForm<fpimm, [{
+ float fval = N->getValueAPF().convertToFloat();
+ return getI32Imm(FloatToBits(fval) & 0xffff);
+}]>;
+
+def FPimm_sext16 : SDNodeXForm<fpimm, [{
+ float fval = N->getValueAPF().convertToFloat();
+ return getI32Imm((int) ((FloatToBits(fval) << 16) >> 16));
+}]>;
+
+def FPimm_u18 : SDNodeXForm<fpimm, [{
+ float fval = N->getValueAPF().convertToFloat();
+ return getI32Imm(FloatToBits(fval) & ((1 << 18) - 1));
+}]>;
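+// Worked example of the float transforms above (illustrative): 1.0f has the
+// bit pattern 0x3f800000, so HI16_f32 yields 0x3f80, LO16_f32 yields 0x0000,
+// and FPimm_sext16 sign-extends the low halfword, also giving 0.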
+
+def fpimmSExt16 : PatLeaf<(fpimm), [{
+ short Ignored;
+ return isFPS16Immediate(N, Ignored);
+}], FPimm_sext16>;
+
+// Does the SFP constant only have the upper 16 bits set?
+def hi16_f32 : PatLeaf<(fpimm), [{
+ if (N->getValueType(0) == MVT::f32) {
+ uint32_t val = FloatToBits(N->getValueAPF().convertToFloat());
+ return ((val & 0xffff0000) == val);
+ }
+
+ return false;
+}], HI16_f32>;
+
+// Does the SFP constant fit into 18 bits?
+def fpimm18 : PatLeaf<(fpimm), [{
+ if (N->getValueType(0) == MVT::f32) {
+ uint32_t Value = FloatToBits(N->getValueAPF().convertToFloat());
+ return ((Value & ((1 << 18) - 1)) == Value);
+ }
+
+ return false;
+}], FPimm_u18>;
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands (TODO):
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// build_vector operands:
+//===----------------------------------------------------------------------===//
+
+// v16i8SExt8Imm_xform function: convert build_vector to 8-bit sign extended
+// immediate constant load for v16i8 vectors. N.B.: The incoming constant has
+// to be a 16-bit quantity with the upper and lower bytes equal (e.g., 0x2a2a).
+def v16i8SExt8Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8);
+}]>;
+
+// v16i8SExt8Imm: Predicate test for 8-bit sign extended immediate constant
+// load; works in conjunction with its transform function. N.B.: This relies on
+// the incoming constant being a 16-bit quantity, where the upper and lower bytes
+// are EXACTLY the same (e.g., 0x2a2a)
+def v16i8SExt8Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8).getNode() != 0;
+}], v16i8SExt8Imm_xform>;
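+// Worked example (illustrative): a v16i8 build_vector splatting 0x2a is seen
+// by the selector as the 16-bit quantity 0x2a2a; both bytes match, so the
+// predicate fires and the xform emits the i8 immediate 0x2a.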
+
+// v16i8U8Imm_xform function: convert build_vector to unsigned 8-bit
+// immediate constant load for v16i8 vectors. N.B.: The incoming constant has
+// to be a 16-bit quantity with the upper and lower bytes equal (e.g., 0x2a2a).
+def v16i8U8Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8);
+}]>;
+
+// v16i8U8Imm: Predicate test for unsigned 8-bit immediate constant
+// load; works in conjunction with its transform function. N.B.: This relies on
+// the incoming constant being a 16-bit quantity, where the upper and lower bytes
+// are EXACTLY the same (e.g., 0x2a2a)
+def v16i8U8Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8).getNode() != 0;
+}], v16i8U8Imm_xform>;
+
+// v8i16SExt8Imm_xform function: convert build_vector to 8-bit sign extended
+// immediate constant load for v8i16 vectors.
+def v8i16SExt8Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i8imm(N, *CurDAG, MVT::i16);
+}]>;
+
+// v8i16SExt8Imm: Predicate test for 8-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v8i16SExt8Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i8imm(N, *CurDAG, MVT::i16).getNode() != 0;
+}], v8i16SExt8Imm_xform>;
+
+// v8i16SExt10Imm_xform function: convert build_vector to 16-bit sign extended
+// immediate constant load for v8i16 vectors.
+def v8i16SExt10Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16);
+}]>;
+
+// v8i16SExt10Imm: Predicate test for 16-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v8i16SExt10Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16).getNode() != 0;
+}], v8i16SExt10Imm_xform>;
+
+// v8i16Uns10Imm_xform function: convert build_vector to 16-bit unsigned
+// immediate constant load for v8i16 vectors.
+def v8i16Uns10Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16);
+}]>;
+
+// v8i16Uns10Imm: Predicate test for 16-bit unsigned immediate constant
+// load, works in conjunction with its transform function.
+def v8i16Uns10Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16).getNode() != 0;
+}], v8i16Uns10Imm_xform>;
+
+// v8i16SExt16Imm_xform function: convert build_vector to 16-bit sign extended
+// immediate constant load for v8i16 vectors.
+def v8i16SExt16Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i16imm(N, *CurDAG, MVT::i16);
+}]>;
+
+// v8i16SExt16Imm: Predicate test for 16-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v8i16SExt16Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i16imm(N, *CurDAG, MVT::i16).getNode() != 0;
+}], v8i16SExt16Imm_xform>;
+
+// v4i32SExt10Imm_xform function: convert build_vector to 10-bit sign extended
+// immediate constant load for v4i32 vectors.
+def v4i32SExt10Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32);
+}]>;
+
+// v4i32SExt10Imm: Predicate test for 10-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v4i32SExt10Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], v4i32SExt10Imm_xform>;
+
+// v4i32Uns10Imm_xform function: convert build_vector to 10-bit unsigned
+// immediate constant load for v4i32 vectors.
+def v4i32Uns10Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32);
+}]>;
+
+// v4i32Uns10Imm: Predicate test for 10-bit unsigned immediate constant
+// load, works in conjunction with its transform function.
+def v4i32Uns10Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], v4i32Uns10Imm_xform>;
+
+// v4i32SExt16Imm_xform function: convert build_vector to 16-bit sign extended
+// immediate constant load for v4i32 vectors.
+def v4i32SExt16Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i16imm(N, *CurDAG, MVT::i32);
+}]>;
+
+// v4i32SExt16Imm: Predicate test for 16-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v4i32SExt16Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i16imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], v4i32SExt16Imm_xform>;
+
+// v4i32Uns18Imm_xform function: convert build_vector to 18-bit unsigned
+// immediate constant load for v4i32 vectors.
+def v4i32Uns18Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_u18imm(N, *CurDAG, MVT::i32);
+}]>;
+
+// v4i32Uns18Imm: Predicate test for 18-bit unsigned immediate constant load,
+// works in conjunction with its transform function.
+def v4i32Uns18Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_u18imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], v4i32Uns18Imm_xform>;
+
+// ILHUvec_get_imm xform function: convert build_vector to ILHUvec imm constant
+// load.
+def ILHUvec_get_imm: SDNodeXForm<build_vector, [{
+ return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i32);
+}]>;
+
+/// immILHUvec: Predicate test for an ILHU constant vector.
+def immILHUvec: PatLeaf<(build_vector), [{
+ return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], ILHUvec_get_imm>;
+
+// Catch-all for any other i32 vector constants
+def v4i32_get_imm: SDNodeXForm<build_vector, [{
+ return SPU::get_v4i32_imm(N, *CurDAG);
+}]>;
+
+def v4i32Imm: PatLeaf<(build_vector), [{
+ return SPU::get_v4i32_imm(N, *CurDAG).getNode() != 0;
+}], v4i32_get_imm>;
+
+// v2i64SExt10Imm_xform function: convert build_vector to 10-bit sign extended
+// immediate constant load for v2i64 vectors.
+def v2i64SExt10Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i64);
+}]>;
+
+// v2i64SExt10Imm: Predicate test for 10-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v2i64SExt10Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i10imm(N, *CurDAG, MVT::i64).getNode() != 0;
+}], v2i64SExt10Imm_xform>;
+
+// v2i64SExt16Imm_xform function: convert build_vector to 16-bit sign extended
+// immediate constant load for v2i64 vectors.
+def v2i64SExt16Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_i16imm(N, *CurDAG, MVT::i64);
+}]>;
+
+// v2i64SExt16Imm: Predicate test for 16-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v2i64SExt16Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_i16imm(N, *CurDAG, MVT::i64).getNode() != 0;
+}], v2i64SExt16Imm_xform>;
+
+// v2i64Uns18Imm_xform function: convert build_vector to 18-bit unsigned
+// immediate constant load for v2i64 vectors.
+def v2i64Uns18Imm_xform: SDNodeXForm<build_vector, [{
+ return SPU::get_vec_u18imm(N, *CurDAG, MVT::i64);
+}]>;
+
+// v2i64Uns18Imm: Predicate test for 18-bit unsigned immediate constant load,
+// works in conjunction with its transform function.
+def v2i64Uns18Imm: PatLeaf<(build_vector), [{
+ return SPU::get_vec_u18imm(N, *CurDAG, MVT::i64).getNode() != 0;
+}], v2i64Uns18Imm_xform>;
+
+/// immILHUvec_i64: Predicate test for an ILHU constant vector (i64 variant).
+def immILHUvec_i64: PatLeaf<(build_vector), [{
+ return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i64).getNode() != 0;
+}], ILHUvec_get_imm>;
+
+// Catch-all for any other i64 vector constants
+def v2i64_get_imm: SDNodeXForm<build_vector, [{
+ return SPU::get_v2i64_imm(N, *CurDAG);
+}]>;
+
+def v2i64Imm: PatLeaf<(build_vector), [{
+ return SPU::get_v2i64_imm(N, *CurDAG).getNode() != 0;
+}], v2i64_get_imm>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+
+def s7imm: Operand<i8> {
+ let PrintMethod = "printS7ImmOperand";
+}
+
+def s7imm_i8: Operand<i8> {
+ let PrintMethod = "printS7ImmOperand";
+}
+
+def u7imm: Operand<i16> {
+ let PrintMethod = "printU7ImmOperand";
+}
+
+def u7imm_i8: Operand<i8> {
+ let PrintMethod = "printU7ImmOperand";
+}
+
+def u7imm_i32: Operand<i32> {
+ let PrintMethod = "printU7ImmOperand";
+}
+
+// Halfword, signed 10-bit constant
+def s10imm : Operand<i16> {
+ let PrintMethod = "printS10ImmOperand";
+}
+
+def s10imm_i8: Operand<i8> {
+ let PrintMethod = "printS10ImmOperand";
+}
+
+def s10imm_i32: Operand<i32> {
+ let PrintMethod = "printS10ImmOperand";
+}
+
+def s10imm_i64: Operand<i64> {
+ let PrintMethod = "printS10ImmOperand";
+}
+
+// Unsigned 10-bit integers:
+def u10imm: Operand<i16> {
+ let PrintMethod = "printU10ImmOperand";
+}
+
+def u10imm_i8: Operand<i8> {
+ let PrintMethod = "printU10ImmOperand";
+}
+
+def u10imm_i32: Operand<i32> {
+ let PrintMethod = "printU10ImmOperand";
+}
+
+def s16imm : Operand<i16> {
+ let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_i8: Operand<i8> {
+ let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_i32: Operand<i32> {
+ let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_i64: Operand<i64> {
+ let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_f32: Operand<f32> {
+ let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_f64: Operand<f64> {
+ let PrintMethod = "printS16ImmOperand";
+}
+
+def u16imm_i64 : Operand<i64> {
+ let PrintMethod = "printU16ImmOperand";
+}
+
+def u16imm_i32 : Operand<i32> {
+ let PrintMethod = "printU16ImmOperand";
+}
+
+def u16imm : Operand<i16> {
+ let PrintMethod = "printU16ImmOperand";
+}
+
+def f16imm : Operand<f32> {
+ let PrintMethod = "printU16ImmOperand";
+}
+
+def s18imm : Operand<i32> {
+ let PrintMethod = "printS18ImmOperand";
+}
+
+def u18imm : Operand<i32> {
+ let PrintMethod = "printU18ImmOperand";
+}
+
+def u18imm_i64 : Operand<i64> {
+ let PrintMethod = "printU18ImmOperand";
+}
+
+def f18imm : Operand<f32> {
+ let PrintMethod = "printU18ImmOperand";
+}
+
+def f18imm_f64 : Operand<f64> {
+ let PrintMethod = "printU18ImmOperand";
+}
+
+// Negated 7-bit halfword rotate immediate operands
+def rothNeg7imm : Operand<i32> {
+ let PrintMethod = "printROTHNeg7Imm";
+}
+
+def rothNeg7imm_i16 : Operand<i16> {
+ let PrintMethod = "printROTHNeg7Imm";
+}
+
+// Negated 7-bit word rotate immediate operands
+def rotNeg7imm : Operand<i32> {
+ let PrintMethod = "printROTNeg7Imm";
+}
+
+def rotNeg7imm_i16 : Operand<i16> {
+ let PrintMethod = "printROTNeg7Imm";
+}
+
+def rotNeg7imm_i8 : Operand<i8> {
+ let PrintMethod = "printROTNeg7Imm";
+}
+
+def target : Operand<OtherVT> {
+ let PrintMethod = "printBranchOperand";
+}
+
+// Absolute address call target
+def calltarget : Operand<iPTR> {
+ let PrintMethod = "printCallOperand";
+ let MIOperandInfo = (ops u18imm:$calldest);
+}
+
+// PC relative call target
+def relcalltarget : Operand<iPTR> {
+ let PrintMethod = "printPCRelativeOperand";
+ let MIOperandInfo = (ops s16imm:$calldest);
+}
+
+// Branch targets:
+def brtarget : Operand<OtherVT> {
+ let PrintMethod = "printPCRelativeOperand";
+}
+
+// Hint for branch target
+def hbrtarget : Operand<OtherVT> {
+ let PrintMethod = "printHBROperand";
+}
+
+// Indirect call target
+def indcalltarget : Operand<iPTR> {
+ let PrintMethod = "printCallOperand";
+ let MIOperandInfo = (ops ptr_rc:$calldest);
+}
+
+def symbolHi: Operand<i32> {
+ let PrintMethod = "printSymbolHi";
+}
+
+def symbolLo: Operand<i32> {
+ let PrintMethod = "printSymbolLo";
+}
+
+def symbolLSA: Operand<i32> {
+ let PrintMethod = "printSymbolLSA";
+}
+
+// Shuffle address memory operand [s7imm(reg) d-format]
+def shufaddr : Operand<iPTR> {
+ let PrintMethod = "printShufAddr";
+ let MIOperandInfo = (ops s7imm:$imm, ptr_rc:$reg);
+}
+
+// memory s10imm(reg) operand
+def dformaddr : Operand<iPTR> {
+ let PrintMethod = "printDFormAddr";
+ let MIOperandInfo = (ops s10imm:$imm, ptr_rc:$reg);
+}
+
+// 256K local store address
+// N.B.: The tblgen code generator expects this operand to have two
+// sub-operands, an offset and a pointer; of these, only the immediate
+// offset is actually used.
+def addr256k : Operand<iPTR> {
+ let PrintMethod = "printAddr256K";
+ let MIOperandInfo = (ops s16imm:$imm, ptr_rc:$reg);
+}
+
+// memory s18imm(reg) operand
+def memri18 : Operand<iPTR> {
+ let PrintMethod = "printMemRegImmS18";
+ let MIOperandInfo = (ops s18imm:$imm, ptr_rc:$reg);
+}
+
+// memory register + register operand
+def memrr : Operand<iPTR> {
+ let PrintMethod = "printMemRegReg";
+ let MIOperandInfo = (ops ptr_rc:$reg_a, ptr_rc:$reg_b);
+}
+
+// Define SPU-specific addressing modes: These come in three basic
+// flavors:
+//
+// D-form : [r+I10] (10-bit signed offset + reg)
+// X-form : [r+r] (reg+reg)
+// A-form : abs (256K LSA offset)
+// D-form(2): [r+I7] (7-bit signed offset + reg)
+
+def dform_addr : ComplexPattern<iPTR, 2, "SelectDFormAddr", [], []>;
+def xform_addr : ComplexPattern<iPTR, 2, "SelectXFormAddr", [], []>;
+def aform_addr : ComplexPattern<iPTR, 2, "SelectAFormAddr", [], []>;
+def dform2_addr : ComplexPattern<iPTR, 2, "SelectDForm2Addr", [], []>;
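+
+// Illustrative example (assumed shapes, not from this file): given the
+// address computation (add $reg, 32), SelectDFormAddr can decompose it into
+// the pair (s10imm 32, $reg), so a load pattern such as
+//
+//   def : Pat<(load dform_addr:$src), (LQDr32 dform_addr:$src)>;
+//
+// would select the D-form load 'lqd 32($reg)'. LQDr32 is named here only for
+// illustration.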
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
new file mode 100644
index 0000000..e031048
--- /dev/null
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -0,0 +1,614 @@
+//===- SPURegisterInfo.cpp - Cell SPU Register Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Cell implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "SPU.h"
+#include "SPURegisterInfo.h"
+#include "SPURegisterNames.h"
+#include "SPUInstrBuilder.h"
+#include "SPUSubtarget.h"
+#include "SPUMachineFunction.h"
+#include "SPUFrameInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+/// getRegisterNumbering - Given the enum value for some register, e.g.
+/// SPU::R14, return the number that it corresponds to (e.g. 14).
+unsigned SPURegisterInfo::getRegisterNumbering(unsigned RegEnum) {
+ using namespace SPU;
+ switch (RegEnum) {
+ case SPU::R0: return 0;
+ case SPU::R1: return 1;
+ case SPU::R2: return 2;
+ case SPU::R3: return 3;
+ case SPU::R4: return 4;
+ case SPU::R5: return 5;
+ case SPU::R6: return 6;
+ case SPU::R7: return 7;
+ case SPU::R8: return 8;
+ case SPU::R9: return 9;
+ case SPU::R10: return 10;
+ case SPU::R11: return 11;
+ case SPU::R12: return 12;
+ case SPU::R13: return 13;
+ case SPU::R14: return 14;
+ case SPU::R15: return 15;
+ case SPU::R16: return 16;
+ case SPU::R17: return 17;
+ case SPU::R18: return 18;
+ case SPU::R19: return 19;
+ case SPU::R20: return 20;
+ case SPU::R21: return 21;
+ case SPU::R22: return 22;
+ case SPU::R23: return 23;
+ case SPU::R24: return 24;
+ case SPU::R25: return 25;
+ case SPU::R26: return 26;
+ case SPU::R27: return 27;
+ case SPU::R28: return 28;
+ case SPU::R29: return 29;
+ case SPU::R30: return 30;
+ case SPU::R31: return 31;
+ case SPU::R32: return 32;
+ case SPU::R33: return 33;
+ case SPU::R34: return 34;
+ case SPU::R35: return 35;
+ case SPU::R36: return 36;
+ case SPU::R37: return 37;
+ case SPU::R38: return 38;
+ case SPU::R39: return 39;
+ case SPU::R40: return 40;
+ case SPU::R41: return 41;
+ case SPU::R42: return 42;
+ case SPU::R43: return 43;
+ case SPU::R44: return 44;
+ case SPU::R45: return 45;
+ case SPU::R46: return 46;
+ case SPU::R47: return 47;
+ case SPU::R48: return 48;
+ case SPU::R49: return 49;
+ case SPU::R50: return 50;
+ case SPU::R51: return 51;
+ case SPU::R52: return 52;
+ case SPU::R53: return 53;
+ case SPU::R54: return 54;
+ case SPU::R55: return 55;
+ case SPU::R56: return 56;
+ case SPU::R57: return 57;
+ case SPU::R58: return 58;
+ case SPU::R59: return 59;
+ case SPU::R60: return 60;
+ case SPU::R61: return 61;
+ case SPU::R62: return 62;
+ case SPU::R63: return 63;
+ case SPU::R64: return 64;
+ case SPU::R65: return 65;
+ case SPU::R66: return 66;
+ case SPU::R67: return 67;
+ case SPU::R68: return 68;
+ case SPU::R69: return 69;
+ case SPU::R70: return 70;
+ case SPU::R71: return 71;
+ case SPU::R72: return 72;
+ case SPU::R73: return 73;
+ case SPU::R74: return 74;
+ case SPU::R75: return 75;
+ case SPU::R76: return 76;
+ case SPU::R77: return 77;
+ case SPU::R78: return 78;
+ case SPU::R79: return 79;
+ case SPU::R80: return 80;
+ case SPU::R81: return 81;
+ case SPU::R82: return 82;
+ case SPU::R83: return 83;
+ case SPU::R84: return 84;
+ case SPU::R85: return 85;
+ case SPU::R86: return 86;
+ case SPU::R87: return 87;
+ case SPU::R88: return 88;
+ case SPU::R89: return 89;
+ case SPU::R90: return 90;
+ case SPU::R91: return 91;
+ case SPU::R92: return 92;
+ case SPU::R93: return 93;
+ case SPU::R94: return 94;
+ case SPU::R95: return 95;
+ case SPU::R96: return 96;
+ case SPU::R97: return 97;
+ case SPU::R98: return 98;
+ case SPU::R99: return 99;
+ case SPU::R100: return 100;
+ case SPU::R101: return 101;
+ case SPU::R102: return 102;
+ case SPU::R103: return 103;
+ case SPU::R104: return 104;
+ case SPU::R105: return 105;
+ case SPU::R106: return 106;
+ case SPU::R107: return 107;
+ case SPU::R108: return 108;
+ case SPU::R109: return 109;
+ case SPU::R110: return 110;
+ case SPU::R111: return 111;
+ case SPU::R112: return 112;
+ case SPU::R113: return 113;
+ case SPU::R114: return 114;
+ case SPU::R115: return 115;
+ case SPU::R116: return 116;
+ case SPU::R117: return 117;
+ case SPU::R118: return 118;
+ case SPU::R119: return 119;
+ case SPU::R120: return 120;
+ case SPU::R121: return 121;
+ case SPU::R122: return 122;
+ case SPU::R123: return 123;
+ case SPU::R124: return 124;
+ case SPU::R125: return 125;
+ case SPU::R126: return 126;
+ case SPU::R127: return 127;
+ default:
+ cerr << "Unhandled reg in SPURegisterInfo::getRegisterNumbering!\n";
+ abort();
+ }
+}
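+
+// Usage example (illustrative):
+//
+//   unsigned RegNo = SPURegisterInfo::getRegisterNumbering(SPU::R14); // 14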
+
+SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget,
+ const TargetInstrInfo &tii) :
+ SPUGenRegisterInfo(SPU::ADJCALLSTACKDOWN, SPU::ADJCALLSTACKUP),
+ Subtarget(subtarget),
+ TII(tii)
+{
+}
+
+// SPU's 128-bit registers used for argument passing:
+static const unsigned SPU_ArgRegs[] = {
+ SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9,
+ SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
+ SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
+ SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
+ SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
+ SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
+ SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
+ SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
+ SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
+ SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
+ SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
+};
+
+const unsigned *
+SPURegisterInfo::getArgRegs()
+{
+ return SPU_ArgRegs;
+}
+
+unsigned
+SPURegisterInfo::getNumArgRegs()
+{
+ return sizeof(SPU_ArgRegs) / sizeof(SPU_ArgRegs[0]);
+}
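+
+// Usage sketch (illustrative): a calling-convention lowering might walk the
+// formal arguments and assign the i-th one to getArgRegs()[i] while
+// i < getNumArgRegs(), spilling the remainder to the stack.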
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass * SPURegisterInfo::getPointerRegClass() const
+{
+ return &SPU::R32CRegClass;
+}
+
+const unsigned *
+SPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const
+{
+ // Cell ABI calling convention
+ static const unsigned SPU_CalleeSaveRegs[] = {
+ SPU::R80, SPU::R81, SPU::R82, SPU::R83,
+ SPU::R84, SPU::R85, SPU::R86, SPU::R87,
+ SPU::R88, SPU::R89, SPU::R90, SPU::R91,
+ SPU::R92, SPU::R93, SPU::R94, SPU::R95,
+ SPU::R96, SPU::R97, SPU::R98, SPU::R99,
+ SPU::R100, SPU::R101, SPU::R102, SPU::R103,
+ SPU::R104, SPU::R105, SPU::R106, SPU::R107,
+ SPU::R108, SPU::R109, SPU::R110, SPU::R111,
+ SPU::R112, SPU::R113, SPU::R114, SPU::R115,
+ SPU::R116, SPU::R117, SPU::R118, SPU::R119,
+ SPU::R120, SPU::R121, SPU::R122, SPU::R123,
+ SPU::R124, SPU::R125, SPU::R126, SPU::R127,
+ SPU::R2, /* environment pointer */
+ SPU::R1, /* stack pointer */
+ SPU::R0, /* link register */
+ 0 /* end */
+ };
+
+ return SPU_CalleeSaveRegs;
+}
+
+const TargetRegisterClass* const*
+SPURegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const
+{
+ // Cell ABI Calling Convention
+ static const TargetRegisterClass * const SPU_CalleeSaveRegClasses[] = {
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+ &SPU::GPRCRegClass, /* environment pointer */
+ &SPU::GPRCRegClass, /* stack pointer */
+ &SPU::GPRCRegClass, /* link register */
+ 0 /* end */
+ };
+
+ return SPU_CalleeSaveRegClasses;
+}
+
+/*!
+ R0 (link register), R1 (stack pointer) and R2 (environment pointer -- this is
+ generally unused) are the Cell's reserved registers
+ */
+BitVector SPURegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ Reserved.set(SPU::R0); // LR
+ Reserved.set(SPU::R1); // SP
+ Reserved.set(SPU::R2); // environment pointer
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// needsFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+static bool needsFP(const MachineFunction &MF) {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+//--------------------------------------------------------------------------
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register. This is true if the function needs a frame pointer and has
+// a non-zero stack size.
+bool
+SPURegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return MFI->getStackSize() && needsFP(MF);
+}
+
+//--------------------------------------------------------------------------
+void
+SPURegisterInfo::eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I)
+ const
+{
+ // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+ MBB.erase(I);
+}
+
+void
+SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ RegScavenger *RS) const
+{
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ MachineOperand &SPOp = MI.getOperand(i);
+ int FrameIndex = SPOp.getIndex();
+
+ // Now add the frame object offset to the offset from r1.
+ int Offset = MFI->getObjectOffset(FrameIndex);
+
+ // Most instructions, except for generated FrameIndex additions using AIr32
+ // and ILAr32, have the immediate in operand 1. AIr32 and ILAr32 have the
+ // immediate in operand 2.
+ unsigned OpNo = 1;
+ if (MI.getOpcode() == SPU::AIr32 || MI.getOpcode() == SPU::ILAr32)
+ OpNo = 2;
+
+ MachineOperand &MO = MI.getOperand(OpNo);
+
+ // Offset is biased by $lr's slot at the bottom.
+ Offset += MO.getImm() + MFI->getStackSize() + SPUFrameInfo::minStackSize();
+ assert((Offset & 0xf) == 0
+ && "16-byte alignment violated in eliminateFrameIndex");
+
+ // Replace the FrameIndex operand with the base register, $sp (aka $r1).
+ SPOp.ChangeToRegister(SPU::R1, false);
+ if (Offset > SPUFrameInfo::maxFrameOffset()
+ || Offset < SPUFrameInfo::minFrameOffset()) {
+ cerr << "Large stack adjustment ("
+ << Offset
+ << ") in SPURegisterInfo::eliminateFrameIndex.";
+ } else {
+ MO.ChangeToImmediate(Offset);
+ }
+}
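+
+// Worked example (illustrative figures): for a frame object at offset -16 with
+// a zero instruction immediate, a 48-byte frame, and minStackSize() == 32 (two
+// 16-byte slots, an assumption about SPUFrameInfo), the rewritten operand is
+// -16 + 0 + 48 + 32 = 64($sp), which is 16-byte aligned as asserted above.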
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+void
+SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const
+{
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Get the number of bytes to allocate from the FrameInfo
+ unsigned FrameSize = MFI->getStackSize();
+
+ // Get the alignments provided by the target, and the maximum alignment
+ // (if any) of the fixed frame objects.
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+ unsigned Align = std::max(TargetAlign, MFI->getMaxAlignment());
+ assert(isPowerOf2_32(Align) && "Alignment is not power of 2");
+ unsigned AlignMask = Align - 1;
+
+ // Get the maximum call frame size of all the calls.
+ unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+
+ // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+ // that allocations will be aligned.
+ if (MFI->hasVarSizedObjects())
+ maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+
+ // Update maximum call frame size.
+ MFI->setMaxCallFrameSize(maxCallFrameSize);
+
+ // Include call frame size in total.
+ FrameSize += maxCallFrameSize;
+
+ // Make sure the frame is aligned.
+ FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+
+ // Update frame info.
+ MFI->setStackSize(FrameSize);
+}
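+
+// Worked example: with TargetAlign == 16, AlignMask == 15, a 20-byte frame and
+// a 16-byte max call frame, FrameSize becomes 20 + 16 = 36, and the final
+// rounding (36 + 15) & ~15 yields 48.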
+
+void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS)
+ const {
+ // Mark LR and SP unused, since the prolog spills them to stack and
+ // we don't want anyone else to spill them for us.
+ //
+ // Also, unless R2 is really used someday, don't spill it automatically.
+ MF.getRegInfo().setPhysRegUnused(SPU::R0);
+ MF.getRegInfo().setPhysRegUnused(SPU::R1);
+ MF.getRegInfo().setPhysRegUnused(SPU::R2);
+}
+
+void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
+{
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+ // Prepare for debug frame info.
+ bool hasDebugInfo = MMI && MMI->hasDebugInfo();
+ unsigned FrameLabelId = 0;
+
+ // Move MBBI back to the beginning of the function.
+ MBBI = MBB.begin();
+
+ // Work out frame sizes.
+ determineFrameLayout(MF);
+ int FrameSize = MFI->getStackSize();
+
+ assert((FrameSize & 0xf) == 0
+ && "SPURegisterInfo::emitPrologue: FrameSize not aligned");
+
+ if (FrameSize > 0 || MFI->hasCalls()) {
+ FrameSize = -(FrameSize + SPUFrameInfo::minStackSize());
+ if (hasDebugInfo) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ FrameLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)).addImm(FrameLabelId);
+ }
+
+ // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp)
+ // for the ABI
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16)
+ .addReg(SPU::R1);
+ if (isS10Constant(FrameSize)) {
+ // Spill $sp to adjusted $sp
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize)
+ .addReg(SPU::R1);
+ // Adjust $sp by the required amount
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1)
+ .addImm(FrameSize);
+ } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) {
+ // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use
+ // $r2 to adjust $sp:
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2)
+ .addImm(-16)
+ .addReg(SPU::R1);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2)
+ .addImm(FrameSize);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1)
+ .addReg(SPU::R2)
+ .addReg(SPU::R1);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1)
+ .addReg(SPU::R1)
+ .addReg(SPU::R2);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2)
+ .addReg(SPU::R2)
+ .addImm(16);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2)
+ .addReg(SPU::R2)
+ .addReg(SPU::R1);
+ } else {
+ cerr << "Unhandled frame size: " << FrameSize << "\n";
+ abort();
+ }
+
+ if (hasDebugInfo) {
+ std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+
+ // Show update of SP.
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+ unsigned Reg = CSI[I].getReg();
+ if (Reg == SPU::R0) continue;
+ MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+ MachineLocation CSSrc(Reg);
+ Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+ }
+
+ // Mark effective beginning of when frame pointer is ready.
+ unsigned ReadyLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)).addImm(ReadyLabelId);
+
+ MachineLocation FPDst(SPU::R1);
+ MachineLocation FPSrc(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+ }
+ } else {
+ // This is a leaf function -- insert a branch hint if the basic block
+ // contains a sufficient number of instructions. Note that
+ // this is just a best guess based on the basic block's size.
+ if (MBB.size() >= (unsigned) SPUFrameInfo::branchHintPenalty()) {
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ dl = MBBI->getDebugLoc();
+
+ // Insert terminator label
+ unsigned BranchLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)).addImm(BranchLabelId);
+ }
+ }
+}
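+
+// Illustrative prologue for a 32-byte frame (assuming minStackSize() == 32,
+// so FrameSize == -64, which fits in a signed 10-bit immediate):
+//
+//   stqd  $lr, 16($sp)    // spill link register
+//   stqd  $sp, -64($sp)   // save back chain at the new frame bottom
+//   ai    $sp, $sp, -64   // adjust the stack pointer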
+
+void
+SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+{
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int FrameSize = MFI->getStackSize();
+ int LinkSlotOffset = SPUFrameInfo::stackSlotSize();
+ DebugLoc dl = MBBI->getDebugLoc();
+
+ assert(MBBI->getOpcode() == SPU::RET &&
+ "Can only insert epilog into returning blocks");
+ assert((FrameSize & 0xf) == 0
+ && "SPURegisterInfo::emitEpilogue: FrameSize not aligned");
+ if (FrameSize > 0 || MFI->hasCalls()) {
+ FrameSize = FrameSize + SPUFrameInfo::minStackSize();
+ if (isS10Constant(FrameSize + LinkSlotOffset)) {
+ // Reload $lr, adjust $sp by required amount
+ // Note: We reload $lr first to slightly improve dual issue -- not by much,
+ // but it is an opportunity.
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0)
+ .addImm(FrameSize + LinkSlotOffset)
+ .addReg(SPU::R1);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1)
+ .addReg(SPU::R1)
+ .addImm(FrameSize);
+ } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) {
+ // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use
+ // $r2 to adjust $sp:
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2)
+ .addImm(16)
+ .addReg(SPU::R1);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2)
+ .addImm(FrameSize);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1)
+ .addReg(SPU::R1)
+ .addReg(SPU::R2);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0)
+ .addImm(16)
+ .addReg(SPU::R2);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2).
+ addReg(SPU::R2)
+ .addImm(16);
+ BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2)
+ .addReg(SPU::R2)
+ .addReg(SPU::R1);
+ } else {
+ cerr << "Unhandled frame size: " << FrameSize << "\n";
+ abort();
+ }
+ }
+}
+
+unsigned
+SPURegisterInfo::getRARegister() const
+{
+ return SPU::R0;
+}
+
+unsigned
+SPURegisterInfo::getFrameRegister(MachineFunction &MF) const
+{
+ return SPU::R1;
+}
+
+void
+SPURegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const
+{
+ // Initial state of the frame pointer is R1.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(SPU::R1, 0);
+ Moves.push_back(MachineMove(0, Dst, Src));
+}
+
+
+int
+SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ // FIXME: Most probably the dwarf numbers differ between Linux and Darwin
+ return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+#include "SPUGenRegisterInfo.inc"
diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h
new file mode 100644
index 0000000..5b6e9ec
--- /dev/null
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -0,0 +1,101 @@
+//===- SPURegisterInfo.h - Cell SPU Register Information Impl ----*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Cell SPU implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_REGISTERINFO_H
+#define SPU_REGISTERINFO_H
+
+#include "SPU.h"
+#include "SPUGenRegisterInfo.h.inc"
+
+namespace llvm {
+ class SPUSubtarget;
+ class TargetInstrInfo;
+ class Type;
+
+ class SPURegisterInfo : public SPUGenRegisterInfo {
+ private:
+ const SPUSubtarget &Subtarget;
+ const TargetInstrInfo &TII;
+
+ //! Predicate: Does the machine function use the link register?
+ bool usesLR(MachineFunction &MF) const;
+
+ public:
+ SPURegisterInfo(const SPUSubtarget &subtarget, const TargetInstrInfo &tii);
+
+ //! Translate a register's enum value to a register number
+ /*!
+ This method translates a register's enum value to its register number,
+ e.g. SPU::R14 -> 14.
+ */
+ static unsigned getRegisterNumbering(unsigned RegEnum);
+
+ /// getPointerRegClass - Return the register class to use to hold pointers.
+ /// This is used for addressing modes.
+ virtual const TargetRegisterClass *getPointerRegClass() const;
+
+ //! Return the array of callee-saved registers
+ virtual const unsigned* getCalleeSavedRegs(const MachineFunction *MF) const;
+
+ //! Return the register class array of the callee-saved registers
+ virtual const TargetRegisterClass* const *
+ getCalleeSavedRegClasses(const MachineFunction *MF) const;
+
+ //! Return the reserved registers
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ //! Predicate: Target has a dedicated frame pointer
+ bool hasFP(const MachineFunction &MF) const;
+ //! Eliminate the call frame setup pseudo-instructions
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+ //! Convert frame indices into machine operands
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int,
+ RegScavenger *RS) const;
+ //! Determine the frame's layout
+ void determineFrameLayout(MachineFunction &MF) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+ //! Emit the function prologue
+ void emitPrologue(MachineFunction &MF) const;
+ //! Emit the function epilogue
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ //! Get return address register (LR, aka R0)
+ unsigned getRARegister() const;
+ //! Get the stack frame register (SP, aka R1)
+ unsigned getFrameRegister(MachineFunction &MF) const;
+ //! Perform target-specific stack frame setup.
+ void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+ //------------------------------------------------------------------------
+ // New methods added:
+ //------------------------------------------------------------------------
+
+ //! Return the array of argument passing registers
+ /*!
+ \note The size of this array is returned by getNumArgRegs().
+ */
+ static const unsigned *getArgRegs();
+
+ //! Return the size of the argument passing register array
+ static unsigned getNumArgRegs();
+
+ //! Get DWARF debugging register number
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+ };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/CellSPU/SPURegisterInfo.td b/lib/Target/CellSPU/SPURegisterInfo.td
new file mode 100644
index 0000000..bb88f2b
--- /dev/null
+++ b/lib/Target/CellSPU/SPURegisterInfo.td
@@ -0,0 +1,429 @@
+//===- SPURegisterInfo.td - The Cell SPU Register File -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+class SPUReg<string n> : Register<n> {
+ let Namespace = "SPU";
+}
+
+// The SPU's registers are all 128 bits wide, which makes specifying the
+// registers relatively easy, if relatively mundane:
+
+class SPUVecReg<bits<7> num, string n> : SPUReg<n> {
+ field bits<7> Num = num;
+}
+
+def R0 : SPUVecReg<0, "$lr">, DwarfRegNum<[0]>;
+def R1 : SPUVecReg<1, "$sp">, DwarfRegNum<[1]>;
+def R2 : SPUVecReg<2, "$2">, DwarfRegNum<[2]>;
+def R3 : SPUVecReg<3, "$3">, DwarfRegNum<[3]>;
+def R4 : SPUVecReg<4, "$4">, DwarfRegNum<[4]>;
+def R5 : SPUVecReg<5, "$5">, DwarfRegNum<[5]>;
+def R6 : SPUVecReg<6, "$6">, DwarfRegNum<[6]>;
+def R7 : SPUVecReg<7, "$7">, DwarfRegNum<[7]>;
+def R8 : SPUVecReg<8, "$8">, DwarfRegNum<[8]>;
+def R9 : SPUVecReg<9, "$9">, DwarfRegNum<[9]>;
+def R10 : SPUVecReg<10, "$10">, DwarfRegNum<[10]>;
+def R11 : SPUVecReg<11, "$11">, DwarfRegNum<[11]>;
+def R12 : SPUVecReg<12, "$12">, DwarfRegNum<[12]>;
+def R13 : SPUVecReg<13, "$13">, DwarfRegNum<[13]>;
+def R14 : SPUVecReg<14, "$14">, DwarfRegNum<[14]>;
+def R15 : SPUVecReg<15, "$15">, DwarfRegNum<[15]>;
+def R16 : SPUVecReg<16, "$16">, DwarfRegNum<[16]>;
+def R17 : SPUVecReg<17, "$17">, DwarfRegNum<[17]>;
+def R18 : SPUVecReg<18, "$18">, DwarfRegNum<[18]>;
+def R19 : SPUVecReg<19, "$19">, DwarfRegNum<[19]>;
+def R20 : SPUVecReg<20, "$20">, DwarfRegNum<[20]>;
+def R21 : SPUVecReg<21, "$21">, DwarfRegNum<[21]>;
+def R22 : SPUVecReg<22, "$22">, DwarfRegNum<[22]>;
+def R23 : SPUVecReg<23, "$23">, DwarfRegNum<[23]>;
+def R24 : SPUVecReg<24, "$24">, DwarfRegNum<[24]>;
+def R25 : SPUVecReg<25, "$25">, DwarfRegNum<[25]>;
+def R26 : SPUVecReg<26, "$26">, DwarfRegNum<[26]>;
+def R27 : SPUVecReg<27, "$27">, DwarfRegNum<[27]>;
+def R28 : SPUVecReg<28, "$28">, DwarfRegNum<[28]>;
+def R29 : SPUVecReg<29, "$29">, DwarfRegNum<[29]>;
+def R30 : SPUVecReg<30, "$30">, DwarfRegNum<[30]>;
+def R31 : SPUVecReg<31, "$31">, DwarfRegNum<[31]>;
+def R32 : SPUVecReg<32, "$32">, DwarfRegNum<[32]>;
+def R33 : SPUVecReg<33, "$33">, DwarfRegNum<[33]>;
+def R34 : SPUVecReg<34, "$34">, DwarfRegNum<[34]>;
+def R35 : SPUVecReg<35, "$35">, DwarfRegNum<[35]>;
+def R36 : SPUVecReg<36, "$36">, DwarfRegNum<[36]>;
+def R37 : SPUVecReg<37, "$37">, DwarfRegNum<[37]>;
+def R38 : SPUVecReg<38, "$38">, DwarfRegNum<[38]>;
+def R39 : SPUVecReg<39, "$39">, DwarfRegNum<[39]>;
+def R40 : SPUVecReg<40, "$40">, DwarfRegNum<[40]>;
+def R41 : SPUVecReg<41, "$41">, DwarfRegNum<[41]>;
+def R42 : SPUVecReg<42, "$42">, DwarfRegNum<[42]>;
+def R43 : SPUVecReg<43, "$43">, DwarfRegNum<[43]>;
+def R44 : SPUVecReg<44, "$44">, DwarfRegNum<[44]>;
+def R45 : SPUVecReg<45, "$45">, DwarfRegNum<[45]>;
+def R46 : SPUVecReg<46, "$46">, DwarfRegNum<[46]>;
+def R47 : SPUVecReg<47, "$47">, DwarfRegNum<[47]>;
+def R48 : SPUVecReg<48, "$48">, DwarfRegNum<[48]>;
+def R49 : SPUVecReg<49, "$49">, DwarfRegNum<[49]>;
+def R50 : SPUVecReg<50, "$50">, DwarfRegNum<[50]>;
+def R51 : SPUVecReg<51, "$51">, DwarfRegNum<[51]>;
+def R52 : SPUVecReg<52, "$52">, DwarfRegNum<[52]>;
+def R53 : SPUVecReg<53, "$53">, DwarfRegNum<[53]>;
+def R54 : SPUVecReg<54, "$54">, DwarfRegNum<[54]>;
+def R55 : SPUVecReg<55, "$55">, DwarfRegNum<[55]>;
+def R56 : SPUVecReg<56, "$56">, DwarfRegNum<[56]>;
+def R57 : SPUVecReg<57, "$57">, DwarfRegNum<[57]>;
+def R58 : SPUVecReg<58, "$58">, DwarfRegNum<[58]>;
+def R59 : SPUVecReg<59, "$59">, DwarfRegNum<[59]>;
+def R60 : SPUVecReg<60, "$60">, DwarfRegNum<[60]>;
+def R61 : SPUVecReg<61, "$61">, DwarfRegNum<[61]>;
+def R62 : SPUVecReg<62, "$62">, DwarfRegNum<[62]>;
+def R63 : SPUVecReg<63, "$63">, DwarfRegNum<[63]>;
+def R64 : SPUVecReg<64, "$64">, DwarfRegNum<[64]>;
+def R65 : SPUVecReg<65, "$65">, DwarfRegNum<[65]>;
+def R66 : SPUVecReg<66, "$66">, DwarfRegNum<[66]>;
+def R67 : SPUVecReg<67, "$67">, DwarfRegNum<[67]>;
+def R68 : SPUVecReg<68, "$68">, DwarfRegNum<[68]>;
+def R69 : SPUVecReg<69, "$69">, DwarfRegNum<[69]>;
+def R70 : SPUVecReg<70, "$70">, DwarfRegNum<[70]>;
+def R71 : SPUVecReg<71, "$71">, DwarfRegNum<[71]>;
+def R72 : SPUVecReg<72, "$72">, DwarfRegNum<[72]>;
+def R73 : SPUVecReg<73, "$73">, DwarfRegNum<[73]>;
+def R74 : SPUVecReg<74, "$74">, DwarfRegNum<[74]>;
+def R75 : SPUVecReg<75, "$75">, DwarfRegNum<[75]>;
+def R76 : SPUVecReg<76, "$76">, DwarfRegNum<[76]>;
+def R77 : SPUVecReg<77, "$77">, DwarfRegNum<[77]>;
+def R78 : SPUVecReg<78, "$78">, DwarfRegNum<[78]>;
+def R79 : SPUVecReg<79, "$79">, DwarfRegNum<[79]>;
+def R80 : SPUVecReg<80, "$80">, DwarfRegNum<[80]>;
+def R81 : SPUVecReg<81, "$81">, DwarfRegNum<[81]>;
+def R82 : SPUVecReg<82, "$82">, DwarfRegNum<[82]>;
+def R83 : SPUVecReg<83, "$83">, DwarfRegNum<[83]>;
+def R84 : SPUVecReg<84, "$84">, DwarfRegNum<[84]>;
+def R85 : SPUVecReg<85, "$85">, DwarfRegNum<[85]>;
+def R86 : SPUVecReg<86, "$86">, DwarfRegNum<[86]>;
+def R87 : SPUVecReg<87, "$87">, DwarfRegNum<[87]>;
+def R88 : SPUVecReg<88, "$88">, DwarfRegNum<[88]>;
+def R89 : SPUVecReg<89, "$89">, DwarfRegNum<[89]>;
+def R90 : SPUVecReg<90, "$90">, DwarfRegNum<[90]>;
+def R91 : SPUVecReg<91, "$91">, DwarfRegNum<[91]>;
+def R92 : SPUVecReg<92, "$92">, DwarfRegNum<[92]>;
+def R93 : SPUVecReg<93, "$93">, DwarfRegNum<[93]>;
+def R94 : SPUVecReg<94, "$94">, DwarfRegNum<[94]>;
+def R95 : SPUVecReg<95, "$95">, DwarfRegNum<[95]>;
+def R96 : SPUVecReg<96, "$96">, DwarfRegNum<[96]>;
+def R97 : SPUVecReg<97, "$97">, DwarfRegNum<[97]>;
+def R98 : SPUVecReg<98, "$98">, DwarfRegNum<[98]>;
+def R99 : SPUVecReg<99, "$99">, DwarfRegNum<[99]>;
+def R100 : SPUVecReg<100, "$100">, DwarfRegNum<[100]>;
+def R101 : SPUVecReg<101, "$101">, DwarfRegNum<[101]>;
+def R102 : SPUVecReg<102, "$102">, DwarfRegNum<[102]>;
+def R103 : SPUVecReg<103, "$103">, DwarfRegNum<[103]>;
+def R104 : SPUVecReg<104, "$104">, DwarfRegNum<[104]>;
+def R105 : SPUVecReg<105, "$105">, DwarfRegNum<[105]>;
+def R106 : SPUVecReg<106, "$106">, DwarfRegNum<[106]>;
+def R107 : SPUVecReg<107, "$107">, DwarfRegNum<[107]>;
+def R108 : SPUVecReg<108, "$108">, DwarfRegNum<[108]>;
+def R109 : SPUVecReg<109, "$109">, DwarfRegNum<[109]>;
+def R110 : SPUVecReg<110, "$110">, DwarfRegNum<[110]>;
+def R111 : SPUVecReg<111, "$111">, DwarfRegNum<[111]>;
+def R112 : SPUVecReg<112, "$112">, DwarfRegNum<[112]>;
+def R113 : SPUVecReg<113, "$113">, DwarfRegNum<[113]>;
+def R114 : SPUVecReg<114, "$114">, DwarfRegNum<[114]>;
+def R115 : SPUVecReg<115, "$115">, DwarfRegNum<[115]>;
+def R116 : SPUVecReg<116, "$116">, DwarfRegNum<[116]>;
+def R117 : SPUVecReg<117, "$117">, DwarfRegNum<[117]>;
+def R118 : SPUVecReg<118, "$118">, DwarfRegNum<[118]>;
+def R119 : SPUVecReg<119, "$119">, DwarfRegNum<[119]>;
+def R120 : SPUVecReg<120, "$120">, DwarfRegNum<[120]>;
+def R121 : SPUVecReg<121, "$121">, DwarfRegNum<[121]>;
+def R122 : SPUVecReg<122, "$122">, DwarfRegNum<[122]>;
+def R123 : SPUVecReg<123, "$123">, DwarfRegNum<[123]>;
+def R124 : SPUVecReg<124, "$124">, DwarfRegNum<[124]>;
+def R125 : SPUVecReg<125, "$125">, DwarfRegNum<[125]>;
+def R126 : SPUVecReg<126, "$126">, DwarfRegNum<[126]>;
+def R127 : SPUVecReg<127, "$127">, DwarfRegNum<[127]>;
+
+/* Need floating point status register here: */
+/* def FPCSR : ... */
+
+// The SPU's registers as 128-bit wide entities; they can function as general
+// purpose registers, where the operands are in the "preferred slot":
+def GPRC : RegisterClass<"SPU", [i128], 128,
+ [
+ /* volatile register */
+ R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+ R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+ R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+ R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+ R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+ R77, R78, R79,
+ /* non-volatile register: take hint from PPC and allocate in reverse order */
+ R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+ R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+ R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+ R86, R85, R84, R83, R82, R81, R80,
+ /* environment ptr, SP, LR */
+ R2, R1, R0 ]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GPRCClass::iterator
+ GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ GPRCClass::iterator
+ GPRCClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr)
+ }
+ }];
+}
+
+// The SPU's registers as 64-bit wide (double word integer) "preferred slot":
+def R64C : RegisterClass<"SPU", [i64], 128,
+ [
+ /* volatile register */
+ R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+ R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+ R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+ R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+ R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+ R77, R78, R79,
+ /* non-volatile register: take hint from PPC and allocate in reverse order */
+ R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+ R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+ R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+ R86, R85, R84, R83, R82, R81, R80,
+ /* environment ptr, SP, LR */
+ R2, R1, R0 ]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ R64CClass::iterator
+ R64CClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ R64CClass::iterator
+ R64CClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr)
+ }
+ }];
+}
+
+// The SPU's registers as 64-bit wide (double word) FP "preferred slot":
+def R64FP : RegisterClass<"SPU", [f64], 128,
+ [
+ /* volatile register */
+ R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+ R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+ R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+ R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+ R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+ R77, R78, R79,
+ /* non-volatile register: take hint from PPC and allocate in reverse order */
+ R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+ R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+ R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+ R86, R85, R84, R83, R82, R81, R80,
+ /* environment ptr, SP, LR */
+ R2, R1, R0 ]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ R64FPClass::iterator
+ R64FPClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ R64FPClass::iterator
+ R64FPClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr)
+ }
+ }];
+}
+
+// The SPU's registers as 32-bit wide (word) "preferred slot":
+def R32C : RegisterClass<"SPU", [i32], 128,
+ [
+ /* volatile register */
+ R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+ R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+ R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+ R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+ R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+ R77, R78, R79,
+ /* non-volatile register: take hint from PPC and allocate in reverse order */
+ R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+ R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+ R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+ R86, R85, R84, R83, R82, R81, R80,
+ /* environment ptr, SP, LR */
+ R2, R1, R0 ]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ R32CClass::iterator
+ R32CClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ R32CClass::iterator
+ R32CClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr)
+ }
+ }];
+}
+
+// The SPU's registers as single precision floating point "preferred slot":
+def R32FP : RegisterClass<"SPU", [f32], 128,
+ [
+ /* volatile register */
+ R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+ R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+ R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+ R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+ R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+ R77, R78, R79,
+ /* non-volatile register: take hint from PPC and allocate in reverse order */
+ R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+ R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+ R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+ R86, R85, R84, R83, R82, R81, R80,
+ /* environment ptr, SP, LR */
+ R2, R1, R0 ]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ R32FPClass::iterator
+ R32FPClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ R32FPClass::iterator
+ R32FPClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr)
+ }
+ }];
+}
+
+// The SPU's registers as 16-bit wide (halfword) "preferred slot":
+def R16C : RegisterClass<"SPU", [i16], 128,
+ [
+ /* volatile register */
+ R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+ R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+ R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+ R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+ R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+ R77, R78, R79,
+ /* non-volatile register: take hint from PPC and allocate in reverse order */
+ R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+ R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+ R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+ R86, R85, R84, R83, R82, R81, R80,
+ /* environment ptr, SP, LR */
+ R2, R1, R0 ]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ R16CClass::iterator
+ R16CClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ R16CClass::iterator
+ R16CClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr)
+ }
+ }];
+}
+
+// The SPU's registers as 8-bit wide (byte) "preferred slot":
+def R8C : RegisterClass<"SPU", [i8], 128,
+ [
+ /* volatile register */
+ R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+ R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+ R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+ R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+ R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+ R77, R78, R79,
+   /* non-volatile registers: take a hint from PPC and allocate in reverse order */
+ R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+ R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+ R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+ R86, R85, R84, R83, R82, R81, R80,
+ /* environment ptr, SP, LR */
+ R2, R1, R0 ]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ R8CClass::iterator
+ R8CClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ R8CClass::iterator
+ R8CClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr)
+ }
+ }];
+}
+
+// The SPU's registers as vector registers:
+def VECREG : RegisterClass<"SPU",
+ [v16i8,v8i16,v2i32,v4i32,v4f32,v2i64,v2f64],
+ 128,
+ [
+   /* volatile registers */
+ R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+ R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+ R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+ R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+ R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+ R77, R78, R79,
+   /* non-volatile registers: take a hint from PPC and allocate in reverse order */
+ R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+ R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+ R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+ R86, R85, R84, R83, R82, R81, R80,
+ /* environment ptr, SP, LR */
+ R2, R1, R0 ]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ VECREGClass::iterator
+ VECREGClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ VECREGClass::iterator
+ VECREGClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr)
+ }
+ }];
+}
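
Each register class above plays the same trick: the allocation order lists the
volatile registers first, the callee-saved registers in reverse, and the
reserved trio R2/R1/R0 last, so allocation_order_end() can simply return
end()-3 and the allocator never sees the reserved registers. A minimal
standalone C++ sketch of that effect (plain containers, not the LLVM API):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      // Abbreviated order: volatile first, callee-saved reversed, then the
      // three reserved registers (envp, sp, lr) pinned at the very end.
      std::vector<std::string> order = {"R3", "R4", "R127", "R80",
                                        "R2", "R1", "R0"};
      auto last = order.end() - 3;   // mirrors allocation_order_end()
      for (auto it = order.begin(); it != last; ++it)
        std::cout << *it << ' ';     // R2, R1, R0 are never offered
      std::cout << '\n';
    }
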
diff --git a/lib/Target/CellSPU/SPURegisterNames.h b/lib/Target/CellSPU/SPURegisterNames.h
new file mode 100644
index 0000000..6c3afdf
--- /dev/null
+++ b/lib/Target/CellSPU/SPURegisterNames.h
@@ -0,0 +1,18 @@
+//===- SPURegisterNames.h - Wrapper header for SPU register names -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_REGISTER_NAMES_H
+#define SPU_REGISTER_NAMES_H
+
+// Define symbolic names for Cell registers. This defines a mapping from
+// register name to register number.
+//
+#include "SPUGenRegisterNames.inc"
+
+#endif
diff --git a/lib/Target/CellSPU/SPUSchedule.td b/lib/Target/CellSPU/SPUSchedule.td
new file mode 100644
index 0000000..785dc46
--- /dev/null
+++ b/lib/Target/CellSPU/SPUSchedule.td
@@ -0,0 +1,57 @@
+//===- SPUSchedule.td - Cell Scheduling Definitions --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units: the SPU's even and odd execution pipelines
+
+def EVEN_UNIT : FuncUnit; // Even execution unit: (PC & 0x7 == 000)
+def ODD_UNIT : FuncUnit; // Odd execution unit: (PC & 0x7 == 100)
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for Cell SPU
+//===----------------------------------------------------------------------===//
+
+def LoadStore : InstrItinClass; // ODD_UNIT
+def BranchHints : InstrItinClass; // ODD_UNIT
+def BranchResolv : InstrItinClass; // ODD_UNIT
+def ChanOpSPR : InstrItinClass; // ODD_UNIT
+def ShuffleOp : InstrItinClass; // ODD_UNIT
+def SelectOp : InstrItinClass; // ODD_UNIT
+def GatherOp : InstrItinClass; // ODD_UNIT
+def LoadNOP : InstrItinClass; // ODD_UNIT
+def ExecNOP : InstrItinClass; // EVEN_UNIT
+def SPrecFP : InstrItinClass; // EVEN_UNIT
+def DPrecFP : InstrItinClass; // EVEN_UNIT
+def FPInt : InstrItinClass; // EVEN_UNIT (FP<->integer)
+def ByteOp : InstrItinClass; // EVEN_UNIT
+def IntegerOp : InstrItinClass; // EVEN_UNIT
+def IntegerMulDiv: InstrItinClass; // EVEN_UNIT
+def RotateShift : InstrItinClass; // EVEN_UNIT
+def ImmLoad : InstrItinClass; // EVEN_UNIT
+
+/* Note: The itinerary for the Cell SPU is somewhat contrived... */
+def SPUItineraries : ProcessorItineraries<[
+ InstrItinData<LoadStore , [InstrStage<6, [ODD_UNIT]>]>,
+ InstrItinData<BranchHints , [InstrStage<6, [ODD_UNIT]>]>,
+ InstrItinData<BranchResolv, [InstrStage<4, [ODD_UNIT]>]>,
+ InstrItinData<ChanOpSPR , [InstrStage<6, [ODD_UNIT]>]>,
+ InstrItinData<ShuffleOp , [InstrStage<4, [ODD_UNIT]>]>,
+ InstrItinData<SelectOp , [InstrStage<4, [ODD_UNIT]>]>,
+ InstrItinData<GatherOp , [InstrStage<4, [ODD_UNIT]>]>,
+ InstrItinData<LoadNOP , [InstrStage<1, [ODD_UNIT]>]>,
+ InstrItinData<ExecNOP , [InstrStage<1, [EVEN_UNIT]>]>,
+ InstrItinData<SPrecFP , [InstrStage<6, [EVEN_UNIT]>]>,
+ InstrItinData<DPrecFP , [InstrStage<13, [EVEN_UNIT]>]>,
+ InstrItinData<FPInt , [InstrStage<2, [EVEN_UNIT]>]>,
+ InstrItinData<ByteOp , [InstrStage<4, [EVEN_UNIT]>]>,
+ InstrItinData<IntegerOp , [InstrStage<2, [EVEN_UNIT]>]>,
+ InstrItinData<RotateShift , [InstrStage<4, [EVEN_UNIT]>]>,
+ InstrItinData<IntegerMulDiv,[InstrStage<7, [EVEN_UNIT]>]>,
+ InstrItinData<ImmLoad , [InstrStage<2, [EVEN_UNIT]>]>
+ ]>;
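
Every itinerary class above reduces to one (pipeline, cycle-count) pair. A
short C++ sketch of a latency lookup built from two entries transcribed from
SPUItineraries (illustration only, not the tblgen-generated tables):

    #include <cstdio>

    enum Unit { EVEN_UNIT, ODD_UNIT };
    struct Itin { Unit unit; unsigned cycles; };

    int main() {
      // Values copied from the InstrItinData entries above.
      const Itin LoadStore = { ODD_UNIT,  6 };
      const Itin DPrecFP   = { EVEN_UNIT, 13 };
      std::printf("LoadStore: %u cycles on the %s pipe\n", LoadStore.cycles,
                  LoadStore.unit == ODD_UNIT ? "odd" : "even");
      std::printf("DPrecFP:   %u cycles on the %s pipe\n", DPrecFP.cycles,
                  DPrecFP.unit == ODD_UNIT ? "odd" : "even");
    }
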
diff --git a/lib/Target/CellSPU/SPUSubtarget.cpp b/lib/Target/CellSPU/SPUSubtarget.cpp
new file mode 100644
index 0000000..0a1c2f7
--- /dev/null
+++ b/lib/Target/CellSPU/SPUSubtarget.cpp
@@ -0,0 +1,40 @@
+//===- SPUSubtarget.cpp - STI Cell SPU Subtarget Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CellSPU-specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPUSubtarget.h"
+#include "SPU.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "SPUGenSubtarget.inc"
+
+using namespace llvm;
+
+SPUSubtarget::SPUSubtarget(const TargetMachine &tm, const Module &M,
+ const std::string &FS) :
+ TM(tm),
+ StackAlignment(16),
+ ProcDirective(SPU::DEFAULT_PROC),
+ UseLargeMem(false)
+{
+  // This should be the target SPU processor type. For now, since there's
+  // only one, simply use the "v0" default:
+ std::string default_cpu("v0");
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, default_cpu);
+}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void SPUSubtarget::SetJITMode() {
+}
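
The FS string forwarded to ParseSubtargetFeatures is a comma-separated list of
"+feature"/"-feature" toggles. A sketch of its shape (the flag name below is
hypothetical, not a documented CellSPU feature; tblgen defines the real names):

    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
      std::string FS = "+large-mem,-dummy";  // hypothetical feature flags
      std::istringstream ss(FS);
      std::string f;
      while (std::getline(ss, f, ','))
        std::cout << (f[0] == '+' ? "enable " : "disable ")
                  << f.substr(1) << '\n';
    }
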
diff --git a/lib/Target/CellSPU/SPUSubtarget.h b/lib/Target/CellSPU/SPUSubtarget.h
new file mode 100644
index 0000000..b6a3409
--- /dev/null
+++ b/lib/Target/CellSPU/SPUSubtarget.h
@@ -0,0 +1,95 @@
+//===-- SPUSubtarget.h - Define Subtarget for the Cell SPU ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Cell SPU-specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CELLSUBTARGET_H
+#define CELLSUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+ class Module;
+ class GlobalValue;
+ class TargetMachine;
+
+ namespace SPU {
+ enum {
+ PROC_NONE,
+ DEFAULT_PROC
+ };
+ }
+
+ class SPUSubtarget : public TargetSubtarget {
+ protected:
+ const TargetMachine &TM;
+
+    /// StackAlignment - The minimum alignment known to hold for the stack
+    /// frame on entry to the function, and which must be maintained by every
+    /// function.
+ unsigned StackAlignment;
+
+    /// Selected instruction itineraries (one entry per itinerary class).
+ InstrItineraryData InstrItins;
+
+ /// Which SPU processor (this isn't really used, but it's there to keep
+ /// the C compiler happy)
+ unsigned ProcDirective;
+
+    /// Use (assume) large memory -- effectively disables the LQA/STQA
+    /// instructions that assume a 256K local store.
+ bool UseLargeMem;
+
+ public:
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ ///
+ SPUSubtarget(const TargetMachine &TM, const Module &M,
+ const std::string &FS);
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+ /// SetJITMode - This is called to inform the subtarget info that we are
+ /// producing code for the JIT.
+ void SetJITMode();
+
+    /// getStackAlignment - Returns the minimum alignment known to hold for
+    /// the stack frame on entry to the function, and which must be maintained
+    /// by every function for this subtarget.
+ unsigned getStackAlignment() const { return StackAlignment; }
+
+    /// getInstrItineraryData - Return the instruction itineraries based on
+    /// subtarget selection.
+ const InstrItineraryData &getInstrItineraryData() const {
+ return InstrItins;
+ }
+
+    /// usingLargeMem - Returns true if large memory addressing is assumed.
+ bool usingLargeMem() const {
+ return UseLargeMem;
+ }
+
+ /// getTargetDataString - Return the pointer size and type alignment
+ /// properties of this subtarget.
+ const char *getTargetDataString() const {
+ return "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128"
+ "-i16:16:128-i8:8:128-i1:8:128-a:0:128-v64:128:128-v128:128:128"
+ "-s:128:128";
+ }
+ };
+} // End llvm namespace
+
+#endif
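
The layout string above is dense but regular: "E" declares big-endian, and
each "<type>:<size>:<abi>:<preferred>" component gives sizes and alignments in
bits; the 128-bit preferred alignment throughout reflects the SPU's 16-byte
quadword orientation. A sketch that merely splits out the components
(illustration only; LLVM's TargetData does the real parsing):

    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
      // Leading components of the CellSPU layout string above.
      std::string layout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128";
      std::istringstream ss(layout);
      std::string spec;
      while (std::getline(ss, spec, '-'))
        std::cout << spec << '\n';   // "E", then size:abi:preferred entries
    }
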
diff --git a/lib/Target/CellSPU/SPUTargetAsmInfo.cpp b/lib/Target/CellSPU/SPUTargetAsmInfo.cpp
new file mode 100644
index 0000000..ff88ed8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUTargetAsmInfo.cpp
@@ -0,0 +1,74 @@
+//===-- SPUTargetAsmInfo.cpp - Cell SPU asm properties ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SPUTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPUTargetAsmInfo.h"
+#include "SPUTargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+
+using namespace llvm;
+using namespace llvm::dwarf;
+
+SPULinuxTargetAsmInfo::SPULinuxTargetAsmInfo(const SPUTargetMachine &TM) :
+ SPUTargetAsmInfo<ELFTargetAsmInfo>(TM) {
+ PCSymbol = ".";
+ CommentString = "#";
+ GlobalPrefix = "";
+ PrivateGlobalPrefix = ".L";
+ // This corresponds to what the gcc SPU compiler emits, for consistency.
+ CStringSection = ".rodata.str";
+
+ // Has leb128, .loc and .file
+ HasLEB128 = true;
+ HasDotLocAndDotFile = true;
+
+ // BSS section needs to be emitted as ".section"
+ BSSSection = "\t.section\t.bss";
+ BSSSection_ = getUnnamedSection("\t.section\t.bss",
+ SectionFlags::Writeable | SectionFlags::BSS,
+ true);
+
+ SupportsDebugInformation = true;
+ NeedsSet = true;
+ SupportsMacInfoSection = false;
+ DwarfAbbrevSection = "\t.section .debug_abbrev,\"\",@progbits";
+ DwarfInfoSection = "\t.section .debug_info,\"\",@progbits";
+ DwarfLineSection = "\t.section .debug_line,\"\",@progbits";
+ DwarfFrameSection = "\t.section .debug_frame,\"\",@progbits";
+ DwarfPubNamesSection = "\t.section .debug_pubnames,\"\",@progbits";
+  DwarfPubTypesSection = "\t.section .debug_pubtypes,\"\",@progbits";
+ DwarfStrSection = "\t.section .debug_str,\"MS\",@progbits,1";
+ DwarfLocSection = "\t.section .debug_loc,\"\",@progbits";
+ DwarfARangesSection = "\t.section .debug_aranges,\"\",@progbits";
+ DwarfRangesSection = "\t.section .debug_ranges,\"\",@progbits";
+  DwarfMacInfoSection = "\t.section .debug_macinfo,\"\",@progbits";
+
+ // Exception handling is not supported on CellSPU (think about it: you only
+ // have 256K for code+data. Would you support exception handling?)
+ SupportsExceptionHandling = false;
+}
+
+/// PreferredEHDataFormat - This hook allows the target to select data
+/// format used for encoding pointers in exception handling data. Reason is
+/// 0 for data, 1 for code labels, 2 for function pointers. Global is true
+/// if the symbol can be relocated.
+unsigned
+SPULinuxTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ // We really need to write something here.
+ return TargetAsmInfo::PreferredEHDataFormat(Reason, Global);
+}
+
+// Instantiate default implementation.
+TEMPLATE_INSTANTIATION(class SPUTargetAsmInfo<TargetAsmInfo>);
diff --git a/lib/Target/CellSPU/SPUTargetAsmInfo.h b/lib/Target/CellSPU/SPUTargetAsmInfo.h
new file mode 100644
index 0000000..d10a565
--- /dev/null
+++ b/lib/Target/CellSPU/SPUTargetAsmInfo.h
@@ -0,0 +1,51 @@
+//===-- SPUTargetAsmInfo.h - Cell SPU asm properties -----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SPUTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPUTARGETASMINFO_H
+#define SPUTARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+#include "SPUTargetMachine.h"
+#include "SPUSubtarget.h"
+
+namespace llvm {
+
+ // Forward declaration.
+ class SPUTargetMachine;
+
+ template <class BaseTAI>
+ struct SPUTargetAsmInfo : public BaseTAI {
+ explicit SPUTargetAsmInfo(const SPUTargetMachine &TM):
+ BaseTAI(TM) {
+ /* (unused today)
+ * const SPUSubtarget *Subtarget = &TM.getSubtarget<SPUSubtarget>(); */
+
+ BaseTAI::ZeroDirective = "\t.space\t";
+ BaseTAI::SetDirective = "\t.set";
+ BaseTAI::Data64bitsDirective = "\t.quad\t";
+ BaseTAI::AlignmentIsInBytes = false;
+ BaseTAI::LCOMMDirective = "\t.lcomm\t";
+ BaseTAI::InlineAsmStart = "# InlineAsm Start";
+ BaseTAI::InlineAsmEnd = "# InlineAsm End";
+ }
+ };
+
+ struct SPULinuxTargetAsmInfo : public SPUTargetAsmInfo<ELFTargetAsmInfo> {
+ explicit SPULinuxTargetAsmInfo(const SPUTargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ };
+} // namespace llvm
+
+#endif /* SPUTARGETASMINFO_H */
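
SPUTargetAsmInfo is a mixin: templating over the base class lets one set of
directive overrides be layered onto either TargetAsmInfo or ELFTargetAsmInfo.
A reduced C++ sketch of the pattern with stand-in types (not the LLVM classes):

    #include <iostream>
    #include <string>

    struct BaseInfo           { std::string ZeroDirective = ".zero"; };
    struct ELFInfo : BaseInfo { /* ELF-only fields would live here */ };

    template <class Base>
    struct CommonInfo : Base {
      CommonInfo() { Base::ZeroDirective = "\t.space\t"; }  // shared override
    };

    struct LinuxInfo : CommonInfo<ELFInfo> { /* platform tweaks go here */ };

    int main() {
      LinuxInfo TAI;
      std::cout << TAI.ZeroDirective << '\n';  // prints the shared override
    }
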
diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp
new file mode 100644
index 0000000..7fa9022
--- /dev/null
+++ b/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -0,0 +1,98 @@
+//===-- SPUTargetMachine.cpp - Define TargetMachine for Cell SPU ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the Cell SPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPURegisterNames.h"
+#include "SPUTargetAsmInfo.h"
+#include "SPUTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+
+using namespace llvm;
+
+/// CellSPUTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int CellSPUTargetMachineModule;
+int CellSPUTargetMachineModule = 0;
+
+namespace {
+ // Register the targets
+ RegisterTarget<SPUTargetMachine>
+ CELLSPU("cellspu", "STI CBEA Cell SPU [experimental]");
+}
+
+const std::pair<unsigned, int> *
+SPUFrameInfo::getCalleeSaveSpillSlots(unsigned &NumEntries) const {
+ NumEntries = 1;
+ return &LR[0];
+}
+
+const TargetAsmInfo *
+SPUTargetMachine::createTargetAsmInfo() const
+{
+ return new SPULinuxTargetAsmInfo(*this);
+}
+
+unsigned
+SPUTargetMachine::getModuleMatchQuality(const Module &M)
+{
+  // We strongly match "spu", "cellspu", "spu-*", or "cellspu-*".
+ std::string TT = M.getTargetTriple();
+ if ((TT.size() == 3 && std::string(TT.begin(), TT.begin()+3) == "spu")
+ || (TT.size() == 7 && std::string(TT.begin(), TT.begin()+7) == "cellspu")
+ || (TT.size() >= 4 && std::string(TT.begin(), TT.begin()+4) == "spu-")
+ || (TT.size() >= 8 && std::string(TT.begin(), TT.begin()+8) == "cellspu-"))
+ return 20;
+
+ return 0; // No match at all...
+}
+
+SPUTargetMachine::SPUTargetMachine(const Module &M, const std::string &FS)
+ : Subtarget(*this, M, FS),
+ DataLayout(Subtarget.getTargetDataString()),
+ InstrInfo(*this),
+ FrameInfo(*this),
+ TLInfo(*this),
+ InstrItins(Subtarget.getInstrItineraryData())
+{
+ // For the time being, use static relocations, since there's really no
+ // support for PIC yet.
+ setRelocationModel(Reloc::Static);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool
+SPUTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel)
+{
+ // Install an instruction selector.
+ PM.add(createSPUISelDag(*this));
+ return false;
+}
+
+bool SPUTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ PM.add(createSPUAsmPrinterPass(Out, *this, OptLevel, Verbose));
+ return false;
+}
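
getModuleMatchQuality boils down to a prefix test on the module's target
triple; the registry then picks the registered target with the highest score.
A standalone restatement of the predicate (illustration, not the registry
mechanism itself):

    #include <cassert>
    #include <string>

    static unsigned matchQuality(const std::string &TT) {
      if (TT == "spu" || TT == "cellspu" ||
          TT.compare(0, 4, "spu-") == 0 || TT.compare(0, 8, "cellspu-") == 0)
        return 20;  // strong match, as in SPUTargetMachine above
      return 0;     // no match at all
    }

    int main() {
      assert(matchQuality("cellspu-unknown-elf") == 20);
      assert(matchQuality("spu") == 20);
      assert(matchQuality("x86_64-unknown-linux-gnu") == 0);
    }
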
diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h
new file mode 100644
index 0000000..cd39203
--- /dev/null
+++ b/lib/Target/CellSPU/SPUTargetMachine.h
@@ -0,0 +1,95 @@
+//===-- SPUTargetMachine.h - Define TargetMachine for Cell SPU ----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CellSPU-specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_TARGETMACHINE_H
+#define SPU_TARGETMACHINE_H
+
+#include "SPUSubtarget.h"
+#include "SPUInstrInfo.h"
+#include "SPUISelLowering.h"
+#include "SPUFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+class PassManager;
+class GlobalValue;
+class TargetFrameInfo;
+
+/// SPUTargetMachine
+///
+class SPUTargetMachine : public LLVMTargetMachine {
+ SPUSubtarget Subtarget;
+ const TargetData DataLayout;
+ SPUInstrInfo InstrInfo;
+ SPUFrameInfo FrameInfo;
+ SPUTargetLowering TLInfo;
+ InstrItineraryData InstrItins;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+ SPUTargetMachine(const Module &M, const std::string &FS);
+
+ /// Return the subtarget implementation object
+ virtual const SPUSubtarget *getSubtargetImpl() const {
+ return &Subtarget;
+ }
+ virtual const SPUInstrInfo *getInstrInfo() const {
+ return &InstrInfo;
+ }
+ virtual const SPUFrameInfo *getFrameInfo() const {
+ return &FrameInfo;
+ }
+ /*!
+ \note Cell SPU does not support JIT today. It could support JIT at some
+ point.
+ */
+ virtual TargetJITInfo *getJITInfo() {
+ return NULL;
+ }
+
+ //! Module match function
+ /*!
+ Module matching function called by TargetMachineRegistry().
+ */
+ static unsigned getModuleMatchQuality(const Module &M);
+
+ virtual SPUTargetLowering *getTargetLowering() const {
+ return const_cast<SPUTargetLowering*>(&TLInfo);
+ }
+
+ virtual const SPURegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ virtual const TargetData *getTargetData() const {
+ return &DataLayout;
+ }
+
+ virtual const InstrItineraryData getInstrItineraryData() const {
+ return InstrItins;
+ }
+
+ // Pass Pipeline Configuration
+ virtual bool addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/CppBackend/CMakeLists.txt b/lib/Target/CppBackend/CMakeLists.txt
new file mode 100644
index 0000000..f8182b8
--- /dev/null
+++ b/lib/Target/CppBackend/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_target(CppBackend
+ CPPBackend.cpp
+ )
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
new file mode 100644
index 0000000..4082989
--- /dev/null
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -0,0 +1,2007 @@
+//===-- CPPBackend.cpp - Library for converting LLVM code to C++ code -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the writing of the LLVM IR as a set of C++ calls to the
+// LLVM IR interface. The input module is assumed to be verified.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CPPTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Config/config.h"
+#include <algorithm>
+#include <set>
+
+using namespace llvm;
+
+static cl::opt<std::string>
+FuncName("cppfname", cl::desc("Specify the name of the generated function"),
+ cl::value_desc("function name"));
+
+enum WhatToGenerate {
+ GenProgram,
+ GenModule,
+ GenContents,
+ GenFunction,
+ GenFunctions,
+ GenInline,
+ GenVariable,
+ GenType
+};
+
+static cl::opt<WhatToGenerate> GenerationType("cppgen", cl::Optional,
+ cl::desc("Choose what kind of output to generate"),
+ cl::init(GenProgram),
+ cl::values(
+ clEnumValN(GenProgram, "program", "Generate a complete program"),
+ clEnumValN(GenModule, "module", "Generate a module definition"),
+ clEnumValN(GenContents, "contents", "Generate contents of a module"),
+ clEnumValN(GenFunction, "function", "Generate a function definition"),
+ clEnumValN(GenFunctions,"functions", "Generate all function definitions"),
+ clEnumValN(GenInline, "inline", "Generate an inline function"),
+ clEnumValN(GenVariable, "variable", "Generate a variable definition"),
+ clEnumValN(GenType, "type", "Generate a type definition"),
+ clEnumValEnd
+ )
+);
+
+static cl::opt<std::string> NameToGenerate("cppfor", cl::Optional,
+ cl::desc("Specify the name of the thing to generate"),
+ cl::init("!bad!"));
+
+/// CppBackendTargetMachineModule - Note that this is used on hosts
+/// that cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int CppBackendTargetMachineModule;
+int CppBackendTargetMachineModule = 0;
+
+// Register the target.
+static RegisterTarget<CPPTargetMachine> X("cpp", "C++ backend");
+
+namespace {
+ typedef std::vector<const Type*> TypeList;
+ typedef std::map<const Type*,std::string> TypeMap;
+ typedef std::map<const Value*,std::string> ValueMap;
+ typedef std::set<std::string> NameSet;
+ typedef std::set<const Type*> TypeSet;
+ typedef std::set<const Value*> ValueSet;
+ typedef std::map<const Value*,std::string> ForwardRefMap;
+
+ /// CppWriter - This class is the main chunk of code that converts an LLVM
+ /// module to a C++ translation unit.
+ class CppWriter : public ModulePass {
+ raw_ostream &Out;
+ const Module *TheModule;
+ uint64_t uniqueNum;
+ TypeMap TypeNames;
+ ValueMap ValueNames;
+ TypeMap UnresolvedTypes;
+ TypeList TypeStack;
+ NameSet UsedNames;
+ TypeSet DefinedTypes;
+ ValueSet DefinedValues;
+ ForwardRefMap ForwardRefs;
+ bool is_inline;
+
+ public:
+ static char ID;
+ explicit CppWriter(raw_ostream &o) :
+ ModulePass(&ID), Out(o), uniqueNum(0), is_inline(false) {}
+
+ virtual const char *getPassName() const { return "C++ backend"; }
+
+ bool runOnModule(Module &M);
+
+ void printProgram(const std::string& fname, const std::string& modName );
+ void printModule(const std::string& fname, const std::string& modName );
+ void printContents(const std::string& fname, const std::string& modName );
+ void printFunction(const std::string& fname, const std::string& funcName );
+ void printFunctions();
+ void printInline(const std::string& fname, const std::string& funcName );
+ void printVariable(const std::string& fname, const std::string& varName );
+ void printType(const std::string& fname, const std::string& typeName );
+
+ void error(const std::string& msg);
+
+ private:
+ void printLinkageType(GlobalValue::LinkageTypes LT);
+ void printVisibilityType(GlobalValue::VisibilityTypes VisTypes);
+ void printCallingConv(unsigned cc);
+ void printEscapedString(const std::string& str);
+ void printCFP(const ConstantFP* CFP);
+
+ std::string getCppName(const Type* val);
+ inline void printCppName(const Type* val);
+
+ std::string getCppName(const Value* val);
+ inline void printCppName(const Value* val);
+
+ void printAttributes(const AttrListPtr &PAL, const std::string &name);
+ bool printTypeInternal(const Type* Ty);
+ inline void printType(const Type* Ty);
+ void printTypes(const Module* M);
+
+ void printConstant(const Constant *CPV);
+ void printConstants(const Module* M);
+
+ void printVariableUses(const GlobalVariable *GV);
+ void printVariableHead(const GlobalVariable *GV);
+ void printVariableBody(const GlobalVariable *GV);
+
+ void printFunctionUses(const Function *F);
+ void printFunctionHead(const Function *F);
+ void printFunctionBody(const Function *F);
+ void printInstruction(const Instruction *I, const std::string& bbname);
+ std::string getOpName(Value*);
+
+ void printModuleBody();
+ };
+
+ static unsigned indent_level = 0;
+ inline raw_ostream& nl(raw_ostream& Out, int delta = 0) {
+ Out << "\n";
+ if (delta >= 0 || indent_level >= unsigned(-delta))
+ indent_level += delta;
+ for (unsigned i = 0; i < indent_level; ++i)
+ Out << " ";
+ return Out;
+ }
+
+ inline void in() { indent_level++; }
+ inline void out() { if (indent_level >0) indent_level--; }
+
+ inline void
+ sanitize(std::string& str) {
+ for (size_t i = 0; i < str.length(); ++i)
+ if (!isalnum(str[i]) && str[i] != '_')
+ str[i] = '_';
+ }
+
+ inline std::string
+ getTypePrefix(const Type* Ty ) {
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: return "void_";
+ case Type::IntegerTyID:
+ return std::string("int") + utostr(cast<IntegerType>(Ty)->getBitWidth()) +
+ "_";
+ case Type::FloatTyID: return "float_";
+ case Type::DoubleTyID: return "double_";
+ case Type::LabelTyID: return "label_";
+ case Type::FunctionTyID: return "func_";
+ case Type::StructTyID: return "struct_";
+ case Type::ArrayTyID: return "array_";
+ case Type::PointerTyID: return "ptr_";
+ case Type::VectorTyID: return "packed_";
+ case Type::OpaqueTyID: return "opaque_";
+ default: return "other_";
+ }
+ return "unknown_";
+ }
+
+ // Looks up the type in the symbol table and returns a pointer to its name or
+ // a null pointer if it wasn't found. Note that this isn't the same as the
+  // Module::getTypeName function, which will return an empty string, not a
+  // null pointer, if the name is not found.
+ inline const std::string*
+ findTypeName(const TypeSymbolTable& ST, const Type* Ty) {
+ TypeSymbolTable::const_iterator TI = ST.begin();
+ TypeSymbolTable::const_iterator TE = ST.end();
+ for (;TI != TE; ++TI)
+ if (TI->second == Ty)
+ return &(TI->first);
+ return 0;
+ }
+
+ void CppWriter::error(const std::string& msg) {
+ cerr << msg << "\n";
+ exit(2);
+ }
+
+ // printCFP - Print a floating point constant .. very carefully :)
+  // This makes sure that conversion to/from the textual form yields the same
+  // binary result, so that we don't lose precision.
+ void CppWriter::printCFP(const ConstantFP *CFP) {
+ bool ignored;
+ APFloat APF = APFloat(CFP->getValueAPF()); // copy
+ if (CFP->getType() == Type::FloatTy)
+ APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
+ Out << "ConstantFP::get(";
+ Out << "APFloat(";
+#if HAVE_PRINTF_A
+ char Buffer[100];
+ sprintf(Buffer, "%A", APF.convertToDouble());
+ if ((!strncmp(Buffer, "0x", 2) ||
+ !strncmp(Buffer, "-0x", 3) ||
+ !strncmp(Buffer, "+0x", 3)) &&
+ APF.bitwiseIsEqual(APFloat(atof(Buffer)))) {
+ if (CFP->getType() == Type::DoubleTy)
+ Out << "BitsToDouble(" << Buffer << ")";
+ else
+ Out << "BitsToFloat((float)" << Buffer << ")";
+ Out << ")";
+ } else {
+#endif
+ std::string StrVal = ftostr(CFP->getValueAPF());
+
+ while (StrVal[0] == ' ')
+ StrVal.erase(StrVal.begin());
+
+ // Check to make sure that the stringized number is not some string like
+      // "Inf" or "NaN". Check that the string matches the "[-+]?[0-9]" regex.
+ if (((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+ ((StrVal[0] == '-' || StrVal[0] == '+') &&
+ (StrVal[1] >= '0' && StrVal[1] <= '9'))) &&
+ (CFP->isExactlyValue(atof(StrVal.c_str())))) {
+ if (CFP->getType() == Type::DoubleTy)
+ Out << StrVal;
+ else
+ Out << StrVal << "f";
+ } else if (CFP->getType() == Type::DoubleTy)
+ Out << "BitsToDouble(0x"
+ << utohexstr(CFP->getValueAPF().bitcastToAPInt().getZExtValue())
+ << "ULL) /* " << StrVal << " */";
+ else
+ Out << "BitsToFloat(0x"
+ << utohexstr((uint32_t)CFP->getValueAPF().
+ bitcastToAPInt().getZExtValue())
+ << "U) /* " << StrVal << " */";
+ Out << ")";
+#if HAVE_PRINTF_A
+ }
+#endif
+ Out << ")";
+ }
+
+ void CppWriter::printCallingConv(unsigned cc){
+ // Print the calling convention.
+ switch (cc) {
+ case CallingConv::C: Out << "CallingConv::C"; break;
+ case CallingConv::Fast: Out << "CallingConv::Fast"; break;
+ case CallingConv::Cold: Out << "CallingConv::Cold"; break;
+ case CallingConv::FirstTargetCC: Out << "CallingConv::FirstTargetCC"; break;
+ default: Out << cc; break;
+ }
+ }
+
+ void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
+ switch (LT) {
+ case GlobalValue::InternalLinkage:
+ Out << "GlobalValue::InternalLinkage"; break;
+ case GlobalValue::PrivateLinkage:
+ Out << "GlobalValue::PrivateLinkage"; break;
+ case GlobalValue::AvailableExternallyLinkage:
+ Out << "GlobalValue::AvailableExternallyLinkage "; break;
+ case GlobalValue::LinkOnceAnyLinkage:
+ Out << "GlobalValue::LinkOnceAnyLinkage "; break;
+ case GlobalValue::LinkOnceODRLinkage:
+ Out << "GlobalValue::LinkOnceODRLinkage "; break;
+ case GlobalValue::WeakAnyLinkage:
+ Out << "GlobalValue::WeakAnyLinkage"; break;
+ case GlobalValue::WeakODRLinkage:
+ Out << "GlobalValue::WeakODRLinkage"; break;
+ case GlobalValue::AppendingLinkage:
+ Out << "GlobalValue::AppendingLinkage"; break;
+ case GlobalValue::ExternalLinkage:
+ Out << "GlobalValue::ExternalLinkage"; break;
+ case GlobalValue::DLLImportLinkage:
+ Out << "GlobalValue::DLLImportLinkage"; break;
+ case GlobalValue::DLLExportLinkage:
+ Out << "GlobalValue::DLLExportLinkage"; break;
+ case GlobalValue::ExternalWeakLinkage:
+ Out << "GlobalValue::ExternalWeakLinkage"; break;
+ case GlobalValue::GhostLinkage:
+ Out << "GlobalValue::GhostLinkage"; break;
+ case GlobalValue::CommonLinkage:
+ Out << "GlobalValue::CommonLinkage"; break;
+ }
+ }
+
+ void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) {
+ switch (VisType) {
+ default: assert(0 && "Unknown GVar visibility");
+ case GlobalValue::DefaultVisibility:
+ Out << "GlobalValue::DefaultVisibility";
+ break;
+ case GlobalValue::HiddenVisibility:
+ Out << "GlobalValue::HiddenVisibility";
+ break;
+ case GlobalValue::ProtectedVisibility:
+ Out << "GlobalValue::ProtectedVisibility";
+ break;
+ }
+ }
+
+ // printEscapedString - Print each character of the specified string, escaping
+ // it if it is not printable or if it is an escape char.
+ void CppWriter::printEscapedString(const std::string &Str) {
+ for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+ unsigned char C = Str[i];
+ if (isprint(C) && C != '"' && C != '\\') {
+ Out << C;
+ } else {
+ Out << "\\x"
+ << (char) ((C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A'))
+ << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
+ }
+ }
+ }
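+  // For example (illustrative): the input "hi\n" is emitted as hi\x0A, since
+  // '\n' is not printable and is escaped as two uppercase hex digits.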
+
+ std::string CppWriter::getCppName(const Type* Ty) {
+ // First, handle the primitive types .. easy
+ if (Ty->isPrimitiveType() || Ty->isInteger()) {
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: return "Type::VoidTy";
+ case Type::IntegerTyID: {
+ unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+ return "IntegerType::get(" + utostr(BitWidth) + ")";
+ }
+ case Type::X86_FP80TyID: return "Type::X86_FP80Ty";
+ case Type::FloatTyID: return "Type::FloatTy";
+ case Type::DoubleTyID: return "Type::DoubleTy";
+ case Type::LabelTyID: return "Type::LabelTy";
+ default:
+ error("Invalid primitive type");
+ break;
+ }
+ return "Type::VoidTy"; // shouldn't be returned, but make it sensible
+ }
+
+ // Now, see if we've seen the type before and return that
+ TypeMap::iterator I = TypeNames.find(Ty);
+ if (I != TypeNames.end())
+ return I->second;
+
+ // Okay, let's build a new name for this type. Start with a prefix
+ const char* prefix = 0;
+ switch (Ty->getTypeID()) {
+ case Type::FunctionTyID: prefix = "FuncTy_"; break;
+ case Type::StructTyID: prefix = "StructTy_"; break;
+ case Type::ArrayTyID: prefix = "ArrayTy_"; break;
+ case Type::PointerTyID: prefix = "PointerTy_"; break;
+ case Type::OpaqueTyID: prefix = "OpaqueTy_"; break;
+ case Type::VectorTyID: prefix = "VectorTy_"; break;
+ default: prefix = "OtherTy_"; break; // prevent breakage
+ }
+
+    // See if the type has a name in the symbol table and build accordingly.
+ const std::string* tName = findTypeName(TheModule->getTypeSymbolTable(), Ty);
+ std::string name;
+ if (tName)
+ name = std::string(prefix) + *tName;
+ else
+ name = std::string(prefix) + utostr(uniqueNum++);
+ sanitize(name);
+
+ // Save the name
+ return TypeNames[Ty] = name;
+ }
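+  // Illustratively: a struct type registered as "foo" in the module's type
+  // symbol table becomes "StructTy_foo", while an unnamed pointer type gets
+  // "PointerTy_0", "PointerTy_1", ... as uniqueNum advances.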
+
+ void CppWriter::printCppName(const Type* Ty) {
+ printEscapedString(getCppName(Ty));
+ }
+
+ std::string CppWriter::getCppName(const Value* val) {
+ std::string name;
+ ValueMap::iterator I = ValueNames.find(val);
+ if (I != ValueNames.end() && I->first == val)
+ return I->second;
+
+ if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(val)) {
+ name = std::string("gvar_") +
+ getTypePrefix(GV->getType()->getElementType());
+ } else if (isa<Function>(val)) {
+ name = std::string("func_");
+ } else if (const Constant* C = dyn_cast<Constant>(val)) {
+ name = std::string("const_") + getTypePrefix(C->getType());
+ } else if (const Argument* Arg = dyn_cast<Argument>(val)) {
+ if (is_inline) {
+ unsigned argNum = std::distance(Arg->getParent()->arg_begin(),
+ Function::const_arg_iterator(Arg)) + 1;
+ name = std::string("arg_") + utostr(argNum);
+ NameSet::iterator NI = UsedNames.find(name);
+ if (NI != UsedNames.end())
+ name += std::string("_") + utostr(uniqueNum++);
+ UsedNames.insert(name);
+ return ValueNames[val] = name;
+ } else {
+ name = getTypePrefix(val->getType());
+ }
+ } else {
+ name = getTypePrefix(val->getType());
+ }
+ name += (val->hasName() ? val->getName() : utostr(uniqueNum++));
+ sanitize(name);
+ NameSet::iterator NI = UsedNames.find(name);
+ if (NI != UsedNames.end())
+ name += std::string("_") + utostr(uniqueNum++);
+ UsedNames.insert(name);
+ return ValueNames[val] = name;
+ }
+
+ void CppWriter::printCppName(const Value* val) {
+ printEscapedString(getCppName(val));
+ }
+
+ void CppWriter::printAttributes(const AttrListPtr &PAL,
+ const std::string &name) {
+ Out << "AttrListPtr " << name << "_PAL;";
+ nl(Out);
+ if (!PAL.isEmpty()) {
+ Out << '{'; in(); nl(Out);
+ Out << "SmallVector<AttributeWithIndex, 4> Attrs;"; nl(Out);
+ Out << "AttributeWithIndex PAWI;"; nl(Out);
+ for (unsigned i = 0; i < PAL.getNumSlots(); ++i) {
+ unsigned index = PAL.getSlot(i).Index;
+ Attributes attrs = PAL.getSlot(i).Attrs;
+ Out << "PAWI.Index = " << index << "U; PAWI.Attrs = 0 ";
+#define HANDLE_ATTR(X) \
+ if (attrs & Attribute::X) \
+ Out << " | Attribute::" #X; \
+ attrs &= ~Attribute::X;
+
+ HANDLE_ATTR(SExt);
+ HANDLE_ATTR(ZExt);
+ HANDLE_ATTR(NoReturn);
+ HANDLE_ATTR(InReg);
+ HANDLE_ATTR(StructRet);
+ HANDLE_ATTR(NoUnwind);
+ HANDLE_ATTR(NoAlias);
+ HANDLE_ATTR(ByVal);
+ HANDLE_ATTR(Nest);
+ HANDLE_ATTR(ReadNone);
+ HANDLE_ATTR(ReadOnly);
+ HANDLE_ATTR(NoInline);
+ HANDLE_ATTR(AlwaysInline);
+ HANDLE_ATTR(OptimizeForSize);
+ HANDLE_ATTR(StackProtect);
+ HANDLE_ATTR(StackProtectReq);
+ HANDLE_ATTR(NoCapture);
+#undef HANDLE_ATTR
+ assert(attrs == 0 && "Unhandled attribute!");
+ Out << ";";
+ nl(Out);
+ Out << "Attrs.push_back(PAWI);";
+ nl(Out);
+ }
+ Out << name << "_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());";
+ nl(Out);
+ out(); nl(Out);
+ Out << '}'; nl(Out);
+ }
+ }
+
+ bool CppWriter::printTypeInternal(const Type* Ty) {
+ // We don't print definitions for primitive types
+ if (Ty->isPrimitiveType() || Ty->isInteger())
+ return false;
+
+ // If we already defined this type, we don't need to define it again.
+ if (DefinedTypes.find(Ty) != DefinedTypes.end())
+ return false;
+
+ // Everything below needs the name for the type so get it now.
+ std::string typeName(getCppName(Ty));
+
+ // Search the type stack for recursion. If we find it, then generate this
+ // as an OpaqueType, but make sure not to do this multiple times because
+ // the type could appear in multiple places on the stack. Once the opaque
+ // definition is issued, it must not be re-issued. Consequently we have to
+ // check the UnresolvedTypes list as well.
+ TypeList::const_iterator TI = std::find(TypeStack.begin(), TypeStack.end(),
+ Ty);
+ if (TI != TypeStack.end()) {
+ TypeMap::const_iterator I = UnresolvedTypes.find(Ty);
+ if (I == UnresolvedTypes.end()) {
+ Out << "PATypeHolder " << typeName << "_fwd = OpaqueType::get();";
+ nl(Out);
+ UnresolvedTypes[Ty] = typeName;
+ }
+ return true;
+ }
+
+ // We're going to print a derived type which, by definition, contains other
+ // types. So, push this one we're printing onto the type stack to assist with
+ // recursive definitions.
+ TypeStack.push_back(Ty);
+
+ // Print the type definition
+ switch (Ty->getTypeID()) {
+ case Type::FunctionTyID: {
+ const FunctionType* FT = cast<FunctionType>(Ty);
+ Out << "std::vector<const Type*>" << typeName << "_args;";
+ nl(Out);
+ FunctionType::param_iterator PI = FT->param_begin();
+ FunctionType::param_iterator PE = FT->param_end();
+ for (; PI != PE; ++PI) {
+ const Type* argTy = static_cast<const Type*>(*PI);
+ bool isForward = printTypeInternal(argTy);
+ std::string argName(getCppName(argTy));
+ Out << typeName << "_args.push_back(" << argName;
+ if (isForward)
+ Out << "_fwd";
+ Out << ");";
+ nl(Out);
+ }
+ bool isForward = printTypeInternal(FT->getReturnType());
+ std::string retTypeName(getCppName(FT->getReturnType()));
+ Out << "FunctionType* " << typeName << " = FunctionType::get(";
+ in(); nl(Out) << "/*Result=*/" << retTypeName;
+ if (isForward)
+ Out << "_fwd";
+ Out << ",";
+ nl(Out) << "/*Params=*/" << typeName << "_args,";
+ nl(Out) << "/*isVarArg=*/" << (FT->isVarArg() ? "true" : "false") << ");";
+ out();
+ nl(Out);
+ break;
+ }
+ case Type::StructTyID: {
+ const StructType* ST = cast<StructType>(Ty);
+ Out << "std::vector<const Type*>" << typeName << "_fields;";
+ nl(Out);
+ StructType::element_iterator EI = ST->element_begin();
+ StructType::element_iterator EE = ST->element_end();
+ for (; EI != EE; ++EI) {
+ const Type* fieldTy = static_cast<const Type*>(*EI);
+ bool isForward = printTypeInternal(fieldTy);
+ std::string fieldName(getCppName(fieldTy));
+ Out << typeName << "_fields.push_back(" << fieldName;
+ if (isForward)
+ Out << "_fwd";
+ Out << ");";
+ nl(Out);
+ }
+ Out << "StructType* " << typeName << " = StructType::get("
+ << typeName << "_fields, /*isPacked=*/"
+ << (ST->isPacked() ? "true" : "false") << ");";
+ nl(Out);
+ break;
+ }
+ case Type::ArrayTyID: {
+ const ArrayType* AT = cast<ArrayType>(Ty);
+ const Type* ET = AT->getElementType();
+ bool isForward = printTypeInternal(ET);
+ std::string elemName(getCppName(ET));
+ Out << "ArrayType* " << typeName << " = ArrayType::get("
+ << elemName << (isForward ? "_fwd" : "")
+ << ", " << utostr(AT->getNumElements()) << ");";
+ nl(Out);
+ break;
+ }
+ case Type::PointerTyID: {
+ const PointerType* PT = cast<PointerType>(Ty);
+ const Type* ET = PT->getElementType();
+ bool isForward = printTypeInternal(ET);
+ std::string elemName(getCppName(ET));
+ Out << "PointerType* " << typeName << " = PointerType::get("
+ << elemName << (isForward ? "_fwd" : "")
+ << ", " << utostr(PT->getAddressSpace()) << ");";
+ nl(Out);
+ break;
+ }
+ case Type::VectorTyID: {
+ const VectorType* PT = cast<VectorType>(Ty);
+ const Type* ET = PT->getElementType();
+ bool isForward = printTypeInternal(ET);
+ std::string elemName(getCppName(ET));
+ Out << "VectorType* " << typeName << " = VectorType::get("
+ << elemName << (isForward ? "_fwd" : "")
+ << ", " << utostr(PT->getNumElements()) << ");";
+ nl(Out);
+ break;
+ }
+ case Type::OpaqueTyID: {
+ Out << "OpaqueType* " << typeName << " = OpaqueType::get();";
+ nl(Out);
+ break;
+ }
+ default:
+ error("Invalid TypeID");
+ }
+
+ // If the type had a name, make sure we recreate it.
+ const std::string* progTypeName =
+ findTypeName(TheModule->getTypeSymbolTable(),Ty);
+ if (progTypeName) {
+ Out << "mod->addTypeName(\"" << *progTypeName << "\", "
+ << typeName << ");";
+ nl(Out);
+ }
+
+ // Pop us off the type stack
+ TypeStack.pop_back();
+
+ // Indicate that this type is now defined.
+ DefinedTypes.insert(Ty);
+
+ // Early resolve as many unresolved types as possible. Search the unresolved
+ // types map for the type we just printed. Now that its definition is complete
+ // we can resolve any previous references to it. This prevents a cascade of
+ // unresolved types.
+ TypeMap::iterator I = UnresolvedTypes.find(Ty);
+ if (I != UnresolvedTypes.end()) {
+ Out << "cast<OpaqueType>(" << I->second
+ << "_fwd.get())->refineAbstractTypeTo(" << I->second << ");";
+ nl(Out);
+ Out << I->second << " = cast<";
+ switch (Ty->getTypeID()) {
+ case Type::FunctionTyID: Out << "FunctionType"; break;
+ case Type::ArrayTyID: Out << "ArrayType"; break;
+ case Type::StructTyID: Out << "StructType"; break;
+ case Type::VectorTyID: Out << "VectorType"; break;
+ case Type::PointerTyID: Out << "PointerType"; break;
+ case Type::OpaqueTyID: Out << "OpaqueType"; break;
+ default: Out << "NoSuchDerivedType"; break;
+ }
+ Out << ">(" << I->second << "_fwd.get());";
+ nl(Out); nl(Out);
+ UnresolvedTypes.erase(I);
+ }
+
+    // Finally, separate the type definition from the others with a newline.
+ nl(Out);
+
+ // We weren't a recursive type
+ return false;
+ }
+
+  // Prints a top-level type definition. printTypeInternal returns true when
+  // it could not resolve a type and had to use a forward reference, but by
+  // the time control returns here the type stack must again be empty.
+ void CppWriter::printType(const Type* Ty) {
+ assert(TypeStack.empty());
+ TypeStack.clear();
+ printTypeInternal(Ty);
+ assert(TypeStack.empty());
+ }
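+  // For a self-referential type such as "%T = type { %T* }" (name assumed),
+  // the emitted code first declares "PATypeHolder StructTy_T_fwd =
+  // OpaqueType::get();", builds the struct through the forward holder, and
+  // finally patches it up with refineAbstractTypeTo as shown above.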
+
+ void CppWriter::printTypes(const Module* M) {
+ // Walk the symbol table and print out all its types
+ const TypeSymbolTable& symtab = M->getTypeSymbolTable();
+ for (TypeSymbolTable::const_iterator TI = symtab.begin(), TE = symtab.end();
+ TI != TE; ++TI) {
+
+ // For primitive types and types already defined, just add a name
+ TypeMap::const_iterator TNI = TypeNames.find(TI->second);
+ if (TI->second->isInteger() || TI->second->isPrimitiveType() ||
+ TNI != TypeNames.end()) {
+ Out << "mod->addTypeName(\"";
+ printEscapedString(TI->first);
+ Out << "\", " << getCppName(TI->second) << ");";
+ nl(Out);
+ // For everything else, define the type
+ } else {
+ printType(TI->second);
+ }
+ }
+
+ // Add all of the global variables to the value table...
+ for (Module::const_global_iterator I = TheModule->global_begin(),
+ E = TheModule->global_end(); I != E; ++I) {
+ if (I->hasInitializer())
+ printType(I->getInitializer()->getType());
+ printType(I->getType());
+ }
+
+ // Add all the functions to the table
+ for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
+ FI != FE; ++FI) {
+ printType(FI->getReturnType());
+ printType(FI->getFunctionType());
+ // Add all the function arguments
+ for (Function::const_arg_iterator AI = FI->arg_begin(),
+ AE = FI->arg_end(); AI != AE; ++AI) {
+ printType(AI->getType());
+ }
+
+ // Add all of the basic blocks and instructions
+ for (Function::const_iterator BB = FI->begin(),
+ E = FI->end(); BB != E; ++BB) {
+ printType(BB->getType());
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+ ++I) {
+ printType(I->getType());
+ for (unsigned i = 0; i < I->getNumOperands(); ++i)
+ printType(I->getOperand(i)->getType());
+ }
+ }
+ }
+ }
+
+
+ // printConstant - Print out a constant pool entry...
+ void CppWriter::printConstant(const Constant *CV) {
+    // First, if the constant is actually a GlobalValue (variable or function,
+    // emitted elsewhere) or it's already in the constant list, then we've
+    // printed it already and we can just return.
+ if (isa<GlobalValue>(CV) || ValueNames.find(CV) != ValueNames.end())
+ return;
+
+ std::string constName(getCppName(CV));
+ std::string typeName(getCppName(CV->getType()));
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ std::string constValue = CI->getValue().toString(10, true);
+ Out << "ConstantInt* " << constName << " = ConstantInt::get(APInt("
+ << cast<IntegerType>(CI->getType())->getBitWidth() << ", \""
+ << constValue << "\", " << constValue.length() << ", 10));";
+ } else if (isa<ConstantAggregateZero>(CV)) {
+ Out << "ConstantAggregateZero* " << constName
+ << " = ConstantAggregateZero::get(" << typeName << ");";
+ } else if (isa<ConstantPointerNull>(CV)) {
+ Out << "ConstantPointerNull* " << constName
+ << " = ConstantPointerNull::get(" << typeName << ");";
+ } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+ Out << "ConstantFP* " << constName << " = ";
+ printCFP(CFP);
+ Out << ";";
+ } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
+ if (CA->isString() && CA->getType()->getElementType() == Type::Int8Ty) {
+ Out << "Constant* " << constName << " = ConstantArray::get(\"";
+ std::string tmp = CA->getAsString();
+ bool nullTerminate = false;
+ if (tmp[tmp.length()-1] == 0) {
+ tmp.erase(tmp.length()-1);
+ nullTerminate = true;
+ }
+ printEscapedString(tmp);
+ // Determine if we want null termination or not.
+ if (nullTerminate)
+ Out << "\", true"; // Indicate that the null terminator should be
+ // added.
+ else
+ Out << "\", false";// No null terminator
+ Out << ");";
+ } else {
+ Out << "std::vector<Constant*> " << constName << "_elems;";
+ nl(Out);
+ unsigned N = CA->getNumOperands();
+ for (unsigned i = 0; i < N; ++i) {
+ printConstant(CA->getOperand(i)); // recurse to print operands
+ Out << constName << "_elems.push_back("
+ << getCppName(CA->getOperand(i)) << ");";
+ nl(Out);
+ }
+ Out << "Constant* " << constName << " = ConstantArray::get("
+ << typeName << ", " << constName << "_elems);";
+ }
+ } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
+ Out << "std::vector<Constant*> " << constName << "_fields;";
+ nl(Out);
+ unsigned N = CS->getNumOperands();
+ for (unsigned i = 0; i < N; i++) {
+ printConstant(CS->getOperand(i));
+ Out << constName << "_fields.push_back("
+ << getCppName(CS->getOperand(i)) << ");";
+ nl(Out);
+ }
+ Out << "Constant* " << constName << " = ConstantStruct::get("
+ << typeName << ", " << constName << "_fields);";
+ } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) {
+ Out << "std::vector<Constant*> " << constName << "_elems;";
+ nl(Out);
+ unsigned N = CP->getNumOperands();
+ for (unsigned i = 0; i < N; ++i) {
+ printConstant(CP->getOperand(i));
+ Out << constName << "_elems.push_back("
+ << getCppName(CP->getOperand(i)) << ");";
+ nl(Out);
+ }
+ Out << "Constant* " << constName << " = ConstantVector::get("
+ << typeName << ", " << constName << "_elems);";
+ } else if (isa<UndefValue>(CV)) {
+ Out << "UndefValue* " << constName << " = UndefValue::get("
+ << typeName << ");";
+ } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ Out << "std::vector<Constant*> " << constName << "_indices;";
+ nl(Out);
+ printConstant(CE->getOperand(0));
+ for (unsigned i = 1; i < CE->getNumOperands(); ++i ) {
+ printConstant(CE->getOperand(i));
+ Out << constName << "_indices.push_back("
+ << getCppName(CE->getOperand(i)) << ");";
+ nl(Out);
+ }
+ Out << "Constant* " << constName
+ << " = ConstantExpr::getGetElementPtr("
+ << getCppName(CE->getOperand(0)) << ", "
+ << "&" << constName << "_indices[0], "
+ << constName << "_indices.size()"
+ << " );";
+ } else if (CE->isCast()) {
+ printConstant(CE->getOperand(0));
+ Out << "Constant* " << constName << " = ConstantExpr::getCast(";
+ switch (CE->getOpcode()) {
+ default: assert(0 && "Invalid cast opcode");
+ case Instruction::Trunc: Out << "Instruction::Trunc"; break;
+ case Instruction::ZExt: Out << "Instruction::ZExt"; break;
+ case Instruction::SExt: Out << "Instruction::SExt"; break;
+ case Instruction::FPTrunc: Out << "Instruction::FPTrunc"; break;
+ case Instruction::FPExt: Out << "Instruction::FPExt"; break;
+ case Instruction::FPToUI: Out << "Instruction::FPToUI"; break;
+ case Instruction::FPToSI: Out << "Instruction::FPToSI"; break;
+ case Instruction::UIToFP: Out << "Instruction::UIToFP"; break;
+ case Instruction::SIToFP: Out << "Instruction::SIToFP"; break;
+ case Instruction::PtrToInt: Out << "Instruction::PtrToInt"; break;
+ case Instruction::IntToPtr: Out << "Instruction::IntToPtr"; break;
+ case Instruction::BitCast: Out << "Instruction::BitCast"; break;
+ }
+ Out << ", " << getCppName(CE->getOperand(0)) << ", "
+ << getCppName(CE->getType()) << ");";
+ } else {
+ unsigned N = CE->getNumOperands();
+ for (unsigned i = 0; i < N; ++i ) {
+ printConstant(CE->getOperand(i));
+ }
+ Out << "Constant* " << constName << " = ConstantExpr::";
+ switch (CE->getOpcode()) {
+ case Instruction::Add: Out << "getAdd("; break;
+ case Instruction::Sub: Out << "getSub("; break;
+ case Instruction::Mul: Out << "getMul("; break;
+ case Instruction::UDiv: Out << "getUDiv("; break;
+ case Instruction::SDiv: Out << "getSDiv("; break;
+ case Instruction::FDiv: Out << "getFDiv("; break;
+ case Instruction::URem: Out << "getURem("; break;
+ case Instruction::SRem: Out << "getSRem("; break;
+ case Instruction::FRem: Out << "getFRem("; break;
+ case Instruction::And: Out << "getAnd("; break;
+ case Instruction::Or: Out << "getOr("; break;
+ case Instruction::Xor: Out << "getXor("; break;
+ case Instruction::ICmp:
+ Out << "getICmp(ICmpInst::ICMP_";
+ switch (CE->getPredicate()) {
+ case ICmpInst::ICMP_EQ: Out << "EQ"; break;
+ case ICmpInst::ICMP_NE: Out << "NE"; break;
+ case ICmpInst::ICMP_SLT: Out << "SLT"; break;
+ case ICmpInst::ICMP_ULT: Out << "ULT"; break;
+ case ICmpInst::ICMP_SGT: Out << "SGT"; break;
+ case ICmpInst::ICMP_UGT: Out << "UGT"; break;
+ case ICmpInst::ICMP_SLE: Out << "SLE"; break;
+ case ICmpInst::ICMP_ULE: Out << "ULE"; break;
+ case ICmpInst::ICMP_SGE: Out << "SGE"; break;
+ case ICmpInst::ICMP_UGE: Out << "UGE"; break;
+ default: error("Invalid ICmp Predicate");
+ }
+ break;
+ case Instruction::FCmp:
+ Out << "getFCmp(FCmpInst::FCMP_";
+ switch (CE->getPredicate()) {
+ case FCmpInst::FCMP_FALSE: Out << "FALSE"; break;
+ case FCmpInst::FCMP_ORD: Out << "ORD"; break;
+ case FCmpInst::FCMP_UNO: Out << "UNO"; break;
+ case FCmpInst::FCMP_OEQ: Out << "OEQ"; break;
+ case FCmpInst::FCMP_UEQ: Out << "UEQ"; break;
+ case FCmpInst::FCMP_ONE: Out << "ONE"; break;
+ case FCmpInst::FCMP_UNE: Out << "UNE"; break;
+ case FCmpInst::FCMP_OLT: Out << "OLT"; break;
+ case FCmpInst::FCMP_ULT: Out << "ULT"; break;
+ case FCmpInst::FCMP_OGT: Out << "OGT"; break;
+ case FCmpInst::FCMP_UGT: Out << "UGT"; break;
+ case FCmpInst::FCMP_OLE: Out << "OLE"; break;
+ case FCmpInst::FCMP_ULE: Out << "ULE"; break;
+ case FCmpInst::FCMP_OGE: Out << "OGE"; break;
+ case FCmpInst::FCMP_UGE: Out << "UGE"; break;
+ case FCmpInst::FCMP_TRUE: Out << "TRUE"; break;
+ default: error("Invalid FCmp Predicate");
+ }
+ break;
+ case Instruction::Shl: Out << "getShl("; break;
+ case Instruction::LShr: Out << "getLShr("; break;
+ case Instruction::AShr: Out << "getAShr("; break;
+ case Instruction::Select: Out << "getSelect("; break;
+ case Instruction::ExtractElement: Out << "getExtractElement("; break;
+ case Instruction::InsertElement: Out << "getInsertElement("; break;
+ case Instruction::ShuffleVector: Out << "getShuffleVector("; break;
+ default:
+ error("Invalid constant expression");
+ break;
+ }
+ Out << getCppName(CE->getOperand(0));
+ for (unsigned i = 1; i < CE->getNumOperands(); ++i)
+ Out << ", " << getCppName(CE->getOperand(i));
+ Out << ");";
+ }
+ } else {
+ error("Bad Constant");
+ Out << "Constant* " << constName << " = 0; ";
+ }
+ nl(Out);
+ }
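+  // Example of the emitted code for an unnamed 32-bit integer constant 42
+  // (the variable name is illustrative):
+  //   ConstantInt* const_int32_0 = ConstantInt::get(APInt(32, "42", 2, 10));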
+
+ void CppWriter::printConstants(const Module* M) {
+ // Traverse all the global variables looking for constant initializers
+ for (Module::const_global_iterator I = TheModule->global_begin(),
+ E = TheModule->global_end(); I != E; ++I)
+ if (I->hasInitializer())
+ printConstant(I->getInitializer());
+
+ // Traverse the LLVM functions looking for constants
+ for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
+ FI != FE; ++FI) {
+ // Add all of the basic blocks and instructions
+ for (Function::const_iterator BB = FI->begin(),
+ E = FI->end(); BB != E; ++BB) {
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+ ++I) {
+ for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+ if (Constant* C = dyn_cast<Constant>(I->getOperand(i))) {
+ printConstant(C);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void CppWriter::printVariableUses(const GlobalVariable *GV) {
+ nl(Out) << "// Type Definitions";
+ nl(Out);
+ printType(GV->getType());
+ if (GV->hasInitializer()) {
+ Constant* Init = GV->getInitializer();
+ printType(Init->getType());
+ if (Function* F = dyn_cast<Function>(Init)) {
+        nl(Out) << "// Function Declarations"; nl(Out);
+ printFunctionHead(F);
+ } else if (GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) {
+ nl(Out) << "// Global Variable Declarations"; nl(Out);
+ printVariableHead(gv);
+ } else {
+ nl(Out) << "// Constant Definitions"; nl(Out);
+        printConstant(Init);
+ }
+ if (GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) {
+ nl(Out) << "// Global Variable Definitions"; nl(Out);
+ printVariableBody(gv);
+ }
+ }
+ }
+
+ void CppWriter::printVariableHead(const GlobalVariable *GV) {
+ nl(Out) << "GlobalVariable* " << getCppName(GV);
+ if (is_inline) {
+ Out << " = mod->getGlobalVariable(";
+ printEscapedString(GV->getName());
+ Out << ", " << getCppName(GV->getType()->getElementType()) << ",true)";
+ nl(Out) << "if (!" << getCppName(GV) << ") {";
+ in(); nl(Out) << getCppName(GV);
+ }
+ Out << " = new GlobalVariable(";
+ nl(Out) << "/*Type=*/";
+ printCppName(GV->getType()->getElementType());
+ Out << ",";
+ nl(Out) << "/*isConstant=*/" << (GV->isConstant()?"true":"false");
+ Out << ",";
+ nl(Out) << "/*Linkage=*/";
+ printLinkageType(GV->getLinkage());
+ Out << ",";
+ nl(Out) << "/*Initializer=*/0, ";
+ if (GV->hasInitializer()) {
+ Out << "// has initializer, specified below";
+ }
+ nl(Out) << "/*Name=*/\"";
+ printEscapedString(GV->getName());
+ Out << "\",";
+ nl(Out) << "mod);";
+ nl(Out);
+
+ if (GV->hasSection()) {
+ printCppName(GV);
+ Out << "->setSection(\"";
+ printEscapedString(GV->getSection());
+ Out << "\");";
+ nl(Out);
+ }
+ if (GV->getAlignment()) {
+ printCppName(GV);
+ Out << "->setAlignment(" << utostr(GV->getAlignment()) << ");";
+ nl(Out);
+ }
+ if (GV->getVisibility() != GlobalValue::DefaultVisibility) {
+ printCppName(GV);
+ Out << "->setVisibility(";
+ printVisibilityType(GV->getVisibility());
+ Out << ");";
+ nl(Out);
+ }
+ if (is_inline) {
+ out(); Out << "}"; nl(Out);
+ }
+ }
+
+ void CppWriter::printVariableBody(const GlobalVariable *GV) {
+ if (GV->hasInitializer()) {
+ printCppName(GV);
+ Out << "->setInitializer(";
+ Out << getCppName(GV->getInitializer()) << ");";
+ nl(Out);
+ }
+ }
+
+ std::string CppWriter::getOpName(Value* V) {
+ if (!isa<Instruction>(V) || DefinedValues.find(V) != DefinedValues.end())
+ return getCppName(V);
+
+ // See if it's already in the map of forward references; if so, just return
+ // the name we already set up for it
+ ForwardRefMap::const_iterator I = ForwardRefs.find(V);
+ if (I != ForwardRefs.end())
+ return I->second;
+
+ // This is a new forward reference. Generate a unique name for it
+ std::string result(std::string("fwdref_") + utostr(uniqueNum++));
+
+ // Yes, this is a hack. An Argument is the smallest instantiable value that
+ // we can make as a placeholder for the real value. We'll replace these
+ // Argument instances later.
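+ // For illustration (names hypothetical): a use of a not-yet-defined value
+ // emits, say, "Argument* fwdref_7 = new Argument(ty);" here, and
+ // printFunctionBody() later resolves it with
+ // "fwdref_7->replaceAllUsesWith(<value>); delete fwdref_7;".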
+ Out << "Argument* " << result << " = new Argument("
+ << getCppName(V->getType()) << ");";
+ nl(Out);
+ ForwardRefs[V] = result;
+ return result;
+ }
+
+ // printInstruction - This member is called for each Instruction in a function.
+ void CppWriter::printInstruction(const Instruction *I,
+ const std::string& bbname) {
+ std::string iName(getCppName(I));
+
+ // Before we emit this instruction, we need to take care of generating any
+ // forward references. So, we get the names of all the operands in advance
+ std::string* opNames = new std::string[I->getNumOperands()];
+ for (unsigned i = 0; i < I->getNumOperands(); i++) {
+ opNames[i] = getOpName(I->getOperand(i));
+ }
+
+ switch (I->getOpcode()) {
+ default:
+ error("Invalid instruction");
+ break;
+
+ case Instruction::Ret: {
+ const ReturnInst* ret = cast<ReturnInst>(I);
+ Out << "ReturnInst::Create("
+ << (ret->getReturnValue() ? opNames[0] + ", " : "") << bbname << ");";
+ break;
+ }
+ case Instruction::Br: {
+ const BranchInst* br = cast<BranchInst>(I);
+ Out << "BranchInst::Create(" ;
+ if (br->getNumOperands() == 3 ) {
+ Out << opNames[2] << ", "
+ << opNames[1] << ", "
+ << opNames[0] << ", ";
+
+ } else if (br->getNumOperands() == 1) {
+ Out << opNames[0] << ", ";
+ } else {
+ error("Branch with 2 operands?");
+ }
+ Out << bbname << ");";
+ break;
+ }
+ case Instruction::Switch: {
+ const SwitchInst* sw = cast<SwitchInst>(I);
+ Out << "SwitchInst* " << iName << " = SwitchInst::Create("
+ << opNames[0] << ", "
+ << opNames[1] << ", "
+ << sw->getNumCases() << ", " << bbname << ");";
+ nl(Out);
+ for (unsigned i = 2; i < sw->getNumOperands(); i += 2 ) {
+ Out << iName << "->addCase("
+ << opNames[i] << ", "
+ << opNames[i+1] << ");";
+ nl(Out);
+ }
+ break;
+ }
+ case Instruction::Invoke: {
+ const InvokeInst* inv = cast<InvokeInst>(I);
+ Out << "std::vector<Value*> " << iName << "_params;";
+ nl(Out);
+ for (unsigned i = 3; i < inv->getNumOperands(); ++i) {
+ Out << iName << "_params.push_back("
+ << opNames[i] << ");";
+ nl(Out);
+ }
+ Out << "InvokeInst *" << iName << " = InvokeInst::Create("
+ << opNames[0] << ", "
+ << opNames[1] << ", "
+ << opNames[2] << ", "
+ << iName << "_params.begin(), " << iName << "_params.end(), \"";
+ printEscapedString(inv->getName());
+ Out << "\", " << bbname << ");";
+ nl(Out) << iName << "->setCallingConv(";
+ printCallingConv(inv->getCallingConv());
+ Out << ");";
+ printAttributes(inv->getAttributes(), iName);
+ Out << iName << "->setAttributes(" << iName << "_PAL);";
+ nl(Out);
+ break;
+ }
+ case Instruction::Unwind: {
+ Out << "new UnwindInst("
+ << bbname << ");";
+ break;
+ }
+ case Instruction::Unreachable:{
+ Out << "new UnreachableInst("
+ << bbname << ");";
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:{
+ Out << "BinaryOperator* " << iName << " = BinaryOperator::Create(";
+ switch (I->getOpcode()) {
+ case Instruction::Add: Out << "Instruction::Add"; break;
+ case Instruction::Sub: Out << "Instruction::Sub"; break;
+ case Instruction::Mul: Out << "Instruction::Mul"; break;
+ case Instruction::UDiv:Out << "Instruction::UDiv"; break;
+ case Instruction::SDiv:Out << "Instruction::SDiv"; break;
+ case Instruction::FDiv:Out << "Instruction::FDiv"; break;
+ case Instruction::URem:Out << "Instruction::URem"; break;
+ case Instruction::SRem:Out << "Instruction::SRem"; break;
+ case Instruction::FRem:Out << "Instruction::FRem"; break;
+ case Instruction::And: Out << "Instruction::And"; break;
+ case Instruction::Or: Out << "Instruction::Or"; break;
+ case Instruction::Xor: Out << "Instruction::Xor"; break;
+ case Instruction::Shl: Out << "Instruction::Shl"; break;
+ case Instruction::LShr:Out << "Instruction::LShr"; break;
+ case Instruction::AShr:Out << "Instruction::AShr"; break;
+ default: Out << "Instruction::BadOpCode"; break;
+ }
+ Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+ printEscapedString(I->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::FCmp: {
+ Out << "FCmpInst* " << iName << " = new FCmpInst(";
+ switch (cast<FCmpInst>(I)->getPredicate()) {
+ case FCmpInst::FCMP_FALSE: Out << "FCmpInst::FCMP_FALSE"; break;
+ case FCmpInst::FCMP_OEQ : Out << "FCmpInst::FCMP_OEQ"; break;
+ case FCmpInst::FCMP_OGT : Out << "FCmpInst::FCMP_OGT"; break;
+ case FCmpInst::FCMP_OGE : Out << "FCmpInst::FCMP_OGE"; break;
+ case FCmpInst::FCMP_OLT : Out << "FCmpInst::FCMP_OLT"; break;
+ case FCmpInst::FCMP_OLE : Out << "FCmpInst::FCMP_OLE"; break;
+ case FCmpInst::FCMP_ONE : Out << "FCmpInst::FCMP_ONE"; break;
+ case FCmpInst::FCMP_ORD : Out << "FCmpInst::FCMP_ORD"; break;
+ case FCmpInst::FCMP_UNO : Out << "FCmpInst::FCMP_UNO"; break;
+ case FCmpInst::FCMP_UEQ : Out << "FCmpInst::FCMP_UEQ"; break;
+ case FCmpInst::FCMP_UGT : Out << "FCmpInst::FCMP_UGT"; break;
+ case FCmpInst::FCMP_UGE : Out << "FCmpInst::FCMP_UGE"; break;
+ case FCmpInst::FCMP_ULT : Out << "FCmpInst::FCMP_ULT"; break;
+ case FCmpInst::FCMP_ULE : Out << "FCmpInst::FCMP_ULE"; break;
+ case FCmpInst::FCMP_UNE : Out << "FCmpInst::FCMP_UNE"; break;
+ case FCmpInst::FCMP_TRUE : Out << "FCmpInst::FCMP_TRUE"; break;
+ default: Out << "FCmpInst::BAD_ICMP_PREDICATE"; break;
+ }
+ Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+ printEscapedString(I->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::ICmp: {
+ Out << "ICmpInst* " << iName << " = new ICmpInst(";
+ switch (cast<ICmpInst>(I)->getPredicate()) {
+ case ICmpInst::ICMP_EQ: Out << "ICmpInst::ICMP_EQ"; break;
+ case ICmpInst::ICMP_NE: Out << "ICmpInst::ICMP_NE"; break;
+ case ICmpInst::ICMP_ULE: Out << "ICmpInst::ICMP_ULE"; break;
+ case ICmpInst::ICMP_SLE: Out << "ICmpInst::ICMP_SLE"; break;
+ case ICmpInst::ICMP_UGE: Out << "ICmpInst::ICMP_UGE"; break;
+ case ICmpInst::ICMP_SGE: Out << "ICmpInst::ICMP_SGE"; break;
+ case ICmpInst::ICMP_ULT: Out << "ICmpInst::ICMP_ULT"; break;
+ case ICmpInst::ICMP_SLT: Out << "ICmpInst::ICMP_SLT"; break;
+ case ICmpInst::ICMP_UGT: Out << "ICmpInst::ICMP_UGT"; break;
+ case ICmpInst::ICMP_SGT: Out << "ICmpInst::ICMP_SGT"; break;
+ default: Out << "ICmpInst::BAD_ICMP_PREDICATE"; break;
+ }
+ Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+ printEscapedString(I->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::Malloc: {
+ const MallocInst* mallocI = cast<MallocInst>(I);
+ Out << "MallocInst* " << iName << " = new MallocInst("
+ << getCppName(mallocI->getAllocatedType()) << ", ";
+ if (mallocI->isArrayAllocation())
+ Out << opNames[0] << ", " ;
+ Out << "\"";
+ printEscapedString(mallocI->getName());
+ Out << "\", " << bbname << ");";
+ if (mallocI->getAlignment())
+ nl(Out) << iName << "->setAlignment("
+ << mallocI->getAlignment() << ");";
+ break;
+ }
+ case Instruction::Free: {
+ Out << "FreeInst* " << iName << " = new FreeInst("
+ << getCppName(I->getOperand(0)) << ", " << bbname << ");";
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst* allocaI = cast<AllocaInst>(I);
+ Out << "AllocaInst* " << iName << " = new AllocaInst("
+ << getCppName(allocaI->getAllocatedType()) << ", ";
+ if (allocaI->isArrayAllocation())
+ Out << opNames[0] << ", ";
+ Out << "\"";
+ printEscapedString(allocaI->getName());
+ Out << "\", " << bbname << ");";
+ if (allocaI->getAlignment())
+ nl(Out) << iName << "->setAlignment("
+ << allocaI->getAlignment() << ");";
+ break;
+ }
+ case Instruction::Load:{
+ const LoadInst* load = cast<LoadInst>(I);
+ Out << "LoadInst* " << iName << " = new LoadInst("
+ << opNames[0] << ", \"";
+ printEscapedString(load->getName());
+ Out << "\", " << (load->isVolatile() ? "true" : "false" )
+ << ", " << bbname << ");";
+ break;
+ }
+ case Instruction::Store: {
+ const StoreInst* store = cast<StoreInst>(I);
+ Out << " new StoreInst("
+ << opNames[0] << ", "
+ << opNames[1] << ", "
+ << (store->isVolatile() ? "true" : "false")
+ << ", " << bbname << ");";
+ break;
+ }
+ case Instruction::GetElementPtr: {
+ const GetElementPtrInst* gep = cast<GetElementPtrInst>(I);
+ if (gep->getNumOperands() <= 2) {
+ Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create("
+ << opNames[0];
+ if (gep->getNumOperands() == 2)
+ Out << ", " << opNames[1];
+ } else {
+ Out << "std::vector<Value*> " << iName << "_indices;";
+ nl(Out);
+ for (unsigned i = 1; i < gep->getNumOperands(); ++i ) {
+ Out << iName << "_indices.push_back("
+ << opNames[i] << ");";
+ nl(Out);
+ }
+ Out << "Instruction* " << iName << " = GetElementPtrInst::Create("
+ << opNames[0] << ", " << iName << "_indices.begin(), "
+ << iName << "_indices.end()";
+ }
+ Out << ", \"";
+ printEscapedString(gep->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::PHI: {
+ const PHINode* phi = cast<PHINode>(I);
+
+ Out << "PHINode* " << iName << " = PHINode::Create("
+ << getCppName(phi->getType()) << ", \"";
+ printEscapedString(phi->getName());
+ Out << "\", " << bbname << ");";
+ nl(Out) << iName << "->reserveOperandSpace("
+ << phi->getNumIncomingValues()
+ << ");";
+ nl(Out);
+ for (unsigned i = 0; i < phi->getNumOperands(); i+=2) {
+ Out << iName << "->addIncoming("
+ << opNames[i] << ", " << opNames[i+1] << ");";
+ nl(Out);
+ }
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast: {
+ const CastInst* cst = cast<CastInst>(I);
+ Out << "CastInst* " << iName << " = new ";
+ switch (I->getOpcode()) {
+ case Instruction::Trunc: Out << "TruncInst"; break;
+ case Instruction::ZExt: Out << "ZExtInst"; break;
+ case Instruction::SExt: Out << "SExtInst"; break;
+ case Instruction::FPTrunc: Out << "FPTruncInst"; break;
+ case Instruction::FPExt: Out << "FPExtInst"; break;
+ case Instruction::FPToUI: Out << "FPToUIInst"; break;
+ case Instruction::FPToSI: Out << "FPToSIInst"; break;
+ case Instruction::UIToFP: Out << "UIToFPInst"; break;
+ case Instruction::SIToFP: Out << "SIToFPInst"; break;
+ case Instruction::PtrToInt: Out << "PtrToIntInst"; break;
+ case Instruction::IntToPtr: Out << "IntToPtrInst"; break;
+ case Instruction::BitCast: Out << "BitCastInst"; break;
+ default: assert(!"Unreachable"); break;
+ }
+ Out << "(" << opNames[0] << ", "
+ << getCppName(cst->getType()) << ", \"";
+ printEscapedString(cst->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::Call:{
+ const CallInst* call = cast<CallInst>(I);
+ if (const InlineAsm* ila = dyn_cast<InlineAsm>(call->getCalledValue())) {
+ Out << "InlineAsm* " << getCppName(ila) << " = InlineAsm::get("
+ << getCppName(ila->getFunctionType()) << ", \""
+ << ila->getAsmString() << "\", \""
+ << ila->getConstraintString() << "\","
+ << (ila->hasSideEffects() ? "true" : "false") << ");";
+ nl(Out);
+ }
+ if (call->getNumOperands() > 2) {
+ Out << "std::vector<Value*> " << iName << "_params;";
+ nl(Out);
+ for (unsigned i = 1; i < call->getNumOperands(); ++i) {
+ Out << iName << "_params.push_back(" << opNames[i] << ");";
+ nl(Out);
+ }
+ Out << "CallInst* " << iName << " = CallInst::Create("
+ << opNames[0] << ", " << iName << "_params.begin(), "
+ << iName << "_params.end(), \"";
+ } else if (call->getNumOperands() == 2) {
+ Out << "CallInst* " << iName << " = CallInst::Create("
+ << opNames[0] << ", " << opNames[1] << ", \"";
+ } else {
+ Out << "CallInst* " << iName << " = CallInst::Create(" << opNames[0]
+ << ", \"";
+ }
+ printEscapedString(call->getName());
+ Out << "\", " << bbname << ");";
+ nl(Out) << iName << "->setCallingConv(";
+ printCallingConv(call->getCallingConv());
+ Out << ");";
+ nl(Out) << iName << "->setTailCall("
+ << (call->isTailCall() ? "true":"false");
+ Out << ");";
+ printAttributes(call->getAttributes(), iName);
+ Out << iName << "->setAttributes(" << iName << "_PAL);";
+ nl(Out);
+ break;
+ }
+ case Instruction::Select: {
+ const SelectInst* sel = cast<SelectInst>(I);
+ Out << "SelectInst* " << getCppName(sel) << " = SelectInst::Create(";
+ Out << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+ printEscapedString(sel->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::UserOp1:
+ /// FALL THROUGH
+ case Instruction::UserOp2: {
+ /// FIXME: What should be done here?
+ break;
+ }
+ case Instruction::VAArg: {
+ const VAArgInst* va = cast<VAArgInst>(I);
+ Out << "VAArgInst* " << getCppName(va) << " = new VAArgInst("
+ << opNames[0] << ", " << getCppName(va->getType()) << ", \"";
+ printEscapedString(va->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::ExtractElement: {
+ const ExtractElementInst* eei = cast<ExtractElementInst>(I);
+ Out << "ExtractElementInst* " << getCppName(eei)
+ << " = new ExtractElementInst(" << opNames[0]
+ << ", " << opNames[1] << ", \"";
+ printEscapedString(eei->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::InsertElement: {
+ const InsertElementInst* iei = cast<InsertElementInst>(I);
+ Out << "InsertElementInst* " << getCppName(iei)
+ << " = InsertElementInst::Create(" << opNames[0]
+ << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+ printEscapedString(iei->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::ShuffleVector: {
+ const ShuffleVectorInst* svi = cast<ShuffleVectorInst>(I);
+ Out << "ShuffleVectorInst* " << getCppName(svi)
+ << " = new ShuffleVectorInst(" << opNames[0]
+ << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+ printEscapedString(svi->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::ExtractValue: {
+ const ExtractValueInst *evi = cast<ExtractValueInst>(I);
+ Out << "std::vector<unsigned> " << iName << "_indices;";
+ nl(Out);
+ for (unsigned i = 0; i < evi->getNumIndices(); ++i) {
+ Out << iName << "_indices.push_back("
+ << evi->idx_begin()[i] << ");";
+ nl(Out);
+ }
+ Out << "ExtractValueInst* " << getCppName(evi)
+ << " = ExtractValueInst::Create(" << opNames[0]
+ << ", "
+ << iName << "_indices.begin(), " << iName << "_indices.end(), \"";
+ printEscapedString(evi->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ case Instruction::InsertValue: {
+ const InsertValueInst *ivi = cast<InsertValueInst>(I);
+ Out << "std::vector<unsigned> " << iName << "_indices;";
+ nl(Out);
+ for (unsigned i = 0; i < ivi->getNumIndices(); ++i) {
+ Out << iName << "_indices.push_back("
+ << ivi->idx_begin()[i] << ");";
+ nl(Out);
+ }
+ Out << "InsertValueInst* " << getCppName(ivi)
+ << " = InsertValueInst::Create(" << opNames[0]
+ << ", " << opNames[1] << ", "
+ << iName << "_indices.begin(), " << iName << "_indices.end(), \"";
+ printEscapedString(ivi->getName());
+ Out << "\", " << bbname << ");";
+ break;
+ }
+ }
+ DefinedValues.insert(I);
+ nl(Out);
+ delete [] opNames;
+ }
+
+ // Print out the types, constants and declarations needed by one function
+ void CppWriter::printFunctionUses(const Function* F) {
+ nl(Out) << "// Type Definitions"; nl(Out);
+ if (!is_inline) {
+ // Print the function's return type
+ printType(F->getReturnType());
+
+ // Print the function's function type
+ printType(F->getFunctionType());
+
+ // Print the types of each of the function's arguments
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI) {
+ printType(AI->getType());
+ }
+ }
+
+ // Print type definitions for every type referenced by an instruction and
+ // make a note of any global values or constants that are referenced
+ SmallPtrSet<GlobalValue*,64> gvs;
+ SmallPtrSet<Constant*,64> consts;
+ for (Function::const_iterator BB = F->begin(), BE = F->end();
+ BB != BE; ++BB){
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+ I != E; ++I) {
+ // Print the type of the instruction itself
+ printType(I->getType());
+
+ // Print the type of each of the instruction's operands
+ for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+ Value* operand = I->getOperand(i);
+ printType(operand->getType());
+
+ // If the operand references a GVal or Constant, make a note of it
+ if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) {
+ gvs.insert(GV);
+ if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->hasInitializer())
+ consts.insert(GVar->getInitializer());
+ } else if (Constant* C = dyn_cast<Constant>(operand))
+ consts.insert(C);
+ }
+ }
+ }
+
+ // Print the function declarations for any functions encountered
+ nl(Out) << "// Function Declarations"; nl(Out);
+ for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+ I != E; ++I) {
+ if (Function* Fun = dyn_cast<Function>(*I)) {
+ if (!is_inline || Fun != F)
+ printFunctionHead(Fun);
+ }
+ }
+
+ // Print the global variable declarations for any variables encountered
+ nl(Out) << "// Global Variable Declarations"; nl(Out);
+ for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+ I != E; ++I) {
+ if (GlobalVariable* F = dyn_cast<GlobalVariable>(*I))
+ printVariableHead(F);
+ }
+
+ // Print the constants found
+ nl(Out) << "// Constant Definitions"; nl(Out);
+ for (SmallPtrSet<Constant*,64>::iterator I = consts.begin(),
+ E = consts.end(); I != E; ++I) {
+ printConstant(*I);
+ }
+
+ // Process the global variables definitions now that all the constants have
+ // been emitted. These definitions just couple the gvars with their constant
+ // initializers.
+ nl(Out) << "// Global Variable Definitions"; nl(Out);
+ for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+ I != E; ++I) {
+ if (GlobalVariable* GV = dyn_cast<GlobalVariable>(*I))
+ printVariableBody(GV);
+ }
+ }
+
+ void CppWriter::printFunctionHead(const Function* F) {
+ nl(Out) << "Function* " << getCppName(F);
+ if (is_inline) {
+ Out << " = mod->getFunction(\"";
+ printEscapedString(F->getName());
+ Out << "\", " << getCppName(F->getFunctionType()) << ");";
+ nl(Out) << "if (!" << getCppName(F) << ") {";
+ nl(Out) << getCppName(F);
+ }
+ Out<< " = Function::Create(";
+ nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
+ nl(Out) << "/*Linkage=*/";
+ printLinkageType(F->getLinkage());
+ Out << ",";
+ nl(Out) << "/*Name=*/\"";
+ printEscapedString(F->getName());
+ Out << "\", mod); " << (F->isDeclaration()? "// (external, no body)" : "");
+ nl(Out,-1);
+ printCppName(F);
+ Out << "->setCallingConv(";
+ printCallingConv(F->getCallingConv());
+ Out << ");";
+ nl(Out);
+ if (F->hasSection()) {
+ printCppName(F);
+ Out << "->setSection(\"" << F->getSection() << "\");";
+ nl(Out);
+ }
+ if (F->getAlignment()) {
+ printCppName(F);
+ Out << "->setAlignment(" << F->getAlignment() << ");";
+ nl(Out);
+ }
+ if (F->getVisibility() != GlobalValue::DefaultVisibility) {
+ printCppName(F);
+ Out << "->setVisibility(";
+ printVisibilityType(F->getVisibility());
+ Out << ");";
+ nl(Out);
+ }
+ if (F->hasGC()) {
+ printCppName(F);
+ Out << "->setGC(\"" << F->getGC() << "\");";
+ nl(Out);
+ }
+ if (is_inline) {
+ Out << "}";
+ nl(Out);
+ }
+ printAttributes(F->getAttributes(), getCppName(F));
+ printCppName(F);
+ Out << "->setAttributes(" << getCppName(F) << "_PAL);";
+ nl(Out);
+ }
+
+ void CppWriter::printFunctionBody(const Function *F) {
+ if (F->isDeclaration())
+ return; // external functions have no bodies.
+
+ // Clear the DefinedValues and ForwardRefs maps because we can't have
+ // cross-function forward refs
+ ForwardRefs.clear();
+ DefinedValues.clear();
+
+ // Create all the argument values
+ if (!is_inline) {
+ if (!F->arg_empty()) {
+ Out << "Function::arg_iterator args = " << getCppName(F)
+ << "->arg_begin();";
+ nl(Out);
+ }
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI) {
+ Out << "Value* " << getCppName(AI) << " = args++;";
+ nl(Out);
+ if (AI->hasName()) {
+ Out << getCppName(AI) << "->setName(\"" << AI->getName() << "\");";
+ nl(Out);
+ }
+ }
+ }
+
+ // Create all the basic blocks
+ nl(Out);
+ for (Function::const_iterator BI = F->begin(), BE = F->end();
+ BI != BE; ++BI) {
+ std::string bbname(getCppName(BI));
+ Out << "BasicBlock* " << bbname << " = BasicBlock::Create(\"";
+ if (BI->hasName())
+ printEscapedString(BI->getName());
+ Out << "\"," << getCppName(BI->getParent()) << ",0);";
+ nl(Out);
+ }
+
+ // Output all of the function's basic blocks.
+ for (Function::const_iterator BI = F->begin(), BE = F->end();
+ BI != BE; ++BI) {
+ std::string bbname(getCppName(BI));
+ nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")";
+ nl(Out);
+
+ // Output all of the instructions in the basic block...
+ for (BasicBlock::const_iterator I = BI->begin(), E = BI->end();
+ I != E; ++I) {
+ printInstruction(I,bbname);
+ }
+ }
+
+ // Loop over the ForwardRefs and resolve them now that all instructions
+ // are generated.
+ if (!ForwardRefs.empty()) {
+ nl(Out) << "// Resolve Forward References";
+ nl(Out);
+ }
+
+ while (!ForwardRefs.empty()) {
+ ForwardRefMap::iterator I = ForwardRefs.begin();
+ Out << I->second << "->replaceAllUsesWith("
+ << getCppName(I->first) << "); delete " << I->second << ";";
+ nl(Out);
+ ForwardRefs.erase(I);
+ }
+ }
+
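+ // printInline - Emit a C++ function that inlines the body of the named LLVM
+ // function into a caller-supplied Module; the generated function takes the
+ // inlined function's arguments as parameters and returns its entry block.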
+ void CppWriter::printInline(const std::string& fname,
+ const std::string& func) {
+ const Function* F = TheModule->getFunction(func);
+ if (!F) {
+ error(std::string("Function '") + func + "' not found in input module");
+ return;
+ }
+ if (F->isDeclaration()) {
+ error(std::string("Function '") + func + "' is external!");
+ return;
+ }
+ nl(Out) << "BasicBlock* " << fname << "(Module* mod, Function *"
+ << getCppName(F);
+ unsigned arg_count = 1;
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI) {
+ Out << ", Value* arg_" << arg_count;
+ }
+ Out << ") {";
+ nl(Out);
+ is_inline = true;
+ printFunctionUses(F);
+ printFunctionBody(F);
+ is_inline = false;
+ Out << "return " << getCppName(F->begin()) << ";";
+ nl(Out) << "}";
+ nl(Out);
+ }
+
+ void CppWriter::printModuleBody() {
+ // Print out all the type definitions
+ nl(Out) << "// Type Definitions"; nl(Out);
+ printTypes(TheModule);
+
+ // Functions can call each other and global variables can reference them so
+ // define all the functions first before emitting their function bodies.
+ nl(Out) << "// Function Declarations"; nl(Out);
+ for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+ I != E; ++I)
+ printFunctionHead(I);
+
+ // Process the global variable declarations. We can't initialize them until
+ // after the constants are printed, so just print a header for each global.
+ nl(Out) << "// Global Variable Declarations"; nl(Out);
+ for (Module::const_global_iterator I = TheModule->global_begin(),
+ E = TheModule->global_end(); I != E; ++I) {
+ printVariableHead(I);
+ }
+
+ // Print out all the constant definitions. Constants don't recurse except
+ // through GlobalValues. All GlobalValues have been declared at this point
+ // so we can proceed to generate the constants.
+ nl(Out) << "// Constant Definitions"; nl(Out);
+ printConstants(TheModule);
+
+ // Process the global variables definitions now that all the constants have
+ // been emitted. These definitions just couple the gvars with their constant
+ // initializers.
+ nl(Out) << "// Global Variable Definitions"; nl(Out);
+ for (Module::const_global_iterator I = TheModule->global_begin(),
+ E = TheModule->global_end(); I != E; ++I) {
+ printVariableBody(I);
+ }
+
+ // Finally, we can safely put out all of the function bodies.
+ nl(Out) << "// Function Definitions"; nl(Out);
+ for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+ I != E; ++I) {
+ if (!I->isDeclaration()) {
+ nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I)
+ << ")";
+ nl(Out) << "{";
+ nl(Out,1);
+ printFunctionBody(I);
+ nl(Out,-1) << "}";
+ nl(Out);
+ }
+ }
+ }
+
+ void CppWriter::printProgram(const std::string& fname,
+ const std::string& mName) {
+ Out << "#include <llvm/Module.h>\n";
+ Out << "#include <llvm/DerivedTypes.h>\n";
+ Out << "#include <llvm/Constants.h>\n";
+ Out << "#include <llvm/GlobalVariable.h>\n";
+ Out << "#include <llvm/Function.h>\n";
+ Out << "#include <llvm/CallingConv.h>\n";
+ Out << "#include <llvm/BasicBlock.h>\n";
+ Out << "#include <llvm/Instructions.h>\n";
+ Out << "#include <llvm/InlineAsm.h>\n";
+ Out << "#include <llvm/Support/MathExtras.h>\n";
+ Out << "#include <llvm/Support/raw_ostream.h>\n";
+ Out << "#include <llvm/Pass.h>\n";
+ Out << "#include <llvm/PassManager.h>\n";
+ Out << "#include <llvm/ADT/SmallVector.h>\n";
+ Out << "#include <llvm/Analysis/Verifier.h>\n";
+ Out << "#include <llvm/Assembly/PrintModulePass.h>\n";
+ Out << "#include <algorithm>\n";
+ Out << "using namespace llvm;\n\n";
+ Out << "Module* " << fname << "();\n\n";
+ Out << "int main(int argc, char**argv) {\n";
+ Out << " Module* Mod = " << fname << "();\n";
+ Out << " verifyModule(*Mod, PrintMessageAction);\n";
+ Out << " outs().flush();\n";
+ Out << " PassManager PM;\n";
+ Out << " PM.add(createPrintModulePass(&outs()));\n";
+ Out << " PM.run(*Mod);\n";
+ Out << " return 0;\n";
+ Out << "}\n\n";
+ printModule(fname,mName);
+ }
+
+ void CppWriter::printModule(const std::string& fname,
+ const std::string& mName) {
+ nl(Out) << "Module* " << fname << "() {";
+ nl(Out,1) << "// Module Construction";
+ nl(Out) << "Module* mod = new Module(\"" << mName << "\");";
+ if (!TheModule->getDataLayout().empty()) {
+ nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayout() << "\");";
+ }
+ if (!TheModule->getTargetTriple().empty()) {
+ nl(Out) << "mod->setTargetTriple(\"" << TheModule->getTargetTriple()
+ << "\");";
+ }
+
+ if (!TheModule->getModuleInlineAsm().empty()) {
+ nl(Out) << "mod->setModuleInlineAsm(\"";
+ printEscapedString(TheModule->getModuleInlineAsm());
+ Out << "\");";
+ }
+ nl(Out);
+
+ // Loop over the dependent libraries and emit them.
+ Module::lib_iterator LI = TheModule->lib_begin();
+ Module::lib_iterator LE = TheModule->lib_end();
+ while (LI != LE) {
+ Out << "mod->addLibrary(\"" << *LI << "\");";
+ nl(Out);
+ ++LI;
+ }
+ printModuleBody();
+ nl(Out) << "return mod;";
+ nl(Out,-1) << "}";
+ nl(Out);
+ }
+
+ void CppWriter::printContents(const std::string& fname,
+ const std::string& mName) {
+ Out << "\nModule* " << fname << "(Module *mod) {\n";
+ Out << "\nmod->setModuleIdentifier(\"" << mName << "\");\n";
+ printModuleBody();
+ Out << "\nreturn mod;\n";
+ Out << "\n}\n";
+ }
+
+ void CppWriter::printFunction(const std::string& fname,
+ const std::string& funcName) {
+ const Function* F = TheModule->getFunction(funcName);
+ if (!F) {
+ error(std::string("Function '") + funcName + "' not found in input module");
+ return;
+ }
+ Out << "\nFunction* " << fname << "(Module *mod) {\n";
+ printFunctionUses(F);
+ printFunctionHead(F);
+ printFunctionBody(F);
+ Out << "return " << getCppName(F) << ";\n";
+ Out << "}\n";
+ }
+
+ void CppWriter::printFunctions() {
+ const Module::FunctionListType &funcs = TheModule->getFunctionList();
+ Module::const_iterator I = funcs.begin();
+ Module::const_iterator IE = funcs.end();
+
+ for (; I != IE; ++I) {
+ const Function &func = *I;
+ if (!func.isDeclaration()) {
+ std::string name("define_");
+ name += func.getName();
+ printFunction(name, func.getName());
+ }
+ }
+ }
+
+ void CppWriter::printVariable(const std::string& fname,
+ const std::string& varName) {
+ const GlobalVariable* GV = TheModule->getNamedGlobal(varName);
+
+ if (!GV) {
+ error(std::string("Variable '") + varName + "' not found in input module");
+ return;
+ }
+ Out << "\nGlobalVariable* " << fname << "(Module *mod) {\n";
+ printVariableUses(GV);
+ printVariableHead(GV);
+ printVariableBody(GV);
+ Out << "return " << getCppName(GV) << ";\n";
+ Out << "}\n";
+ }
+
+ void CppWriter::printType(const std::string& fname,
+ const std::string& typeName) {
+ const Type* Ty = TheModule->getTypeByName(typeName);
+ if (!Ty) {
+ error(std::string("Type '") + typeName + "' not found in input module");
+ return;
+ }
+ Out << "\nType* " << fname << "(Module *mod) {\n";
+ printType(Ty);
+ Out << "return " << getCppName(Ty) << ";\n";
+ Out << "}\n";
+ }
+
+ bool CppWriter::runOnModule(Module &M) {
+ TheModule = &M;
+
+ // Emit a header
+ Out << "// Generated by llvm2cpp - DO NOT MODIFY!\n\n";
+
+ // Get the name of the function we're supposed to generate
+ std::string fname = FuncName.getValue();
+
+ // Get the name of the thing we are to generate
+ std::string tgtname = NameToGenerate.getValue();
+ if (GenerationType == GenModule ||
+ GenerationType == GenContents ||
+ GenerationType == GenProgram ||
+ GenerationType == GenFunctions) {
+ if (tgtname == "!bad!") {
+ if (M.getModuleIdentifier() == "-")
+ tgtname = "<stdin>";
+ else
+ tgtname = M.getModuleIdentifier();
+ }
+ } else if (tgtname == "!bad!")
+ error("You must use the -for option with -gen-{function,variable,type}");
+
+ switch (WhatToGenerate(GenerationType)) {
+ case GenProgram:
+ if (fname.empty())
+ fname = "makeLLVMModule";
+ printProgram(fname,tgtname);
+ break;
+ case GenModule:
+ if (fname.empty())
+ fname = "makeLLVMModule";
+ printModule(fname,tgtname);
+ break;
+ case GenContents:
+ if (fname.empty())
+ fname = "makeLLVMModuleContents";
+ printContents(fname,tgtname);
+ break;
+ case GenFunction:
+ if (fname.empty())
+ fname = "makeLLVMFunction";
+ printFunction(fname,tgtname);
+ break;
+ case GenFunctions:
+ printFunctions();
+ break;
+ case GenInline:
+ if (fname.empty())
+ fname = "makeLLVMInline";
+ printInline(fname,tgtname);
+ break;
+ case GenVariable:
+ if (fname.empty())
+ fname = "makeLLVMVariable";
+ printVariable(fname,tgtname);
+ break;
+ case GenType:
+ if (fname.empty())
+ fname = "makeLLVMType";
+ printType(fname,tgtname);
+ break;
+ default:
+ error("Invalid generation option");
+ }
+
+ return false;
+ }
+}
+
+char CppWriter::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// External Interface declaration
+//===----------------------------------------------------------------------===//
+
+bool CPPTargetMachine::addPassesToEmitWholeFile(PassManager &PM,
+ raw_ostream &o,
+ CodeGenFileType FileType,
+ CodeGenOpt::Level OptLevel) {
+ if (FileType != TargetMachine::AssemblyFile) return true;
+ PM.add(new CppWriter(o));
+ return false;
+}
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
new file mode 100644
index 0000000..db4bc0e
--- /dev/null
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -0,0 +1,44 @@
+//===-- CPPTargetMachine.h - TargetMachine for the C++ backend --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetMachine that is used by the C++ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPPTARGETMACHINE_H
+#define CPPTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+struct CPPTargetMachine : public TargetMachine {
+ const TargetData DataLayout; // Calculates type size & alignment
+
+ CPPTargetMachine(const Module &M, const std::string &FS)
+ : DataLayout(&M) {}
+
+ virtual bool WantsWholeFile() const { return true; }
+ virtual bool addPassesToEmitWholeFile(PassManager &PM, raw_ostream &Out,
+ CodeGenFileType FileType,
+ CodeGenOpt::Level OptLevel);
+
+ // This class always works, but shouldn't be the default in most cases.
+ static unsigned getModuleMatchQuality(const Module &M) { return 1; }
+
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+};
+
+} // End llvm namespace
+
+
+#endif
diff --git a/lib/Target/CppBackend/Makefile b/lib/Target/CppBackend/Makefile
new file mode 100644
index 0000000..ca7e1a8
--- /dev/null
+++ b/lib/Target/CppBackend/Makefile
@@ -0,0 +1,14 @@
+##===- lib/Target/CppBackend/Makefile --- ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMCppBackend
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts += -Wno-format
diff --git a/lib/Target/DarwinTargetAsmInfo.cpp b/lib/Target/DarwinTargetAsmInfo.cpp
new file mode 100644
index 0000000..05d2351
--- /dev/null
+++ b/lib/Target/DarwinTargetAsmInfo.cpp
@@ -0,0 +1,169 @@
+//===-- DarwinTargetAsmInfo.cpp - Darwin asm properties ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related to what form asm
+// statements should take in general on Darwin-based targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/DarwinTargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+using namespace llvm;
+
+DarwinTargetAsmInfo::DarwinTargetAsmInfo(const TargetMachine &TM)
+ : TargetAsmInfo(TM) {
+
+ CStringSection_ = getUnnamedSection("\t.cstring",
+ SectionFlags::Mergeable | SectionFlags::Strings);
+ FourByteConstantSection = getUnnamedSection("\t.literal4\n",
+ SectionFlags::Mergeable);
+ EightByteConstantSection = getUnnamedSection("\t.literal8\n",
+ SectionFlags::Mergeable);
+
+ // Note: 16-byte constant section is subtarget specific and should be provided
+ // there, if needed.
+ SixteenByteConstantSection = 0;
+
+ ReadOnlySection = getUnnamedSection("\t.const\n", SectionFlags::None);
+
+ TextCoalSection =
+ getNamedSection("\t__TEXT,__textcoal_nt,coalesced,pure_instructions",
+ SectionFlags::Code);
+ ConstTextCoalSection = getNamedSection("\t__TEXT,__const_coal,coalesced",
+ SectionFlags::None);
+ ConstDataCoalSection = getNamedSection("\t__DATA,__const_coal,coalesced",
+ SectionFlags::None);
+ ConstDataSection = getUnnamedSection(".const_data", SectionFlags::None);
+ DataCoalSection = getNamedSection("\t__DATA,__datacoal_nt,coalesced",
+ SectionFlags::Writeable);
+}
+
+/// emitUsedDirectiveFor - On Darwin, internally linked data beginning with
+/// the PrivateGlobalPrefix or the LessPrivateGlobalPrefix does not have the
+/// directive emitted (this occurs in ObjC metadata).
+
+bool
+DarwinTargetAsmInfo::emitUsedDirectiveFor(const GlobalValue* GV,
+ Mangler *Mang) const {
+ if (GV==0)
+ return false;
+ if (GV->hasLocalLinkage() && !isa<Function>(GV) &&
+ ((strlen(getPrivateGlobalPrefix()) != 0 &&
+ Mang->getValueName(GV).substr(0,strlen(getPrivateGlobalPrefix())) ==
+ getPrivateGlobalPrefix()) ||
+ (strlen(getLessPrivateGlobalPrefix()) != 0 &&
+ Mang->getValueName(GV).substr(0,strlen(getLessPrivateGlobalPrefix())) ==
+ getLessPrivateGlobalPrefix())))
+ return false;
+ return true;
+}
+
+const Section*
+DarwinTargetAsmInfo::SelectSectionForGlobal(const GlobalValue *GV) const {
+ SectionKind::Kind Kind = SectionKindForGlobal(GV);
+ bool isWeak = GV->isWeakForLinker();
+ bool isNonStatic = TM.getRelocationModel() != Reloc::Static;
+
+ switch (Kind) {
+ case SectionKind::Text:
+ if (isWeak)
+ return TextCoalSection;
+ else
+ return TextSection;
+ case SectionKind::Data:
+ case SectionKind::ThreadData:
+ case SectionKind::BSS:
+ case SectionKind::ThreadBSS:
+ if (cast<GlobalVariable>(GV)->isConstant())
+ return (isWeak ? ConstDataCoalSection : ConstDataSection);
+ else
+ return (isWeak ? DataCoalSection : DataSection);
+ case SectionKind::ROData:
+ return (isWeak ? ConstDataCoalSection :
+ (isNonStatic ? ConstDataSection : getReadOnlySection()));
+ case SectionKind::RODataMergeStr:
+ return (isWeak ?
+ ConstTextCoalSection :
+ MergeableStringSection(cast<GlobalVariable>(GV)));
+ case SectionKind::RODataMergeConst:
+ return (isWeak ?
+ ConstDataCoalSection:
+ MergeableConstSection(cast<GlobalVariable>(GV)));
+ default:
+ assert(0 && "Unsuported section kind for global");
+ }
+
+ // FIXME: Do we have any extra special weird cases?
+ return NULL;
+}
+
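+/// MergeableStringSection - Strings whose preferred alignment is at most 32
+/// go into the .cstring section; anything else stays in the read-only
+/// section.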
+const Section*
+DarwinTargetAsmInfo::MergeableStringSection(const GlobalVariable *GV) const {
+ const TargetData *TD = TM.getTargetData();
+ Constant *C = cast<GlobalVariable>(GV)->getInitializer();
+ const Type *Ty = cast<ArrayType>(C->getType())->getElementType();
+
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ if (Size) {
+ unsigned Align = TD->getPreferredAlignment(GV);
+ if (Align <= 32)
+ return getCStringSection_();
+ }
+
+ return getReadOnlySection();
+}
+
+const Section*
+DarwinTargetAsmInfo::MergeableConstSection(const GlobalVariable *GV) const {
+ Constant *C = GV->getInitializer();
+
+ return MergeableConstSection(C->getType());
+}
+
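+/// MergeableConstSection - Place 4- and 8-byte mergeable constants in the
+/// .literal4/.literal8 sections (and 16-byte ones in the subtarget-provided
+/// section, when set); anything else falls back to the read-only section.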
+inline const Section*
+DarwinTargetAsmInfo::MergeableConstSection(const Type *Ty) const {
+ const TargetData *TD = TM.getTargetData();
+
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ if (Size == 4)
+ return FourByteConstantSection;
+ else if (Size == 8)
+ return EightByteConstantSection;
+ else if (Size == 16 && SixteenByteConstantSection)
+ return SixteenByteConstantSection;
+
+ return getReadOnlySection();
+}
+
+const Section*
+DarwinTargetAsmInfo::SelectSectionForMachineConst(const Type *Ty) const {
+ const Section* S = MergeableConstSection(Ty);
+
+ // Handle a weird special case when compiling PIC code.
+ if (S == getReadOnlySection() &&
+ TM.getRelocationModel() != Reloc::Static)
+ return ConstDataSection;
+
+ return S;
+}
+
+std::string
+DarwinTargetAsmInfo::UniqueSectionForGlobal(const GlobalValue* GV,
+ SectionKind::Kind kind) const {
+ assert(0 && "Darwin does not use unique sections");
+ return "";
+}
diff --git a/lib/Target/ELFTargetAsmInfo.cpp b/lib/Target/ELFTargetAsmInfo.cpp
new file mode 100644
index 0000000..8f6e96e
--- /dev/null
+++ b/lib/Target/ELFTargetAsmInfo.cpp
@@ -0,0 +1,227 @@
+//===-- ELFTargetAsmInfo.cpp - ELF asm properties ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related to what form asm
+// statements should take in general on ELF-based targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+using namespace llvm;
+
+ELFTargetAsmInfo::ELFTargetAsmInfo(const TargetMachine &TM)
+ : TargetAsmInfo(TM) {
+
+ BSSSection_ = getUnnamedSection("\t.bss",
+ SectionFlags::Writeable | SectionFlags::BSS);
+ ReadOnlySection = getNamedSection("\t.rodata", SectionFlags::None);
+ TLSDataSection = getNamedSection("\t.tdata",
+ SectionFlags::Writeable | SectionFlags::TLS);
+ TLSBSSSection = getNamedSection("\t.tbss",
+ SectionFlags::Writeable | SectionFlags::TLS | SectionFlags::BSS);
+
+ DataRelSection = getNamedSection("\t.data.rel", SectionFlags::Writeable);
+ DataRelLocalSection = getNamedSection("\t.data.rel.local",
+ SectionFlags::Writeable);
+ DataRelROSection = getNamedSection("\t.data.rel.ro",
+ SectionFlags::Writeable);
+ DataRelROLocalSection = getNamedSection("\t.data.rel.ro.local",
+ SectionFlags::Writeable);
+}
+
+SectionKind::Kind
+ELFTargetAsmInfo::SectionKindForGlobal(const GlobalValue *GV) const {
+ SectionKind::Kind Kind = TargetAsmInfo::SectionKindForGlobal(GV);
+
+ if (Kind != SectionKind::Data)
+ return Kind;
+
+ // Decide whether we need data.rel stuff
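+ // (For example, a constant initializer referencing global symbols lands in
+ // DataRelRO, while one with only local relocations lands in
+ // DataRelROLocal.)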
+ const GlobalVariable* GVar = dyn_cast<GlobalVariable>(GV);
+ if (GVar->hasInitializer()) {
+ Constant *C = GVar->getInitializer();
+ bool isConstant = GVar->isConstant();
+ unsigned Reloc = RelocBehaviour();
+ if (Reloc != Reloc::None && C->ContainsRelocations(Reloc))
+ return (C->ContainsRelocations(Reloc::Global) ?
+ (isConstant ?
+ SectionKind::DataRelRO : SectionKind::DataRel) :
+ (isConstant ?
+ SectionKind::DataRelROLocal : SectionKind::DataRelLocal));
+ }
+
+ return Kind;
+}
+
+const Section*
+ELFTargetAsmInfo::SelectSectionForGlobal(const GlobalValue *GV) const {
+ SectionKind::Kind Kind = SectionKindForGlobal(GV);
+
+ if (const Function *F = dyn_cast<Function>(GV)) {
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::PrivateLinkage:
+ case Function::InternalLinkage:
+ case Function::DLLExportLinkage:
+ case Function::ExternalLinkage:
+ return TextSection;
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ std::string Name = UniqueSectionForGlobal(GV, Kind);
+ unsigned Flags = SectionFlagsForGlobal(GV, Name.c_str());
+ return getNamedSection(Name.c_str(), Flags);
+ }
+ } else if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) {
+ if (GVar->isWeakForLinker()) {
+ std::string Name = UniqueSectionForGlobal(GVar, Kind);
+ unsigned Flags = SectionFlagsForGlobal(GVar, Name.c_str());
+ return getNamedSection(Name.c_str(), Flags);
+ } else {
+ switch (Kind) {
+ case SectionKind::Data:
+ case SectionKind::SmallData:
+ return DataSection;
+ case SectionKind::DataRel:
+ return DataRelSection;
+ case SectionKind::DataRelLocal:
+ return DataRelLocalSection;
+ case SectionKind::DataRelRO:
+ return DataRelROSection;
+ case SectionKind::DataRelROLocal:
+ return DataRelROLocalSection;
+ case SectionKind::BSS:
+ case SectionKind::SmallBSS:
+ // ELF targets usually have BSS sections
+ return getBSSSection_();
+ case SectionKind::ROData:
+ case SectionKind::SmallROData:
+ return getReadOnlySection();
+ case SectionKind::RODataMergeStr:
+ return MergeableStringSection(GVar);
+ case SectionKind::RODataMergeConst:
+ return MergeableConstSection(GVar);
+ case SectionKind::ThreadData:
+ // ELF targets usually support TLS stuff
+ return TLSDataSection;
+ case SectionKind::ThreadBSS:
+ return TLSBSSSection;
+ default:
+ assert(0 && "Unsuported section kind for global");
+ }
+ }
+ } else
+ assert(0 && "Unsupported global");
+
+ return NULL;
+}
+
+const Section*
+ELFTargetAsmInfo::SelectSectionForMachineConst(const Type *Ty) const {
+ // FIXME: Support data.rel stuff someday
+ return MergeableConstSection(Ty);
+}
+
+const Section*
+ELFTargetAsmInfo::MergeableConstSection(const GlobalVariable *GV) const {
+ Constant *C = GV->getInitializer();
+ return MergeableConstSection(C->getType());
+}
+
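+/// MergeableConstSection - 4-, 8-, and 16-byte mergeable constants go into
+/// sized .rodata.cstN sections with a matching entity size (e.g.
+/// .rodata.cst8); anything else falls back to the read-only section.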
+inline const Section*
+ELFTargetAsmInfo::MergeableConstSection(const Type *Ty) const {
+ const TargetData *TD = TM.getTargetData();
+
+ // FIXME: the string here is temporary until this fully lands.
+ // We cannot use {Four,Eight,Sixteen}ByteConstantSection here, since it's
+ // currently directly used by asmprinter.
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ if (Size == 4 || Size == 8 || Size == 16) {
+ std::string Name = ".rodata.cst" + utostr(Size);
+
+ return getNamedSection(Name.c_str(),
+ SectionFlags::setEntitySize(SectionFlags::Mergeable,
+ Size));
+ }
+
+ return getReadOnlySection();
+}
+
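+/// MergeableStringSection - Small mergeable strings go into a section whose
+/// name encodes entity size and alignment; with a typical ".rodata.str"
+/// prefix, an i8 array with 1-byte alignment would land in ".rodata.str1.1".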
+const Section*
+ELFTargetAsmInfo::MergeableStringSection(const GlobalVariable *GV) const {
+ const TargetData *TD = TM.getTargetData();
+ Constant *C = cast<GlobalVariable>(GV)->getInitializer();
+ const Type *Ty = cast<ArrayType>(C->getType())->getElementType();
+
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ if (Size <= 16) {
+ assert(getCStringSection() && "Should have string section prefix");
+
+ // We also need alignment here
+ unsigned Align = TD->getPrefTypeAlignment(Ty);
+ if (Align < Size)
+ Align = Size;
+
+ std::string Name = getCStringSection() + utostr(Size) + '.' + utostr(Align);
+ unsigned Flags = SectionFlags::setEntitySize(SectionFlags::Mergeable |
+ SectionFlags::Strings,
+ Size);
+ return getNamedSection(Name.c_str(), Flags);
+ }
+
+ return getReadOnlySection();
+}
+
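+/// printSectionFlags - Render a SectionFlags bitmask as the assembler's
+/// section flag string; e.g. a mergeable 8-byte constant section would yield
+/// ',"aM",@progbits,8' on targets whose comment string is not '@'.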
+std::string ELFTargetAsmInfo::printSectionFlags(unsigned flags) const {
+ std::string Flags = ",\"";
+
+ if (!(flags & SectionFlags::Debug))
+ Flags += 'a';
+ if (flags & SectionFlags::Code)
+ Flags += 'x';
+ if (flags & SectionFlags::Writeable)
+ Flags += 'w';
+ if (flags & SectionFlags::Mergeable)
+ Flags += 'M';
+ if (flags & SectionFlags::Strings)
+ Flags += 'S';
+ if (flags & SectionFlags::TLS)
+ Flags += 'T';
+ if (flags & SectionFlags::Small)
+ Flags += 's';
+
+ Flags += "\",";
+
+ // If the comment string is '@' (e.g. on ARM), use '%' instead
+ if (strcmp(CommentString, "@") == 0)
+ Flags += '%';
+ else
+ Flags += '@';
+
+ // FIXME: There can be exceptions here
+ if (flags & SectionFlags::BSS)
+ Flags += "nobits";
+ else
+ Flags += "progbits";
+
+ if (unsigned entitySize = SectionFlags::getEntitySize(flags))
+ Flags += "," + utostr(entitySize);
+
+ return Flags;
+}
diff --git a/lib/Target/IA64/AsmPrinter/CMakeLists.txt b/lib/Target/IA64/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..1d552bd
--- /dev/null
+++ b/lib/Target/IA64/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,12 @@
+include_directories(
+ ${CMAKE_CURRENT_BINARY_DIR}/..
+ ${CMAKE_CURRENT_SOURCE_DIR}/..
+ )
+
+add_partially_linked_object(LLVMIA64AsmPrinter
+ IA64AsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMIA64CodeGen n)
+
+add_dependencies(LLVMIA64AsmPrinter ${n})
diff --git a/lib/Target/IA64/AsmPrinter/IA64AsmPrinter.cpp b/lib/Target/IA64/AsmPrinter/IA64AsmPrinter.cpp
new file mode 100644
index 0000000..fc54e23
--- /dev/null
+++ b/lib/Target/IA64/AsmPrinter/IA64AsmPrinter.cpp
@@ -0,0 +1,376 @@
+//===-- IA64AsmPrinter.cpp - Print out IA64 LLVM as assembly --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to assembly accepted by the GNU binutils 'gas'
+// assembler. The Intel 'ias' and HP-UX 'as' assemblers *may* choke on this
+// output, but if so that's a bug I'd like to hear about: please file a bug
+// report in bugzilla. FYI, the not too bad 'ias' assembler is bundled with
+// the Intel C/C++ compiler for Itanium Linux.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "IA64.h"
+#include "IA64TargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class IA64AsmPrinter : public AsmPrinter {
+ std::set<std::string> ExternalFunctionNames, ExternalObjectNames;
+ public:
+ explicit IA64AsmPrinter(raw_ostream &O, TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V) {}
+
+ virtual const char *getPassName() const {
+ return "IA64 Assembly Printer";
+ }
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ // This method is used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo){
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (MO.getType() == MachineOperand::MO_Register) {
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Not physref??");
+ //XXX Bug Workaround: See note in Printer::doInitialization about %.
+ O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ } else {
+ printOp(MO);
+ }
+ }
+
+ void printS8ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ int val=(unsigned int)MI->getOperand(OpNo).getImm();
+ if(val>=128) val=val-256; // sign-extend the 8-bit immediate
+ O << val;
+ }
+ void printS14ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ int val=(unsigned int)MI->getOperand(OpNo).getImm();
+ if(val>=8192) val=val-16384; // sign-extend the 14-bit immediate
+ O << val;
+ }
+ void printS22ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ int val=(unsigned int)MI->getOperand(OpNo).getImm();
+ if(val>=2097152) val=val-4194304; // sign-extend the 22-bit immediate
+ O << val;
+ }
+ void printU64ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ O << (uint64_t)MI->getOperand(OpNo).getImm();
+ }
+ void printS64ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+// XXX : nasty hack to avoid GPREL22 "relocation truncated to fit" linker
+// errors - instead of add rX = @gprel(CPI<whatever>), r1;; we now
+// emit movl rX = @gprel(CPI<whatever);;
+// add rX = rX, r1;
+// this gives us 64 bits instead of 22 (for the add long imm) to play
+// with, which shuts up the linker. The problem is that the constant
+// pool entries aren't immediates at this stage, so we check here.
+// If it's an immediate, print it the old fashioned way. If it's
+// not, we print it as a constant pool index.
+ if (MI->getOperand(OpNo).isImm()) {
+ O << (int64_t)MI->getOperand(OpNo).getImm();
+ } else { // this is a constant pool reference: FIXME: assert this
+ printOp(MI->getOperand(OpNo));
+ }
+ }
+
+ void printGlobalOperand(const MachineInstr *MI, unsigned OpNo) {
+ printOp(MI->getOperand(OpNo), false); // this is NOT a br.call instruction
+ }
+
+ void printCallOperand(const MachineInstr *MI, unsigned OpNo) {
+ printOp(MI->getOperand(OpNo), true); // this is a br.call instruction
+ }
+
+ void printMachineInstruction(const MachineInstr *MI);
+ void printOp(const MachineOperand &MO, bool isBRCALLinsn= false);
+ void printModuleLevelGV(const GlobalVariable* GVar);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+ };
+} // end of anonymous namespace
+
+
+// Include the auto-generated portion of the assembly writer.
+#include "IA64GenAsmWriter.inc"
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool IA64AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ const Function *F = MF.getFunction();
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ // Print out labels for the function.
+ EmitAlignment(5);
+ O << "\t.global\t" << CurrentFnName << '\n';
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ O << "\t.type\t" << CurrentFnName << ", @function\n";
+ O << CurrentFnName << ":\n";
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block if there are any predecessors.
+ if (!I->pred_empty()) {
+ printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ // We didn't modify anything.
+ return false;
+}
+
+void IA64AsmPrinter::printOp(const MachineOperand &MO,
+ bool isBRCALLinsn /* = false */) {
+ const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << RI.get(MO.getReg()).AsmName;
+ return;
+
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+ case MachineOperand::MO_ConstantPoolIndex: {
+ O << "@gprel(" << TAI->getPrivateGlobalPrefix()
+ << "CPI" << getFunctionNumber() << "_" << MO.getIndex() << ")";
+ return;
+ }
+
+ case MachineOperand::MO_GlobalAddress: {
+
+ // functions need @ltoff(@fptr(fn_name)) form
+ GlobalValue *GV = MO.getGlobal();
+ Function *F = dyn_cast<Function>(GV);
+
+ bool Needfptr=false; // if we're computing an address @ltoff(X), do
+ // we need to decorate it so it becomes
+ // @ltoff(@fptr(X)) ?
+ if (F && !isBRCALLinsn /*&& F->isDeclaration()*/)
+ Needfptr=true;
+
+ // if this is the target of a call instruction, we should define
+ // the function somewhere (GNU gas has no problem without this, but
+ // Intel ias rightly complains of an 'undefined symbol')
+
+ if (F /*&& isBRCALLinsn*/ && F->isDeclaration())
+ ExternalFunctionNames.insert(Mang->getValueName(MO.getGlobal()));
+ else
+ if (GV->isDeclaration()) // e.g. stuff like 'stdin'
+ ExternalObjectNames.insert(Mang->getValueName(MO.getGlobal()));
+
+ if (!isBRCALLinsn)
+ O << "@ltoff(";
+ if (Needfptr)
+ O << "@fptr(";
+ O << Mang->getValueName(MO.getGlobal());
+
+ if (Needfptr && !isBRCALLinsn)
+ O << "#))"; // close both fptr( and ltoff(
+ else {
+ if (Needfptr)
+ O << "#)"; // close only fptr(
+ if (!isBRCALLinsn)
+ O << "#)"; // close only ltoff(
+ }
+
+ int Offset = MO.getOffset();
+ if (Offset > 0)
+ O << " + " << Offset;
+ else if (Offset < 0)
+ O << " - " << -Offset;
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol:
+ O << MO.getSymbolName();
+ ExternalFunctionNames.insert(MO.getSymbolName());
+ return;
+ default:
+ O << "<AsmPrinter: unknown operand type: " << MO.getType() << " >"; return;
+ }
+}
+
+/// printMachineInstruction -- Print out a single IA64 LLVM instruction
+/// MI to the current output stream.
+///
+void IA64AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+}
+
+bool IA64AsmPrinter::doInitialization(Module &M) {
+ bool Result = AsmPrinter::doInitialization(M);
+
+ O << "\n.ident \"LLVM-ia64\"\n\n"
+ << "\t.psr lsb\n" // should be "msb" on HP-UX, for starters
+ << "\t.radix C\n"
+ << "\t.psr abi64\n"; // we only support 64 bits for now
+ return Result;
+}
+
+void IA64AsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+ return; // External globals require no code
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar))
+ return;
+
+ O << "\n\n";
+ std::string name = Mang->getValueName(GVar);
+ Constant *C = GVar->getInitializer();
+ unsigned Size = TD->getTypeAllocSize(C->getType());
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+
+ printVisibility(name, GVar->getVisibility());
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ if (C->isNullValue() && !GVar->hasSection()) {
+ if (!GVar->isThreadLocal() &&
+ (GVar->hasLocalLinkage() || GVar->isWeakForLinker())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (GVar->hasLocalLinkage()) {
+ O << "\t.lcomm " << name << "#," << Size
+ << ',' << (1 << Align);
+ O << '\n';
+ } else {
+ O << "\t.common " << name << "#," << Size
+ << ',' << (1 << Align);
+ O << '\n';
+ }
+
+ return;
+ }
+ }
+
+ switch (GVar->getLinkage()) {
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ // Nonnull linkonce -> weak
+ O << "\t.weak " << name << '\n';
+ break;
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their own, or something similar. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << TAI->getGlobalDirective() << name << '\n';
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ break;
+ case GlobalValue::GhostLinkage:
+ cerr << "GhostLinkage cannot appear in IA64AsmPrinter!\n";
+ abort();
+ case GlobalValue::DLLImportLinkage:
+ cerr << "DLLImport linkage is not supported by this target!\n";
+ abort();
+ case GlobalValue::DLLExportLinkage:
+ cerr << "DLLExport linkage is not supported by this target!\n";
+ abort();
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, GVar);
+
+ if (TAI->hasDotTypeDotSizeDirective()) {
+ O << "\t.type " << name << ",@object\n";
+ O << "\t.size " << name << ',' << Size << '\n';
+ }
+
+ O << name << ":\n";
+ EmitGlobalConstant(C);
+}
+
+
+bool IA64AsmPrinter::doFinalization(Module &M) {
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ printModuleLevelGV(I);
+
+ // we print out ".global X \n .type X, @function" for each external function
+ O << "\n\n// br.call targets referenced (and not defined) above: \n";
+ for (std::set<std::string>::iterator i = ExternalFunctionNames.begin(),
+ e = ExternalFunctionNames.end(); i!=e; ++i) {
+ O << "\t.global " << *i << "\n\t.type " << *i << ", @function\n";
+ }
+ O << "\n\n";
+
+ // we print out ".global X \n .type X, @object" for each external object
+ O << "\n\n// (external) symbols referenced (and not defined) above: \n";
+ for (std::set<std::string>::iterator i = ExternalObjectNames.begin(),
+ e = ExternalObjectNames.end(); i!=e; ++i) {
+ O << "\t.global " << *i << "\n\t.type " << *i << ", @object\n";
+ }
+ O << "\n\n";
+
+ return AsmPrinter::doFinalization(M);
+}
+
+/// createIA64CodePrinterPass - Returns a pass that prints the IA64
+/// assembly code for a MachineFunction to the given output stream, using
+/// the given target machine description.
+///
+FunctionPass *llvm::createIA64CodePrinterPass(raw_ostream &o,
+ IA64TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new IA64AsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
diff --git a/lib/Target/IA64/AsmPrinter/Makefile b/lib/Target/IA64/AsmPrinter/Makefile
new file mode 100644
index 0000000..12880f3
--- /dev/null
+++ b/lib/Target/IA64/AsmPrinter/Makefile
@@ -0,0 +1,17 @@
+##===- lib/Target/IA64/AsmPrinter/Makefile -----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMIA64AsmPrinter
+
+# Hack: we need to include 'main' IA64 target directory to grab
+# private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/IA64/CMakeLists.txt b/lib/Target/IA64/CMakeLists.txt
new file mode 100644
index 0000000..26f86ca
--- /dev/null
+++ b/lib/Target/IA64/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(LLVM_TARGET_DEFINITIONS IA64.td)
+
+tablegen(IA64GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(IA64GenRegisterNames.inc -gen-register-enums)
+tablegen(IA64GenRegisterInfo.inc -gen-register-desc)
+tablegen(IA64GenInstrNames.inc -gen-instr-enums)
+tablegen(IA64GenInstrInfo.inc -gen-instr-desc)
+tablegen(IA64GenAsmWriter.inc -gen-asm-writer)
+tablegen(IA64GenDAGISel.inc -gen-dag-isel)
+
+add_llvm_target(IA64CodeGen
+ IA64Bundling.cpp
+ IA64InstrInfo.cpp
+ IA64ISelDAGToDAG.cpp
+ IA64ISelLowering.cpp
+ IA64RegisterInfo.cpp
+ IA64Subtarget.cpp
+ IA64TargetAsmInfo.cpp
+ IA64TargetMachine.cpp
+ )
diff --git a/lib/Target/IA64/IA64.h b/lib/Target/IA64/IA64.h
new file mode 100644
index 0000000..ec8e3d6
--- /dev/null
+++ b/lib/Target/IA64/IA64.h
@@ -0,0 +1,58 @@
+//===-- IA64.h - Top-level interface for IA64 representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the IA64
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_IA64_H
+#define TARGET_IA64_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class IA64TargetMachine;
+class FunctionPass;
+class raw_ostream;
+
+/// createIA64DAGToDAGInstructionSelector - This pass converts an LLVM
+/// function into IA64 machine code in a sane, DAG->DAG transform.
+///
+FunctionPass *createIA64DAGToDAGInstructionSelector(IA64TargetMachine &TM);
+
+/// createIA64BundlingPass - This pass adds stop bits and bundles
+/// instructions.
+///
+FunctionPass *createIA64BundlingPass(IA64TargetMachine &TM);
+
+/// createIA64CodePrinterPass - Returns a pass that prints the IA64
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *createIA64CodePrinterPass(raw_ostream &o,
+ IA64TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose);
+
+} // End llvm namespace
+
+// Defines symbolic names for IA64 registers. This defines a mapping from
+// register name to register number.
+//
+#include "IA64GenRegisterNames.inc"
+
+// Defines symbolic names for the IA64 instructions.
+//
+#include "IA64GenInstrNames.inc"
+
+#endif
+
+
diff --git a/lib/Target/IA64/IA64.td b/lib/Target/IA64/IA64.td
new file mode 100644
index 0000000..c469281
--- /dev/null
+++ b/lib/Target/IA64/IA64.td
@@ -0,0 +1,39 @@
+//===-- IA64.td - Target definition file for Intel IA64 -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel IA64 architecture,
+// also known variously as ia64, IA-64, IPF, "the Itanium architecture", etc.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "IA64RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "IA64InstrInfo.td"
+
+def IA64InstrInfo : InstrInfo { }
+
+def IA64 : Target {
+ // Our instruction set
+ let InstructionSet = IA64InstrInfo;
+
+}
+
+
diff --git a/lib/Target/IA64/IA64Bundling.cpp b/lib/Target/IA64/IA64Bundling.cpp
new file mode 100644
index 0000000..3a9ba6c
--- /dev/null
+++ b/lib/Target/IA64/IA64Bundling.cpp
@@ -0,0 +1,118 @@
+//===-- IA64Bundling.cpp - IA-64 instruction bundling pass. ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Add stops where required to prevent read-after-write and write-after-write
+// dependencies, for both registers and memory addresses. There are exceptions:
+//
+// - Compare instructions (cmp*, tbit, tnat, fcmp, frcpa) are OK with
+// WAW dependencies so long as they all target p0, or are of parallel
+// type (.and*/.or*)
+//
+// FIXME: bundling, for now, is left to the assembler.
+// FIXME: this might be an appropriate place to translate between different
+// instructions that do the same thing, if this helps bundling.
+//
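+// An illustrative sketch (not from the original sources): given the
+// read-after-write dependence below, this pass inserts a stop (";;") so
+// that the two adds end up in different instruction groups:
+//
+//   add r2 = r3, r4 ;;
+//   add r5 = r2, r6
+//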
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ia64-codegen"
+#include "IA64.h"
+#include "IA64InstrInfo.h"
+#include "IA64TargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(StopBitsAdded, "Number of stop bits added");
+
+namespace {
+ struct IA64BundlingPass : public MachineFunctionPass {
+ static char ID;
+ /// Target machine description which we query for reg. names, data
+ /// layout, etc.
+ ///
+ IA64TargetMachine &TM;
+
+ IA64BundlingPass(IA64TargetMachine &tm)
+ : MachineFunctionPass(&ID), TM(tm) { }
+
+ virtual const char *getPassName() const {
+ return "IA64 (Itanium) Bundling Pass";
+ }
+
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+ bool runOnMachineFunction(MachineFunction &F) {
+ bool Changed = false;
+ for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+ FI != FE; ++FI)
+ Changed |= runOnMachineBasicBlock(*FI);
+ return Changed;
+ }
+
+ // XXX: ugly global, but pending writes can cross basic blocks. Note that
+ // taken branches end instruction groups. So we only need to worry about
+ // 'fallthrough' code
+ std::set<unsigned> PendingRegWrites;
+ };
+ char IA64BundlingPass::ID = 0;
+} // end of anonymous namespace
+
+/// createIA64BundlingPass - Returns a pass that adds STOP (;;) instructions
+/// where needed; actual bundle formation is currently left to the assembler.
+///
+FunctionPass *llvm::createIA64BundlingPass(IA64TargetMachine &tm) {
+ return new IA64BundlingPass(tm);
+}
+
+/// runOnMachineBasicBlock - add stops and bundle this MBB.
+///
+bool IA64BundlingPass::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+ MachineInstr *CurrentInsn = I++;
+ std::set<unsigned> CurrentReads, CurrentWrites, OrigWrites;
+
+ for(unsigned i=0; i < CurrentInsn->getNumOperands(); i++) {
+ MachineOperand &MO=CurrentInsn->getOperand(i);
+ if (MO.isReg()) {
+ if(MO.isUse()) { // TODO: exclude p0
+ CurrentReads.insert(MO.getReg());
+ }
+ if(MO.isDef()) { // TODO: exclude p0
+ CurrentWrites.insert(MO.getReg());
+ OrigWrites.insert(MO.getReg()); // FIXME: use a nondestructive
+ // set_intersect instead?
+ }
+ }
+ }
+
+ // CurrentReads/CurrentWrites contain info for the current instruction.
+ // Does it read or write any registers that are pending a write?
+ // (i.e. not separated by a stop)
+ set_intersect(CurrentReads, PendingRegWrites);
+ set_intersect(CurrentWrites, PendingRegWrites);
+
+ if(! (CurrentReads.empty() && CurrentWrites.empty()) ) {
+ // there is a conflict, insert a stop and reset PendingRegWrites
+ CurrentInsn = BuildMI(MBB, CurrentInsn, CurrentInsn->getDebugLoc(),
+ TM.getInstrInfo()->get(IA64::STOP), 0);
+ PendingRegWrites=OrigWrites; // carry over current writes to next insn
+ Changed=true; StopBitsAdded++; // update stats
+ } else { // otherwise, track additional pending writes
+ set_union(PendingRegWrites, OrigWrites);
+ }
+ } // onto the next insn in the MBB
+
+ return Changed;
+}
+
diff --git a/lib/Target/IA64/IA64ISelDAGToDAG.cpp b/lib/Target/IA64/IA64ISelDAGToDAG.cpp
new file mode 100644
index 0000000..9800c50
--- /dev/null
+++ b/lib/Target/IA64/IA64ISelDAGToDAG.cpp
@@ -0,0 +1,575 @@
+//===---- IA64ISelDAGToDAG.cpp - IA64 pattern matching inst selector ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for IA64,
+// converting a legalized dag to an IA64 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ia64-codegen"
+#include "IA64.h"
+#include "IA64TargetMachine.h"
+#include "IA64ISelLowering.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// IA64DAGToDAGISel - IA64 specific code to select IA64 machine
+ /// instructions for SelectionDAG operations.
+ ///
+ class IA64DAGToDAGISel : public SelectionDAGISel {
+ unsigned GlobalBaseReg;
+ public:
+ explicit IA64DAGToDAGISel(IA64TargetMachine &TM)
+ : SelectionDAGISel(TM) {}
+
+ virtual bool runOnFunction(Function &Fn) {
+ // Make sure we re-emit a set of the global base reg if necessary
+ GlobalBaseReg = 0;
+ return SelectionDAGISel::runOnFunction(Fn);
+ }
+
+ /// getI64Imm - Return a target constant with the specified value, of type
+ /// i64.
+ inline SDValue getI64Imm(uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i64);
+ }
+
+ /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+ /// base register. Return the virtual register that holds this value.
+ // SDValue getGlobalBaseReg(); TODO: hmm
+
+ // Select - Convert the specified operand from a target-independent to a
+ // target-specific node if it hasn't already been changed.
+ SDNode *Select(SDValue N);
+
+ SDNode *SelectIntImmediateExpr(SDValue LHS, SDValue RHS,
+ unsigned OCHi, unsigned OCLo,
+ bool IsArithmetic = false,
+ bool Negate = false);
+ SDNode *SelectBitfieldInsert(SDNode *N);
+
+ /// SelectCC - Select a comparison of the specified values with the
+ /// specified condition code, returning the CR# of the expression.
+ SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC);
+
+ /// SelectAddr - Given the specified address, return the two operands for a
+ /// load/store instruction, and return true if it should be an indexed [r+r]
+ /// operation.
+ bool SelectAddr(SDValue Addr, SDValue &Op1, SDValue &Op2);
+
+ /// InstructionSelect - This callback is invoked by
+ /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+ virtual void InstructionSelect();
+
+ virtual const char *getPassName() const {
+ return "IA64 (Itanium) DAG->DAG Instruction Selector";
+ }
+
+// Include the pieces autogenerated from the target description.
+#include "IA64GenDAGISel.inc"
+
+private:
+ SDNode *SelectDIV(SDValue Op);
+ };
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void IA64DAGToDAGISel::InstructionSelect() {
+ DEBUG(BB->dump());
+
+ // Select target instructions for the DAG.
+ SelectRoot(*CurDAG);
+ CurDAG->RemoveDeadNodes();
+}
+
+SDNode *IA64DAGToDAGISel::SelectDIV(SDValue Op) {
+ SDNode *N = Op.getNode();
+ SDValue Chain = N->getOperand(0);
+ SDValue Tmp1 = N->getOperand(0);
+ SDValue Tmp2 = N->getOperand(1);
+ DebugLoc dl = N->getDebugLoc();
+
+ bool isFP=false;
+
+ if(Tmp1.getValueType().isFloatingPoint())
+ isFP=true;
+
+ bool isModulus=false; // is it a division or a modulus?
+ bool isSigned=false;
+
+ switch(N->getOpcode()) {
+ case ISD::FDIV:
+ case ISD::SDIV: isModulus=false; isSigned=true; break;
+ case ISD::UDIV: isModulus=false; isSigned=false; break;
+ case ISD::FREM:
+ case ISD::SREM: isModulus=true; isSigned=true; break;
+ case ISD::UREM: isModulus=true; isSigned=false; break;
+ }
+
+ // TODO: check for integer divides by powers of 2 (or other simple patterns?)
+
+ SDValue TmpPR, TmpPR2;
+ SDValue TmpF1, TmpF2, TmpF3, TmpF4, TmpF5, TmpF6, TmpF7, TmpF8;
+ SDValue TmpF9, TmpF10,TmpF11,TmpF12,TmpF13,TmpF14,TmpF15;
+ SDNode *Result;
+
+ // we'll need copies of F0 and F1
+ SDValue F0 = CurDAG->getRegister(IA64::F0, MVT::f64);
+ SDValue F1 = CurDAG->getRegister(IA64::F1, MVT::f64);
+
+ // OK, emit some code:
+
+ if(!isFP) {
+ // first, load the inputs into FP regs.
+ TmpF1 =
+ SDValue(CurDAG->getTargetNode(IA64::SETFSIG, dl, MVT::f64, Tmp1), 0);
+ Chain = TmpF1.getValue(1);
+ TmpF2 =
+ SDValue(CurDAG->getTargetNode(IA64::SETFSIG, dl, MVT::f64, Tmp2), 0);
+ Chain = TmpF2.getValue(1);
+
+ // next, convert the inputs to FP
+ if(isSigned) {
+ TmpF3 =
+ SDValue(CurDAG->getTargetNode(IA64::FCVTXF, dl, MVT::f64, TmpF1), 0);
+ Chain = TmpF3.getValue(1);
+ TmpF4 =
+ SDValue(CurDAG->getTargetNode(IA64::FCVTXF, dl, MVT::f64, TmpF2), 0);
+ Chain = TmpF4.getValue(1);
+ } else { // is unsigned
+ TmpF3 =
+ SDValue(CurDAG->getTargetNode(IA64::FCVTXUFS1, dl, MVT::f64, TmpF1),
+ 0);
+ Chain = TmpF3.getValue(1);
+ TmpF4 =
+ SDValue(CurDAG->getTargetNode(IA64::FCVTXUFS1, dl, MVT::f64, TmpF2),
+ 0);
+ Chain = TmpF4.getValue(1);
+ }
+
+ } else { // this is an FP divide/remainder, so we 'leak' some temp
+ // regs and assign TmpF3=Tmp1, TmpF4=Tmp2
+ TmpF3=Tmp1;
+ TmpF4=Tmp2;
+ }
+
+ // we start by computing an approximate reciprocal with frcpa (accurate
+ // to roughly 8.9 bits)
+ // note, this instruction writes _both_ TmpF5 (answer) and TmpPR (predicate)
+ if(isFP)
+ TmpF5 = SDValue(CurDAG->getTargetNode(IA64::FRCPAS0, dl, MVT::f64,
+ MVT::i1, TmpF3, TmpF4), 0);
+ else
+ TmpF5 = SDValue(CurDAG->getTargetNode(IA64::FRCPAS1, dl, MVT::f64,
+ MVT::i1, TmpF3, TmpF4), 0);
+
+ TmpPR = TmpF5.getValue(1);
+ Chain = TmpF5.getValue(2);
+
+ SDValue minusB;
+ if(isModulus) { // for remainders, it'll be handy to have
+ // copies of -input_b
+ minusB = SDValue(CurDAG->getTargetNode(IA64::SUB, dl, MVT::i64,
+ CurDAG->getRegister(IA64::r0, MVT::i64), Tmp2), 0);
+ Chain = minusB.getValue(1);
+ }
+
+ SDValue TmpE0, TmpY1, TmpE1, TmpY2;
+
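+ // The FMA chain below is the Newton-Raphson refinement of the frcpa
+ // seed from Intel application note #245415; as an editorial sketch (not
+ // part of the original comments), with b the divisor and y0 = frcpa(b):
+ //   e0 = 1 - b*y0
+ //   y1 = y0 + e0*y0
+ //   e1 = e0*e0
+ //   y2 = y1 + e1*y1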
+ SDValue OpsE0[] = { TmpF4, TmpF5, F1, TmpPR };
+ TmpE0 = SDValue(CurDAG->getTargetNode(IA64::CFNMAS1, dl, MVT::f64,
+ OpsE0, 4), 0);
+ Chain = TmpE0.getValue(1);
+ SDValue OpsY1[] = { TmpF5, TmpE0, TmpF5, TmpPR };
+ TmpY1 = SDValue(CurDAG->getTargetNode(IA64::CFMAS1, dl, MVT::f64,
+ OpsY1, 4), 0);
+ Chain = TmpY1.getValue(1);
+ SDValue OpsE1[] = { TmpE0, TmpE0, F0, TmpPR };
+ TmpE1 = SDValue(CurDAG->getTargetNode(IA64::CFMAS1, dl, MVT::f64,
+ OpsE1, 4), 0);
+ Chain = TmpE1.getValue(1);
+ SDValue OpsY2[] = { TmpY1, TmpE1, TmpY1, TmpPR };
+ TmpY2 = SDValue(CurDAG->getTargetNode(IA64::CFMAS1, dl, MVT::f64,
+ OpsY2, 4), 0);
+ Chain = TmpY2.getValue(1);
+
+ if(isFP) { // if this is an FP divide, we finish up here and exit early
+ if(isModulus)
+ assert(0 && "Sorry, try another FORTRAN compiler.");
+
+ SDValue TmpE2, TmpY3, TmpQ0, TmpR0;
+
+ SDValue OpsE2[] = { TmpE1, TmpE1, F0, TmpPR };
+ TmpE2 = SDValue(CurDAG->getTargetNode(IA64::CFMAS1, dl, MVT::f64,
+ OpsE2, 4), 0);
+ Chain = TmpE2.getValue(1);
+ SDValue OpsY3[] = { TmpY2, TmpE2, TmpY2, TmpPR };
+ TmpY3 = SDValue(CurDAG->getTargetNode(IA64::CFMAS1, dl, MVT::f64,
+ OpsY3, 4), 0);
+ Chain = TmpY3.getValue(1);
+ SDValue OpsQ0[] = { Tmp1, TmpY3, F0, TmpPR };
+ TmpQ0 =
+ SDValue(CurDAG->getTargetNode(IA64::CFMADS1, dl, // double prec!
+ MVT::f64, OpsQ0, 4), 0);
+ Chain = TmpQ0.getValue(1);
+ SDValue OpsR0[] = { Tmp2, TmpQ0, Tmp1, TmpPR };
+ TmpR0 =
+ SDValue(CurDAG->getTargetNode(IA64::CFNMADS1, dl, // double prec!
+ MVT::f64, OpsR0, 4), 0);
+ Chain = TmpR0.getValue(1);
+
+// we want Result to have the same target register as the frcpa, so
+// we two-address hack it. See the comment "for this to work..." on
+// page 48 of Intel application note #245415
+ SDValue Ops[] = { TmpF5, TmpY3, TmpR0, TmpQ0, TmpPR };
+ Result = CurDAG->getTargetNode(IA64::TCFMADS0, dl, // d.p. s0 rndg!
+ MVT::f64, Ops, 5);
+ Chain = SDValue(Result, 1);
+ return Result; // XXX: early exit!
+ } else { // this is *not* an FP divide, so there's a bit left to do:
+
+ SDValue TmpQ2, TmpR2, TmpQ3, TmpQ;
+
+ SDValue OpsQ2[] = { TmpF3, TmpY2, F0, TmpPR };
+ TmpQ2 = SDValue(CurDAG->getTargetNode(IA64::CFMAS1, dl, MVT::f64,
+ OpsQ2, 4), 0);
+ Chain = TmpQ2.getValue(1);
+ SDValue OpsR2[] = { TmpF4, TmpQ2, TmpF3, TmpPR };
+ TmpR2 = SDValue(CurDAG->getTargetNode(IA64::CFNMAS1, dl, MVT::f64,
+ OpsR2, 4), 0);
+ Chain = TmpR2.getValue(1);
+
+// we want TmpQ3 to have the same target register as the frcpa? maybe we
+// should two-address hack it. See the comment "for this to work..." on page
+// 48 of Intel application note #245415
+ SDValue OpsQ3[] = { TmpF5, TmpR2, TmpY2, TmpQ2, TmpPR };
+ TmpQ3 = SDValue(CurDAG->getTargetNode(IA64::TCFMAS1, dl, MVT::f64,
+ OpsQ3, 5), 0);
+ Chain = TmpQ3.getValue(1);
+
+ // STORY: without these two-address instructions (TCFMAS1 and TCFMADS0)
+ // the FPSWA won't be able to help out in the case of large/tiny
+ // arguments. Other fun bugs may also appear, e.g. 0/x = x, not 0.
+
+ if(isSigned)
+ TmpQ = SDValue(CurDAG->getTargetNode(IA64::FCVTFXTRUNCS1, dl,
+ MVT::f64, TmpQ3), 0);
+ else
+ TmpQ = SDValue(CurDAG->getTargetNode(IA64::FCVTFXUTRUNCS1, dl,
+ MVT::f64, TmpQ3), 0);
+
+ Chain = TmpQ.getValue(1);
+
+ if(isModulus) {
+ SDValue FPminusB =
+ SDValue(CurDAG->getTargetNode(IA64::SETFSIG, dl, MVT::f64, minusB),
+ 0);
+ Chain = FPminusB.getValue(1);
+ SDValue Remainder =
+ SDValue(CurDAG->getTargetNode(IA64::XMAL, dl, MVT::f64,
+ TmpQ, FPminusB, TmpF1), 0);
+ Chain = Remainder.getValue(1);
+ Result = CurDAG->getTargetNode(IA64::GETFSIG, dl, MVT::i64, Remainder);
+ Chain = SDValue(Result, 1);
+ } else { // just an integer divide
+ Result = CurDAG->getTargetNode(IA64::GETFSIG, dl, MVT::i64, TmpQ);
+ Chain = SDValue(Result, 1);
+ }
+
+ return Result;
+ } // wasn't an FP divide
+}
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+SDNode *IA64DAGToDAGISel::Select(SDValue Op) {
+ SDNode *N = Op.getNode();
+ if (N->isMachineOpcode())
+ return NULL; // Already selected.
+ DebugLoc dl = Op.getDebugLoc();
+
+ switch (N->getOpcode()) {
+ default: break;
+
+ case IA64ISD::BRCALL: { // XXX: this is also a hack!
+ SDValue Chain = N->getOperand(0);
+ SDValue InFlag; // Null incoming flag value.
+
+ if(N->getNumOperands()==3) { // we have an incoming chain, callee and flag
+ InFlag = N->getOperand(2);
+ }
+
+ unsigned CallOpcode;
+ SDValue CallOperand;
+
+ // if we can call directly, do so
+ if (GlobalAddressSDNode *GASD =
+ dyn_cast<GlobalAddressSDNode>(N->getOperand(1))) {
+ CallOpcode = IA64::BRCALL_IPREL_GA;
+ CallOperand = CurDAG->getTargetGlobalAddress(GASD->getGlobal(), MVT::i64);
+ } else if (isa<ExternalSymbolSDNode>(N->getOperand(1))) {
+ // FIXME: we currently NEED this case for correctness, to avoid
+ // "non-pic code with imm reloc.n against dynamic symbol" errors
+ CallOpcode = IA64::BRCALL_IPREL_ES;
+ CallOperand = N->getOperand(1);
+ } else {
+ // otherwise we need to load the function descriptor,
+ // load the branch target (function)'s entry point and GP,
+ // branch (call) then restore the GP
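+ // (an IA64 function descriptor is a pair of 64-bit words: the entry
+ // point at [fd] and the callee's GP at [fd+8], which is exactly what
+ // the two LD8s below pick apart)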
+ SDValue FnDescriptor = N->getOperand(1);
+
+ // load the branch target's entry point [mem] and
+ // GP value [mem+8]
+ SDValue targetEntryPoint=
+ SDValue(CurDAG->getTargetNode(IA64::LD8, dl, MVT::i64, MVT::Other,
+ FnDescriptor, CurDAG->getEntryNode()), 0);
+ Chain = targetEntryPoint.getValue(1);
+ SDValue targetGPAddr=
+ SDValue(CurDAG->getTargetNode(IA64::ADDS, dl, MVT::i64,
+ FnDescriptor,
+ CurDAG->getConstant(8, MVT::i64)), 0);
+ Chain = targetGPAddr.getValue(1);
+ SDValue targetGP =
+ SDValue(CurDAG->getTargetNode(IA64::LD8, dl, MVT::i64,MVT::Other,
+ targetGPAddr, CurDAG->getEntryNode()), 0);
+ Chain = targetGP.getValue(1);
+
+ Chain = CurDAG->getCopyToReg(Chain, dl, IA64::r1, targetGP, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = CurDAG->getCopyToReg(Chain, dl, IA64::B6,
+ targetEntryPoint, InFlag); // FLAG these?
+ InFlag = Chain.getValue(1);
+
+ CallOperand = CurDAG->getRegister(IA64::B6, MVT::i64);
+ CallOpcode = IA64::BRCALL_INDIRECT;
+ }
+
+ // Finally, once everything is setup, emit the call itself
+ if (InFlag.getNode())
+ Chain = SDValue(CurDAG->getTargetNode(CallOpcode, dl, MVT::Other,
+ MVT::Flag, CallOperand, InFlag), 0);
+ else // there might be no arguments
+ Chain = SDValue(CurDAG->getTargetNode(CallOpcode, dl, MVT::Other,
+ MVT::Flag, CallOperand, Chain), 0);
+ InFlag = Chain.getValue(1);
+
+ std::vector<SDValue> CallResults;
+
+ CallResults.push_back(Chain);
+ CallResults.push_back(InFlag);
+
+ for (unsigned i = 0, e = CallResults.size(); i != e; ++i)
+ ReplaceUses(Op.getValue(i), CallResults[i]);
+ return NULL;
+ }
+
+ case IA64ISD::GETFD: {
+ SDValue Input = N->getOperand(0);
+ return CurDAG->getTargetNode(IA64::GETFD, dl, MVT::i64, Input);
+ }
+
+ case ISD::FDIV:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ return SelectDIV(Op);
+
+ case ISD::TargetConstantFP: {
+ SDValue Chain = CurDAG->getEntryNode(); // this is a constant, so..
+
+ SDValue V;
+ ConstantFPSDNode* N2 = cast<ConstantFPSDNode>(N);
+ if (N2->getValueAPF().isPosZero()) {
+ V = CurDAG->getCopyFromReg(Chain, dl, IA64::F0, MVT::f64);
+ } else if (N2->isExactlyValue(N2->getValueType(0) == MVT::f32 ?
+ APFloat(+1.0f) : APFloat(+1.0))) {
+ V = CurDAG->getCopyFromReg(Chain, dl, IA64::F1, MVT::f64);
+ } else
+ assert(0 && "Unexpected FP constant!");
+
+ ReplaceUses(SDValue(N, 0), V);
+ return 0;
+ }
+
+ case ISD::FrameIndex: { // TODO: reduce creepiness
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ if (N->hasOneUse())
+ return CurDAG->SelectNodeTo(N, IA64::MOV, MVT::i64,
+ CurDAG->getTargetFrameIndex(FI, MVT::i64));
+ else
+ return CurDAG->getTargetNode(IA64::MOV, dl, MVT::i64,
+ CurDAG->getTargetFrameIndex(FI, MVT::i64));
+ }
+
+ case ISD::ConstantPool: { // TODO: nuke the constant pool
+ // (ia64 doesn't need one)
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+ Constant *C = CP->getConstVal();
+ SDValue CPI = CurDAG->getTargetConstantPool(C, MVT::i64,
+ CP->getAlignment());
+ return CurDAG->getTargetNode(IA64::ADDL_GA, dl, MVT::i64, // ?
+ CurDAG->getRegister(IA64::r1, MVT::i64), CPI);
+ }
+
+ case ISD::GlobalAddress: {
+ GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
+ SDValue GA = CurDAG->getTargetGlobalAddress(GV, MVT::i64);
+ SDValue Tmp =
+ SDValue(CurDAG->getTargetNode(IA64::ADDL_GA, dl, MVT::i64,
+ CurDAG->getRegister(IA64::r1,
+ MVT::i64), GA), 0);
+ return CurDAG->getTargetNode(IA64::LD8, dl, MVT::i64, MVT::Other, Tmp,
+ CurDAG->getEntryNode());
+ }
+
+/* XXX
+ case ISD::ExternalSymbol: {
+ SDValue EA = CurDAG->getTargetExternalSymbol(
+ cast<ExternalSymbolSDNode>(N)->getSymbol(),
+ MVT::i64);
+ SDValue Tmp = CurDAG->getTargetNode(IA64::ADDL_EA, dl, MVT::i64,
+ CurDAG->getRegister(IA64::r1,
+ MVT::i64),
+ EA);
+ return CurDAG->getTargetNode(IA64::LD8, dl, MVT::i64, Tmp);
+ }
+*/
+
+ case ISD::LOAD: { // FIXME: load -1, not 1, for bools?
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDValue Chain = LD->getChain();
+ SDValue Address = LD->getBasePtr();
+
+ MVT TypeBeingLoaded = LD->getMemoryVT();
+ unsigned Opc;
+ switch (TypeBeingLoaded.getSimpleVT()) {
+ default:
+#ifndef NDEBUG
+ N->dump(CurDAG);
+#endif
+ assert(0 && "Cannot load this type!");
+ case MVT::i1: { // this is a bool
+ Opc = IA64::LD1; // first we load a byte, then compare for != 0
+ if(N->getValueType(0) == MVT::i1) { // XXX: early exit!
+ return CurDAG->SelectNodeTo(N, IA64::CMPNE, MVT::i1, MVT::Other,
+ SDValue(CurDAG->getTargetNode(Opc, dl,
+ MVT::i64,
+ Address), 0),
+ CurDAG->getRegister(IA64::r0, MVT::i64),
+ Chain);
+ }
+ /* otherwise, we want to load a bool into something bigger: LD1
+ will do that for us, so we just fall through */
+ }
+ case MVT::i8: Opc = IA64::LD1; break;
+ case MVT::i16: Opc = IA64::LD2; break;
+ case MVT::i32: Opc = IA64::LD4; break;
+ case MVT::i64: Opc = IA64::LD8; break;
+
+ case MVT::f32: Opc = IA64::LDF4; break;
+ case MVT::f64: Opc = IA64::LDF8; break;
+ }
+
+ // TODO: comment this
+ return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), MVT::Other,
+ Address, Chain);
+ }
+
+ case ISD::STORE: {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Address = ST->getBasePtr();
+ SDValue Chain = ST->getChain();
+
+ unsigned Opc;
+ if (ISD::isNON_TRUNCStore(N)) {
+ switch (N->getOperand(1).getValueType().getSimpleVT()) {
+ default: assert(0 && "unknown type in store");
+ case MVT::i1: { // this is a bool
+ Opc = IA64::ST1; // we store either 0 or 1 as a byte
+ // first load zero!
+ SDValue Initial = CurDAG->getCopyFromReg(Chain, dl, IA64::r0, MVT::i64);
+ Chain = Initial.getValue(1);
+ // then load 1 into the same reg iff the predicate to store is 1
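+ // roughly, in assembly (illustrative, with a hypothetical scratch
+ // register rX and predicate p):
+ //   mov  rX = r0         // rX = 0
+ //   (p)  adds rX = 1, r0 // rX = 1 iff p is set
+ //   st1  [addr] = rX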
+ SDValue Tmp = ST->getValue();
+ Tmp =
+ SDValue(CurDAG->getTargetNode(IA64::TPCADDS, dl, MVT::i64, Initial,
+ CurDAG->getTargetConstant(1,
+ MVT::i64),
+ Tmp), 0);
+ return CurDAG->SelectNodeTo(N, Opc, MVT::Other, Address, Tmp, Chain);
+ }
+ case MVT::i64: Opc = IA64::ST8; break;
+ case MVT::f64: Opc = IA64::STF8; break;
+ }
+ } else { // Truncating store
+ switch(ST->getMemoryVT().getSimpleVT()) {
+ default: assert(0 && "unknown type in truncstore");
+ case MVT::i8: Opc = IA64::ST1; break;
+ case MVT::i16: Opc = IA64::ST2; break;
+ case MVT::i32: Opc = IA64::ST4; break;
+ case MVT::f32: Opc = IA64::STF4; break;
+ }
+ }
+
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ return CurDAG->SelectNodeTo(N, Opc, MVT::Other, N2, N1, Chain);
+ }
+
+ case ISD::BRCOND: {
+ SDValue Chain = N->getOperand(0);
+ SDValue CC = N->getOperand(1);
+ MachineBasicBlock *Dest =
+ cast<BasicBlockSDNode>(N->getOperand(2))->getBasicBlock();
+ //FIXME - we do NOT need long branches all the time
+ return CurDAG->SelectNodeTo(N, IA64::BRLCOND_NOTCALL, MVT::Other, CC,
+ CurDAG->getBasicBlock(Dest), Chain);
+ }
+
+ case ISD::CALLSEQ_START:
+ case ISD::CALLSEQ_END: {
+ int64_t Amt = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned Opc = N->getOpcode() == ISD::CALLSEQ_START ?
+ IA64::ADJUSTCALLSTACKDOWN : IA64::ADJUSTCALLSTACKUP;
+ SDValue N0 = N->getOperand(0);
+ return CurDAG->SelectNodeTo(N, Opc, MVT::Other, getI64Imm(Amt), N0);
+ }
+
+ case ISD::BR:
+ // FIXME: we don't need long branches all the time!
+ SDValue N0 = N->getOperand(0);
+ return CurDAG->SelectNodeTo(N, IA64::BRL_NOTCALL, MVT::Other,
+ N->getOperand(1), N0);
+ }
+
+ return SelectCode(Op);
+}
+
+
+/// createIA64DAGToDAGInstructionSelector - This pass converts a legalized DAG
+/// into an IA64-specific DAG, ready for instruction scheduling.
+///
+FunctionPass
+*llvm::createIA64DAGToDAGInstructionSelector(IA64TargetMachine &TM) {
+ return new IA64DAGToDAGISel(TM);
+}
+
diff --git a/lib/Target/IA64/IA64ISelLowering.cpp b/lib/Target/IA64/IA64ISelLowering.cpp
new file mode 100644
index 0000000..34a0686
--- /dev/null
+++ b/lib/Target/IA64/IA64ISelLowering.cpp
@@ -0,0 +1,622 @@
+//===-- IA64ISelLowering.cpp - IA64 DAG Lowering Implementation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64ISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64ISelLowering.h"
+#include "IA64MachineFunctionInfo.h"
+#include "IA64TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+using namespace llvm;
+
+IA64TargetLowering::IA64TargetLowering(TargetMachine &TM)
+ : TargetLowering(TM) {
+
+ // register class for general registers
+ addRegisterClass(MVT::i64, IA64::GRRegisterClass);
+
+ // register class for FP registers
+ addRegisterClass(MVT::f64, IA64::FPRegisterClass);
+
+ // register class for predicate registers
+ addRegisterClass(MVT::i1, IA64::PRRegisterClass);
+
+ setLoadExtAction(ISD::EXTLOAD , MVT::i1 , Promote);
+
+ setLoadExtAction(ISD::ZEXTLOAD , MVT::i1 , Promote);
+
+ setLoadExtAction(ISD::SEXTLOAD , MVT::i1 , Promote);
+ setLoadExtAction(ISD::SEXTLOAD , MVT::i8 , Expand);
+ setLoadExtAction(ISD::SEXTLOAD , MVT::i16 , Expand);
+ setLoadExtAction(ISD::SEXTLOAD , MVT::i32 , Expand);
+
+ setOperationAction(ISD::BRIND , MVT::Other, Expand);
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC , MVT::Other, Expand);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
+
+ // ia64 uses SELECT not SELECT_CC
+ setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
+
+ // We need to handle ISD::RET for void functions ourselves,
+ // so we get a chance to restore ar.pfs before adding a
+ // br.ret insn
+ setOperationAction(ISD::RET, MVT::Other, Custom);
+
+ setShiftAmountType(MVT::i64);
+
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+
+ setOperationAction(ISD::UREM , MVT::f32 , Expand);
+ setOperationAction(ISD::UREM , MVT::f64 , Expand);
+
+ setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);
+
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
+
+ // We don't support sin/cos/sqrt/pow
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+ // FIXME: IA64 supports fcopysign natively!
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ // We don't have line number support yet.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+ setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+ // IA64 has ctlz in the form of the 'fnorm' instruction. The Legalizer
+ // expansion for ctlz/cttz in terms of ctpop is much larger, but lower
+ // latency.
+ // FIXME: Custom lower CTLZ when compiling for size?
+ setOperationAction(ISD::CTLZ , MVT::i64 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
+ setOperationAction(ISD::ROTL , MVT::i64 , Expand);
+ setOperationAction(ISD::ROTR , MVT::i64 , Expand);
+
+ // FIXME: IA64 has this, but is not implemented. should be mux @rev
+ setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VAARG , MVT::Other, Custom);
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+
+ // Use the default implementation.
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+ // Thread Local Storage
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+
+ setStackPointerRegisterToSaveRestore(IA64::r12);
+
+ setJumpBufSize(704); // on ia64-linux, jmp_bufs are 704 bytes..
+ setJumpBufAlignment(16); // ...and must be 16-byte aligned
+
+ computeRegisterProperties();
+
+ addLegalFPImmediate(APFloat(+0.0));
+ addLegalFPImmediate(APFloat(-0.0));
+ addLegalFPImmediate(APFloat(+1.0));
+ addLegalFPImmediate(APFloat(-1.0));
+}
+
+const char *IA64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return 0;
+ case IA64ISD::GETFD: return "IA64ISD::GETFD";
+ case IA64ISD::BRCALL: return "IA64ISD::BRCALL";
+ case IA64ISD::RET_FLAG: return "IA64ISD::RET_FLAG";
+ }
+}
+
+MVT IA64TargetLowering::getSetCCResultType(MVT VT) const {
+ return MVT::i1;
+}
+
+void IA64TargetLowering::LowerArguments(Function &F, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &ArgValues,
+ DebugLoc dl) {
+ //
+ // add beautiful description of IA64 stack frame format
+ // here (from intel 24535803.pdf most likely)
+ //
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ GP = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ SP = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ RP = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+
+ MachineBasicBlock& BB = MF.front();
+
+ unsigned args_int[] = {IA64::r32, IA64::r33, IA64::r34, IA64::r35,
+ IA64::r36, IA64::r37, IA64::r38, IA64::r39};
+
+ unsigned args_FP[] = {IA64::F8, IA64::F9, IA64::F10, IA64::F11,
+ IA64::F12,IA64::F13,IA64::F14, IA64::F15};
+
+ unsigned argVreg[8];
+ unsigned argPreg[8];
+ unsigned argOpc[8];
+
+ unsigned used_FPArgs = 0; // how many FP args have been used so far?
+
+ unsigned ArgOffset = 0;
+ int count = 0;
+
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+ {
+ SDValue newroot, argt;
+ if(count < 8) { // need to fix this logic? maybe.
+
+ switch (getValueType(I->getType()).getSimpleVT()) {
+ default:
+ assert(0 && "ERROR in LowerArgs: can't lower this type of arg.\n");
+ case MVT::f32:
+ // fixme? (well, will need to for weird FP structy stuff,
+ // see intel ABI docs)
+ case MVT::f64:
+//XXX BuildMI(&BB, IA64::IDEF, 0, args_FP[used_FPArgs]);
+ MF.getRegInfo().addLiveIn(args_FP[used_FPArgs]);
+ // mark this reg as liveIn
+ // floating point args go into f8..f15 as needed (hence the
+ // used_FPArgs++ below):
+ argVreg[count] =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::f64));
+ argPreg[count] = args_FP[used_FPArgs++];
+ argOpc[count] = IA64::FMOV;
+ argt = newroot = DAG.getCopyFromReg(DAG.getRoot(), dl,
+ argVreg[count], MVT::f64);
+ if (I->getType() == Type::FloatTy)
+ argt = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, argt,
+ DAG.getIntPtrConstant(0));
+ break;
+ case MVT::i1: // NOTE: as far as C abi stuff goes,
+ // bools are just boring old ints
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+//XXX BuildMI(&BB, IA64::IDEF, 0, args_int[count]);
+ MF.getRegInfo().addLiveIn(args_int[count]);
+ // mark this register as liveIn
+ argVreg[count] =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ argPreg[count] = args_int[count];
+ argOpc[count] = IA64::MOV;
+ argt = newroot =
+ DAG.getCopyFromReg(DAG.getRoot(), dl, argVreg[count], MVT::i64);
+ if ( getValueType(I->getType()) != MVT::i64)
+ argt = DAG.getNode(ISD::TRUNCATE, dl, getValueType(I->getType()),
+ newroot);
+ break;
+ }
+ } else { // more than 8 args go into the frame
+ // Create the frame index object for this incoming parameter...
+ ArgOffset = 16 + 8 * (count - 8);
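+ // (e.g. the 9th argument, count == 8, lands at offset 16; the 10th at 24)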
+ int FI = MFI->CreateFixedObject(8, ArgOffset);
+
+ // Create the SelectionDAG nodes corresponding to a load
+ // from this parameter
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i64);
+ argt = newroot = DAG.getLoad(getValueType(I->getType()), dl,
+ DAG.getEntryNode(), FIN, NULL, 0);
+ }
+ ++count;
+ DAG.setRoot(newroot.getValue(1));
+ ArgValues.push_back(argt);
+ }
+
+
+ // Create a vreg to hold the output of (what will become)
+ // the "alloc" instruction
+ VirtGPR = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ BuildMI(&BB, dl, TII->get(IA64::PSEUDO_ALLOC), VirtGPR);
+ // we create a PSEUDO_ALLOC (pseudo)instruction for now
+/*
+ BuildMI(&BB, IA64::IDEF, 0, IA64::r1);
+
+ // hmm:
+ BuildMI(&BB, IA64::IDEF, 0, IA64::r12);
+ BuildMI(&BB, IA64::IDEF, 0, IA64::rp);
+ // ..hmm.
+
+ BuildMI(&BB, IA64::MOV, 1, GP).addReg(IA64::r1);
+
+ // hmm:
+ BuildMI(&BB, IA64::MOV, 1, SP).addReg(IA64::r12);
+ BuildMI(&BB, IA64::MOV, 1, RP).addReg(IA64::rp);
+ // ..hmm.
+*/
+
+ unsigned tempOffset=0;
+
+ // if this is a varargs function, we simply lower llvm.va_start by
+ // pointing to the first entry
+ if(F.isVarArg()) {
+ tempOffset=0;
+ VarArgsFrameIndex = MFI->CreateFixedObject(8, tempOffset);
+ }
+
+ // here we actually do the moving of args, and store them to the stack
+ // too if this is a varargs function:
+ for (int i = 0; i < count && i < 8; ++i) {
+ BuildMI(&BB, dl, TII->get(argOpc[i]), argVreg[i]).addReg(argPreg[i]);
+ if(F.isVarArg()) {
+ // if this is a varargs function, we copy the input registers to the stack
+ int FI = MFI->CreateFixedObject(8, tempOffset);
+ tempOffset+=8; //XXX: is it safe to use r22 like this?
+ BuildMI(&BB, dl, TII->get(IA64::MOV), IA64::r22).addFrameIndex(FI);
+ // FIXME: we should use st8.spill here, one day
+ BuildMI(&BB, dl, TII->get(IA64::ST8), IA64::r22).addReg(argPreg[i]);
+ }
+ }
+
+ // Finally, inform the code generator which regs we return values in.
+ // (see the ISD::RET: case in the instruction selector)
+ switch (getValueType(F.getReturnType()).getSimpleVT()) {
+ default: assert(0 && "i have no idea where to return this type!");
+ case MVT::isVoid: break;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ MF.getRegInfo().addLiveOut(IA64::r8);
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ MF.getRegInfo().addLiveOut(IA64::F8);
+ break;
+ }
+}
+
+std::pair<SDValue, SDValue>
+IA64TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
+ bool RetSExt, bool RetZExt, bool isVarArg,
+ bool isInreg, unsigned CallingConv,
+ bool isTailCall, SDValue Callee,
+ ArgListTy &Args, SelectionDAG &DAG,
+ DebugLoc dl) {
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ unsigned NumBytes = 16;
+ unsigned outRegsUsed = 0;
+
+ if (Args.size() > 8) {
+ NumBytes += (Args.size() - 8) * 8;
+ outRegsUsed = 8;
+ } else {
+ outRegsUsed = Args.size();
+ }
+
+ // FIXME? this WILL fail if we ever try to pass around an arg that
+ // consumes more than a single output slot (a 'real' double, an int128,
+ // some sort of aggregate, etc.), as we'll underestimate how many 'outX'
+ // registers we use. Hopefully, the assembler will notice.
+ MF.getInfo<IA64FunctionInfo>()->outRegsUsed=
+ std::max(outRegsUsed, MF.getInfo<IA64FunctionInfo>()->outRegsUsed);
+
+ // keep stack frame 16-byte aligned
+ // assert(NumBytes==((NumBytes+15) & ~15) &&
+ // "stack frame not 16-byte aligned!");
+ NumBytes = (NumBytes+15) & ~15;
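+ // (e.g. a NumBytes of 24 rounds up to 32; multiples of 16 are unchanged)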
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+ SDValue StackPtr;
+ std::vector<SDValue> Stores;
+ std::vector<SDValue> Converts;
+ std::vector<SDValue> RegValuesToPass;
+ unsigned ArgOffset = 16;
+
+ for (unsigned i = 0, e = Args.size(); i != e; ++i)
+ {
+ SDValue Val = Args[i].Node;
+ MVT ObjectVT = Val.getValueType();
+ SDValue ValToStore(0, 0), ValToConvert(0, 0);
+ unsigned ObjSize=8;
+ switch (ObjectVT.getSimpleVT()) {
+ default: assert(0 && "unexpected argument type!");
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32: {
+ //promote to 64-bits, sign/zero extending based on type
+ //of the argument
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+ if (Args[i].isSExt)
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (Args[i].isZExt)
+ ExtendKind = ISD::ZERO_EXTEND;
+ Val = DAG.getNode(ExtendKind, dl, MVT::i64, Val);
+ // XXX: fall through
+ }
+ case MVT::i64:
+ //ObjSize = 8;
+ if(RegValuesToPass.size() >= 8) {
+ ValToStore = Val;
+ } else {
+ RegValuesToPass.push_back(Val);
+ }
+ break;
+ case MVT::f32:
+ //promote to 64-bits
+ Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+ // XXX: fall through
+ case MVT::f64:
+ if(RegValuesToPass.size() >= 8) {
+ ValToStore = Val;
+ } else {
+ RegValuesToPass.push_back(Val);
+ if(1 /* TODO: if(calling external or variadic function)*/ ) {
+ ValToConvert = Val; // additionally pass this FP value as an int
+ }
+ }
+ break;
+ }
+
+ if(ValToStore.getNode()) {
+ if(!StackPtr.getNode()) {
+ StackPtr = DAG.getRegister(IA64::r12, MVT::i64);
+ }
+ SDValue PtrOff = DAG.getConstant(ArgOffset, getPointerTy());
+ PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, PtrOff);
+ Stores.push_back(DAG.getStore(Chain, dl, ValToStore, PtrOff, NULL, 0));
+ ArgOffset += ObjSize;
+ }
+
+ if(ValToConvert.getNode()) {
+ Converts.push_back(DAG.getNode(IA64ISD::GETFD, dl,
+ MVT::i64, ValToConvert));
+ }
+ }
+
+ // Emit all stores, make sure they occur before any copies into physregs.
+ if (!Stores.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl,
+ MVT::Other, &Stores[0],Stores.size());
+
+ static const unsigned IntArgRegs[] = {
+ IA64::out0, IA64::out1, IA64::out2, IA64::out3,
+ IA64::out4, IA64::out5, IA64::out6, IA64::out7
+ };
+
+ static const unsigned FPArgRegs[] = {
+ IA64::F8, IA64::F9, IA64::F10, IA64::F11,
+ IA64::F12, IA64::F13, IA64::F14, IA64::F15
+ };
+
+ SDValue InFlag;
+
+ // save the current GP, SP and RP : FIXME: do we need to do all 3 always?
+ SDValue GPBeforeCall = DAG.getCopyFromReg(Chain, dl, IA64::r1,
+ MVT::i64, InFlag);
+ Chain = GPBeforeCall.getValue(1);
+ InFlag = Chain.getValue(2);
+ SDValue SPBeforeCall = DAG.getCopyFromReg(Chain, dl, IA64::r12,
+ MVT::i64, InFlag);
+ Chain = SPBeforeCall.getValue(1);
+ InFlag = Chain.getValue(2);
+ SDValue RPBeforeCall = DAG.getCopyFromReg(Chain, dl, IA64::rp,
+ MVT::i64, InFlag);
+ Chain = RPBeforeCall.getValue(1);
+ InFlag = Chain.getValue(2);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing integer args into regs out[0-7]
+ // mapped 1:1 and the FP args into regs F8-F15 "lazily"
+ // TODO: for performance, we should only copy FP args into int regs when we
+ // know this is required (i.e. for variadic or external (unknown) functions)
+
+ // first do the FP->(integer representation) conversions; these are
+ // flagged for now, but shouldn't have to be (TODO)
+ unsigned seenConverts = 0;
+ for (unsigned i = 0, e = RegValuesToPass.size(); i != e; ++i) {
+ if(RegValuesToPass[i].getValueType().isFloatingPoint()) {
+ Chain = DAG.getCopyToReg(Chain, dl, IntArgRegs[i],
+ Converts[seenConverts++], InFlag);
+ InFlag = Chain.getValue(1);
+ }
+ }
+
+ // next copy args into the usual places, these are flagged
+ unsigned usedFPArgs = 0;
+ for (unsigned i = 0, e = RegValuesToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl,
+ RegValuesToPass[i].getValueType().isInteger() ?
+ IntArgRegs[i] : FPArgRegs[usedFPArgs++], RegValuesToPass[i], InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+/*
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i64);
+ }
+*/
+
+ std::vector<MVT> NodeTys;
+ std::vector<SDValue> CallOperands;
+ NodeTys.push_back(MVT::Other); // Returns a chain
+ NodeTys.push_back(MVT::Flag); // Returns a flag for retval copy to use.
+ CallOperands.push_back(Chain);
+ CallOperands.push_back(Callee);
+
+ // emit the call itself
+ if (InFlag.getNode())
+ CallOperands.push_back(InFlag);
+ else
+ assert(0 && "this should never happen!\n");
+
+ // to make way for a hack:
+ Chain = DAG.getNode(IA64ISD::BRCALL, dl, NodeTys,
+ &CallOperands[0], CallOperands.size());
+ InFlag = Chain.getValue(1);
+
+ // restore the GP, SP and RP after the call
+ Chain = DAG.getCopyToReg(Chain, dl, IA64::r1, GPBeforeCall, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, IA64::r12, SPBeforeCall, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, IA64::rp, RPBeforeCall, InFlag);
+ InFlag = Chain.getValue(1);
+
+ std::vector<MVT> RetVals;
+ RetVals.push_back(MVT::Other);
+ RetVals.push_back(MVT::Flag);
+
+ MVT RetTyVT = getValueType(RetTy);
+ SDValue RetVal;
+ if (RetTyVT != MVT::isVoid) {
+ switch (RetTyVT.getSimpleVT()) {
+ default: assert(0 && "Unknown value type to return!");
+ case MVT::i1: { // bools are just like other integers (returned in r8)
+ // we *could* fall through to the truncate below, but this saves a
+ // few redundant predicate ops
+ SDValue boolInR8 = DAG.getCopyFromReg(Chain, dl, IA64::r8,
+ MVT::i64,InFlag);
+ InFlag = boolInR8.getValue(2);
+ Chain = boolInR8.getValue(1);
+ SDValue zeroReg = DAG.getCopyFromReg(Chain, dl, IA64::r0,
+ MVT::i64, InFlag);
+ InFlag = zeroReg.getValue(2);
+ Chain = zeroReg.getValue(1);
+
+ RetVal = DAG.getSetCC(dl, MVT::i1, boolInR8, zeroReg, ISD::SETNE);
+ break;
+ }
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ RetVal = DAG.getCopyFromReg(Chain, dl, IA64::r8, MVT::i64, InFlag);
+ Chain = RetVal.getValue(1);
+
+ // keep track of whether it is sign or zero extended (todo: bools?)
+/* XXX
+ RetVal = DAG.getNode(RetTy->isSigned() ? ISD::AssertSext :ISD::AssertZext,
+ dl, MVT::i64, RetVal, DAG.getValueType(RetTyVT));
+*/
+ RetVal = DAG.getNode(ISD::TRUNCATE, dl, RetTyVT, RetVal);
+ break;
+ case MVT::i64:
+ RetVal = DAG.getCopyFromReg(Chain, dl, IA64::r8, MVT::i64, InFlag);
+ Chain = RetVal.getValue(1);
+ InFlag = RetVal.getValue(2); // XXX dead
+ break;
+ case MVT::f32:
+ RetVal = DAG.getCopyFromReg(Chain, dl, IA64::F8, MVT::f64, InFlag);
+ Chain = RetVal.getValue(1);
+ RetVal = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, RetVal,
+ DAG.getIntPtrConstant(0));
+ break;
+ case MVT::f64:
+ RetVal = DAG.getCopyFromReg(Chain, dl, IA64::F8, MVT::f64, InFlag);
+ Chain = RetVal.getValue(1);
+ InFlag = RetVal.getValue(2); // XXX dead
+ break;
+ }
+ }
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), SDValue());
+ return std::make_pair(RetVal, Chain);
+}
+
+SDValue IA64TargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Should not custom lower this!");
+ case ISD::GlobalTLSAddress:
+ assert(0 && "TLS not implemented for IA64.");
+ case ISD::RET: {
+ SDValue AR_PFSVal, Copy;
+
+ switch(Op.getNumOperands()) {
+ default:
+ assert(0 && "Do not know how to return this many arguments!");
+ abort();
+ case 1:
+ AR_PFSVal = DAG.getCopyFromReg(Op.getOperand(0), dl, VirtGPR, MVT::i64);
+ AR_PFSVal = DAG.getCopyToReg(AR_PFSVal.getValue(1), dl, IA64::AR_PFS,
+ AR_PFSVal);
+ return DAG.getNode(IA64ISD::RET_FLAG, dl, MVT::Other, AR_PFSVal);
+ case 3: {
+ // Copy the result into the output register & restore ar.pfs
+ MVT ArgVT = Op.getOperand(1).getValueType();
+ unsigned ArgReg = ArgVT.isInteger() ? IA64::r8 : IA64::F8;
+
+ AR_PFSVal = DAG.getCopyFromReg(Op.getOperand(0), dl, VirtGPR, MVT::i64);
+ Copy = DAG.getCopyToReg(AR_PFSVal.getValue(1), dl, ArgReg,
+ Op.getOperand(1), SDValue());
+ AR_PFSVal = DAG.getCopyToReg(Copy.getValue(0), dl,
+ IA64::AR_PFS, AR_PFSVal, Copy.getValue(1));
+ return DAG.getNode(IA64ISD::RET_FLAG, dl, MVT::Other,
+ AR_PFSVal, AR_PFSVal.getValue(1));
+ }
+ }
+ return SDValue();
+ }
+ case ISD::VAARG: {
+ MVT VT = getPointerTy();
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ SDValue VAList = DAG.getLoad(VT, dl, Op.getOperand(0), Op.getOperand(1),
+ SV, 0);
+ // Increment the pointer, VAList, to the next vaarg
+ SDValue VAIncr = DAG.getNode(ISD::ADD, dl, VT, VAList,
+ DAG.getConstant(VT.getSizeInBits()/8,
+ VT));
+ // Store the incremented VAList to the legalized pointer
+ VAIncr = DAG.getStore(VAList.getValue(1), dl, VAIncr,
+ Op.getOperand(1), SV, 0);
+ // Load the actual argument out of the pointer VAList
+ return DAG.getLoad(Op.getValueType(), dl, VAIncr, VAList, NULL, 0);
+ }
+ case ISD::VASTART: {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, MVT::i64);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
+ }
+ // Frame & Return address. Currently unimplemented
+ case ISD::RETURNADDR: break;
+ case ISD::FRAMEADDR: break;
+ }
+ return SDValue();
+}
diff --git a/lib/Target/IA64/IA64ISelLowering.h b/lib/Target/IA64/IA64ISelLowering.h
new file mode 100644
index 0000000..edf7eb8
--- /dev/null
+++ b/lib/Target/IA64/IA64ISelLowering.h
@@ -0,0 +1,76 @@
+//===-- IA64ISelLowering.h - IA64 DAG Lowering Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that IA64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_IA64_IA64ISELLOWERING_H
+#define LLVM_TARGET_IA64_IA64ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "IA64.h"
+
+namespace llvm {
+ namespace IA64ISD {
+ enum NodeType {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// GETFD - the getf.d instruction takes a floating point operand and
+ /// returns its 64-bit memory representation as an i64
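+ /// (in assembly, e.g. "getf.d r8 = f8" copies the raw bits of f8 into
+ /// r8)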
+ GETFD,
+
+ // TODO: explain this hack
+ BRCALL,
+
+ // RET_FLAG - Return with a flag operand
+ RET_FLAG
+ };
+ }
+
+ class IA64TargetLowering : public TargetLowering {
+ int VarArgsFrameIndex; // FrameIndex for start of varargs area.
+ //int ReturnAddrIndex; // FrameIndex for return slot.
+ unsigned GP, SP, RP; // FIXME - clean this mess up
+ public:
+ explicit IA64TargetLowering(TargetMachine &TM);
+
+ unsigned VirtGPR; // this is public so it can be accessed in the selector
+ // for ISD::RET. add an accessor instead? FIXME
+ const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getSetCCResultType - Return ISD::SETCC's result type.
+ virtual MVT getSetCCResultType(MVT VT) const;
+
+ /// LowerArguments - This hook must be implemented to indicate how we should
+ /// lower the arguments for the specified function, into the specified DAG.
+ virtual void LowerArguments(Function &F, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &ArgValues,
+ DebugLoc dl);
+
+ /// LowerCallTo - This hook lowers an abstract call to a function into an
+ /// actual call.
+ virtual std::pair<SDValue, SDValue>
+ LowerCallTo(SDValue Chain, const Type *RetTy,
+ bool RetSExt, bool RetZExt, bool isVarArg, bool isInreg,
+ unsigned CC, bool isTailCall,
+ SDValue Callee, ArgListTy &Args, SelectionDAG &DAG,
+ DebugLoc dl);
+
+ /// LowerOperation - for custom lowering specific ops
+ /// (currently, only "ret void")
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ };
+}
+
+#endif // LLVM_TARGET_IA64_IA64ISELLOWERING_H
diff --git a/lib/Target/IA64/IA64InstrBuilder.h b/lib/Target/IA64/IA64InstrBuilder.h
new file mode 100644
index 0000000..a5d4dca
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrBuilder.h
@@ -0,0 +1,40 @@
+//===-- IA64InstrBuilder.h - Aids for building IA64 insts -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64_INSTRBUILDER_H
+#define IA64_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - Add a reference to the base of an abstract object on
+/// the stack frame of the current function.  The reference carries the
+/// FrameIndex as a stand-in base register until frame indices are resolved,
+/// and an additional constant offset may be specified as well.
+///
+inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+ bool mem = true) {
+ if (mem)
+ return MIB.addImm(Offset).addFrameIndex(FI);
+ else
+ return MIB.addFrameIndex(FI).addImm(Offset);
+}
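+
+// A hypothetical call site (the opcode name here is a placeholder, purely
+// for illustration):
+//   addFrameReference(BuildMI(MBB, MI, DL, TII.get(SomeMemOpcode), Reg), FI);
+// appends "imm 0, frameindex FI" to the instruction; the frame index is
+// later rewritten to a real base register by eliminateFrameIndex().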
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64InstrFormats.td b/lib/Target/IA64/IA64InstrFormats.td
new file mode 100644
index 0000000..c465880
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrFormats.td
@@ -0,0 +1,80 @@
+//===- IA64InstrFormats.td - IA64 Instruction Formats ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// - Warning: the stuff in here isn't really being used, so it's mostly
+//   junk.  It'll get fixed as the JIT gets built.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+class InstIA64<bits<4> op, dag OOL, dag IOL, string asmstr> : Instruction {
+ // IA64 instruction baseline
+ field bits<41> Inst;
+ let Namespace = "IA64";
+ let OutOperandList = OOL;
+ let InOperandList = IOL;
+ let AsmString = asmstr;
+
+ let Inst{40-37} = op;
+}
+
+//"Each Itanium instruction is categorized into one of six types."
+//We should have:
+// A, I, M, F, B, L+X
+
+class AForm<bits<4> opcode, bits<6> qpReg, dag OOL, dag IOL, string asmstr> :
+ InstIA64<opcode, OOL, IOL, asmstr> {
+
+ let Inst{5-0} = qpReg;
+}
+
+class AForm_DAG<bits<4> opcode, bits<6> qpReg, dag OOL, dag IOL, string asmstr,
+ list<dag> pattern> :
+ InstIA64<opcode, OOL, IOL, asmstr> {
+
+ let Pattern = pattern;
+ let Inst{5-0} = qpReg;
+}
+
+let isBranch = 1, isTerminator = 1 in
+class BForm<bits<4> opcode, bits<6> x6, bits<3> btype, dag OOL, dag IOL, string asmstr> :
+ InstIA64<opcode, OOL, IOL, asmstr> {
+
+ let Inst{32-27} = x6;
+ let Inst{8-6} = btype;
+}
+
+class MForm<bits<4> opcode, bits<6> x6, dag OOL, dag IOL, string asmstr> :
+ InstIA64<opcode, OOL, IOL, asmstr> {
+ bits<7> Ra;
+ bits<7> Rb;
+ bits<16> disp;
+
+ let Inst{35-30} = x6;
+// let Inst{20-16} = Rb;
+ let Inst{15-0} = disp;
+}
+
+class RawForm<bits<4> opcode, bits<26> rest, dag OOL, dag IOL, string asmstr> :
+ InstIA64<opcode, OOL, IOL, asmstr> {
+ let Inst{25-0} = rest;
+}
+
+// Pseudo instructions.
+class PseudoInstIA64<dag OOL, dag IOL, string nm> : InstIA64<0, OOL, IOL, nm> {
+}
+
+class PseudoInstIA64_DAG<dag OOL, dag IOL, string nm, list<dag> pattern>
+ : InstIA64<0, OOL, IOL, nm> {
+ let Pattern = pattern;
+}
+
diff --git a/lib/Target/IA64/IA64InstrInfo.cpp b/lib/Target/IA64/IA64InstrInfo.cpp
new file mode 100644
index 0000000..5f89d4f
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrInfo.cpp
@@ -0,0 +1,193 @@
+//===- IA64InstrInfo.cpp - IA64 Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64InstrInfo.h"
+#include "IA64.h"
+#include "IA64InstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/ADT/SmallVector.h"
+#include "IA64GenInstrInfo.inc"
+using namespace llvm;
+
+IA64InstrInfo::IA64InstrInfo()
+ : TargetInstrInfoImpl(IA64Insts, sizeof(IA64Insts)/sizeof(IA64Insts[0])),
+ RI(*this) {
+}
+
+
+bool IA64InstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned& sourceReg,
+ unsigned& destReg,
+ unsigned& SrcSR, unsigned& DstSR) const {
+ SrcSR = DstSR = 0; // No sub-registers.
+
+ unsigned oc = MI.getOpcode();
+ if (oc == IA64::MOV || oc == IA64::FMOV) {
+ // TODO: this doesn't detect predicate moves
+ assert(MI.getNumOperands() >= 2 &&
+ /* MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() && */
+ "invalid register-register move instruction");
+ if (MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg()) {
+ // if both operands of the MOV/FMOV are registers, then
+ // yes, this is a move instruction
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ }
+ return false; // we don't consider e.g. %regN = MOV <FrameIndex #x> a
+ // move instruction
+}
+
+unsigned
+IA64InstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond)const {
+ // FIXME this should probably have a DebugLoc argument
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Can only insert uncond branches so far.
+ assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!");
+ BuildMI(&MBB, dl, get(IA64::BRL_NOTCALL)).addMBB(TBB);
+ return 1;
+}
+
+bool IA64InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ if (DestRC != SrcRC) {
+ // Not yet supported!
+ return false;
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  if (DestRC == IA64::PRRegisterClass) // if a bool, we use pseudocode
+ // (SrcReg) DestReg = cmp.eq.unc(r0, r0)
+ BuildMI(MBB, MI, DL, get(IA64::PCMPEQUNC), DestReg)
+ .addReg(IA64::r0).addReg(IA64::r0).addReg(SrcReg);
+ else // otherwise, MOV works (for both gen. regs and FP regs)
+ BuildMI(MBB, MI, DL, get(IA64::MOV), DestReg).addReg(SrcReg);
+
+ return true;
+}
+
+void IA64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIdx,
+ const TargetRegisterClass *RC) const{
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (RC == IA64::FPRegisterClass) {
+ BuildMI(MBB, MI, DL, get(IA64::STF_SPILL)).addFrameIndex(FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ } else if (RC == IA64::GRRegisterClass) {
+ BuildMI(MBB, MI, DL, get(IA64::ST8)).addFrameIndex(FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ } else if (RC == IA64::PRRegisterClass) {
+ /* we use IA64::r2 as a temporary register for doing this hackery. */
+ // first we load 0:
+ BuildMI(MBB, MI, DL, get(IA64::MOV), IA64::r2).addReg(IA64::r0);
+ // then conditionally add 1:
+ BuildMI(MBB, MI, DL, get(IA64::CADDIMM22), IA64::r2).addReg(IA64::r2)
+ .addImm(1).addReg(SrcReg, getKillRegState(isKill));
+ // and then store it to the stack
+ BuildMI(MBB, MI, DL, get(IA64::ST8))
+ .addFrameIndex(FrameIdx)
+ .addReg(IA64::r2);
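+    // Net effect of the three instructions above (illustrative; pN stands
+    // for the predicate register being spilled):
+    //        mov  r2 = r0              // r2 = 0
+    //   (pN) add  r2 = 1, r2           // r2 = 1 iff pN is set
+    //        st8  [<frame slot>] = r2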
+ } else assert(0 &&
+ "sorry, I don't know how to store this sort of reg in the stack\n");
+}
+
+void IA64InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Opc = 0;
+ if (RC == IA64::FPRegisterClass) {
+ Opc = IA64::STF8;
+ } else if (RC == IA64::GRRegisterClass) {
+ Opc = IA64::ST8;
+ } else if (RC == IA64::PRRegisterClass) {
+ Opc = IA64::ST1;
+ } else {
+ assert(0 &&
+ "sorry, I don't know how to store this sort of reg\n");
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ MIB.addReg(SrcReg, getKillRegState(isKill));
+ NewMIs.push_back(MIB);
+}
+
+void IA64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC)const{
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (RC == IA64::FPRegisterClass) {
+ BuildMI(MBB, MI, DL, get(IA64::LDF_FILL), DestReg).addFrameIndex(FrameIdx);
+ } else if (RC == IA64::GRRegisterClass) {
+ BuildMI(MBB, MI, DL, get(IA64::LD8), DestReg).addFrameIndex(FrameIdx);
+ } else if (RC == IA64::PRRegisterClass) {
+ // first we load a byte from the stack into r2, our 'predicate hackery'
+ // scratch reg
+ BuildMI(MBB, MI, DL, get(IA64::LD8), IA64::r2).addFrameIndex(FrameIdx);
+ // then we compare it to zero. If it _is_ zero, compare-not-equal to
+ // r0 gives us 0, which is what we want, so that's nice.
+ BuildMI(MBB, MI, DL, get(IA64::CMPNE), DestReg)
+ .addReg(IA64::r2)
+ .addReg(IA64::r0);
+ } else {
+ assert(0 &&
+ "sorry, I don't know how to load this sort of reg from the stack\n");
+ }
+}
+
+void IA64InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Opc = 0;
+ if (RC == IA64::FPRegisterClass) {
+ Opc = IA64::LDF8;
+ } else if (RC == IA64::GRRegisterClass) {
+ Opc = IA64::LD8;
+ } else if (RC == IA64::PRRegisterClass) {
+ Opc = IA64::LD1;
+ } else {
+ assert(0 &&
+ "sorry, I don't know how to load this sort of reg\n");
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+}
diff --git a/lib/Target/IA64/IA64InstrInfo.h b/lib/Target/IA64/IA64InstrInfo.h
new file mode 100644
index 0000000..79236c2
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrInfo.h
@@ -0,0 +1,70 @@
+//===- IA64InstrInfo.h - IA64 Instruction Information ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64INSTRUCTIONINFO_H
+#define IA64INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "IA64RegisterInfo.h"
+
+namespace llvm {
+
+class IA64InstrInfo : public TargetInstrInfoImpl {
+ const IA64RegisterInfo RI;
+public:
+ IA64InstrInfo();
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const IA64RegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+};
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64InstrInfo.td b/lib/Target/IA64/IA64InstrInfo.td
new file mode 100644
index 0000000..2ab9897
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrInfo.td
@@ -0,0 +1,751 @@
+//===- IA64InstrInfo.td - Describe the IA64 Instruction Set -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the IA64 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+include "IA64InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// IA-64 specific DAG Nodes.
+//
+
+def IA64getfd : SDNode<"IA64ISD::GETFD", SDTFPToIntOp, []>;
+
+def retflag : SDNode<"IA64ISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+//===---------
+// Instruction types
+
+class isA { bit A=1; } // I or M unit
+class isM { bit M=1; } // M unit
+class isI { bit I=1; } // I unit
+class isB { bit B=1; } // B unit
+class isF { bit F=1; } // F unit
+class isLX { bit LX=1; } // I/B
+
+//===---------
+
+def u2imm : Operand<i8>;
+def u6imm : Operand<i8>;
+def s8imm : Operand<i8> {
+ let PrintMethod = "printS8ImmOperand";
+}
+def s14imm : Operand<i64> {
+ let PrintMethod = "printS14ImmOperand";
+}
+def s22imm : Operand<i64> {
+ let PrintMethod = "printS22ImmOperand";
+}
+def u64imm : Operand<i64> {
+ let PrintMethod = "printU64ImmOperand";
+}
+def s64imm : Operand<i64> {
+ let PrintMethod = "printS64ImmOperand";
+}
+
+let PrintMethod = "printGlobalOperand" in
+ def globaladdress : Operand<i64>;
+
+// the asmprinter needs to know about calls
+let PrintMethod = "printCallOperand" in
+ def calltarget : Operand<i64>;
+
+/* new daggy action!!! */
+
+def is32ones : PatLeaf<(i64 imm), [{
+ // is32ones predicate - True if the immediate is 0x00000000FFFFFFFF
+ // Used to create ZXT4s appropriately
+ uint64_t v = (uint64_t)N->getZExtValue();
+ return (v == 0x00000000FFFFFFFFLL);
+}]>;
+
+// isMIXable predicates - True if the immediate is
+// 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF
+// etc, through 0x00000000FFFFFFFF
+// Used to test for the suitability of mix*
+def isMIX1Lable: PatLeaf<(i64 imm), [{
+ return((uint64_t)N->getZExtValue()==0xFF00FF00FF00FF00LL);
+}]>;
+def isMIX1Rable: PatLeaf<(i64 imm), [{
+ return((uint64_t)N->getZExtValue()==0x00FF00FF00FF00FFLL);
+}]>;
+def isMIX2Lable: PatLeaf<(i64 imm), [{
+ return((uint64_t)N->getZExtValue()==0xFFFF0000FFFF0000LL);
+}]>;
+def isMIX2Rable: PatLeaf<(i64 imm), [{
+ return((uint64_t)N->getZExtValue()==0x0000FFFF0000FFFFLL);
+}]>;
+def isMIX4Lable: PatLeaf<(i64 imm), [{
+ return((uint64_t)N->getZExtValue()==0xFFFFFFFF00000000LL);
+}]>;
+def isMIX4Rable: PatLeaf<(i64 imm), [{
+ return((uint64_t)N->getZExtValue()==0x00000000FFFFFFFFLL);
+}]>;
+
+def isSHLADDimm: PatLeaf<(i64 imm), [{
+ // isSHLADDimm predicate - True if the immediate is exactly 1, 2, 3 or 4
+ // - 0 is *not* okay.
+ // Used to create shladd instructions appropriately
+ int64_t v = (int64_t)N->getZExtValue();
+ return (v >= 1 && v <= 4);
+}]>;
+
+def immSExt14 : PatLeaf<(i64 imm), [{
+ // immSExt14 predicate - True if the immediate fits in a 14-bit sign extended
+ // field. Used by instructions like 'adds'.
+ int64_t v = (int64_t)N->getZExtValue();
+ return (v <= 8191 && v >= -8192);
+}]>;
+
+// imm64 predicate - True if the immediate fits in a 64-bit
+// field - i.e., always true.  Used to keep movl happy.
+def imm64 : PatLeaf<(i64 imm)>;
+
+def ADD : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "add $dst = $src1, $src2",
+ [(set GR:$dst, (add GR:$src1, GR:$src2))]>, isA;
+
+def ADD1 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "add $dst = $src1, $src2, 1",
+ [(set GR:$dst, (add (add GR:$src1, GR:$src2), 1))]>, isA;
+
+def ADDS : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s14imm:$imm),
+ "adds $dst = $imm, $src1",
+ [(set GR:$dst, (add GR:$src1, immSExt14:$imm))]>, isA;
+
+def MOVL : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins s64imm:$imm),
+ "movl $dst = $imm",
+ [(set GR:$dst, imm64:$imm)]>, isLX;
+
+def ADDL_GA : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, globaladdress:$imm),
+ "addl $dst = $imm, $src1",
+ []>, isA;
+
+// hmm
+def ADDL_EA : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, calltarget:$imm),
+ "addl $dst = $imm, $src1",
+ []>, isA;
+
+def SUB : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "sub $dst = $src1, $src2",
+ [(set GR:$dst, (sub GR:$src1, GR:$src2))]>, isA;
+
+def SUB1 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "sub $dst = $src1, $src2, 1",
+ [(set GR:$dst, (add (sub GR: $src1, GR:$src2), -1))]>, isA;
+
+let isTwoAddress = 1 in {
+def TPCADDIMM22 : AForm<0x03, 0x0b,
+ (outs GR:$dst), (ins GR:$src1, s22imm:$imm, PR:$qp),
+ "($qp) add $dst = $imm, $dst">, isA;
+def TPCADDS : AForm_DAG<0x03, 0x0b,
+ (outs GR:$dst), (ins GR:$src1, s14imm:$imm, PR:$qp),
+ "($qp) adds $dst = $imm, $dst",
+ []>, isA;
+def TPCMPIMM8NE : AForm<0x03, 0x0b,
+ (outs PR:$dst), (ins PR:$src1, s22imm:$imm, GR:$src2, PR:$qp),
+ "($qp) cmp.ne $dst , p0 = $imm, $src2">, isA;
+}
+
+// zero extend a bool (predicate reg) into an integer reg
+def ZXTb : Pat<(zext PR:$src),
+ (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src)>;
+def AXTb : Pat<(anyext PR:$src),
+ (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src)>;
+
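+// Roughly, the two patterns above expand to (register names illustrative):
+//        adds rX = 0, r0      // rX = 0
+//   (pN) add  rX = 1, rX      // rX = 1 iff the predicate pN is set
+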
+// normal sign/zero-extends
+def SXT1 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "sxt1 $dst = $src",
+ [(set GR:$dst, (sext_inreg GR:$src, i8))]>, isI;
+def ZXT1 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "zxt1 $dst = $src",
+ [(set GR:$dst, (and GR:$src, 255))]>, isI;
+def SXT2 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "sxt2 $dst = $src",
+ [(set GR:$dst, (sext_inreg GR:$src, i16))]>, isI;
+def ZXT2 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "zxt2 $dst = $src",
+ [(set GR:$dst, (and GR:$src, 65535))]>, isI;
+def SXT4 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "sxt4 $dst = $src",
+ [(set GR:$dst, (sext_inreg GR:$src, i32))]>, isI;
+def ZXT4 : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "zxt4 $dst = $src",
+ [(set GR:$dst, (and GR:$src, is32ones))]>, isI;
+
+// fixme: shrs vs shru?
+def MIX1L : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "mix1.l $dst = $src1, $src2",
+ [(set GR:$dst, (or (and GR:$src1, isMIX1Lable),
+ (and (srl GR:$src2, (i64 8)), isMIX1Lable)))]>, isI;
+
+def MIX2L : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "mix2.l $dst = $src1, $src2",
+ [(set GR:$dst, (or (and GR:$src1, isMIX2Lable),
+ (and (srl GR:$src2, (i64 16)), isMIX2Lable)))]>, isI;
+
+def MIX4L : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "mix4.l $dst = $src1, $src2",
+ [(set GR:$dst, (or (and GR:$src1, isMIX4Lable),
+ (and (srl GR:$src2, (i64 32)), isMIX4Lable)))]>, isI;
+
+def MIX1R : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "mix1.r $dst = $src1, $src2",
+ [(set GR:$dst, (or (and (shl GR:$src1, (i64 8)), isMIX1Rable),
+ (and GR:$src2, isMIX1Rable)))]>, isI;
+
+def MIX2R : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "mix2.r $dst = $src1, $src2",
+ [(set GR:$dst, (or (and (shl GR:$src1, (i64 16)), isMIX2Rable),
+ (and GR:$src2, isMIX2Rable)))]>, isI;
+
+def MIX4R : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "mix4.r $dst = $src1, $src2",
+ [(set GR:$dst, (or (and (shl GR:$src1, (i64 32)), isMIX4Rable),
+ (and GR:$src2, isMIX4Rable)))]>, isI;
+
+def GETFSIGD : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins FP:$src),
+ "getf.sig $dst = $src",
+ []>, isM;
+
+def SETFSIGD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins GR:$src),
+ "setf.sig $dst = $src",
+ []>, isM;
+
+def XMALD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+ "xma.l $dst = $src1, $src2, $src3",
+ []>, isF;
+def XMAHD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+ "xma.h $dst = $src1, $src2, $src3",
+ []>, isF;
+def XMAHUD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+ "xma.hu $dst = $src1, $src2, $src3",
+ []>, isF;
+
+// pseudocode for integer multiplication
+def : Pat<(mul GR:$src1, GR:$src2),
+ (GETFSIGD (XMALD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>;
+def : Pat<(mulhs GR:$src1, GR:$src2),
+ (GETFSIGD (XMAHD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>;
+def : Pat<(mulhu GR:$src1, GR:$src2),
+ (GETFSIGD (XMAHUD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>;
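+
+// IA-64 has no integer multiply in the ALU, so "rD = rA * rB" round-trips
+// through the FP unit; schematically (registers illustrative):
+//   setf.sig fX = rA
+//   setf.sig fY = rB
+//   xma.l    fX = fX, fY, f0     // fX = rA * rB + 0
+//   getf.sig rD = fX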
+
+// TODO: addp4 (addp4 dst = src, r0 is a 32-bit add)
+// has imm form, too
+
+// def ADDS : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s14imm:$imm),
+// "adds $dst = $imm, $src1">;
+
+def AND : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "and $dst = $src1, $src2",
+ [(set GR:$dst, (and GR:$src1, GR:$src2))]>, isA;
+def ANDCM : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "andcm $dst = $src1, $src2",
+ [(set GR:$dst, (and GR:$src1, (not GR:$src2)))]>, isA;
+// TODO: and/andcm/or/xor/add/sub/shift immediate forms
+def OR : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "or $dst = $src1, $src2",
+ [(set GR:$dst, (or GR:$src1, GR:$src2))]>, isA;
+
+def pOR : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2, PR:$qp),
+ "($qp) or $dst = $src1, $src2">, isA;
+
+// the following are all a bit unfortunate: we throw away the complement
+// of the compare!
+def CMPEQ : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.eq $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (seteq GR:$src1, GR:$src2))]>, isA;
+def CMPGT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.gt $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setgt GR:$src1, GR:$src2))]>, isA;
+def CMPGE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.ge $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setge GR:$src1, GR:$src2))]>, isA;
+def CMPLT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.lt $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setlt GR:$src1, GR:$src2))]>, isA;
+def CMPLE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.le $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setle GR:$src1, GR:$src2))]>, isA;
+def CMPNE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.ne $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setne GR:$src1, GR:$src2))]>, isA;
+def CMPLTU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.ltu $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setult GR:$src1, GR:$src2))]>, isA;
+def CMPGTU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.gtu $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setugt GR:$src1, GR:$src2))]>, isA;
+def CMPLEU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.leu $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setule GR:$src1, GR:$src2))]>, isA;
+def CMPGEU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2),
+ "cmp.geu $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setuge GR:$src1, GR:$src2))]>, isA;
+
+// and we do the whole thing again for FP compares!
+def FCMPEQ : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.eq $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (seteq FP:$src1, FP:$src2))]>, isF;
+def FCMPGT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.gt $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setgt FP:$src1, FP:$src2))]>, isF;
+def FCMPGE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.ge $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setge FP:$src1, FP:$src2))]>, isF;
+def FCMPLT : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.lt $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setlt FP:$src1, FP:$src2))]>, isF;
+def FCMPLE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.le $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setle FP:$src1, FP:$src2))]>, isF;
+def FCMPNE : AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.neq $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setne FP:$src1, FP:$src2))]>, isF;
+def FCMPLTU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.lt $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setult FP:$src1, FP:$src2))]>, isF;
+def FCMPGTU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.gt $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setugt FP:$src1, FP:$src2))]>, isF;
+def FCMPLEU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.le $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setule FP:$src1, FP:$src2))]>, isF;
+def FCMPGEU: AForm_DAG<0x03, 0x0b, (outs PR:$dst), (ins FP:$src1, FP:$src2),
+ "fcmp.ge $dst, p0 = $src1, $src2",
+ [(set PR:$dst, (setuge FP:$src1, FP:$src2))]>, isF;
+
+def PCMPEQUNCR0R0 : AForm<0x03, 0x0b, (outs PR:$dst), (ins PR:$qp),
+ "($qp) cmp.eq.unc $dst, p0 = r0, r0">, isA;
+
+def : Pat<(trunc GR:$src), // truncate i64 to i1
+ (CMPNE GR:$src, r0)>; // $src!=0? If so, PR:$dst=true
+
+let isTwoAddress=1 in {
+ def TPCMPEQR0R0 : AForm<0x03, 0x0b, (outs PR:$dst), (ins PR:$bogus, PR:$qp),
+ "($qp) cmp.eq $dst, p0 = r0, r0">, isA;
+ def TPCMPNER0R0 : AForm<0x03, 0x0b, (outs PR:$dst), (ins PR:$bogus, PR:$qp),
+ "($qp) cmp.ne $dst, p0 = r0, r0">, isA;
+}
+
+/* our pseudocode for OR on predicates is:
+pC = pA OR pB
+-------------
+(pA) cmp.eq.unc pC,p0 = r0,r0 // pC = pA
+ ;;
+(pB) cmp.eq pC,p0 = r0,r0 // if (pB) pC = 1 */
+
+def bOR : Pat<(or PR:$src1, PR:$src2),
+ (TPCMPEQR0R0 (PCMPEQUNCR0R0 PR:$src1), PR:$src2)>;
+
+/* our pseudocode for AND on predicates is:
+ *
+(pA) cmp.eq.unc pC,p0 = r0,r0 // pC = pA
+ cmp.eq pTemp,p0 = r0,r0 // pTemp = NOT pB
+ ;;
+(pB) cmp.ne pTemp,p0 = r0,r0
+ ;;
+(pTemp)cmp.ne pC,p0 = r0,r0 // if (NOT pB) pC = 0 */
+
+def bAND : Pat<(and PR:$src1, PR:$src2),
+ ( TPCMPNER0R0 (PCMPEQUNCR0R0 PR:$src1),
+ (TPCMPNER0R0 (CMPEQ r0, r0), PR:$src2) )>;
+
+/* one possible routine for XOR on predicates is:
+
+ // Compute px = py ^ pz
+ // using sum of products: px = (py & !pz) | (pz & !py)
+ // Uses 5 instructions in 3 cycles.
+ // cycle 1
+(pz) cmp.eq.unc px = r0, r0 // px = pz
+(py) cmp.eq.unc pt = r0, r0 // pt = py
+ ;;
+ // cycle 2
+(pt) cmp.ne.and px = r0, r0 // px = px & !pt (px = pz & !pt)
+(pz) cmp.ne.and pt = r0, r0 // pt = pt & !pz
+ ;;
+ // cycle 3
+(pt) cmp.eq.or px = r0, r0 // px = px | pt
+
+*** Another, which we use here, requires one scratch GR.  It is:
+
+ mov rt = 0 // initialize rt off critical path
+ ;;
+
+ // cycle 1
+(pz) cmp.eq.unc px = r0, r0 // px = pz
+(pz) mov rt = 1 // rt = pz
+ ;;
+ // cycle 2
+(py) cmp.ne px = 1, rt // if (py) px = !pz
+
+.. these routines kindly provided by Jim Hull
+*/
+
+def bXOR : Pat<(xor PR:$src1, PR:$src2),
+ (TPCMPIMM8NE (PCMPEQUNCR0R0 PR:$src2), 1,
+ (TPCADDS (ADDS r0, 0), 1, PR:$src2),
+ PR:$src1)>;
+
+def XOR : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "xor $dst = $src1, $src2",
+ [(set GR:$dst, (xor GR:$src1, GR:$src2))]>, isA;
+
+def SHLADD: AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1,s64imm:$imm,GR:$src2),
+ "shladd $dst = $src1, $imm, $src2",
+ [(set GR:$dst, (add GR:$src2, (shl GR:$src1, isSHLADDimm:$imm)))]>, isA;
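+
+// Example of what the SHLADD pattern matches (values illustrative): an
+// address computation like base + index*8, i.e. (add rB, (shl rI, 3)),
+// selects the single instruction "shladd rD = rI, 3, rB".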
+
+def SHL : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "shl $dst = $src1, $src2",
+ [(set GR:$dst, (shl GR:$src1, GR:$src2))]>, isI;
+
+def SHRU : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "shr.u $dst = $src1, $src2",
+ [(set GR:$dst, (srl GR:$src1, GR:$src2))]>, isI;
+
+def SHRS : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, GR:$src2),
+ "shr $dst = $src1, $src2",
+ [(set GR:$dst, (sra GR:$src1, GR:$src2))]>, isI;
+
+def MOV : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src), "mov $dst = $src">, isA;
+def FMOV : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "mov $dst = $src">, isF; // XXX: there _is_ no fmov
+def PMOV : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src, PR:$qp),
+ "($qp) mov $dst = $src">, isA;
+
+def SPILL_ALL_PREDICATES_TO_GR : AForm<0x03, 0x0b, (outs GR:$dst), (ins),
+ "mov $dst = pr">, isI;
+def FILL_ALL_PREDICATES_FROM_GR : AForm<0x03, 0x0b, (outs), (ins GR:$src),
+ "mov pr = $src">, isI;
+
+let isTwoAddress = 1 in {
+ def CMOV : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src2, GR:$src, PR:$qp),
+ "($qp) mov $dst = $src">, isA;
+}
+
+def PFMOV : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src, PR:$qp),
+ "($qp) mov $dst = $src">, isF;
+
+let isTwoAddress = 1 in {
+ def CFMOV : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src2, FP:$src, PR:$qp),
+ "($qp) mov $dst = $src">, isF;
+}
+
+def SELECTINT : Pat<(select PR:$which, GR:$src1, GR:$src2),
+ (CMOV (MOV GR:$src2), GR:$src1, PR:$which)>; // note order!
+def SELECTFP : Pat<(select PR:$which, FP:$src1, FP:$src2),
+ (CFMOV (FMOV FP:$src2), FP:$src1, PR:$which)>; // note order!
+// TODO: can do this faster, w/o using any integer regs (see pattern isel)
+def SELECTBOOL : Pat<(select PR:$which, PR:$src1, PR:$src2), // note order!
+ (CMPNE (CMOV
+ (MOV (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src2)),
+ (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src1), PR:$which), r0)>;
+
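+// Sketch of the select expansion (registers and predicate are illustrative):
+// for "rD = select pW, rT, rF" the SELECTINT pattern above emits
+//        mov rD = rF          // start from the false value...
+//   (pW) mov rD = rT          // ...conditionally overwritten by the true one
+// which is why the operand order is swapped in the patterns.
+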
+// load constants of various sizes // FIXME: prettyprint -ve constants
+def : Pat<(i64 immSExt14:$imm), (ADDS r0, immSExt14:$imm)>;
+def : Pat<(i1 -1), (CMPEQ r0, r0)>; // TODO: this should just be a ref to p0
+def : Pat<(i1 0), (CMPNE r0, r0)>; // TODO: any instruction actually *using*
+ // this predicate should be killed!
+
+// TODO: support postincrement (reg, imm9) loads+stores - this needs more
+// tablegen support
+
+def IUSE : PseudoInstIA64<(outs), (ins variable_ops), "// IUSE">;
+def ADJUSTCALLSTACKUP : PseudoInstIA64<(outs), (ins variable_ops),
+ "// ADJUSTCALLSTACKUP">;
+def ADJUSTCALLSTACKDOWN : PseudoInstIA64<(outs), (ins variable_ops),
+ "// ADJUSTCALLSTACKDOWN">;
+def PSEUDO_ALLOC : PseudoInstIA64<(outs), (ins GR:$foo), "// PSEUDO_ALLOC">;
+
+def ALLOC : AForm<0x03, 0x0b,
+ (outs GR:$dst), (ins i8imm:$inputs, i8imm:$locals, i8imm:$outputs, i8imm:$rotating),
+ "alloc $dst = ar.pfs,$inputs,$locals,$outputs,$rotating">, isM;
+
+let isTwoAddress = 1 in {
+ def TCMPNE : AForm<0x03, 0x0b,
+ (outs PR:$dst), (ins PR:$src2, GR:$src3, GR:$src4),
+ "cmp.ne $dst, p0 = $src3, $src4">, isA;
+
+ def TPCMPEQOR : AForm<0x03, 0x0b,
+ (outs PR:$dst), (ins PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+ "($qp) cmp.eq.or $dst, p0 = $src3, $src4">, isA;
+
+ def TPCMPNE : AForm<0x03, 0x0b,
+ (outs PR:$dst), (ins PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+ "($qp) cmp.ne $dst, p0 = $src3, $src4">, isA;
+
+ def TPCMPEQ : AForm<0x03, 0x0b,
+ (outs PR:$dst), (ins PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+ "($qp) cmp.eq $dst, p0 = $src3, $src4">, isA;
+}
+
+def MOVSIMM14 : AForm<0x03, 0x0b, (outs GR:$dst), (ins s14imm:$imm),
+ "mov $dst = $imm">, isA;
+def MOVSIMM22 : AForm<0x03, 0x0b, (outs GR:$dst), (ins s22imm:$imm),
+ "mov $dst = $imm">, isA;
+def MOVLIMM64 : AForm<0x03, 0x0b, (outs GR:$dst), (ins s64imm:$imm),
+ "movl $dst = $imm">, isLX;
+
+def SHLI : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, u6imm:$imm),
+ "shl $dst = $src1, $imm">, isI;
+def SHRUI : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, u6imm:$imm),
+ "shr.u $dst = $src1, $imm">, isI;
+def SHRSI : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, u6imm:$imm),
+ "shr $dst = $src1, $imm">, isI;
+
+def EXTRU : AForm<0x03, 0x0b,
+ (outs GR:$dst), (ins GR:$src1, u6imm:$imm1, u6imm:$imm2),
+ "extr.u $dst = $src1, $imm1, $imm2">, isI;
+
+def DEPZ : AForm<0x03, 0x0b,
+ (outs GR:$dst), (ins GR:$src1, u6imm:$imm1, u6imm:$imm2),
+ "dep.z $dst = $src1, $imm1, $imm2">, isI;
+
+def PCMPEQOR : AForm<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2, PR:$qp),
+ "($qp) cmp.eq.or $dst, p0 = $src1, $src2">, isA;
+def PCMPEQUNC : AForm<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2, PR:$qp),
+ "($qp) cmp.eq.unc $dst, p0 = $src1, $src2">, isA;
+def PCMPNE : AForm<0x03, 0x0b, (outs PR:$dst), (ins GR:$src1, GR:$src2, PR:$qp),
+ "($qp) cmp.ne $dst, p0 = $src1, $src2">, isA;
+
+// two destinations!
+def BCMPEQ : AForm<0x03, 0x0b, (outs PR:$dst1, PR:$dst2), (ins GR:$src1, GR:$src2),
+ "cmp.eq $dst1, dst2 = $src1, $src2">, isA;
+
+def ADDIMM14 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s14imm:$imm),
+ "adds $dst = $imm, $src1">, isA;
+
+def ADDIMM22 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s22imm:$imm),
+ "add $dst = $imm, $src1">, isA;
+def CADDIMM22 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$src1, s22imm:$imm, PR:$qp),
+ "($qp) add $dst = $imm, $src1">, isA;
+
+def SUBIMM8 : AForm<0x03, 0x0b, (outs GR:$dst), (ins s8imm:$imm, GR:$src2),
+ "sub $dst = $imm, $src2">, isA;
+
+let mayStore = 1 in {
+ def ST1 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, GR:$value),
+ "st1 [$dstPtr] = $value">, isM;
+ def ST2 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, GR:$value),
+ "st2 [$dstPtr] = $value">, isM;
+ def ST4 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, GR:$value),
+ "st4 [$dstPtr] = $value">, isM;
+ def ST8 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, GR:$value),
+ "st8 [$dstPtr] = $value">, isM;
+ def STF4 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, FP:$value),
+ "stfs [$dstPtr] = $value">, isM;
+ def STF8 : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, FP:$value),
+ "stfd [$dstPtr] = $value">, isM;
+ def STF_SPILL : AForm<0x03, 0x0b, (outs), (ins GR:$dstPtr, FP:$value),
+ "stf.spill [$dstPtr] = $value">, isM;
+}
+
+let canFoldAsLoad = 1 in {
+ def LD1 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$srcPtr),
+ "ld1 $dst = [$srcPtr]">, isM;
+ def LD2 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$srcPtr),
+ "ld2 $dst = [$srcPtr]">, isM;
+ def LD4 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$srcPtr),
+ "ld4 $dst = [$srcPtr]">, isM;
+ def LD8 : AForm<0x03, 0x0b, (outs GR:$dst), (ins GR:$srcPtr),
+ "ld8 $dst = [$srcPtr]">, isM;
+ def LDF4 : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$srcPtr),
+ "ldfs $dst = [$srcPtr]">, isM;
+ def LDF8 : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$srcPtr),
+ "ldfd $dst = [$srcPtr]">, isM;
+ def LDF_FILL : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$srcPtr),
+ "ldf.fill $dst = [$srcPtr]">, isM;
+}
+
+def POPCNT : AForm_DAG<0x03, 0x0b, (outs GR:$dst), (ins GR:$src),
+ "popcnt $dst = $src",
+ [(set GR:$dst, (ctpop GR:$src))]>, isI;
+
+// some FP stuff: // TODO: single-precision stuff?
+def FADD : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+ "fadd $dst = $src1, $src2",
+ [(set FP:$dst, (fadd FP:$src1, FP:$src2))]>, isF;
+def FADDS: AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+ "fadd.s $dst = $src1, $src2">, isF;
+def FSUB : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+ "fsub $dst = $src1, $src2",
+ [(set FP:$dst, (fsub FP:$src1, FP:$src2))]>, isF;
+def FMPY : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2),
+ "fmpy $dst = $src1, $src2",
+ [(set FP:$dst, (fmul FP:$src1, FP:$src2))]>, isF;
+def FMA : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+ "fma $dst = $src1, $src2, $src3",
+ [(set FP:$dst, (fadd (fmul FP:$src1, FP:$src2), FP:$src3))]>, isF;
+def FMS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+ "fms $dst = $src1, $src2, $src3",
+ [(set FP:$dst, (fsub (fmul FP:$src1, FP:$src2), FP:$src3))]>, isF;
+def FNMA : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+ "fnma $dst = $src1, $src2, $src3",
+ [(set FP:$dst, (fneg (fadd (fmul FP:$src1, FP:$src2), FP:$src3)))]>, isF;
+def FABS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fabs $dst = $src",
+ [(set FP:$dst, (fabs FP:$src))]>, isF;
+def FNEG : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fneg $dst = $src",
+ [(set FP:$dst, (fneg FP:$src))]>, isF;
+def FNEGABS : AForm_DAG<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fnegabs $dst = $src",
+ [(set FP:$dst, (fneg (fabs FP:$src)))]>, isF;
+
+let isTwoAddress=1 in {
+def TCFMAS1 : AForm<0x03, 0x0b,
+ (outs FP:$dst), (ins FP:$bogussrc, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+ "($qp) fma.s1 $dst = $src1, $src2, $src3">, isF;
+def TCFMADS0 : AForm<0x03, 0x0b,
+ (outs FP:$dst), (ins FP:$bogussrc, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+ "($qp) fma.d.s0 $dst = $src1, $src2, $src3">, isF;
+}
+
+def CFMAS1 : AForm<0x03, 0x0b,
+ (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+ "($qp) fma.s1 $dst = $src1, $src2, $src3">, isF;
+def CFNMAS1 : AForm<0x03, 0x0b,
+ (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+ "($qp) fnma.s1 $dst = $src1, $src2, $src3">, isF;
+
+def CFMADS1 : AForm<0x03, 0x0b,
+ (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+ "($qp) fma.d.s1 $dst = $src1, $src2, $src3">, isF;
+def CFMADS0 : AForm<0x03, 0x0b,
+ (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+ "($qp) fma.d.s0 $dst = $src1, $src2, $src3">, isF;
+def CFNMADS1 : AForm<0x03, 0x0b,
+ (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+ "($qp) fnma.d.s1 $dst = $src1, $src2, $src3">, isF;
+
+def FRCPAS0 : AForm<0x03, 0x0b, (outs FP:$dstFR, PR:$dstPR), (ins FP:$src1, FP:$src2),
+ "frcpa.s0 $dstFR, $dstPR = $src1, $src2">, isF;
+def FRCPAS1 : AForm<0x03, 0x0b, (outs FP:$dstFR, PR:$dstPR), (ins FP:$src1, FP:$src2),
+ "frcpa.s1 $dstFR, $dstPR = $src1, $src2">, isF;
+
+def XMAL : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src1, FP:$src2, FP:$src3),
+ "xma.l $dst = $src1, $src2, $src3">, isF;
+
+def FCVTXF : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.xf $dst = $src">, isF;
+def FCVTXUF : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.xuf $dst = $src">, isF;
+def FCVTXUFS1 : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.xuf.s1 $dst = $src">, isF;
+def FCVTFX : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.fx $dst = $src">, isF;
+def FCVTFXU : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.fxu $dst = $src">, isF;
+
+def FCVTFXTRUNC : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.fx.trunc $dst = $src">, isF;
+def FCVTFXUTRUNC : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.fxu.trunc $dst = $src">, isF;
+
+def FCVTFXTRUNCS1 : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.fx.trunc.s1 $dst = $src">, isF;
+def FCVTFXUTRUNCS1 : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fcvt.fxu.trunc.s1 $dst = $src">, isF;
+
+def FNORMD : AForm<0x03, 0x0b, (outs FP:$dst), (ins FP:$src),
+ "fnorm.d $dst = $src">, isF;
+
+def GETFD : AForm<0x03, 0x0b, (outs GR:$dst), (ins FP:$src),
+ "getf.d $dst = $src">, isM;
+def SETFD : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$src),
+ "setf.d $dst = $src">, isM;
+
+def GETFSIG : AForm<0x03, 0x0b, (outs GR:$dst), (ins FP:$src),
+ "getf.sig $dst = $src">, isM;
+def SETFSIG : AForm<0x03, 0x0b, (outs FP:$dst), (ins GR:$src),
+ "setf.sig $dst = $src">, isM;
+
+// these four FP<->int conversion patterns need checking/cleaning
+def SINT_TO_FP : Pat<(sint_to_fp GR:$src),
+ (FNORMD (FCVTXF (SETFSIG GR:$src)))>;
+def UINT_TO_FP : Pat<(uint_to_fp GR:$src),
+ (FNORMD (FCVTXUF (SETFSIG GR:$src)))>;
+def FP_TO_SINT : Pat<(i64 (fp_to_sint FP:$src)),
+ (GETFSIG (FCVTFXTRUNC FP:$src))>;
+def FP_TO_UINT : Pat<(i64 (fp_to_uint FP:$src)),
+ (GETFSIG (FCVTFXUTRUNC FP:$src))>;
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+def fpimm1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+1.0);
+}]>;
+def fpimmn0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+def fpimmn1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-1.0);
+}]>;
+
+def : Pat<(f64 fpimm0), (FMOV F0)>;
+def : Pat<(f64 fpimm1), (FMOV F1)>;
+def : Pat<(f64 fpimmn0), (FNEG F0)>;
+def : Pat<(f64 fpimmn1), (FNEG F1)>;
+
+let isTerminator = 1, isBranch = 1 in {
+ def BRL_NOTCALL : RawForm<0x03, 0xb0, (outs), (ins i64imm:$dst),
+ "(p0) brl.cond.sptk $dst">, isB;
+ def BRLCOND_NOTCALL : RawForm<0x03, 0xb0, (outs), (ins PR:$qp, i64imm:$dst),
+ "($qp) brl.cond.sptk $dst">, isB;
+ def BRCOND_NOTCALL : RawForm<0x03, 0xb0, (outs), (ins PR:$qp, GR:$dst),
+ "($qp) br.cond.sptk $dst">, isB;
+}
+
+let isCall = 1, /* isTerminator = 1, isBranch = 1, */
+ Uses = [out0,out1,out2,out3,out4,out5,out6,out7],
+// all calls clobber non-callee-saved registers, and for now, they are these:
+ Defs = [r2,r3,r8,r9,r10,r11,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,
+ r25,r26,r27,r28,r29,r30,r31,
+ p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,
+ F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,
+ F32,F33,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43,F44,F45,F46,F47,F48,F49,
+ F50,F51,F52,F53,F54,F55,F56,
+ F57,F58,F59,F60,F61,F62,F63,F64,F65,F66,F67,F68,F69,F70,F71,F72,F73,F74,
+ F75,F76,F77,F78,F79,F80,F81,
+ F82,F83,F84,F85,F86,F87,F88,F89,F90,F91,F92,F93,F94,F95,F96,F97,F98,F99,
+ F100,F101,F102,F103,F104,F105,
+ F106,F107,F108,F109,F110,F111,F112,F113,F114,F115,F116,F117,F118,F119,
+ F120,F121,F122,F123,F124,F125,F126,F127,
+ out0,out1,out2,out3,out4,out5,out6,out7] in {
+// old pattern call
+ def BRCALL: RawForm<0x03, 0xb0, (outs), (ins calltarget:$dst),
+ "br.call.sptk rp = $dst">, isB; // FIXME: teach llvm about branch regs?
+// new daggy stuff!
+
+// calls a globaladdress
+ def BRCALL_IPREL_GA : RawForm<0x03, 0xb0, (outs), (ins calltarget:$dst),
+ "br.call.sptk rp = $dst">, isB; // FIXME: teach llvm about branch regs?
+// calls an externalsymbol
+ def BRCALL_IPREL_ES : RawForm<0x03, 0xb0, (outs), (ins calltarget:$dst),
+ "br.call.sptk rp = $dst">, isB; // FIXME: teach llvm about branch regs?
+// calls through a function descriptor
+ def BRCALL_INDIRECT : RawForm<0x03, 0xb0, (outs), (ins GR:$branchreg),
+ "br.call.sptk rp = $branchreg">, isB; // FIXME: teach llvm about branch regs?
+ def BRLCOND_CALL : RawForm<0x03, 0xb0, (outs), (ins PR:$qp, i64imm:$dst),
+ "($qp) brl.cond.call.sptk $dst">, isB;
+ def BRCOND_CALL : RawForm<0x03, 0xb0, (outs), (ins PR:$qp, GR:$dst),
+ "($qp) br.cond.call.sptk $dst">, isB;
+}
+
+// Return branch:
+let isTerminator = 1, isReturn = 1 in
+ def RET : AForm_DAG<0x03, 0x0b, (outs), (ins),
+ "br.ret.sptk.many rp",
+ [(retflag)]>, isB; // return
+def : Pat<(ret), (RET)>;
+
+// the evil stop bit of despair
+def STOP : PseudoInstIA64<(outs), (ins variable_ops), ";;">;
+
diff --git a/lib/Target/IA64/IA64MachineFunctionInfo.h b/lib/Target/IA64/IA64MachineFunctionInfo.h
new file mode 100644
index 0000000..fb93056
--- /dev/null
+++ b/lib/Target/IA64/IA64MachineFunctionInfo.h
@@ -0,0 +1,34 @@
+//===-- IA64MachineFunctionInfo.h - IA64 machine function info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares IA64-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64MACHINEFUNCTIONINFO_H
+#define IA64MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+//#include "IA64JITInfo.h"
+
+namespace llvm {
+
+class IA64FunctionInfo : public MachineFunctionInfo {
+
+public:
+ unsigned outRegsUsed; // how many 'out' registers are used
+ // by this machinefunction? (used to compute the appropriate
+ // entry in the 'alloc' instruction at the top of the
+ // machinefunction)
+ IA64FunctionInfo(MachineFunction& MF) { outRegsUsed=0; };
+
+};
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64RegisterInfo.cpp b/lib/Target/IA64/IA64RegisterInfo.cpp
new file mode 100644
index 0000000..7ad6f51
--- /dev/null
+++ b/lib/Target/IA64/IA64RegisterInfo.cpp
@@ -0,0 +1,319 @@
+//===- IA64RegisterInfo.cpp - IA64 Register Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on IA64.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64.h"
+#include "IA64RegisterInfo.h"
+#include "IA64InstrBuilder.h"
+#include "IA64MachineFunctionInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+IA64RegisterInfo::IA64RegisterInfo(const TargetInstrInfo &tii)
+ : IA64GenRegisterInfo(IA64::ADJUSTCALLSTACKDOWN, IA64::ADJUSTCALLSTACKUP),
+ TII(tii) {}
+
+const unsigned* IA64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+ const {
+ static const unsigned CalleeSavedRegs[] = {
+ IA64::r5, 0
+ };
+ return CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+IA64RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ &IA64::GRRegClass, 0
+ };
+ return CalleeSavedRegClasses;
+}
+
+BitVector IA64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ Reserved.set(IA64::r0);
+ Reserved.set(IA64::r1);
+ Reserved.set(IA64::r2);
+ Reserved.set(IA64::r5);
+ Reserved.set(IA64::r12);
+ Reserved.set(IA64::r13);
+ Reserved.set(IA64::r22);
+ Reserved.set(IA64::rp);
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+bool IA64RegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+void IA64RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (hasFP(MF)) {
+ // If we have a frame pointer, turn the adjcallstackup instruction into a
+ // 'sub SP, <amt>' and the adjcallstackdown instruction into 'add SP,
+ // <amt>'
+ MachineInstr *Old = I;
+ unsigned Amount = Old->getOperand(0).getImm();
+ DebugLoc dl = Old->getDebugLoc();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ Amount = (Amount+Align-1)/Align*Align;
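+      // e.g. with Align = 16, an Amount of 40 rounds up to
+      // (40+15)/16*16 = 48.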
+
+ // Replace the pseudo instruction with a new instruction...
+ if (Old->getOpcode() == IA64::ADJUSTCALLSTACKDOWN) {
+ BuildMI(MBB, I, dl, TII.get(IA64::ADDIMM22), IA64::r12)
+ .addReg(IA64::r12).addImm(-Amount);
+ } else {
+ assert(Old->getOpcode() == IA64::ADJUSTCALLSTACKUP);
+ BuildMI(MBB, I, dl, TII.get(IA64::ADDIMM22), IA64::r12)
+ .addReg(IA64::r12).addImm(Amount);
+ }
+ }
+ }
+
+ MBB.erase(I);
+}
+
+void IA64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS)const{
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+
+ bool FP = hasFP(MF);
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+
+ // choose a base register: ( hasFP? framepointer : stack pointer )
+ unsigned BaseRegister = FP ? IA64::r5 : IA64::r12;
+ // Add the base register
+ MI.getOperand(i).ChangeToRegister(BaseRegister, false);
+
+  // Now add the frame object offset to the offset from the base register.
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+ // If we're not using a Frame Pointer that has been set to the value of the
+ // SP before having the stack size subtracted from it, then add the stack size
+ // to Offset to get the correct offset.
+ Offset += MF.getFrameInfo()->getStackSize();
+
+ // XXX: we use 'r22' as another hack+slash temporary register here :(
+ if (Offset <= 8191 && Offset >= -8192) { // smallish offset
+ // Fix up the old:
+ MI.getOperand(i).ChangeToRegister(IA64::r22, false);
+    // insert the new:
+ BuildMI(MBB, II, dl, TII.get(IA64::ADDIMM22), IA64::r22)
+ .addReg(BaseRegister).addImm(Offset);
+ } else { // it's big
+    // fix up the old:
+ MI.getOperand(i).ChangeToRegister(IA64::r22, false);
+ BuildMI(MBB, II, dl, TII.get(IA64::MOVLIMM64), IA64::r22).addImm(Offset);
+ BuildMI(MBB, II, dl, TII.get(IA64::ADD), IA64::r22).addReg(BaseRegister)
+ .addReg(IA64::r22);
+ }
+
+}
+
+void IA64RegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool FP = hasFP(MF);
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+ // first, we handle the 'alloc' instruction, that should be right up the
+ // top of any function
+ static const unsigned RegsInOrder[96] = { // there are 96 GPRs the
+ // RSE worries about
+ IA64::r32, IA64::r33, IA64::r34, IA64::r35,
+ IA64::r36, IA64::r37, IA64::r38, IA64::r39, IA64::r40, IA64::r41,
+ IA64::r42, IA64::r43, IA64::r44, IA64::r45, IA64::r46, IA64::r47,
+ IA64::r48, IA64::r49, IA64::r50, IA64::r51, IA64::r52, IA64::r53,
+ IA64::r54, IA64::r55, IA64::r56, IA64::r57, IA64::r58, IA64::r59,
+ IA64::r60, IA64::r61, IA64::r62, IA64::r63, IA64::r64, IA64::r65,
+ IA64::r66, IA64::r67, IA64::r68, IA64::r69, IA64::r70, IA64::r71,
+ IA64::r72, IA64::r73, IA64::r74, IA64::r75, IA64::r76, IA64::r77,
+ IA64::r78, IA64::r79, IA64::r80, IA64::r81, IA64::r82, IA64::r83,
+ IA64::r84, IA64::r85, IA64::r86, IA64::r87, IA64::r88, IA64::r89,
+ IA64::r90, IA64::r91, IA64::r92, IA64::r93, IA64::r94, IA64::r95,
+ IA64::r96, IA64::r97, IA64::r98, IA64::r99, IA64::r100, IA64::r101,
+ IA64::r102, IA64::r103, IA64::r104, IA64::r105, IA64::r106, IA64::r107,
+ IA64::r108, IA64::r109, IA64::r110, IA64::r111, IA64::r112, IA64::r113,
+ IA64::r114, IA64::r115, IA64::r116, IA64::r117, IA64::r118, IA64::r119,
+ IA64::r120, IA64::r121, IA64::r122, IA64::r123, IA64::r124, IA64::r125,
+ IA64::r126, IA64::r127 };
+
+ unsigned numStackedGPRsUsed=0;
+ for (int i=0; i != 96; i++) {
+ if (MF.getRegInfo().isPhysRegUsed(RegsInOrder[i]))
+      numStackedGPRsUsed=i+1; // (i+1, not ++: consider fn(fp, fp, int))
+ }
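+  // Note the highest-index semantics: if only r34 is marked used (say, the
+  // integer argument of fn(fp, fp, int), whose fp args live in FP regs),
+  // we still need 3 stacked GPRs, since the RSE frame is contiguous
+  // from r32.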
+
+ unsigned numOutRegsUsed=MF.getInfo<IA64FunctionInfo>()->outRegsUsed;
+
+ // XXX FIXME : this code should be a bit more reliable (in case there _isn't_
+ // a pseudo_alloc in the MBB)
+ unsigned dstRegOfPseudoAlloc;
+ for(MBBI = MBB.begin(); /*MBBI->getOpcode() != IA64::PSEUDO_ALLOC*/; ++MBBI) {
+ assert(MBBI != MBB.end());
+ if(MBBI->getOpcode() == IA64::PSEUDO_ALLOC) {
+ dstRegOfPseudoAlloc=MBBI->getOperand(0).getReg();
+ break;
+ }
+ }
+
+ if (MBBI != MBB.end()) dl = MBBI->getDebugLoc();
+
+ BuildMI(MBB, MBBI, dl, TII.get(IA64::ALLOC)).
+ addReg(dstRegOfPseudoAlloc).addImm(0).
+ addImm(numStackedGPRsUsed).addImm(numOutRegsUsed).addImm(0);
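+
+  // Schematically this emits
+  //   alloc <dst> = ar.pfs, 0, <numStackedGPRsUsed>, <numOutRegsUsed>, 0
+  // matching the "alloc $dst = ar.pfs,$inputs,$locals,$outputs,$rotating"
+  // form defined in IA64InstrInfo.td.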
+
+ // Get the number of bytes to allocate from the FrameInfo
+ unsigned NumBytes = MFI->getStackSize();
+
+ if(FP)
+ NumBytes += 8; // reserve space for the old FP
+
+ // Do we need to allocate space on the stack?
+ if (NumBytes == 0)
+ return;
+
+ // Add 16 bytes at the bottom of the stack (scratch area)
+ // and round the size to a multiple of the alignment.
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ unsigned Size = 16 + (FP ? 8 : 0);
+ NumBytes = (NumBytes+Size+Align-1)/Align*Align;
+
+ // Update frame info to pretend that this is part of the stack...
+ MFI->setStackSize(NumBytes);
+
+ // adjust stack pointer: r12 -= numbytes
+ if (NumBytes <= 8191) {
+ BuildMI(MBB, MBBI, dl, TII.get(IA64::ADDIMM22),IA64::r12).addReg(IA64::r12).
+ addImm(-NumBytes);
+ } else { // we use r22 as a scratch register here
+ // first load the decrement into r22
+ BuildMI(MBB, MBBI, dl, TII.get(IA64::MOVLIMM64), IA64::r22).
+ addImm(-NumBytes);
+ // FIXME: MOVLSI32 expects a _u_32imm
+ // then add (subtract) it to r12 (stack ptr)
+ BuildMI(MBB, MBBI, dl, TII.get(IA64::ADD), IA64::r12)
+ .addReg(IA64::r12).addReg(IA64::r22);
+
+ }
+
+ // now if we need to, save the old FP and set the new
+ if (FP) {
+ BuildMI(MBB, MBBI,dl,TII.get(IA64::ST8)).addReg(IA64::r12).addReg(IA64::r5);
+ // this must be the last instr in the prolog ? (XXX: why??)
+ BuildMI(MBB, MBBI, dl, TII.get(IA64::MOV), IA64::r5).addReg(IA64::r12);
+ }
+
+}
+
+void IA64RegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ assert(MBBI->getOpcode() == IA64::RET &&
+ "Can only insert epilog into returning blocks");
+ DebugLoc dl = MBBI->getDebugLoc();
+ bool FP = hasFP(MF);
+
+ // Get the number of bytes allocated from the FrameInfo...
+ unsigned NumBytes = MFI->getStackSize();
+
+  // now, if we need to, restore the old FP
+  if (FP) {
+    // copy the FP into the SP (discards allocas)
+    BuildMI(MBB, MBBI, dl, TII.get(IA64::MOV), IA64::r12).addReg(IA64::r5);
+    // restore the FP
+    BuildMI(MBB, MBBI, dl, TII.get(IA64::LD8), IA64::r5).addReg(IA64::r5);
+ }
+
+ if (NumBytes != 0) {
+ if (NumBytes <= 8191) {
+ BuildMI(MBB, MBBI, dl, TII.get(IA64::ADDIMM22),IA64::r12).
+ addReg(IA64::r12).addImm(NumBytes);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(IA64::MOVLIMM64), IA64::r22).
+ addImm(NumBytes);
+ BuildMI(MBB, MBBI, dl, TII.get(IA64::ADD), IA64::r12).addReg(IA64::r12).
+ addReg(IA64::r22);
+ }
+ }
+}
+
+unsigned IA64RegisterInfo::getRARegister() const {
+ assert(0 && "What is the return address register");
+ return 0;
+}
+
+unsigned IA64RegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ return hasFP(MF) ? IA64::r5 : IA64::r12;
+}
+
+unsigned IA64RegisterInfo::getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned IA64RegisterInfo::getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+int IA64RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ assert(0 && "What is the dwarf register number");
+ return -1;
+}
+
+#include "IA64GenRegisterInfo.inc"
+
diff --git a/lib/Target/IA64/IA64RegisterInfo.h b/lib/Target/IA64/IA64RegisterInfo.h
new file mode 100644
index 0000000..0c5083e
--- /dev/null
+++ b/lib/Target/IA64/IA64RegisterInfo.h
@@ -0,0 +1,63 @@
+//===- IA64RegisterInfo.h - IA64 Register Information Impl ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64REGISTERINFO_H
+#define IA64REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "IA64GenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+struct IA64RegisterInfo : public IA64GenRegisterInfo {
+ const TargetInstrInfo &TII;
+
+ IA64RegisterInfo(const TargetInstrInfo &tii);
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const* getCalleeSavedRegClasses(
+ const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64RegisterInfo.td b/lib/Target/IA64/IA64RegisterInfo.td
new file mode 100644
index 0000000..dd72dc3
--- /dev/null
+++ b/lib/Target/IA64/IA64RegisterInfo.td
@@ -0,0 +1,509 @@
+//===- IA64RegisterInfo.td - Describe the IA64 Register File ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the IA64 register file, defining the registers
+// themselves, aliases between the registers, and the register classes built
+// out of the registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+
+class IA64Register<string n> : Register<n> {
+ let Namespace = "IA64";
+}
+
+// GR - One of 128 64-bit general registers
+class GR<bits<7> num, string n> : IA64Register<n> {
+ field bits<7> Num = num;
+}
+
+// FP - One of 128 82-bit floating-point registers
+class FP<bits<7> num, string n> : IA64Register<n> {
+ field bits<7> Num = num;
+}
+
+// PR - One of 64 1-bit predicate registers
+class PR<bits<6> num, string n> : IA64Register<n> {
+ field bits<6> Num = num;
+}
+
+/* general registers */
+def r0 : GR< 0, "r0">, DwarfRegNum<[0]>;
+def r1 : GR< 1, "r1">, DwarfRegNum<[1]>;
+def r2 : GR< 2, "r2">, DwarfRegNum<[2]>;
+def r3 : GR< 3, "r3">, DwarfRegNum<[3]>;
+def r4 : GR< 4, "r4">, DwarfRegNum<[4]>;
+def r5 : GR< 5, "r5">, DwarfRegNum<[5]>;
+def r6 : GR< 6, "r6">, DwarfRegNum<[6]>;
+def r7 : GR< 7, "r7">, DwarfRegNum<[7]>;
+def r8 : GR< 8, "r8">, DwarfRegNum<[8]>;
+def r9 : GR< 9, "r9">, DwarfRegNum<[9]>;
+def r10 : GR< 10, "r10">, DwarfRegNum<[10]>;
+def r11 : GR< 11, "r11">, DwarfRegNum<[11]>;
+def r12 : GR< 12, "r12">, DwarfRegNum<[12]>;
+def r13 : GR< 13, "r13">, DwarfRegNum<[13]>;
+def r14 : GR< 14, "r14">, DwarfRegNum<[14]>;
+def r15 : GR< 15, "r15">, DwarfRegNum<[15]>;
+def r16 : GR< 16, "r16">, DwarfRegNum<[16]>;
+def r17 : GR< 17, "r17">, DwarfRegNum<[17]>;
+def r18 : GR< 18, "r18">, DwarfRegNum<[18]>;
+def r19 : GR< 19, "r19">, DwarfRegNum<[19]>;
+def r20 : GR< 20, "r20">, DwarfRegNum<[20]>;
+def r21 : GR< 21, "r21">, DwarfRegNum<[21]>;
+def r22 : GR< 22, "r22">, DwarfRegNum<[22]>;
+def r23 : GR< 23, "r23">, DwarfRegNum<[23]>;
+def r24 : GR< 24, "r24">, DwarfRegNum<[24]>;
+def r25 : GR< 25, "r25">, DwarfRegNum<[25]>;
+def r26 : GR< 26, "r26">, DwarfRegNum<[26]>;
+def r27 : GR< 27, "r27">, DwarfRegNum<[27]>;
+def r28 : GR< 28, "r28">, DwarfRegNum<[28]>;
+def r29 : GR< 29, "r29">, DwarfRegNum<[29]>;
+def r30 : GR< 30, "r30">, DwarfRegNum<[30]>;
+def r31 : GR< 31, "r31">, DwarfRegNum<[31]>;
+def r32 : GR< 32, "r32">, DwarfRegNum<[32]>;
+def r33 : GR< 33, "r33">, DwarfRegNum<[33]>;
+def r34 : GR< 34, "r34">, DwarfRegNum<[34]>;
+def r35 : GR< 35, "r35">, DwarfRegNum<[35]>;
+def r36 : GR< 36, "r36">, DwarfRegNum<[36]>;
+def r37 : GR< 37, "r37">, DwarfRegNum<[37]>;
+def r38 : GR< 38, "r38">, DwarfRegNum<[38]>;
+def r39 : GR< 39, "r39">, DwarfRegNum<[39]>;
+def r40 : GR< 40, "r40">, DwarfRegNum<[40]>;
+def r41 : GR< 41, "r41">, DwarfRegNum<[41]>;
+def r42 : GR< 42, "r42">, DwarfRegNum<[42]>;
+def r43 : GR< 43, "r43">, DwarfRegNum<[43]>;
+def r44 : GR< 44, "r44">, DwarfRegNum<[44]>;
+def r45 : GR< 45, "r45">, DwarfRegNum<[45]>;
+def r46 : GR< 46, "r46">, DwarfRegNum<[46]>;
+def r47 : GR< 47, "r47">, DwarfRegNum<[47]>;
+def r48 : GR< 48, "r48">, DwarfRegNum<[48]>;
+def r49 : GR< 49, "r49">, DwarfRegNum<[49]>;
+def r50 : GR< 50, "r50">, DwarfRegNum<[50]>;
+def r51 : GR< 51, "r51">, DwarfRegNum<[51]>;
+def r52 : GR< 52, "r52">, DwarfRegNum<[52]>;
+def r53 : GR< 53, "r53">, DwarfRegNum<[53]>;
+def r54 : GR< 54, "r54">, DwarfRegNum<[54]>;
+def r55 : GR< 55, "r55">, DwarfRegNum<[55]>;
+def r56 : GR< 56, "r56">, DwarfRegNum<[56]>;
+def r57 : GR< 57, "r57">, DwarfRegNum<[57]>;
+def r58 : GR< 58, "r58">, DwarfRegNum<[58]>;
+def r59 : GR< 59, "r59">, DwarfRegNum<[59]>;
+def r60 : GR< 60, "r60">, DwarfRegNum<[60]>;
+def r61 : GR< 61, "r61">, DwarfRegNum<[61]>;
+def r62 : GR< 62, "r62">, DwarfRegNum<[62]>;
+def r63 : GR< 63, "r63">, DwarfRegNum<[63]>;
+def r64 : GR< 64, "r64">, DwarfRegNum<[64]>;
+def r65 : GR< 65, "r65">, DwarfRegNum<[65]>;
+def r66 : GR< 66, "r66">, DwarfRegNum<[66]>;
+def r67 : GR< 67, "r67">, DwarfRegNum<[67]>;
+def r68 : GR< 68, "r68">, DwarfRegNum<[68]>;
+def r69 : GR< 69, "r69">, DwarfRegNum<[69]>;
+def r70 : GR< 70, "r70">, DwarfRegNum<[70]>;
+def r71 : GR< 71, "r71">, DwarfRegNum<[71]>;
+def r72 : GR< 72, "r72">, DwarfRegNum<[72]>;
+def r73 : GR< 73, "r73">, DwarfRegNum<[73]>;
+def r74 : GR< 74, "r74">, DwarfRegNum<[74]>;
+def r75 : GR< 75, "r75">, DwarfRegNum<[75]>;
+def r76 : GR< 76, "r76">, DwarfRegNum<[76]>;
+def r77 : GR< 77, "r77">, DwarfRegNum<[77]>;
+def r78 : GR< 78, "r78">, DwarfRegNum<[78]>;
+def r79 : GR< 79, "r79">, DwarfRegNum<[79]>;
+def r80 : GR< 80, "r80">, DwarfRegNum<[80]>;
+def r81 : GR< 81, "r81">, DwarfRegNum<[81]>;
+def r82 : GR< 82, "r82">, DwarfRegNum<[82]>;
+def r83 : GR< 83, "r83">, DwarfRegNum<[83]>;
+def r84 : GR< 84, "r84">, DwarfRegNum<[84]>;
+def r85 : GR< 85, "r85">, DwarfRegNum<[85]>;
+def r86 : GR< 86, "r86">, DwarfRegNum<[86]>;
+def r87 : GR< 87, "r87">, DwarfRegNum<[87]>;
+def r88 : GR< 88, "r88">, DwarfRegNum<[88]>;
+def r89 : GR< 89, "r89">, DwarfRegNum<[89]>;
+def r90 : GR< 90, "r90">, DwarfRegNum<[90]>;
+def r91 : GR< 91, "r91">, DwarfRegNum<[91]>;
+def r92 : GR< 92, "r92">, DwarfRegNum<[92]>;
+def r93 : GR< 93, "r93">, DwarfRegNum<[93]>;
+def r94 : GR< 94, "r94">, DwarfRegNum<[94]>;
+def r95 : GR< 95, "r95">, DwarfRegNum<[95]>;
+def r96 : GR< 96, "r96">, DwarfRegNum<[96]>;
+def r97 : GR< 97, "r97">, DwarfRegNum<[97]>;
+def r98 : GR< 98, "r98">, DwarfRegNum<[98]>;
+def r99 : GR< 99, "r99">, DwarfRegNum<[99]>;
+def r100 : GR< 100, "r100">, DwarfRegNum<[100]>;
+def r101 : GR< 101, "r101">, DwarfRegNum<[101]>;
+def r102 : GR< 102, "r102">, DwarfRegNum<[102]>;
+def r103 : GR< 103, "r103">, DwarfRegNum<[103]>;
+def r104 : GR< 104, "r104">, DwarfRegNum<[104]>;
+def r105 : GR< 105, "r105">, DwarfRegNum<[105]>;
+def r106 : GR< 106, "r106">, DwarfRegNum<[106]>;
+def r107 : GR< 107, "r107">, DwarfRegNum<[107]>;
+def r108 : GR< 108, "r108">, DwarfRegNum<[108]>;
+def r109 : GR< 109, "r109">, DwarfRegNum<[109]>;
+def r110 : GR< 110, "r110">, DwarfRegNum<[110]>;
+def r111 : GR< 111, "r111">, DwarfRegNum<[111]>;
+def r112 : GR< 112, "r112">, DwarfRegNum<[112]>;
+def r113 : GR< 113, "r113">, DwarfRegNum<[113]>;
+def r114 : GR< 114, "r114">, DwarfRegNum<[114]>;
+def r115 : GR< 115, "r115">, DwarfRegNum<[115]>;
+def r116 : GR< 116, "r116">, DwarfRegNum<[116]>;
+def r117 : GR< 117, "r117">, DwarfRegNum<[117]>;
+def r118 : GR< 118, "r118">, DwarfRegNum<[118]>;
+def r119 : GR< 119, "r119">, DwarfRegNum<[119]>;
+def r120 : GR< 120, "r120">, DwarfRegNum<[120]>;
+def r121 : GR< 121, "r121">, DwarfRegNum<[121]>;
+def r122 : GR< 122, "r122">, DwarfRegNum<[122]>;
+def r123 : GR< 123, "r123">, DwarfRegNum<[123]>;
+def r124 : GR< 124, "r124">, DwarfRegNum<[124]>;
+def r125 : GR< 125, "r125">, DwarfRegNum<[125]>;
+def r126 : GR< 126, "r126">, DwarfRegNum<[126]>;
+def r127 : GR< 127, "r127">, DwarfRegNum<[127]>;
+
+/* floating-point registers */
+def F0 : FP< 0, "f0">, DwarfRegNum<[128]>;
+def F1 : FP< 1, "f1">, DwarfRegNum<[129]>;
+def F2 : FP< 2, "f2">, DwarfRegNum<[130]>;
+def F3 : FP< 3, "f3">, DwarfRegNum<[131]>;
+def F4 : FP< 4, "f4">, DwarfRegNum<[132]>;
+def F5 : FP< 5, "f5">, DwarfRegNum<[133]>;
+def F6 : FP< 6, "f6">, DwarfRegNum<[134]>;
+def F7 : FP< 7, "f7">, DwarfRegNum<[135]>;
+def F8 : FP< 8, "f8">, DwarfRegNum<[136]>;
+def F9 : FP< 9, "f9">, DwarfRegNum<[137]>;
+def F10 : FP< 10, "f10">, DwarfRegNum<[138]>;
+def F11 : FP< 11, "f11">, DwarfRegNum<[139]>;
+def F12 : FP< 12, "f12">, DwarfRegNum<[140]>;
+def F13 : FP< 13, "f13">, DwarfRegNum<[141]>;
+def F14 : FP< 14, "f14">, DwarfRegNum<[142]>;
+def F15 : FP< 15, "f15">, DwarfRegNum<[143]>;
+def F16 : FP< 16, "f16">, DwarfRegNum<[144]>;
+def F17 : FP< 17, "f17">, DwarfRegNum<[145]>;
+def F18 : FP< 18, "f18">, DwarfRegNum<[146]>;
+def F19 : FP< 19, "f19">, DwarfRegNum<[147]>;
+def F20 : FP< 20, "f20">, DwarfRegNum<[148]>;
+def F21 : FP< 21, "f21">, DwarfRegNum<[149]>;
+def F22 : FP< 22, "f22">, DwarfRegNum<[150]>;
+def F23 : FP< 23, "f23">, DwarfRegNum<[151]>;
+def F24 : FP< 24, "f24">, DwarfRegNum<[152]>;
+def F25 : FP< 25, "f25">, DwarfRegNum<[153]>;
+def F26 : FP< 26, "f26">, DwarfRegNum<[154]>;
+def F27 : FP< 27, "f27">, DwarfRegNum<[155]>;
+def F28 : FP< 28, "f28">, DwarfRegNum<[156]>;
+def F29 : FP< 29, "f29">, DwarfRegNum<[157]>;
+def F30 : FP< 30, "f30">, DwarfRegNum<[158]>;
+def F31 : FP< 31, "f31">, DwarfRegNum<[159]>;
+def F32 : FP< 32, "f32">, DwarfRegNum<[160]>;
+def F33 : FP< 33, "f33">, DwarfRegNum<[161]>;
+def F34 : FP< 34, "f34">, DwarfRegNum<[162]>;
+def F35 : FP< 35, "f35">, DwarfRegNum<[163]>;
+def F36 : FP< 36, "f36">, DwarfRegNum<[164]>;
+def F37 : FP< 37, "f37">, DwarfRegNum<[165]>;
+def F38 : FP< 38, "f38">, DwarfRegNum<[166]>;
+def F39 : FP< 39, "f39">, DwarfRegNum<[167]>;
+def F40 : FP< 40, "f40">, DwarfRegNum<[168]>;
+def F41 : FP< 41, "f41">, DwarfRegNum<[169]>;
+def F42 : FP< 42, "f42">, DwarfRegNum<[170]>;
+def F43 : FP< 43, "f43">, DwarfRegNum<[171]>;
+def F44 : FP< 44, "f44">, DwarfRegNum<[172]>;
+def F45 : FP< 45, "f45">, DwarfRegNum<[173]>;
+def F46 : FP< 46, "f46">, DwarfRegNum<[174]>;
+def F47 : FP< 47, "f47">, DwarfRegNum<[175]>;
+def F48 : FP< 48, "f48">, DwarfRegNum<[176]>;
+def F49 : FP< 49, "f49">, DwarfRegNum<[177]>;
+def F50 : FP< 50, "f50">, DwarfRegNum<[178]>;
+def F51 : FP< 51, "f51">, DwarfRegNum<[179]>;
+def F52 : FP< 52, "f52">, DwarfRegNum<[180]>;
+def F53 : FP< 53, "f53">, DwarfRegNum<[181]>;
+def F54 : FP< 54, "f54">, DwarfRegNum<[182]>;
+def F55 : FP< 55, "f55">, DwarfRegNum<[183]>;
+def F56 : FP< 56, "f56">, DwarfRegNum<[184]>;
+def F57 : FP< 57, "f57">, DwarfRegNum<[185]>;
+def F58 : FP< 58, "f58">, DwarfRegNum<[186]>;
+def F59 : FP< 59, "f59">, DwarfRegNum<[187]>;
+def F60 : FP< 60, "f60">, DwarfRegNum<[188]>;
+def F61 : FP< 61, "f61">, DwarfRegNum<[189]>;
+def F62 : FP< 62, "f62">, DwarfRegNum<[190]>;
+def F63 : FP< 63, "f63">, DwarfRegNum<[191]>;
+def F64 : FP< 64, "f64">, DwarfRegNum<[192]>;
+def F65 : FP< 65, "f65">, DwarfRegNum<[193]>;
+def F66 : FP< 66, "f66">, DwarfRegNum<[194]>;
+def F67 : FP< 67, "f67">, DwarfRegNum<[195]>;
+def F68 : FP< 68, "f68">, DwarfRegNum<[196]>;
+def F69 : FP< 69, "f69">, DwarfRegNum<[197]>;
+def F70 : FP< 70, "f70">, DwarfRegNum<[198]>;
+def F71 : FP< 71, "f71">, DwarfRegNum<[199]>;
+def F72 : FP< 72, "f72">, DwarfRegNum<[200]>;
+def F73 : FP< 73, "f73">, DwarfRegNum<[201]>;
+def F74 : FP< 74, "f74">, DwarfRegNum<[202]>;
+def F75 : FP< 75, "f75">, DwarfRegNum<[203]>;
+def F76 : FP< 76, "f76">, DwarfRegNum<[204]>;
+def F77 : FP< 77, "f77">, DwarfRegNum<[205]>;
+def F78 : FP< 78, "f78">, DwarfRegNum<[206]>;
+def F79 : FP< 79, "f79">, DwarfRegNum<[207]>;
+def F80 : FP< 80, "f80">, DwarfRegNum<[208]>;
+def F81 : FP< 81, "f81">, DwarfRegNum<[209]>;
+def F82 : FP< 82, "f82">, DwarfRegNum<[210]>;
+def F83 : FP< 83, "f83">, DwarfRegNum<[211]>;
+def F84 : FP< 84, "f84">, DwarfRegNum<[212]>;
+def F85 : FP< 85, "f85">, DwarfRegNum<[213]>;
+def F86 : FP< 86, "f86">, DwarfRegNum<[214]>;
+def F87 : FP< 87, "f87">, DwarfRegNum<[215]>;
+def F88 : FP< 88, "f88">, DwarfRegNum<[216]>;
+def F89 : FP< 89, "f89">, DwarfRegNum<[217]>;
+def F90 : FP< 90, "f90">, DwarfRegNum<[218]>;
+def F91 : FP< 91, "f91">, DwarfRegNum<[219]>;
+def F92 : FP< 92, "f92">, DwarfRegNum<[220]>;
+def F93 : FP< 93, "f93">, DwarfRegNum<[221]>;
+def F94 : FP< 94, "f94">, DwarfRegNum<[222]>;
+def F95 : FP< 95, "f95">, DwarfRegNum<[223]>;
+def F96 : FP< 96, "f96">, DwarfRegNum<[224]>;
+def F97 : FP< 97, "f97">, DwarfRegNum<[225]>;
+def F98 : FP< 98, "f98">, DwarfRegNum<[226]>;
+def F99 : FP< 99, "f99">, DwarfRegNum<[227]>;
+def F100 : FP< 100, "f100">, DwarfRegNum<[228]>;
+def F101 : FP< 101, "f101">, DwarfRegNum<[229]>;
+def F102 : FP< 102, "f102">, DwarfRegNum<[230]>;
+def F103 : FP< 103, "f103">, DwarfRegNum<[231]>;
+def F104 : FP< 104, "f104">, DwarfRegNum<[232]>;
+def F105 : FP< 105, "f105">, DwarfRegNum<[233]>;
+def F106 : FP< 106, "f106">, DwarfRegNum<[234]>;
+def F107 : FP< 107, "f107">, DwarfRegNum<[235]>;
+def F108 : FP< 108, "f108">, DwarfRegNum<[236]>;
+def F109 : FP< 109, "f109">, DwarfRegNum<[237]>;
+def F110 : FP< 110, "f110">, DwarfRegNum<[238]>;
+def F111 : FP< 111, "f111">, DwarfRegNum<[239]>;
+def F112 : FP< 112, "f112">, DwarfRegNum<[240]>;
+def F113 : FP< 113, "f113">, DwarfRegNum<[241]>;
+def F114 : FP< 114, "f114">, DwarfRegNum<[242]>;
+def F115 : FP< 115, "f115">, DwarfRegNum<[243]>;
+def F116 : FP< 116, "f116">, DwarfRegNum<[244]>;
+def F117 : FP< 117, "f117">, DwarfRegNum<[245]>;
+def F118 : FP< 118, "f118">, DwarfRegNum<[246]>;
+def F119 : FP< 119, "f119">, DwarfRegNum<[247]>;
+def F120 : FP< 120, "f120">, DwarfRegNum<[248]>;
+def F121 : FP< 121, "f121">, DwarfRegNum<[249]>;
+def F122 : FP< 122, "f122">, DwarfRegNum<[250]>;
+def F123 : FP< 123, "f123">, DwarfRegNum<[251]>;
+def F124 : FP< 124, "f124">, DwarfRegNum<[252]>;
+def F125 : FP< 125, "f125">, DwarfRegNum<[253]>;
+def F126 : FP< 126, "f126">, DwarfRegNum<[254]>;
+def F127 : FP< 127, "f127">, DwarfRegNum<[255]>;
+
+/* predicate registers */
+def p0 : PR< 0, "p0">, DwarfRegNum<[256]>;
+def p1 : PR< 1, "p1">, DwarfRegNum<[257]>;
+def p2 : PR< 2, "p2">, DwarfRegNum<[258]>;
+def p3 : PR< 3, "p3">, DwarfRegNum<[259]>;
+def p4 : PR< 4, "p4">, DwarfRegNum<[260]>;
+def p5 : PR< 5, "p5">, DwarfRegNum<[261]>;
+def p6 : PR< 6, "p6">, DwarfRegNum<[262]>;
+def p7 : PR< 7, "p7">, DwarfRegNum<[263]>;
+def p8 : PR< 8, "p8">, DwarfRegNum<[264]>;
+def p9 : PR< 9, "p9">, DwarfRegNum<[265]>;
+def p10 : PR< 10, "p10">, DwarfRegNum<[266]>;
+def p11 : PR< 11, "p11">, DwarfRegNum<[267]>;
+def p12 : PR< 12, "p12">, DwarfRegNum<[268]>;
+def p13 : PR< 13, "p13">, DwarfRegNum<[269]>;
+def p14 : PR< 14, "p14">, DwarfRegNum<[270]>;
+def p15 : PR< 15, "p15">, DwarfRegNum<[271]>;
+def p16 : PR< 16, "p16">, DwarfRegNum<[272]>;
+def p17 : PR< 17, "p17">, DwarfRegNum<[273]>;
+def p18 : PR< 18, "p18">, DwarfRegNum<[274]>;
+def p19 : PR< 19, "p19">, DwarfRegNum<[275]>;
+def p20 : PR< 20, "p20">, DwarfRegNum<[276]>;
+def p21 : PR< 21, "p21">, DwarfRegNum<[277]>;
+def p22 : PR< 22, "p22">, DwarfRegNum<[278]>;
+def p23 : PR< 23, "p23">, DwarfRegNum<[279]>;
+def p24 : PR< 24, "p24">, DwarfRegNum<[280]>;
+def p25 : PR< 25, "p25">, DwarfRegNum<[281]>;
+def p26 : PR< 26, "p26">, DwarfRegNum<[282]>;
+def p27 : PR< 27, "p27">, DwarfRegNum<[283]>;
+def p28 : PR< 28, "p28">, DwarfRegNum<[284]>;
+def p29 : PR< 29, "p29">, DwarfRegNum<[285]>;
+def p30 : PR< 30, "p30">, DwarfRegNum<[286]>;
+def p31 : PR< 31, "p31">, DwarfRegNum<[287]>;
+def p32 : PR< 32, "p32">, DwarfRegNum<[288]>;
+def p33 : PR< 33, "p33">, DwarfRegNum<[289]>;
+def p34 : PR< 34, "p34">, DwarfRegNum<[290]>;
+def p35 : PR< 35, "p35">, DwarfRegNum<[291]>;
+def p36 : PR< 36, "p36">, DwarfRegNum<[292]>;
+def p37 : PR< 37, "p37">, DwarfRegNum<[293]>;
+def p38 : PR< 38, "p38">, DwarfRegNum<[294]>;
+def p39 : PR< 39, "p39">, DwarfRegNum<[295]>;
+def p40 : PR< 40, "p40">, DwarfRegNum<[296]>;
+def p41 : PR< 41, "p41">, DwarfRegNum<[297]>;
+def p42 : PR< 42, "p42">, DwarfRegNum<[298]>;
+def p43 : PR< 43, "p43">, DwarfRegNum<[299]>;
+def p44 : PR< 44, "p44">, DwarfRegNum<[300]>;
+def p45 : PR< 45, "p45">, DwarfRegNum<[301]>;
+def p46 : PR< 46, "p46">, DwarfRegNum<[302]>;
+def p47 : PR< 47, "p47">, DwarfRegNum<[303]>;
+def p48 : PR< 48, "p48">, DwarfRegNum<[304]>;
+def p49 : PR< 49, "p49">, DwarfRegNum<[305]>;
+def p50 : PR< 50, "p50">, DwarfRegNum<[306]>;
+def p51 : PR< 51, "p51">, DwarfRegNum<[307]>;
+def p52 : PR< 52, "p52">, DwarfRegNum<[308]>;
+def p53 : PR< 53, "p53">, DwarfRegNum<[309]>;
+def p54 : PR< 54, "p54">, DwarfRegNum<[310]>;
+def p55 : PR< 55, "p55">, DwarfRegNum<[311]>;
+def p56 : PR< 56, "p56">, DwarfRegNum<[312]>;
+def p57 : PR< 57, "p57">, DwarfRegNum<[313]>;
+def p58 : PR< 58, "p58">, DwarfRegNum<[314]>;
+def p59 : PR< 59, "p59">, DwarfRegNum<[315]>;
+def p60 : PR< 60, "p60">, DwarfRegNum<[316]>;
+def p61 : PR< 61, "p61">, DwarfRegNum<[317]>;
+def p62 : PR< 62, "p62">, DwarfRegNum<[318]>;
+def p63 : PR< 63, "p63">, DwarfRegNum<[319]>;
+
+// XXX: this is temporary; we'll eventually have the output registers
+// in the general-purpose register class too(?)
+def out0 : GR<0, "out0">, DwarfRegNum<[120]>;
+def out1 : GR<1, "out1">, DwarfRegNum<[121]>;
+def out2 : GR<2, "out2">, DwarfRegNum<[122]>;
+def out3 : GR<3, "out3">, DwarfRegNum<[123]>;
+def out4 : GR<4, "out4">, DwarfRegNum<[124]>;
+def out5 : GR<5, "out5">, DwarfRegNum<[125]>;
+def out6 : GR<6, "out6">, DwarfRegNum<[126]>;
+def out7 : GR<7, "out7">, DwarfRegNum<[127]>;
+
+// application (special) registers:
+
+// "previous function state" application register
+def AR_PFS : GR<0, "ar.pfs">, DwarfRegNum<[331]>;
+
+// "return pointer" (this is really branch register b0)
+def rp : GR<0, "rp">, DwarfRegNum<[-1]>;
+
+// branch reg 6
+def B6 : GR<0, "b6">, DwarfRegNum<[326]>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// these are the scratch (+stacked) general registers
+// FIXME/XXX we also reserve a frame pointer (r5)
+// FIXME/XXX we also reserve r2 for spilling/filling predicates
+// in IA64RegisterInfo.cpp
+// FIXME/XXX we also reserve r22 for calculating addresses
+// in IA64RegisterInfo.cpp
+
+def GR : RegisterClass<"IA64", [i64], 64,
+ [
+
+//FIXME!: for both readability and performance, we don't want the out
+// registers to be the first ones allocated
+
+ out7, out6, out5, out4, out3, out2, out1, out0,
+ r3, r8, r9, r10, r11, r14, r15,
+ r16, r17, r18, r19, r20, r21, r23,
+ r24, r25, r26, r27, r28, r29, r30, r31,
+ r32, r33, r34, r35, r36, r37, r38, r39,
+ r40, r41, r42, r43, r44, r45, r46, r47,
+ r48, r49, r50, r51, r52, r53, r54, r55,
+ r56, r57, r58, r59, r60, r61, r62, r63,
+ r64, r65, r66, r67, r68, r69, r70, r71,
+ r72, r73, r74, r75, r76, r77, r78, r79,
+ r80, r81, r82, r83, r84, r85, r86, r87,
+ r88, r89, r90, r91, r92, r93, r94, r95,
+ r96, r97, r98, r99, r100, r101, r102, r103,
+ r104, r105, r106, r107, r108, r109, r110, r111,
+ r112, r113, r114, r115, r116, r117, r118, r119,
+ // last 17 are special (look down)
+ r120, r121, r122, r123, r124, r125, r126, r127,
+ r0, r1, r2, r5, r12, r13, r22, rp, AR_PFS]>
+ {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GRClass::iterator
+ GRClass::allocation_order_begin(const MachineFunction &MF) const {
+ // hide the 8 out? registers appropriately:
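+      // (e.g. with outRegsUsed==3 we skip out7..out3, so allocation starts
+      //  at out2)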
+ return begin()+(8-(MF.getInfo<IA64FunctionInfo>()->outRegsUsed));
+ }
+
+ GRClass::iterator
+ GRClass::allocation_order_end(const MachineFunction &MF) const {
+      // the 9 special registers: r0, r1, r2, r5, r12, r13, r22, rp and AR_PFS
+ int numReservedRegs=9;
+
+ // we also can't allocate registers for use as locals if they're already
+ // required as 'out' registers
+ numReservedRegs+=MF.getInfo<IA64FunctionInfo>()->outRegsUsed;
+ return end()-numReservedRegs; // hide registers appropriately
+ }
+ }];
+}
+
+
+// these are the scratch (+stacked) FP registers
+
+def FP : RegisterClass<"IA64", [f64], 64,
+ [F6, F7,
+ F8, F9, F10, F11, F12, F13, F14, F15,
+ F32, F33, F34, F35, F36, F37, F38, F39,
+ F40, F41, F42, F43, F44, F45, F46, F47,
+ F48, F49, F50, F51, F52, F53, F54, F55,
+ F56, F57, F58, F59, F60, F61, F62, F63,
+ F64, F65, F66, F67, F68, F69, F70, F71,
+ F72, F73, F74, F75, F76, F77, F78, F79,
+ F80, F81, F82, F83, F84, F85, F86, F87,
+ F88, F89, F90, F91, F92, F93, F94, F95,
+ F96, F97, F98, F99, F100, F101, F102, F103,
+ F104, F105, F106, F107, F108, F109, F110, F111,
+ F112, F113, F114, F115, F116, F117, F118, F119,
+ F120, F121, F122, F123, F124, F125, F126, F127,
+ F0, F1]> // these last two are hidden
+ {
+// The Size/Alignment of 128 here keeps stf.spill/ldf.fill happy:
+// when storing full (82-bit) FP regs to stack slots,
+// we need them 16-byte aligned.
+ let Size=128;
+ let Alignment=128;
+
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FPClass::iterator
+ FPClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin(); // we don't hide any FP regs from the start
+ }
+
+ FPClass::iterator
+ FPClass::allocation_order_end(const MachineFunction &MF) const {
+ return end()-2; // we hide regs F0, F1 from the end
+ }
+ }];
+}
+
+// these are the predicate registers, p0 (1/TRUE) is not here
+def PR : RegisterClass<"IA64", [i1], 64,
+
+// for now, let's be wimps and only have the scratch predicate regs
+ [p6, p7, p8, p9, p10, p11, p12, p13, p14, p15]> {
+ let Size = 64;
+ }
+
+/*
+ [p1, p2, p3, p4, p5, p6, p7,
+ p8, p9, p10, p11, p12, p13, p14, p15,
+ p16, p17, p18, p19, p20, p21, p22, p23,
+ p24, p25, p26, p27, p28, p29, p30, p31,
+ p32, p33, p34, p35, p36, p37, p38, p39,
+ p40, p41, p42, p43, p44, p45, p46, p47,
+ p48, p49, p50, p51, p52, p53, p54, p55,
+ p56, p57, p58, p59, p60, p61, p62, p63]>;
+ */
diff --git a/lib/Target/IA64/IA64Subtarget.cpp b/lib/Target/IA64/IA64Subtarget.cpp
new file mode 100644
index 0000000..4eca50b
--- /dev/null
+++ b/lib/Target/IA64/IA64Subtarget.cpp
@@ -0,0 +1,18 @@
+//===-- IA64Subtarget.cpp - IA64 Subtarget Information ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "subtarget"
+#include "IA64Subtarget.h"
+using namespace llvm;
+
+IA64Subtarget::IA64Subtarget() {}
diff --git a/lib/Target/IA64/IA64Subtarget.h b/lib/Target/IA64/IA64Subtarget.h
new file mode 100644
index 0000000..0387af5
--- /dev/null
+++ b/lib/Target/IA64/IA64Subtarget.h
@@ -0,0 +1,28 @@
+//====---- IA64Subtarget.h - Define Subtarget for the IA64 -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the IA64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64SUBTARGET_H
+#define IA64SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+namespace llvm {
+
+class IA64Subtarget : public TargetSubtarget {
+public:
+ IA64Subtarget();
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/IA64/IA64TargetAsmInfo.cpp b/lib/Target/IA64/IA64TargetAsmInfo.cpp
new file mode 100644
index 0000000..2ae8beb
--- /dev/null
+++ b/lib/Target/IA64/IA64TargetAsmInfo.cpp
@@ -0,0 +1,44 @@
+//===-- IA64TargetAsmInfo.cpp - IA64 asm properties -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the IA64TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64TargetAsmInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+IA64TargetAsmInfo::IA64TargetAsmInfo(const TargetMachine &TM):
+ ELFTargetAsmInfo(TM) {
+ CommentString = "//";
+ Data8bitsDirective = "\tdata1\t"; // FIXME: check that we are
+ Data16bitsDirective = "\tdata2.ua\t"; // disabling auto-alignment
+ Data32bitsDirective = "\tdata4.ua\t"; // properly
+ Data64bitsDirective = "\tdata8.ua\t";
+ ZeroDirective = "\t.skip\t";
+ AsciiDirective = "\tstring\t";
+
+ GlobalVarAddrPrefix="";
+ GlobalVarAddrSuffix="";
+ FunctionAddrPrefix="@fptr(";
+ FunctionAddrSuffix=")";
+
+ // FIXME: would be nice to have rodata (no 'w') when appropriate?
+ ConstantPoolSection = "\n\t.section .data, \"aw\", \"progbits\"\n";
+}
+
+unsigned IA64TargetAsmInfo::RelocBehaviour() const {
+ return (TM.getRelocationModel() != Reloc::Static ?
+ Reloc::LocalOrGlobal : Reloc::Global);
+}
+
+// FIXME: Support small data/bss/rodata sections someday.
diff --git a/lib/Target/IA64/IA64TargetAsmInfo.h b/lib/Target/IA64/IA64TargetAsmInfo.h
new file mode 100644
index 0000000..130822e
--- /dev/null
+++ b/lib/Target/IA64/IA64TargetAsmInfo.h
@@ -0,0 +1,33 @@
+//=====-- IA64TargetAsmInfo.h - IA64 asm properties -----------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the IA64TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64TARGETASMINFO_H
+#define IA64TARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+
+namespace llvm {
+
+ // Forward declaration.
+ class TargetMachine;
+
+ struct IA64TargetAsmInfo : public ELFTargetAsmInfo {
+ explicit IA64TargetAsmInfo(const TargetMachine &TM);
+ virtual unsigned RelocBehaviour() const;
+ };
+
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/IA64/IA64TargetMachine.cpp b/lib/Target/IA64/IA64TargetMachine.cpp
new file mode 100644
index 0000000..878a00a
--- /dev/null
+++ b/lib/Target/IA64/IA64TargetMachine.cpp
@@ -0,0 +1,94 @@
+//===-- IA64TargetMachine.cpp - Define TargetMachine for IA64 -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64TargetAsmInfo.h"
+#include "IA64TargetMachine.h"
+#include "IA64.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+/// IA64TargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library. In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this. Though it is unused, do not remove it.
+extern "C" int IA64TargetMachineModule;
+int IA64TargetMachineModule = 0;
+
+static RegisterTarget<IA64TargetMachine> X("ia64",
+ "IA-64 (Itanium) [experimental]");
+
+const TargetAsmInfo *IA64TargetMachine::createTargetAsmInfo() const {
+ return new IA64TargetAsmInfo(*this);
+}
+
+unsigned IA64TargetMachine::getModuleMatchQuality(const Module &M) {
+ // we match [iI][aA]*64
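+  // (e.g. "ia64-unknown-linux-gnu" and "IA64" both match)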
+ bool seenIA64=false;
+ std::string TT = M.getTargetTriple();
+
+ if (TT.size() >= 4) {
+ if( (TT[0]=='i' || TT[0]=='I') &&
+ (TT[1]=='a' || TT[1]=='A') ) {
+ for(unsigned int i=2; i<(TT.size()-1); i++)
+ if(TT[i]=='6' && TT[i+1]=='4')
+ seenIA64=true;
+ }
+
+ if (seenIA64)
+ return 20; // strong match
+ }
+ // If the target triple is something non-ia64, we don't match.
+ if (!TT.empty()) return 0;
+
+#if defined(__ia64__) || defined(__IA64__)
+ return 5;
+#else
+ return 0;
+#endif
+}
+
+/// IA64TargetMachine ctor - Create an LP64 architecture model
+///
+IA64TargetMachine::IA64TargetMachine(const Module &M, const std::string &FS)
+ : DataLayout("e-f80:128:128"),
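+    // little-endian; f80 values are 128-bit aligned (presumably matching
+    // the 16-byte stf.spill/ldf.fill slots for the 82-bit FP regs)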
+ FrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
+ TLInfo(*this) { // FIXME? check this stuff
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool IA64TargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel){
+ PM.add(createIA64DAGToDAGInstructionSelector(*this));
+ return false;
+}
+
+bool IA64TargetMachine::addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Make sure everything is bundled happily
+ PM.add(createIA64BundlingPass(*this));
+ return true;
+}
+
+bool IA64TargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ PM.add(createIA64CodePrinterPass(Out, *this, OptLevel, Verbose));
+ return false;
+}
+
diff --git a/lib/Target/IA64/IA64TargetMachine.h b/lib/Target/IA64/IA64TargetMachine.h
new file mode 100644
index 0000000..29d625c
--- /dev/null
+++ b/lib/Target/IA64/IA64TargetMachine.h
@@ -0,0 +1,64 @@
+//===-- IA64TargetMachine.h - Define TargetMachine for IA64 ---*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the IA64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_IA64TARGETMACHINE_H
+#define LLVM_TARGET_IA64TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "IA64InstrInfo.h"
+#include "IA64ISelLowering.h"
+#include "IA64Subtarget.h"
+
+namespace llvm {
+
+class IA64TargetMachine : public LLVMTargetMachine {
+ IA64Subtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ IA64InstrInfo InstrInfo;
+ TargetFrameInfo FrameInfo;
+ //IA64JITInfo JITInfo;
+ IA64TargetLowering TLInfo;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+ IA64TargetMachine(const Module &M, const std::string &FS);
+
+ virtual const IA64InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual const IA64Subtarget *getSubtargetImpl() const { return &Subtarget; }
+ virtual IA64TargetLowering *getTargetLowering() const {
+ return const_cast<IA64TargetLowering*>(&TLInfo);
+ }
+ virtual const IA64RegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+
+ static unsigned getModuleMatchQuality(const Module &M);
+
+ // Pass Pipeline Configuration
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+};
+} // End llvm namespace
+
+#endif
+
+
diff --git a/lib/Target/IA64/Makefile b/lib/Target/IA64/Makefile
new file mode 100644
index 0000000..d383254
--- /dev/null
+++ b/lib/Target/IA64/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Target/IA64/Makefile -----------------------------*- Makefile -*-===##
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMIA64CodeGen
+TARGET = IA64
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = IA64GenRegisterInfo.h.inc IA64GenRegisterNames.inc \
+ IA64GenRegisterInfo.inc IA64GenInstrNames.inc \
+ IA64GenInstrInfo.inc IA64GenAsmWriter.inc \
+ IA64GenDAGISel.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/IA64/README b/lib/Target/IA64/README
new file mode 100644
index 0000000..60761ac
--- /dev/null
+++ b/lib/Target/IA64/README
@@ -0,0 +1,48 @@
+TODO:
+ - Un-bitrot ISel
+ - Hook up If-Conversion a la ARM target
+ - Hook up all branch analysis functions
+ - Instruction scheduling
+ - Bundling
+ - Dynamic Optimization
+ - Testing and bugfixing
+ - stop passing FP args in both FP *and* integer regs when not required
+ - allocate low (nonstacked) registers more aggressively
+ - clean up and thoroughly test the isel patterns.
+ - fix stacked register allocation order: (for readability) we don't want
+   the out? registers to be the first ones used
+ - fix up floating point
+ (nb http://gcc.gnu.org/wiki?pagename=ia64%20floating%20point )
+ - bundling!
+ (we will avoid the mess that is:
+ http://gcc.gnu.org/ml/gcc/2003-12/msg00832.html )
+ - instruction scheduling (hmmmm! ;)
+ - counted loop support
+ - make integer + FP mul/div more clever (we have fixed pseudocode atm)
+ - track and use comparison complements
+
+INFO:
+ - we are strictly LP64 here, no support for ILP32 on HP-UX. Linux users
+ don't need to worry about this.
+ - I have instruction scheduling/bundling pseudocode that really works
+   (it has been tested, albeit at the perl-script level),
+   so before you go write your own, send me an email!
+
+KNOWN DEFECTS AT THE CURRENT TIME:
+ - C++ vtables contain naked function pointers, not function descriptors,
+ which is bad. see http://llvm.cs.uiuc.edu/bugs/show_bug.cgi?id=406
+ - varargs are broken
+ - alloca doesn't work (indeed, stack frame layout is bogus)
+ - no support for big-endian environments
+ - (not really the backend, but...) the CFE has some issues on IA64.
+ these will probably be fixed soon.
+
+ACKNOWLEDGEMENTS:
+ - Chris Lattner (x100)
+ - Other LLVM developers ("hey, that looks familiar")
+
+CONTACT:
+ - You can email me at duraid@octopus.com.au. If you find a small bug,
+ just email me. If you find a big bug, please file a bug report
+ in bugzilla! http://llvm.cs.uiuc.edu is your one stop shop for all
+ things LLVM.
diff --git a/lib/Target/MSIL/CMakeLists.txt b/lib/Target/MSIL/CMakeLists.txt
new file mode 100644
index 0000000..b1d47ef
--- /dev/null
+++ b/lib/Target/MSIL/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_target(MSIL
+ MSILWriter.cpp
+ )
diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp
new file mode 100644
index 0000000..ada851d
--- /dev/null
+++ b/lib/Target/MSIL/MSILWriter.cpp
@@ -0,0 +1,1680 @@
+//===-- MSILWriter.cpp - Library for converting LLVM code to MSIL ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library converts LLVM code to MSIL code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSILWriter.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/Passes.h"
+
+namespace {
+  // TargetMachine for the MSIL backend
+ struct VISIBILITY_HIDDEN MSILTarget : public TargetMachine {
+ const TargetData DataLayout; // Calculates type size & alignment
+
+ MSILTarget(const Module &M, const std::string &FS)
+ : DataLayout(&M) {}
+
+ virtual bool WantsWholeFile() const { return true; }
+ virtual bool addPassesToEmitWholeFile(PassManager &PM, raw_ostream &Out,
+ CodeGenFileType FileType,
+ CodeGenOpt::Level OptLevel);
+
+ // This class always works, but shouldn't be the default in most cases.
+ static unsigned getModuleMatchQuality(const Module &M) { return 1; }
+
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ };
+}
+
+/// MSILTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int MSILTargetMachineModule;
+int MSILTargetMachineModule = 0;
+
+static RegisterTarget<MSILTarget> X("msil", "MSIL backend");
+
+bool MSILModule::runOnModule(Module &M) {
+ ModulePtr = &M;
+ TD = &getAnalysis<TargetData>();
+ bool Changed = false;
+ // Find named types.
+ TypeSymbolTable& Table = M.getTypeSymbolTable();
+ std::set<const Type *> Types = getAnalysis<FindUsedTypes>().getTypes();
+ for (TypeSymbolTable::iterator I = Table.begin(), E = Table.end(); I!=E; ) {
+ if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second))
+ Table.remove(I++);
+ else {
+ std::set<const Type *>::iterator T = Types.find(I->second);
+ if (T==Types.end())
+ Table.remove(I++);
+ else {
+ Types.erase(T);
+ ++I;
+ }
+ }
+ }
+ // Find unnamed types.
+ unsigned RenameCounter = 0;
+ for (std::set<const Type *>::const_iterator I = Types.begin(),
+ E = Types.end(); I!=E; ++I)
+ if (const StructType *STy = dyn_cast<StructType>(*I)) {
+ while (ModulePtr->addTypeName("unnamed$"+utostr(RenameCounter), STy))
+ ++RenameCounter;
+ Changed = true;
+ }
+  // Keep a pointer to the used types for the FunctionPass.
+ UsedTypes = &getAnalysis<FindUsedTypes>().getTypes();
+ return Changed;
+}
+
+char MSILModule::ID = 0;
+char MSILWriter::ID = 0;
+
+bool MSILWriter::runOnFunction(Function &F) {
+ if (F.isDeclaration()) return false;
+
+ // Do not codegen any 'available_externally' functions at all, they have
+ // definitions outside the translation unit.
+ if (F.hasAvailableExternallyLinkage())
+ return false;
+
+ LInfo = &getAnalysis<LoopInfo>();
+ printFunction(F);
+ return false;
+}
+
+
+bool MSILWriter::doInitialization(Module &M) {
+ ModulePtr = &M;
+ Mang = new Mangler(M);
+ Out << ".assembly extern mscorlib {}\n";
+ Out << ".assembly MSIL {}\n\n";
+ Out << "// External\n";
+ printExternals();
+ Out << "// Declarations\n";
+ printDeclarations(M.getTypeSymbolTable());
+ Out << "// Definitions\n";
+ printGlobalVariables();
+ Out << "// Startup code\n";
+ printModuleStartup();
+ return false;
+}
+
+
+bool MSILWriter::doFinalization(Module &M) {
+ delete Mang;
+ return false;
+}
+
+
+void MSILWriter::printModuleStartup() {
+ Out <<
+ ".method static public int32 $MSIL_Startup() {\n"
+ "\t.entrypoint\n"
+ "\t.locals (native int i)\n"
+ "\t.locals (native int argc)\n"
+ "\t.locals (native int ptr)\n"
+ "\t.locals (void* argv)\n"
+ "\t.locals (string[] args)\n"
+ "\tcall\tstring[] [mscorlib]System.Environment::GetCommandLineArgs()\n"
+ "\tdup\n"
+ "\tstloc\targs\n"
+ "\tldlen\n"
+ "\tconv.i4\n"
+ "\tdup\n"
+ "\tstloc\targc\n";
+ printPtrLoad(TD->getPointerSize());
+ Out <<
+ "\tmul\n"
+ "\tlocalloc\n"
+ "\tstloc\targv\n"
+ "\tldc.i4.0\n"
+ "\tstloc\ti\n"
+ "L_01:\n"
+ "\tldloc\ti\n"
+ "\tldloc\targc\n"
+ "\tceq\n"
+ "\tbrtrue\tL_02\n"
+ "\tldloc\targs\n"
+ "\tldloc\ti\n"
+ "\tldelem.ref\n"
+ "\tcall\tnative int [mscorlib]System.Runtime.InteropServices.Marshal::"
+ "StringToHGlobalAnsi(string)\n"
+ "\tstloc\tptr\n"
+ "\tldloc\targv\n"
+ "\tldloc\ti\n";
+ printPtrLoad(TD->getPointerSize());
+ Out <<
+ "\tmul\n"
+ "\tadd\n"
+ "\tldloc\tptr\n"
+ "\tstind.i\n"
+ "\tldloc\ti\n"
+ "\tldc.i4.1\n"
+ "\tadd\n"
+ "\tstloc\ti\n"
+ "\tbr\tL_01\n"
+ "L_02:\n"
+ "\tcall void $MSIL_Init()\n";
+
+ // Call user 'main' function.
+ const Function* F = ModulePtr->getFunction("main");
+ if (!F || F->isDeclaration()) {
+ Out << "\tldc.i4.0\n\tret\n}\n";
+ return;
+ }
+ bool BadSig = true;
+ std::string Args("");
+ Function::const_arg_iterator Arg1,Arg2;
+
+ switch (F->arg_size()) {
+ case 0:
+ BadSig = false;
+ break;
+ case 1:
+ Arg1 = F->arg_begin();
+ if (Arg1->getType()->isInteger()) {
+ Out << "\tldloc\targc\n";
+ Args = getTypeName(Arg1->getType());
+ BadSig = false;
+ }
+ break;
+ case 2:
+ Arg1 = Arg2 = F->arg_begin(); ++Arg2;
+ if (Arg1->getType()->isInteger() &&
+ Arg2->getType()->getTypeID() == Type::PointerTyID) {
+ Out << "\tldloc\targc\n\tldloc\targv\n";
+ Args = getTypeName(Arg1->getType())+","+getTypeName(Arg2->getType());
+ BadSig = false;
+ }
+ break;
+ default:
+ BadSig = true;
+ }
+
+ bool RetVoid = (F->getReturnType()->getTypeID() == Type::VoidTyID);
+ if (BadSig || (!F->getReturnType()->isInteger() && !RetVoid)) {
+ Out << "\tldc.i4.0\n";
+ } else {
+ Out << "\tcall\t" << getTypeName(F->getReturnType()) <<
+ getConvModopt(F->getCallingConv()) << "main(" << Args << ")\n";
+ if (RetVoid)
+ Out << "\tldc.i4.0\n";
+ else
+ Out << "\tconv.i4\n";
+ }
+ Out << "\tret\n}\n";
+}
+
+bool MSILWriter::isZeroValue(const Value* V) {
+ if (const Constant *C = dyn_cast<Constant>(V))
+ return C->isNullValue();
+ return false;
+}
+
+
+std::string MSILWriter::getValueName(const Value* V) {
+  // Quoting the name allows control and space characters in it.
+ return "'"+Mang->getValueName(V)+"'";
+}
+
+
+std::string MSILWriter::getLabelName(const std::string& Name) {
+ if (Name.find('.')!=std::string::npos) {
+ std::string Tmp(Name);
+    // Replace unacceptable characters in the label name.
+ for (std::string::iterator I = Tmp.begin(), E = Tmp.end(); I!=E; ++I)
+ if (*I=='.') *I = '@';
+ return Tmp;
+ }
+ return Name;
+}
+
+
+std::string MSILWriter::getLabelName(const Value* V) {
+ return getLabelName(Mang->getValueName(V));
+}
+
+
+std::string MSILWriter::getConvModopt(unsigned CallingConvID) {
+ switch (CallingConvID) {
+ case CallingConv::C:
+ case CallingConv::Cold:
+ case CallingConv::Fast:
+ return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) ";
+ case CallingConv::X86_FastCall:
+ return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvFastcall) ";
+ case CallingConv::X86_StdCall:
+ return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvStdcall) ";
+ default:
+ cerr << "CallingConvID = " << CallingConvID << '\n';
+ assert(0 && "Unsupported calling convention");
+ }
+ return ""; // Not reached
+}
+
+
+std::string MSILWriter::getArrayTypeName(Type::TypeID TyID, const Type* Ty) {
+ std::string Tmp = "";
+ const Type* ElemTy = Ty;
+ assert(Ty->getTypeID()==TyID && "Invalid type passed");
+  // Walk through the array element types.
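+  // (e.g. the LLVM type [2 x [3 x i32]] should come out roughly as
+  //  "int32[2,3]": both dimensions are collected before the element type)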
+ for (;;) {
+ // Multidimensional array.
+ if (ElemTy->getTypeID()==TyID) {
+ if (const ArrayType* ATy = dyn_cast<ArrayType>(ElemTy))
+ Tmp += utostr(ATy->getNumElements());
+ else if (const VectorType* VTy = dyn_cast<VectorType>(ElemTy))
+ Tmp += utostr(VTy->getNumElements());
+ ElemTy = cast<SequentialType>(ElemTy)->getElementType();
+ }
+ // Base element type found.
+ if (ElemTy->getTypeID()!=TyID) break;
+ Tmp += ",";
+ }
+ return getTypeName(ElemTy, false, true)+"["+Tmp+"]";
+}
+
+
+std::string MSILWriter::getPrimitiveTypeName(const Type* Ty, bool isSigned) {
+ unsigned NumBits = 0;
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID:
+ return "void ";
+ case Type::IntegerTyID:
+ NumBits = getBitWidth(Ty);
+ if(NumBits==1)
+ return "bool ";
+ if (!isSigned)
+ return "unsigned int"+utostr(NumBits)+" ";
+ return "int"+utostr(NumBits)+" ";
+ case Type::FloatTyID:
+ return "float32 ";
+ case Type::DoubleTyID:
+ return "float64 ";
+ default:
+ cerr << "Type = " << *Ty << '\n';
+ assert(0 && "Invalid primitive type");
+ }
+ return ""; // Not reached
+}
+
+
+std::string MSILWriter::getTypeName(const Type* Ty, bool isSigned,
+ bool isNested) {
+ if (Ty->isPrimitiveType() || Ty->isInteger())
+ return getPrimitiveTypeName(Ty,isSigned);
+ // FIXME: "OpaqueType" support
+ switch (Ty->getTypeID()) {
+ case Type::PointerTyID:
+ return "void* ";
+ case Type::StructTyID:
+ if (isNested)
+ return ModulePtr->getTypeName(Ty);
+ return "valuetype '"+ModulePtr->getTypeName(Ty)+"' ";
+ case Type::ArrayTyID:
+ if (isNested)
+ return getArrayTypeName(Ty->getTypeID(),Ty);
+ return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' ";
+ case Type::VectorTyID:
+ if (isNested)
+ return getArrayTypeName(Ty->getTypeID(),Ty);
+ return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' ";
+ default:
+ cerr << "Type = " << *Ty << '\n';
+ assert(0 && "Invalid type in getTypeName()");
+ }
+ return ""; // Not reached
+}
+
+
+MSILWriter::ValueType MSILWriter::getValueLocation(const Value* V) {
+ // Function argument
+ if (isa<Argument>(V))
+ return ArgumentVT;
+ // Function
+ else if (const Function* F = dyn_cast<Function>(V))
+ return F->hasLocalLinkage() ? InternalVT : GlobalVT;
+ // Variable
+ else if (const GlobalVariable* G = dyn_cast<GlobalVariable>(V))
+ return G->hasLocalLinkage() ? InternalVT : GlobalVT;
+ // Constant
+ else if (isa<Constant>(V))
+ return isa<ConstantExpr>(V) ? ConstExprVT : ConstVT;
+ // Local variable
+ return LocalVT;
+}
+
+
+std::string MSILWriter::getTypePostfix(const Type* Ty, bool Expand,
+ bool isSigned) {
+ unsigned NumBits = 0;
+ switch (Ty->getTypeID()) {
+ // Integer constant, expanding for stack operations.
+ case Type::IntegerTyID:
+ NumBits = getBitWidth(Ty);
+ // Expand integer value to "int32" or "int64".
+ if (Expand) return (NumBits<=32 ? "i4" : "i8");
+ if (NumBits==1) return "i1";
+ return (isSigned ? "i" : "u")+utostr(NumBits/8);
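+    // (e.g. i32 yields "i4" when signed, "u4" when unsigned, and "i4" when
+    //  expanded for the stack)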
+ // Float constant.
+ case Type::FloatTyID:
+ return "r4";
+ case Type::DoubleTyID:
+ return "r8";
+ case Type::PointerTyID:
+ return "i"+utostr(TD->getTypeAllocSize(Ty));
+ default:
+ cerr << "TypeID = " << Ty->getTypeID() << '\n';
+ assert(0 && "Invalid type in TypeToPostfix()");
+ }
+ return ""; // Not reached
+}
+
+
+void MSILWriter::printConvToPtr() {
+ switch (ModulePtr->getPointerSize()) {
+ case Module::Pointer32:
+ printSimpleInstruction("conv.u4");
+ break;
+ case Module::Pointer64:
+ printSimpleInstruction("conv.u8");
+ break;
+ default:
+ assert(0 && "Module use not supporting pointer size");
+ }
+}
+
+
+void MSILWriter::printPtrLoad(uint64_t N) {
+ switch (ModulePtr->getPointerSize()) {
+ case Module::Pointer32:
+ printSimpleInstruction("ldc.i4",utostr(N).c_str());
+ // FIXME: Need overflow test?
+ if (!isUInt32(N)) {
+ cerr << "Value = " << utostr(N) << '\n';
+ assert(0 && "32-bit pointer overflowed");
+ }
+ break;
+ case Module::Pointer64:
+ printSimpleInstruction("ldc.i8",utostr(N).c_str());
+ break;
+ default:
+ assert(0 && "Module use not supporting pointer size");
+ }
+}
+
+
+void MSILWriter::printValuePtrLoad(const Value* V) {
+ printValueLoad(V);
+ printConvToPtr();
+}
+
+
+void MSILWriter::printConstLoad(const Constant* C) {
+ if (const ConstantInt* CInt = dyn_cast<ConstantInt>(C)) {
+ // Integer constant
+ Out << "\tldc." << getTypePostfix(C->getType(),true) << '\t';
+ if (CInt->isMinValue(true))
+ Out << CInt->getSExtValue();
+ else
+ Out << CInt->getZExtValue();
+ } else if (const ConstantFP* FP = dyn_cast<ConstantFP>(C)) {
+ // Float constant
+ uint64_t X;
+ unsigned Size;
+ if (FP->getType()->getTypeID()==Type::FloatTyID) {
+ X = (uint32_t)FP->getValueAPF().bitcastToAPInt().getZExtValue();
+ Size = 4;
+ } else {
+ X = FP->getValueAPF().bitcastToAPInt().getZExtValue();
+ Size = 8;
+ }
+ Out << "\tldc.r" << Size << "\t( " << utohexstr(X) << ')';
+ } else if (isa<UndefValue>(C)) {
+ // Undefined constant value = NULL.
+ printPtrLoad(0);
+ } else {
+ cerr << "Constant = " << *C << '\n';
+ assert(0 && "Invalid constant value");
+ }
+ Out << '\n';
+}
+
+
+void MSILWriter::printValueLoad(const Value* V) {
+ MSILWriter::ValueType Location = getValueLocation(V);
+ switch (Location) {
+ // Global variable or function address.
+ case GlobalVT:
+ case InternalVT:
+ if (const Function* F = dyn_cast<Function>(V)) {
+ std::string Name = getConvModopt(F->getCallingConv())+getValueName(F);
+ printSimpleInstruction("ldftn",
+ getCallSignature(F->getFunctionType(),NULL,Name).c_str());
+ } else {
+ std::string Tmp;
+ const Type* ElemTy = cast<PointerType>(V->getType())->getElementType();
+ if (Location==GlobalVT && cast<GlobalVariable>(V)->hasDLLImportLinkage()) {
+ Tmp = "void* "+getValueName(V);
+ printSimpleInstruction("ldsfld",Tmp.c_str());
+ } else {
+ Tmp = getTypeName(ElemTy)+getValueName(V);
+ printSimpleInstruction("ldsflda",Tmp.c_str());
+ }
+ }
+ break;
+ // Function argument.
+ case ArgumentVT:
+ printSimpleInstruction("ldarg",getValueName(V).c_str());
+ break;
+ // Local function variable.
+ case LocalVT:
+ printSimpleInstruction("ldloc",getValueName(V).c_str());
+ break;
+ // Constant value.
+ case ConstVT:
+ if (isa<ConstantPointerNull>(V))
+ printPtrLoad(0);
+ else
+ printConstLoad(cast<Constant>(V));
+ break;
+ // Constant expression.
+ case ConstExprVT:
+ printConstantExpr(cast<ConstantExpr>(V));
+ break;
+ default:
+ cerr << "Value = " << *V << '\n';
+ assert(0 && "Invalid value location");
+ }
+}
+
+
+void MSILWriter::printValueSave(const Value* V) {
+ switch (getValueLocation(V)) {
+ case ArgumentVT:
+ printSimpleInstruction("starg",getValueName(V).c_str());
+ break;
+ case LocalVT:
+ printSimpleInstruction("stloc",getValueName(V).c_str());
+ break;
+ default:
+ cerr << "Value = " << *V << '\n';
+ assert(0 && "Invalid value location");
+ }
+}
+
+
+void MSILWriter::printBinaryInstruction(const char* Name, const Value* Left,
+ const Value* Right) {
+ printValueLoad(Left);
+ printValueLoad(Right);
+ Out << '\t' << Name << '\n';
+}
+
+
+void MSILWriter::printSimpleInstruction(const char* Inst, const char* Operand) {
+ if(Operand)
+ Out << '\t' << Inst << '\t' << Operand << '\n';
+ else
+ Out << '\t' << Inst << '\n';
+}
+
+
+void MSILWriter::printPHICopy(const BasicBlock* Src, const BasicBlock* Dst) {
+ for (BasicBlock::const_iterator I = Dst->begin(), E = Dst->end();
+ isa<PHINode>(I); ++I) {
+ const PHINode* Phi = cast<PHINode>(I);
+ const Value* Val = Phi->getIncomingValueForBlock(Src);
+ if (isa<UndefValue>(Val)) continue;
+ printValueLoad(Val);
+ printValueSave(Phi);
+ }
+}
+
+
+void MSILWriter::printBranchToBlock(const BasicBlock* CurrBB,
+ const BasicBlock* TrueBB,
+ const BasicBlock* FalseBB) {
+ if (TrueBB==FalseBB) {
+ // "TrueBB" and "FalseBB" destination equals
+ printPHICopy(CurrBB,TrueBB);
+ printSimpleInstruction("pop");
+ printSimpleInstruction("br",getLabelName(TrueBB).c_str());
+ } else if (FalseBB==NULL) {
+ // If "FalseBB" not used the jump have condition
+ printPHICopy(CurrBB,TrueBB);
+ printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str());
+ } else if (TrueBB==NULL) {
+ // If "TrueBB" not used the jump is unconditional
+ printPHICopy(CurrBB,FalseBB);
+ printSimpleInstruction("br",getLabelName(FalseBB).c_str());
+ } else {
+ // Copy PHI instructions for each block
+ std::string TmpLabel;
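+    // When a successor starts with PHI nodes, we cannot branch straight to
+    // it: the PHI copies must happen on this edge only, so we branch to a
+    // fresh local label, do the copies there, and only then jump on.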
+ // Print PHI instructions for "TrueBB"
+ if (isa<PHINode>(TrueBB->begin())) {
+ TmpLabel = getLabelName(TrueBB)+"$phi_"+utostr(getUniqID());
+ printSimpleInstruction("brtrue",TmpLabel.c_str());
+ } else {
+ printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str());
+ }
+ // Print PHI instructions for "FalseBB"
+ if (isa<PHINode>(FalseBB->begin())) {
+ printPHICopy(CurrBB,FalseBB);
+ printSimpleInstruction("br",getLabelName(FalseBB).c_str());
+ } else {
+ printSimpleInstruction("br",getLabelName(FalseBB).c_str());
+ }
+ if (isa<PHINode>(TrueBB->begin())) {
+ // Handle "TrueBB" PHI Copy
+ Out << TmpLabel << ":\n";
+ printPHICopy(CurrBB,TrueBB);
+ printSimpleInstruction("br",getLabelName(TrueBB).c_str());
+ }
+ }
+}
+
+
+void MSILWriter::printBranchInstruction(const BranchInst* Inst) {
+ if (Inst->isUnconditional()) {
+ printBranchToBlock(Inst->getParent(),NULL,Inst->getSuccessor(0));
+ } else {
+ printValueLoad(Inst->getCondition());
+ printBranchToBlock(Inst->getParent(),Inst->getSuccessor(0),
+ Inst->getSuccessor(1));
+ }
+}
+
+
+void MSILWriter::printSelectInstruction(const Value* Cond, const Value* VTrue,
+ const Value* VFalse) {
+ std::string TmpLabel = std::string("select$true_")+utostr(getUniqID());
+ printValueLoad(VTrue);
+ printValueLoad(Cond);
+ printSimpleInstruction("brtrue",TmpLabel.c_str());
+ printSimpleInstruction("pop");
+ printValueLoad(VFalse);
+ Out << TmpLabel << ":\n";
+}
+
+
+void MSILWriter::printIndirectLoad(const Value* V) {
+ const Type* Ty = V->getType();
+ printValueLoad(V);
+ if (const PointerType* P = dyn_cast<PointerType>(Ty))
+ Ty = P->getElementType();
+ std::string Tmp = "ldind."+getTypePostfix(Ty, false);
+ printSimpleInstruction(Tmp.c_str());
+}
+
+
+void MSILWriter::printIndirectSave(const Value* Ptr, const Value* Val) {
+ printValueLoad(Ptr);
+ printValueLoad(Val);
+ printIndirectSave(Val->getType());
+}
+
+
+void MSILWriter::printIndirectSave(const Type* Ty) {
+  // The store instruction needs a signed postfix for every type.
+ std::string postfix = getTypePostfix(Ty, false);
+ if (*postfix.begin()=='u') *postfix.begin() = 'i';
+ postfix = "stind."+postfix;
+ printSimpleInstruction(postfix.c_str());
+}
+
+
+void MSILWriter::printCastInstruction(unsigned int Op, const Value* V,
+ const Type* Ty) {
+ std::string Tmp("");
+ printValueLoad(V);
+ switch (Op) {
+ // Signed
+ case Instruction::SExt:
+ case Instruction::SIToFP:
+ case Instruction::FPToSI:
+ Tmp = "conv."+getTypePostfix(Ty,false,true);
+ printSimpleInstruction(Tmp.c_str());
+ break;
+ // Unsigned
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::FPToUI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ Tmp = "conv."+getTypePostfix(Ty,false);
+ printSimpleInstruction(Tmp.c_str());
+ break;
+ // Do nothing
+ case Instruction::BitCast:
+    // FIXME: meaning that ld*/st* instructions do not change the data format.
+ break;
+ default:
+ cerr << "Opcode = " << Op << '\n';
+ assert(0 && "Invalid conversion instruction");
+ }
+}
+
+
+void MSILWriter::printGepInstruction(const Value* V, gep_type_iterator I,
+ gep_type_iterator E) {
+ unsigned Size;
+ // Load address
+ printValuePtrLoad(V);
+ // Calculate element offset.
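+  // A struct index adds a constant byte offset (the sum of the sizes of all
+  // preceding fields); a sequential index adds index*elementsize, with fast
+  // paths below for constant and negative indices.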
+ for (; I!=E; ++I){
+ Size = 0;
+ const Value* IndexValue = I.getOperand();
+ if (const StructType* StrucTy = dyn_cast<StructType>(*I)) {
+ uint64_t FieldIndex = cast<ConstantInt>(IndexValue)->getZExtValue();
+ // Offset is the sum of all previous structure fields.
+ for (uint64_t F = 0; F<FieldIndex; ++F)
+ Size += TD->getTypeAllocSize(StrucTy->getContainedType((unsigned)F));
+ printPtrLoad(Size);
+ printSimpleInstruction("add");
+ continue;
+ } else if (const SequentialType* SeqTy = dyn_cast<SequentialType>(*I)) {
+ Size = TD->getTypeAllocSize(SeqTy->getElementType());
+ } else {
+ Size = TD->getTypeAllocSize(*I);
+ }
+ // Add offset of current element to stack top.
+ if (!isZeroValue(IndexValue)) {
+ // Constant optimization.
+ if (const ConstantInt* C = dyn_cast<ConstantInt>(IndexValue)) {
+ if (C->getValue().isNegative()) {
+ printPtrLoad(C->getValue().abs().getZExtValue()*Size);
+ printSimpleInstruction("sub");
+ continue;
+ } else
+ printPtrLoad(C->getZExtValue()*Size);
+ } else {
+ printPtrLoad(Size);
+ printValuePtrLoad(IndexValue);
+ printSimpleInstruction("mul");
+ }
+ printSimpleInstruction("add");
+ }
+ }
+}
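+// The address is built on the evaluation stack: a struct index adds the sizes
+// of all preceding fields, while an array/pointer index is scaled by the
+// element size, with a constant-folded fast path for ConstantInt indices.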
+
+
+std::string MSILWriter::getCallSignature(const FunctionType* Ty,
+ const Instruction* Inst,
+ std::string Name) {
+ std::string Tmp("");
+ if (Ty->isVarArg()) Tmp += "vararg ";
+ // Name and return type.
+ Tmp += getTypeName(Ty->getReturnType())+Name+"(";
+ // Function argument type list.
+ unsigned NumParams = Ty->getNumParams();
+ for (unsigned I = 0; I!=NumParams; ++I) {
+ if (I!=0) Tmp += ",";
+ Tmp += getTypeName(Ty->getParamType(I));
+ }
+ // CLR needs to know the exact number of parameters a vararg function
+ // receives, because the caller cleans up the stack.
+ if (Ty->isVarArg() && Inst) {
+ // Offset of the first call argument in a "CallInst" or "InvokeInst".
+ unsigned Org = isa<InvokeInst>(Inst) ? 3 : 1;
+ // Print variable argument types.
+ unsigned NumOperands = Inst->getNumOperands()-Org;
+ if (NumParams<NumOperands) {
+ if (NumParams!=0) Tmp += ", ";
+ Tmp += "... , ";
+ for (unsigned J = NumParams; J!=NumOperands; ++J) {
+ if (J!=NumParams) Tmp += ", ";
+ Tmp += getTypeName(Inst->getOperand(J+Org)->getType());
+ }
+ }
+ }
+ return Tmp+")";
+}
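+// Illustrative result, assuming a vararg callee named "printf" that returns
+// i32, takes one fixed i8* parameter, and gets one extra i32 at the call site:
+//   vararg int32 printf(int8*, ... , int32)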
+
+
+void MSILWriter::printFunctionCall(const Value* FnVal,
+ const Instruction* Inst) {
+ // Get function calling convention.
+ std::string Name = "";
+ if (const CallInst* Call = dyn_cast<CallInst>(Inst))
+ Name = getConvModopt(Call->getCallingConv());
+ else if (const InvokeInst* Invoke = dyn_cast<InvokeInst>(Inst))
+ Name = getConvModopt(Invoke->getCallingConv());
+ else {
+ cerr << "Instruction = " << Inst->getName() << '\n';
+ assert(0 && "Need \"Invoke\" or \"Call\" instruction only");
+ }
+ if (const Function* F = dyn_cast<Function>(FnVal)) {
+ // Direct call.
+ Name += getValueName(F);
+ printSimpleInstruction("call",
+ getCallSignature(F->getFunctionType(),Inst,Name).c_str());
+ } else {
+ // Indirect function call.
+ const PointerType* PTy = cast<PointerType>(FnVal->getType());
+ const FunctionType* FTy = cast<FunctionType>(PTy->getElementType());
+ // Load function address.
+ printValueLoad(FnVal);
+ printSimpleInstruction("calli",getCallSignature(FTy,Inst,Name).c_str());
+ }
+}
+
+
+void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) {
+ std::string Name;
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::vastart:
+ Name = getValueName(Inst->getOperand(1));
+ Name.insert(Name.length()-1,"$valist");
+ // Obtain the argument handle.
+ printSimpleInstruction("ldloca",Name.c_str());
+ printSimpleInstruction("arglist");
+ printSimpleInstruction("call",
+ "instance void [mscorlib]System.ArgIterator::.ctor"
+ "(valuetype [mscorlib]System.RuntimeArgumentHandle)");
+ // Save as pointer type "void*"
+ printValueLoad(Inst->getOperand(1));
+ printSimpleInstruction("ldloca",Name.c_str());
+ printIndirectSave(PointerType::getUnqual(IntegerType::get(8)));
+ break;
+ case Intrinsic::vaend:
+ // Close argument list handle.
+ printIndirectLoad(Inst->getOperand(1));
+ printSimpleInstruction("call","instance void [mscorlib]System.ArgIterator::End()");
+ break;
+ case Intrinsic::vacopy:
+ // Copy "ArgIterator" valuetype.
+ printIndirectLoad(Inst->getOperand(1));
+ printIndirectLoad(Inst->getOperand(2));
+ printSimpleInstruction("cpobj","[mscorlib]System.ArgIterator");
+ break;
+ default:
+ cerr << "Intrinsic ID = " << Inst->getIntrinsicID() << '\n';
+ assert(0 && "Invalid intrinsic function");
+ }
+}
+
+
+void MSILWriter::printCallInstruction(const Instruction* Inst) {
+ if (isa<IntrinsicInst>(Inst)) {
+ // Handle intrinsic function.
+ printIntrinsicCall(cast<IntrinsicInst>(Inst));
+ } else {
+ // Load the arguments onto the stack and call the function.
+ for (unsigned I = 1, E = Inst->getNumOperands(); I!=E; ++I)
+ printValueLoad(Inst->getOperand(I));
+ printFunctionCall(Inst->getOperand(0),Inst);
+ }
+}
+
+
+void MSILWriter::printICmpInstruction(unsigned Predicate, const Value* Left,
+ const Value* Right) {
+ switch (Predicate) {
+ case ICmpInst::ICMP_EQ:
+ printBinaryInstruction("ceq",Left,Right);
+ break;
+ case ICmpInst::ICMP_NE:
+ // Emulate = not neg (Op1 eq Op2)
+ printBinaryInstruction("ceq",Left,Right);
+ printSimpleInstruction("neg");
+ printSimpleInstruction("not");
+ break;
+ case ICmpInst::ICMP_ULE:
+ case ICmpInst::ICMP_SLE:
+ // Emulate = (Op1 eq Op2) or (Op1 lt Op2)
+ printBinaryInstruction("ceq",Left,Right);
+ if (Predicate==ICmpInst::ICMP_ULE)
+ printBinaryInstruction("clt.un",Left,Right);
+ else
+ printBinaryInstruction("clt",Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_SGE:
+ // Emulate = (Op1 eq Op2) or (Op1 gt Op2)
+ printBinaryInstruction("ceq",Left,Right);
+ if (Predicate==ICmpInst::ICMP_UGE)
+ printBinaryInstruction("cgt.un",Left,Right);
+ else
+ printBinaryInstruction("cgt",Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case ICmpInst::ICMP_ULT:
+ printBinaryInstruction("clt.un",Left,Right);
+ break;
+ case ICmpInst::ICMP_SLT:
+ printBinaryInstruction("clt",Left,Right);
+ break;
+ case ICmpInst::ICMP_UGT:
+ printBinaryInstruction("cgt.un",Left,Right);
+ break;
+ case ICmpInst::ICMP_SGT:
+ printBinaryInstruction("cgt",Left,Right);
+ break;
+ default:
+ cerr << "Predicate = " << Predicate << '\n';
+ assert(0 && "Invalid icmp predicate");
+ }
+}
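+// For example, ICMP_ULE lowers to "ceq; clt.un; or": true when the operands
+// are equal or the left one is below the right one in the unsigned order.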
+
+
+void MSILWriter::printFCmpInstruction(unsigned Predicate, const Value* Left,
+ const Value* Right) {
+ // FIXME: Make the NaN handling exact, e.g. by calling
+ // "bool [mscorlib]System.Double::IsNaN(float64)".
+ switch (Predicate) {
+ case FCmpInst::FCMP_UGT:
+ // X > Y || llvm_fcmp_uno(X, Y)
+ printBinaryInstruction("cgt",Left,Right);
+ printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case FCmpInst::FCMP_OGT:
+ // X > Y
+ printBinaryInstruction("cgt",Left,Right);
+ break;
+ case FCmpInst::FCMP_UGE:
+ // X >= Y || llvm_fcmp_uno(X, Y)
+ printBinaryInstruction("ceq",Left,Right);
+ printBinaryInstruction("cgt",Left,Right);
+ printSimpleInstruction("or");
+ printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case FCmpInst::FCMP_OGE:
+ // X >= Y
+ printBinaryInstruction("ceq",Left,Right);
+ printBinaryInstruction("cgt",Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case FCmpInst::FCMP_ULT:
+ // X < Y || llvm_fcmp_uno(X, Y)
+ printBinaryInstruction("clt",Left,Right);
+ printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case FCmpInst::FCMP_OLT:
+ // X < Y
+ printBinaryInstruction("clt",Left,Right);
+ break;
+ case FCmpInst::FCMP_ULE:
+ // X <= Y || llvm_fcmp_uno(X, Y)
+ printBinaryInstruction("ceq",Left,Right);
+ printBinaryInstruction("clt",Left,Right);
+ printSimpleInstruction("or");
+ printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case FCmpInst::FCMP_OLE:
+ // X <= Y
+ printBinaryInstruction("ceq",Left,Right);
+ printBinaryInstruction("clt",Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case FCmpInst::FCMP_UEQ:
+ // X == Y || llvm_fcmp_uno(X, Y)
+ printBinaryInstruction("ceq",Left,Right);
+ printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+ printSimpleInstruction("or");
+ break;
+ case FCmpInst::FCMP_OEQ:
+ // X == Y
+ printBinaryInstruction("ceq",Left,Right);
+ break;
+ case FCmpInst::FCMP_UNE:
+ // X != Y
+ printBinaryInstruction("ceq",Left,Right);
+ printSimpleInstruction("neg");
+ printSimpleInstruction("not");
+ break;
+ case FCmpInst::FCMP_ONE:
+ // X != Y && llvm_fcmp_ord(X, Y)
+ printBinaryInstruction("ceq",Left,Right);
+ printSimpleInstruction("not");
+ break;
+ case FCmpInst::FCMP_ORD:
+ // return X == X && Y == Y
+ printBinaryInstruction("ceq",Left,Left);
+ printBinaryInstruction("ceq",Right,Right);
+ printSimpleInstruction("or");
+ break;
+ case FCmpInst::FCMP_UNO:
+ // X != X || Y != Y
+ printBinaryInstruction("ceq",Left,Left);
+ printSimpleInstruction("not");
+ printBinaryInstruction("ceq",Right,Right);
+ printSimpleInstruction("not");
+ printSimpleInstruction("or");
+ break;
+ default:
+ assert(0 && "Illegal FCmp predicate");
+ }
+}
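+// Note: most unordered predicates (UGT, UGE, ULT, ULE, UEQ) OR the ordered
+// test with the FCMP_UNO expansion so they also hold for NaN operands, while
+// UNE falls out of not(ceq) directly, since ceq is false whenever one side
+// is NaN.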
+
+
+void MSILWriter::printInvokeInstruction(const InvokeInst* Inst) {
+ std::string Label = "leave$normal_"+utostr(getUniqID());
+ Out << ".try {\n";
+ // Load arguments
+ for (unsigned I = 3, E = Inst->getNumOperands(); I!=E; ++I)
+ printValueLoad(Inst->getOperand(I));
+ // Print call instruction
+ printFunctionCall(Inst->getOperand(0),Inst);
+ // Save function result and leave "try" block
+ printValueSave(Inst);
+ printSimpleInstruction("leave",Label.c_str());
+ Out << "}\n";
+ Out << "catch [mscorlib]System.Exception {\n";
+ // Redirect to unwind block
+ printSimpleInstruction("pop");
+ printBranchToBlock(Inst->getParent(),NULL,Inst->getUnwindDest());
+ Out << "}\n" << Label << ":\n";
+ // Redirect to continue block
+ printBranchToBlock(Inst->getParent(),NULL,Inst->getNormalDest());
+}
+
+
+void MSILWriter::printSwitchInstruction(const SwitchInst* Inst) {
+ // FIXME: Emulate with IL "switch" instruction
+ // Emulate = if () else if () else if () else ...
+ for (unsigned int I = 1, E = Inst->getNumCases(); I!=E; ++I) {
+ printValueLoad(Inst->getCondition());
+ printValueLoad(Inst->getCaseValue(I));
+ printSimpleInstruction("ceq");
+ // Conditional jump to the successor block
+ printBranchToBlock(Inst->getParent(),Inst->getSuccessor(I),NULL);
+ }
+ // Jump to default block
+ printBranchToBlock(Inst->getParent(),NULL,Inst->getDefaultDest());
+}
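+// Illustrative expansion: each case loads the condition and its case value,
+// compares them with "ceq", and conditionally branches to the case's block;
+// control finally branches to the default block.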
+
+
+void MSILWriter::printVAArgInstruction(const VAArgInst* Inst) {
+ printIndirectLoad(Inst->getOperand(0));
+ printSimpleInstruction("call",
+ "instance typedref [mscorlib]System.ArgIterator::GetNextArg()");
+ printSimpleInstruction("refanyval","void*");
+ std::string Name =
+ "ldind."+getTypePostfix(PointerType::getUnqual(IntegerType::get(8)),false);
+ printSimpleInstruction(Name.c_str());
+}
+
+
+void MSILWriter::printAllocaInstruction(const AllocaInst* Inst) {
+ uint64_t Size = TD->getTypeAllocSize(Inst->getAllocatedType());
+ // Constant optimization.
+ if (const ConstantInt* CInt = dyn_cast<ConstantInt>(Inst->getOperand(0))) {
+ printPtrLoad(CInt->getZExtValue()*Size);
+ } else {
+ printPtrLoad(Size);
+ printValueLoad(Inst->getOperand(0));
+ printSimpleInstruction("mul");
+ }
+ printSimpleInstruction("localloc");
+}
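+// For example, "alloca i32, i32 %n" loads the element size (4), loads %n,
+// multiplies them, and emits "localloc" with the resulting byte count.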
+
+
+void MSILWriter::printInstruction(const Instruction* Inst) {
+ const Value *Left = 0, *Right = 0;
+ if (Inst->getNumOperands()>=1) Left = Inst->getOperand(0);
+ if (Inst->getNumOperands()>=2) Right = Inst->getOperand(1);
+ // Print instruction
+ // FIXME: "ShuffleVector","ExtractElement","InsertElement" support.
+ switch (Inst->getOpcode()) {
+ // Terminator
+ case Instruction::Ret:
+ if (Inst->getNumOperands())
+ printValueLoad(Left);
+ printSimpleInstruction("ret");
+ break;
+ case Instruction::Br:
+ printBranchInstruction(cast<BranchInst>(Inst));
+ break;
+ // Binary
+ case Instruction::Add:
+ printBinaryInstruction("add",Left,Right);
+ break;
+ case Instruction::Sub:
+ printBinaryInstruction("sub",Left,Right);
+ break;
+ case Instruction::Mul:
+ printBinaryInstruction("mul",Left,Right);
+ break;
+ case Instruction::UDiv:
+ printBinaryInstruction("div.un",Left,Right);
+ break;
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ printBinaryInstruction("div",Left,Right);
+ break;
+ case Instruction::URem:
+ printBinaryInstruction("rem.un",Left,Right);
+ break;
+ case Instruction::SRem:
+ case Instruction::FRem:
+ printBinaryInstruction("rem",Left,Right);
+ break;
+ // Binary Condition
+ case Instruction::ICmp:
+ printICmpInstruction(cast<ICmpInst>(Inst)->getPredicate(),Left,Right);
+ break;
+ case Instruction::FCmp:
+ printFCmpInstruction(cast<FCmpInst>(Inst)->getPredicate(),Left,Right);
+ break;
+ // Bitwise Binary
+ case Instruction::And:
+ printBinaryInstruction("and",Left,Right);
+ break;
+ case Instruction::Or:
+ printBinaryInstruction("or",Left,Right);
+ break;
+ case Instruction::Xor:
+ printBinaryInstruction("xor",Left,Right);
+ break;
+ case Instruction::Shl:
+ printValueLoad(Left);
+ printValueLoad(Right);
+ printSimpleInstruction("conv.i4");
+ printSimpleInstruction("shl");
+ break;
+ case Instruction::LShr:
+ printValueLoad(Left);
+ printValueLoad(Right);
+ printSimpleInstruction("conv.i4");
+ printSimpleInstruction("shr.un");
+ break;
+ case Instruction::AShr:
+ printValueLoad(Left);
+ printValueLoad(Right);
+ printSimpleInstruction("conv.i4");
+ printSimpleInstruction("shr");
+ break;
+ case Instruction::Select:
+ printSelectInstruction(Inst->getOperand(0),Inst->getOperand(1),Inst->getOperand(2));
+ break;
+ case Instruction::Load:
+ printIndirectLoad(Inst->getOperand(0));
+ break;
+ case Instruction::Store:
+ printIndirectSave(Inst->getOperand(1), Inst->getOperand(0));
+ break;
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ printCastInstruction(Inst->getOpcode(),Left,
+ cast<CastInst>(Inst)->getDestTy());
+ break;
+ case Instruction::GetElementPtr:
+ printGepInstruction(Inst->getOperand(0),gep_type_begin(Inst),
+ gep_type_end(Inst));
+ break;
+ case Instruction::Call:
+ printCallInstruction(cast<CallInst>(Inst));
+ break;
+ case Instruction::Invoke:
+ printInvokeInstruction(cast<InvokeInst>(Inst));
+ break;
+ case Instruction::Unwind:
+ printSimpleInstruction("newobj",
+ "instance void [mscorlib]System.Exception::.ctor()");
+ printSimpleInstruction("throw");
+ break;
+ case Instruction::Switch:
+ printSwitchInstruction(cast<SwitchInst>(Inst));
+ break;
+ case Instruction::Alloca:
+ printAllocaInstruction(cast<AllocaInst>(Inst));
+ break;
+ case Instruction::Malloc:
+ assert(0 && "LowerAllocationsPass used");
+ break;
+ case Instruction::Free:
+ assert(0 && "LowerAllocationsPass used");
+ break;
+ case Instruction::Unreachable:
+ printSimpleInstruction("ldstr", "\"Unreachable instruction\"");
+ printSimpleInstruction("newobj",
+ "instance void [mscorlib]System.Exception::.ctor(string)");
+ printSimpleInstruction("throw");
+ break;
+ case Instruction::VAArg:
+ printVAArgInstruction(cast<VAArgInst>(Inst));
+ break;
+ default:
+ cerr << "Instruction = " << Inst->getName() << '\n';
+ assert(0 && "Unsupported instruction");
+ }
+}
+
+
+void MSILWriter::printLoop(const Loop* L) {
+ Out << getLabelName(L->getHeader()->getName()) << ":\n";
+ const std::vector<BasicBlock*>& blocks = L->getBlocks();
+ for (unsigned I = 0, E = blocks.size(); I!=E; I++) {
+ BasicBlock* BB = blocks[I];
+ Loop* BBLoop = LInfo->getLoopFor(BB);
+ if (BBLoop == L)
+ printBasicBlock(BB);
+ else if (BB==BBLoop->getHeader() && BBLoop->getParentLoop()==L)
+ printLoop(BBLoop);
+ }
+ printSimpleInstruction("br",getLabelName(L->getHeader()->getName()).c_str());
+}
+
+
+void MSILWriter::printBasicBlock(const BasicBlock* BB) {
+ Out << getLabelName(BB) << ":\n";
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) {
+ const Instruction* Inst = I;
+ // Uncomment to emit the original LLVM instruction as a comment:
+ // Out << "\n//" << *Inst << "\n";
+ // PHI nodes are not printed here; their copies are emitted at branches.
+ if (Inst->getOpcode()==Instruction::PHI) continue;
+ // Print instruction
+ printInstruction(Inst);
+ // Save result
+ if (Inst->getType()!=Type::VoidTy) {
+ // Do not save the value after an invoke; that is done inside the "try" block.
+ if (Inst->getOpcode()==Instruction::Invoke) continue;
+ printValueSave(Inst);
+ }
+ }
+}
+
+
+void MSILWriter::printLocalVariables(const Function& F) {
+ std::string Name;
+ const Type* Ty = NULL;
+ std::set<const Value*> Printed;
+ const Value* VaList = NULL;
+ unsigned StackDepth = 8;
+ // Find local variables
+ for (const_inst_iterator I = inst_begin(&F), E = inst_end(&F); I!=E; ++I) {
+ if (I->getOpcode()==Instruction::Call ||
+ I->getOpcode()==Instruction::Invoke) {
+ // Test stack depth.
+ if (StackDepth<I->getNumOperands())
+ StackDepth = I->getNumOperands();
+ }
+ const AllocaInst* AI = dyn_cast<AllocaInst>(&*I);
+ if (AI && !isa<GlobalVariable>(AI)) {
+ // Local variable allocation.
+ Ty = PointerType::getUnqual(AI->getAllocatedType());
+ Name = getValueName(AI);
+ Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n";
+ } else if (I->getType()!=Type::VoidTy) {
+ // Operation result.
+ Ty = I->getType();
+ Name = getValueName(&*I);
+ Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n";
+ }
+ // Test for a 'va_list' variable.
+ bool isVaList = false;
+ if (const VAArgInst* VaInst = dyn_cast<VAArgInst>(&*I)) {
+ // "va_list" as "va_arg" instruction operand.
+ isVaList = true;
+ VaList = VaInst->getOperand(0);
+ } else if (const IntrinsicInst* Inst = dyn_cast<IntrinsicInst>(&*I)) {
+ // "va_list" as intrinsic function operand.
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::vastart:
+ case Intrinsic::vaend:
+ case Intrinsic::vacopy:
+ isVaList = true;
+ VaList = Inst->getOperand(1);
+ break;
+ default:
+ isVaList = false;
+ }
+ }
+ // Print "va_list" variable.
+ if (isVaList && Printed.insert(VaList).second) {
+ Name = getValueName(VaList);
+ Name.insert(Name.length()-1,"$valist");
+ Out << "\t.locals (valuetype [mscorlib]System.ArgIterator "
+ << Name << ")\n";
+ }
+ }
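+ // Emit a conservative .maxstack: twice the widest call operand count seen
+ // above (and at least 16), rather than the exact evaluation-stack depth.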
+ printSimpleInstruction(".maxstack",utostr(StackDepth*2).c_str());
+}
+
+
+void MSILWriter::printFunctionBody(const Function& F) {
+ // Print body
+ for (Function::const_iterator I = F.begin(), E = F.end(); I!=E; ++I) {
+ if (Loop *L = LInfo->getLoopFor(I)) {
+ if (L->getHeader()==I && L->getParentLoop()==0)
+ printLoop(L);
+ } else {
+ printBasicBlock(I);
+ }
+ }
+}
+
+
+void MSILWriter::printConstantExpr(const ConstantExpr* CE) {
+ const Value *left = 0, *right = 0;
+ if (CE->getNumOperands()>=1) left = CE->getOperand(0);
+ if (CE->getNumOperands()>=2) right = CE->getOperand(1);
+ // Print instruction
+ switch (CE->getOpcode()) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ printCastInstruction(CE->getOpcode(),left,CE->getType());
+ break;
+ case Instruction::GetElementPtr:
+ printGepInstruction(CE->getOperand(0),gep_type_begin(CE),gep_type_end(CE));
+ break;
+ case Instruction::ICmp:
+ printICmpInstruction(CE->getPredicate(),left,right);
+ break;
+ case Instruction::FCmp:
+ printFCmpInstruction(CE->getPredicate(),left,right);
+ break;
+ case Instruction::Select:
+ printSelectInstruction(CE->getOperand(0),CE->getOperand(1),CE->getOperand(2));
+ break;
+ case Instruction::Add:
+ printBinaryInstruction("add",left,right);
+ break;
+ case Instruction::Sub:
+ printBinaryInstruction("sub",left,right);
+ break;
+ case Instruction::Mul:
+ printBinaryInstruction("mul",left,right);
+ break;
+ case Instruction::UDiv:
+ printBinaryInstruction("div.un",left,right);
+ break;
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ printBinaryInstruction("div",left,right);
+ break;
+ case Instruction::URem:
+ printBinaryInstruction("rem.un",left,right);
+ break;
+ case Instruction::SRem:
+ case Instruction::FRem:
+ printBinaryInstruction("rem",left,right);
+ break;
+ case Instruction::And:
+ printBinaryInstruction("and",left,right);
+ break;
+ case Instruction::Or:
+ printBinaryInstruction("or",left,right);
+ break;
+ case Instruction::Xor:
+ printBinaryInstruction("xor",left,right);
+ break;
+ case Instruction::Shl:
+ printBinaryInstruction("shl",left,right);
+ break;
+ case Instruction::LShr:
+ printBinaryInstruction("shr.un",left,right);
+ break;
+ case Instruction::AShr:
+ printBinaryInstruction("shr",left,right);
+ break;
+ default:
+ cerr << "Expression = " << *CE << "\n";
+ assert(0 && "Invalid constant expression");
+ }
+}
+
+
+void MSILWriter::printStaticInitializerList() {
+ // Global variables with fields that need runtime initialization.
+ for (std::map<const GlobalVariable*,std::vector<StaticInitializer> >::iterator
+ VarI = StaticInitList.begin(), VarE = StaticInitList.end(); VarI!=VarE;
+ ++VarI) {
+ const std::vector<StaticInitializer>& InitList = VarI->second;
+ if (InitList.empty()) continue;
+ // For each such field.
+ for (std::vector<StaticInitializer>::const_iterator I = InitList.begin(),
+ E = InitList.end(); I!=E; ++I) {
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(I->constant)) {
+ // Out << "\n// Init " << getValueName(VarI->first) << ", offset " <<
+ // utostr(I->offset) << ", type "<< *I->constant->getType() << "\n\n";
+ // Load variable address
+ printValueLoad(VarI->first);
+ // Add offset
+ if (I->offset!=0) {
+ printPtrLoad(I->offset);
+ printSimpleInstruction("add");
+ }
+ // Load value
+ printConstantExpr(CE);
+ // Save result at offset
+ std::string postfix = getTypePostfix(CE->getType(),true);
+ if (*postfix.begin()=='u') *postfix.begin() = 'i';
+ postfix = "stind."+postfix;
+ printSimpleInstruction(postfix.c_str());
+ } else {
+ cerr << "Constant = " << *I->constant << '\n';
+ assert(0 && "Invalid static initializer");
+ }
+ }
+ }
+}
+
+
+void MSILWriter::printFunction(const Function& F) {
+ bool isSigned = F.paramHasAttr(0, Attribute::SExt);
+ Out << "\n.method static ";
+ Out << (F.hasLocalLinkage() ? "private " : "public ");
+ if (F.isVarArg()) Out << "vararg ";
+ Out << getTypeName(F.getReturnType(),isSigned) <<
+ getConvModopt(F.getCallingConv()) << getValueName(&F) << '\n';
+ // Arguments
+ Out << "\t(";
+ unsigned ArgIdx = 1;
+ for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I!=E;
+ ++I, ++ArgIdx) {
+ isSigned = F.paramHasAttr(ArgIdx, Attribute::SExt);
+ if (I!=F.arg_begin()) Out << ", ";
+ Out << getTypeName(I->getType(),isSigned) << getValueName(I);
+ }
+ Out << ") cil managed\n";
+ // Body
+ Out << "{\n";
+ printLocalVariables(F);
+ printFunctionBody(F);
+ Out << "}\n";
+}
+
+
+void MSILWriter::printDeclarations(const TypeSymbolTable& ST) {
+ std::string Name;
+ std::set<const Type*> Printed;
+ for (std::set<const Type*>::const_iterator
+ UI = UsedTypes->begin(), UE = UsedTypes->end(); UI!=UE; ++UI) {
+ const Type* Ty = *UI;
+ if (isa<ArrayType>(Ty) || isa<VectorType>(Ty) || isa<StructType>(Ty))
+ Name = getTypeName(Ty, false, true);
+ // Other types need no declaration.
+ else continue;
+ // Print each type only once.
+ if (Printed.insert(Ty).second) {
+ Out << ".class value explicit ansi sealed '" << Name << "'";
+ Out << " { .pack " << 1 << " .size " << TD->getTypeAllocSize(Ty);
+ Out << " }\n\n";
+ }
+ }
+}
+
+
+unsigned int MSILWriter::getBitWidth(const Type* Ty) {
+ unsigned int N = Ty->getPrimitiveSizeInBits();
+ assert(N!=0 && "Invalid type in getBitWidth()");
+ switch (N) {
+ case 1:
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ return N;
+ default:
+ cerr << "Bits = " << N << '\n';
+ assert(0 && "Unsupported integer width");
+ }
+ return 0; // Not reached
+}
+
+
+void MSILWriter::printStaticConstant(const Constant* C, uint64_t& Offset) {
+ uint64_t TySize = 0;
+ const Type* Ty = C->getType();
+ // Print a zero-initialized constant.
+ if (isa<ConstantAggregateZero>(C) || C->isNullValue()) {
+ TySize = TD->getTypeAllocSize(C->getType());
+ Offset += TySize;
+ Out << "int8 (0) [" << TySize << "]";
+ return;
+ }
+ // Print constant initializer
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID: {
+ TySize = TD->getTypeAllocSize(Ty);
+ const ConstantInt* Int = cast<ConstantInt>(C);
+ Out << getPrimitiveTypeName(Ty,true) << "(" << Int->getSExtValue() << ")";
+ break;
+ }
+ case Type::FloatTyID:
+ case Type::DoubleTyID: {
+ TySize = TD->getTypeAllocSize(Ty);
+ const ConstantFP* FP = cast<ConstantFP>(C);
+ if (Ty->getTypeID() == Type::FloatTyID)
+ Out << "int32 (" <<
+ (uint32_t)FP->getValueAPF().bitcastToAPInt().getZExtValue() << ')';
+ else
+ Out << "int64 (" <<
+ FP->getValueAPF().bitcastToAPInt().getZExtValue() << ')';
+ break;
+ }
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ case Type::StructTyID:
+ for (unsigned I = 0, E = C->getNumOperands(); I<E; I++) {
+ if (I!=0) Out << ",\n";
+ printStaticConstant(C->getOperand(I),Offset);
+ }
+ break;
+ case Type::PointerTyID:
+ TySize = TD->getTypeAllocSize(C->getType());
+ // Initialize with global variable address
+ if (const GlobalVariable *G = dyn_cast<GlobalVariable>(C)) {
+ std::string name = getValueName(G);
+ Out << "&(" << name.insert(name.length()-1,"$data") << ")";
+ } else {
+ // Dynamic initialization
+ if (!isa<ConstantPointerNull>(C) && !C->isNullValue())
+ InitListPtr->push_back(StaticInitializer(C,Offset));
+ // Null pointer initialization
+ if (TySize==4) Out << "int32 (0)";
+ else if (TySize==8) Out << "int64 (0)";
+ else assert(0 && "Invalid pointer size");
+ }
+ break;
+ default:
+ cerr << "TypeID = " << Ty->getTypeID() << '\n';
+ assert(0 && "Invalid type in printStaticConstant()");
+ }
+ // Increase offset.
+ Offset += TySize;
+}
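+// Floating-point initializers are emitted bit-exactly as same-width integers;
+// for example, a float 1.0 is printed as "int32 (1065353216)" (0x3F800000).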
+
+
+void MSILWriter::printStaticInitializer(const Constant* C,
+ const std::string& Name) {
+ switch (C->getType()->getTypeID()) {
+ case Type::IntegerTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ Out << getPrimitiveTypeName(C->getType(), false);
+ break;
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ case Type::StructTyID:
+ case Type::PointerTyID:
+ Out << getTypeName(C->getType());
+ break;
+ default:
+ cerr << "Type = " << *C << "\n";
+ assert(0 && "Invalid constant type");
+ }
+ // Print initializer
+ std::string label = Name;
+ label.insert(label.length()-1,"$data");
+ Out << Name << " at " << label << '\n';
+ Out << ".data " << label << " = {\n";
+ uint64_t offset = 0;
+ printStaticConstant(C,offset);
+ Out << "\n}\n\n";
+}
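+// Illustrative emitted pair, assuming a global named 'x' of type int32:
+//   int32 'x' at 'x$data'
+//   .data 'x$data' = {
+//   int32 (42)
+//   }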
+
+
+void MSILWriter::printVariableDefinition(const GlobalVariable* G) {
+ const Constant* C = G->getInitializer();
+ if (C->isNullValue() || isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
+ InitListPtr = 0;
+ else
+ InitListPtr = &StaticInitList[G];
+ printStaticInitializer(C,getValueName(G));
+}
+
+
+void MSILWriter::printGlobalVariables() {
+ if (ModulePtr->global_empty()) return;
+ Module::global_iterator I,E;
+ for (I = ModulePtr->global_begin(), E = ModulePtr->global_end(); I!=E; ++I) {
+ // Variable definition
+ Out << ".field static " << (I->isDeclaration() ? "public " :
+ "private ");
+ if (I->isDeclaration()) {
+ Out << getTypeName(I->getType()) << getValueName(&*I) << "\n\n";
+ } else
+ printVariableDefinition(&*I);
+ }
+}
+
+
+const char* MSILWriter::getLibraryName(const Function* F) {
+ return getLibraryForSymbol(F->getName().c_str(), true, F->getCallingConv());
+}
+
+
+const char* MSILWriter::getLibraryName(const GlobalVariable* GV) {
+ return getLibraryForSymbol(Mang->getValueName(GV).c_str(), false, 0);
+}
+
+
+const char* MSILWriter::getLibraryForSymbol(const char* Name, bool isFunction,
+ unsigned CallingConv) {
+ // TODO: Read a *.def file with function and library definitions.
+ return "MSVCRT.DLL";
+}
+
+
+void MSILWriter::printExternals() {
+ Module::const_iterator I,E;
+ // Functions.
+ for (I=ModulePtr->begin(),E=ModulePtr->end(); I!=E; ++I) {
+ // Skip intrinsics.
+ if (I->isIntrinsic()) continue;
+ if (I->isDeclaration()) {
+ const Function* F = I;
+ std::string Name = getConvModopt(F->getCallingConv())+getValueName(F);
+ std::string Sig =
+ getCallSignature(cast<FunctionType>(F->getFunctionType()), NULL, Name);
+ Out << ".method static hidebysig pinvokeimpl(\""
+ << getLibraryName(F) << "\")\n\t" << Sig << " preservesig {}\n\n";
+ }
+ }
+ // External variables and static initialization.
+ Out <<
+ ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)"
+ " native int LoadLibrary(string) preservesig {}\n"
+ ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)"
+ " native int GetProcAddress(native int, string) preservesig {}\n";
+ Out <<
+ ".method private static void* $MSIL_Import(string lib,string sym)\n"
+ " managed cil\n{\n"
+ "\tldarg\tlib\n"
+ "\tcall\tnative int LoadLibrary(string)\n"
+ "\tldarg\tsym\n"
+ "\tcall\tnative int GetProcAddress(native int,string)\n"
+ "\tdup\n"
+ "\tbrtrue\tL_01\n"
+ "\tldstr\t\"Can no import variable\"\n"
+ "\tnewobj\tinstance void [mscorlib]System.Exception::.ctor(string)\n"
+ "\tthrow\n"
+ "L_01:\n"
+ "\tret\n"
+ "}\n\n"
+ ".method static private void $MSIL_Init() managed cil\n{\n";
+ printStaticInitializerList();
+ // For each global variable.
+ for (Module::global_iterator I = ModulePtr->global_begin(),
+ E = ModulePtr->global_end(); I!=E; ++I) {
+ if (!I->isDeclaration() || !I->hasDLLImportLinkage()) continue;
+ // Use "LoadLibrary"/"GetProcAddress" to recive variable address.
+ std::string Label = "not_null$_"+utostr(getUniqID());
+ std::string Tmp = getTypeName(I->getType())+getValueName(&*I);
+ printSimpleInstruction("ldsflda",Tmp.c_str());
+ Out << "\tldstr\t\"" << getLibraryName(&*I) << "\"\n";
+ Out << "\tldstr\t\"" << Mang->getValueName(&*I) << "\"\n";
+ printSimpleInstruction("call","void* $MSIL_Import(string,string)");
+ printIndirectSave(I->getType());
+ }
+ printSimpleInstruction("ret");
+ Out << "}\n\n";
+}
+
+
+//===----------------------------------------------------------------------===//
+// External Interface declaration
+//===----------------------------------------------------------------------===//
+
+bool MSILTarget::addPassesToEmitWholeFile(PassManager &PM, raw_ostream &o,
+ CodeGenFileType FileType,
+ CodeGenOpt::Level OptLevel)
+{
+ if (FileType != TargetMachine::AssemblyFile) return true;
+ MSILWriter* Writer = new MSILWriter(o);
+ PM.add(createGCLoweringPass());
+ PM.add(createLowerAllocationsPass(true));
+ // FIXME: Handle switch through the native IL "switch" instruction.
+ PM.add(createLowerSwitchPass());
+ PM.add(createCFGSimplificationPass());
+ PM.add(new MSILModule(Writer->UsedTypes,Writer->TD));
+ PM.add(Writer);
+ PM.add(createGCInfoDeleter());
+ return false;
+}
diff --git a/lib/Target/MSIL/MSILWriter.h b/lib/Target/MSIL/MSILWriter.h
new file mode 100644
index 0000000..45f5579
--- /dev/null
+++ b/lib/Target/MSIL/MSILWriter.h
@@ -0,0 +1,255 @@
+//===-- MSILWriter.h - TargetMachine for the MSIL ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSILWriter pass used by the MSIL backend.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MSILWRITER_H
+#define MSILWRITER_H
+
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Support/Mangler.h"
+#include <ios>
+using namespace llvm;
+
+namespace {
+
+ class MSILModule : public ModulePass {
+ Module *ModulePtr;
+ const std::set<const Type *>*& UsedTypes;
+ const TargetData*& TD;
+
+ public:
+ static char ID;
+ MSILModule(const std::set<const Type *>*& _UsedTypes,
+ const TargetData*& _TD)
+ : ModulePass(&ID), UsedTypes(_UsedTypes), TD(_TD) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<FindUsedTypes>();
+ AU.addRequired<TargetData>();
+ }
+
+ virtual const char *getPassName() const {
+ return "MSIL backend definitions";
+ }
+
+ virtual bool runOnModule(Module &M);
+
+ };
+
+ class MSILWriter : public FunctionPass {
+ struct StaticInitializer {
+ const Constant* constant;
+ uint64_t offset;
+
+ StaticInitializer()
+ : constant(0), offset(0) {}
+
+ StaticInitializer(const Constant* _constant, uint64_t _offset)
+ : constant(_constant), offset(_offset) {}
+ };
+
+ uint64_t UniqID;
+
+ uint64_t getUniqID() {
+ return ++UniqID;
+ }
+
+ public:
+ raw_ostream &Out;
+ Module* ModulePtr;
+ const TargetData* TD;
+ Mangler* Mang;
+ LoopInfo *LInfo;
+ std::vector<StaticInitializer>* InitListPtr;
+ std::map<const GlobalVariable*,std::vector<StaticInitializer> >
+ StaticInitList;
+ const std::set<const Type *>* UsedTypes;
+ static char ID;
+ MSILWriter(raw_ostream &o) : FunctionPass(&ID), Out(o) {
+ UniqID = 0;
+ }
+
+ enum ValueType {
+ UndefVT,
+ GlobalVT,
+ InternalVT,
+ ArgumentVT,
+ LocalVT,
+ ConstVT,
+ ConstExprVT
+ };
+
+ bool isVariable(ValueType V) {
+ return V==GlobalVT || V==InternalVT || V==ArgumentVT || V==LocalVT;
+ }
+
+ bool isConstValue(ValueType V) {
+ return V==ConstVT || V==ConstExprVT;
+ }
+
+ virtual const char *getPassName() const { return "MSIL backend"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LoopInfo>();
+ AU.setPreservesAll();
+ }
+
+ bool runOnFunction(Function &F);
+
+ virtual bool doInitialization(Module &M);
+
+ virtual bool doFinalization(Module &M);
+
+ void printModuleStartup();
+
+ bool isZeroValue(const Value* V);
+
+ std::string getValueName(const Value* V);
+
+ std::string getLabelName(const Value* V);
+
+ std::string getLabelName(const std::string& Name);
+
+ std::string getConvModopt(unsigned CallingConvID);
+
+ std::string getArrayTypeName(Type::TypeID TyID, const Type* Ty);
+
+ std::string getPrimitiveTypeName(const Type* Ty, bool isSigned);
+
+ std::string getFunctionTypeName(const Type* Ty);
+
+ std::string getPointerTypeName(const Type* Ty);
+
+ std::string getTypeName(const Type* Ty, bool isSigned = false,
+ bool isNested = false);
+
+ ValueType getValueLocation(const Value* V);
+
+ std::string getTypePostfix(const Type* Ty, bool Expand,
+ bool isSigned = false);
+
+ void printConvToPtr();
+
+ void printPtrLoad(uint64_t N);
+
+ void printValuePtrLoad(const Value* V);
+
+ void printConstLoad(const Constant* C);
+
+ void printValueLoad(const Value* V);
+
+ void printValueSave(const Value* V);
+
+ void printBinaryInstruction(const char* Name, const Value* Left,
+ const Value* Right);
+
+ void printSimpleInstruction(const char* Inst, const char* Operand = NULL);
+
+ void printPHICopy(const BasicBlock* Src, const BasicBlock* Dst);
+
+ void printBranchToBlock(const BasicBlock* CurrBB,
+ const BasicBlock* TrueBB,
+ const BasicBlock* FalseBB);
+
+ void printBranchInstruction(const BranchInst* Inst);
+
+ void printSelectInstruction(const Value* Cond, const Value* VTrue,
+ const Value* VFalse);
+
+ void printIndirectLoad(const Value* V);
+
+ void printIndirectSave(const Value* Ptr, const Value* Val);
+
+ void printIndirectSave(const Type* Ty);
+
+ void printCastInstruction(unsigned int Op, const Value* V,
+ const Type* Ty);
+
+ void printGepInstruction(const Value* V, gep_type_iterator I,
+ gep_type_iterator E);
+
+ std::string getCallSignature(const FunctionType* Ty,
+ const Instruction* Inst,
+ std::string Name);
+
+ void printFunctionCall(const Value* FnVal, const Instruction* Inst);
+
+ void printIntrinsicCall(const IntrinsicInst* Inst);
+
+ void printCallInstruction(const Instruction* Inst);
+
+ void printICmpInstruction(unsigned Predicate, const Value* Left,
+ const Value* Right);
+
+ void printFCmpInstruction(unsigned Predicate, const Value* Left,
+ const Value* Right);
+
+ void printInvokeInstruction(const InvokeInst* Inst);
+
+ void printSwitchInstruction(const SwitchInst* Inst);
+
+ void printVAArgInstruction(const VAArgInst* Inst);
+
+ void printAllocaInstruction(const AllocaInst* Inst);
+
+ void printInstruction(const Instruction* Inst);
+
+ void printLoop(const Loop* L);
+
+ void printBasicBlock(const BasicBlock* BB);
+
+ void printLocalVariables(const Function& F);
+
+ void printFunctionBody(const Function& F);
+
+ void printConstantExpr(const ConstantExpr* CE);
+
+ void printStaticInitializerList();
+
+ void printFunction(const Function& F);
+
+ void printDeclarations(const TypeSymbolTable& ST);
+
+ unsigned int getBitWidth(const Type* Ty);
+
+ void printStaticConstant(const Constant* C, uint64_t& Offset);
+
+ void printStaticInitializer(const Constant* C, const std::string& Name);
+
+ void printVariableDefinition(const GlobalVariable* G);
+
+ void printGlobalVariables();
+
+ const char* getLibraryName(const Function* F);
+
+ const char* getLibraryName(const GlobalVariable* GV);
+
+ const char* getLibraryForSymbol(const char* Name, bool isFunction,
+ unsigned CallingConv);
+
+ void printExternals();
+ };
+}
+
+#endif
+
diff --git a/lib/Target/MSIL/Makefile b/lib/Target/MSIL/Makefile
new file mode 100644
index 0000000..94265ed
--- /dev/null
+++ b/lib/Target/MSIL/Makefile
@@ -0,0 +1,14 @@
+##===- lib/Target/MSIL/Makefile ----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMMSIL
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts := $(CompileCommonOpts) -Wno-format
diff --git a/lib/Target/MSIL/README.TXT b/lib/Target/MSIL/README.TXT
new file mode 100644
index 0000000..d797c71
--- /dev/null
+++ b/lib/Target/MSIL/README.TXT
@@ -0,0 +1,26 @@
+//===---------------------------------------------------------------------===//
+
+Vector instructions support.
+
+ShuffleVector
+ExtractElement
+InsertElement
+
+//===---------------------------------------------------------------------===//
+
+Add "OpaqueType" type.
+
+//===---------------------------------------------------------------------===//
+
+"switch" instruction emulation with CLI "switch" instruction.
+
+//===---------------------------------------------------------------------===//
+
+Write a linker for external functions, because exporting a function requires
+knowing which dynamic library the function is located in.
+
+.method static hidebysig pinvokeimpl("msvcrt.dll" cdecl)
+ void free(void*) preservesig {}
+
+
+
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
new file mode 100644
index 0000000..6701773
--- /dev/null
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(LLVM_TARGET_DEFINITIONS MSP430.td)
+
+tablegen(MSP430GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(MSP430GenRegisterNames.inc -gen-register-enums)
+tablegen(MSP430GenRegisterInfo.inc -gen-register-desc)
+tablegen(MSP430GenInstrNames.inc -gen-instr-enums)
+tablegen(MSP430GenInstrInfo.inc -gen-instr-desc)
+tablegen(MSP430GenAsmWriter.inc -gen-asm-writer)
+tablegen(MSP430GenDAGISel.inc -gen-dag-isel)
+tablegen(MSP430GenCallingConv.inc -gen-callingconv)
+tablegen(MSP430GenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(MSP430
+ MSP430AsmPrinter.cpp
+ MSP430FrameInfo.cpp
+ MSP430InstrInfo.cpp
+ MSP430ISelDAGToDAG.cpp
+ MSP430ISelLowering.cpp
+ MSP430RegisterInfo.cpp
+ MSP430Subtarget.cpp
+ MSP430TargetAsmInfo.cpp
+ MSP430TargetMachine.cpp
+ )
diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h
new file mode 100644
index 0000000..ed0cd04
--- /dev/null
+++ b/lib/Target/MSP430/MSP430.h
@@ -0,0 +1,40 @@
+//==-- MSP430.h - Top-level interface for MSP430 representation --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM MSP430 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_H
+#define LLVM_TARGET_MSP430_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class MSP430TargetMachine;
+ class FunctionPass;
+ class raw_ostream;
+
+ FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+ FunctionPass *createMSP430CodePrinterPass(raw_ostream &o,
+ MSP430TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose);
+} // end namespace llvm
+
+// Defines symbolic names for MSP430 registers.
+// This defines a mapping from register name to register number.
+#include "MSP430GenRegisterNames.inc"
+
+// Defines symbolic names for the MSP430 instructions.
+#include "MSP430GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td
new file mode 100644
index 0000000..89313ab
--- /dev/null
+++ b/lib/Target/MSP430/MSP430.td
@@ -0,0 +1,60 @@
+//===- MSP430.td - Describe the MSP430 Target Machine ---------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the MSP430 target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+//===----------------------------------------------------------------------===//
+def FeatureX
+ : SubtargetFeature<"ext", "ExtendedInsts", "true",
+ "Enable MSP430-X extensions">;
+
+//===----------------------------------------------------------------------===//
+// MSP430 supported processors.
+//===----------------------------------------------------------------------===//
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430CallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrInfo.td"
+
+def MSP430InstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def MSP430 : Target {
+ let InstructionSet = MSP430InstrInfo;
+}
+
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
new file mode 100644
index 0000000..71b785b
--- /dev/null
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -0,0 +1,267 @@
+//===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the MSP430 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class VISIBILITY_HIDDEN MSP430AsmPrinter : public AsmPrinter {
+ public:
+ MSP430AsmPrinter(raw_ostream &O, MSP430TargetMachine &TM,
+ const TargetAsmInfo *TAI,
+ CodeGenOpt::Level OL, bool V)
+ : AsmPrinter(O, TM, TAI, OL, V) {}
+
+ virtual const char *getPassName() const {
+ return "MSP430 Assembly Printer";
+ }
+
+ void printOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier = 0);
+ void printSrcMemOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier = 0);
+ void printCCOperand(const MachineInstr *MI, int OpNum);
+ bool printInstruction(const MachineInstr *MI); // autogenerated.
+ void printMachineInstruction(const MachineInstr * MI);
+
+ void emitFunctionHeader(const MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AsmPrinter::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ }
+ };
+} // end of anonymous namespace
+
+#include "MSP430GenAsmWriter.inc"
+
+/// createMSP430CodePrinterPass - Returns a pass that prints the MSP430
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createMSP430CodePrinterPass(raw_ostream &o,
+ MSP430TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new MSP430AsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
+
+bool MSP430AsmPrinter::doInitialization(Module &M) {
+ Mang = new Mangler(M, "", TAI->getPrivateGlobalPrefix());
+ return false; // success
+}
+
+
+bool MSP430AsmPrinter::doFinalization(Module &M) {
+ return AsmPrinter::doFinalization(M);
+}
+
+void MSP430AsmPrinter::emitFunctionHeader(const MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ unsigned FnAlign = 4;
+ if (F->hasFnAttr(Attribute::OptimizeForSize))
+ FnAlign = 1;
+
+ EmitAlignment(FnAlign, F);
+
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ break;
+ case Function::ExternalLinkage:
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ break;
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ O << "\t.weak\t" << CurrentFnName << '\n';
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ O << "\t.type\t" << CurrentFnName << ",@function\n"
+ << CurrentFnName << ":\n";
+}
+
+bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print the 'header' of the function.
+ emitFunctionHeader(MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (!VerboseAsm && (I->pred_empty() || I->isOnlyReachableByFallthrough())) {
+ // This is an entry block or a block that's only reachable via a
+ // fallthrough edge. In non-VerboseAsm mode, don't print the label.
+ } else {
+ printBasicBlockLabel(I, true, true, VerboseAsm);
+ O << '\n';
+ }
+
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II)
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
+
+ O.flush();
+
+ // We didn't modify anything
+ return false;
+}
+
+void MSP430AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // Call the autogenerated instruction printer routines.
+ if (printInstruction(MI))
+ return;
+
+ assert(0 && "Should not happen");
+}
+
+void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ assert (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Virtual registers should be already mapped!");
+ O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ return;
+ case MachineOperand::MO_Immediate:
+ if (!Modifier || strcmp(Modifier, "nohash"))
+ O << '#';
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+ case MachineOperand::MO_GlobalAddress: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ std::string Name = Mang->getValueName(MO.getGlobal());
+ assert(MO.getOffset() == 0 && "No offsets allowed!");
+
+ if (isCallOp)
+ O << '#';
+ else if (isMemOp)
+ O << '&';
+
+ O << Name;
+
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ std::string Name(TAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+ if (isCallOp)
+ O << '#';
+ O << Name;
+ return;
+ }
+ default:
+ assert(0 && "Not implemented yet!");
+ }
+}
+
+void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier) {
+ const MachineOperand &Base = MI->getOperand(OpNum);
+ const MachineOperand &Disp = MI->getOperand(OpNum+1);
+
+ if (Base.isGlobal())
+ printOperand(MI, OpNum, "mem");
+ else if (Disp.isImm() && !Base.getReg())
+ printOperand(MI, OpNum);
+ else if (Base.getReg()) {
+ if (Disp.getImm()) {
+ printOperand(MI, OpNum + 1, "nohash");
+ O << '(';
+ printOperand(MI, OpNum);
+ O << ')';
+ } else {
+ O << '@';
+ printOperand(MI, OpNum);
+ }
+ } else
+ assert(0 && "Unsupported memory operand");
+}
+
+void MSP430AsmPrinter::printCCOperand(const MachineInstr *MI, int OpNum) {
+ unsigned CC = MI->getOperand(OpNum).getImm();
+
+ switch (CC) {
+ default:
+ assert(0 && "Unsupported CC code");
+ break;
+ case MSP430::COND_E:
+ O << "eq";
+ break;
+ case MSP430::COND_NE:
+ O << "ne";
+ break;
+ case MSP430::COND_HS:
+ O << "hs";
+ break;
+ case MSP430::COND_LO:
+ O << "lo";
+ break;
+ case MSP430::COND_GE:
+ O << "ge";
+ break;
+ case MSP430::COND_L:
+ O << 'l';
+ break;
+ }
+}
diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
new file mode 100644
index 0000000..ad27cc9
--- /dev/null
+++ b/lib/Target/MSP430/MSP430CallingConv.td
@@ -0,0 +1,37 @@
+//==- MSP430CallingConv.td - Calling Conventions for MSP430 -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for MSP430 architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MSP430 Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_MSP430 : CallingConv<[
+ // i8 are returned in registers R15B, R14B, R13B, R12B
+ CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+
+ // i16 are returned in registers R15, R14, R13, R12
+ CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_MSP430 : CallingConv<[
+ // Promote i8 arguments to i16.
+ CCIfType<[i8], CCPromoteToType<i16>>,
+
+ // The first 4 integer arguments of non-varargs functions are passed in
+ // integer registers.
+ CCIfNotVarArg<CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>>,
+
+ // Integer values get stored in stack slots that are 2 bytes in
+ // size and 2-byte aligned.
+ CCIfType<[i16], CCAssignToStack<2, 2>>
+]>;
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
new file mode 100644
index 0000000..bf49ec0
--- /dev/null
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -0,0 +1,194 @@
+//===-- MSP430ISelDAGToDAG.cpp - A dag to dag inst selector for MSP430 ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+/// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+ class MSP430DAGToDAGISel : public SelectionDAGISel {
+ MSP430TargetLowering &Lowering;
+ const MSP430Subtarget &Subtarget;
+
+ public:
+ MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(TM, OptLevel),
+ Lowering(*TM.getTargetLowering()),
+ Subtarget(*TM.getSubtargetImpl()) { }
+
+ virtual void InstructionSelect();
+
+ virtual const char *getPassName() const {
+ return "MSP430 DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Include the pieces autogenerated from the target description.
+ #include "MSP430GenDAGISel.inc"
+
+ private:
+ SDNode *Select(SDValue Op);
+ bool SelectAddr(SDValue Op, SDValue Addr, SDValue &Base, SDValue &Disp);
+
+ #ifndef NDEBUG
+ unsigned Indent;
+ #endif
+ };
+} // end anonymous namespace
+
+/// createMSP430ISelDag - This pass converts a legalized DAG into a
+/// MSP430-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createMSP430ISelDag(MSP430TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new MSP430DAGToDAGISel(TM, OptLevel);
+}
+
+// FIXME: This is a placeholder routine and needs to be rewritten in the future.
+bool MSP430DAGToDAGISel::SelectAddr(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Disp) {
+ // Try to match frame address first.
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i16);
+ Disp = CurDAG->getTargetConstant(0, MVT::i16);
+ return true;
+ }
+
+ switch (Addr.getOpcode()) {
+ case ISD::ADD:
+ // The operand is the result of an ADD whose constant operand fits into i16.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+ uint64_t CVal = CN->getZExtValue();
+ // The offset must survive sign-extension from 16 bits, i.e. fit in i16.
+ if (((CVal << 48) >> 48) == CVal) {
+ SDValue N0 = Addr.getOperand(0);
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N0))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i16);
+ else
+ Base = N0;
+
+ Disp = CurDAG->getTargetConstant(CVal, MVT::i16);
+ return true;
+ }
+ }
+ break;
+ case MSP430ISD::Wrapper:
+ SDValue N0 = Addr.getOperand(0);
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ Base = CurDAG->getTargetGlobalAddress(G->getGlobal(),
+ MVT::i16, G->getOffset());
+ Disp = CurDAG->getTargetConstant(0, MVT::i16);
+ return true;
+    } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      Base = CurDAG->getTargetExternalSymbol(E->getSymbol(), MVT::i16);
+      Disp = CurDAG->getTargetConstant(0, MVT::i16);
+      return true;
+    }
+ break;
+  }
+
+ Base = Addr;
+ Disp = CurDAG->getTargetConstant(0, MVT::i16);
+
+ return true;
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void MSP430DAGToDAGISel::InstructionSelect() {
+ DEBUG(BB->dump());
+
+ // Codegen the basic block.
+#ifndef NDEBUG
+ DOUT << "===== Instruction selection begins:\n";
+ Indent = 0;
+#endif
+ SelectRoot(*CurDAG);
+#ifndef NDEBUG
+ DOUT << "===== Instruction selection ends:\n";
+#endif
+
+ CurDAG->RemoveDeadNodes();
+}
+
+SDNode *MSP430DAGToDAGISel::Select(SDValue Op) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Dump information about the Node being selected
+ #ifndef NDEBUG
+ DOUT << std::string(Indent, ' ') << "Selecting: ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent += 2;
+ #endif
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ #ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "== ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+ #endif
+ return NULL;
+ }
+
+  // A few custom selection cases.
+ switch (Node->getOpcode()) {
+ default: break;
+ case ISD::FrameIndex: {
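+    // Materialize the frame address as an ADD16ri of the target frame
+    // index plus 0; the frame index is later rewritten to a stack/frame
+    // pointer plus a concrete offset during prolog/epilog insertion.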
+ assert(Op.getValueType() == MVT::i16);
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
+ if (Node->hasOneUse())
+ return CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16,
+ TFI, CurDAG->getTargetConstant(0, MVT::i16));
+ return CurDAG->getTargetNode(MSP430::ADD16ri, dl, MVT::i16,
+ TFI, CurDAG->getTargetConstant(0, MVT::i16));
+ }
+ }
+
+ // Select the default instruction
+ SDNode *ResNode = SelectCode(Op);
+
+ #ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ if (ResNode == NULL || ResNode == Op.getNode())
+ DEBUG(Op.getNode()->dump(CurDAG));
+ else
+ DEBUG(ResNode->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+ #endif
+
+ return ResNode;
+}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
new file mode 100644
index 0000000..14db20e
--- /dev/null
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -0,0 +1,670 @@
+//===-- MSP430ISelLowering.cpp - MSP430 DAG Lowering Implementation ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-lower"
+
+#include "MSP430ISelLowering.h"
+#include "MSP430.h"
+#include "MSP430TargetMachine.h"
+#include "MSP430Subtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/VectorExtras.h"
+using namespace llvm;
+
+MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
+ TargetLowering(tm), Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, MSP430::GR8RegisterClass);
+ addRegisterClass(MVT::i16, MSP430::GR16RegisterClass);
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties();
+
+ // Provide all sorts of operation actions
+
+ // Division is expensive
+ setIntDivIsCheap(false);
+
+  // Even though we only have single-bit shift instructions, we can emulate
+  // shifts of any width one bit per step.
+ setShiftAmountType(MVT::i8);
+
+ setStackPointerRegisterToSaveRestore(MSP430::SPW);
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setSchedulingPreference(SchedulingForLatency);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+
+ // We don't have any truncstores
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ setOperationAction(ISD::SRA, MVT::i8, Custom);
+ setOperationAction(ISD::SHL, MVT::i8, Custom);
+ setOperationAction(ISD::SRL, MVT::i8, Custom);
+ setOperationAction(ISD::SRA, MVT::i16, Custom);
+ setOperationAction(ISD::SHL, MVT::i16, Custom);
+ setOperationAction(ISD::SRL, MVT::i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::i8, Expand);
+ setOperationAction(ISD::ROTR, MVT::i8, Expand);
+ setOperationAction(ISD::ROTL, MVT::i16, Expand);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand);
+ setOperationAction(ISD::RET, MVT::Other, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i8, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i16, Custom);
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::SETCC, MVT::i8, Expand);
+ setOperationAction(ISD::SETCC, MVT::i16, Expand);
+ setOperationAction(ISD::SELECT, MVT::i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom);
+
+  // FIXME: Implement multiplication by a constant efficiently
+ setOperationAction(ISD::MUL, MVT::i16, Expand);
+ setOperationAction(ISD::MULHS, MVT::i16, Expand);
+ setOperationAction(ISD::MULHU, MVT::i16, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
+
+ setOperationAction(ISD::UDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
+ setOperationAction(ISD::UREM, MVT::i16, Expand);
+ setOperationAction(ISD::SDIV, MVT::i16, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
+ setOperationAction(ISD::SREM, MVT::i16, Expand);
+}
+
+SDValue MSP430TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
+ case ISD::SHL: // FALLTHROUGH
+ case ISD::SRL:
+ case ISD::SRA: return LowerShifts(Op, DAG);
+ case ISD::RET: return LowerRET(Op, DAG);
+ case ISD::CALL: return LowerCALL(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+ default:
+ assert(0 && "unimplemented operand");
+ return SDValue();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "MSP430GenCallingConv.inc"
+
+SDValue MSP430TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op,
+ SelectionDAG &DAG) {
+ unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (CC) {
+ default:
+ assert(0 && "Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return LowerCCCArguments(Op, DAG);
+ }
+}
+
+SDValue MSP430TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ unsigned CallingConv = TheCall->getCallingConv();
+ switch (CallingConv) {
+ default:
+ assert(0 && "Unsupported calling convention");
+ case CallingConv::Fast:
+ case CallingConv::C:
+ return LowerCCCCallTo(Op, DAG, CallingConv);
+ }
+}
+
+/// LowerCCCArguments - transform physical registers into virtual registers and
+/// generate load operations for arguments placed on the stack.
+// FIXME: struct return stuff
+// FIXME: varargs
+SDValue MSP430TargetLowering::LowerCCCArguments(SDValue Op,
+ SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ SDValue Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ unsigned CC = MF.getFunction()->getCallingConv();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_MSP430);
+
+ assert(!isVarArg && "Varargs not supported yet");
+
+ SmallVector<SDValue, 16> ArgValues;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers
+ MVT RegVT = VA.getLocVT();
+ switch (RegVT.getSimpleVT()) {
+ default:
+ cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
+ << RegVT.getSimpleVT()
+ << "\n";
+ abort();
+ case MVT::i16:
+ unsigned VReg =
+ RegInfo.createVirtualRegister(MSP430::GR16RegisterClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ SDValue ArgValue = DAG.getCopyFromReg(Root, dl, VReg, RegVT);
+
+ // If this is an 8-bit value, it is really passed promoted to 16
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+ ArgValues.push_back(ArgValue);
+ }
+ } else {
+ // Sanity check
+ assert(VA.isMemLoc());
+ // Load the argument to a virtual register
+ unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+ if (ObjSize > 2) {
+ cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
+ << VA.getLocVT().getSimpleVT()
+ << "\n";
+ }
+ // Create the frame index object for this incoming parameter...
+ int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset());
+
+      // Create the SelectionDAG nodes corresponding to a load
+      // from this parameter.
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
+ ArgValues.push_back(DAG.getLoad(VA.getLocVT(), dl, Root, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0));
+ }
+ }
+
+ ArgValues.push_back(Root);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+}
+
+SDValue MSP430TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
+ // CCValAssign - represent the assignment of the return value to a location
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+
+  // Analyze return values of ISD::RET
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_MSP430);
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ // The chain is always operand #0
+ SDValue Chain = Op.getOperand(0);
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // ISD::RET => ret chain, (regnum1,val1), ...
+ // So i*2+1 index only the regnums
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ Op.getOperand(i*2+1), Flag);
+
+    // Glue the copies together so that they are emitted back-to-back and
+    // nothing can be scheduled in between them.
+ Flag = Chain.getValue(1);
+ }
+
+ if (Flag.getNode())
+ return DAG.getNode(MSP430ISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+
+ // Return Void
+ return DAG.getNode(MSP430ISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+/// TODO: sret.
+SDValue MSP430TargetLowering::LowerCCCCallTo(SDValue Op, SelectionDAG &DAG,
+ unsigned CC) {
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ SDValue Chain = TheCall->getChain();
+ SDValue Callee = TheCall->getCallee();
+ bool isVarArg = TheCall->isVarArg();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+
+ CCInfo.AnalyzeCallOperands(TheCall, CC_MSP430);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes,
+ getPointerTy(), true));
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+    // Arguments start after the first 5 operands of ISD::CALL
+ SDValue Arg = TheCall->getArg(i);
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+    // Arguments that are passed in registers are collected in the RegsToPass
+    // vector.
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SPW, getPointerTy());
+
+ SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset()));
+
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+ PseudoSourceValue::getStack(),
+ VA.getLocMemOffset()));
+ }
+ }
+
+ // Transform all store nodes into one single node because all store nodes are
+ // independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers. The InFlag is
+  // necessary since all emitted instructions must be stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i16);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i16);
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getConstant(NumBytes, getPointerTy(), true),
+ DAG.getConstant(0, getPointerTy(), true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
+ Op.getResNo());
+}
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers. This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. Returns an SDNode with the same number of values as the
+/// ISD::CALL.
+SDNode*
+MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallSDNode *TheCall,
+ unsigned CallingConv,
+ SelectionDAG &DAG) {
+ bool isVarArg = TheCall->isVarArg();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+
+ CCInfo.AnalyzeCallResult(TheCall, RetCC_MSP430);
+ SmallVector<SDValue, 8> ResultVals;
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
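+    // getCopyFromReg yields (value, chain, flag); taking getValue(1) leaves
+    // Chain pointing at that node, so getValue(0) below is the register
+    // value and getValue(2) the glue consumed by the next copy.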
+ Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+ RVLocs[i].getValVT(), InFlag).getValue(1);
+ InFlag = Chain.getValue(2);
+ ResultVals.push_back(Chain.getValue(0));
+ }
+
+ ResultVals.push_back(Chain);
+
+ // Merge everything together with a MERGE_VALUES node.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
+ &ResultVals[0], ResultVals.size()).getNode();
+}
+
+SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
+ SelectionDAG &DAG) {
+ unsigned Opc = Op.getOpcode();
+ SDNode* N = Op.getNode();
+ MVT VT = Op.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+  // We currently only lower shifts by a constant amount.
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ return SDValue();
+
+ uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+
+  // Expand into a sequence of single-bit shifts.
+ // FIXME: for some shift amounts this might be done better!
+ // E.g.: foo >> (8 + N) => sxt(swpb(foo)) >> N
+ SDValue Victim = N->getOperand(0);
+
+ if (Opc == ISD::SRL && ShiftAmount) {
+    // Handle the first step of a logical right shift specially:
+ // srl A, 1 => clrc; rrc A
+ Victim = DAG.getNode(MSP430ISD::RRC, dl, VT, Victim);
+ ShiftAmount -= 1;
+ }
+
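+  // For illustration, srl A, 3 expands to:
+  //   clrc; rrc A   ; rotate a zero carry into the MSB
+  //   rra A; rra A  ; the MSB is now zero, so arithmetic right shifts
+  //                 ; behave as logical ones for the remaining steps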
+ while (ShiftAmount--)
+ Victim = DAG.getNode((Opc == ISD::SHL ? MSP430ISD::RLA : MSP430ISD::RRA),
+ dl, VT, Victim);
+
+ return Victim;
+}
+
+SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+ // Create the TargetGlobalAddress node, folding in the constant offset.
+ SDValue Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+ return DAG.getNode(MSP430ISD::Wrapper, Op.getDebugLoc(),
+ getPointerTy(), Result);
+}
+
+SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+
+  return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
+}
+
+static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, unsigned &TargetCC,
+ ISD::CondCode CC,
+ DebugLoc dl, SelectionDAG &DAG) {
+ // FIXME: Handle bittests someday
+ assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
+
+ // FIXME: Handle jump negative someday
+ TargetCC = MSP430::COND_INVALID;
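+  // MSP430 has no jump-if-less-or-equal / jump-if-greater condition codes,
+  // so those orderings are obtained by swapping the CMP operands below;
+  // e.g. (a <= b) is evaluated as (b >= a).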
+ switch (CC) {
+ default: assert(0 && "Invalid integer condition!");
+ case ISD::SETEQ:
+ TargetCC = MSP430::COND_E; // aka COND_Z
+ break;
+ case ISD::SETNE:
+ TargetCC = MSP430::COND_NE; // aka COND_NZ
+ break;
+ case ISD::SETULE:
+ std::swap(LHS, RHS); // FALLTHROUGH
+ case ISD::SETUGE:
+ TargetCC = MSP430::COND_HS; // aka COND_C
+ break;
+ case ISD::SETUGT:
+ std::swap(LHS, RHS); // FALLTHROUGH
+ case ISD::SETULT:
+ TargetCC = MSP430::COND_LO; // aka COND_NC
+ break;
+ case ISD::SETLE:
+ std::swap(LHS, RHS); // FALLTHROUGH
+ case ISD::SETGE:
+ TargetCC = MSP430::COND_GE;
+ break;
+ case ISD::SETGT:
+ std::swap(LHS, RHS); // FALLTHROUGH
+ case ISD::SETLT:
+ TargetCC = MSP430::COND_L;
+ break;
+ }
+
+ return DAG.getNode(MSP430ISD::CMP, dl, MVT::Flag, LHS, RHS);
+}
+
+SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ DebugLoc dl = Op.getDebugLoc();
+
+ unsigned TargetCC = MSP430::COND_INVALID;
+ SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+ return DAG.getNode(MSP430ISD::BR_CC, dl, Op.getValueType(),
+ Chain,
+ Dest, DAG.getConstant(TargetCC, MVT::i8),
+ Flag);
+}
+
+SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue TrueV = Op.getOperand(2);
+ SDValue FalseV = Op.getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ DebugLoc dl = Op.getDebugLoc();
+
+ unsigned TargetCC = MSP430::COND_INVALID;
+ SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(TrueV);
+ Ops.push_back(FalseV);
+ Ops.push_back(DAG.getConstant(TargetCC, MVT::i8));
+ Ops.push_back(Flag);
+
+ return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size());
+}
+
+SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+ SelectionDAG &DAG) {
+ SDValue Val = Op.getOperand(0);
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ assert(VT == MVT::i16 && "Only support i16 for now!");
+
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT,
+ DAG.getNode(ISD::ANY_EXTEND, dl, VT, Val),
+ DAG.getValueType(Val.getValueType()));
+}
+
+const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return NULL;
+ case MSP430ISD::RET_FLAG: return "MSP430ISD::RET_FLAG";
+ case MSP430ISD::RRA: return "MSP430ISD::RRA";
+ case MSP430ISD::RLA: return "MSP430ISD::RLA";
+ case MSP430ISD::RRC: return "MSP430ISD::RRC";
+ case MSP430ISD::CALL: return "MSP430ISD::CALL";
+ case MSP430ISD::Wrapper: return "MSP430ISD::Wrapper";
+ case MSP430ISD::BR_CC: return "MSP430ISD::BR_CC";
+ case MSP430ISD::CMP: return "MSP430ISD::CMP";
+ case MSP430ISD::SELECT_CC: return "MSP430ISD::SELECT_CC";
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock*
+MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+ DebugLoc dl = MI->getDebugLoc();
+ assert((MI->getOpcode() == MSP430::Select16 ||
+ MI->getOpcode() == MSP430::Select8) &&
+ "Unexpected instr type to insert");
+
+ // To "insert" a SELECT instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator I = BB;
+ ++I;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // jCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ BuildMI(BB, dl, TII.get(MSP430::JCC))
+ .addMBB(copy1MBB)
+ .addImm(MI->getOperand(3).getImm());
+ F->insert(I, copy0MBB);
+ F->insert(I, copy1MBB);
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ copy1MBB->transferSuccessors(BB);
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(copy1MBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to copy1MBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(copy1MBB);
+
+ // copy1MBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = copy1MBB;
+ BuildMI(BB, dl, TII.get(MSP430::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
new file mode 100644
index 0000000..404534d
--- /dev/null
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -0,0 +1,103 @@
+//==-- MSP430ISelLowering.h - MSP430 DAG Lowering Interface ------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that MSP430 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_ISELLOWERING_H
+#define LLVM_TARGET_MSP430_ISELLOWERING_H
+
+#include "MSP430.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+ namespace MSP430ISD {
+ enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// Return with a flag operand. Operand 0 is the chain operand.
+ RET_FLAG,
+
+ /// Y = R{R,L}A X, rotate right (left) arithmetically
+ RRA, RLA,
+
+ /// Y = RRC X, rotate right via carry
+ RRC,
+
+    /// CALL/TAILCALL - These operations represent an abstract call
+    /// instruction, which carries the callee, the argument registers, and
+    /// the chain and flag operands.
+ CALL,
+
+ /// Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+ /// and TargetGlobalAddress.
+ Wrapper,
+
+ /// CMP - Compare instruction.
+ CMP,
+
+ /// SetCC. Operand 0 is condition code, and operand 1 is the flag
+ /// operand produced by a CMP instruction.
+ SETCC,
+
+ /// MSP430 conditional branches. Operand 0 is the chain operand, operand 1
+    /// is the block to branch to if the condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// instruction.
+ BR_CC,
+
+    /// SELECT_CC. Operands 0 and 1 are the values to select between, operand 2
+    /// is the condition code, and operand 3 is the flag operand produced by a
+    /// CMP instruction.
+ SELECT_CC
+ };
+ }
+
+ class MSP430Subtarget;
+ class MSP430TargetMachine;
+
+ class MSP430TargetLowering : public TargetLowering {
+ public:
+ explicit MSP430TargetLowering(MSP430TargetMachine &TM);
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCCCArguments(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerShifts(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG);
+
+ SDValue LowerCCCCallTo(SDValue Op, SelectionDAG &DAG,
+ unsigned CC);
+ SDNode* LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG);
+
+ MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ private:
+ const MSP430Subtarget &Subtarget;
+ const MSP430TargetMachine &TM;
+ };
+} // namespace llvm
+
+#endif // LLVM_TARGET_MSP430_ISELLOWERING_H
diff --git a/lib/Target/MSP430/MSP430InstrFormats.td b/lib/Target/MSP430/MSP430InstrFormats.td
new file mode 100644
index 0000000..61b3399
--- /dev/null
+++ b/lib/Target/MSP430/MSP430InstrFormats.td
@@ -0,0 +1,67 @@
+//===- MSP430InstrFormats.td - MSP430 Instruction Formats ----*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe the MSP430 instruction formats here
+//
+
+// Generic MSP430 Format
+class MSP430Inst<dag outs, dag ins, string asmstr> : Instruction {
+ field bits<16> Inst;
+
+ let Namespace = "MSP430";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+
+ let AsmString = asmstr;
+}
+
+// FIXME: Create different classes for different addressing modes.
+
+// MSP430 Double Operand (Format I) Instructions
+class IForm<bits<4> opcode, bit ad, bit bw, bits<2> as,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : MSP430Inst<outs, ins, asmstr> {
+ let Pattern = pattern;
+
+ let Inst{12-15} = opcode;
+ let Inst{7} = ad;
+ let Inst{6} = bw;
+ let Inst{4-5} = as;
+}
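+
+// Note: in the actual MSP430 Format I encoding the remaining fields are the
+// source register in Inst{8-11} and the destination register in Inst{0-3};
+// they are not modelled here yet, since all instructions are still emitted
+// as pseudos.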
+
+// MSP430 Single Operand (Format II) Instructions
+class IIForm<bits<9> opcode, bit bw, bits<2> ad,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : MSP430Inst<outs, ins, asmstr> {
+ let Pattern = pattern;
+
+ let Inst{7-15} = opcode;
+ let Inst{6} = bw;
+ let Inst{4-5} = ad;
+}
+
+// MSP430 Conditional Jumps Instructions
+class CJForm<bits<3> opcode, bits<3> cond, bit s,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : MSP430Inst<outs, ins, asmstr> {
+ let Pattern = pattern;
+
+ let Inst{13-15} = opcode;
+ let Inst{10-12} = cond;
+ let Inst{9} = s;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : MSP430Inst<outs, ins, asmstr> {
+ let Pattern = pattern;
+ let Inst{15-0} = 0;
+}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
new file mode 100644
index 0000000..91112c3
--- /dev/null
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -0,0 +1,177 @@
+//===- MSP430InstrInfo.cpp - MSP430 Instruction Information ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "MSP430GenInstrInfo.inc"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+using namespace llvm;
+
+MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
+ : TargetInstrInfoImpl(MSP430Insts, array_lengthof(MSP430Insts)),
+ RI(tm, *this), TM(tm) {}
+
+void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (RC == &MSP430::GR16RegClass)
+ BuildMI(MBB, MI, DL, get(MSP430::MOV16mr))
+ .addFrameIndex(FrameIdx).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill));
+ else if (RC == &MSP430::GR8RegClass)
+ BuildMI(MBB, MI, DL, get(MSP430::MOV8mr))
+ .addFrameIndex(FrameIdx).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill));
+ else
+ assert(0 && "Cannot store this register to stack slot!");
+}
+
+void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const{
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (RC == &MSP430::GR16RegClass)
+ BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
+ .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0);
+ else if (RC == &MSP430::GR8RegClass)
+ BuildMI(MBB, MI, DL, get(MSP430::MOV8rm))
+ .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0);
+ else
+ assert(0 && "Cannot store this register to stack slot!");
+}
+
+bool MSP430InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (DestRC == SrcRC) {
+ unsigned Opc;
+ if (DestRC == &MSP430::GR16RegClass) {
+ Opc = MSP430::MOV16rr;
+ } else if (DestRC == &MSP430::GR8RegClass) {
+ Opc = MSP430::MOV8rr;
+ } else {
+ return false;
+ }
+
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ return false;
+}
+
+bool
+MSP430InstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+ SrcSubIdx = DstSubIdx = 0; // No sub-registers yet.
+
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case MSP430::MOV8rr:
+ case MSP430::MOV16rr:
+ assert(MI.getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "invalid register-register move instruction");
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ return true;
+ }
+}
+
+bool
+MSP430InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ MachineFunction &MF = *MBB.getParent();
+ MSP430MachineFunctionInfo *MFI = MF.getInfo<MSP430MachineFunctionInfo>();
+ MFI->setCalleeSavedFrameSize(CSI.size() * 2);
+
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ BuildMI(MBB, MI, DL, get(MSP430::PUSH16r))
+ .addReg(Reg, RegState::Kill);
+ }
+ return true;
+}
+
+bool
+MSP430InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+ BuildMI(MBB, MI, DL, get(MSP430::POP16r), CSI[i].getReg());
+
+ return true;
+}
+
+unsigned
+MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME: this should probably have a DebugLoc operand
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "MSP430 branch conditions have one component!");
+
+ if (Cond.empty()) {
+    // Unconditional branch.
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, dl, get(MSP430::JMP)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
+ assert(0 && "Implement conditional branches!");
+
+ return Count;
+}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
new file mode 100644
index 0000000..e07aaca
--- /dev/null
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -0,0 +1,84 @@
+//===- MSP430InstrInfo.h - MSP430 Instruction Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430INSTRINFO_H
+#define LLVM_TARGET_MSP430INSTRINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "MSP430RegisterInfo.h"
+
+namespace llvm {
+
+class MSP430TargetMachine;
+
+namespace MSP430 {
+ // MSP430 specific condition code.
+ enum CondCode {
+ COND_E = 0, // aka COND_Z
+ COND_NE = 1, // aka COND_NZ
+ COND_HS = 2, // aka COND_C
+ COND_LO = 3, // aka COND_NC
+ COND_GE = 4,
+ COND_L = 5,
+
+ COND_INVALID
+ };
+}
+
+class MSP430InstrInfo : public TargetInstrInfoImpl {
+ const MSP430RegisterInfo RI;
+ MSP430TargetMachine &TM;
+public:
+ explicit MSP430InstrInfo(MSP430TargetMachine &TM);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+ bool copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ bool isMoveInstr(const MachineInstr& MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC) const;
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const;
+
+ virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+ virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+
+};
+
+}
+
+#endif
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
new file mode 100644
index 0000000..39c08e4
--- /dev/null
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -0,0 +1,901 @@
+//===- MSP430InstrInfo.td - MSP430 Instruction defs ----------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the MSP430 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Type Constraints.
+//===----------------------------------------------------------------------===//
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+
+//===----------------------------------------------------------------------===//
+// Type Profiles.
+//===----------------------------------------------------------------------===//
+def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def SDT_MSP430Cmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_MSP430BrCC : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>]>;
+def SDT_MSP430SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+def MSP430retflag : SDNode<"MSP430ISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def MSP430rra : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
+def MSP430rla : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
+def MSP430rrc : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
+
+def MSP430call : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+def MSP430callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_MSP430CallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def MSP430callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_MSP430CallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def MSP430Wrapper : SDNode<"MSP430ISD::Wrapper", SDT_MSP430Wrapper>;
+def MSP430cmp : SDNode<"MSP430ISD::CMP", SDT_MSP430Cmp, [SDNPOutFlag]>;
+def MSP430brcc : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC, [SDNPHasChain, SDNPInFlag]>;
+def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC, [SDNPInFlag]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+// Address operands
+def memsrc : Operand<i16> {
+ let PrintMethod = "printSrcMemOperand";
+ let MIOperandInfo = (ops GR16, i16imm);
+}
+
+def memdst : Operand<i16> {
+ let PrintMethod = "printSrcMemOperand";
+ let MIOperandInfo = (ops GR16, i16imm);
+}
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+
+// Operand for printing out a condition code.
+def cc : Operand<i8> {
+ let PrintMethod = "printCCOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// MSP430 Complex Pattern Definitions.
+//===----------------------------------------------------------------------===//
+
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list..
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SRW.
+let Defs = [SPW, SRW], Uses = [SPW] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(MSP430callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomDAGSchedInserter = 1 in {
+ def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cc),
+ "# Select8 PSEUDO",
+ [(set GR8:$dst,
+ (MSP430selectcc GR8:$src1, GR8:$src2, imm:$cc))]>;
+ def Select16 : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cc),
+ "# Select16 PSEUDO",
+ [(set GR16:$dst,
+ (MSP430selectcc GR16:$src1, GR16:$src2, imm:$cc))]>;
+}
+
+let neverHasSideEffects = 1 in
+def NOP : Pseudo<(outs), (ins), "nop", []>;
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions...
+//
+
+// FIXME: Provide proper encoding!
+let isReturn = 1, isTerminator = 1 in {
+ def RET : Pseudo<(outs), (ins), "ret", [(MSP430retflag)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+
+// Direct branch
+let isBarrier = 1 in
+ def JMP : Pseudo<(outs), (ins brtarget:$dst),
+ "jmp\t$dst",
+ [(br bb:$dst)]>;
+
+// Conditional branches
+let Uses = [SRW] in
+ def JCC : Pseudo<(outs), (ins brtarget:$dst, cc:$cc),
+ "j$cc $dst",
+ [(MSP430brcc bb:$dst, imm:$cc)]>;
+} // isBranch, isTerminator
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. SPW is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [R12W, R13W, R14W, R15W, SRW],
+ Uses = [SPW] in {
+ def CALLi : Pseudo<(outs), (ins i16imm:$dst, variable_ops),
+ "call\t${dst:call}", [(MSP430call imm:$dst)]>;
+ def CALLr : Pseudo<(outs), (ins GR16:$dst, variable_ops),
+ "call\t$dst", [(MSP430call GR16:$dst)]>;
+ def CALLm : Pseudo<(outs), (ins memsrc:$dst, variable_ops),
+ "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>;
+ }
+
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+let Defs = [SPW], Uses = [SPW], neverHasSideEffects=1 in {
+let mayLoad = 1 in
+def POP16r : Pseudo<(outs GR16:$reg), (ins), "pop.w\t$reg", []>;
+
+let mayStore = 1 in
+def PUSH16r : Pseudo<(outs), (ins GR16:$reg), "push.w\t$reg",[]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// FIXME: Provide proper encoding!
+let neverHasSideEffects = 1 in {
+def MOV8rr : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+ "mov.b\t{$src, $dst}",
+ []>;
+def MOV16rr : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+ "mov.w\t{$src, $dst}",
+ []>;
+}
+
+// FIXME: Provide proper encoding!
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : Pseudo<(outs GR8:$dst), (ins i8imm:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Pseudo<(outs GR16:$dst), (ins i16imm:$src),
+ "mov.w\t{$src, $dst}",
+ [(set GR16:$dst, imm:$src)]>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def MOV8rm : Pseudo<(outs GR8:$dst), (ins memsrc:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : Pseudo<(outs GR16:$dst), (ins memsrc:$src),
+ "mov.w\t{$src, $dst}",
+ [(set GR16:$dst, (load addr:$src))]>;
+}
+
+def MOVZX16rr8 : Pseudo<(outs GR16:$dst), (ins GR8:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR16:$dst, (zext GR8:$src))]>;
+def MOVZX16rm8 : Pseudo<(outs GR16:$dst), (ins memsrc:$src),
+ "mov.b\t{$src, $dst}",
+ [(set GR16:$dst, (zextloadi16i8 addr:$src))]>;
+
+// Any instruction that defines an 8-bit result clears the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may
+// be copying from a truncate, but any other 8-bit operation will zero-extend
+// up to 16 bits.
+def def8 : PatLeaf<(i8 GR8:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetInstrInfo::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of an 8-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i16 (zext def8:$src)),
+ (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
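+
+// For illustration: (i16 (zext (add GR8:$a, GR8:$b))) can be selected as a
+// SUBREG_TO_REG of the 8-bit add, while (i16 (zext (trunc GR16:$x))) cannot,
+// since the truncate gives no guarantee about the high byte.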
+
+def MOV8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
+ "mov.b\t{$src, $dst}",
+ [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
+ "mov.w\t{$src, $dst}",
+ [(store (i16 imm:$src), addr:$dst)]>;
+
+def MOV8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
+ "mov.b\t{$src, $dst}",
+ [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
+ "mov.w\t{$src, $dst}",
+ [(store GR16:$src, addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+let isTwoAddress = 1 in {
+
+let Defs = [SRW] in {
+
+let isCommutable = 1 in { // X = ADD Y, Z == X = ADD Z, Y
+// FIXME: Provide proper encoding!
+def ADD8rr : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "add.b\t{$src2, $dst}",
+ [(set GR8:$dst, (add GR8:$src1, GR8:$src2)),
+ (implicit SRW)]>;
+def ADD16rr : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "add.w\t{$src2, $dst}",
+ [(set GR16:$dst, (add GR16:$src1, GR16:$src2)),
+ (implicit SRW)]>;
+}
+
+def ADD8rm : Pseudo<(outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+ "add.b\t{$src2, $dst}",
+ [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+def ADD16rm : Pseudo<(outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+ "add.w\t{$src2, $dst}",
+ [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+
+def ADD8ri : Pseudo<(outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "add.b\t{$src2, $dst}",
+ [(set GR8:$dst, (add GR8:$src1, imm:$src2)),
+ (implicit SRW)]>;
+def ADD16ri : Pseudo<(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "add.w\t{$src2, $dst}",
+ [(set GR16:$dst, (add GR16:$src1, imm:$src2)),
+ (implicit SRW)]>;
+
+let isTwoAddress = 0 in {
+def ADD8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
+ "add.b\t{$src, $dst}",
+ [(store (add (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SRW)]>;
+def ADD16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
+ "add.w\t{$src, $dst}",
+ [(store (add (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SRW)]>;
+
+def ADD8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
+ "add.b\t{$src, $dst}",
+ [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+def ADD16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
+ "add.w\t{$src, $dst}",
+ [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+
+def ADD8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "add.b\t{$src, $dst}",
+ [(store (add (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+def ADD16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "add.w\t{$src, $dst}",
+ [(store (add (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+}
+
+let Uses = [SRW] in {
+
+let isCommutable = 1 in { // X = ADDC Y, Z == X = ADDC Z, Y
+def ADC8rr : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "addc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (adde GR8:$src1, GR8:$src2)),
+ (implicit SRW)]>;
+def ADC16rr : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "addc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (adde GR16:$src1, GR16:$src2)),
+ (implicit SRW)]>;
+} // isCommutable
+
+def ADC8ri : Pseudo<(outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "addc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (adde GR8:$src1, imm:$src2)),
+ (implicit SRW)]>;
+def ADC16ri : Pseudo<(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "addc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (adde GR16:$src1, imm:$src2)),
+ (implicit SRW)]>;
+
+def ADC8rm : Pseudo<(outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+ "addc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (adde GR8:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+def ADC16rm : Pseudo<(outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+ "addc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (adde GR16:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+
+let isTwoAddress = 0 in {
+def ADC8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
+ "addc.b\t{$src, $dst}",
+ [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SRW)]>;
+def ADC16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
+ "addc.w\t{$src, $dst}",
+ [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SRW)]>;
+
+def ADC8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
+ "addc.b\t{$src, $dst}",
+ [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+def ADC16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
+ "addc.w\t{$src, $dst}",
+ [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+
+def ADC8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "addc.b\t{$src, $dst}",
+ [(store (adde (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+def ADC16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "addc.w\t{$src, $dst}",
+ [(store (adde (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+}
+
+} // Uses = [SRW]
+
+let isCommutable = 1 in { // X = AND Y, Z == X = AND Z, Y
+def AND8rr : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "and.b\t{$src2, $dst}",
+ [(set GR8:$dst, (and GR8:$src1, GR8:$src2)),
+ (implicit SRW)]>;
+def AND16rr : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "and.w\t{$src2, $dst}",
+ [(set GR16:$dst, (and GR16:$src1, GR16:$src2)),
+ (implicit SRW)]>;
+}
+
+def AND8ri : Pseudo<(outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "and.b\t{$src2, $dst}",
+ [(set GR8:$dst, (and GR8:$src1, imm:$src2)),
+ (implicit SRW)]>;
+def AND16ri : Pseudo<(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "and.w\t{$src2, $dst}",
+ [(set GR16:$dst, (and GR16:$src1, imm:$src2)),
+ (implicit SRW)]>;
+
+def AND8rm : Pseudo<(outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+ "and.b\t{$src2, $dst}",
+ [(set GR8:$dst, (and GR8:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+def AND16rm : Pseudo<(outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+ "and.w\t{$src2, $dst}",
+ [(set GR16:$dst, (and GR16:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+
+let isTwoAddress = 0 in {
+def AND8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
+ "and.b\t{$src, $dst}",
+ [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SRW)]>;
+def AND16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
+ "and.w\t{$src, $dst}",
+ [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SRW)]>;
+
+def AND8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
+ "and.b\t{$src, $dst}",
+ [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+def AND16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
+ "and.w\t{$src, $dst}",
+ [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+
+def AND8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "and.b\t{$src, $dst}",
+ [(store (and (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+def AND16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "and.w\t{$src, $dst}",
+ [(store (and (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+}
+
+
+let isCommutable = 1 in { // X = XOR Y, Z == X = XOR Z, Y
+def XOR8rr : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "xor.b\t{$src2, $dst}",
+ [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)),
+ (implicit SRW)]>;
+def XOR16rr : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "xor.w\t{$src2, $dst}",
+ [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)),
+ (implicit SRW)]>;
+}
+
+def XOR8ri : Pseudo<(outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "xor.b\t{$src2, $dst}",
+ [(set GR8:$dst, (xor GR8:$src1, imm:$src2)),
+ (implicit SRW)]>;
+def XOR16ri : Pseudo<(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "xor.w\t{$src2, $dst}",
+ [(set GR16:$dst, (xor GR16:$src1, imm:$src2)),
+ (implicit SRW)]>;
+
+def XOR8rm : Pseudo<(outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+ "xor.b\t{$src2, $dst}",
+ [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+def XOR16rm : Pseudo<(outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+ "xor.w\t{$src2, $dst}",
+ [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+
+let isTwoAddress = 0 in {
+def XOR8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
+ "xor.b\t{$src, $dst}",
+ [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SRW)]>;
+def XOR16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
+ "xor.w\t{$src, $dst}",
+ [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SRW)]>;
+
+def XOR8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
+ "xor.b\t{$src, $dst}",
+ [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+def XOR16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
+ "xor.w\t{$src, $dst}",
+ [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+
+def XOR8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "xor.b\t{$src, $dst}",
+ [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+def XOR16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "xor.w\t{$src, $dst}",
+ [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+}
+
+
+def SUB8rr : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "sub.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)),
+ (implicit SRW)]>;
+def SUB16rr : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "sub.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)),
+ (implicit SRW)]>;
+
+def SUB8ri : Pseudo<(outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "sub.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sub GR8:$src1, imm:$src2)),
+ (implicit SRW)]>;
+def SUB16ri : Pseudo<(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "sub.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sub GR16:$src1, imm:$src2)),
+ (implicit SRW)]>;
+
+def SUB8rm : Pseudo<(outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+ "sub.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+def SUB16rm : Pseudo<(outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+ "sub.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+
+let isTwoAddress = 0 in {
+def SUB8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
+ "sub.b\t{$src, $dst}",
+ [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SRW)]>;
+def SUB16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
+ "sub.w\t{$src, $dst}",
+ [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SRW)]>;
+
+def SUB8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
+ "sub.b\t{$src, $dst}",
+ [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+def SUB16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
+ "sub.w\t{$src, $dst}",
+ [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+
+def SUB8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "sub.b\t{$src, $dst}",
+ [(store (sub (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+def SUB16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "sub.w\t{$src, $dst}",
+ [(store (sub (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+}
+
+let Uses = [SRW] in {
+def SBC8rr : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "subc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sube GR8:$src1, GR8:$src2)),
+ (implicit SRW)]>;
+def SBC16rr : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "subc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sube GR16:$src1, GR16:$src2)),
+ (implicit SRW)]>;
+
+def SBC8ri : Pseudo<(outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "subc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sube GR8:$src1, imm:$src2)),
+ (implicit SRW)]>;
+def SBC16ri : Pseudo<(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "subc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sube GR16:$src1, imm:$src2)),
+ (implicit SRW)]>;
+
+def SBC8rm : Pseudo<(outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+ "subc.b\t{$src2, $dst}",
+ [(set GR8:$dst, (sube GR8:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+def SBC16rm : Pseudo<(outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+ "subc.w\t{$src2, $dst}",
+ [(set GR16:$dst, (sube GR16:$src1, (load addr:$src2))),
+ (implicit SRW)]>;
+
+let isTwoAddress = 0 in {
+def SBC8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
+ "subc.b\t{$src, $dst}",
+ [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SRW)]>;
+def SBC16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
+ "subc.w\t{$src, $dst}",
+ [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SRW)]>;
+
+def SBC8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
+ "subc.b\t{$src, $dst}",
+ [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+def SBC16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
+ "subc.w\t{$src, $dst}",
+ [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+
+def SBC8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "subc.b\t{$src, $dst}",
+ [(store (sube (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+def SBC16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "subc.w\t{$src, $dst}",
+ [(store (sube (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+}
+
+} // Uses = [SRW]
+
+// FIXME: Provide proper encoding!
+def SAR8r1 : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+ "rra.b\t$dst",
+ [(set GR8:$dst, (MSP430rra GR8:$src)),
+ (implicit SRW)]>;
+def SAR16r1 : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+ "rra.w\t$dst",
+ [(set GR16:$dst, (MSP430rra GR16:$src)),
+ (implicit SRW)]>;
+
+def SHL8r1 : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+ "rla.b\t$dst",
+ [(set GR8:$dst, (MSP430rla GR8:$src)),
+ (implicit SRW)]>;
+def SHL16r1 : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+ "rla.w\t$dst",
+ [(set GR16:$dst, (MSP430rla GR16:$src)),
+ (implicit SRW)]>;
+
+def SAR8r1c : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+ "clrc\n\t"
+ "rrc.b\t$dst",
+ [(set GR8:$dst, (MSP430rrc GR8:$src)),
+ (implicit SRW)]>;
+def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+ "clrc\n\t"
+ "rrc.w\t$dst",
+ [(set GR16:$dst, (MSP430rrc GR16:$src)),
+ (implicit SRW)]>;
+
+def SEXT16r : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+ "sxt\t$dst",
+ [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
+ (implicit SRW)]>;
+
+} // Defs = [SRW]
+
+def SWPB16r : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+ "swpb\t$dst",
+ [(set GR16:$dst, (bswap GR16:$src))]>;
+
+let isCommutable = 1 in { // X = OR Y, Z == X = OR Z, Y
+def OR8rr : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "bis.b\t{$src2, $dst}",
+ [(set GR8:$dst, (or GR8:$src1, GR8:$src2))]>;
+def OR16rr : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "bis.w\t{$src2, $dst}",
+ [(set GR16:$dst, (or GR16:$src1, GR16:$src2))]>;
+}
+
+def OR8ri : Pseudo<(outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "bis.b\t{$src2, $dst}",
+ [(set GR8:$dst, (or GR8:$src1, imm:$src2))]>;
+def OR16ri : Pseudo<(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "bis.w\t{$src2, $dst}",
+ [(set GR16:$dst, (or GR16:$src1, imm:$src2))]>;
+
+def OR8rm : Pseudo<(outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+ "bis.b\t{$src2, $dst}",
+ [(set GR8:$dst, (or GR8:$src1, (load addr:$src2)))]>;
+def OR16rm : Pseudo<(outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+ "bis.w\t{$src2, $dst}",
+ [(set GR16:$dst, (or GR16:$src1, (load addr:$src2)))]>;
+
+let isTwoAddress = 0 in {
+def OR8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
+ "bis.b\t{$src, $dst}",
+ [(store (or (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit SRW)]>;
+def OR16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
+ "bis.w\t{$src, $dst}",
+ [(store (or (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit SRW)]>;
+
+def OR8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
+ "bis.b\t{$src, $dst}",
+ [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+def OR16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
+ "bis.w\t{$src, $dst}",
+ [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst),
+ (implicit SRW)]>;
+
+def OR8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "bis.b\t{$src, $dst}",
+ [(store (or (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+def OR16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "bis.w\t{$src, $dst}",
+ [(store (or (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (implicit SRW)]>;
+}
+
+} // isTwoAddress = 1
+
+// Integer comparisons
+let Defs = [SRW] in {
+def CMP8rr : Pseudo<(outs), (ins GR8:$src1, GR8:$src2),
+ "cmp.b\t{$src1, $src2}",
+ [(MSP430cmp GR8:$src1, GR8:$src2), (implicit SRW)]>;
+def CMP16rr : Pseudo<(outs), (ins GR16:$src1, GR16:$src2),
+ "cmp.w\t{$src1, $src2}",
+ [(MSP430cmp GR16:$src1, GR16:$src2), (implicit SRW)]>;
+
+def CMP8ir : Pseudo<(outs), (ins i8imm:$src1, GR8:$src2),
+ "cmp.b\t{$src1, $src2}",
+ [(MSP430cmp imm:$src1, GR8:$src2), (implicit SRW)]>;
+def CMP16ir : Pseudo<(outs), (ins i16imm:$src1, GR16:$src2),
+ "cmp.w\t{$src1, $src2}",
+ [(MSP430cmp imm:$src1, GR16:$src2), (implicit SRW)]>;
+
+def CMP8im : Pseudo<(outs), (ins i8imm:$src1, memsrc:$src2),
+ "cmp.b\t{$src1, $src2}",
+ [(MSP430cmp (i8 imm:$src1), (load addr:$src2)), (implicit SRW)]>;
+def CMP16im : Pseudo<(outs), (ins i16imm:$src1, memsrc:$src2),
+ "cmp.w\t{$src1, $src2}",
+ [(MSP430cmp (i16 imm:$src1), (load addr:$src2)), (implicit SRW)]>;
+
+// FIXME: imm is allowed only on src operand, not on dst.
+
+//def CMP8ri : Pseudo<(outs), (ins GR8:$src1, i8imm:$src2),
+// "cmp.b\t{$src1, $src2}",
+// [(MSP430cmp GR8:$src1, imm:$src2), (implicit SRW)]>;
+//def CMP16ri : Pseudo<(outs), (ins GR16:$src1, i16imm:$src2),
+// "cmp.w\t{$src1, $src2}",
+// [(MSP430cmp GR16:$src1, imm:$src2), (implicit SRW)]>;
+
+//def CMP8mi : Pseudo<(outs), (ins memsrc:$src1, i8imm:$src2),
+// "cmp.b\t{$src1, $src2}",
+// [(MSP430cmp (load addr:$src1), (i8 imm:$src2)), (implicit SRW)]>;
+//def CMP16mi : Pseudo<(outs), (ins memsrc:$src1, i16imm:$src2),
+// "cmp.w\t{$src1, $src2}",
+// [(MSP430cmp (load addr:$src1), (i16 imm:$src2)), (implicit SRW)]>;
+
+
+// Imm 0, +1, +2, +4, +8 are encoded via constant generator registers.
+// That's why we can use them as dest operands.
+// We don't define a new operand class for them, since they would need special
+// encoding in the future anyway.
+
+def CMP8ri0 : Pseudo<(outs), (ins GR8:$src1),
+ "cmp.b\t{$src1, #0}",
+ [(MSP430cmp GR8:$src1, 0), (implicit SRW)]>;
+def CMP16ri0: Pseudo<(outs), (ins GR16:$src1),
+ "cmp.w\t{$src1, #0}",
+ [(MSP430cmp GR16:$src1, 0), (implicit SRW)]>;
+def CMP8ri1 : Pseudo<(outs), (ins GR8:$src1),
+ "cmp.b\t{$src1, #1}",
+ [(MSP430cmp GR8:$src1, 1), (implicit SRW)]>;
+def CMP16ri1: Pseudo<(outs), (ins GR16:$src1),
+ "cmp.w\t{$src1, #1}",
+ [(MSP430cmp GR16:$src1, 1), (implicit SRW)]>;
+def CMP8ri2 : Pseudo<(outs), (ins GR8:$src1),
+ "cmp.b\t{$src1, #2}",
+ [(MSP430cmp GR8:$src1, 2), (implicit SRW)]>;
+def CMP16ri2: Pseudo<(outs), (ins GR16:$src1),
+ "cmp.w\t{$src1, #2}",
+ [(MSP430cmp GR16:$src1, 2), (implicit SRW)]>;
+def CMP8ri4 : Pseudo<(outs), (ins GR8:$src1),
+ "cmp.b\t{$src1, #4}",
+ [(MSP430cmp GR8:$src1, 4), (implicit SRW)]>;
+def CMP16ri4: Pseudo<(outs), (ins GR16:$src1),
+ "cmp.w\t{$src1, #4}",
+ [(MSP430cmp GR16:$src1, 4), (implicit SRW)]>;
+def CMP8ri8 : Pseudo<(outs), (ins GR8:$src1),
+ "cmp.b\t{$src1, #8}",
+ [(MSP430cmp GR8:$src1, 8), (implicit SRW)]>;
+def CMP16ri8: Pseudo<(outs), (ins GR16:$src1),
+ "cmp.w\t{$src1, #8}",
+ [(MSP430cmp GR16:$src1, 8), (implicit SRW)]>;
+
+def CMP8rm : Pseudo<(outs), (ins GR8:$src1, memsrc:$src2),
+ "cmp.b\t{$src1, $src2}",
+ [(MSP430cmp GR8:$src1, (load addr:$src2)), (implicit SRW)]>;
+def CMP16rm : Pseudo<(outs), (ins GR16:$src1, memsrc:$src2),
+ "cmp.w\t{$src1, $src2}",
+ [(MSP430cmp GR16:$src1, (load addr:$src2)), (implicit SRW)]>;
+
+def CMP8mr : Pseudo<(outs), (ins memsrc:$src1, GR8:$src2),
+ "cmp.b\t{$src1, $src2}",
+ [(MSP430cmp (load addr:$src1), GR8:$src2), (implicit SRW)]>;
+def CMP16mr : Pseudo<(outs), (ins memsrc:$src1, GR16:$src2),
+ "cmp.w\t{$src1, $src2}",
+ [(MSP430cmp (load addr:$src1), GR16:$src2), (implicit SRW)]>;
+
+def CMP8mi0 : Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.b\t{$src1, #0}",
+ [(MSP430cmp (load addr:$src1), (i8 0)), (implicit SRW)]>;
+def CMP16mi0: Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.w\t{$src1, #0}",
+ [(MSP430cmp (load addr:$src1), (i16 0)), (implicit SRW)]>;
+def CMP8mi1 : Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.b\t{$src1, #1}",
+ [(MSP430cmp (load addr:$src1), (i8 1)), (implicit SRW)]>;
+def CMP16mi1: Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.w\t{$src1, #1}",
+ [(MSP430cmp (load addr:$src1), (i16 1)), (implicit SRW)]>;
+def CMP8mi2 : Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.b\t{$src1, #2}",
+ [(MSP430cmp (load addr:$src1), (i8 2)), (implicit SRW)]>;
+def CMP16mi2: Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.w\t{$src1, #2}",
+ [(MSP430cmp (load addr:$src1), (i16 2)), (implicit SRW)]>;
+def CMP8mi4 : Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.b\t{$src1, #4}",
+ [(MSP430cmp (load addr:$src1), (i8 4)), (implicit SRW)]>;
+def CMP16mi4: Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.w\t{$src1, #4}",
+ [(MSP430cmp (load addr:$src1), (i16 4)), (implicit SRW)]>;
+def CMP8mi8 : Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.b\t{$src1, #8}",
+ [(MSP430cmp (load addr:$src1), (i8 8)), (implicit SRW)]>;
+def CMP16mi8: Pseudo<(outs), (ins memsrc:$src1),
+ "cmp.w\t{$src1, #8}",
+ [(MSP430cmp (load addr:$src1), (i16 8)), (implicit SRW)]>;
+
+} // Defs = [SRW]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+
+// extload
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+
+// anyext
+def : Pat<(i16 (anyext GR8:$src)), (MOVZX16rr8 GR8:$src)>;
+
+// truncs
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, subreg_8bit)>;
+
+// GlobalAddress, ExternalSymbol
+def : Pat<(i16 (MSP430Wrapper tglobaladdr:$dst)), (MOV16ri tglobaladdr:$dst)>;
+def : Pat<(i16 (MSP430Wrapper texternalsym:$dst)), (MOV16ri texternalsym:$dst)>;
+
+def : Pat<(add GR16:$src1, (MSP430Wrapper tglobaladdr :$src2)),
+ (ADD16ri GR16:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR16:$src1, (MSP430Wrapper texternalsym:$src2)),
+ (ADD16ri GR16:$src1, texternalsym:$src2)>;
+
+def : Pat<(store (i16 (MSP430Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV16mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper texternalsym:$src)), addr:$dst),
+ (MOV16mi addr:$dst, texternalsym:$src)>;
+
+// calls
+def : Pat<(MSP430call (i16 tglobaladdr:$dst)),
+ (CALLi tglobaladdr:$dst)>;
+def : Pat<(MSP430call (i16 texternalsym:$dst)),
+ (CALLi texternalsym:$dst)>;
+
+// add and sub always produce carry
+def : Pat<(addc GR16:$src1, GR16:$src2),
+ (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(addc GR16:$src1, (load addr:$src2)),
+ (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(addc GR16:$src1, imm:$src2),
+ (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR16:$src), addr:$dst),
+ (ADD16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (ADD16mm addr:$dst, addr:$src)>;
+
+def : Pat<(addc GR8:$src1, GR8:$src2),
+ (ADD8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(addc GR8:$src1, (load addr:$src2)),
+ (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(addc GR8:$src1, imm:$src2),
+ (ADD8ri GR8:$src1, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR8:$src), addr:$dst),
+ (ADD8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (ADD8mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR16:$src1, GR16:$src2),
+ (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(subc GR16:$src1, (load addr:$src2)),
+ (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(subc GR16:$src1, imm:$src2),
+ (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR16:$src), addr:$dst),
+ (SUB16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+ (SUB16mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR8:$src1, GR8:$src2),
+ (SUB8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(subc GR8:$src1, (load addr:$src2)),
+ (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(subc GR8:$src1, imm:$src2),
+ (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR8:$src), addr:$dst),
+ (SUB8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+ (SUB8mm addr:$dst, addr:$src)>;
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
new file mode 100644
index 0000000..b94d7e4
--- /dev/null
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -0,0 +1,39 @@
+//===- MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares MSP430-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430MACHINEFUNCTIONINFO_H
+#define MSP430MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// MSP430MachineFunctionInfo - This class is derived from MachineFunctionInfo
+/// and contains private MSP430 target-specific information for each
+/// MachineFunction.
+class MSP430MachineFunctionInfo : public MachineFunctionInfo {
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+public:
+ MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {}
+
+ MSP430MachineFunctionInfo(MachineFunction &MF) : CalleeSavedFrameSize(0) {}
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
new file mode 100644
index 0000000..ef6f997
--- /dev/null
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -0,0 +1,355 @@
+//===- MSP430RegisterInfo.cpp - MSP430 Register Information ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-reg-info"
+
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430RegisterInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+
+using namespace llvm;
+
+// FIXME: Provide proper call frame setup / destroy opcodes.
+MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm,
+ const TargetInstrInfo &tii)
+ : MSP430GenRegisterInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+ TM(tm), TII(tii) {
+ StackAlign = TM.getFrameInfo()->getStackAlignment();
+}
+
+const unsigned*
+MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const unsigned CalleeSavedRegs[] = {
+ MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W,
+ MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+ 0
+ };
+
+ return CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+MSP430RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+ &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+ &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+ &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+ 0
+ };
+
+ return CalleeSavedRegClasses;
+}
+
+BitVector
+MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+
+ // Mark 4 special registers as reserved.
+ Reserved.set(MSP430::PCW);
+ Reserved.set(MSP430::SPW);
+ Reserved.set(MSP430::SRW);
+ Reserved.set(MSP430::CGW);
+
+ // Mark frame pointer as reserved if needed.
+ if (hasFP(MF))
+ Reserved.set(MSP430::FPW);
+
+ return Reserved;
+}
+
+const TargetRegisterClass* MSP430RegisterInfo::getPointerRegClass() const {
+ return &MSP430::GR16RegClass;
+}
+
+
+bool MSP430RegisterInfo::hasFP(const MachineFunction &MF) const {
+ return NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+bool MSP430RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void MSP430RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackdown instruction into a 'sub SPW, <amt>' and the
+    // adjcallstackup instruction into 'add SPW, <amt>'
+ // TODO: consider using push / pop instead of sub + store / add
+ MachineInstr *Old = I;
+ uint64_t Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
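+      // For example, with StackAlign == 2 this rounds Amount == 5 up to
+      // ((5 + 2 - 1) / 2) * 2 == 6, the next 2-byte boundary.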
+
+ MachineInstr *New = 0;
+ if (Old->getOpcode() == getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, Old->getDebugLoc(),
+ TII.get(MSP430::SUB16ri), MSP430::SPW)
+ .addReg(MSP430::SPW).addImm(Amount);
+ } else {
+ assert(Old->getOpcode() == getCallFrameDestroyOpcode());
+        // Factor out the amount the callee already popped.
+ uint64_t CalleeAmt = Old->getOperand(1).getImm();
+ Amount -= CalleeAmt;
+ if (Amount)
+ New = BuildMI(MF, Old->getDebugLoc(),
+ TII.get(MSP430::ADD16ri), MSP430::SPW)
+ .addReg(MSP430::SPW).addImm(Amount);
+ }
+
+ if (New) {
+ // The SRW implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // Replace the pseudo instruction with a new instruction...
+ MBB.insert(I, New);
+ }
+ }
+ } else if (I->getOpcode() == getCallFrameDestroyOpcode()) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back.
+ if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ MachineInstr *Old = I;
+ MachineInstr *New =
+ BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri),
+ MSP430::SPW).addReg(MSP430::SPW).addImm(CalleeAmt);
+ // The SRW implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ MBB.insert(I, New);
+ }
+ }
+
+ MBB.erase(I);
+}
+
+void
+MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+
+ unsigned BasePtr = (hasFP(MF) ? MSP430::FPW : MSP430::SPW);
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+ // Skip the saved PC
+ Offset += 2;
+
+ if (!hasFP(MF))
+ Offset += MF.getFrameInfo()->getStackSize();
+ else
+ Offset += 2; // Skip the saved FPW
+
+ // Fold imm into offset
+ Offset += MI.getOperand(i+1).getImm();
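+  // Illustrative arithmetic: in a frameless function with StackSize == 8, a
+  // slot at object offset -2 and a zero immediate give Offset == -2+2+8 == 8.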
+
+ if (MI.getOpcode() == MSP430::ADD16ri) {
+    // This ADD16ri is actually a "load effective address" of the stack slot.
+    // Since we only have two-address instructions, we need to expand it into
+    // a mov + add sequence.
+
+ MI.setDesc(TII.get(MSP430::MOV16rr));
+ MI.getOperand(i).ChangeToRegister(BasePtr, false);
+
+ if (Offset == 0)
+ return;
+
+ // We need to materialize the offset via add instruction.
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (Offset < 0)
+ BuildMI(MBB, next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
+ .addReg(DstReg).addImm(-Offset);
+ else
+ BuildMI(MBB, next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
+ .addReg(DstReg).addImm(Offset);
+
+ return;
+ }
+
+ MI.getOperand(i).ChangeToRegister(BasePtr, false);
+ MI.getOperand(i+1).ChangeToImmediate(Offset);
+}
+
+void
+MSP430RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
+ const {
+ // Create a frame entry for the FPW register that must be saved.
+ if (hasFP(MF)) {
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4);
+ assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+ "Slot for FPW register must be last in order to be found!");
+ FrameIdx = 0;
+ }
+}
+
+
+void MSP430RegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+ DebugLoc::getUnknownLoc());
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI->getStackSize();
+
+ uint64_t NumBytes = 0;
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment
+ uint64_t FrameSize = StackSize - 2;
+ NumBytes = FrameSize - MSP430FI->getCalleeSavedFrameSize();
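+    // e.g. StackSize == 10 with a 4-byte callee-saved area gives
+    // FrameSize == 8 and NumBytes == 4.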
+
+    // Get the offset of the stack slot for the FPW register, which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ MFI->setOffsetAdjustment(-NumBytes);
+
+ // Save FPW into the appropriate stack slot...
+ BuildMI(MBB, MBBI, DL, TII.get(MSP430::PUSH16r))
+ .addReg(MSP430::FPW, RegState::Kill);
+
+ // Update FPW with the new base value...
+ BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::FPW)
+ .addReg(MSP430::SPW);
+
+ // Mark the FramePtr as live-in in every block except the entry.
+ for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+ I != E; ++I)
+ I->addLiveIn(MSP430::FPW);
+
+ } else
+ NumBytes = StackSize - MSP430FI->getCalleeSavedFrameSize();
+
+ // Skip the callee-saved push instructions.
+ while (MBBI != MBB.end() && (MBBI->getOpcode() == MSP430::PUSH16r))
+ ++MBBI;
+
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ if (NumBytes) { // adjust stack pointer: SPW -= numbytes
+ // If there is an SUB16ri of SPW immediately before this instruction, merge
+ // the two.
+ //NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+ // If there is an ADD16ri or SUB16ri of SPW immediately after this
+ // instruction, merge the two instructions.
+ // mergeSPUpdatesDown(MBB, MBBI, &NumBytes);
+
+ if (NumBytes) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL, TII.get(MSP430::SUB16ri), MSP430::SPW)
+ .addReg(MSP430::SPW).addImm(NumBytes);
+ // The SRW implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+ }
+}
+
+void MSP430RegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ unsigned RetOpcode = MBBI->getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ switch (RetOpcode) {
+ case MSP430::RET: break; // These are ok
+ default:
+ assert(0 && "Can only insert epilog into returning blocks");
+ }
+
+ // Get the number of bytes to allocate from the FrameInfo
+ uint64_t StackSize = MFI->getStackSize();
+ unsigned CSSize = MSP430FI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = 0;
+
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment
+ uint64_t FrameSize = StackSize - 2;
+ NumBytes = FrameSize - CSSize;
+
+ // pop FPW.
+ BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::FPW);
+ } else
+ NumBytes = StackSize - CSSize;
+
+ // Skip the callee-saved pop instructions.
+ MachineBasicBlock::iterator LastCSPop = MBBI;
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if (Opc != MSP430::POP16r && !PI->getDesc().isTerminator())
+ break;
+ --MBBI;
+ }
+
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD16ri or SUB16ri of SPW immediately before this
+ // instruction, merge the two instructions.
+ //if (NumBytes || MFI->hasVarSizedObjects())
+ // mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+ if (MFI->hasVarSizedObjects()) {
+ assert(0 && "Not implemented yet!");
+ } else {
+ // adjust stack pointer back: SPW += numbytes
+ if (NumBytes) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL, TII.get(MSP430::ADD16ri), MSP430::SPW)
+ .addReg(MSP430::SPW).addImm(NumBytes);
+ // The SRW implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+ }
+}
+
+unsigned MSP430RegisterInfo::getRARegister() const {
+ return MSP430::PCW;
+}
+
+unsigned MSP430RegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ return hasFP(MF) ? MSP430::FPW : MSP430::SPW;
+}
+
+int MSP430RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ assert(0 && "Not implemented yet!");
+}
+
+#include "MSP430GenRegisterInfo.inc"
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
new file mode 100644
index 0000000..a210e36
--- /dev/null
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -0,0 +1,70 @@
+//===- MSP430RegisterInfo.h - MSP430 Register Information Impl --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430REGISTERINFO_H
+#define LLVM_TARGET_MSP430REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "MSP430GenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class MSP430TargetMachine;
+
+struct MSP430RegisterInfo : public MSP430GenRegisterInfo {
+private:
+ MSP430TargetMachine &TM;
+ const TargetInstrInfo &TII;
+
+ /// StackAlign - Default stack alignment.
+ ///
+ unsigned StackAlign;
+public:
+ MSP430RegisterInfo(MSP430TargetMachine &tm, const TargetInstrInfo &tii);
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+ const TargetRegisterClass* getPointerRegClass() const;
+
+ bool hasFP(const MachineFunction &MF) const;
+ bool hasReservedCallFrame(MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ //! Get DWARF debugging register number
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_MSP430REGISTERINFO_H
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
new file mode 100644
index 0000000..4078626
--- /dev/null
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -0,0 +1,122 @@
+//===- MSP430RegisterInfo.td - MSP430 Register defs ----------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the MSP430 register file
+//===----------------------------------------------------------------------===//
+
+class MSP430Reg<bits<4> num, string n> : Register<n> {
+ field bits<4> Num = num;
+ let Namespace = "MSP430";
+}
+
+class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs>
+ : RegisterWithSubRegs<n, subregs> {
+ field bits<4> Num = num;
+ let Namespace = "MSP430";
+}
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+def PCB : MSP430Reg<0, "r0">;
+def SPB : MSP430Reg<1, "r1">;
+def SRB : MSP430Reg<2, "r2">;
+def CGB : MSP430Reg<3, "r3">;
+def FPB : MSP430Reg<4, "r4">;
+def R5B : MSP430Reg<5, "r5">;
+def R6B : MSP430Reg<6, "r6">;
+def R7B : MSP430Reg<7, "r7">;
+def R8B : MSP430Reg<8, "r8">;
+def R9B : MSP430Reg<9, "r9">;
+def R10B : MSP430Reg<10, "r10">;
+def R11B : MSP430Reg<11, "r11">;
+def R12B : MSP430Reg<12, "r12">;
+def R13B : MSP430Reg<13, "r13">;
+def R14B : MSP430Reg<14, "r14">;
+def R15B : MSP430Reg<15, "r15">;
+
+def PCW : MSP430RegWithSubregs<0, "r0", [PCB]>;
+def SPW : MSP430RegWithSubregs<1, "r1", [SPB]>;
+def SRW : MSP430RegWithSubregs<2, "r2", [SRB]>;
+def CGW : MSP430RegWithSubregs<3, "r3", [CGB]>;
+def FPW : MSP430RegWithSubregs<4, "r4", [FPB]>;
+def R5W : MSP430RegWithSubregs<5, "r5", [R5B]>;
+def R6W : MSP430RegWithSubregs<6, "r6", [R6B]>;
+def R7W : MSP430RegWithSubregs<7, "r7", [R7B]>;
+def R8W : MSP430RegWithSubregs<8, "r8", [R8B]>;
+def R9W : MSP430RegWithSubregs<9, "r9", [R9B]>;
+def R10W : MSP430RegWithSubregs<10, "r10", [R10B]>;
+def R11W : MSP430RegWithSubregs<11, "r11", [R11B]>;
+def R12W : MSP430RegWithSubregs<12, "r12", [R12B]>;
+def R13W : MSP430RegWithSubregs<13, "r13", [R13B]>;
+def R14W : MSP430RegWithSubregs<14, "r14", [R14B]>;
+def R15W : MSP430RegWithSubregs<15, "r15", [R15B]>;
+
+def : SubRegSet<1, [PCW, SPW, SRW, CGW, FPW,
+ R5W, R6W, R7W, R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W],
+ [PCB, SPB, SRB, CGB, FPB,
+ R5B, R6B, R7B, R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def subreg_8bit : PatLeaf<(i32 1)>;
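+// subreg_8bit names subregister index 1 (defined by the SubRegSet above), so
+// EXTRACT_SUBREG patterns can select the low 8-bit half of a 16-bit register.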
+
+def GR8 : RegisterClass<"MSP430", [i8], 8,
+ // Volatile registers
+ [R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
+ // Frame pointer, sometimes allocable
+ FPB,
+ // Volatile, but not allocable
+ PCB, SPB, SRB, CGB]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR8Class::iterator
+ GR8Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      // Depending on whether the function uses a frame pointer or not, the
+      // last 5 or 4 registers on the list above are reserved.
+ if (RI->hasFP(MF))
+ return end()-5;
+ else
+ return end()-4;
+ }
+ }];
+}
+
+def GR16 : RegisterClass<"MSP430", [i16], 16,
+ // Volatile registers
+ [R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W,
+ // Frame pointer, sometimes allocable
+ FPW,
+ // Volatile, but not allocable
+ PCW, SPW, SRW, CGW]>
+{
+ let SubRegClassList = [GR8];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR16Class::iterator
+ GR16Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      // Depending on whether the function uses a frame pointer or not, the
+      // last 5 or 4 registers on the list above are reserved.
+ if (RI->hasFP(MF))
+ return end()-5;
+ else
+ return end()-4;
+ }
+ }];
+}
+
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
new file mode 100644
index 0000000..ef9e103
--- /dev/null
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -0,0 +1,27 @@
+//===- MSP430Subtarget.cpp - MSP430 Subtarget Information ---------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430Subtarget.h"
+#include "MSP430.h"
+#include "MSP430GenSubtarget.inc"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+MSP430Subtarget::MSP430Subtarget(const TargetMachine &TM, const Module &M,
+ const std::string &FS) {
+ std::string CPU = "generic";
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
new file mode 100644
index 0000000..96c8108
--- /dev/null
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -0,0 +1,41 @@
+//====-- MSP430Subtarget.h - Define Subtarget for the MSP430 ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_SUBTARGET_H
+#define LLVM_TARGET_MSP430_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+class TargetMachine;
+
+class MSP430Subtarget : public TargetSubtarget {
+ bool ExtendedInsts;
+public:
+  /// This constructor initializes the data members to match those
+  /// of the specified module.
+ ///
+ MSP430Subtarget(const TargetMachine &TM, const Module &M,
+ const std::string &FS);
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+};
+} // End llvm namespace
+
+#endif // LLVM_TARGET_MSP430_SUBTARGET_H
diff --git a/lib/Target/MSP430/MSP430TargetAsmInfo.cpp b/lib/Target/MSP430/MSP430TargetAsmInfo.cpp
new file mode 100644
index 0000000..ab181de
--- /dev/null
+++ b/lib/Target/MSP430/MSP430TargetAsmInfo.cpp
@@ -0,0 +1,22 @@
+//===-- MSP430TargetAsmInfo.cpp - MSP430 asm properties -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MSP430TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430TargetAsmInfo.h"
+#include "MSP430TargetMachine.h"
+
+using namespace llvm;
+
+MSP430TargetAsmInfo::MSP430TargetAsmInfo(const MSP430TargetMachine &TM)
+ : ELFTargetAsmInfo(TM) {
+ AlignmentIsInBytes = false;
+}
diff --git a/lib/Target/MSP430/MSP430TargetAsmInfo.h b/lib/Target/MSP430/MSP430TargetAsmInfo.h
new file mode 100644
index 0000000..b58d5c9
--- /dev/null
+++ b/lib/Target/MSP430/MSP430TargetAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- MSP430TargetAsmInfo.h - MSP430 asm properties -------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MSP430TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430TARGETASMINFO_H
+#define MSP430TARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+
+namespace llvm {
+
+ // Forward declaration.
+ class MSP430TargetMachine;
+
+ struct MSP430TargetAsmInfo : public ELFTargetAsmInfo {
+ explicit MSP430TargetAsmInfo(const MSP430TargetMachine &TM);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
new file mode 100644
index 0000000..7886946
--- /dev/null
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -0,0 +1,76 @@
+//===-- MSP430TargetMachine.cpp - Define TargetMachine for MSP430 ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430TargetAsmInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+
+using namespace llvm;
+
+/// MSP430TargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int MSP430TargetMachineModule;
+int MSP430TargetMachineModule = 0;
+
+
+// Register the targets
+static RegisterTarget<MSP430TargetMachine>
+X("msp430", "MSP430 [experimental]");
+
+MSP430TargetMachine::MSP430TargetMachine(const Module &M,
+ const std::string &FS) :
+ Subtarget(*this, M, FS),
+ // FIXME: Check TargetData string.
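+  // (Reading the string: "e" = little-endian; "p:16:8:8" = 16-bit pointers
+  // with 8-bit ABI and preferred alignment; the "iN:8:8" entries give the
+  // same alignments for the integer types.)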
+ DataLayout("e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8"),
+ InstrInfo(*this), TLInfo(*this),
+ FrameInfo(TargetFrameInfo::StackGrowsDown, 2, -2) { }
+
+const TargetAsmInfo *MSP430TargetMachine::createTargetAsmInfo() const {
+ return new MSP430TargetAsmInfo(*this);
+}
+
+bool MSP430TargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Install an instruction selector.
+ PM.add(createMSP430ISelDag(*this, OptLevel));
+ return false;
+}
+
+bool MSP430TargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ // Output assembly language.
+ PM.add(createMSP430CodePrinterPass(Out, *this, OptLevel, Verbose));
+ return false;
+}
+
+unsigned MSP430TargetMachine::getModuleMatchQuality(const Module &M) {
+ std::string TT = M.getTargetTriple();
+
+ // We strongly match msp430
+ if (TT.size() >= 6 && TT[0] == 'm' && TT[1] == 's' && TT[2] == 'p' &&
+ TT[3] == '4' && TT[4] == '3' && TT[5] == '0')
+ return 20;
+
+ return 0;
+}
+
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
new file mode 100644
index 0000000..d9ffa2b
--- /dev/null
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -0,0 +1,68 @@
+//==-- MSP430TargetMachine.h - Define TargetMachine for MSP430 ---*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_TARGET_MSP430_TARGETMACHINE_H
+#define LLVM_TARGET_MSP430_TARGETMACHINE_H
+
+#include "MSP430InstrInfo.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430RegisterInfo.h"
+#include "MSP430Subtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// MSP430TargetMachine
+///
+class MSP430TargetMachine : public LLVMTargetMachine {
+ MSP430Subtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ MSP430InstrInfo InstrInfo;
+ MSP430TargetLowering TLInfo;
+
+  // MSP430 does not have any call stack frame, so there is no need for an
+  // MSP430-specific FrameInfo subclass.
+ TargetFrameInfo FrameInfo;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+ MSP430TargetMachine(const Module &M, const std::string &FS);
+
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetData *getTargetData() const { return &DataLayout;}
+ virtual const MSP430Subtarget *getSubtargetImpl() const { return &Subtarget; }
+
+ virtual const TargetRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ virtual MSP430TargetLowering *getTargetLowering() const {
+ return const_cast<MSP430TargetLowering*>(&TLInfo);
+ }
+
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel, bool Verbose,
+ raw_ostream &Out);
+ static unsigned getModuleMatchQuality(const Module &M);
+}; // MSP430TargetMachine.
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_MSP430_TARGETMACHINE_H
diff --git a/lib/Target/MSP430/Makefile b/lib/Target/MSP430/Makefile
new file mode 100644
index 0000000..45cb3aa
--- /dev/null
+++ b/lib/Target/MSP430/Makefile
@@ -0,0 +1,21 @@
+##===- lib/Target/MSP430/Makefile --------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMMSP430
+TARGET = MSP430
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = MSP430GenRegisterInfo.h.inc MSP430GenRegisterNames.inc \
+ MSP430GenRegisterInfo.inc MSP430GenInstrNames.inc \
+ MSP430GenInstrInfo.inc MSP430GenAsmWriter.inc \
+ MSP430GenDAGISel.inc MSP430GenCallingConv.inc \
+ MSP430GenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/MSP430/README.txt b/lib/Target/MSP430/README.txt
new file mode 100644
index 0000000..b14e93d
--- /dev/null
+++ b/lib/Target/MSP430/README.txt
@@ -0,0 +1,42 @@
+//===---------------------------------------------------------------------===//
+// MSP430 backend.
+//===---------------------------------------------------------------------===//
+
+DISCLAIMER: This backend should be considered highly experimental. I have
+never seen nor worked with this MCU; all information was gathered from the
+datasheet only. The original intention behind this backend was to write
+documentation of the form "How to write a backend for dummies" :) These notes
+will hopefully be available pretty soon.
+
+Some things are incomplete / not implemented yet (and this list itself is
+surely incomplete):
+
+0. Implement asm printing for variables :)
+
+1. Verify how implicit zext with 8-bit operands is handled (this might
+currently be modelled in an improper way - do we need to mark the superreg as
+a def for every 8-bit instruction?).
+
+2. Libcalls: multiplication, division, remainder. Note that the calling
+convention for libcalls is incompatible with the libcall calling convention of
+msp430-gcc (which cannot be used anyway due to license restrictions).
+
+3. Implement multiplication / division by constant (dag combiner hook?).
+
+4. Implement non-constant shifts.
+
+5. Implement varargs stuff.
+
+6. Verify and fix (if needed) how i32 / i64 values are handled.
+
+7. Implement floating point support (softfp?).
+
+8. Implement instruction encoding for (possible) direct code emission in the
+future.
+
+9. Since almost all instructions set flags, implement brcond / select in a
+better way (currently they emit an explicit comparison).
+
+10. Handle imm in comparisons in a better way (see comment in MSP430InstrInfo.td).
+
+11. Implement hooks for better memory op folding, etc.
diff --git a/lib/Target/Makefile b/lib/Target/Makefile
new file mode 100644
index 0000000..50a360f
--- /dev/null
+++ b/lib/Target/Makefile
@@ -0,0 +1,20 @@
+#===- lib/Target/Makefile ----------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMTarget
+BUILD_ARCHIVE = 1
+
+# We include this early so we can access the value of TARGETS_TO_BUILD as the
+# value for PARALLEL_DIRS which must be set before Makefile.rules is included
+include $(LEVEL)/Makefile.config
+
+PARALLEL_DIRS := $(TARGETS_TO_BUILD)
+
+include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/lib/Target/Mips/AsmPrinter/CMakeLists.txt b/lib/Target/Mips/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..6a868c2
--- /dev/null
+++ b/lib/Target/Mips/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,12 @@
+include_directories(
+ ${CMAKE_CURRENT_BINARY_DIR}/..
+ ${CMAKE_CURRENT_SOURCE_DIR}/..
+ )
+
+add_partially_linked_object(LLVMMipsAsmPrinter
+ MipsAsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMMipsCodeGen n)
+
+add_dependencies(LLVMMipsAsmPrinter ${n})
diff --git a/lib/Target/Mips/AsmPrinter/Makefile b/lib/Target/Mips/AsmPrinter/Makefile
new file mode 100644
index 0000000..a2fecf4
--- /dev/null
+++ b/lib/Target/Mips/AsmPrinter/Makefile
@@ -0,0 +1,17 @@
+##===- lib/Target/Mips/AsmPrinter/Makefile -----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMipsAsmPrinter
+
+# Hack: we need to include 'main' Mips target directory to grab
+# private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
new file mode 100644
index 0000000..dfb6238
--- /dev/null
+++ b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
@@ -0,0 +1,580 @@
+//===-- MipsAsmPrinter.cpp - Mips LLVM assembly writer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format MIPS assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-asm-printer"
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "MipsMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class VISIBILITY_HIDDEN MipsAsmPrinter : public AsmPrinter {
+ const MipsSubtarget *Subtarget;
+ public:
+ explicit MipsAsmPrinter(raw_ostream &O, MipsTargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V) {
+ Subtarget = &TM.getSubtarget<MipsSubtarget>();
+ }
+
+ virtual const char *getPassName() const {
+ return "Mips Assembly Printer";
+ }
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ void printOperand(const MachineInstr *MI, int opNum);
+ void printUnsignedImm(const MachineInstr *MI, int opNum);
+ void printMemOperand(const MachineInstr *MI, int opNum,
+ const char *Modifier = 0);
+ void printFCCOperand(const MachineInstr *MI, int opNum,
+ const char *Modifier = 0);
+ void printModuleLevelGV(const GlobalVariable* GVar);
+ void printSavedRegsBitmask(MachineFunction &MF);
+ void printHex32(unsigned int Value);
+
+ const char *emitCurrentABIString(void);
+ void emitFunctionStart(MachineFunction &MF);
+ void emitFunctionEnd(MachineFunction &MF);
+ void emitFrameDirective(MachineFunction &MF);
+
+ bool printInstruction(const MachineInstr *MI); // autogenerated.
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+ };
+} // end of anonymous namespace
+
+#include "MipsGenAsmWriter.inc"
+
+/// createMipsCodePrinterPass - Returns a pass that prints the MIPS
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+FunctionPass *llvm::createMipsCodePrinterPass(raw_ostream &o,
+ MipsTargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new MipsAsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Mips Asm Directives
+//
+// -- Frame directive "frame Stackpointer, Stacksize, RARegister"
+// Describe the stack frame.
+//
+// -- Mask directives "(f)mask bitmask, offset"
+// Tells the assembler which registers are saved and where.
+// bitmask - contains a little endian bitset indicating which registers are
+// saved in the function prologue (e.g. with a 0x80000000 mask, the
+// assembler knows that register 31 (RA) is saved at prologue).
+// offset - the position before the stack pointer subtraction indicating
+// where the first register saved in the prologue is located
+// (e.g. the -8 offset in the example below).
+//
+// Consider the following function prologue:
+//
+// .frame $fp,48,$ra
+// .mask 0xc0000000,-8
+// addiu $sp, $sp, -48
+// sw $ra, 40($sp)
+// sw $fp, 36($sp)
+//
+// With a 0xc0000000 mask, the assembler knows that registers 31 (RA) and
+// 30 (FP) are saved at prologue. As the save order on prologue is from
+// left to right, RA is saved first. A -8 offset means that after the
+// stack pointer subtraction, the first register in the mask (RA) will be
+// saved at address 48-8=40.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mask directives
+//===----------------------------------------------------------------------===//
+
+// Create a bitmask with all callee saved registers for CPU or Floating Point
+// registers. For CPU registers consider RA, GP and FP for saving if necessary.
+void MipsAsmPrinter::
+printSavedRegsBitmask(MachineFunction &MF)
+{
+ const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ // CPU and FPU Saved Registers Bitmasks
+ unsigned int CPUBitmask = 0;
+ unsigned int FPUBitmask = 0;
+
+ // Set the CPU and FPU Bitmasks
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(CSI[i].getReg());
+ if (CSI[i].getRegClass() == Mips::CPURegsRegisterClass)
+ CPUBitmask |= (1 << RegNum);
+ else
+ FPUBitmask |= (1 << RegNum);
+ }
+
+ // Return Address and Frame registers must also be set in CPUBitmask.
+ if (RI.hasFP(MF))
+ CPUBitmask |= (1 << MipsRegisterInfo::
+ getRegisterNumbering(RI.getFrameRegister(MF)));
+
+ if (MF.getFrameInfo()->hasCalls())
+ CPUBitmask |= (1 << MipsRegisterInfo::
+ getRegisterNumbering(RI.getRARegister()));
+
+ // Print CPUBitmask
+ O << "\t.mask \t"; printHex32(CPUBitmask); O << ','
+ << MipsFI->getCPUTopSavedRegOff() << '\n';
+
+ // Print FPUBitmask
+ O << "\t.fmask\t"; printHex32(FPUBitmask); O << ","
+ << MipsFI->getFPUTopSavedRegOff() << '\n';
+}
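+
+// For illustration: a function that saves only $ra and $fp (registers 31
+// and 30) and no FPU registers would get, assuming a -8 CPU save offset:
+//   .mask  0xC0000000,-8
+//   .fmask 0x00000000,0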
+
+// Print a 32 bit hex number, always emitting all eight digits.
+void MipsAsmPrinter::
+printHex32(unsigned int Value)
+{
+ O << "0x";
+ for (int i = 7; i >= 0; i--)
+ O << utohexstr( (Value & (0xF << (i*4))) >> (i*4) );
+}
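+
+// For illustration: printHex32(0x80) emits "0x00000080"; the nibble loop
+// above keeps the leading zeros that a plain utohexstr(Value) would drop.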
+
+//===----------------------------------------------------------------------===//
+// Frame and Set directives
+//===----------------------------------------------------------------------===//
+
+/// Frame Directive
+void MipsAsmPrinter::
+emitFrameDirective(MachineFunction &MF)
+{
+ const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+
+ unsigned stackReg = RI.getFrameRegister(MF);
+ unsigned returnReg = RI.getRARegister();
+ unsigned stackSize = MF.getFrameInfo()->getStackSize();
+
+
+ O << "\t.frame\t" << '$' << LowercaseString(RI.get(stackReg).AsmName)
+ << ',' << stackSize << ','
+ << '$' << LowercaseString(RI.get(returnReg).AsmName)
+ << '\n';
+}
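+
+// For illustration: with a 48-byte frame, $fp as the frame register and $ra
+// as the return register, this emits:
+//   .frame  $fp,48,$ra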
+
+/// Return the ABI string used in the .mdebug section directive.
+const char * MipsAsmPrinter::
+emitCurrentABIString(void)
+{
+ switch(Subtarget->getTargetABI()) {
+ case MipsSubtarget::O32: return "abi32";
+ case MipsSubtarget::O64: return "abiO64";
+ case MipsSubtarget::N32: return "abiN32";
+ case MipsSubtarget::N64: return "abi64";
+ case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
+ default: break;
+ }
+
+ assert(0 && "Unknown Mips ABI");
+ return NULL;
+}
+
+/// Emit the directives used by GAS at the start of functions
+void MipsAsmPrinter::
+emitFunctionStart(MachineFunction &MF)
+{
+ // Print out the label for the function.
+ const Function *F = MF.getFunction();
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ // 2 bits aligned
+ EmitAlignment(2, F);
+
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ O << "\t.ent\t" << CurrentFnName << '\n';
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ if ((TAI->hasDotTypeDotSizeDirective()) && Subtarget->isLinux())
+ O << "\t.type\t" << CurrentFnName << ", @function\n";
+
+ O << CurrentFnName << ":\n";
+
+ emitFrameDirective(MF);
+ printSavedRegsBitmask(MF);
+
+ O << '\n';
+}
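+
+// For illustration: for a hypothetical function "main" on Linux, the
+// prologue directives come out roughly as:
+//   .globl  main
+//   .ent    main
+//   .type   main, @function
+// main:
+//   .frame  $sp,24,$ra   (plus the .mask/.fmask pair from above)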
+
+/// Emit the directives used by GAS at the end of functions
+void MipsAsmPrinter::
+emitFunctionEnd(MachineFunction &MF)
+{
+  // There are instructions for these macros, but they must
+  // always be at the function end, and we can't emit them
+  // earlier without breaking the BB logic.
+ O << "\t.set\tmacro\n";
+ O << "\t.set\treorder\n";
+
+ O << "\t.end\t" << CurrentFnName << '\n';
+ if (TAI->hasDotTypeDotSizeDirective() && !Subtarget->isLinux())
+ O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
+}
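+
+// For illustration: the matching epilogue for "main" (when .size is
+// supported and the target is not Linux) would be:
+//   .set    macro
+//   .set    reorder
+//   .end    main
+//   .size   main, .-main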
+
+/// runOnMachineFunction - This uses the printInstruction()
+/// method to print assembly for each instruction.
+bool MipsAsmPrinter::
+runOnMachineFunction(MachineFunction &MF)
+{
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out jump tables referenced by the function
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ O << "\n\n";
+
+ // Emit the function start directives
+ emitFunctionStart(MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printInstruction(II);
+ ++EmittedInsts;
+ }
+
+ // Each Basic Block is separated by a newline
+ O << '\n';
+ }
+
+ // Emit function end directives
+ emitFunctionEnd(MF);
+
+ // We didn't modify anything.
+ return false;
+}
+
+// Print out an operand for an inline asm expression.
+bool MipsAsmPrinter::
+PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode)
+{
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+void MipsAsmPrinter::
+printOperand(const MachineInstr *MI, int opNum)
+{
+ const MachineOperand &MO = MI->getOperand(opNum);
+ const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+ bool closeP = false;
+ bool isPIC = (TM.getRelocationModel() == Reloc::PIC_);
+ bool isCodeLarge = (TM.getCodeModel() == CodeModel::Large);
+
+  // %hi and %lo are used by mips gas to load global addresses in
+  // static code. %got is used to load global addresses when
+  // using PIC_. %call16 is used to load direct call targets
+  // with PIC_ and small code size. %call_lo and %call_hi load
+  // direct call targets with PIC_ and large code size.
+ if (MI->getOpcode() == Mips::LUi && !MO.isReg() && !MO.isImm()) {
+ if ((isPIC) && (isCodeLarge))
+ O << "%call_hi(";
+ else
+ O << "%hi(";
+ closeP = true;
+ } else if ((MI->getOpcode() == Mips::ADDiu) && !MO.isReg() && !MO.isImm()) {
+ const MachineOperand &firstMO = MI->getOperand(opNum-1);
+ if (firstMO.getReg() == Mips::GP)
+ O << "%gp_rel(";
+ else
+ O << "%lo(";
+ closeP = true;
+ } else if ((isPIC) && (MI->getOpcode() == Mips::LW) &&
+ (!MO.isReg()) && (!MO.isImm())) {
+ const MachineOperand &firstMO = MI->getOperand(opNum-1);
+ const MachineOperand &lastMO = MI->getOperand(opNum+1);
+ if ((firstMO.isReg()) && (lastMO.isReg())) {
+ if ((firstMO.getReg() == Mips::T9) && (lastMO.getReg() == Mips::GP)
+ && (!isCodeLarge))
+ O << "%call16(";
+ else if ((firstMO.getReg() != Mips::T9) && (lastMO.getReg() == Mips::GP))
+ O << "%got(";
+ else if ((firstMO.getReg() == Mips::T9) && (lastMO.getReg() != Mips::GP)
+ && (isCodeLarge))
+ O << "%call_lo(";
+ closeP = true;
+ }
+ }
+
+ switch (MO.getType())
+ {
+ case MachineOperand::MO_Register:
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ O << '$' << LowercaseString (RI.get(MO.getReg()).AsmName);
+ else
+ O << '$' << MO.getReg();
+ break;
+
+ case MachineOperand::MO_Immediate:
+ O << (short int)MO.getImm();
+ break;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+
+ case MachineOperand::MO_GlobalAddress:
+ {
+ const GlobalValue *GV = MO.getGlobal();
+ O << Mang->getValueName(GV);
+ }
+ break;
+
+ case MachineOperand::MO_ExternalSymbol:
+ O << MO.getSymbolName();
+ break;
+
+ case MachineOperand::MO_JumpTableIndex:
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ break;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << TAI->getPrivateGlobalPrefix() << "CPI"
+ << getFunctionNumber() << "_" << MO.getIndex();
+ break;
+
+ default:
+ O << "<unknown operand type>"; abort (); break;
+ }
+
+ if (closeP) O << ")";
+}
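+
+// For illustration: in static code, a global "foo" materialized via
+// LUi/ADDiu is printed with the relocation operators described above:
+//   lui   $2, %hi(foo)
+//   addiu $2, $2, %lo(foo)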
+
+void MipsAsmPrinter::
+printUnsignedImm(const MachineInstr *MI, int opNum)
+{
+ const MachineOperand &MO = MI->getOperand(opNum);
+ if (MO.getType() == MachineOperand::MO_Immediate)
+ O << (unsigned short int)MO.getImm();
+ else
+ printOperand(MI, opNum);
+}
+
+void MipsAsmPrinter::
+printMemOperand(const MachineInstr *MI, int opNum, const char *Modifier)
+{
+  // When stack locations are used for instructions that are not loads or
+  // stores, print the operands the same way as for normal 3-operand
+  // instructions.
+ if (Modifier && !strcmp(Modifier, "stackloc")) {
+ printOperand(MI, opNum+1);
+ O << ", ";
+ printOperand(MI, opNum);
+ return;
+ }
+
+  // Load/Store memory operands -- imm($reg)
+  // On PIC targets, a direct call target is loaded with the
+  // pattern "lw $25,%call16($28)".
+ printOperand(MI, opNum);
+ O << "(";
+ printOperand(MI, opNum+1);
+ O << ")";
+}
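+
+// For illustration: the memory operand of "lw $2, 8($sp)" prints as
+// "8($sp)"; with the "stackloc" modifier the same operands print as
+// "$sp, 8", matching the normal 3-operand form.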
+
+void MipsAsmPrinter::
+printFCCOperand(const MachineInstr *MI, int opNum, const char *Modifier)
+{
+ const MachineOperand& MO = MI->getOperand(opNum);
+ O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm());
+}
+
+bool MipsAsmPrinter::
+doInitialization(Module &M)
+{
+ Mang = new Mangler(M, "", TAI->getPrivateGlobalPrefix());
+
+ // Tell the assembler which ABI we are using
+ O << "\t.section .mdebug." << emitCurrentABIString() << '\n';
+
+ // TODO: handle O64 ABI
+ if (Subtarget->isABI_EABI())
+ O << "\t.section .gcc_compiled_long" <<
+ (Subtarget->isGP32bit() ? "32" : "64") << '\n';
+
+ // return to previous section
+ O << "\t.previous" << '\n';
+
+ return false; // success
+}
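+
+// For illustration: on an O32 target the module header emitted above is:
+//   .section .mdebug.abi32
+//   .previous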
+
+void MipsAsmPrinter::
+printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+    return; // External globals require no code
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar))
+ return;
+
+ O << "\n\n";
+ std::string name = Mang->getValueName(GVar);
+ Constant *C = GVar->getInitializer();
+ const Type *CTy = C->getType();
+ unsigned Size = TD->getTypeAllocSize(CTy);
+ const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+ bool printSizeAndType = true;
+
+ // A data structure or array is aligned in memory to the largest
+ // alignment boundary required by any data type inside it (this matches
+ // the Preferred Type Alignment). For integral types, the alignment is
+ // the type size.
+ unsigned Align;
+ if (CTy->getTypeID() == Type::IntegerTyID ||
+ CTy->getTypeID() == Type::VoidTyID) {
+ assert(!(Size & (Size-1)) && "Alignment is not a power of two!");
+ Align = Log2_32(Size);
+ } else
+ Align = TD->getPreferredTypeAlignmentShift(CTy);
+
+ printVisibility(name, GVar->getVisibility());
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ if (C->isNullValue() && !GVar->hasSection()) {
+ if (!GVar->isThreadLocal() &&
+ (GVar->hasLocalLinkage() || GVar->isWeakForLinker())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (GVar->hasLocalLinkage())
+ O << "\t.local\t" << name << '\n';
+
+ O << TAI->getCOMMDirective() << name << ',' << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (1 << Align);
+
+ O << '\n';
+ return;
+ }
+ }
+ switch (GVar->getLinkage()) {
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ // FIXME: Verify correct for weak.
+ // Nonnull linkonce -> weak
+ O << "\t.weak " << name << '\n';
+ break;
+ case GlobalValue::AppendingLinkage:
+  // FIXME: appending linkage variables should go into a section named after
+  // them, or something similar. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << TAI->getGlobalDirective() << name << '\n';
+ // Fall Through
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ if (CVA && CVA->isCString())
+ printSizeAndType = false;
+ break;
+ case GlobalValue::GhostLinkage:
+ cerr << "Should not have any unmaterialized functions!\n";
+ abort();
+ case GlobalValue::DLLImportLinkage:
+ cerr << "DLLImport linkage is not supported by this target!\n";
+ abort();
+ case GlobalValue::DLLExportLinkage:
+ cerr << "DLLExport linkage is not supported by this target!\n";
+ abort();
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, GVar);
+
+ if (TAI->hasDotTypeDotSizeDirective() && printSizeAndType) {
+ O << "\t.type " << name << ",@object\n";
+ O << "\t.size " << name << ',' << Size << '\n';
+ }
+
+ O << name << ":\n";
+ EmitGlobalConstant(C);
+}
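+
+// For illustration: a hypothetical external global "int g = 1;" would come
+// out roughly as (directive spellings vary with the target asm info):
+//   .globl g
+//   .align 2
+//   .type  g,@object
+//   .size  g,4
+// g:
+//   .4byte 1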
+
+bool MipsAsmPrinter::
+doFinalization(Module &M)
+{
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(),
+ E = M.global_end(); I != E; ++I)
+ printModuleLevelGV(I);
+
+ O << '\n';
+
+ return AsmPrinter::doFinalization(M);
+}
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
new file mode 100644
index 0000000..70c7a51
--- /dev/null
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -0,0 +1,22 @@
+set(LLVM_TARGET_DEFINITIONS Mips.td)
+
+tablegen(MipsGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(MipsGenRegisterNames.inc -gen-register-enums)
+tablegen(MipsGenRegisterInfo.inc -gen-register-desc)
+tablegen(MipsGenInstrNames.inc -gen-instr-enums)
+tablegen(MipsGenInstrInfo.inc -gen-instr-desc)
+tablegen(MipsGenAsmWriter.inc -gen-asm-writer)
+tablegen(MipsGenDAGISel.inc -gen-dag-isel)
+tablegen(MipsGenCallingConv.inc -gen-callingconv)
+tablegen(MipsGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(MipsCodeGen
+ MipsDelaySlotFiller.cpp
+ MipsInstrInfo.cpp
+ MipsISelDAGToDAG.cpp
+ MipsISelLowering.cpp
+ MipsRegisterInfo.cpp
+ MipsSubtarget.cpp
+ MipsTargetAsmInfo.cpp
+ MipsTargetMachine.cpp
+ )
diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile
new file mode 100644
index 0000000..48ab5f9
--- /dev/null
+++ b/lib/Target/Mips/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/Mips/Makefile ----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMMipsCodeGen
+TARGET = Mips
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = MipsGenRegisterInfo.h.inc MipsGenRegisterNames.inc \
+ MipsGenRegisterInfo.inc MipsGenInstrNames.inc \
+ MipsGenInstrInfo.inc MipsGenAsmWriter.inc \
+ MipsGenDAGISel.inc MipsGenCallingConv.inc \
+ MipsGenSubtarget.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
new file mode 100644
index 0000000..0accb4e
--- /dev/null
+++ b/lib/Target/Mips/Mips.h
@@ -0,0 +1,41 @@
+//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM Mips back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_MIPS_H
+#define TARGET_MIPS_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class MipsTargetMachine;
+ class FunctionPass;
+ class MachineCodeEmitter;
+ class raw_ostream;
+
+ FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+ FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsCodePrinterPass(raw_ostream &OS,
+ MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose);
+} // end namespace llvm;
+
+// Defines symbolic names for Mips registers. This defines a mapping from
+// register name to register number.
+#include "MipsGenRegisterNames.inc"
+
+// Defines symbolic names for the Mips instructions.
+#include "MipsGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
new file mode 100644
index 0000000..79a78d8
--- /dev/null
+++ b/lib/Target/Mips/Mips.td
@@ -0,0 +1,88 @@
+//===- Mips.td - Describe the Mips Target Machine ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the Mips target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MipsRegisterInfo.td"
+include "MipsSchedule.td"
+include "MipsInstrInfo.td"
+include "MipsCallingConv.td"
+
+def MipsInstrInfo : InstrInfo {
+ let TSFlagsFields = [];
+ let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// Mips Subtarget features //
+//===----------------------------------------------------------------------===//
+
+def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true",
+ "General Purpose Registers are 64-bit wide.">;
+def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true",
+ "Support 64-bit FP registers.">;
+def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
+ "true", "Only supports single precision float">;
+def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
+ "Mips1 ISA Support">;
+def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",
+ "Mips2 ISA Support">;
+def FeatureO32 : SubtargetFeature<"o32", "MipsABI", "O32",
+ "Enable o32 ABI">;
+def FeatureEABI : SubtargetFeature<"eabi", "MipsABI", "EABI",
+ "Enable eabi ABI">;
+def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU",
+ "true", "Enable vector FPU instructions.">;
+def FeatureSEInReg : SubtargetFeature<"seinreg", "HasSEInReg", "true",
+ "Enable 'signext in register' instructions.">;
+def FeatureCondMov : SubtargetFeature<"condmov", "HasCondMov", "true",
+ "Enable 'conditional move' instructions.">;
+def FeatureMulDivAdd : SubtargetFeature<"muldivadd", "HasMulDivAdd", "true",
+ "Enable 'multiply add/sub' instructions.">;
+def FeatureMinMax : SubtargetFeature<"minmax", "HasMinMax", "true",
+ "Enable 'min/max' instructions.">;
+def FeatureSwap : SubtargetFeature<"swap", "HasSwap", "true",
+ "Enable 'byte/half swap' instructions.">;
+def FeatureBitCount : SubtargetFeature<"bitcount", "HasBitCount", "true",
+ "Enable 'count leading bits' instructions.">;
+
+//===----------------------------------------------------------------------===//
+// Mips processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, MipsGenericItineraries, Features>;
+
+def : Proc<"mips1", [FeatureMips1]>;
+def : Proc<"r2000", [FeatureMips1]>;
+def : Proc<"r3000", [FeatureMips1]>;
+
+def : Proc<"mips2", [FeatureMips2]>;
+def : Proc<"r6000", [FeatureMips2]>;
+
+// Allegrex is a 32bit subset of r4000, both for integer and fp registers,
+// but much more similar to Mips2 than Mips3. It also contains some of the
+// Mips32/Mips32r2 instructions and a custom vector fpu processor.
+def : Proc<"allegrex", [FeatureMips2, FeatureSingleFloat, FeatureEABI,
+ FeatureVFPU, FeatureSEInReg, FeatureCondMov, FeatureMulDivAdd,
+ FeatureMinMax, FeatureSwap, FeatureBitCount]>;
+
+def Mips : Target {
+ let InstructionSet = MipsInstrInfo;
+}
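+
+// For illustration (hypothetical invocation): the processor and feature
+// names defined above are what llc's -mcpu/-mattr flags accept, e.g.
+//   llc -march=mips -mcpu=allegrex foo.bc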
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
new file mode 100644
index 0000000..01fe92e
--- /dev/null
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -0,0 +1,86 @@
+//===- MipsCallingConv.td - Calling Conventions for Mips --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for Mips architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>:
+ CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Mips O32 Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Only the return rules are defined here for O32. The rules for argument
+// passing are defined in MipsISelLowering.cpp.
+def RetCC_MipsO32 : CallingConv<[
+ // i32 are returned in registers V0, V1
+ CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+ // f32 are returned in registers F0, F1
+ CCIfType<[f32], CCAssignToReg<[F0, F1]>>,
+
+ // f64 are returned in register D0
+ CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips EABI Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsEABI : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+
+  // In single-float mode, single fp arguments are passed in any of the
+  // consecutive f32 registers below
+ CCIfType<[f32], CCIfSubtarget<"isSingleFloat()",
+ CCAssignToReg<[F12, F13, F14, F15, F16, F17, F18, F19]>>>,
+
+ CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()",
+ CCAssignToReg<[F12, F14, F16, F18]>>>,
+
+  // The first 4 double fp arguments are passed in single fp registers.
+ CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()",
+ CCAssignToReg<[D6, D7, D8, D9]>>>,
+
+  // Integer and single fp values get stored in stack slots that are
+  // 4 bytes in size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Double fp values get stored in stack slots that are 8 bytes in
+  // size and 8-byte aligned.
+ CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToStack<8, 8>>>
+]>;
+
+def RetCC_MipsEABI : CallingConv<[
+ // i32 are returned in registers V0, V1
+ CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+ // f32 are returned in registers F0, F1
+ CCIfType<[f32], CCAssignToReg<[F0, F1]>>,
+
+ // f64 are returned in register D0
+ CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Calling Convention Dispatch
+//===----------------------------------------------------------------------===//
+
+def CC_Mips : CallingConv<[
+ CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>
+]>;
+
+def RetCC_Mips : CallingConv<[
+ CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
+ CCDelegateTo<RetCC_MipsO32>
+]>;
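+
+// For illustration: under RetCC_Mips above, an i32 return value lands in
+// V0, an i64 value (split into two i32 halves) uses V0 and V1, and an f64
+// value uses D0 unless the subtarget is single-float only.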
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
new file mode 100644
index 0000000..a2b615d
--- /dev/null
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -0,0 +1,77 @@
+//===-- DelaySlotFiller.cpp - Mips delay slot filler ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass that fills delay slots with NOPs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+namespace {
+ struct Filler : public MachineFunctionPass {
+
+ TargetMachine &TM;
+ const TargetInstrInfo *TII;
+
+ static char ID;
+ Filler(TargetMachine &tm)
+ : MachineFunctionPass(&ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+ virtual const char *getPassName() const {
+ return "Mips Delay Slot Filler";
+ }
+
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+ bool runOnMachineFunction(MachineFunction &F) {
+ bool Changed = false;
+ for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+ FI != FE; ++FI)
+ Changed |= runOnMachineBasicBlock(*FI);
+ return Changed;
+ }
+
+ };
+ char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// Currently, we fill delay slots with NOPs. We assume there is only one
+/// delay slot per delayed instruction.
+bool Filler::
+runOnMachineBasicBlock(MachineBasicBlock &MBB)
+{
+ bool Changed = false;
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+ if (I->getDesc().hasDelaySlot()) {
+ MachineBasicBlock::iterator J = I;
+ ++J;
+ BuildMI(MBB, J, I->getDebugLoc(), TII->get(Mips::NOP));
+ ++FilledSlots;
+ Changed = true;
+ }
+ return Changed;
+}
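+
+// For illustration: given a delayed instruction such as "jal foo", the pass
+// rewrites the block to
+//   jal foo
+//   nop          # inserted delay-slot filler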
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions
+FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
+ return new Filler(tm);
+}
+
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
new file mode 100644
index 0000000..f05ac70
--- /dev/null
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -0,0 +1,392 @@
+//===-- MipsISelDAGToDAG.cpp - A dag to dag inst selector for Mips --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-isel"
+#include "Mips.h"
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace {
+
+class VISIBILITY_HIDDEN MipsDAGToDAGISel : public SelectionDAGISel {
+
+ /// TM - Keep a reference to MipsTargetMachine.
+ MipsTargetMachine &TM;
+
+ /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const MipsSubtarget &Subtarget;
+
+public:
+ explicit MipsDAGToDAGISel(MipsTargetMachine &tm) :
+ SelectionDAGISel(tm),
+ TM(tm), Subtarget(tm.getSubtarget<MipsSubtarget>()) {}
+
+ virtual void InstructionSelect();
+
+ // Pass Name
+ virtual const char *getPassName() const {
+ return "MIPS DAG->DAG Pattern Instruction Selection";
+ }
+
+
+private:
+ // Include the pieces autogenerated from the target description.
+ #include "MipsGenDAGISel.inc"
+
+ SDValue getGlobalBaseReg();
+ SDNode *Select(SDValue N);
+
+ // Complex Pattern.
+ bool SelectAddr(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &Offset);
+
+
+ // getI32Imm - Return a target constant with the specified
+ // value, of type i32.
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+
+ #ifndef NDEBUG
+ unsigned Indent;
+ #endif
+};
+
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void MipsDAGToDAGISel::
+InstructionSelect()
+{
+ DEBUG(BB->dump());
+ // Codegen the basic block.
+ #ifndef NDEBUG
+ DOUT << "===== Instruction selection begins:\n";
+ Indent = 0;
+ #endif
+
+ // Select target instructions for the DAG.
+ SelectRoot(*CurDAG);
+
+ #ifndef NDEBUG
+ DOUT << "===== Instruction selection ends:\n";
+ #endif
+
+ CurDAG->RemoveDeadNodes();
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.
+SDValue MipsDAGToDAGISel::getGlobalBaseReg() {
+ MachineFunction* MF = BB->getParent();
+ unsigned GP = 0;
+ for(MachineRegisterInfo::livein_iterator ii = MF->getRegInfo().livein_begin(),
+ ee = MF->getRegInfo().livein_end(); ii != ee; ++ii)
+ if (ii->first == Mips::GP) {
+ GP = ii->second;
+ break;
+ }
+ assert(GP && "GOT PTR not in liveins");
+ // FIXME is there a sensible place to get debug info for this?
+ return CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ DebugLoc::getUnknownLoc(), GP, MVT::i32);
+}
+
+/// ComplexPattern used in MipsInstrInfo.td
+/// Matches addresses for Mips load/store instructions
+bool MipsDAGToDAGISel::
+SelectAddr(SDValue Op, SDValue Addr, SDValue &Offset, SDValue &Base)
+{
+ // if Address is FI, get the TargetFrameIndex.
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+
+  // On PIC code, global and jump table addresses are loaded relative to GP
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if ((Addr.getOpcode() == ISD::TargetGlobalAddress) ||
+ (Addr.getOpcode() == ISD::TargetJumpTable)){
+ Base = CurDAG->getRegister(Mips::GP, MVT::i32);
+ Offset = Addr;
+ return true;
+ }
+ } else {
+ if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress))
+ return false;
+ }
+
+ // Operand is a result from an ADD.
+ if (Addr.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+ if (Predicate_immSExt16(CN)) {
+
+ // If the first operand is a FI, get the TargetFI Node
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+ (Addr.getOperand(0))) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ } else {
+ Base = Addr.getOperand(0);
+ }
+
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+ return true;
+ }
+ }
+ }
+
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+}
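+
+// For illustration: an address of the form (add $reg, 8), where 8 fits in a
+// signed 16-bit immediate, is matched as Base=$reg and Offset=8, which the
+// asm printer later renders as "8($reg)".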
+
+/// Select instructions that are not custom lowered. Used for
+/// expanded, promoted and normal instructions
+SDNode* MipsDAGToDAGISel::
+Select(SDValue N)
+{
+ SDNode *Node = N.getNode();
+ unsigned Opcode = Node->getOpcode();
+ DebugLoc dl = Node->getDebugLoc();
+
+ // Dump information about the Node being selected
+ #ifndef NDEBUG
+ DOUT << std::string(Indent, ' ') << "Selecting: ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent += 2;
+ #endif
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ #ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "== ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+ #endif
+ return NULL;
+ }
+
+ ///
+ // Instruction Selection not handled by the auto-generated
+ // tablegen selection should be handled here.
+ ///
+ switch(Opcode) {
+
+ default: break;
+
+ case ISD::SUBE:
+ case ISD::ADDE: {
+ SDValue InFlag = Node->getOperand(2), CmpLHS;
+ unsigned Opc = InFlag.getOpcode(); Opc=Opc;
+ assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+ (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+ "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+ unsigned MOp;
+ if (Opcode == ISD::ADDE) {
+ CmpLHS = InFlag.getValue(0);
+ MOp = Mips::ADDu;
+ } else {
+ CmpLHS = InFlag.getOperand(0);
+ MOp = Mips::SUBu;
+ }
+
+ SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+
+ MVT VT = LHS.getValueType();
+ SDNode *Carry = CurDAG->getTargetNode(Mips::SLTu, dl, VT, Ops, 2);
+ SDNode *AddCarry = CurDAG->getTargetNode(Mips::ADDu, dl, VT,
+ SDValue(Carry,0), RHS);
+
+ return CurDAG->SelectNodeTo(N.getNode(), MOp, VT, MVT::Flag,
+ LHS, SDValue(AddCarry,0));
+ }
+
+ /// Mul/Div with two results
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue Op1 = Node->getOperand(0);
+ SDValue Op2 = Node->getOperand(1);
+
+ unsigned Op;
+ if (Opcode == ISD::UMUL_LOHI || Opcode == ISD::SMUL_LOHI)
+ Op = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT);
+ else
+ Op = (Opcode == ISD::UDIVREM ? Mips::DIVu : Mips::DIV);
+
+ SDNode *Node = CurDAG->getTargetNode(Op, dl, MVT::Flag, Op1, Op2);
+
+ SDValue InFlag = SDValue(Node, 0);
+ SDNode *Lo = CurDAG->getTargetNode(Mips::MFLO, dl, MVT::i32,
+ MVT::Flag, InFlag);
+ InFlag = SDValue(Lo,1);
+ SDNode *Hi = CurDAG->getTargetNode(Mips::MFHI, dl, MVT::i32, InFlag);
+
+ if (!N.getValue(0).use_empty())
+ ReplaceUses(N.getValue(0), SDValue(Lo,0));
+
+ if (!N.getValue(1).use_empty())
+ ReplaceUses(N.getValue(1), SDValue(Hi,0));
+
+ return NULL;
+ }
+
+ /// Special Muls
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU: {
+ SDValue MulOp1 = Node->getOperand(0);
+ SDValue MulOp2 = Node->getOperand(1);
+
+ unsigned MulOp = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
+ SDNode *MulNode = CurDAG->getTargetNode(MulOp, dl,
+ MVT::Flag, MulOp1, MulOp2);
+
+ SDValue InFlag = SDValue(MulNode, 0);
+
+    if (Opcode == ISD::MUL)
+ return CurDAG->getTargetNode(Mips::MFLO, dl, MVT::i32, InFlag);
+ else
+ return CurDAG->getTargetNode(Mips::MFHI, dl, MVT::i32, InFlag);
+ }
+
+ /// Div/Rem operations
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIV:
+ case ISD::UDIV: {
+ SDValue Op1 = Node->getOperand(0);
+ SDValue Op2 = Node->getOperand(1);
+
+ unsigned Op, MOp;
+ if (Opcode == ISD::SDIV || Opcode == ISD::UDIV) {
+ Op = (Opcode == ISD::SDIV ? Mips::DIV : Mips::DIVu);
+ MOp = Mips::MFLO;
+ } else {
+ Op = (Opcode == ISD::SREM ? Mips::DIV : Mips::DIVu);
+ MOp = Mips::MFHI;
+ }
+ SDNode *Node = CurDAG->getTargetNode(Op, dl, MVT::Flag, Op1, Op2);
+
+ SDValue InFlag = SDValue(Node, 0);
+ return CurDAG->getTargetNode(MOp, dl, MVT::i32, InFlag);
+ }
+
+ // Get target GOT address.
+ case ISD::GLOBAL_OFFSET_TABLE: {
+ SDValue Result = getGlobalBaseReg();
+ ReplaceUses(N, Result);
+ return NULL;
+ }
+
+  /// Handle direct and indirect calls when using PIC. On PIC, when the
+  /// GOT is smaller than about 64k (small code) the GA target is
+  /// loaded with only one instruction. Otherwise the GA target must
+  /// be loaded with 3 instructions.
+ case MipsISD::JmpLink: {
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ //bool isCodeLarge = (TM.getCodeModel() == CodeModel::Large);
+ SDValue Chain = Node->getOperand(0);
+ SDValue Callee = Node->getOperand(1);
+ SDValue T9Reg = CurDAG->getRegister(Mips::T9, MVT::i32);
+ SDValue InFlag(0, 0);
+
+ if ( (isa<GlobalAddressSDNode>(Callee)) ||
+ (isa<ExternalSymbolSDNode>(Callee)) )
+ {
+ /// Direct call for global addresses and external symbols
+ SDValue GPReg = CurDAG->getRegister(Mips::GP, MVT::i32);
+
+ // Use load to get GOT target
+ SDValue Ops[] = { Callee, GPReg, Chain };
+ SDValue Load = SDValue(CurDAG->getTargetNode(Mips::LW, dl, MVT::i32,
+ MVT::Other, Ops, 3), 0);
+ Chain = Load.getValue(1);
+
+ // Call target must be on T9
+ Chain = CurDAG->getCopyToReg(Chain, dl, T9Reg, Load, InFlag);
+ } else
+ /// Indirect call
+ Chain = CurDAG->getCopyToReg(Chain, dl, T9Reg, Callee, InFlag);
+
+ // Emit Jump and Link Register
+ SDNode *ResNode = CurDAG->getTargetNode(Mips::JALR, dl, MVT::Other,
+ MVT::Flag, T9Reg, Chain);
+ Chain = SDValue(ResNode, 0);
+ InFlag = SDValue(ResNode, 1);
+ ReplaceUses(SDValue(Node, 0), Chain);
+ ReplaceUses(SDValue(Node, 1), InFlag);
+ return ResNode;
+ }
+ }
+ }
+
+ // Select the default instruction
+ SDNode *ResNode = SelectCode(N);
+
+ #ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ if (ResNode == NULL || ResNode == N.getNode())
+ DEBUG(N.getNode()->dump(CurDAG));
+ else
+ DEBUG(ResNode->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+ #endif
+
+ return ResNode;
+}
+
+/// createMipsISelDag - This pass converts a legalized DAG into a
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) {
+ return new MipsDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
new file mode 100644
index 0000000..9281940
--- /dev/null
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -0,0 +1,1254 @@
+//===-- MipsISelLowering.cpp - Mips DAG Lowering Implementation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-lower"
+
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "MipsSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+const char *MipsTargetLowering::
+getTargetNodeName(unsigned Opcode) const
+{
+ switch (Opcode)
+ {
+ case MipsISD::JmpLink : return "MipsISD::JmpLink";
+ case MipsISD::Hi : return "MipsISD::Hi";
+ case MipsISD::Lo : return "MipsISD::Lo";
+ case MipsISD::GPRel : return "MipsISD::GPRel";
+ case MipsISD::Ret : return "MipsISD::Ret";
+ case MipsISD::CMov : return "MipsISD::CMov";
+ case MipsISD::SelectCC : return "MipsISD::SelectCC";
+ case MipsISD::FPSelectCC : return "MipsISD::FPSelectCC";
+ case MipsISD::FPBrcond : return "MipsISD::FPBrcond";
+ case MipsISD::FPCmp : return "MipsISD::FPCmp";
+ case MipsISD::FPRound : return "MipsISD::FPRound";
+ default : return NULL;
+ }
+}
+
+MipsTargetLowering::
+MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM)
+{
+ Subtarget = &TM.getSubtarget<MipsSubtarget>();
+
+  // Mips does not have an i1 type, so use i32 for the
+  // results of setcc operations (slt, sgt, ...).
+ setBooleanContents(ZeroOrOneBooleanContent);
+
+ // JumpTable targets must use GOT when using PIC_
+ setUsesGlobalOffsetTable(true);
+
+ // Set up the register classes
+ addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass);
+ addRegisterClass(MVT::f32, Mips::FGR32RegisterClass);
+
+ // When dealing with single precision only, use libcalls
+ if (!Subtarget->isSingleFloat())
+ if (!Subtarget->isFP64bit())
+ addRegisterClass(MVT::f64, Mips::AFGR64RegisterClass);
+
+ // Legal fp constants
+ addLegalFPImmediate(APFloat(+0.0f));
+
+  // Load extended operations for i1 types must be promoted
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+ // Used by legalize types to correctly generate the setcc result.
+  // Without this, every float setcc would come with an AND/OR of the result;
+  // we don't want that, since the fpcmp result goes to a flag register,
+ // which is used implicitly by brcond and select operations.
+ AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
+
+ // Mips Custom Operations
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::RET, MVT::Other, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+  // We custom lower AND/OR to handle the case where the DAG contains 'ands/ors'
+  // with operands coming from setcc fp comparisons. This is necessary since
+  // the results from these setcc nodes are in a flag register (FCR31).
+ setOperationAction(ISD::AND, MVT::i32, Custom);
+ setOperationAction(ISD::OR, MVT::i32, Custom);
+
+ // Operations not directly supported by Mips.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ // We don't have line number support yet.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+ setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+ // Use the default for now
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+
+ if (Subtarget->isSingleFloat())
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+ if (!Subtarget->hasSEInReg()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ }
+
+ if (!Subtarget->hasBitCount())
+ setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+
+ if (!Subtarget->hasSwap())
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+ setStackPointerRegisterToSaveRestore(Mips::SP);
+ computeRegisterProperties();
+}
+
+
+MVT MipsTargetLowering::getSetCCResultType(MVT VT) const {
+ return MVT::i32;
+}
+
+
+SDValue MipsTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG)
+{
+ switch (Op.getOpcode())
+ {
+ case ISD::AND: return LowerANDOR(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::CALL: return LowerCALL(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::OR: return LowerANDOR(Op, DAG);
+ case ISD::RET: return LowerRET(Op, DAG);
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ }
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Lower helper functions
+//===----------------------------------------------------------------------===//
+
+// AddLiveIn - This helper function adds the specified physical register to the
+// MachineFunction as a live in value. It also creates a corresponding
+// virtual register for it.
+static unsigned
+AddLiveIn(MachineFunction &MF, unsigned PReg, TargetRegisterClass *RC)
+{
+ assert(RC->contains(PReg) && "Not the correct regclass!");
+ unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
+ MF.getRegInfo().addLiveIn(PReg, VReg);
+ return VReg;
+}
+
+// An address must be loaded from a small section if its size is less than the
+// small section size threshold. Data in this section must be addressed using
+// the gp_rel operator.
+bool MipsTargetLowering::IsInSmallSection(unsigned Size) {
+ return (Size > 0 && (Size <= Subtarget->getSSectionThreshold()));
+}
+
+// Discover if this global address can be placed into small data/bss section.
+bool MipsTargetLowering::IsGlobalInSmallSection(GlobalValue *GV)
+{
+ const TargetData *TD = getTargetData();
+ const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+
+ if (!GVA)
+ return false;
+
+ const Type *Ty = GV->getType()->getElementType();
+ unsigned Size = TD->getTypeAllocSize(Ty);
+
+  // if this is an internal constant string, there is a special
+  // section for it, but not in small data/bss.
+ if (GVA->hasInitializer() && GV->hasLocalLinkage()) {
+ Constant *C = GVA->getInitializer();
+ const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+ if (CVA && CVA->isCString())
+ return false;
+ }
+
+ return IsInSmallSection(Size);
+}
+
+// Get fp branch code (not opcode) from condition code.
+static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
+ if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT)
+ return Mips::BRANCH_T;
+
+ if (CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT)
+ return Mips::BRANCH_F;
+
+ return Mips::BRANCH_INVALID;
+}
+
+static unsigned FPBranchCodeToOpc(Mips::FPBranchCode BC) {
+ switch(BC) {
+ default:
+ assert(0 && "Unknown branch code");
+ case Mips::BRANCH_T : return Mips::BC1T;
+ case Mips::BRANCH_F : return Mips::BC1F;
+ case Mips::BRANCH_TL : return Mips::BC1TL;
+ case Mips::BRANCH_FL : return Mips::BC1FL;
+ }
+}
+
+static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown fp condition code!");
+ case ISD::SETEQ:
+ case ISD::SETOEQ: return Mips::FCOND_EQ;
+ case ISD::SETUNE: return Mips::FCOND_OGL;
+ case ISD::SETLT:
+ case ISD::SETOLT: return Mips::FCOND_OLT;
+ case ISD::SETGT:
+ case ISD::SETOGT: return Mips::FCOND_OGT;
+ case ISD::SETLE:
+ case ISD::SETOLE: return Mips::FCOND_OLE;
+ case ISD::SETGE:
+ case ISD::SETOGE: return Mips::FCOND_OGE;
+ case ISD::SETULT: return Mips::FCOND_ULT;
+ case ISD::SETULE: return Mips::FCOND_ULE;
+ case ISD::SETUGT: return Mips::FCOND_UGT;
+ case ISD::SETUGE: return Mips::FCOND_UGE;
+ case ISD::SETUO: return Mips::FCOND_UN;
+ case ISD::SETO: return Mips::FCOND_OR;
+ case ISD::SETNE:
+ case ISD::SETONE: return Mips::FCOND_NEQ;
+ case ISD::SETUEQ: return Mips::FCOND_UEQ;
+ }
+}
+
+MachineBasicBlock *
+MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ bool isFPCmp = false;
+ DebugLoc dl = MI->getDebugLoc();
+
+ switch (MI->getOpcode()) {
+ default: assert(false && "Unexpected instr type to insert");
+ case Mips::Select_FCC:
+ case Mips::Select_FCC_S32:
+ case Mips::Select_FCC_D32:
+ isFPCmp = true; // FALL THROUGH
+ case Mips::Select_CC:
+ case Mips::Select_CC_S32:
+ case Mips::Select_CC_D32: {
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // setcc r1, r2, r3
+ // bNE r1, r0, copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+ // Emit the right instruction according to the type of the operands compared
+ if (isFPCmp) {
+      // Find the condition code present in the setcc operation.
+ Mips::CondCode CC = (Mips::CondCode)MI->getOperand(4).getImm();
+ // Get the branch opcode from the branch code.
+ unsigned Opc = FPBranchCodeToOpc(GetFPBranchCodeFromCond(CC));
+ BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
+ } else
+ BuildMI(BB, dl, TII->get(Mips::BNE)).addReg(MI->getOperand(1).getReg())
+ .addReg(Mips::ZERO).addMBB(sinkMBB);
+
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by first adding all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
+ e = BB->succ_end(); i != e; ++i)
+ sinkMBB->addSuccessor(*i);
+ // Next, remove all successors of the current block, and add the true
+ // and fallthrough blocks as its successors.
+ while(!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, dl, TII->get(Mips::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(3).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+
+SDValue MipsTargetLowering::
+LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG)
+{
+ if (!Subtarget->isMips1())
+ return Op;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned CCReg = AddLiveIn(MF, Mips::FCR31, Mips::CCRRegisterClass);
+
+ SDValue Chain = DAG.getEntryNode();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Src = Op.getOperand(0);
+
+ // Set the condition register
+ SDValue CondReg = DAG.getCopyFromReg(Chain, dl, CCReg, MVT::i32);
+ CondReg = DAG.getCopyToReg(Chain, dl, Mips::AT, CondReg);
+ CondReg = DAG.getCopyFromReg(CondReg, dl, Mips::AT, MVT::i32);
+
+ SDValue Cst = DAG.getConstant(3, MVT::i32);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, CondReg, Cst);
+ Cst = DAG.getConstant(2, MVT::i32);
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i32, Or, Cst);
+
+ SDValue InFlag(0, 0);
+ CondReg = DAG.getCopyToReg(Chain, dl, Mips::FCR31, Xor, InFlag);
+
+ // Emit the round instruction and bit convert to integer
+ SDValue Trunc = DAG.getNode(MipsISD::FPRound, dl, MVT::f32,
+ Src, CondReg.getValue(1));
+ SDValue BitCvt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Trunc);
+ return BitCvt;
+}
+
+SDValue MipsTargetLowering::
+LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG)
+{
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Get a reference from Mips stack pointer
+ SDValue StackPointer = DAG.getCopyFromReg(Chain, dl, Mips::SP, MVT::i32);
+
+ // Subtract the dynamic size from the actual stack size to
+ // obtain the new stack size.
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, StackPointer, Size);
+
+ // The Sub result contains the new stack start address, so it
+ // must be placed in the stack pointer register.
+ Chain = DAG.getCopyToReg(StackPointer.getValue(1), dl, Mips::SP, Sub);
+
+ // This node always has two return values: a new stack pointer
+ // value and a chain
+ SDValue Ops[2] = { Sub, Chain };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
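+
+// For illustration: a dynamic alloca of Size bytes is lowered, in effect, to
+//   subu $sp, $sp, Size
+// and the decremented $sp is returned as the allocated address.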
+
+SDValue MipsTargetLowering::
+LowerANDOR(SDValue Op, SelectionDAG &DAG)
+{
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (LHS.getOpcode() != MipsISD::FPCmp || RHS.getOpcode() != MipsISD::FPCmp)
+ return Op;
+
+ SDValue True = DAG.getConstant(1, MVT::i32);
+ SDValue False = DAG.getConstant(0, MVT::i32);
+
+ SDValue LSEL = DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(),
+ LHS, True, False, LHS.getOperand(2));
+ SDValue RSEL = DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(),
+ RHS, True, False, RHS.getOperand(2));
+
+ return DAG.getNode(Op.getOpcode(), dl, MVT::i32, LSEL, RSEL);
+}
+
+SDValue MipsTargetLowering::
+LowerBRCOND(SDValue Op, SelectionDAG &DAG)
+{
+ // The first operand is the chain, the second is the condition, the third is
+ // the block to branch to if the condition is true.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Dest = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Op.getOperand(1).getOpcode() != MipsISD::FPCmp)
+ return Op;
+
+ SDValue CondRes = Op.getOperand(1);
+ SDValue CCNode = CondRes.getOperand(2);
+ Mips::CondCode CC =
+ (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue();
+ SDValue BrCode = DAG.getConstant(GetFPBranchCodeFromCond(CC), MVT::i32);
+
+ return DAG.getNode(MipsISD::FPBrcond, dl, Op.getValueType(), Chain, BrCode,
+ Dest, CondRes);
+}
+
+SDValue MipsTargetLowering::
+LowerSETCC(SDValue Op, SelectionDAG &DAG)
+{
+ // The operands to this are the left and right operands to compare (ops #0,
+ // and #1) and the condition code to compare them with (op #2) as a
+ // CondCodeSDNode.
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ return DAG.getNode(MipsISD::FPCmp, dl, Op.getValueType(), LHS, RHS,
+ DAG.getConstant(FPCondCCodeToFCC(CC), MVT::i32));
+}
+
+SDValue MipsTargetLowering::
+LowerSELECT(SDValue Op, SelectionDAG &DAG)
+{
+ SDValue Cond = Op.getOperand(0);
+ SDValue True = Op.getOperand(1);
+ SDValue False = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+
+  // If the incoming condition comes from an integer compare, the select
+  // operation must be SelectCC, or a conditional move if the subtarget
+  // supports it.
+ if (Cond.getOpcode() != MipsISD::FPCmp) {
+ if (Subtarget->hasCondMov() && !True.getValueType().isFloatingPoint())
+ return Op;
+ return DAG.getNode(MipsISD::SelectCC, dl, True.getValueType(),
+ Cond, True, False);
+ }
+
+  // If the incoming condition comes from fpcmp, the select
+  // operation must use FPSelectCC.
+ SDValue CCNode = Cond.getOperand(2);
+ return DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(),
+ Cond, True, False, CCNode);
+}
+
+SDValue MipsTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG)
+{
+ // FIXME there isn't actually debug info here
+ DebugLoc dl = Op.getDebugLoc();
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+
+ if (!Subtarget->hasABICall()) {
+ SDVTList VTs = DAG.getVTList(MVT::i32);
+ SDValue Ops[] = { GA };
+ // %gp_rel relocation
+ if (!isa<Function>(GV) && IsGlobalInSmallSection(GV)) {
+ SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl, VTs, Ops, 1);
+ SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode);
+ }
+ // %hi/%lo relocation
+ SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, Ops, 1);
+ SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GA);
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
+
+ } else { // Abicall relocations, TODO: make this cleaner.
+ SDValue ResNode = DAG.getLoad(MVT::i32, dl,
+ DAG.getEntryNode(), GA, NULL, 0);
+    // For functions, and for global targets that are not internally linked,
+    // only a load from the GOT/GP is necessary for PIC to work.
+ if (!GV->hasLocalLinkage() || isa<Function>(GV))
+ return ResNode;
+ SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GA);
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
+ }
+
+ assert(0 && "Dont know how to handle GlobalAddress");
+ return SDValue(0,0);
+}
+
+SDValue MipsTargetLowering::
+LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG)
+{
+ assert(0 && "TLS not implemented for MIPS.");
+ return SDValue(); // Not reached
+}
+
+SDValue MipsTargetLowering::
+LowerJumpTable(SDValue Op, SelectionDAG &DAG)
+{
+ SDValue ResNode;
+ SDValue HiPart;
+ // FIXME there isn't actually debug info here
+ DebugLoc dl = Op.getDebugLoc();
+
+ MVT PtrVT = Op.getValueType();
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
+ SDVTList VTs = DAG.getVTList(MVT::i32);
+ SDValue Ops[] = { JTI };
+ HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, Ops, 1);
+ } else // Emit Load from Global Pointer
+ HiPart = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), JTI, NULL, 0);
+
+ SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTI);
+ ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
+
+ return ResNode;
+}
+
+SDValue MipsTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG)
+{
+ SDValue ResNode;
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+ Constant *C = N->getConstVal();
+ SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment());
+ // FIXME there isn't actually debug info here
+ DebugLoc dl = Op.getDebugLoc();
+
+ // gp_rel relocation
+  // FIXME: we should reference the constant pool using small data sections,
+  // but the asm printer currently doesn't support this feature without
+  // hacking it. This feature should come soon so we can uncomment the
+  // code below.
+ //if (!Subtarget->hasABICall() &&
+ // IsInSmallSection(getTargetData()->getTypeAllocSize(C->getType()))) {
+ // SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
+ // SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
+ // ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode);
+ //} else { // %hi/%lo relocation
+ SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, MVT::i32, CP);
+ SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CP);
+ ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
+ //}
+
+ return ResNode;
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//
+//  The calling convention lowering operations work in this order:
+// LowerCALL (virt regs --> phys regs, virt regs --> stack)
+// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs)
+// LowerRET (virt regs --> phys regs)
+// LowerCALL (phys regs --> virt regs)
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// TODO: Implement generic logic using tblgen that can support this.
+// Mips O32 ABI rules:
+// ---
+// i32 - Passed in A0, A1, A2, A3 and stack
+// f32 - Only passed in f32 registers if no int reg has been used yet to hold
+// an argument. Otherwise, passed in A1, A2, A3 and stack.
+// f64 - Only passed in two aliased f32 registers if no int reg has been used
+// yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
+//        not used, it must be shadowed. If only A3 is available, shadow it
+//        and go to the stack.
+//===----------------------------------------------------------------------===//
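+// For illustration only (a hypothetical signature, not taken from this
+// code): a call f(i32 a, f64 b, i32 c) would be assigned under these rules
+// as: a -> A0; b -> A2/A3 (A1 shadowed, since an int reg was already used);
+// c -> stack.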
+
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+ static const unsigned IntRegsSize=4, FloatRegsSize=2;
+
+ static const unsigned IntRegs[] = {
+ Mips::A0, Mips::A1, Mips::A2, Mips::A3
+ };
+ static const unsigned F32Regs[] = {
+ Mips::F12, Mips::F14
+ };
+ static const unsigned F64Regs[] = {
+ Mips::D6, Mips::D7
+ };
+
+  unsigned Reg = 0;
+  unsigned UnallocIntReg = State.getFirstUnallocated(IntRegs, IntRegsSize);
+  // Note: when all int regs are allocated, getFirstUnallocated returns
+  // IntRegsSize; test the index itself to avoid reading past IntRegs.
+  bool IntRegUsed = (UnallocIntReg != 0);
+
+ // Promote i8 and i16
+ if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+ LocVT = MVT::i32;
+ if (ArgFlags.isSExt())
+ LocInfo = CCValAssign::SExt;
+ else if (ArgFlags.isZExt())
+ LocInfo = CCValAssign::ZExt;
+ else
+ LocInfo = CCValAssign::AExt;
+ }
+
+ if (ValVT == MVT::i32 || (ValVT == MVT::f32 && IntRegUsed)) {
+ Reg = State.AllocateReg(IntRegs, IntRegsSize);
+ IntRegUsed = true;
+ LocVT = MVT::i32;
+ }
+
+ if (ValVT.isFloatingPoint() && !IntRegUsed) {
+ if (ValVT == MVT::f32)
+ Reg = State.AllocateReg(F32Regs, FloatRegsSize);
+ else
+ Reg = State.AllocateReg(F64Regs, FloatRegsSize);
+ }
+
+ if (ValVT == MVT::f64 && IntRegUsed) {
+ if (UnallocIntReg != IntRegsSize) {
+      // If the first unallocated register is A3, we must mark it as
+      // allocated (shadowed) and use the stack instead.
+ if (IntRegs[UnallocIntReg] != (unsigned (Mips::A3)))
+ Reg = Mips::A2;
+      for (; UnallocIntReg < IntRegsSize; ++UnallocIntReg)
+        State.AllocateReg(IntRegs[UnallocIntReg]);
+ }
+ LocVT = MVT::i32;
+ }
+
+ if (!Reg) {
+ unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
+ unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ } else
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+
+ return false; // CC must always match
+}
+
+//===----------------------------------------------------------------------===//
+// CALL Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// LowerCALL - function arguments are copied from virtual regs to
+/// (physical regs)/(stack frame); CALLSEQ_START and CALLSEQ_END are emitted.
+/// TODO: isVarArg, isTailCall.
+SDValue MipsTargetLowering::
+LowerCALL(SDValue Op, SelectionDAG &DAG)
+{
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ SDValue Chain = TheCall->getChain();
+ SDValue Callee = TheCall->getCallee();
+ bool isVarArg = TheCall->isVarArg();
+ unsigned CC = TheCall->getCallingConv();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+
+  // To meet the O32 ABI, Mips must always allocate 16 bytes on the stack
+  // (even if fewer than four argument registers are used); this area
+  // corresponds to the A0-A3 argument registers.
+ if (Subtarget->isABI_O32()) {
+ int VTsize = MVT(MVT::i32).getSizeInBits()/8;
+ MFI->CreateFixedObject(VTsize, (VTsize*3));
+ CCInfo.AnalyzeCallOperands(TheCall, CC_MipsO32);
+ } else
+ CCInfo.AnalyzeCallOperands(TheCall, CC_Mips);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  // With EABI it is possible to pass 16 arguments in registers.
+ SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // First/LastArgStackLoc contains the first/last
+ // "at stack" argument location.
+ int LastArgStackLoc = 0;
+ unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ SDValue Arg = TheCall->getArg(i);
+ CCValAssign &VA = ArgLocs[i];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full:
+ if (Subtarget->isABI_O32() && VA.isRegLoc()) {
+ if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i32)
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Arg);
+ if (VA.getValVT() == MVT::f64 && VA.getLocVT() == MVT::i32) {
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
+ DAG.getConstant(0, getPointerTy()));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
+ DAG.getConstant(1, getPointerTy()));
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
+ RegsToPass.push_back(std::make_pair(VA.getLocReg()+1, Hi));
+ continue;
+ }
+ }
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+    // Arguments that can be passed in a register must be kept in the
+    // RegsToPass vector.
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ continue;
+ }
+
+    // Register arguments can't reach this point...
+ assert(VA.isMemLoc());
+
+    // Create the frame index object for this incoming parameter. This
+    // guarantees that when allocating the Local Area the first 16 bytes,
+    // which are always reserved, won't be overwritten if the O32 ABI is
+    // used. For EABI the first address is zero.
+ LastArgStackLoc = (FirstStackArgLoc + VA.getLocMemOffset());
+ int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+ LastArgStackLoc);
+
+    SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+
+    // Emit an ISD::STORE which stores the
+    // parameter value to a stack location.
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+ }
+
+ // Transform all store nodes into one single node because all store
+ // nodes are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token
+ // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+ else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+ // MipsJmpLink = #chain, #target_address, #opt_in_flags...
+ // = Chain, Callee, Reg#1, Reg#2, ...
+ //
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ InFlag = Chain.getValue(1);
+
+  // Create a stack location to hold GP when PIC is used. This stack
+  // location is used in the function prologue to save GP and also after all
+  // emitted CALLs to restore GP.
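+  // (Sketch of the mechanism implemented below: in PIC code a call may
+  // clobber $gp, so a fixed stack slot is created once per function; the
+  // prologue stores $gp there and it is reloaded after each call.)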
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    // A function can have an arbitrary number of calls, so
+    // keep the LastArgStackLoc with the biggest offset.
+ int FI;
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ if (LastArgStackLoc >= MipsFI->getGPStackOffset()) {
+ LastArgStackLoc = (!LastArgStackLoc) ? (16) : (LastArgStackLoc+4);
+      // Create the frame index only once. SPOffset here can be anything
+      // (it will be fixed in processFunctionBeforeFrameFinalized).
+ if (MipsFI->getGPStackOffset() == -1) {
+ FI = MFI->CreateFixedObject(4, 0);
+ MipsFI->setGPFI(FI);
+ }
+ MipsFI->setGPStackOffset(LastArgStackLoc);
+ }
+
+ // Reload GP value.
+ FI = MipsFI->getGPFI();
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue GPLoad = DAG.getLoad(MVT::i32, dl, Chain, FIN, NULL, 0);
+ Chain = GPLoad.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, DAG.getRegister(Mips::GP, MVT::i32),
+ GPLoad, SDValue(0,0));
+ InFlag = Chain.getValue(1);
+ }
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+  return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
+                 Op.getResNo());
+}
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers. This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. Returns a SDNode with the same number of values as the
+/// ISD::CALL.
+SDNode *MipsTargetLowering::
+LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG) {
+
+ bool isVarArg = TheCall->isVarArg();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+
+ CCInfo.AnalyzeCallResult(TheCall, RetCC_Mips);
+ SmallVector<SDValue, 8> ResultVals;
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+ RVLocs[i].getValVT(), InFlag).getValue(1);
+ InFlag = Chain.getValue(2);
+ ResultVals.push_back(Chain.getValue(0));
+ }
+
+ ResultVals.push_back(Chain);
+
+ // Merge everything together with a MERGE_VALUES node.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
+ &ResultVals[0], ResultVals.size()).getNode();
+}
+
+//===----------------------------------------------------------------------===//
+// FORMAL_ARGUMENTS Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// LowerFORMAL_ARGUMENTS - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments placed on the stack.
+/// TODO: isVarArg
+SDValue MipsTargetLowering::
+LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
+{
+ SDValue Root = Op.getOperand(0);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ DebugLoc dl = Op.getDebugLoc();
+
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+
+ unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
+
+  // GP must be live-in for both PIC and non-PIC call targets.
+ AddLiveIn(MF, Mips::GP, Mips::CPURegsRegisterClass);
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+
+ if (Subtarget->isABI_O32())
+ CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_MipsO32);
+ else
+ CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_Mips);
+
+ SmallVector<SDValue, 16> ArgValues;
+ SDValue StackPtr;
+
+ unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ // Arguments stored on registers
+ if (VA.isRegLoc()) {
+ MVT RegVT = VA.getLocVT();
+ TargetRegisterClass *RC = 0;
+
+ if (RegVT == MVT::i32)
+ RC = Mips::CPURegsRegisterClass;
+ else if (RegVT == MVT::f32)
+ RC = Mips::FGR32RegisterClass;
+ else if (RegVT == MVT::f64) {
+ if (!Subtarget->isSingleFloat())
+ RC = Mips::AFGR64RegisterClass;
+ } else
+ assert(0 && "RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+      // Transform the arguments stored in
+      // physical registers into virtual ones.
+ unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it has been passed promoted
+ // to 32 bits. Insert an assert[sz]ext to capture this, then
+ // truncate to the right size.
+ if (VA.getLocInfo() != CCValAssign::Full) {
+ unsigned Opcode = 0;
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ Opcode = ISD::AssertSext;
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ Opcode = ISD::AssertZext;
+ if (Opcode)
+ ArgValue = DAG.getNode(Opcode, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ }
+
+ // Handle O32 ABI cases: i32->f32 and (i32,i32)->f64
+ if (Subtarget->isABI_O32()) {
+ if (RegVT == MVT::i32 && VA.getValVT() == MVT::f32)
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, ArgValue);
+ if (RegVT == MVT::i32 && VA.getValVT() == MVT::f64) {
+ unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(),
+ VA.getLocReg()+1, RC);
+ SDValue ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg2, RegVT);
+ SDValue Hi = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, ArgValue);
+ SDValue Lo = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, ArgValue2);
+ ArgValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::f64, Lo, Hi);
+ }
+ }
+
+ ArgValues.push_back(ArgValue);
+
+      // To meet the ABI, when VARARGS are passed in registers, the registers
+      // must have their values written to the caller's stack frame.
+ if ((isVarArg) && (Subtarget->isABI_O32())) {
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getRegister(StackReg, getPointerTy());
+
+        // The stack pointer offset is relative to the caller's stack frame.
+        // Since the real stack size is unknown here, a negative SPOffset
+        // is used so there's a way to adjust these offsets when the stack
+        // size gets known (in EliminateFrameIndex). A dummy SPOffset is
+        // used instead of a direct negative address (which is recorded to
+        // be used in emitPrologue) to avoid miscalculation of the first
+        // stack offset in PEI::calculateFrameObjectOffsets.
+ // Arguments are always 32-bit.
+ int FI = MFI->CreateFixedObject(4, 0);
+ MipsFI->recordStoreVarArgsFI(FI, -(4+(i*4)));
+ SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+
+        // Emit an ISD::STORE which stores the
+        // parameter value to a stack location.
+ ArgValues.push_back(DAG.getStore(Root, dl, ArgValue, PtrOff, NULL, 0));
+ }
+
+ } else { // VA.isRegLoc()
+
+ // sanity check
+ assert(VA.isMemLoc());
+
+      // The stack pointer offset is relative to the caller's stack frame.
+      // Since the real stack size is unknown here, a negative SPOffset
+      // is used so there's a way to adjust these offsets when the stack
+      // size gets known (in EliminateFrameIndex). A dummy SPOffset is
+      // used instead of a direct negative address (which is recorded to
+      // be used in emitPrologue) to avoid miscalculation of the first
+      // stack offset in PEI::calculateFrameObjectOffsets.
+ // Arguments are always 32-bit.
+ unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
+ int FI = MFI->CreateFixedObject(ArgSize, 0);
+ MipsFI->recordLoadArgsFI(FI, -(ArgSize+
+ (FirstStackArgLoc + VA.getLocMemOffset())));
+
+ // Create load nodes to retrieve arguments from the stack
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ ArgValues.push_back(DAG.getLoad(VA.getValVT(), dl, Root, FIN, NULL, 0));
+ }
+ }
+
+  // The Mips ABIs for returning structs by value require that we copy
+  // the sret argument into $v0 for the return. Save the argument into
+ // a virtual register so that we can access it from the return points.
+ if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ unsigned Reg = MipsFI->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
+ MipsFI->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]);
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root);
+ }
+
+ ArgValues.push_back(Root);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+SDValue MipsTargetLowering::
+LowerRET(SDValue Op, SelectionDAG &DAG)
+{
+  // CCValAssign - represents the assignment of
+  // the return value to a location.
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+
+  // Analyze return values of ISD::RET.
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_Mips);
+
+ // If this is the first return lowered for this function, add
+ // the regs to the liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ // The chain is always operand #0
+ SDValue Chain = Op.getOperand(0);
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // ISD::RET => ret chain, (regnum1,val1), ...
+    // so i*2+1 indexes only the regnums.
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ Op.getOperand(i*2+1), Flag);
+
+    // Guarantee that all emitted copies are glued together,
+    // so the scheduler cannot reorder them.
+ Flag = Chain.getValue(1);
+ }
+
+  // The Mips ABIs for returning structs by value require that we copy
+  // the sret argument into $v0 for the return. We saved the argument into
+ // a virtual register in the entry block, so now we copy the value out
+ // and into $v0.
+ if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ unsigned Reg = MipsFI->getSRetReturnReg();
+
+    assert(Reg && "sret virtual register not created in the entry block");
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+ Chain = DAG.getCopyToReg(Chain, dl, Mips::V0, Val, Flag);
+ Flag = Chain.getValue(1);
+ }
+
+ // Return on Mips is always a "jr $ra"
+ if (Flag.getNode())
+ return DAG.getNode(MipsISD::Ret, dl, MVT::Other,
+ Chain, DAG.getRegister(Mips::RA, MVT::i32), Flag);
+ else // Return Void
+ return DAG.getNode(MipsISD::Ret, dl, MVT::Other,
+ Chain, DAG.getRegister(Mips::RA, MVT::i32));
+}
+
+//===----------------------------------------------------------------------===//
+// Mips Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+MipsTargetLowering::ConstraintType MipsTargetLowering::
+getConstraintType(const std::string &Constraint) const
+{
+  // Mips-specific constraints
+ // GCC config/mips/constraints.md
+ //
+ // 'd' : An address register. Equivalent to r
+ // unless generating MIPS16 code.
+ // 'y' : Equivalent to r; retained for
+ // backwards compatibility.
+ // 'f' : Floating Point registers.
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default : break;
+ case 'd':
+ case 'y':
+ case 'f':
+        return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegForInlineAsmConstraint - Given a physical register constraint
+/// letter (e.g. "r"), if it corresponds directly to an LLVM register class,
+/// return a register of 0 and the register class pointer.
+std::pair<unsigned, const TargetRegisterClass*> MipsTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
+{
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ return std::make_pair(0U, Mips::CPURegsRegisterClass);
+ case 'f':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, Mips::FGR32RegisterClass);
+ if (VT == MVT::f64)
+ if ((!Subtarget->isSingleFloat()) && (!Subtarget->isFP64bit()))
+ return std::make_pair(0U, Mips::AFGR64RegisterClass);
+ }
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// getRegClassForInlineAsmConstraint - Given a constraint letter (e.g. "r"),
+/// return a list of registers that can be used to satisfy the constraint.
+/// This should only be used for C_RegisterClass constraints.
+std::vector<unsigned> MipsTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const
+{
+ if (Constraint.size() != 1)
+ return std::vector<unsigned>();
+
+ switch (Constraint[0]) {
+ default : break;
+ case 'r':
+ // GCC Mips Constraint Letters
+ case 'd':
+ case 'y':
+ return make_vector<unsigned>(Mips::T0, Mips::T1, Mips::T2, Mips::T3,
+ Mips::T4, Mips::T5, Mips::T6, Mips::T7, Mips::S0, Mips::S1,
+ Mips::S2, Mips::S3, Mips::S4, Mips::S5, Mips::S6, Mips::S7,
+ Mips::T8, 0);
+
+ case 'f':
+ if (VT == MVT::f32) {
+ if (Subtarget->isSingleFloat())
+ return make_vector<unsigned>(Mips::F2, Mips::F3, Mips::F4, Mips::F5,
+ Mips::F6, Mips::F7, Mips::F8, Mips::F9, Mips::F10, Mips::F11,
+ Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24,
+ Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29,
+ Mips::F30, Mips::F31, 0);
+ else
+ return make_vector<unsigned>(Mips::F2, Mips::F4, Mips::F6, Mips::F8,
+ Mips::F10, Mips::F20, Mips::F22, Mips::F24, Mips::F26,
+ Mips::F28, Mips::F30, 0);
+ }
+
+ if (VT == MVT::f64)
+ if ((!Subtarget->isSingleFloat()) && (!Subtarget->isFP64bit()))
+ return make_vector<unsigned>(Mips::D1, Mips::D2, Mips::D3, Mips::D4,
+ Mips::D5, Mips::D10, Mips::D11, Mips::D12, Mips::D13,
+ Mips::D14, Mips::D15, 0);
+ }
+ return std::vector<unsigned>();
+}
+
+bool
+MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The Mips target isn't yet aware of offsets.
+ return false;
+}
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
new file mode 100644
index 0000000..55cd6ea
--- /dev/null
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -0,0 +1,130 @@
+//===-- MipsISelLowering.h - Mips DAG Lowering Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MipsISELLOWERING_H
+#define MipsISELLOWERING_H
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "Mips.h"
+#include "MipsSubtarget.h"
+
+namespace llvm {
+ namespace MipsISD {
+ enum NodeType {
+ // Start the numbering from where ISD NodeType finishes.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ // Jump and link (call)
+ JmpLink,
+
+ // Get the Higher 16 bits from a 32-bit immediate
+      // No relation to the Mips Hi register
+ Hi,
+
+ // Get the Lower 16 bits from a 32-bit immediate
+      // No relation to the Mips Lo register
+ Lo,
+
+ // Handle gp_rel (small data/bss sections) relocation.
+ GPRel,
+
+ // Conditional Move
+ CMov,
+
+ // Select CC Pseudo Instruction
+ SelectCC,
+
+ // Floating Point Select CC Pseudo Instruction
+ FPSelectCC,
+
+ // Floating Point Branch Conditional
+ FPBrcond,
+
+ // Floating Point Compare
+ FPCmp,
+
+ // Floating Point Rounding
+ FPRound,
+
+ // Return
+ Ret
+ };
+ }
+
+ //===--------------------------------------------------------------------===//
+ // TargetLowering Implementation
+ //===--------------------------------------------------------------------===//
+ class MipsTargetLowering : public TargetLowering
+ {
+ // FrameIndex for return slot.
+ int ReturnAddrIndex;
+ public:
+
+ explicit MipsTargetLowering(MipsTargetMachine &TM);
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// getTargetNodeName - This method returns the name of a target-specific
+    /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getSetCCResultType - get the ISD::SETCC result ValueType
+ MVT getSetCCResultType(MVT VT) const;
+
+ private:
+ // Subtarget Info
+ const MipsSubtarget *Subtarget;
+
+ // Lower Operand helpers
+ SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG);
+ bool IsGlobalInSmallSection(GlobalValue *GV);
+ bool IsInSmallSection(unsigned Size);
+
+ // Lower Operand specifics
+ SDValue LowerANDOR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+
+ virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ // Inline asm support
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+ };
+}
+
+#endif // MipsISELLOWERING_H
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
new file mode 100644
index 0000000..b6a6d2f
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -0,0 +1,304 @@
+//===- MipsInstrFPU.td - Mips FPU Instruction Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Mips FPU instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+// ------------------------
+// * 64bit fp:
+// - 32 64-bit registers (default mode)
+// - 16 even 32-bit registers (32-bit compatible mode) for
+// single and double access.
+// * 32bit fp:
+// - 16 even 32-bit registers - single and double (aliased)
+// - 32 32-bit registers (within single-only mode)
+//===----------------------------------------------------------------------===//
+
+// Floating Point Compare and Branch
+def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisSameAs<0, 2>, SDTCisInt<0>,
+ SDTCisVT<1, OtherVT>]>;
+def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<0>,
+ SDTCisInt<2>]>;
+def SDT_MipsFPSelectCC : SDTypeProfile<1, 4, [SDTCisInt<1>, SDTCisInt<4>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+
+def MipsFPRound : SDNode<"MipsISD::FPRound", SDTFPRoundOp, [SDNPOptInFlag]>;
+def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
+ [SDNPHasChain]>;
+def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp>;
+def MipsFPSelectCC : SDNode<"MipsISD::FPSelectCC", SDT_MipsFPSelectCC>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printFCCOperand" in
+ def condcode : Operand<i32>;
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+def In32BitMode : Predicate<"!Subtarget.isFP64bit()">;
+def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">;
+def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//
+// A set of multiclasses is used to address the register usage.
+//
+// S32 - single precision in 16 32bit even fp registers
+// single precision in 32 32bit fp registers in SingleOnly mode
+// S64 - single precision in 32 64bit fp registers (In64BitMode)
+// D32 - double precision in 16 32bit even fp registers
+// D64 - double precision in 32 64bit fp registers (In64BitMode)
+//
+// Only S32 and D32 are supported right now.
+//===----------------------------------------------------------------------===//
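+
+// As an illustration of how these templates expand (derived from the
+// definitions below, so treat it as a sketch): `defm FADD : FFR1_4<0x10,
+// "add", fadd>;` produces FADD_S32 ("add.s $fd, $fs, $ft" on FGR32) and
+// FADD_D32 ("add.d $fd, $fs, $ft" on AFGR64, In32BitMode only).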
+
+multiclass FFR1_1<bits<6> funct, string asmstr>
+{
+ def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
+ !strconcat(asmstr, ".s $fd, $fs"), []>;
+
+ def _D32 : FFR<0x11, funct, 0x1, (outs FGR32:$fd), (ins AFGR64:$fs),
+ !strconcat(asmstr, ".d $fd, $fs"), []>, Requires<[In32BitMode]>;
+}
+
+multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp>
+{
+ def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
+ !strconcat(asmstr, ".s $fd, $fs"),
+ [(set FGR32:$fd, (FOp FGR32:$fs))]>;
+
+ def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
+ !strconcat(asmstr, ".d $fd, $fs"),
+ [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>;
+}
+
+class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc,
+ RegisterClass RcDst, string asmstr>:
+ FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs),
+ !strconcat(asmstr, " $fd, $fs"), []>;
+
+
+multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp> {
+ def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd),
+ (ins FGR32:$fs, FGR32:$ft),
+ !strconcat(asmstr, ".s $fd, $fs, $ft"),
+ [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>;
+
+ def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd),
+ (ins AFGR64:$fs, AFGR64:$ft),
+ !strconcat(asmstr, ".d $fd, $fs, $ft"),
+ [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>,
+ Requires<[In32BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+//===----------------------------------------------------------------------===//
+
+let ft = 0 in {
+ defm FLOOR_W : FFR1_1<0b001111, "floor.w">;
+ defm CEIL_W : FFR1_1<0b001110, "ceil.w">;
+ defm ROUND_W : FFR1_1<0b001100, "round.w">;
+ defm TRUNC_W : FFR1_1<0b001101, "trunc.w">;
+ defm CVTW : FFR1_1<0b100100, "cvt.w">;
+ defm FMOV : FFR1_1<0b000110, "mov">;
+
+ defm FABS : FFR1_2<0b000101, "abs", fabs>;
+ defm FNEG : FFR1_2<0b000111, "neg", fneg>;
+ defm FSQRT : FFR1_2<0b000100, "sqrt", fsqrt>;
+
+  /// Convert to Single Precision
+ def CVTS_W32 : FFR1_3<0b100000, 0x2, FGR32, FGR32, "cvt.s.w">;
+
+ let Predicates = [IsNotSingleFloat] in {
+ /// Ceil to long signed integer
+ def CEIL_LS : FFR1_3<0b001010, 0x0, FGR32, FGR32, "ceil.l">;
+ def CEIL_LD : FFR1_3<0b001010, 0x1, AFGR64, AFGR64, "ceil.l">;
+
+ /// Round to long signed integer
+ def ROUND_LS : FFR1_3<0b001000, 0x0, FGR32, FGR32, "round.l">;
+ def ROUND_LD : FFR1_3<0b001000, 0x1, AFGR64, AFGR64, "round.l">;
+
+ /// Floor to long signed integer
+ def FLOOR_LS : FFR1_3<0b001011, 0x0, FGR32, FGR32, "floor.l">;
+ def FLOOR_LD : FFR1_3<0b001011, 0x1, AFGR64, AFGR64, "floor.l">;
+
+ /// Trunc to long signed integer
+ def TRUNC_LS : FFR1_3<0b001001, 0x0, FGR32, FGR32, "trunc.l">;
+ def TRUNC_LD : FFR1_3<0b001001, 0x1, AFGR64, AFGR64, "trunc.l">;
+
+ /// Convert to long signed integer
+ def CVTL_S : FFR1_3<0b100101, 0x0, FGR32, FGR32, "cvt.l">;
+ def CVTL_D : FFR1_3<0b100101, 0x1, AFGR64, AFGR64, "cvt.l">;
+
+    /// Convert to Double Precision
+ def CVTD_S32 : FFR1_3<0b100001, 0x0, AFGR64, FGR32, "cvt.d.s">;
+ def CVTD_W32 : FFR1_3<0b100001, 0x2, AFGR64, FGR32, "cvt.d.w">;
+ def CVTD_L32 : FFR1_3<0b100001, 0x3, AFGR64, AFGR64, "cvt.d.l">;
+
+    /// Convert to Single Precision
+ def CVTS_D32 : FFR1_3<0b100000, 0x1, FGR32, AFGR64, "cvt.s.d">;
+ def CVTS_L32 : FFR1_3<0b100000, 0x3, FGR32, AFGR64, "cvt.s.l">;
+ }
+}
+
+// The odd-numbered registers are only referenced when doing loads,
+// stores, and moves between floating-point and integer registers.
+// When defining instructions, we reference all 32-bit registers,
+// regardless of register aliasing.
+let fd = 0 in {
+ /// Move Control Registers From/To CPU Registers
+ def CFC1 : FFR<0x11, 0x0, 0x2, (outs CPURegs:$rt), (ins CCR:$fs),
+ "cfc1 $rt, $fs", []>;
+
+ def CTC1 : FFR<0x11, 0x0, 0x6, (outs CCR:$rt), (ins CPURegs:$fs),
+ "ctc1 $fs, $rt", []>;
+
+ def MFC1 : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs),
+ "mfc1 $rt, $fs", []>;
+
+ def MTC1 : FFR<0x11, 0x00, 0x04, (outs FGR32:$fs), (ins CPURegs:$rt),
+ "mtc1 $rt, $fs", []>;
+}
+
+/// Floating Point Memory Instructions
+let Predicates = [IsNotSingleFloat] in {
+ def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr),
+ "ldc1 $ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>;
+
+ def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr),
+ "sdc1 $ft, $addr", [(store AFGR64:$ft, addr:$addr)]>;
+}
+
+// LWC1 and SWC1 can always be emitted with odd registers.
+def LWC1 : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1 $ft, $addr",
+ [(set FGR32:$ft, (load addr:$addr))]>;
+def SWC1 : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr), "swc1 $ft, $addr",
+ [(store FGR32:$ft, addr:$addr)]>;
+
+/// Floating-point Arithmetic
+defm FADD : FFR1_4<0x10, "add", fadd>;
+defm FDIV : FFR1_4<0x03, "div", fdiv>;
+defm FMUL : FFR1_4<0x02, "mul", fmul>;
+defm FSUB : FFR1_4<0x01, "sub", fsub>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Branch Codes
+//===----------------------------------------------------------------------===//
+// Mips branch codes. These correspond to condcode in MipsInstrInfo.h.
+// They must be kept in sync.
+def MIPS_BRANCH_F : PatLeaf<(i32 0)>;
+def MIPS_BRANCH_T : PatLeaf<(i32 1)>;
+def MIPS_BRANCH_FL : PatLeaf<(i32 2)>;
+def MIPS_BRANCH_TL : PatLeaf<(i32 3)>;
+
+/// Floating Point Branch of False/True (Likely)
+let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in {
+ class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs),
+ (ins brtarget:$dst), !strconcat(asmstr, " $dst"),
+ [(MipsFPBrcond op, bb:$dst, FCR31)]>;
+}
+def BC1F : FBRANCH<MIPS_BRANCH_F, "bc1f">;
+def BC1T : FBRANCH<MIPS_BRANCH_T, "bc1t">;
+def BC1FL : FBRANCH<MIPS_BRANCH_FL, "bc1fl">;
+def BC1TL : FBRANCH<MIPS_BRANCH_TL, "bc1tl">;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Flag Conditions
+//===----------------------------------------------------------------------===//
+// Mips condition codes. They must correspond to condcode in MipsInstrInfo.h.
+// They must be kept in sync.
+def MIPS_FCOND_F : PatLeaf<(i32 0)>;
+def MIPS_FCOND_UN : PatLeaf<(i32 1)>;
+def MIPS_FCOND_EQ : PatLeaf<(i32 2)>;
+def MIPS_FCOND_UEQ : PatLeaf<(i32 3)>;
+def MIPS_FCOND_OLT : PatLeaf<(i32 4)>;
+def MIPS_FCOND_ULT : PatLeaf<(i32 5)>;
+def MIPS_FCOND_OLE : PatLeaf<(i32 6)>;
+def MIPS_FCOND_ULE : PatLeaf<(i32 7)>;
+def MIPS_FCOND_SF : PatLeaf<(i32 8)>;
+def MIPS_FCOND_NGLE : PatLeaf<(i32 9)>;
+def MIPS_FCOND_SEQ : PatLeaf<(i32 10)>;
+def MIPS_FCOND_NGL : PatLeaf<(i32 11)>;
+def MIPS_FCOND_LT : PatLeaf<(i32 12)>;
+def MIPS_FCOND_NGE : PatLeaf<(i32 13)>;
+def MIPS_FCOND_LE : PatLeaf<(i32 14)>;
+def MIPS_FCOND_NGT : PatLeaf<(i32 15)>;
+
+/// Floating Point Compare
+let hasDelaySlot = 1, Defs=[FCR31] in {
+ def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc),
+ "c.$cc.s $fs, $ft", [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc),
+ (implicit FCR31)]>;
+
+ def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc),
+ "c.$cc.d $fs, $ft", [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc),
+ (implicit FCR31)]>, Requires<[In32BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Pseudo-Instructions
+//===----------------------------------------------------------------------===//
+
+// For some explanation, see Select_CC in MipsInstrInfo.td. We also embed a
+// condition code to enable easy handling by the Custom Inserter.
+let usesCustomDAGSchedInserter = 1, Uses=[FCR31] in {
+ class PseudoFPSelCC<RegisterClass RC, string asmstr> :
+ MipsPseudo<(outs RC:$dst),
+ (ins CPURegs:$CmpRes, RC:$T, RC:$F, condcode:$cc), asmstr,
+ [(set RC:$dst, (MipsFPSelectCC CPURegs:$CmpRes, RC:$T, RC:$F,
+ imm:$cc))]>;
+}
+
+// The values to be selected are fp but the condition test is with integers.
+def Select_CC_S32 : PseudoSelCC<FGR32, "# MipsSelect_CC_S32_f32">;
+def Select_CC_D32 : PseudoSelCC<AFGR64, "# MipsSelect_CC_D32_f32">,
+ Requires<[In32BitMode]>;
+
+// The values to be selected are int but the condition test is done with fp.
+def Select_FCC : PseudoFPSelCC<CPURegs, "# MipsSelect_FCC">;
+
+// Both the values to be selected and the condition test are done with fp.
+def Select_FCC_S32 : PseudoFPSelCC<FGR32, "# MipsSelect_FCC_S32_f32">;
+def Select_FCC_D32 : PseudoFPSelCC<AFGR64, "# MipsSelect_FCC_D32_f32">,
+ Requires<[In32BitMode]>;
+
+def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src),
+ "# MOVCCRToCCR", []>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def : Pat<(f32 fpimm0), (MTC1 ZERO)>;
+
+def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVTS_W32 (MTC1 CPURegs:$src))>;
+def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVTD_W32 (MTC1 CPURegs:$src))>;
+
+def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>;
+
+def : Pat<(i32 (bitconvert FGR32:$src)), (MFC1 FGR32:$src)>;
+def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>;
+
+let Predicates = [In32BitMode] in {
+ def : Pat<(f32 (fround AFGR64:$src)), (CVTS_D32 AFGR64:$src)>;
+ def : Pat<(f64 (fextend FGR32:$src)), (CVTD_S32 FGR32:$src)>;
+}
+
+// MipsFPRound is only emitted for MipsI targets.
+def : Pat<(f32 (MipsFPRound AFGR64:$src)), (CVTW_D32 AFGR64:$src)>;
+
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
new file mode 100644
index 0000000..0853272
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -0,0 +1,182 @@
+//===- MipsInstrFormats.td - Mips Instruction Formats -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describes the MIPS instruction formats.
+//
+// CPU INSTRUCTION FORMATS
+//
+// opcode - operation code.
+// rs - src reg.
+// rt - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+// rd - dst reg, only used on 3 regs instr.
+// shamt - only used on shift instructions, contains the shift amount.
+//  funct - combined with the opcode field gives us an operation code.
+//
+//===----------------------------------------------------------------------===//
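+
+// For instance (a sketch; encodings per the MIPS32 manual): `addu $rd, $rs,
+// $rt` is an R-format instruction with opcode=0 and funct=0x21, while
+// `addiu $rt, $rs, imm16` is an I-format instruction with opcode=0x09.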
+
+// Generic Mips Format
+class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>: Instruction
+{
+ field bits<32> Inst;
+
+ let Namespace = "Mips";
+
+ bits<6> opcode;
+
+  // Top 6 bits are the 'opcode' field
+ let Inst{31-26} = opcode;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Itinerary = itin;
+}
+
+// Mips Pseudo Instructions Format
+class MipsPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>:
+ MipsInst<outs, ins, asmstr, pattern, IIPseudo>;
+
+//===----------------------------------------------------------------------===//
+// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
+//===----------------------------------------------------------------------===//
+
+class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst<outs, ins, asmstr, pattern, itin>
+{
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> shamt;
+ bits<6> funct;
+
+ let opcode = op;
+ let funct = _funct;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = shamt;
+ let Inst{5-0} = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|rs|rt|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin>
+{
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm16;
+
+ let opcode = op;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Mips : <|opcode|address|>
+//===----------------------------------------------------------------------===//
+
+class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin>
+{
+ bits<26> addr;
+
+ let opcode = op;
+
+ let Inst{25-0} = addr;
+}
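+
+// For example (illustrative; per the MIPS32 manual): `j target` is a
+// J-format instruction with opcode=0x02, carrying the 26-bit word address
+// of the target in `addr`.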
+
+//===----------------------------------------------------------------------===//
+//
+// FLOATING POINT INSTRUCTION FORMATS
+//
+// opcode - operation code.
+// fs - src reg.
+// ft - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+// fd - dst reg, only used on 3 regs instr.
+// fmt - double or single precision.
+//  funct - combined with the opcode field gives us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Format FR instruction class in Mips : <|opcode|fmt|ft|fs|fd|funct|>
+//===----------------------------------------------------------------------===//
+
+class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins,
+ string asmstr, list<dag> pattern> :
+ MipsInst<outs, ins, asmstr, pattern, NoItinerary>
+{
+ bits<5> fd;
+ bits<5> fs;
+ bits<5> ft;
+ bits<5> fmt;
+ bits<6> funct;
+
+ let opcode = op;
+ let funct = _funct;
+ let fmt = _fmt;
+
+ let Inst{25-21} = fmt;
+ let Inst{20-16} = ft;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = fd;
+ let Inst{5-0} = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Format FI instruction class in Mips : <|opcode|base|ft|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
+ MipsInst<outs, ins, asmstr, pattern, NoItinerary>
+{
+ bits<5> ft;
+ bits<5> base;
+ bits<16> imm16;
+
+ let opcode = op;
+
+ let Inst{25-21} = base;
+ let Inst{20-16} = ft;
+ let Inst{15-0} = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Compare instruction class in Mips : <|010001|fmt|ft|fs|0000011|condcode|>
+//===----------------------------------------------------------------------===//
+
+class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> :
+ MipsInst<outs, ins, asmstr, pattern, NoItinerary>
+{
+ bits<5> fs;
+ bits<5> ft;
+ bits<4> cc;
+ bits<5> fmt;
+
+ let opcode = 0x11;
+ let fmt = _fmt;
+
+ let Inst{25-21} = fmt;
+ let Inst{20-16} = ft;
+ let Inst{15-11} = fs;
+ let Inst{10-6} = 0;
+ let Inst{5-4} = 0b11;
+ let Inst{3-0} = cc;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
new file mode 100644
index 0000000..6225fa9
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -0,0 +1,623 @@
+//===- MipsInstrInfo.cpp - Mips Instruction Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "MipsGenInstrInfo.inc"
+
+using namespace llvm;
+
+MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm)
+ : TargetInstrInfoImpl(MipsInsts, array_lengthof(MipsInsts)),
+ TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+
+static bool isZeroImm(const MachineOperand &op) {
+ return op.isImm() && op.getImm() == 0;
+}
+
+/// Return true if the instruction is a register-to-register move and
+/// leave the source and dest operands in the passed parameters.
+bool MipsInstrInfo::
+isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const
+{
+ SrcSubIdx = DstSubIdx = 0; // No sub-registers.
+
+ // addu $dst, $src, $zero || addu $dst, $zero, $src
+ // or $dst, $src, $zero || or $dst, $zero, $src
+ if ((MI.getOpcode() == Mips::ADDu) || (MI.getOpcode() == Mips::OR)) {
+ if (MI.getOperand(1).getReg() == Mips::ZERO) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(2).getReg();
+ return true;
+ } else if (MI.getOperand(2).getReg() == Mips::ZERO) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ }
+ }
+
+ // mov $fpDst, $fpSrc
+ // mfc $gpDst, $fpSrc
+ // mtc $fpDst, $gpSrc
+ if (MI.getOpcode() == Mips::FMOV_S32 ||
+ MI.getOpcode() == Mips::FMOV_D32 ||
+ MI.getOpcode() == Mips::MFC1 ||
+ MI.getOpcode() == Mips::MTC1 ||
+ MI.getOpcode() == Mips::MOVCCRToCCR) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ }
+
+ // addiu $dst, $src, 0
+ if (MI.getOpcode() == Mips::ADDiu) {
+ if ((MI.getOperand(1).isReg()) && (isZeroImm(MI.getOperand(2)))) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned MipsInstrInfo::
+isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+ if ((MI->getOpcode() == Mips::LW) || (MI->getOpcode() == Mips::LWC1) ||
+ (MI->getOpcode() == Mips::LDC1)) {
+ if ((MI->getOperand(2).isFI()) && // is a stack slot
+ (MI->getOperand(1).isImm()) && // the imm is zero
+ (isZeroImm(MI->getOperand(1)))) {
+ FrameIndex = MI->getOperand(2).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ }
+
+ return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned MipsInstrInfo::
+isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+ if ((MI->getOpcode() == Mips::SW) || (MI->getOpcode() == Mips::SWC1) ||
+ (MI->getOpcode() == Mips::SDC1)) {
+ if ((MI->getOperand(2).isFI()) && // is a stack slot
+ (MI->getOperand(1).isImm()) && // the imm is zero
+ (isZeroImm(MI->getOperand(1)))) {
+ FrameIndex = MI->getOperand(2).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ }
+ return 0;
+}
+
+/// insertNoop - If a data hazard condition is found, insert the target
+/// nop instruction.
+void MipsInstrInfo::
+insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
+{
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ BuildMI(MBB, MI, DL, get(Mips::NOP));
+}
+
+bool MipsInstrInfo::
+copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (DestRC != SrcRC) {
+
+ // Copy to/from FCR31 condition register
+ if ((DestRC == Mips::CPURegsRegisterClass) &&
+ (SrcRC == Mips::CCRRegisterClass))
+ BuildMI(MBB, I, DL, get(Mips::CFC1), DestReg).addReg(SrcReg);
+ else if ((DestRC == Mips::CCRRegisterClass) &&
+ (SrcRC == Mips::CPURegsRegisterClass))
+ BuildMI(MBB, I, DL, get(Mips::CTC1), DestReg).addReg(SrcReg);
+
+ // Moves between coprocessors and cpu
+ else if ((DestRC == Mips::CPURegsRegisterClass) &&
+ (SrcRC == Mips::FGR32RegisterClass))
+ BuildMI(MBB, I, DL, get(Mips::MFC1), DestReg).addReg(SrcReg);
+ else if ((DestRC == Mips::FGR32RegisterClass) &&
+ (SrcRC == Mips::CPURegsRegisterClass))
+ BuildMI(MBB, I, DL, get(Mips::MTC1), DestReg).addReg(SrcReg);
+
+ // Move from/to Hi/Lo registers
+ else if ((DestRC == Mips::HILORegisterClass) &&
+ (SrcRC == Mips::CPURegsRegisterClass)) {
+ unsigned Opc = (DestReg == Mips::HI) ? Mips::MTHI : Mips::MTLO;
+ BuildMI(MBB, I, DL, get(Opc), DestReg);
+ } else if ((SrcRC == Mips::HILORegisterClass) &&
+ (DestRC == Mips::CPURegsRegisterClass)) {
+ unsigned Opc = (SrcReg == Mips::HI) ? Mips::MFHI : Mips::MFLO;
+ BuildMI(MBB, I, DL, get(Opc), DestReg);
+
+ // Can't copy this register
+ } else
+ return false;
+
+ return true;
+ }
+
+ if (DestRC == Mips::CPURegsRegisterClass)
+ BuildMI(MBB, I, DL, get(Mips::ADDu), DestReg).addReg(Mips::ZERO)
+ .addReg(SrcReg);
+ else if (DestRC == Mips::FGR32RegisterClass)
+ BuildMI(MBB, I, DL, get(Mips::FMOV_S32), DestReg).addReg(SrcReg);
+ else if (DestRC == Mips::AFGR64RegisterClass)
+ BuildMI(MBB, I, DL, get(Mips::FMOV_D32), DestReg).addReg(SrcReg);
+ else if (DestRC == Mips::CCRRegisterClass)
+ BuildMI(MBB, I, DL, get(Mips::MOVCCRToCCR), DestReg).addReg(SrcReg);
+ else
+ // Can't copy this register
+ return false;
+
+ return true;
+}
+
+void MipsInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC) const {
+ unsigned Opc;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (RC == Mips::CPURegsRegisterClass)
+ Opc = Mips::SW;
+ else if (RC == Mips::FGR32RegisterClass)
+ Opc = Mips::SWC1;
+ else {
+ assert(RC == Mips::AFGR64RegisterClass);
+ Opc = Mips::SDC1;
+ }
+
+ BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+ .addImm(0).addFrameIndex(FI);
+}
+
+void MipsInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill, SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs) const
+{
+ unsigned Opc;
+ if (RC == Mips::CPURegsRegisterClass)
+ Opc = Mips::SW;
+ else if (RC == Mips::FGR32RegisterClass)
+ Opc = Mips::SWC1;
+ else {
+ assert(RC == Mips::AFGR64RegisterClass);
+ Opc = Mips::SDC1;
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc))
+ .addReg(SrcReg, getKillRegState(isKill));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ return;
+}
+
+void MipsInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC) const
+{
+ unsigned Opc;
+ if (RC == Mips::CPURegsRegisterClass)
+ Opc = Mips::LW;
+ else if (RC == Mips::FGR32RegisterClass)
+ Opc = Mips::LWC1;
+ else {
+ assert(RC == Mips::AFGR64RegisterClass);
+ Opc = Mips::LDC1;
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0).addFrameIndex(FI);
+}
+
+void MipsInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Opc;
+ if (RC == Mips::CPURegsRegisterClass)
+ Opc = Mips::LW;
+ else if (RC == Mips::FGR32RegisterClass)
+ Opc = Mips::LWC1;
+ else {
+ assert(RC == Mips::AFGR64RegisterClass);
+ Opc = Mips::LDC1;
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ return;
+}
+
+MachineInstr *MipsInstrInfo::
+foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops, int FI) const
+{
+ if (Ops.size() != 1) return NULL;
+
+ MachineInstr *NewMI = NULL;
+
+ switch (MI->getOpcode()) {
+ case Mips::ADDu:
+ if ((MI->getOperand(0).isReg()) &&
+ (MI->getOperand(1).isReg()) &&
+ (MI->getOperand(1).getReg() == Mips::ZERO) &&
+ (MI->getOperand(2).isReg())) {
+ if (Ops[0] == 0) { // COPY -> STORE
+ unsigned SrcReg = MI->getOperand(2).getReg();
+ bool isKill = MI->getOperand(2).isKill();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(Mips::SW))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addImm(0).addFrameIndex(FI);
+ } else { // COPY -> LOAD
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(Mips::LW))
+ .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
+ .addImm(0).addFrameIndex(FI);
+ }
+ }
+ break;
+ case Mips::FMOV_S32:
+ case Mips::FMOV_D32:
+ if ((MI->getOperand(0).isReg()) &&
+ (MI->getOperand(1).isReg())) {
+ const TargetRegisterClass
+ *RC = RI.getRegClass(MI->getOperand(0).getReg());
+ unsigned StoreOpc, LoadOpc;
+
+ if (RC == Mips::FGR32RegisterClass) {
+ LoadOpc = Mips::LWC1; StoreOpc = Mips::SWC1;
+ } else {
+ assert(RC == Mips::AFGR64RegisterClass);
+ LoadOpc = Mips::LDC1; StoreOpc = Mips::SDC1;
+ }
+
+ if (Ops[0] == 0) { // COPY -> STORE
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(StoreOpc))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addImm(0).addFrameIndex(FI) ;
+ } else { // COPY -> LOAD
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(LoadOpc))
+ .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
+ .addImm(0).addFrameIndex(FI);
+ }
+ }
+ break;
+ }
+
+ return NewMI;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+/// GetCondFromBranchOpc - Return the Mips CC that matches
+/// the corresponding branch instruction opcode.
+static Mips::CondCode GetCondFromBranchOpc(unsigned BrOpc)
+{
+ switch (BrOpc) {
+ default: return Mips::COND_INVALID;
+ case Mips::BEQ : return Mips::COND_E;
+ case Mips::BNE : return Mips::COND_NE;
+ case Mips::BGTZ : return Mips::COND_GZ;
+ case Mips::BGEZ : return Mips::COND_GEZ;
+ case Mips::BLTZ : return Mips::COND_LZ;
+ case Mips::BLEZ : return Mips::COND_LEZ;
+
+  // We don't do FP branch analysis yet!
+ case Mips::BC1T :
+ case Mips::BC1F : return Mips::COND_INVALID;
+ }
+}
+
+/// GetCondBranchFromCond - Return the Branch instruction
+/// opcode that matches the cc.
+unsigned Mips::GetCondBranchFromCond(Mips::CondCode CC)
+{
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case Mips::COND_E : return Mips::BEQ;
+ case Mips::COND_NE : return Mips::BNE;
+ case Mips::COND_GZ : return Mips::BGTZ;
+ case Mips::COND_GEZ : return Mips::BGEZ;
+ case Mips::COND_LZ : return Mips::BLTZ;
+ case Mips::COND_LEZ : return Mips::BLEZ;
+
+ case Mips::FCOND_F:
+ case Mips::FCOND_UN:
+ case Mips::FCOND_EQ:
+ case Mips::FCOND_UEQ:
+ case Mips::FCOND_OLT:
+ case Mips::FCOND_ULT:
+ case Mips::FCOND_OLE:
+ case Mips::FCOND_ULE:
+ case Mips::FCOND_SF:
+ case Mips::FCOND_NGLE:
+ case Mips::FCOND_SEQ:
+ case Mips::FCOND_NGL:
+ case Mips::FCOND_LT:
+ case Mips::FCOND_NGE:
+ case Mips::FCOND_LE:
+ case Mips::FCOND_NGT: return Mips::BC1T;
+
+ case Mips::FCOND_T:
+ case Mips::FCOND_OR:
+ case Mips::FCOND_NEQ:
+ case Mips::FCOND_OGL:
+ case Mips::FCOND_UGE:
+ case Mips::FCOND_OGE:
+ case Mips::FCOND_UGT:
+ case Mips::FCOND_OGT:
+ case Mips::FCOND_ST:
+ case Mips::FCOND_GLE:
+ case Mips::FCOND_SNE:
+ case Mips::FCOND_GL:
+ case Mips::FCOND_NLT:
+ case Mips::FCOND_GE:
+ case Mips::FCOND_NLE:
+ case Mips::FCOND_GT: return Mips::BC1F;
+ }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified
+/// condition, e.g. turning COND_E to COND_NE.
+Mips::CondCode Mips::GetOppositeBranchCondition(Mips::CondCode CC)
+{
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case Mips::COND_E : return Mips::COND_NE;
+ case Mips::COND_NE : return Mips::COND_E;
+ case Mips::COND_GZ : return Mips::COND_LEZ;
+ case Mips::COND_GEZ : return Mips::COND_LZ;
+ case Mips::COND_LZ : return Mips::COND_GEZ;
+ case Mips::COND_LEZ : return Mips::COND_GZ;
+ case Mips::FCOND_F : return Mips::FCOND_T;
+ case Mips::FCOND_UN : return Mips::FCOND_OR;
+ case Mips::FCOND_EQ : return Mips::FCOND_NEQ;
+ case Mips::FCOND_UEQ: return Mips::FCOND_OGL;
+ case Mips::FCOND_OLT: return Mips::FCOND_UGE;
+ case Mips::FCOND_ULT: return Mips::FCOND_OGE;
+ case Mips::FCOND_OLE: return Mips::FCOND_UGT;
+ case Mips::FCOND_ULE: return Mips::FCOND_OGT;
+ case Mips::FCOND_SF: return Mips::FCOND_ST;
+ case Mips::FCOND_NGLE:return Mips::FCOND_GLE;
+ case Mips::FCOND_SEQ: return Mips::FCOND_SNE;
+ case Mips::FCOND_NGL: return Mips::FCOND_GL;
+ case Mips::FCOND_LT: return Mips::FCOND_NLT;
+ case Mips::FCOND_NGE: return Mips::FCOND_GE;
+ case Mips::FCOND_LE: return Mips::FCOND_NLE;
+ case Mips::FCOND_NGT: return Mips::FCOND_GT;
+ }
+}
+
+bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const
+{
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ unsigned LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (!LastInst->getDesc().isBranch())
+ return true;
+
+ // Unconditional branch
+ if (LastOpc == Mips::J) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+    Mips::CondCode BranchCode = GetCondFromBranchOpc(LastOpc);
+    if (BranchCode == Mips::COND_INVALID)
+      return true; // Can't handle indirect branch.
+
+    // Conditional branch
+    // Block ends with fall-through condbranch.
+    if (BranchCode != Mips::COND_INVALID) {
+ int LastNumOp = LastInst->getNumOperands();
+
+ TBB = LastInst->getOperand(LastNumOp-1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+
+ for (int i=0; i<LastNumOp-1; i++) {
+ Cond.push_back(LastInst->getOperand(i));
+ }
+
+ return false;
+ }
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ return true;
+
+ // If the block ends with Mips::J and a Mips::BNE/Mips::BEQ, handle it.
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+ Mips::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc);
+
+ if (BranchCode != Mips::COND_INVALID && LastOpc == Mips::J) {
+ int SecondNumOp = SecondLastInst->getNumOperands();
+
+ TBB = SecondLastInst->getOperand(SecondNumOp-1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+
+ for (int i=0; i<SecondNumOp-1; i++) {
+ Cond.push_back(SecondLastInst->getOperand(i));
+ }
+
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The last
+ // one is not executed, so remove it.
+ if ((SecondLastOpc == Mips::J) && (LastOpc == Mips::J)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned MipsInstrInfo::
+InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc argument
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 3 || Cond.size() == 2 || Cond.size() == 0) &&
+ "Mips branch conditions can have two|three components!");
+
+ if (FBB == 0) { // One way branch.
+ if (Cond.empty()) {
+ // Unconditional branch?
+ BuildMI(&MBB, dl, get(Mips::J)).addMBB(TBB);
+ } else {
+ // Conditional branch.
+ unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm());
+ const TargetInstrDesc &TID = get(Opc);
+
+ if (TID.getNumOperands() == 3)
+ BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg())
+ .addReg(Cond[2].getReg())
+ .addMBB(TBB);
+ else
+ BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg())
+ .addMBB(TBB);
+
+ }
+ return 1;
+ }
+
+ // Two-way Conditional branch.
+ unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm());
+ const TargetInstrDesc &TID = get(Opc);
+
+ if (TID.getNumOperands() == 3)
+ BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg()).addReg(Cond[2].getReg())
+ .addMBB(TBB);
+ else
+ BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg()).addMBB(TBB);
+
+ BuildMI(&MBB, dl, get(Mips::J)).addMBB(FBB);
+ return 2;
+}
+
+unsigned MipsInstrInfo::
+RemoveBranch(MachineBasicBlock &MBB) const
+{
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (I->getOpcode() != Mips::J &&
+ GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID)
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID)
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+/// BlockHasNoFallThrough - Return true if the MachineBasicBlock cannot
+/// fall through into its successor block.
+bool MipsInstrInfo::
+BlockHasNoFallThrough(const MachineBasicBlock &MBB) const
+{
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case Mips::RET: // Return.
+ case Mips::JR: // Indirect branch.
+ case Mips::J: // Uncond branch.
+ return true;
+ default: return false;
+ }
+}
+
+/// ReverseBranchCondition - Invert the branch condition in Cond in place,
+/// turning it into the condition of the opposite branch.
+bool MipsInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
+{
+ assert( (Cond.size() == 3 || Cond.size() == 2) &&
+ "Invalid Mips branch condition!");
+ Cond[0].setImm(GetOppositeBranchCondition((Mips::CondCode)Cond[0].getImm()));
+ return false;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
new file mode 100644
index 0000000..334244e
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -0,0 +1,223 @@
+//===- MipsInstrInfo.h - Mips Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSINSTRUCTIONINFO_H
+#define MIPSINSTRUCTIONINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+
+namespace Mips {
+
+ // Mips Branch Codes
+ enum FPBranchCode {
+ BRANCH_F,
+ BRANCH_T,
+ BRANCH_FL,
+ BRANCH_TL,
+ BRANCH_INVALID
+ };
+
+ // Mips Condition Codes
+ enum CondCode {
+ // To be used with float branch True
+ FCOND_F,
+ FCOND_UN,
+ FCOND_EQ,
+ FCOND_UEQ,
+ FCOND_OLT,
+ FCOND_ULT,
+ FCOND_OLE,
+ FCOND_ULE,
+ FCOND_SF,
+ FCOND_NGLE,
+ FCOND_SEQ,
+ FCOND_NGL,
+ FCOND_LT,
+ FCOND_NGE,
+ FCOND_LE,
+ FCOND_NGT,
+
+ // To be used with float branch False
+    // These conditions have the same mnemonics as the
+    // ones above, but are used with a branch on False;
+ FCOND_T,
+ FCOND_OR,
+ FCOND_NEQ,
+ FCOND_OGL,
+ FCOND_UGE,
+ FCOND_OGE,
+ FCOND_UGT,
+ FCOND_OGT,
+ FCOND_ST,
+ FCOND_GLE,
+ FCOND_SNE,
+ FCOND_GL,
+ FCOND_NLT,
+ FCOND_GE,
+ FCOND_NLE,
+ FCOND_GT,
+
+ // Only integer conditions
+ COND_E,
+ COND_GZ,
+ COND_GEZ,
+ COND_LZ,
+ COND_LEZ,
+ COND_NE,
+ COND_INVALID
+ };
+
+ // Turn condition code into conditional branch opcode.
+ unsigned GetCondBranchFromCond(CondCode CC);
+
+ /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+ /// e.g. turning COND_E to COND_NE.
+ CondCode GetOppositeBranchCondition(Mips::CondCode CC);
+
+  /// MipsFCCToString - Map each FP condition code to its assembly mnemonic.
+ inline static const char *MipsFCCToString(Mips::CondCode CC)
+ {
+ switch (CC) {
+ default: assert(0 && "Unknown condition code");
+ case FCOND_F:
+ case FCOND_T: return "f";
+ case FCOND_UN:
+ case FCOND_OR: return "un";
+ case FCOND_EQ:
+ case FCOND_NEQ: return "eq";
+ case FCOND_UEQ:
+ case FCOND_OGL: return "ueq";
+ case FCOND_OLT:
+ case FCOND_UGE: return "olt";
+ case FCOND_ULT:
+ case FCOND_OGE: return "ult";
+ case FCOND_OLE:
+ case FCOND_UGT: return "ole";
+ case FCOND_ULE:
+ case FCOND_OGT: return "ule";
+ case FCOND_SF:
+ case FCOND_ST: return "sf";
+ case FCOND_NGLE:
+ case FCOND_GLE: return "ngle";
+ case FCOND_SEQ:
+ case FCOND_SNE: return "seq";
+ case FCOND_NGL:
+ case FCOND_GL: return "ngl";
+ case FCOND_LT:
+ case FCOND_NLT: return "lt";
+    case FCOND_NGE:
+    case FCOND_GE:   return "nge";
+    case FCOND_LE:
+    case FCOND_NLE:  return "le";
+    case FCOND_NGT:
+    case FCOND_GT:   return "ngt";
+ }
+ }
+}
+
+class MipsInstrInfo : public TargetInstrInfoImpl {
+ MipsTargetMachine &TM;
+ const MipsRegisterInfo RI;
+public:
+ explicit MipsInstrInfo(MipsTargetMachine &TM);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const MipsRegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the stored stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// Branch Analysis
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ return 0;
+ }
+
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  /// Insert a nop instruction when a hazard condition is found.
+ virtual void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
new file mode 100644
index 0000000..b9276fe
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -0,0 +1,707 @@
+//===- MipsInstrInfo.td - Mips Instruction defs ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Mips profiles and nodes
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+def SDT_MipsSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>, SDTCisInt<1>]>;
+def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>,
+ SDTCisInt<4>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+// Call
+def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, [SDNPHasChain,
+ SDNPOutFlag]>;
+
+// Hi and Lo nodes are used to handle global addresses. Used in
+// MipsISelLowering to lower symbols such as GlobalAddress and
+// ExternalSymbol in the static model. (Nothing to do with the Mips
+// HI and LO registers.)
+def MipsHi : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
+def MipsLo : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
+def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
+
+// Return
+def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain,
+ SDNPOptInFlag]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Select Condition Code
+def MipsSelectCC : SDNode<"MipsISD::SelectCC", SDT_MipsSelectCC>;
+
+// Conditional Move
+def MipsCMov : SDNode<"MipsISD::CMov", SDT_MipsCMov>;
+
+//===----------------------------------------------------------------------===//
+// Mips Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasSEInReg : Predicate<"Subtarget.hasSEInReg()">;
+def HasBitCount : Predicate<"Subtarget.hasBitCount()">;
+def HasSwap : Predicate<"Subtarget.hasSwap()">;
+def HasCondMov : Predicate<"Subtarget.hasCondMov()">;
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Instruction operand types
+def brtarget : Operand<OtherVT>;
+def calltarget : Operand<i32>;
+def simm16 : Operand<i32>;
+def shamt : Operand<i32>;
+
+// Unsigned Operand
+def uimm16 : Operand<i32> {
+ let PrintMethod = "printUnsignedImm";
+}
+
+// Address operand
+def mem : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops simm16, CPURegs);
+}
+
+// Transformation Function - get the lower 16 bits.
+def LO16 : SDNodeXForm<imm, [{
+ return getI32Imm((unsigned)N->getZExtValue() & 0xFFFF);
+}]>;
+
+// Transformation Function - get the higher 16 bits.
+def HI16 : SDNodeXForm<imm, [{
+ return getI32Imm((unsigned)N->getZExtValue() >> 16);
+}]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt16 : PatLeaf<(imm), [{
+ if (N->getValueType(0) == MVT::i32)
+ return (int32_t)N->getZExtValue() == (short)N->getZExtValue();
+ else
+ return (int64_t)N->getZExtValue() == (short)N->getZExtValue();
+}]>;
+
+// Node immediate fits as 16-bit zero extended on target immediate.
+// The LO16 transform means that only the lower 16 bits of the node
+// immediate are used.
+// e.g. addiu, sltiu
+def immZExt16 : PatLeaf<(imm), [{
+ if (N->getValueType(0) == MVT::i32)
+ return (uint32_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+ else
+ return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// shamt field must fit in 5 bits.
+def immZExt5 : PatLeaf<(imm), [{
+ return N->getZExtValue() == ((N->getZExtValue()) & 0x1f) ;
+}]>;
+
+// Mips addressing mode. The SDNode frameindex could possibly be a match,
+// since load and store instructions that access the stack use it.
+def addr : ComplexPattern<i32, 2, "SelectAddr", [frameindex], []>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic 3 register operands
+let isCommutable = 1 in
+class ArithR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
+ InstrItinClass itin>:
+ FR< op,
+ func,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, CPURegs:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin>;
+
+let isCommutable = 1 in
+class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm>:
+ FR< op,
+ func,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, CPURegs:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [], IIAlu>;
+
+// Arithmetic 2 register operands
+class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
+ Operand Od, PatLeaf imm_type> :
+ FI< op,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, Od:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (OpNode CPURegs:$b, imm_type:$c))], IIAlu>;
+
+class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode,
+ Operand Od, PatLeaf imm_type> :
+ FI< op,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, Od:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [], IIAlu>;
+
+// Arithmetic Multiply ADD/SUB
+let rd=0 in
+class MArithR<bits<6> func, string instr_asm> :
+ FR< 0x1c,
+ func,
+ (outs CPURegs:$rs),
+ (ins CPURegs:$rt),
+ !strconcat(instr_asm, "\t$rs, $rt"),
+ [], IIImul>;
+
+// Logical
+class LogicR<bits<6> func, string instr_asm, SDNode OpNode>:
+ FR< 0x00,
+ func,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, CPURegs:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu>;
+
+class LogicI<bits<6> op, string instr_asm, SDNode OpNode>:
+ FI< op,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, uimm16:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt16:$c))], IIAlu>;
+
+class LogicNOR<bits<6> op, bits<6> func, string instr_asm>:
+ FR< op,
+ func,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, CPURegs:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (not (or CPURegs:$b, CPURegs:$c)))], IIAlu>;
+
+// Shifts
+let rt = 0 in
+class LogicR_shift_imm<bits<6> func, string instr_asm, SDNode OpNode>:
+ FR< 0x00,
+ func,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, shamt:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt5:$c))], IIAlu>;
+
+class LogicR_shift_reg<bits<6> func, string instr_asm, SDNode OpNode>:
+ FR< 0x00,
+ func,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, CPURegs:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu>;
+
+// Load Upper Immediate
+class LoadUpper<bits<6> op, string instr_asm>:
+ FI< op,
+ (outs CPURegs:$dst),
+ (ins uimm16:$imm),
+ !strconcat(instr_asm, "\t$dst, $imm"),
+ [], IIAlu>;
+
+// Memory Load/Store
+let canFoldAsLoad = 1, hasDelaySlot = 1 in
+class LoadM<bits<6> op, string instr_asm, PatFrag OpNode>:
+ FI< op,
+ (outs CPURegs:$dst),
+ (ins mem:$addr),
+ !strconcat(instr_asm, "\t$dst, $addr"),
+ [(set CPURegs:$dst, (OpNode addr:$addr))], IILoad>;
+
+class StoreM<bits<6> op, string instr_asm, PatFrag OpNode>:
+ FI< op,
+ (outs),
+ (ins CPURegs:$dst, mem:$addr),
+ !strconcat(instr_asm, "\t$dst, $addr"),
+ [(OpNode CPURegs:$dst, addr:$addr)], IIStore>;
+
+// Conditional Branch
+let isBranch = 1, isTerminator=1, hasDelaySlot = 1 in {
+class CBranch<bits<6> op, string instr_asm, PatFrag cond_op>:
+ FI< op,
+ (outs),
+ (ins CPURegs:$a, CPURegs:$b, brtarget:$offset),
+ !strconcat(instr_asm, "\t$a, $b, $offset"),
+ [(brcond (cond_op CPURegs:$a, CPURegs:$b), bb:$offset)],
+ IIBranch>;
+
+
+class CBranchZero<bits<6> op, string instr_asm, PatFrag cond_op>:
+ FI< op,
+ (outs),
+ (ins CPURegs:$src, brtarget:$offset),
+ !strconcat(instr_asm, "\t$src, $offset"),
+ [(brcond (cond_op CPURegs:$src, 0), bb:$offset)],
+ IIBranch>;
+}
+
+// SetCC
+class SetCC_R<bits<6> op, bits<6> func, string instr_asm,
+ PatFrag cond_op>:
+ FR< op,
+ func,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, CPURegs:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (cond_op CPURegs:$b, CPURegs:$c))],
+ IIAlu>;
+
+class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op,
+ Operand Od, PatLeaf imm_type>:
+ FI< op,
+ (outs CPURegs:$dst),
+ (ins CPURegs:$b, Od:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPURegs:$dst, (cond_op CPURegs:$b, imm_type:$c))],
+ IIAlu>;
+
+// Unconditional branch
+let isBranch=1, isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
+class JumpFJ<bits<6> op, string instr_asm>:
+ FJ< op,
+ (outs),
+ (ins brtarget:$target),
+ !strconcat(instr_asm, "\t$target"),
+ [(br bb:$target)], IIBranch>;
+
+let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1 in
+class JumpFR<bits<6> op, bits<6> func, string instr_asm>:
+ FR< op,
+ func,
+ (outs),
+ (ins CPURegs:$target),
+ !strconcat(instr_asm, "\t$target"),
+ [(brind CPURegs:$target)], IIBranch>;
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=1,
+ // All calls clobber the non-callee saved registers...
+ Defs = [AT, V0, V1, A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9,
+ K0, K1, F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13,
+ F14, F15, F16, F17, F18, F19], Uses = [GP] in {
+ class JumpLink<bits<6> op, string instr_asm>:
+ FJ< op,
+ (outs),
+ (ins calltarget:$target),
+ !strconcat(instr_asm, "\t$target"),
+ [(MipsJmpLink imm:$target)], IIBranch>;
+
+ let rd=31 in
+ class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm>:
+ FR< op,
+ func,
+ (outs),
+ (ins CPURegs:$rs),
+ !strconcat(instr_asm, "\t$rs"),
+ [(MipsJmpLink CPURegs:$rs)], IIBranch>;
+
+ class BranchLink<string instr_asm>:
+ FI< 0x1,
+ (outs),
+ (ins CPURegs:$rs, brtarget:$target),
+ !strconcat(instr_asm, "\t$rs, $target"),
+ [], IIBranch>;
+}
+
+// Mul, Div
+class MulDiv<bits<6> func, string instr_asm, InstrItinClass itin>:
+ FR< 0x00,
+ func,
+ (outs),
+ (ins CPURegs:$a, CPURegs:$b),
+ !strconcat(instr_asm, "\t$a, $b"),
+ [], itin>;
+
+// Move from Hi/Lo
+class MoveFromLOHI<bits<6> func, string instr_asm>:
+ FR< 0x00,
+ func,
+ (outs CPURegs:$dst),
+ (ins),
+ !strconcat(instr_asm, "\t$dst"),
+ [], IIHiLo>;
+
+class MoveToLOHI<bits<6> func, string instr_asm>:
+ FR< 0x00,
+ func,
+ (outs),
+ (ins CPURegs:$src),
+ !strconcat(instr_asm, "\t$src"),
+ [], IIHiLo>;
+
+class EffectiveAddress<string instr_asm> :
+ FI<0x09,
+ (outs CPURegs:$dst),
+ (ins mem:$addr),
+ instr_asm,
+ [(set CPURegs:$dst, addr:$addr)], IIAlu>;
+
+// Count Leading Ones/Zeros in Word
+class CountLeading<bits<6> func, string instr_asm, SDNode CountOp>:
+ FR< 0x1c, func, (outs CPURegs:$dst), (ins CPURegs:$src),
+ !strconcat(instr_asm, "\t$dst, $src"),
+ [(set CPURegs:$dst, (CountOp CPURegs:$src))], IIAlu>;
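+
+// For example, clz of 0x0000FFFF yields 16, since the upper 16 bits are zero.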
+
+// Sign Extend in Register.
+class SignExtInReg<bits<6> func, string instr_asm, ValueType vt>:
+ FR< 0x3f, func, (outs CPURegs:$dst), (ins CPURegs:$src),
+ !strconcat(instr_asm, "\t$dst, $src"),
+ [(set CPURegs:$dst, (sext_inreg CPURegs:$src, vt))], NoItinerary>;
+
+// Byte Swap
+class ByteSwap<bits<6> func, string instr_asm>:
+ FR< 0x1f, func, (outs CPURegs:$dst), (ins CPURegs:$src),
+ !strconcat(instr_asm, "\t$dst, $src"),
+ [(set CPURegs:$dst, (bswap CPURegs:$src))], NoItinerary>;
+
+// Conditional Move
+class CondMov<bits<6> func, string instr_asm, PatLeaf MovCode>:
+ FR< 0x00, func, (outs CPURegs:$dst), (ins CPURegs:$F, CPURegs:$T,
+ CPURegs:$cond), !strconcat(instr_asm, "\t$dst, $T, $cond"),
+ [(set CPURegs:$dst, (MipsCMov CPURegs:$F, CPURegs:$T,
+ CPURegs:$cond, MovCode))], NoItinerary>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// As stack alignment is always done with addiu, we need a 16-bit immediate
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins uimm16:$amt),
+ "!ADJCALLSTACKDOWN $amt",
+ [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : MipsPseudo<(outs), (ins uimm16:$amt1, uimm16:$amt2),
+ "!ADJCALLSTACKUP $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+// Some assembly macros need to avoid pseudoinstructions and automatic
+// assembler reordering; we should reorder ourselves.
+def MACRO : MipsPseudo<(outs), (ins), ".set\tmacro", []>;
+def REORDER : MipsPseudo<(outs), (ins), ".set\treorder", []>;
+def NOMACRO : MipsPseudo<(outs), (ins), ".set\tnomacro", []>;
+def NOREORDER : MipsPseudo<(outs), (ins), ".set\tnoreorder", []>;
+
+// When handling PIC code the assembler needs .cpload and .cprestore
+// directives. If the real instructions corresponding to these directives
+// are used, we get the same behavior, but also a bunch of warnings
+// from the assembler.
+def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>;
+def CPRESTORE : MipsPseudo<(outs), (ins uimm16:$loc), ".cprestore\t$loc\n", []>;
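+
+// For reference, the assembler typically expands ".cpload $25" to roughly:
+//   lui   $gp, %hi(_gp_disp)
+//   addiu $gp, $gp, %lo(_gp_disp)
+//   addu  $gp, $gp, $25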
+
+// The supported Mips ISAs don't have any instruction close to the SELECT_CC
+// operation. The solution is to create a Mips pseudo SELECT_CC instruction
+// (MipsSelectCC), use LowerSELECT_CC to generate this instruction, and
+// finally replace it with real supported nodes in EmitInstrWithCustomInserter.
+let usesCustomDAGSchedInserter = 1 in {
+ class PseudoSelCC<RegisterClass RC, string asmstr>:
+ MipsPseudo<(outs RC:$dst), (ins CPURegs:$CmpRes, RC:$T, RC:$F), asmstr,
+ [(set RC:$dst, (MipsSelectCC CPURegs:$CmpRes, RC:$T, RC:$F))]>;
+}
+
+def Select_CC : PseudoSelCC<CPURegs, "# MipsSelect_CC_i32">;
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsI Instructions
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+def ADDiu : ArithI<0x09, "addiu", add, simm16, immSExt16>;
+def ADDi : ArithOverflowI<0x08, "addi", add, simm16, immSExt16>;
+def SLTi : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16>;
+def SLTiu : SetCC_I<0x0b, "sltiu", setult, simm16, immSExt16>;
+def ANDi : LogicI<0x0c, "andi", and>;
+def ORi : LogicI<0x0d, "ori", or>;
+def XORi : LogicI<0x0e, "xori", xor>;
+def LUi : LoadUpper<0x0f, "lui">;
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+def ADDu : ArithR<0x00, 0x21, "addu", add, IIAlu>;
+def SUBu : ArithR<0x00, 0x23, "subu", sub, IIAlu>;
+def ADD : ArithOverflowR<0x00, 0x20, "add">;
+def SUB : ArithOverflowR<0x00, 0x22, "sub">;
+def SLT : SetCC_R<0x00, 0x2a, "slt", setlt>;
+def SLTu : SetCC_R<0x00, 0x2b, "sltu", setult>;
+def AND : LogicR<0x24, "and", and>;
+def OR : LogicR<0x25, "or", or>;
+def XOR : LogicR<0x26, "xor", xor>;
+def NOR : LogicNOR<0x00, 0x27, "nor">;
+
+/// Shift Instructions
+def SLL : LogicR_shift_imm<0x00, "sll", shl>;
+def SRL : LogicR_shift_imm<0x02, "srl", srl>;
+def SRA : LogicR_shift_imm<0x03, "sra", sra>;
+def SLLV : LogicR_shift_reg<0x04, "sllv", shl>;
+def SRLV : LogicR_shift_reg<0x06, "srlv", srl>;
+def SRAV : LogicR_shift_reg<0x07, "srav", sra>;
+
+/// Load and Store Instructions
+def LB : LoadM<0x20, "lb", sextloadi8>;
+def LBu : LoadM<0x24, "lbu", zextloadi8>;
+def LH : LoadM<0x21, "lh", sextloadi16>;
+def LHu : LoadM<0x25, "lhu", zextloadi16>;
+def LW : LoadM<0x23, "lw", load>;
+def SB : StoreM<0x28, "sb", truncstorei8>;
+def SH : StoreM<0x29, "sh", truncstorei16>;
+def SW : StoreM<0x2b, "sw", store>;
+
+/// Jump and Branch Instructions
+def J : JumpFJ<0x02, "j">;
+def JR : JumpFR<0x00, 0x08, "jr">;
+def JAL : JumpLink<0x03, "jal">;
+def JALR : JumpLinkReg<0x00, 0x09, "jalr">;
+def BEQ : CBranch<0x04, "beq", seteq>;
+def BNE : CBranch<0x05, "bne", setne>;
+
+let rt=1 in
+ def BGEZ : CBranchZero<0x01, "bgez", setge>;
+
+let rt=0 in {
+ def BGTZ : CBranchZero<0x07, "bgtz", setgt>;
+ def BLEZ : CBranchZero<0x07, "blez", setle>;
+ def BLTZ : CBranchZero<0x01, "bltz", setlt>;
+}
+
+def BGEZAL : BranchLink<"bgezal">;
+def BLTZAL : BranchLink<"bltzal">;
+
+let isReturn=1, isTerminator=1, hasDelaySlot=1,
+ isBarrier=1, hasCtrlDep=1, rs=0, rt=0, shamt=0 in
+ def RET : FR <0x00, 0x02, (outs), (ins CPURegs:$target),
+ "jr\t$target", [(MipsRet CPURegs:$target)], IIBranch>;
+
+/// Multiply and Divide Instructions.
+let Defs = [HI, LO] in {
+ def MULT : MulDiv<0x18, "mult", IIImul>;
+ def MULTu : MulDiv<0x19, "multu", IIImul>;
+ def DIV : MulDiv<0x1a, "div", IIIdiv>;
+ def DIVu : MulDiv<0x1b, "divu", IIIdiv>;
+}
+
+let Defs = [HI] in
+ def MTHI : MoveToLOHI<0x11, "mthi">;
+let Defs = [LO] in
+ def MTLO : MoveToLOHI<0x13, "mtlo">;
+
+let Uses = [HI] in
+ def MFHI : MoveFromLOHI<0x10, "mfhi">;
+let Uses = [LO] in
+ def MFLO : MoveFromLOHI<0x12, "mflo">;
+
+/// Sign Ext In Register Instructions.
+let Predicates = [HasSEInReg] in {
+ let shamt = 0x10, rs = 0 in
+ def SEB : SignExtInReg<0x21, "seb", i8>;
+
+ let shamt = 0x18, rs = 0 in
+ def SEH : SignExtInReg<0x20, "seh", i16>;
+}
+
+/// Count Leading
+let Predicates = [HasBitCount] in {
+ let rt = 0 in
+ def CLZ : CountLeading<0b010110, "clz", ctlz>;
+}
+
+/// Byte Swap
+let Predicates = [HasSwap] in {
+ let shamt = 0x3, rs = 0 in
+ def WSBW : ByteSwap<0x20, "wsbw">;
+}
+
+/// Conditional Move
+def MIPS_CMOV_ZERO : PatLeaf<(i32 0)>;
+def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>;
+
+let Predicates = [HasCondMov], isTwoAddress = 1 in {
+ def MOVN : CondMov<0x0a, "movn", MIPS_CMOV_NZERO>;
+ def MOVZ : CondMov<0x0b, "movz", MIPS_CMOV_ZERO>;
+}
+
+/// No operation
+let addr=0 in
+ def NOP : FJ<0, (outs), (ins), "nop", [], IIAlu>;
+
+// FrameIndexes are legalized when they are operands of load/store
+// instructions. The same does not happen for stack address copies, so an
+// add op with a mem ComplexPattern is used so that the stack address copy
+// can be matched. It's similar to Sparc's LEA_ADDRi.
+def LEA_ADDiu : EffectiveAddress<"addiu\t$dst, ${addr:stackloc}">;
+
+// MADD*/MSUB* are not part of MipsI either.
+//def MADD : MArithR<0x00, "madd">;
+//def MADDU : MArithR<0x01, "maddu">;
+//def MSUB : MArithR<0x04, "msub">;
+//def MSUBU : MArithR<0x05, "msubu">;
+
+// MUL is an assembly macro in the currently used ISAs. In recent ISAs
+// it is a real instruction.
+//def MUL : ArithR<0x1c, 0x02, "mul", mul, IIImul>;
+
+//===----------------------------------------------------------------------===//
+// Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Small immediates
+def : Pat<(i32 immSExt16:$in),
+ (ADDiu ZERO, imm:$in)>;
+def : Pat<(i32 immZExt16:$in),
+ (ORi ZERO, imm:$in)>;
+
+// Arbitrary immediates
+def : Pat<(i32 imm:$imm),
+ (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>;
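+
+// e.g. materializing 0x12345678 becomes:
+//   lui $dst, 0x1234
+//   ori $dst, $dst, 0x5678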
+
+// Carry patterns
+def : Pat<(subc CPURegs:$lhs, CPURegs:$rhs),
+ (SUBu CPURegs:$lhs, CPURegs:$rhs)>;
+def : Pat<(addc CPURegs:$lhs, CPURegs:$rhs),
+ (ADDu CPURegs:$lhs, CPURegs:$rhs)>;
+def : Pat<(addc CPURegs:$src, imm:$imm),
+ (ADDiu CPURegs:$src, imm:$imm)>;
+
+// Call
+def : Pat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+ (JAL tglobaladdr:$dst)>;
+def : Pat<(MipsJmpLink (i32 texternalsym:$dst)),
+ (JAL texternalsym:$dst)>;
+def : Pat<(MipsJmpLink CPURegs:$dst),
+ (JALR CPURegs:$dst)>;
+
+// hi/lo relocs
+def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
+ (ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
+
+def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)),
+ (ADDiu CPURegs:$hi, tjumptable:$lo)>;
+
+def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)),
+ (ADDiu CPURegs:$hi, tconstpool:$lo)>;
+
+// gp_rel relocs
+def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)),
+ (ADDiu CPURegs:$gp, tglobaladdr:$in)>;
+def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)),
+ (ADDiu CPURegs:$gp, tconstpool:$in)>;
+
+// Mips does not have a "not" instruction, so we expand it ourselves.
+def : Pat<(not CPURegs:$in),
+ (NOR CPURegs:$in, ZERO)>;
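+
+// i.e. not $a is emitted as: nor $dst, $a, $zero.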
+
+// extended load and stores
+def : Pat<(extloadi1 addr:$src), (LBu addr:$src)>;
+def : Pat<(extloadi8 addr:$src), (LBu addr:$src)>;
+def : Pat<(extloadi16 addr:$src), (LHu addr:$src)>;
+
+// peepholes
+def : Pat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+
+// brcond patterns
+def : Pat<(brcond (setne CPURegs:$lhs, 0), bb:$dst),
+ (BNE CPURegs:$lhs, ZERO, bb:$dst)>;
+def : Pat<(brcond (seteq CPURegs:$lhs, 0), bb:$dst),
+ (BEQ CPURegs:$lhs, ZERO, bb:$dst)>;
+
+def : Pat<(brcond (setge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+ (BEQ (SLT CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setuge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+ (BEQ (SLTu CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
+ (BEQ (SLTi CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setuge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
+ (BEQ (SLTiu CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
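+
+// As a concrete illustration of the setge rules above, a branch on
+// (a >= b) is emitted roughly as:
+//   slt $tmp, $a, $b
+//   beq $tmp, $zero, L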
+
+def : Pat<(brcond (setle CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+ (BEQ (SLT CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setule CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+ (BEQ (SLTu CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
+
+def : Pat<(brcond CPURegs:$cond, bb:$dst),
+ (BNE CPURegs:$cond, ZERO, bb:$dst)>;
+
+// select patterns
+def : Pat<(select (setge CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+ (MOVZ CPURegs:$F, CPURegs:$T, (SLT CPURegs:$lhs, CPURegs:$rhs))>;
+def : Pat<(select (setuge CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+ (MOVZ CPURegs:$F, CPURegs:$T, (SLTu CPURegs:$lhs, CPURegs:$rhs))>;
+def : Pat<(select (setge CPURegs:$lhs, immSExt16:$rhs), CPURegs:$T, CPURegs:$F),
+ (MOVZ CPURegs:$F, CPURegs:$T, (SLTi CPURegs:$lhs, immSExt16:$rhs))>;
+def : Pat<(select (setuge CPURegs:$lh, immSExt16:$rh), CPURegs:$T, CPURegs:$F),
+ (MOVZ CPURegs:$F, CPURegs:$T, (SLTiu CPURegs:$lh, immSExt16:$rh))>;
+
+def : Pat<(select (setle CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+ (MOVZ CPURegs:$F, CPURegs:$T, (SLT CPURegs:$rhs, CPURegs:$lhs))>;
+def : Pat<(select (setule CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+ (MOVZ CPURegs:$F, CPURegs:$T, (SLTu CPURegs:$rhs, CPURegs:$lhs))>;
+
+def : Pat<(select (seteq CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+ (MOVZ CPURegs:$F, CPURegs:$T, (XOR CPURegs:$lhs, CPURegs:$rhs))>;
+def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+ (MOVN CPURegs:$F, CPURegs:$T, (XOR CPURegs:$lhs, CPURegs:$rhs))>;
+
+def : Pat<(select CPURegs:$cond, CPURegs:$T, CPURegs:$F),
+ (MOVN CPURegs:$F, CPURegs:$T, CPURegs:$cond)>;
+
+// setcc patterns
+def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs),
+ (SLTu (XOR CPURegs:$lhs, CPURegs:$rhs), 1)>;
+def : Pat<(setne CPURegs:$lhs, CPURegs:$rhs),
+ (SLTu ZERO, (XOR CPURegs:$lhs, CPURegs:$rhs))>;
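+
+// i.e. a == b iff (a ^ b) <u 1, and a != b iff 0 <u (a ^ b).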
+
+def : Pat<(setle CPURegs:$lhs, CPURegs:$rhs),
+ (XORi (SLT CPURegs:$rhs, CPURegs:$lhs), 1)>;
+def : Pat<(setule CPURegs:$lhs, CPURegs:$rhs),
+ (XORi (SLTu CPURegs:$rhs, CPURegs:$lhs), 1)>;
+
+def : Pat<(setgt CPURegs:$lhs, CPURegs:$rhs),
+ (SLT CPURegs:$rhs, CPURegs:$lhs)>;
+def : Pat<(setugt CPURegs:$lhs, CPURegs:$rhs),
+ (SLTu CPURegs:$rhs, CPURegs:$lhs)>;
+
+def : Pat<(setge CPURegs:$lhs, CPURegs:$rhs),
+ (XORi (SLT CPURegs:$lhs, CPURegs:$rhs), 1)>;
+def : Pat<(setuge CPURegs:$lhs, CPURegs:$rhs),
+ (XORi (SLTu CPURegs:$lhs, CPURegs:$rhs), 1)>;
+
+def : Pat<(setge CPURegs:$lhs, immSExt16:$rhs),
+ (XORi (SLTi CPURegs:$lhs, immSExt16:$rhs), 1)>;
+def : Pat<(setuge CPURegs:$lhs, immSExt16:$rhs),
+ (XORi (SLTiu CPURegs:$lhs, immSExt16:$rhs), 1)>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFPU.td"
+
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
new file mode 100644
index 0000000..b95394e
--- /dev/null
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -0,0 +1,131 @@
+//===-- MipsMachineFunction.h - Private data used for Mips --------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS_MACHINE_FUNCTION_INFO_H
+#define MIPS_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+
+namespace llvm {
+
+/// MipsFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private Mips target-specific information for each
+/// MachineFunction.
+class MipsFunctionInfo : public MachineFunctionInfo {
+
+private:
+  /// Holds, for each function, where on the stack the Frame Pointer must be
+  /// saved. This is used in the prologue and epilogue to emit FP save/restore.
+  int FPStackOffset;
+
+  /// Holds, for each function, where on the stack the Return Address must be
+  /// saved. This is used in the prologue and epilogue to emit RA save/restore.
+  int RAStackOffset;
+
+  /// At each function entry, two special bitmask directives must be emitted
+  /// to help debugging, one for the CPU and one for the FPU callee-saved
+  /// registers. Both need the negative offset from the final stack size
+  /// and the location of the highest saved register on the stack.
+ int CPUTopSavedRegOff;
+ int FPUTopSavedRegOff;
+
+  /// MipsFIHolder - Holds a FrameIndex and its Stack Pointer Offset.
+ struct MipsFIHolder {
+
+ int FI;
+ int SPOffset;
+
+ MipsFIHolder(int FrameIndex, int StackPointerOffset)
+ : FI(FrameIndex), SPOffset(StackPointerOffset) {}
+ };
+
+ /// When PIC is used the GP must be saved on the stack on the function
+ /// prologue and must be reloaded from this stack location after every
+ /// call. A reference to its stack location and frame index must be kept
+ /// to be used on emitPrologue and processFunctionBeforeFrameFinalized.
+ MipsFIHolder GPHolder;
+
+  /// In LowerFORMAL_ARGUMENTS the stack size is unknown, so the Stack
+  /// Pointer Offset calculation for arguments not passed in registers
+  /// must be postponed to emitPrologue.
+ SmallVector<MipsFIHolder, 16> FnLoadArgs;
+ bool HasLoadArgs;
+
+  // With VarArgs, we must write the argument registers back to the caller's
+  // stack, preserving the arguments passed in registers. Since the stack
+  // size is unknown in LowerFORMAL_ARGUMENTS, the Stack Pointer Offset
+  // calculation must be postponed to emitPrologue.
+ SmallVector<MipsFIHolder, 4> FnStoreVarArgs;
+ bool HasStoreVarArgs;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+public:
+ MipsFunctionInfo(MachineFunction& MF)
+ : FPStackOffset(0), RAStackOffset(0), CPUTopSavedRegOff(0),
+ FPUTopSavedRegOff(0), GPHolder(-1,-1), HasLoadArgs(false),
+ HasStoreVarArgs(false), SRetReturnReg(0)
+ {}
+
+ int getFPStackOffset() const { return FPStackOffset; }
+ void setFPStackOffset(int Off) { FPStackOffset = Off; }
+
+ int getRAStackOffset() const { return RAStackOffset; }
+ void setRAStackOffset(int Off) { RAStackOffset = Off; }
+
+ int getCPUTopSavedRegOff() const { return CPUTopSavedRegOff; }
+ void setCPUTopSavedRegOff(int Off) { CPUTopSavedRegOff = Off; }
+
+ int getFPUTopSavedRegOff() const { return FPUTopSavedRegOff; }
+ void setFPUTopSavedRegOff(int Off) { FPUTopSavedRegOff = Off; }
+
+ int getGPStackOffset() const { return GPHolder.SPOffset; }
+ int getGPFI() const { return GPHolder.FI; }
+ void setGPStackOffset(int Off) { GPHolder.SPOffset = Off; }
+ void setGPFI(int FI) { GPHolder.FI = FI; }
+
+ bool hasLoadArgs() const { return HasLoadArgs; }
+ bool hasStoreVarArgs() const { return HasStoreVarArgs; }
+
+ void recordLoadArgsFI(int FI, int SPOffset) {
+ if (!HasLoadArgs) HasLoadArgs=true;
+ FnLoadArgs.push_back(MipsFIHolder(FI, SPOffset));
+ }
+ void recordStoreVarArgsFI(int FI, int SPOffset) {
+ if (!HasStoreVarArgs) HasStoreVarArgs=true;
+ FnStoreVarArgs.push_back(MipsFIHolder(FI, SPOffset));
+ }
+
+ void adjustLoadArgsFI(MachineFrameInfo *MFI) const {
+ if (!hasLoadArgs()) return;
+ for (unsigned i = 0, e = FnLoadArgs.size(); i != e; ++i)
+ MFI->setObjectOffset( FnLoadArgs[i].FI, FnLoadArgs[i].SPOffset );
+ }
+ void adjustStoreVarArgsFI(MachineFrameInfo *MFI) const {
+ if (!hasStoreVarArgs()) return;
+ for (unsigned i = 0, e = FnStoreVarArgs.size(); i != e; ++i)
+ MFI->setObjectOffset( FnStoreVarArgs[i].FI, FnStoreVarArgs[i].SPOffset );
+ }
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+};
+
+} // end of namespace llvm
+
+#endif // MIPS_MACHINE_FUNCTION_INFO_H
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
new file mode 100644
index 0000000..579d4db
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -0,0 +1,535 @@
+//===- MipsRegisterInfo.cpp - MIPS Register Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-reg-info"
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsRegisterInfo.h"
+#include "MipsMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace llvm;
+
+MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST,
+ const TargetInstrInfo &tii)
+ : MipsGenRegisterInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+ Subtarget(ST), TII(tii) {}
+
+/// getRegisterNumbering - Given the enum value for some register, e.g.
+/// Mips::RA, return the number that it corresponds to (e.g. 31).
+unsigned MipsRegisterInfo::
+getRegisterNumbering(unsigned RegEnum)
+{
+ switch (RegEnum) {
+ case Mips::ZERO : case Mips::F0 : case Mips::D0 : return 0;
+ case Mips::AT : case Mips::F1 : return 1;
+ case Mips::V0 : case Mips::F2 : case Mips::D1 : return 2;
+ case Mips::V1 : case Mips::F3 : return 3;
+ case Mips::A0 : case Mips::F4 : case Mips::D2 : return 4;
+ case Mips::A1 : case Mips::F5 : return 5;
+ case Mips::A2 : case Mips::F6 : case Mips::D3 : return 6;
+ case Mips::A3 : case Mips::F7 : return 7;
+ case Mips::T0 : case Mips::F8 : case Mips::D4 : return 8;
+ case Mips::T1 : case Mips::F9 : return 9;
+ case Mips::T2 : case Mips::F10: case Mips::D5: return 10;
+ case Mips::T3 : case Mips::F11: return 11;
+ case Mips::T4 : case Mips::F12: case Mips::D6: return 12;
+ case Mips::T5 : case Mips::F13: return 13;
+ case Mips::T6 : case Mips::F14: case Mips::D7: return 14;
+ case Mips::T7 : case Mips::F15: return 15;
+ case Mips::T8 : case Mips::F16: case Mips::D8: return 16;
+ case Mips::T9 : case Mips::F17: return 17;
+ case Mips::S0 : case Mips::F18: case Mips::D9: return 18;
+ case Mips::S1 : case Mips::F19: return 19;
+ case Mips::S2 : case Mips::F20: case Mips::D10: return 20;
+ case Mips::S3 : case Mips::F21: return 21;
+ case Mips::S4 : case Mips::F22: case Mips::D11: return 22;
+ case Mips::S5 : case Mips::F23: return 23;
+ case Mips::S6 : case Mips::F24: case Mips::D12: return 24;
+ case Mips::S7 : case Mips::F25: return 25;
+ case Mips::K0 : case Mips::F26: case Mips::D13: return 26;
+ case Mips::K1 : case Mips::F27: return 27;
+ case Mips::GP : case Mips::F28: case Mips::D14: return 28;
+ case Mips::SP : case Mips::F29: return 29;
+ case Mips::FP : case Mips::F30: case Mips::D15: return 30;
+ case Mips::RA : case Mips::F31: return 31;
+ default: assert(0 && "Unknown register number!");
+ }
+ return 0; // Not reached
+}
+
+unsigned MipsRegisterInfo::getPICCallReg(void) { return Mips::T9; }
+
+//===----------------------------------------------------------------------===//
+// Callee Saved Registers methods
+//===----------------------------------------------------------------------===//
+
+/// Mips Callee Saved Registers
+const unsigned* MipsRegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const
+{
+ // Mips callee-save register range is $16-$23, $f20-$f30
+ static const unsigned SingleFloatOnlyCalleeSavedRegs[] = {
+ Mips::S0, Mips::S1, Mips::S2, Mips::S3,
+ Mips::S4, Mips::S5, Mips::S6, Mips::S7,
+ Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25,
+ Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30, 0
+ };
+
+ static const unsigned BitMode32CalleeSavedRegs[] = {
+ Mips::S0, Mips::S1, Mips::S2, Mips::S3,
+ Mips::S4, Mips::S5, Mips::S6, Mips::S7,
+ Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30,
+ Mips::D10, Mips::D11, Mips::D12, Mips::D13, Mips::D14, Mips::D15,0
+ };
+
+ if (Subtarget.isSingleFloat())
+ return SingleFloatOnlyCalleeSavedRegs;
+ else
+ return BitMode32CalleeSavedRegs;
+}
+
+/// Mips Callee Saved Register Classes
+const TargetRegisterClass* const*
+MipsRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const
+{
+ static const TargetRegisterClass * const SingleFloatOnlyCalleeSavedRC[] = {
+ &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+ &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+ &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+ &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
+ &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
+ &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
+ &Mips::FGR32RegClass, &Mips::FGR32RegClass, 0
+ };
+
+ static const TargetRegisterClass * const BitMode32CalleeSavedRC[] = {
+ &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+ &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+ &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+ &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
+ &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
+ &Mips::AFGR64RegClass, &Mips::AFGR64RegClass, &Mips::AFGR64RegClass,
+ &Mips::AFGR64RegClass, &Mips::AFGR64RegClass, &Mips::AFGR64RegClass, 0
+ };
+
+ if (Subtarget.isSingleFloat())
+ return SingleFloatOnlyCalleeSavedRC;
+ else
+ return BitMode32CalleeSavedRC;
+}
+
+BitVector MipsRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const
+{
+ BitVector Reserved(getNumRegs());
+ Reserved.set(Mips::ZERO);
+ Reserved.set(Mips::AT);
+ Reserved.set(Mips::K0);
+ Reserved.set(Mips::K1);
+ Reserved.set(Mips::GP);
+ Reserved.set(Mips::SP);
+ Reserved.set(Mips::FP);
+ Reserved.set(Mips::RA);
+
+  // SVR4 requires that the odd-numbered FP registers not be used.
+ if (!Subtarget.isSingleFloat())
+ for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2)
+ Reserved.set(FReg);
+
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Stack Frame Processing methods
+// +----------------------------+
+//
+// The stack is allocated by decrementing the stack pointer on the
+// first instruction of a function prologue. Once decremented, all
+// stack references are done through a positive offset from the
+// stack/frame pointer, so the stack is considered to grow up!
+// Otherwise terrible hacks would have to be made to get this stack
+// ABI compliant :)
+//
+// The stack frame required by the ABI (after call):
+// Offset
+//
+// 0 ----------
+// 4 Args to pass
+// . saved $GP (used in PIC)
+// . Alloca allocations
+// . Local Area
+// . CPU "Callee Saved" Registers
+// . saved FP
+// . saved RA
+// . FPU "Callee Saved" Registers
+// StackSize -----------
+//
+// Offset - offset from sp after stack allocation on function prologue
+//
+// The sp is the stack pointer; the stack size is subtracted from/added
+// to it in the prologue/epilogue.
+//
+// References to the previous stack (to obtain arguments) are done
+// with offsets that exceed the stack size: (stacksize + (4 * (num_arg-1)))
+//
+// Examples:
+// - reference to the current stack frame
+//   for any local area variable there is something like: FI >= 0, StackOffset: 4
+//     sw REGX, 4(SP)
+//
+// - reference to the previous stack frame
+//   suppose there's a load of the 5th argument: FI < 0, StackOffset: 16.
+// The emitted instruction will be something like:
+// lw REGX, 16+StackSize(SP)
+//
+// Since the total stack size is unknown in LowerFORMAL_ARGUMENTS, all
+// stack references (ObjectOffset) created to reference the function
+// arguments are negative numbers. This way, in eliminateFrameIndex it's
+// possible to detect those references and adjust the offsets to
+// their real location.
+//
+//===----------------------------------------------------------------------===//
+
+void MipsRegisterInfo::adjustMipsStackFrame(MachineFunction &MF) const
+{
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+
+ // Min and Max CSI FrameIndex.
+ int MinCSFI = -1, MaxCSFI = -1;
+
+ // See the description at MipsMachineFunction.h
+ int TopCPUSavedRegOff = -1, TopFPUSavedRegOff = -1;
+
+  // Replace the dummy '0' SPOffset with the negative offsets, as explained
+  // in LowerFORMAL_ARGUMENTS. Leaving '0' for a while is necessary to keep
+  // calculateFrameObjectOffsets from laying out these objects itself.
+ MipsFI->adjustLoadArgsFI(MFI);
+ MipsFI->adjustStoreVarArgsFI(MFI);
+
+  // It happens that the default stack frame allocation order does not directly
+  // map to the convention used for Mips, so we must fix it. We move the
+  // callee-saved register slots after the local variables area, as described
+  // in the stack frame above.
+ unsigned CalleeSavedAreaSize = 0;
+ if (!CSI.empty()) {
+ MinCSFI = CSI[0].getFrameIdx();
+ MaxCSFI = CSI[CSI.size()-1].getFrameIdx();
+ }
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+ CalleeSavedAreaSize += MFI->getObjectAlignment(CSI[i].getFrameIdx());
+
+ // Adjust local variables. They should come on the stack right
+ // after the arguments.
+ int LastOffsetFI = -1;
+ for (int i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (i >= MinCSFI && i <= MaxCSFI)
+ continue;
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+ unsigned Offset = MFI->getObjectOffset(i) - CalleeSavedAreaSize;
+ if (LastOffsetFI == -1)
+ LastOffsetFI = i;
+ if (Offset > MFI->getObjectOffset(LastOffsetFI))
+ LastOffsetFI = i;
+ MFI->setObjectOffset(i, Offset);
+ }
+
+  // Adjust the CPU Callee Saved Registers Area. Registers RA and FP must
+  // be saved in this CPU Area when they are needed. This whole area must
+  // be aligned to the default Stack Alignment requirements.
+ unsigned StackOffset = 0;
+ unsigned RegSize = Subtarget.isGP32bit() ? 4 : 8;
+
+ if (LastOffsetFI >= 0)
+ StackOffset = MFI->getObjectOffset(LastOffsetFI)+
+ MFI->getObjectSize(LastOffsetFI);
+ StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
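+  // For example (illustrative numbers): with StackOffset = 20 and the usual
+  // StackAlign = 8, the expression above rounds the offset up to 24.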
+
+ for (unsigned i = 0, e = CSI.size(); i != e ; ++i) {
+ if (CSI[i].getRegClass() != Mips::CPURegsRegisterClass)
+ break;
+ MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
+ TopCPUSavedRegOff = StackOffset;
+ StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx());
+ }
+
+ if (hasFP(MF)) {
+ MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize),
+ StackOffset);
+ MipsFI->setFPStackOffset(StackOffset);
+ TopCPUSavedRegOff = StackOffset;
+ StackOffset += RegSize;
+ }
+
+ if (MFI->hasCalls()) {
+ MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize),
+ StackOffset);
+ MipsFI->setRAStackOffset(StackOffset);
+ TopCPUSavedRegOff = StackOffset;
+ StackOffset += RegSize;
+ }
+ StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
+
+ // Adjust FPU Callee Saved Registers Area. This Area must be
+ // aligned to the default Stack Alignment requirements.
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ if (CSI[i].getRegClass() == Mips::CPURegsRegisterClass)
+ continue;
+ MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
+ TopFPUSavedRegOff = StackOffset;
+ StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx());
+ }
+ StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
+
+ // Update frame info
+ MFI->setStackSize(StackOffset);
+
+  // Recalculate the final top offsets. The final values must be '0'
+  // if there isn't a callee saved register for CPU or FPU, otherwise
+  // a negative offset is needed.
+ if (TopCPUSavedRegOff >= 0)
+ MipsFI->setCPUTopSavedRegOff(TopCPUSavedRegOff-StackOffset);
+
+ if (TopFPUSavedRegOff >= 0)
+ MipsFI->setFPUTopSavedRegOff(TopFPUSavedRegOff-StackOffset);
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool MipsRegisterInfo::
+hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+// This function eliminates the ADJCALLSTACKDOWN and
+// ADJCALLSTACKUP pseudo instructions.
+void MipsRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+ MBB.erase(I);
+}
+
+// FrameIndexes represent objects inside an abstract stack.
+// We must replace each FrameIndex with a direct stack/frame
+// pointer reference.
+void MipsRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ RegScavenger *RS) const
+{
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+
+ unsigned i = 0;
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() &&
+ "Instr doesn't have FrameIndex operand!");
+ }
+
+ #ifndef NDEBUG
+ DOUT << "\nFunction : " << MF.getFunction()->getName() << "\n";
+ DOUT << "<--------->\n";
+ MI.print(DOUT);
+ #endif
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+ int stackSize = MF.getFrameInfo()->getStackSize();
+ int spOffset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+ #ifndef NDEBUG
+ DOUT << "FrameIndex : " << FrameIndex << "\n";
+ DOUT << "spOffset : " << spOffset << "\n";
+ DOUT << "stackSize : " << stackSize << "\n";
+ #endif
+
+  // As explained in LowerFORMAL_ARGUMENTS, detect negative offsets
+  // and adjust SPOffsets considering the final stack size.
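+  // Illustrative example: if the 5th argument was given the dummy
+  // spOffset = -20 and the final stackSize is 40, the line below computes
+  // 40 + (-(-20 + 4)) = 56, i.e. 16 + StackSize, as in the 'lw' example
+  // in the stack frame description above.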
+ int Offset = ((spOffset < 0) ? (stackSize + (-(spOffset+4))) : (spOffset));
+ Offset += MI.getOperand(i-1).getImm();
+
+ #ifndef NDEBUG
+ DOUT << "Offset : " << Offset << "\n";
+ DOUT << "<--------->\n";
+ #endif
+
+ MI.getOperand(i-1).ChangeToImmediate(Offset);
+ MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false);
+}
+
+void MipsRegisterInfo::
+emitPrologue(MachineFunction &MF) const
+{
+ MachineBasicBlock &MBB = MF.front();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+ bool isPIC = (MF.getTarget().getRelocationModel() == Reloc::PIC_);
+
+ // Get the right frame order for Mips.
+ adjustMipsStackFrame(MF);
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ unsigned StackSize = MFI->getStackSize();
+
+ // No need to allocate space on the stack.
+ if (StackSize == 0 && !MFI->hasCalls()) return;
+
+ int FPOffset = MipsFI->getFPStackOffset();
+ int RAOffset = MipsFI->getRAStackOffset();
+
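+  // A rough sketch of what follows for a non-leaf function with a 24-byte
+  // frame in PIC O32 mode (offsets and the PIC call register are
+  // illustrative assumptions):
+  //   .set noreorder
+  //   .cpload $25
+  //   .set nomacro
+  //   addiu $sp, $sp, -24
+  //   sw    $ra, 20($sp)
+  //   .cprestore 16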
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::NOREORDER));
+
+  // TODO: check whether GP is needed here.
+ if (isPIC && Subtarget.isABI_O32())
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::CPLOAD)).addReg(getPICCallReg());
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO));
+
+  // Adjust stack: addiu $sp, $sp, (-imm)
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(-StackSize);
+
+  // Save the return address only if the function isn't a leaf one.
+ // sw $ra, stack_loc($sp)
+ if (MFI->hasCalls()) {
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
+ .addReg(Mips::RA).addImm(RAOffset).addReg(Mips::SP);
+ }
+
+  // If the frame pointer is enabled, save it and set it
+  // to point to the stack pointer.
+ if (hasFP(MF)) {
+ // sw $fp,stack_loc($sp)
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
+ .addReg(Mips::FP).addImm(FPOffset).addReg(Mips::SP);
+
+ // move $fp, $sp
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::FP)
+ .addReg(Mips::SP).addReg(Mips::ZERO);
+ }
+
+  // PIC specific function prologue.
+ if ((isPIC) && (MFI->hasCalls())) {
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE))
+ .addImm(MipsFI->getGPStackOffset());
+ }
+}
+
+void MipsRegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+{
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ DebugLoc dl = MBBI->getDebugLoc();
+
+ // Get the number of bytes from FrameInfo
+ int NumBytes = (int) MFI->getStackSize();
+
+ // Get the FI's where RA and FP are saved.
+ int FPOffset = MipsFI->getFPStackOffset();
+ int RAOffset = MipsFI->getRAStackOffset();
+
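+  // Roughly, the epilogue emitted below mirrors the prologue sketch in
+  // emitPrologue: restore $fp if it was used, reload $ra for non-leaf
+  // functions, then 'addiu $sp, $sp, StackSize'.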
+  // If the frame pointer is enabled, restore it and the
+  // stack pointer.
+ if (hasFP(MF)) {
+ // move $sp, $fp
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::SP)
+ .addReg(Mips::FP).addReg(Mips::ZERO);
+
+ // lw $fp,stack_loc($sp)
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::FP)
+ .addImm(FPOffset).addReg(Mips::SP);
+ }
+
+  // Restore the return address only if the function isn't a leaf one.
+ // lw $ra, stack_loc($sp)
+ if (MFI->hasCalls()) {
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::RA)
+ .addImm(RAOffset).addReg(Mips::SP);
+ }
+
+  // Adjust stack: insert addiu $sp, $sp, (imm)
+ if (NumBytes) {
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(NumBytes);
+ }
+}
+
+
+void MipsRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
+ // Set the SPOffset on the FI where GP must be saved/loaded.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool isPIC = (MF.getTarget().getRelocationModel() == Reloc::PIC_);
+ if (MFI->hasCalls() && isPIC) {
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ MFI->setObjectOffset(MipsFI->getGPFI(), MipsFI->getGPStackOffset());
+ }
+}
+
+unsigned MipsRegisterInfo::
+getRARegister() const {
+ return Mips::RA;
+}
+
+unsigned MipsRegisterInfo::
+getFrameRegister(MachineFunction &MF) const {
+ return hasFP(MF) ? Mips::FP : Mips::SP;
+}
+
+unsigned MipsRegisterInfo::
+getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned MipsRegisterInfo::
+getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+int MipsRegisterInfo::
+getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ assert(0 && "What is the dwarf register number");
+ return -1;
+}
+
+#include "MipsGenRegisterInfo.inc"
+
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
new file mode 100644
index 0000000..808e995
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -0,0 +1,78 @@
+//===- MipsRegisterInfo.h - Mips Register Information Impl ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSREGISTERINFO_H
+#define MIPSREGISTERINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "MipsGenRegisterInfo.h.inc"
+
+namespace llvm {
+class MipsSubtarget;
+class TargetInstrInfo;
+class Type;
+
+struct MipsRegisterInfo : public MipsGenRegisterInfo {
+ const MipsSubtarget &Subtarget;
+ const TargetInstrInfo &TII;
+
+ MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii);
+
+ /// getRegisterNumbering - Given the enum value for some register, e.g.
+ /// Mips::RA, return the number that it corresponds to (e.g. 31).
+ static unsigned getRegisterNumbering(unsigned RegEnum);
+
+ /// Get PIC indirect call register
+ static unsigned getPICCallReg(void);
+
+ /// Adjust the Mips stack frame.
+ void adjustMipsStackFrame(MachineFunction &MF) const;
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction* MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ /// Stack Frame Processing Methods
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ /// Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ /// Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
new file mode 100644
index 0000000..bbb275c
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -0,0 +1,252 @@
+//===- MipsRegisterInfo.td - Mips Register defs -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the MIPS register file
+//===----------------------------------------------------------------------===//
+
+// We have banks of 32 registers each.
+class MipsReg<string n> : Register<n> {
+ field bits<5> Num;
+ let Namespace = "Mips";
+}
+
+// Mips CPU Registers
+class MipsGPRReg<bits<5> num, string n> : MipsReg<n> {
+ let Num = num;
+}
+
+// Mips 32-bit FPU Registers
+class FPR<bits<5> num, string n> : MipsReg<n> {
+ let Num = num;
+}
+
+// Mips 64-bit (aliased) FPU Registers
+class AFPR<bits<5> num, string n, list<Register> aliases> : MipsReg<n> {
+ let Num = num;
+ let Aliases = aliases;
+}
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Mips" in {
+
+ // General Purpose Registers
+ def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<[0]>;
+ def AT : MipsGPRReg< 1, "AT">, DwarfRegNum<[1]>;
+ def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<[2]>;
+ def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<[3]>;
+  def A0   : MipsGPRReg< 4,  "4">,    DwarfRegNum<[4]>;
+ def A1 : MipsGPRReg< 5, "5">, DwarfRegNum<[5]>;
+ def A2 : MipsGPRReg< 6, "6">, DwarfRegNum<[6]>;
+ def A3 : MipsGPRReg< 7, "7">, DwarfRegNum<[7]>;
+ def T0 : MipsGPRReg< 8, "8">, DwarfRegNum<[8]>;
+ def T1 : MipsGPRReg< 9, "9">, DwarfRegNum<[9]>;
+ def T2 : MipsGPRReg< 10, "10">, DwarfRegNum<[10]>;
+ def T3 : MipsGPRReg< 11, "11">, DwarfRegNum<[11]>;
+ def T4 : MipsGPRReg< 12, "12">, DwarfRegNum<[12]>;
+ def T5 : MipsGPRReg< 13, "13">, DwarfRegNum<[13]>;
+ def T6 : MipsGPRReg< 14, "14">, DwarfRegNum<[14]>;
+ def T7 : MipsGPRReg< 15, "15">, DwarfRegNum<[15]>;
+ def S0 : MipsGPRReg< 16, "16">, DwarfRegNum<[16]>;
+ def S1 : MipsGPRReg< 17, "17">, DwarfRegNum<[17]>;
+ def S2 : MipsGPRReg< 18, "18">, DwarfRegNum<[18]>;
+ def S3 : MipsGPRReg< 19, "19">, DwarfRegNum<[19]>;
+ def S4 : MipsGPRReg< 20, "20">, DwarfRegNum<[20]>;
+ def S5 : MipsGPRReg< 21, "21">, DwarfRegNum<[21]>;
+ def S6 : MipsGPRReg< 22, "22">, DwarfRegNum<[22]>;
+ def S7 : MipsGPRReg< 23, "23">, DwarfRegNum<[23]>;
+ def T8 : MipsGPRReg< 24, "24">, DwarfRegNum<[24]>;
+ def T9 : MipsGPRReg< 25, "25">, DwarfRegNum<[25]>;
+ def K0 : MipsGPRReg< 26, "26">, DwarfRegNum<[26]>;
+ def K1 : MipsGPRReg< 27, "27">, DwarfRegNum<[27]>;
+ def GP : MipsGPRReg< 28, "GP">, DwarfRegNum<[28]>;
+ def SP : MipsGPRReg< 29, "SP">, DwarfRegNum<[29]>;
+ def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<[30]>;
+ def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<[31]>;
+
+  /// Mips single precision FPU registers.
+ def F0 : FPR< 0, "F0">, DwarfRegNum<[32]>;
+ def F1 : FPR< 1, "F1">, DwarfRegNum<[33]>;
+ def F2 : FPR< 2, "F2">, DwarfRegNum<[34]>;
+ def F3 : FPR< 3, "F3">, DwarfRegNum<[35]>;
+ def F4 : FPR< 4, "F4">, DwarfRegNum<[36]>;
+ def F5 : FPR< 5, "F5">, DwarfRegNum<[37]>;
+ def F6 : FPR< 6, "F6">, DwarfRegNum<[38]>;
+ def F7 : FPR< 7, "F7">, DwarfRegNum<[39]>;
+ def F8 : FPR< 8, "F8">, DwarfRegNum<[40]>;
+ def F9 : FPR< 9, "F9">, DwarfRegNum<[41]>;
+ def F10 : FPR<10, "F10">, DwarfRegNum<[42]>;
+ def F11 : FPR<11, "F11">, DwarfRegNum<[43]>;
+ def F12 : FPR<12, "F12">, DwarfRegNum<[44]>;
+ def F13 : FPR<13, "F13">, DwarfRegNum<[45]>;
+ def F14 : FPR<14, "F14">, DwarfRegNum<[46]>;
+ def F15 : FPR<15, "F15">, DwarfRegNum<[47]>;
+ def F16 : FPR<16, "F16">, DwarfRegNum<[48]>;
+ def F17 : FPR<17, "F17">, DwarfRegNum<[49]>;
+ def F18 : FPR<18, "F18">, DwarfRegNum<[50]>;
+ def F19 : FPR<19, "F19">, DwarfRegNum<[51]>;
+ def F20 : FPR<20, "F20">, DwarfRegNum<[52]>;
+ def F21 : FPR<21, "F21">, DwarfRegNum<[53]>;
+ def F22 : FPR<22, "F22">, DwarfRegNum<[54]>;
+ def F23 : FPR<23, "F23">, DwarfRegNum<[55]>;
+ def F24 : FPR<24, "F24">, DwarfRegNum<[56]>;
+ def F25 : FPR<25, "F25">, DwarfRegNum<[57]>;
+ def F26 : FPR<26, "F26">, DwarfRegNum<[58]>;
+ def F27 : FPR<27, "F27">, DwarfRegNum<[59]>;
+ def F28 : FPR<28, "F28">, DwarfRegNum<[60]>;
+ def F29 : FPR<29, "F29">, DwarfRegNum<[61]>;
+ def F30 : FPR<30, "F30">, DwarfRegNum<[62]>;
+ def F31 : FPR<31, "F31">, DwarfRegNum<[63]>;
+
+  /// Mips double precision FPU registers (aliased with the
+  /// single precision registers to hold 64-bit values).
+ def D0 : AFPR< 0, "F0", [F0, F1]>, DwarfRegNum<[32]>;
+ def D1 : AFPR< 2, "F2", [F2, F3]>, DwarfRegNum<[34]>;
+ def D2 : AFPR< 4, "F4", [F4, F5]>, DwarfRegNum<[36]>;
+ def D3 : AFPR< 6, "F6", [F6, F7]>, DwarfRegNum<[38]>;
+ def D4 : AFPR< 8, "F8", [F8, F9]>, DwarfRegNum<[40]>;
+ def D5 : AFPR<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
+ def D6 : AFPR<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
+ def D7 : AFPR<14, "F14", [F14, F15]>, DwarfRegNum<[46]>;
+ def D8 : AFPR<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
+ def D9 : AFPR<18, "F18", [F18, F19]>, DwarfRegNum<[50]>;
+ def D10 : AFPR<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
+ def D11 : AFPR<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
+ def D12 : AFPR<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
+ def D13 : AFPR<26, "F26", [F26, F27]>, DwarfRegNum<[58]>;
+ def D14 : AFPR<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
+ def D15 : AFPR<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+
+ // Hi/Lo registers
+ def HI : Register<"hi">, DwarfRegNum<[64]>;
+ def LO : Register<"lo">, DwarfRegNum<[65]>;
+
+ // Status flags register
+ def FCR31 : Register<"31">;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+def CPURegs : RegisterClass<"Mips", [i32], 32,
+ // Return Values and Arguments
+ [V0, V1, A0, A1, A2, A3,
+ // Not preserved across procedure calls
+ T0, T1, T2, T3, T4, T5, T6, T7, T8, T9,
+ // Callee save
+ S0, S1, S2, S3, S4, S5, S6, S7,
+ // Reserved
+ ZERO, AT, K0, K1, GP, SP, FP, RA]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ CPURegsClass::iterator
+ CPURegsClass::allocation_order_end(const MachineFunction &MF) const {
+ // The last 8 registers on the list above are reserved
+ return end()-8;
+ }
+ }];
+}
+
+// 64bit fp:
+// * FGR64 - 32 64-bit registers
+// * AFGR64 - 16 32-bit even registers (32-bit FP Mode)
+//
+// 32bit fp:
+// * FGR32 - 16 32-bit even registers
+// * FGR32 - 32 32-bit registers (single float only mode)
+def FGR32 : RegisterClass<"Mips", [f32], 32,
+ // Return Values and Arguments
+ [F0, F1, F2, F3, F12, F13, F14, F15,
+ // Not preserved across procedure calls
+ F4, F5, F6, F7, F8, F9, F10, F11, F16, F17, F18, F19,
+ // Callee save
+ F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
+ // Reserved
+ F31]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+
+ static const unsigned MIPS_FGR32[] = {
+ Mips::F0, Mips::F1, Mips::F2, Mips::F3, Mips::F12, Mips::F13,
+ Mips::F14, Mips::F15, Mips::F4, Mips::F5, Mips::F6, Mips::F7,
+ Mips::F8, Mips::F9, Mips::F10, Mips::F11, Mips::F16, Mips::F17,
+ Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22, Mips::F23,
+ Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29,
+ Mips::F30
+ };
+
+ static const unsigned MIPS_SVR4_FGR32[] = {
+ Mips::F0, Mips::F2, Mips::F12, Mips::F14, Mips::F4,
+ Mips::F6, Mips::F8, Mips::F10, Mips::F16, Mips::F18,
+ Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30,
+ };
+
+ FGR32Class::iterator
+ FGR32Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+
+ if (Subtarget.isSingleFloat())
+ return MIPS_FGR32;
+ else
+ return MIPS_SVR4_FGR32;
+ }
+
+ FGR32Class::iterator
+ FGR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+
+ if (Subtarget.isSingleFloat())
+ return MIPS_FGR32 + (sizeof(MIPS_FGR32) / sizeof(unsigned));
+ else
+ return MIPS_SVR4_FGR32 + (sizeof(MIPS_SVR4_FGR32) / sizeof(unsigned));
+ }
+ }];
+}
+
+def AFGR64 : RegisterClass<"Mips", [f64], 64,
+ // Return Values and Arguments
+ [D0, D1, D6, D7,
+ // Not preserved across procedure calls
+ D2, D3, D4, D5, D8, D9,
+ // Callee save
+ D10, D11, D12, D13, D14,
+ // Reserved
+ D15]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ AFGR64Class::iterator
+ AFGR64Class::allocation_order_end(const MachineFunction &MF) const {
+ // The last register on the list above is reserved
+ return end()-1;
+ }
+ }];
+}
+
+// Condition Register for floating point operations
+def CCR : RegisterClass<"Mips", [i32], 32, [FCR31]>;
+
+// Hi/Lo Registers
+def HILO : RegisterClass<"Mips", [i32], 32, [HI, LO]>;
+
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
new file mode 100644
index 0000000..0c3ca573
--- /dev/null
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -0,0 +1,63 @@
+//===- MipsSchedule.td - Mips Scheduling Definitions ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across Mips chip sets. Based on GCC/Mips backend files.
+//===----------------------------------------------------------------------===//
+def ALU : FuncUnit;
+def IMULDIV : FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for Mips
+//===----------------------------------------------------------------------===//
+def IIAlu : InstrItinClass;
+def IILoad : InstrItinClass;
+def IIStore : InstrItinClass;
+def IIXfer : InstrItinClass;
+def IIBranch : InstrItinClass;
+def IIHiLo : InstrItinClass;
+def IIImul : InstrItinClass;
+def IIIdiv : InstrItinClass;
+def IIFcvt : InstrItinClass;
+def IIFmove : InstrItinClass;
+def IIFcmp : InstrItinClass;
+def IIFadd : InstrItinClass;
+def IIFmulSingle : InstrItinClass;
+def IIFmulDouble : InstrItinClass;
+def IIFdivSingle : InstrItinClass;
+def IIFdivDouble : InstrItinClass;
+def IIFsqrtSingle : InstrItinClass;
+def IIFsqrtDouble : InstrItinClass;
+def IIFrecipFsqrtStep : InstrItinClass;
+def IIPseudo : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Mips Generic instruction itineraries.
+//===----------------------------------------------------------------------===//
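+// Reading the entries below: e.g. IIImul occupies the IMULDIV unit for 17
+// cycles, while IILoad takes 3 cycles on the ALU.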
+def MipsGenericItineraries : ProcessorItineraries<[
+ InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>,
+ InstrItinData<IILoad , [InstrStage<3, [ALU]>]>,
+ InstrItinData<IIStore , [InstrStage<1, [ALU]>]>,
+ InstrItinData<IIXfer , [InstrStage<2, [ALU]>]>,
+ InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>,
+ InstrItinData<IIHiLo , [InstrStage<1, [IMULDIV]>]>,
+ InstrItinData<IIImul , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<IIIdiv , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<IIFcvt , [InstrStage<1, [ALU]>]>,
+ InstrItinData<IIFmove , [InstrStage<2, [ALU]>]>,
+ InstrItinData<IIFcmp , [InstrStage<3, [ALU]>]>,
+ InstrItinData<IIFadd , [InstrStage<4, [ALU]>]>,
+ InstrItinData<IIFmulSingle , [InstrStage<7, [ALU]>]>,
+ InstrItinData<IIFmulDouble , [InstrStage<8, [ALU]>]>,
+ InstrItinData<IIFdivSingle , [InstrStage<23, [ALU]>]>,
+ InstrItinData<IIFdivDouble , [InstrStage<36, [ALU]>]>,
+ InstrItinData<IIFsqrtSingle , [InstrStage<54, [ALU]>]>,
+ InstrItinData<IIFsqrtDouble , [InstrStage<12, [ALU]>]>,
+ InstrItinData<IIFrecipFsqrtStep , [InstrStage<5, [ALU]>]>
+]>;
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
new file mode 100644
index 0000000..4245f27
--- /dev/null
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -0,0 +1,77 @@
+//===- MipsSubtarget.cpp - Mips Subtarget Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Mips specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSubtarget.h"
+#include "Mips.h"
+#include "MipsGenSubtarget.inc"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<bool>
+NotABICall("disable-mips-abicall", cl::Hidden,
+ cl::desc("Disable code for SVR4-style dynamic objects"));
+static cl::opt<bool>
+AbsoluteCall("enable-mips-absolute-call", cl::Hidden,
+ cl::desc("Enable absolute call within abicall"));
+static cl::opt<unsigned>
+SSThreshold("mips-ssection-threshold", cl::Hidden,
+ cl::desc("Small data and bss section threshold size (default=8)"),
+ cl::init(8));
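+// For example, 'llc -mips-ssection-threshold=16 ...' would place globals of
+// up to 16 bytes into the small data/bss sections (illustrative invocation).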
+
+MipsSubtarget::MipsSubtarget(const TargetMachine &TM, const Module &M,
+ const std::string &FS, bool little) :
+ MipsArchVersion(Mips1), MipsABI(O32), IsLittle(little), IsSingleFloat(false),
+ IsFP64bit(false), IsGP64bit(false), HasVFPU(false), HasABICall(true),
+ HasAbsoluteCall(false), IsLinux(true), HasSEInReg(false), HasCondMov(false),
+ HasMulDivAdd(false), HasMinMax(false), HasSwap(false), HasBitCount(false)
+{
+ std::string CPU = "mips1";
+ MipsArchVersion = Mips1;
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+ const std::string& TT = M.getTargetTriple();
+
+ // Small section size threshold
+ SSectionThreshold = SSThreshold;
+
+  // Is the target system Linux?
+ if (TT.find("linux") == std::string::npos)
+ IsLinux = false;
+
+  // When only the target triple is specified and it is
+  // an Allegrex target, set the features. We also match
+  // big and little endian Allegrex cores (we don't really
+  // know if a big endian one exists).
+ if (TT.find("mipsallegrex") != std::string::npos ||
+ TT.find("psp") != std::string::npos) {
+ MipsABI = EABI;
+ IsSingleFloat = true;
+ MipsArchVersion = Mips2;
+ HasVFPU = true; // Enables Allegrex Vector FPU (not supported yet)
+ HasSEInReg = true;
+ HasBitCount = true;
+ HasSwap = true;
+ HasCondMov = true;
+ }
+
+ // Abicall is the default for O32 ABI, but is disabled within EABI and in
+ // static code.
+ if (NotABICall || isABI_EABI() || (TM.getRelocationModel() == Reloc::Static))
+ HasABICall = false;
+
+ // TODO: disable when handling 64 bit symbols in the future.
+ if (HasABICall && AbsoluteCall)
+ HasAbsoluteCall = true;
+}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
new file mode 100644
index 0000000..61c37c1
--- /dev/null
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -0,0 +1,139 @@
+//=====-- MipsSubtarget.h - Define Subtarget for the Mips -----*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSUBTARGET_H
+#define MIPSSUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+class MipsSubtarget : public TargetSubtarget {
+
+public:
+ enum MipsABIEnum {
+ O32, O64, N32, N64, EABI
+ };
+
+protected:
+
+ enum MipsArchEnum {
+ Mips1, Mips2, Mips3, Mips4, Mips32, Mips32r2, Mips64, Mips64r2
+ };
+
+ // Mips architecture version
+ MipsArchEnum MipsArchVersion;
+
+ // Mips supported ABIs
+ MipsABIEnum MipsABI;
+
+ // IsLittle - The target is Little Endian
+ bool IsLittle;
+
+  // IsSingleFloat - The target only supports single precision floating
+  // point operations. This enables the target to use all 32 of the 32-bit
+  // floating point registers instead of only the even-numbered ones.
+ bool IsSingleFloat;
+
+ // IsFP64bit - The target processor has 64-bit floating point registers.
+ bool IsFP64bit;
+
+  // IsGP64bit - General-purpose registers are 64 bits wide.
+ bool IsGP64bit;
+
+ // HasVFPU - Processor has a vector floating point unit.
+ bool HasVFPU;
+
+  // HasABICall - Enable SVR4 code for SVR4-style dynamic objects.
+ bool HasABICall;
+
+ // HasAbsoluteCall - Enable code that is not fully position-independent.
+ // Only works with HasABICall enabled.
+ bool HasAbsoluteCall;
+
+  // IsLinux - Target system is Linux. If false, we assume a generic
+  // ELF OS for now.
+ bool IsLinux;
+
+ // Put global and static items less than or equal to SSectionThreshold
+ // bytes into the small data or bss section. The default is 8.
+ unsigned SSectionThreshold;
+
+ /// Features related to the presence of specific instructions.
+
+ // HasSEInReg - SEB and SEH (signext in register) instructions.
+ bool HasSEInReg;
+
+ // HasCondMov - Conditional mov (MOVZ, MOVN) instructions.
+ bool HasCondMov;
+
+ // HasMulDivAdd - Multiply add and sub (MADD, MADDu, MSUB, MSUBu)
+ // instructions.
+ bool HasMulDivAdd;
+
+ // HasMinMax - MIN and MAX instructions.
+ bool HasMinMax;
+
+ // HasSwap - Byte and half swap instructions.
+ bool HasSwap;
+
+ // HasBitCount - Count leading '1' and '0' bits.
+ bool HasBitCount;
+
+ InstrItineraryData InstrItins;
+
+public:
+
+ /// Only O32 and EABI supported right now.
+ bool isABI_EABI() const { return MipsABI == EABI; }
+ bool isABI_O32() const { return MipsABI == O32; }
+ unsigned getTargetABI() const { return MipsABI; }
+
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ MipsSubtarget(const TargetMachine &TM, const Module &M,
+ const std::string &FS, bool little);
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+ bool isMips1() const { return MipsArchVersion == Mips1; }
+
+ bool isLittle() const { return IsLittle; }
+  bool isFP64bit() const { return IsFP64bit; }
+  bool isGP64bit() const { return IsGP64bit; }
+  bool isGP32bit() const { return !IsGP64bit; }
+  bool isSingleFloat() const { return IsSingleFloat; }
+  bool isNotSingleFloat() const { return !IsSingleFloat; }
+  bool hasVFPU() const { return HasVFPU; }
+  bool hasABICall() const { return HasABICall; }
+  bool hasAbsoluteCall() const { return HasAbsoluteCall; }
+  bool isLinux() const { return IsLinux; }
+ unsigned getSSectionThreshold() const { return SSectionThreshold; }
+
+ /// Features related to the presence of specific instructions.
+  bool hasSEInReg() const { return HasSEInReg; }
+  bool hasCondMov() const { return HasCondMov; }
+  bool hasMulDivAdd() const { return HasMulDivAdd; }
+  bool hasMinMax() const { return HasMinMax; }
+  bool hasSwap() const { return HasSwap; }
+  bool hasBitCount() const { return HasBitCount; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Mips/MipsTargetAsmInfo.cpp b/lib/Target/Mips/MipsTargetAsmInfo.cpp
new file mode 100644
index 0000000..c197b0c
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetAsmInfo.cpp
@@ -0,0 +1,98 @@
+//===-- MipsTargetAsmInfo.cpp - Mips asm properties -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MipsTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetAsmInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/GlobalVariable.h"
+
+using namespace llvm;
+
+MipsTargetAsmInfo::MipsTargetAsmInfo(const MipsTargetMachine &TM):
+ ELFTargetAsmInfo(TM) {
+
+ Subtarget = &TM.getSubtarget<MipsSubtarget>();
+
+ AlignmentIsInBytes = false;
+ COMMDirectiveTakesAlignment = true;
+ Data16bitsDirective = "\t.half\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = NULL;
+ PrivateGlobalPrefix = "$";
+ JumpTableDataSection = "\t.rdata";
+ CommentString = "#";
+ ZeroDirective = "\t.space\t";
+ BSSSection = "\t.section\t.bss";
+ CStringSection = ".rodata.str";
+
+ if (!Subtarget->hasABICall()) {
+ JumpTableDirective = "\t.word\t";
+ SmallDataSection = getNamedSection("\t.sdata", SectionFlags::Writeable);
+ SmallBSSSection = getNamedSection("\t.sbss",
+ SectionFlags::Writeable |
+ SectionFlags::BSS);
+ } else
+ JumpTableDirective = "\t.gpword\t";
+
+}
+
+unsigned MipsTargetAsmInfo::
+SectionFlagsForGlobal(const GlobalValue *GV, const char* Name) const {
+ unsigned Flags = ELFTargetAsmInfo::SectionFlagsForGlobal(GV, Name);
+  // Mask out the Small Section flag bit; Mips doesn't support the 's' section
+  // symbol for its small sections.
+ return (Flags & (~SectionFlags::Small));
+}
+
+SectionKind::Kind MipsTargetAsmInfo::
+SectionKindForGlobal(const GlobalValue *GV) const {
+ SectionKind::Kind K = ELFTargetAsmInfo::SectionKindForGlobal(GV);
+
+ if (Subtarget->hasABICall())
+ return K;
+
+ if (K != SectionKind::Data && K != SectionKind::BSS &&
+ K != SectionKind::RODataMergeConst)
+ return K;
+
+ if (isa<GlobalVariable>(GV)) {
+ const TargetData *TD = TM.getTargetData();
+ unsigned Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ unsigned Threshold = Subtarget->getSSectionThreshold();
+
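+    // For example, with the default threshold of 8, a 4-byte global 'int'
+    // would be placed in the small BSS/data section (illustrative case).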
+ if (Size > 0 && Size <= Threshold) {
+ if (K == SectionKind::BSS)
+ return SectionKind::SmallBSS;
+ else
+ return SectionKind::SmallData;
+ }
+ }
+
+ return K;
+}
+
+const Section* MipsTargetAsmInfo::
+SelectSectionForGlobal(const GlobalValue *GV) const {
+ SectionKind::Kind K = SectionKindForGlobal(GV);
+ const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+
+ if (GVA && (!GVA->isWeakForLinker()))
+ switch (K) {
+ case SectionKind::SmallData:
+ return getSmallDataSection();
+ case SectionKind::SmallBSS:
+ return getSmallBSSSection();
+ default: break;
+ }
+
+ return ELFTargetAsmInfo::SelectSectionForGlobal(GV);
+}
diff --git a/lib/Target/Mips/MipsTargetAsmInfo.h b/lib/Target/Mips/MipsTargetAsmInfo.h
new file mode 100644
index 0000000..2b5a739
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetAsmInfo.h
@@ -0,0 +1,51 @@
+//=====-- MipsTargetAsmInfo.h - Mips asm properties -----------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETASMINFO_H
+#define MIPSTARGETASMINFO_H
+
+#include "MipsSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+
+namespace llvm {
+
+ // Forward declaration.
+ class GlobalValue;
+ class MipsTargetMachine;
+
+ struct MipsTargetAsmInfo : public ELFTargetAsmInfo {
+ explicit MipsTargetAsmInfo(const MipsTargetMachine &TM);
+
+ /// SectionKindForGlobal - This hook allows the target to select proper
+ /// section kind used for global emission.
+ virtual SectionKind::Kind
+ SectionKindForGlobal(const GlobalValue *GV) const;
+
+ /// SectionFlagsForGlobal - This hook allows the target to select proper
+ /// section flags either for given global or for section.
+ virtual unsigned
+ SectionFlagsForGlobal(const GlobalValue *GV = NULL,
+ const char* name = NULL) const;
+
+ virtual const Section* SelectSectionForGlobal(const GlobalValue *GV) const;
+
+ private:
+ const MipsSubtarget *Subtarget;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
new file mode 100644
index 0000000..ef524e3
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -0,0 +1,133 @@
+//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Mips target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsTargetAsmInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+/// MipsTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int MipsTargetMachineModule;
+int MipsTargetMachineModule = 0;
+
+// Register the target.
+static RegisterTarget<MipsTargetMachine> X("mips", "Mips");
+static RegisterTarget<MipselTargetMachine> Y("mipsel", "Mipsel");
+
+const TargetAsmInfo *MipsTargetMachine::
+createTargetAsmInfo() const
+{
+ return new MipsTargetAsmInfo(*this);
+}
+
+// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment.
+// The stack is always 8 byte aligned.
+// On function prologue, the stack is created by decrementing
+// its pointer. Once decremented, all references are done with a positive
+// offset from the stack/frame pointer, so using StackGrowsUp enables
+// easier handling.
+// Using CodeModel::Large enables different CALL behavior.
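+// As a reading aid: in "E-p:32:32:32-i8:8:32-i16:16:32", 'E' means big-endian,
+// 'p:32:32:32' means 32-bit pointers with 32-bit ABI/preferred alignment, and
+// 'i8:8:32'/'i16:16:32' give i8/i16 an ABI alignment of 8/16 bits with a
+// preferred alignment of 32 bits.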
+MipsTargetMachine::
+MipsTargetMachine(const Module &M, const std::string &FS, bool isLittle=false):
+ Subtarget(*this, M, FS, isLittle),
+ DataLayout(isLittle ? std::string("e-p:32:32:32-i8:8:32-i16:16:32") :
+ std::string("E-p:32:32:32-i8:8:32-i16:16:32")),
+ InstrInfo(*this),
+ FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0),
+ TLInfo(*this)
+{
+ // Abicall enables PIC by default
+ if (Subtarget.hasABICall())
+ setRelocationModel(Reloc::PIC_);
+
+ // TODO: create an option to enable long calls, like -mlong-calls,
+ // that would be our CodeModel::Large. It must not work with Abicall.
+ if (getCodeModel() == CodeModel::Default)
+ setCodeModel(CodeModel::Small);
+}
+
+MipselTargetMachine::
+MipselTargetMachine(const Module &M, const std::string &FS) :
+ MipsTargetMachine(M, FS, true) {}
+
+// Return a nonzero match for "mips*-*" triples; otherwise return 0, in which
+// case -march must be specified to generate MIPS code.
+unsigned MipsTargetMachine::
+getModuleMatchQuality(const Module &M)
+{
+ // We strongly match "mips*-*".
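+  // e.g. a triple of "mips-unknown-linux-gnu" scores 20 here, while an
+  // unrelated triple such as "i386-pc-linux-gnu" falls through and scores 0.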
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 5 && std::string(TT.begin(), TT.begin()+5) == "mips-")
+ return 20;
+
+ if (TT.size() >= 13 && std::string(TT.begin(),
+ TT.begin()+13) == "mipsallegrex-")
+ return 20;
+
+ return 0;
+}
+
+// Return a nonzero match for "mips*el-*" triples; otherwise return 0, in
+// which case -march must be specified to generate MIPSEL code.
+unsigned MipselTargetMachine::
+getModuleMatchQuality(const Module &M)
+{
+ // We strongly match "mips*el-*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 7 && std::string(TT.begin(), TT.begin()+7) == "mipsel-")
+ return 20;
+
+ if (TT.size() >= 15 && std::string(TT.begin(),
+ TT.begin()+15) == "mipsallegrexel-")
+ return 20;
+
+ if (TT.size() == 3 && std::string(TT.begin(), TT.begin()+3) == "psp")
+ return 20;
+
+ return 0;
+}
+
+// Install an instruction selector pass using
+// the ISelDag to generate Mips code.
+bool MipsTargetMachine::
+addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
+{
+ PM.add(createMipsISelDag(*this));
+ return false;
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted. Return true if -print-machineinstrs should
+// print out the code after the passes.
+bool MipsTargetMachine::
+addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
+{
+ PM.add(createMipsDelaySlotFillerPass(*this));
+ return true;
+}
+
+// Implements the AssemblyEmitter for the target. Must return
+// true if AssemblyEmitter is supported
+bool MipsTargetMachine::
+addAssemblyEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out)
+{
+ // Output assembly language.
+ PM.add(createMipsCodePrinterPass(Out, *this, OptLevel, Verbose));
+ return false;
+}
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
new file mode 100644
index 0000000..a9e1df2
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -0,0 +1,80 @@
+//===-- MipsTargetMachine.h - Define TargetMachine for Mips ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETMACHINE_H
+#define MIPSTARGETMACHINE_H
+
+#include "MipsSubtarget.h"
+#include "MipsInstrInfo.h"
+#include "MipsISelLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+
+namespace llvm {
+ class raw_ostream;
+
+ class MipsTargetMachine : public LLVMTargetMachine {
+ MipsSubtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ MipsInstrInfo InstrInfo;
+ TargetFrameInfo FrameInfo;
+ MipsTargetLowering TLInfo;
+
+ protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+ public:
+ MipsTargetMachine(const Module &M, const std::string &FS, bool isLittle);
+
+ virtual const MipsInstrInfo *getInstrInfo() const
+ { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const
+ { return &FrameInfo; }
+ virtual const MipsSubtarget *getSubtargetImpl() const
+ { return &Subtarget; }
+ virtual const TargetData *getTargetData() const
+ { return &DataLayout;}
+
+ virtual const MipsRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ virtual MipsTargetLowering *getTargetLowering() const {
+ return const_cast<MipsTargetLowering*>(&TLInfo);
+ }
+
+ static unsigned getModuleMatchQuality(const Module &M);
+
+ // Pass Pipeline Configuration
+ virtual bool addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel);
+ virtual bool addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+ };
+
+/// MipselTargetMachine - Mipsel target machine.
+///
+class MipselTargetMachine : public MipsTargetMachine {
+public:
+ MipselTargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PIC16/CMakeLists.txt b/lib/Target/PIC16/CMakeLists.txt
new file mode 100644
index 0000000..00d737a
--- /dev/null
+++ b/lib/Target/PIC16/CMakeLists.txt
@@ -0,0 +1,24 @@
+set(LLVM_TARGET_DEFINITIONS PIC16.td)
+
+tablegen(PIC16GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(PIC16GenRegisterNames.inc -gen-register-enums)
+tablegen(PIC16GenRegisterInfo.inc -gen-register-desc)
+tablegen(PIC16GenInstrNames.inc -gen-instr-enums)
+tablegen(PIC16GenInstrInfo.inc -gen-instr-desc)
+tablegen(PIC16GenAsmWriter.inc -gen-asm-writer)
+tablegen(PIC16GenDAGISel.inc -gen-dag-isel)
+tablegen(PIC16GenCallingConv.inc -gen-callingconv)
+tablegen(PIC16GenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(PIC16
+ PIC16AsmPrinter.cpp
+ PIC16DebugInfo.cpp
+ PIC16InstrInfo.cpp
+ PIC16ISelDAGToDAG.cpp
+ PIC16ISelLowering.cpp
+ PIC16MemSelOpt.cpp
+ PIC16RegisterInfo.cpp
+ PIC16Subtarget.cpp
+ PIC16TargetAsmInfo.cpp
+ PIC16TargetMachine.cpp
+ )
diff --git a/lib/Target/PIC16/Makefile b/lib/Target/PIC16/Makefile
new file mode 100644
index 0000000..c429324
--- /dev/null
+++ b/lib/Target/PIC16/Makefile
@@ -0,0 +1,21 @@
+##===- lib/Target/PIC16/Makefile ---------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMPIC16
+TARGET = PIC16
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = PIC16GenRegisterInfo.h.inc PIC16GenRegisterNames.inc \
+ PIC16GenRegisterInfo.inc PIC16GenInstrNames.inc \
+ PIC16GenInstrInfo.inc PIC16GenAsmWriter.inc \
+ PIC16GenDAGISel.inc PIC16GenCallingConv.inc \
+ PIC16GenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/PIC16/PIC16.h b/lib/Target/PIC16/PIC16.h
new file mode 100644
index 0000000..40bed2f
--- /dev/null
+++ b/lib/Target/PIC16/PIC16.h
@@ -0,0 +1,345 @@
+//===-- PIC16.h - Top-level interface for PIC16 representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM PIC16 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PIC16_H
+#define LLVM_TARGET_PIC16_H
+
+#include "llvm/Target/TargetMachine.h"
+#include <iosfwd>
+#include <cassert>
+#include <sstream>
+#include <cstring>
+#include <string>
+
+namespace llvm {
+ class PIC16TargetMachine;
+ class FunctionPass;
+ class MachineCodeEmitter;
+ class raw_ostream;
+
+namespace PIC16CC {
+ enum CondCodes {
+ EQ,
+ NE,
+ LT,
+ LE,
+ GT,
+ GE,
+ ULT,
+ UGT,
+ ULE,
+ UGE
+ };
+}
+ // A central class to manage all ABI naming conventions.
+ // PAN - [P]ic16 [A]BI [N]ames
+ class PAN {
+ public:
+ // Map the name of the symbol to its section name.
+ // Current ABI:
+ // -----------------------------------------------------
+  //  ALL Names are prefixed with the symbol '@'.
+  //  ------------------------------------------------------
+  //  Global variables do not have any '.' in their names.
+  //  These are mainly function names and global variable names.
+ // Example - @foo, @i
+ // -------------------------------------------------------
+ // Functions and auto variables.
+ // Names are mangled as <prefix><funcname>.<tag>.<varname>
+ // Where <prefix> is '@' and <tag> is any one of
+ // the following
+ // .auto. - an automatic var of a function.
+  //  .temp. - temporary data of a function.
+ // .ret. - return value label for a function.
+ // .frame. - Frame label for a function where retval, args
+ // and temps are stored.
+ // .args. - Label used to pass arguments to a direct call.
+ // Example - Function name: @foo
+ // Its frame: @foo.frame.
+ // Its retval: @foo.ret.
+ // Its local vars: @foo.auto.a
+ // Its temp data: @foo.temp.
+ // Its arg passing: @foo.args.
+ //----------------------------------------------
+ // Libcall - compiler generated libcall names must start with .lib.
+ // This id will be used to emit extern decls for libcalls.
+ // Example - libcall name: @.lib.sra.i8
+ // To pass args: @.lib.sra.i8.args.
+ // To return val: @.lib.sra.i8.ret.
+ //----------------------------------------------
+ // SECTION Names
+ // uninitialized globals - @udata.<num>.#
+ // initialized globals - @idata.<num>.#
+ // Function frame - @<func>.frame_section.
+ // Function autos - @<func>.autos_section.
+ // Declarations - @section.0
+ //----------------------------------------------------------
+
+ // Tags used to mangle different names.
+ enum TAGS {
+ PREFIX_SYMBOL,
+ GLOBAL,
+ STATIC_LOCAL,
+ AUTOS_LABEL,
+ FRAME_LABEL,
+ RET_LABEL,
+ ARGS_LABEL,
+ TEMPS_LABEL,
+
+ LIBCALL,
+
+ FRAME_SECTION,
+ AUTOS_SECTION,
+ CODE_SECTION
+ };
+
+ // Textual names of the tags.
+ inline static const char *getTagName(TAGS tag) {
+ switch (tag) {
+ default: return "";
+ case PREFIX_SYMBOL: return "@";
+ case AUTOS_LABEL: return ".auto.";
+ case FRAME_LABEL: return ".frame.";
+ case TEMPS_LABEL: return ".temp.";
+ case ARGS_LABEL: return ".args.";
+ case RET_LABEL: return ".ret.";
+ case LIBCALL: return ".lib.";
+ case FRAME_SECTION: return ".frame_section.";
+ case AUTOS_SECTION: return ".autos_section.";
+ case CODE_SECTION: return ".code_section.";
+ }
+ }
+
+ // Get tag type for the Symbol.
+ inline static TAGS getSymbolTag(const std::string &Sym) {
+ if (Sym.find(getTagName(TEMPS_LABEL)) != std::string::npos)
+ return TEMPS_LABEL;
+
+ if (Sym.find(getTagName(FRAME_LABEL)) != std::string::npos)
+ return FRAME_LABEL;
+
+ if (Sym.find(getTagName(RET_LABEL)) != std::string::npos)
+ return RET_LABEL;
+
+ if (Sym.find(getTagName(ARGS_LABEL)) != std::string::npos)
+ return ARGS_LABEL;
+
+ if (Sym.find(getTagName(AUTOS_LABEL)) != std::string::npos)
+ return AUTOS_LABEL;
+
+ if (Sym.find(getTagName(LIBCALL)) != std::string::npos)
+ return LIBCALL;
+
+    // It does not have any Tag, so it's a true global or static local.
+ if (Sym.find(".") == std::string::npos)
+ return GLOBAL;
+
+ // If a . is there, then it may be static local.
+ // We should mangle these as well in clang.
+ if (Sym.find(".") != std::string::npos)
+ return STATIC_LOCAL;
+
+    assert (0 && "Could not determine Symbol's tag");
+    return GLOBAL; // Not reached; keeps all return paths explicit.
+  }
+
+ // addPrefix - add prefix symbol to a name if there isn't one already.
+ inline static std::string addPrefix (const std::string &Name) {
+ std::string prefix = getTagName (PREFIX_SYMBOL);
+
+ // If this name already has a prefix, nothing to do.
+ if (Name.compare(0, prefix.size(), prefix) == 0)
+ return Name;
+
+ return prefix + Name;
+ }
+
+ // Get mangled func name from a mangled sym name.
+ // In all cases func name is the first component before a '.'.
+ static inline std::string getFuncNameForSym(const std::string &Sym1) {
+    assert (getSymbolTag(Sym1) != GLOBAL && "does not belong to a function");
+
+ std::string Sym = addPrefix(Sym1);
+
+ // Position of the . after func name. That's where func name ends.
+ size_t func_name_end = Sym.find ('.');
+
+ return Sym.substr (0, func_name_end);
+ }
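+  // e.g. getFuncNameForSym("@foo.auto.a") returns "@foo".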
+
+ // Get Frame start label for a func.
+ static std::string getFrameLabel(const std::string &Func) {
+ std::string Func1 = addPrefix(Func);
+ std::string tag = getTagName(FRAME_LABEL);
+ return Func1 + tag;
+ }
+
+ static std::string getRetvalLabel(const std::string &Func) {
+ std::string Func1 = addPrefix(Func);
+ std::string tag = getTagName(RET_LABEL);
+ return Func1 + tag;
+ }
+
+ static std::string getArgsLabel(const std::string &Func) {
+ std::string Func1 = addPrefix(Func);
+ std::string tag = getTagName(ARGS_LABEL);
+ return Func1 + tag;
+ }
+
+ static std::string getTempdataLabel(const std::string &Func) {
+ std::string Func1 = addPrefix(Func);
+ std::string tag = getTagName(TEMPS_LABEL);
+ return Func1 + tag;
+ }
+
+ static std::string getFrameSectionName(const std::string &Func) {
+ std::string Func1 = addPrefix(Func);
+ std::string tag = getTagName(FRAME_SECTION);
+ return Func1 + tag + "# UDATA_OVR";
+ }
+
+ static std::string getAutosSectionName(const std::string &Func) {
+ std::string Func1 = addPrefix(Func);
+ std::string tag = getTagName(AUTOS_SECTION);
+ return Func1 + tag + "# UDATA_OVR";
+ }
+
+ static std::string getCodeSectionName(const std::string &Func) {
+ std::string Func1 = addPrefix(Func);
+ std::string tag = getTagName(CODE_SECTION);
+ return Func1 + tag + "# CODE";
+ }
+
+  // udata and idata section names are generated from a given number.
+ // @udata.<num>.#
+ static std::string getUdataSectionName(unsigned num) {
+ std::ostringstream o;
+ o << getTagName(PREFIX_SYMBOL) << "udata." << num << ".# UDATA";
+ return o.str();
+ }
+
+ static std::string getIdataSectionName(unsigned num) {
+ std::ostringstream o;
+ o << getTagName(PREFIX_SYMBOL) << "idata." << num << ".# IDATA";
+ return o.str();
+ }
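+  // e.g. getUdataSectionName(2) returns "@udata.2.# UDATA" and
+  // getIdataSectionName(2) returns "@idata.2.# IDATA".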
+
+ inline static bool isLocalName (const std::string &Name) {
+ if (getSymbolTag(Name) == AUTOS_LABEL)
+ return true;
+
+ return false;
+ }
+
+ inline static bool isLocalToFunc (std::string &Func, std::string &Var) {
+ if (! isLocalName(Var)) return false;
+
+ std::string Func1 = addPrefix(Func);
+    // Extract the func name of the variable.
+ const std::string &fname = getFuncNameForSym(Var);
+
+ if (fname.compare(Func1) == 0)
+ return true;
+
+ return false;
+ }
+
+
+  // Get the section for the given external symbol name.
+  // This tries to find the type (Tag) of the symbol from its mangled name
+  // and returns the appropriate section name for it.
+ static inline std::string getSectionNameForSym(const std::string &Sym1) {
+ std::string Sym = addPrefix(Sym1);
+
+ std::string SectionName;
+
+ std::string Fname = getFuncNameForSym (Sym);
+ TAGS id = getSymbolTag (Sym);
+
+ switch (id) {
+ default : assert (0 && "Could not determine external symbol type");
+ case FRAME_LABEL:
+ case RET_LABEL:
+ case TEMPS_LABEL:
+ case ARGS_LABEL: {
+ return getFrameSectionName(Fname);
+ }
+ case AUTOS_LABEL: {
+ return getAutosSectionName(Fname);
+ }
+ }
+ }
+ }; // class PAN.
+
+
+  // External symbol names require memory that lives until program end,
+  // so we allocate it here and deliberately never free it.
+ inline static const char *createESName (const std::string &name) {
+ char *tmpName = new char[name.size() + 1];
+ strcpy (tmpName, name.c_str());
+ return tmpName;
+ }
+
+
+
+ inline static const char *PIC16CondCodeToString(PIC16CC::CondCodes CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown condition code");
+ case PIC16CC::NE: return "ne";
+ case PIC16CC::EQ: return "eq";
+ case PIC16CC::LT: return "lt";
+ case PIC16CC::ULT: return "lt";
+    case PIC16CC::LE: return "le";
+    case PIC16CC::GT: return "gt";
+    case PIC16CC::UGT: return "gt";
+    case PIC16CC::GE: return "ge";
+    // Unsigned le/ge share mnemonics with their signed counterparts,
+    // matching the ULT/UGT cases above.
+    case PIC16CC::ULE: return "le";
+    case PIC16CC::UGE: return "ge";
+ }
+ }
+
+ inline static bool isSignedComparison(PIC16CC::CondCodes CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown condition code");
+ case PIC16CC::NE:
+ case PIC16CC::EQ:
+ case PIC16CC::LT:
+ case PIC16CC::LE:
+ case PIC16CC::GE:
+ case PIC16CC::GT:
+ return true;
+ case PIC16CC::ULT:
+ case PIC16CC::UGT:
+ case PIC16CC::ULE:
+ case PIC16CC::UGE:
+ return false; // condition codes for unsigned comparison.
+ }
+ }
+
+
+
+ FunctionPass *createPIC16ISelDag(PIC16TargetMachine &TM);
+ FunctionPass *createPIC16CodePrinterPass(raw_ostream &OS,
+ PIC16TargetMachine &TM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose);
+  // Banksel optimizer pass.
+ FunctionPass *createPIC16MemSelOptimizerPass();
+} // end namespace llvm;
+
+// Defines symbolic names for PIC16 registers. This defines a mapping from
+// register name to register number.
+#include "PIC16GenRegisterNames.inc"
+
+// Defines symbolic names for the PIC16 instructions.
+#include "PIC16GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/PIC16/PIC16.td b/lib/Target/PIC16/PIC16.td
new file mode 100644
index 0000000..b2b9b1c
--- /dev/null
+++ b/lib/Target/PIC16/PIC16.td
@@ -0,0 +1,40 @@
+//===- PIC16.td - Describe the PIC16 Target Machine -----------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the PIC16 target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "PIC16RegisterInfo.td"
+include "PIC16InstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+//===----------------------------------------------------------------------===//
+def FeatureCooper : SubtargetFeature<"cooper", "IsCooper", "true",
+ "PIC16 Cooper ISA Support">;
+
+//===----------------------------------------------------------------------===//
+// PIC16 supported processors.
+//===----------------------------------------------------------------------===//
+
+def : Processor<"generic", NoItineraries, []>;
+def : Processor<"cooper", NoItineraries, [FeatureCooper]>;
+
+
+def PIC16InstrInfo : InstrInfo {}
+
+def PIC16 : Target {
+ let InstructionSet = PIC16InstrInfo;
+}
+
diff --git a/lib/Target/PIC16/PIC16AsmPrinter.cpp b/lib/Target/PIC16/PIC16AsmPrinter.cpp
new file mode 100644
index 0000000..ef3bc4b
--- /dev/null
+++ b/lib/Target/PIC16/PIC16AsmPrinter.cpp
@@ -0,0 +1,404 @@
+//===-- PIC16AsmPrinter.cpp - PIC16 LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PIC16 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16AsmPrinter.h"
+#include "PIC16TargetAsmInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+using namespace llvm;
+
+#include "PIC16GenAsmWriter.inc"
+
+bool PIC16AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ printInstruction(MI);
+ return true;
+}
+
+/// runOnMachineFunction - This uses the printInstruction()
+/// method to print assembly for each instruction.
+///
+bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+
+ // This calls the base class function required to be called at beginning
+ // of runOnMachineFunction.
+ SetupMachineFunction(MF);
+
+ // Get the mangled name.
+ const Function *F = MF.getFunction();
+ CurrentFnName = Mang->getValueName(F);
+
+ // Emit the function variables.
+ EmitFunctionFrame(MF);
+
+ // Emit function begin debug directives
+ DbgInfo.EmitFunctBeginDI(F);
+
+ EmitAutos(CurrentFnName);
+  // Keep the section name string alive: taking c_str() of the temporary
+  // returned by getCodeSectionName() would leave a dangling pointer.
+  std::string codeSection = PAN::getCodeSectionName(CurrentFnName);
+
+  const Section *fCodeSection = TAI->getNamedSection(codeSection.c_str(),
+                                                     SectionFlags::Code);
+ O << "\n";
+ // Start the Code Section.
+ SwitchToSection (fCodeSection);
+
+ // Emit the frame address of the function at the beginning of code.
+ O << "\tretlw low(" << PAN::getFrameLabel(CurrentFnName) << ")\n";
+ O << "\tretlw high(" << PAN::getFrameLabel(CurrentFnName) << ")\n";
+
+ // Emit function start label.
+ O << CurrentFnName << ":\n";
+
+ // For emitting line directives, we need to keep track of the current
+ // source line. When it changes then only emit the line directive.
+ unsigned CurLine = 0;
+ O << "\n";
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ printBasicBlockLabel(I, true);
+ O << '\n';
+ }
+
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Emit the line directive if source line changed.
+ const DebugLoc DL = II->getDebugLoc();
+ if (!DL.isUnknown()) {
+ unsigned line = MF.getDebugLocTuple(DL).Line;
+ if (line != CurLine) {
+ O << "\t.line " << line << "\n";
+ CurLine = line;
+ }
+ }
+
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ // Emit function end debug directives.
+ DbgInfo.EmitFunctEndDI(F, CurLine);
+ return false; // we didn't modify anything.
+}
+
+/// createPIC16CodePrinterPass - Returns a pass that prints the PIC16
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createPIC16CodePrinterPass(raw_ostream &o,
+ PIC16TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new PIC16AsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
+
+
+// printOperand - print operand of insn.
+void PIC16AsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ else
+ assert(0 && "not implemented");
+ return;
+
+ case MachineOperand::MO_Immediate:
+ O << (int)MO.getImm();
+ return;
+
+ case MachineOperand::MO_GlobalAddress: {
+ O << Mang->getValueName(MO.getGlobal());
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ const char *Sname = MO.getSymbolName();
+
+      // If it's a libcall name, record it for the decls section.
+ if (PAN::getSymbolTag(Sname) == PAN::LIBCALL) {
+ LibcallDecls.push_back(Sname);
+ }
+
+ O << Sname;
+ break;
+ }
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+
+ default:
+ assert(0 && " Operand type not supported.");
+ }
+}
+
+void PIC16AsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) {
+ int CC = (int)MI->getOperand(opNum).getImm();
+ O << PIC16CondCodeToString((PIC16CC::CondCodes)CC);
+}
+
+void PIC16AsmPrinter::printLibcallDecls(void) {
+ // If no libcalls used, return.
+ if (LibcallDecls.empty()) return;
+
+ O << TAI->getCommentString() << "External decls for libcalls - BEGIN." <<"\n";
+ // Remove duplicate entries.
+ LibcallDecls.sort();
+ LibcallDecls.unique();
+ for (std::list<const char*>::const_iterator I = LibcallDecls.begin();
+ I != LibcallDecls.end(); I++) {
+ O << TAI->getExternDirective() << *I << "\n";
+ O << TAI->getExternDirective() << PAN::getArgsLabel(*I) << "\n";
+ O << TAI->getExternDirective() << PAN::getRetvalLabel(*I) << "\n";
+ }
+ O << TAI->getCommentString() << "External decls for libcalls - END." <<"\n";
+}
+
+bool PIC16AsmPrinter::doInitialization (Module &M) {
+ bool Result = AsmPrinter::doInitialization(M);
+ DbgInfo.EmitFileDirective(M);
+
+  // FIXME: This is a temporary solution to generate the include file.
+  // The processor should be passed to llc as an input and the header file
+  // should be generated accordingly.
+ O << "\n\t#include P16F1937.INC\n";
+ MachineModuleInfo *MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ assert(MMI);
+ DwarfWriter *DW = getAnalysisIfAvailable<DwarfWriter>();
+ assert(DW && "Dwarf Writer is not available");
+ DW->BeginModule(&M, MMI, O, this, TAI);
+
+ // Set the section names for all globals.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ I->setSection(TAI->SectionForGlobal(I)->getName());
+ }
+
+ EmitFunctionDecls(M);
+ EmitUndefinedVars(M);
+ EmitDefinedVars(M);
+ EmitIData(M);
+ EmitUData(M);
+ EmitRomData(M);
+ DbgInfo.PopulateFunctsDI(M);
+ return Result;
+}
+
+// Emit extern decls for functions imported from other modules, and emit
+// global declarations for functions defined in this module which are
+// available to other modules.
+void PIC16AsmPrinter::EmitFunctionDecls (Module &M) {
+ // Emit declarations for external functions.
+ O << TAI->getCommentString() << "Function Declarations - BEGIN." <<"\n";
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; I++) {
+ std::string Name = Mang->getValueName(I);
+ if (Name.compare("@abort") == 0)
+ continue;
+
+    // If it is an LLVM intrinsic call then don't emit it.
+ if (Name.find("llvm.") != std::string::npos)
+ continue;
+
+ if (! (I->isDeclaration() || I->hasExternalLinkage()))
+ continue;
+
+ const char *directive = I->isDeclaration() ? TAI->getExternDirective() :
+ TAI->getGlobalDirective();
+
+ O << directive << Name << "\n";
+ O << directive << PAN::getRetvalLabel(Name) << "\n";
+ O << directive << PAN::getArgsLabel(Name) << "\n";
+ }
+
+ O << TAI->getCommentString() << "Function Declarations - END." <<"\n";
+}
+
+// Emit variables imported from other Modules.
+void PIC16AsmPrinter::EmitUndefinedVars (Module &M)
+{
+ std::vector<const GlobalVariable*> Items = PTAI->ExternalVarDecls->Items;
+ if (! Items.size()) return;
+
+ O << "\n" << TAI->getCommentString() << "Imported Variables - BEGIN" << "\n";
+ for (unsigned j = 0; j < Items.size(); j++) {
+ O << TAI->getExternDirective() << Mang->getValueName(Items[j]) << "\n";
+ }
+ O << TAI->getCommentString() << "Imported Variables - END" << "\n";
+}
+
+// Emit variables defined in this module that are available to other modules.
+void PIC16AsmPrinter::EmitDefinedVars (Module &M)
+{
+ std::vector<const GlobalVariable*> Items = PTAI->ExternalVarDefs->Items;
+ if (! Items.size()) return;
+
+ O << "\n" << TAI->getCommentString() << "Exported Variables - BEGIN" << "\n";
+ for (unsigned j = 0; j < Items.size(); j++) {
+ O << TAI->getGlobalDirective() << Mang->getValueName(Items[j]) << "\n";
+ }
+ O << TAI->getCommentString() << "Exported Variables - END" << "\n";
+}
+
+// Emit initialized data placed in ROM.
+void PIC16AsmPrinter::EmitRomData (Module &M)
+{
+
+ std::vector<const GlobalVariable*> Items = PTAI->ROSection->Items;
+ if (! Items.size()) return;
+
+  // Print the ROData section.
+ O << "\n";
+ SwitchToSection(PTAI->ROSection->S_);
+ for (unsigned j = 0; j < Items.size(); j++) {
+ O << Mang->getValueName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ int AddrSpace = Items[j]->getType()->getAddressSpace();
+ EmitGlobalConstant(C, AddrSpace);
+ }
+}
+
+bool PIC16AsmPrinter::doFinalization(Module &M) {
+ printLibcallDecls();
+ DbgInfo.EmitVarDebugInfo(M);
+ O << "\n\t" << ".EOF";
+ O << "\n\t" << "END\n";
+ bool Result = AsmPrinter::doFinalization(M);
+ return Result;
+}
+
+void PIC16AsmPrinter::EmitFunctionFrame(MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+ std::string FuncName = Mang->getValueName(F);
+ const TargetData *TD = TM.getTargetData();
+ // Emit the data section name.
+ O << "\n";
+  // Keep the section name string alive: taking c_str() of the temporary
+  // returned by getFrameSectionName() would leave a dangling pointer.
+  std::string SectionName = PAN::getFrameSectionName(CurrentFnName);
+
+  const Section *fPDataSection = TAI->getNamedSection(SectionName.c_str(),
+                                       SectionFlags::Writeable);
+ SwitchToSection(fPDataSection);
+
+ // Emit function frame label
+ O << PAN::getFrameLabel(CurrentFnName) << ":\n";
+
+ const Type *RetType = F->getReturnType();
+ unsigned RetSize = 0;
+ if (RetType->getTypeID() != Type::VoidTyID)
+ RetSize = TD->getTypeAllocSize(RetType);
+
+  // Emit function return value space.
+  // FIXME: Do not emit the retval label when RetSize is zero. To do this
+  // we will need to avoid printing a global directive for the retval
+  // label in the extern/global emission.
+  if (RetSize > 0)
+ O << PAN::getRetvalLabel(CurrentFnName) << " RES " << RetSize << "\n";
+ else
+ O << PAN::getRetvalLabel(CurrentFnName) << ": \n";
+
+ // Emit variable to hold the space for function arguments
+ unsigned ArgSize = 0;
+ for (Function::const_arg_iterator argi = F->arg_begin(),
+ arge = F->arg_end(); argi != arge ; ++argi) {
+ const Type *Ty = argi->getType();
+ ArgSize += TD->getTypeAllocSize(Ty);
+ }
+
+ O << PAN::getArgsLabel(CurrentFnName) << " RES " << ArgSize << "\n";
+
+ // Emit temporary space
+ int TempSize = PTLI->GetTmpSize();
+ if (TempSize > 0 )
+ O << PAN::getTempdataLabel(CurrentFnName) << " RES " << TempSize <<"\n";
+}
+
+void PIC16AsmPrinter::EmitIData (Module &M) {
+
+ // Print all IDATA sections.
+ std::vector <PIC16Section *>IDATASections = PTAI->IDATASections;
+ for (unsigned i = 0; i < IDATASections.size(); i++) {
+ O << "\n";
+ SwitchToSection(IDATASections[i]->S_);
+ std::vector<const GlobalVariable*> Items = IDATASections[i]->Items;
+ for (unsigned j = 0; j < Items.size(); j++) {
+ std::string Name = Mang->getValueName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ int AddrSpace = Items[j]->getType()->getAddressSpace();
+ O << Name;
+ EmitGlobalConstant(C, AddrSpace);
+ }
+ }
+}
+
+void PIC16AsmPrinter::EmitUData (Module &M) {
+ const TargetData *TD = TM.getTargetData();
+
+ // Print all BSS sections.
+ std::vector <PIC16Section *>BSSSections = PTAI->BSSSections;
+ for (unsigned i = 0; i < BSSSections.size(); i++) {
+ O << "\n";
+ SwitchToSection(BSSSections[i]->S_);
+ std::vector<const GlobalVariable*> Items = BSSSections[i]->Items;
+ for (unsigned j = 0; j < Items.size(); j++) {
+ std::string Name = Mang->getValueName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ const Type *Ty = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Ty);
+
+ O << Name << " " <<"RES"<< " " << Size ;
+ O << "\n";
+ }
+ }
+}
+
+void PIC16AsmPrinter::EmitAutos (std::string FunctName)
+{
+ // Section names for all globals are already set.
+
+ const TargetData *TD = TM.getTargetData();
+
+ // Now print Autos section for this function.
+ std::string SectionName = PAN::getAutosSectionName(FunctName);
+ std::vector <PIC16Section *>AutosSections = PTAI->AutosSections;
+ for (unsigned i = 0; i < AutosSections.size(); i++) {
+ O << "\n";
+ if (AutosSections[i]->S_->getName() == SectionName) {
+ SwitchToSection(AutosSections[i]->S_);
+ std::vector<const GlobalVariable*> Items = AutosSections[i]->Items;
+ for (unsigned j = 0; j < Items.size(); j++) {
+ std::string VarName = Mang->getValueName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ const Type *Ty = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ // Emit memory reserve directive.
+ O << VarName << " RES " << Size << "\n";
+ }
+ break;
+ }
+ }
+}
+
diff --git a/lib/Target/PIC16/PIC16AsmPrinter.h b/lib/Target/PIC16/PIC16AsmPrinter.h
new file mode 100644
index 0000000..2545dfd
--- /dev/null
+++ b/lib/Target/PIC16/PIC16AsmPrinter.h
@@ -0,0 +1,70 @@
+//===-- PIC16AsmPrinter.h - PIC16 LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PIC16 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16ASMPRINTER_H
+#define PIC16ASMPRINTER_H
+
+#include "PIC16.h"
+#include "PIC16TargetMachine.h"
+#include "PIC16DebugInfo.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "PIC16TargetAsmInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <list>
+#include <string>
+
+namespace llvm {
+ struct VISIBILITY_HIDDEN PIC16AsmPrinter : public AsmPrinter {
+ explicit PIC16AsmPrinter(raw_ostream &O, PIC16TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V), DbgInfo(O,T) {
+ PTLI = TM.getTargetLowering();
+ PTAI = static_cast<const PIC16TargetAsmInfo *> (T);
+ }
+  private:
+ virtual const char *getPassName() const {
+ return "PIC16 Assembly Printer";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F);
+ void printOperand(const MachineInstr *MI, int opNum);
+ void printCCOperand(const MachineInstr *MI, int opNum);
+ bool printInstruction(const MachineInstr *MI); // definition autogenerated.
+ bool printMachineInstruction(const MachineInstr *MI);
+ void EmitFunctionDecls (Module &M);
+ void EmitUndefinedVars (Module &M);
+ void EmitDefinedVars (Module &M);
+ void EmitIData (Module &M);
+ void EmitUData (Module &M);
+ void EmitAutos (std::string FunctName);
+ void EmitRomData (Module &M);
+ void EmitFunctionFrame(MachineFunction &MF);
+ void printLibcallDecls(void);
+ protected:
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ private:
+ PIC16TargetLowering *PTLI;
+ PIC16DbgInfo DbgInfo;
+ const PIC16TargetAsmInfo *PTAI;
+ std::list<const char *> LibcallDecls; // List of extern decls.
+ };
+} // end of namespace
+
+#endif
diff --git a/lib/Target/PIC16/PIC16DebugInfo.cpp b/lib/Target/PIC16/PIC16DebugInfo.cpp
new file mode 100644
index 0000000..4d43811
--- /dev/null
+++ b/lib/Target/PIC16/PIC16DebugInfo.cpp
@@ -0,0 +1,270 @@
+//===-- PIC16DebugInfo.cpp - Implementation for PIC16 Debug Information ======//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the helper functions for representing debug information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "PIC16DebugInfo.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+PIC16DbgInfo::~PIC16DbgInfo() {
+ for(std::map<std::string, DISubprogram *>::iterator i = FunctNameMap.begin();
+ i!=FunctNameMap.end(); i++)
+ delete i->second;
+ FunctNameMap.clear();
+}
+
+void PIC16DbgInfo::PopulateDebugInfo(DIType Ty, unsigned short &TypeNo,
+ bool &HasAux, int Aux[],
+ std::string &TypeName) {
+ if (Ty.isBasicType(Ty.getTag())) {
+ std::string Name = "";
+ Ty.getName(Name);
+ unsigned short BaseTy = GetTypeDebugNumber(Name);
+ TypeNo = TypeNo << PIC16Dbg::S_BASIC;
+ TypeNo = TypeNo | (0xffff & BaseTy);
+ }
+ else if (Ty.isDerivedType(Ty.getTag())) {
+    switch (Ty.getTag()) {
+ case dwarf::DW_TAG_pointer_type:
+ TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
+ TypeNo = TypeNo | PIC16Dbg::DT_PTR;
+ break;
+ default:
+ TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
+ }
+ DIType BaseType = DIDerivedType(Ty.getGV()).getTypeDerivedFrom();
+ PopulateDebugInfo(BaseType, TypeNo, HasAux, Aux, TypeName);
+ }
+ else if (Ty.isCompositeType(Ty.getTag())) {
+ switch (Ty.getTag()) {
+ case dwarf::DW_TAG_array_type: {
+ DICompositeType CTy = DICompositeType(Ty.getGV());
+ DIArray Elements = CTy.getTypeArray();
+ unsigned short size = 1;
+ unsigned short Dimension[4]={0,0,0,0};
+ for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+ DIDescriptor Element = Elements.getElement(i);
+ if (Element.getTag() == dwarf::DW_TAG_subrange_type) {
+ TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
+ TypeNo = TypeNo | PIC16Dbg::DT_ARY;
+ DISubrange SubRange = DISubrange(Element.getGV());
+ Dimension[i] = SubRange.getHi() - SubRange.getLo() + 1;
+ // Each dimension is represented by 2 bytes starting at byte 9.
+ Aux[8+i*2+0] = Dimension[i];
+ Aux[8+i*2+1] = Dimension[i] >> 8;
+ size = size * Dimension[i];
+ }
+ }
+ HasAux = true;
+        // In the auxiliary entry for an array, the 7th and 8th bytes
+        // hold the array size.
+ Aux[6] = size;
+ Aux[7] = size >> 8;
+ DIType BaseType = CTy.getTypeDerivedFrom();
+ PopulateDebugInfo(BaseType, TypeNo, HasAux, Aux, TypeName);
+
+ break;
+ }
+      case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_structure_type: {
+ DICompositeType CTy = DICompositeType(Ty.getGV());
+ TypeNo = TypeNo << PIC16Dbg::S_BASIC;
+ if (Ty.getTag() == dwarf::DW_TAG_structure_type)
+ TypeNo = TypeNo | PIC16Dbg::T_STRUCT;
+ else
+ TypeNo = TypeNo | PIC16Dbg::T_UNION;
+ CTy.getName(TypeName);
+ unsigned size = CTy.getSizeInBits()/8;
+        // The 7th and 8th bytes hold the size.
+ HasAux = true;
+ Aux[6] = size;
+ Aux[7] = size >> 8;
+ break;
+ }
+ case dwarf::DW_TAG_enumeration_type: {
+ TypeNo = TypeNo << PIC16Dbg::S_BASIC;
+ TypeNo = TypeNo | PIC16Dbg::T_ENUM;
+ break;
+ }
+ default:
+ TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
+ }
+ }
+ else {
+ TypeNo = PIC16Dbg::T_NULL;
+ HasAux = false;
+ }
+ return;
+}
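+
+// For illustration, how the packed TypeNo is built: for a pointer to
+// unsigned char, the derived step shifts TypeNo left by S_DERIVED (3)
+// and ORs in DT_PTR (1); the recursion on the base type then shifts
+// left by S_BASIC (5) and ORs in T_UCHAR (12), giving
+// ((0 << 3 | 1) << 5) | 12 = 44.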
+
+
+unsigned PIC16DbgInfo::GetTypeDebugNumber(std::string &type) {
+ if (type == "char")
+ return PIC16Dbg::T_CHAR;
+ else if (type == "short")
+ return PIC16Dbg::T_SHORT;
+ else if (type == "int")
+ return PIC16Dbg::T_INT;
+ else if (type == "long")
+ return PIC16Dbg::T_LONG;
+ else if (type == "unsigned char")
+ return PIC16Dbg::T_UCHAR;
+ else if (type == "unsigned short")
+ return PIC16Dbg::T_USHORT;
+ else if (type == "unsigned int")
+ return PIC16Dbg::T_UINT;
+ else if (type == "unsigned long")
+ return PIC16Dbg::T_ULONG;
+ else
+ return 0;
+}
+
+short PIC16DbgInfo::getClass(DIGlobalVariable DIGV) {
+ short ClassNo;
+ if (PAN::isLocalName(DIGV.getGlobal()->getName())) {
+ // Generating C_AUTO here fails due to error in linker. Change it once
+ // linker is fixed.
+ ClassNo = PIC16Dbg::C_STAT;
+ }
+ else if (DIGV.isLocalToUnit())
+ ClassNo = PIC16Dbg::C_STAT;
+ else
+ ClassNo = PIC16Dbg::C_EXT;
+ return ClassNo;
+}
+
+void PIC16DbgInfo::PopulateFunctsDI(Module &M) {
+ GlobalVariable *Root = M.getGlobalVariable("llvm.dbg.subprograms");
+ if (!Root)
+ return;
+ Constant *RootC = cast<Constant>(*Root->use_begin());
+
+  // Iterate over RootC's own use list; mixing Root's end iterator with
+  // RootC's begin iterator would compare iterators of different lists.
+  for (Value::use_iterator UI = RootC->use_begin(), UE = RootC->use_end();
+ UI != UE; ++UI)
+ for (Value::use_iterator UUI = UI->use_begin(), UUE = UI->use_end();
+ UUI != UUE; ++UUI) {
+ GlobalVariable *GVSP = cast<GlobalVariable>(*UUI);
+ DISubprogram *SP = new DISubprogram(GVSP);
+ std::string Name;
+ SP->getLinkageName(Name);
+ FunctNameMap[Name] = SP;
+ }
+ return;
+}
+
+DISubprogram* PIC16DbgInfo::getFunctDI(std::string FunctName) {
+ return FunctNameMap[FunctName];
+}
+
+void PIC16DbgInfo::EmitFunctBeginDI(const Function *F) {
+ std::string FunctName = F->getName();
+ DISubprogram *SP = getFunctDI(FunctName);
+ if (SP) {
+ std::string FunctBeginSym = ".bf." + FunctName;
+ std::string BlockBeginSym = ".bb." + FunctName;
+
+ int FunctBeginLine = SP->getLineNumber();
+ int BFAux[PIC16Dbg::AuxSize] = {0};
+ BFAux[4] = FunctBeginLine;
+ BFAux[5] = FunctBeginLine >> 8;
+ // Emit debug directives for beginning of function.
+ EmitSymbol(FunctBeginSym, PIC16Dbg::C_FCN);
+ EmitAuxEntry(FunctBeginSym, BFAux, PIC16Dbg::AuxSize);
+ EmitSymbol(BlockBeginSym, PIC16Dbg::C_BLOCK);
+ EmitAuxEntry(BlockBeginSym, BFAux, PIC16Dbg::AuxSize);
+ }
+}
+
+void PIC16DbgInfo::EmitFunctEndDI(const Function *F, unsigned Line) {
+ std::string FunctName = F->getName();
+ DISubprogram *SP = getFunctDI(FunctName);
+ if (SP) {
+ std::string FunctEndSym = ".ef." + FunctName;
+ std::string BlockEndSym = ".eb." + FunctName;
+
+ // Emit debug directives for end of function.
+ EmitSymbol(BlockEndSym, PIC16Dbg::C_BLOCK);
+ int EFAux[PIC16Dbg::AuxSize] = {0};
+    // The 5th and 6th bytes hold the line number.
+ EFAux[4] = Line;
+ EFAux[5] = Line >> 8;
+ EmitAuxEntry(BlockEndSym, EFAux, PIC16Dbg::AuxSize);
+ EmitSymbol(FunctEndSym, PIC16Dbg::C_FCN);
+ EmitAuxEntry(FunctEndSym, EFAux, PIC16Dbg::AuxSize);
+ }
+}
+
+/// EmitAuxEntry - Emit Auxiliary debug information.
+///
+void PIC16DbgInfo::EmitAuxEntry(const std::string VarName, int Aux[], int num) {
+ O << "\n\t.dim " << VarName << ", 1" ;
+ for (int i = 0; i<num; i++)
+ O << "," << Aux[i];
+}
+
+void PIC16DbgInfo::EmitSymbol(std::string Name, int Class) {
+ O << "\n\t" << ".def "<< Name << ", debug, class = " << Class;
+}
+
+void PIC16DbgInfo::EmitVarDebugInfo(Module &M) {
+ GlobalVariable *Root = M.getGlobalVariable("llvm.dbg.global_variables");
+ if (!Root)
+ return;
+
+ Constant *RootC = cast<Constant>(*Root->use_begin());
+  for (Value::use_iterator UI = RootC->use_begin(), UE = RootC->use_end();
+ UI != UE; ++UI) {
+ for (Value::use_iterator UUI = UI->use_begin(), UUE = UI->use_end();
+ UUI != UUE; ++UUI) {
+ DIGlobalVariable DIGV(cast<GlobalVariable>(*UUI));
+ DIType Ty = DIGV.getType();
+ unsigned short TypeNo = 0;
+ bool HasAux = false;
+ int Aux[PIC16Dbg::AuxSize] = { 0 };
+ std::string TypeName = "";
+ std::string VarName = TAI->getGlobalPrefix()+DIGV.getGlobal()->getName();
+ PopulateDebugInfo(Ty, TypeNo, HasAux, Aux, TypeName);
+        // Emit debug info only if type information is available.
+ if (TypeNo != PIC16Dbg::T_NULL) {
+ O << "\n\t.type " << VarName << ", " << TypeNo;
+ short ClassNo = getClass(DIGV);
+ O << "\n\t.class " << VarName << ", " << ClassNo;
+ if (HasAux) {
+ if (TypeName != "") {
+            // Emit debug info for structure and union objects once the
+            // .dim directive supports structure/union tag names in the
+            // aux entry.
+ /* O << "\n\t.dim " << VarName << ", 1," << TypeName;
+ for (int i = 0; i<PIC16Dbg::AuxSize; i++)
+ O << "," << Aux[i];*/
+ }
+ else {
+ EmitAuxEntry(VarName, Aux, PIC16Dbg::AuxSize);
+ }
+ }
+ }
+ }
+ }
+ O << "\n";
+}
+
+void PIC16DbgInfo::EmitFileDirective(Module &M) {
+ GlobalVariable *CU = M.getNamedGlobal("llvm.dbg.compile_unit");
+ if (CU) {
+ DICompileUnit DIUnit(CU);
+ std::string Dir, FN;
+ O << "\n\t.file\t\"" << DIUnit.getDirectory(Dir) <<"/"
+ << DIUnit.getFilename(FN) << "\"" ;
+ }
+}
diff --git a/lib/Target/PIC16/PIC16DebugInfo.h b/lib/Target/PIC16/PIC16DebugInfo.h
new file mode 100644
index 0000000..96b23da
--- /dev/null
+++ b/lib/Target/PIC16/PIC16DebugInfo.h
@@ -0,0 +1,114 @@
+//===-- PIC16DebugInfo.h - Interfaces for PIC16 Debug Information ============//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the helper functions for representing debug information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16DBG_H
+#define PIC16DBG_H
+
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include <map>
+
+namespace llvm {
+ namespace PIC16Dbg {
+ enum VarType {
+ T_NULL,
+ T_VOID,
+ T_CHAR,
+ T_SHORT,
+ T_INT,
+ T_LONG,
+ T_FLOAT,
+ T_DOUBLE,
+ T_STRUCT,
+ T_UNION,
+ T_ENUM,
+ T_MOE,
+ T_UCHAR,
+ T_USHORT,
+ T_UINT,
+ T_ULONG
+ };
+ enum DerivedType {
+ DT_NONE,
+ DT_PTR,
+ DT_FCN,
+ DT_ARY
+ };
+ enum TypeSize {
+ S_BASIC = 5,
+ S_DERIVED = 3
+ };
+ enum DbgClass {
+ C_NULL,
+ C_AUTO,
+ C_EXT,
+ C_STAT,
+ C_REG,
+ C_EXTDEF,
+ C_LABEL,
+ C_ULABEL,
+ C_MOS,
+ C_ARG,
+ C_STRTAG,
+ C_MOU,
+ C_UNTAG,
+ C_TPDEF,
+ C_USTATIC,
+ C_ENTAG,
+ C_MOE,
+ C_REGPARM,
+ C_FIELD,
+ C_AUTOARG,
+ C_LASTENT,
+ C_BLOCK = 100,
+ C_FCN,
+ C_EOS,
+ C_FILE,
+ C_LINE,
+ C_ALIAS,
+ C_HIDDEN,
+ C_EOF,
+ C_LIST,
+ C_SECTION,
+ C_EFCN = 255
+ };
+ enum SymbolSize {
+      AuxSize = 20
+ };
+ }
+
+ class raw_ostream;
+
+ class PIC16DbgInfo {
+ std::map <std::string, DISubprogram *> FunctNameMap;
+ raw_ostream &O;
+ const TargetAsmInfo *TAI;
+ public:
+ PIC16DbgInfo(raw_ostream &o, const TargetAsmInfo *T) : O(o), TAI(T) {}
+ ~PIC16DbgInfo();
+ void PopulateDebugInfo(DIType Ty, unsigned short &TypeNo, bool &HasAux,
+ int Aux[], std::string &TypeName);
+ unsigned GetTypeDebugNumber(std::string &type);
+ short getClass(DIGlobalVariable DIGV);
+ void PopulateFunctsDI(Module &M);
+ DISubprogram *getFunctDI(std::string FunctName);
+ void EmitFunctBeginDI(const Function *F);
+ void EmitFunctEndDI(const Function *F, unsigned Line);
+ void EmitAuxEntry(const std::string VarName, int Aux[], int num);
+ inline void EmitSymbol(std::string Name, int Class);
+ void EmitVarDebugInfo(Module &M);
+ void EmitFileDirective(Module &M);
+ };
+} // end namespace llvm;
+#endif
diff --git a/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp b/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp
new file mode 100644
index 0000000..6c2b8ec
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp
@@ -0,0 +1,59 @@
+//===-- PIC16ISelDAGToDAG.cpp - A dag to dag inst selector for PIC16 ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the PIC16 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-isel"
+
+#include "PIC16ISelDAGToDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+/// createPIC16ISelDag - This pass converts a legalized DAG into a
+/// PIC16-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createPIC16ISelDag(PIC16TargetMachine &TM) {
+ return new PIC16DAGToDAGISel(TM);
+}
+
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void PIC16DAGToDAGISel::InstructionSelect() {
+ DEBUG(BB->dump());
+ SelectRoot(*CurDAG);
+ CurDAG->RemoveDeadNodes();
+}
+
+/// Select - Select instructions that are not custom-lowered. Used for
+/// expanded, promoted and normal instructions.
+SDNode* PIC16DAGToDAGISel::Select(SDValue N) {
+
+ // Select the default instruction.
+ SDNode *ResNode = SelectCode(N);
+
+ return ResNode;
+}
+
+
+// SelectDirectAddr - Match a direct address for DAG.
+// A direct address could be a globaladdress or externalsymbol.
+bool PIC16DAGToDAGISel::SelectDirectAddr(SDValue Op, SDValue N,
+ SDValue &Address) {
+ // Return true if TGA or ES.
+ if (N.getOpcode() == ISD::TargetGlobalAddress
+ || N.getOpcode() == ISD::TargetExternalSymbol) {
+ Address = N;
+ return true;
+ }
+
+ return false;
+}
diff --git a/lib/Target/PIC16/PIC16ISelDAGToDAG.h b/lib/Target/PIC16/PIC16ISelDAGToDAG.h
new file mode 100644
index 0000000..83abed3
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ISelDAGToDAG.h
@@ -0,0 +1,60 @@
+//===-- PIC16ISelDAGToDAG.h - A dag to dag inst selector for PIC16 --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the PIC16 target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16ISELDAGTODAG_H
+#define PIC16ISELDAGTODAG_H
+
+#define DEBUG_TYPE "pic16-isel"
+
+#include "PIC16.h"
+#include "PIC16ISelLowering.h"
+#include "PIC16RegisterInfo.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Intrinsics.h"
+using namespace llvm;
+
+namespace {
+
+class VISIBILITY_HIDDEN PIC16DAGToDAGISel : public SelectionDAGISel {
+
+ /// TM - Keep a reference to PIC16TargetMachine.
+ PIC16TargetMachine &TM;
+
+  /// PIC16Lowering - This object fully describes how to lower LLVM code to a
+ /// PIC16-specific SelectionDAG.
+ PIC16TargetLowering PIC16Lowering;
+
+public:
+ explicit PIC16DAGToDAGISel(PIC16TargetMachine &tm) :
+ SelectionDAGISel(tm),
+ TM(tm), PIC16Lowering(*TM.getTargetLowering()) {}
+
+ // Pass Name
+ virtual const char *getPassName() const {
+ return "PIC16 DAG->DAG Pattern Instruction Selection";
+ }
+
+ virtual void InstructionSelect();
+
+private:
+ // Include the pieces autogenerated from the target description.
+#include "PIC16GenDAGISel.inc"
+
+ SDNode *Select(SDValue N);
+
+ // Match direct address complex pattern.
+ bool SelectDirectAddr(SDValue Op, SDValue N, SDValue &Address);
+
+};
+
+}
+
+#endif
diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp
new file mode 100644
index 0000000..92fdcb2
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ISelLowering.cpp
@@ -0,0 +1,1756 @@
+//===-- PIC16ISelLowering.cpp - PIC16 DAG Lowering Implementation --------===//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PIC16 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-lower"
+
+#include "PIC16ISelLowering.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Function.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+
+using namespace llvm;
+
+static const char *getIntrinsicName(unsigned opcode) {
+ std::string Basename;
+ switch(opcode) {
+ default: assert (0 && "do not know intrinsic name");
+ case PIC16ISD::SRA_I8: Basename = "sra.i8"; break;
+ case RTLIB::SRA_I16: Basename = "sra.i16"; break;
+ case RTLIB::SRA_I32: Basename = "sra.i32"; break;
+
+ case PIC16ISD::SLL_I8: Basename = "sll.i8"; break;
+ case RTLIB::SHL_I16: Basename = "sll.i16"; break;
+ case RTLIB::SHL_I32: Basename = "sll.i32"; break;
+
+ case PIC16ISD::SRL_I8: Basename = "srl.i8"; break;
+ case RTLIB::SRL_I16: Basename = "srl.i16"; break;
+ case RTLIB::SRL_I32: Basename = "srl.i32"; break;
+
+ case PIC16ISD::MUL_I8: Basename = "mul.i8"; break;
+ case RTLIB::MUL_I16: Basename = "mul.i16"; break;
+ case RTLIB::MUL_I32: Basename = "mul.i32"; break;
+ }
+
+ std::string prefix = PAN::getTagName(PAN::PREFIX_SYMBOL);
+ std::string tagname = PAN::getTagName(PAN::LIBCALL);
+ std::string Fullname = prefix + tagname + Basename;
+
+  // The name has to live for the whole life of the program.
+ char *tmp = new char[Fullname.size() + 1];
+ strcpy (tmp, Fullname.c_str());
+
+ return tmp;
+}
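+
+// For illustration: getIntrinsicName(PIC16ISD::MUL_I8) returns a
+// heap-allocated name of the form <prefix><libcall-tag>"mul.i8", where
+// both tags come from PAN::getTagName. The buffer is deliberately never
+// freed, since libcall names must outlive lowering.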
+
+// PIC16TargetLowering Constructor.
+PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM)
+ : TargetLowering(TM), TmpSize(0) {
+
+ Subtarget = &TM.getSubtarget<PIC16Subtarget>();
+
+ addRegisterClass(MVT::i8, PIC16::GPRRegisterClass);
+
+ setShiftAmountType(MVT::i8);
+ setShiftAmountFlavor(Extend);
+
+ // SRA library call names
+ setPIC16LibcallName(PIC16ISD::SRA_I8, getIntrinsicName(PIC16ISD::SRA_I8));
+ setLibcallName(RTLIB::SRA_I16, getIntrinsicName(RTLIB::SRA_I16));
+ setLibcallName(RTLIB::SRA_I32, getIntrinsicName(RTLIB::SRA_I32));
+
+ // SHL library call names
+ setPIC16LibcallName(PIC16ISD::SLL_I8, getIntrinsicName(PIC16ISD::SLL_I8));
+ setLibcallName(RTLIB::SHL_I16, getIntrinsicName(RTLIB::SHL_I16));
+ setLibcallName(RTLIB::SHL_I32, getIntrinsicName(RTLIB::SHL_I32));
+
+ // SRL library call names
+ setPIC16LibcallName(PIC16ISD::SRL_I8, getIntrinsicName(PIC16ISD::SRL_I8));
+ setLibcallName(RTLIB::SRL_I16, getIntrinsicName(RTLIB::SRL_I16));
+ setLibcallName(RTLIB::SRL_I32, getIntrinsicName(RTLIB::SRL_I32));
+
+ // MUL Library call names
+ setPIC16LibcallName(PIC16ISD::MUL_I8, getIntrinsicName(PIC16ISD::MUL_I8));
+ setLibcallName(RTLIB::MUL_I16, getIntrinsicName(RTLIB::MUL_I16));
+ setLibcallName(RTLIB::MUL_I32, getIntrinsicName(RTLIB::MUL_I32));
+
+ setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom);
+
+ setOperationAction(ISD::LOAD, MVT::i8, Legal);
+ setOperationAction(ISD::LOAD, MVT::i16, Custom);
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+
+ setOperationAction(ISD::STORE, MVT::i8, Legal);
+ setOperationAction(ISD::STORE, MVT::i16, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+
+ setOperationAction(ISD::ADDE, MVT::i8, Custom);
+ setOperationAction(ISD::ADDC, MVT::i8, Custom);
+ setOperationAction(ISD::SUBE, MVT::i8, Custom);
+ setOperationAction(ISD::SUBC, MVT::i8, Custom);
+ setOperationAction(ISD::ADD, MVT::i8, Custom);
+ setOperationAction(ISD::ADD, MVT::i16, Custom);
+
+ setOperationAction(ISD::OR, MVT::i8, Custom);
+ setOperationAction(ISD::AND, MVT::i8, Custom);
+ setOperationAction(ISD::XOR, MVT::i8, Custom);
+
+ setOperationAction(ISD::FrameIndex, MVT::i16, Custom);
+ setOperationAction(ISD::CALL, MVT::i16, Custom);
+ setOperationAction(ISD::RET, MVT::Other, Custom);
+
+ setOperationAction(ISD::MUL, MVT::i8, Custom);
+ setOperationAction(ISD::MUL, MVT::i16, Expand);
+ setOperationAction(ISD::MUL, MVT::i32, Expand);
+
+ setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::MULHU, MVT::i8, Expand);
+ setOperationAction(ISD::MULHU, MVT::i16, Expand);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i8, Expand);
+ setOperationAction(ISD::MULHS, MVT::i16, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
+ setOperationAction(ISD::SRA, MVT::i8, Custom);
+ setOperationAction(ISD::SRA, MVT::i16, Expand);
+ setOperationAction(ISD::SRA, MVT::i32, Expand);
+ setOperationAction(ISD::SHL, MVT::i8, Custom);
+ setOperationAction(ISD::SHL, MVT::i16, Expand);
+ setOperationAction(ISD::SHL, MVT::i32, Expand);
+ setOperationAction(ISD::SRL, MVT::i8, Custom);
+ setOperationAction(ISD::SRL, MVT::i16, Expand);
+ setOperationAction(ISD::SRL, MVT::i32, Expand);
+
+ // PIC16 does not support shift parts
+ setOperationAction(ISD::SRA_PARTS, MVT::i8, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i8, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i8, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+
+ // PIC16 does not have a SETCC, expand it to SELECT_CC.
+ setOperationAction(ISD::SETCC, MVT::i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::i8, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i8, Custom);
+
+ //setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
+ setTruncStoreAction(MVT::i16, MVT::i8, Custom);
+
+  // Now deduce the information based on the above-mentioned actions.
+ computeRegisterProperties();
+}
+
+// getOutFlag - Extract the flag result if the Op has it.
+static SDValue getOutFlag(SDValue &Op) {
+ // Flag is the last value of the node.
+ SDValue Flag = Op.getValue(Op.getNode()->getNumValues() - 1);
+
+ assert (Flag.getValueType() == MVT::Flag
+ && "Node does not have an out Flag");
+
+ return Flag;
+}
+// Get the TmpOffset for FrameIndex
+unsigned PIC16TargetLowering::GetTmpOffsetForFI(unsigned FI, unsigned size) {
+ std::map<unsigned, unsigned>::iterator
+ MapIt = FiTmpOffsetMap.find(FI);
+ if (MapIt != FiTmpOffsetMap.end())
+ return MapIt->second;
+
+ // This FI (FrameIndex) is not yet mapped, so map it
+ FiTmpOffsetMap[FI] = TmpSize;
+ TmpSize += size;
+ return FiTmpOffsetMap[FI];
+}
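+
+// For illustration: starting with an empty map and TmpSize == 0, a call
+// GetTmpOffsetForFI(5, 2) returns 0 and bumps TmpSize to 2; a later call
+// GetTmpOffsetForFI(7, 1) then returns 2 (the FI numbers are arbitrary).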
+
+// Extract the chain value from an SDValue node.
+// Keeping the chain-extraction code in one place makes it easier to
+// maintain if anything changes in the future.
+static SDValue getChain(SDValue &Op) {
+ SDValue Chain = Op.getValue(Op.getNode()->getNumValues() - 1);
+
+  // If the last value returned is a Flag, then the chain is the
+  // second-to-last value returned.
+ if (Chain.getValueType() == MVT::Flag)
+ Chain = Op.getValue(Op.getNode()->getNumValues() - 2);
+
+  // Not all nodes produce a chain, so the following assert verifies
+  // that this node actually returns a chain.
+ assert (Chain.getValueType() == MVT::Other
+ && "Node does not have a chain");
+
+ return Chain;
+}
+
+/// PopulateResults - Helper function to LowerOperation.
+/// If a node wants to return multiple results after lowering,
+/// it stuffs them into an array of SDValue called Results.
+
+static void PopulateResults(SDValue N, SmallVectorImpl<SDValue>&Results) {
+ if (N.getOpcode() == ISD::MERGE_VALUES) {
+ int NumResults = N.getNumOperands();
+ for( int i = 0; i < NumResults; i++)
+ Results.push_back(N.getOperand(i));
+ }
+ else
+ Results.push_back(N);
+}
+
+MVT PIC16TargetLowering::getSetCCResultType(MVT ValType) const {
+ return MVT::i8;
+}
+
+/// The type legalizer framework can generate libcalls only when the
+/// operand/result types are illegal.
+/// PIC16 needs to generate libcalls even for legal types (i8) for some ops,
+/// for example an arithmetic right shift. These functions are used to lower
+/// such operations, generating libcalls for legal types.
+
+void
+PIC16TargetLowering::setPIC16LibcallName(PIC16ISD::PIC16Libcall Call,
+ const char *Name) {
+ PIC16LibcallNames[Call] = Name;
+}
+
+const char *
+PIC16TargetLowering::getPIC16LibcallName(PIC16ISD::PIC16Libcall Call) {
+ return PIC16LibcallNames[Call];
+}
+
+SDValue
+PIC16TargetLowering::MakePIC16Libcall(PIC16ISD::PIC16Libcall Call,
+ MVT RetVT, const SDValue *Ops,
+ unsigned NumOps, bool isSigned,
+ SelectionDAG &DAG, DebugLoc dl) {
+
+ TargetLowering::ArgListTy Args;
+ Args.reserve(NumOps);
+
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ Entry.Node = Ops[i];
+ Entry.Ty = Entry.Node.getValueType().getTypeForMVT();
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+ SDValue Callee = DAG.getExternalSymbol(getPIC16LibcallName(Call), MVT::i8);
+
+ const Type *RetTy = RetVT.getTypeForMVT();
+ std::pair<SDValue,SDValue> CallInfo =
+ LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+ false, CallingConv::C, false, Callee, Args, DAG, dl);
+
+ return CallInfo.first;
+}
+
+const char *PIC16TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return NULL;
+ case PIC16ISD::Lo: return "PIC16ISD::Lo";
+ case PIC16ISD::Hi: return "PIC16ISD::Hi";
+ case PIC16ISD::MTLO: return "PIC16ISD::MTLO";
+ case PIC16ISD::MTHI: return "PIC16ISD::MTHI";
+ case PIC16ISD::MTPCLATH: return "PIC16ISD::MTPCLATH";
+ case PIC16ISD::PIC16Connect: return "PIC16ISD::PIC16Connect";
+ case PIC16ISD::Banksel: return "PIC16ISD::Banksel";
+ case PIC16ISD::PIC16Load: return "PIC16ISD::PIC16Load";
+ case PIC16ISD::PIC16LdArg: return "PIC16ISD::PIC16LdArg";
+ case PIC16ISD::PIC16LdWF: return "PIC16ISD::PIC16LdWF";
+ case PIC16ISD::PIC16Store: return "PIC16ISD::PIC16Store";
+ case PIC16ISD::PIC16StWF: return "PIC16ISD::PIC16StWF";
+ case PIC16ISD::BCF: return "PIC16ISD::BCF";
+ case PIC16ISD::LSLF: return "PIC16ISD::LSLF";
+ case PIC16ISD::LRLF: return "PIC16ISD::LRLF";
+ case PIC16ISD::RLF: return "PIC16ISD::RLF";
+ case PIC16ISD::RRF: return "PIC16ISD::RRF";
+ case PIC16ISD::CALL: return "PIC16ISD::CALL";
+ case PIC16ISD::CALLW: return "PIC16ISD::CALLW";
+ case PIC16ISD::SUBCC: return "PIC16ISD::SUBCC";
+ case PIC16ISD::SELECT_ICC: return "PIC16ISD::SELECT_ICC";
+ case PIC16ISD::BRCOND: return "PIC16ISD::BRCOND";
+ case PIC16ISD::Dummy: return "PIC16ISD::Dummy";
+ }
+}
+
+void PIC16TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) {
+
+ switch (N->getOpcode()) {
+ case ISD::GlobalAddress:
+ Results.push_back(ExpandGlobalAddress(N, DAG));
+ return;
+ case ISD::ExternalSymbol:
+ Results.push_back(ExpandExternalSymbol(N, DAG));
+ return;
+ case ISD::STORE:
+ Results.push_back(ExpandStore(N, DAG));
+ return;
+ case ISD::LOAD:
+ PopulateResults(ExpandLoad(N, DAG), Results);
+ return;
+ case ISD::ADD:
+ // Results.push_back(ExpandAdd(N, DAG));
+ return;
+ case ISD::FrameIndex:
+ Results.push_back(ExpandFrameIndex(N, DAG));
+ return;
+ default:
+ assert (0 && "not implemented");
+ return;
+ }
+}
+
+SDValue PIC16TargetLowering::ExpandFrameIndex(SDNode *N, SelectionDAG &DAG) {
+
+ // Currently handling FrameIndex of size MVT::i16 only
+ // One example of this scenario is when return value is written on
+ // FrameIndex#0
+
+ if (N->getValueType(0) != MVT::i16)
+ return SDValue();
+
+  // Expand the FrameIndex into an ExternalSymbol and a Constant node;
+  // the constant will represent the frame index number.
+
+ FrameIndexSDNode *FR = dyn_cast<FrameIndexSDNode>(SDValue(N,0));
+ // FIXME there isn't really debug info here
+ DebugLoc dl = FR->getDebugLoc();
+ int Index = FR->getIndex();
+
+ // Expand FrameIndex like GlobalAddress and ExternalSymbol
+ // Also use Offset field for lo and hi parts. The default
+ // offset is zero.
+ SDValue Offset = DAG.getConstant(0, MVT::i8);
+ SDValue FI = DAG.getTargetFrameIndex(Index, MVT::i8);
+ SDValue Lo = DAG.getNode(PIC16ISD::Lo, dl, MVT::i8, FI, Offset);
+ SDValue Hi = DAG.getNode(PIC16ISD::Hi, dl, MVT::i8, FI, Offset);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0), Lo, Hi);
+}
+
+
+SDValue PIC16TargetLowering::ExpandStore(SDNode *N, SelectionDAG &DAG) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ SDValue Chain = St->getChain();
+ SDValue Src = St->getValue();
+ SDValue Ptr = St->getBasePtr();
+ MVT ValueType = Src.getValueType();
+ unsigned StoreOffset = 0;
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue PtrLo, PtrHi;
+ LegalizeAddress(Ptr, DAG, PtrLo, PtrHi, StoreOffset, dl);
+
+ if (ValueType == MVT::i8) {
+ return DAG.getNode (PIC16ISD::PIC16Store, dl, MVT::Other, Chain, Src,
+ PtrLo, PtrHi,
+ DAG.getConstant (0 + StoreOffset, MVT::i8));
+ }
+ else if (ValueType == MVT::i16) {
+ // Get the Lo and Hi parts from MERGE_VALUE or BUILD_PAIR.
+ SDValue SrcLo, SrcHi;
+ GetExpandedParts(Src, DAG, SrcLo, SrcHi);
+ SDValue ChainLo = Chain, ChainHi = Chain;
+ if (Chain.getOpcode() == ISD::TokenFactor) {
+ ChainLo = Chain.getOperand(0);
+ ChainHi = Chain.getOperand(1);
+ }
+ SDValue Store1 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other,
+ ChainLo,
+ SrcLo, PtrLo, PtrHi,
+ DAG.getConstant (0 + StoreOffset, MVT::i8));
+
+ SDValue Store2 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other, ChainHi,
+ SrcHi, PtrLo, PtrHi,
+ DAG.getConstant (1 + StoreOffset, MVT::i8));
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, getChain(Store1),
+ getChain(Store2));
+ }
+ else if (ValueType == MVT::i32) {
+ // Get the Lo and Hi parts from MERGE_VALUE or BUILD_PAIR.
+ SDValue SrcLo, SrcHi;
+ GetExpandedParts(Src, DAG, SrcLo, SrcHi);
+
+ // Get the expanded parts of each of SrcLo and SrcHi.
+ SDValue SrcLo1, SrcLo2, SrcHi1, SrcHi2;
+ GetExpandedParts(SrcLo, DAG, SrcLo1, SrcLo2);
+ GetExpandedParts(SrcHi, DAG, SrcHi1, SrcHi2);
+
+ SDValue ChainLo = Chain, ChainHi = Chain;
+ if (Chain.getOpcode() == ISD::TokenFactor) {
+ ChainLo = Chain.getOperand(0);
+ ChainHi = Chain.getOperand(1);
+ }
+ SDValue ChainLo1 = ChainLo, ChainLo2 = ChainLo, ChainHi1 = ChainHi,
+ ChainHi2 = ChainHi;
+ if (ChainLo.getOpcode() == ISD::TokenFactor) {
+ ChainLo1 = ChainLo.getOperand(0);
+ ChainLo2 = ChainLo.getOperand(1);
+ }
+ if (ChainHi.getOpcode() == ISD::TokenFactor) {
+ ChainHi1 = ChainHi.getOperand(0);
+ ChainHi2 = ChainHi.getOperand(1);
+ }
+ SDValue Store1 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other,
+ ChainLo1,
+ SrcLo1, PtrLo, PtrHi,
+ DAG.getConstant (0 + StoreOffset, MVT::i8));
+
+ SDValue Store2 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other, ChainLo2,
+ SrcLo2, PtrLo, PtrHi,
+ DAG.getConstant (1 + StoreOffset, MVT::i8));
+
+ SDValue Store3 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other, ChainHi1,
+ SrcHi1, PtrLo, PtrHi,
+ DAG.getConstant (2 + StoreOffset, MVT::i8));
+
+ SDValue Store4 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other, ChainHi2,
+ SrcHi2, PtrLo, PtrHi,
+ DAG.getConstant (3 + StoreOffset, MVT::i8));
+
+ SDValue RetLo = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ getChain(Store1), getChain(Store2));
+ SDValue RetHi = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ getChain(Store3), getChain(Store4));
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, RetLo, RetHi);
+
+ }
+ else {
+ assert (0 && "value type not supported");
+ return SDValue();
+ }
+}
+
+SDValue PIC16TargetLowering::ExpandExternalSymbol(SDNode *N, SelectionDAG &DAG)
+{
+ ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(SDValue(N, 0));
+ // FIXME there isn't really debug info here
+ DebugLoc dl = ES->getDebugLoc();
+
+ SDValue TES = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i8);
+ SDValue Offset = DAG.getConstant(0, MVT::i8);
+ SDValue Lo = DAG.getNode(PIC16ISD::Lo, dl, MVT::i8, TES, Offset);
+ SDValue Hi = DAG.getNode(PIC16ISD::Hi, dl, MVT::i8, TES, Offset);
+
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i16, Lo, Hi);
+}
+
+// ExpandGlobalAddress -
+SDValue PIC16TargetLowering::ExpandGlobalAddress(SDNode *N, SelectionDAG &DAG) {
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(SDValue(N, 0));
+ // FIXME there isn't really debug info here
+ DebugLoc dl = G->getDebugLoc();
+
+ SDValue TGA = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i8,
+ G->getOffset());
+
+ SDValue Offset = DAG.getConstant(0, MVT::i8);
+ SDValue Lo = DAG.getNode(PIC16ISD::Lo, dl, MVT::i8, TGA, Offset);
+ SDValue Hi = DAG.getNode(PIC16ISD::Hi, dl, MVT::i8, TGA, Offset);
+
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i16, Lo, Hi);
+}
+
+bool PIC16TargetLowering::isDirectAddress(const SDValue &Op) {
+ assert (Op.getNode() != NULL && "Can't operate on NULL SDNode!!");
+
+ if (Op.getOpcode() == ISD::BUILD_PAIR) {
+ if (Op.getOperand(0).getOpcode() == PIC16ISD::Lo)
+ return true;
+ }
+ return false;
+}
+
+// Return true if DirectAddress is in ROM_SPACE
+bool PIC16TargetLowering::isRomAddress(const SDValue &Op) {
+
+  // A ROM address is a GlobalAddress in ROM_SPACE.
+  // If the Op is not a GlobalAddress, return false without checking
+  // anything further.
+ if (!isDirectAddress(Op))
+ return false;
+
+  // It's a GlobalAddress.
+  // Op is BUILD_PAIR((PIC16Lo TGA), (PIC16Hi TGA)).
+ SDValue TGA = Op.getOperand(0).getOperand(0);
+  GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(TGA);
+
+  // The operand of PIC16ISD::Lo may also be a frame index or external
+  // symbol, so guard against a failed dyn_cast before dereferencing.
+  if (GSDN && GSDN->getAddressSpace() == PIC16ISD::ROM_SPACE)
+ return true;
+
+ // Any other address space return it false
+ return false;
+}
+
+
+// GetExpandedParts - This function works along the same lines as
+// GetExpandedInteger in the type legalizer. It returns the expanded
+// parts of Op in Lo and Hi.
+
+void PIC16TargetLowering::GetExpandedParts(SDValue Op, SelectionDAG &DAG,
+ SDValue &Lo, SDValue &Hi) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+ MVT NewVT = getTypeToTransformTo(N->getValueType(0));
+
+ // Extract the lo component.
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NewVT, Op,
+ DAG.getConstant(0, MVT::i8));
+
+  // Extract the hi component.
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NewVT, Op,
+ DAG.getConstant(1, MVT::i8));
+}
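+
+// For illustration: for an i16 operand holding 0x1234, the two
+// EXTRACT_ELEMENT nodes above yield Lo == 0x34 (element 0) and
+// Hi == 0x12 (element 1), both of type i8.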
+
+// Legalize FrameIndex into ExternalSymbol and offset.
+void
+PIC16TargetLowering::LegalizeFrameIndex(SDValue Op, SelectionDAG &DAG,
+ SDValue &ES, int &Offset) {
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *Func = MF.getFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const std::string Name = Func->getName();
+
+ FrameIndexSDNode *FR = dyn_cast<FrameIndexSDNode>(Op);
+
+  // FrameIndices are not stack offsets, but requests for space on the
+  // stack, and the space requested may be more than one byte. Therefore,
+  // to calculate the stack offset that a FrameIndex aligns with, we need
+  // to traverse all the FrameIndices that come earlier in the list and
+  // add up their requested sizes.
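+  // For example, if frame objects 0, 1 and 2 request 2, 1 and 4 bytes,
+  // FrameIndex #2 aligns with offset 2 + 1 = 3 from the frame base.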
+ unsigned FIndex = FR->getIndex();
+ const char *tmpName;
+ if (FIndex < ReservedFrameCount) {
+ tmpName = createESName(PAN::getFrameLabel(Name));
+ ES = DAG.getTargetExternalSymbol(tmpName, MVT::i8);
+ Offset = 0;
+ for (unsigned i=0; i<FIndex ; ++i) {
+ Offset += MFI->getObjectSize(i);
+ }
+ } else {
+ // FrameIndex has been made for some temporary storage
+ tmpName = createESName(PAN::getTempdataLabel(Name));
+ ES = DAG.getTargetExternalSymbol(tmpName, MVT::i8);
+ Offset = GetTmpOffsetForFI(FIndex, MFI->getObjectSize(FIndex));
+ }
+
+ return;
+}
+
+// This function legalizes PIC16 addresses. If the pointer is
+// -- a direct address variable
+//    --> then a Banksel for that variable will be created.
+// -- a ROM variable
+//    --> then it will be treated as an indirect address.
+// -- an indirect address
+//    --> then the address will be loaded into FSR.
+// -- an ADD with a constant operand
+//    --> then the constant operand of the ADD will be returned as Offset
+//        and the non-constant operand will be treated as the pointer.
+// Returns the hi and lo parts of the address, and the offset (in the case
+// of ADD).
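+// For example, a pointer of the form (ADD %p, 3) is returned with
+// Offset == 3, and %p is then legalized by the remaining rules above.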
+
+void PIC16TargetLowering::LegalizeAddress(SDValue Ptr, SelectionDAG &DAG,
+ SDValue &Lo, SDValue &Hi,
+ unsigned &Offset, DebugLoc dl) {
+
+ // Offset, by default, should be 0
+ Offset = 0;
+
+ // If the pointer is ADD with constant,
+ // return the constant value as the offset
+ if (Ptr.getOpcode() == ISD::ADD) {
+ SDValue OperLeft = Ptr.getOperand(0);
+ SDValue OperRight = Ptr.getOperand(1);
+ if (OperLeft.getOpcode() == ISD::Constant) {
+ Offset = dyn_cast<ConstantSDNode>(OperLeft)->getZExtValue();
+ Ptr = OperRight;
+ } else if (OperRight.getOpcode() == ISD::Constant) {
+ Offset = dyn_cast<ConstantSDNode>(OperRight)->getZExtValue();
+ Ptr = OperLeft;
+ }
+ }
+
+ // If the pointer is Type i8 and an external symbol
+ // then treat it as direct address.
+ // One example for such case is storing and loading
+ // from function frame during a call
+ if (Ptr.getValueType() == MVT::i8) {
+ switch (Ptr.getOpcode()) {
+ case ISD::TargetExternalSymbol:
+ Lo = Ptr;
+ Hi = DAG.getConstant(1, MVT::i8);
+ return;
+ }
+ }
+
+ // Expansion of FrameIndex has Lo/Hi parts
+ if (isDirectAddress(Ptr)) {
+ SDValue TFI = Ptr.getOperand(0).getOperand(0);
+ if (TFI.getOpcode() == ISD::TargetFrameIndex) {
+ int FrameOffset;
+ LegalizeFrameIndex(TFI, DAG, Lo, FrameOffset);
+ Hi = DAG.getConstant(1, MVT::i8);
+ Offset += FrameOffset;
+ return;
+ }
+ }
+
+ if (isDirectAddress(Ptr) && !isRomAddress(Ptr)) {
+ // Direct addressing case for RAM variables. The Hi part is constant
+ // and the Lo part is the TGA itself.
+ Lo = Ptr.getOperand(0).getOperand(0);
+
+ // For direct addresses Hi is a constant. Value 1 for the constant
+ // signifies that banksel needs to generated for it. Value 0 for
+ // the constant signifies that banksel does not need to be generated
+ // for it. Mark it as 1 now and optimize later.
+ Hi = DAG.getConstant(1, MVT::i8);
+ return;
+ }
+
+ // Indirect addresses. Get the hi and lo parts of ptr.
+ GetExpandedParts(Ptr, DAG, Lo, Hi);
+
+ // Put the hi and lo parts into FSR.
+ Lo = DAG.getNode(PIC16ISD::MTLO, dl, MVT::i8, Lo);
+ Hi = DAG.getNode(PIC16ISD::MTHI, dl, MVT::i8, Hi);
+
+ return;
+}
+
+SDValue PIC16TargetLowering::ExpandLoad(SDNode *N, SelectionDAG &DAG) {
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(SDValue(N, 0));
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ DebugLoc dl = LD->getDebugLoc();
+
+ SDValue Load, Offset;
+ SDVTList Tys;
+ MVT VT, NewVT;
+ SDValue PtrLo, PtrHi;
+ unsigned LoadOffset;
+
+ // Legalize direct/indirect addresses. This will give the lo and hi parts
+ // of the address and the offset.
+ LegalizeAddress(Ptr, DAG, PtrLo, PtrHi, LoadOffset, dl);
+
+ // Load from the pointer (direct address or FSR)
+ VT = N->getValueType(0);
+ unsigned NumLoads = VT.getSizeInBits() / 8;
+ std::vector<SDValue> PICLoads;
+ unsigned iter;
+ MVT MemVT = LD->getMemoryVT();
+ if(ISD::isNON_EXTLoad(N)) {
+ for (iter=0; iter<NumLoads ; ++iter) {
+ // Add the pointer offset if any
+ Offset = DAG.getConstant(iter + LoadOffset, MVT::i8);
+ Tys = DAG.getVTList(MVT::i8, MVT::Other);
+ Load = DAG.getNode(PIC16ISD::PIC16Load, dl, Tys, Chain, PtrLo, PtrHi,
+ Offset);
+ PICLoads.push_back(Load);
+ }
+ } else {
+    // If it is an extended load then use PIC16Load for the memory bytes,
+    // and for all extended bytes perform an action based on the type of
+    // extension - i.e. SignExtendedLoad or ZeroExtendedLoad.
+
+
+    // For extended loads, MemVT (computed above) is the memory value
+    // type, i.e. without any extension.
+ unsigned MemBytes = MemVT.getSizeInBits() / 8;
+ unsigned ExtdBytes = VT.getSizeInBits() / 8;
+ Offset = DAG.getConstant(LoadOffset, MVT::i8);
+
+ Tys = DAG.getVTList(MVT::i8, MVT::Other);
+ // For MemBytes generate PIC16Load with proper offset
+ for (iter=0; iter<MemBytes; ++iter) {
+ // Add the pointer offset if any
+ Offset = DAG.getConstant(iter + LoadOffset, MVT::i8);
+ Load = DAG.getNode(PIC16ISD::PIC16Load, dl, Tys, Chain, PtrLo, PtrHi,
+ Offset);
+ PICLoads.push_back(Load);
+ }
+
+ // For SignExtendedLoad
+ if (ISD::isSEXTLoad(N)) {
+ // For all ExtdBytes use the Right Shifted(Arithmetic) Value of the
+ // highest MemByte
+ SDValue SRA = DAG.getNode(ISD::SRA, dl, MVT::i8, Load,
+ DAG.getConstant(7, MVT::i8));
+ for (iter=MemBytes; iter<ExtdBytes; ++iter) {
+ PICLoads.push_back(SRA);
+ }
+ } else if (ISD::isZEXTLoad(N)) {
+ // ZeroExtendedLoad -- For all ExtdBytes use constant 0
+ SDValue ConstZero = DAG.getConstant(0, MVT::i8);
+ for (iter=MemBytes; iter<ExtdBytes; ++iter) {
+ PICLoads.push_back(ConstZero);
+ }
+ }
+ }
+ SDValue BP;
+
+ if (VT == MVT::i8) {
+ // Operand of Load is illegal -- Load itself is legal
+ return PICLoads[0];
+ }
+ else if (VT == MVT::i16) {
+ BP = DAG.getNode(ISD::BUILD_PAIR, dl, VT, PICLoads[0], PICLoads[1]);
+ if (MemVT == MVT::i8)
+ Chain = getChain(PICLoads[0]);
+ else
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ getChain(PICLoads[0]), getChain(PICLoads[1]));
+ } else if (VT == MVT::i32) {
+ SDValue BPs[2];
+ BPs[0] = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i16,
+ PICLoads[0], PICLoads[1]);
+ BPs[1] = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i16,
+ PICLoads[2], PICLoads[3]);
+ BP = DAG.getNode(ISD::BUILD_PAIR, dl, VT, BPs[0], BPs[1]);
+ if (MemVT == MVT::i8)
+ Chain = getChain(PICLoads[0]);
+ else if (MemVT == MVT::i16)
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ getChain(PICLoads[0]), getChain(PICLoads[1]));
+ else {
+ SDValue Chains[2];
+ Chains[0] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ getChain(PICLoads[0]), getChain(PICLoads[1]));
+ Chains[1] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ getChain(PICLoads[2]), getChain(PICLoads[3]));
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Chains[0], Chains[1]);
+ }
+ }
+ Tys = DAG.getVTList(VT, MVT::Other);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, BP, Chain);
+}
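+
+// Worked example (sketch): a non-extending i16 load is expanded by the code
+// above into two PIC16Load nodes at offsets LoadOffset and LoadOffset + 1,
+// recombined with BUILD_PAIR; their chains merge through a TokenFactor.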
+
+SDValue PIC16TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
+ // We should have handled larger operands in type legalizer itself.
+ assert (Op.getValueType() == MVT::i8 && "illegal shift to lower");
+
+ SDNode *N = Op.getNode();
+ SDValue Value = N->getOperand(0);
+ SDValue Amt = N->getOperand(1);
+ PIC16ISD::PIC16Libcall CallCode;
+ switch (N->getOpcode()) {
+ case ISD::SRA:
+ CallCode = PIC16ISD::SRA_I8;
+ break;
+ case ISD::SHL:
+ CallCode = PIC16ISD::SLL_I8;
+ break;
+ case ISD::SRL:
+ CallCode = PIC16ISD::SRL_I8;
+ break;
+ default:
+ assert ( 0 && "This shift is not implemented yet.");
+ return SDValue();
+ }
+ SmallVector<SDValue, 2> Ops(2);
+ Ops[0] = Value;
+ Ops[1] = Amt;
+ SDValue Call = MakePIC16Libcall(CallCode, N->getValueType(0), &Ops[0], 2,
+ true, DAG, N->getDebugLoc());
+ return Call;
+}
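+
+// For example (sketch): an i8 (shl %val, %amt) is not matched to a native
+// shift here; it becomes a call to the SLL_I8 PIC16 libcall with %val and
+// %amt as the two arguments.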
+
+void
+PIC16TargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) {
+ SDValue Op = SDValue(N, 0);
+ SDValue Res;
+ unsigned i;
+ switch (Op.getOpcode()) {
+ case ISD::FORMAL_ARGUMENTS:
+ Res = LowerFORMAL_ARGUMENTS(Op, DAG); break;
+ case ISD::LOAD:
+ Res = ExpandLoad(Op.getNode(), DAG); break;
+ case ISD::CALL:
+ Res = LowerCALL(Op, DAG); break;
+ default: {
+ // All other operations are handled in LowerOperation.
+ Res = LowerOperation(Op, DAG);
+ if (Res.getNode())
+ Results.push_back(Res);
+
+ return;
+ }
+ }
+
+ N = Res.getNode();
+ unsigned NumValues = N->getNumValues();
+ for (i = 0; i < NumValues ; i++) {
+ Results.push_back(SDValue(N, i));
+ }
+}
+
+SDValue PIC16TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ case ISD::FORMAL_ARGUMENTS:
+ return LowerFORMAL_ARGUMENTS(Op, DAG);
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ return LowerADD(Op, DAG);
+ case ISD::SUB:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ return LowerSUB(Op, DAG);
+ case ISD::LOAD:
+ return ExpandLoad(Op.getNode(), DAG);
+ case ISD::STORE:
+ return ExpandStore(Op.getNode(), DAG);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ return LowerShift(Op, DAG);
+ case ISD::OR:
+ case ISD::AND:
+ case ISD::XOR:
+ return LowerBinOp(Op, DAG);
+ case ISD::CALL:
+ return LowerCALL(Op, DAG);
+ case ISD::RET:
+ return LowerRET(Op, DAG);
+ case ISD::BR_CC:
+ return LowerBR_CC(Op, DAG);
+ case ISD::SELECT_CC:
+ return LowerSELECT_CC(Op, DAG);
+ }
+ return SDValue();
+}
+
+SDValue PIC16TargetLowering::ConvertToMemOperand(SDValue Op,
+ SelectionDAG &DAG,
+ DebugLoc dl) {
+ assert (Op.getValueType() == MVT::i8
+ && "illegal value type to store on stack.");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *Func = MF.getFunction();
+ const std::string FuncName = Func->getName();
+
+
+ // Put the value on stack.
+ // Get a stack slot index and convert to es.
+ int FI = MF.getFrameInfo()->CreateStackObject(1, 1);
+ const char *tmpName = createESName(PAN::getTempdataLabel(FuncName));
+ SDValue ES = DAG.getTargetExternalSymbol(tmpName, MVT::i8);
+
+ // Store the value to ES.
+ SDValue Store = DAG.getNode (PIC16ISD::PIC16Store, dl, MVT::Other,
+ DAG.getEntryNode(),
+ Op, ES,
+ DAG.getConstant (1, MVT::i8), // Banksel.
+ DAG.getConstant (GetTmpOffsetForFI(FI, 1),
+ MVT::i8));
+
+ // Load the value from ES.
+ SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other);
+ SDValue Load = DAG.getNode(PIC16ISD::PIC16Load, dl, Tys, Store,
+ ES, DAG.getConstant (1, MVT::i8),
+ DAG.getConstant (GetTmpOffsetForFI(FI, 1),
+ MVT::i8));
+
+ return Load.getValue(0);
+}
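+
+// Note: the above is a store/load round-trip through the function's temp
+// data section (the PAN::getTempdataLabel symbol), after which the value
+// can serve as the memory operand of insns like addwf.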
+
+SDValue PIC16TargetLowering::
+LowerIndirectCallArguments(SDValue Op, SDValue Chain, SDValue InFlag,
+ SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+ SelectionDAG &DAG) {
+ CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
+ unsigned NumOps = TheCall->getNumArgs();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ // If call has no arguments then do nothing and return.
+ if (NumOps == 0)
+ return Chain;
+
+ std::vector<SDValue> Ops;
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Arg, StoreRet;
+
+ // For PIC16 ABI the arguments come after the return value.
+ unsigned RetVals = TheCall->getNumRetVals();
+ for (unsigned i = 0, ArgOffset = RetVals; i < NumOps; i++) {
+ // Get the arguments
+ Arg = TheCall->getArg(i);
+
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(Arg);
+ Ops.push_back(DataAddr_Lo);
+ Ops.push_back(DataAddr_Hi);
+ Ops.push_back(DAG.getConstant(ArgOffset, MVT::i8));
+ Ops.push_back(InFlag);
+
+ StoreRet = DAG.getNode (PIC16ISD::PIC16StWF, dl, Tys, &Ops[0], Ops.size());
+
+ Chain = getChain(StoreRet);
+ InFlag = getOutFlag(StoreRet);
+ ArgOffset++;
+ }
+ return Chain;
+}
+
+SDValue PIC16TargetLowering::
+LowerDirectCallArguments(SDValue Op, SDValue Chain, SDValue ArgLabel,
+ SDValue InFlag, SelectionDAG &DAG) {
+ CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
+ unsigned NumOps = TheCall->getNumArgs();
+ DebugLoc dl = TheCall->getDebugLoc();
+ std::string Name;
+ SDValue Arg, StoreAt;
+ MVT ArgVT;
+ unsigned Size=0;
+ unsigned ArgCount=0;
+
+ // If call has no arguments then do nothing and return.
+ if (NumOps == 0)
+ return Chain;
+
+ // FIXME: This portion of code currently assumes only
+ // primitive types being passed as arguments.
+
+ // Legalize the address before use
+ SDValue PtrLo, PtrHi;
+ unsigned AddressOffset;
+ int StoreOffset = 0;
+ LegalizeAddress(ArgLabel, DAG, PtrLo, PtrHi, AddressOffset, dl);
+ SDValue StoreRet;
+
+ std::vector<SDValue> Ops;
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ for (unsigned i=ArgCount, Offset = 0; i<NumOps; i++) {
+ // Get the argument
+ Arg = TheCall->getArg(i);
+ StoreOffset = (Offset + AddressOffset);
+
+ // Store the argument on frame
+
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(Arg);
+ Ops.push_back(PtrLo);
+ Ops.push_back(PtrHi);
+ Ops.push_back(DAG.getConstant(StoreOffset, MVT::i8));
+ Ops.push_back(InFlag);
+
+ StoreRet = DAG.getNode (PIC16ISD::PIC16StWF, dl, Tys, &Ops[0], Ops.size());
+
+ Chain = getChain(StoreRet);
+ InFlag = getOutFlag(StoreRet);
+
+ // Update the frame offset to be used for next argument
+ ArgVT = Arg.getValueType();
+ Size = ArgVT.getSizeInBits();
+ Size = Size/8; // Calculate size in bytes
+ Offset += Size; // Increase the frame offset
+ }
+ return Chain;
+}
+
+SDValue PIC16TargetLowering::
+LowerIndirectCallReturn (SDValue Op, SDValue Chain, SDValue InFlag,
+ SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+ SelectionDAG &DAG) {
+ CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
+ DebugLoc dl = TheCall->getDebugLoc();
+ unsigned RetVals = TheCall->getNumRetVals();
+
+ // If call does not have anything to return
+ // then do nothing and go back.
+ if (RetVals == 0)
+ return Chain;
+
+ // Call has something to return
+ std::vector<SDValue> ResultVals;
+ SDValue LoadRet;
+
+ SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other, MVT::Flag);
+ for(unsigned i=0;i<RetVals;i++) {
+ LoadRet = DAG.getNode(PIC16ISD::PIC16LdWF, dl, Tys, Chain, DataAddr_Lo,
+ DataAddr_Hi, DAG.getConstant(i, MVT::i8),
+ InFlag);
+ InFlag = getOutFlag(LoadRet);
+ Chain = getChain(LoadRet);
+ ResultVals.push_back(LoadRet);
+ }
+ ResultVals.push_back(Chain);
+ SDValue Res = DAG.getMergeValues(&ResultVals[0], ResultVals.size(), dl);
+ return Res;
+}
+
+SDValue PIC16TargetLowering::
+LowerDirectCallReturn(SDValue Op, SDValue Chain, SDValue RetLabel,
+ SDValue InFlag, SelectionDAG &DAG) {
+ CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
+ DebugLoc dl = TheCall->getDebugLoc();
+ // Currently handling primitive types only. They will come in
+ // i8 parts
+ unsigned RetVals = TheCall->getNumRetVals();
+
+ std::vector<SDValue> ResultVals;
+
+ // Return immediately if the return type is void
+ if (RetVals == 0)
+ return Chain;
+
+ // Call has something to return
+
+ // Legalize the address before use
+ SDValue LdLo, LdHi;
+ unsigned LdOffset;
+ LegalizeAddress(RetLabel, DAG, LdLo, LdHi, LdOffset, dl);
+
+ SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other, MVT::Flag);
+ SDValue LoadRet;
+
+ for(unsigned i=0, Offset=0;i<RetVals;i++) {
+
+ LoadRet = DAG.getNode(PIC16ISD::PIC16LdWF, dl, Tys, Chain, LdLo, LdHi,
+ DAG.getConstant(LdOffset + Offset, MVT::i8),
+ InFlag);
+
+ InFlag = getOutFlag(LoadRet);
+
+ Chain = getChain(LoadRet);
+ Offset++;
+ ResultVals.push_back(LoadRet);
+ }
+
+ // To return use MERGE_VALUES
+ ResultVals.push_back(Chain);
+ SDValue Res = DAG.getMergeValues(&ResultVals[0], ResultVals.size(), dl);
+ return Res;
+}
+
+SDValue PIC16TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Op.getNumOperands() == 1) // return void
+ return Op;
+
+ // A return should have an odd number of operands.
+ if ((Op.getNumOperands() % 2) == 0 ) {
+ assert(0 && "Do not know how to return this many arguments!");
+ abort();
+ }
+
+ // Number of values to return
+ unsigned NumRet = (Op.getNumOperands() / 2);
+
+ // A function always returns its value on the stack, with the offset
+ // starting from 0.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+ std::string FuncName = F->getName();
+
+ const char *tmpName = createESName(PAN::getFrameLabel(FuncName));
+ SDVTList VTs = DAG.getVTList (MVT::i8, MVT::Other);
+ SDValue ES = DAG.getTargetExternalSymbol(tmpName, MVT::i8);
+ SDValue BS = DAG.getConstant(1, MVT::i8);
+ SDValue RetVal;
+ for(unsigned i=0;i<NumRet; ++i) {
+ RetVal = Op.getNode()->getOperand(2*i + 1);
+ Chain = DAG.getNode (PIC16ISD::PIC16Store, dl, MVT::Other, Chain, RetVal,
+ ES, BS,
+ DAG.getConstant (i, MVT::i8));
+
+ }
+ return DAG.getNode(ISD::RET, dl, MVT::Other, Chain);
+}
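+
+// Example (sketch): for a function returning i16, NumRet is 2 and the two
+// i8 parts are stored at offsets 0 and 1 of the PAN::getFrameLabel symbol
+// before the bare ISD::RET is emitted.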
+
+// A CALL node may have some operands that are non-legal for PIC16. Generate
+// a new CALL node with all the operands legal.
+// Currently only the Callee operand of the CALL node is non-legal. This
+// function legalizes the Callee operand and uses all other operands as they
+// are to generate the new CALL node.
+
+SDValue PIC16TargetLowering::LegalizeCALL(SDValue Op, SelectionDAG &DAG) {
+ CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
+ SDValue Chain = TheCall->getChain();
+ SDValue Callee = TheCall->getCallee();
+ DebugLoc dl = TheCall->getDebugLoc();
+ unsigned i =0;
+
+ assert(Callee.getValueType() == MVT::i16 &&
+ "Don't know how to legalize this call node!!!");
+ assert(Callee.getOpcode() == ISD::BUILD_PAIR &&
+ "Don't know how to legalize this call node!!!");
+
+ if (isDirectAddress(Callee)) {
+ // Come here for direct calls
+ Callee = Callee.getOperand(0).getOperand(0);
+ } else {
+ // Come here for indirect calls
+ SDValue Lo, Hi;
+ // Indirect addresses. Get the hi and lo parts of ptr.
+ GetExpandedParts(Callee, DAG, Lo, Hi);
+ // Connect Lo and Hi parts of the callee with the PIC16Connect
+ Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, Lo, Hi);
+ }
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add the call arguments and their flags
+ unsigned NumArgs = TheCall->getNumArgs();
+ for(i=0;i<NumArgs;i++) {
+ Ops.push_back(TheCall->getArg(i));
+ Ops.push_back(TheCall->getArgFlagsVal(i));
+ }
+ std::vector<MVT> NodeTys;
+ unsigned NumRets = TheCall->getNumRetVals();
+ for(i=0;i<NumRets;i++)
+ NodeTys.push_back(TheCall->getRetValType(i));
+
+ // Return a Chain as well
+ NodeTys.push_back(MVT::Other);
+
+ SDVTList VTs = DAG.getVTList(&NodeTys[0], NodeTys.size());
+ // Generate new call with all the operands legal
+ return DAG.getCall(TheCall->getCallingConv(), dl,
+ TheCall->isVarArg(), TheCall->isTailCall(),
+ TheCall->isInreg(), VTs, &Ops[0], Ops.size());
+}
+
+void PIC16TargetLowering::
+GetDataAddress(DebugLoc dl, SDValue Callee, SDValue &Chain,
+ SDValue &DataAddr_Lo, SDValue &DataAddr_Hi,
+ SelectionDAG &DAG) {
+ assert (Callee.getOpcode() == PIC16ISD::PIC16Connect
+ && "Don't know what to do with such a callee!!");
+ SDValue ZeroOperand = DAG.getConstant(0, MVT::i8);
+ SDValue SeqStart = DAG.getCALLSEQ_START(Chain, ZeroOperand);
+ Chain = getChain(SeqStart);
+ SDValue OperFlag = getOutFlag(SeqStart); // To manage the data dependency
+
+ // Get the Lo and Hi part of code address
+ SDValue Lo = Callee.getOperand(0);
+ SDValue Hi = Callee.getOperand(1);
+
+ SDValue Data_Lo, Data_Hi;
+ SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other, MVT::Flag);
+ // Subtract 2 from Address to get the Lower part of DataAddress.
+ SDVTList VTList = DAG.getVTList(MVT::i8, MVT::Flag);
+ Data_Lo = DAG.getNode(ISD::SUBC, dl, VTList, Lo,
+ DAG.getConstant(2, MVT::i8));
+ SDValue Ops[3] = { Hi, DAG.getConstant(0, MVT::i8), Data_Lo.getValue(1)};
+ Data_Hi = DAG.getNode(ISD::SUBE, dl, VTList, Ops, 3);
+ SDValue PCLATH = DAG.getNode(PIC16ISD::MTPCLATH, dl, MVT::i8, Data_Hi);
+ Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, Data_Lo, PCLATH);
+ SDValue Call = DAG.getNode(PIC16ISD::CALLW, dl, Tys, Chain, Callee,
+ OperFlag);
+ Chain = getChain(Call);
+ OperFlag = getOutFlag(Call);
+ SDValue SeqEnd = DAG.getCALLSEQ_END(Chain, ZeroOperand, ZeroOperand,
+ OperFlag);
+ Chain = getChain(SeqEnd);
+ OperFlag = getOutFlag(SeqEnd);
+
+ // Low part of Data Address
+ DataAddr_Lo = DAG.getNode(PIC16ISD::MTLO, dl, MVT::i8, Call, OperFlag);
+
+ // Make the second call.
+ SeqStart = DAG.getCALLSEQ_START(Chain, ZeroOperand);
+ Chain = getChain(SeqStart);
+ OperFlag = getOutFlag(SeqStart); // To manage the data dependency
+
+ // Subtract 1 from Address to get high part of data address.
+ Data_Lo = DAG.getNode(ISD::SUBC, dl, VTList, Lo,
+ DAG.getConstant(1, MVT::i8));
+ SDValue HiOps[3] = { Hi, DAG.getConstant(0, MVT::i8), Data_Lo.getValue(1)};
+ Data_Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3);
+ PCLATH = DAG.getNode(PIC16ISD::MTPCLATH, dl, MVT::i8, Data_Hi);
+
+ // Use new Lo to make another CALLW
+ Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, Data_Lo, PCLATH);
+ Call = DAG.getNode(PIC16ISD::CALLW, dl, Tys, Chain, Callee, OperFlag);
+ Chain = getChain(Call);
+ OperFlag = getOutFlag(Call);
+ SeqEnd = DAG.getCALLSEQ_END(Chain, ZeroOperand, ZeroOperand,
+ OperFlag);
+ Chain = getChain(SeqEnd);
+ OperFlag = getOutFlag(SeqEnd);
+ // Hi part of Data Address
+ DataAddr_Hi = DAG.getNode(PIC16ISD::MTHI, dl, MVT::i8, Call, OperFlag);
+}
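+
+// Summary of the above (interpretation): the two CALLW sequences call at
+// Callee - 2 and Callee - 1, each returning one byte in W; those bytes
+// become the Lo and Hi parts of the callee's data frame address.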
+
+
+SDValue PIC16TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
+ CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
+ SDValue Chain = TheCall->getChain();
+ SDValue Callee = TheCall->getCallee();
+ DebugLoc dl = TheCall->getDebugLoc();
+ if (Callee.getValueType() == MVT::i16 &&
+ Callee.getOpcode() == ISD::BUILD_PAIR) {
+ // Control should come here only from TypeLegalizer for lowering
+
+ // Legalize the non-legal arguments of call and return the
+ // new call with legal arguments.
+ return LegalizeCALL(Op, DAG);
+ }
+ // Control should come here from Legalize DAG.
+ // Here all the operands of CALL node should be legal.
+
+ // If this is an indirect call then to pass the arguments
+ // and read the return value back, we need the data address
+ // of the function being called.
+ // To get the data address two more calls need to be made.
+
+ // The flag to track if this is a direct or indirect call.
+ bool IsDirectCall = true;
+ unsigned RetVals = TheCall->getNumRetVals();
+ unsigned NumArgs = TheCall->getNumArgs();
+
+ SDValue DataAddr_Lo, DataAddr_Hi;
+ if (Callee.getOpcode() == PIC16ISD::PIC16Connect) {
+ IsDirectCall = false; // This is indirect call
+ // Read DataAddress only if we have to pass arguments or
+ // read return value.
+ if ((RetVals > 0) || (NumArgs > 0))
+ GetDataAddress(dl, Callee, Chain, DataAddr_Lo, DataAddr_Hi, DAG);
+ }
+
+ SDValue ZeroOperand = DAG.getConstant(0, MVT::i8);
+
+ // Start the call sequence.
+ // Carrying the constant 0 along the CALLSEQ_START
+ // because there is nothing else to carry.
+ SDValue SeqStart = DAG.getCALLSEQ_START(Chain, ZeroOperand);
+ Chain = getChain(SeqStart);
+ SDValue OperFlag = getOutFlag(SeqStart); // To manage the data dependency
+ std::string Name;
+
+ // For any direct call - callee will be GlobalAddressNode or
+ // ExternalSymbol
+ SDValue ArgLabel, RetLabel;
+ if (IsDirectCall) {
+ // Considering the GlobalAddressNode case here.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ GlobalValue *GV = G->getGlobal();
+ Callee = DAG.getTargetGlobalAddress(GV, MVT::i8);
+ Name = G->getGlobal()->getName();
+ } else {// Considering the ExternalSymbol case here
+ ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Callee);
+ Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i8);
+ Name = ES->getSymbol();
+ }
+
+ // Label for argument passing
+ const char *argFrame = createESName(PAN::getArgsLabel(Name));
+ ArgLabel = DAG.getTargetExternalSymbol(argFrame, MVT::i8);
+
+ // Label for reading return value
+ const char *retName = createESName(PAN::getRetvalLabel(Name));
+ RetLabel = DAG.getTargetExternalSymbol(retName, MVT::i8);
+ } else {
+ // if indirect call
+ SDValue CodeAddr_Lo = Callee.getOperand(0);
+ SDValue CodeAddr_Hi = Callee.getOperand(1);
+
+ /*CodeAddr_Lo = DAG.getNode(ISD::ADD, dl, MVT::i8, CodeAddr_Lo,
+ DAG.getConstant(2, MVT::i8));*/
+
+ // move Hi part in PCLATH
+ CodeAddr_Hi = DAG.getNode(PIC16ISD::MTPCLATH, dl, MVT::i8, CodeAddr_Hi);
+ Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, CodeAddr_Lo,
+ CodeAddr_Hi);
+ }
+
+ // Pass the argument to function before making the call.
+ SDValue CallArgs;
+ if (IsDirectCall) {
+ CallArgs = LowerDirectCallArguments(Op, Chain, ArgLabel, OperFlag, DAG);
+ Chain = getChain(CallArgs);
+ OperFlag = getOutFlag(CallArgs);
+ } else {
+ CallArgs = LowerIndirectCallArguments(Op, Chain, OperFlag, DataAddr_Lo,
+ DataAddr_Hi, DAG);
+ Chain = getChain(CallArgs);
+ OperFlag = getOutFlag(CallArgs);
+ }
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue PICCall = DAG.getNode(PIC16ISD::CALL, dl, Tys, Chain, Callee,
+ OperFlag);
+ Chain = getChain(PICCall);
+ OperFlag = getOutFlag(PICCall);
+
+
+ // Carrying the constant 0 along the CALLSEQ_END
+ // because there is nothing else to carry.
+ SDValue SeqEnd = DAG.getCALLSEQ_END(Chain, ZeroOperand, ZeroOperand,
+ OperFlag);
+ Chain = getChain(SeqEnd);
+ OperFlag = getOutFlag(SeqEnd);
+
+ // Lower the return value reading after the call.
+ if (IsDirectCall)
+ return LowerDirectCallReturn(Op, Chain, RetLabel, OperFlag, DAG);
+ else
+ return LowerIndirectCallReturn(Op, Chain, OperFlag, DataAddr_Lo,
+ DataAddr_Hi, DAG);
+}
+
+bool PIC16TargetLowering::isDirectLoad(const SDValue Op) {
+ if (Op.getOpcode() == PIC16ISD::PIC16Load)
+ if (Op.getOperand(1).getOpcode() == ISD::TargetGlobalAddress
+ || Op.getOperand(1).getOpcode() == ISD::TargetExternalSymbol)
+ return true;
+ return false;
+}
+
+// NeedToConvertToMemOp - Returns true if one of the operands of the
+// operation 'Op' needs to be put into memory. Also returns the
+// operand number of the operand to be converted in 'MemOp'. Remember, PIC16
+// has no instruction that can operate on two registers. Most insns take
+// one register and one memory operand (addwf) or a constant (addlw).
+bool PIC16TargetLowering::NeedToConvertToMemOp(SDValue Op, unsigned &MemOp) {
+ // If one of the operands is a constant, return false.
+ if (Op.getOperand(0).getOpcode() == ISD::Constant ||
+ Op.getOperand(1).getOpcode() == ISD::Constant)
+ return false;
+
+ // Return false if one of the operands is already a direct
+ // load and that operand has only one use.
+ if (isDirectLoad(Op.getOperand(0))) {
+ if (Op.getOperand(0).hasOneUse())
+ return false;
+ else
+ MemOp = 0;
+ }
+ if (isDirectLoad(Op.getOperand(1))) {
+ if (Op.getOperand(1).hasOneUse())
+ return false;
+ else
+ MemOp = 1;
+ }
+ return true;
+}
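+
+// Example (hypothetical): for (add %x, %y) with both operands in registers,
+// this returns true, and the callers spill operand MemOp through
+// ConvertToMemOperand so the insn takes one register and one memory operand.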
+
+// LowerBinOp - Lower a commutative binary operation that does not
+// affect the carry status flag.
+SDValue PIC16TargetLowering::LowerBinOp(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+
+ // We should have handled larger operands in type legalizer itself.
+ assert (Op.getValueType() == MVT::i8 && "illegal Op to lower");
+
+ unsigned MemOp = 1;
+ if (NeedToConvertToMemOp(Op, MemOp)) {
+ // Put one value on stack.
+ SDValue NewVal = ConvertToMemOperand (Op.getOperand(MemOp), DAG, dl);
+
+ return DAG.getNode(Op.getOpcode(), dl, MVT::i8, Op.getOperand(MemOp ^ 1),
+ NewVal);
+ }
+ else {
+ return Op;
+ }
+}
+
+// LowerADD - Lower all types of ADD operations including the ones
+// that affect carry.
+SDValue PIC16TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) {
+ // We should have handled larger operands in type legalizer itself.
+ assert (Op.getValueType() == MVT::i8 && "illegal add to lower");
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned MemOp = 1;
+ if (NeedToConvertToMemOp(Op, MemOp)) {
+ // Put one value on stack.
+ SDValue NewVal = ConvertToMemOperand (Op.getOperand(MemOp), DAG, dl);
+
+ // ADDC and ADDE produce two results.
+ SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Flag);
+
+ // ADDE has three operands, the last one is the carry bit.
+ if (Op.getOpcode() == ISD::ADDE)
+ return DAG.getNode(Op.getOpcode(), dl, Tys, Op.getOperand(MemOp ^ 1),
+ NewVal, Op.getOperand(2));
+ // ADDC has two operands.
+ else if (Op.getOpcode() == ISD::ADDC)
+ return DAG.getNode(Op.getOpcode(), dl, Tys, Op.getOperand(MemOp ^ 1),
+ NewVal);
+ // ADD it is. It produces only one result.
+ else
+ return DAG.getNode(Op.getOpcode(), dl, MVT::i8, Op.getOperand(MemOp ^ 1),
+ NewVal);
+ }
+ else
+ return Op;
+}
+
+SDValue PIC16TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // We should have handled larger operands in type legalizer itself.
+ assert (Op.getValueType() == MVT::i8 && "illegal sub to lower");
+
+ // Nothing to do if the first operand is already a direct load and it has
+ // only one use.
+ if (isDirectLoad(Op.getOperand(0)) && Op.getOperand(0).hasOneUse())
+ return Op;
+
+ // Put first operand on stack.
+ SDValue NewVal = ConvertToMemOperand (Op.getOperand(0), DAG, dl);
+
+ SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Flag);
+ if (Op.getOpcode() == ISD::SUBE)
+ return DAG.getNode(Op.getOpcode(), dl, Tys, NewVal, Op.getOperand(1),
+ Op.getOperand(2));
+ else
+ return DAG.getNode(Op.getOpcode(), dl, Tys, NewVal, Op.getOperand(1));
+}
+
+void PIC16TargetLowering::InitReservedFrameCount(const Function *F) {
+ unsigned NumArgs = F->arg_size();
+
+ bool isVoidFunc = (F->getReturnType()->getTypeID() == Type::VoidTyID);
+
+ if (isVoidFunc)
+ ReservedFrameCount = NumArgs;
+ else
+ ReservedFrameCount = NumArgs + 1;
+}
+
+// LowerFORMAL_ARGUMENTS - Argument values are loaded from
+// <fname>.args + offset. All arguments are already broken into legalized
+// types, so the offset just runs from 0 to NumArgVals - 1.
+
+SDValue PIC16TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op,
+ SelectionDAG &DAG) {
+ SmallVector<SDValue, 8> ArgValues;
+ unsigned NumArgVals = Op.getNode()->getNumValues() - 1;
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Chain = Op.getOperand(0); // Formal arguments' chain
+
+
+ // Get the callee's name to create the <fname>.args label to pass args.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+ std::string FuncName = F->getName();
+
+ // Reset the map of FI and TmpOffset
+ ResetTmpOffsetMap();
+ // Initialize the ReservedFrameCount.
+ InitReservedFrameCount(F);
+
+ // Create the <fname>.args external symbol.
+ const char *tmpName = createESName(PAN::getArgsLabel(FuncName));
+ SDValue ES = DAG.getTargetExternalSymbol(tmpName, MVT::i8);
+
+ // Load arg values from the label + offset.
+ SDVTList VTs = DAG.getVTList (MVT::i8, MVT::Other);
+ SDValue BS = DAG.getConstant(1, MVT::i8);
+ for (unsigned i = 0; i < NumArgVals ; ++i) {
+ SDValue Offset = DAG.getConstant(i, MVT::i8);
+ SDValue PICLoad = DAG.getNode(PIC16ISD::PIC16LdArg, dl, VTs, Chain, ES, BS,
+ Offset);
+ Chain = getChain(PICLoad);
+ ArgValues.push_back(PICLoad);
+ }
+
+ // Return a MERGE_VALUES node.
+ ArgValues.push_back(Op.getOperand(0));
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+}
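+
+// Example (sketch, assuming 16-bit int): for void f(char a, int b) there are
+// three legalized i8 argument values, loaded from <fname>.args + 0, 1 and 2
+// via PIC16LdArg with banksel 1.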
+
+// Perform DAGCombine of PIC16Load.
+// FIXME - Need a more elaborate comment here.
+SDValue PIC16TargetLowering::
+PerformPIC16LoadCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Chain = N->getOperand(0);
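+ // If the value produced by this load (result 0) has no uses, forward the
+ // input chain to all users of the load's chain (result 1) so the dead
+ // load can be removed.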
+ if (N->hasNUsesOfValue(0, 0)) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), Chain);
+ }
+ return SDValue();
+}
+
+// For all functions with arguments, some STORE nodes are generated that
+// store the arguments on the frameindex. However in PIC16 the arguments
+// are passed on the stack only. Therefore these STORE nodes are redundant.
+// These STORE nodes will be removed in PerformStoreCombine.
+//
+// Currently this function does nothing and will be updated to remove the
+// unwanted store operations.
+SDValue PIC16TargetLowering::
+PerformStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+ return SDValue(N, 0);
+ /*
+ // Storing an undef value is of no use, so remove it
+ if (isStoringUndef(N, Chain, DAG)) {
+ return Chain; // remove the store and return the chain
+ }
+ //else everything is ok.
+ return SDValue(N, 0);
+ */
+}
+
+SDValue PIC16TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ switch (N->getOpcode()) {
+ case ISD::STORE:
+ return PerformStoreCombine(N, DCI);
+ case PIC16ISD::PIC16Load:
+ return PerformPIC16LoadCombine(N, DCI);
+ }
+ return SDValue();
+}
+
+static PIC16CC::CondCodes IntCCToPIC16CC(ISD::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown condition code!");
+ case ISD::SETNE: return PIC16CC::NE;
+ case ISD::SETEQ: return PIC16CC::EQ;
+ case ISD::SETGT: return PIC16CC::GT;
+ case ISD::SETGE: return PIC16CC::GE;
+ case ISD::SETLT: return PIC16CC::LT;
+ case ISD::SETLE: return PIC16CC::LE;
+ case ISD::SETULT: return PIC16CC::ULT;
+ case ISD::SETULE: return PIC16CC::ULE;
+ case ISD::SETUGE: return PIC16CC::UGE;
+ case ISD::SETUGT: return PIC16CC::UGT;
+ }
+}
+
+// Look at LHS/RHS/CC and see if they are a lowered setcc instruction. If so
+// set LHS/RHS to the LHS/RHS of the setcc and SPCC to the condition.
+static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
+ ISD::CondCode CC, unsigned &SPCC) {
+ if (isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+ CC == ISD::SETNE &&
+ (LHS.getOpcode() == PIC16ISD::SELECT_ICC &&
+ LHS.getOperand(3).getOpcode() == PIC16ISD::SUBCC) &&
+ isa<ConstantSDNode>(LHS.getOperand(0)) &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ cast<ConstantSDNode>(LHS.getOperand(0))->getZExtValue() == 1 &&
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 0) {
+ SDValue CMPCC = LHS.getOperand(3);
+ SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+ LHS = CMPCC.getOperand(0);
+ RHS = CMPCC.getOperand(1);
+ }
+}
+
+// Returns appropriate CMP insn and corresponding condition code in PIC16CC
+SDValue PIC16TargetLowering::getPIC16Cmp(SDValue LHS, SDValue RHS,
+ unsigned CC, SDValue &PIC16CC,
+ SelectionDAG &DAG, DebugLoc dl) {
+ PIC16CC::CondCodes CondCode = (PIC16CC::CondCodes) CC;
+
+ // PIC16 sub is literal - W. So swap the operands and condition if needed.
+ // i.e. a < 12 can be rewritten as 12 > a.
+ if (RHS.getOpcode() == ISD::Constant) {
+
+ SDValue Tmp = LHS;
+ LHS = RHS;
+ RHS = Tmp;
+
+ switch (CondCode) {
+ default: break;
+ case PIC16CC::LT:
+ CondCode = PIC16CC::GT;
+ break;
+ case PIC16CC::GT:
+ CondCode = PIC16CC::LT;
+ break;
+ case PIC16CC::ULT:
+ CondCode = PIC16CC::UGT;
+ break;
+ case PIC16CC::UGT:
+ CondCode = PIC16CC::ULT;
+ break;
+ case PIC16CC::GE:
+ CondCode = PIC16CC::LE;
+ break;
+ case PIC16CC::LE:
+ CondCode = PIC16CC::GE;
+ break;
+ case PIC16CC::ULE:
+ CondCode = PIC16CC::UGE;
+ break;
+ case PIC16CC::UGE:
+ CondCode = PIC16CC::ULE;
+ break;
+ }
+ }
+
+ PIC16CC = DAG.getConstant(CondCode, MVT::i8);
+
+ // For signed comparisons, flip the sign bit of both operands so that
+ // an unsigned comparison yields the signed result.
+ SDValue Mask = DAG.getConstant(128, MVT::i8);
+ if (isSignedComparison(CondCode)) {
+ LHS = DAG.getNode (ISD::XOR, dl, MVT::i8, LHS, Mask);
+ RHS = DAG.getNode (ISD::XOR, dl, MVT::i8, RHS, Mask);
+ }
+
+ SDVTList VTs = DAG.getVTList (MVT::i8, MVT::Flag);
+ // We can use a subtract operation to set the condition codes. But
+ // we need to put one operand in memory if required.
+ // Nothing to do if the first operand is already a valid type (direct load
+ // for subwf and literal for sublw) and it is used by this operation only.
+ if ((LHS.getOpcode() == ISD::Constant || isDirectLoad(LHS))
+ && LHS.hasOneUse())
+ return DAG.getNode(PIC16ISD::SUBCC, dl, VTs, LHS, RHS);
+
+ // else convert the first operand to mem.
+ LHS = ConvertToMemOperand (LHS, DAG, dl);
+ return DAG.getNode(PIC16ISD::SUBCC, dl, VTs, LHS, RHS);
+}
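+
+// The sign-bit flip in getPIC16Cmp turns a signed compare into an unsigned
+// one: e.g. -1 (0xFF) ^ 0x80 = 0x7F and 1 (0x01) ^ 0x80 = 0x81, and
+// 0x7F < 0x81 unsigned, matching -1 < 1 signed.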
+
+
+SDValue PIC16TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+ unsigned ORIGCC = ~0;
+ DebugLoc dl = Op.getDebugLoc();
+
+ // If this is a select_cc of a "setcc", and if the setcc got lowered into
+ // a CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+ // i.e.
+ // A setcc: lhs, rhs, cc is expanded by llvm to
+ // select_cc: result of setcc, 0, 1, 0, setne
+ // We can think of it as:
+ // select_cc: lhs, rhs, 1, 0, cc
+ LookThroughSetCC(LHS, RHS, CC, ORIGCC);
+ if (ORIGCC == ~0U) ORIGCC = IntCCToPIC16CC (CC);
+
+ SDValue PIC16CC;
+ SDValue Cmp = getPIC16Cmp(LHS, RHS, ORIGCC, PIC16CC, DAG, dl);
+
+ return DAG.getNode (PIC16ISD::SELECT_ICC, dl, TrueVal.getValueType(), TrueVal,
+ FalseVal, PIC16CC, Cmp.getValue(1));
+}
+
+MachineBasicBlock *
+PIC16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+ unsigned CC = (PIC16CC::CondCodes)MI->getOperand(3).getImm();
+ DebugLoc dl = MI->getDebugLoc();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // [f]bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ BuildMI(BB, dl, TII.get(PIC16::pic16brcond)).addMBB(sinkMBB).addImm(CC);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ sinkMBB->transferSuccessors(BB);
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, dl, TII.get(PIC16::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+}
+
+
+SDValue PIC16TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2); // LHS of the condition.
+ SDValue RHS = Op.getOperand(3); // RHS of the condition.
+ SDValue Dest = Op.getOperand(4); // BB to jump to
+ unsigned ORIGCC = ~0;
+ DebugLoc dl = Op.getDebugLoc();
+
+ // If this is a br_cc of a "setcc", and if the setcc got lowered into
+ // a CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+ LookThroughSetCC(LHS, RHS, CC, ORIGCC);
+ if (ORIGCC == ~0U) ORIGCC = IntCCToPIC16CC (CC);
+
+ // Get the Compare insn and condition code.
+ SDValue PIC16CC;
+ SDValue Cmp = getPIC16Cmp(LHS, RHS, ORIGCC, PIC16CC, DAG, dl);
+
+ return DAG.getNode(PIC16ISD::BRCOND, dl, MVT::Other, Chain, Dest, PIC16CC,
+ Cmp.getValue(1));
+}
+
diff --git a/lib/Target/PIC16/PIC16ISelLowering.h b/lib/Target/PIC16/PIC16ISelLowering.h
new file mode 100644
index 0000000..ca9650d
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ISelLowering.h
@@ -0,0 +1,227 @@
+//===-- PIC16ISelLowering.h - PIC16 DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PIC16 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16ISELLOWERING_H
+#define PIC16ISELLOWERING_H
+
+#include "PIC16.h"
+#include "PIC16Subtarget.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include <map>
+
+namespace llvm {
+ namespace PIC16ISD {
+ enum NodeType {
+ // Start the numbering from where ISD NodeType finishes.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ Lo, // Low 8-bits of GlobalAddress.
+ Hi, // High 8-bits of GlobalAddress.
+ PIC16Load,
+ PIC16LdArg, // This is a replica of PIC16Load, but used to load function
+ // arguments; it facilitates some store-removal
+ // optimizations.
+
+ PIC16LdWF,
+ PIC16Store,
+ PIC16StWF,
+ Banksel,
+ MTLO, // Move to low part of FSR
+ MTHI, // Move to high part of FSR
+ MTPCLATH, // Move to PCLATH
+ PIC16Connect, // General connector for PIC16 nodes
+ BCF,
+ LSLF, // PIC16 Logical shift left
+ LRLF, // PIC16 Logical shift right
+ RLF, // Rotate left through carry
+ RRF, // Rotate right through carry
+ CALL, // PIC16 Call instruction
+ CALLW, // PIC16 CALLW instruction
+ SUBCC, // Compare for equality or inequality.
+ SELECT_ICC, // Pseudo to be caught in the scheduler and expanded to brcond.
+ BRCOND, // Conditional branch.
+ Dummy
+ };
+
+ // Keep track of different address spaces.
+ enum AddressSpace {
+ RAM_SPACE = 0, // RAM address space
+ ROM_SPACE = 1 // ROM address space number is 1
+ };
+ enum PIC16Libcall {
+ MUL_I8 = RTLIB::UNKNOWN_LIBCALL + 1,
+ SRA_I8,
+ SLL_I8,
+ SRL_I8,
+ PIC16UnknownCall
+ };
+ }
+
+
+ //===--------------------------------------------------------------------===//
+ // TargetLowering Implementation
+ //===--------------------------------------------------------------------===//
+ class PIC16TargetLowering : public TargetLowering {
+ public:
+ explicit PIC16TargetLowering(PIC16TargetMachine &TM);
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ virtual MVT getSetCCResultType(MVT ValType) const;
+ SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerShift(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerADD(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSUB(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerBinOp(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
+ // Call returns
+ SDValue
+ LowerDirectCallReturn(SDValue Op, SDValue Chain, SDValue FrameAddress,
+ SDValue InFlag, SelectionDAG &DAG);
+ SDValue
+ LowerIndirectCallReturn(SDValue Op, SDValue Chain, SDValue InFlag,
+ SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+ SelectionDAG &DAG);
+
+ // Call arguments
+ SDValue
+ LowerDirectCallArguments(SDValue Op, SDValue Chain, SDValue FrameAddress,
+ SDValue InFlag, SelectionDAG &DAG);
+
+ SDValue
+ LowerIndirectCallArguments(SDValue Op, SDValue Chain, SDValue InFlag,
+ SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+ SelectionDAG &DAG);
+
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+ SDValue getPIC16Cmp(SDValue LHS, SDValue RHS, unsigned OrigCC, SDValue &CC,
+ SelectionDAG &DAG, DebugLoc dl);
+ virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+ virtual void ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG);
+ virtual void LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG);
+
+ SDValue ExpandStore(SDNode *N, SelectionDAG &DAG);
+ SDValue ExpandLoad(SDNode *N, SelectionDAG &DAG);
+ SDValue ExpandGlobalAddress(SDNode *N, SelectionDAG &DAG);
+ SDValue ExpandExternalSymbol(SDNode *N, SelectionDAG &DAG);
+ SDValue ExpandFrameIndex(SDNode *N, SelectionDAG &DAG);
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue PerformPIC16LoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue PerformStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ // This function returns the temp offset for a FrameIndex. If a TmpOffset
+ // already exists for the FI then it returns the same; else it creates a
+ // new offset and returns that.
+ unsigned GetTmpOffsetForFI(unsigned FI, unsigned slot_size);
+ void ResetTmpOffsetMap() { FiTmpOffsetMap.clear(); SetTmpSize(0); }
+ void InitReservedFrameCount(const Function *F);
+
+ // Return the size of Tmp variable
+ unsigned GetTmpSize() { return TmpSize; }
+ void SetTmpSize(unsigned Size) { TmpSize = Size; }
+
+ private:
+ // If the Node is a BUILD_PAIR representing a direct Address,
+ // then this function will return true.
+ bool isDirectAddress(const SDValue &Op);
+
+ // If the Node is a DirectAddress in ROM_SPACE then this
+ // function will return true
+ bool isRomAddress(const SDValue &Op);
+
+ // Extract the Lo and Hi component of Op.
+ void GetExpandedParts(SDValue Op, SelectionDAG &DAG, SDValue &Lo,
+ SDValue &Hi);
+
+
+ // Load pointer can be a direct or indirect address. In PIC16 direct
+ // addresses need Banksel and Indirect addresses need to be loaded to
+ // FSR first. Handle address specific cases here.
+ void LegalizeAddress(SDValue Ptr, SelectionDAG &DAG, SDValue &Lo,
+ SDValue &Hi, unsigned &Offset, DebugLoc dl);
+
+ // FrameIndex should be broken down into ExternalSymbol and FrameOffset.
+ void LegalizeFrameIndex(SDValue Op, SelectionDAG &DAG, SDValue &ES,
+ int &Offset);
+
+
+ // A CALL node should have all legal operands only. Legalize all non-legal
+ // operands of the CALL node and then return the new call with all operands
+ // legal.
+ SDValue LegalizeCALL(SDValue Op, SelectionDAG &DAG);
+
+ // For indirect calls the data address of the callee's frame needs to be
+ // extracted. This function fills the arguments DataAddr_Lo and
+ // DataAddr_Hi with the address of the callee's frame.
+ void GetDataAddress(DebugLoc dl, SDValue Callee, SDValue &Chain,
+ SDValue &DataAddr_Lo, SDValue &DataAddr_Hi,
+ SelectionDAG &DAG);
+
+ // We cannot have both operands of a binary operation in W.
+ // This function is used to put one operand on stack and generate a load.
+ SDValue ConvertToMemOperand(SDValue Op, SelectionDAG &DAG, DebugLoc dl);
+
+ // This function checks if we need to put an operand of an operation on
+ // stack and generate a load or not.
+ bool NeedToConvertToMemOp(SDValue Op, unsigned &MemOp);
+
+ /// Subtarget - Keep a pointer to the PIC16Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const PIC16Subtarget *Subtarget;
+
+
+ // Extending the LIB Call framework of LLVM
+ // to hold the names of PIC16Libcalls.
+ const char *PIC16LibcallNames[PIC16ISD::PIC16UnknownCall];
+
+ // To set and retrieve the lib call names.
+ void setPIC16LibcallName(PIC16ISD::PIC16Libcall Call, const char *Name);
+ const char *getPIC16LibcallName(PIC16ISD::PIC16Libcall Call);
+
+ // Make PIC16 Libcall.
+ SDValue MakePIC16Libcall(PIC16ISD::PIC16Libcall Call, MVT RetVT,
+ const SDValue *Ops, unsigned NumOps, bool isSigned,
+ SelectionDAG &DAG, DebugLoc dl);
+
+ // Check if operation has a direct load operand.
+ inline bool isDirectLoad(const SDValue Op);
+
+ private:
+ // The frameindexes generated for spill/reload are stack based.
+ // This map maintains zero-based indexes for these FIs.
+ std::map<unsigned, unsigned> FiTmpOffsetMap;
+ unsigned TmpSize;
+
+ // These are the frames for return value and argument passing.
+ // These FrameIndices will be expanded to the foo.frame external symbol
+ // and all others will be expanded to the foo.tmp external symbol.
+ unsigned ReservedFrameCount;
+ };
+} // namespace llvm
+
+#endif // PIC16ISELLOWERING_H
diff --git a/lib/Target/PIC16/PIC16InstrFormats.td b/lib/Target/PIC16/PIC16InstrFormats.td
new file mode 100644
index 0000000..e213ea8
--- /dev/null
+++ b/lib/Target/PIC16/PIC16InstrFormats.td
@@ -0,0 +1,117 @@
+//===- PIC16InstrFormats.td - PIC16 Instruction Formats-------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe PIC16 instructions format
+//
+// All the possible PIC16 fields are:
+//
+// opcode - operation code.
+// f - 7-bit register file address.
+// d - 1-bit direction specifier.
+// k - 8/11-bit literals.
+// b - 3-bit bit-number specifier.
+//
+//===----------------------------------------------------------------------===//
+
+// Generic PIC16 Format
+// PIC16 instructions are 14 bits wide.
+
+// FIXME: Add Cooper Specific Formats if any.
+
+class PIC16Inst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<14> Inst;
+
+ let Namespace = "PIC16";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Byte Oriented instruction class in PIC16 : <|opcode|d|f|>
+// opcode = 6 bits.
+// d = direction = 1 bit.
+// f = file register address = 7 bits.
+//===----------------------------------------------------------------------===//
+
+class ByteFormat<bits<6> opcode, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ :PIC16Inst<outs, ins, asmstr, pattern> {
+ bits<1> d;
+ bits<7> f;
+
+ let Inst{13-8} = opcode;
+
+ let Inst{7} = d;
+ let Inst{6-0} = f;
+}
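+
+// For instance (illustrative, assuming the standard PIC16 mid-range
+// encoding): "addwf f, d" would be a ByteFormat instruction with opcode
+// 0b000111, giving Inst{13-8} = 000111, Inst{7} = d, Inst{6-0} = f.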
+
+//===----------------------------------------------------------------------===//
+// Bit Oriented instruction class in PIC16 : <|opcode|b|f|>
+// opcode = 4 bits.
+// b = bit specifier = 3 bits.
+// f = file register address = 7 bits.
+//===----------------------------------------------------------------------===//
+
+class BitFormat<bits<4> opcode, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : PIC16Inst<outs, ins, asmstr, pattern> {
+ bits<3> b;
+ bits<7> f;
+
+ let Inst{13-10} = opcode;
+
+ let Inst{9-7} = b;
+ let Inst{6-0} = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Literal Format instruction class in PIC16 : <|opcode|k|>
+// opcode = 6 bits
+// k = literal = 8 bits
+//===----------------------------------------------------------------------===//
+
+class LiteralFormat<bits<6> opcode, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : PIC16Inst<outs, ins, asmstr, pattern> {
+ bits<8> k;
+
+ let Inst{13-8} = opcode;
+
+ let Inst{7-0} = k;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Format instruction class in PIC16 : <|opcode|k|>
+// opcode = 3 bits.
+// k = jump address = 11 bits.
+//===----------------------------------------------------------------------===//
+
+class ControlFormat<bits<3> opcode, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : PIC16Inst<outs, ins, asmstr, pattern> {
+ bits<11> k;
+
+ let Inst{13-11} = opcode;
+
+ let Inst{10-0} = k;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instruction class in PIC16
+//===----------------------------------------------------------------------===//
+
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : PIC16Inst<outs, ins, asmstr, pattern> {
+ let Inst{13-6} = 0;
+}
diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp
new file mode 100644
index 0000000..2a769e8
--- /dev/null
+++ b/lib/Target/PIC16/PIC16InstrInfo.cpp
@@ -0,0 +1,186 @@
+//===- PIC16InstrInfo.cpp - PIC16 Instruction Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "PIC16InstrInfo.h"
+#include "PIC16TargetMachine.h"
+#include "PIC16GenInstrInfo.inc"
+#include "llvm/Function.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include <cstdio>
+
+
+using namespace llvm;
+
+// FIXME: Add the subtarget support on this constructor.
+PIC16InstrInfo::PIC16InstrInfo(PIC16TargetMachine &tm)
+ : TargetInstrInfoImpl(PIC16Insts, array_lengthof(PIC16Insts)),
+ TM(tm),
+ RegInfo(*this, *TM.getSubtargetImpl()) {}
+
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.
+/// If not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned PIC16InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (MI->getOpcode() == PIC16::movwf
+ && MI->getOperand(0).isReg()
+ && MI->getOperand(1).isSymbol()) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ return 0;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the dest reg along with the FrameIndex of the stack slot.
+/// If not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned PIC16InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (MI->getOpcode() == PIC16::movf
+ && MI->getOperand(0).isReg()
+ && MI->getOperand(1).isSymbol()) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ return 0;
+}
+
+
+void PIC16InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC) const {
+ PIC16TargetLowering *PTLI = TM.getTargetLowering();
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ const Function *Func = MBB.getParent()->getFunction();
+ const std::string FuncName = Func->getName();
+
+ const char *tmpName = createESName(PAN::getTempdataLabel(FuncName));
+
+ // On the order of operands here: think "movwf SrcReg, tmp_slot, offset".
+ if (RC == PIC16::GPRRegisterClass) {
+ //MachineFunction &MF = *MBB.getParent();
+ //MachineRegisterInfo &RI = MF.getRegInfo();
+ BuildMI(MBB, I, DL, get(PIC16::movwf))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addImm(PTLI->GetTmpOffsetForFI(FI, 1))
+ .addExternalSymbol(tmpName)
+ .addImm(1); // Emit banksel for it.
+ }
+ else if (RC == PIC16::FSR16RegisterClass) {
+ // This is a 16-bit register and the frameindex given by llvm is of
+ // size two here. Break this index N into two zero-based indexes and
+ // put one into the map. The second one is always obtained by adding 1
+ // to the first zero-based index. In fact it is going to use 3 slots,
+ // as saving an FSR corrupts W as well, so W needs to be saved/restored too.
+
+ unsigned opcode = (SrcReg == PIC16::FSR0) ? PIC16::save_fsr0
+ : PIC16::save_fsr1;
+ BuildMI(MBB, I, DL, get(opcode))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addImm(PTLI->GetTmpOffsetForFI(FI, 3))
+ .addExternalSymbol(tmpName)
+ .addImm(1); // Emit banksel for it.
+ }
+ else
+ assert(0 && "Can't store this register to stack slot");
+}
+
+void PIC16InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC) const {
+ PIC16TargetLowering *PTLI = TM.getTargetLowering();
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ const Function *Func = MBB.getParent()->getFunction();
+ const std::string FuncName = Func->getName();
+
+ const char *tmpName = createESName(PAN::getTempdataLabel(FuncName));
+
+ // On the order of operands here: think "movf FrameIndex, W".
+ if (RC == PIC16::GPRRegisterClass) {
+ //MachineFunction &MF = *MBB.getParent();
+ //MachineRegisterInfo &RI = MF.getRegInfo();
+ BuildMI(MBB, I, DL, get(PIC16::movf), DestReg)
+ .addImm(PTLI->GetTmpOffsetForFI(FI, 1))
+ .addExternalSymbol(tmpName)
+ .addImm(1); // Emit banksel for it.
+ }
+ else if (RC == PIC16::FSR16RegisterClass) {
+ // This is a 16-bit register and the frameindex given by llvm is of
+ // size two here. Break this index N into two zero-based indexes and
+ // put one into the map. The second one is always obtained by adding 1
+ // to the first zero-based index. In fact it is going to use 3 slots,
+ // as saving an FSR corrupts W as well, so W needs to be saved/restored too.
+
+ unsigned opcode = (DestReg == PIC16::FSR0) ? PIC16::restore_fsr0
+ : PIC16::restore_fsr1;
+ BuildMI(MBB, I, DL, get(opcode), DestReg)
+ .addImm(PTLI->GetTmpOffsetForFI(FI, 3))
+ .addExternalSymbol(tmpName)
+ .addImm(1); // Emit banksel for it.
+ }
+ else
+ assert(0 && "Can't load this register from stack slot");
+}
+
+bool PIC16InstrInfo::copyRegToReg (MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (DestRC == PIC16::FSR16RegisterClass) {
+ BuildMI(MBB, I, DL, get(PIC16::copy_fsr), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ if (DestRC == PIC16::GPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(PIC16::copy_w), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ // Not yet supported.
+ return false;
+}
+
+bool PIC16InstrInfo::isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DestReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+ SrcSubIdx = DstSubIdx = 0; // No sub-registers.
+
+ if (MI.getOpcode() == PIC16::copy_fsr
+ || MI.getOpcode() == PIC16::copy_w) {
+ DestReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ }
+
+ return false;
+}
+
diff --git a/lib/Target/PIC16/PIC16InstrInfo.h b/lib/Target/PIC16/PIC16InstrInfo.h
new file mode 100644
index 0000000..0b67679
--- /dev/null
+++ b/lib/Target/PIC16/PIC16InstrInfo.h
@@ -0,0 +1,70 @@
+//===- PIC16InstrInfo.h - PIC16 Instruction Information----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16INSTRUCTIONINFO_H
+#define PIC16INSTRUCTIONINFO_H
+
+#include "PIC16.h"
+#include "PIC16RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+
+class PIC16InstrInfo : public TargetInstrInfoImpl
+{
+ PIC16TargetMachine &TM;
+ const PIC16RegisterInfo RegInfo;
+public:
+ explicit PIC16InstrInfo(PIC16TargetMachine &TM);
+
+ virtual const PIC16RegisterInfo &getRegisterInfo() const { return RegInfo; }
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ };
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16InstrInfo.td b/lib/Target/PIC16/PIC16InstrInfo.td
new file mode 100644
index 0000000..c572188
--- /dev/null
+++ b/lib/Target/PIC16/PIC16InstrInfo.td
@@ -0,0 +1,522 @@
+//===- PIC16InstrInfo.td - PIC16 Instruction defs ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PIC16 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// PIC16 Specific Type Constraints.
+//===----------------------------------------------------------------------===//
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Specific Type Profiles.
+//===----------------------------------------------------------------------===//
+
+// Generic type profiles for i8/i16 unary/binary operations.
+// Taking one i8 or i16 and producing void.
+def SDTI8VoidOp : SDTypeProfile<0, 1, [SDTCisI8<0>]>;
+def SDTI16VoidOp : SDTypeProfile<0, 1, [SDTCisI16<0>]>;
+
+// Taking one value and producing an output of same type.
+def SDTI8UnaryOp : SDTypeProfile<1, 1, [SDTCisI8<0>, SDTCisI8<1>]>;
+def SDTI16UnaryOp : SDTypeProfile<1, 1, [SDTCisI16<0>, SDTCisI16<1>]>;
+
+// Taking two values and producing an output of same type.
+def SDTI8BinOp : SDTypeProfile<1, 2, [SDTCisI8<0>, SDTCisI8<1>, SDTCisI8<2>]>;
+def SDTI16BinOp : SDTypeProfile<1, 2, [SDTCisI16<0>, SDTCisI16<1>,
+ SDTCisI16<2>]>;
+
+// Node specific type profiles.
+def SDT_PIC16Load : SDTypeProfile<1, 3, [SDTCisI8<0>, SDTCisI8<1>,
+ SDTCisI8<2>, SDTCisI8<3>]>;
+
+def SDT_PIC16Store : SDTypeProfile<0, 4, [SDTCisI8<0>, SDTCisI8<1>,
+ SDTCisI8<2>, SDTCisI8<3>]>;
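+
+// For PIC16Load the operands are (ptrlo, ptrhi, offset) and the result is
+// the loaded value; for PIC16Store they are (val, ptrlo, ptrhi, offset).
+// This matches the "val = W, ptrlo = GA, ptrhi = banksel" convention used
+// by the movf/movwf instruction definitions below.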
+
+def SDT_PIC16Connect : SDTypeProfile<1, 2, [SDTCisI8<0>, SDTCisI8<1>,
+ SDTCisI8<2>]>;
+
+// PIC16ISD::CALL type profile
+def SDT_PIC16call : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def SDT_PIC16callw : SDTypeProfile<1, -1, [SDTCisInt<0>]>;
+
+// PIC16ISD::BRCOND
+def SDT_PIC16Brcond: SDTypeProfile<0, 2,
+ [SDTCisVT<0, OtherVT>, SDTCisI8<1>]>;
+
+// PIC16ISD::SELECT_ICC
+def SDT_PIC16Selecticc: SDTypeProfile<1, 3,
+ [SDTCisI8<0>, SDTCisI8<1>, SDTCisI8<2>,
+ SDTCisI8<3>]>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 addressing modes matching via DAG.
+//===----------------------------------------------------------------------===//
+def diraddr : ComplexPattern<i8, 1, "SelectDirectAddr", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+def PIC16callseq_start : SDNode<"ISD::CALLSEQ_START", SDTI8VoidOp,
+ [SDNPHasChain, SDNPOutFlag]>;
+def PIC16callseq_end : SDNode<"ISD::CALLSEQ_END", SDTI8VoidOp,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Low 8-bits of GlobalAddress.
+def PIC16Lo : SDNode<"PIC16ISD::Lo", SDTI8BinOp>;
+
+// High 8-bits of GlobalAddress.
+def PIC16Hi : SDNode<"PIC16ISD::Hi", SDTI8BinOp>;
+
+// The MTHI and MTLO nodes are used only to match them in the incoming
+// DAG for replacement by corresponding set_fsrhi, set_fsrlo instructions.
+// These nodes are not used for defining any instructions.
+def MTLO : SDNode<"PIC16ISD::MTLO", SDTI8UnaryOp>;
+def MTHI : SDNode<"PIC16ISD::MTHI", SDTI8UnaryOp>;
+def MTPCLATH : SDNode<"PIC16ISD::MTPCLATH", SDTI8UnaryOp>;
+
+// Node to generate Bank Select for a GlobalAddress.
+def Banksel : SDNode<"PIC16ISD::Banksel", SDTI8UnaryOp>;
+
+// Node to match a direct store operation.
+def PIC16Store : SDNode<"PIC16ISD::PIC16Store", SDT_PIC16Store, [SDNPHasChain]>;
+def PIC16StWF : SDNode<"PIC16ISD::PIC16StWF", SDT_PIC16Store,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+// Node to match a direct load operation.
+def PIC16Load : SDNode<"PIC16ISD::PIC16Load", SDT_PIC16Load, [SDNPHasChain]>;
+def PIC16LdArg : SDNode<"PIC16ISD::PIC16LdArg", SDT_PIC16Load, [SDNPHasChain]>;
+def PIC16LdWF : SDNode<"PIC16ISD::PIC16LdWF", SDT_PIC16Load,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def PIC16Connect: SDNode<"PIC16ISD::PIC16Connect", SDT_PIC16Connect, []>;
+
+// Node to match PIC16 call
+def PIC16call : SDNode<"PIC16ISD::CALL", SDT_PIC16call,
+ [SDNPHasChain , SDNPOptInFlag, SDNPOutFlag]>;
+def PIC16callw : SDNode<"PIC16ISD::CALLW", SDT_PIC16callw,
+ [SDNPHasChain , SDNPOptInFlag, SDNPOutFlag]>;
+
+// Node to match a comparison instruction.
+def PIC16Subcc : SDNode<"PIC16ISD::SUBCC", SDTI8BinOp, [SDNPOutFlag]>;
+
+// Node to match a conditional branch.
+def PIC16Brcond : SDNode<"PIC16ISD::BRCOND", SDT_PIC16Brcond,
+ [SDNPHasChain, SDNPInFlag]>;
+
+def PIC16Selecticc : SDNode<"PIC16ISD::SELECT_ICC", SDT_PIC16Selecticc,
+ [SDNPInFlag]>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Operand Definitions.
+//===----------------------------------------------------------------------===//
+def i8mem : Operand<i8>;
+def brtarget: Operand<OtherVT>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+ def CCOp : Operand<i8>;
+
+include "PIC16InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PIC16 Common Classes.
+//===----------------------------------------------------------------------===//
+
+// W = W Op F : Load the value from F and do Op to W.
+let isTwoAddress = 1, mayLoad = 1 in
+class BinOpFW<bits<6> OpCode, string OpcStr, SDNode OpNode>:
+ ByteFormat<OpCode, (outs GPR:$dst),
+ (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+ !strconcat(OpcStr, " $ptrlo + $offset, W"),
+ [(set GPR:$dst, (OpNode GPR:$src, (PIC16Load diraddr:$ptrlo,
+ (i8 imm:$ptrhi),
+ (i8 imm:$offset))))]>;
+
+// F = F Op W : Load the value from F, do op with W and store in F.
+// This insn class is not marked as TwoAddress because the reg is
+// being used as a source operand only. (Remember a TwoAddress insn
+// needs a copyRegToReg.)
+let mayStore = 1 in
+class BinOpWF<bits<6> OpCode, string OpcStr, SDNode OpNode>:
+ ByteFormat<OpCode, (outs),
+ (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+ !strconcat(OpcStr, " $ptrlo + $offset"),
+ [(PIC16Store (OpNode GPR:$src, (PIC16Load diraddr:$ptrlo,
+ (i8 imm:$ptrhi),
+ (i8 imm:$offset))),
+ diraddr:$ptrlo,
+ (i8 imm:$ptrhi), (i8 imm:$offset)
+ )]>;
+
+// W = W Op L : Do Op of L with W and place result in W.
+let isTwoAddress = 1 in
+class BinOpLW<bits<6> opcode, string OpcStr, SDNode OpNode> :
+ LiteralFormat<opcode, (outs GPR:$dst),
+ (ins GPR:$src, i8imm:$literal),
+ !strconcat(OpcStr, " $literal"),
+ [(set GPR:$dst, (OpNode GPR:$src, (i8 imm:$literal)))]>;
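+
+// For example, the "addlw_1" definition below instantiates this class as
+// BinOpLW<0, "addlw", add>, giving an insn that prints as "addlw $literal"
+// and adds an 8-bit literal into W via the two-address $src/$dst pair.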
+
+//===----------------------------------------------------------------------===//
+// PIC16 Instructions.
+//===----------------------------------------------------------------------===//
+
+// Pseudo-instructions.
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i8imm:$amt),
+ "!ADJCALLSTACKDOWN $amt",
+ [(PIC16callseq_start imm:$amt)]>;
+
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i8imm:$amt),
+ "!ADJCALLSTACKUP $amt",
+ [(PIC16callseq_end imm:$amt)]>;
+
+//-----------------------------------
+// Various movlw insn patterns.
+//-----------------------------------
+let isReMaterializable = 1 in {
+// Move 8-bit literal to W.
+def movlw : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src),
+ "movlw $src",
+ [(set GPR:$dst, (i8 imm:$src))]>;
+
+// Move a Lo(TGA) to W.
+def movlw_lo_1 : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src, i8imm:$src2),
+ "movlw LOW(${src}) + ${src2}",
+ [(set GPR:$dst, (PIC16Lo tglobaladdr:$src, imm:$src2 ))]>;
+
+// Move a Lo(TES) to W.
+def movlw_lo_2 : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src, i8imm:$src2),
+ "movlw LOW(${src}) + ${src2}",
+ [(set GPR:$dst, (PIC16Lo texternalsym:$src, imm:$src2 ))]>;
+
+// Move a Hi(TGA) to W.
+def movlw_hi_1 : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src, i8imm:$src2),
+ "movlw HIGH(${src}) + ${src2}",
+ [(set GPR:$dst, (PIC16Hi tglobaladdr:$src, imm:$src2))]>;
+
+// Move a Hi(TES) to W.
+def movlw_hi_2 : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src, i8imm:$src2),
+ "movlw HIGH(${src}) + ${src2}",
+ [(set GPR:$dst, (PIC16Hi texternalsym:$src, imm:$src2))]>;
+}
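+
+// Illustrative use of the patterns above: a 16-bit address is materialized
+// one byte at a time into W, e.g.
+//   movlw LOW(@var) + 0     ; PIC16Lo  -> movlw_lo_1
+//   movlw HIGH(@var) + 0    ; PIC16Hi  -> movlw_hi_1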
+
+//-------------------
+// FSR setting insns.
+//-------------------
+// These insns are matched via a DAG replacement pattern.
+def set_fsrlo:
+ ByteFormat<0, (outs FSR16:$fsr),
+ (ins GPR:$val),
+ "movwf ${fsr}L",
+ []>;
+
+let isTwoAddress = 1 in
+def set_fsrhi:
+ ByteFormat<0, (outs FSR16:$dst),
+ (ins FSR16:$src, GPR:$val),
+ "movwf ${dst}H",
+ []>;
+
+def set_pclath:
+ ByteFormat<0, (outs PCLATHR:$dst),
+ (ins GPR:$val),
+ "movwf ${dst}",
+ [(set PCLATHR:$dst , (MTPCLATH GPR:$val))]>;
+
+//----------------------------
+// copyRegToReg
+// copyRegToReg insns. These are dummies; they should always be deleted
+// by the optimizer and never be present in the final generated code.
+// If they are, then we have to write correct macros for these insns.
+//----------------------------
+def copy_fsr:
+ Pseudo<(outs FSR16:$dst), (ins FSR16:$src), "copy_fsr $dst, $src", []>;
+
+def copy_w:
+ Pseudo<(outs GPR:$dst), (ins GPR:$src), "copy_w $dst, $src", []>;
+
+class SAVE_FSR<string OpcStr>:
+ Pseudo<(outs),
+ (ins FSR16:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+ !strconcat(OpcStr, " $ptrlo, $offset"),
+ []>;
+
+def save_fsr0: SAVE_FSR<"save_fsr0">;
+def save_fsr1: SAVE_FSR<"save_fsr1">;
+
+class RESTORE_FSR<string OpcStr>:
+ Pseudo<(outs FSR16:$dst),
+ (ins i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+ !strconcat(OpcStr, " $ptrlo, $offset"),
+ []>;
+
+def restore_fsr0: RESTORE_FSR<"restore_fsr0">;
+def restore_fsr1: RESTORE_FSR<"restore_fsr1">;
+
+//--------------------------
+// Store to memory
+//-------------------------
+
+// Direct store.
+// Input operands are: val = W, ptrlo = GA, offset = offset, ptrhi = banksel.
+let mayStore = 1 in
+class MOVWF_INSN<bits<6> OpCode, SDNode OpNodeDest, SDNode Op>:
+ ByteFormat<0, (outs),
+ (ins GPR:$val, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+ "movwf ${ptrlo} + ${offset}",
+ [(Op GPR:$val, OpNodeDest:$ptrlo, (i8 imm:$ptrhi),
+ (i8 imm:$offset))]>;
+
+// Store W to a Global Address.
+def movwf : MOVWF_INSN<0, tglobaladdr, PIC16Store>;
+
+// Store W to an External Symbol.
+def movwf_1 : MOVWF_INSN<0, texternalsym, PIC16Store>;
+
+// Store with InFlag and OutFlag.
+// This is the same as movwf_1 but has a flag. The flag is required to
+// order the stores while passing the params to a function.
+def movwf_2 : MOVWF_INSN<0, texternalsym, PIC16StWF>;
+
+// Indirect store. Matched via a DAG replacement pattern.
+def store_indirect :
+ ByteFormat<0, (outs),
+ (ins GPR:$val, FSR16:$fsr, i8imm:$offset),
+ "movwi $offset[$fsr]",
+ []>;
+
+//----------------------------
+// Load from memory
+//----------------------------
+// Direct load.
+// Input Operands are: ptrlo = GA, offset = offset, ptrhi = banksel.
+// Output: dst = W
+let mayLoad = 1 in
+class MOVF_INSN<bits<6> OpCode, SDNode OpNodeSrc, SDNode Op>:
+ ByteFormat<0, (outs GPR:$dst),
+ (ins i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+ "movf ${ptrlo} + ${offset}, W",
+ [(set GPR:$dst,
+ (Op OpNodeSrc:$ptrlo, (i8 imm:$ptrhi),
+ (i8 imm:$offset)))]>;
+
+// Load from a GA.
+def movf : MOVF_INSN<0, tglobaladdr, PIC16Load>;
+
+// Load from an ES.
+def movf_1 : MOVF_INSN<0, texternalsym, PIC16Load>;
+def movf_1_1 : MOVF_INSN<0, texternalsym, PIC16LdArg>;
+
+// Load with InFlag and OutFlag.
+// This is the same as movf_1 but has a flag. The flag is required to
+// order the loads while copying the return value of a function.
+def movf_2 : MOVF_INSN<0, texternalsym, PIC16LdWF>;
+
+// Indirect load. Matched via a DAG replacement pattern.
+def load_indirect :
+ ByteFormat<0, (outs GPR:$dst),
+ (ins FSR16:$fsr, i8imm:$offset),
+ "moviw $offset[$fsr]",
+ []>;
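+
+// Note: load_indirect and store_indirect carry no patterns themselves; they
+// are selected by the DAG replacement patterns at the end of this file,
+// which also materialize the FSR from its lo/hi address bytes.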
+
+//-------------------------
+// Bitwise operations patterns
+//--------------------------
+// W = W op [F]
+let Defs = [STATUS] in {
+def OrFW : BinOpFW<0, "iorwf", or>;
+def XOrFW : BinOpFW<0, "xorwf", xor>;
+def AndFW : BinOpFW<0, "andwf", and>;
+
+// F = W op [F]
+def OrWF : BinOpWF<0, "iorwf", or>;
+def XOrWF : BinOpWF<0, "xorwf", xor>;
+def AndWF : BinOpWF<0, "andwf", and>;
+
+//-------------------------
+// Various add/sub patterns.
+//-------------------------
+
+// W = W + [F]
+def addfw_1: BinOpFW<0, "addwf", add>;
+def addfw_2: BinOpFW<0, "addwf", addc>;
+
+let Uses = [STATUS] in
+def addfwc: BinOpFW<0, "addwfc", adde>; // With Carry.
+
+// F = W + [F]
+def addwf_1: BinOpWF<0, "addwf", add>;
+def addwf_2: BinOpWF<0, "addwf", addc>;
+let Uses = [STATUS] in
+def addwfc: BinOpWF<0, "addwfc", adde>; // With Carry.
+}
+
+// W -= [F] ; load from F and sub the value from W.
+let isTwoAddress = 1, mayLoad = 1 in
+class SUBFW<bits<6> OpCode, string OpcStr, SDNode OpNode>:
+ ByteFormat<OpCode, (outs GPR:$dst),
+ (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+ !strconcat(OpcStr, " $ptrlo + $offset, W"),
+ [(set GPR:$dst, (OpNode (PIC16Load diraddr:$ptrlo,
+ (i8 imm:$ptrhi), (i8 imm:$offset)),
+ GPR:$src))]>;
+let Defs = [STATUS] in {
+def subfw_1: SUBFW<0, "subwf", sub>;
+def subfw_2: SUBFW<0, "subwf", subc>;
+
+let Uses = [STATUS] in
+def subfwb: SUBFW<0, "subwfb", sube>; // With Borrow.
+
+def subfw_cc: SUBFW<0, "subwf", PIC16Subcc>;
+}
+
+// [F] -= W ;
+let mayStore = 1 in
+class SUBWF<bits<6> OpCode, string OpcStr, SDNode OpNode>:
+ ByteFormat<OpCode, (outs),
+ (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+ !strconcat(OpcStr, " $ptrlo + $offset"),
+ [(PIC16Store (OpNode (PIC16Load diraddr:$ptrlo,
+ (i8 imm:$ptrhi), (i8 imm:$offset)),
+ GPR:$src), diraddr:$ptrlo,
+ (i8 imm:$ptrhi), (i8 imm:$offset))]>;
+
+let Defs = [STATUS] in {
+def subwf_1: SUBWF<0, "subwf", sub>;
+def subwf_2: SUBWF<0, "subwf", subc>;
+
+let Uses = [STATUS] in
+ def subwfb: SUBWF<0, "subwfb", sube>; // With Borrow.
+
+def subwf_cc: SUBWF<0, "subwf", PIC16Subcc>;
+}
+
+// addlw
+let Defs = [STATUS] in {
+def addlw_1 : BinOpLW<0, "addlw", add>;
+def addlw_2 : BinOpLW<0, "addlw", addc>;
+
+let Uses = [STATUS] in
+def addlwc : BinOpLW<0, "addlwc", adde>; // With Carry. (Assembler macro).
+
+// bitwise operations involving a literal and w.
+def andlw : BinOpLW<0, "andlw", and>;
+def xorlw : BinOpLW<0, "xorlw", xor>;
+def orlw : BinOpLW<0, "iorlw", or>;
+}
+
+// sublw
+// W = C - W ; sub W from literal. (Without borrow).
+let isTwoAddress = 1 in
+class SUBLW<bits<6> opcode, SDNode OpNode> :
+ LiteralFormat<opcode, (outs GPR:$dst),
+ (ins GPR:$src, i8imm:$literal),
+ "sublw $literal",
+ [(set GPR:$dst, (OpNode (i8 imm:$literal), GPR:$src))]>;
+
+let Defs = [STATUS] in {
+def sublw_1 : SUBLW<0, sub>;
+def sublw_2 : SUBLW<0, subc>;
+def sublw_cc : SUBLW<0, PIC16Subcc>;
+}
+
+// Call instruction.
+let isCall = 1,
+ Defs = [W, FSR0, FSR1] in {
+ def CALL: LiteralFormat<0x1, (outs), (ins i8imm:$func),
+ //"call ${func} + 2",
+ "call ${func}",
+ [(PIC16call diraddr:$func)]>;
+}
+
+let isCall = 1,
+ Defs = [W, FSR0, FSR1] in {
+ def CALL_1: LiteralFormat<0x1, (outs), (ins GPR:$func, PCLATHR:$pc),
+ "callw",
+ [(PIC16call (PIC16Connect GPR:$func, PCLATHR:$pc))]>;
+}
+
+let isCall = 1,
+ Defs = [FSR0, FSR1] in {
+ def CALLW: LiteralFormat<0x1, (outs GPR:$dest),
+ (ins GPR:$func, PCLATHR:$pc),
+ "callw",
+ [(set GPR:$dest, (PIC16callw (PIC16Connect GPR:$func, PCLATHR:$pc)))]>;
+}
+
+let Uses = [STATUS], isBranch = 1, isTerminator = 1, hasDelaySlot = 0 in
+def pic16brcond: ControlFormat<0x0, (outs), (ins brtarget:$dst, CCOp:$cc),
+ "b$cc $dst",
+ [(PIC16Brcond bb:$dst, imm:$cc)]>;
+
+// Unconditional branch.
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 0 in
+def br_uncond: ControlFormat<0x0, (outs), (ins brtarget:$dst),
+ "goto $dst",
+ [(br bb:$dst)]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded by the
+// scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+ def SELECT_CC_Int_ICC
+ : Pseudo<(outs GPR:$dst), (ins GPR:$T, GPR:$F, i8imm:$Cond),
+ "; SELECT_CC_Int_ICC PSEUDO!",
+ [(set GPR:$dst, (PIC16Selecticc GPR:$T, GPR:$F,
+ imm:$Cond))]>;
+}
+
+
+// Banksel.
+def banksel :
+ Pseudo<(outs),
+ (ins i8mem:$ptr),
+ "banksel $ptr",
+ []>;
+
+def pagesel :
+ Pseudo<(outs),
+ (ins i8mem:$ptr),
+ "movlp $ptr",
+ []>;
+
+
+// Return insn.
+def Return :
+ ControlFormat<0, (outs), (ins), "return", [(ret)]>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Replacement Patterns.
+//===----------------------------------------------------------------------===//
+
+// Identify an indirect store and select insns for it.
+def : Pat<(PIC16Store GPR:$val, (MTLO GPR:$loaddr), (MTHI GPR:$hiaddr),
+ imm:$offset),
+ (store_indirect GPR:$val,
+ (set_fsrhi (set_fsrlo GPR:$loaddr), GPR:$hiaddr),
+ imm:$offset)>;
+
+def : Pat<(PIC16StWF GPR:$val, (MTLO GPR:$loaddr), (MTHI GPR:$hiaddr),
+ imm:$offset),
+ (store_indirect GPR:$val,
+ (set_fsrhi (set_fsrlo GPR:$loaddr), GPR:$hiaddr),
+ imm:$offset)>;
+
+// Identify an indirect load and select insns for it.
+def : Pat<(PIC16Load (MTLO GPR:$loaddr), (MTHI GPR:$hiaddr),
+ imm:$offset),
+ (load_indirect (set_fsrhi (set_fsrlo GPR:$loaddr), GPR:$hiaddr),
+ imm:$offset)>;
+
+def : Pat<(PIC16LdWF (MTLO GPR:$loaddr), (MTHI GPR:$hiaddr),
+ imm:$offset),
+ (load_indirect (set_fsrhi (set_fsrlo GPR:$loaddr), GPR:$hiaddr),
+ imm:$offset)>;
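+
+// Illustrative expansion (not emitted code): storing W through a 16-bit
+// pointer whose bytes are in $loaddr/$hiaddr selects roughly:
+//   movwf FSR0L            ; set_fsrlo
+//   movwf FSR0H            ; set_fsrhi
+//   movwi $offset[FSR0]    ; store_indirect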
+
diff --git a/lib/Target/PIC16/PIC16MemSelOpt.cpp b/lib/Target/PIC16/PIC16MemSelOpt.cpp
new file mode 100644
index 0000000..20f926d
--- /dev/null
+++ b/lib/Target/PIC16/PIC16MemSelOpt.cpp
@@ -0,0 +1,169 @@
+//===-- PIC16MemSelOpt.cpp - PIC16 banksel optimizer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that optimizes the emission of banksel
+// instructions before data memory accesses. It currently works within
+// a basic block only and keeps track of the last accessed memory bank.
+// If a memory access stays in the same bank, it changes the banksel
+// immediate, which is a part of the insn accessing the data memory, from 1
+// to zero. The asm printer emits a banksel only if that immediate is 1.
+//
+// FIXME: this is not implemented yet. The banksel pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-codegen"
+#include "PIC16.h"
+#include "PIC16InstrInfo.h"
+#include "PIC16TargetAsmInfo.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/Compiler.h"
+
+using namespace llvm;
+
+namespace {
+ struct VISIBILITY_HIDDEN MemSelOpt : public MachineFunctionPass {
+ static char ID;
+ MemSelOpt() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "PIC16 Memsel Optimizer";
+ }
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+ bool processInstruction(MachineInstr *MI);
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+ MachineBasicBlock *MBB; // Current basic block
+ std::string CurBank;
+
+ };
+ char MemSelOpt::ID = 0;
+}
+
+FunctionPass *llvm::createPIC16MemSelOptimizerPass() {
+ return new MemSelOpt();
+}
+
+
+/// runOnMachineFunction - Loop over all of the basic blocks, optimizing
+/// banksel emission for the instructions that access data memory.
+///
+bool MemSelOpt::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+ bool Changed = false;
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ Changed |= processBasicBlock(MF, *I);
+ }
+
+ return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// eliding redundant banksels for accesses that stay in the same bank.
+///
+bool MemSelOpt::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+
+  // Assume that no bank is selected when entering a basic block.
+  // Ideally we should look at the predecessors for this information.
+ CurBank="";
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ Changed |= processInstruction(I);
+ }
+ return Changed;
+}
+
+bool MemSelOpt::processInstruction(MachineInstr *MI) {
+ bool Changed = false;
+
+ unsigned NumOperands = MI->getNumOperands();
+ if (NumOperands == 0) return false;
+
+
+ // If this insn is not going to access any memory, return.
+ const TargetInstrDesc &TID = TII->get(MI->getOpcode());
+ if (! (TID.isCall() || TID.mayLoad() || TID.mayStore()))
+ return false;
+
+ // Scan for the memory address operand.
+ // FIXME: Should we use standard interfaces like memoperands_iterator,
+ // hasMemOperand() etc ?
+ int MemOpPos = -1;
+ for (unsigned i = 0; i < NumOperands; i++) {
+ MachineOperand Op = MI->getOperand(i);
+ if (Op.getType() == MachineOperand::MO_GlobalAddress ||
+ Op.getType() == MachineOperand::MO_ExternalSymbol) {
+ // We found one mem operand. Next one should be BS.
+ MemOpPos = i;
+ break;
+ }
+ }
+
+  // If we did not find a memory operand, there is nothing to do.
+ if (MemOpPos == -1) return Changed;
+
+ // Get the MemOp.
+ MachineOperand &Op = MI->getOperand(MemOpPos);
+
+  // Calls need a pagesel before them; handle that first.
+ if (MI->getOpcode() == PIC16::CALL) {
+ DebugLoc dl = MI->getDebugLoc();
+ BuildMI(*MBB, MI, dl, TII->get(PIC16::pagesel)).
+ addOperand(Op);
+ return true;
+ }
+
+  // Get the section name (NewBank) for MemOp.
+  // This assumes that the section names for globals are already set by
+  // AsmPrinter->doInitialization.
+ std::string NewBank = CurBank;
+ if (Op.getType() == MachineOperand::MO_GlobalAddress &&
+ Op.getGlobal()->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE) {
+ NewBank = Op.getGlobal()->getSection();
+ } else if (Op.getType() == MachineOperand::MO_ExternalSymbol) {
+    // External Symbols are generated for temp data and arguments; they
+    // live in the fpdata.<functionname>.# section.
+ std::string Sym = Op.getSymbolName();
+ NewBank = PAN::getSectionNameForSym(Sym);
+ }
+
+  // If the previous and new section names are the same, we don't need to
+  // emit a banksel.
+ if (NewBank.compare(CurBank) != 0 ) {
+ DebugLoc dl = MI->getDebugLoc();
+ BuildMI(*MBB, MI, dl, TII->get(PIC16::banksel)).
+ addOperand(Op);
+ Changed = true;
+ CurBank = NewBank;
+ }
+
+ return Changed;
+}
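+
+// Illustrative effect of this pass (not actual output): for two consecutive
+// accesses to globals placed in the same udata section, only the first
+// access keeps its banksel:
+//   banksel @a
+//   movwf   @a
+//   movwf   @b      ; banksel elided, @b is in the same bank as @a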
+
diff --git a/lib/Target/PIC16/PIC16RegisterInfo.cpp b/lib/Target/PIC16/PIC16RegisterInfo.cpp
new file mode 100644
index 0000000..eb758d8
--- /dev/null
+++ b/lib/Target/PIC16/PIC16RegisterInfo.cpp
@@ -0,0 +1,91 @@
+//===- PIC16RegisterInfo.cpp - PIC16 Register Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-reg-info"
+
+#include "PIC16.h"
+#include "PIC16RegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+
+
+using namespace llvm;
+
+PIC16RegisterInfo::PIC16RegisterInfo(const TargetInstrInfo &tii,
+ const PIC16Subtarget &st)
+ : PIC16GenRegisterInfo(PIC16::ADJCALLSTACKDOWN, PIC16::ADJCALLSTACKUP),
+ TII(tii),
+ ST(st) {}
+
+#include "PIC16GenRegisterInfo.inc"
+
+/// PIC16 Callee Saved Registers
+const unsigned* PIC16RegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const unsigned CalleeSavedRegs[] = { 0 };
+ return CalleeSavedRegs;
+}
+
+// PIC16 Callee Saved Reg Classes
+const TargetRegisterClass* const*
+PIC16RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+ return CalleeSavedRegClasses;
+}
+
+BitVector PIC16RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ return Reserved;
+}
+
+bool PIC16RegisterInfo::hasFP(const MachineFunction &MF) const {
+ return false;
+}
+
+void PIC16RegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ RegScavenger *RS) const
+{ /* NOT YET IMPLEMENTED */ }
+
+void PIC16RegisterInfo::emitPrologue(MachineFunction &MF) const
+{ /* NOT YET IMPLEMENTED */ }
+
+void PIC16RegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+{ /* NOT YET IMPLEMENTED */ }
+
+int PIC16RegisterInfo::
+getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ assert(0 && "Not keeping track of debug information yet!!");
+ return -1;
+}
+
+unsigned PIC16RegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ assert(0 && "PIC16 Does not have any frame register");
+ return 0;
+}
+
+unsigned PIC16RegisterInfo::getRARegister() const {
+ assert(0 && "PIC16 Does not have any return address register");
+ return 0;
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void PIC16RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ // Simply discard ADJCALLSTACKDOWN,
+ // ADJCALLSTACKUP instructions.
+ MBB.erase(I);
+}
+
diff --git a/lib/Target/PIC16/PIC16RegisterInfo.h b/lib/Target/PIC16/PIC16RegisterInfo.h
new file mode 100644
index 0000000..83689d0
--- /dev/null
+++ b/lib/Target/PIC16/PIC16RegisterInfo.h
@@ -0,0 +1,68 @@
+//===- PIC16RegisterInfo.h - PIC16 Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16REGISTERINFO_H
+#define PIC16REGISTERINFO_H
+
+#include "PIC16GenRegisterInfo.h.inc"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+// Forward Declarations.
+ class PIC16Subtarget;
+ class TargetInstrInfo;
+
+class PIC16RegisterInfo : public PIC16GenRegisterInfo {
+ private:
+ const TargetInstrInfo &TII;
+ const PIC16Subtarget &ST;
+
+ public:
+ PIC16RegisterInfo(const TargetInstrInfo &tii,
+ const PIC16Subtarget &st);
+
+
+ //------------------------------------------------------
+ // Pure virtual functions from TargetRegisterInfo
+ //------------------------------------------------------
+
+ // PIC16 callee saved registers
+ virtual const unsigned*
+ getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ // PIC16 callee saved register classes
+ virtual const TargetRegisterClass* const *
+ getCalleeSavedRegClasses(const MachineFunction *MF) const;
+
+ virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+ virtual bool hasFP(const MachineFunction &MF) const;
+
+ virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, RegScavenger *RS=NULL) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ virtual void emitPrologue(MachineFunction &MF) const;
+ virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+ virtual unsigned getFrameRegister(MachineFunction &MF) const;
+ virtual unsigned getRARegister() const;
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16RegisterInfo.td b/lib/Target/PIC16/PIC16RegisterInfo.td
new file mode 100644
index 0000000..2959d91
--- /dev/null
+++ b/lib/Target/PIC16/PIC16RegisterInfo.td
@@ -0,0 +1,33 @@
+//===- PIC16RegisterInfo.td - PIC16 Register defs ------------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the PIC16 register file
+//===----------------------------------------------------------------------===//
+
+class PIC16Reg<string n> : Register<n> {
+ let Namespace = "PIC16";
+}
+
+// PIC16 Registers.
+def W : PIC16Reg<"W">;
+def FSR0 : PIC16Reg<"FSR0">;
+def FSR1 : PIC16Reg<"FSR1">;
+def BS : PIC16Reg<"BS">;
+def PCLATH : PIC16Reg<"PCLATH">;
+
+def STATUS : PIC16Reg<"STATUS">;
+
+// PIC16 Register classes.
+def GPR : RegisterClass<"PIC16", [i8], 8, [W]>;
+def FSR16 : RegisterClass<"PIC16", [i16], 8, [FSR0, FSR1]>;
+def BSR : RegisterClass<"PIC16", [i8], 8, [BS]>;
+def PCLATHR : RegisterClass<"PIC16", [i8], 8, [PCLATH]>;
+def STATUSR : RegisterClass<"PIC16", [i8], 8, [STATUS]>;
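+
+// Note that GPR contains only W: every 8-bit computation goes through the
+// single working register, which is why PIC16InstrInfo.td phrases its
+// patterns as "W = W Op F" / "F = F Op W".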
+
diff --git a/lib/Target/PIC16/PIC16Subtarget.cpp b/lib/Target/PIC16/PIC16Subtarget.cpp
new file mode 100644
index 0000000..db8a5d8
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Subtarget.cpp
@@ -0,0 +1,27 @@
+//===- PIC16Subtarget.cpp - PIC16 Subtarget Information -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PIC16 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16Subtarget.h"
+#include "PIC16GenSubtarget.inc"
+
+using namespace llvm;
+
+PIC16Subtarget::PIC16Subtarget(const Module &M, const std::string &FS,
+ bool Cooper)
+ :IsCooper(Cooper)
+{
+ std::string CPU = "generic";
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/PIC16/PIC16Subtarget.h b/lib/Target/PIC16/PIC16Subtarget.h
new file mode 100644
index 0000000..e5147a0
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Subtarget.h
@@ -0,0 +1,45 @@
+//=====-- PIC16Subtarget.h - Define Subtarget for the PIC16 ---*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PIC16 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16SUBTARGET_H
+#define PIC16SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+class PIC16Subtarget : public TargetSubtarget {
+
+ // IsCooper - Target ISA is Cooper.
+ bool IsCooper;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ ///
+ PIC16Subtarget(const Module &M, const std::string &FS, bool Cooper);
+
+ /// isCooper - Returns true if the target ISA is Cooper.
+ bool isCooper() const { return IsCooper; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+};
+} // End llvm namespace
+
+#endif // PIC16SUBTARGET_H
diff --git a/lib/Target/PIC16/PIC16TargetAsmInfo.cpp b/lib/Target/PIC16/PIC16TargetAsmInfo.cpp
new file mode 100644
index 0000000..d2657f0
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetAsmInfo.cpp
@@ -0,0 +1,264 @@
+//===-- PIC16TargetAsmInfo.cpp - PIC16 asm properties ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the PIC16TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16TargetAsmInfo.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/DerivedTypes.h"
+
+using namespace llvm;
+
+PIC16TargetAsmInfo::
+PIC16TargetAsmInfo(const PIC16TargetMachine &TM)
+ : TargetAsmInfo(TM) {
+ CommentString = ";";
+ GlobalPrefix = PAN::getTagName(PAN::PREFIX_SYMBOL);
+ GlobalDirective = "\tglobal\t";
+ ExternDirective = "\textern\t";
+
+ Data8bitsDirective = " db ";
+ Data16bitsDirective = " dw ";
+ Data32bitsDirective = " dl ";
+ RomData8bitsDirective = " dw ";
+ RomData16bitsDirective = " rom_di ";
+ RomData32bitsDirective = " rom_dl ";
+ ZeroDirective = NULL;
+ AsciiDirective = " dt ";
+ AscizDirective = NULL;
+ BSSSection_ = getNamedSection("udata.# UDATA",
+ SectionFlags::Writeable | SectionFlags::BSS);
+ ReadOnlySection = getNamedSection("romdata.# ROMDATA", SectionFlags::None);
+ DataSection = getNamedSection("idata.# IDATA", SectionFlags::Writeable);
+ SwitchToSectionDirective = "";
+  // Needed because otherwise a .text symbol is emitted by DwarfWriter
+  // in BeginModule, and gpasm complains about that .text symbol.
+ TextSection = getUnnamedSection("", SectionFlags::Code);
+ ROSection = new PIC16Section(getReadOnlySection());
+ ExternalVarDecls = new PIC16Section(getNamedSection("ExternalVarDecls"));
+ ExternalVarDefs = new PIC16Section(getNamedSection("ExternalVarDefs"));
+  // Set it to false because we need to generate the .c file name and not
+  // the .bc file name.
+ HasSingleParameterDotFile = false;
+}
+
+const char *PIC16TargetAsmInfo::getRomDirective(unsigned size) const
+{
+ if (size == 8)
+ return RomData8bitsDirective;
+ else if (size == 16)
+ return RomData16bitsDirective;
+ else if (size == 32)
+ return RomData32bitsDirective;
+ else
+ return NULL;
+}
+
+
+const char *PIC16TargetAsmInfo::getASDirective(unsigned size,
+ unsigned AS) const {
+ if (AS == PIC16ISD::ROM_SPACE)
+ return getRomDirective(size);
+ else
+ return NULL;
+}
+
+const Section *
+PIC16TargetAsmInfo::getBSSSectionForGlobal(const GlobalVariable *GV) const {
+ assert (GV->hasInitializer() && "This global doesn't need space");
+ Constant *C = GV->getInitializer();
+  assert (C->isNullValue() && "Uninitialized global has a non-zero initializer");
+
+ // Find how much space this global needs.
+ const TargetData *TD = TM.getTargetData();
+ const Type *Ty = C->getType();
+ unsigned ValSize = TD->getTypeAllocSize(Ty);
+
+ // Go through all BSS Sections and assign this variable
+ // to the first available section having enough space.
+ PIC16Section *FoundBSS = NULL;
+ for (unsigned i = 0; i < BSSSections.size(); i++) {
+ if (DataBankSize - BSSSections[i]->Size >= ValSize) {
+ FoundBSS = BSSSections[i];
+ break;
+ }
+ }
+
+  // No BSS section spacious enough was found. Create a new one.
+ if (! FoundBSS) {
+ std::string name = PAN::getUdataSectionName(BSSSections.size());
+ const Section *NewSection = getNamedSection (name.c_str());
+
+ FoundBSS = new PIC16Section(NewSection);
+
+ // Add this newly created BSS section to the list of BSSSections.
+ BSSSections.push_back(FoundBSS);
+ }
+
+ // Insert the GV into this BSS.
+ FoundBSS->Items.push_back(GV);
+ FoundBSS->Size += ValSize;
+
+  // We can't do this here because GV is const.
+ // const std::string SName = FoundBSS->S_->getName();
+ // GV->setSection(SName);
+
+ return FoundBSS->S_;
+}
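+
+// Illustrative first-fit behavior: with DataBankSize = 80, a 60-byte global
+// followed by a 30-byte one lands in two different udata sections
+// (80 - 60 < 30), while a later 15-byte global still fits back into the
+// first section.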
+
+const Section *
+PIC16TargetAsmInfo::getIDATASectionForGlobal(const GlobalVariable *GV) const {
+ assert (GV->hasInitializer() && "This global doesn't need space");
+ Constant *C = GV->getInitializer();
+  assert (!C->isNullValue() && "Initialized global has a zero initializer");
+ assert (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE &&
+ "can split initialized RAM data only");
+
+ // Find how much space this global needs.
+ const TargetData *TD = TM.getTargetData();
+ const Type *Ty = C->getType();
+ unsigned ValSize = TD->getTypeAllocSize(Ty);
+
+ // Go through all IDATA Sections and assign this variable
+ // to the first available section having enough space.
+ PIC16Section *FoundIDATA = NULL;
+ for (unsigned i = 0; i < IDATASections.size(); i++) {
+ if ( DataBankSize - IDATASections[i]->Size >= ValSize) {
+ FoundIDATA = IDATASections[i];
+ break;
+ }
+ }
+
+  // No IDATA section spacious enough was found. Create a new one.
+ if (! FoundIDATA) {
+ std::string name = PAN::getIdataSectionName(IDATASections.size());
+ const Section *NewSection = getNamedSection (name.c_str());
+
+ FoundIDATA = new PIC16Section(NewSection);
+
+ // Add this newly created IDATA section to the list of IDATASections.
+ IDATASections.push_back(FoundIDATA);
+ }
+
+ // Insert the GV into this IDATA.
+ FoundIDATA->Items.push_back(GV);
+ FoundIDATA->Size += ValSize;
+
+  // We can't do this here because GV is const.
+ // GV->setSection(FoundIDATA->S->getName());
+
+ return FoundIDATA->S_;
+}
+
+// Get the section for an automatic variable of a function.
+// For PIC16 they are globals only with mangled names.
+const Section *
+PIC16TargetAsmInfo::getSectionForAuto(const GlobalVariable *GV) const {
+
+ const std::string name = PAN::getSectionNameForSym(GV->getName());
+
+ // Go through all Auto Sections and assign this variable
+ // to the appropriate section.
+ PIC16Section *FoundAutoSec = NULL;
+ for (unsigned i = 0; i < AutosSections.size(); i++) {
+ if ( AutosSections[i]->S_->getName() == name) {
+ FoundAutoSec = AutosSections[i];
+ break;
+ }
+ }
+
+  // No Auto section was found. Create a new one.
+ if (! FoundAutoSec) {
+ const Section *NewSection = getNamedSection (name.c_str());
+
+ FoundAutoSec = new PIC16Section(NewSection);
+
+ // Add this newly created autos section to the list of AutosSections.
+ AutosSections.push_back(FoundAutoSec);
+ }
+
+ // Insert the auto into this section.
+ FoundAutoSec->Items.push_back(GV);
+
+ return FoundAutoSec->S_;
+}
+
+
+// Override default implementation to put the true globals into
+// multiple data sections if required.
+const Section*
+PIC16TargetAsmInfo::SelectSectionForGlobal(const GlobalValue *GV1) const {
+ // We select the section based on the initializer here, so it really
+ // has to be a GlobalVariable.
+ const GlobalVariable *GV = dyn_cast<GlobalVariable>(GV1);
+
+ if (!GV)
+ return TargetAsmInfo::SelectSectionForGlobal(GV1);
+
+  // Record External Var Decls.
+ if (GV->isDeclaration()) {
+ ExternalVarDecls->Items.push_back(GV);
+ return ExternalVarDecls->S_;
+ }
+
+ assert (GV->hasInitializer() && "A def without initializer?");
+
+ // First, if this is an automatic variable for a function, get the section
+ // name for it and return.
+ const std::string name = GV->getName();
+ if (PAN::isLocalName(name)) {
+ return getSectionForAuto(GV);
+ }
+
+  // Record External Var Defs.
+ if (GV->hasExternalLinkage() || GV->hasCommonLinkage()) {
+ ExternalVarDefs->Items.push_back(GV);
+ }
+
+ // See if this is an uninitialized global.
+ const Constant *C = GV->getInitializer();
+ if (C->isNullValue())
+ return getBSSSectionForGlobal(GV);
+
+  // If this is initialized data in RAM, put it in the correct IDATA section.
+ if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE)
+ return getIDATASectionForGlobal(GV);
+
+  // This is initialized data in ROM; put it in the read-only section.
+ if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE) {
+ ROSection->Items.push_back(GV);
+ return ROSection->S_;
+ }
+
+ // Else let the default implementation take care of it.
+ return TargetAsmInfo::SelectSectionForGlobal(GV);
+}
+
+PIC16TargetAsmInfo::~PIC16TargetAsmInfo() {
+
+ for (unsigned i = 0; i < BSSSections.size(); i++) {
+ delete BSSSections[i];
+ }
+
+ for (unsigned i = 0; i < IDATASections.size(); i++) {
+ delete IDATASections[i];
+ }
+
+ for (unsigned i = 0; i < AutosSections.size(); i++) {
+ delete AutosSections[i];
+ }
+
+ delete ROSection;
+ delete ExternalVarDecls;
+ delete ExternalVarDefs;
+}
diff --git a/lib/Target/PIC16/PIC16TargetAsmInfo.h b/lib/Target/PIC16/PIC16TargetAsmInfo.h
new file mode 100644
index 0000000..e464e36
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetAsmInfo.h
@@ -0,0 +1,79 @@
+//=====-- PIC16TargetAsmInfo.h - PIC16 asm properties ---------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the PIC16TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16TARGETASMINFO_H
+#define PIC16TARGETASMINFO_H
+
+#include "PIC16.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include <vector>
+#include "llvm/Module.h"
+#define DataBankSize 80
+namespace llvm {
+
+ // Forward declaration.
+ class PIC16TargetMachine;
+ class GlobalVariable;
+
+  // PIC16 splits the global data into multiple udata and idata sections.
+  // Each udata and idata section needs to keep a list of the globals it
+  // contains, in order to avoid scanning over all the global values
+  // again and printing only those that match the current section.
+  // Keeping values inside the sections makes printing a section much easier.
+ struct PIC16Section {
+ const Section *S_; // Connection to actual Section.
+ unsigned Size; // Total size of the objects contained.
+ std::vector<const GlobalVariable*> Items;
+
+ PIC16Section (const Section *s) { S_ = s; Size = 0; }
+ };
+
+ struct PIC16TargetAsmInfo : public TargetAsmInfo {
+ std::string getSectionNameForSym(const std::string &Sym) const;
+ PIC16TargetAsmInfo(const PIC16TargetMachine &TM);
+ mutable std::vector<PIC16Section *> BSSSections;
+ mutable std::vector<PIC16Section *> IDATASections;
+ mutable std::vector<PIC16Section *> AutosSections;
+ mutable PIC16Section *ROSection;
+ mutable PIC16Section *ExternalVarDecls;
+ mutable PIC16Section *ExternalVarDefs;
+ virtual ~PIC16TargetAsmInfo();
+
+ private:
+ const char *RomData8bitsDirective;
+ const char *RomData16bitsDirective;
+ const char *RomData32bitsDirective;
+ const char *getRomDirective(unsigned size) const;
+ virtual const char *getASDirective(unsigned size, unsigned AS) const;
+ const Section *getBSSSectionForGlobal(const GlobalVariable *GV) const;
+ const Section *getIDATASectionForGlobal(const GlobalVariable *GV) const;
+ const Section *getSectionForAuto(const GlobalVariable *GV) const;
+ virtual const Section *SelectSectionForGlobal(const GlobalValue *GV) const;
+
+
+ public:
+ void SetSectionForGVs(Module &M);
+ std::vector<PIC16Section *> getBSSSections() const {
+ return BSSSections;
+ }
+ std::vector<PIC16Section *> getIDATASections() const {
+ return IDATASections;
+ }
+ std::vector<PIC16Section *> getAutosSections() const {
+ return AutosSections;
+ }
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16TargetMachine.cpp b/lib/Target/PIC16/PIC16TargetMachine.cpp
new file mode 100644
index 0000000..bda6326
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetMachine.cpp
@@ -0,0 +1,79 @@
+//===-- PIC16TargetMachine.cpp - Define TargetMachine for PIC16 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PIC16 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "PIC16TargetAsmInfo.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+
+using namespace llvm;
+
+/// PIC16TargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int PIC16TargetMachineModule;
+int PIC16TargetMachineModule = 0;
+
+
+// Register the targets
+static RegisterTarget<PIC16TargetMachine>
+X("pic16", "PIC16 14-bit [experimental].");
+static RegisterTarget<CooperTargetMachine>
+Y("cooper", "PIC16 Cooper [experimental].");
+
+// PIC16TargetMachine - Traditional PIC16 Machine.
+PIC16TargetMachine::PIC16TargetMachine(const Module &M, const std::string &FS,
+ bool Cooper)
+: Subtarget(M, FS, Cooper),
+ DataLayout("e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8"),
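+  // "e" = little-endian; pointers are 16 bits wide, and pointers and all
+  // integer types have 8-bit ABI and preferred alignment.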
+ InstrInfo(*this), TLInfo(*this),
+ FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0) { }
+
+// CooperTargetMachine - Uses the same PIC16TargetMachine, but makes IsCooper
+// as true.
+CooperTargetMachine::CooperTargetMachine(const Module &M, const std::string &FS)
+ : PIC16TargetMachine(M, FS, true) {}
+
+
+const TargetAsmInfo *PIC16TargetMachine::createTargetAsmInfo() const {
+ return new PIC16TargetAsmInfo(*this);
+}
+
+bool PIC16TargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Install an instruction selector.
+ PM.add(createPIC16ISelDag(*this));
+ return false;
+}
+
+bool PIC16TargetMachine::
+addAssemblyEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out) {
+ // Output assembly language.
+ PM.add(createPIC16CodePrinterPass(Out, *this, OptLevel, Verbose));
+ return false;
+}
+
+bool PIC16TargetMachine::addPostRegAlloc(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createPIC16MemSelOptimizerPass());
+ return true; // -print-machineinstr should print after this.
+}
+
+
diff --git a/lib/Target/PIC16/PIC16TargetMachine.h b/lib/Target/PIC16/PIC16TargetMachine.h
new file mode 100644
index 0000000..7f62d5c
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetMachine.h
@@ -0,0 +1,76 @@
+//===-- PIC16TargetMachine.h - Define TargetMachine for PIC16 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PIC16 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef PIC16_TARGETMACHINE_H
+#define PIC16_TARGETMACHINE_H
+
+#include "PIC16InstrInfo.h"
+#include "PIC16ISelLowering.h"
+#include "PIC16RegisterInfo.h"
+#include "PIC16Subtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// PIC16TargetMachine
+///
+class PIC16TargetMachine : public LLVMTargetMachine {
+ PIC16Subtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ PIC16InstrInfo InstrInfo;
+ PIC16TargetLowering TLInfo;
+
+  // PIC16 does not have any call stack frame, so there is no
+  // PIC16-specific FrameInfo class.
+ TargetFrameInfo FrameInfo;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+ PIC16TargetMachine(const Module &M, const std::string &FS,
+ bool Cooper = false);
+
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual const PIC16InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetData *getTargetData() const { return &DataLayout;}
+ virtual const PIC16Subtarget *getSubtargetImpl() const { return &Subtarget; }
+
+ virtual const PIC16RegisterInfo *getRegisterInfo() const {
+ return &(InstrInfo.getRegisterInfo());
+ }
+
+ virtual PIC16TargetLowering *getTargetLowering() const {
+ return const_cast<PIC16TargetLowering*>(&TLInfo);
+ }
+
+ virtual bool addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+ virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+}; // PIC16TargetMachine.
+
+/// CooperTargetMachine
+class CooperTargetMachine : public PIC16TargetMachine {
+public:
+ CooperTargetMachine(const Module &M, const std::string &FS);
+}; // CooperTargetMachine.
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt b/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..1ed483a
--- /dev/null
+++ b/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_partially_linked_object(LLVMPowerPCAsmPrinter
+ PPCAsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMPowerPCCodeGen n)
+
+add_dependencies(LLVMPowerPCAsmPrinter ${n})
diff --git a/lib/Target/PowerPC/AsmPrinter/Makefile b/lib/Target/PowerPC/AsmPrinter/Makefile
new file mode 100644
index 0000000..269ef92
--- /dev/null
+++ b/lib/Target/PowerPC/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PowerPC/AsmPrinter/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPowerPCAsmPrinter
+
+# Hack: we need to include 'main' PowerPC target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
new file mode 100644
index 0000000..7723982
--- /dev/null
+++ b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
@@ -0,0 +1,1204 @@
+//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly --------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PowerPC assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+// Documentation at http://developer.apple.com/documentation/DeveloperTools/
+// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asmprinter"
+#include "PPC.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSet.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class VISIBILITY_HIDDEN PPCAsmPrinter : public AsmPrinter {
+ protected:
+ StringSet<> FnStubs, GVStubs, HiddenGVStubs;
+ const PPCSubtarget &Subtarget;
+ public:
+ explicit PPCAsmPrinter(raw_ostream &O, TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V),
+ Subtarget(TM.getSubtarget<PPCSubtarget>()) {}
+
+ virtual const char *getPassName() const {
+ return "PowerPC Assembly Printer";
+ }
+
+ PPCTargetMachine &getTM() {
+ return static_cast<PPCTargetMachine&>(TM);
+ }
+
+ unsigned enumRegToMachineReg(unsigned enumReg) {
+ switch (enumReg) {
+ default: assert(0 && "Unhandled register!"); break;
+ case PPC::CR0: return 0;
+ case PPC::CR1: return 1;
+ case PPC::CR2: return 2;
+ case PPC::CR3: return 3;
+ case PPC::CR4: return 4;
+ case PPC::CR5: return 5;
+ case PPC::CR6: return 6;
+ case PPC::CR7: return 7;
+ }
+ abort();
+ }
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ void printMachineInstruction(const MachineInstr *MI);
+ void printOp(const MachineOperand &MO);
+
+ /// stripRegisterPrefix - This method strips the character prefix from a
+    /// register name so that only the number is left.  Used for Linux asm.
+ const char *stripRegisterPrefix(const char *RegName) {
+ switch (RegName[0]) {
+ case 'r':
+ case 'f':
+ case 'v': return RegName + 1;
+ case 'c': if (RegName[1] == 'r') return RegName + 2;
+ }
+
+ return RegName;
+ }
+
+ /// printRegister - Print register according to target requirements.
+ ///
+ void printRegister(const MachineOperand &MO, bool R0AsZero) {
+ unsigned RegNo = MO.getReg();
+ assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??");
+
+ // If we should use 0 for R0.
+ if (R0AsZero && RegNo == PPC::R0) {
+ O << "0";
+ return;
+ }
+
+ const char *RegName = TM.getRegisterInfo()->get(RegNo).AsmName;
+ // Linux assembler (Others?) does not take register mnemonics.
+ // FIXME - What about special registers used in mfspr/mtspr?
+ if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+ O << RegName;
+ }
+
+ void printOperand(const MachineInstr *MI, unsigned OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (MO.isReg()) {
+ printRegister(MO, false);
+ } else if (MO.isImm()) {
+ O << MO.getImm();
+ } else {
+ printOp(MO);
+ }
+ }
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+
+
+ void printS5ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ char value = MI->getOperand(OpNo).getImm();
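+      // Sign-extend the low 5 bits by shifting them to the top of a 32-bit
+      // int and arithmetic-shifting back down.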
+ value = (value << (32-5)) >> (32-5);
+ O << (int)value;
+ }
+ void printU5ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ unsigned char value = MI->getOperand(OpNo).getImm();
+ assert(value <= 31 && "Invalid u5imm argument!");
+ O << (unsigned int)value;
+ }
+ void printU6ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ unsigned char value = MI->getOperand(OpNo).getImm();
+ assert(value <= 63 && "Invalid u6imm argument!");
+ O << (unsigned int)value;
+ }
+ void printS16ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ O << (short)MI->getOperand(OpNo).getImm();
+ }
+ void printU16ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ O << (unsigned short)MI->getOperand(OpNo).getImm();
+ }
+ void printS16X4ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+ if (MI->getOperand(OpNo).isImm()) {
+ O << (short)(MI->getOperand(OpNo).getImm()*4);
+ } else {
+ O << "lo16(";
+ printOp(MI->getOperand(OpNo));
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ O << "-\"L" << getFunctionNumber() << "$pb\")";
+ else
+ O << ')';
+ }
+ }
+ void printBranchOperand(const MachineInstr *MI, unsigned OpNo) {
+ // Branches can take an immediate operand. This is used by the branch
+ // selection pass to print, e.g., "$+8", an eight-byte displacement from
+ // the PC.
+ if (MI->getOperand(OpNo).isImm()) {
+ O << "$+" << MI->getOperand(OpNo).getImm()*4;
+ } else {
+ printOp(MI->getOperand(OpNo));
+ }
+ }
+ void printCallOperand(const MachineInstr *MI, unsigned OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (TM.getRelocationModel() != Reloc::Static) {
+ if (MO.getType() == MachineOperand::MO_GlobalAddress) {
+ GlobalValue *GV = MO.getGlobal();
+ if (GV->isDeclaration() || GV->hasWeakLinkage() ||
+ GV->hasLinkOnceLinkage() || GV->hasCommonLinkage()) {
+ // Dynamically-resolved functions need a stub for the function.
+ std::string Name = Mang->getValueName(GV);
+ FnStubs.insert(Name);
+ printSuffixedName(Name, "$stub");
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ return;
+ }
+ }
+ if (MO.getType() == MachineOperand::MO_ExternalSymbol) {
+ std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName();
+ FnStubs.insert(Name);
+ printSuffixedName(Name, "$stub");
+ return;
+ }
+ }
+
+ printOp(MI->getOperand(OpNo));
+ }
+ void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo) {
+ O << (int)MI->getOperand(OpNo).getImm()*4;
+ }
+ void printPICLabel(const MachineInstr *MI, unsigned OpNo) {
+ O << "\"L" << getFunctionNumber() << "$pb\"\n";
+ O << "\"L" << getFunctionNumber() << "$pb\":";
+ }
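+ // printSymbolHi/printSymbolLo print the high-adjusted and low halves of
+ // a symbol address; Darwin syntax is ha16()/lo16(), Linux is @ha/@l.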
+ void printSymbolHi(const MachineInstr *MI, unsigned OpNo) {
+ if (MI->getOperand(OpNo).isImm()) {
+ printS16ImmOperand(MI, OpNo);
+ } else {
+ if (Subtarget.isDarwin()) O << "ha16(";
+ printOp(MI->getOperand(OpNo));
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ O << "-\"L" << getFunctionNumber() << "$pb\"";
+ if (Subtarget.isDarwin())
+ O << ')';
+ else
+ O << "@ha";
+ }
+ }
+ void printSymbolLo(const MachineInstr *MI, unsigned OpNo) {
+ if (MI->getOperand(OpNo).isImm()) {
+ printS16ImmOperand(MI, OpNo);
+ } else {
+ if (Subtarget.isDarwin()) O << "lo16(";
+ printOp(MI->getOperand(OpNo));
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ O << "-\"L" << getFunctionNumber() << "$pb\"";
+ if (Subtarget.isDarwin())
+ O << ')';
+ else
+ O << "@l";
+ }
+ }
+ void printcrbitm(const MachineInstr *MI, unsigned OpNo) {
+ unsigned CCReg = MI->getOperand(OpNo).getReg();
+ unsigned RegNo = enumRegToMachineReg(CCReg);
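+ // Form the 8-bit field mask used by mtcrf/mfocrf: bit (7 - RegNo) is
+ // set, so CR0 prints as 128 and CR7 prints as 1.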
+ O << (0x80 >> RegNo);
+ }
+ // The new addressing mode printers.
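+ // printMemRegImm prints a D-form address as "offset(base)"; r0 used as
+ // the base register reads as constant zero, so it is printed as "0".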
+ void printMemRegImm(const MachineInstr *MI, unsigned OpNo) {
+ printSymbolLo(MI, OpNo);
+ O << '(';
+ if (MI->getOperand(OpNo+1).isReg() &&
+ MI->getOperand(OpNo+1).getReg() == PPC::R0)
+ O << "0";
+ else
+ printOperand(MI, OpNo+1);
+ O << ')';
+ }
+ void printMemRegImmShifted(const MachineInstr *MI, unsigned OpNo) {
+ if (MI->getOperand(OpNo).isImm())
+ printS16X4ImmOperand(MI, OpNo);
+ else
+ printSymbolLo(MI, OpNo);
+ O << '(';
+ if (MI->getOperand(OpNo+1).isReg() &&
+ MI->getOperand(OpNo+1).getReg() == PPC::R0)
+ O << "0";
+ else
+ printOperand(MI, OpNo+1);
+ O << ')';
+ }
+
+ void printMemRegReg(const MachineInstr *MI, unsigned OpNo) {
+ // When used as the base register, r0 reads constant zero rather than
+ // the value contained in the register. For this reason, the darwin
+ // assembler requires that we print r0 as 0 (no r) when used as the base.
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ printRegister(MO, true);
+ O << ", ";
+ printOperand(MI, OpNo+1);
+ }
+
+ void printPredicateOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier);
+
+ virtual bool runOnMachineFunction(MachineFunction &F) = 0;
+ virtual bool doFinalization(Module &M) = 0;
+
+ virtual void EmitExternalGlobal(const GlobalVariable *GV);
+ };
+
+ /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
+ class VISIBILITY_HIDDEN PPCLinuxAsmPrinter : public PPCAsmPrinter {
+ DwarfWriter *DW;
+ MachineModuleInfo *MMI;
+ public:
+ explicit PPCLinuxAsmPrinter(raw_ostream &O, PPCTargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : PPCAsmPrinter(O, TM, T, OL, V), DW(0), MMI(0) {}
+
+ virtual const char *getPassName() const {
+ return "Linux PPC Assembly Printer";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<DwarfWriter>();
+ PPCAsmPrinter::getAnalysisUsage(AU);
+ }
+
+ void printModuleLevelGV(const GlobalVariable* GVar);
+ };
+
+ /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
+ /// OS X
+ class VISIBILITY_HIDDEN PPCDarwinAsmPrinter : public PPCAsmPrinter {
+ DwarfWriter *DW;
+ MachineModuleInfo *MMI;
+ raw_ostream &OS;
+ public:
+ explicit PPCDarwinAsmPrinter(raw_ostream &O, PPCTargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : PPCAsmPrinter(O, TM, T, OL, V), DW(0), MMI(0), OS(O) {}
+
+ virtual const char *getPassName() const {
+ return "Darwin PPC Assembly Printer";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<DwarfWriter>();
+ PPCAsmPrinter::getAnalysisUsage(AU);
+ }
+
+ void printModuleLevelGV(const GlobalVariable* GVar);
+ };
+} // end of anonymous namespace
+
+// Include the auto-generated portion of the assembly writer
+#include "PPCGenAsmWriter.inc"
+
+void PPCAsmPrinter::printOp(const MachineOperand &MO) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Immediate:
+ cerr << "printOp() does not handle immediate values\n";
+ abort();
+ return;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+ case MachineOperand::MO_JumpTableIndex:
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ // FIXME: PIC relocation model
+ return;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ return;
+ case MachineOperand::MO_ExternalSymbol:
+ // Computing the address of an external symbol, not calling it.
+ if (TM.getRelocationModel() != Reloc::Static) {
+ std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName();
+ GVStubs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ return;
+ }
+ O << TAI->getGlobalPrefix() << MO.getSymbolName();
+ return;
+ case MachineOperand::MO_GlobalAddress: {
+ // Computing the address of a global symbol, not calling it.
+ GlobalValue *GV = MO.getGlobal();
+ std::string Name = Mang->getValueName(GV);
+
+ // External or weakly linked global variables need non-lazily-resolved stubs
+ if (TM.getRelocationModel() != Reloc::Static) {
+ if (GV->isDeclaration() || GV->isWeakForLinker()) {
+ if (GV->hasHiddenVisibility()) {
+ if (!GV->isDeclaration() && !GV->hasCommonLinkage())
+ O << Name;
+ else {
+ HiddenGVStubs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ }
+ } else {
+ GVStubs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ }
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ return;
+ }
+ }
+ O << Name;
+
+ printOffset(MO.getOffset());
+
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ return;
+ }
+
+ default:
+ O << "<unknown operand type: " << MO.getType() << ">";
+ return;
+ }
+}
+
+/// EmitExternalGlobal - In this case we need to use the indirect symbol.
+///
+void PPCAsmPrinter::EmitExternalGlobal(const GlobalVariable *GV) {
+ std::string Name;
+ getGlobalLinkName(GV, Name);
+ if (TM.getRelocationModel() != Reloc::Static) {
+ if (GV->hasHiddenVisibility())
+ HiddenGVStubs.insert(Name);
+ else
+ GVStubs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ return;
+ }
+ O << Name;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'c': // Don't print "$" before a global var name or constant.
+ // PPC never has a prefix.
+ printOperand(MI, OpNo);
+ return false;
+ case 'L': // Write second word of DImode reference.
+ // Verify that this operand has two consecutive registers.
+ if (!MI->getOperand(OpNo).isReg() ||
+ OpNo+1 == MI->getNumOperands() ||
+ !MI->getOperand(OpNo+1).isReg())
+ return true;
+ ++OpNo; // Return the high-part.
+ break;
+ case 'I':
+ // Write 'i' if an integer constant, otherwise nothing. Used to print
+ // addi vs add, etc.
+ if (MI->getOperand(OpNo).isImm())
+ O << "i";
+ return false;
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+ if (MI->getOperand(OpNo).isReg())
+ printMemRegReg(MI, OpNo);
+ else
+ printMemRegImm(MI, OpNo);
+ return false;
+}
+
+void PPCAsmPrinter::printPredicateOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier) {
+ assert(Modifier && "Must specify 'cc' or 'reg' as predicate op modifier!");
+ unsigned Code = MI->getOperand(OpNo).getImm();
+ if (!strcmp(Modifier, "cc")) {
+ switch ((PPC::Predicate)Code) {
+ case PPC::PRED_ALWAYS: return; // Don't print anything for always.
+ case PPC::PRED_LT: O << "lt"; return;
+ case PPC::PRED_LE: O << "le"; return;
+ case PPC::PRED_EQ: O << "eq"; return;
+ case PPC::PRED_GE: O << "ge"; return;
+ case PPC::PRED_GT: O << "gt"; return;
+ case PPC::PRED_NE: O << "ne"; return;
+ case PPC::PRED_UN: O << "un"; return;
+ case PPC::PRED_NU: O << "nu"; return;
+ }
+
+ } else {
+ assert(!strcmp(Modifier, "reg") &&
+ "Need to specify 'cc' or 'reg' as predicate op modifier!");
+ // Don't print the register for 'always'.
+ if (Code == PPC::PRED_ALWAYS) return;
+ printOperand(MI, OpNo+1);
+ }
+}
+
+
+/// printMachineInstruction - Print out a single PowerPC MI to the current
+/// output stream.
+///
+void PPCAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // Check for slwi/srwi mnemonics.
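+ // slwi rA,rS,SH is rlwinm rA,rS,SH,0,31-SH, and srwi rA,rS,SH is
+ // rlwinm rA,rS,32-SH,SH,31; recognize those mask patterns here.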
+ if (MI->getOpcode() == PPC::RLWINM) {
+ bool FoundMnemonic = false;
+ unsigned char SH = MI->getOperand(2).getImm();
+ unsigned char MB = MI->getOperand(3).getImm();
+ unsigned char ME = MI->getOperand(4).getImm();
+ if (SH <= 31 && MB == 0 && ME == (31-SH)) {
+ O << "\tslwi "; FoundMnemonic = true;
+ }
+ if (SH <= 31 && MB == (32-SH) && ME == 31) {
+ O << "\tsrwi "; FoundMnemonic = true;
+ SH = 32-SH;
+ }
+ if (FoundMnemonic) {
+ printOperand(MI, 0);
+ O << ", ";
+ printOperand(MI, 1);
+ O << ", " << (unsigned int)SH << '\n';
+ return;
+ }
+ } else if (MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) {
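+ // "or rA,rS,rS" is the canonical form of "mr rA,rS".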
+ if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+ O << "\tmr ";
+ printOperand(MI, 0);
+ O << ", ";
+ printOperand(MI, 1);
+ O << '\n';
+ return;
+ }
+ } else if (MI->getOpcode() == PPC::RLDICR) {
+ unsigned char SH = MI->getOperand(2).getImm();
+ unsigned char ME = MI->getOperand(3).getImm();
+ // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
+ if (63-SH == ME) {
+ O << "\tsldi ";
+ printOperand(MI, 0);
+ O << ", ";
+ printOperand(MI, 1);
+ O << ", " << (unsigned int)SH << '\n';
+ return;
+ }
+ }
+
+ if (printInstruction(MI))
+ return; // Printer was automatically generated
+
+ assert(0 && "Unhandled instruction in asm writer!");
+ abort();
+ return;
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool PPCLinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::PrivateLinkage:
+ case Function::InternalLinkage: // Symbols default to internal.
+ break;
+ case Function::ExternalLinkage:
+ O << "\t.global\t" << CurrentFnName << '\n'
+ << "\t.type\t" << CurrentFnName << ", @function\n";
+ break;
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ O << "\t.global\t" << CurrentFnName << '\n';
+ O << "\t.weak\t" << CurrentFnName << '\n';
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ EmitAlignment(2, F);
+ O << CurrentFnName << ":\n";
+
+ // Emit pre-function debug information.
+ DW->BeginFunction(&MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ O << "\t.size\t" << CurrentFnName << ",.-" << CurrentFnName << '\n';
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ // Emit post-function debug information.
+ DW->EndFunction(&MF);
+
+ O.flush();
+
+ // We didn't modify anything.
+ return false;
+}
+
+bool PPCLinuxAsmPrinter::doInitialization(Module &M) {
+ bool Result = AsmPrinter::doInitialization(M);
+
+ // Emit initial debug information.
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ assert(MMI);
+ DW = getAnalysisIfAvailable<DwarfWriter>();
+ assert(DW && "DwarfWriter is not available");
+ DW->BeginModule(&M, MMI, O, this, TAI);
+
+ // GNU as handles section names wrapped in quotes
+ Mang->setUseQuotes(true);
+
+ SwitchToSection(TAI->getTextSection());
+
+ return Result;
+}
+
+/// PrintUnmangledNameSafely - Print out the printable characters in the name.
+/// Don't print things like \\n or \\0.
+static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
+ for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
+ Name != E; ++Name)
+ if (isprint(*Name))
+ OS << *Name;
+}
+
+void PPCLinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+ return; // External globals require no code.
+
+ // Check to see if this is a special global used by LLVM; if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar))
+ return;
+
+ std::string name = Mang->getValueName(GVar);
+
+ printVisibility(name, GVar->getVisibility());
+
+ Constant *C = GVar->getInitializer();
+ const Type *Type = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Type);
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ if (C->isNullValue() && /* FIXME: Verify correct */
+ !GVar->hasSection() &&
+ (GVar->hasLocalLinkage() || GVar->hasExternalLinkage() ||
+ GVar->isWeakForLinker())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (GVar->hasExternalLinkage()) {
+ O << "\t.global " << name << '\n';
+ O << "\t.type " << name << ", @object\n";
+ O << name << ":\n";
+ O << "\t.zero " << Size << '\n';
+ } else if (GVar->hasLocalLinkage()) {
+ O << TAI->getLCOMMDirective() << name << ',' << Size;
+ } else {
+ O << ".comm " << name << ',' << Size;
+ }
+ if (VerboseAsm) {
+ O << "\t\t" << TAI->getCommentString() << " '";
+ PrintUnmangledNameSafely(GVar, O);
+ O << "'";
+ }
+ O << '\n';
+ return;
+ }
+
+ switch (GVar->getLinkage()) {
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::CommonLinkage:
+ O << "\t.global " << name << '\n'
+ << "\t.type " << name << ", @object\n"
+ << "\t.weak " << name << '\n';
+ break;
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section named after
+ // them or something similar. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << "\t.global " << name << '\n'
+ << "\t.type " << name << ", @object\n";
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ break;
+ default:
+ cerr << "Unknown linkage type!";
+ abort();
+ }
+
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm) {
+ O << "\t\t\t\t" << TAI->getCommentString() << " '";
+ PrintUnmangledNameSafely(GVar, O);
+ O << "'";
+ }
+ O << '\n';
+
+ // If the initializer is an extern weak symbol, remember to emit the weak
+ // reference!
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ EmitGlobalConstant(C);
+ O << '\n';
+}
+
+bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ printModuleLevelGV(I);
+
+ // TODO
+
+ // Emit final debug information.
+ DW->EndModule();
+
+ return AsmPrinter::doFinalization(M);
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool PPCDarwinAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::PrivateLinkage:
+ case Function::InternalLinkage: // Symbols default to internal.
+ break;
+ case Function::ExternalLinkage:
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ break;
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ O << "\t.weak_definition\t" << CurrentFnName << '\n';
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
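+ // EmitAlignment takes a log2 value: align to 4 bytes when optimizing for
+ // size, 16 bytes otherwise.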
+ EmitAlignment(F->hasFnAttr(Attribute::OptimizeForSize) ? 2 : 4, F);
+ O << CurrentFnName << ":\n";
+
+ // Emit pre-function debug information.
+ DW->BeginFunction(&MF);
+
+ // If the function is empty, then we need to emit *something*; otherwise the
+ // function's label might end up attached to whatever follows it in the
+ // section. We emit a nop in this situation.
+ MachineFunction::iterator I = MF.begin();
+
+ if (++I == MF.end() && MF.front().empty())
+ O << "\tnop\n";
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ printBasicBlockLabel(I, true, true, VerboseAsm);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
+ II != IE; ++II) {
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ // Emit post-function debug information.
+ DW->EndFunction(&MF);
+
+ // We didn't modify anything.
+ return false;
+}
+
+
+bool PPCDarwinAsmPrinter::doInitialization(Module &M) {
+ static const char *const CPUDirectives[] = {
+ "",
+ "ppc",
+ "ppc601",
+ "ppc602",
+ "ppc603",
+ "ppc7400",
+ "ppc750",
+ "ppc970",
+ "ppc64"
+ };
+
+ unsigned Directive = Subtarget.getDarwinDirective();
+ if (Subtarget.isGigaProcessor() && Directive < PPC::DIR_970)
+ Directive = PPC::DIR_970;
+ if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400)
+ Directive = PPC::DIR_7400;
+ if (Subtarget.isPPC64() && Directive < PPC::DIR_970)
+ Directive = PPC::DIR_64;
+ assert(Directive <= PPC::DIR_64 && "Directive out of range.");
+ O << "\t.machine " << CPUDirectives[Directive] << '\n';
+
+ bool Result = AsmPrinter::doInitialization(M);
+
+ // Emit initial debug information.
+ // We need this for Personality functions.
+ // AsmPrinter::doInitialization should have done this analysis.
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ assert(MMI);
+ DW = getAnalysisIfAvailable<DwarfWriter>();
+ assert(DW && "DwarfWriter is not available");
+ DW->BeginModule(&M, MMI, O, this, TAI);
+
+ // Darwin wants symbols to be quoted if they have complex names.
+ Mang->setUseQuotes(true);
+
+ // Prime text sections so they are adjacent. This reduces the likelihood
+ // that a large data or debug section causes a branch to exceed the 16M
+ // limit.
+ SwitchToTextSection("\t.section __TEXT,__textcoal_nt,coalesced,"
+ "pure_instructions");
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ SwitchToTextSection("\t.section __TEXT,__picsymbolstub1,symbol_stubs,"
+ "pure_instructions,32");
+ } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
+ SwitchToTextSection("\t.section __TEXT,__symbol_stub1,symbol_stubs,"
+ "pure_instructions,16");
+ }
+ SwitchToSection(TAI->getTextSection());
+
+ return Result;
+}
+
+void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+ return; // External globals require no code.
+
+ // Check to see if this is a special global used by LLVM; if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar)) {
+ if (TM.getRelocationModel() == Reloc::Static) {
+ if (GVar->getName() == "llvm.global_ctors")
+ O << ".reference .constructors_used\n";
+ else if (GVar->getName() == "llvm.global_dtors")
+ O << ".reference .destructors_used\n";
+ }
+ return;
+ }
+
+ std::string name = Mang->getValueName(GVar);
+
+ printVisibility(name, GVar->getVisibility());
+
+ Constant *C = GVar->getInitializer();
+ const Type *Type = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Type);
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ if (C->isNullValue() && /* FIXME: Verify correct */
+ !GVar->hasSection() &&
+ (GVar->hasLocalLinkage() || GVar->hasExternalLinkage() ||
+ GVar->isWeakForLinker()) &&
+ TAI->SectionKindForGlobal(GVar) != SectionKind::RODataMergeStr) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (GVar->hasExternalLinkage()) {
+ O << "\t.globl " << name << '\n';
+ O << "\t.zerofill __DATA, __common, " << name << ", "
+ << Size << ", " << Align;
+ } else if (GVar->hasLocalLinkage()) {
+ O << TAI->getLCOMMDirective() << name << ',' << Size << ',' << Align;
+ } else if (!GVar->hasCommonLinkage()) {
+ O << "\t.globl " << name << '\n'
+ << TAI->getWeakDefDirective() << name << '\n';
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm) {
+ O << "\t\t\t\t" << TAI->getCommentString() << " ";
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << '\n';
+ EmitGlobalConstant(C);
+ return;
+ } else {
+ O << ".comm " << name << ',' << Size;
+ // Darwin 9 and above support aligned common data.
+ if (Subtarget.isDarwin9())
+ O << ',' << Align;
+ }
+ if (VerboseAsm) {
+ O << "\t\t" << TAI->getCommentString() << " '";
+ PrintUnmangledNameSafely(GVar, O);
+ O << "'";
+ }
+ O << '\n';
+ return;
+ }
+
+ switch (GVar->getLinkage()) {
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::CommonLinkage:
+ O << "\t.globl " << name << '\n'
+ << "\t.weak_definition " << name << '\n';
+ break;
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section named after
+ // them or something similar. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << "\t.globl " << name << '\n';
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ break;
+ default:
+ cerr << "Unknown linkage type!";
+ abort();
+ }
+
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm) {
+ O << "\t\t\t\t" << TAI->getCommentString() << " '";
+ PrintUnmangledNameSafely(GVar, O);
+ O << "'";
+ }
+ O << '\n';
+
+ // If the initializer is an extern weak symbol, remember to emit the weak
+ // reference!
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ EmitGlobalConstant(C);
+ O << '\n';
+}
+
+bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
+ const TargetData *TD = TM.getTargetData();
+
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ printModuleLevelGV(I);
+
+ bool isPPC64 = TD->getPointerSizeInBits() == 64;
+
+ // Output stubs for dynamically-linked functions
+ if (TM.getRelocationModel() == Reloc::PIC_) {
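+ // Each PIC stub saves LR in r0, uses "bcl 20,31" (the always-taken form
+ // that does not disturb the link-register prediction stack) to get its
+ // own address into LR, reads it with mflr r11, then loads the lazy
+ // pointer at a PC-relative offset and jumps through CTR.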
+ for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i) {
+ SwitchToTextSection("\t.section __TEXT,__picsymbolstub1,symbol_stubs,"
+ "pure_instructions,32");
+ EmitAlignment(4);
+ const char *p = i->getKeyData();
+ bool hasQuote = p[0]=='\"';
+ printSuffixedName(p, "$stub");
+ O << ":\n";
+ O << "\t.indirect_symbol " << p << '\n';
+ O << "\tmflr r0\n";
+ O << "\tbcl 20,31,";
+ if (hasQuote)
+ O << "\"L0$" << &p[1];
+ else
+ O << "L0$" << p;
+ O << '\n';
+ if (hasQuote)
+ O << "\"L0$" << &p[1];
+ else
+ O << "L0$" << p;
+ O << ":\n";
+ O << "\tmflr r11\n";
+ O << "\taddis r11,r11,ha16(";
+ printSuffixedName(p, "$lazy_ptr");
+ O << "-";
+ if (hasQuote)
+ O << "\"L0$" << &p[1];
+ else
+ O << "L0$" << p;
+ O << ")\n";
+ O << "\tmtlr r0\n";
+ if (isPPC64)
+ O << "\tldu r12,lo16(";
+ else
+ O << "\tlwzu r12,lo16(";
+ printSuffixedName(p, "$lazy_ptr");
+ O << "-";
+ if (hasQuote)
+ O << "\"L0$" << &p[1];
+ else
+ O << "L0$" << p;
+ O << ")(r11)\n";
+ O << "\tmtctr r12\n";
+ O << "\tbctr\n";
+ SwitchToDataSection(".lazy_symbol_pointer");
+ printSuffixedName(p, "$lazy_ptr");
+ O << ":\n";
+ O << "\t.indirect_symbol " << p << '\n';
+ if (isPPC64)
+ O << "\t.quad dyld_stub_binding_helper\n";
+ else
+ O << "\t.long dyld_stub_binding_helper\n";
+ }
+ } else {
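+ // Without PIC, each stub can address its lazy pointer absolutely via
+ // lis/ha16 + lo16, so no PC-materialization sequence is needed.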
+ for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i) {
+ SwitchToTextSection("\t.section __TEXT,__symbol_stub1,symbol_stubs,"
+ "pure_instructions,16");
+ EmitAlignment(4);
+ const char *p = i->getKeyData();
+ printSuffixedName(p, "$stub");
+ O << ":\n";
+ O << "\t.indirect_symbol " << p << '\n';
+ O << "\tlis r11,ha16(";
+ printSuffixedName(p, "$lazy_ptr");
+ O << ")\n";
+ if (isPPC64)
+ O << "\tldu r12,lo16(";
+ else
+ O << "\tlwzu r12,lo16(";
+ printSuffixedName(p, "$lazy_ptr");
+ O << ")(r11)\n";
+ O << "\tmtctr r12\n";
+ O << "\tbctr\n";
+ SwitchToDataSection(".lazy_symbol_pointer");
+ printSuffixedName(p, "$lazy_ptr");
+ O << ":\n";
+ O << "\t.indirect_symbol " << p << '\n';
+ if (isPPC64)
+ O << "\t.quad dyld_stub_binding_helper\n";
+ else
+ O << "\t.long dyld_stub_binding_helper\n";
+ }
+ }
+
+ O << '\n';
+
+ if (TAI->doesSupportExceptionHandling() && MMI) {
+ // Add the (possibly multiple) personalities to the set of global values.
+ // Only referenced functions get into the Personalities list.
+ const std::vector<Function *>& Personalities = MMI->getPersonalities();
+
+ for (std::vector<Function *>::const_iterator I = Personalities.begin(),
+ E = Personalities.end(); I != E; ++I)
+ if (*I) GVStubs.insert("_" + (*I)->getName());
+ }
+
+ // Output stubs for external and common global variables.
+ if (!GVStubs.empty()) {
+ SwitchToDataSection(".non_lazy_symbol_pointer");
+ for (StringSet<>::iterator i = GVStubs.begin(), e = GVStubs.end();
+ i != e; ++i) {
+ std::string p = i->getKeyData();
+ printSuffixedName(p, "$non_lazy_ptr");
+ O << ":\n";
+ O << "\t.indirect_symbol " << p << '\n';
+ if (isPPC64)
+ O << "\t.quad\t0\n";
+ else
+ O << "\t.long\t0\n";
+ }
+ }
+
+ if (!HiddenGVStubs.empty()) {
+ SwitchToSection(TAI->getDataSection());
+ for (StringSet<>::iterator i = HiddenGVStubs.begin(), e = HiddenGVStubs.end();
+ i != e; ++i) {
+ std::string p = i->getKeyData();
+ EmitAlignment(isPPC64 ? 3 : 2);
+ printSuffixedName(p, "$non_lazy_ptr");
+ O << ":\n";
+ if (isPPC64)
+ O << "\t.quad\t";
+ else
+ O << "\t.long\t";
+ O << p << '\n';
+ }
+ }
+
+
+ // Emit final debug information.
+ DW->EndModule();
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never generates
+ // code that does this, it is always safe to set.
+ O << "\t.subsections_via_symbols\n";
+
+ return AsmPrinter::doFinalization(M);
+}
+
+
+
+/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
+/// for a MachineFunction to the given output stream, in a format that the
+/// Darwin assembler can deal with.
+///
+FunctionPass *llvm::createPPCAsmPrinterPass(raw_ostream &o,
+ PPCTargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>();
+
+ if (Subtarget->isDarwin()) {
+ return new PPCDarwinAsmPrinter(o, tm, tm.getTargetAsmInfo(),
+ OptLevel, verbose);
+ } else {
+ return new PPCLinuxAsmPrinter(o, tm, tm.getTargetAsmInfo(),
+ OptLevel, verbose);
+ }
+}
+
+namespace {
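+ // Static constructor that registers the asm printer factory with
+ // PPCTargetMachine when this object file is linked in.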
+ static struct Register {
+ Register() {
+ PPCTargetMachine::registerAsmPrinter(createPPCAsmPrinterPass);
+ }
+ } Registrator;
+}
+
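+// Referenced from elsewhere to force this object file (and the Registrator
+// above) to be linked into the final binary.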
+extern "C" int PowerPCAsmPrinterForceLink;
+int PowerPCAsmPrinterForceLink = 0;
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
new file mode 100644
index 0000000..0b67aff
--- /dev/null
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -0,0 +1,28 @@
+set(LLVM_TARGET_DEFINITIONS PPC.td)
+
+tablegen(PPCGenInstrNames.inc -gen-instr-enums)
+tablegen(PPCGenRegisterNames.inc -gen-register-enums)
+tablegen(PPCGenAsmWriter.inc -gen-asm-writer)
+tablegen(PPCGenCodeEmitter.inc -gen-emitter)
+tablegen(PPCGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(PPCGenRegisterInfo.inc -gen-register-desc)
+tablegen(PPCGenInstrInfo.inc -gen-instr-desc)
+tablegen(PPCGenDAGISel.inc -gen-dag-isel)
+tablegen(PPCGenCallingConv.inc -gen-callingconv)
+tablegen(PPCGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(PowerPCCodeGen
+ PPCBranchSelector.cpp
+ PPCCodeEmitter.cpp
+ PPCHazardRecognizers.cpp
+ PPCInstrInfo.cpp
+ PPCISelDAGToDAG.cpp
+ PPCISelLowering.cpp
+ PPCJITInfo.cpp
+ PPCMachOWriterInfo.cpp
+ PPCPredicates.cpp
+ PPCRegisterInfo.cpp
+ PPCSubtarget.cpp
+ PPCTargetAsmInfo.cpp
+ PPCTargetMachine.cpp
+ )
diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
new file mode 100644
index 0000000..db68897
--- /dev/null
+++ b/lib/Target/PowerPC/Makefile
@@ -0,0 +1,22 @@
+##===- lib/Target/PowerPC/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMPowerPCCodeGen
+TARGET = PPC
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = PPCGenInstrNames.inc PPCGenRegisterNames.inc \
+ PPCGenAsmWriter.inc PPCGenCodeEmitter.inc \
+ PPCGenRegisterInfo.h.inc PPCGenRegisterInfo.inc \
+ PPCGenInstrInfo.inc PPCGenDAGISel.inc \
+ PPCGenSubtarget.inc PPCGenCallingConv.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
new file mode 100644
index 0000000..c844e21
--- /dev/null
+++ b/lib/Target/PowerPC/PPC.h
@@ -0,0 +1,49 @@
+//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// PowerPC back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_H
+#define LLVM_TARGET_POWERPC_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class PPCTargetMachine;
+ class FunctionPass;
+ class MachineCodeEmitter;
+ class raw_ostream;
+
+FunctionPass *createPPCBranchSelectionPass();
+FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+FunctionPass *createPPCAsmPrinterPass(raw_ostream &OS,
+ PPCTargetMachine &TM,
+ CodeGenOpt::Level OptLevel, bool Verbose);
+FunctionPass *createPPCCodeEmitterPass(PPCTargetMachine &TM,
+ MachineCodeEmitter &MCE);
+FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+ JITCodeEmitter &MCE);
+} // end namespace llvm
+
+// Defines symbolic names for PowerPC registers. This defines a mapping from
+// register name to register number.
+//
+#include "PPCGenRegisterNames.inc"
+
+// Defines symbolic names for the PowerPC instructions.
+//
+#include "PPCGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
new file mode 100644
index 0000000..08f5bb4
--- /dev/null
+++ b/lib/Target/PowerPC/PPC.td
@@ -0,0 +1,114 @@
+//===- PPC.td - Describe the PowerPC Target Machine --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC Subtarget features.
+//
+
+//===----------------------------------------------------------------------===//
+// CPU Directives //
+//===----------------------------------------------------------------------===//
+
+def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">;
+def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">;
+def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">;
+def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">;
+def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
+def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
+def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
+
+def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
+ "Enable 64-bit instructions">;
+def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
+ "Enable 64-bit registers usage for ppc32 [beta]">;
+def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
+ "Enable Altivec instructions">;
+def FeatureGPUL : SubtargetFeature<"gpul","IsGigaProcessor", "true",
+ "Enable GPUL instructions">;
+def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
+ "Enable the fsqrt instruction">;
+def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
+ "Enable the stfiwx instruction">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "PPCRegisterInfo.td"
+include "PPCSchedule.td"
+include "PPCInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC processors supported.
+//
+
+def : Processor<"generic", G3Itineraries, [Directive32]>;
+def : Processor<"601", G3Itineraries, [Directive601]>;
+def : Processor<"602", G3Itineraries, [Directive602]>;
+def : Processor<"603", G3Itineraries, [Directive603]>;
+def : Processor<"603e", G3Itineraries, [Directive603]>;
+def : Processor<"603ev", G3Itineraries, [Directive603]>;
+def : Processor<"604", G3Itineraries, [Directive604]>;
+def : Processor<"604e", G3Itineraries, [Directive604]>;
+def : Processor<"620", G3Itineraries, [Directive620]>;
+def : Processor<"g3", G3Itineraries, [Directive7400]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive750, FeatureAltivec]>;
+def : Processor<"750", G4Itineraries, [Directive750, FeatureAltivec]>;
+def : Processor<"970", G5Itineraries,
+ [Directive970, FeatureAltivec,
+ FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+ Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"g5", G5Itineraries,
+ [Directive970, FeatureAltivec,
+ FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+ Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"ppc", G3Itineraries, [Directive32]>;
+def : Processor<"ppc64", G5Itineraries,
+ [Directive64, FeatureAltivec,
+ FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+ Feature64Bit /*, Feature64BitRegs */]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "PPCCallingConv.td"
+
+def PPCInstrInfo : InstrInfo {
+ // Define how we want to lay out our target-specific information field...
+ // This should be kept up-to-date with the fields in the PPCInstrInfo.h file.
+ let TSFlagsFields = ["PPC970_First",
+ "PPC970_Single",
+ "PPC970_Cracked",
+ "PPC970_Unit"];
+ let TSFlagsShifts = [0, 1, 2, 3];
+
+ let isLittleEndianEncoding = 1;
+}
+
+
+def PPC : Target {
+ // Information about the instructions.
+ let InstructionSet = PPCInstrInfo;
+}
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
new file mode 100644
index 0000000..b95a502
--- /dev/null
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -0,0 +1,174 @@
+//===-- PPCBranchSelector.cpp - Emit long conditional branches-----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 16 bits of displacement to reach their
+// target basic block. It does this in two passes: one that computes basic
+// block positions, and one that converts branch pseudo-ops into machine
+// branch opcodes. This pass should be run last, just before the assembly
+// printer.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-branch-select"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCPredicates.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace {
+ struct VISIBILITY_HIDDEN PPCBSel : public MachineFunctionPass {
+ static char ID;
+ PPCBSel() : MachineFunctionPass(&ID) {}
+
+ /// BlockSizes - The sizes of the basic blocks in the function.
+ std::vector<unsigned> BlockSizes;
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "PowerPC Branch Selector";
+ }
+ };
+ char PPCBSel::ID = 0;
+}
+
+/// createPPCBranchSelectionPass - returns an instance of the Branch Selection
+/// Pass
+///
+FunctionPass *llvm::createPPCBranchSelectionPass() {
+ return new PPCBSel();
+}
+
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+ // Give the blocks of the function a dense, in-order, numbering.
+ Fn.RenumberBlocks();
+ BlockSizes.resize(Fn.getNumBlockIDs());
+
+ // Measure each MBB and compute a size for the entire function.
+ unsigned FuncSize = 0;
+ for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock *MBB = MFI;
+
+ unsigned BlockSize = 0;
+ for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+ MBBI != EE; ++MBBI)
+ BlockSize += TII->GetInstSizeInBytes(MBBI);
+
+ BlockSizes[MBB->getNumber()] = BlockSize;
+ FuncSize += BlockSize;
+ }
+
+ // If the entire function is smaller than the displacement of a branch
+ // field, we know no branches in this function need to be expanded. This
+ // is a common case.
+ if (FuncSize < (1 << 15)) {
+ BlockSizes.clear();
+ return false;
+ }
+
+ // For each conditional branch, if the offset to its destination is larger
+ // than the offset field allows, transform it into a long branch sequence
+ // like this:
+ // short branch:
+ // bCC MBB
+ // long branch:
+ // b!CC $PC+8
+ // b MBB
+ //
+ bool MadeChange = true;
+ bool EverMadeChange = false;
+ while (MadeChange) {
+ // Iteratively expand branches until we reach a fixed point.
+ MadeChange = false;
+
+ for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ unsigned MBBStartOffset = 0;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) {
+ MBBStartOffset += TII->GetInstSizeInBytes(I);
+ continue;
+ }
+
+ // Determine the offset from the current branch to the destination
+ // block.
+ MachineBasicBlock *Dest = I->getOperand(2).getMBB();
+
+ int BranchSize;
+ if (Dest->getNumber() <= MBB.getNumber()) {
+ // If this is a backwards branch, the delta is the offset from the
+ // start of this block to this branch, plus the sizes of all blocks
+ // from this block to the dest.
+ BranchSize = MBBStartOffset;
+
+ for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+ BranchSize += BlockSizes[i];
+ } else {
+ // Otherwise, add the size of the blocks between this block and the
+ // dest to the number of bytes left in this block.
+ BranchSize = -MBBStartOffset;
+
+ for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+ BranchSize += BlockSizes[i];
+ }
+
+ // If this branch is in range, ignore it.
+ if (isInt16(BranchSize)) {
+ MBBStartOffset += 4;
+ continue;
+ }
+
+ // Otherwise, we have to expand it to a long branch.
+ // The BCC operands are:
+ // 0. PPC branch predicate
+ // 1. CR register
+ // 2. Target MBB
+ PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+ unsigned CRReg = I->getOperand(1).getReg();
+
+ MachineInstr *OldBranch = I;
+ DebugLoc dl = OldBranch->getDebugLoc();
+
+ // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+ BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+ .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+
+ // Uncond branch to the real destination.
+ I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
+
+ // Remove the old branch from the function.
+ OldBranch->eraseFromParent();
+
+ // The long branch is 8 bytes where the old branch was 4, so grow the
+ // block size by 4 and remember to iterate.
+ BlockSizes[MBB.getNumber()] += 4;
+ MBBStartOffset += 8;
+ ++NumExpanded;
+ MadeChange = true;
+ }
+ }
+ EverMadeChange |= MadeChange;
+ }
+
+ BlockSizes.clear();
+ return true;
+}
+
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
new file mode 100644
index 0000000..9f916f3
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -0,0 +1,66 @@
+//===- PPCCallingConv.td - Calling Conventions for PowerPC ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the PowerPC 32- and 64-bit
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Return-value convention for PowerPC
+def RetCC_PPC : CallingConv<[
+ CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+ CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
+
+ CCIfType<[f32], CCAssignToReg<[F1]>>,
+ CCIfType<[f64], CCAssignToReg<[F1, F2]>>,
+
+ // Vector types are always returned in V2.
+ CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+/*
+def CC_PPC : CallingConv<[
+ // The first 8 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+ CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+
+ // Common sub-targets pass FP values in F1 - F13.
+ CCIfType<[f32, f64], CCIfSubtarget<"isMachoABI()",
+ CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8,F9,F10,F11,F12,F13]>>>,
+ // The ELF32 sub-target passes FP values in F1 - F8.
+ CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+ // The first 12 Vector arguments are passed in altivec registers.
+ CCIfType<[v16i8, v8i16, v4i32, v4f32],
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10,V11,V12,V13]>>
+
+/*
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToStack<16, 16>>*/
+]>;
+
+*/
+
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
new file mode 100644
index 0000000..aa3dce1
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -0,0 +1,266 @@
+//===-- PPCCodeEmitter.cpp - JIT Code Emitter for PowerPC32 -------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit CodeEmitter and associated machinery to
+// JIT-compile bitcode to native PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+#include "PPCRelocations.h"
+#include "PPC.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+ class PPCCodeEmitter {
+ TargetMachine &TM;
+ MachineCodeEmitter &MCE;
+ public:
+ PPCCodeEmitter(TargetMachine &tm, MachineCodeEmitter &mce):
+ TM(tm), MCE(mce) {}
+
+ /// getBinaryCodeForInstr - This function, generated by the
+ /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+ /// machine instructions.
+
+ unsigned getBinaryCodeForInstr(const MachineInstr &MI);
+
+ /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
+
+ unsigned getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO);
+
+ /// MovePCtoLROffset - When/if we see a MovePCtoLR instruction, we record
+ /// its address within the function in this pointer.
+
+ void *MovePCtoLROffset;
+ };
+
+ template <class CodeEmitter>
+ class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass,
+ public PPCCodeEmitter
+ {
+ TargetMachine &TM;
+ CodeEmitter &MCE;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ public:
+ static char ID;
+ Emitter(TargetMachine &tm, CodeEmitter &mce)
+ : MachineFunctionPass(&ID), PPCCodeEmitter(tm, mce), TM(tm), MCE(mce) {}
+
+ const char *getPassName() const { return "PowerPC Machine Code Emitter"; }
+
+ /// runOnMachineFunction - emits the given MachineFunction to memory
+ ///
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ /// emitBasicBlock - emits the given MachineBasicBlock to memory
+ ///
+ void emitBasicBlock(MachineBasicBlock &MBB);
+
+ /// getValueBit - return the particular bit of Val
+ ///
+ unsigned getValueBit(int64_t Val, unsigned bit) { return (Val >> bit) & 1; }
+ };
+
+ template <class CodeEmitter>
+ char Emitter<CodeEmitter>::ID = 0;
+}
+
+/// createPPCCodeEmitterPass - Return a pass that emits the collected PPC code
+/// to the specified MCE object.
+FunctionPass *llvm::createPPCCodeEmitterPass(PPCTargetMachine &TM,
+ MachineCodeEmitter &MCE) {
+ return new Emitter<MachineCodeEmitter>(TM, MCE);
+}
+
+FunctionPass *llvm::createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+template <class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+ // FIXME: As written this condition is a tautology, so the assert can never
+ // fire; the message suggests it was meant to require a static or default
+ // relocation model.
+ assert((MF.getTarget().getRelocationModel() != Reloc::Default ||
+ MF.getTarget().getRelocationModel() != Reloc::Static) &&
+ "JIT relocation model must be set to static or default!");
+
+ MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>());
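+ // finishFunction returns true when the emitter runs out of buffer space,
+ // in which case the whole function is re-emitted into a larger buffer.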
+ do {
+ MovePCtoLROffset = 0;
+ MCE.startFunction(MF);
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+ emitBasicBlock(*BB);
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+template <class CodeEmitter>
+void Emitter<CodeEmitter>::emitBasicBlock(MachineBasicBlock &MBB) {
+ MCE.StartMachineBasicBlock(&MBB);
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){
+ const MachineInstr &MI = *I;
+ switch (MI.getOpcode()) {
+ default:
+ MCE.emitWordBE(getBinaryCodeForInstr(MI));
+ break;
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ MCE.emitLabel(MI.getOperand(0).getImm());
+ break;
+ case TargetInstrInfo::IMPLICIT_DEF:
+ break; // pseudo opcode, no side effects
+ case PPC::MovePCtoLR:
+ case PPC::MovePCtoLR8:
+ assert(TM.getRelocationModel() == Reloc::PIC_);
+ MovePCtoLROffset = (void*)MCE.getCurrentPCValue();
+ MCE.emitWordBE(0x48000005); // bl $+4; leaves the next PC in LR.
+ break;
+ }
+ }
+}
+
+unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO) {
+
+ unsigned rv = 0; // Return value; defaults to 0 for unhandled cases
+ // or things that get fixed up later by the JIT.
+ if (MO.isReg()) {
+ rv = PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+
+ // Special encoding for MTCRF and MFOCRF, which uses a bit mask for the
+ // register, not the register number directly.
+ if ((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
+ (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7)) {
+ rv = 0x80 >> rv;
+ }
+ } else if (MO.isImm()) {
+ rv = MO.getImm();
+ } else if (MO.isGlobal() || MO.isSymbol() ||
+ MO.isCPI() || MO.isJTI()) {
+ unsigned Reloc = 0;
+ if (MI.getOpcode() == PPC::BL_Macho || MI.getOpcode() == PPC::BL8_Macho ||
+ MI.getOpcode() == PPC::BL_ELF || MI.getOpcode() == PPC::BL8_ELF ||
+ MI.getOpcode() == PPC::TAILB || MI.getOpcode() == PPC::TAILB8)
+ Reloc = PPC::reloc_pcrel_bx;
+ else {
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
+ }
+ switch (MI.getOpcode()) {
+ default: MI.dump(); assert(0 && "Unknown instruction for relocation!");
+ case PPC::LIS:
+ case PPC::LIS8:
+ case PPC::ADDIS:
+ case PPC::ADDIS8:
+ Reloc = PPC::reloc_absolute_high; // Pointer to symbol
+ break;
+ case PPC::LI:
+ case PPC::LI8:
+ case PPC::LA:
+ // Loads.
+ case PPC::LBZ:
+ case PPC::LBZ8:
+ case PPC::LHA:
+ case PPC::LHA8:
+ case PPC::LHZ:
+ case PPC::LHZ8:
+ case PPC::LWZ:
+ case PPC::LWZ8:
+ case PPC::LFS:
+ case PPC::LFD:
+
+ // Stores.
+ case PPC::STB:
+ case PPC::STB8:
+ case PPC::STH:
+ case PPC::STH8:
+ case PPC::STW:
+ case PPC::STW8:
+ case PPC::STFS:
+ case PPC::STFD:
+ Reloc = PPC::reloc_absolute_low;
+ break;
+
+ case PPC::LWA:
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::STD_32:
+ Reloc = PPC::reloc_absolute_low_ix;
+ break;
+ }
+ }
+
+ MachineRelocation R;
+ if (MO.isGlobal()) {
+ R = MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ MO.getGlobal(), 0,
+ isa<Function>(MO.getGlobal()));
+ } else if (MO.isSymbol()) {
+ R = MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, MO.getSymbolName(), 0);
+ } else if (MO.isCPI()) {
+ R = MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, MO.getIndex(), 0);
+ } else {
+ assert(MO.isJTI());
+ R = MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, MO.getIndex(), 0);
+ }
+
+ // If in PIC mode, we need to encode the negated address of the
+ // 'movepctolr' into the unrelocated field. After relocation, we'll have
+ // &gv-&movepctolr-4 in the imm field. Once &movepctolr is added to the imm
+ // field, we get &gv. This doesn't happen for branch relocations, which are
+ // always implicitly pc relative.
+ if (TM.getRelocationModel() == Reloc::PIC_ && Reloc != PPC::reloc_pcrel_bx){
+ assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
+ R.setConstantVal(-(intptr_t)MovePCtoLROffset - 4);
+ }
+ MCE.addRelocation(R);
+
+ } else if (MO.isMBB()) {
+ unsigned Reloc = 0;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == PPC::B || Opcode == PPC::BL_Macho ||
+ Opcode == PPC::BLA_Macho || Opcode == PPC::BL_ELF ||
+ Opcode == PPC::BLA_ELF)
+ Reloc = PPC::reloc_pcrel_bx;
+ else // BCC instruction
+ Reloc = PPC::reloc_pcrel_bcx;
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ Reloc, MO.getMBB()));
+ } else {
+ cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+ abort();
+ }
+
+ return rv;
+}
+
+#include "PPCGenCodeEmitter.inc"
+
diff --git a/lib/Target/PowerPC/PPCFrameInfo.h b/lib/Target/PowerPC/PPCFrameInfo.h
new file mode 100644
index 0000000..1b5893d
--- /dev/null
+++ b/lib/Target/PowerPC/PPCFrameInfo.h
@@ -0,0 +1,93 @@
+//===-- PPCFrameInfo.h - Define TargetFrameInfo for PowerPC -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_FRAMEINFO_H
+#define POWERPC_FRAMEINFO_H
+
+#include "PPC.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class PPCFrameInfo: public TargetFrameInfo {
+ const TargetMachine &TM;
+
+public:
+ PPCFrameInfo(const TargetMachine &tm, bool LP64)
+ : TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0), TM(tm) {
+ }
+
+ /// getReturnSaveOffset - Return the offset, relative to the previous
+ /// frame, at which the return address is saved.
+ static unsigned getReturnSaveOffset(bool LP64, bool isMacho) {
+ if (isMacho)
+ return LP64 ? 16 : 8;
+ // For ELF 32 ABI:
+ return 4;
+ }
+
+  /// getFramePointerSaveOffset - Return the offset within the previous frame
+  /// at which the frame pointer is saved.
+ static unsigned getFramePointerSaveOffset(bool LP64, bool isMacho) {
+ // For MachO ABI:
+    // Use the TOC save slot in the PowerPC linkage area for saving the frame
+    // pointer (if needed).  LLVM does not generate code that uses the TOC (R2
+    // is treated as a caller-saved register).
+ if (isMacho)
+ return LP64 ? 40 : 20;
+
+ // For ELF 32 ABI:
+ // Save it right before the link register
+ return -4U;
+ }
+
+ /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
+ ///
+ static unsigned getLinkageSize(bool LP64, bool isMacho) {
+ if (isMacho)
+ return 6 * (LP64 ? 8 : 4);
+
+ // For ELF 32 ABI:
+ return 8;
+ }
+
+  /// getMinCallArgumentsSize - Return the size of the minimum PowerPC ABI
+ /// argument area.
+ static unsigned getMinCallArgumentsSize(bool LP64, bool isMacho) {
+    // For MachO ABI:
+    // The prolog code of the callee may store up to 8 GPR argument registers
+    // to the stack, allowing va_start to index over them in memory if the
+    // callee is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ if (isMacho)
+ return 8 * (LP64 ? 8 : 4);
+
+ // For ELF 32 ABI:
+ // There is no default stack allocated for the 8 first GPR arguments.
+ return 0;
+ }
+
+ /// getMinCallFrameSize - Return the minimum size a call frame can be using
+ /// the PowerPC ABI.
+ static unsigned getMinCallFrameSize(bool LP64, bool isMacho) {
+ // The call frame needs to be at least big enough for linkage and 8 args.
+ return getLinkageSize(LP64, isMacho) +
+ getMinCallArgumentsSize(LP64, isMacho);
+ }
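+
+  // Worked examples of the formulas above (illustrative): for 32-bit MachO,
+  // getMinCallFrameSize(false, true) == 6*4 + 8*4 == 56 bytes; for 64-bit
+  // MachO it is 6*8 + 8*8 == 112 bytes; for 32-bit ELF it is just the 8-byte
+  // linkage area, since no GPR argument home space is reserved.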
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
new file mode 100644
index 0000000..e7658fc
--- /dev/null
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -0,0 +1,304 @@
+//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "PPCHazardRecognizers.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// PowerPC 970 Hazard Recognizer
+//
+// This models the dispatch group formation of the PPC970 processor. Dispatch
+// groups are bundles of up to five instructions that can contain various mixes
+// of instructions.  The PPC970 can dispatch a peak of four non-branch
+// instructions and one branch instruction per cycle.
+//
+// There are a number of restrictions to dispatch group formation: some
+// instructions can only be issued in the first slot of a dispatch group, and
+// some instructions fill an entire dispatch group.  Additionally, only
+// branches can issue in the fifth (last) slot.
+//
+// Finally, there are a number of "structural" hazards on the PPC970. These
+// conditions cause large performance penalties due to the misprediction
+// recovery and replay logic they trigger.  These cases include setting a CTR and
+// branching through it in the same dispatch group, and storing to an address,
+// then loading from the same address within a dispatch group. To avoid these
+// conditions, we insert no-op instructions when appropriate.
+//
+// FIXME: This is missing some significant cases:
+// 1. Modeling of microcoded instructions.
+// 2. Handling of serialized operations.
+// 3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
+//
+
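+// As a sketch of how a list scheduler drives this recognizer (the real
+// driver lives in the generic scheduler, so the names here are illustrative):
+//
+//   switch (HazardRec->getHazardType(SU)) {
+//   case ScheduleHazardRecognizer::NoHazard:
+//     HazardRec->EmitInstruction(SU);   // issue into the current group
+//     break;
+//   case ScheduleHazardRecognizer::NoopHazard:
+//     EmitNoop();                       // pad the group to break the hazard
+//     HazardRec->AdvanceCycle();
+//     break;
+//   default:                            // Hazard: close out this group
+//     HazardRec->AdvanceCycle();
+//     break;
+//   }
+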
+PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii)
+ : TII(tii) {
+ EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::EndDispatchGroup() {
+ DOUT << "=== Start of dispatch group\n";
+ NumIssued = 0;
+
+ // Structural hazard info.
+ HasCTRSet = false;
+ NumStores = 0;
+}
+
+
+PPCII::PPC970_Unit
+PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
+ bool &isFirst, bool &isSingle,
+ bool &isCracked,
+ bool &isLoad, bool &isStore) {
+ if ((int)Opcode >= 0) {
+ isFirst = isSingle = isCracked = isLoad = isStore = false;
+ return PPCII::PPC970_Pseudo;
+ }
+ Opcode = ~Opcode;
+
+ const TargetInstrDesc &TID = TII.get(Opcode);
+
+ isLoad = TID.mayLoad();
+ isStore = TID.mayStore();
+
+ unsigned TSFlags = TID.TSFlags;
+
+ isFirst = TSFlags & PPCII::PPC970_First;
+ isSingle = TSFlags & PPCII::PPC970_Single;
+ isCracked = TSFlags & PPCII::PPC970_Cracked;
+ return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask);
+}
+
+/// isLoadOfStoredAddress - If we have a load from the previously stored pointer
+/// as indicated by StorePtr1/StorePtr2/StoreSize, return true.
+bool PPCHazardRecognizer970::
+isLoadOfStoredAddress(unsigned LoadSize, SDValue Ptr1, SDValue Ptr2) const {
+ for (unsigned i = 0, e = NumStores; i != e; ++i) {
+ // Handle exact and commuted addresses.
+ if (Ptr1 == StorePtr1[i] && Ptr2 == StorePtr2[i])
+ return true;
+ if (Ptr2 == StorePtr1[i] && Ptr1 == StorePtr2[i])
+ return true;
+
+    // Okay, we don't have an exact match.  If this is an indexed offset, see
+    // if we have overlap (which happens during fp->int conversion, for
+    // example).
+ if (StorePtr2[i] == Ptr2) {
+ if (ConstantSDNode *StoreOffset = dyn_cast<ConstantSDNode>(StorePtr1[i]))
+ if (ConstantSDNode *LoadOffset = dyn_cast<ConstantSDNode>(Ptr1)) {
+ // Okay the base pointers match, so we have [c1+r] vs [c2+r]. Check
+ // to see if the load and store actually overlap.
+ int StoreOffs = StoreOffset->getZExtValue();
+ int LoadOffs = LoadOffset->getZExtValue();
+ if (StoreOffs < LoadOffs) {
+ if (int(StoreOffs+StoreSize[i]) > LoadOffs) return true;
+ } else {
+ if (int(LoadOffs+LoadSize) > StoreOffs) return true;
+ }
+ }
+ }
+ }
+ return false;
+}
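+
+// For example (illustrative): after a 4-byte store recorded with
+// StorePtr1 = 8, StorePtr2 = rY, a load of 4 bytes at [rY + 10] has
+// StoreOffs = 8 < LoadOffs = 10 and 8 + 4 > 10, so the accesses overlap
+// and isLoadOfStoredAddress returns true.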
+
+/// getHazardType - We return Hazard for any non-branch instruction that would
+/// terminate the dispatch group.  We return NoopHazard for any instruction
+/// that wouldn't terminate the dispatch group but would cause a pipeline
+/// flush.
+ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970::
+getHazardType(SUnit *SU) {
+ const SDNode *Node = SU->getNode()->getFlaggedMachineNode();
+ bool isFirst, isSingle, isCracked, isLoad, isStore;
+ PPCII::PPC970_Unit InstrType =
+ GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked,
+ isLoad, isStore);
+ if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;
+ unsigned Opcode = Node->getMachineOpcode();
+
+ // We can only issue a PPC970_First/PPC970_Single instruction (such as
+ // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+ if (NumIssued != 0 && (isFirst || isSingle))
+ return Hazard;
+
+ // If this instruction is cracked into two ops by the decoder, we know that
+ // it is not a branch and that it cannot issue if 3 other instructions are
+ // already in the dispatch group.
+ if (isCracked && NumIssued > 2)
+ return Hazard;
+
+ switch (InstrType) {
+ default: assert(0 && "Unknown instruction type!");
+ case PPCII::PPC970_FXU:
+ case PPCII::PPC970_LSU:
+ case PPCII::PPC970_FPU:
+ case PPCII::PPC970_VALU:
+ case PPCII::PPC970_VPERM:
+    // Only a branch can occupy the fifth (last) slot, so any other unit must
+    // wait for the next dispatch group once four instructions have issued.
+ if (NumIssued == 4) return Hazard;
+ break;
+ case PPCII::PPC970_CRU:
+ // We can only issue a CR instruction in the first two slots.
+ if (NumIssued >= 2) return Hazard;
+ break;
+ case PPCII::PPC970_BRU:
+ break;
+ }
+
+ // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+ if (HasCTRSet && (Opcode == PPC::BCTRL_Macho || Opcode == PPC::BCTRL_ELF))
+ return NoopHazard;
+
+ // If this is a load following a store, make sure it's not to the same or
+ // overlapping address.
+ if (isLoad && NumStores) {
+ unsigned LoadSize;
+ switch (Opcode) {
+ default: assert(0 && "Unknown load!");
+ case PPC::LBZ: case PPC::LBZU:
+ case PPC::LBZX:
+ case PPC::LBZ8: case PPC::LBZU8:
+ case PPC::LBZX8:
+ case PPC::LVEBX:
+ LoadSize = 1;
+ break;
+ case PPC::LHA: case PPC::LHAU:
+ case PPC::LHAX:
+ case PPC::LHZ: case PPC::LHZU:
+ case PPC::LHZX:
+ case PPC::LVEHX:
+ case PPC::LHBRX:
+ case PPC::LHA8: case PPC::LHAU8:
+ case PPC::LHAX8:
+ case PPC::LHZ8: case PPC::LHZU8:
+ case PPC::LHZX8:
+ LoadSize = 2;
+ break;
+ case PPC::LFS: case PPC::LFSU:
+ case PPC::LFSX:
+ case PPC::LWZ: case PPC::LWZU:
+ case PPC::LWZX:
+ case PPC::LWA:
+ case PPC::LWAX:
+ case PPC::LVEWX:
+ case PPC::LWBRX:
+ case PPC::LWZ8:
+ case PPC::LWZX8:
+ LoadSize = 4;
+ break;
+ case PPC::LFD: case PPC::LFDU:
+ case PPC::LFDX:
+ case PPC::LD: case PPC::LDU:
+ case PPC::LDX:
+ LoadSize = 8;
+ break;
+ case PPC::LVX:
+ case PPC::LVXL:
+ LoadSize = 16;
+ break;
+ }
+
+ if (isLoadOfStoredAddress(LoadSize,
+ Node->getOperand(0), Node->getOperand(1)))
+ return NoopHazard;
+ }
+
+ return NoHazard;
+}
+
+void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
+ const SDNode *Node = SU->getNode()->getFlaggedMachineNode();
+ bool isFirst, isSingle, isCracked, isLoad, isStore;
+ PPCII::PPC970_Unit InstrType =
+ GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked,
+ isLoad, isStore);
+ if (InstrType == PPCII::PPC970_Pseudo) return;
+ unsigned Opcode = Node->getMachineOpcode();
+
+ // Update structural hazard information.
+ if (Opcode == PPC::MTCTR) HasCTRSet = true;
+
+ // Track the address stored to.
+ if (isStore) {
+ unsigned ThisStoreSize;
+ switch (Opcode) {
+ default: assert(0 && "Unknown store instruction!");
+ case PPC::STB: case PPC::STB8:
+ case PPC::STBU: case PPC::STBU8:
+ case PPC::STBX: case PPC::STBX8:
+ case PPC::STVEBX:
+ ThisStoreSize = 1;
+ break;
+ case PPC::STH: case PPC::STH8:
+ case PPC::STHU: case PPC::STHU8:
+ case PPC::STHX: case PPC::STHX8:
+ case PPC::STVEHX:
+ case PPC::STHBRX:
+ ThisStoreSize = 2;
+ break;
+ case PPC::STFS:
+ case PPC::STFSU:
+ case PPC::STFSX:
+ case PPC::STWX: case PPC::STWX8:
+ case PPC::STWUX:
+ case PPC::STW: case PPC::STW8:
+ case PPC::STWU: case PPC::STWU8:
+ case PPC::STVEWX:
+ case PPC::STFIWX:
+ case PPC::STWBRX:
+ ThisStoreSize = 4;
+ break;
+ case PPC::STD_32:
+ case PPC::STDX_32:
+ case PPC::STD:
+ case PPC::STDU:
+ case PPC::STFD:
+ case PPC::STFDX:
+ case PPC::STDX:
+ case PPC::STDUX:
+ ThisStoreSize = 8;
+ break;
+ case PPC::STVX:
+ case PPC::STVXL:
+ ThisStoreSize = 16;
+ break;
+ }
+
+ StoreSize[NumStores] = ThisStoreSize;
+ StorePtr1[NumStores] = Node->getOperand(1);
+ StorePtr2[NumStores] = Node->getOperand(2);
+ ++NumStores;
+ }
+
+ if (InstrType == PPCII::PPC970_BRU || isSingle)
+    NumIssued = 4;  // Terminate the current dispatch group.
+ ++NumIssued;
+
+ // If this instruction is cracked into two ops by the decoder, remember that
+ // we issued two pieces.
+ if (isCracked)
+ ++NumIssued;
+
+ if (NumIssued == 5)
+ EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::AdvanceCycle() {
+ assert(NumIssued < 5 && "Illegal dispatch group!");
+ ++NumIssued;
+ if (NumIssued == 5)
+ EndDispatchGroup();
+}
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
new file mode 100644
index 0000000..74bf8e5
--- /dev/null
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -0,0 +1,73 @@
+//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCHAZRECS_H
+#define PPCHAZRECS_H
+
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "PPCInstrInfo.h"
+
+namespace llvm {
+
+/// PPCHazardRecognizer970 - This class defines a finite state automaton that
+/// models the dispatch logic on the PowerPC 970 (aka G5) processor. This
+/// promotes good dispatch group formation and implements noop insertion to
+/// avoid structural hazards that cause significant performance penalties (e.g.
+/// setting the CTR register then branching through it within a dispatch group,
+/// or storing then loading from the same address within a dispatch group).
+class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
+ const TargetInstrInfo &TII;
+
+ unsigned NumIssued; // Number of insts issued, including advanced cycles.
+
+ // Various things that can cause a structural hazard.
+
+ // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+ bool HasCTRSet;
+
+  // StorePtr1/StorePtr2 - Keep track of the address of any store.  If we see
+  // a load from the same address (or one that aliases it), disallow the load.
+  // We can have up to four stores in one dispatch group, hence we track up
+  // to 4.
+  //
+  // NumStores is zero if we haven't seen a store yet.  We keep track of both
+  // operands of each store, since we support [r+r] and [r+i] addressing.
+ SDValue StorePtr1[4], StorePtr2[4];
+ unsigned StoreSize[4];
+ unsigned NumStores;
+
+public:
+ PPCHazardRecognizer970(const TargetInstrInfo &TII);
+ virtual HazardType getHazardType(SUnit *SU);
+ virtual void EmitInstruction(SUnit *SU);
+ virtual void AdvanceCycle();
+
+private:
+  /// EndDispatchGroup - Called when we finish one dispatch group and start a
+  /// new one.
+ ///
+ void EndDispatchGroup();
+
+  /// GetInstrType - Classify the specified PowerPC opcode according to its
+ /// pipeline.
+ PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+                                  bool &isFirst, bool &isSingle, bool &isCracked,
+ bool &isLoad, bool &isStore);
+
+ bool isLoadOfStoredAddress(unsigned LoadSize,
+ SDValue Ptr1, SDValue Ptr2) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 0000000..823e316
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,1170 @@
+//===-- PPCISelDAGToDAG.cpp - PPC pattern matching inst selector ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-codegen"
+#include "PPC.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCISelLowering.h"
+#include "PPCHazardRecognizers.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// PPCDAGToDAGISel - PPC specific code to select PPC machine
+ /// instructions for SelectionDAG operations.
+ ///
+ class VISIBILITY_HIDDEN PPCDAGToDAGISel : public SelectionDAGISel {
+ PPCTargetMachine &TM;
+ PPCTargetLowering &PPCLowering;
+ const PPCSubtarget &PPCSubTarget;
+ unsigned GlobalBaseReg;
+ public:
+ explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
+ : SelectionDAGISel(tm), TM(tm),
+ PPCLowering(*TM.getTargetLowering()),
+ PPCSubTarget(*TM.getSubtargetImpl()) {}
+
+ virtual bool runOnFunction(Function &Fn) {
+ // Do not codegen any 'available_externally' functions at all, they have
+ // definitions outside the translation unit.
+ if (Fn.hasAvailableExternallyLinkage())
+ return false;
+
+ // Make sure we re-emit a set of the global base reg if necessary
+ GlobalBaseReg = 0;
+ SelectionDAGISel::runOnFunction(Fn);
+
+ InsertVRSaveCode(Fn);
+ return true;
+ }
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ /// getI64Imm - Return a target constant with the specified value, of type
+ /// i64.
+ inline SDValue getI64Imm(uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i64);
+ }
+
+ /// getSmallIPtrImm - Return a target constant of pointer type.
+ inline SDValue getSmallIPtrImm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy());
+ }
+
+ /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s
+ /// with any number of 0s on either side. The 1s are allowed to wrap from
+    /// LSB to MSB, so 0x0000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.
+ /// 0x0F0F0000 is not, since all 1s are not contiguous.
+ static bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME);
+
+
+ /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
+ /// rotate and mask opcode and mask operation.
+ static bool isRotateAndMask(SDNode *N, unsigned Mask, bool IsShiftMask,
+ unsigned &SH, unsigned &MB, unsigned &ME);
+
+ /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+ /// base register. Return the virtual register that holds this value.
+ SDNode *getGlobalBaseReg();
+
+ // Select - Convert the specified operand from a target-independent to a
+ // target-specific node if it hasn't already been changed.
+ SDNode *Select(SDValue Op);
+
+ SDNode *SelectBitfieldInsert(SDNode *N);
+
+ /// SelectCC - Select a comparison of the specified values with the
+ /// specified condition code, returning the CR# of the expression.
+ SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, DebugLoc dl);
+
+ /// SelectAddrImm - Returns true if the address N can be represented by
+ /// a base register plus a signed 16-bit displacement [r+imm].
+ bool SelectAddrImm(SDValue Op, SDValue N, SDValue &Disp,
+ SDValue &Base) {
+ return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG);
+ }
+
+ /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
+ /// immediate field. Because preinc imms have already been validated, just
+ /// accept it.
+ bool SelectAddrImmOffs(SDValue Op, SDValue N, SDValue &Out) const {
+ Out = N;
+ return true;
+ }
+
+    /// SelectAddrIdx - Given the specified address, check to see if it can be
+    /// represented as an indexed [r+r] operation.  Returns false if it can
+    /// be represented by [r+imm], which is preferred.
+ bool SelectAddrIdx(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index) {
+ return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG);
+ }
+
+    /// SelectAddrIdxOnly - Given the specified address, force it to be
+ /// represented as an indexed [r+r] operation.
+ bool SelectAddrIdxOnly(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Index) {
+ return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+ }
+
+ /// SelectAddrImmShift - Returns true if the address N can be represented by
+ /// a base register plus a signed 14-bit displacement [r+imm*4]. Suitable
+ /// for use by STD and friends.
+ bool SelectAddrImmShift(SDValue Op, SDValue N, SDValue &Disp,
+ SDValue &Base) {
+ return PPCLowering.SelectAddressRegImmShift(N, Disp, Base, *CurDAG);
+ }
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1;
+ switch (ConstraintCode) {
+ default: return true;
+ case 'm': // memory
+ if (!SelectAddrIdx(Op, Op, Op0, Op1))
+ SelectAddrImm(Op, Op, Op0, Op1);
+ break;
+ case 'o': // offsetable
+ if (!SelectAddrImm(Op, Op, Op0, Op1)) {
+ Op0 = Op;
+ Op1 = getSmallIPtrImm(0);
+ }
+ break;
+ case 'v': // not offsetable
+ SelectAddrIdxOnly(Op, Op, Op0, Op1);
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ return false;
+ }
+
+ SDValue BuildSDIVSequence(SDNode *N);
+ SDValue BuildUDIVSequence(SDNode *N);
+
+ /// InstructionSelect - This callback is invoked by
+ /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+ virtual void InstructionSelect();
+
+ void InsertVRSaveCode(Function &Fn);
+
+ virtual const char *getPassName() const {
+ return "PowerPC DAG->DAG Pattern Instruction Selection";
+ }
+
+ /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+ /// this target when scheduling the DAG.
+ virtual ScheduleHazardRecognizer *CreateTargetHazardRecognizer() {
+ // Should use subtarget info to pick the right hazard recognizer. For
+ // now, always return a PPC970 recognizer.
+ const TargetInstrInfo *II = TM.getInstrInfo();
+ assert(II && "No InstrInfo?");
+ return new PPCHazardRecognizer970(*II);
+ }
+
+// Include the pieces autogenerated from the target description.
+#include "PPCGenDAGISel.inc"
+
+private:
+ SDNode *SelectSETCC(SDValue Op);
+ };
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void PPCDAGToDAGISel::InstructionSelect() {
+ DEBUG(BB->dump());
+
+ // Select target instructions for the DAG.
+ SelectRoot(*CurDAG);
+ CurDAG->RemoveDeadNodes();
+}
+
+/// InsertVRSaveCode - Once the entire function has been instruction selected,
+/// all virtual registers are created and all machine instructions are built,
+/// check to see if we need to save/restore VRSAVE. If so, do it.
+void PPCDAGToDAGISel::InsertVRSaveCode(Function &F) {
+ // Check to see if this function uses vector registers, which means we have to
+ // save and restore the VRSAVE register and update it with the regs we use.
+ //
+  // In this case, there will be virtual registers of vector type created
+ // by the scheduler. Detect them now.
+ MachineFunction &Fn = MachineFunction::get(&F);
+ bool HasVectorVReg = false;
+ for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
+ e = RegInfo->getLastVirtReg()+1; i != e; ++i)
+ if (RegInfo->getRegClass(i) == &PPC::VRRCRegClass) {
+ HasVectorVReg = true;
+ break;
+ }
+ if (!HasVectorVReg) return; // nothing to do.
+
+ // If we have a vector register, we want to emit code into the entry and exit
+ // blocks to save and restore the VRSAVE register. We do this here (instead
+ // of marking all vector instructions as clobbering VRSAVE) for two reasons:
+ //
+ // 1. This (trivially) reduces the load on the register allocator, by not
+ // having to represent the live range of the VRSAVE register.
+ // 2. This (more significantly) allows us to create a temporary virtual
+ // register to hold the saved VRSAVE value, allowing this temporary to be
+ // register allocated, instead of forcing it to be spilled to the stack.
+
+ // Create two vregs - one to hold the VRSAVE register that is live-in to the
+ // function and one for the value after having bits or'd into it.
+ unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+ unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+
+ const TargetInstrInfo &TII = *TM.getInstrInfo();
+ MachineBasicBlock &EntryBB = *Fn.begin();
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Emit the following code into the entry block:
+ // InVRSAVE = MFVRSAVE
+ // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
+ // MTVRSAVE UpdatedVRSAVE
+ MachineBasicBlock::iterator IP = EntryBB.begin(); // Insert Point
+ BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
+ BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE),
+ UpdatedVRSAVE).addReg(InVRSAVE);
+ BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
+
+ // Find all return blocks, outputting a restore in each epilog.
+ for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+ if (!BB->empty() && BB->back().getDesc().isReturn()) {
+ IP = BB->end(); --IP;
+
+ // Skip over all terminator instructions, which are part of the return
+ // sequence.
+ MachineBasicBlock::iterator I2 = IP;
+ while (I2 != BB->begin() && (--I2)->getDesc().isTerminator())
+ IP = I2;
+
+ // Emit: MTVRSAVE InVRSave
+ BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
+ }
+ }
+}
+
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
+ if (!GlobalBaseReg) {
+ const TargetInstrInfo &TII = *TM.getInstrInfo();
+ // Insert the set of GlobalBaseReg into the first MBB of the function
+ MachineBasicBlock &FirstMBB = BB->getParent()->front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+
+ if (PPCLowering.getPointerTy() == MVT::i32) {
+ GlobalBaseReg = RegInfo->createVirtualRegister(PPC::GPRCRegisterClass);
+ BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR), PPC::LR);
+ BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+ } else {
+ GlobalBaseReg = RegInfo->createVirtualRegister(PPC::G8RCRegisterClass);
+ BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8), PPC::LR8);
+ BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
+ }
+ }
+ return CurDAG->getRegister(GlobalBaseReg,
+ PPCLowering.getPointerTy()).getNode();
+}
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and sets Imm
+/// to the immediate value.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+ if (N->getOpcode() != ISD::Constant)
+ return false;
+
+ Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+ if (N->getValueType(0) == MVT::i32)
+ return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+ else
+ return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+ return isIntS16Immediate(Op.getNode(), Imm);
+}
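+
+// For example (illustrative): an i32 constant 0xFFFF8000 is the sign
+// extension of the 16-bit value -32768, so isIntS16Immediate returns true,
+// while 0x00008000 (+32768) does not fit a signed 16-bit immediate and
+// returns false.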
+
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+ if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+ Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
+/// operand. If so Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
+ if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
+ Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+// isInt32Immediate - This method tests to see if the operand is a 32-bit
+// constant.  If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+ return isInt32Immediate(N.getNode(), Imm);
+}
+
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has an immediate integer right operand.
+// If so, Imm will receive the 32-bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+ return N->getOpcode() == Opc
+ && isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+ if (isShiftedMask_32(Val)) {
+ // look for the first non-zero bit
+ MB = CountLeadingZeros_32(Val);
+ // look for the first zero bit after the run of ones
+ ME = CountLeadingZeros_32((Val - 1) ^ Val);
+ return true;
+ } else {
+ Val = ~Val; // invert mask
+ if (isShiftedMask_32(Val)) {
+ // effectively look for the first zero bit
+ ME = CountLeadingZeros_32(Val) - 1;
+ // effectively look for the first one bit after the run of zeros
+ MB = CountLeadingZeros_32((Val - 1) ^ Val) + 1;
+ return true;
+ }
+ }
+ // no run present
+ return false;
+}
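+
+// Worked example (illustrative): Val = 0x00FFFF00 is a shifted mask, so
+// MB = CountLeadingZeros_32(0x00FFFF00) = 8, and since (Val - 1) ^ Val =
+// 0x000001FF, ME = CountLeadingZeros_32(0x000001FF) = 23; the run occupies
+// big-endian bits 8 through 23.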
+
+bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
+ bool IsShiftMask, unsigned &SH,
+ unsigned &MB, unsigned &ME) {
+ // Don't even go down this path for i64, since different logic will be
+ // necessary for rldicl/rldicr/rldimi.
+ if (N->getValueType(0) != MVT::i32)
+ return false;
+
+ unsigned Shift = 32;
+  unsigned Indeterminant = ~0;  // bit mask marking indeterminate results
+ unsigned Opcode = N->getOpcode();
+ if (N->getNumOperands() != 2 ||
+ !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31))
+ return false;
+
+ if (Opcode == ISD::SHL) {
+ // apply shift left to mask if it comes first
+ if (IsShiftMask) Mask = Mask << Shift;
+    // determine which bits are made indeterminate by shift
+ Indeterminant = ~(0xFFFFFFFFu << Shift);
+ } else if (Opcode == ISD::SRL) {
+ // apply shift right to mask if it comes first
+ if (IsShiftMask) Mask = Mask >> Shift;
+    // determine which bits are made indeterminate by shift
+ Indeterminant = ~(0xFFFFFFFFu >> Shift);
+ // adjust for the left rotate
+ Shift = 32 - Shift;
+ } else if (Opcode == ISD::ROTL) {
+ Indeterminant = 0;
+ } else {
+ return false;
+ }
+
+ // if the mask doesn't intersect any Indeterminant bits
+ if (Mask && !(Mask & Indeterminant)) {
+ SH = Shift & 31;
+ // make sure the mask is still a mask (wrap arounds may not be)
+ return isRunOfOnes(Mask, MB, ME);
+ }
+ return false;
+}
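+
+// Worked example (illustrative): for N = (srl x, 16) with Mask = 0x0000FFFF
+// and IsShiftMask = false, the shift leaves bits 0xFFFF0000 indeterminate,
+// which the mask avoids; the left-rotate amount becomes SH = 32 - 16 = 16,
+// and isRunOfOnes(0x0000FFFF, MB, ME) yields MB = 16, ME = 31, i.e. a
+// single "rlwinm dst, x, 16, 16, 31".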
+
+/// SelectBitfieldInsert - turn an or of two masked values into
+/// the rotate left word immediate then mask insert (rlwimi) instruction.
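+/// For example (illustrative), (or (and x, 0xFFFFFF00), (and (srl y, 24),
+/// 0x000000FF)) keeps the top 24 bits of x and inserts the top byte of y
+/// into the low byte, collapsing to a single "rlwimi x, y, 8, 24, 31".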
+SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ DebugLoc dl = N->getDebugLoc();
+
+ APInt LKZ, LKO, RKZ, RKO;
+ CurDAG->ComputeMaskedBits(Op0, APInt::getAllOnesValue(32), LKZ, LKO);
+ CurDAG->ComputeMaskedBits(Op1, APInt::getAllOnesValue(32), RKZ, RKO);
+
+ unsigned TargetMask = LKZ.getZExtValue();
+ unsigned InsertMask = RKZ.getZExtValue();
+
+ if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
+ unsigned Op0Opc = Op0.getOpcode();
+ unsigned Op1Opc = Op1.getOpcode();
+ unsigned Value, SH = 0;
+ TargetMask = ~TargetMask;
+ InsertMask = ~InsertMask;
+
+ // If the LHS has a foldable shift and the RHS does not, then swap it to the
+ // RHS so that we can fold the shift into the insert.
+ if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+ if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+ Op0.getOperand(0).getOpcode() == ISD::SRL) {
+ if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+ Op1.getOperand(0).getOpcode() != ISD::SRL) {
+ std::swap(Op0, Op1);
+ std::swap(Op0Opc, Op1Opc);
+ std::swap(TargetMask, InsertMask);
+ }
+ }
+ } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+ if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+ Op1.getOperand(0).getOpcode() != ISD::SRL) {
+ std::swap(Op0, Op1);
+ std::swap(Op0Opc, Op1Opc);
+ std::swap(TargetMask, InsertMask);
+ }
+ }
+
+ unsigned MB, ME;
+ if (InsertMask && isRunOfOnes(InsertMask, MB, ME)) {
+ SDValue Tmp1, Tmp2, Tmp3;
+ bool DisjointMask = (TargetMask ^ InsertMask) == 0xFFFFFFFF;
+
+ if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+ isInt32Immediate(Op1.getOperand(1), Value)) {
+ Op1 = Op1.getOperand(0);
+ SH = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+ }
+ if (Op1Opc == ISD::AND) {
+ unsigned SHOpc = Op1.getOperand(0).getOpcode();
+ if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) &&
+ isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+ Op1 = Op1.getOperand(0).getOperand(0);
+ SH = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+ } else {
+ Op1 = Op1.getOperand(0);
+ }
+ }
+
+ Tmp3 = (Op0Opc == ISD::AND && DisjointMask) ? Op0.getOperand(0) : Op0;
+ SH &= 31;
+ SDValue Ops[] = { Tmp3, Op1, getI32Imm(SH), getI32Imm(MB),
+ getI32Imm(ME) };
+ return CurDAG->getTargetNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
+ }
+ }
+ return 0;
+}
+
+/// SelectCC - Select a comparison of the specified values with the specified
+/// condition code, returning the CR# of the expression.
+SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC, DebugLoc dl) {
+ // Always select the LHS.
+ unsigned Opc;
+
+ if (LHS.getValueType() == MVT::i32) {
+ unsigned Imm;
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+ if (isInt32Immediate(RHS, Imm)) {
+ // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+ if (isUInt16(Imm))
+ return SDValue(CurDAG->getTargetNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
+ // If this is a 16-bit signed immediate, fold it.
+ if (isInt16((int)Imm))
+ return SDValue(CurDAG->getTargetNode(PPC::CMPWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
+
+ // For non-equality comparisons, the default code would materialize the
+ // constant, then compare against it, like this:
+ // lis r2, 4660
+ // ori r2, r2, 22136
+ // cmpw cr0, r3, r2
+ // Since we are just comparing for equality, we can emit this instead:
+ // xoris r0,r3,0x1234
+ // cmplwi cr0,r0,0x5678
+ // beq cr0,L6
+ SDValue Xor(CurDAG->getTargetNode(PPC::XORIS, dl, MVT::i32, LHS,
+ getI32Imm(Imm >> 16)), 0);
+ return SDValue(CurDAG->getTargetNode(PPC::CMPLWI, dl, MVT::i32, Xor,
+ getI32Imm(Imm & 0xFFFF)), 0);
+ }
+ Opc = PPC::CMPLW;
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ if (isInt32Immediate(RHS, Imm) && isUInt16(Imm))
+ return SDValue(CurDAG->getTargetNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
+ Opc = PPC::CMPLW;
+ } else {
+ short SImm;
+ if (isIntS16Immediate(RHS, SImm))
+ return SDValue(CurDAG->getTargetNode(PPC::CMPWI, dl, MVT::i32, LHS,
+ getI32Imm((int)SImm & 0xFFFF)),
+ 0);
+ Opc = PPC::CMPW;
+ }
+ } else if (LHS.getValueType() == MVT::i64) {
+ uint64_t Imm;
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+ if (isInt64Immediate(RHS.getNode(), Imm)) {
+ // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+ if (isUInt16(Imm))
+ return SDValue(CurDAG->getTargetNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
+ // If this is a 16-bit signed immediate, fold it.
+ if (isInt16(Imm))
+ return SDValue(CurDAG->getTargetNode(PPC::CMPDI, dl, MVT::i64, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
+
+ // For non-equality comparisons, the default code would materialize the
+ // constant, then compare against it, like this:
+ // lis r2, 4660
+ // ori r2, r2, 22136
+ // cmpd cr0, r3, r2
+ // Since we are just comparing for equality, we can emit this instead:
+ // xoris r0,r3,0x1234
+ // cmpldi cr0,r0,0x5678
+ // beq cr0,L6
+ if (isUInt32(Imm)) {
+ SDValue Xor(CurDAG->getTargetNode(PPC::XORIS8, dl, MVT::i64, LHS,
+ getI64Imm(Imm >> 16)), 0);
+ return SDValue(CurDAG->getTargetNode(PPC::CMPLDI, dl, MVT::i64, Xor,
+ getI64Imm(Imm & 0xFFFF)), 0);
+ }
+ }
+ Opc = PPC::CMPLD;
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ if (isInt64Immediate(RHS.getNode(), Imm) && isUInt16(Imm))
+ return SDValue(CurDAG->getTargetNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+ getI64Imm(Imm & 0xFFFF)), 0);
+ Opc = PPC::CMPLD;
+ } else {
+ short SImm;
+ if (isIntS16Immediate(RHS, SImm))
+ return SDValue(CurDAG->getTargetNode(PPC::CMPDI, dl, MVT::i64, LHS,
+ getI64Imm(SImm & 0xFFFF)),
+ 0);
+ Opc = PPC::CMPD;
+ }
+ } else if (LHS.getValueType() == MVT::f32) {
+ Opc = PPC::FCMPUS;
+ } else {
+ assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
+ Opc = PPC::FCMPUD;
+ }
+ return SDValue(CurDAG->getTargetNode(Opc, dl, MVT::i32, LHS, RHS), 0);
+}
+
+static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
+ switch (CC) {
+ case ISD::SETUEQ:
+ case ISD::SETONE:
+ case ISD::SETOLE:
+ case ISD::SETOGE:
+ assert(0 && "Should be lowered by legalize!");
+ default: assert(0 && "Unknown condition!"); abort();
+ case ISD::SETOEQ:
+ case ISD::SETEQ: return PPC::PRED_EQ;
+ case ISD::SETUNE:
+ case ISD::SETNE: return PPC::PRED_NE;
+ case ISD::SETOLT:
+ case ISD::SETLT: return PPC::PRED_LT;
+ case ISD::SETULE:
+ case ISD::SETLE: return PPC::PRED_LE;
+ case ISD::SETOGT:
+ case ISD::SETGT: return PPC::PRED_GT;
+ case ISD::SETUGE:
+ case ISD::SETGE: return PPC::PRED_GE;
+ case ISD::SETO: return PPC::PRED_NU;
+ case ISD::SETUO: return PPC::PRED_UN;
+ // These two are invalid for floating point. Assume we have int.
+ case ISD::SETULT: return PPC::PRED_LT;
+ case ISD::SETUGT: return PPC::PRED_GT;
+ }
+}
+
+/// getCRIdxForSetCC - Return the index of the condition register field
+/// associated with the SetCC condition, and whether or not the field is
+/// treated as inverted. That is, lt = 0; ge = 0 inverted.
+///
+/// If this returns with Other != -1, then the returned comparison is an or of
+/// two simpler comparisons. In this case, Invert is guaranteed to be false.
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert, int &Other) {
+ Invert = false;
+ Other = -1;
+ switch (CC) {
+ default: assert(0 && "Unknown condition!"); abort();
+ case ISD::SETOLT:
+ case ISD::SETLT: return 0; // Bit #0 = SETOLT
+ case ISD::SETOGT:
+ case ISD::SETGT: return 1; // Bit #1 = SETOGT
+ case ISD::SETOEQ:
+ case ISD::SETEQ: return 2; // Bit #2 = SETOEQ
+ case ISD::SETUO: return 3; // Bit #3 = SETUO
+ case ISD::SETUGE:
+ case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE
+ case ISD::SETULE:
+ case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE
+ case ISD::SETUNE:
+ case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE
+ case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO
+ case ISD::SETUEQ:
+ case ISD::SETOGE:
+ case ISD::SETOLE:
+ case ISD::SETONE:
+ assert(0 && "Invalid branch code: should be expanded by legalize");
+ // These are invalid for floating point. Assume integer.
+ case ISD::SETULT: return 0;
+ case ISD::SETUGT: return 1;
+ }
+ return 0;
+}
+
+SDNode *PPCDAGToDAGISel::SelectSETCC(SDValue Op) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+ unsigned Imm;
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ if (isInt32Immediate(N->getOperand(1), Imm)) {
+ // We can codegen setcc op, imm very efficiently compared to a brcond.
+ // Check for those cases here.
+ // setcc op, 0
+ if (Imm == 0) {
+ SDValue Op = N->getOperand(0);
+ switch (CC) {
+ default: break;
+ case ISD::SETEQ: {
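+        // Emits (illustrative) "cntlzw Rt, Op; srwi Rt, Rt, 5": cntlzw
+        // yields 32 iff Op is zero, and 32 >> 5 == 1.  The RLWINM below
+        // (SH=27, MB=5, ME=31) is exactly that srwi.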
+ Op = SDValue(CurDAG->getTargetNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
+ SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) };
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+ }
+ case ISD::SETNE: {
+ SDValue AD =
+ SDValue(CurDAG->getTargetNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+ Op, getI32Imm(~0U)), 0);
+ return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op,
+ AD.getValue(1));
+ }
+ case ISD::SETLT: {
+ SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+ }
+ case ISD::SETGT: {
+ SDValue T =
+ SDValue(CurDAG->getTargetNode(PPC::NEG, dl, MVT::i32, Op), 0);
+ T = SDValue(CurDAG->getTargetNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
+ SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+ }
+ }
+ } else if (Imm == ~0U) { // setcc op, -1
+ SDValue Op = N->getOperand(0);
+ switch (CC) {
+ default: break;
+ case ISD::SETEQ:
+ Op = SDValue(CurDAG->getTargetNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+ Op, getI32Imm(1)), 0);
+ return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+ SDValue(CurDAG->getTargetNode(PPC::LI, dl,
+ MVT::i32,
+ getI32Imm(0)), 0),
+ Op.getValue(1));
+ case ISD::SETNE: {
+ Op = SDValue(CurDAG->getTargetNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
+ SDNode *AD = CurDAG->getTargetNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+ Op, getI32Imm(~0U));
+ return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0),
+ Op, SDValue(AD, 1));
+ }
+ case ISD::SETLT: {
+ SDValue AD = SDValue(CurDAG->getTargetNode(PPC::ADDI, dl, MVT::i32, Op,
+ getI32Imm(1)), 0);
+ SDValue AN = SDValue(CurDAG->getTargetNode(PPC::AND, dl, MVT::i32, AD,
+ Op), 0);
+ SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+ }
+ case ISD::SETGT: {
+ SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+ Op = SDValue(CurDAG->getTargetNode(PPC::RLWINM, dl, MVT::i32, Ops, 4),
+ 0);
+ return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op,
+ getI32Imm(1));
+ }
+ }
+ }
+ }
+
+ bool Inv;
+ int OtherCondIdx;
+ unsigned Idx = getCRIdxForSetCC(CC, Inv, OtherCondIdx);
+ SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+ SDValue IntCR;
+
+ // Force the ccreg into CR7.
+ SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
+
+ SDValue InFlag(0, 0); // Null incoming flag value.
+ CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
+ InFlag).getValue(1);
+
+ if (PPCSubTarget.isGigaProcessor() && OtherCondIdx == -1)
+ IntCR = SDValue(CurDAG->getTargetNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
+ CCReg), 0);
+ else
+ IntCR = SDValue(CurDAG->getTargetNode(PPC::MFCR, dl, MVT::i32, CCReg), 0);
+
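+  // The rotate amount below moves CR7's bit for this predicate (CR bit
+  // 28 + Idx in IBM numbering) into the low bit.  For SETEQ, for example,
+  // Idx == 2, so we rotate left by (32 - (3 - 2)) & 31 == 31, i.e. right
+  // by one.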
+ SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
+ getI32Imm(31), getI32Imm(31) };
+ if (OtherCondIdx == -1 && !Inv)
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+
+ // Get the specified bit.
+ SDValue Tmp =
+ SDValue(CurDAG->getTargetNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
+ if (Inv) {
+ assert(OtherCondIdx == -1 && "Can't have split plus negation");
+ return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
+ }
+
+ // Otherwise, we have to turn an operation like SETONE -> SETOLT | SETOGT.
+ // We already got the bit for the first part of the comparison (e.g. SETULE).
+
+ // Get the other bit of the comparison.
+ Ops[1] = getI32Imm((32-(3-OtherCondIdx)) & 31);
+ SDValue OtherCond =
+ SDValue(CurDAG->getTargetNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
+
+ return CurDAG->SelectNodeTo(N, PPC::OR, MVT::i32, Tmp, OtherCond);
+}
+
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = Op.getDebugLoc();
+ if (N->isMachineOpcode())
+ return NULL; // Already selected.
+
+ switch (N->getOpcode()) {
+ default: break;
+
+ case ISD::Constant: {
+ if (N->getValueType(0) == MVT::i64) {
+ // Get 64 bit value.
+ int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ // Assume no remaining bits.
+ unsigned Remainder = 0;
+ // Assume no shift required.
+ unsigned Shift = 0;
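+
+      // Worked example (illustrative; register name hypothetical):
+      // Imm = 0x5678ABCD00000000 is not a 32-bit value, but it has 32
+      // trailing zeros, so we materialize 0x5678ABCD and shift:
+      //   lis r3, 0x5678 ; ori r3, r3, 0xABCD ; rldicr r3, r3, 32, 31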
+
+ // If it can't be represented as a 32 bit value.
+ if (!isInt32(Imm)) {
+ Shift = CountTrailingZeros_64(Imm);
+ int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+ // If the shifted value fits 32 bits.
+ if (isInt32(ImmSh)) {
+ // Go with the shifted value.
+ Imm = ImmSh;
+ } else {
+ // Still stuck with a 64 bit value.
+ Remainder = Imm;
+ Shift = 32;
+ Imm >>= 32;
+ }
+ }
+
+ // Intermediate operand.
+ SDNode *Result;
+
+ // Handle first 32 bits.
+ unsigned Lo = Imm & 0xFFFF;
+ unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+ // Simple value.
+ if (isInt16(Imm)) {
+ // Just the Lo bits.
+ Result = CurDAG->getTargetNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+ } else if (Lo) {
+ // Handle the Hi bits.
+ unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+ Result = CurDAG->getTargetNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+ // And Lo bits.
+ Result = CurDAG->getTargetNode(PPC::ORI8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Lo));
+ } else {
+ // Just the Hi bits.
+ Result = CurDAG->getTargetNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+ }
+
+ // If no shift, we're done.
+ if (!Shift) return Result;
+
+ // Shift for next step if the upper 32-bits were not zero.
+ if (Imm) {
+ Result = CurDAG->getTargetNode(PPC::RLDICR, dl, MVT::i64,
+ SDValue(Result, 0),
+ getI32Imm(Shift), getI32Imm(63 - Shift));
+ }
+
+ // Add in the last bits as required.
+ if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+ Result = CurDAG->getTargetNode(PPC::ORIS8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Hi));
+ }
+ if ((Lo = Remainder & 0xFFFF)) {
+ Result = CurDAG->getTargetNode(PPC::ORI8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Lo));
+ }
+
+ return Result;
+ }
+ break;
+ }
+
+ case ISD::SETCC:
+ return SelectSETCC(Op);
+ case PPCISD::GlobalBaseReg:
+ return getGlobalBaseReg();
+
+ case ISD::FrameIndex: {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, Op.getValueType());
+ unsigned Opc = Op.getValueType() == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+ if (N->hasOneUse())
+ return CurDAG->SelectNodeTo(N, Opc, Op.getValueType(), TFI,
+ getSmallIPtrImm(0));
+ return CurDAG->getTargetNode(Opc, dl, Op.getValueType(), TFI,
+ getSmallIPtrImm(0));
+ }
+
+ case PPCISD::MFCR: {
+ SDValue InFlag = N->getOperand(1);
+ // Use MFOCRF if supported.
+ if (PPCSubTarget.isGigaProcessor())
+ return CurDAG->getTargetNode(PPC::MFOCRF, dl, MVT::i32,
+ N->getOperand(0), InFlag);
+ else
+ return CurDAG->getTargetNode(PPC::MFCR, dl, MVT::i32, InFlag);
+ }
+
+ case ISD::SDIV: {
+ // FIXME: since this depends on the setting of the carry flag from the srawi
+ // we should really be making notes about that for the scheduler.
+ // FIXME: It sure would be nice if we could cheaply recognize the
+ // srl/add/sra pattern the dag combiner will generate for this as
+ // sra/addze rather than having to handle sdiv ourselves. oh well.
+ unsigned Imm;
+ if (isInt32Immediate(N->getOperand(1), Imm)) {
+ SDValue N0 = N->getOperand(0);
+ if ((signed)Imm > 0 && isPowerOf2_32(Imm)) {
+ SDNode *Op =
+ CurDAG->getTargetNode(PPC::SRAWI, dl, MVT::i32, MVT::Flag,
+ N0, getI32Imm(Log2_32(Imm)));
+ return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+ SDValue(Op, 0), SDValue(Op, 1));
+ } else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) {
+ SDNode *Op =
+ CurDAG->getTargetNode(PPC::SRAWI, dl, MVT::i32, MVT::Flag,
+ N0, getI32Imm(Log2_32(-Imm)));
+ SDValue PT =
+ SDValue(CurDAG->getTargetNode(PPC::ADDZE, dl, MVT::i32,
+ SDValue(Op, 0), SDValue(Op, 1)),
+ 0);
+ return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT);
+ }
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+
+ case ISD::LOAD: {
+ // Handle preincrement loads.
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ MVT LoadedVT = LD->getMemoryVT();
+
+ // Normal loads are handled by code generated from the .td file.
+ if (LD->getAddressingMode() != ISD::PRE_INC)
+ break;
+
+ SDValue Offset = LD->getOffset();
+ if (isa<ConstantSDNode>(Offset) ||
+ Offset.getOpcode() == ISD::TargetGlobalAddress) {
+
+ unsigned Opcode;
+ bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+ if (LD->getValueType(0) != MVT::i64) {
+ // Handle PPC32 integer and normal FP loads.
+ assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+ switch (LoadedVT.getSimpleVT()) {
+ default: assert(0 && "Invalid PPC load type!");
+ case MVT::f64: Opcode = PPC::LFDU; break;
+ case MVT::f32: Opcode = PPC::LFSU; break;
+ case MVT::i32: Opcode = PPC::LWZU; break;
+ case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
+ case MVT::i1:
+ case MVT::i8: Opcode = PPC::LBZU; break;
+ }
+ } else {
+ assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+ assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+ switch (LoadedVT.getSimpleVT()) {
+ default: assert(0 && "Invalid PPC load type!");
+ case MVT::i64: Opcode = PPC::LDU; break;
+ case MVT::i32: Opcode = PPC::LWZU8; break;
+ case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
+ case MVT::i1:
+ case MVT::i8: Opcode = PPC::LBZU8; break;
+ }
+ }
+
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[] = { Offset, Base, Chain };
+ // FIXME: PPC64
+ return CurDAG->getTargetNode(Opcode, dl, LD->getValueType(0),
+ PPCLowering.getPointerTy(),
+ MVT::Other, Ops, 3);
+ } else {
+ assert(0 && "R+R preindex loads not supported yet!");
+ }
+    break;
+  }
+
+ case ISD::AND: {
+ unsigned Imm, Imm2, SH, MB, ME;
+
+ // If this is an and of a value rotated between 0 and 31 bits and then and'd
+ // with a mask, emit rlwinm
+ if (isInt32Immediate(N->getOperand(1), Imm) &&
+ isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
+ SDValue Val = N->getOperand(0).getOperand(0);
+ SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+ }
+ // If this is just a masked value where the input is not handled above, and
+ // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
+ if (isInt32Immediate(N->getOperand(1), Imm) &&
+ isRunOfOnes(Imm, MB, ME) &&
+ N->getOperand(0).getOpcode() != ISD::ROTL) {
+ SDValue Val = N->getOperand(0);
+ SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+ }
+ // AND X, 0 -> 0, not "rlwinm 32".
+ if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
+ ReplaceUses(SDValue(N, 0), N->getOperand(1));
+ return NULL;
+ }
+ // ISD::OR doesn't get all the bitfield insertion fun.
+ // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert
+ if (isInt32Immediate(N->getOperand(1), Imm) &&
+ N->getOperand(0).getOpcode() == ISD::OR &&
+ isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
+ unsigned MB, ME;
+ Imm = ~(Imm^Imm2);
+ if (isRunOfOnes(Imm, MB, ME)) {
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ N->getOperand(0).getOperand(1),
+ getI32Imm(0), getI32Imm(MB),getI32Imm(ME) };
+ return CurDAG->getTargetNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
+ }
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ case ISD::OR:
+ if (N->getValueType(0) == MVT::i32)
+ if (SDNode *I = SelectBitfieldInsert(N))
+ return I;
+
+ // Other cases are autogenerated.
+ break;
+ case ISD::SHL: {
+ unsigned Imm, SH, MB, ME;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+ isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ case ISD::SRL: {
+ unsigned Imm, SH, MB, ME;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+ isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+ return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+ }
+
+ // Other cases are autogenerated.
+ break;
+ }
+ case ISD::SELECT_CC: {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+ // Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc
+ if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+ if (N1C->isNullValue() && N3C->isNullValue() &&
+ N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
+ // FIXME: Implement this optzn for PPC64.
+ N->getValueType(0) == MVT::i32) {
+ SDNode *Tmp =
+ CurDAG->getTargetNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+ N->getOperand(0), getI32Imm(~0U));
+ return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32,
+ SDValue(Tmp, 0), N->getOperand(0),
+ SDValue(Tmp, 1));
+ }
+
+ SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+ unsigned BROpc = getPredicateForSetCC(CC);
+
+ unsigned SelectCCOp;
+ if (N->getValueType(0) == MVT::i32)
+ SelectCCOp = PPC::SELECT_CC_I4;
+ else if (N->getValueType(0) == MVT::i64)
+ SelectCCOp = PPC::SELECT_CC_I8;
+ else if (N->getValueType(0) == MVT::f32)
+ SelectCCOp = PPC::SELECT_CC_F4;
+ else if (N->getValueType(0) == MVT::f64)
+ SelectCCOp = PPC::SELECT_CC_F8;
+ else
+ SelectCCOp = PPC::SELECT_CC_VRRC;
+
+ SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
+ getI32Imm(BROpc) };
+ return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
+ }
+ case PPCISD::COND_BRANCH: {
+ // Op #0 is the Chain.
+ // Op #1 is the PPC::PRED_* number.
+ // Op #2 is the CR#
+ // Op #3 is the Dest MBB
+ // Op #4 is the Flag.
+ // Prevent PPC::PRED_* from being selected into LI.
+ SDValue Pred =
+ getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
+ N->getOperand(0), N->getOperand(4) };
+ return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 5);
+ }
+ case ISD::BR_CC: {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
+ SDValue Ops[] = { getI32Imm(getPredicateForSetCC(CC)), CondCode,
+ N->getOperand(4), N->getOperand(0) };
+ return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4);
+ }
+ case ISD::BRIND: {
+ // FIXME: Should custom lower this.
+ SDValue Chain = N->getOperand(0);
+ SDValue Target = N->getOperand(1);
+ unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+ Chain = SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Other, Target,
+ Chain), 0);
+ return CurDAG->SelectNodeTo(N, PPC::BCTR, MVT::Other, Chain);
+ }
+ case ISD::DECLARE: {
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1);
+
+ // FIXME: We need to handle this for VLAs.
+ if (!FINode) {
+ ReplaceUses(Op.getValue(0), Chain);
+ return NULL;
+ }
+
+ if (N2.getOpcode() == ISD::ADD) {
+ if (N2.getOperand(0).getOpcode() == ISD::ADD &&
+ N2.getOperand(0).getOperand(0).getOpcode() == PPCISD::GlobalBaseReg &&
+ N2.getOperand(0).getOperand(1).getOpcode() == PPCISD::Hi &&
+ N2.getOperand(1).getOpcode() == PPCISD::Lo)
+ N2 = N2.getOperand(0).getOperand(1).getOperand(0);
+ else if (N2.getOperand(0).getOpcode() == ISD::ADD &&
+ N2.getOperand(0).getOperand(0).getOpcode() == PPCISD::GlobalBaseReg &&
+ N2.getOperand(0).getOperand(1).getOpcode() == PPCISD::Lo &&
+ N2.getOperand(1).getOpcode() == PPCISD::Hi)
+ N2 = N2.getOperand(0).getOperand(1).getOperand(0);
+ else if (N2.getOperand(0).getOpcode() == PPCISD::Hi &&
+ N2.getOperand(1).getOpcode() == PPCISD::Lo)
+ N2 = N2.getOperand(0).getOperand(0);
+ }
+
+ // If we don't have a global address here, the debug info is mangled, just
+ // drop it.
+ if (!isa<GlobalAddressSDNode>(N2)) {
+ ReplaceUses(Op.getValue(0), Chain);
+ return NULL;
+ }
+ int FI = cast<FrameIndexSDNode>(N1)->getIndex();
+ GlobalValue *GV = cast<GlobalAddressSDNode>(N2)->getGlobal();
+ SDValue Tmp1 = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GV, TLI.getPointerTy());
+ return CurDAG->SelectNodeTo(N, TargetInstrInfo::DECLARE,
+ MVT::Other, Tmp1, Tmp2, Chain);
+ }
+ }
+
+ return SelectCode(Op);
+}
+
+
+
+/// createPPCISelDag - This pass converts a legalized DAG into a
+/// PowerPC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
+ return new PPCDAGToDAGISel(TM);
+}
+
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
new file mode 100644
index 0000000..a7744b8
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -0,0 +1,4878 @@
+//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCPerfectShuffle.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/DerivedTypes.h"
+using namespace llvm;
+
+static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc",
+cl::desc("enable preincrement load/store generation on PPC (experimental)"),
+ cl::Hidden);
+
+PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
+ : TargetLowering(TM), PPCSubTarget(*TM.getSubtargetImpl()) {
+
+ setPow2DivIsCheap();
+
+ // Use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, PPC::GPRCRegisterClass);
+ addRegisterClass(MVT::f32, PPC::F4RCRegisterClass);
+ addRegisterClass(MVT::f64, PPC::F8RCRegisterClass);
+
+ // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // PowerPC has pre-inc loads and stores.
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
+ setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+
+ // This is used in the ppcf128->int sequence. Note it has different semantics
+ // from FP_ROUND: that rounds to nearest, this rounds to zero.
+ setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
+
+ // PowerPC has no SREM/UREM instructions
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+ // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+
+ // We don't support sin/cos/sqrt/fmod/pow
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FREM , MVT::f64, Expand);
+ setOperationAction(ISD::FPOW , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FREM , MVT::f32, Expand);
+ setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+ setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+
+ // Expand FSQRT unless the subtarget has a hardware square root instruction.
+ if (!TM.getSubtarget<PPCSubtarget>().hasFSQRT()) {
+ setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+ }
+
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ // PowerPC does not have BSWAP, CTPOP or CTTZ
+ setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
+ setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
+
+ // PowerPC does not have ROTR
+ setOperationAction(ISD::ROTR, MVT::i32 , Expand);
+ setOperationAction(ISD::ROTR, MVT::i64 , Expand);
+
+ // PowerPC does not have Select
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+
+ // PowerPC wants to turn select_cc of FP into fsel when possible.
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+ // PowerPC wants to optimize integer setcc a bit
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
+
+ // PowerPC does not have BRCOND which requires SetCC
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+ // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+ // PowerPC does not have [U|S]INT_TO_FP
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+ setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
+ setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
+ setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
+ setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
+
+ // We cannot sextinreg(i1). Expand to shifts.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ // Support label based line numbers.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
+
+
+ // We want to legalize GlobalAddress and ConstantPool nodes into the
+ // appropriate instructions to materialize the address.
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+
+ // RET must be custom lowered, to meet ABI requirements.
+ setOperationAction(ISD::RET , MVT::Other, Custom);
+
+ // TRAP is legal.
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // TRAMPOLINE is custom lowered.
+ setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+
+ // VAARG is custom lowered with ELF 32 ABI
+ if (TM.getSubtarget<PPCSubtarget>().isELF32_ABI())
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ else
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+
+ // Use the default implementation.
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // Comparisons that require checking two conditions.
+ setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+
+ if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+ // They also have instructions for converting between i64 and fp.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+
+ // FIXME: disable this lowered code. This generates 64-bit register values,
+ // and we don't model the fact that the top part is clobbered by calls. We
+ // need to flag these together so that the value isn't live across a call.
+ //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+ // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
+ } else {
+ // PowerPC does not have FP_TO_UINT on 32-bit implementations.
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ }
+
+ if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) {
+ // 64-bit PowerPC implementations can support i64 types directly
+ addRegisterClass(MVT::i64, PPC::G8RCRegisterClass);
+ // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+ setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+ // 64-bit PowerPC wants to expand i128 shifts itself.
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+ } else {
+ // 32-bit PowerPC wants to expand i64 shifts itself.
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ }
+
+ if (TM.getSubtarget<PPCSubtarget>().hasAltivec()) {
+ // First set operation action for all vector types to expand. Then we
+ // will selectively turn on ones that can be effectively codegen'd.
+ for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ // add/sub are legal for all supported vector VT's.
+ setOperationAction(ISD::ADD , VT, Legal);
+ setOperationAction(ISD::SUB , VT, Legal);
+
+ // We promote all shuffles to v16i8.
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
+ AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
+
+ // We promote all non-typed operations to v4i32.
+ setOperationAction(ISD::AND , VT, Promote);
+ AddPromotedToType (ISD::AND , VT, MVT::v4i32);
+ setOperationAction(ISD::OR , VT, Promote);
+ AddPromotedToType (ISD::OR , VT, MVT::v4i32);
+ setOperationAction(ISD::XOR , VT, Promote);
+ AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
+ setOperationAction(ISD::LOAD , VT, Promote);
+ AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
+
+ // No other operations are legal.
+ setOperationAction(ISD::MUL , VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ }
+
+ // We can custom expand all VECTOR_SHUFFLEs to VPERM; others we can handle
+ // with merges, splats, etc.
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::AND , MVT::v4i32, Legal);
+ setOperationAction(ISD::OR , MVT::v4i32, Legal);
+ setOperationAction(ISD::XOR , MVT::v4i32, Legal);
+ setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
+ setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+
+ addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass);
+ addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass);
+ addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass);
+ addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass);
+
+ setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ }
+
+ setShiftAmountType(MVT::i32);
+ setBooleanContents(ZeroOrOneBooleanContent);
+
+ if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+ setStackPointerRegisterToSaveRestore(PPC::X1);
+ setExceptionPointerRegister(PPC::X3);
+ setExceptionSelectorRegister(PPC::X4);
+ } else {
+ setStackPointerRegisterToSaveRestore(PPC::R1);
+ setExceptionPointerRegister(PPC::R3);
+ setExceptionSelectorRegister(PPC::R4);
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::BR_CC);
+ setTargetDAGCombine(ISD::BSWAP);
+
+ // Darwin long double math library functions have $LDBL128 appended.
+ if (TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+ setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
+ setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
+ setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
+ setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
+ setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
+ setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
+ setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
+ setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
+ setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
+ setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
+ }
+
+ computeRegisterProperties();
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area.
+unsigned PPCTargetLowering::getByValTypeAlignment(const Type *Ty) const {
+ TargetMachine &TM = getTargetMachine();
+ // Darwin passes everything on a 4-byte boundary.
+ if (TM.getSubtarget<PPCSubtarget>().isDarwin())
+ return 4;
+ // FIXME Elf TBD
+ return 4;
+}
+
+const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return 0;
+ case PPCISD::FSEL: return "PPCISD::FSEL";
+ case PPCISD::FCFID: return "PPCISD::FCFID";
+ case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
+ case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
+ case PPCISD::STFIWX: return "PPCISD::STFIWX";
+ case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
+ case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
+ case PPCISD::VPERM: return "PPCISD::VPERM";
+ case PPCISD::Hi: return "PPCISD::Hi";
+ case PPCISD::Lo: return "PPCISD::Lo";
+ case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
+ case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
+ case PPCISD::SRL: return "PPCISD::SRL";
+ case PPCISD::SRA: return "PPCISD::SRA";
+ case PPCISD::SHL: return "PPCISD::SHL";
+ case PPCISD::EXTSW_32: return "PPCISD::EXTSW_32";
+ case PPCISD::STD_32: return "PPCISD::STD_32";
+ case PPCISD::CALL_ELF: return "PPCISD::CALL_ELF";
+ case PPCISD::CALL_Macho: return "PPCISD::CALL_Macho";
+ case PPCISD::MTCTR: return "PPCISD::MTCTR";
+ case PPCISD::BCTRL_Macho: return "PPCISD::BCTRL_Macho";
+ case PPCISD::BCTRL_ELF: return "PPCISD::BCTRL_ELF";
+ case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
+ case PPCISD::MFCR: return "PPCISD::MFCR";
+ case PPCISD::VCMP: return "PPCISD::VCMP";
+ case PPCISD::VCMPo: return "PPCISD::VCMPo";
+ case PPCISD::LBRX: return "PPCISD::LBRX";
+ case PPCISD::STBRX: return "PPCISD::STBRX";
+ case PPCISD::LARX: return "PPCISD::LARX";
+ case PPCISD::STCX: return "PPCISD::STCX";
+ case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
+ case PPCISD::MFFS: return "PPCISD::MFFS";
+ case PPCISD::MTFSB0: return "PPCISD::MTFSB0";
+ case PPCISD::MTFSB1: return "PPCISD::MTFSB1";
+ case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
+ case PPCISD::MTFSF: return "PPCISD::MTFSF";
+ case PPCISD::TAILCALL: return "PPCISD::TAILCALL";
+ case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
+ }
+}
+
+
+MVT PPCTargetLowering::getSetCCResultType(MVT VT) const {
+ return MVT::i32;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Node matching predicates, for use by the tblgen matching code.
+//===----------------------------------------------------------------------===//
+
+/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
+static bool isFloatingPointZero(SDValue Op) {
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+ return CFP->getValueAPF().isZero();
+ else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+ // Maybe this has already been legalized into the constant pool?
+ if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+ return CFP->getValueAPF().isZero();
+ }
+ return false;
+}
+
+/// isConstantOrUndef - Op is a shuffle mask element (negative for undef).
+/// Return true if Op is undef or if it matches the specified value.
+static bool isConstantOrUndef(int Op, int Val) {
+ return Op < 0 || Op == Val;
+}
+
+/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUHUM instruction.
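+/// For example, a binary vpkuhum takes the odd-numbered bytes of both
+/// inputs, so the qualifying v16i8 mask is
+/// {1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31}; undef is accepted in any
+/// position, and in the unary case the second half repeats {1,3,...,15}.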
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
+ if (!isUnary) {
+ for (unsigned i = 0; i != 16; ++i)
+ if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
+ return false;
+ } else {
+ for (unsigned i = 0; i != 8; ++i)
+ if (!isConstantOrUndef(N->getMaskElt(i), i*2+1) ||
+ !isConstantOrUndef(N->getMaskElt(i+8), i*2+1))
+ return false;
+ }
+ return true;
+}
+
+/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUWUM instruction.
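+/// For example, a binary vpkuwum keeps the low halfword of each word, so
+/// the byte mask is {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}, again
+/// with undef allowed in any slot.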
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
+ if (!isUnary) {
+ for (unsigned i = 0; i != 16; i += 2)
+ if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
+ !isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
+ return false;
+ } else {
+ for (unsigned i = 0; i != 8; i += 2)
+ if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
+ !isConstantOrUndef(N->getMaskElt(i+1), i*2+3) ||
+ !isConstantOrUndef(N->getMaskElt(i+8), i*2+2) ||
+ !isConstantOrUndef(N->getMaskElt(i+9), i*2+3))
+ return false;
+ }
+ return true;
+}
+
+/// isVMerge - Common function, used to match vmrg* shuffles.
+///
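+/// For example, vmrghb corresponds to UnitSize == 1, LHSStart == 0 and
+/// RHSStart == 16: it interleaves the high halves of the two inputs byte
+/// by byte, matching the mask {0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23}.
+///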
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
+ unsigned LHSStart, unsigned RHSStart) {
+ assert(N->getValueType(0) == MVT::v16i8 &&
+ "PPC only supports shuffles by bytes!");
+ assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
+ "Unsupported merge size!");
+
+ for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
+ for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
+ if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
+ LHSStart+j+i*UnitSize) ||
+ !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
+ RHSStart+j+i*UnitSize))
+ return false;
+ }
+ return true;
+}
+
+/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VMRGL* instruction with the specified unit size (1, 2 or 4 bytes).
+bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+ bool isUnary) {
+ if (!isUnary)
+ return isVMerge(N, UnitSize, 8, 24);
+ return isVMerge(N, UnitSize, 8, 8);
+}
+
+/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VMRGH* instruction with the specified unit size (1, 2 or 4 bytes).
+bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+ bool isUnary) {
+ if (!isUnary)
+ return isVMerge(N, UnitSize, 0, 16);
+ return isVMerge(N, UnitSize, 0, 0);
+}
+
+
+/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+/// amount, otherwise return -1.
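+/// For example, the mask {3,4,5,...,17,18} matches a binary vsldoi with a
+/// shift amount of 3 bytes; in the unary case the element numbers wrap
+/// around modulo 16 instead of running into the second input.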
+int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) {
+ assert(N->getValueType(0) == MVT::v16i8 &&
+ "PPC only supports shuffles by bytes!");
+
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+
+ // Find the first non-undef value in the shuffle mask.
+ unsigned i;
+ for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
+ /*search*/;
+
+ if (i == 16) return -1; // all undef.
+
+ // Otherwise, check to see if the rest of the elements are consecutively
+ // numbered from this value.
+ unsigned ShiftAmt = SVOp->getMaskElt(i);
+ if (ShiftAmt < i) return -1;
+ ShiftAmt -= i;
+
+ if (!isUnary) {
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 16; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+ return -1;
+ } else {
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 16; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+ return -1;
+ }
+ return ShiftAmt;
+}
+
+/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of a single element that is suitable for input to
+/// VSPLTB/VSPLTH/VSPLTW.
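+/// For example, with EltSize == 4 a splat of word element 1 is the byte
+/// mask {4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7}; getVSPLTImmediate below then
+/// recovers the word index 1 from such a mask.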
+bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
+ assert(N->getValueType(0) == MVT::v16i8 &&
+ (EltSize == 1 || EltSize == 2 || EltSize == 4));
+
+ // This is a splat operation if each element of the permute is the same, and
+ // if the value doesn't reference the second vector.
+ unsigned ElementBase = N->getMaskElt(0);
+
+ // FIXME: Handle UNDEF elements too!
+ if (ElementBase >= 16)
+ return false;
+
+ // Check that the indices are consecutive, in the case of a multi-byte element
+ // splatted with a v16i8 mask.
+ for (unsigned i = 1; i != EltSize; ++i)
+ if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
+ return false;
+
+ for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
+ if (N->getMaskElt(i) < 0) continue;
+ for (unsigned j = 0; j != EltSize; ++j)
+ if (N->getMaskElt(i+j) != N->getMaskElt(j))
+ return false;
+ }
+ return true;
+}
+
+/// isAllNegativeZeroVector - Returns true if all elements of build_vector
+/// are -0.0.
+bool PPC::isAllNegativeZeroVector(SDNode *N) {
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+
+ APInt APVal, APUndef;
+ unsigned BitSize;
+ bool HasAnyUndefs;
+
+ if (BV->isConstantSplat(APVal, APUndef, BitSize, HasAnyUndefs, 32))
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ return CFP->getValueAPF().isNegZero();
+
+ return false;
+}
+
+/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ assert(isSplatShuffleMask(SVOp, EltSize));
+ return SVOp->getMaskElt(0) / EltSize;
+}
+
+/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
+/// by using a vspltis[bhw] instruction of the specified element size, return
+/// the constant being splatted. The ByteSize field indicates the number of
+/// bytes of each element [124] -> [bhw].
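+/// Two illustrative cases: a v4i32 build_vector of 0xFFFFFFFE with
+/// ByteSize == 4 yields -2 (vspltisw -2), and a v4i32 build_vector of
+/// 0x01010101 with ByteSize == 1 halves down to the byte 0x01 and yields 1
+/// (vspltisb 1).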
+SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+ SDValue OpVal(0, 0);
+
+ // If ByteSize of the splat is bigger than the element size of the
+ // build_vector, then we have a case where we are checking for a splat where
+ // multiple elements of the buildvector are folded together into a single
+ // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
+ unsigned EltSize = 16/N->getNumOperands();
+ if (EltSize < ByteSize) {
+ unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
+ SDValue UniquedVals[4];
+ assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
+
+ // See if all of the elements in the buildvector agree across each chunk.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ // If the element isn't a constant, bail fully out.
+ if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
+
+
+ if (UniquedVals[i&(Multiple-1)].getNode() == 0)
+ UniquedVals[i&(Multiple-1)] = N->getOperand(i);
+ else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
+ return SDValue(); // no match.
+ }
+
+ // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
+ // either constant or undef values that are identical for each chunk. See
+ // if these chunks can form into a larger vspltis*.
+
+ // Check to see if all of the leading entries are either 0 or -1. If
+ // neither, then this won't fit into the immediate field.
+ bool LeadingZero = true;
+ bool LeadingOnes = true;
+ for (unsigned i = 0; i != Multiple-1; ++i) {
+ if (UniquedVals[i].getNode() == 0) continue; // Must have been undefs.
+
+ LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
+ LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
+ }
+ // Finally, check the least significant entry.
+ if (LeadingZero) {
+ if (UniquedVals[Multiple-1].getNode() == 0)
+ return DAG.getTargetConstant(0, MVT::i32); // 0,0,0,undef
+ int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
+ if (Val < 16)
+ return DAG.getTargetConstant(Val, MVT::i32); // 0,0,0,4 -> vspltisw(4)
+ }
+ if (LeadingOnes) {
+ if (UniquedVals[Multiple-1].getNode() == 0)
+ return DAG.getTargetConstant(~0U, MVT::i32); // -1,-1,-1,undef
+ int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
+ if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
+ return DAG.getTargetConstant(Val, MVT::i32);
+ }
+
+ return SDValue();
+ }
+
+ // Check to see if this buildvec has a single non-undef value in its elements.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (OpVal.getNode() == 0)
+ OpVal = N->getOperand(i);
+ else if (OpVal != N->getOperand(i))
+ return SDValue();
+ }
+
+ if (OpVal.getNode() == 0) return SDValue(); // All UNDEF: use implicit def.
+
+ unsigned ValSizeInBytes = EltSize;
+ uint64_t Value = 0;
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+ Value = CN->getZExtValue();
+ } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+ assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
+ Value = FloatToBits(CN->getValueAPF().convertToFloat());
+ }
+
+ // If the splat value is larger than the element value, then we can never do
+ // this splat. The only value whose replicated bits could fit into our
+ // immediate field would be zero, and we prefer to use vxor for that.
+ if (ValSizeInBytes < ByteSize) return SDValue();
+
+ // If the element value is larger than the splat value, cut it in half and
+ // check to see if the two halves are equal. Continue doing this until we
+ // get to ByteSize. This allows us to handle 0x01010101 as 0x01.
+ while (ValSizeInBytes > ByteSize) {
+ ValSizeInBytes >>= 1;
+
+ // If the top half equals the bottom half, we're still ok.
+ if (((Value >> (ValSizeInBytes*8)) & ((1 << (8*ValSizeInBytes))-1)) !=
+ (Value & ((1 << (8*ValSizeInBytes))-1)))
+ return SDValue();
+ }
+
+ // Properly sign extend the value.
+ int ShAmt = (4-ByteSize)*8;
+ int MaskVal = ((int)Value << ShAmt) >> ShAmt;
+
+ // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
+ if (MaskVal == 0) return SDValue();
+
+ // Finally, if this value fits in a 5 bit sext field, return it
+ if (((MaskVal << (32-5)) >> (32-5)) == MaskVal)
+ return DAG.getTargetConstant(MaskVal, MVT::i32);
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Addressing Mode Selection
+//===----------------------------------------------------------------------===//
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value. If so, this returns true and the
+/// immediate.
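+/// For example, the i32 constant 0xFFFF8000 is accepted (Imm == -32768),
+/// but 0x00008000 is rejected: its low sixteen bits sign-extend to -32768,
+/// which does not reproduce the original value 32768.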
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+ if (N->getOpcode() != ISD::Constant)
+ return false;
+
+ Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+ if (N->getValueType(0) == MVT::i32)
+ return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+ else
+ return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+ return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// SelectAddressRegReg - Given the specified address, check to see if it
+/// can be represented as an indexed [r+r] operation. Returns false if it
+/// can be more efficiently represented with [r+imm].
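+/// The OR case below relies on the identity that X | Y == X + Y whenever X
+/// and Y have no set bits in common, e.g. a 16-byte-aligned base OR'd with
+/// a small offset.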
+bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
+ SDValue &Index,
+ SelectionDAG &DAG) const {
+ short imm = 0;
+ if (N.getOpcode() == ISD::ADD) {
+ if (isIntS16Immediate(N.getOperand(1), imm))
+ return false; // r+i
+ if (N.getOperand(1).getOpcode() == PPCISD::Lo)
+ return false; // r+i
+
+ Base = N.getOperand(0);
+ Index = N.getOperand(1);
+ return true;
+ } else if (N.getOpcode() == ISD::OR) {
+ if (isIntS16Immediate(N.getOperand(1), imm))
+ return false; // r+i: fold the immediate if we can.
+
+ // If this is an or of disjoint bitfields, we can codegen this as an add
+ // (for better address arithmetic) if the LHS and RHS of the OR are provably
+ // disjoint.
+ APInt LHSKnownZero, LHSKnownOne;
+ APInt RHSKnownZero, RHSKnownOne;
+ DAG.ComputeMaskedBits(N.getOperand(0),
+ APInt::getAllOnesValue(N.getOperand(0)
+ .getValueSizeInBits()),
+ LHSKnownZero, LHSKnownOne);
+
+ if (LHSKnownZero.getBoolValue()) {
+ DAG.ComputeMaskedBits(N.getOperand(1),
+ APInt::getAllOnesValue(N.getOperand(1)
+ .getValueSizeInBits()),
+ RHSKnownZero, RHSKnownOne);
+ // If all of the bits are known zero on the LHS or RHS, the add won't
+ // carry.
+ if (~(LHSKnownZero | RHSKnownZero) == 0) {
+ Base = N.getOperand(0);
+ Index = N.getOperand(1);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/// Returns true if the address N can be represented by a base register plus
+/// a signed 16-bit displacement [r+imm], and if it is not better
+/// represented as reg+reg.
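+/// As a worked example of the LIS + disp split below: the constant address
+/// 0x12348000 becomes Disp == -32768 with Base == lis 0x1235, since
+/// 0x12350000 + (-32768) == 0x12348000.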
+bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
+ SDValue &Base,
+ SelectionDAG &DAG) const {
+ // FIXME dl should come from parent load or store, not from address
+ DebugLoc dl = N.getDebugLoc();
+ // If this can be more profitably realized as r+r, fail.
+ if (SelectAddressRegReg(N, Disp, Base, DAG))
+ return false;
+
+ if (N.getOpcode() == ISD::ADD) {
+ short imm = 0;
+ if (isIntS16Immediate(N.getOperand(1), imm)) {
+ Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ } else {
+ Base = N.getOperand(0);
+ }
+ return true; // [r+i]
+ } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+ // Match LOAD (ADD (X, Lo(G))).
+ assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+ && "Cannot handle constant offsets yet!");
+ Disp = N.getOperand(1).getOperand(0); // The global address.
+ assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+ Disp.getOpcode() == ISD::TargetConstantPool ||
+ Disp.getOpcode() == ISD::TargetJumpTable);
+ Base = N.getOperand(0);
+ return true; // [&g+r]
+ }
+ } else if (N.getOpcode() == ISD::OR) {
+ short imm = 0;
+ if (isIntS16Immediate(N.getOperand(1), imm)) {
+ // If this is an or of disjoint bitfields, we can codegen this as an add
+ // (for better address arithmetic) if the LHS and RHS of the OR are
+ // provably disjoint.
+ APInt LHSKnownZero, LHSKnownOne;
+ DAG.ComputeMaskedBits(N.getOperand(0),
+ APInt::getAllOnesValue(N.getOperand(0)
+ .getValueSizeInBits()),
+ LHSKnownZero, LHSKnownOne);
+
+ if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
+ // If all of the bits are known zero on the LHS or RHS, the add won't
+ // carry.
+ Base = N.getOperand(0);
+ Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+ return true;
+ }
+ }
+ } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+ // Loading from a constant address.
+
+ // If this address fits entirely in a 16-bit sext immediate field, codegen
+ // this as "d, 0"
+ short Imm;
+ if (isIntS16Immediate(CN, Imm)) {
+ Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
+ Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
+ return true;
+ }
+
+ // Handle 32-bit sext immediates with LIS + addr mode.
+ if (CN->getValueType(0) == MVT::i32 ||
+ (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
+ int Addr = (int)CN->getZExtValue();
+
+ // Otherwise, break this down into an LIS + disp.
+ Disp = DAG.getTargetConstant((short)Addr, MVT::i32);
+
+ Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
+ unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+ Base = SDValue(DAG.getTargetNode(Opc, dl, CN->getValueType(0), Base), 0);
+ return true;
+ }
+ }
+
+ Disp = DAG.getTargetConstant(0, getPointerTy());
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ else
+ Base = N;
+ return true; // [r+0]
+}
+
+/// SelectAddressRegRegOnly - Given the specified address, force it to be
+/// represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
+ SDValue &Index,
+ SelectionDAG &DAG) const {
+ // Check to see if we can easily represent this as an [r+r] address. This
+ // will fail if it thinks that the address is more profitably represented as
+ // reg+imm, e.g. where imm = 0.
+ if (SelectAddressRegReg(N, Base, Index, DAG))
+ return true;
+
+ // If the operand is an addition, always emit this as [r+r], since this is
+ // better (for code size, and execution, as the memop does the add for free)
+ // than emitting an explicit add.
+ if (N.getOpcode() == ISD::ADD) {
+ Base = N.getOperand(0);
+ Index = N.getOperand(1);
+ return true;
+ }
+
+ // Otherwise, do it the hard way, using R0 as the base register.
+ Base = DAG.getRegister(PPC::R0, N.getValueType());
+ Index = N;
+ return true;
+}
+
+/// SelectAddressRegImmShift - Returns true if the address N can be
+/// represented by a base register plus a signed 14-bit displacement
+/// [r+imm*4]. Suitable for use by STD and friends.
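+/// The returned displacement is pre-shifted: a byte offset of 8 is encoded
+/// as 2, and the DS-form instruction's two implicit low zero bits restore
+/// the byte offset.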
+bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
+ SDValue &Base,
+ SelectionDAG &DAG) const {
+ // FIXME dl should come from the parent load or store, not the address
+ DebugLoc dl = N.getDebugLoc();
+ // If this can be more profitably realized as r+r, fail.
+ if (SelectAddressRegReg(N, Disp, Base, DAG))
+ return false;
+
+ if (N.getOpcode() == ISD::ADD) {
+ short imm = 0;
+ if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
+ Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ } else {
+ Base = N.getOperand(0);
+ }
+ return true; // [r+i]
+ } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+ // Match LOAD (ADD (X, Lo(G))).
+ assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+ && "Cannot handle constant offsets yet!");
+ Disp = N.getOperand(1).getOperand(0); // The global address.
+ assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+ Disp.getOpcode() == ISD::TargetConstantPool ||
+ Disp.getOpcode() == ISD::TargetJumpTable);
+ Base = N.getOperand(0);
+ return true; // [&g+r]
+ }
+ } else if (N.getOpcode() == ISD::OR) {
+ short imm = 0;
+ if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
+ // If this is an or of disjoint bitfields, we can codegen this as an add
+ // (for better address arithmetic) if the LHS and RHS of the OR are
+ // provably disjoint.
+ APInt LHSKnownZero, LHSKnownOne;
+ DAG.ComputeMaskedBits(N.getOperand(0),
+ APInt::getAllOnesValue(N.getOperand(0)
+ .getValueSizeInBits()),
+ LHSKnownZero, LHSKnownOne);
+ if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
+ // If all of the bits are known zero on the LHS or RHS, the add won't
+ // carry.
+ Base = N.getOperand(0);
+ Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+ return true;
+ }
+ }
+ } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+ // Loading from a constant address. Verify low two bits are clear.
+ if ((CN->getZExtValue() & 3) == 0) {
+ // If this address fits entirely in a 14-bit sext immediate field, codegen
+ // this as "d, 0"
+ short Imm;
+ if (isIntS16Immediate(CN, Imm)) {
+ Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
+ Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
+ return true;
+ }
+
+ // Fold the low-part of 32-bit absolute addresses into addr mode.
+ if (CN->getValueType(0) == MVT::i32 ||
+ (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
+ int Addr = (int)CN->getZExtValue();
+
+ // Otherwise, break this down into an LIS + disp.
+ Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32);
+ Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32);
+ unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+ Base = SDValue(DAG.getTargetNode(Opc, dl, CN->getValueType(0), Base),0);
+ return true;
+ }
+ }
+ }
+
+ Disp = DAG.getTargetConstant(0, getPointerTy());
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ else
+ Base = N;
+ return true; // [r+0]
+}
+
+
+/// getPreIndexedAddressParts - Returns true, and sets the base pointer,
+/// offset pointer, and addressing mode by reference, if the node's address
+/// can be legally represented as a pre-indexed load / store address.
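+/// When this succeeds, the selector can use update-form instructions such
+/// as "lwzu r3, 4(r4)", which loads from r4+4 and writes the incremented
+/// address back into r4 in the same instruction.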
+bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ // Disabled by default for now.
+ if (!EnablePPCPreinc) return false;
+
+ SDValue Ptr;
+ MVT VT;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ Ptr = LD->getBasePtr();
+ VT = LD->getMemoryVT();
+
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ Ptr = ST->getBasePtr();
+ VT = ST->getMemoryVT();
+ } else
+ return false;
+
+ // PowerPC doesn't have preinc load/store instructions for vectors.
+ if (VT.isVector())
+ return false;
+
+ // TODO: Check reg+reg first.
+
+ // LDU/STU use reg+imm*4, others use reg+imm.
+ if (VT != MVT::i64) {
+ // reg + imm
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG))
+ return false;
+ } else {
+ // reg + imm * 4.
+ if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG))
+ return false;
+ }
+
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
+ // sext i32 to i64 when addr mode is r+i.
+ if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
+ LD->getExtensionType() == ISD::SEXTLOAD &&
+ isa<ConstantSDNode>(Offset))
+ return false;
+ }
+
+ AM = ISD::PRE_INC;
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
+SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) {
+ MVT PtrVT = Op.getValueType();
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ Constant *C = CP->getConstVal();
+ SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ // FIXME there isn't really any debug info here
+ DebugLoc dl = Op.getDebugLoc();
+
+ const TargetMachine &TM = DAG.getTarget();
+
+ SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, CPI, Zero);
+ SDValue Lo = DAG.getNode(PPCISD::Lo, dl, PtrVT, CPI, Zero);
+
+ // If this is a non-darwin platform, we don't support non-static relo models
+ // yet.
+ if (TM.getRelocationModel() == Reloc::Static ||
+ !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+ // Generate non-pic code that has direct accesses to the constant pool.
+ // The address of the constant pool entry is just (hi(&cp)+lo(&cp)).
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+ }
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ // With PIC, the first instruction is actually "GR+hi(&G)".
+ Hi = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(PPCISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(), PtrVT), Hi);
+ }
+
+ Lo = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+ return Lo;
+}
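+
+// In the static case above, the address is materialized by a two
+// instruction sequence, roughly "lis" of the adjusted high half followed
+// by an add of the low half; the ha16/lo16 style relocations compensate
+// for the sign of the low 16 bits.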
+
+SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+ MVT PtrVT = Op.getValueType();
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ // FIXME there isn't really any debug loc here
+ DebugLoc dl = Op.getDebugLoc();
+
+ const TargetMachine &TM = DAG.getTarget();
+
+ SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, JTI, Zero);
+ SDValue Lo = DAG.getNode(PPCISD::Lo, dl, PtrVT, JTI, Zero);
+
+ // If this is a non-darwin platform, we don't support non-static relo models
+ // yet.
+ if (TM.getRelocationModel() == Reloc::Static ||
+ !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+ // Generate non-pic code that has direct accesses to the jump table.
+ // The address of the jump table is just (hi(&jt)+lo(&jt)).
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+ }
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ // With PIC, the first instruction is actually "GR+hi(&G)".
+ Hi = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(PPCISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(), PtrVT), Hi);
+ }
+
+ Lo = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+ return Lo;
+}
+
+SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) {
+ assert(0 && "TLS not implemented for PPC.");
+ return SDValue(); // Not reached
+}
+
+SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) {
+ MVT PtrVT = Op.getValueType();
+ GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+ GlobalValue *GV = GSDN->getGlobal();
+ SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
+ SDValue Zero = DAG.getConstant(0, PtrVT);
+ // FIXME there isn't really any debug info here
+ DebugLoc dl = GSDN->getDebugLoc();
+
+ const TargetMachine &TM = DAG.getTarget();
+
+ SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, GA, Zero);
+ SDValue Lo = DAG.getNode(PPCISD::Lo, dl, PtrVT, GA, Zero);
+
+ // If this is a non-darwin platform, we don't support non-static relo models
+ // yet.
+ if (TM.getRelocationModel() == Reloc::Static ||
+ !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+ // Generate non-pic code that has direct accesses to globals.
+ // The address of the global is just (hi(&g)+lo(&g)).
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+ }
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ // With PIC, the first instruction is actually "GR+hi(&G)".
+ Hi = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(PPCISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(), PtrVT), Hi);
+ }
+
+ Lo = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+
+ if (!TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV))
+ return Lo;
+
+ // If the global is weak or external, we have to go through the lazy
+ // resolution stub.
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Lo, NULL, 0);
+}
+
+SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // If we're comparing for equality to zero, expose the fact that this is
+ // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
+ // fold the new nodes.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (C->isNullValue() && CC == ISD::SETEQ) {
+ MVT VT = Op.getOperand(0).getValueType();
+ SDValue Zext = Op.getOperand(0);
+ if (VT.bitsLT(MVT::i32)) {
+ VT = MVT::i32;
+ Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0));
+ }
+ unsigned Log2b = Log2_32(VT.getSizeInBits());
+ SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
+ SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
+ DAG.getConstant(Log2b, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
+ }
+ // Leave comparisons against 0 and -1 alone for now, since they're usually
+ // optimized. FIXME: revisit this when we can custom lower all setcc
+ // optimizations.
+ if (C->isAllOnesValue() || C->isNullValue())
+ return SDValue();
+ }
+
+ // If we have an integer seteq/setne, turn it into a compare against zero
+ // by xor'ing the rhs with the lhs, which is faster than setting a
+ // condition register, reading it back out, and masking the correct bit. The
+ // usual approach is to use sub here instead of xor; we use xor because it
+ // exposes the result to other bit-twiddling opportunities.
+ MVT LHSVT = Op.getOperand(0).getValueType();
+ if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ MVT VT = Op.getValueType();
+ SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
+ Op.getOperand(1));
+ return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, LHSVT), CC);
+ }
+ return SDValue();
+}
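+
+// The ctlz/srl trick above: for 32-bit X, cntlzw returns 32 exactly when
+// X == 0, so shifting the count right by log2(32) == 5 produces 1 for
+// X == 0 and 0 otherwise, computing seteq-with-zero without a condition
+// register.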
+
+SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
+ int VarArgsFrameIndex,
+ int VarArgsStackOffset,
+ unsigned VarArgsNumGPR,
+ unsigned VarArgsNumFPR,
+ const PPCSubtarget &Subtarget) {
+
+ assert(0 && "VAARG in ELF32 ABI not implemented yet!");
+ return SDValue(); // Not reached
+}
+
+SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+ DebugLoc dl = Op.getDebugLoc();
+
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ bool isPPC64 = (PtrVT == MVT::i64);
+ const Type *IntPtrTy =
+ DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType();
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ Entry.Ty = IntPtrTy;
+ Entry.Node = Trmp; Args.push_back(Entry);
+
+ // TrampSize == (isPPC64 ? 48 : 40);
+ Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40,
+ isPPC64 ? MVT::i64 : MVT::i32);
+ Args.push_back(Entry);
+
+ Entry.Node = FPtr; Args.push_back(Entry);
+ Entry.Node = Nest; Args.push_back(Entry);
+
+ // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
+ std::pair<SDValue, SDValue> CallResult =
+ LowerCallTo(Chain, Op.getValueType().getTypeForMVT(), false, false,
+ false, false, CallingConv::C, false,
+ DAG.getExternalSymbol("__trampoline_setup", PtrVT),
+ Args, DAG, dl);
+
+ SDValue Ops[] =
+ { CallResult.first, CallResult.second };
+
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
+ int VarArgsFrameIndex,
+ int VarArgsStackOffset,
+ unsigned VarArgsNumGPR,
+ unsigned VarArgsNumFPR,
+ const PPCSubtarget &Subtarget) {
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Subtarget.isMachoABI()) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
+ }
+
+ // For ELF 32 ABI we follow the layout of the va_list struct.
+ // We suppose the given va_list is already allocated.
+ //
+ // typedef struct {
+ // char gpr; /* index into the array of 8 GPRs
+ // * stored in the register save area
+ // * gpr=0 corresponds to r3,
+ // * gpr=1 to r4, etc.
+ // */
+ // char fpr; /* index into the array of 8 FPRs
+ // * stored in the register save area
+ // * fpr=0 corresponds to f1,
+ // * fpr=1 to f2, etc.
+ // */
+ // char *overflow_arg_area;
+ // /* location on stack that holds
+ // * the next overflow argument
+ // */
+ // char *reg_save_area;
+ // /* where r3:r10 and f1:f8 (if saved)
+ // * are stored
+ // */
+ // } va_list[1];
+
+
+ SDValue ArgGPR = DAG.getConstant(VarArgsNumGPR, MVT::i8);
+ SDValue ArgFPR = DAG.getConstant(VarArgsNumFPR, MVT::i8);
+
+
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ SDValue StackOffsetFI = DAG.getFrameIndex(VarArgsStackOffset, PtrVT);
+ SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+
+ uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
+ SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, PtrVT);
+
+ uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
+ SDValue ConstStackOffset = DAG.getConstant(StackOffset, PtrVT);
+
+ uint64_t FPROffset = 1;
+ SDValue ConstFPROffset = DAG.getConstant(FPROffset, PtrVT);
+
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+ // Store first byte: number of int regs
+ SDValue firstStore = DAG.getStore(Op.getOperand(0), dl, ArgGPR,
+ Op.getOperand(1), SV, 0);
+ uint64_t nextOffset = FPROffset;
+ SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
+ ConstFPROffset);
+
+ // Store second byte: number of float regs
+ SDValue secondStore =
+ DAG.getStore(firstStore, dl, ArgFPR, nextPtr, SV, nextOffset);
+ nextOffset += StackOffset;
+ nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
+
+ // Store second word: arguments given on stack
+ SDValue thirdStore =
+ DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, SV, nextOffset);
+ nextOffset += FrameOffset;
+ nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
+
+ // Store third word: arguments given in registers
+ return DAG.getStore(thirdStore, dl, FR, nextPtr, SV, nextOffset);
+
+}
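+
+// With 32-bit pointers the layout stored above is: gpr count at byte 0,
+// fpr count at byte 1, two bytes of padding, overflow_arg_area at offset 4
+// and reg_save_area at offset 8, matching the va_list struct sketched
+// above.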
+
+#include "PPCGenCallingConv.inc"
+
+/// GetFPR - Get the set of FP registers that should be allocated for arguments,
+/// depending on which subtarget is selected.
+static const unsigned *GetFPR(const PPCSubtarget &Subtarget) {
+ if (Subtarget.isMachoABI()) {
+ static const unsigned FPR[] = {
+ PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
+ };
+ return FPR;
+ }
+
+
+ static const unsigned FPR[] = {
+ PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8
+ };
+ return FPR;
+}
+
+/// CalculateStackSlotSize - Calculates the size reserved for this argument on
+/// the stack.
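+/// E.g. with 4-byte pointers a byval aggregate of 10 bytes reserves 12
+/// bytes (three pointer-sized slots), while a plain i32 reserves 4.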
+static unsigned CalculateStackSlotSize(SDValue Arg, ISD::ArgFlagsTy Flags,
+ bool isVarArg, unsigned PtrByteSize) {
+ MVT ArgVT = Arg.getValueType();
+ unsigned ArgSize = ArgVT.getSizeInBits()/8;
+ if (Flags.isByVal())
+ ArgSize = Flags.getByValSize();
+ ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+ return ArgSize;
+}
+
+SDValue
+PPCTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op,
+ SelectionDAG &DAG,
+ int &VarArgsFrameIndex,
+ int &VarArgsStackOffset,
+ unsigned &VarArgsNumGPR,
+ unsigned &VarArgsNumFPR,
+ const PPCSubtarget &Subtarget) {
+ // TODO: add description of PPC stack frame format, or at least some docs.
+ //
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ SmallVector<SDValue, 8> ArgValues;
+ SDValue Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ DebugLoc dl = Op.getDebugLoc();
+
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ bool isPPC64 = PtrVT == MVT::i64;
+ bool isMachoABI = Subtarget.isMachoABI();
+ bool isELF32_ABI = Subtarget.isELF32_ABI();
+ // Potential tail calls could cause overwriting of argument stack slots.
+ unsigned CC = MF.getFunction()->getCallingConv();
+ bool isImmutable = !(PerformTailCallOpt && (CC==CallingConv::Fast));
+ unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+ unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI);
+ // Area that is at least reserved in the caller of this function.
+ unsigned MinReservedArea = ArgOffset;
+
+ static const unsigned GPR_32[] = { // 32-bit registers.
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+ };
+ static const unsigned GPR_64[] = { // 64-bit registers.
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+
+ static const unsigned *FPR = GetFPR(Subtarget);
+
+ static const unsigned VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+
+ const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
+ const unsigned Num_FPR_Regs = isMachoABI ? 13 : 8;
+ const unsigned Num_VR_Regs = array_lengthof(VR);
+
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+ const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+ // In 32-bit non-varargs functions, the stack space for vectors is after the
+ // stack space for non-vectors. We do not use this space unless we have
+ // too many vectors to fit in registers, something that only occurs in
+ // constructed examples, but we have to walk the arglist to figure that out
+ // for the pathological case: compute VecArgOffset as the start of the
+ // vector parameter area. Computing VecArgOffset is the entire point of
+ // the following loop.
+ // Altivec is not mentioned in the ppc32 Elf Supplement, so I'm not trying
+ // to handle Elf here.
+ unsigned VecArgOffset = ArgOffset;
+ if (!isVarArg && !isPPC64) {
+ for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues()-1; ArgNo != e;
+ ++ArgNo) {
+ MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+ unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+ ISD::ArgFlagsTy Flags =
+ cast<ARG_FLAGSSDNode>(Op.getOperand(ArgNo+3))->getArgFlags();
+
+ if (Flags.isByVal()) {
+ // ObjSize is the true size, ArgSize rounded up to multiple of regs.
+ ObjSize = Flags.getByValSize();
+ unsigned ArgSize =
+ ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ VecArgOffset += ArgSize;
+ continue;
+ }
+
+ switch(ObjectVT.getSimpleVT()) {
+ default: assert(0 && "Unhandled argument type!");
+ case MVT::i32:
+ case MVT::f32:
+ VecArgOffset += isPPC64 ? 8 : 4;
+ break;
+ case MVT::i64: // PPC64
+ case MVT::f64:
+ VecArgOffset += 8;
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ // Nothing to do, we're only looking at non-vector args here.
+ break;
+ }
+ }
+ }
+ // We've found where the vector parameter area in memory is. Skip the
+ // first 12 parameters; these don't use that memory.
+ VecArgOffset = ((VecArgOffset+15)/16)*16;
+ VecArgOffset += 12*16;
+
+ // Add DAG nodes to load the arguments or copy them out of registers. On
+ // entry to a function on PPC, the arguments start after the linkage area,
+ // although the first ones are often in registers.
+ //
+ // In the ELF 32 ABI, GPRs and the stack are double-word aligned: an argument
+ // represented with two words (long long or double) must be copied to an
+ // even GPR_idx value or to an even ArgOffset value.
+
+ SmallVector<SDValue, 8> MemOps;
+ unsigned nAltivecParamsAtEnd = 0;
+ for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues() - 1;
+ ArgNo != e; ++ArgNo) {
+ SDValue ArgVal;
+ bool needsLoad = false;
+ MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+ unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+ unsigned ArgSize = ObjSize;
+ ISD::ArgFlagsTy Flags =
+ cast<ARG_FLAGSSDNode>(Op.getOperand(ArgNo+3))->getArgFlags();
+ // See if next argument requires stack alignment in ELF
+ bool Align = Flags.isSplit();
+
+ unsigned CurArgOffset = ArgOffset;
+
+ // In varargs functions or the 64-bit ABI, Altivec parameters are padded to
+ // a 16-byte boundary.
+ if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+ ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+ if (isVarArg || isPPC64) {
+ MinReservedArea = ((MinReservedArea+15)/16)*16;
+ MinReservedArea += CalculateStackSlotSize(Op.getValue(ArgNo),
+ Flags,
+ isVarArg,
+ PtrByteSize);
+ } else nAltivecParamsAtEnd++;
+ } else
+ // Calculate min reserved area.
+ MinReservedArea += CalculateStackSlotSize(Op.getValue(ArgNo),
+ Flags,
+ isVarArg,
+ PtrByteSize);
+
+ // FIXME alignment for ELF may not be right
+ // FIXME the codegen can be much improved in some cases.
+ // We do not have to keep everything in memory.
+ if (Flags.isByVal()) {
+ // ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple of registers.
+ ObjSize = Flags.getByValSize();
+ ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ // Double word align in ELF
+ if (Align && isELF32_ABI) GPR_idx += (GPR_idx % 2);
+ // Objects of size 1 and 2 are right-justified; everything else is
+ // left-justified. This means the memory address is adjusted forwards.
+ if (ObjSize==1 || ObjSize==2) {
+ CurArgOffset = CurArgOffset + (4 - ObjSize);
+ }
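+ // e.g. a 1-byte byval nominally at offset 24 is addressed at 27 so that,
+ // on big-endian PPC, the byte sits at the low-order end of its word.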
+ // The value of the object is its address.
+ int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgValues.push_back(FIN);
+ if (ObjSize==1 || ObjSize==2) {
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ RegInfo.addLiveIn(GPR[GPR_idx], VReg);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, PtrVT);
+ SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+ NULL, 0, ObjSize==1 ? MVT::i8 : MVT::i16 );
+ MemOps.push_back(Store);
+ ++GPR_idx;
+ if (isMachoABI) ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += PtrByteSize;
+ }
+ continue;
+ }
+ for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+ // Store whatever pieces of the object are in registers
+ // to memory. ArgVal will be address of the beginning of
+ // the object.
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ RegInfo.addLiveIn(GPR[GPR_idx], VReg);
+ int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, PtrVT);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ ++GPR_idx;
+ if (isMachoABI) ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
+ break;
+ }
+ }
+ continue;
+ }
+
+ switch (ObjectVT.getSimpleVT()) {
+ default: assert(0 && "Unhandled argument type!");
+ case MVT::i32:
+ if (!isPPC64) {
+ // Double word align in ELF
+ if (Align && isELF32_ABI) GPR_idx += (GPR_idx % 2);
+
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ RegInfo.addLiveIn(GPR[GPR_idx], VReg);
+ ArgVal = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ ++GPR_idx;
+ } else {
+ needsLoad = true;
+ ArgSize = PtrByteSize;
+ }
+ // Stack align in ELF
+ if (needsLoad && Align && isELF32_ABI)
+ ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
+ // All int arguments reserve stack space in Macho ABI.
+ if (isMachoABI || needsLoad) ArgOffset += PtrByteSize;
+ break;
+ }
+ // FALLTHROUGH
+ case MVT::i64: // PPC64
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ RegInfo.addLiveIn(GPR[GPR_idx], VReg);
+ ArgVal = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::i32) {
+ // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+ // value to MVT::i64 and then truncate to the correct register size.
+ if (Flags.isSExt())
+ ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
+ DAG.getValueType(ObjectVT));
+ else if (Flags.isZExt())
+ ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
+ DAG.getValueType(ObjectVT));
+
+ ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+ }
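+ // e.g. a signext i32 arriving in an i64 GPR becomes
+ // (truncate (AssertSext reg, i32)), letting later passes rely on the
+ // upper 32 bits being a sign extension without re-extending.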
+
+ ++GPR_idx;
+ } else {
+ needsLoad = true;
+ ArgSize = PtrByteSize;
+ }
+ // All int arguments reserve stack space in Macho ABI.
+ if (isMachoABI || needsLoad) ArgOffset += 8;
+ break;
+
+ case MVT::f32:
+ case MVT::f64:
+ // Every 4 bytes of argument space consumes one of the GPRs available for
+ // argument passing.
+ if (GPR_idx != Num_GPR_Regs && isMachoABI) {
+ ++GPR_idx;
+ if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
+ ++GPR_idx;
+ }
+ if (FPR_idx != Num_FPR_Regs) {
+ unsigned VReg;
+ if (ObjectVT == MVT::f32)
+ VReg = RegInfo.createVirtualRegister(&PPC::F4RCRegClass);
+ else
+ VReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+ RegInfo.addLiveIn(FPR[FPR_idx], VReg);
+ ArgVal = DAG.getCopyFromReg(Root, dl, VReg, ObjectVT);
+ ++FPR_idx;
+ } else {
+ needsLoad = true;
+ }
+
+ // Stack align in ELF
+ if (needsLoad && Align && isELF32_ABI)
+ ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
+ // All FP arguments reserve stack space in Macho ABI.
+ if (isMachoABI || needsLoad) ArgOffset += isPPC64 ? 8 : ObjSize;
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ // Note that vector arguments in registers don't reserve stack space,
+ // except in varargs functions.
+ if (VR_idx != Num_VR_Regs) {
+ unsigned VReg = RegInfo.createVirtualRegister(&PPC::VRRCRegClass);
+ RegInfo.addLiveIn(VR[VR_idx], VReg);
+ ArgVal = DAG.getCopyFromReg(Root, dl, VReg, ObjectVT);
+ if (isVarArg) {
+ while ((ArgOffset % 16) != 0) {
+ ArgOffset += PtrByteSize;
+ if (GPR_idx != Num_GPR_Regs)
+ GPR_idx++;
+ }
+ ArgOffset += 16;
+ GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs);
+ }
+ ++VR_idx;
+ } else {
+ if (!isVarArg && !isPPC64) {
+ // Vectors go after all the non-vectors.
+ CurArgOffset = VecArgOffset;
+ VecArgOffset += 16;
+ } else {
+ // Vectors are aligned.
+ ArgOffset = ((ArgOffset+15)/16)*16;
+ CurArgOffset = ArgOffset;
+ ArgOffset += 16;
+ }
+ needsLoad = true;
+ }
+ break;
+ }
+
+ // We need to load the argument to a virtual register if we determined above
+ // that we ran out of physical registers of the appropriate type.
+ if (needsLoad) {
+ int FI = MFI->CreateFixedObject(ObjSize,
+ CurArgOffset + (ArgSize - ObjSize),
+ isImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Root, FIN, NULL, 0);
+ }
+
+ ArgValues.push_back(ArgVal);
+ }
+
+ // Set the size that is at least reserved in the caller of this function. A
+ // tail-call-optimized function's reserved stack space needs to be aligned so
+ // that taking the difference between two stack areas will result in an
+ // aligned stack.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ // Add the Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ MinReservedArea = ((MinReservedArea+15)/16)*16;
+ MinReservedArea += 16*nAltivecParamsAtEnd;
+ }
+ MinReservedArea =
+ std::max(MinReservedArea,
+ PPCFrameInfo::getMinCallFrameSize(isPPC64, isMachoABI));
+ unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameInfo()->
+ getStackAlignment();
+ unsigned AlignMask = TargetAlign-1;
+ MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
+ FI->setMinReservedArea(MinReservedArea);
+
+ // If the function takes a variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+
+ int depth;
+ if (isELF32_ABI) {
+ VarArgsNumGPR = GPR_idx;
+ VarArgsNumFPR = FPR_idx;
+
+ // Make room for Num_GPR_Regs, Num_FPR_Regs and for a possible frame
+ // pointer.
+ depth = -(Num_GPR_Regs * PtrVT.getSizeInBits()/8 +
+ Num_FPR_Regs * MVT(MVT::f64).getSizeInBits()/8 +
+ PtrVT.getSizeInBits()/8);
+
+ VarArgsStackOffset = MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+ ArgOffset);
+
+ }
+ else
+ depth = ArgOffset;
+
+ VarArgsFrameIndex = MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+ depth);
+ SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+
+ // In ELF 32 ABI, the fixed integer arguments of a variadic function are
+ // stored to the VarArgsFrameIndex on the stack.
+ if (isELF32_ABI) {
+ for (GPR_idx = 0; GPR_idx != VarArgsNumGPR; ++GPR_idx) {
+ SDValue Val = DAG.getRegister(GPR[GPR_idx], PtrVT);
+ SDValue Store = DAG.getStore(Root, dl, Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ // Increment the address by four for the next argument to store
+ SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+ }
+
+ // If this function is vararg, store any remaining integer argument regs
+ // to their spots on the stack so that they may be loaded by dereferencing the
+ // result of va_next.
+ for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+ unsigned VReg;
+ if (isPPC64)
+ VReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ else
+ VReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+
+ RegInfo.addLiveIn(GPR[GPR_idx], VReg);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, PtrVT);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ // Increment the address by four for the next argument to store
+ SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+
+ // In ELF 32 ABI, the double arguments are stored to the VarArgsFrameIndex
+ // on the stack.
+ if (isELF32_ABI) {
+ for (FPR_idx = 0; FPR_idx != VarArgsNumFPR; ++FPR_idx) {
+ SDValue Val = DAG.getRegister(FPR[FPR_idx], MVT::f64);
+ SDValue Store = DAG.getStore(Root, dl, Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ // Increment the address by eight for the next argument to store
+ SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8,
+ PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+
+ for (; FPR_idx != Num_FPR_Regs; ++FPR_idx) {
+ unsigned VReg;
+ VReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+
+ RegInfo.addLiveIn(FPR[FPR_idx], VReg);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::f64);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ // Increment the address by eight for the next argument to store
+ SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8,
+ PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+ }
+ }
+
+ if (!MemOps.empty())
+ Root = DAG.getNode(ISD::TokenFactor, dl,
+ MVT::Other, &MemOps[0], MemOps.size());
+
+ ArgValues.push_back(Root);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size());
+}
+
+ /// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
+/// linkage area.
+static unsigned
+CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
+ bool isPPC64,
+ bool isMachoABI,
+ bool isVarArg,
+ unsigned CC,
+ CallSDNode *TheCall,
+ unsigned &nAltivecParamsAtEnd) {
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area, and parameter passing area. We start with 24/48 bytes, which is
+ // prereserved space for [SP][CR][LR][3 x unused].
+ unsigned NumBytes = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI);
+ unsigned NumOps = TheCall->getNumArgs();
+ unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+ // Add up all the space actually used.
+ // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+ // they all go in registers, but we must reserve stack space for them for
+ // possible use by the caller. In varargs or 64-bit calls, parameters are
+ // assigned stack space in order, with padding so Altivec parameters are
+ // 16-byte aligned.
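+ // Rough illustrative example (slot sizes come from CalculateStackSlotSize):
+ // a 64-bit call f(i32 a, <4 x i32> v) starts with the 48-byte linkage area,
+ // adds 8 for the i32 slot (56), rounds up to 64 for the vector, and adds
+ // 16, giving NumBytes = 80 before the minimum-frame-size clamp below.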
+ nAltivecParamsAtEnd = 0;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = TheCall->getArg(i);
+ ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ MVT ArgVT = Arg.getValueType();
+ // Varargs Altivec parameters are padded to a 16 byte boundary.
+ if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 ||
+ ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8) {
+ if (!isVarArg && !isPPC64) {
+ // Non-varargs Altivec parameters go after all the non-Altivec
+ // parameters; handle those later so we know how much padding we need.
+ nAltivecParamsAtEnd++;
+ continue;
+ }
+ // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
+ NumBytes = ((NumBytes+15)/16)*16;
+ }
+ NumBytes += CalculateStackSlotSize(Arg, Flags, isVarArg, PtrByteSize);
+ }
+
+ // Allow for Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ NumBytes = ((NumBytes+15)/16)*16;
+ NumBytes += 16*nAltivecParamsAtEnd;
+ }
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if it is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ NumBytes = std::max(NumBytes,
+ PPCFrameInfo::getMinCallFrameSize(isPPC64, isMachoABI));
+
+ // Tail call needs the stack to be aligned.
+ if (CC==CallingConv::Fast && PerformTailCallOpt) {
+ unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameInfo()->
+ getStackAlignment();
+ unsigned AlignMask = TargetAlign-1;
+ NumBytes = (NumBytes + AlignMask) & ~AlignMask;
+ }
+
+ return NumBytes;
+}
+
+/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
+ /// adjusted to accommodate the arguments for the tail call.
+static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool IsTailCall,
+ unsigned ParamSize) {
+
+ if (!IsTailCall) return 0;
+
+ PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+ unsigned CallerMinReservedArea = FI->getMinReservedArea();
+ int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
+ // Remember only if the new adjustment is bigger.
+ if (SPDiff < FI->getTailCallSPDelta())
+ FI->setTailCallSPDelta(SPDiff);
+
+ return SPDiff;
+}
+
+ /// IsEligibleForTailCallOptimization - Check to see whether the next
+ /// instruction following the call is a return. A function is eligible if the
+ /// caller/callee calling conventions match (currently only fastcc supports
+ /// tail calls) and the function CALL is immediately followed by a RET.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
+ SDValue Ret,
+ SelectionDAG& DAG) const {
+ // Variable argument functions are not supported.
+ if (!PerformTailCallOpt || TheCall->isVarArg())
+ return false;
+
+ if (CheckTailCallReturnConstraints(TheCall, Ret)) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned CallerCC = MF.getFunction()->getCallingConv();
+ unsigned CalleeCC = TheCall->getCallingConv();
+ if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+ // Functions containing by val parameters are not supported.
+ for (unsigned i = 0; i != TheCall->getNumArgs(); i++) {
+ ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ if (Flags.isByVal()) return false;
+ }
+
+ SDValue Callee = TheCall->getCallee();
+ // Non-PIC/GOT tail calls are supported.
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+ return true;
+
+ // At the moment we can only do local tail calls (in same module, hidden
+ // or protected) if we are generating PIC.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ return G->getGlobal()->hasHiddenVisibility()
+ || G->getGlobal()->hasProtectedVisibility();
+ }
+ }
+
+ return false;
+}
+
+ /// isBLACompatibleAddress - Return the immediate to use if the specified
+/// 32-bit value is representable in the immediate field of a BxA instruction.
+static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C) return 0;
+
+ int Addr = C->getZExtValue();
+ if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
+ (Addr << 6 >> 6) != Addr)
+ return 0; // Top 6 bits have to be sext of immediate.
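+ // e.g. Addr = 0x00001000 passes both checks and yields the immediate 0x400,
+ // while Addr = 0x02000000 fails: (0x02000000 << 6) >> 6 sign-extends to
+ // 0xfe000000, so it is not a sign-extended 26-bit field.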
+
+ return DAG.getConstant((int)C->getZExtValue() >> 2,
+ DAG.getTargetLoweringInfo().getPointerTy()).getNode();
+}
+
+namespace {
+
+struct TailCallArgumentInfo {
+ SDValue Arg;
+ SDValue FrameIdxOp;
+ int FrameIdx;
+
+ TailCallArgumentInfo() : FrameIdx(0) {}
+};
+
+}
+
+/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
+static void
+StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
+ SDValue Chain,
+ const SmallVector<TailCallArgumentInfo, 8> &TailCallArgs,
+ SmallVector<SDValue, 8> &MemOpChains,
+ DebugLoc dl) {
+ for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
+ SDValue Arg = TailCallArgs[i].Arg;
+ SDValue FIN = TailCallArgs[i].FrameIdxOp;
+ int FI = TailCallArgs[i].FrameIdx;
+ // Store relative to frame pointer.
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN,
+ PseudoSourceValue::getFixedStack(FI),
+ 0));
+ }
+}
+
+/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
+/// the appropriate stack slot for the tail call optimized function call.
+static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
+ MachineFunction &MF,
+ SDValue Chain,
+ SDValue OldRetAddr,
+ SDValue OldFP,
+ int SPDiff,
+ bool isPPC64,
+ bool isMachoABI,
+ DebugLoc dl) {
+ if (SPDiff) {
+ // Calculate the new stack slot for the return address.
+ int SlotSize = isPPC64 ? 8 : 4;
+ int NewRetAddrLoc = SPDiff + PPCFrameInfo::getReturnSaveOffset(isPPC64,
+ isMachoABI);
+ int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+ NewRetAddrLoc);
+ int NewFPLoc = SPDiff + PPCFrameInfo::getFramePointerSaveOffset(isPPC64,
+ isMachoABI);
+ int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc);
+
+ MVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
+ Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
+ PseudoSourceValue::getFixedStack(NewRetAddr), 0);
+ SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
+ Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
+ PseudoSourceValue::getFixedStack(NewFPIdx), 0);
+ }
+ return Chain;
+}
+
+ /// CalculateTailCallArgDest - Remember the argument for later processing and
+ /// calculate the position of the argument.
+static void
+CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
+ SDValue Arg, int SPDiff, unsigned ArgOffset,
+ SmallVector<TailCallArgumentInfo, 8>& TailCallArguments) {
+ int Offset = ArgOffset + SPDiff;
+ uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8;
+ int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+ MVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+ SDValue FIN = DAG.getFrameIndex(FI, VT);
+ TailCallArgumentInfo Info;
+ Info.Arg = Arg;
+ Info.FrameIdxOp = FIN;
+ Info.FrameIdx = FI;
+ TailCallArguments.push_back(Info);
+}
+
+ /// EmitTailCallLoadFPAndRetAddr - Emit loads from the frame pointer and
+ /// return address stack slots. Returns the chain as result and the loaded
+ /// values in LROpOut/FPOpOut. Used when tail calling.
+SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
+ int SPDiff,
+ SDValue Chain,
+ SDValue &LROpOut,
+ SDValue &FPOpOut,
+ DebugLoc dl) {
+ if (SPDiff) {
+ // Load the LR and FP stack slot for later adjusting.
+ MVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
+ LROpOut = getReturnAddrFrameIndex(DAG);
+ LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, NULL, 0);
+ Chain = SDValue(LROpOut.getNode(), 1);
+ FPOpOut = getFramePointerFrameIndex(DAG);
+ FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, NULL, 0);
+ Chain = SDValue(FPOpOut.getNode(), 1);
+ }
+ return Chain;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" of size "Size". Alignment information is
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.
+/// Sometimes what we are copying is the end of a larger object, the part that
+/// does not fit in registers.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ unsigned Size, DebugLoc dl) {
+ SDValue SizeNode = DAG.getConstant(Size, MVT::i32);
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+ false, NULL, 0, NULL, 0);
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
+/// tail calls.
+static void
+LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
+ SDValue Arg, SDValue PtrOff, int SPDiff,
+ unsigned ArgOffset, bool isPPC64, bool isTailCall,
+ bool isVector, SmallVector<SDValue, 8> &MemOpChains,
+ SmallVector<TailCallArgumentInfo, 8>& TailCallArguments,
+ DebugLoc dl) {
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ if (!isTailCall) {
+ if (isVector) {
+ SDValue StackPtr;
+ if (isPPC64)
+ StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+ else
+ StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+ DAG.getConstant(ArgOffset, PtrVT));
+ }
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+ // Calculate and remember argument location.
+ } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
+ TailCallArguments);
+}
+
+SDValue PPCTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget,
+ TargetMachine &TM) {
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ SDValue Chain = TheCall->getChain();
+ bool isVarArg = TheCall->isVarArg();
+ unsigned CC = TheCall->getCallingConv();
+ bool isTailCall = TheCall->isTailCall()
+ && CC == CallingConv::Fast && PerformTailCallOpt;
+ SDValue Callee = TheCall->getCallee();
+ unsigned NumOps = TheCall->getNumArgs();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ bool isMachoABI = Subtarget.isMachoABI();
+ bool isELF32_ABI = Subtarget.isELF32_ABI();
+
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ bool isPPC64 = PtrVT == MVT::i64;
+ unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // args_to_use will accumulate outgoing args for the PPCISD::CALL case in
+ // SelectExpr to use to put the arguments in the appropriate registers.
+ std::vector<SDValue> args_to_use;
+
+ // Mark this function as potentially containing a tail call. As a consequence
+ // the frame pointer will be used for dynamic stack allocation and for
+ // restoring the caller's stack pointer in this function's epilogue. This is
+ // done because a tail-called function might overwrite the value in this
+ // function's (MF) stack pointer save slot, 0(SP).
+ if (PerformTailCallOpt && CC==CallingConv::Fast)
+ MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+ unsigned nAltivecParamsAtEnd = 0;
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area, and parameter passing area. We start with 24/48 bytes, which is
+ // prereserved space for [SP][CR][LR][3 x unused].
+ unsigned NumBytes =
+ CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isMachoABI, isVarArg, CC,
+ TheCall, nAltivecParamsAtEnd);
+
+ // Calculate by how many bytes the stack has to be adjusted in case of tail
+ // call optimization.
+ int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ SDValue CallSeqStart = Chain;
+
+ // Load the return address and frame pointer so they can be moved somewhere
+ // else later.
+ SDValue LROp, FPOp;
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
+
+ // Set up a copy of the stack pointer for use loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ SDValue StackPtr;
+ if (isPPC64)
+ StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+ else
+ StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+ // Figure out which arguments are going to go in registers, and which in
+ // memory. Also, if this is a vararg function, floating point operations
+ // must be stored to our stack, and loaded into integer regs as well, if
+ // any integer regs are available for argument passing.
+ unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI);
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+ static const unsigned GPR_32[] = { // 32-bit registers.
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+ };
+ static const unsigned GPR_64[] = { // 64-bit registers.
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const unsigned *FPR = GetFPR(Subtarget);
+
+ static const unsigned VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+ const unsigned NumGPRs = array_lengthof(GPR_32);
+ const unsigned NumFPRs = isMachoABI ? 13 : 8;
+ const unsigned NumVRs = array_lengthof(VR);
+
+ const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+ std::vector<std::pair<unsigned, SDValue> > RegsToPass;
+ SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+ SmallVector<SDValue, 8> MemOpChains;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ bool inMem = false;
+ SDValue Arg = TheCall->getArg(i);
+ ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ // See if next argument requires stack alignment in ELF
+ bool Align = Flags.isSplit();
+
+ // PtrOff will be used to store the current argument to the stack if a
+ // register cannot be found for it.
+ SDValue PtrOff;
+
+ // Stack align in ELF 32
+ if (isELF32_ABI && Align)
+ PtrOff = DAG.getConstant(ArgOffset + ((ArgOffset/4) % 2) * PtrByteSize,
+ StackPtr.getValueType());
+ else
+ PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+ // On PPC64, promote integers to 64-bit values.
+ if (isPPC64 && Arg.getValueType() == MVT::i32) {
+ // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+ unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+ }
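+ // e.g. a zeroext i32 argument of 0xffffffff is widened here to the i64
+ // 0x00000000ffffffff before being assigned to a 64-bit GPR.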
+
+ // FIXME Elf untested, what are alignment rules?
+ // FIXME memcpy is used way more than necessary. Correctness first.
+ if (Flags.isByVal()) {
+ unsigned Size = Flags.getByValSize();
+ if (isELF32_ABI && Align) GPR_idx += (GPR_idx % 2);
+ if (Size==1 || Size==2) {
+ // Very small objects are passed right-justified.
+ // Everything else is passed left-justified.
+ MVT VT = (Size==1) ? MVT::i8 : MVT::i16;
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+ NULL, 0, VT);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ if (isMachoABI)
+ ArgOffset += PtrByteSize;
+ } else {
+ SDValue Const = DAG.getConstant(4 - Size, PtrOff.getValueType());
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr,
+ CallSeqStart.getNode()->getOperand(0),
+ Flags, DAG, Size, dl);
+ // This must go outside the CALLSEQ_START..END.
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+ CallSeqStart.getNode()->getOperand(1));
+ DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+ NewCallSeqStart.getNode());
+ Chain = CallSeqStart = NewCallSeqStart;
+ ArgOffset += PtrByteSize;
+ }
+ continue;
+ }
+ // Copy entire object into memory. There are cases where gcc-generated
+ // code assumes it is there, even if it could be put entirely into
+ // registers. (This is not what the doc says.)
+ SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+ CallSeqStart.getNode()->getOperand(0),
+ Flags, DAG, Size, dl);
+ // This must go outside the CALLSEQ_START..END.
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+ CallSeqStart.getNode()->getOperand(1));
+ DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode());
+ Chain = CallSeqStart = NewCallSeqStart;
+ // And copy the pieces of it that fit into registers.
+ for (unsigned j=0; j<Size; j+=PtrByteSize) {
+ SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+ SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, NULL, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ if (isMachoABI)
+ ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+ break;
+ }
+ }
+ continue;
+ }
+
+ switch (Arg.getValueType().getSimpleVT()) {
+ default: assert(0 && "Unexpected ValueType for argument!");
+ case MVT::i32:
+ case MVT::i64:
+ // Double word align in ELF
+ if (isELF32_ABI && Align) GPR_idx += (GPR_idx % 2);
+ if (GPR_idx != NumGPRs) {
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+ } else {
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ isPPC64, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+ inMem = true;
+ }
+ if (inMem || isMachoABI) {
+ // Stack align in ELF
+ if (isELF32_ABI && Align)
+ ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
+
+ ArgOffset += PtrByteSize;
+ }
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ if (FPR_idx != NumFPRs) {
+ RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+ if (isVarArg) {
+ SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0);
+ MemOpChains.push_back(Store);
+
+ // Float varargs are always shadowed in available integer registers
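+ // (Illustrative sketch: a vararg f64 on 32-bit is stored to its stack
+ // slot above, then reloaded word-by-word into the next free GPRs below,
+ // so the callee's va_arg can fetch it from either place.)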
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, NULL, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ if (isMachoABI) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++],
+ Load));
+ }
+ if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
+ SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+ SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, NULL, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ if (isMachoABI) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++],
+ Load));
+ }
+ } else {
+ // If we have any FPRs remaining, we may also have GPRs remaining.
+ // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
+ // GPRs.
+ if (isMachoABI) {
+ if (GPR_idx != NumGPRs)
+ ++GPR_idx;
+ if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
+ !isPPC64) // PPC64 has 64-bit GPR's obviously :)
+ ++GPR_idx;
+ }
+ }
+ } else {
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ isPPC64, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+ inMem = true;
+ }
+ if (inMem || isMachoABI) {
+ // Stack align in ELF
+ if (isELF32_ABI && Align)
+ ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
+ if (isPPC64)
+ ArgOffset += 8;
+ else
+ ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
+ }
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ if (isVarArg) {
+ // These go aligned on the stack, or in the corresponding R registers
+ // when within range. The Darwin PPC ABI doc claims they also go in V
+ // registers; in fact gcc does this only for arguments that are
+ // prototyped, not for those that match the '...'. We do it for all
+ // arguments, which seems to work.
+ while (ArgOffset % 16 != 0) {
+ ArgOffset += PtrByteSize;
+ if (GPR_idx != NumGPRs)
+ GPR_idx++;
+ }
+ // We could elide this store in the case where the object fits
+ // entirely in R registers. Maybe later.
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+ DAG.getConstant(ArgOffset, PtrVT));
+ SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0);
+ MemOpChains.push_back(Store);
+ if (VR_idx != NumVRs) {
+ SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, NULL, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+ }
+ ArgOffset += 16;
+ for (unsigned i=0; i<16; i+=PtrByteSize) {
+ if (GPR_idx == NumGPRs)
+ break;
+ SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+ DAG.getConstant(i, PtrVT));
+ SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, NULL, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ }
+ break;
+ }
+
+ // Non-varargs Altivec params generally go in registers, but have
+ // stack space allocated at the end.
+ if (VR_idx != NumVRs) {
+ // Doesn't have GPR space allocated.
+ RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+ } else if (nAltivecParamsAtEnd==0) {
+ // We are emitting Altivec params in order.
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ isPPC64, isTailCall, true, MemOpChains,
+ TailCallArguments, dl);
+ ArgOffset += 16;
+ }
+ break;
+ }
+ }
+ // If all Altivec parameters fit in registers, as they usually do,
+ // they get stack space following the non-Altivec parameters. We
+ // don't track this here because nobody below needs it.
+ // If there are more Altivec parameters than fit in registers, emit
+ // the stores here.
+ if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
+ unsigned j = 0;
+ // Offset is aligned; skip the first 12 params, which go in V registers.
+ ArgOffset = ((ArgOffset+15)/16)*16;
+ ArgOffset += 12*16;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = TheCall->getArg(i);
+ MVT ArgType = Arg.getValueType();
+ if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
+ ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
+ if (++j > NumVRs) {
+ SDValue PtrOff;
+ // We are emitting Altivec params in order.
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ isPPC64, isTailCall, true, MemOpChains,
+ TailCallArguments, dl);
+ ArgOffset += 16;
+ }
+ }
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // With the ELF 32 ABI, set CR6 to true if this is a vararg call.
+ if (isVarArg && isELF32_ABI) {
+ SDValue SetCR(DAG.getTargetNode(PPC::CRSET, dl, MVT::i32), 0);
+ Chain = DAG.getCopyToReg(Chain, dl, PPC::CR1EQ, SetCR, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Emit a sequence of copyto/copyfrom virtual registers for arguments that
+ // might overwrite each other in case of tail call optimization.
+ if (isTailCall) {
+ SmallVector<SDValue, 8> MemOpChains2;
+ // Do not flag preceding copytoreg stuff together with the following stuff.
+ InFlag = SDValue();
+ StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
+ MemOpChains2, dl);
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains2[0], MemOpChains2.size());
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff,
+ isPPC64, isMachoABI, dl);
+ }
+
+ // Emit callseq_end just before tailcall node.
+ if (isTailCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<MVT> NodeTys;
+ NodeTys.push_back(MVT::Other); // Returns a chain
+ NodeTys.push_back(MVT::Flag); // Returns a flag for retval copy to use.
+
+ SmallVector<SDValue, 8> Ops;
+ unsigned CallOpc = isMachoABI? PPCISD::CALL_Macho : PPCISD::CALL_ELF;
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), Callee.getValueType());
+ else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType());
+ else if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
+ // If this is an absolute destination address, use the munged value.
+ Callee = SDValue(Dest, 0);
+ else {
+ // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair
+ // to do the call, we can't use PPCISD::CALL.
+ SDValue MTCTROps[] = {Chain, Callee, InFlag};
+ Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, MTCTROps,
+ 2 + (InFlag.getNode() != 0));
+ InFlag = Chain.getValue(1);
+
+ // Copy the callee address into R12/X12 on Darwin.
+ if (isMachoABI) {
+ unsigned Reg = Callee.getValueType() == MVT::i32 ? PPC::R12 : PPC::X12;
+ Chain = DAG.getCopyToReg(Chain, dl, Reg, Callee, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ NodeTys.clear();
+ NodeTys.push_back(MVT::Other);
+ NodeTys.push_back(MVT::Flag);
+ Ops.push_back(Chain);
+ CallOpc = isMachoABI ? PPCISD::BCTRL_Macho : PPCISD::BCTRL_ELF;
+ Callee.setNode(0);
+ // Add CTR register as callee so a bctr can be emitted later.
+ if (isTailCall)
+ Ops.push_back(DAG.getRegister(PPC::CTR, getPointerTy()));
+ }
+
+ // If this is a direct call, pass the chain and the callee.
+ if (Callee.getNode()) {
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+ }
+ // If this is a tail call add stack pointer delta.
+ if (isTailCall)
+ Ops.push_back(DAG.getConstant(SPDiff, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // When performing tail call optimization the callee pops its arguments off
+ // the stack. Account for this here so these bytes can be pushed back on in
+ // PPCRegisterInfo::eliminateCallFramePseudoInstr.
+ int BytesCalleePops =
+ (CC==CallingConv::Fast && PerformTailCallOpt) ? NumBytes : 0;
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ // Emit tail call.
+ if (isTailCall) {
+ assert(InFlag.getNode() &&
+ "Flag must be set. Depend on flag being set in LowerRET");
+ Chain = DAG.getNode(PPCISD::TAILCALL, dl,
+ TheCall->getVTList(), &Ops[0], Ops.size());
+ return SDValue(Chain.getNode(), Op.getResNo());
+ }
+
+ Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(BytesCalleePops, true),
+ InFlag);
+ if (TheCall->getValueType(0) != MVT::Other)
+ InFlag = Chain.getValue(1);
+
+ SmallVector<SDValue, 16> ResultVals;
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CallerCC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ CCState CCInfo(CallerCC, isVarArg, TM, RVLocs);
+ CCInfo.AnalyzeCallResult(TheCall, RetCC_PPC);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ MVT VT = VA.getValVT();
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ Chain = DAG.getCopyFromReg(Chain, dl,
+ VA.getLocReg(), VT, InFlag).getValue(1);
+ ResultVals.push_back(Chain.getValue(0));
+ InFlag = Chain.getValue(2);
+ }
+
+ // If the function returns void, just return the chain.
+ if (RVLocs.empty())
+ return Chain;
+
+ // Otherwise, merge everything together with a MERGE_VALUES node.
+ ResultVals.push_back(Chain);
+ SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
+ &ResultVals[0], ResultVals.size());
+ return Res.getValue(Op.getResNo());
+}
+
+SDValue PPCTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG,
+ TargetMachine &TM) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ DebugLoc dl = Op.getDebugLoc();
+ CCState CCInfo(CC, isVarArg, TM, RVLocs);
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_PPC);
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ SDValue Chain = Op.getOperand(0);
+
+ Chain = GetPossiblePreceedingTailCall(Chain, PPCISD::TAILCALL);
+ if (Chain.getOpcode() == PPCISD::TAILCALL) {
+ SDValue TailCall = Chain;
+ SDValue TargetAddress = TailCall.getOperand(1);
+ SDValue StackAdjustment = TailCall.getOperand(2);
+
+ assert(((TargetAddress.getOpcode() == ISD::Register &&
+ cast<RegisterSDNode>(TargetAddress)->getReg() == PPC::CTR) ||
+ TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
+ TargetAddress.getOpcode() == ISD::TargetGlobalAddress ||
+ isa<ConstantSDNode>(TargetAddress)) &&
+ "Expecting an global address, external symbol, absolute value or register");
+
+ assert(StackAdjustment.getOpcode() == ISD::Constant &&
+ "Expecting a const value");
+
+ SmallVector<SDValue,8> Operands;
+ Operands.push_back(Chain.getOperand(0));
+ Operands.push_back(TargetAddress);
+ Operands.push_back(StackAdjustment);
+ // Copy registers used by the call. Last operand is a flag so it is not
+ // copied.
+ for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) {
+ Operands.push_back(Chain.getOperand(i));
+ }
+ return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Operands[0],
+ Operands.size());
+ }
+
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ Op.getOperand(i*2+1), Flag);
+ Flag = Chain.getValue(1);
+ }
+
+ if (Flag.getNode())
+ return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+ else
+ return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) {
+ // When we pop the dynamic allocation we need to restore the SP link.
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Get the correct type for pointers.
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ // Construct the stack pointer operand.
+ bool IsPPC64 = Subtarget.isPPC64();
+ unsigned SP = IsPPC64 ? PPC::X1 : PPC::R1;
+ SDValue StackPtr = DAG.getRegister(SP, PtrVT);
+
+ // Get the operands for the STACKRESTORE.
+ SDValue Chain = Op.getOperand(0);
+ SDValue SaveSP = Op.getOperand(1);
+
+ // Load the old link SP.
+ SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr, NULL, 0);
+
+ // Restore the stack pointer.
+ Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
+
+ // Store the old link SP.
+ return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, NULL, 0);
+}
+
+
+
+SDValue
+PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool IsPPC64 = PPCSubTarget.isPPC64();
+ bool isMachoABI = PPCSubTarget.isMachoABI();
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ // Get the current return address save index.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ int RASI = FI->getReturnAddrSaveIndex();
+
+ // If the return address save index hasn't been defined yet.
+ if (!RASI) {
+ // Find out the fixed offset of the return address save area.
+ int LROffset = PPCFrameInfo::getReturnSaveOffset(IsPPC64, isMachoABI);
+ // Allocate the frame index for the return address save area.
+ RASI = MF.getFrameInfo()->CreateFixedObject(IsPPC64? 8 : 4, LROffset);
+ // Save the result.
+ FI->setReturnAddrSaveIndex(RASI);
+ }
+ return DAG.getFrameIndex(RASI, PtrVT);
+}
+
+SDValue
+PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool IsPPC64 = PPCSubTarget.isPPC64();
+ bool isMachoABI = PPCSubTarget.isMachoABI();
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ // Get current frame pointer save index. The users of this index will be
+ // primarily DYNALLOC instructions.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ int FPSI = FI->getFramePointerSaveIndex();
+
+ // If the frame pointer save index hasn't been defined yet.
+ if (!FPSI) {
+ // Find out the fixed offset of the frame pointer save area.
+ int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, isMachoABI);
+
+ // Allocate the frame index for frame pointer save area.
+ FPSI = MF.getFrameInfo()->CreateFixedObject(IsPPC64? 8 : 4, FPOffset);
+ // Save the result.
+ FI->setFramePointerSaveIndex(FPSI);
+ }
+ return DAG.getFrameIndex(FPSI, PtrVT);
+}
+
+SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) {
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Get the correct type for pointers.
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ // Negate the size.
+ SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
+ DAG.getConstant(0, PtrVT), Size);
+ // Construct a node for the frame pointer save index.
+ SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+ // Build a DYNALLOC node.
+ SDValue Ops[3] = { Chain, NegSize, FPSIdx };
+ SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
+ return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops, 3);
+}
+
+ /// LowerSELECT_CC - Lower floating-point select_cc's into the fsel
+ /// instruction when possible.
+SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+ // Not FP? Not a fsel.
+ if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
+ !Op.getOperand(2).getValueType().isFloatingPoint())
+ return Op;
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+ // Cannot handle SETEQ/SETNE.
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) return Op;
+
+ MVT ResVT = Op.getValueType();
+ MVT CmpVT = Op.getOperand(0).getValueType();
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+ SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // If the RHS of the comparison is a 0.0, we don't need to do the
+ // subtraction at all.
+ if (isFloatingPointZero(RHS))
+ switch (CC) {
+ default: break; // SETUO etc aren't handled by fsel.
+ case ISD::SETULT:
+ case ISD::SETLT:
+ std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ std::swap(TV, FV); // fsel is natively setge, swap operands for setgt
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+ DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
+ }
+
+ SDValue Cmp;
+ switch (CC) {
+ default: break; // SETUO etc aren't handled by fsel.
+ case ISD::SETULT:
+ case ISD::SETLT:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+ if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
+ Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+ return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+ }
+ return Op;
+}
+
+// FIXME: Split this code up when LegalizeDAGTypes lands.
+SDValue PPCTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
+ DebugLoc dl) {
+ assert(Op.getOperand(0).getValueType().isFloatingPoint());
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueType() == MVT::f32)
+ Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+
+ SDValue Tmp;
+ switch (Op.getValueType().getSimpleVT()) {
+ default: assert(0 && "Unhandled FP_TO_SINT type in custom expander!");
+ case MVT::i32:
+ Tmp = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Src);
+ break;
+ case MVT::i64:
+ Tmp = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Src);
+ break;
+ }
+
+ // Convert the FP value to an int value through memory.
+ SDValue FIPtr = DAG.CreateStackTemporary(MVT::f64);
+
+ // Emit a store to the stack slot.
+ SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, NULL, 0);
+
+ // Result is a load from the stack slot. If loading 4 bytes, make sure to
+ // add in a bias.
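+ // (fctiwz leaves the 32-bit result in the low word of the f64; stored
+ // big-endian, that word sits at byte offset 4 of the stack slot.)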
+ if (Op.getValueType() == MVT::i32)
+ FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
+ DAG.getConstant(4, FIPtr.getValueType()));
+ return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, NULL, 0);
+}
+
+SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // Don't handle ppc_fp128 here; let it be lowered to a libcall.
+ if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+ return SDValue();
+
+ if (Op.getOperand(0).getValueType() == MVT::i64) {
+ SDValue Bits = DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::f64, Op.getOperand(0));
+ SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Bits);
+ if (Op.getValueType() == MVT::f32)
+ FP = DAG.getNode(ISD::FP_ROUND, dl,
+ MVT::f32, FP, DAG.getIntPtrConstant(0));
+ return FP;
+ }
+
+ assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+ "Unhandled SINT_TO_FP type in custom expander!");
+ // Since we only generate this in 64-bit mode, we can take advantage of
+ // 64-bit registers. In particular, sign extend the input value into a
+ // 64-bit register with extsw, store the whole 64-bit value to the stack,
+ // then lfd it and fcfid it.
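+ // Roughly, as machine instructions (a sketch of the intended sequence):
+ // extsw rT, rSrc ; std rT, slot(r1) ; lfd fT, slot(r1) ; fcfid fD, fT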
+ MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+ int FrameIdx = FrameInfo->CreateStackObject(8, 8);
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ SDValue Ext64 = DAG.getNode(PPCISD::EXTSW_32, dl, MVT::i32,
+ Op.getOperand(0));
+
+ // STD the extended value into the stack slot.
+ MachineMemOperand MO(PseudoSourceValue::getFixedStack(FrameIdx),
+ MachineMemOperand::MOStore, 0, 8, 8);
+ SDValue Store = DAG.getNode(PPCISD::STD_32, dl, MVT::Other,
+ DAG.getEntryNode(), Ext64, FIdx,
+ DAG.getMemOperand(MO));
+ // Load the value as a double.
+ SDValue Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, NULL, 0);
+
+ // FCFID it and return it.
+ SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Ld);
+ if (Op.getValueType() == MVT::f32)
+ FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0));
+ return FP;
+}
+
+SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ /*
+ The rounding mode is in bits 30:31 of the FPSCR, and has the following
+ settings:
+ 00 Round to nearest
+ 01 Round to 0
+ 10 Round to +inf
+ 11 Round to -inf
+
+ FLT_ROUNDS, on the other hand, expects the following:
+ -1 Undefined
+ 0 Round to 0
+ 1 Round to nearest
+ 2 Round to +inf
+ 3 Round to -inf
+
+ To perform the conversion, we do:
+ ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
+ */
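+ // Worked example of the conversion: FPSCR rounding bits 10 (+inf) give
+ // (2 ^ ((~2 & 3) >> 1)) = 2 ^ 0 = 2, which is FLT_ROUNDS "+inf"; bits 00
+ // (nearest) give (0 ^ ((~0 & 3) >> 1)) = 0 ^ 1 = 1, FLT_ROUNDS "nearest".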
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MVT VT = Op.getValueType();
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ std::vector<MVT> NodeTys;
+ SDValue MFFSreg, InFlag;
+
+ // Save FP Control Word to register
+ NodeTys.push_back(MVT::f64); // return register
+ NodeTys.push_back(MVT::Flag); // unused in this context
+ SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0);
+
+ // Save FP register to stack slot
+ int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain,
+ StackSlot, NULL, 0);
+
+ // Load FP Control Word from low 32 bits of stack slot.
+ SDValue Four = DAG.getConstant(4, PtrVT);
+ SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
+ SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, NULL, 0);
+
+ // Transform as necessary
+ SDValue CWD1 =
+ DAG.getNode(ISD::AND, dl, MVT::i32,
+ CWD, DAG.getConstant(3, MVT::i32));
+ SDValue CWD2 =
+ DAG.getNode(ISD::SRL, dl, MVT::i32,
+ DAG.getNode(ISD::AND, dl, MVT::i32,
+ DAG.getNode(ISD::XOR, dl, MVT::i32,
+ CWD, DAG.getConstant(3, MVT::i32)),
+ DAG.getConstant(3, MVT::i32)),
+ DAG.getConstant(1, MVT::i32));
+
+ SDValue RetVal =
+ DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
+
+ return DAG.getNode((VT.getSizeInBits() < 16 ?
+ ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+ assert(Op.getNumOperands() == 3 &&
+ VT == Op.getOperand(1).getValueType() &&
+ "Unexpected SHL!");
+
+ // Expand into a bunch of logical ops. Note that these ops
+ // depend on the PPC behavior for oversized shift amounts.
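+ // In effect (a sketch, for BitWidth-bit halves Lo/Hi):
+ // OutLo = Lo << Amt
+ // OutHi = (Hi << Amt) | (Lo >> (BitWidth-Amt)) | (Lo << (Amt-BitWidth))
+ // PPC shifts yield 0 for amounts in [BitWidth, 2*BitWidth), so whichever
+ // Lo term is out of range contributes nothing.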
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Amt = Op.getOperand(2);
+ MVT AmtVT = Amt.getValueType();
+
+ SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+ DAG.getConstant(BitWidth, AmtVT), Amt);
+ SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
+ SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
+ SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
+ SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+ DAG.getConstant(-BitWidth, AmtVT));
+ SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
+ SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+ SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
+ SDValue OutOps[] = { OutLo, OutHi };
+ return DAG.getMergeValues(OutOps, 2, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned BitWidth = VT.getSizeInBits();
+ assert(Op.getNumOperands() == 3 &&
+ VT == Op.getOperand(1).getValueType() &&
+ "Unexpected SRL!");
+
+ // Expand into a bunch of logical ops. Note that these ops
+ // depend on the PPC behavior for oversized shift amounts.
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Amt = Op.getOperand(2);
+ MVT AmtVT = Amt.getValueType();
+
+ SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+ DAG.getConstant(BitWidth, AmtVT), Amt);
+ SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+ SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+ SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+ SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+ DAG.getConstant(-BitWidth, AmtVT));
+ SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
+ SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+ SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
+ SDValue OutOps[] = { OutLo, OutHi };
+ return DAG.getMergeValues(OutOps, 2, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ assert(Op.getNumOperands() == 3 &&
+ VT == Op.getOperand(1).getValueType() &&
+ "Unexpected SRA!");
+
+ // Expand into a bunch of logical ops, followed by a select_cc.
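+ // Sketch: OutHi = Hi >>s Amt, and OutLo selects between
+ // (Lo >>u Amt) | (Hi << (BitWidth-Amt)) for Amt <= BitWidth and
+ // Hi >>s (Amt-BitWidth) otherwise, via the select_cc on Amt-BitWidth.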
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Amt = Op.getOperand(2);
+ MVT AmtVT = Amt.getValueType();
+
+ SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+ DAG.getConstant(BitWidth, AmtVT), Amt);
+ SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+ SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+ SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+ SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+ DAG.getConstant(-BitWidth, AmtVT));
+ SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
+ SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
+ SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, AmtVT),
+ Tmp4, Tmp6, ISD::SETLE);
+ SDValue OutOps[] = { OutLo, OutHi };
+ return DAG.getMergeValues(OutOps, 2, dl);
+}
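+
+// A sketch of the select_cc above, again for BitWidth = 32: when Amt <= 32,
+// Tmp5 <= 0 and OutLo is the logical combination (Lo >> Amt) | (Hi << Tmp1);
+// when Amt > 32, the bits entering the low word must be sign-filled from Hi,
+// so OutLo is the arithmetic shift Hi >>s (Amt - 32) instead.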
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering.
+//
+
+/// BuildSplatI - Build a canonical splati of Val with an element size of
+/// SplatSize. Cast the result to VT.
+static SDValue BuildSplatI(int Val, unsigned SplatSize, MVT VT,
+ SelectionDAG &DAG, DebugLoc dl) {
+ assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
+
+ static const MVT VTys[] = { // canonical VT to use for each size.
+ MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
+ };
+
+ MVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
+
+ // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
+ if (Val == -1)
+ SplatSize = 1;
+
+ MVT CanonicalVT = VTys[SplatSize-1];
+
+ // Build a canonical splat for this value.
+ SDValue Elt = DAG.getConstant(Val, MVT::i32);
+ SmallVector<SDValue, 8> Ops;
+ Ops.assign(CanonicalVT.getVectorNumElements(), Elt);
+ SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT,
+ &Ops[0], Ops.size());
+ return DAG.getNode(ISD::BIT_CONVERT, dl, ReqVT, Res);
+}
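+
+// For example, BuildSplatI(-5, 2, MVT::Other, DAG, dl) yields a v8i16
+// BUILD_VECTOR of eight -5s, which the selector matches as a single
+// 'vspltish -5'; a requested splat of -1 is canonicalized to the byte form,
+// i.e. 'vspltisb -1', whatever SplatSize was passed in.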
+
+/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG, DebugLoc dl,
+ MVT DestVT = MVT::Other) {
+ if (DestVT == MVT::Other) DestVT = LHS.getValueType();
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+ DAG.getConstant(IID, MVT::i32), LHS, RHS);
+}
+
+/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
+ SDValue Op2, SelectionDAG &DAG,
+ DebugLoc dl, MVT DestVT = MVT::Other) {
+ if (DestVT == MVT::Other) DestVT = Op0.getValueType();
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+ DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
+}
+
+
+/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
+/// amount. The result has the specified value type.
+static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
+ MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ // Force LHS/RHS to be the right type.
+ LHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, LHS);
+ RHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, RHS);
+
+ int Ops[16];
+ for (unsigned i = 0; i != 16; ++i)
+ Ops[i] = i + Amt;
+ SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, T);
+}
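+
+// Roughly: vsldoi selects 16 consecutive bytes of the 32-byte concatenation
+// LHS:RHS starting at byte Amt, so with LHS == RHS == <a,b,c,d> (v4i32) and
+// Amt == 4 the result is <b,c,d,a>, a rotation of the vector by one word.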
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it. If we CAN select this case, and if it
+// selects to a single instruction, return Op. Otherwise, if we can codegen
+// this case more efficiently than a constant pool load, lower it to the
+// sequence of ops that should be used.
+SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+
+ // Check if this is a splat of a constant value.
+ APInt APSplatBits, APSplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+ HasAnyUndefs) || SplatBitSize > 32)
+ return SDValue();
+
+ unsigned SplatBits = APSplatBits.getZExtValue();
+ unsigned SplatUndef = APSplatUndef.getZExtValue();
+ unsigned SplatSize = SplatBitSize / 8;
+
+ // First, handle single instruction cases.
+
+ // All zeros?
+ if (SplatBits == 0) {
+ // Canonicalize all zero vectors to be v4i32.
+ if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
+ SDValue Z = DAG.getConstant(0, MVT::i32);
+ Z = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Z, Z, Z, Z);
+ Op = DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Z);
+ }
+ return Op;
+ }
+
+ // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
+ int32_t SextVal = (int32_t(SplatBits << (32-SplatBitSize)) >>
+ (32-SplatBitSize));
+ if (SextVal >= -16 && SextVal <= 15)
+ return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+
+
+ // Two instruction sequences.
+
+ // If this value is in the range [-32,30] and is even, use:
+ // tmp = VSPLTI[bhw], result = add tmp, tmp
+ if (SextVal >= -32 && SextVal <= 30 && (SextVal & 1) == 0) {
+ SDValue Res = BuildSplatI(SextVal >> 1, SplatSize, MVT::Other, DAG, dl);
+ Res = DAG.getNode(ISD::ADD, dl, Res.getValueType(), Res, Res);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+ }
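+ // E.g. a v4i32 splat of 28 (too big for one vsplti) becomes 'vspltisw 14'
+ // followed by 'vadduwm', still much cheaper than a constant pool load.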
+
+ // If this is 0x7FFF_FFFF x 4, materialize it as not(0x8000_0000) with
+ // vspltisw + vslw + xor; this is important for fneg/fabs.  (A plain
+ // 0x8000_0000 x 4 splat is caught by the vsplti+shl loop below.)
+ if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
+ // Make a splat of all ones (vspltisw -1):
+ SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+
+ // Make the VSLW intrinsic, computing 0x8000_0000.
+ SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
+ OnesV, DAG, dl);
+
+ // xor by OnesV to invert it.
+ Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+ }
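+ // Concretely: OnesV is -1 in every lane, and vslw shifts each lane left by
+ // the low 5 bits of the matching lane of OnesV (31), so Res holds
+ // 0x8000_0000 per lane; the final xor with -1 flips that to 0x7FFF_FFFF.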
+
+ // Check to see if this is a wide variety of vsplti*, binop self cases.
+ static const signed char SplatCsts[] = {
+ -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+ -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
+ };
+
+ for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
+ // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
+ // cases which are ambiguous (e.g. formation of 0x8000_0000).
+ int i = SplatCsts[idx];
+
+ // Figure out what shift amount will be used by altivec if shifted by i in
+ // this splat size.
+ unsigned TypeShiftAmt = i & (SplatBitSize-1);
+
+ // vsplti + shl self.
+ if (SextVal == (i << (int)TypeShiftAmt)) {
+ SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ static const unsigned IIDs[] = { // Intrinsic to use for each size.
+ Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
+ Intrinsic::ppc_altivec_vslw
+ };
+ Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+ }
+
+ // vsplti + srl self.
+ if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+ SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ static const unsigned IIDs[] = { // Intrinsic to use for each size.
+ Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
+ Intrinsic::ppc_altivec_vsrw
+ };
+ Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+ }
+
+ // vsplti + sra self.
+ if (SextVal == (i >> (int)TypeShiftAmt)) {
+ SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ static const unsigned IIDs[] = { // Intrinsic to use for each size.
+ Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
+ Intrinsic::ppc_altivec_vsraw
+ };
+ Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+ }
+
+ // vsplti + rol self.
+ if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
+ ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
+ SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ static const unsigned IIDs[] = { // Intrinsic to use for each size.
+ Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
+ Intrinsic::ppc_altivec_vrlw
+ };
+ Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+ }
+
+ // t = vsplti c, result = vsldoi t, t, 1
+ if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
+ SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
+ }
+ // t = vsplti c, result = vsldoi t, t, 2
+ if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
+ SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
+ }
+ // t = vsplti c, result = vsldoi t, t, 3
+ if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+ SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
+ }
+ }
+
+ // Three instruction sequences.
+
+ // Odd, in range [17,31]:  (vsplti C)-(vsplti -16).  (Every value in [0,16]
+ // and every even value was already handled above, so only odd 17..31 can
+ // actually reach this point.)
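+ // E.g. a splat of 19 becomes 'vsplti 3' minus 'vsplti -16' (3 - (-16) == 19).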
+ if (SextVal >= 0 && SextVal <= 31) {
+ SDValue LHS = BuildSplatI(SextVal-16, SplatSize, MVT::Other, DAG, dl);
+ SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl);
+ LHS = DAG.getNode(ISD::SUB, dl, LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), LHS);
+ }
+ // Odd, in range [-31,-17]:  (vsplti C)+(vsplti -16), by the same reasoning.
+ if (SextVal >= -31 && SextVal <= 0) {
+ SDValue LHS = BuildSplatI(SextVal+16, SplatSize, MVT::Other, DAG, dl);
+ SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl);
+ LHS = DAG.getNode(ISD::ADD, dl, LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), LHS);
+ }
+
+ return SDValue();
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ DebugLoc dl) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
+
+ enum {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VMRGHW,
+ OP_VMRGLW,
+ OP_VSPLTISW0,
+ OP_VSPLTISW1,
+ OP_VSPLTISW2,
+ OP_VSPLTISW3,
+ OP_VSLDOI4,
+ OP_VSLDOI8,
+ OP_VSLDOI12
+ };
+
+ if (OpNum == OP_COPY) {
+ if (LHSID == (1*9+2)*9+3) return LHS;
+ assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+ return RHS;
+ }
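+ // A note on the encoding assumed here: PFEntry packs cost(2 bits), opcode
+ // (4 bits), LHSID (13 bits) and RHSID (13 bits). Each ID is four base-9
+ // digits, one per result word; digits 0-7 pick a word of the concatenated
+ // <LHS,RHS> input and 8 means undef. Thus (1*9+2)*9+3 above is the
+ // identity <0,1,2,3> (all of LHS) and ((4*9+5)*9+6)*9+7 is <4,5,6,7>
+ // (all of RHS), the only two operands OP_COPY accepts.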
+
+ SDValue OpLHS, OpRHS;
+ OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+ OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+
+ int ShufIdxs[16];
+ switch (OpNum) {
+ default: assert(0 && "Unknown i32 permute!");
+ case OP_VMRGHW:
+ ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
+ ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
+ ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;
+ ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
+ break;
+ case OP_VMRGLW:
+ ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
+ ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
+ ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
+ ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
+ break;
+ case OP_VSPLTISW0:
+ for (unsigned i = 0; i != 16; ++i)
+ ShufIdxs[i] = (i&3)+0;
+ break;
+ case OP_VSPLTISW1:
+ for (unsigned i = 0; i != 16; ++i)
+ ShufIdxs[i] = (i&3)+4;
+ break;
+ case OP_VSPLTISW2:
+ for (unsigned i = 0; i != 16; ++i)
+ ShufIdxs[i] = (i&3)+8;
+ break;
+ case OP_VSPLTISW3:
+ for (unsigned i = 0; i != 16; ++i)
+ ShufIdxs[i] = (i&3)+12;
+ break;
+ case OP_VSLDOI4:
+ return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
+ case OP_VSLDOI8:
+ return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
+ case OP_VSLDOI12:
+ return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
+ }
+ MVT VT = OpLHS.getValueType();
+ OpLHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, OpLHS);
+ OpRHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, OpRHS);
+ SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, T);
+}
+
+/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
+/// is a shuffle we can handle in a single instruction, return it. Otherwise,
+/// return the code it can be lowered into. Worst case, it can always be
+/// lowered into a vperm.
+SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ MVT VT = Op.getValueType();
+
+ // Cases that are handled by instructions that take permute immediates
+ // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
+ // selected by the instruction selector.
+ if (V2.getOpcode() == ISD::UNDEF) {
+ if (PPC::isSplatShuffleMask(SVOp, 1) ||
+ PPC::isSplatShuffleMask(SVOp, 2) ||
+ PPC::isSplatShuffleMask(SVOp, 4) ||
+ PPC::isVPKUWUMShuffleMask(SVOp, true) ||
+ PPC::isVPKUHUMShuffleMask(SVOp, true) ||
+ PPC::isVSLDOIShuffleMask(SVOp, true) != -1 ||
+ PPC::isVMRGLShuffleMask(SVOp, 1, true) ||
+ PPC::isVMRGLShuffleMask(SVOp, 2, true) ||
+ PPC::isVMRGLShuffleMask(SVOp, 4, true) ||
+ PPC::isVMRGHShuffleMask(SVOp, 1, true) ||
+ PPC::isVMRGHShuffleMask(SVOp, 2, true) ||
+ PPC::isVMRGHShuffleMask(SVOp, 4, true)) {
+ return Op;
+ }
+ }
+
+ // Altivec has a variety of "shuffle immediates" that take two vector inputs
+ // and produce a fixed permutation. If any of these match, do not lower to
+ // VPERM.
+ if (PPC::isVPKUWUMShuffleMask(SVOp, false) ||
+ PPC::isVPKUHUMShuffleMask(SVOp, false) ||
+ PPC::isVSLDOIShuffleMask(SVOp, false) != -1 ||
+ PPC::isVMRGLShuffleMask(SVOp, 1, false) ||
+ PPC::isVMRGLShuffleMask(SVOp, 2, false) ||
+ PPC::isVMRGLShuffleMask(SVOp, 4, false) ||
+ PPC::isVMRGHShuffleMask(SVOp, 1, false) ||
+ PPC::isVMRGHShuffleMask(SVOp, 2, false) ||
+ PPC::isVMRGHShuffleMask(SVOp, 4, false))
+ return Op;
+
+ // Check to see if this is a shuffle of 4-byte values. If so, we can use our
+ // perfect shuffle table to emit an optimal matching sequence.
+ SmallVector<int, 16> PermMask;
+ SVOp->getMask(PermMask);
+
+ unsigned PFIndexes[4];
+ bool isFourElementShuffle = true;
+ for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
+ unsigned EltNo = 8; // Start out undef.
+ for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
+ if (PermMask[i*4+j] < 0)
+ continue; // Undef, ignore it.
+
+ unsigned ByteSource = PermMask[i*4+j];
+ if ((ByteSource & 3) != j) {
+ isFourElementShuffle = false;
+ break;
+ }
+
+ if (EltNo == 8) {
+ EltNo = ByteSource/4;
+ } else if (EltNo != ByteSource/4) {
+ isFourElementShuffle = false;
+ break;
+ }
+ }
+ PFIndexes[i] = EltNo;
+ }
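+ // For instance, the byte mask <4,5,6,7, 0,1,2,3, u,u,u,u, 12,13,14,15>
+ // passes with PFIndexes = {1, 0, 8, 3}, whereas a mask beginning
+ // <1,2,3,4, ...> fails at once: byte 0 is not byte 0 of a word, since
+ // (1 & 3) != 0.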
+
+ // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
+ // perfect shuffle vector to determine if it is cost effective to do this as
+ // discrete instructions, or whether we should use a vperm.
+ if (isFourElementShuffle) {
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex =
+ PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ // Determining when to avoid vperm is tricky. Many things affect the cost
+ // of vperm, particularly how many times the perm mask needs to be computed.
+ // For example, if the perm mask can be hoisted out of a loop or is already
+ // used (perhaps because there are multiple permutes with the same shuffle
+ // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of
+ // the loop requires an extra register.
+ //
+ // As a compromise, we only emit discrete instructions if the shuffle can be
+ // generated in 3 or fewer operations. When we have loop information
+ // available, if this block is within a loop, we should avoid using vperm
+ // for 3-operation perms and use a constant pool load instead.
+ if (Cost < 3)
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+
+ // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
+ // vector that will get spilled to the constant pool.
+ if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+
+ // The VECTOR_SHUFFLE mask is almost exactly what we want for vperm, except
+ // that it is in input element units rather than bytes. Convert now.
+ MVT EltVT = V1.getValueType().getVectorElementType();
+ unsigned BytesPerElement = EltVT.getSizeInBits()/8;
+
+ SmallVector<SDValue, 16> ResultMask;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
+
+ for (unsigned j = 0; j != BytesPerElement; ++j)
+ ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
+ MVT::i32));
+ }
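+ // E.g. for v4i32, a mask entry PermMask[i] == 5 (word 1 of V2) expands to
+ // the four byte selectors <20,21,22,23> in the vperm control vector.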
+
+ SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
+ &ResultMask[0], ResultMask.size());
+ return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask);
+}
+
+/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
+/// altivec comparison.  If it is, return true and fill in CompareOpc/isDot
+/// with information about the intrinsic.
+static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
+ bool &isDot) {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+ CompareOpc = -1;
+ isDot = false;
+ switch (IntrinsicID) {
+ default: return false;
+ // Comparison predicates.
+ case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
+
+ // Normal Comparisons.
+ case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break;
+ }
+ return true;
+}
+
+/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
+/// lower, do it, otherwise return null.
+SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) {
+ // If this is a lowered altivec predicate compare, CompareOpc is set to the
+ // opcode number of the comparison.
+ DebugLoc dl = Op.getDebugLoc();
+ int CompareOpc;
+ bool isDot;
+ if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
+ return SDValue(); // Don't custom lower most intrinsics.
+
+ // If this is a non-dot comparison, make the VCMP node and we are done.
+ if (!isDot) {
+ SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(CompareOpc, MVT::i32));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Tmp);
+ }
+
+ // Create the PPCISD altivec 'dot' comparison node.
+ SDValue Ops[] = {
+ Op.getOperand(2), // LHS
+ Op.getOperand(3), // RHS
+ DAG.getConstant(CompareOpc, MVT::i32)
+ };
+ std::vector<MVT> VTs;
+ VTs.push_back(Op.getOperand(2).getValueType());
+ VTs.push_back(MVT::Flag);
+ SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
+
+ // Now that we have the comparison, emit a copy from the CR to a GPR.
+ // This is flagged to the above dot comparison.
+ SDValue Flags = DAG.getNode(PPCISD::MFCR, dl, MVT::i32,
+ DAG.getRegister(PPC::CR6, MVT::i32),
+ CompNode.getValue(1));
+
+ // Unpack the result based on how the target uses it.
+ unsigned BitNo; // Bit # of CR6.
+ bool InvertBit; // Invert result?
+ switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+ default: // Can't happen, don't crash on invalid number though.
+ case 0: // Return the value of the EQ bit of CR6.
+ BitNo = 0; InvertBit = false;
+ break;
+ case 1: // Return the inverted value of the EQ bit of CR6.
+ BitNo = 0; InvertBit = true;
+ break;
+ case 2: // Return the value of the LT bit of CR6.
+ BitNo = 2; InvertBit = false;
+ break;
+ case 3: // Return the inverted value of the LT bit of CR6.
+ BitNo = 2; InvertBit = true;
+ break;
+ }
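+ // A sketch of the layout assumed below: after the mfcr, CR6 sits in bits
+ // 7..4 of Flags (LT=7, GT=6, EQ=5, SO=4), so the shift amount 8-(3-BitNo)
+ // is 5 for the EQ bit and 7 for the LT bit.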
+
+ // Shift the bit into the low position.
+ Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
+ DAG.getConstant(8-(3-BitNo), MVT::i32));
+ // Isolate the bit.
+ Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
+ DAG.getConstant(1, MVT::i32));
+
+ // If we are supposed to, toggle the bit.
+ if (InvertBit)
+ Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
+ DAG.getConstant(1, MVT::i32));
+ return Flags;
+}
+
+SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // Create a stack slot that is 16-byte aligned.
+ MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+ int FrameIdx = FrameInfo->CreateStackObject(16, 16);
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+ // Store the input value into Value#0 of the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
+ Op.getOperand(0), FIdx, NULL, 0);
+ // Load it out.
+ return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, NULL, 0);
+}
+
+SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ if (Op.getValueType() == MVT::v4i32) {
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
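+ // A 32-bit product can be assembled from 16-bit multiplies:
+ //   x*y mod 2^32 == x_lo*y_lo + ((x_hi*y_lo + x_lo*y_hi) << 16)
+ // vmulouh supplies the first term below; vmsumuhm against a 16-bit-rotated
+ // RHS supplies the parenthesized sum, which vslw then shifts into place.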
+ SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl);
+ // vrlw/vslw only use the low 5 bits of each shift amount, so a splat of
+ // -16 acts as a shift amount of +16.
+ SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);
+
+ SDValue RHSSwap = // = vrlw RHS, 16
+ BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
+
+ // Shrinkify inputs to v8i16.
+ LHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, LHS);
+ RHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, RHS);
+ RHSSwap = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, RHSSwap);
+
+ // Low parts multiplied together, generating 32-bit results (we ignore the
+ // top parts).
+ SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+ LHS, RHS, DAG, dl, MVT::v4i32);
+
+ SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+ LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
+ // Shift the high parts up 16 bits.
+ HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
+ Neg16, DAG, dl);
+ return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
+ } else if (Op.getValueType() == MVT::v8i16) {
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+ SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
+
+ return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+ LHS, RHS, Zero, DAG, dl);
+ } else if (Op.getValueType() == MVT::v16i8) {
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+ // Multiply the even 8-bit parts, producing 16-bit sums.
+ SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+ LHS, RHS, DAG, dl, MVT::v8i16);
+ EvenParts = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, EvenParts);
+
+ // Multiply the odd 8-bit parts, producing 16-bit sums.
+ SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
+ LHS, RHS, DAG, dl, MVT::v8i16);
+ OddParts = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, OddParts);
+
+ // Merge the results together: each product is 16 bits wide and, in this
+ // big-endian layout, its low byte sits at the odd byte position, so
+ // interleave the odd bytes of the even and odd products.
+ int Ops[16];
+ for (unsigned i = 0; i != 8; ++i) {
+ Ops[i*2 ] = 2*i+1;
+ Ops[i*2+1] = 2*i+1+16;
+ }
+ return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
+ } else {
+ assert(0 && "Unknown mul to lower!");
+ abort();
+ }
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Wasn't expecting to be able to lower this!");
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
+ VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);
+
+ case ISD::VAARG:
+ return LowerVAARG(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
+ VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);
+
+ case ISD::FORMAL_ARGUMENTS:
+ return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex,
+ VarArgsStackOffset, VarArgsNumGPR,
+ VarArgsNumFPR, PPCSubTarget);
+
+ case ISD::CALL: return LowerCALL(Op, DAG, PPCSubTarget,
+ getTargetMachine());
+ case ISD::RET: return LowerRET(Op, DAG, getTargetMachine());
+ case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, PPCSubTarget);
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);
+
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG,
+ Op.getDebugLoc());
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+
+ // Lower 64-bit shifts.
+ case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
+ case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
+ case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
+
+ // Vector-related lowering.
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, DAG);
+
+ // Frame & Return address.
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ }
+ return SDValue();
+}
+
+void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = N->getDebugLoc();
+ switch (N->getOpcode()) {
+ default:
+ assert(false && "Do not know how to custom type legalize this operation!");
+ return;
+ case ISD::FP_ROUND_INREG: {
+ assert(N->getValueType(0) == MVT::ppcf128);
+ assert(N->getOperand(0).getValueType() == MVT::ppcf128);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ MVT::f64, N->getOperand(0),
+ DAG.getIntPtrConstant(0));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+ MVT::f64, N->getOperand(0),
+ DAG.getIntPtrConstant(1));
+
+ // This sequence changes FPSCR to do round-to-zero, adds the two halves
+ // of the long double, and puts FPSCR back the way it was. We do not
+ // actually model FPSCR.
+ std::vector<MVT> NodeTys;
+ SDValue Ops[4], Result, MFFSreg, InFlag, FPreg;
+
+ NodeTys.push_back(MVT::f64); // Return register
+ NodeTys.push_back(MVT::Flag); // Returns a flag for later insns
+ Result = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0);
+ MFFSreg = Result.getValue(0);
+ InFlag = Result.getValue(1);
+
+ NodeTys.clear();
+ NodeTys.push_back(MVT::Flag); // Returns a flag
+ Ops[0] = DAG.getConstant(31, MVT::i32);
+ Ops[1] = InFlag;
+ Result = DAG.getNode(PPCISD::MTFSB1, dl, NodeTys, Ops, 2);
+ InFlag = Result.getValue(0);
+
+ NodeTys.clear();
+ NodeTys.push_back(MVT::Flag); // Returns a flag
+ Ops[0] = DAG.getConstant(30, MVT::i32);
+ Ops[1] = InFlag;
+ Result = DAG.getNode(PPCISD::MTFSB0, dl, NodeTys, Ops, 2);
+ InFlag = Result.getValue(0);
+
+ NodeTys.clear();
+ NodeTys.push_back(MVT::f64); // result of add
+ NodeTys.push_back(MVT::Flag); // Returns a flag
+ Ops[0] = Lo;
+ Ops[1] = Hi;
+ Ops[2] = InFlag;
+ Result = DAG.getNode(PPCISD::FADDRTZ, dl, NodeTys, Ops, 3);
+ FPreg = Result.getValue(0);
+ InFlag = Result.getValue(1);
+
+ NodeTys.clear();
+ NodeTys.push_back(MVT::f64);
+ Ops[0] = DAG.getConstant(1, MVT::i32);
+ Ops[1] = MFFSreg;
+ Ops[2] = FPreg;
+ Ops[3] = InFlag;
+ Result = DAG.getNode(PPCISD::MTFSF, dl, NodeTys, Ops, 4);
+ FPreg = Result.getValue(0);
+
+ // We know the low half is about to be thrown away, so just use something
+ // convenient.
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
+ FPreg, FPreg));
+ return;
+ }
+ case ISD::FP_TO_SINT:
+ Results.push_back(LowerFP_TO_SINT(SDValue(N, 0), DAG, dl));
+ return;
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+ bool is64bit, unsigned BinOpcode) const {
+ // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction *F = BB->getParent();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ unsigned dest = MI->getOperand(0).getReg();
+ unsigned ptrA = MI->getOperand(1).getReg();
+ unsigned ptrB = MI->getOperand(2).getReg();
+ unsigned incr = MI->getOperand(3).getReg();
+ DebugLoc dl = MI->getDebugLoc();
+
+ MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loopMBB);
+ F->insert(It, exitMBB);
+ exitMBB->transferSuccessors(BB);
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned TmpReg = (!BinOpcode) ? incr :
+ RegInfo.createVirtualRegister(
+ is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+ (const TargetRegisterClass *) &PPC::GPRCRegClass);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loopMBB);
+
+ // loopMBB:
+ // l[wd]arx dest, ptr
+ // <binop> tmp, incr, dest   (tmp is just incr when BinOpcode is 0)
+ // st[wd]cx. tmp, ptr
+ // bne- loopMBB
+ // fallthrough --> exitMBB
+ BB = loopMBB;
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+ .addReg(ptrA).addReg(ptrB);
+ if (BinOpcode)
+ BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+ .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+ BB->addSuccessor(loopMBB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+ return BB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ bool is8bit, // true for an i8, false for an i16 operation
+ unsigned BinOpcode) const {
+ // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ // In 64 bit mode we have to use 64 bits for addresses, even though the
+ // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
+ // registers without caring whether they're 32 or 64, but here we're
+ // doing actual arithmetic on the addresses.
+ bool is64bit = PPCSubTarget.isPPC64();
+
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction *F = BB->getParent();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ unsigned dest = MI->getOperand(0).getReg();
+ unsigned ptrA = MI->getOperand(1).getReg();
+ unsigned ptrB = MI->getOperand(2).getReg();
+ unsigned incr = MI->getOperand(3).getReg();
+ DebugLoc dl = MI->getDebugLoc();
+
+ MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loopMBB);
+ F->insert(It, exitMBB);
+ exitMBB->transferSuccessors(BB);
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ const TargetRegisterClass *RC =
+ is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+ (const TargetRegisterClass *) &PPC::GPRCRegClass;
+ unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+ unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
+ unsigned Incr2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+ unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+ unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+ unsigned Ptr1Reg;
+ unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loopMBB);
+
+ // The 4-byte load must be aligned, while a char or short may be
+ // anywhere in the word. Hence all this nasty bookkeeping code.
+ // add ptr1, ptrA, ptrB [copy if ptrA==0]
+ // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+ // xori shift, shift1, 24 [16]
+ // rlwinm ptr, ptr1, 0, 0, 29
+ // slw incr2, incr, shift
+ // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+ // slw mask, mask2, shift
+ // loopMBB:
+ // lwarx tmpDest, ptr
+ // add tmp, tmpDest, incr2
+ // andc tmp2, tmpDest, mask
+ // and tmp3, tmp, mask
+ // or tmp4, tmp3, tmp2
+ // stwcx. tmp4, ptr
+ // bne- loopMBB
+ // fallthrough --> exitMBB
+ // exitMBB:
+ // srw dest, tmpDest, shift
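+ //
+ // For example (big-endian): a byte at byte offset 2 within its aligned
+ // word occupies bits 15..8, and indeed shift1 == (ptr1 & 3) << 3 == 16 and
+ // shift == 16 ^ 24 == 8, so incr and mask are slid left by 8 to line up
+ // with bits 15..8.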
+
+ if (ptrA != PPC::R0) {
+ Ptr1Reg = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+ .addReg(ptrA).addReg(ptrB);
+ } else {
+ Ptr1Reg = ptrB;
+ }
+ BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+ .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+ .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+ if (is64bit)
+ BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+ .addReg(Ptr1Reg).addImm(0).addImm(61);
+ else
+ BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+ .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+ BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg)
+ .addReg(incr).addReg(ShiftReg);
+ if (is8bit)
+ BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+ else {
+ BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+ BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg).addReg(Mask3Reg).addImm(65535);
+ }
+ BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+ .addReg(Mask2Reg).addReg(ShiftReg);
+
+ BB = loopMBB;
+ BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+ .addReg(PPC::R0).addReg(PtrReg);
+ if (BinOpcode)
+ BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
+ .addReg(Incr2Reg).addReg(TmpDestReg);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg)
+ .addReg(TmpDestReg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg)
+ .addReg(TmpReg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg)
+ .addReg(Tmp3Reg).addReg(Tmp2Reg);
+ BuildMI(BB, dl, TII->get(PPC::STWCX))
+ .addReg(Tmp4Reg).addReg(PPC::R0).addReg(PtrReg);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+ BB->addSuccessor(loopMBB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+ BuildMI(BB, dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg).addReg(ShiftReg);
+ return BB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ // To "insert" these instructions we actually have to insert their
+ // control-flow patterns.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ MachineFunction *F = BB->getParent();
+
+ if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+ MI->getOpcode() == PPC::SELECT_CC_I8 ||
+ MI->getOpcode() == PPC::SELECT_CC_F4 ||
+ MI->getOpcode() == PPC::SELECT_CC_F8 ||
+ MI->getOpcode() == PPC::SELECT_CC_VRRC) {
+
+ // The incoming instruction knows the destination vreg to set, the
+ // condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ unsigned SelectPred = MI->getOperand(4).getImm();
+ DebugLoc dl = MI->getDebugLoc();
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ sinkMBB->transferSuccessors(BB);
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, dl, TII->get(PPC::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ }
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
+ BB = EmitAtomicBinary(MI, BB, false, PPC::ADD4);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
+ BB = EmitAtomicBinary(MI, BB, true, PPC::ADD8);
+
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
+ BB = EmitAtomicBinary(MI, BB, false, PPC::AND);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
+ BB = EmitAtomicBinary(MI, BB, true, PPC::AND8);
+
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
+ BB = EmitAtomicBinary(MI, BB, false, PPC::OR);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
+ BB = EmitAtomicBinary(MI, BB, true, PPC::OR8);
+
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
+ BB = EmitAtomicBinary(MI, BB, false, PPC::XOR);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
+ BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8);
+
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
+ BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
+ BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8);
+
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
+ BB = EmitAtomicBinary(MI, BB, false, PPC::SUBF);
+ else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
+ BB = EmitAtomicBinary(MI, BB, true, PPC::SUBF8);
+
+ else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8)
+ BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
+ else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16)
+ BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
+ else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32)
+ BB = EmitAtomicBinary(MI, BB, false, 0);
+ else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64)
+ BB = EmitAtomicBinary(MI, BB, true, 0);
+
+ else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
+ MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64) {
+ bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
+
+ unsigned dest = MI->getOperand(0).getReg();
+ unsigned ptrA = MI->getOperand(1).getReg();
+ unsigned ptrB = MI->getOperand(2).getReg();
+ unsigned oldval = MI->getOperand(3).getReg();
+ unsigned newval = MI->getOperand(4).getReg();
+ DebugLoc dl = MI->getDebugLoc();
+
+ MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loop1MBB);
+ F->insert(It, loop2MBB);
+ F->insert(It, midMBB);
+ F->insert(It, exitMBB);
+ exitMBB->transferSuccessors(BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loop1MBB);
+
+ // loop1MBB:
+ // l[wd]arx dest, ptr
+ // cmp[wd] dest, oldval
+ // bne- midMBB
+ // loop2MBB:
+ // st[wd]cx. newval, ptr
+ // bne- loopMBB
+ // b exitBB
+ // midMBB:
+ // st[wd]cx. dest, ptr
+ // exitBB:
+ BB = loop1MBB;
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+ .addReg(ptrA).addReg(ptrB);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
+ .addReg(oldval).addReg(dest);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+ BB->addSuccessor(loop2MBB);
+ BB->addSuccessor(midMBB);
+
+ BB = loop2MBB;
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+ .addReg(newval).addReg(ptrA).addReg(ptrB);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+ BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+ BB->addSuccessor(loop1MBB);
+ BB->addSuccessor(exitMBB);
+
+ BB = midMBB;
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+ .addReg(dest).addReg(ptrA).addReg(ptrB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+ } else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
+ MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
+ // We must use 64-bit registers for addresses when targeting 64-bit,
+ // since we're actually doing arithmetic on them. Other registers
+ // can be 32-bit.
+ bool is64bit = PPCSubTarget.isPPC64();
+ bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
+
+ unsigned dest = MI->getOperand(0).getReg();
+ unsigned ptrA = MI->getOperand(1).getReg();
+ unsigned ptrB = MI->getOperand(2).getReg();
+ unsigned oldval = MI->getOperand(3).getReg();
+ unsigned newval = MI->getOperand(4).getReg();
+ DebugLoc dl = MI->getDebugLoc();
+
+ MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loop1MBB);
+ F->insert(It, loop2MBB);
+ F->insert(It, midMBB);
+ F->insert(It, exitMBB);
+ exitMBB->transferSuccessors(BB);
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ const TargetRegisterClass *RC =
+ is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+ (const TargetRegisterClass *) &PPC::GPRCRegClass;
+ unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+ unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
+ unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+ unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+ unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+ unsigned Ptr1Reg;
+ unsigned TmpReg = RegInfo.createVirtualRegister(RC);
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loop1MBB);
+
+ // The 4-byte load must be aligned, while a char or short may be
+ // anywhere in the word. Hence all this nasty bookkeeping code.
+ // add ptr1, ptrA, ptrB [copy if ptrA==0]
+ // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+ // xori shift, shift1, 24 [16]
+ // rlwinm ptr, ptr1, 0, 0, 29
+ // slw newval2, newval, shift
+ // slw oldval2, oldval,shift
+ // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+ // slw mask, mask2, shift
+ // and newval3, newval2, mask
+ // and oldval3, oldval2, mask
+ // loop1MBB:
+ // lwarx tmpDest, ptr
+ // and tmp, tmpDest, mask
+ // cmpw tmp, oldval3
+ // bne- midMBB
+ // loop2MBB:
+ // andc tmp2, tmpDest, mask
+ // or tmp4, tmp2, newval3
+ // stwcx. tmp4, ptr
+ // bne- loop1MBB
+ // b exitBB
+ // midMBB:
+ // stwcx. tmpDest, ptr
+ // exitBB:
+ // srw dest, tmp, shift
+ if (ptrA != PPC::R0) {
+ Ptr1Reg = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+ .addReg(ptrA).addReg(ptrB);
+ } else {
+ Ptr1Reg = ptrB;
+ }
+ BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+ .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+ BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+ .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+ if (is64bit)
+ BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+ .addReg(Ptr1Reg).addImm(0).addImm(61);
+ else
+ BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+ .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+ BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
+ .addReg(newval).addReg(ShiftReg);
+ BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
+ .addReg(oldval).addReg(ShiftReg);
+ if (is8bit)
+ BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+ else {
+ BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+ BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
+ .addReg(Mask3Reg).addImm(65535);
+ }
+ BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+ .addReg(Mask2Reg).addReg(ShiftReg);
+ BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
+ .addReg(NewVal2Reg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
+ .addReg(OldVal2Reg).addReg(MaskReg);
+
+ BB = loop1MBB;
+ BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+ .addReg(PPC::R0).addReg(PtrReg);
+ BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)
+ .addReg(TmpDestReg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
+ .addReg(TmpReg).addReg(OldVal3Reg);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+ BB->addSuccessor(loop2MBB);
+ BB->addSuccessor(midMBB);
+
+ BB = loop2MBB;
+ BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
+ .addReg(TmpDestReg).addReg(MaskReg);
+ BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)
+ .addReg(Tmp2Reg).addReg(NewVal3Reg);
+ BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg)
+ .addReg(PPC::R0).addReg(PtrReg);
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+ BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+ BB->addSuccessor(loop1MBB);
+ BB->addSuccessor(exitMBB);
+
+ BB = midMBB;
+ BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg)
+ .addReg(PPC::R0).addReg(PtrReg);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+ BuildMI(BB, dl, TII->get(PPC::SRW), dest).addReg(TmpReg).addReg(ShiftReg);
+ } else {
+ assert(0 && "Unexpected instr type to insert");
+ }
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ TargetMachine &TM = getTargetMachine();
+ SelectionDAG &DAG = DCI.DAG;
+ DebugLoc dl = N->getDebugLoc();
+ switch (N->getOpcode()) {
+ default: break;
+ case PPCISD::SHL:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ if (C->getZExtValue() == 0) // 0 << V -> 0.
+ return N->getOperand(0);
+ }
+ break;
+ case PPCISD::SRL:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ if (C->getZExtValue() == 0) // 0 >>u V -> 0.
+ return N->getOperand(0);
+ }
+ break;
+ case PPCISD::SRA:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ if (C->getZExtValue() == 0 || // 0 >>s V -> 0.
+ C->isAllOnesValue()) // -1 >>s V -> -1.
+ return N->getOperand(0);
+ }
+ break;
+
+ case ISD::SINT_TO_FP:
+ if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+ if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
+ // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
+ // We allow the src/dst to be either f32/f64, but the intermediate
+ // type must be i64.
+ if (N->getOperand(0).getValueType() == MVT::i64 &&
+ N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) {
+ SDValue Val = N->getOperand(0).getOperand(0);
+ if (Val.getValueType() == MVT::f32) {
+ Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+ }
+
+ Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+ Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+ if (N->getValueType(0) == MVT::f32) {
+ Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val,
+ DAG.getIntPtrConstant(0));
+ DCI.AddToWorklist(Val.getNode());
+ }
+ return Val;
+ } else if (N->getOperand(0).getValueType() == MVT::i32) {
+ // If the intermediate type is i32, we could avoid the load/store here
+ // too, but that case is not handled yet.
+ }
+ }
+ }
+ break;
+ case ISD::STORE:
+ // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
+ if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
+ !cast<StoreSDNode>(N)->isTruncatingStore() &&
+ N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
+ N->getOperand(1).getValueType() == MVT::i32 &&
+ N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
+ SDValue Val = N->getOperand(1).getOperand(0);
+ if (Val.getValueType() == MVT::f32) {
+ Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+ }
+ Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
+ DCI.AddToWorklist(Val.getNode());
+
+ Val = DAG.getNode(PPCISD::STFIWX, dl, MVT::Other, N->getOperand(0), Val,
+ N->getOperand(2), N->getOperand(3));
+ DCI.AddToWorklist(Val.getNode());
+ return Val;
+ }
+
+ // Turn STORE (BSWAP) -> sthbrx/stwbrx.
+ if (N->getOperand(1).getOpcode() == ISD::BSWAP &&
+ N->getOperand(1).getNode()->hasOneUse() &&
+ (N->getOperand(1).getValueType() == MVT::i32 ||
+ N->getOperand(1).getValueType() == MVT::i16)) {
+ SDValue BSwapOp = N->getOperand(1).getOperand(0);
+ // Do an any-extend to 32-bits if this is a half-word input.
+ if (BSwapOp.getValueType() == MVT::i16)
+ BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+
+ return DAG.getNode(PPCISD::STBRX, dl, MVT::Other, N->getOperand(0),
+ BSwapOp, N->getOperand(2), N->getOperand(3),
+ DAG.getValueType(N->getOperand(1).getValueType()));
+ }
+ break;
+ case ISD::BSWAP:
+ // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
+ if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+ N->getOperand(0).hasOneUse() &&
+ (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16)) {
+ SDValue Load = N->getOperand(0);
+ LoadSDNode *LD = cast<LoadSDNode>(Load);
+ // Create the byte-swapping load.
+ std::vector<MVT> VTs;
+ VTs.push_back(MVT::i32);
+ VTs.push_back(MVT::Other);
+ SDValue MO = DAG.getMemOperand(LD->getMemOperand());
+ SDValue Ops[] = {
+ LD->getChain(), // Chain
+ LD->getBasePtr(), // Ptr
+ MO, // MemOperand
+ DAG.getValueType(N->getValueType(0)) // VT
+ };
+ SDValue BSLoad = DAG.getNode(PPCISD::LBRX, dl, VTs, Ops, 4);
+
+ // If this is an i16 load, insert the truncate.
+ SDValue ResVal = BSLoad;
+ if (N->getValueType(0) == MVT::i16)
+ ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
+
+ // First, combine the bswap away. This makes the value produced by the
+ // load dead.
+ DCI.CombineTo(N, ResVal);
+
+ // Next, combine the load away, we give it a bogus result value but a real
+ // chain result. The result value is dead because the bswap is dead.
+ DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+ // Return N so it doesn't get rechecked!
+ return SDValue(N, 0);
+ }
+
+ break;
+ case PPCISD::VCMP: {
+ // If a VCMPo node already exists with exactly the same operands as this
+ // node, use its result instead of this node (VCMPo computes both a CR6
+ // flag and a normal vector result).
+ //
+ if (!N->getOperand(0).hasOneUse() &&
+ !N->getOperand(1).hasOneUse() &&
+ !N->getOperand(2).hasOneUse()) {
+
+ // Scan all of the users of the LHS, looking for VCMPo's that match.
+ SDNode *VCMPoNode = 0;
+
+ SDNode *LHSN = N->getOperand(0).getNode();
+ for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
+ UI != E; ++UI)
+ if (UI->getOpcode() == PPCISD::VCMPo &&
+ UI->getOperand(1) == N->getOperand(1) &&
+ UI->getOperand(2) == N->getOperand(2) &&
+ UI->getOperand(0) == N->getOperand(0)) {
+ VCMPoNode = *UI;
+ break;
+ }
+
+ // If there is no VCMPo node, or if the flag result of the one we found
+ // has no uses, don't transform this.
+ if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
+ break;
+
+ // Look at the (necessarily single) use of the flag value. If it has a
+ // chain, this transformation is more complex. Note that multiple things
+ // could use the value result, which we should ignore.
+ SDNode *FlagUser = 0;
+ for (SDNode::use_iterator UI = VCMPoNode->use_begin();
+ FlagUser == 0; ++UI) {
+ assert(UI != VCMPoNode->use_end() && "Didn't find user!");
+ SDNode *User = *UI;
+ for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+ if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
+ FlagUser = User;
+ break;
+ }
+ }
+ }
+
+ // If the user is a MFCR instruction, we know this is safe. Otherwise we
+ // give up for right now.
+ if (FlagUser->getOpcode() == PPCISD::MFCR)
+ return SDValue(VCMPoNode, 0);
+ }
+ break;
+ }
+ case ISD::BR_CC: {
+ // If this is a branch on an altivec predicate comparison, lower this so
+ // that we don't have to do a MFCR: instead, branch directly on CR6. This
+ // lowering is done pre-legalize, because the legalizer lowers the predicate
+ // compare down to code that is difficult to reassemble.
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
+ int CompareOpc;
+ bool isDot;
+
+ if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ getAltivecCompareInfo(LHS, CompareOpc, isDot)) {
+ assert(isDot && "Can't compare against a vector result!");
+
+ // If this is a comparison against something other than 0/1, then we know
+ // that the condition is never/always true.
+ unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+ if (Val != 0 && Val != 1) {
+ if (CC == ISD::SETEQ) // Cond never true, remove branch.
+ return N->getOperand(0);
+ // Always !=, turn it into an unconditional branch.
+ return DAG.getNode(ISD::BR, dl, MVT::Other,
+ N->getOperand(0), N->getOperand(4));
+ }
+
+ bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
+
+ // Create the PPCISD altivec 'dot' comparison node.
+ std::vector<MVT> VTs;
+ SDValue Ops[] = {
+ LHS.getOperand(2), // LHS of compare
+ LHS.getOperand(3), // RHS of compare
+ DAG.getConstant(CompareOpc, MVT::i32)
+ };
+ VTs.push_back(LHS.getOperand(2).getValueType());
+ VTs.push_back(MVT::Flag);
+ SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
+
+ // Unpack the result based on how the target uses it.
+ PPC::Predicate CompOpc;
+ switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
+ default: // Can't happen, don't crash on invalid number though.
+ case 0: // Branch on the value of the EQ bit of CR6.
+ CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
+ break;
+ case 1: // Branch on the inverted value of the EQ bit of CR6.
+ CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
+ break;
+ case 2: // Branch on the value of the LT bit of CR6.
+ CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
+ break;
+ case 3: // Branch on the inverted value of the LT bit of CR6.
+ CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
+ break;
+ }
+
+ return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
+ DAG.getConstant(CompOpc, MVT::i32),
+ DAG.getRegister(PPC::CR6, MVT::i32),
+ N->getOperand(4), CompNode.getValue(1));
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+ switch (Op.getOpcode()) {
+ default: break;
+ case PPCISD::LBRX: {
+ // lhbrx is known to have the top bits cleared out.
+ if (cast<VTSDNode>(Op.getOperand(3))->getVT() == MVT::i16)
+ KnownZero = 0xFFFF0000;
+ break;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+ default: break;
+ case Intrinsic::ppc_altivec_vcmpbfp_p:
+ case Intrinsic::ppc_altivec_vcmpeqfp_p:
+ case Intrinsic::ppc_altivec_vcmpequb_p:
+ case Intrinsic::ppc_altivec_vcmpequh_p:
+ case Intrinsic::ppc_altivec_vcmpequw_p:
+ case Intrinsic::ppc_altivec_vcmpgefp_p:
+ case Intrinsic::ppc_altivec_vcmpgtfp_p:
+ case Intrinsic::ppc_altivec_vcmpgtsb_p:
+ case Intrinsic::ppc_altivec_vcmpgtsh_p:
+ case Intrinsic::ppc_altivec_vcmpgtsw_p:
+ case Intrinsic::ppc_altivec_vcmpgtub_p:
+ case Intrinsic::ppc_altivec_vcmpgtuh_p:
+ case Intrinsic::ppc_altivec_vcmpgtuw_p:
+ KnownZero = ~1U; // All bits but the low one are known to be zero.
+ break;
+ }
+ }
+ }
+}
+
+
+/// getConstraintType - Given a constraint, return the type of
+/// constraint it is for this target.
+PPCTargetLowering::ConstraintType
+PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'b':
+ case 'r':
+ case 'f':
+ case 'v':
+ case 'y':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ // GCC RS6000 Constraint Letters
+ switch (Constraint[0]) {
+ case 'b': // R1-R31
+ case 'r': // R0-R31
+ if (VT == MVT::i64 && PPCSubTarget.isPPC64())
+ return std::make_pair(0U, PPC::G8RCRegisterClass);
+ return std::make_pair(0U, PPC::GPRCRegisterClass);
+ case 'f':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, PPC::F4RCRegisterClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, PPC::F8RCRegisterClass);
+ break;
+ case 'v':
+ return std::make_pair(0U, PPC::VRRCRegisterClass);
+ case 'y': // crrc
+ return std::make_pair(0U, PPC::CRRCRegisterClass);
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops. If hasMemory is true
+/// it means one of the asm constraints of the inline asm instruction being
+/// processed is 'm'.
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter,
+ bool hasMemory,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0,0);
+ switch (Letter) {
+ default: break;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P': {
+ ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+ if (!CST) return; // Must be an immediate to match.
+ unsigned Value = CST->getZExtValue();
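+ // Illustrative values: 0x7FFF satisfies 'I' and 'K', 0x12340000 satisfies
+ // 'J' and 'L', 32 satisfies both 'M' and 'N', and 0 satisfies 'O'.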
+ switch (Letter) {
+ default: assert(0 && "Unknown constraint letter!");
+ case 'I': // "I" is a signed 16-bit constant.
+ if ((short)Value == (int)Value)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
+ case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
+ if ((short)Value == 0)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
+ if ((Value >> 16) == 0)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'M': // "M" is a constant that is greater than 31.
+ if (Value > 31)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'N': // "N" is a positive constant that is an exact power of two.
+ if ((int)Value > 0 && isPowerOf2_32(Value))
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'O': // "O" is the constant zero.
+ if (Value == 0)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
+ if ((short)-Value == (int)-Value)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ }
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+
+ // Handle standard constraint letters.
+ TargetLowering::LowerAsmOperandForConstraint(Op, Letter, hasMemory, Ops, DAG);
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const {
+ // FIXME: PPC does not allow r+i addressing modes for vectors!
+
+ // PPC allows a sign-extended 16-bit immediate field.
+ if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+ return false;
+
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // PPC only supports r+r and r+i forms:
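+ // For example (illustrative): "lwz r3, 8(r4)" is the r+i form handled by
+ // Scale == 0, while "lwzx r3, r4, r5" is the r+r form of Scale == 1.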
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
+ return false;
+ // Otherwise we have r+r or r+i.
+ break;
+ case 2:
+ if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
+ return false;
+ // Allow 2*r as r+r.
+ break;
+ default:
+ // No other scales are supported.
+ return false;
+ }
+
+ return true;
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,const Type *Ty) const{
+ // PPC allows a sign-extended 16-bit immediate field.
+ return (V > -(1 << 16) && V < (1 << 16)-1);
+}
+
+bool PPCTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
+ return false;
+}
+
+SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // Depths > 0 not supported yet!
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+ return SDValue();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+ // Just load the return address off the stack.
+ SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
+
+ // Make sure the function really does not optimize away the store of the RA
+ // to the stack.
+ FuncInfo->setLRStoreRequired();
+ return DAG.getLoad(getPointerTy(), dl,
+ DAG.getEntryNode(), RetAddrFI, NULL, 0);
+}
+
+SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // Depths > 0 not supported yet!
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+ return SDValue();
+
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ bool isPPC64 = PtrVT == MVT::i64;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool is31 = (NoFramePointerElim || MFI->hasVarSizedObjects())
+ && MFI->getStackSize();
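+ // That is, the frame address is R31/X31 only when a frame pointer is
+ // actually maintained (frame pointer elimination disabled or variable
+ // sized objects present) and the frame is non-empty; otherwise the stack
+ // pointer register doubles as the frame address.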
+
+ if (isPPC64)
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, is31 ? PPC::X31 : PPC::X1,
+ MVT::i64);
+ else
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, is31 ? PPC::R31 : PPC::R1,
+ MVT::i32);
+}
+
+bool
+PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The PowerPC target isn't yet aware of offsets.
+ return false;
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
new file mode 100644
index 0000000..7946474
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -0,0 +1,394 @@
+//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PPC uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+#define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "PPC.h"
+#include "PPCSubtarget.h"
+
+namespace llvm {
+ namespace PPCISD {
+ enum NodeType {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// FSEL - Traditional three-operand fsel node.
+ ///
+ FSEL,
+
+ /// FCFID - The FCFID instruction, taking an f64 operand and producing
+ /// an f64 value containing the FP representation of the integer that
+ /// was temporarily in the f64 operand.
+ FCFID,
+
+ /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+ /// operand, producing an f64 value containing the integer representation
+ /// of that FP value.
+ FCTIDZ, FCTIWZ,
+
+ /// STFIWX - The STFIWX instruction. The first operand is an input token
+ /// chain, then an f64 value to store, then an address to store it to,
+ /// then a SRCVALUE for the address.
+ STFIWX,
+
+ // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+ // three v4f32 operands and producing a v4f32 result.
+ VMADDFP, VNMSUBFP,
+
+ /// VPERM - The PPC VPERM Instruction.
+ ///
+ VPERM,
+
+ /// Hi/Lo - These represent the high and low 16-bit parts of a global
+ /// address respectively. These nodes have two operands, the first of
+ /// which must be a TargetGlobalAddress, and the second of which must be a
+ /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C',
+ /// though these are usually folded into other nodes.
+ Hi, Lo,
+
+ /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+ /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+ /// compute an allocation on the stack.
+ DYNALLOC,
+
+ /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
+ /// shift amounts. These nodes are generated by the multi-precision shift
+ /// code.
+ SRL, SRA, SHL,
+
+ /// EXTSW_32 - This is the EXTSW instruction for use with "32-bit"
+ /// registers.
+ EXTSW_32,
+
+ /// STD_32 - This is the STD instruction for use with "32-bit" registers.
+ STD_32,
+
+ /// CALL - A direct function call.
+ CALL_Macho, CALL_ELF,
+
+ /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+ /// MTCTR instruction.
+ MTCTR,
+
+ /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+ /// BCTRL instruction.
+ BCTRL_Macho, BCTRL_ELF,
+
+ /// Return with a flag operand, matched by 'blr'
+ RET_FLAG,
+
+ /// R32 = MFCR(CRREG, INFLAG) - Represents the MFCR/MFOCRF instructions.
+ /// This copies the bits corresponding to the specified CRREG into the
+ /// resultant GPR. Bits corresponding to other CR regs are undefined.
+ MFCR,
+
+ /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+ /// instructions. For lack of a better number, we use the opcode number
+ /// encoding for the OPC field to identify the compare. For example, 838
+ /// is VCMPGTSH.
+ VCMP,
+
+ /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+ /// altivec VCMP*o instructions. For lack of a better number, we use the
+ /// opcode number encoding for the OPC field to identify the compare. For
+ /// example, 838 is VCMPGTSH.
+ VCMPo,
+
+ /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+ /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the
+ /// condition register to branch on, OPC is the branch opcode to use (e.g.
+ /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+ /// an optional input flag argument.
+ COND_BRANCH,
+
+ /// CHAIN = STBRX CHAIN, GPRC, Ptr, SRCVALUE, Type - This is a
+ /// byte-swapping store instruction. It byte-swaps the low "Type" bits of
+ /// the GPRC input, then stores it through Ptr. Type can be either i16 or
+ /// i32.
+ STBRX,
+
+ /// GPRC, CHAIN = LBRX CHAIN, Ptr, SRCVALUE, Type - This is a
+ /// byte-swapping load instruction. It loads "Type" bits, byte-swaps them,
+ /// then puts the result in the bottom bits of the GPRC. Type can be either
+ /// i16 or i32.
+ LBRX,
+
+ // The following 5 instructions are used only as part of the
+ // long double-to-int conversion sequence.
+
+ /// OUTFLAG = MFFS F8RC - This moves the FPSCR (not modelled) into the
+ /// register.
+ MFFS,
+
+ /// OUTFLAG = MTFSB0 INFLAG - This clears a bit in the FPSCR.
+ MTFSB0,
+
+ /// OUTFLAG = MTFSB1 INFLAG - This sets a bit in the FPSCR.
+ MTFSB1,
+
+ /// F8RC, OUTFLAG = FADDRTZ F8RC, F8RC, INFLAG - This is an FADD done with
+ /// rounding towards zero. It has flags added so it won't move past the
+ /// FPSCR-setting instructions.
+ FADDRTZ,
+
+ /// MTFSF = F8RC, INFLAG - This moves the register into the FPSCR.
+ MTFSF,
+
+ /// LARX - This corresponds to the PPC l{w|d}arx instruction: load and
+ /// reserve indexed. This is used to implement atomic operations.
+ LARX,
+
+ /// STCX - This corresponds to the PPC stcx. instruction: store
+ /// conditional indexed. This is used to implement atomic operations.
+ STCX,
+
+ /// TAILCALL - Indicates a tail call should be taken.
+ TAILCALL,
+ /// TC_RETURN - A tail call return.
+ /// operand #0 chain
+ /// operand #1 callee (register or absolute)
+ /// operand #2 stack adjustment
+ /// operand #3 optional in flag
+ TC_RETURN
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace PPC {
+ /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+ /// VPKUHUM instruction.
+ bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
+
+ /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+ /// VPKUWUM instruction.
+ bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
+
+ /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+ /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
+ bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+ bool isUnary);
+
+ /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+ /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
+ bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+ bool isUnary);
+
+ /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+ /// amount, otherwise return -1.
+ int isVSLDOIShuffleMask(SDNode *N, bool isUnary);
+
+ /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a splat of a single element that is suitable for input to
+ /// VSPLTB/VSPLTH/VSPLTW.
+ bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
+
+ /// isAllNegativeZeroVector - Returns true if all elements of build_vector
+ /// are -0.0.
+ bool isAllNegativeZeroVector(SDNode *N);
+
+ /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+ /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+ unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize);
+
+ /// get_VSPLTI_elt - If this is a build_vector of constants which can be
+ /// formed by using a vspltis[bhw] instruction of the specified element
+ /// size, return the constant being splatted. The ByteSize field indicates
+ /// the number of bytes of each element [124] -> [bhw].
+ SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+ }
+
+ class PPCTargetLowering : public TargetLowering {
+ int VarArgsFrameIndex; // FrameIndex for start of varargs area.
+ int VarArgsStackOffset; // StackOffset for start of stack
+ // arguments.
+ unsigned VarArgsNumGPR; // Index of the first unused integer
+ // register for parameter passing.
+ unsigned VarArgsNumFPR; // Index of the first unused double
+ // register for parameter passing.
+ int ReturnAddrIndex; // FrameIndex for return slot.
+ const PPCSubtarget &PPCSubTarget;
+ public:
+ explicit PPCTargetLowering(PPCTargetMachine &TM);
+
+ /// getTargetNodeName() - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ virtual MVT getSetCCResultType(MVT VT) const;
+
+ /// getPreIndexedAddressParts - Returns true if the node's address can be
+ /// legally represented as a pre-indexed load / store address, returning
+ /// the base pointer, offset, and addressing mode by reference.
+ virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const;
+
+ /// SelectAddressRegReg - Given the specified address, check to see if it
+ /// can be represented as an indexed [r+r] operation. Returns false if it
+ /// can be more efficiently represented with [r+imm].
+ bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
+ SelectionDAG &DAG) const;
+
+ /// SelectAddressRegImm - Returns true if the address N can be represented
+ /// by a base register plus a signed 16-bit displacement [r+imm], and if it
+ /// is not better represented as reg+reg.
+ bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
+ SelectionDAG &DAG) const;
+
+ /// SelectAddressRegRegOnly - Given the specified address, force it to be
+ /// represented as an indexed [r+r] operation.
+ bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
+ SelectionDAG &DAG) const;
+
+ /// SelectAddressRegImmShift - Returns true if the address N can be
+ /// represented by a base register plus a signed 14-bit displacement
+ /// [r+imm*4]. Suitable for use by STD and friends.
+ bool SelectAddressRegImmShift(SDValue N, SDValue &Disp, SDValue &Base,
+ SelectionDAG &DAG) const;
+
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG);
+
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+ MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+ MachineBasicBlock *MBB, bool is64Bit,
+ unsigned BinOpcode) const;
+ MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ bool is8bit, unsigned Opcode) const;
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. This is the actual
+ /// alignment, not its logarithm.
+ unsigned getByValTypeAlignment(const Type *Ty) const;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
+ /// true it means one of the asm constraints of the inline asm instruction
+ /// being processed is 'm'.
+ virtual void LowerAsmOperandForConstraint(SDValue Op,
+ char ConstraintLetter,
+ bool hasMemory,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+ /// isLegalAddressImmediate - Return true if the integer value can be used
+ /// as the offset of the target addressing mode for load / store of the
+ /// given type.
+ virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
+
+ /// isLegalAddressImmediate - Return true if the GlobalValue can be used as
+ /// the offset of the target addressing mode.
+ virtual bool isLegalAddressImmediate(GlobalValue *GV) const;
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Targets that want to do tail call
+ /// optimization should implement this function.
+ virtual bool IsEligibleForTailCallOptimization(CallSDNode *TheCall,
+ SDValue Ret,
+ SelectionDAG &DAG) const;
+
+ virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+ private:
+ SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
+ SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
+
+ SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
+ int SPDiff,
+ SDValue Chain,
+ SDValue &LROpOut,
+ SDValue &FPOpOut,
+ DebugLoc dl);
+
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+ int VarArgsFrameIndex, int VarArgsStackOffset,
+ unsigned VarArgsNumGPR, unsigned VarArgsNumFPR,
+ const PPCSubtarget &Subtarget);
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG, int VarArgsFrameIndex,
+ int VarArgsStackOffset, unsigned VarArgsNumGPR,
+ unsigned VarArgsNumFPR, const PPCSubtarget &Subtarget);
+ SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG,
+ int &VarArgsFrameIndex,
+ int &VarArgsStackOffset,
+ unsigned &VarArgsNumGPR,
+ unsigned &VarArgsNumFPR,
+ const PPCSubtarget &Subtarget);
+ SDValue LowerCALL(SDValue Op, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget, TargetMachine &TM);
+ SDValue LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM);
+ SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget);
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget);
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG, DebugLoc dl);
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerMUL(SDValue Op, SelectionDAG &DAG);
+ };
+}
+
+#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
new file mode 100644
index 0000000..417c8ed
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -0,0 +1,723 @@
+//===- PPCInstr64Bit.td - The PowerPC 64-bit Support -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC 64-bit instructions. These patterns are used
+// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands.
+//
+def s16imm64 : Operand<i64> {
+ let PrintMethod = "printS16ImmOperand";
+}
+def u16imm64 : Operand<i64> {
+ let PrintMethod = "printU16ImmOperand";
+}
+def symbolHi64 : Operand<i64> {
+ let PrintMethod = "printSymbolHi";
+}
+def symbolLo64 : Operand<i64> {
+ let PrintMethod = "printSymbolLo";
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit transformation functions.
+//
+
+def SHL64 : SDNodeXForm<imm, [{
+ // Transformation function: 63 - imm
+ return getI32Imm(63 - N->getZExtValue());
+}]>;
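+// For example, (shl x, 5) uses SHL64 to compute 63 - 5 = 58, the mask-end
+// of the RLDICR emitted by the SHL pattern at the bottom of this file.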
+
+def SRL64 : SDNodeXForm<imm, [{
+ // Transformation function: 64 - imm
+ return N->getZExtValue() ? getI32Imm(64 - N->getZExtValue()) : getI32Imm(0);
+}]>;
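+// For example, (srl x, 5) uses SRL64 to compute 64 - 5 = 59 as the RLDICL
+// rotate amount; the original shift amount becomes the mask-begin.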
+
+def HI32_48 : SDNodeXForm<imm, [{
+ // Transformation function: shift the immediate value down into the low bits.
+ return getI32Imm((unsigned short)(N->getZExtValue() >> 32));
+}]>;
+
+def HI48_64 : SDNodeXForm<imm, [{
+ // Transformation function: shift the immediate value down into the low bits.
+ return getI32Imm((unsigned short)(N->getZExtValue() >> 48));
+}]>;
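+// For example, for N = 0x0123456789ABCDEF, HI32_48 yields 0x4567 and
+// HI48_64 yields 0x0123 (bits [47:32] and [63:48] respectively).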
+
+
+//===----------------------------------------------------------------------===//
+// Calls.
+//
+
+let Defs = [LR8] in
+ def MovePCtoLR8 : Pseudo<(outs), (ins piclabel:$label), "bl $label", []>,
+ PPC970_Unit_BRU;
+
+// Macho ABI Calls.
+let isCall = 1, PPC970_Unit = 7,
+ // All calls clobber the PPC64 non-callee saved registers.
+ Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,
+ F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+ V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+ LR8,CTR8,
+ CR0,CR1,CR5,CR6,CR7] in {
+ // Convenient aliases for call instructions
+ let Uses = [RM] in {
+ def BL8_Macho : IForm<18, 0, 1,
+ (outs), (ins calltarget:$func, variable_ops),
+ "bl $func", BrB, []>; // See Pat patterns below.
+ def BLA8_Macho : IForm<18, 1, 1,
+ (outs), (ins aaddr:$func, variable_ops),
+ "bla $func", BrB, [(PPCcall_Macho (i64 imm:$func))]>;
+ }
+ let Uses = [CTR8, RM] in {
+ def BCTRL8_Macho : XLForm_2_ext<19, 528, 20, 0, 1,
+ (outs), (ins variable_ops),
+ "bctrl", BrB,
+ [(PPCbctrl_Macho)]>, Requires<[In64BitMode]>;
+ }
+}
+
+// ELF 64 ABI calls are handled the same way as the Macho ABI calls above;
+// this block defines BL8_ELF and BLA8_ELF.
+let isCall = 1, PPC970_Unit = 7,
+ // All calls clobber the PPC64 non-callee saved registers.
+ Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,
+ F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+ V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+ LR8,CTR8,
+ CR0,CR1,CR5,CR6,CR7] in {
+ // Convenient aliases for call instructions
+ let Uses = [RM] in {
+ def BL8_ELF : IForm<18, 0, 1,
+ (outs), (ins calltarget:$func, variable_ops),
+ "bl $func", BrB, []>; // See Pat patterns below.
+ def BLA8_ELF : IForm<18, 1, 1,
+ (outs), (ins aaddr:$func, variable_ops),
+ "bla $func", BrB, [(PPCcall_ELF (i64 imm:$func))]>;
+ }
+ let Uses = [CTR8, RM] in {
+ def BCTRL8_ELF : XLForm_2_ext<19, 528, 20, 0, 1,
+ (outs), (ins variable_ops),
+ "bctrl", BrB,
+ [(PPCbctrl_ELF)]>, Requires<[In64BitMode]>;
+ }
+}
+
+
+// Calls
+def : Pat<(PPCcall_Macho (i64 tglobaladdr:$dst)),
+ (BL8_Macho tglobaladdr:$dst)>;
+def : Pat<(PPCcall_Macho (i64 texternalsym:$dst)),
+ (BL8_Macho texternalsym:$dst)>;
+
+def : Pat<(PPCcall_ELF (i64 tglobaladdr:$dst)),
+ (BL8_ELF tglobaladdr:$dst)>;
+def : Pat<(PPCcall_ELF (i64 texternalsym:$dst)),
+ (BL8_ELF texternalsym:$dst)>;
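+// Calls to known symbols select the BL8 forms above and calls to absolute
+// immediate addresses select the BLA8 forms; other indirect calls are
+// presumably routed through MTCTR8/BCTRL8 (see the SPR section below).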
+
+// Atomic operations
+let usesCustomDAGSchedInserter = 1 in {
+ let Uses = [CR0] in {
+ def ATOMIC_LOAD_ADD_I64 : Pseudo<
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+ "${:comment} ATOMIC_LOAD_ADD_I64 PSEUDO!",
+ [(set G8RC:$dst, (atomic_load_add_64 xoaddr:$ptr, G8RC:$incr))]>;
+ def ATOMIC_LOAD_SUB_I64 : Pseudo<
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+ "${:comment} ATOMIC_LOAD_SUB_I64 PSEUDO!",
+ [(set G8RC:$dst, (atomic_load_sub_64 xoaddr:$ptr, G8RC:$incr))]>;
+ def ATOMIC_LOAD_OR_I64 : Pseudo<
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+ "${:comment} ATOMIC_LOAD_OR_I64 PSEUDO!",
+ [(set G8RC:$dst, (atomic_load_or_64 xoaddr:$ptr, G8RC:$incr))]>;
+ def ATOMIC_LOAD_XOR_I64 : Pseudo<
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+ "${:comment} ATOMIC_LOAD_XOR_I64 PSEUDO!",
+ [(set G8RC:$dst, (atomic_load_xor_64 xoaddr:$ptr, G8RC:$incr))]>;
+ def ATOMIC_LOAD_AND_I64 : Pseudo<
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+ "${:comment} ATOMIC_LOAD_AND_I64 PSEUDO!",
+ [(set G8RC:$dst, (atomic_load_and_64 xoaddr:$ptr, G8RC:$incr))]>;
+ def ATOMIC_LOAD_NAND_I64 : Pseudo<
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+ "${:comment} ATOMIC_LOAD_NAND_I64 PSEUDO!",
+ [(set G8RC:$dst, (atomic_load_nand_64 xoaddr:$ptr, G8RC:$incr))]>;
+
+ def ATOMIC_CMP_SWAP_I64 : Pseudo<
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$old, G8RC:$new),
+ "${:comment} ATOMIC_CMP_SWAP_I64 PSEUDO!",
+ [(set G8RC:$dst,
+ (atomic_cmp_swap_64 xoaddr:$ptr, G8RC:$old, G8RC:$new))]>;
+
+ def ATOMIC_SWAP_I64 : Pseudo<
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$new),
+ "${:comment} ATOMIC_SWAP_I64 PSEUDO!",
+ [(set G8RC:$dst, (atomic_swap_64 xoaddr:$ptr, G8RC:$new))]>;
+ }
+}
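+// A sketch of the expansion: each pseudo above is expanded by the custom
+// inserter (EmitAtomicBinary and friends in PPCISelLowering) into a
+// ldarx/stdcx. retry loop built from the instructions defined below.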
+
+// Instructions to support atomic operations
+def LDARX : XForm_1<31, 84, (outs G8RC:$rD), (ins memrr:$ptr),
+ "ldarx $rD, $ptr", LdStLDARX,
+ [(set G8RC:$rD, (PPClarx xoaddr:$ptr))]>;
+
+let Defs = [CR0] in
+def STDCX : XForm_1<31, 214, (outs), (ins G8RC:$rS, memrr:$dst),
+ "stdcx. $rS, $dst", LdStSTDCX,
+ [(PPCstcx G8RC:$rS, xoaddr:$dst)]>,
+ isDOT;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi8 :Pseudo< (outs),
+ (ins calltarget:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURNd8 $dst $offset",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops),
+ "#TC_RETURNa8 $func $offset",
+ [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURNr8 $dst $offset",
+ []>;
+
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+ isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
+def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+ Requires<[In64BitMode]>;
+
+
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB8 : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+ "b $dst", BrB,
+ []>;
+
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA8 : IForm<18, 0, 0, (outs), (ins aaddr:$dst),
+ "ba $dst", BrB,
+ []>;
+
+def : Pat<(PPCtc_return (i64 tglobaladdr:$dst), imm:$imm),
+ (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
+ (TCRETURNdi8 texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
+ (TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
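+// TC_RETURN carries the callee and the stack adjustment (see the node
+// comment in PPCISelLowering.h); the TAILB8/TAILBA8/TAILBCTR8 branches
+// above are the terminators the pseudos eventually become.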
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit SPR manipulation instrs.
+
+let Uses = [CTR8] in {
+def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs G8RC:$rT), (ins),
+ "mfctr $rT", SprMFSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Pattern = [(PPCmtctr G8RC:$rS)], Defs = [CTR8] in {
+def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins G8RC:$rS),
+ "mtctr $rS", SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Defs = [X1], Uses = [X1] in
+def DYNALLOC8 : Pseudo<(outs G8RC:$result), (ins G8RC:$negsize, memri:$fpsi),
+ "${:comment} DYNALLOC8 $result, $negsize, $fpsi",
+ [(set G8RC:$result,
+ (PPCdynalloc G8RC:$negsize, iaddr:$fpsi))]>;
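+// DYNALLOC8 is the 64-bit form of the DYNALLOC node documented in
+// PPCISelLowering.h; PPCRegisterInfo::eliminateFrameIndex rewrites it into
+// the actual stack-pointer arithmetic.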
+
+let Defs = [LR8] in {
+def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins G8RC:$rS),
+ "mtlr $rS", SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR8] in {
+def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins),
+ "mflr $rT", SprMFSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+//===----------------------------------------------------------------------===//
+// Fixed point instructions.
+//
+
+let PPC970_Unit = 1 in { // FXU Operations.
+
+// Copies, extends, truncates.
+def OR4To8 : XForm_6<31, 444, (outs G8RC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "or $rA, $rS, $rB", IntGeneral,
+ []>;
+def OR8To4 : XForm_6<31, 444, (outs GPRC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "or $rA, $rS, $rB", IntGeneral,
+ []>;
+
+def LI8 : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm),
+ "li $rD, $imm", IntGeneral,
+ [(set G8RC:$rD, immSExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm),
+ "lis $rD, $imm", IntGeneral,
+ [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>;
+
+// Logical ops.
+def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "nand $rA, $rS, $rB", IntGeneral,
+ [(set G8RC:$rA, (not (and G8RC:$rS, G8RC:$rB)))]>;
+def AND8 : XForm_6<31, 28, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "and $rA, $rS, $rB", IntGeneral,
+ [(set G8RC:$rA, (and G8RC:$rS, G8RC:$rB))]>;
+def ANDC8: XForm_6<31, 60, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "andc $rA, $rS, $rB", IntGeneral,
+ [(set G8RC:$rA, (and G8RC:$rS, (not G8RC:$rB)))]>;
+def OR8 : XForm_6<31, 444, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "or $rA, $rS, $rB", IntGeneral,
+ [(set G8RC:$rA, (or G8RC:$rS, G8RC:$rB))]>;
+def NOR8 : XForm_6<31, 124, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "nor $rA, $rS, $rB", IntGeneral,
+ [(set G8RC:$rA, (not (or G8RC:$rS, G8RC:$rB)))]>;
+def ORC8 : XForm_6<31, 412, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "orc $rA, $rS, $rB", IntGeneral,
+ [(set G8RC:$rA, (or G8RC:$rS, (not G8RC:$rB)))]>;
+def EQV8 : XForm_6<31, 284, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "eqv $rA, $rS, $rB", IntGeneral,
+ [(set G8RC:$rA, (not (xor G8RC:$rS, G8RC:$rB)))]>;
+def XOR8 : XForm_6<31, 316, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+ "xor $rA, $rS, $rB", IntGeneral,
+ [(set G8RC:$rA, (xor G8RC:$rS, G8RC:$rB))]>;
+
+// Logical ops with immediate.
+def ANDIo8 : DForm_4<28, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+ "andi. $dst, $src1, $src2", IntGeneral,
+ [(set G8RC:$dst, (and G8RC:$src1, immZExt16:$src2))]>,
+ isDOT;
+def ANDISo8 : DForm_4<29, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+ "andis. $dst, $src1, $src2", IntGeneral,
+ [(set G8RC:$dst, (and G8RC:$src1,imm16ShiftedZExt:$src2))]>,
+ isDOT;
+def ORI8 : DForm_4<24, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+ "ori $dst, $src1, $src2", IntGeneral,
+ [(set G8RC:$dst, (or G8RC:$src1, immZExt16:$src2))]>;
+def ORIS8 : DForm_4<25, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+ "oris $dst, $src1, $src2", IntGeneral,
+ [(set G8RC:$dst, (or G8RC:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI8 : DForm_4<26, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+ "xori $dst, $src1, $src2", IntGeneral,
+ [(set G8RC:$dst, (xor G8RC:$src1, immZExt16:$src2))]>;
+def XORIS8 : DForm_4<27, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+ "xoris $dst, $src1, $src2", IntGeneral,
+ [(set G8RC:$dst, (xor G8RC:$src1, imm16ShiftedZExt:$src2))]>;
+
+def ADD8 : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "add $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (add G8RC:$rA, G8RC:$rB))]>;
+
+def ADDC8 : XOForm_1<31, 10, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "addc $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (addc G8RC:$rA, G8RC:$rB))]>,
+ PPC970_DGroup_Cracked;
+def ADDE8 : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "adde $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (adde G8RC:$rA, G8RC:$rB))]>;
+
+def ADDI8 : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
+ "addi $rD, $rA, $imm", IntGeneral,
+ [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (outs G8RC:$rD), (ins G8RC:$rA, symbolHi64:$imm),
+ "addis $rD, $rA, $imm", IntGeneral,
+ [(set G8RC:$rD, (add G8RC:$rA, imm16ShiftedSExt:$imm))]>;
+
+def SUBFIC8: DForm_2< 8, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
+ "subfic $rD, $rA, $imm", IntGeneral,
+ [(set G8RC:$rD, (subc immSExt16:$imm, G8RC:$rA))]>;
+def SUBF8 : XOForm_1<31, 40, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "subf $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>;
+def SUBFC8 : XOForm_1<31, 8, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "subfc $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (subc G8RC:$rB, G8RC:$rA))]>,
+ PPC970_DGroup_Cracked;
+
+def SUBFE8 : XOForm_1<31, 136, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "subfe $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (sube G8RC:$rB, G8RC:$rA))]>;
+def ADDME8 : XOForm_3<31, 234, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+ "addme $rT, $rA", IntGeneral,
+ [(set G8RC:$rT, (adde G8RC:$rA, immAllOnes))]>;
+def ADDZE8 : XOForm_3<31, 202, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+ "addze $rT, $rA", IntGeneral,
+ [(set G8RC:$rT, (adde G8RC:$rA, 0))]>;
+def NEG8 : XOForm_3<31, 104, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+ "neg $rT, $rA", IntGeneral,
+ [(set G8RC:$rT, (ineg G8RC:$rA))]>;
+def SUBFME8 : XOForm_3<31, 232, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+ "subfme $rT, $rA", IntGeneral,
+ [(set G8RC:$rT, (sube immAllOnes, G8RC:$rA))]>;
+def SUBFZE8 : XOForm_3<31, 200, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+ "subfze $rT, $rA", IntGeneral,
+ [(set G8RC:$rT, (sube 0, G8RC:$rA))]>;
+
+
+
+def MULHD : XOForm_1<31, 73, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "mulhd $rT, $rA, $rB", IntMulHW,
+ [(set G8RC:$rT, (mulhs G8RC:$rA, G8RC:$rB))]>;
+def MULHDU : XOForm_1<31, 9, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "mulhdu $rT, $rA, $rB", IntMulHWU,
+ [(set G8RC:$rT, (mulhu G8RC:$rA, G8RC:$rB))]>;
+
+def CMPD : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins G8RC:$rA, G8RC:$rB),
+ "cmpd $crD, $rA, $rB", IntCompare>, isPPC64;
+def CMPLD : XForm_16_ext<31, 32, (outs CRRC:$crD), (ins G8RC:$rA, G8RC:$rB),
+ "cmpld $crD, $rA, $rB", IntCompare>, isPPC64;
+def CMPDI : DForm_5_ext<11, (outs CRRC:$crD), (ins G8RC:$rA, s16imm:$imm),
+ "cmpdi $crD, $rA, $imm", IntCompare>, isPPC64;
+def CMPLDI : DForm_6_ext<10, (outs CRRC:$dst), (ins G8RC:$src1, u16imm:$src2),
+ "cmpldi $dst, $src1, $src2", IntCompare>, isPPC64;
+
+def SLD : XForm_6<31, 27, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+ "sld $rA, $rS, $rB", IntRotateD,
+ [(set G8RC:$rA, (PPCshl G8RC:$rS, GPRC:$rB))]>, isPPC64;
+def SRD : XForm_6<31, 539, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+ "srd $rA, $rS, $rB", IntRotateD,
+ [(set G8RC:$rA, (PPCsrl G8RC:$rS, GPRC:$rB))]>, isPPC64;
+def SRAD : XForm_6<31, 794, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+ "srad $rA, $rS, $rB", IntRotateD,
+ [(set G8RC:$rA, (PPCsra G8RC:$rS, GPRC:$rB))]>, isPPC64;
+
+def EXTSB8 : XForm_11<31, 954, (outs G8RC:$rA), (ins G8RC:$rS),
+ "extsb $rA, $rS", IntGeneral,
+ [(set G8RC:$rA, (sext_inreg G8RC:$rS, i8))]>;
+def EXTSH8 : XForm_11<31, 922, (outs G8RC:$rA), (ins G8RC:$rS),
+ "extsh $rA, $rS", IntGeneral,
+ [(set G8RC:$rA, (sext_inreg G8RC:$rS, i16))]>;
+
+def EXTSW : XForm_11<31, 986, (outs G8RC:$rA), (ins G8RC:$rS),
+ "extsw $rA, $rS", IntGeneral,
+ [(set G8RC:$rA, (sext_inreg G8RC:$rS, i32))]>, isPPC64;
+/// EXTSW_32 - Just like EXTSW, but works on '32-bit' registers.
+def EXTSW_32 : XForm_11<31, 986, (outs GPRC:$rA), (ins GPRC:$rS),
+ "extsw $rA, $rS", IntGeneral,
+ [(set GPRC:$rA, (PPCextsw_32 GPRC:$rS))]>, isPPC64;
+def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS),
+ "extsw $rA, $rS", IntGeneral,
+ [(set G8RC:$rA, (sext GPRC:$rS))]>, isPPC64;
+
+def SRADI : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH),
+ "sradi $rA, $rS, $SH", IntRotateD,
+ [(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64;
+def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS),
+ "cntlzd $rA, $rS", IntGeneral,
+ [(set G8RC:$rA, (ctlz G8RC:$rS))]>;
+
+def DIVD : XOForm_1<31, 489, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "divd $rT, $rA, $rB", IntDivD,
+ [(set G8RC:$rT, (sdiv G8RC:$rA, G8RC:$rB))]>, isPPC64,
+ PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def DIVDU : XOForm_1<31, 457, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "divdu $rT, $rA, $rB", IntDivD,
+ [(set G8RC:$rT, (udiv G8RC:$rA, G8RC:$rB))]>, isPPC64,
+ PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def MULLD : XOForm_1<31, 233, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "mulld $rT, $rA, $rB", IntMulHD,
+ [(set G8RC:$rT, (mul G8RC:$rA, G8RC:$rB))]>, isPPC64;
+
+
+let isCommutable = 1 in {
+def RLDIMI : MDForm_1<30, 3,
+ (outs G8RC:$rA), (ins G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB),
+ "rldimi $rA, $rS, $SH, $MB", IntRotateD,
+ []>, isPPC64, RegConstraint<"$rSi = $rA">,
+ NoEncode<"$rSi">;
+}
+
+// Rotate instructions.
+def RLDCL : MDForm_1<30, 0,
+ (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB, u6imm:$MB),
+ "rldcl $rA, $rS, $rB, $MB", IntRotateD,
+ []>, isPPC64;
+def RLDICL : MDForm_1<30, 0,
+ (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MB),
+ "rldicl $rA, $rS, $SH, $MB", IntRotateD,
+ []>, isPPC64;
+def RLDICR : MDForm_1<30, 1,
+ (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$ME),
+ "rldicr $rA, $rS, $SH, $ME", IntRotateD,
+ []>, isPPC64;
+} // End FXU Operations.
+
+
+//===----------------------------------------------------------------------===//
+// Load/Store instructions.
+//
+
+
+// Sign extending loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LHA8: DForm_1<42, (outs G8RC:$rD), (ins memri:$src),
+ "lha $rD, $src", LdStLHA,
+ [(set G8RC:$rD, (sextloadi16 iaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LWA : DSForm_1<58, 2, (outs G8RC:$rD), (ins memrix:$src),
+ "lwa $rD, $src", LdStLWA,
+ [(set G8RC:$rD, (sextloadi32 ixaddr:$src))]>, isPPC64,
+ PPC970_DGroup_Cracked;
+def LHAX8: XForm_1<31, 343, (outs G8RC:$rD), (ins memrr:$src),
+ "lhax $rD, $src", LdStLHA,
+ [(set G8RC:$rD, (sextloadi16 xaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LWAX : XForm_1<31, 341, (outs G8RC:$rD), (ins memrr:$src),
+ "lwax $rD, $src", LdStLHA,
+ [(set G8RC:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+ PPC970_DGroup_Cracked;
+
+// Update forms.
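+// The "u" (update) forms also write the effective address back into the
+// base register, modelled here by the extra $ea_result output that
+// RegConstraint ties to the input pointer register.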
+let mayLoad = 1 in
+def LHAU8 : DForm_1<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp,
+ ptr_rc:$rA),
+ "lhau $rD, $disp($rA)", LdStGeneral,
+ []>, RegConstraint<"$rA = $ea_result">,
+ NoEncode<"$ea_result">;
+// NO LWAU! The ISA provides no lwau instruction.
+
+}
+
+// Zero extending loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZ8 : DForm_1<34, (outs G8RC:$rD), (ins memri:$src),
+ "lbz $rD, $src", LdStGeneral,
+ [(set G8RC:$rD, (zextloadi8 iaddr:$src))]>;
+def LHZ8 : DForm_1<40, (outs G8RC:$rD), (ins memri:$src),
+ "lhz $rD, $src", LdStGeneral,
+ [(set G8RC:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ8 : DForm_1<32, (outs G8RC:$rD), (ins memri:$src),
+ "lwz $rD, $src", LdStGeneral,
+ [(set G8RC:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+
+def LBZX8 : XForm_1<31, 87, (outs G8RC:$rD), (ins memrr:$src),
+ "lbzx $rD, $src", LdStGeneral,
+ [(set G8RC:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1<31, 279, (outs G8RC:$rD), (ins memrr:$src),
+ "lhzx $rD, $src", LdStGeneral,
+ [(set G8RC:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1<31, 23, (outs G8RC:$rD), (ins memrr:$src),
+ "lwzx $rD, $src", LdStGeneral,
+ [(set G8RC:$rD, (zextloadi32 xaddr:$src))]>;
+
+
+// Update forms.
+let mayLoad = 1 in {
+def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lbzu $rD, $addr", LdStGeneral,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lhzu $rD, $addr", LdStGeneral,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lwzu $rD, $addr", LdStGeneral,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+}
+}
+
+
+// Full 8-byte loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LD : DSForm_1<58, 0, (outs G8RC:$rD), (ins memrix:$src),
+ "ld $rD, $src", LdStLD,
+ [(set G8RC:$rD, (load ixaddr:$src))]>, isPPC64;
+def LDX : XForm_1<31, 21, (outs G8RC:$rD), (ins memrr:$src),
+ "ldx $rD, $src", LdStLD,
+ [(set G8RC:$rD, (load xaddr:$src))]>, isPPC64;
+
+let mayLoad = 1 in
+def LDU : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr),
+ "ldu $rD, $addr", LdStLD,
+ []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+ NoEncode<"$ea_result">;
+
+}
+
+let PPC970_Unit = 2 in {
+// Truncating stores.
+def STB8 : DForm_1<38, (outs), (ins G8RC:$rS, memri:$src),
+ "stb $rS, $src", LdStGeneral,
+ [(truncstorei8 G8RC:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (outs), (ins G8RC:$rS, memri:$src),
+ "sth $rS, $src", LdStGeneral,
+ [(truncstorei16 G8RC:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (outs), (ins G8RC:$rS, memri:$src),
+ "stw $rS, $src", LdStGeneral,
+ [(truncstorei32 G8RC:$rS, iaddr:$src)]>;
+def STBX8 : XForm_8<31, 215, (outs), (ins G8RC:$rS, memrr:$dst),
+ "stbx $rS, $dst", LdStGeneral,
+ [(truncstorei8 G8RC:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (outs), (ins G8RC:$rS, memrr:$dst),
+ "sthx $rS, $dst", LdStGeneral,
+ [(truncstorei16 G8RC:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (outs), (ins G8RC:$rS, memrr:$dst),
+ "stwx $rS, $dst", LdStGeneral,
+ [(truncstorei32 G8RC:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+// Normal 8-byte stores.
+def STD : DSForm_1<62, 0, (outs), (ins G8RC:$rS, memrix:$dst),
+ "std $rS, $dst", LdStSTD,
+ [(store G8RC:$rS, ixaddr:$dst)]>, isPPC64;
+def STDX : XForm_8<31, 149, (outs), (ins G8RC:$rS, memrr:$dst),
+ "stdx $rS, $dst", LdStSTD,
+ [(store G8RC:$rS, xaddr:$dst)]>, isPPC64,
+ PPC970_DGroup_Cracked;
+}
+
+let PPC970_Unit = 2 in {
+
+def STBU8 : DForm_1<38, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "stbu $rS, $ptroff($ptrreg)", LdStGeneral,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STHU8 : DForm_1<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "sthu $rS, $ptroff($ptrreg)", LdStGeneral,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "stwu $rS, $ptroff($ptrreg)", LdStGeneral,
+ [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+
+
+def STDU : DSForm_1<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+ s16immX4:$ptroff, ptr_rc:$ptrreg),
+ "stdu $rS, $ptroff($ptrreg)", LdStSTD,
+ [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
+ isPPC64;
+
+let mayStore = 1 in
+def STDUX : XForm_8<31, 181, (outs), (ins G8RC:$rS, memrr:$dst),
+ "stdux $rS, $dst", LdStSTD,
+ []>, isPPC64;
+
+// STD_32/STDX_32 - Just like STD/STDX, but uses a '32-bit' input register.
+def STD_32 : DSForm_1<62, 0, (outs), (ins GPRC:$rT, memrix:$dst),
+ "std $rT, $dst", LdStSTD,
+ [(PPCstd_32 GPRC:$rT, ixaddr:$dst)]>, isPPC64;
+def STDX_32 : XForm_8<31, 149, (outs), (ins GPRC:$rT, memrr:$dst),
+ "stdx $rT, $dst", LdStSTD,
+ [(PPCstd_32 GPRC:$rT, xaddr:$dst)]>, isPPC64,
+ PPC970_DGroup_Cracked;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Floating point instructions.
+//
+
+
+let PPC970_Unit = 3, Uses = [RM] in { // FPU Operations.
+def FCFID : XForm_26<63, 846, (outs F8RC:$frD), (ins F8RC:$frB),
+ "fcfid $frD, $frB", FPGeneral,
+ [(set F8RC:$frD, (PPCfcfid F8RC:$frB))]>, isPPC64;
+def FCTIDZ : XForm_26<63, 815, (outs F8RC:$frD), (ins F8RC:$frB),
+ "fctidz $frD, $frB", FPGeneral,
+ [(set F8RC:$frD, (PPCfctidz F8RC:$frB))]>, isPPC64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Patterns
+//
+
+// Extensions and truncates to/from 32-bit regs.
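+// The zext below copies the value into a 64-bit register with OR4To8 and
+// clears the high 32 bits with "rldicl x, 0, 32" (the mask keeps PPC bits
+// 32-63, i.e. the low word); anyext and trunc are plain register copies.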
+def : Pat<(i64 (zext GPRC:$in)),
+ (RLDICL (OR4To8 GPRC:$in, GPRC:$in), 0, 32)>;
+def : Pat<(i64 (anyext GPRC:$in)),
+ (OR4To8 GPRC:$in, GPRC:$in)>;
+def : Pat<(i32 (trunc G8RC:$in)),
+ (OR8To4 G8RC:$in, G8RC:$in)>;
+
+// Extending loads with i64 targets.
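+// i1 values are stored in memory as a byte holding 0 or 1, so the plain
+// zero-extending byte loads LBZ8/LBZX8 also cover the i1 cases below.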
+def : Pat<(zextloadi1 iaddr:$src),
+ (LBZ8 iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+ (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+ (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+ (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+ (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+ (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+ (LHZ8 iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+ (LHZX8 xaddr:$src)>;
+def : Pat<(extloadi32 iaddr:$src),
+ (LWZ8 iaddr:$src)>;
+def : Pat<(extloadi32 xaddr:$src),
+ (LWZX8 xaddr:$src)>;
+
+// Standard shifts. These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 6-bit and 7-bit shift
+// amounts.
+def : Pat<(sra G8RC:$rS, GPRC:$rB),
+ (SRAD G8RC:$rS, GPRC:$rB)>;
+def : Pat<(srl G8RC:$rS, GPRC:$rB),
+ (SRD G8RC:$rS, GPRC:$rB)>;
+def : Pat<(shl G8RC:$rS, GPRC:$rB),
+ (SLD G8RC:$rS, GPRC:$rB)>;
+
+// SHL/SRL
+def : Pat<(shl G8RC:$in, (i32 imm:$imm)),
+ (RLDICR G8RC:$in, imm:$imm, (SHL64 imm:$imm))>;
+def : Pat<(srl G8RC:$in, (i32 imm:$imm)),
+ (RLDICL G8RC:$in, (SRL64 imm:$imm), imm:$imm)>;
+
+// ROTL
+def : Pat<(rotl G8RC:$in, GPRC:$sh),
+ (RLDCL G8RC:$in, GPRC:$sh, 0)>;
+def : Pat<(rotl G8RC:$in, (i32 imm:$imm)),
+ (RLDICL G8RC:$in, imm:$imm, 0)>;
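+// e.g. (rotl x, 8) becomes "rldicl x, 8, 0": rotate left by 8 with a mask
+// that keeps all 64 bits.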
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI8 tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in , 0), (LI8 tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>;
+def : Pat<(add G8RC:$in, (PPChi tglobaladdr:$g, 0)),
+ (ADDIS8 G8RC:$in, tglobaladdr:$g)>;
+def : Pat<(add G8RC:$in, (PPChi tconstpool:$g, 0)),
+ (ADDIS8 G8RC:$in, tconstpool:$g)>;
+def : Pat<(add G8RC:$in, (PPChi tjumptable:$g, 0)),
+ (ADDIS8 G8RC:$in, tjumptable:$g)>;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
new file mode 100644
index 0000000..9a5be79
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -0,0 +1,668 @@
+//===- PPCInstrAltivec.td - The PowerPC Altivec Extension --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Altivec extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Altivec transformation functions and pattern fragments.
+//
+
+
+def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), false);
+}]>;
+def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), false);
+}]>;
+def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), true);
+}]>;
+def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), true);
+}]>;
+
+
+def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false);
+}]>;
+def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false);
+}]>;
+def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false);
+}]>;
+def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false);
+}]>;
+def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false);
+}]>;
+def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false);
+}]>;
+
+
+def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true);
+}]>;
+def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true);
+}]>;
+def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true);
+}]>;
+def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true);
+}]>;
+def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true);
+}]>;
+def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true);
+}]>;
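+// The "unary" variants above match shuffles of the form (X, undef), which
+// the DAG combiner forms from shuffles whose two vector inputs are the
+// same (compare the VSLDOI_unary comment below).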
+
+
+def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::isVSLDOIShuffleMask(N, false));
+}]>;
+def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVSLDOIShuffleMask(N, false) != -1;
+}], VSLDOI_get_imm>;
+
+
+/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
+/// vector_shuffle(X,undef,mask) by the dag combiner.
+def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::isVSLDOIShuffleMask(N, true));
+}]>;
+def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isVSLDOIShuffleMask(N, true) != -1;
+}], VSLDOI_unary_get_imm>;
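+
+// For example (illustrative): a shuffle of x with itself whose mask selects
+// bytes <3,4,...,15,0,1,2> is canonicalized by the dag combiner to
+// vector_shuffle(x, undef, mask); isVSLDOIShuffleMask(N, true) then
+// recognizes it as vsldoi(x,x,3), i.e. bytes 3..18 of the double-width x||x.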
+
+
+// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
+def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::getVSPLTImmediate(N, 1));
+}]>;
+def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
+}], VSPLTB_get_imm>;
+def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::getVSPLTImmediate(N, 2));
+}]>;
+def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
+}], VSPLTH_get_imm>;
+def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
+ return getI32Imm(PPC::getVSPLTImmediate(N, 4));
+}]>;
+def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4);
+}], VSPLTW_get_imm>;
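+
+// For example, a v16i8 vector_shuffle whose mask is all 5s splats byte 5 of
+// the input; vspltb_shuffle matches it, and VSPLTB_get_imm produces the
+// immediate for "vspltb $vD, $vB, 5".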
+
+
+// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm.
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{
+ return PPC::get_VSPLTI_elt(N, 1, *CurDAG);
+}]>;
+def vecspltisb : PatLeaf<(build_vector), [{
+ return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != 0;
+}], VSPLTISB_get_imm>;
+
+// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm.
+def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{
+ return PPC::get_VSPLTI_elt(N, 2, *CurDAG);
+}]>;
+def vecspltish : PatLeaf<(build_vector), [{
+ return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != 0;
+}], VSPLTISH_get_imm>;
+
+// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm.
+def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{
+ return PPC::get_VSPLTI_elt(N, 4, *CurDAG);
+}]>;
+def vecspltisw : PatLeaf<(build_vector), [{
+ return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != 0;
+}], VSPLTISW_get_imm>;
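+
+// Broadly, get_VSPLTI_elt succeeds when the build_vector is a splat of a
+// value representable as a sign-extended 5-bit immediate at the requested
+// element size; e.g. a v4i32 constant of all 12s can be materialized as
+// "vspltisw $vD, 12".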
+
+def V_immneg0 : PatLeaf<(build_vector), [{
+ return PPC::isAllNegativeZeroVector(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// VA1a_Int - A VAForm_1a intrinsic definition.
+class VA1a_Int<bits<6> xo, string opc, Intrinsic IntID>
+ : VAForm_1a<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB, VRRC:$vC),
+ !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+ [(set VRRC:$vD, (IntID VRRC:$vA, VRRC:$vB, VRRC:$vC))]>;
+
+// VX1_Int - A VXForm_1 intrinsic definition.
+class VX1_Int<bits<11> xo, string opc, Intrinsic IntID>
+ : VXForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+ [(set VRRC:$vD, (IntID VRRC:$vA, VRRC:$vB))]>;
+
+// VX2_Int - A VXForm_2 intrinsic definition.
+class VX2_Int<bits<11> xo, string opc, Intrinsic IntID>
+ : VXForm_2<xo, (outs VRRC:$vD), (ins VRRC:$vB),
+ !strconcat(opc, " $vD, $vB"), VecFP,
+ [(set VRRC:$vD, (IntID VRRC:$vB))]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def DSS : DSS_Form<822, (outs),
+ (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
+ "dss $STRM", LdStGeneral /*FIXME*/, []>;
+def DSSALL : DSS_Form<822, (outs),
+ (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
+ "dssall", LdStGeneral /*FIXME*/, []>;
+def DST : DSS_Form<342, (outs),
+ (ins u5imm:$ZERO, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+ "dst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTT : DSS_Form<342, (outs),
+ (ins u5imm:$ONE, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+ "dstt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTST : DSS_Form<374, (outs),
+ (ins u5imm:$ZERO, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+ "dstst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTSTT : DSS_Form<374, (outs),
+ (ins u5imm:$ONE, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+ "dststt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+
+def DST64 : DSS_Form<342, (outs),
+ (ins u5imm:$ZERO, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+ "dst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTT64 : DSS_Form<342, (outs),
+ (ins u5imm:$ONE, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+ "dstt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTST64 : DSS_Form<374, (outs),
+ (ins u5imm:$ZERO, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+ "dstst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTSTT64 : DSS_Form<374, (outs),
+ (ins u5imm:$ONE, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+ "dststt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+
+def MFVSCR : VXForm_4<1540, (outs VRRC:$vD), (ins),
+ "mfvscr $vD", LdStGeneral,
+ [(set VRRC:$vD, (int_ppc_altivec_mfvscr))]>;
+def MTVSCR : VXForm_5<1604, (outs), (ins VRRC:$vB),
+ "mtvscr $vB", LdStGeneral,
+ [(int_ppc_altivec_mtvscr VRRC:$vB)]>;
+
+let canFoldAsLoad = 1, PPC970_Unit = 2 in { // Loads.
+def LVEBX: XForm_1<31, 7, (outs VRRC:$vD), (ins memrr:$src),
+ "lvebx $vD, $src", LdStGeneral,
+ [(set VRRC:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
+def LVEHX: XForm_1<31, 39, (outs VRRC:$vD), (ins memrr:$src),
+ "lvehx $vD, $src", LdStGeneral,
+ [(set VRRC:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
+def LVEWX: XForm_1<31, 71, (outs VRRC:$vD), (ins memrr:$src),
+ "lvewx $vD, $src", LdStGeneral,
+ [(set VRRC:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
+def LVX : XForm_1<31, 103, (outs VRRC:$vD), (ins memrr:$src),
+ "lvx $vD, $src", LdStGeneral,
+ [(set VRRC:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
+def LVXL : XForm_1<31, 359, (outs VRRC:$vD), (ins memrr:$src),
+ "lvxl $vD, $src", LdStGeneral,
+ [(set VRRC:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
+}
+
+def LVSL : XForm_1<31, 6, (outs VRRC:$vD), (ins memrr:$src),
+ "lvsl $vD, $src", LdStGeneral,
+ [(set VRRC:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
+ PPC970_Unit_LSU;
+def LVSR : XForm_1<31, 38, (outs VRRC:$vD), (ins memrr:$src),
+ "lvsr $vD, $src", LdStGeneral,
+ [(set VRRC:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
+ PPC970_Unit_LSU;
+
+let PPC970_Unit = 2 in { // Stores.
+def STVEBX: XForm_8<31, 135, (outs), (ins VRRC:$rS, memrr:$dst),
+ "stvebx $rS, $dst", LdStGeneral,
+ [(int_ppc_altivec_stvebx VRRC:$rS, xoaddr:$dst)]>;
+def STVEHX: XForm_8<31, 167, (outs), (ins VRRC:$rS, memrr:$dst),
+ "stvehx $rS, $dst", LdStGeneral,
+ [(int_ppc_altivec_stvehx VRRC:$rS, xoaddr:$dst)]>;
+def STVEWX: XForm_8<31, 199, (outs), (ins VRRC:$rS, memrr:$dst),
+ "stvewx $rS, $dst", LdStGeneral,
+ [(int_ppc_altivec_stvewx VRRC:$rS, xoaddr:$dst)]>;
+def STVX : XForm_8<31, 231, (outs), (ins VRRC:$rS, memrr:$dst),
+ "stvx $rS, $dst", LdStGeneral,
+ [(int_ppc_altivec_stvx VRRC:$rS, xoaddr:$dst)]>;
+def STVXL : XForm_8<31, 487, (outs), (ins VRRC:$rS, memrr:$dst),
+ "stvxl $rS, $dst", LdStGeneral,
+ [(int_ppc_altivec_stvxl VRRC:$rS, xoaddr:$dst)]>;
+}
+
+let PPC970_Unit = 5 in { // VALU Operations.
+// VA-Form instructions. 3-input AltiVec ops.
+def VMADDFP : VAForm_1<46, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB),
+ "vmaddfp $vD, $vA, $vC, $vB", VecFP,
+ [(set VRRC:$vD, (fadd (fmul VRRC:$vA, VRRC:$vC),
+ VRRC:$vB))]>,
+ Requires<[FPContractions]>;
+def VNMSUBFP: VAForm_1<47, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB),
+ "vnmsubfp $vD, $vA, $vC, $vB", VecFP,
+ [(set VRRC:$vD, (fsub V_immneg0,
+ (fsub (fmul VRRC:$vA, VRRC:$vC),
+ VRRC:$vB)))]>,
+ Requires<[FPContractions]>;
+
+def VMHADDSHS : VA1a_Int<32, "vmhaddshs", int_ppc_altivec_vmhaddshs>;
+def VMHRADDSHS : VA1a_Int<33, "vmhraddshs", int_ppc_altivec_vmhraddshs>;
+def VMLADDUHM : VA1a_Int<34, "vmladduhm", int_ppc_altivec_vmladduhm>;
+def VPERM : VA1a_Int<43, "vperm", int_ppc_altivec_vperm>;
+def VSEL : VA1a_Int<42, "vsel", int_ppc_altivec_vsel>;
+
+// Shuffles.
+def VSLDOI : VAForm_2<44, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB, u5imm:$SH),
+ "vsldoi $vD, $vA, $vB, $SH", VecFP,
+ [(set VRRC:$vD,
+ (vsldoi_shuffle:$SH (v16i8 VRRC:$vA), VRRC:$vB))]>;
+
+// VX-Form instructions. AltiVec arithmetic ops.
+def VADDFP : VXForm_1<10, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vaddfp $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (fadd VRRC:$vA, VRRC:$vB))]>;
+
+def VADDUBM : VXForm_1<0, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vaddubm $vD, $vA, $vB", VecGeneral,
+ [(set VRRC:$vD, (add (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VADDUHM : VXForm_1<64, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vadduhm $vD, $vA, $vB", VecGeneral,
+ [(set VRRC:$vD, (add (v8i16 VRRC:$vA), VRRC:$vB))]>;
+def VADDUWM : VXForm_1<128, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vadduwm $vD, $vA, $vB", VecGeneral,
+ [(set VRRC:$vD, (add (v4i32 VRRC:$vA), VRRC:$vB))]>;
+
+def VADDCUW : VX1_Int<384, "vaddcuw", int_ppc_altivec_vaddcuw>;
+def VADDSBS : VX1_Int<768, "vaddsbs", int_ppc_altivec_vaddsbs>;
+def VADDSHS : VX1_Int<832, "vaddshs", int_ppc_altivec_vaddshs>;
+def VADDSWS : VX1_Int<896, "vaddsws", int_ppc_altivec_vaddsws>;
+def VADDUBS : VX1_Int<512, "vaddubs", int_ppc_altivec_vaddubs>;
+def VADDUHS : VX1_Int<576, "vadduhs", int_ppc_altivec_vadduhs>;
+def VADDUWS : VX1_Int<640, "vadduws", int_ppc_altivec_vadduws>;
+
+
+def VAND : VXForm_1<1028, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vand $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (and (v4i32 VRRC:$vA), VRRC:$vB))]>;
+def VANDC : VXForm_1<1092, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vandc $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (and (v4i32 VRRC:$vA), (vnot VRRC:$vB)))]>;
+
+def VCFSX : VXForm_1<842, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+ "vcfsx $vD, $vB, $UIMM", VecFP,
+ [(set VRRC:$vD,
+ (int_ppc_altivec_vcfsx VRRC:$vB, imm:$UIMM))]>;
+def VCFUX : VXForm_1<778, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+ "vcfux $vD, $vB, $UIMM", VecFP,
+ [(set VRRC:$vD,
+ (int_ppc_altivec_vcfux VRRC:$vB, imm:$UIMM))]>;
+def VCTSXS : VXForm_1<970, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+ "vctsxs $vD, $vB, $UIMM", VecFP,
+ [(set VRRC:$vD,
+ (int_ppc_altivec_vctsxs VRRC:$vB, imm:$UIMM))]>;
+def VCTUXS : VXForm_1<906, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+ "vctuxs $vD, $vB, $UIMM", VecFP,
+ [(set VRRC:$vD,
+ (int_ppc_altivec_vctuxs VRRC:$vB, imm:$UIMM))]>;
+def VEXPTEFP : VX2_Int<394, "vexptefp", int_ppc_altivec_vexptefp>;
+def VLOGEFP : VX2_Int<458, "vlogefp", int_ppc_altivec_vlogefp>;
+
+def VAVGSB : VX1_Int<1282, "vavgsb", int_ppc_altivec_vavgsb>;
+def VAVGSH : VX1_Int<1346, "vavgsh", int_ppc_altivec_vavgsh>;
+def VAVGSW : VX1_Int<1410, "vavgsw", int_ppc_altivec_vavgsw>;
+def VAVGUB : VX1_Int<1026, "vavgub", int_ppc_altivec_vavgub>;
+def VAVGUH : VX1_Int<1090, "vavguh", int_ppc_altivec_vavguh>;
+def VAVGUW : VX1_Int<1154, "vavguw", int_ppc_altivec_vavguw>;
+
+def VMAXFP : VX1_Int<1034, "vmaxfp", int_ppc_altivec_vmaxfp>;
+def VMAXSB : VX1_Int< 258, "vmaxsb", int_ppc_altivec_vmaxsb>;
+def VMAXSH : VX1_Int< 322, "vmaxsh", int_ppc_altivec_vmaxsh>;
+def VMAXSW : VX1_Int< 386, "vmaxsw", int_ppc_altivec_vmaxsw>;
+def VMAXUB : VX1_Int< 2, "vmaxub", int_ppc_altivec_vmaxub>;
+def VMAXUH : VX1_Int< 66, "vmaxuh", int_ppc_altivec_vmaxuh>;
+def VMAXUW : VX1_Int< 130, "vmaxuw", int_ppc_altivec_vmaxuw>;
+def VMINFP : VX1_Int<1098, "vminfp", int_ppc_altivec_vminfp>;
+def VMINSB : VX1_Int< 770, "vminsb", int_ppc_altivec_vminsb>;
+def VMINSH : VX1_Int< 834, "vminsh", int_ppc_altivec_vminsh>;
+def VMINSW : VX1_Int< 898, "vminsw", int_ppc_altivec_vminsw>;
+def VMINUB : VX1_Int< 514, "vminub", int_ppc_altivec_vminub>;
+def VMINUH : VX1_Int< 578, "vminuh", int_ppc_altivec_vminuh>;
+def VMINUW : VX1_Int< 642, "vminuw", int_ppc_altivec_vminuw>;
+
+def VMRGHB : VXForm_1< 12, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vmrghb $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (vmrghb_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGHH : VXForm_1< 76, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vmrghh $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (vmrghh_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGHW : VXForm_1<140, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vmrghw $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (vmrghw_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGLB : VXForm_1<268, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vmrglb $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (vmrglb_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGLH : VXForm_1<332, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vmrglh $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (vmrglh_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGLW : VXForm_1<396, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vmrglw $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (vmrglw_shuffle VRRC:$vA, VRRC:$vB))]>;
+
+def VMSUMMBM : VA1a_Int<37, "vmsummbm", int_ppc_altivec_vmsummbm>;
+def VMSUMSHM : VA1a_Int<40, "vmsumshm", int_ppc_altivec_vmsumshm>;
+def VMSUMSHS : VA1a_Int<41, "vmsumshs", int_ppc_altivec_vmsumshs>;
+def VMSUMUBM : VA1a_Int<36, "vmsumubm", int_ppc_altivec_vmsumubm>;
+def VMSUMUHM : VA1a_Int<38, "vmsumuhm", int_ppc_altivec_vmsumuhm>;
+def VMSUMUHS : VA1a_Int<39, "vmsumuhs", int_ppc_altivec_vmsumuhs>;
+
+def VMULESB : VX1_Int<776, "vmulesb", int_ppc_altivec_vmulesb>;
+def VMULESH : VX1_Int<840, "vmulesh", int_ppc_altivec_vmulesh>;
+def VMULEUB : VX1_Int<520, "vmuleub", int_ppc_altivec_vmuleub>;
+def VMULEUH : VX1_Int<584, "vmuleuh", int_ppc_altivec_vmuleuh>;
+def VMULOSB : VX1_Int<264, "vmulosb", int_ppc_altivec_vmulosb>;
+def VMULOSH : VX1_Int<328, "vmulosh", int_ppc_altivec_vmulosh>;
+def VMULOUB : VX1_Int< 8, "vmuloub", int_ppc_altivec_vmuloub>;
+def VMULOUH : VX1_Int< 72, "vmulouh", int_ppc_altivec_vmulouh>;
+
+def VREFP : VX2_Int<266, "vrefp", int_ppc_altivec_vrefp>;
+def VRFIM : VX2_Int<714, "vrfim", int_ppc_altivec_vrfim>;
+def VRFIN : VX2_Int<522, "vrfin", int_ppc_altivec_vrfin>;
+def VRFIP : VX2_Int<650, "vrfip", int_ppc_altivec_vrfip>;
+def VRFIZ : VX2_Int<586, "vrfiz", int_ppc_altivec_vrfiz>;
+def VRSQRTEFP : VX2_Int<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
+
+def VSUBCUW : VX1_Int<74, "vsubcuw", int_ppc_altivec_vsubcuw>;
+
+def VSUBFP : VXForm_1<74, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vsubfp $vD, $vA, $vB", VecGeneral,
+ [(set VRRC:$vD, (fsub VRRC:$vA, VRRC:$vB))]>;
+def VSUBUBM : VXForm_1<1024, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vsububm $vD, $vA, $vB", VecGeneral,
+ [(set VRRC:$vD, (sub (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VSUBUHM : VXForm_1<1088, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vsubuhm $vD, $vA, $vB", VecGeneral,
+ [(set VRRC:$vD, (sub (v8i16 VRRC:$vA), VRRC:$vB))]>;
+def VSUBUWM : VXForm_1<1152, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vsubuwm $vD, $vA, $vB", VecGeneral,
+ [(set VRRC:$vD, (sub (v4i32 VRRC:$vA), VRRC:$vB))]>;
+
+def VSUBSBS : VX1_Int<1792, "vsubsbs" , int_ppc_altivec_vsubsbs>;
+def VSUBSHS : VX1_Int<1856, "vsubshs" , int_ppc_altivec_vsubshs>;
+def VSUBSWS : VX1_Int<1920, "vsubsws" , int_ppc_altivec_vsubsws>;
+def VSUBUBS : VX1_Int<1536, "vsububs" , int_ppc_altivec_vsububs>;
+def VSUBUHS : VX1_Int<1600, "vsubuhs" , int_ppc_altivec_vsubuhs>;
+def VSUBUWS : VX1_Int<1664, "vsubuws" , int_ppc_altivec_vsubuws>;
+def VSUMSWS : VX1_Int<1928, "vsumsws" , int_ppc_altivec_vsumsws>;
+def VSUM2SWS: VX1_Int<1672, "vsum2sws", int_ppc_altivec_vsum2sws>;
+def VSUM4SBS: VX1_Int<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs>;
+def VSUM4SHS: VX1_Int<1608, "vsum4shs", int_ppc_altivec_vsum4shs>;
+def VSUM4UBS: VX1_Int<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs>;
+
+def VNOR : VXForm_1<1284, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vnor $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (vnot (or (v4i32 VRRC:$vA), VRRC:$vB)))]>;
+def VOR : VXForm_1<1156, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vor $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (or (v4i32 VRRC:$vA), VRRC:$vB))]>;
+def VXOR : VXForm_1<1220, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vxor $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD, (xor (v4i32 VRRC:$vA), VRRC:$vB))]>;
+
+def VRLB : VX1_Int< 4, "vrlb", int_ppc_altivec_vrlb>;
+def VRLH : VX1_Int< 68, "vrlh", int_ppc_altivec_vrlh>;
+def VRLW : VX1_Int< 132, "vrlw", int_ppc_altivec_vrlw>;
+
+def VSL : VX1_Int< 452, "vsl" , int_ppc_altivec_vsl >;
+def VSLO : VX1_Int<1036, "vslo", int_ppc_altivec_vslo>;
+def VSLB : VX1_Int< 260, "vslb", int_ppc_altivec_vslb>;
+def VSLH : VX1_Int< 324, "vslh", int_ppc_altivec_vslh>;
+def VSLW : VX1_Int< 388, "vslw", int_ppc_altivec_vslw>;
+
+def VSPLTB : VXForm_1<524, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+ "vspltb $vD, $vB, $UIMM", VecPerm,
+ [(set VRRC:$vD,
+ (vspltb_shuffle:$UIMM (v16i8 VRRC:$vB), (undef)))]>;
+def VSPLTH : VXForm_1<588, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+ "vsplth $vD, $vB, $UIMM", VecPerm,
+ [(set VRRC:$vD,
+ (vsplth_shuffle:$UIMM (v16i8 VRRC:$vB), (undef)))]>;
+def VSPLTW : VXForm_1<652, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+ "vspltw $vD, $vB, $UIMM", VecPerm,
+ [(set VRRC:$vD,
+ (vspltw_shuffle:$UIMM (v16i8 VRRC:$vB), (undef)))]>;
+
+def VSR : VX1_Int< 708, "vsr" , int_ppc_altivec_vsr>;
+def VSRO : VX1_Int<1100, "vsro" , int_ppc_altivec_vsro>;
+def VSRAB : VX1_Int< 772, "vsrab", int_ppc_altivec_vsrab>;
+def VSRAH : VX1_Int< 836, "vsrah", int_ppc_altivec_vsrah>;
+def VSRAW : VX1_Int< 900, "vsraw", int_ppc_altivec_vsraw>;
+def VSRB : VX1_Int< 516, "vsrb" , int_ppc_altivec_vsrb>;
+def VSRH : VX1_Int< 580, "vsrh" , int_ppc_altivec_vsrh>;
+def VSRW : VX1_Int< 644, "vsrw" , int_ppc_altivec_vsrw>;
+
+
+def VSPLTISB : VXForm_3<780, (outs VRRC:$vD), (ins s5imm:$SIMM),
+ "vspltisb $vD, $SIMM", VecPerm,
+ [(set VRRC:$vD, (v16i8 vecspltisb:$SIMM))]>;
+def VSPLTISH : VXForm_3<844, (outs VRRC:$vD), (ins s5imm:$SIMM),
+ "vspltish $vD, $SIMM", VecPerm,
+ [(set VRRC:$vD, (v8i16 vecspltish:$SIMM))]>;
+def VSPLTISW : VXForm_3<908, (outs VRRC:$vD), (ins s5imm:$SIMM),
+ "vspltisw $vD, $SIMM", VecPerm,
+ [(set VRRC:$vD, (v4i32 vecspltisw:$SIMM))]>;
+
+// Vector Pack.
+def VPKPX : VX1_Int<782, "vpkpx", int_ppc_altivec_vpkpx>;
+def VPKSHSS : VX1_Int<398, "vpkshss", int_ppc_altivec_vpkshss>;
+def VPKSHUS : VX1_Int<270, "vpkshus", int_ppc_altivec_vpkshus>;
+def VPKSWSS : VX1_Int<462, "vpkswss", int_ppc_altivec_vpkswss>;
+def VPKSWUS : VX1_Int<334, "vpkswus", int_ppc_altivec_vpkswus>;
+def VPKUHUM : VXForm_1<14, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vpkuhum $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD,
+ (vpkuhum_shuffle (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VPKUHUS : VX1_Int<142, "vpkuhus", int_ppc_altivec_vpkuhus>;
+def VPKUWUM : VXForm_1<78, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+ "vpkuwum $vD, $vA, $vB", VecFP,
+ [(set VRRC:$vD,
+ (vpkuwum_shuffle (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VPKUWUS : VX1_Int<206, "vpkuwus", int_ppc_altivec_vpkuwus>;
+
+// Vector Unpack.
+def VUPKHPX : VX2_Int<846, "vupkhpx", int_ppc_altivec_vupkhpx>;
+def VUPKHSB : VX2_Int<526, "vupkhsb", int_ppc_altivec_vupkhsb>;
+def VUPKHSH : VX2_Int<590, "vupkhsh", int_ppc_altivec_vupkhsh>;
+def VUPKLPX : VX2_Int<974, "vupklpx", int_ppc_altivec_vupklpx>;
+def VUPKLSB : VX2_Int<654, "vupklsb", int_ppc_altivec_vupklsb>;
+def VUPKLSH : VX2_Int<718, "vupklsh", int_ppc_altivec_vupklsh>;
+
+
+// Altivec Comparisons.
+
+class VCMP<bits<10> xo, string asmstr, ValueType Ty>
+ : VXRForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),asmstr,VecFPCompare,
+ [(set VRRC:$vD, (Ty (PPCvcmp VRRC:$vA, VRRC:$vB, xo)))]>;
+class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+ : VXRForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),asmstr,VecFPCompare,
+ [(set VRRC:$vD, (Ty (PPCvcmp_o VRRC:$vA, VRRC:$vB, xo)))]> {
+ let Defs = [CR6];
+ let RC = 1;
+}
+
+// f32 element comparisons.
+def VCMPBFP : VCMP <966, "vcmpbfp $vD, $vA, $vB" , v4f32>;
+def VCMPBFPo : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPEQFP : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
+def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
+def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
+def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+
+// i8 element comparisons.
+def VCMPEQUB : VCMP < 6, "vcmpequb $vD, $vA, $vB" , v16i8>;
+def VCMPEQUBo : VCMPo< 6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
+def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
+def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPEQUH : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
+def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
+def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
+def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPEQUW : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
+def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
+def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
+def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+
+def V_SET0 : VXForm_setzero<1220, (outs VRRC:$vD), (ins),
+ "vxor $vD, $vD, $vD", VecFP,
+ [(set VRRC:$vD, (v4i32 immAllZerosV))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Additional Altivec Patterns
+//
+
+// DS* intrinsics
+def : Pat<(int_ppc_altivec_dssall), (DSSALL 1, 0, 0, 0)>;
+def : Pat<(int_ppc_altivec_dss imm:$STRM), (DSS 0, imm:$STRM, 0, 0)>;
+
+// * 32-bit
+def : Pat<(int_ppc_altivec_dst GPRC:$rA, GPRC:$rB, imm:$STRM),
+ (DST 0, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstt GPRC:$rA, GPRC:$rB, imm:$STRM),
+ (DSTT 1, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstst GPRC:$rA, GPRC:$rB, imm:$STRM),
+ (DSTST 0, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dststt GPRC:$rA, GPRC:$rB, imm:$STRM),
+ (DSTSTT 1, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+
+// * 64-bit
+def : Pat<(int_ppc_altivec_dst G8RC:$rA, GPRC:$rB, imm:$STRM),
+ (DST64 0, imm:$STRM, (i64 G8RC:$rA), GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstt G8RC:$rA, GPRC:$rB, imm:$STRM),
+ (DSTT64 1, imm:$STRM, (i64 G8RC:$rA), GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstst G8RC:$rA, GPRC:$rB, imm:$STRM),
+ (DSTST64 0, imm:$STRM, (i64 G8RC:$rA), GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dststt G8RC:$rA, GPRC:$rB, imm:$STRM),
+ (DSTSTT64 1, imm:$STRM, (i64 G8RC:$rA), GPRC:$rB)>;
+
+// Loads.
+def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
+
+// Stores.
+def : Pat<(store (v4i32 VRRC:$rS), xoaddr:$dst),
+ (STVX (v4i32 VRRC:$rS), xoaddr:$dst)>;
+
+// Bit conversions.
+def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
+
+// Shuffles.
+
+// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
+def : Pat<(vsldoi_unary_shuffle:$in (v16i8 VRRC:$vA), undef),
+          (VSLDOI VRRC:$vA, VRRC:$vA, (VSLDOI_unary_get_imm VRRC:$in))>;
+def : Pat<(vpkuwum_unary_shuffle (v16i8 VRRC:$vA), undef),
+          (VPKUWUM VRRC:$vA, VRRC:$vA)>;
+def : Pat<(vpkuhum_unary_shuffle (v16i8 VRRC:$vA), undef),
+          (VPKUHUM VRRC:$vA, VRRC:$vA)>;
+
+// Match vmrg*(x,x)
+def : Pat<(vmrglb_unary_shuffle (v16i8 VRRC:$vA), undef),
+          (VMRGLB VRRC:$vA, VRRC:$vA)>;
+def : Pat<(vmrglh_unary_shuffle (v16i8 VRRC:$vA), undef),
+          (VMRGLH VRRC:$vA, VRRC:$vA)>;
+def : Pat<(vmrglw_unary_shuffle (v16i8 VRRC:$vA), undef),
+          (VMRGLW VRRC:$vA, VRRC:$vA)>;
+def : Pat<(vmrghb_unary_shuffle (v16i8 VRRC:$vA), undef),
+          (VMRGHB VRRC:$vA, VRRC:$vA)>;
+def : Pat<(vmrghh_unary_shuffle (v16i8 VRRC:$vA), undef),
+          (VMRGHH VRRC:$vA, VRRC:$vA)>;
+def : Pat<(vmrghw_unary_shuffle (v16i8 VRRC:$vA), undef),
+          (VMRGHW VRRC:$vA, VRRC:$vA)>;
+
+// Logical Operations
+def : Pat<(v4i32 (vnot VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>;
+def : Pat<(v4i32 (vnot_conv VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>;
+
+def : Pat<(v4i32 (vnot_conv (or VRRC:$A, VRRC:$B))),
+ (VNOR VRRC:$A, VRRC:$B)>;
+def : Pat<(v4i32 (and VRRC:$A, (vnot_conv VRRC:$B))),
+ (VANDC VRRC:$A, VRRC:$B)>;
+
+def : Pat<(fmul VRRC:$vA, VRRC:$vB),
+ (VMADDFP VRRC:$vA, VRRC:$vB, (v4i32 (V_SET0)))>;
+
+// Fused multiply-add and multiply-subtract for packed float. These are
+// represented separately from the real instructions above for operations that
+// must keep the additional precision, such as Newton-Raphson (used by the
+// divide and sqrt expansions).
+def : Pat<(PPCvmaddfp VRRC:$A, VRRC:$B, VRRC:$C),
+ (VMADDFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+def : Pat<(PPCvnmsubfp VRRC:$A, VRRC:$B, VRRC:$C),
+ (VNMSUBFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+
+def : Pat<(int_ppc_altivec_vmaddfp VRRC:$A, VRRC:$B, VRRC:$C),
+ (VMADDFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+def : Pat<(int_ppc_altivec_vnmsubfp VRRC:$A, VRRC:$B, VRRC:$C),
+ (VNMSUBFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+
+def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC),
+ (VPERM VRRC:$vA, VRRC:$vB, VRRC:$vC)>;
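+
+// Note: vperm selects each result byte from the 32-byte concatenation vA:vB
+// using the low five bits of the corresponding byte of the control vector vC.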
diff --git a/lib/Target/PowerPC/PPCInstrBuilder.h b/lib/Target/PowerPC/PPCInstrBuilder.h
new file mode 100644
index 0000000..1de6911
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -0,0 +1,43 @@
+//===-- PPCInstrBuilder.h - Aids for building PPC insts ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_INSTRBUILDER_H
+#define POWERPC_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - Add a reference to the base of an abstract object on
+/// the stack frame of the current function. The reference uses the abstract
+/// FrameIndex in place of a base register until it is resolved to a real base
+/// register and displacement. A constant offset may be specified as well.
+///
+inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+ bool mem = true) {
+ if (mem)
+ return MIB.addImm(Offset).addFrameIndex(FI);
+ else
+ return MIB.addFrameIndex(FI).addImm(Offset);
+}
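+
+// Illustrative use (a sketch; the opcode and register names are placeholders):
+//   addFrameReference(BuildMI(MBB, MI, DL, TII.get(PPC::LWZ), DestReg), FI);
+// builds "lwz DestReg, 0(<FI>)"; the frame-index operand is rewritten to a
+// real base register and displacement when the frame layout is finalized.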
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
new file mode 100644
index 0000000..54cebcd
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -0,0 +1,875 @@
+//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// PowerPC instruction formats
+
+class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+ : Instruction {
+ field bits<32> Inst;
+
+ bit PPC64 = 0; // Default value, override with isPPC64
+
+ let Namespace = "PPC";
+ let Inst{0-5} = opcode;
+ let OutOperandList = OOL;
+ let InOperandList = IOL;
+ let AsmString = asmstr;
+ let Itinerary = itin;
+
+ /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to
+ /// these must be reflected there! See comments there for what these are.
+ bits<1> PPC970_First = 0;
+ bits<1> PPC970_Single = 0;
+ bits<1> PPC970_Cracked = 0;
+ bits<3> PPC970_Unit = 0;
+}
+
+class PPC970_DGroup_First { bits<1> PPC970_First = 1; }
+class PPC970_DGroup_Single { bits<1> PPC970_Single = 1; }
+class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; }
+class PPC970_MicroCode;
+
+class PPC970_Unit_Pseudo { bits<3> PPC970_Unit = 0; }
+class PPC970_Unit_FXU { bits<3> PPC970_Unit = 1; }
+class PPC970_Unit_LSU { bits<3> PPC970_Unit = 2; }
+class PPC970_Unit_FPU { bits<3> PPC970_Unit = 3; }
+class PPC970_Unit_CRU { bits<3> PPC970_Unit = 4; }
+class PPC970_Unit_VALU { bits<3> PPC970_Unit = 5; }
+class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; }
+class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; }
+
+
+// 1.7.1 I-Form
+class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern;
+ bits<24> LI;
+
+ let Inst{6-29} = LI;
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+}
+
+// 1.7.2 B-Form
+class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
+ : I<opcode, OOL, IOL, asmstr, BrB> {
+ bits<7> BIBO; // 2 bits of BI and 5 bits of BO.
+ bits<3> CR;
+ bits<14> BD;
+
+ bits<5> BI;
+ let BI{0-1} = BIBO{5-6};
+ let BI{2-4} = CR{0-2};
+
+ let Inst{6-10} = BIBO{4-0};
+ let Inst{11-15} = BI;
+ let Inst{16-29} = BD;
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+}
+
+
+// 1.7.4 D-Form
+class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<5> B;
+ bits<16> C;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = B;
+ let Inst{16-31} = C;
+}
+
+class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<16> C;
+ bits<5> B;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = B;
+ let Inst{16-31} = C;
+}
+
+class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern>;
+
+class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<16> B;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = 0;
+ let Inst{16-31} = B;
+}
+
+class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> B;
+ bits<5> A;
+ bits<16> C;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = A;
+ let Inst{11-15} = B;
+ let Inst{16-31} = C;
+}
+
+class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : DForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+ let A = 0;
+ let B = 0;
+ let C = 0;
+}
+
+class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<1> L;
+ bits<5> RA;
+ bits<16> I;
+
+ let Inst{6-8} = BF;
+ let Inst{9} = 0;
+ let Inst{10} = L;
+ let Inst{11-15} = RA;
+ let Inst{16-31} = I;
+}
+
+class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : DForm_5<opcode, OOL, IOL, asmstr, itin> {
+ let L = PPC64;
+}
+
+class DForm_6<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : DForm_5<opcode, OOL, IOL, asmstr, itin>;
+
+class DForm_6_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : DForm_6<opcode, OOL, IOL, asmstr, itin> {
+ let L = PPC64;
+}
+
+
+// 1.7.5 DS-Form
+class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RST;
+ bits<14> DS;
+ bits<5> RA;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RST;
+ let Inst{11-15} = RA;
+ let Inst{16-29} = DS;
+ let Inst{30-31} = xo;
+}
+
+// 1.7.6 X-Form
+class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RST;
+ bits<5> A;
+ bits<5> B;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RST;
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo;
+ let Inst{31} = RC;
+}
+
+// This is the same as XForm_base_r3xo, but the first two operands are swapped
+// when code is emitted.
+class XForm_base_r3xo_swapped
+ <bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<5> RST;
+ bits<5> B;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RST;
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo;
+ let Inst{31} = RC;
+}
+
+
+class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern;
+}
+
+class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern;
+}
+
+class XForm_11<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+ let B = 0;
+ let Pattern = pattern;
+}
+
+class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<1> L;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Inst{6-8} = BF;
+ let Inst{9} = 0;
+ let Inst{10} = L;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+ let L = PPC64;
+}
+
+class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<5> FRA;
+ bits<5> FRB;
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = FRA;
+ let Inst{16-20} = FRB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern;
+ let Inst{6-10} = 31;
+ let Inst{11-15} = 0;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ let Pattern = pattern;
+ let Inst{6-10} = 0;
+ let Inst{11-15} = 0;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let A = 0;
+}
+
+class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// This is used for MFFS, MTFSB0, MTFSB1. 42 is arbitrary; this series of
+// numbers presumably relates to some document, but I haven't found it.
+class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RST;
+ let Inst{11-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = RC;
+}
+class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let Pattern = pattern;
+ bits<5> FM;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = FM;
+ let Inst{11-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = RC;
+}
+
+// DCB_Form - Form X instruction, used for dcb* instructions.
+class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<31, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<5> B;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = immfield;
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+
+// DSS_Form - Form X instruction, used for altivec dss* instructions.
+class DSS_Form<bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<31, OOL, IOL, asmstr, itin> {
+ bits<1> T;
+ bits<2> STRM;
+ bits<5> A;
+ bits<5> B;
+
+ let Pattern = pattern;
+
+ let Inst{6} = T;
+ let Inst{7-8} = 0;
+ let Inst{9-10} = STRM;
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+// 1.7.7 XL-Form
+class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> CRD;
+ bits<5> CRA;
+ bits<5> CRB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = CRD;
+ let Inst{11-15} = CRA;
+ let Inst{16-20} = CRB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> CRD;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = CRD;
+ let Inst{11-15} = CRD;
+ let Inst{16-20} = CRD;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> BO;
+ bits<5> BI;
+ bits<2> BH;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = BO;
+ let Inst{11-15} = BI;
+ let Inst{16-18} = 0;
+ let Inst{19-20} = BH;
+ let Inst{21-30} = xo;
+ let Inst{31} = lk;
+}
+
+class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+ bits<7> BIBO; // 2 bits of BI and 5 bits of BO.
+ bits<3> CR;
+
+ let BO = BIBO{2-6};
+ let BI{0-1} = BIBO{0-1};
+ let BI{2-4} = CR;
+ let BH = 0;
+}
+
+
+class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo, bits<5> bi, bit lk,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+ let BO = bo;
+ let BI = bi;
+ let BH = 0;
+}
+
+class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<3> BFA;
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0;
+ let Inst{11-13} = BFA;
+ let Inst{14-15} = 0;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+// 1.7.8 XFX-Form
+class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<10> SPR;
+
+ let Inst{6-10} = RT;
+ let Inst{11} = SPR{4};
+ let Inst{12} = SPR{3};
+ let Inst{13} = SPR{2};
+ let Inst{14} = SPR{1};
+ let Inst{15} = SPR{0};
+ let Inst{16} = SPR{9};
+ let Inst{17} = SPR{8};
+ let Inst{18} = SPR{7};
+ let Inst{19} = SPR{6};
+ let Inst{20} = SPR{5};
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
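+
+// Note on XFXForm_1: the two 5-bit halves of the SPR number are swapped in
+// the encoding (low half in Inst{11-15}, high half in Inst{16-20}), matching
+// the split SPR field of the mfspr/mtspr instructions in the PowerPC ISA.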
+
+class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+ : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin> {
+ let SPR = spr;
+}
+
+class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+
+ let Inst{6-10} = RT;
+ let Inst{11-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<8> FXM;
+ bits<5> ST;
+
+ let Inst{6-10} = ST;
+ let Inst{11} = 0;
+ let Inst{12-19} = FXM;
+ let Inst{20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> ST;
+ bits<8> FXM;
+
+ let Inst{6-10} = ST;
+ let Inst{11} = 1;
+ let Inst{12-19} = FXM;
+ let Inst{20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XFXForm_7<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin>;
+
+class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+ : XFXForm_7<opcode, xo, OOL, IOL, asmstr, itin> {
+ let SPR = spr;
+}
+
+// XFL-Form - MTFSF
+// This is probably 1.7.9, but I don't have the reference that uses this
+// numbering scheme...
+class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ string cstr, InstrItinClass itin, list<dag>pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<8> FM;
+ bits<5> RT;
+
+ bit RC = 0; // set by isDOT
+ let Pattern = pattern;
+ let Constraints = cstr;
+
+ let Inst{6} = 0;
+ let Inst{7-14} = FM;
+ let Inst{15} = 0;
+ let Inst{16-20} = RT;
+ let Inst{21-30} = xo;
+ let Inst{31} = RC;
+}
+
+// 1.7.10 XS-Form - SRADI.
+class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> A;
+ bits<5> RS;
+ bits<6> SH;
+
+ bit RC = 0; // set by isDOT
+ let Pattern = pattern;
+
+ let Inst{6-10} = RS;
+ let Inst{11-15} = A;
+ let Inst{16-20} = SH{4,3,2,1,0};
+ let Inst{21-29} = xo;
+ let Inst{30} = SH{5};
+ let Inst{31} = RC;
+}
+
+// 1.7.11 XO-Form
+class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21} = oe;
+ let Inst{22-30} = xo;
+ let Inst{31} = RC;
+}
+
+class XOForm_3<bits<6> opcode, bits<9> xo, bit oe,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XOForm_1<opcode, xo, oe, OOL, IOL, asmstr, itin, pattern> {
+ let RB = 0;
+}
+
+// 1.7.12 A-Form
+class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> FRT;
+ bits<5> FRA;
+ bits<5> FRC;
+ bits<5> FRB;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = FRT;
+ let Inst{11-15} = FRA;
+ let Inst{16-20} = FRB;
+ let Inst{21-25} = FRC;
+ let Inst{26-30} = xo;
+ let Inst{31} = RC;
+}
+
+class AForm_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let FRC = 0;
+}
+
+class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let FRB = 0;
+}
+
+// 1.7.13 M-Form
+class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RA;
+ bits<5> RS;
+ bits<5> RB;
+ bits<5> MB;
+ bits<5> ME;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RS;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-25} = MB;
+ let Inst{26-30} = ME;
+ let Inst{31} = RC;
+}
+
+class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : MForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// 1.7.14 MD-Form
+class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RA;
+ bits<5> RS;
+ bits<6> SH;
+ bits<6> MBE;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = RS;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = SH{4,3,2,1,0};
+ let Inst{21-26} = MBE{4,3,2,1,0,5};
+ let Inst{27-29} = xo;
+ let Inst{30} = SH{5};
+ let Inst{31} = RC;
+}
+
+
+
+// E-1 VA-Form
+
+// VAForm_1 - DACB ordering.
+class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> VA;
+ bits<5> VC;
+ bits<5> VB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21-25} = VC;
+ let Inst{26-31} = xo;
+}
+
+// VAForm_1a - DABC ordering.
+class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> VA;
+ bits<5> VB;
+ bits<5> VC;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21-25} = VC;
+ let Inst{26-31} = xo;
+}
+
+class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> VA;
+ bits<5> VB;
+ bits<4> SH;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21} = 0;
+ let Inst{22-25} = SH;
+ let Inst{26-31} = xo;
+}
+
+// E-2 VX-Form
+class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> VA;
+ bits<5> VB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
+class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : VXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+ let VA = VD;
+ let VB = VD;
+}
+
+
+class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> VB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = 0;
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
+class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> IMM;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = IMM;
+ let Inst{16-20} = 0;
+ let Inst{21-31} = xo;
+}
+
+/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
+class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = 0;
+ let Inst{16-20} = 0;
+ let Inst{21-31} = xo;
+}
+
+/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
+class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = 0;
+ let Inst{11-15} = 0;
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
+// E-4 VXR-Form
+class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> VA;
+ bits<5> VB;
+ bit RC = 0;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21} = RC;
+ let Inst{22-31} = xo;
+}
+
+//===----------------------------------------------------------------------===//
+class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+ : I<0, OOL, IOL, asmstr, NoItinerary> {
+ let PPC64 = 0;
+ let Pattern = pattern;
+ let Inst{31-0} = 0;
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
new file mode 100644
index 0000000..778f034
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -0,0 +1,818 @@
+//===- PPCInstrInfo.cpp - PowerPC32 Instruction Information -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPredicates.h"
+#include "PPCGenInstrInfo.inc"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+using namespace llvm;
+
+extern cl::opt<bool> EnablePPC32RS; // FIXME (64-bit): See PPCRegisterInfo.cpp.
+extern cl::opt<bool> EnablePPC64RS; // FIXME (64-bit): See PPCRegisterInfo.cpp.
+
+PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
+ : TargetInstrInfoImpl(PPCInsts, array_lengthof(PPCInsts)), TM(tm),
+ RI(*TM.getSubtargetImpl(), *this) {}
+
+bool PPCInstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned& sourceReg,
+ unsigned& destReg,
+ unsigned& sourceSubIdx,
+ unsigned& destSubIdx) const {
+ sourceSubIdx = destSubIdx = 0; // No sub-registers.
+
+ unsigned oc = MI.getOpcode();
+ if (oc == PPC::OR || oc == PPC::OR8 || oc == PPC::VOR ||
+ oc == PPC::OR4To8 || oc == PPC::OR8To4) { // or r1, r2, r2
+ assert(MI.getNumOperands() >= 3 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isReg() &&
+ "invalid PPC OR instruction!");
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ } else if (oc == PPC::ADDI) { // addi r1, r2, 0
+ assert(MI.getNumOperands() >= 3 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(2).isImm() &&
+ "invalid PPC ADDI instruction!");
+ if (MI.getOperand(1).isReg() && MI.getOperand(2).getImm() == 0) {
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ } else if (oc == PPC::ORI) { // ori r1, r2, 0
+ assert(MI.getNumOperands() >= 3 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isImm() &&
+ "invalid PPC ORI instruction!");
+ if (MI.getOperand(2).getImm() == 0) {
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ } else if (oc == PPC::FMRS || oc == PPC::FMRD ||
+ oc == PPC::FMRSD) { // fmr r1, r2
+ assert(MI.getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "invalid PPC FMR instruction");
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ } else if (oc == PPC::MCRF) { // mcrf cr1, cr2
+ assert(MI.getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "invalid PPC MCRF instruction");
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ return false;
+}
+
+unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case PPC::LD:
+ case PPC::LWZ:
+ case PPC::LFS:
+ case PPC::LFD:
+ if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
+ MI->getOperand(2).isFI()) {
+ FrameIndex = MI->getOperand(2).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case PPC::STD:
+ case PPC::STW:
+ case PPC::STFS:
+ case PPC::STFD:
+ if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
+ MI->getOperand(2).isFI()) {
+ FrameIndex = MI->getOperand(2).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+// commuteInstruction - We can commute rlwimi instructions, but only if the
+// rotate amt is zero. We also have to munge the immediates a bit.
+MachineInstr *
+PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+
+ // Normal instructions can be commuted the obvious way.
+ if (MI->getOpcode() != PPC::RLWIMI)
+ return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
+
+ // Cannot commute if it has a non-zero rotate count.
+ if (MI->getOperand(3).getImm() != 0)
+ return 0;
+
+ // If we have a zero rotate count, we have:
+ // M = mask(MB,ME)
+ // Op0 = (Op1 & ~M) | (Op2 & M)
+ // Change this to:
+ // M = mask((ME+1)&31, (MB-1)&31)
+ // Op0 = (Op2 & ~M) | (Op1 & M)
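+  //
+  // Worked example: if MB=16 and ME=23, M covers bits 16..23. The commuted
+  // form uses mask((23+1)&31, (16-1)&31) = mask(24,15), which wraps around
+  // and is exactly ~M, so swapping the two source operands preserves Op0.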
+
+ // Swap op1/op2
+ unsigned Reg0 = MI->getOperand(0).getReg();
+ unsigned Reg1 = MI->getOperand(1).getReg();
+ unsigned Reg2 = MI->getOperand(2).getReg();
+ bool Reg1IsKill = MI->getOperand(1).isKill();
+ bool Reg2IsKill = MI->getOperand(2).isKill();
+ bool ChangeReg0 = false;
+  // If the destination is tied to the first source (the two-address form),
+  // the destination register must be updated as well.
+ if (Reg0 == Reg1) {
+ // Must be two address instruction!
+ assert(MI->getDesc().getOperandConstraint(0, TOI::TIED_TO) &&
+ "Expecting a two-address instruction!");
+ Reg2IsKill = false;
+ ChangeReg0 = true;
+ }
+
+ // Masks.
+ unsigned MB = MI->getOperand(4).getImm();
+ unsigned ME = MI->getOperand(5).getImm();
+
+ if (NewMI) {
+ // Create a new instruction.
+ unsigned Reg0 = ChangeReg0 ? Reg2 : MI->getOperand(0).getReg();
+ bool Reg0IsDead = MI->getOperand(0).isDead();
+ return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
+ .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
+ .addReg(Reg2, getKillRegState(Reg2IsKill))
+ .addReg(Reg1, getKillRegState(Reg1IsKill))
+ .addImm((ME+1) & 31)
+ .addImm((MB-1) & 31);
+ }
+
+ if (ChangeReg0)
+ MI->getOperand(0).setReg(Reg2);
+ MI->getOperand(2).setReg(Reg1);
+ MI->getOperand(1).setReg(Reg2);
+ MI->getOperand(2).setIsKill(Reg1IsKill);
+ MI->getOperand(1).setIsKill(Reg2IsKill);
+
+ // Swap the mask around.
+ MI->getOperand(4).setImm((ME+1) & 31);
+ MI->getOperand(5).setImm((MB-1) & 31);
+ return MI;
+}
+
+void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ BuildMI(MBB, MI, DL, get(PPC::NOP));
+}
+
+
+// Branch analysis.
+bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (LastInst->getOpcode() == PPC::B) {
+ if (!LastInst->getOperand(0).isMBB())
+ return true;
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else if (LastInst->getOpcode() == PPC::BCC) {
+ if (!LastInst->getOperand(2).isMBB())
+ return true;
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(2).getMBB();
+ Cond.push_back(LastInst->getOperand(0));
+ Cond.push_back(LastInst->getOperand(1));
+ return false;
+ }
+    // Otherwise, we don't know what this is.
+ return true;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() &&
+ isUnpredicatedTerminator(--I))
+ return true;
+
+  // If the block ends with a PPC::BCC followed by a PPC::B, handle it.
+ if (SecondLastInst->getOpcode() == PPC::BCC &&
+ LastInst->getOpcode() == PPC::B) {
+ if (!SecondLastInst->getOperand(2).isMBB() ||
+ !LastInst->getOperand(0).isMBB())
+ return true;
+ TBB = SecondLastInst->getOperand(2).getMBB();
+ Cond.push_back(SecondLastInst->getOperand(0));
+ Cond.push_back(SecondLastInst->getOperand(1));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+  // If the block ends with two PPC::Bs, handle it.  The second one is never
+  // executed, so remove it if allowed.
+ if (SecondLastInst->getOpcode() == PPC::B &&
+ LastInst->getOpcode() == PPC::B) {
+ if (!SecondLastInst->getOperand(0).isMBB())
+ return true;
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC)
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (I->getOpcode() != PPC::BCC)
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned
+PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME: this should probably have a DebugLoc argument.
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "PPC branch conditions have two components!");
+
+ // One-way branch.
+ if (FBB == 0) {
+ if (Cond.empty()) // Unconditional branch
+ BuildMI(&MBB, dl, get(PPC::B)).addMBB(TBB);
+ else // Conditional branch
+ BuildMI(&MBB, dl, get(PPC::BCC))
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ return 1;
+ }
+
+ // Two-way Conditional Branch.
+ BuildMI(&MBB, dl, get(PPC::BCC))
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ BuildMI(&MBB, dl, get(PPC::B)).addMBB(FBB);
+ return 2;
+}
+
+bool PPCInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ if (DestRC != SrcRC) {
+ // Not yet supported!
+ return false;
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ if (DestRC == PPC::GPRCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(PPC::OR), DestReg).addReg(SrcReg).addReg(SrcReg);
+ } else if (DestRC == PPC::G8RCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(PPC::OR8), DestReg).addReg(SrcReg).addReg(SrcReg);
+ } else if (DestRC == PPC::F4RCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(PPC::FMRS), DestReg).addReg(SrcReg);
+ } else if (DestRC == PPC::F8RCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(PPC::FMRD), DestReg).addReg(SrcReg);
+ } else if (DestRC == PPC::CRRCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(PPC::MCRF), DestReg).addReg(SrcReg);
+ } else if (DestRC == PPC::VRRCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(PPC::VOR), DestReg).addReg(SrcReg).addReg(SrcReg);
+ } else if (DestRC == PPC::CRBITRCRegisterClass) {
+ BuildMI(MBB, MI, DL, get(PPC::CROR), DestReg).addReg(SrcReg).addReg(SrcReg);
+ } else {
+    // Copies within other register classes are not yet handled.
+ return false;
+ }
+
+ return true;
+}
+
+bool
+PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
+ unsigned SrcReg, bool isKill,
+ int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const{
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (RC == PPC::GPRCRegisterClass) {
+ if (SrcReg != PPC::LR) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
+ } else {
+      // FIXME: this spills LR immediately to memory, using R11 as a scratch
+      // register since we know R11 cannot be used in the prolog/epilog.  This
+      // is a hack.
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::MFLR), PPC::R11));
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
+ .addReg(PPC::R11,
+ getKillRegState(isKill)),
+ FrameIdx));
+ }
+ } else if (RC == PPC::G8RCRegisterClass) {
+ if (SrcReg != PPC::LR8) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
+ } else {
+      // FIXME: this spills LR immediately to memory, using R11 as a scratch
+      // register since we know R11 cannot be used in the prolog/epilog.  This
+      // is a hack.
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::MFLR8), PPC::X11));
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
+ .addReg(PPC::X11,
+ getKillRegState(isKill)),
+ FrameIdx));
+ }
+ } else if (RC == PPC::F8RCRegisterClass) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
+ } else if (RC == PPC::F4RCRegisterClass) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
+ } else if (RC == PPC::CRRCRegisterClass) {
+ if ((EnablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) ||
+ (EnablePPC64RS && TM.getSubtargetImpl()->isPPC64())) {
+ // FIXME (64-bit): Enable
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
+ return true;
+ } else {
+ // FIXME: We use R0 here, because it isn't available for RA. We need to
+ // store the CR in the low 4-bits of the saved value. First, issue a MFCR
+ // to save all of the CRBits.
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::MFCR), PPC::R0));
+
+ // If the saved register wasn't CR0, shift the bits left so that they are
+ // in CR0's slot.
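+      // (For example, spilling CR2: ShiftBits = 2*4 = 8, and rotating the
+      // MFCR result left by 8 moves CR2's four bits into bits 0-3, the slot
+      // CR0 normally occupies.)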
+ if (SrcReg != PPC::CR0) {
+ unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(SrcReg)*4;
+ // rlwinm r0, r0, ShiftBits, 0, 31.
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), PPC::R0)
+ .addReg(PPC::R0).addImm(ShiftBits).addImm(0).addImm(31));
+ }
+
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
+ .addReg(PPC::R0,
+ getKillRegState(isKill)),
+ FrameIdx));
+ }
+ } else if (RC == PPC::CRBITRCRegisterClass) {
+    // FIXME: We spill the containing CRi here because there is no mtcrf on a
+    // single bit.  Since the backend currently only uses CR1EQ as an
+    // individual bit, this should not cause any bugs.  If other CR bits come
+    // into use, the following code may be invalid.
+ unsigned Reg = 0;
+    if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT ||
+        SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN)
+      Reg = PPC::CR0;
+    else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT ||
+             SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN)
+      Reg = PPC::CR1;
+    else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT ||
+             SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN)
+      Reg = PPC::CR2;
+    else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT ||
+             SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN)
+      Reg = PPC::CR3;
+    else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT ||
+             SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN)
+      Reg = PPC::CR4;
+    else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT ||
+             SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN)
+      Reg = PPC::CR5;
+    else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT ||
+             SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN)
+      Reg = PPC::CR6;
+    else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT ||
+             SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN)
+      Reg = PPC::CR7;
+
+ return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx,
+ PPC::CRRCRegisterClass, NewMIs);
+
+ } else if (RC == PPC::VRRCRegisterClass) {
+    // Vector stores only have a reg+reg (indexed) form, so materialize the
+    // frame address into R0 first.  Emit:
+    // R0 = ADDI FI#
+    // STVX VAL, 0, R0
+ //
+ // FIXME: We use R0 here, because it isn't available for RA.
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::ADDI), PPC::R0),
+ FrameIdx, 0, 0));
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::STVX))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addReg(PPC::R0)
+ .addReg(PPC::R0));
+ } else {
+ assert(0 && "Unknown regclass!");
+ abort();
+ }
+
+ return false;
+}
+
+void
+PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ MachineFunction &MF = *MBB.getParent();
+ SmallVector<MachineInstr*, 4> NewMIs;
+
+ if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs)) {
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ FuncInfo->setSpillsCR();
+ }
+
+ for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+ MBB.insert(MI, NewMIs[i]);
+}
+
+void PPCInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const{
+ if (Addr[0].isFI()) {
+ if (StoreRegToStackSlot(MF, SrcReg, isKill,
+ Addr[0].getIndex(), RC, NewMIs)) {
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ FuncInfo->setSpillsCR();
+ }
+
+ return;
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ unsigned Opc = 0;
+ if (RC == PPC::GPRCRegisterClass) {
+ Opc = PPC::STW;
+ } else if (RC == PPC::G8RCRegisterClass) {
+ Opc = PPC::STD;
+ } else if (RC == PPC::F8RCRegisterClass) {
+ Opc = PPC::STFD;
+ } else if (RC == PPC::F4RCRegisterClass) {
+ Opc = PPC::STFS;
+ } else if (RC == PPC::VRRCRegisterClass) {
+ Opc = PPC::STVX;
+ } else {
+ assert(0 && "Unknown regclass!");
+ abort();
+ }
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc))
+ .addReg(SrcReg, getKillRegState(isKill));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ return;
+}
+
+void
+PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs)const{
+ if (RC == PPC::GPRCRegisterClass) {
+ if (DestReg != PPC::LR) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
+ DestReg), FrameIdx));
+ } else {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
+ PPC::R11), FrameIdx));
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR)).addReg(PPC::R11));
+ }
+ } else if (RC == PPC::G8RCRegisterClass) {
+ if (DestReg != PPC::LR8) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
+ FrameIdx));
+ } else {
+      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD),
+                                                 PPC::X11), FrameIdx));
+      NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::X11));
+ }
+ } else if (RC == PPC::F8RCRegisterClass) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
+ FrameIdx));
+ } else if (RC == PPC::F4RCRegisterClass) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
+ FrameIdx));
+ } else if (RC == PPC::CRRCRegisterClass) {
+ // FIXME: We use R0 here, because it isn't available for RA.
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ), PPC::R0),
+ FrameIdx));
+
+    // If the reloaded register isn't CR0, rotate the bits right so that they
+    // end up in the destination CR's slot.
+ if (DestReg != PPC::CR0) {
+ unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(DestReg)*4;
+      // rlwinm r0, r0, 32-ShiftBits, 0, 31.
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), PPC::R0)
+ .addReg(PPC::R0).addImm(32-ShiftBits).addImm(0).addImm(31));
+ }
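+    // (For example, reloading CR2: ShiftBits = 8, so the rotate amount is
+    // 32-8 = 24, i.e. a rotate right by 8, which moves bits 0-3 of the
+    // reloaded word back into CR2's slot before the MTCRF.)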
+
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTCRF), DestReg).addReg(PPC::R0));
+ } else if (RC == PPC::CRBITRCRegisterClass) {
+
+ unsigned Reg = 0;
+    if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT ||
+        DestReg == PPC::CR0EQ || DestReg == PPC::CR0UN)
+      Reg = PPC::CR0;
+    else if (DestReg == PPC::CR1LT || DestReg == PPC::CR1GT ||
+             DestReg == PPC::CR1EQ || DestReg == PPC::CR1UN)
+      Reg = PPC::CR1;
+    else if (DestReg == PPC::CR2LT || DestReg == PPC::CR2GT ||
+             DestReg == PPC::CR2EQ || DestReg == PPC::CR2UN)
+      Reg = PPC::CR2;
+    else if (DestReg == PPC::CR3LT || DestReg == PPC::CR3GT ||
+             DestReg == PPC::CR3EQ || DestReg == PPC::CR3UN)
+      Reg = PPC::CR3;
+    else if (DestReg == PPC::CR4LT || DestReg == PPC::CR4GT ||
+             DestReg == PPC::CR4EQ || DestReg == PPC::CR4UN)
+      Reg = PPC::CR4;
+    else if (DestReg == PPC::CR5LT || DestReg == PPC::CR5GT ||
+             DestReg == PPC::CR5EQ || DestReg == PPC::CR5UN)
+      Reg = PPC::CR5;
+    else if (DestReg == PPC::CR6LT || DestReg == PPC::CR6GT ||
+             DestReg == PPC::CR6EQ || DestReg == PPC::CR6UN)
+      Reg = PPC::CR6;
+    else if (DestReg == PPC::CR7LT || DestReg == PPC::CR7GT ||
+             DestReg == PPC::CR7EQ || DestReg == PPC::CR7UN)
+      Reg = PPC::CR7;
+
+ return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx,
+ PPC::CRRCRegisterClass, NewMIs);
+
+ } else if (RC == PPC::VRRCRegisterClass) {
+    // Vector loads only have a reg+reg (indexed) form, so materialize the
+    // frame address into R0 first.  Emit:
+    // R0 = ADDI FI#
+    // Dest = LVX 0, R0
+ //
+ // FIXME: We use R0 here, because it isn't available for RA.
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::ADDI), PPC::R0),
+ FrameIdx, 0, 0));
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::LVX),DestReg).addReg(PPC::R0)
+ .addReg(PPC::R0));
+ } else {
+ assert(0 && "Unknown regclass!");
+ abort();
+ }
+}
+
+void
+PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ MachineFunction &MF = *MBB.getParent();
+ SmallVector<MachineInstr*, 4> NewMIs;
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);
+ for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+ MBB.insert(MI, NewMIs[i]);
+}
+
+void PPCInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs)const{
+ if (Addr[0].isFI()) {
+ LoadRegFromStackSlot(MF, DebugLoc::getUnknownLoc(),
+ DestReg, Addr[0].getIndex(), RC, NewMIs);
+ return;
+ }
+
+ unsigned Opc = 0;
+ if (RC == PPC::GPRCRegisterClass) {
+ assert(DestReg != PPC::LR && "Can't handle this yet!");
+ Opc = PPC::LWZ;
+ } else if (RC == PPC::G8RCRegisterClass) {
+ assert(DestReg != PPC::LR8 && "Can't handle this yet!");
+ Opc = PPC::LD;
+ } else if (RC == PPC::F8RCRegisterClass) {
+ Opc = PPC::LFD;
+ } else if (RC == PPC::F4RCRegisterClass) {
+ Opc = PPC::LFS;
+ } else if (RC == PPC::VRRCRegisterClass) {
+ Opc = PPC::LVX;
+ } else {
+ assert(0 && "Unknown regclass!");
+ abort();
+ }
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ return;
+}
+
+/// foldMemoryOperand - PowerPC (like most RISCs) can only fold spills into
+/// copy instructions, turning them into load/store instructions.
+MachineInstr *PPCInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ if (Ops.size() != 1) return NULL;
+
+ // Make sure this is a reg-reg copy. Note that we can't handle MCRF, because
+ // it takes more than one instruction to store it.
+ unsigned Opc = MI->getOpcode();
+ unsigned OpNum = Ops[0];
+
+ MachineInstr *NewMI = NULL;
+ if ((Opc == PPC::OR &&
+ MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) {
+ if (OpNum == 0) { // move -> store
+ unsigned InReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STW))
+ .addReg(InReg, getKillRegState(isKill)),
+ FrameIndex);
+ } else { // move -> load
+ unsigned OutReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LWZ))
+ .addReg(OutReg,
+ RegState::Define |
+ getDeadRegState(isDead)),
+ FrameIndex);
+ }
+ } else if ((Opc == PPC::OR8 &&
+ MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) {
+ if (OpNum == 0) { // move -> store
+ unsigned InReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STD))
+ .addReg(InReg, getKillRegState(isKill)),
+ FrameIndex);
+ } else { // move -> load
+ unsigned OutReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LD))
+ .addReg(OutReg,
+ RegState::Define |
+ getDeadRegState(isDead)),
+ FrameIndex);
+ }
+ } else if (Opc == PPC::FMRD) {
+ if (OpNum == 0) { // move -> store
+ unsigned InReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STFD))
+ .addReg(InReg, getKillRegState(isKill)),
+ FrameIndex);
+ } else { // move -> load
+ unsigned OutReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LFD))
+ .addReg(OutReg,
+ RegState::Define |
+ getDeadRegState(isDead)),
+ FrameIndex);
+ }
+ } else if (Opc == PPC::FMRS) {
+ if (OpNum == 0) { // move -> store
+ unsigned InReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STFS))
+ .addReg(InReg, getKillRegState(isKill)),
+ FrameIndex);
+ } else { // move -> load
+ unsigned OutReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LFS))
+ .addReg(OutReg,
+ RegState::Define |
+ getDeadRegState(isDead)),
+ FrameIndex);
+ }
+ }
+
+ return NewMI;
+}
+
+bool PPCInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ if (Ops.size() != 1) return false;
+
+ // Make sure this is a reg-reg copy. Note that we can't handle MCRF, because
+ // it takes more than one instruction to store it.
+ unsigned Opc = MI->getOpcode();
+
+ if ((Opc == PPC::OR &&
+ MI->getOperand(1).getReg() == MI->getOperand(2).getReg()))
+ return true;
+ else if ((Opc == PPC::OR8 &&
+ MI->getOperand(1).getReg() == MI->getOperand(2).getReg()))
+ return true;
+ else if (Opc == PPC::FMRD || Opc == PPC::FMRS)
+ return true;
+
+ return false;
+}
+
+
+bool PPCInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case PPC::BLR: // Return.
+ case PPC::B: // Uncond branch.
+ case PPC::BCTR: // Indirect branch.
+ return true;
+ default: return false;
+ }
+}
+
+bool PPCInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
+ // Leave the CR# the same, but invert the condition.
+ Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
+ return false;
+}
+
+/// GetInstSizeInBytes - Return the maximum number of bytes of code that the
+/// specified instruction may occupy.
+///
+unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case PPC::INLINEASM: { // Inline Asm: Variable size.
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const char *AsmStr = MI->getOperand(0).getSymbolName();
+ return MF->getTarget().getTargetAsmInfo()->getInlineAsmLength(AsmStr);
+ }
+ case PPC::DBG_LABEL:
+ case PPC::EH_LABEL:
+ case PPC::GC_LABEL:
+ return 0;
+ default:
+ return 4; // PowerPC instructions are all 4 bytes
+ }
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
new file mode 100644
index 0000000..492634c
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -0,0 +1,168 @@
+//===- PPCInstrInfo.h - PowerPC Instruction Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC32_INSTRUCTIONINFO_H
+#define POWERPC32_INSTRUCTIONINFO_H
+
+#include "PPC.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "PPCRegisterInfo.h"
+
+namespace llvm {
+
+/// PPCII - This namespace holds all of the PowerPC target-specific
+/// per-instruction flags. These must match the corresponding definitions in
+/// PPC.td and PPCInstrFormats.td.
+namespace PPCII {
+enum {
+ // PPC970 Instruction Flags. These flags describe the characteristics of the
+ // PowerPC 970 (aka G5) dispatch groups and how they are formed out of
+ // raw machine instructions.
+
+ /// PPC970_First - This instruction starts a new dispatch group, so it will
+ /// always be the first one in the group.
+ PPC970_First = 0x1,
+
+ /// PPC970_Single - This instruction starts a new dispatch group and
+ /// terminates it, so it will be the sole instruction in the group.
+ PPC970_Single = 0x2,
+
+ /// PPC970_Cracked - This instruction is cracked into two pieces, requiring
+ /// two dispatch pipes to be available to issue.
+ PPC970_Cracked = 0x4,
+
+ /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that
+ /// an instruction is issued to.
+ PPC970_Shift = 3,
+ PPC970_Mask = 0x07 << PPC970_Shift
+};
+enum PPC970_Unit {
+ /// These are the various PPC970 execution unit pipelines. Each instruction
+ /// is one of these.
+ PPC970_Pseudo = 0 << PPC970_Shift, // Pseudo instruction
+ PPC970_FXU = 1 << PPC970_Shift, // Fixed Point (aka Integer/ALU) Unit
+ PPC970_LSU = 2 << PPC970_Shift, // Load Store Unit
+ PPC970_FPU = 3 << PPC970_Shift, // Floating Point Unit
+ PPC970_CRU = 4 << PPC970_Shift, // Control Register Unit
+ PPC970_VALU = 5 << PPC970_Shift, // Vector ALU
+ PPC970_VPERM = 6 << PPC970_Shift, // Vector Permute Unit
+ PPC970_BRU = 7 << PPC970_Shift // Branch Unit
+};
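+// (Illustrative sketch, assuming the flags live in an instruction's
+// TargetInstrDesc::TSFlags:
+//   PPC970_Unit U = (PPC970_Unit)(Desc.TSFlags & PPC970_Mask);
+// recovers the pipeline the instruction issues to.)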
+}
+
+
+class PPCInstrInfo : public TargetInstrInfoImpl {
+ PPCTargetMachine &TM;
+ const PPCRegisterInfo RI;
+
+ bool StoreRegToStackSlot(MachineFunction &MF,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ void LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+public:
+ explicit PPCInstrInfo(PPCTargetMachine &TM);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const PPCRegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+ unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+  // commuteInstruction - We can commute rlwimi instructions, but only if the
+  // rotate amount is zero.  We also have to munge the immediates a bit.
+ virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+
+ virtual void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+
+
+ // Branch analysis.
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  /// foldMemoryOperand - PowerPC (like most RISCs) can only fold spills into
+  /// copy instructions, turning them into load/store instructions.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ return 0;
+ }
+
+ virtual bool canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const;
+
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  /// GetInstSizeInBytes - Return the maximum number of bytes of code that
+  /// the specified instruction may occupy.
+  ///
+ virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
new file mode 100644
index 0000000..772e25a
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -0,0 +1,1475 @@
+//===- PPCInstrInfo.td - The PowerPC Instruction Set -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the subset of the 32-bit PowerPC instruction set, as used
+// by the PowerPC instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+include "PPCInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
+ SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+def SDT_PPCvperm : SDTypeProfile<1, 3, [
+ SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
+]>;
+
+def SDT_PPCvcmp : SDTypeProfile<1, 3, [
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
+]>;
+
+def SDT_PPCcondbr : SDTypeProfile<0, 3, [
+ SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClbrx : SDTypeProfile<1, 3, [
+ SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>
+]>;
+def SDT_PPCstbrx : SDTypeProfile<0, 4, [
+ SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>
+]>;
+
+def SDT_PPClarx : SDTypeProfile<1, 1, [
+ SDTCisInt<0>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCstcx : SDTypeProfile<0, 2, [
+ SDTCisInt<0>, SDTCisPtrTy<1>
+]>;
+
+def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
+ SDTCisPtrTy<0>, SDTCisVT<1, i32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific DAG Nodes.
+//
+
+def PPCfcfid : SDNode<"PPCISD::FCFID" , SDTFPUnaryOp, []>;
+def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
+def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
+def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
+ [SDNPHasChain, SDNPMayStore]>;
+
+// These nodes are used for long double->int conversions.  They change the
+// bits in the FPSCR, which is not otherwise modelled.
+def PPCmffs : SDNode<"PPCISD::MFFS", SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>,
+ [SDNPOutFlag]>;
+def PPCmtfsb0 : SDNode<"PPCISD::MTFSB0", SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPInFlag, SDNPOutFlag]>;
+def PPCmtfsb1 : SDNode<"PPCISD::MTFSB1", SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPInFlag, SDNPOutFlag]>;
+def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp,
+ [SDNPInFlag, SDNPOutFlag]>;
+def PPCmtfsf : SDNode<"PPCISD::MTFSF", SDTypeProfile<1, 3,
+ [SDTCisVT<0, f64>, SDTCisInt<1>, SDTCisVT<2, f64>,
+ SDTCisVT<3, f64>]>,
+ [SDNPInFlag]>;
+
+def PPCfsel : SDNode<"PPCISD::FSEL",
+ // Type constraint for fsel.
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
+ SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;
+
+def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
+def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
+def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
+def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
+
+def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+
+// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
+// amounts. These nodes are generated by the multi-precision shift code.
+def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>;
+def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>;
+def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>;
+
+def PPCextsw_32 : SDNode<"PPCISD::EXTSW_32" , SDTIntUnaryOp>;
+def PPCstd_32 : SDNode<"PPCISD::STD_32" , SDTStore,
+ [SDNPHasChain, SDNPMayStore]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def PPCcall_Macho : SDNode<"PPCISD::CALL_Macho", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCcall_ELF : SDNode<"PPCISD::CALL_ELF", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCbctrl_Macho : SDNode<"PPCISD::BCTRL_Macho", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def PPCbctrl_ELF : SDNode<"PPCISD::BCTRL_ELF", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def PPCtailcall : SDNode<"PPCISD::TAILCALL", SDT_PPCCall,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
+def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutFlag]>;
+
+def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
+ [SDNPHasChain, SDNPMayLoad]>;
+def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
+ [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to support atomic operations
+def PPClarx : SDNode<"PPCISD::LARX", SDT_PPClarx,
+ [SDNPHasChain, SDNPMayLoad]>;
+def PPCstcx : SDNode<"PPCISD::STCX", SDT_PPCstcx,
+ [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to support dynamic alloca.
+def SDTDynOp : SDTypeProfile<1, 2, []>;
+def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific transformation functions and pattern fragments.
+//
+
+def SHL32 : SDNodeXForm<imm, [{
+ // Transformation function: 31 - imm
+ return getI32Imm(31 - N->getZExtValue());
+}]>;
+
+def SRL32 : SDNodeXForm<imm, [{
+ // Transformation function: 32 - imm
+ return N->getZExtValue() ? getI32Imm(32 - N->getZExtValue()) : getI32Imm(0);
+}]>;
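+// (Illustrative: these feed the rlwinm-based shift patterns later in this
+// file; e.g. a left shift by 5 becomes rlwinm rD, rS, 5, 0, 26 with
+// SHL32(5) = 26, and a logical right shift by 8 becomes
+// rlwinm rD, rS, 24, 8, 31 with SRL32(8) = 24.)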
+
+def LO16 : SDNodeXForm<imm, [{
+ // Transformation function: get the low 16 bits.
+ return getI32Imm((unsigned short)N->getZExtValue());
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+ // Transformation function: shift the immediate value down into the low bits.
+ return getI32Imm((unsigned)N->getZExtValue() >> 16);
+}]>;
+
+def HA16 : SDNodeXForm<imm, [{
+  // Transformation function: return the high 16 bits, adjusted for the sign
+  // of the low 16 bits (the "@ha" value that pairs with a low-half add).
+ signed int Val = N->getZExtValue();
+ return getI32Imm((Val - (signed short)Val) >> 16);
+}]>;
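+// (Worked example: Val = 0x12348000 has a negative low half, so HA16 yields
+// (0x12348000 - (-32768)) >> 16 = 0x12350000 >> 16 = 0x1235; addis of 0x1235
+// followed by addi of -32768 then reconstructs 0x12348000.)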
+def MB : SDNodeXForm<imm, [{
+ // Transformation function: get the start bit of a mask
+ unsigned mb = 0, me;
+ (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+ return getI32Imm(mb);
+}]>;
+
+def ME : SDNodeXForm<imm, [{
+ // Transformation function: get the end bit of a mask
+ unsigned mb, me = 0;
+ (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+ return getI32Imm(me);
+}]>;
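+// (Worked example: 0x00000FF0 is a run of ones from PPC bit 20 through bit 27
+// (bit 0 being the MSB), so MB returns 20 and ME returns 27.)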
+def maskimm32 : PatLeaf<(imm), [{
+  // maskimm32 predicate - True if the immediate is a run of ones.
+ unsigned mb, me;
+ if (N->getValueType(0) == MVT::i32)
+ return isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+ else
+ return false;
+}]>;
+
+def immSExt16 : PatLeaf<(imm), [{
+ // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended
+ // field. Used by instructions like 'addi'.
+ if (N->getValueType(0) == MVT::i32)
+ return (int32_t)N->getZExtValue() == (short)N->getZExtValue();
+ else
+ return (int64_t)N->getZExtValue() == (short)N->getZExtValue();
+}]>;
+def immZExt16 : PatLeaf<(imm), [{
+ // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+ // field. Used by instructions like 'ori'.
+ return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// imm16Shifted* - These match immediates where the low 16-bits are zero. There
+// are two forms: imm16ShiftedSExt and imm16ShiftedZExt. These two forms are
+// identical in 32-bit mode, but in 64-bit mode, they return true if the
+// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
+// clear).
+def imm16ShiftedZExt : PatLeaf<(imm), [{
+ // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
+ // immediate are set. Used by instructions like 'xoris'.
+ return (N->getZExtValue() & ~uint64_t(0xFFFF0000)) == 0;
+}], HI16>;
+
+def imm16ShiftedSExt : PatLeaf<(imm), [{
+ // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
+ // immediate are set. Used by instructions like 'addis'. Identical to
+ // imm16ShiftedZExt in 32-bit mode.
+ if (N->getZExtValue() & 0xFFFF) return false;
+ if (N->getValueType(0) == MVT::i32)
+ return true;
+  // For 64-bit, make sure the value sign-extends correctly.
+ return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
+}], HI16>;
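+// (For example, 0x12340000 satisfies both predicates.  In 64-bit mode,
+// 0x80000000 still satisfies imm16ShiftedZExt but not imm16ShiftedSExt,
+// because sign-extending it from 32 bits gives 0xFFFFFFFF80000000.)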
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Flag Definitions.
+
+class isPPC64 { bit PPC64 = 1; }
+class isDOT {
+ list<Register> Defs = [CR0];
+ bit RC = 1;
+}
+
+class RegConstraint<string C> {
+ string Constraints = C;
+}
+class NoEncode<string E> {
+ string DisableEncoding = E;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Operand Definitions.
+
+def s5imm : Operand<i32> {
+ let PrintMethod = "printS5ImmOperand";
+}
+def u5imm : Operand<i32> {
+ let PrintMethod = "printU5ImmOperand";
+}
+def u6imm : Operand<i32> {
+ let PrintMethod = "printU6ImmOperand";
+}
+def s16imm : Operand<i32> {
+ let PrintMethod = "printS16ImmOperand";
+}
+def u16imm : Operand<i32> {
+ let PrintMethod = "printU16ImmOperand";
+}
+def s16immX4 : Operand<i32> { // Multiply imm by 4 before printing.
+ let PrintMethod = "printS16X4ImmOperand";
+}
+def target : Operand<OtherVT> {
+ let PrintMethod = "printBranchOperand";
+}
+def calltarget : Operand<iPTR> {
+ let PrintMethod = "printCallOperand";
+}
+def aaddr : Operand<iPTR> {
+ let PrintMethod = "printAbsAddrOperand";
+}
+def piclabel: Operand<iPTR> {
+ let PrintMethod = "printPICLabel";
+}
+def symbolHi: Operand<i32> {
+ let PrintMethod = "printSymbolHi";
+}
+def symbolLo: Operand<i32> {
+ let PrintMethod = "printSymbolLo";
+}
+def crbitm: Operand<i8> {
+ let PrintMethod = "printcrbitm";
+}
+// Address operands
+def memri : Operand<iPTR> {
+ let PrintMethod = "printMemRegImm";
+ let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
+}
+def memrr : Operand<iPTR> {
+ let PrintMethod = "printMemRegReg";
+ let MIOperandInfo = (ops ptr_rc, ptr_rc);
+}
+def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits.
+ let PrintMethod = "printMemRegImmShifted";
+ let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
+}
+
+// PowerPC Predicate operand.  The default value 20 = (0<<5)|20 encodes
+// "branch always"; CR0 is then a dummy reg that doesn't matter.
+def pred : PredicateOperand<OtherVT, (ops imm, CRRC),
+ (ops (i32 20), (i32 zero_reg))> {
+ let PrintMethod = "printPredicateOperand";
+}
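+// (Illustrative: a BCC whose pred operand is (PPC::PRED_EQ, CR0) prints as
+// "beq cr0, <target>" via the ${cond:cc} and ${cond:reg} modifiers used by
+// BCC below.)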
+
+// Define PowerPC specific addressing mode.
+def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>;
+def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>;
+def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std"
+
+/// This is just the offset part of iaddr, used for preinc.
+def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Predicate Definitions.
+def FPContractions : Predicate<"!NoExcessFPPrecision">;
+def In32BitMode : Predicate<"!PPCSubTarget.isPPC64()">;
+def In64BitMode : Predicate<"PPCSubTarget.isPPC64()">;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Definitions.
+
+// Pseudo-instructions:
+
+let hasCtrlDep = 1 in {
+let Defs = [R1], Uses = [R1] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt),
+ "${:comment} ADJCALLSTACKDOWN",
+ [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "${:comment} ADJCALLSTACKUP",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+def UPDATE_VRSAVE : Pseudo<(outs GPRC:$rD), (ins GPRC:$rS),
+ "UPDATE_VRSAVE $rD, $rS", []>;
+}
+
+let Defs = [R1], Uses = [R1] in
+def DYNALLOC : Pseudo<(outs GPRC:$result), (ins GPRC:$negsize, memri:$fpsi),
+ "${:comment} DYNALLOC $result, $negsize, $fpsi",
+ [(set GPRC:$result,
+ (PPCdynalloc GPRC:$negsize, iaddr:$fpsi))]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded by the
+// scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1, // Expanded by the scheduler.
+ PPC970_Single = 1 in {
+ def SELECT_CC_I4 : Pseudo<(outs GPRC:$dst), (ins CRRC:$cond, GPRC:$T, GPRC:$F,
+ i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+ []>;
+ def SELECT_CC_I8 : Pseudo<(outs G8RC:$dst), (ins CRRC:$cond, G8RC:$T, G8RC:$F,
+ i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+ []>;
+ def SELECT_CC_F4 : Pseudo<(outs F4RC:$dst), (ins CRRC:$cond, F4RC:$T, F4RC:$F,
+ i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+ []>;
+ def SELECT_CC_F8 : Pseudo<(outs F8RC:$dst), (ins CRRC:$cond, F8RC:$T, F8RC:$F,
+ i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+ []>;
+ def SELECT_CC_VRRC: Pseudo<(outs VRRC:$dst), (ins CRRC:$cond, VRRC:$T, VRRC:$F,
+ i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+ []>;
+}
+
+// SPILL_CR - Indicate that we're spilling the CR register, so we'll need to
+// scavenge a register for it.
+def SPILL_CR : Pseudo<(outs), (ins GPRC:$cond, memri:$F),
+ "${:comment} SPILL_CR $cond $F", []>;
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+ let isReturn = 1, Uses = [LR, RM] in
+ def BLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$p),
+ "b${p:cc}lr ${p:reg}", BrB,
+ [(retflag)]>;
+ let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in
+ def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>;
+}
+
+let Defs = [LR] in
+ def MovePCtoLR : Pseudo<(outs), (ins piclabel:$label), "bl $label", []>,
+ PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+ let isBarrier = 1 in {
+ def B : IForm<18, 0, 0, (outs), (ins target:$dst),
+ "b $dst", BrB,
+ [(br bb:$dst)]>;
+ }
+
+ // BCC represents an arbitrary conditional branch on a predicate.
+ // FIXME: should be able to write a pattern for PPCcondbranch, but can't use
+ // a two-value operand where a dag node expects two operands. :(
+ def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, target:$dst),
+ "b${cond:cc} ${cond:reg}, $dst"
+ /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
+}
+
+// Macho ABI Calls.
+let isCall = 1, PPC970_Unit = 7,
+ // All calls clobber the non-callee saved registers...
+ Defs = [R0,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,
+ F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+ V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+ LR,CTR,
+ CR0,CR1,CR5,CR6,CR7,
+ CR0LT,CR0GT,CR0EQ,CR0UN,CR1LT,CR1GT,CR1EQ,CR1UN,CR5LT,CR5GT,CR5EQ,
+ CR5UN,CR6LT,CR6GT,CR6EQ,CR6UN,CR7LT,CR7GT,CR7EQ,CR7UN] in {
+ // Convenient aliases for call instructions
+ let Uses = [RM] in {
+ def BL_Macho : IForm<18, 0, 1,
+ (outs), (ins calltarget:$func, variable_ops),
+ "bl $func", BrB, []>; // See Pat patterns below.
+ def BLA_Macho : IForm<18, 1, 1,
+ (outs), (ins aaddr:$func, variable_ops),
+ "bla $func", BrB, [(PPCcall_Macho (i32 imm:$func))]>;
+ }
+ let Uses = [CTR, RM] in {
+ def BCTRL_Macho : XLForm_2_ext<19, 528, 20, 0, 1,
+ (outs), (ins variable_ops),
+ "bctrl", BrB,
+ [(PPCbctrl_Macho)]>, Requires<[In32BitMode]>;
+ }
+}
+
+// ELF ABI Calls.
+let isCall = 1, PPC970_Unit = 7,
+ // All calls clobber the non-callee saved registers...
+ Defs = [R0,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,
+ F0,F1,F2,F3,F4,F5,F6,F7,F8,
+ V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+ LR,CTR,
+ CR0,CR1,CR5,CR6,CR7,
+ CR0LT,CR0GT,CR0EQ,CR0UN,CR1LT,CR1GT,CR1EQ,CR1UN,CR5LT,CR5GT,CR5EQ,
+ CR5UN,CR6LT,CR6GT,CR6EQ,CR6UN,CR7LT,CR7GT,CR7EQ,CR7UN] in {
+ // Convenient aliases for call instructions
+ let Uses = [RM] in {
+ def BL_ELF : IForm<18, 0, 1,
+ (outs), (ins calltarget:$func, variable_ops),
+ "bl $func", BrB, []>; // See Pat patterns below.
+ def BLA_ELF : IForm<18, 1, 1,
+ (outs), (ins aaddr:$func, variable_ops),
+ "bla $func", BrB,
+ [(PPCcall_ELF (i32 imm:$func))]>;
+ }
+ let Uses = [CTR, RM] in {
+ def BCTRL_ELF : XLForm_2_ext<19, 528, 20, 0, 1,
+ (outs), (ins variable_ops),
+ "bctrl", BrB,
+ [(PPCbctrl_ELF)]>, Requires<[In32BitMode]>;
+ }
+}
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi :Pseudo< (outs),
+ (ins calltarget:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURNd $dst $offset",
+ []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops),
+ "#TC_RETURNa $func $offset",
+ [(PPCtc_return (i32 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURNr $dst $offset",
+ []>;
+
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+ isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
+def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+ Requires<[In32BitMode]>;
+
+
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+ "b $dst", BrB,
+ []>;
+
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA : IForm<18, 0, 0, (outs), (ins aaddr:$dst),
+ "ba $dst", BrB,
+ []>;
+
+
+// DCB* instructions.
+def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst),
+ "dcba $dst", LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBF : DCB_Form<86, 0, (outs), (ins memrr:$dst),
+ "dcbf $dst", LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBI : DCB_Form<470, 0, (outs), (ins memrr:$dst),
+ "dcbi $dst", LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBST : DCB_Form<54, 0, (outs), (ins memrr:$dst),
+ "dcbst $dst", LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBT : DCB_Form<278, 0, (outs), (ins memrr:$dst),
+ "dcbt $dst", LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBTST : DCB_Form<246, 0, (outs), (ins memrr:$dst),
+ "dcbtst $dst", LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBZ : DCB_Form<1014, 0, (outs), (ins memrr:$dst),
+ "dcbz $dst", LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst),
+ "dcbzl $dst", LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+
+// Atomic operations
+let usesCustomDAGSchedInserter = 1 in {
+ let Uses = [CR0] in {
+ def ATOMIC_LOAD_ADD_I8 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_ADD_I8 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_add_8 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_SUB_I8 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_SUB_I8 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_sub_8 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_AND_I8 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_AND_I8 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_and_8 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_OR_I8 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_OR_I8 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_or_8 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_XOR_I8 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_XOR_I8 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_xor_8 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_NAND_I8 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_NAND_I8 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_nand_8 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_ADD_I16 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_ADD_I16 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_add_16 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_SUB_I16 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_SUB_I16 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_sub_16 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_AND_I16 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_AND_I16 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_and_16 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_OR_I16 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_OR_I16 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_or_16 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_XOR_I16 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_XOR_I16 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_xor_16 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_NAND_I16 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_NAND_I16 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_nand_16 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_ADD_I32 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_ADD_I32 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_add_32 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_SUB_I32 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_SUB_I32 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_sub_32 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_AND_I32 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_AND_I32 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_and_32 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_OR_I32 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_OR_I32 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_or_32 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_XOR_I32 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_XOR_I32 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_xor_32 xoaddr:$ptr, GPRC:$incr))]>;
+ def ATOMIC_LOAD_NAND_I32 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+ "${:comment} ATOMIC_LOAD_NAND_I32 PSEUDO!",
+ [(set GPRC:$dst, (atomic_load_nand_32 xoaddr:$ptr, GPRC:$incr))]>;
+
+ def ATOMIC_CMP_SWAP_I8 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new),
+ "${:comment} ATOMIC_CMP_SWAP_I8 PSEUDO!",
+ [(set GPRC:$dst,
+ (atomic_cmp_swap_8 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
+ def ATOMIC_CMP_SWAP_I16 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new),
+ "${:comment} ATOMIC_CMP_SWAP_I16 PSEUDO!",
+ [(set GPRC:$dst,
+ (atomic_cmp_swap_16 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
+ def ATOMIC_CMP_SWAP_I32 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new),
+ "${:comment} ATOMIC_CMP_SWAP_I32 PSEUDO!",
+ [(set GPRC:$dst,
+ (atomic_cmp_swap_32 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
+
+ def ATOMIC_SWAP_I8 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new),
+ "${:comment} ATOMIC_SWAP_I8 PSEUDO!",
+ [(set GPRC:$dst, (atomic_swap_8 xoaddr:$ptr, GPRC:$new))]>;
+ def ATOMIC_SWAP_I16 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new),
+ "${:comment} ATOMIC_SWAP_I16 PSEUDO!",
+ [(set GPRC:$dst, (atomic_swap_16 xoaddr:$ptr, GPRC:$new))]>;
+ def ATOMIC_SWAP_I32 : Pseudo<
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new),
+ "${:comment} ATOMIC_SWAP_I32 PSEUDO!",
+ [(set GPRC:$dst, (atomic_swap_32 xoaddr:$ptr, GPRC:$new))]>;
+ }
+}
+
+// Instructions to support atomic operations
+def LWARX : XForm_1<31, 20, (outs GPRC:$rD), (ins memrr:$src),
+ "lwarx $rD, $src", LdStLWARX,
+ [(set GPRC:$rD, (PPClarx xoaddr:$src))]>;
+
+let Defs = [CR0] in
+def STWCX : XForm_1<31, 150, (outs), (ins GPRC:$rS, memrr:$dst),
+ "stwcx. $rS, $dst", LdStSTWCX,
+ [(PPCstcx GPRC:$rS, xoaddr:$dst)]>,
+ isDOT;
+
+let isBarrier = 1, hasCtrlDep = 1 in
+def TRAP : XForm_24<31, 4, (outs), (ins), "trap", LdStGeneral, [(trap)]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Load Instructions.
+//
+
+// Unindexed (r+i) Loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZ : DForm_1<34, (outs GPRC:$rD), (ins memri:$src),
+ "lbz $rD, $src", LdStGeneral,
+ [(set GPRC:$rD, (zextloadi8 iaddr:$src))]>;
+def LHA : DForm_1<42, (outs GPRC:$rD), (ins memri:$src),
+ "lha $rD, $src", LdStLHA,
+ [(set GPRC:$rD, (sextloadi16 iaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LHZ : DForm_1<40, (outs GPRC:$rD), (ins memri:$src),
+ "lhz $rD, $src", LdStGeneral,
+ [(set GPRC:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ : DForm_1<32, (outs GPRC:$rD), (ins memri:$src),
+ "lwz $rD, $src", LdStGeneral,
+ [(set GPRC:$rD, (load iaddr:$src))]>;
+
+def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src),
+ "lfs $rD, $src", LdStLFDU,
+ [(set F4RC:$rD, (load iaddr:$src))]>;
+def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
+ "lfd $rD, $src", LdStLFD,
+ [(set F8RC:$rD, (load iaddr:$src))]>;
+
+
+// Unindexed (r+i) Loads with Update (preinc).
+let mayLoad = 1 in {
+def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lbzu $rD, $addr", LdStGeneral,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lhau $rD, $addr", LdStGeneral,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lhzu $rD, $addr", LdStGeneral,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lwzu $rD, $addr", LdStGeneral,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LFSU : DForm_1<49, (outs F4RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lfs $rD, $addr", LdStLFDU,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+
+def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+ "lfd $rD, $addr", LdStLFD,
+ []>, RegConstraint<"$addr.reg = $ea_result">,
+ NoEncode<"$ea_result">;
+}
+}
+
+// Indexed (r+r) Loads.
+//
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZX : XForm_1<31, 87, (outs GPRC:$rD), (ins memrr:$src),
+ "lbzx $rD, $src", LdStGeneral,
+ [(set GPRC:$rD, (zextloadi8 xaddr:$src))]>;
+def LHAX : XForm_1<31, 343, (outs GPRC:$rD), (ins memrr:$src),
+ "lhax $rD, $src", LdStLHA,
+ [(set GPRC:$rD, (sextloadi16 xaddr:$src))]>,
+ PPC970_DGroup_Cracked;
+def LHZX : XForm_1<31, 279, (outs GPRC:$rD), (ins memrr:$src),
+ "lhzx $rD, $src", LdStGeneral,
+ [(set GPRC:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX : XForm_1<31, 23, (outs GPRC:$rD), (ins memrr:$src),
+ "lwzx $rD, $src", LdStGeneral,
+ [(set GPRC:$rD, (load xaddr:$src))]>;
+
+
+def LHBRX : XForm_1<31, 790, (outs GPRC:$rD), (ins memrr:$src),
+ "lhbrx $rD, $src", LdStGeneral,
+ [(set GPRC:$rD, (PPClbrx xoaddr:$src, srcvalue:$sv, i16))]>;
+def LWBRX : XForm_1<31, 534, (outs GPRC:$rD), (ins memrr:$src),
+ "lwbrx $rD, $src", LdStGeneral,
+ [(set GPRC:$rD, (PPClbrx xoaddr:$src, srcvalue:$sv, i32))]>;
+
+def LFSX : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src),
+ "lfsx $frD, $src", LdStLFDU,
+ [(set F4RC:$frD, (load xaddr:$src))]>;
+def LFDX : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src),
+ "lfdx $frD, $src", LdStLFDU,
+ [(set F8RC:$frD, (load xaddr:$src))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// PPC32 Store Instructions.
+//
+
+// Unindexed (r+i) Stores.
+let PPC970_Unit = 2 in {
+def STB : DForm_1<38, (outs), (ins GPRC:$rS, memri:$src),
+ "stb $rS, $src", LdStGeneral,
+ [(truncstorei8 GPRC:$rS, iaddr:$src)]>;
+def STH : DForm_1<44, (outs), (ins GPRC:$rS, memri:$src),
+ "sth $rS, $src", LdStGeneral,
+ [(truncstorei16 GPRC:$rS, iaddr:$src)]>;
+def STW : DForm_1<36, (outs), (ins GPRC:$rS, memri:$src),
+ "stw $rS, $src", LdStGeneral,
+ [(store GPRC:$rS, iaddr:$src)]>;
+def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst),
+ "stfs $rS, $dst", LdStUX,
+ [(store F4RC:$rS, iaddr:$dst)]>;
+def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
+ "stfd $rS, $dst", LdStUX,
+ [(store F8RC:$rS, iaddr:$dst)]>;
+}
+
+// Unindexed (r+i) Stores with Update (preinc).
+let PPC970_Unit = 2 in {
+def STBU : DForm_1<39, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "stbu $rS, $ptroff($ptrreg)", LdStGeneral,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STHU : DForm_1<45, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "sthu $rS, $ptroff($ptrreg)", LdStGeneral,
+ [(set ptr_rc:$ea_res,
+ (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU : DForm_1<37, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "stwu $rS, $ptroff($ptrreg)", LdStGeneral,
+ [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STFSU : DForm_1<53, (outs ptr_rc:$ea_res), (ins F4RC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "stfsu $rS, $ptroff($ptrreg)", LdStGeneral,
+ [(set ptr_rc:$ea_res, (pre_store F4RC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STFDU : DForm_1<55, (outs ptr_rc:$ea_res), (ins F8RC:$rS,
+ symbolLo:$ptroff, ptr_rc:$ptrreg),
+ "stfdu $rS, $ptroff($ptrreg)", LdStGeneral,
+ [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg,
+ iaddroff:$ptroff))]>,
+ RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+}
+
+
+// Indexed (r+r) Stores.
+//
+let PPC970_Unit = 2 in {
+def STBX : XForm_8<31, 215, (outs), (ins GPRC:$rS, memrr:$dst),
+ "stbx $rS, $dst", LdStGeneral,
+ [(truncstorei8 GPRC:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STHX : XForm_8<31, 407, (outs), (ins GPRC:$rS, memrr:$dst),
+ "sthx $rS, $dst", LdStGeneral,
+ [(truncstorei16 GPRC:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+def STWX : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst),
+ "stwx $rS, $dst", LdStGeneral,
+ [(store GPRC:$rS, xaddr:$dst)]>,
+ PPC970_DGroup_Cracked;
+
+let mayStore = 1 in {
+def STWUX : XForm_8<31, 183, (outs), (ins GPRC:$rS, GPRC:$rA, GPRC:$rB),
+ "stwux $rS, $rA, $rB", LdStGeneral,
+ []>;
+}
+def STHBRX: XForm_8<31, 918, (outs), (ins GPRC:$rS, memrr:$dst),
+ "sthbrx $rS, $dst", LdStGeneral,
+ [(PPCstbrx GPRC:$rS, xoaddr:$dst, srcvalue:$dummy, i16)]>,
+ PPC970_DGroup_Cracked;
+def STWBRX: XForm_8<31, 662, (outs), (ins GPRC:$rS, memrr:$dst),
+ "stwbrx $rS, $dst", LdStGeneral,
+ [(PPCstbrx GPRC:$rS, xoaddr:$dst, srcvalue:$dummy, i32)]>,
+ PPC970_DGroup_Cracked;
+
+def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst),
+ "stfiwx $frS, $dst", LdStUX,
+ [(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>;
+
+def STFSX : XForm_28<31, 663, (outs), (ins F4RC:$frS, memrr:$dst),
+ "stfsx $frS, $dst", LdStUX,
+ [(store F4RC:$frS, xaddr:$dst)]>;
+def STFDX : XForm_28<31, 727, (outs), (ins F8RC:$frS, memrr:$dst),
+ "stfdx $frS, $dst", LdStUX,
+ [(store F8RC:$frS, xaddr:$dst)]>;
+}
+
+let isBarrier = 1 in
+def SYNC : XForm_24_sync<31, 598, (outs), (ins),
+ "sync", LdStSync,
+ [(int_ppc_sync)]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Arithmetic Instructions.
+//
+
+let PPC970_Unit = 1 in { // FXU Operations.
+def ADDI : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+ "addi $rD, $rA, $imm", IntGeneral,
+ [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>;
+def ADDIC : DForm_2<12, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+ "addic $rD, $rA, $imm", IntGeneral,
+ [(set GPRC:$rD, (addc GPRC:$rA, immSExt16:$imm))]>,
+ PPC970_DGroup_Cracked;
+def ADDICo : DForm_2<13, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+ "addic. $rD, $rA, $imm", IntGeneral,
+ []>;
+def ADDIS : DForm_2<15, (outs GPRC:$rD), (ins GPRC:$rA, symbolHi:$imm),
+ "addis $rD, $rA, $imm", IntGeneral,
+ [(set GPRC:$rD, (add GPRC:$rA, imm16ShiftedSExt:$imm))]>;
+def LA : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$sym),
+ "la $rD, $sym($rA)", IntGeneral,
+ [(set GPRC:$rD, (add GPRC:$rA,
+ (PPClo tglobaladdr:$sym, 0)))]>;
+def MULLI : DForm_2< 7, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+ "mulli $rD, $rA, $imm", IntMulLI,
+ [(set GPRC:$rD, (mul GPRC:$rA, immSExt16:$imm))]>;
+def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+ "subfic $rD, $rA, $imm", IntGeneral,
+ [(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>;
+
+let isReMaterializable = 1 in {
+ def LI : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
+ "li $rD, $imm", IntGeneral,
+ [(set GPRC:$rD, immSExt16:$imm)]>;
+ def LIS : DForm_2_r0<15, (outs GPRC:$rD), (ins symbolHi:$imm),
+ "lis $rD, $imm", IntGeneral,
+ [(set GPRC:$rD, imm16ShiftedSExt:$imm)]>;
+}
+}
+
+let PPC970_Unit = 1 in { // FXU Operations.
+def ANDIo : DForm_4<28, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+ "andi. $dst, $src1, $src2", IntGeneral,
+ [(set GPRC:$dst, (and GPRC:$src1, immZExt16:$src2))]>,
+ isDOT;
+def ANDISo : DForm_4<29, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+ "andis. $dst, $src1, $src2", IntGeneral,
+ [(set GPRC:$dst, (and GPRC:$src1,imm16ShiftedZExt:$src2))]>,
+ isDOT;
+def ORI : DForm_4<24, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+ "ori $dst, $src1, $src2", IntGeneral,
+ [(set GPRC:$dst, (or GPRC:$src1, immZExt16:$src2))]>;
+def ORIS : DForm_4<25, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+ "oris $dst, $src1, $src2", IntGeneral,
+ [(set GPRC:$dst, (or GPRC:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI : DForm_4<26, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+ "xori $dst, $src1, $src2", IntGeneral,
+ [(set GPRC:$dst, (xor GPRC:$src1, immZExt16:$src2))]>;
+def XORIS : DForm_4<27, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+ "xoris $dst, $src1, $src2", IntGeneral,
+ [(set GPRC:$dst, (xor GPRC:$src1,imm16ShiftedZExt:$src2))]>;
+def NOP : DForm_4_zero<24, (outs), (ins), "nop", IntGeneral,
+ []>;
+def CMPWI : DForm_5_ext<11, (outs CRRC:$crD), (ins GPRC:$rA, s16imm:$imm),
+ "cmpwi $crD, $rA, $imm", IntCompare>;
+def CMPLWI : DForm_6_ext<10, (outs CRRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+ "cmplwi $dst, $src1, $src2", IntCompare>;
+}
+
+
+let PPC970_Unit = 1 in { // FXU Operations.
+def NAND : XForm_6<31, 476, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "nand $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (not (and GPRC:$rS, GPRC:$rB)))]>;
+def AND : XForm_6<31, 28, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "and $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (and GPRC:$rS, GPRC:$rB))]>;
+def ANDC : XForm_6<31, 60, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "andc $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (and GPRC:$rS, (not GPRC:$rB)))]>;
+def OR : XForm_6<31, 444, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "or $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (or GPRC:$rS, GPRC:$rB))]>;
+def NOR : XForm_6<31, 124, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "nor $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (not (or GPRC:$rS, GPRC:$rB)))]>;
+def ORC : XForm_6<31, 412, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "orc $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (or GPRC:$rS, (not GPRC:$rB)))]>;
+def EQV : XForm_6<31, 284, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "eqv $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (not (xor GPRC:$rS, GPRC:$rB)))]>;
+def XOR : XForm_6<31, 316, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "xor $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (xor GPRC:$rS, GPRC:$rB))]>;
+def SLW : XForm_6<31, 24, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "slw $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (PPCshl GPRC:$rS, GPRC:$rB))]>;
+def SRW : XForm_6<31, 536, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "srw $rA, $rS, $rB", IntGeneral,
+ [(set GPRC:$rA, (PPCsrl GPRC:$rS, GPRC:$rB))]>;
+def SRAW : XForm_6<31, 792, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+ "sraw $rA, $rS, $rB", IntShift,
+ [(set GPRC:$rA, (PPCsra GPRC:$rS, GPRC:$rB))]>;
+}
+
+let PPC970_Unit = 1 in { // FXU Operations.
+def SRAWI : XForm_10<31, 824, (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH),
+ "srawi $rA, $rS, $SH", IntShift,
+ [(set GPRC:$rA, (sra GPRC:$rS, (i32 imm:$SH)))]>;
+def CNTLZW : XForm_11<31, 26, (outs GPRC:$rA), (ins GPRC:$rS),
+ "cntlzw $rA, $rS", IntGeneral,
+ [(set GPRC:$rA, (ctlz GPRC:$rS))]>;
+def EXTSB : XForm_11<31, 954, (outs GPRC:$rA), (ins GPRC:$rS),
+ "extsb $rA, $rS", IntGeneral,
+ [(set GPRC:$rA, (sext_inreg GPRC:$rS, i8))]>;
+def EXTSH : XForm_11<31, 922, (outs GPRC:$rA), (ins GPRC:$rS),
+ "extsh $rA, $rS", IntGeneral,
+ [(set GPRC:$rA, (sext_inreg GPRC:$rS, i16))]>;
+
+def CMPW : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB),
+ "cmpw $crD, $rA, $rB", IntCompare>;
+def CMPLW : XForm_16_ext<31, 32, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB),
+ "cmplw $crD, $rA, $rB", IntCompare>;
+}
+let PPC970_Unit = 3 in { // FPU Operations.
+//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
+// "fcmpo $crD, $fA, $fB", FPCompare>;
+def FCMPUS : XForm_17<63, 0, (outs CRRC:$crD), (ins F4RC:$fA, F4RC:$fB),
+ "fcmpu $crD, $fA, $fB", FPCompare>;
+def FCMPUD : XForm_17<63, 0, (outs CRRC:$crD), (ins F8RC:$fA, F8RC:$fB),
+ "fcmpu $crD, $fA, $fB", FPCompare>;
+
+let Uses = [RM] in {
+ def FCTIWZ : XForm_26<63, 15, (outs F8RC:$frD), (ins F8RC:$frB),
+ "fctiwz $frD, $frB", FPGeneral,
+ [(set F8RC:$frD, (PPCfctiwz F8RC:$frB))]>;
+ def FRSP : XForm_26<63, 12, (outs F4RC:$frD), (ins F8RC:$frB),
+ "frsp $frD, $frB", FPGeneral,
+ [(set F4RC:$frD, (fround F8RC:$frB))]>;
+ def FSQRT : XForm_26<63, 22, (outs F8RC:$frD), (ins F8RC:$frB),
+ "fsqrt $frD, $frB", FPSqrt,
+ [(set F8RC:$frD, (fsqrt F8RC:$frB))]>;
+ def FSQRTS : XForm_26<59, 22, (outs F4RC:$frD), (ins F4RC:$frB),
+ "fsqrts $frD, $frB", FPSqrt,
+ [(set F4RC:$frD, (fsqrt F4RC:$frB))]>;
+ }
+}
+
+/// FMR is split into 3 versions: one each for 4- and 8-byte FP, and one for
+/// extending.
+///
+/// Note that these are defined as pseudo-ops on the PPC970 because they are
+/// often coalesced away and we don't want the dispatch group builder to think
+/// that they will fill slots (which could cause the load of a LSU reject to
+/// sneak into a d-group with a store).
+def FMRS : XForm_26<63, 72, (outs F4RC:$frD), (ins F4RC:$frB),
+ "fmr $frD, $frB", FPGeneral,
+ []>, // (set F4RC:$frD, F4RC:$frB)
+ PPC970_Unit_Pseudo;
+def FMRD : XForm_26<63, 72, (outs F8RC:$frD), (ins F8RC:$frB),
+ "fmr $frD, $frB", FPGeneral,
+ []>, // (set F8RC:$frD, F8RC:$frB)
+ PPC970_Unit_Pseudo;
+def FMRSD : XForm_26<63, 72, (outs F8RC:$frD), (ins F4RC:$frB),
+ "fmr $frD, $frB", FPGeneral,
+ [(set F8RC:$frD, (fextend F4RC:$frB))]>,
+ PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3 in { // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+def FABSS : XForm_26<63, 264, (outs F4RC:$frD), (ins F4RC:$frB),
+ "fabs $frD, $frB", FPGeneral,
+ [(set F4RC:$frD, (fabs F4RC:$frB))]>;
+def FABSD : XForm_26<63, 264, (outs F8RC:$frD), (ins F8RC:$frB),
+ "fabs $frD, $frB", FPGeneral,
+ [(set F8RC:$frD, (fabs F8RC:$frB))]>;
+def FNABSS : XForm_26<63, 136, (outs F4RC:$frD), (ins F4RC:$frB),
+ "fnabs $frD, $frB", FPGeneral,
+ [(set F4RC:$frD, (fneg (fabs F4RC:$frB)))]>;
+def FNABSD : XForm_26<63, 136, (outs F8RC:$frD), (ins F8RC:$frB),
+ "fnabs $frD, $frB", FPGeneral,
+ [(set F8RC:$frD, (fneg (fabs F8RC:$frB)))]>;
+def FNEGS : XForm_26<63, 40, (outs F4RC:$frD), (ins F4RC:$frB),
+ "fneg $frD, $frB", FPGeneral,
+ [(set F4RC:$frD, (fneg F4RC:$frB))]>;
+def FNEGD : XForm_26<63, 40, (outs F8RC:$frD), (ins F8RC:$frB),
+ "fneg $frD, $frB", FPGeneral,
+ [(set F8RC:$frD, (fneg F8RC:$frB))]>;
+}
+
+
+// XL-Form instructions. Condition register logical ops.
+//
+def MCRF : XLForm_3<19, 0, (outs CRRC:$BF), (ins CRRC:$BFA),
+ "mcrf $BF, $BFA", BrMCR>,
+ PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def CREQV : XLForm_1<19, 289, (outs CRBITRC:$CRD),
+ (ins CRBITRC:$CRA, CRBITRC:$CRB),
+ "creqv $CRD, $CRA, $CRB", BrCR,
+ []>;
+
+def CROR : XLForm_1<19, 449, (outs CRBITRC:$CRD),
+ (ins CRBITRC:$CRA, CRBITRC:$CRB),
+ "cror $CRD, $CRA, $CRB", BrCR,
+ []>;
+
+def CRSET : XLForm_1_ext<19, 289, (outs CRBITRC:$dst), (ins),
+ "creqv $dst, $dst, $dst", BrCR,
+ []>;
+
+// XFX-Form instructions. Instructions that deal with SPRs.
+//
+let Uses = [CTR] in {
+def MFCTR : XFXForm_1_ext<31, 339, 9, (outs GPRC:$rT), (ins),
+ "mfctr $rT", SprMFSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Defs = [CTR], Pattern = [(PPCmtctr GPRC:$rS)] in {
+def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins GPRC:$rS),
+ "mtctr $rS", SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Defs = [LR] in {
+def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins GPRC:$rS),
+ "mtlr $rS", SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR] in {
+def MFLR : XFXForm_1_ext<31, 339, 8, (outs GPRC:$rT), (ins),
+ "mflr $rT", SprMFSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// Move to/from VRSAVE: despite being an SPR, the VRSAVE register is renamed like
+// a GPR on the PPC970. As such, copies in and out have the same performance
+// characteristics as an OR instruction.
+def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins GPRC:$rS),
+ "mtspr 256, $rS", IntGeneral>,
+ PPC970_DGroup_Single, PPC970_Unit_FXU;
+def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs GPRC:$rT), (ins),
+ "mfspr $rT, 256", IntGeneral>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+
+def MTCRF : XFXForm_5<31, 144, (outs), (ins crbitm:$FXM, GPRC:$rS),
+ "mtcrf $FXM, $rS", BrMCRX>,
+ PPC970_MicroCode, PPC970_Unit_CRU;
+// FIXME: this Uses all the CR registers. Marking it as such is
+// necessary for DeadMachineInstructionElim to do the right thing.
+// However, marking it also exposes PR 2964, and causes crashes in
+// the Local RA because it doesn't like this sequence:
+// vreg = MCRF CR0
+// MFCR <kill of whatever preg got assigned to vreg>
+// For now DeadMachineInstructionElim is turned off, so don't do the marking.
+def MFCR : XFXForm_3<31, 19, (outs GPRC:$rT), (ins), "mfcr $rT", SprMFCR>,
+ PPC970_MicroCode, PPC970_Unit_CRU;
+def MFOCRF: XFXForm_5a<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM),
+ "mfcr $rT, $FXM", SprMFCR>,
+ PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Instructions to manipulate FPSCR. Only long double handling uses these.
+// FPSCR is not modelled; we use the SDNode Flag to keep things in order.
+
+let Uses = [RM], Defs = [RM] in {
+ def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
+ "mtfsb0 $FM", IntMTFSB0,
+ [(PPCmtfsb0 (i32 imm:$FM))]>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+ def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
+ "mtfsb1 $FM", IntMTFSB0,
+ [(PPCmtfsb1 (i32 imm:$FM))]>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+ // MTFSF does not actually produce an FP result. We pretend it copies
+ // input reg B to the output. If we didn't do this it would look like the
+ // instruction had no outputs (because we aren't modelling the FPSCR) and
+ // it would be deleted.
+ def MTFSF : XFLForm<63, 711, (outs F8RC:$FRA),
+ (ins i32imm:$FM, F8RC:$rT, F8RC:$FRB),
+ "mtfsf $FM, $rT", "$FRB = $FRA", IntMTFSB0,
+ [(set F8RC:$FRA, (PPCmtfsf (i32 imm:$FM),
+ F8RC:$rT, F8RC:$FRB))]>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+let Uses = [RM] in {
+ def MFFS : XForm_42<63, 583, (outs F8RC:$rT), (ins),
+ "mffs $rT", IntMFFS,
+ [(set F8RC:$rT, (PPCmffs))]>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+ def FADDrtz: AForm_2<63, 21,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+ "fadd $FRT, $FRA, $FRB", FPGeneral,
+ [(set F8RC:$FRT, (PPCfaddrtz F8RC:$FRA, F8RC:$FRB))]>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+
+
+let PPC970_Unit = 1 in { // FXU Operations.
+
+// XO-Form instructions. Arithmetic instructions that can set the overflow bit.
+//
+def ADD4 : XOForm_1<31, 266, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "add $rT, $rA, $rB", IntGeneral,
+ [(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>;
+def ADDC : XOForm_1<31, 10, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "addc $rT, $rA, $rB", IntGeneral,
+ [(set GPRC:$rT, (addc GPRC:$rA, GPRC:$rB))]>,
+ PPC970_DGroup_Cracked;
+def ADDE : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "adde $rT, $rA, $rB", IntGeneral,
+ [(set GPRC:$rT, (adde GPRC:$rA, GPRC:$rB))]>;
+def DIVW : XOForm_1<31, 491, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "divw $rT, $rA, $rB", IntDivW,
+ [(set GPRC:$rT, (sdiv GPRC:$rA, GPRC:$rB))]>,
+ PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def DIVWU : XOForm_1<31, 459, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "divwu $rT, $rA, $rB", IntDivW,
+ [(set GPRC:$rT, (udiv GPRC:$rA, GPRC:$rB))]>,
+ PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def MULHW : XOForm_1<31, 75, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "mulhw $rT, $rA, $rB", IntMulHW,
+ [(set GPRC:$rT, (mulhs GPRC:$rA, GPRC:$rB))]>;
+def MULHWU : XOForm_1<31, 11, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "mulhwu $rT, $rA, $rB", IntMulHWU,
+ [(set GPRC:$rT, (mulhu GPRC:$rA, GPRC:$rB))]>;
+def MULLW : XOForm_1<31, 235, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "mullw $rT, $rA, $rB", IntMulHW,
+ [(set GPRC:$rT, (mul GPRC:$rA, GPRC:$rB))]>;
+def SUBF : XOForm_1<31, 40, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "subf $rT, $rA, $rB", IntGeneral,
+ [(set GPRC:$rT, (sub GPRC:$rB, GPRC:$rA))]>;
+def SUBFC : XOForm_1<31, 8, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "subfc $rT, $rA, $rB", IntGeneral,
+ [(set GPRC:$rT, (subc GPRC:$rB, GPRC:$rA))]>,
+ PPC970_DGroup_Cracked;
+def SUBFE : XOForm_1<31, 136, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "subfe $rT, $rA, $rB", IntGeneral,
+ [(set GPRC:$rT, (sube GPRC:$rB, GPRC:$rA))]>;
+def ADDME : XOForm_3<31, 234, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+ "addme $rT, $rA", IntGeneral,
+ [(set GPRC:$rT, (adde GPRC:$rA, immAllOnes))]>;
+def ADDZE : XOForm_3<31, 202, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+ "addze $rT, $rA", IntGeneral,
+ [(set GPRC:$rT, (adde GPRC:$rA, 0))]>;
+def NEG : XOForm_3<31, 104, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+ "neg $rT, $rA", IntGeneral,
+ [(set GPRC:$rT, (ineg GPRC:$rA))]>;
+def SUBFME : XOForm_3<31, 232, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+ "subfme $rT, $rA", IntGeneral,
+ [(set GPRC:$rT, (sube immAllOnes, GPRC:$rA))]>;
+def SUBFZE : XOForm_3<31, 200, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+ "subfze $rT, $rA", IntGeneral,
+ [(set GPRC:$rT, (sube 0, GPRC:$rA))]>;
+}
+
+// A-Form instructions. Most of the instructions executed in the FPU are of
+// this type.
+//
+let PPC970_Unit = 3 in { // FPU Operations.
+let Uses = [RM] in {
+ def FMADD : AForm_1<63, 29,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+ "fmadd $FRT, $FRA, $FRC, $FRB", FPFused,
+ [(set F8RC:$FRT, (fadd (fmul F8RC:$FRA, F8RC:$FRC),
+ F8RC:$FRB))]>,
+ Requires<[FPContractions]>;
+ def FMADDS : AForm_1<59, 29,
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+ "fmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
+ [(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC),
+ F4RC:$FRB))]>,
+ Requires<[FPContractions]>;
+ def FMSUB : AForm_1<63, 28,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+ "fmsub $FRT, $FRA, $FRC, $FRB", FPFused,
+ [(set F8RC:$FRT, (fsub (fmul F8RC:$FRA, F8RC:$FRC),
+ F8RC:$FRB))]>,
+ Requires<[FPContractions]>;
+ def FMSUBS : AForm_1<59, 28,
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+ "fmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
+ [(set F4RC:$FRT, (fsub (fmul F4RC:$FRA, F4RC:$FRC),
+ F4RC:$FRB))]>,
+ Requires<[FPContractions]>;
+ def FNMADD : AForm_1<63, 31,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+ "fnmadd $FRT, $FRA, $FRC, $FRB", FPFused,
+ [(set F8RC:$FRT, (fneg (fadd (fmul F8RC:$FRA, F8RC:$FRC),
+ F8RC:$FRB)))]>,
+ Requires<[FPContractions]>;
+ def FNMADDS : AForm_1<59, 31,
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+ "fnmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
+ [(set F4RC:$FRT, (fneg (fadd (fmul F4RC:$FRA, F4RC:$FRC),
+ F4RC:$FRB)))]>,
+ Requires<[FPContractions]>;
+ def FNMSUB : AForm_1<63, 30,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+ "fnmsub $FRT, $FRA, $FRC, $FRB", FPFused,
+ [(set F8RC:$FRT, (fneg (fsub (fmul F8RC:$FRA, F8RC:$FRC),
+ F8RC:$FRB)))]>,
+ Requires<[FPContractions]>;
+ def FNMSUBS : AForm_1<59, 30,
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+ "fnmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
+ [(set F4RC:$FRT, (fneg (fsub (fmul F4RC:$FRA, F4RC:$FRC),
+ F4RC:$FRB)))]>,
+ Requires<[FPContractions]>;
+}
+// FSEL is artificially split into 4- and 8-byte forms for the result.  To
+// avoid needing four variants, the comparison is forced to always be an
+// 8-byte double (code should use an FMRSD if the input comparison value
+// really wants to be a float); only the result and operand types get
+// 4/8-byte forms.
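+// (For reference: fsel computes FRT = (FRA >= 0.0) ? FRC : FRB, so only the
+// width of FRA matters for the comparison.)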
+def FSELD : AForm_1<63, 23,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+ "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral,
+ [(set F8RC:$FRT, (PPCfsel F8RC:$FRA,F8RC:$FRC,F8RC:$FRB))]>;
+def FSELS : AForm_1<63, 23,
+ (outs F4RC:$FRT), (ins F8RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+ "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral,
+ [(set F4RC:$FRT, (PPCfsel F8RC:$FRA,F4RC:$FRC,F4RC:$FRB))]>;
+let Uses = [RM] in {
+ def FADD : AForm_2<63, 21,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+ "fadd $FRT, $FRA, $FRB", FPGeneral,
+ [(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>;
+ def FADDS : AForm_2<59, 21,
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+ "fadds $FRT, $FRA, $FRB", FPGeneral,
+ [(set F4RC:$FRT, (fadd F4RC:$FRA, F4RC:$FRB))]>;
+ def FDIV : AForm_2<63, 18,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+ "fdiv $FRT, $FRA, $FRB", FPDivD,
+ [(set F8RC:$FRT, (fdiv F8RC:$FRA, F8RC:$FRB))]>;
+ def FDIVS : AForm_2<59, 18,
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+ "fdivs $FRT, $FRA, $FRB", FPDivS,
+ [(set F4RC:$FRT, (fdiv F4RC:$FRA, F4RC:$FRB))]>;
+ def FMUL : AForm_3<63, 25,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+ "fmul $FRT, $FRA, $FRB", FPFused,
+ [(set F8RC:$FRT, (fmul F8RC:$FRA, F8RC:$FRB))]>;
+ def FMULS : AForm_3<59, 25,
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+ "fmuls $FRT, $FRA, $FRB", FPGeneral,
+ [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>;
+ def FSUB : AForm_2<63, 20,
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+ "fsub $FRT, $FRA, $FRB", FPGeneral,
+ [(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>;
+ def FSUBS : AForm_2<59, 20,
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+ "fsubs $FRT, $FRA, $FRB", FPGeneral,
+ [(set F4RC:$FRT, (fsub F4RC:$FRA, F4RC:$FRB))]>;
+ }
+}
+
+let PPC970_Unit = 1 in { // FXU Operations.
+// M-Form instructions. Rotate and mask instructions.
+//
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+def RLWIMI : MForm_2<20,
+ (outs GPRC:$rA), (ins GPRC:$rSi, GPRC:$rS, u5imm:$SH, u5imm:$MB,
+ u5imm:$ME), "rlwimi $rA, $rS, $SH, $MB, $ME", IntRotate,
+ []>, PPC970_DGroup_Cracked, RegConstraint<"$rSi = $rA">,
+ NoEncode<"$rSi">;
+}
+def RLWINM : MForm_2<21,
+ (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+ "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
+ []>;
+def RLWINMo : MForm_2<21,
+ (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+ "rlwinm. $rA, $rS, $SH, $MB, $ME", IntGeneral,
+ []>, isDOT, PPC970_DGroup_Cracked;
+def RLWNM : MForm_2<23,
+ (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB, u5imm:$MB, u5imm:$ME),
+ "rlwnm $rA, $rS, $rB, $MB, $ME", IntGeneral,
+ []>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DWARF Pseudo Instructions
+//
+
+def DWARF_LOC : Pseudo<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file),
+ "${:comment} .loc $file, $line, $col",
+ [(dwarf_loc (i32 imm:$line), (i32 imm:$col),
+ (i32 imm:$file))]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Patterns
+//
+
+// Arbitrary immediate support. Implement in terms of LIS/ORI.
+def : Pat<(i32 imm:$imm),
+ (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
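+// For example (illustrative, destination register arbitrary), loading
+// 0x12345678 expands to:
+//   lis r3, 0x1234        ; r3 = 0x12340000
+//   ori r3, r3, 0x5678    ; r3 = 0x12345678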
+
+// Implement the 'not' operation with the NOR instruction.
+def NOT : Pat<(not GPRC:$in),
+ (NOR GPRC:$in, GPRC:$in)>;
+
+// ADD an arbitrary immediate.
+def : Pat<(add GPRC:$in, imm:$imm),
+ (ADDIS (ADDI GPRC:$in, (LO16 imm:$imm)), (HA16 imm:$imm))>;
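+// (HA16 is the "high-adjusted" half: ADDI sign-extends its immediate, so for
+// an immediate such as 0x12348000 this emits addi with -0x8000 followed by
+// addis with 0x1235; i.e. HA16(x) = (x + 0x8000) >> 16.)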
+// OR an arbitrary immediate.
+def : Pat<(or GPRC:$in, imm:$imm),
+ (ORIS (ORI GPRC:$in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// XOR an arbitrary immediate.
+def : Pat<(xor GPRC:$in, imm:$imm),
+ (XORIS (XORI GPRC:$in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// SUBFIC
+def : Pat<(sub immSExt16:$imm, GPRC:$in),
+ (SUBFIC GPRC:$in, imm:$imm)>;
+
+// SHL/SRL
+def : Pat<(shl GPRC:$in, (i32 imm:$imm)),
+ (RLWINM GPRC:$in, imm:$imm, 0, (SHL32 imm:$imm))>;
+def : Pat<(srl GPRC:$in, (i32 imm:$imm)),
+ (RLWINM GPRC:$in, (SRL32 imm:$imm), imm:$imm, 31)>;
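+// For example, a left shift by 4 becomes rlwinm $rA, $rS, 4, 0, 27 (i.e.
+// slwi), and a logical right shift by 4 becomes rlwinm $rA, $rS, 28, 4, 31
+// (i.e. srwi).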
+
+// ROTL
+def : Pat<(rotl GPRC:$in, GPRC:$sh),
+ (RLWNM GPRC:$in, GPRC:$sh, 0, 31)>;
+def : Pat<(rotl GPRC:$in, (i32 imm:$imm)),
+ (RLWINM GPRC:$in, imm:$imm, 0, 31)>;
+
+// RLWNM
+def : Pat<(and (rotl GPRC:$in, GPRC:$sh), maskimm32:$imm),
+ (RLWNM GPRC:$in, GPRC:$sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;
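+// e.g. (illustrative mask) ((x rotl sh) & 0x0FFFFFF0) becomes
+//   rlwnm $rA, $rS, $rB, 4, 27
+// where the MB and ME helpers extract the first and last set bit of the
+// contiguous mask, numbered from the MSB.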
+
+// Calls
+def : Pat<(PPCcall_Macho (i32 tglobaladdr:$dst)),
+ (BL_Macho tglobaladdr:$dst)>;
+def : Pat<(PPCcall_Macho (i32 texternalsym:$dst)),
+ (BL_Macho texternalsym:$dst)>;
+def : Pat<(PPCcall_ELF (i32 tglobaladdr:$dst)),
+ (BL_ELF tglobaladdr:$dst)>;
+def : Pat<(PPCcall_ELF (i32 texternalsym:$dst)),
+ (BL_ELF texternalsym:$dst)>;
+
+
+def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm),
+ (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
+ (TCRETURNdi texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
+ (TCRETURNri CTRRC:$dst, imm:$imm)>;
+
+
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
+def : Pat<(add GPRC:$in, (PPChi tglobaladdr:$g, 0)),
+ (ADDIS GPRC:$in, tglobaladdr:$g)>;
+def : Pat<(add GPRC:$in, (PPChi tconstpool:$g, 0)),
+ (ADDIS GPRC:$in, tconstpool:$g)>;
+def : Pat<(add GPRC:$in, (PPChi tjumptable:$g, 0)),
+ (ADDIS GPRC:$in, tjumptable:$g)>;
+
+// Fused negative multiply subtract, alternate pattern
+def : Pat<(fsub F8RC:$B, (fmul F8RC:$A, F8RC:$C)),
+ (FNMSUB F8RC:$A, F8RC:$C, F8RC:$B)>,
+ Requires<[FPContractions]>;
+def : Pat<(fsub F4RC:$B, (fmul F4RC:$A, F4RC:$C)),
+ (FNMSUBS F4RC:$A, F4RC:$C, F4RC:$B)>,
+ Requires<[FPContractions]>;
+
+// Standard shifts. These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
+// amounts.
+def : Pat<(sra GPRC:$rS, GPRC:$rB),
+ (SRAW GPRC:$rS, GPRC:$rB)>;
+def : Pat<(srl GPRC:$rS, GPRC:$rB),
+ (SRW GPRC:$rS, GPRC:$rB)>;
+def : Pat<(shl GPRC:$rS, GPRC:$rB),
+ (SLW GPRC:$rS, GPRC:$rB)>;
+
+def : Pat<(zextloadi1 iaddr:$src),
+ (LBZ iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+ (LBZX xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+ (LBZ iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+ (LBZX xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+ (LBZ iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+ (LBZX xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+ (LHZ iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+ (LHZX xaddr:$src)>;
+def : Pat<(extloadf32 iaddr:$src),
+ (FMRSD (LFS iaddr:$src))>;
+def : Pat<(extloadf32 xaddr:$src),
+ (FMRSD (LFSX xaddr:$src))>;
+
+// Memory barriers
+def : Pat<(membarrier (i32 imm:$ll),
+ (i32 imm:$ls),
+ (i32 imm:$sl),
+ (i32 imm:$ss),
+ (i32 imm:$device)),
+ (SYNC)>;
+
+include "PPCInstrAltivec.td"
+include "PPCInstr64Bit.td"
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
new file mode 100644
index 0000000..035647e
--- /dev/null
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -0,0 +1,437 @@
+//===-- PPCJITInfo.cpp - Implement the JIT interfaces for the PowerPC -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the 32-bit PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "PPCJITInfo.h"
+#include "PPCRelocations.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/System/Memory.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+#define BUILD_ADDIS(RD,RS,IMM16) \
+ ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_ORI(RD,RS,UIMM16) \
+ ((24 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_ORIS(RD,RS,UIMM16) \
+ ((25 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_RLDICR(RD,RS,SH,ME) \
+ ((30 << 26) | ((RS) << 21) | ((RD) << 16) | (((SH) & 31) << 11) | \
+ (((ME) & 63) << 6) | (1 << 2) | ((((SH) >> 5) & 1) << 1))
+#define BUILD_MTSPR(RS,SPR) \
+ ((31 << 26) | ((RS) << 21) | ((SPR) << 16) | (467 << 1))
+#define BUILD_BCCTRx(BO,BI,LINK) \
+ ((19 << 26) | ((BO) << 21) | ((BI) << 16) | (528 << 1) | ((LINK) & 1))
+#define BUILD_B(TARGET, LINK) \
+ ((18 << 26) | (((TARGET) & 0x00FFFFFF) << 2) | ((LINK) & 1))
+
+// Pseudo-ops
+#define BUILD_LIS(RD,IMM16) BUILD_ADDIS(RD,0,IMM16)
+#define BUILD_SLDI(RD,RS,IMM6) BUILD_RLDICR(RD,RS,IMM6,63-IMM6)
+#define BUILD_MTCTR(RS) BUILD_MTSPR(RS,9)
+#define BUILD_BCTR(LINK) BUILD_BCCTRx(20,0,LINK)
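+
+// Illustrative sanity check (values computed from the macros above): loading
+// the 32-bit address 0xDEADBEEF into r12 and branching through CTR emits
+//   BUILD_LIS(12, 0xDEAD)     == 0x3D80DEAD   // lis   r12, 0xdead
+//   BUILD_ORI(12, 12, 0xBEEF) == 0x618CBEEF   // ori   r12, r12, 0xbeef
+//   BUILD_MTCTR(12)           == 0x7D8903A6   // mtctr r12
+//   BUILD_BCTR(0)             == 0x4E800420   // bctr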
+
+static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){
+ intptr_t Offset = ((intptr_t)To - (intptr_t)At) >> 2;
+ unsigned *AtI = (unsigned*)(intptr_t)At;
+
+ if (Offset >= -(1 << 23) && Offset < (1 << 23)) { // In range?
+ AtI[0] = BUILD_B(Offset, isCall); // b/bl target
+ } else if (!is64Bit) {
+ AtI[0] = BUILD_LIS(12, To >> 16); // lis r12, hi16(address)
+ AtI[1] = BUILD_ORI(12, 12, To); // ori r12, r12, lo16(address)
+ AtI[2] = BUILD_MTCTR(12); // mtctr r12
+ AtI[3] = BUILD_BCTR(isCall); // bctr/bctrl
+ } else {
+ AtI[0] = BUILD_LIS(12, To >> 48); // lis r12, hi16(address)
+ AtI[1] = BUILD_ORI(12, 12, To >> 32); // ori r12, r12, lo16(address)
+ AtI[2] = BUILD_SLDI(12, 12, 32); // sldi r12, r12, 32
+ AtI[3] = BUILD_ORIS(12, 12, To >> 16); // oris r12, r12, hi16(address)
+ AtI[4] = BUILD_ORI(12, 12, To); // ori r12, r12, lo16(address)
+ AtI[5] = BUILD_MTCTR(12); // mtctr r12
+ AtI[6] = BUILD_BCTR(isCall); // bctr/bctrl
+ }
+}
+
+extern "C" void PPC32CompilationCallback();
+extern "C" void PPC64CompilationCallback();
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+ !(defined(__ppc64__) || defined(__FreeBSD__))
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+ ".text\n"
+ ".align 2\n"
+ ".globl _PPC32CompilationCallback\n"
+"_PPC32CompilationCallback:\n"
+ // Make space for 8 ints r[3-10] and 13 doubles f[1-13], plus the linkage
+ // and parameter areas.
+ // FIXME: need to save v[0-19] for altivec?
+ // FIXME: could shrink frame
+ // Set up a proper stack frame
+ // FIXME Layout
+ // PowerPC64 ABI linkage - 24 bytes
+ // parameters - 32 bytes
+ // 13 double registers - 104 bytes
+ // 8 int registers - 32 bytes
+ "mflr r0\n"
+ "stw r0, 8(r1)\n"
+ "stwu r1, -208(r1)\n"
+ // Save all int arg registers
+ "stw r10, 204(r1)\n" "stw r9, 200(r1)\n"
+ "stw r8, 196(r1)\n" "stw r7, 192(r1)\n"
+ "stw r6, 188(r1)\n" "stw r5, 184(r1)\n"
+ "stw r4, 180(r1)\n" "stw r3, 176(r1)\n"
+ // Save all call-clobbered FP regs.
+ "stfd f13, 168(r1)\n" "stfd f12, 160(r1)\n"
+ "stfd f11, 152(r1)\n" "stfd f10, 144(r1)\n"
+ "stfd f9, 136(r1)\n" "stfd f8, 128(r1)\n"
+ "stfd f7, 120(r1)\n" "stfd f6, 112(r1)\n"
+ "stfd f5, 104(r1)\n" "stfd f4, 96(r1)\n"
+ "stfd f3, 88(r1)\n" "stfd f2, 80(r1)\n"
+ "stfd f1, 72(r1)\n"
+ // Arguments to Compilation Callback:
+ // r3 - our lr (address of the call instruction in stub plus 4)
+ // r4 - stub's lr (address of instruction that called the stub plus 4)
+ // r5 - is64Bit - always 0.
+ "mr r3, r0\n"
+ "lwz r2, 208(r1)\n" // stub's frame
+ "lwz r4, 8(r2)\n" // stub's lr
+ "li r5, 0\n" // 0 == 32 bit
+ "bl _PPCCompilationCallbackC\n"
+ "mtctr r3\n"
+ // Restore all int arg registers
+ "lwz r10, 204(r1)\n" "lwz r9, 200(r1)\n"
+ "lwz r8, 196(r1)\n" "lwz r7, 192(r1)\n"
+ "lwz r6, 188(r1)\n" "lwz r5, 184(r1)\n"
+ "lwz r4, 180(r1)\n" "lwz r3, 176(r1)\n"
+ // Restore all FP arg registers
+ "lfd f13, 168(r1)\n" "lfd f12, 160(r1)\n"
+ "lfd f11, 152(r1)\n" "lfd f10, 144(r1)\n"
+ "lfd f9, 136(r1)\n" "lfd f8, 128(r1)\n"
+ "lfd f7, 120(r1)\n" "lfd f6, 112(r1)\n"
+ "lfd f5, 104(r1)\n" "lfd f4, 96(r1)\n"
+ "lfd f3, 88(r1)\n" "lfd f2, 80(r1)\n"
+ "lfd f1, 72(r1)\n"
+ // Pop 3 frames off the stack and branch to target
+ "lwz r1, 208(r1)\n"
+ "lwz r2, 8(r1)\n"
+ "mtlr r2\n"
+ "bctr\n"
+ );
+
+#elif defined(__PPC__) && !defined(__ppc64__)
+// Linux & FreeBSD / PPC 32 support
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+ ".text\n"
+ ".align 2\n"
+ ".globl PPC32CompilationCallback\n"
+"PPC32CompilationCallback:\n"
+ // Make space for 8 ints r[3-10] and 8 doubles f[1-8], plus the stack frame
+ // linkage.
+ // FIXME: need to save v[0-19] for altivec?
+ // FIXME: could shrink frame
+ // Set up a proper stack frame
+ // FIXME Layout
+ // 8 double registers - 64 bytes
+ // 8 int registers - 32 bytes
+ "mflr 0\n"
+ "stw 0, 4(1)\n"
+ "stwu 1, -104(1)\n"
+ // Save all int arg registers
+ "stw 10, 100(1)\n" "stw 9, 96(1)\n"
+ "stw 8, 92(1)\n" "stw 7, 88(1)\n"
+ "stw 6, 84(1)\n" "stw 5, 80(1)\n"
+ "stw 4, 76(1)\n" "stw 3, 72(1)\n"
+ // Save all call-clobbered FP regs.
+ "stfd 8, 64(1)\n"
+ "stfd 7, 56(1)\n" "stfd 6, 48(1)\n"
+ "stfd 5, 40(1)\n" "stfd 4, 32(1)\n"
+ "stfd 3, 24(1)\n" "stfd 2, 16(1)\n"
+ "stfd 1, 8(1)\n"
+ // Arguments to Compilation Callback:
+ // r3 - our lr (address of the call instruction in stub plus 4)
+ // r4 - stub's lr (address of instruction that called the stub plus 4)
+ // r5 - is64Bit - always 0.
+ "mr 3, 0\n"
+ "lwz 5, 104(1)\n" // stub's frame
+ "lwz 4, 4(5)\n" // stub's lr
+ "li 5, 0\n" // 0 == 32 bit
+ "bl PPCCompilationCallbackC\n"
+ "mtctr 3\n"
+ // Restore all int arg registers
+ "lwz 10, 100(1)\n" "lwz 9, 96(1)\n"
+ "lwz 8, 92(1)\n" "lwz 7, 88(1)\n"
+ "lwz 6, 84(1)\n" "lwz 5, 80(1)\n"
+ "lwz 4, 76(1)\n" "lwz 3, 72(1)\n"
+ // Restore all FP arg registers
+ "lfd 8, 64(1)\n"
+ "lfd 7, 56(1)\n" "lfd 6, 48(1)\n"
+ "lfd 5, 40(1)\n" "lfd 4, 32(1)\n"
+ "lfd 3, 24(1)\n" "lfd 2, 16(1)\n"
+ "lfd 1, 8(1)\n"
+ // Pop 3 frames off the stack and branch to target
+ "lwz 1, 104(1)\n"
+ "lwz 0, 4(1)\n"
+ "mtlr 0\n"
+ "bctr\n"
+ );
+#else
+void PPC32CompilationCallback() {
+ assert(0 && "This is not a power pc, you can't execute this!");
+ abort();
+}
+#endif
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+ defined(__ppc64__)
+asm(
+ ".text\n"
+ ".align 2\n"
+ ".globl _PPC64CompilationCallback\n"
+"_PPC64CompilationCallback:\n"
+ // Make space for 8 ints r[3-10] and 13 doubles f[1-13], plus the linkage
+ // and parameter areas.
+ // FIXME: need to save v[0-19] for altivec?
+ // Set up a proper stack frame
+ // Layout
+ // PowerPC64 ABI linkage - 48 bytes
+ // parameters - 64 bytes
+ // 13 double registers - 104 bytes
+ // 8 int registers - 64 bytes
+ "mflr r0\n"
+ "std r0, 16(r1)\n"
+ "stdu r1, -280(r1)\n"
+ // Save all int arg registers
+ "std r10, 272(r1)\n" "std r9, 264(r1)\n"
+ "std r8, 256(r1)\n" "std r7, 248(r1)\n"
+ "std r6, 240(r1)\n" "std r5, 232(r1)\n"
+ "std r4, 224(r1)\n" "std r3, 216(r1)\n"
+ // Save all call-clobbered FP regs.
+ "stfd f13, 208(r1)\n" "stfd f12, 200(r1)\n"
+ "stfd f11, 192(r1)\n" "stfd f10, 184(r1)\n"
+ "stfd f9, 176(r1)\n" "stfd f8, 168(r1)\n"
+ "stfd f7, 160(r1)\n" "stfd f6, 152(r1)\n"
+ "stfd f5, 144(r1)\n" "stfd f4, 136(r1)\n"
+ "stfd f3, 128(r1)\n" "stfd f2, 120(r1)\n"
+ "stfd f1, 112(r1)\n"
+ // Arguments to Compilation Callback:
+ // r3 - our lr (address of the call instruction in stub plus 4)
+ // r4 - stub's lr (address of instruction that called the stub plus 4)
+ // r5 - is64Bit - always 1.
+ "mr r3, r0\n"
+ "ld r2, 280(r1)\n" // stub's frame
+ "ld r4, 16(r2)\n" // stub's lr
+ "li r5, 1\n" // 1 == 64 bit
+ "bl _PPCCompilationCallbackC\n"
+ "mtctr r3\n"
+ // Restore all int arg registers
+ "ld r10, 272(r1)\n" "ld r9, 264(r1)\n"
+ "ld r8, 256(r1)\n" "ld r7, 248(r1)\n"
+ "ld r6, 240(r1)\n" "ld r5, 232(r1)\n"
+ "ld r4, 224(r1)\n" "ld r3, 216(r1)\n"
+ // Restore all FP arg registers
+ "lfd f13, 208(r1)\n" "lfd f12, 200(r1)\n"
+ "lfd f11, 192(r1)\n" "lfd f10, 184(r1)\n"
+ "lfd f9, 176(r1)\n" "lfd f8, 168(r1)\n"
+ "lfd f7, 160(r1)\n" "lfd f6, 152(r1)\n"
+ "lfd f5, 144(r1)\n" "lfd f4, 136(r1)\n"
+ "lfd f3, 128(r1)\n" "lfd f2, 120(r1)\n"
+ "lfd f1, 112(r1)\n"
+ // Pop 3 frames off the stack and branch to target
+ "ld r1, 280(r1)\n"
+ "ld r2, 16(r1)\n"
+ "mtlr r2\n"
+ "bctr\n"
+ );
+#else
+void PPC64CompilationCallback() {
+ assert(0 && "This is not a power pc, you can't execute this!");
+ abort();
+}
+#endif
+
+extern "C" void *PPCCompilationCallbackC(unsigned *StubCallAddrPlus4,
+ unsigned *OrigCallAddrPlus4,
+ bool is64Bit) {
+ // Adjust the pointer to the address of the call instruction in the stub
+ // emitted by emitFunctionStub, rather than the instruction after it.
+ unsigned *StubCallAddr = StubCallAddrPlus4 - 1;
+ unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1;
+
+ void *Target = JITCompilerFunction(StubCallAddr);
+
+ // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite
+ // it to branch directly to the destination. If so, rewrite it so it does not
+ // need to go through the stub anymore.
+ unsigned OrigCallInst = *OrigCallAddr;
+ if ((OrigCallInst >> 26) == 18) { // Direct call.
+ intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2;
+
+ if (Offset >= -(1 << 23) && Offset < (1 << 23)) { // In range?
+ // Clear the original target out.
+ OrigCallInst &= (63 << 26) | 3;
+ // Fill in the new target.
+ OrigCallInst |= (Offset & ((1 << 24)-1)) << 2;
+ // Replace the call.
+ *OrigCallAddr = OrigCallInst;
+ }
+ }
+
+ // Assert that we are coming from a stub that was created with our
+ // emitFunctionStub.
+ if ((*StubCallAddr >> 26) == 18)
+ StubCallAddr -= 3;
+ else {
+ assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!");
+ StubCallAddr -= is64Bit ? 9 : 6;
+ }
+
+ // Rewrite the stub with an unconditional branch to the target, for any users
+ // who took the address of the stub.
+ EmitBranchToAt((intptr_t)StubCallAddr, (intptr_t)Target, false, is64Bit);
+
+ // Put the address of the target function to call and the address to return to
+ // after calling the target function in a place that is easy to get on the
+ // stack after we restore all regs.
+ return Target;
+}
+
+
+
+TargetJITInfo::LazyResolverFn
+PPCJITInfo::getLazyResolverFunction(JITCompilerFn Fn) {
+ JITCompilerFunction = Fn;
+ return is64Bit ? PPC64CompilationCallback : PPC32CompilationCallback;
+}
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+defined(__APPLE__)
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+#endif
+
+void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE) {
+ // If this is just a call to an external function, emit a branch instead of a
+ // call. The code is the same except for one bit of the last instruction.
+ if (Fn != (void*)(intptr_t)PPC32CompilationCallback &&
+ Fn != (void*)(intptr_t)PPC64CompilationCallback) {
+ JCE.startGVStub(F, 7*4);
+ intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ EmitBranchToAt(Addr, (intptr_t)Fn, false, is64Bit);
+ sys::Memory::InvalidateInstructionCache((void*)Addr, 7*4);
+ return JCE.finishGVStub(F);
+ }
+
+ JCE.startGVStub(F, 10*4);
+ intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (is64Bit) {
+ JCE.emitWordBE(0xf821ffb1); // stdu r1,-80(r1)
+ JCE.emitWordBE(0x7d6802a6); // mflr r11
+ JCE.emitWordBE(0xf9610060); // std r11, 96(r1)
+ } else if (TM.getSubtargetImpl()->isMachoABI()){
+ JCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1)
+ JCE.emitWordBE(0x7d6802a6); // mflr r11
+ JCE.emitWordBE(0x91610028); // stw r11, 40(r1)
+ } else {
+ JCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1)
+ JCE.emitWordBE(0x7d6802a6); // mflr r11
+ JCE.emitWordBE(0x91610024); // stw r11, 36(r1)
+ }
+ intptr_t BranchAddr = (intptr_t)JCE.getCurrentPCValue();
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ JCE.emitWordBE(0);
+ EmitBranchToAt(BranchAddr, (intptr_t)Fn, true, is64Bit);
+ sys::Memory::InvalidateInstructionCache((void*)Addr, 10*4);
+ return JCE.finishGVStub(F);
+}
+
+
+void PPCJITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
+ intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+ switch ((PPC::RelocationType)MR->getRelocationType()) {
+ default: assert(0 && "Unknown relocation type!");
+ case PPC::reloc_pcrel_bx:
+ // PC-relative relocation for b and bl instructions.
+ ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+ assert(ResultPtr >= -(1 << 23) && ResultPtr < (1 << 23) &&
+ "Relocation out of range!");
+ *RelocPos |= (ResultPtr & ((1 << 24)-1)) << 2;
+ break;
+ case PPC::reloc_pcrel_bcx:
+ // PC-relative relocation for BLT,BLE,BEQ,BGE,BGT,BNE, or other
+ // bcx instructions.
+ ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+ assert(ResultPtr >= -(1 << 13) && ResultPtr < (1 << 13) &&
+ "Relocation out of range!");
+ *RelocPos |= (ResultPtr & ((1 << 14)-1)) << 2;
+ break;
+ case PPC::reloc_absolute_high: // high bits of ref -> low 16 of instr
+ case PPC::reloc_absolute_low: { // low bits of ref -> low 16 of instr
+ ResultPtr += MR->getConstantVal();
+
+ // If this is a high-part access, get the high-part.
+ if (MR->getRelocationType() == PPC::reloc_absolute_high) {
+ // If the low part will have a carry (really a borrow) from the low
+ // 16-bits into the high 16, add a bit to borrow from.
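+ // e.g. for ResultPtr == 0x1234ABCD the low half 0xABCD will be
+ // sign-extended negative when added back, so the high half must become
+ // 0x1235; equivalently, ha16(x) == (x + 0x8000) >> 16.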
+ if (((int)ResultPtr << 16) < 0)
+ ResultPtr += 1 << 16;
+ ResultPtr >>= 16;
+ }
+
+ // Do the addition then mask, so the addition does not overflow the 16-bit
+ // immediate section of the instruction.
+ unsigned LowBits = (*RelocPos + ResultPtr) & 65535;
+ unsigned HighBits = *RelocPos & ~65535;
+ *RelocPos = LowBits | HighBits; // Slam into low 16-bits
+ break;
+ }
+ case PPC::reloc_absolute_low_ix: { // low bits of ref -> low 14 of instr
+ ResultPtr += MR->getConstantVal();
+ // Do the addition then mask, so the addition does not overflow the 14-bit
+ // immediate field of the instruction.
+ unsigned LowBits = (*RelocPos + ResultPtr) & 0xFFFC;
+ unsigned HighBits = *RelocPos & 0xFFFF0003;
+ *RelocPos = LowBits | HighBits; // Slam into low 14-bits.
+ break;
+ }
+ }
+ }
+}
+
+void PPCJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ EmitBranchToAt((intptr_t)Old, (intptr_t)New, false, is64Bit);
+}
diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h
new file mode 100644
index 0000000..2e25b29
--- /dev/null
+++ b/lib/Target/PowerPC/PPCJITInfo.h
@@ -0,0 +1,48 @@
+//===- PPCJITInfo.h - PowerPC impl. of the JIT interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_JITINFO_H
+#define POWERPC_JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+
+namespace llvm {
+ class PPCTargetMachine;
+
+ class PPCJITInfo : public TargetJITInfo {
+ protected:
+ PPCTargetMachine &TM;
+ bool is64Bit;
+ public:
+ PPCJITInfo(PPCTargetMachine &tm, bool tmIs64Bit) : TM(tm) {
+ useGOT = 0;
+ is64Bit = tmIs64Bit;
+ }
+
+ virtual void *emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE);
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+ };
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCMachOWriterInfo.cpp b/lib/Target/PowerPC/PPCMachOWriterInfo.cpp
new file mode 100644
index 0000000..3bfa6d7
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachOWriterInfo.cpp
@@ -0,0 +1,151 @@
+//===-- PPCMachOWriterInfo.cpp - Mach-O Writer Info for the PowerPC -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Mach-O writer information for the PowerPC backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMachOWriterInfo.h"
+#include "PPCRelocations.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachORelocation.h"
+#include "llvm/Support/OutputBuffer.h"
+#include <cstdio>
+using namespace llvm;
+
+PPCMachOWriterInfo::PPCMachOWriterInfo(const PPCTargetMachine &TM)
+ : TargetMachOWriterInfo(TM.getTargetData()->getPointerSizeInBits() == 64 ?
+ HDR_CPU_TYPE_POWERPC64 :
+ HDR_CPU_TYPE_POWERPC,
+ HDR_CPU_SUBTYPE_POWERPC_ALL) {}
+PPCMachOWriterInfo::~PPCMachOWriterInfo() {}
+
+/// GetTargetRelocation - For the MachineRelocation MR, convert it to one or
+/// more PowerPC MachORelocation(s), add the new relocations to the
+/// MachOSection, and rewrite the instruction at the section offset if required
+/// by that relocation type.
+unsigned PPCMachOWriterInfo::GetTargetRelocation(MachineRelocation &MR,
+ unsigned FromIdx,
+ unsigned ToAddr,
+ unsigned ToIdx,
+ OutputBuffer &RelocOut,
+ OutputBuffer &SecOut,
+ bool Scattered,
+ bool isExtern) const {
+ unsigned NumRelocs = 0;
+ uint64_t Addr = 0;
+
+ // Get the address of whatever it is we're relocating, if possible.
+ if (!isExtern)
+ Addr = (uintptr_t)MR.getResultPointer() + ToAddr;
+
+ switch ((PPC::RelocationType)MR.getRelocationType()) {
+ default: assert(0 && "Unknown PPC relocation type!");
+ case PPC::reloc_absolute_low_ix:
+ assert(0 && "Unhandled PPC relocation type!");
+ break;
+ case PPC::reloc_vanilla:
+ {
+ // FIXME: need to handle 64 bit vanilla relocs
+ MachORelocation VANILLA(MR.getMachineCodeOffset(), ToIdx,
+ false, 2, isExtern,
+ PPC_RELOC_VANILLA,
+ Scattered, (intptr_t)MR.getResultPointer());
+ ++NumRelocs;
+
+ if (Scattered) {
+ RelocOut.outword(VANILLA.getPackedFields());
+ RelocOut.outword(VANILLA.getAddress());
+ } else {
+ RelocOut.outword(VANILLA.getAddress());
+ RelocOut.outword(VANILLA.getPackedFields());
+ }
+
+ intptr_t SymbolOffset;
+
+ if (Scattered)
+ SymbolOffset = Addr + MR.getConstantVal();
+ else
+ SymbolOffset = Addr;
+
+ printf("vanilla fixup: sec_%x[%x] = %x\n", FromIdx,
+ unsigned(MR.getMachineCodeOffset()),
+ unsigned(SymbolOffset));
+ SecOut.fixword(SymbolOffset, MR.getMachineCodeOffset());
+ }
+ break;
+ case PPC::reloc_pcrel_bx:
+ {
+ // FIXME: Presumably someday we will need to branch to other, non-extern
+ // functions too. Need to figure out some way to distinguish between
+ // target is BB and target is function.
+ if (isExtern) {
+ MachORelocation BR24(MR.getMachineCodeOffset(), ToIdx, true, 2,
+ isExtern, PPC_RELOC_BR24, Scattered,
+ (intptr_t)MR.getMachineCodeOffset());
+ RelocOut.outword(BR24.getAddress());
+ RelocOut.outword(BR24.getPackedFields());
+ ++NumRelocs;
+ }
+
+ Addr -= MR.getMachineCodeOffset();
+ Addr >>= 2;
+ Addr &= 0xFFFFFF;
+ Addr <<= 2;
+ Addr |= (SecOut[MR.getMachineCodeOffset()] << 24);
+ Addr |= (SecOut[MR.getMachineCodeOffset()+3] & 0x3);
+ SecOut.fixword(Addr, MR.getMachineCodeOffset());
+ break;
+ }
+ case PPC::reloc_pcrel_bcx:
+ {
+ Addr -= MR.getMachineCodeOffset();
+ Addr &= 0xFFFC;
+
+ SecOut.fixhalf(Addr, MR.getMachineCodeOffset() + 2);
+ break;
+ }
+ case PPC::reloc_absolute_high:
+ {
+ MachORelocation HA16(MR.getMachineCodeOffset(), ToIdx, false, 2,
+ isExtern, PPC_RELOC_HA16);
+ MachORelocation PAIR(Addr & 0xFFFF, 0xFFFFFF, false, 2, isExtern,
+ PPC_RELOC_PAIR);
+ NumRelocs = 2;
+
+ RelocOut.outword(HA16.getRawAddress());
+ RelocOut.outword(HA16.getPackedFields());
+ RelocOut.outword(PAIR.getRawAddress());
+ RelocOut.outword(PAIR.getPackedFields());
+
+ Addr += 0x8000;
+
+ SecOut.fixhalf(Addr >> 16, MR.getMachineCodeOffset() + 2);
+ break;
+ }
+ case PPC::reloc_absolute_low:
+ {
+ MachORelocation LO16(MR.getMachineCodeOffset(), ToIdx, false, 2,
+ isExtern, PPC_RELOC_LO16);
+ MachORelocation PAIR(Addr >> 16, 0xFFFFFF, false, 2, isExtern,
+ PPC_RELOC_PAIR);
+ NumRelocs = 2;
+
+ RelocOut.outword(LO16.getRawAddress());
+ RelocOut.outword(LO16.getPackedFields());
+ RelocOut.outword(PAIR.getRawAddress());
+ RelocOut.outword(PAIR.getPackedFields());
+
+ SecOut.fixhalf(Addr, MR.getMachineCodeOffset() + 2);
+ break;
+ }
+ }
+
+ return NumRelocs;
+}
diff --git a/lib/Target/PowerPC/PPCMachOWriterInfo.h b/lib/Target/PowerPC/PPCMachOWriterInfo.h
new file mode 100644
index 0000000..d46334d
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachOWriterInfo.h
@@ -0,0 +1,55 @@
+//===-- PPCMachOWriterInfo.h - Mach-O Writer Info for PowerPC ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Mach-O writer information for the PowerPC backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_MACHO_WRITER_INFO_H
+#define PPC_MACHO_WRITER_INFO_H
+
+#include "llvm/Target/TargetMachOWriterInfo.h"
+
+namespace llvm {
+
+ // Forward declarations
+ class MachineRelocation;
+ class OutputBuffer;
+ class PPCTargetMachine;
+
+ class PPCMachOWriterInfo : public TargetMachOWriterInfo {
+ public:
+ PPCMachOWriterInfo(const PPCTargetMachine &TM);
+ virtual ~PPCMachOWriterInfo();
+
+ virtual unsigned GetTargetRelocation(MachineRelocation &MR,
+ unsigned FromIdx,
+ unsigned ToAddr,
+ unsigned ToIdx,
+ OutputBuffer &RelocOut,
+ OutputBuffer &SecOut,
+ bool Scattered, bool Extern) const;
+
+ // Constants for the relocation r_type field.
+ // See <mach-o/ppc/reloc.h>
+ enum {
+ PPC_RELOC_VANILLA, // generic relocation
+ PPC_RELOC_PAIR, // the second relocation entry of a pair
+ PPC_RELOC_BR14, // 14 bit branch displacement to word address
+ PPC_RELOC_BR24, // 24 bit branch displacement to word address
+ PPC_RELOC_HI16, // a PAIR follows with the low 16 bits
+ PPC_RELOC_LO16, // a PAIR follows with the high 16 bits
+ PPC_RELOC_HA16, // a PAIR follows, which is sign extended to 32b
+ PPC_RELOC_LO14 // LO16 with low 2 bits implicitly zero
+ };
+ };
+
+} // end llvm namespace
+
+#endif // PPC_MACHO_WRITER_INFO_H
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
new file mode 100644
index 0000000..42883d7
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -0,0 +1,104 @@
+//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_MACHINE_FUNCTION_INFO_H
+#define PPC_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// PPCFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private PowerPC target-specific information for each
+/// MachineFunction.
+class PPCFunctionInfo : public MachineFunctionInfo {
+private:
+ /// FramePointerSaveIndex - Frame index of where the old frame pointer is
+ /// stored. Also used as an anchor for instructions that need to be altered
+ /// when using frame pointers (dyna_add, dyna_sub.)
+ int FramePointerSaveIndex;
+
+ /// ReturnAddrSaveIndex - Frame index of where the return address is stored.
+ ///
+ int ReturnAddrSaveIndex;
+
+ /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
+ /// function. This is only valid after the initial scan of the function by
+ /// PEI.
+ bool MustSaveLR;
+
+ /// SpillsCR - Indicates whether CR is spilled in the current function.
+ bool SpillsCR;
+
+ /// LRStoreRequired - The bool indicates whether there is some explicit use of
+ /// the LR/LR8 stack slot that is not obvious from scanning the code. This
+ /// requires that the code generator produce a store of LR to the stack on
+ /// entry, even though LR may otherwise apparently not be used.
+ bool LRStoreRequired;
+
+ /// MinReservedArea - This is the frame size that is at least reserved in a
+ /// potential caller (parameter+linkage area).
+ unsigned MinReservedArea;
+
+ /// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
+ /// amount the stack pointer is adjusted to make the frame bigger for tail
+ /// calls. Used for creating an area before the register spill area.
+ int TailCallSPDelta;
+
+ /// HasFastCall - Does this function contain a fast call. Used to determine
+ /// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
+ bool HasFastCall;
+
+public:
+ PPCFunctionInfo(MachineFunction &MF)
+ : FramePointerSaveIndex(0),
+ ReturnAddrSaveIndex(0),
+ MustSaveLR(false), // not valid until PEI's scan, but start defined
+ SpillsCR(false),
+ LRStoreRequired(false),
+ MinReservedArea(0),
+ TailCallSPDelta(0),
+ HasFastCall(false) {}
+
+ int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+ void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+
+ int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; }
+ void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; }
+
+ unsigned getMinReservedArea() const { return MinReservedArea; }
+ void setMinReservedArea(unsigned size) { MinReservedArea = size; }
+
+ int getTailCallSPDelta() const { return TailCallSPDelta; }
+ void setTailCallSPDelta(int size) { TailCallSPDelta = size; }
+
+ /// MustSaveLR - This is set when the prolog/epilog inserter does its initial
+ /// scan of the function. It is true if the LR/LR8 register is ever explicitly
+ /// defined/clobbered in the machine function (e.g. by calls and movpctolr,
+ /// which is used in PIC generation), or if the LR stack slot is explicitly
+ /// referenced by builtin_return_address.
+ void setMustSaveLR(bool U) { MustSaveLR = U; }
+ bool mustSaveLR() const { return MustSaveLR; }
+
+ void setSpillsCR() { SpillsCR = true; }
+ bool isCRSpilled() const { return SpillsCR; }
+
+ void setLRStoreRequired() { LRStoreRequired = true; }
+ bool isLRStoreRequired() const { return LRStoreRequired; }
+
+ void setHasFastCall() { HasFastCall = true; }
+ bool hasFastCall() const { return HasFastCall;}
+};
+
+} // end of namespace llvm
+
+
+#endif
diff --git a/lib/Target/PowerPC/PPCPerfectShuffle.h b/lib/Target/PowerPC/PPCPerfectShuffle.h
new file mode 100644
index 0000000..3164e33
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle without using vperm.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 292 entries have cost 1
+// 1384 entries have cost 2
+// 3061 entries have cost 3
+// 1733 entries have cost 4
+// 60 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
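+//
+// There are 9^4 = 6561 entries: one per shuffle mask <a,b,c,d>, where each
+// element is one of eight lane indices or undef, indexed as
+// a*9*9*9 + b*9*9 + c*9 + d.  The unpacking of each 32-bit entry (cost,
+// opcode, and the two operand IDs) is done by the lookup code in
+// PPCISelLowering.cpp (GeneratePerfectShuffle).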
+static const unsigned PerfectShuffleTable[6561+1] = {
+ 202162278U, // <0,0,0,0>: Cost 1 vspltisw0 LHS
+ 1140850790U, // <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS
+ 2617247181U, // <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0>
+ 2635163787U, // <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0>
+ 1543507254U, // <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS
+ 2281701705U, // <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5>
+ 2617250133U, // <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0>
+ 2659054575U, // <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0>
+ 202162278U, // <0,0,0,u>: Cost 1 vspltisw0 LHS
+ 1141686282U, // <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1>
+ 67944550U, // <0,0,1,1>: Cost 1 vmrghw LHS, LHS
+ 1685241958U, // <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+ 2215870716U, // <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+ 1141727570U, // <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+ 2215428562U, // <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7>
+ 2215428589U, // <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7>
+ 2659062768U, // <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1>
+ 67945117U, // <0,0,1,u>: Cost 1 vmrghw LHS, LHS
+ 2684356045U, // <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0>
+ 2216009830U, // <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+ 2216009901U, // <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2>
+ 2698290853U, // <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0>
+ 3289751890U, // <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5>
+ 3758098275U, // <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1>
+ 2684356538U, // <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7>
+ 3758098410U, // <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1>
+ 2216010397U, // <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS
+ 2702272651U, // <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0>
+ 2216656998U, // <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+ 3844669704U, // <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3>
+ 2216657148U, // <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0>
+ 2684357122U, // <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6>
+ 3732820066U, // <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0>
+ 3778005624U, // <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7>
+ 3374713464U, // <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7>
+ 2216657565U, // <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+ 2217361408U, // <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0>
+ 1143619686U, // <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+ 3291103405U, // <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2>
+ 3827269988U, // <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5>
+ 1143619922U, // <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+ 1610616118U, // <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+ 3758099833U, // <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2>
+ 3854107016U, // <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5>
+ 1143620253U, // <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+ 2284396544U, // <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0>
+ 2218025062U, // <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS
+ 3758100203U, // <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3>
+ 3395966100U, // <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3>
+ 3804549052U, // <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5>
+ 2302314964U, // <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5>
+ 2785821138U, // <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+ 3395966428U, // <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7>
+ 2787148260U, // <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7>
+ 2684358997U, // <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0>
+ 2218631270U, // <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+ 2684359162U, // <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3>
+ 3758101042U, // <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5>
+ 3732843830U, // <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS
+ 3758101227U, // <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1>
+ 2684359480U, // <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6>
+ 2724836173U, // <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0>
+ 2725499806U, // <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0>
+ 2726163439U, // <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+ 2219311206U, // <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS
+ 3868557900U, // <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3>
+ 3377400112U, // <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3>
+ 2684360038U, // <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6>
+ 3732852834U, // <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0>
+ 3871507060U, // <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7>
+ 2303658616U, // <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7>
+ 2726163439U, // <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+ 202162278U, // <0,0,u,0>: Cost 1 vspltisw0 LHS
+ 72589414U, // <0,0,u,1>: Cost 1 vmrghw LHS, LHS
+ 1685242525U, // <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+ 2220073212U, // <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+ 1146331474U, // <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+ 1610619034U, // <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+ 2785821138U, // <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+ 2659120119U, // <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u>
+ 72589981U, // <0,0,u,u>: Cost 1 vmrghw LHS, LHS
+ 2698297344U, // <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0>
+ 1624555622U, // <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+ 2758984428U, // <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1>
+ 2635237524U, // <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0>
+ 2693652818U, // <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5>
+ 2281701714U, // <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5>
+ 2698297846U, // <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7>
+ 2659128312U, // <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0>
+ 1624556189U, // <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+ 1543585802U, // <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1>
+ 1141728052U, // <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+ 1141728150U, // <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+ 2295644334U, // <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3>
+ 1543589174U, // <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS
+ 2290999634U, // <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5>
+ 2617332135U, // <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1>
+ 2617332720U, // <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1>
+ 1142171004U, // <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0>
+ 1561509990U, // <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS
+ 2623308516U, // <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2>
+ 2698298984U, // <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2>
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1561513270U, // <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS
+ 2647199304U, // <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2>
+ 2698299322U, // <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7>
+ 1585402874U, // <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2>
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2698299540U, // <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0>
+ 3290399540U, // <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1>
+ 2698299720U, // <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0>
+ 2698299804U, // <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3>
+ 2698299906U, // <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6>
+ 3832726521U, // <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0>
+ 2724842160U, // <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0>
+ 2706926275U, // <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1>
+ 2698300190U, // <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2>
+ 2635268198U, // <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS
+ 2217362228U, // <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1>
+ 2217362326U, // <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0>
+ 2635270296U, // <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4>
+ 2635271478U, // <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS
+ 1624558902U, // <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+ 2659160910U, // <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1>
+ 2659161084U, // <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4>
+ 1624559145U, // <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+ 3832726639U, // <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1>
+ 2714889871U, // <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1>
+ 2302314646U, // <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+ 3834717321U, // <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0>
+ 3832726679U, // <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5>
+ 2717544403U, // <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1>
+ 2718208036U, // <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1>
+ 3792613493U, // <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1>
+ 2719535302U, // <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1>
+ 2659172454U, // <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS
+ 3832726735U, // <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7>
+ 2724844026U, // <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3>
+ 3775361608U, // <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0>
+ 2659175734U, // <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS
+ 3832726771U, // <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7>
+ 2724844344U, // <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6>
+ 1651102542U, // <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1>
+ 1651766175U, // <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1>
+ 2724844536U, // <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0>
+ 3377397770U, // <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1>
+ 2698302636U, // <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0>
+ 2728162531U, // <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1>
+ 2724844902U, // <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6>
+ 3377398098U, // <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5>
+ 2724845076U, // <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0>
+ 2724845164U, // <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7>
+ 2724845186U, // <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2>
+ 1561559142U, // <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS
+ 1146331956U, // <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+ 1146332054U, // <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1561562422U, // <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS
+ 1624561818U, // <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+ 2220074191U, // <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7>
+ 1585452032U, // <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u>
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 2214593997U, // <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0>
+ 2214675999U, // <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1>
+ 2214594152U, // <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2>
+ 1207959654U, // <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+ 3709054262U, // <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS
+ 3375350836U, // <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5>
+ 2214594490U, // <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7>
+ 3288336362U, // <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1>
+ 1207959659U, // <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS
+ 2215871994U, // <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+ 2215470623U, // <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+ 1141728872U, // <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+ 1141728934U, // <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+ 2215872323U, // <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+ 2215872405U, // <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+ 1141729210U, // <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+ 2215430122U, // <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+ 1141729368U, // <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3>
+ 3289736698U, // <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0>
+ 3289744927U, // <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1>
+ 2216011368U, // <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2>
+ 2216019622U, // <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1>
+ 3289769795U, // <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5>
+ 3289778069U, // <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6>
+ 2216044474U, // <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7>
+ 3732960259U, // <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2>
+ 2216061016U, // <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3>
+ 2758985382U, // <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1>
+ 2758985392U, // <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2>
+ 3290400360U, // <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2>
+ 2758985408U, // <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0>
+ 2758985422U, // <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5>
+ 2785822424U, // <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6>
+ 3290400698U, // <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7>
+ 2765915876U, // <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0>
+ 2758985453U, // <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0>
+ 3291104762U, // <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0>
+ 2217362979U, // <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+ 2217363048U, // <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2>
+ 2217363110U, // <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1>
+ 3291105087U, // <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1>
+ 3291105173U, // <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6>
+ 2217363386U, // <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7>
+ 3788639688U, // <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0>
+ 2217363515U, // <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1>
+ 3376054371U, // <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0>
+ 3788639888U, // <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2>
+ 3376055912U, // <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2>
+ 2302312550U, // <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS
+ 3376054375U, // <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4>
+ 3374728244U, // <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5>
+ 3805229154U, // <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0>
+ 3376055512U, // <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7>
+ 2302312555U, // <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS
+ 3709100134U, // <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS
+ 3709100950U, // <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0>
+ 3709102010U, // <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7>
+ 2758985658U, // <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7>
+ 3709103414U, // <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS
+ 3732992098U, // <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0>
+ 3292374970U, // <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7>
+ 3798594383U, // <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2>
+ 2758985703U, // <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7>
+ 3788641274U, // <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2>
+ 3377398508U, // <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1>
+ 3377398590U, // <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2>
+ 2303656038U, // <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+ 3709111606U, // <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS
+ 3377398836U, // <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5>
+ 3803903447U, // <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2>
+ 3293054954U, // <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1>
+ 2303656043U, // <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+ 2220074490U, // <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+ 2220074527U, // <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+ 1146332776U, // <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+ 1146332838U, // <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+ 2220074819U, // <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+ 2220074901U, // <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+ 1146333114U, // <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+ 2220074986U, // <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+ 1146333243U, // <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1>
+ 2629410816U, // <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0>
+ 2753530006U, // <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2>
+ 2629412301U, // <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0>
+ 2214594972U, // <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3>
+ 2758985908U, // <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5>
+ 3733016674U, // <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0>
+ 3777364488U, // <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7>
+ 2281703354U, // <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7>
+ 2758985941U, // <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2>
+ 1141729430U, // <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+ 2215471334U, // <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+ 2215471425U, // <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+ 1141729692U, // <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+ 1141729794U, // <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+ 2215430738U, // <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5>
+ 2215430776U, // <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7>
+ 2295646138U, // <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7>
+ 1141730078U, // <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+ 2758986032U, // <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3>
+ 3709141910U, // <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0>
+ 3289753921U, // <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2>
+ 2770929992U, // <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0>
+ 3289754114U, // <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6>
+ 3362095460U, // <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5>
+ 3832727910U, // <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3>
+ 3365414842U, // <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7>
+ 2771298677U, // <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0>
+ 2216659094U, // <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2>
+ 3290409190U, // <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1>
+ 2703624496U, // <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3>
+ 2216683932U, // <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3>
+ 2216692226U, // <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6>
+ 3733041250U, // <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0>
+ 3832727988U, // <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0>
+ 3374712762U, // <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7>
+ 2216725278U, // <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2>
+ 2217363606U, // <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2>
+ 3291105510U, // <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1>
+ 3291105601U, // <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2>
+ 2217363868U, // <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3>
+ 2217363970U, // <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6>
+ 2758986242U, // <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6>
+ 3727077685U, // <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4>
+ 3364767674U, // <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7>
+ 2217364254U, // <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2>
+ 3832728102U, // <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6>
+ 3405916003U, // <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1>
+ 3376055840U, // <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2>
+ 3376055679U, // <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3>
+ 3376055194U, // <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4>
+ 3859565138U, // <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5>
+ 2727514210U, // <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+ 3376056250U, // <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7>
+ 2727514210U, // <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+ 2758986360U, // <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+ 3709174678U, // <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0>
+ 3795284411U, // <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3>
+ 3709175980U, // <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6>
+ 3833096860U, // <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7>
+ 3376728235U, // <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5>
+ 3859565229U, // <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6>
+ 2773879472U, // <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0>
+ 2758986360U, // <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+ 2303656854U, // <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0>
+ 3807229018U, // <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u>
+ 2727515284U, // <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3>
+ 3377399410U, // <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3>
+ 3377398682U, // <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4>
+ 3801257409U, // <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7>
+ 3377399980U, // <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6>
+ 3375409082U, // <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7>
+ 2731497082U, // <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3>
+ 1146333334U, // <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+ 2220075238U, // <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+ 2220075329U, // <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+ 1146333596U, // <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+ 1146333698U, // <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+ 2758986566U, // <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6>
+ 2803739472U, // <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7>
+ 2295703482U, // <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7>
+ 1146333982U, // <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+ 2214595473U, // <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0>
+ 2693677158U, // <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS
+ 3839437689U, // <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3>
+ 3709200559U, // <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0>
+ 2693677394U, // <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5>
+ 1140854070U, // <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+ 3767419409U, // <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7>
+ 3854109604U, // <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1>
+ 1140854313U, // <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS
+ 1141689234U, // <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+ 2215431114U, // <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3>
+ 2215431221U, // <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+ 2635466928U, // <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1>
+ 1141689552U, // <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+ 67947830U, // <0,4,1,5>: Cost 1 vmrghw LHS, RHS
+ 2215431545U, // <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+ 2659357716U, // <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1>
+ 67948073U, // <0,4,1,u>: Cost 1 vmrghw LHS, RHS
+ 3767420369U, // <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4>
+ 3767420451U, // <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5>
+ 3767420520U, // <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2>
+ 2698323625U, // <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4>
+ 3709218102U, // <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS
+ 2216013110U, // <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+ 3767420858U, // <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7>
+ 3774719981U, // <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4>
+ 2216013353U, // <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS
+ 3767421078U, // <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2>
+ 3776710880U, // <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4>
+ 3833097325U, // <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4>
+ 3767421340U, // <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3>
+ 3767421442U, // <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6>
+ 2216660278U, // <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+ 3833097361U, // <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4>
+ 3780692678U, // <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4>
+ 2216660521U, // <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS
+ 2617573416U, // <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4>
+ 2217364450U, // <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0>
+ 3691316771U, // <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5>
+ 3709233331U, // <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4>
+ 2785823952U, // <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4>
+ 1143622966U, // <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+ 3691319723U, // <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5>
+ 3854109932U, // <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5>
+ 1143623209U, // <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS
+ 2635497574U, // <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS
+ 2635498390U, // <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0>
+ 3709240936U, // <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2>
+ 2635499700U, // <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5>
+ 2635500854U, // <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS
+ 2785824044U, // <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6>
+ 1685245238U, // <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 2659390488U, // <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5>
+ 1685245256U, // <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 3839438161U, // <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7>
+ 3798610347U, // <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5>
+ 3798610426U, // <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3>
+ 3795956237U, // <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4>
+ 3733138742U, // <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS
+ 2218634550U, // <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS
+ 3798610744U, // <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6>
+ 2724868945U, // <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4>
+ 2725532578U, // <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4>
+ 3383371465U, // <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0>
+ 3800601668U, // <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4>
+ 3775386826U, // <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3>
+ 3801928934U, // <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4>
+ 3721202998U, // <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS
+ 2780368328U, // <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+ 3383372686U, // <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6>
+ 3854110170U, // <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0>
+ 2780368328U, // <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+ 1146334098U, // <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+ 2220076002U, // <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0>
+ 2220076085U, // <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+ 2635524279U, // <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u>
+ 1146334416U, // <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+ 72592694U, // <0,4,u,5>: Cost 1 vmrghw LHS, RHS
+ 1685245481U, // <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 2659415067U, // <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u>
+ 72592937U, // <0,4,u,u>: Cost 1 vmrghw LHS, RHS
+ 2281704337U, // <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0>
+ 2704965734U, // <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+ 3778707666U, // <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3>
+ 3778707708U, // <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0>
+ 2687050057U, // <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5>
+ 2214596612U, // <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5>
+ 2785824372U, // <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1>
+ 3854110332U, // <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0>
+ 2704966301U, // <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+ 1567768678U, // <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS
+ 2312236570U, // <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1>
+ 2215431915U, // <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+ 2641512598U, // <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2>
+ 1567771538U, // <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1>
+ 1141690372U, // <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+ 1141690466U, // <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+ 2641515514U, // <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2>
+ 1141690615U, // <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5>
+ 3772736973U, // <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0>
+ 3778709024U, // <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2>
+ 3778709096U, // <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2>
+ 3778709158U, // <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1>
+ 3772737275U, // <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5>
+ 3859566351U, // <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3>
+ 3778709434U, // <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7>
+ 3805251562U, // <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1>
+ 3775391807U, // <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5>
+ 2704967830U, // <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2>
+ 3776719073U, // <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5>
+ 3777382706U, // <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5>
+ 3778709887U, // <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1>
+ 2704968148U, // <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5>
+ 3857428317U, // <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0>
+ 3364096514U, // <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6>
+ 3780700871U, // <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5>
+ 2707622680U, // <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5>
+ 2728856466U, // <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1>
+ 3697361674U, // <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4>
+ 3697362601U, // <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4>
+ 3364766635U, // <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3>
+ 2217365428U, // <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6>
+ 2704969014U, // <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+ 2785824700U, // <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5>
+ 3364766963U, // <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7>
+ 2704969257U, // <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+ 3846148050U, // <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0>
+ 2326203282U, // <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1>
+ 3291746027U, // <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3>
+ 3376054482U, // <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3>
+ 3790655366U, // <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5>
+ 2785824772U, // <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5>
+ 2724876386U, // <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0>
+ 3858903057U, // <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0>
+ 2736820484U, // <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0>
+ 2659467366U, // <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS
+ 3859566643U, // <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7>
+ 3798618618U, // <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3>
+ 3852857410U, // <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4>
+ 2659470646U, // <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS
+ 2659471458U, // <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+ 3832729696U, // <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7>
+ 1712083042U, // <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0>
+ 1712156779U, // <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0>
+ 2731512826U, // <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2>
+ 3859566717U, // <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0>
+ 3798619284U, // <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3>
+ 3778712803U, // <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1>
+ 2728858936U, // <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5>
+ 3859566753U, // <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0>
+ 3377398135U, // <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6>
+ 3798619686U, // <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0>
+ 2731513468U, // <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5>
+ 1567826022U, // <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS
+ 2704971566U, // <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+ 2220076779U, // <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+ 2641569942U, // <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2>
+ 1567828889U, // <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u>
+ 1146335236U, // <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+ 1146335330U, // <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+ 1713410308U, // <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0>
+ 1713484045U, // <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0>
+ 2214596949U, // <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0>
+ 2214678951U, // <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1>
+ 2214597114U, // <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3>
+ 3852857653U, // <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4>
+ 3832729919U, // <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5>
+ 3721293427U, // <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0>
+ 2214597432U, // <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6>
+ 1207962934U, // <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+ 1207962935U, // <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS
+ 2215432481U, // <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2>
+ 2215432615U, // <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+ 1141690874U, // <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+ 2215432754U, // <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+ 2215432817U, // <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5>
+ 2215432939U, // <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1>
+ 1141691192U, // <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+ 1221905718U, // <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+ 1221905719U, // <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS
+ 3852857787U, // <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3>
+ 3289764265U, // <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3>
+ 3289690618U, // <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3>
+ 3862589907U, // <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0>
+ 3733253430U, // <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS
+ 3733254242U, // <0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0>
+ 3777390522U, // <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7>
+ 2785825274U, // <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3>
+ 2785825283U, // <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3>
+ 3777390742U, // <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2>
+ 3863106066U, // <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0>
+ 3777390899U, // <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6>
+ 3290436146U, // <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5>
+ 3779381762U, // <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6>
+ 3779381798U, // <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6>
+ 3733262920U, // <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0>
+ 2300972342U, // <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+ 2300972343U, // <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS
+ 3802606482U, // <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1>
+ 2217365931U, // <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+ 2217366010U, // <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3>
+ 3291107890U, // <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5>
+ 3291099805U, // <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4>
+ 3777391926U, // <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS
+ 2217366328U, // <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6>
+ 2291027254U, // <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+ 2291027255U, // <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS
+ 3852858033U, // <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6>
+ 3395964532U, // <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1>
+ 3864507069U, // <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0>
+ 3376056678U, // <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3>
+ 3721334070U, // <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS
+ 3395964860U, // <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5>
+ 3864802017U, // <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0>
+ 2302315830U, // <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+ 2302315831U, // <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS
+ 3852858108U, // <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0>
+ 3398624745U, // <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1>
+ 2218668538U, // <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3>
+ 3292418610U, // <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5>
+ 3733286198U, // <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS
+ 3797299889U, // <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6>
+ 2785825592U, // <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6>
+ 2785825602U, // <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7>
+ 2785825611U, // <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7>
+ 2785825614U, // <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1>
+ 2758988632U, // <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2>
+ 3377400084U, // <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2>
+ 2792166248U, // <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0>
+ 2785825654U, // <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5>
+ 2785825664U, // <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+ 3859567493U, // <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2>
+ 2303659318U, // <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+ 2303659319U, // <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS
+ 2785825695U, // <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1>
+ 2220077479U, // <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+ 1146335738U, // <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+ 2792829881U, // <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0>
+ 2785825735U, // <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5>
+ 2785825664U, // <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+ 1146336056U, // <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+ 1221963062U, // <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+ 1221963063U, // <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS
+ 2653593600U, // <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0>
+ 2706309222U, // <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+ 3709421498U, // <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7>
+ 2281705978U, // <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3>
+ 2785825816U, // <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5>
+ 2785825826U, // <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6>
+ 2653598037U, // <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0>
+ 2214598252U, // <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7>
+ 2706309789U, // <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+ 1141691386U, // <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+ 2215433290U, // <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1>
+ 2706310038U, // <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0>
+ 2322190842U, // <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3>
+ 1141691750U, // <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+ 2215433654U, // <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5>
+ 2653606230U, // <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1>
+ 1141692012U, // <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+ 1141692034U, // <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+ 2785825940U, // <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3>
+ 3768108576U, // <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2>
+ 3780052584U, // <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2>
+ 2794820780U, // <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0>
+ 3859641528U, // <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3>
+ 3733327970U, // <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0>
+ 3778062266U, // <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7>
+ 3733328944U, // <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2>
+ 2795189465U, // <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0>
+ 2324861026U, // <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0>
+ 3780053233U, // <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3>
+ 3780053296U, // <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3>
+ 3778062725U, // <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7>
+ 3780053506U, // <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6>
+ 3803941469U, // <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7>
+ 2706311800U, // <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7>
+ 3398603586U, // <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7>
+ 2707639066U, // <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7>
+ 2217366522U, // <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2>
+ 3727369110U, // <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0>
+ 3291108500U, // <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3>
+ 3727370872U, // <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7>
+ 2217366886U, // <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6>
+ 2706312502U, // <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+ 3786026321U, // <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7>
+ 2217367148U, // <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7>
+ 2706312745U, // <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+ 2322223202U, // <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0>
+ 3399946987U, // <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1>
+ 3291780244U, // <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3>
+ 3727378582U, // <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2>
+ 3727379766U, // <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS
+ 3859568054U, // <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5>
+ 2785826241U, // <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7>
+ 3395965762U, // <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7>
+ 2787153363U, // <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7>
+ 2785826268U, // <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7>
+ 3780055420U, // <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3>
+ 3859568110U, // <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7>
+ 3874534903U, // <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7>
+ 3859641856U, // <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7>
+ 3733360738U, // <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0>
+ 3859568145U, // <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6>
+ 2797770260U, // <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0>
+ 2797843997U, // <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0>
+ 2785826342U, // <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0>
+ 3727393686U, // <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0>
+ 3868563003U, // <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3>
+ 3377397988U, // <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3>
+ 2219349350U, // <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6>
+ 3859568217U, // <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6>
+ 2730202588U, // <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7>
+ 2785826412U, // <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7>
+ 2731529854U, // <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7>
+ 1146336250U, // <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+ 2706315054U, // <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+ 2653660845U, // <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u>
+ 2322248186U, // <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3>
+ 1146336614U, // <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+ 2706315418U, // <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+ 2653663581U, // <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u>
+ 1146336876U, // <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+ 1146336898U, // <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+ 202162278U, // <0,u,0,0>: Cost 1 vspltisw0 LHS
+ 1624612966U, // <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS
+ 2629780986U, // <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0>
+ 1207959708U, // <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+ 1544097078U, // <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS
+ 1140856986U, // <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+ 2698355253U, // <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7>
+ 1207962952U, // <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+ 202162278U, // <0,u,0,u>: Cost 1 vspltisw0 LHS
+ 1142134483U, // <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2>
+ 67950382U, // <0,u,1,1>: Cost 1 vmrghw LHS, LHS
+ 1142175624U, // <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+ 1142175676U, // <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1>
+ 1142134847U, // <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+ 67950746U, // <0,u,1,5>: Cost 1 vmrghw LHS, RHS
+ 1142175952U, // <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+ 1221905736U, // <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+ 67950949U, // <0,u,1,u>: Cost 1 vmrghw LHS, LHS
+ 1562026086U, // <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS
+ 2216015662U, // <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+ 2698356328U, // <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2>
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1562029366U, // <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS
+ 2216016026U, // <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+ 2698356666U, // <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7>
+ 1585919033U, // <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2>
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2758989756U, // <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1>
+ 2216662830U, // <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+ 2703665461U, // <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u>
+ 2758989782U, // <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0>
+ 2758989796U, // <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5>
+ 2216663194U, // <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+ 2706319993U, // <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u>
+ 2300972360U, // <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+ 2216663397U, // <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+ 2217367251U, // <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2>
+ 1143625518U, // <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+ 2217367432U, // <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3>
+ 2217367484U, // <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1>
+ 1143619922U, // <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+ 1143625882U, // <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+ 2217367760U, // <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7>
+ 2291027272U, // <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+ 1143626085U, // <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+ 2635792486U, // <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS
+ 2635793302U, // <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0>
+ 2302314646U, // <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+ 2635794648U, // <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5>
+ 2635795766U, // <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS
+ 2717601754U, // <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u>
+ 1685248154U, // <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 2302315848U, // <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+ 1685248172U, // <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+ 2759358645U, // <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7>
+ 2218637102U, // <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+ 2724901370U, // <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3>
+ 2758990032U, // <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7>
+ 2659691830U, // <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS
+ 2659471458U, // <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+ 2724901688U, // <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6>
+ 1651159893U, // <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u>
+ 1651823526U, // <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u>
+ 2785827072U, // <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1>
+ 2803964168U, // <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0>
+ 2727556249U, // <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u>
+ 2303656092U, // <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+ 2785827112U, // <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5>
+ 2785827122U, // <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6>
+ 2730210781U, // <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u>
+ 2303659336U, // <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+ 2303656097U, // <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+ 202162278U, // <0,u,u,0>: Cost 1 vspltisw0 LHS
+ 72595246U, // <0,u,u,1>: Cost 1 vmrghw LHS, LHS
+ 1146337160U, // <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1146337343U, // <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+ 72595610U, // <0,u,u,5>: Cost 1 vmrghw LHS, RHS
+ 1146337488U, // <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+ 1221963080U, // <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2756853760U, // <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0>
+ 1677803530U, // <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1>
+ 3759497387U, // <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0>
+ 2686419196U, // <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0>
+ 2751766565U, // <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1>
+ 2687746462U, // <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0>
+ 3776086518U, // <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7>
+ 2689073728U, // <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0>
+ 1678319689U, // <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1>
+ 2287091712U, // <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0>
+ 1147568230U, // <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS
+ 1683112038U, // <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 3294970108U, // <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0>
+ 2623892790U, // <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS
+ 2647781007U, // <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1>
+ 2791948430U, // <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+ 3721524218U, // <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2>
+ 1683112092U, // <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 2222112768U, // <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0>
+ 1148371046U, // <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+ 3356862524U, // <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2>
+ 2702345894U, // <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1>
+ 2222113106U, // <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5>
+ 2299709908U, // <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+ 3760162746U, // <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7>
+ 3369470584U, // <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7>
+ 1148371613U, // <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+ 2686421142U, // <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2>
+ 2283128486U, // <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1>
+ 3296305326U, // <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3>
+ 3760163199U, // <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1>
+ 3760163330U, // <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6>
+ 3779406377U, // <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0>
+ 3865690416U, // <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7>
+ 3366824568U, // <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7>
+ 2707655452U, // <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0>
+ 2734861202U, // <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1>
+ 2756854098U, // <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5>
+ 3830595931U, // <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5>
+ 3296968960U, // <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4>
+ 3830595949U, // <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5>
+ 2686422326U, // <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+ 3297378806U, // <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7>
+ 3810594248U, // <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0>
+ 2686422569U, // <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+ 2284470272U, // <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0>
+ 2284471974U, // <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1>
+ 3809267435U, // <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3>
+ 3297968384U, // <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4>
+ 2284471977U, // <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4>
+ 3721555603U, // <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5>
+ 3792679010U, // <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0>
+ 3792679037U, // <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0>
+ 2284471981U, // <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u>
+ 3356893184U, // <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0>
+ 2224676966U, // <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS
+ 3298295985U, // <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6>
+ 3298345212U, // <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0>
+ 2224972114U, // <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+ 3808604907U, // <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1>
+ 3799978808U, // <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6>
+ 2726237006U, // <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1>
+ 2224677522U, // <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1>
+ 2726237176U, // <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0>
+ 2285815462U, // <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1>
+ 3805951193U, // <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0>
+ 3807941859U, // <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1>
+ 3799979366U, // <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6>
+ 3803297165U, // <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0>
+ 3799979540U, // <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0>
+ 3799979628U, // <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7>
+ 2731546240U, // <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0>
+ 2284494848U, // <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0>
+ 1683112594U, // <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1>
+ 1683112605U, // <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 2734200772U, // <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0>
+ 2757075629U, // <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1>
+ 2686425242U, // <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+ 2791948430U, // <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+ 2736855304U, // <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0>
+ 1683112659U, // <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 1610694666U, // <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1>
+ 1616003174U, // <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS
+ 2283767958U, // <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2>
+ 3357507596U, // <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3>
+ 2689745234U, // <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5>
+ 3357507922U, // <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5>
+ 3294397647U, // <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7>
+ 3373433334U, // <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7>
+ 1616003730U, // <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1>
+ 1550221414U, // <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+ 269271142U, // <1,1,1,1>: Cost 1 vspltisw1 LHS
+ 2287093910U, // <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2>
+ 2287092615U, // <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3>
+ 1550224694U, // <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+ 2287092050U, // <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5>
+ 2689746127U, // <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7>
+ 2659800138U, // <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1>
+ 269271142U, // <1,1,1,u>: Cost 1 vspltisw1 LHS
+ 2222113516U, // <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1>
+ 2756854663U, // <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3>
+ 1148371862U, // <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 2689746598U, // <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1>
+ 2618002742U, // <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS
+ 2299707730U, // <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5>
+ 2689746874U, // <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7>
+ 3361506511U, // <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7>
+ 1148371862U, // <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 2689747094U, // <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2>
+ 2691074278U, // <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1>
+ 3356870806U, // <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2>
+ 2283126958U, // <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3>
+ 2689747458U, // <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6>
+ 3356868946U, // <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5>
+ 3811265144U, // <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7>
+ 3362841807U, // <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7>
+ 2689747742U, // <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2>
+ 2623987814U, // <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS
+ 2758181931U, // <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5>
+ 2223408022U, // <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+ 3697731734U, // <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2>
+ 2283798784U, // <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4>
+ 1616006454U, // <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+ 3297379535U, // <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7>
+ 3373466102U, // <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7>
+ 1616006697U, // <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+ 2760762479U, // <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1>
+ 2284470282U, // <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1>
+ 2284472470U, // <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2>
+ 3358212270U, // <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3>
+ 2284470285U, // <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4>
+ 1210728786U, // <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+ 2737524834U, // <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0>
+ 3360867535U, // <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7>
+ 1210728786U, // <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+ 3697746022U, // <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS
+ 2756854991U, // <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7>
+ 2737525242U, // <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3>
+ 3839149281U, // <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7>
+ 3697749302U, // <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS
+ 3356893522U, // <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5>
+ 2283151537U, // <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6>
+ 2791949566U, // <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0>
+ 2792613127U, // <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0>
+ 2737525754U, // <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2>
+ 2291786386U, // <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1>
+ 3365528292U, // <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2>
+ 3365528455U, // <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3>
+ 2737526118U, // <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6>
+ 3365527890U, // <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5>
+ 3365528377U, // <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6>
+ 2291786959U, // <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7>
+ 2737526402U, // <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2>
+ 1550221414U, // <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+ 269271142U, // <1,1,u,1>: Cost 1 vspltisw1 LHS
+ 1148371862U, // <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 2689750972U, // <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1>
+ 1550224694U, // <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+ 1616009370U, // <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+ 2689751248U, // <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7>
+ 2736863497U, // <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1>
+ 269271142U, // <1,1,u,u>: Cost 1 vspltisw1 LHS
+ 2702360576U, // <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0>
+ 1628618854U, // <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+ 2685771949U, // <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2>
+ 2283765862U, // <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+ 2702360914U, // <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5>
+ 3788046813U, // <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0>
+ 2688426481U, // <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2>
+ 2726249024U, // <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+ 1628619421U, // <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+ 2690417380U, // <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2>
+ 2702361396U, // <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1>
+ 2287093352U, // <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2>
+ 1213349990U, // <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+ 3764159522U, // <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5>
+ 3295053672U, // <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6>
+ 2221311930U, // <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7>
+ 3799991593U, // <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7>
+ 1213349995U, // <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS
+ 2624045158U, // <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS
+ 2702362144U, // <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2>
+ 2283120232U, // <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2>
+ 1225965670U, // <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+ 2624048438U, // <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS
+ 3356860763U, // <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5>
+ 2222114746U, // <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7>
+ 2299708632U, // <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7>
+ 1225965675U, // <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS
+ 470597734U, // <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+ 1544340276U, // <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 1544341096U, // <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+ 1544341916U, // <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3>
+ 470601014U, // <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+ 1592119300U, // <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+ 1592119802U, // <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+ 1592120314U, // <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 470603566U, // <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS
+ 2708335471U, // <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2>
+ 3838043908U, // <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5>
+ 3357541992U, // <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2>
+ 2283798630U, // <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+ 2726251728U, // <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4>
+ 1628622134U, // <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+ 3297077178U, // <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7>
+ 2726251976U, // <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+ 1628622377U, // <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+ 2714308168U, // <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2>
+ 3297633827U, // <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5>
+ 2284471912U, // <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2>
+ 1210728550U, // <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+ 3776106420U, // <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6>
+ 2726252548U, // <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5>
+ 2726252642U, // <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0>
+ 3799994538U, // <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0>
+ 1210728555U, // <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+ 2720280865U, // <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2>
+ 2702365096U, // <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2>
+ 2726253050U, // <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3>
+ 2283151462U, // <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+ 3697823030U, // <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS
+ 3298715497U, // <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7>
+ 2726253368U, // <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6>
+ 2724926296U, // <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2>
+ 2283151467U, // <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+ 1652511738U, // <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2>
+ 3371500916U, // <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1>
+ 3365529192U, // <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2>
+ 2291785830U, // <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+ 2726253926U, // <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6>
+ 3788051845U, // <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1>
+ 3794023894U, // <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1>
+ 2726254119U, // <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1>
+ 1657820802U, // <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2>
+ 470638699U, // <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+ 1544381236U, // <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 1544382056U, // <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+ 1544382614U, // <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+ 470641974U, // <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+ 1628625050U, // <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+ 1592160762U, // <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+ 1592161274U, // <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 470644526U, // <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS
+ 2769389708U, // <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1>
+ 2685780070U, // <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+ 2685780142U, // <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3>
+ 2686443775U, // <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3>
+ 2769684656U, // <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1>
+ 3357507940U, // <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5>
+ 3759522294U, // <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7>
+ 3357509562U, // <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7>
+ 2685780637U, // <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+ 2287092630U, // <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0>
+ 2221312230U, // <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1>
+ 2691752839U, // <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3>
+ 2287093362U, // <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3>
+ 2287092634U, // <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4>
+ 3360835107U, // <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5>
+ 3759523041U, // <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7>
+ 2287093690U, // <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7>
+ 2287092638U, // <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u>
+ 2222114966U, // <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2>
+ 2222115057U, // <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3>
+ 2630092320U, // <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2>
+ 2685781670U, // <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1>
+ 2222115330U, // <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6>
+ 3373449572U, // <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5>
+ 2222115448U, // <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+ 2299709370U, // <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7>
+ 2222115614U, // <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2>
+ 2771380607U, // <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1>
+ 3356874468U, // <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1>
+ 3759524168U, // <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0>
+ 2283792796U, // <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3>
+ 3356869530U, // <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4>
+ 3721760428U, // <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3>
+ 3296496248U, // <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7>
+ 3356870586U, // <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7>
+ 2771970503U, // <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1>
+ 2772044240U, // <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1>
+ 3362186135U, // <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1>
+ 3297151280U, // <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3>
+ 3357542002U, // <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3>
+ 3357540626U, // <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4>
+ 2685783350U, // <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+ 3357546622U, // <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6>
+ 3357542330U, // <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7>
+ 2685783593U, // <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+ 2284471190U, // <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0>
+ 3358213015U, // <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1>
+ 2630116899U, // <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5>
+ 2284471922U, // <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3>
+ 2284471194U, // <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4>
+ 2284471843U, // <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5>
+ 3358218366U, // <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6>
+ 2284472250U, // <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7>
+ 2284471198U, // <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u>
+ 2224752790U, // <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2>
+ 3832736385U, // <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7>
+ 3703866916U, // <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6>
+ 3356894834U, // <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3>
+ 3356894106U, // <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4>
+ 3356894755U, // <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5>
+ 3356899130U, // <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6>
+ 2283153338U, // <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+ 2283153338U, // <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+ 2774035139U, // <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1>
+ 3703874767U, // <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7>
+ 3703875109U, // <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7>
+ 3365529202U, // <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3>
+ 3365528474U, // <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4>
+ 3789387159U, // <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1>
+ 3865692927U, // <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7>
+ 3363538874U, // <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7>
+ 2774625035U, // <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1>
+ 2284495766U, // <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0>
+ 2685785902U, // <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+ 2630141478U, // <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u>
+ 2283169880U, // <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3>
+ 2284495770U, // <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4>
+ 2685786266U, // <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+ 2222115448U, // <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+ 2284496826U, // <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7>
+ 2685786469U, // <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+ 2684461069U, // <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4>
+ 2686451814U, // <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+ 3759530159U, // <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4>
+ 2686451968U, // <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+ 2684461394U, // <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5>
+ 1701989266U, // <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1>
+ 3776119286U, // <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7>
+ 2689106500U, // <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4>
+ 1702210477U, // <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1>
+ 2221312914U, // <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1>
+ 2691097399U, // <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4>
+ 3760194454U, // <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0>
+ 3766166489U, // <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4>
+ 2334870736U, // <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4>
+ 1147571510U, // <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+ 3760194794U, // <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7>
+ 3867315188U, // <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0>
+ 1147571753U, // <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS
+ 2222115730U, // <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1>
+ 2222115812U, // <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+ 3760195176U, // <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2>
+ 2702378662U, // <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1>
+ 2323598544U, // <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4>
+ 1148374326U, // <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+ 3760195514U, // <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7>
+ 3373451932U, // <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7>
+ 1148374569U, // <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS
+ 2702379160U, // <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4>
+ 3760195840U, // <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0>
+ 3776121160U, // <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0>
+ 3760195996U, // <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3>
+ 2686454274U, // <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6>
+ 3356870350U, // <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5>
+ 3800009392U, // <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0>
+ 3366824604U, // <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7>
+ 2707688224U, // <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4>
+ 2775731368U, // <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0>
+ 3830820018U, // <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1>
+ 3691980454U, // <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1>
+ 3357541282U, // <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3>
+ 2781039824U, // <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4>
+ 2686455094U, // <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+ 3357541528U, // <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6>
+ 3810627020U, // <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4>
+ 2686455337U, // <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+ 2624217190U, // <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS
+ 2284470309U, // <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1>
+ 2618246822U, // <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1>
+ 3358212297U, // <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3>
+ 2284470312U, // <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4>
+ 2284470637U, // <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5>
+ 1683115318U, // <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 3721851898U, // <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2>
+ 1683115336U, // <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 3794039075U, // <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4>
+ 3830820186U, // <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7>
+ 3800011258U, // <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3>
+ 3807973938U, // <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5>
+ 3298716880U, // <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4>
+ 2224680246U, // <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+ 3800011576U, // <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6>
+ 2726269774U, // <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1>
+ 2224680489U, // <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS
+ 2726269948U, // <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4>
+ 3383444141U, // <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1>
+ 3805983961U, // <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0>
+ 3807974667U, // <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5>
+ 2736887142U, // <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6>
+ 3365528403U, // <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5>
+ 3800012308U, // <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0>
+ 3800012396U, // <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7>
+ 2731579012U, // <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4>
+ 2624241766U, // <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS
+ 2686457646U, // <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+ 2618271398U, // <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1>
+ 2734233544U, // <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4>
+ 2689775679U, // <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6>
+ 1152355638U, // <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS
+ 1683115561U, // <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 2736888076U, // <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4>
+ 1683115579U, // <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 2687123456U, // <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0>
+ 1613381734U, // <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+ 3759538352U, // <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5>
+ 3760865532U, // <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0>
+ 1613381970U, // <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+ 2687787427U, // <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5>
+ 2781777524U, // <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1>
+ 3733828717U, // <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0>
+ 1613382301U, // <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+ 2781040271U, // <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1>
+ 2687124276U, // <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1>
+ 2687124374U, // <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0>
+ 3760866297U, // <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0>
+ 2693096491U, // <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5>
+ 2687124591U, // <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1>
+ 2687124723U, // <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7>
+ 3360834803U, // <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7>
+ 2687124860U, // <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0>
+ 2323598792U, // <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0>
+ 2687125027U, // <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5>
+ 2687125096U, // <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2>
+ 2687125158U, // <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1>
+ 2642185188U, // <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2>
+ 2323598554U, // <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5>
+ 2687125434U, // <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7>
+ 3373450483U, // <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7>
+ 2687125563U, // <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1>
+ 2687125654U, // <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2>
+ 2312990234U, // <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1>
+ 3760867649U, // <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2>
+ 2687125916U, // <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3>
+ 2687126018U, // <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6>
+ 3386731738U, // <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5>
+ 3356871170U, // <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6>
+ 3808643779U, // <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1>
+ 2687126302U, // <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2>
+ 2642198630U, // <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS
+ 2687126498U, // <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0>
+ 3715941923U, // <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5>
+ 3709970701U, // <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4>
+ 2687126736U, // <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4>
+ 1613385014U, // <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+ 2283801090U, // <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+ 3733861489U, // <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4>
+ 1613385257U, // <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+ 2624290918U, // <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS
+ 2624291676U, // <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5>
+ 3698034211U, // <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5>
+ 2284471211U, // <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3>
+ 2624294198U, // <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS
+ 2284471132U, // <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5>
+ 2284472834U, // <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6>
+ 2284471539U, // <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7>
+ 2284471216U, // <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u>
+ 2785316900U, // <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1>
+ 2781040691U, // <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7>
+ 2734903802U, // <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3>
+ 3848736834U, // <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4>
+ 3298717620U, // <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6>
+ 3298717700U, // <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5>
+ 2734904120U, // <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6>
+ 2781040738U, // <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0>
+ 2781040747U, // <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0>
+ 2734904314U, // <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2>
+ 2315677210U, // <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1>
+ 3808646292U, // <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3>
+ 3808646371U, // <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1>
+ 2734904678U, // <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6>
+ 3389418714U, // <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5>
+ 3365528656U, // <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6>
+ 2734904940U, // <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7>
+ 2734904962U, // <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2>
+ 2687129299U, // <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2>
+ 1613387566U, // <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+ 2687129480U, // <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3>
+ 2687129532U, // <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1>
+ 1661163546U, // <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5>
+ 1613387930U, // <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+ 2687129808U, // <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7>
+ 2781040900U, // <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0>
+ 1613388133U, // <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+ 3759546368U, // <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0>
+ 2685804646U, // <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+ 2685804721U, // <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6>
+ 3861270834U, // <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1>
+ 3759546706U, // <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5>
+ 2687795620U, // <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6>
+ 2688459253U, // <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6>
+ 2283769142U, // <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+ 2685805213U, // <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+ 3698073702U, // <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS
+ 3759547188U, // <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1>
+ 2221314554U, // <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3>
+ 3759547401U, // <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7>
+ 3698076982U, // <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS
+ 3767510141U, // <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6>
+ 2334872376U, // <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6>
+ 1213353270U, // <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+ 1213353271U, // <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS
+ 3704053862U, // <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS
+ 3759547961U, // <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0>
+ 2222117370U, // <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3>
+ 3759548070U, // <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1>
+ 3704057142U, // <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS
+ 3373451057U, // <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5>
+ 2685806522U, // <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7>
+ 1225968950U, // <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+ 1225968951U, // <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS
+ 3759548566U, // <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2>
+ 3842912793U, // <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7>
+ 3759548774U, // <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3>
+ 3759548828U, // <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3>
+ 3759548930U, // <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6>
+ 3809315421U, // <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7>
+ 3386733368U, // <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6>
+ 2283130166U, // <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS
+ 2283130167U, // <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS
+ 3704070246U, // <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS
+ 3862229608U, // <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5>
+ 3704071741U, // <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4>
+ 3721988610U, // <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6>
+ 3704073526U, // <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS
+ 2685807926U, // <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+ 3865621141U, // <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5>
+ 2283801910U, // <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+ 2685808169U, // <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+ 3710050406U, // <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS
+ 3710051571U, // <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7>
+ 3405989597U, // <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2>
+ 3358214502U, // <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3>
+ 3710053686U, // <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS
+ 3721998025U, // <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5>
+ 2332250936U, // <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6>
+ 1210731830U, // <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+ 1210731831U, // <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS
+ 2791289597U, // <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1>
+ 3698115430U, // <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6>
+ 3698116538U, // <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7>
+ 3356894132U, // <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3>
+ 3698117942U, // <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS
+ 3722006218U, // <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6>
+ 2781041464U, // <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6>
+ 2283154742U, // <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+ 2283154743U, // <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS
+ 1718211406U, // <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1>
+ 2792026967U, // <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1>
+ 2765411170U, // <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3>
+ 3854783336U, // <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0>
+ 2781041526U, // <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5>
+ 3365528664U, // <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5>
+ 2791953290U, // <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7>
+ 2291789110U, // <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+ 1718801302U, // <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1>
+ 1718875039U, // <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1>
+ 2685810478U, // <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+ 2792764337U, // <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1>
+ 3759552444U, // <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1>
+ 2781041607U, // <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5>
+ 2685810842U, // <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+ 2689792208U, // <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7>
+ 1210756406U, // <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+ 1210756407U, // <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS
+ 2793280496U, // <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1>
+ 2694439014U, // <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+ 3393343912U, // <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2>
+ 3397325306U, // <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3>
+ 2793575444U, // <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1>
+ 3722030797U, // <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0>
+ 2688467446U, // <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7>
+ 2689131079U, // <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7>
+ 2694439570U, // <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1>
+ 2654265354U, // <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1>
+ 2794017866U, // <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1>
+ 3768181639U, // <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3>
+ 2334872058U, // <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3>
+ 2654268726U, // <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS
+ 3792069797U, // <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1>
+ 2694440143U, // <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7>
+ 2334872386U, // <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7>
+ 2695767409U, // <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7>
+ 2654273638U, // <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+ 2222117973U, // <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3>
+ 2299711912U, // <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+ 2654275734U, // <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2>
+ 2654276918U, // <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS
+ 3385397675U, // <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5>
+ 2654278056U, // <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2>
+ 2323599627U, // <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7>
+ 2654279470U, // <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+ 2795271395U, // <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1>
+ 3768183059U, // <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1>
+ 3728025254U, // <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1>
+ 3768183196U, // <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3>
+ 3768183298U, // <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6>
+ 3792071255U, // <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1>
+ 3780127361U, // <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7>
+ 3847779617U, // <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0>
+ 2795861291U, // <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1>
+ 2795935028U, // <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1>
+ 3728032975U, // <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7>
+ 3839153480U, // <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3>
+ 3397358074U, // <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3>
+ 3854783835U, // <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4>
+ 2694442294U, // <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+ 3786100058U, // <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7>
+ 3722065254U, // <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6>
+ 2694442537U, // <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+ 2654298214U, // <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS
+ 3854783893U, // <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u>
+ 3710126010U, // <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7>
+ 2332250618U, // <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3>
+ 2654301494U, // <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS
+ 2284474795U, // <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5>
+ 2718330931U, // <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7>
+ 2332250946U, // <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7>
+ 2719658197U, // <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7>
+ 2332921954U, // <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0>
+ 3768185254U, // <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0>
+ 3710134202U, // <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7>
+ 3710134561U, // <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6>
+ 3710135606U, // <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS
+ 3864884745U, // <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7>
+ 3854784017U, // <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6>
+ 2791953940U, // <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0>
+ 2792617501U, // <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0>
+ 2797925927U, // <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1>
+ 3365528426U, // <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1>
+ 3728058022U, // <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1>
+ 3365528509U, // <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3>
+ 3854784079U, // <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5>
+ 3722088148U, // <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7>
+ 3728060845U, // <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7>
+ 2781042284U, // <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7>
+ 2798515823U, // <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1>
+ 2654322705U, // <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u>
+ 2694444846U, // <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+ 2299711912U, // <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+ 2323649018U, // <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3>
+ 2654326070U, // <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS
+ 2694445210U, // <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+ 2654327214U, // <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u>
+ 2323649346U, // <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7>
+ 2694445413U, // <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+ 1610752017U, // <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u>
+ 1613406310U, // <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+ 2685821107U, // <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u>
+ 2283765916U, // <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+ 1613406549U, // <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u>
+ 1725880054U, // <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1>
+ 2688475639U, // <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u>
+ 2283769160U, // <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+ 1613406877U, // <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+ 1550221414U, // <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+ 269271142U, // <1,u,1,1>: Cost 1 vspltisw1 LHS
+ 1683117870U, // <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+ 1213350044U, // <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+ 1550224694U, // <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+ 1147574426U, // <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+ 2687149326U, // <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7>
+ 1213353288U, // <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+ 269271142U, // <1,u,1,u>: Cost 1 vspltisw1 LHS
+ 2222118611U, // <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2>
+ 1148376878U, // <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+ 1148371862U, // <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 1225965724U, // <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+ 2222118975U, // <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6>
+ 1148377242U, // <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+ 2687150010U, // <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7>
+ 1225968968U, // <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+ 1148377445U, // <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+ 471040156U, // <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+ 1544782644U, // <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 1544783464U, // <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+ 1544784022U, // <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+ 471043382U, // <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+ 1592561668U, // <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+ 1592562170U, // <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+ 1592562682U, // <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 471045934U, // <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS
+ 2708384629U, // <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u>
+ 2687151101U, // <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0>
+ 2223408022U, // <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+ 2283798684U, // <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+ 2642422785U, // <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4>
+ 1613409590U, // <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+ 2283801090U, // <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+ 2283801928U, // <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+ 1613409833U, // <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+ 2284471235U, // <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0>
+ 2284472046U, // <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1>
+ 2284472533U, // <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2>
+ 1210728604U, // <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+ 2284471239U, // <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4>
+ 1210728786U, // <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+ 1683118234U, // <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 1210731848U, // <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+ 1210728609U, // <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+ 2720330023U, // <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u>
+ 2757376190U, // <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7>
+ 2726302202U, // <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3>
+ 2283151516U, // <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+ 2224972114U, // <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+ 2224683162U, // <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+ 2726302520U, // <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6>
+ 2283154760U, // <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+ 2283151521U, // <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+ 1652560896U, // <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u>
+ 2333590225U, // <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1>
+ 2765412628U, // <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3>
+ 2291785884U, // <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+ 2781042984U, // <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5>
+ 3365527953U, // <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5>
+ 2791954748U, // <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7>
+ 2291789128U, // <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+ 1657869960U, // <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u>
+ 471081121U, // <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+ 269271142U, // <1,u,u,1>: Cost 1 vspltisw1 LHS
+ 1544824424U, // <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+ 1544824982U, // <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+ 471084342U, // <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+ 1613412506U, // <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+ 1683118477U, // <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+ 1210756424U, // <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+ 471086894U, // <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS
+ 2226757632U, // <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0>
+ 2226757734U, // <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS
+ 3826622483U, // <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1>
+ 3843211292U, // <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1>
+ 3300499794U, // <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5>
+ 3356256724U, // <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5>
+ 3825664056U, // <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2>
+ 3762889289U, // <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0>
+ 2226758301U, // <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS
+ 2227429386U, // <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1>
+ 2227429478U, // <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS
+ 1691156582U, // <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+ 2666358997U, // <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2>
+ 2227462482U, // <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5>
+ 3722186464U, // <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1>
+ 3867099278U, // <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7>
+ 3366881912U, // <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7>
+ 1691156636U, // <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+ 2228027392U, // <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0>
+ 1154285670U, // <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+ 2228027565U, // <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2>
+ 3301769468U, // <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0>
+ 2228027730U, // <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5>
+ 3301769635U, // <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5>
+ 3780806586U, // <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7>
+ 3368880760U, // <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7>
+ 1154286237U, // <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS
+ 1213440000U, // <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+ 1213441702U, // <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+ 2228535470U, // <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3>
+ 2636515632U, // <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3>
+ 2287182962U, // <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4>
+ 2660405346U, // <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0>
+ 2228535798U, // <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+ 2660406420U, // <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3>
+ 1213441709U, // <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+ 3368894464U, // <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0>
+ 2764898642U, // <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5>
+ 3826622811U, // <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5>
+ 3843211620U, // <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5>
+ 3838640493U, // <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5>
+ 2732944694U, // <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS
+ 3797396857U, // <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2>
+ 3867099528U, // <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5>
+ 2764898705U, // <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5>
+ 3364257792U, // <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0>
+ 2230124646U, // <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+ 3304235184U, // <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5>
+ 3364260144U, // <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3>
+ 3303817554U, // <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5>
+ 3364260146U, // <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5>
+ 3867099602U, // <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7>
+ 3364260472U, // <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7>
+ 2230125213U, // <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS
+ 2230796288U, // <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0>
+ 1157054566U, // <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+ 2230796465U, // <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6>
+ 3304538364U, // <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0>
+ 2230796626U, // <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5>
+ 3797398205U, // <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0>
+ 3304538614U, // <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7>
+ 3798725471U, // <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0>
+ 1157055133U, // <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+ 3371573248U, // <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0>
+ 2231189606U, // <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS
+ 3801380003U, // <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0>
+ 3802043636U, // <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0>
+ 3806688614U, // <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6>
+ 3356317308U, // <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5>
+ 3804034535U, // <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0>
+ 3806688876U, // <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7>
+ 2231190173U, // <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS
+ 1208836096U, // <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+ 1208837798U, // <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+ 1691157149U, // <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+ 2636556597U, // <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u>
+ 2282579625U, // <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+ 2660446306U, // <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0>
+ 2228535798U, // <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+ 2660447385U, // <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u>
+ 1208837805U, // <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+ 3692388523U, // <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0>
+ 2757526244U, // <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2>
+ 2330290974U, // <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2>
+ 3843212020U, // <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0>
+ 3692391734U, // <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS
+ 3300533362U, // <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4>
+ 3794084337U, // <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2>
+ 3374170614U, // <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7>
+ 2758042403U, // <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2>
+ 2690482924U, // <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1>
+ 2764899124U, // <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1>
+ 2695791510U, // <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0>
+ 3362235271U, // <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3>
+ 3692399926U, // <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS
+ 3832226649U, // <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2>
+ 3301205235U, // <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7>
+ 3768870179U, // <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1>
+ 2695791988U, // <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1>
+ 2618663085U, // <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2>
+ 2228028212U, // <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1>
+ 2618664552U, // <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2>
+ 2759000984U, // <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2>
+ 2618666294U, // <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS
+ 2295136594U, // <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5>
+ 3769534376U, // <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7>
+ 2793358266U, // <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0>
+ 2618668846U, // <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS
+ 2282536969U, // <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+ 1208795146U, // <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+ 1213442198U, // <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+ 2287181998U, // <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+ 2618674486U, // <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS
+ 1208795474U, // <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+ 2287182001U, // <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+ 2287183055U, // <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+ 1208795153U, // <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+ 3692421295U, // <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4>
+ 3838641195U, // <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5>
+ 2330323742U, // <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2>
+ 3692423318U, // <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2>
+ 3692424502U, // <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS
+ 2695793974U, // <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+ 3799395705U, // <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2>
+ 3368895695U, // <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7>
+ 2695794217U, // <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+ 3692429488U, // <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5>
+ 3364257802U, // <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1>
+ 3692431253U, // <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6>
+ 3692431874U, // <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6>
+ 3692432694U, // <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS
+ 3364258130U, // <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5>
+ 3303875827U, // <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7>
+ 3867100333U, // <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0>
+ 3692435246U, // <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS
+ 2618695857U, // <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6>
+ 2230797108U, // <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1>
+ 2618697658U, // <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7>
+ 3692439702U, // <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2>
+ 2618699062U, // <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS
+ 3364929874U, // <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5>
+ 3692442424U, // <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6>
+ 3798733664U, // <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1>
+ 2618701614U, // <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS
+ 3799397370U, // <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2>
+ 3371573258U, // <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1>
+ 2330351234U, // <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+ 3799397658U, // <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2>
+ 3799397734U, // <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6>
+ 3371573586U, // <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5>
+ 3799397870U, // <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7>
+ 3799397956U, // <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3>
+ 2330351234U, // <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+ 2282577929U, // <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+ 1208836106U, // <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+ 1208838294U, // <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+ 2282578094U, // <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+ 2282577933U, // <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+ 1208836434U, // <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+ 2282578097U, // <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+ 2287224015U, // <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+ 1208836113U, // <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+ 2226759117U, // <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0>
+ 1624047718U, // <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+ 2697789613U, // <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2>
+ 2226767526U, // <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1>
+ 2697789778U, // <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5>
+ 3300657000U, // <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6>
+ 2226988986U, // <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7>
+ 3734271139U, // <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0>
+ 1624048285U, // <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+ 3831268868U, // <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1>
+ 2293138804U, // <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1>
+ 2697790358U, // <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0>
+ 2293137510U, // <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+ 3771532331U, // <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5>
+ 3767551106U, // <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2>
+ 3301173178U, // <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7>
+ 3372853169U, // <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7>
+ 2293137515U, // <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS
+ 1556938854U, // <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+ 2295137733U, // <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1>
+ 336380006U, // <2,2,2,2>: Cost 1 vspltisw2 LHS
+ 1221394534U, // <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS
+ 1556942134U, // <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+ 2295138061U, // <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+ 2228029370U, // <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7>
+ 2660545701U, // <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2>
+ 336380006U, // <2,2,2,u>: Cost 1 vspltisw2 LHS
+ 2697791638U, // <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2>
+ 2765489840U, // <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2>
+ 1213441640U, // <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+ 135053414U, // <2,2,3,3>: Cost 1 vmrglw LHS, LHS
+ 2697792002U, // <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6>
+ 2330313780U, // <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5>
+ 2287183549U, // <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6>
+ 2660553894U, // <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3>
+ 135053419U, // <2,2,3,u>: Cost 1 vmrglw LHS, LHS
+ 2630697062U, // <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS
+ 3771534282U, // <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3>
+ 2764900109U, // <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5>
+ 2295152742U, // <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS
+ 2295154282U, // <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4>
+ 1624050998U, // <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+ 2229675962U, // <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7>
+ 3368896433U, // <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7>
+ 1624051241U, // <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+ 3771534920U, // <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2>
+ 3364258540U, // <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1>
+ 2296489576U, // <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2>
+ 2290516070U, // <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+ 3771535284U, // <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6>
+ 2290517044U, // <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5>
+ 2697793634U, // <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0>
+ 3370231729U, // <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7>
+ 2290516075U, // <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS
+ 2230797801U, // <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1>
+ 3304539679U, // <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1>
+ 2764900273U, // <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7>
+ 2764900282U, // <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7>
+ 2230798129U, // <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5>
+ 3304540008U, // <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6>
+ 1157056442U, // <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+ 2725000033U, // <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2>
+ 1157056442U, // <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+ 2793359338U, // <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1>
+ 3371574725U, // <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1>
+ 2297833064U, // <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2>
+ 2297831526U, // <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+ 2697794918U, // <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6>
+ 3371575053U, // <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5>
+ 3304933297U, // <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7>
+ 2297833393U, // <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7>
+ 2297831531U, // <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS
+ 1556938854U, // <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+ 1624053550U, // <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+ 336380006U, // <2,2,u,2>: Cost 1 vspltisw2 LHS
+ 135094374U, // <2,2,u,3>: Cost 1 vmrglw LHS, LHS
+ 1556942134U, // <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+ 1624053914U, // <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+ 1157056442U, // <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+ 2660594859U, // <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u>
+ 135094379U, // <2,2,u,u>: Cost 1 vmrglw LHS, LHS
+ 1611448320U, // <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+ 537706598U, // <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+ 2689835181U, // <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+ 2689835260U, // <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+ 1611448658U, // <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+ 2732966354U, // <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+ 2732966390U, // <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+ 2660603052U, // <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0>
+ 537707165U, // <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+ 2689835748U, // <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+ 1611449140U, // <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+ 1611449238U, // <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+ 3763577805U, // <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1>
+ 2689836112U, // <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+ 2689836143U, // <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+ 2689836239U, // <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+ 3366881210U, // <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7>
+ 1616094588U, // <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+ 2689836493U, // <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0>
+ 2685191711U, // <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+ 1611449960U, // <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+ 1611450022U, // <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+ 2689836822U, // <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5>
+ 2689836904U, // <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6>
+ 1611450298U, // <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+ 2295138234U, // <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7>
+ 1611450456U, // <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3>
+ 1213440918U, // <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+ 2282538527U, // <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1>
+ 1557022322U, // <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3>
+ 1208796786U, // <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+ 1213440922U, // <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+ 2282538531U, // <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+ 2287188094U, // <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+ 1213441978U, // <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+ 1208796791U, // <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u>
+ 1551056998U, // <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS
+ 1551057818U, // <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4>
+ 2624800360U, // <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2>
+ 2624800918U, // <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2>
+ 1551060278U, // <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS
+ 537709878U, // <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+ 2732969337U, // <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+ 2660635824U, // <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4>
+ 537710121U, // <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+ 2689838664U, // <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+ 2732969615U, // <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+ 2732969707U, // <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3>
+ 3763580721U, // <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1>
+ 2689839028U, // <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+ 1659228164U, // <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+ 1659228258U, // <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+ 3364259770U, // <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7>
+ 1659228420U, // <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+ 2230798486U, // <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2>
+ 2732970407U, // <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+ 1659228666U, // <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+ 2230798748U, // <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3>
+ 2230798850U, // <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6>
+ 2732970731U, // <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+ 1659228984U, // <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+ 1659229006U, // <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+ 1659229087U, // <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1>
+ 1659229178U, // <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+ 2726999125U, // <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3>
+ 2727662758U, // <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3>
+ 2732971235U, // <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1>
+ 1659229542U, // <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+ 2732971446U, // <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+ 2732971484U, // <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7>
+ 1659229804U, // <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+ 1659229826U, // <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+ 1208837014U, // <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+ 537712430U, // <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+ 1616099205U, // <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0>
+ 1208837746U, // <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+ 1208837018U, // <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+ 537712794U, // <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+ 1616099536U, // <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+ 1208838074U, // <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+ 537712997U, // <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+ 3771547648U, // <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0>
+ 2697805926U, // <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS
+ 3770884269U, // <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2>
+ 3806716164U, // <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u>
+ 3771547986U, // <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5>
+ 2226761014U, // <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+ 3853462427U, // <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1>
+ 3867102116U, // <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1>
+ 2226761257U, // <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS
+ 3849186231U, // <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2>
+ 3301207010U, // <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0>
+ 3766240150U, // <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0>
+ 3766240226U, // <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4>
+ 3301207248U, // <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4>
+ 2227432758U, // <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS
+ 3758941400U, // <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7>
+ 3768894758U, // <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4>
+ 2227433001U, // <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS
+ 2228030354U, // <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1>
+ 3770885657U, // <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4>
+ 2697807466U, // <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4>
+ 3368880468U, // <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3>
+ 2228030672U, // <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4>
+ 1154288950U, // <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+ 3771549617U, // <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7>
+ 3368880796U, // <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7>
+ 1154289193U, // <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS
+ 2636808294U, // <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS
+ 2287181861U, // <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1>
+ 2228866102U, // <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3>
+ 2636810580U, // <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3>
+ 1256574160U, // <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+ 1213441742U, // <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+ 2228866430U, // <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7>
+ 2660701368U, // <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3>
+ 1213441745U, // <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+ 3704586342U, // <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS
+ 3782831051U, // <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4>
+ 3704587900U, // <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4>
+ 3368896123U, // <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3>
+ 2793360592U, // <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4>
+ 2697809206U, // <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+ 3303198078U, // <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7>
+ 3867102444U, // <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5>
+ 2697809449U, // <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+ 2630852710U, // <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS
+ 2624881572U, // <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5>
+ 2630854269U, // <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5>
+ 2666686677U, // <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2>
+ 2630855990U, // <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS
+ 2230127926U, // <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS
+ 1691159862U, // <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 3867102520U, // <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0>
+ 1691159880U, // <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 2230799250U, // <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1>
+ 3304541130U, // <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3>
+ 2230799417U, // <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6>
+ 3304541323U, // <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7>
+ 2230799568U, // <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4>
+ 1157057846U, // <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+ 3304541566U, // <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7>
+ 3798758243U, // <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4>
+ 1157058089U, // <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS
+ 3806721018U, // <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2>
+ 3853831590U, // <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2>
+ 3801412775U, // <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4>
+ 3802076408U, // <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4>
+ 3401436368U, // <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4>
+ 2793360840U, // <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0>
+ 3804067307U, // <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4>
+ 3867102682U, // <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0>
+ 2793360867U, // <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0>
+ 2630877286U, // <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS
+ 2282580144U, // <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+ 2630878848U, // <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u>
+ 2636851545U, // <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u>
+ 1256615120U, // <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+ 1208837838U, // <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+ 1691160105U, // <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 2660742333U, // <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u>
+ 1208837841U, // <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+ 3766910976U, // <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0>
+ 2693169254U, // <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+ 3760939181U, // <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2>
+ 3843214936U, // <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0>
+ 3760939355U, // <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5>
+ 3867102827U, // <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1>
+ 3867102836U, // <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1>
+ 3867102844U, // <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0>
+ 2693169821U, // <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+ 3766911724U, // <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1>
+ 3766911796U, // <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1>
+ 2693170070U, // <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0>
+ 3384798262U, // <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3>
+ 2693170228U, // <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5>
+ 3301208068U, // <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5>
+ 3366879607U, // <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6>
+ 3867102925U, // <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0>
+ 2695824760U, // <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5>
+ 2642845798U, // <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS
+ 2295139218U, // <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1>
+ 2699142760U, // <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2>
+ 3766912678U, // <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1>
+ 2699142925U, // <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5>
+ 2228031492U, // <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5>
+ 2295138818U, // <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6>
+ 3368879347U, // <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7>
+ 2295138820U, // <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u>
+ 2287184866U, // <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+ 1256573842U, // <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+ 2642855630U, // <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5>
+ 2287182763U, // <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+ 2287184870U, // <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+ 1256574170U, // <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+ 1213442562U, // <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+ 2287183091U, // <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+ 1213442564U, // <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+ 3716604006U, // <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS
+ 3716604822U, // <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0>
+ 3766914099U, // <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0>
+ 3368895403U, // <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3>
+ 3716607031U, // <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4>
+ 2693172534U, // <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+ 3363588610U, // <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6>
+ 3368895731U, // <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7>
+ 2693172777U, // <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+ 3704668262U, // <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS
+ 3704669078U, // <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0>
+ 3704669830U, // <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5>
+ 3364259460U, // <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3>
+ 3704671542U, // <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS
+ 2793361412U, // <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+ 3364258167U, // <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6>
+ 3867103249U, // <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0>
+ 2793361412U, // <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+ 2642878566U, // <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS
+ 3386166810U, // <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1>
+ 2723033594U, // <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3>
+ 3848523842U, // <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4>
+ 2723033713U, // <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5>
+ 2230800388U, // <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5>
+ 2230800482U, // <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0>
+ 2785841252U, // <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2>
+ 2785914989U, // <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2>
+ 3796775930U, // <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2>
+ 3800757335U, // <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5>
+ 3853463689U, // <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3>
+ 3796776218U, // <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2>
+ 3796776294U, // <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6>
+ 3803411867U, // <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5>
+ 3371575081U, // <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6>
+ 3796776516U, // <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3>
+ 3371575083U, // <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u>
+ 2287225826U, // <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+ 1256614802U, // <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+ 2642896590U, // <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5>
+ 2287223723U, // <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+ 2287225830U, // <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+ 1256615130U, // <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+ 1208838658U, // <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+ 2287224051U, // <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+ 1208838660U, // <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+ 3772227584U, // <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0>
+ 2698485862U, // <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+ 3759620282U, // <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6>
+ 3710675299U, // <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0>
+ 3767583058U, // <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5>
+ 3378153265U, // <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5>
+ 3865186637U, // <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1>
+ 2330291510U, // <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS
+ 2698486429U, // <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+ 3734569062U, // <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS
+ 3764929346U, // <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6>
+ 3772228502U, // <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0>
+ 3734571158U, // <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2>
+ 3734572342U, // <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS
+ 3767583878U, // <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6>
+ 3768247511U, // <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6>
+ 2293140790U, // <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+ 2293140791U, // <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS
+ 3704717414U, // <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS
+ 3395424589U, // <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1>
+ 2228031993U, // <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2>
+ 2698487485U, // <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6>
+ 3704720694U, // <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS
+ 3773556575U, // <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6>
+ 2698487738U, // <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7>
+ 1221397814U, // <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+ 1221397815U, // <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS
+ 2636955750U, // <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS
+ 2330314217U, // <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+ 2636957626U, // <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7>
+ 2287184230U, // <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+ 2636959030U, // <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS
+ 2648903448U, // <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3>
+ 1256575800U, // <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+ 135056694U, // <2,6,3,7>: Cost 1 vmrglw LHS, RHS
+ 135056695U, // <2,6,3,u>: Cost 1 vmrglw LHS, RHS
+ 3710705766U, // <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS
+ 3698762677U, // <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4>
+ 3710707389U, // <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6>
+ 3710708071U, // <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4>
+ 3710709046U, // <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS
+ 2698489142U, // <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+ 3796782457U, // <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2>
+ 2295156022U, // <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+ 2295156023U, // <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS
+ 3303870753U, // <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2>
+ 3788820134U, // <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6>
+ 3779530520U, // <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3>
+ 3303871026U, // <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5>
+ 3303871117U, // <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6>
+ 3791474666U, // <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6>
+ 3792138299U, // <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6>
+ 2290519350U, // <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+ 2290519351U, // <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS
+ 2631008358U, // <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS
+ 3372893673U, // <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1>
+ 2791445264U, // <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2>
+ 2230800968U, // <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0>
+ 2631011638U, // <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS
+ 3372894001U, // <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5>
+ 2793362232U, // <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6>
+ 2295835958U, // <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS
+ 2295835959U, // <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS
+ 2793362254U, // <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1>
+ 2792035160U, // <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2>
+ 2792108897U, // <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2>
+ 2769474408U, // <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0>
+ 2793362294U, // <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5>
+ 3371575089U, // <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5>
+ 2792403845U, // <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2>
+ 2297834806U, // <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS
+ 2297834807U, // <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS
+ 2636996710U, // <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS
+ 2698491694U, // <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+ 2636998631U, // <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7>
+ 2282580326U, // <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+ 2636999990U, // <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS
+ 2698492058U, // <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+ 1256616760U, // <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+ 135097654U, // <2,6,u,7>: Cost 1 vmrglw LHS, RHS
+ 135097655U, // <2,6,u,u>: Cost 1 vmrglw LHS, RHS
+ 2666864742U, // <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS
+ 1719620602U, // <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2>
+ 3768254637U, // <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2>
+ 3393417722U, // <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3>
+ 2666868022U, // <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS
+ 3867104290U, // <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6>
+ 3728667127U, // <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0>
+ 2666869817U, // <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2>
+ 1720136761U, // <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2>
+ 3728670822U, // <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS
+ 3774227252U, // <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1>
+ 3774227350U, // <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0>
+ 2323001850U, // <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+ 3728674102U, // <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS
+ 3774227567U, // <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1>
+ 2694513880U, // <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7>
+ 3396744002U, // <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7>
+ 2323001850U, // <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+ 2654937190U, // <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS
+ 3728679732U, // <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1>
+ 2700486248U, // <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2>
+ 2321682938U, // <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3>
+ 2654940470U, // <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS
+ 3859584196U, // <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6>
+ 2700486577U, // <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7>
+ 2228033132U, // <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7>
+ 2701813843U, // <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7>
+ 1581203558U, // <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+ 2654946100U, // <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1>
+ 2637031354U, // <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7>
+ 1256575482U, // <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+ 1581206838U, // <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS
+ 2654949380U, // <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5>
+ 1581208058U, // <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3>
+ 1256575810U, // <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+ 1581209390U, // <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+ 3728695398U, // <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS
+ 3869758782U, // <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2>
+ 3728696936U, // <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2>
+ 3393450490U, // <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3>
+ 3728698678U, // <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS
+ 2700487990U, // <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+ 3728699899U, // <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4>
+ 3867104626U, // <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0>
+ 2700488233U, // <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+ 3855160709U, // <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1>
+ 3728704406U, // <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0>
+ 3370233956U, // <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2>
+ 2320380410U, // <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+ 3728706870U, // <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS
+ 3867104694U, // <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5>
+ 3792146492U, // <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7>
+ 3394122562U, // <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7>
+ 2320380410U, // <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+ 2230801402U, // <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2>
+ 3768258984U, // <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2>
+ 2730349050U, // <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+ 3372894575U, // <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3>
+ 2230801766U, // <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6>
+ 3304543670U, // <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5>
+ 3728716285U, // <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6>
+ 2230802028U, // <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7>
+ 2730349050U, // <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+ 2793362983U, // <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1>
+ 3728721112U, // <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7>
+ 3371574933U, // <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2>
+ 2327695866U, // <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3>
+ 3728723254U, // <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS
+ 3371574855U, // <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5>
+ 2730350062U, // <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7>
+ 2793363052U, // <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7>
+ 2798671471U, // <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1>
+ 1581244518U, // <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+ 1724929666U, // <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2>
+ 2637072314U, // <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7>
+ 1256616442U, // <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+ 1581247798U, // <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS
+ 2700490906U, // <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+ 1581249023U, // <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u>
+ 1256616770U, // <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+ 1581250350U, // <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+ 1611489280U, // <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+ 537747563U, // <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+ 2685231277U, // <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+ 2685231356U, // <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+ 1611489618U, // <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+ 2226763930U, // <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+ 2733007350U, // <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+ 2660971737U, // <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0>
+ 537748125U, // <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+ 2689876708U, // <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+ 1611490100U, // <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+ 1611490198U, // <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+ 2293137564U, // <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+ 2689877072U, // <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+ 2689877103U, // <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+ 2689877199U, // <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+ 2293140808U, // <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+ 1616135548U, // <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+ 1556938854U, // <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+ 1154291502U, // <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+ 336380006U, // <2,u,2,2>: Cost 1 vspltisw2 LHS
+ 1611490982U, // <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+ 1556942134U, // <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+ 1154291866U, // <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+ 1611491258U, // <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+ 1221397832U, // <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+ 336380006U, // <2,u,2,u>: Cost 1 vspltisw2 LHS
+ 1611491478U, // <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2>
+ 1213440073U, // <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1>
+ 1213442261U, // <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+ 135053468U, // <2,u,3,3>: Cost 1 vmrglw LHS, LHS
+ 1611491842U, // <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6>
+ 1213440401U, // <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5>
+ 1213442589U, // <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+ 135056712U, // <2,u,3,7>: Cost 1 vmrglw LHS, RHS
+ 135053473U, // <2,u,3,u>: Cost 1 vmrglw LHS, LHS
+ 1551425638U, // <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS
+ 1551426503U, // <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4>
+ 2625169000U, // <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2>
+ 2625169558U, // <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2>
+ 1551428918U, // <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS
+ 537750838U, // <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+ 2733010297U, // <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+ 2295156040U, // <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+ 537751081U, // <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+ 2689879624U, // <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+ 2230130478U, // <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+ 2631149217U, // <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5>
+ 2290516124U, // <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+ 2689879988U, // <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+ 1659269124U, // <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+ 1691162778U, // <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 2290519368U, // <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+ 1691162796U, // <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+ 2230802131U, // <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2>
+ 1157060398U, // <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+ 1659269626U, // <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+ 2764904656U, // <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7>
+ 2230802495U, // <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6>
+ 1157060762U, // <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+ 1659269944U, // <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+ 1659269966U, // <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+ 1157060965U, // <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+ 1659270138U, // <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+ 2727040090U, // <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u>
+ 2727703723U, // <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u>
+ 2297831580U, // <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+ 1659270502U, // <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+ 2733012406U, // <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+ 2730358255U, // <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u>
+ 1659270764U, // <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+ 1659270786U, // <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+ 1213481923U, // <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0>
+ 537753390U, // <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+ 336380006U, // <2,u,u,2>: Cost 1 vspltisw2 LHS
+ 135094428U, // <2,u,u,3>: Cost 1 vmrglw LHS, LHS
+ 1213481927U, // <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4>
+ 537753754U, // <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+ 1208838685U, // <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+ 135097672U, // <2,u,u,7>: Cost 1 vmrglw LHS, RHS
+ 135094433U, // <2,u,u,u>: Cost 1 vmrglw LHS, LHS
+ 1678557184U, // <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+ 1678557194U, // <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+ 2631181989U, // <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0>
+ 2289223984U, // <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3>
+ 2756943909U, // <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1>
+ 3362965729U, // <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5>
+ 3362966054U, // <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6>
+ 2289224312U, // <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7>
+ 1683202121U, // <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1>
+ 1557446758U, // <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS
+ 2752741467U, // <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1>
+ 604815462U, // <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+ 2631190676U, // <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0>
+ 1557450038U, // <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS
+ 2667024388U, // <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5>
+ 2800074894U, // <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7>
+ 2661053667U, // <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1>
+ 604815516U, // <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+ 2696521165U, // <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0>
+ 2752741549U, // <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2>
+ 2691876456U, // <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2>
+ 2691876518U, // <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1>
+ 3830685895U, // <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1>
+ 3765618536U, // <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6>
+ 2691876794U, // <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7>
+ 2701166596U, // <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0>
+ 2756944108U, // <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2>
+ 2691877014U, // <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2>
+ 1161003110U, // <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+ 2691877168U, // <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3>
+ 2691877246U, // <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0>
+ 2691877378U, // <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6>
+ 3765619238U, // <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6>
+ 2691877496U, // <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+ 3368962680U, // <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7>
+ 1161003677U, // <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS
+ 2289254400U, // <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0>
+ 1678557522U, // <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+ 2631214761U, // <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4>
+ 2235580672U, // <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+ 2756944237U, // <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5>
+ 1618136374U, // <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+ 3309322742U, // <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7>
+ 3362998904U, // <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7>
+ 1683202449U, // <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+ 3765620296U, // <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2>
+ 2752299427U, // <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5>
+ 3789508346U, // <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0>
+ 3403486842U, // <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3>
+ 3765620660U, // <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6>
+ 2733682692U, // <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5>
+ 2800075218U, // <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7>
+ 3873817044U, // <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0>
+ 2800075234U, // <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5>
+ 2752299501U, // <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7>
+ 2236547174U, // <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS
+ 2733683194U, // <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3>
+ 3844473352U, // <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7>
+ 3310289234U, // <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5>
+ 3873817114U, // <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7>
+ 2733683512U, // <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6>
+ 2725057384U, // <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0>
+ 2236547741U, // <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS
+ 2297905152U, // <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0>
+ 2297906854U, // <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1>
+ 2727711916U, // <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0>
+ 3371649328U, // <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3>
+ 2733684070U, // <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6>
+ 3734843490U, // <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0>
+ 3798799895U, // <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3>
+ 2733684332U, // <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7>
+ 2297906861U, // <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u>
+ 1557504102U, // <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS
+ 1678557842U, // <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1>
+ 604816029U, // <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+ 2691880892U, // <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1>
+ 1557507382U, // <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS
+ 1618139290U, // <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+ 2691881168U, // <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7>
+ 2661111018U, // <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u>
+ 604816083U, // <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+ 2619310332U, // <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0>
+ 2756944612U, // <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2>
+ 2289221724U, // <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2>
+ 2619312278U, // <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2>
+ 2619313462U, // <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS
+ 2289221970U, // <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5>
+ 2232599768U, // <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7>
+ 3362964687U, // <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7>
+ 2619316014U, // <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS
+ 2756944683U, // <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1>
+ 1678558004U, // <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+ 2691883927U, // <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1>
+ 3826631496U, // <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3>
+ 2756944723U, // <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5>
+ 2756944732U, // <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+ 3830686561U, // <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1>
+ 3734869228U, // <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1>
+ 1678558004U, // <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+ 2696529358U, // <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1>
+ 2756944775U, // <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+ 2294548630U, // <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2>
+ 1678558102U, // <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0>
+ 2631273782U, // <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS
+ 2756944811U, // <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+ 3830686644U, // <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3>
+ 2800075706U, // <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0>
+ 1679000515U, // <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0>
+ 2619334911U, // <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3>
+ 2295218186U, // <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1>
+ 2293229718U, // <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2>
+ 2619337116U, // <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3>
+ 2619338038U, // <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS
+ 2295218514U, // <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5>
+ 3830686729U, // <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7>
+ 3368961231U, // <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7>
+ 2619340590U, // <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS
+ 2619343104U, // <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4>
+ 2289254410U, // <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1>
+ 2289256598U, // <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2>
+ 2619345410U, // <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6>
+ 2619346230U, // <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS
+ 2756944976U, // <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6>
+ 3362996401U, // <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6>
+ 3362997455U, // <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7>
+ 2619348782U, // <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS
+ 2756945007U, // <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1>
+ 3830686840U, // <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1>
+ 3358361750U, // <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2>
+ 3830686857U, // <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0>
+ 2756945047U, // <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5>
+ 2294571346U, // <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5>
+ 3806105698U, // <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0>
+ 3873817774U, // <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1>
+ 2756945079U, // <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1>
+ 3830686912U, // <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1>
+ 2756945103U, // <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+ 2236547990U, // <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0>
+ 3826631905U, // <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7>
+ 3830686952U, // <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5>
+ 2756945139U, // <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+ 3830686972U, // <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7>
+ 2800076030U, // <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0>
+ 2756945166U, // <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7>
+ 3699081318U, // <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS
+ 2297905162U, // <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1>
+ 2297907350U, // <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2>
+ 3365675182U, // <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3>
+ 3699084598U, // <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS
+ 2297905490U, // <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5>
+ 2297905329U, // <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+ 3368330447U, // <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7>
+ 2297905169U, // <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u>
+ 2619375876U, // <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u>
+ 1678558004U, // <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+ 2289289366U, // <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2>
+ 1679000956U, // <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0>
+ 2619378998U, // <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS
+ 2756945297U, // <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3>
+ 2297905329U, // <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+ 2800076192U, // <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0>
+ 1683203497U, // <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0>
+ 3362964203U, // <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0>
+ 2289222380U, // <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1>
+ 2289222462U, // <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2>
+ 1215479910U, // <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+ 3362964207U, // <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4>
+ 2289222708U, // <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+ 2232600506U, // <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7>
+ 3396142296U, // <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7>
+ 1215479915U, // <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS
+ 3699105894U, // <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS
+ 3765633844U, // <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1>
+ 2691892120U, // <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2>
+ 2752300575U, // <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1>
+ 3699109174U, // <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS
+ 3830687280U, // <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0>
+ 3830687289U, // <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0>
+ 3874260548U, // <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2>
+ 2752742988U, // <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1>
+ 2631344230U, // <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS
+ 2697201184U, // <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2>
+ 1678558824U, // <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+ 1678558834U, // <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+ 2631347510U, // <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS
+ 3368953613U, // <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5>
+ 2234304442U, // <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7>
+ 3368953777U, // <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7>
+ 1679001247U, // <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3>
+ 1678558886U, // <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1>
+ 2752300719U, // <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1>
+ 2752300729U, // <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2>
+ 1221476454U, // <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS
+ 1678558926U, // <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5>
+ 2800076503U, // <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5>
+ 2234746810U, // <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7>
+ 2800076516U, // <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0>
+ 1678558958U, // <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1>
+ 3699130470U, // <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS
+ 3362996972U, // <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1>
+ 2289256040U, // <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2>
+ 1215512678U, // <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+ 3362998676U, // <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4>
+ 2691894582U, // <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+ 2235582394U, // <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7>
+ 3734967544U, // <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4>
+ 1215512683U, // <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS
+ 3705110630U, // <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS
+ 3368313985U, // <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1>
+ 3368314472U, // <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2>
+ 2756945768U, // <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6>
+ 3705113910U, // <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS
+ 3310061416U, // <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6>
+ 3310135226U, // <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7>
+ 3370305457U, // <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7>
+ 2752743317U, // <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6>
+ 2631376998U, // <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS
+ 3705119540U, // <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1>
+ 2631378621U, // <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6>
+ 1678559162U, // <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+ 2631380278U, // <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS
+ 3370976956U, // <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5>
+ 2237065146U, // <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7>
+ 3798815594U, // <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2>
+ 1679001575U, // <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+ 2800076778U, // <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1>
+ 3371647724U, // <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1>
+ 2297906792U, // <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2>
+ 1224163430U, // <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+ 3705130294U, // <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS
+ 3371648052U, // <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5>
+ 2297906877U, // <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6>
+ 3371648702U, // <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7>
+ 1224163435U, // <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+ 1679001659U, // <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1>
+ 2752743492U, // <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1>
+ 1678558824U, // <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+ 1678559320U, // <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3>
+ 1679001699U, // <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5>
+ 2691897498U, // <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+ 2237908922U, // <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7>
+ 2800519289U, // <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0>
+ 1679001731U, // <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1>
+ 1215480726U, // <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+ 1678559382U, // <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2>
+ 2631403200U, // <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0>
+ 2289223282U, // <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3>
+ 2752301232U, // <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1>
+ 3362965027U, // <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5>
+ 3362965352U, // <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6>
+ 2289223610U, // <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7>
+ 1678559445U, // <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2>
+ 3830687964U, // <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0>
+ 2752301286U, // <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1>
+ 2752301297U, // <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3>
+ 2305157532U, // <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3>
+ 3830688000U, // <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0>
+ 3830688009U, // <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0>
+ 3830688019U, // <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1>
+ 3362973626U, // <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7>
+ 2752743719U, // <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3>
+ 2631417958U, // <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS
+ 3826043193U, // <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3>
+ 1624131186U, // <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3>
+ 2752301384U, // <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0>
+ 2631421238U, // <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS
+ 3826485602U, // <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u>
+ 2752301414U, // <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3>
+ 2771249519U, // <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3>
+ 1628112984U, // <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3>
+ 1563656294U, // <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS
+ 2301855911U, // <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1>
+ 2697873730U, // <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3>
+ 403488870U, // <3,3,3,3>: Cost 1 vspltisw3 LHS
+ 1563659574U, // <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+ 2301856239U, // <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5>
+ 2697874067U, // <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7>
+ 2295220154U, // <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7>
+ 403488870U, // <3,3,3,u>: Cost 1 vspltisw3 LHS
+ 2289255318U, // <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0>
+ 2631435162U, // <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4>
+ 2631435972U, // <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4>
+ 2289256050U, // <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3>
+ 1215513498U, // <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+ 1679002114U, // <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6>
+ 3362998120U, // <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6>
+ 2289256378U, // <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7>
+ 1679002141U, // <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6>
+ 3831130657U, // <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1>
+ 3376277671U, // <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1>
+ 3771617012U, // <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3>
+ 2302536092U, // <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3>
+ 3831130697U, // <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5>
+ 2294572579U, // <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5>
+ 2800519773U, // <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7>
+ 3368314810U, // <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7>
+ 2800519791U, // <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7>
+ 2800077432U, // <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7>
+ 3310291185U, // <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3>
+ 2789165706U, // <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7>
+ 2764982931U, // <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7>
+ 2800077468U, // <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7>
+ 3873819301U, // <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7>
+ 2297235304U, // <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6>
+ 2725081963U, // <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3>
+ 2725745596U, // <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3>
+ 2631458918U, // <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS
+ 3705201460U, // <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1>
+ 2631460551U, // <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7>
+ 2297906802U, // <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3>
+ 2631462198U, // <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS
+ 3371648547U, // <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5>
+ 3371648548U, // <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6>
+ 1224165306U, // <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+ 1224165306U, // <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+ 1215480726U, // <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+ 1679002398U, // <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2>
+ 1659967368U, // <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3>
+ 403488870U, // <3,3,u,3>: Cost 1 vspltisw3 LHS
+ 1563659574U, // <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+ 1679002438U, // <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6>
+ 2756946764U, // <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3>
+ 1224165306U, // <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+ 403488870U, // <3,3,u,u>: Cost 1 vspltisw3 LHS
+ 2691907584U, // <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0>
+ 1618165862U, // <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+ 2631476937U, // <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0>
+ 2232601732U, // <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0>
+ 2691907922U, // <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5>
+ 1158860086U, // <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+ 3306343806U, // <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7>
+ 3366947484U, // <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7>
+ 1618166429U, // <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+ 2631483494U, // <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS
+ 2691908404U, // <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1>
+ 1618166682U, // <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4>
+ 3765650393U, // <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4>
+ 2631486774U, // <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS
+ 2756946914U, // <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0>
+ 3765650639U, // <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7>
+ 3735090439U, // <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1>
+ 1622148480U, // <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4>
+ 3765650893U, // <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0>
+ 3831131154U, // <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3>
+ 2691909224U, // <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2>
+ 2691909286U, // <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1>
+ 2699208469U, // <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4>
+ 2233863478U, // <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS
+ 2691909562U, // <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7>
+ 2701199368U, // <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4>
+ 2691909691U, // <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1>
+ 2691909782U, // <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2>
+ 3765651686U, // <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1>
+ 2691909972U, // <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3>
+ 2691910044U, // <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1>
+ 1161006390U, // <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+ 2691910300U, // <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+ 3368962716U, // <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7>
+ 1161006633U, // <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS
+ 2631508070U, // <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS
+ 2631508890U, // <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4>
+ 2631509709U, // <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4>
+ 2289256788U, // <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3>
+ 1726336208U, // <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4>
+ 1618169142U, // <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+ 3362998858U, // <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6>
+ 2289257116U, // <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7>
+ 1618169385U, // <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+ 1557774438U, // <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS
+ 2631516980U, // <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1>
+ 1557776078U, // <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5>
+ 2631518358U, // <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2>
+ 1557777718U, // <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS
+ 2296563406U, // <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5>
+ 604818742U, // <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+ 2661381387U, // <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5>
+ 604818760U, // <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+ 3705266278U, // <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS
+ 3831131482U, // <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7>
+ 2733715962U, // <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3>
+ 3844771180U, // <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7>
+ 2800078197U, // <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7>
+ 2236550454U, // <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+ 2733716280U, // <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6>
+ 2725090156U, // <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4>
+ 2236550697U, // <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS
+ 2733716474U, // <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2>
+ 3371647013U, // <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1>
+ 2727744688U, // <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4>
+ 3371649364U, // <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3>
+ 2733716838U, // <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6>
+ 2297906894U, // <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5>
+ 3371647180U, // <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6>
+ 2733717100U, // <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7>
+ 2297906897U, // <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u>
+ 1557799014U, // <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS
+ 1618171694U, // <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+ 1557800657U, // <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u>
+ 2691913660U, // <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1>
+ 1557802294U, // <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS
+ 1618172058U, // <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+ 604818985U, // <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+ 2661405966U, // <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u>
+ 604819003U, // <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+ 2643492966U, // <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS
+ 2756947528U, // <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2>
+ 2331029019U, // <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+ 2643495062U, // <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2>
+ 2756947554U, // <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1>
+ 2800078443U, // <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1>
+ 2289224194U, // <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6>
+ 3362964723U, // <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7>
+ 2756947590U, // <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1>
+ 2800078479U, // <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1>
+ 2333027218U, // <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1>
+ 2691916699U, // <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5>
+ 3832901294U, // <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5>
+ 2800078519U, // <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5>
+ 3830689467U, // <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0>
+ 3830689481U, // <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5>
+ 3873820365U, // <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0>
+ 2800078551U, // <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1>
+ 3770967487U, // <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4>
+ 2697225763U, // <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5>
+ 3830689523U, // <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2>
+ 2699216590U, // <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5>
+ 2699216662U, // <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5>
+ 2783047439U, // <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3>
+ 2783121176U, // <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3>
+ 3856936737U, // <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3>
+ 2701871194U, // <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5>
+ 2643517542U, // <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS
+ 2331052946U, // <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1>
+ 3699345010U, // <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3>
+ 2705189276U, // <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3>
+ 2705189359U, // <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5>
+ 2331053274U, // <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5>
+ 2295220738U, // <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6>
+ 3368961267U, // <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7>
+ 2295220740U, // <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u>
+ 2643525734U, // <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS
+ 2331061138U, // <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1>
+ 2235584280U, // <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3>
+ 2643528194U, // <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6>
+ 2735713498U, // <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5>
+ 2756947892U, // <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6>
+ 2289256962U, // <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6>
+ 3362997491U, // <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7>
+ 2756947919U, // <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6>
+ 2800078803U, // <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1>
+ 2800078812U, // <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1>
+ 2631591639U, // <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5>
+ 3832901616U, // <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3>
+ 2800078843U, // <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5>
+ 1726337028U, // <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+ 2800078862U, // <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6>
+ 3368314099U, // <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7>
+ 1726337028U, // <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+ 2800078884U, // <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1>
+ 2800078899U, // <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7>
+ 2631599832U, // <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6>
+ 2800078914U, // <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4>
+ 2800078924U, // <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5>
+ 2800078935U, // <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7>
+ 2297235970U, // <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6>
+ 1726337122U, // <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0>
+ 1726337131U, // <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0>
+ 3699376230U, // <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS
+ 2333739922U, // <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1>
+ 3699378106U, // <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7>
+ 3371647915U, // <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3>
+ 3699379510U, // <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS
+ 2333740250U, // <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5>
+ 2297907714U, // <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6>
+ 3370984691U, // <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7>
+ 2297907716U, // <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u>
+ 2800079046U, // <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1>
+ 2756948176U, // <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2>
+ 2331029019U, // <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+ 2800079076U, // <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4>
+ 2800079085U, // <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4>
+ 1726337028U, // <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+ 2289289730U, // <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6>
+ 1726337284U, // <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0>
+ 1726337293U, // <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0>
+ 3773628416U, // <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0>
+ 2699886694U, // <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+ 2789167401U, // <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1>
+ 3362965862U, // <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3>
+ 3773628754U, // <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5>
+ 3723284326U, // <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0>
+ 2800079181U, // <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1>
+ 1215483190U, // <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+ 1215483191U, // <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS
+ 3873821032U, // <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1>
+ 3773629236U, // <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1>
+ 2691924892U, // <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6>
+ 3830690184U, // <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6>
+ 3873821072U, // <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5>
+ 3873821082U, // <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6>
+ 3403453240U, // <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6>
+ 2289233206U, // <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+ 2289233207U, // <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS
+ 2661498982U, // <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS
+ 3770975780U, // <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6>
+ 2631640797U, // <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2>
+ 3771639485U, // <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6>
+ 2661502262U, // <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS
+ 2699888488U, // <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6>
+ 2661503482U, // <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3>
+ 1715425786U, // <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3>
+ 1715499523U, // <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3>
+ 3773630614U, // <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2>
+ 3372942825U, // <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1>
+ 2234749434U, // <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3>
+ 3368962406U, // <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3>
+ 2699889154U, // <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6>
+ 3773631068U, // <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6>
+ 2331054904U, // <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6>
+ 1221479734U, // <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+ 1221479735U, // <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS
+ 2235584801U, // <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2>
+ 3717342106U, // <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4>
+ 2789167729U, // <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5>
+ 2235585074U, // <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5>
+ 2235585165U, // <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6>
+ 2699889974U, // <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+ 2800079509U, // <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5>
+ 1215515958U, // <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+ 1215515959U, // <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS
+ 3873821356U, // <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1>
+ 3372959209U, // <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1>
+ 3862909629U, // <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0>
+ 3773632358U, // <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0>
+ 3873821396U, // <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5>
+ 3873821405U, // <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5>
+ 3862909672U, // <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7>
+ 2294574390U, // <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+ 2294574391U, // <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS
+ 2800079613U, // <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1>
+ 3873821446U, // <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1>
+ 2789167888U, // <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2>
+ 3844920090U, // <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3>
+ 2800079653U, // <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5>
+ 3723333484U, // <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6>
+ 1726337848U, // <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+ 1726337858U, // <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7>
+ 1726337867U, // <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7>
+ 1726337870U, // <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1>
+ 2297906665U, // <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1>
+ 2792117090U, // <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3>
+ 2297907558U, // <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3>
+ 1726337910U, // <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5>
+ 2297906993U, // <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5>
+ 2297906832U, // <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6>
+ 1224166710U, // <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+ 1224166711U, // <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS
+ 1726337951U, // <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1>
+ 2699892526U, // <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+ 2789168049U, // <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1>
+ 2792854460U, // <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3>
+ 1726337991U, // <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5>
+ 2699892890U, // <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+ 1726337848U, // <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+ 1215548726U, // <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+ 1215548727U, // <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS
+ 2700558336U, // <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0>
+ 1626816614U, // <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+ 2700558513U, // <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6>
+ 2331030010U, // <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3>
+ 2700558674U, // <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5>
+ 2800079906U, // <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6>
+ 2655588936U, // <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0>
+ 2800079919U, // <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1>
+ 1626817181U, // <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+ 3774300899U, // <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1>
+ 2700559156U, // <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1>
+ 2700559254U, // <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0>
+ 3774301148U, // <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7>
+ 3774301227U, // <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5>
+ 3774301295U, // <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1>
+ 3768329441U, // <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7>
+ 3403453250U, // <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7>
+ 2700559740U, // <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0>
+ 2700559849U, // <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1>
+ 3770983973U, // <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7>
+ 2700559976U, // <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2>
+ 2698569415U, // <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7>
+ 2700560177U, // <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5>
+ 3773638505U, // <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7>
+ 1626818490U, // <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7>
+ 2795140307U, // <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3>
+ 1628145756U, // <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7>
+ 2700560534U, // <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2>
+ 3774302438U, // <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1>
+ 2700560742U, // <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3>
+ 2700560796U, // <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3>
+ 2700560898U, // <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6>
+ 3774302821U, // <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6>
+ 2700561079U, // <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7>
+ 2700561091U, // <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1>
+ 2700561182U, // <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2>
+ 2655617126U, // <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS
+ 3774303178U, // <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3>
+ 2655619002U, // <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7>
+ 2331062778U, // <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3>
+ 2655620406U, // <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS
+ 1626819894U, // <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+ 2655621708U, // <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4>
+ 2800080247U, // <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5>
+ 1626820137U, // <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+ 3774303816U, // <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2>
+ 3873822093U, // <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0>
+ 3774303998U, // <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4>
+ 3862910368U, // <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1>
+ 3774304180U, // <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6>
+ 2800080310U, // <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5>
+ 2800080321U, // <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7>
+ 3873822147U, // <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0>
+ 2800080339U, // <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7>
+ 2800080348U, // <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7>
+ 3873822181U, // <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7>
+ 2789168622U, // <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7>
+ 2700563016U, // <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0>
+ 2800080384U, // <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7>
+ 3862910472U, // <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6>
+ 2700563256U, // <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6>
+ 2800080404U, // <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0>
+ 2793149988U, // <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7>
+ 2637725798U, // <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS
+ 3371649227U, // <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1>
+ 2637727674U, // <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7>
+ 2297907567U, // <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3>
+ 2637729078U, // <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS
+ 3371649312U, // <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5>
+ 2655646287U, // <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7>
+ 1726338668U, // <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+ 1726338668U, // <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+ 2700564179U, // <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2>
+ 1626822446U, // <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+ 2700564357U, // <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0>
+ 2700564412U, // <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1>
+ 2700564543U, // <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6>
+ 1626822810U, // <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+ 1662654672U, // <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7>
+ 1726338668U, // <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+ 1626823013U, // <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+ 1678557184U, // <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+ 1679005395U, // <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2>
+ 2289221787U, // <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2>
+ 1215479964U, // <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+ 2752747245U, // <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1>
+ 1158863002U, // <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+ 2289224221U, // <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6>
+ 1215483208U, // <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+ 1679005458U, // <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2>
+ 1558036582U, // <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS
+ 1678558004U, // <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+ 604821294U, // <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+ 2752747317U, // <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1>
+ 1558039862U, // <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS
+ 2756949830U, // <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0>
+ 2800080726U, // <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7>
+ 2289233224U, // <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+ 604821348U, // <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+ 2696586709U, // <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u>
+ 2757392246U, // <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3>
+ 1624172151U, // <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u>
+ 1679005576U, // <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3>
+ 2631789878U, // <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS
+ 2699904874U, // <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u>
+ 1626826683U, // <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u>
+ 1726338988U, // <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3>
+ 1683208117U, // <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3>
+ 1679005628U, // <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1>
+ 1161008942U, // <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+ 2752747471U, // <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2>
+ 403488870U, // <3,u,3,3>: Cost 1 vspltisw3 LHS
+ 1679005668U, // <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5>
+ 1161009306U, // <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+ 2691943104U, // <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7>
+ 1221479752U, // <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+ 403488870U, // <3,u,3,u>: Cost 1 vspltisw3 LHS
+ 2289255363U, // <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0>
+ 1161844526U, // <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS
+ 2289256661U, // <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2>
+ 1215512732U, // <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+ 1215513498U, // <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+ 1679005759U, // <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6>
+ 2289256989U, // <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6>
+ 1215515976U, // <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+ 1679005786U, // <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6>
+ 1558069350U, // <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS
+ 2631811892U, // <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1>
+ 1558071026U, // <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5>
+ 2752747646U, // <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6>
+ 1558072630U, // <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS
+ 1726337028U, // <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+ 604821658U, // <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+ 2294574408U, // <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+ 604821676U, // <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+ 2631819366U, // <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS
+ 2757392574U, // <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7>
+ 2631821043U, // <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6>
+ 1679005904U, // <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+ 2631822646U, // <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS
+ 2236553370U, // <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+ 1726337848U, // <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+ 1726339309U, // <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0>
+ 1683208445U, // <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7>
+ 1726339328U, // <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1>
+ 2297905225U, // <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1>
+ 2631829236U, // <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7>
+ 1224163484U, // <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+ 1726339368U, // <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5>
+ 2297905553U, // <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5>
+ 2297905392U, // <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6>
+ 1224166728U, // <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+ 1224163489U, // <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+ 1683208529U, // <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1>
+ 1679006043U, // <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2>
+ 604821861U, // <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+ 403488870U, // <3,u,u,3>: Cost 1 vspltisw3 LHS
+ 1683208569U, // <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5>
+ 1679006083U, // <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6>
+ 604821901U, // <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+ 1215548744U, // <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+ 604821915U, // <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS
+ 2759016448U, // <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0>
+ 1165115494U, // <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS
+ 3717531337U, // <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0>
+ 3369675785U, // <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3>
+ 2751791144U, // <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4>
+ 2238857630U, // <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0>
+ 3312591341U, // <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7>
+ 3369676113U, // <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7>
+ 1165116061U, // <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS
+ 2637824102U, // <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS
+ 2637824922U, // <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4>
+ 1685274726U, // <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 2637826512U, // <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1>
+ 2637827382U, // <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS
+ 2661716070U, // <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4>
+ 3729486427U, // <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1>
+ 2661717300U, // <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1>
+ 1685274780U, // <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 3711574118U, // <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS
+ 2240200806U, // <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+ 3771663992U, // <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0>
+ 2698585801U, // <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0>
+ 3373672105U, // <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4>
+ 3810813795U, // <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1>
+ 3772327866U, // <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7>
+ 3386280568U, // <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7>
+ 2701903966U, // <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0>
+ 3699638374U, // <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS
+ 2753560832U, // <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+ 3772328276U, // <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3>
+ 3827302674U, // <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4>
+ 3699641654U, // <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS
+ 3779627588U, // <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0>
+ 3772328604U, // <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7>
+ 3780954854U, // <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0>
+ 2753560832U, // <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+ 2725129106U, // <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1>
+ 1167720550U, // <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+ 3839172953U, // <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3>
+ 3772329051U, // <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4>
+ 2241462610U, // <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5>
+ 2698587446U, // <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+ 3772329297U, // <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7>
+ 3735483703U, // <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4>
+ 1167721117U, // <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS
+ 1168556032U, // <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+ 94814310U, // <4,0,5,1>: Cost 1 vmrghw RHS, LHS
+ 2242298029U, // <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+ 2637859284U, // <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5>
+ 1168556370U, // <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+ 2242306530U, // <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5>
+ 2242298358U, // <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+ 2661750072U, // <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5>
+ 94814877U, // <4,0,5,u>: Cost 1 vmrghw RHS, LHS
+ 3316580362U, // <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1>
+ 2242846822U, // <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS
+ 3798872570U, // <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3>
+ 3796218413U, // <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0>
+ 3834528273U, // <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7>
+ 3798872811U, // <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1>
+ 3316621876U, // <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6>
+ 2725131121U, // <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0>
+ 2242847389U, // <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS
+ 3377692672U, // <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0>
+ 2243493990U, // <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+ 3775648970U, // <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3>
+ 3802191110U, // <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0>
+ 3317236050U, // <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5>
+ 3803518376U, // <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0>
+ 3317236214U, // <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7>
+ 3798873708U, // <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7>
+ 2243494557U, // <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS
+ 1170546688U, // <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+ 96804966U, // <4,0,u,1>: Cost 1 vmrghw RHS, LHS
+ 1685275293U, // <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 2637883863U, // <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u>
+ 1170547026U, // <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+ 2698590362U, // <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+ 2244289014U, // <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+ 2661774651U, // <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u>
+ 96805533U, // <4,0,u,u>: Cost 1 vmrghw RHS, LHS
+ 2667749478U, // <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS
+ 2689966182U, // <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS
+ 2238571418U, // <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4>
+ 3711633880U, // <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0>
+ 2689966418U, // <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5>
+ 3361046866U, // <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5>
+ 3741495802U, // <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3>
+ 3741496314U, // <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2>
+ 2689966765U, // <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1>
+ 3764372222U, // <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1>
+ 2758206263U, // <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4>
+ 2698593178U, // <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4>
+ 3361057810U, // <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3>
+ 3827303250U, // <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4>
+ 2287313234U, // <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5>
+ 3763709171U, // <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7>
+ 3361058138U, // <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7>
+ 2239759744U, // <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4>
+ 2637906022U, // <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS
+ 2637906842U, // <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4>
+ 3763709544U, // <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2>
+ 1685275546U, // <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4>
+ 2637909302U, // <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS
+ 3361063250U, // <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5>
+ 3763709882U, // <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7>
+ 3735541054U, // <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2>
+ 1685644231U, // <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4>
+ 2702575792U, // <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1>
+ 3832759257U, // <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4>
+ 3833349090U, // <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4>
+ 3763710364U, // <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3>
+ 2707884546U, // <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6>
+ 3361071442U, // <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5>
+ 3772336796U, // <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7>
+ 3775654595U, // <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1>
+ 2707884856U, // <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1>
+ 2667782246U, // <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS
+ 2241463092U, // <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1>
+ 2241553306U, // <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4>
+ 3827303484U, // <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4>
+ 2667785424U, // <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4>
+ 2689969462U, // <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+ 3763711322U, // <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7>
+ 3867116636U, // <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0>
+ 2689969705U, // <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+ 1546273106U, // <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5>
+ 1168556852U, // <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+ 1168556950U, // <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+ 2620016790U, // <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2>
+ 1546276150U, // <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS
+ 2620018692U, // <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5>
+ 2242299087U, // <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+ 2667795450U, // <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2>
+ 1546278702U, // <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS
+ 3781628193U, // <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2>
+ 3832759503U, // <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7>
+ 3316261786U, // <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4>
+ 3781628466U, // <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5>
+ 3827303658U, // <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7>
+ 3361096018U, // <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5>
+ 3788264248U, // <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6>
+ 3788264270U, // <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1>
+ 3832759566U, // <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7>
+ 2726466580U, // <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1>
+ 3377692682U, // <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1>
+ 3377694870U, // <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2>
+ 3802199303U, // <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1>
+ 2731775334U, // <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6>
+ 3377693010U, // <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5>
+ 3365749804U, // <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6>
+ 3788265068U, // <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7>
+ 2731775644U, // <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1>
+ 1546297685U, // <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u>
+ 1170547508U, // <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+ 1170547606U, // <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+ 1689257344U, // <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4>
+ 1546300726U, // <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS
+ 2284716370U, // <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5>
+ 2244289743U, // <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+ 2667820026U, // <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2>
+ 1546303278U, // <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS
+ 3729621094U, // <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS
+ 3763716198U, // <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS
+ 2238858856U, // <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2>
+ 2295930982U, // <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+ 3763716434U, // <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5>
+ 2238859107U, // <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1>
+ 2238859194U, // <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7>
+ 3312601066U, // <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1>
+ 2295930987U, // <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS
+ 3699769446U, // <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS
+ 3313255971U, // <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5>
+ 3361056360U, // <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2>
+ 2287312998U, // <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+ 3788932148U, // <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5>
+ 3313256290U, // <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0>
+ 3838289469U, // <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4>
+ 3369682865U, // <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7>
+ 2287313003U, // <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS
+ 3838658133U, // <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1>
+ 3711722394U, // <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4>
+ 2759018088U, // <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2>
+ 2759018098U, // <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3>
+ 3838658168U, // <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0>
+ 3369027341U, // <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5>
+ 2240227258U, // <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7>
+ 3735614791U, // <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2>
+ 2759018143U, // <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3>
+ 2759018150U, // <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1>
+ 3831948975U, // <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1>
+ 3832759993U, // <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2>
+ 2759018180U, // <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4>
+ 2759018185U, // <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0>
+ 3839542998U, // <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4>
+ 3314640826U, // <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7>
+ 2765948648U, // <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4>
+ 2759018222U, // <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1>
+ 3838658295U, // <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1>
+ 3315205667U, // <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5>
+ 2241463912U, // <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2>
+ 1234829414U, // <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+ 2241464085U, // <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4>
+ 2241546087U, // <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5>
+ 2241464250U, // <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7>
+ 3741602873U, // <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2>
+ 1234829419U, // <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS
+ 2626060390U, // <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS
+ 2626061364U, // <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5>
+ 1168557672U, // <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+ 1222230118U, // <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+ 2626063670U, // <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS
+ 2242299752U, // <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+ 1168558010U, // <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+ 2242299882U, // <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+ 1222230123U, // <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS
+ 3711754342U, // <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS
+ 3711755162U, // <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4>
+ 3838658481U, // <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7>
+ 2759018426U, // <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7>
+ 3838658499U, // <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7>
+ 3735646310U, // <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4>
+ 3316590522U, // <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7>
+ 3798889331U, // <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2>
+ 2759018471U, // <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7>
+ 3874564074U, // <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1>
+ 3800880230U, // <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2>
+ 3371722344U, // <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2>
+ 2303950950U, // <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+ 3371722346U, // <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4>
+ 3371722509U, // <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5>
+ 3317237690U, // <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7>
+ 3317237738U, // <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1>
+ 2303950955U, // <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+ 2759018555U, // <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1>
+ 2626085943U, // <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u>
+ 1170548328U, // <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+ 1222254694U, // <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+ 2759018595U, // <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5>
+ 2244290408U, // <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+ 1170548666U, // <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+ 2769266813U, // <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4>
+ 1222254699U, // <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS
+ 2238859414U, // <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2>
+ 2759018646U, // <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2>
+ 3312314708U, // <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3>
+ 2238859676U, // <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3>
+ 2295931802U, // <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4>
+ 3735670886U, // <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4>
+ 3312315036U, // <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7>
+ 3369674682U, // <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7>
+ 2759018709U, // <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2>
+ 3361055638U, // <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0>
+ 3831949542U, // <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1>
+ 2703917978U, // <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+ 3361056370U, // <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3>
+ 2295939994U, // <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4>
+ 3361056291U, // <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5>
+ 3378972520U, // <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6>
+ 3361056698U, // <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7>
+ 2703917978U, // <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+ 3832760624U, // <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3>
+ 3711796122U, // <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4>
+ 3832760641U, // <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2>
+ 2770962764U, // <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4>
+ 2759018836U, // <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3>
+ 3827304802U, // <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u>
+ 3832760678U, // <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3>
+ 3859597679U, // <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3>
+ 2771331449U, // <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4>
+ 2240841878U, // <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2>
+ 3776997635U, // <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3>
+ 2703919444U, // <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3>
+ 2759018908U, // <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3>
+ 2759018918U, // <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4>
+ 3386951446U, // <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5>
+ 3777661596U, // <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7>
+ 3375007674U, // <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7>
+ 2707901242U, // <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3>
+ 2759018960U, // <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1>
+ 2759018970U, // <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2>
+ 2632099605U, // <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4>
+ 2241464732U, // <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3>
+ 2759019000U, // <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5>
+ 2753563138U, // <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6>
+ 3777662316U, // <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7>
+ 2308573114U, // <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7>
+ 2759019032U, // <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1>
+ 1168558230U, // <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+ 2242300134U, // <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1>
+ 2632107798U, // <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5>
+ 1168558492U, // <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+ 1168558594U, // <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+ 2295973654U, // <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5>
+ 2242300536U, // <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+ 2295973818U, // <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7>
+ 1168558878U, // <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+ 3832760952U, // <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7>
+ 3711828890U, // <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4>
+ 3316484436U, // <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3>
+ 3711830512U, // <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6>
+ 2759019164U, // <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+ 3361097251U, // <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5>
+ 3316624045U, // <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6>
+ 2773912244U, // <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4>
+ 2759019164U, // <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+ 3377693590U, // <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0>
+ 3365751680U, // <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1>
+ 2727810232U, // <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3>
+ 3377694322U, // <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3>
+ 2303951770U, // <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4>
+ 3741700198U, // <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4>
+ 3377695216U, // <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6>
+ 3375703994U, // <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7>
+ 2731792030U, // <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3>
+ 1170548886U, // <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+ 2759019294U, // <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2>
+ 2632132377U, // <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u>
+ 1170549148U, // <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+ 1170549250U, // <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+ 2759019334U, // <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6>
+ 2244291192U, // <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+ 2295998394U, // <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7>
+ 1170549534U, // <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+ 1165118354U, // <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+ 1637482598U, // <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+ 3711854285U, // <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4>
+ 3827305344U, // <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1>
+ 2711224658U, // <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5>
+ 1165118774U, // <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+ 3312602489U, // <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2>
+ 3369675420U, // <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7>
+ 1165119017U, // <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS
+ 3369682633U, // <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0>
+ 2287313581U, // <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1>
+ 2759019466U, // <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3>
+ 3369683284U, // <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3>
+ 2311204048U, // <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4>
+ 2239319350U, // <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS
+ 3784967411U, // <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7>
+ 3369683612U, // <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7>
+ 2763000832U, // <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3>
+ 3711869030U, // <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS
+ 3711869850U, // <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4>
+ 2240203830U, // <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3>
+ 2698618573U, // <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4>
+ 2711226133U, // <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4>
+ 2240204086U, // <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+ 2711226298U, // <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7>
+ 3832761416U, // <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3>
+ 2701936738U, // <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4>
+ 2711226518U, // <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2>
+ 3777005828U, // <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4>
+ 3832761453U, // <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4>
+ 2301266260U, // <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+ 2705254903U, // <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4>
+ 2240843062U, // <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+ 3832761489U, // <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4>
+ 3375008412U, // <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7>
+ 2301266260U, // <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+ 1570373734U, // <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS
+ 2308574089U, // <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1>
+ 2644117096U, // <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2>
+ 2638146039U, // <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4>
+ 229035318U, // <4,4,4,4>: Cost 1 vspltisw0 RHS
+ 1167723830U, // <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS
+ 2644120058U, // <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3>
+ 2662036827U, // <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4>
+ 229035318U, // <4,4,4,u>: Cost 1 vspltisw0 RHS
+ 1168558994U, // <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+ 2638152602U, // <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4>
+ 2242300981U, // <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+ 2638154232U, // <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5>
+ 1168559322U, // <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5>
+ 94817590U, // <4,4,5,5>: Cost 1 vmrghw RHS, RHS
+ 1685278006U, // <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+ 2242309576U, // <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+ 94817833U, // <4,4,5,u>: Cost 1 vmrghw RHS, RHS
+ 3316591506U, // <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1>
+ 3758428587U, // <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5>
+ 2711228922U, // <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3>
+ 3796251185U, // <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4>
+ 2711229085U, // <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4>
+ 2242850102U, // <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+ 2242850169U, // <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2>
+ 2725163893U, // <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4>
+ 2242850345U, // <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS
+ 2711229434U, // <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2>
+ 3377694410U, // <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1>
+ 3868593584U, // <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3>
+ 3377695060U, // <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3>
+ 2729145691U, // <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4>
+ 2243497270U, // <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+ 3871542744U, // <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7>
+ 2303953564U, // <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7>
+ 2243497513U, // <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS
+ 1170549650U, // <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+ 1637488430U, // <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+ 2244291637U, // <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+ 2638178811U, // <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u>
+ 229035318U, // <4,4,u,4>: Cost 1 vspltisw0 RHS
+ 96808246U, // <4,4,u,5>: Cost 1 vmrghw RHS, RHS
+ 1685278249U, // <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+ 2244292040U, // <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+ 96808489U, // <4,4,u,u>: Cost 1 vmrghw RHS, RHS
+ 2698625024U, // <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0>
+ 1624883302U, // <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+ 2638186190U, // <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5>
+ 2638187004U, // <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0>
+ 2687345005U, // <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5>
+ 2238861316U, // <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5>
+ 2662077302U, // <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5>
+ 2662077792U, // <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0>
+ 1624883869U, // <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+ 3361057762U, // <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0>
+ 2691326803U, // <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5>
+ 2698625942U, // <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0>
+ 3361055659U, // <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3>
+ 3761087567U, // <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5>
+ 2693981335U, // <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5>
+ 2305231362U, // <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+ 3361055987U, // <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7>
+ 2695972234U, // <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5>
+ 2638200934U, // <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS
+ 3761088035U, // <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5>
+ 2697963133U, // <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5>
+ 1624884942U, // <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5>
+ 2698626838U, // <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5>
+ 3772368744U, // <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6>
+ 2698627002U, // <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7>
+ 3775023122U, // <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5>
+ 1628203107U, // <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5>
+ 2698627222U, // <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2>
+ 3765070057U, // <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4>
+ 2698627404U, // <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4>
+ 2698627484U, // <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3>
+ 2698627580U, // <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0>
+ 3779668553U, // <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5>
+ 2725169844U, // <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4>
+ 2707253995U, // <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5>
+ 2698627870U, // <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2>
+ 2638217318U, // <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS
+ 2308574098U, // <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1>
+ 2698628150U, // <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3>
+ 2638219776U, // <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4>
+ 2698628314U, // <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5>
+ 1624886582U, // <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+ 2698628478U, // <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7>
+ 2662110564U, // <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4>
+ 1624886825U, // <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+ 1570455654U, // <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS
+ 2312564250U, // <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1>
+ 2644199118U, // <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5>
+ 2295974966U, // <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3>
+ 1570458842U, // <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5>
+ 1168568324U, // <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5>
+ 1168568418U, // <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+ 2295975294U, // <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7>
+ 1168716036U, // <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0>
+ 1564491878U, // <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS
+ 2626290768U, // <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6>
+ 2632263465U, // <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6>
+ 1564494338U, // <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6>
+ 1564495158U, // <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS
+ 2638237464U, // <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3>
+ 2656154253U, // <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2725172218U, // <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2>
+ 3859599489U, // <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4>
+ 2698630320U, // <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4>
+ 2728490251U, // <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5>
+ 2725172576U, // <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0>
+ 3317239812U, // <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5>
+ 2725172760U, // <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4>
+ 2725172844U, // <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7>
+ 2725172866U, // <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2>
+ 1564508262U, // <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS
+ 1624889134U, // <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+ 2698631045U, // <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0>
+ 1564510724U, // <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u>
+ 1564511542U, // <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS
+ 1624889498U, // <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+ 1170550882U, // <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 3312595285U, // <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0>
+ 3763748966U, // <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS
+ 2238861818U, // <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3>
+ 3767730432U, // <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4>
+ 3763749202U, // <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5>
+ 2238862059U, // <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1>
+ 2238862136U, // <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6>
+ 2295934262U, // <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+ 2295934263U, // <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS
+ 3378973999U, // <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0>
+ 3378974648U, // <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1>
+ 3779675034U, // <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4>
+ 3378974002U, // <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3>
+ 3378974003U, // <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4>
+ 3767731352U, // <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6>
+ 3378974734U, // <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6>
+ 2287316278U, // <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+ 2287316279U, // <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS
+ 3735904358U, // <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS
+ 3763750435U, // <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5>
+ 3313938937U, // <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2>
+ 3772376782U, // <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5>
+ 3852890591U, // <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3>
+ 3735908454U, // <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4>
+ 3801573306U, // <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7>
+ 2785858042U, // <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3>
+ 2785858051U, // <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3>
+ 3863065101U, // <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4>
+ 3314586024U, // <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2>
+ 3863212575U, // <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4>
+ 3863286312U, // <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4>
+ 3767732738U, // <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6>
+ 3779676746U, // <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6>
+ 3398898488U, // <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6>
+ 2301267254U, // <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+ 2301267255U, // <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS
+ 3852890715U, // <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1>
+ 3315208615U, // <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1>
+ 2241466874U, // <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3>
+ 3852890745U, // <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4>
+ 2241467037U, // <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4>
+ 2241549039U, // <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5>
+ 2241467192U, // <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6>
+ 1234832694U, // <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+ 1234832695U, // <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS
+ 2242302241U, // <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+ 2242310567U, // <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+ 1168568826U, // <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+ 2242302514U, // <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+ 2242302605U, // <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+ 2242310891U, // <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+ 1168569144U, // <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+ 1222233398U, // <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+ 1222233399U, // <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS
+ 3316576545U, // <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2>
+ 3316584871U, // <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1>
+ 2242851322U, // <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3>
+ 3316601394U, // <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5>
+ 3852890916U, // <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4>
+ 3316617963U, // <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1>
+ 2242884408U, // <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6>
+ 2785858370U, // <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7>
+ 2785858379U, // <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7>
+ 2785858382U, // <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1>
+ 3859600215U, // <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1>
+ 3317240314U, // <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3>
+ 2792199020U, // <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4>
+ 2785858422U, // <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5>
+ 3856651132U, // <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2>
+ 3317240632U, // <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6>
+ 2303954230U, // <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+ 2303954231U, // <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS
+ 2244292897U, // <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+ 2244293031U, // <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+ 1170551290U, // <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+ 2244293170U, // <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+ 2244293261U, // <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+ 2244293355U, // <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+ 1170551608U, // <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+ 1222257974U, // <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS
+ 1222257975U, // <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS
+ 2238862330U, // <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2>
+ 2706604134U, // <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+ 3312604308U, // <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3>
+ 3768402176U, // <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4>
+ 2238862648U, // <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5>
+ 3859600418U, // <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6>
+ 3729994393U, // <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0>
+ 2238862956U, // <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7>
+ 2706604701U, // <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+ 3385610338U, // <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0>
+ 3780346676U, // <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1>
+ 2706604954U, // <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+ 3385610746U, // <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3>
+ 3385610342U, // <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4>
+ 3385610667U, // <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5>
+ 3768403178U, // <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7>
+ 3385611074U, // <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7>
+ 2706604954U, // <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+ 3859600532U, // <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3>
+ 3712091034U, // <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4>
+ 3774375528U, // <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2>
+ 2794853552U, // <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4>
+ 2785858744U, // <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3>
+ 3735982182U, // <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4>
+ 3774375875U, // <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7>
+ 3735983476U, // <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2>
+ 2795222237U, // <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4>
+ 3780348054U, // <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2>
+ 3730015130U, // <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4>
+ 3780348244U, // <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3>
+ 3778357673U, // <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7>
+ 2325155942U, // <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4>
+ 3779684939U, // <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7>
+ 2706606748U, // <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7>
+ 3398898498U, // <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7>
+ 2707934014U, // <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7>
+ 2785858868U, // <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1>
+ 3780348874U, // <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3>
+ 3780349000U, // <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3>
+ 2308575738U, // <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3>
+ 2656283856U, // <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4>
+ 2706607414U, // <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+ 2656285341U, // <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4>
+ 2241468012U, // <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7>
+ 2706607657U, // <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+ 1168569338U, // <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+ 2242311242U, // <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1>
+ 2242303178U, // <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3>
+ 2242311395U, // <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+ 1168569702U, // <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+ 2242311606U, // <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5>
+ 2242311662U, // <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+ 1168569964U, // <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+ 1168569986U, // <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+ 3316593658U, // <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2>
+ 3316593738U, // <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1>
+ 3316634800U, // <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4>
+ 3386978810U, // <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3>
+ 2785859072U, // <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7>
+ 3736014950U, // <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4>
+ 3316594158U, // <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7>
+ 2797803032U, // <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4>
+ 2797876769U, // <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4>
+ 2243499002U, // <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2>
+ 3718103962U, // <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4>
+ 3317257418U, // <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3>
+ 3377695816U, // <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3>
+ 2243532134U, // <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6>
+ 3317282230U, // <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5>
+ 2730497536U, // <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7>
+ 2243556972U, // <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7>
+ 2243565186U, // <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2>
+ 1170551802U, // <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+ 2706609966U, // <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+ 2244293797U, // <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2>
+ 2244293859U, // <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+ 1170552166U, // <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+ 2706610330U, // <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+ 2244294126U, // <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+ 1170552428U, // <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+ 1170552450U, // <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+ 1165118354U, // <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+ 1624907878U, // <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+ 2638407377U, // <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u>
+ 2295931036U, // <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+ 2687369584U, // <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u>
+ 1165121690U, // <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+ 2662298489U, // <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u>
+ 2295934280U, // <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+ 1624908445U, // <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+ 2638413926U, // <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS
+ 2691351382U, // <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u>
+ 1685280558U, // <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 2287313052U, // <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+ 2299257799U, // <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4>
+ 2694005914U, // <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u>
+ 2305231362U, // <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+ 2287316296U, // <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+ 1685280612U, // <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 2638422118U, // <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS
+ 2240206638U, // <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+ 2697987712U, // <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u>
+ 1624909521U, // <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u>
+ 2759391121U, // <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3>
+ 2240207002U, // <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+ 2698651578U, // <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7>
+ 2785859500U, // <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3>
+ 1628227686U, // <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u>
+ 2759022524U, // <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1>
+ 2801342408U, // <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4>
+ 2703960409U, // <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u>
+ 2759022554U, // <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4>
+ 2759022564U, // <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5>
+ 2240845978U, // <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+ 2706614941U, // <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u>
+ 2301267272U, // <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+ 2759022596U, // <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1>
+ 1570668646U, // <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS
+ 1167726382U, // <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+ 2698652753U, // <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3>
+ 1234829468U, // <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+ 229035318U, // <4,u,4,4>: Cost 1 vspltisw0 RHS
+ 1624911158U, // <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS
+ 2698653081U, // <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7>
+ 1234832712U, // <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+ 229035318U, // <4,u,4,u>: Cost 1 vspltisw0 RHS
+ 1168561875U, // <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2>
+ 94820142U, // <4,u,5,1>: Cost 1 vmrghw RHS, LHS
+ 1168562053U, // <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0>
+ 1222230172U, // <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+ 1168562239U, // <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6>
+ 94820506U, // <4,u,5,5>: Cost 1 vmrghw RHS, RHS
+ 1685280922U, // <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+ 1222233416U, // <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+ 94820709U, // <4,u,5,u>: Cost 1 vmrghw RHS, LHS
+ 1564713062U, // <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS
+ 2626511979U, // <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6>
+ 2632484676U, // <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6>
+ 1564715549U, // <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6>
+ 1564716342U, // <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS
+ 2242853018U, // <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+ 2656375464U, // <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6>
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2785859840U, // <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1>
+ 2243499822U, // <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+ 2727851197U, // <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u>
+ 2303951004U, // <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+ 2785859880U, // <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5>
+ 2243500186U, // <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+ 2730505729U, // <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u>
+ 2303954248U, // <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+ 2303951009U, // <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+ 1564729446U, // <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS
+ 96810798U, // <4,u,u,1>: Cost 1 vmrghw RHS, LHS
+ 1685281125U, // <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+ 1222254748U, // <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+ 229035318U, // <4,u,u,4>: Cost 1 vspltisw0 RHS
+ 96811162U, // <4,u,u,5>: Cost 1 vmrghw RHS, RHS
+ 1685281165U, // <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2754232320U, // <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0>
+ 2754232330U, // <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1>
+ 3718194894U, // <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5>
+ 3376385762U, // <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3>
+ 2754232357U, // <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1>
+ 3845816370U, // <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5>
+ 3782353389U, // <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7>
+ 3376386090U, // <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7>
+ 2757402697U, // <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1>
+ 2626543718U, // <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS
+ 2626544751U, // <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1>
+ 1680490598U, // <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 3766428665U, // <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0>
+ 2626546998U, // <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS
+ 2650435539U, // <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1>
+ 3783017715U, // <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7>
+ 3385019000U, // <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7>
+ 1680490652U, // <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 3376398336U, // <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0>
+ 2245877862U, // <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+ 3773064808U, // <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2>
+ 2705295054U, // <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+ 3827974343U, // <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1>
+ 3845816530U, // <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3>
+ 3779037114U, // <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7>
+ 3810887658U, // <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1>
+ 2245878429U, // <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS
+ 2710603926U, // <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2>
+ 3827974396U, // <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0>
+ 3779037516U, // <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4>
+ 3779037596U, // <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3>
+ 2705295868U, // <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0>
+ 3379726804U, // <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5>
+ 3802925748U, // <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4>
+ 3363138168U, // <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7>
+ 2707950400U, // <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0>
+ 2626568294U, // <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS
+ 1680490834U, // <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+ 3828048219U, // <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5>
+ 2710604932U, // <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0>
+ 2754232685U, // <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5>
+ 2705296694U, // <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+ 3779038590U, // <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7>
+ 2713259464U, // <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+ 1680490834U, // <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+ 2311307264U, // <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0>
+ 1174437990U, // <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+ 3779038946U, // <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3>
+ 3845816752U, // <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0>
+ 2248180050U, // <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5>
+ 2248180194U, // <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5>
+ 3779039274U, // <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7>
+ 3385051768U, // <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7>
+ 1174438557U, // <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS
+ 2302689280U, // <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0>
+ 1175208038U, // <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+ 3787002362U, // <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3>
+ 3376432160U, // <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3>
+ 2248950098U, // <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5>
+ 2248950180U, // <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+ 3376433702U, // <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6>
+ 2729186166U, // <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5>
+ 1175208605U, // <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS
+ 2713261050U, // <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2>
+ 3365823599U, // <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1>
+ 3808900317U, // <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4>
+ 3784348899U, // <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1>
+ 2729186656U, // <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0>
+ 3787003268U, // <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0>
+ 3802928664U, // <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4>
+ 3787003431U, // <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1>
+ 2731841188U, // <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0>
+ 2626601062U, // <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS
+ 1683145366U, // <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5>
+ 1680491165U, // <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 2705295054U, // <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+ 2754233005U, // <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1>
+ 2705299610U, // <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+ 3779041488U, // <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7>
+ 2737150252U, // <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0>
+ 1680491219U, // <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 2713927680U, // <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0>
+ 1640185958U, // <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+ 2310607866U, // <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+ 3787669756U, // <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0>
+ 2713928018U, // <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5>
+ 2306621778U, // <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5>
+ 3787670006U, // <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7>
+ 3736188301U, // <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0>
+ 1640186525U, // <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+ 2650505318U, // <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS
+ 2754233140U, // <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1>
+ 2311276694U, // <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2>
+ 2311278315U, // <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3>
+ 2758435667U, // <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5>
+ 2754233180U, // <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5>
+ 3385016497U, // <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6>
+ 2311278643U, // <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7>
+ 2758730615U, // <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5>
+ 3700367462U, // <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS
+ 3830629255U, // <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3>
+ 2713929320U, // <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2>
+ 2754233238U, // <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0>
+ 2759099300U, // <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5>
+ 2754233259U, // <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3>
+ 2713929658U, // <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7>
+ 3872359354U, // <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0>
+ 2754233283U, // <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0>
+ 2713929878U, // <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2>
+ 3363135498U, // <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1>
+ 3363137686U, // <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2>
+ 2713930140U, // <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3>
+ 2713930242U, // <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6>
+ 2289394002U, // <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5>
+ 3787672184U, // <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7>
+ 3787672259U, // <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1>
+ 2713930526U, // <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2>
+ 1634880402U, // <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1>
+ 2760205355U, // <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5>
+ 2760279092U, // <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5>
+ 3787672708U, // <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0>
+ 2713930960U, // <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4>
+ 1640189238U, // <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+ 3786345848U, // <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1>
+ 3787009481U, // <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1>
+ 1640189466U, // <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1>
+ 2754233455U, // <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1>
+ 2713931407U, // <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1>
+ 2713931499U, // <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3>
+ 3827975305U, // <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0>
+ 2754233495U, // <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5>
+ 2288746834U, // <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5>
+ 2713931827U, // <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7>
+ 3787673725U, // <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0>
+ 2754233527U, // <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1>
+ 2668462182U, // <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS
+ 2290746002U, // <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1>
+ 2302691478U, // <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2>
+ 3364488071U, // <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3>
+ 2302689536U, // <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4>
+ 2754233587U, // <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7>
+ 2713932600U, // <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6>
+ 2713932622U, // <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1>
+ 2302689297U, // <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u>
+ 2713932794U, // <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2>
+ 3365822474U, // <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1>
+ 3365824662U, // <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2>
+ 3787674851U, // <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1>
+ 2713933158U, // <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6>
+ 2292080978U, // <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5>
+ 3365823613U, // <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6>
+ 2713933420U, // <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7>
+ 2713933442U, // <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2>
+ 1658771190U, // <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1>
+ 1640191790U, // <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+ 2762933624U, // <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5>
+ 2754233724U, // <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0>
+ 2763081098U, // <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5>
+ 1640192154U, // <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+ 2713934032U, // <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7>
+ 2713934080U, // <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1>
+ 1640192357U, // <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+ 3779051520U, // <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0>
+ 2705309798U, // <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+ 3838813637U, // <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1>
+ 2302640230U, // <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+ 3765117266U, // <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5>
+ 3381027892U, // <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5>
+ 3842794985U, // <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1>
+ 3408232554U, // <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7>
+ 2302640235U, // <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS
+ 3700432998U, // <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS
+ 3765117785U, // <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2>
+ 2311276136U, // <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2>
+ 1237532774U, // <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+ 3700436278U, // <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS
+ 3381036084U, // <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5>
+ 3385018045U, // <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6>
+ 3385017560U, // <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7>
+ 1237532779U, // <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS
+ 3700441190U, // <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS
+ 3700442242U, // <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2>
+ 2754233960U, // <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2>
+ 2754233970U, // <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3>
+ 2765071997U, // <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5>
+ 3834021508U, // <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3>
+ 3842795152U, // <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6>
+ 3376402492U, // <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7>
+ 2754234015U, // <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3>
+ 2754234022U, // <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1>
+ 3827975855U, // <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1>
+ 2644625102U, // <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+ 2289393766U, // <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+ 1691993806U, // <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5>
+ 2785052375U, // <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5>
+ 3854812897U, // <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6>
+ 3802942187U, // <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5>
+ 1692288754U, // <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5>
+ 3839846139U, // <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5>
+ 2709294052U, // <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2>
+ 2766251789U, // <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5>
+ 2765735702U, // <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5>
+ 3840141087U, // <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5>
+ 2705313078U, // <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+ 2712612217U, // <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2>
+ 3787017674U, // <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2>
+ 2765735747U, // <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5>
+ 3834021704U, // <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1>
+ 3834021714U, // <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2>
+ 2311308904U, // <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2>
+ 1237565542U, // <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+ 3834021744U, // <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5>
+ 3369124916U, // <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5>
+ 2248181690U, // <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7>
+ 3786354825U, // <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3>
+ 1237565547U, // <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS
+ 3700473958U, // <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS
+ 3700475014U, // <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6>
+ 2296718952U, // <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2>
+ 1228947558U, // <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+ 3700477238U, // <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS
+ 3834021836U, // <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7>
+ 2248951738U, // <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7>
+ 3370461105U, // <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7>
+ 1228947563U, // <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+ 3786355706U, // <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2>
+ 3783038037U, // <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3>
+ 3365824104U, // <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2>
+ 2292080742U, // <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS
+ 3842131986U, // <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5>
+ 3371795508U, // <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5>
+ 3786356206U, // <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7>
+ 3786356332U, // <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7>
+ 2292080747U, // <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS
+ 2754234427U, // <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1>
+ 2705315630U, // <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+ 2296735336U, // <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2>
+ 1228963942U, // <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+ 1695311971U, // <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5>
+ 2705315994U, // <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+ 2769201269U, // <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5>
+ 3370477489U, // <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7>
+ 1695606919U, // <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5>
+ 3827976331U, // <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0>
+ 2754234518U, // <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2>
+ 3706472290U, // <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0>
+ 3700500630U, // <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2>
+ 2754234544U, // <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1>
+ 3376383766U, // <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5>
+ 3769770513U, // <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7>
+ 3376383930U, // <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7>
+ 2754234581U, // <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2>
+ 2311275414U, // <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0>
+ 2305967971U, // <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1>
+ 2692047787U, // <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3>
+ 2311276146U, // <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3>
+ 2311275418U, // <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4>
+ 3765789807U, // <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1>
+ 3765789939U, // <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7>
+ 2311276474U, // <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7>
+ 2696029585U, // <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3>
+ 2311288709U, // <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+ 3765790243U, // <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5>
+ 3827976513U, // <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2>
+ 2765736268U, // <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4>
+ 2246248962U, // <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6>
+ 3765790563U, // <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1>
+ 3827976550U, // <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3>
+ 3842795887U, // <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3>
+ 2769054073U, // <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4>
+ 3827976575U, // <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1>
+ 3765790963U, // <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5>
+ 3839478162U, // <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2>
+ 2754234780U, // <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3>
+ 2771708327U, // <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5>
+ 3363137059U, // <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5>
+ 3375081320U, // <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6>
+ 3363137466U, // <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7>
+ 2772003275U, // <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5>
+ 2772077012U, // <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5>
+ 3765791714U, // <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0>
+ 2709965878U, // <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3>
+ 2772298223U, // <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5>
+ 2772371960U, // <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5>
+ 2754234882U, // <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6>
+ 3839478282U, // <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5>
+ 3376416698U, // <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7>
+ 2754234909U, // <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6>
+ 2311308182U, // <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0>
+ 3765792421U, // <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5>
+ 2715938575U, // <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3>
+ 2311308914U, // <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3>
+ 2311308186U, // <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4>
+ 2248182354U, // <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5>
+ 3765792837U, // <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7>
+ 2311309242U, // <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7>
+ 2311308190U, // <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u>
+ 2632777830U, // <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+ 3706520372U, // <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1>
+ 2632779624U, // <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6>
+ 2632780290U, // <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6>
+ 2632781110U, // <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS
+ 2248952413U, // <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7>
+ 2302691176U, // <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+ 2302691258U, // <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7>
+ 2632783662U, // <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+ 3365823382U, // <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0>
+ 3706529011U, // <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7>
+ 3706529641U, // <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7>
+ 3365824114U, // <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3>
+ 2774362859U, // <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5>
+ 3365824035U, // <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5>
+ 3383740183U, // <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6>
+ 3363833786U, // <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7>
+ 2774657807U, // <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5>
+ 2632794214U, // <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS
+ 2754235166U, // <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2>
+ 2632796010U, // <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u>
+ 2632796676U, // <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u>
+ 2632797494U, // <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS
+ 2754235206U, // <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6>
+ 2302691176U, // <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+ 2302707642U, // <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7>
+ 2754235229U, // <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2>
+ 3765133325U, // <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4>
+ 2705326182U, // <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+ 3718489806U, // <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5>
+ 3718490624U, // <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4>
+ 2709307730U, // <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5>
+ 2302641870U, // <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5>
+ 3376383695U, // <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6>
+ 3384351018U, // <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7>
+ 2705326749U, // <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+ 2305971057U, // <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0>
+ 3765134171U, // <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4>
+ 3766461338U, // <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4>
+ 3766461437U, // <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4>
+ 2311277776U, // <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4>
+ 2754235362U, // <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0>
+ 3783050483U, // <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7>
+ 3385019036U, // <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7>
+ 2311276241U, // <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u>
+ 3718504550U, // <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS
+ 3783050787U, // <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5>
+ 3773097576U, // <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2>
+ 2705327822U, // <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+ 3773097767U, // <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4>
+ 2765737014U, // <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3>
+ 3779069882U, // <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7>
+ 3376401052U, // <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7>
+ 2245881370U, // <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1>
+ 3779070102U, // <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2>
+ 3363135525U, // <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1>
+ 3779070284U, // <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4>
+ 3779070364U, // <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3>
+ 2705328640U, // <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4>
+ 2307311310U, // <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5>
+ 3866021012U, // <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7>
+ 3363138204U, // <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7>
+ 2707983172U, // <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4>
+ 2708646805U, // <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4>
+ 2709310438U, // <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4>
+ 3779071030U, // <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3>
+ 2710637704U, // <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4>
+ 2754235600U, // <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4>
+ 1704676570U, // <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5>
+ 3779071358U, // <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7>
+ 2713292236U, // <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4>
+ 1704897781U, // <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5>
+ 2626871398U, // <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS
+ 2626872471U, // <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5>
+ 2765737230U, // <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3>
+ 3700615318U, // <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2>
+ 2626874678U, // <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS
+ 1174441270U, // <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS
+ 1680493878U, // <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 3385051804U, // <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7>
+ 1680493896U, // <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 2248952722U, // <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1>
+ 2302692152U, // <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+ 3382406107U, // <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2>
+ 3700623874U, // <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6>
+ 2248953040U, // <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4>
+ 1175211318U, // <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+ 3376432280U, // <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6>
+ 2729218934U, // <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5>
+ 1175211561U, // <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS
+ 3787035642U, // <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2>
+ 3365822501U, // <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1>
+ 3808933085U, // <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4>
+ 3784381707U, // <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5>
+ 2713294182U, // <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6>
+ 2309998286U, // <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5>
+ 3383740111U, // <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6>
+ 3787036239U, // <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5>
+ 2731873960U, // <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4>
+ 2626895974U, // <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS
+ 2626897050U, // <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u>
+ 2644813518U, // <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5>
+ 2705327822U, // <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+ 2626899254U, // <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS
+ 1707331102U, // <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5>
+ 1680494121U, // <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 2737183024U, // <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4>
+ 1680494139U, // <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 2302642684U, // <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0>
+ 1640218726U, // <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+ 3376384510U, // <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2>
+ 3376385078U, // <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3>
+ 2754236002U, // <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1>
+ 2717942242U, // <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5>
+ 2244907106U, // <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+ 3376385406U, // <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7>
+ 1640219293U, // <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+ 2305969365U, // <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0>
+ 1237536282U, // <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+ 2713961366U, // <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0>
+ 3766469630U, // <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5>
+ 2782326455U, // <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5>
+ 2311277786U, // <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5>
+ 2311277058U, // <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6>
+ 3385017587U, // <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7>
+ 1237536282U, // <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+ 3376400892U, // <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0>
+ 3827977963U, // <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3>
+ 2302659070U, // <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2>
+ 2765737726U, // <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4>
+ 3839479558U, // <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3>
+ 2781073167U, // <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3>
+ 2713962426U, // <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7>
+ 3376401790U, // <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7>
+ 2769055531U, // <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4>
+ 2713962646U, // <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2>
+ 3765143786U, // <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5>
+ 3839479621U, // <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3>
+ 2289394603U, // <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3>
+ 2713963010U, // <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6>
+ 2313285150U, // <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5>
+ 3363138050U, // <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6>
+ 3363136755U, // <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7>
+ 2713963294U, // <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2>
+ 2713963410U, // <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1>
+ 3827978127U, // <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5>
+ 3839479704U, // <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5>
+ 3376417846U, // <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3>
+ 1637567706U, // <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5>
+ 1640222006U, // <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS
+ 2310640998U, // <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6>
+ 3376418174U, // <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7>
+ 1640222238U, // <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5>
+ 1577091174U, // <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+ 2311310226U, // <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1>
+ 2713964303U, // <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3>
+ 2311311119U, // <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3>
+ 1577094454U, // <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+ 296144182U, // <5,5,5,5>: Cost 1 vspltisw1 RHS
+ 2311309826U, // <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6>
+ 2311311447U, // <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7>
+ 296144182U, // <5,5,5,u>: Cost 1 vspltisw1 RHS
+ 2248953460U, // <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1>
+ 2326580114U, // <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1>
+ 2713965050U, // <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3>
+ 3700697602U, // <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6>
+ 2785644620U, // <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5>
+ 2781073495U, // <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7>
+ 1228950018U, // <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 2713965390U, // <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1>
+ 1228950018U, // <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 2713965562U, // <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2>
+ 3383741330U, // <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1>
+ 3718620878U, // <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5>
+ 3365823403U, // <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3>
+ 2713965926U, // <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6>
+ 2717947318U, // <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5>
+ 3365825026U, // <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6>
+ 2292081907U, // <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7>
+ 2713966210U, // <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2>
+ 1577091174U, // <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+ 1640224558U, // <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+ 2713966469U, // <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0>
+ 2713966524U, // <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1>
+ 1577094454U, // <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+ 296144182U, // <5,5,u,5>: Cost 1 vspltisw1 RHS
+ 1228950018U, // <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 2713966848U, // <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1>
+ 296144182U, // <5,5,u,u>: Cost 1 vspltisw1 RHS
+ 2705342464U, // <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0>
+ 1631600742U, // <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+ 3773112493U, // <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2>
+ 2705342720U, // <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4>
+ 2705342802U, // <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5>
+ 3779084708U, // <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6>
+ 3779084790U, // <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7>
+ 2302643510U, // <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+ 1631601309U, // <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+ 3767141092U, // <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2>
+ 2705343284U, // <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1>
+ 2705343382U, // <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0>
+ 3779085282U, // <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4>
+ 2693399632U, // <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6>
+ 3767805089U, // <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6>
+ 2311279416U, // <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6>
+ 1237536054U, // <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+ 1237536055U, // <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS
+ 3773113789U, // <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2>
+ 3779085855U, // <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1>
+ 2699372136U, // <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2>
+ 2705344166U, // <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1>
+ 2699372329U, // <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6>
+ 2705344360U, // <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6>
+ 2705344442U, // <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7>
+ 2302659894U, // <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+ 2702026861U, // <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6>
+ 2705344662U, // <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2>
+ 3767142661U, // <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5>
+ 3773114689U, // <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2>
+ 2705344924U, // <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3>
+ 1631603202U, // <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6>
+ 3842945597U, // <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7>
+ 3779086962U, // <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1>
+ 2289397046U, // <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+ 1634257734U, // <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6>
+ 2644926566U, // <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS
+ 3779087306U, // <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3>
+ 2790142577U, // <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5>
+ 2644929026U, // <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6>
+ 2711317723U, // <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6>
+ 1631604022U, // <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+ 2712644989U, // <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6>
+ 2302676278U, // <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+ 1631604265U, // <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+ 3842945708U, // <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1>
+ 3767144133U, // <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1>
+ 2705346328U, // <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3>
+ 3779088207U, // <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4>
+ 2717290420U, // <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6>
+ 2705346574U, // <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6>
+ 2705346596U, // <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1>
+ 1237568822U, // <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+ 1237568823U, // <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS
+ 2650914918U, // <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS
+ 3364490949U, // <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1>
+ 2248954362U, // <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3>
+ 2302693144U, // <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3>
+ 2650918198U, // <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS
+ 2650918926U, // <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6>
+ 2302693390U, // <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6>
+ 1228950838U, // <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+ 1228950839U, // <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS
+ 497467494U, // <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+ 1571210036U, // <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+ 1571210856U, // <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1571211414U, // <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 497470774U, // <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+ 1571213316U, // <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+ 1571213818U, // <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+ 1571214956U, // <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7>
+ 497473326U, // <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS
+ 497475686U, // <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+ 1631606574U, // <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+ 1571219048U, // <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1571219606U, // <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 497478967U, // <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+ 1631606938U, // <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+ 1571222010U, // <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+ 1228967222U, // <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+ 497481518U, // <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS
+ 3768475648U, // <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0>
+ 2694733926U, // <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+ 3718711395U, // <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5>
+ 3384349178U, // <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3>
+ 2694734162U, // <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5>
+ 3384347884U, // <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5>
+ 3730658026U, // <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0>
+ 3718714362U, // <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2>
+ 2694734493U, // <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+ 2311278690U, // <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0>
+ 2305970923U, // <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1>
+ 3768476566U, // <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0>
+ 2311279098U, // <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3>
+ 2311278694U, // <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4>
+ 3768476783U, // <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1>
+ 2694735091U, // <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7>
+ 2311279426U, // <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7>
+ 2696062357U, // <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7>
+ 3383701602U, // <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0>
+ 3768477219U, // <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5>
+ 3768477288U, // <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2>
+ 2309960186U, // <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+ 3383701606U, // <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4>
+ 3768477545U, // <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7>
+ 3766486970U, // <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7>
+ 3383702338U, // <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7>
+ 2309960186U, // <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+ 3768477846U, // <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2>
+ 3768477975U, // <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5>
+ 3786393932U, // <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4>
+ 3768478108U, // <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3>
+ 2795599115U, // <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5>
+ 3385037470U, // <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5>
+ 3780422309U, // <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7>
+ 3848107301U, // <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4>
+ 2795894063U, // <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5>
+ 2795967800U, // <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5>
+ 3768478690U, // <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0>
+ 3718744163U, // <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5>
+ 3784404107U, // <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7>
+ 2796262748U, // <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5>
+ 2694737206U, // <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+ 2712653182U, // <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7>
+ 2713316815U, // <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7>
+ 2694737449U, // <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+ 2311311458U, // <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0>
+ 3768479433U, // <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5>
+ 3768479521U, // <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3>
+ 2311311866U, // <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3>
+ 2311311462U, // <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4>
+ 2248185270U, // <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5>
+ 2718625879U, // <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7>
+ 2311312194U, // <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7>
+ 2311311466U, // <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u>
+ 2248954874U, // <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2>
+ 3322696778U, // <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1>
+ 2248955028U, // <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+ 2656963074U, // <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6>
+ 2248955238U, // <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6>
+ 2248955329U, // <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7>
+ 2656965360U, // <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6>
+ 2248955500U, // <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7>
+ 2248955522U, // <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2>
+ 3718766694U, // <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS
+ 3724739827U, // <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7>
+ 3718768739U, // <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5>
+ 3365826337U, // <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3>
+ 2798253647U, // <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5>
+ 3365826258U, // <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5>
+ 3730715377U, // <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7>
+ 2310665836U, // <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7>
+ 2798548595U, // <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5>
+ 2311336034U, // <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0>
+ 2694739758U, // <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+ 2248955028U, // <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+ 2311336442U, // <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3>
+ 2311336038U, // <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4>
+ 2694740122U, // <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+ 2656981746U, // <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u>
+ 2311336770U, // <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7>
+ 2694740325U, // <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+ 2705358848U, // <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0>
+ 1631617126U, // <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+ 2310607866U, // <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+ 2302640284U, // <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+ 2754238189U, // <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1>
+ 2305296114U, // <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5>
+ 2244907106U, // <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+ 2302643528U, // <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+ 1631617693U, // <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+ 2627133542U, // <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS
+ 1237536282U, // <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+ 1680496430U, // <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 1237532828U, // <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+ 2693416018U, // <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u>
+ 2756892486U, // <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0>
+ 2694743284U, // <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u>
+ 1237536072U, // <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+ 1680496484U, // <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 2311288709U, // <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+ 2245883694U, // <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+ 2699388520U, // <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2>
+ 2754238344U, // <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3>
+ 2699388715U, // <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u>
+ 2757408666U, // <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3>
+ 2705360826U, // <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7>
+ 2302659912U, // <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+ 2754238389U, // <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3>
+ 2754238396U, // <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1>
+ 3827980229U, // <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1>
+ 2644625102U, // <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+ 2289393820U, // <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+ 1631619588U, // <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u>
+ 2785056749U, // <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5>
+ 3363138077U, // <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6>
+ 2289397064U, // <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+ 1634274120U, // <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u>
+ 1634937753U, // <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u>
+ 1728272410U, // <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5>
+ 2710006843U, // <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u>
+ 2765740076U, // <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5>
+ 1637592285U, // <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u>
+ 1631620406U, // <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+ 2712661375U, // <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u>
+ 2302676296U, // <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+ 1631620649U, // <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+ 1577091174U, // <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+ 1174443822U, // <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+ 2766035058U, // <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3>
+ 1237565596U, // <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+ 1577094454U, // <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+ 296144182U, // <5,u,5,5>: Cost 1 vspltisw1 RHS
+ 1680496794U, // <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 1237568840U, // <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+ 296144182U, // <5,u,5,u>: Cost 1 vspltisw1 RHS
+ 2633146470U, // <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS
+ 1175213870U, // <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+ 2633148309U, // <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6>
+ 1228947612U, // <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+ 2633149750U, // <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS
+ 1175214234U, // <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+ 1228950018U, // <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 1228950856U, // <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+ 1228947617U, // <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+ 497614950U, // <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+ 1571357492U, // <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+ 1571358312U, // <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1571358870U, // <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 497618248U, // <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+ 1571360772U, // <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+ 1571361274U, // <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+ 1571361786U, // <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2>
+ 497620782U, // <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS
+ 497623142U, // <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS
+ 1631622958U, // <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+ 1680496997U, // <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+ 1228963996U, // <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+ 497626441U, // <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS
+ 296144182U, // <5,u,u,5>: Cost 1 vspltisw1 RHS
+ 1680497037U, // <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+ 1228967240U, // <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+ 497628974U, // <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS
+ 2772451328U, // <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0>
+ 2772451338U, // <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1>
+ 3771146417U, // <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6>
+ 3383095739U, // <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3>
+ 3846193189U, // <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,0,4,1>
+ 3724832803U, // <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0>
+ 3383095985U, // <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6>
+ 3383096067U, // <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7>
+ 2772451401U, // <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1>
+ 2651095142U, // <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS
+ 2251612262U, // <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS
+ 1698709606U, // <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2651097602U, // <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6>
+ 2651098422U, // <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS
+ 2651099172U, // <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1>
+ 2657071869U, // <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1>
+ 3724841978U, // <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2>
+ 1698709660U, // <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2252292096U, // <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0>
+ 1178550374U, // <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+ 3826655418U, // <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6>
+ 3777783485U, // <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6>
+ 2252292434U, // <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5>
+ 3785746280U, // <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6>
+ 2252292593U, // <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2>
+ 3736794583U, // <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2>
+ 1178550941U, // <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS
+ 3375153152U, // <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0>
+ 2772451584U, // <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4>
+ 3777784163U, // <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0>
+ 3846193426U, // <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4>
+ 2712005122U, // <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6>
+ 3724857382U, // <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3>
+ 3802335864U, // <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7>
+ 3801672410U, // <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6>
+ 2772451647U, // <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4>
+ 3383123968U, // <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0>
+ 2772451666U, // <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5>
+ 3773803577U, // <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6>
+ 3724864002U, // <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6>
+ 3846193517U, // <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5>
+ 2712005935U, // <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0>
+ 3327009265U, // <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2>
+ 3383126648U, // <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7>
+ 2772451729U, // <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5>
+ 3373178880U, // <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0>
+ 2254266470U, // <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS
+ 3785748248U, // <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3>
+ 3790393190U, // <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0>
+ 3328000338U, // <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5>
+ 3785748494U, // <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6>
+ 3785748516U, // <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1>
+ 3379153528U, // <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7>
+ 2254267037U, // <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS
+ 2254897152U, // <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0>
+ 1181155430U, // <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+ 3785748923U, // <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3>
+ 3785749042U, // <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5>
+ 2254897490U, // <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5>
+ 3785749169U, // <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6>
+ 2724614962U, // <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0>
+ 3787739982U, // <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1>
+ 1181155997U, // <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS
+ 1235664896U, // <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+ 1235666598U, // <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+ 3712943720U, // <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2>
+ 2639202936U, // <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7>
+ 2639203638U, // <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS
+ 2309409236U, // <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+ 3712946517U, // <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0>
+ 2309409400U, // <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+ 1235666605U, // <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+ 1235673088U, // <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+ 1235674790U, // <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+ 1698710173U, // <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2639211129U, // <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u>
+ 2639211830U, // <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS
+ 2712008858U, // <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS
+ 2657129220U, // <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u>
+ 2309417592U, // <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+ 1698710227U, // <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 3775799296U, // <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0>
+ 2702057574U, // <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+ 3373143763U, // <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2>
+ 3695045122U, // <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6>
+ 3775799634U, // <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5>
+ 3383091538U, // <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5>
+ 3368493233U, // <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6>
+ 3362522319U, // <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7>
+ 2702058141U, // <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+ 3834250027U, // <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1>
+ 2772452148U, // <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+ 3832038210U, // <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6>
+ 3373150660U, // <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3>
+ 3834250067U, // <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5>
+ 3373146450U, // <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5>
+ 3826656102U, // <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6>
+ 3362530511U, // <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7>
+ 2772452148U, // <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+ 2669092966U, // <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS
+ 2252292916U, // <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1>
+ 2252293014U, // <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0>
+ 2772452246U, // <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0>
+ 2669096246U, // <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS
+ 3846194091U, // <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3>
+ 2702059450U, // <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7>
+ 3870081978U, // <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0>
+ 2702059633U, // <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1>
+ 3775801494U, // <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2>
+ 3777128723U, // <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1>
+ 3775801702U, // <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3>
+ 3775801756U, // <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3>
+ 3775801858U, // <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6>
+ 3375153490U, // <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5>
+ 3826656265U, // <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7>
+ 3775802051U, // <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1>
+ 3775802142U, // <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2>
+ 3846194206U, // <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1>
+ 3846194219U, // <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5>
+ 3846194228U, // <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5>
+ 3846194236U, // <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4>
+ 3846194246U, // <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5>
+ 2760508496U, // <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6>
+ 3368526001U, // <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6>
+ 3870082144U, // <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4>
+ 2760729707U, // <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6>
+ 2714668660U, // <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1>
+ 3834619005U, // <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6>
+ 3834692742U, // <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6>
+ 3846194317U, // <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4>
+ 3834840216U, // <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6>
+ 3834913953U, // <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6>
+ 2719977570U, // <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0>
+ 3367208143U, // <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7>
+ 2719977724U, // <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1>
+ 2669125734U, // <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS
+ 2254897972U, // <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1>
+ 2254898070U, // <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0>
+ 3775803929U, // <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7>
+ 2669129014U, // <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS
+ 2322006354U, // <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5>
+ 2725950264U, // <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6>
+ 3793720142U, // <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1>
+ 2254898556U, // <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0>
+ 2627330150U, // <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS
+ 1235664906U, // <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+ 1235667094U, // <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+ 2309406894U, // <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+ 2627333430U, // <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS
+ 1235665234U, // <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+ 2309406897U, // <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+ 2309407222U, // <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+ 1235664913U, // <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+ 2627338342U, // <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS
+ 1235673098U, // <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+ 1235675286U, // <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+ 2772452732U, // <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0>
+ 2627341622U, // <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS
+ 1235673426U, // <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+ 2309415089U, // <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+ 2309415414U, // <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+ 1235673105U, // <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+ 3324683725U, // <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0>
+ 2725290086U, // <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS
+ 3771162801U, // <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6>
+ 2309349478U, // <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+ 3730951478U, // <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS
+ 3840738784U, // <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1>
+ 3842655721U, // <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1>
+ 3736925671U, // <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0>
+ 2309349483U, // <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS
+ 3367840468U, // <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0>
+ 3325355551U, // <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1>
+ 3373147752U, // <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2>
+ 2299404390U, // <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+ 3701099830U, // <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS
+ 3767846054U, // <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2>
+ 3826656825U, // <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0>
+ 3373147838U, // <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7>
+ 2299404395U, // <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS
+ 2657222758U, // <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS
+ 3771164219U, // <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2>
+ 2766481000U, // <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2>
+ 2772452978U, // <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3>
+ 2657226038U, // <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS
+ 3790407528U, // <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6>
+ 2252294074U, // <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7>
+ 2252294148U, // <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0>
+ 2772453023U, // <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3>
+ 2772453030U, // <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1>
+ 3834250930U, // <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4>
+ 2765596349U, // <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6>
+ 2301411430U, // <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS
+ 2772453070U, // <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5>
+ 2765817560U, // <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6>
+ 2252933050U, // <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7>
+ 2796340968U, // <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4>
+ 2766038771U, // <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6>
+ 3725008998U, // <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS
+ 3368530217U, // <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1>
+ 3840222989U, // <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5>
+ 2309382246U, // <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+ 3725012278U, // <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS
+ 2766481193U, // <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6>
+ 3842656049U, // <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5>
+ 3327010820U, // <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0>
+ 2766702404U, // <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6>
+ 3713073254U, // <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS
+ 3789082310U, // <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2>
+ 3840665439U, // <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6>
+ 2766997352U, // <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6>
+ 3713076534U, // <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS
+ 3791736842U, // <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2>
+ 3373180605U, // <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6>
+ 3793064108U, // <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2>
+ 2767366037U, // <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6>
+ 3701137510U, // <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS
+ 3701138647U, // <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6>
+ 2254898792U, // <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2>
+ 1248264294U, // <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+ 3701140790U, // <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS
+ 3725029435U, // <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6>
+ 2254899130U, // <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7>
+ 2725294981U, // <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2>
+ 1248264299U, // <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS
+ 2633375846U, // <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS
+ 2309407468U, // <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+ 1235666536U, // <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+ 161923174U, // <6,2,7,3>: Cost 1 vmrglw RHS, LHS
+ 2633379126U, // <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS
+ 2309407796U, // <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+ 2309408445U, // <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+ 2309407960U, // <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+ 161923179U, // <6,2,7,u>: Cost 1 vmrglw RHS, LHS
+ 2633384038U, // <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS
+ 2309415660U, // <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+ 1235674728U, // <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+ 161931366U, // <6,2,u,3>: Cost 1 vmrglw RHS, LHS
+ 2633387318U, // <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS
+ 2769135725U, // <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6>
+ 2309416637U, // <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+ 2309416152U, // <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+ 161931371U, // <6,2,u,u>: Cost 1 vmrglw RHS, LHS
+ 3777806336U, // <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0>
+ 2704064614U, // <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+ 3765862577U, // <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6>
+ 3843393708U, // <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6>
+ 2250516994U, // <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6>
+ 3725054014U, // <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0>
+ 3383093096U, // <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6>
+ 3368495034U, // <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7>
+ 2704065181U, // <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+ 2251622550U, // <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+ 3777807156U, // <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1>
+ 3765863348U, // <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3>
+ 3373147762U, // <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3>
+ 3834251525U, // <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5>
+ 3373147683U, // <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5>
+ 3391727545U, // <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6>
+ 2299406266U, // <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7>
+ 2251622550U, // <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+ 2252294294U, // <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2>
+ 3326036198U, // <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1>
+ 3771836045U, // <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3>
+ 2252294556U, // <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3>
+ 2252294658U, // <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6>
+ 3840739677U, // <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3>
+ 2704066490U, // <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7>
+ 3368511418U, // <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7>
+ 2252294942U, // <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2>
+ 3707158630U, // <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS
+ 3765864692U, // <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6>
+ 2704066918U, // <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3>
+ 2772453788U, // <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3>
+ 2772453799U, // <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5>
+ 3789752888U, // <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6>
+ 3840739770U, // <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6>
+ 2301413306U, // <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7>
+ 2775108043U, // <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5>
+ 2651340902U, // <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS
+ 3846195674U, // <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2>
+ 3845974503U, // <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6>
+ 2651343362U, // <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6>
+ 2651344182U, // <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS
+ 1698712066U, // <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6>
+ 3383125864U, // <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6>
+ 3368527802U, // <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7>
+ 1698933277U, // <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6>
+ 3373179798U, // <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0>
+ 3707176179U, // <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7>
+ 2716012312U, // <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3>
+ 3373180530U, // <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3>
+ 2254309890U, // <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6>
+ 3785773070U, // <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6>
+ 3840739932U, // <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6>
+ 2299439034U, // <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7>
+ 2719994110U, // <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3>
+ 2254899350U, // <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2>
+ 3328641254U, // <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1>
+ 2633443257U, // <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6>
+ 2254899612U, // <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3>
+ 2254899714U, // <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6>
+ 3785773772U, // <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6>
+ 2725966648U, // <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6>
+ 2322007994U, // <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7>
+ 2254899998U, // <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2>
+ 1559707750U, // <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+ 2633450292U, // <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1>
+ 1559709626U, // <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7>
+ 1235666546U, // <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+ 1559711030U, // <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS
+ 2309408291U, // <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+ 2633454152U, // <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0>
+ 1235666874U, // <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+ 1559713582U, // <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+ 1559715942U, // <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+ 2633458484U, // <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1>
+ 1559717819U, // <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u>
+ 1235674738U, // <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+ 1559719222U, // <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS
+ 1701366598U, // <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6>
+ 2633462353U, // <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0>
+ 1235675066U, // <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+ 1559721774U, // <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+ 3785777152U, // <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0>
+ 2712035430U, // <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+ 3771179185U, // <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6>
+ 3846196096U, // <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1>
+ 3785777490U, // <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5>
+ 2250517814U, // <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS
+ 3324259703U, // <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0>
+ 3383092458U, // <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7>
+ 2712035997U, // <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+ 3325356946U, // <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1>
+ 3785777972U, // <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1>
+ 3846196170U, // <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3>
+ 3325365380U, // <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0>
+ 3852168155U, // <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2>
+ 2251615542U, // <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS
+ 3325357432U, // <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1>
+ 3870084088U, // <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4>
+ 2251615785U, // <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS
+ 2252295058U, // <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1>
+ 3771180605U, // <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4>
+ 3785778792U, // <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2>
+ 3777816253U, // <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6>
+ 2252295376U, // <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4>
+ 1178553654U, // <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+ 2252295545U, // <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2>
+ 3326037448U, // <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0>
+ 1178553897U, // <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS
+ 3785779350U, // <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2>
+ 3383118648U, // <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1>
+ 3777816935U, // <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4>
+ 3785779612U, // <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3>
+ 2712037890U, // <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6>
+ 2252754230U, // <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS
+ 3784452764U, // <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7>
+ 3801705178U, // <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6>
+ 2252754473U, // <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS
+ 3787770770U, // <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1>
+ 3383126840U, // <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1>
+ 3327380534U, // <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3>
+ 3784453265U, // <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4>
+ 2253630672U, // <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4>
+ 2778426587U, // <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6>
+ 3383128789U, // <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6>
+ 3381799580U, // <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7>
+ 2778647798U, // <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6>
+ 2651422822U, // <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS
+ 3701277928U, // <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5>
+ 3701278650U, // <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7>
+ 2651425282U, // <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6>
+ 2651426102U, // <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS
+ 2651426892U, // <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5>
+ 1698712886U, // <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 3725169658U, // <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2>
+ 1698712904U, // <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 2254900114U, // <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1>
+ 3389115192U, // <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1>
+ 3785781727U, // <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3>
+ 3785781810U, // <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5>
+ 2254900432U, // <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4>
+ 1181158710U, // <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+ 2254900605U, // <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6>
+ 3787772750U, // <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1>
+ 1181158953U, // <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS
+ 2639495270U, // <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS
+ 2639496090U, // <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4>
+ 3707267011U, // <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7>
+ 2639497884U, // <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7>
+ 1237658832U, // <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+ 1235666638U, // <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+ 3713241753U, // <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0>
+ 2309409436U, // <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+ 1235666641U, // <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+ 2639503462U, // <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS
+ 2639504282U, // <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4>
+ 3701303226U, // <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7>
+ 2639506077U, // <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u>
+ 1235676368U, // <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+ 1235674830U, // <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+ 1698713129U, // <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 2309417628U, // <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+ 1698713147U, // <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 3775832064U, // <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0>
+ 2702090342U, // <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+ 3775832241U, // <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6>
+ 3719227906U, // <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6>
+ 3775832402U, // <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5>
+ 3385085146U, // <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5>
+ 2309351938U, // <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6>
+ 3376459134U, // <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7>
+ 2702090909U, // <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+ 3719233546U, // <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1>
+ 3775832884U, // <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1>
+ 3775832982U, // <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0>
+ 3846196909U, // <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4>
+ 3719236984U, // <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1>
+ 3856150209U, // <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6>
+ 3834252997U, // <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1>
+ 3870084817U, // <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4>
+ 3769861532U, // <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5>
+ 2645500006U, // <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS
+ 3719242548U, // <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1>
+ 3775833704U, // <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2>
+ 3775833766U, // <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1>
+ 2645503353U, // <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2>
+ 2252296196U, // <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5>
+ 2702092218U, // <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7>
+ 3719246842U, // <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2>
+ 2702092405U, // <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5>
+ 3775834262U, // <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2>
+ 3777161495U, // <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5>
+ 3775834470U, // <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3>
+ 3775834524U, // <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3>
+ 3775834626U, // <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6>
+ 3385109722U, // <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5>
+ 2309376514U, // <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+ 3775834819U, // <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1>
+ 2309376514U, // <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+ 3719258214U, // <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS
+ 3385117586U, // <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1>
+ 3327242008U, // <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3>
+ 3719260674U, // <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6>
+ 3719261563U, // <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4>
+ 2702093622U, // <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+ 2309384706U, // <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6>
+ 3870085060U, // <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4>
+ 2702093865U, // <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+ 3719266406U, // <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS
+ 3789106889U, // <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5>
+ 3785789208U, // <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3>
+ 3373183950U, // <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3>
+ 2717355964U, // <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5>
+ 2791772164U, // <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5>
+ 2772455438U, // <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6>
+ 3373183549U, // <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7>
+ 2720010496U, // <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5>
+ 2772455460U, // <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1>
+ 2322008978U, // <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1>
+ 3840225335U, // <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2>
+ 2772455490U, // <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4>
+ 2772455500U, // <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5>
+ 2254901252U, // <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5>
+ 2772455520U, // <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7>
+ 2785874024U, // <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6>
+ 2772455532U, // <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1>
+ 2627625062U, // <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS
+ 1235667858U, // <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+ 2309409278U, // <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+ 2309407659U, // <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+ 2627628342U, // <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS
+ 1235668186U, // <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+ 1235667458U, // <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+ 2309407987U, // <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+ 1235667460U, // <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+ 2627633254U, // <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS
+ 1235676050U, // <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+ 2309417470U, // <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+ 2309415851U, // <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+ 2627636534U, // <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS
+ 1235676378U, // <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+ 1235675650U, // <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+ 2309416179U, // <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+ 1235675652U, // <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+ 2309352751U, // <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0>
+ 1650917478U, // <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+ 2250584570U, // <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3>
+ 3846197554U, // <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1>
+ 2724659538U, // <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5>
+ 3725275225U, // <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0>
+ 2791772493U, // <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1>
+ 2309352758U, // <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+ 1650918045U, // <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+ 3325358368U, // <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1>
+ 2299406449U, // <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1>
+ 2724660118U, // <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0>
+ 3373148518U, // <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3>
+ 3834253712U, // <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5>
+ 3373147953U, // <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5>
+ 2323297080U, // <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6>
+ 2299407670U, // <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+ 2299407671U, // <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS
+ 2252296489U, // <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1>
+ 3326038394U, // <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1>
+ 1178554874U, // <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+ 2724660902U, // <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1>
+ 2252296817U, // <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5>
+ 3840741864U, // <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3>
+ 2252296976U, // <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2>
+ 2785874426U, // <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3>
+ 1178554874U, // <6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+ 2724661398U, // <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2>
+ 3375154665U, // <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1>
+ 3375154909U, // <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2>
+ 2301413734U, // <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3>
+ 2772455986U, // <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5>
+ 3375154993U, // <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5>
+ 2323313464U, // <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6>
+ 2301414710U, // <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+ 2301414711U, // <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS
+ 2724662162U, // <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1>
+ 3326939559U, // <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1>
+ 2253271546U, // <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3>
+ 3383127346U, // <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3>
+ 2309385523U, // <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4>
+ 1650920758U, // <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+ 2724662653U, // <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6>
+ 2309385526U, // <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+ 1650921001U, // <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+ 3725312102U, // <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS
+ 3373180393U, // <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1>
+ 3791769368U, // <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3>
+ 3373181286U, // <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3>
+ 3725315382U, // <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS
+ 2299439221U, // <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5>
+ 2724663394U, // <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0>
+ 2299440438U, // <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+ 2299440439U, // <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS
+ 1583808614U, // <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+ 2322010445U, // <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+ 2254574074U, // <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3>
+ 2322010609U, // <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3>
+ 1583811894U, // <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+ 2322010773U, // <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5>
+ 363253046U, // <6,6,6,6>: Cost 1 vspltisw2 RHS
+ 1248267574U, // <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS
+ 363253046U, // <6,6,6,u>: Cost 1 vspltisw2 RHS
+ 2309410095U, // <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0>
+ 2309408233U, // <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1>
+ 2311402373U, // <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2>
+ 2309409126U, // <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+ 2309410099U, // <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4>
+ 2309408561U, // <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5>
+ 1237660472U, // <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+ 161926454U, // <6,6,7,7>: Cost 1 vmrglw RHS, RHS
+ 161926455U, // <6,6,7,u>: Cost 1 vmrglw RHS, RHS
+ 1583808614U, // <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+ 1650923310U, // <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+ 1178554874U, // <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+ 2309417318U, // <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+ 1583811894U, // <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+ 1650923674U, // <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+ 363253046U, // <6,6,u,6>: Cost 1 vspltisw2 RHS
+ 161934646U, // <6,6,u,7>: Cost 1 vmrglw RHS, RHS
+ 161934647U, // <6,6,u,u>: Cost 1 vmrglw RHS, RHS
+ 1638318080U, // <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+ 564576358U, // <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+ 2712060077U, // <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+ 2712060156U, // <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+ 1638318418U, // <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+ 1577865314U, // <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0>
+ 2712060406U, // <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+ 2651608058U, // <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2>
+ 564576925U, // <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+ 2712060643U, // <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+ 1638318900U, // <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+ 1638318998U, // <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+ 3766559753U, // <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7>
+ 2712060971U, // <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+ 2712061039U, // <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+ 2712061135U, // <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+ 3373148612U, // <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7>
+ 1638319484U, // <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+ 2712061373U, // <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+ 2712061471U, // <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+ 1638319720U, // <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+ 1638319782U, // <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+ 2712061709U, // <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+ 2712061800U, // <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+ 1638320058U, // <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+ 2252297836U, // <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7>
+ 1638320187U, // <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+ 1638320278U, // <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+ 2712062182U, // <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+ 2712062256U, // <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3>
+ 1638320540U, // <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+ 1638320642U, // <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+ 2712062546U, // <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+ 2712062584U, // <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+ 2712062659U, // <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1>
+ 1638320926U, // <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+ 1638321042U, // <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+ 2712062922U, // <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+ 2712063029U, // <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+ 2712063108U, // <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+ 1638321360U, // <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+ 564579638U, // <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+ 2712063357U, // <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6>
+ 2712063439U, // <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7>
+ 564579881U, // <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+ 2712063560U, // <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+ 2714054287U, // <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+ 2712063742U, // <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+ 3373181295U, // <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3>
+ 2712063924U, // <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+ 1638322180U, // <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+ 1638322274U, // <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+ 3373181380U, // <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7>
+ 1640313092U, // <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+ 2712064289U, // <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+ 2712064423U, // <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+ 1638322682U, // <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+ 2712064562U, // <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+ 2712064653U, // <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+ 2712064747U, // <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+ 1638323000U, // <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+ 1638323022U, // <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+ 1638323168U, // <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3>
+ 1237659746U, // <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+ 2309411158U, // <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1>
+ 2639718330U, // <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7>
+ 1235669498U, // <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+ 1237659750U, // <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+ 2309411243U, // <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5>
+ 1583895362U, // <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7>
+ 1235669826U, // <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+ 1235669503U, // <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u>
+ 1638323923U, // <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+ 564582190U, // <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+ 1638324101U, // <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+ 1638324156U, // <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+ 1638324287U, // <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+ 564582554U, // <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+ 1638324432U, // <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+ 1235678018U, // <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+ 564582757U, // <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+ 1638326272U, // <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+ 564584550U, // <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS
+ 2712068269U, // <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+ 2309349532U, // <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+ 1638326610U, // <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+ 1577939051U, // <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0>
+ 2712068598U, // <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+ 2309352776U, // <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+ 564585117U, // <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS
+ 2712068835U, // <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+ 1638327092U, // <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+ 1698715438U, // <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2299404444U, // <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+ 2712069163U, // <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+ 2712069231U, // <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+ 2712069327U, // <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+ 2299407688U, // <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+ 1698715492U, // <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+ 2712069565U, // <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+ 1178556206U, // <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+ 1638327912U, // <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+ 1638327974U, // <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+ 2712069901U, // <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+ 1178556570U, // <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+ 1638328250U, // <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+ 2252298496U, // <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1>
+ 1638328379U, // <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+ 1638328470U, // <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+ 2712070374U, // <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+ 2704107883U, // <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u>
+ 1638328732U, // <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+ 1638328834U, // <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+ 2712070738U, // <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+ 2712070776U, // <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+ 2301414728U, // <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+ 1638329118U, // <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+ 1638329234U, // <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+ 2712071114U, // <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+ 2712071221U, // <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+ 2309382300U, // <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+ 1638329552U, // <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+ 564587831U, // <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS
+ 2712071545U, // <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+ 2309385544U, // <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+ 564588073U, // <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS
+ 2712071752U, // <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+ 2714062479U, // <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+ 2712071934U, // <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+ 2299437212U, // <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS
+ 2712072116U, // <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+ 1638330372U, // <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+ 1698715802U, // <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 2299440456U, // <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+ 1698715820U, // <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+ 1583808614U, // <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+ 1181161262U, // <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+ 1638330874U, // <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+ 1248264348U, // <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+ 1583811894U, // <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+ 1181161626U, // <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+ 363253046U, // <6,u,6,6>: Cost 1 vspltisw2 RHS
+ 1638331214U, // <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+ 363253046U, // <6,u,6,u>: Cost 1 vspltisw2 RHS
+ 1560076390U, // <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS
+ 1235664969U, // <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+ 1560078311U, // <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7>
+ 161923228U, // <6,u,7,3>: Cost 1 vmrglw RHS, LHS
+ 1560079670U, // <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS
+ 1235665297U, // <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+ 1235667485U, // <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+ 161926472U, // <6,u,7,7>: Cost 1 vmrglw RHS, RHS
+ 161923233U, // <6,u,7,u>: Cost 1 vmrglw RHS, LHS
+ 1560084582U, // <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS
+ 564590382U, // <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS
+ 1560086504U, // <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u>
+ 161931420U, // <6,u,u,3>: Cost 1 vmrglw RHS, LHS
+ 1560087862U, // <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS
+ 564590746U, // <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS
+ 363253046U, // <6,u,u,6>: Cost 1 vspltisw2 RHS
+ 161934664U, // <6,u,u,7>: Cost 1 vmrglw RHS, RHS
+ 161931425U, // <6,u,u,u>: Cost 1 vmrglw RHS, LHS
+ 1705426944U, // <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+ 1705426954U, // <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1>
+ 3713550266U, // <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7>
+ 2316063892U, // <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3>
+ 2779168805U, // <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1>
+ 2663698530U, // <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+ 2657727309U, // <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0>
+ 2316064220U, // <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7>
+ 1705427017U, // <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1>
+ 1583988838U, // <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS
+ 2779168859U, // <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1>
+ 631685222U, // <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS
+ 2639817411U, // <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1>
+ 1583992118U, // <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS
+ 2657734660U, // <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5>
+ 1583993678U, // <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1>
+ 2657735672U, // <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0>
+ 631685276U, // <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS
+ 2779168933U, // <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3>
+ 2767667377U, // <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6>
+ 2718713448U, // <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2>
+ 2718713510U, // <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1>
+ 3841409228U, // <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6>
+ 3852910802U, // <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3>
+ 2718713786U, // <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7>
+ 3847160036U, // <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3>
+ 2767667440U, // <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6>
+ 2718714006U, // <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2>
+ 2779169020U, // <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0>
+ 3852910853U, // <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0>
+ 2718714268U, // <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3>
+ 2718714370U, // <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6>
+ 2718714461U, // <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7>
+ 2706770608U, // <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0>
+ 3847160114U, // <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0>
+ 2779169083U, // <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0>
+ 2718714770U, // <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1>
+ 1705427282U, // <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5>
+ 3713583034U, // <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7>
+ 3713583814U, // <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4>
+ 2779169133U, // <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5>
+ 1644973366U, // <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+ 2657760081U, // <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4>
+ 2259468868U, // <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4>
+ 1705427345U, // <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5>
+ 2718715508U, // <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1>
+ 2260123750U, // <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS
+ 3792457451U, // <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3>
+ 3852911024U, // <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0>
+ 2718715836U, // <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5>
+ 2718715908U, // <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5>
+ 1644974178U, // <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0>
+ 3792457853U, // <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0>
+ 1646301444U, // <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0>
+ 2720706901U, // <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0>
+ 2779169270U, // <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7>
+ 2718716410U, // <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3>
+ 2722697800U, // <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0>
+ 3852911121U, // <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7>
+ 3852911130U, // <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7>
+ 2718716728U, // <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6>
+ 2718716750U, // <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1>
+ 2779169333U, // <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7>
+ 2718716922U, // <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2>
+ 1187872870U, // <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+ 2718717076U, // <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3>
+ 3847160408U, // <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6>
+ 2718717286U, // <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6>
+ 2718717377U, // <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7>
+ 2718717404U, // <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7>
+ 2718717478U, // <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0>
+ 1187873437U, // <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS
+ 1584046182U, // <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS
+ 1705427602U, // <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1>
+ 631685789U, // <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS
+ 2639874762U, // <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u>
+ 1584049462U, // <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS
+ 1644976282U, // <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+ 1584051029U, // <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u>
+ 2718718208U, // <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1>
+ 631685843U, // <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS
+ 2721374218U, // <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1>
+ 2779169507U, // <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1>
+ 2779169516U, // <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1>
+ 3852911348U, // <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0>
+ 2669743414U, // <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS
+ 2316058962U, // <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5>
+ 2316059044U, // <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6>
+ 2669745146U, // <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2>
+ 2779169570U, // <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1>
+ 2779169579U, // <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1>
+ 1705427764U, // <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+ 2779169598U, // <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2>
+ 3713632972U, // <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1>
+ 2779169619U, // <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5>
+ 2779169628U, // <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5>
+ 2657809239U, // <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1>
+ 3835290474U, // <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1>
+ 1705427764U, // <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+ 2779169660U, // <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1>
+ 2779169671U, // <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3>
+ 2779169680U, // <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3>
+ 1705427862U, // <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0>
+ 2779169700U, // <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5>
+ 2779169707U, // <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3>
+ 2657817432U, // <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2>
+ 2803057594U, // <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0>
+ 1705427907U, // <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0>
+ 3776538827U, // <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1>
+ 2319400970U, // <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1>
+ 2316085398U, // <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2>
+ 3852911591U, // <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0>
+ 3852911600U, // <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0>
+ 2319401298U, // <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5>
+ 3833668617U, // <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7>
+ 3367265487U, // <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7>
+ 2319400977U, // <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u>
+ 2724031378U, // <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1>
+ 2779169835U, // <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5>
+ 2779169844U, // <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5>
+ 3852911672U, // <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0>
+ 2669776182U, // <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS
+ 2779169872U, // <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6>
+ 3835290712U, // <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5>
+ 2669778278U, // <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6>
+ 2779169898U, // <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5>
+ 2779169903U, // <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1>
+ 3835585661U, // <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6>
+ 3841410182U, // <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6>
+ 3852911753U, // <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0>
+ 2779169943U, // <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5>
+ 2318754130U, // <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5>
+ 2718724195U, // <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1>
+ 3859178670U, // <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1>
+ 2779169975U, // <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1>
+ 2720715094U, // <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1>
+ 2761549007U, // <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7>
+ 2779170008U, // <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7>
+ 3835438305U, // <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7>
+ 3835512042U, // <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7>
+ 2761843955U, // <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7>
+ 3835659516U, // <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7>
+ 2803057918U, // <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0>
+ 2762065166U, // <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7>
+ 2669797478U, // <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS
+ 2322087946U, // <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1>
+ 2317448186U, // <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2>
+ 3395829934U, // <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3>
+ 2669800758U, // <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS
+ 2322088274U, // <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5>
+ 3375923377U, // <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6>
+ 2731996780U, // <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7>
+ 2322087953U, // <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u>
+ 2779170146U, // <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1>
+ 1705427764U, // <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+ 2779170164U, // <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1>
+ 1705428348U, // <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0>
+ 2779170186U, // <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5>
+ 2763171221U, // <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7>
+ 2657866590U, // <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u>
+ 2803058080U, // <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0>
+ 1705428393U, // <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0>
+ 3713695846U, // <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS
+ 2779170237U, // <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2>
+ 2779170245U, // <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1>
+ 1242316902U, // <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+ 3713699126U, // <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS
+ 3852912096U, // <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1>
+ 2767668713U, // <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1>
+ 2256488426U, // <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1>
+ 1242316907U, // <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS
+ 3852912132U, // <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1>
+ 3852912141U, // <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1>
+ 3852912149U, // <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0>
+ 2779170335U, // <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1>
+ 3852912172U, // <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5>
+ 3840747062U, // <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6>
+ 3841410617U, // <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0>
+ 3795125538U, // <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0>
+ 2779170380U, // <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1>
+ 2779170389U, // <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1>
+ 3852912222U, // <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1>
+ 1705428584U, // <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+ 1705428594U, // <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3>
+ 2779170429U, // <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5>
+ 3852912259U, // <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2>
+ 2767668880U, // <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6>
+ 3841336981U, // <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2>
+ 1705428639U, // <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3>
+ 1705428646U, // <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1>
+ 2779170479U, // <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1>
+ 2767668925U, // <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6>
+ 1245659238U, // <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+ 1705428686U, // <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5>
+ 2779170519U, // <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5>
+ 2657899362U, // <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3>
+ 2319406574U, // <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7>
+ 1705428718U, // <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1>
+ 3713728614U, // <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS
+ 3852912388U, // <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5>
+ 2779170573U, // <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5>
+ 1242349670U, // <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+ 3713731894U, // <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS
+ 2779170601U, // <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6>
+ 2767669041U, // <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5>
+ 3389834456U, // <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7>
+ 1242349675U, // <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS
+ 3852912456U, // <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1>
+ 3852912466U, // <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2>
+ 3852912475U, // <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2>
+ 2779170664U, // <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6>
+ 3852912496U, // <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5>
+ 3792474116U, // <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5>
+ 2718732388U, // <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2>
+ 3841337228U, // <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6>
+ 2779170709U, // <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6>
+ 2640003174U, // <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS
+ 2721386920U, // <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2>
+ 2767595441U, // <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7>
+ 1693927354U, // <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7>
+ 2640006454U, // <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS
+ 3841558476U, // <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7>
+ 2657923941U, // <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6>
+ 3841337310U, // <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7>
+ 1694296039U, // <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7>
+ 2803058666U, // <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1>
+ 3852912632U, // <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6>
+ 2322089576U, // <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2>
+ 1248346214U, // <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+ 3841337362U, // <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5>
+ 3395830836U, // <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5>
+ 2261616570U, // <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7>
+ 3371943857U, // <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7>
+ 1248346219U, // <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS
+ 1705429051U, // <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1>
+ 2779170884U, // <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1>
+ 1705428584U, // <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+ 1695254620U, // <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7>
+ 1705429091U, // <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5>
+ 2779170924U, // <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5>
+ 2767669361U, // <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1>
+ 2803058809U, // <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0>
+ 1695623305U, // <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7>
+ 2779170955U, // <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0>
+ 1705429142U, // <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2>
+ 2634057732U, // <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0>
+ 2779170983U, // <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1>
+ 2779170992U, // <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1>
+ 3852912829U, // <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5>
+ 2657948520U, // <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0>
+ 2316060602U, // <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7>
+ 1705429205U, // <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2>
+ 3852912860U, // <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0>
+ 2779171046U, // <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1>
+ 2779171057U, // <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3>
+ 3852912887U, // <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0>
+ 3852912896U, // <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0>
+ 3852912905U, // <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0>
+ 3835291923U, // <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1>
+ 3841411356U, // <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1>
+ 2779171111U, // <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3>
+ 2779171120U, // <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3>
+ 3852912952U, // <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2>
+ 2779171137U, // <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2>
+ 2779171144U, // <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0>
+ 2779171156U, // <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3>
+ 3852912989U, // <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3>
+ 2767669606U, // <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3>
+ 2767669615U, // <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3>
+ 2779171189U, // <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0>
+ 2779171198U, // <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0>
+ 3852913032U, // <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1>
+ 2704140655U, // <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3>
+ 1705429404U, // <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+ 2779171238U, // <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4>
+ 3852913070U, // <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3>
+ 2657973099U, // <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3>
+ 2767669700U, // <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7>
+ 1705429404U, // <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+ 2779171280U, // <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1>
+ 2779171290U, // <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2>
+ 2634090504U, // <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4>
+ 2779171311U, // <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5>
+ 2779171319U, // <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4>
+ 1705429506U, // <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6>
+ 2722057593U, // <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2>
+ 2316093370U, // <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7>
+ 1705429533U, // <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6>
+ 3852913185U, // <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1>
+ 3795799695U, // <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1>
+ 3852913203U, // <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1>
+ 3852913214U, // <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3>
+ 3852913225U, // <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5>
+ 2779171410U, // <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5>
+ 2718740581U, // <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3>
+ 3841411685U, // <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6>
+ 2720067847U, // <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3>
+ 2773420664U, // <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7>
+ 3847236225U, // <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7>
+ 1648316922U, // <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3>
+ 2773641875U, // <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7>
+ 2773715612U, // <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7>
+ 3847531173U, // <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7>
+ 2722059024U, // <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2>
+ 2767669943U, // <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7>
+ 1652298720U, // <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3>
+ 2767669955U, // <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1>
+ 3841411788U, // <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1>
+ 2767669978U, // <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6>
+ 2722059546U, // <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2>
+ 2767669995U, // <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5>
+ 3852913396U, // <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5>
+ 2722059758U, // <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7>
+ 2302183354U, // <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7>
+ 2767670027U, // <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1>
+ 2774747930U, // <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7>
+ 1705429790U, // <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2>
+ 1660262316U, // <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3>
+ 1705429404U, // <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+ 2775042878U, // <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7>
+ 1705429830U, // <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6>
+ 2779171660U, // <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3>
+ 2767670101U, // <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3>
+ 1705429853U, // <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2>
+ 2718744576U, // <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0>
+ 1645002854U, // <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+ 3852913527U, // <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1>
+ 3852913536U, // <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1>
+ 2316061904U, // <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4>
+ 1705429906U, // <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+ 2658022257U, // <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0>
+ 2256489928U, // <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+ 1707420589U, // <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1>
+ 3852913590U, // <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1>
+ 2718745396U, // <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1>
+ 2779171786U, // <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3>
+ 3852913616U, // <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0>
+ 3852913627U, // <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2>
+ 2779171810U, // <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0>
+ 3792487631U, // <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7>
+ 3394456220U, // <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7>
+ 2779171837U, // <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0>
+ 3852913673U, // <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3>
+ 3852913682U, // <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3>
+ 2718746216U, // <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2>
+ 2718746278U, // <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1>
+ 2779171885U, // <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3>
+ 2779171893U, // <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2>
+ 2718746554U, // <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7>
+ 3847457864U, // <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3>
+ 2779171921U, // <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3>
+ 2718746774U, // <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2>
+ 3852913762U, // <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2>
+ 3852913772U, // <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3>
+ 2718747036U, // <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3>
+ 2718747138U, // <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6>
+ 2779171972U, // <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0>
+ 2706803380U, // <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4>
+ 3847457946U, // <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4>
+ 2781162655U, // <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0>
+ 2718747538U, // <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1>
+ 3852913842U, // <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1>
+ 3852913852U, // <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2>
+ 2316096696U, // <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3>
+ 1705430224U, // <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+ 1705430234U, // <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5>
+ 2658055029U, // <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4>
+ 2316097024U, // <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7>
+ 1707420917U, // <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5>
+ 1584316518U, // <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS
+ 2658059060U, // <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1>
+ 2640144314U, // <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7>
+ 2640145131U, // <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5>
+ 1584319798U, // <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS
+ 2779172134U, // <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0>
+ 631688502U, // <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS
+ 2658063354U, // <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2>
+ 631688520U, // <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS
+ 3852914001U, // <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7>
+ 3852914010U, // <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7>
+ 2718749178U, // <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3>
+ 2722730572U, // <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4>
+ 2723394205U, // <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4>
+ 2779172221U, // <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6>
+ 2718749496U, // <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6>
+ 2718749518U, // <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1>
+ 2779172249U, // <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7>
+ 2718749690U, // <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2>
+ 3847458214U, // <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2>
+ 2718749880U, // <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3>
+ 3847458236U, // <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6>
+ 2718750004U, // <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1>
+ 1187876150U, // <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+ 2718750208U, // <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7>
+ 2718750286U, // <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4>
+ 1187876393U, // <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS
+ 1584341094U, // <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS
+ 1645008686U, // <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+ 2640168890U, // <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7>
+ 2640169710U, // <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u>
+ 1584344374U, // <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS
+ 1705430554U, // <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1>
+ 631688745U, // <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS
+ 2718750976U, // <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1>
+ 631688763U, // <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS
+ 2646147174U, // <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS
+ 2779172424U, // <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2>
+ 3852914258U, // <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3>
+ 3852914268U, // <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4>
+ 2779172450U, // <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1>
+ 2316061914U, // <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5>
+ 2316061186U, // <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6>
+ 2646152186U, // <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2>
+ 2779172486U, // <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1>
+ 2781163151U, // <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1>
+ 2321378194U, // <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1>
+ 3852914339U, // <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3>
+ 3852914350U, // <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5>
+ 2781163191U, // <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5>
+ 3852914363U, // <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0>
+ 3835588297U, // <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5>
+ 3835588306U, // <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5>
+ 2781163223U, // <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1>
+ 3852914400U, // <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1>
+ 2781163243U, // <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3>
+ 3852914419U, // <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2>
+ 2779172606U, // <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4>
+ 3780552497U, // <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5>
+ 2781163279U, // <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+ 2779172632U, // <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3>
+ 3835588385U, // <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3>
+ 2779172650U, // <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3>
+ 3852914481U, // <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1>
+ 2319403922U, // <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1>
+ 2319404409U, // <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+ 3852914510U, // <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3>
+ 3779226131U, // <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5>
+ 2319404250U, // <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5>
+ 2319403522U, // <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6>
+ 3852914547U, // <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4>
+ 2319403524U, // <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u>
+ 2646179942U, // <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS
+ 2316094354U, // <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1>
+ 3852914582U, // <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3>
+ 3852914592U, // <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4>
+ 2646183372U, // <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4>
+ 2779172788U, // <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6>
+ 2316093954U, // <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6>
+ 2646185318U, // <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6>
+ 2779172815U, // <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6>
+ 2781163475U, // <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1>
+ 2781163484U, // <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1>
+ 3852914662U, // <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2>
+ 3852914672U, // <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3>
+ 2781163515U, // <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5>
+ 1705431044U, // <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+ 2779172878U, // <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6>
+ 3835588632U, // <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7>
+ 1705431044U, // <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+ 2779172900U, // <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1>
+ 2781163571U, // <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7>
+ 3852914743U, // <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2>
+ 2779172930U, // <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4>
+ 2779172940U, // <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5>
+ 2781163607U, // <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+ 2779172960U, // <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7>
+ 1705431138U, // <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0>
+ 1705578603U, // <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0>
+ 2646204518U, // <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+ 2322090898U, // <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1>
+ 3719947880U, // <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2>
+ 3719948438U, // <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2>
+ 2646207951U, // <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7>
+ 2322091226U, // <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5>
+ 2322090498U, // <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6>
+ 2646210156U, // <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7>
+ 2646210350U, // <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+ 2779173062U, // <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1>
+ 2779173072U, // <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2>
+ 2319404409U, // <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+ 2779173092U, // <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4>
+ 2779173101U, // <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4>
+ 1705431044U, // <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+ 2779173118U, // <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3>
+ 1705578756U, // <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0>
+ 1707421965U, // <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0>
+ 3852914966U, // <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0>
+ 2779173153U, // <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2>
+ 2256491002U, // <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3>
+ 3852914994U, // <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1>
+ 3852915003U, // <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1>
+ 2316062652U, // <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+ 2316063544U, // <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6>
+ 1242320182U, // <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+ 1242320183U, // <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS
+ 3852915048U, // <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1>
+ 3377866217U, // <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1>
+ 3852915068U, // <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3>
+ 3833672072U, // <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6>
+ 3852915088U, // <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5>
+ 3395122056U, // <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5>
+ 3389813560U, // <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6>
+ 2779173287U, // <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1>
+ 2779320752U, // <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1>
+ 2658181222U, // <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS
+ 3852915140U, // <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3>
+ 2257973754U, // <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3>
+ 3841413589U, // <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2>
+ 2658184502U, // <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS
+ 3852915176U, // <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3>
+ 2658186117U, // <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2>
+ 1705431546U, // <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3>
+ 1705579011U, // <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3>
+ 3714015334U, // <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS
+ 3777243425U, // <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6>
+ 2319405957U, // <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2>
+ 3375229286U, // <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3>
+ 2779173426U, // <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5>
+ 3375228721U, // <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5>
+ 2319405880U, // <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6>
+ 1245662518U, // <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+ 1245662519U, // <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS
+ 3852915291U, // <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1>
+ 3389834729U, // <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1>
+ 2259472890U, // <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3>
+ 3852915321U, // <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4>
+ 3852915330U, // <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4>
+ 2779173517U, // <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6>
+ 2316096312U, // <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6>
+ 1242352950U, // <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+ 1242352951U, // <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS
+ 3852915372U, // <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1>
+ 3835294392U, // <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4>
+ 3852915395U, // <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6>
+ 3852915404U, // <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6>
+ 3852915412U, // <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5>
+ 3377899313U, // <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5>
+ 2718765160U, // <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6>
+ 2779173611U, // <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1>
+ 2779321076U, // <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1>
+ 2658213990U, // <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS
+ 3852915462U, // <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1>
+ 2718765562U, // <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3>
+ 3714042622U, // <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6>
+ 2658217270U, // <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS
+ 2724074224U, // <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6>
+ 1705431864U, // <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+ 1705431874U, // <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7>
+ 1705579339U, // <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7>
+ 1705431886U, // <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1>
+ 2779173719U, // <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1>
+ 2779173729U, // <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2>
+ 2779173736U, // <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0>
+ 1705431926U, // <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5>
+ 2779173759U, // <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5>
+ 2779173765U, // <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2>
+ 1248349494U, // <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS
+ 1705431958U, // <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1>
+ 1705579423U, // <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1>
+ 2779173801U, // <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2>
+ 2779321266U, // <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2>
+ 2779321273U, // <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0>
+ 1705579463U, // <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5>
+ 2779173841U, // <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6>
+ 1705431864U, // <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+ 1705432032U, // <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3>
+ 1705579495U, // <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1>
+ 1242320994U, // <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+ 1705432058U, // <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2>
+ 3841414146U, // <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1>
+ 2316063226U, // <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3>
+ 2779173908U, // <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1>
+ 2658242658U, // <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0>
+ 2658243468U, // <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0>
+ 2316063554U, // <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7>
+ 1705432121U, // <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2>
+ 3852915777U, // <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1>
+ 2779173962U, // <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1>
+ 2779173973U, // <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3>
+ 3389813242U, // <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3>
+ 3852915813U, // <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1>
+ 3852915821U, // <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0>
+ 3835294839U, // <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1>
+ 2329343596U, // <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7>
+ 2779174027U, // <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3>
+ 2803061908U, // <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3>
+ 3852915869U, // <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3>
+ 2779174053U, // <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2>
+ 2779174060U, // <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0>
+ 2803061944U, // <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3>
+ 3852915905U, // <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3>
+ 2767672522U, // <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3>
+ 2791855315U, // <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3>
+ 2768999644U, // <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3>
+ 2779174115U, // <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1>
+ 3852915948U, // <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1>
+ 3841414394U, // <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6>
+ 1245663738U, // <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+ 2779174155U, // <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5>
+ 3852915988U, // <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5>
+ 2706827959U, // <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7>
+ 2319405890U, // <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7>
+ 1245663738U, // <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+ 2779174200U, // <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5>
+ 3852916030U, // <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2>
+ 3714099130U, // <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7>
+ 2316095994U, // <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3>
+ 1242353766U, // <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+ 1705432422U, // <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6>
+ 2658276240U, // <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4>
+ 2316096322U, // <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7>
+ 1705432449U, // <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6>
+ 3852916101U, // <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1>
+ 3854906765U, // <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0>
+ 3852916121U, // <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3>
+ 3389846010U, // <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3>
+ 3852916141U, // <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5>
+ 2779174326U, // <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5>
+ 2779174337U, // <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7>
+ 2329376364U, // <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7>
+ 2779321811U, // <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7>
+ 2658287718U, // <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS
+ 3852916197U, // <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7>
+ 2779174382U, // <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7>
+ 2316112378U, // <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3>
+ 2658290998U, // <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS
+ 3852916233U, // <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7>
+ 1651004226U, // <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7>
+ 2779174420U, // <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0>
+ 1652331492U, // <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7>
+ 1590526054U, // <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS
+ 2328728623U, // <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1>
+ 2724746451U, // <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3>
+ 2322092538U, // <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3>
+ 1590529334U, // <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS
+ 2328728951U, // <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5>
+ 2724746770U, // <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7>
+ 430361910U, // <7,7,7,7>: Cost 1 vspltisw3 RHS
+ 430361910U, // <7,7,7,u>: Cost 1 vspltisw3 RHS
+ 1242320994U, // <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+ 1705580162U, // <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2>
+ 2779321996U, // <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3>
+ 1245663738U, // <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+ 1242353766U, // <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+ 1705580202U, // <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6>
+ 1662949620U, // <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7>
+ 430361910U, // <7,7,u,7>: Cost 1 vspltisw3 RHS
+ 430361910U, // <7,7,u,u>: Cost 1 vspltisw3 RHS
+ 1705426944U, // <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+ 1705432787U, // <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2>
+ 2316060885U, // <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2>
+ 1242316956U, // <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+ 2779174637U, // <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1>
+ 1182750874U, // <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS
+ 2316061213U, // <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6>
+ 1242320200U, // <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+ 1705432850U, // <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2>
+ 1584578662U, // <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS
+ 1705427764U, // <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+ 631691054U, // <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS
+ 2640407307U, // <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1>
+ 1584581942U, // <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS
+ 2779174726U, // <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0>
+ 1584583574U, // <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1>
+ 2779322201U, // <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1>
+ 631691108U, // <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS
+ 2779174763U, // <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1>
+ 2779174774U, // <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3>
+ 1705428584U, // <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+ 1705432965U, // <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0>
+ 2779174801U, // <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3>
+ 2779174810U, // <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3>
+ 2767673251U, // <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3>
+ 1705580460U, // <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3>
+ 1705433010U, // <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0>
+ 1705433020U, // <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1>
+ 2779174853U, // <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1>
+ 2767673299U, // <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6>
+ 1245659292U, // <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+ 1705433060U, // <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5>
+ 2779174893U, // <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5>
+ 2706836152U, // <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u>
+ 1245662536U, // <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+ 1705433092U, // <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1>
+ 2779174925U, // <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1>
+ 1185732398U, // <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS
+ 2316093653U, // <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2>
+ 1242349724U, // <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+ 1705430224U, // <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+ 1705433151U, // <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6>
+ 2316093981U, // <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6>
+ 1242352968U, // <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+ 1705433178U, // <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6>
+ 1584611430U, // <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS
+ 2781165670U, // <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0>
+ 2640439226U, // <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7>
+ 2640440079U, // <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5>
+ 1584614710U, // <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS
+ 1705431044U, // <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+ 631691418U, // <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS
+ 2779322525U, // <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1>
+ 631691436U, // <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS
+ 2779175087U, // <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1>
+ 2779175102U, // <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7>
+ 1648357887U, // <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u>
+ 1705433296U, // <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7>
+ 2779175127U, // <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5>
+ 2779175138U, // <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7>
+ 1651012419U, // <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u>
+ 1705580788U, // <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7>
+ 1705433341U, // <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7>
+ 1705580800U, // <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1>
+ 1187878702U, // <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+ 2768042263U, // <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6>
+ 1248346268U, // <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+ 1705580840U, // <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5>
+ 1187879066U, // <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+ 2779322679U, // <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2>
+ 430361910U, // <7,u,7,7>: Cost 1 vspltisw3 RHS
+ 430361910U, // <7,u,7,u>: Cost 1 vspltisw3 RHS
+ 1705433425U, // <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1>
+ 1705433435U, // <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2>
+ 631691621U, // <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS
+ 1705433451U, // <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0>
+ 1705433465U, // <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5>
+ 1705433475U, // <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6>
+ 631691661U, // <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS
+ 430361910U, // <7,u,u,7>: Cost 1 vspltisw3 RHS
+ 631691675U, // <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS
+ 202162278U, // <u,0,0,0>: Cost 1 vspltisw0 LHS
+ 1678598154U, // <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+ 2634500154U, // <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0>
+ 2289596269U, // <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3>
+ 1548815670U, // <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS
+ 2663698530U, // <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+ 2658390942U, // <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0>
+ 2289596597U, // <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7>
+ 202162278U, // <u,0,0,u>: Cost 1 vspltisw0 LHS
+ 1560764518U, // <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS
+ 115720294U, // <u,0,1,1>: Cost 1 vmrghw LHS, LHS
+ 604856427U, // <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+ 2634508438U, // <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2>
+ 1560767798U, // <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS
+ 2652426438U, // <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1>
+ 1584657311U, // <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1>
+ 2658399226U, // <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2>
+ 604856476U, // <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+ 2696889850U, // <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0>
+ 1190174822U, // <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+ 2692245096U, // <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2>
+ 2692245158U, // <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1>
+ 2263916882U, // <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5>
+ 2299709908U, // <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+ 2692245434U, // <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7>
+ 2701535281U, // <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0>
+ 1190175389U, // <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS
+ 1209237504U, // <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+ 1209239206U, // <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+ 2704189813U, // <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0>
+ 2692245916U, // <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3>
+ 2282981033U, // <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+ 2664386658U, // <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0>
+ 2691877496U, // <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+ 2664388218U, // <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3>
+ 1209239213U, // <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+ 2289623040U, // <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0>
+ 1678598482U, // <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+ 2634532926U, // <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4>
+ 2235580672U, // <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+ 1143619922U, // <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+ 1618505014U, // <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+ 2658423714U, // <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4>
+ 2713259464U, // <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+ 1683243409U, // <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+ 1192443904U, // <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+ 118702182U, // <u,0,5,1>: Cost 1 vmrghw RHS, LHS
+ 2266185901U, // <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+ 2640513816U, // <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5>
+ 1192444242U, // <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+ 2718789636U, // <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5>
+ 1645047915U, // <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0>
+ 2664404604U, // <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5>
+ 118702749U, // <u,0,5,u>: Cost 1 vmrghw RHS, LHS
+ 2302910464U, // <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0>
+ 1192886374U, // <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+ 2718790138U, // <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3>
+ 2722771537U, // <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0>
+ 2266628434U, // <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5>
+ 2248950180U, // <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+ 2718790456U, // <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6>
+ 2718790478U, // <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1>
+ 1192886941U, // <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS
+ 1235812352U, // <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+ 1235814054U, // <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+ 2728080601U, // <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0>
+ 2640530202U, // <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7>
+ 2640530742U, // <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS
+ 2309556692U, // <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+ 2730735133U, // <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0>
+ 2309556856U, // <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+ 1235814061U, // <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+ 202162278U, // <u,0,u,0>: Cost 1 vspltisw0 LHS
+ 120365158U, // <u,0,u,1>: Cost 1 vmrghw LHS, LHS
+ 604856989U, // <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+ 2692249532U, // <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1>
+ 1560825142U, // <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS
+ 1618507930U, // <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+ 1584714662U, // <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u>
+ 2309565048U, // <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+ 604857043U, // <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+ 1611210825U, // <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1>
+ 1616519270U, // <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS
+ 2287605459U, // <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2>
+ 2640546588U, // <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0>
+ 2622631222U, // <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS
+ 2289590610U, // <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5>
+ 2664436630U, // <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1>
+ 2664437376U, // <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0>
+ 1616519889U, // <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1>
+ 1548894866U, // <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1>
+ 269271142U, // <u,1,1,1>: Cost 1 vspltisw1 LHS
+ 1189462934U, // <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+ 2622638230U, // <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2>
+ 1548897590U, // <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS
+ 2756985692U, // <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+ 2658472872U, // <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1>
+ 2287614142U, // <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7>
+ 269271142U, // <u,1,1,u>: Cost 1 vspltisw1 LHS
+ 1566818406U, // <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS
+ 2756985735U, // <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+ 1148371862U, // <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1566821686U, // <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS
+ 2756985771U, // <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+ 2690262970U, // <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7>
+ 1590711938U, // <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2>
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 2282979337U, // <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+ 1209237514U, // <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+ 1209239702U, // <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+ 2282979502U, // <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+ 2282979341U, // <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+ 1209237842U, // <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+ 2282979505U, // <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+ 2287625423U, // <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+ 1209237521U, // <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+ 1635101613U, // <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1>
+ 2289623050U, // <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1>
+ 2289625238U, // <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2>
+ 2640579360U, // <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4>
+ 2622663990U, // <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS
+ 1616522550U, // <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+ 2664469398U, // <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1>
+ 2664470148U, // <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4>
+ 1616522793U, // <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+ 1548927638U, // <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5>
+ 1192444724U, // <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+ 1192444822U, // <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+ 2622670998U, // <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2>
+ 1548930358U, // <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS
+ 1210728786U, // <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+ 2714153058U, // <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0>
+ 2670449658U, // <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2>
+ 1548932910U, // <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS
+ 2622677655U, // <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6>
+ 2756986063U, // <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+ 2302912662U, // <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2>
+ 3696421014U, // <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2>
+ 2622680374U, // <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS
+ 2756986099U, // <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+ 2714153784U, // <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6>
+ 1651692438U, // <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1>
+ 1652356071U, // <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1>
+ 2628657254U, // <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS
+ 1235812362U, // <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+ 1235814550U, // <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+ 2309554350U, // <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+ 2628660534U, // <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS
+ 1235812690U, // <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+ 2309554353U, // <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+ 2309554678U, // <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+ 1235812369U, // <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+ 1548952217U, // <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u>
+ 269271142U, // <u,1,u,1>: Cost 1 vspltisw1 LHS
+ 1209280662U, // <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1548954934U, // <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS
+ 1209278802U, // <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+ 2283020465U, // <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+ 1590761096U, // <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u>
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 2702876672U, // <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0>
+ 1629134950U, // <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS
+ 2289591912U, // <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2>
+ 1215848550U, // <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+ 2702877010U, // <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5>
+ 2289222708U, // <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+ 2779178473U, // <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1>
+ 2726249024U, // <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+ 1215848555U, // <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS
+ 2690933539U, // <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2>
+ 2628683124U, // <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1>
+ 1189463656U, // <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+ 1213866086U, // <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+ 2628685110U, // <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS
+ 2263205736U, // <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6>
+ 1189463994U, // <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+ 2263205866U, // <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+ 1213866091U, // <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS
+ 1556938854U, // <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+ 2697569869U, // <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2>
+ 336380006U, // <u,2,2,2>: Cost 1 vspltisw2 LHS
+ 1678599794U, // <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+ 1556942134U, // <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+ 2295138061U, // <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+ 2702878650U, // <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7>
+ 2300229831U, // <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7>
+ 336380006U, // <u,2,2,u>: Cost 1 vspltisw2 LHS
+ 475243165U, // <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+ 1548985140U, // <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 1209239144U, // <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+ 135495782U, // <u,2,3,3>: Cost 1 vmrglw LHS, LHS
+ 475245878U, // <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+ 1596764164U, // <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+ 1596764666U, // <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+ 1596765178U, // <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 135495787U, // <u,2,3,u>: Cost 1 vmrglw LHS, LHS
+ 2708851630U, // <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2>
+ 2217362979U, // <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+ 2289624680U, // <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2>
+ 1215881318U, // <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+ 2726767824U, // <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4>
+ 1629138230U, // <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+ 2779178801U, // <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5>
+ 2726251976U, // <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+ 1215881323U, // <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS
+ 2628714598U, // <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS
+ 2628715896U, // <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5>
+ 1192445544U, // <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+ 1213898854U, // <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+ 2628717878U, // <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS
+ 2726768644U, // <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5>
+ 1192445882U, // <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+ 2266187754U, // <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+ 1213898859U, // <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS
+ 2634694758U, // <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS
+ 2721460657U, // <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2>
+ 2296940136U, // <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2>
+ 1678600122U, // <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+ 2634698038U, // <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS
+ 3370682125U, // <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5>
+ 1157056442U, // <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+ 2725442455U, // <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2>
+ 1678600167U, // <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+ 1653027897U, // <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2>
+ 2309554924U, // <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+ 1235813992U, // <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+ 162070630U, // <u,2,7,3>: Cost 1 vmrglw RHS, LHS
+ 2634706230U, // <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS
+ 2309555252U, // <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+ 2309555901U, // <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+ 2309555416U, // <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+ 162070635U, // <u,2,7,u>: Cost 1 vmrglw RHS, LHS
+ 475284130U, // <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+ 1549026100U, // <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+ 336380006U, // <u,2,u,2>: Cost 1 vspltisw2 LHS
+ 135536742U, // <u,2,u,3>: Cost 1 vmrglw LHS, LHS
+ 475286838U, // <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+ 1629141146U, // <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+ 1194108858U, // <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+ 1596806138U, // <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+ 135536747U, // <u,2,u,u>: Cost 1 vmrglw LHS, LHS
+ 1611890688U, // <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+ 538149020U, // <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+ 2685632685U, // <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+ 2685632764U, // <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+ 1611891026U, // <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+ 2733408722U, // <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+ 2658612153U, // <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0>
+ 2289592250U, // <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7>
+ 538149533U, // <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+ 1189464214U, // <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+ 1611891508U, // <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+ 1611891606U, // <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+ 1189464476U, // <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+ 1189464578U, // <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+ 2690278511U, // <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+ 2690278607U, // <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+ 2287609786U, // <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7>
+ 1611892092U, // <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+ 2685634042U, // <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0>
+ 2685634079U, // <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+ 1611892328U, // <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+ 1611892390U, // <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+ 2685634371U, // <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5>
+ 2685634453U, // <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6>
+ 1611892666U, // <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+ 2300225466U, // <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7>
+ 1611892795U, // <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1>
+ 1209238422U, // <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+ 2282980247U, // <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1>
+ 1561004120U, // <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3>
+ 403488870U, // <u,3,3,3>: Cost 1 vspltisw3 LHS
+ 1209238426U, // <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+ 2282980899U, // <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+ 2282985598U, // <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+ 1209239482U, // <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+ 403488870U, // <u,3,3,u>: Cost 1 vspltisw3 LHS
+ 1555038310U, // <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS
+ 1555039616U, // <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4>
+ 2628781672U, // <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2>
+ 2289624690U, // <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3>
+ 1555041590U, // <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS
+ 538152246U, // <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+ 2658644925U, // <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4>
+ 2289625018U, // <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7>
+ 538152489U, // <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+ 1192446102U, // <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+ 2733411983U, // <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+ 2634762330U, // <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5>
+ 1192446364U, // <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+ 1192446466U, // <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+ 1659670532U, // <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+ 1659670626U, // <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+ 2287642554U, // <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7>
+ 1659670788U, // <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+ 2634768486U, // <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS
+ 2733412775U, // <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+ 1648390659U, // <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3>
+ 2634770973U, // <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6>
+ 2634771766U, // <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS
+ 2733413099U, // <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+ 1659671352U, // <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+ 1659671374U, // <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+ 1652372457U, // <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3>
+ 1561034854U, // <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+ 2634777396U, // <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1>
+ 1561036892U, // <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7>
+ 1235814002U, // <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+ 1561038134U, // <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS
+ 2309555747U, // <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+ 2309556072U, // <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6>
+ 1235814330U, // <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+ 1561040686U, // <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+ 1611896531U, // <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2>
+ 538154798U, // <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+ 1611896712U, // <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3>
+ 403488870U, // <u,3,u,3>: Cost 1 vspltisw3 LHS
+ 1611896895U, // <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6>
+ 538155162U, // <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+ 1611897040U, // <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+ 1209280442U, // <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+ 538155365U, // <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+ 1165118354U, // <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+ 1618534502U, // <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+ 2634795102U, // <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0>
+ 2686451968U, // <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+ 2692276562U, // <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5>
+ 1705438098U, // <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+ 2658685890U, // <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0>
+ 2256489928U, // <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+ 1618535069U, // <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+ 1189464978U, // <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+ 2692277044U, // <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1>
+ 1618535367U, // <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4>
+ 2640775992U, // <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1>
+ 1189465296U, // <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+ 115723574U, // <u,4,1,5>: Cost 1 vmrghw LHS, RHS
+ 2263207289U, // <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+ 2664666780U, // <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1>
+ 115723817U, // <u,4,1,u>: Cost 1 vmrghw LHS, RHS
+ 2263919506U, // <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1>
+ 2222115812U, // <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+ 2692277864U, // <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2>
+ 2692277926U, // <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1>
+ 2324114640U, // <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4>
+ 1190178102U, // <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+ 2692278202U, // <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7>
+ 2701568053U, // <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4>
+ 1190178345U, // <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS
+ 2692278422U, // <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2>
+ 2282981552U, // <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+ 2704222585U, // <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4>
+ 2692278684U, // <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3>
+ 1257016528U, // <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+ 1209239246U, // <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+ 2691910300U, // <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+ 2664683166U, // <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3>
+ 1209239249U, // <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+ 1573027942U, // <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS
+ 2634826695U, // <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4>
+ 2634827874U, // <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4>
+ 2289629073U, // <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3>
+ 229035318U, // <u,4,4,4>: Cost 1 vspltisw0 RHS
+ 1618537782U, // <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS
+ 2658718662U, // <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4>
+ 2289629401U, // <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7>
+ 229035318U, // <u,4,4,u>: Cost 1 vspltisw0 RHS
+ 1561092198U, // <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS
+ 2628863370U, // <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5>
+ 1561094243U, // <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5>
+ 2634836118U, // <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2>
+ 1561095478U, // <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS
+ 118705462U, // <u,4,5,5>: Cost 1 vmrghw RHS, RHS
+ 604859702U, // <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+ 2658726906U, // <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2>
+ 604859720U, // <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+ 2266631058U, // <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1>
+ 2302692152U, // <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+ 2718822906U, // <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3>
+ 2722804309U, // <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4>
+ 2723467942U, // <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4>
+ 1192889654U, // <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+ 2718823224U, // <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6>
+ 2718823246U, // <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1>
+ 1192889897U, // <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS
+ 2640822374U, // <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS
+ 2640823194U, // <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4>
+ 2728113373U, // <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4>
+ 2640825150U, // <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7>
+ 1235815632U, // <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+ 1235814094U, // <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+ 2730767905U, // <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4>
+ 2309556892U, // <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+ 1235814097U, // <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+ 1561116774U, // <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS
+ 1618540334U, // <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+ 1561118822U, // <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u>
+ 2692282300U, // <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1>
+ 229035318U, // <u,4,u,4>: Cost 1 vspltisw0 RHS
+ 120368438U, // <u,4,u,5>: Cost 1 vmrghw LHS, RHS
+ 604859945U, // <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+ 2309565084U, // <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+ 604859963U, // <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+ 2690293760U, // <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0>
+ 1616552038U, // <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+ 2640840434U, // <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5>
+ 2640841536U, // <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0>
+ 1613381970U, // <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+ 2316135642U, // <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5>
+ 2289592834U, // <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6>
+ 2664732324U, // <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0>
+ 1616552661U, // <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5>
+ 1573077094U, // <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+ 1237536282U, // <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+ 2690294678U, // <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0>
+ 2646821014U, // <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2>
+ 1573080602U, // <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1>
+ 1189466116U, // <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+ 1189466210U, // <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+ 2646823930U, // <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2>
+ 1573082926U, // <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+ 2640855142U, // <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS
+ 2697594448U, // <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5>
+ 2690295400U, // <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2>
+ 1625179890U, // <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5>
+ 2699585347U, // <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5>
+ 2781171471U, // <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+ 2690295738U, // <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7>
+ 3775318070U, // <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5>
+ 1628498055U, // <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5>
+ 2287627234U, // <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+ 1257016210U, // <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+ 2646836942U, // <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5>
+ 2287625131U, // <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+ 2287627238U, // <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+ 1257016538U, // <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+ 1209240066U, // <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+ 2287625459U, // <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+ 1209240068U, // <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+ 2640871526U, // <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS
+ 2316168082U, // <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1>
+ 2640873202U, // <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5>
+ 2640874308U, // <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4>
+ 1637788917U, // <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5>
+ 1616555318U, // <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+ 2287638591U, // <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6>
+ 2664765096U, // <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4>
+ 1616555561U, // <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+ 1573109862U, // <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS
+ 2646852404U, // <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1>
+ 2646853224U, // <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2>
+ 2287646618U, // <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3>
+ 1573113374U, // <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5>
+ 296144182U, // <u,5,5,5>: Cost 1 vspltisw1 RHS
+ 1192448098U, // <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+ 2287646946U, // <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7>
+ 296144182U, // <u,5,5,u>: Cost 1 vspltisw1 RHS
+ 1567146086U, // <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS
+ 2628945300U, // <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6>
+ 2634917997U, // <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6>
+ 1567148870U, // <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6>
+ 1567149366U, // <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS
+ 2781171799U, // <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+ 1228950018U, // <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 2628952166U, // <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS
+ 1235815314U, // <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+ 2309556734U, // <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+ 2309555115U, // <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+ 2628955446U, // <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS
+ 1235815642U, // <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+ 1235814914U, // <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+ 2309555443U, // <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+ 1235814916U, // <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+ 1567162470U, // <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS
+ 1616557870U, // <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+ 2690299781U, // <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0>
+ 1567165256U, // <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u>
+ 1567165750U, // <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS
+ 296144182U, // <u,5,u,5>: Cost 1 vspltisw1 RHS
+ 1209281026U, // <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 2705563648U, // <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0>
+ 1631821926U, // <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+ 2262462970U, // <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3>
+ 2646886941U, // <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6>
+ 2705563986U, // <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5>
+ 2316062652U, // <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+ 2316137272U, // <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6>
+ 1215851830U, // <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+ 1215851831U, // <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS
+ 2634948710U, // <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS
+ 2705564468U, // <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1>
+ 1189466618U, // <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+ 2263208498U, // <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+ 2693620843U, // <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6>
+ 2652868860U, // <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1>
+ 1189466936U, // <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+ 1213869366U, // <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+ 1213869367U, // <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS
+ 2658844774U, // <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS
+ 3771344465U, // <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6>
+ 1178554874U, // <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+ 2698929907U, // <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6>
+ 2699593540U, // <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6>
+ 2700257173U, // <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6>
+ 2705565626U, // <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7>
+ 1226485046U, // <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+ 1226485047U, // <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS
+ 2705565846U, // <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2>
+ 2330756585U, // <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+ 2330756829U, // <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2>
+ 2282981734U, // <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+ 1631824413U, // <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6>
+ 2652885246U, // <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3>
+ 1257018168U, // <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+ 135499062U, // <u,6,3,7>: Cost 1 vmrglw LHS, RHS
+ 135499063U, // <u,6,3,u>: Cost 1 vmrglw LHS, RHS
+ 2646917222U, // <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS
+ 2217365931U, // <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+ 2790167156U, // <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u>
+ 2646919709U, // <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6>
+ 2711538934U, // <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6>
+ 1631825206U, // <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+ 2316170040U, // <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6>
+ 1215884598U, // <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+ 1215884599U, // <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS
+ 2634981478U, // <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS
+ 2266190247U, // <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+ 1192448506U, // <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+ 2266190386U, // <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+ 2634984758U, // <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS
+ 2652901632U, // <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5>
+ 1192448824U, // <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+ 1213902134U, // <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+ 1213902135U, // <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS
+ 1583808614U, // <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+ 2322010445U, // <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+ 2718839290U, // <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3>
+ 2670823965U, // <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6>
+ 1583811894U, // <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+ 2724147961U, // <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6>
+ 363253046U, // <u,6,6,6>: Cost 1 vspltisw2 RHS
+ 1229172022U, // <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS
+ 363253046U, // <u,6,6,u>: Cost 1 vspltisw2 RHS
+ 499458150U, // <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+ 1573200692U, // <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+ 1573201512U, // <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1573202070U, // <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 499461673U, // <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+ 1573203972U, // <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+ 1235817272U, // <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+ 162073910U, // <u,6,7,7>: Cost 1 vmrglw RHS, RHS
+ 162073911U, // <u,6,7,u>: Cost 1 vmrglw RHS, RHS
+ 499466342U, // <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+ 1631827758U, // <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+ 1573209704U, // <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+ 1573210262U, // <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+ 499469866U, // <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+ 1631828122U, // <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+ 363253046U, // <u,6,u,6>: Cost 1 vspltisw2 RHS
+ 135540022U, // <u,6,u,7>: Cost 1 vmrglw LHS, RHS
+ 135540023U, // <u,6,u,u>: Cost 1 vmrglw LHS, RHS
+ 1638465536U, // <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+ 564723814U, // <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+ 2712207533U, // <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+ 2712207612U, // <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+ 1638465874U, // <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+ 1579192580U, // <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0>
+ 2712207862U, // <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+ 2316137282U, // <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7>
+ 564724381U, // <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+ 1189467130U, // <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+ 1638466356U, // <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+ 1638466454U, // <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+ 2311500282U, // <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3>
+ 1189467494U, // <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+ 2712208495U, // <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+ 2694956302U, // <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7>
+ 1189467756U, // <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+ 1638466940U, // <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+ 2712208829U, // <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+ 2712208927U, // <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+ 1638467176U, // <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+ 1638467238U, // <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+ 2712209165U, // <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+ 2712209256U, // <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+ 1627187175U, // <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7>
+ 2324116290U, // <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7>
+ 1628514441U, // <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7>
+ 1638467734U, // <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+ 2712209638U, // <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+ 2700929387U, // <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u>
+ 1638467996U, // <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+ 1638468098U, // <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+ 2712210002U, // <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+ 1585189856U, // <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3>
+ 1257018178U, // <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+ 1638468382U, // <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+ 1638468498U, // <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+ 2712210378U, // <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+ 2712210485U, // <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+ 2712210564U, // <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+ 1638468816U, // <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+ 564727112U, // <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+ 2712210809U, // <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+ 2712210888U, // <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0>
+ 564727337U, // <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+ 1192449018U, // <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+ 2714201743U, // <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+ 2712211198U, // <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+ 2311533050U, // <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3>
+ 1192449382U, // <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+ 1638469636U, // <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+ 1638469730U, // <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+ 1192449644U, // <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+ 1638469892U, // <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+ 2712211745U, // <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+ 2712211879U, // <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+ 1638470138U, // <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+ 2712212018U, // <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+ 2712212109U, // <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+ 2712212203U, // <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+ 1638470456U, // <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+ 1638470478U, // <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+ 1638470559U, // <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1>
+ 1235816546U, // <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+ 2309558371U, // <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1>
+ 2641045434U, // <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7>
+ 1235816954U, // <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+ 1235816550U, // <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+ 2309558375U, // <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5>
+ 1585222628U, // <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7>
+ 430361910U, // <u,7,7,7>: Cost 1 vspltisw3 RHS
+ 430361910U, // <u,7,7,u>: Cost 1 vspltisw3 RHS
+ 1638471379U, // <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+ 564729646U, // <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+ 1638471557U, // <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+ 1638471612U, // <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+ 1638471743U, // <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+ 564730010U, // <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+ 1638471888U, // <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+ 430361910U, // <u,7,u,7>: Cost 1 vspltisw3 RHS
+ 564730213U, // <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+ 202162278U, // <u,u,0,0>: Cost 1 vspltisw0 LHS
+ 538189985U, // <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+ 2685673645U, // <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+ 1215848604U, // <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+ 1611931986U, // <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+ 1579266317U, // <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0>
+ 2289592861U, // <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6>
+ 1215851848U, // <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+ 538190493U, // <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+ 1549411025U, // <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1>
+ 115726126U, // <u,u,1,1>: Cost 1 vmrghw LHS, LHS
+ 604862254U, // <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+ 1213866140U, // <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+ 1549413686U, // <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS
+ 115726490U, // <u,u,1,5>: Cost 1 vmrghw LHS, RHS
+ 1585247207U, // <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1>
+ 1213869384U, // <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+ 604862308U, // <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+ 1567334502U, // <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS
+ 1190180654U, // <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+ 336380006U, // <u,u,2,2>: Cost 1 vspltisw2 LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1567337782U, // <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS
+ 1190181018U, // <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+ 1611933626U, // <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+ 1226485064U, // <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 475685587U, // <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+ 1209239278U, // <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1>
+ 1209239765U, // <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+ 135495836U, // <u,u,3,3>: Cost 1 vmrglw LHS, LHS
+ 475688246U, // <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+ 1209239282U, // <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5>
+ 1209240093U, // <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+ 135499080U, // <u,u,3,7>: Cost 1 vmrglw LHS, RHS
+ 135495841U, // <u,u,3,u>: Cost 1 vmrglw LHS, LHS
+ 1555406950U, // <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS
+ 1555408301U, // <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4>
+ 2289625301U, // <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2>
+ 1215881372U, // <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+ 229035318U, // <u,u,4,4>: Cost 1 vspltisw0 RHS
+ 538193206U, // <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+ 2289625629U, // <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6>
+ 1215884616U, // <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+ 538193449U, // <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+ 1549443797U, // <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5>
+ 118708014U, // <u,u,5,1>: Cost 1 vmrghw RHS, LHS
+ 1561389191U, // <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5>
+ 1213898908U, // <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+ 1549446454U, // <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS
+ 118708378U, // <u,u,5,5>: Cost 1 vmrghw RHS, RHS
+ 604862618U, // <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+ 1213902152U, // <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+ 604862636U, // <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+ 1567367270U, // <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS
+ 1192892206U, // <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+ 1638478330U, // <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+ 1679046864U, // <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+ 1567370550U, // <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS
+ 1192892570U, // <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+ 363253046U, // <u,u,6,6>: Cost 1 vspltisw2 RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 499605606U, // <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+ 1235812425U, // <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+ 1561405577U, // <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7>
+ 162070684U, // <u,u,7,3>: Cost 1 vmrglw RHS, LHS
+ 499609147U, // <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+ 1235812753U, // <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+ 1235814941U, // <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+ 162073928U, // <u,u,7,7>: Cost 1 vmrglw RHS, RHS
+ 162070689U, // <u,u,7,u>: Cost 1 vmrglw RHS, LHS
+ 475726552U, // <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+ 538195758U, // <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+ 604862821U, // <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 475729206U, // <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+ 538196122U, // <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+ 604862861U, // <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0
+};
diff --git a/lib/Target/PowerPC/PPCPredicates.cpp b/lib/Target/PowerPC/PPCPredicates.cpp
new file mode 100644
index 0000000..08a2812
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPredicates.cpp
@@ -0,0 +1,30 @@
+//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCPredicates.h"
+#include <cassert>
+using namespace llvm;
+
+PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
+ switch (Opcode) {
+ default: assert(0 && "Unknown PPC branch opcode!");
+ case PPC::PRED_EQ: return PPC::PRED_NE;
+ case PPC::PRED_NE: return PPC::PRED_EQ;
+ case PPC::PRED_LT: return PPC::PRED_GE;
+ case PPC::PRED_GE: return PPC::PRED_LT;
+ case PPC::PRED_GT: return PPC::PRED_LE;
+ case PPC::PRED_LE: return PPC::PRED_GT;
+ case PPC::PRED_NU: return PPC::PRED_UN;
+ case PPC::PRED_UN: return PPC::PRED_NU;
+ }
+}
diff --git a/lib/Target/PowerPC/PPCPredicates.h b/lib/Target/PowerPC/PPCPredicates.h
new file mode 100644
index 0000000..b2c8315
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPredicates.h
@@ -0,0 +1,39 @@
+//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
+#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
+
+#include "PPC.h"
+
+namespace llvm {
+namespace PPC {
+ /// Predicate - These are "(BI << 5) | BO" for various predicates.
+ enum Predicate {
+ PRED_ALWAYS = (0 << 5) | 20,
+ PRED_LT = (0 << 5) | 12,
+ PRED_LE = (1 << 5) | 4,
+ PRED_EQ = (2 << 5) | 12,
+ PRED_GE = (0 << 5) | 4,
+ PRED_GT = (1 << 5) | 12,
+ PRED_NE = (2 << 5) | 4,
+ PRED_UN = (3 << 5) | 12,
+ PRED_NU = (3 << 5) | 4
+ };
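+
+  // For illustration only (not part of the upstream interface): a predicate
+  // unpacks back into its fields with plain shifts and masks, e.g.
+  //
+  //   unsigned BO = PRED_EQ & 31;  // == 12: branch if the condition bit is set
+  //   unsigned BI = PRED_EQ >> 5;  // ==  2: the EQ bit within the CR field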
+
+ /// Invert the specified predicate. != -> ==, < -> >=.
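+  /// e.g. InvertPredicate(PRED_LT) yields PRED_GE; PRED_ALWAYS has no inverse
+  /// and trips the "Unknown PPC branch opcode" assert in PPCPredicates.cpp.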
+ Predicate InvertPredicate(Predicate Opcode);
+}
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
new file mode 100644
index 0000000..5d5beeb
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -0,0 +1,1446 @@
+//===- PPCRegisterInfo.cpp - PowerPC Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCRegisterInfo.h"
+#include "PPCFrameInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cstdlib>
+using namespace llvm;
+
+// FIXME This disables some code that aligns the stack to a boundary
+// bigger than the default (16 bytes on Darwin) when there is a stack local
+// of greater alignment. This does not currently work, because the delta
+// between old and new stack pointers is added to offsets that reference
+// incoming parameters after the prolog is generated, and the code that
+// does that doesn't handle a variable delta. You don't want to do that
+// anyway; a better approach is to reserve another register that holds the
+// incoming stack pointer, and to reference parameters relative to that.
+#define ALIGN_STACK 0
+
+// FIXME (64-bit): Eventually enable by default.
+cl::opt<bool> EnablePPC32RS("enable-ppc32-regscavenger",
+ cl::init(false),
+ cl::desc("Enable PPC32 register scavenger"),
+ cl::Hidden);
+cl::opt<bool> EnablePPC64RS("enable-ppc64-regscavenger",
+ cl::init(false),
+ cl::desc("Enable PPC64 register scavenger"),
+ cl::Hidden);
+#define EnableRegisterScavenging \
+ ((EnablePPC32RS && !Subtarget.isPPC64()) || \
+ (EnablePPC64RS && Subtarget.isPPC64()))
+
+// FIXME (64-bit): Should be inlined.
+bool
+PPCRegisterInfo::requiresRegisterScavenging(const MachineFunction &) const {
+ return EnableRegisterScavenging;
+}
+
+/// getRegisterNumbering - Given the enum value for some register, e.g.
+/// PPC::F14, return the number that it corresponds to (e.g. 14).
+unsigned PPCRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
+ using namespace PPC;
+ switch (RegEnum) {
+ case 0: return 0;
+ case R0 : case X0 : case F0 : case V0 : case CR0: case CR0LT: return 0;
+ case R1 : case X1 : case F1 : case V1 : case CR1: case CR0GT: return 1;
+ case R2 : case X2 : case F2 : case V2 : case CR2: case CR0EQ: return 2;
+ case R3 : case X3 : case F3 : case V3 : case CR3: case CR0UN: return 3;
+ case R4 : case X4 : case F4 : case V4 : case CR4: case CR1LT: return 4;
+ case R5 : case X5 : case F5 : case V5 : case CR5: case CR1GT: return 5;
+ case R6 : case X6 : case F6 : case V6 : case CR6: case CR1EQ: return 6;
+ case R7 : case X7 : case F7 : case V7 : case CR7: case CR1UN: return 7;
+ case R8 : case X8 : case F8 : case V8 : case CR2LT: return 8;
+ case R9 : case X9 : case F9 : case V9 : case CR2GT: return 9;
+ case R10: case X10: case F10: case V10: case CR2EQ: return 10;
+ case R11: case X11: case F11: case V11: case CR2UN: return 11;
+ case R12: case X12: case F12: case V12: case CR3LT: return 12;
+ case R13: case X13: case F13: case V13: case CR3GT: return 13;
+ case R14: case X14: case F14: case V14: case CR3EQ: return 14;
+ case R15: case X15: case F15: case V15: case CR3UN: return 15;
+ case R16: case X16: case F16: case V16: case CR4LT: return 16;
+ case R17: case X17: case F17: case V17: case CR4GT: return 17;
+ case R18: case X18: case F18: case V18: case CR4EQ: return 18;
+ case R19: case X19: case F19: case V19: case CR4UN: return 19;
+ case R20: case X20: case F20: case V20: case CR5LT: return 20;
+ case R21: case X21: case F21: case V21: case CR5GT: return 21;
+ case R22: case X22: case F22: case V22: case CR5EQ: return 22;
+ case R23: case X23: case F23: case V23: case CR5UN: return 23;
+ case R24: case X24: case F24: case V24: case CR6LT: return 24;
+ case R25: case X25: case F25: case V25: case CR6GT: return 25;
+ case R26: case X26: case F26: case V26: case CR6EQ: return 26;
+ case R27: case X27: case F27: case V27: case CR6UN: return 27;
+ case R28: case X28: case F28: case V28: case CR7LT: return 28;
+ case R29: case X29: case F29: case V29: case CR7GT: return 29;
+ case R30: case X30: case F30: case V30: case CR7EQ: return 30;
+ case R31: case X31: case F31: case V31: case CR7UN: return 31;
+ default:
+ cerr << "Unhandled reg in PPCRegisterInfo::getRegisterNumbering!\n";
+ abort();
+ }
+}
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
+ const TargetInstrInfo &tii)
+ : PPCGenRegisterInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+ Subtarget(ST), TII(tii) {
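+  // ImmToIdxMap pairs each D-form (register + 16-bit displacement) opcode with
+  // its X-form (register + register) equivalent; eliminateFrameIndex rewrites
+  // through it when a frame offset does not fit in the 16-bit immediate.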
+ ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX;
+ ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX;
+ ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX;
+ ImmToIdxMap[PPC::LWZ] = PPC::LWZX; ImmToIdxMap[PPC::LWA] = PPC::LWAX;
+ ImmToIdxMap[PPC::LFS] = PPC::LFSX; ImmToIdxMap[PPC::LFD] = PPC::LFDX;
+ ImmToIdxMap[PPC::STH] = PPC::STHX; ImmToIdxMap[PPC::STW] = PPC::STWX;
+ ImmToIdxMap[PPC::STFS] = PPC::STFSX; ImmToIdxMap[PPC::STFD] = PPC::STFDX;
+ ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+
+ // 64-bit
+ ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
+ ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8;
+ ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8;
+ ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX;
+ ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; ImmToIdxMap[PPC::STD_32] = PPC::STDX_32;
+}
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass *PPCRegisterInfo::getPointerRegClass() const {
+ if (Subtarget.isPPC64())
+ return &PPC::G8RCRegClass;
+ else
+ return &PPC::GPRCRegClass;
+}
+
+const unsigned*
+PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ // 32-bit Darwin calling convention.
+ static const unsigned Macho32_CalleeSavedRegs[] = {
+ PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31,
+
+ PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+ PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+ PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+ PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+ PPC::F30, PPC::F31,
+
+ PPC::CR2, PPC::CR3, PPC::CR4,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+
+ PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+ PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+ PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+
+ PPC::LR, 0
+ };
+
+ static const unsigned ELF32_CalleeSavedRegs[] = {
+ PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31,
+
+ PPC::F9,
+ PPC::F10, PPC::F11, PPC::F12, PPC::F13,
+ PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+ PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+ PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+ PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+ PPC::F30, PPC::F31,
+
+ PPC::CR2, PPC::CR3, PPC::CR4,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+
+ PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+ PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+ PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+
+ PPC::LR, 0
+ };
+ // 64-bit Darwin calling convention.
+ static const unsigned Macho64_CalleeSavedRegs[] = {
+ PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31,
+
+ PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+ PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+ PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+ PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+ PPC::F30, PPC::F31,
+
+ PPC::CR2, PPC::CR3, PPC::CR4,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+
+ PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+ PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+ PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+
+ PPC::LR8, 0
+ };
+
+ if (Subtarget.isMachoABI())
+ return Subtarget.isPPC64() ? Macho64_CalleeSavedRegs :
+ Macho32_CalleeSavedRegs;
+
+ // ELF 32.
+ return ELF32_CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+PPCRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ // 32-bit Macho calling convention.
+ static const TargetRegisterClass * const Macho32_CalleeSavedRegClasses[] = {
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+
+ &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+
+ &PPC::GPRCRegClass, 0
+ };
+
+ static const TargetRegisterClass * const ELF32_CalleeSavedRegClasses[] = {
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+ &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+
+ &PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+
+ &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+
+ &PPC::GPRCRegClass, 0
+ };
+
+ // 64-bit Macho calling convention.
+ static const TargetRegisterClass * const Macho64_CalleeSavedRegClasses[] = {
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+
+ &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+
+ &PPC::G8RCRegClass, 0
+ };
+
+ if (Subtarget.isMachoABI())
+ return Subtarget.isPPC64() ? Macho64_CalleeSavedRegClasses :
+ Macho32_CalleeSavedRegClasses;
+
+ // ELF 32.
+ return ELF32_CalleeSavedRegClasses;
+}
+
+// needsFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+static bool needsFP(const MachineFunction &MF) {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return NoFramePointerElim || MFI->hasVarSizedObjects() ||
+ (PerformTailCallOpt && MF.getInfo<PPCFunctionInfo>()->hasFastCall());
+}
+
+static bool spillsCR(const MachineFunction &MF) {
+ const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ return FuncInfo->isCRSpilled();
+}
+
+BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ Reserved.set(PPC::R0);
+ Reserved.set(PPC::R1);
+ Reserved.set(PPC::LR);
+ Reserved.set(PPC::LR8);
+ Reserved.set(PPC::RM);
+
+ // In Linux, r2 is reserved for the OS.
+ if (!Subtarget.isDarwin())
+ Reserved.set(PPC::R2);
+
+ // On PPC64, r13 is the thread pointer. Never allocate this register. Note
+  // that this is overly conservative, as it also prevents allocation of R31 when
+ // the FP is not needed.
+ if (Subtarget.isPPC64()) {
+ Reserved.set(PPC::R13);
+ Reserved.set(PPC::R31);
+
+ if (!EnableRegisterScavenging)
+ Reserved.set(PPC::R0); // FIXME (64-bit): Remove
+
+ Reserved.set(PPC::X0);
+ Reserved.set(PPC::X1);
+ Reserved.set(PPC::X13);
+ Reserved.set(PPC::X31);
+ }
+
+ if (needsFP(MF))
+ Reserved.set(PPC::R31);
+
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register. This is true if the function needs a frame pointer and has
+// a non-zero stack size.
+bool PPCRegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return MFI->getStackSize() && needsFP(MF);
+}
+
+/// MustSaveLR - Return true if this function requires that we save the LR
+/// register onto the stack in the prolog and restore it in the epilog of the
+/// function.
+static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
+ const PPCFunctionInfo *MFI = MF.getInfo<PPCFunctionInfo>();
+
+ // We need a save/restore of LR if there is any def of LR (which is
+ // defined by calls, including the PIC setup sequence), or if there is
+ // some use of the LR stack slot (e.g. for builtin_return_address).
+ // (LR comes in 32 and 64 bit versions.)
+ MachineRegisterInfo::def_iterator RI = MF.getRegInfo().def_begin(LR);
+  return RI != MF.getRegInfo().def_end() || MFI->isLRStoreRequired();
+}
+
+
+
+void PPCRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (PerformTailCallOpt && I->getOpcode() == PPC::ADJCALLSTACKUP) {
+ // Add (actually subtract) back the amount the callee popped on return.
+ if (int CalleeAmt = I->getOperand(1).getImm()) {
+ bool is64Bit = Subtarget.isPPC64();
+ CalleeAmt *= -1;
+ unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1;
+ unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0;
+ unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI;
+ unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4;
+ unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS;
+ unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI;
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+
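+      // A 16-bit amount folds into one addi; anything wider is materialized
+      // with the usual lis/ori pair -- e.g. (illustrative) 0x12345 becomes
+      // "lis r0, 0x1; ori r0, r0, 0x2345" before the add.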
+ if (isInt16(CalleeAmt)) {
+ BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg).addReg(StackReg).
+ addImm(CalleeAmt);
+ } else {
+ MachineBasicBlock::iterator MBBI = I;
+ BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+ .addImm(CalleeAmt >> 16);
+ BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+ .addReg(TmpReg, RegState::Kill)
+ .addImm(CalleeAmt & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, TII.get(ADDInstr))
+ .addReg(StackReg)
+ .addReg(StackReg)
+ .addReg(TmpReg);
+ }
+ }
+ }
+ // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+ MBB.erase(I);
+}
+
+/// findScratchRegister - Find a 'free' PPC register. Try for a call-clobbered
+/// register first and then a spilled callee-saved register if that fails.
+static
+unsigned findScratchRegister(MachineBasicBlock::iterator II, RegScavenger *RS,
+ const TargetRegisterClass *RC, int SPAdj) {
+ assert(RS && "Register scavenging must be on");
+ unsigned Reg = RS->FindUnusedReg(RC, true);
+ // FIXME: move ARM callee-saved reg scan to target independent code, then
+ // search for already spilled CS register here.
+ if (Reg == 0)
+ Reg = RS->scavengeRegister(RC, II, SPAdj);
+ return Reg;
+}
+
+/// lowerDynamicAlloc - Generate the code for allocating an object in the
+/// current frame. The sequence of code will be in the general form
+///
+///   addi  R0, SP, \#frameSize ; get the address of the previous frame
+///   stwux R0, SP, Rnegsize    ; add and update the SP with the negated size
+///   addi  Rnew, SP, \#maxCallFrameSize ; get the top of the allocation
+///
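+/// As an illustrative instance, a 32-bit function with frameSize = 64 and
+/// maxCallFrameSize = 32 would get roughly:
+///
+///   addi  r0, r1, 64    ; r0 = address of the previous frame
+///   stwux r0, r1, rNeg  ; push the frame, storing the back-chain at 0(r1)
+///   addi  rNew, r1, 32  ; new allocation sits just above the call frame
+///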
+void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const {
+ // Get the instruction.
+ MachineInstr &MI = *II;
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ // Get the basic block's function.
+ MachineFunction &MF = *MBB.getParent();
+ // Get the frame info.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Determine whether 64-bit pointers are used.
+ bool LP64 = Subtarget.isPPC64();
+ DebugLoc dl = MI.getDebugLoc();
+
+ // Get the maximum call stack size.
+ unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+ // Get the total frame size.
+ unsigned FrameSize = MFI->getStackSize();
+
+ // Get stack alignments.
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+ unsigned MaxAlign = MFI->getMaxAlignment();
+ assert(MaxAlign <= TargetAlign &&
+ "Dynamic alloca with large aligns not supported");
+
+ // Determine the previous frame's address. If FrameSize can't be
+ // represented as 16 bits or we need special alignment, then we load the
+ // previous frame's address from 0(SP). Why not do an addis of the hi?
+ // Because R0 is our only safe tmp register and addi/addis treat R0 as zero.
+ // Constructing the constant and adding would take 3 instructions.
+ // Fortunately, a frame greater than 32K is rare.
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+ const TargetRegisterClass *RC = LP64 ? G8RC : GPRC;
+
+ // FIXME (64-bit): Use "findScratchRegister"
+ unsigned Reg;
+ if (EnableRegisterScavenging)
+ Reg = findScratchRegister(II, RS, RC, SPAdj);
+ else
+ Reg = PPC::R0;
+
+ if (MaxAlign < TargetAlign && isInt16(FrameSize)) {
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+ .addReg(PPC::R31)
+ .addImm(FrameSize);
+ } else if (LP64) {
+ if (EnableRegisterScavenging) // FIXME (64-bit): Use "true" part.
+ BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
+ .addImm(0)
+ .addReg(PPC::X1);
+ else
+ BuildMI(MBB, II, dl, TII.get(PPC::LD), PPC::X0)
+ .addImm(0)
+ .addReg(PPC::X1);
+ } else {
+ BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
+ .addImm(0)
+ .addReg(PPC::R1);
+ }
+
+ // Grow the stack and update the stack pointer link, then determine the
+ // address of new allocated space.
+ if (LP64) {
+ if (EnableRegisterScavenging) // FIXME (64-bit): Use "true" part.
+ BuildMI(MBB, II, dl, TII.get(PPC::STDUX))
+ .addReg(Reg, RegState::Kill)
+ .addReg(PPC::X1)
+ .addReg(MI.getOperand(1).getReg());
+ else
+ BuildMI(MBB, II, dl, TII.get(PPC::STDUX))
+ .addReg(PPC::X0, RegState::Kill)
+ .addReg(PPC::X1)
+ .addReg(MI.getOperand(1).getReg());
+
+ if (!MI.getOperand(1).isKill())
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+ .addReg(PPC::X1)
+ .addImm(maxCallFrameSize);
+ else
+ // Implicitly kill the register.
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+ .addReg(PPC::X1)
+ .addImm(maxCallFrameSize)
+ .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
+ } else {
+ BuildMI(MBB, II, dl, TII.get(PPC::STWUX))
+ .addReg(Reg, RegState::Kill)
+ .addReg(PPC::R1)
+ .addReg(MI.getOperand(1).getReg());
+
+ if (!MI.getOperand(1).isKill())
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+ .addReg(PPC::R1)
+ .addImm(maxCallFrameSize);
+ else
+ // Implicitly kill the register.
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+ .addReg(PPC::R1)
+ .addImm(maxCallFrameSize)
+ .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
+ }
+
+ // Discard the DYNALLOC instruction.
+ MBB.erase(II);
+}
+
+/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
+/// reserving a whole register (R0), we scrounge for one here. This generates
+/// code like this:
+///
+/// mfcr rA ; Move the conditional register into GPR rA.
+/// rlwinm rA, rA, SB, 0, 31 ; Shift the bits left so they are in CR0's slot.
+/// stw rA, FI ; Store rA to the frame.
+///
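+/// As a worked (hypothetical) case: spilling CR7 gives a shift amount of
+/// getRegisterNumbering(CR7) * 4 == 28, i.e. "rlwinm rA, rA, 28, 0, 31",
+/// rotating CR7's four bits around into CR0's slot.
+///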
+void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex, int SPAdj,
+ RegScavenger *RS) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; SPILL_CR <SrcReg>, <offset>, <FI>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+ const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC;
+ unsigned Reg = findScratchRegister(II, RS, RC, SPAdj);
+
+ // We need to store the CR in the low 4-bits of the saved value. First, issue
+ // an MFCR to save all of the CRBits. Add an implicit kill of the CR.
+ if (!MI.getOperand(0).isKill())
+ BuildMI(MBB, II, dl, TII.get(PPC::MFCR), Reg);
+ else
+ // Implicitly kill the CR register.
+ BuildMI(MBB, II, dl, TII.get(PPC::MFCR), Reg)
+ .addReg(MI.getOperand(0).getReg(), RegState::ImplicitKill);
+
+ // If the saved register wasn't CR0, shift the bits left so that they are in
+ // CR0's slot.
+ unsigned SrcReg = MI.getOperand(0).getReg();
+ if (SrcReg != PPC::CR0)
+ // rlwinm rA, rA, ShiftBits, 0, 31.
+ BuildMI(MBB, II, dl, TII.get(PPC::RLWINM), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(PPCRegisterInfo::getRegisterNumbering(SrcReg) * 4)
+ .addImm(0)
+ .addImm(31);
+
+ addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW))
+ .addReg(Reg, getKillRegState(MI.getOperand(1).getImm())),
+ FrameIndex);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ // Get the instruction.
+ MachineInstr &MI = *II;
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ // Get the basic block's function.
+ MachineFunction &MF = *MBB.getParent();
+ // Get the frame info.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ // Find out which operand is the frame index.
+ unsigned FIOperandNo = 0;
+ while (!MI.getOperand(FIOperandNo).isFI()) {
+ ++FIOperandNo;
+ assert(FIOperandNo != MI.getNumOperands() &&
+ "Instr doesn't have FrameIndex operand!");
+ }
+  // Take into account whether it's an add or a mem instruction.
+ unsigned OffsetOperandNo = (FIOperandNo == 2) ? 1 : 2;
+ if (MI.getOpcode() == TargetInstrInfo::INLINEASM)
+ OffsetOperandNo = FIOperandNo-1;
+
+ // Get the frame index.
+ int FrameIndex = MI.getOperand(FIOperandNo).getIndex();
+
+ // Get the frame pointer save index. Users of this index are primarily
+ // DYNALLOC instructions.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ int FPSI = FI->getFramePointerSaveIndex();
+ // Get the instruction opcode.
+ unsigned OpC = MI.getOpcode();
+
+ // Special case for dynamic alloca.
+ if (FPSI && FrameIndex == FPSI &&
+ (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
+ lowerDynamicAlloc(II, SPAdj, RS);
+ return;
+ }
+
+ // Special case for pseudo-op SPILL_CR.
+ if (EnableRegisterScavenging) // FIXME (64-bit): Enable by default.
+ if (OpC == PPC::SPILL_CR) {
+ lowerCRSpilling(II, FrameIndex, SPAdj, RS);
+ return;
+ }
+
+ // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
+ MI.getOperand(FIOperandNo).ChangeToRegister(hasFP(MF) ? PPC::R31 : PPC::R1,
+ false);
+
+  // Figure out if the offset in the instruction is shifted right two bits.
+  // This is true for DS-form instructions like "STD", to which the machine
+  // implicitly appends two low zero bits.
+ bool isIXAddr = false;
+ switch (OpC) {
+ case PPC::LWA:
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::STD_32:
+ isIXAddr = true;
+ break;
+ }
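+  // e.g. a DS-form "std" with a byte offset of 40 encodes the immediate as
+  // 40 >> 2 == 10; the hardware re-appends the two low zero bits.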
+
+ // Now add the frame object offset to the offset from r1.
+ int Offset = MFI->getObjectOffset(FrameIndex);
+ if (!isIXAddr)
+ Offset += MI.getOperand(OffsetOperandNo).getImm();
+ else
+ Offset += MI.getOperand(OffsetOperandNo).getImm() << 2;
+
+ // If we're not using a Frame Pointer that has been set to the value of the
+ // SP before having the stack size subtracted from it, then add the stack size
+ // to Offset to get the correct offset.
+ Offset += MFI->getStackSize();
+
+ // If we can, encode the offset directly into the instruction. If this is a
+ // normal PPC "ri" instruction, any 16-bit value can be safely encoded. If
+ // this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits
+ // clear can be encoded. This is extremely uncommon, because normally you
+ // only "std" to a stack slot that is at least 4-byte aligned, but it can
+ // happen in invalid code.
+ if (isInt16(Offset) && (!isIXAddr || (Offset & 3) == 0)) {
+ if (isIXAddr)
+ Offset >>= 2; // The actual encoded value has the low two bits zero.
+ MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // The offset doesn't fit into a single register, scavenge one to build the
+ // offset in.
+ // FIXME: figure out what SPAdj is doing here.
+
+ // FIXME (64-bit): Use "findScratchRegister".
+ unsigned SReg;
+ if (EnableRegisterScavenging)
+ SReg = findScratchRegister(II, RS, &PPC::GPRCRegClass, SPAdj);
+ else
+ SReg = PPC::R0;
+
+ // Insert a set of rA with the full offset value before the ld, st, or add
+ BuildMI(MBB, II, dl, TII.get(PPC::LIS), SReg)
+ .addImm(Offset >> 16);
+ BuildMI(MBB, II, dl, TII.get(PPC::ORI), SReg)
+ .addReg(SReg, RegState::Kill)
+ .addImm(Offset);
+
+ // Convert into indexed form of the instruction:
+ //
+  //   sth  0:rA, 1:imm, 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
+  //   addi 0:rA, 1:rB,  2:imm  ==> add  0:rA, 1:rB, 2:r0
+ unsigned OperandBase;
+
+ if (OpC != TargetInstrInfo::INLINEASM) {
+ assert(ImmToIdxMap.count(OpC) &&
+ "No indexed form of load or store available!");
+ unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
+ MI.setDesc(TII.get(NewOpcode));
+ OperandBase = 1;
+ } else {
+ OperandBase = OffsetOperandNo;
+ }
+
+ unsigned StackReg = MI.getOperand(FIOperandNo).getReg();
+ MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
+ MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false);
+}
+
+/// VRRegNo - Map from a numbered VR register to its enum value.
+///
+static const unsigned short VRRegNo[] = {
+ PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
+ PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+/// RemoveVRSaveCode - We have found that this function does not need any code
+/// to manipulate the VRSAVE register, even though it uses vector registers.
+/// This can happen when the only registers used are known to be live in or out
+/// of the function. Remove all of the VRSAVE related code from the function.
+static void RemoveVRSaveCode(MachineInstr *MI) {
+ MachineBasicBlock *Entry = MI->getParent();
+ MachineFunction *MF = Entry->getParent();
+
+ // We know that the MTVRSAVE instruction immediately follows MI. Remove it.
+ MachineBasicBlock::iterator MBBI = MI;
+ ++MBBI;
+ assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
+ MBBI->eraseFromParent();
+
+ bool RemovedAllMTVRSAVEs = true;
+ // See if we can find and remove the MTVRSAVE instruction from all of the
+ // epilog blocks.
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
+    // If the last instruction is a return, this is an epilog block; look for
+    // the MTVRSAVE to remove there.
+ if (!I->empty() && I->back().getDesc().isReturn()) {
+ bool FoundIt = false;
+ for (MBBI = I->end(); MBBI != I->begin(); ) {
+ --MBBI;
+ if (MBBI->getOpcode() == PPC::MTVRSAVE) {
+ MBBI->eraseFromParent(); // remove it.
+ FoundIt = true;
+ break;
+ }
+ }
+ RemovedAllMTVRSAVEs &= FoundIt;
+ }
+ }
+
+ // If we found and removed all MTVRSAVE instructions, remove the read of
+ // VRSAVE as well.
+ if (RemovedAllMTVRSAVEs) {
+ MBBI = MI;
+ assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
+ --MBBI;
+ assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
+ MBBI->eraseFromParent();
+ }
+
+ // Finally, nuke the UPDATE_VRSAVE.
+ MI->eraseFromParent();
+}
+
+// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
+// instruction selector. Based on the vector registers that have been used,
+// transform this into the appropriate ORI instruction.
+static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
+ MachineFunction *MF = MI->getParent()->getParent();
+ DebugLoc dl = MI->getDebugLoc();
+
+ unsigned UsedRegMask = 0;
+ for (unsigned i = 0; i != 32; ++i)
+ if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i]))
+ UsedRegMask |= 1 << (31-i);
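+  // (Illustration: if only V0 and V31 are used, UsedRegMask becomes
+  // (1 << 31) | 1 == 0x80000001 -- the most significant bit tracks V0.)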
+
+ // Live in and live out values already must be in the mask, so don't bother
+ // marking them.
+ for (MachineRegisterInfo::livein_iterator
+ I = MF->getRegInfo().livein_begin(),
+ E = MF->getRegInfo().livein_end(); I != E; ++I) {
+ unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(I->first);
+ if (VRRegNo[RegNo] == I->first) // If this really is a vector reg.
+ UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked.
+ }
+ for (MachineRegisterInfo::liveout_iterator
+ I = MF->getRegInfo().liveout_begin(),
+ E = MF->getRegInfo().liveout_end(); I != E; ++I) {
+ unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(*I);
+ if (VRRegNo[RegNo] == *I) // If this really is a vector reg.
+ UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked.
+ }
+
+  // If no registers (beyond live-ins/outs) are used, remove all VRSAVE code.
+ if (UsedRegMask == 0) {
+ // Remove all VRSAVE code.
+ RemoveVRSaveCode(MI);
+ return;
+ }
+
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ unsigned DstReg = MI->getOperand(0).getReg();
+
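+  // ori/oris each carry a 16-bit immediate, so a mask confined to one half of
+  // the word needs a single instruction; a mask spanning both halves takes an
+  // oris of the high half followed by an ori of the low half.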
+ if ((UsedRegMask & 0xFFFF) == UsedRegMask) {
+ if (DstReg != SrcReg)
+ BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+ .addReg(SrcReg)
+ .addImm(UsedRegMask);
+ else
+ BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+ .addReg(SrcReg, RegState::Kill)
+ .addImm(UsedRegMask);
+ } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) {
+ if (DstReg != SrcReg)
+ BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+ .addReg(SrcReg)
+ .addImm(UsedRegMask >> 16);
+ else
+ BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+ .addReg(SrcReg, RegState::Kill)
+ .addImm(UsedRegMask >> 16);
+ } else {
+ if (DstReg != SrcReg)
+ BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+ .addReg(SrcReg)
+ .addImm(UsedRegMask >> 16);
+ else
+ BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+ .addReg(SrcReg, RegState::Kill)
+ .addImm(UsedRegMask >> 16);
+
+ BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+ .addReg(DstReg, RegState::Kill)
+ .addImm(UsedRegMask & 0xFFFF);
+ }
+
+ // Remove the old UPDATE_VRSAVE instruction.
+ MI->eraseFromParent();
+}
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+void PPCRegisterInfo::determineFrameLayout(MachineFunction &MF) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Get the number of bytes to allocate from the FrameInfo
+ unsigned FrameSize = MFI->getStackSize();
+
+ // Get the alignments provided by the target, and the maximum alignment
+ // (if any) of the fixed frame objects.
+ unsigned MaxAlign = MFI->getMaxAlignment();
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned AlignMask = TargetAlign - 1; // Mask for rounding up to TargetAlign.
+
+ // If we are a leaf function, and use up to 224 bytes of stack space,
+ // don't have a frame pointer, calls, or dynamic alloca then we do not need
+ // to adjust the stack pointer (we fit in the Red Zone).
+ if (!DisableRedZone &&
+ FrameSize <= 224 && // Fits in red zone.
+ !MFI->hasVarSizedObjects() && // No dynamic alloca.
+ !MFI->hasCalls() && // No calls.
+ (!ALIGN_STACK || MaxAlign <= TargetAlign)) { // No special alignment.
+    // No need for a frame.
+ MFI->setStackSize(0);
+ return;
+ }
+
+ // Get the maximum call frame size of all the calls.
+ unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+
+ // Maximum call frame needs to be at least big enough for linkage and 8 args.
+ unsigned minCallFrameSize =
+ PPCFrameInfo::getMinCallFrameSize(Subtarget.isPPC64(),
+ Subtarget.isMachoABI());
+ maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
+
+ // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+ // that allocations will be aligned.
+ if (MFI->hasVarSizedObjects())
+ maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+
+ // Update maximum call frame size.
+ MFI->setMaxCallFrameSize(maxCallFrameSize);
+
+ // Include call frame size in total.
+ FrameSize += maxCallFrameSize;
+
+ // Make sure the frame is aligned.
+ FrameSize = (FrameSize + AlignMask) & ~AlignMask;
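+  // e.g. with TargetAlign == 16, a 52-byte frame rounds to (52 + 15) & ~15
+  // == 64.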
+
+ // Update frame info.
+ MFI->setStackSize(FrameSize);
+}
+
+void
+PPCRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ // Save and clear the LR state.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ unsigned LR = getRARegister();
+ FI->setMustSaveLR(MustSaveLR(MF, LR));
+ MF.getRegInfo().setPhysRegUnused(LR);
+
+ // Save R31 if necessary
+ int FPSI = FI->getFramePointerSaveIndex();
+ bool IsPPC64 = Subtarget.isPPC64();
+ bool IsELF32_ABI = Subtarget.isELF32_ABI();
+ bool IsMachoABI = Subtarget.isMachoABI();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // If the frame pointer save index hasn't been defined yet.
+ if (!FPSI && (NoFramePointerElim || MFI->hasVarSizedObjects()) &&
+ IsELF32_ABI) {
+ // Find out what the fix offset of the frame pointer save area.
+ int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64,
+ IsMachoABI);
+ // Allocate the frame index for frame pointer save area.
+ FPSI = MF.getFrameInfo()->CreateFixedObject(IsPPC64? 8 : 4, FPOffset);
+ // Save the result.
+ FI->setFramePointerSaveIndex(FPSI);
+ }
+
+ // Reserve stack space to move the linkage area to in case of a tail call.
+ int TCSPDelta = 0;
+  if (PerformTailCallOpt && (TCSPDelta = FI->getTailCallSPDelta()) < 0) {
+ int AddFPOffsetAmount = IsELF32_ABI ? -4 : 0;
+    MF.getFrameInfo()->CreateFixedObject(-1 * TCSPDelta,
+                                         AddFPOffsetAmount + TCSPDelta);
+ }
+ // Reserve a slot closest to SP or frame pointer if we have a dynalloc or
+ // a large stack, which will require scavenging a register to materialize a
+ // large offset.
+ // FIXME: this doesn't actually check stack size, so is a bit pessimistic
+ // FIXME: doesn't detect whether or not we need to spill vXX, which requires
+ // r0 for now.
+
+ if (EnableRegisterScavenging) // FIXME (64-bit): Enable.
+ if (needsFP(MF) || spillsCR(MF)) {
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *RC = IsPPC64 ? G8RC : GPRC;
+ RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+ RC->getAlignment()));
+ }
+}
+
+void
+PPCRegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
+ !MF.getFunction()->doesNotThrow() ||
+ UnwindTablesMandatory;
+
+ // Prepare for frame info.
+ unsigned FrameLabelId = 0;
+
+ // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it,
+ // process it.
+ for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
+ if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
+ HandleVRSaveUpdate(MBBI, TII);
+ break;
+ }
+ }
+
+ // Move MBBI back to the beginning of the function.
+ MBBI = MBB.begin();
+
+ // Work out frame sizes.
+ determineFrameLayout(MF);
+ unsigned FrameSize = MFI->getStackSize();
+
+ int NegFrameSize = -FrameSize;
+
+ // Get processor type.
+ bool IsPPC64 = Subtarget.isPPC64();
+ // Get operating system
+ bool IsMachoABI = Subtarget.isMachoABI();
+ // Check if the link register (LR) must be saved.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ bool MustSaveLR = FI->mustSaveLR();
+ // Do we have a frame pointer for this function?
+ bool HasFP = hasFP(MF) && FrameSize;
+
+ int LROffset = PPCFrameInfo::getReturnSaveOffset(IsPPC64, IsMachoABI);
+ int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, IsMachoABI);
+
+ if (IsPPC64) {
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR8), PPC::X0);
+
+ if (HasFP)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
+ .addReg(PPC::X31)
+        .addImm(FPOffset / 4)
+ .addReg(PPC::X1);
+
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
+ .addReg(PPC::X0)
+ .addImm(LROffset / 4)
+ .addReg(PPC::X1);
+ } else {
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0);
+
+ if (HasFP)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
+ .addReg(PPC::R31)
+ .addImm(FPOffset)
+ .addReg(PPC::R1);
+
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
+ .addReg(PPC::R0)
+ .addImm(LROffset)
+ .addReg(PPC::R1);
+ }
+
+ // Skip if a leaf routine.
+ if (!FrameSize) return;
+
+ // Get stack alignments.
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+ if (needsFrameMoves) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ FrameLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::DBG_LABEL)).addImm(FrameLabelId);
+ }
+
+ // Adjust stack pointer: r1 += NegFrameSize.
+ // If there is a preferred stack alignment, align R1 now
+ if (!IsPPC64) {
+ // PPC32.
+ if (ALIGN_STACK && MaxAlign > TargetAlign) {
+      assert(isPowerOf2_32(MaxAlign) && isInt16(MaxAlign) &&
+             "Invalid alignment!");
+ assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!");
+
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0)
+ .addReg(PPC::R1)
+ .addImm(0)
+ .addImm(32 - Log2_32(MaxAlign))
+ .addImm(31);
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC), PPC::R0)
+ .addReg(PPC::R0, RegState::Kill)
+ .addImm(NegFrameSize);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX))
+ .addReg(PPC::R1)
+ .addReg(PPC::R1)
+ .addReg(PPC::R0);
+ } else if (isInt16(NegFrameSize)) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1)
+ .addReg(PPC::R1)
+ .addImm(NegFrameSize)
+ .addReg(PPC::R1);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
+ .addImm(NegFrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
+ .addReg(PPC::R0, RegState::Kill)
+ .addImm(NegFrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX))
+ .addReg(PPC::R1)
+ .addReg(PPC::R1)
+ .addReg(PPC::R0);
+ }
+ } else { // PPC64.
+ if (ALIGN_STACK && MaxAlign > TargetAlign) {
+      assert(isPowerOf2_32(MaxAlign) && isInt16(MaxAlign) &&
+             "Invalid alignment!");
+ assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!");
+
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0)
+ .addReg(PPC::X1)
+ .addImm(0)
+ .addImm(64 - Log2_32(MaxAlign));
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0)
+ .addReg(PPC::X0)
+ .addImm(NegFrameSize);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX))
+ .addReg(PPC::X1)
+ .addReg(PPC::X1)
+ .addReg(PPC::X0);
+ } else if (isInt16(NegFrameSize)) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1)
+ .addReg(PPC::X1)
+ .addImm(NegFrameSize / 4)
+ .addReg(PPC::X1);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
+ .addImm(NegFrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
+ .addReg(PPC::X0, RegState::Kill)
+ .addImm(NegFrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX))
+ .addReg(PPC::X1)
+ .addReg(PPC::X1)
+ .addReg(PPC::X0);
+ }
+ }
+
+ if (needsFrameMoves) {
+ std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+
+ if (NegFrameSize) {
+ // Show update of SP.
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, NegFrameSize);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ } else {
+ MachineLocation SP(IsPPC64 ? PPC::X31 : PPC::R31);
+ Moves.push_back(MachineMove(FrameLabelId, SP, SP));
+ }
+
+ if (HasFP) {
+ MachineLocation FPDst(MachineLocation::VirtualFP, FPOffset);
+ MachineLocation FPSrc(IsPPC64 ? PPC::X31 : PPC::R31);
+ Moves.push_back(MachineMove(FrameLabelId, FPDst, FPSrc));
+ }
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+ unsigned Reg = CSI[I].getReg();
+ if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
+ MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+ MachineLocation CSSrc(Reg);
+ Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+ }
+
+ MachineLocation LRDst(MachineLocation::VirtualFP, LROffset);
+ MachineLocation LRSrc(IsPPC64 ? PPC::LR8 : PPC::LR);
+ Moves.push_back(MachineMove(FrameLabelId, LRDst, LRSrc));
+
+ // Mark effective beginning of when frame pointer is ready.
+ unsigned ReadyLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::DBG_LABEL)).addImm(ReadyLabelId);
+
+ MachineLocation FPDst(HasFP ? (IsPPC64 ? PPC::X31 : PPC::R31) :
+ (IsPPC64 ? PPC::X1 : PPC::R1));
+ MachineLocation FPSrc(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+ }
+
+ // If there is a frame pointer, copy R1 into R31
+ if (HasFP) {
+ if (!IsPPC64) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::OR), PPC::R31)
+ .addReg(PPC::R1)
+ .addReg(PPC::R1);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::OR8), PPC::X31)
+ .addReg(PPC::X1)
+ .addReg(PPC::X1);
+ }
+ }
+}
+
+void PPCRegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ unsigned RetOpcode = MBBI->getOpcode();
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+
+ assert( (RetOpcode == PPC::BLR ||
+ RetOpcode == PPC::TCRETURNri ||
+ RetOpcode == PPC::TCRETURNdi ||
+ RetOpcode == PPC::TCRETURNai ||
+ RetOpcode == PPC::TCRETURNri8 ||
+ RetOpcode == PPC::TCRETURNdi8 ||
+ RetOpcode == PPC::TCRETURNai8) &&
+ "Can only insert epilog into returning blocks");
+
+ // Get alignment info so we know how to restore r1
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+ // Get the number of bytes allocated from the FrameInfo.
+ int FrameSize = MFI->getStackSize();
+
+ // Get processor type.
+ bool IsPPC64 = Subtarget.isPPC64();
+ // Get operating system
+ bool IsMachoABI = Subtarget.isMachoABI();
+ // Check if the link register (LR) has been saved.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ bool MustSaveLR = FI->mustSaveLR();
+ // Do we have a frame pointer for this function?
+ bool HasFP = hasFP(MF) && FrameSize;
+
+ int LROffset = PPCFrameInfo::getReturnSaveOffset(IsPPC64, IsMachoABI);
+ int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, IsMachoABI);
+
+ bool UsesTCRet = RetOpcode == PPC::TCRETURNri ||
+ RetOpcode == PPC::TCRETURNdi ||
+ RetOpcode == PPC::TCRETURNai ||
+ RetOpcode == PPC::TCRETURNri8 ||
+ RetOpcode == PPC::TCRETURNdi8 ||
+ RetOpcode == PPC::TCRETURNai8;
+
+ if (UsesTCRet) {
+ int MaxTCRetDelta = FI->getTailCallSPDelta();
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int Delta = StackAdj - MaxTCRetDelta;
+    assert((Delta >= 0) && "Delta must be non-negative");
+    if (MaxTCRetDelta > 0)
+      FrameSize += (StackAdj + Delta);
+ else
+ FrameSize += StackAdj;
+ }
+
+ if (FrameSize) {
+ // The loaded (or persistent) stack pointer value is offset by the 'stwu'
+ // on entry to the function. Add this offset back now.
+ if (!IsPPC64) {
+ // If this function contained a fastcc call and PerformTailCallOpt is
+ // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+ // call which invalidates the stack pointer value in SP(0). So we use the
+ // value of R31 in this case.
+ if (FI->hasFastCall() && isInt16(FrameSize)) {
+        assert(hasFP(MF) && "Expecting a valid frame pointer.");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
+ .addReg(PPC::R31).addImm(FrameSize);
+      } else if (FI->hasFastCall()) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
+ .addImm(FrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
+ .addReg(PPC::R0, RegState::Kill)
+ .addImm(FrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD4))
+ .addReg(PPC::R1)
+ .addReg(PPC::R31)
+ .addReg(PPC::R0);
+ } else if (isInt16(FrameSize) &&
+ (!ALIGN_STACK || TargetAlign >= MaxAlign) &&
+ !MFI->hasVarSizedObjects()) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
+ .addReg(PPC::R1).addImm(FrameSize);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R1)
+ .addImm(0).addReg(PPC::R1);
+ }
+ } else {
+ if (FI->hasFastCall() && isInt16(FrameSize)) {
+ assert(hasFP(MF) && "Expecting a valid frame pointer.");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
+ .addReg(PPC::X31).addImm(FrameSize);
+ } else if (FI->hasFastCall()) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
+ .addImm(FrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
+ .addReg(PPC::X0, RegState::Kill)
+ .addImm(FrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD8))
+ .addReg(PPC::X1)
+ .addReg(PPC::X31)
+ .addReg(PPC::X0);
+ } else if (isInt16(FrameSize) && TargetAlign >= MaxAlign &&
+ !MFI->hasVarSizedObjects()) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
+ .addReg(PPC::X1).addImm(FrameSize);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X1)
+ .addImm(0).addReg(PPC::X1);
+ }
+ }
+ }
+
+ if (IsPPC64) {
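+ // The 64-bit LD immediate is a DS-form displacement stored pre-scaled
+ // (in units of 4 bytes), hence the /4 on the byte offsets below.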
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X0)
+ .addImm(LROffset/4).addReg(PPC::X1);
+
+ if (HasFP)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X31)
+ .addImm(FPOffset/4).addReg(PPC::X1);
+
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR8)).addReg(PPC::X0);
+ } else {
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R0)
+ .addImm(LROffset).addReg(PPC::R1);
+
+ if (HasFP)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R31)
+ .addImm(FPOffset).addReg(PPC::R1);
+
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR)).addReg(PPC::R0);
+ }
+
+ // Callee-pop calling convention: pop the parameter/linkage area. Used for
+ // tail call optimization.
+ if (PerformTailCallOpt && RetOpcode == PPC::BLR &&
+ MF.getFunction()->getCallingConv() == CallingConv::Fast) {
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ unsigned CallerAllocatedAmt = FI->getMinReservedArea();
+ unsigned StackReg = IsPPC64 ? PPC::X1 : PPC::R1;
+ unsigned FPReg = IsPPC64 ? PPC::X31 : PPC::R31;
+ unsigned TmpReg = IsPPC64 ? PPC::X0 : PPC::R0;
+ unsigned ADDIInstr = IsPPC64 ? PPC::ADDI8 : PPC::ADDI;
+ unsigned ADDInstr = IsPPC64 ? PPC::ADD8 : PPC::ADD4;
+ unsigned LISInstr = IsPPC64 ? PPC::LIS8 : PPC::LIS;
+ unsigned ORIInstr = IsPPC64 ? PPC::ORI8 : PPC::ORI;
+
+ if (CallerAllocatedAmt && isInt16(CallerAllocatedAmt)) {
+ BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg)
+ .addReg(StackReg).addImm(CallerAllocatedAmt);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+ .addImm(CallerAllocatedAmt >> 16);
+ BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+ .addReg(TmpReg, RegState::Kill)
+ .addImm(CallerAllocatedAmt & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, TII.get(ADDInstr))
+ .addReg(StackReg)
+ .addReg(FPReg)
+ .addReg(TmpReg);
+ }
+ } else if (RetOpcode == PPC::TCRETURNdi) {
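+ // Rewrite the TCRETURN pseudo into the corresponding tail-call branch.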
+ MBBI = prior(MBB.end());
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ } else if (RetOpcode == PPC::TCRETURNri) {
+ MBBI = prior(MBB.end());
+ assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
+ } else if (RetOpcode == PPC::TCRETURNai) {
+ MBBI = prior(MBB.end());
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
+ } else if (RetOpcode == PPC::TCRETURNdi8) {
+ MBBI = prior(MBB.end());
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ } else if (RetOpcode == PPC::TCRETURNri8) {
+ MBBI = prior(MBB.end());
+ assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
+ } else if (RetOpcode == PPC::TCRETURNai8) {
+ MBBI = prior(MBB.end());
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+ }
+}
+
+unsigned PPCRegisterInfo::getRARegister() const {
+ return !Subtarget.isPPC64() ? PPC::LR : PPC::LR8;
+}
+
+unsigned PPCRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ if (!Subtarget.isPPC64())
+ return hasFP(MF) ? PPC::R31 : PPC::R1;
+ else
+ return hasFP(MF) ? PPC::X31 : PPC::X1;
+}
+
+void PPCRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+ const {
+ // Initial state of the frame pointer is R1.
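+ // A label ID of 0 marks these moves as describing the state at function
+ // entry.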
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(PPC::R1, 0);
+ Moves.push_back(MachineMove(0, Dst, Src));
+}
+
+unsigned PPCRegisterInfo::getEHExceptionRegister() const {
+ return !Subtarget.isPPC64() ? PPC::R3 : PPC::X3;
+}
+
+unsigned PPCRegisterInfo::getEHHandlerRegister() const {
+ return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
+}
+
+int PPCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ // FIXME: The dwarf register numbers most probably differ between Linux
+ // and Darwin.
+ return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+#include "PPCGenRegisterInfo.inc"
+
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
new file mode 100644
index 0000000..9506b65
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -0,0 +1,95 @@
+//===- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC32_REGISTERINFO_H
+#define POWERPC32_REGISTERINFO_H
+
+#include "PPC.h"
+#include "PPCGenRegisterInfo.h.inc"
+#include <map>
+
+namespace llvm {
+class PPCSubtarget;
+class TargetInstrInfo;
+class Type;
+
+class PPCRegisterInfo : public PPCGenRegisterInfo {
+ std::map<unsigned, unsigned> ImmToIdxMap;
+ const PPCSubtarget &Subtarget;
+ const TargetInstrInfo &TII;
+public:
+ PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
+
+ /// getRegisterNumbering - Given the enum value for some register, e.g.
+ /// PPC::F14, return the number that it corresponds to (e.g. 14).
+ static unsigned getRegisterNumbering(unsigned RegEnum);
+
+ /// getPointerRegClass - Return the register class to use to hold pointers.
+ /// This is used for addressing modes.
+ virtual const TargetRegisterClass *getPointerRegClass() const;
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ /// targetHandlesStackFrameRounding - Returns true if the target is
+ /// responsible for rounding up the stack frame (probably at emitPrologue
+ /// time).
+ bool targetHandlesStackFrameRounding() const { return true; }
+
+ /// requiresRegisterScavenging - We require a register scavenger.
+ /// FIXME (64-bit): Should be inlined.
+ bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ void lowerDynamicAlloc(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const;
+ void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex,
+ int SPAdj, RegScavenger *RS) const;
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ /// determineFrameLayout - Determine the size of the frame and maximum call
+ /// frame size.
+ void determineFrameLayout(MachineFunction &MF) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+ void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
new file mode 100644
index 0000000..9e15a55
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -0,0 +1,360 @@
+//===- PPCRegisterInfo.td - The PowerPC Register File ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+class PPCReg<string n> : Register<n> {
+ let Namespace = "PPC";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 32-bit general-purpose registers
+class GPR<bits<5> num, string n> : PPCReg<n> {
+ field bits<5> Num = num;
+}
+
+// GP8 - One of the 32 64-bit general-purpose registers
+class GP8<GPR SubReg, string n> : PPCReg<n> {
+ field bits<5> Num = SubReg.Num;
+ let SubRegs = [SubReg];
+}
+
+// SPR - One of the 32-bit special-purpose registers
+class SPR<bits<10> num, string n> : PPCReg<n> {
+ field bits<10> Num = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : PPCReg<n> {
+ field bits<5> Num = num;
+}
+
+// VR - One of the 32 128-bit vector registers
+class VR<bits<5> num, string n> : PPCReg<n> {
+ field bits<5> Num = num;
+}
+
+// CR - One of the 8 4-bit condition registers
+class CR<bits<3> num, string n> : PPCReg<n> {
+ field bits<3> Num = num;
+}
+
+// CRBIT - One of the 32 1-bit condition register fields
+class CRBIT<bits<5> num, string n> : PPCReg<n> {
+ field bits<5> Num = num;
+}
+
+
+// General-purpose registers
+def R0 : GPR< 0, "r0">, DwarfRegNum<[0]>;
+def R1 : GPR< 1, "r1">, DwarfRegNum<[1]>;
+def R2 : GPR< 2, "r2">, DwarfRegNum<[2]>;
+def R3 : GPR< 3, "r3">, DwarfRegNum<[3]>;
+def R4 : GPR< 4, "r4">, DwarfRegNum<[4]>;
+def R5 : GPR< 5, "r5">, DwarfRegNum<[5]>;
+def R6 : GPR< 6, "r6">, DwarfRegNum<[6]>;
+def R7 : GPR< 7, "r7">, DwarfRegNum<[7]>;
+def R8 : GPR< 8, "r8">, DwarfRegNum<[8]>;
+def R9 : GPR< 9, "r9">, DwarfRegNum<[9]>;
+def R10 : GPR<10, "r10">, DwarfRegNum<[10]>;
+def R11 : GPR<11, "r11">, DwarfRegNum<[11]>;
+def R12 : GPR<12, "r12">, DwarfRegNum<[12]>;
+def R13 : GPR<13, "r13">, DwarfRegNum<[13]>;
+def R14 : GPR<14, "r14">, DwarfRegNum<[14]>;
+def R15 : GPR<15, "r15">, DwarfRegNum<[15]>;
+def R16 : GPR<16, "r16">, DwarfRegNum<[16]>;
+def R17 : GPR<17, "r17">, DwarfRegNum<[17]>;
+def R18 : GPR<18, "r18">, DwarfRegNum<[18]>;
+def R19 : GPR<19, "r19">, DwarfRegNum<[19]>;
+def R20 : GPR<20, "r20">, DwarfRegNum<[20]>;
+def R21 : GPR<21, "r21">, DwarfRegNum<[21]>;
+def R22 : GPR<22, "r22">, DwarfRegNum<[22]>;
+def R23 : GPR<23, "r23">, DwarfRegNum<[23]>;
+def R24 : GPR<24, "r24">, DwarfRegNum<[24]>;
+def R25 : GPR<25, "r25">, DwarfRegNum<[25]>;
+def R26 : GPR<26, "r26">, DwarfRegNum<[26]>;
+def R27 : GPR<27, "r27">, DwarfRegNum<[27]>;
+def R28 : GPR<28, "r28">, DwarfRegNum<[28]>;
+def R29 : GPR<29, "r29">, DwarfRegNum<[29]>;
+def R30 : GPR<30, "r30">, DwarfRegNum<[30]>;
+def R31 : GPR<31, "r31">, DwarfRegNum<[31]>;
+
+// 64-bit General-purpose registers
+def X0 : GP8< R0, "r0">, DwarfRegNum<[0]>;
+def X1 : GP8< R1, "r1">, DwarfRegNum<[1]>;
+def X2 : GP8< R2, "r2">, DwarfRegNum<[2]>;
+def X3 : GP8< R3, "r3">, DwarfRegNum<[3]>;
+def X4 : GP8< R4, "r4">, DwarfRegNum<[4]>;
+def X5 : GP8< R5, "r5">, DwarfRegNum<[5]>;
+def X6 : GP8< R6, "r6">, DwarfRegNum<[6]>;
+def X7 : GP8< R7, "r7">, DwarfRegNum<[7]>;
+def X8 : GP8< R8, "r8">, DwarfRegNum<[8]>;
+def X9 : GP8< R9, "r9">, DwarfRegNum<[9]>;
+def X10 : GP8<R10, "r10">, DwarfRegNum<[10]>;
+def X11 : GP8<R11, "r11">, DwarfRegNum<[11]>;
+def X12 : GP8<R12, "r12">, DwarfRegNum<[12]>;
+def X13 : GP8<R13, "r13">, DwarfRegNum<[13]>;
+def X14 : GP8<R14, "r14">, DwarfRegNum<[14]>;
+def X15 : GP8<R15, "r15">, DwarfRegNum<[15]>;
+def X16 : GP8<R16, "r16">, DwarfRegNum<[16]>;
+def X17 : GP8<R17, "r17">, DwarfRegNum<[17]>;
+def X18 : GP8<R18, "r18">, DwarfRegNum<[18]>;
+def X19 : GP8<R19, "r19">, DwarfRegNum<[19]>;
+def X20 : GP8<R20, "r20">, DwarfRegNum<[20]>;
+def X21 : GP8<R21, "r21">, DwarfRegNum<[21]>;
+def X22 : GP8<R22, "r22">, DwarfRegNum<[22]>;
+def X23 : GP8<R23, "r23">, DwarfRegNum<[23]>;
+def X24 : GP8<R24, "r24">, DwarfRegNum<[24]>;
+def X25 : GP8<R25, "r25">, DwarfRegNum<[25]>;
+def X26 : GP8<R26, "r26">, DwarfRegNum<[26]>;
+def X27 : GP8<R27, "r27">, DwarfRegNum<[27]>;
+def X28 : GP8<R28, "r28">, DwarfRegNum<[28]>;
+def X29 : GP8<R29, "r29">, DwarfRegNum<[29]>;
+def X30 : GP8<R30, "r30">, DwarfRegNum<[30]>;
+def X31 : GP8<R31, "r31">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0 : FPR< 0, "f0">, DwarfRegNum<[32]>;
+def F1 : FPR< 1, "f1">, DwarfRegNum<[33]>;
+def F2 : FPR< 2, "f2">, DwarfRegNum<[34]>;
+def F3 : FPR< 3, "f3">, DwarfRegNum<[35]>;
+def F4 : FPR< 4, "f4">, DwarfRegNum<[36]>;
+def F5 : FPR< 5, "f5">, DwarfRegNum<[37]>;
+def F6 : FPR< 6, "f6">, DwarfRegNum<[38]>;
+def F7 : FPR< 7, "f7">, DwarfRegNum<[39]>;
+def F8 : FPR< 8, "f8">, DwarfRegNum<[40]>;
+def F9 : FPR< 9, "f9">, DwarfRegNum<[41]>;
+def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
+def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
+def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
+def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
+def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
+def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
+def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
+def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
+def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
+def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
+def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
+def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
+def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
+def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
+def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
+def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
+def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
+def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
+def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
+def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
+def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
+def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;
+
+// Vector registers
+def V0 : VR< 0, "v0">, DwarfRegNum<[77]>;
+def V1 : VR< 1, "v1">, DwarfRegNum<[78]>;
+def V2 : VR< 2, "v2">, DwarfRegNum<[79]>;
+def V3 : VR< 3, "v3">, DwarfRegNum<[80]>;
+def V4 : VR< 4, "v4">, DwarfRegNum<[81]>;
+def V5 : VR< 5, "v5">, DwarfRegNum<[82]>;
+def V6 : VR< 6, "v6">, DwarfRegNum<[83]>;
+def V7 : VR< 7, "v7">, DwarfRegNum<[84]>;
+def V8 : VR< 8, "v8">, DwarfRegNum<[85]>;
+def V9 : VR< 9, "v9">, DwarfRegNum<[86]>;
+def V10 : VR<10, "v10">, DwarfRegNum<[87]>;
+def V11 : VR<11, "v11">, DwarfRegNum<[88]>;
+def V12 : VR<12, "v12">, DwarfRegNum<[89]>;
+def V13 : VR<13, "v13">, DwarfRegNum<[90]>;
+def V14 : VR<14, "v14">, DwarfRegNum<[91]>;
+def V15 : VR<15, "v15">, DwarfRegNum<[92]>;
+def V16 : VR<16, "v16">, DwarfRegNum<[93]>;
+def V17 : VR<17, "v17">, DwarfRegNum<[94]>;
+def V18 : VR<18, "v18">, DwarfRegNum<[95]>;
+def V19 : VR<19, "v19">, DwarfRegNum<[96]>;
+def V20 : VR<20, "v20">, DwarfRegNum<[97]>;
+def V21 : VR<21, "v21">, DwarfRegNum<[98]>;
+def V22 : VR<22, "v22">, DwarfRegNum<[99]>;
+def V23 : VR<23, "v23">, DwarfRegNum<[100]>;
+def V24 : VR<24, "v24">, DwarfRegNum<[101]>;
+def V25 : VR<25, "v25">, DwarfRegNum<[102]>;
+def V26 : VR<26, "v26">, DwarfRegNum<[103]>;
+def V27 : VR<27, "v27">, DwarfRegNum<[104]>;
+def V28 : VR<28, "v28">, DwarfRegNum<[105]>;
+def V29 : VR<29, "v29">, DwarfRegNum<[106]>;
+def V30 : VR<30, "v30">, DwarfRegNum<[107]>;
+def V31 : VR<31, "v31">, DwarfRegNum<[108]>;
+
+// Condition registers
+def CR0 : CR<0, "cr0">, DwarfRegNum<[68]>;
+def CR1 : CR<1, "cr1">, DwarfRegNum<[69]>;
+def CR2 : CR<2, "cr2">, DwarfRegNum<[70]>;
+def CR3 : CR<3, "cr3">, DwarfRegNum<[71]>;
+def CR4 : CR<4, "cr4">, DwarfRegNum<[72]>;
+def CR5 : CR<5, "cr5">, DwarfRegNum<[73]>;
+def CR6 : CR<6, "cr6">, DwarfRegNum<[74]>;
+def CR7 : CR<7, "cr7">, DwarfRegNum<[75]>;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">, DwarfRegNum<[0]>;
+def CR0GT : CRBIT< 1, "1">, DwarfRegNum<[0]>;
+def CR0EQ : CRBIT< 2, "2">, DwarfRegNum<[0]>;
+def CR0UN : CRBIT< 3, "3">, DwarfRegNum<[0]>;
+def CR1LT : CRBIT< 4, "4">, DwarfRegNum<[0]>;
+def CR1GT : CRBIT< 5, "5">, DwarfRegNum<[0]>;
+def CR1EQ : CRBIT< 6, "6">, DwarfRegNum<[0]>;
+def CR1UN : CRBIT< 7, "7">, DwarfRegNum<[0]>;
+def CR2LT : CRBIT< 8, "8">, DwarfRegNum<[0]>;
+def CR2GT : CRBIT< 9, "9">, DwarfRegNum<[0]>;
+def CR2EQ : CRBIT<10, "10">, DwarfRegNum<[0]>;
+def CR2UN : CRBIT<11, "11">, DwarfRegNum<[0]>;
+def CR3LT : CRBIT<12, "12">, DwarfRegNum<[0]>;
+def CR3GT : CRBIT<13, "13">, DwarfRegNum<[0]>;
+def CR3EQ : CRBIT<14, "14">, DwarfRegNum<[0]>;
+def CR3UN : CRBIT<15, "15">, DwarfRegNum<[0]>;
+def CR4LT : CRBIT<16, "16">, DwarfRegNum<[0]>;
+def CR4GT : CRBIT<17, "17">, DwarfRegNum<[0]>;
+def CR4EQ : CRBIT<18, "18">, DwarfRegNum<[0]>;
+def CR4UN : CRBIT<19, "19">, DwarfRegNum<[0]>;
+def CR5LT : CRBIT<20, "20">, DwarfRegNum<[0]>;
+def CR5GT : CRBIT<21, "21">, DwarfRegNum<[0]>;
+def CR5EQ : CRBIT<22, "22">, DwarfRegNum<[0]>;
+def CR5UN : CRBIT<23, "23">, DwarfRegNum<[0]>;
+def CR6LT : CRBIT<24, "24">, DwarfRegNum<[0]>;
+def CR6GT : CRBIT<25, "25">, DwarfRegNum<[0]>;
+def CR6EQ : CRBIT<26, "26">, DwarfRegNum<[0]>;
+def CR6UN : CRBIT<27, "27">, DwarfRegNum<[0]>;
+def CR7LT : CRBIT<28, "28">, DwarfRegNum<[0]>;
+def CR7GT : CRBIT<29, "29">, DwarfRegNum<[0]>;
+def CR7EQ : CRBIT<30, "30">, DwarfRegNum<[0]>;
+def CR7UN : CRBIT<31, "31">, DwarfRegNum<[0]>;
+
+def : SubRegSet<1, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+ [CR0LT, CR1LT, CR2LT, CR3LT, CR4LT, CR5LT, CR6LT, CR7LT]>;
+def : SubRegSet<2, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+ [CR0GT, CR1GT, CR2GT, CR3GT, CR4GT, CR5GT, CR6GT, CR7GT]>;
+def : SubRegSet<3, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+ [CR0EQ, CR1EQ, CR2EQ, CR3EQ, CR4EQ, CR5EQ, CR6EQ, CR7EQ]>;
+def : SubRegSet<4, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+ [CR0UN, CR1UN, CR2UN, CR3UN, CR4UN, CR5UN, CR6UN, CR7UN]>;
+
+// Link register
+def LR : SPR<8, "lr">, DwarfRegNum<[65]>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65]>;
+
+// Count register
+def CTR : SPR<9, "ctr">, DwarfRegNum<[66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66]>;
+
+// VRsave register
+def VRSAVE : SPR<256, "VRsave">, DwarfRegNum<[107]>;
+
+// FP rounding mode: bits 30 and 31 of the FP status and control register
+// This is not allocated as a normal register; it appears only in
+// Uses and Defs. The ABI says it needs to be preserved by a function,
+// but this is not achieved by saving and restoring it as with most
+// registers; it has to be done in code. To make this work, all the
+// return and call instructions are described as Uses of RM, so
+// instructions that do nothing but change RM will not get deleted.
+// Also, in the architecture it is not really an SPR; 512 is arbitrary.
+def RM: SPR<512, "**ROUNDING MODE**">, DwarfRegNum<[0]>;
+
+/// Register classes
+// Allocate volatiles first,
+// then non-volatiles in reverse order, since stmw/lmw save from rN to r31.
+def GPRC : RegisterClass<"PPC", [i32], 32,
+ [R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12,
+ R30, R29, R28, R27, R26, R25, R24, R23, R22, R21, R20, R19, R18, R17,
+ R16, R15, R14, R13, R31, R0, R1, LR]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GPRCClass::iterator
+ GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
+ // On non-Darwin targets (e.g. Linux), r2 is reserved for the OS.
+ if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin())
+ return begin()+1;
+
+ return begin();
+ }
+ GPRCClass::iterator
+ GPRCClass::allocation_order_end(const MachineFunction &MF) const {
+ // On PPC64, r13 is the thread pointer. Never allocate this register.
+ // Note that this is overconservative, as it also prevents allocation of
+ // R31 when the FP is not needed.
+ if (MF.getTarget().getSubtarget<PPCSubtarget>().isPPC64())
+ return end()-5; // don't allocate R13, R31, R0, R1, LR
+
+ if (needsFP(MF))
+ return end()-4; // don't allocate R31, R0, R1, LR
+ else
+ return end()-3; // don't allocate R0, R1, LR
+ }
+ }];
+}
+def G8RC : RegisterClass<"PPC", [i64], 64,
+ [X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12,
+ X30, X29, X28, X27, X26, X25, X24, X23, X22, X21, X20, X19, X18, X17,
+ X16, X15, X14, X31, X13, X0, X1, LR8]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ G8RCClass::iterator
+ G8RCClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ G8RCClass::iterator
+ G8RCClass::allocation_order_end(const MachineFunction &MF) const {
+ if (needsFP(MF))
+ return end()-5;
+ else
+ return end()-4;
+ }
+ }];
+}
+
+
+
+def F8RC : RegisterClass<"PPC", [f64], 64, [F0, F1, F2, F3, F4, F5, F6, F7,
+ F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21,
+ F22, F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+def F4RC : RegisterClass<"PPC", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7,
+ F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21,
+ F22, F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
+ [V2, V3, V4, V5, V0, V1,
+ V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21,
+ V22, V23, V24, V25, V26, V27, V28, V29, V30, V31]>;
+
+def CRRC : RegisterClass<"PPC", [i32], 32, [CR0, CR1, CR5, CR6, CR7, CR2,
+ CR3, CR4]>;
+
+def CRBITRC : RegisterClass<"PPC", [i32], 32,
+ [CR0LT, CR0GT, CR0EQ, CR0UN,
+ CR1LT, CR1GT, CR1EQ, CR1UN,
+ CR2LT, CR2GT, CR2EQ, CR2UN,
+ CR3LT, CR3GT, CR3EQ, CR3UN,
+ CR4LT, CR4GT, CR4EQ, CR4UN,
+ CR5LT, CR5GT, CR5EQ, CR5UN,
+ CR6LT, CR6GT, CR6EQ, CR6UN,
+ CR7LT, CR7GT, CR7EQ, CR7UN
+ ]>
+{
+ let CopyCost = -1;
+}
+
+
+def CTRRC : RegisterClass<"PPC", [i32], 32, [CTR]>;
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, [CTR8]>;
diff --git a/lib/Target/PowerPC/PPCRelocations.h b/lib/Target/PowerPC/PPCRelocations.h
new file mode 100644
index 0000000..a33e7e0
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRelocations.h
@@ -0,0 +1,56 @@
+//===- PPCRelocations.h - PPC32 Code Relocations ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC32RELOCATIONS_H
+#define PPC32RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+// Hack to rid us of a PPC pre-processor symbol which is erroneously
+// defined in a PowerPC header file (bug in Linux/PPC)
+#ifdef PPC
+#undef PPC
+#endif
+
+namespace llvm {
+ namespace PPC {
+ enum RelocationType {
+ // reloc_vanilla - A standard relocation, where the address of the
+ // relocated object completely overwrites the address of the relocation.
+ reloc_vanilla,
+
+ // reloc_pcrel_bx - PC relative relocation, for the b or bl instructions.
+ reloc_pcrel_bx,
+
+ // reloc_pcrel_bcx - PC relative relocation, for BLT,BLE,BEQ,BGE,BGT,BNE,
+ // and other bcx instructions.
+ reloc_pcrel_bcx,
+
+ // reloc_absolute_high - Absolute relocation, for the loadhi instruction
+ // (which is really addis). Add the high 16-bits of the specified global
+ // address into the low 16-bits of the instruction.
+ reloc_absolute_high,
+
+ // reloc_absolute_low - Absolute relocation, for the la instruction (which
+ // is really an addi). Add the low 16-bits of the specified global
+ // address into the low 16-bits of the instruction.
+ reloc_absolute_low,
+
+ // reloc_absolute_low_ix - Absolute relocation for the 64-bit load/store
+ // instructions, which have two implicit zero bits.
+ reloc_absolute_low_ix
+ };
+ }
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
new file mode 100644
index 0000000..d589414
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -0,0 +1,508 @@
+//===- PPCSchedule.td - PowerPC Scheduling Definitions -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across PowerPC chip sets
+//
+def BPU : FuncUnit; // Branch unit
+def SLU : FuncUnit; // Store/load unit
+def SRU : FuncUnit; // Special register unit
+def IU1 : FuncUnit; // Integer unit 1 (simple)
+def IU2 : FuncUnit; // Integer unit 2 (complex)
+def IU3 : FuncUnit; // Integer unit 3 (7450 simple)
+def IU4 : FuncUnit; // Integer unit 4 (7450 simple)
+def FPU1 : FuncUnit; // Floating-point unit 1
+def FPU2 : FuncUnit; // Floating-point unit 2
+def VPU : FuncUnit; // Vector permutation unit
+def VIU1 : FuncUnit; // Vector integer unit 1 (simple)
+def VIU2 : FuncUnit; // Vector integer unit 2 (complex)
+def VFPU : FuncUnit; // Vector floating-point unit
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for PowerPC
+//
+def IntGeneral : InstrItinClass;
+def IntCompare : InstrItinClass;
+def IntDivD : InstrItinClass;
+def IntDivW : InstrItinClass;
+def IntMFFS : InstrItinClass;
+def IntMFVSCR : InstrItinClass;
+def IntMTFSB0 : InstrItinClass;
+def IntMTSRD : InstrItinClass;
+def IntMulHD : InstrItinClass;
+def IntMulHW : InstrItinClass;
+def IntMulHWU : InstrItinClass;
+def IntMulLI : InstrItinClass;
+def IntRFID : InstrItinClass;
+def IntRotateD : InstrItinClass;
+def IntRotate : InstrItinClass;
+def IntShift : InstrItinClass;
+def IntTrapD : InstrItinClass;
+def IntTrapW : InstrItinClass;
+def BrB : InstrItinClass;
+def BrCR : InstrItinClass;
+def BrMCR : InstrItinClass;
+def BrMCRX : InstrItinClass;
+def LdStDCBA : InstrItinClass;
+def LdStDCBF : InstrItinClass;
+def LdStDCBI : InstrItinClass;
+def LdStGeneral : InstrItinClass;
+def LdStDSS : InstrItinClass;
+def LdStICBI : InstrItinClass;
+def LdStUX : InstrItinClass;
+def LdStLD : InstrItinClass;
+def LdStLDARX : InstrItinClass;
+def LdStLFD : InstrItinClass;
+def LdStLFDU : InstrItinClass;
+def LdStLHA : InstrItinClass;
+def LdStLMW : InstrItinClass;
+def LdStLVecX : InstrItinClass;
+def LdStLWA : InstrItinClass;
+def LdStLWARX : InstrItinClass;
+def LdStSLBIA : InstrItinClass;
+def LdStSLBIE : InstrItinClass;
+def LdStSTD : InstrItinClass;
+def LdStSTDCX : InstrItinClass;
+def LdStSTVEBX : InstrItinClass;
+def LdStSTWCX : InstrItinClass;
+def LdStSync : InstrItinClass;
+def SprISYNC : InstrItinClass;
+def SprMFSR : InstrItinClass;
+def SprMTMSR : InstrItinClass;
+def SprMTSR : InstrItinClass;
+def SprTLBSYNC : InstrItinClass;
+def SprMFCR : InstrItinClass;
+def SprMFMSR : InstrItinClass;
+def SprMFSPR : InstrItinClass;
+def SprMFTB : InstrItinClass;
+def SprMTSPR : InstrItinClass;
+def SprMTSRIN : InstrItinClass;
+def SprRFI : InstrItinClass;
+def SprSC : InstrItinClass;
+def FPGeneral : InstrItinClass;
+def FPCompare : InstrItinClass;
+def FPDivD : InstrItinClass;
+def FPDivS : InstrItinClass;
+def FPFused : InstrItinClass;
+def FPRes : InstrItinClass;
+def FPSqrt : InstrItinClass;
+def VecGeneral : InstrItinClass;
+def VecFP : InstrItinClass;
+def VecFPCompare : InstrItinClass;
+def VecComplex : InstrItinClass;
+def VecPerm : InstrItinClass;
+def VecFPRound : InstrItinClass;
+def VecVSL : InstrItinClass;
+def VecVSR : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "PPCScheduleG3.td"
+include "PPCScheduleG4.td"
+include "PPCScheduleG4Plus.td"
+include "PPCScheduleG5.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction to itinerary class map - When adding new opcodes to the
+// supported set, refer to the following table to determine which itinerary
+// class the opcode belongs to (a usage sketch follows the table).
+//
+// opcode itinerary class
+// ====== ===============
+// add IntGeneral
+// addc IntGeneral
+// adde IntGeneral
+// addi IntGeneral
+// addic IntGeneral
+// addic. IntGeneral
+// addis IntGeneral
+// addme IntGeneral
+// addze IntGeneral
+// and IntGeneral
+// andc IntGeneral
+// andi. IntGeneral
+// andis. IntGeneral
+// b BrB
+// bc BrB
+// bcctr BrB
+// bclr BrB
+// cmp IntCompare
+// cmpi IntCompare
+// cmpl IntCompare
+// cmpli IntCompare
+// cntlzd IntRotateD
+// cntlzw IntGeneral
+// crand BrCR
+// crandc BrCR
+// creqv BrCR
+// crnand BrCR
+// crnor BrCR
+// cror BrCR
+// crorc BrCR
+// crxor BrCR
+// dcba LdStDCBA
+// dcbf LdStDCBF
+// dcbi LdStDCBI
+// dcbst LdStDCBF
+// dcbt LdStGeneral
+// dcbtst LdStGeneral
+// dcbz LdStDCBF
+// divd IntDivD
+// divdu IntDivD
+// divw IntDivW
+// divwu IntDivW
+// dss LdStDSS
+// dst LdStDSS
+// dstst LdStDSS
+// eciwx LdStGeneral
+// ecowx LdStGeneral
+// eieio LdStGeneral
+// eqv IntGeneral
+// extsb IntGeneral
+// extsh IntGeneral
+// extsw IntRotateD
+// fabs FPGeneral
+// fadd FPGeneral
+// fadds FPGeneral
+// fcfid FPGeneral
+// fcmpo FPCompare
+// fcmpu FPCompare
+// fctid FPGeneral
+// fctidz FPGeneral
+// fctiw FPGeneral
+// fctiwz FPGeneral
+// fdiv FPDivD
+// fdivs FPDivS
+// fmadd FPFused
+// fmadds FPGeneral
+// fmr FPGeneral
+// fmsub FPFused
+// fmsubs FPGeneral
+// fmul FPFused
+// fmuls FPGeneral
+// fnabs FPGeneral
+// fneg FPGeneral
+// fnmadd FPFused
+// fnmadds FPGeneral
+// fnmsub FPFused
+// fnmsubs FPGeneral
+// fres FPRes
+// frsp FPGeneral
+// frsqrte FPGeneral
+// fsel FPGeneral
+// fsqrt FPSqrt
+// fsqrts FPSqrt
+// fsub FPGeneral
+// fsubs FPGeneral
+// icbi LdStICBI
+// isync SprISYNC
+// lbz LdStGeneral
+// lbzu LdStGeneral
+// lbzux LdStUX
+// lbzx LdStGeneral
+// ld LdStLD
+// ldarx LdStLDARX
+// ldu LdStLD
+// ldux LdStLD
+// ldx LdStLD
+// lfd LdStLFD
+// lfdu LdStLFDU
+// lfdux LdStLFDU
+// lfdx LdStLFDU
+// lfs LdStLFDU
+// lfsu LdStLFDU
+// lfsux LdStLFDU
+// lfsx LdStLFDU
+// lha LdStLHA
+// lhau LdStLHA
+// lhaux LdStLHA
+// lhax LdStLHA
+// lhbrx LdStGeneral
+// lhz LdStGeneral
+// lhzu LdStGeneral
+// lhzux LdStUX
+// lhzx LdStGeneral
+// lmw LdStLMW
+// lswi LdStLMW
+// lswx LdStLMW
+// lvebx LdStLVecX
+// lvehx LdStLVecX
+// lvewx LdStLVecX
+// lvsl LdStLVecX
+// lvsr LdStLVecX
+// lvx LdStLVecX
+// lvxl LdStLVecX
+// lwa LdStLWA
+// lwarx LdStLWARX
+// lwaux LdStLHA
+// lwax LdStLHA
+// lwbrx LdStGeneral
+// lwz LdStGeneral
+// lwzu LdStGeneral
+// lwzux LdStUX
+// lwzx LdStGeneral
+// mcrf BrMCR
+// mcrfs FPGeneral
+// mcrxr BrMCRX
+// mfcr SprMFCR
+// mffs IntMFFS
+// mfmsr SprMFMSR
+// mfspr SprMFSPR
+// mfsr SprMFSR
+// mfsrin SprMFSR
+// mftb SprMFTB
+// mfvscr IntMFVSCR
+// mtcrf BrMCRX
+// mtfsb0 IntMTFSB0
+// mtfsb1 IntMTFSB0
+// mtfsf IntMTFSB0
+// mtfsfi IntMTFSB0
+// mtmsr SprMTMSR
+// mtmsrd LdStLD
+// mtspr SprMTSPR
+// mtsr SprMTSR
+// mtsrd IntMTSRD
+// mtsrdin IntMTSRD
+// mtsrin SprMTSRIN
+// mtvscr IntMFVSCR
+// mulhd IntMulHD
+// mulhdu IntMulHD
+// mulhw IntMulHW
+// mulhwu IntMulHWU
+// mulld IntMulHD
+// mulli IntMulLI
+// mullw IntMulHW
+// nand IntGeneral
+// neg IntGeneral
+// nor IntGeneral
+// or IntGeneral
+// orc IntGeneral
+// ori IntGeneral
+// oris IntGeneral
+// rfi SprRFI
+// rfid IntRFID
+// rldcl IntRotateD
+// rldcr IntRotateD
+// rldic IntRotateD
+// rldicl IntRotateD
+// rldicr IntRotateD
+// rldimi IntRotateD
+// rlwimi IntRotate
+// rlwinm IntGeneral
+// rlwnm IntGeneral
+// sc SprSC
+// slbia LdStSLBIA
+// slbie LdStSLBIE
+// sld IntRotateD
+// slw IntGeneral
+// srad IntRotateD
+// sradi IntRotateD
+// sraw IntShift
+// srawi IntShift
+// srd IntRotateD
+// srw IntGeneral
+// stb LdStGeneral
+// stbu LdStGeneral
+// stbux LdStGeneral
+// stbx LdStGeneral
+// std LdStSTD
+// stdcx. LdStSTDCX
+// stdu LdStSTD
+// stdux LdStSTD
+// stdx LdStSTD
+// stfd LdStUX
+// stfdu LdStUX
+// stfdux LdStUX
+// stfdx LdStUX
+// stfiwx LdStUX
+// stfs LdStUX
+// stfsu LdStUX
+// stfsux LdStUX
+// stfsx LdStUX
+// sth LdStGeneral
+// sthbrx LdStGeneral
+// sthu LdStGeneral
+// sthux LdStGeneral
+// sthx LdStGeneral
+// stmw LdStLMW
+// stswi LdStLMW
+// stswx LdStLMW
+// stvebx LdStSTVEBX
+// stvehx LdStSTVEBX
+// stvewx LdStSTVEBX
+// stvx LdStSTVEBX
+// stvxl LdStSTVEBX
+// stw LdStGeneral
+// stwbrx LdStGeneral
+// stwcx. LdStSTWCX
+// stwu LdStGeneral
+// stwux LdStGeneral
+// stwx LdStGeneral
+// subf IntGeneral
+// subfc IntGeneral
+// subfe IntGeneral
+// subfic IntGeneral
+// subfme IntGeneral
+// subfze IntGeneral
+// sync LdStSync
+// td IntTrapD
+// tdi IntTrapD
+// tlbia LdStSLBIA
+// tlbie LdStDCBF
+// tlbsync SprTLBSYNC
+// tw IntTrapW
+// twi IntTrapW
+// vaddcuw VecGeneral
+// vaddfp VecFP
+// vaddsbs VecGeneral
+// vaddshs VecGeneral
+// vaddsws VecGeneral
+// vaddubm VecGeneral
+// vaddubs VecGeneral
+// vadduhm VecGeneral
+// vadduhs VecGeneral
+// vadduwm VecGeneral
+// vadduws VecGeneral
+// vand VecGeneral
+// vandc VecGeneral
+// vavgsb VecGeneral
+// vavgsh VecGeneral
+// vavgsw VecGeneral
+// vavgub VecGeneral
+// vavguh VecGeneral
+// vavguw VecGeneral
+// vcfsx VecFP
+// vcfux VecFP
+// vcmpbfp VecFPCompare
+// vcmpeqfp VecFPCompare
+// vcmpequb VecGeneral
+// vcmpequh VecGeneral
+// vcmpequw VecGeneral
+// vcmpgefp VecFPCompare
+// vcmpgtfp VecFPCompare
+// vcmpgtsb VecGeneral
+// vcmpgtsh VecGeneral
+// vcmpgtsw VecGeneral
+// vcmpgtub VecGeneral
+// vcmpgtuh VecGeneral
+// vcmpgtuw VecGeneral
+// vctsxs VecFP
+// vctuxs VecFP
+// vexptefp VecFP
+// vlogefp VecFP
+// vmaddfp VecFP
+// vmaxfp VecFPCompare
+// vmaxsb VecGeneral
+// vmaxsh VecGeneral
+// vmaxsw VecGeneral
+// vmaxub VecGeneral
+// vmaxuh VecGeneral
+// vmaxuw VecGeneral
+// vmhaddshs VecComplex
+// vmhraddshs VecComplex
+// vminfp VecFPCompare
+// vminsb VecGeneral
+// vminsh VecGeneral
+// vminsw VecGeneral
+// vminub VecGeneral
+// vminuh VecGeneral
+// vminuw VecGeneral
+// vmladduhm VecComplex
+// vmrghb VecPerm
+// vmrghh VecPerm
+// vmrghw VecPerm
+// vmrglb VecPerm
+// vmrglh VecPerm
+// vmrglw VecPerm
+// vmsubfp VecFP
+// vmsummbm VecComplex
+// vmsumshm VecComplex
+// vmsumshs VecComplex
+// vmsumubm VecComplex
+// vmsumuhm VecComplex
+// vmsumuhs VecComplex
+// vmulesb VecComplex
+// vmulesh VecComplex
+// vmuleub VecComplex
+// vmuleuh VecComplex
+// vmulosb VecComplex
+// vmulosh VecComplex
+// vmuloub VecComplex
+// vmulouh VecComplex
+// vnor VecGeneral
+// vor VecGeneral
+// vperm VecPerm
+// vpkpx VecPerm
+// vpkshss VecPerm
+// vpkshus VecPerm
+// vpkswss VecPerm
+// vpkswus VecPerm
+// vpkuhum VecPerm
+// vpkuhus VecPerm
+// vpkuwum VecPerm
+// vpkuwus VecPerm
+// vrefp VecFPRound
+// vrfim VecFPRound
+// vrfin VecFPRound
+// vrfip VecFPRound
+// vrfiz VecFPRound
+// vrlb VecGeneral
+// vrlh VecGeneral
+// vrlw VecGeneral
+// vrsqrtefp VecFP
+// vsel VecGeneral
+// vsl VecVSL
+// vslb VecGeneral
+// vsldoi VecPerm
+// vslh VecGeneral
+// vslo VecPerm
+// vslw VecGeneral
+// vspltb VecPerm
+// vsplth VecPerm
+// vspltisb VecPerm
+// vspltish VecPerm
+// vspltisw VecPerm
+// vspltw VecPerm
+// vsr VecVSR
+// vsrab VecGeneral
+// vsrah VecGeneral
+// vsraw VecGeneral
+// vsrb VecGeneral
+// vsrh VecGeneral
+// vsro VecPerm
+// vsrw VecGeneral
+// vsubcuw VecGeneral
+// vsubfp VecFP
+// vsubsbs VecGeneral
+// vsubshs VecGeneral
+// vsubsws VecGeneral
+// vsububm VecGeneral
+// vsububs VecGeneral
+// vsubuhm VecGeneral
+// vsubuhs VecGeneral
+// vsubuwm VecGeneral
+// vsubuws VecGeneral
+// vsum2sws VecComplex
+// vsum4sbs VecComplex
+// vsum4shs VecComplex
+// vsum4ubs VecComplex
+// vsumsws VecComplex
+// vupkhpx VecPerm
+// vupkhsb VecPerm
+// vupkhsh VecPerm
+// vupklpx VecPerm
+// vupklsb VecPerm
+// vupklsh VecPerm
+// vxor VecGeneral
+// xor IntGeneral
+// xori IntGeneral
+// xoris IntGeneral
+//
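+// As a rough sketch of how this table is consumed (the exact instruction
+// formats live in PPCInstrFormats.td and may differ from this), an
+// instruction definition names its itinerary class as a template argument:
+//
+//   def ADD4 : XOForm_1<31, 266, 0, (outs GPRC:$rT),
+//                       (ins GPRC:$rA, GPRC:$rB),
+//                       "add $rT, $rA, $rB", IntGeneral,
+//                       [(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>;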
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
new file mode 100644
index 0000000..f72194d
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -0,0 +1,63 @@
+//===- PPCScheduleG3.td - PPC G3 Scheduling Definitions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G3 (750) processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def G3Itineraries : ProcessorItineraries<[
+ InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>,
+ InstrItinData<IntMFFS , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<IntMTFSB0 , [InstrStage<3, [FPU1]>]>,
+ InstrItinData<IntMulHW , [InstrStage<5, [IU1]>]>,
+ InstrItinData<IntMulHWU , [InstrStage<6, [IU1]>]>,
+ InstrItinData<IntMulLI , [InstrStage<3, [IU1]>]>,
+ InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntShift , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2]>]>,
+ InstrItinData<BrB , [InstrStage<1, [BPU]>]>,
+ InstrItinData<BrCR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<BrMCR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<BrMCRX , [InstrStage<1, [SRU]>]>,
+ InstrItinData<LdStDCBA , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStICBI , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
+ InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTWCX , [InstrStage<8, [SLU]>]>,
+ InstrItinData<LdStSync , [InstrStage<3, [SLU]>]>,
+ InstrItinData<SprISYNC , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprMFSR , [InstrStage<3, [SRU]>]>,
+ InstrItinData<SprMTMSR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<SprMTSR , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<3, [SRU]>]>,
+ InstrItinData<SprMFCR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<SprMFMSR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<SprMFSPR , [InstrStage<3, [SRU]>]>,
+ InstrItinData<SprMFTB , [InstrStage<3, [SRU]>]>,
+ InstrItinData<SprMTSPR , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprMTSRIN , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
+ InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
+ InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
+ InstrItinData<FPFused , [InstrStage<2, [FPU1]>]>,
+ InstrItinData<FPRes , [InstrStage<10, [FPU1]>]>
+]>;
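+
+// A minimal sketch of how these itineraries get attached to a CPU model
+// (the actual Processor definitions live in PPC.td; this one is
+// illustrative):
+//
+//   def : Processor<"g3", G3Itineraries, []>;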
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
new file mode 100644
index 0000000..92ed20f
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -0,0 +1,73 @@
+//===- PPCScheduleG4.td - PPC G4 Scheduling Definitions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4 (7400) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4Itineraries : ProcessorItineraries<[
+ InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>,
+ InstrItinData<IntMFFS , [InstrStage<3, [FPU1]>]>,
+ InstrItinData<IntMFVSCR , [InstrStage<1, [VIU1]>]>,
+ InstrItinData<IntMTFSB0 , [InstrStage<3, [FPU1]>]>,
+ InstrItinData<IntMulHW , [InstrStage<5, [IU1]>]>,
+ InstrItinData<IntMulHWU , [InstrStage<6, [IU1]>]>,
+ InstrItinData<IntMulLI , [InstrStage<3, [IU1]>]>,
+ InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntShift , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2]>]>,
+ InstrItinData<BrB , [InstrStage<1, [BPU]>]>,
+ InstrItinData<BrCR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<BrMCR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<BrMCRX , [InstrStage<1, [SRU]>]>,
+ InstrItinData<LdStDCBF , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStDCBI , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStDSS , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStICBI , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
+ InstrItinData<LdStLVecX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTVEBX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTWCX , [InstrStage<5, [SLU]>]>,
+ InstrItinData<LdStSync , [InstrStage<8, [SLU]>]>,
+ InstrItinData<SprISYNC , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprMFSR , [InstrStage<3, [SRU]>]>,
+ InstrItinData<SprMTMSR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<SprMTSR , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<8, [SRU]>]>,
+ InstrItinData<SprMFCR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<SprMFMSR , [InstrStage<1, [SRU]>]>,
+ InstrItinData<SprMFSPR , [InstrStage<3, [SRU]>]>,
+ InstrItinData<SprMFTB , [InstrStage<1, [SRU]>]>,
+ InstrItinData<SprMTSPR , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprMTSRIN , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
+ InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
+ InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
+ InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
+ InstrItinData<FPFused , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPRes , [InstrStage<10, [FPU1]>]>,
+ InstrItinData<VecGeneral , [InstrStage<1, [VIU1]>]>,
+ InstrItinData<VecFP , [InstrStage<4, [VFPU]>]>,
+ InstrItinData<VecFPCompare, [InstrStage<1, [VIU1]>]>,
+ InstrItinData<VecComplex , [InstrStage<3, [VIU2]>]>,
+ InstrItinData<VecPerm , [InstrStage<1, [VPU]>]>,
+ InstrItinData<VecFPRound , [InstrStage<4, [VFPU]>]>,
+ InstrItinData<VecVSL , [InstrStage<1, [VIU1]>]>,
+ InstrItinData<VecVSR , [InstrStage<1, [VIU1]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
new file mode 100644
index 0000000..7474ba4
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -0,0 +1,76 @@
+//===- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4+ (7450) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4PlusItineraries : ProcessorItineraries<[
+ InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+ InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+ InstrItinData<IntDivW , [InstrStage<23, [IU2]>]>,
+ InstrItinData<IntMFFS , [InstrStage<5, [FPU1]>]>,
+ InstrItinData<IntMFVSCR , [InstrStage<2, [VFPU]>]>,
+ InstrItinData<IntMTFSB0 , [InstrStage<5, [FPU1]>]>,
+ InstrItinData<IntMulHW , [InstrStage<4, [IU2]>]>,
+ InstrItinData<IntMulHWU , [InstrStage<4, [IU2]>]>,
+ InstrItinData<IntMulLI , [InstrStage<3, [IU2]>]>,
+ InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+ InstrItinData<IntShift , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
+ InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
+ InstrItinData<BrB , [InstrStage<1, [BPU]>]>,
+ InstrItinData<BrCR , [InstrStage<2, [IU2]>]>,
+ InstrItinData<BrMCR , [InstrStage<2, [IU2]>]>,
+ InstrItinData<BrMCRX , [InstrStage<2, [IU2]>]>,
+ InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStDSS , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStICBI , [InstrStage<3, [IU2]>]>,
+ InstrItinData<LdStUX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLFD , [InstrStage<4, [SLU]>]>,
+ InstrItinData<LdStLFDU , [InstrStage<4, [SLU]>]>,
+ InstrItinData<LdStLHA , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLMW , [InstrStage<37, [SLU]>]>,
+ InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLWA , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTDCX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTVEBX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTWCX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>,
+ InstrItinData<SprISYNC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
+ InstrItinData<SprMFSR , [InstrStage<4, [IU2]>]>,
+ InstrItinData<SprMTMSR , [InstrStage<2, [IU2]>]>,
+ InstrItinData<SprMTSR , [InstrStage<2, [IU2]>]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<3, [SLU]>]>,
+ InstrItinData<SprMFCR , [InstrStage<2, [IU2]>]>,
+ InstrItinData<SprMFMSR , [InstrStage<3, [IU2]>]>,
+ InstrItinData<SprMFSPR , [InstrStage<4, [IU2]>]>,
+ InstrItinData<SprMFTB , [InstrStage<5, [IU2]>]>,
+ InstrItinData<SprMTSPR , [InstrStage<2, [IU2]>]>,
+ InstrItinData<SprMTSRIN , [InstrStage<2, [IU2]>]>,
+ InstrItinData<SprRFI , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+ InstrItinData<SprSC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
+ InstrItinData<FPGeneral , [InstrStage<5, [FPU1]>]>,
+ InstrItinData<FPCompare , [InstrStage<5, [FPU1]>]>,
+ InstrItinData<FPDivD , [InstrStage<35, [FPU1]>]>,
+ InstrItinData<FPDivS , [InstrStage<21, [FPU1]>]>,
+ InstrItinData<FPFused , [InstrStage<5, [FPU1]>]>,
+ InstrItinData<FPRes , [InstrStage<14, [FPU1]>]>,
+ InstrItinData<VecGeneral , [InstrStage<1, [VIU1]>]>,
+ InstrItinData<VecFP , [InstrStage<4, [VFPU]>]>,
+ InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
+ InstrItinData<VecComplex , [InstrStage<4, [VIU2]>]>,
+ InstrItinData<VecPerm , [InstrStage<2, [VPU]>]>,
+ InstrItinData<VecFPRound , [InstrStage<4, [VIU1]>]>,
+ InstrItinData<VecVSL , [InstrStage<2, [VPU]>]>,
+ InstrItinData<VecVSR , [InstrStage<2, [VPU]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
new file mode 100644
index 0000000..d282147
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -0,0 +1,83 @@
+//===- PPCScheduleG5.td - PPC G5 Scheduling Definitions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G5 (970) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G5Itineraries : ProcessorItineraries<[
+ InstrItinData<IntGeneral , [InstrStage<2, [IU1, IU2]>]>,
+ InstrItinData<IntCompare , [InstrStage<3, [IU1, IU2]>]>,
+ InstrItinData<IntDivD , [InstrStage<68, [IU1]>]>,
+ InstrItinData<IntDivW , [InstrStage<36, [IU1]>]>,
+ InstrItinData<IntMFFS , [InstrStage<6, [IU2]>]>,
+ InstrItinData<IntMFVSCR , [InstrStage<1, [VFPU]>]>,
+ InstrItinData<IntMTFSB0 , [InstrStage<6, [FPU1, FPU2]>]>,
+ InstrItinData<IntMulHD , [InstrStage<7, [IU1, IU2]>]>,
+ InstrItinData<IntMulHW , [InstrStage<5, [IU1, IU2]>]>,
+ InstrItinData<IntMulHWU , [InstrStage<5, [IU1, IU2]>]>,
+ InstrItinData<IntMulLI , [InstrStage<4, [IU1, IU2]>]>,
+ InstrItinData<IntRFID , [InstrStage<1, [IU2]>]>,
+ InstrItinData<IntRotateD , [InstrStage<2, [IU1, IU2]>]>,
+ InstrItinData<IntRotate , [InstrStage<4, [IU1, IU2]>]>,
+ InstrItinData<IntShift , [InstrStage<2, [IU1, IU2]>]>,
+ InstrItinData<IntTrapD , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<IntTrapW , [InstrStage<1, [IU1, IU2]>]>,
+ InstrItinData<BrB , [InstrStage<1, [BPU]>]>,
+ InstrItinData<BrCR , [InstrStage<4, [BPU]>]>,
+ InstrItinData<BrMCR , [InstrStage<2, [BPU]>]>,
+ InstrItinData<BrMCRX , [InstrStage<3, [BPU]>]>,
+ InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStDSS , [InstrStage<10, [SLU]>]>,
+ InstrItinData<LdStICBI , [InstrStage<40, [SLU]>]>,
+ InstrItinData<LdStUX , [InstrStage<4, [SLU]>]>,
+ InstrItinData<LdStLD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLDARX , [InstrStage<11, [SLU]>]>,
+ InstrItinData<LdStLFD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLFDU , [InstrStage<5, [SLU]>]>,
+ InstrItinData<LdStLHA , [InstrStage<5, [SLU]>]>,
+ InstrItinData<LdStLMW , [InstrStage<64, [SLU]>]>,
+ InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLWA , [InstrStage<5, [SLU]>]>,
+ InstrItinData<LdStLWARX , [InstrStage<11, [SLU]>]>,
+ InstrItinData<LdStSLBIA , [InstrStage<40, [SLU]>]>, // needs work
+ InstrItinData<LdStSLBIE , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTDCX , [InstrStage<11, [SLU]>]>,
+ InstrItinData<LdStSTVEBX , [InstrStage<5, [SLU]>]>,
+ InstrItinData<LdStSTWCX , [InstrStage<11, [SLU]>]>,
+ InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>,
+ InstrItinData<SprISYNC , [InstrStage<40, [SLU]>]>, // needs work
+ InstrItinData<SprMFSR , [InstrStage<3, [SLU]>]>,
+ InstrItinData<SprMTMSR , [InstrStage<3, [SLU]>]>,
+ InstrItinData<SprMTSR , [InstrStage<3, [SLU]>]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<3, [SLU]>]>,
+ InstrItinData<SprMFCR , [InstrStage<2, [IU2]>]>,
+ InstrItinData<SprMFMSR , [InstrStage<3, [IU2]>]>,
+ InstrItinData<SprMFSPR , [InstrStage<3, [IU2]>]>,
+ InstrItinData<SprMFTB , [InstrStage<10, [IU2]>]>,
+ InstrItinData<SprMTSPR , [InstrStage<8, [IU2]>]>,
+ InstrItinData<SprSC , [InstrStage<1, [IU2]>]>,
+ InstrItinData<FPGeneral , [InstrStage<6, [FPU1, FPU2]>]>,
+ InstrItinData<FPCompare , [InstrStage<8, [FPU1, FPU2]>]>,
+ InstrItinData<FPDivD , [InstrStage<33, [FPU1, FPU2]>]>,
+ InstrItinData<FPDivS , [InstrStage<33, [FPU1, FPU2]>]>,
+ InstrItinData<FPFused , [InstrStage<6, [FPU1, FPU2]>]>,
+ InstrItinData<FPRes , [InstrStage<6, [FPU1, FPU2]>]>,
+ InstrItinData<FPSqrt , [InstrStage<40, [FPU1, FPU2]>]>,
+ InstrItinData<VecGeneral , [InstrStage<2, [VIU1]>]>,
+ InstrItinData<VecFP , [InstrStage<8, [VFPU]>]>,
+ InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
+ InstrItinData<VecComplex , [InstrStage<5, [VIU2]>]>,
+ InstrItinData<VecPerm , [InstrStage<3, [VPU]>]>,
+ InstrItinData<VecFPRound , [InstrStage<8, [VFPU]>]>,
+ InstrItinData<VecVSL , [InstrStage<2, [VIU1]>]>,
+ InstrItinData<VecVSR , [InstrStage<3, [VPU]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
new file mode 100644
index 0000000..425d8e6
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -0,0 +1,152 @@
+//===- PPCSubtarget.cpp - PPC Subtarget Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCSubtarget.h"
+#include "PPC.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "PPCGenSubtarget.inc"
+#include <cstdlib>
+using namespace llvm;
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_host.h>
+#include <mach/host_info.h>
+#include <mach/machine.h>
+
+/// GetCurrentPowerPCCPU - Returns the name of the host CPU.
+static const char *GetCurrentPowerPCCPU() {
+ host_basic_info_data_t hostInfo;
+ mach_msg_type_number_t infoCount;
+
+ infoCount = HOST_BASIC_INFO_COUNT;
+ host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo,
+ &infoCount);
+
+ if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic";
+
+ switch(hostInfo.cpu_subtype) {
+ case CPU_SUBTYPE_POWERPC_601: return "601";
+ case CPU_SUBTYPE_POWERPC_602: return "602";
+ case CPU_SUBTYPE_POWERPC_603: return "603";
+ case CPU_SUBTYPE_POWERPC_603e: return "603e";
+ case CPU_SUBTYPE_POWERPC_603ev: return "603ev";
+ case CPU_SUBTYPE_POWERPC_604: return "604";
+ case CPU_SUBTYPE_POWERPC_604e: return "604e";
+ case CPU_SUBTYPE_POWERPC_620: return "620";
+ case CPU_SUBTYPE_POWERPC_750: return "750";
+ case CPU_SUBTYPE_POWERPC_7400: return "7400";
+ case CPU_SUBTYPE_POWERPC_7450: return "7450";
+ case CPU_SUBTYPE_POWERPC_970: return "970";
+ default: ;
+ }
+
+ return "generic";
+}
+#endif
+
+
+PPCSubtarget::PPCSubtarget(const TargetMachine &tm, const Module &M,
+ const std::string &FS, bool is64Bit)
+ : TM(tm)
+ , StackAlignment(16)
+ , DarwinDirective(PPC::DIR_NONE)
+ , IsGigaProcessor(false)
+ , Has64BitSupport(false)
+ , Use64BitRegs(false)
+ , IsPPC64(is64Bit)
+ , HasAltivec(false)
+ , HasFSQRT(false)
+ , HasSTFIWX(false)
+ , HasLazyResolverStubs(false)
+ , DarwinVers(0) {
+
+ // Determine default and user specified characteristics
+ std::string CPU = "generic";
+#if defined(__APPLE__)
+ CPU = GetCurrentPowerPCCPU();
+#endif
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+
+ // If we are generating code for ppc64, verify that options make sense.
+ if (is64Bit) {
+ Has64BitSupport = true;
+ // Silently force 64-bit register use on ppc64.
+ Use64BitRegs = true;
+ }
+
+ // If the user requested use of 64-bit regs, but the cpu selected doesn't
+ // support it, ignore.
+ if (use64BitRegs() && !has64BitSupport())
+ Use64BitRegs = false;
+
+ // Determine the Darwin version from the target triple, or fall back to a
+ // default if it cannot be determined.
+ const std::string &TT = M.getTargetTriple();
+ if (TT.length() > 7) {
+ // Determine which version of darwin this is.
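+ // For example, a triple of "powerpc-apple-darwin8" yields DarwinVers == 8.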
+ size_t DarwinPos = TT.find("-darwin");
+ if (DarwinPos != std::string::npos) {
+ if (isdigit(TT[DarwinPos+7]))
+ DarwinVers = atoi(&TT[DarwinPos+7]);
+ else
+ DarwinVers = 8; // Minimum supported darwin is Tiger.
+ }
+ } else if (TT.empty()) {
+ // Try to autosense the subtarget from the host compiler.
+#if defined(__APPLE__)
+#if __APPLE_CC__ > 5400
+ DarwinVers = 9; // GCC 5400+ is Leopard.
+#else
+ DarwinVers = 8; // Minimum supported darwin is Tiger.
+#endif
+#endif
+ }
+
+ // Set up darwin-specific properties.
+ if (isDarwin()) {
+ HasLazyResolverStubs = true;
+ AsmFlavor = NewMnemonic;
+ } else {
+ AsmFlavor = OldMnemonic;
+ }
+}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void PPCSubtarget::SetJITMode() {
+ // JIT mode doesn't want lazy resolver stubs, it knows exactly where
+ // everything is. This matters for PPC64, which codegens in PIC mode without
+ // stubs.
+ HasLazyResolverStubs = false;
+}
+
+
+/// hasLazyResolverStub - Return true if accesses to the specified global have
+/// to go through a dyld lazy resolution stub. This means that an extra load
+/// is required to get the address of the global.
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
+ // We never have stubs if HasLazyResolverStubs=false or if in static mode.
+ if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
+ return false;
+ // If symbol visibility is hidden, the extra load is not needed if
+ // the symbol is definitely defined in the current translation unit.
+ bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
+ if (GV->hasHiddenVisibility() && !isDecl && !GV->hasCommonLinkage())
+ return false;
+ return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+ GV->hasCommonLinkage() || isDecl;
+}
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
new file mode 100644
index 0000000..176f3e1
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -0,0 +1,160 @@
+//=====-- PPCSubtarget.h - Define Subtarget for the PPC -------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPCSUBTARGET_H
+#define POWERPCSUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+
+namespace PPC {
+ // -m directive values.
+ enum {
+ DIR_NONE,
+ DIR_32,
+ DIR_601,
+ DIR_602,
+ DIR_603,
+ DIR_7400,
+ DIR_750,
+ DIR_970,
+ DIR_64
+ };
+}
+
+class Module;
+class GlobalValue;
+class TargetMachine;
+
+class PPCSubtarget : public TargetSubtarget {
+public:
+ enum AsmWriterFlavorTy {
+ OldMnemonic, NewMnemonic, Unset
+ };
+protected:
+ const TargetMachine &TM;
+
+ /// stackAlignment - The minimum stack frame alignment known to hold on entry
+ /// to the function and which must be maintained by every function.
+ unsigned StackAlignment;
+
+ /// Selected instruction itineraries (one entry per itinerary class.)
+ InstrItineraryData InstrItins;
+
+ /// Which cpu directive was used.
+ unsigned DarwinDirective;
+
+ /// AsmFlavor - Which PPC asm dialect to use.
+ AsmWriterFlavorTy AsmFlavor;
+
+ /// Used by the ISel to turn on optimizations for POWER4-derived architectures.
+ bool IsGigaProcessor;
+ bool Has64BitSupport;
+ bool Use64BitRegs;
+ bool IsPPC64;
+ bool HasAltivec;
+ bool HasFSQRT;
+ bool HasSTFIWX;
+ bool HasLazyResolverStubs;
+
+ /// DarwinVers - Zero if this is not a darwin platform; otherwise, the numeric
+ /// version of the darwin platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard),
+ /// etc.
+ unsigned char DarwinVers;
+public:
+ /// This constructor initializes the data members to match those
+ /// of the specified module.
+ ///
+ PPCSubtarget(const TargetMachine &TM, const Module &M,
+ const std::string &FS, bool is64Bit);
+
+ /// ParseSubtargetFeatures - Parses the features string, setting the
+ /// specified subtarget options. The definition of this function is
+ /// auto-generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+
+ /// SetJITMode - This is called to inform the subtarget info that we are
+ /// producing code for the JIT.
+ void SetJITMode();
+
+ /// getStackAlignment - Returns the minimum stack frame alignment known to
+ /// hold on entry to the function and which must be maintained by every
+ /// function for this subtarget.
+ unsigned getStackAlignment() const { return StackAlignment; }
+
+ /// getDarwinDirective - Returns the -m directive specified for the cpu.
+ ///
+ unsigned getDarwinDirective() const { return DarwinDirective; }
+
+ /// getInstrItineraryData - Return the instruction itineraries based on the
+ /// subtarget selection.
+ const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+ /// getTargetDataString - Return the pointer size and type alignment
+ /// properties of this subtarget.
+ const char *getTargetDataString() const {
+ // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+ // documentation are wrong; these are correct (i.e. "what gcc does").
+ return isPPC64() ? "E-p:64:64-f64:64:64-i64:64:64-f128:64:128"
+ : "E-p:32:32-f64:32:64-i64:32:64-f128:64:128";
+ }
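+ // (Reading the string: "E" = big-endian; "p:32:32" = 32-bit pointers with
+ // 32-bit alignment; "f64:32:64" = f64 has 32-bit ABI alignment but 64-bit
+ // preferred alignment; the i64 and f128 entries read the same way.)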
+
+ /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
+ ///
+ bool isPPC64() const { return IsPPC64; }
+
+ /// has64BitSupport - Return true if the selected CPU supports 64-bit
+ /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
+ bool has64BitSupport() const { return Has64BitSupport; }
+
+ /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
+ /// registers in 32-bit mode when possible. This can only be true if
+ /// has64BitSupport() returns true.
+ bool use64BitRegs() const { return Use64BitRegs; }
+
+ /// hasLazyResolverStub - Return true if accesses to the specified global have
+ /// to go through a dyld lazy resolution stub. This means that an extra load
+ /// is required to get the address of the global.
+ bool hasLazyResolverStub(const GlobalValue *GV) const;
+
+ // Specific obvious features.
+ bool hasFSQRT() const { return HasFSQRT; }
+ bool hasSTFIWX() const { return HasSTFIWX; }
+ bool hasAltivec() const { return HasAltivec; }
+ bool isGigaProcessor() const { return IsGigaProcessor; }
+
+ /// isDarwin - True if this is any darwin platform.
+ bool isDarwin() const { return DarwinVers != 0; }
+ /// isDarwin9 - True if this is darwin9 (leopard, 10.5) or above.
+ bool isDarwin9() const { return DarwinVers >= 9; }
+
+ /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard.
+ unsigned getDarwinVers() const { return DarwinVers; }
+
+ bool isMachoABI() const { return isDarwin() || IsPPC64; }
+ bool isELF32_ABI() const { return !isDarwin() && !IsPPC64; }
+
+ unsigned getAsmFlavor() const {
+ return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
+ }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetAsmInfo.cpp b/lib/Target/PowerPC/PPCTargetAsmInfo.cpp
new file mode 100644
index 0000000..c69e591
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetAsmInfo.cpp
@@ -0,0 +1,161 @@
+//===-- PPCTargetAsmInfo.cpp - PPC asm properties ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the PPC-specific TargetAsmInfo
+// properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetAsmInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Dwarf.h"
+
+using namespace llvm;
+using namespace llvm::dwarf;
+
+PPCDarwinTargetAsmInfo::PPCDarwinTargetAsmInfo(const PPCTargetMachine &TM):
+ PPCTargetAsmInfo<DarwinTargetAsmInfo>(TM) {
+ PCSymbol = ".";
+ CommentString = ";";
+ GlobalPrefix = "_";
+ PrivateGlobalPrefix = "L";
+ LessPrivateGlobalPrefix = "l";
+ StringConstantPrefix = "\1LC";
+ ConstantPoolSection = "\t.const\t";
+ JumpTableDataSection = ".const";
+ CStringSection = "\t.cstring";
+ if (TM.getRelocationModel() == Reloc::Static) {
+ StaticCtorsSection = ".constructor";
+ StaticDtorsSection = ".destructor";
+ } else {
+ StaticCtorsSection = ".mod_init_func";
+ StaticDtorsSection = ".mod_term_func";
+ }
+ HasSingleParameterDotFile = false;
+ SwitchToSectionDirective = "\t.section ";
+ UsedDirective = "\t.no_dead_strip\t";
+ WeakDefDirective = "\t.weak_definition ";
+ WeakRefDirective = "\t.weak_reference ";
+ HiddenDirective = "\t.private_extern ";
+ SupportsExceptionHandling = true;
+ NeedsIndirectEncoding = true;
+ NeedsSet = true;
+ BSSSection = 0;
+
+ DwarfEHFrameSection =
+ ".section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support";
+ DwarfExceptionSection = ".section __DATA,__gcc_except_tab";
+ GlobalEHDirective = "\t.globl\t";
+ SupportsWeakOmittedEHFrame = false;
+
+ DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+ DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+ DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+ DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+ DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+ DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+ DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+ DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+ DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+ DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+ DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+
+ // In non-PIC modes, emit a special label before jump tables so that the
+ // linker can perform more accurate dead code stripping.
+ if (TM.getRelocationModel() != Reloc::PIC_) {
+ // Emit a local label that is preserved until the linker runs.
+ JumpTableSpecialLabelPrefix = "l";
+ }
+}
+
+/// PreferredEHDataFormat - This hook allows the target to select data
+/// format used for encoding pointers in exception handling data. Reason is
+/// 0 for data, 1 for code labels, 2 for function pointers. Global is true
+/// if the symbol can be relocated.
+unsigned
+PPCDarwinTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ if (Reason == DwarfEncoding::Functions && Global)
+ return (DW_EH_PE_pcrel | DW_EH_PE_indirect | DW_EH_PE_sdata4);
+ else if (Reason == DwarfEncoding::CodeLabels || !Global)
+ return DW_EH_PE_pcrel;
+ else
+ return DW_EH_PE_absptr;
+}
+
+const char *
+PPCDarwinTargetAsmInfo::getEHGlobalPrefix() const
+{
+ const PPCSubtarget* Subtarget = &TM.getSubtarget<PPCSubtarget>();
+ if (Subtarget->getDarwinVers() > 9)
+ return PrivateGlobalPrefix;
+ else
+ return "";
+}
+
+PPCLinuxTargetAsmInfo::PPCLinuxTargetAsmInfo(const PPCTargetMachine &TM) :
+ PPCTargetAsmInfo<ELFTargetAsmInfo>(TM) {
+ CommentString = "#";
+ GlobalPrefix = "";
+ PrivateGlobalPrefix = ".L";
+ ConstantPoolSection = "\t.section .rodata.cst4\t";
+ JumpTableDataSection = ".section .rodata.cst4";
+ CStringSection = ".rodata.str";
+ StaticCtorsSection = ".section\t.ctors,\"aw\",@progbits";
+ StaticDtorsSection = ".section\t.dtors,\"aw\",@progbits";
+ UsedDirective = "\t# .no_dead_strip\t";
+ WeakRefDirective = "\t.weak\t";
+ BSSSection = "\t.section\t\".sbss\",\"aw\",@nobits";
+
+ // PPC/Linux normally uses a named section for BSS.
+ BSSSection_ = getNamedSection("\t.bss",
+ SectionFlags::Writeable | SectionFlags::BSS,
+ /* Override */ true);
+
+ // Debug Information
+ AbsoluteDebugSectionOffsets = true;
+ SupportsDebugInformation = true;
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"\",@progbits";
+ DwarfInfoSection = "\t.section\t.debug_info,\"\",@progbits";
+ DwarfLineSection = "\t.section\t.debug_line,\"\",@progbits";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"\",@progbits";
+ DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",@progbits";
+ DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",@progbits";
+ DwarfStrSection = "\t.section\t.debug_str,\"\",@progbits";
+ DwarfLocSection = "\t.section\t.debug_loc,\"\",@progbits";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"\",@progbits";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"\",@progbits";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",@progbits";
+
+ PCSymbol = ".";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+
+ // Exceptions handling
+ if (!TM.getSubtargetImpl()->isPPC64())
+ SupportsExceptionHandling = true;
+ AbsoluteEHSectionOffsets = false;
+ DwarfEHFrameSection = "\t.section\t.eh_frame,\"aw\",@progbits";
+ DwarfExceptionSection = "\t.section\t.gcc_except_table,\"a\",@progbits";
+}
+
+/// PreferredEHDataFormat - This hook allows the target to select data
+/// format used for encoding pointers in exception handling data. Reason is
+/// 0 for data, 1 for code labels, 2 for function pointers. Global is true
+/// if the symbol can be relocated.
+unsigned
+PPCLinuxTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ // We really need to write something here.
+ return TargetAsmInfo::PreferredEHDataFormat(Reason, Global);
+}
+
+// Instantiate default implementation.
+TEMPLATE_INSTANTIATION(class PPCTargetAsmInfo<TargetAsmInfo>);
diff --git a/lib/Target/PowerPC/PPCTargetAsmInfo.h b/lib/Target/PowerPC/PPCTargetAsmInfo.h
new file mode 100644
index 0000000..edf40c9
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetAsmInfo.h
@@ -0,0 +1,62 @@
+//=====-- PPCTargetAsmInfo.h - PPC asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the PPC-specific TargetAsmInfo
+// classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETASMINFO_H
+#define PPCTARGETASMINFO_H
+
+#include "PPCTargetMachine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/DarwinTargetAsmInfo.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+ template <class BaseTAI>
+ struct PPCTargetAsmInfo : public BaseTAI {
+ explicit PPCTargetAsmInfo(const PPCTargetMachine &TM):
+ BaseTAI(TM) {
+ const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
+ bool isPPC64 = Subtarget->isPPC64();
+
+ BaseTAI::ZeroDirective = "\t.space\t";
+ BaseTAI::SetDirective = "\t.set";
+ BaseTAI::Data64bitsDirective = isPPC64 ? "\t.quad\t" : 0;
+ BaseTAI::AlignmentIsInBytes = false;
+ BaseTAI::LCOMMDirective = "\t.lcomm\t";
+ BaseTAI::InlineAsmStart = "# InlineAsm Start";
+ BaseTAI::InlineAsmEnd = "# InlineAsm End";
+ BaseTAI::AssemblerDialect = Subtarget->getAsmFlavor();
+ }
+ };
+
+ typedef PPCTargetAsmInfo<TargetAsmInfo> PPCGenericTargetAsmInfo;
+
+ EXTERN_TEMPLATE_INSTANTIATION(class PPCTargetAsmInfo<TargetAsmInfo>);
+
+ struct PPCDarwinTargetAsmInfo : public PPCTargetAsmInfo<DarwinTargetAsmInfo> {
+ explicit PPCDarwinTargetAsmInfo(const PPCTargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ virtual const char *getEHGlobalPrefix() const;
+ };
+
+ struct PPCLinuxTargetAsmInfo : public PPCTargetAsmInfo<ELFTargetAsmInfo> {
+ explicit PPCLinuxTargetAsmInfo(const PPCTargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
new file mode 100644
index 0000000..ef3f0fc
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -0,0 +1,250 @@
+//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCTargetAsmInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// PowerPCTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int PowerPCTargetMachineModule;
+int PowerPCTargetMachineModule = 0;
+
+// Register the targets
+static RegisterTarget<PPC32TargetMachine>
+X("ppc32", "PowerPC 32");
+static RegisterTarget<PPC64TargetMachine>
+Y("ppc64", "PowerPC 64");
+
+// No assembler printer by default
+PPCTargetMachine::AsmPrinterCtorFn PPCTargetMachine::AsmPrinterCtor = 0;
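+
+// When the asmprinter library is linked in, it is expected to register itself
+// through this hook from a static constructor, roughly like so (an
+// illustrative sketch; the real registration lives in the PPC asmprinter
+// sources, and createPPCAsmPrinterPass names whatever factory it provides):
+//
+//   namespace {
+//     struct RegisterPPCAsmPrinter {
+//       RegisterPPCAsmPrinter() {
+//         PPCTargetMachine::registerAsmPrinter(createPPCAsmPrinterPass);
+//       }
+//     } Registrar;
+//   }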
+
+const TargetAsmInfo *PPCTargetMachine::createTargetAsmInfo() const {
+ if (Subtarget.isDarwin())
+ return new PPCDarwinTargetAsmInfo(*this);
+ else
+ return new PPCLinuxTargetAsmInfo(*this);
+}
+
+unsigned PPC32TargetMachine::getJITMatchQuality() {
+#if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) || defined(__PPC__)
+ if (sizeof(void*) == 4)
+ return 10;
+#endif
+ return 0;
+}
+unsigned PPC64TargetMachine::getJITMatchQuality() {
+#if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) || defined(__PPC__)
+ if (sizeof(void*) == 8)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned PPC32TargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "powerpc-*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 8 && std::string(TT.begin(), TT.begin()+8) == "powerpc-")
+ return 20;
+
+ // If the target triple is something non-powerpc, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::BigEndian &&
+ M.getPointerSize() == Module::Pointer32)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+unsigned PPC64TargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "powerpc64-*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 10 && std::string(TT.begin(), TT.begin()+10) == "powerpc64-")
+ return 20;
+
+ if (M.getEndianness() == Module::BigEndian &&
+ M.getPointerSize() == Module::Pointer64)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+
+PPCTargetMachine::PPCTargetMachine(const Module &M, const std::string &FS,
+ bool is64Bit)
+ : Subtarget(*this, M, FS, is64Bit),
+ DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this),
+ FrameInfo(*this, is64Bit), JITInfo(*this, is64Bit), TLInfo(*this),
+ InstrItins(Subtarget.getInstrItineraryData()), MachOWriterInfo(*this) {
+
+ if (getRelocationModel() == Reloc::Default) {
+ if (Subtarget.isDarwin())
+ setRelocationModel(Reloc::DynamicNoPIC);
+ else
+ setRelocationModel(Reloc::Static);
+ }
+}
+
+/// Override this for PowerPC. Tail merging happily breaks up instruction issue
+/// groups, which typically degrades performance.
+bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; }
+
+PPC32TargetMachine::PPC32TargetMachine(const Module &M, const std::string &FS)
+ : PPCTargetMachine(M, FS, false) {
+}
+
+
+PPC64TargetMachine::PPC64TargetMachine(const Module &M, const std::string &FS)
+ : PPCTargetMachine(M, FS, true) {
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool PPCTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Install an instruction selector.
+ PM.add(createPPCISelDag(*this));
+ return false;
+}
+
+bool PPCTargetMachine::addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Must run branch selection immediately preceding the asm printer.
+ PM.add(createPPCBranchSelectionPass());
+ return false;
+}
+
+bool PPCTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(Out, *this, OptLevel, Verbose));
+
+ return false;
+}
+
+bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE) {
+ // The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
+ // FIXME: This should be moved to TargetJITInfo!!
+ if (Subtarget.isPPC64()) {
+ // We use PIC codegen in ppc64 mode, because otherwise we'd have to use many
+ // instructions to materialize arbitrary global variable + function +
+ // constant pool addresses.
+ setRelocationModel(Reloc::PIC_);
+ // Temporary workaround for the inability of PPC64 JIT to handle jump
+ // tables.
+ DisableJumpTables = true;
+ } else {
+ setRelocationModel(Reloc::Static);
+ }
+
+ // Inform the subtarget that we are in JIT mode. FIXME: does this break macho
+ // writing?
+ Subtarget.SetJITMode();
+
+ // Machine code emitter pass for PowerPC.
+ PM.add(createPPCCodeEmitterPass(*this, MCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE) {
+ // The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
+ // FIXME: This should be moved to TargetJITInfo!!
+ if (Subtarget.isPPC64()) {
+ // We use PIC codegen in ppc64 mode, because otherwise we'd have to use many
+ // instructions to materialize arbitrary global variable + function +
+ // constant pool addresses.
+ setRelocationModel(Reloc::PIC_);
+ // Temporary workaround for the inability of PPC64 JIT to handle jump
+ // tables.
+ DisableJumpTables = true;
+ } else {
+ setRelocationModel(Reloc::Static);
+ }
+
+ // Inform the subtarget that we are in JIT mode. FIXME: does this break macho
+ // writing?
+ Subtarget.SetJITMode();
+
+ // Machine code emitter pass for PowerPC.
+ PM.add(createPPCJITCodeEmitterPass(*this, JCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool PPCTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE) {
+ // Machine code emitter pass for PowerPC.
+ PM.add(createPPCCodeEmitterPass(*this, MCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool PPCTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE) {
+ // Machine code emitter pass for PowerPC.
+ PM.add(createPPCJITCodeEmitterPass(*this, JCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
new file mode 100644
index 0000000..086d2f4
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -0,0 +1,120 @@
+//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC -----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_TARGETMACHINE_H
+#define PPC_TARGETMACHINE_H
+
+#include "PPCFrameInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCJITInfo.h"
+#include "PPCInstrInfo.h"
+#include "PPCISelLowering.h"
+#include "PPCMachOWriterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+class PassManager;
+class GlobalValue;
+
+/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets.
+///
+class PPCTargetMachine : public LLVMTargetMachine {
+ PPCSubtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ PPCInstrInfo InstrInfo;
+ PPCFrameInfo FrameInfo;
+ PPCJITInfo JITInfo;
+ PPCTargetLowering TLInfo;
+ InstrItineraryData InstrItins;
+ PPCMachOWriterInfo MachOWriterInfo;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+ // To avoid having the target depend on the asmprinter libraries, the
+ // asmprinter registers itself here at startup time by setting this function
+ // pointer to its ctor, if it is linked in.
+ typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
+ PPCTargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose);
+ static AsmPrinterCtorFn AsmPrinterCtor;
+
+public:
+ PPCTargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+
+ virtual const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const PPCFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual PPCJITInfo *getJITInfo() { return &JITInfo; }
+ virtual PPCTargetLowering *getTargetLowering() const {
+ return const_cast<PPCTargetLowering*>(&TLInfo);
+ }
+ virtual const PPCRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const PPCSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ virtual const InstrItineraryData getInstrItineraryData() const {
+ return InstrItins;
+ }
+ virtual const PPCMachOWriterInfo *getMachOWriterInfo() const {
+ return &MachOWriterInfo;
+ }
+
+ static void registerAsmPrinter(AsmPrinterCtorFn F) {
+ AsmPrinterCtor = F;
+ }
+
+ // Pass Pipeline Configuration
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE);
+ virtual bool getEnableTailMergeDefault() const;
+};
+
+/// PPC32TargetMachine - PowerPC 32-bit target machine.
+///
+class PPC32TargetMachine : public PPCTargetMachine {
+public:
+ PPC32TargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+/// PPC64TargetMachine - PowerPC 64-bit target machine.
+///
+class PPC64TargetMachine : public PPCTargetMachine {
+public:
+ PPC64TargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
new file mode 100644
index 0000000..688fb30
--- /dev/null
+++ b/lib/Target/PowerPC/README.txt
@@ -0,0 +1,799 @@
+//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
+
+TODO:
+* gpr0 allocation
+* implement do-loop -> bdnz transform
+* lmw/stmw pass a la arm load store optimizer for prolog/epilog
+
+===-------------------------------------------------------------------------===
+
+Support 'update' load/store instructions. These are cracked on the G5, but are
+still a codesize win.
+
+With preinc enabled, this:
+
+long *%test4(long *%X, long *%dest) {
+ %Y = getelementptr long* %X, int 4
+ %A = load long* %Y
+ store long %A, long* %dest
+ ret long* %Y
+}
+
+compiles to:
+
+_test4:
+ mr r2, r3
+ lwzu r5, 32(r2)
+ lwz r3, 36(r3)
+ stw r5, 0(r4)
+ stw r3, 4(r4)
+ mr r3, r2
+ blr
+
+with -sched=list-burr, I get:
+
+_test4:
+ lwz r2, 36(r3)
+ lwzu r5, 32(r3)
+ stw r2, 4(r4)
+ stw r5, 0(r4)
+ blr
+
+===-------------------------------------------------------------------------===
+
+We compile the hottest inner loop of viterbi to:
+
+ li r6, 0
+ b LBB1_84 ;bb432.i
+LBB1_83: ;bb420.i
+ lbzx r8, r5, r7
+ addi r6, r7, 1
+ stbx r8, r4, r7
+LBB1_84: ;bb432.i
+ mr r7, r6
+ cmplwi cr0, r7, 143
+ bne cr0, LBB1_83 ;bb420.i
+
+The CBE manages to produce:
+
+ li r0, 143
+ mtctr r0
+loop:
+ lbzx r2, r2, r11
+ stbx r0, r2, r9
+ addi r2, r2, 1
+ bdz later
+ b loop
+
+This could be much better (bdnz instead of bdz) but it still beats us. If we
+produced this with bdnz, the loop would be a single dispatch group.
+
+===-------------------------------------------------------------------------===
+
+Compile:
+
+void foo(int *P) {
+ if (P) *P = 0;
+}
+
+into:
+
+_foo:
+ cmpwi cr0,r3,0
+ beqlr cr0
+ li r0,0
+ stw r0,0(r3)
+ blr
+
+This is effectively a simple form of predication.
+
+===-------------------------------------------------------------------------===
+
+Lump the constant pool for each function into ONE pic object, and reference
+pieces of it as offsets from the start. For functions like this (contrived
+to have lots of constants obviously):
+
+double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
+
+We generate:
+
+_X:
+ lis r2, ha16(.CPI_X_0)
+ lfd f0, lo16(.CPI_X_0)(r2)
+ lis r2, ha16(.CPI_X_1)
+ lfd f2, lo16(.CPI_X_1)(r2)
+ fmadd f0, f1, f0, f2
+ lis r2, ha16(.CPI_X_2)
+ lfd f1, lo16(.CPI_X_2)(r2)
+ lis r2, ha16(.CPI_X_3)
+ lfd f2, lo16(.CPI_X_3)(r2)
+ fmadd f1, f0, f1, f2
+ blr
+
+It would be better to materialize .CPI_X into a register, then use immediates
+off of the register to avoid the lis's. This is even more important in PIC
+mode.
+
+Note that this (and the static variable version) is discussed here for GCC:
+http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
+
+Here's another example (the sgn function):
+double testf(double a) {
+ return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+it produces a BB like this:
+LBB1_1: ; cond_true
+ lis r2, ha16(LCPI1_0)
+ lfs f0, lo16(LCPI1_0)(r2)
+ lis r2, ha16(LCPI1_1)
+ lis r3, ha16(LCPI1_2)
+ lfs f2, lo16(LCPI1_2)(r3)
+ lfs f3, lo16(LCPI1_1)(r2)
+ fsub f0, f0, f1
+ fsel f1, f0, f2, f3
+ blr
+
+===-------------------------------------------------------------------------===
+
+PIC Code Gen IPO optimization:
+
+Squish small scalar globals together into a single global struct, allowing the
+address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
+of the GOT on targets with one).
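+
+For instance (an illustrative C sketch; the real change would be an IPO pass
+over the module's globals):
+
+static int a, b, c;                     /* three addresses, three PIC loads */
+
+becomes
+
+static struct { int a, b, c; } globals; /* one base address, CSE'd once */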
+
+Note that this is discussed here for GCC:
+http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
+
+===-------------------------------------------------------------------------===
+
+Implement the Newton-Raphson method for improving estimate instructions to the
+correct accuracy, and implement divide as multiply-by-reciprocal when the same
+divisor has more than one use. Itanium will want this too.
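+
+For reference, one refinement step for a reciprocal estimate looks like this
+(a plain C sketch; fres_estimate below is a stand-in for the hardware estimate
+instruction, not a real API):
+
+float refine_recip(float x0, float b) {
+  return x0 * (2.0f - b * x0);  /* x1 = x0*(2 - b*x0); doubles correct bits */
+}
+
+so a/b becomes a * refine_recip(fres_estimate(b), b), iterated until the
+estimate is accurate enough for the type.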
+
+===-------------------------------------------------------------------------===
+
+Compile offsets from allocas:
+
+int *%test() {
+ %X = alloca { int, int }
+ %Y = getelementptr {int,int}* %X, int 0, uint 1
+ ret int* %Y
+}
+
+into a single add, not two:
+
+_test:
+ addi r2, r1, -8
+ addi r3, r2, 4
+ blr
+
+--> important for C++.
+
+===-------------------------------------------------------------------------===
+
+No loads or stores of the constants should be needed:
+
+struct foo { double X, Y; };
+void xxx(struct foo F);
+void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
+
+===-------------------------------------------------------------------------===
+
+Darwin Stub LICM optimization:
+
+Loops like this:
+
+ for (...) bar();
+
+Have to go through an indirect stub if bar is external or linkonce. It would
+be better to compile it as:
+
+ fp = &bar;
+ for (...) fp();
+
+which only computes the address of bar once (instead of each time through the
+stub). This is Darwin specific and would have to be done in the code generator.
+Probably not a win on x86.
+
+===-------------------------------------------------------------------------===
+
+Simple IPO for argument passing, change:
+ void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
+
+the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
+of arguments get assigned to r3 through r10. That is, if you have a function
+foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
+argument bytes for r4 and r5. The trick then would be to shuffle the argument
+order for functions we can internalize so that the maximum number of
+integers/pointers get passed in regs before you see any of the fp arguments.
+
+Instead of implementing this, it would actually probably be easier to just
+implement a PPC fastcc, where we could do whatever we wanted to the CC,
+including having this work sanely.
+
+===-------------------------------------------------------------------------===
+
+Fix Darwin FP-In-Integer Registers ABI
+
+Darwin passes doubles in structures in integer registers, which is very very
+bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
+that percolates these things out of functions.
+
+Check out how horrible this is:
+http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
+
+This is an extension of "interprocedural CC unmunging" that can't be done with
+just fastcc.
+
+===-------------------------------------------------------------------------===
+
+Compile this:
+
+int foo(int a) {
+ int b = (a < 8);
+ if (b) {
+ return b * 3; // ignore the fact that this is always 3.
+ } else {
+ return 2;
+ }
+}
+
+into something not this:
+
+_foo:
+1) cmpwi cr7, r3, 8
+ mfcr r2, 1
+ rlwinm r2, r2, 29, 31, 31
+1) cmpwi cr0, r3, 7
+ bgt cr0, LBB1_2 ; UnifiedReturnBlock
+LBB1_1: ; then
+ rlwinm r2, r2, 0, 31, 31
+ mulli r3, r2, 3
+ blr
+LBB1_2: ; UnifiedReturnBlock
+ li r3, 2
+ blr
+
+In particular, the two compares (marked 1) could be shared by reversing one.
+This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
+same operands (but backwards) exists. In this case, this wouldn't save us
+anything though, because the compares still wouldn't be shared.
+
+===-------------------------------------------------------------------------===
+
+We should custom expand setcc instead of pretending that we have it. That
+would allow us to expose the access of the crbit after the mfcr, allowing
+that access to be trivially folded into other ops. A simple example:
+
+int foo(int a, int b) { return (a < b) << 4; }
+
+compiles into:
+
+_foo:
+ cmpw cr7, r3, r4
+ mfcr r2, 1
+ rlwinm r2, r2, 29, 31, 31
+ slwi r3, r2, 4
+ blr
+
+===-------------------------------------------------------------------------===
+
+Fold add and sub with constant into non-extern, non-weak addresses so this:
+
+static int a;
+void bar(int b) { a = b; }
+void foo(unsigned char *c) {
+ *c = a;
+}
+
+So that
+
+_foo:
+ lis r2, ha16(_a)
+ la r2, lo16(_a)(r2)
+ lbz r2, 3(r2)
+ stb r2, 0(r3)
+ blr
+
+Becomes
+
+_foo:
+ lis r2, ha16(_a+3)
+ lbz r2, lo16(_a+3)(r2)
+ stb r2, 0(r3)
+ blr
+
+===-------------------------------------------------------------------------===
+
+We generate really bad code for this:
+
+int f(signed char *a, _Bool b, _Bool c) {
+ signed char t = 0;
+ if (b) t = *a;
+ if (c) *a = t;
+}
+
+===-------------------------------------------------------------------------===
+
+This:
+int test(unsigned *P) { return *P >> 24; }
+
+Should compile to:
+
+_test:
+ lbz r3,0(r3)
+ blr
+
+not:
+
+_test:
+ lwz r2, 0(r3)
+ srwi r3, r2, 24
+ blr
+
+===-------------------------------------------------------------------------===
+
+On the G5, logical CR operations are more expensive in their three
+address form: ops that read/write the same register are half as expensive as
+those that read from two registers that are different from their destination.
+
+We should model this with two separate instructions. The isel should generate
+the "two address" form of the instructions. When the register allocator
+detects that it needs to insert a copy due to the two-addresness of the CR
+logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
+we can convert to the "three address" instruction, to save code space.
+
+This only matters when we start generating cr logical ops.
+
+===-------------------------------------------------------------------------===
+
+We should compile these two functions to the same thing:
+
+#include <stdlib.h>
+void f(int a, int b, int *P) {
+ *P = (a-b)>=0?(a-b):(b-a);
+}
+void g(int a, int b, int *P) {
+ *P = abs(a-b);
+}
+
+Further, they should compile to something better than:
+
+_g:
+ subf r2, r4, r3
+ subfic r3, r2, 0
+ cmpwi cr0, r2, -1
+ bgt cr0, LBB2_2 ; entry
+LBB2_1: ; entry
+ mr r2, r3
+LBB2_2: ; entry
+ stw r2, 0(r5)
+ blr
+
+GCC produces:
+
+_g:
+ subf r4,r4,r3
+ srawi r2,r4,31
+ xor r0,r2,r4
+ subf r0,r2,r0
+ stw r0,0(r5)
+ blr
+
+... which is much nicer.
+
+This theoretically may help improve twolf slightly (used in dimbox.c:142?).
+
+===-------------------------------------------------------------------------===
+
+int foo(int N, int ***W, int **TK, int X) {
+ int t, i;
+
+ for (t = 0; t < N; ++t)
+ for (i = 0; i < 4; ++i)
+ W[t / X][i][t % X] = TK[i][t];
+
+ return 5;
+}
+
+We generate relatively atrocious code for this loop compared to gcc.
+
+We could also strength reduce the rem and the div:
+http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
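+
+One incremental form that removes the div/rem from every iteration (a plain C
+sketch of what strength reduction would aim for):
+
+int q = 0, r = 0;                /* q == t / X and r == t % X by induction */
+for (t = 0; t < N; ++t) {
+  for (i = 0; i < 4; ++i)
+    W[q][i][r] = TK[i][t];
+  if (++r == X) { r = 0; ++q; }  /* carry the remainder into the quotient */
+}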
+
+===-------------------------------------------------------------------------===
+
+float foo(float X) { return (int)(X); }
+
+Currently produces:
+
+_foo:
+ fctiwz f0, f1
+ stfd f0, -8(r1)
+ lwz r2, -4(r1)
+ extsw r2, r2
+ std r2, -16(r1)
+ lfd f0, -16(r1)
+ fcfid f0, f0
+ frsp f1, f0
+ blr
+
+We could use a target dag combine to turn the lwz/extsw into an lwa when the
+lwz has a single use. Since LWA is cracked anyway, this would be a codesize
+win only.
+
+===-------------------------------------------------------------------------===
+
+We generate ugly code for this:
+
+void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
+ unsigned code = 0;
+ if(dx < -dw) code |= 1;
+ if(dx > dw) code |= 2;
+ if(dy < -dw) code |= 4;
+ if(dy > dw) code |= 8;
+ if(dz < -dw) code |= 16;
+ if(dz > dw) code |= 32;
+ *ret = code;
+}
+
+===-------------------------------------------------------------------------===
+
+Complete the "signed i32 to FP conversion using 64-bit registers"
+transformation (good for PI). See PPCISelLowering.cpp, this comment:
+
+ // FIXME: disable this lowered code. This generates 64-bit register values,
+ // and we don't model the fact that the top part is clobbered by calls. We
+ // need to flag these together so that the value isn't live across a call.
+ //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+Also, if the registers are spilled to the stack, we have to ensure that all
+64 bits of them are saved/restored, otherwise we will miscompile the code. It
+sounds like we need to get the 64-bit register classes going.
+
+===-------------------------------------------------------------------------===
+
+%struct.B = type { i8, [3 x i8] }
+
+define void @bar(%struct.B* %b) {
+entry:
+ %tmp = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
+ %tmp = load i32* %tmp ; <uint> [#uses=1]
+ %tmp3 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
+ %tmp4 = load i32* %tmp3 ; <uint> [#uses=1]
+ %tmp8 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=2]
+ %tmp9 = load i32* %tmp8 ; <uint> [#uses=1]
+ %tmp4.mask17 = shl i32 %tmp4, i8 1 ; <uint> [#uses=1]
+ %tmp1415 = and i32 %tmp4.mask17, 2147483648 ; <uint> [#uses=1]
+ %tmp.masked = and i32 %tmp, 2147483648 ; <uint> [#uses=1]
+ %tmp11 = or i32 %tmp1415, %tmp.masked ; <uint> [#uses=1]
+ %tmp12 = and i32 %tmp9, 2147483647 ; <uint> [#uses=1]
+ %tmp13 = or i32 %tmp12, %tmp11 ; <uint> [#uses=1]
+ store i32 %tmp13, i32* %tmp8
+ ret void
+}
+
+We emit:
+
+_foo:
+ lwz r2, 0(r3)
+ slwi r4, r2, 1
+ or r4, r4, r2
+ rlwimi r2, r4, 0, 0, 0
+ stw r2, 0(r3)
+ blr
+
+We could collapse a bunch of those ORs and ANDs and generate the following
+equivalent code:
+
+_foo:
+ lwz r2, 0(r3)
+ rlwinm r4, r2, 1, 0, 0
+ or r2, r2, r4
+ stw r2, 0(r3)
+ blr
+
+===-------------------------------------------------------------------------===
+
+We compile:
+
+unsigned test6(unsigned x) {
+ return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
+}
+
+into:
+
+_test6:
+ lis r2, 255
+ rlwinm r3, r3, 16, 0, 31
+ ori r2, r2, 255
+ and r3, r3, r2
+ blr
+
+GCC gets it down to:
+
+_test6:
+ rlwinm r0,r3,16,8,15
+ rlwinm r3,r3,16,24,31
+ or r3,r3,r0
+ blr
+
+
+===-------------------------------------------------------------------------===
+
+Consider a function like this:
+
+float foo(float X) { return X + 1234.4123f; }
+
+The FP constant ends up in the constant pool, so we need to get the LR register.
+ This ends up producing code like this:
+
+_foo:
+.LBB_foo_0: ; entry
+ mflr r11
+*** stw r11, 8(r1)
+ bl "L00000$pb"
+"L00000$pb":
+ mflr r2
+ addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
+ lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
+ fadds f1, f1, f0
+*** lwz r11, 8(r1)
+ mtlr r11
+ blr
+
+This is functional, but there is no reason to spill the LR register all the way
+to the stack (the two marked instrs): spilling it to a GPR is quite enough.
+
+Implementing this will require some codegen improvements. Nate writes:
+
+"So basically what we need to support the "no stack frame save and restore" is a
+generalization of the LR optimization to "callee-save regs".
+
+Currently, we have LR marked as a callee-save reg. The register allocator sees
+that it's callee save, and spills it directly to the stack.
+
+Ideally, something like this would happen:
+
+LR would be in a separate register class from the GPRs. The class of LR would be
+marked "unspillable". When the register allocator came across an unspillable
+reg, it would ask "what is the best class to copy this into that I *can* spill"
+If it gets a class back, which it will in this case (the gprs), it grabs a free
+register of that class. If it is then later necessary to spill that reg, so be
+it."
+
+===-------------------------------------------------------------------------===
+
+We compile this:
+int test(_Bool X) {
+ return X ? 524288 : 0;
+}
+
+to:
+_test:
+ cmplwi cr0, r3, 0
+ lis r2, 8
+ li r3, 0
+ beq cr0, LBB1_2 ;entry
+LBB1_1: ;entry
+ mr r3, r2
+LBB1_2: ;entry
+ blr
+
+instead of:
+_test:
+ addic r2,r3,-1
+ subfe r0,r2,r3
+ slwi r3,r0,19
+ blr
+
+This sort of thing occurs a lot due to globalopt.
+
+===-------------------------------------------------------------------------===
+
+We currently compile 32-bit bswap:
+
+declare i32 @llvm.bswap.i32(i32 %A)
+define i32 @test(i32 %A) {
+ %B = call i32 @llvm.bswap.i32(i32 %A)
+ ret i32 %B
+}
+
+to:
+
+_test:
+ rlwinm r2, r3, 24, 16, 23
+ slwi r4, r3, 24
+ rlwimi r2, r3, 8, 24, 31
+ rlwimi r4, r3, 8, 8, 15
+ rlwimi r4, r2, 0, 16, 31
+ mr r3, r4
+ blr
+
+it would be more efficient to produce:
+
+_foo: mr r0,r3
+ rlwinm r3,r3,8,0xffffffff
+ rlwimi r3,r0,24,0,7
+ rlwimi r3,r0,24,16,23
+ blr
+
+===-------------------------------------------------------------------------===
+
+test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
+
+__ZNK4llvm5APInt17countLeadingZerosEv:
+ ld r2, 0(r3)
+ cntlzd r2, r2
+ or r2, r2, r2 <<-- silly.
+ addi r3, r2, -64
+ blr
+
+The dead or is a 'truncate' from 64- to 32-bits.
+
+===-------------------------------------------------------------------------===
+
+We generate horrible ppc code for this:
+
+#define N 2000000
+double a[N],c[N];
+void simpleloop() {
+ int j;
+ for (j=0; j<N; j++)
+ c[j] = a[j];
+}
+
+LBB1_1: ;bb
+ lfdx f0, r3, r4
+ addi r5, r5, 1 ;; Extra IV for the exit value compare.
+ stfdx f0, r2, r4
+ addi r4, r4, 8
+
+ xoris r6, r5, 30 ;; This is due to a large immediate.
+ cmplwi cr0, r6, 33920
+ bne cr0, LBB1_1
+
+//===---------------------------------------------------------------------===//
+
+This:
+ #include <algorithm>
+ inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+ { return std::make_pair(a + b, a + b < a); }
+ bool no_overflow(unsigned a, unsigned b)
+ { return !full_add(a, b).second; }
+
+Should compile to:
+
+__Z11no_overflowjj:
+ add r4,r3,r4
+ subfc r3,r3,r4
+ li r3,0
+ adde r3,r3,r3
+ blr
+
+(or better) not:
+
+__Z11no_overflowjj:
+ add r2, r4, r3
+ cmplw cr7, r2, r3
+ mfcr r2
+ rlwinm r2, r2, 29, 31, 31
+ xori r3, r2, 1
+ blr
+
+//===---------------------------------------------------------------------===//
+
+We compile some FP comparisons into an mfcr with two rlwinms and an or. For
+example:
+#include <math.h>
+int test(double x, double y) { return islessequal(x, y);}
+int test2(double x, double y) { return islessgreater(x, y);}
+int test3(double x, double y) { return !islessequal(x, y);}
+
+Compiles into (all three are similar, but the bits differ):
+
+_test:
+ fcmpu cr7, f1, f2
+ mfcr r2
+ rlwinm r3, r2, 29, 31, 31
+ rlwinm r2, r2, 31, 31, 31
+ or r3, r2, r3
+ blr
+
+GCC compiles this into:
+
+ _test:
+ fcmpu cr7,f1,f2
+ cror 30,28,30
+ mfcr r3
+ rlwinm r3,r3,31,1
+ blr
+
+which is more efficient and can use mfocrf. See PR642 for some more context.
+
+//===---------------------------------------------------------------------===//
+
+void foo(float *data, float d) {
+ long i;
+ for (i = 0; i < 8000; i++)
+ data[i] = d;
+}
+void foo2(float *data, float d) {
+ long i;
+ data--;
+ for (i = 0; i < 8000; i++) {
+ data[1] = d;
+ data++;
+ }
+}
+
+These compile to:
+
+_foo:
+ li r2, 0
+LBB1_1: ; bb
+ addi r4, r2, 4
+ stfsx f1, r3, r2
+ cmplwi cr0, r4, 32000
+ mr r2, r4
+ bne cr0, LBB1_1 ; bb
+ blr
+_foo2:
+ li r2, 0
+LBB2_1: ; bb
+ addi r4, r2, 4
+ stfsx f1, r3, r2
+ cmplwi cr0, r4, 32000
+ mr r2, r4
+ bne cr0, LBB2_1 ; bb
+ blr
+
+The 'mr' could be eliminated by folding the add into the cmp better.
+
+//===---------------------------------------------------------------------===//
+Codegen for the following (low-probability) case deteriorated considerably
+when the correctness fixes for unordered comparisons went in (PR 642, 58871).
+It should be possible to recover the code quality described in the comments.
+
+; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
+; This should produce one 'or' or 'cror' instruction per function.
+
+; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
+; PR2964
+
+define i32 @test(double %x, double %y) nounwind {
+entry:
+ %tmp3 = fcmp ole double %x, %y ; <i1> [#uses=1]
+ %tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ ret i32 %tmp345
+}
+
+define i32 @test2(double %x, double %y) nounwind {
+entry:
+ %tmp3 = fcmp one double %x, %y ; <i1> [#uses=1]
+ %tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ ret i32 %tmp345
+}
+
+define i32 @test3(double %x, double %y) nounwind {
+entry:
+ %tmp3 = fcmp ugt double %x, %y ; <i1> [#uses=1]
+ %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ ret i32 %tmp34
+}
+//===----------------------------------------------------------------------===//
+; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
+
+; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
+; should not be generated except with -enable-finite-only-fp-math or the like).
+; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
+; recognize a more elaborate tree than a simple SETxx.
+
+define double @test_FNEG_sel(double %A, double %B, double %C) {
+ %D = sub double -0.000000e+00, %A ; <double> [#uses=1]
+ %Cond = fcmp ugt double %D, -0.000000e+00 ; <i1> [#uses=1]
+ %E = select i1 %Cond, double %B, double %C ; <double> [#uses=1]
+ ret double %E
+}
+
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt
new file mode 100644
index 0000000..1e4c6fb
--- /dev/null
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -0,0 +1,211 @@
+//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//
+
+Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
+registers, to generate better spill code.
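+
+A minimal sketch of the load half (assuming vector reloads are emitted as LVX
+with a zero base register and a frame-index operand; the hook has to match
+whatever form the spiller actually produces):
+
+unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                           int &FrameIndex) const {
+  // ... existing scalar cases (LWZ/LFS/LFD/LD) ...
+  if (MI->getOpcode() == PPC::LVX &&
+      MI->getOperand(1).isReg() && MI->getOperand(1).getReg() == 0 &&
+      MI->getOperand(2).isFI()) {
+    FrameIndex = MI->getOperand(2).getIndex();
+    return MI->getOperand(0).getReg();
+  }
+  return 0;
+}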
+
+//===----------------------------------------------------------------------===//
+
+The first should be a single lvx from the constant pool, the second should be
+a xor/stvx:
+
+void foo(void) {
+ int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
+ bar (x);
+}
+
+#include <string.h>
+void foo(void) {
+ int x[8] __attribute__((aligned(128)));
+ memset (x, 0, sizeof (x));
+ bar (x);
+}
+
+//===----------------------------------------------------------------------===//
+
+Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763
+
+When -ffast-math is on, we can use 0.0.
+
+//===----------------------------------------------------------------------===//
+
+ Consider this:
+ v4f32 Vector;
+ v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };
+
+Since we know that "Vector" is 16-byte aligned and we know the element offset
+of ".X", we should change the load into a lve*x instruction, instead of doing
+a load/store/lve*x sequence.
+
+//===----------------------------------------------------------------------===//
+
+For functions that use altivec AND have calls, we are VRSAVE'ing all call
+clobbered regs.
+
+//===----------------------------------------------------------------------===//
+
+Implement passing vectors by value into calls and receiving them as arguments.
+
+//===----------------------------------------------------------------------===//
+
+GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
+of C1/C2/C3, then a load and vperm of Variable.
+
+//===----------------------------------------------------------------------===//
+
+We need a way to teach tblgen that some operands of an intrinsic are required to
+be constants. The verifier should enforce this constraint.
+
+//===----------------------------------------------------------------------===//
+
+We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
+aligned stack slot, followed by a load/vperm. We should probably just store it
+to a scalar stack slot, then use lvsl/vperm to load it. If the value is already
+in memory this is a big win.
+
+//===----------------------------------------------------------------------===//
+
+extract_vector_elt of an arbitrary constant vector can be done with the
+following instructions:
+
+vTemp = vec_splat(v0,2); // 2 is the element the src is in.
+vec_ste(&destloc,0,vTemp);
+
+We can do an arbitrary non-constant value by using lvsr/perm/ste.
+
+//===----------------------------------------------------------------------===//
+
+If we want to tie instruction selection into the scheduler, we can do some
+constant formation with different instructions. For example, we can generate
+"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
+"vsplti 0" or "vxor", each of which use different execution units, thus could
+help scheduling.
+
+This is probably only reasonable for a post-pass scheduler.
+
+//===----------------------------------------------------------------------===//
+
+For this function:
+
+void test(vector float *A, vector float *B) {
+ vector float C = (vector float)vec_cmpeq(*A, *B);
+ if (!vec_any_eq(*A, *B))
+ *B = (vector float){0,0,0,0};
+ *A = C;
+}
+
+we get the following basic block:
+
+ ...
+ lvx v2, 0, r4
+ lvx v3, 0, r3
+ vcmpeqfp v4, v3, v2
+ vcmpeqfp. v2, v3, v2
+ bne cr6, LBB1_2 ; cond_next
+
+The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
+vcmpeqfp. result is used by a branch. This can be improved.
+
+//===----------------------------------------------------------------------===//
+
+The code generated for this is truly awful:
+
+vector float test(float a, float b) {
+ return (vector float){ 0.0, a, 0.0, 0.0};
+}
+
+LCPI1_0: ; float
+ .space 4
+ .text
+ .globl _test
+ .align 4
+_test:
+ mfspr r2, 256
+ oris r3, r2, 4096
+ mtspr 256, r3
+ lis r3, ha16(LCPI1_0)
+ addi r4, r1, -32
+ stfs f1, -16(r1)
+ addi r5, r1, -16
+ lfs f0, lo16(LCPI1_0)(r3)
+ stfs f0, -32(r1)
+ lvx v2, 0, r4
+ lvx v3, 0, r5
+ vmrghw v3, v3, v2
+ vspltw v2, v2, 0
+ vmrghw v2, v2, v3
+ mtspr 256, r2
+ blr
+
+//===----------------------------------------------------------------------===//
+
+int foo(vector float *x, vector float *y) {
+ if (vec_all_eq(*x,*y)) return 3245;
+ else return 12;
+}
+
+A predicate compare being used in a select_cc should have the same peephole
+applied to it as a predicate compare used by a br_cc. There should be no
+mfcr here:
+
+_foo:
+ mfspr r2, 256
+ oris r5, r2, 12288
+ mtspr 256, r5
+ li r5, 12
+ li r6, 3245
+ lvx v2, 0, r4
+ lvx v3, 0, r3
+ vcmpeqfp. v2, v3, v2
+ mfcr r3, 2
+ rlwinm r3, r3, 25, 31, 31
+ cmpwi cr0, r3, 0
+ bne cr0, LBB1_2 ; entry
+LBB1_1: ; entry
+ mr r6, r5
+LBB1_2: ; entry
+ mr r3, r6
+ mtspr 256, r2
+ blr
+
+//===----------------------------------------------------------------------===//
+
+CodeGen/PowerPC/vec_constants.ll has an and operation that should be
+codegen'd to andc. The issue is that the 'all ones' build vector is
+SelectNodeTo'd to a VSPLTISB instruction node before the and/xor is selected,
+which prevents the vnot pattern from matching.
+
+
+//===----------------------------------------------------------------------===//
+
+An alternative to the store/store/load approach for illegal insert element
+lowering would be:
+
+1. store element to any ol' slot
+2. lvx the slot
+3. lvsl 0; splat index; vcmpeq to generate a select mask
+4. lvsl slot + x; vperm to rotate result into correct slot
+5. vsel result together.
+
+//===----------------------------------------------------------------------===//
+
+Should codegen branches on vec_any/vec_all to avoid mfcr. Two examples:
+
+#include <altivec.h>
+ int f(vector float a, vector float b)
+ {
+ int aa = 0;
+ if (vec_all_ge(a, b))
+ aa |= 0x1;
+ if (vec_any_ge(a,b))
+ aa |= 0x2;
+ return aa;
+}
+
+vector float f(vector float a, vector float b) {
+ if (vec_any_eq(a, b))
+ return a;
+ else
+ return b;
+}
+
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
new file mode 100644
index 0000000..f68cf0e
--- /dev/null
+++ b/lib/Target/README.txt
@@ -0,0 +1,1679 @@
+Target Independent Opportunities:
+
+//===---------------------------------------------------------------------===//
+
+With the recent changes to make the implicit def/use set explicit in
+machineinstrs, we should change the target descriptions for 'call' instructions
+so that the .td files don't list all the call-clobbered registers as implicit
+defs. Instead, these should be added by the code generator (e.g. on the dag).
+
+This has a number of uses:
+
+1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
+ for their different impdef sets.
+2. Targets with multiple calling convs (e.g. x86) which have different clobber
+ sets don't need copies of call instructions.
+3. 'Interprocedural register allocation' can be done to reduce the clobber sets
+ of calls.
+
+//===---------------------------------------------------------------------===//
+
+Make the PPC branch selector target independent.
+
+//===---------------------------------------------------------------------===//
+
+Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
+precision don't matter (-ffast-math). Misc/mandel will like this. :) This isn't
+safe in general, even on darwin: x*x can overflow to infinity even when
+hypot(x,y) is representable, and the libm implementation of hypot also special
+cases when x/y are exactly zero to get signed zeros etc right.
+
+//===---------------------------------------------------------------------===//
+
+Solve this DAG isel folding deficiency:
+
+int X, Y;
+
+void fn1(void)
+{
+ X = X | (Y << 3);
+}
+
+compiles to
+
+fn1:
+ movl Y, %eax
+ shll $3, %eax
+ orl X, %eax
+ movl %eax, X
+ ret
+
+The problem is the store's chain operand is not the load X but rather
+a TokenFactor of the load X and load Y, which prevents the folding.
+
+There are two ways to fix this:
+
+1. The dag combiner can start using alias analysis to realize that y/x
+ don't alias, making the store to X not dependent on the load from Y.
+2. The generated isel could be made smarter in the case it can't
+ disambiguate the pointers.
+
+Number 1 is the preferred solution.
+
+This has been "fixed" by a TableGen hack. But that is a short term workaround
+which will be removed once the proper fix is made.
+
+//===---------------------------------------------------------------------===//
+
+On targets with expensive 64-bit multiply, we could LSR this:
+
+for (i = ...; ++i) {
+  x = 1ULL << i;
+}
+
+into:
+
+  long long tmp = 1;
+  for (i = ...; ++i, tmp += tmp)
+    x = tmp;
+
+This would be a win on ppc32, but not x86 or ppc64.
+
+//===---------------------------------------------------------------------===//
+
+Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
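+
+A C-level sketch of the shrink (hypothetical names; the byte offset assumes a
+little-endian target, where the sign byte of an i32 lives at P+3):
+
+  int sign_test(int *P)        { return *P < 0; }                   /* loadi32 */
+  int sign_test_shrunk(int *P) { return ((signed char*)P)[3] < 0; } /* loadi8 */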
+
+//===---------------------------------------------------------------------===//
+
+Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
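+
+For instance (a sketch; names are hypothetical):
+
+  int pow4(int x)         { return x*x*x*x; }           /* 3 multiplies */
+  int pow4_reassoc(int x) { int t = x*x; return t*t; }  /* 2 multiplies */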
+
+//===---------------------------------------------------------------------===//
+
+An interesting testcase for add/shift/mul reassociation:
+
+int bar(int x, int y) {
+ return x*x*x+y+x*x*x*x*x*y*y*y*y;
+}
+int foo(int z, int n) {
+ return bar(z, n) + bar(2*z, 2*n);
+}
+
+Reassociate should handle the example in GCC PR16157.
+
+//===---------------------------------------------------------------------===//
+
+These two functions should generate the same code on big-endian systems:
+
+int g(int *j,int *l) { return memcmp(j,l,4); }
+int h(int *j, int *l) { return *j - *l; }
+
+This could be done in SelectionDAGISel.cpp, along with other special cases,
+for 1, 2, 4, and 8 bytes.
+
+//===---------------------------------------------------------------------===//
+
+It would be nice to revert this patch:
+http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
+
+And teach the dag combiner enough to simplify the code expanded before
+legalize. It seems plausible that this knowledge would let it simplify other
+stuff too.
+
+//===---------------------------------------------------------------------===//
+
+For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+to the type size. It works but can be overly conservative, as the alignment of
+specific vector types is target dependent.
+
+//===---------------------------------------------------------------------===//
+
+We should produce an unaligned load from code like this:
+
+v4sf example(float *P) {
+ return (v4sf){P[0], P[1], P[2], P[3] };
+}
+
+//===---------------------------------------------------------------------===//
+
+Add support for conditional increments, and other related patterns. Instead
+of:
+
+ movl 136(%esp), %eax
+ cmpl $0, %eax
+ je LBB16_2 #cond_next
+LBB16_1: #cond_true
+ incl _foo
+LBB16_2: #cond_next
+
+emit:
+ movl _foo, %eax
+ cmpl $1, %edi
+ sbbl $-1, %eax
+ movl %eax, _foo
+
+//===---------------------------------------------------------------------===//
+
+Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
+
+Expand these to calls of sin/cos and stores:
+ void sincos(double x, double *sin, double *cos);
+ void sincosf(float x, float *sin, float *cos);
+ void sincosl(long double x, long double *sin, long double *cos);
+
+Doing so could allow SROA of the destination pointers. See also:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
+
+This is now easily doable with MRVs. We could even make an intrinsic for this
+if anyone cared enough about sincos.
+
+//===---------------------------------------------------------------------===//
+
+Turn this into a single byte store with no load (the other 3 bytes are
+unmodified):
+
+define void @test(i32* %P) {
+ %tmp = load i32* %P
+ %tmp14 = or i32 %tmp, 3305111552
+ %tmp15 = and i32 %tmp14, 3321888767
+ store i32 %tmp15, i32* %P
+ ret void
+}
+
+//===---------------------------------------------------------------------===//
+
+dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
+
+Compile:
+
+int bar(int x)
+{
+ int t = __builtin_clz(x);
+ return -(t>>5);
+}
+
+to:
+
+_bar: addic r3,r3,-1
+ subfe r3,r3,r3
+ blr
+
+//===---------------------------------------------------------------------===//
+
+Legalize should lower cttz like this:
+ cttz(x) = popcnt((x-1) & ~x)
+
+on targets that have popcnt but not cttz (itanium, what else?). The analogous
+ctlz lowering needs the bits below the leading one smeared in first.
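+
+A minimal C sketch of the identity, assuming 32-bit unsigned and the GCC
+popcount builtin:
+
+  unsigned cttz32(unsigned x) {
+    /* (x-1) & ~x sets exactly the bits below the lowest set bit of x,
+       so its popcount is the number of trailing zeros (32 for x == 0). */
+    return __builtin_popcount((x - 1) & ~x);
+  }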
+
+//===---------------------------------------------------------------------===//
+
+quantum_sigma_x in 462.libquantum contains the following loop:
+
+ for(i=0; i<reg->size; i++)
+ {
+ /* Flip the target bit of each basis state */
+ reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
+ }
+
+where MAX_UNSIGNED (the type of state) is a 64-bit int. On a 32-bit platform
+it would be just
+so cool to turn it into something like:
+
+ long long Res = ((MAX_UNSIGNED) 1 << target);
+ if (target < 32) {
+ for(i=0; i<reg->size; i++)
+ reg->node[i].state ^= Res & 0xFFFFFFFFULL;
+ } else {
+ for(i=0; i<reg->size; i++)
+ reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL;
+ }
+
+... which would only do one 32-bit XOR per loop iteration instead of two.
+
+It would also be nice to recognize that reg->size doesn't alias reg->node[i],
+but alas...
+
+//===---------------------------------------------------------------------===//
+
+This isn't recognized as bswap by instcombine (yes, it really is bswap):
+
+unsigned long reverse(unsigned v) {
+ unsigned t;
+ t = v ^ ((v << 16) | (v >> 16));
+ t &= ~0xff0000;
+ v = (v << 24) | (v >> 8);
+ return v ^ (t >> 8);
+}
+
+//===---------------------------------------------------------------------===//
+
+These idioms should be recognized as popcount (see PR1488):
+
+unsigned countbits_slow(unsigned v) {
+ unsigned c;
+ for (c = 0; v; v >>= 1)
+ c += v & 1;
+ return c;
+}
+unsigned countbits_fast(unsigned v){
+ unsigned c;
+ for (c = 0; v; c++)
+ v &= v - 1; // clear the least significant bit set
+ return c;
+}
+
+BITBOARD = unsigned long long
+int PopCnt(register BITBOARD a) {
+ register int c=0;
+ while(a) {
+ c++;
+ a &= a - 1;
+ }
+ return c;
+}
+unsigned int popcount(unsigned int input) {
+ unsigned int count = 0;
+ for (unsigned int i = 0; i < 4 * 8; i++)
+ count += (input >> i) & 1;
+ return count;
+}
+
+//===---------------------------------------------------------------------===//
+
+These should turn into single 16-bit (unaligned?) loads on little/big endian
+processors.
+
+unsigned short read_16_le(const unsigned char *adr) {
+ return adr[0] | (adr[1] << 8);
+}
+unsigned short read_16_be(const unsigned char *adr) {
+ return (adr[0] << 8) | adr[1];
+}
+
+//===---------------------------------------------------------------------===//
+
+-instcombine should handle this transform:
+ icmp pred (sdiv X, C1), C2
+when X, C1, and C2 are unsigned. Similarly for udiv and signed operands.
+
+Currently InstCombine avoids this transform but will do it when the signs of
+the operands and the sign of the divide match. See the FIXME in
+InstructionCombining.cpp in the visitSetCondInst method after the switch case
+for Instruction::UDiv (around line 4447) for more details.
+
+The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
+this construct.
+
+//===---------------------------------------------------------------------===//
+
+viterbi speeds up *significantly* if the various "history" related copy loops
+are turned into memcpy calls at the source level. We need a "loops to memcpy"
+pass.
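+
+The shape of such a transform (a sketch; the pass would have to prove that
+dst and src don't overlap and that the loop covers n contiguous elements):
+
+  for (i = 0; i < n; i++)          /* element-by-element copy loop */
+    dst[i] = src[i];
+  /* => */
+  memcpy(dst, src, n * sizeof *dst);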
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+typedef unsigned U32;
+typedef unsigned long long U64;
+int test (U32 *inst, U64 *regs) {
+ U64 effective_addr2;
+ U32 temp = *inst;
+ int r1 = (temp >> 20) & 0xf;
+ int b2 = (temp >> 16) & 0xf;
+ effective_addr2 = temp & 0xfff;
+ if (b2) effective_addr2 += regs[b2];
+ b2 = (temp >> 12) & 0xf;
+ if (b2) effective_addr2 += regs[b2];
+ effective_addr2 &= regs[4];
+ if ((effective_addr2 & 3) == 0)
+ return 1;
+ return 0;
+}
+
+Note that only the low 2 bits of effective_addr2 are used. On 32-bit systems,
+we don't eliminate the computation of the top half of effective_addr2 because
+we don't have whole-function selection dags. On x86, this means we use one
+extra register for the function when effective_addr2 is declared as U64 than
+when it is declared U32.
+
+//===---------------------------------------------------------------------===//
+
+Promote for i32 bswap can use i64 bswap + shr. Useful on targets with 64-bit
+regs and bswap, like itanium.
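+
+A sketch of the equivalence using GCC builtins (zero-extend, swap all eight
+bytes, then shift the interesting four back down):
+
+  unsigned bswap32_via_64(unsigned x) {
+    return (unsigned)(__builtin_bswap64((unsigned long long)x) >> 32);
+  }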
+
+//===---------------------------------------------------------------------===//
+
+LSR should know what GPR types a target has. This code:
+
+volatile short X, Y; // globals
+
+void foo(int N) {
+ int i;
+ for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+produces two identical IVs (after promotion) on PPC/ARM:
+
+LBB1_1: @bb.preheader
+ mov r3, #0
+ mov r2, r3
+ mov r1, r3
+LBB1_2: @bb
+ ldr r12, LCPI1_0
+ ldr r12, [r12]
+ strh r2, [r12]
+ ldr r12, LCPI1_1
+ ldr r12, [r12]
+ strh r3, [r12]
+ add r1, r1, #1 <- [0,+,1]
+ add r3, r3, #4
+ add r2, r2, #1 <- [0,+,1]
+ cmp r1, r0
+ bne LBB1_2 @bb
+
+
+//===---------------------------------------------------------------------===//
+
+Tail call elim should be more aggressive, checking to see if the call is
+followed by an uncond branch to an exit block.
+
+; This testcase is due to tail-duplication not wanting to copy the return
+; instruction into the terminating blocks because there was other code
+; optimized out of the function after the taildup happened.
+; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
+
+define i32 @t4(i32 %a) {
+entry:
+ %tmp.1 = and i32 %a, 1 ; <i32> [#uses=1]
+ %tmp.2 = icmp ne i32 %tmp.1, 0 ; <i1> [#uses=1]
+ br i1 %tmp.2, label %then.0, label %else.0
+
+then.0: ; preds = %entry
+ %tmp.5 = add i32 %a, -1 ; <i32> [#uses=1]
+ %tmp.3 = call i32 @t4( i32 %tmp.5 ) ; <i32> [#uses=1]
+ br label %return
+
+else.0: ; preds = %entry
+ %tmp.7 = icmp ne i32 %a, 0 ; <i1> [#uses=1]
+ br i1 %tmp.7, label %then.1, label %return
+
+then.1: ; preds = %else.0
+ %tmp.11 = add i32 %a, -2 ; <i32> [#uses=1]
+ %tmp.9 = call i32 @t4( i32 %tmp.11 ) ; <i32> [#uses=1]
+ br label %return
+
+return: ; preds = %then.1, %else.0, %then.0
+ %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
+ [ %tmp.9, %then.1 ]
+ ret i32 %result.0
+}
+
+//===---------------------------------------------------------------------===//
+
+Tail recursion elimination is not transforming this function, because it is
+returning n, which fails the isDynamicConstant check in the accumulator
+recursion checks.
+
+long long fib(const long long n) {
+ switch(n) {
+ case 0:
+ case 1:
+ return n;
+ default:
+ return fib(n-1) + fib(n-2);
+ }
+}
+
+//===---------------------------------------------------------------------===//
+
+Tail recursion elimination should handle:
+
+int pow2m1(int n) {
+ if (n == 0)
+ return 0;
+ return 2 * pow2m1 (n - 1) + 1;
+}
+
+Also, multiplies can be turned into SHL's, so they should be handled as if
+they were associative. "return foo() << 1" can be tail recursion eliminated.
+
+//===---------------------------------------------------------------------===//
+
+Argument promotion should promote arguments for recursive functions, like
+this:
+
+; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
+
+define internal i32 @foo(i32* %x) {
+entry:
+ %tmp = load i32* %x ; <i32> [#uses=0]
+ %tmp.foo = call i32 @foo( i32* %x ) ; <i32> [#uses=1]
+ ret i32 %tmp.foo
+}
+
+define i32 @bar(i32* %x) {
+entry:
+ %tmp3 = call i32 @foo( i32* %x ) ; <i32> [#uses=1]
+ ret i32 %tmp3
+}
+
+//===---------------------------------------------------------------------===//
+
+"basicaa" should know how to look through "or" instructions that act like add
+instructions. For example in this code, the x*4+1 is turned into x*4 | 1, and
+basicaa can't analyze the array subscript, leading to duplicated loads in the
+generated code:
+
+void test(int X, int Y, int a[]) {
+  int i;
+  for (i = 2; i < 1000; i += 4) {
+    a[i+0] = a[i-1+0]*a[i-2+0];
+    a[i+1] = a[i-1+1]*a[i-2+1];
+    a[i+2] = a[i-1+2]*a[i-2+2];
+    a[i+3] = a[i-1+3]*a[i-2+3];
+  }
+}
+
+BasicAA also doesn't do this for add. It needs to know that &A[i+1] != &A[i].
+
+//===---------------------------------------------------------------------===//
+
+We should investigate an instruction sinking pass. Consider this silly
+example in pic mode:
+
+#include <assert.h>
+void foo(int x) {
+ assert(x);
+ //...
+}
+
+we compile this to:
+_foo:
+ subl $28, %esp
+ call "L1$pb"
+"L1$pb":
+ popl %eax
+ cmpl $0, 32(%esp)
+ je LBB1_2 # cond_true
+LBB1_1: # return
+ # ...
+ addl $28, %esp
+ ret
+LBB1_2: # cond_true
+...
+
+The PIC base computation (call+popl) is only used on one path through the
+code, but is currently always computed in the entry block. It would be
+better to sink the picbase computation down into the block for the
+assertion, as it is the only one that uses it. This happens for a lot of
+code with early outs.
+
+Another example is loads of arguments, which are usually emitted into the
+entry block on targets like x86. If not used in all paths through a
+function, they should be sunk into the ones that do.
+
+In this case, whole-function-isel would also handle this.
+
+//===---------------------------------------------------------------------===//
+
+Investigate lowering of sparse switch statements into perfect hash tables:
+http://burtleburtle.net/bob/hash/perfect.html
+
+//===---------------------------------------------------------------------===//
+
+We should turn things like "load+fabs+store" and "load+fneg+store" into the
+corresponding integer operations. On a yonah, this loop:
+
+double a[256];
+void foo() {
+ int i, b;
+ for (b = 0; b < 10000000; b++)
+ for (i = 0; i < 256; i++)
+ a[i] = -a[i];
+}
+
+is twice as slow as this loop:
+
+long long a[256];
+void foo() {
+ int i, b;
+ for (b = 0; b < 10000000; b++)
+ for (i = 0; i < 256; i++)
+ a[i] ^= (1ULL << 63);
+}
+
+and I suspect other processors are similar. On X86 in particular this is a
+big win because doing this with integers allows the use of read/modify/write
+instructions.
+
+//===---------------------------------------------------------------------===//
+
+DAG Combiner should try to combine small loads into larger loads when
+profitable. For example, we compile this C++ example:
+
+struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
+extern THotKey m_HotKey;
+THotKey GetHotKey () { return m_HotKey; }
+
+into (-O3 -fno-exceptions -static -fomit-frame-pointer):
+
+__Z9GetHotKeyv:
+ pushl %esi
+ movl 8(%esp), %eax
+ movb _m_HotKey+3, %cl
+ movb _m_HotKey+4, %dl
+ movb _m_HotKey+2, %ch
+ movw _m_HotKey, %si
+ movw %si, (%eax)
+ movb %ch, 2(%eax)
+ movb %cl, 3(%eax)
+ movb %dl, 4(%eax)
+ popl %esi
+ ret $4
+
+GCC produces:
+
+__Z9GetHotKeyv:
+ movl _m_HotKey, %edx
+ movl 4(%esp), %eax
+ movl %edx, (%eax)
+ movzwl _m_HotKey+4, %edx
+ movw %dx, 4(%eax)
+ ret $4
+
+The LLVM IR contains the needed alignment info, so we should be able to
+merge the loads and stores into 4-byte loads:
+
+ %struct.THotKey = type { i16, i8, i8, i8 }
+define void @_Z9GetHotKeyv(%struct.THotKey* sret %agg.result) nounwind {
+...
+ %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
+ %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
+ %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
+ %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
+
+Alternatively, we should use a small amount of base-offset alias analysis
+to make it so the scheduler doesn't need to hold all the loads in regs at
+once.
+
+//===---------------------------------------------------------------------===//
+
+We should add an FRINT node to the DAG to model targets that have legal
+implementations of ceil/floor/rint.
+
+//===---------------------------------------------------------------------===//
+
+This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
+contains a testcase that compiles down to:
+
+ %struct.XMM128 = type { <4 x float> }
+..
+ %src = alloca %struct.XMM128
+..
+ %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
+ %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
+ store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
+ %tmp66 = load <4 x float>* %tmp65, align 16
+ %tmp71 = add <4 x float> %tmp66, %tmp66
+
+If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
+into a bitcast of the vector value and a store to the pointer, then the
+store->load could be easily removed.
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+int test() {
+ long long input[8] = {1,1,1,1,1,1,1,1};
+ foo(input);
+}
+
+We currently compile this into a memcpy from a global array since the
+initializer is fairly large and not memset'able. This is good, but the memcpy
+gets lowered to load/stores in the code generator. This is also ok, except
+that the codegen lowering for memcpy doesn't handle the case when the source
+is a constant global. This gives us atrocious code like this:
+
+ call "L1$pb"
+"L1$pb":
+ popl %eax
+ movl _C.0.1444-"L1$pb"+32(%eax), %ecx
+ movl %ecx, 40(%esp)
+ movl _C.0.1444-"L1$pb"+20(%eax), %ecx
+ movl %ecx, 28(%esp)
+ movl _C.0.1444-"L1$pb"+36(%eax), %ecx
+ movl %ecx, 44(%esp)
+ movl _C.0.1444-"L1$pb"+44(%eax), %ecx
+ movl %ecx, 52(%esp)
+ movl _C.0.1444-"L1$pb"+40(%eax), %ecx
+ movl %ecx, 48(%esp)
+ movl _C.0.1444-"L1$pb"+12(%eax), %ecx
+ movl %ecx, 20(%esp)
+ movl _C.0.1444-"L1$pb"+4(%eax), %ecx
+...
+
+instead of:
+ movl $1, 16(%esp)
+ movl $0, 20(%esp)
+ movl $1, 24(%esp)
+ movl $0, 28(%esp)
+ movl $1, 32(%esp)
+ movl $0, 36(%esp)
+ ...
+
+//===---------------------------------------------------------------------===//
+
+http://llvm.org/PR717:
+
+The following code should compile into "ret int undef". Instead, LLVM
+produces "ret int 0":
+
+int f() {
+ int x = 4;
+ int y;
+ if (x == 3) y = 0;
+ return y;
+}
+
+//===---------------------------------------------------------------------===//
+
+The loop unroller should partially unroll loops (instead of peeling them)
+when code growth isn't too bad and when an unroll count allows simplification
+of some code within the loop. One trivial example is:
+
+#include <stdio.h>
+int main() {
+ int nRet = 17;
+ int nLoop;
+ for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
+ if ( nLoop & 1 )
+ nRet += 2;
+ else
+ nRet -= 1;
+ }
+ return nRet;
+}
+
+Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
+reduction in code size. The resultant code would then also be suitable for
+exit value computation.
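+
+Concretely, unrolling by 2 pairs one even and one odd iteration, so both '&1'
+tests fold to constants (a sketch; 1000 is even, so no remainder iteration is
+needed):
+
+  for ( nLoop = 0; nLoop < 1000; nLoop += 2 ) {
+    nRet -= 1;   /* nLoop is even: (nLoop & 1) == 0 */
+    nRet += 2;   /* nLoop + 1 is odd */
+  }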
+
+//===---------------------------------------------------------------------===//
+
+We miss a bunch of rotate opportunities on various targets, including ppc, x86,
+etc. On X86, we miss a bunch of 'rotate by variable' cases because the rotate
+matching code in dag combine doesn't look through truncates aggressively
+enough. Here are some testcases reduced from GCC PR17886:
+
+unsigned long long f(unsigned long long x, int y) {
+ return (x << y) | (x >> 64-y);
+}
+unsigned f2(unsigned x, int y){
+ return (x << y) | (x >> 32-y);
+}
+unsigned long long f3(unsigned long long x){
+ int y = 9;
+ return (x << y) | (x >> 64-y);
+}
+unsigned f4(unsigned x){
+ int y = 10;
+ return (x << y) | (x >> 32-y);
+}
+unsigned long long f5(unsigned long long x, unsigned long long y) {
+ return (x << 8) | ((y >> 48) & 0xffull);
+}
+unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
+ switch(z) {
+ case 1:
+ return (x << 8) | ((y >> 48) & 0xffull);
+ case 2:
+ return (x << 16) | ((y >> 40) & 0xffffull);
+ case 3:
+ return (x << 24) | ((y >> 32) & 0xffffffull);
+ case 4:
+ return (x << 32) | ((y >> 24) & 0xffffffffull);
+ default:
+ return (x << 40) | ((y >> 16) & 0xffffffffffull);
+ }
+}
+
+On X86-64, we only handle f2/f3/f4 right. On x86-32, a few of these
+generate truly horrible code, instead of using shld and friends. On
+ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
+badness. PPC64 misses f, f5 and f6. CellSPU aborts in isel.
+
+//===---------------------------------------------------------------------===//
+
+We do a number of simplifications in simplify libcalls to strength reduce
+standard library functions, but we don't currently merge them together. For
+example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy. This can only
+be done safely if "b" isn't modified between the strlen and the memcpy, of
+course.
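+
+The shape of the merge (a sketch; for strcpy to be an exact replacement the
+copied length must include the terminating nul, i.e. strlen(b)+1):
+
+  memcpy(a, b, strlen(b) + 1);   /* walks b twice */
+  /* => */
+  strcpy(a, b);                  /* same bytes, one traversal, no strlen */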
+
+//===---------------------------------------------------------------------===//
+
+Reassociate should turn things like:
+
+int factorial(int X) {
+ return X*X*X*X*X*X*X*X;
+}
+
+into llvm.powi calls, allowing the code generator to produce balanced
+multiplication trees.
+
+//===---------------------------------------------------------------------===//
+
+We generate a horrible libcall for llvm.powi. For example, we compile:
+
+#include <cmath>
+double f(double a) { return std::pow(a, 4); }
+
+into:
+
+__Z1fd:
+ subl $12, %esp
+ movsd 16(%esp), %xmm0
+ movsd %xmm0, (%esp)
+ movl $4, 8(%esp)
+ call L___powidf2$stub
+ addl $12, %esp
+ ret
+
+GCC produces:
+
+__Z1fd:
+ subl $12, %esp
+ movsd 16(%esp), %xmm0
+ mulsd %xmm0, %xmm0
+ mulsd %xmm0, %xmm0
+ movsd %xmm0, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We compile this program: (from GCC PR11680)
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
+
+Into code that runs the same speed in fast/slow modes, but both modes run 2x
+slower than when compiled with GCC (either 4.0 or 4.2):
+
+$ llvm-g++ perf.cpp -O3 -fno-exceptions
+$ time ./a.out fast
+1.821u 0.003s 0:01.82 100.0% 0+0k 0+0io 0pf+0w
+
+$ g++ perf.cpp -O3 -fno-exceptions
+$ time ./a.out fast
+0.821u 0.001s 0:00.82 100.0% 0+0k 0+0io 0pf+0w
+
+It looks like we are making the same inlining decisions, so this may be raw
+codegen badness or something else (haven't investigated).
+
+//===---------------------------------------------------------------------===//
+
+We miss some instcombines for stuff like this:
+void bar (void);
+void foo (unsigned int a) {
+ /* This one is equivalent to a >= (3 << 2). */
+ if ((a >> 2) >= 3)
+ bar ();
+}
+
+A few other related ones are in GCC PR14753.
+
+//===---------------------------------------------------------------------===//
+
+Divisibility by constant can be simplified (according to GCC PR12849) from
+being a mulhi to being a mul lo (cheaper). Testcase:
+
+void bar(unsigned n) {
+ if (n % 3 == 0)
+ true();
+}
+
+I think this basically amounts to a dag combine to simplify comparisons against
+multiply hi's into a comparison against the mullo.
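+
+A sketch of the strength-reduced form for the testcase above; the constants
+are the inverse of 3 mod 2^32 and floor((2^32-1)/3), and the trick works for
+any odd divisor:
+
+  int divisible_by_3(unsigned n) {
+    /* 3 * 0xAAAAAAAB == 1 (mod 2^32), so n % 3 == 0 iff
+       n * 0xAAAAAAAB lands in [0, 0x55555555]. */
+    return n * 0xAAAAAAABu <= 0x55555555u;
+  }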
+
+//===---------------------------------------------------------------------===//
+
+Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
+bunch of other stuff from this example (see PR1604):
+
+#include <cstdio>
+struct test {
+ int val;
+ virtual ~test() {}
+};
+
+int main() {
+ test t;
+ std::scanf("%d", &t.val);
+ std::printf("%d\n", t.val);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instcombine will merge comparisons like (x >= 10) && (x < 20) by producing (x -
+10) u< 10, but only when the comparisons have matching sign.
+
+This could be converted with a similar technique. (PR1941)
+
+define i1 @test(i8 %x) {
+ %A = icmp uge i8 %x, 5
+ %B = icmp slt i8 %x, 20
+ %C = and i1 %A, %B
+ ret i1 %C
+}
+
+//===---------------------------------------------------------------------===//
+
+These functions perform the same computation, but produce different assembly.
+
+define i8 @select(i8 %x) readnone nounwind {
+ %A = icmp ult i8 %x, 250
+ %B = select i1 %A, i8 0, i8 1
+ ret i8 %B
+}
+
+define i8 @addshr(i8 %x) readnone nounwind {
+ %A = zext i8 %x to i9
+ %B = add i9 %A, 6 ;; 256 - 250 == 6
+ %C = lshr i9 %B, 8
+ %D = trunc i9 %C to i8
+ ret i8 %D
+}
+
+//===---------------------------------------------------------------------===//
+
+From gcc bug 24696:
+int
+f (unsigned long a, unsigned long b, unsigned long c)
+{
+ return ((a & (c - 1)) != 0) || ((b & (c - 1)) != 0);
+}
+int
+f (unsigned long a, unsigned long b, unsigned long c)
+{
+ return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
+}
+Both should combine to ((a|b) & (c-1)) != 0. Currently not optimized with
+"clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 20192:
+#define PMD_MASK (~((1UL << 23) - 1))
+void clear_pmd_range(unsigned long start, unsigned long end)
+{
+ if (!(start & ~PMD_MASK) && !(end & ~PMD_MASK))
+ f();
+}
+The expression should optimize to something like
+"!((start|end)&~PMD_MASK). Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 15241:
+unsigned int
+foo (unsigned int a, unsigned int b)
+{
+ if (a <= 7 && b <= 7)
+ baz ();
+}
+Should combine to "(a|b) <= 7". Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 3756:
+int
+pn (int n)
+{
+ return (n >= 0 ? 1 : -1);
+}
+Should combine to (n >> 31) | 1. Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts | llc".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 28685:
+int test(int a, int b)
+{
+ int lt = a < b;
+ int eq = a == b;
+
+ return (lt || eq);
+}
+Should combine to "a <= b". Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts | llc".
+
+//===---------------------------------------------------------------------===//
+
+void a(int variable)
+{
+ if (variable == 4 || variable == 6)
+ bar();
+}
+This should optimize to "if ((variable | 2) == 6)". Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
+
+//===---------------------------------------------------------------------===//
+
+unsigned int f(unsigned int i, unsigned int n) { ++i; if (i == n) ++i; return i; }
+unsigned int f2(unsigned int i, unsigned int n) { ++i; i += i == n; return i; }
+These should combine to the same thing. Currently, the first function
+produces better code on X86.
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 15784:
+#define abs(x) x>0?x:-x
+int f(int x, int y)
+{
+ return (abs(x)) >= 0;
+}
+This should optimize to x == INT_MIN. (With -fwrapv.) Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 14753:
+void
+rotate_cst (unsigned int a)
+{
+ a = (a << 10) | (a >> 22);
+ if (a == 123)
+ bar ();
+}
+void
+minus_cst (unsigned int a)
+{
+ unsigned int tem;
+
+ tem = 20 - a;
+ if (tem == 5)
+ bar ();
+}
+void
+mask_gt (unsigned int a)
+{
+ /* This is equivalent to a > 15. */
+ if ((a & ~7) > 8)
+ bar ();
+}
+void
+rshift_gt (unsigned int a)
+{
+ /* This is equivalent to a > 23. */
+ if ((a >> 2) > 5)
+ bar ();
+}
+All should simplify to a single comparison. All of these are
+currently not optimized with "clang -emit-llvm-bc | opt
+-std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 32605:
+int c(int* x) {return (char*)x+2 == (char*)x;}
+Should combine to 0. Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
+
+//===---------------------------------------------------------------------===//
+
+int a(unsigned char* b) {return *b > 99;}
+There's an unnecessary zext in the generated code with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
+Should be combined to "((b >> 1) | b) & 1". Currently not optimized
+with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
+Should combine to "x | (y & 3)". Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(unsigned a) {return ((a | 1) & 3) | (a & -4);}
+Should combine to "a | 1". Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
+Should fold to "(~a & c) | (a & b)". Currently not optimized with
+"clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a,int b) {return (~(a|b))|a;}
+Should fold to "a|~b". Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a, int b) {return (a&&b) || (a&&!b);}
+Should fold to "a". Currently not optimized with "clang -emit-llvm-bc
+| opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
+Should fold to "a ? b : c", or at least something sane. Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
+Should fold to a && (b || c). Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return x | ((x & 8) ^ 8);}
+Should combine to x | 8. Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return x ^ ((x & 8) ^ 8);}
+Should also combine to x | 8. Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return (x & 8) == 0 ? -1 : -9;}
+Should combine to (x | -9) ^ 8. Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return (x & 8) == 0 ? -9 : -1;}
+Should combine to x | -9. Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return ((x | -9) ^ 8) & x;}
+Should combine to x & -9. Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
+Should combine to "a * 0x88888888 >> 31". Currently not optimized
+with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(char* x) {if ((*x & 32) == 0) return b();}
+There's an unnecessary zext in the generated code with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(unsigned long long x) {return 40 * (x >> 1);}
+Should combine to "20 * (((unsigned)x) & -2)". Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+We would like to do the following transform in the instcombiner:
+
+ -X/C -> X/-C
+
+However, this isn't valid if (-X) overflows. We can implement this when we
+have the concept of a "C signed subtraction" operator that is undefined on
+overflow.
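+
+The overflow case, concretely (assuming 32-bit int and a wrapping negation,
+as with LLVM's plain sub):
+
+  int f1(int X) { return -X / 3; }   /* f1(INT_MIN) == -715827882 with wrap */
+  int f2(int X) { return X / -3; }   /* f2(INT_MIN) ==  715827882 */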
+
+//===---------------------------------------------------------------------===//
+
+This was noticed in the entry block for grokdeclarator in 403.gcc:
+
+ %tmp = icmp eq i32 %decl_context, 4
+ %decl_context_addr.0 = select i1 %tmp, i32 3, i32 %decl_context
+ %tmp1 = icmp eq i32 %decl_context_addr.0, 1
+ %decl_context_addr.1 = select i1 %tmp1, i32 0, i32 %decl_context_addr.0
+
+tmp1 should be simplified to something like:
+ (!tmp && decl_context == 1), which further reduces to (decl_context == 1)
+
+This allows recursive simplifications, tmp1 is used all over the place in
+the function, e.g. by:
+
+ %tmp23 = icmp eq i32 %decl_context_addr.1, 0 ; <i1> [#uses=1]
+ %tmp24 = xor i1 %tmp1, true ; <i1> [#uses=1]
+ %or.cond8 = and i1 %tmp23, %tmp24 ; <i1> [#uses=1]
+
+later.
+
+//===---------------------------------------------------------------------===//
+
+Store sinking: This code:
+
+void f (int n, int *cond, int *res) {
+ int i;
+ *res = 0;
+ for (i = 0; i < n; i++)
+ if (*cond)
+ *res ^= 234; /* (*) */
+}
+
+On this function GVN hoists the fully redundant value of *res, but nothing
+moves the store out. This gives us this code:
+
+bb: ; preds = %bb2, %entry
+ %.rle = phi i32 [ 0, %entry ], [ %.rle6, %bb2 ]
+ %i.05 = phi i32 [ 0, %entry ], [ %indvar.next, %bb2 ]
+ %1 = load i32* %cond, align 4
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %bb2, label %bb1
+
+bb1: ; preds = %bb
+ %3 = xor i32 %.rle, 234
+ store i32 %3, i32* %res, align 4
+ br label %bb2
+
+bb2: ; preds = %bb, %bb1
+ %.rle6 = phi i32 [ %3, %bb1 ], [ %.rle, %bb ]
+ %indvar.next = add i32 %i.05, 1
+ %exitcond = icmp eq i32 %indvar.next, %n
+ br i1 %exitcond, label %return, label %bb
+
+DSE should sink partially dead stores to get the store out of the loop.
+
+Here's another partial dead case:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12395
+
+//===---------------------------------------------------------------------===//
+
+Scalar PRE hoists the mul in the common block up to the else:
+
+int test (int a, int b, int c, int g) {
+ int d, e;
+ if (a)
+ d = b * c;
+ else
+ d = b - c;
+ e = b * c + g;
+ return d + e;
+}
+
+It would be better to do the mul once to reduce codesize above the if.
+This is GCC PR38204.
+
+//===---------------------------------------------------------------------===//
+
+GCC PR37810 is an interesting case where we should sink load/store reload
+into the if block and outside the loop, so we don't reload/store it on the
+non-call path.
+
+for () {
+  *P += 1;
+  if ()
+    call();
+  else
+    ...
+}
+
+->
+
+tmp = *P;
+for () {
+  tmp += 1;
+  if () {
+    *P = tmp;
+    call();
+    tmp = *P;
+  } else ...
+}
+*P = tmp;
+
+We now hoist the reload after the call (Transforms/GVN/lpre-call-wrap.ll), but
+we don't sink the store. We need partially dead store sinking.
+
+//===---------------------------------------------------------------------===//
+
+[PHI TRANSLATE GEPs]
+
+GCC PR37166: Sinking of loads prevents SROA'ing the "g" struct on the stack
+leading to excess stack traffic. This could be handled by GVN with some crazy
+symbolic phi translation. The code we get looks like (g is on the stack):
+
+bb2: ; preds = %bb1
+..
+ %9 = getelementptr %struct.f* %g, i32 0, i32 0
+ store i32 %8, i32* %9, align 4
+ br label %bb3
+
+bb3: ; preds = %bb1, %bb2, %bb
+ %c_addr.0 = phi %struct.f* [ %g, %bb2 ], [ %c, %bb ], [ %c, %bb1 ]
+ %b_addr.0 = phi %struct.f* [ %b, %bb2 ], [ %g, %bb ], [ %b, %bb1 ]
+ %10 = getelementptr %struct.f* %c_addr.0, i32 0, i32 0
+ %11 = load i32* %10, align 4
+
+%11 is fully redundant, and in BB2 it should have the value %8.
+
+GCC PR33344 is a similar case.
+
+//===---------------------------------------------------------------------===//
+
+There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
+GCC testsuite, and many more PRE testcases named ssa-pre-*.c.
+
+//===---------------------------------------------------------------------===//
+
+There are some interesting cases in testsuite/gcc.dg/tree-ssa/pred-comm* in the
+GCC testsuite. For example, predcom-1.c is:
+
+ for (i = 2; i < 1000; i++)
+ fib[i] = (fib[i-1] + fib[i - 2]) & 0xffff;
+
+which compiles into:
+
+bb1: ; preds = %bb1, %bb1.thread
+ %indvar = phi i32 [ 0, %bb1.thread ], [ %0, %bb1 ]
+ %i.0.reg2mem.0 = add i32 %indvar, 2
+ %0 = add i32 %indvar, 1 ; <i32> [#uses=3]
+ %1 = getelementptr [1000 x i32]* @fib, i32 0, i32 %0
+ %2 = load i32* %1, align 4 ; <i32> [#uses=1]
+ %3 = getelementptr [1000 x i32]* @fib, i32 0, i32 %indvar
+ %4 = load i32* %3, align 4 ; <i32> [#uses=1]
+ %5 = add i32 %4, %2 ; <i32> [#uses=1]
+ %6 = and i32 %5, 65535 ; <i32> [#uses=1]
+ %7 = getelementptr [1000 x i32]* @fib, i32 0, i32 %i.0.reg2mem.0
+ store i32 %6, i32* %7, align 4
+ %exitcond = icmp eq i32 %0, 998 ; <i1> [#uses=1]
+ br i1 %exitcond, label %return, label %bb1
+
+This is basically:
+ LOAD fib[i+1]
+ LOAD fib[i]
+ STORE fib[i+2]
+
+instead of handling this as a loop or other xform, all we'd need to do is teach
+load PRE to phi translate the %0 add (i+1) into the predecessor as (i'+1+1) =
+(i'+2) (where i' is the previous iteration of i). This would find the store
+which feeds it.
+
+predcom-2.c is apparently the same as predcom-1.c.
+predcom-3.c is very similar but needs loads feeding each other instead of
+store->load.
+predcom-4.c seems the same as the rest.
+
+
+//===---------------------------------------------------------------------===//
+
+Other simple load PRE cases:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35287 [LPRE crit edge splitting]
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34677 (licm does this, LPRE crit edge)
+ llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | opt -mem2reg -simplifycfg -gvn | llvm-dis
+
+//===---------------------------------------------------------------------===//
+
+Type based alias analysis:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
+
+//===---------------------------------------------------------------------===//
+
+When GVN/PRE finds a store through a float* that must-aliases a load through
+an int*, it should turn the forwarded value into a bitcast. This is a nice
+generalization of the SROA hack that would apply to other cases, e.g.:
+
+int foo(int C, int *P, float X) {
+ if (C) {
+ bar();
+ *P = 42;
+ } else
+ *(float*)P = X;
+
+ return *P;
+}
+
+
+One example (that requires crazy phi translation) is:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
+
+//===---------------------------------------------------------------------===//
+
+A/B get pinned to the stack because we turn an if/then into a select instead
+of PRE'ing the load/store. This may be fixable in instcombine:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
+
+
+
+Interesting missed case because of control flow flattening (should be 2 loads):
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
+With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as |
+ opt -mem2reg -gvn -instcombine | llvm-dis
+we miss it because we need 1) GEP PHI TRAN, 2) CRIT EDGE 3) MULTIPLE DIFFERENT
+VALS PRODUCED BY ONE BLOCK OVER DIFFERENT PATHS
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19633
+We could eliminate the branch condition here, loading from null is undefined:
+
+struct S { int w, x, y, z; };
+struct T { int r; struct S s; };
+void bar (struct S, int);
+void foo (int a, struct T b)
+{
+ struct S *c = 0;
+ if (a)
+ c = &b.s;
+ bar (*c, a);
+}
+
+//===---------------------------------------------------------------------===//
+
+simplifylibcalls should do several optimizations for strspn/strcspn:
+
+strcspn(x, "") -> strlen(x)
+strcspn("", x) -> 0
+strspn("", x) -> 0
+strspn(x, "") -> strlen(x)
+strspn(x, "a") -> strchr(x, 'a')-x
+
+strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
+
+size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
+ int __reject3) {
+ register size_t __result = 0;
+ while (__s[__result] != '\0' && __s[__result] != __reject1 &&
+ __s[__result] != __reject2 && __s[__result] != __reject3)
+ ++__result;
+ return __result;
+}
+
+This should turn into a switch on the character. See PR3253 for some notes on
+codegen.
+
+456.hmmer apparently uses strcspn and strspn a lot. 471.omnetpp uses strspn.
+
+//===---------------------------------------------------------------------===//
+
+"gas" uses this idiom:
+ else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
+..
+ else if (strchr ("<>", *intel_parser.op_string)
+
+Those should be turned into a switch.
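+
+That is, something along the lines of (a sketch):
+
+  switch (*intel_parser.op_string) {
+  case '+': case '-': case '/': case '*': case '%': case '|': case '&':
+  case '^': case ':': case '[': case ']': case '(': case ')': case '~':
+    ...
+  }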
+
+//===---------------------------------------------------------------------===//
+
+252.eon contains this interesting code:
+
+ %3072 = getelementptr [100 x i8]* %tempString, i32 0, i32 0
+ %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
+ %strlen = call i32 @strlen(i8* %3072) ; uses = 1
+ %endptr = getelementptr [100 x i8]* %tempString, i32 0, i32 %strlen
+ call void @llvm.memcpy.i32(i8* %endptr,
+ i8* getelementptr ([5 x i8]* @"\01LC42", i32 0, i32 0), i32 5, i32 1)
+ %3074 = call i32 @strlen(i8* %endptr) nounwind readonly
+
+This is interesting for a couple reasons. First, in this:
+
+ %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
+ %strlen = call i32 @strlen(i8* %3072)
+
+The strlen could be replaced with: %strlen = sub %3073, %3072, because the
+strcpy call returns a pointer to the end of the string. Based on that, the
+endptr GEP just becomes equal to %3073, which eliminates a strlen call and GEP.
+
+Second, the strlen following the memcpy can be replaced with:
+
+ %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly
+
+Because the destination was just copied into the specified memory buffer. This,
+in turn, can be constant folded to "4".
+
+In other code, it contains:
+
+ %endptr6978 = bitcast i8* %endptr69 to i32*
+ store i32 7107374, i32* %endptr6978, align 1
+ %3167 = call i32 @strlen(i8* %endptr69) nounwind readonly
+
+Which could also be constant folded. Whatever is producing this should probably
+be fixed to leave this as a memcpy from a string.
+
+Further, eon also has an interesting partially redundant strlen call:
+
+bb8: ; preds = %_ZN18eonImageCalculatorC1Ev.exit
+ %682 = getelementptr i8** %argv, i32 6 ; <i8**> [#uses=2]
+ %683 = load i8** %682, align 4 ; <i8*> [#uses=4]
+ %684 = load i8* %683, align 1 ; <i8> [#uses=1]
+ %685 = icmp eq i8 %684, 0 ; <i1> [#uses=1]
+ br i1 %685, label %bb10, label %bb9
+
+bb9: ; preds = %bb8
+ %686 = call i32 @strlen(i8* %683) nounwind readonly
+ %687 = icmp ugt i32 %686, 254 ; <i1> [#uses=1]
+ br i1 %687, label %bb10, label %bb11
+
+bb10: ; preds = %bb9, %bb8
+ %688 = call i32 @strlen(i8* %683) nounwind readonly
+
+This could be eliminated by doing the strlen once in bb8, saving code size and
+improving perf on the bb8->9->10 path.
+
+//===---------------------------------------------------------------------===//
+
+I see an interesting fully redundant call to strlen left in 186.crafty:InputMove
+which looks like:
+ %movetext11 = getelementptr [128 x i8]* %movetext, i32 0, i32 0
+
+
+bb62: ; preds = %bb55, %bb53
+ %promote.0 = phi i32 [ %169, %bb55 ], [ 0, %bb53 ]
+ %171 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
+ %172 = add i32 %171, -1 ; <i32> [#uses=1]
+ %173 = getelementptr [128 x i8]* %movetext, i32 0, i32 %172
+
+... no stores ...
+ br i1 %or.cond, label %bb65, label %bb72
+
+bb65: ; preds = %bb62
+ store i8 0, i8* %173, align 1
+ br label %bb72
+
+bb72: ; preds = %bb65, %bb62
+ %trank.1 = phi i32 [ %176, %bb65 ], [ -1, %bb62 ]
+ %177 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
+
+Note that on the bb62->bb72 path, the %177 strlen call is partially
+redundant with the %171 call. At worst, we could shove the %177 strlen call
+up into the bb65 block moving it out of the bb62->bb72 path. However, note
+that bb65 stores to the string, zeroing out the last byte. This means that on
+that path the value of %177 is actually just %171-1. A sub is cheaper than a
+strlen!
+
+This pattern repeats several times, basically doing:
+
+ A = strlen(P);
+ P[A-1] = 0;
+ B = strlen(P);
+ where it is "obvious" that B = A-1.
+
+//===---------------------------------------------------------------------===//
+
+186.crafty contains this interesting pattern:
+
+%77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
+ i8* %30)
+%phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
+br i1 %phitmp648, label %bb70, label %bb76
+
+bb70: ; preds = %OptionMatch.exit91, %bb69
+ %78 = call i32 @strlen(i8* %30) nounwind readonly align 1 ; <i32> [#uses=1]
+
+This is basically:
+ cststr = "abcdef";
+ if (strstr(cststr, P) == cststr) {
+ x = strlen(P);
+ ...
+
+The strstr call would be significantly cheaper written as:
+
+cststr = "abcdef";
+if (memcmp(P, cststr, strlen(P)) == 0)
+ x = strlen(P);
+
+This is memcmp+strlen instead of strstr. This also makes the strlen fully
+redundant.
+
+//===---------------------------------------------------------------------===//
+
+186.crafty also contains this code:
+
+%1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
+%1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
+%1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
+%1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
+%1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909
+
+The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
+
+//===---------------------------------------------------------------------===//
+
+186.crafty has this interesting pattern with the "out.4543" variable:
+
+call void @llvm.memcpy.i32(
+ i8* getelementptr ([10 x i8]* @out.4543, i32 0, i32 0),
+ i8* getelementptr ([7 x i8]* @"\01LC28700", i32 0, i32 0), i32 7, i32 1)
+%101 = call @printf(i8* ... @out.4543, i32 0, i32 0)) nounwind
+
+It is basically doing:
+
+ memcpy(globalarray, "string");
+ printf(..., globalarray);
+
+Anyway, by knowing that printf just reads the memory and forward substituting
+the string directly into the printf, this eliminates reads from globalarray.
+Since this pattern occurs frequently in crafty (due to the "DisplayTime" and
+other similar functions) there are many stores to "out". Once all the printfs
+stop using "out", all that is left is the memcpy's into it. This should allow
+globalopt to remove the "stored only" global.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+define inreg i32 @foo(i8* inreg %p) nounwind {
+ %tmp0 = load i8* %p
+ %tmp1 = ashr i8 %tmp0, 5
+ %tmp2 = sext i8 %tmp1 to i32
+ ret i32 %tmp2
+}
+
+could be dagcombine'd to a sign-extending load with a shift.
+For example, on x86 this currently gets this:
+
+ movb (%eax), %al
+ sarb $5, %al
+ movsbl %al, %eax
+
+while it could get this:
+
+ movsbl (%eax), %eax
+ sarl $5, %eax
+
+//===---------------------------------------------------------------------===//
+
+GCC PR31029:
+
+int test(int x) { return 1-x == x; } // --> return false
+int test2(int x) { return 2-x == x; } // --> return x == 1 ?
+
+Always foldable for odd constants: C-x == x requires 2x == C, and 2x is always
+even. What is the rule for even constants?
+
+//===---------------------------------------------------------------------===//
+
+PR 3381: a GEP to a field of size 0 inside a struct could be turned into a GEP
+to the next field in the struct (which is at the same address).
+
+For example: store of float into { {{}}, float } could be turned into a store to
+the float directly.
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+double foo(double a) { return sin(a); }
+
+This compiles into this on x86-64 Linux:
+foo:
+ subq $8, %rsp
+ call sin
+ addq $8, %rsp
+ ret
+vs:
+
+foo:
+ jmp sin
+
+//===---------------------------------------------------------------------===//
+
+The arg promotion pass should make use of nocapture to make its alias analysis
+stuff much more precise.
+
+//===---------------------------------------------------------------------===//
+
+The following functions should be optimized to use a select instead of a
+branch (from gcc PR40072):
+
+char char_int(int m) {if(m>7) return 0; return m;}
+int int_char(char m) {if(m>7) return 0; return m;}
+
+//===---------------------------------------------------------------------===//
+
+Instcombine should replace the load with a constant in:
+
+ static const char x[4] = {'a', 'b', 'c', 'd'};
+
+ unsigned int y(void) {
+ return *(unsigned int *)x;
+ }
+
+It currently only does this transformation when the size of the constant
+is the same as the size of the integer (so, try x[5]) and the last byte
+is a null (making it a C string). There's no need for these restrictions.
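+
+On a little-endian target the transformed function would simply be (a sketch):
+
+  unsigned int y(void) {
+    return 0x64636261;   /* 'd','c','b','a' from the high byte down */
+  }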
+
+//===---------------------------------------------------------------------===//
+
+InstCombine's "turn load from constant into constant" optimization should be
+more aggressive in the presence of bitcasts. For example, because of unions,
+this code:
+
+union vec2d {
+ double e[2];
+ double v __attribute__((vector_size(16)));
+};
+typedef union vec2d vec2d;
+
+static vec2d a={{1,2}}, b={{3,4}};
+
+vec2d foo () {
+ return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
+}
+
+Compiles into:
+
+@a = internal constant %0 { [2 x double]
+ [double 1.000000e+00, double 2.000000e+00] }, align 16
+@b = internal constant %0 { [2 x double]
+ [double 3.000000e+00, double 4.000000e+00] }, align 16
+...
+define void @foo(%struct.vec2d* noalias nocapture sret %agg.result) nounwind {
+entry:
+ %0 = load <2 x double>* getelementptr (%struct.vec2d*
+ bitcast (%0* @a to %struct.vec2d*), i32 0, i32 0), align 16
+ %1 = load <2 x double>* getelementptr (%struct.vec2d*
+ bitcast (%0* @b to %struct.vec2d*), i32 0, i32 0), align 16
+
+
+Instcombine should be able to optimize away the loads (and thus the globals).
+
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/AsmPrinter/CMakeLists.txt b/lib/Target/Sparc/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..394b4cd
--- /dev/null
+++ b/lib/Target/Sparc/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_partially_linked_object(LLVMSparcAsmPrinter
+ SparcAsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMSparcCodeGen n)
+
+add_dependencies(LLVMSparcAsmPrinter ${n})
diff --git a/lib/Target/Sparc/AsmPrinter/Makefile b/lib/Target/Sparc/AsmPrinter/Makefile
new file mode 100644
index 0000000..f12a6ac
--- /dev/null
+++ b/lib/Target/Sparc/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Sparc/AsmPrinter/Makefile ---------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSparcAsmPrinter
+
+# Hack: we need to include 'main' Sparc target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp b/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp
new file mode 100644
index 0000000..61707f5
--- /dev/null
+++ b/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp
@@ -0,0 +1,355 @@
+//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format SPARC assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Sparc.h"
+#include "SparcInstrInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include <cctype>
+#include <cstring>
+#include <map>
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class VISIBILITY_HIDDEN SparcAsmPrinter : public AsmPrinter {
+ /// We name each basic block in a Function with a unique number, so
+ /// that we can consistently refer to them later. This is cleared
+ /// at the beginning of each call to runOnMachineFunction().
+ ///
+ typedef std::map<const Value *, unsigned> ValueMapTy;
+ ValueMapTy NumberForBB;
+ public:
+ explicit SparcAsmPrinter(raw_ostream &O, TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V) {}
+
+ virtual const char *getPassName() const {
+ return "Sparc Assembly Printer";
+ }
+
+ void printModuleLevelGV(const GlobalVariable* GVar);
+ void printOperand(const MachineInstr *MI, int opNum);
+ void printMemOperand(const MachineInstr *MI, int opNum,
+ const char *Modifier = 0);
+ void printCCOperand(const MachineInstr *MI, int opNum);
+
+ bool printInstruction(const MachineInstr *MI); // autogenerated.
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ };
+} // end of anonymous namespace
+
+#include "SparcGenAsmWriter.inc"
+
+/// createSparcCodePrinterPass - Returns a pass that prints the SPARC
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createSparcCodePrinterPass(raw_ostream &o,
+ TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new SparcAsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
+
+/// runOnMachineFunction - This uses the printInstruction()
+/// method to print assembly for each instruction.
+///
+bool SparcAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // BBNumber is used here so that a given Printer will never give two
+ // BBs the same name. (If you have a better way, please let me know!)
+ static unsigned BBNumber = 0;
+
+ O << "\n\n";
+
+ // Print out the label for the function.
+ const Function *F = MF.getFunction();
+ SwitchToSection(TAI->SectionForGlobal(F));
+ EmitAlignment(4, F);
+ O << "\t.globl\t" << CurrentFnName << '\n';
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ O << "\t.type\t" << CurrentFnName << ", #function\n";
+ O << CurrentFnName << ":\n";
+
+ // Number each basic block so that we can consistently refer to them
+ // in PC-relative references.
+ // FIXME: Why not use the MBB numbers?
+ NumberForBB.clear();
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ NumberForBB[I->getBasicBlock()] = BBNumber++;
+ }
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printInstruction(II);
+ ++EmittedInsts;
+ }
+ }
+
+ // We didn't modify anything.
+ return false;
+}
+
+void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+ const MachineOperand &MO = MI->getOperand (opNum);
+ const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+ bool CloseParen = false;
+ if (MI->getOpcode() == SP::SETHIi && !MO.isReg() && !MO.isImm()) {
+ O << "%hi(";
+ CloseParen = true;
+ } else if ((MI->getOpcode() == SP::ORri || MI->getOpcode() == SP::ADDri) &&
+ !MO.isReg() && !MO.isImm()) {
+ O << "%lo(";
+ CloseParen = true;
+ }
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      O << "%" << LowercaseString(RI.get(MO.getReg()).AsmName);
+ else
+ O << "%reg" << MO.getReg();
+ break;
+
+ case MachineOperand::MO_Immediate:
+ O << (int)MO.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+ case MachineOperand::MO_GlobalAddress:
+ {
+ const GlobalValue *GV = MO.getGlobal();
+ O << Mang->getValueName(GV);
+ }
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ O << MO.getSymbolName();
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ << MO.getIndex();
+ break;
+ default:
+    O << "<unknown operand type>"; abort(); break;
+ }
+ if (CloseParen) O << ")";
+}
+
+void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+ const char *Modifier) {
+ printOperand(MI, opNum);
+
+ // If this is an ADD operand, emit it like normal operands.
+ if (Modifier && !strcmp(Modifier, "arith")) {
+ O << ", ";
+ printOperand(MI, opNum+1);
+ return;
+ }
+
+ if (MI->getOperand(opNum+1).isReg() &&
+ MI->getOperand(opNum+1).getReg() == SP::G0)
+ return; // don't print "+%g0"
+ if (MI->getOperand(opNum+1).isImm() &&
+ MI->getOperand(opNum+1).getImm() == 0)
+ return; // don't print "+0"
+
+ O << "+";
+ if (MI->getOperand(opNum+1).isGlobal() ||
+ MI->getOperand(opNum+1).isCPI()) {
+ O << "%lo(";
+ printOperand(MI, opNum+1);
+ O << ")";
+ } else {
+ printOperand(MI, opNum+1);
+ }
+}
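+// For illustration (assumed emitted syntax, derived from the cases above):
+// a reg+%g0 address prints as just the base register, reg+imm prints as
+// e.g. "%i1+8", and a global/constant-pool offset prints as "%o0+%lo(sym)".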
+
+void SparcAsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) {
+ int CC = (int)MI->getOperand(opNum).getImm();
+ O << SPARCCondCodeToString((SPCC::CondCodes)CC);
+}
+
+bool SparcAsmPrinter::doInitialization(Module &M) {
+ Mang = new Mangler(M, "", TAI->getPrivateGlobalPrefix());
+ return false; // success
+}
+
+bool SparcAsmPrinter::doFinalization(Module &M) {
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ printModuleLevelGV(I);
+
+ O << '\n';
+
+ return AsmPrinter::doFinalization(M);
+}
+
+void SparcAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+    return;   // External globals require no code
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar))
+ return;
+
+ O << "\n\n";
+ std::string name = Mang->getValueName(GVar);
+ Constant *C = GVar->getInitializer();
+ unsigned Size = TD->getTypeAllocSize(C->getType());
+ unsigned Align = TD->getPreferredAlignment(GVar);
+
+ printVisibility(name, GVar->getVisibility());
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ if (C->isNullValue() && !GVar->hasSection()) {
+ if (!GVar->isThreadLocal() &&
+ (GVar->hasLocalLinkage() || GVar->isWeakForLinker())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (GVar->hasLocalLinkage())
+ O << "\t.local " << name << '\n';
+
+ O << TAI->getCOMMDirective() << name << ',' << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (1 << Align);
+
+ O << '\n';
+ return;
+ }
+ }
+
+ switch (GVar->getLinkage()) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage: // FIXME: Verify correct for weak.
+ case GlobalValue::WeakODRLinkage: // FIXME: Verify correct for weak.
+ // Nonnull linkonce -> weak
+ O << "\t.weak " << name << '\n';
+ break;
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their name or something. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << TAI->getGlobalDirective() << name << '\n';
+ // FALL THROUGH
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ break;
+ case GlobalValue::GhostLinkage:
+ cerr << "Should not have any unmaterialized functions!\n";
+ abort();
+ case GlobalValue::DLLImportLinkage:
+ cerr << "DLLImport linkage is not supported by this target!\n";
+ abort();
+ case GlobalValue::DLLExportLinkage:
+ cerr << "DLLExport linkage is not supported by this target!\n";
+ abort();
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, GVar);
+
+ if (TAI->hasDotTypeDotSizeDirective()) {
+ O << "\t.type " << name << ",#object\n";
+ O << "\t.size " << name << ',' << Size << '\n';
+ }
+
+ O << name << ":\n";
+ EmitGlobalConstant(C);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'r':
+ break;
+ }
+ }
+
+ printOperand(MI, OpNo);
+
+ return false;
+}
+
+bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier
+
+ O << '[';
+ printMemOperand(MI, OpNo);
+ O << ']';
+
+ return false;
+}
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
new file mode 100644
index 0000000..eefa7e8
--- /dev/null
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(LLVM_TARGET_DEFINITIONS Sparc.td)
+
+tablegen(SparcGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(SparcGenRegisterNames.inc -gen-register-enums)
+tablegen(SparcGenRegisterInfo.inc -gen-register-desc)
+tablegen(SparcGenInstrNames.inc -gen-instr-enums)
+tablegen(SparcGenInstrInfo.inc -gen-instr-desc)
+tablegen(SparcGenAsmWriter.inc -gen-asm-writer)
+tablegen(SparcGenDAGISel.inc -gen-dag-isel)
+tablegen(SparcGenSubtarget.inc -gen-subtarget)
+tablegen(SparcGenCallingConv.inc -gen-callingconv)
+
+add_llvm_target(SparcCodeGen
+ DelaySlotFiller.cpp
+ FPMover.cpp
+ SparcInstrInfo.cpp
+ SparcISelDAGToDAG.cpp
+ SparcISelLowering.cpp
+ SparcRegisterInfo.cpp
+ SparcSubtarget.cpp
+ SparcTargetAsmInfo.cpp
+ SparcTargetMachine.cpp
+ )
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
new file mode 100644
index 0000000..15b26c2
--- /dev/null
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -0,0 +1,76 @@
+//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple local pass that fills delay slots with NOPs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "delayslotfiller"
+#include "Sparc.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+namespace {
+ struct Filler : public MachineFunctionPass {
+ /// Target machine description which we query for reg. names, data
+ /// layout, etc.
+ ///
+ TargetMachine &TM;
+ const TargetInstrInfo *TII;
+
+ static char ID;
+ Filler(TargetMachine &tm)
+ : MachineFunctionPass(&ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+ virtual const char *getPassName() const {
+ return "SPARC Delay Slot Filler";
+ }
+
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+ bool runOnMachineFunction(MachineFunction &F) {
+ bool Changed = false;
+ for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+ FI != FE; ++FI)
+ Changed |= runOnMachineBasicBlock(*FI);
+ return Changed;
+ }
+
+ };
+ char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Sparc MachineFunctions
+///
+FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+ return new Filler(tm);
+}
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// Currently, we fill delay slots with NOPs. We assume there is only one
+/// delay slot per delayed instruction.
+///
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+ if (I->getDesc().hasDelaySlot()) {
+ MachineBasicBlock::iterator J = I;
+ ++J;
+ BuildMI(MBB, J, DebugLoc::getUnknownLoc(), TII->get(SP::NOP));
+ ++FilledSlots;
+ Changed = true;
+ }
+ return Changed;
+}
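+// Worked example (assumed SPARC V8 semantics, for illustration only): a CALL
+// has one delay slot, so after this pass
+//     call  foo
+// is immediately followed by
+//     nop             ! inserted by Filler, counted in FilledSlots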
diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp
new file mode 100644
index 0000000..f72a4c4
--- /dev/null
+++ b/lib/Target/Sparc/FPMover.cpp
@@ -0,0 +1,139 @@
+//===-- FPMover.cpp - Sparc double-precision floating point move fixer ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand FpMOVD/FpABSD/FpNEGD instructions into their single-precision pieces.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "fpmover"
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+STATISTIC(NumFpDs , "Number of instructions translated");
+STATISTIC(NoopFpDs, "Number of noop instructions removed");
+
+namespace {
+ struct FPMover : public MachineFunctionPass {
+ /// Target machine description which we query for reg. names, data
+ /// layout, etc.
+ ///
+ TargetMachine &TM;
+
+ static char ID;
+ explicit FPMover(TargetMachine &tm)
+ : MachineFunctionPass(&ID), TM(tm) { }
+
+ virtual const char *getPassName() const {
+ return "Sparc Double-FP Move Fixer";
+ }
+
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+ bool runOnMachineFunction(MachineFunction &F);
+ };
+ char FPMover::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcFPMoverPass - Returns a pass that turns FpMOVD
+/// instructions into FMOVS instructions
+///
+FunctionPass *llvm::createSparcFPMoverPass(TargetMachine &tm) {
+ return new FPMover(tm);
+}
+
+/// getDoubleRegPair - Given a DFP register, return the even and odd FP
+/// registers that correspond to it.
+static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg,
+ unsigned &OddReg) {
+ static const unsigned EvenHalvesOfPairs[] = {
+ SP::F0, SP::F2, SP::F4, SP::F6, SP::F8, SP::F10, SP::F12, SP::F14,
+ SP::F16, SP::F18, SP::F20, SP::F22, SP::F24, SP::F26, SP::F28, SP::F30
+ };
+ static const unsigned OddHalvesOfPairs[] = {
+ SP::F1, SP::F3, SP::F5, SP::F7, SP::F9, SP::F11, SP::F13, SP::F15,
+ SP::F17, SP::F19, SP::F21, SP::F23, SP::F25, SP::F27, SP::F29, SP::F31
+ };
+ static const unsigned DoubleRegsInOrder[] = {
+ SP::D0, SP::D1, SP::D2, SP::D3, SP::D4, SP::D5, SP::D6, SP::D7, SP::D8,
+ SP::D9, SP::D10, SP::D11, SP::D12, SP::D13, SP::D14, SP::D15
+ };
+ for (unsigned i = 0; i < sizeof(DoubleRegsInOrder)/sizeof(unsigned); ++i)
+ if (DoubleRegsInOrder[i] == DoubleReg) {
+ EvenReg = EvenHalvesOfPairs[i];
+ OddReg = OddHalvesOfPairs[i];
+ return;
+ }
+ assert(0 && "Can't find reg");
+}
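+// Example, reading the tables above: SP::D0 overlaps the pair (F0, F1), so
+// getDoubleRegPair(SP::D0, Even, Odd) yields Even == SP::F0, Odd == SP::F1.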
+
+/// runOnMachineBasicBlock - Fixup FpMOVD instructions in this MBB.
+///
+bool FPMover::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+ MachineInstr *MI = I++;
+ DebugLoc dl = MI->getDebugLoc();
+ if (MI->getOpcode() == SP::FpMOVD || MI->getOpcode() == SP::FpABSD ||
+ MI->getOpcode() == SP::FpNEGD) {
+ Changed = true;
+ unsigned DestDReg = MI->getOperand(0).getReg();
+ unsigned SrcDReg = MI->getOperand(1).getReg();
+ if (DestDReg == SrcDReg && MI->getOpcode() == SP::FpMOVD) {
+ MBB.erase(MI); // Eliminate the noop copy.
+ ++NoopFpDs;
+ continue;
+ }
+
+ unsigned EvenSrcReg = 0, OddSrcReg = 0, EvenDestReg = 0, OddDestReg = 0;
+ getDoubleRegPair(DestDReg, EvenDestReg, OddDestReg);
+ getDoubleRegPair(SrcDReg, EvenSrcReg, OddSrcReg);
+
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ if (MI->getOpcode() == SP::FpMOVD)
+ MI->setDesc(TII->get(SP::FMOVS));
+ else if (MI->getOpcode() == SP::FpNEGD)
+ MI->setDesc(TII->get(SP::FNEGS));
+ else if (MI->getOpcode() == SP::FpABSD)
+ MI->setDesc(TII->get(SP::FABSS));
+ else
+ assert(0 && "Unknown opcode!");
+
+ MI->getOperand(0).setReg(EvenDestReg);
+ MI->getOperand(1).setReg(EvenSrcReg);
+ DOUT << "FPMover: the modified instr is: " << *MI;
+ // Insert copy for the other half of the double.
+ if (DestDReg != SrcDReg) {
+ MI = BuildMI(MBB, I, dl, TM.getInstrInfo()->get(SP::FMOVS), OddDestReg)
+ .addReg(OddSrcReg);
+ DOUT << "FPMover: the inserted instr is: " << *MI;
+ }
+ ++NumFpDs;
+ }
+ }
+ return Changed;
+}
+
+bool FPMover::runOnMachineFunction(MachineFunction &F) {
+ // If the target has V9 instructions, the fp-mover pseudos will never be
+ // emitted. Avoid a scan of the instructions to improve compile time.
+ if (TM.getSubtarget<SparcSubtarget>().isV9())
+ return false;
+
+ bool Changed = false;
+ for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+ FI != FE; ++FI)
+ Changed |= runOnMachineBasicBlock(*FI);
+ return Changed;
+}
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
new file mode 100644
index 0000000..fdf6afa
--- /dev/null
+++ b/lib/Target/Sparc/Makefile
@@ -0,0 +1,22 @@
+##===- lib/Target/Sparc/Makefile ---------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMSparcCodeGen
+TARGET = Sparc
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = SparcGenRegisterInfo.h.inc SparcGenRegisterNames.inc \
+ SparcGenRegisterInfo.inc SparcGenInstrNames.inc \
+ SparcGenInstrInfo.inc SparcGenAsmWriter.inc \
+ SparcGenDAGISel.inc SparcGenSubtarget.inc SparcGenCallingConv.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt
new file mode 100644
index 0000000..cc24abf
--- /dev/null
+++ b/lib/Target/Sparc/README.txt
@@ -0,0 +1,58 @@
+
+To-do
+-----
+
+* Keep the address of the constant pool in a register instead of forming its
+ address all of the time.
+* We can fold small constant offsets into the %hi/%lo references to constant
+ pool addresses as well.
+* When in V9 mode, register allocate %icc[0-3].
+* Add support for isel'ing UMUL_LOHI instead of marking it as Expand.
+* Emit the 'Branch on Integer Register with Prediction' instructions. It's
+ not clear how to write a pattern for this though:
+
+float %t1(int %a, int* %p) {
+ %C = seteq int %a, 0
+ br bool %C, label %T, label %F
+T:
+ store int 123, int* %p
+ br label %F
+F:
+ ret float undef
+}
+
+codegens to this:
+
+t1:
+ save -96, %o6, %o6
+1) subcc %i0, 0, %l0
+1) bne .LBBt1_2 ! F
+ nop
+.LBBt1_1: ! T
+ or %g0, 123, %l0
+ st %l0, [%i1]
+.LBBt1_2: ! F
+ restore %g0, %g0, %g0
+ retl
+ nop
+
+1) should be replaced with a brz in V9 mode.
+
+* Same as above, but emit conditional move on register zero (p192) in V9
+ mode. Testcase:
+
+int %t1(int %a, int %b) {
+ %C = seteq int %a, 0
+ %D = select bool %C, int %a, int %b
+ ret int %D
+}
+
+* Emit MULX/[SU]DIVX instructions in V9 mode instead of fiddling
+ with the Y register, if they are faster.
+
+* Codegen bswap(load)/store(bswap) -> load/store ASI
+
+* Implement frame pointer elimination, e.g. eliminate save/restore for
+ leaf fns.
+* Fill delay slots
+
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
new file mode 100644
index 0000000..bb03f30
--- /dev/null
+++ b/lib/Target/Sparc/Sparc.h
@@ -0,0 +1,119 @@
+//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Sparc back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_SPARC_H
+#define TARGET_SPARC_H
+
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
+namespace llvm {
+ class FunctionPass;
+ class SparcTargetMachine;
+ class raw_ostream;
+
+ FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
+ FunctionPass *createSparcCodePrinterPass(raw_ostream &OS, TargetMachine &TM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose);
+ FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+ FunctionPass *createSparcFPMoverPass(TargetMachine &TM);
+} // end namespace llvm
+
+// Defines symbolic names for Sparc registers. This defines a mapping from
+// register name to register number.
+//
+#include "SparcGenRegisterNames.inc"
+
+// Defines symbolic names for the Sparc instructions.
+//
+#include "SparcGenInstrNames.inc"
+
+
+namespace llvm {
+ // Enums corresponding to Sparc condition codes, both icc's and fcc's. These
+ // values must be kept in sync with the ones in the .td file.
+ namespace SPCC {
+ enum CondCodes {
+ //ICC_A = 8 , // Always
+ //ICC_N = 0 , // Never
+ ICC_NE = 9 , // Not Equal
+ ICC_E = 1 , // Equal
+ ICC_G = 10 , // Greater
+ ICC_LE = 2 , // Less or Equal
+ ICC_GE = 11 , // Greater or Equal
+ ICC_L = 3 , // Less
+ ICC_GU = 12 , // Greater Unsigned
+ ICC_LEU = 4 , // Less or Equal Unsigned
+ ICC_CC = 13 , // Carry Clear/Great or Equal Unsigned
+ ICC_CS = 5 , // Carry Set/Less Unsigned
+ ICC_POS = 14 , // Positive
+ ICC_NEG = 6 , // Negative
+ ICC_VC = 15 , // Overflow Clear
+ ICC_VS = 7 , // Overflow Set
+
+ //FCC_A = 8+16, // Always
+ //FCC_N = 0+16, // Never
+ FCC_U = 7+16, // Unordered
+ FCC_G = 6+16, // Greater
+ FCC_UG = 5+16, // Unordered or Greater
+ FCC_L = 4+16, // Less
+ FCC_UL = 3+16, // Unordered or Less
+ FCC_LG = 2+16, // Less or Greater
+ FCC_NE = 1+16, // Not Equal
+ FCC_E = 9+16, // Equal
+ FCC_UE = 10+16, // Unordered or Equal
+ FCC_GE = 11+16, // Greater or Equal
+ FCC_UGE = 12+16, // Unordered or Greater or Equal
+ FCC_LE = 13+16, // Less or Equal
+ FCC_ULE = 14+16, // Unordered or Less or Equal
+ FCC_O = 15+16 // Ordered
+ };
+ }
+
+ inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown condition code");
+ case SPCC::ICC_NE: return "ne";
+ case SPCC::ICC_E: return "e";
+ case SPCC::ICC_G: return "g";
+ case SPCC::ICC_LE: return "le";
+ case SPCC::ICC_GE: return "ge";
+ case SPCC::ICC_L: return "l";
+ case SPCC::ICC_GU: return "gu";
+ case SPCC::ICC_LEU: return "leu";
+ case SPCC::ICC_CC: return "cc";
+ case SPCC::ICC_CS: return "cs";
+ case SPCC::ICC_POS: return "pos";
+ case SPCC::ICC_NEG: return "neg";
+ case SPCC::ICC_VC: return "vc";
+ case SPCC::ICC_VS: return "vs";
+ case SPCC::FCC_U: return "u";
+ case SPCC::FCC_G: return "g";
+ case SPCC::FCC_UG: return "ug";
+ case SPCC::FCC_L: return "l";
+ case SPCC::FCC_UL: return "ul";
+ case SPCC::FCC_LG: return "lg";
+ case SPCC::FCC_NE: return "ne";
+ case SPCC::FCC_E: return "e";
+ case SPCC::FCC_UE: return "ue";
+ case SPCC::FCC_GE: return "ge";
+ case SPCC::FCC_UGE: return "uge";
+ case SPCC::FCC_LE: return "le";
+ case SPCC::FCC_ULE: return "ule";
+ case SPCC::FCC_O: return "o";
+ }
+ }
+} // end namespace llvm
+#endif
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
new file mode 100644
index 0000000..53ea8f4
--- /dev/null
+++ b/lib/Target/Sparc/Sparc.td
@@ -0,0 +1,76 @@
+//===- Sparc.td - Describe the Sparc Target Machine -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SPARC Subtarget features.
+//
+
+def FeatureV9
+ : SubtargetFeature<"v9", "IsV9", "true",
+ "Enable SPARC-V9 instructions">;
+def FeatureV8Deprecated
+ : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
+ "Enable deprecated V8 instructions in V9 mode">;
+def FeatureVIS
+ : SubtargetFeature<"vis", "IsVIS", "true",
+ "Enable UltraSPARC Visual Instruction Set extensions">;
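+// Note (assumed llc behavior, not stated in this file): each SubtargetFeature
+// is selectable as an -mattr flag, e.g. -mattr=+v9 sets IsV9 in the
+// SparcSubtarget, and the processor definitions below imply feature sets
+// via -mcpu.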
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SparcRegisterInfo.td"
+include "SparcCallingConv.td"
+include "SparcInstrInfo.td"
+
+def SparcInstrInfo : InstrInfo {
+ // Define how we want to layout our target-specific information field.
+ let TSFlagsFields = [];
+ let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+def : Proc<"v8", []>;
+def : Proc<"supersparc", []>;
+def : Proc<"sparclite", []>;
+def : Proc<"f934", []>;
+def : Proc<"hypersparc", []>;
+def : Proc<"sparclite86x", []>;
+def : Proc<"sparclet", []>;
+def : Proc<"tsc701", []>;
+def : Proc<"v9", [FeatureV9]>;
+def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated]>;
+def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated]>;
+def : Proc<"ultrasparc3-vis", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Sparc : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = SparcInstrInfo;
+}
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
new file mode 100644
index 0000000..33ecfdf
--- /dev/null
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -0,0 +1,32 @@
+//===- SparcCallingConv.td - Calling Conventions Sparc -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Sparc architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Sparc 32-bit C return-value convention.
+def RetCC_Sparc32 : CallingConv<[
+ CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+ CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1]>>
+]>;
+
+// Sparc 32-bit C Calling convention.
+def CC_Sparc32 : CallingConv<[
+ // All arguments get passed in integer registers if there is space.
+ CCIfType<[i32, f32, f64], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+
+ // Alternatively, they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
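+// Worked example of the rules above: a call returning i32 gets its result in
+// I0 (which the caller reads as %o0 across the register window); the first
+// six words of integer/FP arguments travel in I0..I5, and the remainder go
+// on the stack in 4-byte aligned slots.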
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
new file mode 100644
index 0000000..c9bd62d
--- /dev/null
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -0,0 +1,215 @@
+//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SPARC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcISelLowering.h"
+#include "SparcTargetMachine.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// SparcDAGToDAGISel - SPARC specific code to select SPARC machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class SparcDAGToDAGISel : public SelectionDAGISel {
+ /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const SparcSubtarget &Subtarget;
+public:
+ explicit SparcDAGToDAGISel(SparcTargetMachine &TM)
+ : SelectionDAGISel(TM),
+ Subtarget(TM.getSubtarget<SparcSubtarget>()) {
+ }
+
+ SDNode *Select(SDValue Op);
+
+ // Complex Pattern Selectors.
+ bool SelectADDRrr(SDValue Op, SDValue N, SDValue &R1, SDValue &R2);
+ bool SelectADDRri(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Offset);
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+
+ /// InstructionSelect - This callback is invoked by
+ /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+ virtual void InstructionSelect();
+
+ virtual const char *getPassName() const {
+ return "SPARC DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Include the pieces autogenerated from the target description.
+#include "SparcGenDAGISel.inc"
+};
+} // end anonymous namespace
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void SparcDAGToDAGISel::InstructionSelect() {
+ DEBUG(BB->dump());
+
+ // Select target instructions for the DAG.
+ SelectRoot(*CurDAG);
+ CurDAG->RemoveDeadNodes();
+}
+
+bool SparcDAGToDAGISel::SelectADDRri(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress)
+ return false; // direct calls.
+
+ if (Addr.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+ if (Predicate_simm13(CN)) {
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+ // Constant offset from frame ref.
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ } else {
+ Base = Addr.getOperand(0);
+ }
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+ return true;
+ }
+ }
+ if (Addr.getOperand(0).getOpcode() == SPISD::Lo) {
+ Base = Addr.getOperand(1);
+ Offset = Addr.getOperand(0).getOperand(0);
+ return true;
+ }
+ if (Addr.getOperand(1).getOpcode() == SPISD::Lo) {
+ Base = Addr.getOperand(0);
+ Offset = Addr.getOperand(1).getOperand(0);
+ return true;
+ }
+ }
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+}
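+// For instance, per the cases above: (add frameindex, simm13) splits into a
+// target frame index Base plus the immediate Offset; (add x, %lo(sym))
+// becomes Base = x with Offset = the %lo operand's symbol; anything else
+// falls through to reg+0 addressing.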
+
+bool SparcDAGToDAGISel::SelectADDRrr(SDValue Op, SDValue Addr,
+ SDValue &R1, SDValue &R2) {
+ if (Addr.getOpcode() == ISD::FrameIndex) return false;
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress)
+ return false; // direct calls.
+
+ if (Addr.getOpcode() == ISD::ADD) {
+ if (isa<ConstantSDNode>(Addr.getOperand(1)) &&
+ Predicate_simm13(Addr.getOperand(1).getNode()))
+ return false; // Let the reg+imm pattern catch this!
+ if (Addr.getOperand(0).getOpcode() == SPISD::Lo ||
+ Addr.getOperand(1).getOpcode() == SPISD::Lo)
+ return false; // Let the reg+imm pattern catch this!
+ R1 = Addr.getOperand(0);
+ R2 = Addr.getOperand(1);
+ return true;
+ }
+
+ R1 = Addr;
+ R2 = CurDAG->getRegister(SP::G0, MVT::i32);
+ return true;
+}
+
+SDNode *SparcDAGToDAGISel::Select(SDValue Op) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+ if (N->isMachineOpcode())
+ return NULL; // Already selected.
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::SDIV:
+ case ISD::UDIV: {
+ // FIXME: should use a custom expander to expose the SRA to the dag.
+ SDValue DivLHS = N->getOperand(0);
+ SDValue DivRHS = N->getOperand(1);
+
+ // Set the Y register to the high-part.
+ SDValue TopPart;
+ if (N->getOpcode() == ISD::SDIV) {
+ TopPart = SDValue(CurDAG->getTargetNode(SP::SRAri, dl, MVT::i32, DivLHS,
+ CurDAG->getTargetConstant(31, MVT::i32)), 0);
+ } else {
+ TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
+ }
+ TopPart = SDValue(CurDAG->getTargetNode(SP::WRYrr, dl, MVT::Flag, TopPart,
+ CurDAG->getRegister(SP::G0, MVT::i32)), 0);
+
+ // FIXME: Handle div by immediate.
+ unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
+ return CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS,
+ TopPart);
+ }
+ case ISD::MULHU:
+ case ISD::MULHS: {
+ // FIXME: Handle mul by immediate.
+ SDValue MulLHS = N->getOperand(0);
+ SDValue MulRHS = N->getOperand(1);
+ unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr;
+ SDNode *Mul = CurDAG->getTargetNode(Opcode, dl, MVT::i32, MVT::Flag,
+ MulLHS, MulRHS);
+ // The high part is in the Y register.
+ return CurDAG->SelectNodeTo(N, SP::RDY, MVT::i32, SDValue(Mul, 1));
+ }
+ }
+
+ return SelectCode(Op);
+}
+
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool
+SparcDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1;
+ switch (ConstraintCode) {
+ default: return true;
+ case 'm': // memory
+ if (!SelectADDRrr(Op, Op, Op0, Op1))
+ SelectADDRri(Op, Op, Op0, Op1);
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ return false;
+}
+
+/// createSparcISelDag - This pass converts a legalized DAG into a
+/// SPARC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createSparcISelDag(SparcTargetMachine &TM) {
+ return new SparcDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
new file mode 100644
index 0000000..3ec7e06
--- /dev/null
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -0,0 +1,1049 @@
+//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcISelLowering.h"
+#include "SparcTargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/VectorExtras.h"
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "SparcGenCallingConv.inc"
+
+static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
+ // CCValAssign - represent the assignment of the return value to locations.
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CC, isVarArg, DAG.getTarget(), RVLocs);
+
+  // Analyze return values of ISD::RET
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_Sparc32);
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // ISD::RET => ret chain, (regnum1,val1), ...
+ // So i*2+1 index only the regnums.
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ Op.getOperand(i*2+1), Flag);
+
+ // Guarantee that all emitted copies are stuck together with flags.
+ Flag = Chain.getValue(1);
+ }
+
+ if (Flag.getNode())
+ return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+ return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+/// LowerArguments - V8 uses a very simple ABI, where all values are passed in
+/// either one or two GPRs, including FP values. TODO: we should pass FP values
+/// in FP registers for fastcc functions.
+void
+SparcTargetLowering::LowerArguments(Function &F, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &ArgValues,
+ DebugLoc dl) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ static const unsigned ArgRegs[] = {
+ SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+ };
+
+ const unsigned *CurArgReg = ArgRegs, *ArgRegEnd = ArgRegs+6;
+ unsigned ArgOffset = 68;
+
+ SDValue Root = DAG.getRoot();
+ std::vector<SDValue> OutChains;
+
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+ MVT ObjectVT = getValueType(I->getType());
+
+ switch (ObjectVT.getSimpleVT()) {
+ default: assert(0 && "Unhandled argument type!");
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ if (I->use_empty()) { // Argument is dead.
+ if (CurArgReg < ArgRegEnd) ++CurArgReg;
+ ArgValues.push_back(DAG.getUNDEF(ObjectVT));
+ } else if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR
+ unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ MF.getRegInfo().addLiveIn(*CurArgReg++, VReg);
+ SDValue Arg = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ if (ObjectVT != MVT::i32) {
+ unsigned AssertOp = ISD::AssertSext;
+ Arg = DAG.getNode(AssertOp, dl, MVT::i32, Arg,
+ DAG.getValueType(ObjectVT));
+ Arg = DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, Arg);
+ }
+ ArgValues.push_back(Arg);
+ } else {
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
+ SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+ SDValue Load;
+ if (ObjectVT == MVT::i32) {
+ Load = DAG.getLoad(MVT::i32, dl, Root, FIPtr, NULL, 0);
+ } else {
+ ISD::LoadExtType LoadOp = ISD::SEXTLOAD;
+
+ // Sparc is big endian, so add an offset based on the ObjectVT.
+ unsigned Offset = 4-std::max(1U, ObjectVT.getSizeInBits()/8);
+ FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr,
+ DAG.getConstant(Offset, MVT::i32));
+ Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Root, FIPtr,
+ NULL, 0, ObjectVT);
+ Load = DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, Load);
+ }
+ ArgValues.push_back(Load);
+ }
+
+ ArgOffset += 4;
+ break;
+ case MVT::f32:
+ if (I->use_empty()) { // Argument is dead.
+ if (CurArgReg < ArgRegEnd) ++CurArgReg;
+ ArgValues.push_back(DAG.getUNDEF(ObjectVT));
+ } else if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR
+ // FP value is passed in an integer register.
+ unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ MF.getRegInfo().addLiveIn(*CurArgReg++, VReg);
+ SDValue Arg = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Arg);
+ ArgValues.push_back(Arg);
+ } else {
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
+ SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+ SDValue Load = DAG.getLoad(MVT::f32, dl, Root, FIPtr, NULL, 0);
+ ArgValues.push_back(Load);
+ }
+ ArgOffset += 4;
+ break;
+
+ case MVT::i64:
+ case MVT::f64:
+ if (I->use_empty()) { // Argument is dead.
+ if (CurArgReg < ArgRegEnd) ++CurArgReg;
+ if (CurArgReg < ArgRegEnd) ++CurArgReg;
+ ArgValues.push_back(DAG.getUNDEF(ObjectVT));
+ } else {
+ SDValue HiVal;
+ if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR
+ unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ MF.getRegInfo().addLiveIn(*CurArgReg++, VRegHi);
+ HiVal = DAG.getCopyFromReg(Root, dl, VRegHi, MVT::i32);
+ } else {
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
+ SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+ HiVal = DAG.getLoad(MVT::i32, dl, Root, FIPtr, NULL, 0);
+ }
+
+ SDValue LoVal;
+ if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR
+ unsigned VRegLo = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ MF.getRegInfo().addLiveIn(*CurArgReg++, VRegLo);
+ LoVal = DAG.getCopyFromReg(Root, dl, VRegLo, MVT::i32);
+ } else {
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset+4);
+ SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+ LoVal = DAG.getLoad(MVT::i32, dl, Root, FIPtr, NULL, 0);
+ }
+
+ // Compose the two halves together into an i64 unit.
+ SDValue WholeValue =
+ DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+
+ // If we want a double, do a bit convert.
+ if (ObjectVT == MVT::f64)
+ WholeValue = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, WholeValue);
+
+ ArgValues.push_back(WholeValue);
+ }
+ ArgOffset += 8;
+ break;
+ }
+ }
+
+ // Store remaining ArgRegs to the stack if this is a varargs function.
+ if (F.isVarArg()) {
+ // Remember the vararg offset for the va_start implementation.
+ VarArgsFrameOffset = ArgOffset;
+
+ for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
+ unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
+ SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);
+
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
+ SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+
+ OutChains.push_back(DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, NULL, 0));
+ ArgOffset += 4;
+ }
+ }
+
+ if (!OutChains.empty())
+ DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &OutChains[0], OutChains.size()));
+}
+
+static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ unsigned CallingConv = TheCall->getCallingConv();
+ SDValue Chain = TheCall->getChain();
+ SDValue Callee = TheCall->getCallee();
+ bool isVarArg = TheCall->isVarArg();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+#if 0
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallingConv, isVarArg, DAG.getTarget(), ArgLocs);
+ CCInfo.AnalyzeCallOperands(Op.getNode(), CC_Sparc32);
+
+ // Get the size of the outgoing arguments stack space requirement.
+ unsigned ArgsSize = CCInfo.getNextStackOffset();
+ // FIXME: We can't use this until f64 is known to take two GPRs.
+#else
+ (void)CC_Sparc32;
+
+ // Count the size of the outgoing arguments.
+ unsigned ArgsSize = 0;
+ for (unsigned i = 0, e = TheCall->getNumArgs(); i != e; ++i) {
+ switch (TheCall->getArg(i).getValueType().getSimpleVT()) {
+ default: assert(0 && "Unknown value type!");
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::f32:
+ ArgsSize += 4;
+ break;
+ case MVT::i64:
+ case MVT::f64:
+ ArgsSize += 8;
+ break;
+ }
+ }
+ if (ArgsSize > 4*6)
+ ArgsSize -= 4*6; // Space for first 6 arguments is prereserved.
+ else
+ ArgsSize = 0;
+#endif
+
+ // Keep stack frames 8-byte aligned.
+ ArgsSize = (ArgsSize+7) & ~7;
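+  // e.g. 7 words of outgoing args: ArgsSize = 28 - 24 = 4, rounded up to 8.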
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+#if 0
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+    // Arguments start after the first 5 operands of ISD::CALL
+ SDValue Arg = TheCall->getArg(i);
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+ break;
+ }
+
+ // Arguments that can be passed on register must be kept at
+ // RegsToPass vector
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ continue;
+ }
+
+ assert(VA.isMemLoc());
+
+ // Create a store off the stack pointer for this argument.
+ SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+ // FIXME: VERIFY THAT 68 IS RIGHT.
+ SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset()+68);
+ PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
+ MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+ }
+
+#else
+ static const unsigned ArgRegs[] = {
+ SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+ };
+ unsigned ArgOffset = 68;
+
+ for (unsigned i = 0, e = TheCall->getNumArgs(); i != e; ++i) {
+ SDValue Val = TheCall->getArg(i);
+ MVT ObjectVT = Val.getValueType();
+ SDValue ValToStore(0, 0);
+ unsigned ObjSize;
+ switch (ObjectVT.getSimpleVT()) {
+ default: assert(0 && "Unhandled argument type!");
+ case MVT::i32:
+ ObjSize = 4;
+
+ if (RegsToPass.size() >= 6) {
+ ValToStore = Val;
+ } else {
+ RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Val));
+ }
+ break;
+ case MVT::f32:
+ ObjSize = 4;
+ if (RegsToPass.size() >= 6) {
+ ValToStore = Val;
+ } else {
+ // Convert this to a FP value in an int reg.
+ Val = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Val);
+ RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Val));
+ }
+ break;
+ case MVT::f64: {
+ ObjSize = 8;
+ if (RegsToPass.size() >= 6) {
+ ValToStore = Val; // Whole thing is passed in memory.
+ break;
+ }
+
+ // Break into top and bottom parts by storing to the stack and loading
+ // out the parts as integers. Top part goes in a reg.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
+ Val, StackPtr, NULL, 0);
+ // Sparc is big-endian, so the high part comes first.
+ SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr, NULL, 0, 0);
+ // Increment the pointer to the other half.
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ DAG.getIntPtrConstant(4));
+ // Load the low part.
+ SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr, NULL, 0, 0);
+
+ RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Hi));
+
+ if (RegsToPass.size() >= 6) {
+ ValToStore = Lo;
+ ArgOffset += 4;
+ ObjSize = 4;
+ } else {
+ RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Lo));
+ }
+ break;
+ }
+ case MVT::i64: {
+ ObjSize = 8;
+ if (RegsToPass.size() >= 6) {
+ ValToStore = Val; // Whole thing is passed in memory.
+ break;
+ }
+
+ // Split the value into top and bottom part. Top part goes in a reg.
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Val,
+ DAG.getConstant(1, MVT::i32));
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Val,
+ DAG.getConstant(0, MVT::i32));
+ RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Hi));
+
+ if (RegsToPass.size() >= 6) {
+ ValToStore = Lo;
+ ArgOffset += 4;
+ ObjSize = 4;
+ } else {
+ RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Lo));
+ }
+ break;
+ }
+ }
+
+ if (ValToStore.getNode()) {
+ SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+ SDValue PtrOff = DAG.getConstant(ArgOffset, MVT::i32);
+ PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+ MemOpChains.push_back(DAG.getStore(Chain, dl, ValToStore,
+ PtrOff, NULL, 0));
+ }
+ ArgOffset += ObjSize;
+ }
+#endif
+
+  // Emit all stores, making sure they occur before any copies into physregs.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token
+ // chain and flag operands which copy the outgoing args into registers.
+ // The InFlag in necessary since all emited instructions must be
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ unsigned Reg = RegsToPass[i].first;
+ // Remap I0->I7 -> O0->O7.
+ if (Reg >= SP::I0 && Reg <= SP::I7)
+ Reg = Reg-SP::I0+SP::O0;
+
+ Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
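+  // (SPARC register windows: the calling convention names argument registers
+  // from the callee's point of view (%i0..%i5); on the caller's side of the
+  // window those are %o0..%o5, hence the I->O remapping above.)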
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+ std::vector<MVT> NodeTys;
+ NodeTys.push_back(MVT::Other); // Returns a chain
+ NodeTys.push_back(MVT::Flag); // Returns a flag for retval copy to use.
+ SDValue Ops[] = { Chain, Callee, InFlag };
+ Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops, InFlag.getNode() ? 3 : 2);
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState RVInfo(CallingConv, isVarArg, DAG.getTarget(), RVLocs);
+
+ RVInfo.AnalyzeCallResult(TheCall, RetCC_Sparc32);
+ SmallVector<SDValue, 8> ResultVals;
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ unsigned Reg = RVLocs[i].getLocReg();
+
+ // Remap I0->I7 -> O0->O7.
+ if (Reg >= SP::I0 && Reg <= SP::I7)
+ Reg = Reg-SP::I0+SP::O0;
+
+ Chain = DAG.getCopyFromReg(Chain, dl, Reg,
+ RVLocs[i].getValVT(), InFlag).getValue(1);
+ InFlag = Chain.getValue(2);
+ ResultVals.push_back(Chain.getValue(0));
+ }
+
+ ResultVals.push_back(Chain);
+
+ // Merge everything together with a MERGE_VALUES node.
+ return DAG.getNode(ISD::MERGE_VALUES, dl,
+ TheCall->getVTList(), &ResultVals[0],
+ ResultVals.size());
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+/// IntCondCCodeToICC - Convert a DAG integer condition code to a SPARC ICC
+/// condition.
+static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ISD::SETEQ: return SPCC::ICC_E;
+ case ISD::SETNE: return SPCC::ICC_NE;
+ case ISD::SETLT: return SPCC::ICC_L;
+ case ISD::SETGT: return SPCC::ICC_G;
+ case ISD::SETLE: return SPCC::ICC_LE;
+ case ISD::SETGE: return SPCC::ICC_GE;
+ case ISD::SETULT: return SPCC::ICC_CS;
+ case ISD::SETULE: return SPCC::ICC_LEU;
+ case ISD::SETUGT: return SPCC::ICC_GU;
+ case ISD::SETUGE: return SPCC::ICC_CC;
+ }
+}
+
+/// FPCondCCodeToFCC - Convert a DAG floating point condition code to a SPARC
+/// FCC condition.
+static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Unknown fp condition code!");
+ case ISD::SETEQ:
+ case ISD::SETOEQ: return SPCC::FCC_E;
+ case ISD::SETNE:
+ case ISD::SETUNE: return SPCC::FCC_NE;
+ case ISD::SETLT:
+ case ISD::SETOLT: return SPCC::FCC_L;
+ case ISD::SETGT:
+ case ISD::SETOGT: return SPCC::FCC_G;
+ case ISD::SETLE:
+ case ISD::SETOLE: return SPCC::FCC_LE;
+ case ISD::SETGE:
+ case ISD::SETOGE: return SPCC::FCC_GE;
+ case ISD::SETULT: return SPCC::FCC_UL;
+ case ISD::SETULE: return SPCC::FCC_ULE;
+ case ISD::SETUGT: return SPCC::FCC_UG;
+ case ISD::SETUGE: return SPCC::FCC_UGE;
+ case ISD::SETUO: return SPCC::FCC_U;
+ case ISD::SETO: return SPCC::FCC_O;
+ case ISD::SETONE: return SPCC::FCC_LG;
+ case ISD::SETUEQ: return SPCC::FCC_UE;
+ }
+}
+
+
+SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
+ : TargetLowering(TM) {
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, SP::IntRegsRegisterClass);
+ addRegisterClass(MVT::f32, SP::FPRegsRegisterClass);
+ addRegisterClass(MVT::f64, SP::DFPRegsRegisterClass);
+
+ // Turn FP extload into load/fextend
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ // Sparc doesn't have i1 sign extending load
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ // Turn FP truncstore into trunc + store.
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // Custom legalize GlobalAddress nodes into LO/HI parts.
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool , MVT::i32, Custom);
+
+ // Sparc doesn't have sext_inreg, replace them with shl/sra
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+ // Sparc has no REM or DIVREM operations.
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+ // Custom expand fp<->sint
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+ // Expand fp<->uint
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+ setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
+ setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
+
+ // Sparc has no select or setcc: expand to SELECT_CC.
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ setOperationAction(ISD::SETCC, MVT::i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f64, Expand);
+
+ // Sparc doesn't have BRCOND either, it has BR_CC.
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+ // SPARC has no intrinsics for these particular operations.
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FREM , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FREM , MVT::f32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+ setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+ setOperationAction(ISD::ROTL , MVT::i32, Expand);
+ setOperationAction(ISD::ROTR , MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW , MVT::f64, Expand);
+ setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+ // FIXME: Sparc provides these multiplies, but we don't have them yet.
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+ // We don't have line number support yet.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+ setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+ // RET must be custom lowered, to meet ABI requirements
+ setOperationAction(ISD::RET , MVT::Other, Custom);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ // VAARG needs to be lowered to not do unaligned accesses for doubles.
+ setOperationAction(ISD::VAARG , MVT::Other, Custom);
+
+ // Use the default implementation.
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
+
+ // No debug info support yet.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::DECLARE, MVT::Other, Expand);
+
+ setStackPointerRegisterToSaveRestore(SP::O6);
+
+ if (TM.getSubtarget<SparcSubtarget>().isV9())
+ setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+
+ computeRegisterProperties();
+}
+
+const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return 0;
+ case SPISD::CMPICC: return "SPISD::CMPICC";
+ case SPISD::CMPFCC: return "SPISD::CMPFCC";
+ case SPISD::BRICC: return "SPISD::BRICC";
+ case SPISD::BRFCC: return "SPISD::BRFCC";
+ case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
+ case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
+ case SPISD::Hi: return "SPISD::Hi";
+ case SPISD::Lo: return "SPISD::Lo";
+ case SPISD::FTOI: return "SPISD::FTOI";
+ case SPISD::ITOF: return "SPISD::ITOF";
+ case SPISD::CALL: return "SPISD::CALL";
+ case SPISD::RET_FLAG: return "SPISD::RET_FLAG";
+ }
+}
+
+/// computeMaskedBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one for the given target node, setting
+/// KnownZero/KnownOne accordingly. Op is expected to be a target-specific
+/// node. Used by the DAG combiner.
+void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ APInt KnownZero2, KnownOne2;
+ KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything.
+
+ switch (Op.getOpcode()) {
+ default: break;
+ case SPISD::SELECT_ICC:
+ case SPISD::SELECT_FCC:
+ DAG.ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne,
+ Depth+1);
+ DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2,
+ Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ break;
+ }
+}
+
+// Look at LHS/RHS/CC and see if they are a lowered setcc instruction. If so,
+// set LHS/RHS to the operands of the original setcc and SPCC to its condition.
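+// For example, the lowered form (select_icc 1, 0, cc, (cmpicc X, Y)) != 0 is
+// equivalent to the original "setcc X, Y, cc", so the branch or select can
+// compare X and Y directly.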
+static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
+ ISD::CondCode CC, unsigned &SPCC) {
+ if (isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+ CC == ISD::SETNE &&
+ ((LHS.getOpcode() == SPISD::SELECT_ICC &&
+ LHS.getOperand(3).getOpcode() == SPISD::CMPICC) ||
+ (LHS.getOpcode() == SPISD::SELECT_FCC &&
+ LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) &&
+ isa<ConstantSDNode>(LHS.getOperand(0)) &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ cast<ConstantSDNode>(LHS.getOperand(0))->getZExtValue() == 1 &&
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 0) {
+ SDValue CMPCC = LHS.getOperand(3);
+ SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+ LHS = CMPCC.getOperand(0);
+ RHS = CMPCC.getOperand(1);
+ }
+}
+
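+// A 32-bit absolute address is built as an SPISD::Hi/SPISD::Lo pair, which
+// the patterns in SparcInstrInfo.td select to roughly:
+//   sethi %hi(sym), %reg
+//   or    %reg, %lo(sym), %reg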
+static SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) {
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ // FIXME there isn't really any debug info here
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+ SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, GA);
+ SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, GA);
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+}
+
+static SDValue LowerCONSTANTPOOL(SDValue Op, SelectionDAG &DAG) {
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+ // FIXME there isn't really any debug info here
+ DebugLoc dl = Op.getDebugLoc();
+ Constant *C = N->getConstVal();
+ SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment());
+ SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, CP);
+ SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, CP);
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+}
+
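+// The f32/f64 -> i32 conversions stay entirely in the FP register file: FTOI
+// (fstoi/fdtoi) leaves the i32 bit pattern in an FP register, and the
+// BIT_CONVERT then moves those bits into an integer register (V8 has no
+// direct FP<->integer register move).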
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // Convert the fp value to integer in an FP register.
+ assert(Op.getValueType() == MVT::i32);
+ Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+}
+
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ assert(Op.getOperand(0).getValueType() == MVT::i32);
+ SDValue Tmp = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Op.getOperand(0));
+ // Convert the int value to FP in an FP register.
+ return DAG.getNode(SPISD::ITOF, dl, Op.getValueType(), Tmp);
+}
+
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Opc, SPCC = ~0U;
+
+ // If this is a br_cc of a "setcc", and if the setcc got lowered into
+ // a CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+ LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+ // Get the condition flag.
+ SDValue CompareFlag;
+ if (LHS.getValueType() == MVT::i32) {
+ std::vector<MVT> VTs;
+ VTs.push_back(MVT::i32);
+ VTs.push_back(MVT::Flag);
+ SDValue Ops[2] = { LHS, RHS };
+ CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+ if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+ Opc = SPISD::BRICC;
+ } else {
+ CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Flag, LHS, RHS);
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ Opc = SPISD::BRFCC;
+ }
+ return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
+ DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+}
+
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Opc, SPCC = ~0U;
+
+ // If this is a select_cc of a "setcc", and if the setcc got lowered into
+ // a CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+ LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+ SDValue CompareFlag;
+ if (LHS.getValueType() == MVT::i32) {
+ std::vector<MVT> VTs;
+ VTs.push_back(LHS.getValueType()); // subcc returns a value
+ VTs.push_back(MVT::Flag);
+ SDValue Ops[2] = { LHS, RHS };
+ CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+ Opc = SPISD::SELECT_ICC;
+ if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+ } else {
+ CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Flag, LHS, RHS);
+ Opc = SPISD::SELECT_FCC;
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ }
+ return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
+ DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+}
+
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+ SparcTargetLowering &TLI) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Offset = DAG.getNode(ISD::ADD, dl, MVT::i32,
+ DAG.getRegister(SP::I6, MVT::i32),
+ DAG.getConstant(TLI.getVarArgsFrameOffset(),
+ MVT::i32));
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), dl, Offset, Op.getOperand(1), SV, 0);
+}
+
+static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ MVT VT = Node->getValueType(0);
+ SDValue InChain = Node->getOperand(0);
+ SDValue VAListPtr = Node->getOperand(1);
+ const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ DebugLoc dl = Node->getDebugLoc();
+ SDValue VAList = DAG.getLoad(MVT::i32, dl, InChain, VAListPtr, SV, 0);
+ // Increment the pointer, VAList, to the next vaarg
+ SDValue NextPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, VAList,
+ DAG.getConstant(VT.getSizeInBits()/8,
+ MVT::i32));
+ // Store the incremented VAList to the legalized pointer
+ InChain = DAG.getStore(VAList.getValue(1), dl, NextPtr,
+ VAListPtr, SV, 0);
+ // Load the actual argument out of the pointer VAList, unless this is an
+ // f64 load.
+ if (VT != MVT::f64)
+ return DAG.getLoad(VT, dl, InChain, VAList, NULL, 0);
+
+ // Otherwise, load it as i64, then do a bitconvert.
+ SDValue V = DAG.getLoad(MVT::i64, dl, InChain, VAList, NULL, 0);
+
+ // Bit-Convert the value to f64.
+ SDValue Ops[2] = {
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, V),
+ V.getValue(1)
+ };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0); // Legalize the chain.
+ SDValue Size = Op.getOperand(1); // Legalize the size.
+ DebugLoc dl = Op.getDebugLoc();
+
+ unsigned SPReg = SP::O6;
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
+ SDValue NewSP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, Size); // Value
+ Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP); // Output chain
+
+ // The resultant pointer is actually 96 bytes above the new stack pointer,
+ // keeping the bottom of the frame free for the ABI-mandated register window
+ // spill area.
+ SDValue NewVal = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
+ DAG.getConstant(96, MVT::i32));
+ SDValue Ops[2] = { NewVal, Chain };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+
+SDValue SparcTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Should not custom lower this!");
+ // Frame & Return address. Currently unimplemented
+ case ISD::RETURNADDR: return SDValue();
+ case ISD::FRAMEADDR: return SDValue();
+ case ISD::GlobalTLSAddress:
+ assert(0 && "TLS not implemented for Sparc.");
+ case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG);
+ case ISD::ConstantPool: return LowerCONSTANTPOOL(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG, *this);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::CALL: return LowerCALL(Op, DAG);
+ case ISD::RET: return LowerRET(Op, DAG);
+ }
+}
+
+MachineBasicBlock *
+SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+ unsigned BROpcode;
+ unsigned CC;
+ DebugLoc dl = MI->getDebugLoc();
+ // Figure out the conditional branch opcode to use for this select_cc.
+ switch (MI->getOpcode()) {
+ default: assert(0 && "Unknown SELECT_CC!");
+ case SP::SELECT_CC_Int_ICC:
+ case SP::SELECT_CC_FP_ICC:
+ case SP::SELECT_CC_DFP_ICC:
+ BROpcode = SP::BCOND;
+ break;
+ case SP::SELECT_CC_Int_FCC:
+ case SP::SELECT_CC_FP_FCC:
+ case SP::SELECT_CC_DFP_FCC:
+ BROpcode = SP::FBCOND;
+ break;
+ }
+
+ CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
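+ //
+ //          thisMBB
+ //   (bCC) /      \ (fallthrough)
+ //        |     copy0MBB    ; computes FalseValue
+ //         \      /
+ //          sinkMBB         ; phi merges TrueValue and FalseValue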
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // [f]bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ BuildMI(BB, dl, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ sinkMBB->transferSuccessors(BB);
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, dl, TII.get(SP::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Sparc Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+SparcTargetLowering::ConstraintType
+SparcTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': return C_RegisterClass;
+ }
+ }
+
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ return std::make_pair(0U, SP::IntRegsRegisterClass);
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+std::vector<unsigned> SparcTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() != 1)
+ return std::vector<unsigned>();
+
+ switch (Constraint[0]) {
+ default: break;
+ case 'r':
+ return make_vector<unsigned>(SP::L0, SP::L1, SP::L2, SP::L3,
+ SP::L4, SP::L5, SP::L6, SP::L7,
+ SP::I0, SP::I1, SP::I2, SP::I3,
+ SP::I4, SP::I5,
+ SP::O0, SP::O1, SP::O2, SP::O3,
+ SP::O4, SP::O5, SP::O7, 0);
+ }
+
+ return std::vector<unsigned>();
+}
+
+bool
+SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The Sparc target isn't yet aware of offsets.
+ return false;
+}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
new file mode 100644
index 0000000..fe6811f
--- /dev/null
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -0,0 +1,79 @@
+//===-- SparcISelLowering.h - Sparc DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_ISELLOWERING_H
+#define SPARC_ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "Sparc.h"
+
+namespace llvm {
+ namespace SPISD {
+ enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ CMPICC, // Compare two GPR operands, set icc.
+ CMPFCC, // Compare two FP operands, set fcc.
+ BRICC, // Branch to dest on icc condition.
+ BRFCC, // Branch to dest on fcc condition.
+ SELECT_ICC, // Select between two values using the current ICC flags.
+ SELECT_FCC, // Select between two values using the current FCC flags.
+
+ Hi, Lo, // Hi/Lo operations, typically on a global address.
+
+ FTOI, // FP to Int within a FP register.
+ ITOF, // Int to FP within a FP register.
+
+ CALL, // A call instruction.
+ RET_FLAG // Return with a flag operand.
+ };
+ }
+
+ class SparcTargetLowering : public TargetLowering {
+ int VarArgsFrameOffset; // Frame offset to start of varargs area.
+ public:
+ SparcTargetLowering(TargetMachine &TM);
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+
+ /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+ /// in Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ virtual void LowerArguments(Function &F, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &ArgValues,
+ DebugLoc dl);
+ virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+ };
+} // end namespace llvm
+
+#endif // SPARC_ISELLOWERING_H
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
new file mode 100644
index 0000000..6535259
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -0,0 +1,114 @@
+//===- SparcInstrFormats.td - Sparc Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction {
+ field bits<32> Inst;
+
+ let Namespace = "SP";
+
+ bits<2> op;
+ let Inst{31-30} = op; // Top two bits are the 'op' field
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #2 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+// Format 2 instructions
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSP<outs, ins, asmstr, pattern> {
+ bits<3> op2;
+ bits<22> imm22;
+ let op = 0; // op = 0
+ let Inst{24-22} = op2;
+ let Inst{21-0} = imm22;
+}
+
+// Specific F2 classes: SparcV8 manual, page 44
+//
+class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : F2<outs, ins, asmstr, pattern> {
+ bits<5> rd;
+
+ let op2 = op2Val;
+
+ let Inst{29-25} = rd;
+}
+
+class F2_2<bits<4> condVal, bits<3> op2Val, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
+ bits<4> cond;
+ bit annul = 0; // currently unused
+
+ let cond = condVal;
+ let op2 = op2Val;
+
+ let Inst{29} = annul;
+ let Inst{28-25} = cond;
+}
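+
+// Overall F2 bit layout: op(31-30) = 0, then rd(29-25) for F2_1 or
+// annul(29) + cond(28-25) for F2_2, op2(24-22), imm22(21-0).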
+
+//===----------------------------------------------------------------------===//
+// Format #3 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+class F3<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSP<outs, ins, asmstr, pattern> {
+ bits<5> rd;
+ bits<6> op3;
+ bits<5> rs1;
+ let op{1} = 1; // Op = 2 or 3
+ let Inst{29-25} = rd;
+ let Inst{24-19} = op3;
+ let Inst{18-14} = rs1;
+}
+
+// Specific F3 classes: SparcV8 manual, page 44
+//
+class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+ string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ bits<8> asi = 0; // asi not currently used
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13} = 0; // i field = 0
+ let Inst{12-5} = asi; // address space identifier
+ let Inst{4-0} = rs2;
+}
+
+class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+ string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ bits<13> simm13;
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13} = 1; // i field = 1
+ let Inst{12-0} = simm13;
+}
+
+// floating-point
+class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+ string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+
+ let Inst{13-5} = opfval; // fp opcode
+ let Inst{4-0} = rs2;
+}
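+
+// Overall F3 bit layout: op(31-30) = 2 or 3, rd(29-25), op3(24-19),
+// rs1(18-14), then i(13) = 0 with asi(12-5) and rs2(4-0) for F3_1,
+// i(13) = 1 with simm13(12-0) for F3_2, or opf(13-5) and rs2(4-0) for F3_3.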
+
+
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
new file mode 100644
index 0000000..d2f6b9b
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -0,0 +1,277 @@
+//===- SparcInstrInfo.cpp - Sparc Instruction Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstrInfo.h"
+#include "SparcSubtarget.h"
+#include "Sparc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "SparcGenInstrInfo.inc"
+using namespace llvm;
+
+SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
+ : TargetInstrInfoImpl(SparcInsts, array_lengthof(SparcInsts)),
+ RI(ST, *this), Subtarget(ST) {
+}
+
+static bool isZeroImm(const MachineOperand &op) {
+ return op.isImm() && op.getImm() == 0;
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+///
+bool SparcInstrInfo::isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSR, unsigned &DstSR) const {
+ SrcSR = DstSR = 0; // No sub-registers.
+
+ // We look for 3 kinds of patterns here:
+ // or with G0 or 0
+ // add with G0 or 0
+ // fmovs or FpMOVD (pseudo double move).
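+ // e.g. "or %g0, %src, %dst", "add %src, %g0, %dst", "or %src, 0, %dst" and
+ // "fmovs %src, %dst" are all register-to-register moves.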
+ if (MI.getOpcode() == SP::ORrr || MI.getOpcode() == SP::ADDrr) {
+ if (MI.getOperand(1).getReg() == SP::G0) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(2).getReg();
+ return true;
+ } else if (MI.getOperand(2).getReg() == SP::G0) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ }
+ } else if ((MI.getOpcode() == SP::ORri || MI.getOpcode() == SP::ADDri) &&
+ isZeroImm(MI.getOperand(2)) && MI.getOperand(1).isReg()) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ } else if (MI.getOpcode() == SP::FMOVS || MI.getOpcode() == SP::FpMOVD ||
+ MI.getOpcode() == SP::FMOVD) {
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ return false;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (MI->getOpcode() == SP::LDri ||
+ MI->getOpcode() == SP::LDFri ||
+ MI->getOpcode() == SP::LDDFri) {
+ if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ }
+ return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stack slot stored to. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (MI->getOpcode() == SP::STri ||
+ MI->getOpcode() == SP::STFri ||
+ MI->getOpcode() == SP::STDFri) {
+ if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
+ MI->getOperand(1).getImm() == 0) {
+ FrameIndex = MI->getOperand(0).getIndex();
+ return MI->getOperand(2).getReg();
+ }
+ }
+ return 0;
+}
+
+unsigned
+SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond)const{
+ // FIXME this should probably take a DebugLoc argument
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Can only insert uncond branches so far.
+ assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!");
+ BuildMI(&MBB, dl, get(SP::BA)).addMBB(TBB);
+ return 1;
+}
+
+bool SparcInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ if (DestRC != SrcRC) {
+ // Not yet supported!
+ return false;
+ }
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (DestRC == SP::IntRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0).addReg(SrcReg);
+ else if (DestRC == SP::FPRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg).addReg(SrcReg);
+ else if (DestRC == SP::DFPRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(Subtarget.isV9() ? SP::FMOVD : SP::FpMOVD),DestReg)
+ .addReg(SrcReg);
+ else
+ // Can't copy this register
+ return false;
+
+ return true;
+}
+
+void SparcInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
+ if (RC == SP::IntRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill));
+ else if (RC == SP::FPRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill));
+ else if (RC == SP::DFPRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill));
+ else
+ assert(0 && "Can't store this register to stack slot");
+}
+
+void SparcInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Opc = 0;
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (RC == SP::IntRegsRegisterClass)
+ Opc = SP::STri;
+ else if (RC == SP::FPRegsRegisterClass)
+ Opc = SP::STFri;
+ else if (RC == SP::DFPRegsRegisterClass)
+ Opc = SP::STDFri;
+ else
+ assert(0 && "Can't load this register");
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ MIB.addReg(SrcReg, getKillRegState(isKill));
+ NewMIs.push_back(MIB);
+ return;
+}
+
+void SparcInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (RC == SP::IntRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0);
+ else if (RC == SP::FPRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0);
+ else if (RC == SP::DFPRegsRegisterClass)
+ BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0);
+ else
+ assert(0 && "Can't load this register from stack slot");
+}
+
+void SparcInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ unsigned Opc = 0;
+ if (RC == SP::IntRegsRegisterClass)
+ Opc = SP::LDri;
+ else if (RC == SP::FPRegsRegisterClass)
+ Opc = SP::LDFri;
+ else if (RC == SP::DFPRegsRegisterClass)
+ Opc = SP::LDDFri;
+ else
+ assert(0 && "Can't load this register");
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+ return;
+}
+
+MachineInstr *SparcInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FI) const {
+ if (Ops.size() != 1) return NULL;
+
+ unsigned OpNum = Ops[0];
+ bool isFloat = false;
+ MachineInstr *NewMI = NULL;
+ switch (MI->getOpcode()) {
+ case SP::ORrr:
+ if (MI->getOperand(1).isReg() && MI->getOperand(1).getReg() == SP::G0 &&
+ MI->getOperand(0).isReg() && MI->getOperand(2).isReg()) {
+ if (OpNum == 0) // COPY -> STORE
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(SP::STri))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addReg(MI->getOperand(2).getReg());
+ else // COPY -> LOAD
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(SP::LDri),
+ MI->getOperand(0).getReg())
+ .addFrameIndex(FI)
+ .addImm(0);
+ }
+ break;
+ case SP::FMOVS:
+ isFloat = true;
+ // FALLTHROUGH
+ case SP::FMOVD:
+ if (OpNum == 0) { // COPY -> STORE
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ NewMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isFloat ? SP::STFri : SP::STDFri))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill));
+ } else { // COPY -> LOAD
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ NewMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isFloat ? SP::LDFri : SP::LDDFri))
+ .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
+ .addFrameIndex(FI)
+ .addImm(0);
+ }
+ break;
+ }
+
+ return NewMI;
+}
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
new file mode 100644
index 0000000..ab661b9
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -0,0 +1,114 @@
+//===- SparcInstrInfo.h - Sparc Instruction Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCINSTRUCTIONINFO_H
+#define SPARCINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "SparcRegisterInfo.h"
+
+namespace llvm {
+
+/// SPII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace SPII {
+ enum {
+ Pseudo = (1<<0),
+ Load = (1<<1),
+ Store = (1<<2),
+ DelaySlot = (1<<3)
+ };
+}
+
+class SparcInstrInfo : public TargetInstrInfoImpl {
+ const SparcRegisterInfo RI;
+ const SparcSubtarget& Subtarget;
+public:
+ explicit SparcInstrInfo(SparcSubtarget &ST);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const SparcRegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+ /// the source reg along with the FrameIndex of the stack slot stored to. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than storing to the stack slot.
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ return 0;
+ }
+};
+
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
new file mode 100644
index 0000000..2d6c920
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -0,0 +1,769 @@
+//===- SparcInstrInfo.td - Target Description for Sparc Target ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Sparc instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "SparcInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// HasV9 - This predicate is true when the target processor supports V9
+// instructions. Note that the machine may be running in 32-bit mode.
+def HasV9 : Predicate<"Subtarget.isV9()">;
+
+// HasNoV9 - This predicate is true when the target doesn't have V9
+// instructions. Use of this is just a hack for the isel not having proper
+// costs for V8 instructions that are more expensive than their V9 ones.
+def HasNoV9 : Predicate<"!Subtarget.isV9()">;
+
+// HasVIS - This is true when the target processor has VIS extensions.
+def HasVIS : Predicate<"Subtarget.isVIS()">;
+
+// UseDeprecatedInsts - This predicate is true when the target processor is a
+// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
+// to use when appropriate. In either of these cases, the instruction selector
+// will pick deprecated instructions.
+def UseDeprecatedInsts : Predicate<"Subtarget.useDeprecatedV8Instructions()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def simm11 : PatLeaf<(imm), [{
+ // simm11 predicate - True if the imm fits in an 11-bit sign extended field.
+ return (((int)N->getZExtValue() << (32-11)) >> (32-11)) ==
+ (int)N->getZExtValue();
+}]>;
+
+def simm13 : PatLeaf<(imm), [{
+ // simm13 predicate - True if the imm fits in a 13-bit sign extended field.
+ return (((int)N->getZExtValue() << (32-13)) >> (32-13)) ==
+ (int)N->getZExtValue();
+}]>;
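+// For example, 4095 and -4096 survive the sign-extension round trip and are
+// accepted; 4096 is rejected.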
+
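+// Transformation function: get the low 10 bits of the immediate.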
+def LO10 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((unsigned)N->getZExtValue() & 1023,
+ MVT::i32);
+}]>;
+
+def HI22 : SDNodeXForm<imm, [{
+ // Transformation function: shift the immediate value down into the low bits.
+ return CurDAG->getTargetConstant((unsigned)N->getZExtValue() >> 10, MVT::i32);
+}]>;
+
+def SETHIimm : PatLeaf<(imm), [{
+ return (((unsigned)N->getZExtValue() >> 10) << 10) ==
+ (unsigned)N->getZExtValue();
+}], HI22>;
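+// i.e. immediates whose low 10 bits are clear, which a single SETHI can
+// materialize.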
+
+// Addressing modes.
+def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], []>;
+
+// Address operands
+def MEMrr : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops IntRegs, IntRegs);
+}
+def MEMri : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops IntRegs, i32imm);
+}
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+def calltarget : Operand<i32>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+ def CCOp : Operand<i32>;
+
+def SDTSPcmpfcc :
+SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPbrcc :
+SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+def SDTSPselectcc :
+SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
+def SDTSPFTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDTSPITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTIntBinOp, [SDNPOutFlag]>;
+def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutFlag]>;
+def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInFlag]>;
+def SPbrfcc : SDNode<"SPISD::BRFCC", SDTSPbrcc, [SDNPHasChain, SDNPInFlag]>;
+
+def SPhi : SDNode<"SPISD::Hi", SDTIntUnaryOp>;
+def SPlo : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
+
+def SPftoi : SDNode<"SPISD::FTOI", SDTSPFTOI>;
+def SPitof : SDNode<"SPISD::ITOF", SDTSPITOF>;
+
+def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInFlag]>;
+def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInFlag]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPCallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def SDT_SPCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call : SDNode<"SPISD::CALL", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def retflag : SDNode<"SPISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+//===----------------------------------------------------------------------===//
+// SPARC Flag Conditions
+//===----------------------------------------------------------------------===//
+
+// Note that these values must be kept in sync with the SPCC::CondCodes enum
+// values.
+class ICC_VAL<int N> : PatLeaf<(i32 N)>;
+def ICC_NE : ICC_VAL< 9>; // Not Equal
+def ICC_E : ICC_VAL< 1>; // Equal
+def ICC_G : ICC_VAL<10>; // Greater
+def ICC_LE : ICC_VAL< 2>; // Less or Equal
+def ICC_GE : ICC_VAL<11>; // Greater or Equal
+def ICC_L : ICC_VAL< 3>; // Less
+def ICC_GU : ICC_VAL<12>; // Greater Unsigned
+def ICC_LEU : ICC_VAL< 4>; // Less or Equal Unsigned
+def ICC_CC : ICC_VAL<13>; // Carry Clear/Great or Equal Unsigned
+def ICC_CS : ICC_VAL< 5>; // Carry Set/Less Unsigned
+def ICC_POS : ICC_VAL<14>; // Positive
+def ICC_NEG : ICC_VAL< 6>; // Negative
+def ICC_VC : ICC_VAL<15>; // Overflow Clear
+def ICC_VS : ICC_VAL< 7>; // Overflow Set
+
+class FCC_VAL<int N> : PatLeaf<(i32 N)>;
+def FCC_U : FCC_VAL<23>; // Unordered
+def FCC_G : FCC_VAL<22>; // Greater
+def FCC_UG : FCC_VAL<21>; // Unordered or Greater
+def FCC_L : FCC_VAL<20>; // Less
+def FCC_UL : FCC_VAL<19>; // Unordered or Less
+def FCC_LG : FCC_VAL<18>; // Less or Greater
+def FCC_NE : FCC_VAL<17>; // Not Equal
+def FCC_E : FCC_VAL<25>; // Equal
+def FCC_UE : FCC_VAL<26>; // Unordered or Equal
+def FCC_GE : FCC_VAL<27>; // Greater or Equal
+def FCC_UGE : FCC_VAL<28>; // Unordered or Greater or Equal
+def FCC_LE : FCC_VAL<29>; // Less or Equal
+def FCC_ULE : FCC_VAL<30>; // Unordered or Less or Equal
+def FCC_O : FCC_VAL<31>; // Ordered
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
+multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
+ def rr : F3_1<2, Op3Val,
+ (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+ !strconcat(OpcStr, " $b, $c, $dst"),
+ [(set IntRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>;
+ def ri : F3_2<2, Op3Val,
+ (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $b, $c, $dst"),
+ [(set IntRegs:$dst, (OpNode IntRegs:$b, simm13:$c))]>;
+}
+
+/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
+/// pattern.
+multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
+ def rr : F3_1<2, Op3Val,
+ (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+ !strconcat(OpcStr, " $b, $c, $dst"), []>;
+ def ri : F3_2<2, Op3Val,
+ (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $b, $c, $dst"), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSP<outs, ins, asmstr, pattern>;
+
+let Defs = [O6], Uses = [O6] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ "!ADJCALLSTACKDOWN $amt",
+ [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "!ADJCALLSTACKUP $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+// FpMOVD/FpNEGD/FpABSD - These are lowered to single-precision ops by the
+// fpmover pass.
+let Predicates = [HasNoV9] in { // Only emit these in V8 mode.
+ def FpMOVD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "!FpMOVD $src, $dst", []>;
+ def FpNEGD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "!FpNEGD $src, $dst",
+ [(set DFPRegs:$dst, (fneg DFPRegs:$src))]>;
+ def FpABSD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "!FpABSD $src, $dst",
+ [(set DFPRegs:$dst, (fabs DFPRegs:$src))]>;
+}
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded by the
+// scheduler into a branch sequence. This has to handle all permutations of
+// selection between i32/f32/f64 on ICC and FCC.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+ def SELECT_CC_Int_ICC
+ : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_Int_ICC PSEUDO!",
+ [(set IntRegs:$dst, (SPselecticc IntRegs:$T, IntRegs:$F,
+ imm:$Cond))]>;
+ def SELECT_CC_Int_FCC
+ : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_Int_FCC PSEUDO!",
+ [(set IntRegs:$dst, (SPselectfcc IntRegs:$T, IntRegs:$F,
+ imm:$Cond))]>;
+ def SELECT_CC_FP_ICC
+ : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_FP_ICC PSEUDO!",
+ [(set FPRegs:$dst, (SPselecticc FPRegs:$T, FPRegs:$F,
+ imm:$Cond))]>;
+ def SELECT_CC_FP_FCC
+ : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_FP_FCC PSEUDO!",
+ [(set FPRegs:$dst, (SPselectfcc FPRegs:$T, FPRegs:$F,
+ imm:$Cond))]>;
+ def SELECT_CC_DFP_ICC
+ : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_DFP_ICC PSEUDO!",
+ [(set DFPRegs:$dst, (SPselecticc DFPRegs:$T, DFPRegs:$F,
+ imm:$Cond))]>;
+ def SELECT_CC_DFP_FCC
+ : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_DFP_FCC PSEUDO!",
+ [(set DFPRegs:$dst, (SPselectfcc DFPRegs:$T, DFPRegs:$F,
+ imm:$Cond))]>;
+}
+
+
+// Section A.3 - Synthetic Instructions, p. 85
+// special cases of JMPL:
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1 in {
+ // retl is the synthetic "jmpl %o7+8, %g0": the link result is discarded in
+ // %g0 (rd) and the return address register %o7 is the base (rs1).
+ let rd = G0.Num, rs1 = O7.Num, simm13 = 8 in
+ def RETL: F3_2<2, 0b111000, (outs), (ins), "retl", [(retflag)]>;
+}
+
+// Section B.1 - Load Integer Instructions, p. 90
+def LDSBrr : F3_1<3, 0b001001,
+ (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "ldsb [$addr], $dst",
+ [(set IntRegs:$dst, (sextloadi8 ADDRrr:$addr))]>;
+def LDSBri : F3_2<3, 0b001001,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "ldsb [$addr], $dst",
+ [(set IntRegs:$dst, (sextloadi8 ADDRri:$addr))]>;
+def LDSHrr : F3_1<3, 0b001010,
+ (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "ldsh [$addr], $dst",
+ [(set IntRegs:$dst, (sextloadi16 ADDRrr:$addr))]>;
+def LDSHri : F3_2<3, 0b001010,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "ldsh [$addr], $dst",
+ [(set IntRegs:$dst, (sextloadi16 ADDRri:$addr))]>;
+def LDUBrr : F3_1<3, 0b000001,
+ (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "ldub [$addr], $dst",
+ [(set IntRegs:$dst, (zextloadi8 ADDRrr:$addr))]>;
+def LDUBri : F3_2<3, 0b000001,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "ldub [$addr], $dst",
+ [(set IntRegs:$dst, (zextloadi8 ADDRri:$addr))]>;
+def LDUHrr : F3_1<3, 0b000010,
+ (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "lduh [$addr], $dst",
+ [(set IntRegs:$dst, (zextloadi16 ADDRrr:$addr))]>;
+def LDUHri : F3_2<3, 0b000010,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "lduh [$addr], $dst",
+ [(set IntRegs:$dst, (zextloadi16 ADDRri:$addr))]>;
+def LDrr : F3_1<3, 0b000000,
+ (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "ld [$addr], $dst",
+ [(set IntRegs:$dst, (load ADDRrr:$addr))]>;
+def LDri : F3_2<3, 0b000000,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "ld [$addr], $dst",
+ [(set IntRegs:$dst, (load ADDRri:$addr))]>;
+
+// Section B.2 - Load Floating-point Instructions, p. 92
+def LDFrr : F3_1<3, 0b100000,
+ (outs FPRegs:$dst), (ins MEMrr:$addr),
+ "ld [$addr], $dst",
+ [(set FPRegs:$dst, (load ADDRrr:$addr))]>;
+def LDFri : F3_2<3, 0b100000,
+ (outs FPRegs:$dst), (ins MEMri:$addr),
+ "ld [$addr], $dst",
+ [(set FPRegs:$dst, (load ADDRri:$addr))]>;
+def LDDFrr : F3_1<3, 0b100011,
+ (outs DFPRegs:$dst), (ins MEMrr:$addr),
+ "ldd [$addr], $dst",
+ [(set DFPRegs:$dst, (load ADDRrr:$addr))]>;
+def LDDFri : F3_2<3, 0b100011,
+ (outs DFPRegs:$dst), (ins MEMri:$addr),
+ "ldd [$addr], $dst",
+ [(set DFPRegs:$dst, (load ADDRri:$addr))]>;
+
+// Section B.4 - Store Integer Instructions, p. 95
+def STBrr : F3_1<3, 0b000101,
+ (outs), (ins MEMrr:$addr, IntRegs:$src),
+ "stb $src, [$addr]",
+ [(truncstorei8 IntRegs:$src, ADDRrr:$addr)]>;
+def STBri : F3_2<3, 0b000101,
+ (outs), (ins MEMri:$addr, IntRegs:$src),
+ "stb $src, [$addr]",
+ [(truncstorei8 IntRegs:$src, ADDRri:$addr)]>;
+def STHrr : F3_1<3, 0b000110,
+ (outs), (ins MEMrr:$addr, IntRegs:$src),
+ "sth $src, [$addr]",
+ [(truncstorei16 IntRegs:$src, ADDRrr:$addr)]>;
+def STHri : F3_2<3, 0b000110,
+ (outs), (ins MEMri:$addr, IntRegs:$src),
+ "sth $src, [$addr]",
+ [(truncstorei16 IntRegs:$src, ADDRri:$addr)]>;
+def STrr : F3_1<3, 0b000100,
+ (outs), (ins MEMrr:$addr, IntRegs:$src),
+ "st $src, [$addr]",
+ [(store IntRegs:$src, ADDRrr:$addr)]>;
+def STri : F3_2<3, 0b000100,
+ (outs), (ins MEMri:$addr, IntRegs:$src),
+ "st $src, [$addr]",
+ [(store IntRegs:$src, ADDRri:$addr)]>;
+
+// Section B.5 - Store Floating-point Instructions, p. 97
+def STFrr : F3_1<3, 0b100100,
+ (outs), (ins MEMrr:$addr, FPRegs:$src),
+ "st $src, [$addr]",
+ [(store FPRegs:$src, ADDRrr:$addr)]>;
+def STFri : F3_2<3, 0b100100,
+ (outs), (ins MEMri:$addr, FPRegs:$src),
+ "st $src, [$addr]",
+ [(store FPRegs:$src, ADDRri:$addr)]>;
+def STDFrr : F3_1<3, 0b100111,
+ (outs), (ins MEMrr:$addr, DFPRegs:$src),
+ "std $src, [$addr]",
+ [(store DFPRegs:$src, ADDRrr:$addr)]>;
+def STDFri : F3_2<3, 0b100111,
+ (outs), (ins MEMri:$addr, DFPRegs:$src),
+ "std $src, [$addr]",
+ [(store DFPRegs:$src, ADDRri:$addr)]>;
+
+// Section B.9 - SETHI Instruction, p. 104
+def SETHIi: F2_1<0b100,
+ (outs IntRegs:$dst), (ins i32imm:$src),
+ "sethi $src, $dst",
+ [(set IntRegs:$dst, SETHIimm:$src)]>;
+
+// Section B.10 - NOP Instruction, p. 105
+// (It's a special case of SETHI)
+let rd = 0, imm22 = 0 in
+ def NOP : F2_1<0b100, (outs), (ins), "nop", []>;
+
+// Section B.11 - Logical Instructions, p. 106
+defm AND : F3_12<"and", 0b000001, and>;
+
+def ANDNrr : F3_1<2, 0b000101,
+ (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+ "andn $b, $c, $dst",
+ [(set IntRegs:$dst, (and IntRegs:$b, (not IntRegs:$c)))]>;
+def ANDNri : F3_2<2, 0b000101,
+ (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+ "andn $b, $c, $dst", []>;
+
+defm OR : F3_12<"or", 0b000010, or>;
+
+def ORNrr : F3_1<2, 0b000110,
+ (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+ "orn $b, $c, $dst",
+ [(set IntRegs:$dst, (or IntRegs:$b, (not IntRegs:$c)))]>;
+def ORNri : F3_2<2, 0b000110,
+ (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+ "orn $b, $c, $dst", []>;
+defm XOR : F3_12<"xor", 0b000011, xor>;
+
+def XNORrr : F3_1<2, 0b000111,
+ (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+ "xnor $b, $c, $dst",
+ [(set IntRegs:$dst, (not (xor IntRegs:$b, IntRegs:$c)))]>;
+def XNORri : F3_2<2, 0b000111,
+ (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+ "xnor $b, $c, $dst", []>;
+
+// Section B.12 - Shift Instructions, p. 107
+defm SLL : F3_12<"sll", 0b100101, shl>;
+defm SRL : F3_12<"srl", 0b100110, srl>;
+defm SRA : F3_12<"sra", 0b100111, sra>;
+
+// Section B.13 - Add Instructions, p. 108
+defm ADD : F3_12<"add", 0b000000, add>;
+
+// "LEA" forms of add (patterns to make tblgen happy)
+def LEA_ADDri : F3_2<2, 0b000000,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "add ${addr:arith}, $dst",
+ [(set IntRegs:$dst, ADDRri:$addr)]>;
+
+defm ADDCC : F3_12<"addcc", 0b010000, addc>;
+defm ADDX : F3_12<"addx", 0b001000, adde>;
+
+// Section B.15 - Subtract Instructions, p. 110
+defm SUB : F3_12 <"sub" , 0b000100, sub>;
+defm SUBX : F3_12 <"subx" , 0b001100, sube>;
+defm SUBCC : F3_12 <"subcc", 0b010100, SPcmpicc>;
+
+def SUBXCCrr: F3_1<2, 0b011100,
+ (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+ "subxcc $b, $c, $dst", []>;
+
+// Section B.18 - Multiply Instructions, p. 113
+defm UMUL : F3_12np<"umul", 0b001010>;
+defm SMUL : F3_12 <"smul", 0b001011, mul>;
+
+
+// Section B.19 - Divide Instructions, p. 115
+defm UDIV : F3_12np<"udiv", 0b001110>;
+defm SDIV : F3_12np<"sdiv", 0b001111>;
+
+// Section B.20 - SAVE and RESTORE, p. 117
+defm SAVE : F3_12np<"save" , 0b111100>;
+defm RESTORE : F3_12np<"restore", 0b111101>;
+
+// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
+
+// conditional branch class:
+class BranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
+ : F2_2<cc, 0b010, (outs), ins, asmstr, pattern> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 1;
+}
+
+let isBarrier = 1 in
+ def BA : BranchSP<0b1000, (ins brtarget:$dst),
+ "ba $dst",
+ [(br bb:$dst)]>;
+
+// FIXME: the encoding for the JIT should look at the condition field.
+def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+ "b$cc $dst",
+ [(SPbricc bb:$dst, imm:$cc)]>;
+
+
+// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
+
+// floating-point conditional branch class:
+class FPBranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
+ : F2_2<cc, 0b110, (outs), ins, asmstr, pattern> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 1;
+}
+
+// FIXME: the encoding for the JIT should look at the condition field.
+def FBCOND : FPBranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+ "fb$cc $dst",
+ [(SPbrfcc bb:$dst, imm:$cc)]>;
+
+
+// Section B.24 - Call and Link Instruction, p. 125
+// This is the only Format 1 instruction
+let Uses = [O0, O1, O2, O3, O4, O5],
+ hasDelaySlot = 1, isCall = 1,
+ Defs = [O0, O1, O2, O3, O4, O5, O7, G1, G2, G3, G4, G5, G6, G7,
+ D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15] in {
+ def CALL : InstSP<(outs), (ins calltarget:$dst),
+ "call $dst", []> {
+ bits<30> disp;
+ let op = 1;
+ let Inst{29-0} = disp;
+ }
+
+ // indirect calls
+ def JMPLrr : F3_1<2, 0b111000,
+ (outs), (ins MEMrr:$ptr),
+ "call $ptr",
+ [(call ADDRrr:$ptr)]>;
+ def JMPLri : F3_2<2, 0b111000,
+ (outs), (ins MEMri:$ptr),
+ "call $ptr",
+ [(call ADDRri:$ptr)]>;
+}
+
+// Section B.28 - Read State Register Instructions
+def RDY : F3_1<2, 0b101000,
+ (outs IntRegs:$dst), (ins),
+ "rd %y, $dst", []>;
+
+// Section B.29 - Write State Register Instructions
+def WRYrr : F3_1<2, 0b110000,
+ (outs), (ins IntRegs:$b, IntRegs:$c),
+ "wr $b, $c, %y", []>;
+def WRYri : F3_2<2, 0b110000,
+ (outs), (ins IntRegs:$b, i32imm:$c),
+ "wr $b, $c, %y", []>;
+
+// Convert Integer to Floating-point Instructions, p. 141
+def FITOS : F3_3<2, 0b110100, 0b011000100,
+ (outs FPRegs:$dst), (ins FPRegs:$src),
+ "fitos $src, $dst",
+ [(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
+def FITOD : F3_3<2, 0b110100, 0b011001000,
+ (outs DFPRegs:$dst), (ins FPRegs:$src),
+ "fitod $src, $dst",
+ [(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
+
+// Convert Floating-point to Integer Instructions, p. 142
+def FSTOI : F3_3<2, 0b110100, 0b011010001,
+ (outs FPRegs:$dst), (ins FPRegs:$src),
+ "fstoi $src, $dst",
+ [(set FPRegs:$dst, (SPftoi FPRegs:$src))]>;
+def FDTOI : F3_3<2, 0b110100, 0b011010010,
+ (outs FPRegs:$dst), (ins DFPRegs:$src),
+ "fdtoi $src, $dst",
+ [(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
+
+// Convert between Floating-point Formats Instructions, p. 143
+def FSTOD : F3_3<2, 0b110100, 0b011001001,
+ (outs DFPRegs:$dst), (ins FPRegs:$src),
+ "fstod $src, $dst",
+ [(set DFPRegs:$dst, (fextend FPRegs:$src))]>;
+def FDTOS : F3_3<2, 0b110100, 0b011000110,
+ (outs FPRegs:$dst), (ins DFPRegs:$src),
+ "fdtos $src, $dst",
+ [(set FPRegs:$dst, (fround DFPRegs:$src))]>;
+
+// Floating-point Move Instructions, p. 144
+def FMOVS : F3_3<2, 0b110100, 0b000000001,
+ (outs FPRegs:$dst), (ins FPRegs:$src),
+ "fmovs $src, $dst", []>;
+def FNEGS : F3_3<2, 0b110100, 0b000000101,
+ (outs FPRegs:$dst), (ins FPRegs:$src),
+ "fnegs $src, $dst",
+ [(set FPRegs:$dst, (fneg FPRegs:$src))]>;
+def FABSS : F3_3<2, 0b110100, 0b000001001,
+ (outs FPRegs:$dst), (ins FPRegs:$src),
+ "fabss $src, $dst",
+ [(set FPRegs:$dst, (fabs FPRegs:$src))]>;
+
+
+// Floating-point Square Root Instructions, p.145
+def FSQRTS : F3_3<2, 0b110100, 0b000101001,
+ (outs FPRegs:$dst), (ins FPRegs:$src),
+ "fsqrts $src, $dst",
+ [(set FPRegs:$dst, (fsqrt FPRegs:$src))]>;
+def FSQRTD : F3_3<2, 0b110100, 0b000101010,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "fsqrtd $src, $dst",
+ [(set DFPRegs:$dst, (fsqrt DFPRegs:$src))]>;
+
+
+
+// Floating-point Add and Subtract Instructions, p. 146
+def FADDS : F3_3<2, 0b110100, 0b001000001,
+ (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+ "fadds $src1, $src2, $dst",
+ [(set FPRegs:$dst, (fadd FPRegs:$src1, FPRegs:$src2))]>;
+def FADDD : F3_3<2, 0b110100, 0b001000010,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+ "faddd $src1, $src2, $dst",
+ [(set DFPRegs:$dst, (fadd DFPRegs:$src1, DFPRegs:$src2))]>;
+def FSUBS : F3_3<2, 0b110100, 0b001000101,
+ (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+ "fsubs $src1, $src2, $dst",
+ [(set FPRegs:$dst, (fsub FPRegs:$src1, FPRegs:$src2))]>;
+def FSUBD : F3_3<2, 0b110100, 0b001000110,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+ "fsubd $src1, $src2, $dst",
+ [(set DFPRegs:$dst, (fsub DFPRegs:$src1, DFPRegs:$src2))]>;
+
+// Floating-point Multiply and Divide Instructions, p. 147
+def FMULS : F3_3<2, 0b110100, 0b001001001,
+ (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+ "fmuls $src1, $src2, $dst",
+ [(set FPRegs:$dst, (fmul FPRegs:$src1, FPRegs:$src2))]>;
+def FMULD : F3_3<2, 0b110100, 0b001001010,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+ "fmuld $src1, $src2, $dst",
+ [(set DFPRegs:$dst, (fmul DFPRegs:$src1, DFPRegs:$src2))]>;
+def FSMULD : F3_3<2, 0b110100, 0b001101001,
+ (outs DFPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+ "fsmuld $src1, $src2, $dst",
+ [(set DFPRegs:$dst, (fmul (fextend FPRegs:$src1),
+ (fextend FPRegs:$src2)))]>;
+def FDIVS : F3_3<2, 0b110100, 0b001001101,
+ (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+ "fdivs $src1, $src2, $dst",
+ [(set FPRegs:$dst, (fdiv FPRegs:$src1, FPRegs:$src2))]>;
+def FDIVD : F3_3<2, 0b110100, 0b001001110,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+ "fdivd $src1, $src2, $dst",
+ [(set DFPRegs:$dst, (fdiv DFPRegs:$src1, DFPRegs:$src2))]>;
+
+// Floating-point Compare Instructions, p. 148
+// Note: the 2nd template arg is different for these guys.
+// Note 2: the result of an FCMP is not available until the 2nd cycle
+// after the instr is retired, but there is no interlock. This behavior
+// is modelled with a forced noop after the instruction.
+def FCMPS : F3_3<2, 0b110101, 0b001010001,
+ (outs), (ins FPRegs:$src1, FPRegs:$src2),
+ "fcmps $src1, $src2\n\tnop",
+ [(SPcmpfcc FPRegs:$src1, FPRegs:$src2)]>;
+def FCMPD : F3_3<2, 0b110101, 0b001010010,
+ (outs), (ins DFPRegs:$src1, DFPRegs:$src2),
+ "fcmpd $src1, $src2\n\tnop",
+ [(SPcmpfcc DFPRegs:$src1, DFPRegs:$src2)]>;
+
+
+//===----------------------------------------------------------------------===//
+// V9 Instructions
+//===----------------------------------------------------------------------===//
+
+// V9 Conditional Moves.
+let Predicates = [HasV9], isTwoAddress = 1 in {
+ // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
+ // FIXME: Add instruction encodings for the JIT some day.
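+ // For example (illustrative): for a select, $dst is tied to $T by the
+ // two-address constraint, so $dst starts out holding $T and the
+ // conditional move overwrites it with $F only when the condition holds.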
+ def MOVICCrr
+ : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
+ "mov$cc %icc, $F, $dst",
+ [(set IntRegs:$dst,
+ (SPselecticc IntRegs:$F, IntRegs:$T, imm:$cc))]>;
+ def MOVICCri
+ : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
+ "mov$cc %icc, $F, $dst",
+ [(set IntRegs:$dst,
+ (SPselecticc simm11:$F, IntRegs:$T, imm:$cc))]>;
+
+ def MOVFCCrr
+ : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
+ "mov$cc %fcc0, $F, $dst",
+ [(set IntRegs:$dst,
+ (SPselectfcc IntRegs:$F, IntRegs:$T, imm:$cc))]>;
+ def MOVFCCri
+ : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
+ "mov$cc %fcc0, $F, $dst",
+ [(set IntRegs:$dst,
+ (SPselectfcc simm11:$F, IntRegs:$T, imm:$cc))]>;
+
+ def FMOVS_ICC
+ : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
+ "fmovs$cc %icc, $F, $dst",
+ [(set FPRegs:$dst,
+ (SPselecticc FPRegs:$F, FPRegs:$T, imm:$cc))]>;
+ def FMOVD_ICC
+ : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
+ "fmovd$cc %icc, $F, $dst",
+ [(set DFPRegs:$dst,
+ (SPselecticc DFPRegs:$F, DFPRegs:$T, imm:$cc))]>;
+ def FMOVS_FCC
+ : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
+ "fmovs$cc %fcc0, $F, $dst",
+ [(set FPRegs:$dst,
+ (SPselectfcc FPRegs:$F, FPRegs:$T, imm:$cc))]>;
+ def FMOVD_FCC
+ : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
+ "fmovd$cc %fcc0, $F, $dst",
+ [(set DFPRegs:$dst,
+ (SPselectfcc DFPRegs:$F, DFPRegs:$T, imm:$cc))]>;
+
+}
+
+// Floating-Point Move Instructions, p. 164 of the V9 manual.
+let Predicates = [HasV9] in {
+ def FMOVD : F3_3<2, 0b110100, 0b000000010,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "fmovd $src, $dst", []>;
+ def FNEGD : F3_3<2, 0b110100, 0b000000110,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "fnegd $src, $dst",
+ [(set DFPRegs:$dst, (fneg DFPRegs:$src))]>;
+ def FABSD : F3_3<2, 0b110100, 0b000001010,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "fabsd $src, $dst",
+ [(set DFPRegs:$dst, (fabs DFPRegs:$src))]>;
+}
+
+// POPCrr - This does a ctpop of a 64-bit register. As such, we have to clear
+// the top 32 bits before using it. To do the clearing we zero-extend with an
+// SRLri X, 0: on V9 an srl by 0 clears the upper 32 bits, whereas an sll by 0
+// would leave them intact.
+def POPCrr : F3_1<2, 0b101110,
+ (outs IntRegs:$dst), (ins IntRegs:$src),
+ "popc $src, $dst", []>, Requires<[HasV9]>;
+def : Pat<(ctpop IntRegs:$src),
+ (POPCrr (SRLri IntRegs:$src, 0))>;
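+// Illustrative lowering (register names hypothetical): for an i32 value in
+// %o0, this emits "srl %o0, 0, %o1" to zero-extend into the full 64-bit
+// register, followed by "popc %o1, %o0" to count the set bits.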
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Small immediates.
+def : Pat<(i32 simm13:$val),
+ (ORri G0, imm:$val)>;
+// Arbitrary immediates.
+def : Pat<(i32 imm:$val),
+ (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
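+// For example (illustrative), loading the constant 0x12345678 uses the
+// second pattern: "sethi %hi(0x12345678), %reg" sets bits 31-10 and
+// "or %reg, %lo(0x12345678), %reg" fills in the low 10 bits.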
+
+// subc
+def : Pat<(subc IntRegs:$b, IntRegs:$c),
+ (SUBCCrr IntRegs:$b, IntRegs:$c)>;
+def : Pat<(subc IntRegs:$b, simm13:$val),
+ (SUBCCri IntRegs:$b, imm:$val)>;
+
+// Global addresses, constant pool entries
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORri G0, tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORri G0, tconstpool:$in)>;
+
+// Add reg, lo. This is used when taking the addr of a global/constpool entry.
+def : Pat<(add IntRegs:$r, (SPlo tglobaladdr:$in)),
+ (ADDri IntRegs:$r, tglobaladdr:$in)>;
+def : Pat<(add IntRegs:$r, (SPlo tconstpool:$in)),
+ (ADDri IntRegs:$r, tconstpool:$in)>;
+
+// Calls:
+def : Pat<(call tglobaladdr:$dst),
+ (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+ (CALL texternalsym:$dst)>;
+
+def : Pat<(ret), (RETL)>;
+
+// Map integer extload's to zextloads.
+def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+
+// zextload bool -> zextload byte
+def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
new file mode 100644
index 0000000..59efb19
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -0,0 +1,196 @@
+//===- SparcRegisterInfo.cpp - SPARC Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPARC implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcRegisterInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st,
+ const TargetInstrInfo &tii)
+ : SparcGenRegisterInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
+ Subtarget(st), TII(tii) {
+}
+
+const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+ const {
+ static const unsigned CalleeSavedRegs[] = { 0 };
+ return CalleeSavedRegs;
+}
+
+BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ Reserved.set(SP::G2);
+ Reserved.set(SP::G3);
+ Reserved.set(SP::G4);
+ Reserved.set(SP::O6);
+ Reserved.set(SP::I6);
+ Reserved.set(SP::I7);
+ Reserved.set(SP::G0);
+ Reserved.set(SP::G5);
+ Reserved.set(SP::G6);
+ Reserved.set(SP::G7);
+ return Reserved;
+}
+
+
+const TargetRegisterClass* const*
+SparcRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+ return CalleeSavedRegClasses;
+}
+
+bool SparcRegisterInfo::hasFP(const MachineFunction &MF) const {
+ return false;
+}
+
+void SparcRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ MachineInstr &MI = *I;
+ DebugLoc dl = MI.getDebugLoc();
+ int Size = MI.getOperand(0).getImm();
+ if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
+ Size = -Size;
+ if (Size)
+ BuildMI(MBB, I, dl, TII.get(SP::ADDri), SP::O6).addReg(SP::O6).addImm(Size);
+ MBB.erase(I);
+}
+
+void SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ DebugLoc dl = MI.getDebugLoc();
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+
+ // Addressable stack objects are accessed using negative offsets from %fp
+ MachineFunction &MF = *MI.getParent()->getParent();
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+ MI.getOperand(i+1).getImm();
+
+ // Replace frame index with a frame pointer reference.
+ if (Offset >= -4096 && Offset <= 4095) {
+ // If the offset is small enough to fit in the immediate field, directly
+ // encode it.
+ MI.getOperand(i).ChangeToRegister(SP::I6, false);
+ MI.getOperand(i+1).ChangeToImmediate(Offset);
+ } else {
+ // Otherwise, emit a G1 = SETHI %hi(offset). FIXME: it would be better to
+ // scavenge a register here instead of reserving G1 all of the time.
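+ // For example (illustrative), Offset = 5000 does not fit in the 13-bit
+ // immediate field, so we emit "sethi 4, %g1" (5000 >> 10 == 4), then
+ // "add %g1, %fp, %g1", and rewrite the user to reference [%g1 + 904]
+ // (5000 & 1023 == 904).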
+ unsigned OffHi = (unsigned)Offset >> 10U;
+ BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+ // Emit G1 = G1 + I6
+ BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+ .addReg(SP::I6);
+ // Insert G1 + %lo(offset) into the user instruction.
+ MI.getOperand(i).ChangeToRegister(SP::G1, false);
+ MI.getOperand(i+1).ChangeToImmediate(Offset & ((1 << 10)-1));
+ }
+}
+
+void SparcRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {}
+
+void SparcRegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+ // Get the number of bytes to allocate from the FrameInfo
+ int NumBytes = (int) MFI->getStackSize();
+
+ // Emit the correct save instruction based on the number of bytes in
+ // the frame. Minimum stack frame size according to V8 ABI is:
+ // 16 words for register window spill
+ // 1 word for address of returned aggregate-value
+ // + 6 words for passing parameters on the stack
+ // ----------
+ // 23 words * 4 bytes per word = 92 bytes
+ NumBytes += 92;
+
+ // Round up to next doubleword boundary -- a double-word boundary
+ // is required by the ABI.
+ NumBytes = (NumBytes + 7) & ~7;
+ NumBytes = -NumBytes;
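+ // For example (illustrative), a function with 40 bytes of locals needs
+ // 40 + 92 = 132 bytes, rounded up to 136 for doubleword alignment, so the
+ // code below emits "save %sp, -136, %sp".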
+
+ if (NumBytes >= -4096) {
+ BuildMI(MBB, MBBI, dl, TII.get(SP::SAVEri), SP::O6)
+ .addReg(SP::O6).addImm(NumBytes);
+ } else {
+ // Emit this the hard way. This clobbers G1 which we always know is
+ // available here.
+ unsigned OffHi = (unsigned)NumBytes >> 10U;
+ BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+ // Emit G1 = G1 | %lo(NumBytes) to fill in the low 10 bits
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+ .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::SAVErr), SP::O6)
+ .addReg(SP::O6).addReg(SP::G1);
+ }
+}
+
+void SparcRegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ DebugLoc dl = MBBI->getDebugLoc();
+ assert(MBBI->getOpcode() == SP::RETL &&
+ "Can only put epilog before 'retl' instruction!");
+ BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+ .addReg(SP::G0);
+}
+
+unsigned SparcRegisterInfo::getRARegister() const {
+ assert(0 && "What is the return address register");
+ return 0;
+}
+
+unsigned SparcRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ assert(0 && "What is the frame register");
+ return SP::G1;
+}
+
+unsigned SparcRegisterInfo::getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned SparcRegisterInfo::getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+int SparcRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ assert(0 && "What is the dwarf register number");
+ return -1;
+}
+
+#include "SparcGenRegisterInfo.inc"
+
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
new file mode 100644
index 0000000..fc863f3
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -0,0 +1,67 @@
+//===- SparcRegisterInfo.h - Sparc Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCREGISTERINFO_H
+#define SPARCREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "SparcGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class SparcSubtarget;
+class TargetInstrInfo;
+class Type;
+
+struct SparcRegisterInfo : public SparcGenRegisterInfo {
+ SparcSubtarget &Subtarget;
+ const TargetInstrInfo &TII;
+
+ SparcRegisterInfo(SparcSubtarget &st, const TargetInstrInfo &tii);
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const* getCalleeSavedRegClasses(
+ const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
new file mode 100644
index 0000000..e3a50ca
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -0,0 +1,158 @@
+//===- SparcRegisterInfo.td - Sparc Register defs ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the Sparc register file
+//===----------------------------------------------------------------------===//
+
+class SparcReg<string n> : Register<n> {
+ field bits<5> Num;
+ let Namespace = "SP";
+}
+
+// Registers are identified with 5-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<5> num, string n> : SparcReg<n> {
+ let Num = num;
+}
+// Rf - 32-bit floating-point registers
+class Rf<bits<5> num, string n> : SparcReg<n> {
+ let Num = num;
+}
+// Rd - Slots in the FP register file for 64-bit floating-point values.
+class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> {
+ let Num = num;
+ let SubRegs = subregs;
+}
+
+// Integer registers
+def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
+def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
+def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>;
+def G3 : Ri< 3, "G3">, DwarfRegNum<[3]>;
+def G4 : Ri< 4, "G4">, DwarfRegNum<[4]>;
+def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>;
+def G6 : Ri< 6, "G6">, DwarfRegNum<[6]>;
+def G7 : Ri< 7, "G7">, DwarfRegNum<[7]>;
+def O0 : Ri< 8, "O0">, DwarfRegNum<[8]>;
+def O1 : Ri< 9, "O1">, DwarfRegNum<[9]>;
+def O2 : Ri<10, "O2">, DwarfRegNum<[10]>;
+def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
+def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
+def O5 : Ri<13, "O5">, DwarfRegNum<[13]>;
+def O6 : Ri<14, "O6">, DwarfRegNum<[14]>;
+def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
+def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
+def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
+def L2 : Ri<18, "L2">, DwarfRegNum<[18]>;
+def L3 : Ri<19, "L3">, DwarfRegNum<[19]>;
+def L4 : Ri<20, "L4">, DwarfRegNum<[20]>;
+def L5 : Ri<21, "L5">, DwarfRegNum<[21]>;
+def L6 : Ri<22, "L6">, DwarfRegNum<[22]>;
+def L7 : Ri<23, "L7">, DwarfRegNum<[23]>;
+def I0 : Ri<24, "I0">, DwarfRegNum<[24]>;
+def I1 : Ri<25, "I1">, DwarfRegNum<[25]>;
+def I2 : Ri<26, "I2">, DwarfRegNum<[26]>;
+def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
+def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
+def I5 : Ri<29, "I5">, DwarfRegNum<[29]>;
+def I6 : Ri<30, "I6">, DwarfRegNum<[30]>;
+def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0 : Rf< 0, "F0">, DwarfRegNum<[32]>;
+def F1 : Rf< 1, "F1">, DwarfRegNum<[33]>;
+def F2 : Rf< 2, "F2">, DwarfRegNum<[34]>;
+def F3 : Rf< 3, "F3">, DwarfRegNum<[35]>;
+def F4 : Rf< 4, "F4">, DwarfRegNum<[36]>;
+def F5 : Rf< 5, "F5">, DwarfRegNum<[37]>;
+def F6 : Rf< 6, "F6">, DwarfRegNum<[38]>;
+def F7 : Rf< 7, "F7">, DwarfRegNum<[39]>;
+def F8 : Rf< 8, "F8">, DwarfRegNum<[40]>;
+def F9 : Rf< 9, "F9">, DwarfRegNum<[41]>;
+def F10 : Rf<10, "F10">, DwarfRegNum<[42]>;
+def F11 : Rf<11, "F11">, DwarfRegNum<[43]>;
+def F12 : Rf<12, "F12">, DwarfRegNum<[44]>;
+def F13 : Rf<13, "F13">, DwarfRegNum<[45]>;
+def F14 : Rf<14, "F14">, DwarfRegNum<[46]>;
+def F15 : Rf<15, "F15">, DwarfRegNum<[47]>;
+def F16 : Rf<16, "F16">, DwarfRegNum<[48]>;
+def F17 : Rf<17, "F17">, DwarfRegNum<[49]>;
+def F18 : Rf<18, "F18">, DwarfRegNum<[50]>;
+def F19 : Rf<19, "F19">, DwarfRegNum<[51]>;
+def F20 : Rf<20, "F20">, DwarfRegNum<[52]>;
+def F21 : Rf<21, "F21">, DwarfRegNum<[53]>;
+def F22 : Rf<22, "F22">, DwarfRegNum<[54]>;
+def F23 : Rf<23, "F23">, DwarfRegNum<[55]>;
+def F24 : Rf<24, "F24">, DwarfRegNum<[56]>;
+def F25 : Rf<25, "F25">, DwarfRegNum<[57]>;
+def F26 : Rf<26, "F26">, DwarfRegNum<[58]>;
+def F27 : Rf<27, "F27">, DwarfRegNum<[59]>;
+def F28 : Rf<28, "F28">, DwarfRegNum<[60]>;
+def F29 : Rf<29, "F29">, DwarfRegNum<[61]>;
+def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
+def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+def D0 : Rd< 0, "F0", [F0, F1]>, DwarfRegNum<[32]>;
+def D1 : Rd< 2, "F2", [F2, F3]>, DwarfRegNum<[34]>;
+def D2 : Rd< 4, "F4", [F4, F5]>, DwarfRegNum<[36]>;
+def D3 : Rd< 6, "F6", [F6, F7]>, DwarfRegNum<[38]>;
+def D4 : Rd< 8, "F8", [F8, F9]>, DwarfRegNum<[40]>;
+def D5 : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
+def D6 : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
+def D7 : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[46]>;
+def D8 : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
+def D9 : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[50]>;
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[58]>;
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+def IntRegs : RegisterClass<"SP", [i32], 32, [L0, L1, L2, L3, L4, L5, L6, L7,
+ I0, I1, I2, I3, I4, I5,
+ O0, O1, O2, O3, O4, O5, O7,
+
+ // FIXME: G1 reserved for now for large imm generation by frame code.
+ G1,
+ // Non-allocatable regs:
+ G2, G3, G4, // FIXME: OK for use only in
+ // applications, not libraries.
+ O6, // stack ptr
+ I6, // frame ptr
+ I7, // return address
+ G0, // constant zero
+ G5, G6, G7 // reserved for kernel
+ ]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ IntRegsClass::iterator
+ IntRegsClass::allocation_order_end(const MachineFunction &MF) const {
+ // FIXME: These special regs should be taken out of the regclass!
+ return end()-10 // Don't allocate special registers
+ -1; // FIXME: G1 reserved for large imm generation by frame code.
+ }
+ }];
+}
+
+def FPRegs : RegisterClass<"SP", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, F8,
+ F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22,
+ F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+
+def DFPRegs : RegisterClass<"SP", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9, D10, D11, D12, D13, D14, D15]>;
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
new file mode 100644
index 0000000..aaddbff
--- /dev/null
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -0,0 +1,43 @@
+//===- SparcSubtarget.cpp - SPARC Subtarget Information -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPARC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcSubtarget.h"
+#include "SparcGenSubtarget.inc"
+using namespace llvm;
+
+// FIXME: temporary.
+#include "llvm/Support/CommandLine.h"
+namespace {
+ cl::opt<bool> EnableV9("enable-sparc-v9-insts", cl::Hidden,
+ cl::desc("Enable V9 instructions in the V8 target"));
+}
+
+SparcSubtarget::SparcSubtarget(const Module &M, const std::string &FS) {
+ // Set the default features.
+ IsV9 = false;
+ V8DeprecatedInsts = false;
+ IsVIS = false;
+
+ // Determine default and user specified characteristics
+ std::string CPU = "generic";
+
+ // FIXME: autodetect host here!
+ CPU = "v9"; // What is a good way to detect V9?
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+
+ // Unless explicitly enabled, disable the V9 instructions.
+ if (!EnableV9)
+ IsV9 = false;
+}
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
new file mode 100644
index 0000000..e5a5ba4
--- /dev/null
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -0,0 +1,43 @@
+//=====-- SparcSubtarget.h - Define Subtarget for the SPARC ----*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPARC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_SUBTARGET_H
+#define SPARC_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+ class Module;
+
+class SparcSubtarget : public TargetSubtarget {
+ bool IsV9;
+ bool V8DeprecatedInsts;
+ bool IsVIS;
+public:
+ SparcSubtarget(const Module &M, const std::string &FS);
+
+ bool isV9() const { return IsV9; }
+ bool isVIS() const { return IsVIS; }
+ bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+
+ /// ParseSubtargetFeatures - Parses the features string, setting the
+ /// specified subtarget options. The definition of this function is
+ /// auto-generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcTargetAsmInfo.cpp b/lib/Target/Sparc/SparcTargetAsmInfo.cpp
new file mode 100644
index 0000000..c13d45c
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetAsmInfo.cpp
@@ -0,0 +1,50 @@
+//===-- SparcTargetAsmInfo.cpp - Sparc asm properties -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the SparcELFTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetAsmInfo.h"
+
+using namespace llvm;
+
+SparcELFTargetAsmInfo::SparcELFTargetAsmInfo(const TargetMachine &TM):
+ ELFTargetAsmInfo(TM) {
+ Data16bitsDirective = "\t.half\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = 0; // .xword is only supported by V9.
+ ZeroDirective = "\t.skip\t";
+ CommentString = "!";
+ ConstantPoolSection = "\t.section \".rodata\",#alloc\n";
+ COMMDirectiveTakesAlignment = true;
+ CStringSection = ".rodata.str";
+
+ // Sparc normally uses a named section for BSS.
+ BSSSection_ = getNamedSection("\t.bss",
+ SectionFlags::Writeable | SectionFlags::BSS,
+ /* Override */ true);
+}
+
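+// For example (illustrative), a plain writable data section comes back as
+// ",#alloc,#write" in the Sun assembler syntax used here, while mergeable
+// sections fall through to the generic ELF flag printing.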
+std::string SparcELFTargetAsmInfo::printSectionFlags(unsigned flags) const {
+ if (flags & SectionFlags::Mergeable)
+ return ELFTargetAsmInfo::printSectionFlags(flags);
+
+ std::string Flags;
+ if (!(flags & SectionFlags::Debug))
+ Flags += ",#alloc";
+ if (flags & SectionFlags::Code)
+ Flags += ",#execinstr";
+ if (flags & SectionFlags::Writeable)
+ Flags += ",#write";
+ if (flags & SectionFlags::TLS)
+ Flags += ",#tls";
+
+ return Flags;
+}
diff --git a/lib/Target/Sparc/SparcTargetAsmInfo.h b/lib/Target/Sparc/SparcTargetAsmInfo.h
new file mode 100644
index 0000000..1af5d80
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetAsmInfo.h
@@ -0,0 +1,33 @@
+//=====-- SparcTargetAsmInfo.h - Sparc asm properties ---------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETASMINFO_H
+#define SPARCTARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+
+namespace llvm {
+
+ // Forward declaration.
+ class TargetMachine;
+
+ struct SparcELFTargetAsmInfo : public ELFTargetAsmInfo {
+ explicit SparcELFTargetAsmInfo(const TargetMachine &TM);
+
+ std::string printSectionFlags(unsigned flags) const;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
new file mode 100644
index 0000000..eda0309
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -0,0 +1,94 @@
+//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetAsmInfo.h"
+#include "SparcTargetMachine.h"
+#include "Sparc.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+/// SparcTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int SparcTargetMachineModule;
+int SparcTargetMachineModule = 0;
+
+// Register the target.
+static RegisterTarget<SparcTargetMachine> X("sparc", "SPARC");
+
+const TargetAsmInfo *SparcTargetMachine::createTargetAsmInfo() const {
+ // FIXME: Handle Solaris subtarget someday :)
+ return new SparcELFTargetAsmInfo(*this);
+}
+
+/// SparcTargetMachine ctor - Create an ILP32 architecture model
+///
+SparcTargetMachine::SparcTargetMachine(const Module &M, const std::string &FS)
+ : DataLayout("E-p:32:32-f128:128:128"),
+ Subtarget(M, FS), TLInfo(*this), InstrInfo(Subtarget),
+ FrameInfo(TargetFrameInfo::StackGrowsDown, 8, 0) {
+}
+
+unsigned SparcTargetMachine::getModuleMatchQuality(const Module &M) {
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 6 && std::string(TT.begin(), TT.begin()+6) == "sparc-")
+ return 20;
+
+ // If the target triple is something non-sparc, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::BigEndian &&
+ M.getPointerSize() == Module::Pointer32)
+#ifdef __sparc__
+ return 20; // BE/32 ==> Prefer sparc on sparc
+#else
+ return 5; // BE/32 ==> Weakly prefer sparc elsewhere
+#endif
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+#if defined(__sparc__)
+ return 10;
+#else
+ return 0;
+#endif
+}
+
+bool SparcTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createSparcISelDag(*this));
+ return false;
+}
+
+/// addPreEmitPass - This pass may be implemented by targets that want to run
+/// passes immediately before machine code is emitted. This should return
+/// true if -print-machineinstrs should print out the code after the passes.
+bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel){
+ PM.add(createSparcFPMoverPass(*this));
+ PM.add(createSparcDelaySlotFillerPass(*this));
+ return true;
+}
+
+bool SparcTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ // Output assembly language.
+ PM.add(createSparcCodePrinterPass(Out, *this, OptLevel, Verbose));
+ return false;
+}
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
new file mode 100644
index 0000000..40b44f2
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -0,0 +1,63 @@
+//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Sparc specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETMACHINE_H
+#define SPARCTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "SparcInstrInfo.h"
+#include "SparcSubtarget.h"
+#include "SparcISelLowering.h"
+
+namespace llvm {
+
+class Module;
+
+class SparcTargetMachine : public LLVMTargetMachine {
+ const TargetData DataLayout; // Calculates type size & alignment
+ SparcSubtarget Subtarget;
+ SparcTargetLowering TLInfo;
+ SparcInstrInfo InstrInfo;
+ TargetFrameInfo FrameInfo;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+ SparcTargetMachine(const Module &M, const std::string &FS);
+
+ virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual const SparcSubtarget *getSubtargetImpl() const{ return &Subtarget; }
+ virtual const SparcRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual SparcTargetLowering* getTargetLowering() const {
+ return const_cast<SparcTargetLowering*>(&TLInfo);
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ static unsigned getModuleMatchQuality(const Module &M);
+
+ // Pass Pipeline Configuration
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp
new file mode 100644
index 0000000..f937025
--- /dev/null
+++ b/lib/Target/SubtargetFeature.cpp
@@ -0,0 +1,364 @@
+//===- SubtargetFeature.cpp - CPU characteristics Implementation ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SubtargetFeature interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/SubtargetFeature.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Streams.h"
+#include <algorithm>
+#include <ostream>
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Static Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// hasFlag - Determine if a feature has a flag; '+' or '-'
+///
+static inline bool hasFlag(const std::string &Feature) {
+ assert(!Feature.empty() && "Empty string");
+ // Get first character
+ char Ch = Feature[0];
+ // Check if first character is '+' or '-' flag
+ return Ch == '+' || Ch =='-';
+}
+
+/// StripFlag - Return string stripped of flag.
+///
+static inline std::string StripFlag(const std::string &Feature) {
+ return hasFlag(Feature) ? Feature.substr(1) : Feature;
+}
+
+/// isEnabled - Return true if enable flag; '+'.
+///
+static inline bool isEnabled(const std::string &Feature) {
+ assert(!Feature.empty() && "Empty string");
+ // Get first character
+ char Ch = Feature[0];
+ // Check if first character is '+' for enabled
+ return Ch == '+';
+}
+
+/// PrependFlag - Return a string with a prepended flag; '+' or '-'.
+///
+static inline std::string PrependFlag(const std::string &Feature,
+ bool IsEnabled) {
+ assert(!Feature.empty() && "Empty string");
+ if (hasFlag(Feature)) return Feature;
+ return std::string(IsEnabled ? "+" : "-") + Feature;
+}
+
+/// Split - Splits a string of comma-separated items into a vector of strings.
+///
+static void Split(std::vector<std::string> &V, const std::string &S) {
+ // Start at beginning of string.
+ size_t Pos = 0;
+ while (true) {
+ // Find the next comma
+ size_t Comma = S.find(',', Pos);
+ // If no comma is found then the rest of the string is used
+ if (Comma == std::string::npos) {
+ // Add string to vector
+ V.push_back(S.substr(Pos));
+ break;
+ }
+ // Otherwise add substring to vector
+ V.push_back(S.substr(Pos, Comma - Pos));
+ // Advance to next item
+ Pos = Comma + 1;
+ }
+}
+
+/// Join - Joins a vector of strings into a single string, with a comma
+/// separating each element.
+///
+static std::string Join(const std::vector<std::string> &V) {
+ // Start with empty string.
+ std::string Result;
+ // If the vector is not empty
+ if (!V.empty()) {
+ // Start with the CPU feature
+ Result = V[0];
+ // For each successive feature
+ for (size_t i = 1; i < V.size(); i++) {
+ // Add a comma
+ Result += ",";
+ // Add the feature
+ Result += V[i];
+ }
+ }
+ // Return the features string
+ return Result;
+}
+
+/// AddFeature - Add a feature to the list, prepending the enable/disable flag.
+void SubtargetFeatures::AddFeature(const std::string &String,
+ bool IsEnabled) {
+ // Don't add empty features
+ if (!String.empty()) {
+ // Convert to lowercase, prepend flag and add to vector
+ Features.push_back(PrependFlag(LowercaseString(String), IsEnabled));
+ }
+}
+
+/// Find KV in array using binary search.
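+/// The table must be sorted by Key (the callers assert this in debug builds);
+/// returns NULL when the key is not present.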
+template<typename T> const T *Find(const std::string &S, const T *A, size_t L) {
+ // Make the lower bound element we're looking for
+ T KV;
+ KV.Key = S.c_str();
+ // Determine the end of the array
+ const T *Hi = A + L;
+ // Binary search the array
+ const T *F = std::lower_bound(A, Hi, KV);
+ // If not found then return NULL
+ if (F == Hi || std::string(F->Key) != S) return NULL;
+ // Return the found array item
+ return F;
+}
+
+/// getLongestEntryLength - Return the length of the longest entry in the table.
+///
+static size_t getLongestEntryLength(const SubtargetFeatureKV *Table,
+ size_t Size) {
+ size_t MaxLen = 0;
+ for (size_t i = 0; i < Size; i++)
+ MaxLen = std::max(MaxLen, std::strlen(Table[i].Key));
+ return MaxLen;
+}
+
+/// Display help for feature choices.
+///
+static void Help(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize,
+ const SubtargetFeatureKV *FeatTable, size_t FeatTableSize) {
+ // Determine the length of the longest CPU and Feature entries.
+ unsigned MaxCPULen = getLongestEntryLength(CPUTable, CPUTableSize);
+ unsigned MaxFeatLen = getLongestEntryLength(FeatTable, FeatTableSize);
+
+ // Print the CPU table.
+ cerr << "Available CPUs for this target:\n\n";
+ for (size_t i = 0; i != CPUTableSize; i++)
+ cerr << " " << CPUTable[i].Key
+ << std::string(MaxCPULen - std::strlen(CPUTable[i].Key), ' ')
+ << " - " << CPUTable[i].Desc << ".\n";
+ cerr << "\n";
+
+ // Print the Feature table.
+ cerr << "Available features for this target:\n\n";
+ for (size_t i = 0; i != FeatTableSize; i++)
+ cerr << " " << FeatTable[i].Key
+ << std::string(MaxFeatLen - std::strlen(FeatTable[i].Key), ' ')
+ << " - " << FeatTable[i].Desc << ".\n";
+ cerr << "\n";
+
+ cerr << "Use +feature to enable a feature, or -feature to disable it.\n"
+ << "For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n";
+ exit(1);
+}
+
+//===----------------------------------------------------------------------===//
+// SubtargetFeatures Implementation
+//===----------------------------------------------------------------------===//
+
+SubtargetFeatures::SubtargetFeatures(const std::string &Initial) {
+ // Break up string into separate features
+ Split(Features, Initial);
+}
+
+
+std::string SubtargetFeatures::getString() const {
+ return Join(Features);
+}
+void SubtargetFeatures::setString(const std::string &Initial) {
+ // Throw out old features
+ Features.clear();
+ // Break up string into separate features
+ Split(Features, LowercaseString(Initial));
+}
+
+
+/// setCPU - Set the CPU string. Replaces previous setting. Setting to ""
+/// clears CPU.
+void SubtargetFeatures::setCPU(const std::string &String) {
+ Features[0] = LowercaseString(String);
+}
+
+
+/// setCPUIfNone - Set the CPU string only if no CPU string is already set.
+///
+void SubtargetFeatures::setCPUIfNone(const std::string &String) {
+ if (Features[0].empty()) setCPU(String);
+}
+
+/// getCPU - Returns current CPU.
+///
+const std::string & SubtargetFeatures::getCPU() const {
+ return Features[0];
+}
+
+
+/// SetImpliedBits - For each feature that is (transitively) implied by this
+/// feature, set it.
+///
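+/// For example (with hypothetical features): if feature "b" implies "a" and
+/// feature "c" implies "b", then setting "c" transitively sets the bits for
+/// "b" and "a" as well.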
+static
+void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+ const SubtargetFeatureKV *FeatureTable,
+ size_t FeatureTableSize) {
+ for (size_t i = 0; i < FeatureTableSize; ++i) {
+ const SubtargetFeatureKV &FE = FeatureTable[i];
+
+ if (FeatureEntry->Value == FE.Value) continue;
+
+ if (FeatureEntry->Implies & FE.Value) {
+ Bits |= FE.Value;
+ SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+ }
+ }
+}
+
+/// ClearImpliedBits - For each feature that (transitively) implies this
+/// feature, clear it.
+///
+static
+void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+ const SubtargetFeatureKV *FeatureTable,
+ size_t FeatureTableSize) {
+ for (size_t i = 0; i < FeatureTableSize; ++i) {
+ const SubtargetFeatureKV &FE = FeatureTable[i];
+
+ if (FeatureEntry->Value == FE.Value) continue;
+
+ if (FE.Implies & FeatureEntry->Value) {
+ Bits &= ~FE.Value;
+ ClearImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+ }
+ }
+}
+
+/// getBits - Get feature bits.
+///
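+/// For example (illustrative feature string), "v9,+vis,-deprecated-v8" sets
+/// the base bits of the "v9" CPU entry, ORs in the bits for "vis" (and
+/// anything it implies), and clears the bits for "deprecated-v8" (and
+/// anything that implies it).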
+uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
+ size_t CPUTableSize,
+ const SubtargetFeatureKV *FeatureTable,
+ size_t FeatureTableSize) {
+ assert(CPUTable && "missing CPU table");
+ assert(FeatureTable && "missing features table");
+#ifndef NDEBUG
+ for (size_t i = 1; i < CPUTableSize; i++) {
+ assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 &&
+ "CPU table is not sorted");
+ }
+ for (size_t i = 1; i < FeatureTableSize; i++) {
+ assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 &&
+ "CPU features table is not sorted");
+ }
+#endif
+ uint32_t Bits = 0; // Resulting bits
+
+ // Check if help is needed
+ if (Features[0] == "help")
+ Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
+
+ // Find CPU entry
+ const SubtargetFeatureKV *CPUEntry =
+ Find(Features[0], CPUTable, CPUTableSize);
+ // If there is a match
+ if (CPUEntry) {
+ // Set base feature bits
+ Bits = CPUEntry->Value;
+
+ // Set the feature implied by this CPU feature, if any.
+ for (size_t i = 0; i < FeatureTableSize; ++i) {
+ const SubtargetFeatureKV &FE = FeatureTable[i];
+ if (CPUEntry->Value & FE.Value)
+ SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+ }
+ } else {
+ cerr << "'" << Features[0]
+ << "' is not a recognized processor for this target"
+ << " (ignoring processor)"
+ << "\n";
+ }
+ // Iterate through each feature
+ for (size_t i = 1; i < Features.size(); i++) {
+ const std::string &Feature = Features[i];
+
+ // Check for help
+ if (Feature == "+help")
+ Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
+
+ // Find feature in table.
+ const SubtargetFeatureKV *FeatureEntry =
+ Find(StripFlag(Feature), FeatureTable, FeatureTableSize);
+ // If there is a match
+ if (FeatureEntry) {
+ // Enable/disable feature in bits
+ if (isEnabled(Feature)) {
+ Bits |= FeatureEntry->Value;
+
+ // For each feature that this implies, set it.
+ SetImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+ } else {
+ Bits &= ~FeatureEntry->Value;
+
+ // For each feature that implies this, clear it.
+ ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+ }
+ } else {
+ cerr << "'" << Feature
+ << "' is not a recognized feature for this target"
+ << " (ignoring feature)"
+ << "\n";
+ }
+ }
+
+ return Bits;
+}
+
+/// Get info pointer
+void *SubtargetFeatures::getInfo(const SubtargetInfoKV *Table,
+ size_t TableSize) {
+ assert(Table && "missing table");
+#ifndef NDEBUG
+ for (size_t i = 1; i < TableSize; i++) {
+ assert(strcmp(Table[i - 1].Key, Table[i].Key) < 0 && "Table is not sorted");
+ }
+#endif
+
+ // Find entry
+ const SubtargetInfoKV *Entry = Find(Features[0], Table, TableSize);
+
+ if (Entry) {
+ return Entry->Value;
+ } else {
+ cerr << "'" << Features[0]
+ << "' is not a recognized processor for this target"
+ << " (ignoring processor)"
+ << "\n";
+ return NULL;
+ }
+}
+
+/// print - Print feature string.
+///
+void SubtargetFeatures::print(std::ostream &OS) const {
+ for (size_t i = 0; i < Features.size(); i++) {
+ OS << Features[i] << " ";
+ }
+ OS << "\n";
+}
+
+/// dump - Dump feature info.
+///
+void SubtargetFeatures::dump() const {
+ print(*cerr.stream());
+}
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
new file mode 100644
index 0000000..ed544b7
--- /dev/null
+++ b/lib/Target/Target.cpp
@@ -0,0 +1,94 @@
+//===-- Target.cpp --------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the C bindings for libLLVMTarget.a, which implements
+// target information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Target.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetData.h"
+#include <cstring>
+
+using namespace llvm;
+
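+// Illustrative usage from C client code (a sketch; only functions defined in
+// this file are used):
+//   LLVMTargetDataRef TD = LLVMCreateTargetData("e-p:32:32");
+//   unsigned PtrBytes = LLVMPointerSize(TD); // 4 for this layout
+//   LLVMDisposeTargetData(TD);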
+LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
+ return wrap(new TargetData(StringRep));
+}
+
+void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
+ unwrap(PM)->add(new TargetData(*unwrap(TD)));
+}
+
+char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
+ std::string StringRep = unwrap(TD)->getStringRepresentation();
+ return strdup(StringRep.c_str());
+}
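+
+// Note: the string returned above is allocated with strdup; the caller is
+// responsible for releasing it with free().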
+
+LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef TD) {
+ return unwrap(TD)->isLittleEndian();
+}
+
+unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
+ return unwrap(TD)->getPointerSize();
+}
+
+LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
+ return wrap(unwrap(TD)->getIntPtrType());
+}
+
+unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+ return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
+}
+
+unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+ return unwrap(TD)->getTypeStoreSize(unwrap(Ty));
+}
+
+unsigned long long LLVMABISizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+ return unwrap(TD)->getTypeAllocSize(unwrap(Ty));
+}
+
+unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+ return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+ return unwrap(TD)->getCallFrameTypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+ return unwrap(TD)->getPrefTypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
+ LLVMValueRef GlobalVar) {
+ return unwrap(TD)->getPreferredAlignment(unwrap<GlobalVariable>(GlobalVar));
+}
+
+unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+ unsigned long long Offset) {
+ const StructType *STy = unwrap<StructType>(StructTy);
+ return unwrap(TD)->getStructLayout(STy)->getElementContainingOffset(Offset);
+}
+
+unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+ unsigned Element) {
+ const StructType *STy = unwrap<StructType>(StructTy);
+ return unwrap(TD)->getStructLayout(STy)->getElementOffset(Element);
+}
+
+void LLVMInvalidateStructLayout(LLVMTargetDataRef TD, LLVMTypeRef StructTy) {
+ unwrap(TD)->InvalidateStructLayoutInfo(unwrap<StructType>(StructTy));
+}
+
+void LLVMDisposeTargetData(LLVMTargetDataRef TD) {
+ delete unwrap(TD);
+}
diff --git a/lib/Target/TargetAsmInfo.cpp b/lib/Target/TargetAsmInfo.cpp
new file mode 100644
index 0000000..6a2de6f
--- /dev/null
+++ b/lib/Target/TargetAsmInfo.cpp
@@ -0,0 +1,461 @@
+//===-- TargetAsmInfo.cpp - Asm Info ---------------------------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related to what form asm statements
+// should take.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Dwarf.h"
+#include <cctype>
+#include <cstring>
+
+using namespace llvm;
+
+void TargetAsmInfo::fillDefaultValues() {
+ BSSSection = "\t.bss";
+ BSSSection_ = 0;
+ ReadOnlySection = 0;
+ SmallDataSection = 0;
+ SmallBSSSection = 0;
+ SmallRODataSection = 0;
+ TLSDataSection = 0;
+ TLSBSSSection = 0;
+ ZeroFillDirective = 0;
+ NonexecutableStackDirective = 0;
+ NeedsSet = false;
+ MaxInstLength = 4;
+ PCSymbol = "$";
+ SeparatorChar = ';';
+ CommentString = "#";
+ GlobalPrefix = "";
+ PrivateGlobalPrefix = ".";
+ LessPrivateGlobalPrefix = "";
+ JumpTableSpecialLabelPrefix = 0;
+ GlobalVarAddrPrefix = "";
+ GlobalVarAddrSuffix = "";
+ FunctionAddrPrefix = "";
+ FunctionAddrSuffix = "";
+ PersonalityPrefix = "";
+ PersonalitySuffix = "";
+ NeedsIndirectEncoding = false;
+ InlineAsmStart = "#APP";
+ InlineAsmEnd = "#NO_APP";
+ AssemblerDialect = 0;
+ StringConstantPrefix = ".str";
+ ZeroDirective = "\t.zero\t";
+ ZeroDirectiveSuffix = 0;
+ AsciiDirective = "\t.ascii\t";
+ AscizDirective = "\t.asciz\t";
+ Data8bitsDirective = "\t.byte\t";
+ Data16bitsDirective = "\t.short\t";
+ Data32bitsDirective = "\t.long\t";
+ Data64bitsDirective = "\t.quad\t";
+ AlignDirective = "\t.align\t";
+ AlignmentIsInBytes = true;
+ TextAlignFillValue = 0;
+ SwitchToSectionDirective = "\t.section\t";
+ TextSectionStartSuffix = "";
+ DataSectionStartSuffix = "";
+ SectionEndDirectiveSuffix = 0;
+ ConstantPoolSection = "\t.section .rodata";
+ JumpTableDataSection = "\t.section .rodata";
+ JumpTableDirective = 0;
+ CStringSection = 0;
+ CStringSection_ = 0;
+ // FIXME: Flags are ELFish - replace with normal section stuff.
+ StaticCtorsSection = "\t.section .ctors,\"aw\",@progbits";
+ StaticDtorsSection = "\t.section .dtors,\"aw\",@progbits";
+ GlobalDirective = "\t.globl\t";
+ SetDirective = 0;
+ LCOMMDirective = 0;
+ COMMDirective = "\t.comm\t";
+ COMMDirectiveTakesAlignment = true;
+ HasDotTypeDotSizeDirective = true;
+ HasSingleParameterDotFile = true;
+ UsedDirective = 0;
+ WeakRefDirective = 0;
+ WeakDefDirective = 0;
+ // FIXME: These are ELFish - move to ELFTAI.
+ HiddenDirective = "\t.hidden\t";
+ ProtectedDirective = "\t.protected\t";
+ AbsoluteDebugSectionOffsets = false;
+ AbsoluteEHSectionOffsets = false;
+ HasLEB128 = false;
+ HasDotLocAndDotFile = false;
+ SupportsDebugInformation = false;
+ SupportsExceptionHandling = false;
+ DwarfRequiresFrameSection = true;
+ DwarfUsesInlineInfoSection = false;
+ SupportsMacInfoSection = true;
+ NonLocalEHFrameLabel = false;
+ GlobalEHDirective = 0;
+ SupportsWeakOmittedEHFrame = true;
+ DwarfSectionOffsetDirective = 0;
+ DwarfAbbrevSection = ".debug_abbrev";
+ DwarfInfoSection = ".debug_info";
+ DwarfLineSection = ".debug_line";
+ DwarfFrameSection = ".debug_frame";
+ DwarfPubNamesSection = ".debug_pubnames";
+ DwarfPubTypesSection = ".debug_pubtypes";
+ DwarfDebugInlineSection = ".debug_inlined";
+ DwarfStrSection = ".debug_str";
+ DwarfLocSection = ".debug_loc";
+ DwarfARangesSection = ".debug_aranges";
+ DwarfRangesSection = ".debug_ranges";
+ DwarfMacInfoSection = ".debug_macinfo";
+ DwarfEHFrameSection = ".eh_frame";
+ DwarfExceptionSection = ".gcc_except_table";
+ AsmTransCBE = 0;
+ TextSection = getUnnamedSection("\t.text", SectionFlags::Code);
+ DataSection = getUnnamedSection("\t.data", SectionFlags::Writeable);
+}
+
+TargetAsmInfo::TargetAsmInfo(const TargetMachine &tm)
+ : TM(tm) {
+ fillDefaultValues();
+}
+
+TargetAsmInfo::~TargetAsmInfo() {
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorChar or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorChar or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overridden in the target code to do that.
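+/// For example (illustrative), with MaxInstLength == 4 and SeparatorChar ';',
+/// "mov 1, %o0 ; add %o0, %o1, %o0" counts as two instructions and returns 8.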
+unsigned TargetAsmInfo::getInlineAsmLength(const char *Str) const {
+ // Count the number of instructions in the asm.
+ bool atInsnStart = true;
+ unsigned Length = 0;
+ for (; *Str; ++Str) {
+ if (*Str == '\n' || *Str == SeparatorChar)
+ atInsnStart = true;
+ if (atInsnStart && !isspace(*Str)) {
+ Length += MaxInstLength;
+ atInsnStart = false;
+ }
+ if (atInsnStart && strncmp(Str, CommentString, strlen(CommentString))==0)
+ atInsnStart = false;
+ }
+
+ return Length;
+}
+
+unsigned TargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ return dwarf::DW_EH_PE_absptr;
+}
+
+static bool isSuitableForBSS(const GlobalVariable *GV) {
+ if (!GV->hasInitializer())
+ return true;
+
+ // Leave constant zeros in readonly constant sections, so they can be shared
+ Constant *C = GV->getInitializer();
+ return (C->isNullValue() && !GV->isConstant() && !NoZerosInBSS);
+}
+
+static bool isConstantString(const Constant *C) {
+ // First check: do we have a constant array of i8 terminated with zero?
+ const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+ // Check if the initializer is a null-terminated string
+ if (CVA && CVA->isCString())
+ return true;
+
+ // Another possibility: [1 x i8] zeroinitializer
+ if (isa<ConstantAggregateZero>(C)) {
+ if (const ArrayType *Ty = dyn_cast<ArrayType>(C->getType())) {
+ return (Ty->getElementType() == Type::Int8Ty &&
+ Ty->getNumElements() == 1);
+ }
+ }
+
+ return false;
+}
+
+unsigned TargetAsmInfo::RelocBehaviour() const {
+ // By default, all relocations in PIC mode force the symbol to be placed
+ // in a r/w section.
+ return (TM.getRelocationModel() != Reloc::Static ?
+ Reloc::LocalOrGlobal : Reloc::None);
+}
+
+SectionKind::Kind
+TargetAsmInfo::SectionKindForGlobal(const GlobalValue *GV) const {
+ // Early exit - functions should be always in text sections.
+ if (isa<Function>(GV))
+ return SectionKind::Text;
+
+ const GlobalVariable* GVar = dyn_cast<GlobalVariable>(GV);
+ assert(GVar && "Invalid global value for section selection");
+ bool isThreadLocal = GVar->isThreadLocal();
+
+ if (isSuitableForBSS(GVar)) {
+ // Variable can be easily put to BSS section.
+ return (isThreadLocal ? SectionKind::ThreadBSS : SectionKind::BSS);
+ } else if (GVar->isConstant() && !isThreadLocal) {
+ // Now we know that the variable has an initializer and that the
+ // initializer is constant. We need to examine it to decide which section
+ // to output the variable into. Also note that there is no thread-local
+ // r/o section.
+ Constant *C = GVar->getInitializer();
+ if (C->ContainsRelocations(Reloc::LocalOrGlobal)) {
+ // Decide whether it is still possible to put the symbol into a r/o section.
+ unsigned Reloc = RelocBehaviour();
+
+ // We already did a query for 'all' relocs, hence the early exits.
+ if (Reloc == Reloc::LocalOrGlobal)
+ return SectionKind::Data;
+ else if (Reloc == Reloc::None)
+ return SectionKind::ROData;
+ else {
+ // Ok, target wants something funny. Honour it.
+ return (C->ContainsRelocations(Reloc) ?
+ SectionKind::Data : SectionKind::ROData);
+ }
+ } else {
+ // Check if the initializer is a null-terminated string
+ if (isConstantString(C))
+ return SectionKind::RODataMergeStr;
+ else
+ return SectionKind::RODataMergeConst;
+ }
+ }
+
+ // The variable either is not constant or is thread-local - output it to
+ // the data section.
+ return (isThreadLocal ? SectionKind::ThreadData : SectionKind::Data);
+}
+
+unsigned
+TargetAsmInfo::SectionFlagsForGlobal(const GlobalValue *GV,
+ const char* Name) const {
+ unsigned Flags = SectionFlags::None;
+
+ // Decode flags from global itself.
+ if (GV) {
+ SectionKind::Kind Kind = SectionKindForGlobal(GV);
+ switch (Kind) {
+ case SectionKind::Text:
+ Flags |= SectionFlags::Code;
+ break;
+ case SectionKind::ThreadData:
+ case SectionKind::ThreadBSS:
+ Flags |= SectionFlags::TLS;
+ // FALLS THROUGH
+ case SectionKind::Data:
+ case SectionKind::DataRel:
+ case SectionKind::DataRelLocal:
+ case SectionKind::DataRelRO:
+ case SectionKind::DataRelROLocal:
+ case SectionKind::BSS:
+ Flags |= SectionFlags::Writeable;
+ break;
+ case SectionKind::ROData:
+ case SectionKind::RODataMergeStr:
+ case SectionKind::RODataMergeConst:
+ // No additional flags here
+ break;
+ case SectionKind::SmallData:
+ case SectionKind::SmallBSS:
+ Flags |= SectionFlags::Writeable;
+ // FALLS THROUGH
+ case SectionKind::SmallROData:
+ Flags |= SectionFlags::Small;
+ break;
+ default:
+ assert(0 && "Unexpected section kind!");
+ }
+
+ if (GV->isWeakForLinker())
+ Flags |= SectionFlags::Linkonce;
+ }
+
+ // Add flags from sections, if any.
+ if (Name && *Name) {
+ Flags |= SectionFlags::Named;
+
+ // Some lame default implementation based on some magic section names.
+ if (strncmp(Name, ".gnu.linkonce.b.", 16) == 0 ||
+ strncmp(Name, ".llvm.linkonce.b.", 17) == 0 ||
+ strncmp(Name, ".gnu.linkonce.sb.", 17) == 0 ||
+ strncmp(Name, ".llvm.linkonce.sb.", 18) == 0)
+ Flags |= SectionFlags::BSS;
+ else if (strcmp(Name, ".tdata") == 0 ||
+ strncmp(Name, ".tdata.", 7) == 0 ||
+ strncmp(Name, ".gnu.linkonce.td.", 17) == 0 ||
+ strncmp(Name, ".llvm.linkonce.td.", 18) == 0)
+ Flags |= SectionFlags::TLS;
+ else if (strcmp(Name, ".tbss") == 0 ||
+ strncmp(Name, ".tbss.", 6) == 0 ||
+ strncmp(Name, ".gnu.linkonce.tb.", 17) == 0 ||
+ strncmp(Name, ".llvm.linkonce.tb.", 18) == 0)
+ Flags |= SectionFlags::BSS | SectionFlags::TLS;
+ }
+
+ return Flags;
+}
+
+const Section*
+TargetAsmInfo::SectionForGlobal(const GlobalValue *GV) const {
+ const Section* S;
+ // Select section name
+ if (GV->hasSection()) {
+ // Honour section already set, if any
+ unsigned Flags = SectionFlagsForGlobal(GV,
+ GV->getSection().c_str());
+ S = getNamedSection(GV->getSection().c_str(), Flags);
+ } else {
+ // Use default section depending on the 'type' of global
+ S = SelectSectionForGlobal(GV);
+ }
+
+ return S;
+}
+
+// Lame default implementation. Calculate the section name for global.
+const Section*
+TargetAsmInfo::SelectSectionForGlobal(const GlobalValue *GV) const {
+ SectionKind::Kind Kind = SectionKindForGlobal(GV);
+
+ if (GV->isWeakForLinker()) {
+ std::string Name = UniqueSectionForGlobal(GV, Kind);
+ unsigned Flags = SectionFlagsForGlobal(GV, Name.c_str());
+ return getNamedSection(Name.c_str(), Flags);
+ } else {
+ if (Kind == SectionKind::Text)
+ return getTextSection();
+ else if (isBSS(Kind) && getBSSSection_())
+ return getBSSSection_();
+ else if (getReadOnlySection() && SectionKind::isReadOnly(Kind))
+ return getReadOnlySection();
+ }
+
+ return getDataSection();
+}
+
+// Lame default implementation. Calculate the section name for machine const.
+const Section*
+TargetAsmInfo::SelectSectionForMachineConst(const Type *Ty) const {
+ // FIXME: Support data.rel stuff someday
+ return getDataSection();
+}
+
+std::string
+TargetAsmInfo::UniqueSectionForGlobal(const GlobalValue* GV,
+ SectionKind::Kind Kind) const {
+ switch (Kind) {
+ case SectionKind::Text:
+ return ".gnu.linkonce.t." + GV->getName();
+ case SectionKind::Data:
+ return ".gnu.linkonce.d." + GV->getName();
+ case SectionKind::DataRel:
+ return ".gnu.linkonce.d.rel." + GV->getName();
+ case SectionKind::DataRelLocal:
+ return ".gnu.linkonce.d.rel.local." + GV->getName();
+ case SectionKind::DataRelRO:
+ return ".gnu.linkonce.d.rel.ro." + GV->getName();
+ case SectionKind::DataRelROLocal:
+ return ".gnu.linkonce.d.rel.ro.local." + GV->getName();
+ case SectionKind::SmallData:
+ return ".gnu.linkonce.s." + GV->getName();
+ case SectionKind::BSS:
+ return ".gnu.linkonce.b." + GV->getName();
+ case SectionKind::SmallBSS:
+ return ".gnu.linkonce.sb." + GV->getName();
+ case SectionKind::ROData:
+ case SectionKind::RODataMergeConst:
+ case SectionKind::RODataMergeStr:
+ return ".gnu.linkonce.r." + GV->getName();
+ case SectionKind::SmallROData:
+ return ".gnu.linkonce.s2." + GV->getName();
+ case SectionKind::ThreadData:
+ return ".gnu.linkonce.td." + GV->getName();
+ case SectionKind::ThreadBSS:
+ return ".gnu.linkonce.tb." + GV->getName();
+ default:
+ assert(0 && "Unknown section kind");
+ }
+ return "";
+}
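+
+// Usage sketch: a linkonce function "foo" gets ".gnu.linkonce.t.foo", while a
+// linkonce BSS global "bar" gets ".gnu.linkonce.b.bar"; SelectSectionForGlobal
+// above feeds these names straight back into getNamedSection().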
+
+const Section*
+TargetAsmInfo::getNamedSection(const char *Name, unsigned Flags,
+ bool Override) const {
+ Section& S = Sections[Name];
+
+ // This is a newly-created section; set it up properly.
+ if (S.Flags == SectionFlags::Invalid || Override) {
+ S.Flags = Flags | SectionFlags::Named;
+ S.Name = Name;
+ }
+
+ return &S;
+}
+
+const Section*
+TargetAsmInfo::getUnnamedSection(const char *Directive, unsigned Flags,
+ bool Override) const {
+ Section& S = Sections[Directive];
+
+ // This is a newly-created section; set it up properly.
+ if (S.Flags == SectionFlags::Invalid || Override) {
+ S.Flags = Flags & ~SectionFlags::Named;
+ S.Name = Directive;
+ }
+
+ return &S;
+}
+
+const std::string&
+TargetAsmInfo::getSectionFlags(unsigned Flags) const {
+ SectionFlags::FlagsStringsMapType::iterator I = FlagsStrings.find(Flags);
+
+ // We haven't printed these flags yet; print them and save them to the map.
+ // This reduces heap thrashing due to std::string construction/concatenation.
+ if (I == FlagsStrings.end())
+ I = FlagsStrings.insert(std::make_pair(Flags,
+ printSectionFlags(Flags))).first;
+
+ return I->second;
+}
+
+unsigned TargetAsmInfo::getULEB128Size(unsigned Value) {
+ unsigned Size = 0;
+ do {
+ Value >>= 7;
+ Size += sizeof(int8_t);
+ } while (Value);
+ return Size;
+}
+
+unsigned TargetAsmInfo::getSLEB128Size(int Value) {
+ unsigned Size = 0;
+ int Sign = Value >> (8 * sizeof(Value) - 1);
+ bool IsMore;
+
+ do {
+ unsigned Byte = Value & 0x7f;
+ Value >>= 7;
+ IsMore = Value != Sign || ((Byte ^ Sign) & 0x40) != 0;
+ Size += sizeof(int8_t);
+ } while (IsMore);
+ return Size;
+}
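+
+// Both helpers count one byte per 7 bits of payload. A quick sanity sketch,
+// kept compiled-out since it is purely illustrative:
+#if 0
+ assert(TargetAsmInfo::getULEB128Size(127) == 1); // fits in 7 bits
+ assert(TargetAsmInfo::getULEB128Size(128) == 2); // needs a second byte
+ assert(TargetAsmInfo::getSLEB128Size(63) == 1); // sign bit still clear
+ assert(TargetAsmInfo::getSLEB128Size(64) == 2); // bit 6 set, extra byte
+ assert(TargetAsmInfo::getSLEB128Size(-1) == 1); // all-ones fits in one byte
+#endif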
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
new file mode 100644
index 0000000..67fefbb
--- /dev/null
+++ b/lib/Target/TargetData.cpp
@@ -0,0 +1,603 @@
+//===-- TargetData.cpp - Data size & alignment routines --------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target properties related to datatype size/offset/alignment
+// information.
+//
+// This structure should be created once, filled in if the defaults are not
+// correct, and then passed around by const&. None of the member functions
+// require modification to the object.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetData.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Constants.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <cstdlib>
+using namespace llvm;
+
+// Handle the Pass registration stuff necessary to use TargetData.
+
+// Register the TargetData pass.
+static RegisterPass<TargetData> X("targetdata", "Target Data Layout", false,
+ true);
+char TargetData::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// Support for StructLayout
+//===----------------------------------------------------------------------===//
+
+StructLayout::StructLayout(const StructType *ST, const TargetData &TD) {
+ StructAlignment = 0;
+ StructSize = 0;
+ NumElements = ST->getNumElements();
+
+ // Loop over each of the elements, placing them in memory.
+ for (unsigned i = 0, e = NumElements; i != e; ++i) {
+ const Type *Ty = ST->getElementType(i);
+ unsigned TyAlign = ST->isPacked() ? 1 : TD.getABITypeAlignment(Ty);
+
+ // Add padding if necessary to align the data element properly.
+ if ((StructSize & (TyAlign-1)) != 0)
+ StructSize = TargetData::RoundUpAlignment(StructSize, TyAlign);
+
+ // Keep track of maximum alignment constraint.
+ StructAlignment = std::max(TyAlign, StructAlignment);
+
+ MemberOffsets[i] = StructSize;
+ StructSize += TD.getTypeAllocSize(Ty); // Consume space for this data item
+ }
+
+ // Empty structures have alignment of 1 byte.
+ if (StructAlignment == 0) StructAlignment = 1;
+
+ // Add padding to the end of the struct so that it can be placed in an array
+ // and all array elements will be aligned correctly.
+ if ((StructSize & (StructAlignment-1)) != 0)
+ StructSize = TargetData::RoundUpAlignment(StructSize, StructAlignment);
+}
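+
+// Worked example, assuming the default i8/i32 alignments from
+// TargetData::init(): for a non-packed { i8, i32 }, the i8 sits at offset 0,
+// padding aligns the i32 to offset 4, StructSize ends up 8 and
+// StructAlignment 4, so the struct tiles correctly in an array.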
+
+
+/// getElementContainingOffset - Given a valid offset into the structure,
+/// return the structure index that contains it.
+unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const {
+ const uint64_t *SI =
+ std::upper_bound(&MemberOffsets[0], &MemberOffsets[NumElements], Offset);
+ assert(SI != &MemberOffsets[0] && "Offset not in structure type!");
+ --SI;
+ assert(*SI <= Offset && "upper_bound didn't work");
+ assert((SI == &MemberOffsets[0] || *(SI-1) <= Offset) &&
+ (SI+1 == &MemberOffsets[NumElements] || *(SI+1) > Offset) &&
+ "Upper bound didn't work!");
+
+ // Multiple fields can have the same offset if any of them are zero sized.
+ // For example, in { i32, [0 x i32], i32 }, searching for offset 4 will stop
+ // at the i32 element, because it is the last element at that offset. This is
+ // the right one to return, because anything after it will have a higher
+ // offset, implying that this element is non-empty.
+ return SI-&MemberOffsets[0];
+}
+
+//===----------------------------------------------------------------------===//
+// TargetAlignElem, TargetAlign support
+//===----------------------------------------------------------------------===//
+
+TargetAlignElem
+TargetAlignElem::get(AlignTypeEnum align_type, unsigned char abi_align,
+ unsigned char pref_align, uint32_t bit_width) {
+ assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+ TargetAlignElem retval;
+ retval.AlignType = align_type;
+ retval.ABIAlign = abi_align;
+ retval.PrefAlign = pref_align;
+ retval.TypeBitWidth = bit_width;
+ return retval;
+}
+
+bool
+TargetAlignElem::operator==(const TargetAlignElem &rhs) const {
+ return (AlignType == rhs.AlignType
+ && ABIAlign == rhs.ABIAlign
+ && PrefAlign == rhs.PrefAlign
+ && TypeBitWidth == rhs.TypeBitWidth);
+}
+
+std::ostream &
+TargetAlignElem::dump(std::ostream &os) const {
+ return os << AlignType
+ << TypeBitWidth
+ << ":" << (int) (ABIAlign * 8)
+ << ":" << (int) (PrefAlign * 8);
+}
+
+const TargetAlignElem TargetData::InvalidAlignmentElem =
+ TargetAlignElem::get((AlignTypeEnum) -1, 0, 0, 0);
+
+//===----------------------------------------------------------------------===//
+// TargetData Class Implementation
+//===----------------------------------------------------------------------===//
+
+/*!
+ A TargetDescription string consists of a sequence of hyphen-delimited
+ specifiers for target endianness, pointer size and alignments, and various
+ primitive type sizes and alignments. A typical string looks something like:
+ <br><br>
+ "E-p:32:32:32-i1:8:8-i8:8:8-i32:32:32-i64:32:64-f32:32:32-f64:32:64"
+ <br><br>
+ (note: this string is not fully specified and is only an example.)
+ \p
+ Alignments come in two flavors: ABI and preferred. ABI alignment (abi_align,
+ below) dictates how a type will be aligned within an aggregate and when used
+ as an argument. Preferred alignment (pref_align, below) determines a type's
+ alignment when emitted as a global.
+ \p
+ Specifier string details:
+ <br><br>
+ <i>[E|e]</i>: Endianness. "E" specifies a big-endian target data model, "e"
+ specifies a little-endian target data model.
+ <br><br>
+ <i>p:@verbatim<size>:<abi_align>:<pref_align>@endverbatim</i>: Pointer size,
+ ABI and preferred alignment.
+ <br><br>
+ <i>@verbatim<type><size>:<abi_align>:<pref_align>@endverbatim</i>: Numeric type
+ alignment. Type is
+ one of <i>i|f|v|a</i>, corresponding to integer, floating point, vector (aka
+ packed) or aggregate. Size indicates the size, e.g., 32 or 64 bits.
+ \p
+ The default string, fully specified, is:
+ <br><br>
+ "E-p:64:64:64-a0:0:64-f32:32:32-f64:64:64"
+ "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64"
+ "-v64:64:64-v128:128:128"
+ <br><br>
+ Note that in the case of aggregates, 0 is the default ABI and preferred
+ alignment. This is a special case, where the aggregate's computed worst-case
+ alignment will be used.
+ */
+void TargetData::init(const std::string &TargetDescription) {
+ std::string temp = TargetDescription;
+
+ LittleEndian = false;
+ PointerMemSize = 8;
+ PointerABIAlign = 8;
+ PointerPrefAlign = PointerABIAlign;
+
+ // Default alignments
+ setAlignment(INTEGER_ALIGN, 1, 1, 1); // i1
+ setAlignment(INTEGER_ALIGN, 1, 1, 8); // i8
+ setAlignment(INTEGER_ALIGN, 2, 2, 16); // i16
+ setAlignment(INTEGER_ALIGN, 4, 4, 32); // i32
+ setAlignment(INTEGER_ALIGN, 4, 8, 64); // i64
+ setAlignment(FLOAT_ALIGN, 4, 4, 32); // float
+ setAlignment(FLOAT_ALIGN, 8, 8, 64); // double
+ setAlignment(VECTOR_ALIGN, 8, 8, 64); // v2i32
+ setAlignment(VECTOR_ALIGN, 16, 16, 128); // v16i8, v8i16, v4i32, ...
+ setAlignment(AGGREGATE_ALIGN, 0, 8, 0); // struct, union, class, ...
+
+ while (!temp.empty()) {
+ std::string token = getToken(temp, "-");
+ std::string arg0 = getToken(token, ":");
+ const char *p = arg0.c_str();
+ switch(*p) {
+ case 'E':
+ LittleEndian = false;
+ break;
+ case 'e':
+ LittleEndian = true;
+ break;
+ case 'p':
+ PointerMemSize = atoi(getToken(token,":").c_str()) / 8;
+ PointerABIAlign = atoi(getToken(token,":").c_str()) / 8;
+ PointerPrefAlign = atoi(getToken(token,":").c_str()) / 8;
+ if (PointerPrefAlign == 0)
+ PointerPrefAlign = PointerABIAlign;
+ break;
+ case 'i':
+ case 'v':
+ case 'f':
+ case 'a':
+ case 's': {
+ AlignTypeEnum align_type = STACK_ALIGN; // Dummy init, silence warning
+ switch(*p) {
+ case 'i': align_type = INTEGER_ALIGN; break;
+ case 'v': align_type = VECTOR_ALIGN; break;
+ case 'f': align_type = FLOAT_ALIGN; break;
+ case 'a': align_type = AGGREGATE_ALIGN; break;
+ case 's': align_type = STACK_ALIGN; break;
+ }
+ uint32_t size = (uint32_t) atoi(++p);
+ unsigned char abi_align = atoi(getToken(token, ":").c_str()) / 8;
+ unsigned char pref_align = atoi(getToken(token, ":").c_str()) / 8;
+ if (pref_align == 0)
+ pref_align = abi_align;
+ setAlignment(align_type, abi_align, pref_align, size);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
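+
+// Parsing sketch: given "e-p:32:32:32-i64:64:64", the loop above sets
+// LittleEndian to true, the three pointer fields to 4 bytes, and overrides
+// the i64 entry to an 8-byte ABI and preferred alignment. Unrecognized
+// specifiers silently fall through the default case.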
+
+TargetData::TargetData(const Module *M)
+ : ImmutablePass(&ID) {
+ init(M->getDataLayout());
+}
+
+void
+TargetData::setAlignment(AlignTypeEnum align_type, unsigned char abi_align,
+ unsigned char pref_align, uint32_t bit_width) {
+ assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+ for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+ if (Alignments[i].AlignType == align_type &&
+ Alignments[i].TypeBitWidth == bit_width) {
+ // Update the abi, preferred alignments.
+ Alignments[i].ABIAlign = abi_align;
+ Alignments[i].PrefAlign = pref_align;
+ return;
+ }
+ }
+
+ Alignments.push_back(TargetAlignElem::get(align_type, abi_align,
+ pref_align, bit_width));
+}
+
+/// getAlignmentInfo - Return the alignment (either ABI if ABIInfo = true or
+/// preferred if ABIInfo = false) the target wants for the specified datatype.
+unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
+ uint32_t BitWidth, bool ABIInfo,
+ const Type *Ty) const {
+ // Check to see if we have an exact match and remember the best match we see.
+ int BestMatchIdx = -1;
+ int LargestInt = -1;
+ for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+ if (Alignments[i].AlignType == AlignType &&
+ Alignments[i].TypeBitWidth == BitWidth)
+ return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
+
+ // The best match so far depends on what we're looking for.
+ if (AlignType == VECTOR_ALIGN && Alignments[i].AlignType == VECTOR_ALIGN) {
+ // If this is a specification for a smaller vector type, we will fall back
+ // to it. This happens because <128 x double> can be implemented in terms
+ // of 64 <2 x double>.
+ if (Alignments[i].TypeBitWidth < BitWidth) {
+ // Verify that we pick the biggest of the fallbacks.
+ if (BestMatchIdx == -1 ||
+ Alignments[BestMatchIdx].TypeBitWidth < Alignments[i].TypeBitWidth)
+ BestMatchIdx = i;
+ }
+ } else if (AlignType == INTEGER_ALIGN &&
+ Alignments[i].AlignType == INTEGER_ALIGN) {
+ // The "best match" for integers is the smallest size that is larger than
+ // the BitWidth requested.
+ if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 ||
+ Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth))
+ BestMatchIdx = i;
+ // However, if there isn't one that's larger, then we must use the
+ // largest one we have (see below)
+ if (LargestInt == -1 ||
+ Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth)
+ LargestInt = i;
+ }
+ }
+
+ // Okay, we didn't find an exact solution. Fall back here depending on what
+ // is being looked for.
+ if (BestMatchIdx == -1) {
+ // If we didn't find an integer alignment, fall back on the most conservative one.
+ if (AlignType == INTEGER_ALIGN) {
+ BestMatchIdx = LargestInt;
+ } else {
+ assert(AlignType == VECTOR_ALIGN && "Unknown alignment type!");
+
+ // If we didn't find a vector size that is smaller or equal to this type,
+ // then we will end up scalarizing this to its element type. Just return
+ // the alignment of the element.
+ return getAlignment(cast<VectorType>(Ty)->getElementType(), ABIInfo);
+ }
+ }
+
+ // Since we got a "best match" index, just return it.
+ return ABIInfo ? Alignments[BestMatchIdx].ABIAlign
+ : Alignments[BestMatchIdx].PrefAlign;
+}
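+
+// Fallback example: with only the default entries registered, a query for
+// INTEGER_ALIGN at 36 bits has no exact match, so the smallest larger entry
+// (i64) wins, giving ABI alignment 4 and preferred alignment 8. A 128-bit
+// query finds nothing larger and falls back to the largest entry, again i64.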
+
+namespace {
+
+/// LayoutInfo - The lazy cache of structure layout information maintained by
+/// TargetData. Note that the struct types must have been freed before
+/// llvm_shutdown is called (and thus this is deallocated) because all the
+/// targets with cached elements should have been destroyed.
+///
+typedef std::pair<const TargetData*,const StructType*> LayoutKey;
+
+struct DenseMapLayoutKeyInfo {
+ static inline LayoutKey getEmptyKey() { return LayoutKey(0, 0); }
+ static inline LayoutKey getTombstoneKey() {
+ return LayoutKey((TargetData*)(intptr_t)-1, 0);
+ }
+ static unsigned getHashValue(const LayoutKey &Val) {
+ return DenseMapInfo<void*>::getHashValue(Val.first) ^
+ DenseMapInfo<void*>::getHashValue(Val.second);
+ }
+ static bool isEqual(const LayoutKey &LHS, const LayoutKey &RHS) {
+ return LHS == RHS;
+ }
+
+ static bool isPod() { return true; }
+};
+
+typedef DenseMap<LayoutKey, StructLayout*, DenseMapLayoutKeyInfo> LayoutInfoTy;
+
+}
+
+static ManagedStatic<LayoutInfoTy> LayoutInfo;
+
+TargetData::~TargetData() {
+ if (!LayoutInfo.isConstructed())
+ return;
+
+ // Remove any layouts for this TD.
+ LayoutInfoTy &TheMap = *LayoutInfo;
+ for (LayoutInfoTy::iterator I = TheMap.begin(), E = TheMap.end(); I != E; ) {
+ if (I->first.first == this) {
+ I->second->~StructLayout();
+ free(I->second);
+ TheMap.erase(I++);
+ } else {
+ ++I;
+ }
+ }
+}
+
+const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
+ LayoutInfoTy &TheMap = *LayoutInfo;
+
+ StructLayout *&SL = TheMap[LayoutKey(this, Ty)];
+ if (SL) return SL;
+
+ // Otherwise, create the struct layout. Because it is variable length, we
+ // malloc it, then use placement new.
+ int NumElts = Ty->getNumElements();
+ StructLayout *L =
+ (StructLayout *)malloc(sizeof(StructLayout)+(NumElts-1)*sizeof(uint64_t));
+
+ // Set SL before calling StructLayout's ctor. The ctor could cause other
+ // entries to be added to TheMap, invalidating our reference.
+ SL = L;
+
+ new (L) StructLayout(Ty, *this);
+ return L;
+}
+
+/// InvalidateStructLayoutInfo - TargetData speculatively caches StructLayout
+/// objects. If a TargetData object is alive when types are being refined and
+/// removed, this method must be called whenever a StructType is removed to
+/// avoid a dangling pointer in this cache.
+void TargetData::InvalidateStructLayoutInfo(const StructType *Ty) const {
+ if (!LayoutInfo.isConstructed()) return; // No cache.
+
+ LayoutInfoTy::iterator I = LayoutInfo->find(LayoutKey(this, Ty));
+ if (I == LayoutInfo->end()) return;
+
+ I->second->~StructLayout();
+ free(I->second);
+ LayoutInfo->erase(I);
+}
+
+
+std::string TargetData::getStringRepresentation() const {
+ std::string repr;
+ repr.append(LittleEndian ? "e" : "E");
+ repr.append("-p:").append(itostr((int64_t) (PointerMemSize * 8))).
+ append(":").append(itostr((int64_t) (PointerABIAlign * 8))).
+ append(":").append(itostr((int64_t) (PointerPrefAlign * 8)));
+ for (align_const_iterator I = Alignments.begin();
+ I != Alignments.end();
+ ++I) {
+ repr.append("-").append(1, (char) I->AlignType).
+ append(utostr((int64_t) I->TypeBitWidth)).
+ append(":").append(utostr((uint64_t) (I->ABIAlign * 8))).
+ append(":").append(utostr((uint64_t) (I->PrefAlign * 8)));
+ }
+ return repr;
+}
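+
+// Round-trip sketch: a default-initialized TargetData prints something like
+// "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-..." (the exact
+// order follows the Alignments vector), and feeding that string back into
+// init() reproduces the same layout.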
+
+
+uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const {
+ assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
+ switch (Ty->getTypeID()) {
+ case Type::LabelTyID:
+ case Type::PointerTyID:
+ return getPointerSizeInBits();
+ case Type::ArrayTyID: {
+ const ArrayType *ATy = cast<ArrayType>(Ty);
+ return getTypeAllocSizeInBits(ATy->getElementType())*ATy->getNumElements();
+ }
+ case Type::StructTyID:
+ // Get the layout annotation... which is lazily created on demand.
+ return getStructLayout(cast<StructType>(Ty))->getSizeInBits();
+ case Type::IntegerTyID:
+ return cast<IntegerType>(Ty)->getBitWidth();
+ case Type::VoidTyID:
+ return 8;
+ case Type::FloatTyID:
+ return 32;
+ case Type::DoubleTyID:
+ return 64;
+ case Type::PPC_FP128TyID:
+ case Type::FP128TyID:
+ return 128;
+ // In memory objects this is always aligned to a higher boundary, but
+ // only 80 bits contain information.
+ case Type::X86_FP80TyID:
+ return 80;
+ case Type::VectorTyID:
+ return cast<VectorType>(Ty)->getBitWidth();
+ default:
+ assert(0 && "TargetData::getTypeSizeInBits(): Unsupported type");
+ break;
+ }
+ return 0;
+}
+
+/*!
+ \param abi_or_pref Flag that determines which alignment is returned. true
+ returns the ABI alignment, false returns the preferred alignment.
+ \param Ty The underlying type for which alignment is determined.
+
+ Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref
+ == false) for the requested type \a Ty.
+ */
+unsigned char TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const {
+ int AlignType = -1;
+
+ assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
+ switch (Ty->getTypeID()) {
+ // Early escape for the non-numeric types.
+ case Type::LabelTyID:
+ case Type::PointerTyID:
+ return (abi_or_pref
+ ? getPointerABIAlignment()
+ : getPointerPrefAlignment());
+ case Type::ArrayTyID:
+ return getAlignment(cast<ArrayType>(Ty)->getElementType(), abi_or_pref);
+
+ case Type::StructTyID: {
+ // Packed structure types always have an ABI alignment of one.
+ if (cast<StructType>(Ty)->isPacked() && abi_or_pref)
+ return 1;
+
+ // Get the layout annotation... which is lazily created on demand.
+ const StructLayout *Layout = getStructLayout(cast<StructType>(Ty));
+ unsigned Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref, Ty);
+ return std::max(Align, (unsigned)Layout->getAlignment());
+ }
+ case Type::IntegerTyID:
+ case Type::VoidTyID:
+ AlignType = INTEGER_ALIGN;
+ break;
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ // PPC_FP128TyID and FP128TyID have different data contents, but the
+ // same size and alignment, so they look the same here.
+ case Type::PPC_FP128TyID:
+ case Type::FP128TyID:
+ case Type::X86_FP80TyID:
+ AlignType = FLOAT_ALIGN;
+ break;
+ case Type::VectorTyID:
+ AlignType = VECTOR_ALIGN;
+ break;
+ default:
+ assert(0 && "Bad type for getAlignment!!!");
+ break;
+ }
+
+ return getAlignmentInfo((AlignTypeEnum)AlignType, getTypeSizeInBits(Ty),
+ abi_or_pref, Ty);
+}
+
+unsigned char TargetData::getABITypeAlignment(const Type *Ty) const {
+ return getAlignment(Ty, true);
+}
+
+unsigned char TargetData::getCallFrameTypeAlignment(const Type *Ty) const {
+ for (unsigned i = 0, e = Alignments.size(); i != e; ++i)
+ if (Alignments[i].AlignType == STACK_ALIGN)
+ return Alignments[i].ABIAlign;
+
+ return getABITypeAlignment(Ty);
+}
+
+unsigned char TargetData::getPrefTypeAlignment(const Type *Ty) const {
+ return getAlignment(Ty, false);
+}
+
+unsigned char TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const {
+ unsigned Align = (unsigned) getPrefTypeAlignment(Ty);
+ assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
+ return Log2_32(Align);
+}
+
+/// getIntPtrType - Return an unsigned integer type that is the same size as,
+/// or greater than, the target's pointer size.
+const IntegerType *TargetData::getIntPtrType() const {
+ return IntegerType::get(getPointerSizeInBits());
+}
+
+
+uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
+ unsigned NumIndices) const {
+ const Type *Ty = ptrTy;
+ assert(isa<PointerType>(Ty) && "Illegal argument for getIndexedOffset()");
+ uint64_t Result = 0;
+
+ generic_gep_type_iterator<Value* const*>
+ TI = gep_type_begin(ptrTy, Indices, Indices+NumIndices);
+ for (unsigned CurIDX = 0; CurIDX != NumIndices; ++CurIDX, ++TI) {
+ if (const StructType *STy = dyn_cast<StructType>(*TI)) {
+ assert(Indices[CurIDX]->getType() == Type::Int32Ty &&
+ "Illegal struct idx");
+ unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue();
+
+ // Get structure layout information...
+ const StructLayout *Layout = getStructLayout(STy);
+
+ // Add in the offset, as calculated by the structure layout info...
+ Result += Layout->getElementOffset(FieldNo);
+
+ // Update Ty to refer to current element
+ Ty = STy->getElementType(FieldNo);
+ } else {
+ // Update Ty to refer to current element
+ Ty = cast<SequentialType>(Ty)->getElementType();
+
+ // Get the array index and the size of each array element.
+ int64_t arrayIdx = cast<ConstantInt>(Indices[CurIDX])->getSExtValue();
+ Result += arrayIdx * (int64_t)getTypeAllocSize(Ty);
+ }
+ }
+
+ return Result;
+}
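+
+// Worked example under the default layout: a getelementptr on a
+// { i32, double }* with indices [0, 1] steps over zero whole structs
+// (0 * 16 bytes) and then selects field 1, whose StructLayout offset is 8,
+// so the result is 8.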
+
+/// getPreferredAlignment - Return the preferred alignment of the specified
+/// global. This includes an explicitly requested alignment (if the global
+/// has one).
+unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
+ const Type *ElemType = GV->getType()->getElementType();
+ unsigned Alignment = getPrefTypeAlignment(ElemType);
+ if (GV->getAlignment() > Alignment)
+ Alignment = GV->getAlignment();
+
+ if (GV->hasInitializer()) {
+ if (Alignment < 16) {
+ // If the global is not external, see if it is large. If so, give it a
+ // larger alignment.
+ if (getTypeSizeInBits(ElemType) > 128)
+ Alignment = 16; // 16-byte alignment.
+ }
+ }
+ return Alignment;
+}
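+
+// Example: a [64 x i32] global with an initializer and no explicit alignment
+// has a preferred type alignment of 4, but since it is wider than 128 bits
+// the rule above bumps it to 16 bytes.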
+
+/// getPreferredAlignmentLog - Return the preferred alignment of the
+/// specified global, returned in log form. This includes an explicitly
+/// requested alignment (if the global has one).
+unsigned TargetData::getPreferredAlignmentLog(const GlobalVariable *GV) const {
+ return Log2_32(getPreferredAlignment(GV));
+}
diff --git a/lib/Target/TargetFrameInfo.cpp b/lib/Target/TargetFrameInfo.cpp
new file mode 100644
index 0000000..873d60a
--- /dev/null
+++ b/lib/Target/TargetFrameInfo.cpp
@@ -0,0 +1,19 @@
+//===-- TargetFrameInfo.cpp - Implement machine frame interface -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the layout of a stack frame on the target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetFrameInfo.h"
+#include <cstdlib>
+using namespace llvm;
+
+TargetFrameInfo::~TargetFrameInfo() {
+}
diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp
new file mode 100644
index 0000000..ceaea0c
--- /dev/null
+++ b/lib/Target/TargetInstrInfo.cpp
@@ -0,0 +1,50 @@
+//===-- TargetInstrInfo.cpp - Target Instruction Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Constant.h"
+#include "llvm/DerivedTypes.h"
+using namespace llvm;
+
+TargetInstrInfo::TargetInstrInfo(const TargetInstrDesc* Desc,
+ unsigned numOpcodes)
+ : Descriptors(Desc), NumOpcodes(numOpcodes) {
+}
+
+TargetInstrInfo::~TargetInstrInfo() {
+}
+
+bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (TID.isBranch() && !TID.isBarrier())
+ return true;
+ if (!TID.isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+/// getInstrOperandRegClass - Return register class of the operand of an
+/// instruction of the specified TargetInstrDesc.
+const TargetRegisterClass*
+llvm::getInstrOperandRegClass(const TargetRegisterInfo *TRI,
+ const TargetInstrDesc &II, unsigned Op) {
+ if (Op >= II.getNumOperands())
+ return NULL;
+ if (II.OpInfo[Op].isLookupPtrRegClass())
+ return TRI->getPointerRegClass();
+ return TRI->getRegClass(II.OpInfo[Op].RegClass);
+}
diff --git a/lib/Target/TargetIntrinsicInfo.cpp b/lib/Target/TargetIntrinsicInfo.cpp
new file mode 100644
index 0000000..d8da08e
--- /dev/null
+++ b/lib/Target/TargetIntrinsicInfo.cpp
@@ -0,0 +1,22 @@
+//===-- TargetIntrinsicInfo.cpp - Target Instruction Information ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetIntrinsicInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+using namespace llvm;
+
+TargetIntrinsicInfo::TargetIntrinsicInfo(const char **desc, unsigned count)
+ : Intrinsics(desc), NumIntrinsics(count) {
+}
+
+TargetIntrinsicInfo::~TargetIntrinsicInfo() {
+}
diff --git a/lib/Target/TargetMachOWriterInfo.cpp b/lib/Target/TargetMachOWriterInfo.cpp
new file mode 100644
index 0000000..d608119
--- /dev/null
+++ b/lib/Target/TargetMachOWriterInfo.cpp
@@ -0,0 +1,25 @@
+//===-- TargetMachOWriterInfo.cpp - MachO Writer Info ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TargetMachOWriterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachOWriterInfo.h"
+#include "llvm/CodeGen/MachineRelocation.h"
+using namespace llvm;
+
+TargetMachOWriterInfo::~TargetMachOWriterInfo() {}
+
+MachineRelocation
+TargetMachOWriterInfo::GetJTRelocation(unsigned Offset,
+ MachineBasicBlock *MBB) const {
+ // FIXME: do something about PIC
+ return MachineRelocation::getBB(Offset, MachineRelocation::VANILLA, MBB);
+}
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
new file mode 100644
index 0000000..1b042dd
--- /dev/null
+++ b/lib/Target/TargetMachine.cpp
@@ -0,0 +1,229 @@
+//===-- TargetMachine.cpp - General Target Information ---------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// Command-line options that tend to be useful on more than one back-end.
+//
+
+namespace llvm {
+ bool LessPreciseFPMADOption;
+ bool PrintMachineCode;
+ bool NoFramePointerElim;
+ bool NoExcessFPPrecision;
+ bool UnsafeFPMath;
+ bool FiniteOnlyFPMathOption;
+ bool HonorSignDependentRoundingFPMathOption;
+ bool UseSoftFloat;
+ bool NoImplicitFloat;
+ bool NoZerosInBSS;
+ bool ExceptionHandling;
+ bool UnwindTablesMandatory;
+ Reloc::Model RelocationModel;
+ CodeModel::Model CMModel;
+ bool PerformTailCallOpt;
+ unsigned StackAlignment;
+ bool RealignStack;
+ bool DisableJumpTables;
+ bool StrongPHIElim;
+ bool DisableRedZone;
+ bool AsmVerbosityDefault(false);
+}
+
+static cl::opt<bool, true>
+PrintCode("print-machineinstrs",
+ cl::desc("Print generated machine code"),
+ cl::location(PrintMachineCode), cl::init(false));
+static cl::opt<bool, true>
+DisableFPElim("disable-fp-elim",
+ cl::desc("Disable frame pointer elimination optimization"),
+ cl::location(NoFramePointerElim),
+ cl::init(false));
+static cl::opt<bool, true>
+DisableExcessPrecision("disable-excess-fp-precision",
+ cl::desc("Disable optimizations that may increase FP precision"),
+ cl::location(NoExcessFPPrecision),
+ cl::init(false));
+static cl::opt<bool, true>
+EnableFPMAD("enable-fp-mad",
+ cl::desc("Enable less precise MAD instructions to be generated"),
+ cl::location(LessPreciseFPMADOption),
+ cl::init(false));
+static cl::opt<bool, true>
+EnableUnsafeFPMath("enable-unsafe-fp-math",
+ cl::desc("Enable optimizations that may decrease FP precision"),
+ cl::location(UnsafeFPMath),
+ cl::init(false));
+static cl::opt<bool, true>
+EnableFiniteOnlyFPMath("enable-finite-only-fp-math",
+ cl::desc("Enable optimizations that assume no NaNs / +-Infs"),
+ cl::location(FiniteOnlyFPMathOption),
+ cl::init(false));
+static cl::opt<bool, true>
+EnableHonorSignDependentRoundingFPMath("enable-sign-dependent-rounding-fp-math",
+ cl::Hidden,
+ cl::desc("Force codegen to assume rounding mode can change dynamically"),
+ cl::location(HonorSignDependentRoundingFPMathOption),
+ cl::init(false));
+static cl::opt<bool, true>
+GenerateSoftFloatCalls("soft-float",
+ cl::desc("Generate software floating point library calls"),
+ cl::location(UseSoftFloat),
+ cl::init(false));
+static cl::opt<bool, true>
+GenerateNoImplicitFloats("no-implicit-float",
+ cl::desc("Don't generate implicit floating point instructions (x86-only)"),
+ cl::location(NoImplicitFloat),
+ cl::init(false));
+static cl::opt<bool, true>
+DontPlaceZerosInBSS("nozero-initialized-in-bss",
+ cl::desc("Don't place zero-initialized symbols into bss section"),
+ cl::location(NoZerosInBSS),
+ cl::init(false));
+static cl::opt<bool, true>
+EnableExceptionHandling("enable-eh",
+ cl::desc("Emit DWARF exception handling (default if target supports)"),
+ cl::location(ExceptionHandling),
+ cl::init(false));
+static cl::opt<bool, true>
+EnableUnwindTables("unwind-tables",
+ cl::desc("Generate unwinding tables for all functions"),
+ cl::location(UnwindTablesMandatory),
+ cl::init(false));
+
+static cl::opt<llvm::Reloc::Model, true>
+DefRelocationModel("relocation-model",
+ cl::desc("Choose relocation model"),
+ cl::location(RelocationModel),
+ cl::init(Reloc::Default),
+ cl::values(
+ clEnumValN(Reloc::Default, "default",
+ "Target default relocation model"),
+ clEnumValN(Reloc::Static, "static",
+ "Non-relocatable code"),
+ clEnumValN(Reloc::PIC_, "pic",
+ "Fully relocatable, position independent code"),
+ clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
+ "Relocatable external references, non-relocatable code"),
+ clEnumValEnd));
+static cl::opt<llvm::CodeModel::Model, true>
+DefCodeModel("code-model",
+ cl::desc("Choose code model"),
+ cl::location(CMModel),
+ cl::init(CodeModel::Default),
+ cl::values(
+ clEnumValN(CodeModel::Default, "default",
+ "Target default code model"),
+ clEnumValN(CodeModel::Small, "small",
+ "Small code model"),
+ clEnumValN(CodeModel::Kernel, "kernel",
+ "Kernel code model"),
+ clEnumValN(CodeModel::Medium, "medium",
+ "Medium code model"),
+ clEnumValN(CodeModel::Large, "large",
+ "Large code model"),
+ clEnumValEnd));
+static cl::opt<bool, true>
+EnablePerformTailCallOpt("tailcallopt",
+ cl::desc("Turn on tail call optimization."),
+ cl::location(PerformTailCallOpt),
+ cl::init(false));
+static cl::opt<unsigned, true>
+OverrideStackAlignment("stack-alignment",
+ cl::desc("Override default stack alignment"),
+ cl::location(StackAlignment),
+ cl::init(0));
+static cl::opt<bool, true>
+EnableRealignStack("realign-stack",
+ cl::desc("Realign stack if needed"),
+ cl::location(RealignStack),
+ cl::init(true));
+static cl::opt<bool, true>
+DisableSwitchTables(cl::Hidden, "disable-jump-tables",
+ cl::desc("Do not generate jump tables."),
+ cl::location(DisableJumpTables),
+ cl::init(false));
+static cl::opt<bool, true>
+EnableStrongPHIElim(cl::Hidden, "strong-phi-elim",
+ cl::desc("Use strong PHI elimination."),
+ cl::location(StrongPHIElim),
+ cl::init(false));
+static cl::opt<bool, true>
+DisableRedZoneOption("disable-red-zone",
+ cl::desc("Do not emit code that uses the red zone."),
+ cl::location(DisableRedZone),
+ cl::init(false));
+
+//---------------------------------------------------------------------------
+// TargetMachine Class
+//
+
+TargetMachine::~TargetMachine() {
+ delete AsmInfo;
+}
+
+/// getRelocationModel - Returns the code generation relocation model. The
+/// choices are static, PIC, dynamic-no-pic, and target default.
+Reloc::Model TargetMachine::getRelocationModel() {
+ return RelocationModel;
+}
+
+/// setRelocationModel - Sets the code generation relocation model.
+void TargetMachine::setRelocationModel(Reloc::Model Model) {
+ RelocationModel = Model;
+}
+
+/// getCodeModel - Returns the code model. The choices are small, kernel,
+/// medium, large, and target default.
+CodeModel::Model TargetMachine::getCodeModel() {
+ return CMModel;
+}
+
+/// setCodeModel - Sets the code model.
+void TargetMachine::setCodeModel(CodeModel::Model Model) {
+ CMModel = Model;
+}
+
+bool TargetMachine::getAsmVerbosityDefault() {
+ return AsmVerbosityDefault;
+}
+
+void TargetMachine::setAsmVerbosityDefault(bool V) {
+ AsmVerbosityDefault = V;
+}
+
+namespace llvm {
+ /// LessPreciseFPMAD - This returns true when the -enable-fp-mad option
+ /// is specified on the command line. When this flag is off (default), the
+ /// code generator is not allowed to generate mad (multiply add) if the
+ /// result is "less precise" than doing those operations individually.
+ bool LessPreciseFPMAD() { return UnsafeFPMath || LessPreciseFPMADOption; }
+
+ /// FiniteOnlyFPMath - This returns true when the -enable-finite-only-fp-math
+ /// option is specified on the command line. If this returns false (default),
+ /// the code generator is not allowed to assume that FP arithmetic arguments
+ /// and results are never NaNs or +-Infs.
+ bool FiniteOnlyFPMath() { return UnsafeFPMath || FiniteOnlyFPMathOption; }
+
+ /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
+ /// that the rounding mode of the FPU can change from its default.
+ bool HonorSignDependentRoundingFPMath() {
+ return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
+ }
+}
+
diff --git a/lib/Target/TargetMachineRegistry.cpp b/lib/Target/TargetMachineRegistry.cpp
new file mode 100644
index 0000000..c1a4777
--- /dev/null
+++ b/lib/Target/TargetMachineRegistry.cpp
@@ -0,0 +1,78 @@
+//===-- TargetMachineRegistry.cpp - Target Auto Registration Impl ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes the RegisterTarget class, which TargetMachine
+// implementations should use to register themselves with the system. This file
+// also exposes the TargetMachineRegistry class, which allows tools to inspect
+// all of the registered targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachineRegistry.h"
+#include <algorithm>
+using namespace llvm;
+
+/// getClosestStaticTargetForModule - Given an LLVM module, pick the best target
+/// that is compatible with the module. If no close target can be found, this
+/// returns null and sets the Error string to a reason.
+const TargetMachineRegistry::entry *
+TargetMachineRegistry::getClosestStaticTargetForModule(const Module &M,
+ std::string &Error) {
+ std::vector<std::pair<unsigned, const entry *> > UsableTargets;
+ for (Registry<TargetMachine>::iterator I = begin(), E = end(); I != E; ++I)
+ if (unsigned Qual = I->ModuleMatchQualityFn(M))
+ UsableTargets.push_back(std::make_pair(Qual, &*I));
+
+ if (UsableTargets.empty()) {
+ Error = "No available targets are compatible with this module";
+ return 0;
+ } else if (UsableTargets.size() == 1)
+ return UsableTargets.back().second;
+
+ // Otherwise, take the best target, but make sure we don't have two equally
+ // good best targets.
+ std::sort(UsableTargets.begin(), UsableTargets.end());
+ if (UsableTargets.back().first ==
+ UsableTargets[UsableTargets.size()-2].first) {
+ Error = "Cannot choose between targets \"" +
+ std::string(UsableTargets.back().second->Name) + "\" and \"" +
+ std::string(UsableTargets[UsableTargets.size()-2].second->Name) + "\"";
+ return 0;
+ }
+ return UsableTargets.back().second;
+}
+
+/// getClosestTargetForJIT - Pick the best target that is compatible with
+/// the current host. If no close target can be found, this returns null
+/// and sets the Error string to a reason.
+const TargetMachineRegistry::entry *
+TargetMachineRegistry::getClosestTargetForJIT(std::string &Error) {
+ std::vector<std::pair<unsigned, const entry *> > UsableTargets;
+ for (Registry<TargetMachine>::iterator I = begin(), E = end(); I != E; ++I)
+ if (unsigned Qual = I->JITMatchQualityFn())
+ UsableTargets.push_back(std::make_pair(Qual, &*I));
+
+ if (UsableTargets.empty()) {
+ Error = "No JIT is available for this host";
+ return 0;
+ } else if (UsableTargets.size() == 1)
+ return UsableTargets.back().second;
+
+ // Otherwise, take the best target. If there is a tie, just pick one.
+ unsigned MaxQual = UsableTargets.front().first;
+ const entry *MaxQualTarget = UsableTargets.front().second;
+
+ for (unsigned i = 1, e = UsableTargets.size(); i != e; ++i)
+ if (UsableTargets[i].first > MaxQual) {
+ MaxQual = UsableTargets[i].first;
+ MaxQualTarget = UsableTargets[i].second;
+ }
+
+ return MaxQualTarget;
+}
+
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
new file mode 100644
index 0000000..a84fdaa
--- /dev/null
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -0,0 +1,144 @@
+//===- TargetRegisterInfo.cpp - Target Register Information Implementation ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetRegisterInfo interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/ADT/BitVector.h"
+
+using namespace llvm;
+
+TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterDesc *D, unsigned NR,
+ regclass_iterator RCB, regclass_iterator RCE,
+ int CFSO, int CFDO,
+ const unsigned* subregs, const unsigned subregsize,
+ const unsigned* superregs, const unsigned superregsize,
+ const unsigned* aliases, const unsigned aliasessize)
+ : SubregHash(subregs), SubregHashSize(subregsize),
+ SuperregHash(superregs), SuperregHashSize(superregsize),
+ AliasesHash(aliases), AliasesHashSize(aliasessize),
+ Desc(D), NumRegs(NR), RegClassBegin(RCB), RegClassEnd(RCE) {
+ assert(NumRegs < FirstVirtualRegister &&
+ "Target has too many physical registers!");
+
+ CallFrameSetupOpcode = CFSO;
+ CallFrameDestroyOpcode = CFDO;
+}
+
+TargetRegisterInfo::~TargetRegisterInfo() {}
+
+/// getPhysicalRegisterRegClass - Returns the Register Class of a physical
+/// register of the given type. If type is MVT::Other, then just return any
+/// register class the register belongs to.
+const TargetRegisterClass *
+TargetRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, MVT VT) const {
+ assert(isPhysicalRegister(reg) && "reg must be a physical register");
+
+ // Pick the most inclusive (super) register class of the right type that
+ // contains this physreg.
+ const TargetRegisterClass* BestRC = 0;
+ for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){
+ const TargetRegisterClass* RC = *I;
+ if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
+ (!BestRC || BestRC->hasSuperClass(RC)))
+ BestRC = RC;
+ }
+
+ assert(BestRC && "Couldn't find the register class");
+ return BestRC;
+}
+
+/// getAllocatableSetForRC - Set the bits that represent allocatable
+/// registers for the specified register class.
+static void getAllocatableSetForRC(MachineFunction &MF,
+ const TargetRegisterClass *RC, BitVector &R){
+ for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF),
+ E = RC->allocation_order_end(MF); I != E; ++I)
+ R.set(*I);
+}
+
+BitVector TargetRegisterInfo::getAllocatableSet(MachineFunction &MF,
+ const TargetRegisterClass *RC) const {
+ BitVector Allocatable(NumRegs);
+ if (RC) {
+ getAllocatableSetForRC(MF, RC, Allocatable);
+ return Allocatable;
+ }
+
+ for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
+ E = regclass_end(); I != E; ++I)
+ getAllocatableSetForRC(MF, *I, Allocatable);
+ return Allocatable;
+}
+
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index. This is the default implementation
+/// which is likely incorrect for the target.
+int TargetRegisterInfo::getFrameIndexOffset(MachineFunction &MF, int FI) const {
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ return MFI->getObjectOffset(FI) + MFI->getStackSize() -
+ TFI.getOffsetOfLocalArea() + MFI->getOffsetAdjustment();
+}
+
+/// getInitialFrameState - Returns a list of machine moves that are assumed
+/// on entry to a function.
+void
+TargetRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const {
+ // Default is to do nothing.
+}
+
+const TargetRegisterClass *
+llvm::getCommonSubClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B) {
+ // First take care of the trivial cases
+ if (A == B)
+ return A;
+ if (!A || !B)
+ return 0;
+
+ // If B is a subclass of A, it will be handled in the loop below
+ if (B->hasSubClass(A))
+ return A;
+
+ const TargetRegisterClass *Best = 0;
+ for (TargetRegisterClass::sc_iterator I = A->subclasses_begin();
+ const TargetRegisterClass *X = *I; ++I) {
+ if (X == B)
+ return B; // B is a subclass of A
+
+ // X must be a common subclass of A and B
+ if (!B->hasSubClass(X))
+ continue;
+
+ // A superclass is definitely better.
+ if (!Best || Best->hasSuperClass(X)) {
+ Best = X;
+ continue;
+ }
+
+ // A subclass is definitely worse
+ if (Best->hasSubClass(X))
+ continue;
+
+ // Best and X have no super/sub class relation - pick the larger class, or
+ // the smaller spill size.
+ int nb = std::distance(Best->begin(), Best->end());
+ int ni = std::distance(X->begin(), X->end());
+ if (ni>nb || (ni==nb && X->getSize() < Best->getSize()))
+ Best = X;
+ }
+ return Best;
+}
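+
+// Illustrative behaviour, using x86 classes as an assumed example:
+// getCommonSubClass(GR32, GR32) returns GR32; when one class contains the
+// other (say GR32 and GR32_ABCD), the contained class is returned; unrelated
+// classes with no shared subclass yield null.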
diff --git a/lib/Target/TargetSubtarget.cpp b/lib/Target/TargetSubtarget.cpp
new file mode 100644
index 0000000..95c92ca
--- /dev/null
+++ b/lib/Target/TargetSubtarget.cpp
@@ -0,0 +1,22 @@
+//===-- TargetSubtarget.cpp - General Target Information -------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetSubtarget.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// TargetSubtarget Class
+//
+TargetSubtarget::TargetSubtarget() {}
+
+TargetSubtarget::~TargetSubtarget() {}
diff --git a/lib/Target/X86/AsmPrinter/CMakeLists.txt b/lib/Target/X86/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..dbd03d8
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,11 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_partially_linked_object(LLVMX86AsmPrinter
+ X86ATTAsmPrinter.cpp
+ X86AsmPrinter.cpp
+ X86IntelAsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMX86CodeGen n)
+
+add_dependencies(LLVMX86AsmPrinter ${n})
diff --git a/lib/Target/X86/AsmPrinter/Makefile b/lib/Target/X86/AsmPrinter/Makefile
new file mode 100644
index 0000000..ba89ac6
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/X86/AsmPrinter/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86AsmPrinter
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.cpp
new file mode 100644
index 0000000..8afe2ea
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.cpp
@@ -0,0 +1,1075 @@
+//===-- X86ATTAsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to AT&T format assembly
+// language. This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTAsmPrinter.h"
+#include "X86.h"
+#include "X86COFF.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "X86TargetAsmInfo.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+static std::string getPICLabelString(unsigned FnNum,
+ const TargetAsmInfo *TAI,
+ const X86Subtarget* Subtarget) {
+ std::string label;
+ if (Subtarget->isTargetDarwin())
+ label = "\"L" + utostr_32(FnNum) + "$pb\"";
+ else if (Subtarget->isTargetELF())
+ label = ".Lllvm$" + utostr_32(FnNum) + ".$piclabel";
+ else
+ assert(0 && "Don't know how to print PIC label!\n");
+
+ return label;
+}
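+
+// Example outputs: for function number 5 this yields "\"L5$pb\"" (quotes
+// included) on Darwin and ".Lllvm$5.$piclabel" on ELF; any other target
+// trips the assertion.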
+
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+ const TargetData *TD) {
+ X86MachineFunctionInfo Info;
+ uint64_t Size = 0;
+
+ switch (F->getCallingConv()) {
+ case CallingConv::X86_StdCall:
+ Info.setDecorationStyle(StdCall);
+ break;
+ case CallingConv::X86_FastCall:
+ Info.setDecorationStyle(FastCall);
+ break;
+ default:
+ return Info;
+ }
+
+ unsigned argNum = 1;
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI, ++argNum) {
+ const Type* Ty = AI->getType();
+
+ // 'Dereference' type in case of byval parameter attribute
+ if (F->paramHasAttr(argNum, Attribute::ByVal))
+ Ty = cast<PointerType>(Ty)->getElementType();
+
+ // Size should be aligned to DWORD boundary
+ Size += ((TD->getTypeAllocSize(Ty) + 3)/4)*4;
+ }
+
+ // We're not supporting tooooo huge arguments :)
+ Info.setBytesToPopOnReturn((unsigned int)Size);
+ return Info;
+}
+
+/// PrintUnmangledNameSafely - Print out the printable characters in the name.
+/// Don't print things like \\n or \\0.
+static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
+ for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
+ Name != E; ++Name)
+ if (isprint(*Name))
+ OS << *Name;
+}
+
+/// decorateName - Query FunctionInfoMap and use this information for various
+/// name decoration.
+void X86ATTAsmPrinter::decorateName(std::string &Name,
+ const GlobalValue *GV) {
+ const Function *F = dyn_cast<Function>(GV);
+ if (!F) return;
+
+ // We don't want to decorate non-stdcall or non-fastcall functions right now
+ unsigned CC = F->getCallingConv();
+ if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+ return;
+
+ // Decorate names only when we're targeting Cygwin/Mingw32 targets
+ if (!Subtarget->isTargetCygMing())
+ return;
+
+ FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+
+ const X86MachineFunctionInfo *Info;
+ if (info_item == FunctionInfoMap.end()) {
+ // Calculate appropriate function info and populate map
+ FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData());
+ Info = &FunctionInfoMap[F];
+ } else {
+ Info = &info_item->second;
+ }
+
+ const FunctionType *FT = F->getFunctionType();
+ switch (Info->getDecorationStyle()) {
+ case None:
+ break;
+ case StdCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+ break;
+ case FastCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+
+ if (Name[0] == '_') {
+ Name[0] = '@';
+ } else {
+ Name = '@' + Name;
+ }
+ break;
+ default:
+ assert(0 && "Unsupported DecorationStyle");
+ }
+}
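+
+// Decoration sketch (CygMing only): a stdcall function "_f" popping 8
+// argument bytes becomes "_f@8"; the same function under fastcall becomes
+// "@f@8", since fastcall also rewrites the leading '_' to '@'.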
+
+void X86ATTAsmPrinter::emitFunctionHeader(const MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+
+ decorateName(CurrentFnName, F);
+
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ unsigned FnAlign = 4;
+ if (F->hasFnAttr(Attribute::OptimizeForSize))
+ FnAlign = 1;
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ EmitAlignment(FnAlign, F);
+ break;
+ case Function::DLLExportLinkage:
+ case Function::ExternalLinkage:
+ EmitAlignment(FnAlign, F);
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ break;
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ EmitAlignment(FnAlign, F);
+ if (Subtarget->isTargetDarwin()) {
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ O << TAI->getWeakDefDirective() << CurrentFnName << '\n';
+ } else if (Subtarget->isTargetCygMing()) {
+ O << "\t.globl\t" << CurrentFnName << "\n"
+ "\t.linkonce discard\n";
+ } else {
+ O << "\t.weak\t" << CurrentFnName << '\n';
+ }
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type\t" << CurrentFnName << ",@function\n";
+ else if (Subtarget->isTargetCygMing()) {
+ O << "\t.def\t " << CurrentFnName
+ << ";\t.scl\t" <<
+ (F->hasInternalLinkage() ? COFF::C_STAT : COFF::C_EXT)
+ << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+ << ";\t.endef\n";
+ }
+
+ O << CurrentFnName << ":\n";
+ // Add a workaround for linkonce linkage on Cygwin/MinGW
+ if (Subtarget->isTargetCygMing() &&
+ (F->hasLinkOnceLinkage() || F->hasWeakLinkage()))
+ O << "Lllvm$workaround$fake$stub$" << CurrentFnName << ":\n";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86ATTAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+ this->MF = &MF;
+ unsigned CC = F->getCallingConv();
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Populate the function information map. Actually, we don't want to
+ // populate non-stdcall or non-fastcall functions' information right now.
+ if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+ FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ if (F->hasDLLExportLinkage())
+ DLLExportedFns.insert(Mang->makeNameProper(F->getName(), ""));
+
+ // Print the 'header' of the function
+ emitFunctionHeader(MF);
+
+ // Emit pre-function debug and/or EH information.
+ if (TAI->doesSupportDebugInformation() || TAI->doesSupportExceptionHandling())
+ DW->BeginFunction(&MF);
+
+ // Print out code for the function.
+ bool hasAnyRealCode = false;
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (!VerboseAsm && (I->pred_empty() || I->isOnlyReachableByFallthrough())) {
+ // This is an entry block or a block that's only reachable via a
+ // fallthrough edge. In non-VerboseAsm mode, don't print the label.
+ } else {
+ printBasicBlockLabel(I, true, true, VerboseAsm);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
+ II != IE; ++II) {
+ // Print the assembly for the instruction.
+ if (!II->isLabel())
+ hasAnyRealCode = true;
+ printMachineInstruction(II);
+ }
+ }
+
+ if (Subtarget->isTargetDarwin() && !hasAnyRealCode) {
+ // If the function is empty, then we need to emit *something*. Otherwise,
+ // the function's label might be associated with something that it wasn't
+ // meant to be associated with. We emit a noop in this situation.
+ // We are assuming inline asms are code.
+ O << "\tnop\n";
+ }
+
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
+
+ // Emit post-function debug information.
+ if (TAI->doesSupportDebugInformation())
+ DW->EndFunction(&MF);
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ O.flush();
+
+ // We didn't modify anything.
+ return false;
+}
+
+static inline bool shouldPrintGOT(TargetMachine &TM, const X86Subtarget* ST) {
+ return ST->isPICStyleGOT() && TM.getRelocationModel() == Reloc::PIC_;
+}
+
+static inline bool shouldPrintPLT(TargetMachine &TM, const X86Subtarget* ST) {
+ return ST->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_ &&
+ (ST->isPICStyleRIPRel() || ST->isPICStyleGOT());
+}
+
+static inline bool shouldPrintStub(TargetMachine &TM, const X86Subtarget* ST) {
+ return ST->isPICStyleStub() && TM.getRelocationModel() != Reloc::Static;
+}
+
+void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier, bool NotRIPRel) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: {
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Virtual registers should not make it this far!");
+ O << '%';
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ MVT VT = (strcmp(Modifier+6,"64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ O << TRI->getAsmName(Reg);
+ return;
+ }
+
+ case MachineOperand::MO_Immediate:
+ if (!Modifier || (strcmp(Modifier, "debug") &&
+ strcmp(Modifier, "mem") &&
+ strcmp(Modifier, "call")))
+ O << '$';
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB(), false, false, VerboseAsm);
+ return;
+ case MachineOperand::MO_JumpTableIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << '$';
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget->isPICStyleStub())
+ O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << "$pb\"";
+ else if (Subtarget->isPICStyleGOT())
+ O << "@GOTOFF";
+ }
+
+ if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+ O << "(%rip)";
+ return;
+ }
+ case MachineOperand::MO_ConstantPoolIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << '$';
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget->isPICStyleStub())
+ O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << "$pb\"";
+ else if (Subtarget->isPICStyleGOT())
+ O << "@GOTOFF";
+ }
+
+ printOffset(MO.getOffset());
+
+ if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+ O << "(%rip)";
+ return;
+ }
+ case MachineOperand::MO_GlobalAddress: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ bool needCloseParen = false;
+
+ const GlobalValue *GV = MO.getGlobal();
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar) {
+ // If GV is an alias then use the aliasee for determining
+ // thread-localness.
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+ }
+
+ bool isThreadLocal = GVar && GVar->isThreadLocal();
+
+ std::string Name = Mang->getValueName(GV);
+ decorateName(Name, GV);
+
+ if (!isMemOp && !isCallOp)
+ O << '$';
+ else if (Name[0] == '$') {
+ // The name begins with a dollar-sign. In order to avoid having it look
+ // like an integer immediate to the assembler, enclose it in parens.
+ O << '(';
+ needCloseParen = true;
+ }
+
+ if (shouldPrintStub(TM, Subtarget)) {
+ // Declarations and weakly-linked (including link-once) globals need
+ // non-lazily-resolved stubs.
+ if (GV->isDeclaration() || GV->isWeakForLinker()) {
+ // Dynamically-resolved functions need a stub for the function.
+ if (isCallOp && isa<Function>(GV)) {
+ // Function stubs are no longer needed for Mac OS X 10.5 and up.
+ if (Subtarget->isTargetDarwin() && Subtarget->getDarwinVers() >= 9) {
+ O << Name;
+ } else {
+ FnStubs.insert(Name);
+ printSuffixedName(Name, "$stub");
+ }
+ } else if (GV->hasHiddenVisibility()) {
+ if (!GV->isDeclaration() && !GV->hasCommonLinkage())
+ // The definition is available in the current translation unit.
+ O << Name;
+ else {
+ HiddenGVStubs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ }
+ } else {
+ GVStubs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ }
+ } else {
+ if (GV->hasDLLImportLinkage())
+ O << "__imp_";
+ O << Name;
+ }
+
+ if (!isCallOp && TM.getRelocationModel() == Reloc::PIC_)
+ O << '-' << getPICLabelString(getFunctionNumber(), TAI, Subtarget);
+ } else {
+ if (GV->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name;
+
+ if (isCallOp) {
+ if (shouldPrintPLT(TM, Subtarget)) {
+ // Assemble call via PLT for externally visible symbols
+ if (!GV->hasHiddenVisibility() && !GV->hasProtectedVisibility() &&
+ !GV->hasLocalLinkage())
+ O << "@PLT";
+ }
+ if (Subtarget->isTargetCygMing() && GV->isDeclaration())
+ // Save function name for later type emission
+ FnStubs.insert(Name);
+ }
+ }
+
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ printOffset(MO.getOffset());
+
+ if (isThreadLocal) {
+ TLSModel::Model model = getTLSModel(GVar, TM.getRelocationModel());
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ O << "@TLSGD";
+ break;
+ case TLSModel::LocalDynamic:
+ // O << "@TLSLD"; // local dynamic not implemented
+ O << "@TLSGD";
+ break;
+ case TLSModel::InitialExec:
+ if (Subtarget->is64Bit()) {
+ assert (!NotRIPRel);
+ O << "@GOTTPOFF(%rip)";
+ } else {
+ O << "@INDNTPOFF";
+ }
+ break;
+ case TLSModel::LocalExec:
+ if (Subtarget->is64Bit())
+ O << "@TPOFF";
+ else
+ O << "@NTPOFF";
+ break;
+ default:
+ assert (0 && "Unknown TLS model");
+ }
+ } else if (isMemOp) {
+ if (shouldPrintGOT(TM, Subtarget)) {
+ if (Subtarget->GVRequiresExtraLoad(GV, TM, false))
+ O << "@GOT";
+ else
+ O << "@GOTOFF";
+ } else if (Subtarget->isPICStyleRIPRel() && !NotRIPRel) {
+ if (TM.getRelocationModel() != Reloc::Static) {
+ if (Subtarget->GVRequiresExtraLoad(GV, TM, false))
+ O << "@GOTPCREL";
+
+ if (needCloseParen) {
+ needCloseParen = false;
+ O << ')';
+ }
+ }
+
+ // Use rip when possible to reduce code size, except when an
+ // index or base register is also part of the address, e.g.
+ // foo(%rip)(%rcx,%rax,4) is not legal.
+ O << "(%rip)";
+ }
+ }
+
+ if (needCloseParen)
+ O << ')';
+
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ bool needCloseParen = false;
+ std::string Name(TAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+ // Print the function stub suffix unless targeting Mac OS X 10.5 or later.
+ if (isCallOp && shouldPrintStub(TM, Subtarget) &&
+ !(Subtarget->isTargetDarwin() && Subtarget->getDarwinVers() >= 9)) {
+ FnStubs.insert(Name);
+ printSuffixedName(Name, "$stub");
+ return;
+ }
+ if (!isMemOp && !isCallOp)
+ O << '$';
+ else if (Name[0] == '$') {
+ // The name begins with a dollar-sign. In order to avoid having it look
+ // like an integer immediate to the assembler, enclose it in parens.
+ O << '(';
+ needCloseParen = true;
+ }
+
+ O << Name;
+
+ if (shouldPrintPLT(TM, Subtarget)) {
+ std::string GOTName(TAI->getGlobalPrefix());
+ GOTName+="_GLOBAL_OFFSET_TABLE_";
+ if (Name == GOTName)
+ // HACK! When printing the GOT offset, emit an extra offset to the PC
+ // to compensate for the size of the popl instruction. The resulting
+ // code should look like:
+ // call .piclabel
+ // piclabel:
+ // popl %some_register
+ // addl $_GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+ O << " + [.-"
+ << getPICLabelString(getFunctionNumber(), TAI, Subtarget) << ']';
+
+ if (isCallOp)
+ O << "@PLT";
+ }
+
+ if (needCloseParen)
+ O << ')';
+
+ if (!isCallOp && Subtarget->isPICStyleRIPRel())
+ O << "(%rip)";
+
+ return;
+ }
+ default:
+ O << "<unknown operand type>"; return;
+ }
+}
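+
+// A few illustrative renderings of a global "foo" from the cases above
+// (prefixes and label spellings depend on the TargetAsmInfo): a Darwin
+// stub-style PIC data reference prints as "_foo$non_lazy_ptr-L1$pb", a
+// GOT-style ELF one as "foo@GOT" or "foo@GOTOFF", and an x86-64
+// rip-relative memory operand as "foo(%rip)".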
+
+void X86ATTAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+ unsigned char value = MI->getOperand(Op).getImm();
+ assert(value <= 7 && "Invalid ssecc argument!");
+ switch (value) {
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
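+
+// For example, a CMPSS carrying the immediate 1 gets the condition string
+// "lt", yielding "cmpltss"; this routine prints only the condition-code
+// fragment, the rest of the mnemonic comes from the tablegen'erated
+// printer.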
+
+void X86ATTAsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier,
+ bool NotRIPRel) {
+ MachineOperand BaseReg = MI->getOperand(Op);
+ MachineOperand IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ NotRIPRel |= IndexReg.getReg() || BaseReg.getReg();
+ if (DispSpec.isGlobal() ||
+ DispSpec.isCPI() ||
+ DispSpec.isJTI() ||
+ DispSpec.isSymbol()) {
+ printOperand(MI, Op+3, "mem", NotRIPRel);
+ } else {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << DispVal;
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ unsigned BaseRegOperand = 0, IndexRegOperand = 2;
+
+ // There are cases where we can end up with ESP/RSP in the indexreg slot.
+ // If this happens, swap the base/index registers to support assemblers
+ // that don't work when the index is *SP.
+ if (IndexReg.getReg() == X86::ESP || IndexReg.getReg() == X86::RSP) {
+ assert(ScaleVal == 1 && "Scale not supported for stack pointer!");
+ std::swap(BaseReg, IndexReg);
+ std::swap(BaseRegOperand, IndexRegOperand);
+ }
+
+ O << '(';
+ if (BaseReg.getReg())
+ printOperand(MI, Op+BaseRegOperand, Modifier);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op+IndexRegOperand, Modifier);
+ if (ScaleVal != 1)
+ O << ',' << ScaleVal;
+ }
+ O << ')';
+ }
+}
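+
+// As a concrete example, a displacement of 8 with base %rax, index %rcx
+// and scale 4 comes out in AT&T syntax as "8(%rax,%rcx,4)", while a bare
+// displacement with no registers prints as just the integer.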
+
+void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier, bool NotRIPRel){
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+ MachineOperand Segment = MI->getOperand(Op+4);
+ if (Segment.getReg()) {
+ printOperand(MI, Op+4, Modifier);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, Modifier, NotRIPRel);
+}
+
+void X86ATTAsmPrinter::printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const {
+ if (!TAI->getSetDirective())
+ return;
+
+ // We don't need .set machinery if we have GOT-style relocations
+ if (Subtarget->isPICStyleGOT())
+ return;
+
+ O << TAI->getSetDirective() << ' ' << TAI->getPrivateGlobalPrefix()
+ << getFunctionNumber() << '_' << uid << "_set_" << MBB->getNumber() << ',';
+ printBasicBlockLabel(MBB, false, false, false);
+ if (Subtarget->isPICStyleRIPRel())
+ O << '-' << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << uid << '\n';
+ else
+ O << '-' << getPICLabelString(getFunctionNumber(), TAI, Subtarget) << '\n';
+}
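+
+// This emits entries of the form (label spellings are illustrative):
+//   .set L1_0_set_2, LBB1_2-"L1$pb"
+// i.e. the difference between the block label and the PIC base (or the
+// jump table label on rip-relative targets), which the jump table entries
+// printed by printPICJumpTableEntry then reference.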
+
+void X86ATTAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+ std::string label = getPICLabelString(getFunctionNumber(), TAI, Subtarget);
+ O << label << '\n' << label << ':';
+}
+
+
+void X86ATTAsmPrinter::printPICJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid) const
+{
+ const char *JTEntryDirective = MJTI->getEntrySize() == 4 ?
+ TAI->getData32bitsDirective() : TAI->getData64bitsDirective();
+
+ O << JTEntryDirective << ' ';
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget->isPICStyleRIPRel() || Subtarget->isPICStyleStub()) {
+ O << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << '_' << uid << "_set_" << MBB->getNumber();
+ } else if (Subtarget->isPICStyleGOT()) {
+ printBasicBlockLabel(MBB, false, false, false);
+ O << "@GOTOFF";
+ } else
+ assert(0 && "Don't know how to print MBB label for this PIC mode");
+ } else
+ printBasicBlockLabel(MBB, false, false, false);
+}
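+
+// In static relocation mode this boils down to a plain ".long LBB1_2"-style
+// entry; stub- and rip-relative PIC reference the "_set_" difference emitted
+// by printPICJumpTableSetLabel above, and GOT-style PIC prints the block
+// label with an "@GOTOFF" suffix (directive and label spellings vary with
+// the TargetAsmInfo).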
+
+bool X86ATTAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+ const char Mode) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ case 'q': // Print DImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i64);
+ break;
+ }
+
+ O << '%'<< TRI->getAsmName(Reg);
+ return false;
+}
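+
+// These are the GCC-style inline asm register modifiers; e.g. with the
+// operand bound to %eax, "%w0" in the asm string prints "%ax" and "%b0"
+// prints "%al". A hypothetical use from C:
+//   unsigned val; asm("incw %w0" : "+r"(val));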
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86ATTAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'c': // Don't print "$" before a global var name or constant.
+ printOperand(MI, OpNo, "mem", /*NotRIPRel=*/true);
+ return false;
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ if (MI->getOperand(OpNo).isReg())
+ return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+ printOperand(MI, OpNo);
+ return false;
+
+ case 'P': // Don't print @PLT, but do print as memory.
+ printOperand(MI, OpNo, "mem", /*NotRIPRel=*/true);
+ return false;
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool X86ATTAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'P': // Don't print @PLT, but do print as memory.
+ printMemReference(MI, OpNo, "mem", /*NotRIPRel=*/true);
+ return false;
+ }
+ }
+ printMemReference(MI, OpNo);
+ return false;
+}
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction MI in
+/// AT&T syntax to the current output stream.
+///
+void X86ATTAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+}
+
+/// doInitialization
+bool X86ATTAsmPrinter::doInitialization(Module &M) {
+
+ bool Result = AsmPrinter::doInitialization(M);
+
+ if (TAI->doesSupportDebugInformation()) {
+ // Let PassManager know we need debug information and relay
+ // the MachineModuleInfo address on to DwarfWriter.
+ // AsmPrinter::doInitialization did this analysis.
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ DW = getAnalysisIfAvailable<DwarfWriter>();
+ DW->BeginModule(&M, MMI, O, this, TAI);
+ }
+
+ // Darwin wants symbols to be quoted if they have complex names.
+ if (Subtarget->isTargetDarwin())
+ Mang->setUseQuotes(true);
+
+ return Result;
+}
+
+
+void X86ATTAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+ return; // External globals require no code.
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar)) {
+ if (Subtarget->isTargetDarwin() &&
+ TM.getRelocationModel() == Reloc::Static) {
+ if (GVar->getName() == "llvm.global_ctors")
+ O << ".reference .constructors_used\n";
+ else if (GVar->getName() == "llvm.global_dtors")
+ O << ".reference .destructors_used\n";
+ }
+ return;
+ }
+
+ std::string name = Mang->getValueName(GVar);
+ Constant *C = GVar->getInitializer();
+ const Type *Type = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Type);
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+
+ printVisibility(name, GVar->getVisibility());
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type\t" << name << ",@object\n";
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ if (C->isNullValue() && !GVar->hasSection() &&
+ !(Subtarget->isTargetDarwin() &&
+ TAI->SectionKindForGlobal(GVar) == SectionKind::RODataMergeStr)) {
+ // FIXME: This seems to be pretty darwin-specific
+ if (GVar->hasExternalLinkage()) {
+ if (const char *Directive = TAI->getZeroFillDirective()) {
+ O << "\t.globl " << name << '\n';
+ O << Directive << "__DATA, __common, " << name << ", "
+ << Size << ", " << Align << '\n';
+ return;
+ }
+ }
+
+ if (!GVar->isThreadLocal() &&
+ (GVar->hasLocalLinkage() || GVar->isWeakForLinker())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (TAI->getLCOMMDirective() != NULL) {
+ if (GVar->hasLocalLinkage()) {
+ O << TAI->getLCOMMDirective() << name << ',' << Size;
+ if (Subtarget->isTargetDarwin())
+ O << ',' << Align;
+ } else if (Subtarget->isTargetDarwin() && !GVar->hasCommonLinkage()) {
+ O << "\t.globl " << name << '\n'
+ << TAI->getWeakDefDirective() << name << '\n';
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm) {
+ O << "\t\t\t\t" << TAI->getCommentString() << ' ';
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << '\n';
+ EmitGlobalConstant(C);
+ return;
+ } else {
+ O << TAI->getCOMMDirective() << name << ',' << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ } else {
+ if (!Subtarget->isTargetCygMing()) {
+ if (GVar->hasLocalLinkage())
+ O << "\t.local\t" << name << '\n';
+ }
+ O << TAI->getCOMMDirective() << name << ',' << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ if (VerboseAsm) {
+ O << "\t\t" << TAI->getCommentString() << ' ';
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << '\n';
+ return;
+ }
+ }
+
+ switch (GVar->getLinkage()) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ if (Subtarget->isTargetDarwin()) {
+ O << "\t.globl " << name << '\n'
+ << TAI->getWeakDefDirective() << name << '\n';
+ } else if (Subtarget->isTargetCygMing()) {
+ O << "\t.globl\t" << name << "\n"
+ "\t.linkonce same_size\n";
+ } else {
+ O << "\t.weak\t" << name << '\n';
+ }
+ break;
+ case GlobalValue::DLLExportLinkage:
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending-linkage variables should go into a section named
+ // after them or something similar. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << "\t.globl " << name << '\n';
+ // FALL THROUGH
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ break;
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm){
+ O << "\t\t\t\t" << TAI->getCommentString() << ' ';
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << '\n';
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << name << ", " << Size << '\n';
+
+ EmitGlobalConstant(C);
+}
+
+/// printGVStub - Print stub for a global value.
+///
+void X86ATTAsmPrinter::printGVStub(const char *GV, const char *Prefix) {
+ printSuffixedName(GV, "$non_lazy_ptr", Prefix);
+ O << ":\n\t.indirect_symbol ";
+ if (Prefix) O << Prefix;
+ O << GV << "\n\t.long\t0\n";
+}
+
+/// printHiddenGVStub - Print stub for a hidden global value.
+///
+void X86ATTAsmPrinter::printHiddenGVStub(const char *GV, const char *Prefix) {
+ EmitAlignment(2);
+ printSuffixedName(GV, "$non_lazy_ptr", Prefix);
+ if (Prefix) O << Prefix;
+ O << ":\n" << TAI->getData32bitsDirective() << GV << '\n';
+}
+
+
+bool X86ATTAsmPrinter::doFinalization(Module &M) {
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ printModuleLevelGV(I);
+
+ if (I->hasDLLExportLinkage())
+ DLLExportedGVs.insert(Mang->makeNameProper(I->getName(),""));
+
+ // If the global is an extern weak symbol, remember to emit the weak
+ // reference!
+ // FIXME: This is rather hacky, since we'll emit references to ALL weak
+ // symbols, even unused ones. But currently it's the only way to deal with
+ // extern weak initializers hidden deep inside constant expressions.
+ if (I->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(I);
+ }
+
+ for (Module::const_iterator I = M.begin(), E = M.end();
+ I != E; ++I) {
+ // If the global is an extern weak symbol, remember to emit the weak
+ // reference!
+ // FIXME: This is rather hacky, since we'll emit references to ALL weak
+ // symbols, even unused ones. But currently it's the only way to deal with
+ // extern weak initializers hidden deep inside constant expressions.
+ if (I->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(I);
+ }
+
+ // Output linker support code for dllexported globals
+ if (!DLLExportedGVs.empty())
+ SwitchToDataSection(".section .drectve");
+
+ for (StringSet<>::iterator i = DLLExportedGVs.begin(),
+ e = DLLExportedGVs.end();
+ i != e; ++i)
+ O << "\t.ascii \" -export:" << i->getKeyData() << ",data\"\n";
+
+ if (!DLLExportedFns.empty()) {
+ SwitchToDataSection(".section .drectve");
+ }
+
+ for (StringSet<>::iterator i = DLLExportedFns.begin(),
+ e = DLLExportedFns.end();
+ i != e; ++i)
+ O << "\t.ascii \" -export:" << i->getKeyData() << "\"\n";
+
+ if (Subtarget->isTargetDarwin()) {
+ SwitchToDataSection("");
+
+ // Output stubs for dynamically-linked functions
+ for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i) {
+ SwitchToDataSection("\t.section __IMPORT,__jump_table,symbol_stubs,"
+ "self_modifying_code+pure_instructions,5", 0);
+ const char *p = i->getKeyData();
+ printSuffixedName(p, "$stub");
+ O << ":\n"
+ "\t.indirect_symbol " << p << "\n"
+ "\thlt ; hlt ; hlt ; hlt ; hlt\n";
+ }
+
+ O << '\n';
+
+ // Print global value stubs.
+ bool InStubSection = false;
+ if (TAI->doesSupportExceptionHandling() && MMI && !Subtarget->is64Bit()) {
+ // Add the (possibly multiple) personalities to the set of global values.
+ // Only referenced functions get into the Personalities list.
+ const std::vector<Function *>& Personalities = MMI->getPersonalities();
+ for (std::vector<Function *>::const_iterator I = Personalities.begin(),
+ E = Personalities.end(); I != E; ++I) {
+ if (!*I)
+ continue;
+ if (!InStubSection) {
+ SwitchToDataSection(
+ "\t.section __IMPORT,__pointers,non_lazy_symbol_pointers");
+ InStubSection = true;
+ }
+ printGVStub((*I)->getNameStart(), "_");
+ }
+ }
+
+ // Output stubs for external and common global variables.
+ if (!InStubSection && !GVStubs.empty())
+ SwitchToDataSection(
+ "\t.section __IMPORT,__pointers,non_lazy_symbol_pointers");
+ for (StringSet<>::iterator i = GVStubs.begin(), e = GVStubs.end();
+ i != e; ++i)
+ printGVStub(i->getKeyData());
+
+ if (!HiddenGVStubs.empty()) {
+ SwitchToSection(TAI->getDataSection());
+ for (StringSet<>::iterator i = HiddenGVStubs.begin(), e = HiddenGVStubs.end();
+ i != e; ++i)
+ printHiddenGVStub(i->getKeyData());
+ }
+
+ // Emit final debug information.
+ DwarfWriter *DW = getAnalysisIfAvailable<DwarfWriter>();
+ DW->EndModule();
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ O << "\t.subsections_via_symbols\n";
+ } else if (Subtarget->isTargetCygMing()) {
+ // Emit type information for external functions
+ for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i) {
+ O << "\t.def\t " << i->getKeyData()
+ << ";\t.scl\t" << COFF::C_EXT
+ << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+ << ";\t.endef\n";
+ }
+
+ // Emit final debug information.
+ DwarfWriter *DW = getAnalysisIfAvailable<DwarfWriter>();
+ DW->EndModule();
+ } else if (Subtarget->isTargetELF()) {
+ // Emit final debug information.
+ DwarfWriter *DW = getAnalysisIfAvailable<DwarfWriter>();
+ DW->EndModule();
+ }
+
+ return AsmPrinter::doFinalization(M);
+}
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter.inc"
diff --git a/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.h b/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.h
new file mode 100644
index 0000000..5b40e73
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.h
@@ -0,0 +1,164 @@
+//===-- X86ATTAsmPrinter.h - Convert X86 LLVM code to AT&T assembly -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AT&T assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ATTASMPRINTER_H
+#define X86ATTASMPRINTER_H
+
+#include "../X86.h"
+#include "../X86MachineFunctionInfo.h"
+#include "../X86TargetMachine.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MachineJumpTableInfo;
+
+class VISIBILITY_HIDDEN X86ATTAsmPrinter : public AsmPrinter {
+ DwarfWriter *DW;
+ MachineModuleInfo *MMI;
+ const X86Subtarget *Subtarget;
+ public:
+ explicit X86ATTAsmPrinter(raw_ostream &O, X86TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V), DW(0), MMI(0) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 AT&T-Style Assembly Printer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ if (Subtarget->isTargetDarwin() ||
+ Subtarget->isTargetELF() ||
+ Subtarget->isTargetCygMing()) {
+ AU.addRequired<MachineModuleInfo>();
+ }
+ AU.addRequired<DwarfWriter>();
+ AsmPrinter::getAnalysisUsage(AU);
+ }
+
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ // These methods are used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier = 0, bool NotRIPRel = false);
+ void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf80mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printlea32mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo, "subreg64");
+ }
+
+ bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+
+ void printMachineInstruction(const MachineInstr *MI);
+ void printSSECC(const MachineInstr *MI, unsigned Op);
+ void printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL, bool NotRIPRel = false);
+ void printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL, bool NotRIPRel = false);
+ void printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const;
+ void printPICJumpTableSetLabel(unsigned uid, unsigned uid2,
+ const MachineBasicBlock *MBB) const {
+ AsmPrinter::printPICJumpTableSetLabel(uid, uid2, MBB);
+ }
+ void printPICJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid) const;
+
+ void printPICLabel(const MachineInstr *MI, unsigned Op);
+ void printModuleLevelGV(const GlobalVariable* GVar);
+
+ void printGVStub(const char *GV, const char *Prefix = NULL);
+ void printHiddenGVStub(const char *GV, const char *Prefix = NULL);
+
+ bool runOnMachineFunction(MachineFunction &F);
+
+ void emitFunctionHeader(const MachineFunction &MF);
+
+ // Necessary for Darwin to print out the appropriate types of linker stubs.
+ StringSet<> FnStubs, GVStubs, HiddenGVStubs;
+
+ // Necessary for dllexport support
+ StringSet<> DLLExportedFns, DLLExportedGVs;
+
+ // We have to propagate some information about MachineFunction to the
+ // AsmPrinter. That is fine while we are printing the function itself, since
+ // we have access to the MachineFunction and can get the appropriate
+ // MachineFunctionInfo. Unfortunately, it is not possible when we are only
+ // printing a reference to a Function (e.g. when calling it): there is no
+ // way to get the corresponding MachineFunction, which may not even have
+ // been created yet. That's why we use this additional structure to collect
+ // all the necessary information.
+ //
+ // This structure is used e.g. for name decoration of stdcall and fastcall
+ // functions, since the decoration depends on the arguments' size.
+ typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+ FMFInfoMap FunctionInfoMap;
+
+ void decorateName(std::string& Name, const GlobalValue* GV);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
new file mode 100644
index 0000000..c874849
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
@@ -0,0 +1,50 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM IR to X86 assembly -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the shared superclass printer that converts from our
+// internal representation of machine-dependent LLVM code to Intel- and
+// AT&T-format assembly language.
+// This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTAsmPrinter.h"
+#include "X86IntelAsmPrinter.h"
+#include "X86Subtarget.h"
+using namespace llvm;
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86 assembly code
+/// for a MachineFunction to the given output stream, using the given target
+/// machine description.
+///
+FunctionPass *llvm::createX86CodePrinterPass(raw_ostream &o,
+ X86TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ const X86Subtarget *Subtarget = &tm.getSubtarget<X86Subtarget>();
+
+ if (Subtarget->isFlavorIntel()) {
+ return new X86IntelAsmPrinter(o, tm, tm.getTargetAsmInfo(),
+ OptLevel, verbose);
+ } else {
+ return new X86ATTAsmPrinter(o, tm, tm.getTargetAsmInfo(),
+ OptLevel, verbose);
+ }
+}
+
+namespace {
+ static struct Register {
+ Register() {
+ X86TargetMachine::registerAsmPrinter(createX86CodePrinterPass);
+ }
+ } Registrator;
+}
+
+extern "C" int X86AsmPrinterForceLink;
+int X86AsmPrinterForceLink = 0;
diff --git a/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.cpp
new file mode 100644
index 0000000..6599349
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.cpp
@@ -0,0 +1,609 @@
+//===-- X86IntelAsmPrinter.cpp - Convert X86 LLVM code to Intel assembly --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Intel format assembly language.
+// This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelAsmPrinter.h"
+#include "X86InstrInfo.h"
+#include "X86TargetAsmInfo.h"
+#include "X86.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+ const TargetData *TD) {
+ X86MachineFunctionInfo Info;
+ uint64_t Size = 0;
+
+ switch (F->getCallingConv()) {
+ case CallingConv::X86_StdCall:
+ Info.setDecorationStyle(StdCall);
+ break;
+ case CallingConv::X86_FastCall:
+ Info.setDecorationStyle(FastCall);
+ break;
+ default:
+ return Info;
+ }
+
+ unsigned argNum = 1;
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI, ++argNum) {
+ const Type* Ty = AI->getType();
+
+ // 'Dereference' the type in the case of the byval parameter attribute.
+ if (F->paramHasAttr(argNum, Attribute::ByVal))
+ Ty = cast<PointerType>(Ty)->getElementType();
+
+ // Size should be aligned to a DWORD boundary.
+ Size += ((TD->getTypeAllocSize(Ty) + 3)/4)*4;
+ }
+
+ // We don't support unreasonably huge argument lists :)
+ Info.setBytesToPopOnReturn((unsigned int)Size);
+ return Info;
+}
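+
+// Worked example (illustrative): a stdcall function taking two i32
+// arguments pops 8 bytes on return; an i8 or i16 argument also contributes
+// 4 bytes, since each argument is rounded up to a DWORD multiple. The
+// result feeds the name decoration below, e.g. "_foo@8".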
+
+
+/// decorateName - Query the FunctionInfoMap and use the result to decorate
+/// the given name as required by its calling convention.
+void X86IntelAsmPrinter::decorateName(std::string &Name,
+ const GlobalValue *GV) {
+ const Function *F = dyn_cast<Function>(GV);
+ if (!F) return;
+
+ // We don't want to decorate non-stdcall or non-fastcall functions right now
+ unsigned CC = F->getCallingConv();
+ if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+ return;
+
+ FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+
+ const X86MachineFunctionInfo *Info;
+ if (info_item == FunctionInfoMap.end()) {
+ // Calculate the appropriate function info and populate the map.
+ FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData());
+ Info = &FunctionInfoMap[F];
+ } else {
+ Info = &info_item->second;
+ }
+
+ const FunctionType *FT = F->getFunctionType();
+ switch (Info->getDecorationStyle()) {
+ case None:
+ break;
+ case StdCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+ break;
+ case FastCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+
+ if (Name[0] == '_')
+ Name[0] = '@';
+ else
+ Name = '@' + Name;
+
+ break;
+ default:
+ assert(0 && "Unsupported DecorationStyle");
+ }
+}
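+
+// Illustrative results of the decoration above: a stdcall "_foo" taking
+// 8 bytes of arguments becomes "_foo@8", while the fastcall variant also
+// swaps the leading '_' for '@', giving "@foo@8".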
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86IntelAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+ unsigned CC = F->getCallingConv();
+
+ // Populate the function information map. We only want to record
+ // stdcall and fastcall functions' information right now.
+ if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+ FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+ decorateName(CurrentFnName, F);
+
+ SwitchToTextSection("_text", F);
+
+ unsigned FnAlign = 4;
+ if (F->hasFnAttr(Attribute::OptimizeForSize))
+ FnAlign = 1;
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unsupported linkage type!");
+ case Function::PrivateLinkage:
+ case Function::InternalLinkage:
+ EmitAlignment(FnAlign);
+ break;
+ case Function::DLLExportLinkage:
+ DLLExportedFns.insert(CurrentFnName);
+ // FALL THROUGH
+ case Function::ExternalLinkage:
+ O << "\tpublic " << CurrentFnName << "\n";
+ EmitAlignment(FnAlign);
+ break;
+ }
+
+ O << CurrentFnName << "\tproc near\n";
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block if there are any predecessors.
+ if (!I->pred_empty()) {
+ printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ O << CurrentFnName << "\tendp\n";
+
+ O.flush();
+
+ // We didn't modify anything.
+ return false;
+}
+
+void X86IntelAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+ unsigned char value = MI->getOperand(Op).getImm();
+ assert(value <= 7 && "Invalid ssecc argument!");
+ switch (value) {
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
+void X86IntelAsmPrinter::printOp(const MachineOperand &MO,
+ const char *Modifier) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: {
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ MVT VT = (strcmp(Modifier,"subreg64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier, "subreg32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier,"subreg16") == 0) ? MVT::i16 :MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ O << TRI->getName(Reg);
+ } else
+ O << "reg" << MO.getReg();
+ return;
+ }
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+ case MachineOperand::MO_JumpTableIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << "OFFSET ";
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << "_" << MO.getIndex();
+ return;
+ }
+ case MachineOperand::MO_ConstantPoolIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << "OFFSET ";
+ O << "[" << TAI->getPrivateGlobalPrefix() << "CPI"
+ << getFunctionNumber() << "_" << MO.getIndex();
+ printOffset(MO.getOffset());
+ O << "]";
+ return;
+ }
+ case MachineOperand::MO_GlobalAddress: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ GlobalValue *GV = MO.getGlobal();
+ std::string Name = Mang->getValueName(GV);
+
+ decorateName(Name, GV);
+
+ if (!isMemOp && !isCallOp) O << "OFFSET ";
+ if (GV->hasDLLImportLinkage()) {
+ // FIXME: This should go away once the stdcall & fastcall CCs are
+ // fully supported.
+ O << "__imp_";
+ }
+ O << Name;
+ printOffset(MO.getOffset());
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ if (!isCallOp) O << "OFFSET ";
+ O << TAI->getGlobalPrefix() << MO.getSymbolName();
+ return;
+ }
+ default:
+ O << "<unknown operand type>"; return;
+ }
+}
+
+void X86IntelAsmPrinter::printLeaMemReference(const MachineInstr *MI,
+ unsigned Op,
+ const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(Op);
+ int ScaleVal = MI->getOperand(Op+1).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ O << "[";
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOp(BaseReg, Modifier);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << "*";
+ printOp(IndexReg, Modifier);
+ NeedPlus = true;
+ }
+
+ if (DispSpec.isGlobal() || DispSpec.isCPI() ||
+ DispSpec.isJTI()) {
+ if (NeedPlus)
+ O << " + ";
+ printOp(DispSpec, "mem");
+ } else {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || (!BaseReg.getReg() && !IndexReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ }
+ O << "]";
+}
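+
+// For comparison with the AT&T printer: base EAX, index ECX, scale 4 and
+// displacement 8 render in Intel syntax roughly as "[eax + 4*ecx + 8]"
+// (register-name casing comes from the register info tables).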
+
+void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier) {
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+ MachineOperand Segment = MI->getOperand(Op+4);
+ if (Segment.getReg()) {
+ printOperand(MI, Op+4, Modifier);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, Modifier);
+}
+
+void X86IntelAsmPrinter::printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const {
+ if (!TAI->getSetDirective())
+ return;
+
+ O << TAI->getSetDirective() << ' ' << TAI->getPrivateGlobalPrefix()
+ << getFunctionNumber() << '_' << uid << "_set_" << MBB->getNumber() << ',';
+ printBasicBlockLabel(MBB, false, false, false);
+ O << '-' << "\"L" << getFunctionNumber() << "$pb\"\n";
+}
+
+void X86IntelAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+ O << "\"L" << getFunctionNumber() << "$pb\"\n";
+ O << "\"L" << getFunctionNumber() << "$pb\":";
+}
+
+bool X86IntelAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+ const char Mode) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ }
+
+ O << '%' << TRI->getName(Reg);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86IntelAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool X86IntelAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+ printMemReference(MI, OpNo);
+ return false;
+}
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction
+/// MI in Intel syntax to the current output stream.
+///
+void X86IntelAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+}
+
+bool X86IntelAsmPrinter::doInitialization(Module &M) {
+ bool Result = AsmPrinter::doInitialization(M);
+
+ Mang->markCharUnacceptable('.');
+
+ O << "\t.686\n\t.model flat\n\n";
+
+ // Emit declarations for external functions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (I->isDeclaration()) {
+ std::string Name = Mang->getValueName(I);
+ decorateName(Name, I);
+
+ O << "\textern " ;
+ if (I->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name << ":near\n";
+ }
+
+ // Emit declarations for external globals. Note that VC++ always declares
+ // external globals to have type byte, and if that's good enough for VC++...
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (I->isDeclaration()) {
+ std::string Name = Mang->getValueName(I);
+
+ O << "\textern " ;
+ if (I->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name << ":byte\n";
+ }
+ }
+
+ return Result;
+}
+
+bool X86IntelAsmPrinter::doFinalization(Module &M) {
+ const TargetData *TD = TM.getTargetData();
+
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (I->isDeclaration()) continue; // External globals require no code.
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(I))
+ continue;
+
+ std::string name = Mang->getValueName(I);
+ Constant *C = I->getInitializer();
+ unsigned Align = TD->getPreferredAlignmentLog(I);
+ bool bCustomSegment = false;
+
+ switch (I->getLinkage()) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ SwitchToDataSection("");
+ O << name << "?\tsegment common 'COMMON'\n";
+ bCustomSegment = true;
+ // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+ // are also available.
+ break;
+ case GlobalValue::AppendingLinkage:
+ SwitchToDataSection("");
+ O << name << "?\tsegment public 'DATA'\n";
+ bCustomSegment = true;
+ // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+ // are also available.
+ break;
+ case GlobalValue::DLLExportLinkage:
+ DLLExportedGVs.insert(name);
+ // FALL THROUGH
+ case GlobalValue::ExternalLinkage:
+ O << "\tpublic " << name << "\n";
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage:
+ SwitchToSection(TAI->getDataSection());
+ break;
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ if (!bCustomSegment)
+ EmitAlignment(Align, I);
+
+ O << name << ":";
+ if (VerboseAsm)
+ O << "\t\t\t\t" << TAI->getCommentString()
+ << " " << I->getName();
+ O << '\n';
+
+ EmitGlobalConstant(C);
+
+ if (bCustomSegment)
+ O << name << "?\tends\n";
+ }
+
+ // Output linker support code for dllexported globals
+ if (!DLLExportedGVs.empty() || !DLLExportedFns.empty()) {
+ SwitchToDataSection("");
+ O << "; WARNING: The following code is valid only with MASM v8.x"
+ << "and (possible) higher\n"
+ << "; This version of MASM is usually shipped with Microsoft "
+ << "Visual Studio 2005\n"
+ << "; or (possible) further versions. Unfortunately, there is no "
+ << "way to support\n"
+ << "; dllexported symbols in the earlier versions of MASM in fully "
+ << "automatic way\n\n";
+ O << "_drectve\t segment info alias('.drectve')\n";
+ }
+
+ for (StringSet<>::iterator i = DLLExportedGVs.begin(),
+ e = DLLExportedGVs.end();
+ i != e; ++i)
+ O << "\t db ' /EXPORT:" << i->getKeyData() << ",data'\n";
+
+ for (StringSet<>::iterator i = DLLExportedFns.begin(),
+ e = DLLExportedFns.end();
+ i != e; ++i)
+ O << "\t db ' /EXPORT:" << i->getKeyData() << "'\n";
+
+ if (!DLLExportedGVs.empty() || !DLLExportedFns.empty())
+ O << "_drectve\t ends\n";
+
+ // Bypass X86SharedAsmPrinter::doFinalization().
+ bool Result = AsmPrinter::doFinalization(M);
+ SwitchToDataSection("");
+ O << "\tend\n";
+ return Result;
+}
+
+void X86IntelAsmPrinter::EmitString(const ConstantArray *CVA) const {
+ unsigned NumElts = CVA->getNumOperands();
+ if (NumElts) {
+ // ML does not have escape sequences except '' for '. It also has a maximum
+ // string length of 255.
+ unsigned len = 0;
+ bool inString = false;
+ for (unsigned i = 0; i < NumElts; i++) {
+ int n = cast<ConstantInt>(CVA->getOperand(i))->getZExtValue() & 255;
+ if (len == 0)
+ O << "\tdb ";
+
+ if (n >= 32 && n <= 127) {
+ if (!inString) {
+ if (len > 0) {
+ O << ",'";
+ len += 2;
+ } else {
+ O << "'";
+ len++;
+ }
+ inString = true;
+ }
+ if (n == '\'') {
+ O << "'";
+ len++;
+ }
+ O << char(n);
+ } else {
+ if (inString) {
+ O << "'";
+ len++;
+ inString = false;
+ }
+ if (len > 0) {
+ O << ",";
+ len++;
+ }
+ O << n;
+ len += 1 + (n > 9) + (n > 99);
+ }
+
+ if (len > 60) {
+ if (inString) {
+ O << "'";
+ inString = false;
+ }
+ O << "\n";
+ len = 0;
+ }
+ }
+
+ if (len > 0) {
+ if (inString)
+ O << "'";
+ O << "\n";
+ }
+ }
+}
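+
+// Example of the rules above (illustrative): the string "It's OK\n" is
+// emitted as
+//   db 'It''s OK',10
+// with the embedded quote doubled and the non-printable newline printed
+// as a decimal byte.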
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter1.inc"
diff --git a/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.h b/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.h
new file mode 100644
index 0000000..9520d98
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.h
@@ -0,0 +1,152 @@
+//===-- X86IntelAsmPrinter.h - Convert X86 LLVM code to Intel assembly ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Intel assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INTELASMPRINTER_H
+#define X86INTELASMPRINTER_H
+
+#include "../X86.h"
+#include "../X86MachineFunctionInfo.h"
+#include "../X86TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+struct VISIBILITY_HIDDEN X86IntelAsmPrinter : public AsmPrinter {
+ explicit X86IntelAsmPrinter(raw_ostream &O, X86TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V) {}
+
+ virtual const char *getPassName() const {
+ return "X86 Intel-Style Assembly Printer";
+ }
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ // This method is used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier = 0) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (MO.isReg()) {
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Not physreg??");
+ O << TM.getRegisterInfo()->get(MO.getReg()).Name; // Capitalized names
+ } else {
+ printOp(MO, Modifier);
+ }
+ }
+
+ void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "BYTE PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "WORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf80mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "XWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printlea32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printLeaMemReference(MI, OpNo, "subreg64");
+ }
+
+ bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ void printMachineInstruction(const MachineInstr *MI);
+ void printOp(const MachineOperand &MO, const char *Modifier = 0);
+ void printSSECC(const MachineInstr *MI, unsigned Op);
+ void printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL);
+ void printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL);
+ void printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const;
+ void printPICJumpTableSetLabel(unsigned uid, unsigned uid2,
+ const MachineBasicBlock *MBB) const {
+ AsmPrinter::printPICJumpTableSetLabel(uid, uid2, MBB);
+ }
+ void printPICLabel(const MachineInstr *MI, unsigned Op);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ // We have to propagate some information about MachineFunction to the
+ // AsmPrinter. That is fine while we are printing the function itself, since
+ // we have access to the MachineFunction and can get the appropriate
+ // MachineFunctionInfo. Unfortunately, it is not possible when we are only
+ // printing a reference to a Function (e.g. when calling it): there is no
+ // way to get the corresponding MachineFunction, which may not even have
+ // been created yet. That's why we use this additional structure to collect
+ // all the necessary information.
+ //
+ // This structure is used e.g. for name decoration of stdcall and fastcall
+ // functions, since the decoration depends on the arguments' size.
+ typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+ FMFInfoMap FunctionInfoMap;
+
+ void decorateName(std::string& Name, const GlobalValue* GV);
+
+ virtual void EmitString(const ConstantArray *CVA) const;
+
+ // Necessary for dllexport support
+ StringSet<> DLLExportedFns, DLLExportedGVs;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
new file mode 100644
index 0000000..d982990
--- /dev/null
+++ b/lib/Target/X86/CMakeLists.txt
@@ -0,0 +1,29 @@
+set(LLVM_TARGET_DEFINITIONS X86.td)
+
+tablegen(X86GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(X86GenRegisterNames.inc -gen-register-enums)
+tablegen(X86GenRegisterInfo.inc -gen-register-desc)
+tablegen(X86GenInstrNames.inc -gen-instr-enums)
+tablegen(X86GenInstrInfo.inc -gen-instr-desc)
+tablegen(X86GenAsmWriter.inc -gen-asm-writer)
+tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(X86GenDAGISel.inc -gen-dag-isel)
+tablegen(X86GenFastISel.inc -gen-fast-isel)
+tablegen(X86GenCallingConv.inc -gen-callingconv)
+tablegen(X86GenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(X86CodeGen
+ X86CodeEmitter.cpp
+ X86ELFWriterInfo.cpp
+ X86FloatingPoint.cpp
+ X86FloatingPointRegKill.cpp
+ X86ISelDAGToDAG.cpp
+ X86ISelLowering.cpp
+ X86InstrInfo.cpp
+ X86JITInfo.cpp
+ X86RegisterInfo.cpp
+ X86Subtarget.cpp
+ X86TargetAsmInfo.cpp
+ X86TargetMachine.cpp
+ X86FastISel.cpp
+ )
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
new file mode 100644
index 0000000..44f1c5d
--- /dev/null
+++ b/lib/Target/X86/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMX86CodeGen
+TARGET = X86
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \
+ X86GenRegisterInfo.inc X86GenInstrNames.inc \
+ X86GenInstrInfo.inc X86GenAsmWriter.inc \
+ X86GenAsmWriter1.inc X86GenDAGISel.inc \
+ X86GenFastISel.inc \
+ X86GenCallingConv.inc X86GenSubtarget.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt
new file mode 100644
index 0000000..be28e8b
--- /dev/null
+++ b/lib/Target/X86/README-FPStack.txt
@@ -0,0 +1,85 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: FP stack related stuff
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
+http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
+
+//===---------------------------------------------------------------------===//
+
+This should use fiadd on chips where it is profitable:
+double foo(double P, int *I) { return P+*I; }
+
+We have fiadd patterns now, but the following two have the same cost and
+complexity. We need a way to specify that the latter is more profitable.
+
+def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (extloadf64f32 addr:$src2)))]>;
+ // ST(0) = ST(0) + [mem32]
+
+def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (X86fild addr:$src2, i32)))]>;
+ // ST(0) = ST(0) + [mem32int]
+
+//===---------------------------------------------------------------------===//
+
+The FP stackifier needs to be global. Also, it should handle simple
+permutations to reduce the number of shuffle instructions, e.g. turning:
+
+fld P -> fld Q
+fld Q fld P
+fxch
+
+or:
+
+fxch -> fucomi
+fucomi jl X
+jg X
+
+Ideas:
+http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
+
+
+//===---------------------------------------------------------------------===//
+
+Add a target specific hook to DAG combiner to handle SINT_TO_FP and
+FP_TO_SINT when the source operand is already in memory.
+
+//===---------------------------------------------------------------------===//
+
+Open code rint, floor, ceil, trunc:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+
+Opencode the sincos[f] libcall.
+
+//===---------------------------------------------------------------------===//
+
+None of the FPStack instructions are handled in
+X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
+folding spill code into the instructions.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl (%esp), %eax
+ addl $20, %esp
+ ret
+
+This just requires being smarter when custom expanding fptoui.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt
new file mode 100644
index 0000000..a6c8616
--- /dev/null
+++ b/lib/Target/X86/README-MMX.txt
@@ -0,0 +1,71 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: MMX-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+#include <mmintrin.h>
+
+__v2si qux(int A) {
+ return (__v2si){ 0, A };
+}
+
+is compiled into:
+
+_qux:
+ subl $28, %esp
+ movl 32(%esp), %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl %eax, 20(%esp)
+ movq %mm0, 8(%esp)
+ movl 12(%esp), %eax
+ movl %eax, 16(%esp)
+ movq 16(%esp), %mm0
+ addl $28, %esp
+ ret
+
+Yuck!
+
+GCC gives us:
+
+_qux:
+ subl $12, %esp
+ movl 16(%esp), %eax
+ movl 20(%esp), %edx
+ movl $0, (%eax)
+ movl %edx, 4(%eax)
+ addl $12, %esp
+ ret $4
+
+//===---------------------------------------------------------------------===//
+
+We generate crappy code for this:
+
+__m64 t() {
+ return _mm_cvtsi32_si64(1);
+}
+
+_t:
+ subl $12, %esp
+ movl $1, %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl 4(%esp), %edx
+ addl $12, %esp
+ ret
+
+The extra stack traffic is covered in the previous entry. The other reason
+is that we are not smart about materializing constants in MMX registers.
+With -m64 we generate:
+
+ movl $1, %eax
+ movd %eax, %mm0
+ movd %mm0, %rax
+ ret
+
+We should be using a constantpool load instead:
+ movq LC0(%rip), %rax
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
new file mode 100644
index 0000000..71ad51c
--- /dev/null
+++ b/lib/Target/X86/README-SSE.txt
@@ -0,0 +1,918 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: SSE-specific stuff.
+//===---------------------------------------------------------------------===//
+
+- Consider eliminating the unaligned SSE load intrinsics, replacing them with
+ unaligned LLVM load instructions.
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline: Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
+//===---------------------------------------------------------------------===//
+
+When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
+
+//===---------------------------------------------------------------------===//
+
+Think about doing i64 math in SSE regs on x86-32.
+
+//===---------------------------------------------------------------------===//
+
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
+
+double %test3(bool %B) {
+ %C = select bool %B, double 123.412, double 523.01123123
+ ret double %C
+}
+
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
+
+The pattern isel got this one right.
+
+//===---------------------------------------------------------------------===//
+
+SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
+like this:
+
+ X += y
+
+and the register allocator decides to spill X, it is cheaper to emit this as:
+
+Y += [xslot]
+store Y -> [xslot]
+
+than as:
+
+tmp = [xslot]
+tmp += y
+store tmp -> [xslot]
+
+...and this uses one fewer register (so this should be done at load folding
+time, not at spiller time). *Note*, however, that this can only be done
+if Y is dead. Here's a testcase:
+
+@.str_3 = external global [15 x i8]
+declare void @printf(i32, ...)
+define void @main() {
+build_tree.exit:
+ br label %no_exit.i7
+
+no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
+ %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ],
+ [ %tmp.34.i18, %no_exit.i7 ]
+ %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
+ [ %tmp.28.i16, %no_exit.i7 ]
+ %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
+ %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
+ br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
+
+Compute_Tree.exit23: ; preds = %no_exit.i7
+ tail call void (i32, ...)* @printf( i32 0 )
+ store double %tmp.34.i18, double* null
+ ret void
+}
+
+We currently emit:
+
+.BBmain_1:
+ xorpd %XMM1, %XMM1
+ addsd %XMM0, %XMM1
+*** movsd %XMM2, QWORD PTR [%ESP + 8]
+*** addsd %XMM2, %XMM1
+*** movsd QWORD PTR [%ESP + 8], %XMM2
+ jmp .BBmain_1 # no_exit.i7
+
+This is a bugpoint-reduced testcase, which is why it doesn't make much
+sense (e.g. it's an infinite loop). :)
+
+//===---------------------------------------------------------------------===//
+
+SSE should implement 'select_cc' using 'emulated conditional moves' that use
+pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+
+double %X(double %Y, double %Z, double %A, double %B) {
+ %C = setlt double %A, %B
+ %z = add double %Z, 0.0 ;; select operand is not a load
+ %D = select bool %C, double %Y, double %z
+ ret double %D
+}
+
+We currently emit:
+
+_X:
+ subl $12, %esp
+ xorpd %xmm0, %xmm0
+ addsd 24(%esp), %xmm0
+ movsd 32(%esp), %xmm1
+ movsd 16(%esp), %xmm2
+ ucomisd 40(%esp), %xmm1
+ jb LBB_X_2
+LBB_X_1:
+ movsd %xmm0, %xmm2
+LBB_X_2:
+ movsd %xmm2, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+It's not clear whether we should use pxor or xorps / xorpd to clear XMM
+registers. The choice may depend on subtarget information. We should do some
+more experiments on different x86 machines.
+
+//===---------------------------------------------------------------------===//
+
+Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
+feasible.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+ if (copysign(1.0, x) == copysign(1.0, y))
+into:
+ if (x^y & mask)
+when using SSE.
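+
+A C sketch of the bit trick (assuming IEEE-754 doubles; the helper name is
+made up):
+
+#include <stdint.h>
+#include <string.h>
+int same_sign(double x, double y) {
+  uint64_t xb, yb;
+  memcpy(&xb, &x, sizeof xb);  /* reinterpret the bit patterns */
+  memcpy(&yb, &y, sizeof yb);
+  /* signs agree iff the xor of the bit patterns has a clear sign bit */
+  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
+}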
+
+//===---------------------------------------------------------------------===//
+
+Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
+of a v4sf value.
+
+//===---------------------------------------------------------------------===//
+
+Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
+Perhaps use pxor / xorp* to clear a XMM register first?
+
+//===---------------------------------------------------------------------===//
+
+How to decide when to use the "floating point version" of logical ops? Here are
+some code fragments:
+
+ movaps LCPI5_5, %xmm2
+ divps %xmm1, %xmm2
+ mulps %xmm2, %xmm3
+ mulps 8656(%ecx), %xmm3
+ addps 8672(%ecx), %xmm3
+ andps LCPI5_6, %xmm2
+ andps LCPI5_1, %xmm3
+ por %xmm2, %xmm3
+ movdqa %xmm3, (%edi)
+
+ movaps LCPI5_5, %xmm1
+ divps %xmm0, %xmm1
+ mulps %xmm1, %xmm3
+ mulps 8656(%ecx), %xmm3
+ addps 8672(%ecx), %xmm3
+ andps LCPI5_6, %xmm1
+ andps LCPI5_1, %xmm3
+ orps %xmm1, %xmm3
+ movaps %xmm3, 112(%esp)
+ movaps %xmm3, (%ebx)
+
+Due to some minor source change, the latter case ended up using orps and
+movaps instead of por and movdqa. Does it matter?
+
+//===---------------------------------------------------------------------===//
+
+X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
+to choose between movaps, movapd, and movdqa based on types of source and
+destination?
+
+How about andps, andpd, and pand? Do we really care about the type of the
+packed elements? If not, why not always use the "ps" variants, which are
+likely to be shorter?
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+ movaps (%edx), %xmm2 #59.21
+ movaps (%edx), %xmm5 #60.21
+ movaps (%edx), %xmm4 #61.21
+ movaps (%edx), %xmm3 #62.21
+ movl 40(%ecx), %ebp #69.49
+ shufps $0, %xmm2, %xmm5 #60.21
+ movl 100(%esp), %ebx #69.20
+ movl (%ebx), %edi #69.20
+ imull %ebp, %edi #69.49
+ addl (%eax), %edi #70.33
+ shufps $85, %xmm2, %xmm4 #61.21
+ shufps $170, %xmm2, %xmm3 #62.21
+ shufps $255, %xmm2, %xmm2 #63.21
+ lea (%ebp,%ebp,2), %ebx #69.49
+ negl %ebx #69.49
+ lea -3(%edi,%ebx), %ebx #70.33
+ shll $4, %ebx #68.37
+ addl 32(%ecx), %ebx #68.37
+ testb $15, %bl #91.13
+ jne L_B1.24 # Prob 5% #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %reg1078 = MOV32ri -3
+ %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
+ %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+ %reg1080 = IMUL32rr %reg1079, %reg1037
+ %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+ %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+ %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+ %reg1082 = SHL32ri %reg1038, 4
+ %reg1039 = ADD32rr %reg1036, %reg1082
+ %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+ %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+ %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+ %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+ %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+ %reg1040 = MOV32rr %reg1039
+ %reg1084 = AND32ri8 %reg1039, 15
+ CMP32ri8 %reg1084, 0
+ JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %EAX = MOV32ri -3
+ %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
+ ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
+ %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
+ %EDX = MOV32rm %EDX, 1, %NOREG, 40
+ IMUL32rr %EAX<def&use>, %EDX
+ %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
+ %ESI = MOV32rm %ESI, 1, %NOREG, 0
+ MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
+ %EAX = LEA32r %ESI, 1, %EAX, -3
+ %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
+ %ESI = MOV32rm %ESI, 1, %NOREG, 32
+ %EDI = MOV32rr %EAX
+ SHL32ri %EDI<def&use>, 4
+ ADD32rr %EDI<def&use>, %ESI
+ %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
+ %XMM1 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM1<def&use>, %XMM1, 170
+ %XMM2 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM2<def&use>, %XMM2, 0
+ %XMM3 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM3<def&use>, %XMM3, 255
+ SHUFPSrr %XMM0<def&use>, %XMM0, 85
+ %EBX = MOV32rr %EDI
+ AND32ri8 %EBX<def&use>, 15
+ CMP32ri8 %EBX, 0
+ JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is that shufps is a destructive opcode:
+since the same value appears as operand two in more than one shufps op, it
+results in a number of copies. Note that icc suffers from the same problem.
+Either the instruction selector should select pshufd, or the register
+allocator should perform the two-address to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
+
+LLVM is producing bad code.
+
+LBB_main_4: # cond_true44
+ addps %xmm1, %xmm2
+ subps %xmm3, %xmm2
+ movaps (%ecx), %xmm4
+ movaps %xmm2, %xmm1
+ addps %xmm4, %xmm1
+ addl $16, %ecx
+ incl %edx
+ cmpl $262144, %edx
+ movaps %xmm3, %xmm2
+ movaps %xmm4, %xmm3
+ jne LBB_main_4 # cond_true44
+
+There are two problems. 1) There is no need for two loop induction
+variables; we can compare against 262144 * 16. 2) A known register coalescer
+issue: we should be able to eliminate one of the movaps:
+
+ addps %xmm2, %xmm1 <=== Commute!
+ subps %xmm3, %xmm1
+ movaps (%ecx), %xmm4
+ movaps %xmm1, %xmm1 <=== Eliminate!
+ addps %xmm4, %xmm1
+ addl $16, %ecx
+ incl %edx
+ cmpl $262144, %edx
+ movaps %xmm3, %xmm2
+ movaps %xmm4, %xmm3
+ jne LBB_main_4 # cond_true44
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+__m128 test(float a) {
+ return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+}
+
+This compiles into:
+
+movss 4(%esp), %xmm1
+mulss %xmm1, %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Because mulss doesn't modify the top 3 elements, the top elements of
+xmm1 are already zero'd. We could compile this to:
+
+movss 4(%esp), %xmm0
+mulss %xmm0, %xmm0
+ret
+
+//===---------------------------------------------------------------------===//
+
+Here's a sick and twisted idea. Consider code like this:
+
+__m128 test(__m128 a) {
+ float b = *(float*)&a;
+ ...
+ return _mm_set_ps(0.0, 0.0, 0.0, b);
+}
+
+This might compile to this code:
+
+movaps c(%esp), %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Now consider if the ... code caused xmm1 to get spilled. This might produce
+this code:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+xorps %xmm0, %xmm0
+movaps c2(%esp), %xmm1
+movss %xmm1, %xmm0
+ret
+
+However, since the reload is only used by these instructions, we could
+"fold" it into the uses, producing something like this:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+movss c2(%esp), %xmm0
+ret
+
+... saving two instructions.
+
+The basic idea is that a reload from a spill slot can, if only one 4-byte
+chunk is used, bring in 3 zeros plus the one element instead of all 4
+elements.
+This can be used to simplify a variety of shuffle operations, where the
+elements are fixed zeros.
+
+//===---------------------------------------------------------------------===//
+
+__m128d test1( __m128d A, __m128d B) {
+ return _mm_shuffle_pd(A, B, 0x3);
+}
+
+compiles to
+
+shufpd $3, %xmm1, %xmm0
+
+Perhaps it's better to use unpckhpd instead?
+
+unpckhpd %xmm1, %xmm0
+
+Don't know if unpckhpd is faster. But it is shorter.
+
+//===---------------------------------------------------------------------===//
+
+This code generates ugly code, probably due to costs being off or something:
+
+define void @test(float* %P, <4 x float>* %P2 ) {
+ %xFloat0.688 = load float* %P
+ %tmp = load <4 x float>* %P2
+ %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
+ store <4 x float> %inFloat3.713, <4 x float>* %P2
+ ret void
+}
+
+Generates:
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ pxor %xmm1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $50, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, (%eax)
+ ret
+
+Would it be better to generate:
+
+_test:
+ movl 8(%esp), %ecx
+ movaps (%ecx), %xmm0
+ xor %eax, %eax
+ pinsrw $6, %eax, %xmm0
+ pinsrw $7, %eax, %xmm0
+ movaps %xmm0, (%ecx)
+ ret
+
+?
+
+//===---------------------------------------------------------------------===//
+
+Some useful information in the Apple Altivec / SSE Migration Guide:
+
+http://developer.apple.com/documentation/Performance/Conceptual/
+Accelerate_sse_migration/index.html
+
+e.g. SSE select using and, andnot, or. Various SSE compare translations.
+
+//===---------------------------------------------------------------------===//
+
+Add hooks to commute some CMPP operations.
+
+//===---------------------------------------------------------------------===//
+
+Apply the same transformation that merged four float loads into a single
+128-bit load to loads from the constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-path is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should materialize vector constants like "all ones" and "signbit" with
+code like:
+
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+
+and:
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+ psrlq xmm1, 31 ; xmm1 = all 100000000000...
+
+instead of using a load from the constant pool. The latter is important for
+ABS/NEG/copysign etc.
+
+//===---------------------------------------------------------------------===//
+
+These functions:
+
+#include <xmmintrin.h>
+__m128i a;
+void x(unsigned short n) {
+ a = _mm_slli_epi32 (a, n);
+}
+void y(unsigned n) {
+ a = _mm_slli_epi32 (a, n);
+}
+
+compile to ( -O3 -static -fomit-frame-pointer):
+_x:
+ movzwl 4(%esp), %eax
+ movd %eax, %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+_y:
+ movd 4(%esp), %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+
+"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
+like movd would be sufficient in both cases as the value is already zero
+extended in the 32-bit stack slot IIRC. For signed short, it should also be
+save, as a really-signed value would be undefined for pslld.
+
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+int t1(double d) { return signbit(d); }
+
+This currently compiles to:
+ subl $12, %esp
+ movsd 16(%esp), %xmm0
+ movsd %xmm0, (%esp)
+ movl 4(%esp), %eax
+ shrl $31, %eax
+ addl $12, %esp
+ ret
+
+We should use movmskp{s|d} instead.
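+
+As a C sketch with the SSE2 intrinsic (illustrative only):
+
+#include <emmintrin.h>
+int t1_movmsk(double d) {
+  /* movmskpd copies the sign bits of both lanes into the low two bits */
+  return _mm_movemask_pd(_mm_set_sd(d)) & 1;
+}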
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
+(aligned) vector load. This functionality has a couple of problems.
+
+1. The code to infer alignment from loads of globals is in the X86 backend,
+ not the dag combiner. This is because dagcombine2 needs to be able to see
+ through the X86ISD::Wrapper node, which DAGCombine can't really do.
+2. The code for turning 4 x load into a single vector load is target
+ independent and should be moved to the dag combiner.
+3. The code for turning 4 x load into a vector load can only handle a direct
+ load from a global or a direct load from the stack. It should be generalized
+ to handle any load from P, P+4, P+8, P+12, where P can be anything.
+4. The alignment inference code cannot handle loads from globals in non-static
+ mode because it doesn't look through the extra dyld stub load. If you try
+ vec_align.ll without -relocation-model=static, you'll see what I mean.
+
+//===---------------------------------------------------------------------===//
+
+We should lower store(fneg(load p), q) into an integer load+xor+store, which
+eliminates a constant pool load. For example, consider:
+
+define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
+entry:
+ %tmp6 = sub float -0.000000e+00, %z.1 ; <float> [#uses=1]
+ %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
+ ret i64 %tmp20
+}
+
+This currently compiles to:
+
+LCPI1_0: # <4 x float>
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+_ccosf:
+ subl $12, %esp
+ movss 16(%esp), %xmm0
+ movss %xmm0, 4(%esp)
+ movss 20(%esp), %xmm0
+ xorps LCPI1_0, %xmm0
+ movss %xmm0, (%esp)
+ call L_ccoshf$stub
+ addl $12, %esp
+ ret
+
+Note the load into xmm0, then xor (to negate), then store. In PIC mode,
+this code computes the pic base and does two loads to do the constant pool
+load, so the improvement is much bigger.
+
+The tricky part about this xform is that the argument load/store isn't exposed
+until post-legalize, and at that point, the fneg has been custom expanded into
+an X86 fxor. This means that we need to handle this case in the x86 backend
+instead of in target independent code.
+
+//===---------------------------------------------------------------------===//
+
+Non-SSE4 insert into 16 x i8 is atrociously bad.
+
+//===---------------------------------------------------------------------===//
+
+<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
+is memory.
+
+//===---------------------------------------------------------------------===//
+
+SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
+sitting between the truncate and the extract.
+
+//===---------------------------------------------------------------------===//
+
+INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
+any number of 0.0 simultaneously. Currently we only use it for simple
+insertions.
+
+See comments in LowerINSERT_VECTOR_ELT_SSE4.
+
+//===---------------------------------------------------------------------===//
+
+On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
+Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
+legal; it'll just take a few extra patterns written in the .td file.
+
+Note: this is not a code quality issue; the custom lowered code happens to be
+right, but we shouldn't have to custom lower anything. This is probably related
+to <2 x i64> ops being so bad.
+
+//===---------------------------------------------------------------------===//
+
+'select' on vectors and scalars could be a whole lot better. We currently
+lower them to conditional branches. On x86-64 for example, we compile this:
+
+double test(double a, double b, double c, double d) { return a<b ? c : d; }
+
+to:
+
+_test:
+ ucomisd %xmm0, %xmm1
+ ja LBB1_2 # entry
+LBB1_1: # entry
+ movapd %xmm3, %xmm2
+LBB1_2: # entry
+ movapd %xmm2, %xmm0
+ ret
+
+instead of:
+
+_test:
+ cmpltsd %xmm1, %xmm0
+ andpd %xmm0, %xmm2
+ andnpd %xmm3, %xmm0
+ orpd %xmm2, %xmm0
+ ret
+
+For unpredictable branches, the latter is much more efficient. This should
+just be a matter of having scalar sse map to SELECT_CC and custom expanding
+or iseling it.
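+
+The cmp/and/andnot/or sequence is the standard branch-free select. As a C
+sketch with SSE2 intrinsics (illustrative only; names are made up):
+
+#include <emmintrin.h>
+double sel(double a, double b, double c, double d) {
+  /* mask is all-ones if a < b, else all-zeros */
+  __m128d m = _mm_cmplt_sd(_mm_set_sd(a), _mm_set_sd(b));
+  /* (mask & c) | (~mask & d) */
+  __m128d r = _mm_or_pd(_mm_and_pd(m, _mm_set_sd(c)),
+                        _mm_andnot_pd(m, _mm_set_sd(d)));
+  return _mm_cvtsd_f64(r);
+}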
+
+//===---------------------------------------------------------------------===//
+
+LLVM currently generates stack realignment code when it is not actually
+needed. The problem is that we need to know about stack alignment too early,
+before RA runs.
+
+At that point we don't know whether there will be a vector spill or not.
+The stack realignment logic is overly conservative here, but otherwise we
+could produce unaligned loads/stores.
+
+Fixing this will require some huge RA changes.
+
+Testcase:
+#include <emmintrin.h>
+
+typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
+
+static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
+                          -22725, -12873};
+
+vSInt16 madd(vSInt16 b)
+{
+ return _mm_madd_epi16(a, b);
+}
+
+Generated code (x86-32, linux):
+madd:
+ pushl %ebp
+ movl %esp, %ebp
+ andl $-16, %esp
+ movaps .LCPI1_0, %xmm1
+ pmaddwd %xmm1, %xmm0
+ movl %ebp, %esp
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+#include <emmintrin.h>
+__m128 foo2 (float x) {
+ return _mm_set_ps (0, 0, x, 0);
+}
+
+In x86-32 mode, we generate this spiffy code:
+
+_foo2:
+ movss 4(%esp), %xmm0
+ pshufd $81, %xmm0, %xmm0
+ ret
+
+in x86-64 mode, we generate this code, which could be better:
+
+_foo2:
+ xorps %xmm1, %xmm1
+ movss %xmm0, %xmm1
+ pshufd $81, %xmm1, %xmm0
+ ret
+
+In sse4 mode, we could use insertps to make both better.
+
+Here's another testcase that could use insertps [mem]:
+
+#include <xmmintrin.h>
+extern float x2, x3;
+__m128 foo1 (float x1, float x4) {
+ return _mm_set_ps (x2, x1, x3, x4);
+}
+
+gcc mainline compiles it to:
+
+foo1:
+ insertps $0x10, x2(%rip), %xmm0
+ insertps $0x10, x3(%rip), %xmm1
+ movaps %xmm1, %xmm2
+ movlhps %xmm0, %xmm2
+ movaps %xmm2, %xmm0
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We compile vector multiply-by-constant into poor code:
+
+define <4 x i32> @f(<4 x i32> %i) nounwind {
+ %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
+ ret <4 x i32> %A
+}
+
+On targets without SSE4.1, this compiles into:
+
+LCPI1_0: ## <4 x i32>
+ .long 10
+ .long 10
+ .long 10
+ .long 10
+ .text
+ .align 4,0x90
+ .globl _f
+_f:
+ pshufd $3, %xmm0, %xmm1
+ movd %xmm1, %eax
+ imull LCPI1_0+12, %eax
+ movd %eax, %xmm1
+ pshufd $1, %xmm0, %xmm2
+ movd %xmm2, %eax
+ imull LCPI1_0+4, %eax
+ movd %eax, %xmm2
+ punpckldq %xmm1, %xmm2
+ movd %xmm0, %eax
+ imull LCPI1_0, %eax
+ movd %eax, %xmm1
+ movhlps %xmm0, %xmm0
+ movd %xmm0, %eax
+ imull LCPI1_0+8, %eax
+ movd %eax, %xmm0
+ punpckldq %xmm0, %xmm1
+ movaps %xmm1, %xmm0
+ punpckldq %xmm2, %xmm0
+ ret
+
+It would be better to synthesize integer vector multiplication by constants
+using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
+simple cases such as multiplication by powers of two would be better as
+vector shifts than as multiplications.
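+
+As a C sketch of the shift/add synthesis with SSE2 intrinsics (function name
+is illustrative):
+
+#include <emmintrin.h>
+__m128i mul10(__m128i v) {
+  /* v * 10 == (v << 3) + (v << 1), i.e. two pslld plus one paddd */
+  return _mm_add_epi32(_mm_slli_epi32(v, 3), _mm_slli_epi32(v, 1));
+}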
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+__m128i
+foo2 (char x)
+{
+ return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
+}
+
+into:
+ movl $1, %eax
+ xorps %xmm0, %xmm0
+ pinsrw $2, %eax, %xmm0
+ movzbl 4(%esp), %eax
+ pinsrw $3, %eax, %xmm0
+ movl $256, %eax
+ pinsrw $7, %eax, %xmm0
+ ret
+
+
+gcc-4.2:
+ subl $12, %esp
+ movzbl 16(%esp), %eax
+ movdqa LC0, %xmm0
+ pinsrw $3, %eax, %xmm0
+ addl $12, %esp
+ ret
+ .const
+ .align 4
+LC0:
+ .word 0
+ .word 0
+ .word 1
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+ .word 256
+
+With SSE4, it should be
+ movdqa .LC0(%rip), %xmm0
+ pinsrb $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single
+vector of constants. Also, insertelement of a constant into a vector of
+constants should result in a vector of constants, e.g. 2008-06-25-VecISelBug.ll.
+
+We compiled it to something horrible:
+
+ .align 4
+LCPI1_1: ## float
+ .long 1065353216 ## float 1
+ .const
+
+ .align 4
+LCPI1_0: ## <4 x float>
+ .space 4
+ .long 1065353216 ## float 1
+ .space 4
+ .long 1065353216 ## float 1
+ .text
+ .align 4,0x90
+ .globl _t
+_t:
+ xorps %xmm0, %xmm0
+ movhps LCPI1_0, %xmm0
+ movss LCPI1_1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $2, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+ return x;
+}
+
+is turned into this IR:
+
+define float @foo(i8 zeroext %x) nounwind {
+ %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
+ ret float %tmp12
+}
+
+which compiles to (x86-32):
+
+_foo:
+ subl $4, %esp
+ movzbl 8(%esp), %eax
+ cvtsi2ss %eax, %xmm0
+ movss %xmm0, (%esp)
+ flds (%esp)
+ addl $4, %esp
+ ret
+
+We should be able to use:
+ cvtsi2ss 8(%esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE. SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+ return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:
+ .long 1082130432 ## float 4.000000e+00
+_MonteCarlo_num_flops:
+ subl $4, %esp
+ movl 8(%esp), %eax
+ movl %eax, (%esp)
+ fildl (%esp)
+ fmuls LCPI1_0
+ addl $4, %esp
+ ret
+
+in SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+ subl $12, %esp
+ cvtsi2sd 16(%esp), %xmm0
+ mulsd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+There are also other cases in scimark where using fpstack is better; for
+example, it is cheaper to do fld1 than to load from a constant pool, so
+"load, add 1.0, store" is better done in the fp stack, etc.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-UNIMPLEMENTED.txt b/lib/Target/X86/README-UNIMPLEMENTED.txt
new file mode 100644
index 0000000..69dc8ee
--- /dev/null
+++ b/lib/Target/X86/README-UNIMPLEMENTED.txt
@@ -0,0 +1,14 @@
+//===---------------------------------------------------------------------===//
+// Testcases that crash the X86 backend because they aren't implemented
+//===---------------------------------------------------------------------===//
+
+These are cases we know the X86 backend doesn't handle. Patches are welcome
+and appreciated, because no one has signed up to implement these yet.
+Implementing these would allow elimination of the corresponding intrinsics,
+which would be great.
+
+1) vector shifts
+2) vector comparisons
+3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
+4) bitcasts from vectors to scalars: PR2804
+
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
new file mode 100644
index 0000000..ad12137
--- /dev/null
+++ b/lib/Target/X86/README-X86-64.txt
@@ -0,0 +1,251 @@
+//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
+
+Implement different PIC models? Right now we only support Mac OS X with small
+PIC code model.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+extern void xx(void);
+void bar(void) {
+ xx();
+}
+
+gcc compiles to:
+
+.globl _bar
+_bar:
+ jmp _xx
+
+We need to do the tailcall optimization as well.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
+
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+ ucomiss LC0(%rip), %xmm0
+ cvttss2siq %xmm0, %rdx
+ jb L3
+ subss LC0(%rip), %xmm0
+ movabsq $-9223372036854775808, %rax
+ cvttss2siq %xmm0, %rdx
+ xorq %rax, %rdx
+L3:
+ movq %rdx, %rax
+ ret
+
+instead of
+
+_conv:
+ movss LCPI1_0(%rip), %xmm1
+ cvttss2siq %xmm0, %rcx
+ movaps %xmm0, %xmm2
+ subss %xmm1, %xmm2
+ cvttss2siq %xmm2, %rax
+ movabsq $-9223372036854775808, %rdx
+ xorq %rdx, %rax
+ ucomiss %xmm1, %xmm0
+ cmovb %rcx, %rax
+ ret
+
+Seems like the jb branch has a high likelihood of being taken. It would
+have saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen:
+
+int X[2];
+int b;
+void test(void) {
+ memset(X, b, 2*sizeof(X[0]));
+}
+
+llc:
+ movq _b@GOTPCREL(%rip), %rax
+ movzbq (%rax), %rax
+ movq %rax, %rcx
+ shlq $8, %rcx
+ orq %rax, %rcx
+ movq %rcx, %rax
+ shlq $16, %rax
+ orq %rcx, %rax
+ movq %rax, %rcx
+ shlq $32, %rcx
+ movq _X@GOTPCREL(%rip), %rdx
+ orq %rax, %rcx
+ movq %rcx, (%rdx)
+ ret
+
+gcc:
+ movq _b@GOTPCREL(%rip), %rax
+ movabsq $72340172838076673, %rdx
+ movzbq (%rax), %rax
+ imulq %rdx, %rax
+ movq _X@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+ ret
+
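+The magic constant 72340172838076673 is 0x0101010101010101; multiplying by
+it replicates a byte into all eight byte lanes. A C sketch of the idiom:
+
+#include <stdint.h>
+uint64_t splat8(uint8_t b) {
+  /* each set bit of the constant contributes one shifted copy of b */
+  return (uint64_t)b * 0x0101010101010101ULL;
+}
+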
+//===---------------------------------------------------------------------===//
+
+The vararg function prologue can be further optimized. Currently all XMM
+registers are stored into the register save area. Most of the stores can be
+eliminated since the upper bound on the number of XMM registers used is
+passed in %al. gcc produces something like the following:
+
+ movzbl %al, %edx
+ leaq 0(,%rdx,4), %rax
+ leaq 4+L2(%rip), %rdx
+ leaq 239(%rsp), %rax
+ jmp *%rdx
+ movaps %xmm7, -15(%rax)
+ movaps %xmm6, -31(%rax)
+ movaps %xmm5, -47(%rax)
+ movaps %xmm4, -63(%rax)
+ movaps %xmm3, -79(%rax)
+ movaps %xmm2, -95(%rax)
+ movaps %xmm1, -111(%rax)
+ movaps %xmm0, -127(%rax)
+L2:
+
+It jumps over the movaps that do not need to be stored. It is hard to see
+this being significant, as it adds 5 instructions (including an indirect
+branch) to avoid executing 0 to 8 stores in the function prologue.
+
+Perhaps we can optimize for the common case where no XMM registers are used
+for parameter passing, i.e. if %al == 0, jump over all stores. Or, in the
+case of a leaf function where we can determine that no XMM input parameter
+is needed, avoid emitting the stores at all.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 has a complex calling convention for aggregate passing by value:
+
+1. If the size of an object is larger than two eightbytes, or in C++, is a non-
+ POD structure or union type, or contains unaligned fields, it has class
+ MEMORY.
+2. Both eightbytes get initialized to class NO_CLASS.
+3. Each field of an object is classified recursively so that always two fields
+ are considered. The resulting class is calculated according to the classes
+ of the fields in the eightbyte:
+ (a) If both classes are equal, this is the resulting class.
+ (b) If one of the classes is NO_CLASS, the resulting class is the other
+ class.
+ (c) If one of the classes is MEMORY, the result is the MEMORY class.
+ (d) If one of the classes is INTEGER, the result is the INTEGER.
+ (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as
+ class.
+ (f) Otherwise class SSE is used.
+4. Then a post merger cleanup is done:
+ (a) If one of the classes is MEMORY, the whole argument is passed in memory.
+ (b) If SSEUP is not preceded by SSE, it is converted to SSE.
+
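+A C sketch of the per-eightbyte class merge described in rule 3 (enum and
+function names are illustrative, not the actual frontend code):
+
+enum Class { NO_CLASS, INTEGER, SSE, SSEUP, X87, X87UP, COMPLEX_X87, MEMORY };
+
+static enum Class merge(enum Class a, enum Class b) {
+  if (a == b) return a;                                /* rule (a) */
+  if (a == NO_CLASS) return b;                         /* rule (b) */
+  if (b == NO_CLASS) return a;
+  if (a == MEMORY || b == MEMORY) return MEMORY;       /* rule (c) */
+  if (a == INTEGER || b == INTEGER) return INTEGER;    /* rule (d) */
+  if (a == X87 || a == X87UP || a == COMPLEX_X87 ||
+      b == X87 || b == X87UP || b == COMPLEX_X87)
+    return MEMORY;                                     /* rule (e) */
+  return SSE;                                          /* rule (f) */
+}
+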
+Currently the llvm frontend does not handle this correctly.
+
+Problem 1:
+ typedef struct { int i; double d; } QuadWordS;
+It is currently passed in two i64 integer registers. However, a
+gcc-compiled callee expects the second element 'd' to be passed in XMM0.
+
+Problem 2:
+ typedef struct { int32_t i; float j; double d; } QuadWordS;
+The size of the first two fields == i64, so they will be combined and
+passed in an integer register (RDI). The third field is still passed in
+XMM0.
+
+Problem 3:
+ typedef struct { int64_t i; int8_t j; int64_t d; } S;
+ void test(S s)
+The size of this aggregate is greater than two i64s, so it should be passed
+in memory. Currently llvm breaks it down and passes it in three integer
+registers.
+
+Problem 4:
+Taking problem 3 one step further: a function expects an aggregate value
+in memory followed by more parameter(s) passed in register(s).
+ void test(S s, int b)
+
+LLVM IR does not allow passing aggregates as parameters, so the frontend
+must break the aggregate value (in problems 3 and 4) into a number of
+scalar values:
+ void %test(long %s.i, byte %s.j, long %s.d);
+
+However, if the backend were to lower this code literally, it would pass
+the 3 values in integer registers. To force them to be passed in memory,
+the frontend should change the function signature to:
+ void %test(long %undef1, long %undef2, long %undef3, long %undef4,
+ long %undef5, long %undef6,
+ long %s.i, byte %s.j, long %s.d);
+And the call site would look something like this:
+ call void %test( undef, undef, undef, undef, undef, undef,
+ %tmp.s.i, %tmp.s.j, %tmp.s.d );
+The first 6 undef parameters would exhaust the 6 integer registers used for
+parameter passing. The following three integer values would then be forced into
+memory.
+
+For problem 4, the parameter 'd' would be moved to the front of the
+parameter list so it will be passed in a register:
+ void %test(int %d,
+ long %undef1, long %undef2, long %undef3, long %undef4,
+ long %undef5, long %undef6,
+ long %s.i, byte %s.j, long %s.d);
+
+//===---------------------------------------------------------------------===//
+
+Right now the asm printer assumes GlobalAddress are accessed via RIP relative
+addressing. Therefore, it is not possible to generate this:
+ movabsq $__ZTV10polynomialIdE+16, %rax
+
+That is ok for now since we currently only support the small code model, so
+the above is selected as:
+ leaq __ZTV10polynomialIdE+16(%rip), %rax
+
+This is probably slightly slower but is much shorter than movabsq. However,
+if we were to support medium or larger code models, we would need to use
+the movabs instruction. We should probably introduce something like
+AbsoluteAddress to
+distinguish it from GlobalAddress so the asm printer and JIT code emitter can
+do the right thing.
+
+//===---------------------------------------------------------------------===//
+
+It's not possible to reference the AH, BH, CH, and DH registers in an
+instruction requiring a REX prefix. However, divb and mulb both produce
+results in AH. If isel emits a CopyFromReg, it gets turned into a movb
+whose destination may be allocated one of r8b - r15b, which is invalid.
+
+To get around this, isel emits a CopyFromReg from AX and then right-shifts
+it by 8 and truncates it. It's not pretty, but it works. We need some
+register allocation magic to make the hack go away (e.g. putting additional
+constraints on the result of the movb).
+
+//===---------------------------------------------------------------------===//
+
+The x86-64 ABI for hidden-argument struct returns requires that the
+incoming value of %rdi be copied into %rax by the callee upon return.
+
+The idea is that it saves callers from having to remember this value,
+which would often require a callee-saved register. Callees usually
+need to keep this value live for most of their body anyway, so it
+doesn't add a significant burden on them.
+
+We currently implement this in codegen; however, this is suboptimal because
+it makes it quite awkward to implement the optimization for callers.
+
+A better implementation would be to relax the LLVM IR rules for sret
+arguments to allow a function with an sret argument to have a non-void
+return type, and to have the front-end set up the sret argument value
+as the return value of the function. The front-end could more easily
+emit uses of the returned struct value to be in terms of the function's
+lowered return value, and it would free non-C frontends from a
+complication only required by a C-based ABI.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
new file mode 100644
index 0000000..710bd03
--- /dev/null
+++ b/lib/Target/X86/README.txt
@@ -0,0 +1,1899 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend.
+//===---------------------------------------------------------------------===//
+
+We should add support for the "movbe" instruction, which does a
+byte-swapping copy (3-addr bswap + memory support?). It is available on
+Atom processors.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case. We should investigate
+why this isn't happening; it could have a significant impact on other
+important cases for X86 as well.
+
+//===---------------------------------------------------------------------===//
+
+This should be one DIV/IDIV instruction, not a libcall:
+
+unsigned test(unsigned long long X, unsigned Y) {
+ return X/Y;
+}
+
+This can be done trivially with a custom legalizer. What about overflow
+though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+
+//===---------------------------------------------------------------------===//
+
+Improvements to the multiply -> shift/add algorithm:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
+
+//===---------------------------------------------------------------------===//
+
+Improve code like this (occurs fairly frequently, e.g. in LLVM):
+long long foo(int x) { return 1LL << x; }
+
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
+
+Another useful one would be ~0ULL >> X and ~0ULL << X.
+
+One better solution for 1LL << x is:
+ xorl %eax, %eax
+ xorl %edx, %edx
+ testb $32, %cl
+ sete %al
+ setne %dl
+ sall %cl, %eax
+ sall %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+Also, this might be better. It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+ movl %ecx, %eax
+ shrl $5, %eax
+ movl %eax, %edx
+ xorl $1, %edx
+ sall %cl, %eax
+ sall %cl, %edx
+
+64-bit shifts (in general) expand to really bad code. Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
+//===---------------------------------------------------------------------===//
+
+Compile this:
+_Bool f(_Bool a) { return a!=1; }
+
+into:
+ movzbl %dil, %eax
+ xorl $1, %eax
+ ret
+
+(Although note that this isn't a legal way to express the code that llvm-gcc
+currently generates for that function.)
+
+//===---------------------------------------------------------------------===//
+
+Some isel ideas:
+
+1. Dynamic programming based approach when compile time is not an
+   issue.
+2. Code duplication (addressing mode) during isel.
+3. Other ideas from "Register-Sensitive Selection, Duplication, and
+ Sequencing of Instructions".
+4. Scheduling for reduced register pressure. E.g. "Minimum Register
+ Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
+ and other related papers.
+ http://citeseer.ist.psu.edu/govindarajan01minimum.html
+
+//===---------------------------------------------------------------------===//
+
+Should we promote i16 to i32 to avoid partial register update stalls?
+
+//===---------------------------------------------------------------------===//
+
+Leave any_extend as pseudo instruction and hint to register
+allocator. Delay codegen until post register allocation.
+Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
+
+//===---------------------------------------------------------------------===//
+
+It appears icc uses push for parameter passing. We need to investigate.
+
+//===---------------------------------------------------------------------===//
+
+Only use inc/neg/not instructions on processors where they are faster than
+add/sub/xor. They are slower on the P4 due to only updating some processor
+flags.
+
+//===---------------------------------------------------------------------===//
+
+The instruction selector sometimes misses folding a load into a compare. The
+pattern is written as (cmp reg, (load p)). Because the compare isn't
+commutative, it is not matched with the load on both sides. The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
+
+//===---------------------------------------------------------------------===//
+
+How about intrinsics? An example is:
+ *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
+
+compiles to
+ pmuludq (%eax), %xmm0
+ movl 8(%esp), %eax
+ movdqa (%eax), %xmm1
+ pmulhuw %xmm0, %xmm1
+
+The transformation probably requires a X86 specific pass or a DAG combiner
+target specific hook.
+
+//===---------------------------------------------------------------------===//
+
+In many cases, LLVM generates code like this:
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setl %al
+ movzbl %al, %eax
+ ret
+
+On some processors (which ones?), it is more efficient to do this:
+
+_test:
+ movl 8(%esp), %ebx
+ xor %eax, %eax
+ cmpl %ebx, 4(%esp)
+ setl %al
+ ret
+
+Doing this correctly is tricky though, as the xor clobbers the flags.
+
+//===---------------------------------------------------------------------===//
+
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when codesize is important. e.g., for:
+
+void setbit(int *target, int bit) {
+ *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+ *target &= ~(1 << bit);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instead of the following for memset char*, 1, 10:
+
+ movl $16843009, 4(%edx)
+ movl $16843009, (%edx)
+ movw $257, 8(%edx)
+
+It might be better to generate
+
+ movl $16843009, %eax
+ movl %eax, 4(%edx)
+ movl %eax, (%edx)
+ movw %ax, 8(%edx)
+
+when we can spare a register. It reduces code size.
+
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
+get this:
+
+define i32 @test1(i32 %X) {
+ %Y = sdiv i32 %X, 8
+ ret i32 %Y
+}
+
+_test1:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ sarl $31, %ecx
+ shrl $29, %ecx
+ addl %ecx, %eax
+ sarl $3, %eax
+ ret
+
+GCC knows several different ways to codegen it, one of which is this:
+
+_test1:
+ movl 4(%esp), %eax
+ cmpl $-1, %eax
+ leal 7(%eax), %ecx
+ cmovle %ecx, %eax
+ sarl $3, %eax
+ ret
+
+which is probably slower, but it's interesting at least :)
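+
+In C terms, the adjustment the first sequence performs is (a sketch,
+assuming a 32-bit int with arithmetic right shift):
+
+int sdiv8(int x) {
+  int bias = (x >> 31) & 7;  /* 7 if x is negative, else 0 */
+  /* adding the bias makes the shift round toward zero, as sdiv requires */
+  return (x + bias) >> 3;
+}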
+
+//===---------------------------------------------------------------------===//
+
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
+rep/movsl. We should leave these as libcalls for everything over a much
+lower threshold, since libc is hand-tuned for medium and large mem ops
+(avoiding RFO for large stores, TLB preheating, etc.).
+
+//===---------------------------------------------------------------------===//
+
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
+
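+A C sketch of a reasonable expansion (assuming IEEE-754 doubles; the
+function name is made up):
+
+#include <stdint.h>
+#include <string.h>
+double mulsigns(double x, double y, double z) {
+  /* multiplying by copysign(1.0, y) * copysign(1.0, z) only xors the
+     sign bits of y and z into x */
+  uint64_t xb, yb, zb;
+  memcpy(&xb, &x, 8);
+  memcpy(&yb, &y, 8);
+  memcpy(&zb, &z, 8);
+  xb ^= (yb ^ zb) & 0x8000000000000000ULL;
+  memcpy(&x, &xb, 8);
+  return x;
+}
+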
+//===---------------------------------------------------------------------===//
+
+Optimize copysign(x, *y) to use an integer load from y.
+
+//===---------------------------------------------------------------------===//
+
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
+
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
+FR64 to VR128.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+ if (_mm_comige_ss(*A, *B))
+ return 3;
+ else
+ return 4;
+}
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ movl 4(%esp), %eax
+ movaps (%eax), %xmm1
+ comiss %xmm0, %xmm1
+ setae %al
+ movzbl %al, %ecx
+ movl $3, %eax
+ movl $4, %edx
+ cmpl $0, %ecx
+ cmove %edx, %eax
+ ret
+
+Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae.
+There are a number of issues. 1) We are introducing a setcc between the
+result of the intrinsic call and the select. 2) The intrinsic is expected to
+produce an i32 value, so an any-extend (which becomes a zero-extend) is
+added.
+
+We probably need some kind of target DAG combine hook to fix this.
+
+//===---------------------------------------------------------------------===//
+
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+
+There is also one case we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+ return a * 3;
+}
+
+We currently emit
+ imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate? Is imull three or four
+cycles? Note: ICC generates this:
+ movl 4(%esp), %eax
+ leal (%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former
+is more "complex" because it folds a load, so the latter will not be
+emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
+//===---------------------------------------------------------------------===//
+
+__builtin_ffs codegen is messy.
+
+int ffs_(unsigned X) { return __builtin_ffs(X); }
+
+llvm produces:
+ffs_:
+ movl 4(%esp), %ecx
+ bsfl %ecx, %eax
+ movl $32, %edx
+ cmove %edx, %eax
+ incl %eax
+ xorl %edx, %edx
+ testl %ecx, %ecx
+ cmove %edx, %eax
+ ret
+
+vs gcc:
+
+_ffs_:
+ movl $-1, %edx
+ bsfl 4(%esp), %eax
+ cmove %edx, %eax
+ addl $1, %eax
+ ret
+
+Another example of __builtin_ffs (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+ if (j)
+ return __builtin_ffs (j) - 1;
+ else
+ return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h; there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+define i32 @foo(i32* %a, i32 %t) {
+entry:
+ br label %cond_true
+
+cond_true: ; preds = %cond_true, %entry
+ %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
+ %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
+ %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
+ %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
+ %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
+ %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
+ br i1 %tmp, label %bb12, label %cond_true
+
+bb12: ; preds = %cond_true
+ ret i32 %tmp7
+}
+is pessimized by -loop-reduce and -indvars
+
+//===---------------------------------------------------------------------===//
+
+u32 to float conversion improvement:
+
+float uint32_2_float( unsigned u ) {
+ float fl = (int) (u & 0xffff);
+ float fh = (int) (u >> 16);
+ fh *= 0x1.0p16f;
+ return fh + fl;
+}
+
+00000000 subl $0x04,%esp
+00000003 movl 0x08(%esp,1),%eax
+00000007 movl %eax,%ecx
+00000009 shrl $0x10,%ecx
+0000000c cvtsi2ss %ecx,%xmm0
+00000010 andl $0x0000ffff,%eax
+00000015 cvtsi2ss %eax,%xmm1
+00000019 mulss 0x00000078,%xmm0
+00000021 addss %xmm1,%xmm0
+00000025 movss %xmm0,(%esp,1)
+0000002a flds (%esp,1)
+0000002d addl $0x04,%esp
+00000030 ret
+
+//===---------------------------------------------------------------------===//
+
+When using the fastcc ABI, align the stack slot of a double argument on an
+8-byte boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+
+int f(int a, int b) {
+ if (a == 4 || a == 6)
+ b++;
+ return b;
+}
+
+
+as:
+
+or eax, 2
+cmp eax, 6
+jz label
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b". For example, instead of:
+
+int G;
+void f(int X, int Y) {
+ G = X < 0 ? 14 : 13;
+}
+
+compiling to:
+
+_f:
+ movl $14, %eax
+ movl $13, %ecx
+ movl 4(%esp), %edx
+ testl %edx, %edx
+ cmovl %eax, %ecx
+ movl %ecx, _G
+ ret
+
+it could be:
+_f:
+ movl 4(%esp), %eax
+ sarl $31, %eax
+ notl %eax
+ addl $14, %eax
+ movl %eax, _G
+ ret
+
+etc.
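+
+In C terms, the branch-free form above is (a sketch, assuming arithmetic
+right shift of negative ints):
+
+int sel_14_13(int x) {
+  /* x >> 31 is -1 when x < 0, else 0; ~(-1) == 0 and ~0 == -1,
+     so this yields 14 when x < 0 and 13 otherwise */
+  return 14 + ~(x >> 31);
+}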
+
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+ return (a < b ? -1 : 0);
+}
+to:
+_usesbb:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ sbbl %eax, %eax
+ ret
+
+instead of:
+_usesbb:
+ xorl %eax, %eax
+ movl 8(%esp), %ecx
+ cmpl %ecx, 4(%esp)
+ movl $4294967295, %ecx
+ cmovb %ecx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Currently we don't have elimination of redundant stack manipulations. Consider
+the code:
+
+int %main() {
+entry:
+ call fastcc void %test1( )
+ call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
+ ret int 0
+}
+
+declare fastcc void %test1()
+
+declare fastcc void %test2(sbyte*)
+
+
+This currently compiles to:
+
+ subl $16, %esp
+ call _test5
+ addl $12, %esp
+ subl $16, %esp
+ movl $_test5, (%esp)
+ call _test6
+ addl $12, %esp
+
+The add/sub pair is really unneeded here.
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+define i32 @test3(i32 %X) {
+ %tmp1 = urem i32 %X, 255
+ ret i32 %tmp1
+}
+
+Currently it compiles to:
+
+...
+ movl $2155905153, %ecx
+ movl 8(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+...
+
+This could be "reassociated" into:
+
+ movl $2155905153, %eax
+ movl 8(%esp), %ecx
+ mull %ecx
+
+to avoid the copy. In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction. I guess this has
+to be done at isel time based on the #uses of the mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary. In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
+27 27 0x3d0d jnz 0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+ if ((p[0] == 1) & (p[1] == 2)) return 1;
+ return 0;
+}
+
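+A sketch of what the combined form computes (hypothetical variant; assumes a
+little-endian target, where bytes p[0]=1, p[1]=2 form the 16-bit value 0x0201):
+
+#include <string.h>
+
+int f2(const char *p) {
+  unsigned short v;
+  memcpy(&v, p, sizeof v);   /* the single 16-bit load */
+  return v == 0x0201;
+}
+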
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
+
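+For example (a sketch; with SSE, lrintf on x86 is a single cvtss2si, which
+rounds using the current rounding mode just as lrintf requires):
+
+#include <math.h>
+
+long round_it(float f) {   /* hypothetical name */
+  return lrintf(f);        /* should inline to cvtss2si, not a libcall */
+}
+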
+//===---------------------------------------------------------------------===//
+
+Start using the flags more. For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x += y) == 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+ addl %esi, (%rdi)
+ movl %edx, %eax
+ cmovne %ecx, %eax
+ ret
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ movl %esi, (%rdi)
+ testl %esi, %esi
+ cmove %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+and:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x + y) < 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+
+add_zf:
+ addl (%rdi), %esi
+ movl %edx, %eax
+ cmovns %ecx, %eax
+ ret
+
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ testl %esi, %esi
+ cmovs %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ movl 8(%esp), %edx
+ cmpl %edx, %ecx
+ jne LBB1_2 #UnifiedReturnBlock
+LBB1_1: #cond_true
+ addl $2, %eax
+ ret
+LBB1_2: #UnifiedReturnBlock
+ movl %ecx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ cmpl 8(%esp), %ecx
+ sete %cl
+ movzbl %cl, %ecx
+ leal 1(%ecx,%eax), %eax
+ ret
+
+both of which are inferior to GCC's:
+
+_f:
+ movl 4(%esp), %edx
+ leal 1(%edx), %eax
+ addl $2, %edx
+ cmpl 8(%esp), %eax
+ cmove %edx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ addl $1, %eax
+ xorl %edx, %edx
+ cmpl 8(%esp), %eax
+ sete %dl
+ addl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+ if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne LBB1_1
+ addl $12, %esp
+ ret
+LBB1_1:
+ call L_abort$stub
+
+It would be better to produce:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne L_abort$stub
+ addl $12, %esp
+ ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ subl $12, %esp
+ call L_abort$stub
+
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ pop %eax # realign stack.
+ call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead (likewise dec). For example, on X86-64, compile:
+
+int foo(int A, int B) {
+ return A+1;
+}
+
+to:
+
+_foo:
+ leal 1(%edi), %eax
+ ret
+
+instead of:
+
+_foo:
+ incl %edi
+ movl %edi, %eax
+ ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y. Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered. However, this copy is not needed if the register
+;; allocator turns the shift into an LEA. This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN: not grep {mov E.X, E.X}
+
+@G = external global i32 ; <i32*> [#uses=3]
+
+define i32 @test1(i32 %X, i32 %Y) {
+ %Z = add i32 %X, %Y ; <i32> [#uses=1]
+ volatile store i32 %Y, i32* @G
+ volatile store i32 %Z, i32* @G
+ ret i32 %X
+}
+
+define i32 @test2(i32 %X) {
+ %Z = add i32 %X, 1 ; <i32> [#uses=1]
+ volatile store i32 %Z, i32* @G
+ ret i32 %X
+}
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+ movl $7, %eax
+ movsbl 4(%esp), %ecx
+ subl %ecx, %eax
+ ret
+
+We would use one fewer register if codegen'd as:
+
+ movsbl 4(%esp), %eax
+ neg %eax
+ add $7, %eax
+ ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+ movl $7, %eax
+ subl 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Leaf functions that require one 4-byte spill slot have a prolog like this:
+
+_foo:
+ pushl %esi
+ subl $4, %esp
+...
+and an epilog like this:
+ addl $4, %esp
+ popl %esi
+ ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp. Just don't pop
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple
+branches. We generate really poor code for:
+
+double testf(double a) {
+ return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+ subl $20, %esp
+ pxor %xmm0, %xmm0
+ movsd 24(%esp), %xmm1
+ ucomisd %xmm0, %xmm1
+ setnp %al
+ sete %cl
+ testb %cl, %al
+ jne LBB1_5 # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+ jp LBB1_1
+ je LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+ cvtss2sd LCPI1_0(%rip), %xmm2
+ cvtss2sd LCPI1_1(%rip), %xmm3
+ ucomisd %xmm1, %xmm0
+ ja LBB1_3 # cond_true
+LBB1_2: # cond_true
+ movapd %xmm3, %xmm2
+LBB1_3: # cond_true
+ movapd %xmm2, %xmm0
+ ret
+
+We should sink the load that defines xmm3 into the LBB1_2 block. This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+ #include <algorithm>
+ inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+ { return std::make_pair(a + b, a + b < a); }
+ bool no_overflow(unsigned a, unsigned b)
+ { return !full_add(a, b).second; }
+
+Should compile to:
+
+
+ _Z11no_overflowjj:
+ addl %edi, %esi
+ setae %al
+ ret
+
+FIXME: That code looks wrong; bool return is normally defined as zext.
+
+on x86-64, not:
+
+__Z11no_overflowjj:
+ addl %edi, %esi
+ cmpl %edi, %esi
+ setae %al
+ movzbl %al, %eax
+ ret
+
+
+//===---------------------------------------------------------------------===//
+
+Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
+condition register is dead. xor reg reg is shorter than mov reg, #0.
+
+//===---------------------------------------------------------------------===//
+
+We aren't matching RMW instructions aggressively
+enough. Here's a reduced testcase (more in PR1160):
+
+define void @test(i32* %huge_ptr, i32* %target_ptr) {
+ %A = load i32* %huge_ptr ; <i32> [#uses=1]
+ %B = load i32* %target_ptr ; <i32> [#uses=1]
+ %C = or i32 %A, %B ; <i32> [#uses=1]
+ store i32 %C, i32* %target_ptr
+ ret void
+}
+
+$ llvm-as < t.ll | llc -march=x86-64
+
+_test:
+ movl (%rdi), %eax
+ orl (%rsi), %eax
+ movl %eax, (%rsi)
+ ret
+
+That should be something like:
+
+_test:
+ movl (%rdi), %eax
+ orl %eax, (%rsi)
+ ret
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader: ; preds = %cond_next94
+ %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
+ %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
+ %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
+ %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
+ %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
+ %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
+ %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
+ %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
+ %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
+ %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
+ %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
+ br label %bb114
+
+produces:
+
+LBB3_5: # bb114.preheader
+ movswl -68(%ebp), %eax
+ movl $32, %ecx
+ movl %ecx, -80(%ebp)
+ subl %eax, -80(%ebp)
+ movswl -52(%ebp), %eax
+ movl %ecx, -84(%ebp)
+ subl %eax, -84(%ebp)
+ movswl -70(%ebp), %eax
+ movl %ecx, -88(%ebp)
+ subl %eax, -88(%ebp)
+ movswl -50(%ebp), %eax
+ subl %eax, %ecx
+ movl %ecx, -76(%ebp)
+ movswl -42(%ebp), %eax
+ movl %eax, -92(%ebp)
+ movswl -66(%ebp), %eax
+ movl %eax, -96(%ebp)
+ movw $0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack
+slot into the movl. The above instructions could be:
+ movl $32, -80(%ebp)
+...
+ movl $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+ %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
+ br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+ testw %cx, %cx
+ movswl %cx, %esi
+ jns LBB4_109 # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+ if (foo < 4294967297LL)
+ abort();
+}
+
+to:
+
+compare:
+ subl $4, %esp
+ cmpl $0, 8(%esp)
+ setne %al
+ movzbw %al, %ax
+ cmpl $1, 12(%esp)
+ setg %cl
+ movzbw %cl, %cx
+ cmove %ax, %cx
+ testb $1, %cl
+ jne .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ call abort
+.LBB1_2: # UnifiedReturnBlock
+ addl $4, %esp
+ ret
+
+(also really horrible code on ppc). This is due to the expand code for 64-bit
+compares. GCC produces multiple branches, which is much nicer:
+
+compare:
+ subl $12, %esp
+ movl 20(%esp), %edx
+ movl 16(%esp), %eax
+ decl %edx
+ jle .L7
+.L5:
+ addl $12, %esp
+ ret
+ .p2align 4,,7
+.L7:
+ jl .L4
+ cmpl $0, %eax
+ .p2align 4,,8
+ ja .L5
+.L4:
+ .p2align 4,,9
+ call abort
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place for
+non-tail-call-optimized calls) that source from the caller's arguments
+or from a virtual register (which may itself source from the caller's
+arguments). This is done to prevent overwriting parameters (see the
+example below) that might be used later.
+
+example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+Moving arg1 onto the stack slot of the callee would overwrite arg2 of the
+caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+ overwrite a caller parameter that the callee uses, and push only
+ those onto the top of the stack.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ Here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ Here we need to push the arguments because they overwrite each
+ other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+ int i = 0;
+ unsigned long int z = 0;
+
+ do {
+ z -= 0x00004000;
+ i++;
+ if (i > 0x00040000)
+ abort ();
+ } while (z > 0);
+ exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+ subl $28, %esp
+ xorl %eax, %eax
+ jmp L2
+L3:
+ cmpl $262144, %eax
+ je L10
+L2:
+ addl $1, %eax
+ cmpl $262145, %eax
+ jne L3
+ call L_abort$stub
+L10:
+ movl $0, (%esp)
+ call L_exit$stub
+
+llvm:
+
+_main:
+ subl $12, %esp
+ movl $1, %eax
+ movl $16384, %ecx
+LBB1_1: # bb
+ cmpl $262145, %eax
+ jge LBB1_4 # cond_true
+LBB1_2: # cond_next
+ incl %eax
+ addl $4294950912, %ecx
+ cmpl $16384, %ecx
+ jne LBB1_1 # bb
+LBB1_3: # bb11
+ xorl %eax, %eax
+ addl $12, %esp
+ ret
+LBB1_4: # cond_true
+ call L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+ leal 1(%eax), %edx
+ cmpl $262145, %edx
+ =>
+ cmpl $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+ %Y = fptosi double %X to i64
+ ret i64 %Y
+}
+
+compiles to:
+
+_test:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $20, %esp
+ #FP_REG_KILL
+ ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+ movzwl 4(%esp), %eax
+ orl $255, %eax
+ ret
+
+instead of:
+_foo:
+ movl $255, %eax
+ orl 4(%esp), %eax
+ andl $65535, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We're codegen'ing multiply of long longs inefficiently:
+
+unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
+ return arg1 * arg2;
+}
+
+We compile to (fomit-frame-pointer):
+
+_LLM:
+ pushl %esi
+ movl 8(%esp), %ecx
+ movl 16(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+ imull 12(%esp), %esi
+ addl %edx, %esi
+ imull 20(%esp), %ecx
+ movl %esi, %edx
+ addl %ecx, %edx
+ popl %esi
+ ret
+
+This looks like a scheduling deficiency and lack of remat of the load from
+the argument area. ICC apparently produces:
+
+ movl 8(%esp), %ecx
+ imull 12(%esp), %ecx
+ movl 16(%esp), %eax
+ imull 4(%esp), %eax
+ addl %eax, %ecx
+ movl 4(%esp), %eax
+ mull 12(%esp)
+ addl %ecx, %edx
+ ret
+
+Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2: # bb
+ movl (%esi,%edi,4), %ebx
+ addl (%ecx,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB1_2 # bb
+
+the inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov. Something like:
+
+ movl (%esi,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ addl %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2: # bb
+ movl (%ecx,%edi,4), %ebx
+ subl (%esi,%edi,4), %ebx
+ subl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB9_2 # bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus an
+and or an xor instruction, for example:
+
+ xorpd LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+ movsd (%esp), %xmm0
+ xorpd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+ xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall. Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+ double X = *P;
+ a = X;
+ bar();
+ X = -X;
+ b = X;
+ bar();
+ c = X;
+}
+
+//===---------------------------------------------------------------------===//
+
+Handling llvm.memory.barrier on pre-SSE2 CPUs (which lack mfence) should
+generate a locked no-op read-modify-write such as:
+
+lock ; addl $0, (%esp)
+
+(The lock prefix is only encodable on read-modify-write instructions, so
+lock ; mov is not actually valid.)
+
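+For reference, the kind of source that needs this lowering (a minimal sketch;
+__sync_synchronize is the GCC-style full-barrier builtin):
+
+void full_barrier(void) {
+  __sync_synchronize();   /* becomes llvm.memory.barrier; pre-SSE2 has no mfence */
+}
+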
+//===---------------------------------------------------------------------===//
+
+The generated code on x86 for checking for signed overflow on a multiply done
+the obvious way is much longer than it needs to be.
+
+int x(int a, int b) {
+ long long prod = (long long)a*b;
+ return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
+}
+
+See PR2053 for more details.
+
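+A source-level reformulation that exposes the same check (hypothetical
+variant; relies on truncation wrapping on conversion, which holds on x86):
+
+int x2(int a, int b) {
+  long long prod = (long long)a * b;
+  return (int)prod != prod;   /* true iff the product overflowed 32 bits */
+}
+
+Either form could be matched to a single imull plus a read of the overflow
+flag.
+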
+//===---------------------------------------------------------------------===//
+
+We should investigate using cdq/cltd (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter. Downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+ movl 4(%esp), %eax
+ cltd
+ xorl %edx, %eax
+ subl %edx, %eax
+ ret
+
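+The identity gcc is using, as a C sketch (relies on the arithmetic right
+shift x86 performs for signed types):
+
+int abs_cltd(int x) {   /* hypothetical name */
+  int m = x >> 31;      /* cltd: m = (x < 0) ? -1 : 0 */
+  return (x ^ m) - m;   /* xorl + subl: flips and adjusts only when negative */
+}
+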
+//===---------------------------------------------------------------------===//
+
+Consider:
+int test(unsigned long a, unsigned long b) { return -(a < b); }
+
+We currently compile this to:
+
+define i32 @test(i32 %a, i32 %b) nounwind {
+ %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
+ %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ %tmp5 = sub i32 0, %tmp34 ; <i32> [#uses=1]
+ ret i32 %tmp5
+}
+
+and
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setb %al
+ movzbl %al, %eax
+ negl %eax
+ ret
+
+Several deficiencies here. First, we should instcombine zext+neg into sext:
+
+define i32 @test2(i32 %a, i32 %b) nounwind {
+ %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
+ %tmp34 = sext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ ret i32 %tmp34
+}
+
+However, before we can do that, we have to fix the bad codegen that we get for
+sext from bool:
+
+_test2:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setb %al
+ movzbl %al, %eax
+ shll $31, %eax
+ sarl $31, %eax
+ ret
+
+This code should be at least as good as the code above. Once this is fixed, we
+can optimize this specific case even more to:
+
+ movl 8(%esp), %eax
+ xorl %ecx, %ecx
+ cmpl %eax, 4(%esp)
+ sbbl %ecx, %ecx
+
+//===---------------------------------------------------------------------===//
+
+Take the following code (from
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
+
+extern unsigned char first_one[65536];
+int FirstOnet(unsigned long long arg1)
+{
+ if (arg1 >> 48)
+ return (first_one[arg1 >> 48]);
+ return 0;
+}
+
+
+The following code is currently generated:
+FirstOnet:
+ movl 8(%esp), %eax
+ cmpl $65536, %eax
+ movl 4(%esp), %ecx
+ jb .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ shrl $16, %eax
+ movzbl first_one(%eax), %eax
+ ret
+.LBB1_2: # UnifiedReturnBlock
+ xorl %eax, %eax
+ ret
+
+There are a few possible improvements here:
+1. We should be able to eliminate the dead load into %ecx
+2. We could change the "movl 8(%esp), %eax" into
+ "movzwl 10(%esp), %eax"; this lets us change the cmpl
+ into a testl, which is shorter, and eliminate the shift.
+
+We could also in theory eliminate the branch by using a conditional
+for the address of the load, but that seems unlikely to be worthwhile
+in general.
+
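+Improvement 2 at the source level would look something like this (hypothetical
+variant of the function above):
+
+int FirstOnet2(unsigned long long arg1)
+{
+  unsigned short hi = (unsigned short)(arg1 >> 48);  /* single movzwl, no shift */
+  return hi ? first_one[hi] : 0;                     /* testl instead of cmpl */
+}
+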
+//===---------------------------------------------------------------------===//
+
+We compile this function:
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
+entry:
+ %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
+ br i1 %tmp2, label %bb7, label %bb
+
+bb: ; preds = %entry
+ %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
+ ret i32 %tmp6
+
+bb7: ; preds = %entry
+ %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
+ ret i32 %tmp10
+}
+
+to:
+
+_foo:
+ cmpb $0, 16(%esp)
+ movl 12(%esp), %ecx
+ movl 8(%esp), %eax
+ movl 4(%esp), %edx
+ je LBB1_2 # bb7
+LBB1_1: # bb
+ addl %edx, %eax
+ ret
+LBB1_2: # bb7
+ movl %edx, %eax
+ subl %ecx, %eax
+ ret
+
+The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
+if it commuted the addl in LBB1_1.
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15: # bb310
+ cvtss2sd LCPI1_0, %xmm1
+ addsd %xmm1, %xmm0
+ movsd 176(%esp), %xmm2
+ mulsd %xmm0, %xmm2
+ movapd %xmm2, %xmm3
+ mulsd %xmm3, %xmm3
+ movapd %xmm3, %xmm4
+ mulsd LCPI1_23, %xmm4
+ addsd LCPI1_24, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_25, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_26, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_27, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_28, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd %xmm1, %xmm4
+ mulsd %xmm2, %xmm4
+ movsd 152(%esp), %xmm1
+ addsd %xmm4, %xmm1
+ movsd %xmm1, 152(%esp)
+ incl %eax
+ cmpl %eax, %esi
+ jge LBB1_15 # bb310
+LBB1_16: # bb358.loopexit
+ movsd 152(%esp), %xmm0
+ addsd %xmm0, %xmm0
+ addsd LCPI1_22, %xmm0
+ movsd %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should
+insert a copy to split the interval (one for the duration of the loop, one
+extending to the fall through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
+
+//===---------------------------------------------------------------------===//
+
+Legalize loses track of the fact that bools are always zero extended when in
+memory. This causes us to compile abort_gzip (from 164.gzip) from:
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind {
+entry:
+ %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
+ br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i: ; preds = %entry
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+bb4.i: ; preds = %entry
+ store i1 true, i1* @in_exit.4870.b
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+}
+declare void @exit(i32) noreturn nounwind
+
+into:
+
+_abort_gzip:
+ subl $12, %esp
+ movb _in_exit.4870.b, %al
+ notb %al
+ testb $1, %al
+ jne LBB1_2 ## bb4.i
+LBB1_1: ## bb.i
+ ...
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+ return x-y-1;
+}
+
+into (-m64):
+
+_test:
+ decl %edi
+ movl %edi, %eax
+ subl %esi, %eax
+ ret
+
+it would be better to codegen as x+~y (notl+addl), since ~y == -y-1 and thus
+x + ~y == x - y - 1.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+ subq $200, %rsp
+ movaps %xmm7, 160(%rsp)
+ movaps %xmm6, 144(%rsp)
+ movaps %xmm5, 128(%rsp)
+ movaps %xmm4, 112(%rsp)
+ movaps %xmm3, 96(%rsp)
+ movaps %xmm2, 80(%rsp)
+ movaps %xmm1, 64(%rsp)
+ movaps %xmm0, 48(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r8, 32(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 8(%rsp)
+ leaq (%rsp), %rax
+ movq %rax, 192(%rsp)
+ leaq 208(%rsp), %rax
+ movq %rax, 184(%rsp)
+ movl $48, 180(%rsp)
+ movl $8, 176(%rsp)
+ movl 176(%rsp), %eax
+ cmpl $47, %eax
+ jbe .LBB1_3 # bb
+.LBB1_1: # bb3
+ movq 184(%rsp), %rcx
+ leaq 8(%rcx), %rax
+ movq %rax, 184(%rsp)
+.LBB1_2: # bb4
+ movl (%rcx), %eax
+ addq $200, %rsp
+ ret
+.LBB1_3: # bb
+ movl %eax, %ecx
+ addl $8, %eax
+ addq 192(%rsp), %rcx
+ movl %eax, 176(%rsp)
+ jmp .LBB1_2 # bb4
+
+gcc 4.3 generates:
+ subq $96, %rsp
+.LCFI0:
+ leaq 104(%rsp), %rax
+ movq %rsi, -80(%rsp)
+ movl $8, -120(%rsp)
+ movq %rax, -112(%rsp)
+ leaq -88(%rsp), %rax
+ movq %rax, -104(%rsp)
+ movl $8, %eax
+ cmpl $48, %eax
+ jb .L6
+ movq -112(%rsp), %rdx
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+ .p2align 4,,10
+ .p2align 3
+.L6:
+ mov %eax, %edx
+ addq -104(%rsp), %rdx
+ addl $8, %eax
+ movl %eax, -120(%rsp)
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+
+and it gets compiled into this on x86:
+ pushl %ebp
+ movl %esp, %ebp
+ subl $4, %esp
+ leal 12(%ebp), %eax
+ movl %eax, -4(%ebp)
+ leal 16(%ebp), %eax
+ movl %eax, -4(%ebp)
+ movl 12(%ebp), %eax
+ addl $4, %esp
+ popl %ebp
+ ret
+
+gcc 4.3 generates:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 12(%ebp), %eax
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall. The expected code is something
+like the following:
+
+ movl 12(%ebp), %eax
+ movl 16(%ebp), %ecx
+ xorl %edx, %edx
+ divl %ecx
+ movl 8(%ebp), %eax
+ divl %ecx
+ movl %edx, %eax
+ ret
+
+A similar code sequence works for division.
+
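+A C sketch of the long division being performed (hypothetical helper; the
+final 64-by-32 modulo is written out only to show the invariant each divl
+relies on):
+
+unsigned rem64by32(unsigned long long x, unsigned y) {
+  unsigned hi = (unsigned)(x >> 32);
+  unsigned lo = (unsigned)x;
+  unsigned r1 = hi % y;   /* first divl: remainder of the high word, < y */
+  /* second divl: divides r1:lo by y; since r1 < y the quotient fits in
+     32 bits, so the hardware divide cannot fault */
+  return (unsigned)((((unsigned long long)r1 << 32) | lo) % y);
+}
+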
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the latter codegens to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+ return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+ return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %sum = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %normal
+normal:
+ ret i32 %sum
+overflow:
+ call void @llvm.trap()
+ unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+ movl 4(%esp), %eax
+ addl 8(%esp), %eax
+ jo LBB1_2 ## overflow
+LBB1_1: ## normal
+ ret
+LBB1_2: ## overflow
+ ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+ movl %ebx, %eax
+ imull (%edi,%ecx,4)
+ shrdl $31, %edx, %eax
+ addl %eax, (%esi,%ecx,4)
+ incl %ecx
+ cmpl $149, %ecx
+ jle .L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1: ## bb1
+ movl 24(%esp), %eax
+ movl (%eax,%edi,4), %ebx
+ movl %ebx, %ebp
+ imull %esi, %ebp
+ movl %ebx, %eax
+ mull %ecx
+ addl %ebp, %edx
+ sarl $31, %ebx
+ imull %ecx, %ebx
+ addl %edx, %ebx
+ shldl $1, %eax, %ebx
+ movl 20(%esp), %eax
+ addl %ebx, (%eax,%edi,4)
+ incl %edi
+ cmpl $150, %edi
+ jne LBB1_1 ## bb1
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also not done for
+read-modify-write instructions, or when the OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsigned
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
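+A sketch of the pattern (hypothetical function): the andl computing t already
+sets ZF, so the testl currently emitted before the branch is redundant:
+
+int has_no_common_bits(int x, int y) {
+  int t = x & y;    /* andl sets ZF according to the result */
+  if (t == 0)       /* currently also emits testl %reg, %reg */
+    return 1;
+  return t;
+}
+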
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors. GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if immediately
+ preceded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+ code contains more than 3 branches.
+
+The first one is done for all AMDs, Core2, and "Generic".
+The second one is done for Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+Core 2, and "Generic".
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
new file mode 100644
index 0000000..fd13b02
--- /dev/null
+++ b/lib/Target/X86/X86.h
@@ -0,0 +1,84 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class X86TargetMachine;
+class FunctionPass;
+class MachineCodeEmitter;
+class JITCodeEmitter;
+class raw_ostream;
+
+/// createX86ISelDag - This pass converts a legalized DAG into an
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX87FPRegKillInserterPass - This function returns a pass which
+/// inserts FP_REG_KILL instructions where needed.
+///
+FunctionPass *createX87FPRegKillInserterPass();
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.
+///
+FunctionPass *createX86CodePrinterPass(raw_ostream &o,
+ X86TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose);
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+
+FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
+ MachineCodeEmitter &MCE);
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE);
+
+/// createEmitX86CodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+/// createX86MaxStackAlignmentCalculatorPass - This function returns a pass
+/// which calculates the maximal stack alignment required for a function.
+///
+FunctionPass *createX86MaxStackAlignmentCalculatorPass();
+
+} // End llvm namespace
+
+// Defines symbolic names for X86 registers. This defines a mapping from
+// register name to register number.
+//
+#include "X86GenRegisterNames.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#include "X86GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
new file mode 100644
index 0000000..8df138d
--- /dev/null
+++ b/lib/Target/X86/X86.td
@@ -0,0 +1,184 @@
+//===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred to
+// here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+ "Enable MMX instructions">;
+def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+ "Enable SSE instructions",
+ [FeatureMMX]>;
+def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+ "Enable SSE2 instructions",
+ [FeatureSSE1]>;
+def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+ "Enable SSE3 instructions",
+ [FeatureSSE2]>;
+def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+ "Enable SSSE3 instructions",
+ [FeatureSSE3]>;
+def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+ "Enable SSE 4.1 instructions",
+ [FeatureSSSE3]>;
+def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+ "Enable SSE 4.2 instructions",
+ [FeatureSSE41]>;
+def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+ "Enable 3DNow! instructions">;
+def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+ "Enable 3DNow! Athlon instructions",
+ [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions">;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+ "Bit testing of memory is slow">;
+def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+ "Support SSE 4a instructions">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+def : Proc<"i386", []>;
+def : Proc<"i486", []>;
+def : Proc<"i586", []>;
+def : Proc<"pentium", []>;
+def : Proc<"pentium-mmx", [FeatureMMX]>;
+def : Proc<"i686", []>;
+def : Proc<"pentiumpro", []>;
+def : Proc<"pentium2", [FeatureMMX]>;
+def : Proc<"pentium3", [FeatureSSE1]>;
+def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"pentium4", [FeatureSSE2]>;
+def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>;
+
+def : Proc<"k6", [FeatureMMX]>;
+def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
+def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>;
+def : Proc<"athlon", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A,
+ Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A,
+ Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+
+def : Proc<"winchip-c6", [FeatureMMX]>;
+def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3", [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3-2", [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo {
+
+ // Define how we want to layout our TargetSpecific information field... This
+ // should be kept up-to-date with the fields in the X86InstrInfo.h file.
+ let TSFlagsFields = ["FormBits",
+ "hasOpSizePrefix",
+ "hasAdSizePrefix",
+ "Prefix",
+ "hasREX_WPrefix",
+ "ImmTypeBits",
+ "FPFormBits",
+ "hasLockPrefix",
+ "SegOvrBits",
+ "Opcode"];
+ let TSFlagsShifts = [0,
+ 6,
+ 7,
+ 8,
+ 12,
+ 13,
+ 16,
+ 19,
+ 20,
+ 24];
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel} flag.
+def ATTAsmWriter : AsmWriter {
+ string AsmWriterClassName = "ATTAsmPrinter";
+ int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+ string AsmWriterClassName = "IntelAsmPrinter";
+ int Variant = 1;
+}
+
+
+def X86 : Target {
+ // Information about the instructions...
+ let InstructionSet = X86InstrInfo;
+
+ let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/lib/Target/X86/X86COFF.h b/lib/Target/X86/X86COFF.h
new file mode 100644
index 0000000..0a8e4e6
--- /dev/null
+++ b/lib/Target/X86/X86COFF.h
@@ -0,0 +1,95 @@
+//===--- X86COFF.h - Some definitions from COFF documentation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just defines some symbols found in COFF documentation. They are
+// used to emit function type information for COFF targets (Cygwin/Mingw32).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_H
+#define X86COFF_H
+
+namespace COFF
+{
+/// Storage class tells where and what the symbol represents
+enum StorageClass {
+ C_EFCN = -1, ///< Physical end of function
+ C_NULL = 0, ///< No symbol
+ C_AUTO = 1, ///< Automatic variable
+ C_EXT = 2, ///< External symbol
+ C_STAT = 3, ///< Static
+ C_REG = 4, ///< Register variable
+ C_EXTDEF = 5, ///< External definition
+ C_LABEL = 6, ///< Label
+ C_ULABEL = 7, ///< Undefined label
+ C_MOS = 8, ///< Member of structure
+ C_ARG = 9, ///< Function argument
+ C_STRTAG = 10, ///< Structure tag
+ C_MOU = 11, ///< Member of union
+ C_UNTAG = 12, ///< Union tag
+ C_TPDEF = 13, ///< Type definition
+ C_USTATIC = 14, ///< Undefined static
+ C_ENTAG = 15, ///< Enumeration tag
+ C_MOE = 16, ///< Member of enumeration
+ C_REGPARM = 17, ///< Register parameter
+ C_FIELD = 18, ///< Bit field
+
+ C_BLOCK = 100, ///< ".bb" or ".eb" - beginning or end of block
+ C_FCN = 101, ///< ".bf" or ".ef" - beginning or end of function
+ C_EOS = 102, ///< End of structure
+ C_FILE = 103, ///< File name
+ C_LINE = 104, ///< Line number, reformatted as symbol
+ C_ALIAS = 105, ///< Duplicate tag
+ C_HIDDEN = 106 ///< External symbol in dmert public lib
+};
+
+/// The type of the symbol. This is made up of a base type and a derived type.
+/// For example, pointer to int is "pointer to T" and "int"
+enum SymbolType {
+ T_NULL = 0, ///< No type info
+ T_ARG = 1, ///< Void function argument (only used by compiler)
+ T_VOID = 1, ///< The same as above. Just named differently in some specs.
+ T_CHAR = 2, ///< Character
+ T_SHORT = 3, ///< Short integer
+ T_INT = 4, ///< Integer
+ T_LONG = 5, ///< Long integer
+ T_FLOAT = 6, ///< Floating point
+ T_DOUBLE = 7, ///< Double word
+ T_STRUCT = 8, ///< Structure
+ T_UNION = 9, ///< Union
+ T_ENUM = 10, ///< Enumeration
+ T_MOE = 11, ///< Member of enumeration
+ T_UCHAR = 12, ///< Unsigned character
+ T_USHORT = 13, ///< Unsigned short
+ T_UINT = 14, ///< Unsigned integer
+ T_ULONG = 15 ///< Unsigned long
+};
+
+/// Derived type of symbol
+enum SymbolDerivedType {
+ DT_NON = 0, ///< No derived type
+ DT_PTR = 1, ///< Pointer to T
+ DT_FCN = 2, ///< Function returning T
+ DT_ARY = 3 ///< Array of T
+};
+
+/// Masks for extracting parts of type
+enum SymbolTypeMasks {
+ N_BTMASK = 017, ///< Mask for base type
+ N_TMASK = 060 ///< Mask for derived type
+};
+
+/// Offsets of parts of type
+enum Shifts {
+ N_BTSHFT = 4 ///< Type is formed as (base + derived << N_BTSHFT)
+};
+
+}
+
+#endif // X86COFF_H
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..7f99203
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,360 @@
+//===- X86CallingConv.td - Calling Conventions X86 32/64 ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX. For i8, the ABI
+ // requires the values to be in AL and AH, however this code uses AL and DL
+ // instead. This is because using AH for the second register conflicts with
+ // the way LLVM does multiple return values -- a return of {i16,i8} would end
+ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+ // for functions that return two i8 values are currently expected to pack the
+ // values into an i16 (which uses AX, and thus AL:AH).
+ CCIfType<[i8] , CCAssignToReg<[AL, DL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+ // can only be used by ABI non-compliant code. If the target doesn't have XMM
+ // registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToReg<[MM0]>>,
+
+ // Long double types are always returned in ST0 (even with SSE).
+ CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in ST0, unless marked
+ // with "inreg" (used here to distinguish one kind of reg from another,
+ // weirdly; this is really the sse-regparm calling convention) in which
+ // case they use XMM0, otherwise it is the same as the common X86 calling
+ // conv.
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
+ CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+ CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+ // SSE2, otherwise it is the same as the C calling convention.
+ // This can happen when a float, 2 x float, or 3 x float vector is split by
+ // target lowering, and is returned in 1-3 sse regs.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+ // The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+
+ // MMX vector types are always returned in XMM0 except for v1i64 which is
+ // returned in RAX. This disagrees with ABI documentation but is bug
+ // compatible with gcc.
+ CCIfType<[v1i64], CCAssignToReg<[RAX]>>,
+ CCIfType<[v8i8, v4i16, v2i32, v2f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+ // The X86-Win64 calling convention always returns __m64 values in RAX.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[RAX]>>,
+
+ // And FP in XMM0 only.
+ CCIfType<[f32], CCAssignToReg<[XMM0]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0]>>,
+
+ // Otherwise, everything is the same as 'normal' X86-64 C CC.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+ CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 MMX (except for v1i64) vector arguments are passed in XMM
+ // registers on Darwin.
+ CCIfType<[v8i8, v4i16, v2i32, v2f32],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>>,
+
+ // The first 8 v1i64 vector arguments are passed in GPRs on Darwin.
+ CCIfType<[v1i64],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depend on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToStack<8, 8>>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+ // FIXME: Handle byval stuff.
+ // FIXME: Handle varargs.
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The first 4 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+ [RCX , RDX , R8 , R9 ]>>,
+
+ // The first 4 MMX vector arguments are passed in GPRs.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32],
+ CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 16-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 16>>,
+
+ // Long doubles get stack slots whose size and alignment depend on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 16-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 16>>
+]>;
+
+// Tail call convention (fast): One register is reserved for target address,
+// namely R9
+def CC_X86_64_TailCall : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 MMX (except for v1i64) vector arguments are passed in XMM
+ // registers on Darwin.
+ CCIfType<[v8i8, v4i16, v2i32, v2f32],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 v1i64 vector arguments are passed in GPRs on Darwin.
+ CCIfType<[v1i64],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // The first 3 float or double arguments, if marked 'inreg' and if the call
+ // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+  // The first 3 __m64 (except for v1i64) vector arguments are passed in MMX
+ // registers if the call is not a vararg call.
+ CCIfNotVarArg<CCIfType<[v8i8, v4i16, v2i32, v2f32],
+ CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Doubles get 8-byte slots that are 4-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+ // Long doubles get slots whose size depends on the subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+ // The first 4 SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+ // passed in the parameter area.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 4>>]>;
+
+def CC_X86_32_C : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
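+
+// Illustrative example (hypothetical call): under CC_X86_32_FastCall,
+// f(i32 a, i16 b, i32 c) promotes b to i32, assigns a -> ECX and b -> EDX,
+// and passes c in a 4-byte stack slot via the delegation to
+// CC_X86_32_Common.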
+
+def CC_X86_32_FastCC : CallingConv<[
+ // Handles byval parameters. Note that we can't rely on the delegation
+ // to CC_X86_32_Common for this because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // The first 3 float or double arguments, if the call is not a vararg
+ // call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+ // Doubles get 8-byte slots that are 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..e988a5c
--- /dev/null
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,811 @@
+//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "X86Relocations.h"
+#include "X86.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+template<class CodeEmitter>
+ class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass {
+ const X86InstrInfo *II;
+ const TargetData *TD;
+ X86TargetMachine &TM;
+ CodeEmitter &MCE;
+ intptr_t PICBaseOffset;
+ bool Is64BitMode;
+ bool IsPIC;
+ public:
+ static char ID;
+ explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
+ : MachineFunctionPass(&ID), II(0), TD(0), TM(tm),
+ MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+ Emitter(X86TargetMachine &tm, CodeEmitter &mce,
+ const X86InstrInfo &ii, const TargetData &td, bool is64)
+ : MachineFunctionPass(&ID), II(&ii), TD(&td), TM(tm),
+ MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "X86 Machine Code Emitter";
+ }
+
+ void emitInstruction(const MachineInstr &MI,
+ const TargetInstrDesc *Desc);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+ void emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+ intptr_t Disp = 0, intptr_t PCAdj = 0,
+ bool NeedStub = false, bool Indirect = false);
+ void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+ void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0,
+ intptr_t PCAdj = 0);
+ void emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj = 0);
+
+ void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+ intptr_t PCAdj = 0);
+
+ void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+ void emitRegModRMByte(unsigned RegOpcodeField);
+ void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+ void emitConstant(uint64_t Val, unsigned Size);
+
+ void emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op, unsigned RegOpcodeField,
+ intptr_t PCAdj = 0);
+
+ unsigned getX86RegNum(unsigned RegNo) const;
+
+ bool gvNeedsNonLazyPtr(const GlobalValue *GV);
+ };
+
+template<class CodeEmitter>
+ char Emitter<CodeEmitter>::ID = 0;
+}
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified templated MachineCodeEmitter object.
+
+namespace llvm {
+
+FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
+ MachineCodeEmitter &MCE) {
+ return new Emitter<MachineCodeEmitter>(TM, MCE);
+}
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+} // end namespace llvm
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+
+ MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>());
+
+ II = TM.getInstrInfo();
+ TD = TM.getTargetData();
+ Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
+ IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+
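+  // MCE.finishFunction() returns true when emission must be retried (e.g.
+  // the memory buffer overflowed), so the whole function is re-emitted
+  // from scratch in that case; hence the do/while loop.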
+ do {
+ DOUT << "JITTing function '" << MF.getFunction()->getName() << "'\n";
+ MCE.startFunction(MF);
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ MCE.StartMachineBasicBlock(MBB);
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ const TargetInstrDesc &Desc = I->getDesc();
+ emitInstruction(*I, &Desc);
+ // MOVPC32r is basically a call plus a pop instruction.
+ if (Desc.getOpcode() == X86::MOVPC32r)
+ emitInstruction(*I, &II->get(X86::POP32r));
+ NumEmitted++; // Keep track of the # of mi's emitted
+ }
+ }
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+  // Remember where this reference is and which block it refers to, so we
+  // can resolve it later.
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ X86::reloc_pcrel_word, MBB));
+ MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream assuming
+/// this is part of a "take the address of a global" instruction.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */,
+ bool NeedStub /* = false */,
+ bool Indirect /* = false */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MachineRelocation MR = Indirect
+ ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+ GV, RelocCST, NeedStub)
+ : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ GV, RelocCST, NeedStub);
+ MCE.addRelocation(MR);
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+ unsigned Reloc) {
+ intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES, RelocCST));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, JTI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getX86RegNum(unsigned RegNo) const {
+ return II->getRegisterInfo().getX86RegNum(RegNo);
+}
+
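+// ModRMByte - Pack the ModR/M byte fields. Worked example (illustrative):
+// ModRMByte(3, 2, 5) = 5 | (2 << 3) | (3 << 6) = 0xD5, i.e. register-direct
+// mode (Mod=3) with /2 in the reg/opcode field and register number 5 in R/M.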
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
+ unsigned RegOpcodeFld){
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+ unsigned Index,
+ unsigned Base) {
+ // SIB byte is in the same format as the ModRMByte...
+ MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
+ // Output the constant in little endian byte order...
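+  // (e.g. emitConstant(0x12345678, 4) emits 78 56 34 12.)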
+ for (unsigned i = 0; i != Size; ++i) {
+ MCE.emitByte(Val & 255);
+ Val >>= 8;
+ }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
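+/// (e.g. isDisp8(-128) and isDisp8(127) are true; isDisp8(128) is false).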
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::gvNeedsNonLazyPtr(const GlobalValue *GV) {
+ // For Darwin, simulate the linktime GOT by using the same non-lazy-pointer
+ // mechanism as 32-bit mode.
+ return (!Is64BitMode || TM.getSubtarget<X86Subtarget>().isTargetDarwin()) &&
+ TM.getSubtarget<X86Subtarget>().GVRequiresExtraLoad(GV, TM, false);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
+ int DispVal, intptr_t PCAdj) {
+ // If this is a simple integer displacement that doesn't require a relocation,
+ // emit it now.
+ if (!RelocOp) {
+ emitConstant(DispVal, 4);
+ return;
+ }
+
+ // Otherwise, this is something that requires a relocation. Emit it as such
+ // now.
+ if (RelocOp->isGlobal()) {
+ // In 64-bit static small code model, we could potentially emit absolute.
+ // But it's probably not beneficial.
+ // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
+ // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ bool NeedStub = isa<Function>(RelocOp->getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(RelocOp->getGlobal());
+ emitGlobalAddress(RelocOp->getGlobal(), rt, RelocOp->getOffset(),
+ PCAdj, NeedStub, Indirect);
+ } else if (RelocOp->isCPI()) {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_picrel_word;
+ emitConstPoolAddress(RelocOp->getIndex(), rt,
+ RelocOp->getOffset(), PCAdj);
+ } else if (RelocOp->isJTI()) {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_picrel_word;
+ emitJumpTableAddress(RelocOp->getIndex(), rt, PCAdj);
+ } else {
+ assert(0 && "Unknown value to relocate!");
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op, unsigned RegOpcodeField,
+ intptr_t PCAdj) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+
+ // Figure out what sort of displacement we have to handle here.
+ if (Op3.isGlobal()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isCPI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
+ DispVal += Op3.getOffset();
+ }
+ } else if (Op3.isJTI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
+ }
+ } else {
+ DispVal = Op3.getImm();
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &Scale = MI.getOperand(Op+1);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Is a SIB byte needed?
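+  // For example, [EAX+disp8] needs no SIB byte, while [EAX+ECX*2], a base
+  // of ESP, or any use of an index register does.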
+ if ((!Is64BitMode || DispForReloc || BaseReg != 0) &&
+ IndexReg.getReg() == 0 &&
+ (BaseReg == 0 || getX86RegNum(BaseReg) != N86::ESP)) {
+ if (BaseReg == 0) { // Just a displacement?
+ // Emit special case [disp32] encoding
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ } else {
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+ // Emit simple indirect register encoding... [EAX] f.e.
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+ } else if (!DispForReloc && isDisp8(DispVal)) {
+ // Emit the disp8 encoding... [REG+disp8]
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+ emitConstant(DispVal, 1);
+ } else {
+ // Emit the most general non-SIB encoding: [REG+disp32]
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ }
+ }
+
+ } else { // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispForReloc) {
+ // Emit the normal disp32 encoding.
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispVal == 0 && getX86RegNum(BaseReg) != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ } else if (isDisp8(DispVal)) {
+ // Emit the disp8 encoding...
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else {
+ // Emit the normal disp32 encoding...
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ }
+
+ // Calculate what the SS field value should be...
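+    // SS encodes log2(scale): scales 1, 2, 4, 8 map to SS 0, 1, 2, 3; the
+    // ~0 entries mark scales that cannot be encoded.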
+ static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
+
+ if (BaseReg == 0) {
+      // Handle the SIB byte for the case where there is no base. The
+      // displacement is emitted afterwards (ForceDisp32 was set above).
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ emitSIBByte(SS, IndexRegNo, 5);
+ } else {
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ emitSIBByte(SS, IndexRegNo, BaseRegNo);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8) {
+ emitConstant(DispVal, 1);
+ } else if (DispVal != 0 || ForceDisp32) {
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ }
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(
+ const MachineInstr &MI,
+ const TargetInstrDesc *Desc) {
+ DOUT << MI;
+
+ unsigned Opcode = Desc->Opcode;
+
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK) MCE.emitByte(0xF0);
+
+ // Emit segment override opcode prefix as needed.
+ switch (Desc->TSFlags & X86II::SegOvrMask) {
+ case X86II::FS:
+ MCE.emitByte(0x64);
+ break;
+ case X86II::GS:
+ MCE.emitByte(0x65);
+ break;
+ default: assert(0 && "Invalid segment!");
+ case 0: break; // No segment override!
+ }
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3);
+
+ // Emit the operand size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::OpSize) MCE.emitByte(0x66);
+
+ // Emit the address size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::AdSize) MCE.emitByte(0x67);
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::XS: // F3 0F
+ MCE.emitByte(0xF3);
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ MCE.emitByte(0xD8+
+ (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+ >> X86II::Op0Shift));
+ break; // Two-byte opcode prefix
+ default: assert(0 && "Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ if (Is64BitMode) {
+ // REX prefix
+ unsigned REX = X86InstrInfo::determineREX(MI);
+ if (REX)
+ MCE.emitByte(0x40 | REX);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ MCE.emitByte(0x0F);
+
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::T8: // 0F 38
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TA: // 0F 3A
+ MCE.emitByte(0x3A);
+ break;
+ }
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+ ++CurOp;
+ else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+ // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+ --NumOps;
+
+ unsigned char BaseOpcode = II->getBaseOpcodeFor(Desc);
+ switch (Desc->TSFlags & X86II::FormMask) {
+ default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+ case X86II::Pseudo:
+ // Remember the current PC offset, this is the PIC relocation
+ // base address.
+ switch (Opcode) {
+ default:
+ assert(0 && "psuedo instructions should be removed before code emission");
+ break;
+ case TargetInstrInfo::INLINEASM: {
+ // We allow inline assembler nodes with empty bodies - they can
+ // implicitly define registers, which is ok for JIT.
+ if (MI.getOperand(0).getSymbolName()[0]) {
+ assert(0 && "JIT does not support inline asm!\n");
+ abort();
+ }
+ break;
+ }
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ MCE.emitLabel(MI.getOperand(0).getImm());
+ break;
+ case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::DECLARE:
+ case X86::DWARF_LOC:
+ case X86::FP_REG_KILL:
+ break;
+ case X86::MOVPC32r: {
+ // This emits the "call" portion of this pseudo instruction.
+ MCE.emitByte(BaseOpcode);
+ emitConstant(0, X86InstrInfo::sizeOfImm(Desc));
+ // Remember PIC base.
+ PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset();
+ X86JITInfo *JTI = TM.getJITInfo();
+ JTI->setPICBase(MCE.getCurrentPCValue());
+ break;
+ }
+ }
+ CurOp = NumOps;
+ break;
+ case X86II::RawFrm:
+ MCE.emitByte(BaseOpcode);
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+
+ DOUT << "RawFrm CurOp " << CurOp << "\n";
+ DOUT << "isMBB " << MO.isMBB() << "\n";
+ DOUT << "isGlobal " << MO.isGlobal() << "\n";
+ DOUT << "isSymbol " << MO.isSymbol() << "\n";
+ DOUT << "isImm " << MO.isImm() << "\n";
+
+ if (MO.isMBB()) {
+ emitPCRelativeBlockAddress(MO.getMBB());
+ } else if (MO.isGlobal()) {
+ // Assume undefined functions may be outside the Small codespace.
+ bool NeedStub =
+ (Is64BitMode &&
+ (TM.getCodeModel() == CodeModel::Large ||
+ TM.getSubtarget<X86Subtarget>().isTargetDarwin())) ||
+ Opcode == X86::TAILJMPd;
+ emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
+ MO.getOffset(), 0, NeedStub);
+ } else if (MO.isSymbol()) {
+ emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+ } else if (MO.isImm()) {
+ if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
+ // Fix up immediate operand for pc relative calls.
+ intptr_t Imm = (intptr_t)MO.getImm();
+ Imm = Imm - MCE.getCurrentPCValue() - 4;
+ emitConstant(Imm, X86InstrInfo::sizeOfImm(Desc));
+ } else
+ emitConstant(MO.getImm(), X86InstrInfo::sizeOfImm(Desc));
+ } else {
+ assert(0 && "Unknown RawFrm operand!");
+ }
+ }
+ break;
+
+ case X86II::AddRegFrm:
+ MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm())
+ emitConstant(MO1.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ // This should not occur on Darwin for relocatable objects.
+ if (Opcode == X86::MOV64ri)
+ rt = X86::reloc_absolute_dword; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool NeedStub = isa<Function>(MO1.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO1.getGlobal());
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
+ }
+ }
+ break;
+
+ case X86II::MRMDestReg: {
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+ CurOp += 2;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(), X86InstrInfo::sizeOfImm(Desc));
+ break;
+ }
+ case X86II::MRMDestMem: {
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp,
+ getX86RegNum(MI.getOperand(CurOp + X86AddrNumOperands)
+ .getReg()));
+ CurOp += X86AddrNumOperands + 1;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(), X86InstrInfo::sizeOfImm(Desc));
+ break;
+ }
+
+ case X86II::MRMSrcReg:
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ CurOp += 2;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86InstrInfo::sizeOfImm(Desc));
+ break;
+
+ case X86II::MRMSrcMem: {
+ // FIXME: Maybe lea should have its own form?
+ int AddrOperands;
+ if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ AddrOperands = X86AddrNumOperands - 1; // No segment register
+ else
+ AddrOperands = X86AddrNumOperands;
+
+ intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
+ X86InstrInfo::sizeOfImm(Desc) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
+ PCAdj);
+ CurOp += AddrOperands + 1;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86InstrInfo::sizeOfImm(Desc));
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r: {
+ MCE.emitByte(BaseOpcode);
+
+ // Special handling of lfence, mfence, monitor, and mwait.
+ if (Desc->getOpcode() == X86::LFENCE ||
+ Desc->getOpcode() == X86::MFENCE ||
+ Desc->getOpcode() == X86::MONITOR ||
+ Desc->getOpcode() == X86::MWAIT) {
+ emitRegModRMByte((Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+ switch (Desc->getOpcode()) {
+ default: break;
+ case X86::MONITOR:
+ MCE.emitByte(0xC8);
+ break;
+ case X86::MWAIT:
+ MCE.emitByte(0xC9);
+ break;
+ }
+ } else {
+ emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+ (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+ }
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm())
+ emitConstant(MO1.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64ri32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool NeedStub = isa<Function>(MO1.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO1.getGlobal());
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
+ }
+ }
+ break;
+ }
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ intptr_t PCAdj = (CurOp + X86AddrNumOperands != NumOps) ?
+ (MI.getOperand(CurOp+X86AddrNumOperands).isImm() ?
+ X86InstrInfo::sizeOfImm(Desc) : 4) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+ PCAdj);
+ CurOp += X86AddrNumOperands;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO.isImm())
+ emitConstant(MO.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64mi32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ if (MO.isGlobal()) {
+ bool NeedStub = isa<Function>(MO.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO.getGlobal());
+ emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO.isSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), rt);
+ else if (MO.isCPI())
+ emitConstPoolAddress(MO.getIndex(), rt);
+ else if (MO.isJTI())
+ emitJumpTableAddress(MO.getIndex(), rt);
+ }
+ }
+ break;
+ }
+
+ case X86II::MRMInitReg:
+ MCE.emitByte(BaseOpcode);
+ // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ ++CurOp;
+ break;
+ }
+
+ if (!Desc->isVariadic() && CurOp != NumOps) {
+ cerr << "Cannot encode: ";
+ MI.dump();
+ cerr << '\n';
+ abort();
+ }
+}
+
diff --git a/lib/Target/X86/X86CompilationCallback_Win64.asm b/lib/Target/X86/X86CompilationCallback_Win64.asm
new file mode 100644
index 0000000..8002f98
--- /dev/null
+++ b/lib/Target/X86/X86CompilationCallback_Win64.asm
@@ -0,0 +1,67 @@
+;;===-- X86CompilationCallback_Win64.asm - Implement Win64 JIT callback ---===
+;;
+;; The LLVM Compiler Infrastructure
+;;
+;; This file is distributed under the University of Illinois Open Source
+;; License. See LICENSE.TXT for details.
+;;
+;;===----------------------------------------------------------------------===
+;;
+;; This file implements the JIT interfaces for the X86 target.
+;;
+;;===----------------------------------------------------------------------===
+
+extrn X86CompilationCallback2: PROC
+
+.code
+X86CompilationCallback proc
+ push rbp
+
+ ; Save RSP
+ mov rbp, rsp
+
+ ; Save all int arg registers
+ push rcx
+ push rdx
+ push r8
+ push r9
+
+ ; Align stack on 16-byte boundary.
+ and rsp, -16
+
+ ; Save all XMM arg registers
+ sub rsp, 64
+ movaps [rsp], xmm0
+ movaps [rsp+16], xmm1
+ movaps [rsp+32], xmm2
+ movaps [rsp+48], xmm3
+
+ ; JIT callee
+
+ ; Pass prev frame and return address
+ mov rcx, rbp
+ mov rdx, qword ptr [rbp+8]
+ call X86CompilationCallback2
+
+ ; Restore all XMM arg registers
+ movaps xmm3, [rsp+48]
+ movaps xmm2, [rsp+32]
+ movaps xmm1, [rsp+16]
+ movaps xmm0, [rsp]
+
+ ; Restore RSP
+ mov rsp, rbp
+
+ ; Restore all int arg registers
+ sub rsp, 32
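+    ; (RBP was captured above the four 8-byte pushes, so step RSP back down
+    ;  to the saved registers before popping them.)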
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+
+ ; Restore RBP
+ pop rbp
+ ret
+X86CompilationCallback endp
+
+End
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
new file mode 100644
index 0000000..4c3cc82
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -0,0 +1,18 @@
+//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ELFWriterInfo.h"
+using namespace llvm;
+
+X86ELFWriterInfo::X86ELFWriterInfo() : TargetELFWriterInfo(EM_386) {}
+X86ELFWriterInfo::~X86ELFWriterInfo() {}
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
new file mode 100644
index 0000000..06e051a
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.h
@@ -0,0 +1,29 @@
+//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ELF_WRITER_INFO_H
+#define X86_ELF_WRITER_INFO_H
+
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+
+ class X86ELFWriterInfo : public TargetELFWriterInfo {
+ public:
+ X86ELFWriterInfo();
+ virtual ~X86ELFWriterInfo();
+ };
+
+} // end llvm namespace
+
+#endif // X86_ELF_WRITER_INFO_H
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 0000000..b3667be
--- /dev/null
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,1549 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel : public FastISel {
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// StackPtr - Register used as the stack pointer.
+ ///
+ unsigned StackPtr;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf64;
+ bool X86ScalarSSEf32;
+
+public:
+ explicit X86FastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi,
+ DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &cil
+#endif
+ )
+ : FastISel(mf, mmi, dw, vm, bm, am
+#ifndef NDEBUG
+ , cil
+#endif
+ ) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ }
+
+ virtual bool TargetSelectInstruction(Instruction *I);
+
+#include "X86GenFastISel.inc"
+
+private:
+ bool X86FastEmitCompare(Value *LHS, Value *RHS, MVT VT);
+
+ bool X86FastEmitLoad(MVT VT, const X86AddressMode &AM, unsigned &RR);
+
+ bool X86FastEmitStore(MVT VT, Value *Val,
+ const X86AddressMode &AM);
+ bool X86FastEmitStore(MVT VT, unsigned Val,
+ const X86AddressMode &AM);
+
+ bool X86FastEmitExtend(ISD::NodeType Opc, MVT DstVT, unsigned Src, MVT SrcVT,
+ unsigned &ResultReg);
+
+ bool X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall);
+
+ bool X86SelectLoad(Instruction *I);
+
+ bool X86SelectStore(Instruction *I);
+
+ bool X86SelectCmp(Instruction *I);
+
+ bool X86SelectZExt(Instruction *I);
+
+ bool X86SelectBranch(Instruction *I);
+
+ bool X86SelectShift(Instruction *I);
+
+ bool X86SelectSelect(Instruction *I);
+
+ bool X86SelectTrunc(Instruction *I);
+
+ bool X86SelectFPExt(Instruction *I);
+ bool X86SelectFPTrunc(Instruction *I);
+
+ bool X86SelectExtractValue(Instruction *I);
+
+ bool X86VisitIntrinsicCall(IntrinsicInst &I);
+ bool X86SelectCall(Instruction *I);
+
+ CCAssignFn *CCAssignFnForCall(unsigned CC, bool isTailCall = false);
+
+ const X86InstrInfo *getInstrInfo() const {
+ return getTargetMachine()->getInstrInfo();
+ }
+ const X86TargetMachine *getTargetMachine() const {
+ return static_cast<const X86TargetMachine *>(&TM);
+ }
+
+ unsigned TargetMaterializeConstant(Constant *C);
+
+ unsigned TargetMaterializeAlloca(AllocaInst *C);
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(MVT VT) const {
+    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 when SSE2 is available
+           (VT == MVT::f32 && X86ScalarSSEf32);   // f32 when SSE1 is available
+ }
+
+ bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false);
+};
+
+} // end anonymous namespace.
+
+bool X86FastISel::isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1) {
+ VT = TLI.getValueType(Ty, /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !VT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // For now, require SSE/SSE2 for performing floating-point operations,
+ // since x87 requires additional work.
+ if (VT == MVT::f64 && !X86ScalarSSEf64)
+ return false;
+ if (VT == MVT::f32 && !X86ScalarSSEf32)
+ return false;
+ // Similarly, no f80 support yet.
+ if (VT == MVT::f80)
+ return false;
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// CCAssignFnForCall - Selects the correct CCAssignFn for a given calling
+/// convention.
+CCAssignFn *X86FastISel::CCAssignFnForCall(unsigned CC, bool isTailCall) {
+ if (Subtarget->is64Bit()) {
+ if (Subtarget->isTargetWin64())
+ return CC_X86_Win64_C;
+    else if (CC == CallingConv::Fast && isTailCall)
+ return CC_X86_64_TailCall;
+ else
+ return CC_X86_64_C;
+ }
+
+ if (CC == CallingConv::X86_FastCall)
+ return CC_X86_32_FastCall;
+ else if (CC == CallingConv::Fast)
+ return CC_X86_32_FastCC;
+ else
+ return CC_X86_32_C;
+}
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(MVT VT, const X86AddressMode &AM,
+ unsigned &ResultReg) {
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.getSimpleVT()) {
+ default: return false;
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = X86::GR8RegisterClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = X86::GR16RegisterClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+ break;
+ case MVT::f32:
+ if (Subtarget->hasSSE1()) {
+ Opc = X86::MOVSSrm;
+ RC = X86::FR32RegisterClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = X86::RFP32RegisterClass;
+ }
+ break;
+ case MVT::f64:
+ if (Subtarget->hasSSE2()) {
+ Opc = X86::MOVSDrm;
+ RC = X86::FR64RegisterClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = X86::RFP64RegisterClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ }
+
+ ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is either pre-computed, consisting of a base ptr Ptr
+/// and a displacement offset, or a GlobalAddress, i.e. V. Return true if it
+/// is possible.
+bool
+X86FastISel::X86FastEmitStore(MVT VT, unsigned Val,
+ const X86AddressMode &AM) {
+ // Get opcode and regclass of the output for the given store instruction.
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT()) {
+ case MVT::f80: // No f80 support yet.
+ default: return false;
+ case MVT::i8: Opc = X86::MOV8mr; break;
+ case MVT::i16: Opc = X86::MOV16mr; break;
+ case MVT::i32: Opc = X86::MOV32mr; break;
+ case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+ case MVT::f32:
+ Opc = Subtarget->hasSSE1() ? X86::MOVSSmr : X86::ST_Fp32m;
+ break;
+ case MVT::f64:
+ Opc = Subtarget->hasSSE2() ? X86::MOVSDmr : X86::ST_Fp64m;
+ break;
+ }
+
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM).addReg(Val);
+ return true;
+}
+
+bool X86FastISel::X86FastEmitStore(MVT VT, Value *Val,
+ const X86AddressMode &AM) {
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Val))
+ Val = Constant::getNullValue(TD.getIntPtrType());
+
+ // If this is a store of a simple constant, fold the constant into the store.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT()) {
+ default: break;
+ case MVT::i8: Opc = X86::MOV8mi; break;
+ case MVT::i16: Opc = X86::MOV16mi; break;
+ case MVT::i32: Opc = X86::MOV32mi; break;
+ case MVT::i64:
+ // Must be a 32-bit sign extended value.
+ if ((int)CI->getSExtValue() == CI->getSExtValue())
+ Opc = X86::MOV64mi32;
+ break;
+ }
+
+ if (Opc) {
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM)
+ .addImm(CI->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned ValReg = getRegForValue(Val);
+ if (ValReg == 0)
+ return false;
+
+ return X86FastEmitStore(VT, ValReg, AM);
+}
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, MVT DstVT,
+ unsigned Src, MVT SrcVT,
+ unsigned &ResultReg) {
+ unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
+
+ if (RR != 0) {
+ ResultReg = RR;
+ return true;
+ } else
+ return false;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
+ User *U;
+ unsigned Opcode = Instruction::UserOp1;
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Opcode = I->getOpcode();
+ U = I;
+ } else if (ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+ break;
+
+ case Instruction::Alloca: {
+ if (isCall) break;
+ // Do static allocas.
+ const AllocaInst *A = cast<AllocaInst>(V);
+ DenseMap<const AllocaInst*, int>::iterator SI = StaticAllocaMap.find(A);
+ if (SI != StaticAllocaMap.end()) {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = SI->second;
+ return true;
+ }
+ break;
+ }
+
+ case Instruction::Add: {
+ if (isCall) break;
+ // Adds of constants are common and easy enough.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+ // They have to fit in the 32-bit signed displacement field though.
+ if (isInt32(Disp)) {
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+ }
+ }
+ break;
+ }
+
+ case Instruction::GetElementPtr: {
+ if (isCall) break;
+ // Pattern-match simple GEPs.
+ uint64_t Disp = (int32_t)AM.Disp;
+ unsigned IndexReg = AM.IndexReg;
+ unsigned Scale = AM.Scale;
+ gep_type_iterator GTI = gep_type_begin(U);
+ // Iterate through the indices, folding what we can. Constants can be
+ // folded, and one dynamic index can be handled, if the scale is supported.
+ for (User::op_iterator i = U->op_begin() + 1, e = U->op_end();
+ i != e; ++i, ++GTI) {
+ Value *Op = *i;
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = TD.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ Disp += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ Disp += CI->getSExtValue() * S;
+ } else if (IndexReg == 0 &&
+ (!AM.GV ||
+ !getTargetMachine()->symbolicAddressesAreRIPRel()) &&
+ (S == 1 || S == 2 || S == 4 || S == 8)) {
+ // Scaled-index addressing.
+ Scale = S;
+ IndexReg = getRegForGEPIndex(Op);
+ if (IndexReg == 0)
+ return false;
+ } else
+ // Unsupported.
+ goto unsupported_gep;
+ }
+ }
+ // Check for displacement overflow.
+ if (!isInt32(Disp))
+ break;
+ // Ok, the GEP indices were covered by constant-offset and scaled-index
+ // addressing. Update the address state and move on to examining the base.
+ AM.IndexReg = IndexReg;
+ AM.Scale = Scale;
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+ unsupported_gep:
+ // Ok, the GEP indices weren't all covered.
+ break;
+ }
+ }
+
+ // Handle constant address.
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Default &&
+ TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (getTargetMachine()->symbolicAddressesAreRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle TLS yet.
+ if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Set up the basic address.
+ AM.GV = GV;
+ if (!isCall &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->is64Bit())
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(&MF);
+
+ // Emit an extra load if the ABI requires it.
+ if (Subtarget->GVRequiresExtraLoad(GV, TM, isCall)) {
+ // Check to see if we've already materialized this
+ // value in a register in this block.
+ if (unsigned Reg = LocalValueMap[V]) {
+ AM.Base.Reg = Reg;
+ AM.GV = 0;
+ return true;
+ }
+ // Issue load from stub if necessary.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (TLI.getPointerTy() == MVT::i32) {
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ } else {
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+ }
+
+ X86AddressMode StubAM;
+ StubAM.Base.Reg = AM.Base.Reg;
+ StubAM.GV = AM.GV;
+ unsigned ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), StubAM);
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = ResultReg;
+ AM.GV = 0;
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = AM.Base.Reg;
+ }
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !getTargetMachine()->symbolicAddressesAreRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(Instruction *I) {
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(I->getOperand(1), AM, false))
+ return false;
+
+ return X86FastEmitStore(VT, I->getOperand(0), AM);
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(Instruction *I) {
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(I->getOperand(0), AM, false))
+ return false;
+
+ unsigned ResultReg = 0;
+ if (X86FastEmitLoad(VT, AM, ResultReg)) {
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ return false;
+}
+
+static unsigned X86ChooseCmpOpcode(MVT VT) {
+ switch (VT.getSimpleVT()) {
+ default: return 0;
+ case MVT::i8: return X86::CMP8rr;
+ case MVT::i16: return X86::CMP16rr;
+ case MVT::i32: return X86::CMP32rr;
+ case MVT::i64: return X86::CMP64rr;
+ case MVT::f32: return X86::UCOMISSrr;
+ case MVT::f64: return X86::UCOMISDrr;
+ }
+}
+
+/// X86ChooseCmpImmediateOpcode - If the constant RHSC, used as the RHS of a
+/// comparison, can be folded into the compare as an immediate, return an
+/// opcode that works for it (e.g. CMP32ri); otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(MVT VT, ConstantInt *RHSC) {
+ switch (VT.getSimpleVT()) {
+ // Otherwise, we can't fold the immediate into this comparison.
+ default: return 0;
+ case MVT::i8: return X86::CMP8ri;
+ case MVT::i16: return X86::CMP16ri;
+ case MVT::i32: return X86::CMP32ri;
+ case MVT::i64:
+ // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+ // field.
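+    // For example, -1 and 2147483647 fold into CMP64ri32; 2147483648 does
+    // not, so the caller falls back to a register-register compare.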
+ if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
+ return X86::CMP64ri32;
+ return 0;
+ }
+}
+
+bool X86FastISel::X86FastEmitCompare(Value *Op0, Value *Op1, MVT VT) {
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (Op0Reg == 0) return false;
+
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Op1))
+ Op1 = Constant::getNullValue(TD.getIntPtrType());
+
+ // We have two options: compare with register or immediate. If the RHS of
+ // the compare is an immediate that we can fold into this compare, use
+ // CMPri, otherwise use CMPrr.
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+ BuildMI(MBB, DL, TII.get(CompareImmOpc)).addReg(Op0Reg)
+ .addImm(Op1C->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned CompareOpc = X86ChooseCmpOpcode(VT);
+ if (CompareOpc == 0) return false;
+
+ unsigned Op1Reg = getRegForValue(Op1);
+ if (Op1Reg == 0) return false;
+ BuildMI(MBB, DL, TII.get(CompareOpc)).addReg(Op0Reg).addReg(Op1Reg);
+
+ return true;
+}
+
+bool X86FastISel::X86SelectCmp(Instruction *I) {
+ CmpInst *CI = cast<CmpInst>(I);
+
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ unsigned ResultReg = createResultReg(&X86::GR8RegClass);
+ unsigned SetCCOpc;
+ bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
+ switch (CI->getPredicate()) {
+ case CmpInst::FCMP_OEQ: {
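+    // OEQ means "ordered and equal". UCOMISS/UCOMISD set ZF=1 for equal
+    // operands but also for unordered ones (which additionally set PF=1),
+    // so the result is computed as SETE AND SETNP.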
+ if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ return false;
+
+ unsigned EReg = createResultReg(&X86::GR8RegClass);
+ unsigned NPReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(MBB, DL, TII.get(X86::SETEr), EReg);
+ BuildMI(MBB, DL, TII.get(X86::SETNPr), NPReg);
+ BuildMI(MBB, DL,
+ TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ case CmpInst::FCMP_UNE: {
+ if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ return false;
+
+ unsigned NEReg = createResultReg(&X86::GR8RegClass);
+ unsigned PReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(MBB, DL, TII.get(X86::SETNEr), NEReg);
+ BuildMI(MBB, DL, TII.get(X86::SETPr), PReg);
+ BuildMI(MBB, DL, TII.get(X86::OR8rr), ResultReg).addReg(PReg).addReg(NEReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
+ case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break;
+ case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+ case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break;
+ case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break;
+ case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
+ case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break;
+ case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break;
+ case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
+ case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+
+ case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
+ case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+ case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
+ case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
+ case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+ case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break;
+ case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break;
+ case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break;
+ case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break;
+ default:
+ return false;
+ }
+
+ Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if (SwapArgs)
+ std::swap(Op0, Op1);
+
+ // Emit a compare of Op0/Op1.
+ if (!X86FastEmitCompare(Op0, Op1, VT))
+ return false;
+
+ BuildMI(MBB, DL, TII.get(SetCCOpc), ResultReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectZExt(Instruction *I) {
+ // Handle zero-extension from i1 to i8, which is common.
+ if (I->getType() == Type::Int8Ty &&
+ I->getOperand(0)->getType() == Type::Int1Ty) {
+ unsigned ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0) return false;
+ // Set the high bits to zero.
+ ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg);
+ if (ResultReg == 0) return false;
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+
+ return false;
+}
+
+
+bool X86FastISel::X86SelectBranch(Instruction *I) {
+ // Unconditional branches are selected by tablegen-generated code.
+ // Handle a conditional branch.
+ BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TrueMBB = MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FalseMBB = MBBMap[BI->getSuccessor(1)];
+
+ // Fold the common case of a conditional branch with a comparison.
+ if (CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse()) {
+ MVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+
+ // Try to take advantage of fallthrough opportunities.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
+ unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA"
+
+ switch (Predicate) {
+ case CmpInst::FCMP_OEQ:
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::FCMP_UNE;
+ // FALL THROUGH
+ case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE; break;
+ case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA; break;
+ case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE; break;
+ case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA; break;
+ case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE; break;
+ case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE; break;
+ case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP; break;
+ case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP; break;
+ case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE; break;
+ case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB; break;
+ case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE; break;
+ case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB; break;
+ case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE; break;
+
+ case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE; break;
+ case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE; break;
+ case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA; break;
+ case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE; break;
+ case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB; break;
+ case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE; break;
+ case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG; break;
+ case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE; break;
+ case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL; break;
+ case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE; break;
+ default:
+ return false;
+ }
+
+ Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if (SwapArgs)
+ std::swap(Op0, Op1);
+
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(Op0, Op1, VT))
+ return false;
+
+ BuildMI(MBB, DL, TII.get(BranchOpc)).addMBB(TrueMBB);
+
+ if (Predicate == CmpInst::FCMP_UNE) {
+ // X86 requires a second branch to handle UNE (and OEQ,
+ // which is mapped to UNE above).
+ BuildMI(MBB, DL, TII.get(X86::JP)).addMBB(TrueMBB);
+ }
+
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+ }
+ } else if (ExtractValueInst *EI =
+ dyn_cast<ExtractValueInst>(BI->getCondition())) {
+ // Check to see if the branch instruction is from an "arithmetic with
+ // overflow" intrinsic. The main way these intrinsics are used is:
+ //
+ // %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ // %sum = extractvalue { i32, i1 } %t, 0
+ // %obit = extractvalue { i32, i1 } %t, 1
+ // br i1 %obit, label %overflow, label %normal
+ //
+    // The %sum and %obit are converted into an ADD and a SETO/SETB before
+ // reaching the branch. Therefore, we search backwards through the MBB
+ // looking for the SETO/SETB instruction. If an instruction modifies the
+ // EFLAGS register before we reach the SETO/SETB instruction, then we can't
+ // convert the branch into a JO/JB instruction.
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){
+ if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow ||
+ CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) {
+ const MachineInstr *SetMI = 0;
+ unsigned Reg = lookUpRegForValue(EI);
+
+ for (MachineBasicBlock::const_reverse_iterator
+ RI = MBB->rbegin(), RE = MBB->rend(); RI != RE; ++RI) {
+ const MachineInstr &MI = *RI;
+
+ if (MI.modifiesRegister(Reg)) {
+ unsigned Src, Dst, SrcSR, DstSR;
+
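+            // If the def is just a register copy, follow its source back
+            // toward the SETO/SETB that produced the overflow bit.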
+ if (getInstrInfo()->isMoveInstr(MI, Src, Dst, SrcSR, DstSR)) {
+ Reg = Src;
+ continue;
+ }
+
+ SetMI = &MI;
+ break;
+ }
+
+ const TargetInstrDesc &TID = MI.getDesc();
+ if (TID.hasUnmodeledSideEffects() ||
+ TID.hasImplicitDefOfPhysReg(X86::EFLAGS))
+ break;
+ }
+
+ if (SetMI) {
+ unsigned OpCode = SetMI->getOpcode();
+
+ if (OpCode == X86::SETOr || OpCode == X86::SETBr) {
+ BuildMI(MBB, DL, TII.get(OpCode == X86::SETOr ? X86::JO : X86::JB))
+ .addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ // Otherwise do a clumsy setcc and re-test it.
+ unsigned OpReg = getRegForValue(BI->getCondition());
+ if (OpReg == 0) return false;
+
+ BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(OpReg).addReg(OpReg);
+ BuildMI(MBB, DL, TII.get(X86::JNE)).addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+}
+
+bool X86FastISel::X86SelectShift(Instruction *I) {
+ unsigned CReg = 0, OpReg = 0, OpImm = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (I->getType() == Type::Int8Ty) {
+ CReg = X86::CL;
+ RC = &X86::GR8RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break;
+ case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break;
+ case Instruction::Shl: OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break;
+ default: return false;
+ }
+ } else if (I->getType() == Type::Int16Ty) {
+ CReg = X86::CX;
+ RC = &X86::GR16RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break;
+ case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break;
+ case Instruction::Shl: OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break;
+ default: return false;
+ }
+ } else if (I->getType() == Type::Int32Ty) {
+ CReg = X86::ECX;
+ RC = &X86::GR32RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break;
+ case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break;
+ case Instruction::Shl: OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break;
+ default: return false;
+ }
+ } else if (I->getType() == Type::Int64Ty) {
+ CReg = X86::RCX;
+ RC = &X86::GR64RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break;
+ case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break;
+ case Instruction::Shl: OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break;
+ default: return false;
+ }
+ } else {
+ return false;
+ }
+
+ MVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+
+  // Fold an immediate shift count, e.g. shl(x, 3).
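+  // The *ri shift forms take an 8-bit immediate, hence the mask to 0xff.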
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(OpImm),
+ ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ TII.copyRegToReg(*MBB, MBB->end(), CReg, Op1Reg, RC, RC);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit an EXTRACT_SUBREG to precisely describe what
+ // we're doing here.
+ if (CReg != X86::CL)
+ BuildMI(MBB, DL, TII.get(TargetInstrInfo::EXTRACT_SUBREG), X86::CL)
+ .addReg(CReg).addImm(X86::SUBREG_8BIT);
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(OpReg), ResultReg).addReg(Op0Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectSelect(Instruction *I) {
+ MVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (VT.getSimpleVT() == MVT::i16) {
+ Opc = X86::CMOVE16rr;
+ RC = &X86::GR16RegClass;
+ } else if (VT.getSimpleVT() == MVT::i32) {
+ Opc = X86::CMOVE32rr;
+ RC = &X86::GR32RegClass;
+ } else if (VT.getSimpleVT() == MVT::i64) {
+ Opc = X86::CMOVE64rr;
+ RC = &X86::GR64RegClass;
+ } else {
+ return false;
+ }
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ unsigned Op2Reg = getRegForValue(I->getOperand(2));
+ if (Op2Reg == 0) return false;
+
+ BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(Op0Reg).addReg(Op0Reg);
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(Opc), ResultReg).addReg(Op1Reg).addReg(Op2Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectFPExt(Instruction *I) {
+ // fpext from float to double.
+ if (Subtarget->hasSSE2() && I->getType() == Type::DoubleTy) {
+ Value *V = I->getOperand(0);
+ if (V->getType() == Type::FloatTy) {
+ unsigned OpReg = getRegForValue(V);
+ if (OpReg == 0) return false;
+ unsigned ResultReg = createResultReg(X86::FR64RegisterClass);
+ BuildMI(MBB, DL, TII.get(X86::CVTSS2SDrr), ResultReg).addReg(OpReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(Instruction *I) {
+ if (Subtarget->hasSSE2()) {
+ if (I->getType() == Type::FloatTy) {
+ Value *V = I->getOperand(0);
+ if (V->getType() == Type::DoubleTy) {
+ unsigned OpReg = getRegForValue(V);
+ if (OpReg == 0) return false;
+ unsigned ResultReg = createResultReg(X86::FR32RegisterClass);
+ BuildMI(MBB, DL, TII.get(X86::CVTSD2SSrr), ResultReg).addReg(OpReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectTrunc(Instruction *I) {
+ if (Subtarget->is64Bit())
+    // All other cases should be handled by the tablegen-generated code.
+ return false;
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ MVT DstVT = TLI.getValueType(I->getType());
+
+ // This code only handles truncation to byte right now.
+ if (DstVT != MVT::i8 && DstVT != MVT::i1)
+    // All other cases should be handled by the tablegen-generated code.
+ return false;
+ if (SrcVT != MVT::i16 && SrcVT != MVT::i32)
+    // All other cases should be handled by the tablegen-generated code.
+ return false;
+
+ unsigned InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
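+  // In 32-bit mode only EAX/EBX/ECX/EDX have addressable 8-bit subregisters,
+  // so the input must be constrained to the ABCD register classes before the
+  // low byte can be extracted.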
+ // First issue a copy to GR16_ABCD or GR32_ABCD.
+ unsigned CopyOpc = (SrcVT == MVT::i16) ? X86::MOV16rr : X86::MOV32rr;
+ const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
+ ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
+ unsigned CopyReg = createResultReg(CopyRC);
+ BuildMI(MBB, DL, TII.get(CopyOpc), CopyReg).addReg(InputReg);
+
+ // Then issue an extract_subreg.
+ unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
+ CopyReg, X86::SUBREG_8BIT);
+ if (!ResultReg)
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectExtractValue(Instruction *I) {
+ ExtractValueInst *EI = cast<ExtractValueInst>(I);
+ Value *Agg = EI->getAggregateOperand();
+
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) {
+ switch (CI->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ // Cheat a little. We know that the registers for "add" and "seto" are
+ // allocated sequentially. However, we only keep track of the register
+ // for "add" in the value map. Use extractvalue's index to get the
+ // correct register for "seto".
+ UpdateValueMap(I, lookUpRegForValue(Agg) + *EI->idx_begin());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86VisitIntrinsicCall(IntrinsicInst &I) {
+ // FIXME: Handle more intrinsics.
+ switch (I.getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow: {
+ // Replace "add with overflow" intrinsics with an "add" instruction followed
+    // by a seto/setb instruction. Later on, when the "extractvalue"
+ // instructions are encountered, we use the fact that two registers were
+ // created sequentially to get the correct registers for the "sum" and the
+ // "overflow bit".
+ const Function *Callee = I.getCalledFunction();
+ const Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ Value *Op1 = I.getOperand(1);
+ Value *Op2 = I.getOperand(2);
+ unsigned Reg1 = getRegForValue(Op1);
+ unsigned Reg2 = getRegForValue(Op2);
+
+ if (Reg1 == 0 || Reg2 == 0)
+ // FIXME: Handle values *not* in registers.
+ return false;
+
+ unsigned OpC = 0;
+ if (VT == MVT::i32)
+ OpC = X86::ADD32rr;
+ else if (VT == MVT::i64)
+ OpC = X86::ADD64rr;
+ else
+ return false;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(MBB, DL, TII.get(OpC), ResultReg).addReg(Reg1).addReg(Reg2);
+ unsigned DestReg1 = UpdateValueMap(&I, ResultReg);
+
+ // If the add with overflow is an intra-block value then we just want to
+ // create temporaries for it like normal. If it is a cross-block value then
+ // UpdateValueMap will return the cross-block register used. Since we
+ // *really* want the value to be live in the register pair known by
+ // UpdateValueMap, we have to use DestReg1+1 as the destination register in
+ // the cross block case. In the non-cross-block case, we should just make
+ // another register for the value.
+ if (DestReg1 != ResultReg)
+ ResultReg = DestReg1+1;
+ else
+ ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8));
+
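+    // Unsigned overflow sets the carry flag (SETB); signed overflow sets the
+    // overflow flag (SETO).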
+ unsigned Opc = X86::SETBr;
+ if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
+ Opc = X86::SETOr;
+ BuildMI(MBB, DL, TII.get(Opc), ResultReg);
+ return true;
+ }
+ }
+}
+
+bool X86FastISel::X86SelectCall(Instruction *I) {
+ CallInst *CI = cast<CallInst>(I);
+ Value *Callee = I->getOperand(0);
+
+ // Can't handle inline asm yet.
+ if (isa<InlineAsm>(Callee))
+ return false;
+
+ // Handle intrinsic calls.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
+ return X86VisitIntrinsicCall(*II);
+
+ // Handle only C and fastcc calling conventions for now.
+ CallSite CS(CI);
+ unsigned CC = CS.getCallingConv();
+ if (CC != CallingConv::C &&
+ CC != CallingConv::Fast &&
+ CC != CallingConv::X86_FastCall)
+ return false;
+
+ // On X86, -tailcallopt changes the fastcc ABI. FastISel doesn't
+ // handle this for now.
+ if (CC == CallingConv::Fast && PerformTailCallOpt)
+ return false;
+
+ // Let SDISel handle vararg functions.
+ const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ if (FTy->isVarArg())
+ return false;
+
+ // Handle *simple* calls for now.
+ const Type *RetTy = CS.getType();
+ MVT RetVT;
+ if (RetTy == Type::VoidTy)
+ RetVT = MVT::isVoid;
+ else if (!isTypeLegal(RetTy, RetVT, true))
+ return false;
+
+ // Materialize callee address in a register. FIXME: GV address can be
+ // handled with a CALLpcrel32 instead.
+ X86AddressMode CalleeAM;
+ if (!X86SelectAddress(Callee, CalleeAM, true))
+ return false;
+ unsigned CalleeOp = 0;
+ GlobalValue *GV = 0;
+ if (CalleeAM.Base.Reg != 0) {
+ assert(CalleeAM.GV == 0);
+ CalleeOp = CalleeAM.Base.Reg;
+  } else if (CalleeAM.GV != 0) {
+    GV = CalleeAM.GV;
+ } else
+ return false;
+
+ // Allow calls which produce i1 results.
+ bool AndToI1 = false;
+ if (RetVT == MVT::i1) {
+ RetVT = MVT::i8;
+ AndToI1 = true;
+ }
+
+ // Deal with call operands first.
+ SmallVector<Value*, 8> ArgVals;
+ SmallVector<unsigned, 8> Args;
+ SmallVector<MVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+ Args.reserve(CS.arg_size());
+ ArgVals.reserve(CS.arg_size());
+ ArgVTs.reserve(CS.arg_size());
+ ArgFlags.reserve(CS.arg_size());
+ for (CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ unsigned Arg = getRegForValue(*i);
+ if (Arg == 0)
+ return false;
+ ISD::ArgFlagsTy Flags;
+ unsigned AttrInd = i - CS.arg_begin() + 1;
+ if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ Flags.setSExt();
+ if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ Flags.setZExt();
+
+ // FIXME: Only handle *easy* calls for now.
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+ CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+ CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ return false;
+
+ const Type *ArgTy = (*i)->getType();
+ MVT ArgVT;
+ if (!isTypeLegal(ArgTy, ArgVT))
+ return false;
+ unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(Arg);
+ ArgVals.push_back(*i);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, false, TM, ArgLocs);
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TM.getRegisterInfo()->getCallFrameSetupOpcode();
+ BuildMI(MBB, DL, TII.get(AdjStackDown)).addImm(NumBytes);
+
+  // Process arguments: walk the register/memloc assignments, inserting
+ // copies / loads.
+ SmallVector<unsigned, 4> RegArgs;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ unsigned Arg = Args[VA.getValNo()];
+ MVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt: {
+ bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ assert(Emitted && "Failed to emit a sext!"); Emitted=Emitted;
+ Emitted = true;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::ZExt: {
+ bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ assert(Emitted && "Failed to emit a zext!"); Emitted=Emitted;
+ Emitted = true;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::AExt: {
+ bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+
+ assert(Emitted && "Failed to emit a aext!"); Emitted=Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ }
+
+ if (VA.isRegLoc()) {
+ TargetRegisterClass* RC = TLI.getRegClassFor(ArgVT);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), VA.getLocReg(),
+ Arg, RC, RC);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ RegArgs.push_back(VA.getLocReg());
+ } else {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ X86AddressMode AM;
+ AM.Base.Reg = StackPtr;
+ AM.Disp = LocMemOffset;
+ Value *ArgVal = ArgVals[VA.getValNo()];
+
+ // If this is a really simple value, emit this with the Value* version of
+ // X86FastEmitStore. If it isn't simple, we don't want to do this, as it
+ // can cause us to reevaluate the argument.
+ if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal))
+ X86FastEmitStore(ArgVT, ArgVal, AM);
+ else
+ X86FastEmitStore(ArgVT, Arg, AM);
+ }
+ }
+
+  // ELF / PIC requires the GOT pointer to be in the EBX register before
+  // function calls made via the PLT.
+ if (!Subtarget->is64Bit() &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT()) {
+ TargetRegisterClass *RC = X86::GR32RegisterClass;
+ unsigned Base = getInstrInfo()->getGlobalBaseReg(&MF);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), X86::EBX, Base, RC, RC);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ }
+
+ // Issue the call.
+ unsigned CallOpc = CalleeOp
+ ? (Subtarget->is64Bit() ? X86::CALL64r : X86::CALL32r)
+ : (Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32);
+ MachineInstrBuilder MIB = CalleeOp
+ ? BuildMI(MBB, DL, TII.get(CallOpc)).addReg(CalleeOp)
+ : BuildMI(MBB, DL, TII.get(CallOpc)).addGlobalAddress(GV);
+
+  // Add an implicit use of the GOT pointer in EBX.
+ if (!Subtarget->is64Bit() &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ MIB.addReg(X86::EBX);
+
+ // Add implicit physical register uses to the call.
+ for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+ MIB.addReg(RegArgs[i]);
+
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode();
+ BuildMI(MBB, DL, TII.get(AdjStackUp)).addImm(NumBytes).addImm(0);
+
+ // Now handle call return value (if any).
+ if (RetVT.getSimpleVT() != MVT::isVoid) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, false, TM, RVLocs);
+ CCInfo.AnalyzeCallResult(RetVT, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ assert(RVLocs.size() == 1 && "Can't handle multi-value calls!");
+ MVT CopyVT = RVLocs[0].getValVT();
+ TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+ TargetRegisterClass *SrcRC = DstRC;
+
+ // If this is a call to a function that returns an fp value on the x87 fp
+ // stack, but where we prefer to use the value in xmm registers, copy it
+ // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+ if ((RVLocs[0].getLocReg() == X86::ST0 ||
+ RVLocs[0].getLocReg() == X86::ST1) &&
+ isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
+ CopyVT = MVT::f80;
+ SrcRC = X86::RSTRegisterClass;
+ DstRC = X86::RFP80RegisterClass;
+ }
+
+ unsigned ResultReg = createResultReg(DstRC);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ RVLocs[0].getLocReg(), DstRC, SrcRC);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ if (CopyVT != RVLocs[0].getValVT()) {
+      // Round the F80 to the right size, which also moves it to the
+      // appropriate xmm register. This is accomplished by storing the F80
+      // value in memory and then loading it back. Ewww...
+ MVT ResVT = RVLocs[0].getValVT();
+ unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+ unsigned MemSize = ResVT.getSizeInBits()/8;
+ int FI = MFI.CreateStackObject(MemSize, MemSize);
+ addFrameReference(BuildMI(MBB, DL, TII.get(Opc)), FI).addReg(ResultReg);
+ DstRC = ResVT == MVT::f32
+ ? X86::FR32RegisterClass : X86::FR64RegisterClass;
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ ResultReg = createResultReg(DstRC);
+ addFrameReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg), FI);
+ }
+
+ if (AndToI1) {
+      // Mask out all but the lowest bit for a call that produces an i1.
+ unsigned AndResult = createResultReg(X86::GR8RegisterClass);
+ BuildMI(MBB, DL,
+ TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1);
+ ResultReg = AndResult;
+ }
+
+ UpdateValueMap(I, ResultReg);
+ }
+
+ return true;
+}
+
+
+bool
+X86FastISel::TargetSelectInstruction(Instruction *I) {
+ switch (I->getOpcode()) {
+ default: break;
+ case Instruction::Load:
+ return X86SelectLoad(I);
+ case Instruction::Store:
+ return X86SelectStore(I);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return X86SelectCmp(I);
+ case Instruction::ZExt:
+ return X86SelectZExt(I);
+ case Instruction::Br:
+ return X86SelectBranch(I);
+ case Instruction::Call:
+ return X86SelectCall(I);
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ return X86SelectShift(I);
+ case Instruction::Select:
+ return X86SelectSelect(I);
+ case Instruction::Trunc:
+ return X86SelectTrunc(I);
+ case Instruction::FPExt:
+ return X86SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return X86SelectFPTrunc(I);
+ case Instruction::ExtractValue:
+ return X86SelectExtractValue(I);
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ MVT DstVT = TLI.getValueType(I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return X86SelectZExt(I);
+ if (DstVT.bitsLT(SrcVT))
+ return X86SelectTrunc(I);
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0) return false;
+ UpdateValueMap(I, Reg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+unsigned X86FastISel::TargetMaterializeConstant(Constant *C) {
+ MVT VT;
+ if (!isTypeLegal(C->getType(), VT))
+    return 0;
+
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.getSimpleVT()) {
+  default: return 0;
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = X86::GR8RegisterClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = X86::GR16RegisterClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+ break;
+ case MVT::f32:
+ if (Subtarget->hasSSE1()) {
+ Opc = X86::MOVSSrm;
+ RC = X86::FR32RegisterClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = X86::RFP32RegisterClass;
+ }
+ break;
+ case MVT::f64:
+ if (Subtarget->hasSSE2()) {
+ Opc = X86::MOVSDrm;
+ RC = X86::FR64RegisterClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = X86::RFP64RegisterClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ }
+
+ // Materialize addresses with LEA instructions.
+ if (isa<GlobalValue>(C)) {
+ X86AddressMode AM;
+ if (X86SelectAddress(C, AM, false)) {
+ if (TLI.getPointerTy() == MVT::i32)
+ Opc = X86::LEA32r;
+ else
+ Opc = X86::LEA64r;
+ unsigned ResultReg = createResultReg(RC);
+ addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+ }
+ return 0;
+ }
+
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = TD.getPrefTypeAlignment(C->getType());
+ if (Align == 0) {
+ // Alignment of vector types. FIXME!
+ Align = TD.getTypeAllocSize(C->getType());
+ }
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (TM.getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->is64Bit())
+ PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
+
+ // Create the load from the constant pool.
+ unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+ unsigned ResultReg = createResultReg(RC);
+ addConstantPoolReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg), MCPOffset,
+ PICBase);
+
+ return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeAlloca(AllocaInst *C) {
+ // Fail on dynamic allocas. At this point, getRegForValue has already
+ // checked its CSE maps, so if we're here trying to handle a dynamic
+ // alloca, we're not going to succeed. X86SelectAddress has a
+ // check for dynamic allocas, because it's called directly from
+ // various places, but TargetMaterializeAlloca also needs a check
+ // in order to avoid recursion between getRegForValue,
+  // X86SelectAddress, and TargetMaterializeAlloca.
+ if (!StaticAllocaMap.count(C))
+ return 0;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(C, AM, false))
+ return 0;
+ unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+ TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+ unsigned ResultReg = createResultReg(RC);
+ addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+}
+
+namespace llvm {
+ llvm::FastISel *X86::createFastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi,
+ DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &cil
+#endif
+ ) {
+ return new X86FastISel(mf, mmi, dw, vm, bm, am
+#ifndef NDEBUG
+ , cil
+#endif
+ );
+ }
+}
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..0f2fbcc
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1187 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// virtual registers into register stack instructions. This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// This pass is hampered by the lack of decent CFG manipulation routines for
+// machine code. In particular, this wants to be able to split critical edges
+// as necessary, traverse the machine basic block CFG in depth-first order, and
+// allow there to be multiple machine basic blocks for each LLVM basicblock
+// (needed for critical edge splitting).
+//
+// In particular, this pass currently barfs on critical edges. Because of this,
+// it requires the instruction selector to insert FP_REG_KILL instructions on
+// the exits of any basic block that has critical edges going from it, or which
+// branches to a critical basic block.
+//
+// FIXME: this is not implemented yet. The stackifier pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP , "Number of floating point instructions");
+
+namespace {
+ struct VISIBILITY_HIDDEN FPS : public MachineFunctionPass {
+ static char ID;
+ FPS() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+ MachineBasicBlock *MBB; // Current basic block
+ unsigned Stack[8]; // FP<n> Registers in each stack slot...
+ unsigned RegMap[8]; // Track which stack slot contains each register
+ unsigned StackTop; // The current top of the FP stack.
+
+ void dumpStack() const {
+ cerr << "Stack contents:";
+ for (unsigned i = 0; i != StackTop; ++i) {
+ cerr << " FP" << Stack[i];
+ assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+ }
+ cerr << "\n";
+ }
+ private:
+ /// isStackEmpty - Return true if the FP stack is empty.
+ bool isStackEmpty() const {
+ return StackTop == 0;
+ }
+
+ // getSlot - Return the stack slot number a particular register number is
+ // in.
+ unsigned getSlot(unsigned RegNo) const {
+ assert(RegNo < 8 && "Regno out of range!");
+ return RegMap[RegNo];
+ }
+
+ // getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const {
+ assert(STi < StackTop && "Access past stack top!");
+ return Stack[StackTop-1-STi];
+ }
+
+ // getSTReg - Return the X86::ST(i) register which contains the specified
+ // FP<RegNo> register.
+ unsigned getSTReg(unsigned RegNo) const {
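+      // Slot StackTop-1 holds the top of stack, i.e. ST(0).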
+ return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0;
+ }
+
+ // pushReg - Push the specified FP<n> register onto the stack.
+ void pushReg(unsigned Reg) {
+ assert(Reg < 8 && "Register number out of range!");
+ assert(StackTop < 8 && "Stack overflow!");
+ Stack[StackTop] = Reg;
+ RegMap[Reg] = StackTop++;
+ }
+
+ bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ if (isAtTop(RegNo)) return;
+
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ assert(RegMap[RegOnTop] < StackTop);
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processor's version of the state.
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+ NumFXCH++;
+ }
+
+ void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+ DebugLoc dl = I->getDebugLoc();
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
+
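+      // fld ST(i) pushes a copy of the source register onto the stack top.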
+ BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
+
+ // popStackAfter - Pop the current value off of the top of the FP stack
+ // after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ // freeStackSlotAfter - Free the specified register from the register stack,
+ // so that it is no longer in a register. If the register is currently at
+ // the top of the stack, we just pop the current instruction, otherwise we
+ // store the current top-of-stack into the specified slot, then pop the top
+ // of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+ };
+ char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+ assert(MO.isReg() && "Expected an FP register!");
+ unsigned Reg = MO.getReg();
+ assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+ return Reg - X86::FP0;
+}
+
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+ // We only need to run this pass if there are any FP registers used in this
+ // function. If it is all integer, there is nothing for us to do!
+ bool FPIsUsed = false;
+
+ assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+ for (unsigned i = 0; i <= 6; ++i)
+ if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+ FPIsUsed = true;
+ break;
+ }
+
+ // Early exit.
+ if (!FPIsUsed) return false;
+
+ TII = MF.getTarget().getInstrInfo();
+ StackTop = 0;
+
+ // Process the function in depth first order so that we process at least one
+ // of the predecessors for every reachable block in the function.
+ SmallPtrSet<MachineBasicBlock*, 8> Processed;
+ MachineBasicBlock *Entry = MF.begin();
+
+ bool Changed = false;
+ for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> >
+ I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+ I != E; ++I)
+ Changed |= processBasicBlock(MF, **I);
+
+ return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr *MI = I;
+ unsigned Flags = MI->getDesc().TSFlags;
+
+ unsigned FPInstClass = Flags & X86II::FPTypeMask;
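+    // Inline asm may use or clobber FP stack registers, so treat it
+    // conservatively as a SpecialFP instruction.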
+ if (MI->getOpcode() == TargetInstrInfo::INLINEASM)
+ FPInstClass = X86II::SpecialFP;
+
+ if (FPInstClass == X86II::NotFP)
+ continue; // Efficiently ignore non-fp insts!
+
+ MachineInstr *PrevMI = 0;
+ if (I != BB.begin())
+ PrevMI = prior(I);
+
+ ++NumFP; // Keep track of # of pseudo instrs
+ DOUT << "\nFPInst:\t" << *MI;
+
+ // Get dead variables list now because the MI pointer may be deleted as part
+ // of processing!
+ SmallVector<unsigned, 8> DeadRegs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadRegs.push_back(MO.getReg());
+ }
+
+ switch (FPInstClass) {
+ case X86II::ZeroArgFP: handleZeroArgFP(I); break;
+ case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0)
+ case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+ case X86II::TwoArgFP: handleTwoArgFP(I); break;
+ case X86II::CompareFP: handleCompareFP(I); break;
+ case X86II::CondMovFP: handleCondMovFP(I); break;
+ case X86II::SpecialFP: handleSpecialFP(I); break;
+ default: assert(0 && "Unknown FP Type!");
+ }
+
+ // Check to see if any of the values defined by this instruction are dead
+ // after definition. If so, pop them.
+ for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+ unsigned Reg = DeadRegs[i];
+ if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ DOUT << "Register FP#" << Reg-X86::FP0 << " is dead!\n";
+ freeStackSlotAfter(I, Reg-X86::FP0);
+ }
+ }
+
+    // Print out all of the instructions this pseudo expanded to, if -debug.
+ DEBUG(
+ MachineBasicBlock::iterator PrevI(PrevMI);
+ if (I == PrevI) {
+ cerr << "Just deleted pseudo instruction\n";
+ } else {
+ MachineBasicBlock::iterator Start = I;
+ // Rewind to first instruction newly inserted.
+ while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+ cerr << "Inserted instructions:\n\t";
+ Start->print(*cerr.stream(), &MF.getTarget());
+ while (++Start != next(I)) {}
+ }
+ dumpStack();
+ );
+
+ Changed = true;
+ }
+
+ assert(isStackEmpty() && "Stack not empty at end of basic block?");
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ struct TableEntry {
+ unsigned from;
+ unsigned to;
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ friend bool operator<(unsigned V, const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
+#ifndef NDEBUG
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+ for (unsigned i = 0; i != NumEntries-1; ++i)
+ if (!(Table[i] < Table[i+1])) return false;
+ return true;
+}
+#endif
+
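+// Lookup - Binary search the sorted table for the entry matching Opcode.
+// Returns the mapped value, or -1 if Opcode has no entry.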
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+ const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+ if (I != Table+N && I->from == Opcode)
+ return I->to;
+ return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { static bool TABLE##Checked = false; \
+ if (!TABLE##Checked) { \
+ assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked = true; \
+ } \
+ }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+ { X86::ABS_Fp32 , X86::ABS_F },
+ { X86::ABS_Fp64 , X86::ABS_F },
+ { X86::ABS_Fp80 , X86::ABS_F },
+ { X86::ADD_Fp32m , X86::ADD_F32m },
+ { X86::ADD_Fp64m , X86::ADD_F64m },
+ { X86::ADD_Fp64m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m64 , X86::ADD_F64m },
+ { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+ { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+ { X86::CHS_Fp32 , X86::CHS_F },
+ { X86::CHS_Fp64 , X86::CHS_F },
+ { X86::CHS_Fp80 , X86::CHS_F },
+ { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp80 , X86::CMOVBE_F },
+ { X86::CMOVB_Fp32 , X86::CMOVB_F },
+ { X86::CMOVB_Fp64 , X86::CMOVB_F },
+ { X86::CMOVB_Fp80 , X86::CMOVB_F },
+ { X86::CMOVE_Fp32 , X86::CMOVE_F },
+ { X86::CMOVE_Fp64 , X86::CMOVE_F },
+ { X86::CMOVE_Fp80 , X86::CMOVE_F },
+ { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+ { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp80 , X86::CMOVNB_F },
+ { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
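+  // Test the condition; CMOVE then replaces the true value (Op1) with the
+  // false value (Op2) when the condition is zero (ZF set).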
+ { X86::CMOVNE_Fp80 , X86::CMOVNE_F },
+ { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp80 , X86::CMOVNP_F },
+ { X86::CMOVP_Fp32 , X86::CMOVP_F },
+ { X86::CMOVP_Fp64 , X86::CMOVP_F },
+ { X86::CMOVP_Fp80 , X86::CMOVP_F },
+ { X86::COS_Fp32 , X86::COS_F },
+ { X86::COS_Fp64 , X86::COS_F },
+ { X86::COS_Fp80 , X86::COS_F },
+ { X86::DIVR_Fp32m , X86::DIVR_F32m },
+ { X86::DIVR_Fp64m , X86::DIVR_F64m },
+ { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+ { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+ { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+ { X86::DIV_Fp32m , X86::DIV_F32m },
+ { X86::DIV_Fp64m , X86::DIV_F64m },
+ { X86::DIV_Fp64m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m64 , X86::DIV_F64m },
+ { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+ { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+ { X86::ILD_Fp16m32 , X86::ILD_F16m },
+ { X86::ILD_Fp16m64 , X86::ILD_F16m },
+ { X86::ILD_Fp16m80 , X86::ILD_F16m },
+ { X86::ILD_Fp32m32 , X86::ILD_F32m },
+ { X86::ILD_Fp32m64 , X86::ILD_F32m },
+ { X86::ILD_Fp32m80 , X86::ILD_F32m },
+ { X86::ILD_Fp64m32 , X86::ILD_F64m },
+ { X86::ILD_Fp64m64 , X86::ILD_F64m },
+ { X86::ILD_Fp64m80 , X86::ILD_F64m },
+ { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+ { X86::IST_Fp16m32 , X86::IST_F16m },
+ { X86::IST_Fp16m64 , X86::IST_F16m },
+ { X86::IST_Fp16m80 , X86::IST_F16m },
+ { X86::IST_Fp32m32 , X86::IST_F32m },
+ { X86::IST_Fp32m64 , X86::IST_F32m },
+ { X86::IST_Fp32m80 , X86::IST_F32m },
+ { X86::IST_Fp64m32 , X86::IST_FP64m },
+ { X86::IST_Fp64m64 , X86::IST_FP64m },
+ { X86::IST_Fp64m80 , X86::IST_FP64m },
+ { X86::LD_Fp032 , X86::LD_F0 },
+ { X86::LD_Fp064 , X86::LD_F0 },
+ { X86::LD_Fp080 , X86::LD_F0 },
+ { X86::LD_Fp132 , X86::LD_F1 },
+ { X86::LD_Fp164 , X86::LD_F1 },
+ { X86::LD_Fp180 , X86::LD_F1 },
+ { X86::LD_Fp32m , X86::LD_F32m },
+ { X86::LD_Fp32m64 , X86::LD_F32m },
+ { X86::LD_Fp32m80 , X86::LD_F32m },
+ { X86::LD_Fp64m , X86::LD_F64m },
+ { X86::LD_Fp64m80 , X86::LD_F64m },
+ { X86::LD_Fp80m , X86::LD_F80m },
+ { X86::MUL_Fp32m , X86::MUL_F32m },
+ { X86::MUL_Fp64m , X86::MUL_F64m },
+ { X86::MUL_Fp64m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m64 , X86::MUL_F64m },
+ { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+ { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+ { X86::SIN_Fp32 , X86::SIN_F },
+ { X86::SIN_Fp64 , X86::SIN_F },
+ { X86::SIN_Fp80 , X86::SIN_F },
+ { X86::SQRT_Fp32 , X86::SQRT_F },
+ { X86::SQRT_Fp64 , X86::SQRT_F },
+ { X86::SQRT_Fp80 , X86::SQRT_F },
+ { X86::ST_Fp32m , X86::ST_F32m },
+ { X86::ST_Fp64m , X86::ST_F64m },
+ { X86::ST_Fp64m32 , X86::ST_F32m },
+ { X86::ST_Fp80m32 , X86::ST_F32m },
+ { X86::ST_Fp80m64 , X86::ST_F64m },
+ { X86::ST_FpP80m , X86::ST_FP80m },
+ { X86::SUBR_Fp32m , X86::SUBR_F32m },
+ { X86::SUBR_Fp64m , X86::SUBR_F64m },
+ { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+ { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+ { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+ { X86::SUB_Fp32m , X86::SUB_F32m },
+ { X86::SUB_Fp64m , X86::SUB_F64m },
+ { X86::SUB_Fp64m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m64 , X86::SUB_F64m },
+ { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+ { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+ { X86::TST_Fp32 , X86::TST_F },
+ { X86::TST_Fp64 , X86::TST_F },
+ { X86::TST_Fp80 , X86::TST_F },
+ { X86::UCOM_FpIr32 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr64 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr80 , X86::UCOM_FIr },
+ { X86::UCOM_Fpr32 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr64 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr80 , X86::UCOM_Fr },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+ ASSERT_SORTED(OpcodeTable);
+ int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode);
+ assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+ return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version. The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+ { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+
+ { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+ { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+
+ { X86::IST_F16m , X86::IST_FP16m },
+ { X86::IST_F32m , X86::IST_FP32m },
+
+ { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+
+ { X86::ST_F32m , X86::ST_FP32m },
+ { X86::ST_F64m , X86::ST_FP64m },
+ { X86::ST_Frr , X86::ST_FPrr },
+
+ { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+ { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+
+ { X86::UCOM_FIr , X86::UCOM_FIPr },
+
+ { X86::UCOM_FPr , X86::UCOM_FPPr },
+ { X86::UCOM_Fr , X86::UCOM_FPr },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction. This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible. The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+ MachineInstr* MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ ASSERT_SORTED(PopTable);
+ assert(StackTop > 0 && "Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+
+ // Check to see if there is a popping version of this instruction...
+ int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode());
+ if (Opcode != -1) {
+ I->setDesc(TII->get(Opcode));
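+    // fucompp implicitly compares ST(0) with ST(1) and takes no explicit
+    // operand, so drop the one left over from the non-popping form.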
+ if (Opcode == X86::UCOM_FPPr)
+ I->RemoveOperand(0);
+ } else { // Insert an explicit pop
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+ }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register. If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+ if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
+ popStackAfter(I);
+ return;
+ }
+
+ // Otherwise, store the top of stack into the dead slot, killing the operand
+ // without having to add in an explicit xchg then pop.
+ //
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop-1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
+ Stack[--StackTop] = ~0;
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(STReg);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0, ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned DestReg = getFPReg(MI->getOperand(0));
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // Result gets pushed on the stack.
+ pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert((NumOps == X86AddrNumOperands + 1 || NumOps == 1) &&
+ "Can only handle fst* & ftst instructions!");
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it. This ensures that popping the value is
+ // always ok.
+ // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+ //
+ if (!KillsSrc &&
+ (MI->getOpcode() == X86::IST_Fp64m32 ||
+ MI->getOpcode() == X86::ISTT_Fp16m32 ||
+ MI->getOpcode() == X86::ISTT_Fp32m32 ||
+ MI->getOpcode() == X86::ISTT_Fp64m32 ||
+ MI->getOpcode() == X86::IST_Fp64m64 ||
+ MI->getOpcode() == X86::ISTT_Fp16m64 ||
+ MI->getOpcode() == X86::ISTT_Fp32m64 ||
+ MI->getOpcode() == X86::ISTT_Fp64m64 ||
+ MI->getOpcode() == X86::IST_Fp64m80 ||
+ MI->getOpcode() == X86::ISTT_Fp16m80 ||
+ MI->getOpcode() == X86::ISTT_Fp32m80 ||
+ MI->getOpcode() == X86::ISTT_Fp64m80 ||
+ MI->getOpcode() == X86::ST_FpP80m)) {
+ duplicateToTop(Reg, 7 /*temp register*/, I);
+ } else {
+ moveToTop(Reg, I); // Move to the top of the stack...
+ }
+
+ // Convert from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
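+  // The concrete forms checked below always pop ST(0), so update our stack
+  // model to match; otherwise pop only if this was the last use of the value.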
+ if (MI->getOpcode() == X86::IST_FP64m ||
+ MI->getOpcode() == X86::ISTT_FP16m ||
+ MI->getOpcode() == X86::ISTT_FP32m ||
+ MI->getOpcode() == X86::ISTT_FP64m ||
+ MI->getOpcode() == X86::ST_FP80m) {
+ assert(StackTop > 0 && "Stack empty??");
+ --StackTop;
+ } else if (KillsSrc) { // Last use of operand?
+ popStackAfter(I);
+ }
+}
+
+
+/// handleOneArgFPRW - Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value. These instructions may have
+/// non-fp operands after their FP operands.
+///
+/// Examples:
+/// R1 = fchs R2
+/// R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+#ifndef NDEBUG
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+ if (KillsSrc) {
+ // If this is the last use of the source register, just make sure it's on
+ // the top of the stack.
+ moveToTop(Reg, I);
+ assert(StackTop > 0 && "Stack cannot be empty!");
+ --StackTop;
+ pushReg(getFPReg(MI->getOperand(0)));
+ } else {
+ // If this is not the last use of the source register, _copy_ it to the top
+ // of the stack.
+ duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+ }
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(1); // Drop the source operand.
+ MI->RemoveOperand(0); // Drop the destination operand.
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r },
+ { X86::ADD_Fp64 , X86::ADD_FST0r },
+ { X86::ADD_Fp80 , X86::ADD_FST0r },
+ { X86::DIV_Fp32 , X86::DIV_FST0r },
+ { X86::DIV_Fp64 , X86::DIV_FST0r },
+ { X86::DIV_Fp80 , X86::DIV_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r },
+ { X86::MUL_Fp64 , X86::MUL_FST0r },
+ { X86::MUL_Fp80 , X86::MUL_FST0r },
+ { X86::SUB_Fp32 , X86::SUB_FST0r },
+ { X86::SUB_Fp64 , X86::SUB_FST0r },
+ { X86::SUB_Fp80 , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FST0r },
+ { X86::DIV_Fp64 , X86::DIVR_FST0r },
+ { X86::DIV_Fp80 , X86::DIVR_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FST0r },
+ { X86::SUB_Fp64 , X86::SUBR_FST0r },
+ { X86::SUB_Fp80 , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp64 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp80 , X86::DIVR_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp64 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp80 , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 },
+ { X86::ADD_Fp64 , X86::ADD_FrST0 },
+ { X86::ADD_Fp80 , X86::ADD_FrST0 },
+ { X86::DIV_Fp32 , X86::DIV_FrST0 },
+ { X86::DIV_Fp64 , X86::DIV_FrST0 },
+ { X86::DIV_Fp80 , X86::DIV_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 },
+ { X86::MUL_Fp64 , X86::MUL_FrST0 },
+ { X86::MUL_Fp80 , X86::MUL_FrST0 },
+ { X86::SUB_Fp32 , X86::SUB_FrST0 },
+ { X86::SUB_Fp64 , X86::SUB_FrST0 },
+ { X86::SUB_Fp80 , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends, which are
+/// virtual instructions that need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub ST(0), ST(i)
+/// ST(i) = fsub ST(0), ST(i)
+/// ST(0) = fsubr ST(0), ST(i)
+/// ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+ unsigned Dest = getFPReg(MI->getOperand(0));
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+ DebugLoc dl = MI->getDebugLoc();
+
+ unsigned TOS = getStackEntry(0);
+
+ // One of our operands must be on the top of the stack. If neither is yet, we
+ // need to move one.
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ // We can choose to move either operand to the top of the stack. If one of
+ // the operands is killed by this instruction, we want that one so that we
+ // can update right on top of the old version.
+ if (KillsOp0) {
+ moveToTop(Op0, I); // Move dead operand to TOS.
+ TOS = Op0;
+ } else if (KillsOp1) {
+ moveToTop(Op1, I);
+ TOS = Op1;
+ } else {
+ // All of the operands are live after this instruction executes, so we
+ // cannot update on top of any operand. Because of this, we must
+ // duplicate one of the stack elements to the top. It doesn't matter
+ // which one we pick.
+ //
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+ } else if (!KillsOp0 && !KillsOp1) {
+ // If we DO have one of our operands at the top of the stack, but we don't
+ // have a dead operand, we must duplicate one of the operands to a new slot
+ // on the stack.
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+
+ // Now we know that one of our operands is on the top of the stack, and at
+ // least one of our operands is killed by this instruction.
+ assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+ "Stack conditions not set up right!");
+
+ // We decide which form to use based on what is on the top of the stack, and
+ // which operand is killed by this instruction.
+ const TableEntry *InstTable;
+ bool isForward = TOS == Op0;
+ bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+ if (updateST0) {
+ if (isForward)
+ InstTable = ForwardST0Table;
+ else
+ InstTable = ReverseST0Table;
+ } else {
+ if (isForward)
+ InstTable = ForwardSTiTable;
+ else
+ InstTable = ReverseSTiTable;
+ }
+
+ // All four tables have the same number of entries, so the size of any one
+ // of them works for the lookup.
+ int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table),
+ MI->getOpcode());
+ assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+ // NotTOS - The register which is not on the top of stack...
+ unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+ // Replace the old instruction with a new instruction
+ MBB->remove(I++);
+ I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+ // If both operands are killed, pop one off of the stack in addition to
+ // overwriting the other one.
+ if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+ assert(!updateST0 && "Should have updated other operand!");
+ popStackAfter(I); // Pop the top of stack
+ }
+
+ // Update stack information so that we know the destination register is now on
+ // the stack.
+ unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+ assert(UpdatedSlot < StackTop && Dest < 7);
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
+ MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // Make sure the first operand is on the top of stack; the other one can be
+ // anywhere.
+ moveToTop(Op0, I);
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->RemoveOperand(1);
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If any of the operands are killed by this instruction, free them.
+ if (KillsOp0) freeStackSlotAfter(I, Op0);
+ if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions. These
+/// instructions move a st(i) register to st(0) iff a condition is true. These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+
+ unsigned Op0 = getFPReg(MI->getOperand(0));
+ unsigned Op1 = getFPReg(MI->getOperand(2));
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // The first operand *must* be on the top of the stack.
+ moveToTop(Op0, I);
+
+ // Change the second operand to the stack register that the operand is in.
+ // Change from the pseudo instruction to the concrete instruction: after
+ // operands 0 and 1 are removed, the old source operand 2 becomes operand 0.
+ MI->RemoveOperand(0);
+ MI->RemoveOperand(1);
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If we kill the second operand, make sure to pop it from the stack.
+ if (Op0 != Op1 && KillsOp1) {
+ // Get this value off of the register stack.
+ freeStackSlotAfter(I, Op1);
+ }
+}
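+
+// For example (illustrative opcode names): "FP0 = CMOVBE_Fp64 FP0, FP1" is
+// rewritten, after FP0 is moved to the top of the stack, into the concrete
+// "fcmovbe %ST(i), %ST(0)" where ST(i) holds FP1; if this was the last use
+// of FP1, its stack slot is then freed.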
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions. This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ switch (MI->getOpcode()) {
+ default: assert(0 && "Unknown SpecialFP instruction!");
+ case X86::FpGET_ST0_32:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST0_64:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST0_80:// Appears immediately after a call returning FP type!
+ assert(StackTop == 0 && "Stack should be empty after a call!");
+ pushReg(getFPReg(MI->getOperand(0)));
+ break;
+ case X86::FpGET_ST1_32:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST1_64:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST1_80:{// Appears immediately after a call returning FP type!
+ // FpGET_ST1 should occur right after an FpGET_ST0 for a call or inline asm.
+ // The pattern we expect is:
+ // CALL
+ // FP1 = FpGET_ST0
+ // FP4 = FpGET_ST1
+ //
+ // At this point, we've pushed FP1 on the top of stack, so it should be
+ // present if it isn't dead. If it was dead, we already emitted a pop to
+ // remove it from the stack and StackTop = 0.
+
+ // Push FP4 as top of stack next.
+ pushReg(getFPReg(MI->getOperand(0)));
+
+ // If StackTop was 0 before we pushed our operand, then ST(0) must have been
+ // dead. In this case, the ST(1) value is the only thing that is live, and
+ // it is already at TOS (after the pop that was emitted), so just continue.
+ if (StackTop == 1)
+ break;
+
+ // Because pushReg just pushed ST(1) as TOS, we now have to swap the two top
+ // elements so that our accounting is correct.
+ unsigned RegOnTop = getStackEntry(0);
+ unsigned RegNo = getStackEntry(1);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ assert(RegMap[RegOnTop] < StackTop);
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+ break;
+ }
+ case X86::FpSET_ST0_32:
+ case X86::FpSET_ST0_64:
+ case X86::FpSET_ST0_80:
+ assert((StackTop == 1 || StackTop == 2)
+ && "Stack should have one or two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ case X86::FpSET_ST1_32:
+ case X86::FpSET_ST1_64:
+ case X86::FpSET_ST1_80:
+ // StackTop can be 1 if an FpSET_ST0_* was before this. Exchange them.
+ if (StackTop == 1) {
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1);
+ NumFXCH++;
+ StackTop = 0;
+ break;
+ }
+ assert(StackTop == 2 && "Stack should have two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ case X86::MOV_Fp3232:
+ case X86::MOV_Fp3264:
+ case X86::MOV_Fp6432:
+ case X86::MOV_Fp6464:
+ case X86::MOV_Fp3280:
+ case X86::MOV_Fp6480:
+ case X86::MOV_Fp8032:
+ case X86::MOV_Fp8064:
+ case X86::MOV_Fp8080: {
+ const MachineOperand &MO1 = MI->getOperand(1);
+ unsigned SrcReg = getFPReg(MO1);
+
+ const MachineOperand &MO0 = MI->getOperand(0);
+ // These can be created due to inline asm. The two-address pass can
+ // introduce copies from RFP registers to virtual registers.
+ if (MO0.getReg() == X86::ST0 && SrcReg == 0) {
+ assert(MO1.isKill());
+ // Treat %ST0<def> = MOV_Fp8080 %FP0<kill>
+ // like FpSET_ST0_80 %FP0<kill>, %ST0<imp-def>
+ assert((StackTop == 1 || StackTop == 2)
+ && "Stack should have one or two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ } else if (MO0.getReg() == X86::ST1 && SrcReg == 1) {
+ assert(MO1.isKill());
+ // Treat %ST1<def> = MOV_Fp8080 %FP1<kill>
+ // like FpSET_ST1_80 %FP0<kill>, %ST1<imp-def>
+ // StackTop can be 1 if an FpSET_ST0_* was before this. Exchange them.
+ if (StackTop == 1) {
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1);
+ NumFXCH++;
+ StackTop = 0;
+ break;
+ }
+ assert(StackTop == 2 && "Stack should have two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ }
+
+ unsigned DestReg = getFPReg(MO0);
+ if (MI->killsRegister(X86::FP0+SrcReg)) {
+ // If the input operand is killed, we can just change the owner of the
+ // incoming stack slot into the result.
+ unsigned Slot = getSlot(SrcReg);
+ assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!");
+ Stack[Slot] = DestReg;
+ RegMap[DestReg] = Slot;
+
+ } else {
+ // For FMOV we just duplicate the specified value to a new stack slot.
+ // This could be made better, but would require substantial changes.
+ duplicateToTop(SrcReg, DestReg, I);
+ }
+ }
+ break;
+ case TargetInstrInfo::INLINEASM: {
+ // The inline asm MachineInstr currently only *uses* FP registers for the
+ // 'f' constraint. These should be turned into the current ST(x) register
+ // in the machine instr. Also, any kills should be explicitly popped after
+ // the inline asm.
+ unsigned Kills[7];
+ unsigned NumKills = 0;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ assert(Op.isUse() && "Only handle inline asm uses right now");
+
+ unsigned FPReg = getFPReg(Op);
+ Op.setReg(getSTReg(FPReg));
+
+ // If we kill this operand, make sure to pop it from the stack after the
+ // asm. We just remember it for now, and pop them all off at the end in
+ // a batch.
+ if (Op.isKill())
+ Kills[NumKills++] = FPReg;
+ }
+
+ // If this asm kills any FP registers (is the last use of them) we must
+ // explicitly emit pop instructions for them. Do this now after the asm has
+ // executed so that the ST(x) numbers are not off (which would happen if we
+ // did this inline with operand rewriting).
+ //
+ // Note: this might be a non-optimal pop sequence. We might be able to do
+ // better by trying to pop in stack order or something.
+ MachineBasicBlock::iterator InsertPt = MI;
+ while (NumKills)
+ freeStackSlotAfter(InsertPt, Kills[--NumKills]);
+
+ // Don't delete the inline asm!
+ return;
+ }
+
+ case X86::RET:
+ case X86::RETI:
+ // If RET has an FP register use operand, pass the first one in ST(0) and
+ // the second one in ST(1).
+ if (isStackEmpty()) return; // Quick check to see if any are possible.
+
+ // Find the register operands.
+ unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ // FP Register uses must be kills unless there are two uses of the same
+ // register, in which case only one will be a kill.
+ assert(Op.isUse() &&
+ (Op.isKill() || // Marked kill.
+ getFPReg(Op) == FirstFPRegOp || // Second instance.
+ MI->killsRegister(Op.getReg())) && // Later use is marked kill.
+ "Ret only defs operands, and values aren't live beyond it");
+
+ if (FirstFPRegOp == ~0U)
+ FirstFPRegOp = getFPReg(Op);
+ else {
+ assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+ SecondFPRegOp = getFPReg(Op);
+ }
+
+ // Remove the operand so that later passes don't see it.
+ MI->RemoveOperand(i);
+ --i, --e;
+ }
+
+ // There are only four possibilities here:
+ // 1) we are returning a single FP value. In this case, it has to be in
+ // ST(0) already, so just declare success by removing the value from the
+ // FP Stack.
+ if (SecondFPRegOp == ~0U) {
+ // Assert that the top of stack contains the right FP register.
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+ "Top of stack not the right register for RET!");
+
+ // Ok, everything is good, mark the value as not being on the stack
+ // anymore so that our assertion about the stack being empty at end of
+ // block doesn't fire.
+ StackTop = 0;
+ return;
+ }
+
+ // Otherwise, we are returning two values:
+ // 2) If returning the same value for both, we only have one thing in the FP
+ // stack. Consider: RET FP1, FP1
+ if (StackTop == 1) {
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+ "Stack misconfiguration for RET!");
+
+ // Duplicate the TOS so that we return it twice. Just pick some other FPx
+ // register to hold it.
+ unsigned NewReg = (FirstFPRegOp+1)%7;
+ duplicateToTop(FirstFPRegOp, NewReg, MI);
+ FirstFPRegOp = NewReg;
+ }
+
+ // Okay, we know we have two different FPx operands now:
+ assert(StackTop == 2 && "Must have two values live!");
+
+ // 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+ // in ST(1), emit an fxch to swap them.
+ if (getStackEntry(0) == SecondFPRegOp) {
+ assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+ moveToTop(FirstFPRegOp, MI);
+ }
+
+ // 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+ // ST(1). Just remove both from our understanding of the stack and return.
+ assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+ assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+ StackTop = 0;
+ return;
+ }
+
+ I = MBB->erase(I); // Remove the pseudo instruction
+ --I;
+}
diff --git a/lib/Target/X86/X86FloatingPointRegKill.cpp b/lib/Target/X86/X86FloatingPointRegKill.cpp
new file mode 100644
index 0000000..009846e
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPointRegKill.cpp
@@ -0,0 +1,139 @@
+//===-- X86FloatingPointRegKill.cpp - FP_REG_KILL inserter ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts FP_REG_KILL instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/Instructions.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumFPKill, "Number of FP_REG_KILL instructions added");
+
+namespace {
+ struct VISIBILITY_HIDDEN FPRegKiller : public MachineFunctionPass {
+ static char ID;
+ FPRegKiller() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const { return "X86 FP_REG_KILL inserter"; }
+ };
+ char FPRegKiller::ID = 0;
+}
+
+FunctionPass *llvm::createX87FPRegKillInserterPass() { return new FPRegKiller(); }
+
+bool FPRegKiller::runOnMachineFunction(MachineFunction &MF) {
+ // If we are emitting FP stack code, scan each basic block to determine if it
+ // defines any FP values. If so, put an FP_REG_KILL instruction before the
+ // terminator of the block.
+
+ // Note that FP stack instructions are used in all modes for long double,
+ // so we always need to do this check.
+ // Also note that it's possible for an FP stack register to be live across
+ // an instruction that produces multiple basic blocks (SSE CMOV) so we
+ // must check all the generated basic blocks.
+
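+ // Background (informal): the FP stackifier in X86FloatingPoint.cpp works
+ // one basic block at a time and expects the x87 stack to be empty at block
+ // boundaries, so any block that defines x87 values needs an FP_REG_KILL
+ // before its terminator to mark those values as dead within the block.
+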
+ // Scan all of the machine instructions in these MBBs, checking for defs of
+ // FP virtual registers. (RFP32 and RFP64 will not exist in SSE mode, but
+ // RFP80 might.)
+
+ // Fast-path: If nothing is using the x87 registers, we don't need to do
+ // any scanning.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MRI.getRegClassVirtRegs(X86::RFP80RegisterClass).empty() &&
+ MRI.getRegClassVirtRegs(X86::RFP64RegisterClass).empty() &&
+ MRI.getRegClassVirtRegs(X86::RFP32RegisterClass).empty())
+ return false;
+
+ bool Changed = false;
+ const X86Subtarget &Subtarget = MF.getTarget().getSubtarget<X86Subtarget>();
+ MachineFunction::iterator MBBI = MF.begin();
+ MachineFunction::iterator EndMBB = MF.end();
+ for (; MBBI != EndMBB; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+
+ // If this block returns, ignore it. We don't want to insert an FP_REG_KILL
+ // before the return.
+ if (!MBB->empty()) {
+ MachineBasicBlock::iterator EndI = MBB->end();
+ --EndI;
+ if (EndI->getDesc().isReturn())
+ continue;
+ }
+
+ bool ContainsFPCode = false;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ !ContainsFPCode && I != E; ++I) {
+ if (I->getNumOperands() != 0 && I->getOperand(0).isReg()) {
+ const TargetRegisterClass *clas;
+ for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+ if (I->getOperand(op).isReg() && I->getOperand(op).isDef() &&
+ TargetRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) &&
+ ((clas = MRI.getRegClass(I->getOperand(op).getReg())) ==
+ X86::RFP32RegisterClass ||
+ clas == X86::RFP64RegisterClass ||
+ clas == X86::RFP80RegisterClass)) {
+ ContainsFPCode = true;
+ break;
+ }
+ }
+ }
+ }
+ // Check PHI nodes in successor blocks. These PHI's will be lowered to have
+ // a copy of the input value in this block. In SSE mode, we only care about
+ // 80-bit values.
+ if (!ContainsFPCode) {
+ // Final check: examine the LLVM BBs that are successors of the LLVM BB
+ // corresponding to this MBB, looking for FP PHI nodes.
+ const BasicBlock *LLVMBB = MBB->getBasicBlock();
+ const PHINode *PN;
+ for (succ_const_iterator SI = succ_begin(LLVMBB), E = succ_end(LLVMBB);
+ !ContainsFPCode && SI != E; ++SI) {
+ for (BasicBlock::const_iterator II = SI->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ if (PN->getType()==Type::X86_FP80Ty ||
+ (!Subtarget.hasSSE1() && PN->getType()->isFloatingPoint()) ||
+ (!Subtarget.hasSSE2() && PN->getType()==Type::DoubleTy)) {
+ ContainsFPCode = true;
+ break;
+ }
+ }
+ }
+ }
+ // Finally, if we found any FP code, emit the FP_REG_KILL instruction.
+ if (ContainsFPCode) {
+ BuildMI(*MBB, MBBI->getFirstTerminator(), DebugLoc::getUnknownLoc(),
+ MF.getTarget().getInstrInfo()->get(X86::FP_REG_KILL));
+ ++NumFPKill;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..bd1fea7
--- /dev/null
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,1716 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to an X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+#include "llvm/Support/CommandLine.h"
+static cl::opt<bool> AvoidDupAddrCompute("x86-avoid-dup-address", cl::Hidden);
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+// Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+ /// SDValue's instead of register numbers for the leaves of the matched
+ /// tree.
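+ ///
+ /// An x86 memory operand has the general form
+ /// segment:[base + index*scale + disp],
+ /// where the displacement may also carry a symbolic component (a global
+ /// value, constant pool entry, external symbol, or jump table index).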
+ struct X86ISelAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ struct { // This is really a union, discriminated by BaseType!
+ SDValue Reg;
+ int FrameIndex;
+ } Base;
+
+ bool isRIPRel; // RIP as base?
+ unsigned Scale;
+ SDValue IndexReg;
+ int32_t Disp;
+ SDValue Segment;
+ GlobalValue *GV;
+ Constant *CP;
+ const char *ES;
+ int JT;
+ unsigned Align; // CP alignment.
+
+ X86ISelAddressMode()
+ : BaseType(RegBase), isRIPRel(false), Scale(1), IndexReg(), Disp(0),
+ Segment(), GV(0), CP(0), ES(0), JT(-1), Align(0) {
+ }
+
+ bool hasSymbolicDisplacement() const {
+ return GV != 0 || CP != 0 || ES != 0 || JT != -1;
+ }
+
+ void dump() {
+ cerr << "X86ISelAddressMode " << this << "\n";
+ cerr << "Base.Reg ";
+ if (Base.Reg.getNode() != 0) Base.Reg.getNode()->dump();
+ else cerr << "nul";
+ cerr << " Base.FrameIndex " << Base.FrameIndex << "\n";
+ cerr << "isRIPRel " << isRIPRel << " Scale" << Scale << "\n";
+ cerr << "IndexReg ";
+ if (IndexReg.getNode() != 0) IndexReg.getNode()->dump();
+ else cerr << "nul";
+ cerr << " Disp " << Disp << "\n";
+ cerr << "GV "; if (GV) GV->dump();
+ else cerr << "nul";
+ cerr << " CP "; if (CP) CP->dump();
+ else cerr << "nul";
+ cerr << "\n";
+ cerr << "ES "; if (ES) cerr << ES; else cerr << "nul";
+ cerr << " JT" << JT << " Align" << Align << "\n";
+ }
+ };
+}
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// ISel - X86 specific code to select X86 machine instructions for
+ /// SelectionDAG operations.
+ ///
+ class VISIBILITY_HIDDEN X86DAGToDAGISel : public SelectionDAGISel {
+ /// TM - Keep a reference to X86TargetMachine.
+ ///
+ X86TargetMachine &TM;
+
+ /// X86Lowering - This object fully describes how to lower LLVM code to an
+ /// X86-specific SelectionDAG.
+ X86TargetLowering &X86Lowering;
+
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// CurBB - Current BB being isel'd.
+ ///
+ MachineBasicBlock *CurBB;
+
+ /// OptForSize - If true, selector should try to optimize for code size
+ /// instead of performance.
+ bool OptForSize;
+
+ public:
+ explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel),
+ TM(tm), X86Lowering(*TM.getTargetLowering()),
+ Subtarget(&TM.getSubtarget<X86Subtarget>()),
+ OptForSize(false) {}
+
+ virtual const char *getPassName() const {
+ return "X86 DAG->DAG Instruction Selection";
+ }
+
+ /// InstructionSelect - This callback is invoked by
+ /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+ virtual void InstructionSelect();
+
+ virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);
+
+ virtual
+ bool IsLegalAndProfitableToFold(SDNode *N, SDNode *U, SDNode *Root) const;
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+ private:
+ SDNode *Select(SDValue N);
+ SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+
+ bool MatchSegmentBaseAddress(SDValue N, X86ISelAddressMode &AM);
+ bool MatchLoad(SDValue N, X86ISelAddressMode &AM);
+ bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
+ bool MatchAddress(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth = 0);
+ bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
+ bool SelectAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool SelectLEAAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp);
+ bool SelectScalarSSELoad(SDValue Op, SDValue Pred,
+ SDValue N, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment,
+ SDValue &InChain, SDValue &OutChain);
+ bool TryFoldLoad(SDValue P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ void PreprocessForRMW();
+ void PreprocessForFPConvert();
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+
+ void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+ inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+ CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+ AM.Base.Reg;
+ Scale = getI8Imm(AM.Scale);
+ Index = AM.IndexReg;
+ // These are 32-bit even in 64-bit mode since RIP relative offset
+ // is 32-bit.
+ if (AM.GV)
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp);
+ else if (AM.CP)
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+ AM.Align, AM.Disp);
+ else if (AM.ES)
+ Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32);
+ else if (AM.JT != -1)
+ Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32);
+ else
+ Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
+
+ if (AM.Segment.getNode())
+ Segment = AM.Segment;
+ else
+ Segment = CurDAG->getRegister(0, MVT::i32);
+ }
+
+ /// getI8Imm - Return a target constant with the specified value, of type
+ /// i8.
+ inline SDValue getI8Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i8);
+ }
+
+ /// getI16Imm - Return a target constant with the specified value, of type
+ /// i16.
+ inline SDValue getI16Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i16);
+ }
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ /// getGlobalBaseReg - Return an SDNode that returns the value of
+ /// the global base register. Output instructions required to
+ /// initialize the global base register, if necessary.
+ ///
+ SDNode *getGlobalBaseReg();
+
+#ifndef NDEBUG
+ unsigned Indent;
+#endif
+ };
+}
+
+
+bool X86DAGToDAGISel::IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
+ SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+ if (U == Root)
+ switch (U->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue Op1 = U->getOperand(1);
+
+ // If the other operand is an 8-bit immediate we should fold the immediate
+ // instead. This reduces code size.
+ // e.g.
+ // movl 4(%esp), %eax
+ // addl $4, %eax
+ // vs.
+ // movl $4, %eax
+ // addl 4(%esp), %eax
+ // The former is 2 bytes shorter. In the case where the increment is 1, the
+ // saving can be 4 bytes (by using incl %eax).
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+ if (Imm->getAPIntValue().isSignedIntN(8))
+ return false;
+
+ // If the other operand is a TLS address, we should fold it instead.
+ // This produces
+ // movl %gs:0, %eax
+ // leal i@NTPOFF(%eax), %eax
+ // instead of
+ // movl $i@NTPOFF, %eax
+ // addl %gs:0, %eax
+ // If the block also has an access to a second TLS address, this will save
+ // a load.
+ // FIXME: This is probably also true for non TLS addresses.
+ if (Op1.getOpcode() == X86ISD::Wrapper) {
+ SDValue Val = Op1.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false;
+ }
+ }
+ }
+
+ // Proceed to 'generic' cycle finder code
+ return SelectionDAGISel::IsLegalAndProfitableToFold(N, U, Root);
+}
+
+/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
+/// and move load below the TokenFactor. Replace store's chain operand with
+/// load's chain result.
+static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Store, SDValue TF) {
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i)
+ if (Load.getNode() == TF.getOperand(i).getNode())
+ Ops.push_back(Load.getOperand(0));
+ else
+ Ops.push_back(TF.getOperand(i));
+ CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
+ CurDAG->UpdateNodeOperands(Load, TF, Load.getOperand(1), Load.getOperand(2));
+ CurDAG->UpdateNodeOperands(Store, Load.getValue(1), Store.getOperand(1),
+ Store.getOperand(2), Store.getOperand(3));
+}
+
+/// isRMWLoad - Return true if N is a load that's part of an RMW sub-DAG.
+///
+static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
+ SDValue &Load) {
+ if (N.getOpcode() == ISD::BIT_CONVERT)
+ N = N.getOperand(0);
+
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+ if (!LD || LD->isVolatile())
+ return false;
+ if (LD->getAddressingMode() != ISD::UNINDEXED)
+ return false;
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD)
+ return false;
+
+ if (N.hasOneUse() &&
+ N.getOperand(1) == Address &&
+ N.getNode()->isOperandOf(Chain.getNode())) {
+ Load = N;
+ return true;
+ }
+ return false;
+}
+
+/// MoveBelowCallSeqStart - Replace CALLSEQ_START operand with load's chain
+/// operand and move load below the call's chain operand.
+static void MoveBelowCallSeqStart(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Call, SDValue CallSeqStart) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Chain = CallSeqStart.getOperand(0);
+ if (Chain.getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else {
+ assert(Chain.getOpcode() == ISD::TokenFactor &&
+ "Unexpected CallSeqStart chain operand");
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+ if (Chain.getOperand(i).getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else
+ Ops.push_back(Chain.getOperand(i));
+ SDValue NewChain =
+ CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+ MVT::Other, &Ops[0], Ops.size());
+ Ops.clear();
+ Ops.push_back(NewChain);
+ }
+ for (unsigned i = 1, e = CallSeqStart.getNumOperands(); i != e; ++i)
+ Ops.push_back(CallSeqStart.getOperand(i));
+ CurDAG->UpdateNodeOperands(CallSeqStart, &Ops[0], Ops.size());
+ CurDAG->UpdateNodeOperands(Load, Call.getOperand(0),
+ Load.getOperand(1), Load.getOperand(2));
+ Ops.clear();
+ Ops.push_back(SDValue(Load.getNode(), 1));
+ for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i)
+ Ops.push_back(Call.getOperand(i));
+ CurDAG->UpdateNodeOperands(Call, &Ops[0], Ops.size());
+}
+
+/// isCalleeLoad - Return true if call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain) {
+ if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+ return false;
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+ if (!LD ||
+ LD->isVolatile() ||
+ LD->getAddressingMode() != ISD::UNINDEXED ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+
+ // Now let's find the callseq_start.
+ while (Chain.getOpcode() != ISD::CALLSEQ_START) {
+ if (!Chain.hasOneUse())
+ return false;
+ Chain = Chain.getOperand(0);
+ }
+
+ if (Chain.getOperand(0).getNode() == Callee.getNode())
+ return true;
+ if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+ Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()))
+ return true;
+ return false;
+}
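+
+// The payoff (illustrative): once the callee load sits directly below the
+// call, it can be folded into the call itself, e.g. selecting
+// call *fp(%ebx)
+// instead of a separate "movl fp(%ebx), %eax" followed by "call *%eax".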
+
+
+/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
+/// This is only run if not in -O0 mode.
+/// This allows the instruction selector to pick more read-modify-write
+/// instructions. This is a common case:
+///
+/// [Load chain]
+/// ^
+/// |
+/// [Load]
+/// ^ ^
+/// | |
+/// / \-
+/// / |
+/// [TokenFactor] [Op]
+/// ^ ^
+/// | |
+/// \ /
+/// \ /
+/// [Store]
+///
+/// The fact that the store's chain operand != load's chain will prevent the
+/// (store (op (load))) instruction from being selected. We can transform it to:
+///
+/// [Load chain]
+/// ^
+/// |
+/// [TokenFactor]
+/// ^
+/// |
+/// [Load]
+/// ^ ^
+/// | |
+/// | \-
+/// | |
+/// | [Op]
+/// | ^
+/// | |
+/// \ /
+/// \ /
+/// [Store]
+void X86DAGToDAGISel::PreprocessForRMW() {
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ++I) {
+ if (I->getOpcode() == X86ISD::CALL) {
+ /// Also try moving call address load from outside callseq_start to just
+ /// before the call to allow it to be folded.
+ ///
+ /// [Load chain]
+ /// ^
+ /// |
+ /// [Load]
+ /// ^ ^
+ /// | |
+ /// / \--
+ /// / |
+ ///[CALLSEQ_START] |
+ /// ^ |
+ /// | |
+ /// [LOAD/C2Reg] |
+ /// | |
+ /// \ /
+ /// \ /
+ /// [CALL]
+ SDValue Chain = I->getOperand(0);
+ SDValue Load = I->getOperand(1);
+ if (!isCalleeLoad(Load, Chain))
+ continue;
+ MoveBelowCallSeqStart(CurDAG, Load, SDValue(I, 0), Chain);
+ ++NumLoadMoved;
+ continue;
+ }
+
+ if (!ISD::isNON_TRUNCStore(I))
+ continue;
+ SDValue Chain = I->getOperand(0);
+
+ if (Chain.getNode()->getOpcode() != ISD::TokenFactor)
+ continue;
+
+ SDValue N1 = I->getOperand(1);
+ SDValue N2 = I->getOperand(2);
+ if ((N1.getValueType().isFloatingPoint() &&
+ !N1.getValueType().isVector()) ||
+ !N1.hasOneUse())
+ continue;
+
+ bool RModW = false;
+ SDValue Load;
+ unsigned Opcode = N1.getNode()->getOpcode();
+ switch (Opcode) {
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::VECTOR_SHUFFLE: {
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+ RModW = isRMWLoad(N10, Chain, N2, Load);
+ if (!RModW)
+ RModW = isRMWLoad(N11, Chain, N2, Load);
+ break;
+ }
+ case ISD::SUB:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ case X86ISD::SHLD:
+ case X86ISD::SHRD: {
+ SDValue N10 = N1.getOperand(0);
+ RModW = isRMWLoad(N10, Chain, N2, Load);
+ break;
+ }
+ }
+
+ if (RModW) {
+ MoveBelowTokenFactor(CurDAG, Load, SDValue(I, 0), Chain);
+ ++NumLoadMoved;
+ }
+ }
+}
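+
+// The payoff (illustrative): with the store and load on the same chain, a
+// dag like (store (add (load [mem]), 1), [mem]) can be selected as a single
+// read-modify-write instruction such as "addl $1, mem" instead of separate
+// load, add, and store instructions.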
+
+
+/// PreprocessForFPConvert - Walk over the dag, lowering fpround and fpextend
+/// nodes that target the FP stack into a store and load through a stack slot.
+/// This is a gross hack. We would like to simply mark these as being illegal,
+/// but when we do that, legalize produces these when it expands calls, then
+/// expands these in the same legalize pass. We would like dag combine to be
+/// able to hack on these between the call expansion and the node legalization.
+/// As such, this pass basically does "really late" legalization of these
+/// inline with the X86 isel pass.
+void X86DAGToDAGISel::PreprocessForFPConvert() {
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = I++; // Advance the iterator now to avoid invalidation issues.
+ if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ MVT SrcVT = N->getOperand(0).getValueType();
+ MVT DstVT = N->getValueType(0);
+ bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
+
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT;
+ if (N->getOpcode() == ISD::FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT; we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ DebugLoc dl = N->getDebugLoc();
+
+ // FIXME: optimize the case where the src/dest is a load or store?
+ SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
+ N->getOperand(0),
+ MemTmp, NULL, 0, MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ NULL, 0, MemVT);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+ // extload we created. This will cause general havoc on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+ // Now that we did that, the node is dead. Increment the iterator to the
+ // next node to process, then delete N.
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+}
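+
+// For example (illustrative): an f64-to-f32 FP_ROUND whose input lives on
+// the FP stack but whose result is wanted in an SSE register becomes a
+// truncating f32 store to a stack temporary followed by an f32 load, i.e.
+// roughly an fstps/movss pair through memory.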
+
+/// InstructionSelect - This callback is invoked by SelectionDAGISel
+/// when it has created a SelectionDAG for us to codegen.
+void X86DAGToDAGISel::InstructionSelect() {
+ CurBB = BB; // BB can change as result of isel.
+ const Function *F = CurDAG->getMachineFunction().getFunction();
+ OptForSize = F->hasFnAttr(Attribute::OptimizeForSize);
+
+ DEBUG(BB->dump());
+ if (OptLevel != CodeGenOpt::None)
+ PreprocessForRMW();
+
+ // FIXME: This should only happen when not compiled with -O0.
+ PreprocessForFPConvert();
+
+ // Codegen the basic block.
+#ifndef NDEBUG
+ DOUT << "===== Instruction selection begins:\n";
+ Indent = 0;
+#endif
+ SelectRoot(*CurDAG);
+#ifndef NDEBUG
+ DOUT << "===== Instruction selection ends:\n";
+#endif
+
+ CurDAG->RemoveDeadNodes();
+}
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+ MachineFrameInfo *MFI) {
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ if (Subtarget->isTargetCygMing())
+ BuildMI(BB, DebugLoc::getUnknownLoc(),
+ TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
+ // If this is main, emit special code for main.
+ MachineBasicBlock *BB = MF.begin();
+ if (Fn.hasExternalLinkage() && Fn.getName() == "main")
+ EmitSpecialCodeForMain(BB, MF.getFrameInfo());
+}
+
+
+bool X86DAGToDAGISel::MatchSegmentBaseAddress(SDValue N,
+ X86ISelAddressMode &AM) {
+ assert(N.getOpcode() == X86ISD::SegmentBaseAddress);
+ SDValue Segment = N.getOperand(0);
+
+ if (AM.Segment.getNode() == 0) {
+ AM.Segment = Segment;
+ return false;
+ }
+
+ return true;
+}
+
+bool X86DAGToDAGISel::MatchLoad(SDValue N, X86ISelAddressMode &AM) {
+ // This optimization is valid because the GNU TLS model defines that
+ // gs:0 (or fs:0 on X86-64) contains its own address.
+ // For more information see http://people.redhat.com/drepper/tls.pdf
+
+ SDValue Address = N.getOperand(1);
+ if (Address.getOpcode() == X86ISD::SegmentBaseAddress &&
+ !MatchSegmentBaseAddress(Address, AM))
+ return false;
+
+ return true;
+}
+
+bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
+ bool is64Bit = Subtarget->is64Bit();
+ DOUT << "Wrapper: 64bit " << is64Bit;
+ DOUT << " AM "; DEBUG(AM.dump()); DOUT << "\n";
+
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits.
+ if (is64Bit && (TM.getCodeModel() != CodeModel::Small))
+ return true;
+
+ // Base and index reg must be 0 in order to use rip as base.
+ bool canUsePICRel = !AM.Base.Reg.getNode() && !AM.IndexReg.getNode();
+ if (is64Bit && !canUsePICRel && TM.symbolicAddressesAreRIPRel())
+ return true;
+
+ if (AM.hasSymbolicDisplacement())
+ return true;
+ // If the value is available in a register but both the base and index
+ // components have already been picked, we can't fit it in the addressing
+ // mode. Duplicate the GlobalAddress or ConstantPool as the displacement.
+
+ SDValue N0 = N.getOperand(0);
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ uint64_t Offset = G->getOffset();
+ if (!is64Bit || isInt32(AM.Disp + Offset)) {
+ GlobalValue *GV = G->getGlobal();
+ bool isRIPRel = TM.symbolicAddressesAreRIPRel();
+ if (N0.getOpcode() == llvm::ISD::TargetGlobalTLSAddress) {
+ TLSModel::Model model =
+ getTLSModel(GV, TM.getRelocationModel());
+ if (is64Bit && model == TLSModel::InitialExec)
+ isRIPRel = true;
+ }
+ AM.GV = GV;
+ AM.Disp += Offset;
+ AM.isRIPRel = isRIPRel;
+ return false;
+ }
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ uint64_t Offset = CP->getOffset();
+ if (!is64Bit || isInt32(AM.Disp + Offset)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.Disp += Offset;
+ AM.isRIPRel = TM.symbolicAddressesAreRIPRel();
+ return false;
+ }
+ } else if (ExternalSymbolSDNode *S =dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.isRIPRel = TM.symbolicAddressesAreRIPRel();
+ return false;
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.isRIPRel = TM.symbolicAddressesAreRIPRel();
+ return false;
+ }
+
+ return true;
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done. This just pattern matches for the
+/// addressing mode.
+bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ bool is64Bit = Subtarget->is64Bit();
+ DebugLoc dl = N.getDebugLoc();
+ DOUT << "MatchAddress: "; DEBUG(AM.dump());
+ // Limit recursion.
+ if (Depth > 5)
+ return MatchAddressBase(N, AM);
+
+ // RIP relative addressing: %rip + 32-bit displacement!
+ if (AM.isRIPRel) {
+ if (!AM.ES && AM.JT != -1 && N.getOpcode() == ISD::Constant) {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!is64Bit || isInt32(AM.Disp + Val)) {
+ AM.Disp += Val;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!is64Bit || isInt32(AM.Disp + Val)) {
+ AM.Disp += Val;
+ return false;
+ }
+ break;
+ }
+
+ case X86ISD::SegmentBaseAddress:
+ if (!MatchSegmentBaseAddress(N, AM))
+ return false;
+ break;
+
+ case X86ISD::Wrapper:
+ if (!MatchWrapper(N, AM))
+ return false;
+ break;
+
+ case ISD::LOAD:
+ if (!MatchLoad(N, AM))
+ return false;
+ break;
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == X86ISelAddressMode::RegBase
+ && AM.Base.Reg.getNode() == 0) {
+ AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SHL:
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1 || AM.isRIPRel)
+ break;
+
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ unsigned Val = CN->getZExtValue();
+ if (Val == 1 || Val == 2 || Val == 3) {
+ AM.Scale = 1 << Val;
+ SDValue ShVal = N.getNode()->getOperand(0);
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (ShVal.getNode()->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
+ isa<ConstantSDNode>(ShVal.getNode()->getOperand(1))) {
+ AM.IndexReg = ShVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ uint64_t Disp = AM.Disp + (AddVal->getSExtValue() << Val);
+ if (!is64Bit || isInt32(Disp))
+ AM.Disp = Disp;
+ else
+ AM.IndexReg = ShVal;
+ } else {
+ AM.IndexReg = ShVal;
+ }
+ return false;
+ }
+ break;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI:
+ // A mul_lohi where we need the low part can be folded as a plain multiply.
+ if (N.getResNo() != 0) break;
+ // FALL THROUGH
+ case ISD::MUL:
+ case X86ISD::MUL_IMM:
+ // X*[3,5,9] -> X+X*[2,4,8]
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base.Reg.getNode() == 0 &&
+ AM.IndexReg.getNode() == 0 &&
+ !AM.isRIPRel) {
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+ CN->getZExtValue() == 9) {
+ AM.Scale = unsigned(CN->getZExtValue())-1;
+
+ SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue Reg;
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+ isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+ Reg = MulVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ uint64_t Disp = AM.Disp + AddVal->getSExtValue() *
+ CN->getZExtValue();
+ if (!is64Bit || isInt32(Disp))
+ AM.Disp = Disp;
+ else
+ Reg = N.getNode()->getOperand(0);
+ } else {
+ Reg = N.getNode()->getOperand(0);
+ }
+
+ AM.IndexReg = AM.Base.Reg = Reg;
+ return false;
+ }
+ }
+ break;
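+ // For example (illustrative): "x*9" matches with Base = x, Index = x and
+ // Scale = 8, so it can be emitted as "leal (%eax,%eax,8), %ecx".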
+
+ case ISD::SUB: {
+ // Given A-B, if A can be completely folded into the address (leaving the
+ // index field unused), use -B as the index. This is a win if A has
+ // multiple parts that can be folded into the address. Also, this saves a
+ // mov if the base register has other uses, since it avoids a two-address
+ // sub instruction; however, it costs an additional mov if the index
+ // register has other uses.
+
+ // Test if the LHS of the sub can be folded.
+ X86ISelAddressMode Backup = AM;
+ if (MatchAddress(N.getNode()->getOperand(0), AM, Depth+1)) {
+ AM = Backup;
+ break;
+ }
+ // Test if the index field is free for use.
+ if (AM.IndexReg.getNode() || AM.isRIPRel) {
+ AM = Backup;
+ break;
+ }
+ int Cost = 0;
+ SDValue RHS = N.getNode()->getOperand(1);
+ // If the RHS involves a register with multiple uses, this
+ // transformation incurs an extra mov, due to the neg instruction
+ // clobbering its operand.
+ if (!RHS.getNode()->hasOneUse() ||
+ RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+ RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+ RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+ (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ ++Cost;
+ // If the base is a register with multiple uses, this
+ // transformation may save a mov.
+ if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base.Reg.getNode() &&
+ !AM.Base.Reg.getNode()->hasOneUse()) ||
+ AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ --Cost;
+ // If the folded LHS was interesting, this transformation saves
+ // address arithmetic.
+ if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+ ((AM.Disp != 0) && (Backup.Disp == 0)) +
+ (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+ --Cost;
+ // If it doesn't look like it may be an overall win, don't do it.
+ if (Cost >= 0) {
+ AM = Backup;
+ break;
+ }
+
+ // Ok, the transformation is legal and appears profitable. Go for it.
+ SDValue Zero = CurDAG->getConstant(0, N.getValueType());
+ SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+ AM.IndexReg = Neg;
+ AM.Scale = 1;
+
+ // Insert the new nodes into the topological ordering.
+ if (Zero.getNode()->getNodeId() == -1 ||
+ Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Zero.getNode());
+ Zero.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (Neg.getNode()->getNodeId() == -1 ||
+ Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Neg.getNode());
+ Neg.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ return false;
+ }
+
+ case ISD::ADD: {
+ X86ISelAddressMode Backup = AM;
+ if (!MatchAddress(N.getNode()->getOperand(0), AM, Depth+1) &&
+ !MatchAddress(N.getNode()->getOperand(1), AM, Depth+1))
+ return false;
+ AM = Backup;
+ if (!MatchAddress(N.getNode()->getOperand(1), AM, Depth+1) &&
+ !MatchAddress(N.getNode()->getOperand(0), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ !AM.Base.Reg.getNode() &&
+ !AM.IndexReg.getNode() &&
+ !AM.isRIPRel) {
+ AM.Base.Reg = N.getNode()->getOperand(0);
+ AM.IndexReg = N.getNode()->getOperand(1);
+ AM.Scale = 1;
+ return false;
+ }
+ break;
+ }
+
+ case ISD::OR:
+ // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ X86ISelAddressMode Backup = AM;
+ uint64_t Offset = CN->getSExtValue();
+ // Start with the LHS as an addr mode.
+ if (!MatchAddress(N.getOperand(0), AM, Depth+1) &&
+ // The address must not already have picked a GV for the displacement.
+ AM.GV == NULL &&
+ // On x86-64, the resultant disp must fit in 32-bits.
+ (!is64Bit || isInt32(AM.Disp + Offset)) &&
+ // Check to see if the LHS & C is zero.
+ CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+ AM.Disp += Offset;
+ return false;
+ }
+ AM = Backup;
+ }
+ break;
+
+ case ISD::AND: {
+ // Perform some heroic transforms on an and of a constant-count shift
+ // with a constant to enable use of the scaled offset field.
+
+ SDValue Shift = N.getOperand(0);
+ if (Shift.getNumOperands() != 2) break;
+
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+
+ // Not when RIP is used as the base.
+ if (AM.isRIPRel) break;
+
+ SDValue X = Shift.getOperand(0);
+ ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ if (!C1 || !C2) break;
+
+ // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This
+ // allows us to convert the shift and and into an h-register extract and
+ // a scaled index.
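+ // For example (illustrative): with C1 = 3, "(x >> 5) & (0xff << 3)" is
+ // rewritten as "((x >> 8) & 0xff) << 3", so the AND becomes an %ah-style
+ // h-register extract and the shift folds into a scale of 8.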
+ if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) {
+ unsigned ScaleLog = 8 - C1->getZExtValue();
+ if (ScaleLog > 0 && ScaleLog < 4 &&
+ C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) {
+ SDValue Eight = CurDAG->getConstant(8, MVT::i8);
+ SDValue Mask = CurDAG->getConstant(0xff, N.getValueType());
+ SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
+ X, Eight);
+ SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(),
+ Srl, Mask);
+ SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8);
+ SDValue Shl = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
+ And, ShlCount);
+
+ // Insert the new nodes into the topological ordering.
+ if (Eight.getNode()->getNodeId() == -1 ||
+ Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), Eight.getNode());
+ Eight.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (Mask.getNode()->getNodeId() == -1 ||
+ Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), Mask.getNode());
+ Mask.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (Srl.getNode()->getNodeId() == -1 ||
+ Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(Shift.getNode(), Srl.getNode());
+ Srl.getNode()->setNodeId(Shift.getNode()->getNodeId());
+ }
+ if (And.getNode()->getNodeId() == -1 ||
+ And.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), And.getNode());
+ And.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (ShlCount.getNode()->getNodeId() == -1 ||
+ ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), ShlCount.getNode());
+ ShlCount.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (Shl.getNode()->getNodeId() == -1 ||
+ Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Shl.getNode());
+ Shl.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ CurDAG->ReplaceAllUsesWith(N, Shl);
+ AM.IndexReg = And;
+ AM.Scale = (1 << ScaleLog);
+ return false;
+ }
+ }
+
+ // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
+ // allows us to fold the shift into this addressing mode.
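+ // For example (illustrative): "(x << 2) & 60" becomes "(x & 15) << 2",
+ // leaving a cheaper AND while the shift-by-2 folds into a scale of 4.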
+ if (Shift.getOpcode() != ISD::SHL) break;
+
+ // Not likely to be profitable if either the AND or SHIFT node has more
+ // than one use (unless all uses are for address computation). Besides, the
+ // isel mechanism requires their node ids to be reused.
+ if (!N.hasOneUse() || !Shift.hasOneUse())
+ break;
+
+ // Verify that the shift amount is something we can fold.
+ unsigned ShiftCst = C1->getZExtValue();
+ if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3)
+ break;
+
+ // Get the new AND mask; this folds to a constant.
+ SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
+ SDValue(C2, 0), SDValue(C1, 0));
+ SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X,
+ NewANDMask);
+ SDValue NewSHIFT = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
+ NewAND, SDValue(C1, 0));
+
+ // Insert the new nodes into the topological ordering.
+ if (C1->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), C1);
+ C1->setNodeId(X.getNode()->getNodeId());
+ }
+ if (NewANDMask.getNode()->getNodeId() == -1 ||
+ NewANDMask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), NewANDMask.getNode());
+ NewANDMask.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (NewAND.getNode()->getNodeId() == -1 ||
+ NewAND.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(Shift.getNode(), NewAND.getNode());
+ NewAND.getNode()->setNodeId(Shift.getNode()->getNodeId());
+ }
+ if (NewSHIFT.getNode()->getNodeId() == -1 ||
+ NewSHIFT.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), NewSHIFT.getNode());
+ NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+
+ CurDAG->ReplaceAllUsesWith(N, NewSHIFT);
+
+ AM.Scale = 1 << ShiftCst;
+ AM.IndexReg = NewAND;
+ return false;
+ }
+ }
+
+ return MatchAddressBase(N, AM);
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+ // Is the base register already occupied?
+ if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.getNode()) {
+ // If so, check to see if the scale index register is set.
+ if (AM.IndexReg.getNode() == 0 && !AM.isRIPRel) {
+ AM.IndexReg = N;
+ AM.Scale = 1;
+ return false;
+ }
+
+ // Otherwise, we cannot select it.
+ return true;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = X86ISelAddressMode::RegBase;
+ AM.Base.Reg = N;
+ return false;
+}
+
+/// SelectAddr - Returns true if it is able to pattern match an addressing
+/// mode. It returns by reference the operands which make up the maximal
+/// addressing mode it can match.
+bool X86DAGToDAGISel::SelectAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ X86ISelAddressMode AM;
+ bool Done = false;
+ if (AvoidDupAddrCompute && !N.hasOneUse()) {
+ unsigned Opcode = N.getOpcode();
+ if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex &&
+ Opcode != X86ISD::Wrapper) {
+ // If we are able to fold N into the addressing mode, then we'll allow it
+ // even if N has multiple uses. In general, an addressing computation is
+ // used as an address by all of its uses. But watch out for CopyToReg uses:
+ // they mean the address computation is live out, and it will be computed
+ // by a LEA, so we want to avoid computing the address twice.
+ for (SDNode::use_iterator UI = N.getNode()->use_begin(),
+ UE = N.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::CopyToReg) {
+ MatchAddressBase(N, AM);
+ Done = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!Done && MatchAddress(N, AM))
+ return false;
+
+ MVT VT = N.getValueType();
+ if (AM.BaseType == X86ISelAddressMode::RegBase) {
+ if (!AM.Base.Reg.getNode())
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ }
+
+ if (!AM.IndexReg.getNode())
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to
+/// match a load whose top elements are either undef or zeros. The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
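+/// For example, (v4f32 (scalar_to_vector (load addr))) is matched here, as is
+/// the explicitly zero-extending form built from X86ISD::VZEXT_MOVL, so the
+/// load can be folded into the instruction that uses the vector.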
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDValue Op, SDValue Pred,
+ SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment,
+ SDValue &InChain,
+ SDValue &OutChain) {
+ if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ InChain = N.getOperand(0).getValue(1);
+ if (ISD::isNON_EXTLoad(InChain.getNode()) &&
+ InChain.getValue(0).hasOneUse() &&
+ N.hasOneUse() &&
+ IsLegalAndProfitableToFold(N.getNode(), Pred.getNode(), Op.getNode())) {
+ LoadSDNode *LD = cast<LoadSDNode>(InChain);
+ if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ OutChain = LD->getChain();
+ return true;
+ }
+ }
+
+ // Also handle the case where we explicitly require zeros in the top
+ // elements. This is a vector shuffle from the zero vector.
+ if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+ // Check to see if the top elements are all zeros (or bitcast of zeros).
+ N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N.getOperand(0).getNode()->hasOneUse() &&
+ ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+ N.getOperand(0).getOperand(0).hasOneUse()) {
+ // Okay, this is a zero extending load. Fold it.
+ LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+ if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ OutChain = LD->getChain();
+ InChain = SDValue(LD, 1);
+ return true;
+ }
+ return false;
+}
+
+
+/// SelectLEAAddr - It calls SelectAddr and determines if the maximal
+/// addressing mode it matches can be cost-effectively emitted as an LEA
+/// instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp) {
+ X86ISelAddressMode AM;
+
+ // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+ // segments.
+ SDValue Copy = AM.Segment;
+ SDValue T = CurDAG->getRegister(0, MVT::i32);
+ AM.Segment = T;
+ if (MatchAddress(N, AM))
+ return false;
+ assert (T == AM.Segment);
+ AM.Segment = Copy;
+
+ MVT VT = N.getValueType();
+ unsigned Complexity = 0;
+ if (AM.BaseType == X86ISelAddressMode::RegBase)
+ if (AM.Base.Reg.getNode())
+ Complexity = 1;
+ else
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Complexity = 4;
+
+ if (AM.IndexReg.getNode())
+ Complexity++;
+ else
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg or a
+ // simple shift.
+ if (AM.Scale > 1)
+ Complexity++;
+
+ // FIXME: We are artificially lowering the criteria for turning ADD %reg, $GA
+ // into a LEA. This is determined with some experimentation but is by no
+ // means optimal (especially for code size considerations). LEA is nice
+ // because of its three-address nature. Tweak the cost function again when
+ // we can run convertToThreeAddress() at register allocation time.
+ if (AM.hasSymbolicDisplacement()) {
+ // For X86-64, we should always use lea to materialize RIP relative
+ // addresses.
+ if (Subtarget->is64Bit())
+ Complexity = 4;
+ else
+ Complexity += 2;
+ }
+
+ if (AM.Disp && (AM.Base.Reg.getNode() || AM.IndexReg.getNode()))
+ Complexity++;
+
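+ // For example, lea (%ebx,%ecx,4) scores base (1) + index (1) + scale > 1
+ // (1) = 3 and is accepted, while lea (,%ecx,2) scores only index (1) +
+ // scale > 1 (1) = 2 and is rejected in favor of an add or a shift.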
+ if (Complexity > 2) {
+ SDValue Segment;
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+ }
+ return false;
+}
+
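+/// TryFoldLoad - Return true if N is a non-extending load with a single use
+/// that is legal and profitable to fold into P; on success the address
+/// operands of the load are filled in by SelectAddr.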
+bool X86DAGToDAGISel::TryFoldLoad(SDValue P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ if (ISD::isNON_EXTLoad(N.getNode()) &&
+ N.hasOneUse() &&
+ IsLegalAndProfitableToFold(N.getNode(), P.getNode(), P.getNode()))
+ return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp, Segment);
+ return false;
+}
+
+/// getGlobalBaseReg - Return an SDNode that returns the value of
+/// the global base register. Output instructions required to
+/// initialize the global base register, if necessary.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+ MachineFunction *MF = CurBB->getParent();
+ unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF);
+ return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+static SDNode *FindCallStartFromCall(SDNode *Node) {
+ if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+ assert(Node->getOperand(0).getValueType() == MVT::Other &&
+ "Node doesn't have a token chain argument!");
+ return FindCallStartFromCall(Node->getOperand(0).getNode());
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+ SDValue Chain = Node->getOperand(0);
+ SDValue In1 = Node->getOperand(1);
+ SDValue In2L = Node->getOperand(2);
+ SDValue In2H = Node->getOperand(3);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (!SelectAddr(In1, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+ return NULL;
+ SDValue LSI = Node->getOperand(4); // MemOperand
+ const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, LSI, Chain};
+ return CurDAG->getTargetNode(Opc, Node->getDebugLoc(),
+ MVT::i32, MVT::i32, MVT::Other, Ops,
+ array_lengthof(Ops));
+}
+
+SDNode *X86DAGToDAGISel::Select(SDValue N) {
+ SDNode *Node = N.getNode();
+ MVT NVT = Node->getValueType(0);
+ unsigned Opc, MOpc;
+ unsigned Opcode = Node->getOpcode();
+ DebugLoc dl = Node->getDebugLoc();
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent, ' ') << "Selecting: ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent += 2;
+#endif
+
+ if (Node->isMachineOpcode()) {
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "== ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+ return NULL; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case X86ISD::GlobalBaseReg:
+ return getGlobalBaseReg();
+
+ case X86ISD::ATOMOR64_DAG:
+ return SelectAtomic64(Node, X86::ATOMOR6432);
+ case X86ISD::ATOMXOR64_DAG:
+ return SelectAtomic64(Node, X86::ATOMXOR6432);
+ case X86ISD::ATOMADD64_DAG:
+ return SelectAtomic64(Node, X86::ATOMADD6432);
+ case X86ISD::ATOMSUB64_DAG:
+ return SelectAtomic64(Node, X86::ATOMSUB6432);
+ case X86ISD::ATOMNAND64_DAG:
+ return SelectAtomic64(Node, X86::ATOMNAND6432);
+ case X86ISD::ATOMAND64_DAG:
+ return SelectAtomic64(Node, X86::ATOMAND6432);
+ case X86ISD::ATOMSWAP64_DAG:
+ return SelectAtomic64(Node, X86::ATOMSWAP6432);
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SMUL_LOHI;
+ if (!isSigned)
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
+ case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+ case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+ case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+ }
+ else
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
+ case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+ case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+ case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+ }
+
+ unsigned LoReg, HiReg;
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break;
+ case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break;
+ case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
+ case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+ }
+
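+ // For example, a 32-bit unsigned multiply selects MUL32r, which computes
+ // EDX:EAX = EAX * op; the low half is then copied out of EAX and the high
+ // half out of EDX below.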
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ // Multiplication is commutative.
+ if (!foldedLoad) {
+ foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (foldedLoad)
+ std::swap(N0, N1);
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getTargetNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ }
+
+ // Copy the low half of the result, if it is needed.
+ if (!N.getValue(0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(N.getValue(0), Result);
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.getNode()->dump(CurDAG));
+ DOUT << "\n";
+#endif
+ }
+ // Copy the high half of the result, if it is needed.
+ if (!N.getValue(1).use_empty()) {
+ SDValue Result;
+ if (HiReg == X86::AH && Subtarget->is64Bit()) {
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
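+ // (An instruction carrying a REX prefix cannot encode AH, BH, CH, or DH,
+ // so the value is read through AX and shifted instead.)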
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ Result = SDValue(CurDAG->getTargetNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)), 0);
+ // Then truncate it down to i8.
+ SDValue SRIdx = CurDAG->getTargetConstant(X86::SUBREG_8BIT, MVT::i32);
+ Result = SDValue(CurDAG->getTargetNode(X86::EXTRACT_SUBREG, dl,
+ MVT::i8, Result, SRIdx), 0);
+ } else {
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ }
+ ReplaceUses(N.getValue(1), Result);
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.getNode()->dump(CurDAG));
+ DOUT << "\n";
+#endif
+ }
+
+#ifndef NDEBUG
+ Indent -= 2;
+#endif
+
+ return NULL;
+ }
+
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SDIVREM;
+ if (!isSigned)
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ }
+ else
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+
+ unsigned LoReg, HiReg;
+ unsigned ClrOpcode, SExtOpcode;
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; HiReg = X86::AH;
+ ClrOpcode = 0;
+ SExtOpcode = X86::CBW;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrOpcode = X86::MOV16r0;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; HiReg = X86::EDX;
+ ClrOpcode = X86::MOV32r0;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; HiReg = X86::RDX;
+ ClrOpcode = X86::MOV64r0;
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+ SDValue InFlag;
+ if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ // Special case for div8: just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
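+ // For example, for an unsigned i8 divide, DIV8r divides AX by the operand,
+ // leaving the quotient in AL and the remainder in AH, so zero-extending the
+ // dividend into AX gives the correct result.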
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+ if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ Move =
+ SDValue(CurDAG->getTargetNode(X86::MOVZX16rm8, dl, MVT::i16,
+ MVT::Other, Ops,
+ array_lengthof(Ops)), 0);
+ Chain = Move.getValue(1);
+ ReplaceUses(N0.getValue(1), Chain);
+ } else {
+ Move =
+ SDValue(CurDAG->getTargetNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
+ InFlag = Chain.getValue(1);
+ } else {
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ LoReg, N0, SDValue()).getValue(1);
+ if (isSigned && !signBitIsZero) {
+ // Sign extend the low part into the high part.
+ InFlag =
+ SDValue(CurDAG->getTargetNode(SExtOpcode, dl, MVT::Flag, InFlag),0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDValue ClrNode = SDValue(CurDAG->getTargetNode(ClrOpcode, dl, NVT),
+ 0);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, HiReg,
+ ClrNode, InFlag).getValue(1);
+ }
+ }
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getTargetNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ }
+
+ // Copy the division (low) result, if it is needed.
+ if (!N.getValue(0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(N.getValue(0), Result);
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.getNode()->dump(CurDAG));
+ DOUT << "\n";
+#endif
+ }
+ // Copy the remainder (high) result, if it is needed.
+ if (!N.getValue(1).use_empty()) {
+ SDValue Result;
+ if (HiReg == X86::AH && Subtarget->is64Bit()) {
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ Result = SDValue(CurDAG->getTargetNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)),
+ 0);
+ // Then truncate it down to i8.
+ SDValue SRIdx = CurDAG->getTargetConstant(X86::SUBREG_8BIT, MVT::i32);
+ Result = SDValue(CurDAG->getTargetNode(X86::EXTRACT_SUBREG, dl,
+ MVT::i8, Result, SRIdx), 0);
+ } else {
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ }
+ ReplaceUses(N.getValue(1), Result);
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.getNode()->dump(CurDAG));
+ DOUT << "\n";
+#endif
+ }
+
+#ifndef NDEBUG
+ Indent -= 2;
+#endif
+
+ return NULL;
+ }
+
+ case ISD::DECLARE: {
+ // Handle DECLARE nodes here because the second operand may have been
+ // wrapped in X86ISD::Wrapper.
+ SDValue Chain = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue N2 = Node->getOperand(2);
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1);
+
+ // FIXME: We need to handle this for VLAs.
+ if (!FINode) {
+ ReplaceUses(N.getValue(0), Chain);
+ return NULL;
+ }
+
+ if (N2.getOpcode() == ISD::ADD &&
+ N2.getOperand(0).getOpcode() == X86ISD::GlobalBaseReg)
+ N2 = N2.getOperand(1);
+
+ // If N2 is not Wrapper(descriptor) then the llvm.declare is mangled
+ // somehow; just ignore it.
+ if (N2.getOpcode() != X86ISD::Wrapper) {
+ ReplaceUses(N.getValue(0), Chain);
+ return NULL;
+ }
+ GlobalAddressSDNode *GVNode =
+ dyn_cast<GlobalAddressSDNode>(N2.getOperand(0));
+ if (GVNode == 0) {
+ ReplaceUses(N.getValue(0), Chain);
+ return NULL;
+ }
+ SDValue Tmp1 = CurDAG->getTargetFrameIndex(FINode->getIndex(),
+ TLI.getPointerTy());
+ SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GVNode->getGlobal(),
+ TLI.getPointerTy());
+ SDValue Ops[] = { Tmp1, Tmp2, Chain };
+ return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
+ MVT::Other, Ops,
+ array_lengthof(Ops));
+ }
+ }
+
+ SDNode *ResNode = SelectCode(N);
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ if (ResNode == NULL || ResNode == N.getNode())
+ DEBUG(N.getNode()->dump(CurDAG));
+ else
+ DEBUG(ResNode->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+
+ return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1, Op2, Op3, Op4;
+ switch (ConstraintCode) {
+ case 'o': // offsettable ??
+ case 'v': // not offsettable ??
+ default: return true;
+ case 'm': // memory
+ if (!SelectAddr(Op, Op, Op0, Op1, Op2, Op3, Op4))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(Op2);
+ OutOps.push_back(Op3);
+ OutOps.push_back(Op4);
+ return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into an
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+ llvm::CodeGenOpt::Level OptLevel) {
+ return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..882ee3a
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,8794 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<bool>
+DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
+
+// Forward declarations.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+ SDValue V2);
+
+X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
+ : TargetLowering(TM) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
+ RegInfo = TM.getRegisterInfo();
+ TD = getTargetData();
+
+ // Set up the TargetLowering object.
+
+ // X86 is weird; it always uses i8 for shift amounts and setcc results.
+ setShiftAmountType(MVT::i8);
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setSchedulingPreference(SchedulingForRegPressure);
+ setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0
+ setStackPointerRegisterToSaveRestore(X86StackPtr);
+
+ if (Subtarget->isTargetDarwin()) {
+ // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(false);
+ setUseUnderscoreLongJmp(false);
+ } else if (Subtarget->isTargetMingw()) {
+ // The MS runtime is weird: it exports _setjmp, but plain longjmp!
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(false);
+ } else {
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+ }
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, X86::GR8RegisterClass);
+ addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+ addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+ if (Subtarget->is64Bit())
+ addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+ // We don't accept any truncstore of integer registers.
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ // SETOEQ and SETUNE require checking two conditions.
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
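+ // (For example, a ui16 operand is zero-extended to i32 and converted with
+ // the signed i32 conversion, which is exact because the extended value is
+ // always non-negative.)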
+ setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
+ } else if (!UseSoftFloat) {
+ if (X86ScalarSSEf64) {
+ // We have an impenetrably clever algorithm for ui64->double only.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ }
+ // We have an algorithm for SSE2, and we turn this into a 64-bit
+ // FILD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ }
+
+ // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
+
+ if (!UseSoftFloat && !NoImplicitFloat) {
+ // SSE has no i16 to fp conversion, only i32
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ }
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
+ }
+
+ // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
+ // are Legal; f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
+
+ // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
+
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ }
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
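+ // (For example, an f32-to-i16 FP_TO_UINT is performed as an f32-to-i32
+ // FP_TO_SINT whose result is truncated; the wider signed range covers
+ // every unsigned i16 value.)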
+ setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ } else if (!UseSoftFloat) {
+ if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
+ // Expand FP_TO_UINT into a select.
+ // FIXME: We would like to use a Custom expander here eventually to do
+ // the optimal thing for SSE vs. the default expansion in the legalizer.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
+ else
+ // With SSE3 we can use fisttpll to convert to a signed i64; without
+ // SSE, we're stuck with a fistpll.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+ }
+
+ // TODO: when we have SSE, these could be more efficient by using movd/movq.
+ if (!X86ScalarSSEf64) {
+ setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
+ setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
+ }
+
+ // Scalar integer divide and remainder are lowered to use operations that
+ // produce two results, to match the available instructions. This exposes
+ // the two-result form to trivial CSE, which is able to combine x/y and x%y
+ // into a single instruction.
+ //
+ // Scalar integer multiply-high is also lowered to use two-result
+ // operations, to match the available instructions. However, plain multiply
+ // (low) operations are left as Legal, as there are single-result
+ // instructions for this in x86. Using the two-result multiply instructions
+ // when both high and low results are needed must be arranged by dagcombine.
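+ // For example, x/y and x%y over the same operands each lower to an
+ // ISD::SDIVREM node; CSE merges the two nodes, and a single divide
+ // instruction then produces both results.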
+ setOperationAction(ISD::MULHS , MVT::i8 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i8 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i8 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i8 , Expand);
+ setOperationAction(ISD::SREM , MVT::i8 , Expand);
+ setOperationAction(ISD::UREM , MVT::i8 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i16 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i16 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i16 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i16 , Expand);
+ setOperationAction(ISD::SREM , MVT::i16 , Expand);
+ setOperationAction(ISD::UREM , MVT::i16 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i32 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i32 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i32 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i32 , Expand);
+ setOperationAction(ISD::SREM , MVT::i32 , Expand);
+ setOperationAction(ISD::UREM , MVT::i32 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i64 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i64 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i64 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i64 , Expand);
+ setOperationAction(ISD::SREM , MVT::i64 , Expand);
+ setOperationAction(ISD::UREM , MVT::i64 , Expand);
+
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND , MVT::Other, Custom);
+ setOperationAction(ISD::BR_CC , MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+ setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+
+ setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ }
+
+ setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ // These should be promoted to a larger select which is supported.
+ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
+ setOperationAction(ISD::SELECT , MVT::i8 , Promote);
+ // X86 wants to expand cmov itself.
+ setOperationAction(ISD::SELECT , MVT::i16 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f64 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f80 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i8 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i16 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f80 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SELECT , MVT::i64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i64 , Custom);
+ }
+ // The X86 ret instruction may pop the stack.
+ setOperationAction(ISD::RET , MVT::Other, Custom);
+ setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+
+ // Darwin ABI issue.
+ setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
+ }
+ // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+ setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasSSE1())
+ setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
+
+ if (!Subtarget->hasSSE2())
+ setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);
+
+ // Expand certain atomics
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+
+ if (!Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ }
+
+ // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ // FIXME - use subtarget debug flags
+ if (!Subtarget->isTargetDarwin() &&
+ !Subtarget->isTargetELF() &&
+ !Subtarget->isTargetCygMing()) {
+ setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
+ if (Subtarget->is64Bit()) {
+ setExceptionPointerRegister(X86::RAX);
+ setExceptionSelectorRegister(X86::RDX);
+ } else {
+ setExceptionPointerRegister(X86::EAX);
+ setExceptionSelectorRegister(X86::EDX);
+ }
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+ setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::VAARG , MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY , MVT::Other, Custom);
+ } else {
+ setOperationAction(ISD::VAARG , MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+ if (Subtarget->isTargetCygMing())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+ if (!UseSoftFloat && X86ScalarSSEf64) {
+ // f32 and f64 use SSE.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+ addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f64, Custom);
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f64, Custom);
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+ // Expand FP immediates into loads from the stack, except for the special
+ // cases we handle.
+ addLegalFPImmediate(APFloat(+0.0)); // xorpd
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ } else if (!UseSoftFloat && X86ScalarSSEf32) {
+ // Use SSE for f32, x87 for f64.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+ addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+
+ // Use ANDPS to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+
+ // Use ANDPS and ORPS to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+ // Special cases we handle for FP constants.
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ }
+ } else if (!UseSoftFloat) {
+ // f32 and f64 in x87.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+ addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+ setOperationAction(ISD::UNDEF, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ }
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+ addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+ }
+
+ // Long double always uses x87.
+ if (!UseSoftFloat) {
+ addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
+ setOperationAction(ISD::UNDEF, MVT::f80, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+ {
+ bool ignored;
+ APFloat TmpFlt(+0.0);
+ TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt); // FLD0
+ TmpFlt.changeSign();
+ addLegalFPImmediate(TmpFlt); // FLD0/FCHS
+ APFloat TmpFlt2(+1.0);
+ TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt2); // FLD1
+ TmpFlt2.changeSign();
+ addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
+ }
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f80 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f80 , Expand);
+ }
+ }
+
+ // Always use a library call for pow.
+ setOperationAction(ISD::FPOW , MVT::f32 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f64 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+
+ setOperationAction(ISD::FLOG, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+
+ // First set operation action for all vector types to either promote
+ // (for widening) or expand (for scalarization). Then we will selectively
+ // turn on ones that can be effectively codegen'd.
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+ setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
+ }
+
+ // FIXME: In order to prevent SSE instructions from being expanded into MMX
+ // ones with -msoft-float, disable use of MMX as well.
+ if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
+ addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
+
+ setOperationAction(ISD::ADD, MVT::v8i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v8i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i16, Legal);
+
+ setOperationAction(ISD::AND, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::OR, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::XOR, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
+ setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
+ setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
+ }
+
+ if (!UseSoftFloat && Subtarget->hasSSE1()) {
+ addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+
+ setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
+ }
+
+ if (!UseSoftFloat && Subtarget->hasSSE2()) {
+ addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+
+ // FIXME: Unfortunately -soft-float and -no-implicit-float mean that XMM
+ // registers cannot be used even for integer operations.
+ addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+
+ setOperationAction(ISD::ADD, MVT::v16i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+ for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+ // Do not attempt to custom lower non-power-of-2 vectors
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
+ continue;
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ }
+
+ // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+ for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+ setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64);
+ }
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+
+ }
+
+ if (Subtarget->hasSSE41()) {
+ // FIXME: Do we need to handle scalar-to-vector here?
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+
+ // i8 and i16 vectors are custom, because the source register and source
+ // memory operand types are not the same width. f32 vectors are
+ // custom since the immediate controlling the insert encodes additional
+ // information.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+ }
+ }
+
+ if (Subtarget->hasSSE42()) {
+ setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // Add/Sub/Mul with overflow operations are custom lowered.
+ setOperationAction(ISD::SADDO, MVT::i32, Custom);
+ setOperationAction(ISD::SADDO, MVT::i64, Custom);
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+ setOperationAction(ISD::UADDO, MVT::i64, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+ setOperationAction(ISD::USUBO, MVT::i64, Custom);
+ setOperationAction(ISD::SMULO, MVT::i32, Custom);
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+ setOperationAction(ISD::UMULO, MVT::i32, Custom);
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
+ if (!Subtarget->is64Bit()) {
+ // These libcalls are not available in 32-bit mode.
+ setLibcallName(RTLIB::SHL_I128, 0);
+ setLibcallName(RTLIB::SRL_I128, 0);
+ setLibcallName(RTLIB::SRA_I128, 0);
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::STORE);
+ if (Subtarget->is64Bit())
+ setTargetDAGCombine(ISD::MUL);
+
+ computeRegisterProperties();
+
+ // FIXME: These should be based on subtarget info. Plus, the values should
+ // be smaller when we are optimizing for size.
+ maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+ maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
+ allowUnalignedMemoryAccesses = true; // x86 supports it!
+ setPrefLoopAlignment(16);
+ benefitFromCodePlacementOpt = true;
+}
+
+
+MVT X86TargetLowering::getSetCCResultType(MVT VT) const {
+ return MVT::i8;
+}
+
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
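+/// For example, a struct containing a v4f32 field has a 128-bit vector
+/// member, so its byval alignment is raised to 16 bytes.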
+static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+ if (MaxAlign == 16)
+ return;
+ if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VTy->getBitWidth() == 128)
+ MaxAlign = 16;
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(ATy->getElementType(), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(STy->getElementType(i), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ if (MaxAlign == 16)
+ break;
+ }
+ }
+ return;
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+ if (Subtarget->is64Bit()) {
+ // Max of 8 and alignment of type.
+ unsigned TyAlign = TD->getABITypeAlignment(Ty);
+ if (TyAlign > 8)
+ return TyAlign;
+ return 8;
+ }
+
+ unsigned Align = 4;
+ if (Subtarget->hasSSE1())
+ getMaxByValAlign(Ty, Align);
+ return Align;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
+/// determining it.
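+/// For example, a 16-byte-or-larger memcpy from constant data on an SSE2
+/// target with 16-byte stack alignment returns v4i32, so the copy can be
+/// emitted with 128-bit loads and stores.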
+MVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
+ bool isSrcConst, bool isSrcStr) const {
+ // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
+ // linux. This is because the stack realignment code can't handle certain
+ // cases like PR2962. This should be removed when PR2962 is fixed.
+ if (!NoImplicitFloat && Subtarget->getStackAlignment() >= 16) {
+ if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
+ return MVT::v4i32;
+ if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
+ return MVT::v4f32;
+ }
+ if (Subtarget->is64Bit() && Size >= 8)
+ return MVT::i64;
+ return MVT::i32;
+}
+
+/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (usesGlobalOffsetTable())
+ return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
+ if (!Subtarget->isPICStyleRIPRel())
+ // This doesn't have a DebugLoc associated with it, but it is not really
+ // the same as a Register.
+ return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
+ getPointerTy());
+ return Table;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+/// LowerRET - Lower an ISD::RET node.
+SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86);
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+ SDValue Chain = Op.getOperand(0);
+
+ // Handle tail call return.
+ Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL);
+ if (Chain.getOpcode() == X86ISD::TAILCALL) {
+ SDValue TailCall = Chain;
+ SDValue TargetAddress = TailCall.getOperand(1);
+ SDValue StackAdjustment = TailCall.getOperand(2);
+ assert(((TargetAddress.getOpcode() == ISD::Register &&
+ (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
+ cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
+ TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
+ TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
+ "Expecting an global address, external symbol, or register");
+ assert(StackAdjustment.getOpcode() == ISD::Constant &&
+ "Expecting a const value");
+
+ SmallVector<SDValue,8> Operands;
+ Operands.push_back(Chain.getOperand(0));
+ Operands.push_back(TargetAddress);
+ Operands.push_back(StackAdjustment);
+ // Copy registers used by the call. Last operand is a flag so it is not
+ // copied.
+ for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) {
+ Operands.push_back(Chain.getOperand(i));
+ }
+ return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0],
+ Operands.size());
+ }
+
+ // Regular return.
+ SDValue Flag;
+
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue ValToCopy = Op.getOperand(i*2+1);
+
+ // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+ // the RET instruction and handled by the FP Stackifier.
+ if (VA.getLocReg() == X86::ST0 ||
+ VA.getLocReg() == X86::ST1) {
+ // If this is a copy from an xmm register to ST(0), use an FPExtend to
+ // change the value to the FP stack register class.
+ if (isScalarFPTypeInSSEReg(VA.getValVT()))
+ ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+ RetOps.push_back(ValToCopy);
+ // Don't emit a copytoreg.
+ continue;
+ }
+
+ // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+ // which is returned in RAX / RDX.
+ if (Subtarget->is64Bit()) {
+ MVT ValVT = ValToCopy.getValueType();
+ if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
+ ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
+ ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
+ }
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
+ Flag = Chain.getValue(1);
+ }
+
+ // The x86-64 ABI for returning structs by value requires that we copy
+ // the sret argument into %rax for the return. We saved the argument into
+ // a virtual register in the entry block, so now we copy the value out
+ // and into %rax.
+ if (Subtarget->is64Bit() &&
+ DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
+ Flag = Chain.getValue(1);
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(X86ISD::RET_FLAG, dl,
+ MVT::Other, &RetOps[0], RetOps.size());
+}
+
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers. This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. This returns an SDNode with the same number of values as
+/// the ISD::CALL.
+SDNode *X86TargetLowering::
+LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG) {
+
+ DebugLoc dl = TheCall->getDebugLoc();
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ bool isVarArg = TheCall->isVarArg();
+ bool Is64Bit = Subtarget->is64Bit();
+ CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+ CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
+
+ SmallVector<SDValue, 8> ResultVals;
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ MVT CopyVT = VA.getValVT();
+
+    // If this is x86-64 (or the value is returned inreg) and SSE is
+    // disabled, we can't return FP values.
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) {
+ cerr << "SSE register return with SSE disabled\n";
+ exit(1);
+ }
+
+ // If this is a call to a function that returns an fp value on the floating
+ // point stack, but where we prefer to use the value in xmm registers, copy
+ // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+ if ((VA.getLocReg() == X86::ST0 ||
+ VA.getLocReg() == X86::ST1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ CopyVT = MVT::f80;
+ }
+
+ SDValue Val;
+ if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
+ // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ MVT::v2i64, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ Val, DAG.getConstant(0, MVT::i64));
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ MVT::i64, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ }
+ Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ CopyVT, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ }
+ InFlag = Chain.getValue(2);
+
+ if (CopyVT != VA.getValVT()) {
+ // Round the F80 the right size, which also moves to the appropriate xmm
+ // register.
+ Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+ // This truncation won't change the value.
+ DAG.getIntPtrConstant(1));
+ }
+
+ ResultVals.push_back(Val);
+ }
+
+ // Merge everything together with a MERGE_VALUES node.
+ ResultVals.push_back(Chain);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
+ &ResultVals[0], ResultVals.size()).getNode();
+}
+
+
+//===----------------------------------------------------------------------===//
+// C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// The StdCall calling convention is the standard for many Windows API
+// routines. It differs from the C calling convention only slightly: the
+// callee cleans up the stack instead of the caller, and symbols are
+// decorated in a particular way. It doesn't support any vector arguments.
+// For info on fast calling convention see Fast Calling Convention (tail call)
+// implementation LowerX86_32FastCCCallTo.
+
+/// CallIsStructReturn - Determines whether a CALL node uses struct return
+/// semantics.
+static bool CallIsStructReturn(CallSDNode *TheCall) {
+ unsigned NumOps = TheCall->getNumArgs();
+ if (!NumOps)
+ return false;
+
+ return TheCall->getArgFlags(0).isSRet();
+}
+
+/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
+/// return semantics.
+static bool ArgsAreStructReturn(SDValue Op) {
+ unsigned NumArgs = Op.getNode()->getNumValues() - 1;
+ if (!NumArgs)
+ return false;
+
+ return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet();
+}
+
+/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
+/// the callee to pop its own arguments. Callee pop is necessary to support tail
+/// calls.
+bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
+ if (IsVarArg)
+ return false;
+
+ switch (CallingConv) {
+ default:
+ return false;
+ case CallingConv::X86_StdCall:
+ return !Subtarget->is64Bit();
+ case CallingConv::X86_FastCall:
+ return !Subtarget->is64Bit();
+ case CallingConv::Fast:
+ return PerformTailCallOpt;
+ }
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
+/// CallingConvention value.
+CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
+ if (Subtarget->is64Bit()) {
+ if (Subtarget->isTargetWin64())
+ return CC_X86_Win64_C;
+ else if (CC == CallingConv::Fast && PerformTailCallOpt)
+ return CC_X86_64_TailCall;
+ else
+ return CC_X86_64_C;
+ }
+
+ if (CC == CallingConv::X86_FastCall)
+ return CC_X86_32_FastCall;
+ else if (CC == CallingConv::Fast)
+ return CC_X86_32_FastCC;
+ else
+ return CC_X86_32_C;
+}
+
+/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
+/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
+NameDecorationStyle
+X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
+ unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (CC == CallingConv::X86_FastCall)
+ return FastCall;
+ else if (CC == CallingConv::X86_StdCall)
+ return StdCall;
+ return None;
+}
+
+
+/// CallRequiresGOTPtrInReg - Check whether the call requires the GOT pointer
+/// in a register before calling.
+bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
+ return !IsTailCall && !Is64Bit &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT();
+}
+
+/// CallRequiresFnAddressInReg - Check whether the call requires the function
+/// address to be loaded in a register.
+bool
+X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
+ return !Is64Bit && IsTailCall &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT();
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+ /*AlwaysInline=*/true, NULL, 0, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo *MFI,
+ unsigned CC,
+ SDValue Root, unsigned i) {
+ // Create the nodes corresponding to a load from this parameter slot.
+ ISD::ArgFlagsTy Flags =
+ cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags();
+ bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt;
+ bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can
+  // be changed with more analysis.
+  // In the case of tail call optimization, mark all arguments mutable, since
+  // they could be overwritten by the lowering of arguments in a tail call.
+ int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+ VA.getLocMemOffset(), isImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ if (Flags.isByVal())
+ return FIN;
+ return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0);
+}
+
+SDValue
+X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ DebugLoc dl = Op.getDebugLoc();
+
+ const Function* Fn = MF.getFunction();
+ if (Fn->hasExternalLinkage() &&
+ Subtarget->isTargetCygMing() &&
+ Fn->getName() == "main")
+ FuncInfo->setForceFramePointer(true);
+
+ // Decorate the function name.
+ FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ SDValue Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ unsigned CC = MF.getFunction()->getCallingConv();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isTargetWin64();
+
+ assert(!(isVarArg && CC == CallingConv::Fast) &&
+ "Var args not supported with calling convention fastcc");
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));
+
+ SmallVector<SDValue, 8> ArgValues;
+ unsigned LastVal = ~0U;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ MVT RegVT = VA.getLocVT();
+ TargetRegisterClass *RC = NULL;
+ if (RegVT == MVT::i32)
+ RC = X86::GR32RegisterClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = X86::GR64RegisterClass;
+ else if (RegVT == MVT::f32)
+ RC = X86::FR32RegisterClass;
+ else if (RegVT == MVT::f64)
+ RC = X86::FR64RegisterClass;
+ else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
+ RC = X86::VR128RegisterClass;
+ else if (RegVT.isVector()) {
+ assert(RegVT.getSizeInBits() == 64);
+ if (!Is64Bit)
+ RC = X86::VR64RegisterClass; // MMX values are passed in MMXs.
+ else {
+ // Darwin calling convention passes MMX values in either GPRs or
+ // XMMs in x86-64. Other targets pass them in memory.
+ if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) {
+ RC = X86::VR128RegisterClass; // MMX values are passed in XMMs.
+ RegVT = MVT::v2i64;
+ } else {
+ RC = X86::GR64RegisterClass; // v1i64 values are passed in GPRs.
+ RegVT = MVT::i64;
+ }
+ }
+ } else {
+ assert(0 && "Unknown argument type!");
+ }
+
+ unsigned Reg = DAG.getMachineFunction().addLiveIn(VA.getLocReg(), RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+ // Handle MMX values passed in GPRs.
+ if (Is64Bit && RegVT != VA.getLocVT()) {
+ if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass)
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
+ else if (RC == X86::VR128RegisterClass) {
+ ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ ArgValue, DAG.getConstant(0, MVT::i64));
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
+ }
+ }
+
+ ArgValues.push_back(ArgValue);
+ } else {
+ assert(VA.isMemLoc());
+ ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i));
+ }
+ }
+
+ // The x86-64 ABI for returning structs by value requires that we copy
+ // the sret argument into %rax for the return. Save the argument into
+ // a virtual register so that we can access it from the return points.
+ if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]);
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root);
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+ // align stack specially for tail calls
+ if (PerformTailCallOpt && CC == CallingConv::Fast)
+ StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ if (Is64Bit || CC != CallingConv::X86_FastCall) {
+ VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+ }
+ if (Is64Bit) {
+ unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
+
+ // FIXME: We should really autogenerate these arrays
+ static const unsigned GPR64ArgRegsWin64[] = {
+ X86::RCX, X86::RDX, X86::R8, X86::R9
+ };
+ static const unsigned XMMArgRegsWin64[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
+ };
+ static const unsigned GPR64ArgRegs64Bit[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+ };
+ static const unsigned XMMArgRegs64Bit[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ const unsigned *GPR64ArgRegs, *XMMArgRegs;
+
+ if (IsWin64) {
+ TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
+ GPR64ArgRegs = GPR64ArgRegsWin64;
+ XMMArgRegs = XMMArgRegsWin64;
+ } else {
+ TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
+ GPR64ArgRegs = GPR64ArgRegs64Bit;
+ XMMArgRegs = XMMArgRegs64Bit;
+ }
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
+ TotalNumIntRegs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
+ TotalNumXMMRegs);
+
+ assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloat) &&
+             "SSE register cannot be used with soft float / no implicit float!");
+ if (UseSoftFloat || NoImplicitFloat || !Subtarget->hasSSE1())
+ // Kernel mode asks for SSE to be disabled, so don't push them
+ // on the stack.
+ TotalNumXMMRegs = 0;
+
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so they
+      // may be loaded by dereferencing the result of va_next.
+ VarArgsGPOffset = NumIntRegs * 8;
+ VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
+ RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
+ TotalNumXMMRegs * 16, 16);
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(VarArgsGPOffset));
+ for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+ unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
+ X86::GR64RegisterClass);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+ DAG.getIntPtrConstant(8));
+ }
+
+ // Now store the XMM (fp + vector) parameter registers.
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(VarArgsFPOffset));
+ for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+ unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
+ X86::VR128RegisterClass);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+ DAG.getIntPtrConstant(16));
+ }
+ if (!MemOps.empty())
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+ }
+ }
+
+ ArgValues.push_back(Root);
+
+ // Some CCs need callee pop.
+ if (IsCalleePop(isVarArg, CC)) {
+ BytesToPopOnReturn = StackSize; // Callee pops everything.
+ BytesCallerReserves = 0;
+ } else {
+ BytesToPopOnReturn = 0; // Callee pops nothing.
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op))
+ BytesToPopOnReturn = 4;
+ BytesCallerReserves = StackSize;
+ }
+
+ if (!Is64Bit) {
+ RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only.
+ if (CC == CallingConv::X86_FastCall)
+ VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
+ }
+
+ FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+}
+
+SDValue
+X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
+ const SDValue &StackPtr,
+ const CCValAssign &VA,
+ SDValue Chain,
+ SDValue Arg, ISD::ArgFlagsTy Flags) {
+ DebugLoc dl = TheCall->getDebugLoc();
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ if (Flags.isByVal()) {
+ return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+ }
+ return DAG.getStore(Chain, dl, Arg, PtrOff,
+ PseudoSourceValue::getStack(), LocMemOffset);
+}
+
+/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
+/// optimization is performed and it is required.
+SDValue
+X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+ SDValue &OutRetAddr,
+ SDValue Chain,
+ bool IsTailCall,
+ bool Is64Bit,
+ int FPDiff,
+ DebugLoc dl) {
+ if (!IsTailCall || FPDiff==0) return Chain;
+
+ // Adjust the Return address stack slot.
+ MVT VT = getPointerTy();
+ OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+ // Load the "old" Return address.
+ OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
+ return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue
+EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ bool Is64Bit, int FPDiff, DebugLoc dl) {
+ // Store the return address to the appropriate stack slot.
+ if (!FPDiff) return Chain;
+ // Calculate the new stack slot for the return address.
+ int SlotSize = Is64Bit ? 8 : 4;
+ int NewReturnAddrFI =
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
+ MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+ Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+ PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
+ return Chain;
+}
+
+SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ SDValue Chain = TheCall->getChain();
+ unsigned CC = TheCall->getCallingConv();
+ bool isVarArg = TheCall->isVarArg();
+ bool IsTailCall = TheCall->isTailCall() &&
+ CC == CallingConv::Fast && PerformTailCallOpt;
+ SDValue Callee = TheCall->getCallee();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsStructRet = CallIsStructReturn(TheCall);
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ assert(!(isVarArg && CC == CallingConv::Fast) &&
+ "Var args not supported with calling convention fastcc");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (PerformTailCallOpt && CC == CallingConv::Fast)
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+ int FPDiff = 0;
+ if (IsTailCall) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ unsigned NumBytesCallerPushed =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+ FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Set the delta of movement of the return-address stack slot, but only
+    // if this delta is greater than the previous one.
+ if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+ MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+ SDValue RetAddrFrIdx;
+  // Load the return address for tail calls.
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit,
+ FPDiff, dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+  // of tail call optimization, arguments are handled later.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = TheCall->getArg(i);
+ ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ if (Is64Bit) {
+ MVT RegVT = VA.getLocVT();
+ if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
+ switch (VA.getLocReg()) {
+ default:
+ break;
+ case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
+ case X86::R8: {
+ // Special case: passing MMX values in GPR registers.
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+ break;
+ }
+ case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
+ case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ break;
+ }
+ }
+ }
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ if (!IsTailCall || (IsTailCall && isByVal)) {
+ assert(VA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
+ Chain, Arg, Flags));
+ }
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ // Tail call byval lowering might overwrite argument registers so in case of
+ // tail call optimization the copies to registers are lowered later.
+ if (!IsTailCall)
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // ELF / PIC requires GOT in the EBX register before function calls via PLT
+ // GOT pointer.
+ if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
+ Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+  // If we are tail calling and generating PIC/GOT style code, load the
+  // address of the callee into ecx. The value in ecx is used as the target of
+  // the tail jump. This is done to circumvent the ebx/callee-saved problem
+  // for tail calls on PIC/GOT architectures. Normally we would just put the
+  // address of GOT into ebx and then call target@PLT. But for tail calls ebx
+  // would be restored (since ebx is callee saved) before jumping to the
+  // target@PLT.
+ if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
+ // Note: The actual moving to ecx is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasHiddenVisibility() &&
+ !G->getGlobal()->hasProtectedVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee,DAG);
+ }
+
+ if (Is64Bit && isVarArg) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as a hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of
+    // SSE registers used, and is in the range 0 - 8 inclusive.
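+    // For example (illustrative): a varargs call passing one double in XMM0
+    // reaches here with NumXMMRegs == 1, so %al is set to 1.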
+
+ // FIXME: Verify this on Win64
+ // Count the number of XMM registers allocated.
+ static const unsigned XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
+ DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+
+ // For tail calls lower the arguments to the 'real' stack slot.
+ if (IsTailCall) {
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+ InFlag = SDValue();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc()) {
+ assert(VA.isMemLoc());
+ SDValue Arg = TheCall->getArg(i);
+ ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(
+ DAG.getStore(Chain, dl, Arg, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0));
+ }
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains2[0], MemOpChains2.size());
+
+ // Copy arguments to their registers.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+    InFlag = SDValue();
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
+ FPDiff, dl);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // We should use extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+ getTargetMachine(), true))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(),
+ G->getOffset());
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+ } else if (IsTailCall) {
+ unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;
+
+ Chain = DAG.getCopyToReg(Chain, dl,
+ DAG.getRegister(Opc, getPointerTy()),
+                             Callee, InFlag);
+ Callee = DAG.getRegister(Opc, getPointerTy());
+ // Add register as live out.
+ DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+
+ if (IsTailCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Returns a chain & a flag for retval copy to use.
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall)
+ Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add an implicit use GOT pointer in EBX.
+ if (!IsTailCall && !Is64Bit &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+ // Add an implicit use of AL for x86 vararg functions.
+ if (Is64Bit && isVarArg)
+ Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (IsTailCall) {
+ assert(InFlag.getNode() &&
+ "Flag must be set. Depend on flag being set in LowerRET");
+ Chain = DAG.getNode(X86ISD::TAILCALL, dl,
+ TheCall->getVTList(), &Ops[0], Ops.size());
+
+ return SDValue(Chain.getNode(), Op.getResNo());
+ }
+
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPush;
+ if (IsCalleePop(isVarArg, CC))
+ NumBytesForCalleeToPush = NumBytes; // Callee pops everything
+ else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
+    // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ NumBytesForCalleeToPush = 4;
+ else
+ NumBytesForCalleeToPush = 0; // Callee pops nothing.
+
+ // Returns a flag for retval copy to use.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+ true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
+ Op.getResNo());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like StdCall, the callee cleans up the arguments, except that ECX is
+// reserved for storing the address of the tail-called function. Only 2
+// registers are free for argument passing (inreg). Tail call optimization is
+// performed provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On the X86_64 architecture with GOT-style position-independent code, only
+// local (within-module) calls are supported at the moment.
+// To keep the stack aligned according to the platform ABI, the function
+// GetAlignedArgumentStackSize ensures that the argument delta is always a
+// multiple of the stack alignment. (Dynamic linkers need this - Darwin's
+// dyld, for example.)
+// If a tail-called function (the callee) has more arguments than the caller,
+// the caller needs to make sure that there is room to move the RETADDR to.
+// This is achieved by reserving an area the size of the argument delta right
+// after the original RETADDR, but before the saved frame pointer or the
+// spilled registers,
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for
+/// a 16-byte alignment requirement.
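+/// For example (illustrative, with StackAlignment = 16 and SlotSize = 4):
+/// StackSize = 8 becomes 12 (16*0 + 12) and StackSize = 14 becomes 28
+/// (16*1 + 12), so pushing the 4-byte RETADDR restores 16-byte alignment.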
+unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG& DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ uint64_t SlotSize = TD->getPointerSize();
+  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
+    // The remainder already fits below the slot boundary (e.g. <= 12 for a
+    // 16-byte alignment with a 4-byte slot), so just add the difference.
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+ } else {
+    // Mask out the lower bits, then add the stack alignment once plus the
+    // remainder slot (the 12 bytes in the 16/4 case).
+ Offset = ((~AlignMask) & Offset) + StackAlignment +
+ (StackAlignment-SlotSize);
+ }
+ return Offset;
+}
+
+/// IsEligibleForTailCallOptimization - Check to see whether the next
+/// instruction following the call is a return. A function is eligible if
+/// caller/callee calling conventions match, currently only fastcc supports
+/// tail calls, and the function CALL is immediately followed by a RET.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
+ SDValue Ret,
+ SelectionDAG& DAG) const {
+ if (!PerformTailCallOpt)
+ return false;
+
+ if (CheckTailCallReturnConstraints(TheCall, Ret)) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned CallerCC = MF.getFunction()->getCallingConv();
+    unsigned CalleeCC = TheCall->getCallingConv();
+ if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+ SDValue Callee = TheCall->getCallee();
+      // On 32-bit x86, PIC/GOT tail calls are supported.
+      if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
+          !Subtarget->isPICStyleGOT() || !Subtarget->is64Bit())
+ return true;
+
+ // Can only do local tail calls (in same module, hidden or protected) on
+ // x86_64 PIC/GOT at the moment.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ return G->getGlobal()->hasHiddenVisibility()
+ || G->getGlobal()->hasProtectedVisibility();
+ }
+ }
+
+ return false;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(MachineFunction &mf,
+ MachineModuleInfo *mmo,
+ DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *,
+ MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &cil
+#endif
+ ) {
+ return X86::createFastISel(mf, mmo, dw, vm, bm, am
+#ifndef NDEBUG
+ , cil
+#endif
+ );
+}
+
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+ if (ReturnAddrIndex == 0) {
+ // Set up a frame object for the return address.
+ uint64_t SlotSize = TD->getPointerSize();
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
+ FuncInfo->setRAIndex(ReturnAddrIndex);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+
+/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
+/// X86-specific condition code, returning the condition code and the LHS/RHS
+/// of the comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+ SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+ if (!isFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_NS;
+ } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ return X86::COND_S;
+ } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ // X < 1 -> X <= 0
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_LE;
+ }
+ }
+
+ switch (SetCCOpcode) {
+ default: assert(0 && "Invalid integer condition!");
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETGT: return X86::COND_G;
+ case ISD::SETGE: return X86::COND_GE;
+ case ISD::SETLT: return X86::COND_L;
+ case ISD::SETLE: return X86::COND_LE;
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETULT: return X86::COND_B;
+ case ISD::SETUGT: return X86::COND_A;
+ case ISD::SETULE: return X86::COND_BE;
+ case ISD::SETUGE: return X86::COND_AE;
+ }
+ }
+
+ // First determine if it is required or is profitable to flip the operands.
+
+ // If LHS is a foldable load, but RHS is not, flip the condition.
+ if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
+ !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
+ SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(LHS, RHS);
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ std::swap(LHS, RHS);
+ break;
+ }
+
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
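+  // For example (illustrative): X > Y leaves ZF=0 and CF=0, so SETOGT maps
+  // below to COND_A, the unsigned 'above' condition (CF=0 and ZF=0).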
+ switch (SetCCOpcode) {
+ default: assert(0 && "Condcode should be pre-legalized away");
+ case ISD::SETUEQ:
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETOLT: // flipped
+ case ISD::SETOGT:
+ case ISD::SETGT: return X86::COND_A;
+ case ISD::SETOLE: // flipped
+ case ISD::SETOGE:
+ case ISD::SETGE: return X86::COND_AE;
+ case ISD::SETUGT: // flipped
+ case ISD::SETULT:
+ case ISD::SETLT: return X86::COND_B;
+ case ISD::SETUGE: // flipped
+ case ISD::SETULE:
+ case ISD::SETLE: return X86::COND_BE;
+ case ISD::SETONE:
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETUO: return X86::COND_P;
+ case ISD::SETO: return X86::COND_NP;
+ }
+}
+
+/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
+/// code? The current x86 ISA includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
+/// isUndefOrInRange - Return true if Val is undef or if its value falls
+/// within the specified half-open range [Low, Hi).
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+ return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
+/// specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+  return Val < 0 || Val == CmpVal;
+}
+
+/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
+/// the second operand.
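+/// For example (illustrative): the v4 mask <2,3,0,1> qualifies since every
+/// index references the first operand, while <0,1,4,5> does not.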
+static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
+ return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
+ if (VT == MVT::v2f64 || VT == MVT::v2i64)
+ return (Mask[0] < 2 && Mask[1] < 2);
+ return false;
+}
+
+bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFDMask(M, N->getValueType(0));
+}
+
+/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFHW.
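+/// For example (illustrative): <0,1,2,3,7,6,5,4> qualifies (lower quadword
+/// in order, upper elements shuffled within 4..7); <0,1,2,3,0,1,2,3> does not.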
+static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ if (VT != MVT::v8i16)
+ return false;
+
+ // Lower quadword copied in order or undef.
+ for (int i = 0; i != 4; ++i)
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return false;
+
+ // Upper quadword shuffled.
+ for (int i = 4; i != 8; ++i)
+ if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
+ return false;
+
+ return true;
+}
+
+bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFHWMask(M, N->getValueType(0));
+}
+
+/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFLW.
+static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ if (VT != MVT::v8i16)
+ return false;
+
+ // Upper quadword copied in order.
+ for (int i = 4; i != 8; ++i)
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return false;
+
+ // Lower quadword shuffled.
+ for (int i = 0; i != 4; ++i)
+ if (Mask[i] >= 4)
+ return false;
+
+ return true;
+}
+
+bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFLWMask(M, N->getValueType(0));
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ int Half = NumElems / 2;
+ for (int i = 0; i < Half; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, NumElems))
+ return false;
+ for (int i = Half; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+ return false;
+
+ return true;
+}
+
+bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isSHUFPMask(M, N->getValueType(0));
+}
+
+/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
+/// the reverse of what x86 shuffles want. x86 shuffles require the lower
+/// half elements to come from vector 1 (which would equal the destination)
+/// and the upper half to come from vector 2.
+static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ int Half = NumElems / 2;
+ for (int i = 0; i < Half; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+ return false;
+ for (int i = Half; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, NumElems))
+ return false;
+ return true;
+}
+
+static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return isCommutedSHUFPMask(M, N->getValueType(0));
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+  // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3
+ return isUndefOrEqual(N->getMaskElt(0), 6) &&
+ isUndefOrEqual(N->getMaskElt(1), 7) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
+ return false;
+
+ for (unsigned i = NumElems/2; i < NumElems; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+
+ return true;
+}
+
+/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
+/// and MOVLHPS.
+bool X86::isMOVHPMask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
+ return false;
+
+ return true;
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 4)
+ return false;
+
+ return isUndefOrEqual(N->getMaskElt(0), 2) &&
+ isUndefOrEqual(N->getMaskElt(1), 3) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
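+/// For example (illustrative): for v4i32 the canonical UNPCKL mask is
+/// <0, 4, 1, 5>, interleaving the low halves of the two operands.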
+static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, MVT VT,
+ bool V2IsSplat = false) {
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (V2IsSplat) {
+ if (!isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
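+/// For example (illustrative): for v4i32 the canonical UNPCKH mask is
+/// <2, 6, 3, 7>, interleaving the high halves of the two operands.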
+static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, MVT VT,
+ bool V2IsSplat = false) {
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j + NumElts/2))
+ return false;
+ if (V2IsSplat) {
+      if (!isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ return true;
+}
+
+bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+ return false;
+
+ for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ return true;
+}
+
+bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
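+/// For example (illustrative): for v4 the mask <4, 1, 2, 3> qualifies --
+/// element 0 comes from the second operand and the rest pass through in order.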
+static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 2 && NumElts != 4)
+ return false;
+
+ if (!isUndefOrEqual(Mask[0], NumElts))
+ return false;
+
+ for (int i = 1; i < NumElts; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+
+ return true;
+}
+
+bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isMOVLMask(M, N->getValueType(0));
+}
+
+/// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
+/// x86 movss wants: the lowest element comes from vector 2 and the other
+/// elements come from vector 1 in order.
+static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT,
+ bool V2IsSplat = false, bool V2IsUndef = false) {
+ int NumOps = VT.getVectorNumElements();
+ if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+ return false;
+
+ if (!isUndefOrEqual(Mask[0], 0))
+ return false;
+
+ for (int i = 1; i < NumOps; ++i)
+ if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
+ (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
+ (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
+ return false;
+
+ return true;
+}
+
+static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
+ bool V2IsUndef = false) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+ // Expect 1, 1, 3, 3
+ for (unsigned i = 0; i < 2; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 1)
+ return false;
+ }
+
+ bool HasHi = false;
+ for (unsigned i = 2; i < 4; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 3)
+ return false;
+ if (Elt == 3)
+ HasHi = true;
+ }
+ // Don't use movshdup if it can be done with a shufps.
+ // FIXME: verify that matching u, u, 3, 3 is what we want.
+ return HasHi;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+ // Expect 0, 0, 2, 2
+ for (unsigned i = 0; i < 2; ++i)
+ if (N->getMaskElt(i) > 0)
+ return false;
+
+ bool HasHi = false;
+ for (unsigned i = 2; i < 4; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 2)
+ return false;
+ if (Elt == 2)
+ HasHi = true;
+ }
+ // Don't use movsldup if it can be done with a shufps.
+ return HasHi;
+}
+
+/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
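+/// For example (illustrative): for v2f64 the mask <0, 0> qualifies,
+/// duplicating the low element.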
+bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
+ int e = N->getValueType(0).getVectorNumElements() / 2;
+
+ for (int i = 0; i < e; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+ for (int i = 0; i < e; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(e+i), i))
+ return false;
+ return true;
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+/// instructions.
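+/// The mask elements are packed high-to-low, two bits each for 4-element
+/// vectors (one bit each for 2-element ones); e.g. the v4 identity mask
+/// <0, 1, 2, 3> encodes as (3<<6)|(2<<4)|(1<<2)|0 = 0xE4.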
+unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ int NumOperands = SVOp->getValueType(0).getVectorNumElements();
+
+ unsigned Shift = (NumOperands == 4) ? 2 : 1;
+ unsigned Mask = 0;
+ for (int i = 0; i < NumOperands; ++i) {
+ int Val = SVOp->getMaskElt(NumOperands-i-1);
+ if (Val < 0) Val = 0;
+ if (Val >= NumOperands) Val -= NumOperands;
+ Mask |= Val;
+ if (i != NumOperands - 1)
+ Mask <<= Shift;
+ }
+ return Mask;
+}
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+/// instructions.
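+/// Only elements 4-7 participate; each is encoded in two bits as its mask
+/// value minus 4, with element 7 in the top bits. The identity mask
+/// <0,1,2,3,4,5,6,7> therefore encodes as 0xE4.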
+unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ unsigned Mask = 0;
+ // 8 elements, but we only care about the last 4.
+ for (unsigned i = 7; i >= 4; --i) {
+ int Val = SVOp->getMaskElt(i);
+ if (Val >= 0)
+ Mask |= (Val - 4);
+ if (i != 4)
+ Mask <<= 2;
+ }
+ return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+/// instructions.
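+/// Only elements 0-3 participate; each mask value is encoded in two bits,
+/// with element 3 in the top bits. The identity mask <0,1,2,3,4,5,6,7>
+/// therefore encodes as 0xE4.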
+unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ unsigned Mask = 0;
+ // 8 elements, but we only care about the first 4.
+ for (int i = 3; i >= 0; --i) {
+ int Val = SVOp->getMaskElt(i);
+ if (Val >= 0)
+ Mask |= Val;
+ if (i != 0)
+ Mask <<= 2;
+ }
+ return Mask;
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
+/// their permute mask.
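+/// For example, for a v4 type the mask <0, 5, 2, 7> becomes <4, 1, 6, 3>
+/// once V1 and V2 have swapped positions.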
+static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> MaskVec;
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = SVOp->getMaskElt(i);
+ if (idx < 0)
+ MaskVec.push_back(idx);
+ else if (idx < (int)NumElems)
+ MaskVec.push_back(idx + NumElems);
+ else
+ MaskVec.push_back(idx - NumElems);
+ }
+ return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+ SVOp->getOperand(0), &MaskVec[0]);
+}
+
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, MVT VT) {
+ unsigned NumElems = VT.getVectorNumElements();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = Mask[i];
+ if (idx < 0)
+ continue;
+ else if (idx < (int)NumElems)
+ Mask[i] = idx + NumElems;
+ else
+ Mask[i] = idx - NumElems;
+ }
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from the upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
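+/// Modulo undef elements, the only mask this matches for v4 is <2, 3, 6, 7>.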
+static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
+ if (Op->getValueType(0).getVectorNumElements() != 4)
+ return false;
+ for (unsigned i = 0, e = 2; i != e; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
+ return false;
+ for (unsigned i = 2; i != 4; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
+ return false;
+ return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+ if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+ N = N->getOperand(0).getNode();
+ if (!ISD::isNON_EXTLoad(N))
+ return false;
+ if (LD)
+ *LD = cast<LoadSDNode>(N);
+ return true;
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
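+/// Modulo undef elements, the mask this matches for v4 is <0, 1, 6, 7>.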
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
+ ShuffleVectorSDNode *Op) {
+ if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+ return false;
+ // If V2 is a vector load, don't do this transformation. We will instead try
+ // to use a load-folding shufps op.
+ if (ISD::isNON_EXTLoad(V2))
+ return false;
+
+ unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i))
+ return false;
+ for (unsigned i = NumElems/2; i != NumElems; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
+ return false;
+ return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ SDValue SplatValue = N->getOperand(0);
+ for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+ if (N->getOperand(i) != SplatValue)
+ return false;
+ return true;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+static inline bool isZeroNode(SDValue Elt) {
+ return ((isa<ConstantSDNode>(Elt) &&
+ cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
+ (isa<ConstantFPSDNode>(Elt) &&
+ cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to a zero vector.
+/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
+static bool isZeroShuffle(ShuffleVectorSDNode *N) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int Idx = N->getMaskElt(i);
+ if (Idx >= (int)NumElems) {
+ unsigned Opc = V2.getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V2.getOperand(Idx-NumElems)))
+ return false;
+ } else if (Idx >= 0) {
+ unsigned Opc = V1.getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V1.getOperand(Idx)))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG,
+ DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDValue Vec;
+ if (VT.getSizeInBits() == 64) { // MMX
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+ } else if (HasSSE2) { // SSE2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ } else { // SSE1
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDValue Vec;
+ if (VT.getSizeInBits() == 64) // MMX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+ else // SSE
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+
+/// NormalizeMask - V2 is a splat; modify the mask (if needed) so all elements
+/// that point to V2 point to its first element.
+static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ MVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ bool Changed = false;
+ SmallVector<int, 8> MaskVec;
+ SVOp->getMask(MaskVec);
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (MaskVec[i] > (int)NumElems) {
+ MaskVec[i] = NumElems;
+ Changed = true;
+ }
+ }
+ if (Changed)
+ return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
+ SVOp->getOperand(1), &MaskVec[0]);
+ return SDValue(SVOp, 0);
+}
+
+/// getMOVL - Returns a vector_shuffle node for a movs{s|d} / movd operation
+/// of the specified width.
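+/// For example, for a v4 type this builds the mask <4, 1, 2, 3>.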
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ Mask.push_back(NumElems);
+ for (unsigned i = 1; i != NumElems; ++i)
+ Mask.push_back(i);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
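+/// For example, for a v4 type this builds the mask <0, 4, 1, 5>, interleaving
+/// the low halves of V1 and V2.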
+static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+ Mask.push_back(i);
+ Mask.push_back(i + NumElems);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
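+/// For example, for a v4 type this builds the mask <2, 6, 3, 7>, interleaving
+/// the high halves of V1 and V2.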
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned Half = NumElems/2;
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i != Half; ++i) {
+ Mask.push_back(i + Half);
+ Mask.push_back(i + NumElems + Half);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
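+/// For example, splatting element 5 of a v8i16: one unpackh leaves two
+/// copies of that element in 32-bit lane 1, and the final v4f32 shuffle
+/// with mask <1, 1, 1, 1> replicates that lane across the whole vector.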
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
+ bool HasSSE2) {
+ if (SV->getValueType(0).getVectorNumElements() <= 4)
+ return SDValue(SV, 0);
+
+ MVT PVT = MVT::v4f32;
+ MVT VT = SV->getValueType(0);
+ DebugLoc dl = SV->getDebugLoc();
+ SDValue V1 = SV->getOperand(0);
+ int NumElems = VT.getVectorNumElements();
+ int EltNo = SV->getSplatIndex();
+
+ // unpack elements to the correct location
+ while (NumElems > 4) {
+ if (EltNo < NumElems/2) {
+ V1 = getUnpackl(DAG, dl, VT, V1, V1);
+ } else {
+ V1 = getUnpackh(DAG, dl, VT, V1, V1);
+ EltNo -= NumElems/2;
+ }
+ NumElems >>= 1;
+ }
+
+ // Perform the splat.
+ int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
+ V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
+/// vector and a zero or undef vector. This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx, yielding a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+ bool isZero, bool HasSSE2,
+ SelectionDAG &DAG) {
+ MVT VT = V2.getValueType();
+ SDValue V1 = isZero
+ ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16> MaskVec;
+ for (unsigned i = 0; i != NumElems; ++i)
+ // If this is the insertion idx, put the low elt of V2 here.
+ MaskVec.push_back(i == Idx ? NumElems : i);
+ return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+}
+
+/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
+/// shuffle result, counted from the low end (or from the high end if Low is
+/// false), that are zero.
+static
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
+ bool Low, SelectionDAG &DAG) {
+ unsigned NumZeros = 0;
+ for (int i = 0; i < NumElems; ++i) {
+ unsigned Index = Low ? i : NumElems-i-1;
+ int Idx = SVOp->getMaskElt(Index);
+ if (Idx < 0) {
+ ++NumZeros;
+ continue;
+ }
+ SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
+ if (Elt.getNode() && isZeroNode(Elt))
+ ++NumZeros;
+ else
+ break;
+ }
+ return NumZeros;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+/// FIXME: split into pslldqi, psrldqi, palignr variants.
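+/// For example, with V2 all zeros, the v4i32 mask <4, 0, 1, 2> is a logical
+/// left shift by one element: one zero shifted in at the low end, followed
+/// by V1's elements 0-2 in order.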
+static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+ bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ int NumElems = SVOp->getValueType(0).getVectorNumElements();
+
+ isLeft = true;
+ unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
+ if (!NumZeros) {
+ isLeft = false;
+ NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
+ if (!NumZeros)
+ return false;
+ }
+ bool SeenV1 = false;
+ bool SeenV2 = false;
+ for (int i = NumZeros; i < NumElems; ++i) {
+ int Val = isLeft ? (i - NumZeros) : i;
+ int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
+ if (Idx < 0)
+ continue;
+ if (Idx < NumElems)
+ SeenV1 = true;
+ else {
+ Idx -= NumElems;
+ SeenV2 = true;
+ }
+ if (Idx != Val)
+ return false;
+ }
+ if (SeenV1 && SeenV2)
+ return false;
+
+ ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
+ ShAmt = NumZeros;
+ return true;
+}
+
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
+///
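+/// Adjacent byte pairs are zero-extended and merged into 16-bit words
+/// (word i/2 becomes (byte i << 8) | byte i-1), inserted into a zero or
+/// undef v8i16, and the result is bitcast back to v16i8 at the end.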
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG, TargetLowering &TLI) {
+ if (NumNonZero > 8)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 16; ++i) {
+ bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+ if (ThisIsNonZero && First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+
+ if ((i & 1) != 0) {
+ SDValue ThisElt(0, 0), LastElt(0, 0);
+ bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ if (LastIsNonZero) {
+ LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::i16, Op.getOperand(i-1));
+ }
+ if (ThisIsNonZero) {
+ ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
+ ThisElt, DAG.getConstant(8, MVT::i8));
+ if (LastIsNonZero)
+ ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
+ } else
+ ThisElt = LastElt;
+
+ if (ThisElt.getNode())
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i/2));
+ }
+ }
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG, TargetLowering &TLI) {
+ if (NumNonZero > 4)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 8; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v8i16, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i));
+ }
+ }
+
+ return V;
+}
+
+/// getVShift - Return a vector logical shift node.
+///
+static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
+ unsigned NumBits, SelectionDAG &DAG,
+ const TargetLowering &TLI, DebugLoc dl) {
+ bool isMMX = VT.getSizeInBits() == 64;
+ MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
+ unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+ SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(Opc, dl, ShVT, SrcOp,
+ DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // All zeros are handled with pxor; all ones are handled with pcmpeqd.
+ if (ISD::isBuildVectorAllZeros(Op.getNode())
+ || ISD::isBuildVectorAllOnes(Op.getNode())) {
+ // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
+ // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
+ // eliminated on x86-32 hosts.
+ if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
+ return Op;
+
+ if (ISD::isBuildVectorAllOnes(Op.getNode()))
+ return getOnesVector(Op.getValueType(), DAG, dl);
+ return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+ }
+
+ MVT VT = Op.getValueType();
+ MVT EVT = VT.getVectorElementType();
+ unsigned EVTBits = EVT.getSizeInBits();
+
+ unsigned NumElems = Op.getNumOperands();
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ bool IsAllConstants = true;
+ SmallSet<SDValue, 8> Values;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ Values.insert(Elt);
+ if (Elt.getOpcode() != ISD::Constant &&
+ Elt.getOpcode() != ISD::ConstantFP)
+ IsAllConstants = false;
+ if (isZeroNode(Elt))
+ NumZero++;
+ else {
+ NonZeros |= (1 << i);
+ NumNonZero++;
+ }
+ }
+
+ if (NumNonZero == 0) {
+ // All-undef vector. Return an UNDEF. All zero vectors were handled above.
+ return DAG.getUNDEF(VT);
+ }
+
+ // Special case for a single non-zero, non-undef element.
+ if (NumNonZero == 1 && NumElems <= 4) {
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+
+ // If this is an insertion of an i64 value on x86-32, and if the top bits of
+ // the value are obviously zero, truncate the value to i32 and do the
+ // insertion that way. Only do this if the value is non-constant or if the
+ // value is a constant being inserted into element 0. It is cheaper to do
+ // a constant pool load than it is to do a movd + shuffle.
+ if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
+ (!IsAllConstants || Idx == 0)) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ // Handle MMX and SSE both.
+ MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
+ unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
+
+ // Truncate the value (which may itself be a constant) to i32, and
+ // convert it to a vector with movd (S2V+shuffle to zero extend).
+ Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true,
+ Subtarget->hasSSE2(), DAG);
+
+ // Now we have our 32-bit value zero extended in the low element of
+ // a vector. If Idx != 0, swizzle it into place.
+ if (Idx != 0) {
+ SmallVector<int, 4> Mask;
+ Mask.push_back(Idx);
+ for (unsigned i = 1; i != VecElts; ++i)
+ Mask.push_back(i);
+ Item = DAG.getVectorShuffle(VecVT, dl, Item,
+ DAG.getUNDEF(Item.getValueType()),
+ &Mask[0]);
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
+ }
+ }
+
+ // If we have a constant or non-constant insertion into the low element of
+ // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+ // the rest of the elements. This will be matched as movd/movq/movss/movsd
+ // depending on what the source datatype is. Because we can only get here
+ // when NumElems <= 4, this only needs to handle i32/f32/i64/f64.
+ if (Idx == 0 &&
+ // Don't do this for i64 values on x86-32.
+ (EVT != MVT::i64 || Subtarget->is64Bit())) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
+ Subtarget->hasSSE2(), DAG);
+ }
+
+ // Is it a vector logical left shift?
+ if (NumElems == 2 && Idx == 1 &&
+ isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
+ unsigned NumBits = VT.getSizeInBits();
+ return getVShift(true, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ VT, Op.getOperand(1)),
+ NumBits/2, DAG, *this, dl);
+ }
+
+ if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+ return SDValue();
+
+ // Otherwise, if this is a vector with i32 or f32 elements, and the element
+ // is a non-constant being inserted into an element other than the low one,
+ // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
+ // movd/movss) to move this into the low element, then shuffle it into
+ // place.
+ if (EVTBits == 32) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+ // Turn it into a shuffle of zero and zero-extended scalar to vector.
+ Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
+ Subtarget->hasSSE2(), DAG);
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i < NumElems; i++)
+ MaskVec.push_back(i == Idx ? 0 : 1);
+ return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+ }
+ }
+
+ // Splat is obviously ok. Let legalizer expand it to a shuffle.
+ if (Values.size() == 1)
+ return SDValue();
+
+ // A vector full of immediates; various special cases are already
+ // handled, so this is best done with a single constant-pool load.
+ if (IsAllConstants)
+ return SDValue();
+
+ // Let legalizer expand 2-wide build_vectors.
+ if (EVTBits == 64) {
+ if (NumNonZero == 1) {
+ // One half is zero or undef.
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+ Op.getOperand(Idx));
+ return getShuffleVectorZeroOrUndef(V2, Idx, true,
+ Subtarget->hasSSE2(), DAG);
+ }
+ return SDValue();
+ }
+
+ // If element VT is < 32 bits, convert it to inserts into a zero vector.
+ if (EVTBits == 8 && NumElems == 16) {
+ SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+ *this);
+ if (V.getNode()) return V;
+ }
+
+ if (EVTBits == 16 && NumElems == 8) {
+ SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+ *this);
+ if (V.getNode()) return V;
+ }
+
+ // If element VT is == 32 bits, turn it into a number of shuffles.
+ SmallVector<SDValue, 8> V;
+ V.resize(NumElems);
+ if (NumElems == 4 && NumZero > 0) {
+ for (unsigned i = 0; i < 4; ++i) {
+ bool isZero = !(NonZeros & (1 << i));
+ if (isZero)
+ V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ else
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ }
+
+ for (unsigned i = 0; i < 2; ++i) {
+ switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+ default: break;
+ case 0:
+ V[i] = V[i*2]; // Must be a zero vector.
+ break;
+ case 1:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+ break;
+ case 2:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ case 3:
+ V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ }
+ }
+
+ SmallVector<int, 8> MaskVec;
+ bool Reverse = (NonZeros & 0x3) == 2;
+ for (unsigned i = 0; i < 2; ++i)
+ MaskVec.push_back(Reverse ? 1-i : i);
+ Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ for (unsigned i = 0; i < 2; ++i)
+ MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
+ return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+ }
+
+ if (Values.size() > 2) {
+ // If we have SSE 4.1, Expand into a number of inserts unless the number of
+ // values to be inserted is equal to the number of elements, in which case
+ // use the unpack code below in the hopes of matching the consecutive elts
+ // load merge pattern for shuffles.
+ // FIXME: We could probably just check that here directly.
+ if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
+ getSubtarget()->hasSSE41()) {
+ V[0] = DAG.getUNDEF(VT);
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
+ V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
+ Op.getOperand(i), DAG.getIntPtrConstant(i));
+ return V[0];
+ }
+ // Expand into a number of unpckl*.
+ // e.g. for v4f32
+ // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+ // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+ // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ NumElems >>= 1;
+ while (NumElems != 0) {
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
+ NumElems >>= 1;
+ }
+ return V[0];
+ }
+
+ return SDValue();
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all] pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static
+SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG, X86TargetLowering &TLI) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 8> MaskVals;
+
+ // Determine if more than 1 of the words in each of the low and high quadwords
+ // of the result come from the same quadword of one of the two inputs. Undef
+ // mask values count as coming from any quadword, for better codegen.
+ SmallVector<unsigned, 4> LoQuad(4);
+ SmallVector<unsigned, 4> HiQuad(4);
+ BitVector InputQuads(4);
+ for (unsigned i = 0; i < 8; ++i) {
+ SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
+ int EltIdx = SVOp->getMaskElt(i);
+ MaskVals.push_back(EltIdx);
+ if (EltIdx < 0) {
+ ++Quad[0];
+ ++Quad[1];
+ ++Quad[2];
+ ++Quad[3];
+ continue;
+ }
+ ++Quad[EltIdx / 4];
+ InputQuads.set(EltIdx / 4);
+ }
+
+ int BestLoQuad = -1;
+ unsigned MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (LoQuad[i] > MaxQuad) {
+ BestLoQuad = i;
+ MaxQuad = LoQuad[i];
+ }
+ }
+
+ int BestHiQuad = -1;
+ MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (HiQuad[i] > MaxQuad) {
+ BestHiQuad = i;
+ MaxQuad = HiQuad[i];
+ }
+ }
+
+ // For SSSE3, if all 8 words of the result come from only 1 quadword of each
+ // of the two input vectors, shuffle them into one input vector so only a
+ // single pshufb instruction is necessary. If there are more than 2 input
+ // quads, disable the next transformation since it does not help SSSE3.
+ bool V1Used = InputQuads[0] || InputQuads[1];
+ bool V2Used = InputQuads[2] || InputQuads[3];
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ if (InputQuads.count() == 2 && V1Used && V2Used) {
+ BestLoQuad = InputQuads.find_first();
+ BestHiQuad = InputQuads.find_next(BestLoQuad);
+ }
+ if (InputQuads.count() > 2) {
+ BestLoQuad = -1;
+ BestHiQuad = -1;
+ }
+ }
+
+ // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+ // the shuffle mask. If a quad is scored as -1, it contains words from all 4
+ // input quadwords.
+ SDValue NewV;
+ if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
+ MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
+ NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
+ NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
+
+ // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+ // source words for the shuffle, to aid later transformations.
+ bool AllWordsInNewV = true;
+ bool InOrder[2] = { true, true };
+ for (unsigned i = 0; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx != (int)i)
+ InOrder[i/4] = false;
+ if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+ continue;
+ AllWordsInNewV = false;
+ break;
+ }
+
+ bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+ if (AllWordsInNewV) {
+ for (int i = 0; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0)
+ continue;
+ idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+ if ((idx != i) && idx < 4)
+ pshufhw = false;
+ if ((idx != i) && idx > 3)
+ pshuflw = false;
+ }
+ V1 = NewV;
+ V2Used = false;
+ BestLoQuad = 0;
+ BestHiQuad = 1;
+ }
+
+ // If we've eliminated the use of V2, and the new mask is a pshuflw or
+ // pshufhw, that's as cheap as it gets. Return the new shuffle.
+ if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+ return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+ DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+ }
+ }
+
+ // If we have SSSE3 and all words of the result come from 1 input vector,
+ // case 2 is generated; otherwise case 3 is generated. If no SSSE3
+ // is present, fall back to case 4.
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ SmallVector<SDValue,16> pshufbMask;
+
+ // If we have elements from both input vectors, set the high bit of the
+ // shuffle mask element to zero out elements that come from V2 in the V1
+ // mask, and elements that come from V1 in the V2 mask, so that the two
+ // results can be OR'd together.
+ bool TwoInputs = V1Used && V2Used;
+ for (unsigned i = 0; i != 8; ++i) {
+ int EltIdx = MaskVals[i] * 2;
+ if (TwoInputs && (EltIdx >= 16)) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+ }
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ if (!TwoInputs)
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+
+ // Calculate the shuffle mask for the second input, shuffle it, and
+ // OR it with the first shuffled input.
+ pshufbMask.clear();
+ for (unsigned i = 0; i != 8; ++i) {
+ int EltIdx = MaskVals[i] * 2;
+ if (EltIdx < 16) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
+ }
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+ }
+
+ // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+ // and update MaskVals with the new element order.
+ BitVector InOrder(8);
+ if (BestLoQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ for (int i = 0; i != 4; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0) {
+ MaskV.push_back(-1);
+ InOrder.set(i);
+ } else if ((idx / 4) == BestLoQuad) {
+ MaskV.push_back(idx & 3);
+ InOrder.set(i);
+ } else {
+ MaskV.push_back(-1);
+ }
+ }
+ for (unsigned i = 4; i != 8; ++i)
+ MaskV.push_back(i);
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+ &MaskV[0]);
+ }
+
+ // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
+ // and update MaskVals with the new element order.
+ if (BestHiQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ for (unsigned i = 0; i != 4; ++i)
+ MaskV.push_back(i);
+ for (unsigned i = 4; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0) {
+ MaskV.push_back(-1);
+ InOrder.set(i);
+ } else if ((idx / 4) == BestHiQuad) {
+ MaskV.push_back((idx & 3) + 4);
+ InOrder.set(i);
+ } else {
+ MaskV.push_back(-1);
+ }
+ }
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+ &MaskV[0]);
+ }
+
+ // In case BestHiQuad & BestLoQuad were both -1, meaning each quadword of the
+ // result has a word from each of the four input quadwords, calculate the
+ // InOrder bitvector now before falling through to the insert/extract cleanup.
+ if (BestLoQuad == -1 && BestHiQuad == -1) {
+ NewV = V1;
+ for (int i = 0; i != 8; ++i)
+ if (MaskVals[i] < 0 || MaskVals[i] == i)
+ InOrder.set(i);
+ }
+
+ // The other elements are put in the right place using pextrw and pinsrw.
+ for (unsigned i = 0; i != 8; ++i) {
+ if (InOrder[i])
+ continue;
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0)
+ continue;
+ SDValue ExtOp = (EltIdx < 8)
+ ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+ DAG.getIntPtrConstant(EltIdx))
+ : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+ DAG.getIntPtrConstant(EltIdx - 8));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+ DAG.getIntPtrConstant(i));
+ }
+ return NewV;
+}
+
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
+static
+SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG, X86TargetLowering &TLI) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 16> MaskVals;
+ SVOp->getMask(MaskVals);
+
+ // If we have SSSE3, case 1 is generated when all result bytes come from
+ // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
+ // present, fall back to case 3.
+ // FIXME: kill V2Only once shuffles are canonicalized by getNode.
+ bool V1Only = true;
+ bool V2Only = true;
+ for (unsigned i = 0; i < 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0)
+ continue;
+ if (EltIdx < 16)
+ V2Only = false;
+ else
+ V1Only = false;
+ }
+
+ // If SSSE3 is available, use one pshufb instruction per input vector that
+ // contributes elements to the result.
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ SmallVector<SDValue,16> pshufbMask;
+
+ // If all result elements are from one input vector, then only translate
+ // undef mask values to 0x80 (zero out result) in the pshufb mask.
+ //
+ // Otherwise, we have elements from both input vectors, and must zero out
+ // elements that come from V2 in the first mask, and V1 in the second mask
+ // so that we can OR them together.
+ bool TwoInputs = !(V1Only || V2Only);
+ for (unsigned i = 0; i != 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ // If all the elements are from V2, assign it to V1 and return after
+ // building the first pshufb.
+ if (V2Only)
+ V1 = V2;
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ if (!TwoInputs)
+ return V1;
+
+ // Calculate the shuffle mask for the second input, shuffle it, and
+ // OR it with the first shuffled input.
+ pshufbMask.clear();
+ for (unsigned i = 0; i != 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 16) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ }
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+ }
+
+ // No SSSE3 - Calculate the in-place words, then fix all out-of-place words
+ // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
+ // the 16 different words that comprise the two doublequadword input vectors.
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
+ SDValue NewV = V2Only ? V2 : V1;
+ for (int i = 0; i != 8; ++i) {
+ int Elt0 = MaskVals[i*2];
+ int Elt1 = MaskVals[i*2+1];
+
+ // This word of the result is all undef, skip it.
+ if (Elt0 < 0 && Elt1 < 0)
+ continue;
+
+ // This word of the result is already in the correct place, skip it.
+ if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
+ continue;
+ if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+ continue;
+
+ SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+ SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+ SDValue InsElt;
+
+ // If Elt0 and Elt1 are defined, are consecutive, and can be loaded together
+ // using a single extract, extract the word once and insert it.
+ if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ continue;
+ }
+
+ // If Elt1 is defined, extract it from the appropriate source. If the
+ // source byte is not also odd, shift the extracted word left 8 bits
+ // otherwise clear the bottom 8 bits if we need to do an or.
+ if (Elt1 >= 0) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ if ((Elt1 & 1) == 0)
+ InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+ DAG.getConstant(8, TLI.getShiftAmountTy()));
+ else if (Elt0 >= 0)
+ InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+ DAG.getConstant(0xFF00, MVT::i16));
+ }
+ // If Elt0 is defined, extract it from the appropriate source. If the
+ // source byte is not also even, shift the extracted word right 8 bits. If
+ // Elt1 was also defined, OR the extracted values together before
+ // inserting them in the result.
+ if (Elt0 >= 0) {
+ SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+ if ((Elt0 & 1) != 0)
+ InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+ DAG.getConstant(8, TLI.getShiftAmountTy()));
+ else if (Elt1 >= 0)
+ InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+ DAG.getConstant(0x00FF, MVT::i16));
+ InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+ : InsElt0;
+ }
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
+}
+
+/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
+/// done when every pair / quad of shuffle mask elements point to elements in
+/// the right sequence. e.g.
+/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG,
+ TargetLowering &TLI, DebugLoc dl) {
+ MVT VT = SVOp->getValueType(0);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NewWidth = (NumElems == 4) ? 2 : 4;
+ MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
+ MVT MaskEltVT = MaskVT.getVectorElementType();
+ MVT NewVT = MaskVT;
+ switch (VT.getSimpleVT()) {
+ default: assert(false && "Unexpected!");
+ case MVT::v4f32: NewVT = MVT::v2f64; break;
+ case MVT::v4i32: NewVT = MVT::v2i64; break;
+ case MVT::v8i16: NewVT = MVT::v4i32; break;
+ case MVT::v16i8: NewVT = MVT::v4i32; break;
+ }
+
+ if (NewWidth == 2) {
+ if (VT.isInteger())
+ NewVT = MVT::v2i64;
+ else
+ NewVT = MVT::v2f64;
+ }
+ int Scale = NumElems / NewWidth;
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i < NumElems; i += Scale) {
+ int StartIdx = -1;
+ for (int j = 0; j < Scale; ++j) {
+ int EltIdx = SVOp->getMaskElt(i+j);
+ if (EltIdx < 0)
+ continue;
+ if (StartIdx == -1)
+ StartIdx = EltIdx - (EltIdx % Scale);
+ if (EltIdx != StartIdx + j)
+ return SDValue();
+ }
+ if (StartIdx == -1)
+ MaskVec.push_back(-1);
+ else
+ MaskVec.push_back(StartIdx / Scale);
+ }
+
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
+ return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
+
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
+ SDValue SrcOp, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget, DebugLoc dl) {
+ if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+ LoadSDNode *LD = NULL;
+ if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+ LD = dyn_cast<LoadSDNode>(SrcOp);
+ if (!LD) {
+ // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+ // instead.
+ MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+ if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
+ SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
+ SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
+ // PR2108
+ OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ OpVT,
+ SrcOp.getOperand(0)
+ .getOperand(0))));
+ }
+ }
+ }
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ OpVT, SrcOp)));
+}
+
+/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
+/// shuffles.
+static SDValue
+LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ MVT VT = SVOp->getValueType(0);
+
+ SmallVector<std::pair<int, int>, 8> Locs;
+ Locs.resize(4);
+ SmallVector<int, 8> Mask1(4U, -1);
+ SmallVector<int, 8> PermMask;
+ SVOp->getMask(PermMask);
+
+ unsigned NumHi = 0;
+ unsigned NumLo = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ int Idx = PermMask[i];
+ if (Idx < 0) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else {
+ assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+ if (Idx < 4) {
+ Locs[i] = std::make_pair(0, NumLo);
+ Mask1[NumLo] = Idx;
+ NumLo++;
+ } else {
+ Locs[i] = std::make_pair(1, NumHi);
+ if (2+NumHi < 4)
+ Mask1[2+NumHi] = Idx;
+ NumHi++;
+ }
+ }
+ }
+
+ if (NumLo <= 2 && NumHi <= 2) {
+ // If no more than two elements come from either vector, this can be
+ // implemented with two shuffles. The first shuffle gathers the elements;
+ // the second, which takes the first shuffle's result as both of its vector
+ // operands, puts the elements into the right order.
+ V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+ SmallVector<int, 8> Mask2(4U, -1);
+
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Locs[i].first == -1)
+ continue;
+ else {
+ unsigned Idx = (i < 2) ? 0 : 4;
+ Idx += Locs[i].first * 2 + Locs[i].second;
+ Mask2[i] = Idx;
+ }
+ }
+
+ return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
+ } else if (NumLo == 3 || NumHi == 3) {
+ // Otherwise, we must have three elements from one vector, call it X, and
+ // one element from the other, call it Y. First, use a shufps to build an
+ // intermediate vector with the one element from Y and the element from X
+ // that will be in the same half in the final destination (the indexes don't
+ // matter). Then, use a shufps to build the final vector, taking the half
+ // containing the element from Y from the intermediate, and the other half
+ // from X.
+ if (NumHi == 3) {
+ // Normalize it so the 3 elements come from V1.
+ CommuteVectorShuffleMask(PermMask, VT);
+ std::swap(V1, V2);
+ }
+
+ // Find the element from V2.
+ unsigned HiIndex;
+ for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+ int Val = PermMask[HiIndex];
+ if (Val < 0)
+ continue;
+ if (Val >= 4)
+ break;
+ }
+
+ Mask1[0] = PermMask[HiIndex];
+ Mask1[1] = -1;
+ Mask1[2] = PermMask[HiIndex^1];
+ Mask1[3] = -1;
+ V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+ if (HiIndex >= 2) {
+ Mask1[0] = PermMask[0];
+ Mask1[1] = PermMask[1];
+ Mask1[2] = HiIndex & 1 ? 6 : 4;
+ Mask1[3] = HiIndex & 1 ? 4 : 6;
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+ } else {
+ Mask1[0] = HiIndex & 1 ? 2 : 0;
+ Mask1[1] = HiIndex & 1 ? 0 : 2;
+ Mask1[2] = PermMask[2];
+ Mask1[3] = PermMask[3];
+ if (Mask1[2] >= 0)
+ Mask1[2] += 4;
+ if (Mask1[3] >= 0)
+ Mask1[3] += 4;
+ return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+ }
+ }
+
+ // Break it into (shuffle shuffle_hi, shuffle_lo).
+ Locs.clear();
+ SmallVector<int,8> LoMask(4U, -1);
+ SmallVector<int,8> HiMask(4U, -1);
+
+ SmallVector<int,8> *MaskPtr = &LoMask;
+ unsigned MaskIdx = 0;
+ unsigned LoIdx = 0;
+ unsigned HiIdx = 2;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (i == 2) {
+ MaskPtr = &HiMask;
+ MaskIdx = 1;
+ LoIdx = 0;
+ HiIdx = 2;
+ }
+ int Idx = PermMask[i];
+ if (Idx < 0) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else if (Idx < 4) {
+ Locs[i] = std::make_pair(MaskIdx, LoIdx);
+ (*MaskPtr)[LoIdx] = Idx;
+ LoIdx++;
+ } else {
+ Locs[i] = std::make_pair(MaskIdx, HiIdx);
+ (*MaskPtr)[HiIdx] = Idx;
+ HiIdx++;
+ }
+ }
+
+ SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+ SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+ SmallVector<int, 8> MaskOps;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Locs[i].first == -1) {
+ MaskOps.push_back(-1);
+ } else {
+ unsigned Idx = Locs[i].first * 4 + Locs[i].second;
+ MaskOps.push_back(Idx);
+ }
+ }
+ return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
+
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned NumElems = VT.getVectorNumElements();
+ bool isMMX = VT.getSizeInBits() == 64;
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsSplat = false;
+ bool V2IsSplat = false;
+
+ if (isZeroShuffle(SVOp))
+ return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+
+ // Promote splats to v4f32.
+ if (SVOp->isSplat()) {
+ if (isMMX || NumElems < 4)
+ return Op;
+ return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
+ }
+
+ // If the shuffle can be profitably rewritten as a narrower shuffle, then
+ // do it!
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode())
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ LowerVECTOR_SHUFFLE(NewOp, DAG));
+ } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ // FIXME: Figure out a cleaner way to do this.
+ // Try to make use of movq to zero out the top part.
+ if (ISD::isBuildVectorAllZeros(V2.getNode())) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode()) {
+ if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
+ return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
+ DAG, Subtarget, dl);
+ }
+ } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
+ return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
+ DAG, Subtarget, dl);
+ }
+ }
+
+ if (X86::isPSHUFDMask(SVOp))
+ return Op;
+
+ // Check if this can be converted into a logical shift.
+ bool isLeft = false;
+ unsigned ShAmt = 0;
+ SDValue ShVal;
+ bool isShift = getSubtarget()->hasSSE2() &&
+ isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ if (isShift && ShVal.hasOneUse()) {
+ // If the shifted value has multiple uses, it may be cheaper to use
+ // v_set0 + movlhps or movhlps, etc.
+ MVT EVT = VT.getVectorElementType();
+ ShAmt *= EVT.getSizeInBits();
+ return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+ }
+
+ if (X86::isMOVLMask(SVOp)) {
+ if (V1IsUndef)
+ return V2;
+ if (ISD::isBuildVectorAllZeros(V1.getNode()))
+ return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
+ if (!isMMX)
+ return Op;
+ }
+
+ // FIXME: fold these into legal mask.
+ if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
+ X86::isMOVSLDUPMask(SVOp) ||
+ X86::isMOVHLPSMask(SVOp) ||
+ X86::isMOVHPMask(SVOp) ||
+ X86::isMOVLPMask(SVOp)))
+ return Op;
+
+ if (ShouldXformToMOVHLPS(SVOp) ||
+ ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ if (isShift) {
+ // No better options. Use a vshl / vsrl.
+ MVT EVT = VT.getVectorElementType();
+ ShAmt *= EVT.getSizeInBits();
+ return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+ }
+
+ bool Commuted = false;
+ // FIXME: This should also accept a bitcast of a splat? Be careful, not
+ // 1,1,1,1 -> v8i16 though.
+ V1IsSplat = isSplatVector(V1.getNode());
+ V2IsSplat = isSplatVector(V2.getNode());
+
+ // Canonicalize the splat or undef, if present, to be on the RHS.
+ if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
+ Op = CommuteVectorShuffle(SVOp, DAG);
+ SVOp = cast<ShuffleVectorSDNode>(Op);
+ V1 = SVOp->getOperand(0);
+ V2 = SVOp->getOperand(1);
+ std::swap(V1IsSplat, V2IsSplat);
+ std::swap(V1IsUndef, V2IsUndef);
+ Commuted = true;
+ }
+
+ if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
+ // Shuffling the low element of V1 into an undef V2; just return V1.
+ if (V2IsUndef)
+ return V1;
+ // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+ // the instruction selector will not match, so get a canonical MOVL with
+ // swapped operands to undo the commute.
+ return getMOVL(DAG, dl, VT, V2, V1);
+ }
+
+ if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
+ X86::isUNPCKH_v_undef_Mask(SVOp) ||
+ X86::isUNPCKLMask(SVOp) ||
+ X86::isUNPCKHMask(SVOp))
+ return Op;
+
+ if (V2IsSplat) {
+ // Normalize the mask so all entries that point to V2 point to its first
+ // element, then try to match unpck{h|l} again. If a match is found, return
+ // a new vector_shuffle with the corrected mask.
+ SDValue NewMask = NormalizeMask(SVOp, DAG);
+ ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
+ if (NSVOp != SVOp) {
+ if (X86::isUNPCKLMask(NSVOp, true)) {
+ return NewMask;
+ } else if (X86::isUNPCKHMask(NSVOp, true)) {
+ return NewMask;
+ }
+ }
+ }
+
+ if (Commuted) {
+ // Commute it back and try unpck* again.
+ // FIXME: this seems wrong.
+ SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
+ ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
+ if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
+ X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
+ X86::isUNPCKLMask(NewSVOp) ||
+ X86::isUNPCKHMask(NewSVOp))
+ return NewOp;
+ }
+
+ // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
+
+ // Normalize the node to match x86 shuffle ops if needed
+ if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ // Check whether the shuffle mask is already legal; if so, return the op as is.
+ SmallVector<int, 16> PermMask;
+ SVOp->getMask(PermMask);
+ if (isShuffleMaskLegal(PermMask, VT))
+ return Op;
+
+ // Handle v8i16 specifically since SSE can do word extraction and insertion.
+ if (VT == MVT::v8i16) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ if (VT == MVT::v16i8) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ // Handle all 4 wide cases with a number of shuffles except for MMX.
+ if (NumElems == 4 && !isMMX)
+ return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ if (VT.getSizeInBits() == 8) {
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT.getSizeInBits() == 16) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::v4i32,
+ Op.getOperand(0)),
+ Op.getOperand(1)));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT == MVT::f32) {
+ // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+ // the result back to FR32 register. It's only worth matching if the
+ // result has a single use which is a store or a bitcast to i32. And in
+ // the case of a store, it's not worth it if the index is a constant 0,
+ // because a MOVSSmr can be used instead, which is smaller and faster.
+ if (!Op.hasOneUse())
+ return SDValue();
+ SDNode *User = *Op.getNode()->use_begin();
+ if ((User->getOpcode() != ISD::STORE ||
+ (isa<ConstantSDNode>(Op.getOperand(1)) &&
+ cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+ (User->getOpcode() != ISD::BIT_CONVERT ||
+ User->getValueType(0) != MVT::i32))
+ return SDValue();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
+ Op.getOperand(0)),
+ Op.getOperand(1));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
+ } else if (VT == MVT::i32) {
+ // ExtractPS works with constant index.
+ if (isa<ConstantSDNode>(Op.getOperand(1)))
+ return Op;
+ }
+ return SDValue();
+}
+
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+
+ if (Subtarget->hasSSE41()) {
+ SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+ if (Res.getNode())
+ return Res;
+ }
+
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ // TODO: handle v16i8.
+ if (VT.getSizeInBits() == 16) {
+ SDValue Vec = Op.getOperand(0);
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::v4i32, Vec),
+ Op.getOperand(1)));
+ // Transform it so it matches pextrw, which produces a 32-bit result.
+ MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT.getSizeInBits() == 32) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // SHUFPS the element to the lowest double word, then movss.
+ int Mask[4] = { Idx, -1, -1, -1 };
+ MVT VVT = Op.getOperand(0).getValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0));
+ } else if (VT.getSizeInBits() == 64) {
+ // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+ // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+ // to match extract_elt for f64.
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // UNPCKHPD the element to the lowest double word, then movsd.
+ // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+ // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+ int Mask[2] = { 1, -1 };
+ MVT VVT = Op.getOperand(0).getValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0));
+ }
+
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
+ MVT VT = Op.getValueType();
+ MVT EVT = VT.getVectorElementType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
+ isa<ConstantSDNode>(N2)) {
+ unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
+ : X86ISD::PINSRW;
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into these
+ // bits. For example (insert (extract, 3), 2) could be matched by putting
+ // the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // combine either bitwise AND or insert of float 0.0 to set these bits.
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+ } else if (EVT == MVT::i32) {
+ // InsertPS works with constant index.
+ if (isa<ConstantSDNode>(N2))
+ return Op;
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ MVT EVT = VT.getVectorElementType();
+
+ if (Subtarget->hasSSE41())
+ return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+ if (EVT == MVT::i8)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ if (EVT.getSizeInBits() == 16) {
+ // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
+ // as its second argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+ return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ if (Op.getValueType() == MVT::v2f32)
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
+ Op.getOperand(0))));
+
+ SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+ MVT VT = MVT::v2i32;
+ switch (Op.getValueType().getSimpleVT()) {
+ default: break;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ VT = MVT::v4i32;
+ break;
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing modes. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ // FIXME there isn't really any debug info here, should come from the parent
+ DebugLoc dl = CP->getDebugLoc();
+ SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+ CP->getAlignment());
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ int64_t Offset,
+ SelectionDAG &DAG) const {
+ bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+ bool ExtraLoadRequired =
+ Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false);
+
+ // Create the TargetGlobalAddress node, folding in the constant
+ // offset if it is legal.
+ SDValue Result;
+ if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) {
+ Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+ Offset = 0;
+ } else
+ Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0);
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (IsPic && !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+ Result);
+ }
+
+ // For Darwin & Mingw32, external and weak symbols are indirect, so we want to
+ // load the value at address GV, not the value of GV itself. This means that
+ // the GlobalAddress must be in the base or index register of the address, not
+ // the GV offset field. The platform check is inside the GVRequiresExtraLoad()
+ // call. The same applies for external symbols during PIC codegen.
+ if (ExtraLoadRequired)
+ Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+ PseudoSourceValue::getGOT(), 0);
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+ DAG.getConstant(Offset, getPointerTy()));
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+ return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+ SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg) {
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ DebugLoc dl = GA->getDebugLoc();
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+ GA->getValueType(0),
+ GA->getOffset());
+ if (InFlag) {
+ SDValue Ops[] = { Chain, TGA, *InFlag };
+ Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
+ } else {
+ SDValue Ops[] = { Chain, TGA };
+ Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
+ }
+ SDValue Flag = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const MVT PtrVT) {
+ SDValue InFlag;
+ DebugLoc dl = GA->getDebugLoc(); // FIXME: the function entry point might be a better location
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+
+ return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX);
+}
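+
+// For reference (not emitted here directly), the 32-bit general-dynamic
+// sequence that these nodes select to is roughly:
+//   leal x@TLSGD(,%ebx,1), %eax
+//   call ___tls_get_addr@PLT     ; returns the variable's address in %eax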
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const MVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
+// "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const MVT PtrVT, TLSModel::Model model,
+ bool is64Bit) {
+ DebugLoc dl = GA->getDebugLoc();
+ // Get the Thread Pointer
+ SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
+ DebugLoc::getUnknownLoc(), PtrVT,
+ DAG.getRegister(is64Bit? X86::FS : X86::GS,
+ MVT::i32));
+
+ SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
+ NULL, 0);
+
+ // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
+ // exec)
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+ GA->getValueType(0),
+ GA->getOffset());
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
+
+ if (model == TLSModel::InitialExec)
+ Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+ PseudoSourceValue::getGOT(), 0);
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
+ // TODO: implement the "local dynamic" model
+ // TODO: implement the "initial exec"model for pic executables
+ assert(Subtarget->isTargetELF() &&
+ "TLS not implemented for non-ELF targets");
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ GlobalValue *GV = GA->getGlobal();
+ TLSModel::Model model =
+ getTLSModel (GV, getTargetMachine().getRelocationModel());
+ if (Subtarget->is64Bit()) {
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ case TLSModel::LocalDynamic: // not implemented
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, true);
+ }
+ } else {
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ case TLSModel::LocalDynamic: // not implemented
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, false);
+ }
+ }
+ assert(0 && "Unreachable");
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
+ // FIXME there isn't really any debug info here
+ DebugLoc dl = Op.getDebugLoc();
+ const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ // FIXME there isn't really any debug info here
+ DebugLoc dl = JT->getDebugLoc();
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
+/// take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ MVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+ bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue Tmp1 = isSRA ?
+ DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, MVT::i8)) :
+ DAG.getConstant(0, VT);
+
+ SDValue Tmp2, Tmp3;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ } else {
+ Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+ }
+
+ SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits, MVT::i8));
+ SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
+ AndNode, DAG.getConstant(0, MVT::i8));
+
+ SDValue Hi, Lo;
+ SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+ SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ } else {
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ }
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
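+
+// As an illustrative sketch, i64 SHL_PARTS on 32-bit x86 selects to roughly:
+//   shld %cl, %lo, %hi     ; hi = (hi << cl) | (lo >> (32-cl))
+//   shl  %cl, %lo          ; lo <<= cl
+//   test $32, %cl          ; amounts >= 32 instead move lo into hi and
+//   (two cmov's)           ; zero lo, which the CMOV pair above expresses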
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ MVT SrcVT = Op.getOperand(0).getValueType();
+ assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+ "Unknown SINT_TO_FP to lower!");
+
+ // These are really Legal; return the operand so the caller accepts it as
+ // Legal.
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+ return Op;
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+ Subtarget->is64Bit()) {
+ return Op;
+ }
+
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Size = SrcVT.getSizeInBits()/8;
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0);
+ return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain,
+ SDValue StackSlot,
+ SelectionDAG &DAG) {
+ // Build the FILD
+ DebugLoc dl = Op.getDebugLoc();
+ SDVTList Tys;
+ bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+ if (useSSE)
+ Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
+ else
+ Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(StackSlot);
+ Ops.push_back(DAG.getValueType(SrcVT));
+ SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
+ Tys, &Ops[0], Ops.size());
+
+ if (useSSE) {
+ Chain = Result.getValue(1);
+ SDValue InFlag = Result.getValue(2);
+
+ // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+ // shouldn't be necessary except that RFP cannot be live across
+ // multiple blocks. When the stackifier is fixed, they can be uncoupled.
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ Tys = DAG.getVTList(MVT::Other);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Result);
+ Ops.push_back(StackSlot);
+ Ops.push_back(DAG.getValueType(Op.getValueType()));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size());
+ Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0);
+ }
+
+ return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
+ // This algorithm is not obvious. Here it is in C code, more or less:
+ /*
+ double uint64_to_double( uint32_t hi, uint32_t lo ) {
+ static const __m128i exp = { 0x4330000045300000ULL, 0 };
+ static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
+
+ // Copy ints to xmm registers.
+ __m128i xh = _mm_cvtsi32_si128( hi );
+ __m128i xl = _mm_cvtsi32_si128( lo );
+
+ // Combine into low half of a single xmm register.
+ __m128i x = _mm_unpacklo_epi32( xh, xl );
+ __m128d d;
+ double sd;
+
+ // Merge in appropriate exponents to give the integer bits the right
+ // magnitude.
+ x = _mm_unpacklo_epi32( x, exp );
+
+ // Subtract away the biases to deal with the IEEE-754 double precision
+ // implicit 1.
+ d = _mm_sub_pd( (__m128d) x, bias );
+
+ // All conversions up to here are exact. The correctly rounded result is
+ // calculated using the current rounding mode using the following
+ // horizontal add.
+ d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
+ _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this
+ // store doesn't really need to be here (except
+ // maybe to zero the other double)
+ return sd;
+ }
+ */
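+ // Note: 0x45300000 and 0x43300000 below are the high words of the doubles
+ // 0x1.0p84 and 0x1.0p52 (biased exponents 1023+84 = 0x453, 1023+52 = 0x433).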
+
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Build some magic constants.
+ std::vector<Constant*> CV0;
+ CV0.push_back(ConstantInt::get(APInt(32, 0x45300000)));
+ CV0.push_back(ConstantInt::get(APInt(32, 0x43300000)));
+ CV0.push_back(ConstantInt::get(APInt(32, 0)));
+ CV0.push_back(ConstantInt::get(APInt(32, 0)));
+ Constant *C0 = ConstantVector::get(CV0);
+ SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+
+ std::vector<Constant*> CV1;
+ CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL))));
+ CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL))));
+ Constant *C1 = ConstantVector::get(CV1);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+
+ SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(1)));
+ SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0)));
+ SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
+ SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
+ SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
+ SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+
+ // Add the halves; easiest way is to swap them into another reg first.
+ int ShufMask[2] = { 1, -1 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
+ DAG.getUNDEF(MVT::v2f64), ShufMask);
+ SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
+ DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // FP constant to bias correct the final result.
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+ MVT::f64);
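+
+ // The trick: the 32-bit payload is OR'd into the low mantissa bits of the
+ // double 2^52, which then represents 2^52 + x exactly, so subtracting the
+ // bias recovers (double)x with no rounding error.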
+
+ // Load the 32-bit value into an XMM register.
+ SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0)));
+
+ Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
+ DAG.getIntPtrConstant(0));
+
+ // Or the load with the bias.
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, Load)),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, Bias)));
+ Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
+ DAG.getIntPtrConstant(0));
+
+ // Subtract the bias.
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+ // Handle final rounding.
+ MVT DestVT = Op.getValueType();
+
+ if (DestVT.bitsLT(MVT::f64)) {
+ return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+ DAG.getIntPtrConstant(0));
+ } else if (DestVT.bitsGT(MVT::f64)) {
+ return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+ }
+
+ // Otherwise the result is already f64; no rounding is needed.
+ return Sub;
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
+ // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+ // the optimization here.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+ MVT SrcVT = N0.getValueType();
+ if (SrcVT == MVT::i64) {
+ // We only handle SSE2 f64 target here; caller can expand the rest.
+ if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
+ return SDValue();
+
+ return LowerUINT_TO_FP_i64(Op, DAG);
+ } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
+ return LowerUINT_TO_FP_i32(Op, DAG);
+ }
+
+ assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
+
+ // Make a 64-bit buffer, and use it to build an FILD.
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+ SDValue WordOff = DAG.getConstant(4, getPointerTy());
+ SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+ getPointerTy(), StackSlot, WordOff);
+ SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot, NULL, 0);
+ SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+ OffsetSlot, NULL, 0);
+ return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+}
+
+std::pair<SDValue,SDValue> X86TargetLowering::
+FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
+ DebugLoc dl = Op.getDebugLoc();
+
+ MVT DstTy = Op.getValueType();
+
+ if (!IsSigned) {
+ assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+ DstTy = MVT::i64;
+ }
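+ // (An unsigned i32 result is computed as a signed 64-bit FIST to memory;
+ // on little-endian x86 the caller then loads just the low 32 bits.)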
+
+ assert(DstTy.getSimpleVT() <= MVT::i64 &&
+ DstTy.getSimpleVT() >= MVT::i16 &&
+ "Unknown FP_TO_SINT to lower!");
+
+ // These are really Legal.
+ if (DstTy == MVT::i32 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+ if (Subtarget->is64Bit() &&
+ DstTy == MVT::i64 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+
+ // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
+ // stack slot.
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MemSize = DstTy.getSizeInBits()/8;
+ int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ unsigned Opc;
+ switch (DstTy.getSimpleVT()) {
+ default: assert(0 && "Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
+
+ SDValue Chain = DAG.getEntryNode();
+ SDValue Value = Op.getOperand(0);
+ if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
+ assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+ Chain = DAG.getStore(Chain, dl, Value, StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0);
+ SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+ SDValue Ops[] = {
+ Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
+ };
+ Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
+ Chain = Value.getValue(1);
+ SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+ StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ }
+
+ // Build the FP_TO_INT*_IN_MEM
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
+
+ return std::make_pair(FIST, StackSlot);
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+ if (FIST.getNode() == 0) return Op;
+
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ assert(FIST.getNode() && "Unexpected failure");
+
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+ MVT EltVT = VT;
+ if (VT.isVector())
+ EltVT = VT.getVectorElementType();
+ std::vector<Constant*> CV;
+ if (EltVT == MVT::f64) {
+ Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))));
+ CV.push_back(C);
+ CV.push_back(C);
+ } else {
+ Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31))));
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ }
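+ // Note: the mask is built as a whole vector constant even for scalar
+ // operands; the scalar forms simply load element 0 of the 16-byte entry.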
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+ MVT EltVT = VT;
+ unsigned EltNum = 1;
+ if (VT.isVector()) {
+ EltVT = VT.getVectorElementType();
+ EltNum = VT.getVectorNumElements();
+ }
+ std::vector<Constant*> CV;
+ if (EltVT == MVT::f64) {
+ Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63)));
+ CV.push_back(C);
+ CV.push_back(C);
+ } else {
+ Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31)));
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ if (VT.isVector()) {
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(ISD::XOR, dl, MVT::v2i64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
+ } else {
+ return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+ }
+}
+
+SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+ MVT SrcVT = Op1.getValueType();
+
+ // If second operand is smaller, extend it first.
+ if (SrcVT.bitsLT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+ SrcVT = VT;
+ }
+ // And if it is bigger, shrink it first.
+ if (SrcVT.bitsGT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+ SrcVT = VT;
+ }
+
+ // At this point the operands and the result should have the same
+ // type, and that won't be f80 since that is not custom lowered.
+
+ // First get the sign bit of second operand.
+ std::vector<Constant*> CV;
+ if (SrcVT == MVT::f64) {
+ CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
+ } else {
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+ // Shift sign bit right or left if the two operands have different types.
+ if (SrcVT.bitsGT(VT)) {
+ // Op0 is MVT::f32, Op1 is MVT::f64.
+ SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+ SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+ DAG.getConstant(32, MVT::i32));
+ SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
+ SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // Clear first operand sign bit.
+ CV.clear();
+ if (VT == MVT::f64) {
+ CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
+ } else {
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ }
+ C = ConstantVector::get(CV);
+ CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (X86CC) {
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ NeedOF = true;
+ break;
+ default: break;
+ }
+
+ // See if we can use the EFLAGS value from the operand instead of
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
+ unsigned Opcode = 0;
+ unsigned NumOperands = 0;
+ switch (Op.getNode()->getOpcode()) {
+ case ISD::ADD:
+ // Due to an isel shortcoming, be conservative if this add is likely to
+ // be selected as part of a load-modify-store instruction. When the root
+ // node in a match is a store, isel doesn't know how to remap non-chain
+ // non-flag uses of other nodes in the match, such as the ADD in this
+ // case. This leads to the ADD being left around and reselected, with
+ // the result being two adds in the output.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() == ISD::STORE)
+ goto default_case;
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+ // An add of one will be selected as an INC.
+ if (C->getAPIntValue() == 1) {
+ Opcode = X86ISD::INC;
+ NumOperands = 1;
+ break;
+ }
+ // An add of negative one (subtract of one) will be selected as a DEC.
+ if (C->getAPIntValue().isAllOnesValue()) {
+ Opcode = X86ISD::DEC;
+ NumOperands = 1;
+ break;
+ }
+ }
+ // Otherwise use a regular EFLAGS-setting add.
+ Opcode = X86ISD::ADD;
+ NumOperands = 2;
+ break;
+ case ISD::SUB:
+ // Due to the ISEL shortcoming noted above, be conservative if this sub is
+ // likely to be selected as part of a load-modify-store instruction.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() == ISD::STORE)
+ goto default_case;
+ // Otherwise use a regular EFLAGS-setting sub.
+ Opcode = X86ISD::SUB;
+ NumOperands = 2;
+ break;
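+ // The X86 arithmetic nodes below already produce EFLAGS as a second
+ // result (value #1), so that value can be reused directly.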
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ return SDValue(Op.getNode(), 1);
+ default:
+ default_case:
+ break;
+ }
+ if (Opcode != 0) {
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0; i != NumOperands; ++i)
+ Ops.push_back(Op.getOperand(i));
+ SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+ DAG.ReplaceAllUsesWith(Op, New);
+ return SDValue(New.getNode(), 1);
+ }
+ }
+
+ // Otherwise just emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, Op.getValueType()));
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
+ if (C->getAPIntValue() == 0)
+ return EmitTest(Op0, X86CC, DAG);
+
+ DebugLoc dl = Op0.getDebugLoc();
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
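+ // e.g. (seteq (and X, (shl 1, N)), 0) becomes (setcc COND_AE (bt X, N)),
+ // and the SETNE forms become (setcc COND_B (bt X, N)).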
+ if (Op0.getOpcode() == ISD::AND &&
+ Op0.hasOneUse() &&
+ Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ SDValue LHS, RHS;
+ if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *Op010C =
+ dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
+ if (Op010C->getZExtValue() == 1) {
+ LHS = Op0.getOperand(0);
+ RHS = Op0.getOperand(1).getOperand(1);
+ }
+ } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *Op000C =
+ dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
+ if (Op000C->getZExtValue() == 1) {
+ LHS = Op0.getOperand(1);
+ RHS = Op0.getOperand(0).getOperand(1);
+ }
+ } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
+ ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
+ SDValue AndLHS = Op0.getOperand(0);
+ if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
+ LHS = AndLHS.getOperand(0);
+ RHS = AndLHS.getOperand(1);
+ }
+ }
+
+ if (LHS.getNode()) {
+ // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
+ // instruction. Since the shift amount is in-range-or-undefined, we know
+ // that doing a bit test on the i32 value is ok. We extend to i32 because
+ // the encoding for the i16 version is larger than the i32 version.
+ if (LHS.getValueType() == MVT::i8)
+ LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (LHS.getValueType() != RHS.getValueType())
+ RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+
+ SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+ unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, MVT::i8), BT);
+ }
+ }
+
+ bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
+ unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+
+ SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), Cond);
+}
+
+SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+ SDValue Cond;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getValueType();
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (isFP) {
+ unsigned SSECC = 8;
+ MVT VT0 = Op0.getValueType();
+ assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
+ unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+ bool Swap = false;
+
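+ // SSECC is the cmpps/cmppd predicate immediate: 0=eq, 1=lt, 2=le, 3=unord,
+ // 4=neq, 5=nlt, 6=nle, 7=ord; 8 flags predicates with no single immediate.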
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOEQ:
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETOGT:
+ case ISD::SETGT: Swap = true; // Fallthrough
+ case ISD::SETLT:
+ case ISD::SETOLT: SSECC = 1; break;
+ case ISD::SETOGE:
+ case ISD::SETGE: Swap = true; // Fallthrough
+ case ISD::SETLE:
+ case ISD::SETOLE: SSECC = 2; break;
+ case ISD::SETUO: SSECC = 3; break;
+ case ISD::SETUNE:
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETULE: Swap = true;
+ case ISD::SETUGE: SSECC = 5; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: SSECC = 6; break;
+ case ISD::SETO: SSECC = 7; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // In the two special cases we can't handle, emit two comparisons.
+ if (SSECC == 8) {
+ if (SetCCOpcode == ISD::SETUEQ) {
+ SDValue UNORD, EQ;
+ UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
+ EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
+ }
+ else if (SetCCOpcode == ISD::SETONE) {
+ SDValue ORD, NEQ;
+ ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
+ NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
+ }
+ assert(0 && "Illegal FP comparison");
+ }
+ // Handle all other FP comparisons here.
+ return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
+ }
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
+ bool Swap = false, Invert = false, FlipSigns = false;
+
+ switch (VT.getSimpleVT()) {
+ default: break;
+ case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
+ case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
+ case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
+ case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETNE: Invert = true;
+ case ISD::SETEQ: Opc = EQOpc; break;
+ case ISD::SETLT: Swap = true;
+ case ISD::SETGT: Opc = GTOpc; break;
+ case ISD::SETGE: Swap = true;
+ case ISD::SETLE: Opc = GTOpc; Invert = true; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
+ case ISD::SETUGE: Swap = true;
+ case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
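+ // e.g. per i32 lane, X >u Y iff (X ^ 0x80000000) >s (Y ^ 0x80000000).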
+ if (FlipSigns) {
+ MVT EltVT = VT.getVectorElementType();
+ SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
+ EltVT);
+ std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
+ SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
+ SignBits.size());
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
+ }
+
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ return Result;
+}
+
+// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+ unsigned Opc = Op.getNode()->getOpcode();
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
+ return true;
+ if (Op.getResNo() == 1 &&
+ (Opc == X86ISD::ADD ||
+ Opc == X86ISD::SUB ||
+ Opc == X86ISD::SMUL ||
+ Opc == X86ISD::UMUL ||
+ Opc == X86ISD::INC ||
+ Opc == X86ISD::DEC))
+ return true;
+
+ return false;
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
+ bool addTest = true;
+ SDValue Cond = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC;
+
+ if (Cond.getOpcode() == ISD::SETCC)
+ Cond = LowerSETCC(Cond, DAG);
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ if (Cond.getOpcode() == X86ISD::SETCC) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ MVT VT = Op.getValueType();
+
+ bool IllegalFPCMov = false;
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+ if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+ Opc == X86ISD::BT) { // FIXME
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ }
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+ SmallVector<SDValue, 4> Ops;
+ // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+ // condition is true.
+ Ops.push_back(Op.getOperand(2));
+ Ops.push_back(Op.getOperand(1));
+ Ops.push_back(CC);
+ Ops.push_back(Cond);
+ return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size());
+}
+
+// isAndOrOfSetCCs - Return true if node is an ISD::AND or
+// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
+// from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+ Opc = Op.getOpcode();
+ if (Opc != ISD::OR && Opc != ISD::AND)
+ return false;
+ return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse() &&
+ Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(1).hasOneUse());
+}
+
+// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
+// 1, and the SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+ if (Op.getOpcode() != ISD::XOR)
+ return false;
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (N1C && N1C->getAPIntValue() == 1) {
+ return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse();
+ }
+ return false;
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
+ bool addTest = true;
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC;
+
+ if (Cond.getOpcode() == ISD::SETCC)
+ Cond = LowerSETCC(Cond, DAG);
+#if 0
+ // FIXME: LowerXALUO doesn't handle these!!
+ else if (Cond.getOpcode() == X86ISD::ADD ||
+ Cond.getOpcode() == X86ISD::SUB ||
+ Cond.getOpcode() == X86ISD::SMUL ||
+ Cond.getOpcode() == X86ISD::UMUL)
+ Cond = LowerXALUO(Cond, DAG);
+#endif
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ if (Cond.getOpcode() == X86ISD::SETCC) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+ if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+ Cond = Cmp;
+ addTest = false;
+ } else {
+ switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+ default: break;
+ case X86::COND_O:
+ case X86::COND_B:
+ // These can only come from an arithmetic instruction with overflow,
+ // e.g. SADDO, UADDO.
+ Cond = Cond.getNode()->getOperand(1);
+ addTest = false;
+ break;
+ }
+ }
+ } else {
+ unsigned CondOpc;
+ if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+ SDValue Cmp = Cond.getOperand(0).getOperand(1);
+ if (CondOpc == ISD::OR) {
+ // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp)) {
+ CC = Cond.getOperand(0).getOperand(0);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = Cond.getOperand(1).getOperand(0);
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else { // ISD::AND
+ // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp) &&
+ Op.getNode()->hasOneUse()) {
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
+ // Look for an unconditional branch following this conditional branch.
+ // We need it because the successors must be reversed in order
+ // to implement FCMP_OEQ.
+ if (User.getOpcode() == ISD::BR) {
+ SDValue FalseBB = User.getOperand(1);
+ SDValue NewBR =
+ DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
+ assert(NewBR == User);
+ Dest = FalseBB;
+
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ }
+ } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+ // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
+ // It should be transformed by the DAG combiner except when the condition
+ // is set by an arithmetic-with-overflow node.
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ Cond = Cond.getOperand(0).getOperand(1);
+ addTest = false;
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ }
+ return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cond);
+}
+
+
+// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4K
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// the correct sequence.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) {
+ assert(Subtarget->isTargetCygMing() &&
+ "This should be used only on Cygwin/Mingw targets");
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ // FIXME: Ensure alignment here
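+ // Note: the Cygwin/MinGW _alloca helper takes the request size in EAX and
+ // adjusts the stack pointer itself; the new SP is copied back out below.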
+
+ SDValue Flag;
+
+ MVT IntPtr = getPointerTy();
+ MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
+ Flag = Chain.getValue(1);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Ops[] = { Chain,
+ DAG.getTargetExternalSymbol("_alloca", IntPtr),
+ DAG.getRegister(X86::EAX, IntPtr),
+ DAG.getRegister(X86StackPtr, SPTy),
+ Flag };
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
+ Flag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(0, true),
+ DAG.getIntPtrConstant(0, true),
+ Flag);
+
+ Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+
+ SDValue Ops1[2] = { Chain.getValue(0), Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+}
+
+SDValue
+X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ const Value *DstSV,
+ uint64_t DstSVOff) {
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+ // If not DWORD aligned or size is more than the threshold, call the library.
+ // The libc version is likely to be faster for these cases. It can use the
+ // address value and run time information about the CPU.
+ if ((Align & 3) != 0 ||
+ !ConstantSize ||
+ ConstantSize->getZExtValue() >
+ getSubtarget()->getMaxInlineSizeThreshold()) {
+ SDValue InFlag(0, 0);
+
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+ if (const char *bzeroEntry = V &&
+ V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+ MVT IntPtr = getPointerTy();
+ const Type *IntPtrTy = TD->getIntPtrType();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+ std::pair<SDValue,SDValue> CallResult =
+ LowerCallTo(Chain, Type::VoidTy, false, false, false, false,
+ CallingConv::C, false,
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
+ return CallResult.second;
+ }
+
+ // Otherwise have the target-independent code call memset.
+ return SDValue();
+ }
+
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ SDValue InFlag(0, 0);
+ MVT AVT;
+ SDValue Count;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+ unsigned BytesLeft = 0;
+ bool TwoRepStos = false;
+ if (ValC) {
+ unsigned ValReg;
+ uint64_t Val = ValC->getZExtValue() & 255;
+
+ // If the value is a constant, then we can potentially use larger stores
+ // (stosw/stosd/stosq).
+ switch (Align & 3) {
+ case 2: // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ break;
+ case 0: // DWORD aligned
+ AVT = MVT::i32;
+ ValReg = X86::EAX;
+ Val = (Val << 8) | Val;
+ Val = (Val << 16) | Val;
+ if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ AVT = MVT::i64;
+ ValReg = X86::RAX;
+ Val = (Val << 32) | Val;
+ }
+ break;
+ default: // Byte aligned
+ AVT = MVT::i8;
+ ValReg = X86::AL;
+ Count = DAG.getIntPtrConstant(SizeVal);
+ break;
+ }
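+
+ // e.g. a constant byte 0xAB with DWORD alignment yields EAX = 0xABABABAB
+ // and a rep;stosd of SizeVal/4 iterations (tail bytes are handled below).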
+
+ if (AVT.bitsGT(MVT::i8)) {
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ Count = DAG.getIntPtrConstant(SizeVal / UBytes);
+ BytesLeft = SizeVal % UBytes;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ AVT = MVT::i8;
+ Count = DAG.getIntPtrConstant(SizeVal);
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(AVT));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
+
+ if (TwoRepStos) {
+ InFlag = Chain.getValue(1);
+ Count = Size;
+ MVT CVT = Count.getValueType();
+ SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
+ DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+ Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
+ X86::ECX,
+ Left, InFlag);
+ InFlag = Chain.getValue(1);
+ Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(MVT::i8));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
+ } else if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ MVT AddrVT = Dst.getValueType();
+ MVT SizeVT = Size.getValueType();
+
+ Chain = DAG.getMemset(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+ DAG.getConstant(Offset, AddrVT)),
+ Src,
+ DAG.getConstant(BytesLeft, SizeVT),
+ Align, DstSV, DstSVOff + Offset);
+ }
+
+ // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
+ return Chain;
+}
+
+SDValue
+X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff) {
+ // This requires the copy size to be a constant, preferably
+ // within a subtarget-specific limit.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
+ return SDValue();
+
+ // If not DWORD aligned, call the library.
+ if ((Align & 3) != 0)
+ return SDValue();
+
+ // DWORD aligned
+ MVT AVT = MVT::i32;
+ if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned
+ AVT = MVT::i64;
+
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ unsigned CountVal = SizeVal / UBytes;
+ SDValue Count = DAG.getIntPtrConstant(CountVal);
+ unsigned BytesLeft = SizeVal % UBytes;
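+ // e.g. a 25-byte copy on x86-64 with 8-byte alignment becomes a rep;movsq
+ // of 3 quadwords plus an inline 1-byte tail copy emitted below.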
+
+ SDValue InFlag(0, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
+ X86::ESI,
+ Src, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(AVT));
+ Ops.push_back(InFlag);
+ SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size());
+
+ SmallVector<SDValue, 4> Results;
+ Results.push_back(RepMovs);
+ if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ MVT DstVT = Dst.getValueType();
+ MVT SrcVT = Src.getValueType();
+ MVT SizeVT = Size.getValueType();
+ Results.push_back(DAG.getMemcpy(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+ DAG.getConstant(Offset, DstVT)),
+ DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+ DAG.getConstant(Offset, SrcVT)),
+ DAG.getConstant(BytesLeft, SizeVT),
+ Align, AlwaysInline,
+ DstSV, DstSVOff + Offset,
+ SrcSV, SrcSVOff + Offset));
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &Results[0], Results.size());
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (!Subtarget->is64Bit()) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+ return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
+ }
+
+ // __va_list_tag:
+ // gp_offset (0 - 6 * 8)
+ // fp_offset (48 - 48 + 8 * 16)
+ // overflow_arg_area (points to parameters passed in memory).
+ // reg_save_area
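+ // Field offsets are 0, 4, 8 and 16 within the 24-byte va_list, matching
+ // the pointer increments of 4, 4 and 8 used below.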
+ SmallVector<SDValue, 8> MemOps;
+ SDValue FIN = Op.getOperand(1);
+ // Store gp_offset
+ SDValue Store = DAG.getStore(Op.getOperand(0), dl,
+ DAG.getConstant(VarArgsGPOffset, MVT::i32),
+ FIN, SV, 0);
+ MemOps.push_back(Store);
+
+ // Store fp_offset
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(4));
+ Store = DAG.getStore(Op.getOperand(0), dl,
+ DAG.getConstant(VarArgsFPOffset, MVT::i32),
+ FIN, SV, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to overflow_arg_area
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(4));
+ SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to reg_save_area.
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(8));
+ SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
+ MemOps.push_back(Store);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
+ SDValue Chain = Op.getOperand(0);
+ SDValue SrcPtr = Op.getOperand(1);
+ SDValue SrcSV = Op.getOperand(2);
+
+ assert(0 && "VAArgInst is not yet implemented for x86-64!");
+ abort();
+ return SDValue();
+}
+
+SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+ SDValue Chain = Op.getOperand(0);
+ SDValue DstPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
+ DAG.getIntPtrConstant(24), 8, false,
+ DstSV, 0, SrcSV, 0);
+}
+
+SDValue
+X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ // Comparison intrinsics.
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd: {
+ unsigned Opc = 0;
+ ISD::CondCode CC = ISD::SETCC_INVALID;
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse2_comilt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse2_comile_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse2_comigt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse2_comige_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse2_comineq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETNE;
+ break;
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETNE;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+ SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), Cond);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ // Fix vector shift instructions where the last operand is a non-immediate
+ // i32 value.
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d: {
+ SDValue ShAmt = Op.getOperand(2);
+ if (isa<ConstantSDNode>(ShAmt))
+ return SDValue();
+
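+ // A constant shift amount matches the immediate-form patterns as-is;
+ // otherwise rewrite to the register form of the shift, which takes the
+ // amount in the low element of a vector register.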
+ unsigned NewIntNo = 0;
+ MVT ShAmtVT = MVT::v4i32;
+ switch (IntNo) {
+ case Intrinsic::x86_sse2_pslli_w:
+ NewIntNo = Intrinsic::x86_sse2_psll_w;
+ break;
+ case Intrinsic::x86_sse2_pslli_d:
+ NewIntNo = Intrinsic::x86_sse2_psll_d;
+ break;
+ case Intrinsic::x86_sse2_pslli_q:
+ NewIntNo = Intrinsic::x86_sse2_psll_q;
+ break;
+ case Intrinsic::x86_sse2_psrli_w:
+ NewIntNo = Intrinsic::x86_sse2_psrl_w;
+ break;
+ case Intrinsic::x86_sse2_psrli_d:
+ NewIntNo = Intrinsic::x86_sse2_psrl_d;
+ break;
+ case Intrinsic::x86_sse2_psrli_q:
+ NewIntNo = Intrinsic::x86_sse2_psrl_q;
+ break;
+ case Intrinsic::x86_sse2_psrai_w:
+ NewIntNo = Intrinsic::x86_sse2_psra_w;
+ break;
+ case Intrinsic::x86_sse2_psrai_d:
+ NewIntNo = Intrinsic::x86_sse2_psra_d;
+ break;
+ default: {
+ ShAmtVT = MVT::v2i32;
+ switch (IntNo) {
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntNo = Intrinsic::x86_mmx_psll_w;
+ break;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntNo = Intrinsic::x86_mmx_psll_d;
+ break;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntNo = Intrinsic::x86_mmx_psll_q;
+ break;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntNo = Intrinsic::x86_mmx_psrl_w;
+ break;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntNo = Intrinsic::x86_mmx_psrl_d;
+ break;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntNo = Intrinsic::x86_mmx_psrl_q;
+ break;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntNo = Intrinsic::x86_mmx_psra_w;
+ break;
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntNo = Intrinsic::x86_mmx_psra_d;
+ break;
+ default: abort(); // Can't reach here.
+ }
+ break;
+ }
+ }
+ MVT VT = Op.getValueType();
+ ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt));
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(NewIntNo, MVT::i32),
+ Op.getOperand(1), ShAmt);
+ }
+ }
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Depth > 0) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
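+ // FrameAddr is the frame pointer Depth levels up; the return address of
+ // that frame sits one pointer-size above the saved frame pointer.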
+ SDValue Offset =
+ DAG.getConstant(TD->getPointerSize(),
+ Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FrameAddr, Offset),
+ NULL, 0);
+ }
+
+ // Just load the return address.
+ SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ RetAddrFI, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
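+ // Each frame stores its caller's frame pointer at offset 0 from EBP/RBP,
+ // so each load walks one level up the chain of saved frame pointers.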
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
+ return FrameAddr;
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+ SelectionDAG &DAG) {
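+ // The first incoming argument lives just above the return address and
+ // the saved frame pointer, hence an offset of two pointers.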
+ return DAG.getIntPtrConstant(2*TD->getPointerSize());
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
+{
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
+ getPointerTy());
+ unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
+
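+ // Overwrite the return-address slot (one pointer above the saved frame
+ // pointer, adjusted by Offset) with the handler address.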
+ SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
+ DAG.getIntPtrConstant(-TD->getPointerSize()));
+ StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+ MF.getRegInfo().addLiveOut(StoreAddrReg);
+
+ return DAG.getNode(X86ISD::EH_RETURN, dl,
+ MVT::Other,
+ Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
+}
+
+SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) {
+ SDValue Root = Op.getOperand(0);
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+ DebugLoc dl = Op.getDebugLoc();
+
+ const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+ const X86InstrInfo *TII =
+ ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+
+ if (Subtarget->is64Bit()) {
+ SDValue OutChains[6];
+
+ // Large code-model.
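+ // The 23-byte trampoline emitted below is laid out as:
+ //  0: movabsq $<nested fn>, %r11   (REX.WB + opcode, 8-byte immediate)
+ // 10: movabsq $<nest value>, %r10  (REX.WB + opcode, 8-byte immediate)
+ // 20: jmpq *%r11                   (REX.WB + opcode + ModRM)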
+
+ const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r);
+ const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);
+
+ const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
+ const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
+
+ const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
+ // Load the pointer to the nested function into R11.
+ unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+ SDValue Addr = Trmp;
+ OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(2, MVT::i64));
+ OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2);
+
+ // Load the 'nest' parameter value into R10.
+ // R10 is specified in X86CallingConv.td
+ OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(10, MVT::i64));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 10);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(12, MVT::i64));
+ OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2);
+
+ // Jump to the nested function.
+ OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(20, MVT::i64));
+ OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 20);
+
+ unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(22, MVT::i64));
+ OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
+ TrmpAddr, 22);
+
+ SDValue Ops[] =
+ { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
+ return DAG.getMergeValues(Ops, 2, dl);
+ } else {
+ const Function *Func =
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+ unsigned CC = Func->getCallingConv();
+ unsigned NestReg;
+
+ switch (CC) {
+ default:
+ assert(0 && "Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::X86_StdCall: {
+ // Pass 'nest' parameter in ECX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::ECX;
+
+ // Check that ECX wasn't needed by an 'inreg' parameter.
+ const FunctionType *FTy = Func->getFunctionType();
+ const AttrListPtr &Attrs = Func->getAttributes();
+
+ if (!Attrs.isEmpty() && !Func->isVarArg()) {
+ unsigned InRegCount = 0;
+ unsigned Idx = 1;
+
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I, ++Idx)
+ if (Attrs.paramHasAttr(Idx, Attribute::InReg))
+ // FIXME: should only count parameters that are lowered to integers.
+ InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
+
+ if (InRegCount > 2) {
+ cerr << "Nest register in use - reduce number of inreg parameters!\n";
+ abort();
+ }
+ }
+ break;
+ }
+ case CallingConv::X86_FastCall:
+ case CallingConv::Fast:
+ // Pass 'nest' parameter in EAX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::EAX;
+ break;
+ }
+
+ SDValue OutChains[4];
+ SDValue Addr, Disp;
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(10, MVT::i32));
+ Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
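+ // The jmp at offset 5 is rel32, so its displacement is computed relative
+ // to the end of the 10-byte trampoline, i.e. Trmp + 10.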
+
+ const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
+ const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
+ OutChains[0] = DAG.getStore(Root, dl,
+ DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
+ Trmp, TrmpAddr, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(1, MVT::i32));
+ OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);
+
+ const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(5, MVT::i32));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
+ TrmpAddr, 5, false, 1);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(6, MVT::i32));
+ OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);
+
+ SDValue Ops[] =
+ { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
+ return DAG.getMergeValues(Ops, 2, dl);
+ }
+}
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
+ /*
+ The rounding mode is in bits 11:10 of the FP control word (FPCW), and
+ has the following settings:
+ settings:
+ 00 Round to nearest
+ 01 Round to -inf
+ 10 Round to +inf
+ 11 Round to 0
+
+ FLT_ROUNDS, on the other hand, expects the following:
+ -1 Undefined
+ 0 Round to 0
+ 1 Round to nearest
+ 2 Round to +inf
+ 3 Round to -inf
+
+ To perform the conversion, we do:
+ (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
+ */
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Save FP Control Word to stack slot
+ int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
+ DAG.getEntryNode(), StackSlot);
+
+ // Load FP Control Word from stack slot
+ SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);
+
+ // Transform as necessary
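+ // CWD1 moves bit 11 of the control word down to bit 0 and CWD2 moves
+ // bit 10 up to bit 1, swapping the rounding-control bits as the formula
+ // above requires.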
+ SDValue CWD1 =
+ DAG.getNode(ISD::SRL, dl, MVT::i16,
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ CWD, DAG.getConstant(0x800, MVT::i16)),
+ DAG.getConstant(11, MVT::i8));
+ SDValue CWD2 =
+ DAG.getNode(ISD::SRL, dl, MVT::i16,
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ CWD, DAG.getConstant(0x400, MVT::i16)),
+ DAG.getConstant(9, MVT::i8));
+
+ SDValue RetVal =
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ DAG.getNode(ISD::ADD, dl, MVT::i16,
+ DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
+ DAG.getConstant(1, MVT::i16)),
+ DAG.getConstant(3, MVT::i16));
+
+
+ return DAG.getNode((VT.getSizeInBits() < 16 ?
+ ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ MVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ // Zero extend to i32 since there is not an i8 bsr.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+ // If src is zero (i.e. bsr sets ZF), returns NumBits.
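+ // 2*NumBits-1 is chosen so that the final XOR with NumBits-1 below maps
+ // the zero-input case to NumBits (e.g. for i32: 63 ^ 31 == 32).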
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(Op);
+ Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT));
+ Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
+ Ops.push_back(Op.getValue(1));
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
+
+ // Finally xor with NumBits-1.
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ MVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsf (scan bits forward) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+
+ // If src is zero (i.e. bsf sets ZF), returns NumBits.
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(Op);
+ Ops.push_back(DAG.getConstant(NumBits, OpVT));
+ Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
+ Ops.push_back(Op.getValue(1));
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
+ DebugLoc dl = Op.getDebugLoc();
+
+ // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
+ // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
+ // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
+ // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
+ // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
+ //
+ // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
+ // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
+ // return AloBlo + AloBhi + AhiBlo;
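+ //
+ // pmuludq multiplies the low 32 bits of each 64-bit lane, so the full
+ // 64-bit product is lo*lo plus the two 32-bit cross terms shifted up by
+ // 32; the hi*hi term would land above bit 63 and is dropped.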
+
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ A, DAG.getConstant(32, MVT::i32));
+ SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ B, DAG.getConstant(32, MVT::i32));
+ SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ A, B);
+ SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ A, Bhi);
+ SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ Ahi, B);
+ AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ AloBhi, DAG.getConstant(32, MVT::i32));
+ AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ AhiBlo, DAG.getConstant(32, MVT::i32));
+ SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+ Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+ return Res;
+}
+
+
+SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+ // Lower the "add/sub/mul with overflow" instruction into a regular
+ // instruction plus a "setcc" instruction that checks the overflow flag.
+ // The "brcond" lowering looks for this combo and may remove the "setcc"
+ // instruction if the "setcc" has only one use.
+ SDNode *N = Op.getNode();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned BaseOp = 0;
+ unsigned Cond = 0;
+ DebugLoc dl = Op.getDebugLoc();
+
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Unknown ovf instruction!");
+ case ISD::SADDO:
+ // An add of one will be selected as an INC. Note that INC doesn't
+ // set CF, so we can't do this for UADDO.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->getAPIntValue() == 1) {
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UADDO:
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SSUBO:
+ // A subtract of one will be selected as a DEC. Note that DEC doesn't
+ // set CF, so we can't do this for USUBO.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->getAPIntValue() == 1) {
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_O;
+ break;
+ case ISD::USUBO:
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SMULO:
+ BaseOp = X86ISD::SMUL;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UMULO:
+ BaseOp = X86ISD::UMUL;
+ Cond = X86::COND_B;
+ break;
+ }
+
+ // Also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
+
+ SDValue SetCC =
+ DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
+ DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
+ return Sum;
+}
+
+SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
+ MVT T = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Reg = 0;
+ unsigned size = 0;
+ switch(T.getSimpleVT()) {
+ default:
+ assert(false && "Invalid value type!");
+ case MVT::i8: Reg = X86::AL; size = 1; break;
+ case MVT::i16: Reg = X86::AX; size = 2; break;
+ case MVT::i32: Reg = X86::EAX; size = 4; break;
+ case MVT::i64:
+ assert(Subtarget->is64Bit() && "Node not type legal!");
+ Reg = X86::RAX; size = 8;
+ break;
+ }
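+ // cmpxchg implicitly compares against and writes back the accumulator
+ // (AL/AX/EAX/RAX), so the expected value is pinned to that register.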
+ SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
+ Op.getOperand(2), SDValue());
+ SDValue Ops[] = { cpIn.getValue(0),
+ Op.getOperand(1),
+ Op.getOperand(3),
+ DAG.getTargetConstant(size, MVT::i8),
+ cpIn.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
+ SDValue cpOut =
+ DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
+ return cpOut;
+}
+
+SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
+ SelectionDAG &DAG) {
+ assert(Subtarget->is64Bit() && "Result not type legalized?");
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue TheChain = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+ SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
+ SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
+ rax.getValue(2));
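+ // RDTSC leaves the counter in EDX:EAX; on x86-64 both halves arrive
+ // zero-extended in RAX and RDX, so shift RDX up and OR the halves.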
+ SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
+ DAG.getConstant(32, MVT::i8));
+ SDValue Ops[] = {
+ DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
+ rdx.getValue(1)
+ };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ MVT T = Node->getValueType(0);
+ SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
+ DAG.getConstant(0, T), Node->getOperand(2));
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), negOp,
+ cast<AtomicSDNode>(Node)->getSrcValue(),
+ cast<AtomicSDNode>(Node)->getAlignment());
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Should not custom lower this!");
+ case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
+ case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerShift(Op, DAG);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FABS: return LowerFABS(Op, DAG);
+ case ISD::FNEG: return LowerFNEG(Op, DAG);
+ case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::VSETCC: return LowerVSETCC(Op, DAG);
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::CALL: return LowerCALL(Op, DAG);
+ case ISD::RET: return LowerRET(Op, DAG);
+ case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::CTLZ: return LowerCTLZ(Op, DAG);
+ case ISD::CTTZ: return LowerCTTZ(Op, DAG);
+ case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO: return LowerXALUO(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
+ }
+}
+
+void X86TargetLowering::
+ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG, unsigned NewOp) {
+ MVT T = Node->getValueType(0);
+ DebugLoc dl = Node->getDebugLoc();
+ assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+
+ SDValue Chain = Node->getOperand(0);
+ SDValue In1 = Node->getOperand(1);
+ SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(0));
+ SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(1));
+ // This is a generalized SDNode, not an AtomicSDNode, so it doesn't
+ // have a MemOperand. Pass the info through as a normal operand.
+ SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand());
+ SDValue Ops[] = { Chain, In1, In2L, In2H, LSI };
+ SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+ SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5);
+ SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(Result.getValue(2));
+}
+
+/// ReplaceNodeResults - Replace a node with an illegal result type
+/// with a new node built out of custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = N->getDebugLoc();
+ switch (N->getOpcode()) {
+ default:
+ assert(false && "Do not know how to custom type legalize this operation!");
+ return;
+ case ISD::FP_TO_SINT: {
+ std::pair<SDValue,SDValue> Vals =
+ FP_TO_INTHelper(SDValue(N, 0), DAG, true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ if (FIST.getNode() != 0) {
+ MVT VT = N->getValueType(0);
+ // Return a load from the stack slot.
+ Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
+ }
+ return;
+ }
+ case ISD::READCYCLECOUNTER: {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue TheChain = N->getOperand(0);
+ SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+ SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
+ rd.getValue(1));
+ SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
+ eax.getValue(2));
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { eax, edx };
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
+ Results.push_back(edx.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_CMP_SWAP: {
+ MVT T = N->getValueType(0);
+ assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
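+ // LCMPXCHG8B expects the comparand in EDX:EAX and the new value in
+ // ECX:EBX, so split both i64 operands into 32-bit halves and pin them
+ // to those registers.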
+ SDValue cpInL, cpInH;
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
+ DAG.getConstant(0, MVT::i32));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
+ DAG.getConstant(1, MVT::i32));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
+ cpInL.getValue(1));
+ SDValue swapInL, swapInH;
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
+ DAG.getConstant(0, MVT::i32));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
+ DAG.getConstant(1, MVT::i32));
+ swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
+ cpInH.getValue(1));
+ swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
+ swapInL.getValue(1));
+ SDValue Ops[] = { swapInH.getValue(0),
+ N->getOperand(1),
+ swapInH.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
+ MVT::i32, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
+ MVT::i32, cpOutL.getValue(2));
+ SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(cpOutH.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_LOAD_ADD:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_AND:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_NAND:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_OR:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_SUB:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_XOR:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
+ return;
+ case ISD::ATOMIC_SWAP:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
+ return;
+ }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return NULL;
+ case X86ISD::BSF: return "X86ISD::BSF";
+ case X86ISD::BSR: return "X86ISD::BSR";
+ case X86ISD::SHLD: return "X86ISD::SHLD";
+ case X86ISD::SHRD: return "X86ISD::SHRD";
+ case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FOR: return "X86ISD::FOR";
+ case X86ISD::FXOR: return "X86ISD::FXOR";
+ case X86ISD::FSRL: return "X86ISD::FSRL";
+ case X86ISD::FILD: return "X86ISD::FILD";
+ case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
+ case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+ case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+ case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FLD: return "X86ISD::FLD";
+ case X86ISD::FST: return "X86ISD::FST";
+ case X86ISD::CALL: return "X86ISD::CALL";
+ case X86ISD::TAILCALL: return "X86ISD::TAILCALL";
+ case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::BT: return "X86ISD::BT";
+ case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::COMI: return "X86ISD::COMI";
+ case X86ISD::UCOMI: return "X86ISD::UCOMI";
+ case X86ISD::SETCC: return "X86ISD::SETCC";
+ case X86ISD::CMOV: return "X86ISD::CMOV";
+ case X86ISD::BRCOND: return "X86ISD::BRCOND";
+ case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
+ case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
+ case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
+ case X86ISD::Wrapper: return "X86ISD::Wrapper";
+ case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
+ case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
+ case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
+ case X86ISD::PINSRB: return "X86ISD::PINSRB";
+ case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
+ case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
+ case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
+ case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
+ case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
+ case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
+ case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
+ case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
+ case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
+ case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
+ case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
+ case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
+ case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VSHL: return "X86ISD::VSHL";
+ case X86ISD::VSRL: return "X86ISD::VSRL";
+ case X86ISD::CMPPD: return "X86ISD::CMPPD";
+ case X86ISD::CMPPS: return "X86ISD::CMPPS";
+ case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
+ case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
+ case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
+ case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
+ case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
+ case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
+ case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
+ case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
+ case X86ISD::ADD: return "X86ISD::ADD";
+ case X86ISD::SUB: return "X86ISD::SUB";
+ case X86ISD::SMUL: return "X86ISD::SMUL";
+ case X86ISD::UMUL: return "X86ISD::UMUL";
+ case X86ISD::INC: return "X86ISD::INC";
+ case X86ISD::DEC: return "X86ISD::DEC";
+ case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const {
+ // X86 supports extremely general addressing modes.
+
+ // X86 allows a sign-extended 32-bit immediate field as a displacement.
+ if (AM.BaseOffs < -(1LL << 31) || AM.BaseOffs > (1LL << 31) - 1)
+ return false;
+
+ if (AM.BaseGV) {
+ // We can only fold this if we don't need an extra load.
+ if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
+ return false;
+ // If BaseGV requires a register, we cannot also have a BaseReg.
+ if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) &&
+ AM.HasBaseReg)
+ return false;
+
+ // X86-64 only supports addr of globals in small code model.
+ if (Subtarget->is64Bit()) {
+ if (getTargetMachine().getCodeModel() != CodeModel::Small)
+ return false;
+ // Globals are addressed RIP-relatively here, which leaves no room
+ // for an extra base offset or a scaled index register.
+ if (AM.BaseOffs || AM.Scale > 1)
+ return false;
+ }
+ }
+
+ switch (AM.Scale) {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ // These scales always work.
+ break;
+ case 3:
+ case 5:
+ case 9:
+ // These scales are formed with basereg+scalereg. Only accept if there is
+ // no basereg yet.
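+ // (e.g. a scale of 3 is encoded as [Reg + 2*Reg], which occupies the
+ // base-register slot).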
+ if (AM.HasBaseReg)
+ return false;
+ break;
+ default: // Other stuff never works.
+ return false;
+ }
+
+ return true;
+}
+
+
+bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+ if (!Ty1->isInteger() || !Ty2->isInteger())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return Subtarget->is64Bit() || NumBits1 < 64;
+}
+
+bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return Subtarget->is64Bit() || NumBits1 < 64;
+}
+
+bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return Ty1 == Type::Int32Ty && Ty2 == Type::Int64Ty && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
+ // i16 instructions are longer (0x66 prefix) and potentially slower.
+ return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ MVT VT) const {
+ // Only do shuffles on 128-bit vector types for now.
+ if (VT.getSizeInBits() == 64)
+ return false;
+
+ // FIXME: pshufb, blends, palignr, shifts.
+ return (VT.getVectorNumElements() == 2 ||
+ ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isMOVLMask(M, VT) ||
+ isSHUFPMask(M, VT) ||
+ isPSHUFDMask(M, VT) ||
+ isPSHUFHWMask(M, VT) ||
+ isPSHUFLWMask(M, VT) ||
+ isUNPCKLMask(M, VT) ||
+ isUNPCKHMask(M, VT) ||
+ isUNPCKL_v_undef_Mask(M, VT) ||
+ isUNPCKH_v_undef_Mask(M, VT));
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ MVT VT) const {
+ unsigned NumElts = VT.getVectorNumElements();
+ // FIXME: This collection of masks seems suspect.
+ if (NumElts == 2)
+ return true;
+ if (NumElts == 4 && VT.getSizeInBits() == 128) {
+ return (isMOVLMask(Mask, VT) ||
+ isCommutedMOVLMask(Mask, VT, true) ||
+ isSHUFPMask(Mask, VT) ||
+ isCommutedSHUFPMask(Mask, VT));
+ }
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
+ MachineBasicBlock *MBB,
+ unsigned regOpc,
+ unsigned immOpc,
+ unsigned LoadOpc,
+ unsigned CXchgOpc,
+ unsigned copyOpc,
+ unsigned notOpc,
+ unsigned EAXreg,
+ TargetRegisterClass *RC,
+ bool invSrc) const {
+ // For the atomic bitwise operator, we generate
+ // thisMBB:
+ // newMBB:
+ // ld t1 = [bitinstr.addr]
+ // op t2 = t1, [bitinstr.val]
+ // mov EAX = t1
+ // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
+ // bz newMBB
+ // fallthrough -->nextMBB
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+ /// First build the CFG
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+ // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+ // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ // Insert instructions into newMBB based on incoming instruction
+ assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+ "unexpected number of operands");
+ DebugLoc dl = bInstr->getDebugLoc();
+ MachineOperand& destOper = bInstr->getOperand(0);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ int numArgs = bInstr->getNumOperands() - 1;
+ for (int i=0; i < numArgs; ++i)
+ argOpers[i] = &bInstr->getOperand(i+1);
+
+ // An x86 address has 5 operands: base, scale, index, displacement, and
+ // segment
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
+ int valArgIndx = lastAddrIndx + 1;
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
+ MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ unsigned tt = F->getRegInfo().createVirtualRegister(RC);
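+ // When invSrc is set (used by the ATOMNAND pseudos), complement the
+ // loaded value first so the bitwise op below sees ~t1.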
+ if (invSrc) {
+ MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
+ }
+ else
+ tt = t1;
+
+ unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+ if (argOpers[valArgIndx]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
+ MIB.addReg(tt);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
+ MIB.addReg(t1);
+
+ MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MIB.addReg(t2);
+ assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperands");
+ (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
+ MIB.addReg(EAXreg);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+// private utility function: 64 bit atomics on 32 bit host.
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
+ MachineBasicBlock *MBB,
+ unsigned regOpcL,
+ unsigned regOpcH,
+ unsigned immOpcL,
+ unsigned immOpcH,
+ bool invSrc) const {
+ // For the atomic bitwise operator, we generate
+ // thisMBB (instructions are in pairs, except cmpxchg8b)
+ // ld t1,t2 = [bitinstr.addr]
+ // newMBB:
+ // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
+ // op t5, t6 <- out1, out2, [bitinstr.val]
+ // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
+ // mov ECX, EBX <- t5, t6
+ // mov EAX, EDX <- t1, t2
+ // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
+ // mov t3, t4 <- EAX, EDX
+ // bz newMBB
+ // result in out1, out2
+ // fallthrough -->nextMBB
+
+ const TargetRegisterClass *RC = X86::GR32RegisterClass;
+ const unsigned LoadOpc = X86::MOV32rm;
+ const unsigned copyOpc = X86::MOV32rr;
+ const unsigned NotOpc = X86::NOT32r;
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+ /// First build the CFG
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+ // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+ // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ DebugLoc dl = bInstr->getDebugLoc();
+ // Insert instructions into newMBB based on incoming instruction
+ // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
+ assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
+ "unexpected number of operands");
+ MachineOperand& dest1Oper = bInstr->getOperand(0);
+ MachineOperand& dest2Oper = bInstr->getOperand(1);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ for (int i=0; i < 2 + X86AddrNumOperands; ++i)
+ argOpers[i] = &bInstr->getOperand(i+2);
+
+ // An x86 address has 5 operands: base, scale, index, displacement, and
+ // segment
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
+ MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
+ MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
+ // The high half lives 4 bytes above the low half: reuse the address
+ // operands but add 4 to the displacement.
+ for (int i=0; i <= lastAddrIndx-2; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MachineOperand newOp3 = *(argOpers[3]);
+ if (newOp3.isImm())
+ newOp3.setImm(newOp3.getImm()+4);
+ else
+ newOp3.setOffset(newOp3.getOffset()+4);
+ (*MIB).addOperand(newOp3);
+ (*MIB).addOperand(*argOpers[lastAddrIndx]);
+
+ // t3/4 are defined later, at the bottom of the loop
+ unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
+ unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
+ BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
+ .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
+ BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
+ .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
+
+ unsigned tt1 = F->getRegInfo().createVirtualRegister(RC);
+ unsigned tt2 = F->getRegInfo().createVirtualRegister(RC);
+ if (invSrc) {
+ MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1);
+ MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2);
+ } else {
+ tt1 = t1;
+ tt2 = t2;
+ }
+
+ int valArgIndx = lastAddrIndx + 1;
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+ unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
+ unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
+ if (argOpers[valArgIndx]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
+ if (regOpcL != X86::MOV32rr)
+ MIB.addReg(tt1);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+ assert(argOpers[valArgIndx + 1]->isReg() ==
+ argOpers[valArgIndx]->isReg());
+ assert(argOpers[valArgIndx + 1]->isImm() ==
+ argOpers[valArgIndx]->isImm());
+ if (argOpers[valArgIndx + 1]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
+ if (regOpcH != X86::MOV32rr)
+ MIB.addReg(tt2);
+ (*MIB).addOperand(*argOpers[valArgIndx + 1]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
+ MIB.addReg(t1);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
+ MIB.addReg(t2);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
+ MIB.addReg(t5);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
+ MIB.addReg(t6);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperands");
+ (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
+ MIB.addReg(X86::EAX);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
+ MIB.addReg(X86::EDX);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
+ MachineBasicBlock *MBB,
+ unsigned cmovOpc) const {
+ // For the atomic min/max operator, we generate
+ // thisMBB:
+ // newMBB:
+ // ld t1 = [min/max.addr]
+ // mov t2 = [min/max.val]
+ // cmp t1, t2
+ // cmov[cond] t2 = t1
+ // mov EAX = t1
+ // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
+ // bz newMBB
+ // fallthrough -->nextMBB
+ //
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+ /// First build the CFG
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+ // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+ // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ DebugLoc dl = mInstr->getDebugLoc();
+ // Insert instructions into newMBB based on incoming instruction
+ assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+ "unexpected number of operands");
+ MachineOperand& destOper = mInstr->getOperand(0);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ int numArgs = mInstr->getNumOperands() - 1;
+ for (int i=0; i < numArgs; ++i)
+ argOpers[i] = &mInstr->getOperand(i+1);
+
+ // An x86 address has 5 operands: base, scale, index, displacement, and
+ // segment
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
+ int valArgIndx = lastAddrIndx + 1;
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ // We only support register and immediate values
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+
+ unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ if (argOpers[valArgIndx]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
+ else
+ // Use the move-immediate form for an immediate operand.
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
+ MIB.addReg(t1);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
+ MIB.addReg(t1);
+ MIB.addReg(t2);
+
+ // Generate movc
+ unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
+ MIB.addReg(t2);
+ MIB.addReg(t1);
+
+ // Cmp and exchange if none has modified the memory location
+ MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MIB.addReg(t3);
+ assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperands");
+ (*MIB).addMemOperand(*F, *mInstr->memoperands_begin());
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
+ MIB.addReg(X86::EAX);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ switch (MI->getOpcode()) {
+ default: assert(false && "Unexpected instr type to insert");
+ case X86::CMOV_V1I64:
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64: {
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ unsigned Opc =
+ X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ sinkMBB->transferSuccessors(BB);
+
+ // Add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
+
+ case X86::FP32_TO_INT16_IN_MEM:
+ case X86::FP32_TO_INT32_IN_MEM:
+ case X86::FP32_TO_INT64_IN_MEM:
+ case X86::FP64_TO_INT16_IN_MEM:
+ case X86::FP64_TO_INT32_IN_MEM:
+ case X86::FP64_TO_INT64_IN_MEM:
+ case X86::FP80_TO_INT16_IN_MEM:
+ case X86::FP80_TO_INT32_IN_MEM:
+ case X86::FP80_TO_INT64_IN_MEM: {
+ // Change the floating point control register to use "round towards zero"
+ // mode when truncating to an integer value.
+ MachineFunction *F = BB->getParent();
+ int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+ // Load the old value of the control word so it can be restored later...
+ unsigned OldCW =
+ F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW),
+ CWFrameIdx);
+
+ // Force the rounding-control bits to round toward zero...
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx)
+ .addImm(0xC7F);
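+ // 0xC7F keeps the exception-mask bits set and forces the rounding-control
+ // field (bits 11:10) to 11, i.e. round toward zero.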
+
+ // Reload the modified control word now...
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ // Restore the memory image of the control word to its original value
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx)
+ .addReg(OldCW);
+
+ // Get the X86 opcode to use.
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: assert(0 && "illegal opcode!");
+ case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+ case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+ case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+ case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+ case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+ case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+ case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+ case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+ case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+ }
+
+ X86AddressMode AM;
+ MachineOperand &Op = MI->getOperand(0);
+ if (Op.isReg()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op.getReg();
+ } else {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op.getIndex();
+ }
+ Op = MI->getOperand(1);
+ if (Op.isImm())
+ AM.Scale = Op.getImm();
+ Op = MI->getOperand(2);
+ if (Op.isImm())
+ AM.IndexReg = Op.getImm();
+ Op = MI->getOperand(3);
+ if (Op.isGlobal()) {
+ AM.GV = Op.getGlobal();
+ } else {
+ AM.Disp = Op.getImm();
+ }
+ addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM)
+ .addReg(MI->getOperand(X86AddrNumOperands).getReg());
+
+ // Reload the original control word now.
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
+ case X86::ATOMAND32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+ X86::AND32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMOR32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
+ X86::OR32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMXOR32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
+ X86::XOR32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMNAND32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+ X86::AND32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass, true);
+ case X86::ATOMMIN32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
+ case X86::ATOMMAX32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
+ case X86::ATOMUMIN32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
+ case X86::ATOMUMAX32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+
+ case X86::ATOMAND16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
+ X86::AND16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMOR16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
+ X86::OR16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMXOR16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
+ X86::XOR16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMNAND16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
+ X86::AND16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass, true);
+ case X86::ATOMMIN16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
+ case X86::ATOMMAX16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
+ case X86::ATOMUMIN16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
+ case X86::ATOMUMAX16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
+
+ case X86::ATOMAND8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
+ X86::AND8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMOR8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
+ X86::OR8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMXOR8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
+ X86::XOR8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMNAND8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
+ X86::AND8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass, true);
+ // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+  // This group is for a 64-bit host.
+ case X86::ATOMAND64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
+ X86::AND64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMOR64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
+ X86::OR64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMXOR64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
+ X86::XOR64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMNAND64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
+ X86::AND64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass, true);
+ case X86::ATOMMIN64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
+ case X86::ATOMMAX64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
+ case X86::ATOMUMIN64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
+ case X86::ATOMUMAX64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+
+ // This group does 64-bit operations on a 32-bit host.
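+  // Each 64-bit op is split into 32-bit ops on the low and high halves;
+  // e.g. ATOMADD6432 uses ADD32rr for the low half and ADC32rr to carry
+  // into the high half.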
+ case X86::ATOMAND6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::AND32rr, X86::AND32rr,
+ X86::AND32ri, X86::AND32ri,
+ false);
+ case X86::ATOMOR6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::OR32rr, X86::OR32rr,
+ X86::OR32ri, X86::OR32ri,
+ false);
+ case X86::ATOMXOR6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::XOR32rr, X86::XOR32rr,
+ X86::XOR32ri, X86::XOR32ri,
+ false);
+ case X86::ATOMNAND6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::AND32rr, X86::AND32rr,
+ X86::AND32ri, X86::AND32ri,
+ true);
+ case X86::ATOMADD6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::ADD32rr, X86::ADC32rr,
+ X86::ADD32ri, X86::ADC32ri,
+ false);
+ case X86::ATOMSUB6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::SUB32rr, X86::SBB32rr,
+ X86::SUB32ri, X86::SBB32ri,
+ false);
+ case X86::ATOMSWAP6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::MOV32rr, X86::MOV32rr,
+ X86::MOV32ri, X86::MOV32ri,
+ false);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned Opc = Op.getOpcode();
+ assert((Opc >= ISD::BUILTIN_OP_END ||
+ Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN ||
+ Opc == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+
+ KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything.
+ switch (Opc) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ // These nodes' second result is a boolean.
+ if (Op.getResNo() == 0)
+ break;
+ // Fallthrough
+ case X86ISD::SETCC:
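+    // SETCC materializes only 0 or 1, so every bit above bit 0 is known zero.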
+ KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
+ Mask.getBitWidth() - 1);
+ break;
+ }
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+ GlobalValue* &GA, int64_t &Offset) const{
+ if (N->getOpcode() == X86ISD::Wrapper) {
+ if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+ GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+ Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+ return true;
+ }
+ }
+ return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
+static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
+ const TargetLowering &TLI) {
+ GlobalValue *GV;
+ int64_t Offset = 0;
+ if (TLI.isGAPlusOffset(Base, GV, Offset))
+ return (GV->getAlignment() >= N && (Offset % N) == 0);
+ // DAG combine handles the stack object case.
+ return false;
+}
+
+static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
+ MVT EVT, SDNode *&Base,
+ SelectionDAG &DAG, MachineFrameInfo *MFI,
+ const TargetLowering &TLI) {
+ Base = NULL;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (N->getMaskElt(i) < 0) {
+ if (!Base)
+ return false;
+ continue;
+ }
+
+ SDValue Elt = DAG.getShuffleScalarElt(N, i);
+ if (!Elt.getNode() ||
+ (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+ return false;
+ if (!Base) {
+ Base = Elt.getNode();
+ if (Base->getOpcode() == ISD::UNDEF)
+ return false;
+ continue;
+ }
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+
+ if (!TLI.isConsecutiveLoad(Elt.getNode(), Base,
+ EVT.getSizeInBits()/8, i, MFI))
+ return false;
+ }
+ return true;
+}
+
+/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
+/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
+/// if the load addresses are consecutive, non-overlapping, and in the right
+/// order. In the case of v2i64, it will see if it can rewrite the
+/// shuffle to be an appropriate build vector so it can take advantage of
+/// PerformBuildVectorCombine.
+static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT VT = N->getValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // For x86-32 machines, if we see an insert and then a shuffle in a v2i64
+ // where the upper half is 0, it is advantageous to rewrite it as a build
+ // vector of (0, val) so it can use movq.
+ if (VT == MVT::v2i64) {
+ SDValue In[2];
+ In[0] = N->getOperand(0);
+ In[1] = N->getOperand(1);
+ int Idx0 = SVN->getMaskElt(0);
+ int Idx1 = SVN->getMaskElt(1);
+ // FIXME: can we take advantage of undef index?
+ if (Idx0 >= 0 && Idx1 >= 0 &&
+ In[Idx0/2].getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ In[Idx1/2].getOpcode() == ISD::BUILD_VECTOR) {
+ ConstantSDNode* InsertVecIdx =
+ dyn_cast<ConstantSDNode>(In[Idx0/2].getOperand(2));
+ if (InsertVecIdx &&
+ InsertVecIdx->getZExtValue() == (unsigned)(Idx0 % 2) &&
+ isZeroNode(In[Idx1/2].getOperand(Idx1 % 2))) {
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+ In[Idx0/2].getOperand(1),
+ In[Idx1/2].getOperand(Idx1 % 2));
+ }
+ }
+ }
+
+ // Try to combine a vector_shuffle into a 128-bit load.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ SDNode *Base = NULL;
+ if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, Base, DAG, MFI, TLI))
+ return SDValue();
+
+ LoadSDNode *LD = cast<LoadSDNode>(Base);
+ if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI))
+ return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getSrcValue(), LD->getSrcValueOffset(),
+ LD->isVolatile());
+ return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getSrcValue(), LD->getSrcValueOffset(),
+ LD->isVolatile(), LD->getAlignment());
+}
+
+/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
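+/// For example, (build_vector (load x), 0) of type v2i64 becomes an
+/// X86ISD::VZEXT_LOAD that the selector can match to movq.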
+static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget,
+ const TargetLowering &TLI) {
+ unsigned NumOps = N->getNumOperands();
+ DebugLoc dl = N->getDebugLoc();
+
+ // Ignore single operand BUILD_VECTOR.
+ if (NumOps == 1)
+ return SDValue();
+
+ MVT VT = N->getValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
+ // We are looking for load i64 and zero extend. We want to transform
+    // it before the legalizer has a chance to expand it. Also look for i64
+ // BUILD_PAIR bit casted to f64.
+ return SDValue();
+ // This must be an insertion into a zero vector.
+ SDValue HighElt = N->getOperand(1);
+ if (!isZeroNode(HighElt))
+ return SDValue();
+
+ // Value must be a load.
+ SDNode *Base = N->getOperand(0).getNode();
+ if (!isa<LoadSDNode>(Base)) {
+ if (Base->getOpcode() != ISD::BIT_CONVERT)
+ return SDValue();
+ Base = Base->getOperand(0).getNode();
+ if (!isa<LoadSDNode>(Base))
+ return SDValue();
+ }
+
+ // Transform it into VZEXT_LOAD addr.
+ LoadSDNode *LD = cast<LoadSDNode>(Base);
+
+ // Load must not be an extload.
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+
+  // The load type should be a legal type so we don't have to legalize it.
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
+ SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
+ TargetLowering::TargetLoweringOpt TLO(DAG);
+ TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
+ DCI.CommitTargetLoweringOpt(TLO);
+ return ResNode;
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Cond = N->getOperand(0);
+ // Get the LHS/RHS of the select.
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
+ // If we have SSE[12] support, try to form min/max nodes.
+ if (Subtarget->hasSSE2() &&
+ (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
+ Cond.getOpcode() == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ unsigned Opcode = 0;
+ if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOLE: // (X <= Y) ? X : Y -> min
+ case ISD::SETULE:
+ case ISD::SETLE:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min
+ case ISD::SETLT:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOGT: // (X > Y) ? X : Y -> max
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max
+ case ISD::SETGE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOGT: // (X > Y) ? Y : X -> min
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOLE: // (X <= Y) ? Y : X -> max
+ case ISD::SETULE:
+ case ISD::SETLE:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max
+ case ISD::SETLT:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ }
+
+ if (Opcode)
+ return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+ }
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+ // Don't do this for crazy integer types.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+        // If this is efficiently invertible, canonicalize the TrueC/FalseC
+        // values so that TrueC (the true value) is larger than FalseC.
+ bool NeedsCondInvert = false;
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+ // Efficiently invertible.
+ (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+ (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
+ isa<ConstantSDNode>(Cond.getOperand(1))))) {
+ NeedsCondInvert = true;
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+ if (FalseC->getAPIntValue() == 0 &&
+ TrueC->getAPIntValue().isPowerOf2()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ }
+
+        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ DebugLoc DL = N->getDebugLoc();
+
+ // If the flag operand isn't dead, don't touch this CMOV.
+ if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+ return SDValue();
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations. Note that the operands are ordered the opposite of SELECT
+ // operands.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+ // larger than FalseC (the false value).
+ X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+ // This is efficient for any integer data type (including i8/i16) and
+ // shift amount.
+ if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
+ // for any integer data type, including i8/i16.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+
+/// PerformMulCombine - Optimize a single multiply with constant into two
+/// in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
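+/// For example, x*45 can be emitted as (x*9)*5 (LEA + LEA), and x*40 as
+/// (x*5)<<3 (LEA + SHL).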
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DAG.getMachineFunction().
+ getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ return SDValue();
+
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ MVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ uint64_t MulAmt = C->getZExtValue();
+ if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return SDValue();
+
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((MulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = MulAmt / 9;
+ } else if ((MulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = MulAmt / 5;
+ } else if ((MulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = MulAmt / 3;
+ }
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+ DebugLoc DL = N->getDebugLoc();
+
+ if (isPowerOf2_64(MulAmt2) &&
+ !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+      // If the second multiplier is pow2, issue it first. We want the multiply by
+ // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+ // is an add.
+ std::swap(MulAmt1, MulAmt2);
+
+ SDValue NewMul;
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, VT));
+
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, VT));
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, NewMul, false);
+ }
+ return SDValue();
+}
+
+
+/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
+/// when possible.
+static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // On X86 with SSE2 support, we can transform this to a vector shift if
+ // all elements are shifted by the same amount. We can't do this in legalize
+  // because a constant vector is typically transformed to a constant pool
+ // so we have no knowledge of the shift amount.
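+  // For example, (shl v4i32 %x, <5, 5, 5, 5>) can be selected as a single
+  // pslld by 5.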
+ if (!Subtarget->hasSSE2())
+ return SDValue();
+
+ MVT VT = N->getValueType(0);
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ return SDValue();
+
+ SDValue ShAmtOp = N->getOperand(1);
+ MVT EltVT = VT.getVectorElementType();
+ DebugLoc DL = N->getDebugLoc();
+ SDValue BaseShAmt;
+ if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned i = 0;
+ for (; i != NumElts; ++i) {
+ SDValue Arg = ShAmtOp.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ BaseShAmt = Arg;
+ break;
+ }
+ for (; i != NumElts; ++i) {
+ SDValue Arg = ShAmtOp.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ if (Arg != BaseShAmt) {
+ return SDValue();
+ }
+ }
+ } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
+ DAG.getIntPtrConstant(0));
+ } else
+ return SDValue();
+
+ if (EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
+ else if (EltVT.bitsLT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);
+
+ // The shift amount is identical so we can do a vector shift.
+ SDValue ValOp = N->getOperand(0);
+ switch (N->getOpcode()) {
+ default:
+ assert(0 && "Unknown shift opcode!");
+ break;
+ case ISD::SHL:
+ if (VT == MVT::v2i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ case ISD::SRA:
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ case ISD::SRL:
+ if (VT == MVT::v2i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ }
+ return SDValue();
+}
+
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
+ // the FP state in cases where an emms may be missing.
+ // A preferable solution to the general problem is to figure out the right
+ // places to insert EMMS. This qualifies as a quick hack.
+
+ // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
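+  // A 64-bit copy then becomes a single movq on x86-64, an SSE2 f64
+  // load/store pair on x86-32, or two 32-bit load/store pairs as a fallback.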
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ MVT VT = St->getValue().getValueType();
+ if (VT.getSizeInBits() != 64)
+ return SDValue();
+
+ bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2();
+ if ((VT.isVector() ||
+ (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+ isa<LoadSDNode>(St->getValue()) &&
+ !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+ St->getChain().hasOneUse() && !St->isVolatile()) {
+ SDNode* LdVal = St->getValue().getNode();
+ LoadSDNode *Ld = 0;
+ int TokenFactorIndex = -1;
+ SmallVector<SDValue, 8> Ops;
+ SDNode* ChainVal = St->getChain().getNode();
+ // Must be a store of a load. We currently handle two cases: the load
+ // is a direct child, and it's under an intervening TokenFactor. It is
+ // possible to dig deeper under nested TokenFactors.
+ if (ChainVal == LdVal)
+ Ld = cast<LoadSDNode>(St->getChain());
+ else if (St->getValue().hasOneUse() &&
+ ChainVal->getOpcode() == ISD::TokenFactor) {
+ for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
+ if (ChainVal->getOperand(i).getNode() == LdVal) {
+ TokenFactorIndex = i;
+ Ld = cast<LoadSDNode>(St->getValue());
+ } else
+ Ops.push_back(ChainVal->getOperand(i));
+ }
+ }
+
+ if (!Ld || !ISD::isNormalLoad(Ld))
+ return SDValue();
+
+ // If this is not the MMX case, i.e. we are just turning i64 load/store
+ // into f64 load/store, avoid the transformation if there are multiple
+ // uses of the loaded value.
+ if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ return SDValue();
+
+ DebugLoc LdDL = Ld->getDebugLoc();
+ DebugLoc StDL = N->getDebugLoc();
+ // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+ // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+ // pair instead.
+ if (Subtarget->is64Bit() || F64IsLegal) {
+ MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getSrcValue(),
+ Ld->getSrcValueOffset(), Ld->isVolatile(),
+ Ld->getAlignment());
+ SDValue NewChain = NewLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(NewChain);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+ return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+ St->getSrcValue(), St->getSrcValueOffset(),
+ St->isVolatile(), St->getAlignment());
+ }
+
+ // Otherwise, lower to two pairs of 32-bit loads / stores.
+ SDValue LoAddr = Ld->getBasePtr();
+ SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+ Ld->getSrcValue(), Ld->getSrcValueOffset(),
+ Ld->isVolatile(), Ld->getAlignment());
+ SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+ Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
+ Ld->isVolatile(),
+ MinAlign(Ld->getAlignment(), 4));
+
+ SDValue NewChain = LoLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(LoLd);
+ Ops.push_back(HiLd);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+
+ LoAddr = St->getBasePtr();
+ HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+ St->getSrcValue(), St->getSrcValueOffset(),
+ St->isVolatile(), St->getAlignment());
+ SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+ St->getSrcValue(),
+ St->getSrcValueOffset() + 4,
+ St->isVolatile(),
+ MinAlign(St->getAlignment(), 4));
+ return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+ }
+ return SDValue();
+}
+
+/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
+/// X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+ // F[X]OR(0.0, x) -> x
+ // F[X]OR(x, 0.0) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+ return SDValue();
+}
+
+/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+ // FAND(0.0, x) -> 0.0
+ // FAND(x, 0.0) -> 0.0
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ return SDValue();
+}
+
+static SDValue PerformBTCombine(SDNode *N,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // BT ignores high bits in the bit index operand.
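+  // For an i32 index only the low Log2_32(32) = 5 bits are demanded, so a
+  // constant index such as 37 can be shrunk to 37 & 31 = 5.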
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.hasOneUse()) {
+ unsigned BitWidth = Op1.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG);
+ TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ return SDValue();
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
+ case ISD::BUILD_VECTOR:
+ return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
+ case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
+ case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
+ case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case X86ISD::FXOR:
+ case X86ISD::FOR: return PerformFORCombine(N, DAG);
+ case X86ISD::FAND: return PerformFANDCombine(N, DAG);
+ case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
+ }
+
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'A':
+ return C_Register;
+ case 'f':
+ case 'r':
+ case 'R':
+ case 'l':
+ case 'q':
+ case 'Q':
+ case 'x':
+ case 'y':
+ case 'Y':
+ return C_RegisterClass;
+ case 'e':
+ case 'Z':
+ return C_Other;
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(MVT ConstraintVT) const {
+ // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+ // 'f' like normal targets.
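+  // For example, an f64 operand under 'X' is constrained to 'Y' (SSE2
+  // registers) when SSE2 is available.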
+ if (ConstraintVT.isFloatingPoint()) {
+ if (Subtarget->hasSSE2())
+ return "Y";
+ if (Subtarget->hasSSE1())
+ return "x";
+ }
+
+ return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ char Constraint,
+ bool hasMemory,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0, 0);
+
+ switch (Constraint) {
+ default: break;
+ case 'I':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 63) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 255) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'e': {
+ // 32-bit signed value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ const ConstantInt *CI = C->getConstantIntValue();
+ if (CI->isValueValidForType(Type::Int32Ty, C->getSExtValue())) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
+ break;
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ }
+ return;
+ }
+ case 'Z': {
+ // 32-bit unsigned value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ const ConstantInt *CI = C->getConstantIntValue();
+ if (CI->isValueValidForType(Type::Int32Ty, C->getZExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ return;
+ }
+ case 'i': {
+ // Literal immediates are always ok.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
+ break;
+ }
+
+ // If we are in non-pic codegen mode, we allow the address of a global (with
+ // an optional displacement) to be used with 'i'.
+ GlobalAddressSDNode *GA = 0;
+ int64_t Offset = 0;
+
+ // Match either (GA), (GA+C), (GA+C1+C2), etc.
+ while (1) {
+ if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+ Offset += GA->getOffset();
+ break;
+ } else if (Op.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ } else if (Op.getOpcode() == ISD::SUB) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += -C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ }
+
+ // Otherwise, this isn't something we can handle, reject it.
+ return;
+ }
+
+ if (hasMemory)
+ Op = LowerGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), Offset, DAG);
+ else
+ Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+ Offset);
+ Result = Op;
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
+ Ops, DAG);
+}
+
+std::vector<unsigned> X86TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ // FIXME: not handling fp-stack yet!
+ switch (Constraint[0]) { // GCC X86 Constraint Letters
+ default: break; // Unknown constraint letter
+ case 'q': // Q_REGS (GENERAL_REGS in 64-bit mode)
+ case 'Q': // Q_REGS
+ if (VT == MVT::i32)
+ return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
+ else if (VT == MVT::i16)
+ return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
+ else if (VT == MVT::i8)
+ return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
+ else if (VT == MVT::i64)
+ return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
+ break;
+ }
+ }
+
+ return std::vector<unsigned>();
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ // First, see if this is a constraint that directly corresponds to an LLVM
+ // register class.
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': // GENERAL_REGS
+ case 'R': // LEGACY_REGS
+ case 'l': // INDEX_REGS
+ if (VT == MVT::i8)
+ return std::make_pair(0U, X86::GR8RegisterClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, X86::GR16RegisterClass);
+ if (VT == MVT::i32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, X86::GR32RegisterClass);
+ return std::make_pair(0U, X86::GR64RegisterClass);
+ case 'f': // FP Stack registers.
+ // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+ // value to the correct fpstack register class.
+ if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, X86::RFP32RegisterClass);
+ if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, X86::RFP64RegisterClass);
+ return std::make_pair(0U, X86::RFP80RegisterClass);
+ case 'y': // MMX_REGS if MMX allowed.
+ if (!Subtarget->hasMMX()) break;
+ return std::make_pair(0U, X86::VR64RegisterClass);
+ case 'Y': // SSE_REGS if SSE2 allowed
+ if (!Subtarget->hasSSE2()) break;
+ // FALL THROUGH.
+ case 'x': // SSE_REGS if SSE1 allowed
+ if (!Subtarget->hasSSE1()) break;
+
+ switch (VT.getSimpleVT()) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(0U, X86::FR32RegisterClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(0U, X86::FR64RegisterClass);
+ // Vector types.
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(0U, X86::VR128RegisterClass);
+ }
+ break;
+ }
+ }
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<unsigned, const TargetRegisterClass*> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+ // Not found as a standard register?
+ if (Res.second == 0) {
+ // GCC calls "st(0)" just plain "st".
+ if (StringsEqualNoCase("{st}", Constraint)) {
+ Res.first = X86::ST0;
+ Res.second = X86::RFP80RegisterClass;
+ }
+ // 'A' means EAX + EDX.
+ if (Constraint == "A") {
+ Res.first = X86::EAX;
+ Res.second = X86::GRADRegisterClass;
+ }
+ return Res;
+ }
+
+ // Otherwise, check to see if this is a register class of the wrong value
+ // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+ // turn into {ax},{dx}.
+ if (Res.second->hasType(VT))
+ return Res; // Correct type already, nothing to do.
+
+ // All of the single-register GCC register classes map their values onto
+ // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
+ // really want an 8-bit or 32-bit register, map to the appropriate register
+ // class and return the appropriate register.
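+  // For example, "{ax}" used with an i32 operand is remapped to EAX below.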
+ if (Res.second == X86::GR16RegisterClass) {
+ if (VT == MVT::i8) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::AL; break;
+ case X86::DX: DestReg = X86::DL; break;
+ case X86::CX: DestReg = X86::CL; break;
+ case X86::BX: DestReg = X86::BL; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR8RegisterClass;
+ }
+ } else if (VT == MVT::i32) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::EAX; break;
+ case X86::DX: DestReg = X86::EDX; break;
+ case X86::CX: DestReg = X86::ECX; break;
+ case X86::BX: DestReg = X86::EBX; break;
+ case X86::SI: DestReg = X86::ESI; break;
+ case X86::DI: DestReg = X86::EDI; break;
+ case X86::BP: DestReg = X86::EBP; break;
+ case X86::SP: DestReg = X86::ESP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR32RegisterClass;
+ }
+ } else if (VT == MVT::i64) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::RAX; break;
+ case X86::DX: DestReg = X86::RDX; break;
+ case X86::CX: DestReg = X86::RCX; break;
+ case X86::BX: DestReg = X86::RBX; break;
+ case X86::SI: DestReg = X86::RSI; break;
+ case X86::DI: DestReg = X86::RDI; break;
+ case X86::BP: DestReg = X86::RBP; break;
+ case X86::SP: DestReg = X86::RSP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR64RegisterClass;
+ }
+ }
+ } else if (Res.second == X86::FR32RegisterClass ||
+ Res.second == X86::FR64RegisterClass ||
+ Res.second == X86::VR128RegisterClass) {
+ // Handle references to XMM physical registers that got mapped into the
+ // wrong class. This can happen with constraints like {xmm0} where the
+ // target independent register mapper will just pick the first match it can
+ // find, ignoring the required type.
+ if (VT == MVT::f32)
+ Res.second = X86::FR32RegisterClass;
+ else if (VT == MVT::f64)
+ Res.second = X86::FR64RegisterClass;
+ else if (X86::VR128RegisterClass->hasType(VT))
+ Res.second = X86::VR128RegisterClass;
+ }
+
+ return Res;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Widen vector type
+//===----------------------------------------------------------------------===//
+
+/// getWidenVectorType: given a vector type, returns the type to widen
+/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
+/// If there is no vector type that we want to widen to, returns MVT::Other.
+/// When and where to widen is target dependent based on the cost of
+/// scalarizing vs using the wider vector type.
+
+MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
+ assert(VT.isVector());
+ if (isTypeLegal(VT))
+ return VT;
+
+  // TODO: In computeRegisterProperties, we can compute the list of legal vector
+ // type based on element type. This would speed up our search (though
+ // it may not be worth it since the size of the list is relatively
+ // small).
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NElts = VT.getVectorNumElements();
+
+  // On X86, it makes sense to widen any vector with more than one element.
+ if (NElts <= 1)
+ return MVT::Other;
+
+ for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
+ nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType)nVT;
+
+ if (isTypeLegal(SVT) &&
+ SVT.getVectorElementType() == EltVT &&
+ SVT.getVectorNumElements() > NElts)
+ return SVT;
+ }
+ return MVT::Other;
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..550f8bd
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,705 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86Subtarget.h"
+#include "X86RegisterInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+ namespace X86ISD {
+ // X86 Specific DAG Nodes
+ enum NodeType {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// BSF - Bit scan forward.
+ /// BSR - Bit scan reverse.
+ BSF,
+ BSR,
+
+ /// SHLD, SHRD - Double shift instructions. These correspond to
+ /// X86::SHLDxx and X86::SHRDxx instructions.
+ SHLD,
+ SHRD,
+
+ /// FAND - Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// FOR - Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+    /// FSRL - Bitwise logical right shift of floating point values. This
+ /// corresponds to X86::PSRLDQ.
+ FSRL,
+
+ /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has three inputs (token chain, address,
+ /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+    /// also produces a flag.
+ FILD,
+ FILD_FLAG,
+
+ /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain).
+ FP_TO_INT16_IN_MEM,
+ FP_TO_INT32_IN_MEM,
+ FP_TO_INT64_IN_MEM,
+
+ /// FLD - This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, ptr to load from, and a ValueType node indicating the type
+ /// to load to.
+ FLD,
+
+ /// FST - This instruction implements a truncating store to FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and a ValueType to store it
+ /// as.
+ FST,
+
+ /// CALL/TAILCALL - These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these node are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ /// The CALL vs TAILCALL distinction boils down to whether the callee is
+ /// known not to modify the caller's stack frame, as is standard with
+ /// LLVM.
+ CALL,
+ TAILCALL,
+
+ /// RDTSC_DAG - This operation implements the lowering for
+ /// readcyclecounter
+ RDTSC_DAG,
+
+ /// X86 compare and logical compare instructions.
+ CMP, COMI, UCOMI,
+
+ /// X86 bit-test instructions.
+ BT,
+
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the flag
+ /// operand produced by a CMP instruction.
+ SETCC,
+
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction. It also writes a
+ /// flag result.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// Wrapper - A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ Wrapper,
+
+ /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// INSERTPS - Insert any element of a 4 x float vector into any element
+    /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW,
+
+ /// PSHUFB - Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// FMAX, FMIN - Floating point max and min.
+ ///
+ FMAX, FMIN,
+
+ /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+ /// approximation. Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT, FRCP,
+
+ // TLSADDR - Thread Local Storage.
+ TLSADDR,
+
+ // SegmentBaseAddress - The address segment:0
+ SegmentBaseAddress,
+
+ // EH_RETURN - Exception Handling helpers.
+ EH_RETURN,
+
+ /// TC_RETURN - Tail call return.
+ /// operand #0 chain
+ /// operand #1 callee (register or absolute)
+ /// operand #2 stack adjustment
+ /// operand #3 optional in flag
+ TC_RETURN,
+
+ // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
+ LCMPXCHG_DAG,
+ LCMPXCHG8_DAG,
+
+ // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
+ // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
+ // Atomic 64-bit binary operations.
+ ATOMADD64_DAG,
+ ATOMSUB64_DAG,
+ ATOMOR64_DAG,
+ ATOMXOR64_DAG,
+ ATOMAND64_DAG,
+ ATOMNAND64_DAG,
+ ATOMSWAP64_DAG,
+
+    // FNSTCW16m - Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ // VZEXT_MOVL - Vector move low and zero extend.
+ VZEXT_MOVL,
+
+ // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // VSHL, VSRL - Vector logical left / right shift.
+ VSHL, VSRL,
+
+    // CMPPD, CMPPS - Vector double/float comparison.
+ CMPPD, CMPPS,
+
+ // PCMP* - Vector integer comparisons.
+ PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ,
+ PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ,
+
+ // ADD, SUB, SMUL, UMUL, etc. - Arithmetic operations with FLAGS results.
+ ADD, SUB, SMUL, UMUL,
+ INC, DEC,
+
+ // MUL_IMM - X86 specific multiply by immediate.
+ MUL_IMM
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+ /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to PSHUFD.
+ bool isPSHUFDMask(ShuffleVectorSDNode *N);
+
+ /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for input to PSHUFHW.
+ bool isPSHUFHWMask(ShuffleVectorSDNode *N);
+
+ /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+ bool isPSHUFLWMask(ShuffleVectorSDNode *N);
+
+ /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to SHUFP*.
+ bool isSHUFPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+ bool isMOVHLPSMask(ShuffleVectorSDNode *N);
+
+ /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+ /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+ /// <2, 3, 2, 3>
+ bool isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for MOVLP{S|D}.
+ bool isMOVLPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for MOVHP{S|D}.
+ /// as well as MOVLHPS.
+ bool isMOVHPMask(ShuffleVectorSDNode *N);
+
+ /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to UNPCKL.
+ bool isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat = false);
+
+ /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to UNPCKH.
+ bool isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat = false);
+
+ /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+ /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+ /// <0, 0, 1, 1>
+ bool isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+ /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+ /// <2, 2, 3, 3>
+ bool isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSS,
+ /// MOVSD, and MOVD, i.e. setting the lowest element.
+ bool isMOVLMask(ShuffleVectorSDNode *N);
+
+ /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+ bool isMOVSHDUPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+ bool isMOVSLDUPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+ bool isMOVDDUPMask(ShuffleVectorSDNode *N);
+
+ /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+ /// instructions.
+ unsigned getShuffleSHUFImmediate(SDNode *N);
+
+ /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+ /// instructions.
+ unsigned getShufflePSHUFHWImmediate(SDNode *N);
+
+    /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+ /// instructions.
+ unsigned getShufflePSHUFLWImmediate(SDNode *N);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // X86TargetLowering - X86 Implementation of the TargetLowering interface
+ class X86TargetLowering : public TargetLowering {
+ int VarArgsFrameIndex; // FrameIndex for start of varargs area.
+ int RegSaveFrameIndex; // X86-64 vararg func register save area.
+ unsigned VarArgsGPOffset; // X86-64 vararg func int reg offset.
+ unsigned VarArgsFPOffset; // X86-64 vararg func fp reg offset.
+ int BytesToPopOnReturn; // Number of arg bytes ret should pop.
+ int BytesCallerReserves; // Number of arg bytes caller makes.
+
+ public:
+ explicit X86TargetLowering(X86TargetMachine &TM);
+
+    /// getPICJumpTableRelocBase - Returns relocation base for the given PIC
+ /// jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const;
+
+ // Return the number of bytes that a function should pop when it returns (in
+ // addition to the space used by the return address).
+ //
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+
+ // Return the number of bytes that the caller reserves for arguments passed
+ // to this function.
+ unsigned getBytesCallerReserves() const { return BytesCallerReserves; }
+
+ /// getStackPtrReg - Return the stack pointer register we are using: either
+ /// ESP or RSP.
+ unsigned getStackPtrReg() const { return X86StackPtr; }
+
+ /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. For X86, aggregates
+ /// that contain SSE vectors are placed at 16-byte boundaries while the rest
+ /// are at 4-byte boundaries.
+ virtual unsigned getByValTypeAlignment(const Type *Ty) const;
+
+ /// getOptimalMemOpType - Returns the target specific optimal type for load
+ /// and store operations as a result of memset, memcpy, and memmove
+ /// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
+ /// determining it.
+ virtual
+ MVT getOptimalMemOpType(uint64_t Size, unsigned Align,
+ bool isSrcConst, bool isSrcStr) const;
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ /// ReplaceNodeResults - Replace the results of a node whose result type is
+ /// illegal with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG);
+
+
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ virtual MVT getSetCCResultType(MVT VT) const;
+
+ /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+ /// in Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ virtual bool
+ isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) const;
+
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ virtual const char *LowerXConstraint(MVT ConstraintVT) const;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
+ /// true it means one of the asm constraints of the inline asm instruction
+ /// being processed is 'm'.
+ virtual void LowerAsmOperandForConstraint(SDValue Op,
+ char ConstraintLetter,
+ bool hasMemory,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ /// getRegForInlineAsmConstraint - Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+ /// isTruncateFree - Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. on x86 it's free to truncate an i32 value in
+ /// register EAX to i16 by referencing its sub-register AX.
+ virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isTruncateFree(MVT VT1, MVT VT2) const;
+
+ /// isZExtFree - Return true if any actual instruction that defines a
+ /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
+ /// register. This does not necessarily include registers defined in
+ /// unknown ways, such as incoming arguments, or copies from unknown
+ /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+ /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+ /// all instructions that define 32-bit values implicitly zero-extend the
+ /// result out to 64 bits.
+ virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isZExtFree(MVT VT1, MVT VT2) const;
+
+ /// isNarrowingProfitable - Return true if it's profitable to narrow
+ /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+ /// from i32 to i8 but not from i32 to i16.
+ virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const;
+
+ /// isShuffleMaskLegal - Targets can use this to indicate that they only
+ /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+ /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+ /// values are assumed to be legal.
+ virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+ MVT VT) const;
+
+ /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can use
+ /// this to indicate whether there is a suitable VECTOR_SHUFFLE that can be
+ /// used to replace a VAND with a constant pool entry.
+ virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ MVT VT) const;
+
+ /// ShouldShrinkFPConstant - If true, then instruction selection should
+ /// seek to shrink the FP constant of the specified type to a smaller type
+ /// in order to save space and / or reduce runtime.
+ virtual bool ShouldShrinkFPConstant(MVT VT) const {
+ // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+ // expensive than a straight movsd. On the other hand, it's important to
+ // shrink long double fp constant since fldt is very slow.
+ return !X86ScalarSSEf64 || VT == MVT::f80;
+ }
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Targets that want to do tail call
+ /// optimization should implement this function.
+ virtual bool IsEligibleForTailCallOptimization(CallSDNode *TheCall,
+ SDValue Ret,
+ SelectionDAG &DAG) const;
+
+ virtual const X86Subtarget* getSubtarget() {
+ return Subtarget;
+ }
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(MVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 when SSE2 is available
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 when SSE1 is available
+ }
+
+ /// getWidenVectorType - Given a vector type, return the type to widen
+ /// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
+ /// If there is no vector type that we want to widen to, it returns
+ /// MVT::Other. When and where to widen is target dependent, based on the
+ /// cost of scalarizing vs. using the wider vector type.
+ virtual MVT getWidenVectorType(MVT VT) const;
+
+ /// createFastISel - This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ virtual FastISel *
+ createFastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi, DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &,
+ DenseMap<const AllocaInst *, int> &
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &
+#endif
+ );
+
+ private:
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+ const X86RegisterInfo *RegInfo;
+ const TargetData *TD;
+
+ /// X86StackPtr - X86 physical register used as stack ptr.
+ unsigned X86StackPtr;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf32;
+ bool X86ScalarSSEf64;
+
+ SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG);
+
+ SDValue LowerMemArgument(SDValue Op, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo *MFI,
+ unsigned CC, SDValue Root, unsigned i);
+
+ SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
+ const SDValue &StackPtr,
+ const CCValAssign &VA, SDValue Chain,
+ SDValue Arg, ISD::ArgFlagsTy Flags);
+
+ // Call lowering helpers.
+ bool IsCalleePop(bool isVarArg, unsigned CallingConv);
+ bool CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall);
+ bool CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall);
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall, bool Is64Bit,
+ int FPDiff, DebugLoc dl);
+
+ CCAssignFn *CCAssignFnForNode(unsigned CallingConv) const;
+ NameDecorationStyle NameDecorationForFORMAL_ARGUMENTS(SDValue Op);
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG);
+
+ std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool isSigned);
+
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ int64_t Offset, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerShift(SDValue Op, SelectionDAG &DAG);
+ SDValue BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain, SDValue StackSlot,
+ SelectionDAG &DAG);
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFABS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG);
+
+ SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG);
+
+ void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG, unsigned NewOp);
+
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ const Value *DstSV, uint64_t DstSVOff);
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff);
+
+ /// Utility function to emit atomic bitwise operations (and, or, xor).
+ /// It takes the bitwise instruction to expand, the associated machine basic
+ /// block, and the associated X86 opcodes for reg/reg and reg/imm.
+ MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned regOpc,
+ unsigned immOpc,
+ unsigned loadOpc,
+ unsigned cxchgOpc,
+ unsigned copyOpc,
+ unsigned notOpc,
+ unsigned EAXreg,
+ TargetRegisterClass *RC,
+ bool invSrc = false) const;
+
+ MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned regOpcL,
+ unsigned regOpcH,
+ unsigned immOpcL,
+ unsigned immOpcH,
+ bool invSrc = false) const;
+
+ /// Utility function to emit atomic min and max. It takes the min/max
+ /// instruction to expand, the associated basic block, and the associated
+ /// cmov opcode for moving the min or max value.
+ MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned cmovOpc) const;
+
+ /// Emit nodes that will be selected as "test Op0,Op0", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG);
+
+ /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG);
+ };
+
+ namespace X86 {
+ FastISel *createFastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi, DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &,
+ DenseMap<const AllocaInst *, int> &
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &
+#endif
+ );
+ }
+}
+
+#endif // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
new file mode 100644
index 0000000..dc15e4a
--- /dev/null
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -0,0 +1,1937 @@
+//====- X86Instr64bit.td - Describe X86-64 Instructions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86-64 instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// 64 bits, but only 32 bits are significant.
+def i64i32imm : Operand<i64>;
+// 64 bits, but only 8 bits are significant.
+def i64i8imm : Operand<i64>;
+
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printlea64mem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64, i32imm);
+}
+
+def lea64_32mem : Operand<i32> {
+ let PrintMethod = "printlea64_32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Complex Pattern Definitions.
+//
+def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr",
+ [add, mul, X86mul_imm, shl, or, frameindex, X86Wrapper],
+ []>;
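+
+// (The four operands produced by SelectLEAAddr line up with lea64mem above:
+// base register, scale, index register, displacement.)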
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+def i64immSExt8 : PatLeaf<(i64 imm), [{
+ // i64immSExt8 predicate - True if the 64-bit immediate fits in an 8-bit
+ // sign-extended field.
+ return (int64_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
+
+def i64immSExt32 : PatLeaf<(i64 imm), [{
+ // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // sign extended field.
+ return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue();
+}]>;
+
+def i64immZExt32 : PatLeaf<(i64 imm), [{
+ // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // zero-extended (unsigned) field.
+ return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue();
+}]>;
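+
+// Illustrative sketch (not referenced by any pattern in this file): the
+// same cast-and-compare idiom works for any narrower field. For a 16-bit
+// sign-extended field, -32768 (0xFFFFFFFFFFFF8000) round-trips through
+// int16_t and passes, while 32768 (0x8000) does not.
+def i64immSExt16 : PatLeaf<(i64 imm), [{
+  // Hypothetical example predicate - True if the 64-bit immediate fits in
+  // a 16-bit sign-extended field.
+  return (int64_t)N->getZExtValue() == (int16_t)N->getZExtValue();
+}]>;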
+
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
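+
+// These fragments let the extending-load instructions below (e.g.
+// MOVSX64rm8, MOVZX64rm16) fold the load and the extension into a single
+// pattern on an i64 result.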
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(X86callseq_start timm:$amt)]>,
+ Requires<[In64BitMode]>;
+def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[In64BitMode]>;
+}
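+
+// A sketch of the eventual expansion (illustrative, not emitted verbatim):
+// a call needing 32 bytes of outgoing argument space becomes roughly
+//   #ADJCALLSTACKDOWN 32    ->  subq $32, %rsp
+//   ... the call itself ...
+//   #ADJCALLSTACKUP 32, 0   ->  addq $32, %rsp
+// which is why RSP and EFLAGS are modeled as defs above.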
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. RSP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in {
+
+ // NOTE: this pattern doesn't match "X86call imm", because we do not know
+ // that the offset between an arbitrary immediate and the call will fit in
+ // the 32-bit pcrel field that we have.
+ def CALL64pcrel32 : I<0xE8, RawFrm,
+ (outs), (ins i64i32imm:$dst, variable_ops),
+ "call\t${dst:call}", []>,
+ Requires<[In64BitMode]>;
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call GR64:$dst)]>;
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call (loadi64 addr:$dst))]>;
+ }
+
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNdi64 : I<0, Pseudo, (outs), (ins i64imm:$dst, i32imm:$offset,
+ variable_ops),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNri64 : I<0, Pseudo, (outs), (ins GR64:$dst, i32imm:$offset,
+ variable_ops),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins GR64:$dst),
+ "jmp{q}\t{*}$dst # TAILCALL",
+ []>;
+
+// Branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+ [(brind GR64:$dst)]>;
+ def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+ [(brind (loadi64 addr:$dst))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1 in {
+def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR64:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+def LEAVE64 : I<0xC9, RawFrm,
+ (outs), (ins), "leave", []>;
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let mayLoad = 1 in
+def POP64r : I<0x58, AddRegFrm,
+ (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
+let mayStore = 1 in
+def PUSH64r : I<0x50, AddRegFrm,
+ (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1 in
+def POPFQ : I<0x9D, RawFrm, (outs), (ins), "popf", []>, REX_W;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1 in
+def PUSHFQ : I<0x9C, RawFrm, (outs), (ins), "pushf", []>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea64_32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+ "lea{q}\t{$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)]>;
+
+let isTwoAddress = 1 in
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "bswap{q}\t$dst",
+ [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (X86bsf GR64:$src)), (implicit EFLAGS)]>, TB;
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (X86bsf (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (X86bsr GR64:$src)), (implicit EFLAGS)]>, TB;
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (X86bsr (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+// Repeat string ops
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI] in
+def REP_MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+ [(X86rep_movs i64)]>, REP;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI] in
+def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+ [(X86rep_stos i64)]>, REP;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+
+let neverHasSideEffects = 1 in
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "movabs{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, imm:$src)]>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)]>;
+}
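+
+// (Informal encoding note: MOV64ri is the 10-byte movabsq with a full
+// 64-bit immediate, while MOV64ri32 covers immediates that sign-extend
+// from 32 bits in a considerably shorter encoding.)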
+
+let canFoldAsLoad = 1 in
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))]>;
+
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)]>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32:$src, addr:$dst)]>;
+
+// Sign/Zero extenders
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR8:$src))]>, TB;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR16:$src))]>, TB;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))]>;
+
+// Use movzbl instead of movzbq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zext GR8:$src))]>, TB;
+def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+// Use movzwl instead of movzwq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zext GR16:$src))]>, TB;
+def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
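+
+// (Encoding note, informal: the movz{b,w}q forms would need a REX.W
+// prefix that movz{b,w}l avoids, so the 32-bit forms are a byte shorter
+// and implicit zero-extension makes the results identical.)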
+
+// There's no movzlq instruction, but movl can be used for this purpose, using
+// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
+// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
+// zero-extension, however this isn't possible when the 32-bit value is
+// defined by a truncate or is copied from something where the high bits aren't
+// necessarily all zero. In such cases, we fall back to these explicit zext
+// instructions.
+def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zext GR32:$src))]>;
+def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
+
+// Any instruction that defines a 32-bit result zeroes the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may
+// be copying from a truncate, but any other 32-bit operation will zero-extend
+// up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetInstrInfo::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, x86_subreg_32bit)>;
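+
+// Worked example (informal): for IR such as
+//   %a = add i32 %x, %y
+//   %z = zext i32 %a to i64
+// the add is an ordinary 32-bit def, so def32 matches and the zext becomes
+// a free SUBREG_TO_REG; no movl is emitted. Had %a come from a truncate,
+// def32 would reject it and MOVZX64rr32 would emit an explicit movl.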
+
+let neverHasSideEffects = 1 in {
+ let Defs = [RAX], Uses = [EAX] in
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", []>; // RAX = signext(EAX)
+
+ let Defs = [RAX,RDX], Uses = [RAX] in
+ def CQO : RI<0x99, RawFrm, (outs), (ins),
+ "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+}
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions...
+//
+
+let Defs = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isConvertibleToThreeAddress = 1 in {
+let isCommutable = 1 in
+// Register-Register Addition
+def ADD64rr : RI<0x01, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+
+// Register-Integer Addition
+def ADD64ri8 : RIi8<0x83, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def ADD64ri32 : RIi32<0x81, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isConvertibleToThreeAddress
+
+// Register-Memory Addition
+def ADD64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+// Memory-Register Addition
+def ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR64:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+def ADD64mi8 : RIi8<0x83, MRM0m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i64immSExt8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+def ADD64mi32 : RIi32<0x81, MRM0m, (outs), (ins i64mem:$dst, i64i32imm :$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i64immSExt32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let Uses = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def ADC64rr : RI<0x11, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, GR64:$src2))]>;
+
+def ADC64rm : RI<0x13, MRMSrcMem , (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, (load addr:$src2)))]>;
+
+def ADC64ri8 : RIi8<0x83, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, i64immSExt8:$src2))]>;
+def ADC64ri32 : RIi32<0x81, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def ADC64mr : RI<0x11, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADC64mi8 : RIi8<0x83, MRM2m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+def ADC64mi32 : RIi32<0x81, MRM2m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+} // Uses = [EFLAGS]
+
+let isTwoAddress = 1 in {
+// Register-Register Subtraction
+def SUB64rr : RI<0x29, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+
+// Register-Memory Subtraction
+def SUB64rm : RI<0x2B, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+// Register-Integer Subtraction
+def SUB64ri8 : RIi8<0x83, MRM5r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i8imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+// Memory-Register Subtraction
+def SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR64:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// Memory-Integer Subtraction
+def SUB64mi8 : RIi8<0x83, MRM5m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+def SUB64mi32 : RIi32<0x81, MRM5m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+
+let Uses = [EFLAGS] in {
+let isTwoAddress = 1 in {
+def SBB64rr : RI<0x19, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, GR64:$src2))]>;
+
+def SBB64rm : RI<0x1B, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, (load addr:$src2)))]>;
+
+def SBB64ri8 : RIi8<0x83, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, i64immSExt8:$src2))]>;
+def SBB64ri32 : RIi32<0x81, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def SBB64mr : RI<0x19, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SBB64mi8 : RIi8<0x83, MRM3m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+def SBB64mi32 : RIi32<0x81, MRM3m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+} // Uses = [EFLAGS]
+} // Defs = [EFLAGS]
+
+// Unsigned multiplication
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in {
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+ "mul{q}\t$src", []>; // RAX,RDX = RAX*GR64
+let mayLoad = 1 in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+ "mul{q}\t$src", []>; // RAX,RDX = RAX*[mem64]
+
+// Signed multiplication
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src),
+ "imul{q}\t$src", []>; // RAX,RDX = RAX*GR64
+let mayLoad = 1 in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+ "imul{q}\t$src", []>; // RAX,RDX = RAX*[mem64]
+}
+
+let Defs = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+// Register-Register Signed Integer Multiplication
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>, TB;
+
+// Register-Memory Signed Integer Multiplication
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, TB;
+} // isTwoAddress
+
+// Surprisingly enough, these are not two address instructions!
+
+// Register-Integer Signed Integer Multiplication
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+
+// Memory-Integer Signed Integer Multiplication
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
+ (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul (load addr:$src1),
+ i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul (load addr:$src1),
+ i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Unsigned division / remainder
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in {
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), // RDX:RAX/r64 = RAX,RDX
+ "div{q}\t$src", []>;
+// Signed division / remainder
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), // RDX:RAX/r64 = RAX,RDX
+ "idiv{q}\t$src", []>;
+let mayLoad = 1 in {
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), // RDX:RAX/[mem64] = RAX,RDX
+ "div{q}\t$src", []>;
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), // RDX:RAX/[mem64] = RAX,RDX
+ "idiv{q}\t$src", []>;
+}
+}
+
+// Unary instructions
+let Defs = [EFLAGS], CodeSize = 2 in {
+let isTwoAddress = 1 in
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src), "neg{q}\t$dst",
+ [(set GR64:$dst, (ineg GR64:$src)),
+ (implicit EFLAGS)]>;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+ [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src), "inc{q}\t$dst",
+ [(set GR64:$dst, (add GR64:$src, 1)),
+ (implicit EFLAGS)]>;
+def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src), "dec{q}\t$dst",
+ [(set GR64:$dst, (add GR64:$src, -1)),
+ (implicit EFLAGS)]>;
+def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// In 64-bit mode, single byte INC and DEC cannot be encoded.
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst",
+ [(set GR16:$dst, (add GR16:$src, 1)),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst",
+ [(set GR32:$dst, (add GR32:$src, 1)),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst",
+ [(set GR16:$dst, (add GR16:$src, -1)),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst",
+ [(set GR32:$dst, (add GR32:$src, -1)),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress
+
+// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
+// how to unfold them.
+let isTwoAddress = 0, CodeSize = 2 in {
+ def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+ def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+ def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+ def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+}
+} // Defs = [EFLAGS], CodeSize
+
+
+let Defs = [EFLAGS] in {
+// Shift instructions
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src),
+ "shl{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (shl GR64:$src, CL))]>;
+let isConvertibleToThreeAddress = 1 in // Can transform into LEA.
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "shl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+// NOTE: We don't use shifts of a register by one, because 'add reg,reg' is
+// cheaper.
+} // isTwoAddress
+
+let Uses = [CL] in
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "shl{q}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t$dst",
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src),
+ "shr{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (srl GR64:$src, CL))]>;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "shr{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t$dst",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "shr{q}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t$dst",
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src),
+ "sar{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (sra GR64:$src, CL))]>;
+def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "sar{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "sar{q}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t$dst",
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Rotate instructions
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src),
+ "rol{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (rotl GR64:$src, CL))]>;
+def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "rol{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t$dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "rol{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t$dst",
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src),
+ "ror{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (rotr GR64:$src, CL))]>;
+def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "ror{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t$dst",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>;
+def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "ror{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t$dst",
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Double shift instructions (generalizations of rotate)
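+// Semantics sketch: "shld %cl, $src2, $dst" computes, for a nonzero count,
+//   $dst = ($dst << %cl) | ($src2 >> (64 - %cl))
+// i.e. bits shifted out of $dst are refilled from $src2; a rotate is the
+// special case where $dst and $src2 are the same register.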
+let isTwoAddress = 1 in {
+let Uses = [CL] in {
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>, TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, TB;
+}
+
+let isCommutable = 1 in { // FIXME: Update X86InstrInfo::commuteInstruction
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+} // isCommutable
+} // isTwoAddress
+
+let Uses = [CL] in {
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
+}
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Logical Instructions...
+//
+
+let isTwoAddress = 1 , AddedComplexity = 15 in
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src), "not{q}\t$dst",
+ [(set GR64:$dst, (not GR64:$src))]>;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+ [(store (not (loadi64 addr:$dst)), addr:$dst)]>;
+
+let Defs = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def AND64rr : RI<0x21, MRMDestReg,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+def AND64rm : RI<0x23, MRMSrcMem,
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def AND64ri8 : RIi8<0x83, MRM4r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def AND64ri32 : RIi32<0x81, MRM4r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+def AND64mr : RI<0x21, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def AND64mi8 : RIi8<0x83, MRM4m,
+ (outs), (ins i64mem:$dst, i64i8imm :$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def AND64mi32 : RIi32<0x81, MRM4m,
+ (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def OR64rr : RI<0x09, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+def OR64rm : RI<0x0B, MRMSrcMem , (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def OR64ri8 : RIi8<0x83, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def OR64ri32 : RIi32<0x81, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+def OR64mr : RI<0x09, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def OR64mi8 : RIi8<0x83, MRM1m, (outs), (ins i64mem:$dst, i64i8imm:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def OR64mi32 : RIi32<0x81, MRM1m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def XOR64rr : RI<0x31, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+def XOR64rm : RI<0x33, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def XOR64ri8 : RIi8<0x83, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def XOR64ri32 : RIi32<0x81, MRM6r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+def XOR64mr : RI<0x31, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def XOR64mi8 : RIi8<0x83, MRM6m, (outs), (ins i64mem:$dst, i64i8imm :$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def XOR64mi32 : RIi32<0x81, MRM6m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Comparison Instructions...
+//
+
+// Integer comparison
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in
+def TEST64rr : RI<0x85, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, GR64:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST64rm : RI<0x85, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, (loadi64 addr:$src2)), 0),
+ (implicit EFLAGS)]>;
+def TEST64ri32 : RIi32<0xF7, MRM0r, (outs),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, i64immSExt32:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST64mi32 : RIi32<0xF7, MRM0m, (outs),
+ (ins i64mem:$src1, i64i32imm:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi64 addr:$src1), i64immSExt32:$src2), 0),
+ (implicit EFLAGS)]>;
+
+def CMP64rr : RI<0x39, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)]>;
+def CMP64mr : RI<0x39, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), GR64:$src2),
+ (implicit EFLAGS)]>;
+def CMP64rm : RI<0x3B, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)]>;
+def CMP64ri8 : RIi8<0x83, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)]>;
+def CMP64ri32 : RIi32<0x81, MRM7r, (outs), (ins GR64:$src1, i64i32imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)]>;
+def CMP64mi8 : RIi8<0x83, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), i64immSExt8:$src2),
+ (implicit EFLAGS)]>;
+def CMP64mi32 : RIi32<0x81, MRM7m, (outs),
+ (ins i64mem:$src1, i64i32imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), i64immSExt32:$src2),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Bit tests.
+// TODO: BTC, BTR, and BTS
+let Defs = [EFLAGS] in {
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)]>, TB;
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Disable these instructions for now.
+//def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+// "bt{q}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi64 addr:$src1), GR64:$src2),
+// (implicit EFLAGS)]>, TB;
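+// (Concretely: with a register index, the bit offset is not reduced modulo
+// the operand size, so "bt %rax, (%rdi)" may touch a quadword other than
+// the one at (%rdi); the immediate form below takes its offset modulo 64
+// and never does.)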
+
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt (loadi64 addr:$src1), i64immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+// Conditional moves
+let Uses = [EFLAGS], isTwoAddress = 1 in {
+let isCommutable = 1 in {
+def CMOVB64rr : RI<0x42, MRMSrcReg, // if <u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_B, EFLAGS))]>, TB;
+def CMOVAE64rr: RI<0x43, MRMSrcReg, // if >=u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_AE, EFLAGS))]>, TB;
+def CMOVE64rr : RI<0x44, MRMSrcReg, // if ==, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_E, EFLAGS))]>, TB;
+def CMOVNE64rr: RI<0x45, MRMSrcReg, // if !=, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NE, EFLAGS))]>, TB;
+def CMOVBE64rr: RI<0x46, MRMSrcReg, // if <=u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_BE, EFLAGS))]>, TB;
+def CMOVA64rr : RI<0x47, MRMSrcReg, // if >u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_A, EFLAGS))]>, TB;
+def CMOVL64rr : RI<0x4C, MRMSrcReg, // if <s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_L, EFLAGS))]>, TB;
+def CMOVGE64rr: RI<0x4D, MRMSrcReg, // if >=s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_GE, EFLAGS))]>, TB;
+def CMOVLE64rr: RI<0x4E, MRMSrcReg, // if <=s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_LE, EFLAGS))]>, TB;
+def CMOVG64rr : RI<0x4F, MRMSrcReg, // if >s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_G, EFLAGS))]>, TB;
+def CMOVS64rr : RI<0x48, MRMSrcReg, // if signed, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_S, EFLAGS))]>, TB;
+def CMOVNS64rr: RI<0x49, MRMSrcReg, // if !signed, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NS, EFLAGS))]>, TB;
+def CMOVP64rr : RI<0x4A, MRMSrcReg, // if parity, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_P, EFLAGS))]>, TB;
+def CMOVNP64rr : RI<0x4B, MRMSrcReg, // if !parity, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NP, EFLAGS))]>, TB;
+def CMOVO64rr : RI<0x40, MRMSrcReg, // if overflow, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_O, EFLAGS))]>, TB;
+def CMOVNO64rr : RI<0x41, MRMSrcReg, // if !overflow, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NO, EFLAGS))]>, TB;
+} // isCommutable = 1
+
+def CMOVB64rm : RI<0x42, MRMSrcMem, // if <u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_B, EFLAGS))]>, TB;
+def CMOVAE64rm: RI<0x43, MRMSrcMem, // if >=u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_AE, EFLAGS))]>, TB;
+def CMOVE64rm : RI<0x44, MRMSrcMem, // if ==, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_E, EFLAGS))]>, TB;
+def CMOVNE64rm: RI<0x45, MRMSrcMem, // if !=, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NE, EFLAGS))]>, TB;
+def CMOVBE64rm: RI<0x46, MRMSrcMem, // if <=u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_BE, EFLAGS))]>, TB;
+def CMOVA64rm : RI<0x47, MRMSrcMem, // if >u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_A, EFLAGS))]>, TB;
+def CMOVL64rm : RI<0x4C, MRMSrcMem, // if <s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_L, EFLAGS))]>, TB;
+def CMOVGE64rm: RI<0x4D, MRMSrcMem, // if >=s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_GE, EFLAGS))]>, TB;
+def CMOVLE64rm: RI<0x4E, MRMSrcMem, // if <=s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_LE, EFLAGS))]>, TB;
+def CMOVG64rm : RI<0x4F, MRMSrcMem, // if >s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_G, EFLAGS))]>, TB;
+def CMOVS64rm : RI<0x48, MRMSrcMem, // if signed, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_S, EFLAGS))]>, TB;
+def CMOVNS64rm: RI<0x49, MRMSrcMem, // if !signed, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NS, EFLAGS))]>, TB;
+def CMOVP64rm : RI<0x4A, MRMSrcMem, // if parity, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_P, EFLAGS))]>, TB;
+def CMOVNP64rm : RI<0x4B, MRMSrcMem, // if !parity, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NP, EFLAGS))]>, TB;
+def CMOVO64rm : RI<0x40, MRMSrcMem, // if overflow, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_O, EFLAGS))]>, TB;
+def CMOVNO64rm : RI<0x41, MRMSrcMem, // if !overflow, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NO, EFLAGS))]>, TB;
+} // isTwoAddress
+
+//===----------------------------------------------------------------------===//
+// Conversion Instructions...
+//
+
+// f64 -> signed i64
+def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvtsd2si64 VR128:$src))]>;
+def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f128mem:$src),
+ "cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (int_x86_sse2_cvtsd2si64
+ (load addr:$src)))]>;
+def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f64mem:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvttsd2si64 VR128:$src))]>;
+def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f128mem:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvttsd2si64
+ (load addr:$src)))]>;
+
+// Signed i64 -> f64
+def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR64:$src2),
+ "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsi642sd VR128:$src1,
+ GR64:$src2))]>;
+def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsi642sd VR128:$src1,
+ (loadi64 addr:$src2)))]>;
+} // isTwoAddress
+
+// Signed i64 -> f32
+def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR64:$src),
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i64mem:$src),
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+ def Int_CVTSI2SS64rr : RSSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR64:$src2),
+ "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse_cvtsi642ss VR128:$src1,
+ GR64:$src2))]>;
+ def Int_CVTSI2SS64rm : RSSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse_cvtsi642ss VR128:$src1,
+ (loadi64 addr:$src2)))]>;
+}
+
+// f32 -> signed i64
+def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvtss2si64 VR128:$src))]>;
+def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (int_x86_sse_cvtss2si64
+ (load addr:$src)))]>;
+def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvttss2si64 VR128:$src))]>;
+def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvttss2si64 (load addr:$src)))]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor. Use xorl instead of xorq; it's
+// equivalent due to implicit zero-extending, and it sometimes has a smaller
+// encoding.
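+// For example, "xorl %eax, %eax" encodes in 2 bytes (31 C0), while
+// "xorq %rax, %rax" needs a REX.W prefix (48 31 C0); since a 32-bit write
+// implicitly zeroes bits 63:32, the shorter form gives the same result.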
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: AddedComplexity gives MOV64r0 a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let Defs = [EFLAGS], AddedComplexity = 1,
+ isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins),
+ "xor{l}\t${dst:subreg32}, ${dst:subreg32}",
+ [(set GR64:$dst, 0)]>;
+
+// Materialize i64 constant where top 32-bits are zero.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, i64immZExt32:$src)]>;
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//===----------------------------------------------------------------------===//
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64imm:$sym),
+ ".byte\t0x66; "
+ "leaq\t${sym:mem}(%rip), %rdi; "
+ ".word\t0x6666; "
+ "rex64; "
+ "call\t__tls_get_addr@PLT",
+ [(X86tlsaddr tglobaltlsaddr:$sym)]>,
+ Requires<[In64BitMode]>;
+
+let AddedComplexity = 5 in
+def MOV64GSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movq\t%gs:$src, $dst",
+ [(set GR64:$dst, (gsload addr:$src))]>, SegGS;
+
+let AddedComplexity = 5 in
+def MOV64FSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movq\t%fs:$src, $dst",
+ [(set GR64:$dst, (fsload addr:$src))]>, SegFS;
+
+//===----------------------------------------------------------------------===//
+// Atomic Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [RAX, EFLAGS], Uses = [RAX] in {
+def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
+ "lock\n\t"
+ "cmpxchgq\t$swap,$ptr",
+ [(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
+}
+
+let Constraints = "$val = $dst" in {
+let Defs = [EFLAGS] in
+def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val),
+ "lock\n\t"
+ "xadd\t$val, $ptr",
+ [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>,
+ TB, LOCK;
+def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val),
+ "xchg\t$val, $ptr",
+ [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>;
+}
+
+// Atomic exchange, and, or, xor
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+ usesCustomDAGSchedInserter = 1 in {
+def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMAND64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>;
+def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMOR64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>;
+def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMXOR64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>;
+def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMNAND64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>;
+def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val),
+ "#ATOMMIN64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>;
+def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMMAX64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>;
+def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMUMIN64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>;
+def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMUMAX64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri tconstpool :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri tjumptable :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri tglobaladdr :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri texternalsym:$dst)>, Requires<[NotSmallCode]>;
+
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tconstpool:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tjumptable:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, texternalsym:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+
+// Calls
+// Direct PC-relative function call for the small code model: a 32-bit
+// displacement sign-extended to 64 bits.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86tailcall (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall GR64:$dst),
+ (CALL64r GR64:$dst)>;
+
+
+// tailcall stuff
+def : Pat<(X86tailcall GR32:$dst),
+ (TAILCALL)>;
+def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
+ (TAILCALL)>;
+def : Pat<(X86tailcall (i64 texternalsym:$dst)),
+ (TAILCALL)>;
+
+def : Pat<(X86tcret GR64:$dst, imm:$off),
+ (TCRETURNri64 GR64:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+ (TCRETURNdi64 texternalsym:$dst, imm:$off)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
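+// For example, "testq %rax, %rax" is 3 bytes (48 85 C0) versus 4 bytes for
+// "cmpq $0, %rax" (48 83 F8 00), and both set ZF/SF/PF identically for a
+// compare against zero.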
+def : Pat<(parallel (X86cmp GR64:$src1, 0), (implicit EFLAGS)),
+ (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// Conditional moves with folded loads, with the operands swapped and the
+// condition inverted.
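+// For example, (X86cmov (loadi64 m), r, X86_COND_B, EFLAGS) selects
+// CMOVAE64rm r, m: cmov can fold a load only in its second operand, and
+// inverting the condition keeps the selected value the same.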
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO64rm GR64:$src2, addr:$src1)>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload
+// When extloading from 16-bit and smaller memory locations into 64-bit registers,
+// use zero-extending loads so that the entire 64-bit register is defined, avoiding
+// partial-register updates.
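+// For example, the i8 case selects MOVZX64rm8 ("movzbq (%rdi), %rax"),
+// which writes all 64 bits of the destination, so no later use has to
+// merge with stale upper bits.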
+def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i32 addr:$src),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (MOV32rm addr:$src),
+ x86_subreg_32bit)>;
+def : Pat<(extloadi16i1 addr:$src),
+ (INSERT_SUBREG (i16 (IMPLICIT_DEF)), (MOV8rm addr:$src),
+ x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(extloadi16i8 addr:$src),
+ (INSERT_SUBREG (i16 (IMPLICIT_DEF)), (MOV8rm addr:$src),
+ x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+
+// anyext
+def : Pat<(i64 (anyext GR8:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR16:$src, x86_subreg_16bit)>;
+def : Pat<(i64 (anyext GR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, x86_subreg_32bit)>;
+def : Pat<(i16 (anyext GR8:$src)),
+ (INSERT_SUBREG (i16 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext GR8:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
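+// For example, "addq $128, %rdi" needs the imm32 form (REX.W 81 /0 id,
+// 7 bytes), while "subq $-128, %rdi" fits the imm8 form (REX.W 83 /5 ib,
+// 4 bytes) and produces the same value (only EFLAGS differ, and these
+// patterns match flag-dead adds).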
+def : Pat<(add GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+ (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+ (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+ (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+ (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+ (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, x86_subreg_8bit))>,
+ Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, x86_subreg_8bit)))>,
+ Requires<[In64BitMode]>;
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR64:$src, i32),
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+ (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+ (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit))>,
+ Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)))>,
+ Requires<[In64BitMode]>;
+
+// trunc patterns
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
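+// Background: an instruction carrying any REX prefix cannot encode
+// AH/BH/CH/DH (those encodings mean SPL/BPL/SIL/DIL instead), hence the
+// _NOREX instruction variants used below.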
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR64:$src, GR64_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl_su GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_16bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR64:$src, GR64_ABCD),
+ x86_subreg_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// (shl x (and y, 63)) ==> (shl x, y)
+def : Pat<(shl GR64:$src1, (and CL:$amt, 63)),
+ (SHL64rCL GR64:$src1)>;
+def : Pat<(store (shl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+ (SHL64mCL addr:$dst)>;
+
+def : Pat<(srl GR64:$src1, (and CL:$amt, 63)),
+ (SHR64rCL GR64:$src1)>;
+def : Pat<(store (srl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+ (SHR64mCL addr:$dst)>;
+
+def : Pat<(sra GR64:$src1, (and CL:$amt, 63)),
+ (SAR64rCL GR64:$src1)>;
+def : Pat<(store (sra (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+ (SAR64mCL addr:$dst)>;
+
+// (or (x >> c) | (y << (64 - c))) ==> (shrd64 x, y, c)
+def : Pat<(or (srl GR64:$src1, CL:$amt),
+ (shl GR64:$src2, (sub 64, CL:$amt))),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (srl (loadi64 addr:$dst), CL:$amt),
+ (shl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+ (SHRD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(or (srl GR64:$src1, (i8 (trunc RCX:$amt))),
+ (shl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (srl (loadi64 addr:$dst), (i8 (trunc RCX:$amt))),
+ (shl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ addr:$dst),
+ (SHRD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(shrd GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+ (SHRD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi64 addr:$dst), (i8 imm:$amt1),
+ GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
+
+// (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+def : Pat<(or (shl GR64:$src1, CL:$amt),
+ (srl GR64:$src2, (sub 64, CL:$amt))),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (shl (loadi64 addr:$dst), CL:$amt),
+ (srl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+ (SHLD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(or (shl GR64:$src1, (i8 (trunc RCX:$amt))),
+ (srl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (shl (loadi64 addr:$dst), (i8 (trunc RCX:$amt))),
+ (srl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ addr:$dst),
+ (SHLD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(shld GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+ (SHLD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1),
+ GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
+
+// X86 specific add which produces a flag.
+def : Pat<(addc GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(addc GR64:$src1, (load addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+def : Pat<(subc GR64:$src1, GR64:$src2),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(subc GR64:$src1, (load addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt32:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// Register-Register Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Addition with EFLAGS result
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), GR64:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Memory Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+
+// Register-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Memory-Register Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst), GR64:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB64mr addr:$dst, GR64:$src2)>;
+
+// Memory-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Memory Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
+
+// Register-Integer Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Memory-Integer Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// INC and DEC with EFLAGS result. Note that these do not set CF.
+def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)),
+ (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i16 (X86inc_flag (loadi16 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC64_16m addr:$dst)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)),
+ (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i16 (X86dec_flag (loadi16 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC64_16m addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)),
+ (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i32 (X86inc_flag (loadi32 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC64_32m addr:$dst)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)),
+ (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i32 (X86dec_flag (loadi32 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC64_32m addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR64:$src), (implicit EFLAGS)),
+ (INC64r GR64:$src)>;
+def : Pat<(parallel (store (i64 (X86inc_flag (loadi64 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC64m addr:$dst)>;
+def : Pat<(parallel (X86dec_flag GR64:$src), (implicit EFLAGS)),
+ (DEC64r GR64:$src)>;
+def : Pat<(parallel (store (i64 (X86dec_flag (loadi64 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC64m addr:$dst)>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE Instructions
+//===----------------------------------------------------------------------===//
+
+// Move instructions...
+
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>;
+def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ (iPTR 0)))]>;
+
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>;
+def MOV64toSDrm : RPDI<0x6E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+
+def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR64:$dst,
+ (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+ addr:$dst)]>, OpSize, REX_W;
+}
+
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+
+let isTwoAddress = 1 in {
+ multiclass SS41I_insert64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+ OpSize, REX_W;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+ imm:$src3)))]>, OpSize, REX_W;
+ }
+}
+
+defm PINSRQ : SS41I_insert64<0x22, "pinsrq">;
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..39504cd
--- /dev/null
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,168 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call. X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier-to-use interface makes sense. Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
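+// For example, the address [EBX + 4*ECX + 12] is expressed as Base=EBX,
+// Scale=4, Index=ECX, Displacement=12.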
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned Scale;
+ unsigned IndexReg;
+ unsigned Disp;
+ GlobalValue *GV;
+
+ X86AddressMode() : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0) {
+ Base.Reg = 0;
+ }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+inline const MachineInstrBuilder &addDirectMem(const MachineInstrBuilder &MIB,
+ unsigned Reg) {
+ // Because memory references are always represented with four
+ // values, this adds: Reg, [1, NoReg, 0] to the instruction.
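+  // A usage sketch (assuming MBB/MI/DL/TII are in scope and X86::MOV32rm is
+  // the desired opcode):
+  //   addDirectMem(BuildMI(MBB, MI, DL, TII.get(X86::MOV32rm), DestReg),
+  //                X86::EAX);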
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0);
+}
+
+inline const MachineInstrBuilder &addLeaOffset(const MachineInstrBuilder &MIB,
+ int Offset) {
+ return MIB.addImm(1).addReg(0).addImm(Offset);
+}
+
+inline const MachineInstrBuilder &addOffset(const MachineInstrBuilder &MIB,
+ int Offset) {
+ return addLeaOffset(MIB, Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+inline const MachineInstrBuilder &addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill,
+ int Offset) {
+ return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+inline const MachineInstrBuilder &addLeaRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill,
+ int Offset) {
+ return addLeaOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, bool isKill1,
+ unsigned Reg2, bool isKill2) {
+ return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+ .addReg(Reg2, getKillRegState(isKill2)).addImm(0);
+}
+
+inline const MachineInstrBuilder &addLeaAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+  assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+ if (AM.BaseType == X86AddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else if (AM.BaseType == X86AddressMode::FrameIndexBase)
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ else
+    assert(0);
+ MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+ if (AM.GV)
+ return MIB.addGlobalAddress(AM.GV, AM.Disp);
+ else
+ return MIB.addImm(AM.Disp);
+}
+
+inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+ return addLeaAddress(MIB, AM).addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  The
+/// reference keeps the abstract FrameIndex as its base register until frame
+/// lowering resolves it to a concrete register and offset; a constant offset
+/// may be specified as well.
+///
+inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned Flags = 0;
+ if (TID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (TID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand MMO(PseudoSourceValue::getFixedStack(FI),
+ Flags,
+ MFI.getObjectOffset(FI) + Offset,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+ return addOffset(MIB.addFrameIndex(FI), Offset)
+ .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+ unsigned GlobalBaseReg = 0) {
+ //FIXME: factor this
+ return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+ .addConstantPoolIndex(CPI).addReg(0);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..bc7def4
--- /dev/null
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,597 @@
+//==- X86InstrFPStack.td - Describe the X86 Instruction Set --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
+ SDTCisVT<1, f80>]>;
+def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
+ [SDNPHasChain, SDNPInFlag, SDNPMayStore]>;
+def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+ [SDNPHasChain, SDNPOutFlag, SDNPMayLoad]>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+ def FP32_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP32:$src),
+ "##FP32_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP32:$src),
+ "##FP32_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP32:$src),
+ "##FP32_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+ def FP64_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP64:$src),
+ "##FP64_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP64:$src),
+ "##FP64_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP64:$src),
+ "##FP64_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+ def FP80_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP80:$src),
+ "##FP80_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP80:$src),
+ "##FP80_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP80:$src),
+ "##FP80_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+let isTerminator = 1 in
+ let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in
+ def FP_REG_KILL : I<0, Pseudo, (outs), (ins), "##FP_REG_KILL", []>;
+
+// All FP Stack operations are represented with four instructions here. The
+// first three instructions, generated by the instruction selector, use "RFP32"
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values. These sizes apply to the values,
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information. These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The fourth instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
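+// For example, the "add" multiclasses below produce ADD_Fp32/ADD_Fp64/
+// ADD_Fp80 pseudos that carry the selection patterns, plus FPI defs such as
+// ADD_F32m that carry the opcode and asm string; the stackifier rewrites the
+// former into the latter.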
+
+// Pseudo Instructions for FP stack return values.
+def FpGET_ST0_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+def FpGET_ST0_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+def FpGET_ST0_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+
+// FpGET_ST1* should only be issued *after* an FpGET_ST0* has been issued when
+// there are two values live out on the stack from a call or inlineasm. This
+// magic is handled by the stackifier. It is not valid to emit FpGET_ST1* and
+// then FpGET_ST0*. In addition, it is invalid for any FP-using operations to
+// occur between them.
+def FpGET_ST1_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+def FpGET_ST1_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+def FpGET_ST1_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+
+let Defs = [ST0] in {
+def FpSET_ST0_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(0) = FPR
+def FpSET_ST0_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(0) = FPR
+def FpSET_ST0_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(0) = FPR
+}
+
+let Defs = [ST1] in {
+def FpSET_ST1_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(1) = FPR
+def FpSET_ST1_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(1) = FPR
+def FpSET_ST1_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(1) = FPR
+}
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Register copies.  These are just copies; the shortening ones do not truncate.
+let neverHasSideEffects = 1 in {
+ def MOV_Fp3232 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp3264 : FpIf32<(outs RFP64:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp6432 : FpIf32<(outs RFP32:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp6464 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp8032 : FpIf32<(outs RFP32:$dst), (ins RFP80:$src), SpecialFP, []>;
+ def MOV_Fp3280 : FpIf32<(outs RFP80:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp8064 : FpIf64<(outs RFP64:$dst), (ins RFP80:$src), SpecialFP, []>;
+ def MOV_Fp6480 : FpIf64<(outs RFP80:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp8080 : FpI_ <(outs RFP80:$dst), (ins RFP80:$src), SpecialFP, []>;
+}
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+ [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+ [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+ [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+ !strconcat("f", !strconcat(asmstring, "{s}\t$src"))> { let mayLoad = 1; }
+def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+ !strconcat("f", !strconcat(asmstring, "{l}\t$src"))> { let mayLoad = 1; }
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+ !strconcat("fi", !strconcat(asmstring, "{s}\t$src"))> { let mayLoad = 1; }
+def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+ !strconcat("fi", !strconcat(asmstring, "{l}\t$src"))> { let mayLoad = 1; }
+}
+
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub, MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+
+class FPST0rInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
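+// In the asm strings, "{a|b}" picks the AT&T spelling (first alternative) or
+// the Intel spelling (second); e.g. SUBR_FrST0 below prints as "fsub" in
+// AT&T output but "fsubr" in Intel output.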
+def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">;
+def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, %ST(0)}">;
+def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">;
+def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">;
+def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
+def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">;
+def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
+def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">;
+def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, %ST(0)}">;
+def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">;
+def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">;
+def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
+def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">;
+def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
+}
+
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+let neverHasSideEffects = 1 in {
+def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+
+// Floating point cmovs.
+multiclass FPCMov<PatLeaf cc> {
+ def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+ CondMovFP,
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+ cc, EFLAGS))]>;
+ def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+ CondMovFP,
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+ cc, EFLAGS))]>;
+ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+ CondMovFP,
+ [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+ cc, EFLAGS))]>;
+}
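+// The cmov pseudos are two-address: $src1 is tied to $dst, mirroring the
+// hardware fcmov, which always writes ST(0).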
+let Uses = [EFLAGS], isTwoAddress = 1 in {
+defm CMOVB : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE : FPCMov<X86_COND_E>;
+defm CMOVP : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+}
+
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovb\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovbe\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmove\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+                 "fcmovu\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnb\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnbe\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovne\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnu\t{$op, %st(0)|%ST(0), $op}">, DB;
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1, mayHaveSideEffects = 1 in
+ def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+ [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+ [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+ [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+ [(store RFP80:$src, addr:$op)]>;
+let mayStore = 1, neverHasSideEffects = 1 in {
+def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+let mayLoad = 1 in {
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
+}
+let mayStore = 1 in {
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
+}
+
+// FISTTP requires SSE3 even though it's an FPStack op.
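+// Unlike FIST, it truncates toward zero regardless of the rounding mode in
+// the FP control word, matching C's float-to-integer conversion semantics.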
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+
+let mayStore = 1 in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">;
+}
+
+// FP Stack manipulation instructions.
+def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9;
+def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD;
+def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD;
+def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9;
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm1)]>;
+}
+
+def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9;
+
+
+// Floating point compares.
+let Defs = [EFLAGS] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(X86cmp RFP32:$lhs, RFP32:$rhs),
+ (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(X86cmp RFP64:$lhs, RFP64:$rhs),
+ (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(X86cmp RFP80:$lhs, RFP80:$rhs),
+ (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+}
+
+let Defs = [EFLAGS], Uses = [ST0] in {
+def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg),
+ "fucom\t$reg">, DD;
+def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg),
+ "fucomp\t$reg">, DD;
+def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop
+ (outs), (ins),
+ "fucompp">, DA;
+
+def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg),
+ "fucomi\t{$reg, %st(0)|%ST(0), $reg}">, DB;
+def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg),
+ "fucomip\t{$reg, %st(0)|%ST(0), $reg}">, DF;
+}
+
+// Floating point flag ops.
+let Defs = [AX] in
+def FNSTSW8r : I<0xE0, RawFrm, // AX = fp flags
+ (outs), (ins), "fnstsw", []>, DF;
+
+def FNSTCW16m : I<0xD9, MRM7m,                 // [mem16] = X87 control word
+ (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+ [(X86fp_cwd_get16 addr:$dst)]>;
+
+let mayLoad = 1 in
+def FLDCW16m : I<0xD9, MRM5m,                   // X87 control word = [mem16]
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", []>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALLs that return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, RFP80:$src)>;
+
+// Floating point constants -0.0 and -1.0.
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to convert i64 to f64, since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (MOV_Fp3264 RFP32:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (MOV_Fp3280 RFP32:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (MOV_Fp6480 RFP64:$src)>,
+ Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack. We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (MOV_Fp6432 RFP64:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (MOV_Fp8032 RFP80:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (MOV_Fp8064 RFP80:$src)>,
+ Requires<[FPStackf64]>;
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 0000000..eeed5bd
--- /dev/null
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,285 @@
+//===- X86InstrFormats.td - X86 Instruction Formats --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+ bits<6> Value = val;
+}
+
+def Pseudo : Format<0>; def RawFrm : Format<1>;
+def AddRegFrm : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>;
+def MRMSrcMem : Format<6>;
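+// MRM0r-MRM7r and MRM0m-MRM7m mark instructions whose ModR/M 'reg' field is
+// an opcode extension (the /0../7 digit), with a register or memory r/m
+// operand respectively.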
+def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>;
+def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>;
+def MRM6r : Format<22>; def MRM7r : Format<23>;
+def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>;
+def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>;
+def MRM6m : Format<30>; def MRM7m : Format<31>;
+def MRMInitReg : Format<32>;
+
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm16 : ImmType<2>;
+def Imm32 : ImmType<3>;
+def Imm64 : ImmType<4>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class LOCK { bit hasLockPrefix = 1; }
+class SegFS { bits<2> SegOvrBits = 1; }
+class SegGS { bits<2> SegOvrBits = 2; }
+class TB { bits<4> Prefix = 1; }
+class REP { bits<4> Prefix = 2; }
+class D8 { bits<4> Prefix = 3; }
+class D9 { bits<4> Prefix = 4; }
+class DA { bits<4> Prefix = 5; }
+class DB { bits<4> Prefix = 6; }
+class DC { bits<4> Prefix = 7; }
+class DD { bits<4> Prefix = 8; }
+class DE { bits<4> Prefix = 9; }
+class DF { bits<4> Prefix = 10; }
+class XD { bits<4> Prefix = 11; }
+class XS { bits<4> Prefix = 12; }
+class T8 { bits<4> Prefix = 13; }
+class TA { bits<4> Prefix = 14; }
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+ string AsmStr>
+ : Instruction {
+ let Namespace = "X86";
+
+ bits<8> Opcode = opcod;
+ Format Form = f;
+ bits<6> FormBits = Form.Value;
+ ImmType ImmT = i;
+ bits<3> ImmTypeBits = ImmT.Value;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ string AsmString = AsmStr;
+
+ //
+ // Attributes specific to X86 instructions...
+ //
+ bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix?
+ bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix?
+
+ bits<4> Prefix = 0; // Which prefix byte does this inst have?
+  bit hasREX_WPrefix = 0;  // Does this inst require the REX.W prefix?
+ FPFormat FPForm; // What flavor of FP instruction is this?
+ bits<3> FPFormBits = 0;
+ bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
+ bits<2> SegOvrBits = 0; // Segment override prefix.
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+ : X86Inst<o, f, NoImm, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm8 , outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
+ : I<o, F, outs, ins, asm, []> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, outs, ins, ""> {
+ let FPForm = fp; let FPFormBits = FPForm.Value;
+ let Pattern = pattern;
+}
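+// The FPFormat recorded here is what the FP stackifier pass later uses to
+// rewrite these pseudos onto the real x87 register stack.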
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with TB prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>;
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+// PDI - SSE2 instructions with TB and OpSize prefixes.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+//
+// S3I - SSE3 instructions with TB and OpSize prefixes.
+// S3SI - SSE3 instructions with XS prefix.
+// S3DI - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+// SS38I - SSSE3 instructions with T8 prefix.
+// SS3AI - SSSE3 instructions with TA prefix.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit
+// versions use the MMX registers. We put those instructions here because they
+// fit into the SSSE3 instruction category better than the MMX category.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+// SS48I - SSE 4.1 instructions with T8 prefix.
+// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+// SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE42]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii32<o, F, outs, ins, asm, pattern>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm64, outs, ins, asm>, REX_W {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : SSI<o, F, outs, ins, asm, pattern>, REX_W;
+class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : SDI<o, F, outs, ins, asm, pattern>, REX_W;
+class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : PDI<o, F, outs, ins, asm, pattern>, REX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI - MMX instructions with TB prefix.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXRI  - MMX instructions with TB and REX.W prefixes.
+// MMXID - MMX instructions with XD prefix.
+// MMXIS - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
+
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..2cd3733
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,3227 @@
+//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86GenInstrInfo.inc"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetAsmInfo.h"
+
+using namespace llvm;
+
+namespace {
+ cl::opt<bool>
+ NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"));
+ cl::opt<bool>
+ PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+ cl::opt<bool>
+ ReMatPICStubLoad("remat-pic-stub-load",
+ cl::desc("Re-materialize load from stub in PIC mode"),
+ cl::init(false), cl::Hidden);
+}
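+// Being cl::opts, these become ordinary command-line flags in tools that link
+// this backend; e.g. "llc -disable-spill-fusing" turns off spill fusing when
+// debugging the fold tables below.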
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+ : TargetInstrInfoImpl(X86Insts, array_lengthof(X86Insts)),
+ TM(tm), RI(tm, *this) {
+ SmallVector<unsigned,16> AmbEntries;
+ static const unsigned OpTbl2Addr[][2] = {
+ { X86::ADC32ri, X86::ADC32mi },
+ { X86::ADC32ri8, X86::ADC32mi8 },
+ { X86::ADC32rr, X86::ADC32mr },
+ { X86::ADC64ri32, X86::ADC64mi32 },
+ { X86::ADC64ri8, X86::ADC64mi8 },
+ { X86::ADC64rr, X86::ADC64mr },
+ { X86::ADD16ri, X86::ADD16mi },
+ { X86::ADD16ri8, X86::ADD16mi8 },
+ { X86::ADD16rr, X86::ADD16mr },
+ { X86::ADD32ri, X86::ADD32mi },
+ { X86::ADD32ri8, X86::ADD32mi8 },
+ { X86::ADD32rr, X86::ADD32mr },
+ { X86::ADD64ri32, X86::ADD64mi32 },
+ { X86::ADD64ri8, X86::ADD64mi8 },
+ { X86::ADD64rr, X86::ADD64mr },
+ { X86::ADD8ri, X86::ADD8mi },
+ { X86::ADD8rr, X86::ADD8mr },
+ { X86::AND16ri, X86::AND16mi },
+ { X86::AND16ri8, X86::AND16mi8 },
+ { X86::AND16rr, X86::AND16mr },
+ { X86::AND32ri, X86::AND32mi },
+ { X86::AND32ri8, X86::AND32mi8 },
+ { X86::AND32rr, X86::AND32mr },
+ { X86::AND64ri32, X86::AND64mi32 },
+ { X86::AND64ri8, X86::AND64mi8 },
+ { X86::AND64rr, X86::AND64mr },
+ { X86::AND8ri, X86::AND8mi },
+ { X86::AND8rr, X86::AND8mr },
+ { X86::DEC16r, X86::DEC16m },
+ { X86::DEC32r, X86::DEC32m },
+ { X86::DEC64_16r, X86::DEC64_16m },
+ { X86::DEC64_32r, X86::DEC64_32m },
+ { X86::DEC64r, X86::DEC64m },
+ { X86::DEC8r, X86::DEC8m },
+ { X86::INC16r, X86::INC16m },
+ { X86::INC32r, X86::INC32m },
+ { X86::INC64_16r, X86::INC64_16m },
+ { X86::INC64_32r, X86::INC64_32m },
+ { X86::INC64r, X86::INC64m },
+ { X86::INC8r, X86::INC8m },
+ { X86::NEG16r, X86::NEG16m },
+ { X86::NEG32r, X86::NEG32m },
+ { X86::NEG64r, X86::NEG64m },
+ { X86::NEG8r, X86::NEG8m },
+ { X86::NOT16r, X86::NOT16m },
+ { X86::NOT32r, X86::NOT32m },
+ { X86::NOT64r, X86::NOT64m },
+ { X86::NOT8r, X86::NOT8m },
+ { X86::OR16ri, X86::OR16mi },
+ { X86::OR16ri8, X86::OR16mi8 },
+ { X86::OR16rr, X86::OR16mr },
+ { X86::OR32ri, X86::OR32mi },
+ { X86::OR32ri8, X86::OR32mi8 },
+ { X86::OR32rr, X86::OR32mr },
+ { X86::OR64ri32, X86::OR64mi32 },
+ { X86::OR64ri8, X86::OR64mi8 },
+ { X86::OR64rr, X86::OR64mr },
+ { X86::OR8ri, X86::OR8mi },
+ { X86::OR8rr, X86::OR8mr },
+ { X86::ROL16r1, X86::ROL16m1 },
+ { X86::ROL16rCL, X86::ROL16mCL },
+ { X86::ROL16ri, X86::ROL16mi },
+ { X86::ROL32r1, X86::ROL32m1 },
+ { X86::ROL32rCL, X86::ROL32mCL },
+ { X86::ROL32ri, X86::ROL32mi },
+ { X86::ROL64r1, X86::ROL64m1 },
+ { X86::ROL64rCL, X86::ROL64mCL },
+ { X86::ROL64ri, X86::ROL64mi },
+ { X86::ROL8r1, X86::ROL8m1 },
+ { X86::ROL8rCL, X86::ROL8mCL },
+ { X86::ROL8ri, X86::ROL8mi },
+ { X86::ROR16r1, X86::ROR16m1 },
+ { X86::ROR16rCL, X86::ROR16mCL },
+ { X86::ROR16ri, X86::ROR16mi },
+ { X86::ROR32r1, X86::ROR32m1 },
+ { X86::ROR32rCL, X86::ROR32mCL },
+ { X86::ROR32ri, X86::ROR32mi },
+ { X86::ROR64r1, X86::ROR64m1 },
+ { X86::ROR64rCL, X86::ROR64mCL },
+ { X86::ROR64ri, X86::ROR64mi },
+ { X86::ROR8r1, X86::ROR8m1 },
+ { X86::ROR8rCL, X86::ROR8mCL },
+ { X86::ROR8ri, X86::ROR8mi },
+ { X86::SAR16r1, X86::SAR16m1 },
+ { X86::SAR16rCL, X86::SAR16mCL },
+ { X86::SAR16ri, X86::SAR16mi },
+ { X86::SAR32r1, X86::SAR32m1 },
+ { X86::SAR32rCL, X86::SAR32mCL },
+ { X86::SAR32ri, X86::SAR32mi },
+ { X86::SAR64r1, X86::SAR64m1 },
+ { X86::SAR64rCL, X86::SAR64mCL },
+ { X86::SAR64ri, X86::SAR64mi },
+ { X86::SAR8r1, X86::SAR8m1 },
+ { X86::SAR8rCL, X86::SAR8mCL },
+ { X86::SAR8ri, X86::SAR8mi },
+ { X86::SBB32ri, X86::SBB32mi },
+ { X86::SBB32ri8, X86::SBB32mi8 },
+ { X86::SBB32rr, X86::SBB32mr },
+ { X86::SBB64ri32, X86::SBB64mi32 },
+ { X86::SBB64ri8, X86::SBB64mi8 },
+ { X86::SBB64rr, X86::SBB64mr },
+ { X86::SHL16rCL, X86::SHL16mCL },
+ { X86::SHL16ri, X86::SHL16mi },
+ { X86::SHL32rCL, X86::SHL32mCL },
+ { X86::SHL32ri, X86::SHL32mi },
+ { X86::SHL64rCL, X86::SHL64mCL },
+ { X86::SHL64ri, X86::SHL64mi },
+ { X86::SHL8rCL, X86::SHL8mCL },
+ { X86::SHL8ri, X86::SHL8mi },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL },
+ { X86::SHLD16rri8, X86::SHLD16mri8 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL },
+ { X86::SHLD32rri8, X86::SHLD32mri8 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL },
+ { X86::SHLD64rri8, X86::SHLD64mri8 },
+ { X86::SHR16r1, X86::SHR16m1 },
+ { X86::SHR16rCL, X86::SHR16mCL },
+ { X86::SHR16ri, X86::SHR16mi },
+ { X86::SHR32r1, X86::SHR32m1 },
+ { X86::SHR32rCL, X86::SHR32mCL },
+ { X86::SHR32ri, X86::SHR32mi },
+ { X86::SHR64r1, X86::SHR64m1 },
+ { X86::SHR64rCL, X86::SHR64mCL },
+ { X86::SHR64ri, X86::SHR64mi },
+ { X86::SHR8r1, X86::SHR8m1 },
+ { X86::SHR8rCL, X86::SHR8mCL },
+ { X86::SHR8ri, X86::SHR8mi },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL },
+ { X86::SHRD16rri8, X86::SHRD16mri8 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL },
+ { X86::SHRD32rri8, X86::SHRD32mri8 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL },
+ { X86::SHRD64rri8, X86::SHRD64mri8 },
+ { X86::SUB16ri, X86::SUB16mi },
+ { X86::SUB16ri8, X86::SUB16mi8 },
+ { X86::SUB16rr, X86::SUB16mr },
+ { X86::SUB32ri, X86::SUB32mi },
+ { X86::SUB32ri8, X86::SUB32mi8 },
+ { X86::SUB32rr, X86::SUB32mr },
+ { X86::SUB64ri32, X86::SUB64mi32 },
+ { X86::SUB64ri8, X86::SUB64mi8 },
+ { X86::SUB64rr, X86::SUB64mr },
+ { X86::SUB8ri, X86::SUB8mi },
+ { X86::SUB8rr, X86::SUB8mr },
+ { X86::XOR16ri, X86::XOR16mi },
+ { X86::XOR16ri8, X86::XOR16mi8 },
+ { X86::XOR16rr, X86::XOR16mr },
+ { X86::XOR32ri, X86::XOR32mi },
+ { X86::XOR32ri8, X86::XOR32mi8 },
+ { X86::XOR32rr, X86::XOR32mr },
+ { X86::XOR64ri32, X86::XOR64mi32 },
+ { X86::XOR64ri8, X86::XOR64mi8 },
+ { X86::XOR64rr, X86::XOR64mr },
+ { X86::XOR8ri, X86::XOR8mi },
+ { X86::XOR8rr, X86::XOR8mr }
+ };
+
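+  // Each table is registered in both directions: RegOp2MemOpTable* maps a
+  // register-form opcode to its memory form for folding, and MemOp2RegOpTable
+  // maps back for unfolding. AuxInfo packs the folded operand index in its
+  // low four bits, a folded-load flag in bit 4 and a folded-store flag in
+  // bit 5.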
+ for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
+ unsigned RegOp = OpTbl2Addr[i][0];
+ unsigned MemOp = OpTbl2Addr[i][1];
+ if (!RegOp2MemOpTable2Addr.insert(std::make_pair((unsigned*)RegOp,
+ MemOp)).second)
+ assert(false && "Duplicated entries?");
+    unsigned AuxInfo = 0 | (1 << 4) | (1 << 5); // Index 0, folded load and store
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp,
+ AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+ // If the third value is 1, then it's folding either a load or a store.
+ static const unsigned OpTbl0[][3] = {
+ { X86::BT16ri8, X86::BT16mi8, 1 },
+ { X86::BT32ri8, X86::BT32mi8, 1 },
+ { X86::BT64ri8, X86::BT64mi8, 1 },
+ { X86::CALL32r, X86::CALL32m, 1 },
+ { X86::CALL64r, X86::CALL64m, 1 },
+ { X86::CMP16ri, X86::CMP16mi, 1 },
+ { X86::CMP16ri8, X86::CMP16mi8, 1 },
+ { X86::CMP16rr, X86::CMP16mr, 1 },
+ { X86::CMP32ri, X86::CMP32mi, 1 },
+ { X86::CMP32ri8, X86::CMP32mi8, 1 },
+ { X86::CMP32rr, X86::CMP32mr, 1 },
+ { X86::CMP64ri32, X86::CMP64mi32, 1 },
+ { X86::CMP64ri8, X86::CMP64mi8, 1 },
+ { X86::CMP64rr, X86::CMP64mr, 1 },
+ { X86::CMP8ri, X86::CMP8mi, 1 },
+ { X86::CMP8rr, X86::CMP8mr, 1 },
+ { X86::DIV16r, X86::DIV16m, 1 },
+ { X86::DIV32r, X86::DIV32m, 1 },
+ { X86::DIV64r, X86::DIV64m, 1 },
+ { X86::DIV8r, X86::DIV8m, 1 },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, 0 },
+ { X86::FsMOVAPDrr, X86::MOVSDmr, 0 },
+ { X86::FsMOVAPSrr, X86::MOVSSmr, 0 },
+ { X86::IDIV16r, X86::IDIV16m, 1 },
+ { X86::IDIV32r, X86::IDIV32m, 1 },
+ { X86::IDIV64r, X86::IDIV64m, 1 },
+ { X86::IDIV8r, X86::IDIV8m, 1 },
+ { X86::IMUL16r, X86::IMUL16m, 1 },
+ { X86::IMUL32r, X86::IMUL32m, 1 },
+ { X86::IMUL64r, X86::IMUL64m, 1 },
+ { X86::IMUL8r, X86::IMUL8m, 1 },
+ { X86::JMP32r, X86::JMP32m, 1 },
+ { X86::JMP64r, X86::JMP64m, 1 },
+ { X86::MOV16ri, X86::MOV16mi, 0 },
+ { X86::MOV16rr, X86::MOV16mr, 0 },
+ { X86::MOV32ri, X86::MOV32mi, 0 },
+ { X86::MOV32rr, X86::MOV32mr, 0 },
+ { X86::MOV64ri32, X86::MOV64mi32, 0 },
+ { X86::MOV64rr, X86::MOV64mr, 0 },
+ { X86::MOV8ri, X86::MOV8mi, 0 },
+ { X86::MOV8rr, X86::MOV8mr, 0 },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDmr, 0 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, 0 },
+ { X86::MOVDQArr, X86::MOVDQAmr, 0 },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0 },
+ { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0 },
+ { X86::MOVPS2SSrr, X86::MOVPS2SSmr, 0 },
+ { X86::MOVSDrr, X86::MOVSDmr, 0 },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, 0 },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, 0 },
+ { X86::MOVSSrr, X86::MOVSSmr, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDmr, 0 },
+ { X86::MOVUPSrr, X86::MOVUPSmr, 0 },
+ { X86::MUL16r, X86::MUL16m, 1 },
+ { X86::MUL32r, X86::MUL32m, 1 },
+ { X86::MUL64r, X86::MUL64m, 1 },
+ { X86::MUL8r, X86::MUL8m, 1 },
+ { X86::SETAEr, X86::SETAEm, 0 },
+ { X86::SETAr, X86::SETAm, 0 },
+ { X86::SETBEr, X86::SETBEm, 0 },
+ { X86::SETBr, X86::SETBm, 0 },
+ { X86::SETEr, X86::SETEm, 0 },
+ { X86::SETGEr, X86::SETGEm, 0 },
+ { X86::SETGr, X86::SETGm, 0 },
+ { X86::SETLEr, X86::SETLEm, 0 },
+ { X86::SETLr, X86::SETLm, 0 },
+ { X86::SETNEr, X86::SETNEm, 0 },
+ { X86::SETNOr, X86::SETNOm, 0 },
+ { X86::SETNPr, X86::SETNPm, 0 },
+ { X86::SETNSr, X86::SETNSm, 0 },
+ { X86::SETOr, X86::SETOm, 0 },
+ { X86::SETPr, X86::SETPm, 0 },
+ { X86::SETSr, X86::SETSm, 0 },
+ { X86::TAILJMPr, X86::TAILJMPm, 1 },
+ { X86::TEST16ri, X86::TEST16mi, 1 },
+ { X86::TEST32ri, X86::TEST32mi, 1 },
+ { X86::TEST64ri32, X86::TEST64mi32, 1 },
+ { X86::TEST8ri, X86::TEST8mi, 1 }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
+ unsigned RegOp = OpTbl0[i][0];
+ unsigned MemOp = OpTbl0[i][1];
+ if (!RegOp2MemOpTable0.insert(std::make_pair((unsigned*)RegOp,
+ MemOp)).second)
+ assert(false && "Duplicated entries?");
+ unsigned FoldedLoad = OpTbl0[i][2];
+ // Index 0, folded load or store.
+ unsigned AuxInfo = 0 | (FoldedLoad << 4) | ((FoldedLoad^1) << 5);
+ if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+ static const unsigned OpTbl1[][2] = {
+ { X86::CMP16rr, X86::CMP16rm },
+ { X86::CMP32rr, X86::CMP32rm },
+ { X86::CMP64rr, X86::CMP64rm },
+ { X86::CMP8rr, X86::CMP8rm },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm },
+ { X86::FsMOVAPDrr, X86::MOVSDrm },
+ { X86::FsMOVAPSrr, X86::MOVSSrm },
+ { X86::IMUL16rri, X86::IMUL16rmi },
+ { X86::IMUL16rri8, X86::IMUL16rmi8 },
+ { X86::IMUL32rri, X86::IMUL32rmi },
+ { X86::IMUL32rri8, X86::IMUL32rmi8 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm },
+ { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm },
+ { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm },
+ { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm },
+ { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm },
+ { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm },
+ { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm },
+ { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm },
+ { X86::Int_CVTSD2SIrr, X86::Int_CVTSD2SIrm },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm },
+ { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm },
+ { X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm },
+ { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm },
+ { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm },
+ { X86::MOV16rr, X86::MOV16rm },
+ { X86::MOV32rr, X86::MOV32rm },
+ { X86::MOV64rr, X86::MOV64rm },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm },
+ { X86::MOV8rr, X86::MOV8rm },
+ { X86::MOVAPDrr, X86::MOVAPDrm },
+ { X86::MOVAPSrr, X86::MOVAPSrm },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm },
+ { X86::MOVDQArr, X86::MOVDQArm },
+ { X86::MOVSD2PDrr, X86::MOVSD2PDrm },
+ { X86::MOVSDrr, X86::MOVSDrm },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm },
+ { X86::MOVSS2PSrr, X86::MOVSS2PSrm },
+ { X86::MOVSSrr, X86::MOVSSrm },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8 },
+ { X86::MOVUPDrr, X86::MOVUPDrm },
+ { X86::MOVUPSrr, X86::MOVUPSrm },
+ { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm },
+ { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm },
+ { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16 },
+ { X86::MOVZX64rr32, X86::MOVZX64rm32 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8 },
+ { X86::PSHUFDri, X86::PSHUFDmi },
+ { X86::PSHUFHWri, X86::PSHUFHWmi },
+ { X86::PSHUFLWri, X86::PSHUFLWmi },
+ { X86::RCPPSr, X86::RCPPSm },
+ { X86::RCPPSr_Int, X86::RCPPSm_Int },
+ { X86::RSQRTPSr, X86::RSQRTPSm },
+ { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int },
+ { X86::RSQRTSSr, X86::RSQRTSSm },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int },
+ { X86::SQRTPDr, X86::SQRTPDm },
+ { X86::SQRTPDr_Int, X86::SQRTPDm_Int },
+ { X86::SQRTPSr, X86::SQRTPSm },
+ { X86::SQRTPSr_Int, X86::SQRTPSm_Int },
+ { X86::SQRTSDr, X86::SQRTSDm },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int },
+ { X86::SQRTSSr, X86::SQRTSSm },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int },
+ { X86::TEST16rr, X86::TEST16rm },
+ { X86::TEST32rr, X86::TEST32rm },
+ { X86::TEST64rr, X86::TEST64rm },
+ { X86::TEST8rr, X86::TEST8rm },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::UCOMISDrr, X86::UCOMISDrm },
+ { X86::UCOMISSrr, X86::UCOMISSrm }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
+ unsigned RegOp = OpTbl1[i][0];
+ unsigned MemOp = OpTbl1[i][1];
+ if (!RegOp2MemOpTable1.insert(std::make_pair((unsigned*)RegOp,
+ MemOp)).second)
+ assert(false && "Duplicated entries?");
+ unsigned AuxInfo = 1 | (1 << 4); // Index 1, folded load
+ if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+ static const unsigned OpTbl2[][2] = {
+ { X86::ADC32rr, X86::ADC32rm },
+ { X86::ADC64rr, X86::ADC64rm },
+ { X86::ADD16rr, X86::ADD16rm },
+ { X86::ADD32rr, X86::ADD32rm },
+ { X86::ADD64rr, X86::ADD64rm },
+ { X86::ADD8rr, X86::ADD8rm },
+ { X86::ADDPDrr, X86::ADDPDrm },
+ { X86::ADDPSrr, X86::ADDPSrm },
+ { X86::ADDSDrr, X86::ADDSDrm },
+ { X86::ADDSSrr, X86::ADDSSrm },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm },
+ { X86::AND16rr, X86::AND16rm },
+ { X86::AND32rr, X86::AND32rm },
+ { X86::AND64rr, X86::AND64rm },
+ { X86::AND8rr, X86::AND8rm },
+ { X86::ANDNPDrr, X86::ANDNPDrm },
+ { X86::ANDNPSrr, X86::ANDNPSrm },
+ { X86::ANDPDrr, X86::ANDPDrm },
+ { X86::ANDPSrr, X86::ANDPSrm },
+ { X86::CMOVA16rr, X86::CMOVA16rm },
+ { X86::CMOVA32rr, X86::CMOVA32rm },
+ { X86::CMOVA64rr, X86::CMOVA64rm },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm },
+ { X86::CMOVB16rr, X86::CMOVB16rm },
+ { X86::CMOVB32rr, X86::CMOVB32rm },
+ { X86::CMOVB64rr, X86::CMOVB64rm },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm },
+ { X86::CMOVE16rr, X86::CMOVE16rm },
+ { X86::CMOVE32rr, X86::CMOVE32rm },
+ { X86::CMOVE64rr, X86::CMOVE64rm },
+ { X86::CMOVG16rr, X86::CMOVG16rm },
+ { X86::CMOVG32rr, X86::CMOVG32rm },
+ { X86::CMOVG64rr, X86::CMOVG64rm },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm },
+ { X86::CMOVL16rr, X86::CMOVL16rm },
+ { X86::CMOVL32rr, X86::CMOVL32rm },
+ { X86::CMOVL64rr, X86::CMOVL64rm },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm },
+ { X86::CMOVO16rr, X86::CMOVO16rm },
+ { X86::CMOVO32rr, X86::CMOVO32rm },
+ { X86::CMOVO64rr, X86::CMOVO64rm },
+ { X86::CMOVP16rr, X86::CMOVP16rm },
+ { X86::CMOVP32rr, X86::CMOVP32rm },
+ { X86::CMOVP64rr, X86::CMOVP64rm },
+ { X86::CMOVS16rr, X86::CMOVS16rm },
+ { X86::CMOVS32rr, X86::CMOVS32rm },
+ { X86::CMOVS64rr, X86::CMOVS64rm },
+ { X86::CMPPDrri, X86::CMPPDrmi },
+ { X86::CMPPSrri, X86::CMPPSrmi },
+ { X86::CMPSDrr, X86::CMPSDrm },
+ { X86::CMPSSrr, X86::CMPSSrm },
+ { X86::DIVPDrr, X86::DIVPDrm },
+ { X86::DIVPSrr, X86::DIVPSrm },
+ { X86::DIVSDrr, X86::DIVSDrm },
+ { X86::DIVSSrr, X86::DIVSSrm },
+ { X86::FsANDNPDrr, X86::FsANDNPDrm },
+ { X86::FsANDNPSrr, X86::FsANDNPSrm },
+ { X86::FsANDPDrr, X86::FsANDPDrm },
+ { X86::FsANDPSrr, X86::FsANDPSrm },
+ { X86::FsORPDrr, X86::FsORPDrm },
+ { X86::FsORPSrr, X86::FsORPSrm },
+ { X86::FsXORPDrr, X86::FsXORPDrm },
+ { X86::FsXORPSrr, X86::FsXORPSrm },
+ { X86::HADDPDrr, X86::HADDPDrm },
+ { X86::HADDPSrr, X86::HADDPSrm },
+ { X86::HSUBPDrr, X86::HSUBPDrm },
+ { X86::HSUBPSrr, X86::HSUBPSrm },
+ { X86::IMUL16rr, X86::IMUL16rm },
+ { X86::IMUL32rr, X86::IMUL32rm },
+ { X86::IMUL64rr, X86::IMUL64rm },
+ { X86::MAXPDrr, X86::MAXPDrm },
+ { X86::MAXPDrr_Int, X86::MAXPDrm_Int },
+ { X86::MAXPSrr, X86::MAXPSrm },
+ { X86::MAXPSrr_Int, X86::MAXPSrm_Int },
+ { X86::MAXSDrr, X86::MAXSDrm },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int },
+ { X86::MAXSSrr, X86::MAXSSrm },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int },
+ { X86::MINPDrr, X86::MINPDrm },
+ { X86::MINPDrr_Int, X86::MINPDrm_Int },
+ { X86::MINPSrr, X86::MINPSrm },
+ { X86::MINPSrr_Int, X86::MINPSrm_Int },
+ { X86::MINSDrr, X86::MINSDrm },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int },
+ { X86::MINSSrr, X86::MINSSrm },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int },
+ { X86::MULPDrr, X86::MULPDrm },
+ { X86::MULPSrr, X86::MULPSrm },
+ { X86::MULSDrr, X86::MULSDrm },
+ { X86::MULSSrr, X86::MULSSrm },
+ { X86::OR16rr, X86::OR16rm },
+ { X86::OR32rr, X86::OR32rm },
+ { X86::OR64rr, X86::OR64rm },
+ { X86::OR8rr, X86::OR8rm },
+ { X86::ORPDrr, X86::ORPDrm },
+ { X86::ORPSrr, X86::ORPSrm },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm },
+ { X86::PADDBrr, X86::PADDBrm },
+ { X86::PADDDrr, X86::PADDDrm },
+ { X86::PADDQrr, X86::PADDQrm },
+ { X86::PADDSBrr, X86::PADDSBrm },
+ { X86::PADDSWrr, X86::PADDSWrm },
+ { X86::PADDWrr, X86::PADDWrm },
+ { X86::PANDNrr, X86::PANDNrm },
+ { X86::PANDrr, X86::PANDrm },
+ { X86::PAVGBrr, X86::PAVGBrm },
+ { X86::PAVGWrr, X86::PAVGWrm },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm },
+ { X86::PINSRWrri, X86::PINSRWrmi },
+ { X86::PMADDWDrr, X86::PMADDWDrm },
+ { X86::PMAXSWrr, X86::PMAXSWrm },
+ { X86::PMAXUBrr, X86::PMAXUBrm },
+ { X86::PMINSWrr, X86::PMINSWrm },
+ { X86::PMINUBrr, X86::PMINUBrm },
+ { X86::PMULDQrr, X86::PMULDQrm },
+ { X86::PMULHUWrr, X86::PMULHUWrm },
+ { X86::PMULHWrr, X86::PMULHWrm },
+ { X86::PMULLDrr, X86::PMULLDrm },
+ { X86::PMULLDrr_int, X86::PMULLDrm_int },
+ { X86::PMULLWrr, X86::PMULLWrm },
+ { X86::PMULUDQrr, X86::PMULUDQrm },
+ { X86::PORrr, X86::PORrm },
+ { X86::PSADBWrr, X86::PSADBWrm },
+ { X86::PSLLDrr, X86::PSLLDrm },
+ { X86::PSLLQrr, X86::PSLLQrm },
+ { X86::PSLLWrr, X86::PSLLWrm },
+ { X86::PSRADrr, X86::PSRADrm },
+ { X86::PSRAWrr, X86::PSRAWrm },
+ { X86::PSRLDrr, X86::PSRLDrm },
+ { X86::PSRLQrr, X86::PSRLQrm },
+ { X86::PSRLWrr, X86::PSRLWrm },
+ { X86::PSUBBrr, X86::PSUBBrm },
+ { X86::PSUBDrr, X86::PSUBDrm },
+ { X86::PSUBSBrr, X86::PSUBSBrm },
+ { X86::PSUBSWrr, X86::PSUBSWrm },
+ { X86::PSUBWrr, X86::PSUBWrm },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm },
+ { X86::PXORrr, X86::PXORrm },
+ { X86::SBB32rr, X86::SBB32rm },
+ { X86::SBB64rr, X86::SBB64rm },
+ { X86::SHUFPDrri, X86::SHUFPDrmi },
+ { X86::SHUFPSrri, X86::SHUFPSrmi },
+ { X86::SUB16rr, X86::SUB16rm },
+ { X86::SUB32rr, X86::SUB32rm },
+ { X86::SUB64rr, X86::SUB64rm },
+ { X86::SUB8rr, X86::SUB8rm },
+ { X86::SUBPDrr, X86::SUBPDrm },
+ { X86::SUBPSrr, X86::SUBPSrm },
+ { X86::SUBSDrr, X86::SUBSDrm },
+ { X86::SUBSSrr, X86::SUBSSrm },
+ // FIXME: TEST*rr -> swapped operand of TEST*mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm },
+ { X86::XOR16rr, X86::XOR16rm },
+ { X86::XOR32rr, X86::XOR32rm },
+ { X86::XOR64rr, X86::XOR64rm },
+ { X86::XOR8rr, X86::XOR8rm },
+ { X86::XORPDrr, X86::XORPDrm },
+ { X86::XORPSrr, X86::XORPSrm }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
+ unsigned RegOp = OpTbl2[i][0];
+ unsigned MemOp = OpTbl2[i][1];
+ if (!RegOp2MemOpTable2.insert(std::make_pair((unsigned*)RegOp,
+ MemOp)).second)
+ assert(false && "Duplicated entries?");
+ unsigned AuxInfo = 2 | (1 << 4); // Index 2, folded load
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+  // Sanity check: the unfolding map should contain no ambiguous entries.
+ assert(AmbEntries.empty() && "Duplicated entries in unfolding maps?");
+}
+
+bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case X86::MOV8rr:
+ case X86::MOV8rr_NOREX:
+ case X86::MOV16rr:
+ case X86::MOV32rr:
+ case X86::MOV64rr:
+ case X86::MOVSSrr:
+ case X86::MOVSDrr:
+
+ // FP Stack register class copies
+ case X86::MOV_Fp3232: case X86::MOV_Fp6464: case X86::MOV_Fp8080:
+ case X86::MOV_Fp3264: case X86::MOV_Fp3280:
+ case X86::MOV_Fp6432: case X86::MOV_Fp8032:
+
+ case X86::FsMOVAPSrr:
+ case X86::FsMOVAPDrr:
+ case X86::MOVAPSrr:
+ case X86::MOVAPDrr:
+ case X86::MOVDQArr:
+ case X86::MOVSS2PSrr:
+ case X86::MOVSD2PDrr:
+ case X86::MOVPS2SSrr:
+ case X86::MOVPD2SDrr:
+ case X86::MMX_MOVQ64rr:
+ assert(MI.getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "invalid register-register move instruction");
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SrcSubIdx = MI.getOperand(1).getSubReg();
+ DstSubIdx = MI.getOperand(0).getSubReg();
+ return true;
+ }
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
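+    // Only a load whose address is a bare frame index (scale 1, no index
+    // register, zero displacement) is a genuine stack-slot reload.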
+ if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(4).isImm() &&
+ MI->getOperand(2).getImm() == 1 &&
+ MI->getOperand(3).getReg() == 0 &&
+ MI->getOperand(4).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8mr:
+ case X86::MOV16mr:
+ case X86::MOV32mr:
+ case X86::MOV64mr:
+ case X86::ST_FpP64m:
+ case X86::MOVSSmr:
+ case X86::MOVSDmr:
+ case X86::MOVAPSmr:
+ case X86::MOVAPDmr:
+ case X86::MOVDQAmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
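+    // Likewise, only a store to a bare frame index (scale 1, no index
+    // register, zero displacement) is a genuine stack-slot spill.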
+ if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
+ MI->getOperand(2).isReg() && MI->getOperand(3).isImm() &&
+ MI->getOperand(1).getImm() == 1 &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImm() == 0) {
+ FrameIndex = MI->getOperand(0).getIndex();
+ return MI->getOperand(X86AddrNumOperands).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+
+/// regIsPICBase - Return true if register is a PIC base (i.e., defined by
+/// X86::MOVPC32r).
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+ bool isPICBase = false;
+ for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+ E = MRI.def_end(); I != E; ++I) {
+ MachineInstr *DefMI = I.getOperand().getParent();
+ if (DefMI->getOpcode() != X86::MOVPC32r)
+ return false;
+ assert(!isPICBase && "More than one PIC base?");
+ isPICBase = true;
+ }
+ return isPICBase;
+}
+
+/// isGVStub - Return true if the GV requires an extra load to get the
+/// real address.
+static inline bool isGVStub(GlobalValue *GV, X86TargetMachine &TM) {
+ return TM.getSubtarget<X86Subtarget>().GVRequiresExtraLoad(GV, TM, false);
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm: {
+ // Loads from constant pools are trivially rematerializable.
+ if (MI->getOperand(1).isReg() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ (MI->getOperand(4).isCPI() ||
+ (MI->getOperand(4).isGlobal() &&
+ isGVStub(MI->getOperand(4).getGlobal(), TM)))) {
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of PIC load.
+ if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+ return false;
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ // The base register must be a PIC base (defined only by MOVPC32r).
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+
+ case X86::LEA32r:
+ case X86::LEA64r: {
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ !MI->getOperand(4).isReg()) {
+ // lea fi#, lea GV, etc. are all rematerializable.
+ if (!MI->getOperand(1).isReg())
+ return true;
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of lea PICBase + x.
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+ }
+
+ // All other instructions marked M_REMATERIALIZABLE are always trivially
+ // rematerializable.
+ return true;
+}
+
+/// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
+/// that would clobber the EFLAGS condition register. Note the result may be
+/// conservative. If it cannot definitely determine the safety after visiting
+/// two instructions it assumes it's not safe.
+static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ // It's always safe to clobber EFLAGS at the end of a block.
+ if (I == MBB.end())
+ return true;
+
+ // For compile time consideration, if we are not able to determine the
+ // safety after visiting 2 instructions, we will assume it's not safe.
+ for (unsigned i = 0; i < 2; ++i) {
+ bool SeenDef = false;
+ for (unsigned j = 0, e = I->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = I->getOperand(j);
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == X86::EFLAGS) {
+ if (MO.isUse())
+ return false;
+ SeenDef = true;
+ }
+ }
+
+ if (SeenDef)
+ // This instruction defines EFLAGS, no need to look any further.
+ return true;
+ ++I;
+
+ // If we make it to the end of the block, it's safe to clobber EFLAGS.
+ if (I == MBB.end())
+ return true;
+ }
+
+ // Conservative answer.
+ return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg,
+ const MachineInstr *Orig) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ unsigned SubIdx = Orig->getOperand(0).isReg()
+ ? Orig->getOperand(0).getSubReg() : 0;
+ bool ChangeSubIdx = SubIdx != 0;
+ if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) {
+ DestReg = RI.getSubReg(DestReg, SubIdx);
+ SubIdx = 0;
+ }
+
+ // MOV32r0 etc. are implemented with xor which clobbers condition code.
+ // Re-materialize them as movri instructions to avoid side effects.
+ bool Emitted = false;
+ switch (Orig->getOpcode()) {
+ default: break;
+ case X86::MOV8r0:
+ case X86::MOV16r0:
+ case X86::MOV32r0:
+ case X86::MOV64r0: {
+ if (!isSafeToClobberEFLAGS(MBB, I)) {
+ unsigned Opc = 0;
+ switch (Orig->getOpcode()) {
+ default: break;
+ case X86::MOV8r0: Opc = X86::MOV8ri; break;
+ case X86::MOV16r0: Opc = X86::MOV16ri; break;
+ case X86::MOV32r0: Opc = X86::MOV32ri; break;
+ case X86::MOV64r0: Opc = X86::MOV64ri32; break;
+ }
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0);
+ Emitted = true;
+ }
+ break;
+ }
+ }
+
+ if (!Emitted) {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MI->getOperand(0).setReg(DestReg);
+ MBB.insert(I, MI);
+ }
+
+ if (ChangeSubIdx) {
+ MachineInstr *NewMI = prior(I);
+ NewMI->getOperand(0).setSubReg(SubIdx);
+ }
+}
+
+/// isInvariantLoad - Return true if the specified instruction (which is marked
+/// mayLoad) is loading from a location whose value is invariant across the
+/// function. For example, loading a value from the constant pool or from
+/// the argument area of a function if it does not change. This should
+/// only return true if *all* loads the instruction does are invariant (if it
+/// does multiple loads).
+bool X86InstrInfo::isInvariantLoad(const MachineInstr *MI) const {
+ // This code cares about loads from three cases: constant pool entries,
+ // invariant argument slots, and global stubs. In order to handle these cases
+ // for all of the myriad of X86 instructions, we just scan for a CP/FI/GV
+ // operand and base our analysis on it. This is safe because the address of
+ // none of these three cases is ever used as anything other than a load base
+ // and X86 doesn't have any instructions that load from multiple places.
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ // Loads from constant pools are trivially invariant.
+ if (MO.isCPI())
+ return true;
+
+ if (MO.isGlobal())
+ return isGVStub(MO.getGlobal(), TM);
+
+ // If this is a load from an invariant stack slot, the load is a constant.
+ if (MO.isFI()) {
+ const MachineFrameInfo &MFI =
+ *MI->getParent()->getParent()->getFrameInfo();
+ int Idx = MO.getIndex();
+ return MFI.isFixedObjectIndex(Idx) && MFI.isImmutableObjectIndex(Idx);
+ }
+ }
+
+ // All other instances of these instructions are presumed to have other
+ // issues.
+ return false;
+}
+
+/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
+/// is not marked dead.
+static bool hasLiveCondCodeDef(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() &&
+ MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ MachineInstr *MI = MBBI;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ // All instructions input are two-addr instructions. Get the known operands.
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isKill = MI->getOperand(1).isKill();
+
+ MachineInstr *NewMI = NULL;
+ // FIXME: 16-bit LEAs are really slow on Athlons, but not bad on P4s. When
+ // we have better subtarget support, enable the 16-bit LEA generation here.
+ bool DisableLEA16 = true;
+
+ unsigned MIOpc = MI->getOpcode();
+ switch (MIOpc) {
+ case X86::SHUFPSrri: {
+ assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
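+ // When both sources of the shuffle are the same register, shufps can be
+ // rewritten as pshufd, which has independent source and destination
+ // operands (and requires SSE2).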
+ unsigned B = MI->getOperand(1).getReg();
+ unsigned C = MI->getOperand(2).getReg();
+ if (B != C) return 0;
+ unsigned A = MI->getOperand(0).getReg();
+ unsigned M = MI->getOperand(3).getImm();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+ .addReg(A, RegState::Define | getDeadRegState(isDead))
+ .addReg(B, getKillRegState(isKill)).addImm(M);
+ break;
+ }
+ case X86::SHL64ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
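+ // LEA can only encode scales of 1, 2, 4 and 8, so only shift amounts
+ // 1-3 (scale = 1 << ShAmt) can be converted.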
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(0);
+ break;
+ }
+ case X86::SHL32ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::LEA64_32r : X86::LEA32r;
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill)).addImm(0);
+ break;
+ }
+ case X86::SHL16ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ if (DisableLEA16) {
+ // If 16-bit LEA is disabled, use 32-bit LEA via subregisters.
+ MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+ unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::LEA64_32r : X86::LEA32r;
+ unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+
+ // Build and insert into an implicit UNDEF value. This is OK because
+ // we'll be shifting and then extracting the lower 16 bits.
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+ MachineInstr *InsMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg)
+ .addReg(leaInReg)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(X86::SUBREG_16BIT);
+
+ NewMI = BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(Opc), leaOutReg)
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(leaInReg, RegState::Kill)
+ .addImm(0);
+
+ MachineInstr *ExtMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::EXTRACT_SUBREG))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(leaOutReg, RegState::Kill)
+ .addImm(X86::SUBREG_16BIT);
+
+ if (LV) {
+ // Update live variables
+ LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+ LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, InsMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, ExtMI);
+ }
+ return ExtMI;
+ } else {
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(0);
+ }
+ break;
+ }
+ default: {
+ // The following opcodes also set the condition code register(s). Only
+ // convert them to an equivalent LEA if the condition code register defs
+ // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return 0;
+
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ switch (MIOpc) {
+ default: return 0;
+ case X86::INC64r:
+ case X86::INC32r:
+ case X86::INC64_32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, 1);
+ break;
+ }
+ case X86::INC16r:
+ case X86::INC64_16r:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, 1);
+ break;
+ case X86::DEC64r:
+ case X86::DEC32r:
+ case X86::DEC64_32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, -1);
+ break;
+ }
+ case X86::DEC16r:
+ case X86::DEC64_16r:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, -1);
+ break;
+ case X86::ADD64rr:
+ case X86::ADD32rr: {
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = MIOpc == X86::ADD64rr ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, Src2, isKill2);
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD16rr: {
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, Src2, isKill2);
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ if (MI->getOperand(2).isImm())
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ break;
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ if (MI->getOperand(2).isImm()) {
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ }
+ break;
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ if (MI->getOperand(2).isImm())
+ NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ break;
+ case X86::SHL16ri:
+ if (DisableLEA16) return 0;
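+ // FALL THROUGH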
+ case X86::SHL32ri:
+ case X86::SHL64ri: {
+ assert(MI->getNumOperands() >= 3 && MI->getOperand(2).isImm() &&
+ "Unknown shl instruction!");
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 1 || ShAmt == 2 || ShAmt == 3) {
+ X86AddressMode AM;
+ AM.Scale = 1 << ShAmt;
+ AM.IndexReg = Src;
+ unsigned Opc = MIOpc == X86::SHL64ri ? X86::LEA64r
+ : (MIOpc == X86::SHL32ri
+ ? (is64Bit ? X86::LEA64_32r : X86::LEA32r) : X86::LEA16r);
+ NewMI = addFullAddress(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)), AM);
+ if (isKill)
+ NewMI->getOperand(3).setIsKill(true);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ if (!NewMI) return 0;
+
+ if (LV) { // Update live variables
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, NewMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, NewMI);
+ }
+
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *
+X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+ switch (MI->getOpcode()) {
+ case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+ case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+ case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+ case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+ case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+ case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+ unsigned Opc;
+ unsigned Size;
+ switch (MI->getOpcode()) {
+ default: assert(0 && "Unreachable!");
+ case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+ case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+ case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+ case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+ case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+ case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+ }
+ unsigned Amt = MI->getOperand(3).getImm();
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ MI->getOperand(3).setImm(Size-Amt);
+ return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
+ }
+ case X86::CMOVB16rr:
+ case X86::CMOVB32rr:
+ case X86::CMOVB64rr:
+ case X86::CMOVAE16rr:
+ case X86::CMOVAE32rr:
+ case X86::CMOVAE64rr:
+ case X86::CMOVE16rr:
+ case X86::CMOVE32rr:
+ case X86::CMOVE64rr:
+ case X86::CMOVNE16rr:
+ case X86::CMOVNE32rr:
+ case X86::CMOVNE64rr:
+ case X86::CMOVBE16rr:
+ case X86::CMOVBE32rr:
+ case X86::CMOVBE64rr:
+ case X86::CMOVA16rr:
+ case X86::CMOVA32rr:
+ case X86::CMOVA64rr:
+ case X86::CMOVL16rr:
+ case X86::CMOVL32rr:
+ case X86::CMOVL64rr:
+ case X86::CMOVGE16rr:
+ case X86::CMOVGE32rr:
+ case X86::CMOVGE64rr:
+ case X86::CMOVLE16rr:
+ case X86::CMOVLE32rr:
+ case X86::CMOVLE64rr:
+ case X86::CMOVG16rr:
+ case X86::CMOVG32rr:
+ case X86::CMOVG64rr:
+ case X86::CMOVS16rr:
+ case X86::CMOVS32rr:
+ case X86::CMOVS64rr:
+ case X86::CMOVNS16rr:
+ case X86::CMOVNS32rr:
+ case X86::CMOVNS64rr:
+ case X86::CMOVP16rr:
+ case X86::CMOVP32rr:
+ case X86::CMOVP64rr:
+ case X86::CMOVNP16rr:
+ case X86::CMOVNP32rr:
+ case X86::CMOVNP64rr:
+ case X86::CMOVO16rr:
+ case X86::CMOVO32rr:
+ case X86::CMOVO64rr:
+ case X86::CMOVNO16rr:
+ case X86::CMOVNO32rr:
+ case X86::CMOVNO64rr: {
+ unsigned Opc = 0;
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
+ case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
+ case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
+ case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+ case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+ case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+ case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
+ case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
+ case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
+ case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+ case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+ case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+ case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+ case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+ case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+ case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
+ case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
+ case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
+ case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
+ case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
+ case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
+ case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+ case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+ case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+ case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+ case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+ case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+ case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
+ case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
+ case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
+ case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
+ case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
+ case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
+ case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+ case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+ case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+ case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
+ case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
+ case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
+ case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+ case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+ case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+ case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
+ case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
+ case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
+ case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+ case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+ case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+ }
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ // Fallthrough intended.
+ }
+ default:
+ return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
+ }
+}
+
+static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+ switch (BrOpc) {
+ default: return X86::COND_INVALID;
+ case X86::JE: return X86::COND_E;
+ case X86::JNE: return X86::COND_NE;
+ case X86::JL: return X86::COND_L;
+ case X86::JLE: return X86::COND_LE;
+ case X86::JG: return X86::COND_G;
+ case X86::JGE: return X86::COND_GE;
+ case X86::JB: return X86::COND_B;
+ case X86::JBE: return X86::COND_BE;
+ case X86::JA: return X86::COND_A;
+ case X86::JAE: return X86::COND_AE;
+ case X86::JS: return X86::COND_S;
+ case X86::JNS: return X86::COND_NS;
+ case X86::JP: return X86::COND_P;
+ case X86::JNP: return X86::COND_NP;
+ case X86::JO: return X86::COND_O;
+ case X86::JNO: return X86::COND_NO;
+ }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case X86::COND_E: return X86::JE;
+ case X86::COND_NE: return X86::JNE;
+ case X86::COND_L: return X86::JL;
+ case X86::COND_LE: return X86::JLE;
+ case X86::COND_G: return X86::JG;
+ case X86::COND_GE: return X86::JGE;
+ case X86::COND_B: return X86::JB;
+ case X86::COND_BE: return X86::JBE;
+ case X86::COND_A: return X86::JA;
+ case X86::COND_AE: return X86::JAE;
+ case X86::COND_S: return X86::JS;
+ case X86::COND_NS: return X86::JNS;
+ case X86::COND_P: return X86::JP;
+ case X86::COND_NP: return X86::JNP;
+ case X86::COND_O: return X86::JO;
+ case X86::COND_NO: return X86::JNO;
+ }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case X86::COND_E: return X86::COND_NE;
+ case X86::COND_NE: return X86::COND_E;
+ case X86::COND_L: return X86::COND_GE;
+ case X86::COND_LE: return X86::COND_G;
+ case X86::COND_G: return X86::COND_LE;
+ case X86::COND_GE: return X86::COND_L;
+ case X86::COND_B: return X86::COND_AE;
+ case X86::COND_BE: return X86::COND_A;
+ case X86::COND_A: return X86::COND_BE;
+ case X86::COND_AE: return X86::COND_B;
+ case X86::COND_S: return X86::COND_NS;
+ case X86::COND_NS: return X86::COND_S;
+ case X86::COND_P: return X86::COND_NP;
+ case X86::COND_NP: return X86::COND_P;
+ case X86::COND_O: return X86::COND_NO;
+ case X86::COND_NO: return X86::COND_O;
+ }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (TID.isBranch() && !TID.isBarrier())
+ return true;
+ if (!TID.isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+// For purposes of branch analysis do not count FP_REG_KILL as a terminator.
+static bool isBrAnalysisUnpredicatedTerminator(const MachineInstr *MI,
+ const X86InstrInfo &TII) {
+ if (MI->getOpcode() == X86::FP_REG_KILL)
+ return false;
+ return TII.isUnpredicatedTerminator(MI);
+}
+
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ // Working from the bottom, when we see a non-terminator
+ // instruction, we're done.
+ if (!isBrAnalysisUnpredicatedTerminator(I, *this))
+ break;
+ // A terminator that isn't a branch can't easily be handled
+ // by this analysis.
+ if (!I->getDesc().isBranch())
+ return true;
+ // Handle unconditional branches.
+ if (I->getOpcode() == X86::JMP) {
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (next(I) != MBB.end())
+ next(I)->eraseFromParent();
+ Cond.clear();
+ FBB = 0;
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = 0;
+ I->eraseFromParent();
+ I = MBB.end();
+ continue;
+ }
+ // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+ // Handle conditional branches.
+ X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode());
+ if (BranchCode == X86::COND_INVALID)
+ return true; // Can't handle indirect branch.
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ continue;
+ }
+ // Handle subsequent conditional branches. Only handle the case
+ // where all conditional branches branch to the same destination
+ // and their condition opcodes fit one of the special
+ // multi-branch idioms.
+ assert(Cond.size() == 1);
+ assert(TBB);
+ // Only handle the case where all conditional branches branch to
+ // the same destination.
+ if (TBB != I->getOperand(0).getMBB())
+ return true;
+ X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+ // If the conditions are the same, we can leave them alone.
+ if (OldBranchCode == BranchCode)
+ continue;
+ // If they differ, see if they fit one of the known patterns.
+ // Theoretically we could handle more patterns here, but
+ // we shouldn't expect to see them if instruction selection
+ // has done a reasonable job.
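+ // These combined conditions come from floating-point equality tests,
+ // which must also check the parity flag (set on an unordered result).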
+ if ((OldBranchCode == X86::COND_NP &&
+ BranchCode == X86::COND_E) ||
+ (OldBranchCode == X86::COND_E &&
+ BranchCode == X86::COND_NP))
+ BranchCode = X86::COND_NP_OR_E;
+ else if ((OldBranchCode == X86::COND_P &&
+ BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_NE &&
+ BranchCode == X86::COND_P))
+ BranchCode = X86::COND_NE_OR_P;
+ else
+ return true;
+ // Update the MachineOperand.
+ Cond[0].setImm(BranchCode);
+ }
+
+ return false;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->getOpcode() != X86::JMP &&
+ GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc operand
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "X86 branch conditions have one component!");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, dl, get(X86::JMP)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
+ X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+ switch (CC) {
+ case X86::COND_NP_OR_E:
+ // Synthesize NP_OR_E with two branches.
+ BuildMI(&MBB, dl, get(X86::JNP)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, dl, get(X86::JE)).addMBB(TBB);
+ ++Count;
+ break;
+ case X86::COND_NE_OR_P:
+ // Synthesize NE_OR_P with two branches.
+ BuildMI(&MBB, dl, get(X86::JNE)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, dl, get(X86::JP)).addMBB(TBB);
+ ++Count;
+ break;
+ default: {
+ unsigned Opc = GetCondBranchFromCond(CC);
+ BuildMI(&MBB, dl, get(Opc)).addMBB(TBB);
+ ++Count;
+ }
+ }
+ if (FBB) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, dl, get(X86::JMP)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+/// isHReg - Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+ return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ // Determine if DestRC and SrcRC have a common superclass.
+ const TargetRegisterClass *CommonRC = DestRC;
+ if (DestRC == SrcRC)
+ /* Source and destination have the same register class. */;
+ else if (CommonRC->hasSuperClass(SrcRC))
+ CommonRC = SrcRC;
+ else if (!DestRC->hasSubClass(SrcRC))
+ CommonRC = 0;
+
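+ // When one class contains the other, CommonRC is the larger (super) class,
+ // so a move opcode chosen for it is valid for both registers.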
+ if (CommonRC) {
+ unsigned Opc;
+ if (CommonRC == &X86::GR64RegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32RegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16RegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rr_NOREX;
+ else
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rr_NOREX;
+ else
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::RFP32RegClass) {
+ Opc = X86::MOV_Fp3232;
+ } else if (CommonRC == &X86::RFP64RegClass || CommonRC == &X86::RSTRegClass) {
+ Opc = X86::MOV_Fp6464;
+ } else if (CommonRC == &X86::RFP80RegClass) {
+ Opc = X86::MOV_Fp8080;
+ } else if (CommonRC == &X86::FR32RegClass) {
+ Opc = X86::FsMOVAPSrr;
+ } else if (CommonRC == &X86::FR64RegClass) {
+ Opc = X86::FsMOVAPDrr;
+ } else if (CommonRC == &X86::VR128RegClass) {
+ Opc = X86::MOVAPSrr;
+ } else if (CommonRC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64rr;
+ } else {
+ return false;
+ }
+ BuildMI(MBB, MI, DL, get(Opc), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ // Moving EFLAGS to / from another register requires a push and a pop.
+ if (SrcRC == &X86::CCRRegClass) {
+ if (SrcReg != X86::EFLAGS)
+ return false;
+ if (DestRC == &X86::GR64RegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSHFQ));
+ BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+ return true;
+ } else if (DestRC == &X86::GR32RegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSHFD));
+ BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
+ return true;
+ }
+ } else if (DestRC == &X86::CCRRegClass) {
+ if (DestReg != X86::EFLAGS)
+ return false;
+ if (SrcRC == &X86::GR64RegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSH64r)).addReg(SrcReg);
+ BuildMI(MBB, MI, DL, get(X86::POPFQ));
+ return true;
+ } else if (SrcRC == &X86::GR32RegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSH32r)).addReg(SrcReg);
+ BuildMI(MBB, MI, DL, get(X86::POPFD));
+ return true;
+ }
+ }
+
+ // Moving from ST(0) turns into FpGET_ST0_32 etc.
+ if (SrcRC == &X86::RSTRegClass) {
+ // Copying from ST(0)/ST(1).
+ if (SrcReg != X86::ST0 && SrcReg != X86::ST1)
+ // Can only copy from ST(0)/ST(1) right now
+ return false;
+ bool isST0 = SrcReg == X86::ST0;
+ unsigned Opc;
+ if (DestRC == &X86::RFP32RegClass)
+ Opc = isST0 ? X86::FpGET_ST0_32 : X86::FpGET_ST1_32;
+ else if (DestRC == &X86::RFP64RegClass)
+ Opc = isST0 ? X86::FpGET_ST0_64 : X86::FpGET_ST1_64;
+ else {
+ if (DestRC != &X86::RFP80RegClass)
+ return false;
+ Opc = isST0 ? X86::FpGET_ST0_80 : X86::FpGET_ST1_80;
+ }
+ BuildMI(MBB, MI, DL, get(Opc), DestReg);
+ return true;
+ }
+
+ // Moving to ST(0) turns into FpSET_ST0_32 etc.
+ if (DestRC == &X86::RSTRegClass) {
+ // Copying to ST(0) / ST(1).
+ if (DestReg != X86::ST0 && DestReg != X86::ST1)
+ // Can only copy to TOS right now
+ return false;
+ bool isST0 = DestReg == X86::ST0;
+ unsigned Opc;
+ if (SrcRC == &X86::RFP32RegClass)
+ Opc = isST0 ? X86::FpSET_ST0_32 : X86::FpSET_ST1_32;
+ else if (SrcRC == &X86::RFP64RegClass)
+ Opc = isST0 ? X86::FpSET_ST0_64 : X86::FpSET_ST1_64;
+ else {
+ if (SrcRC != &X86::RFP80RegClass)
+ return false;
+ Opc = isST0 ? X86::FpSET_ST0_80 : X86::FpSET_ST1_80;
+ }
+ BuildMI(MBB, MI, DL, get(Opc)).addReg(SrcReg);
+ return true;
+ }
+
+ // Not yet supported!
+ return false;
+}
+
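+/// getStoreRegOpcode - Return the opcode of the plain store instruction for
+/// the given register class, using an aligned SSE store for VR128 only when
+/// the stack is known to be 16-byte aligned.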
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ TargetMachine &TM) {
+ unsigned Opc = 0;
+ if (RC == &X86::GR64RegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32RegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(SrcReg) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8mr_NOREX;
+ else
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8mr_NOREX;
+ else
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::RFP80RegClass) {
+ Opc = X86::ST_FpP80m; // pops
+ } else if (RC == &X86::RFP64RegClass) {
+ Opc = X86::ST_Fp64m;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::ST_Fp32m;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::MOVSSmr;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::MOVSDmr;
+ } else if (RC == &X86::VR128RegClass) {
+ // If stack is realigned we can use aligned stores.
+ Opc = isStackAligned ? X86::MOVAPSmr : X86::MOVUPSmr;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64mr;
+ } else {
+ assert(0 && "Unknown regclass");
+ abort();
+ }
+
+ return Opc;
+}
+
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ const MachineFunction &MF = *MBB.getParent();
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ MIB.addReg(SrcReg, getKillRegState(isKill));
+ NewMIs.push_back(MIB);
+}
+
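+/// getLoadRegOpcode - Return the opcode of the plain load instruction for
+/// the given register class, using an aligned SSE load for VR128 only when
+/// the stack is known to be 16-byte aligned.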
+static unsigned getLoadRegOpcode(unsigned DestReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const TargetMachine &TM) {
+ unsigned Opc = 0;
+ if (RC == &X86::GR64RegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32RegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(DestReg) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rm_NOREX;
+ else
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rm_NOREX;
+ else
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::RFP80RegClass) {
+ Opc = X86::LD_Fp80m;
+ } else if (RC == &X86::RFP64RegClass) {
+ Opc = X86::LD_Fp64m;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::LD_Fp32m;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::MOVSSrm;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::MOVSDrm;
+ } else if (RC == &X86::VR128RegClass) {
+ // If stack is realigned we can use aligned loads.
+ Opc = isStackAligned ? X86::MOVAPSrm : X86::MOVUPSrm;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64rm;
+ } else {
+ assert(0 && "Unknown regclass");
+ abort();
+ }
+
+ return Opc;
+}
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const{
+ const MachineFunction &MF = *MBB.getParent();
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ unsigned SlotSize = is64Bit ? 8 : 4;
+
+ MachineFunction &MF = *MBB.getParent();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ X86FI->setCalleeSavedFrameSize(CSI.size() * SlotSize);
+
+ unsigned Opc = is64Bit ? X86::PUSH64r : X86::PUSH32r;
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ BuildMI(MBB, MI, DL, get(Opc))
+ .addReg(Reg, RegState::Kill);
+ }
+ return true;
+}
+
+bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+ unsigned Opc = is64Bit ? X86::POP64r : X86::POP32r;
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ BuildMI(MBB, MI, DL, get(Opc), Reg);
+ }
+ return true;
+}
+
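+/// FuseTwoAddrInst - Create a new instruction from Opcode with the memory
+/// reference MOs substituted for the tied def/use register pair of the
+/// two-address instruction MI.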
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI,
+ const TargetInstrInfo &TII) {
+ // Create the base instruction with the memory operand as the first part.
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(NewMI);
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+
+ // Loop over the rest of the ri operands, converting them over.
+ unsigned NumOps = MI->getDesc().getNumOperands()-2;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MachineOperand &MO = MI->getOperand(i+2);
+ MIB.addOperand(MO);
+ }
+ for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ MIB.addOperand(MO);
+ }
+ return MIB;
+}
+
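+/// FuseInst - Create a new instruction from Opcode with the memory reference
+/// MOs substituted for register operand OpNo of MI.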
+static MachineInstr *FuseInst(MachineFunction &MF,
+ unsigned Opcode, unsigned OpNo,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI, const TargetInstrInfo &TII) {
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(NewMI);
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (i == OpNo) {
+ assert(MO.isReg() && "Expected to fold into reg operand!");
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+ return MIB;
+}
+
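+/// MakeM0Inst - Build the memory form of a register-zeroing pseudo
+/// (e.g. MOV32r0): a store of immediate zero to the given address.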
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
+
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+ return MIB.addImm(0);
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI, unsigned i,
+ const SmallVectorImpl<MachineOperand> &MOs) const{
+ const DenseMap<unsigned*, unsigned> *OpcodeTablePtr = NULL;
+ bool isTwoAddrFold = false;
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ MachineInstr *NewMI = NULL;
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different from folding it in other places. It requires
+ // replacing the *two* registers with the memory location.
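+ // For example, folding a stack slot into ADD32rr (whose operands 0 and 1
+ // are tied) yields ADD32mr, with the memory reference standing in for both
+ // registers.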
+ if (isTwoAddr && NumOps >= 2 && i < 2 &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() &&
+ MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ isTwoAddrFold = true;
+ } else if (i == 0) { // If operand 0
+ if (MI->getOpcode() == X86::MOV16r0)
+ NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV32r0)
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV64r0)
+ NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV8r0)
+ NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
+ if (NewMI)
+ return NewMI;
+
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (i == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (i == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ }
+
+ // If a folding table was selected, look for the opcode in it.
+ if (OpcodeTablePtr) {
+ // Find the Opcode to fuse
+ DenseMap<unsigned*, unsigned>::iterator I =
+ OpcodeTablePtr->find((unsigned*)MI->getOpcode());
+ if (I != OpcodeTablePtr->end()) {
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, I->second, MOs, MI, *this);
+ else
+ NewMI = FuseInst(MF, I->second, i, MOs, MI, *this);
+ return NewMI;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing)
+ cerr << "We failed to fuse operand " << i << " in " << *MI;
+ return NULL;
+}
+
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+ // FIXME: Move alignment requirement into tables?
+ if (Alignment < 16) {
+ switch (MI->getOpcode()) {
+ default: break;
+ // Not always safe to fold movsd into these instructions since their load
+ // folding variants expect the address to be 16-byte aligned.
+ case X86::FsANDNPDrr:
+ case X86::FsANDNPSrr:
+ case X86::FsANDPDrr:
+ case X86::FsANDPSrr:
+ case X86::FsORPDrr:
+ case X86::FsORPSrr:
+ case X86::FsXORPDrr:
+ case X86::FsXORPSrr:
+ return NULL;
+ }
+ }
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI->getOpcode()) {
+ default: return NULL;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return NULL;
+
+ SmallVector<MachineOperand,4> MOs;
+ MOs.push_back(MachineOperand::CreateFI(FrameIndex));
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs);
+}
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr *LoadMI) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ // Determine the alignment of the load.
+ unsigned Alignment = 0;
+ if (LoadMI->hasOneMemOperand())
+ Alignment = LoadMI->memoperands_begin()->getAlignment();
+
+ // FIXME: Move alignment requirement into tables?
+ if (Alignment < 16) {
+ switch (MI->getOpcode()) {
+ default: break;
+ // Not always safe to fold movsd into these instructions since their load
+ // folding variants expect the address to be 16-byte aligned.
+ case X86::FsANDNPDrr:
+ case X86::FsANDNPSrr:
+ case X86::FsANDPDrr:
+ case X86::FsANDPSrr:
+ case X86::FsORPDrr:
+ case X86::FsORPSrr:
+ case X86::FsXORPDrr:
+ case X86::FsXORPSrr:
+ return NULL;
+ }
+ }
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI->getOpcode()) {
+ default: return NULL;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return NULL;
+
+ SmallVector<MachineOperand,X86AddrNumOperands> MOs;
+ if (LoadMI->getOpcode() == X86::V_SET0 ||
+ LoadMI->getOpcode() == X86::V_SETALLONES) {
+ // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+ // Create a constant-pool entry and operands to load from it.
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (TM.getRelocationModel() == Reloc::PIC_ &&
+ !TM.getSubtarget<X86Subtarget>().is64Bit())
+ // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+ return NULL;
+
+ // Create a v4i32 constant-pool entry.
+ MachineConstantPool &MCP = *MF.getConstantPool();
+ const VectorType *Ty = VectorType::get(Type::Int32Ty, 4);
+ Constant *C = LoadMI->getOpcode() == X86::V_SET0 ?
+ ConstantVector::getNullValue(Ty) :
+ ConstantVector::getAllOnesValue(Ty);
+ unsigned CPI = MCP.getConstantPoolIndex(C, 16);
+
+ // Create operands to load from the constant pool entry.
+ MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+ MOs.push_back(MachineOperand::CreateImm(1));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ } else {
+ // Folding a normal load. Just copy the load's address operands.
+ unsigned NumOps = LoadMI->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86AddrNumOperands; i != NumOps; ++i)
+ MOs.push_back(LoadMI->getOperand(i));
+ }
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs);
+}
+
+
+bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ // Check switch flag
+ if (NoFusing) return false;
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ return true;
+ }
+ }
+
+ if (Ops.size() != 1)
+ return false;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different from folding it in other places. It requires
+ // replacing the *two* registers with the memory location.
+ const DenseMap<unsigned*, unsigned> *OpcodeTablePtr = NULL;
+ if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ } else if (OpNum == 0) { // If operand 0
+ switch (Opc) {
+ case X86::MOV16r0:
+ case X86::MOV32r0:
+ case X86::MOV64r0:
+ case X86::MOV8r0:
+ return true;
+ default: break;
+ }
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (OpNum == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (OpNum == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ }
+
+ if (OpcodeTablePtr) {
+ // Find the Opcode to fuse
+ DenseMap<unsigned*, unsigned>::iterator I =
+ OpcodeTablePtr->find((unsigned*)Opc);
+ if (I != OpcodeTablePtr->end())
+ return true;
+ }
+ return false;
+}
+
+bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::iterator I =
+ MemOp2RegOpTable.find((unsigned*)MI->getOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ DebugLoc dl = MI->getDebugLoc();
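+ // The second half of the table entry packs the fold info: bits 0-3 hold
+ // the operand index of the folded memory reference, bit 4 is set if a
+ // load was folded, and bit 5 if a store was folded.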
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & 0xf;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ if (UnfoldLoad && !FoldedLoad)
+ return false;
+ UnfoldLoad &= FoldedLoad;
+ if (UnfoldStore && !FoldedStore)
+ return false;
+ UnfoldStore &= FoldedStore;
+
+ const TargetInstrDesc &TID = get(Opc);
+ const TargetOperandInfo &TOI = TID.OpInfo[Index];
+ const TargetRegisterClass *RC = TOI.isLookupPtrRegClass()
+ ? RI.getPointerRegClass() : RI.getRegClass(TOI.RegClass);
+ SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
+ SmallVector<MachineOperand,2> BeforeOps;
+ SmallVector<MachineOperand,2> AfterOps;
+ SmallVector<MachineOperand,4> ImpOps;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (i >= Index && i < Index + X86AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (Op.isReg() && Op.isImplicit())
+ ImpOps.push_back(Op);
+ else if (i < Index)
+ BeforeOps.push_back(Op);
+ else if (i > Index)
+ AfterOps.push_back(Op);
+ }
+
+ // Emit the load instruction.
+ if (UnfoldLoad) {
+ loadRegFromAddr(MF, Reg, AddrOps, RC, NewMIs);
+ if (UnfoldStore) {
+ // Address operands cannot be marked isKill.
+ for (unsigned i = 1; i != 1 + X86AddrNumOperands; ++i) {
+ MachineOperand &MO = NewMIs[0]->getOperand(i);
+ if (MO.isReg())
+ MO.setIsKill(false);
+ }
+ }
+ }
+
+ // Emit the data processing instruction.
+ MachineInstr *DataMI = MF.CreateMachineInstr(TID, MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(DataMI);
+
+ if (FoldedStore)
+ MIB.addReg(Reg, RegState::Define);
+ for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
+ MIB.addOperand(BeforeOps[i]);
+ if (FoldedLoad)
+ MIB.addReg(Reg);
+ for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
+ MIB.addOperand(AfterOps[i]);
+ for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
+ MachineOperand &MO = ImpOps[i];
+ MIB.addReg(MO.getReg(),
+ getDefRegState(MO.isDef()) |
+ RegState::Implicit |
+ getKillRegState(MO.isKill()) |
+ getDeadRegState(MO.isDead()));
+ }
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ unsigned NewOpc = 0;
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP32ri:
+ case X86::CMP16ri:
+ case X86::CMP8ri: {
+ MachineOperand &MO0 = DataMI->getOperand(0);
+ MachineOperand &MO1 = DataMI->getOperand(1);
+ if (MO1.getImm() == 0) {
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+ case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
+ case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
+ case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
+ }
+ DataMI->setDesc(get(NewOpc));
+ MO1.ChangeToRegister(MO0.getReg(), false);
+ }
+ }
+ }
+ NewMIs.push_back(DataMI);
+
+ // Emit the store instruction.
+ if (UnfoldStore) {
+ const TargetOperandInfo &DstTOI = TID.OpInfo[0];
+ const TargetRegisterClass *DstRC = DstTOI.isLookupPtrRegClass()
+ ? RI.getPointerRegClass() : RI.getRegClass(DstTOI.RegClass);
+ storeRegToAddr(MF, Reg, true, AddrOps, DstRC, NewMIs);
+ }
+
+ return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const {
+ if (!N->isMachineOpcode())
+ return false;
+
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::iterator I =
+ MemOp2RegOpTable.find((unsigned*)N->getMachineOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & 0xf;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ const TargetInstrDesc &TID = get(Opc);
+ const TargetOperandInfo &TOI = TID.OpInfo[Index];
+ const TargetRegisterClass *RC = TOI.isLookupPtrRegClass()
+ ? RI.getPointerRegClass() : RI.getRegClass(TOI.RegClass);
+ unsigned NumDefs = TID.NumDefs;
+ std::vector<SDValue> AddrOps;
+ std::vector<SDValue> BeforeOps;
+ std::vector<SDValue> AfterOps;
+ DebugLoc dl = N->getDebugLoc();
+ unsigned NumOps = N->getNumOperands();
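+ // SDNode operands do not include the results, so the MI operand index from
+ // the table is shifted down by the number of defs; the last operand is the
+ // chain.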
+ for (unsigned i = 0; i != NumOps-1; ++i) {
+ SDValue Op = N->getOperand(i);
+ if (i >= Index-NumDefs && i < Index-NumDefs + X86AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (i < Index-NumDefs)
+ BeforeOps.push_back(Op);
+ else if (i > Index-NumDefs)
+ AfterOps.push_back(Op);
+ }
+ SDValue Chain = N->getOperand(NumOps-1);
+ AddrOps.push_back(Chain);
+
+ // Emit the load instruction.
+ SDNode *Load = 0;
+ const MachineFunction &MF = DAG.getMachineFunction();
+ if (FoldedLoad) {
+ MVT VT = *RC->vt_begin();
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ Load = DAG.getTargetNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+ VT, MVT::Other, &AddrOps[0], AddrOps.size());
+ NewNodes.push_back(Load);
+ }
+
+ // Emit the data processing instruction.
+ std::vector<MVT> VTs;
+ const TargetRegisterClass *DstRC = 0;
+ if (TID.getNumDefs() > 0) {
+ const TargetOperandInfo &DstTOI = TID.OpInfo[0];
+ DstRC = DstTOI.isLookupPtrRegClass()
+ ? RI.getPointerRegClass() : RI.getRegClass(DstTOI.RegClass);
+ VTs.push_back(*DstRC->vt_begin());
+ }
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getValueType(i);
+ if (VT != MVT::Other && i >= (unsigned)TID.getNumDefs())
+ VTs.push_back(VT);
+ }
+ if (Load)
+ BeforeOps.push_back(SDValue(Load, 0));
+ std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+  SDNode *NewNode = DAG.getTargetNode(Opc, dl, VTs, &BeforeOps[0],
+                                      BeforeOps.size());
+ NewNodes.push_back(NewNode);
+
+ // Emit the store instruction.
+ if (FoldedStore) {
+ AddrOps.pop_back();
+ AddrOps.push_back(SDValue(NewNode, 0));
+ AddrOps.push_back(Chain);
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ SDNode *Store = DAG.getTargetNode(getStoreRegOpcode(0, DstRC,
+ isAligned, TM),
+ dl, MVT::Other,
+ &AddrOps[0], AddrOps.size());
+ NewNodes.push_back(Store);
+ }
+
+ return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore) const {
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::iterator I =
+ MemOp2RegOpTable.find((unsigned*)Opc);
+ if (I == MemOp2RegOpTable.end())
+ return 0;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ if (UnfoldLoad && !FoldedLoad)
+ return 0;
+ if (UnfoldStore && !FoldedStore)
+ return 0;
+ return I->second.first;
+}
+
+bool X86InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case X86::TCRETURNri:
+ case X86::TCRETURNdi:
+ case X86::RET: // Return.
+ case X86::RETI:
+ case X86::TAILJMPd:
+ case X86::TAILJMPr:
+ case X86::TAILJMPm:
+ case X86::JMP: // Uncond branch.
+ case X86::JMP32r: // Indirect branch.
+ case X86::JMP64r: // Indirect branch (64-bit).
+ case X86::JMP32m: // Indirect branch through mem.
+ case X86::JMP64m: // Indirect branch through mem (64-bit).
+ return true;
+ default: return false;
+ }
+}
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+ X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+ if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+ return true;
+ Cond[0].setImm(GetOppositeBranchCondition(CC));
+ return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+ // FIXME: Return false for x87 stack register classes for now. We can't
+ // allow any loads of these registers before FpGet_ST0_80.
+ return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+ RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+unsigned X86InstrInfo::sizeOfImm(const TargetInstrDesc *Desc) {
+ switch (Desc->TSFlags & X86II::ImmMask) {
+ case X86II::Imm8: return 1;
+ case X86II::Imm16: return 2;
+ case X86II::Imm32: return 4;
+ case X86II::Imm64: return 8;
+ default: assert(0 && "Immediate size not set!");
+ return 0;
+ }
+}
+
+/// isX86_64ExtendedReg - Is the MachineOperand an x86-64 extended register?
+/// e.g. r8, xmm8, etc.
+bool X86InstrInfo::isX86_64ExtendedReg(const MachineOperand &MO) {
+ if (!MO.isReg()) return false;
+ switch (MO.getReg()) {
+ default: break;
+ case X86::R8: case X86::R9: case X86::R10: case X86::R11:
+ case X86::R12: case X86::R13: case X86::R14: case X86::R15:
+ case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D:
+ case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
+ case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W:
+ case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
+ case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
+ case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
+ case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+ case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+ return true;
+ }
+ return false;
+}
+
+
+/// determineREX - Determine whether the MachineInstr has to be encoded with
+/// an X86-64 REX prefix, which specifies 1) 64-bit instructions, 2)
+/// non-default operand size, and 3) use of X86-64 extended registers.
+unsigned X86InstrInfo::determineREX(const MachineInstr &MI) {
+ unsigned REX = 0;
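+  // REX bit layout: bit 0 is REX.B, bit 1 REX.X, bit 2 REX.R, bit 3 REX.W.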
+ const TargetInstrDesc &Desc = MI.getDesc();
+
+  // Pseudo instructions do not need a REX prefix byte.
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return 0;
+ if (Desc.TSFlags & X86II::REX_W)
+ REX |= 1 << 3;
+
+ unsigned NumOps = Desc.getNumOperands();
+ if (NumOps) {
+ bool isTwoAddr = NumOps > 1 &&
+ Desc.getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ unsigned i = isTwoAddr ? 1 : 0;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (isX86_64NonExtLowByteReg(Reg))
+ REX |= 0x40;
+ }
+ }
+
+ switch (Desc.TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= (1 << 0) | (1 << 2);
+ break;
+ case X86II::MRMSrcReg: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << 0;
+ }
+ break;
+ }
+ case X86II::MRMSrcMem: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRMDestMem: {
+ unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands);
+ i = isTwoAddr ? 1 : 0;
+ if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ for (; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ default: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 0;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << 2;
+ }
+ break;
+ }
+ }
+ }
+ return REX;
+}
+
+/// sizePCRelativeBlockAddress - This method returns the size of a
+/// PC-relative block address instruction.
+///
+static unsigned sizePCRelativeBlockAddress() {
+ return 4;
+}
+
+/// sizeGlobalAddress - Return the size of the emitted global address.
+///
+static unsigned sizeGlobalAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeConstPoolAddress - Return the size of the emitted constant pool
+/// address.
+///
+static unsigned sizeConstPoolAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeExternalSymbolAddress - Return the size of the emitted external
+/// symbol.
+///
+static unsigned sizeExternalSymbolAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeJumpTableAddress - Return the size of the emitted jump table
+/// address.
+///
+static unsigned sizeJumpTableAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+static unsigned sizeConstant(unsigned Size) {
+ return Size;
+}
+
+static unsigned sizeRegModRMByte() {
+ return 1;
+}
+
+static unsigned sizeSIBByte() {
+ return 1;
+}
+
+static unsigned getDisplacementFieldSize(const MachineOperand *RelocOp) {
+ unsigned FinalSize = 0;
+  // A simple integer displacement that doesn't require a relocation is
+  // counted pessimistically as a full disp32.
+ if (!RelocOp) {
+ FinalSize += sizeConstant(4);
+ return FinalSize;
+ }
+
+ // Otherwise, this is something that requires a relocation.
+ if (RelocOp->isGlobal()) {
+ FinalSize += sizeGlobalAddress(false);
+ } else if (RelocOp->isCPI()) {
+ FinalSize += sizeConstPoolAddress(false);
+ } else if (RelocOp->isJTI()) {
+ FinalSize += sizeJumpTableAddress(false);
+ } else {
+ assert(0 && "Unknown value to relocate!");
+ }
+ return FinalSize;
+}
+
+static unsigned getMemModRMByteSize(const MachineInstr &MI, unsigned Op,
+ bool IsPIC, bool Is64BitMode) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+ unsigned FinalSize = 0;
+
+ // Figure out what sort of displacement we have to handle here.
+ if (Op3.isGlobal()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isCPI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal = 1;
+ }
+ } else if (Op3.isJTI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal = 1;
+ }
+ } else {
+ DispVal = 1;
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Is a SIB byte needed?
+ if ((!Is64BitMode || DispForReloc || BaseReg != 0) &&
+ IndexReg.getReg() == 0 &&
+ (BaseReg == 0 || X86RegisterInfo::getX86RegNum(BaseReg) != N86::ESP)) {
+ if (BaseReg == 0) { // Just a displacement?
+ // Emit special case [disp32] encoding
+ ++FinalSize;
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ } else {
+ unsigned BaseRegNo = X86RegisterInfo::getX86RegNum(BaseReg);
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+        // Emit simple indirect register encoding, e.g. [EAX].
+ ++FinalSize;
+ // Be pessimistic and assume it's a disp32, not a disp8
+ } else {
+ // Emit the most general non-SIB encoding: [REG+disp32]
+ ++FinalSize;
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ }
+ }
+
+ } else { // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ if (BaseReg == 0 || DispForReloc) {
+ // Emit the normal disp32 encoding.
+ ++FinalSize;
+ ForceDisp32 = true;
+ } else {
+ ++FinalSize;
+ }
+
+ FinalSize += sizeSIBByte();
+
+ // Do we need to output a displacement?
+ if (DispVal != 0 || ForceDisp32) {
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ }
+ }
+ return FinalSize;
+}
+
+
+static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
+ const TargetInstrDesc *Desc,
+ bool IsPIC, bool Is64BitMode) {
+
+ unsigned Opcode = Desc->Opcode;
+ unsigned FinalSize = 0;
+
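+  // This mirrors the machine code emitter's encoding logic, but only counts
+  // bytes instead of emitting them.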
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK) ++FinalSize;
+
+ // Emit segment override opcode prefix as needed.
+ switch (Desc->TSFlags & X86II::SegOvrMask) {
+ case X86II::FS:
+ case X86II::GS:
+ ++FinalSize;
+ break;
+ default: assert(0 && "Invalid segment!");
+ case 0: break; // No segment override!
+ }
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) ++FinalSize;
+
+ // Emit the operand size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::OpSize) ++FinalSize;
+
+ // Emit the address size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::AdSize) ++FinalSize;
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::XS: // F3 0F
+ ++FinalSize;
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ ++FinalSize;
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ ++FinalSize;
+ break; // Two-byte opcode prefix
+ default: assert(0 && "Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ if (Is64BitMode) {
+ // REX prefix
+ unsigned REX = X86InstrInfo::determineREX(MI);
+ if (REX)
+ ++FinalSize;
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ ++FinalSize;
+
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::T8: // 0F 38
+ ++FinalSize;
+ break;
+ case X86II::TA: // 0F 3A
+ ++FinalSize;
+ break;
+ }
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+ CurOp++;
+ else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+ // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+ --NumOps;
+
+ switch (Desc->TSFlags & X86II::FormMask) {
+ default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+ case X86II::Pseudo:
+ // Remember the current PC offset, this is the PIC relocation
+ // base address.
+ switch (Opcode) {
+ default:
+ break;
+ case TargetInstrInfo::INLINEASM: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
+ const TargetAsmInfo* AI = MF->getTarget().getTargetAsmInfo();
+ FinalSize += AI->getInlineAsmLength(AsmStr);
+ break;
+ }
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ break;
+ case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::DECLARE:
+ case X86::DWARF_LOC:
+ case X86::FP_REG_KILL:
+ break;
+ case X86::MOVPC32r: {
+ // This emits the "call" portion of this pseudo instruction.
+ ++FinalSize;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ break;
+ }
+ }
+ CurOp = NumOps;
+ break;
+ case X86II::RawFrm:
+ ++FinalSize;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ if (MO.isMBB()) {
+ FinalSize += sizePCRelativeBlockAddress();
+ } else if (MO.isGlobal()) {
+ FinalSize += sizeGlobalAddress(false);
+ } else if (MO.isSymbol()) {
+ FinalSize += sizeExternalSymbolAddress(false);
+ } else if (MO.isImm()) {
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ } else {
+ assert(0 && "Unknown RawFrm operand!");
+ }
+ }
+ break;
+
+ case X86II::AddRegFrm:
+ ++FinalSize;
+ ++CurOp;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64ri)
+ dword = true;
+ if (MO1.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO1.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO1.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO1.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+
+ case X86II::MRMDestReg: {
+ ++FinalSize;
+ FinalSize += sizeRegModRMByte();
+ CurOp += 2;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ }
+ break;
+ }
+ case X86II::MRMDestMem: {
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
+ CurOp += X86AddrNumOperands + 1;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ }
+ break;
+ }
+
+ case X86II::MRMSrcReg:
+ ++FinalSize;
+ FinalSize += sizeRegModRMByte();
+ CurOp += 2;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ }
+ break;
+
+ case X86II::MRMSrcMem: {
+ int AddrOperands;
+ if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ AddrOperands = X86AddrNumOperands - 1; // No segment register
+ else
+ AddrOperands = X86AddrNumOperands;
+
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp+1, IsPIC, Is64BitMode);
+ CurOp += AddrOperands + 1;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ }
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ ++FinalSize;
+ if (Desc->getOpcode() == X86::LFENCE ||
+ Desc->getOpcode() == X86::MFENCE) {
+      // Special handling of lfence and mfence.
+ FinalSize += sizeRegModRMByte();
+ } else if (Desc->getOpcode() == X86::MONITOR ||
+ Desc->getOpcode() == X86::MWAIT) {
+ // Special handling of monitor and mwait.
+ FinalSize += sizeRegModRMByte() + 1; // +1 for the opcode.
+ } else {
+ ++CurOp;
+ FinalSize += sizeRegModRMByte();
+ }
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64ri32)
+ dword = true;
+ if (MO1.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO1.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO1.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO1.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
+ CurOp += X86AddrNumOperands;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64mi32)
+ dword = true;
+ if (MO.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+ }
+
+ case X86II::MRMInitReg:
+ ++FinalSize;
+ // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+ FinalSize += sizeRegModRMByte();
+ ++CurOp;
+ break;
+ }
+
+ if (!Desc->isVariadic() && CurOp != NumOps) {
+ cerr << "Cannot determine size: ";
+ MI.dump();
+ cerr << '\n';
+ abort();
+ }
+
+
+ return FinalSize;
+}
+
+
+unsigned X86InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ const TargetInstrDesc &Desc = MI->getDesc();
+ bool IsPIC = (TM.getRelocationModel() == Reloc::PIC_);
+ bool Is64BitMode = TM.getSubtargetImpl()->is64Bit();
+ unsigned Size = GetInstSizeWithDesc(*MI, &Desc, IsPIC, Is64BitMode);
+ if (Desc.getOpcode() == X86::MOVPC32r) {
+ Size += GetInstSizeWithDesc(*MI, &get(X86::POP32r), IsPIC, Is64BitMode);
+ }
+ return Size;
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the global
+/// base register value. Output instructions required to initialize the
+/// register in the function entry block, if necessary.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ "X86-64 PIC uses RIP relative addressing");
+
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+ // Insert the set of GlobalBaseReg into the first MBB of the function
+ MachineBasicBlock &FirstMBB = MF->front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MBBI != FirstMBB.end()) DL = MBBI->getDebugLoc();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ unsigned PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+  // The operand of MovePCtoStack is completely ignored by the asm printer.
+  // It's only used in JIT code emission as a displacement to the pc.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC)
+ .addImm(0);
+
+  // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+  // not to pc, but to the _GLOBAL_OFFSET_TABLE_ external.
+ if (TM.getRelocationModel() == Reloc::PIC_ &&
+ TM.getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+ GlobalBaseReg =
+ RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+ } else {
+ GlobalBaseReg = PC;
+ }
+
+ X86FI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..e09769e
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,461 @@
+//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+ class X86RegisterInfo;
+ class X86TargetMachine;
+
+namespace X86 {
+  // X86 specific condition codes. These correspond to X86_*_COND in
+  // X86InstrInfo.td. They must be kept in sync.
+ enum CondCode {
+ COND_A = 0,
+ COND_AE = 1,
+ COND_B = 2,
+ COND_BE = 3,
+ COND_E = 4,
+ COND_G = 5,
+ COND_GE = 6,
+ COND_L = 7,
+ COND_LE = 8,
+ COND_NE = 9,
+ COND_NO = 10,
+ COND_NP = 11,
+ COND_NS = 12,
+ COND_O = 13,
+ COND_P = 14,
+ COND_S = 15,
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches to
+ // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs.
+ COND_NE_OR_P,
+ COND_NP_OR_E,
+
+ COND_INVALID
+ };
+
+ // Turn condition code into conditional branch opcode.
+ unsigned GetCondBranchFromCond(CondCode CC);
+
+ /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+ /// e.g. turning COND_E to COND_NE.
+ CondCode GetOppositeBranchCondition(X86::CondCode CC);
+
+}
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction types. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+ // PseudoFrm - This represents an instruction that is a pseudo instruction
+  // or one that has not been implemented yet. It is illegal to code-generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// Raw - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 3,
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 4,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 5,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 6,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+  /// information. In the Intel manual these are represented as /0, /1, ...
+ ///
+
+ // First, instructions that operate on a register r/m operand...
+ MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
+ MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
+ MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
+
+ // MRMInitReg - This form is used for instructions whose source and
+ // destinations are the same register.
+ MRMInitReg = 32,
+
+ FormMask = 63,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - Set if this instruction requires an operand size prefix (0x66),
+ // which most often indicates that the instruction operates on 16 bit data
+ // instead of 32 bit data.
+ OpSize = 1 << 6,
+
+    // AdSize - Set if this instruction requires an address size prefix (0x67),
+    // which most often indicates that the instruction uses 16 bit addresses
+    // instead of 32 bit addresses (or 32 bit addresses in 64 bit mode).
+ AdSize = 1 << 7,
+
+ //===------------------------------------------------------------------===//
+ // Op0Mask - There are several prefix bytes that are used to form two byte
+ // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
+    // used to obtain the setting of this field. If no bits in this field are
+ // set, there is no prefix byte for obtaining a multibyte opcode.
+ //
+ Op0Shift = 8,
+ Op0Mask = 0xF << Op0Shift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << Op0Shift,
+
+ // REP - The 0xF3 prefix byte indicating repetition of the following
+ // instruction.
+ REP = 2 << Op0Shift,
+
+ // D8-DF - These escape opcodes are used by the floating point unit. These
+ // values must remain sequential.
+ D8 = 3 << Op0Shift, D9 = 4 << Op0Shift,
+ DA = 5 << Op0Shift, DB = 6 << Op0Shift,
+ DC = 7 << Op0Shift, DD = 8 << Op0Shift,
+ DE = 9 << Op0Shift, DF = 10 << Op0Shift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XD = 11 << Op0Shift, XS = 12 << Op0Shift,
+
+ // T8, TA - Prefix after the 0x0F prefix.
+ T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only care about the REX.W and REX.R bits, and only the former
+    // is statically determined.
+ //
+ REXShift = 12,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+ // This three-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = 13,
+ ImmMask = 7 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm16 = 2 << ImmShift,
+ Imm32 = 3 << ImmShift,
+ Imm64 = 4 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = 16,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+ // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+ // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+ // result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Lock prefix
+ LOCKShift = 19,
+ LOCK = 1 << LOCKShift,
+
+    // Segment override prefixes. Currently we just need the ability to
+    // address data in the gs and fs segments.
+ SegOvrShift = 20,
+ SegOvrMask = 3 << SegOvrShift,
+ FS = 1 << SegOvrShift,
+ GS = 2 << SegOvrShift,
+
+ // Bits 22 -> 23 are unused
+ OpcodeShift = 24,
+ OpcodeMask = 0xFF << OpcodeShift
+ };
+}
+
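+// The five operands of an X86 memory reference: base register, scale,
+// index register, displacement, and segment register.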
+const int X86AddrNumOperands = 5;
+
+inline static bool isScale(const MachineOperand &MO) {
+ return MO.isImm() &&
+ (MO.getImm() == 1 || MO.getImm() == 2 ||
+ MO.getImm() == 4 || MO.getImm() == 8);
+}
+
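+// isLeaMem matches the four-operand lea address form (base, scale, index,
+// displacement); isMem additionally requires the trailing segment register
+// operand.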
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+4 <= MI->getNumOperands() &&
+ MI->getOperand(Op ).isReg() && isScale(MI->getOperand(Op+1)) &&
+ MI->getOperand(Op+2).isReg() &&
+ (MI->getOperand(Op+3).isImm() ||
+ MI->getOperand(Op+3).isGlobal() ||
+ MI->getOperand(Op+3).isCPI() ||
+ MI->getOperand(Op+3).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+5 <= MI->getNumOperands() &&
+ MI->getOperand(Op+4).isReg() &&
+ isLeaMem(MI, Op);
+}
+
+class X86InstrInfo : public TargetInstrInfoImpl {
+ X86TargetMachine &TM;
+ const X86RegisterInfo RI;
+
+ /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+ /// RegOp2MemOpTable2 - Load / store folding opcode maps.
+ ///
+ DenseMap<unsigned*, unsigned> RegOp2MemOpTable2Addr;
+ DenseMap<unsigned*, unsigned> RegOp2MemOpTable0;
+ DenseMap<unsigned*, unsigned> RegOp2MemOpTable1;
+ DenseMap<unsigned*, unsigned> RegOp2MemOpTable2;
+
+ /// MemOp2RegOpTable - Load / store unfolding opcode map.
+ ///
+ DenseMap<unsigned*, std::pair<unsigned, unsigned> > MemOp2RegOpTable;
+
+public:
+ explicit X86InstrInfo(X86TargetMachine &tm);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// Return true if the instruction is a register-to-register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr *MI) const;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, const MachineInstr *Orig) const;
+
+ bool isInvariantLoad(const MachineInstr *MI) const;
+
+ /// convertToThreeAddress - This method must be implemented by targets that
+ /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+ /// may be able to convert a two-address instruction into a true
+ /// three-address instruction on demand. This allows the X86 target (for
+ /// example) to convert ADD and SHL instructions into LEA instructions if they
+ /// would require register copies due to two-addressness.
+ ///
+ /// This method returns a null pointer if the transformation cannot be
+ /// performed, otherwise it returns the new instruction.
+ ///
+ virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ /// commuteInstruction - We have a few instructions that must be hacked on to
+ /// commute them.
+ ///
+ virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+
+ // Branch analysis.
+ virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ /// foldMemoryOperand - If this target supports it, fold a load or store of
+ /// the specified stack slot into the specified machine instruction for the
+ /// specified operand(s). If this is possible, the target should perform the
+ /// folding and return true, otherwise it should return false. If it folds
+ /// the instruction, it is likely that the MachineInstruction the iterator
+ /// references has been changed.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ /// foldMemoryOperand - Same as the previous version except it allows folding
+ /// of any load and store from / to any address, not just from a specific
+ /// stack slot.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const;
+
+  /// canFoldMemoryOperand - Returns true if the specified load / store
+  /// folding is possible.
+ virtual bool canFoldMemoryOperand(const MachineInstr*,
+ const SmallVectorImpl<unsigned> &) const;
+
+  /// unfoldMemoryOperand - Separate a single instruction which folded a load
+  /// or a store or a load and a store into two or more instructions. If this
+  /// is possible, returns true as well as the new instructions by reference.
+ virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const;
+
+  /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would-be new
+  /// instruction after the load / store is unfolded from an instruction of the
+ /// specified opcode. It returns zero if the specified unfolding is not
+ /// possible.
+ virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore) const;
+
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+ /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+ /// instruction that defines the specified register class.
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+
+ // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+ // specified machine instruction.
+ //
+ unsigned char getBaseOpcodeFor(const TargetInstrDesc *TID) const {
+ return TID->TSFlags >> X86II::OpcodeShift;
+ }
+ unsigned char getBaseOpcodeFor(unsigned Opcode) const {
+ return getBaseOpcodeFor(&get(Opcode));
+ }
+
+ static bool isX86_64NonExtLowByteReg(unsigned reg) {
+ return (reg == X86::SPL || reg == X86::BPL ||
+ reg == X86::SIL || reg == X86::DIL);
+ }
+
+ static unsigned sizeOfImm(const TargetInstrDesc *Desc);
+ static bool isX86_64ExtendedReg(const MachineOperand &MO);
+ static unsigned determineREX(const MachineInstr &MI);
+
+ /// GetInstSize - Returns the size of the specified MachineInstr.
+ ///
+ virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  /// getGlobalBaseReg - Return a virtual register initialized with the global
+  /// base register value. Output instructions required to initialize the
+  /// register in the function entry block, if necessary.
+ ///
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+private:
+ MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ unsigned OpNum,
+ const SmallVectorImpl<MachineOperand> &MOs) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..50ae417
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,3961 @@
+//===- X86InstrInfo.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDTX86Cmov : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<1, 1,
+ [SDTCisInt<0>]>;
+def SDTBinaryArithWithFlags : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<0>]>;
+def SDTX86BrCond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+ SDTCisVT<2, i8>]>;
+def SDTX86cas8 : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
+def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_X86CallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86RdTsc : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86SegmentBaseAddress : SDTypeProfile<1, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def X86bsf : SDNode<"X86ISD::BSF", SDTIntUnaryOp>;
+def X86bsr : SDNode<"X86ISD::BSR", SDTIntUnaryOp>;
+def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
+def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+
+def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
+
+def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
+
+def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
+def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
+ [SDNPHasChain]>;
+def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def X86callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def X86callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86tailcall: SDNode<"X86ISD::TAILCALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+
+def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG",SDTX86RdTsc,
+ [SDNPHasChain, SDNPOutFlag, SDNPSideEffect]>;
+
+def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def X86SegmentBaseAddress : SDNode<"X86ISD::SegmentBaseAddress",
+ SDT_X86SegmentBaseAddress, []>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+ [SDNPHasChain]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags>;
+def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags>;
+def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>;
+def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
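+// Each memory operand is (base register, scale, index register,
+// displacement, segment register).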
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+}
+
+def i8mem : X86MemOperand<"printi8mem">;
+def i16mem : X86MemOperand<"printi16mem">;
+def i32mem : X86MemOperand<"printi32mem">;
+def i64mem : X86MemOperand<"printi64mem">;
+def i128mem : X86MemOperand<"printi128mem">;
+def f32mem : X86MemOperand<"printf32mem">;
+def f64mem : X86MemOperand<"printf64mem">;
+def f80mem : X86MemOperand<"printf80mem">;
+def f128mem : X86MemOperand<"printf128mem">;
+
+// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+ let PrintMethod = "printi8mem";
+ let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX, i32imm, i8imm);
+}
+
+def lea32mem : Operand<i32> {
+ let PrintMethod = "printlea32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
+
+def SSECC : Operand<i8> {
+ let PrintMethod = "printSSECC";
+}
+
+def piclabel: Operand<i32> {
+ let PrintMethod = "printPICLabel";
+}
+
+// A couple of more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm : Operand<i16>;
+// 32-bits but only 8 bits are significant.
+def i32i8imm : Operand<i32>;
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], []>;
+def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
+ [add, sub, mul, shl, or, frameindex], []>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasMMX : Predicate<"Subtarget->hasMMX()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
+def In32BitMode : Predicate<"!Subtarget->is64Bit()">;
+def In64BitMode : Predicate<"Subtarget->is64Bit()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
+def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def OptForSpeed : Predicate<"!OptForSize">;
+def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+// X86 specific condition codes. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in sync.
+def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
+def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
+def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
+def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
+def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
+def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
+def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS : PatLeaf<(i8 12)>;
+def X86_COND_O : PatLeaf<(i8 13)>;
+def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S : PatLeaf<(i8 15)>;
+
+def i16immSExt8 : PatLeaf<(i16 imm), [{
+  // i16immSExt8 predicate - True if the 16-bit immediate fits in an 8-bit
+  // sign extended field.
+ return (int16_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
+
+def i32immSExt8 : PatLeaf<(i32 imm), [{
+  // i32immSExt8 predicate - True if the 32-bit immediate fits in an 8-bit
+  // sign extended field.
+ return (int32_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
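+// Address spaces above 255 denote an x86 segment override (gs is 256, fs is
+// 257; see gsload and fsload below), so these fragments reject such loads.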
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return false;
+}]>;
+
+def nvloadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ if (LD->isVolatile())
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 4;
+ return false;
+}]>;
+
+def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 256;
+ return false;
+}]>;
+
+def fsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 257;
+ return false;
+}]>;
+
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// A 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse();
+}]>;
+
+// 'shld' and 'shrd' instruction patterns. Note that even though these have
+// the srl and shl in their patterns, the C++ code must still check for them,
+// because predicates are tested before child nodes are explored.
+
+def shrd : PatFrag<(ops node:$src1, node:$amt1, node:$src2, node:$amt2),
+ (or (srl node:$src1, node:$amt1),
+ (shl node:$src2, node:$amt2)), [{
+ assert(N->getOpcode() == ISD::OR);
+ return N->getOperand(0).getOpcode() == ISD::SRL &&
+ N->getOperand(1).getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(N->getOperand(0).getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(1).getOperand(1)) &&
+ N->getOperand(0).getConstantOperandVal(1) ==
+ N->getValueSizeInBits(0) - N->getOperand(1).getConstantOperandVal(1);
+}]>;
+
+def shld : PatFrag<(ops node:$src1, node:$amt1, node:$src2, node:$amt2),
+ (or (shl node:$src1, node:$amt1),
+ (srl node:$src2, node:$amt2)), [{
+ assert(N->getOpcode() == ISD::OR);
+ return N->getOperand(0).getOpcode() == ISD::SHL &&
+ N->getOperand(1).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(N->getOperand(0).getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(1).getOperand(1)) &&
+ N->getOperand(0).getConstantOperandVal(1) ==
+ N->getValueSizeInBits(0) - N->getOperand(1).getConstantOperandVal(1);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(X86callseq_start timm:$amt)]>,
+ Requires<[In32BitMode]>;
+def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[In32BitMode]>;
+}
+
+// Nop
+let neverHasSideEffects = 1 in
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
+
+// PIC base
+let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+ def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins piclabel:$label),
+ "call\t$label\n\t"
+ "pop{l}\t$reg", []>;
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions...
+//
+
+// Return instructions.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, FPForm = SpecialFP, FPFormBits = SpecialFP.Value in {
+ def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret",
+ [(X86retflag 0)]>;
+ def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret\t$amt",
+ [(X86retflag imm:$amt)]>;
+}
+
+// All branches are RawFrm, Void, Branch, and Terminators
+let isBranch = 1, isTerminator = 1 in
+ class IBr<bits<8> opcode, dag ins, string asm, list<dag> pattern> :
+ I<opcode, RawFrm, (outs), ins, asm, pattern>;
+
+let isBranch = 1, isBarrier = 1 in
+ def JMP : IBr<0xE9, (ins brtarget:$dst), "jmp\t$dst", [(br bb:$dst)]>;
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+ [(brind GR32:$dst)]>;
+ def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+ [(brind (loadi32 addr:$dst))]>;
+}
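+// JMP32m is what a switch jump table typically lowers to, e.g.
+//   jmpl *.LJTI0_0(,%eax,4)
+// which loads the branch target from a table indexed by the case value.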
+
+// Conditional branches
+let Uses = [EFLAGS] in {
+def JE : IBr<0x84, (ins brtarget:$dst), "je\t$dst",
+ [(X86brcond bb:$dst, X86_COND_E, EFLAGS)]>, TB;
+def JNE : IBr<0x85, (ins brtarget:$dst), "jne\t$dst",
+ [(X86brcond bb:$dst, X86_COND_NE, EFLAGS)]>, TB;
+def JL : IBr<0x8C, (ins brtarget:$dst), "jl\t$dst",
+ [(X86brcond bb:$dst, X86_COND_L, EFLAGS)]>, TB;
+def JLE : IBr<0x8E, (ins brtarget:$dst), "jle\t$dst",
+ [(X86brcond bb:$dst, X86_COND_LE, EFLAGS)]>, TB;
+def JG : IBr<0x8F, (ins brtarget:$dst), "jg\t$dst",
+ [(X86brcond bb:$dst, X86_COND_G, EFLAGS)]>, TB;
+def JGE : IBr<0x8D, (ins brtarget:$dst), "jge\t$dst",
+ [(X86brcond bb:$dst, X86_COND_GE, EFLAGS)]>, TB;
+
+def JB : IBr<0x82, (ins brtarget:$dst), "jb\t$dst",
+ [(X86brcond bb:$dst, X86_COND_B, EFLAGS)]>, TB;
+def JBE : IBr<0x86, (ins brtarget:$dst), "jbe\t$dst",
+ [(X86brcond bb:$dst, X86_COND_BE, EFLAGS)]>, TB;
+def JA : IBr<0x87, (ins brtarget:$dst), "ja\t$dst",
+ [(X86brcond bb:$dst, X86_COND_A, EFLAGS)]>, TB;
+def JAE : IBr<0x83, (ins brtarget:$dst), "jae\t$dst",
+ [(X86brcond bb:$dst, X86_COND_AE, EFLAGS)]>, TB;
+
+def JS : IBr<0x88, (ins brtarget:$dst), "js\t$dst",
+ [(X86brcond bb:$dst, X86_COND_S, EFLAGS)]>, TB;
+def JNS : IBr<0x89, (ins brtarget:$dst), "jns\t$dst",
+ [(X86brcond bb:$dst, X86_COND_NS, EFLAGS)]>, TB;
+def JP : IBr<0x8A, (ins brtarget:$dst), "jp\t$dst",
+ [(X86brcond bb:$dst, X86_COND_P, EFLAGS)]>, TB;
+def JNP : IBr<0x8B, (ins brtarget:$dst), "jnp\t$dst",
+ [(X86brcond bb:$dst, X86_COND_NP, EFLAGS)]>, TB;
+def JO : IBr<0x80, (ins brtarget:$dst), "jo\t$dst",
+ [(X86brcond bb:$dst, X86_COND_O, EFLAGS)]>, TB;
+def JNO : IBr<0x81, (ins brtarget:$dst), "jno\t$dst",
+ [(X86brcond bb:$dst, X86_COND_NO, EFLAGS)]>, TB;
+} // Uses = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. ESP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP] in {
+ def CALLpcrel32 : Ii32<0xE8, RawFrm, (outs), (ins i32imm:$dst,variable_ops),
+ "call\t${dst:call}", []>;
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call GR32:$dst)]>;
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call (loadi32 addr:$dst))]>;
+ }
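+// Sketch of the effect of the clobber list: after a call, only
+// EBX/ESI/EDI/EBP (and ESP) are assumed to hold their values, so the
+// register allocator must spill or avoid the listed registers across calls.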
+
+// Tail call instructions.
+
+def TAILCALL : I<0, Pseudo, (outs), (ins),
+ "#TAILCALL",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNdi : I<0, Pseudo, (outs), (ins i32imm:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNri : I<0, Pseudo, (outs), (ins GR32:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPd : IBr<0xE9, (ins i32imm:$dst), "jmp\t${dst:call} # TAILCALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst # TAILCALL",
+ []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem:$dst),
+ "jmp\t{*}$dst # TAILCALL", []>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+def LEAVE : I<0xC9, RawFrm,
+ (outs), (ins), "leave", []>;
+
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let mayLoad = 1 in
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+
+let mayStore = 1 in
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
+}
+
+let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, neverHasSideEffects=1 in
+def POPFD : I<0x9D, RawFrm, (outs), (ins), "popf", []>;
+let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
+def PUSHFD : I<0x9C, RawFrm, (outs), (ins), "pushf", []>;
+
+let isTwoAddress = 1 in // GR32 = bswap GR32
+ def BSWAP32r : I<0xC8, AddRegFrm,
+ (outs GR32:$dst), (ins GR32:$src),
+ "bswap{l}\t$dst",
+ [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (X86bsf GR16:$src)), (implicit EFLAGS)]>, TB;
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (X86bsf (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (X86bsf GR32:$src)), (implicit EFLAGS)]>, TB;
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (X86bsf (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (X86bsr GR16:$src)), (implicit EFLAGS)]>, TB;
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (X86bsr (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (X86bsr GR32:$src)), (implicit EFLAGS)]>, TB;
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (X86bsr (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+let neverHasSideEffects = 1 in
+def LEA16r : I<0x8D, MRMSrcMem,
+ (outs GR16:$dst), (ins i32mem:$src),
+ "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize;
+let isReMaterializable = 1 in
+def LEA32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
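+// LEA does the full address computation without touching EFLAGS, e.g.
+//   leal 4(%eax,%ebx,2), %ecx   ; ECX = EAX + 2*EBX + 4
+// which is why it is safe to rematerialize and why shift/add sequences are
+// often converted into it.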
+
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI] in {
+def REP_MOVSB : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)]>, REP;
+def REP_MOVSW : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)]>, REP, OpSize;
+def REP_MOVSD : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)]>, REP;
+}
+
+let Defs = [ECX,EDI], Uses = [AL,ECX,EDI] in
+def REP_STOSB : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)]>, REP;
+let Defs = [ECX,EDI], Uses = [AX,ECX,EDI] in
+def REP_STOSW : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)]>, REP, OpSize;
+let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI] in
+def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)]>, REP;
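+// Illustrative use: memset(p, 0, 4*n) can lower to
+//   mov ecx, n ; mov edi, p ; xor eax, eax ; rep stosd
+// which stores ECX copies of EAX at [EDI], advancing EDI and counting ECX
+// down to zero (hence ECX and EDI appearing in both Defs and Uses).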
+
+let Defs = [RAX, RDX] in
+def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>,
+ TB;
+
+let isBarrier = 1, hasCtrlDep = 1 in {
+def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions...
+//
+let Defs = [AL], Uses = [DX] in
+def IN8rr : I<0xEC, RawFrm, (outs), (ins),
+ "in{b}\t{%dx, %al|%AL, %DX}", []>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+ "in{w}\t{%dx, %ax|%AX, %DX}", []>, OpSize;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+ "in{l}\t{%dx, %eax|%EAX, %DX}", []>;
+
+let Defs = [AL] in
+def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{b}\t{$port, %al|%AL, $port}", []>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{w}\t{$port, %ax|%AX, $port}", []>, OpSize;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{l}\t{$port, %eax|%EAX, $port}", []>;
+
+let Uses = [DX, AL] in
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
+ "out{b}\t{%al, %dx|%DX, %AL}", []>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{w}\t{%ax, %dx|%DX, %AX}", []>, OpSize;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{l}\t{%eax, %dx|%DX, %EAX}", []>;
+
+let Uses = [AL] in
+def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{b}\t{%al, $port|$port, %AL}", []>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{w}\t{%ax, $port|$port, %AX}", []>, OpSize;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{l}\t{%eax, $port|$port, %EAX}", []>;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+let neverHasSideEffects = 1 in {
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, imm:$src)]>;
+}
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store (i32 imm:$src), addr:$dst)]>;
+
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (loadi8 addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (loadi32 addr:$src))]>;
+}
+
+def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)]>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)]>;
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let neverHasSideEffects = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+ (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+let mayStore = 1 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+ (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+let mayLoad = 1,
+ canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+ (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions...
+//
+
+// Extra precision multiplication
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
+              // FIXME: Used for 8-bit mul; the upper 8 bits of the result
+              // are ignored. This probably ought to be moved to a
+              // def : Pat<> if the syntax can be accepted.
+ [(set AL, (mul AL, GR8:$src)),
+ (implicit EFLAGS)]>; // AL,AH = AL*GR8
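+// Example of the widening semantics: with AL = 200 and an 8-bit operand of
+// 2, "mul" produces AX = 400, i.e. AL = 0x90 and AH = 0x01; the pattern
+// above only models the AL half, which is what the FIXME refers to.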
+
+let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
+ "mul{w}\t$src",
+ []>, OpSize; // AX,DX = AX*GR16
+
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
+ "mul{l}\t$src",
+ []>; // EAX,EDX = EAX*GR32
+
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+ "mul{b}\t$src",
+              // FIXME: Used for 8-bit mul; the upper 8 bits of the result
+              // are ignored. This probably ought to be moved to a
+              // def : Pat<> if the syntax can be accepted.
+ [(set AL, (mul AL, (loadi8 addr:$src))),
+ (implicit EFLAGS)]>; // AL,AH = AL*[mem8]
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+ "mul{w}\t$src",
+ []>, OpSize; // AX,DX = AX*[mem16]
+
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+ "mul{l}\t$src",
+ []>; // EAX,EDX = EAX*[mem32]
+}
+
+let neverHasSideEffects = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>;
+ // AL,AH = AL*GR8
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>,
+ OpSize; // AX,DX = AX*GR16
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>;
+ // EAX,EDX = EAX*GR32
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+ "imul{b}\t$src", []>; // AL,AH = AL*[mem8]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+ "imul{w}\t$src", []>, OpSize; // AX,DX = AX*[mem16]
+let Defs = [EAX,EDX], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+ "imul{l}\t$src", []>; // EAX,EDX = EAX*[mem32]
+}
+} // neverHasSideEffects
+
+// Unsigned division/remainder.
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "div{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "div{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "div{l}\t$src", []>;
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "div{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "div{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src), // EDX:EAX/[mem32] = EAX,EDX
+ "div{l}\t$src", []>;
+}
+
+// Signed division/remainder.
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "idiv{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "idiv{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "idiv{l}\t$src", []>;
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "idiv{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "idiv{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), // EDX:EAX/[mem32] = EAX,EDX
+ "idiv{l}\t$src", []>;
+}
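+// Semantics sketch: idiv divides the double-width EDX:EAX (or DX:AX, or AX)
+// by the operand; e.g. with EDX:EAX = -7 and a divisor of 2, the result is
+// EAX = -3 (quotient, truncated toward zero) and EDX = -1 (remainder).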
+
+//===----------------------------------------------------------------------===//
+// Two address Instructions.
+//
+let isTwoAddress = 1 in {
+
+// Conditional moves
+let Uses = [EFLAGS] in {
+let isCommutable = 1 in {
+def CMOVB16rr : I<0x42, MRMSrcReg, // if <u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_B, EFLAGS))]>,
+ TB, OpSize;
+def CMOVB32rr : I<0x42, MRMSrcReg, // if <u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_B, EFLAGS))]>,
+ TB;
+def CMOVAE16rr: I<0x43, MRMSrcReg, // if >=u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_AE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVAE32rr: I<0x43, MRMSrcReg, // if >=u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_AE, EFLAGS))]>,
+ TB;
+def CMOVE16rr : I<0x44, MRMSrcReg, // if ==, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_E, EFLAGS))]>,
+ TB, OpSize;
+def CMOVE32rr : I<0x44, MRMSrcReg, // if ==, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_E, EFLAGS))]>,
+ TB;
+def CMOVNE16rr: I<0x45, MRMSrcReg, // if !=, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNE32rr: I<0x45, MRMSrcReg, // if !=, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NE, EFLAGS))]>,
+ TB;
+def CMOVBE16rr: I<0x46, MRMSrcReg, // if <=u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_BE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVBE32rr: I<0x46, MRMSrcReg, // if <=u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_BE, EFLAGS))]>,
+ TB;
+def CMOVA16rr : I<0x47, MRMSrcReg, // if >u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_A, EFLAGS))]>,
+ TB, OpSize;
+def CMOVA32rr : I<0x47, MRMSrcReg, // if >u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_A, EFLAGS))]>,
+ TB;
+def CMOVL16rr : I<0x4C, MRMSrcReg, // if <s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_L, EFLAGS))]>,
+ TB, OpSize;
+def CMOVL32rr : I<0x4C, MRMSrcReg, // if <s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_L, EFLAGS))]>,
+ TB;
+def CMOVGE16rr: I<0x4D, MRMSrcReg, // if >=s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_GE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVGE32rr: I<0x4D, MRMSrcReg, // if >=s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_GE, EFLAGS))]>,
+ TB;
+def CMOVLE16rr: I<0x4E, MRMSrcReg, // if <=s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_LE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVLE32rr: I<0x4E, MRMSrcReg, // if <=s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_LE, EFLAGS))]>,
+ TB;
+def CMOVG16rr : I<0x4F, MRMSrcReg, // if >s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_G, EFLAGS))]>,
+ TB, OpSize;
+def CMOVG32rr : I<0x4F, MRMSrcReg, // if >s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_G, EFLAGS))]>,
+ TB;
+def CMOVS16rr : I<0x48, MRMSrcReg, // if signed, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_S, EFLAGS))]>,
+ TB, OpSize;
+def CMOVS32rr : I<0x48, MRMSrcReg, // if signed, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_S, EFLAGS))]>,
+ TB;
+def CMOVNS16rr: I<0x49, MRMSrcReg, // if !signed, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NS, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNS32rr: I<0x49, MRMSrcReg, // if !signed, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NS, EFLAGS))]>,
+ TB;
+def CMOVP16rr : I<0x4A, MRMSrcReg, // if parity, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_P, EFLAGS))]>,
+ TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg, // if parity, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_P, EFLAGS))]>,
+ TB;
+def CMOVNP16rr : I<0x4B, MRMSrcReg, // if !parity, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NP, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg, // if !parity, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NP, EFLAGS))]>,
+ TB;
+def CMOVO16rr : I<0x40, MRMSrcReg, // if overflow, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_O, EFLAGS))]>,
+ TB, OpSize;
+def CMOVO32rr : I<0x40, MRMSrcReg, // if overflow, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_O, EFLAGS))]>,
+ TB;
+def CMOVNO16rr : I<0x41, MRMSrcReg, // if !overflow, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NO, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNO32rr : I<0x41, MRMSrcReg, // if !overflow, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NO, EFLAGS))]>,
+ TB;
+} // isCommutable = 1
+
+def CMOVB16rm : I<0x42, MRMSrcMem, // if <u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_B, EFLAGS))]>,
+ TB, OpSize;
+def CMOVB32rm : I<0x42, MRMSrcMem, // if <u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_B, EFLAGS))]>,
+ TB;
+def CMOVAE16rm: I<0x43, MRMSrcMem, // if >=u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_AE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVAE32rm: I<0x43, MRMSrcMem, // if >=u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_AE, EFLAGS))]>,
+ TB;
+def CMOVE16rm : I<0x44, MRMSrcMem, // if ==, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_E, EFLAGS))]>,
+ TB, OpSize;
+def CMOVE32rm : I<0x44, MRMSrcMem, // if ==, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_E, EFLAGS))]>,
+ TB;
+def CMOVNE16rm: I<0x45, MRMSrcMem, // if !=, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNE32rm: I<0x45, MRMSrcMem, // if !=, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NE, EFLAGS))]>,
+ TB;
+def CMOVBE16rm: I<0x46, MRMSrcMem, // if <=u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_BE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVBE32rm: I<0x46, MRMSrcMem, // if <=u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_BE, EFLAGS))]>,
+ TB;
+def CMOVA16rm : I<0x47, MRMSrcMem, // if >u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_A, EFLAGS))]>,
+ TB, OpSize;
+def CMOVA32rm : I<0x47, MRMSrcMem, // if >u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_A, EFLAGS))]>,
+ TB;
+def CMOVL16rm : I<0x4C, MRMSrcMem, // if <s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_L, EFLAGS))]>,
+ TB, OpSize;
+def CMOVL32rm : I<0x4C, MRMSrcMem, // if <s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_L, EFLAGS))]>,
+ TB;
+def CMOVGE16rm: I<0x4D, MRMSrcMem, // if >=s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_GE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVGE32rm: I<0x4D, MRMSrcMem, // if >=s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_GE, EFLAGS))]>,
+ TB;
+def CMOVLE16rm: I<0x4E, MRMSrcMem, // if <=s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_LE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVLE32rm: I<0x4E, MRMSrcMem, // if <=s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_LE, EFLAGS))]>,
+ TB;
+def CMOVG16rm : I<0x4F, MRMSrcMem, // if >s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_G, EFLAGS))]>,
+ TB, OpSize;
+def CMOVG32rm : I<0x4F, MRMSrcMem, // if >s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_G, EFLAGS))]>,
+ TB;
+def CMOVS16rm : I<0x48, MRMSrcMem, // if signed, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_S, EFLAGS))]>,
+ TB, OpSize;
+def CMOVS32rm : I<0x48, MRMSrcMem, // if signed, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_S, EFLAGS))]>,
+ TB;
+def CMOVNS16rm: I<0x49, MRMSrcMem, // if !signed, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NS, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNS32rm: I<0x49, MRMSrcMem, // if !signed, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NS, EFLAGS))]>,
+ TB;
+def CMOVP16rm : I<0x4A, MRMSrcMem, // if parity, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_P, EFLAGS))]>,
+ TB, OpSize;
+def CMOVP32rm : I<0x4A, MRMSrcMem, // if parity, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_P, EFLAGS))]>,
+ TB;
+def CMOVNP16rm : I<0x4B, MRMSrcMem, // if !parity, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NP, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNP32rm : I<0x4B, MRMSrcMem, // if !parity, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NP, EFLAGS))]>,
+ TB;
+def CMOVO16rm : I<0x40, MRMSrcMem, // if overflow, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_O, EFLAGS))]>,
+ TB, OpSize;
+def CMOVO32rm : I<0x40, MRMSrcMem, // if overflow, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_O, EFLAGS))]>,
+ TB;
+def CMOVNO16rm : I<0x41, MRMSrcMem, // if !overflow, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NO, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNO32rm : I<0x41, MRMSrcMem, // if !overflow, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NO, EFLAGS))]>,
+ TB;
+} // Uses = [EFLAGS]
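+// Usage sketch: cmov replaces a select with straight-line code, e.g.
+//   cmpl %ebx, %eax
+//   cmovg %ebx, %eax   ; EAX = min(EAX, EBX), signed
+// which is exactly what the (X86cmov a, b, cond, EFLAGS) patterns select.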
+
+
+// Unary instructions.
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src), "neg{b}\t$dst",
+ [(set GR8:$dst, (ineg GR8:$src)),
+ (implicit EFLAGS)]>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src), "neg{w}\t$dst",
+ [(set GR16:$dst, (ineg GR16:$src)),
+ (implicit EFLAGS)]>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src), "neg{l}\t$dst",
+ [(set GR32:$dst, (ineg GR32:$src)),
+ (implicit EFLAGS)]>;
+let isTwoAddress = 0 in {
+ def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), "neg{b}\t$dst",
+ [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+ def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), "neg{w}\t$dst",
+ [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), "neg{l}\t$dst",
+ [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+}
+} // Defs = [EFLAGS]
+
+// Match (xor x, -1) to not. The AddedComplexity favors these patterns over a
+// move-immediate plus xor, saving code size.
+let AddedComplexity = 15 in {
+def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src), "not{b}\t$dst",
+ [(set GR8:$dst, (not GR8:$src))]>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src), "not{w}\t$dst",
+ [(set GR16:$dst, (not GR16:$src))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src), "not{l}\t$dst",
+ [(set GR32:$dst, (not GR32:$src))]>;
+}
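+// 'not' here is the target-independent PatFrag for (xor x, -1), so with the
+// AddedComplexity these patterns win over materializing a -1 immediate and
+// selecting the xor form, e.g. "notl %eax" instead of "xorl $-1, %eax".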
+let isTwoAddress = 0 in {
+ def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), "not{b}\t$dst",
+ [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+ def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), "not{w}\t$dst",
+ [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+ def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), "not{l}\t$dst",
+ [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+}
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let CodeSize = 2 in
+def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src), "inc{b}\t$dst",
+ [(set GR8:$dst, (add GR8:$src, 1)),
+ (implicit EFLAGS)]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst",
+ [(set GR16:$dst, (add GR16:$src, 1)),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst",
+ [(set GR32:$dst, (add GR32:$src, 1)),
+ (implicit EFLAGS)]>, Requires<[In32BitMode]>;
+}
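+// The Requires<[In32BitMode]> is load-bearing: the one-byte 0x40-0x4F
+// inc/dec encodings are reinterpreted as REX prefixes in 64-bit mode, where
+// the two-byte 0xFF /0 and 0xFF /1 forms must be used instead.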
+let isTwoAddress = 0, CodeSize = 2 in {
+ def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>;
+ def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+ def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In32BitMode]>;
+}
+
+let CodeSize = 2 in
+def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src), "dec{b}\t$dst",
+ [(set GR8:$dst, (add GR8:$src, -1)),
+ (implicit EFLAGS)]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst",
+ [(set GR16:$dst, (add GR16:$src, -1)),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst",
+ [(set GR32:$dst, (add GR32:$src, -1)),
+ (implicit EFLAGS)]>, Requires<[In32BitMode]>;
+}
+
+let isTwoAddress = 0, CodeSize = 2 in {
+ def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>;
+ def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+ def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In32BitMode]>;
+}
+} // Defs = [EFLAGS]
+
+// Logical operators...
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = AND Y, Z --> X = AND Z, Y
+def AND8rr : I<0x20, MRMDestReg,
+ (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+def AND16rr : I<0x21, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def AND32rr : I<0x21, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+}
+
+def AND8rm : I<0x22, MRMSrcMem,
+ (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, (loadi8 addr:$src2))),
+ (implicit EFLAGS)]>;
+def AND16rm : I<0x23, MRMSrcMem,
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, (loadi16 addr:$src2))),
+ (implicit EFLAGS)]>, OpSize;
+def AND32rm : I<0x23, MRMSrcMem,
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, (loadi32 addr:$src2))),
+ (implicit EFLAGS)]>;
+
+def AND8ri : Ii8<0x80, MRM4r,
+ (outs GR8 :$dst), (ins GR8 :$src1, i8imm :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def AND16ri : Ii16<0x81, MRM4r,
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def AND32ri : Ii32<0x81, MRM4r,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def AND16ri8 : Ii8<0x83, MRM4r,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>,
+ OpSize;
+def AND32ri8 : Ii8<0x83, MRM4r,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 0 in {
+ def AND8mr : I<0x20, MRMDestMem,
+ (outs), (ins i8mem :$dst, GR8 :$src),
+ "and{b}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mr : I<0x21, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mr : I<0x21, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND8mi : Ii8<0x80, MRM4m,
+ (outs), (ins i8mem :$dst, i8imm :$src),
+ "and{b}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mi : Ii16<0x81, MRM4m,
+ (outs), (ins i16mem:$dst, i16imm:$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mi : Ii32<0x81, MRM4m,
+ (outs), (ins i32mem:$dst, i32imm:$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mi8 : Ii8<0x83, MRM4m,
+ (outs), (ins i16mem:$dst, i16i8imm :$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mi8 : Ii8<0x83, MRM4m,
+ (outs), (ins i32mem:$dst, i32i8imm :$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+}
+
+
+let isCommutable = 1 in { // X = OR Y, Z --> X = OR Z, Y
+def OR8rr : I<0x08, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+def OR16rr : I<0x09, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def OR32rr : I<0x09, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+}
+def OR8rm : I<0x0A, MRMSrcMem , (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def OR16rm : I<0x0B, MRMSrcMem , (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, OpSize;
+def OR32rm : I<0x0B, MRMSrcMem , (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+def OR8ri : Ii8 <0x80, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def OR16ri : Ii16<0x81, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def OR32ri : Ii32<0x81, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+
+def OR16ri8 : Ii8<0x83, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def OR32ri8 : Ii8<0x83, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+let isTwoAddress = 0 in {
+ def OR8mr : I<0x08, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "or{b}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mr : I<0x09, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def OR32mr : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR8mi : Ii8<0x80, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "or{b}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mi : Ii16<0x81, MRM1m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def OR32mi : Ii32<0x81, MRM1m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mi8 : Ii8<0x83, MRM1m, (outs), (ins i16mem:$dst, i16i8imm:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def OR32mi8 : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+} // isTwoAddress = 0
+
+
+let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y
+ def XOR8rr : I<0x30, MRMDestReg,
+ (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+ def XOR16rr : I<0x31, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+ def XOR32rr : I<0x31, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+} // isCommutable = 1
+
+def XOR8rm : I<0x32, MRMSrcMem ,
+ (outs GR8 :$dst), (ins GR8:$src1, i8mem :$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def XOR16rm : I<0x33, MRMSrcMem ,
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>,
+ OpSize;
+def XOR32rm : I<0x33, MRMSrcMem ,
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+def XOR8ri : Ii8<0x80, MRM6r,
+ (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def XOR16ri : Ii16<0x81, MRM6r,
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def XOR32ri : Ii32<0x81, MRM6r,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def XOR16ri8 : Ii8<0x83, MRM6r,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>,
+ OpSize;
+def XOR32ri8 : Ii8<0x83, MRM6r,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 0 in {
+ def XOR8mr : I<0x30, MRMDestMem,
+ (outs), (ins i8mem :$dst, GR8 :$src),
+ "xor{b}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mr : I<0x31, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mr : I<0x31, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR8mi : Ii8<0x80, MRM6m,
+ (outs), (ins i8mem :$dst, i8imm :$src),
+ "xor{b}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mi : Ii16<0x81, MRM6m,
+ (outs), (ins i16mem:$dst, i16imm:$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mi : Ii32<0x81, MRM6m,
+ (outs), (ins i32mem:$dst, i32imm:$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mi8 : Ii8<0x83, MRM6m,
+ (outs), (ins i16mem:$dst, i16i8imm :$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mi8 : Ii8<0x83, MRM6m,
+ (outs), (ins i32mem:$dst, i32i8imm :$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+} // isTwoAddress = 0
+} // Defs = [EFLAGS]
+
+// Shift instructions
+let Defs = [EFLAGS] in {
+let Uses = [CL] in {
+def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src),
+ "shl{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (shl GR8:$src, CL))]>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src),
+ "shl{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (shl GR16:$src, CL))]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src),
+ "shl{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (shl GR32:$src, CL))]>;
+} // Uses = [CL]
+
+def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "shl{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "shl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "shl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+// NOTE: We don't use shifts of a register by one, because 'add reg,reg' is
+// cheaper.
+} // isConvertibleToThreeAddress = 1
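+// Example of the LEA conversion: under the two-address constraint,
+//   shll $3, %eax        ; would clobber a still-live EAX
+// can instead become
+//   leal (,%eax,8), %ecx
+// computing the shifted value into a fresh register.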
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "shl{b}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "shl{w}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "shl{l}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t$dst",
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t$dst",
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t$dst",
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src),
+ "shr{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (srl GR8:$src, CL))]>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src),
+ "shr{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (srl GR16:$src, CL))]>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src),
+ "shr{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (srl GR32:$src, CL))]>;
+}
+
+def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "shr{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "shr{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "shr{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+ "shr{b}\t$dst",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t$dst",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize;
+def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t$dst",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ OpSize;
+ def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "shr{b}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "shr{w}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "shr{l}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t$dst",
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t$dst",
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize;
+ def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t$dst",
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src),
+ "sar{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (sra GR8:$src, CL))]>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src),
+ "sar{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (sra GR16:$src, CL))]>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src),
+ "sar{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (sra GR32:$src, CL))]>;
+}
+
+def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "sar{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "sar{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize;
+def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "sar{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t$dst",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t$dst",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize;
+def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t$dst",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "sar{b}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "sar{w}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "sar{l}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t$dst",
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t$dst",
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t$dst",
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Rotate instructions
+// FIXME: provide shorter instructions when imm8 == 1
+let Uses = [CL] in {
+def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src),
+ "rol{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (rotl GR8:$src, CL))]>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src),
+ "rol{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (rotl GR16:$src, CL))]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src),
+ "rol{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (rotl GR32:$src, CL))]>;
+}
+
+def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "rol{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "rol{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "rol{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t$dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t$dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize;
+def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t$dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "rol{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "rol{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "rol{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Rotate by 1
+ def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t$dst",
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t$dst",
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t$dst",
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src),
+ "ror{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (rotr GR8:$src, CL))]>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src),
+ "ror{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (rotr GR16:$src, CL))]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src),
+ "ror{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (rotr GR32:$src, CL))]>;
+}
+
+def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "ror{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "ror{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "ror{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t$dst",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t$dst",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize;
+def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t$dst",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "ror{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "ror{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "ror{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Rotate by 1
+ def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t$dst",
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t$dst",
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t$dst",
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+
+
+// Double shift instructions (generalizations of rotate)
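+// For a count c in 1-31 (illustrative, AT&T syntax):
+//   shld $c, %ebx, %eax  computes  EAX = (EAX << c) | (EBX >> (32-c))
+//   shrd $c, %ebx, %eax  computes  EAX = (EAX >> c) | (EBX << (32-c))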
+let Uses = [CL] in {
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, TB;
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+ TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+ TB, OpSize;
+}
+
+let isCommutable = 1 in { // These instructions commute to each other.
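+// For counts 1-31, shld(A, B, c) == shrd(B, A, 32-c): both evaluate to
+// (A << c) | (B >> (32-c)), so commuting swaps the operands and replaces
+// the count c with 32-c.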
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize;
+}
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>, TB;
+ def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>, TB;
+ }
+ def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+ def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+
+ let Uses = [CL] in {
+ def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize;
+ def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize;
+ }
+ def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize;
+ def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize;
+}
+} // Defs = [EFLAGS]
+
+
+// Arithmetic.
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = ADD Y, Z --> X = ADD Z, Y
+// Register-Register Addition
+def ADD8rr : I<0x00, MRMDestReg, (outs GR8 :$dst),
+ (ins GR8 :$src1, GR8 :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+// Register-Register Addition
+def ADD16rr : I<0x01, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def ADD32rr : I<0x01, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+} // end isConvertibleToThreeAddress
+} // end isCommutable
+
+// Register-Memory Addition
+def ADD8rm : I<0x02, MRMSrcMem, (outs GR8 :$dst),
+ (ins GR8 :$src1, i8mem :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def ADD16rm : I<0x03, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, OpSize;
+def ADD32rm : I<0x03, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+// Register-Integer Addition
+def ADD8ri : Ii8<0x80, MRM0r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+// Register-Integer Addition
+def ADD16ri : Ii16<0x81, MRM0r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def ADD32ri : Ii32<0x81, MRM0r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def ADD16ri8 : Ii8<0x83, MRM0r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def ADD32ri8 : Ii8<0x83, MRM0r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+}
+
+let isTwoAddress = 0 in {
+ // Memory-Register Addition
+ def ADD8mr : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mr : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR16:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mr : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD8mi : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi8 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mi : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi16 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mi : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi32 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+}
+
+let Uses = [EFLAGS] in {
+let isCommutable = 1 in { // X = ADC Y, Z --> X = ADC Z, Y
+def ADC8rr : I<0x10, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, GR8:$src2))]>;
+def ADC16rr : I<0x11, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, GR16:$src2))]>, OpSize;
+def ADC32rr : I<0x11, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, GR32:$src2))]>;
+}
+def ADC8rm : I<0x12, MRMSrcMem , (outs GR8:$dst),
+ (ins GR8:$src1, i8mem:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, (load addr:$src2)))]>;
+def ADC16rm : I<0x13, MRMSrcMem , (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, (load addr:$src2)))]>,
+ OpSize;
+def ADC32rm : I<0x13, MRMSrcMem , (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, (load addr:$src2)))]>;
+def ADC8ri : Ii8<0x80, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, imm:$src2))]>;
+def ADC16ri : Ii16<0x81, MRM2r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, imm:$src2))]>, OpSize;
+def ADC16ri8 : Ii8<0x83, MRM2r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def ADC32ri : Ii32<0x81, MRM2r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, imm:$src2))]>;
+def ADC32ri8 : Ii8<0x83, MRM2r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def ADC8mr : I<0x10, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR8:$src2), addr:$dst)]>;
+ def ADC16mr : I<0x11, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR16:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC32mr : I<0x11, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR32:$src2), addr:$dst)]>;
+ def ADC8mi : Ii8<0x80, MRM2m, (outs), (ins i8mem:$dst, i8imm:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADC16mi : Ii16<0x81, MRM2m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC16mi8 : Ii8<0x83, MRM2m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC32mi : Ii32<0x81, MRM2m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADC32mi8 : Ii8<0x83, MRM2m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+} // Uses = [EFLAGS]
+
+// Register-Register Subtraction
+def SUB8rr : I<0x28, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+def SUB16rr : I<0x29, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def SUB32rr : I<0x29, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+
+// Register-Memory Subtraction
+def SUB8rm : I<0x2A, MRMSrcMem, (outs GR8 :$dst),
+ (ins GR8 :$src1, i8mem :$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def SUB16rm : I<0x2B, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, OpSize;
+def SUB32rm : I<0x2B, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+// Register-Integer Subtraction
+def SUB8ri : Ii8 <0x80, MRM5r, (outs GR8:$dst),
+ (ins GR8:$src1, i8imm:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def SUB16ri : Ii16<0x81, MRM5r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def SUB32ri : Ii32<0x81, MRM5r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def SUB16ri8 : Ii8<0x83, MRM5r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def SUB32ri8 : Ii8<0x83, MRM5r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 0 in {
+ // Memory-Register Subtraction
+ def SUB8mr : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mr : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR16:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mr : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+ // Memory-Integer Subtraction
+ def SUB8mi : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi8 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mi : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi16 addr:$dst), imm:$src2),addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mi : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi32 addr:$dst), imm:$src2),addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+}
+
+let Uses = [EFLAGS] in {
+def SBB8rr : I<0x18, MRMDestReg, (outs GR8:$dst),
+ (ins GR8:$src1, GR8:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, GR8:$src2))]>;
+def SBB16rr : I<0x19, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, GR16:$src2))]>, OpSize;
+def SBB32rr : I<0x19, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def SBB8mr : I<0x18, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR8:$src2), addr:$dst)]>;
+ def SBB16mr : I<0x19, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR16:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB32mr : I<0x19, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def SBB8mi : Ii8<0x80, MRM3m, (outs), (ins i8mem:$dst, i8imm:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SBB16mi : Ii16<0x81, MRM3m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB16mi8 : Ii8<0x83, MRM3m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB32mi : Ii32<0x81, MRM3m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SBB32mi8 : Ii8<0x83, MRM3m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+def SBB8rm : I<0x1A, MRMSrcMem, (outs GR8:$dst), (ins GR8:$src1, i8mem:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, (load addr:$src2)))]>;
+def SBB16rm : I<0x1B, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, (load addr:$src2)))]>,
+ OpSize;
+def SBB32rm : I<0x1B, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, (load addr:$src2)))]>;
+def SBB8ri : Ii8<0x80, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, imm:$src2))]>;
+def SBB16ri : Ii16<0x81, MRM3r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, imm:$src2))]>, OpSize;
+def SBB16ri8 : Ii8<0x83, MRM3r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def SBB32ri : Ii32<0x81, MRM3r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, imm:$src2))]>;
+def SBB32ri8 : Ii8<0x83, MRM3r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, i32immSExt8:$src2))]>;
+} // Uses = [EFLAGS]
+} // Defs = [EFLAGS]
+
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>, TB;
+}
+
+// Register-Memory Signed Integer Multiply
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+} // end Two Address instructions
+
+// Surprisingly enough, these are not two address instructions!
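+// e.g. "imul $10, %ecx, %eax" is a true three-operand form: it computes
+// EAX = ECX * 10 without reading EAX at all.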
+let Defs = [EFLAGS] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+
+// Memory-Integer Signed Integer Multiply
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul (load addr:$src1), imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul (load addr:$src1), imm:$src2)),
+ (implicit EFLAGS)]>;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
+ (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul (load addr:$src1),
+ i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
+ (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul (load addr:$src1),
+ i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Test instructions are just like AND, except they don't generate a result.
+//
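+// e.g. "test $8, %al" computes AL & 8, sets the flags from that value,
+// and discards the value itself.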
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // TEST X, Y --> TEST Y, X
+def TEST8rr : I<0x84, MRMDestReg, (outs), (ins GR8:$src1, GR8:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR8:$src1, GR8:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST16rr : I<0x85, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR16:$src1, GR16:$src2), 0),
+ (implicit EFLAGS)]>,
+ OpSize;
+def TEST32rr : I<0x85, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR32:$src1, GR32:$src2), 0),
+ (implicit EFLAGS)]>;
+}
+
+def TEST8rm : I<0x84, MRMSrcMem, (outs), (ins GR8 :$src1, i8mem :$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR8:$src1, (loadi8 addr:$src2)), 0),
+ (implicit EFLAGS)]>;
+def TEST16rm : I<0x85, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR16:$src1, (loadi16 addr:$src2)), 0),
+ (implicit EFLAGS)]>, OpSize;
+def TEST32rm : I<0x85, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR32:$src1, (loadi32 addr:$src2)), 0),
+ (implicit EFLAGS)]>;
+
+def TEST8ri : Ii8 <0xF6, MRM0r, // flags = GR8 & imm8
+ (outs), (ins GR8:$src1, i8imm:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR8:$src1, imm:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST16ri : Ii16<0xF7, MRM0r, // flags = GR16 & imm16
+ (outs), (ins GR16:$src1, i16imm:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR16:$src1, imm:$src2), 0),
+ (implicit EFLAGS)]>, OpSize;
+def TEST32ri : Ii32<0xF7, MRM0r, // flags = GR32 & imm32
+ (outs), (ins GR32:$src1, i32imm:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR32:$src1, imm:$src2), 0),
+ (implicit EFLAGS)]>;
+
+def TEST8mi : Ii8 <0xF6, MRM0m, // flags = [mem8] & imm8
+ (outs), (ins i8mem:$src1, i8imm:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi8 addr:$src1), imm:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST16mi : Ii16<0xF7, MRM0m, // flags = [mem16] & imm16
+ (outs), (ins i16mem:$src1, i16imm:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi16 addr:$src1), imm:$src2), 0),
+ (implicit EFLAGS)]>, OpSize;
+def TEST32mi : Ii32<0xF7, MRM0m, // flags = [mem32] & imm32
+ (outs), (ins i32mem:$src1, i32imm:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi32 addr:$src1), imm:$src2), 0),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>; // flags = AH
+let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags
+
+let Uses = [EFLAGS] in {
+def SETEr : I<0x94, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "sete\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_E, EFLAGS))]>,
+ TB; // GR8 = ==
+def SETEm : I<0x94, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "sete\t$dst",
+ [(store (X86setcc X86_COND_E, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = ==
+
+def SETNEr : I<0x95, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setne\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NE, EFLAGS))]>,
+ TB; // GR8 = !=
+def SETNEm : I<0x95, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setne\t$dst",
+ [(store (X86setcc X86_COND_NE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = !=
+
+def SETLr : I<0x9C, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setl\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_L, EFLAGS))]>,
+ TB; // GR8 = < signed
+def SETLm : I<0x9C, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setl\t$dst",
+ [(store (X86setcc X86_COND_L, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = < signed
+
+def SETGEr : I<0x9D, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setge\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_GE, EFLAGS))]>,
+ TB; // GR8 = >= signed
+def SETGEm : I<0x9D, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setge\t$dst",
+ [(store (X86setcc X86_COND_GE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = >= signed
+
+def SETLEr : I<0x9E, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setle\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_LE, EFLAGS))]>,
+ TB; // GR8 = <= signed
+def SETLEm : I<0x9E, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setle\t$dst",
+ [(store (X86setcc X86_COND_LE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = <= signed
+
+def SETGr : I<0x9F, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setg\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_G, EFLAGS))]>,
+ TB; // GR8 = > signed
+def SETGm : I<0x9F, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setg\t$dst",
+ [(store (X86setcc X86_COND_G, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = > signed
+
+def SETBr : I<0x92, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setb\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_B, EFLAGS))]>,
+               TB; // GR8 = < unsigned
+def SETBm : I<0x92, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setb\t$dst",
+ [(store (X86setcc X86_COND_B, EFLAGS), addr:$dst)]>,
+               TB; // [mem8] = < unsigned
+
+def SETAEr : I<0x93, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setae\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_AE, EFLAGS))]>,
+               TB; // GR8 = >= unsigned
+def SETAEm : I<0x93, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setae\t$dst",
+ [(store (X86setcc X86_COND_AE, EFLAGS), addr:$dst)]>,
+               TB; // [mem8] = >= unsigned
+
+def SETBEr : I<0x96, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setbe\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_BE, EFLAGS))]>,
+               TB; // GR8 = <= unsigned
+def SETBEm : I<0x96, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setbe\t$dst",
+ [(store (X86setcc X86_COND_BE, EFLAGS), addr:$dst)]>,
+               TB; // [mem8] = <= unsigned
+
+def SETAr : I<0x97, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "seta\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_A, EFLAGS))]>,
+               TB; // GR8 = > unsigned
+def SETAm : I<0x97, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "seta\t$dst",
+ [(store (X86setcc X86_COND_A, EFLAGS), addr:$dst)]>,
+               TB; // [mem8] = > unsigned
+
+def SETSr : I<0x98, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "sets\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_S, EFLAGS))]>,
+ TB; // GR8 = <sign bit>
+def SETSm : I<0x98, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "sets\t$dst",
+ [(store (X86setcc X86_COND_S, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = <sign bit>
+def SETNSr : I<0x99, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setns\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NS, EFLAGS))]>,
+ TB; // GR8 = !<sign bit>
+def SETNSm : I<0x99, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setns\t$dst",
+ [(store (X86setcc X86_COND_NS, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = !<sign bit>
+
+def SETPr : I<0x9A, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setp\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_P, EFLAGS))]>,
+ TB; // GR8 = parity
+def SETPm : I<0x9A, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setp\t$dst",
+ [(store (X86setcc X86_COND_P, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = parity
+def SETNPr : I<0x9B, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setnp\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NP, EFLAGS))]>,
+ TB; // GR8 = not parity
+def SETNPm : I<0x9B, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setnp\t$dst",
+ [(store (X86setcc X86_COND_NP, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = not parity
+
+def SETOr : I<0x90, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "seto\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_O, EFLAGS))]>,
+ TB; // GR8 = overflow
+def SETOm : I<0x90, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "seto\t$dst",
+ [(store (X86setcc X86_COND_O, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = overflow
+def SETNOr : I<0x91, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setno\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NO, EFLAGS))]>,
+ TB; // GR8 = not overflow
+def SETNOm : I<0x91, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setno\t$dst",
+ [(store (X86setcc X86_COND_NO, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = not overflow
+} // Uses = [EFLAGS]
+
+
+// Integer comparisons
+let Defs = [EFLAGS] in {
+def CMP8rr : I<0x38, MRMDestReg,
+ (outs), (ins GR8 :$src1, GR8 :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, GR8:$src2), (implicit EFLAGS)]>;
+def CMP16rr : I<0x39, MRMDestReg,
+ (outs), (ins GR16:$src1, GR16:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, GR16:$src2), (implicit EFLAGS)]>, OpSize;
+def CMP32rr : I<0x39, MRMDestReg,
+ (outs), (ins GR32:$src1, GR32:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, GR32:$src2), (implicit EFLAGS)]>;
+def CMP8mr : I<0x38, MRMDestMem,
+ (outs), (ins i8mem :$src1, GR8 :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi8 addr:$src1), GR8:$src2),
+ (implicit EFLAGS)]>;
+def CMP16mr : I<0x39, MRMDestMem,
+ (outs), (ins i16mem:$src1, GR16:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), GR16:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32mr : I<0x39, MRMDestMem,
+ (outs), (ins i32mem:$src1, GR32:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), GR32:$src2),
+ (implicit EFLAGS)]>;
+def CMP8rm : I<0x3A, MRMSrcMem,
+ (outs), (ins GR8 :$src1, i8mem :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)]>;
+def CMP16rm : I<0x3B, MRMSrcMem,
+ (outs), (ins GR16:$src1, i16mem:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32rm : I<0x3B, MRMSrcMem,
+ (outs), (ins GR32:$src1, i32mem:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)]>;
+def CMP8ri : Ii8<0x80, MRM7r,
+ (outs), (ins GR8:$src1, i8imm:$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, imm:$src2), (implicit EFLAGS)]>;
+def CMP16ri : Ii16<0x81, MRM7r,
+ (outs), (ins GR16:$src1, i16imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, imm:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32ri : Ii32<0x81, MRM7r,
+ (outs), (ins GR32:$src1, i32imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, imm:$src2), (implicit EFLAGS)]>;
+def CMP8mi : Ii8 <0x80, MRM7m,
+ (outs), (ins i8mem :$src1, i8imm :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi8 addr:$src1), imm:$src2),
+ (implicit EFLAGS)]>;
+def CMP16mi : Ii16<0x81, MRM7m,
+ (outs), (ins i16mem:$src1, i16imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), imm:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32mi : Ii32<0x81, MRM7m,
+ (outs), (ins i32mem:$src1, i32imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), imm:$src2),
+ (implicit EFLAGS)]>;
+def CMP16ri8 : Ii8<0x83, MRM7r,
+ (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP16mi8 : Ii8<0x83, MRM7m,
+ (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32mi8 : Ii8<0x83, MRM7m,
+ (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), i32immSExt8:$src2),
+ (implicit EFLAGS)]>;
+def CMP32ri8 : Ii8<0x83, MRM7r,
+ (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Bit tests.
+// TODO: BTC, BTR, and BTS
+let Defs = [EFLAGS] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)]>, TB;
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Disable these instructions for now.
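+// e.g. "btl %eax, %ebx" tests bit (EAX mod 32) of EBX, while
+// "btl %eax, (%ecx)" treats memory as a long bit string and may touch the
+// dword at (%ecx + 4*(EAX >> 5)).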
+//def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+// "bt{w}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi16 addr:$src1), GR16:$src2),
+// (implicit EFLAGS)]>, OpSize, TB, Requires<[FastBTMem]>;
+//def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+// "bt{l}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi32 addr:$src1), GR32:$src2),
+// (implicit EFLAGS)]>, TB, Requires<[FastBTMem]>;
+
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt (loadi16 addr:$src1), i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt (loadi32 addr:$src1), i32immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+// Sign/Zero extenders
+// Use movsbl instead of movsbw; we don't care about the high 16 bits
+// of the register here. This has a smaller encoding and avoids a
+// partial-register update.
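+// e.g. movsbw (66 0F BE /r) is one byte longer than movsbl (0F BE /r),
+// and writing the full 32-bit register sidesteps a partial write to just
+// the low 16 bits.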
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
+ "movs{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR16:$dst, (sext GR8:$src))]>, TB;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR8:$src))]>, TB;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR16:$src))]>, TB;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
+
+// Use movzbl instead of movzbw; we don't care about the high 16 bits
+// of the register here. This has a smaller encoding and avoids a
+// partial-register update.
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR16:$dst, (zext GR8:$src))]>, TB;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB;
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR8:$src))]>, TB;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR16:$src))]>, TB;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ []>, TB;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ []>, TB;
+
+let neverHasSideEffects = 1 in {
+ let Defs = [AX], Uses = [AL] in
+ def CBW : I<0x98, RawFrm, (outs), (ins),
+ "{cbtw|cbw}", []>, OpSize; // AX = signext(AL)
+ let Defs = [EAX], Uses = [AX] in
+ def CWDE : I<0x98, RawFrm, (outs), (ins),
+ "{cwtl|cwde}", []>; // EAX = signext(AX)
+
+ let Defs = [AX,DX], Uses = [AX] in
+ def CWD : I<0x99, RawFrm, (outs), (ins),
+ "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+ let Defs = [EAX,EDX], Uses = [EAX] in
+ def CDQ : I<0x99, RawFrm, (outs), (ins),
+ "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
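+// e.g. "xorl %eax, %eax" (31 C0) is 2 bytes, versus 5 bytes for
+// "movl $0, %eax" (B8 imm32), and the result has no dependence on the
+// old EAX value.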
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins),
+ "xor{b}\t$dst, $dst",
+ [(set GR8:$dst, 0)]>;
+// Use xorl instead of xorw since we don't care about the high 16 bits,
+// it's smaller, and it avoids a partial-register update.
+def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
+ "xor{l}\t${dst:subreg32}, ${dst:subreg32}",
+ [(set GR16:$dst, 0)]>;
+def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins),
+ "xor{l}\t$dst, $dst",
+ [(set GR32:$dst, 0)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP, EBX] in
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32imm:$sym),
+ "leal\t${sym:mem}(,%ebx,1), %eax; "
+ "call\t___tls_get_addr@PLT",
+ [(X86tlsaddr tglobaltlsaddr:$sym)]>,
+ Requires<[In32BitMode]>;
+
+let AddedComplexity = 5 in
+def GS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movl\t%gs:$src, $dst",
+ [(set GR32:$dst, (gsload addr:$src))]>, SegGS;
+
+let AddedComplexity = 5 in
+def FS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movl\t%fs:$src, $dst",
+ [(set GR32:$dst, (fsload addr:$src))]>, SegFS;
+
+//===----------------------------------------------------------------------===//
+// DWARF Pseudo Instructions
+//
+
+def DWARF_LOC : I<0, Pseudo, (outs),
+ (ins i32imm:$line, i32imm:$col, i32imm:$file),
+ ".loc\t${file:debug} ${line:debug} ${col:debug}",
+ [(dwarf_loc (i32 imm:$line), (i32 imm:$col),
+ (i32 imm:$file))]>;
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR32:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions; when a memory
+// operand is referenced, the processor asserts LOCK# implicitly, so the
+// exchange is guaranteed to be atomic.
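+// e.g. "xchgl %eax, (%edx)" is atomic without any explicit lock prefix;
+// contrast the cmpxchg and xadd forms below, which need the LOCK modifier.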
+let Constraints = "$val = $dst" in {
+def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+ "xchg{l}\t{$val, $ptr|$ptr, $val}",
+ [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>;
+def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
+ "xchg{w}\t{$val, $ptr|$ptr, $val}",
+ [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>,
+ OpSize;
+def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val),
+ "xchg{b}\t{$val, $ptr|$ptr, $val}",
+ [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>;
+}
+
+// Atomic compare and swap.
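+// "lock cmpxchg %ebx, (%ecx)" compares EAX with the dword at (%ecx): if
+// they are equal it stores EBX there and sets ZF, otherwise it loads the
+// dword into EAX and clears ZF.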
+let Defs = [EAX, EFLAGS], Uses = [EAX] in {
+def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
+ "lock\n\t"
+ "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR32:$swap, 4)]>, TB, LOCK;
+}
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in {
+def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i32mem:$ptr),
+ "lock\n\t"
+ "cmpxchg8b\t$ptr",
+ [(X86cas8 addr:$ptr)]>, TB, LOCK;
+}
+
+let Defs = [AX, EFLAGS], Uses = [AX] in {
+def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap),
+ "lock\n\t"
+ "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR16:$swap, 2)]>, TB, OpSize, LOCK;
+}
+let Defs = [AL, EFLAGS], Uses = [AL] in {
+def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
+ "lock\n\t"
+ "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
+}
+
+// Atomic exchange and add
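+// "lock xadd %ebx, (%ecx)" atomically loads the old dword into EBX and
+// stores the sum of the old value and the original EBX back to (%ecx).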
+let Constraints = "$val = $dst", Defs = [EFLAGS] in {
+def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+ "lock\n\t"
+ "xadd{l}\t{$val, $ptr|$ptr, $val}",
+ [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))]>,
+ TB, LOCK;
+def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
+ "lock\n\t"
+ "xadd{w}\t{$val, $ptr|$ptr, $val}",
+ [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))]>,
+ TB, OpSize, LOCK;
+def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val),
+ "lock\n\t"
+ "xadd{b}\t{$val, $ptr|$ptr, $val}",
+ [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))]>,
+ TB, LOCK;
+}
+
+// Atomic exchange, and, or, xor
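+// These pseudos have no encoding of their own; the custom DAG sched
+// inserter expands each one to (roughly) a load / operate / lock-cmpxchg
+// retry loop.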
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+ usesCustomDAGSchedInserter = 1 in {
+def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMAND32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>;
+def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMOR32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>;
+def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMXOR32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>;
+def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMNAND32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>;
+def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+ "#ATOMMIN32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>;
+def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMMAX32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>;
+def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMUMIN32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>;
+def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMUMAX32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>;
+
+def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMAND16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>;
+def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMOR16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>;
+def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMXOR16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>;
+def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMNAND16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>;
+def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
+ "#ATOMMIN16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>;
+def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMMAX16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>;
+def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMUMIN16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>;
+def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMUMAX16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>;
+
+def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMAND8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>;
+def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMOR8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>;
+def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMXOR8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>;
+def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMNAND8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>;
+}
+
+let Constraints = "$val1 = $dst1, $val2 = $dst2",
+ Defs = [EFLAGS, EAX, EBX, ECX, EDX],
+ Uses = [EAX, EBX, ECX, EDX],
+ mayLoad = 1, mayStore = 1,
+ usesCustomDAGSchedInserter = 1 in {
+def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMAND6432 PSEUDO!", []>;
+def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMOR6432 PSEUDO!", []>;
+def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMXOR6432 PSEUDO!", []>;
+def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMNAND6432 PSEUDO!", []>;
+def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMADD6432 PSEUDO!", []>;
+def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMSUB6432 PSEUDO!", []>;
+def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMSWAP6432 PSEUDO!", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+ (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+ (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+ (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+ (ADD32ri GR32:$src1, texternalsym:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV32mi addr:$dst, texternalsym:$src)>;
+
+// Calls
+// Tail-call patterns
+def : Pat<(X86tailcall GR32:$dst),
+ (TAILCALL)>;
+
+def : Pat<(X86tailcall (i32 tglobaladdr:$dst)),
+ (TAILCALL)>;
+def : Pat<(X86tailcall (i32 texternalsym:$dst)),
+ (TAILCALL)>;
+
+def : Pat<(X86tcret GR32:$dst, imm:$off),
+ (TCRETURNri GR32:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi tglobaladdr:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>;
+
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+ (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// X86 specific add which produces a flag.
+def : Pat<(addc GR32:$src1, GR32:$src2),
+ (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(addc GR32:$src1, (load addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(addc GR32:$src1, imm:$src2),
+ (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(addc GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(subc GR32:$src1, GR32:$src2),
+ (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(subc GR32:$src1, (load addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(subc GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(subc GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(parallel (X86cmp GR8:$src1, 0), (implicit EFLAGS)),
+ (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(parallel (X86cmp GR16:$src1, 0), (implicit EFLAGS)),
+ (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(parallel (X86cmp GR32:$src1, 0), (implicit EFLAGS)),
+ (TEST32rr GR32:$src1, GR32:$src1)>;
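+// Illustrative size comparison (standard x86 encodings, shown for
+// reference only):
+//   test ecx, ecx   -> 85 C9        (2 bytes)
+//   cmp  ecx, 0     -> 83 F9 00     (3 bytes)
+// Both set ZF/SF identically for a compare-against-zero, so the shorter
+// TEST form is always preferred here.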
+
+// Conditional moves with folded loads, with the operands swapped and the
+// conditions inverted.
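+// The CMOVcc load forms can only move *from* memory when the condition
+// holds, so when the load is the not-taken operand these patterns commute
+// the operands and invert the condition. Illustrative identity:
+//   (X86cmov (load m), r, COND_B)  ==  (X86cmov r, (load m), COND_AE)
+// which selects to "cmovae r, m".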
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO32rm GR32:$src2, addr:$src1)>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
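+// This works because i1 values are stored in memory as a byte known to
+// hold 0 or 1, so zero-extending a bool load is just a plain byte load
+// (widened with movzx where needed).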
+
+// extload bool -> extload byte
+def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>,
+ Requires<[In32BitMode]>;
+def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>,
+ Requires<[In32BitMode]>;
+def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// anyext
+def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext GR16:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, x86_subreg_16bit)>;
+
+// (and (i32 load), 255) -> (zextload i8)
+def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 255))),
+ (MOVZX32rm8 addr:$src)>;
+def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 65535))),
+ (MOVZX32rm16 addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
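+// Illustrative encodings for a non-EAX register:
+//   add ecx, 128    -> 81 C1 80 00 00 00   (6 bytes, imm32)
+//   sub ecx, -128   -> 83 E9 80            (3 bytes, imm8)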
+def : Pat<(add GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+ (SUB16mi8 addr:$dst, -128)>;
+def : Pat<(add GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+ (SUB32mi8 addr:$dst, -128)>;
+
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+ (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, x86_subreg_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src1, GR32_ABCD),
+ x86_subreg_8bit))>,
+ Requires<[In32BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (MOVZX16rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD),
+ x86_subreg_8bit))>,
+ Requires<[In32BitMode]>;
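+// The COPY_TO_REGCLASS to GR*_ABCD above is needed because in 32-bit mode
+// only EAX/EBX/ECX/EDX have an addressable 8-bit subregister, so the
+// source must be constrained before EXTRACT_SUBREG of x86_subreg_8bit.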
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+ (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit))>,
+ Requires<[In32BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (MOVSX16rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit))>,
+ Requires<[In32BitMode]>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit)>,
+ Requires<[In32BitMode]>;
+
+// h-register tricks
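+// Only the A/B/C/D registers have a high-8 subregister (AH/BH/CH/DH),
+// hence the COPY_TO_REGCLASS below. Illustrative case: with $src in AX,
+// (i8 (trunc (srl_su AX, 8))) is just a read of AH, so no shift is emitted.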
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(srl_su GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32rr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_16bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In32BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In32BitMode]>;
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+
+// (shl x (and y, 31)) ==> (shl x, y)
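+// This is safe because the hardware shift instructions already use the
+// count in CL modulo 32, making an explicit (and CL, 31) redundant.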
+def : Pat<(shl GR8:$src1, (and CL:$amt, 31)),
+ (SHL8rCL GR8:$src1)>;
+def : Pat<(shl GR16:$src1, (and CL:$amt, 31)),
+ (SHL16rCL GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (and CL:$amt, 31)),
+ (SHL32rCL GR32:$src1)>;
+def : Pat<(store (shl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHL8mCL addr:$dst)>;
+def : Pat<(store (shl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHL16mCL addr:$dst)>;
+def : Pat<(store (shl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHL32mCL addr:$dst)>;
+
+def : Pat<(srl GR8:$src1, (and CL:$amt, 31)),
+ (SHR8rCL GR8:$src1)>;
+def : Pat<(srl GR16:$src1, (and CL:$amt, 31)),
+ (SHR16rCL GR16:$src1)>;
+def : Pat<(srl GR32:$src1, (and CL:$amt, 31)),
+ (SHR32rCL GR32:$src1)>;
+def : Pat<(store (srl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHR8mCL addr:$dst)>;
+def : Pat<(store (srl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHR16mCL addr:$dst)>;
+def : Pat<(store (srl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHR32mCL addr:$dst)>;
+
+def : Pat<(sra GR8:$src1, (and CL:$amt, 31)),
+ (SAR8rCL GR8:$src1)>;
+def : Pat<(sra GR16:$src1, (and CL:$amt, 31)),
+ (SAR16rCL GR16:$src1)>;
+def : Pat<(sra GR32:$src1, (and CL:$amt, 31)),
+ (SAR32rCL GR32:$src1)>;
+def : Pat<(store (sra (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SAR8mCL addr:$dst)>;
+def : Pat<(store (sra (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SAR16mCL addr:$dst)>;
+def : Pat<(store (sra (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SAR32mCL addr:$dst)>;
+
+// (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c)
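+// Worked example (illustrative): with x = 0x12345678, y = 0x9ABCDEF0 and
+// c = 8, (x >> 8) | (y << 24) = 0x00123456 | 0xF0000000 = 0xF0123456,
+// which is exactly what "shrd x, y, 8" computes in a single instruction.
+// The SHLD patterns further below are the mirror image.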
+def : Pat<(or (srl GR32:$src1, CL:$amt),
+ (shl GR32:$src2, (sub 32, CL:$amt))),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), CL:$amt),
+ (shl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+ (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(or (srl GR32:$src1, (i8 (trunc ECX:$amt))),
+ (shl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
+ (shl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ addr:$dst),
+ (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(shrd GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+ (SHRD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi32 addr:$dst), (i8 imm:$amt1),
+ GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
+
+// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c)
+def : Pat<(or (shl GR32:$src1, CL:$amt),
+ (srl GR32:$src2, (sub 32, CL:$amt))),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), CL:$amt),
+ (srl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+ (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(or (shl GR32:$src1, (i8 (trunc ECX:$amt))),
+ (srl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
+ (srl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ addr:$dst),
+ (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(shld GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+ (SHLD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi32 addr:$dst), (i8 imm:$amt1),
+ GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
+
+// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c)
+def : Pat<(or (srl GR16:$src1, CL:$amt),
+ (shl GR16:$src2, (sub 16, CL:$amt))),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), CL:$amt),
+ (shl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+ (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(or (srl GR16:$src1, (i8 (trunc CX:$amt))),
+ (shl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
+ (shl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ addr:$dst),
+ (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(shrd GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+ (SHRD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi16 addr:$dst), (i8 imm:$amt1),
+ GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
+
+// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c)
+def : Pat<(or (shl GR16:$src1, CL:$amt),
+ (srl GR16:$src2, (sub 16, CL:$amt))),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), CL:$amt),
+ (srl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+ (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(or (shl GR16:$src1, (i8 (trunc CX:$amt))),
+ (srl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
+ (srl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ addr:$dst),
+ (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(shld GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+ (SHLD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi16 addr:$dst), (i8 imm:$amt1),
+ GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// Register-Register Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR8:$src1, GR8:$src2),
+ (implicit EFLAGS)),
+ (ADD8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)),
+ (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR8:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (ADD8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register Addition with EFLAGS result
+def : Pat<(parallel (store (X86add_flag (loadi8 addr:$dst), GR8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), GR16:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), GR32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer Addition with EFLAGS result
+def : Pat<(parallel (store (X86add_flag (loadi8 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// Register-Register Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, GR8:$src2),
+ (implicit EFLAGS)),
+ (SUB8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)),
+ (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi8 addr:$dst), GR8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), GR16:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), GR32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi8 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// Register-Register Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Integer Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), imm:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), imm:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Optimize multiply by 2 with EFLAGS result.
+let AddedComplexity = 2 in {
+def : Pat<(parallel (X86smul_flag GR16:$src1, 2),
+ (implicit EFLAGS)),
+ (ADD16rr GR16:$src1, GR16:$src1)>;
+
+def : Pat<(parallel (X86smul_flag GR32:$src1, 2),
+ (implicit EFLAGS)),
+ (ADD32rr GR32:$src1, GR32:$src1)>;
+}
+
+// INC and DEC with EFLAGS result. Note that these do not set CF.
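+// Since INC/DEC leave CF unchanged, these selections are only valid when
+// the flag consumer ignores CF; e.g. code feeding an ADC/SBB carry chain
+// must use ADD/SUB with an immediate of 1 instead.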
+def : Pat<(parallel (X86inc_flag GR8:$src), (implicit EFLAGS)),
+ (INC8r GR8:$src)>;
+def : Pat<(parallel (store (i8 (X86inc_flag (loadi8 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC8m addr:$dst)>;
+def : Pat<(parallel (X86dec_flag GR8:$src), (implicit EFLAGS)),
+ (DEC8r GR8:$src)>;
+def : Pat<(parallel (store (i8 (X86dec_flag (loadi8 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC8m addr:$dst)>;
+
+def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)),
+ (INC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i16 (X86inc_flag (loadi16 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC16m addr:$dst)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)),
+ (DEC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i16 (X86dec_flag (loadi16 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC16m addr:$dst)>, Requires<[In32BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)),
+ (INC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i32 (X86inc_flag (loadi32 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC32m addr:$dst)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)),
+ (DEC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i32 (X86dec_flag (loadi32 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC32m addr:$dst)>, Requires<[In32BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Stack Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFPStack.td"
+
+//===----------------------------------------------------------------------===//
+// X86-64 Support
+//===----------------------------------------------------------------------===//
+
+include "X86Instr64bit.td"
+
+//===----------------------------------------------------------------------===//
+// XMM Floating point support (requires SSE / SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrSSE.td"
+
+//===----------------------------------------------------------------------===//
+// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrMMX.td"
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..8f287e1
--- /dev/null
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,694 @@
+//====- X86InstrMMX.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions
+// and the properties of the instructions that are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>;
+
+def bc_v8i8 : PatFrag<(ops node:$in), (v8i8 (bitconvert node:$in))>;
+def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
+def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
+def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// MMX Masks
+//===----------------------------------------------------------------------===//
+
+// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to
+// PSHUFW imm.
+def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
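+// Example (illustrative): the reversing mask <3,2,1,0> packs into the
+// 8-bit immediate as four 2-bit source indices, low element first:
+//   0b00011011 = 0x1B, i.e. "pshufw mm0, mm1, 0x1B".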
+
+// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...>
+def mmx_unpckh : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <0, 4, 2, 5, ...>
+def mmx_unpckl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+def mmx_unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+def mmx_unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def mmx_pshufw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
+}], MMX_SHUFFLE_get_shuf_imm>;
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let isTwoAddress = 1 in {
+ // MMXI_binop_rm - Simple MMX binary operator.
+ multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (OpVT (OpNode VR64:$src1,
+ (bitconvert
+ (load_mmx addr:$src2)))))]>;
+ }
+
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+ }
+
+ // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64.
+ //
+ // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew
+ // to collapse (bitconvert VT to VT) into its operand.
+ //
+ multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (OpNode VR64:$src1,(load_mmx addr:$src2)))]>;
+ }
+
+ multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId,
+ Intrinsic IntId2> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+ def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+ (ins VR64:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>;
+ }
+}
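+// A defm of one of these multiclasses instantiates both forms at once;
+// e.g. "defm MMX_PADDB : MMXI_binop_rm<0xFC, ...>" below yields both
+// MMX_PADDBrr and MMX_PADDBrm.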
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS & FEMMS Instructions
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
+def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (v2i32 (scalar_to_vector GR32:$src)))]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", []>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ []>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMSrcReg,
+ (outs GR64:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", []>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (load_mmx addr:$src))]>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (v1i64 VR64:$src), addr:$dst)]>;
+
+def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v1i64 (bitconvert
+ (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))))]>;
+
+def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (movl immAllZerosV,
+ (v2i64 (scalar_to_vector (i64 (bitconvert VR64:$src))))))]>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src),
+ "movq2dq\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movntq\t{$src, $dst|$dst, $src}",
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (X86vzmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (X86vzmovl (v2i32
+ (scalar_to_vector (loadi32 addr:$src))))))]>;
+
+// Arithmetic Instructions
+
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8, 1>;
+defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>;
+defm MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>;
+defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>;
+
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>;
+defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>;
+defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>;
+defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
+
+// -- Multiplication
+defm MMX_PMULLW : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>;
+defm MMX_POR : MMXI_binop_rm_v1i64<0xEB, "por" , or, 1>;
+defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>;
+
+let isTwoAddress = 1 in {
+ def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+ VR64:$src2)))]>;
+ def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+ (load addr:$src2))))]>;
+}
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
+
+// Shift up / down and insert zeros.
+def : Pat<(v1i64 (X86vshl VR64:$src, (i8 imm:$amt))),
+ (v1i64 (MMX_PSLLQri VR64:$src, imm:$amt))>;
+def : Pat<(v1i64 (X86vshr VR64:$src, (i8 imm:$amt))),
+ (v1i64 (MMX_PSRLQri VR64:$src, imm:$amt))>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+
+// Conversion Instructions
+
+// -- Unpack Instructions
+let isTwoAddress = 1 in {
+ // Unpack High Packed Data Instructions
+ def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckh VR64:$src1,
+ (bc_v8i8 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckh VR64:$src1,
+ (bc_v4i16 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckh VR64:$src1,
+ (bc_v2i32 (load_mmx addr:$src2)))))]>;
+
+ // Unpack Low Packed Data Instructions
+ def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckl VR64:$src1,
+ (bc_v8i8 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckl VR64:$src1,
+ (bc_v4i16 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckl VR64:$src1,
+ (bc_v2i32 (load_mmx addr:$src2)))))]>;
+}
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+
+// -- Shuffle Instructions
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_pshufw:$src2 VR64:$src1, (undef))))]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+ (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (mmx_pshufw:$src2 (bc_v4i16 (load_mmx addr:$src1)),
+ (undef)))]>;
+
+// -- Conversion Instructions
+let neverHasSideEffects = 1 in {
+def MMX_CVTPD2PIrr : MMX2I<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPD2PIrm : MMX2I<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PDrr : MMX2I<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPI2PDrm : MMX2I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PSrr : MMXI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPI2PSrm : MMXI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPS2PIrr : MMXI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPS2PIrm : MMXI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
+} // end neverHasSideEffects
+
+// Extract / Insert
+def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+def MMX_PEXTRWri : MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR64:$src1, i16i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1),
+ (iPTR imm:$src2)))]>;
+let isTwoAddress = 1 in {
+ def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, GR32:$src2, i16i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+ GR32:$src2, (iPTR imm:$src3))))]>;
+ def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i16mem:$src2, i16i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst,
+ (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+ (i32 (anyext (loadi16 addr:$src2))),
+ (iPTR imm:$src3))))]>;
+}
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+// Misc.
+let Uses = [EDI] in
+def MMX_MASKMOVQ : MMXI<0xF7, MRMDestMem, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+let Uses = [RDI] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMDestMem, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map zero vector to pxor.
+let isReMaterializable = 1 in {
+ def MMX_V_SET0 : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins),
+ "pxor\t$dst, $dst",
+ [(set VR64:$dst, (v2i32 immAllZerosV))]>;
+ def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (outs VR64:$dst), (ins),
+ "pcmpeqd\t$dst, $dst",
+ [(set VR64:$dst, (v2i32 immAllOnesV))]>;
+}
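+// The results of "pxor x, x" and "pcmpeqd x, x" do not depend on the prior
+// register contents, which is why these defs are isReMaterializable: the
+// constant can be recreated at any point instead of being spilled.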
+
+let Predicates = [HasMMX] in {
+ def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
+ def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
+ def : Pat<(v8i8 immAllZerosV), (MMX_V_SET0)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Store 64-bit integer vector values.
+def : Pat<(store (v8i8 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v4i16 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2i32 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2f32 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v1i64 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+
+// Bit convert.
+def : Pat<(v8i8 (bitconvert (v1i64 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2f32 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8 VR64:$src))), (v1i64 VR64:$src)>;
+
+// 64-bit bit convert.
+def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2f32 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v8i8 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(i64 (bitconvert (v1i64 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2i32 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2f32 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v4i16 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v8i8 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v1i64 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v2i32 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v8i8 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+
+// Move scalar to MMX, zero-extended: movd into an MMX register
+// zero-extends the upper bits.
+let AddedComplexity = 15 in {
+ def : Pat<(v8i8 (X86vzmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
+ (MMX_MOVZDI2PDIrr GR32:$src)>;
+ def : Pat<(v4i16 (X86vzmovl (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))))),
+ (MMX_MOVZDI2PDIrr GR32:$src)>;
+}
+
+let AddedComplexity = 20 in {
+ def : Pat<(v8i8 (X86vzmovl (bc_v8i8 (load_mmx addr:$src)))),
+ (MMX_MOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v4i16 (X86vzmovl (bc_v4i16 (load_mmx addr:$src)))),
+ (MMX_MOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v2i32 (X86vzmovl (bc_v2i32 (load_mmx addr:$src)))),
+ (MMX_MOVZDI2PDIrm addr:$src)>;
+}
+
+// Clear top half.
+let AddedComplexity = 15 in {
+ def : Pat<(v8i8 (X86vzmovl VR64:$src)),
+ (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>;
+ def : Pat<(v4i16 (X86vzmovl VR64:$src)),
+ (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>;
+ def : Pat<(v2i32 (X86vzmovl VR64:$src)),
+ (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>;
+}
+
+// Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower
+// 8 or 16 bits matter.
+def : Pat<(bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
+ (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
+ (MMX_MOVD64rr GR32:$src)>;
+
+// Patterns to perform canonical versions of vector shuffling.
+let AddedComplexity = 10 in {
+ def : Pat<(v8i8 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
+ def : Pat<(v4i16 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
+ def : Pat<(v2i32 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
+}
+
+let AddedComplexity = 10 in {
+ def : Pat<(v8i8 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
+ def : Pat<(v4i16 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
+ def : Pat<(v2i32 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// Patterns to perform vector shuffling with a zeroed out vector.
+let AddedComplexity = 20 in {
+ def : Pat<(bc_v2i32 (mmx_unpckl immAllZerosV,
+ (v2i32 (scalar_to_vector (load_mmx addr:$src))))),
+ (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>;
+}
+
+// Some special case PANDN patterns.
+// FIXME: Get rid of these.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+
+// Move MMX to lower 64-bit of XMM
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v8i8 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v4i16 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v2i32 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v1i64 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+// Move lower 64-bit of XMM to MMX.
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+// CMOV* - Used to implement the SELECT DAG operation. Expanded by the
+// scheduler into a branch sequence.
+let Uses = [EFLAGS], usesCustomDAGSchedInserter = 1 in {
+ def CMOV_V1I64 : I<0, Pseudo,
+ (outs VR64:$dst), (ins VR64:$t, VR64:$f, i8imm:$cond),
+ "#CMOV_V1I64 PSEUDO!",
+ [(set VR64:$dst,
+ (v1i64 (X86cmov VR64:$t, VR64:$f, imm:$cond,
+ EFLAGS)))]>;
+}
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..1fafa46
--- /dev/null
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,3643 @@
+//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions
+// and the properties of the instructions that are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+ SDTCisFP<0>, SDTCisInt<2> ]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+ SDTCisFP<1>, SDTCisVT<3, i8>]>;
+
+def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
+def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
+def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
+def X86pshufb : SDNode<"X86ISD::PSHUFB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86pextrb : SDNode<"X86ISD::PEXTRB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pextrw : SDNode<"X86ISD::PEXTRW",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pinsrb : SDNode<"X86ISD::PINSRB",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw : SDNode<"X86ISD::PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insrtps : SDNode<"X86ISD::INSERTPS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
+def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>;
+def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>;
+def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>;
+def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>;
+def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>;
+def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>;
+def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>;
+def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements. These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad]>;
+
+def ssmem : Operand<v4f32> {
+ let PrintMethod = "printf32mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+}
+def sdmem : Operand<v2f64> {
+ let PrintMethod = "printf64mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+}
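+// The five MIOperandInfo sub-operands above spell out the usual X86
+// memory reference: base register, scale, index register, displacement,
+// and segment register.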
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// Like 'store', but always requires vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def alignedloadfsf32 : PatFrag<(ops node:$ptr), (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr), (f64 (alignedload node:$ptr))>;
+def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (alignedload node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others.
+// FIXME: Actually implement support for targets that don't require the
+// alignment. This probably wants a subtarget predicate.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
+
+// SSSE3 uses MMX registers for some instructions, and those memory operands
+// aren't guaranteed to be aligned on a 16-byte boundary.
+// FIXME: 8-byte alignment for MMX reads is not required.
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopv8i8 : PatFrag<(ops node:$ptr), (v8i8 (memop64 node:$ptr))>;
+def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
+def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
+def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
+
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def PSxLDQ_imm : SDNodeXForm<imm, [{
+ // Transformation function: imm >> 3
+ return getI32Imm(N->getZExtValue() >> 3);
+}]>;
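+// For example, a 32-bit shift amount becomes a 4-byte PSLLDQ/PSRLDQ byte
+// count: 32 >> 3 == 4.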
+
+// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
+// SHUFP* etc. imm.
+def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
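+// For example, the v4 reversal mask <3,2,1,0> encodes as 0x1B (0b00011011),
+// since result element i selects source element (imm >> (2*i)) & 3.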
+
+// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
+// PSHUFHW imm.
+def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
+// PSHUFLW imm.
+def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
+}]>;
+
+def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
+}]>;
+
+def movddup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhlps : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhp : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVHPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movlp : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movshdup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movsldup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_shuf_imm>;
+
+def shufp : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_shuf_imm>;
+
+def pshufhw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_pshufhw_imm>;
+
+def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_pshuflw_imm>;
+
+//===----------------------------------------------------------------------===//
+// SSE scalar FP Instructions
+//===----------------------------------------------------------------------===//
+
+// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded by the
+// scheduler into a branch sequence.
+let Uses = [EFLAGS], usesCustomDAGSchedInserter = 1 in {
+ def CMOV_FR32 : I<0, Pseudo,
+ (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
+ "#CMOV_FR32 PSEUDO!",
+ [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
+ EFLAGS))]>;
+ def CMOV_FR64 : I<0, Pseudo,
+ (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
+ "#CMOV_FR64 PSEUDO!",
+ [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
+ EFLAGS))]>;
+ def CMOV_V4F32 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V4F32 PSEUDO!",
+ [(set VR128:$dst,
+ (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V2F64 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2F64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V2I64 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2I64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+}
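+// Roughly, the custom inserter (see EmitInstrWithCustomInserter in the
+// X86 target lowering code) turns each of these pseudos into a diamond:
+//   thisMBB:  JCC sinkMBB          ; take $t when $cond holds
+//   copy0MBB: (fall through)       ; otherwise $f
+//   sinkMBB:  $dst = PHI [$f, copy0MBB], [$t, thisMBB]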
+
+//===----------------------------------------------------------------------===//
+// SSE1 Instructions
+//===----------------------------------------------------------------------===//
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVSSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movss\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (loadf32 addr:$src))]>;
+def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(store FR32:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "cvtsi2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "cvtsi2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvtss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
+def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvtss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si
+ (load addr:$src)))]>;
+
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>;
+def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi
+ (load addr:$src)))]>;
+def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>;
+def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi
+ (load addr:$src)))]>;
+let Constraints = "$src1 = $dst" in {
+ def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR64:$src2),
+ "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ VR64:$src2))]>;
+ def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ (load addr:$src2)))]>;
+}
+
+// Aliases for intrinsics
+def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si VR128:$src))]>;
+def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si (load addr:$src)))]>;
+
+let Constraints = "$src1 = $dst" in {
+ def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
+ "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ GR32:$src2))]>;
+ def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
+ "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+}
+
+// Comparison instructions
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+ def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+ def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
+}
+
+let Defs = [EFLAGS] in {
+def UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins FR32:$src1, FR32:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp FR32:$src1, FR32:$src2), (implicit EFLAGS)]>;
+def UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs), (ins FR32:$src1, f32mem:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp FR32:$src1, (loadf32 addr:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let Constraints = "$src1 = $dst" in {
+ def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f32mem:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+let Defs = [EFLAGS] in {
+def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v4f32 VR128:$src1), VR128:$src2),
+ (implicit EFLAGS)]>;
+def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2)),
+ (implicit EFLAGS)]>;
+
+def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comiss\t{$src2, $src1|$src1, $src2}",
+ [(X86comi (v4f32 VR128:$src1), VR128:$src2),
+ (implicit EFLAGS)]>;
+def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comiss\t{$src2, $src1|$src1, $src2}",
+ [(X86comi (v4f32 VR128:$src1), (load addr:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases of packed SSE1 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins),
+ "pxor\t$dst, $dst", [(set FR32:$dst, fp32imm0)]>,
+ Requires<[HasSSE1]>, TB, OpSize;
+
+// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
+// disregarded.
+let neverHasSideEffects = 1 in
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
+// disregarded.
+let canFoldAsLoad = 1 in
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in {
+ def FsANDPSrr : PSI<0x54, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
+ def FsORPSrr : PSI<0x56, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
+ def FsXORPSrr : PSI<0x57, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
+}
+
+def FsANDPSrm : PSI<0x54, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+def FsORPSrm : PSI<0x56, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+def FsXORPSrm : PSI<0x57, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+
+let neverHasSideEffects = 1 in {
+def FsANDNPSrr : PSI<0x55, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
+def FsANDNPSrm : PSI<0x55, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}", []>;
+}
+}
+
+/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F32Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg+reg.
+ def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]>;
+
+ // Intrinsic operation, reg+mem.
+ def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1,
+ sse_load_f32:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
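+// Each defm above concatenates its name with the def suffixes in the
+// multiclass, so e.g. 'defm ADD' yields the six instructions ADDSSrr,
+// ADDSSrm, ADDPSrr, ADDPSrm, ADDSSrr_Int and ADDSSrm_Int.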
+
+/// sse1_fp_binop_rm - Other SSE1 binops
+///
+/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form. Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ bit Commutable = 0> {
+
+ // Scalar operation, reg+reg.
+ def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1,
+ sse_load_f32:$src2))]>;
+
+ // Vector intrinsic operation, reg+reg.
+ def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, reg+mem.
+ def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, (memopv4f32 addr:$src2)))]>;
+}
+}
+
+defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
+ int_x86_sse_max_ss, int_x86_sse_max_ps>;
+defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
+ int_x86_sse_min_ss, int_x86_sse_min_ps>;
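+// These expand to eight instructions apiece, e.g. MAXSSrr, MAXSSrm,
+// MAXPSrr, MAXPSrm, MAXSSrr_Int, MAXSSrm_Int, MAXPSrr_Int and MAXPSrm_Int.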
+
+//===----------------------------------------------------------------------===//
+// SSE1 packed FP Instructions
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv4f32 addr:$src))]>;
+
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
+
+let neverHasSideEffects = 1 in
+def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1 in
+def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv4f32 addr:$src))]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)]>;
+
+// Intrinsic forms of MOVUPS load and store
+let canFoldAsLoad = 1 in
+def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
+def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
+
+let Constraints = "$src1 = $dst" in {
+ let AddedComplexity = 20 in {
+ def MOVLPSrm : PSI<0x12, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (movlp VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
+ def MOVHPSrm : PSI<0x16, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (movhp VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0, so the non-store version isn't too horrible.
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (undef)), (iPTR 0))), addr:$dst)]>;
+
+let Constraints = "$src1 = $dst" in {
+let AddedComplexity = 20 in {
+def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (movhp VR128:$src1, VR128:$src2)))]>;
+
+def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
+} // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+let AddedComplexity = 20 in {
+def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+
+
+// Arithmetic
+
+/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg.
+ def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode FR32:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, mem.
+ def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode (load addr:$src)))]>;
+
+ // Vector operation, reg.
+ def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, mem.
+ def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
+
+ // Intrinsic operation, reg.
+ def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+
+ // Vector intrinsic operation, reg
+ def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT : sse1_fp_unop_rm<0x51, "sqrt", fsqrt,
+ int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
+ int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
+defm RCP : sse1_fp_unop_rm<0x53, "rcp", X86frcp,
+ int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
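+// As with the binops above, each defm yields eight instructions, e.g.
+// SQRTSSr, SQRTSSm, SQRTPSr, SQRTPSm, SQRTSSr_Int, SQRTSSm_Int,
+// SQRTPSr_Int and SQRTPSm_Int.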
+
+// Logical
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 1 in {
+ def ANDPSrr : PSI<0x54, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (and VR128:$src1, VR128:$src2)))]>;
+ def ORPSrr : PSI<0x56, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (or VR128:$src1, VR128:$src2)))]>;
+ def XORPSrr : PSI<0x57, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (xor VR128:$src1, VR128:$src2)))]>;
+ }
+
+ def ANDPSrm : PSI<0x54, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ORPSrm : PSI<0x56, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (or (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def XORPSrm : PSI<0x57, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (xor (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ANDNPSrr : PSI<0x55, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor VR128:$src1,
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)))]>;
+ def ANDNPSrm : PSI<0x55, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ (memopv2i64 addr:$src2))))]>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ (memop addr:$src), imm:$cc))]>;
+}
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPSrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+// Shuffle and unpack instructions
+let Constraints = "$src1 = $dst" in {
+ let isConvertibleToThreeAddress = 1 in // Convert to pshufd
+ def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ VR128:$src2, i8imm:$src3),
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
+ def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ f128mem:$src2, i8imm:$src3),
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (shufp:$src3
+ VR128:$src1, (memopv4f32 addr:$src2))))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpckhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpckhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckh VR128:$src1,
+ (memopv4f32 addr:$src2))))]>;
+
+ def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpcklps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpcklps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1, (memopv4f32 addr:$src2)))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+// Mask creation
+def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "movmskps\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
+def MOVMSKPDrr : PDI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "movmskpd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;
+
+// Prefetch intrinsic.
+def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src),
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>;
+def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src),
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>;
+def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>;
+def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;
+
+// Non-temporal stores
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+
+// Store fence (SSE1 provides only sfence; lfence/mfence are SSE2)
+def SFENCE : PSI<0xAE, MRM7r, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>;
+
+// MXCSR register
+def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+
+// Alias instructions that map zero vector to pxor / xorp* for sse.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins),
+ "xorps\t$dst, $dst",
+ [(set VR128:$dst, (v4i32 immAllZerosV))]>;
+
+let Predicates = [HasSSE1] in {
+ def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+ def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+ def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+ def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+ def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
+}
+
+// FR32 to 128-bit vector conversion.
+let isAsCheapAsAMove = 1 in
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4f32 (scalar_to_vector FR32:$src)))]>;
+def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
+
+// FIXME: may not be able to eliminate this movss with coalescing because the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+// (f32 FR32:$src)>;
+let isAsCheapAsAMove = 1 in
+def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins VR128:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPS2SSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(store (f32 (vector_extract (v4f32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let Constraints = "$src1 = $dst" in {
+let neverHasSideEffects = 1 in
+ def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
+ "movss\t{$src2, $dst|$dst, $src2}", []>;
+
+ let AddedComplexity = 15 in
+ def MOVLPSrr : SSI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "movss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (movl VR128:$src1, VR128:$src2)))]>;
+}
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in
+def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector
+ (loadf32 addr:$src))))))]>;
+
+def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (MOVZSS2PSrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// SSE2 Instructions
+//===----------------------------------------------------------------------===//
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVSDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movsd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (loadf64 addr:$src))]>;
+def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(store FR64:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround FR64:$src))]>;
+def CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
+def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
+ "cvtsi2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SDrm : SDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i32mem:$src),
+ "cvtsi2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (fextend FR32:$src))]>, XS,
+ Requires<[HasSSE2]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
+ Requires<[HasSSE2]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvtsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
+def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
+ "cvtsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvtsd2si
+ (load addr:$src)))]>;
+
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>;
+def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi
+ (memop addr:$src)))]>;
+def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
+def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi
+ (memop addr:$src)))]>;
+def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
+def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd
+ (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse2_cvttsd2si VR128:$src))]>;
+def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvttsd2si
+ (load addr:$src)))]>;
+
+// Comparison instructions
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+ def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+ def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
+}
+
+let Defs = [EFLAGS] in {
+def UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins FR64:$src1, FR64:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp FR64:$src1, FR64:$src2), (implicit EFLAGS)]>;
+def UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins FR64:$src1, f64mem:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp FR64:$src1, (loadf64 addr:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let Constraints = "$src1 = $dst" in {
+ def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+let Defs = [EFLAGS] in {
+def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2)),
+ (implicit EFLAGS)]>;
+def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2)),
+ (implicit EFLAGS)]>;
+
+def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}",
+ [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2)),
+ (implicit EFLAGS)]>;
+def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}",
+ [(X86comi (v2f64 VR128:$src1), (load addr:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases of packed SSE2 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins),
+ "pxor\t$dst, $dst", [(set FR64:$dst, fpimm0)]>,
+ Requires<[HasSSE2]>, TB, OpSize;
+
+// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
+// disregarded.
+let neverHasSideEffects = 1 in
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
+// disregarded.
+let canFoldAsLoad = 1 in
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in {
+ def FsANDPDrr : PDI<0x54, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
+ def FsORPDrr : PDI<0x56, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
+ def FsXORPDrr : PDI<0x57, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
+}
+
+def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fand FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+def FsORPDrm : PDI<0x56, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86for FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fxor FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+
+let neverHasSideEffects = 1 in {
+def FsANDNPDrr : PDI<0x55, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
+def FsANDNPDrm : PDI<0x55, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}", []>;
+}
+}
+
+/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F64Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg+reg.
+ def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]>;
+
+ // Intrinsic operation, reg+mem.
+ def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1,
+ sse_load_f64:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
+defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
+defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
+defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
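+// As in the SSE1 case, each defm produces six instructions, e.g. ADDSDrr,
+// ADDSDrm, ADDPDrr, ADDPDrm, ADDSDrr_Int and ADDSDrm_Int.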
+
+/// sse2_fp_binop_rm - Other SSE2 binops
+///
+/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form. Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+
+ // Scalar operation, reg+reg.
+ def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1,
+ sse_load_f64:$src2))]>;
+
+ // Vector intrinsic operation, reg+reg.
+ def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, reg+mem.
+ def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1,
+ (memopv2f64 addr:$src2)))]>;
+}
+}
+
+defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
+ int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
+defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
+ int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
+
+//===----------------------------------------------------------------------===//
+// SSE2 packed FP Instructions
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv2f64 addr:$src))]>;
+
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
+
+let neverHasSideEffects = 1 in
+def MOVUPDrr : PDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1 in
+def MOVUPDrm : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2f64 addr:$src))]>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)]>;
+
+// Intrinsic forms of MOVUPD load and store
+def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
+def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
+
+let Constraints = "$src1 = $dst" in {
+ let AddedComplexity = 20 in {
+ def MOVLPDrm : PDI<0x12, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movlpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (movlp VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))]>;
+ def MOVHPDrm : PDI<0x16, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (movhp VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+// Extracting element 1 of a v2f64 is always custom-lowered to an unpack of
+// high to low followed by an extract of element 0, so the non-store version
+// isn't too horrible.
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (unpckh VR128:$src, (undef))),
+ (iPTR 0))), addr:$dst)]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+ TB, Requires<[HasSSE2]>;
+def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
+ (bitconvert (memopv2i64 addr:$src))))]>,
+ TB, Requires<[HasSSE2]>;
+
+// SSE2 instructions with XS prefix
+def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
+ (bitconvert (memopv2i64 addr:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
+def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq
+ (memop addr:$src)))]>;
+// SSE2 packed instructions with XS prefix
+def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (memop addr:$src)))]>,
+ XS, Requires<[HasSSE2]>;
+
+// SSE2 packed instructions with XD prefix
+def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ XD, Requires<[HasSSE2]>;
+def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
+ (memop addr:$src)))]>,
+ XD, Requires<[HasSSE2]>;
+
+def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
+def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memop addr:$src)))]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+ TB, Requires<[HasSSE2]>;
+def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd
+ (load addr:$src)))]>,
+ TB, Requires<[HasSSE2]>;
+
+def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
+ (memop addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s): alias instructions for the
+// conversion intrinsics.
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
+ "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+ GR32:$src2))]>;
+def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
+ "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+ VR128:$src2))]>;
+def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+ (load addr:$src2)))]>;
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+ VR128:$src2))]>, XS,
+ Requires<[HasSSE2]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+ (load addr:$src2)))]>, XS,
+ Requires<[HasSSE2]>;
+}
+
+// Arithmetic
+
+/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
+///
+/// In addition, we have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And we have a further variant for the full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg.
+ def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode FR64:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, mem.
+ def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode (load addr:$src)))]>;
+
+ // Vector operation, reg.
+ def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, mem.
+ def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
+
+ // Vector intrinsic operation, reg
+ def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt,
+ int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;
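+
+// For illustration, the defm above expands sse2_fp_unop_rm into the eight
+// sqrt instructions SQRTSDr, SQRTSDm, SQRTPDr, SQRTPDm and their intrinsic
+// variants SQRTSDr_Int, SQRTSDm_Int, SQRTPDr_Int, SQRTPDm_Int.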
+
+// There are no f64 versions of the reciprocal approximation instructions.
+
+// Logical
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 1 in {
+ def ANDPDrr : PDI<0x54, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def ORPDrr : PDI<0x56, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (or (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def XORPDrr : PDI<0x57, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (xor (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ }
+
+ def ANDPDrm : PDI<0x54, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ORPDrm : PDI<0x56, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (or (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def XORPDrm : PDI<0x57, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (xor (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ANDNPDrr : PDI<0x55, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def ANDNPDrm : PDI<0x55, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+ (memopv2i64 addr:$src2)))]>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+ (memop addr:$src), imm:$cc))]>;
+}
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+// Shuffle and unpack instructions
+let Constraints = "$src1 = $dst" in {
+ def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v2f64 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
+ def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ f128mem:$src2, i8imm:$src3),
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v2f64 (shufp:$src3
+ VR128:$src1, (memopv2f64 addr:$src2))))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckh VR128:$src1,
+ (memopv2f64 addr:$src2))))]>;
+
+ def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1, (memopv2f64 addr:$src2)))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+
+//===----------------------------------------------------------------------===//
+// SSE integer instructions
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, mayLoad = 1 in
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
+let mayStore = 1 in
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
+let canFoldAsLoad = 1, mayLoad = 1 in
+def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
+ XS, Requires<[HasSSE2]>;
+let mayStore = 1 in
+def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
+ XS, Requires<[HasSSE2]>;
+
+// Intrinsic forms of MOVDQU load and store
+let canFoldAsLoad = 1 in
+def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+ XS, Requires<[HasSSE2]>;
+
+let Constraints = "$src1 = $dst" in {
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2))))]>;
+}
+
+multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr,
+ Intrinsic IntId, Intrinsic IntId2> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2))))]>;
+ def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
+}
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2)))))]>;
+}
+
+/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
+///
+/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
+/// to collapse (bitconvert VT to VT) into its operand.
+///
+multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1,(memopv2i64 addr:$src2)))]>;
+}
+
+} // Constraints = "$src1 = $dst"
+
+// 128-bit Integer Arithmetic
+
+defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
+defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
+defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
+defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
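+
+// For example, each defm above yields a reg/reg and a reg/mem form; PADDB
+// expands to PADDBrr, matching (v16i8 (add VR128, VR128)), and PADDBrm,
+// which folds a load of the second operand.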
+
+defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
+defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
+defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
+defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
+
+defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
+defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
+defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
+defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;
+
+defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
+defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
+defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
+defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
+
+defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
+
+defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
+defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
+defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
+
+defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
+
+defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
+defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
+
+
+defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
+defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
+defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
+defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
+defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
+
+
+defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_sse2_psll_w, int_x86_sse2_pslli_w>;
+defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_sse2_psll_d, int_x86_sse2_pslli_d>;
+defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_sse2_psll_q, int_x86_sse2_pslli_q>;
+
+defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>;
+defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>;
+defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>;
+
+defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_sse2_psra_w, int_x86_sse2_psrai_w>;
+defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_sse2_psra_d, int_x86_sse2_psrai_d>;
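+
+// Each shift defm above produces three forms via PDI_binop_rmi_int, e.g. for
+// PSLLW: PSLLWrr (shift count in an XMM register), PSLLWrm (shift count
+// loaded from memory), and PSLLWri (immediate shift count, encoded with the
+// second opcode and the given ImmForm).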
+
+// 128-bit logical shifts.
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+ def PSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pslldq\t{$src2, $dst|$dst, $src2}", []>;
+ def PSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "psrldq\t{$src2, $dst|$dst, $src2}", []>;
+ // PSRADQri doesn't exist in SSE[1-3].
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+ (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+ def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+ (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+ def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
+ (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>;
+ def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
+ (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>;
+ def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+ (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+
+  // Shift up / down and insert zeros.
+ def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))),
+ (v2i64 (PSLLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
+ def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))),
+ (v2i64 (PSRLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
+}
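+
+// Note: PSxLDQ_imm is defined earlier in this file; it presumably rewrites
+// the intrinsics' bit-count immediate into the byte count that PSLLDQ and
+// PSRLDQ actually encode (imm >> 3), while the _bs ("byte shift") intrinsics
+// already carry a byte count and are matched directly.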
+
+// Logical
+defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
+defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
+defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;
+
+let Constraints = "$src1 = $dst" in {
+ def PANDNrr : PDI<0xDF, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+ VR128:$src2)))]>;
+
+ def PANDNrm : PDI<0xDF, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+ (memopv2i64 addr:$src2))))]>;
+}
+
+// SSE2 Integer comparison
+defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
+defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
+defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
+defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
+defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
+defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
+
+def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
+ (PCMPEQBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
+ (PCMPEQBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
+ (PCMPEQWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
+ (PCMPEQWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
+ (PCMPEQDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
+ (PCMPEQDrm VR128:$src1, addr:$src2)>;
+
+def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
+ (PCMPGTBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
+ (PCMPGTBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
+ (PCMPGTWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
+ (PCMPGTWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
+ (PCMPGTDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
+ (PCMPGTDrm VR128:$src1, addr:$src2)>;
+
+
+// Pack instructions
+defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
+defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
+defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
+
+// Shuffle and unpack instructions
+def PSHUFDri : PDIi8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v4i32 (pshufd:$src2
+ VR128:$src1, (undef))))]>;
+def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v4i32 (pshufd:$src2
+ (bc_v4i32(memopv2i64 addr:$src1)),
+ (undef))))]>;
+
+// SSE2 with ImmT == Imm8 and XS prefix.
+def PSHUFHWri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshufhw:$src2 VR128:$src1,
+ (undef))))]>,
+ XS, Requires<[HasSSE2]>;
+def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshufhw:$src2
+ (bc_v8i16 (memopv2i64 addr:$src1)),
+ (undef))))]>,
+ XS, Requires<[HasSSE2]>;
+
+// SSE2 with ImmT == Imm8 and XD prefix.
+def PSHUFLWri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshuflw:$src2 VR128:$src1,
+ (undef))))]>,
+ XD, Requires<[HasSSE2]>;
+def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshuflw:$src2
+ (bc_v8i16 (memopv2i64 addr:$src1)),
+ (undef))))]>,
+ XD, Requires<[HasSSE2]>;
+
+
+let Constraints = "$src1 = $dst" in {
+ def PUNPCKLBWrr : PDI<0x60, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLBWrm : PDI<0x60, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLWDrr : PDI<0x61, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLWDrm : PDI<0x61, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v8i16 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLDQrr : PDI<0x62, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLDQrm : PDI<0x62, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckl VR128:$src1,
+ (memopv2i64 addr:$src2))))]>;
+
+ def PUNPCKHBWrr : PDI<0x68, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHBWrm : PDI<0x68, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHWDrr : PDI<0x69, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHWDrm : PDI<0x69, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v8i16 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckh VR128:$src1,
+ (memopv2i64 addr:$src2))))]>;
+}
+
+// Extract / Insert
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))]>;
+let Constraints = "$src1 = $dst" in {
+ def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ GR32:$src2, i32i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
+ def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, i32i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+ imm:$src3))]>;
+}
+
+// Mask creation
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+
+// Conditional store
+let Uses = [EDI] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
+
+let Uses = [RDI] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+
+// Non-temporal stores
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
+ TB, Requires<[HasSSE2]>;
+
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+ TB, Requires<[HasSSE2]>;
+
+// Load, store, and memory fence
+def LFENCE : I<0xAE, MRM5r, (outs), (ins),
+ "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM6r, (outs), (ins),
+ "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+
+// TODO: custom-lower this so that the noop is never even generated.
+def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+ (i8 0)), (NOOP)>;
+def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
+def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
+def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+ (i8 1)), (MFENCE)>;
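+
+// The five membarrier operands above encode (load-load, load-store,
+// store-load, store-store, device): any non-device barrier lowers to a noop,
+// a pure store-store device barrier to sfence, a pure load-load one to
+// lfence, and every other device barrier to a full mfence.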
+
+// Alias instruction that maps the all-ones vector to pcmpeqd for SSE.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+ def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins),
+ "pcmpeqd\t$dst, $dst",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+
+// FR64 to 128-bit vector conversion.
+let isAsCheapAsAMove = 1 in
+def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (scalar_to_vector FR64:$src)))]>;
+def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+
+def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))]>;
+
+def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ Requires<[HasSSE2]>;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+// FIXME: we may not be able to eliminate this movsd via coalescing, since the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+//           (f64 FR64:$src)>;
+let isAsCheapAsAMove = 1 in
+def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins VR128:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPD2SDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (vector_extract (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))]>;
+def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
+
+
+// Move to the lower bits of a VR128, leaving the upper bits alone.
+// Three-operand (but two-address) aliases.
+let Constraints = "$src1 = $dst" in {
+ let neverHasSideEffects = 1 in
+ def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
+ "movsd\t{$src2, $dst|$dst, $src2}", []>;
+
+ let AddedComplexity = 15 in
+ def MOVLPDrr : SDI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "movsd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (movl VR128:$src1, VR128:$src2)))]>;
+}
+
+// Store / copy the lower 64 bits of an XMM register.
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in {
+def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86vzmovl (v2f64 (scalar_to_vector
+ (loadf64 addr:$src))))))]>;
+
+def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzload addr:$src)), (MOVZSD2PDrm addr:$src)>;
+}
+
+// A movd / movq to an XMM register zero-extends the value.
+let AddedComplexity = 15 in {
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector GR32:$src)))))]>;
+// This is X86-64 only.
+def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector GR64:$src)))))]>;
+}
+
+let AddedComplexity = 20 in {
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+ (loadi32 addr:$src))))))]>;
+
+def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+ (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+ (loadi64 addr:$src))))))]>, XS,
+ Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+// Move from XMM to XMM, clearing the upper 64 bits. Note that the IA-32
+// documentation has a bug here: movq xmm1, xmm2 does clear the high bits.
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+let AddedComplexity = 20 in {
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+ (MOVZPQILo2PQIrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+// Move Instructions
+def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movshdup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (movshdup
+ VR128:$src, (undef))))]>;
+def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movshdup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (movshdup
+ (memopv4f32 addr:$src), (undef)))]>;
+
+def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movsldup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (movsldup
+ VR128:$src, (undef))))]>;
+def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movsldup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (movsldup
+ (memopv4f32 addr:$src), (undef)))]>;
+
+def MOVDDUPrr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movddup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>;
+def MOVDDUPrm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "movddup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)),
+ (undef))))]>;
+
+def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
+ (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+let AddedComplexity = 5 in {
+def : Pat<(movddup (memopv2f64 addr:$src), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (memopv2i64 addr:$src), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+}
+
+// Arithmetic
+let Constraints = "$src1 = $dst" in {
+ def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "addsubps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+ VR128:$src2))]>;
+ def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "addsubps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+ (memop addr:$src2)))]>;
+ def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "addsubpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+ VR128:$src2))]>;
+ def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "addsubpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+ (memop addr:$src2)))]>;
+}
+
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "lddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
+
+// Horizontal ops
+class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3DI<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (memop addr:$src2))))]>;
+class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (memopv2f64 addr:$src2))))]>;
+
+let Constraints = "$src1 = $dst" in {
+ def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+ def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+ def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+ def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+ def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+ def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+ def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+ def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+}
+
+// Thread synchronization
+def MONITOR : I<0x01, MRM1r, (outs), (ins), "monitor",
+ [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>;
+def MWAIT : I<0x01, MRM1r, (outs), (ins), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef>, <1, 1, 3, 3>
+let AddedComplexity = 15 in
+def : Pat<(v4i32 (movshdup VR128:$src, (undef))),
+ (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
+ (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef>, <0, 0, 2, 2>
+let AddedComplexity = 15 in
+ def : Pat<(v4i32 (movsldup VR128:$src, (undef))),
+ (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+ def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
+ (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+//===----------------------------------------------------------------------===//
+// SSSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS3I_unop_rm_int_8 - Simple SSSE3 unary operator whose type is v*i8.
+multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64 (bitconvert (memopv8i8 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv16i8 addr:$src))))]>, OpSize;
+}
+
+/// SS3I_unop_rm_int_16 - Simple SSSE3 unary operator whose type is v*i16.
+multiclass SS3I_unop_rm_int_16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64
+ (bitconvert (memopv4i16 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
+}
+
+/// SS3I_unop_rm_int_32 - Simple SSSE3 unary operator whose type is v*i32.
+multiclass SS3I_unop_rm_int_32<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64
+ (bitconvert (memopv2i32 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv4i32 addr:$src))))]>, OpSize;
+}
+
+defm PABSB : SS3I_unop_rm_int_8 <0x1C, "pabsb",
+ int_x86_ssse3_pabs_b,
+ int_x86_ssse3_pabs_b_128>;
+defm PABSW : SS3I_unop_rm_int_16<0x1D, "pabsw",
+ int_x86_ssse3_pabs_w,
+ int_x86_ssse3_pabs_w_128>;
+defm PABSD : SS3I_unop_rm_int_32<0x1E, "pabsd",
+ int_x86_ssse3_pabs_d,
+ int_x86_ssse3_pabs_d_128>;
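+
+// For example, PABSB expands to four instructions: PABSBrr64 and PABSBrm64
+// operating on the 64-bit MMX registers (VR64), plus PABSBrr128 and
+// PABSBrm128 operating on XMM registers (VR128).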
+
+/// SS3I_binop_rm_int_8 - Simple SSSE3 binary operator whose type is v*i8.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_8<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv8i8 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+/// SS3I_binop_rm_int_16 - Simple SSSE3 binary operator whose type is v*i16.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv4i16 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv8i16 addr:$src2))))]>, OpSize;
+ }
+}
+
+/// SS3I_binop_rm_int_32 - Simple SSSE3 binary operator whose type is v*i32.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_32<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv2i32 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv4i32 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm PHADDW : SS3I_binop_rm_int_16<0x01, "phaddw",
+ int_x86_ssse3_phadd_w,
+ int_x86_ssse3_phadd_w_128>;
+defm PHADDD : SS3I_binop_rm_int_32<0x02, "phaddd",
+ int_x86_ssse3_phadd_d,
+ int_x86_ssse3_phadd_d_128>;
+defm PHADDSW : SS3I_binop_rm_int_16<0x03, "phaddsw",
+ int_x86_ssse3_phadd_sw,
+ int_x86_ssse3_phadd_sw_128>;
+defm PHSUBW : SS3I_binop_rm_int_16<0x05, "phsubw",
+ int_x86_ssse3_phsub_w,
+ int_x86_ssse3_phsub_w_128>;
+defm PHSUBD : SS3I_binop_rm_int_32<0x06, "phsubd",
+ int_x86_ssse3_phsub_d,
+ int_x86_ssse3_phsub_d_128>;
+defm PHSUBSW : SS3I_binop_rm_int_16<0x07, "phsubsw",
+ int_x86_ssse3_phsub_sw,
+ int_x86_ssse3_phsub_sw_128>;
+defm PMADDUBSW : SS3I_binop_rm_int_8 <0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw,
+ int_x86_ssse3_pmadd_ub_sw_128>;
+defm PMULHRSW : SS3I_binop_rm_int_16<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw,
+ int_x86_ssse3_pmul_hr_sw_128, 1>;
+defm PSHUFB : SS3I_binop_rm_int_8 <0x00, "pshufb",
+ int_x86_ssse3_pshuf_b,
+ int_x86_ssse3_pshuf_b_128>;
+defm PSIGNB : SS3I_binop_rm_int_8 <0x08, "psignb",
+ int_x86_ssse3_psign_b,
+ int_x86_ssse3_psign_b_128>;
+defm PSIGNW : SS3I_binop_rm_int_16<0x09, "psignw",
+ int_x86_ssse3_psign_w,
+ int_x86_ssse3_psign_w_128>;
+defm PSIGND : SS3I_binop_rm_int_32<0x0A, "psignd",
+ int_x86_ssse3_psign_d,
+ int_x86_ssse3_psign_d_128>;
+
+let Constraints = "$src1 = $dst" in {
+ def PALIGNR64rr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2, i16imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst,
+ (int_x86_ssse3_palign_r
+ VR64:$src1, VR64:$src2,
+ imm:$src3))]>;
+ def PALIGNR64rm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2, i16imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst,
+ (int_x86_ssse3_palign_r
+ VR64:$src1,
+ (bitconvert (memopv2i32 addr:$src2)),
+ imm:$src3))]>;
+
+ def PALIGNR128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_ssse3_palign_r_128
+ VR128:$src1, VR128:$src2,
+ imm:$src3))]>, OpSize;
+ def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i32imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_ssse3_palign_r_128
+ VR128:$src1,
+ (bitconvert (memopv4i32 addr:$src2)),
+ imm:$src3))]>, OpSize;
+}
+
+def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+ (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
+def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+ (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// extload f32 -> f64. This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine. Since these loads aren't folded into the fextend, we have to
+// match the combination explicitly here.
+let Predicates = [HasSSE2] in
+ def : Pat<(fextend (loadf32 addr:$src)),
+ (CVTSS2SDrm addr:$src)>;
+
+// bit_convert: bitcasts between 128-bit vector types are free, so these
+// patterns select no instruction and simply reuse the source register.
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
+
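+// The bitconvert patterns above record that reinterpreting one 128-bit
+// vector type as another is free: no instruction is selected, the bits are
+// simply viewed under a new type. A rough C++ model of that no-op cast
+// (an illustrative sketch, not LLVM API):
+//
+//   #include <cstring>
+//   template <typename To, typename From>
+//   To view_as(const From &Src) {          // sizeof(To) == sizeof(From)
+//     To Dst;
+//     std::memcpy(&Dst, &Src, sizeof(To)); // same 16 bytes, new element type
+//     return Dst;
+//   }
+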
+// Move a scalar to an XMM register, zero-extending the upper elements.
+let AddedComplexity = 15 in {
+// Zero a VR128, then do a MOVS{S|D} into the lower bits.
+def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+// Splat v2f64 / v2i64
+let AddedComplexity = 10 in {
+def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
+ (UNPCKLPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(unpckh (v2f64 VR128:$src), (undef)),
+ (UNPCKHPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
+ (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(unpckh (v2i64 VR128:$src), (undef)),
+ (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// Special unary SHUFPSrri case.
+def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPSrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE1]>;
+let AddedComplexity = 5 in
+def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
+ (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case.
+def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case.
+def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Unary v4f32 shuffle with PSHUF* in order to fold a load.
+def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
+ (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[HasSSE2]>;
+
+// Special binary v4i32 shuffle cases with SHUFPS.
+def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
+ (SHUFPSrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
+ (SHUFPSrmi VR128:$src1, addr:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Special binary v2i64 shuffle cases using SHUFPDrri.
+def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
+ (SHUFPDrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+
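+// SHUFFLE_get_shuf_imm above converts a shuffle mask into the 8-bit
+// immediate consumed by PSHUFD/SHUFPS/SHUFPD. As a sketch, assuming the
+// usual SSE encoding of four 2-bit lane selectors, element 0 in the low
+// bits:
+//
+//   unsigned packShufImm(unsigned M0, unsigned M1, unsigned M2, unsigned M3) {
+//     return (M0 & 3) | ((M1 & 3) << 2) | ((M2 & 3) << 4) | ((M3 & 3) << 6);
+//   }
+//
+// e.g. the splat-low mask <0,0,0,0> packs to 0x00, and <2,2,3,3> to 0xFA.
+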
+// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (unpckl_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+}
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))),
+ (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (unpckh_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+}
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))),
+ (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+def : Pat<(v4i32 (movhp VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+// vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS
+def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (movhp VR128:$src1, (load addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (movhp VR128:$src1, (load addr:$src2))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (movhp VR128:$src1, (load addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2i64 (movhp VR128:$src1, (load addr:$src2))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+}
+
+// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+// (store (vector_shuffle (load addr), v2, <0, 1, 4, 5>), addr) using MOVHPS
+def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4f32 (movhp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVHPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2f64 (movhp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVHPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 (movhp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVHPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2i64 (movhp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVHPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+
+let AddedComplexity = 15 in {
+// Setting the lowest element in the vector.
+def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+ (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd)
+def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+}
+
+// Set lowest element and zero upper elements.
+let AddedComplexity = 15 in
+def : Pat<(v2f64 (movl immAllZerosV_bc, VR128:$src)),
+ (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
+
+// Some special case pandn patterns.
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
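+// The pandn patterns above exist because the DAG has no vector NOT node:
+// "~a" appears as "xor a, all-ones", and PANDN computes (~a) & b in one
+// instruction. A scalar model of the equivalence (per 64-bit half):
+//
+//   #include <cstdint>
+//   uint64_t pandn(uint64_t A, uint64_t B) {
+//     return (A ^ ~uint64_t(0)) & B;   // xor with all-ones == bitwise NOT,
+//   }                                  // so this is ~A & B
+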
+// vector -> vector casts
+def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (Int_CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 (sint_to_fp (v2i32 VR64:$src))),
+ (Int_CVTPI2PDrr VR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))),
+ (Int_CVTTPD2PIrr VR128:$src)>, Requires<[HasSSE2]>;
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+def : Pat<(alignedloadv4i32 addr:$src),
+ (MOVAPSrm addr:$src)>, Requires<[HasSSE1]>;
+def : Pat<(loadv4i32 addr:$src),
+ (MOVUPSrm addr:$src)>, Requires<[HasSSE1]>;
+def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>, Requires<[HasSSE2]>;
+def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>, Requires<[HasSSE2]>;
+
+def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+
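+// Why the PS moves are shorter for integer vectors: per the usual x86
+// encoding tables, the integer forms carry a mandatory prefix byte that
+// the packed-single forms lack (load forms shown):
+//
+//   movaps xmm, m128  ->    0F 28 /r   (2 opcode bytes + ModRM)
+//   movdqa xmm, m128  -> 66 0F 6F /r   (one byte longer)
+//   movups xmm, m128  ->    0F 10 /r
+//   movdqu xmm, m128  -> F3 0F 6F /r
+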
+//===----------------------------------------------------------------------===//
+// SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
+ string OpcodeStr,
+ Intrinsic V4F32Int,
+ Intrinsic V2F64Int> {
+  // Vector intrinsic operation, reg
+ def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, mem
+ def PSm_Int : SS4AIi8<opcps, MRMSrcMem,
+ (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, reg
+ def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, mem
+ def PDm_Int : SS4AIi8<opcpd, MRMSrcMem,
+ (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>,
+ OpSize;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int> {
+ // Intrinsic operation, reg.
+ def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, mem.
+ def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, mem.
+ def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ OpSize;
+}
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round",
+ int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+ int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
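+// The round immediate selects the behavior. A scalar sketch of roundsd
+// under the commonly documented imm8 layout (bits 1:0 are the round
+// control when bit 2 is clear; bit 2 set means "use MXCSR.RC"; bit 3
+// suppresses the precision exception):
+//
+//   #include <cmath>
+//   double roundsd_model(double X, unsigned Imm) {
+//     switch (Imm & 3) {
+//     case 0:  return std::nearbyint(X); // to nearest (even, default env)
+//     case 1:  return std::floor(X);     // toward -infinity
+//     case 2:  return std::ceil(X);      // toward +infinity
+//     default: return std::trunc(X);     // toward zero
+//     }
+//   }
+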
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128> {
+ def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
+ def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
+}
+
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+ int_x86_sse41_phminposuw>;
+
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq",
+ int_x86_sse41_pcmpeqq, 1>;
+defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw",
+ int_x86_sse41_packusdw, 0>;
+defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb",
+ int_x86_sse41_pminsb, 1>;
+defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd",
+ int_x86_sse41_pminsd, 1>;
+defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud",
+ int_x86_sse41_pminud, 1>;
+defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw",
+ int_x86_sse41_pminuw, 1>;
+defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb",
+ int_x86_sse41_pmaxsb, 1>;
+defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd",
+ int_x86_sse41_pmaxsd, 1>;
+defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud",
+ int_x86_sse41_pmaxud, 1>;
+defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw",
+ int_x86_sse41_pmaxuw, 1>;
+
+defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, 1>;
+
+def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
+ (PCMPEQQrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
+ (PCMPEQQrm VR128:$src1, addr:$src2)>;
+
+/// SS41I_binop_patint - SSE 4.1 binary operator with both a plain SDNode
+/// pattern and an intrinsic form.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT,
+ SDNode OpNode, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode (OpVT VR128:$src1),
+ VR128:$src2))]>, OpSize {
+ let isCommutable = Commutable;
+ }
+ def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (OpNode VR128:$src1, (memop addr:$src2)))]>, OpSize;
+ def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, (memop addr:$src2)))]>,
+ OpSize;
+ }
+}
+defm PMULLD : SS41I_binop_patint<0x40, "pmulld", v4i32, mul,
+ int_x86_sse41_pmulld, 1>;
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>,
+ OpSize;
+ }
+}
+
+defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps",
+ int_x86_sse41_blendps, 0>;
+defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd",
+ int_x86_sse41_blendpd, 0>;
+defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw",
+ int_x86_sse41_pblendw, 0>;
+defm DPPS : SS41I_binop_rmi_int<0x40, "dpps",
+ int_x86_sse41_dpps, 1>;
+defm DPPD : SS41I_binop_rmi_int<0x41, "dppd",
+ int_x86_sse41_dppd, 1>;
+defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw",
+ int_x86_sse41_mpsadbw, 1>;
+
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+ multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr,
+ "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ OpSize;
+
+ def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
+ }
+}
+
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
+
+
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
+defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
+defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
+defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
+defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
+defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
+
+// Common patterns involving scalar load.
+def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+
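+// Semantics sketch for the sign/zero extension above: pmovsxbw/pmovzxbw
+// widen the low eight bytes of the source to eight words; the other
+// variants differ only in element widths.
+//
+//   #include <cstdint>
+//   void pmovsxbw_model(const int8_t Src[8], int16_t Dst[8]) {
+//     for (int i = 0; i < 8; ++i) Dst[i] = Src[i];  // sign-extend
+//   }
+//   void pmovzxbw_model(const uint8_t Src[8], uint16_t Dst[8]) {
+//     for (int i = 0; i < 8; ++i) Dst[i] = Src[i];  // zero-extend
+//   }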
+
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
+defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
+defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
+defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+
+// Common patterns involving scalar load
+def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVSXBDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVSXWQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVZXBDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVZXWQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  // Expecting an i16 load any-extended to an i32 value.
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId (bitconvert
+ (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
+defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+
+// Common patterns involving scalar load
+def : Pat<(int_x86_sse41_pmovsxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXBQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXBQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+/// SS41I_extract8 - SSE 4.1 extract 8 bits to a 32-bit reg or 8-bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+ OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to an int reg or memory
+/// destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+ addr:$dst)]>, OpSize;
+}
+
+defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
+
+
+/// SS41I_extractf32 - SSE 4.1 extract a 32-bit FP value to an int reg or
+/// memory destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+ OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)]>, OpSize;
+}
+
+defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasSSE41]>;
+
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+ imm:$src3))]>, OpSize;
+ }
+}
+
+defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
+
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+ OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+ imm:$src3)))]>, OpSize;
+ }
+}
+
+defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, FR32:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1, FR32:$src2, imm:$src3))]>, OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1, (loadf32 addr:$src2),
+ imm:$src3))]>, OpSize;
+ }
+}
+
+defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ptest \t{$src2, $src1|$src1, $src2}", []>, OpSize;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
+ "ptest \t{$src2, $src1|$src1, $src2}", []>, OpSize;
+}
+
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+
+/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
+let Constraints = "$src1 = $dst" in {
+ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
+
+def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
+ (PCMPGTQrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
+ (PCMPGTQrm VR128:$src1, addr:$src2)>;
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..f923106
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,560 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "llvm/Function.h"
+#include "llvm/Config/alloca.h"
+#include "llvm/Support/Compiler.h"
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+// Determine the platform we're running on
+#if defined (__x86_64__) || defined (_M_AMD64)
+# define X86_64_JIT
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+# define X86_32_JIT
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ unsigned char *OldByte = (unsigned char *)Old;
+ *OldByte++ = 0xE9; // Emit JMP opcode.
+ unsigned *OldWord = (unsigned *)OldByte;
+ unsigned NewAddr = (intptr_t)New;
+ unsigned OldAddr = (intptr_t)OldWord;
+ *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
+}
+
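+// The patch above is a 5-byte near jump, "E9 rel32". The displacement is
+// measured from the end of the instruction, hence the "- 4" once the
+// opcode byte has been skipped. Equivalent arithmetic as a sketch:
+//
+//   int32_t jmpDisp(intptr_t Site, intptr_t Target) {
+//     return (int32_t)(Target - (Site + 5)); // Site points at the 0xE9
+//   }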
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host. This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// Check if building with -fPIC
+#if defined(__PIC__) && __PIC__ && defined(__linux__)
+#define ASMCALLSUFFIX "@PLT"
+#else
+#define ASMCALLSUFFIX
+#endif
+
+// For ELF targets, use a .size and .type directive, to let tools
+// know the extent of functions defined in assembler.
+#if defined(__ELF__)
+# define SIZE(sym) ".size " #sym ", . - " #sym "\n"
+# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n"
+#else
+# define SIZE(sym)
+# define TYPE_FUNCTION(sym)
+#endif
+
+// Provide a convenient way to disable the use of CFI directives.
+// This is needed for old/broken assemblers (for example, gas on
+// Darwin is pretty old and doesn't support these directives)
+#if defined(__APPLE__)
+# define CFI(x)
+#else
+// FIXME: Disable this until we really want to use it. Also, we will
+// need to add some workarounds for compilers that support only a subset
+// of these directives.
+# define CFI(x)
+#endif
+
+// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
+// callee-saved registers for the fastcc calling convention.
+extern "C" {
+#if defined(X86_64_JIT)
+# ifndef _MSC_VER
+ // No need to save EAX/EDX for X86-64.
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ TYPE_FUNCTION(X86CompilationCallback)
+ ASMPREFIX "X86CompilationCallback:\n"
+ CFI(".cfi_startproc\n")
+ // Save RBP
+ "pushq %rbp\n"
+ CFI(".cfi_def_cfa_offset 16\n")
+ CFI(".cfi_offset %rbp, -16\n")
+ // Save RSP
+ "movq %rsp, %rbp\n"
+ CFI(".cfi_def_cfa_register %rbp\n")
+ // Save all int arg registers
+ "pushq %rdi\n"
+ CFI(".cfi_rel_offset %rdi, 0\n")
+ "pushq %rsi\n"
+ CFI(".cfi_rel_offset %rsi, 8\n")
+ "pushq %rdx\n"
+ CFI(".cfi_rel_offset %rdx, 16\n")
+ "pushq %rcx\n"
+ CFI(".cfi_rel_offset %rcx, 24\n")
+ "pushq %r8\n"
+ CFI(".cfi_rel_offset %r8, 32\n")
+ "pushq %r9\n"
+ CFI(".cfi_rel_offset %r9, 40\n")
+    // Align stack on 16-byte boundary. RSP might not be properly aligned
+ // (8 byte) if this is called from an indirect stub.
+ "andq $-16, %rsp\n"
+ // Save all XMM arg registers
+ "subq $128, %rsp\n"
+ "movaps %xmm0, (%rsp)\n"
+ "movaps %xmm1, 16(%rsp)\n"
+ "movaps %xmm2, 32(%rsp)\n"
+ "movaps %xmm3, 48(%rsp)\n"
+ "movaps %xmm4, 64(%rsp)\n"
+ "movaps %xmm5, 80(%rsp)\n"
+ "movaps %xmm6, 96(%rsp)\n"
+ "movaps %xmm7, 112(%rsp)\n"
+ // JIT callee
+ "movq %rbp, %rdi\n" // Pass prev frame and return address
+ "movq 8(%rbp), %rsi\n"
+ "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ // Restore all XMM arg registers
+ "movaps 112(%rsp), %xmm7\n"
+ "movaps 96(%rsp), %xmm6\n"
+ "movaps 80(%rsp), %xmm5\n"
+ "movaps 64(%rsp), %xmm4\n"
+ "movaps 48(%rsp), %xmm3\n"
+ "movaps 32(%rsp), %xmm2\n"
+ "movaps 16(%rsp), %xmm1\n"
+ "movaps (%rsp), %xmm0\n"
+ // Restore RSP
+ "movq %rbp, %rsp\n"
+ CFI(".cfi_def_cfa_register %rsp\n")
+ // Restore all int arg registers
+ "subq $48, %rsp\n"
+ CFI(".cfi_adjust_cfa_offset 48\n")
+ "popq %r9\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %r9\n")
+ "popq %r8\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %r8\n")
+ "popq %rcx\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rcx\n")
+ "popq %rdx\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rdx\n")
+ "popq %rsi\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rsi\n")
+ "popq %rdi\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rdi\n")
+ // Restore RBP
+ "popq %rbp\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rbp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback)
+ );
+# else
+  // No inline assembler support on this platform. The routine is in an
+  // external file.
+ void X86CompilationCallback();
+
+# endif
+#elif defined (X86_32_JIT)
+# ifndef _MSC_VER
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ TYPE_FUNCTION(X86CompilationCallback)
+ ASMPREFIX "X86CompilationCallback:\n"
+ CFI(".cfi_startproc\n")
+ "pushl %ebp\n"
+ CFI(".cfi_def_cfa_offset 8\n")
+ CFI(".cfi_offset %ebp, -8\n")
+ "movl %esp, %ebp\n" // Standard prologue
+ CFI(".cfi_def_cfa_register %ebp\n")
+ "pushl %eax\n"
+ CFI(".cfi_rel_offset %eax, 0\n")
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ CFI(".cfi_rel_offset %edx, 4\n")
+ "pushl %ecx\n"
+ CFI(".cfi_rel_offset %ecx, 8\n")
+# if defined(__APPLE__)
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+# endif
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ "movl %ebp, %esp\n" // Restore ESP
+ CFI(".cfi_def_cfa_register %esp\n")
+ "subl $12, %esp\n"
+ CFI(".cfi_adjust_cfa_offset 12\n")
+ "popl %ecx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ecx\n")
+ "popl %edx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %edx\n")
+ "popl %eax\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %eax\n")
+ "popl %ebp\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ebp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback)
+ );
+
+ // Same as X86CompilationCallback but also saves XMM argument registers.
+ void X86CompilationCallback_SSE(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
+ TYPE_FUNCTION(X86CompilationCallback_SSE)
+ ASMPREFIX "X86CompilationCallback_SSE:\n"
+ CFI(".cfi_startproc\n")
+ "pushl %ebp\n"
+ CFI(".cfi_def_cfa_offset 8\n")
+ CFI(".cfi_offset %ebp, -8\n")
+ "movl %esp, %ebp\n" // Standard prologue
+ CFI(".cfi_def_cfa_register %ebp\n")
+ "pushl %eax\n"
+ CFI(".cfi_rel_offset %eax, 0\n")
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ CFI(".cfi_rel_offset %edx, 4\n")
+ "pushl %ecx\n"
+ CFI(".cfi_rel_offset %ecx, 8\n")
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+ // Save all XMM arg registers
+ "subl $64, %esp\n"
+ // FIXME: provide frame move information for xmm registers.
+ // This can be tricky, because CFA register is ebp (unaligned)
+ // and we need to produce offsets relative to it.
+ "movaps %xmm0, (%esp)\n"
+ "movaps %xmm1, 16(%esp)\n"
+ "movaps %xmm2, 32(%esp)\n"
+ "movaps %xmm3, 48(%esp)\n"
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ "addl $16, %esp\n"
+ "movaps 48(%esp), %xmm3\n"
+ CFI(".cfi_restore %xmm3\n")
+ "movaps 32(%esp), %xmm2\n"
+ CFI(".cfi_restore %xmm2\n")
+ "movaps 16(%esp), %xmm1\n"
+ CFI(".cfi_restore %xmm1\n")
+ "movaps (%esp), %xmm0\n"
+ CFI(".cfi_restore %xmm0\n")
+ "movl %ebp, %esp\n" // Restore ESP
+ CFI(".cfi_def_cfa_register esp\n")
+ "subl $12, %esp\n"
+ CFI(".cfi_adjust_cfa_offset 12\n")
+ "popl %ecx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ecx\n")
+ "popl %edx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %edx\n")
+ "popl %eax\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %eax\n")
+ "popl %ebp\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ebp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback_SSE)
+ );
+# else
+ void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+
+ _declspec(naked) void X86CompilationCallback(void) {
+ __asm {
+ push ebp
+ mov ebp, esp
+ push eax
+ push edx
+ push ecx
+ and esp, -16
+ mov eax, dword ptr [ebp+4]
+ mov dword ptr [esp+4], eax
+ mov dword ptr [esp], ebp
+ call X86CompilationCallback2
+ mov esp, ebp
+ sub esp, 12
+ pop ecx
+ pop edx
+ pop eax
+ pop ebp
+ ret
+ }
+ }
+
+# endif // _MSC_VER
+
+#else // Not an i386 host
+ void X86CompilationCallback() {
+ assert(0 && "Cannot call X86CompilationCallback() on a non-x86 arch!\n");
+ abort();
+ }
+#endif
+}
+
+/// X86CompilationCallback2 - This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call. This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+extern "C" void ATTRIBUTE_USED
+X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
+ intptr_t *RetAddrLoc = &StackPtr[1];
+ assert(*RetAddrLoc == RetAddr &&
+ "Could not find return address on the stack!");
+
+ // It's a stub if there is an interrupt marker after the call.
+ bool isStub = ((unsigned char*)RetAddr)[0] == 0xCD;
+
+  // The call instruction should have pushed the return address onto the stack...
+#if defined (X86_64_JIT)
+ RetAddr--; // Backtrack to the reference itself...
+#else
+ RetAddr -= 4; // Backtrack to the reference itself...
+#endif
+
+#if 0
+ DOUT << "In callback! Addr=" << (void*)RetAddr
+ << " ESP=" << (void*)StackPtr
+ << ": Resolving call to function: "
+ << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n";
+#endif
+
+ // Sanity check to make sure this really is a call instruction.
+#if defined (X86_64_JIT)
+ assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+ assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+ assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+ intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+ // Rewrite the call target... so that we don't end up here every time we
+ // execute the call.
+#if defined (X86_64_JIT)
+ if (!isStub)
+ *(intptr_t *)(RetAddr - 0xa) = NewVal;
+#else
+ *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+ if (isStub) {
+ // If this is a stub, rewrite the call into an unconditional branch
+ // instruction so that two return addresses are not pushed onto the stack
+ // when the requested function finally gets called. This also makes the
+    // 0xCD byte (interrupt) dead, so the marker doesn't affect anything.
+#if defined (X86_64_JIT)
+ // If the target address is within 32-bit range of the stub, use a
+ // PC-relative branch instead of loading the actual address. (This is
+ // considerably shorter than the 64-bit immediate load already there.)
+ // We assume here intptr_t is 64 bits.
+ intptr_t diff = NewVal-RetAddr+7;
+ if (diff >= -2147483648LL && diff <= 2147483647LL) {
+ *(unsigned char*)(RetAddr-0xc) = 0xE9;
+ *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff;
+ } else {
+ *(intptr_t *)(RetAddr - 0xa) = NewVal;
+ ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+ }
+#else
+ ((unsigned char*)RetAddr)[-1] = 0xE9;
+#endif
+ }
+
+ // Change the return address to reexecute the call instruction...
+#if defined (X86_64_JIT)
+ *RetAddrLoc -= 0xd;
+#else
+ *RetAddrLoc -= 5;
+#endif
+}
+
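+// On x86-32 the rewrite above back-patches the rel32 field of the original
+// call: RetAddr was backed up to point at the displacement, and a rel32 is
+// measured from the end of that 4-byte field. A sketch of the math:
+//
+//   void patchCallSite32(intptr_t DispAddr, intptr_t NewTarget) {
+//     *(int32_t *)DispAddr = (int32_t)(NewTarget - DispAddr - 4);
+//   }
+//
+// The saved return address is then rolled back 5 bytes (the length of
+// "call rel32") so the now-patched call re-executes with the real target.
+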
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+ JITCompilerFunction = F;
+
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ if (!X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) {
+ // FIXME: support for AMD family of processors.
+ if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+ X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+ if ((EDX >> 25) & 0x1)
+ return X86CompilationCallback_SSE;
+ }
+ }
+#endif
+
+ return X86CompilationCallback;
+}
+
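+// The probe above reads CPUID leaf 0 (vendor string assembled from
+// EBX,EDX,ECX into text.c) and then leaf 1, where SSE support is EDX bit
+// 25. A sketch of the same test using GCC/Clang's <cpuid.h> (assuming that
+// header is available; it is not what this file uses):
+//
+//   #include <cpuid.h>
+//   bool hostHasSSE() {
+//     unsigned A, B, C, D;
+//     if (!__get_cpuid(1, &A, &B, &C, &D)) return false;
+//     return (D >> 25) & 1;  // CPUID.1:EDX bit 25 == SSE
+//   }
+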
+void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE) {
+#if defined (X86_64_JIT)
+ JCE.startGVStub(GV, 8, 8);
+ JCE.emitWordLE((unsigned)(intptr_t)ptr);
+ JCE.emitWordLE((unsigned)(((intptr_t)ptr) >> 32));
+#else
+ JCE.startGVStub(GV, 4, 4);
+ JCE.emitWordLE((intptr_t)ptr);
+#endif
+ return JCE.finishGVStub(GV);
+}
+
+void *X86JITInfo::emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE) {
+ // Note, we cast to intptr_t here to silence a -pedantic warning that
+ // complains about casting a function pointer to a normal pointer.
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+ bool NotCC = (Fn != (void*)(intptr_t)X86CompilationCallback &&
+ Fn != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+ bool NotCC = Fn != (void*)(intptr_t)X86CompilationCallback;
+#endif
+ if (NotCC) {
+#if defined (X86_64_JIT)
+ JCE.startGVStub(F, 13, 4);
+ JCE.emitByte(0x49); // REX prefix
+ JCE.emitByte(0xB8+2); // movabsq r10
+ JCE.emitWordLE((unsigned)(intptr_t)Fn);
+ JCE.emitWordLE((unsigned)(((intptr_t)Fn) >> 32));
+ JCE.emitByte(0x41); // REX prefix
+ JCE.emitByte(0xFF); // jmpq *r10
+ JCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+ JCE.startGVStub(F, 5, 4);
+ JCE.emitByte(0xE9);
+ JCE.emitWordLE((intptr_t)Fn-JCE.getCurrentPCValue()-4);
+#endif
+ return JCE.finishGVStub(F);
+ }
+
+#if defined (X86_64_JIT)
+ JCE.startGVStub(F, 14, 4);
+ JCE.emitByte(0x49); // REX prefix
+ JCE.emitByte(0xB8+2); // movabsq r10
+ JCE.emitWordLE((unsigned)(intptr_t)Fn);
+ JCE.emitWordLE((unsigned)(((intptr_t)Fn) >> 32));
+ JCE.emitByte(0x41); // REX prefix
+ JCE.emitByte(0xFF); // callq *r10
+ JCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+ JCE.startGVStub(F, 6, 4);
+ JCE.emitByte(0xE8); // Call with 32 bit pc-rel destination...
+
+ JCE.emitWordLE((intptr_t)Fn-JCE.getCurrentPCValue()-4);
+#endif
+
+ JCE.emitByte(0xCD); // Interrupt - Just a marker identifying the stub!
+ return JCE.finishGVStub(F);
+}
+
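+// Byte layout of the 64-bit stubs emitted above, decoded per the usual
+// x86-64 encoding (0xB8+2 with REX.W+REX.B selects %r10; FF /4 is an
+// indirect jump, FF /2 an indirect call):
+//
+//   49 BA <imm64>   movabsq $Fn, %r10
+//   41 FF E2        jmpq  *%r10    ModRM 2|(4<<3)|(3<<6) -- direct stub
+//   41 FF D2        callq *%r10    ModRM 2|(2<<3)|(3<<6) -- lazy stub
+//   CD              interrupt marker, emitted only on the lazy stub
+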
+void X86JITInfo::emitFunctionStubAtAddr(const Function* F, void *Fn, void *Stub,
+ JITCodeEmitter &JCE) {
+ // Note, we cast to intptr_t here to silence a -pedantic warning that
+ // complains about casting a function pointer to a normal pointer.
+ JCE.startGVStub(F, Stub, 5);
+ JCE.emitByte(0xE9);
+#if defined (X86_64_JIT)
+ assert(((((intptr_t)Fn-JCE.getCurrentPCValue()-5) << 32) >> 32) ==
+ ((intptr_t)Fn-JCE.getCurrentPCValue()-5)
+ && "PIC displacement does not fit in displacement field!");
+#endif
+ JCE.emitWordLE((intptr_t)Fn-JCE.getCurrentPCValue()-4);
+ JCE.finishGVStub(F);
+}
+
+/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+/// specific basic block.
+uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
+#if defined(X86_64_JIT)
+ return BB - Entry;
+#else
+ return BB - PICBase;
+#endif
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+ intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+ switch ((X86::RelocationType)MR->getRelocationType()) {
+ case X86::reloc_pcrel_word: {
+ // PC relative relocation, add the relocated value to the value already in
+ // memory, after we adjust it for where the PC is.
+ ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ }
+ case X86::reloc_picrel_word: {
+ // PIC base relative relocation, add the relocated value to the value
+ // already in memory, after we adjust it for where the PIC base is.
+ ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ }
+ case X86::reloc_absolute_word:
+ // Absolute relocation, just add the relocated value to the value already
+ // in memory.
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ case X86::reloc_absolute_dword:
+ *((intptr_t*)RelocPos) += ResultPtr;
+ break;
+ }
+ }
+}
+
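+// Worked example for reloc_pcrel_word: if the rel32 field lives at
+// RelocPos and the symbol resolves to Target, the bytes in memory must
+// end up holding "Target - end of the 4-byte field", so the fixup is:
+//
+//   void applyPCRel32(void *RelocPos, intptr_t Target, intptr_t Cst) {
+//     intptr_t Delta = Target - (intptr_t)RelocPos - 4 - Cst;
+//     *(unsigned *)RelocPos += (unsigned)Delta; // added to the addend in place
+//   }
+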
+char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
+#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER)
+ TLSOffset -= size;
+ return TLSOffset;
+#else
+ assert(0 && "Cannot allocate thread local storage on this arch!\n");
+ return 0;
+#endif
+}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..6a4e214
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,84 @@
+//===- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+ class X86TargetMachine;
+
+ class X86JITInfo : public TargetJITInfo {
+ X86TargetMachine &TM;
+ uintptr_t PICBase;
+ char* TLSOffset;
+ public:
+ explicit X86JITInfo(X86TargetMachine &tm) : TM(tm) {
+ useGOT = 0;
+ TLSOffset = 0;
+ }
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+ /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+ /// to emit an indirect symbol which contains the address of the specified
+ /// ptr.
+ virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE);
+
+ /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+ /// small native function that simply calls the function at the specified
+ /// address.
+ virtual void *emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE);
+
+ /// emitFunctionStubAtAddr - Use the specified JITCodeEmitter object to
+ /// emit a small native function that simply calls Fn. Emit the stub into
+ /// the supplied buffer.
+ virtual void emitFunctionStubAtAddr(const Function* F, void *Fn,
+ void *Buffer, JITCodeEmitter &JCE);
+
+ /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+ /// specific basic block.
+ virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase);
+
+ /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+ /// relocate - Before the JIT can run a block of code that has been emitted,
+ /// it must rewrite the code to contain the actual addresses of any
+ /// referenced global symbols.
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+
+ /// allocateThreadLocalMemory - Each target has its own way of
+ /// handling thread local variables. This method returns a value only
+ /// meaningful to the target.
+ virtual char* allocateThreadLocalMemory(size_t size);
+
+ /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
+ /// PIC jumptable entry.
+ void setPICBase(uintptr_t Base) { PICBase = Base; }
+ uintptr_t getPICBase() const { return PICBase; }
+ };
+}
+
+#endif
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..8a5ac2c
--- /dev/null
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,112 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+enum NameDecorationStyle {
+ None,
+ StdCall,
+ FastCall
+};
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo
+/// and contains private X86 target-specific information for each
+/// MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  /// ForceFramePointer - True if the function is required to use a frame
+  /// pointer for reasons other than containing dynamic allocation or having
+  /// FP elimination turned off. For example, the Cygwin main function
+  /// contains stack-pointer re-alignment code, which requires a frame
+  /// pointer.
+ bool ForceFramePointer;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes the function pops on return.
+  /// Used on Windows for stdcall & fastcall name decoration.
+ unsigned BytesToPopOnReturn;
+
+ /// DecorationStyle - If the function requires additional name decoration,
+ /// DecorationStyle holds the right way to do so.
+ NameDecorationStyle DecorationStyle;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex;
+
+  /// TailCallReturnAddrDelta - The delta by which the return-address stack
+  /// slot is moved. Used to create an area before the register spill area
+  /// on the stack to which the return address can safely be moved.
+ int TailCallReturnAddrDelta;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - Keeps track of the virtual register initialized for
+  /// use as the global base register. This is used in some PIC relocation
+  /// models.
+ unsigned GlobalBaseReg;
+
+public:
+ X86MachineFunctionInfo() : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ DecorationStyle(None),
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0),
+ SRetReturnReg(0),
+ GlobalBaseReg(0) {}
+
+ X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ DecorationStyle(None),
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0),
+ SRetReturnReg(0),
+ GlobalBaseReg(0) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
+ void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+ NameDecorationStyle getDecorationStyle() const { return DecorationStyle; }
+ void setDecorationStyle(NameDecorationStyle style) { DecorationStyle = style;}
+
+ int getRAIndex() const { return ReturnAddrIndex; }
+ void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+};
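+
+// Usage sketch (illustrative; mirrors how this info is queried elsewhere in
+// the X86 backend):
+//   X86MachineFunctionInfo *FI = MF.getInfo<X86MachineFunctionInfo>();
+//   if (FI->getForceFramePointer()) { /* keep EBP/RBP as the frame pointer */ }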
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..5af1fb1
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,1280 @@
+//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+ const TargetInstrInfo &tii)
+ : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::ADJCALLSTACKDOWN64 :
+ X86::ADJCALLSTACKDOWN32,
+ tm.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::ADJCALLSTACKUP64 :
+ X86::ADJCALLSTACKUP32),
+ TM(tm), TII(tii) {
+ // Cache some information.
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ Is64Bit = Subtarget->is64Bit();
+ IsWin64 = Subtarget->isTargetWin64();
+ StackAlign = TM.getFrameInfo()->getStackAlignment();
+ if (Is64Bit) {
+ SlotSize = 8;
+ StackPtr = X86::RSP;
+ FramePtr = X86::RBP;
+ } else {
+ SlotSize = 4;
+ StackPtr = X86::ESP;
+ FramePtr = X86::EBP;
+ }
+}
+
+// getDwarfRegNum - This function maps LLVM register identifiers to the
+// DWARF-specific numbering, used in debug info and exception tables.
+
+int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ unsigned Flavour = DWARFFlavour::X86_64;
+ if (!Subtarget->is64Bit()) {
+ if (Subtarget->isTargetDarwin()) {
+ if (isEH)
+ Flavour = DWARFFlavour::X86_32_DarwinEH;
+ else
+ Flavour = DWARFFlavour::X86_32_Generic;
+ } else if (Subtarget->isTargetCygMing()) {
+      // Unsupported for now; just fall back to the generic flavour.
+ Flavour = DWARFFlavour::X86_32_Generic;
+ } else {
+ Flavour = DWARFFlavour::X86_32_Generic;
+ }
+ }
+
+ return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
+}
+
+// getX86RegNum - This function maps LLVM register identifiers to their X86
+// specific numbering, which is used in various places encoding instructions.
+//
+unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
+ switch(RegNo) {
+ case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+ case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+ case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+ case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+ case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+ return N86::ESP;
+ case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+ return N86::EBP;
+ case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+ return N86::ESI;
+ case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+ return N86::EDI;
+
+ case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
+ return N86::EAX;
+ case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
+ return N86::ECX;
+ case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+ return N86::EDX;
+ case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+ return N86::EBX;
+ case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+ return N86::ESP;
+ case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+ return N86::EBP;
+ case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+ return N86::ESI;
+ case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+ return N86::EDI;
+
+ case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+ case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+ return RegNo-X86::ST0;
+
+ case X86::XMM0: case X86::XMM8: case X86::MM0:
+ return 0;
+ case X86::XMM1: case X86::XMM9: case X86::MM1:
+ return 1;
+ case X86::XMM2: case X86::XMM10: case X86::MM2:
+ return 2;
+ case X86::XMM3: case X86::XMM11: case X86::MM3:
+ return 3;
+ case X86::XMM4: case X86::XMM12: case X86::MM4:
+ return 4;
+ case X86::XMM5: case X86::XMM13: case X86::MM5:
+ return 5;
+ case X86::XMM6: case X86::XMM14: case X86::MM6:
+ return 6;
+ case X86::XMM7: case X86::XMM15: case X86::MM7:
+ return 7;
+
+ default:
+ assert(isVirtualRegister(RegNo) && "Unknown physical register!");
+ assert(0 && "Register allocator hasn't allocated reg correctly yet!");
+ return 0;
+ }
+}
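+
+// For example (illustrative), getX86RegNum(X86::R9D) returns N86::ECX (1):
+// only the low three bits of the register number are encoded in the ModR/M
+// byte; the fourth bit for R8-R15 is carried in the REX prefix instead.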
+
+const TargetRegisterClass *X86RegisterInfo::getPointerRegClass() const {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ if (Subtarget->is64Bit())
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &X86::CCRRegClass) {
+ if (Is64Bit)
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+ }
+ return NULL;
+}
+
+const unsigned *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ bool callsEHReturn = false;
+
+ if (MF) {
+ const MachineFrameInfo *MFI = MF->getFrameInfo();
+ const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ callsEHReturn = (MMI ? MMI->callsEHReturn() : false);
+ }
+
+ static const unsigned CalleeSavedRegs32Bit[] = {
+ X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs32EHRet[] = {
+ X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs64Bit[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs64EHRet[] = {
+ X86::RAX, X86::RDX, X86::RBX, X86::R12,
+ X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+
+ static const unsigned CalleeSavedRegsWin64[] = {
+ X86::RBX, X86::RBP, X86::RDI, X86::RSI,
+ X86::R12, X86::R13, X86::R14, X86::R15,
+ X86::XMM6, X86::XMM7, X86::XMM8, X86::XMM9,
+ X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13,
+ X86::XMM14, X86::XMM15, 0
+ };
+
+ if (Is64Bit) {
+ if (IsWin64)
+ return CalleeSavedRegsWin64;
+ else
+ return (callsEHReturn ? CalleeSavedRegs64EHRet : CalleeSavedRegs64Bit);
+ } else {
+ return (callsEHReturn ? CalleeSavedRegs32EHRet : CalleeSavedRegs32Bit);
+ }
+}
+
+const TargetRegisterClass* const*
+X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ bool callsEHReturn = false;
+
+ if (MF) {
+ const MachineFrameInfo *MFI = MF->getFrameInfo();
+ const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ callsEHReturn = (MMI ? MMI->callsEHReturn() : false);
+ }
+
+ static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = {
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = {
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses64EHRet[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClassesWin64[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass, 0
+ };
+
+ if (Is64Bit) {
+ if (IsWin64)
+ return CalleeSavedRegClassesWin64;
+ else
+ return (callsEHReturn ?
+ CalleeSavedRegClasses64EHRet : CalleeSavedRegClasses64Bit);
+ } else {
+ return (callsEHReturn ?
+ CalleeSavedRegClasses32EHRet : CalleeSavedRegClasses32Bit);
+ }
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ // Set the stack-pointer register and its aliases as reserved.
+ Reserved.set(X86::RSP);
+ Reserved.set(X86::ESP);
+ Reserved.set(X86::SP);
+ Reserved.set(X86::SPL);
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (hasFP(MF)) {
+ Reserved.set(X86::RBP);
+ Reserved.set(X86::EBP);
+ Reserved.set(X86::BP);
+ Reserved.set(X86::BPL);
+ }
+ // Mark the x87 stack registers as reserved, since they don't
+ // behave normally with respect to liveness. We don't fully
+ // model the effects of x87 stack pushes and pops after
+ // stackification.
+ Reserved.set(X86::ST0);
+ Reserved.set(X86::ST1);
+ Reserved.set(X86::ST2);
+ Reserved.set(X86::ST3);
+ Reserved.set(X86::ST4);
+ Reserved.set(X86::ST5);
+ Reserved.set(X86::ST6);
+ Reserved.set(X86::ST7);
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+static unsigned calculateMaxStackAlignment(const MachineFrameInfo *FFI) {
+ unsigned MaxAlign = 0;
+ for (int i = FFI->getObjectIndexBegin(),
+ e = FFI->getObjectIndexEnd(); i != e; ++i) {
+ if (FFI->isDeadObjectIndex(i))
+ continue;
+ unsigned Align = FFI->getObjectAlignment(i);
+ MaxAlign = std::max(MaxAlign, Align);
+ }
+
+ return MaxAlign;
+}
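+
+// For example (illustrative), a frame holding a 4-byte integer slot and a
+// 16-byte XMM spill slot yields MaxAlign == 16, ignoring any dead objects.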
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+bool X86RegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+
+ return (NoFramePointerElim ||
+ needsStackRealignment(MF) ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ (MMI && MMI->callsUnwindInit()));
+}
+
+bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // FIXME: Currently we don't support stack realignment for functions with
+ // variable-sized allocas
+ return (RealignStack &&
+ (MFI->getMaxAlignment() > StackAlign &&
+ !MFI->hasVarSizedObjects()));
+}
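+
+// Illustrative (assumption on the default alignment): with StackAlign == 4
+// on a 32-bit target, a function containing a 16-byte-aligned object needs
+// realignment, provided RealignStack is enabled and the function has no
+// variable-sized objects.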
+
+bool X86RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+int
+X86RegisterInfo::getFrameIndexOffset(MachineFunction &MF, int FI) const {
+ int Offset = MF.getFrameInfo()->getObjectOffset(FI) + SlotSize;
+ uint64_t StackSize = MF.getFrameInfo()->getStackSize();
+
+ if (needsStackRealignment(MF)) {
+ if (FI < 0)
+ // Skip the saved EBP
+ Offset += SlotSize;
+ else {
+ unsigned Align = MF.getFrameInfo()->getObjectAlignment(FI);
+ assert( (-(Offset + StackSize)) % Align == 0);
+ Align = 0;
+ return Offset + StackSize;
+ }
+
+ // FIXME: Support tail calls
+ } else {
+ if (!hasFP(MF))
+ return Offset + StackSize;
+
+ // Skip the saved EBP
+ Offset += SlotSize;
+
+ // Skip the RETADDR move area
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta;
+ }
+
+ return Offset;
+}
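+
+// Worked example (illustrative, tracing the arithmetic above): on a 32-bit
+// target with a frame pointer, SlotSize == 4 and an object at ObjectOffset
+// -12 gives -12 + 4 (return address slot) + 4 (saved EBP) = -4, i.e. the
+// object is addressed as [EBP - 4] (assuming no tail-call RETADDR area).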
+
+void X86RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+    // If the stack pointer can be changed after the prologue, turn the
+    // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
+    // adjcallstackup instruction into an 'add ESP, <amt>'.
+ // TODO: consider using push / pop instead of sub + store / add
+ MachineInstr *Old = I;
+ uint64_t Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
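+      // For example (illustrative), with StackAlign == 16 an Amount of 20
+      // rounds to (20 + 15) / 16 * 16 == 32, keeping the stack pointer
+      // properly aligned across the call.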
+
+ MachineInstr *New = 0;
+ if (Old->getOpcode() == getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, Old->getDebugLoc(),
+ TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+ StackPtr).addReg(StackPtr).addImm(Amount);
+ } else {
+ assert(Old->getOpcode() == getCallFrameDestroyOpcode());
+ // factor out the amount the callee already popped.
+ uint64_t CalleeAmt = Old->getOperand(1).getImm();
+ Amount -= CalleeAmt;
+ if (Amount) {
+ unsigned Opc = (Amount < 128) ?
+ (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+ (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
+ New = BuildMI(MF, Old->getDebugLoc(), TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ }
+ }
+
+ if (New) {
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // Replace the pseudo instruction with a new instruction...
+ MBB.insert(I, New);
+ }
+ }
+ } else if (I->getOpcode() == getCallFrameDestroyOpcode()) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ unsigned Opc = (CalleeAmt < 128) ?
+ (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+ (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
+ MachineInstr *Old = I;
+ MachineInstr *New =
+ BuildMI(MF, Old->getDebugLoc(), TII.get(Opc),
+ StackPtr).addReg(StackPtr).addImm(CalleeAmt);
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ MBB.insert(I, New);
+ }
+ }
+
+ MBB.erase(I);
+}
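+
+// Illustrative transformation (assumption: 32-bit, StackAlign == 16, no
+// reserved call frame):
+//   ADJCALLSTACKDOWN32 20      ==>   sub esp, 32
+//   ...call...
+//   ADJCALLSTACKUP32 20, 0     ==>   add esp, 32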
+
+void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const{
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+
+ unsigned BasePtr;
+ if (needsStackRealignment(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+ else
+ BasePtr = (hasFP(MF) ? FramePtr : StackPtr);
+
+  // This must be part of a four-operand memory reference. Replace the
+  // FrameIndex operand with the chosen base register.
+ MI.getOperand(i).ChangeToRegister(BasePtr, false);
+
+  // Now add the frame object offset to the offset from the base register.
+ if (MI.getOperand(i+3).isImm()) {
+ // Offset is a 32-bit integer.
+ int Offset = getFrameIndexOffset(MF, FrameIndex) +
+ (int)(MI.getOperand(i+3).getImm());
+
+ MI.getOperand(i+3).ChangeToImmediate(Offset);
+ } else {
+ // Offset is symbolic. This is extremely rare.
+ uint64_t Offset = getFrameIndexOffset(MF, FrameIndex) +
+ (uint64_t)MI.getOperand(i+3).getOffset();
+ MI.getOperand(i+3).setOffset(Offset);
+ }
+}
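+
+// Illustrative rewrite (hypothetical operands): a load such as
+//   MOV32rm %eax, <fi#1>, 1, %noreg, 8
+// becomes, once the frame index is resolved against EBP,
+//   MOV32rm %eax, %ebp, 1, %noreg, <offset of fi#1 + 8>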
+
+void
+X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *FFI = MF.getFrameInfo();
+
+ // Calculate and set max stack object alignment early, so we can decide
+ // whether we will need stack realignment (and thus FP).
+ unsigned MaxAlign = std::max(FFI->getMaxAlignment(),
+ calculateMaxStackAlignment(FFI));
+
+ FFI->setMaxAlignment(MaxAlign);
+}
+
+void
+X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MF.getFrameInfo()->
+ CreateFixedObject(-TailCallReturnAddrDelta,
+ (-1*SlotSize)+TailCallReturnAddrDelta);
+ }
+ if (hasFP(MF)) {
+ assert((TailCallReturnAddrDelta <= 0) &&
+ "The Delta should always be zero or negative");
+ // Create a frame entry for the EBP register that must be saved.
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+ (int)SlotSize * -2+
+ TailCallReturnAddrDelta);
+ assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+ "Slot for EBP register must be last in order to be found!");
+ FrameIdx = 0;
+ }
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, int64_t NumBytes, bool Is64Bit,
+ const TargetInstrInfo &TII) {
+ bool isSub = NumBytes < 0;
+ uint64_t Offset = isSub ? -NumBytes : NumBytes;
+ unsigned Opc = isSub
+ ? ((Offset < 128) ?
+ (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+ (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri))
+ : ((Offset < 128) ?
+ (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+ (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri));
+ uint64_t Chunk = (1LL << 31) - 1;
+ DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+ DebugLoc::getUnknownLoc());
+
+ while (Offset) {
+ uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(ThisVal);
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ Offset -= ThisVal;
+ }
+}
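+
+// For example (illustrative), emitSPUpdate(..., -20, false, TII) emits
+// 'sub esp, 20' via SUB32ri8 (since 20 < 128); larger adjustments are split
+// into chunks of at most (1LL << 31) - 1 bytes.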
+
+// mergeSPUpdatesUp - Merge the stack adjustment of the ADD/SUB of the stack
+// pointer immediately preceding the iterator into *NumBytes and erase it.
+static
+void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ if (MBBI == MBB.begin()) return;
+
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ }
+}
+
+// mergeSPUpdatesDown - Merge the stack adjustment of the ADD/SUB of the stack
+// pointer immediately following the iterator into *NumBytes and erase it.
+static
+void mergeSPUpdatesDown(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, uint64_t *NumBytes = NULL) {
+  // This optimization is currently disabled; the early return below makes
+  // the remainder of the function dead code.
+  return;
+
+ if (MBBI == MBB.end()) return;
+
+ MachineBasicBlock::iterator NI = next(MBBI);
+ if (NI == MBB.end()) return;
+
+ unsigned Opc = NI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ NI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes -= NI->getOperand(2).getImm();
+ MBB.erase(NI);
+ MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ NI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes += NI->getOperand(2).getImm();
+ MBB.erase(NI);
+ MBBI = NI;
+ }
+}
+
+/// mergeSPUpdates - Check the instruction before/after the passed
+/// instruction. If it is an ADD/SUB of the stack pointer, it is deleted and
+/// the stack adjustment is returned as a positive value for ADD and a
+/// negative one for SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr,
+ bool doMergeWithPrevious) {
+
+ if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+ (!doMergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ int Offset = 0;
+
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : next(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr){
+ Offset += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ Offset -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ }
+
+ return Offset;
+}
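+
+// For example (illustrative): if the instruction before MBBI is
+// 'sub esp, 12', calling mergeSPUpdates(MBB, MBBI, StackPtr, true) erases it
+// and returns -12, which the caller folds into its own adjustment.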
+
+void X86RegisterInfo::emitFrameMoves(MachineFunction &MF,
+ unsigned FrameLabelId,
+ unsigned ReadyLabelId) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ if (!MMI)
+ return;
+
+ uint64_t StackSize = MFI->getStackSize();
+ std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+ const TargetData *TD = MF.getTarget().getTargetData();
+
+  // Calculate the number of bytes used to store the return address.
+ int stackGrowth =
+ (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+ TargetFrameInfo::StackGrowsUp ?
+ TD->getPointerSize() : -TD->getPointerSize());
+
+ if (StackSize) {
+ // Show update of SP.
+ if (hasFP(MF)) {
+ // Adjust SP
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ } else {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP,
+ -StackSize+stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ }
+ } else {
+    // FIXME: Verify & implement for FP.
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ }
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  // FIXME: This is a dirty hack. The code itself is a mess right now.
+  // It should be rewritten from scratch and generalized at some point.
+
+  // Determine the maximum offset (minimum due to stack growth).
+ int64_t MaxOffset = 0;
+ for (unsigned I = 0, E = CSI.size(); I!=E; ++I)
+ MaxOffset = std::min(MaxOffset,
+ MFI->getObjectOffset(CSI[I].getFrameIdx()));
+
+ // Calculate offsets
+ int64_t saveAreaOffset = (hasFP(MF) ? 3 : 2)*stackGrowth;
+ for (unsigned I = 0, E = CSI.size(); I!=E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+ unsigned Reg = CSI[I].getReg();
+ Offset = (MaxOffset-Offset+saveAreaOffset);
+ MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+ MachineLocation CSSrc(Reg);
+ Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+ }
+
+ if (hasFP(MF)) {
+ // Save FP
+ MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth);
+ MachineLocation FPSrc(FramePtr);
+ Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+ }
+
+ MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr);
+ MachineLocation FPSrc(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+}
+
+
+void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function* Fn = MF.getFunction();
+ const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
+ !Fn->doesNotThrow() ||
+ UnwindTablesMandatory;
+ DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+ DebugLoc::getUnknownLoc());
+
+ // Prepare for frame info.
+ unsigned FrameLabelId = 0;
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI->getStackSize();
+
+ // Get desired stack alignment
+ uint64_t MaxAlign = MFI->getMaxAlignment();
+
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ X86FI->setCalleeSavedFrameSize(
+ X86FI->getCalleeSavedFrameSize() +(-TailCallReturnAddrDelta));
+
+  // On x86-64, if the Red Zone is not disabled and this is a leaf function
+  // that uses at most 128 bytes of stack space and has no frame pointer,
+  // calls, or dynamic allocas, then we do not need to adjust the stack
+  // pointer (we fit in the Red Zone).
+ if (Is64Bit && !DisableRedZone &&
+ !needsStackRealignment(MF) &&
+ !MFI->hasVarSizedObjects() && // No dynamic alloca.
+ !MFI->hasCalls()) { // No calls.
+ uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+ if (hasFP(MF)) MinSize += SlotSize;
+ StackSize = std::max(MinSize,
+ StackSize > 128 ? StackSize - 128 : 0);
+ MFI->setStackSize(StackSize);
+ }
+
+  // Insert a stack pointer adjustment for later moving of the return
+  // address. This only applies to tail-call-optimized functions where the
+  // callee's argument stack size is bigger than the caller's.
+ if (TailCallReturnAddrDelta < 0) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit? X86::SUB64ri32 : X86::SUB32ri),
+ StackPtr).addReg(StackPtr).addImm(-TailCallReturnAddrDelta);
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+
+ uint64_t NumBytes = 0;
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment
+ uint64_t FrameSize = StackSize - SlotSize;
+ if (needsStackRealignment(MF))
+ FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
+
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
+ // Get the offset of the stack slot for the EBP register... which is
+ // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ MFI->setOffsetAdjustment(-NumBytes);
+
+ // Save EBP into the appropriate stack slot...
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(FramePtr, RegState::Kill);
+
+ if (needsFrameMoves) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ FrameLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);
+ }
+
+ // Update EBP with the new base value...
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+ .addReg(StackPtr);
+
+ // Mark the FramePtr as live-in in every block except the entry.
+ for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+ I != E; ++I)
+ I->addLiveIn(FramePtr);
+
+ // Realign stack
+ if (needsStackRealignment(MF)) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
+ StackPtr).addReg(StackPtr).addImm(-MaxAlign);
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+ } else {
+ NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ }
+
+ unsigned ReadyLabelId = 0;
+ if (needsFrameMoves) {
+ // Mark effective beginning of when frame pointer is ready.
+ ReadyLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId);
+ }
+
+ // Skip the callee-saved push instructions.
+ while (MBBI != MBB.end() &&
+ (MBBI->getOpcode() == X86::PUSH32r ||
+ MBBI->getOpcode() == X86::PUSH64r))
+ ++MBBI;
+
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ if (NumBytes) { // adjust stack pointer: ESP -= numbytes
+ if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
+      // Check whether EAX is live-in for this function.
+ bool isEAXAlive = false;
+ for (MachineRegisterInfo::livein_iterator
+ II = MF.getRegInfo().livein_begin(),
+ EE = MF.getRegInfo().livein_end(); (II != EE) && !isEAXAlive; ++II) {
+ unsigned Reg = II->first;
+ isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
+ Reg == X86::AH || Reg == X86::AL);
+ }
+
+      // The function prologue calls _alloca to probe the stack when
+      // allocating more than 4K bytes in one go. Touching the stack at 4K
+      // increments is necessary to ensure that the guard pages used by the
+      // OS virtual memory manager are allocated in the correct sequence.
+ if (!isEAXAlive) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(NumBytes);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca");
+ } else {
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill);
+ // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
+ // allocated bytes for EAX.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes-4);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca");
+ // Restore EAX
+ MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
+ X86::EAX),
+ StackPtr, false, NumBytes-4);
+ MBB.insert(MBBI, MI);
+ }
+ } else {
+      // If there is a SUB32ri of ESP immediately before this instruction,
+      // merge the two. This can be the case when tail call elimination is
+      // enabled and the callee has more arguments than the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ // If there is an ADD32ri or SUB32ri of ESP immediately after this
+ // instruction, merge the two instructions.
+ mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
+
+ if (NumBytes)
+ emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
+ }
+ }
+
+ if (needsFrameMoves)
+ emitFrameMoves(MF, FrameLabelId, ReadyLabelId);
+}
+
+void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ unsigned RetOpcode = MBBI->getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ switch (RetOpcode) {
+ case X86::RET:
+ case X86::RETI:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNdi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64:
+ case X86::TAILJMPd:
+ case X86::TAILJMPr:
+ case X86::TAILJMPm: break; // These are ok
+ default:
+ assert(0 && "Can only insert epilog into returning blocks");
+ }
+
+ // Get the number of bytes to allocate from the FrameInfo
+ uint64_t StackSize = MFI->getStackSize();
+ uint64_t MaxAlign = MFI->getMaxAlignment();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = 0;
+
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment
+ uint64_t FrameSize = StackSize - SlotSize;
+ if (needsStackRealignment(MF))
+ FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
+
+ NumBytes = FrameSize - CSSize;
+
+ // pop EBP.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+ } else {
+ NumBytes = StackSize - CSSize;
+ }
+
+ // Skip the callee-saved pop instructions.
+ MachineBasicBlock::iterator LastCSPop = MBBI;
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if (Opc != X86::POP32r && Opc != X86::POP64r &&
+ !PI->getDesc().isTerminator())
+ break;
+ --MBBI;
+ }
+
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately before this
+ // instruction, merge the two instructions.
+ if (NumBytes || MFI->hasVarSizedObjects())
+ mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  // If dynamic alloca is used, reset ESP to point to the last callee-saved
+  // slot before popping the registers off. The same applies when the stack
+  // was realigned.
+ if (needsStackRealignment(MF)) {
+    // We cannot use LEA here because the stack pointer was realigned. We
+    // need to deallocate the local frame first.
+ if (CSSize) {
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+ MBBI = prior(LastCSPop);
+ }
+
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(FramePtr);
+ } else if (MFI->hasVarSizedObjects()) {
+ if (CSSize) {
+ unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
+ MachineInstr *MI = addLeaRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, -CSSize);
+ MBB.insert(MBBI, MI);
+ } else
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(FramePtr);
+
+ } else {
+ // adjust stack pointer back: ESP += numbytes
+ if (NumBytes)
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+ }
+
+  // We're returning from the function via eh_return.
+ if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
+ MBBI = prior(MBB.end());
+ MachineOperand &DestAddr = MBBI->getOperand(0);
+ assert(DestAddr.isReg() && "Offset should be in register!");
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(DestAddr.getReg());
+ // Tail call return: adjust the stack pointer and jump to callee
+ } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
+ RetOpcode== X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64) {
+ MBBI = prior(MBB.end());
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+    // Incorporate the retaddr area.
+ Offset = StackAdj-MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+
+ if (Offset) {
+      // Check for a possible merge with the preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII);
+ }
+
+ // Jump to label or value in register.
+ if (RetOpcode == X86::TCRETURNdi|| RetOpcode == X86::TCRETURNdi64)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPd)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ else if (RetOpcode== X86::TCRETURNri64)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64), JumpTarget.getReg());
+ else
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr), JumpTarget.getReg());
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+ } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+ (X86FI->getTCReturnAddrDelta() < 0)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int delta = -1*X86FI->getTCReturnAddrDelta();
+ MBBI = prior(MBB.end());
+    // Check for a possible merge with the preceding ADD instruction.
+ delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII);
+ }
+}
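+
+// Roughly, for a 32-bit function with a frame pointer the code above
+// produces an epilogue of the following shape (illustrative, not verbatim
+// output):
+//   add esp, <local frame size>
+//   pop <callee-saved registers>
+//   pop ebp
+//   ret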
+
+unsigned X86RegisterInfo::getRARegister() const {
+ if (Is64Bit)
+ return X86::RIP; // Should have dwarf #16
+ else
+ return X86::EIP; // Should have dwarf #8
+}
+
+unsigned X86RegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ return hasFP(MF) ? FramePtr : StackPtr;
+}
+
+void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+ const {
+  // Calculate the number of bytes used to store the return address.
+ int stackGrowth = (Is64Bit ? -8 : -4);
+
+ // Initial state of the frame pointer is esp+4.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(0, Dst, Src));
+
+ // Add return address to move list
+ MachineLocation CSDst(StackPtr, stackGrowth);
+ MachineLocation CSSrc(getRARegister());
+ Moves.push_back(MachineMove(0, CSDst, CSSrc));
+}
+
+unsigned X86RegisterInfo::getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned X86RegisterInfo::getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+namespace llvm {
+unsigned getX86SubSuperRegister(unsigned Reg, MVT VT, bool High) {
+ switch (VT.getSimpleVT()) {
+ default: return Reg;
+ case MVT::i8:
+ if (High) {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case MVT::i16:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case MVT::i32:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case MVT::i64:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+
+ return Reg;
+}
+}
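+
+// Usage examples (illustrative, tracing the switch above):
+//   getX86SubSuperRegister(X86::EAX, MVT::i16)       returns X86::AX
+//   getX86SubSuperRegister(X86::RAX, MVT::i8, true)  returns X86::AH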
+
+#include "X86GenRegisterInfo.inc"
+
+namespace {
+ struct VISIBILITY_HIDDEN MSAC : public MachineFunctionPass {
+ static char ID;
+ MSAC() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ MachineFrameInfo *FFI = MF.getFrameInfo();
+ MachineRegisterInfo &RI = MF.getRegInfo();
+
+ // Calculate max stack alignment of all already allocated stack objects.
+ unsigned MaxAlign = calculateMaxStackAlignment(FFI);
+
+      // Be over-conservative: scan over all vreg defs and find whether
+      // vector registers are used. If so, there is a chance that a vector
+      // register will be spilled, and thus the stack needs to be aligned
+      // properly.
+ for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister;
+ RegNum < RI.getLastVirtReg(); ++RegNum)
+ MaxAlign = std::max(MaxAlign, RI.getRegClass(RegNum)->getAlignment());
+
+ FFI->setMaxAlignment(MaxAlign);
+
+ return false;
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 Maximal Stack Alignment Calculator";
+ }
+ };
+
+ char MSAC::ID = 0;
+}
+
+FunctionPass*
+llvm::createX86MaxStackAlignmentCalculatorPass() { return new MSAC(); }
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..33b9f5e
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,163 @@
+//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "X86GenRegisterInfo.h.inc"
+
+namespace llvm {
+ class Type;
+ class TargetInstrInfo;
+ class X86TargetMachine;
+
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
+namespace X86 {
+ /// SubregIndex - The index of various sized subregister classes. Note that
+ /// these indices must be kept in sync with the class indices in the
+ /// X86RegisterInfo.td file.
+ enum SubregIndex {
+ SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4
+ };
+}
+
+/// DWARFFlavour - Flavour of DWARF register numbering.
+///
+namespace DWARFFlavour {
+ enum {
+ X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+ };
+}
+
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+ X86TargetMachine &TM;
+ const TargetInstrInfo &TII;
+
+private:
+  /// Is64Bit - Is the target 64-bit?
+ ///
+ bool Is64Bit;
+
+  /// IsWin64 - Is the target one of the Win64 flavours?
+ ///
+ bool IsWin64;
+
+ /// SlotSize - Stack slot size in bytes.
+ ///
+ unsigned SlotSize;
+
+ /// StackAlign - Default stack alignment.
+ ///
+ unsigned StackAlign;
+
+ /// StackPtr - X86 physical register used as stack ptr.
+ ///
+ unsigned StackPtr;
+
+ /// FramePtr - X86 physical register used as frame ptr.
+ ///
+ unsigned FramePtr;
+
+public:
+ X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+ /// getX86RegNum - Returns the native X86 register number for the given LLVM
+ /// register identifier.
+ static unsigned getX86RegNum(unsigned RegNo);
+
+ unsigned getStackAlignment() const { return StackAlign; }
+
+ /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum
+ /// (created by TableGen) for target dependencies.
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+ /// Code Generation virtual methods...
+ ///
+
+ /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+ /// values.
+ const TargetRegisterClass *getPointerRegClass() const;
+
+ /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns NULL if it is possible to
+  /// copy directly between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+ /// getCalleeSavedRegs - Return a null-terminated list of all of the
+ /// callee-save registers on this target.
+ const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+ /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred
+ /// register classes to spill each callee-saved register with. The order and
+ /// length of this list match the getCalleeSavedRegs() list.
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+ /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses
+  /// and should be considered unavailable at all times, e.g. SP, RA. This is
+  /// used by the register scavenger to determine what registers are free.
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ bool needsStackRealignment(const MachineFunction &MF) const;
+
+ bool hasReservedCallFrame(MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ void emitFrameMoves(MachineFunction &MF,
+ unsigned FrameLabelId, unsigned ReadyLabelId) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+ int getFrameIndexOffset(MachineFunction &MF, int FI) const;
+ void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub- or
+// super-register of a specific X86 register,
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX.
+unsigned getX86SubSuperRegister(unsigned, MVT, bool High=false);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..d552cb3
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,762 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+let Namespace = "X86" in {
+
+ // In the register alias definitions below, we define which registers alias
+ // which others. We only specify which registers the small registers alias,
+ // because the register file generator is smart enough to figure out that
+  // AL aliases AX if we tell it that AX aliases AL (for example).
+
+  // DWARF numbering is different for 32-bit and 64-bit, and there are
+  // variations by target as well. Currently the first entry is for X86-64,
+  // the second for EH on X86-32/Darwin, and the third is the 'generic' one
+  // (X86-32/Linux and debug information on X86-32/Darwin).
+
+ // 8-bit registers
+ // Low registers
+ def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>;
+ def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>;
+ def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>;
+ def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>;
+
+ // X86-64 only
+ def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>;
+ def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>;
+ def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>;
+ def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>;
+ def R8B : Register<"r8b">, DwarfRegNum<[8, -2, -2]>;
+ def R9B : Register<"r9b">, DwarfRegNum<[9, -2, -2]>;
+ def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>;
+ def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>;
+ def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>;
+ def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>;
+ def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>;
+ def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>;
+
+ // High registers. On x86-64, these cannot be used in any instruction
+ // with a REX prefix.
+ def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>;
+ def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>;
+ def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>;
+ def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>;
+
+ // 16-bit registers
+ def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>;
+ def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>;
+ def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>;
+ def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>;
+ def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>;
+ def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>;
+ def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>;
+ def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>;
+ def IP : Register<"ip">, DwarfRegNum<[16]>;
+
+ // X86-64 only
+ def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>;
+ def R9W : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>;
+ def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>;
+ def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>;
+ def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>;
+ def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>;
+ def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>;
+ def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>;
+
+ // 32-bit registers
+ def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>;
+ def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>;
+ def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>;
+ def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>;
+ def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>;
+ def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>;
+ def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>;
+ def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>;
+ def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>;
+
+ // X86-64 only
+ def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>;
+ def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>;
+ def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>;
+ def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>;
+ def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>;
+ def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>;
+ def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>;
+ def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>;
+
+ // 64-bit registers, X86-64 only
+ def RAX : RegisterWithSubRegs<"rax", [EAX]>, DwarfRegNum<[0, -2, -2]>;
+ def RDX : RegisterWithSubRegs<"rdx", [EDX]>, DwarfRegNum<[1, -2, -2]>;
+ def RCX : RegisterWithSubRegs<"rcx", [ECX]>, DwarfRegNum<[2, -2, -2]>;
+ def RBX : RegisterWithSubRegs<"rbx", [EBX]>, DwarfRegNum<[3, -2, -2]>;
+ def RSI : RegisterWithSubRegs<"rsi", [ESI]>, DwarfRegNum<[4, -2, -2]>;
+ def RDI : RegisterWithSubRegs<"rdi", [EDI]>, DwarfRegNum<[5, -2, -2]>;
+ def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>;
+ def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+ def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>;
+ def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>;
+ def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>;
+ def R11 : RegisterWithSubRegs<"r11", [R11D]>, DwarfRegNum<[11, -2, -2]>;
+ def R12 : RegisterWithSubRegs<"r12", [R12D]>, DwarfRegNum<[12, -2, -2]>;
+ def R13 : RegisterWithSubRegs<"r13", [R13D]>, DwarfRegNum<[13, -2, -2]>;
+ def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>;
+ def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>;
+ def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>;
+
+ // MMX Registers. These are actually aliased to ST0 .. ST7
+ def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>;
+ def MM1 : Register<"mm1">, DwarfRegNum<[42, 30, 30]>;
+ def MM2 : Register<"mm2">, DwarfRegNum<[43, 31, 31]>;
+ def MM3 : Register<"mm3">, DwarfRegNum<[44, 32, 32]>;
+ def MM4 : Register<"mm4">, DwarfRegNum<[45, 33, 33]>;
+ def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>;
+ def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>;
+ def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>;
+
+ // Pseudo Floating Point registers
+ def FP0 : Register<"fp0">;
+ def FP1 : Register<"fp1">;
+ def FP2 : Register<"fp2">;
+ def FP3 : Register<"fp3">;
+ def FP4 : Register<"fp4">;
+ def FP5 : Register<"fp5">;
+ def FP6 : Register<"fp6">;
+
+ // XMM Registers, used by the various SSE instruction set extensions
+ def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>;
+ def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>;
+ def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>;
+ def XMM3: Register<"xmm3">, DwarfRegNum<[20, 24, 24]>;
+ def XMM4: Register<"xmm4">, DwarfRegNum<[21, 25, 25]>;
+ def XMM5: Register<"xmm5">, DwarfRegNum<[22, 26, 26]>;
+ def XMM6: Register<"xmm6">, DwarfRegNum<[23, 27, 27]>;
+ def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>;
+
+ // X86-64 only
+ def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>;
+ def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>;
+ def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>;
+ def XMM11: Register<"xmm11">, DwarfRegNum<[28, -2, -2]>;
+ def XMM12: Register<"xmm12">, DwarfRegNum<[29, -2, -2]>;
+ def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
+ def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
+ def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
+
+ // Floating point stack registers
+ def ST0 : Register<"st(0)">, DwarfRegNum<[33, 12, 11]>;
+ def ST1 : Register<"st(1)">, DwarfRegNum<[34, 13, 12]>;
+ def ST2 : Register<"st(2)">, DwarfRegNum<[35, 14, 13]>;
+ def ST3 : Register<"st(3)">, DwarfRegNum<[36, 15, 14]>;
+ def ST4 : Register<"st(4)">, DwarfRegNum<[37, 16, 15]>;
+ def ST5 : Register<"st(5)">, DwarfRegNum<[38, 17, 16]>;
+ def ST6 : Register<"st(6)">, DwarfRegNum<[39, 18, 17]>;
+ def ST7 : Register<"st(7)">, DwarfRegNum<[40, 19, 18]>;
+
+ // Status flags register
+ def EFLAGS : Register<"flags">;
+
+ // Segment registers
+ def CS : Register<"cs">;
+ def DS : Register<"ds">;
+ def SS : Register<"ss">;
+ def ES : Register<"es">;
+ def FS : Register<"fs">;
+ def GS : Register<"gs">;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Subregister Set Definitions... now that we have all of the pieces, define the
+// subregisters for each register.
+//
+
+def x86_subreg_8bit : PatLeaf<(i32 1)>;
+def x86_subreg_8bit_hi : PatLeaf<(i32 2)>;
+def x86_subreg_16bit : PatLeaf<(i32 3)>;
+def x86_subreg_32bit : PatLeaf<(i32 4)>;
+
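+// Each SubRegSet index below corresponds to one of the x86_subreg_* leaf
+// values defined above (1 = low 8 bits, 2 = high 8 bits, 3 = low 16 bits,
+// 4 = low 32 bits).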
+def : SubRegSet<1, [AX, CX, DX, BX, SP, BP, SI, DI,
+ R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W],
+ [AL, CL, DL, BL, SPL, BPL, SIL, DIL,
+ R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [AX, CX, DX, BX],
+ [AH, CH, DH, BH]>;
+
+def : SubRegSet<1, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+ R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
+ [AL, CL, DL, BL, SPL, BPL, SIL, DIL,
+ R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [EAX, ECX, EDX, EBX],
+ [AH, CH, DH, BH]>;
+
+def : SubRegSet<3, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+ R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
+ [AX, CX, DX, BX, SP, BP, SI, DI,
+ R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
+
+def : SubRegSet<1, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+ R8, R9, R10, R11, R12, R13, R14, R15],
+ [AL, CL, DL, BL, SPL, BPL, SIL, DIL,
+ R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [RAX, RCX, RDX, RBX],
+ [AH, CH, DH, BH]>;
+
+def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+ R8, R9, R10, R11, R12, R13, R14, R15],
+ [AX, CX, DX, BX, SP, BP, SI, DI,
+ R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
+
+def : SubRegSet<4, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+ R8, R9, R10, R11, R12, R13, R14, R15],
+ [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+ R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-bit mode, there are 12 additional i8 registers: SIL, DIL, BPL, SPL, and
+// R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8], 8,
+ [AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
+ R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SPL or BPL.
+ static const unsigned X86_GR8_AO_64_fp[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+ X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+ X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B
+ };
+ // If not, just don't allocate SPL.
+ static const unsigned X86_GR8_AO_64[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+ X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+ X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL
+ };
+    // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+ static const unsigned X86_GR8_AO_32[] = {
+ X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH
+ };
+
+ GR8Class::iterator
+ GR8Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_AO_32;
+ else if (RI->hasFP(MF))
+ return X86_GR8_AO_64_fp;
+ else
+ return X86_GR8_AO_64;
+ }
+
+ GR8Class::iterator
+ GR8Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_AO_32 + (sizeof(X86_GR8_AO_32) / sizeof(unsigned));
+ else if (RI->hasFP(MF))
+ return X86_GR8_AO_64_fp + (sizeof(X86_GR8_AO_64_fp) / sizeof(unsigned));
+ else
+ return X86_GR8_AO_64 + (sizeof(X86_GR8_AO_64) / sizeof(unsigned));
+ }
+ }];
+}
+
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+ [AX, CX, DX, SI, DI, BX, BP, SP,
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
+ let SubRegClassList = [GR8, GR8];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SP or BP.
+ static const unsigned X86_GR16_AO_64_fp[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+ X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+ X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W
+ };
+ static const unsigned X86_GR16_AO_32_fp[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX
+ };
+ // If not, just don't allocate SP.
+ static const unsigned X86_GR16_AO_64[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+ X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+ X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP
+ };
+ static const unsigned X86_GR16_AO_32[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP
+ };
+
+ GR16Class::iterator
+ GR16Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_64_fp;
+ else
+ return X86_GR16_AO_64;
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_32_fp;
+ else
+ return X86_GR16_AO_32;
+ }
+ }
+
+ GR16Class::iterator
+ GR16Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_64_fp+(sizeof(X86_GR16_AO_64_fp)/sizeof(unsigned));
+ else
+ return X86_GR16_AO_64 + (sizeof(X86_GR16_AO_64) / sizeof(unsigned));
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_32_fp+(sizeof(X86_GR16_AO_32_fp)/sizeof(unsigned));
+ else
+ return X86_GR16_AO_32 + (sizeof(X86_GR16_AO_32) / sizeof(unsigned));
+ }
+ }
+ }];
+}
+
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+ let SubRegClassList = [GR8, GR8, GR16];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate ESP or EBP.
+ static const unsigned X86_GR32_AO_64_fp[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D
+ };
+ static const unsigned X86_GR32_AO_32_fp[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX
+ };
+ // If not, just don't allocate ESP.
+ static const unsigned X86_GR32_AO_64[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
+ };
+ static const unsigned X86_GR32_AO_32[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP
+ };
+
+ GR32Class::iterator
+ GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_64_fp;
+ else
+ return X86_GR32_AO_64;
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_32_fp;
+ else
+ return X86_GR32_AO_32;
+ }
+ }
+
+ GR32Class::iterator
+ GR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_64_fp+(sizeof(X86_GR32_AO_64_fp)/sizeof(unsigned));
+ else
+ return X86_GR32_AO_64 + (sizeof(X86_GR32_AO_64) / sizeof(unsigned));
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_32_fp+(sizeof(X86_GR32_AO_32_fp)/sizeof(unsigned));
+ else
+ return X86_GR32_AO_32 + (sizeof(X86_GR32_AO_32) / sizeof(unsigned));
+ }
+ }
+ }];
+}
+
+
+def GR64 : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP]> {
+ let SubRegClassList = [GR8, GR8, GR16, GR32];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64Class::iterator
+ GR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return begin(); // None of these are allocatable in 32-bit.
+ if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
+ return end()-2; // If so, don't allocate RSP or RBP
+ else
+ return end()-1; // If not, just don't allocate RSP
+ }
+ }];
+}
+
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a", "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]> {
+}
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, [AH, CH, DH, BH]> {
+}
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> {
+ let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H];
+}
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> {
+ let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD];
+}
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> {
+ let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD];
+}
+
+// GR8_NOREX, GR16_NOREX, GR32_NOREX, GR64_NOREX - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain only the first 8 GPRs.
+// On x86-64, GR64_NOREX, GR32_NOREX and GR16_NOREX are the classes
+// of registers which do not by themselves require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+ [AL, CL, DL, BL, AH, CH, DH, BH,
+ SIL, DIL, BPL, SPL]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SPL or BPL.
+ static const unsigned X86_GR8_NOREX_AO_64_fp[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL, X86::BL
+ };
+ // If not, just don't allocate SPL.
+ static const unsigned X86_GR8_NOREX_AO_64[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL, X86::BL, X86::BPL
+ };
+    // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+ static const unsigned X86_GR8_NOREX_AO_32[] = {
+ X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH
+ };
+
+ GR8_NOREXClass::iterator
+ GR8_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_NOREX_AO_32;
+ else if (RI->hasFP(MF))
+ return X86_GR8_NOREX_AO_64_fp;
+ else
+ return X86_GR8_NOREX_AO_64;
+ }
+
+ GR8_NOREXClass::iterator
+ GR8_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_NOREX_AO_32 +
+ (sizeof(X86_GR8_NOREX_AO_32) / sizeof(unsigned));
+ else if (RI->hasFP(MF))
+ return X86_GR8_NOREX_AO_64_fp +
+ (sizeof(X86_GR8_NOREX_AO_64_fp) / sizeof(unsigned));
+ else
+ return X86_GR8_NOREX_AO_64 +
+ (sizeof(X86_GR8_NOREX_AO_64) / sizeof(unsigned));
+ }
+ }];
+}
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+ [AX, CX, DX, SI, DI, BX, BP, SP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SP or BP.
+ static const unsigned X86_GR16_AO_fp[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX
+ };
+ // If not, just don't allocate SP.
+ static const unsigned X86_GR16_AO[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP
+ };
+
+ GR16_NOREXClass::iterator
+ GR16_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_fp;
+ else
+ return X86_GR16_AO;
+ }
+
+ GR16_NOREXClass::iterator
+ GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_fp+(sizeof(X86_GR16_AO_fp)/sizeof(unsigned));
+ else
+ return X86_GR16_AO + (sizeof(X86_GR16_AO) / sizeof(unsigned));
+ }
+ }];
+}
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate ESP or EBP.
+ static const unsigned X86_GR32_NOREX_AO_fp[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX
+ };
+ // If not, just don't allocate ESP.
+ static const unsigned X86_GR32_NOREX_AO[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP
+ };
+
+ GR32_NOREXClass::iterator
+ GR32_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR32_NOREX_AO_fp;
+ else
+ return X86_GR32_NOREX_AO;
+ }
+
+ GR32_NOREXClass::iterator
+ GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR32_NOREX_AO_fp +
+ (sizeof(X86_GR32_NOREX_AO_fp) / sizeof(unsigned));
+ else
+ return X86_GR32_NOREX_AO +
+ (sizeof(X86_GR32_NOREX_AO) / sizeof(unsigned));
+ }
+ }];
+}
+
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate RSP or RBP.
+ static const unsigned X86_GR64_NOREX_AO_fp[] = {
+ X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX
+ };
+ // If not, just don't allocate RSP.
+ static const unsigned X86_GR64_NOREX_AO[] = {
+ X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX, X86::RBP
+ };
+
+ GR64_NOREXClass::iterator
+ GR64_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR64_NOREX_AO_fp;
+ else
+ return X86_GR64_NOREX_AO;
+ }
+
+ GR64_NOREXClass::iterator
+ GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR64_NOREX_AO_fp +
+ (sizeof(X86_GR64_NOREX_AO_fp) / sizeof(unsigned));
+ else
+ return X86_GR64_NOREX_AO +
+ (sizeof(X86_GR64_NOREX_AO) / sizeof(unsigned));
+ }
+ }];
+}
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GRAD : RegisterClass<"X86", [i32], 32, [EAX, EDX]>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FR32Class::iterator
+ FR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+def FR64 : RegisterClass<"X86", [f64], 64,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FR64Class::iterator
+ FR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values. This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware. In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP64 : RegisterClass<"X86",[f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32,
+ [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ RSTClass::iterator
+ RSTClass::allocation_order_end(const MachineFunction &MF) const {
+ return begin();
+ }
+ }];
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64 : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+ [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ VR128Class::iterator
+ VR128Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+}
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..b225f48
--- /dev/null
+++ b/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,42 @@
+//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace X86 {
+ /// RelocationType - An enum for the x86 relocation codes. Note that
+ /// the terminology here doesn't follow x86 convention - word means
+ /// 32-bit and dword means 64-bit.
+ enum RelocationType {
+ // reloc_pcrel_word - PC relative relocation, add the relocated value to
+ // the value already in memory, after we adjust it for where the PC is.
+ reloc_pcrel_word = 0,
+
+ // reloc_picrel_word - PIC base relative relocation, add the relocated
+ // value to the value already in memory, after we adjust it for where the
+ // PIC base is.
+ reloc_picrel_word = 1,
+
+ // reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
+ // add the relocated value to the value already in memory.
+ reloc_absolute_word = 2,
+ reloc_absolute_dword = 3
+ };
+ }
+}
+
+#endif
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..03ce1ae
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,446 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "subtarget"
+#include "X86Subtarget.h"
+#include "X86GenSubtarget.inc"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#if defined(_MSC_VER)
+ #include <intrin.h>
+#endif
+
+static cl::opt<X86Subtarget::AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::Unset),
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(
+ clEnumValN(X86Subtarget::ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(X86Subtarget::Intel, "intel", "Emit Intel-style assembly"),
+ clEnumValEnd));
+
+
+/// True if accessing the GV requires an extra load. For Windows, dllimported
+/// symbols are indirect, loading the value at address GV rather than the
+/// value of GV itself. This means that the GlobalAddress must be in the base
+/// or index register of the address, not the GV offset field.
+bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV,
+ const TargetMachine& TM,
+ bool isDirectCall) const
+{
+ // FIXME: PIC
+ if (TM.getRelocationModel() != Reloc::Static &&
+ TM.getCodeModel() != CodeModel::Large) {
+ if (isTargetDarwin()) {
+ if (isDirectCall)
+ return false;
+ bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
+ if (GV->hasHiddenVisibility() &&
+ (Is64Bit || (!isDecl && !GV->hasCommonLinkage())))
+ // If symbol visibility is hidden, the extra load is not needed if
+ // target is x86-64 or the symbol is definitely defined in the current
+ // translation unit.
+ return false;
+ return !isDirectCall && (isDecl || GV->isWeakForLinker());
+ } else if (isTargetELF()) {
+ // Extra load is needed for all externally visible.
+ if (isDirectCall)
+ return false;
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return false;
+ return true;
+ } else if (isTargetCygMing() || isTargetWindows()) {
+ return (GV->hasDLLImportLinkage());
+ }
+ }
+ return false;
+}
+
+/// True if accessing the GV requires a register. This is a superset of the
+/// cases where GVRequiresExtraLoad is true. Some variations of PIC require
+/// a register, but not an extra load.
+bool X86Subtarget::GVRequiresRegister(const GlobalValue *GV,
+ const TargetMachine& TM,
+ bool isDirectCall) const
+{
+ if (GVRequiresExtraLoad(GV, TM, isDirectCall))
+ return true;
+  // The code below only needs to consider cases where GVRequiresExtraLoad
+ // returns false.
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return !isDirectCall &&
+ (GV->hasLocalLinkage() || GV->hasExternalLinkage());
+ return false;
+}
+
+/// getBZeroEntry - This function returns the name of a function which has an
+/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered preferable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+ // Darwin 10 has a __bzero entry point for this purpose.
+ if (getDarwinVers() >= 10)
+ return "__bzero";
+
+ return 0;
+}
+
+/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+/// to an immediate address.
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+ if (Is64Bit)
+ return false;
+ return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+/// getSpecialAddressLatency - For targets where it is beneficial to
+/// backschedule instructions that compute addresses, return a value
+/// indicating the number of scheduling cycles of backscheduling that
+/// should be attempted.
+unsigned X86Subtarget::getSpecialAddressLatency() const {
+ // For x86 out-of-order targets, back-schedule address computations so
+ // that loads and stores aren't blocked.
+ // This value was chosen arbitrarily.
+ return 200;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments. If we can't run cpuid on the host, return true.
+bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+ unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64)
+ #if defined(__GNUC__)
+    // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
+ asm ("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(_MSC_VER)
+ int registers[4];
+ __cpuid(registers, value);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
+ return false;
+ #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ #if defined(__GNUC__)
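+    // As above, preserve ebx around cpuid; gcc may reserve it (e.g. as the
+    // PIC base register in 32-bit PIC code).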
+ asm ("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(_MSC_VER)
+ __asm {
+ mov eax,value
+ cpuid
+ mov esi,rEAX
+ mov dword ptr [esi],eax
+ mov esi,rEBX
+ mov dword ptr [esi],ebx
+ mov esi,rECX
+ mov dword ptr [esi],ecx
+ mov esi,rEDX
+ mov dword ptr [esi],edx
+ }
+ return false;
+ #endif
+#endif
+ return true;
+}
+
+static void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model) {
+ Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+ Model = (EAX >> 4) & 0xf; // Bits 4 - 7
+ if (Family == 6 || Family == 0xf) {
+ if (Family == 0xf)
+ // Examine extended family ID if family ID is F.
+ Family += (EAX >> 20) & 0xff; // Bits 20 - 27
+ // Examine extended model ID if family ID is 6 or F.
+ Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+ }
+}
+
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
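+  // CPUID function 0 returns the vendor string in EBX, EDX, ECX order; the
+  // output pointers below are permuted so that text.c holds the string in
+  // its natural order (e.g. "GenuineIntel").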
+ if (X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
+ return;
+
+ X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+
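+  // Feature flags from CPUID function 1: EDX bit 23 = MMX, bit 25 = SSE,
+  // bit 26 = SSE2; ECX bit 0 = SSE3, bit 9 = SSSE3, bit 19 = SSE4.1,
+  // bit 20 = SSE4.2.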
+ if ((EDX >> 23) & 0x1) X86SSELevel = MMX;
+ if ((EDX >> 25) & 0x1) X86SSELevel = SSE1;
+ if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
+ if (ECX & 0x1) X86SSELevel = SSE3;
+ if ((ECX >> 9) & 0x1) X86SSELevel = SSSE3;
+ if ((ECX >> 19) & 0x1) X86SSELevel = SSE41;
+ if ((ECX >> 20) & 0x1) X86SSELevel = SSE42;
+
+ bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
+ bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
+ if (IsIntel || IsAMD) {
+ // Determine if bit test memory instructions are slow.
+ unsigned Family = 0;
+ unsigned Model = 0;
+ DetectFamilyModel(EAX, Family, Model);
+ IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13);
+
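+    // Extended feature flags (CPUID function 0x80000001): EDX bit 29 is
+    // long mode (x86-64), ECX bit 6 is SSE4A.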
+ X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ HasX86_64 = (EDX >> 29) & 0x1;
+ HasSSE4A = IsAMD && ((ECX >> 6) & 0x1);
+ }
+}
+
+static const char *GetCurrentX86CPU() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ if (X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+ return "generic";
+ unsigned Family = 0;
+ unsigned Model = 0;
+ DetectFamilyModel(EAX, Family, Model);
+
+ X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ bool Em64T = (EDX >> 29) & 0x1;
+ bool HasSSE3 = (ECX & 0x1);
+
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+ if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+ switch (Family) {
+ case 3:
+ return "i386";
+ case 4:
+ return "i486";
+ case 5:
+ switch (Model) {
+ case 4: return "pentium-mmx";
+ default: return "pentium";
+ }
+ case 6:
+ switch (Model) {
+ case 1: return "pentiumpro";
+ case 3:
+ case 5:
+ case 6: return "pentium2";
+ case 7:
+ case 8:
+ case 10:
+ case 11: return "pentium3";
+ case 9:
+ case 13: return "pentium-m";
+ case 14: return "yonah";
+ case 15:
+ case 22: // Celeron M 540
+ return "core2";
+      case 23: // 45nm: Penryn, Wolfdale, Yorkfield (XE)
+ return "penryn";
+ default: return "i686";
+ }
+ case 15: {
+ switch (Model) {
+ case 3:
+ case 4:
+ case 6: // same as 4, but 65nm
+ return (Em64T) ? "nocona" : "prescott";
+ case 26:
+ return "corei7";
+ case 28:
+ return "atom";
+ default:
+ return (Em64T) ? "x86-64" : "pentium4";
+ }
+ }
+
+ default:
+ return "generic";
+ }
+ } else if (memcmp(text.c, "AuthenticAMD", 12) == 0) {
+ // FIXME: this poorly matches the generated SubtargetFeatureKV table. There
+ // appears to be no way to generate the wide variety of AMD-specific targets
+ // from the information returned from CPUID.
+ switch (Family) {
+ case 4:
+ return "i486";
+ case 5:
+ switch (Model) {
+ case 6:
+ case 7: return "k6";
+ case 8: return "k6-2";
+ case 9:
+ case 13: return "k6-3";
+ default: return "pentium";
+ }
+ case 6:
+ switch (Model) {
+ case 4: return "athlon-tbird";
+ case 6:
+ case 7:
+ case 8: return "athlon-mp";
+ case 10: return "athlon-xp";
+ default: return "athlon";
+ }
+ case 15:
+ if (HasSSE3) {
+ switch (Model) {
+ default: return "k8-sse3";
+ }
+ } else {
+ switch (Model) {
+ case 1: return "opteron";
+ case 5: return "athlon-fx"; // also opteron
+ default: return "athlon64";
+ }
+ }
+ case 16:
+ switch (Model) {
+ default: return "amdfam10";
+ }
+ default:
+ return "generic";
+ }
+ } else {
+ return "generic";
+ }
+}
+
+X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
+ : AsmFlavor(AsmWriterFlavor)
+ , PICStyle(PICStyles::None)
+ , X86SSELevel(NoMMXSSE)
+ , X863DNowLevel(NoThreeDNow)
+ , HasX86_64(false)
+ , IsBTMemSlow(false)
+ , DarwinVers(0)
+ , IsLinux(false)
+ , stackAlignment(8)
+ // FIXME: this is a known good value for Yonah. How about others?
+ , MaxInlineSizeThreshold(128)
+ , Is64Bit(is64Bit)
+ , TargetType(isELF) { // Default to ELF unless otherwise specified.
+
+ // Determine default and user specified characteristics
+ if (!FS.empty()) {
+ // If feature string is not empty, parse features string.
+ std::string CPU = GetCurrentX86CPU();
+ ParseSubtargetFeatures(FS, CPU);
+ // All X86-64 CPUs also have SSE2, however user might request no SSE via
+ // -mattr, so don't force SSELevel here.
+ } else {
+ // Otherwise, use CPUID to auto-detect feature set.
+ AutoDetectSubtargetFeatures();
+ // Make sure SSE2 is enabled; it is available on all X86-64 CPUs.
+ if (Is64Bit && X86SSELevel < SSE2)
+ X86SSELevel = SSE2;
+ }
+
+ // If requesting codegen for X86-64, make sure that 64-bit features
+ // are enabled.
+ if (Is64Bit)
+ HasX86_64 = true;
+
+ DOUT << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel
+ << ", 64bit " << HasX86_64 << "\n";
+ assert((!Is64Bit || HasX86_64) &&
+ "64-bit code requested on a subtarget that doesn't support it!");
+
+ // Set the boolean corresponding to the current target triple, or the default
+ // if one cannot be determined, to true.
+ const std::string& TT = M.getTargetTriple();
+ if (TT.length() > 5) {
+ size_t Pos;
+ if ((Pos = TT.find("-darwin")) != std::string::npos) {
+ TargetType = isDarwin;
+
+ // Compute the darwin version number.
+ if (isdigit(TT[Pos+7]))
+ DarwinVers = atoi(&TT[Pos+7]);
+ else
+ DarwinVers = 8; // Minimum supported darwin is Tiger.
+ } else if (TT.find("linux") != std::string::npos) {
+ // Linux doesn't imply ELF, but we don't currently support anything else.
+ TargetType = isELF;
+ IsLinux = true;
+ } else if (TT.find("cygwin") != std::string::npos) {
+ TargetType = isCygwin;
+ } else if (TT.find("mingw") != std::string::npos) {
+ TargetType = isMingw;
+ } else if (TT.find("win32") != std::string::npos) {
+ TargetType = isWindows;
+ } else if (TT.find("windows") != std::string::npos) {
+ TargetType = isWindows;
+    } else if (TT.find("-cl") != std::string::npos) {
+ TargetType = isDarwin;
+ DarwinVers = 9;
+ }
+ } else if (TT.empty()) {
+#if defined(__CYGWIN__)
+ TargetType = isCygwin;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+ TargetType = isMingw;
+#elif defined(__APPLE__)
+ TargetType = isDarwin;
+#if __APPLE_CC__ > 5400
+ DarwinVers = 9; // GCC 5400+ is Leopard.
+#else
+ DarwinVers = 8; // Minimum supported darwin is Tiger.
+#endif
+
+#elif defined(_WIN32) || defined(_WIN64)
+ TargetType = isWindows;
+#elif defined(__linux__)
+ // Linux doesn't imply ELF, but we don't currently support anything else.
+ TargetType = isELF;
+ IsLinux = true;
+#endif
+ }
+
+ // If the asm syntax hasn't been overridden on the command line, use whatever
+ // the target wants.
+ if (AsmFlavor == X86Subtarget::Unset) {
+ AsmFlavor = (TargetType == isWindows)
+ ? X86Subtarget::Intel : X86Subtarget::ATT;
+ }
+
+ // Stack alignment is 16 bytes on Darwin (both 32 and 64 bit) and for all 64
+ // bit targets.
+ if (TargetType == isDarwin || Is64Bit)
+ stackAlignment = 16;
+
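+  // An explicit StackAlignment override (from TargetOptions) takes
+  // precedence over the target default.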
+ if (StackAlignment)
+ stackAlignment = StackAlignment;
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..46476f2
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,224 @@
+//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+class Module;
+class GlobalValue;
+class TargetMachine;
+
+namespace PICStyles {
+enum Style {
+ Stub, GOT, RIPRel, WinPIC, None
+};
+}
+
+class X86Subtarget : public TargetSubtarget {
+public:
+ enum AsmWriterFlavorTy {
+ // Note: This numbering has to match the GCC assembler dialects for inline
+ // asm alternatives to work right.
+ ATT = 0, Intel = 1, Unset
+ };
+protected:
+ enum X86SSEEnum {
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, ThreeDNow, ThreeDNowA
+ };
+
+ /// AsmFlavor - Which x86 asm dialect to use.
+ ///
+ AsmWriterFlavorTy AsmFlavor;
+
+ /// PICStyle - Which PIC style to use
+ ///
+ PICStyles::Style PICStyle;
+
+ /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
+ /// none supported.
+ X86SSEEnum X86SSELevel;
+
+ /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+ ///
+ X863DNowEnum X863DNowLevel;
+
+ /// HasX86_64 - True if the processor supports X86-64 instructions.
+ ///
+ bool HasX86_64;
+
+ /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+ bool IsBTMemSlow;
+
+ /// HasSSE4A - True if the processor supports SSE4A instructions.
+ bool HasSSE4A;
+
+ /// DarwinVers - Nonzero if this is a darwin platform: the numeric
+ /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
+ unsigned char DarwinVers; // Is any darwin-x86 platform.
+
+ /// isLinux - true if this is a "linux" platform.
+ bool IsLinux;
+
+ /// stackAlignment - The minimum alignment known to hold of the stack frame on
+ /// entry to the function and which must be maintained by every function.
+ unsigned stackAlignment;
+
+ /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ ///
+ unsigned MaxInlineSizeThreshold;
+
+private:
+ /// Is64Bit - True if the processor supports 64-bit instructions and module
+ /// pointer size is 64 bit.
+ bool Is64Bit;
+
+public:
+ enum {
+ isELF, isCygwin, isDarwin, isWindows, isMingw
+ } TargetType;
+
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ ///
+ X86Subtarget(const Module &M, const std::string &FS, bool is64Bit);
+
+ /// getStackAlignment - Returns the minimum alignment known to hold of the
+ /// stack frame on entry to the function and which must be maintained by every
+ /// function for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+ /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+ /// instruction.
+ void AutoDetectSubtargetFeatures();
+
+ bool is64Bit() const { return Is64Bit; }
+
+ PICStyles::Style getPICStyle() const { return PICStyle; }
+ void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
+
+ bool hasMMX() const { return X86SSELevel >= MMX; }
+ bool hasSSE1() const { return X86SSELevel >= SSE1; }
+ bool hasSSE2() const { return X86SSELevel >= SSE2; }
+ bool hasSSE3() const { return X86SSELevel >= SSE3; }
+ bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+ bool hasSSE41() const { return X86SSELevel >= SSE41; }
+ bool hasSSE42() const { return X86SSELevel >= SSE42; }
+ bool hasSSE4A() const { return HasSSE4A; }
+ bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+
+ bool isBTMemSlow() const { return IsBTMemSlow; }
+
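+  /// getAsmFlavor - Return the asm dialect in use, defaulting to AT&T (0)
+  /// when no flavor has been explicitly selected.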
+ unsigned getAsmFlavor() const {
+ return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
+ }
+
+ bool isFlavorAtt() const { return AsmFlavor == ATT; }
+ bool isFlavorIntel() const { return AsmFlavor == Intel; }
+
+ bool isTargetDarwin() const { return TargetType == isDarwin; }
+ bool isTargetELF() const {
+ return TargetType == isELF;
+ }
+ bool isTargetWindows() const { return TargetType == isWindows; }
+ bool isTargetMingw() const { return TargetType == isMingw; }
+ bool isTargetCygMing() const { return (TargetType == isMingw ||
+ TargetType == isCygwin); }
+ bool isTargetCygwin() const { return TargetType == isCygwin; }
+ bool isTargetWin64() const {
+ return (Is64Bit && (TargetType == isMingw || TargetType == isWindows));
+ }
+
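+  /// getDataLayout - Return the data layout string (endianness, pointer
+  /// size, and type alignments) appropriate for this subtarget.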
+ std::string getDataLayout() const {
+ const char *p;
+ if (is64Bit())
+ p = "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128";
+ else {
+ if (isTargetDarwin())
+ p = "e-p:32:32-f64:32:64-i64:32:64-f80:128:128";
+ else
+ p = "e-p:32:32-f64:32:64-i64:32:64-f80:32:32";
+ }
+ return std::string(p);
+ }
+
+ bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
+ bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+ bool isPICStyleStub() const { return PICStyle == PICStyles::Stub; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+  bool isPICStyleWinPIC() const { return PICStyle == PICStyles::WinPIC; }
+
+ /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard.
+ unsigned getDarwinVers() const { return DarwinVers; }
+
+ /// isLinux - Return true if the target is "Linux".
+ bool isLinux() const { return IsLinux; }
+
+ /// True if accessing the GV requires an extra load. For Windows, dllimported
+  /// symbols are indirect, loading the value at address GV rather than the
+ /// value of GV itself. This means that the GlobalAddress must be in the base
+ /// or index register of the address, not the GV offset field.
+ bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM,
+ bool isDirectCall) const;
+
+ /// True if accessing the GV requires a register. This is a superset of the
+ /// cases where GVRequiresExtraLoad is true. Some variations of PIC require
+ /// a register, but not an extra load.
+ bool GVRequiresRegister(const GlobalValue* GV, const TargetMachine& TM,
+ bool isDirectCall) const;
+
+ /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+  /// to an immediate address.
+ bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+ /// This function returns the name of a function which has an interface
+ /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered preferable over
+ /// memset with zero passed as the second argument. Otherwise it
+ /// returns null.
+ const char *getBZeroEntry() const;
+
+ /// getSpecialAddressLatency - For targets where it is beneficial to
+ /// backschedule instructions that compute addresses, return a value
+ /// indicating the number of scheduling cycles of backscheduling that
+ /// should be attempted.
+ unsigned getSpecialAddressLatency() const;
+};
+
+namespace X86 {
+ /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+ /// the specified arguments. If we can't run cpuid on the host, return true.
+ bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+ unsigned *rECX, unsigned *rEDX);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86TargetAsmInfo.cpp b/lib/Target/X86/X86TargetAsmInfo.cpp
new file mode 100644
index 0000000..5dda5f4
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.cpp
@@ -0,0 +1,461 @@
+//===-- X86TargetAsmInfo.cpp - X86 asm properties ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86Subtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Dwarf.h"
+
+using namespace llvm;
+using namespace llvm::dwarf;
+
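+// Pairs mapping GCC inline asm register/constraint names to their LLVM
+// single-letter constraint equivalents; the list is terminated by a null
+// pair.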
+const char *const llvm::x86_asm_table[] = {
+ "{si}", "S",
+ "{di}", "D",
+ "{ax}", "a",
+ "{cx}", "c",
+ "{memory}", "memory",
+ "{flags}", "",
+ "{dirflag}", "",
+ "{fpsr}", "",
+ "{cc}", "cc",
+ 0,0};
+
+X86DarwinTargetAsmInfo::X86DarwinTargetAsmInfo(const X86TargetMachine &TM):
+ X86TargetAsmInfo<DarwinTargetAsmInfo>(TM) {
+ const X86Subtarget* Subtarget = &TM.getSubtarget<X86Subtarget>();
+ bool is64Bit = Subtarget->is64Bit();
+
+ AlignmentIsInBytes = false;
+ TextAlignFillValue = 0x90;
+ GlobalPrefix = "_";
+ if (!is64Bit)
+ Data64bitsDirective = 0; // we can't emit a 64-bit unit
+ ZeroDirective = "\t.space\t"; // ".space N" emits N zeros.
+ PrivateGlobalPrefix = "L"; // Marker for constant pool idxs
+ LessPrivateGlobalPrefix = "l"; // Marker for some ObjC metadata
+ BSSSection = 0; // no BSS section.
+ ZeroFillDirective = "\t.zerofill\t"; // Uses .zerofill
+ if (TM.getRelocationModel() != Reloc::Static)
+ ConstantPoolSection = "\t.const_data";
+ else
+ ConstantPoolSection = "\t.const\n";
+ JumpTableDataSection = "\t.const\n";
+ CStringSection = "\t.cstring";
+  // FIXME: Why don't we always use this section?
+ if (is64Bit) {
+ SixteenByteConstantSection = getUnnamedSection("\t.literal16\n",
+ SectionFlags::Mergeable);
+ }
+ LCOMMDirective = "\t.lcomm\t";
+ SwitchToSectionDirective = "\t.section ";
+ StringConstantPrefix = "\1LC";
+ // Leopard and above support aligned common symbols.
+ COMMDirectiveTakesAlignment = (Subtarget->getDarwinVers() >= 9);
+ HasDotTypeDotSizeDirective = false;
+ HasSingleParameterDotFile = false;
+ NonLocalEHFrameLabel = true;
+ if (TM.getRelocationModel() == Reloc::Static) {
+ StaticCtorsSection = ".constructor";
+ StaticDtorsSection = ".destructor";
+ } else {
+ StaticCtorsSection = ".mod_init_func";
+ StaticDtorsSection = ".mod_term_func";
+ }
+ if (is64Bit) {
+ PersonalityPrefix = "";
+ PersonalitySuffix = "+4@GOTPCREL";
+ } else {
+ PersonalityPrefix = "L";
+ PersonalitySuffix = "$non_lazy_ptr";
+ }
+ NeedsIndirectEncoding = true;
+ InlineAsmStart = "## InlineAsm Start";
+ InlineAsmEnd = "## InlineAsm End";
+ CommentString = "##";
+ SetDirective = "\t.set";
+ PCSymbol = ".";
+ UsedDirective = "\t.no_dead_strip\t";
+ WeakDefDirective = "\t.weak_definition ";
+ WeakRefDirective = "\t.weak_reference ";
+ HiddenDirective = "\t.private_extern ";
+ ProtectedDirective = "\t.globl\t";
+
+ // In non-PIC modes, emit a special label before jump tables so that the
+ // linker can perform more accurate dead code stripping.
+ if (TM.getRelocationModel() != Reloc::PIC_) {
+ // Emit a local label that is preserved until the linker runs.
+ JumpTableSpecialLabelPrefix = "l";
+ }
+
+ SupportsDebugInformation = true;
+ NeedsSet = true;
+ DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+ DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+ DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+ DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+ DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+ DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+ DwarfDebugInlineSection = ".section __DWARF,__debug_inlined,regular,debug";
+ DwarfUsesInlineInfoSection = true;
+ DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+ DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+ DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+ DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+ DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+
+  // Exception handling
+ SupportsExceptionHandling = true;
+ GlobalEHDirective = "\t.globl\t";
+ SupportsWeakOmittedEHFrame = false;
+ AbsoluteEHSectionOffsets = false;
+ DwarfEHFrameSection =
+ ".section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support";
+ DwarfExceptionSection = ".section __DATA,__gcc_except_tab";
+}
+
+unsigned
+X86DarwinTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ if (Reason == DwarfEncoding::Functions && Global)
+ return (DW_EH_PE_pcrel | DW_EH_PE_indirect | DW_EH_PE_sdata4);
+ else if (Reason == DwarfEncoding::CodeLabels || !Global)
+ return DW_EH_PE_pcrel;
+ else
+ return DW_EH_PE_absptr;
+}
+
+const char *
+X86DarwinTargetAsmInfo::getEHGlobalPrefix() const
+{
+ const X86Subtarget* Subtarget = &TM.getSubtarget<X86Subtarget>();
+ if (Subtarget->getDarwinVers() > 9)
+ return PrivateGlobalPrefix;
+ else
+ return "";
+}
+
+X86ELFTargetAsmInfo::X86ELFTargetAsmInfo(const X86TargetMachine &TM):
+ X86TargetAsmInfo<ELFTargetAsmInfo>(TM) {
+
+ CStringSection = ".rodata.str";
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+ PCSymbol = ".";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+
+ // Debug Information
+ AbsoluteDebugSectionOffsets = true;
+ SupportsDebugInformation = true;
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"\",@progbits";
+ DwarfInfoSection = "\t.section\t.debug_info,\"\",@progbits";
+ DwarfLineSection = "\t.section\t.debug_line,\"\",@progbits";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"\",@progbits";
+ DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",@progbits";
+ DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",@progbits";
+ DwarfStrSection = "\t.section\t.debug_str,\"\",@progbits";
+ DwarfLocSection = "\t.section\t.debug_loc,\"\",@progbits";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"\",@progbits";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"\",@progbits";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",@progbits";
+
+  // Exception handling
+ SupportsExceptionHandling = true;
+ AbsoluteEHSectionOffsets = false;
+ DwarfEHFrameSection = "\t.section\t.eh_frame,\"aw\",@progbits";
+ DwarfExceptionSection = "\t.section\t.gcc_except_table,\"a\",@progbits";
+
+ // On Linux we must declare when we can use a non-executable stack.
+ if (TM.getSubtarget<X86Subtarget>().isLinux())
+ NonexecutableStackDirective = "\t.section\t.note.GNU-stack,\"\",@progbits";
+}
+
+unsigned
+X86ELFTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ CodeModel::Model CM = TM.getCodeModel();
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ unsigned Format = 0;
+
+ if (!is64Bit)
+ // 32 bit targets always encode pointers as 4 bytes
+ Format = DW_EH_PE_sdata4;
+ else {
+ // 64 bit targets encode pointers in 4 bytes iff:
+ // - code model is small OR
+ // - code model is medium and we're emitting externally visible symbols
+ // or any code symbols
+ if (CM == CodeModel::Small ||
+ (CM == CodeModel::Medium && (Global ||
+ Reason != DwarfEncoding::Data)))
+ Format = DW_EH_PE_sdata4;
+ else
+ Format = DW_EH_PE_sdata8;
+ }
+
+ if (Global)
+ Format |= DW_EH_PE_indirect;
+
+ return (Format | DW_EH_PE_pcrel);
+ } else {
+ if (is64Bit &&
+ (CM == CodeModel::Small ||
+ (CM == CodeModel::Medium && Reason != DwarfEncoding::Data)))
+ return DW_EH_PE_udata4;
+ else
+ return DW_EH_PE_absptr;
+ }
+}
+
+X86COFFTargetAsmInfo::X86COFFTargetAsmInfo(const X86TargetMachine &TM):
+ X86GenericTargetAsmInfo(TM) {
+
+ GlobalPrefix = "_";
+ LCOMMDirective = "\t.lcomm\t";
+ COMMDirectiveTakesAlignment = false;
+ HasDotTypeDotSizeDirective = false;
+ HasSingleParameterDotFile = false;
+ StaticCtorsSection = "\t.section .ctors,\"aw\"";
+ StaticDtorsSection = "\t.section .dtors,\"aw\"";
+ HiddenDirective = NULL;
+ PrivateGlobalPrefix = "L"; // Prefix for private global symbols
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+ AbsoluteDebugSectionOffsets = true;
+ AbsoluteEHSectionOffsets = false;
+ SupportsDebugInformation = true;
+ DwarfSectionOffsetDirective = "\t.secrel32\t";
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"dr\"";
+ DwarfInfoSection = "\t.section\t.debug_info,\"dr\"";
+ DwarfLineSection = "\t.section\t.debug_line,\"dr\"";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"dr\"";
+ DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"dr\"";
+ DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"dr\"";
+ DwarfStrSection = "\t.section\t.debug_str,\"dr\"";
+ DwarfLocSection = "\t.section\t.debug_loc,\"dr\"";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"dr\"";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"dr\"";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"dr\"";
+}
+
+unsigned
+X86COFFTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ CodeModel::Model CM = TM.getCodeModel();
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ unsigned Format = 0;
+
+ if (!is64Bit)
+ // 32 bit targets always encode pointers as 4 bytes
+ Format = DW_EH_PE_sdata4;
+ else {
+ // 64 bit targets encode pointers in 4 bytes iff:
+ // - code model is small OR
+ // - code model is medium and we're emitting externally visible symbols
+ // or any code symbols
+ if (CM == CodeModel::Small ||
+ (CM == CodeModel::Medium && (Global ||
+ Reason != DwarfEncoding::Data)))
+ Format = DW_EH_PE_sdata4;
+ else
+ Format = DW_EH_PE_sdata8;
+ }
+
+ if (Global)
+ Format |= DW_EH_PE_indirect;
+
+ return (Format | DW_EH_PE_pcrel);
+ } else {
+ if (is64Bit &&
+ (CM == CodeModel::Small ||
+ (CM == CodeModel::Medium && Reason != DwarfEncoding::Data)))
+ return DW_EH_PE_udata4;
+ else
+ return DW_EH_PE_absptr;
+ }
+}
+
+std::string
+X86COFFTargetAsmInfo::UniqueSectionForGlobal(const GlobalValue* GV,
+ SectionKind::Kind kind) const {
+ switch (kind) {
+ case SectionKind::Text:
+ return ".text$linkonce" + GV->getName();
+ case SectionKind::Data:
+ case SectionKind::BSS:
+ case SectionKind::ThreadData:
+ case SectionKind::ThreadBSS:
+ return ".data$linkonce" + GV->getName();
+ case SectionKind::ROData:
+ case SectionKind::RODataMergeConst:
+ case SectionKind::RODataMergeStr:
+ return ".rdata$linkonce" + GV->getName();
+ default:
+ assert(0 && "Unknown section kind");
+ }
+  return "";  // Not reached, but don't construct std::string from NULL.
+}
+
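+// Print the COFF section flag suffix, e.g. ,"xw" for a writable code section.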
+std::string X86COFFTargetAsmInfo::printSectionFlags(unsigned flags) const {
+ std::string Flags = ",\"";
+
+ if (flags & SectionFlags::Code)
+ Flags += 'x';
+ if (flags & SectionFlags::Writeable)
+ Flags += 'w';
+
+ Flags += "\"";
+
+ return Flags;
+}
+
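+// X86WinTargetAsmInfo emits MASM-style assembly: ';' comments, db/dw/dd/dq
+// data directives, and explicit CODE/DATA segments instead of gas-style
+// section directives.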
+X86WinTargetAsmInfo::X86WinTargetAsmInfo(const X86TargetMachine &TM):
+ X86GenericTargetAsmInfo(TM) {
+ GlobalPrefix = "_";
+ CommentString = ";";
+
+ PrivateGlobalPrefix = "$";
+ AlignDirective = "\talign\t";
+ ZeroDirective = "\tdb\t";
+ ZeroDirectiveSuffix = " dup(0)";
+ AsciiDirective = "\tdb\t";
+ AscizDirective = 0;
+ Data8bitsDirective = "\tdb\t";
+ Data16bitsDirective = "\tdw\t";
+ Data32bitsDirective = "\tdd\t";
+ Data64bitsDirective = "\tdq\t";
+ HasDotTypeDotSizeDirective = false;
+ HasSingleParameterDotFile = false;
+
+ TextSection = getUnnamedSection("_text", SectionFlags::Code);
+ DataSection = getUnnamedSection("_data", SectionFlags::Writeable);
+
+ JumpTableDataSection = NULL;
+ SwitchToSectionDirective = "";
+ TextSectionStartSuffix = "\tsegment 'CODE'";
+ DataSectionStartSuffix = "\tsegment 'DATA'";
+ SectionEndDirectiveSuffix = "\tends\n";
+}
+
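+// LowerToBSwap - Replace an inline-asm byte swap with the llvm.bswap
+// intrinsic, e.g.
+//   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
+// becomes
+//   %r = call i32 @llvm.bswap.i32(i32 %x)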
+template <class BaseTAI>
+bool X86TargetAsmInfo<BaseTAI>::LowerToBSwap(CallInst *CI) const {
+  // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical ops
+ // instead of emitting the bswap asm. For now, we don't support 486 or lower
+ // so don't worry about this.
+
+ // Verify this is a simple bswap.
+ if (CI->getNumOperands() != 2 ||
+ CI->getType() != CI->getOperand(1)->getType() ||
+ !CI->getType()->isInteger())
+ return false;
+
+ const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // Okay, we can do this xform, do so now.
+ const Type *Tys[] = { Ty };
+ Module *M = CI->getParent()->getParent()->getParent();
+ Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
+
+ Value *Op = CI->getOperand(1);
+ Op = CallInst::Create(Int, Op, CI->getName(), CI);
+
+ CI->replaceAllUsesWith(Op);
+ CI->eraseFromParent();
+ return true;
+}
+
+template <class BaseTAI>
+bool X86TargetAsmInfo<BaseTAI>::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
+
+ std::string AsmStr = IA->getAsmString();
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ std::vector<std::string> AsmPieces;
+ SplitString(AsmStr, AsmPieces, "\n"); // ; as separator?
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ AsmStr = AsmPieces[0];
+ AsmPieces.clear();
+ SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
+
+ // bswap $0
+ if (AsmPieces.size() == 2 &&
+ (AsmPieces[0] == "bswap" ||
+ AsmPieces[0] == "bswapq" ||
+ AsmPieces[0] == "bswapl") &&
+ (AsmPieces[1] == "$0" ||
+ AsmPieces[1] == "${0:q}")) {
+ // No need to check constraints, nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return LowerToBSwap(CI);
+ }
+ // rorw $$8, ${0:w} --> llvm.bswap.i16
+ if (CI->getType() == Type::Int16Ty &&
+ AsmPieces.size() == 3 &&
+ AsmPieces[0] == "rorw" &&
+ AsmPieces[1] == "$$8," &&
+ AsmPieces[2] == "${0:w}" &&
+ IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
+ return LowerToBSwap(CI);
+ }
+ break;
+ case 3:
+ if (CI->getType() == Type::Int64Ty && Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ std::vector<std::string> Words;
+ SplitString(AsmPieces[0], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
+ Words.clear();
+ SplitString(AsmPieces[1], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
+ Words.clear();
+ SplitString(AsmPieces[2], Words, " \t,");
+ if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
+ Words[2] == "%edx") {
+ return LowerToBSwap(CI);
+ }
+ }
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+// Instantiate default implementation.
+TEMPLATE_INSTANTIATION(class X86TargetAsmInfo<TargetAsmInfo>);
diff --git a/lib/Target/X86/X86TargetAsmInfo.h b/lib/Target/X86/X86TargetAsmInfo.h
new file mode 100644
index 0000000..f89171d
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.h
@@ -0,0 +1,75 @@
+//=====-- X86TargetAsmInfo.h - X86 asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "X86TargetMachine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+#include "llvm/Target/DarwinTargetAsmInfo.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+ extern const char *const x86_asm_table[];
+
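+  // X86TargetAsmInfo is parameterized on its base class so the common X86
+  // asm properties can be layered over TargetAsmInfo, DarwinTargetAsmInfo or
+  // ELFTargetAsmInfo without resorting to multiple inheritance.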
+ template <class BaseTAI>
+ struct X86TargetAsmInfo : public BaseTAI {
+ explicit X86TargetAsmInfo(const X86TargetMachine &TM):
+ BaseTAI(TM) {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+
+ BaseTAI::AsmTransCBE = x86_asm_table;
+ BaseTAI::AssemblerDialect = Subtarget->getAsmFlavor();
+ }
+
+ virtual bool ExpandInlineAsm(CallInst *CI) const;
+
+ private:
+ bool LowerToBSwap(CallInst *CI) const;
+ };
+
+ typedef X86TargetAsmInfo<TargetAsmInfo> X86GenericTargetAsmInfo;
+
+ EXTERN_TEMPLATE_INSTANTIATION(class X86TargetAsmInfo<TargetAsmInfo>);
+
+ struct X86DarwinTargetAsmInfo : public X86TargetAsmInfo<DarwinTargetAsmInfo> {
+ explicit X86DarwinTargetAsmInfo(const X86TargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ virtual const char *getEHGlobalPrefix() const;
+ };
+
+ struct X86ELFTargetAsmInfo : public X86TargetAsmInfo<ELFTargetAsmInfo> {
+ explicit X86ELFTargetAsmInfo(const X86TargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ };
+
+ struct X86COFFTargetAsmInfo : public X86GenericTargetAsmInfo {
+ explicit X86COFFTargetAsmInfo(const X86TargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ virtual std::string UniqueSectionForGlobal(const GlobalValue* GV,
+ SectionKind::Kind kind) const;
+ virtual std::string printSectionFlags(unsigned flags) const;
+ };
+
+ struct X86WinTargetAsmInfo : public X86GenericTargetAsmInfo {
+ explicit X86WinTargetAsmInfo(const X86TargetMachine &TM);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..8264462
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,317 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+/// X86TargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library. In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this. Though it is unused, do not remove it.
+extern "C" int X86TargetMachineModule;
+int X86TargetMachineModule = 0;
+
+// Register the target.
+static RegisterTarget<X86_32TargetMachine>
+X("x86", "32-bit X86: Pentium-Pro and above");
+static RegisterTarget<X86_64TargetMachine>
+Y("x86-64", "64-bit X86: EM64T and AMD64");
+
+// No assembler printer by default
+X86TargetMachine::AsmPrinterCtorFn X86TargetMachine::AsmPrinterCtor = 0;
+
+const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const {
+ if (Subtarget.isFlavorIntel())
+ return new X86WinTargetAsmInfo(*this);
+ else
+ switch (Subtarget.TargetType) {
+ case X86Subtarget::isDarwin:
+ return new X86DarwinTargetAsmInfo(*this);
+ case X86Subtarget::isELF:
+ return new X86ELFTargetAsmInfo(*this);
+ case X86Subtarget::isMingw:
+ case X86Subtarget::isCygwin:
+ return new X86COFFTargetAsmInfo(*this);
+ case X86Subtarget::isWindows:
+ return new X86WinTargetAsmInfo(*this);
+ default:
+ return new X86GenericTargetAsmInfo(*this);
+ }
+}
+
+unsigned X86_32TargetMachine::getJITMatchQuality() {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned X86_64TargetMachine::getJITMatchQuality() {
+#if defined(__x86_64__) || defined(_M_AMD64)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned X86_32TargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "i[3-9]86-*".
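+  // (For example, "i686-pc-linux-gnu" is a strong match.)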
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
+ TT[4] == '-' && TT[1] - '3' < 6)
+ return 20;
+ // If the target triple is something non-X86, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer32)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+unsigned X86_64TargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "x86_64-*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 7 && TT[0] == 'x' && TT[1] == '8' && TT[2] == '6' &&
+ TT[3] == '_' && TT[4] == '6' && TT[5] == '4' && TT[6] == '-')
+ return 20;
+
+ // We strongly match "amd64-*".
+ if (TT.size() >= 6 && TT[0] == 'a' && TT[1] == 'm' && TT[2] == 'd' &&
+ TT[3] == '6' && TT[4] == '4' && TT[5] == '-')
+ return 20;
+
+ // If the target triple is something non-X86-64, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer64)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+X86_32TargetMachine::X86_32TargetMachine(const Module &M, const std::string &FS)
+ : X86TargetMachine(M, FS, false) {
+}
+
+
+X86_64TargetMachine::X86_64TargetMachine(const Module &M, const std::string &FS)
+ : X86TargetMachine(M, FS, true) {
+}
+
+/// X86TargetMachine ctor - Create an X86 architecture model, either ILP32
+/// (32-bit) or 64-bit, depending on is64Bit.
+///
+X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS,
+ bool is64Bit)
+ : Subtarget(M, FS, is64Bit),
+ DataLayout(Subtarget.getDataLayout()),
+ FrameInfo(TargetFrameInfo::StackGrowsDown,
+ Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4),
+ InstrInfo(*this), JITInfo(*this), TLInfo(*this) {
+ DefRelocModel = getRelocationModel();
+ // FIXME: Correctly select PIC model for Win64 stuff
+ if (getRelocationModel() == Reloc::Default) {
+ if (Subtarget.isTargetDarwin() ||
+ (Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64()))
+ setRelocationModel(Reloc::DynamicNoPIC);
+ else
+ setRelocationModel(Reloc::Static);
+ }
+
+  // ELF doesn't have a distinct dynamic-no-PIC model. Dynamic-no-PIC is
+  // defined as a model for code which may be used in static or dynamic
+  // executables but not necessarily a shared library. On ELF we implement
+  // this by using the Static model.
+ if (Subtarget.isTargetELF() &&
+ getRelocationModel() == Reloc::DynamicNoPIC)
+ setRelocationModel(Reloc::Static);
+
+ if (Subtarget.is64Bit()) {
+ // No DynamicNoPIC support under X86-64.
+ if (getRelocationModel() == Reloc::DynamicNoPIC)
+ setRelocationModel(Reloc::PIC_);
+ // Default X86-64 code model is small.
+ if (getCodeModel() == CodeModel::Default)
+ setCodeModel(CodeModel::Small);
+ }
+
+ if (Subtarget.isTargetCygMing())
+ Subtarget.setPICStyle(PICStyles::WinPIC);
+ else if (Subtarget.isTargetDarwin()) {
+ if (Subtarget.is64Bit())
+ Subtarget.setPICStyle(PICStyles::RIPRel);
+ else
+ Subtarget.setPICStyle(PICStyles::Stub);
+ } else if (Subtarget.isTargetELF()) {
+ if (Subtarget.is64Bit())
+ Subtarget.setPICStyle(PICStyles::RIPRel);
+ else
+ Subtarget.setPICStyle(PICStyles::GOT);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool X86TargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Install an instruction selector.
+ PM.add(createX86ISelDag(*this, OptLevel));
+
+ // If we're using Fast-ISel, clean up the mess.
+ if (EnableFastISel)
+ PM.add(createDeadMachineInstructionElimPass());
+
+ // Install a pass to insert x87 FP_REG_KILL instructions, as needed.
+ PM.add(createX87FPRegKillInserterPass());
+
+ return false;
+}
+
+bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Calculate and set max stack object alignment early, so we can decide
+ // whether we will need stack realignment (and thus FP).
+ PM.add(createX86MaxStackAlignmentCalculatorPass());
+ return false; // -print-machineinstr shouldn't print after this.
+}
+
+bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createX86FloatingPointStackifierPass());
+ return true; // -print-machineinstr should print after this.
+}
+
+bool X86TargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(Out, *this, OptLevel, Verbose));
+ return false;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE) {
+ // FIXME: Move this to TargetJITInfo!
+ // On Darwin, do not override 64-bit setting made in X86TargetMachine().
+ if (DefRelocModel == Reloc::Default &&
+ (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit()))
+ setRelocationModel(Reloc::Static);
+
+ // 64-bit JIT places everything in the same buffer except external functions.
+ // On Darwin, use small code model but hack the call instruction for
+ // externals. Elsewhere, do not assume globals are in the lower 4G.
+ if (Subtarget.is64Bit()) {
+ if (Subtarget.isTargetDarwin())
+ setCodeModel(CodeModel::Small);
+ else
+ setCodeModel(CodeModel::Large);
+ }
+
+ PM.add(createX86CodeEmitterPass(*this, MCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE) {
+ // FIXME: Move this to TargetJITInfo!
+ // On Darwin, do not override 64-bit setting made in X86TargetMachine().
+ if (DefRelocModel == Reloc::Default &&
+ (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit()))
+ setRelocationModel(Reloc::Static);
+
+ // 64-bit JIT places everything in the same buffer except external functions.
+ // On Darwin, use small code model but hack the call instruction for
+ // externals. Elsewhere, do not assume globals are in the lower 4G.
+ if (Subtarget.is64Bit()) {
+ if (Subtarget.isTargetDarwin())
+ setCodeModel(CodeModel::Small);
+ else
+ setCodeModel(CodeModel::Large);
+ }
+
+ PM.add(createX86JITCodeEmitterPass(*this, JCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool X86TargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE) {
+ PM.add(createX86CodeEmitterPass(*this, MCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool X86TargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE) {
+ PM.add(createX86JITCodeEmitterPass(*this, JCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+/// symbolicAddressesAreRIPRel - Return true if symbolic addresses are
+/// RIP-relative on this machine, taking into consideration the relocation
+/// model and subtarget. RIP-relative addresses cannot have a separate
+/// base or index register.
+bool X86TargetMachine::symbolicAddressesAreRIPRel() const {
+ return getRelocationModel() != Reloc::Static &&
+ Subtarget.isPICStyleRIPRel();
+}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..ecc1d39
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,124 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "X86.h"
+#include "X86ELFWriterInfo.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86ISelLowering.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class X86TargetMachine : public LLVMTargetMachine {
+ X86Subtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ TargetFrameInfo FrameInfo;
+ X86InstrInfo InstrInfo;
+ X86JITInfo JITInfo;
+ X86TargetLowering TLInfo;
+ X86ELFWriterInfo ELFWriterInfo;
+ Reloc::Model DefRelocModel; // Reloc model before it's overridden.
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+  // To avoid having the target depend on the asmprinter libraries, the
+  // asmprinter library sets this function pointer at startup time if it is
+  // linked in.
+ typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
+ X86TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose);
+ static AsmPrinterCtorFn AsmPrinterCtor;
+
+public:
+ X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+
+ virtual const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual X86JITInfo *getJITInfo() { return &JITInfo; }
+  virtual const X86Subtarget *getSubtargetImpl() const { return &Subtarget; }
+ virtual X86TargetLowering *getTargetLowering() const {
+ return const_cast<X86TargetLowering*>(&TLInfo);
+ }
+ virtual const X86RegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const X86ELFWriterInfo *getELFWriterInfo() const {
+ return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+ }
+
+ static unsigned getModuleMatchQuality(const Module &M);
+ static unsigned getJITMatchQuality();
+
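+  // The asmprinter library calls this at startup, if it is linked in.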
+ static void registerAsmPrinter(AsmPrinterCtorFn F) {
+ AsmPrinterCtor = F;
+ }
+
+ // Set up the pass pipeline.
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE);
+
+ /// symbolicAddressesAreRIPRel - Return true if symbolic addresses are
+ /// RIP-relative on this machine, taking into consideration the relocation
+ /// model and subtarget. RIP-relative addresses cannot have a separate
+ /// base or index register.
+ bool symbolicAddressesAreRIPRel() const;
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+public:
+ X86_32TargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+public:
+ X86_64TargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
new file mode 100644
index 0000000..a7aba14
--- /dev/null
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(LLVM_TARGET_DEFINITIONS XCore.td)
+
+tablegen(XCoreGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(XCoreGenRegisterNames.inc -gen-register-enums)
+tablegen(XCoreGenRegisterInfo.inc -gen-register-desc)
+tablegen(XCoreGenInstrNames.inc -gen-instr-enums)
+tablegen(XCoreGenInstrInfo.inc -gen-instr-desc)
+tablegen(XCoreGenAsmWriter.inc -gen-asm-writer)
+tablegen(XCoreGenDAGISel.inc -gen-dag-isel)
+tablegen(XCoreGenCallingConv.inc -gen-callingconv)
+tablegen(XCoreGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(XCore
+ XCoreAsmPrinter.cpp
+ XCoreFrameInfo.cpp
+ XCoreInstrInfo.cpp
+ XCoreISelDAGToDAG.cpp
+ XCoreISelLowering.cpp
+ XCoreRegisterInfo.cpp
+ XCoreSubtarget.cpp
+ XCoreTargetAsmInfo.cpp
+ XCoreTargetMachine.cpp
+ )
diff --git a/lib/Target/XCore/Makefile b/lib/Target/XCore/Makefile
new file mode 100644
index 0000000..568df70
--- /dev/null
+++ b/lib/Target/XCore/Makefile
@@ -0,0 +1,21 @@
+##===- lib/Target/XCore/Makefile ---------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMXCore
+TARGET = XCore
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = XCoreGenRegisterInfo.h.inc XCoreGenRegisterNames.inc \
+ XCoreGenRegisterInfo.inc XCoreGenInstrNames.inc \
+ XCoreGenInstrInfo.inc XCoreGenAsmWriter.inc \
+ XCoreGenDAGISel.inc XCoreGenCallingConv.inc \
+ XCoreGenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/XCore/README.txt b/lib/Target/XCore/README.txt
new file mode 100644
index 0000000..deaeb0f
--- /dev/null
+++ b/lib/Target/XCore/README.txt
@@ -0,0 +1,8 @@
+To-do
+-----
+
+* Instruction encodings
+* Tailcalls
+* Investigate loop alignment
+* Add builtins
+* Make better use of lmul / macc
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
new file mode 100644
index 0000000..5722b87
--- /dev/null
+++ b/lib/Target/XCore/XCore.h
@@ -0,0 +1,42 @@
+//===-- XCore.h - Top-level interface for XCore representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// XCore back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_XCORE_H
+#define TARGET_XCORE_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class FunctionPass;
+ class TargetMachine;
+ class XCoreTargetMachine;
+ class raw_ostream;
+
+ FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM);
+ FunctionPass *createXCoreCodePrinterPass(raw_ostream &OS,
+ XCoreTargetMachine &TM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose);
+} // end namespace llvm
+
+// Defines symbolic names for XCore registers. This defines a mapping from
+// register name to register number.
+//
+#include "XCoreGenRegisterNames.inc"
+
+// Defines symbolic names for the XCore instructions.
+//
+#include "XCoreGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/XCore/XCore.td b/lib/Target/XCore/XCore.td
new file mode 100644
index 0000000..7a2dcdb
--- /dev/null
+++ b/lib/Target/XCore/XCore.td
@@ -0,0 +1,62 @@
+//===- XCore.td - Describe the XCore Target Machine --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Descriptions
+//===----------------------------------------------------------------------===//
+
+include "XCoreRegisterInfo.td"
+include "XCoreInstrInfo.td"
+include "XCoreCallingConv.td"
+
+def XCoreInstrInfo : InstrInfo {
+ let TSFlagsFields = [];
+ let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// XCore Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureXS1A
+ : SubtargetFeature<"xs1a", "IsXS1A", "true",
+ "Enable XS1A instructions">;
+
+def FeatureXS1B
+ : SubtargetFeature<"xs1b", "IsXS1B", "true",
+ "Enable XS1B instructions">;
+
+//===----------------------------------------------------------------------===//
+// XCore processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", [FeatureXS1A]>;
+def : Proc<"xs1a-generic", [FeatureXS1A]>;
+def : Proc<"xs1b-generic", [FeatureXS1B]>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def XCore : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = XCoreInstrInfo;
+}
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
new file mode 100644
index 0000000..c9a6d8a
--- /dev/null
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -0,0 +1,472 @@
+//===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the XAS-format XCore assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+static cl::opt<std::string> FileDirective("xcore-file-directive", cl::Optional,
+ cl::desc("Output a file directive into the assembly file"),
+ cl::Hidden,
+ cl::value_desc("filename"),
+ cl::init(""));
+
+static cl::opt<unsigned> MaxThreads("xcore-max-threads", cl::Optional,
+  cl::desc("Maximum number of threads (for emulated thread-local storage)"),
+ cl::Hidden,
+ cl::value_desc("number"),
+ cl::init(8));
+
+namespace {
+ class VISIBILITY_HIDDEN XCoreAsmPrinter : public AsmPrinter {
+ DwarfWriter *DW;
+ const XCoreSubtarget &Subtarget;
+ public:
+ explicit XCoreAsmPrinter(raw_ostream &O, XCoreTargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V), DW(0),
+ Subtarget(*TM.getSubtargetImpl()) {}
+
+ virtual const char *getPassName() const {
+ return "XCore Assembly Printer";
+ }
+
+ void printMemOperand(const MachineInstr *MI, int opNum);
+ void printOperand(const MachineInstr *MI, int opNum);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+
+ void emitFileDirective(const std::string &filename);
+ void emitGlobalDirective(const std::string &name);
+ void emitExternDirective(const std::string &name);
+
+ void emitArrayBound(const std::string &name, const GlobalVariable *GV);
+ void emitGlobal(const GlobalVariable *GV);
+
+ void emitFunctionStart(MachineFunction &MF);
+ void emitFunctionEnd(MachineFunction &MF);
+
+ bool printInstruction(const MachineInstr *MI); // autogenerated.
+ void printMachineInstruction(const MachineInstr *MI);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AsmPrinter::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<DwarfWriter>();
+ }
+ };
+} // end of anonymous namespace
+
+#include "XCoreGenAsmWriter.inc"
+
+/// createXCoreCodePrinterPass - Returns a pass that prints the XCore
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createXCoreCodePrinterPass(raw_ostream &o,
+ XCoreTargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ return new XCoreAsmPrinter(o, tm, tm.getTargetAsmInfo(), OptLevel, verbose);
+}
+
+// PrintEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
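+// For example, the string "a\"b\n" is emitted as a\22b\0A.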
+static void PrintEscapedString(const std::string &Str, raw_ostream &Out) {
+ for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+ unsigned char C = Str[i];
+ if (isprint(C) && C != '"' && C != '\\') {
+ Out << C;
+ } else {
+ Out << '\\'
+ << (char) ((C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A'))
+ << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
+ }
+ }
+}
+
+void XCoreAsmPrinter::
+emitFileDirective(const std::string &name)
+{
+ O << "\t.file\t\"";
+ PrintEscapedString(name, O);
+ O << "\"\n";
+}
+
+void XCoreAsmPrinter::
+emitGlobalDirective(const std::string &name)
+{
+ O << TAI->getGlobalDirective() << name;
+ O << "\n";
+}
+
+void XCoreAsmPrinter::
+emitExternDirective(const std::string &name)
+{
+ O << "\t.extern\t" << name;
+ O << '\n';
+}
+
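+// Emit the symbolic constant name.globound, which records the number of
+// elements of an externally visible global array.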
+void XCoreAsmPrinter::
+emitArrayBound(const std::string &name, const GlobalVariable *GV)
+{
+ assert(((GV->hasExternalLinkage() ||
+ GV->hasWeakLinkage()) ||
+ GV->hasLinkOnceLinkage()) && "Unexpected linkage");
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(
+ cast<PointerType>(GV->getType())->getElementType()))
+ {
+ O << TAI->getGlobalDirective() << name << ".globound" << "\n";
+ O << TAI->getSetDirective() << name << ".globound" << ","
+ << ATy->getNumElements() << "\n";
+ if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
+ // TODO Use COMDAT groups for LinkOnceLinkage
+ O << TAI->getWeakDefDirective() << name << ".globound" << "\n";
+ }
+ }
+}
+
+void XCoreAsmPrinter::
+emitGlobal(const GlobalVariable *GV)
+{
+ const TargetData *TD = TM.getTargetData();
+
+ if (GV->hasInitializer()) {
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GV))
+ return;
+
+ SwitchToSection(TAI->SectionForGlobal(GV));
+
+ std::string name = Mang->getValueName(GV);
+ Constant *C = GV->getInitializer();
+ unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
+
+ // Mark the start of the global
+ O << "\t.cc_top " << name << ".data," << name << "\n";
+
+ switch (GV->getLinkage()) {
+ case GlobalValue::AppendingLinkage:
+ cerr << "AppendingLinkage is not supported by this target!\n";
+ abort();
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::ExternalLinkage:
+ emitArrayBound(name, GV);
+ emitGlobalDirective(name);
+ // TODO Use COMDAT groups for LinkOnceLinkage
+ if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
+ O << TAI->getWeakDefDirective() << name << "\n";
+ }
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ break;
+ case GlobalValue::GhostLinkage:
+ cerr << "Should not have any unmaterialized functions!\n";
+ abort();
+ case GlobalValue::DLLImportLinkage:
+ cerr << "DLLImport linkage is not supported by this target!\n";
+ abort();
+ case GlobalValue::DLLExportLinkage:
+ cerr << "DLLExport linkage is not supported by this target!\n";
+ abort();
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, GV, 2);
+
+ unsigned Size = TD->getTypeAllocSize(C->getType());
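+    // Thread-local globals are emulated by replicating the data once per
+    // hardware thread, so that each thread can address its own copy.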
+ if (GV->isThreadLocal()) {
+ Size *= MaxThreads;
+ }
+ if (TAI->hasDotTypeDotSizeDirective()) {
+ O << "\t.type " << name << ",@object\n";
+ O << "\t.size " << name << "," << Size << "\n";
+ }
+ O << name << ":\n";
+
+ EmitGlobalConstant(C);
+ if (GV->isThreadLocal()) {
+ for (unsigned i = 1; i < MaxThreads; ++i) {
+ EmitGlobalConstant(C);
+ }
+ }
+ if (Size < 4) {
+ // The ABI requires that unsigned scalar types smaller than 32 bits
+      // are padded to 32 bits.
+ EmitZeros(4 - Size);
+ }
+
+ // Mark the end of the global
+ O << "\t.cc_bottom " << name << ".data\n";
+ } else {
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ }
+}
+
+/// Emit the directives on the start of functions
+void XCoreAsmPrinter::
+emitFunctionStart(MachineFunction &MF)
+{
+ // Print out the label for the function.
+ const Function *F = MF.getFunction();
+
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ // Mark the start of the function
+ O << "\t.cc_top " << CurrentFnName << ".function," << CurrentFnName << "\n";
+
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ break;
+ case Function::ExternalLinkage:
+ emitGlobalDirective(CurrentFnName);
+ break;
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ // TODO Use COMDAT groups for LinkOnceLinkage
+ O << TAI->getGlobalDirective() << CurrentFnName << "\n";
+ O << TAI->getWeakDefDirective() << CurrentFnName << "\n";
+ break;
+ }
+  // Align to 1 << 1 = 2 bytes.
+ EmitAlignment(1, F, 1);
+ if (TAI->hasDotTypeDotSizeDirective()) {
+ O << "\t.type " << CurrentFnName << ",@function\n";
+ }
+ O << CurrentFnName << ":\n";
+}
+
+/// Emit the directives on the end of functions
+void XCoreAsmPrinter::
+emitFunctionEnd(MachineFunction &MF)
+{
+ // Mark the end of the function
+ O << "\t.cc_bottom " << CurrentFnName << ".function\n";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool XCoreAsmPrinter::runOnMachineFunction(MachineFunction &MF)
+{
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out jump tables referenced by the function
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ // Emit the function start directives
+ emitFunctionStart(MF);
+
+ // Emit pre-function debug information.
+ DW->BeginFunction(&MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+      printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ O << "\t";
+ printMachineInstruction(II);
+ }
+
+ // Each Basic Block is separated by a newline
+ O << '\n';
+ }
+
+ // Emit function end directives
+ emitFunctionEnd(MF);
+
+ // Emit post-function debug information.
+ DW->EndFunction(&MF);
+
+ // We didn't modify anything.
+ return false;
+}
+
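+/// printMemOperand - Print a base+offset operand pair. The "+offset" part
+/// is omitted when the immediate offset is zero.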
+void XCoreAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum)
+{
+ printOperand(MI, opNum);
+
+ if (MI->getOperand(opNum+1).isImm()
+ && MI->getOperand(opNum+1).getImm() == 0)
+ return;
+
+ O << "+";
+ printOperand(MI, opNum+1);
+}
+
+void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ else
+ assert(0 && "not implemented");
+ break;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ {
+ const GlobalValue *GV = MO.getGlobal();
+ O << Mang->getValueName(GV);
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+ }
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ O << MO.getSymbolName();
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ break;
+ default:
+ assert(0 && "not implemented");
+ }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ printOperand(MI, OpNo);
+ return false;
+}
+
+void XCoreAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // Check for mov mnemonic
+ unsigned src, dst, srcSR, dstSR;
+ if (TM.getInstrInfo()->isMoveInstr(*MI, src, dst, srcSR, dstSR)) {
+ O << "\tmov ";
+ O << TM.getRegisterInfo()->get(dst).AsmName;
+ O << ", ";
+ O << TM.getRegisterInfo()->get(src).AsmName;
+ O << "\n";
+ return;
+ }
+ if (printInstruction(MI)) {
+ return;
+ }
+ assert(0 && "Unhandled instruction in asm writer!");
+}
+
+bool XCoreAsmPrinter::doInitialization(Module &M) {
+ bool Result = AsmPrinter::doInitialization(M);
+
+ if (!FileDirective.empty()) {
+ emitFileDirective(FileDirective);
+ }
+
+ // Print out type strings for external functions here
+ for (Module::const_iterator I = M.begin(), E = M.end();
+ I != E; ++I) {
+ if (I->isDeclaration() && !I->isIntrinsic()) {
+ switch (I->getLinkage()) {
+ default:
+ assert(0 && "Unexpected linkage");
+ case Function::ExternalWeakLinkage:
+ ExtWeakSymbols.insert(I);
+ // fallthrough
+ case Function::ExternalLinkage:
+ break;
+ }
+ }
+ }
+
+ // Emit initial debug information.
+ DW = getAnalysisIfAvailable<DwarfWriter>();
+ assert(DW && "Dwarf Writer is not available");
+ DW->BeginModule(&M, getAnalysisIfAvailable<MachineModuleInfo>(),
+ O, this, TAI);
+ return Result;
+}
+
+bool XCoreAsmPrinter::doFinalization(Module &M) {
+
+ // Print out module-level global variables.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ emitGlobal(I);
+ }
+
+ // Emit final debug information.
+ DW->EndModule();
+
+ return AsmPrinter::doFinalization(M);
+}
diff --git a/lib/Target/XCore/XCoreCallingConv.td b/lib/Target/XCore/XCoreCallingConv.td
new file mode 100644
index 0000000..8107e32
--- /dev/null
+++ b/lib/Target/XCore/XCoreCallingConv.td
@@ -0,0 +1,33 @@
+//===- XCoreCallingConv.td - Calling Conventions for XCore -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for XCore architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XCore Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_XCore : CallingConv<[
+ // i32 are returned in registers R0, R1, R2, R3
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// XCore Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_XCore : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+ // Integer values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
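+
+// For example, a call f(i8 a, i32 b, i32 c, i32 d, i32 e) passes a (promoted
+// to i32), b, c and d in R0-R3, and e in a 4-byte aligned stack slot.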
diff --git a/lib/Target/XCore/XCoreFrameInfo.cpp b/lib/Target/XCore/XCoreFrameInfo.cpp
new file mode 100644
index 0000000..f50dc96
--- /dev/null
+++ b/lib/Target/XCore/XCoreFrameInfo.cpp
@@ -0,0 +1,27 @@
+//===-- XCoreFrameInfo.cpp - Frame info for XCore Target ---------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreFrameInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// XCoreFrameInfo:
+//===----------------------------------------------------------------------===//
+
+XCoreFrameInfo::XCoreFrameInfo(const TargetMachine &tm):
+ TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 4, 0)
+{
+ // Do nothing
+}
diff --git a/lib/Target/XCore/XCoreFrameInfo.h b/lib/Target/XCore/XCoreFrameInfo.h
new file mode 100644
index 0000000..2c67577
--- /dev/null
+++ b/lib/Target/XCore/XCoreFrameInfo.h
@@ -0,0 +1,34 @@
+//===-- XCoreFrameInfo.h - Frame info for XCore Target -----------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREFRAMEINFO_H
+#define XCOREFRAMEINFO_H
+
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class XCoreFrameInfo: public TargetFrameInfo {
+
+ public:
+ XCoreFrameInfo(const TargetMachine &tm);
+
+ //! Stack slot size (4 bytes)
+ static int stackSlotSize() {
+ return 4;
+ }
+ };
+}
+
+#endif // XCOREFRAMEINFO_H
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
new file mode 100644
index 0000000..eed34a4
--- /dev/null
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -0,0 +1,230 @@
+//===-- XCoreISelDAGToDAG.cpp - A dag to dag inst selector for XCore ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreISelLowering.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+/// XCoreDAGToDAGISel - XCore specific code to select XCore machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+ class XCoreDAGToDAGISel : public SelectionDAGISel {
+ XCoreTargetLowering &Lowering;
+ const XCoreSubtarget &Subtarget;
+
+ public:
+ XCoreDAGToDAGISel(XCoreTargetMachine &TM)
+ : SelectionDAGISel(TM),
+ Lowering(*TM.getTargetLowering()),
+ Subtarget(*TM.getSubtargetImpl()) { }
+
+ SDNode *Select(SDValue Op);
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ // Complex Pattern Selectors.
+ bool SelectADDRspii(SDValue Op, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool SelectADDRdpii(SDValue Op, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool SelectADDRcpii(SDValue Op, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+
+ virtual void InstructionSelect();
+
+ virtual const char *getPassName() const {
+ return "XCore DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Include the pieces autogenerated from the target description.
+ #include "XCoreGenDAGISel.inc"
+ };
+} // end anonymous namespace
+
+/// createXCoreISelDag - This pass converts a legalized DAG into a
+/// XCore-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM) {
+ return new XCoreDAGToDAGISel(TM);
+}
+
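+// SelectADDRspii - Match a stack-relative address: a frame index plus an
+// optional non-negative, word-aligned constant offset, e.g.
+// (add FrameIndex:i32<1>, 8).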
+bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ FrameIndexSDNode *FIN = 0;
+ if ((FIN = dyn_cast<FrameIndexSDNode>(Addr))) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+ if (Addr.getOpcode() == ISD::ADD) {
+ ConstantSDNode *CN = 0;
+ if ((FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+ // Constant positive word offset from frame index
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool XCoreDAGToDAGISel::SelectADDRdpii(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ if (Addr.getOpcode() == XCoreISD::DPRelativeWrapper) {
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+ if (Addr.getOpcode() == ISD::ADD) {
+ ConstantSDNode *CN = 0;
+ if ((Addr.getOperand(0).getOpcode() == XCoreISD::DPRelativeWrapper)
+ && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ && (CN->getSExtValue() % 4 == 0)) {
+      // Constant word offset from an object in the data region
+ Base = Addr.getOperand(0).getOperand(0);
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool XCoreDAGToDAGISel::SelectADDRcpii(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ if (Addr.getOpcode() == XCoreISD::CPRelativeWrapper) {
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+ if (Addr.getOpcode() == ISD::ADD) {
+ ConstantSDNode *CN = 0;
+ if ((Addr.getOperand(0).getOpcode() == XCoreISD::CPRelativeWrapper)
+ && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ && (CN->getSExtValue() % 4 == 0)) {
+      // Constant word offset from an object in the constant pool
+ Base = Addr.getOperand(0).getOperand(0);
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void XCoreDAGToDAGISel::
+InstructionSelect() {
+ DEBUG(BB->dump());
+
+ // Select target instructions for the DAG.
+ SelectRoot(*CurDAG);
+
+ CurDAG->RemoveDeadNodes();
+}
+
+SDNode *XCoreDAGToDAGISel::Select(SDValue Op) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+ MVT NVT = N->getValueType(0);
+ if (NVT == MVT::i32) {
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ if (Predicate_immMskBitp(N)) {
+ SDValue MskSize = Transform_msksize_xform(N);
+ return CurDAG->getTargetNode(XCore::MKMSK_rus, dl, MVT::i32, MskSize);
+      } else if (!Predicate_immU16(N)) {
+ unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+ SDValue CPIdx =
+ CurDAG->getTargetConstantPool(ConstantInt::get(Type::Int32Ty, Val),
+ TLI.getPointerTy());
+ return CurDAG->getTargetNode(XCore::LDWCP_lru6, dl, MVT::i32,
+ MVT::Other, CPIdx,
+ CurDAG->getEntryNode());
+ }
+ break;
+ }
+ case ISD::SMUL_LOHI: {
+ // FIXME fold addition into the macc instruction
+ if (!Subtarget.isXS1A()) {
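+        // MACCS computes hi:lo += a * b (signed); seeding the accumulator
+        // with zero yields the plain 64-bit product.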
+ SDValue Zero(CurDAG->getTargetNode(XCore::LDC_ru6, dl, MVT::i32,
+ CurDAG->getTargetConstant(0, MVT::i32)), 0);
+ SDValue Ops[] = { Zero, Zero, Op.getOperand(0), Op.getOperand(1) };
+ SDNode *ResNode = CurDAG->getTargetNode(XCore::MACCS_l4r, dl,
+ MVT::i32, MVT::i32, Ops, 4);
+ ReplaceUses(SDValue(N, 0), SDValue(ResNode, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(ResNode, 0));
+ return NULL;
+ }
+ break;
+ }
+ case ISD::UMUL_LOHI: {
+ // FIXME fold addition into the macc / lmul instruction
+ SDValue Zero(CurDAG->getTargetNode(XCore::LDC_ru6, dl, MVT::i32,
+ CurDAG->getTargetConstant(0, MVT::i32)), 0);
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ Zero, Zero };
+ SDNode *ResNode = CurDAG->getTargetNode(XCore::LMUL_l6r, dl, MVT::i32,
+ MVT::i32, Ops, 4);
+ ReplaceUses(SDValue(N, 0), SDValue(ResNode, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(ResNode, 0));
+ return NULL;
+ }
+ case XCoreISD::LADD: {
+ if (!Subtarget.isXS1A()) {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2) };
+ return CurDAG->getTargetNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32,
+ Ops, 3);
+ }
+ break;
+ }
+ case XCoreISD::LSUB: {
+ if (!Subtarget.isXS1A()) {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2) };
+ return CurDAG->getTargetNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32,
+ Ops, 3);
+ }
+ break;
+ }
+ // Other cases are autogenerated.
+ }
+ }
+ return SelectCode(Op);
+}
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
new file mode 100644
index 0000000..93c5f59
--- /dev/null
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -0,0 +1,934 @@
+//===-- XCoreISelLowering.cpp - XCore DAG Lowering Implementation ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xcore-lower"
+
+#include "XCoreISelLowering.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCore.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/VectorExtras.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+const char *XCoreTargetLowering::
+getTargetNodeName(unsigned Opcode) const
+{
+ switch (Opcode)
+ {
+ case XCoreISD::BL : return "XCoreISD::BL";
+ case XCoreISD::PCRelativeWrapper : return "XCoreISD::PCRelativeWrapper";
+ case XCoreISD::DPRelativeWrapper : return "XCoreISD::DPRelativeWrapper";
+ case XCoreISD::CPRelativeWrapper : return "XCoreISD::CPRelativeWrapper";
+ case XCoreISD::STWSP : return "XCoreISD::STWSP";
+ case XCoreISD::RETSP : return "XCoreISD::RETSP";
+ default : return NULL;
+ }
+}
+
+XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
+ : TargetLowering(XTM),
+ TM(XTM),
+ Subtarget(*XTM.getSubtargetImpl()) {
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, XCore::GRRegsRegisterClass);
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties();
+
+ // Division is expensive
+ setIntDivIsCheap(false);
+
+ setShiftAmountType(MVT::i32);
+ // shl X, 32 == 0
+ setShiftAmountFlavor(Extend);
+ setStackPointerRegisterToSaveRestore(XCore::SP);
+
+ setSchedulingPreference(SchedulingForRegPressure);
+
+  // Use i32 for setcc operation results (slt, sgt, ...).
+ setBooleanContents(ZeroOrOneBooleanContent);
+
+ // XCore does not have the NodeTypes below.
+ setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDC, MVT::i32, Expand);
+ setOperationAction(ISD::ADDE, MVT::i32, Expand);
+ setOperationAction(ISD::SUBC, MVT::i32, Expand);
+ setOperationAction(ISD::SUBE, MVT::i32, Expand);
+
+ // Stop the combiner recombining select and set_cc
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+ // 64bit
+ if (!Subtarget.isXS1A()) {
+ setOperationAction(ISD::ADD, MVT::i64, Custom);
+ setOperationAction(ISD::SUB, MVT::i64, Custom);
+ }
+ if (Subtarget.isXS1A()) {
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ }
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+ // Bit Manipulation
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL , MVT::i32, Expand);
+ setOperationAction(ISD::ROTR , MVT::i32, Expand);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // Expand jump tables for now
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+
+ // RET must be custom lowered, to meet ABI requirements
+ setOperationAction(ISD::RET, MVT::Other, Custom);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+
+ // Thread Local Storage
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+
+ // Conversion of i64 -> double produces constantpool nodes
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+
+ // Loads
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
+
+ // Varargs
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+
+ // Dynamic stack
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+ // Debug
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+}
+
+SDValue XCoreTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode())
+ {
+ case ISD::CALL: return LowerCALL(Op, DAG);
+ case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
+ case ISD::RET: return LowerRET(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ // FIXME: Remove these when LegalizeDAGTypes lands.
+ case ISD::ADD:
+ case ISD::SUB: return ExpandADDSUB(Op.getNode(), DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ default:
+ assert(0 && "unimplemented operand");
+ return SDValue();
+ }
+}
+
+/// ReplaceNodeResults - Replace the results of a node that has an illegal
+/// result type with new values built out of custom code.
+void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) {
+ switch (N->getOpcode()) {
+ default:
+ assert(0 && "Don't know how to custom expand this!");
+ return;
+ case ISD::ADD:
+ case ISD::SUB:
+ Results.push_back(ExpandADDSUB(N, DAG));
+ return;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::
+LowerSELECT_CC(SDValue Op, SelectionDAG &DAG)
+{
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i32, Op.getOperand(2),
+ Op.getOperand(3), Op.getOperand(4));
+ return DAG.getNode(ISD::SELECT, dl, MVT::i32, Cond, Op.getOperand(0),
+ Op.getOperand(1));
+}
+
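+/// getGlobalAddressWrapper - Wrap a global address in the wrapper node
+/// matching where the object lives: functions are addressed pc-relative,
+/// constant data cp-relative and mutable data dp-relative (a sketch of the
+/// intent; on the xs1a subtarget all data falls back to dp-relative).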
+SDValue XCoreTargetLowering::
+getGlobalAddressWrapper(SDValue GA, GlobalValue *GV, SelectionDAG &DAG)
+{
+ // FIXME there is no actual debug info here
+ DebugLoc dl = GA.getDebugLoc();
+ if (isa<Function>(GV)) {
+ return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
+ } else if (!Subtarget.isXS1A()) {
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar) {
+ // If GV is an alias then use the aliasee to determine constness
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
+ }
+ bool isConst = GVar && GVar->isConstant();
+ if (isConst) {
+ return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
+ }
+ }
+ return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
+}
+
+SDValue XCoreTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG)
+{
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+ // If it's a debug information descriptor, don't mess with it.
+ if (DAG.isVerifiedDebugInfoDesc(Op))
+ return GA;
+ return getGlobalAddressWrapper(GA, GV, DAG);
+}
+
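+/// BuildGetId - Emit a call to the llvm.xcore.getid intrinsic, which
+/// yields the id of the thread executing the code.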
+static inline SDValue BuildGetId(SelectionDAG &DAG, DebugLoc dl) {
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
+ DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
+}
+
+static inline bool isZeroLengthArray(const Type *Ty) {
+ const ArrayType *AT = dyn_cast_or_null<ArrayType>(Ty);
+ return AT && (AT->getNumElements() == 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG)
+{
+ // FIXME there isn't really debug info here
+ DebugLoc dl = Op.getDebugLoc();
+ // transform to label + getid() * size
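+  // Illustrative layout (not taken from this code): if each thread's copy
+  // of the object occupies 12 bytes, thread 2 addresses label + 2 * 12.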
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar) {
+ // If GV is an alias then use the aliasee to determine size
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
+ }
+ if (! GVar) {
+ assert(0 && "Thread local object not a GlobalVariable?");
+ return SDValue();
+ }
+ const Type *Ty = cast<PointerType>(GV->getType())->getElementType();
+ if (!Ty->isSized() || isZeroLengthArray(Ty)) {
+ cerr << "Size of thread local object " << GVar->getName()
+ << " is unknown\n";
+ abort();
+ }
+ SDValue base = getGlobalAddressWrapper(GA, GV, DAG);
+ const TargetData *TD = TM.getTargetData();
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ SDValue offset = DAG.getNode(ISD::MUL, dl, MVT::i32, BuildGetId(DAG, dl),
+ DAG.getConstant(Size, MVT::i32));
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, base, offset);
+}
+
+SDValue XCoreTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG)
+{
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ // FIXME there isn't really debug info here
+ DebugLoc dl = CP->getDebugLoc();
+ if (Subtarget.isXS1A()) {
+ assert(0 && "Lowering of constant pool unimplemented");
+ return SDValue();
+ } else {
+ MVT PtrVT = Op.getValueType();
+ SDValue Res;
+ if (CP->isMachineConstantPoolEntry()) {
+ Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+ CP->getAlignment());
+ } else {
+ Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+ CP->getAlignment());
+ }
+ return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Res);
+ }
+}
+
+SDValue XCoreTargetLowering::
+LowerJumpTable(SDValue Op, SelectionDAG &DAG)
+{
+ // FIXME there isn't really debug info here
+ DebugLoc dl = Op.getDebugLoc();
+ MVT PtrVT = Op.getValueType();
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+ return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, JTI);
+}
+
+SDValue XCoreTargetLowering::
+ExpandADDSUB(SDNode *N, SelectionDAG &DAG)
+{
+ assert(N->getValueType(0) == MVT::i64 &&
+ (N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Unknown operand to lower!");
+ assert(!Subtarget.isXS1A() && "Cannot custom lower ADD/SUB on xs1a");
+ DebugLoc dl = N->getDebugLoc();
+
+ // Extract components
+ SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ N->getOperand(0), DAG.getConstant(0, MVT::i32));
+ SDValue LHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ N->getOperand(0), DAG.getConstant(1, MVT::i32));
+ SDValue RHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ N->getOperand(1), DAG.getConstant(0, MVT::i32));
+ SDValue RHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ N->getOperand(1), DAG.getConstant(1, MVT::i32));
+
+ // Expand
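+  // Sketch of the expansion (LADD/LSUB produce the carry/borrow as result
+  // 0 and the 32-bit sum as result 1):
+  //   (carry, lo) = LADD(lhsl, rhsl, 0)
+  //   (_,     hi) = LADD(lhsh, rhsh, carry)
+  //   result      = build_pair(lo, hi)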
+ unsigned Opcode = (N->getOpcode() == ISD::ADD) ? XCoreISD::LADD :
+ XCoreISD::LSUB;
+ SDValue Zero = DAG.getConstant(0, MVT::i32);
+ SDValue Carry = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ LHSL, RHSL, Zero);
+ SDValue Lo(Carry.getNode(), 1);
+
+ SDValue Ignored = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ LHSH, RHSH, Carry);
+ SDValue Hi(Ignored.getNode(), 1);
+ // Merge the pieces
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue XCoreTargetLowering::
+LowerVAARG(SDValue Op, SelectionDAG &DAG)
+{
+ assert(0 && "unimplemented");
+  // FIX Arguments passed by reference need an extra dereference.
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ MVT VT = Node->getValueType(0);
+ SDValue VAList = DAG.getLoad(getPointerTy(), dl, Node->getOperand(0),
+ Node->getOperand(1), V, 0);
+ // Increment the pointer, VAList, to the next vararg
+ SDValue Tmp3 = DAG.getNode(ISD::ADD, dl, getPointerTy(), VAList,
+ DAG.getConstant(VT.getSizeInBits(),
+ getPointerTy()));
+ // Store the incremented VAList to the legalized pointer
+ Tmp3 = DAG.getStore(VAList.getValue(1), dl, Tmp3, Node->getOperand(1), V, 0);
+ // Load the actual argument out of the pointer VAList
+ return DAG.getLoad(VT, dl, Tmp3, VAList, NULL, 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerVASTART(SDValue Op, SelectionDAG &DAG)
+{
+ DebugLoc dl = Op.getDebugLoc();
+ // vastart stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument
+ MachineFunction &MF = DAG.getMachineFunction();
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ SDValue Addr = DAG.getFrameIndex(XFI->getVarArgsFrameIndex(), MVT::i32);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), dl, Addr, Op.getOperand(1), SV, 0);
+}
+
+SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // Depths > 0 not supported yet!
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+ return SDValue();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+ RegInfo->getFrameRegister(MF), MVT::i32);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//
+// The lowering operations for the calling convention work in this order:
+// LowerCALL (virt regs --> phys regs, virt regs --> stack)
+// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs)
+// LowerRET (virt regs --> phys regs)
+// LowerCALL (phys regs --> virt regs)
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// CALL Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore custom CALL implementation
+SDValue XCoreTargetLowering::
+LowerCALL(SDValue Op, SelectionDAG &DAG)
+{
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ unsigned CallingConv = TheCall->getCallingConv();
+ // For now, only CallingConv::C implemented
+ switch (CallingConv)
+ {
+ default:
+ assert(0 && "Unsupported calling convention");
+ case CallingConv::Fast:
+ case CallingConv::C:
+ return LowerCCCCallTo(Op, DAG, CallingConv);
+ }
+}
+
+/// LowerCCCCallTo - function arguments are copied from virtual
+/// regs to (physical regs)/(stack frame), CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+/// TODO: isTailCall, sret.
+SDValue XCoreTargetLowering::
+LowerCCCCallTo(SDValue Op, SelectionDAG &DAG, unsigned CC)
+{
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ SDValue Chain = TheCall->getChain();
+ SDValue Callee = TheCall->getCallee();
+ bool isVarArg = TheCall->isVarArg();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+
+ // The ABI dictates there should be one stack slot available to the callee
+ // on function entry (for saving lr).
+ CCInfo.AllocateStack(4, 4);
+
+ CCInfo.AnalyzeCallOperands(TheCall, CC_XCore);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes,
+ getPointerTy(), true));
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+    // Arguments start after the first 5 operands of ISD::CALL
+ SDValue Arg = TheCall->getArg(i);
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+    // Arguments that are passed in registers must be kept in the
+    // RegsToPass vector
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ int Offset = VA.getLocMemOffset();
+
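+      // The STWSP immediate is a word offset, hence the division by 4.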
+ MemOpChains.push_back(DAG.getNode(XCoreISD::STWSP, dl, MVT::Other,
+ Chain, Arg,
+ DAG.getConstant(Offset/4, MVT::i32)));
+ }
+ }
+
+  // Merge all the store nodes into a single TokenFactor, since the
+  // stores are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token
+ // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+ // XCoreBranchLink = #chain, #target_address, #opt_in_flags...
+ // = Chain, Callee, Reg#1, Reg#2, ...
+ //
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(XCoreISD::BL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getConstant(NumBytes, getPointerTy(), true),
+ DAG.getConstant(0, getPointerTy(), true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
+ Op.getResNo());
+}
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers. This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. Returns an SDNode with the same number of values as the
+/// ISD::CALL.
+SDNode *XCoreTargetLowering::
+LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG) {
+ bool isVarArg = TheCall->isVarArg();
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+
+ CCInfo.AnalyzeCallResult(TheCall, RetCC_XCore);
+ SmallVector<SDValue, 8> ResultVals;
+
+ // Copy all of the result registers out of their specified physreg.
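+  // Each CopyFromReg yields (value, chain, flag); the chain is kept and
+  // the flag is threaded into the next copy so the copies stay adjacent.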
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+ RVLocs[i].getValVT(), InFlag).getValue(1);
+ InFlag = Chain.getValue(2);
+ ResultVals.push_back(Chain.getValue(0));
+ }
+
+ ResultVals.push_back(Chain);
+
+ // Merge everything together with a MERGE_VALUES node.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
+ &ResultVals[0], ResultVals.size()).getNode();
+}
+
+//===----------------------------------------------------------------------===//
+// FORMAL_ARGUMENTS Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore custom FORMAL_ARGUMENTS implementation
+SDValue XCoreTargetLowering::
+LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
+{
+ unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch(CC)
+ {
+ default:
+ assert(0 && "Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return LowerCCCArguments(Op, DAG);
+ }
+}
+
+/// LowerCCCArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments placed on the stack.
+/// TODO: sret
+SDValue XCoreTargetLowering::
+LowerCCCArguments(SDValue Op, SelectionDAG &DAG)
+{
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ SDValue Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ unsigned CC = MF.getFunction()->getCallingConv();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+
+ CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_XCore);
+
+ unsigned StackSlotSize = XCoreFrameInfo::stackSlotSize();
+
+ SmallVector<SDValue, 16> ArgValues;
+
+ unsigned LRSaveSize = StackSlotSize;
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+
+ CCValAssign &VA = ArgLocs[i];
+
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers
+ MVT RegVT = VA.getLocVT();
+ switch (RegVT.getSimpleVT()) {
+ default:
+ cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
+ << RegVT.getSimpleVT()
+ << "\n";
+ abort();
+ case MVT::i32:
+ unsigned VReg = RegInfo.createVirtualRegister(
+ XCore::GRRegsRegisterClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ ArgValues.push_back(DAG.getCopyFromReg(Root, dl, VReg, RegVT));
+ }
+ } else {
+ // sanity check
+ assert(VA.isMemLoc());
+ // Load the argument to a virtual register
+ unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+ if (ObjSize > StackSlotSize) {
+ cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
+ << VA.getLocVT().getSimpleVT()
+ << "\n";
+ }
+ // Create the frame index object for this incoming parameter...
+ int FI = MFI->CreateFixedObject(ObjSize,
+ LRSaveSize + VA.getLocMemOffset());
+
+      // Create the SelectionDAG nodes corresponding to a load
+      // from this parameter.
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ ArgValues.push_back(DAG.getLoad(VA.getLocVT(), dl, Root, FIN, NULL, 0));
+ }
+ }
+
+ if (isVarArg) {
+    // Argument registers
+ static const unsigned ArgRegs[] = {
+ XCore::R0, XCore::R1, XCore::R2, XCore::R3
+ };
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs,
+ array_lengthof(ArgRegs));
+ if (FirstVAReg < array_lengthof(ArgRegs)) {
+ SmallVector<SDValue, 4> MemOps;
+ int offset = 0;
+ // Save remaining registers, storing higher register numbers at a higher
+ // address
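+      // Illustrative example: with r0 and r1 holding fixed arguments, r3
+      // is spilled to the slot at offset 0 and r2 to the slot at -4, and
+      // VarArgsFrameIndex is left pointing at r2's slot.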
+      // Use a signed counter: an unsigned one would wrap past zero and
+      // overrun ArgRegs when FirstVAReg is 0.
+      for (int i = array_lengthof(ArgRegs) - 1; i >= (int)FirstVAReg; --i) {
+        // Create a stack slot
+        int FI = MFI->CreateFixedObject(4, offset);
+        if (i == (int)FirstVAReg) {
+ XFI->setVarArgsFrameIndex(FI);
+ }
+ offset -= StackSlotSize;
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ // Move argument from phys reg -> virt reg
+ unsigned VReg = RegInfo.createVirtualRegister(
+ XCore::GRRegsRegisterClass);
+ RegInfo.addLiveIn(ArgRegs[i], VReg);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ // Move argument from virt reg -> stack
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ }
+ if (!MemOps.empty())
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+ } else {
+      // This will point to the next argument passed via the stack.
+ XFI->setVarArgsFrameIndex(
+ MFI->CreateFixedObject(4, LRSaveSize + CCInfo.getNextStackOffset()));
+ }
+ }
+
+ ArgValues.push_back(Root);
+
+ // Return the new list of results.
+ std::vector<MVT> RetVT(Op.getNode()->value_begin(),
+ Op.getNode()->value_end());
+ return DAG.getNode(ISD::MERGE_VALUES, dl, RetVT,
+ &ArgValues[0], ArgValues.size());
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::
+LowerRET(SDValue Op, SelectionDAG &DAG)
+{
+  // CCValAssign - represents the assignment of
+  // a return value to a location
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+
+  // Analyze return values of ISD::RET
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_XCore);
+
+ // If this is the first return lowered for this function, add
+ // the regs to the liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ // The chain is always operand #0
+ SDValue Chain = Op.getOperand(0);
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // The operands of ISD::RET are the chain followed by (value, flag)
+    // pairs, so operand i*2+1 is the i-th return value.
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ Op.getOperand(i*2+1), Flag);
+
+    // Guarantee that all emitted copies are stuck together by threading
+    // the flag, so nothing is scheduled between them.
+ Flag = Chain.getValue(1);
+ }
+
+ // Return on XCore is always a "retsp 0"
+ if (Flag.getNode())
+ return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other,
+ Chain, DAG.getConstant(0, MVT::i32), Flag);
+ else // Return Void
+ return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other,
+ Chain, DAG.getConstant(0, MVT::i32));
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+ DebugLoc dl = MI->getDebugLoc();
+ assert((MI->getOpcode() == XCore::SELECT_CC) &&
+ "Unexpected instr type to insert");
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ BuildMI(BB, dl, TII.get(XCore::BRFT_lru6))
+ .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ sinkMBB->transferSuccessors(BB);
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, dl, TII.get(XCore::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Addressing mode description hooks
+//===----------------------------------------------------------------------===//
+
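+// Immediate operand predicates: "us" immediates are the short unsigned
+// range 0..11; the scaled variants accept that range multiplied by 2 or 4
+// for halfword and word addressing respectively.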
+static inline bool isImmUs(int64_t val)
+{
+ return (val >= 0 && val <= 11);
+}
+
+static inline bool isImmUs2(int64_t val)
+{
+ return (val%2 == 0 && isImmUs(val/2));
+}
+
+static inline bool isImmUs4(int64_t val)
+{
+ return (val%4 == 0 && isImmUs(val/4));
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool
+XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const {
+ MVT VT = getValueType(Ty, true);
+ // Get expected value type after legalization
+ switch (VT.getSimpleVT()) {
+ // Legal load / stores
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ break;
+ // Expand i1 -> i8
+ case MVT::i1:
+ VT = MVT::i8;
+ break;
+ // Everything else is lowered to words
+ default:
+ VT = MVT::i32;
+ break;
+ }
+ if (AM.BaseGV) {
+ return VT == MVT::i32 && !AM.HasBaseReg && AM.Scale == 0 &&
+ AM.BaseOffs%4 == 0;
+ }
+
+ switch (VT.getSimpleVT()) {
+ default:
+ return false;
+ case MVT::i8:
+ // reg + imm
+ if (AM.Scale == 0) {
+ return isImmUs(AM.BaseOffs);
+ }
+ return AM.Scale == 1 && AM.BaseOffs == 0;
+ case MVT::i16:
+ // reg + imm
+ if (AM.Scale == 0) {
+ return isImmUs2(AM.BaseOffs);
+ }
+ return AM.Scale == 2 && AM.BaseOffs == 0;
+ case MVT::i32:
+ // reg + imm
+ if (AM.Scale == 0) {
+ return isImmUs4(AM.BaseOffs);
+ }
+ // reg + reg<<2
+ return AM.Scale == 4 && AM.BaseOffs == 0;
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// XCore Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+std::vector<unsigned> XCoreTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const
+{
+ if (Constraint.size() != 1)
+ return std::vector<unsigned>();
+
+ switch (Constraint[0]) {
+ default : break;
+ case 'r':
+ return make_vector<unsigned>(XCore::R0, XCore::R1, XCore::R2,
+ XCore::R3, XCore::R4, XCore::R5,
+ XCore::R6, XCore::R7, XCore::R8,
+ XCore::R9, XCore::R10, XCore::R11, 0);
+ break;
+ }
+ return std::vector<unsigned>();
+}
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
new file mode 100644
index 0000000..993ecbd
--- /dev/null
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -0,0 +1,123 @@
+//===-- XCoreISelLowering.h - XCore DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that XCore uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREISELLOWERING_H
+#define XCOREISELLOWERING_H
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "XCore.h"
+
+namespace llvm {
+
+  // Forward declarations
+ class XCoreSubtarget;
+ class XCoreTargetMachine;
+
+ namespace XCoreISD {
+ enum NodeType {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END+XCore::INSTRUCTION_LIST_END,
+
+ // Branch and link (call)
+ BL,
+
+ // pc relative address
+ PCRelativeWrapper,
+
+ // dp relative address
+ DPRelativeWrapper,
+
+ // cp relative address
+ CPRelativeWrapper,
+
+ // Store word to stack
+ STWSP,
+
+ // Corresponds to retsp instruction
+ RETSP,
+
+ // Corresponds to LADD instruction
+ LADD,
+
+ // Corresponds to LSUB instruction
+ LSUB
+ };
+ }
+
+ //===--------------------------------------------------------------------===//
+ // TargetLowering Implementation
+ //===--------------------------------------------------------------------===//
+ class XCoreTargetLowering : public TargetLowering
+ {
+ public:
+
+ explicit XCoreTargetLowering(XCoreTargetMachine &TM);
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// ReplaceNodeResults - Replace the results of a node that has an
+    /// illegal result type with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG);
+
+    /// getTargetNodeName - This method returns the name of a target-specific
+    /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ virtual bool isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const;
+
+ private:
+ const XCoreTargetMachine &TM;
+ const XCoreSubtarget &Subtarget;
+
+ // Lower Operand helpers
+ SDValue LowerCCCArguments(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCCCCallTo(SDValue Op, SelectionDAG &DAG, unsigned CC);
+ SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode*TheCall,
+ unsigned CallingConv, SelectionDAG &DAG);
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
+ SDValue getGlobalAddressWrapper(SDValue GA, GlobalValue *GV,
+ SelectionDAG &DAG);
+
+ // Lower Operand specifics
+ SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+
+ // Inline asm support
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ // Expand specifics
+ SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG);
+ };
+}
+
+#endif // XCOREISELLOWERING_H
diff --git a/lib/Target/XCore/XCoreInstrFormats.td b/lib/Target/XCore/XCoreInstrFormats.td
new file mode 100644
index 0000000..8002c99
--- /dev/null
+++ b/lib/Target/XCore/XCoreInstrFormats.td
@@ -0,0 +1,120 @@
+//===- XCoreInstrFormats.td - XCore Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+class InstXCore<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<32> Inst;
+
+ let Namespace = "XCore";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+// XCore pseudo instructions format
+class PseudoInstXCore<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern>;
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+
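+// Note: the encodings below are placeholders; every format currently
+// leaves the 32-bit Inst field zeroed and only the class names
+// distinguish the formats.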
+class _F3R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FL3R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _F2RUS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FL2RUS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FRU6<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FLRU6<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FU6<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FLU6<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FU10<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FLU10<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _F2R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FRUS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _FL2R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _F1R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _F0R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _L4R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _L5R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
+
+class _L6R<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstXCore<outs, ins, asmstr, pattern> {
+ let Inst{31-0} = 0;
+}
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
new file mode 100644
index 0000000..504d202
--- /dev/null
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -0,0 +1,524 @@
+//===- XCoreInstrInfo.cpp - XCore Instruction Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreInstrInfo.h"
+#include "XCore.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "XCoreGenInstrInfo.inc"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+namespace XCore {
+
+ // XCore Condition Codes
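+  // XCore conditional branches test a boolean register, so the only
+  // conditions are "register is true" and "register is false".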
+ enum CondCode {
+ COND_TRUE,
+ COND_FALSE,
+ COND_INVALID
+ };
+}
+}
+
+using namespace llvm;
+
+XCoreInstrInfo::XCoreInstrInfo(void)
+ : TargetInstrInfoImpl(XCoreInsts, array_lengthof(XCoreInsts)),
+ RI(*this) {
+}
+
+static bool isZeroImm(const MachineOperand &op) {
+ return op.isImm() && op.getImm() == 0;
+}
+
+/// Return true if the instruction is a register-to-register move and
+/// leave the source and dest operands in the passed parameters.
+///
+bool XCoreInstrInfo::isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSR, unsigned &DstSR) const {
+ SrcSR = DstSR = 0; // No sub-registers.
+
+ // We look for 4 kinds of patterns here:
+ // add dst, src, 0
+ // sub dst, src, 0
+ // or dst, src, src
+ // and dst, src, src
+ if ((MI.getOpcode() == XCore::ADD_2rus || MI.getOpcode() == XCore::SUB_2rus)
+ && isZeroImm(MI.getOperand(2))) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ } else if ((MI.getOpcode() == XCore::OR_3r || MI.getOpcode() == XCore::AND_3r)
+ && MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ }
+ return false;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned
+XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const{
+ int Opcode = MI->getOpcode();
+ if (Opcode == XCore::LDWFI)
+ {
+    if ((MI->getOperand(1).isFI()) &&   // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the offset is an immediate
+        (isZeroImm(MI->getOperand(2)))) // and that offset is zero
+ {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ }
+ return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stack slot being stored
+/// to. If not, return 0. This predicate must return 0 if the instruction
+/// has any side effects other than storing to the stack slot.
+unsigned
+XCoreInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ int Opcode = MI->getOpcode();
+ if (Opcode == XCore::STWFI)
+ {
+    if ((MI->getOperand(1).isFI()) &&   // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the offset is an immediate
+        (isZeroImm(MI->getOperand(2)))) // and that offset is zero
+ {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ }
+ return 0;
+}
+
+/// isInvariantLoad - Return true if the specified instruction (which is marked
+/// mayLoad) is loading from a location whose value is invariant across the
+/// function. For example, loading a value from the constant pool or from
+/// the argument area of a function, if it does not change. This should
+/// only return true if *all* loads the instruction does are invariant (if it
+/// does multiple loads).
+bool
+XCoreInstrInfo::isInvariantLoad(const MachineInstr *MI) const {
+  // Loads from constant pools and loads from invariant argument slots are
+  // invariant
+ int Opcode = MI->getOpcode();
+ if (Opcode == XCore::LDWCP_ru6 || Opcode == XCore::LDWCP_lru6) {
+ return MI->getOperand(1).isCPI();
+ }
+ int FrameIndex;
+ if (isLoadFromStackSlot(MI, FrameIndex)) {
+ const MachineFrameInfo &MFI =
+ *MI->getParent()->getParent()->getFrameInfo();
+ return MFI.isFixedObjectIndex(FrameIndex) &&
+ MFI.isImmutableObjectIndex(FrameIndex);
+ }
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
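+// Branch opcode predicates: IsBRU matches the unconditional branches,
+// IsBRT the branches taken when the condition register is true, and IsBRF
+// those taken when it is false (forward/backward, short/long encodings).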
+static inline bool IsBRU(unsigned BrOpc) {
+ return BrOpc == XCore::BRFU_u6
+ || BrOpc == XCore::BRFU_lu6
+ || BrOpc == XCore::BRBU_u6
+ || BrOpc == XCore::BRBU_lu6;
+}
+
+static inline bool IsBRT(unsigned BrOpc) {
+ return BrOpc == XCore::BRFT_ru6
+ || BrOpc == XCore::BRFT_lru6
+ || BrOpc == XCore::BRBT_ru6
+ || BrOpc == XCore::BRBT_lru6;
+}
+
+static inline bool IsBRF(unsigned BrOpc) {
+ return BrOpc == XCore::BRFF_ru6
+ || BrOpc == XCore::BRFF_lru6
+ || BrOpc == XCore::BRBF_ru6
+ || BrOpc == XCore::BRBF_lru6;
+}
+
+static inline bool IsCondBranch(unsigned BrOpc) {
+ return IsBRF(BrOpc) || IsBRT(BrOpc);
+}
+
+/// GetCondFromBranchOpc - Return the XCore CC that matches
+/// the corresponding branch instruction opcode.
+static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc)
+{
+ if (IsBRT(BrOpc)) {
+ return XCore::COND_TRUE;
+ } else if (IsBRF(BrOpc)) {
+ return XCore::COND_FALSE;
+ } else {
+ return XCore::COND_INVALID;
+ }
+}
+
+/// GetCondBranchFromCond - Return the Branch instruction
+/// opcode that matches the cc.
+static inline unsigned GetCondBranchFromCond(XCore::CondCode CC)
+{
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case XCore::COND_TRUE : return XCore::BRFT_lru6;
+ case XCore::COND_FALSE : return XCore::BRFF_lru6;
+ }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified
+/// condition, e.g. turning COND_TRUE into COND_FALSE.
+static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
+{
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case XCore::COND_TRUE : return XCore::COND_FALSE;
+ case XCore::COND_FALSE : return XCore::COND_TRUE;
+ }
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target). Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+/// just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+/// the destination block.
+/// 3. If this block ends with a conditional branch and it falls through to
+///    a successor block, it sets TBB to be the branch destination block and a
+/// list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+/// 4. If this block ends with a conditional branch and an unconditional
+///    branch, it returns the 'true' destination in TBB, the 'false' destination
+/// in FBB, and a list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+///
+/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool
+XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (IsBRU(LastInst->getOpcode())) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ XCore::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+ if (BranchCode == XCore::COND_INVALID)
+ return true; // Can't handle indirect branch.
+
+ // Conditional branch
+ // Block ends with fall-through condbranch.
+
+ TBB = LastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ Cond.push_back(LastInst->getOperand(0));
+ return false;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() &&
+ isUnpredicatedTerminator(--I))
+ return true;
+
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+ XCore::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc);
+
+ // If the block ends with conditional branch followed by unconditional,
+ // handle it.
+ if (BranchCode != XCore::COND_INVALID
+ && IsBRU(LastInst->getOpcode())) {
+
+ TBB = SecondLastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ Cond.push_back(SecondLastInst->getOperand(0));
+
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed, so remove it.
+ if (IsBRU(SecondLastInst->getOpcode()) &&
+ IsBRU(LastInst->getOpcode())) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned
+XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond)const{
+ // FIXME there should probably be a DebugLoc argument here
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "Unexpected number of components!");
+
+ if (FBB == 0) { // One way branch.
+ if (Cond.empty()) {
+ // Unconditional branch
+ BuildMI(&MBB, dl, get(XCore::BRFU_lu6)).addMBB(TBB);
+ } else {
+ // Conditional branch.
+ unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+ BuildMI(&MBB, dl, get(Opc)).addReg(Cond[1].getReg())
+ .addMBB(TBB);
+ }
+ return 1;
+ }
+
+ // Two-way Conditional branch.
+ assert(Cond.size() == 2 && "Unexpected number of components!");
+ unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+ BuildMI(&MBB, dl, get(Opc)).addReg(Cond[1].getReg())
+ .addMBB(TBB);
+ BuildMI(&MBB, dl, get(XCore::BRFU_lu6)).addMBB(FBB);
+ return 2;
+}
+
+unsigned
+XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode()))
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (!IsCondBranch(I->getOpcode()))
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+bool XCoreInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (DestRC == SrcRC) {
+ if (DestRC == XCore::GRRegsRegisterClass) {
+ BuildMI(MBB, I, DL, get(XCore::ADD_2rus), DestReg)
+ .addReg(SrcReg)
+ .addImm(0);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ if (SrcRC == XCore::RRegsRegisterClass && SrcReg == XCore::SP &&
+ DestRC == XCore::GRRegsRegisterClass) {
+ BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg)
+ .addImm(0);
+ return true;
+ }
+ if (DestRC == XCore::RRegsRegisterClass && DestReg == XCore::SP &&
+ SrcRC == XCore::GRRegsRegisterClass) {
+ BuildMI(MBB, I, DL, get(XCore::SETSP_1r))
+ .addReg(SrcReg);
+ return true;
+ }
+ return false;
+}
+
+void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC) const
+{
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ BuildMI(MBB, I, DL, get(XCore::STWFI))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FrameIndex)
+ .addImm(0);
+}
+
+void XCoreInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill, SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const
+{
+ assert(0 && "unimplemented\n");
+}
+
+void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const
+{
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg)
+ .addFrameIndex(FrameIndex)
+ .addImm(0);
+}
+
+void XCoreInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const
+{
+ assert(0 && "unimplemented\n");
+}
+
+bool XCoreInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const
+{
+ if (CSI.empty()) {
+ return true;
+ }
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo *MFI = MF->getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ XCoreFunctionInfo *XFI = MF->getInfo<XCoreFunctionInfo>();
+
+ bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+ it != CSI.end(); ++it) {
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(it->getReg());
+
+ storeRegToStackSlot(MBB, MI, it->getReg(), true,
+ it->getFrameIdx(), it->getRegClass());
+ if (emitFrameMoves) {
+ unsigned SaveLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MI, DL, get(XCore::DBG_LABEL)).addImm(SaveLabelId);
+ XFI->getSpillLabels().push_back(
+ std::pair<unsigned, CalleeSavedInfo>(SaveLabelId, *it));
+ }
+ }
+ return true;
+}
+
+bool XCoreInstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const
+{
+ bool AtStart = MI == MBB.begin();
+ MachineBasicBlock::iterator BeforeI = MI;
+ if (!AtStart)
+ --BeforeI;
+ for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+ it != CSI.end(); ++it) {
+
+ loadRegFromStackSlot(MBB, MI, it->getReg(),
+ it->getFrameIdx(),
+ it->getRegClass());
+ assert(MI != MBB.begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert multiple
+ // instructions.
+ if (AtStart)
+ MI = MBB.begin();
+ else {
+ MI = BeforeI;
+ ++MI;
+ }
+ }
+ return true;
+}
+
+/// BlockHasNoFallThrough - Return true if the MachineBasicBlock cannot
+/// fall through into its successor block.
+bool XCoreInstrInfo::
+BlockHasNoFallThrough(const MachineBasicBlock &MBB) const
+{
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case XCore::RETSP_u6: // Return.
+ case XCore::RETSP_lu6:
+ case XCore::BAU_1r: // Indirect branch.
+ case XCore::BRFU_u6: // Uncond branch.
+ case XCore::BRFU_lu6:
+ case XCore::BRBU_u6:
+ case XCore::BRBU_lu6:
+ return true;
+ default: return false;
+ }
+}
+
+/// ReverseBranchCondition - Invert the given branch condition in place;
+/// returns false since XCore branch conditions are always reversible.
+bool XCoreInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
+{
+ assert((Cond.size() == 2) &&
+ "Invalid XCore branch condition!");
+ Cond[0].setImm(GetOppositeBranchCondition((XCore::CondCode)Cond[0].getImm()));
+ return false;
+}
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
new file mode 100644
index 0000000..0870886
--- /dev/null
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -0,0 +1,110 @@
+//===- XCoreInstrInfo.h - XCore Instruction Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREINSTRUCTIONINFO_H
+#define XCOREINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "XCoreRegisterInfo.h"
+
+namespace llvm {
+
+class XCoreInstrInfo : public TargetInstrInfoImpl {
+ const XCoreRegisterInfo RI;
+public:
+ XCoreInstrInfo(void);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// load from a stack slot, return the virtual or physical register number of
+ /// the destination along with the FrameIndex of the loaded stack slot. If
+ /// not, return 0. This predicate must return 0 if the instruction has
+ /// any side effects other than loading from the stack slot.
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the stack slot being
+  /// stored to. If not, return 0. This predicate must return 0 if the
+  /// instruction has
+ /// any side effects other than storing to the stack slot.
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual bool isInvariantLoad(const MachineInstr *MI) const;
+
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+
+ virtual bool ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
new file mode 100644
index 0000000..65cd4fe
--- /dev/null
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -0,0 +1,991 @@
+//===- XCoreInstrInfo.td - Target Description for XCore ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the XCore instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Uses of CP and DP are not currently reflected in the patterns: having a
+// physical register as an operand prevents loop hoisting, and the values of
+// these registers never change during the life of a function anyway.
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass.
+//===----------------------------------------------------------------------===//
+
+include "XCoreInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// HasXS1A - This predicate is true when the target processor supports XS1A
+// instructions.
+def HasXS1A : Predicate<"Subtarget.isXS1A()">;
+
+// HasXS1B - This predicate is true when the target processor supports XS1B
+// instructions.
+def HasXS1B : Predicate<"Subtarget.isXS1B()">;
+
+//===----------------------------------------------------------------------===//
+// XCore specific DAG Nodes.
+//
+
+// Call
+def SDT_XCoreBranchLink : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def SDT_XCoreAddress : SDTypeProfile<1, 1,
+ [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def pcrelwrapper : SDNode<"XCoreISD::PCRelativeWrapper", SDT_XCoreAddress,
+ []>;
+
+def dprelwrapper : SDNode<"XCoreISD::DPRelativeWrapper", SDT_XCoreAddress,
+ []>;
+
+def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
+ []>;
+
+def SDT_XCoreStwsp : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
+def XCoreStwsp : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
+ [SDNPHasChain]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_XCoreCallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def div4_xform : SDNodeXForm<imm, [{
+ // Transformation function: imm/4
+ assert(N->getZExtValue() % 4 == 0);
+ return getI32Imm(N->getZExtValue()/4);
+}]>;
+
+def msksize_xform : SDNodeXForm<imm, [{
+ // Transformation function: get the size of a mask
+ assert(isMask_32(N->getZExtValue()));
+ // look for the first non-zero bit
+ return getI32Imm(32 - CountLeadingZeros_32(N->getZExtValue()));
+}]>;
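+// For illustration: 0x00ff has 24 leading zeros, so msksize_xform yields
+// 32 - 24 = 8, the bit width consumed by the ZEXT_rus 'and' pattern below.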
+
+def neg_xform : SDNodeXForm<imm, [{
+ // Transformation function: -imm
+ uint32_t value = N->getZExtValue();
+ return getI32Imm(-value);
+}]>;
+
+def div4neg_xform : SDNodeXForm<imm, [{
+ // Transformation function: -imm/4
+ uint32_t value = N->getZExtValue();
+ assert(-value % 4 == 0);
+ return getI32Imm(-value/4);
+}]>;
+
+def immUs4Neg : PatLeaf<(imm), [{
+ uint32_t value = (uint32_t)N->getZExtValue();
+ return (-value)%4 == 0 && (-value)/4 <= 11;
+}]>;
+
+def immUs4 : PatLeaf<(imm), [{
+ uint32_t value = (uint32_t)N->getZExtValue();
+ return value%4 == 0 && value/4 <= 11;
+}]>;
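+// Worked example: the byte offset 44 passes immUs4 (44 % 4 == 0, 44/4 == 11)
+// and div4_xform above scales it to the word operand 11, while 45 (misaligned)
+// and 48 (48/4 == 12 > 11) are rejected.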
+
+def immUsNeg : PatLeaf<(imm), [{
+ return -((uint32_t)N->getZExtValue()) <= 11;
+}]>;
+
+def immUs : PatLeaf<(imm), [{
+ return (uint32_t)N->getZExtValue() <= 11;
+}]>;
+
+def immU6 : PatLeaf<(imm), [{
+ return (uint32_t)N->getZExtValue() < (1 << 6);
+}]>;
+
+def immU10 : PatLeaf<(imm), [{
+ return (uint32_t)N->getZExtValue() < (1 << 10);
+}]>;
+
+def immU16 : PatLeaf<(imm), [{
+ return (uint32_t)N->getZExtValue() < (1 << 16);
+}]>;
+
+def immU20 : PatLeaf<(imm), [{
+ return (uint32_t)N->getZExtValue() < (1 << 20);
+}]>;
+
+// FIXME check subtarget. Currently we check if the immediate
+// is in the common subset of legal immediate values for both
+// XS1A and XS1B.
+def immMskBitp : PatLeaf<(imm), [{
+ uint32_t value = (uint32_t)N->getZExtValue();
+ if (!isMask_32(value)) {
+ return false;
+ }
+ int msksize = 32 - CountLeadingZeros_32(value);
+ return (msksize >= 1 && msksize <= 8)
+ || msksize == 16
+ || msksize == 24
+ || msksize == 32;
+}]>;
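+// E.g. 0x7f (msksize 7) and 0xffff (msksize 16) are accepted, while
+// 0x1ff (msksize 9) and 0x7f00 (not a low mask) are rejected.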
+
+// FIXME check subtarget. Currently we check if the immediate
+// is in the common subset of legal immediate values for both
+// XS1A and XS1B.
+def immBitp : PatLeaf<(imm), [{
+ uint32_t value = (uint32_t)N->getZExtValue();
+ return (value >= 1 && value <= 8)
+ || value == 16
+ || value == 24
+ || value == 32;
+}]>;
+
+def lda16f : PatFrag<(ops node:$addr, node:$offset),
+ (add node:$addr, (shl node:$offset, 1))>;
+def lda16b : PatFrag<(ops node:$addr, node:$offset),
+ (sub node:$addr, (shl node:$offset, 1))>;
+def ldawf : PatFrag<(ops node:$addr, node:$offset),
+ (add node:$addr, (shl node:$offset, 2))>;
+def ldawb : PatFrag<(ops node:$addr, node:$offset),
+ (sub node:$addr, (shl node:$offset, 2))>;
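+// These fragments match scaled address arithmetic: lda16f/lda16b are
+// addr +/- 2*offset (halfwords) and ldawf/ldawb are addr +/- 4*offset
+// (words), mirroring the lda16/ldaw address instructions defined below.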
+
+// Instruction operand types
+def calltarget : Operand<i32>;
+def brtarget : Operand<OtherVT>;
+def pclabel : Operand<i32>;
+
+// Addressing modes
+def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
+def ADDRdpii : ComplexPattern<i32, 2, "SelectADDRdpii", [add, dprelwrapper],
+ []>;
+def ADDRcpii : ComplexPattern<i32, 2, "SelectADDRcpii", [add, cprelwrapper],
+ []>;
+
+// Address operands
+def MEMii : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+
+multiclass F3R_2RUS<string OpcStr, SDNode OpNode> {
+ def _3r: _F3R<
+ (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+ def _2rus : _F2RUS<
+ (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+multiclass F3R_2RUS_np<string OpcStr> {
+ def _3r: _F3R<
+ (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ []>;
+ def _2rus : _F2RUS<
+ (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ []>;
+}
+
+multiclass F3R_2RBITP<string OpcStr, SDNode OpNode> {
+ def _3r: _F3R<
+ (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+ def _2rus : _F2RUS<
+ (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class F3R<string OpcStr, SDNode OpNode> : _F3R<
+ (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+class F3R_np<string OpcStr> : _F3R<
+ (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ []>;
+// Three operand long
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RUS<string OpcStr, SDNode OpNode> {
+ def _l3r: _FL3R<
+ (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+ def _l2rus : _FL2RUS<
+ (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RBITP<string OpcStr, SDNode OpNode> {
+ def _l3r: _FL3R<
+ (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+ def _l2rus : _FL2RUS<
+ (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class FL3R<string OpcStr, SDNode OpNode> : _FL3R<
+ (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+ !strconcat(OpcStr, " $dst, $b, $c"),
+ [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+// Register - U6
+// Operand register - U6
+multiclass FRU6_LRU6_branch<string OpcStr> {
+ def _ru6: _FRU6<
+ (outs), (ins GRRegs:$cond, brtarget:$dest),
+ !strconcat(OpcStr, " $cond, $dest"),
+ []>;
+ def _lru6: _FLRU6<
+ (outs), (ins GRRegs:$cond, brtarget:$dest),
+ !strconcat(OpcStr, " $cond, $dest"),
+ []>;
+}
+
+multiclass FRU6_LRU6_cp<string OpcStr> {
+ def _ru6: _FRU6<
+ (outs GRRegs:$dst), (ins i32imm:$a),
+ !strconcat(OpcStr, " $dst, cp[$a]"),
+ []>;
+ def _lru6: _FLRU6<
+ (outs GRRegs:$dst), (ins i32imm:$a),
+ !strconcat(OpcStr, " $dst, cp[$a]"),
+ []>;
+}
+
+// U6
+multiclass FU6_LU6<string OpcStr, SDNode OpNode> {
+ def _u6: _FU6<
+ (outs), (ins i32imm:$b),
+ !strconcat(OpcStr, " $b"),
+ [(OpNode immU6:$b)]>;
+ def _lu6: _FLU6<
+ (outs), (ins i32imm:$b),
+ !strconcat(OpcStr, " $b"),
+ [(OpNode immU16:$b)]>;
+}
+
+multiclass FU6_LU6_np<string OpcStr> {
+ def _u6: _FU6<
+ (outs), (ins i32imm:$b),
+ !strconcat(OpcStr, " $b"),
+ []>;
+ def _lu6: _FLU6<
+ (outs), (ins i32imm:$b),
+ !strconcat(OpcStr, " $b"),
+ []>;
+}
+
+// U10
+multiclass FU10_LU10_np<string OpcStr> {
+ def _u10: _FU10<
+ (outs), (ins i32imm:$b),
+ !strconcat(OpcStr, " $b"),
+ []>;
+ def _lu10: _FLU10<
+ (outs), (ins i32imm:$b),
+ !strconcat(OpcStr, " $b"),
+ []>;
+}
+
+// Two operand short
+
+class F2R_np<string OpcStr> : _F2R<
+ (outs GRRegs:$dst), (ins GRRegs:$b),
+ !strconcat(OpcStr, " $dst, $b"),
+ []>;
+
+// Two operand long
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
+ "${:comment} ADJCALLSTACKDOWN $amt",
+ [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "${:comment} ADJCALLSTACKUP $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
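+// These pseudos are expanded by eliminateCallFramePseudoInstr in
+// XCoreRegisterInfo.cpp: ADJCALLSTACKDOWN becomes 'extsp <amt>' and
+// ADJCALLSTACKUP becomes 'ldaw sp, sp[<amt>]'.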
+
+def LDWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+ "${:comment} LDWFI $dst, $addr",
+ [(set GRRegs:$dst, (load ADDRspii:$addr))]>;
+
+def LDAWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+ "${:comment} LDAWFI $dst, $addr",
+ [(set GRRegs:$dst, ADDRspii:$addr)]>;
+
+def STWFI : PseudoInstXCore<(outs), (ins GRRegs:$src, MEMii:$addr),
+ "${:comment} STWFI $src, $addr",
+ [(store GRRegs:$src, ADDRspii:$addr)]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded by the
+// scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1 in {
+ def SELECT_CC : PseudoInstXCore<(outs GRRegs:$dst),
+ (ins GRRegs:$cond, GRRegs:$T, GRRegs:$F),
+ "${:comment} SELECT_CC PSEUDO!",
+ [(set GRRegs:$dst,
+ (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+defm ADD : F3R_2RUS<"add", add>;
+defm SUB : F3R_2RUS<"sub", sub>;
+let neverHasSideEffects = 1 in {
+defm EQ : F3R_2RUS_np<"eq">;
+def LSS_3r : F3R_np<"lss">;
+def LSU_3r : F3R_np<"lsu">;
+}
+def AND_3r : F3R<"and", and>;
+def OR_3r : F3R<"or", or>;
+
+let mayLoad=1 in {
+def LDW_3r : _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+ "ldw $dst, $addr[$offset]",
+ []>;
+
+def LDW_2rus : _F2RUS<(outs GRRegs:$dst), (ins GRRegs:$addr, i32imm:$offset),
+ "ldw $dst, $addr[$offset]",
+ []>;
+
+def LD16S_3r : _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+ "ld16s $dst, $addr[$offset]",
+ []>;
+
+def LD8U_3r : _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+ "ld8u $dst, $addr[$offset]",
+ []>;
+}
+
+let mayStore=1 in {
+def STW_3r : _F3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+ "stw $val, $addr[$offset]",
+ []>;
+
+def STW_2rus : _F2RUS<(outs), (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset),
+ "stw $val, $addr[$offset]",
+ []>;
+}
+
+defm SHL : F3R_2RBITP<"shl", shl>;
+defm SHR : F3R_2RBITP<"shr", srl>;
+// TODO tsetr
+
+// Three operand long
+def LDAWF_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+ "ldaw $dst, $addr[$offset]",
+ [(set GRRegs:$dst, (ldawf GRRegs:$addr, GRRegs:$offset))]>;
+
+let neverHasSideEffects = 1 in
+def LDAWF_l2rus : _FL2RUS<(outs GRRegs:$dst),
+ (ins GRRegs:$addr, i32imm:$offset),
+ "ldaw $dst, $addr[$offset]",
+ []>;
+
+def LDAWB_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+ "ldaw $dst, $addr[-$offset]",
+ [(set GRRegs:$dst, (ldawb GRRegs:$addr, GRRegs:$offset))]>;
+
+let neverHasSideEffects = 1 in
+def LDAWB_l2rus : _FL2RUS<(outs GRRegs:$dst),
+ (ins GRRegs:$addr, i32imm:$offset),
+ "ldaw $dst, $addr[-$offset]",
+ []>;
+
+def LDA16F_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+ "lda16 $dst, $addr[$offset]",
+ [(set GRRegs:$dst, (lda16f GRRegs:$addr, GRRegs:$offset))]>;
+
+def LDA16B_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+ "lda16 $dst, $addr[-$offset]",
+ [(set GRRegs:$dst, (lda16b GRRegs:$addr, GRRegs:$offset))]>;
+
+def MUL_l3r : FL3R<"mul", mul>;
+// Instructions which may trap are marked as side effecting.
+let hasSideEffects = 1 in {
+def DIVS_l3r : FL3R<"divs", sdiv>;
+def DIVU_l3r : FL3R<"divu", udiv>;
+def REMS_l3r : FL3R<"rems", srem>;
+def REMU_l3r : FL3R<"remu", urem>;
+}
+def XOR_l3r : FL3R<"xor", xor>;
+defm ASHR : FL3R_L2RBITP<"ashr", sra>;
+// TODO crc32, crc8, inpw, outpw
+let mayStore=1 in {
+def ST16_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+ "st16 $val, $addr[$offset]",
+ []>;
+
+def ST8_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+ "st8 $val, $addr[$offset]",
+ []>;
+}
+
+// Four operand long
+let Predicates = [HasXS1B], Constraints = "$src1 = $dst1,$src2 = $dst2" in {
+def MACCU_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
+ GRRegs:$src4),
+ "maccu $dst1, $dst2, $src3, $src4",
+ []>;
+
+def MACCS_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
+ GRRegs:$src4),
+ "maccs $dst1, $dst2, $src3, $src4",
+ []>;
+}
+
+// Five operand long
+
+let Predicates = [HasXS1B] in {
+def LADD_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+ "ladd $dst1, $dst2, $src1, $src2, $src3",
+ []>;
+
+def LSUB_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+ "lsub $dst1, $dst2, $src1, $src2, $src3",
+ []>;
+
+def LDIV_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+ "ldiv $dst1, $dst2, $src1, $src2, $src3",
+ []>;
+}
+
+// Six operand long
+
+def LMUL_l6r : _L6R<(outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
+ GRRegs:$src4),
+ "lmul $dst1, $dst2, $src1, $src2, $src3, $src4",
+ []>;
+
+let Predicates = [HasXS1A] in
+def MACC_l6r : _L6R<(outs GRRegs:$dst1, GRRegs:$dst2),
+ (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
+ GRRegs:$src4),
+ "macc $dst1, $dst2, $src1, $src2, $src3, $src4",
+ []>;
+
+// Register - U6
+
+//let Uses = [DP] in ...
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def LDAWDP_ru6: _FRU6<(outs GRRegs:$dst), (ins MEMii:$a),
+ "ldaw $dst, dp[$a]",
+ []>;
+
+let isReMaterializable = 1 in
+def LDAWDP_lru6: _FLRU6<
+ (outs GRRegs:$dst), (ins MEMii:$a),
+ "ldaw $dst, dp[$a]",
+ [(set GRRegs:$dst, ADDRdpii:$a)]>;
+
+let mayLoad=1 in
+def LDWDP_ru6: _FRU6<(outs GRRegs:$dst), (ins MEMii:$a),
+ "ldw $dst, dp[$a]",
+ []>;
+
+def LDWDP_lru6: _FLRU6<
+ (outs GRRegs:$dst), (ins MEMii:$a),
+ "ldw $dst, dp[$a]",
+ [(set GRRegs:$dst, (load ADDRdpii:$a))]>;
+
+let mayStore=1 in
+def STWDP_ru6 : _FRU6<(outs), (ins GRRegs:$val, MEMii:$addr),
+ "stw $val, dp[$addr]",
+ []>;
+
+def STWDP_lru6 : _FLRU6<(outs), (ins GRRegs:$val, MEMii:$addr),
+ "stw $val, dp[$addr]",
+ [(store GRRegs:$val, ADDRdpii:$addr)]>;
+
+//let Uses = [CP] in ..
+let mayLoad = 1, isReMaterializable = 1 in
+defm LDWCP : FRU6_LRU6_cp<"ldw">;
+
+let Uses = [SP] in {
+let mayStore=1 in {
+def STWSP_ru6 : _FRU6<
+ (outs), (ins GRRegs:$val, i32imm:$index),
+ "stw $val, sp[$index]",
+ [(XCoreStwsp GRRegs:$val, immU6:$index)]>;
+
+def STWSP_lru6 : _FLRU6<
+ (outs), (ins GRRegs:$val, i32imm:$index),
+ "stw $val, sp[$index]",
+ [(XCoreStwsp GRRegs:$val, immU16:$index)]>;
+}
+
+let mayLoad=1 in {
+def LDWSP_ru6 : _FRU6<
+ (outs GRRegs:$dst), (ins i32imm:$b),
+ "ldw $dst, sp[$b]",
+ []>;
+
+def LDWSP_lru6 : _FLRU6<
+ (outs GRRegs:$dst), (ins i32imm:$b),
+ "ldw $dst, sp[$b]",
+ []>;
+}
+
+let neverHasSideEffects = 1 in {
+def LDAWSP_ru6 : _FRU6<
+ (outs GRRegs:$dst), (ins i32imm:$b),
+ "ldaw $dst, sp[$b]",
+ []>;
+
+def LDAWSP_lru6 : _FLRU6<
+ (outs GRRegs:$dst), (ins i32imm:$b),
+ "ldaw $dst, sp[$b]",
+ []>;
+
+def LDAWSP_ru6_RRegs : _FRU6<
+ (outs RRegs:$dst), (ins i32imm:$b),
+ "ldaw $dst, sp[$b]",
+ []>;
+
+def LDAWSP_lru6_RRegs : _FLRU6<
+ (outs RRegs:$dst), (ins i32imm:$b),
+ "ldaw $dst, sp[$b]",
+ []>;
+}
+}
+
+let isReMaterializable = 1 in {
+def LDC_ru6 : _FRU6<
+ (outs GRRegs:$dst), (ins i32imm:$b),
+ "ldc $dst, $b",
+ [(set GRRegs:$dst, immU6:$b)]>;
+
+def LDC_lru6 : _FLRU6<
+ (outs GRRegs:$dst), (ins i32imm:$b),
+ "ldc $dst, $b",
+ [(set GRRegs:$dst, immU16:$b)]>;
+}
+
+// Operand register - U6
+// TODO setc
+let isBranch = 1, isTerminator = 1 in {
+defm BRFT: FRU6_LRU6_branch<"bt">;
+defm BRBT: FRU6_LRU6_branch<"bt">;
+defm BRFF: FRU6_LRU6_branch<"bf">;
+defm BRBF: FRU6_LRU6_branch<"bf">;
+}
+
+// U6
+let Defs = [SP], Uses = [SP] in {
+let neverHasSideEffects = 1 in
+defm EXTSP : FU6_LU6_np<"extsp">;
+let mayStore = 1 in
+defm ENTSP : FU6_LU6_np<"entsp">;
+
+let isReturn = 1, isTerminator = 1, mayLoad = 1 in {
+defm RETSP : FU6_LU6<"retsp", XCoreRetsp>;
+}
+}
+
+// TODO extdp, kentsp, krestsp, blat, setsr
+// clrsr, getsr, kalli
+let isBranch = 1, isTerminator = 1 in {
+def BRBU_u6 : _FU6<
+ (outs),
+ (ins brtarget:$target),
+ "bu $target",
+ []>;
+
+def BRBU_lu6 : _FLU6<
+ (outs),
+ (ins brtarget:$target),
+ "bu $target",
+ []>;
+
+def BRFU_u6 : _FU6<
+ (outs),
+ (ins brtarget:$target),
+ "bu $target",
+ []>;
+
+def BRFU_lu6 : _FLU6<
+ (outs),
+ (ins brtarget:$target),
+ "bu $target",
+ []>;
+}
+
+//let Uses = [CP] in ...
+let Predicates = [HasXS1B], Defs = [R11], neverHasSideEffects = 1,
+ isReMaterializable = 1 in
+def LDAWCP_u6: _FRU6<(outs), (ins MEMii:$a),
+ "ldaw r11, cp[$a]",
+ []>;
+
+let Predicates = [HasXS1B], Defs = [R11], isReMaterializable = 1 in
+def LDAWCP_lu6: _FLRU6<
+ (outs), (ins MEMii:$a),
+ "ldaw r11, cp[$a]",
+ [(set R11, ADDRcpii:$a)]>;
+
+// U10
+// TODO ldwcpl, blacp
+
+let Defs = [R11], isReMaterializable = 1, neverHasSideEffects = 1 in
+def LDAP_u10 : _FU10<
+ (outs),
+ (ins i32imm:$addr),
+ "ldap r11, $addr",
+ []>;
+
+let Defs = [R11], isReMaterializable = 1 in
+def LDAP_lu10 : _FLU10<
+ (outs),
+ (ins i32imm:$addr),
+ "ldap r11, $addr",
+ [(set R11, (pcrelwrapper tglobaladdr:$addr))]>;
+
+let isCall=1,
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR] in {
+def BL_u10 : _FU10<
+ (outs),
+ (ins calltarget:$target, variable_ops),
+ "bl $target",
+ [(XCoreBranchLink immU10:$target)]>;
+
+def BL_lu10 : _FLU10<
+ (outs),
+ (ins calltarget:$target, variable_ops),
+ "bl $target",
+ [(XCoreBranchLink immU20:$target)]>;
+}
+
+// Two operand short
+// TODO getr, getst
+def NOT : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b),
+ "not $dst, $b",
+ [(set GRRegs:$dst, (not GRRegs:$b))]>;
+
+def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b),
+ "neg $dst, $b",
+ [(set GRRegs:$dst, (ineg GRRegs:$b))]>;
+
+// TODO setd, eet, eef, getts, setpt, outct, inct, chkct, outt, intt, out,
+// in, outshr, inshr, testct, testwct, tinitpc, tinitdp, tinitsp, tinitcp,
+// tsetmr, sext (reg), zext (reg)
+let isTwoAddress = 1 in {
+let neverHasSideEffects = 1 in
+def SEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+ "sext $dst, $src2",
+ []>;
+
+let neverHasSideEffects = 1 in
+def ZEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+ "zext $dst, $src2",
+ []>;
+
+def ANDNOT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+ "andnot $dst, $src2",
+ [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>;
+}
+
+let isReMaterializable = 1, neverHasSideEffects = 1 in
+def MKMSK_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$size),
+ "mkmsk $dst, $size",
+ []>;
+
+def MKMSK_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$size),
+ "mkmsk $dst, $size",
+ [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), 0xffffffff))]>;
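+// The pattern above computes (1 << size) + 0xffffffff, i.e. (1 << size) - 1
+// in 32-bit arithmetic: for size == 4 this is 0x0000000f, the mask that
+// mkmsk builds.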
+
+// Two operand long
+// TODO settw, setclk, setrdy, setpsc, endin, peek,
+// getd, testlcl, tinitlr, getps, setps
+def BITREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+ "bitrev $dst, $src",
+ [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>;
+
+def BYTEREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+ "byterev $dst, $src",
+ [(set GRRegs:$dst, (bswap GRRegs:$src))]>;
+
+def CLZ_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+ "clz $dst, $src",
+ [(set GRRegs:$dst, (ctlz GRRegs:$src))]>;
+
+// One operand short
+// TODO edu, eeu, waitet, waitef, freer, tstart, msync, mjoin, syncr, clrtp
+// bru, setdp, setcp, setv, setev, kcall
+// dgetreg
+let isBranch=1, isIndirectBranch=1, isTerminator=1 in
+def BAU_1r : _F1R<(outs), (ins GRRegs:$addr),
+ "bau $addr",
+ [(brind GRRegs:$addr)]>;
+
+let Defs=[SP], neverHasSideEffects=1 in
+def SETSP_1r : _F1R<(outs), (ins GRRegs:$src),
+ "set sp, $src",
+ []>;
+
+let isBarrier = 1, hasCtrlDep = 1 in
+def ECALLT_1r : _F1R<(outs), (ins GRRegs:$src),
+ "ecallt $src",
+ []>;
+
+let isBarrier = 1, hasCtrlDep = 1 in
+def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src),
+ "ecallf $src",
+ []>;
+
+let isCall=1,
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR] in {
+def BLA_1r : _F1R<(outs), (ins GRRegs:$addr, variable_ops),
+ "bla $addr",
+ [(XCoreBranchLink GRRegs:$addr)]>;
+}
+
+// Zero operand short
+// TODO waiteu, clre, ssync, freet, ldspc, stspc, ldssr, stssr, ldsed, stsed,
+// stet, geted, getet, getkep, getksp, setkep, getid, kret, dcall, dret,
+// dentsp, drestsp
+
+let Defs = [R11] in
+def GETID_0R : _F0R<(outs), (ins),
+ "get r11, id",
+ [(set R11, (int_xcore_getid))]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(XCoreBranchLink tglobaladdr:$addr), (BL_lu10 tglobaladdr:$addr)>;
+def : Pat<(XCoreBranchLink texternalsym:$addr), (BL_lu10 texternalsym:$addr)>;
+
+/// sext_inreg
+def : Pat<(sext_inreg GRRegs:$b, i1), (SEXT_rus GRRegs:$b, 1)>;
+def : Pat<(sext_inreg GRRegs:$b, i8), (SEXT_rus GRRegs:$b, 8)>;
+def : Pat<(sext_inreg GRRegs:$b, i16), (SEXT_rus GRRegs:$b, 16)>;
+
+/// loads
+def : Pat<(zextloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+ (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(zextloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
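+// No reg+imm form is defined here for the byte/halfword loads, so a plain
+// address uses the reg+reg form with a zero offset materialized by 'ldc 0'.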
+
+def : Pat<(sextloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(sextloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(load (ldawf GRRegs:$addr, GRRegs:$offset)),
+ (LDW_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(load (add GRRegs:$addr, immUs4:$offset)),
+ (LDW_2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(load GRRegs:$addr), (LDW_2rus GRRegs:$addr, 0)>;
+
+/// anyext
+def : Pat<(extloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+ (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+def : Pat<(extloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+ (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+/// stores
+def : Pat<(truncstorei8 GRRegs:$val, (add GRRegs:$addr, GRRegs:$offset)),
+ (ST8_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei8 GRRegs:$val, GRRegs:$addr),
+ (ST8_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(truncstorei16 GRRegs:$val, (lda16f GRRegs:$addr, GRRegs:$offset)),
+ (ST16_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei16 GRRegs:$val, GRRegs:$addr),
+ (ST16_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(store GRRegs:$val, (ldawf GRRegs:$addr, GRRegs:$offset)),
+ (STW_3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(store GRRegs:$val, (add GRRegs:$addr, immUs4:$offset)),
+ (STW_2rus GRRegs:$val, GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(store GRRegs:$val, GRRegs:$addr),
+ (STW_2rus GRRegs:$val, GRRegs:$addr, 0)>;
+
+/// cttz
+def : Pat<(cttz GRRegs:$src), (CLZ_l2r (BITREV_l2r GRRegs:$src))>;
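+// Bit reversal turns trailing zeros into leading zeros: e.g.
+// cttz 0x8 = clz(bitrev(0x8)) = clz(0x10000000) = 3.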
+
+/// trap
+def : Pat<(trap), (ECALLF_1r (LDC_ru6 0))>;
+
+///
+/// branch patterns
+///
+
+// unconditional branch
+def : Pat<(br bb:$addr), (BRFU_lu6 bb:$addr)>;
+
+// direct match equal/notequal zero brcond
+def : Pat<(brcond (setne GRRegs:$lhs, 0), bb:$dst),
+ (BRFT_lru6 GRRegs:$lhs, bb:$dst)>;
+def : Pat<(brcond (seteq GRRegs:$lhs, 0), bb:$dst),
+ (BRFF_lru6 GRRegs:$lhs, bb:$dst)>;
+
+def : Pat<(brcond (setle GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (LSS_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setule GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (LSU_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (LSS_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setuge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (LSU_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+ (BRFF_lru6 (EQ_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, immUs:$rhs), bb:$dst),
+ (BRFF_lru6 (EQ_2rus GRRegs:$lhs, immUs:$rhs), bb:$dst)>;
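+// The compound conditions above reuse lss/lsu with swapped operands and a
+// branch-on-false: e.g. (setle $lhs, $rhs) branches when (lss $rhs, $lhs)
+// yields 0.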
+
+// generic brcond pattern
+def : Pat<(brcond GRRegs:$cond, bb:$addr), (BRFT_lru6 GRRegs:$cond, bb:$addr)>;
+
+
+///
+/// Select patterns
+///
+
+// direct match equal/notequal zero select
+def : Pat<(select (setne GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC GRRegs:$lhs, GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (seteq GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC GRRegs:$lhs, GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(select (setle GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (LSS_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setule GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (LSU_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (LSS_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setuge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (LSU_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (EQ_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, immUs:$rhs), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (EQ_2rus GRRegs:$lhs, immUs:$rhs), GRRegs:$F, GRRegs:$T)>;
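+// Note the operand swap: the inverted conditions (seteq, setle, ...) reuse
+// the same SELECT_CC pseudo by exchanging the true and false values.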
+
+///
+/// setcc patterns, only matched when none of the above brcond
+/// patterns match
+///
+
+// setcc 2 register operands
+def : Pat<(setle GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (LSS_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+def : Pat<(setule GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (LSU_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+
+def : Pat<(setgt GRRegs:$lhs, GRRegs:$rhs),
+ (LSS_3r GRRegs:$rhs, GRRegs:$lhs)>;
+def : Pat<(setugt GRRegs:$lhs, GRRegs:$rhs),
+ (LSU_3r GRRegs:$rhs, GRRegs:$lhs)>;
+
+def : Pat<(setge GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (LSS_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+def : Pat<(setuge GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (LSU_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(setlt GRRegs:$lhs, GRRegs:$rhs),
+ (LSS_3r GRRegs:$lhs, GRRegs:$rhs)>;
+def : Pat<(setult GRRegs:$lhs, GRRegs:$rhs),
+ (LSU_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+def : Pat<(setne GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_2rus (EQ_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
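+// setne is synthesized as eq(eq(lhs, rhs), 0), a double eq, presumably
+// because no direct not-equal instruction is available.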
+
+def : Pat<(seteq GRRegs:$lhs, GRRegs:$rhs),
+ (EQ_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+// setcc reg/imm operands
+def : Pat<(seteq GRRegs:$lhs, immUs:$rhs),
+ (EQ_2rus GRRegs:$lhs, immUs:$rhs)>;
+def : Pat<(setne GRRegs:$lhs, immUs:$rhs),
+ (EQ_2rus (EQ_2rus GRRegs:$lhs, immUs:$rhs), 0)>;
+
+// misc
+def : Pat<(add GRRegs:$addr, immUs4:$offset),
+ (LDAWF_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(sub GRRegs:$addr, immUs4:$offset),
+ (LDAWB_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(and GRRegs:$val, immMskBitp:$mask),
+ (ZEXT_rus GRRegs:$val, (msksize_xform immMskBitp:$mask))>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
+def : Pat<(add GRRegs:$src1, immUsNeg:$src2),
+ (SUB_2rus GRRegs:$src1, (neg_xform immUsNeg:$src2))>;
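+// E.g. (sub $x, 8) is canonicalized to (add $x, -8); immUsNeg accepts it
+// (-(-8) == 8 <= 11) and neg_xform recovers 8, selecting 'sub $x, 8'.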
+
+def : Pat<(add GRRegs:$src1, immUs4Neg:$src2),
+ (LDAWB_l2rus GRRegs:$src1, (div4neg_xform immUs4Neg:$src2))>;
+
+///
+/// Some peepholes
+///
+
+def : Pat<(mul GRRegs:$src, 3),
+ (LDA16F_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, 5),
+ (LDAWF_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, -3),
+ (LDAWB_l3r GRRegs:$src, GRRegs:$src)>;
+
+// ashr X, 32 is equivalent to ashr X, 31 on the XCore.
+def : Pat<(sra GRRegs:$src, 31),
+ (ASHR_l2rus GRRegs:$src, 32)>;
+
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h
new file mode 100644
index 0000000..43adb0f
--- /dev/null
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -0,0 +1,69 @@
+//====- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XCore-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREMACHINEFUNCTIONINFO_H
+#define XCOREMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include <vector>
+
+namespace llvm {
+
+// Forward declarations
+class Function;
+
+/// XCoreFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private XCore target-specific information for each MachineFunction.
+class XCoreFunctionInfo : public MachineFunctionInfo {
+private:
+ bool UsesLR;
+ int LRSpillSlot;
+ int FPSpillSlot;
+ int VarArgsFrameIndex;
+ std::vector<std::pair<unsigned, CalleeSavedInfo> > SpillLabels;
+
+public:
+ XCoreFunctionInfo() :
+ UsesLR(false),
+ LRSpillSlot(0),
+ FPSpillSlot(0),
+ VarArgsFrameIndex(0) {}
+
+ XCoreFunctionInfo(MachineFunction &MF) :
+ UsesLR(false),
+ LRSpillSlot(0),
+ FPSpillSlot(0),
+ VarArgsFrameIndex(0) {}
+
+ ~XCoreFunctionInfo() {}
+
+ void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+
+ void setUsesLR(bool val) { UsesLR = val; }
+ bool getUsesLR() const { return UsesLR; }
+
+ void setLRSpillSlot(int off) { LRSpillSlot = off; }
+ int getLRSpillSlot() const { return LRSpillSlot; }
+
+ void setFPSpillSlot(int off) { FPSpillSlot = off; }
+ int getFPSpillSlot() const { return FPSpillSlot; }
+
+  std::vector<std::pair<unsigned, CalleeSavedInfo> > &getSpillLabels() {
+ return SpillLabels;
+ }
+};
+} // End llvm namespace
+
+#endif // XCOREMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
new file mode 100644
index 0000000..82cd92d
--- /dev/null
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -0,0 +1,598 @@
+//===- XCoreRegisterInfo.cpp - XCore Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreRegisterInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCore.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+XCoreRegisterInfo::XCoreRegisterInfo(const TargetInstrInfo &tii)
+ : XCoreGenRegisterInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+ TII(tii) {
+}
+
+// helper functions
+static inline bool isImmUs(unsigned val) {
+ return val <= 11;
+}
+
+static inline bool isImmU6(unsigned val) {
+ return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+ return val < (1 << 16);
+}
+
+static const unsigned XCore_ArgRegs[] = {
+ XCore::R0, XCore::R1, XCore::R2, XCore::R3
+};
+
+const unsigned * XCoreRegisterInfo::getArgRegs(const MachineFunction *MF)
+{
+ return XCore_ArgRegs;
+}
+
+unsigned XCoreRegisterInfo::getNumArgRegs(const MachineFunction *MF)
+{
+ return array_lengthof(XCore_ArgRegs);
+}
+
+bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF)
+{
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ return (MMI && MMI->hasDebugInfo()) ||
+ !MF.getFunction()->doesNotThrow() ||
+ UnwindTablesMandatory;
+}
+
+const unsigned* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+ const {
+ static const unsigned CalleeSavedRegs[] = {
+ XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+ XCore::R8, XCore::R9, XCore::R10, XCore::LR,
+ 0
+ };
+ return CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+XCoreRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass,
+ XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass,
+ XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass,
+ XCore::GRRegsRegisterClass, XCore::RRegsRegisterClass,
+ 0
+ };
+ return CalleeSavedRegClasses;
+}
+
+BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ Reserved.set(XCore::CP);
+ Reserved.set(XCore::DP);
+ Reserved.set(XCore::SP);
+ Reserved.set(XCore::LR);
+ if (hasFP(MF)) {
+ Reserved.set(XCore::R10);
+ }
+ return Reserved;
+}
+
+bool
+XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+ // TODO can we estimate stack size?
+ return hasFP(MF);
+}
+
+bool XCoreRegisterInfo::hasFP(const MachineFunction &MF) const {
+ return NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void XCoreRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+ // Turn the adjcallstackdown instruction into 'extsp <amt>' and the
+ // adjcallstackup instruction into 'ldaw sp, sp[<amt>]'
+ MachineInstr *Old = I;
+ uint64_t Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ Amount = (Amount+Align-1)/Align*Align;
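+      // E.g. with Align == 4, an Amount of 6 rounds up to (6+3)/4*4 == 8.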
+
+ assert(Amount%4 == 0);
+ Amount /= 4;
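+      // SP-relative operands are encoded in words, so a 40-byte
+      // adjustment becomes the operand 10.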
+
+ bool isU6 = isImmU6(Amount);
+
+ if (!isU6 && !isImmU16(Amount)) {
+ // FIX could emit multiple instructions in this case.
+ cerr << "eliminateCallFramePseudoInstr size too big: "
+ << Amount << "\n";
+ abort();
+ }
+
+ MachineInstr *New;
+ if (Old->getOpcode() == XCore::ADJCALLSTACKDOWN) {
+ int Opcode = isU6 ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+ New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode))
+ .addImm(Amount);
+ } else {
+ assert(Old->getOpcode() == XCore::ADJCALLSTACKUP);
+ int Opcode = isU6 ? XCore::LDAWSP_ru6_RRegs : XCore::LDAWSP_lru6_RRegs;
+ New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode), XCore::SP)
+ .addImm(Amount);
+ }
+
+ // Replace the pseudo instruction with a new instruction...
+ MBB.insert(I, New);
+ }
+ }
+
+ MBB.erase(I);
+}
+
+void XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+ MachineInstr &MI = *II;
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned i = 0;
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ MachineOperand &FrameOp = MI.getOperand(i);
+ int FrameIndex = FrameOp.getIndex();
+
+ MachineFunction &MF = *MI.getParent()->getParent();
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+ int StackSize = MF.getFrameInfo()->getStackSize();
+
+ #ifndef NDEBUG
+ DOUT << "\nFunction : " << MF.getFunction()->getName() << "\n";
+ DOUT << "<--------->\n";
+ MI.print(DOUT);
+ DOUT << "FrameIndex : " << FrameIndex << "\n";
+ DOUT << "FrameOffset : " << Offset << "\n";
+ DOUT << "StackSize : " << StackSize << "\n";
+ #endif
+
+ Offset += StackSize;
+
+ // fold constant into offset.
+ Offset += MI.getOperand(i + 1).getImm();
+ MI.getOperand(i + 1).ChangeToImmediate(0);
+
+ assert(Offset%4 == 0 && "Misaligned stack offset");
+
+ #ifndef NDEBUG
+ DOUT << "Offset : " << Offset << "\n";
+ DOUT << "<--------->\n";
+ #endif
+
+ Offset/=4;
+
+ bool FP = hasFP(MF);
+
+ unsigned Reg = MI.getOperand(0).getReg();
+ bool isKill = MI.getOpcode() == XCore::STWFI && MI.getOperand(0).isKill();
+
+ assert(XCore::GRRegsRegisterClass->contains(Reg) &&
+ "Unexpected register operand");
+
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ if (FP) {
+ bool isUs = isImmUs(Offset);
+ unsigned FramePtr = XCore::R10;
+
+ MachineInstr *New = 0;
+ if (!isUs) {
+ if (!RS) {
+ cerr << "eliminateFrameIndex Frame size too big: " << Offset << "\n";
+ abort();
+ }
+ unsigned ScratchReg = RS->scavengeRegister(XCore::GRRegsRegisterClass, II,
+ SPAdj);
+ loadConstant(MBB, II, ScratchReg, Offset, dl);
+ switch (MI.getOpcode()) {
+ case XCore::LDWFI:
+ New = BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+ .addReg(FramePtr)
+ .addReg(ScratchReg, RegState::Kill);
+ break;
+ case XCore::STWFI:
+ New = BuildMI(MBB, II, dl, TII.get(XCore::STW_3r))
+ .addReg(Reg, getKillRegState(isKill))
+ .addReg(FramePtr)
+ .addReg(ScratchReg, RegState::Kill);
+ break;
+ case XCore::LDAWFI:
+ New = BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+ .addReg(FramePtr)
+ .addReg(ScratchReg, RegState::Kill);
+ break;
+ default:
+ assert(0 && "Unexpected Opcode\n");
+ }
+ } else {
+ switch (MI.getOpcode()) {
+ case XCore::LDWFI:
+ New = BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
+ .addReg(FramePtr)
+ .addImm(Offset);
+ break;
+ case XCore::STWFI:
+ New = BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
+ .addReg(Reg, getKillRegState(isKill))
+ .addReg(FramePtr)
+ .addImm(Offset);
+ break;
+ case XCore::LDAWFI:
+ New = BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
+ .addReg(FramePtr)
+ .addImm(Offset);
+ break;
+ default:
+ assert(0 && "Unexpected Opcode\n");
+ }
+ }
+ } else {
+ bool isU6 = isImmU6(Offset);
+ if (!isU6 && !isImmU16(Offset)) {
+ // FIXME could make this work for LDWSP, LDAWSP.
+ cerr << "eliminateFrameIndex Frame size too big: " << Offset << "\n";
+ abort();
+ }
+
+ switch (MI.getOpcode()) {
+ int NewOpcode;
+ case XCore::LDWFI:
+ NewOpcode = (isU6) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+ .addImm(Offset);
+ break;
+ case XCore::STWFI:
+ NewOpcode = (isU6) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode))
+ .addReg(Reg, getKillRegState(isKill))
+ .addImm(Offset);
+ break;
+ case XCore::LDAWFI:
+ NewOpcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+ .addImm(Offset);
+ break;
+ default:
+ assert(0 && "Unexpected Opcode\n");
+ }
+ }
+ // Erase old instruction.
+ MBB.erase(II);
+}
+
+void
+XCoreRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR);
+ const TargetRegisterClass *RC = XCore::GRRegsRegisterClass;
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ if (LRUsed) {
+ MF.getRegInfo().setPhysRegUnused(XCore::LR);
+
+ bool isVarArg = MF.getFunction()->isVarArg();
+ int FrameIdx;
+ if (! isVarArg) {
+ // A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
+ FrameIdx = MFI->CreateFixedObject(RC->getSize(), 0);
+ } else {
+ FrameIdx = MFI->CreateStackObject(RC->getSize(), RC->getAlignment());
+ }
+    XFI->setUsesLR(true);
+ XFI->setLRSpillSlot(FrameIdx);
+ }
+ if (requiresRegisterScavenging(MF)) {
+ // Reserve a slot close to SP or frame pointer.
+ RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+ RC->getAlignment()));
+ }
+ if (hasFP(MF)) {
+ // A callee save register is used to hold the FP.
+ // This needs saving / restoring in the epilogue / prologue.
+ XFI->setFPSpillSlot(MFI->CreateStackObject(RC->getSize(),
+ RC->getAlignment()));
+ }
+}
+
+void XCoreRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
+
+}
+
+void XCoreRegisterInfo::
+loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DstReg, int64_t Value, DebugLoc dl) const {
+ // TODO use mkmsk if possible.
+ if (!isImmU16(Value)) {
+ // TODO use constant pool.
+ cerr << "loadConstant value too big " << Value << "\n";
+ abort();
+ }
+ int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+ BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
+}
+
+void XCoreRegisterInfo::
+storeToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, int Offset, DebugLoc dl) const {
+ assert(Offset%4 == 0 && "Misaligned stack offset");
+ Offset/=4;
+ bool isU6 = isImmU6(Offset);
+ if (!isU6 && !isImmU16(Offset)) {
+ cerr << "storeToStack offset too big " << Offset << "\n";
+ abort();
+ }
+ int Opcode = isU6 ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+ BuildMI(MBB, I, dl, TII.get(Opcode))
+ .addReg(SrcReg)
+ .addImm(Offset);
+}
+
+void XCoreRegisterInfo::
+loadFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DstReg, int Offset, DebugLoc dl) const {
+ assert(Offset%4 == 0 && "Misaligned stack offset");
+ Offset/=4;
+ bool isU6 = isImmU6(Offset);
+ if (!isU6 && !isImmU16(Offset)) {
+ cerr << "loadFromStack offset too big " << Offset << "\n";
+ abort();
+ }
+ int Opcode = isU6 ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+ BuildMI(MBB, I, dl, TII.get(Opcode), DstReg)
+ .addImm(Offset);
+}
+
+void XCoreRegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+ bool FP = hasFP(MF);
+
+ // Work out frame sizes.
+ int FrameSize = MFI->getStackSize();
+
+ assert(FrameSize%4 == 0 && "Misaligned frame size");
+
+ FrameSize/=4;
+
+ bool isU6 = isImmU6(FrameSize);
+
+ if (!isU6 && !isImmU16(FrameSize)) {
+ // FIXME could emit multiple instructions.
+ cerr << "emitPrologue Frame size too big: " << FrameSize << "\n";
+ abort();
+ }
+ bool emitFrameMoves = needsFrameMoves(MF);
+
+ // Do we need to allocate space on the stack?
+ if (FrameSize) {
+ bool saveLR = XFI->getUsesLR();
+ bool LRSavedOnEntry = false;
+ int Opcode;
+ if (saveLR && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0)) {
+ Opcode = (isU6) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
+ MBB.addLiveIn(XCore::LR);
+ saveLR = false;
+ LRSavedOnEntry = true;
+ } else {
+ Opcode = (isU6) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+ }
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+
+ if (emitFrameMoves) {
+ std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+
+ // Show update of SP.
+ unsigned FrameLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addImm(FrameLabelId);
+
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize * 4);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+
+ if (LRSavedOnEntry) {
+ MachineLocation CSDst(MachineLocation::VirtualFP, 0);
+ MachineLocation CSSrc(XCore::LR);
+ Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+ }
+ }
+ if (saveLR) {
+ int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+ storeToStack(MBB, MBBI, XCore::LR, LRSpillOffset + FrameSize*4, dl);
+ MBB.addLiveIn(XCore::LR);
+
+ if (emitFrameMoves) {
+ unsigned SaveLRLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addImm(SaveLRLabelId);
+ MachineLocation CSDst(MachineLocation::VirtualFP, LRSpillOffset);
+ MachineLocation CSSrc(XCore::LR);
+ MMI->getFrameMoves().push_back(MachineMove(SaveLRLabelId,
+ CSDst, CSSrc));
+ }
+ }
+ }
+
+ if (FP) {
+ // Save R10 to the stack.
+ int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+ storeToStack(MBB, MBBI, XCore::R10, FPSpillOffset + FrameSize*4, dl);
+ // R10 is live-in. It is killed at the spill.
+ MBB.addLiveIn(XCore::R10);
+ if (emitFrameMoves) {
+ unsigned SaveR10LabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addImm(SaveR10LabelId);
+ MachineLocation CSDst(MachineLocation::VirtualFP, FPSpillOffset);
+ MachineLocation CSSrc(XCore::R10);
+ MMI->getFrameMoves().push_back(MachineMove(SaveR10LabelId,
+ CSDst, CSSrc));
+ }
+ // Set the FP from the SP.
+ unsigned FramePtr = XCore::R10;
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr)
+ .addImm(0);
+ if (emitFrameMoves) {
+ // Show FP is now valid.
+ unsigned FrameLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addImm(FrameLabelId);
+ MachineLocation SPDst(FramePtr);
+ MachineLocation SPSrc(MachineLocation::VirtualFP);
+ MMI->getFrameMoves().push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ }
+ }
+
+ if (emitFrameMoves) {
+ // Frame moves for callee saved.
+ std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+ std::vector<std::pair<unsigned, CalleeSavedInfo> >&SpillLabels =
+ XFI->getSpillLabels();
+ for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
+ unsigned SpillLabel = SpillLabels[I].first;
+ CalleeSavedInfo &CSI = SpillLabels[I].second;
+ int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
+ unsigned Reg = CSI.getReg();
+ MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+ MachineLocation CSSrc(Reg);
+ Moves.push_back(MachineMove(SpillLabel, CSDst, CSSrc));
+ }
+ }
+}
+
+void XCoreRegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ DebugLoc dl = MBBI->getDebugLoc();
+
+ bool FP = hasFP(MF);
+
+ if (FP) {
+ // Restore the stack pointer.
+ unsigned FramePtr = XCore::R10;
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r))
+ .addReg(FramePtr);
+ }
+
+ // Work out frame sizes.
+ int FrameSize = MFI->getStackSize();
+
+ assert(FrameSize%4 == 0 && "Misaligned frame size");
+
+ FrameSize/=4;
+
+ bool isU6 = isImmU6(FrameSize);
+
+ if (!isU6 && !isImmU16(FrameSize)) {
+ // FIXME could emit multiple instructions.
+ cerr << "emitEpilogue Frame size too big: " << FrameSize << "\n";
+ abort();
+ }
+
+ if (FrameSize) {
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+ if (FP) {
+ // Restore R10
+ int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+ FPSpillOffset += FrameSize*4;
+ loadFromStack(MBB, MBBI, XCore::R10, FPSpillOffset, dl);
+ }
+ bool restoreLR = XFI->getUsesLR();
+ if (restoreLR && MFI->getObjectOffset(XFI->getLRSpillSlot()) != 0) {
+ int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+ LRSpillOffset += FrameSize*4;
+ loadFromStack(MBB, MBBI, XCore::LR, LRSpillOffset, dl);
+ restoreLR = false;
+ }
+ if (restoreLR) {
+      // Fold epilogue into return instruction
+ assert(MBBI->getOpcode() == XCore::RETSP_u6
+ || MBBI->getOpcode() == XCore::RETSP_lu6);
+ int Opcode = (isU6) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+ MBB.erase(MBBI);
+ } else {
+ int Opcode = (isU6) ? XCore::LDAWSP_ru6_RRegs : XCore::LDAWSP_lru6_RRegs;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(FrameSize);
+ }
+ }
+}
+
+int XCoreRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ return XCoreGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+unsigned XCoreRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ bool FP = hasFP(MF);
+
+ return FP ? XCore::R10 : XCore::SP;
+}
+
+unsigned XCoreRegisterInfo::getRARegister() const {
+ return XCore::LR;
+}
+
+void XCoreRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+ const {
+ // Initial state of the frame pointer is SP.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(XCore::SP, 0);
+ Moves.push_back(MachineMove(0, Dst, Src));
+}
+
+#include "XCoreGenRegisterInfo.inc"
+
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
new file mode 100644
index 0000000..00b7caa
--- /dev/null
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -0,0 +1,94 @@
+//===- XCoreRegisterInfo.h - XCore Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREREGISTERINFO_H
+#define XCOREREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "XCoreGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
+private:
+ const TargetInstrInfo &TII;
+
+ void loadConstant(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg, int64_t Value, DebugLoc dl) const;
+
+ void storeToStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned SrcReg, int Offset, DebugLoc dl) const;
+
+ void loadFromStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg, int Offset, DebugLoc dl) const;
+
+public:
+ XCoreRegisterInfo(const TargetInstrInfo &tii);
+
+ /// Code Generation virtual methods...
+
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const* getCalleeSavedRegClasses(
+ const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+ void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+ //! Return the array of argument passing registers
+ /*!
+ \note The size of this array is returned by getArgRegsSize().
+ */
+ static const unsigned *getArgRegs(const MachineFunction *MF = 0);
+
+ //! Return the size of the argument passing register array
+ static unsigned getNumArgRegs(const MachineFunction *MF = 0);
+
+ //! Return whether to emit frame moves
+ static bool needsFrameMoves(const MachineFunction &MF);
+
+ //! Get DWARF debugging register number
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/XCoreRegisterInfo.td b/lib/Target/XCore/XCoreRegisterInfo.td
new file mode 100644
index 0000000..62daf5d
--- /dev/null
+++ b/lib/Target/XCore/XCoreRegisterInfo.td
@@ -0,0 +1,91 @@
+//===- XCoreRegisterInfo.td - XCore Register defs ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the XCore register file
+//===----------------------------------------------------------------------===//
+
+class XCoreReg<string n> : Register<n> {
+ field bits<4> Num;
+ let Namespace = "XCore";
+}
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<4> num, string n> : XCoreReg<n> {
+ let Num = num;
+}
+
+// CPU registers
+def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
+def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
+def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+def CP : Ri<12, "cp">, DwarfRegNum<[12]>;
+def DP : Ri<13, "dp">, DwarfRegNum<[13]>;
+def SP : Ri<14, "sp">, DwarfRegNum<[14]>;
+def LR : Ri<15, "lr">, DwarfRegNum<[15]>;
+
+// Register classes.
+//
+def GRRegs : RegisterClass<"XCore", [i32], 32,
+ // Return values and arguments
+ [R0, R1, R2, R3,
+ // Not preserved across procedure calls
+ R11,
+ // Callee save
+ R4, R5, R6, R7, R8, R9, R10]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GRRegsClass::iterator
+ GRRegsClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ GRRegsClass::iterator
+ GRRegsClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return end()-1; // don't allocate R10
+ else
+ return end();
+ }
+ }];
+}
+
+def RRegs : RegisterClass<"XCore", [i32], 32,
+ // Reserved
+ [CP, DP, SP, LR]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ RRegsClass::iterator
+ RRegsClass::allocation_order_begin(const MachineFunction &MF) const {
+ return begin();
+ }
+ RRegsClass::iterator
+ RRegsClass::allocation_order_end(const MachineFunction &MF) const {
+ // No allocatable registers
+ return begin();
+ }
+ }];
+}
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
new file mode 100644
index 0000000..dc53da4
--- /dev/null
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -0,0 +1,28 @@
+//===- XCoreSubtarget.cpp - XCore Subtarget Information -----------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCore specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreSubtarget.h"
+#include "XCore.h"
+#include "XCoreGenSubtarget.inc"
+using namespace llvm;
+
+XCoreSubtarget::XCoreSubtarget(const TargetMachine &TM, const Module &M,
+ const std::string &FS)
+ : IsXS1A(false),
+ IsXS1B(false)
+{
+ std::string CPU = "xs1a-generic";
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h
new file mode 100644
index 0000000..ff6475b
--- /dev/null
+++ b/lib/Target/XCore/XCoreSubtarget.h
@@ -0,0 +1,46 @@
+//=====-- XCoreSubtarget.h - Define Subtarget for the XCore -----*- C++ -*--==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORESUBTARGET_H
+#define XCORESUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+class XCoreSubtarget : public TargetSubtarget {
+ bool IsXS1A;
+ bool IsXS1B;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ ///
+ XCoreSubtarget(const TargetMachine &TM, const Module &M,
+ const std::string &FS);
+
+ bool isXS1A() const { return IsXS1A; }
+ bool isXS1B() const { return IsXS1B; }
+
+  /// ParseSubtargetFeatures - Parses the features string, setting the
+  /// specified subtarget options. The definition of this function is
+  /// auto-generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/XCore/XCoreTargetAsmInfo.cpp b/lib/Target/XCore/XCoreTargetAsmInfo.cpp
new file mode 100644
index 0000000..5513762
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetAsmInfo.cpp
@@ -0,0 +1,201 @@
+//===-- XCoreTargetAsmInfo.cpp - XCore asm properties -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the XCoreTargetAsmInfo properties.
+// We use the small section flag for the CP relative and DP relative
+// flags. If a section is small and writable then it is DP relative. If a
+// section is small and not writable then it is CP relative.
+//
+//===----------------------------------------------------------------------===//
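+// For illustration (a sketch of the directives this policy translates to; the
+// section names below also appear later in this file):
+//   .section .dp.data,"awd",@progbits    # small + writable  -> 'd' (DP)
+//   .section .cp.rodata,"ac",@progbits   # small + read-only -> 'c' (CP)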
+
+#include "XCoreTargetAsmInfo.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace llvm;
+
+XCoreTargetAsmInfo::XCoreTargetAsmInfo(const XCoreTargetMachine &TM)
+ : ELFTargetAsmInfo(TM),
+ Subtarget(TM.getSubtargetImpl()) {
+ TextSection = getUnnamedSection("\t.text", SectionFlags::Code);
+ DataSection = getNamedSection("\t.dp.data", SectionFlags::Writeable |
+ SectionFlags::Small);
+ BSSSection_ = getNamedSection("\t.dp.bss", SectionFlags::Writeable |
+ SectionFlags::BSS | SectionFlags::Small);
+ if (Subtarget->isXS1A()) {
+ ReadOnlySection = getNamedSection("\t.dp.rodata", SectionFlags::None |
+ SectionFlags::Writeable |
+ SectionFlags::Small);
+ } else {
+ ReadOnlySection = getNamedSection("\t.cp.rodata", SectionFlags::None |
+ SectionFlags::Small);
+ }
+ Data16bitsDirective = "\t.short\t";
+ Data32bitsDirective = "\t.long\t";
+ Data64bitsDirective = 0;
+ ZeroDirective = "\t.space\t";
+ CommentString = "#";
+ ConstantPoolSection = "\t.section\t.cp.rodata,\"ac\",@progbits";
+ JumpTableDataSection = "\t.section\t.dp.data,\"awd\",@progbits";
+ PrivateGlobalPrefix = ".L";
+ AscizDirective = ".asciiz";
+ WeakDefDirective = "\t.weak\t";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+
+ // Debug
+ HasLEB128 = true;
+ AbsoluteDebugSectionOffsets = true;
+
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"\",@progbits";
+ DwarfInfoSection = "\t.section\t.debug_info,\"\",@progbits";
+ DwarfLineSection = "\t.section\t.debug_line,\"\",@progbits";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"\",@progbits";
+ DwarfPubNamesSection = "\t.section\t.debug_pubnames,\"\",@progbits";
+ DwarfPubTypesSection = "\t.section\t.debug_pubtypes,\"\",@progbits";
+ DwarfStrSection = "\t.section\t.debug_str,\"\",@progbits";
+ DwarfLocSection = "\t.section\t.debug_loc,\"\",@progbits";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"\",@progbits";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"\",@progbits";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",@progbits";
+}
+
+const Section*
+XCoreTargetAsmInfo::SelectSectionForGlobal(const GlobalValue *GV) const {
+ SectionKind::Kind Kind = SectionKindForGlobal(GV);
+
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ {
+ if (!GVar->isWeakForLinker()) {
+ switch (Kind) {
+ case SectionKind::RODataMergeStr:
+ return MergeableStringSection(GVar);
+ case SectionKind::RODataMergeConst:
+ return getReadOnlySection();
+ case SectionKind::ThreadData:
+ return DataSection;
+ case SectionKind::ThreadBSS:
+ return getBSSSection_();
+ default:
+ break;
+ }
+ }
+ }
+ return ELFTargetAsmInfo::SelectSectionForGlobal(GV);
+}
+
+const Section*
+XCoreTargetAsmInfo::SelectSectionForMachineConst(const Type *Ty) const {
+ return MergeableConstSection(Ty);
+}
+
+const Section*
+XCoreTargetAsmInfo::MergeableConstSection(const GlobalVariable *GV) const {
+ Constant *C = GV->getInitializer();
+ return MergeableConstSection(C->getType());
+}
+
+inline const Section*
+XCoreTargetAsmInfo::MergeableConstSection(const Type *Ty) const {
+ const TargetData *TD = TM.getTargetData();
+
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ if (Size == 4 || Size == 8 || Size == 16) {
+ std::string Name = ".cp.const" + utostr(Size);
+
+ return getNamedSection(Name.c_str(),
+ SectionFlags::setEntitySize(SectionFlags::Mergeable |
+ SectionFlags::Small,
+ Size));
+ }
+
+ return getReadOnlySection();
+}
+
+const Section* XCoreTargetAsmInfo::
+MergeableStringSection(const GlobalVariable *GV) const {
+  // FIXME: insert in the correct mergeable section
+ return getReadOnlySection();
+}
+
+unsigned XCoreTargetAsmInfo::
+SectionFlagsForGlobal(const GlobalValue *GV,
+ const char* Name) const {
+ unsigned Flags = ELFTargetAsmInfo::SectionFlagsForGlobal(GV, Name);
+ // Mask out unsupported flags
+ Flags &= ~(SectionFlags::Small | SectionFlags::TLS);
+
+ // Set CP / DP relative flags
+ if (GV) {
+ SectionKind::Kind Kind = SectionKindForGlobal(GV);
+ switch (Kind) {
+ case SectionKind::ThreadData:
+ case SectionKind::ThreadBSS:
+ case SectionKind::Data:
+ case SectionKind::BSS:
+ case SectionKind::SmallData:
+ case SectionKind::SmallBSS:
+ Flags |= SectionFlags::Small;
+ break;
+ case SectionKind::ROData:
+ case SectionKind::RODataMergeStr:
+ case SectionKind::SmallROData:
+ if (Subtarget->isXS1A()) {
+ Flags |= SectionFlags::Writeable;
+ }
+      Flags |= SectionFlags::Small;
+      break;
+    case SectionKind::RODataMergeConst:
+      Flags |= SectionFlags::Small;
+      break;
+    default:
+ break;
+ }
+ }
+
+ return Flags;
+}
+
+std::string XCoreTargetAsmInfo::
+printSectionFlags(unsigned flags) const {
+ std::string Flags = ",\"";
+
+ if (!(flags & SectionFlags::Debug))
+ Flags += 'a';
+ if (flags & SectionFlags::Code)
+ Flags += 'x';
+ if (flags & SectionFlags::Writeable)
+ Flags += 'w';
+ if (flags & SectionFlags::Mergeable)
+ Flags += 'M';
+ if (flags & SectionFlags::Strings)
+ Flags += 'S';
+ if (flags & SectionFlags::TLS)
+ Flags += 'T';
+ if (flags & SectionFlags::Small) {
+ if (flags & SectionFlags::Writeable)
+ Flags += 'd'; // DP relative
+ else
+ Flags += 'c'; // CP relative
+ }
+
+ Flags += "\",";
+
+ Flags += '@';
+
+ if (flags & SectionFlags::BSS)
+ Flags += "nobits";
+ else
+ Flags += "progbits";
+
+ if (unsigned entitySize = SectionFlags::getEntitySize(flags))
+ Flags += "," + utostr(entitySize);
+
+ return Flags;
+}
diff --git a/lib/Target/XCore/XCoreTargetAsmInfo.h b/lib/Target/XCore/XCoreTargetAsmInfo.h
new file mode 100644
index 0000000..79fd36a
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetAsmInfo.h
@@ -0,0 +1,45 @@
+//=====-- XCoreTargetAsmInfo.h - XCore asm properties ---------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the XCoreTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETASMINFO_H
+#define XCORETARGETASMINFO_H
+
+#include "llvm/Target/ELFTargetAsmInfo.h"
+
+namespace llvm {
+
+ // Forward declarations.
+ class XCoreTargetMachine;
+ class XCoreSubtarget;
+
+ class XCoreTargetAsmInfo : public ELFTargetAsmInfo {
+ private:
+ const XCoreSubtarget *Subtarget;
+ public:
+ explicit XCoreTargetAsmInfo(const XCoreTargetMachine &TM);
+
+ virtual const Section* SelectSectionForGlobal(const GlobalValue *GV) const;
+ virtual std::string printSectionFlags(unsigned flags) const;
+ const Section* MergeableConstSection(const GlobalVariable *GV) const;
+ inline const Section* MergeableConstSection(const Type *Ty) const;
+ const Section* MergeableStringSection(const GlobalVariable *GV) const;
+ virtual const Section*
+ SelectSectionForMachineConst(const Type *Ty) const;
+ virtual unsigned
+ SectionFlagsForGlobal(const GlobalValue *GV = NULL,
+ const char* name = NULL) const;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
new file mode 100644
index 0000000..5437c57
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -0,0 +1,71 @@
+//===-- XCoreTargetMachine.cpp - Define TargetMachine for XCore -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCore specific subclass of TargetMachine.
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetAsmInfo.h"
+#include "XCoreTargetMachine.h"
+#include "XCore.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+/// XCoreTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library. In particular, it seems that it is not possible to get
+/// things to work on Win32 without this. Though it is unused, do not
+/// remove it.
+extern "C" int XCoreTargetMachineModule;
+int XCoreTargetMachineModule = 0;
+
+namespace {
+ // Register the target.
+ RegisterTarget<XCoreTargetMachine> X("xcore", "XCore");
+}
+
+const TargetAsmInfo *XCoreTargetMachine::createTargetAsmInfo() const {
+ return new XCoreTargetAsmInfo(*this);
+}
+
+/// XCoreTargetMachine ctor - Create an ILP32 architecture model
+///
+XCoreTargetMachine::XCoreTargetMachine(const Module &M, const std::string &FS)
+ : Subtarget(*this, M, FS),
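+    // Little-endian ("e"), 32-bit pointers, and 64-bit types aligned to
+    // 32 bits: the ILP32 model noted in the comment above.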
+ DataLayout("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
+ "i16:16:32-i32:32:32-i64:32:32"),
+ InstrInfo(),
+ FrameInfo(*this),
+ TLInfo(*this) {
+}
+
+unsigned XCoreTargetMachine::getModuleMatchQuality(const Module &M) {
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 6 && std::string(TT.begin(), TT.begin()+6) == "xcore-")
+ return 20;
+
+ // Otherwise we don't match.
+ return 0;
+}
+
+bool XCoreTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createXCoreISelDag(*this));
+ return false;
+}
+
+bool XCoreTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ // Output assembly language.
+ PM.add(createXCoreCodePrinterPass(Out, *this, OptLevel, Verbose));
+ return false;
+}
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
new file mode 100644
index 0000000..2385aed
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -0,0 +1,63 @@
+//===-- XCoreTargetMachine.h - Define TargetMachine for XCore ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETMACHINE_H
+#define XCORETARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "XCoreFrameInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreISelLowering.h"
+
+namespace llvm {
+
+class Module;
+
+class XCoreTargetMachine : public LLVMTargetMachine {
+ XCoreSubtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ XCoreInstrInfo InstrInfo;
+ XCoreFrameInfo FrameInfo;
+ XCoreTargetLowering TLInfo;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+ XCoreTargetMachine(const Module &M, const std::string &FS);
+
+ virtual const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const XCoreFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual const XCoreSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ virtual XCoreTargetLowering *getTargetLowering() const {
+ return const_cast<XCoreTargetLowering*>(&TLInfo);
+ }
+
+ virtual const TargetRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ static unsigned getModuleMatchQuality(const Module &M);
+
+ // Pass Pipeline Configuration
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Transforms/Hello/CMakeLists.txt b/lib/Transforms/Hello/CMakeLists.txt
new file mode 100644
index 0000000..b80d15b
--- /dev/null
+++ b/lib/Transforms/Hello/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library( LLVMHello
+ Hello.cpp
+ )
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp
new file mode 100644
index 0000000..d07f613
--- /dev/null
+++ b/lib/Transforms/Hello/Hello.cpp
@@ -0,0 +1,67 @@
+//===- Hello.cpp - Example code from "Writing an LLVM Pass" ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements two versions of the LLVM "Hello World" pass described
+// in docs/WritingAnLLVMPass.html
+//
+//===----------------------------------------------------------------------===//
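+// Usage (a sketch; the library path depends on your build configuration):
+//   opt -load path/to/LLVMHello.so -hello < input.bc > /dev/null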
+
+#define DEBUG_TYPE "hello"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(HelloCounter, "Counts number of functions greeted");
+
+namespace {
+ // Hello - The first implementation, without getAnalysisUsage.
+ struct Hello : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ Hello() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F) {
+ HelloCounter++;
+ std::string fname = F.getName();
+ EscapeString(fname);
+ cerr << "Hello: " << fname << "\n";
+ return false;
+ }
+ };
+}
+
+char Hello::ID = 0;
+static RegisterPass<Hello> X("hello", "Hello World Pass");
+
+namespace {
+ // Hello2 - The second implementation with getAnalysisUsage implemented.
+ struct Hello2 : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ Hello2() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F) {
+ HelloCounter++;
+ std::string fname = F.getName();
+ EscapeString(fname);
+ cerr << "Hello: " << fname << "\n";
+ return false;
+ }
+
+ // We don't modify the program, so we preserve all analyses
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+    }
+ };
+}
+
+char Hello2::ID = 0;
+static RegisterPass<Hello2>
+Y("hello2", "Hello World Pass (with getAnalysisUsage implemented)");
diff --git a/lib/Transforms/Hello/Makefile b/lib/Transforms/Hello/Makefile
new file mode 100644
index 0000000..c5e75d4
--- /dev/null
+++ b/lib/Transforms/Hello/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Transforms/Hello/Makefile -----------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMHello
+LOADABLE_MODULE = 1
+USEDLIBS =
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
new file mode 100644
index 0000000..2bb6428
--- /dev/null
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -0,0 +1,863 @@
+//===-- ArgumentPromotion.cpp - Promote by-reference arguments ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass promotes "by reference" arguments to be "by value" arguments. In
+// practice, this means looking for internal functions that have pointer
+// arguments. If it can prove, through the use of alias analysis, that an
+// argument is *only* loaded, then it can pass the value into the function
+// instead of the address of the value. This can cause recursive simplification
+// of code and lead to the elimination of allocas (especially in C++ template
+// code like the STL).
+//
+// This pass also handles aggregate arguments that are passed into a function,
+// scalarizing them if the elements of the aggregate are only loaded. Note that
+// by default it refuses to scalarize aggregates which would require passing in
+// more than three operands to the function, because passing thousands of
+// operands for a large array or structure is unprofitable! This limit can be
+// configured or disabled, however.
+//
+// Note that this transformation could also be done for arguments that are only
+// stored to (returning the value instead), but does not currently. This case
+// would be best handled when and if LLVM begins supporting multiple return
+// values from functions.
+//
+//===----------------------------------------------------------------------===//
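+// For example (an illustrative sketch, with made-up names), an internal
+// function such as
+//   define internal i32 @f(i32* %p) {      ; before: by reference
+//     %v = load i32* %p
+//     ret i32 %v
+//   }
+// is rewritten so each caller performs the load and passes the value:
+//   define internal i32 @f(i32 %p.val) {   ; after: by value
+//     ret i32 %p.val
+//   }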
+
+#define DEBUG_TYPE "argpromotion"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted");
+STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
+STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted");
+STATISTIC(NumArgumentsDead , "Number of dead pointer args eliminated");
+
+namespace {
+ /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
+ ///
+ struct VISIBILITY_HIDDEN ArgPromotion : public CallGraphSCCPass {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<TargetData>();
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
+ static char ID; // Pass identification, replacement for typeid
+ explicit ArgPromotion(unsigned maxElements = 3)
+ : CallGraphSCCPass(&ID), maxElements(maxElements) {}
+
+ /// A vector used to hold the indices of a single GEP instruction
+ typedef std::vector<uint64_t> IndicesVector;
+
+ private:
+ bool PromoteArguments(CallGraphNode *CGN);
+ bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const;
+ Function *DoPromotion(Function *F,
+ SmallPtrSet<Argument*, 8> &ArgsToPromote,
+ SmallPtrSet<Argument*, 8> &ByValArgsToTransform);
+ /// The maximum number of elements to expand, or 0 for unlimited.
+ unsigned maxElements;
+ };
+}
+
+char ArgPromotion::ID = 0;
+static RegisterPass<ArgPromotion>
+X("argpromotion", "Promote 'by reference' arguments to scalars");
+
+Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
+ return new ArgPromotion(maxElements);
+}
+
+bool ArgPromotion::runOnSCC(const std::vector<CallGraphNode *> &SCC) {
+ bool Changed = false, LocalChange;
+
+ do { // Iterate until we stop promoting from this SCC.
+ LocalChange = false;
+ // Attempt to promote arguments from all functions in this SCC.
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ LocalChange |= PromoteArguments(SCC[i]);
+ Changed |= LocalChange; // Remember that we changed something.
+ } while (LocalChange);
+
+ return Changed;
+}
+
+/// PromoteArguments - This method checks the specified function to see if there
+/// are any promotable arguments and if it is safe to promote the function (for
+/// example, all callers are direct). If safe to promote some arguments, it
+/// calls the DoPromotion method.
+///
+bool ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
+ Function *F = CGN->getFunction();
+
+ // Make sure that it is local to this module.
+ if (!F || !F->hasLocalLinkage()) return false;
+
+ // First check: see if there are any pointer arguments! If not, quick exit.
+ SmallVector<std::pair<Argument*, unsigned>, 16> PointerArgs;
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I, ++ArgNo)
+ if (isa<PointerType>(I->getType()))
+ PointerArgs.push_back(std::pair<Argument*, unsigned>(I, ArgNo));
+ if (PointerArgs.empty()) return false;
+
+ // Second check: make sure that all callers are direct callers. We can't
+ // transform functions that have indirect callers.
+ for (Value::use_iterator UI = F->use_begin(), E = F->use_end();
+ UI != E; ++UI) {
+ CallSite CS = CallSite::get(*UI);
+ if (!CS.getInstruction()) // "Taking the address" of the function
+ return false;
+
+ // Ensure that this call site is CALLING the function, not passing it as
+ // an argument.
+ if (!CS.isCallee(UI))
+ return false;
+ }
+
+ // Check to see which arguments are promotable. If an argument is promotable,
+ // add it to ArgsToPromote.
+ SmallPtrSet<Argument*, 8> ArgsToPromote;
+ SmallPtrSet<Argument*, 8> ByValArgsToTransform;
+ for (unsigned i = 0; i != PointerArgs.size(); ++i) {
+ bool isByVal = F->paramHasAttr(PointerArgs[i].second+1, Attribute::ByVal);
+
+ // If this is a byval argument, and if the aggregate type is small, just
+ // pass the elements, which is always safe.
+ Argument *PtrArg = PointerArgs[i].first;
+ if (isByVal) {
+ const Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+ if (const StructType *STy = dyn_cast<StructType>(AgTy)) {
+ if (maxElements > 0 && STy->getNumElements() > maxElements) {
+ DOUT << "argpromotion disable promoting argument '"
+ << PtrArg->getName() << "' because it would require adding more "
+ << "than " << maxElements << " arguments to the function.\n";
+ } else {
+ // If all the elements are single-value types, we can promote it.
+ bool AllSimple = true;
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ if (!STy->getElementType(i)->isSingleValueType()) {
+ AllSimple = false;
+ break;
+ }
+
+ // Safe to transform, don't even bother trying to "promote" it.
+          // Passing the elements as scalars will allow scalarrepl to hack on
+ // the new alloca we introduce.
+ if (AllSimple) {
+ ByValArgsToTransform.insert(PtrArg);
+ continue;
+ }
+ }
+ }
+ }
+
+ // Otherwise, see if we can promote the pointer to its value.
+ if (isSafeToPromoteArgument(PtrArg, isByVal))
+ ArgsToPromote.insert(PtrArg);
+ }
+
+ // No promotable pointer arguments.
+ if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) return false;
+
+ Function *NewF = DoPromotion(F, ArgsToPromote, ByValArgsToTransform);
+
+ // Update the call graph to know that the function has been transformed.
+ getAnalysis<CallGraph>().changeFunction(F, NewF);
+ return true;
+}
+
+/// IsAlwaysValidPointer - Return true if the specified pointer is always legal
+/// to load.
+static bool IsAlwaysValidPointer(Value *V) {
+ if (isa<AllocaInst>(V) || isa<GlobalVariable>(V)) return true;
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V))
+ return IsAlwaysValidPointer(GEP->getOperand(0));
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::GetElementPtr)
+ return IsAlwaysValidPointer(CE->getOperand(0));
+
+ return false;
+}
+
+/// AllCalleesPassInValidPointerForArgument - Return true if we can prove that
+/// all callees pass in a valid pointer for the specified function argument.
+static bool AllCalleesPassInValidPointerForArgument(Argument *Arg) {
+ Function *Callee = Arg->getParent();
+
+ unsigned ArgNo = std::distance(Callee->arg_begin(),
+ Function::arg_iterator(Arg));
+
+  // Look at all call sites of the function. At this point we know we only
+  // have direct callers.
+ for (Value::use_iterator UI = Callee->use_begin(), E = Callee->use_end();
+ UI != E; ++UI) {
+ CallSite CS = CallSite::get(*UI);
+ assert(CS.getInstruction() && "Should only have direct calls!");
+
+ if (!IsAlwaysValidPointer(CS.getArgument(ArgNo)))
+ return false;
+ }
+ return true;
+}
+
+/// Returns true if Prefix is a prefix of Longer. That is, Longer has a size
+/// that is greater than or equal to the size of Prefix, and each element of
+/// Prefix equals the corresponding element of Longer.
+///
+/// This means it also returns true when Prefix and Longer are equal!
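+///
+/// For example, IsPrefix({1,2}, {1,2,3}) and IsPrefix({1,2}, {1,2}) return
+/// true, while IsPrefix({1,3}, {1,2,3}) returns false.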
+static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix,
+ const ArgPromotion::IndicesVector &Longer) {
+ if (Prefix.size() > Longer.size())
+ return false;
+ for (unsigned i = 0, e = Prefix.size(); i != e; ++i)
+ if (Prefix[i] != Longer[i])
+ return false;
+ return true;
+}
+
+
+/// Checks if Indices, or a prefix of Indices, is in Set.
+static bool PrefixIn(const ArgPromotion::IndicesVector &Indices,
+ std::set<ArgPromotion::IndicesVector> &Set) {
+ std::set<ArgPromotion::IndicesVector>::iterator Low;
+ Low = Set.upper_bound(Indices);
+ if (Low != Set.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to Indices. This means
+ // it points to a prefix of Indices (possibly Indices itself), if such
+ // prefix exists.
+ //
+ // This load is safe if any prefix of its operands is safe to load.
+ return Low != Set.end() && IsPrefix(*Low, Indices);
+}
+
+/// Mark the given indices (ToMark) as safe in the given set of indices
+/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
+/// is already a prefix of ToMark in Safe, ToMark is implicitly marked safe
+/// already. Furthermore, any indices that ToMark is itself a prefix of are
+/// removed from Safe (since they are implicitly safe because of ToMark now).
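+///
+/// For example (illustrative): with Safe = {{1,2}, {1,3}} and ToMark = {1},
+/// the call leaves Safe = {{1}}, since {1,2} and {1,3} are now implied.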
+static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,
+ std::set<ArgPromotion::IndicesVector> &Safe) {
+ std::set<ArgPromotion::IndicesVector>::iterator Low;
+ Low = Safe.upper_bound(ToMark);
+ // Guard against the case where Safe is empty
+ if (Low != Safe.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to Indices. This
+ // means it points to a prefix of Indices (possibly Indices itself), if
+ // such prefix exists.
+ if (Low != Safe.end()) {
+ if (IsPrefix(*Low, ToMark))
+      // If there is already a prefix of these indices (or exactly these
+      // indices) marked as safe, don't bother adding these indices.
+ return;
+
+  // Increment Low, so we can use it as an "insert before" hint.
+ ++Low;
+ }
+ // Insert
+ Low = Safe.insert(Low, ToMark);
+ ++Low;
+  // If ToMark is a prefix of longer index list(s), remove them.
+ std::set<ArgPromotion::IndicesVector>::iterator End = Safe.end();
+ while (Low != End && IsPrefix(ToMark, *Low)) {
+ std::set<ArgPromotion::IndicesVector>::iterator Remove = Low;
+ ++Low;
+ Safe.erase(Remove);
+ }
+}
+
+/// isSafeToPromoteArgument - As you might guess from the name of this method,
+/// it checks to see if it is both safe and useful to promote the argument.
+/// This method limits promotion of aggregates to only promote up to three
+/// elements of the aggregate in order to avoid exploding the number of
+/// arguments passed in.
+bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
+ typedef std::set<IndicesVector> GEPIndicesSet;
+
+ // Quick exit for unused arguments
+ if (Arg->use_empty())
+ return true;
+
+ // We can only promote this argument if all of the uses are loads, or are GEP
+ // instructions (with constant indices) that are subsequently loaded.
+ //
+ // Promoting the argument causes it to be loaded in the caller
+ // unconditionally. This is only safe if we can prove that either the load
+  // would have happened in the callee anyway (i.e., there is a load in the
+  // entry block) or the pointer passed in at every call site is guaranteed to
+  // be valid.
+  //
+  // In the former case, invalid loads can happen, but would have happened
+  // anyway; in the latter case, invalid loads won't happen. This prevents us
+ // from introducing an invalid load that wouldn't have happened in the
+ // original code.
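+  // (Illustrative: in a callee like "if (c) use(*p);" the load of p is
+  // guarded, so hoisting it into every caller is only sound under one of the
+  // two conditions above.)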
+ //
+ // This set will contain all sets of indices that are loaded in the entry
+ // block, and thus are safe to unconditionally load in the caller.
+ GEPIndicesSet SafeToUnconditionallyLoad;
+
+ // This set contains all the sets of indices that we are planning to promote.
+ // This makes it possible to limit the number of arguments added.
+ GEPIndicesSet ToPromote;
+
+ // If the pointer is always valid, any load with first index 0 is valid.
+ if(isByVal || AllCalleesPassInValidPointerForArgument(Arg))
+ SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+
+ // First, iterate the entry block and mark loads of (geps of) arguments as
+ // safe.
+ BasicBlock *EntryBlock = Arg->getParent()->begin();
+ // Declare this here so we can reuse it
+ IndicesVector Indices;
+ for (BasicBlock::iterator I = EntryBlock->begin(), E = EntryBlock->end();
+ I != E; ++I)
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Value *V = LI->getPointerOperand();
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+ V = GEP->getPointerOperand();
+ if (V == Arg) {
+ // This load actually loads (part of) Arg? Check the indices then.
+ Indices.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
+ Indices.push_back(CI->getSExtValue());
+ else
+ // We found a non-constant GEP index for this argument? Bail out
+ // right away, can't promote this argument at all.
+ return false;
+
+ // Indices checked out, mark them as safe
+ MarkIndicesSafe(Indices, SafeToUnconditionallyLoad);
+ Indices.clear();
+ }
+ } else if (V == Arg) {
+ // Direct loads are equivalent to a GEP with a single 0 index.
+ MarkIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+ }
+ }
+
+ // Now, iterate all uses of the argument to see if there are any uses that are
+ // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
+ SmallVector<LoadInst*, 16> Loads;
+ IndicesVector Operands;
+ for (Value::use_iterator UI = Arg->use_begin(), E = Arg->use_end();
+ UI != E; ++UI) {
+ Operands.clear();
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ if (LI->isVolatile()) return false; // Don't hack volatile loads
+ Loads.push_back(LI);
+ // Direct loads are equivalent to a GEP with a zero index and then a load.
+ Operands.push_back(0);
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) {
+ if (GEP->use_empty()) {
+ // Dead GEP's cause trouble later. Just remove them if we run into
+ // them.
+ getAnalysis<AliasAnalysis>().deleteValue(GEP);
+ GEP->eraseFromParent();
+        // TODO: This runs the above loop over and over again for dead GEPs.
+        // Couldn't we just increment the UI iterator earlier and erase the
+        // use?
+ return isSafeToPromoteArgument(Arg, isByVal);
+ }
+
+ // Ensure that all of the indices are constants.
+ for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end();
+ i != e; ++i)
+ if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
+ Operands.push_back(C->getSExtValue());
+ else
+ return false; // Not a constant operand GEP!
+
+ // Ensure that the only users of the GEP are load instructions.
+ for (Value::use_iterator UI = GEP->use_begin(), E = GEP->use_end();
+ UI != E; ++UI)
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ if (LI->isVolatile()) return false; // Don't hack volatile loads
+ Loads.push_back(LI);
+ } else {
+ // Other uses than load?
+ return false;
+ }
+ } else {
+ return false; // Not a load or a GEP.
+ }
+
+ // Now, see if it is safe to promote this load / loads of this GEP. Loading
+ // is safe if Operands, or a prefix of Operands, is marked as safe.
+ if (!PrefixIn(Operands, SafeToUnconditionallyLoad))
+ return false;
+
+ // See if we are already promoting a load with these indices. If not, check
+ // to make sure that we aren't promoting too many elements. If so, nothing
+ // to do.
+ if (ToPromote.find(Operands) == ToPromote.end()) {
+ if (maxElements > 0 && ToPromote.size() == maxElements) {
+ DOUT << "argpromotion not promoting argument '"
+ << Arg->getName() << "' because it would require adding more "
+ << "than " << maxElements << " arguments to the function.\n";
+ // We limit aggregate promotion to only promoting up to a fixed number
+ // of elements of the aggregate.
+ return false;
+ }
+ ToPromote.insert(Operands);
+ }
+ }
+
+ if (Loads.empty()) return true; // No users, this is a dead argument.
+
+ // Okay, now we know that the argument is only used by load instructions and
+ // it is safe to unconditionally perform all of them. Use alias analysis to
+ // check to see if the pointer is guaranteed to not be modified from entry of
+ // the function to each of the load instructions.
+
+ // Because there could be several/many load instructions, remember which
+ // blocks we know to be transparent to the load.
+ SmallPtrSet<BasicBlock*, 16> TranspBlocks;
+
+ AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+ TargetData &TD = getAnalysis<TargetData>();
+
+ for (unsigned i = 0, e = Loads.size(); i != e; ++i) {
+ // Check to see if the load is invalidated from the start of the block to
+ // the load itself.
+ LoadInst *Load = Loads[i];
+ BasicBlock *BB = Load->getParent();
+
+ const PointerType *LoadTy =
+ cast<PointerType>(Load->getPointerOperand()->getType());
+ unsigned LoadSize = (unsigned)TD.getTypeStoreSize(LoadTy->getElementType());
+
+ if (AA.canInstructionRangeModify(BB->front(), *Load, Arg, LoadSize))
+ return false; // Pointer is invalidated!
+
+ // Now check every path from the entry block to the load for transparency.
+ // To do this, we perform a depth first search on the inverse CFG from the
+ // loading block.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ for (idf_ext_iterator<BasicBlock*, SmallPtrSet<BasicBlock*, 16> >
+ I = idf_ext_begin(*PI, TranspBlocks),
+ E = idf_ext_end(*PI, TranspBlocks); I != E; ++I)
+ if (AA.canBasicBlockModify(**I, Arg, LoadSize))
+ return false;
+ }
+
+ // If the path from the entry of the function to each load is free of
+ // instructions that potentially invalidate the load, we can make the
+ // transformation!
+ return true;
+}
+
+/// DoPromotion - This method actually performs the promotion of the specified
+/// arguments, and returns the new function. At this point, we know that it's
+/// safe to do so.
+Function *ArgPromotion::DoPromotion(Function *F,
+ SmallPtrSet<Argument*, 8> &ArgsToPromote,
+ SmallPtrSet<Argument*, 8> &ByValArgsToTransform) {
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has modified arguments.
+ const FunctionType *FTy = F->getFunctionType();
+ std::vector<const Type*> Params;
+
+ typedef std::set<IndicesVector> ScalarizeTable;
+
+ // ScalarizedElements - If we are promoting a pointer that has elements
+ // accessed out of it, keep track of which elements are accessed so that we
+ // can add one argument for each.
+ //
+  // Arguments that are directly loaded are represented here by an empty
+  // (zero-element) indices vector, to handle cases where there are both a
+  // direct load and GEP accesses.
+ //
+ std::map<Argument*, ScalarizeTable> ScalarizedElements;
+
+ // OriginalLoads - Keep track of a representative load instruction from the
+ // original function so that we can tell the alias analysis implementation
+ // what the new GEP/Load instructions we are inserting look like.
+ std::map<IndicesVector, LoadInst*> OriginalLoads;
+
+ // Attributes - Keep track of the parameter attributes for the arguments
+ // that we are *not* promoting. For the ones that we do promote, the parameter
+  // attributes are lost.
+ SmallVector<AttributeWithIndex, 8> AttributesVec;
+ const AttrListPtr &PAL = F->getAttributes();
+
+ // Add any return attributes.
+ if (Attributes attrs = PAL.getRetAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+
+ // First, determine the new argument list
+ unsigned ArgIndex = 1;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgIndex) {
+ if (ByValArgsToTransform.count(I)) {
+ // Simple byval argument? Just add all the struct element types.
+ const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ const StructType *STy = cast<StructType>(AgTy);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ Params.push_back(STy->getElementType(i));
+ ++NumByValArgsPromoted;
+ } else if (!ArgsToPromote.count(I)) {
+ // Unchanged argument
+ Params.push_back(I->getType());
+ if (Attributes attrs = PAL.getParamAttributes(ArgIndex))
+ AttributesVec.push_back(AttributeWithIndex::get(Params.size(), attrs));
+ } else if (I->use_empty()) {
+ // Dead argument (which are always marked as promotable)
+ ++NumArgumentsDead;
+ } else {
+ // Okay, this is being promoted. This means that the only uses are loads
+ // or GEPs which are only used by loads
+
+ // In this table, we will track which indices are loaded from the argument
+ // (where direct loads are tracked as no indices).
+ ScalarizeTable &ArgIndices = ScalarizedElements[I];
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+ ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ assert(isa<LoadInst>(User) || isa<GetElementPtrInst>(User));
+ IndicesVector Indices;
+ Indices.reserve(User->getNumOperands() - 1);
+ // Since loads will only have a single operand, and GEPs only a single
+ // non-index operand, this will record direct loads without any indices,
+ // and gep+loads with the GEP indices.
+ for (User::op_iterator II = User->op_begin() + 1, IE = User->op_end();
+ II != IE; ++II)
+ Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Indices.size() == 1 && Indices.front() == 0)
+ Indices.clear();
+ ArgIndices.insert(Indices);
+ LoadInst *OrigLoad;
+ if (LoadInst *L = dyn_cast<LoadInst>(User))
+ OrigLoad = L;
+ else
+ // Take any load, we will use it only to update Alias Analysis
+ OrigLoad = cast<LoadInst>(User->use_back());
+ OriginalLoads[Indices] = OrigLoad;
+ }
+
+ // Add a parameter to the function for each element passed in.
+ for (ScalarizeTable::iterator SI = ArgIndices.begin(),
+ E = ArgIndices.end(); SI != E; ++SI) {
+ // not allowed to dereference ->begin() if size() is 0
+ Params.push_back(GetElementPtrInst::getIndexedType(I->getType(),
+ SI->begin(),
+ SI->end()));
+ assert(Params.back());
+ }
+
+ if (ArgIndices.size() == 1 && ArgIndices.begin()->empty())
+ ++NumArgumentsPromoted;
+ else
+ ++NumAggregatesPromoted;
+ }
+ }
+
+ // Add any function attributes.
+ if (Attributes attrs = PAL.getFnAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+ const Type *RetTy = FTy->getReturnType();
+
+ // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which
+ // have zero fixed arguments.
+ bool ExtraArgHack = false;
+ if (Params.empty() && FTy->isVarArg()) {
+ ExtraArgHack = true;
+ Params.push_back(Type::Int32Ty);
+ }
+
+ // Construct the new function type using the new arguments.
+ FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
+ NF->copyAttributesFrom(F);
+
+ // Recompute the parameter attributes list based on the new arguments for
+ // the function.
+ NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()));
+ AttributesVec.clear();
+
+ F->getParent()->getFunctionList().insert(F, NF);
+ NF->takeName(F);
+
+ // Get the alias analysis information that we need to update to reflect our
+ // changes.
+ AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+ // Get the callgraph information that we need to update to reflect our
+ // changes.
+ CallGraph &CG = getAnalysis<CallGraph>();
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in the loaded pointers.
+ //
+ SmallVector<Value*, 16> Args;
+ while (!F->use_empty()) {
+ CallSite CS = CallSite::get(F->use_back());
+ Instruction *Call = CS.getInstruction();
+ const AttrListPtr &CallPAL = CS.getAttributes();
+
+ // Add any return attributes.
+ if (Attributes attrs = CallPAL.getRetAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+
+ // Loop over the operands, inserting GEP and loads in the caller as
+ // appropriate.
+ CallSite::arg_iterator AI = CS.arg_begin();
+ ArgIndex = 1;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I, ++AI, ++ArgIndex)
+ if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
+ Args.push_back(*AI); // Unmodified argument
+
+ if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex))
+ AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+
+ } else if (ByValArgsToTransform.count(I)) {
+ // Emit a GEP and load for each element of the struct.
+ const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ const StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = { ConstantInt::get(Type::Int32Ty, 0), 0 };
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::Int32Ty, i);
+ Value *Idx = GetElementPtrInst::Create(*AI, Idxs, Idxs+2,
+ (*AI)->getName()+"."+utostr(i),
+ Call);
+ // TODO: Tell AA about the new values?
+ Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call));
+ }
+ } else if (!I->use_empty()) {
+ // Non-dead argument: insert GEPs and loads as appropriate.
+ ScalarizeTable &ArgIndices = ScalarizedElements[I];
+ // Store the Value* version of the indices in here, but declare it now
+ // for reuse
+ std::vector<Value*> Ops;
+ for (ScalarizeTable::iterator SI = ArgIndices.begin(),
+ E = ArgIndices.end(); SI != E; ++SI) {
+ Value *V = *AI;
+ LoadInst *OrigLoad = OriginalLoads[*SI];
+ if (!SI->empty()) {
+ Ops.reserve(SI->size());
+ const Type *ElTy = V->getType();
+ for (IndicesVector::const_iterator II = SI->begin(),
+ IE = SI->end(); II != IE; ++II) {
+ // Use i32 to index structs, and i64 for others (pointers/arrays).
+ // This satisfies GEP constraints.
+ const Type *IdxTy = (isa<StructType>(ElTy) ? Type::Int32Ty : Type::Int64Ty);
+ Ops.push_back(ConstantInt::get(IdxTy, *II));
+ // Keep track of the type we're currently indexing
+ ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II);
+ }
+ // And create a GEP to extract those indices
+ V = GetElementPtrInst::Create(V, Ops.begin(), Ops.end(),
+ V->getName()+".idx", Call);
+ Ops.clear();
+ AA.copyValue(OrigLoad->getOperand(0), V);
+ }
+ Args.push_back(new LoadInst(V, V->getName()+".val", Call));
+ AA.copyValue(OrigLoad, Args.back());
+ }
+ }
+
+ if (ExtraArgHack)
+ Args.push_back(Constant::getNullValue(Type::Int32Ty));
+
+ // Push any varargs arguments on the list
+ for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
+ Args.push_back(*AI);
+ if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex))
+ AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+ }
+
+ // Add any function attributes.
+ if (Attributes attrs = CallPAL.getFnAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+ Instruction *New;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args.begin(), Args.end(), "", Call);
+ cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+ cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(),
+ AttributesVec.end()));
+ } else {
+ New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+ cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+ cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(),
+ AttributesVec.end()));
+ if (cast<CallInst>(Call)->isTailCall())
+ cast<CallInst>(New)->setTailCall();
+ }
+ Args.clear();
+ AttributesVec.clear();
+
+ // Update the alias analysis implementation to know that we are replacing
+ // the old call with a new one.
+ AA.replaceWithNewValue(Call, New);
+
+ // Update the callgraph to know that the callsite has been transformed.
+ CG[Call->getParent()->getParent()]->replaceCallSite(Call, New);
+
+ if (!Call->use_empty()) {
+ Call->replaceAllUsesWith(New);
+ New->takeName(Call);
+ }
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ Call->eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Loop over the argument list, transferring uses of the old arguments over
+  // to the new arguments, also transferring over the names as well.
+ //
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin(); I != E; ++I) {
+ if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
+ // If this is an unmodified argument, move the name and users over to the
+ // new version.
+ I->replaceAllUsesWith(I2);
+ I2->takeName(I);
+ AA.replaceWithNewValue(I, I2);
+ ++I2;
+ continue;
+ }
+
+ if (ByValArgsToTransform.count(I)) {
+ // In the callee, we create an alloca, and store each of the new incoming
+ // arguments into the alloca.
+ Instruction *InsertPt = NF->begin()->begin();
+
+ // Just add all the struct element types.
+ const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ Value *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt);
+ const StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = { ConstantInt::get(Type::Int32Ty, 0), 0 };
+
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::Int32Ty, i);
+ std::string Name = TheAlloca->getName()+"."+utostr(i);
+ Value *Idx = GetElementPtrInst::Create(TheAlloca, Idxs, Idxs+2,
+ Name, InsertPt);
+ I2->setName(I->getName()+"."+utostr(i));
+ new StoreInst(I2++, Idx, InsertPt);
+ }
+
+ // Anything that used the arg should now use the alloca.
+ I->replaceAllUsesWith(TheAlloca);
+ TheAlloca->takeName(I);
+ AA.replaceWithNewValue(I, TheAlloca);
+ continue;
+ }
+
+ if (I->use_empty()) {
+ AA.deleteValue(I);
+ continue;
+ }
+
+ // Otherwise, if we promoted this argument, then all users are load
+ // instructions (or GEPs with only load users), and all loads should be
+ // using the new argument that we added.
+ ScalarizeTable &ArgIndices = ScalarizedElements[I];
+
+ while (!I->use_empty()) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->use_back())) {
+ assert(ArgIndices.begin()->empty() &&
+ "Load element should sort to front!");
+ I2->setName(I->getName()+".val");
+ LI->replaceAllUsesWith(I2);
+ AA.replaceWithNewValue(LI, I2);
+ LI->eraseFromParent();
+ DOUT << "*** Promoted load of argument '" << I->getName()
+ << "' in function '" << F->getName() << "'\n";
+ } else {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->use_back());
+ IndicesVector Operands;
+ Operands.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Operands.size() == 1 && Operands.front() == 0)
+ Operands.clear();
+
+ Function::arg_iterator TheArg = I2;
+ for (ScalarizeTable::iterator It = ArgIndices.begin();
+ *It != Operands; ++It, ++TheArg) {
+ assert(It != ArgIndices.end() && "GEP not handled??");
+ }
+
+ std::string NewName = I->getName();
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ NewName += "." + utostr(Operands[i]);
+ }
+ NewName += ".val";
+ TheArg->setName(NewName);
+
+ DOUT << "*** Promoted agg argument '" << TheArg->getName()
+ << "' of function '" << NF->getName() << "'\n";
+
+ // All of the uses must be load instructions. Replace them all with
+ // the argument specified by ArgNo.
+ while (!GEP->use_empty()) {
+ LoadInst *L = cast<LoadInst>(GEP->use_back());
+ L->replaceAllUsesWith(TheArg);
+ AA.replaceWithNewValue(L, TheArg);
+ L->eraseFromParent();
+ }
+ AA.deleteValue(GEP);
+ GEP->eraseFromParent();
+ }
+ }
+
+ // Increment I2 past all of the arguments added for this promoted pointer.
+ for (unsigned i = 0, e = ArgIndices.size(); i != e; ++i)
+ ++I2;
+ }
+
+ // Notify the alias analysis implementation that we inserted a new argument.
+ if (ExtraArgHack)
+ AA.copyValue(Constant::getNullValue(Type::Int32Ty), NF->arg_begin());
+
+ // Tell the alias analysis that the old function is about to disappear.
+ AA.replaceWithNewValue(F, NF);
+
+ // Now that the old function is dead, delete it.
+ F->eraseFromParent();
+ return NF;
+}
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
new file mode 100644
index 0000000..4b85e13
--- /dev/null
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -0,0 +1,25 @@
+add_llvm_library(LLVMipo
+ FunctionAttrs.cpp
+ ArgumentPromotion.cpp
+ ConstantMerge.cpp
+ DeadArgumentElimination.cpp
+ DeadTypeElimination.cpp
+ ExtractGV.cpp
+ GlobalDCE.cpp
+ GlobalOpt.cpp
+ IndMemRemoval.cpp
+ InlineAlways.cpp
+ Inliner.cpp
+ InlineSimple.cpp
+ Internalize.cpp
+ IPConstantPropagation.cpp
+ LoopExtractor.cpp
+ LowerSetJmp.cpp
+ MergeFunctions.cpp
+ PartialSpecialization.cpp
+ PruneEH.cpp
+ RaiseAllocations.cpp
+ StripDeadPrototypes.cpp
+ StripSymbols.cpp
+ StructRetPromotion.cpp
+ )
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
new file mode 100644
index 0000000..237e6db
--- /dev/null
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -0,0 +1,114 @@
+//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface to a pass that merges duplicate global
+// constants together into a single constant that is shared. This is useful
+// because some passes (e.g. TraceValues) insert a lot of string constants into
+// the program, regardless of whether or not an existing string is available.
+//
+// Algorithm: ConstantMerge builds up a map of available constants and
+// eliminates duplicates as it iterates over the module.
+//
+//===----------------------------------------------------------------------===//
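+// For example (an illustrative sketch):
+//   @s1 = internal constant [4 x i8] c"abc\00"
+//   @s2 = internal constant [4 x i8] c"abc\00"
+// has all uses of @s2 redirected to @s1, after which @s2 is deleted; this is
+// legal because the initializers match and @s2 has local linkage.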
+
+#define DEBUG_TYPE "constmerge"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumMerged, "Number of global constants merged");
+
+namespace {
+ struct VISIBILITY_HIDDEN ConstantMerge : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ ConstantMerge() : ModulePass(&ID) {}
+
+ // run - For this pass, process all of the globals in the module,
+ // eliminating duplicate constants.
+ //
+ bool runOnModule(Module &M);
+ };
+}
+
+char ConstantMerge::ID = 0;
+static RegisterPass<ConstantMerge>
+X("constmerge", "Merge Duplicate Global Constants");
+
+ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); }
+
+bool ConstantMerge::runOnModule(Module &M) {
+ // Map unique constant/section pairs to globals. We don't want to merge
+ // globals in different sections.
+ std::map<std::pair<Constant*, std::string>, GlobalVariable*> CMap;
+
+ // Replacements - This vector contains a list of replacements to perform.
+ std::vector<std::pair<GlobalVariable*, GlobalVariable*> > Replacements;
+
+ bool MadeChange = false;
+
+ // Iterate constant merging while we are still making progress. Merging two
+ // constants together may allow us to merge other constants together if the
+ // second level constants have initializers which point to the globals that
+ // were just merged.
+ while (1) {
+ // First pass: identify all globals that can be merged together, filling in
+ // the Replacements vector. We cannot do the replacement in this pass
+ // because doing so may cause initializers of other globals to be rewritten,
+ // invalidating the Constant* pointers in CMap.
+ //
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = GVI++;
+
+ // If this GV is dead, remove it.
+ GV->removeDeadConstantUsers();
+ if (GV->use_empty() && GV->hasLocalLinkage()) {
+ GV->eraseFromParent();
+ continue;
+ }
+
+ // Only process constants with initializers.
+ if (GV->isConstant() && GV->hasInitializer()) {
+ Constant *Init = GV->getInitializer();
+
+ // Check to see if the initializer is already known.
+ GlobalVariable *&Slot = CMap[std::make_pair(Init, GV->getSection())];
+
+ if (Slot == 0) { // Nope, add it to the map.
+ Slot = GV;
+ } else if (GV->hasLocalLinkage()) { // Yup, this is a duplicate!
+ // Make all uses of the duplicate constant use the canonical version.
+ Replacements.push_back(std::make_pair(GV, Slot));
+ }
+ }
+ }
+
+ if (Replacements.empty())
+ return MadeChange;
+ CMap.clear();
+
+    // Now that we have figured out which replacements must be made, do them all
+    // now. Doing the replacements here may invalidate the Constant* pointers
+    // that were in CMap, which is harmless since CMap has just been cleared.
+ for (unsigned i = 0, e = Replacements.size(); i != e; ++i) {
+ // Eliminate any uses of the dead global...
+ Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
+
+ // Delete the global value from the module...
+ M.getGlobalList().erase(Replacements[i].first);
+ }
+
+ NumMerged += Replacements.size();
+ Replacements.clear();
+ }
+}
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
new file mode 100644
index 0000000..666db7e
--- /dev/null
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -0,0 +1,944 @@
+//===-- DeadArgumentElimination.cpp - Eliminate dead arguments ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass deletes dead arguments from internal functions. Dead argument
+// elimination removes arguments which are directly dead, as well as arguments
+// only passed into function calls as dead arguments of other functions. This
+// pass also deletes dead return values in a similar way.
+//
+// This pass is often useful as a cleanup pass to run after aggressive
+// interprocedural passes, which add possibly-dead arguments or return values.
+//
+//===----------------------------------------------------------------------===//
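+
+// For illustration (hypothetical IR), in
+//
+//   define internal i32 @g(i32 %x, i32 %dead) {
+//     ret i32 %x
+//   }
+//
+// the second argument is never used, so this pass rewrites @g to take a
+// single argument and patches every call site to stop passing %dead.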
+
+#define DEBUG_TYPE "deadargelim"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constant.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <map>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
+STATISTIC(NumRetValsEliminated , "Number of unused return values removed");
+
+namespace {
+ /// DAE - The dead argument elimination pass.
+ ///
+ class VISIBILITY_HIDDEN DAE : public ModulePass {
+ public:
+
+ /// Struct that represents (part of) either a return value or a function
+ /// argument. Used so that arguments and return values can be used
+    /// interchangeably.
+ struct RetOrArg {
+ RetOrArg(const Function* F, unsigned Idx, bool IsArg) : F(F), Idx(Idx),
+ IsArg(IsArg) {}
+ const Function *F;
+ unsigned Idx;
+ bool IsArg;
+
+ /// Make RetOrArg comparable, so we can put it into a map.
+ bool operator<(const RetOrArg &O) const {
+ if (F != O.F)
+ return F < O.F;
+ else if (Idx != O.Idx)
+ return Idx < O.Idx;
+ else
+ return IsArg < O.IsArg;
+ }
+
+ /// Make RetOrArg comparable, so we can easily iterate the multimap.
+ bool operator==(const RetOrArg &O) const {
+ return F == O.F && Idx == O.Idx && IsArg == O.IsArg;
+ }
+
+ std::string getDescription() const {
+ return std::string((IsArg ? "Argument #" : "Return value #"))
+ + utostr(Idx) + " of function " + F->getName();
+ }
+ };
+
+    /// Liveness enum - During our initial pass over the program, we determine
+    /// that things are either alive or maybe alive. We don't mark anything
+    /// explicitly dead (even if we know it is), since anything that is never
+    /// marked alive and has no registered uses (in Uses) will simply end up
+    /// dead.
+ enum Liveness { Live, MaybeLive };
+
+ /// Convenience wrapper
+ RetOrArg CreateRet(const Function *F, unsigned Idx) {
+ return RetOrArg(F, Idx, false);
+ }
+ /// Convenience wrapper
+ RetOrArg CreateArg(const Function *F, unsigned Idx) {
+ return RetOrArg(F, Idx, true);
+ }
+
+ typedef std::multimap<RetOrArg, RetOrArg> UseMap;
+ /// This maps a return value or argument to any MaybeLive return values or
+ /// arguments it uses. This allows the MaybeLive values to be marked live
+ /// when any of its users is marked live.
+ /// For example (indices are left out for clarity):
+ /// - Uses[ret F] = ret G
+ /// This means that F calls G, and F returns the value returned by G.
+ /// - Uses[arg F] = ret G
+ /// This means that some function calls G and passes its result as an
+ /// argument to F.
+ /// - Uses[ret F] = arg F
+ /// This means that F returns one of its own arguments.
+ /// - Uses[arg F] = arg G
+ /// This means that G calls F and passes one of its own (G's) arguments
+ /// directly to F.
+ UseMap Uses;
+
+ typedef std::set<RetOrArg> LiveSet;
+ typedef std::set<const Function*> LiveFuncSet;
+
+ /// This set contains all values that have been determined to be live.
+ LiveSet LiveValues;
+    /// This set contains all functions that cannot be changed in any way.
+ LiveFuncSet LiveFunctions;
+
+ typedef SmallVector<RetOrArg, 5> UseVector;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ DAE() : ModulePass(&ID) {}
+ bool runOnModule(Module &M);
+
+ virtual bool ShouldHackArguments() const { return false; }
+
+ private:
+ Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
+ Liveness SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses,
+ unsigned RetValNum = 0);
+ Liveness SurveyUses(Value *V, UseVector &MaybeLiveUses);
+
+ void SurveyFunction(Function &F);
+ void MarkValue(const RetOrArg &RA, Liveness L,
+ const UseVector &MaybeLiveUses);
+ void MarkLive(const RetOrArg &RA);
+ void MarkLive(const Function &F);
+ void PropagateLiveness(const RetOrArg &RA);
+ bool RemoveDeadStuffFromFunction(Function *F);
+ bool DeleteDeadVarargs(Function &Fn);
+ };
+}
+
+
+char DAE::ID = 0;
+static RegisterPass<DAE>
+X("deadargelim", "Dead Argument Elimination");
+
+namespace {
+ /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
+ /// deletes arguments to functions which are external. This is only for use
+ /// by bugpoint.
+ struct DAH : public DAE {
+ static char ID;
+ virtual bool ShouldHackArguments() const { return true; }
+ };
+}
+
+char DAH::ID = 0;
+static RegisterPass<DAH>
+Y("deadarghaX0r", "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)");
+
+/// createDeadArgEliminationPass - This pass removes arguments from functions
+/// which are not used by the body of the function.
+///
+ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
+ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
+
+/// DeleteDeadVarargs - If this is a function that takes a ... list, and if
+/// llvm.vastart is never called, the varargs list is dead for the function.
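+///
+/// For illustration (hypothetical IR), a call to such a function
+///
+///   %r = call i32 (i32, ...)* @f(i32 1, i32 2)
+///
+/// is rewritten to call a new non-varargs clone of @f:
+///
+///   %r = call i32 @f(i32 1)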
+bool DAE::DeleteDeadVarargs(Function &Fn) {
+ assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
+ if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
+
+ // Ensure that the function is only directly called.
+ for (Value::use_iterator I = Fn.use_begin(), E = Fn.use_end(); I != E; ++I) {
+ // If this use is anything other than a call site, give up.
+ CallSite CS = CallSite::get(*I);
+ Instruction *TheCall = CS.getInstruction();
+ if (!TheCall) return false; // Not a direct call site?
+
+ // The addr of this function is passed to the call.
+ if (!CS.isCallee(I)) return false;
+ }
+
+ // Okay, we know we can transform this function if safe. Scan its body
+ // looking for calls to llvm.vastart.
+ for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::vastart)
+ return false;
+ }
+ }
+ }
+
+ // If we get here, there are no calls to llvm.vastart in the function body,
+ // remove the "..." and adjust all the calls.
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but doesn't have isVarArg set.
+ const FunctionType *FTy = Fn.getFunctionType();
+ std::vector<const Type*> Params(FTy->param_begin(), FTy->param_end());
+ FunctionType *NFTy = FunctionType::get(FTy->getReturnType(), Params, false);
+ unsigned NumArgs = Params.size();
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, Fn.getLinkage());
+ NF->copyAttributesFrom(&Fn);
+ Fn.getParent()->getFunctionList().insert(&Fn, NF);
+ NF->takeName(&Fn);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in a smaller number of arguments into the new function.
+ //
+ std::vector<Value*> Args;
+ while (!Fn.use_empty()) {
+ CallSite CS = CallSite::get(Fn.use_back());
+ Instruction *Call = CS.getInstruction();
+
+ // Pass all the same arguments.
+ Args.assign(CS.arg_begin(), CS.arg_begin()+NumArgs);
+
+ // Drop any attributes that were on the vararg arguments.
+ AttrListPtr PAL = CS.getAttributes();
+ if (!PAL.isEmpty() && PAL.getSlot(PAL.getNumSlots() - 1).Index > NumArgs) {
+ SmallVector<AttributeWithIndex, 8> AttributesVec;
+ for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i)
+ AttributesVec.push_back(PAL.getSlot(i));
+ if (Attributes FnAttrs = PAL.getFnAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+ PAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end());
+ }
+
+ Instruction *New;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args.begin(), Args.end(), "", Call);
+ cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+ cast<InvokeInst>(New)->setAttributes(PAL);
+ } else {
+ New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+ cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+ cast<CallInst>(New)->setAttributes(PAL);
+ if (cast<CallInst>(Call)->isTailCall())
+ cast<CallInst>(New)->setTailCall();
+ }
+ Args.clear();
+
+ if (!Call->use_empty())
+ Call->replaceAllUsesWith(New);
+
+ New->takeName(Call);
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ Call->eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
+
+  // Loop over the argument list, transferring uses of the old arguments over
+  // to the new arguments, also transferring over the names as well.
+  //
+ for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
+ I2 = NF->arg_begin(); I != E; ++I, ++I2) {
+ // Move the name and users over to the new version.
+ I->replaceAllUsesWith(I2);
+ I2->takeName(I);
+ }
+
+ // Finally, nuke the old function.
+ Fn.eraseFromParent();
+ return true;
+}
+
+/// Convenience function that returns the number of return values. It returns 0
+/// for void functions and 1 for functions not returning a struct. It returns
+/// the number of struct elements for functions returning a struct.
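+/// For example (hypothetical signatures): it returns 0 for 'void @f()', 1 for
+/// 'i32 @f()', and 2 for '{ i32, float } @f()'.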
+static unsigned NumRetVals(const Function *F) {
+ if (F->getReturnType() == Type::VoidTy)
+ return 0;
+ else if (const StructType *STy = dyn_cast<StructType>(F->getReturnType()))
+ return STy->getNumElements();
+ else
+ return 1;
+}
+
+/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
+/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
+/// liveness of Use.
+DAE::Liveness DAE::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) {
+ // We're live if our use or its Function is already marked as live.
+ if (LiveFunctions.count(Use.F) || LiveValues.count(Use))
+ return Live;
+
+ // We're maybe live otherwise, but remember that we must become live if
+ // Use becomes live.
+ MaybeLiveUses.push_back(Use);
+ return MaybeLive;
+}
+
+
+/// SurveyUse - This looks at a single use of an argument or return value
+/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
+/// if it causes the used value to become MaybeAlive.
+///
+/// RetValNum is the return value number to use when this use is used in a
+/// return instruction. This is used in the recursion, you should always leave
+/// it at 0.
+DAE::Liveness DAE::SurveyUse(Value::use_iterator U, UseVector &MaybeLiveUses,
+ unsigned RetValNum) {
+ Value *V = *U;
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
+ // The value is returned from a function. It's only live when the
+ // function's return value is live. We use RetValNum here, for the case
+ // that U is really a use of an insertvalue instruction that uses the
+    // original Use.
+ RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum);
+ // We might be live, depending on the liveness of Use.
+ return MarkIfNotLive(Use, MaybeLiveUses);
+ }
+ if (InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
+ if (U.getOperandNo() != InsertValueInst::getAggregateOperandIndex()
+ && IV->hasIndices())
+ // The use we are examining is inserted into an aggregate. Our liveness
+ // depends on all uses of that aggregate, but if it is used as a return
+      // value, only the index at which we were inserted counts.
+ RetValNum = *IV->idx_begin();
+
+ // Note that if we are used as the aggregate operand to the insertvalue,
+ // we don't change RetValNum, but do survey all our uses.
+
+ Liveness Result = MaybeLive;
+ for (Value::use_iterator I = IV->use_begin(),
+         E = IV->use_end(); I != E; ++I) {
+ Result = SurveyUse(I, MaybeLiveUses, RetValNum);
+ if (Result == Live)
+ break;
+ }
+ return Result;
+ }
+ CallSite CS = CallSite::get(V);
+ if (CS.getInstruction()) {
+ Function *F = CS.getCalledFunction();
+ if (F) {
+ // Used in a direct call.
+
+ // Find the argument number. We know for sure that this use is an
+      // argument, since if it were the callee operand this would be an
+      // indirect call, and we know we can't be looking at a value of the
+      // label type (for the invoke instruction).
+ unsigned ArgNo = CS.getArgumentNo(U.getOperandNo());
+
+ if (ArgNo >= F->getFunctionType()->getNumParams())
+ // The value is passed in through a vararg! Must be live.
+ return Live;
+
+ assert(CS.getArgument(ArgNo)
+ == CS.getInstruction()->getOperand(U.getOperandNo())
+ && "Argument is not where we expected it");
+
+ // Value passed to a normal call. It's only live when the corresponding
+ // argument to the called function turns out live.
+ RetOrArg Use = CreateArg(F, ArgNo);
+ return MarkIfNotLive(Use, MaybeLiveUses);
+ }
+ }
+ // Used in any other way? Value must be live.
+ return Live;
+}
+
+/// SurveyUses - This looks at all the uses of the given value.
+/// Returns the Liveness deduced from the uses of this value.
+///
+/// Adds all uses that cause the result to be MaybeLive to MaybeLiveUses. If
+/// the result is Live, MaybeLiveUses might be modified but its content should
+/// be ignored (since it might not be complete).
+DAE::Liveness DAE::SurveyUses(Value *V, UseVector &MaybeLiveUses) {
+  // Assume it's dead (which will only hold if there are no uses at all).
+ Liveness Result = MaybeLive;
+ // Check each use.
+ for (Value::use_iterator I = V->use_begin(),
+ E = V->use_end(); I != E; ++I) {
+ Result = SurveyUse(I, MaybeLiveUses);
+ if (Result == Live)
+ break;
+ }
+ return Result;
+}
+
+// SurveyFunction - This performs the initial survey of the specified function,
+// checking out whether or not it uses any of its incoming arguments or whether
+// any callers use the return value. This fills in the LiveValues set and Uses
+// map.
+//
+// We consider arguments of non-internal functions to be intrinsically alive as
+// well as arguments to functions which have their "address taken".
+//
+void DAE::SurveyFunction(Function &F) {
+ unsigned RetCount = NumRetVals(&F);
+ // Assume all return values are dead
+ typedef SmallVector<Liveness, 5> RetVals;
+ RetVals RetValLiveness(RetCount, MaybeLive);
+
+ typedef SmallVector<UseVector, 5> RetUses;
+ // These vectors map each return value to the uses that make it MaybeLive, so
+ // we can add those to the Uses map if the return value really turns out to be
+ // MaybeLive. Initialized to a list of RetCount empty lists.
+ RetUses MaybeLiveRetUses(RetCount);
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+ if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
+ != F.getFunctionType()->getReturnType()) {
+ // We don't support old style multiple return values.
+ MarkLive(F);
+ return;
+ }
+
+ if (!F.hasLocalLinkage() && (!ShouldHackArguments() || F.isIntrinsic())) {
+ MarkLive(F);
+ return;
+ }
+
+ DOUT << "DAE - Inspecting callers for fn: " << F.getName() << "\n";
+ // Keep track of the number of live retvals, so we can skip checks once all
+ // of them turn out to be live.
+ unsigned NumLiveRetVals = 0;
+ const Type *STy = dyn_cast<StructType>(F.getReturnType());
+ // Loop all uses of the function.
+ for (Value::use_iterator I = F.use_begin(), E = F.use_end(); I != E; ++I) {
+ // If the function is PASSED IN as an argument, its address has been
+ // taken.
+ CallSite CS = CallSite::get(*I);
+ if (!CS.getInstruction() || !CS.isCallee(I)) {
+ MarkLive(F);
+ return;
+ }
+
+    Instruction *TheCall = CS.getInstruction();
+
+ // If we end up here, we are looking at a direct call to our function.
+
+ // Now, check how our return value(s) is/are used in this caller. Don't
+ // bother checking return values if all of them are live already.
+ if (NumLiveRetVals != RetCount) {
+ if (STy) {
+ // Check all uses of the return value.
+ for (Value::use_iterator I = TheCall->use_begin(),
+ E = TheCall->use_end(); I != E; ++I) {
+ ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(*I);
+ if (Ext && Ext->hasIndices()) {
+ // This use uses a part of our return value, survey the uses of
+ // that part and store the results for this index only.
+ unsigned Idx = *Ext->idx_begin();
+ if (RetValLiveness[Idx] != Live) {
+ RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
+ if (RetValLiveness[Idx] == Live)
+ NumLiveRetVals++;
+ }
+ } else {
+          // Used by something other than extractvalue. Mark all return
+ // values as live.
+            for (unsigned i = 0; i != RetCount; ++i)
+ RetValLiveness[i] = Live;
+ NumLiveRetVals = RetCount;
+ break;
+ }
+ }
+ } else {
+ // Single return value
+ RetValLiveness[0] = SurveyUses(TheCall, MaybeLiveRetUses[0]);
+ if (RetValLiveness[0] == Live)
+ NumLiveRetVals = RetCount;
+ }
+ }
+ }
+
+ // Now we've inspected all callers, record the liveness of our return values.
+ for (unsigned i = 0; i != RetCount; ++i)
+ MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]);
+
+ DOUT << "DAE - Inspecting args for fn: " << F.getName() << "\n";
+
+ // Now, check all of our arguments.
+ unsigned i = 0;
+ UseVector MaybeLiveArgUses;
+ for (Function::arg_iterator AI = F.arg_begin(),
+ E = F.arg_end(); AI != E; ++AI, ++i) {
+ // See what the effect of this use is (recording any uses that cause
+ // MaybeLive in MaybeLiveArgUses).
+ Liveness Result = SurveyUses(AI, MaybeLiveArgUses);
+ // Mark the result.
+ MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses);
+ // Clear the vector again for the next iteration.
+ MaybeLiveArgUses.clear();
+ }
+}
+
+/// MarkValue - This function marks the liveness of RA depending on L. If L is
+/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
+/// such that RA will be marked live if any use in MaybeLiveUses gets marked
+/// live later on.
+void DAE::MarkValue(const RetOrArg &RA, Liveness L,
+ const UseVector &MaybeLiveUses) {
+ switch (L) {
+ case Live: MarkLive(RA); break;
+ case MaybeLive:
+ {
+ // Note any uses of this value, so this return value can be
+ // marked live whenever one of the uses becomes live.
+ for (UseVector::const_iterator UI = MaybeLiveUses.begin(),
+ UE = MaybeLiveUses.end(); UI != UE; ++UI)
+ Uses.insert(std::make_pair(*UI, RA));
+ break;
+ }
+ }
+}
+
+/// MarkLive - Mark the given Function as alive, meaning that it cannot be
+/// changed in any way. Additionally, mark all of its arguments and return
+/// values as live, propagating that liveness through the Uses map.
+void DAE::MarkLive(const Function &F) {
+ DOUT << "DAE - Intrinsically live fn: " << F.getName() << "\n";
+ // Mark the function as live.
+ LiveFunctions.insert(&F);
+ // Mark all arguments as live.
+ for (unsigned i = 0, e = F.arg_size(); i != e; ++i)
+ PropagateLiveness(CreateArg(&F, i));
+ // Mark all return values as live.
+ for (unsigned i = 0, e = NumRetVals(&F); i != e; ++i)
+ PropagateLiveness(CreateRet(&F, i));
+}
+
+/// MarkLive - Mark the given return value or argument as live. Additionally,
+/// mark any values that are used by this value (according to Uses) live as
+/// well.
+void DAE::MarkLive(const RetOrArg &RA) {
+ if (LiveFunctions.count(RA.F))
+ return; // Function was already marked Live.
+
+ if (!LiveValues.insert(RA).second)
+ return; // We were already marked Live.
+
+ DOUT << "DAE - Marking " << RA.getDescription() << " live\n";
+ PropagateLiveness(RA);
+}
+
+/// PropagateLiveness - Given that RA is a live value, propagate its liveness
+/// to any other values it uses (according to Uses).
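+///
+/// For example, if Uses contains the pair (ret F -> ret G) because F returns
+/// the value returned by G, then marking 'ret F' live also marks 'ret G'
+/// live, which in turn propagates through anything recorded for 'ret G'.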
+void DAE::PropagateLiveness(const RetOrArg &RA) {
+ // We don't use upper_bound (or equal_range) here, because our recursive call
+ // to ourselves is likely to cause the upper_bound (which is the first value
+ // not belonging to RA) to become erased and the iterator invalidated.
+ UseMap::iterator Begin = Uses.lower_bound(RA);
+ UseMap::iterator E = Uses.end();
+ UseMap::iterator I;
+ for (I = Begin; I != E && I->first == RA; ++I)
+ MarkLive(I->second);
+
+ // Erase RA from the Uses map (from the lower bound to wherever we ended up
+ // after the loop).
+ Uses.erase(Begin, I);
+}
+
+// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
+// that are not in LiveValues. Transform the function and all of the callers of
+// the function to not have these arguments and return values.
+//
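+// For illustration (hypothetical IR): a function 'define internal { i32, i32 }
+// @h(...)' whose second struct element is never extracted by any caller is
+// rewritten to return a plain 'i32', and every call site is patched up with
+// extractvalue/insertvalue chains to match.
+//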
+bool DAE::RemoveDeadStuffFromFunction(Function *F) {
+ // Don't modify fully live functions
+ if (LiveFunctions.count(F))
+ return false;
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has fewer arguments and a different return type.
+ const FunctionType *FTy = F->getFunctionType();
+ std::vector<const Type*> Params;
+
+ // Set up to build a new list of parameter attributes.
+ SmallVector<AttributeWithIndex, 8> AttributesVec;
+ const AttrListPtr &PAL = F->getAttributes();
+
+ // The existing function return attributes.
+ Attributes RAttrs = PAL.getRetAttributes();
+ Attributes FnAttrs = PAL.getFnAttributes();
+
+ // Find out the new return value.
+
+ const Type *RetTy = FTy->getReturnType();
+ const Type *NRetTy = NULL;
+ unsigned RetCount = NumRetVals(F);
+ // -1 means unused, other numbers are the new index
+ SmallVector<int, 5> NewRetIdxs(RetCount, -1);
+ std::vector<const Type*> RetTypes;
+ if (RetTy == Type::VoidTy) {
+ NRetTy = Type::VoidTy;
+ } else {
+ const StructType *STy = dyn_cast<StructType>(RetTy);
+ if (STy)
+ // Look at each of the original return values individually.
+ for (unsigned i = 0; i != RetCount; ++i) {
+ RetOrArg Ret = CreateRet(F, i);
+ if (LiveValues.erase(Ret)) {
+ RetTypes.push_back(STy->getElementType(i));
+ NewRetIdxs[i] = RetTypes.size() - 1;
+ } else {
+ ++NumRetValsEliminated;
+ DOUT << "DAE - Removing return value " << i << " from "
+ << F->getNameStart() << "\n";
+ }
+ }
+ else
+ // We used to return a single value.
+ if (LiveValues.erase(CreateRet(F, 0))) {
+ RetTypes.push_back(RetTy);
+ NewRetIdxs[0] = 0;
+ } else {
+ DOUT << "DAE - Removing return value from " << F->getNameStart()
+ << "\n";
+ ++NumRetValsEliminated;
+ }
+ if (RetTypes.size() > 1)
+ // More than one return type? Return a struct with them. Also, if we used
+ // to return a struct and didn't change the number of return values,
+ // return a struct again. This prevents changing {something} into
+ // something and {} into void.
+ // Make the new struct packed if we used to return a packed struct
+ // already.
+ NRetTy = StructType::get(RetTypes, STy->isPacked());
+ else if (RetTypes.size() == 1)
+ // One return type? Just a simple value then, but only if we didn't use to
+ // return a struct with that simple value before.
+ NRetTy = RetTypes.front();
+ else if (RetTypes.size() == 0)
+ // No return types? Make it void, but only if we didn't use to return {}.
+ NRetTy = Type::VoidTy;
+ }
+
+ assert(NRetTy && "No new return type found?");
+
+ // Remove any incompatible attributes, but only if we removed all return
+ // values. Otherwise, ensure that we don't have any conflicting attributes
+ // here. Currently, this should not be possible, but special handling might be
+ // required when new return value attributes are added.
+ if (NRetTy == Type::VoidTy)
+ RAttrs &= ~Attribute::typeIncompatible(NRetTy);
+ else
+ assert((RAttrs & Attribute::typeIncompatible(NRetTy)) == 0
+ && "Return attributes no longer compatible?");
+
+ if (RAttrs)
+ AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs));
+
+ // Remember which arguments are still alive.
+ SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
+ // Construct the new parameter list from non-dead arguments. Also construct
+ // a new set of parameter attributes to correspond. Skip the first parameter
+ // attribute, since that belongs to the return value.
+ unsigned i = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I, ++i) {
+ RetOrArg Arg = CreateArg(F, i);
+ if (LiveValues.erase(Arg)) {
+ Params.push_back(I->getType());
+ ArgAlive[i] = true;
+
+      // Get the original parameter attributes (skipping the first one, which
+      // is for the return value).
+ if (Attributes Attrs = PAL.getParamAttributes(i + 1))
+ AttributesVec.push_back(AttributeWithIndex::get(Params.size(), Attrs));
+ } else {
+ ++NumArgumentsEliminated;
+ DOUT << "DAE - Removing argument " << i << " (" << I->getNameStart()
+ << ") from " << F->getNameStart() << "\n";
+ }
+ }
+
+ if (FnAttrs != Attribute::None)
+ AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+
+ // Reconstruct the AttributesList based on the vector we constructed.
+ AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end());
+
+ // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which
+ // have zero fixed arguments.
+ //
+  // Note that we apply this hack for a vararg function that does not have any
+ // arguments anymore, but did have them before (so don't bother fixing
+ // functions that were already broken wrt CWriter).
+ bool ExtraArgHack = false;
+ if (Params.empty() && FTy->isVarArg() && FTy->getNumParams() != 0) {
+ ExtraArgHack = true;
+ Params.push_back(Type::Int32Ty);
+ }
+
+ // Create the new function type based on the recomputed parameters.
+ FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
+
+ // No change?
+ if (NFTy == FTy)
+ return false;
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, F->getLinkage());
+ NF->copyAttributesFrom(F);
+ NF->setAttributes(NewPAL);
+ // Insert the new function before the old function, so we won't be processing
+ // it again.
+ F->getParent()->getFunctionList().insert(F, NF);
+ NF->takeName(F);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in a smaller number of arguments into the new function.
+ //
+ std::vector<Value*> Args;
+ while (!F->use_empty()) {
+ CallSite CS = CallSite::get(F->use_back());
+ Instruction *Call = CS.getInstruction();
+
+ AttributesVec.clear();
+ const AttrListPtr &CallPAL = CS.getAttributes();
+
+ // The call return attributes.
+ Attributes RAttrs = CallPAL.getRetAttributes();
+ Attributes FnAttrs = CallPAL.getFnAttributes();
+ // Adjust in case the function was changed to return void.
+ RAttrs &= ~Attribute::typeIncompatible(NF->getReturnType());
+ if (RAttrs)
+ AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs));
+
+ // Declare these outside of the loops, so we can reuse them for the second
+ // loop, which loops the varargs.
+ CallSite::arg_iterator I = CS.arg_begin();
+ unsigned i = 0;
+ // Loop over those operands, corresponding to the normal arguments to the
+ // original function, and add those that are still alive.
+ for (unsigned e = FTy->getNumParams(); i != e; ++I, ++i)
+ if (ArgAlive[i]) {
+ Args.push_back(*I);
+ // Get original parameter attributes, but skip return attributes.
+ if (Attributes Attrs = CallPAL.getParamAttributes(i + 1))
+ AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+ }
+
+ if (ExtraArgHack)
+ Args.push_back(UndefValue::get(Type::Int32Ty));
+
+ // Push any varargs arguments on the list. Don't forget their attributes.
+ for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
+ Args.push_back(*I);
+ if (Attributes Attrs = CallPAL.getParamAttributes(i + 1))
+ AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+ }
+
+ if (FnAttrs != Attribute::None)
+ AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+
+ // Reconstruct the AttributesList based on the vector we constructed.
+ AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec.begin(),
+ AttributesVec.end());
+
+ Instruction *New;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args.begin(), Args.end(), "", Call);
+ cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+ cast<InvokeInst>(New)->setAttributes(NewCallPAL);
+ } else {
+ New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+ cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+ cast<CallInst>(New)->setAttributes(NewCallPAL);
+ if (cast<CallInst>(Call)->isTailCall())
+ cast<CallInst>(New)->setTailCall();
+ }
+ Args.clear();
+
+ if (!Call->use_empty()) {
+ if (New->getType() == Call->getType()) {
+ // Return type not changed? Just replace users then.
+ Call->replaceAllUsesWith(New);
+ New->takeName(Call);
+ } else if (New->getType() == Type::VoidTy) {
+ // Our return value has uses, but they will get removed later on.
+ // Replace by null for now.
+ Call->replaceAllUsesWith(Constant::getNullValue(Call->getType()));
+ } else {
+ assert(isa<StructType>(RetTy) &&
+ "Return type changed, but not into a void. The old return type"
+ " must have been a struct!");
+ Instruction *InsertPt = Call;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ BasicBlock::iterator IP = II->getNormalDest()->begin();
+ while (isa<PHINode>(IP)) ++IP;
+ InsertPt = IP;
+ }
+
+ // We used to return a struct. Instead of doing smart stuff with all the
+ // uses of this struct, we will just rebuild it using
+ // extract/insertvalue chaining and let instcombine clean that up.
+ //
+ // Start out building up our return value from undef
+ Value *RetVal = llvm::UndefValue::get(RetTy);
+ for (unsigned i = 0; i != RetCount; ++i)
+ if (NewRetIdxs[i] != -1) {
+ Value *V;
+ if (RetTypes.size() > 1)
+ // We are still returning a struct, so extract the value from our
+ // return value
+ V = ExtractValueInst::Create(New, NewRetIdxs[i], "newret",
+ InsertPt);
+ else
+ // We are now returning a single element, so just insert that
+ V = New;
+ // Insert the value at the old position
+ RetVal = InsertValueInst::Create(RetVal, V, i, "oldret", InsertPt);
+ }
+ // Now, replace all uses of the old call instruction with the return
+ // struct we built
+ Call->replaceAllUsesWith(RetVal);
+ New->takeName(Call);
+ }
+ }
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ Call->eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Loop over the argument list, transferring uses of the old arguments over
+  // to the new arguments, also transferring over the names as well.
+ i = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin(); I != E; ++I, ++i)
+ if (ArgAlive[i]) {
+ // If this is a live argument, move the name and users over to the new
+ // version.
+ I->replaceAllUsesWith(I2);
+ I2->takeName(I);
+ ++I2;
+ } else {
+ // If this argument is dead, replace any uses of it with null constants
+ // (these are guaranteed to become unused later on).
+ I->replaceAllUsesWith(Constant::getNullValue(I->getType()));
+ }
+
+ // If we change the return value of the function we must rewrite any return
+ // instructions. Check this now.
+ if (F->getReturnType() != NF->getReturnType())
+ for (Function::iterator BB = NF->begin(), E = NF->end(); BB != E; ++BB)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ Value *RetVal;
+
+ if (NFTy->getReturnType() == Type::VoidTy) {
+ RetVal = 0;
+ } else {
+ assert (isa<StructType>(RetTy));
+ // The original return value was a struct, insert
+ // extractvalue/insertvalue chains to extract only the values we need
+ // to return and insert them into our new result.
+          // This does generate messy code, but we'll leave it to instcombine
+          // to clean that up.
+ Value *OldRet = RI->getOperand(0);
+ // Start out building up our return value from undef
+ RetVal = llvm::UndefValue::get(NRetTy);
+ for (unsigned i = 0; i != RetCount; ++i)
+ if (NewRetIdxs[i] != -1) {
+ ExtractValueInst *EV = ExtractValueInst::Create(OldRet, i,
+ "oldret", RI);
+ if (RetTypes.size() > 1) {
+ // We're still returning a struct, so reinsert the value into
+ // our new return value at the new index
+
+ RetVal = InsertValueInst::Create(RetVal, EV, NewRetIdxs[i],
+ "newret", RI);
+ } else {
+ // We are now only returning a simple value, so just return the
+ // extracted value.
+ RetVal = EV;
+ }
+ }
+ }
+ // Replace the return instruction with one returning the new return
+ // value (possibly 0 if we became void).
+ ReturnInst::Create(RetVal, RI);
+ BB->getInstList().erase(RI);
+ }
+
+ // Now that the old function is dead, delete it.
+ F->eraseFromParent();
+
+ return true;
+}
+
+bool DAE::runOnModule(Module &M) {
+ bool Changed = false;
+
+ // First pass: Do a simple check to see if any functions can have their "..."
+ // removed. We can do this if they never call va_start. This loop cannot be
+ // fused with the next loop, because deleting a function invalidates
+ // information computed while surveying other functions.
+ DOUT << "DAE - Deleting dead varargs\n";
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ Function &F = *I++;
+ if (F.getFunctionType()->isVarArg())
+ Changed |= DeleteDeadVarargs(F);
+ }
+
+  // Second phase: loop through the module, determining which arguments are live.
+ // We assume all arguments are dead unless proven otherwise (allowing us to
+ // determine that dead arguments passed into recursive functions are dead).
+ //
+ DOUT << "DAE - Determining liveness\n";
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ SurveyFunction(*I);
+
+ // Now, remove all dead arguments and return values from each function in
+  // turn.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+    // Increment now, because the function will probably get removed (i.e.
+ // replaced by a new one).
+ Function *F = I++;
+ Changed |= RemoveDeadStuffFromFunction(F);
+ }
+ return Changed;
+}
diff --git a/lib/Transforms/IPO/DeadTypeElimination.cpp b/lib/Transforms/IPO/DeadTypeElimination.cpp
new file mode 100644
index 0000000..85aed2b
--- /dev/null
+++ b/lib/Transforms/IPO/DeadTypeElimination.cpp
@@ -0,0 +1,107 @@
+//===- DeadTypeElimination.cpp - Eliminate unused types for symbol table --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to clean up the output of GCC. It eliminates names for types
+// that are unused in the entire translation unit, using the FindUsedTypes pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "deadtypeelim"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumKilled, "Number of unused typenames removed from symtab");
+
+namespace {
+ struct VISIBILITY_HIDDEN DTE : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ DTE() : ModulePass(&ID) {}
+
+    // runOnModule - For this pass, it removes global symbol table
+    // entries for primitive types. These are never used for linking in GCC and
+    // they make the output uglier to look at, so we nuke them.
+    //
+ bool runOnModule(Module &M);
+
+ // getAnalysisUsage - This function needs FindUsedTypes to do its job...
+ //
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<FindUsedTypes>();
+ }
+ };
+}
+
+char DTE::ID = 0;
+static RegisterPass<DTE> X("deadtypeelim", "Dead Type Elimination");
+
+ModulePass *llvm::createDeadTypeEliminationPass() {
+ return new DTE();
+}
+
+
+// ShouldNukeSymtabEntry - Return true if this module level symbol table entry
+// should be eliminated.
+//
+static inline bool ShouldNukeSymtabEntry(const Type *Ty){
+ // Nuke all names for primitive types!
+ if (Ty->isPrimitiveType() || Ty->isInteger())
+ return true;
+
+ // Nuke all pointers to primitive types as well...
+ if (const PointerType *PT = dyn_cast<PointerType>(Ty))
+ if (PT->getElementType()->isPrimitiveType() ||
+ PT->getElementType()->isInteger())
+ return true;
+
+ return false;
+}
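+
+// For example, symbol table entries naming 'i32' or 'i32*' are always nuked,
+// while a name for '{ i32, float }' survives only if FindUsedTypes reports
+// that the struct type is actually used somewhere in the module.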
+
+// run - For this pass, it removes global symbol table entries for primitive
+// types. These are never used for linking in GCC and they make the output
+// uglier to look at, so we nuke them. Also eliminate types that are never used
+// in the entire program as indicated by FindUsedTypes.
+//
+bool DTE::runOnModule(Module &M) {
+ bool Changed = false;
+
+ TypeSymbolTable &ST = M.getTypeSymbolTable();
+ std::set<const Type *> UsedTypes = getAnalysis<FindUsedTypes>().getTypes();
+
+ // Check the symbol table for superfluous type entries...
+ //
+ // Grab the 'type' plane of the module symbol...
+ TypeSymbolTable::iterator TI = ST.begin();
+ TypeSymbolTable::iterator TE = ST.end();
+  while (TI != TE) {
+ // If this entry should be unconditionally removed, or if we detect that
+ // the type is not used, remove it.
+ const Type *RHS = TI->second;
+ if (ShouldNukeSymtabEntry(RHS) || !UsedTypes.count(RHS)) {
+ ST.remove(TI++);
+ ++NumKilled;
+ Changed = true;
+ } else {
+ ++TI;
+ // We only need to leave one name for each type.
+ UsedTypes.erase(RHS);
+ }
+ }
+
+ return Changed;
+}
+
+// vim: sw=2
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
new file mode 100644
index 0000000..0c529d2
--- /dev/null
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -0,0 +1,173 @@
+//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts global values from a module.
+//
+//===----------------------------------------------------------------------===//
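+//
+// Depending on how it is configured, this pass either deletes the named
+// global values from the module, or internalizes everything else and keeps
+// only the named values accessible.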
+
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Constants.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+ /// @brief A pass to extract specific functions and their dependencies.
+ class VISIBILITY_HIDDEN GVExtractorPass : public ModulePass {
+ std::vector<GlobalValue*> Named;
+ bool deleteStuff;
+ bool reLink;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+    /// GVExtractorPass - If deleteS is true, this pass deletes the
+    /// specified global values. Otherwise, it deletes as much of the module
+    /// as possible, except for the global values specified.
+ ///
+ explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true,
+ bool relinkCallees = false)
+ : ModulePass(&ID), Named(GVs), deleteStuff(deleteS),
+ reLink(relinkCallees) {}
+
+ bool runOnModule(Module &M) {
+ if (Named.size() == 0) {
+ return false; // Nothing to extract
+ }
+
+ if (deleteStuff)
+ return deleteGV();
+ M.setModuleInlineAsm("");
+ return isolateGV(M);
+ }
+
+ bool deleteGV() {
+ for (std::vector<GlobalValue*>::iterator GI = Named.begin(),
+ GE = Named.end(); GI != GE; ++GI) {
+ if (Function* NamedFunc = dyn_cast<Function>(*GI)) {
+ // If we're in relinking mode, set linkage of all internal callees to
+        // external. This will allow us to extract the function and then link
+        // everything back together.
+ if (reLink) {
+ for (Function::iterator B = NamedFunc->begin(), BE = NamedFunc->end();
+ B != BE; ++B) {
+ for (BasicBlock::iterator I = B->begin(), E = B->end();
+ I != E; ++I) {
+ if (CallInst* callInst = dyn_cast<CallInst>(&*I)) {
+ Function* Callee = callInst->getCalledFunction();
+ if (Callee && Callee->hasLocalLinkage())
+ Callee->setLinkage(GlobalValue::ExternalLinkage);
+ }
+ }
+ }
+ }
+
+ NamedFunc->setLinkage(GlobalValue::ExternalLinkage);
+ NamedFunc->deleteBody();
+ assert(NamedFunc->isDeclaration() && "This didn't make the function external!");
+ } else {
+ if (!(*GI)->isDeclaration()) {
+          cast<GlobalVariable>(*GI)->setInitializer(0); // Clear the initializer.
+ (*GI)->setLinkage(GlobalValue::ExternalLinkage);
+ }
+ }
+ }
+ return true;
+ }
+
+ bool isolateGV(Module &M) {
+ // Mark all globals internal
+ // FIXME: what should we do with private linkage?
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I)
+ if (!I->isDeclaration()) {
+ I->setLinkage(GlobalValue::InternalLinkage);
+ }
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!I->isDeclaration()) {
+ I->setLinkage(GlobalValue::InternalLinkage);
+ }
+
+      // Make sure our results are globally accessible by putting them in the
+      // llvm.used array.
+ {
+ std::vector<Constant *> AUGs;
+ const Type *SBP= PointerType::getUnqual(Type::Int8Ty);
+ for (std::vector<GlobalValue*>::iterator GI = Named.begin(),
+ GE = Named.end(); GI != GE; ++GI) {
+ (*GI)->setLinkage(GlobalValue::ExternalLinkage);
+ AUGs.push_back(ConstantExpr::getBitCast(*GI, SBP));
+ }
+ ArrayType *AT = ArrayType::get(SBP, AUGs.size());
+ Constant *Init = ConstantArray::get(AT, AUGs);
+ GlobalValue *gv = new GlobalVariable(AT, false,
+ GlobalValue::AppendingLinkage,
+ Init, "llvm.used", &M);
+ gv->setSection("llvm.metadata");
+ }
+
+ // All of the functions may be used by global variables or the named
+      // globals. Loop through them and create new, external function
+      // declarations that can be "used", instead of ones with bodies.
+ std::vector<Function*> NewFunctions;
+
+ Function *Last = --M.end(); // Figure out where the last real fn is.
+
+ for (Module::iterator I = M.begin(); ; ++I) {
+ if (std::find(Named.begin(), Named.end(), &*I) == Named.end()) {
+ Function *New = Function::Create(I->getFunctionType(),
+ GlobalValue::ExternalLinkage);
+ New->copyAttributesFrom(I);
+
+ // If it's not the named function, delete the body of the function
+ I->dropAllReferences();
+
+ M.getFunctionList().push_back(New);
+ NewFunctions.push_back(New);
+ New->takeName(I);
+ }
+
+ if (&*I == Last) break; // Stop after processing the last function
+ }
+
+ // Now that we have replacements all set up, loop through the module,
+ // deleting the old functions, replacing them with the newly created
+ // functions.
+ if (!NewFunctions.empty()) {
+ unsigned FuncNum = 0;
+ Module::iterator I = M.begin();
+ do {
+ if (std::find(Named.begin(), Named.end(), &*I) == Named.end()) {
+ // Make everything that uses the old function use the new dummy fn
+ I->replaceAllUsesWith(NewFunctions[FuncNum++]);
+
+ Function *Old = I;
+ ++I; // Move the iterator to the new function
+
+ // Delete the old function!
+ M.getFunctionList().erase(Old);
+
+ } else {
+ ++I; // Skip the function we are extracting
+ }
+ } while (&*I != NewFunctions[0]);
+ }
+
+ return true;
+ }
+ };
+
+ char GVExtractorPass::ID = 0;
+}
+
+ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue*>& GVs,
+ bool deleteFn, bool relinkCallees) {
+ return new GVExtractorPass(GVs, deleteFn, relinkCallees);
+}
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
new file mode 100644
index 0000000..e831524
--- /dev/null
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -0,0 +1,347 @@
+//===- FunctionAttrs.cpp - Pass which marks functions readnone or readonly ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, looking for functions which do not access or only read
+// non-local memory, and marking them readnone/readonly. In addition,
+// it marks function arguments (of pointer type) 'nocapture' if a call
+// to the function does not create any copies of the pointer value that
+// outlive the call. This more or less means that the pointer is only
+// dereferenced, and not returned from the function or stored in a global.
+// This pass is implemented as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
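+
+// For illustration (hypothetical IR), a function such as
+//
+//   define i32 @get(i32* %p) {
+//     %v = load i32* %p
+//     ret i32 %v
+//   }
+//
+// only reads non-local memory and never captures %p, so this pass would mark
+// it 'readonly' and mark %p 'nocapture'.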
+
+#define DEBUG_TYPE "functionattrs"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+using namespace llvm;
+
+STATISTIC(NumReadNone, "Number of functions marked readnone");
+STATISTIC(NumReadOnly, "Number of functions marked readonly");
+STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumNoAlias, "Number of function returns marked noalias");
+
+namespace {
+ struct VISIBILITY_HIDDEN FunctionAttrs : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ FunctionAttrs() : CallGraphSCCPass(&ID) {}
+
+ // runOnSCC - Analyze the SCC, performing the transformation if possible.
+ bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
+
+ // AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
+ bool AddReadAttrs(const std::vector<CallGraphNode *> &SCC);
+
+ // AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
+ bool AddNoCaptureAttrs(const std::vector<CallGraphNode *> &SCC);
+
+ // IsFunctionMallocLike - Does this function allocate new memory?
+ bool IsFunctionMallocLike(Function *F,
+ SmallPtrSet<CallGraphNode*, 8> &) const;
+
+ // AddNoAliasAttrs - Deduce noalias attributes for the SCC.
+ bool AddNoAliasAttrs(const std::vector<CallGraphNode *> &SCC);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ bool PointsToLocalMemory(Value *V);
+ };
+}
+
+char FunctionAttrs::ID = 0;
+static RegisterPass<FunctionAttrs>
+X("functionattrs", "Deduce function attributes");
+
+Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); }
+
+
+/// PointsToLocalMemory - Returns whether the given pointer value points to
+/// memory that is local to the function. Global constants are considered
+/// local to all functions.
+bool FunctionAttrs::PointsToLocalMemory(Value *V) {
+ V = V->getUnderlyingObject();
+ // An alloca instruction defines local memory.
+ if (isa<AllocaInst>(V))
+ return true;
+ // A global constant counts as local memory for our purposes.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ return GV->isConstant();
+ // Could look through phi nodes and selects here, but it doesn't seem
+ // to be useful in practice.
+ return false;
+}
+
+/// AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
+bool FunctionAttrs::AddReadAttrs(const std::vector<CallGraphNode *> &SCC) {
+ SmallPtrSet<CallGraphNode*, 8> SCCNodes;
+ CallGraph &CG = getAnalysis<CallGraph>();
+
+ // Fill SCCNodes with the elements of the SCC. Used for quickly
+ // looking up whether a given CallGraphNode is in this SCC.
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ SCCNodes.insert(SCC[i]);
+
+ // Check if any of the functions in the SCC read or write memory. If they
+ // write memory then they can't be marked readnone or readonly.
+ bool ReadsMemory = false;
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ Function *F = SCC[i]->getFunction();
+
+ if (F == 0)
+ // External node - may write memory. Just give up.
+ return false;
+
+ if (F->doesNotAccessMemory())
+ // Already perfect!
+ continue;
+
+ // Definitions with weak linkage may be overridden at linktime with
+ // something that writes memory, so treat them like declarations.
+ if (F->isDeclaration() || F->mayBeOverridden()) {
+ if (!F->onlyReadsMemory())
+ // May write memory. Just give up.
+ return false;
+
+ ReadsMemory = true;
+ continue;
+ }
+
+ // Scan the function body for instructions that may read or write memory.
+ for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+ Instruction *I = &*II;
+
+ // Some instructions can be ignored even if they read or write memory.
+ // Detect these now, skipping to the next instruction if one is found.
+ CallSite CS = CallSite::get(I);
+ if (CS.getInstruction()) {
+ // Ignore calls to functions in the same SCC.
+ if (SCCNodes.count(CG[CS.getCalledFunction()]))
+ continue;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ // Ignore loads from local memory.
+ if (PointsToLocalMemory(LI->getPointerOperand()))
+ continue;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Ignore stores to local memory.
+ if (PointsToLocalMemory(SI->getPointerOperand()))
+ continue;
+ }
+
+ // Any remaining instructions need to be taken seriously! Check if they
+ // read or write memory.
+ if (I->mayWriteToMemory())
+ // Writes memory. Just give up.
+ return false;
+
+ if (isa<MallocInst>(I))
+ // MallocInst claims not to write memory! PR3754.
+ return false;
+
+ // If this instruction may read memory, remember that.
+ ReadsMemory |= I->mayReadFromMemory();
+ }
+ }
+
+ // Success! Functions in this SCC do not access memory, or only read memory.
+ // Give them the appropriate attribute.
+ bool MadeChange = false;
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ Function *F = SCC[i]->getFunction();
+
+ if (F->doesNotAccessMemory())
+ // Already perfect!
+ continue;
+
+ if (F->onlyReadsMemory() && ReadsMemory)
+ // No change.
+ continue;
+
+ MadeChange = true;
+
+ // Clear out any existing attributes.
+ F->removeAttribute(~0, Attribute::ReadOnly | Attribute::ReadNone);
+
+ // Add in the new attribute.
+ F->addAttribute(~0, ReadsMemory? Attribute::ReadOnly : Attribute::ReadNone);
+
+ if (ReadsMemory)
+ ++NumReadOnly;
+ else
+ ++NumReadNone;
+ }
+
+ return MadeChange;
+}
+
+/// AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
+bool FunctionAttrs::AddNoCaptureAttrs(const std::vector<CallGraphNode *> &SCC) {
+ bool Changed = false;
+
+ // Check each function in turn, determining which pointer arguments are not
+ // captured.
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ Function *F = SCC[i]->getFunction();
+
+ if (F == 0)
+      // External node - skip it.
+ continue;
+
+ // Definitions with weak linkage may be overridden at linktime with
+ // something that writes memory, so treat them like declarations.
+ if (F->isDeclaration() || F->mayBeOverridden())
+ continue;
+
+ for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A!=E; ++A)
+ if (isa<PointerType>(A->getType()) && !A->hasNoCaptureAttr() &&
+ !PointerMayBeCaptured(A, true)) {
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+/// IsFunctionMallocLike - A function is malloc-like if it returns either null
+/// or a pointer that doesn't alias any other pointer visible to the caller.
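+/// For example (hypothetical), a function that returns either null or a
+/// freshly allocated pointer is malloc-like, while one that may return its
+/// own pointer argument is not, since the caller can already see that pointer.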
+bool FunctionAttrs::IsFunctionMallocLike(Function *F,
+ SmallPtrSet<CallGraphNode*, 8> &SCCNodes) const {
+ CallGraph &CG = getAnalysis<CallGraph>();
+
+ UniqueVector<Value *> FlowsToReturn;
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I)
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator()))
+ FlowsToReturn.insert(Ret->getReturnValue());
+
+ for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+ Value *RetVal = FlowsToReturn[i+1]; // UniqueVector[0] is reserved.
+
+ if (Constant *C = dyn_cast<Constant>(RetVal)) {
+ if (!C->isNullValue() && !isa<UndefValue>(C))
+ return false;
+
+ continue;
+ }
+
+ if (isa<Argument>(RetVal))
+ return false;
+
+ if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
+ switch (RVI->getOpcode()) {
+ // Extend the analysis by looking upwards.
+ case Instruction::GetElementPtr:
+ case Instruction::BitCast:
+ FlowsToReturn.insert(RVI->getOperand(0));
+ continue;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(RVI);
+ FlowsToReturn.insert(SI->getTrueValue());
+ FlowsToReturn.insert(SI->getFalseValue());
+ } continue;
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(RVI);
+        for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ FlowsToReturn.insert(PN->getIncomingValue(i));
+ } continue;
+
+ // Check whether the pointer came from an allocation.
+ case Instruction::Alloca:
+ case Instruction::Malloc:
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallSite CS(RVI);
+ if (CS.paramHasAttr(0, Attribute::NoAlias))
+ break;
+ if (CS.getCalledFunction() &&
+ SCCNodes.count(CG[CS.getCalledFunction()]))
+ break;
+ } // fall-through
+ default:
+ return false; // Did not come from an allocation.
+ }
+
+ if (PointerMayBeCaptured(RetVal, false))
+ return false;
+ }
+
+ return true;
+}
+
+/// AddNoAliasAttrs - Deduce noalias attributes for the SCC.
+bool FunctionAttrs::AddNoAliasAttrs(const std::vector<CallGraphNode *> &SCC) {
+ SmallPtrSet<CallGraphNode*, 8> SCCNodes;
+
+ // Fill SCCNodes with the elements of the SCC. Used for quickly
+ // looking up whether a given CallGraphNode is in this SCC.
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ SCCNodes.insert(SCC[i]);
+
+ // Check each function in turn, determining which functions return noalias
+ // pointers.
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ Function *F = SCC[i]->getFunction();
+
+ if (F == 0)
+ // External node - noalias cannot be deduced for the whole SCC.
+ return false;
+
+ // Already noalias.
+ if (F->doesNotAlias(0))
+ continue;
+
+ // Definitions with weak linkage may be overridden at linktime, so
+ // treat them like declarations.
+ if (F->isDeclaration() || F->mayBeOverridden())
+ return false;
+
+ // We annotate noalias return values, which are only applicable to
+ // pointer types.
+ if (!isa<PointerType>(F->getReturnType()))
+ continue;
+
+ if (!IsFunctionMallocLike(F, SCCNodes))
+ return false;
+ }
+
+ bool MadeChange = false;
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ Function *F = SCC[i]->getFunction();
+ if (F->doesNotAlias(0) || !isa<PointerType>(F->getReturnType()))
+ continue;
+
+ F->setDoesNotAlias(0);
+ ++NumNoAlias;
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
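+
+// Note that the deduction above is all-or-nothing for the SCC: if any member
+// is external, overridable or not malloc-like, we bail out before annotating
+// anything, since the members may return each other's results.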
+
+bool FunctionAttrs::runOnSCC(const std::vector<CallGraphNode *> &SCC) {
+ bool Changed = AddReadAttrs(SCC);
+ Changed |= AddNoCaptureAttrs(SCC);
+ Changed |= AddNoAliasAttrs(SCC);
+ return Changed;
+}
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
new file mode 100644
index 0000000..db378b0
--- /dev/null
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -0,0 +1,227 @@
+//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate unreachable internal globals from the
+// program. It uses an aggressive algorithm, searching out globals that are
+// known to be alive. After it finds all of the globals which are needed, it
+// deletes whatever is left over. This allows it to delete recursive chunks of
+// the program which are unreachable.
+//
+//===----------------------------------------------------------------------===//
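+
+// For illustration (hypothetical C++): if nothing externally visible ever
+// calls chain() in
+//
+//   static int leaf()  { return 1; }
+//   static int chain() { return leaf(); }  // only caller of leaf()
+//
+// then neither function is marked alive and the whole chain is deleted at
+// once.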
+
+#define DEBUG_TYPE "globaldce"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumAliases , "Number of global aliases removed");
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+namespace {
+ struct VISIBILITY_HIDDEN GlobalDCE : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ GlobalDCE() : ModulePass(&ID) {}
+
+ // runOnModule - Do the GlobalDCE pass on the specified module, deleting
+ // anything that is provably dead.
+ //
+ bool runOnModule(Module &M);
+
+ private:
+ std::set<GlobalValue*> AliveGlobals;
+
+ /// GlobalIsNeeded - mark the specified global value as needed, and
+ /// recursively mark anything that it uses as also needed.
+ void GlobalIsNeeded(GlobalValue *GV);
+ void MarkUsedGlobalsAsNeeded(Constant *C);
+
+ bool SafeToDestroyConstant(Constant* C);
+ bool RemoveUnusedGlobalValue(GlobalValue &GV);
+ };
+}
+
+char GlobalDCE::ID = 0;
+static RegisterPass<GlobalDCE> X("globaldce", "Dead Global Elimination");
+
+ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); }
+
+bool GlobalDCE::runOnModule(Module &M) {
+ bool Changed = false;
+ // Loop over the module, adding globals which are obviously necessary.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ Changed |= RemoveUnusedGlobalValue(*I);
+ // Functions with external linkage are needed if they have a body
+ if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() &&
+ !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+ GlobalIsNeeded(I);
+ }
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ Changed |= RemoveUnusedGlobalValue(*I);
+ // Externally visible & appending globals are needed if they have an
+ // initializer.
+ if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() &&
+ !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+ GlobalIsNeeded(I);
+ }
+
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E; ++I) {
+ Changed |= RemoveUnusedGlobalValue(*I);
+ // Externally visible aliases are needed.
+ if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage())
+ GlobalIsNeeded(I);
+ }
+
+ // Now that all globals which are needed are in the AliveGlobals set, we loop
+ // through the program, deleting those which are not alive.
+ //
+
+ // The first pass is to drop initializers of global variables which are dead.
+ std::vector<GlobalVariable*> DeadGlobalVars; // Keep track of dead globals
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ if (!AliveGlobals.count(I)) {
+ DeadGlobalVars.push_back(I); // Keep track of dead globals
+ I->setInitializer(0);
+ }
+
+ // The second pass drops the bodies of functions which are dead...
+ std::vector<Function*> DeadFunctions;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!AliveGlobals.count(I)) {
+ DeadFunctions.push_back(I); // Keep track of dead globals
+ if (!I->isDeclaration())
+ I->deleteBody();
+ }
+
+ // The third pass drops targets of aliases which are dead...
+ std::vector<GlobalAlias*> DeadAliases;
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;
+ ++I)
+ if (!AliveGlobals.count(I)) {
+ DeadAliases.push_back(I);
+ I->setAliasee(0);
+ }
+
+ if (!DeadFunctions.empty()) {
+ // Now that all interferences have been dropped, delete the actual objects
+ // themselves.
+ for (unsigned i = 0, e = DeadFunctions.size(); i != e; ++i) {
+ RemoveUnusedGlobalValue(*DeadFunctions[i]);
+ M.getFunctionList().erase(DeadFunctions[i]);
+ }
+ NumFunctions += DeadFunctions.size();
+ Changed = true;
+ }
+
+ if (!DeadGlobalVars.empty()) {
+ for (unsigned i = 0, e = DeadGlobalVars.size(); i != e; ++i) {
+ RemoveUnusedGlobalValue(*DeadGlobalVars[i]);
+ M.getGlobalList().erase(DeadGlobalVars[i]);
+ }
+ NumVariables += DeadGlobalVars.size();
+ Changed = true;
+ }
+
+ // Now delete any dead aliases.
+ if (!DeadAliases.empty()) {
+ for (unsigned i = 0, e = DeadAliases.size(); i != e; ++i) {
+ RemoveUnusedGlobalValue(*DeadAliases[i]);
+ M.getAliasList().erase(DeadAliases[i]);
+ }
+ NumAliases += DeadAliases.size();
+ Changed = true;
+ }
+
+ // Make sure that all memory is released
+ AliveGlobals.clear();
+ return Changed;
+}
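+
+// Note on the ordering above: initializers, bodies and aliasees are dropped
+// before anything is erased so that mutually referencing dead globals (for
+// example two internal functions that call each other) no longer use one
+// another by the time the actual deletion happens.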
+
+/// GlobalIsNeeded - mark the specified global value as needed, and
+/// recursively mark anything that it uses as also needed.
+void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
+ std::set<GlobalValue*>::iterator I = AliveGlobals.find(G);
+
+ // If the global is already in the set, no need to reprocess it.
+ if (I != AliveGlobals.end()) return;
+
+ // Otherwise insert it now, so we do not infinitely recurse
+ AliveGlobals.insert(I, G);
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) {
+ // If this is a global variable, we must make sure to add any global values
+ // referenced by the initializer to the alive set.
+ if (GV->hasInitializer())
+ MarkUsedGlobalsAsNeeded(GV->getInitializer());
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(G)) {
+ // The target of a global alias is needed.
+ MarkUsedGlobalsAsNeeded(GA->getAliasee());
+ } else {
+ // Otherwise this must be a function object. We have to scan the body of
+ // the function looking for constants and global values which are used as
+ // operands. Any operands of these types must be processed to ensure that
+ // any globals used will be marked as needed.
+ Function *F = cast<Function>(G);
+ // For all basic blocks...
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ // For all instructions...
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ // For all operands...
+ for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(*U))
+ GlobalIsNeeded(GV);
+ else if (Constant *C = dyn_cast<Constant>(*U))
+ MarkUsedGlobalsAsNeeded(C);
+ }
+}
+
+void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ GlobalIsNeeded(GV);
+ else {
+ // Loop over all of the operands of the constant, adding any globals they
+ // use to the list of needed globals.
+ for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I)
+ MarkUsedGlobalsAsNeeded(cast<Constant>(*I));
+ }
+}
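+
+// For illustration (hypothetical IR): an alive initializer such as
+//
+//   @table = global [1 x i8*] [i8* bitcast (i32 ()* @f to i8*)]
+//
+// reaches @f only through a constant expression; the recursion above walks
+// the bitcast's operands and marks @f as needed.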
+
+// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
+// GlobalValue, looking for the constant pointer ref that may be pointing to it.
+// If found, check to see if the constant pointer ref is safe to destroy, and if
+// so, nuke it. This will reduce the reference count on the global value, which
+// might make it deader.
+//
+bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) {
+ if (GV.use_empty()) return false;
+ GV.removeDeadConstantUsers();
+ return GV.use_empty();
+}
+
+// SafeToDestroyConstant - It is safe to destroy a constant iff it is itself
+// only used by other constants. Note that constants cannot be cyclic, so this
+// test is pretty easy to implement recursively.
+//
+bool GlobalDCE::SafeToDestroyConstant(Constant *C) {
+ for (Value::use_iterator I = C->use_begin(), E = C->use_end(); I != E; ++I)
+ if (Constant *User = dyn_cast<Constant>(*I)) {
+ if (!SafeToDestroyConstant(User)) return false;
+ } else {
+ return false;
+ }
+ return true;
+}
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
new file mode 100644
index 0000000..2c01cc3
--- /dev/null
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -0,0 +1,2485 @@
+//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms simple global variables that never have their address
+// taken. Where obviously safe, it marks read/write globals as constant, deletes
+// variables only stored to, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globalopt"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumMarked , "Number of globals marked constant");
+STATISTIC(NumSRA , "Number of aggregate globals broken into scalars");
+STATISTIC(NumHeapSRA , "Number of heap objects SRA'd");
+STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
+STATISTIC(NumDeleted , "Number of globals deleted");
+STATISTIC(NumFnDeleted , "Number of functions deleted");
+STATISTIC(NumGlobUses , "Number of global uses devirtualized");
+STATISTIC(NumLocalized , "Number of globals localized");
+STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans");
+STATISTIC(NumFastCallFns , "Number of functions converted to fastcc");
+STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
+STATISTIC(NumNestRemoved , "Number of nest attributes removed");
+STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
+STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
+
+namespace {
+ struct VISIBILITY_HIDDEN GlobalOpt : public ModulePass {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ }
+ static char ID; // Pass identification, replacement for typeid
+ GlobalOpt() : ModulePass(&ID) {}
+
+ bool runOnModule(Module &M);
+
+ private:
+ GlobalVariable *FindGlobalCtors(Module &M);
+ bool OptimizeFunctions(Module &M);
+ bool OptimizeGlobalVars(Module &M);
+ bool OptimizeGlobalAliases(Module &M);
+ bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
+ bool ProcessInternalGlobal(GlobalVariable *GV, Module::global_iterator &GVI);
+ };
+}
+
+char GlobalOpt::ID = 0;
+static RegisterPass<GlobalOpt> X("globalopt", "Global Variable Optimizer");
+
+ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
+
+namespace {
+
+/// GlobalStatus - As we analyze each global, keep track of some information
+/// about it. If we find out that the address of the global is taken, none of
+/// this info will be accurate.
+struct VISIBILITY_HIDDEN GlobalStatus {
+ /// isLoaded - True if the global is ever loaded. If the global isn't ever
+ /// loaded it can be deleted.
+ bool isLoaded;
+
+ /// StoredType - Keep track of what stores to the global look like.
+ ///
+ enum StoredType {
+ /// NotStored - There is no store to this global. It can thus be marked
+ /// constant.
+ NotStored,
+
+ /// isInitializerStored - This global is stored to, but the only thing
+ /// stored is the constant it was initialized with. This is only tracked
+ /// for scalar globals.
+ isInitializerStored,
+
+ /// isStoredOnce - This global is stored to, but only its initializer and
+ /// one other value is ever stored to it. If this global isStoredOnce, we
+ /// track the value stored to it in StoredOnceValue below. This is only
+ /// tracked for scalar globals.
+ isStoredOnce,
+
+ /// isStored - This global is stored to by multiple values or something else
+ /// that we cannot track.
+ isStored
+ } StoredType;
+
+ /// StoredOnceValue - If only one value (besides the initializer constant) is
+ /// ever stored to this global, keep track of what value it is.
+ Value *StoredOnceValue;
+
+ /// AccessingFunction/HasMultipleAccessingFunctions - These start out
+ /// null/false. When the first accessing function is noticed, it is recorded.
+ /// When a second different accessing function is noticed,
+ /// HasMultipleAccessingFunctions is set to true.
+ Function *AccessingFunction;
+ bool HasMultipleAccessingFunctions;
+
+ /// HasNonInstructionUser - Set to true if this global has a user that is not
+ /// an instruction (e.g. a constant expr or GV initializer).
+ bool HasNonInstructionUser;
+
+ /// HasPHIUser - Set to true if this global has a user that is a PHI node.
+ bool HasPHIUser;
+
+ GlobalStatus() : isLoaded(false), StoredType(NotStored), StoredOnceValue(0),
+ AccessingFunction(0), HasMultipleAccessingFunctions(false),
+ HasNonInstructionUser(false), HasPHIUser(false) {}
+};
+
+}
+
+/// ConstantIsDead - Return true if the specified constant is (transitively)
+/// dead. The constant may be used by other constants (e.g. constant arrays and
+/// constant exprs) as long as they are dead, but it cannot be used by anything
+/// else.
+static bool ConstantIsDead(Constant *C) {
+ if (isa<GlobalValue>(C)) return false;
+
+ for (Value::use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; ++UI)
+ if (Constant *CU = dyn_cast<Constant>(*UI)) {
+ if (!ConstantIsDead(CU)) return false;
+ } else
+ return false;
+ return true;
+}
+
+
+/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus
+/// structure. If the global has its address taken, return true to indicate we
+/// can't do anything with it.
+///
+static bool AnalyzeGlobal(Value *V, GlobalStatus &GS,
+ SmallPtrSet<PHINode*, 16> &PHIUsers) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(*UI)) {
+ GS.HasNonInstructionUser = true;
+
+ if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
+
+ } else if (Instruction *I = dyn_cast<Instruction>(*UI)) {
+ if (!GS.HasMultipleAccessingFunctions) {
+ Function *F = I->getParent()->getParent();
+ if (GS.AccessingFunction == 0)
+ GS.AccessingFunction = F;
+ else if (GS.AccessingFunction != F)
+ GS.HasMultipleAccessingFunctions = true;
+ }
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ GS.isLoaded = true;
+ if (LI->isVolatile()) return true; // Don't hack on volatile loads.
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Don't allow a store OF the address, only stores TO the address.
+ if (SI->getOperand(0) == V) return true;
+
+ if (SI->isVolatile()) return true; // Don't hack on volatile stores.
+
+ // If this is a direct store to the global (i.e., the global is a scalar
+ // value, not an aggregate), keep more specific information about
+ // stores.
+ if (GS.StoredType != GlobalStatus::isStored) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(SI->getOperand(1))){
+ Value *StoredVal = SI->getOperand(0);
+ if (StoredVal == GV->getInitializer()) {
+ if (GS.StoredType < GlobalStatus::isInitializerStored)
+ GS.StoredType = GlobalStatus::isInitializerStored;
+ } else if (isa<LoadInst>(StoredVal) &&
+ cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+ // G = G
+ if (GS.StoredType < GlobalStatus::isInitializerStored)
+ GS.StoredType = GlobalStatus::isInitializerStored;
+ } else if (GS.StoredType < GlobalStatus::isStoredOnce) {
+ GS.StoredType = GlobalStatus::isStoredOnce;
+ GS.StoredOnceValue = StoredVal;
+ } else if (GS.StoredType == GlobalStatus::isStoredOnce &&
+ GS.StoredOnceValue == StoredVal) {
+ // noop.
+ } else {
+ GS.StoredType = GlobalStatus::isStored;
+ }
+ } else {
+ GS.StoredType = GlobalStatus::isStored;
+ }
+ }
+ } else if (isa<GetElementPtrInst>(I)) {
+ if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+ } else if (isa<SelectInst>(I)) {
+ if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+ } else if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ // PHI nodes we can check just like select or GEP instructions, but we
+ // have to be careful about infinite recursion.
+ if (PHIUsers.insert(PN)) // Not already visited.
+ if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+ GS.HasPHIUser = true;
+ } else if (isa<CmpInst>(I)) {
+ } else if (isa<MemTransferInst>(I)) {
+ if (I->getOperand(1) == V)
+ GS.StoredType = GlobalStatus::isStored;
+ if (I->getOperand(2) == V)
+ GS.isLoaded = true;
+ } else if (isa<MemSetInst>(I)) {
+ assert(I->getOperand(1) == V && "Memset only takes one pointer!");
+ GS.StoredType = GlobalStatus::isStored;
+ } else {
+ return true; // Any other non-load instruction might take the address!
+ }
+ } else if (Constant *C = dyn_cast<Constant>(*UI)) {
+ GS.HasNonInstructionUser = true;
+ // We might have a dead and dangling constant hanging off of here.
+ if (!ConstantIsDead(C))
+ return true;
+ } else {
+ GS.HasNonInstructionUser = true;
+ // Otherwise must be some other user.
+ return true;
+ }
+
+ return false;
+}
+
+static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
+ if (!CI) return 0;
+ unsigned IdxV = CI->getZExtValue();
+
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Agg)) {
+ if (IdxV < CS->getNumOperands()) return CS->getOperand(IdxV);
+ } else if (ConstantArray *CA = dyn_cast<ConstantArray>(Agg)) {
+ if (IdxV < CA->getNumOperands()) return CA->getOperand(IdxV);
+ } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Agg)) {
+ if (IdxV < CP->getNumOperands()) return CP->getOperand(IdxV);
+ } else if (isa<ConstantAggregateZero>(Agg)) {
+ if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+ if (IdxV < STy->getNumElements())
+ return Constant::getNullValue(STy->getElementType(IdxV));
+ } else if (const SequentialType *STy =
+ dyn_cast<SequentialType>(Agg->getType())) {
+ return Constant::getNullValue(STy->getElementType());
+ }
+ } else if (isa<UndefValue>(Agg)) {
+ if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+ if (IdxV < STy->getNumElements())
+ return UndefValue::get(STy->getElementType(IdxV));
+ } else if (const SequentialType *STy =
+ dyn_cast<SequentialType>(Agg->getType())) {
+ return UndefValue::get(STy->getElementType());
+ }
+ }
+ return 0;
+}
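+
+// For illustration: for Agg = { i32 1, i32 2 } and Idx = i32 1 this returns
+// the constant i32 2, while for a zeroinitializer or undef aggregate it
+// manufactures the matching null or undef element rather than indexing.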
+
+
+/// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all
+/// users of the global, cleaning up the obvious ones. This is largely just a
+/// quick scan over the use list to clean up the easy and obvious cruft. This
+/// returns true if it made a change.
+static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) {
+ bool Changed = false;
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) {
+ User *U = *UI++;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ if (Init) {
+ // Replace the load with the initializer.
+ LI->replaceAllUsesWith(Init);
+ LI->eraseFromParent();
+ Changed = true;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // Store must be unreachable or storing Init into the global.
+ SI->eraseFromParent();
+ Changed = true;
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ Constant *SubInit = 0;
+ if (Init)
+ SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+ Changed |= CleanupConstantGlobalUsers(CE, SubInit);
+ } else if (CE->getOpcode() == Instruction::BitCast &&
+ isa<PointerType>(CE->getType())) {
+ // Pointer cast, delete any stores and memsets to the global.
+ Changed |= CleanupConstantGlobalUsers(CE, 0);
+ }
+
+ if (CE->use_empty()) {
+ CE->destroyConstant();
+ Changed = true;
+ }
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ // Do not transform "gepinst (gep constexpr (GV))" here, because forming
+ // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
+ // and will invalidate our notion of what Init is.
+ Constant *SubInit = 0;
+ if (!isa<ConstantExpr>(GEP->getOperand(0))) {
+ ConstantExpr *CE =
+ dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP));
+ if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
+ SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+ }
+ Changed |= CleanupConstantGlobalUsers(GEP, SubInit);
+
+ if (GEP->use_empty()) {
+ GEP->eraseFromParent();
+ Changed = true;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
+ if (MI->getRawDest() == V) {
+ MI->eraseFromParent();
+ Changed = true;
+ }
+
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ // If we have a chain of dead constantexprs or other things dangling from
+ // us, and if they are all dead, nuke them without remorse.
+ if (ConstantIsDead(C)) {
+ C->destroyConstant();
+ // This could have invalidated UI, start over from scratch.
+ CleanupConstantGlobalUsers(V, Init);
+ return true;
+ }
+ }
+ }
+ return Changed;
+}
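+
+// For illustration (hypothetical IR): once @g with initializer i32 42 has
+// been marked constant, a user such as
+//
+//   %x = load i32* @g
+//
+// is replaced by i32 42 and erased, and any store to @g (necessarily
+// unreachable, or storing the initializer back) is simply deleted.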
+
+/// isSafeSROAElementUse - Return true if the specified instruction is a safe
+/// user of a derived expression from a global that we want to SROA.
+static bool isSafeSROAElementUse(Value *V) {
+ // We might have a dead and dangling constant hanging off of here.
+ if (Constant *C = dyn_cast<Constant>(V))
+ return ConstantIsDead(C);
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Loads are ok.
+ if (isa<LoadInst>(I)) return true;
+
+ // Stores *to* the pointer are ok.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getOperand(0) != V;
+
+ // Otherwise, it must be a GEP.
+ GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
+ if (GEPI == 0) return false;
+
+ if (GEPI->getNumOperands() < 3 || !isa<Constant>(GEPI->getOperand(1)) ||
+ !cast<Constant>(GEPI->getOperand(1))->isNullValue())
+ return false;
+
+ for (Value::use_iterator I = GEPI->use_begin(), E = GEPI->use_end();
+ I != E; ++I)
+ if (!isSafeSROAElementUse(*I))
+ return false;
+ return true;
+}
+
+
+/// IsUserOfGlobalSafeForSRA - U is a direct user of the specified global value.
+/// Look at it and its uses and decide whether it is safe to SROA this global.
+///
+static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
+ // The user of the global must be a GEP Inst or a ConstantExpr GEP.
+ if (!isa<GetElementPtrInst>(U) &&
+ (!isa<ConstantExpr>(U) ||
+ cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
+ return false;
+
+ // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
+ // don't like < 3 operand CE's, and we don't like non-constant integer
+ // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
+ // value of C.
+ if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
+ !cast<Constant>(U->getOperand(1))->isNullValue() ||
+ !isa<ConstantInt>(U->getOperand(2)))
+ return false;
+
+ gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
+ ++GEPI; // Skip over the pointer index.
+
+ // If this is a use of an array allocation, do a bit more checking for sanity.
+ if (const ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
+ uint64_t NumElements = AT->getNumElements();
+ ConstantInt *Idx = cast<ConstantInt>(U->getOperand(2));
+
+ // Check to make sure that index falls within the array. If not,
+ // something funny is going on, so we won't do the optimization.
+ //
+ if (Idx->getZExtValue() >= NumElements)
+ return false;
+
+ // We cannot scalar repl this level of the array unless any array
+ // sub-indices are in-range constants. In particular, consider:
+ // A[0][i]. We cannot know that the user isn't doing invalid things like
+ // allowing i to index an out-of-range subscript that accesses A[1].
+ //
+ // Scalar replacing *just* the outer index of the array is probably not
+ // going to be a win anyway, so just give up.
+ for (++GEPI; // Skip array index.
+ GEPI != E && (isa<ArrayType>(*GEPI) || isa<VectorType>(*GEPI));
+ ++GEPI) {
+ uint64_t NumElements;
+ if (const ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
+ NumElements = SubArrayTy->getNumElements();
+ else
+ NumElements = cast<VectorType>(*GEPI)->getNumElements();
+
+ ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
+ if (!IdxVal || IdxVal->getZExtValue() >= NumElements)
+ return false;
+ }
+ }
+
+ for (Value::use_iterator I = U->use_begin(), E = U->use_end(); I != E; ++I)
+ if (!isSafeSROAElementUse(*I))
+ return false;
+ return true;
+}
+
+/// GlobalUsersSafeToSRA - Look at all uses of the global and decide whether it
+/// is safe for us to perform this transformation.
+///
+static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
+ for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+ UI != E; ++UI) {
+ if (!IsUserOfGlobalSafeForSRA(*UI, GV))
+ return false;
+ }
+ return true;
+}
+
+
+/// SRAGlobal - Perform scalar replacement of aggregates on the specified global
+/// variable. This opens the door for other optimizations by exposing the
+/// behavior of the program in a more fine-grained way. We have determined that
+/// this transformation is safe already. We return the first global variable we
+/// insert so that the caller can reprocess it.
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
+ // Make sure this global only has simple uses that we can SRA.
+ if (!GlobalUsersSafeToSRA(GV))
+ return 0;
+
+ assert(GV->hasLocalLinkage() && !GV->isConstant());
+ Constant *Init = GV->getInitializer();
+ const Type *Ty = Init->getType();
+
+ std::vector<GlobalVariable*> NewGlobals;
+ Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
+
+ // Get the alignment of the global, either explicit or target-specific.
+ unsigned StartAlignment = GV->getAlignment();
+ if (StartAlignment == 0)
+ StartAlignment = TD.getABITypeAlignment(GV->getType());
+
+ if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ NewGlobals.reserve(STy->getNumElements());
+ const StructLayout &Layout = *TD.getStructLayout(STy);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Constant *In = getAggregateConstantElement(Init,
+ ConstantInt::get(Type::Int32Ty, i));
+ assert(In && "Couldn't get element of initializer?");
+ GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
+ GlobalVariable::InternalLinkage,
+ In, GV->getName()+"."+utostr(i),
+ (Module *)NULL,
+ GV->isThreadLocal(),
+ GV->getType()->getAddressSpace());
+ Globals.insert(GV, NGV);
+ NewGlobals.push_back(NGV);
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ uint64_t FieldOffset = Layout.getElementOffset(i);
+ unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset);
+ if (NewAlign > TD.getABITypeAlignment(STy->getElementType(i)))
+ NGV->setAlignment(NewAlign);
+ }
+ } else if (const SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
+ unsigned NumElements = 0;
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+ NumElements = ATy->getNumElements();
+ else
+ NumElements = cast<VectorType>(STy)->getNumElements();
+
+ if (NumElements > 16 && GV->hasNUsesOrMore(16))
+ return 0; // It's not worth it.
+ NewGlobals.reserve(NumElements);
+
+ uint64_t EltSize = TD.getTypeAllocSize(STy->getElementType());
+ unsigned EltAlign = TD.getABITypeAlignment(STy->getElementType());
+ for (unsigned i = 0, e = NumElements; i != e; ++i) {
+ Constant *In = getAggregateConstantElement(Init,
+ ConstantInt::get(Type::Int32Ty, i));
+ assert(In && "Couldn't get element of initializer?");
+
+ GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false,
+ GlobalVariable::InternalLinkage,
+ In, GV->getName()+"."+utostr(i),
+ (Module *)NULL,
+ GV->isThreadLocal(),
+ GV->getType()->getAddressSpace());
+ Globals.insert(GV, NGV);
+ NewGlobals.push_back(NGV);
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i);
+ if (NewAlign > EltAlign)
+ NGV->setAlignment(NewAlign);
+ }
+ }
+
+ if (NewGlobals.empty())
+ return 0;
+
+ DOUT << "PERFORMING GLOBAL SRA ON: " << *GV;
+
+ Constant *NullInt = Constant::getNullValue(Type::Int32Ty);
+
+ // Loop over all of the uses of the global, replacing the constantexpr geps,
+ // with smaller constantexpr geps or direct references.
+ while (!GV->use_empty()) {
+ User *GEP = GV->use_back();
+ assert(((isa<ConstantExpr>(GEP) &&
+ cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
+ isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
+
+ // Ignore the first index operand, which must be zero or else the program is
+ // badly broken (undefined behavior). Use the second operand, which is the
+ // structure or array index.
+ unsigned Val = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access.
+
+ Value *NewPtr = NewGlobals[Val];
+
+ // Form a shorter GEP if needed.
+ if (GEP->getNumOperands() > 3) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
+ SmallVector<Constant*, 8> Idxs;
+ Idxs.push_back(NullInt);
+ for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
+ Idxs.push_back(CE->getOperand(i));
+ NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr),
+ &Idxs[0], Idxs.size());
+ } else {
+ GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
+ SmallVector<Value*, 8> Idxs;
+ Idxs.push_back(NullInt);
+ for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
+ Idxs.push_back(GEPI->getOperand(i));
+ NewPtr = GetElementPtrInst::Create(NewPtr, Idxs.begin(), Idxs.end(),
+ GEPI->getName()+"."+utostr(Val), GEPI);
+ }
+ }
+ GEP->replaceAllUsesWith(NewPtr);
+
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
+ GEPI->eraseFromParent();
+ else
+ cast<ConstantExpr>(GEP)->destroyConstant();
+ }
+
+ // Delete the old global, now that it is dead.
+ Globals.erase(GV);
+ ++NumSRA;
+
+ // Loop over the new globals array deleting any globals that are obviously
+ // dead. This can arise due to scalarization of a structure or an array that
+ // has elements that are dead.
+ unsigned FirstGlobal = 0;
+ for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i)
+ if (NewGlobals[i]->use_empty()) {
+ Globals.erase(NewGlobals[i]);
+ if (FirstGlobal == i) ++FirstGlobal;
+ }
+
+ return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : 0;
+}
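+
+// For illustration (hypothetical IR): running SRA over
+//
+//   @g = internal global { i32, i32 } zeroinitializer
+//
+// produces @g.0 and @g.1, and a use such as
+//
+//   getelementptr ({ i32, i32 }* @g, i32 0, i32 1)
+//
+// becomes a direct reference to @g.1.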
+
+/// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified
+/// value will trap if the value is dynamically null. PHIs keeps track of any
+/// phi nodes we've seen to avoid reprocessing them.
+static bool AllUsesOfValueWillTrapIfNull(Value *V,
+ SmallPtrSet<PHINode*, 8> &PHIs) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+ if (isa<LoadInst>(*UI)) {
+ // Will trap.
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+ if (SI->getOperand(0) == V) {
+ //cerr << "NONTRAPPING USE: " << **UI;
+ return false; // Storing the value.
+ }
+ } else if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+ if (CI->getOperand(0) != V) {
+ //cerr << "NONTRAPPING USE: " << **UI;
+ return false; // Not calling the ptr
+ }
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) {
+ if (II->getOperand(0) != V) {
+ //cerr << "NONTRAPPING USE: " << **UI;
+ return false; // Not calling the ptr
+ }
+ } else if (BitCastInst *CI = dyn_cast<BitCastInst>(*UI)) {
+ if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false;
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(*UI)) {
+ if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false;
+ } else if (PHINode *PN = dyn_cast<PHINode>(*UI)) {
+ // If we've already seen this phi node, ignore it, it has already been
+ // checked.
+ if (PHIs.insert(PN))
+ return AllUsesOfValueWillTrapIfNull(PN, PHIs);
+ } else if (isa<ICmpInst>(*UI) &&
+ isa<ConstantPointerNull>(UI->getOperand(1))) {
+ // Ignore setcc X, null
+ } else {
+ //cerr << "NONTRAPPING USE: " << **UI;
+ return false;
+ }
+ return true;
+}
+
+/// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads
+/// from GV will trap if the loaded value is null. Note that this also permits
+/// comparisons of the loaded value against null, as a special case.
+static bool AllUsesOfLoadedValueWillTrapIfNull(GlobalVariable *GV) {
+ for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI!=E; ++UI)
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ SmallPtrSet<PHINode*, 8> PHIs;
+ if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
+ return false;
+ } else if (isa<StoreInst>(*UI)) {
+ // Ignore stores to the global.
+ } else {
+ // We don't know or understand this user, bail out.
+ //cerr << "UNKNOWN USER OF GLOBAL!: " << **UI;
+ return false;
+ }
+
+ return true;
+}
+
+static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
+ bool Changed = false;
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) {
+ Instruction *I = cast<Instruction>(*UI++);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LI->setOperand(0, NewV);
+ Changed = true;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getOperand(1) == V) {
+ SI->setOperand(1, NewV);
+ Changed = true;
+ }
+ } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ if (I->getOperand(0) == V) {
+ // Calling through the pointer! Turn into a direct call, but be careful
+ // that the pointer is not also being passed as an argument.
+ I->setOperand(0, NewV);
+ Changed = true;
+ bool PassedAsArg = false;
+ for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i)
+ if (I->getOperand(i) == V) {
+ PassedAsArg = true;
+ I->setOperand(i, NewV);
+ }
+
+ if (PassedAsArg) {
+ // Being passed as an argument also. Be careful to not invalidate UI!
+ UI = V->use_begin();
+ }
+ }
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Changed |= OptimizeAwayTrappingUsesOfValue(CI,
+ ConstantExpr::getCast(CI->getOpcode(),
+ NewV, CI->getType()));
+ if (CI->use_empty()) {
+ Changed = true;
+ CI->eraseFromParent();
+ }
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ // Should handle GEP here.
+ SmallVector<Constant*, 8> Idxs;
+ Idxs.reserve(GEPI->getNumOperands()-1);
+ for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+ i != e; ++i)
+ if (Constant *C = dyn_cast<Constant>(*i))
+ Idxs.push_back(C);
+ else
+ break;
+ if (Idxs.size() == GEPI->getNumOperands()-1)
+ Changed |= OptimizeAwayTrappingUsesOfValue(GEPI,
+ ConstantExpr::getGetElementPtr(NewV, &Idxs[0],
+ Idxs.size()));
+ if (GEPI->use_empty()) {
+ Changed = true;
+ GEPI->eraseFromParent();
+ }
+ }
+ }
+
+ return Changed;
+}
+
+
+/// OptimizeAwayTrappingUsesOfLoads - The specified global has only one non-null
+/// value stored into it. If there are uses of the loaded value that would trap
+/// if the loaded value is dynamically null, then we know that those uses
+/// cannot be reachable with a null value, and we can optimize away the load.
+static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
+ bool Changed = false;
+
+ // Keep track of whether we are able to remove all the uses of the global
+ // other than the store that defines it.
+ bool AllNonStoreUsesGone = true;
+
+ // Replace all uses of loads with uses of uses of the stored value.
+ for (Value::use_iterator GUI = GV->use_begin(), E = GV->use_end(); GUI != E;){
+ User *GlobalUser = *GUI++;
+ if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
+ Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
+ // If we were able to delete all uses of the loads
+ if (LI->use_empty()) {
+ LI->eraseFromParent();
+ Changed = true;
+ } else {
+ AllNonStoreUsesGone = false;
+ }
+ } else if (isa<StoreInst>(GlobalUser)) {
+ // Ignore the store that stores "LV" to the global.
+ assert(GlobalUser->getOperand(1) == GV &&
+ "Must be storing *to* the global");
+ } else {
+ AllNonStoreUsesGone = false;
+
+ // If we get here we could have other crazy uses that are transitively
+ // loaded.
+ assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
+ isa<ConstantExpr>(GlobalUser)) && "Only expect load and stores!");
+ }
+ }
+
+ if (Changed) {
+ DOUT << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV;
+ ++NumGlobUses;
+ }
+
+ // If we nuked all of the loads, then none of the stores are needed either,
+ // nor is the global.
+ if (AllNonStoreUsesGone) {
+ DOUT << " *** GLOBAL NOW DEAD!\n";
+ CleanupConstantGlobalUsers(GV, 0);
+ if (GV->use_empty()) {
+ GV->eraseFromParent();
+ ++NumDeleted;
+ }
+ Changed = true;
+ }
+ return Changed;
+}
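+
+// For illustration (hypothetical IR): if the only value ever stored to @p is
+// the function @impl, then in
+//
+//   %fp = load void ()** @p
+//   call void %fp()        ; would trap if %fp were null
+//
+// the call can only execute after the store, so it is turned into a direct
+// call to @impl; once all loads are gone, the stores and @p itself go too.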
+
+/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
+/// instructions that are foldable.
+static void ConstantPropUsersOf(Value *V) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; )
+ if (Instruction *I = dyn_cast<Instruction>(*UI++))
+ if (Constant *NewC = ConstantFoldInstruction(I)) {
+ I->replaceAllUsesWith(NewC);
+
+ // Advance UI to the next non-I use to avoid invalidating it!
+ // Instructions could multiply use V.
+ while (UI != E && *UI == I)
+ ++UI;
+ I->eraseFromParent();
+ }
+}
+
+/// OptimizeGlobalAddressOfMalloc - This function takes the specified global
+/// variable, and transforms the program as if it always contained the result of
+/// the specified malloc. Because it is always the result of the specified
+/// malloc, there is no reason to actually DO the malloc. Instead, turn the
+/// malloc into a global, and any loads of GV as uses of the new global.
+static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
+ MallocInst *MI) {
+ DOUT << "PROMOTING MALLOC GLOBAL: " << *GV << " MALLOC = " << *MI;
+ ConstantInt *NElements = cast<ConstantInt>(MI->getArraySize());
+
+ if (NElements->getZExtValue() != 1) {
+ // If we have an array allocation, transform it to a single element
+ // allocation to make the code below simpler.
+ Type *NewTy = ArrayType::get(MI->getAllocatedType(),
+ NElements->getZExtValue());
+ MallocInst *NewMI =
+ new MallocInst(NewTy, Constant::getNullValue(Type::Int32Ty),
+ MI->getAlignment(), MI->getName(), MI);
+ Value* Indices[2];
+ Indices[0] = Indices[1] = Constant::getNullValue(Type::Int32Ty);
+ Value *NewGEP = GetElementPtrInst::Create(NewMI, Indices, Indices + 2,
+ NewMI->getName()+".el0", MI);
+ MI->replaceAllUsesWith(NewGEP);
+ MI->eraseFromParent();
+ MI = NewMI;
+ }
+
+ // Create the new global variable. The contents of the malloc'd memory are
+ // undefined, so initialize with an undef value.
+ Constant *Init = UndefValue::get(MI->getAllocatedType());
+ GlobalVariable *NewGV = new GlobalVariable(MI->getAllocatedType(), false,
+ GlobalValue::InternalLinkage, Init,
+ GV->getName()+".body",
+ (Module *)NULL,
+ GV->isThreadLocal());
+ // FIXME: This new global should have the alignment returned by malloc. Code
+ // could depend on malloc returning large alignment (on the mac, 16 bytes) but
+ // this would only guarantee some lower alignment.
+ GV->getParent()->getGlobalList().insert(GV, NewGV);
+
+ // Anything that used the malloc now uses the global directly.
+ MI->replaceAllUsesWith(NewGV);
+
+ Constant *RepValue = NewGV;
+ if (NewGV->getType() != GV->getType()->getElementType())
+ RepValue = ConstantExpr::getBitCast(RepValue,
+ GV->getType()->getElementType());
+
+ // If there is a comparison against null, we will insert a global bool to
+ // keep track of whether the global was initialized yet or not.
+ GlobalVariable *InitBool =
+ new GlobalVariable(Type::Int1Ty, false, GlobalValue::InternalLinkage,
+ ConstantInt::getFalse(), GV->getName()+".init",
+ (Module *)NULL, GV->isThreadLocal());
+ bool InitBoolUsed = false;
+
+ // Loop over all uses of GV, processing them in turn.
+ std::vector<StoreInst*> Stores;
+ while (!GV->use_empty())
+ if (LoadInst *LI = dyn_cast<LoadInst>(GV->use_back())) {
+ while (!LI->use_empty()) {
+ Use &LoadUse = LI->use_begin().getUse();
+ if (!isa<ICmpInst>(LoadUse.getUser()))
+ LoadUse = RepValue;
+ else {
+ ICmpInst *CI = cast<ICmpInst>(LoadUse.getUser());
+ // Replace the cmp X, 0 with a use of the bool value.
+ Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", CI);
+ InitBoolUsed = true;
+ switch (CI->getPredicate()) {
+ default: assert(0 && "Unknown ICmp Predicate!");
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT:
+ LV = ConstantInt::getFalse(); // X < null -> always false
+ break;
+ case ICmpInst::ICMP_ULE:
+ case ICmpInst::ICMP_SLE:
+ case ICmpInst::ICMP_EQ:
+ LV = BinaryOperator::CreateNot(LV, "notinit", CI);
+ break;
+ case ICmpInst::ICMP_NE:
+ case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_SGE:
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ break; // no change.
+ }
+ CI->replaceAllUsesWith(LV);
+ CI->eraseFromParent();
+ }
+ }
+ LI->eraseFromParent();
+ } else {
+ StoreInst *SI = cast<StoreInst>(GV->use_back());
+ // The global is initialized when the store to it occurs.
+ new StoreInst(ConstantInt::getTrue(), InitBool, SI);
+ SI->eraseFromParent();
+ }
+
+ // If the initialization boolean was used, insert it, otherwise delete it.
+ if (!InitBoolUsed) {
+ while (!InitBool->use_empty()) // Delete initializations
+ cast<Instruction>(InitBool->use_back())->eraseFromParent();
+ delete InitBool;
+ } else
+ GV->getParent()->getGlobalList().insert(GV, InitBool);
+
+
+ // Now the GV is dead, nuke it and the malloc.
+ GV->eraseFromParent();
+ MI->eraseFromParent();
+
+ // To enable further optimizations, loop over all users of NewGV and try to
+ // constant prop them. This will promote GEP instructions with constant
+ // indices into GEP constant-exprs, which will allow global-opt to hack on it.
+ ConstantPropUsersOf(NewGV);
+ if (RepValue != NewGV)
+ ConstantPropUsersOf(RepValue);
+
+ return NewGV;
+}
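+
+// For illustration (hypothetical IR): given
+//
+//   @p = internal global i32* null
+//   ...
+//   %m = malloc i32
+//   store i32* %m, i32** @p
+//
+// the code above creates "@p.body = internal global i32 undef" plus, if @p is
+// ever compared against null, a boolean "@p.init" that the store sets to
+// true; loads of @p are then rewritten to use @p.body directly.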
+
+/// ValueIsOnlyUsedLocallyOrStoredToOneGlobal - Scan the use-list of V checking
+/// to make sure that there are no complex uses of V. We permit simple things
+/// like dereferencing the pointer, but not storing through the address, unless
+/// it is to the specified global.
+static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Instruction *V,
+ GlobalVariable *GV,
+ SmallPtrSet<PHINode*, 8> &PHIs) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
+ Instruction *Inst = dyn_cast<Instruction>(*UI);
+ if (Inst == 0) return false;
+
+ if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
+ continue; // Fine, ignore.
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
+ return false; // Storing the pointer itself... bad.
+ continue; // Otherwise, storing through it, or storing into GV... fine.
+ }
+
+ if (isa<GetElementPtrInst>(Inst)) {
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(Inst)) {
+ // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI
+ // cycles.
+ if (PHIs.insert(PN))
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ return false;
+ }
+ return true;
+}
+
+/// ReplaceUsesOfMallocWithGlobal - The Alloc pointer is stored into GV
+/// somewhere. Transform all uses of the allocation into loads from the
+/// global and uses of the resultant pointer. Further, delete the store into
+/// GV. This assumes that these values pass the
+/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
+static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
+ GlobalVariable *GV) {
+ while (!Alloc->use_empty()) {
+ Instruction *U = cast<Instruction>(*Alloc->use_begin());
+ Instruction *InsertPt = U;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // If this is the store of the allocation into the global, remove it.
+ if (SI->getOperand(1) == GV) {
+ SI->eraseFromParent();
+ continue;
+ }
+ } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ // Insert the load in the corresponding predecessor, not right before the
+ // PHI.
+ InsertPt = PN->getIncomingBlock(Alloc->use_begin())->getTerminator();
+ } else if (isa<BitCastInst>(U)) {
+ // Must be bitcast between the malloc and store to initialize the global.
+ ReplaceUsesOfMallocWithGlobal(U, GV);
+ U->eraseFromParent();
+ continue;
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ // If this is a "GEP bitcast" and the user is a store to the global, then
+ // just process it as a bitcast.
+ if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
+ if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->use_back()))
+ if (SI->getOperand(1) == GV) {
+ // Must be bitcast GEP between the malloc and store to initialize
+ // the global.
+ ReplaceUsesOfMallocWithGlobal(GEPI, GV);
+ GEPI->eraseFromParent();
+ continue;
+ }
+ }
+
+ // Insert a load from the global, and use it instead of the malloc.
+ Value *NL = new LoadInst(GV, GV->getName()+".val", InsertPt);
+ U->replaceUsesOfWith(Alloc, NL);
+ }
+}
+
+/// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi
+/// of a load) are simple enough to perform heap SRA on. This permits GEPs
+/// that index through the array and struct field, icmps against null, and PHIs.
+static bool LoadUsesSimpleEnoughForHeapSRA(Value *V,
+ SmallPtrSet<PHINode*, 32> &LoadUsingPHIs,
+ SmallPtrSet<PHINode*, 32> &LoadUsingPHIsPerLoad) {
+ // We permit three kinds of users of the load: an icmp against the null
+ // pointer, a getelementptr of a specific form, and a PHI whose uses are
+ // themselves simple enough.
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Comparison against null is ok.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(User)) {
+ if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+ return false;
+ continue;
+ }
+
+ // getelementptr is also ok, but only a simple form.
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+ // Must index into the array and into the struct.
+ if (GEPI->getNumOperands() < 3)
+ return false;
+
+ // Otherwise the GEP is ok.
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ if (!LoadUsingPHIsPerLoad.insert(PN))
+ // This means some phi nodes are dependent on each other.
+ // Avoid infinite looping!
+ return false;
+ if (!LoadUsingPHIs.insert(PN))
+ // If we have already analyzed this PHI, then it is safe.
+ continue;
+
+ // Make sure all uses of the PHI are simple enough to transform.
+ if (!LoadUsesSimpleEnoughForHeapSRA(PN,
+ LoadUsingPHIs, LoadUsingPHIsPerLoad))
+ return false;
+
+ continue;
+ }
+
+ // Otherwise we don't know what this is, not ok.
+ return false;
+ }
+
+ return true;
+}
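+
+// For illustration (hypothetical IR): for a value %v loaded from the global,
+// uses such as
+//
+//   %c  = icmp eq %struct.T* %v, null
+//   %f1 = getelementptr %struct.T* %v, i32 0, i32 1
+//
+// are accepted, as are PHIs whose own uses pass the same test; anything else
+// (a store of %v, passing %v to a call, ...) rejects the load for heap SRA.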
+
+
+/// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from
+/// GV are simple enough to perform HeapSRA, return true.
+static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(GlobalVariable *GV,
+ MallocInst *MI) {
+ SmallPtrSet<PHINode*, 32> LoadUsingPHIs;
+ SmallPtrSet<PHINode*, 32> LoadUsingPHIsPerLoad;
+ for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;
+ ++UI)
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
+ LoadUsingPHIsPerLoad))
+ return false;
+ LoadUsingPHIsPerLoad.clear();
+ }
+
+ // If we reach here, we know that all uses of the loads and transitive uses
+ // (through PHI nodes) are simple enough to transform. However, we don't know
+ // that all the inputs to the PHI nodes are in the same equivalence sets.
+ // Check to verify that all operands of the PHIs are either PHIS that can be
+ // transformed, loads from GV, or MI itself.
+ for (SmallPtrSet<PHINode*, 32>::iterator I = LoadUsingPHIs.begin(),
+ E = LoadUsingPHIs.end(); I != E; ++I) {
+ PHINode *PN = *I;
+ for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
+ Value *InVal = PN->getIncomingValue(op);
+
+ // PHI of the stored value itself is ok.
+ if (InVal == MI) continue;
+
+ if (PHINode *InPN = dyn_cast<PHINode>(InVal)) {
+ // One of the PHIs in our set is (optimistically) ok.
+ if (LoadUsingPHIs.count(InPN))
+ continue;
+ return false;
+ }
+
+ // Load from GV is ok.
+ if (LoadInst *LI = dyn_cast<LoadInst>(InVal))
+ if (LI->getOperand(0) == GV)
+ continue;
+
+ // FIXME: Should incoming undef or null pointer values be accepted here?
+
+ // Anything else is rejected.
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
+ DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+ std::vector<Value*> &FieldVals = InsertedScalarizedValues[V];
+
+ if (FieldNo >= FieldVals.size())
+ FieldVals.resize(FieldNo+1);
+
+ // If we already have this value, just reuse the previously scalarized
+ // version.
+ if (Value *FieldVal = FieldVals[FieldNo])
+ return FieldVal;
+
+ // Depending on what instruction this is, we have several cases.
+ Value *Result;
+ if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
+ // This is a scalarized version of the load from the global. Just create
+ // a new Load of the scalarized global.
+ Result = new LoadInst(GetHeapSROAValue(LI->getOperand(0), FieldNo,
+ InsertedScalarizedValues,
+ PHIsToRewrite),
+ LI->getName()+".f" + utostr(FieldNo), LI);
+ } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ // PN's type is pointer to struct. Make a new PHI of pointer to struct
+ // field.
+ const StructType *ST =
+ cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
+
+ Result = PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)),
+ PN->getName()+".f"+utostr(FieldNo), PN);
+ PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
+ } else {
+ assert(0 && "Unknown usable value");
+ Result = 0;
+ }
+
+ return FieldVals[FieldNo] = Result;
+}
+
+/// RewriteHeapSROALoadUser - Given a load instruction and a value derived from
+/// the load, rewrite the derived value to use the HeapSRoA'd load.
+static void RewriteHeapSROALoadUser(Instruction *LoadUser,
+ DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+ // If this is a comparison against null, handle it.
+ if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
+ assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
+ // If we have a setcc of the loaded pointer, we can use a setcc of any
+ // field.
+ Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
+ InsertedScalarizedValues, PHIsToRewrite);
+
+ Value *New = new ICmpInst(SCI->getPredicate(), NPtr,
+ Constant::getNullValue(NPtr->getType()),
+ SCI->getName(), SCI);
+ SCI->replaceAllUsesWith(New);
+ SCI->eraseFromParent();
+ return;
+ }
+
+ // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
+ assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
+ && "Unexpected GEPI!");
+
+ // Load the pointer for this field.
+ unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+ Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
+ InsertedScalarizedValues, PHIsToRewrite);
+
+ // Create the new GEP idx vector.
+ SmallVector<Value*, 8> GEPIdx;
+ GEPIdx.push_back(GEPI->getOperand(1));
+ GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
+
+ Value *NGEPI = GetElementPtrInst::Create(NewPtr,
+ GEPIdx.begin(), GEPIdx.end(),
+ GEPI->getName(), GEPI);
+ GEPI->replaceAllUsesWith(NGEPI);
+ GEPI->eraseFromParent();
+ return;
+ }
+
+ // Recursively transform the users of PHI nodes. This will lazily create the
+ // PHIs that are needed for individual elements. Keep track of what PHIs we
+ // see in InsertedScalarizedValues so that we don't get infinite loops (very
+ // antisocial). If the PHI is already in InsertedScalarizedValues, it has
+ // already been seen first by another load, so its uses have already been
+ // processed.
+ PHINode *PN = cast<PHINode>(LoadUser);
+ bool Inserted;
+ DenseMap<Value*, std::vector<Value*> >::iterator InsertPos;
+ tie(InsertPos, Inserted) =
+ InsertedScalarizedValues.insert(std::make_pair(PN, std::vector<Value*>()));
+ if (!Inserted) return;
+
+ // If this is the first time we've seen this PHI, recursively process all
+ // users.
+ for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) {
+ Instruction *User = cast<Instruction>(*UI++);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+ }
+}
+
+/// RewriteUsesOfLoadForHeapSRoA - We are performing Heap SRoA on a global. Ptr
+/// is a value loaded from the global. Eliminate all uses of Ptr, making them
+/// use FieldGlobals instead. All uses of loaded values satisfy
+/// AllGlobalLoadUsesSimpleEnoughForHeapSRA.
+static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
+ DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+ for (Value::use_iterator UI = Load->use_begin(), E = Load->use_end();
+ UI != E; ) {
+ Instruction *User = cast<Instruction>(*UI++);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+ }
+
+ if (Load->use_empty()) {
+ Load->eraseFromParent();
+ InsertedScalarizedValues.erase(Load);
+ }
+}
+
+/// PerformHeapAllocSRoA - MI is an allocation of an array of structures. Break
+/// it up into multiple allocations of arrays of the fields.
+static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, MallocInst *MI){
+ DOUT << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *MI;
+ const StructType *STy = cast<StructType>(MI->getAllocatedType());
+
+ // There is guaranteed to be at least one use of the malloc (storing
+ // it into GV). If there are other uses, change them to be uses of
+ // the global to simplify later code. This also deletes the store
+ // into GV.
+ ReplaceUsesOfMallocWithGlobal(MI, GV);
+
+ // Okay, at this point, there are no users of the malloc. Insert N
+ // new mallocs at the same place as MI, and N globals.
+ std::vector<Value*> FieldGlobals;
+ std::vector<MallocInst*> FieldMallocs;
+
+ for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
+ const Type *FieldTy = STy->getElementType(FieldNo);
+ const Type *PFieldTy = PointerType::getUnqual(FieldTy);
+
+ GlobalVariable *NGV =
+ new GlobalVariable(PFieldTy, false, GlobalValue::InternalLinkage,
+ Constant::getNullValue(PFieldTy),
+ GV->getName() + ".f" + utostr(FieldNo), GV,
+ GV->isThreadLocal());
+ FieldGlobals.push_back(NGV);
+
+ MallocInst *NMI = new MallocInst(FieldTy, MI->getArraySize(),
+ MI->getName() + ".f" + utostr(FieldNo),MI);
+ FieldMallocs.push_back(NMI);
+ new StoreInst(NMI, NGV, MI);
+ }
+
+ // The tricky aspect of this transformation is handling the case when malloc
+ // fails. In the original code, malloc failing would set the result pointer
+ // of malloc to null. In this case, some mallocs could succeed and others
+ // could fail. As such, we emit code that looks like this:
+ // F0 = malloc(field0)
+ // F1 = malloc(field1)
+ // F2 = malloc(field2)
+ // if (F0 == 0 || F1 == 0 || F2 == 0) {
+ // if (F0) { free(F0); F0 = 0; }
+ // if (F1) { free(F1); F1 = 0; }
+ // if (F2) { free(F2); F2 = 0; }
+ // }
+ Value *RunningOr = 0;
+ for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
+ Value *Cond = new ICmpInst(ICmpInst::ICMP_EQ, FieldMallocs[i],
+ Constant::getNullValue(FieldMallocs[i]->getType()),
+ "isnull", MI);
+ if (!RunningOr)
+ RunningOr = Cond; // First seteq
+ else
+ RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", MI);
+ }
+
+ // Split the basic block at the old malloc.
+ BasicBlock *OrigBB = MI->getParent();
+ BasicBlock *ContBB = OrigBB->splitBasicBlock(MI, "malloc_cont");
+
+ // Create the block to check the first condition. Put all these blocks at the
+ // end of the function as they are unlikely to be executed.
+ BasicBlock *NullPtrBlock = BasicBlock::Create("malloc_ret_null",
+ OrigBB->getParent());
+
+ // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
+ // branch on RunningOr.
+ OrigBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
+
+ // Within the NullPtrBlock, we need to emit a comparison and branch for each
+ // pointer, because some may be null while others are not.
+ for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+ Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock);
+ Value *Cmp = new ICmpInst(ICmpInst::ICMP_NE, GVVal,
+ Constant::getNullValue(GVVal->getType()),
+ "tmp", NullPtrBlock);
+ BasicBlock *FreeBlock = BasicBlock::Create("free_it", OrigBB->getParent());
+ BasicBlock *NextBlock = BasicBlock::Create("next", OrigBB->getParent());
+ BranchInst::Create(FreeBlock, NextBlock, Cmp, NullPtrBlock);
+
+ // Fill in FreeBlock.
+ new FreeInst(GVVal, FreeBlock);
+ new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
+ FreeBlock);
+ BranchInst::Create(NextBlock, FreeBlock);
+
+ NullPtrBlock = NextBlock;
+ }
+
+ BranchInst::Create(ContBB, NullPtrBlock);
+
+ // MI is no longer needed, remove it.
+ MI->eraseFromParent();
+
+  /// InsertedScalarizedValues - As we process loads, if we can't immediately
+  /// update all uses of the load, keep track of the scalarized values
+  /// inserted for a given load.
+ DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;
+ InsertedScalarizedValues[GV] = FieldGlobals;
+
+ std::vector<std::pair<PHINode*, unsigned> > PHIsToRewrite;
+
+ // Okay, the malloc site is completely handled. All of the uses of GV are now
+ // loads, and all uses of those loads are simple. Rewrite them to use loads
+ // of the per-field globals instead.
+ for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
+ continue;
+ }
+
+ // Must be a store of null.
+ StoreInst *SI = cast<StoreInst>(User);
+ assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
+ "Unexpected heap-sra user!");
+
+ // Insert a store of null into each global.
+ for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+ const PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
+ Constant *Null = Constant::getNullValue(PT->getElementType());
+ new StoreInst(Null, FieldGlobals[i], SI);
+ }
+ // Erase the original store.
+ SI->eraseFromParent();
+ }
+
+ // While we have PHIs that are interesting to rewrite, do it.
+ while (!PHIsToRewrite.empty()) {
+ PHINode *PN = PHIsToRewrite.back().first;
+ unsigned FieldNo = PHIsToRewrite.back().second;
+ PHIsToRewrite.pop_back();
+ PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
+ assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
+
+ // Add all the incoming values. This can materialize more phis.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
+ PHIsToRewrite);
+ FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
+ }
+ }
+
+ // Drop all inter-phi links and any loads that made it this far.
+ for (DenseMap<Value*, std::vector<Value*> >::iterator
+ I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I->first))
+ PN->dropAllReferences();
+ else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+ LI->dropAllReferences();
+ }
+
+ // Delete all the phis and loads now that inter-references are dead.
+ for (DenseMap<Value*, std::vector<Value*> >::iterator
+ I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I->first))
+ PN->eraseFromParent();
+ else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+ LI->eraseFromParent();
+ }
+
+ // The old global is now dead, remove it.
+ GV->eraseFromParent();
+
+ ++NumHeapSRA;
+ return cast<GlobalVariable>(FieldGlobals[0]);
+}
+
+/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a
+/// pointer global variable with a single value stored into it, where that
+/// value is a malloc or a cast of a malloc.
+static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
+ MallocInst *MI,
+ Module::global_iterator &GVI,
+ TargetData &TD) {
+ // If this is a malloc of an abstract type, don't touch it.
+ if (!MI->getAllocatedType()->isSized())
+ return false;
+
+ // We can't optimize this global unless all uses of it are *known* to be
+ // of the malloc value, not of the null initializer value (consider a use
+ // that compares the global's value against zero to see if the malloc has
+ // been reached). To do this, we check to see if all uses of the global
+ // would trap if the global were null: this proves that they must all
+ // happen after the malloc.
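+  // E.g. (illustrative): a use like "store i32 0, i32* %loadedptr" would trap
+  // if %loadedptr were null, so it must execute after the malloc is stored;
+  // a use like "icmp eq i32* %loadedptr, null" would not trap, could observe
+  // the null initializer, and therefore blocks the transformation.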
+ if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
+ return false;
+
+  // We can't optimize this if the malloc itself is used in a complex way,
+  // for example, being stored into multiple globals. We allow the malloc to
+  // be stored into the specified global, and to be loaded, setcc'd, and
+  // GEP'd; these are all uses we know how to rewrite to operate on the
+  // global instead.
+ {
+ SmallPtrSet<PHINode*, 8> PHIs;
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(MI, GV, PHIs))
+ return false;
+ }
+
+
+ // If we have a global that is only initialized with a fixed size malloc,
+ // transform the program to use global memory instead of malloc'd memory.
+ // This eliminates dynamic allocation, avoids an indirection accessing the
+ // data, and exposes the resultant global to further GlobalOpt.
+ if (ConstantInt *NElements = dyn_cast<ConstantInt>(MI->getArraySize())) {
+ // Restrict this transformation to only working on small allocations
+ // (2048 bytes currently), as we don't want to introduce a 16M global or
+ // something.
+ if (NElements->getZExtValue()*
+ TD.getTypeAllocSize(MI->getAllocatedType()) < 2048) {
+ GVI = OptimizeGlobalAddressOfMalloc(GV, MI);
+ return true;
+ }
+ }
+
+ // If the allocation is an array of structures, consider transforming this
+ // into multiple malloc'd arrays, one for each field. This is basically
+ // SRoA for malloc'd memory.
+ const Type *AllocTy = MI->getAllocatedType();
+
+ // If this is an allocation of a fixed size array of structs, analyze as a
+ // variable size array. malloc [100 x struct],1 -> malloc struct, 100
+ if (!MI->isArrayAllocation())
+ if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+ AllocTy = AT->getElementType();
+
+ if (const StructType *AllocSTy = dyn_cast<StructType>(AllocTy)) {
+    // If the structure has an unreasonable number of fields, leave it
+    // alone.
+ if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
+ AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, MI)) {
+
+ // If this is a fixed size array, transform the Malloc to be an alloc of
+ // structs. malloc [100 x struct],1 -> malloc struct, 100
+ if (const ArrayType *AT = dyn_cast<ArrayType>(MI->getAllocatedType())) {
+ MallocInst *NewMI =
+ new MallocInst(AllocSTy,
+ ConstantInt::get(Type::Int32Ty, AT->getNumElements()),
+ "", MI);
+ NewMI->takeName(MI);
+ Value *Cast = new BitCastInst(NewMI, MI->getType(), "tmp", MI);
+ MI->replaceAllUsesWith(Cast);
+ MI->eraseFromParent();
+ MI = NewMI;
+ }
+
+ GVI = PerformHeapAllocSRoA(GV, MI);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge
+/// that only one value (besides its initializer) is ever stored to the global.
+static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
+ Module::global_iterator &GVI,
+ TargetData &TD) {
+ // Ignore no-op GEPs and bitcasts.
+ StoredOnceVal = StoredOnceVal->stripPointerCasts();
+
+ // If we are dealing with a pointer global that is initialized to null and
+ // only has one (non-null) value stored into it, then we can optimize any
+ // users of the loaded value (often calls and loads) that would trap if the
+ // value was null.
+ if (isa<PointerType>(GV->getInitializer()->getType()) &&
+ GV->getInitializer()->isNullValue()) {
+ if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
+ if (GV->getInitializer()->getType() != SOVC->getType())
+ SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+
+ // Optimize away any trapping uses of the loaded value.
+ if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
+ return true;
+ } else if (MallocInst *MI = dyn_cast<MallocInst>(StoredOnceVal)) {
+ if (TryToOptimizeStoreOfMallocToGlobal(GV, MI, GVI, TD))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// TryToShrinkGlobalToBoolean - At this point, we have learned that the only
+/// two values ever stored into GV are its initializer and OtherVal. See if we
+/// can shrink the global into a boolean and select between the two values
+/// whenever it is used. This exposes the values to other scalar optimizations.
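+///
+/// For example (hypothetical IR, for illustration only), given a global
+///   @G = internal global i32 0    ; only 0 and 42 are ever stored
+/// we create @G.b = internal global i1 false; stores of 42 become
+/// "store i1 true, i1* @G.b", and each load becomes:
+///   %b = load i1* @G.b
+///   %v = select i1 %b, i32 42, i32 0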
+static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
+ const Type *GVElType = GV->getType()->getElementType();
+
+ // If GVElType is already i1, it is already shrunk. If the type of the GV is
+ // an FP value, pointer or vector, don't do this optimization because a select
+ // between them is very expensive and unlikely to lead to later
+ // simplification. In these cases, we typically end up with "cond ? v1 : v2"
+ // where v1 and v2 both require constant pool loads, a big loss.
+ if (GVElType == Type::Int1Ty || GVElType->isFloatingPoint() ||
+ isa<PointerType>(GVElType) || isa<VectorType>(GVElType))
+ return false;
+
+  // Walk the use list of the global to check that all the uses are loads or
+  // stores. If there is anything else, bail out.
+ for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I)
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return false;
+
+ DOUT << " *** SHRINKING TO BOOL: " << *GV;
+
+ // Create the new global, initializing it to false.
+ GlobalVariable *NewGV = new GlobalVariable(Type::Int1Ty, false,
+ GlobalValue::InternalLinkage, ConstantInt::getFalse(),
+ GV->getName()+".b",
+ (Module *)NULL,
+ GV->isThreadLocal());
+ GV->getParent()->getGlobalList().insert(GV, NewGV);
+
+ Constant *InitVal = GV->getInitializer();
+ assert(InitVal->getType() != Type::Int1Ty && "No reason to shrink to bool!");
+
+ // If initialized to zero and storing one into the global, we can use a cast
+ // instead of a select to synthesize the desired value.
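+  // E.g. (illustrative): for a zero initializer and a stored value of one,
+  // "select i1 %b, i32 1, i32 0" is simply "zext i1 %b to i32".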
+ bool IsOneZero = false;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal))
+ IsOneZero = InitVal->isNullValue() && CI->isOne();
+
+ while (!GV->use_empty()) {
+ Instruction *UI = cast<Instruction>(GV->use_back());
+ if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+ // Change the store into a boolean store.
+ bool StoringOther = SI->getOperand(0) == OtherVal;
+ // Only do this if we weren't storing a loaded value.
+ Value *StoreVal;
+ if (StoringOther || SI->getOperand(0) == InitVal)
+ StoreVal = ConstantInt::get(Type::Int1Ty, StoringOther);
+ else {
+ // Otherwise, we are storing a previously loaded copy. To do this,
+ // change the copy from copying the original value to just copying the
+ // bool.
+ Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
+
+        // If we've already replaced the input, StoredVal will be a cast or
+ // select instruction. If not, it will be a load of the original
+ // global.
+ if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+ assert(LI->getOperand(0) == GV && "Not a copy!");
+ // Insert a new load, to preserve the saved value.
+ StoreVal = new LoadInst(NewGV, LI->getName()+".b", LI);
+ } else {
+ assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
+ "This is not a form that we understand!");
+ StoreVal = StoredVal->getOperand(0);
+ assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
+ }
+ }
+ new StoreInst(StoreVal, NewGV, SI);
+ } else {
+ // Change the load into a load of bool then a select.
+ LoadInst *LI = cast<LoadInst>(UI);
+ LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", LI);
+ Value *NSI;
+ if (IsOneZero)
+ NSI = new ZExtInst(NLI, LI->getType(), "", LI);
+ else
+ NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
+ NSI->takeName(LI);
+ LI->replaceAllUsesWith(NSI);
+ }
+ UI->eraseFromParent();
+ }
+
+ GV->eraseFromParent();
+ return true;
+}
+
+
+/// ProcessInternalGlobal - Analyze the specified global variable and optimize
+/// it if possible. If we make a change, return true.
+bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
+ Module::global_iterator &GVI) {
+ SmallPtrSet<PHINode*, 16> PHIUsers;
+ GlobalStatus GS;
+ GV->removeDeadConstantUsers();
+
+ if (GV->use_empty()) {
+ DOUT << "GLOBAL DEAD: " << *GV;
+ GV->eraseFromParent();
+ ++NumDeleted;
+ return true;
+ }
+
+ if (!AnalyzeGlobal(GV, GS, PHIUsers)) {
+#if 0
+ cerr << "Global: " << *GV;
+ cerr << " isLoaded = " << GS.isLoaded << "\n";
+ cerr << " StoredType = ";
+ switch (GS.StoredType) {
+ case GlobalStatus::NotStored: cerr << "NEVER STORED\n"; break;
+ case GlobalStatus::isInitializerStored: cerr << "INIT STORED\n"; break;
+ case GlobalStatus::isStoredOnce: cerr << "STORED ONCE\n"; break;
+ case GlobalStatus::isStored: cerr << "stored\n"; break;
+ }
+ if (GS.StoredType == GlobalStatus::isStoredOnce && GS.StoredOnceValue)
+ cerr << " StoredOnceValue = " << *GS.StoredOnceValue << "\n";
+ if (GS.AccessingFunction && !GS.HasMultipleAccessingFunctions)
+ cerr << " AccessingFunction = " << GS.AccessingFunction->getName()
+ << "\n";
+ cerr << " HasMultipleAccessingFunctions = "
+ << GS.HasMultipleAccessingFunctions << "\n";
+ cerr << " HasNonInstructionUser = " << GS.HasNonInstructionUser<<"\n";
+ cerr << "\n";
+#endif
+
+    // If this is a first class global, it has only one accessing function,
+    // and that function is main (which we know is not recursive), we can
+    // replace the global with a local alloca in that function.
+ //
+    // NOTE: It doesn't make sense to promote non-single-value types since we
+    // are just replacing static memory with stack memory.
+ if (!GS.HasMultipleAccessingFunctions &&
+ GS.AccessingFunction && !GS.HasNonInstructionUser &&
+ GV->getType()->getElementType()->isSingleValueType() &&
+ GS.AccessingFunction->getName() == "main" &&
+ GS.AccessingFunction->hasExternalLinkage()) {
+ DOUT << "LOCALIZING GLOBAL: " << *GV;
+ Instruction* FirstI = GS.AccessingFunction->getEntryBlock().begin();
+ const Type* ElemTy = GV->getType()->getElementType();
+ // FIXME: Pass Global's alignment when globals have alignment
+ AllocaInst* Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), FirstI);
+ if (!isa<UndefValue>(GV->getInitializer()))
+ new StoreInst(GV->getInitializer(), Alloca, FirstI);
+
+ GV->replaceAllUsesWith(Alloca);
+ GV->eraseFromParent();
+ ++NumLocalized;
+ return true;
+ }
+
+ // If the global is never loaded (but may be stored to), it is dead.
+ // Delete it now.
+ if (!GS.isLoaded) {
+ DOUT << "GLOBAL NEVER LOADED: " << *GV;
+
+ // Delete any stores we can find to the global. We may not be able to
+ // make it completely dead though.
+ bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+ // If the global is dead now, delete it.
+ if (GV->use_empty()) {
+ GV->eraseFromParent();
+ ++NumDeleted;
+ Changed = true;
+ }
+ return Changed;
+
+ } else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
+ DOUT << "MARKING CONSTANT: " << *GV;
+ GV->setConstant(true);
+
+ // Clean up any obviously simplifiable users now.
+ CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+ // If the global is dead now, just nuke it.
+ if (GV->use_empty()) {
+ DOUT << " *** Marking constant allowed us to simplify "
+ << "all users and delete global!\n";
+ GV->eraseFromParent();
+ ++NumDeleted;
+ }
+
+ ++NumMarked;
+ return true;
+ } else if (!GV->getInitializer()->getType()->isSingleValueType()) {
+ if (GlobalVariable *FirstNewGV = SRAGlobal(GV,
+ getAnalysis<TargetData>())) {
+ GVI = FirstNewGV; // Don't skip the newly produced globals!
+ return true;
+ }
+ } else if (GS.StoredType == GlobalStatus::isStoredOnce) {
+ // If the initial value for the global was an undef value, and if only
+ // one other value was stored into it, we can just change the
+ // initializer to be the stored value, then delete all stores to the
+ // global. This allows us to mark it constant.
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+ if (isa<UndefValue>(GV->getInitializer())) {
+ // Change the initial value here.
+ GV->setInitializer(SOVConstant);
+
+ // Clean up any obviously simplifiable users now.
+ CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+ if (GV->use_empty()) {
+ DOUT << " *** Substituting initializer allowed us to "
+ << "simplify all users and delete global!\n";
+ GV->eraseFromParent();
+ ++NumDeleted;
+ } else {
+ GVI = GV;
+ }
+ ++NumSubstitute;
+ return true;
+ }
+
+ // Try to optimize globals based on the knowledge that only one value
+ // (besides its initializer) is ever stored to the global.
+ if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GVI,
+ getAnalysis<TargetData>()))
+ return true;
+
+ // Otherwise, if the global was not a boolean, we can shrink it to be a
+ // boolean.
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+ if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+ ++NumShrunkToBool;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/// OnlyCalledDirectly - Return true if the specified function is only called
+/// directly. In other words, its address is never taken.
+static bool OnlyCalledDirectly(Function *F) {
+ for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+ Instruction *User = dyn_cast<Instruction>(*UI);
+ if (!User) return false;
+ if (!isa<CallInst>(User) && !isa<InvokeInst>(User)) return false;
+
+ // See if the function address is passed as an argument.
+ for (User::op_iterator i = User->op_begin() + 1, e = User->op_end();
+ i != e; ++i)
+ if (*i == F) return false;
+ }
+ return true;
+}
+
+/// ChangeCalleesToFastCall - Walk all of the direct calls of the specified
+/// function, changing them to FastCC.
+static void ChangeCalleesToFastCall(Function *F) {
+ for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+ CallSite User(cast<Instruction>(*UI));
+ User.setCallingConv(CallingConv::Fast);
+ }
+}
+
+static AttrListPtr StripNest(const AttrListPtr &Attrs) {
+ for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
+ if ((Attrs.getSlot(i).Attrs & Attribute::Nest) == 0)
+ continue;
+
+    // There can be at most one Nest attribute, so remove it and return.
+ return Attrs.removeAttr(Attrs.getSlot(i).Index, Attribute::Nest);
+ }
+
+ return Attrs;
+}
+
+static void RemoveNestAttribute(Function *F) {
+ F->setAttributes(StripNest(F->getAttributes()));
+ for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+ CallSite User(cast<Instruction>(*UI));
+ User.setAttributes(StripNest(User.getAttributes()));
+ }
+}
+
+bool GlobalOpt::OptimizeFunctions(Module &M) {
+ bool Changed = false;
+ // Optimize functions.
+ for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
+ Function *F = FI++;
+ // Functions without names cannot be referenced outside this module.
+ if (!F->hasName() && !F->isDeclaration())
+ F->setLinkage(GlobalValue::InternalLinkage);
+ F->removeDeadConstantUsers();
+ if (F->use_empty() && (F->hasLocalLinkage() ||
+ F->hasLinkOnceLinkage())) {
+ M.getFunctionList().erase(F);
+ Changed = true;
+ ++NumFnDeleted;
+ } else if (F->hasLocalLinkage()) {
+ if (F->getCallingConv() == CallingConv::C && !F->isVarArg() &&
+ OnlyCalledDirectly(F)) {
+ // If this function has C calling conventions, is not a varargs
+ // function, and is only called directly, promote it to use the Fast
+ // calling convention.
+ F->setCallingConv(CallingConv::Fast);
+ ChangeCalleesToFastCall(F);
+ ++NumFastCallFns;
+ Changed = true;
+ }
+
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+ OnlyCalledDirectly(F)) {
+ // The function is not used by a trampoline intrinsic, so it is safe
+ // to remove the 'nest' attribute.
+ RemoveNestAttribute(F);
+ ++NumNestRemoved;
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+bool GlobalOpt::OptimizeGlobalVars(Module &M) {
+ bool Changed = false;
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = GVI++;
+ // Global variables without names cannot be referenced outside this module.
+ if (!GV->hasName() && !GV->isDeclaration())
+ GV->setLinkage(GlobalValue::InternalLinkage);
+ if (!GV->isConstant() && GV->hasLocalLinkage() &&
+ GV->hasInitializer())
+ Changed |= ProcessInternalGlobal(GV, GVI);
+ }
+ return Changed;
+}
+
+/// FindGlobalCtors - Find the llvm.global_ctors list, verifying that all
+/// initializers have an init priority of 65535.
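+///
+/// A list we can handle looks like this (illustrative example):
+///   @llvm.global_ctors = appending global [1 x { i32, void ()* }]
+///                        [{ i32, void ()* } { i32 65535, void ()* @init }]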
+GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) {
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ if (I->getName() == "llvm.global_ctors") {
+      // Found it, verify it's an array of { i32, void ()* }.
+ const ArrayType *ATy =dyn_cast<ArrayType>(I->getType()->getElementType());
+ if (!ATy) return 0;
+ const StructType *STy = dyn_cast<StructType>(ATy->getElementType());
+ if (!STy || STy->getNumElements() != 2 ||
+ STy->getElementType(0) != Type::Int32Ty) return 0;
+ const PointerType *PFTy = dyn_cast<PointerType>(STy->getElementType(1));
+ if (!PFTy) return 0;
+ const FunctionType *FTy = dyn_cast<FunctionType>(PFTy->getElementType());
+ if (!FTy || FTy->getReturnType() != Type::VoidTy || FTy->isVarArg() ||
+ FTy->getNumParams() != 0)
+ return 0;
+
+ // Verify that the initializer is simple enough for us to handle.
+ if (!I->hasInitializer()) return 0;
+ ConstantArray *CA = dyn_cast<ConstantArray>(I->getInitializer());
+ if (!CA) return 0;
+ for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i)
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(*i)) {
+ if (isa<ConstantPointerNull>(CS->getOperand(1)))
+ continue;
+
+ // Must have a function or null ptr.
+ if (!isa<Function>(CS->getOperand(1)))
+ return 0;
+
+ // Init priority must be standard.
+ ConstantInt *CI = dyn_cast<ConstantInt>(CS->getOperand(0));
+ if (!CI || CI->getZExtValue() != 65535)
+ return 0;
+ } else {
+ return 0;
+ }
+
+ return I;
+ }
+ return 0;
+}
+
+/// ParseGlobalCtors - Given an llvm.global_ctors list that we can understand,
+/// return a vector of the constructor functions, with a null entry for the
+/// null terminator.
+static std::vector<Function*> ParseGlobalCtors(GlobalVariable *GV) {
+ ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+ std::vector<Function*> Result;
+ Result.reserve(CA->getNumOperands());
+ for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+ ConstantStruct *CS = cast<ConstantStruct>(*i);
+ Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
+ }
+ return Result;
+}
+
+/// InstallGlobalCtors - Given an llvm.global_ctors list, install the
+/// specified array of constructors, returning the new global to use.
+static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
+ const std::vector<Function*> &Ctors) {
+ // If we made a change, reassemble the initializer list.
+ std::vector<Constant*> CSVals;
+ CSVals.push_back(ConstantInt::get(Type::Int32Ty, 65535));
+ CSVals.push_back(0);
+
+ // Create the new init list.
+ std::vector<Constant*> CAList;
+ for (unsigned i = 0, e = Ctors.size(); i != e; ++i) {
+ if (Ctors[i]) {
+ CSVals[1] = Ctors[i];
+ } else {
+ const Type *FTy = FunctionType::get(Type::VoidTy,
+ std::vector<const Type*>(), false);
+ const PointerType *PFTy = PointerType::getUnqual(FTy);
+ CSVals[1] = Constant::getNullValue(PFTy);
+ CSVals[0] = ConstantInt::get(Type::Int32Ty, 2147483647);
+ }
+ CAList.push_back(ConstantStruct::get(CSVals));
+ }
+
+ // Create the array initializer.
+ const Type *StructTy =
+ cast<ArrayType>(GCL->getType()->getElementType())->getElementType();
+ Constant *CA = ConstantArray::get(ArrayType::get(StructTy, CAList.size()),
+ CAList);
+
+ // If we didn't change the number of elements, don't create a new GV.
+ if (CA->getType() == GCL->getInitializer()->getType()) {
+ GCL->setInitializer(CA);
+ return GCL;
+ }
+
+ // Create the new global and insert it next to the existing list.
+ GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(),
+ GCL->getLinkage(), CA, "",
+ (Module *)NULL,
+ GCL->isThreadLocal());
+ GCL->getParent()->getGlobalList().insert(GCL, NGV);
+ NGV->takeName(GCL);
+
+ // Nuke the old list, replacing any uses with the new one.
+ if (!GCL->use_empty()) {
+ Constant *V = NGV;
+ if (V->getType() != GCL->getType())
+ V = ConstantExpr::getBitCast(V, GCL->getType());
+ GCL->replaceAllUsesWith(V);
+ }
+ GCL->eraseFromParent();
+
+ if (Ctors.size())
+ return NGV;
+ else
+ return 0;
+}
+
+
+static Constant *getVal(DenseMap<Value*, Constant*> &ComputedValues,
+ Value *V) {
+ if (Constant *CV = dyn_cast<Constant>(V)) return CV;
+ Constant *R = ComputedValues[V];
+ assert(R && "Reference to an uncomputed value!");
+ return R;
+}
+
+/// isSimpleEnoughPointerToCommit - Return true if this constant is simple
+/// enough for us to understand. In particular, if it is a cast of something,
+/// we punt. We basically just support direct accesses to globals and GEPs of
+/// globals. This should be kept up to date with CommitValueTo.
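+///
+/// Accepted forms are, for example (illustrative):
+///   @G                                              ; a defined global
+///   getelementptr ({ i32, i32 }* @G, i32 0, i32 1)  ; a constant GEP of one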
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (!GV->hasExternalLinkage() && !GV->hasLocalLinkage())
+ return false; // do not allow weak/linkonce/dllimport/dllexport linkage.
+ return !GV->isDeclaration(); // reject external globals.
+ }
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+ // Handle a constantexpr gep.
+ if (CE->getOpcode() == Instruction::GetElementPtr &&
+ isa<GlobalVariable>(CE->getOperand(0))) {
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ if (!GV->hasExternalLinkage() && !GV->hasLocalLinkage())
+ return false; // do not allow weak/linkonce/dllimport/dllexport linkage.
+ return GV->hasInitializer() &&
+ ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+ }
+ return false;
+}
+
+/// EvaluateStoreInto - Evaluate a piece of a constantexpr store into a global
+/// initializer. This returns 'Init' modified to reflect 'Val' stored into it.
+/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into.
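+///
+/// E.g. (hypothetical): storing i32 7 through getelementptr(@G, 0, 1, 0) into
+///   @G = global { i32, [2 x i32] } { i32 1, [2 x i32] [i32 2, i32 3] }
+/// produces the initializer { i32 1, [2 x i32] [i32 7, i32 3] }.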
+static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
+ ConstantExpr *Addr, unsigned OpNo) {
+ // Base case of the recursion.
+ if (OpNo == Addr->getNumOperands()) {
+ assert(Val->getType() == Init->getType() && "Type mismatch!");
+ return Val;
+ }
+
+ if (const StructType *STy = dyn_cast<StructType>(Init->getType())) {
+ std::vector<Constant*> Elts;
+
+ // Break up the constant into its elements.
+ if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Init)) {
+ for (User::op_iterator i = CS->op_begin(), e = CS->op_end(); i != e; ++i)
+ Elts.push_back(cast<Constant>(*i));
+ } else if (isa<ConstantAggregateZero>(Init)) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ Elts.push_back(Constant::getNullValue(STy->getElementType(i)));
+ } else if (isa<UndefValue>(Init)) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ Elts.push_back(UndefValue::get(STy->getElementType(i)));
+ } else {
+ assert(0 && "This code is out of sync with "
+ " ConstantFoldLoadThroughGEPConstantExpr");
+ }
+
+ // Replace the element that we are supposed to.
+ ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
+ unsigned Idx = CU->getZExtValue();
+ assert(Idx < STy->getNumElements() && "Struct index out of range!");
+ Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
+
+ // Return the modified struct.
+ return ConstantStruct::get(&Elts[0], Elts.size(), STy->isPacked());
+ } else {
+ ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
+ const ArrayType *ATy = cast<ArrayType>(Init->getType());
+
+ // Break up the array into elements.
+ std::vector<Constant*> Elts;
+ if (ConstantArray *CA = dyn_cast<ConstantArray>(Init)) {
+ for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i)
+ Elts.push_back(cast<Constant>(*i));
+ } else if (isa<ConstantAggregateZero>(Init)) {
+ Constant *Elt = Constant::getNullValue(ATy->getElementType());
+ Elts.assign(ATy->getNumElements(), Elt);
+ } else if (isa<UndefValue>(Init)) {
+ Constant *Elt = UndefValue::get(ATy->getElementType());
+ Elts.assign(ATy->getNumElements(), Elt);
+ } else {
+ assert(0 && "This code is out of sync with "
+ " ConstantFoldLoadThroughGEPConstantExpr");
+ }
+
+ assert(CI->getZExtValue() < ATy->getNumElements());
+ Elts[CI->getZExtValue()] =
+ EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
+ return ConstantArray::get(ATy, Elts);
+ }
+}
+
+/// CommitValueTo - We have decided that Addr (which satisfies the predicate
+/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
+static void CommitValueTo(Constant *Val, Constant *Addr) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ assert(GV->hasInitializer());
+ GV->setInitializer(Val);
+ return;
+ }
+
+ ConstantExpr *CE = cast<ConstantExpr>(Addr);
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+
+ Constant *Init = GV->getInitializer();
+ Init = EvaluateStoreInto(Init, Val, CE, 2);
+ GV->setInitializer(Init);
+}
+
+/// ComputeLoadResult - Return the value that would be computed by a load from
+/// P after the stores reflected by 'memory' have been performed. If we can't
+/// decide, return null.
+static Constant *ComputeLoadResult(Constant *P,
+ const DenseMap<Constant*, Constant*> &Memory) {
+  // If this memory location has recently been stored to, use the stored
+  // value: it is the most up-to-date.
+ DenseMap<Constant*, Constant*>::const_iterator I = Memory.find(P);
+ if (I != Memory.end()) return I->second;
+
+ // Access it.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+ if (GV->hasInitializer())
+ return GV->getInitializer();
+ return 0;
+ }
+
+ // Handle a constantexpr getelementptr.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
+ if (CE->getOpcode() == Instruction::GetElementPtr &&
+ isa<GlobalVariable>(CE->getOperand(0))) {
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ if (GV->hasInitializer())
+ return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+ }
+
+ return 0; // don't know how to evaluate.
+}
+
+/// EvaluateFunction - Evaluate a call to function F, returning true if
+/// successful, false if we can't evaluate it. ActualArgs contains the actual
+/// argument values passed in for the function's formal parameters.
+static bool EvaluateFunction(Function *F, Constant *&RetVal,
+ const std::vector<Constant*> &ActualArgs,
+ std::vector<Function*> &CallStack,
+ DenseMap<Constant*, Constant*> &MutatedMemory,
+ std::vector<GlobalVariable*> &AllocaTmps) {
+ // Check to see if this function is already executing (recursion). If so,
+ // bail out. TODO: we might want to accept limited recursion.
+ if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
+ return false;
+
+ CallStack.push_back(F);
+
+ /// Values - As we compute SSA register values, we store their contents here.
+ DenseMap<Value*, Constant*> Values;
+
+ // Initialize arguments to the incoming values specified.
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+ ++AI, ++ArgNo)
+ Values[AI] = ActualArgs[ArgNo];
+
+  /// ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
+  /// we can evaluate any one basic block at most once. This set keeps track
+  /// of which blocks we have executed so we can detect loops.
+ SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+
+ // CurInst - The current instruction we're evaluating.
+ BasicBlock::iterator CurInst = F->begin()->begin();
+
+ // This is the main evaluation loop.
+ while (1) {
+ Constant *InstResult = 0;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
+ if (SI->isVolatile()) return false; // no volatile accesses.
+ Constant *Ptr = getVal(Values, SI->getOperand(1));
+ if (!isSimpleEnoughPointerToCommit(Ptr))
+ // If this is too complex for us to commit, reject it.
+ return false;
+ Constant *Val = getVal(Values, SI->getOperand(0));
+ MutatedMemory[Ptr] = Val;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
+ InstResult = ConstantExpr::get(BO->getOpcode(),
+ getVal(Values, BO->getOperand(0)),
+ getVal(Values, BO->getOperand(1)));
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
+ InstResult = ConstantExpr::getCompare(CI->getPredicate(),
+ getVal(Values, CI->getOperand(0)),
+ getVal(Values, CI->getOperand(1)));
+ } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
+ InstResult = ConstantExpr::getCast(CI->getOpcode(),
+ getVal(Values, CI->getOperand(0)),
+ CI->getType());
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
+ InstResult = ConstantExpr::getSelect(getVal(Values, SI->getOperand(0)),
+ getVal(Values, SI->getOperand(1)),
+ getVal(Values, SI->getOperand(2)));
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
+ Constant *P = getVal(Values, GEP->getOperand(0));
+ SmallVector<Constant*, 8> GEPOps;
+ for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+ i != e; ++i)
+ GEPOps.push_back(getVal(Values, *i));
+ InstResult = ConstantExpr::getGetElementPtr(P, &GEPOps[0], GEPOps.size());
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
+ if (LI->isVolatile()) return false; // no volatile accesses.
+ InstResult = ComputeLoadResult(getVal(Values, LI->getOperand(0)),
+ MutatedMemory);
+ if (InstResult == 0) return false; // Could not evaluate load.
+ } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
+ if (AI->isArrayAllocation()) return false; // Cannot handle array allocs.
+ const Type *Ty = AI->getType()->getElementType();
+ AllocaTmps.push_back(new GlobalVariable(Ty, false,
+ GlobalValue::InternalLinkage,
+ UndefValue::get(Ty),
+ AI->getName()));
+ InstResult = AllocaTmps.back();
+ } else if (CallInst *CI = dyn_cast<CallInst>(CurInst)) {
+
+ // Debug info can safely be ignored here.
+ if (isa<DbgInfoIntrinsic>(CI)) {
+ ++CurInst;
+ continue;
+ }
+
+ // Cannot handle inline asm.
+ if (isa<InlineAsm>(CI->getOperand(0))) return false;
+
+ // Resolve function pointers.
+ Function *Callee = dyn_cast<Function>(getVal(Values, CI->getOperand(0)));
+ if (!Callee) return false; // Cannot resolve.
+
+ std::vector<Constant*> Formals;
+ for (User::op_iterator i = CI->op_begin() + 1, e = CI->op_end();
+ i != e; ++i)
+ Formals.push_back(getVal(Values, *i));
+
+ if (Callee->isDeclaration()) {
+ // If this is a function we can constant fold, do it.
+ if (Constant *C = ConstantFoldCall(Callee, &Formals[0],
+ Formals.size())) {
+ InstResult = C;
+ } else {
+ return false;
+ }
+ } else {
+ if (Callee->getFunctionType()->isVarArg())
+ return false;
+
+ Constant *RetVal;
+ // Execute the call, if successful, use the return value.
+ if (!EvaluateFunction(Callee, RetVal, Formals, CallStack,
+ MutatedMemory, AllocaTmps))
+ return false;
+ InstResult = RetVal;
+ }
+ } else if (isa<TerminatorInst>(CurInst)) {
+ BasicBlock *NewBB = 0;
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
+ if (BI->isUnconditional()) {
+ NewBB = BI->getSuccessor(0);
+ } else {
+ ConstantInt *Cond =
+ dyn_cast<ConstantInt>(getVal(Values, BI->getCondition()));
+ if (!Cond) return false; // Cannot determine.
+
+ NewBB = BI->getSuccessor(!Cond->getZExtValue());
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
+ ConstantInt *Val =
+ dyn_cast<ConstantInt>(getVal(Values, SI->getCondition()));
+ if (!Val) return false; // Cannot determine.
+ NewBB = SI->getSuccessor(SI->findCaseValue(Val));
+ } else if (ReturnInst *RI = dyn_cast<ReturnInst>(CurInst)) {
+ if (RI->getNumOperands())
+ RetVal = getVal(Values, RI->getOperand(0));
+
+ CallStack.pop_back(); // return from fn.
+ return true; // We succeeded at evaluating this ctor!
+ } else {
+ // invoke, unwind, unreachable.
+ return false; // Cannot handle this terminator.
+ }
+
+ // Okay, we succeeded in evaluating this control flow. See if we have
+ // executed the new block before. If so, we have a looping function,
+ // which we cannot evaluate in reasonable time.
+ if (!ExecutedBlocks.insert(NewBB))
+ return false; // looped!
+
+ // Okay, we have never been in this block before. Check to see if there
+ // are any PHI nodes. If so, evaluate them with information about where
+ // we came from.
+ BasicBlock *OldBB = CurInst->getParent();
+ CurInst = NewBB->begin();
+ PHINode *PN;
+ for (; (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+ Values[PN] = getVal(Values, PN->getIncomingValueForBlock(OldBB));
+
+ // Do NOT increment CurInst. We know that the terminator had no value.
+ continue;
+ } else {
+ // Did not know how to evaluate this!
+ return false;
+ }
+
+ if (!CurInst->use_empty())
+ Values[CurInst] = InstResult;
+
+ // Advance program counter.
+ ++CurInst;
+ }
+}
+
+/// EvaluateStaticConstructor - Evaluate the static constructor function F, if
+/// we can. Return true on success, false otherwise.
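+///
+/// E.g. (illustrative): a C++ file-scope definition "int X = foo();" where
+/// foo() just returns 42 compiles to a ctor that stores 42 into @X; if we can
+/// evaluate that ctor, @X's initializer becomes i32 42 and the ctor can be
+/// dropped from the llvm.global_ctors list.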
+static bool EvaluateStaticConstructor(Function *F) {
+ /// MutatedMemory - For each store we execute, we update this map. Loads
+ /// check this to get the most up-to-date value. If evaluation is successful,
+  /// this state is committed to the module.
+ DenseMap<Constant*, Constant*> MutatedMemory;
+
+ /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable
+ /// to represent its body. This vector is needed so we can delete the
+ /// temporary globals when we are done.
+ std::vector<GlobalVariable*> AllocaTmps;
+
+ /// CallStack - This is used to detect recursion. In pathological situations
+ /// we could hit exponential behavior, but at least there is nothing
+ /// unbounded.
+ std::vector<Function*> CallStack;
+
+ // Call the function.
+ Constant *RetValDummy;
+ bool EvalSuccess = EvaluateFunction(F, RetValDummy, std::vector<Constant*>(),
+ CallStack, MutatedMemory, AllocaTmps);
+ if (EvalSuccess) {
+ // We succeeded at evaluation: commit the result.
+ DOUT << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
+ << F->getName() << "' to " << MutatedMemory.size()
+ << " stores.\n";
+ for (DenseMap<Constant*, Constant*>::iterator I = MutatedMemory.begin(),
+ E = MutatedMemory.end(); I != E; ++I)
+ CommitValueTo(I->second, I->first);
+ }
+
+ // At this point, we are done interpreting. If we created any 'alloca'
+ // temporaries, release them now.
+ while (!AllocaTmps.empty()) {
+ GlobalVariable *Tmp = AllocaTmps.back();
+ AllocaTmps.pop_back();
+
+ // If there are still users of the alloca, the program is doing something
+ // silly, e.g. storing the address of the alloca somewhere and using it
+    // later. Since this is undefined, we'll just replace such uses with null.
+ if (!Tmp->use_empty())
+ Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType()));
+ delete Tmp;
+ }
+
+ return EvalSuccess;
+}
+
+
+
+/// OptimizeGlobalCtorsList - Simplify and evaluate global ctors if possible.
+/// Return true if anything changed.
+bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
+ std::vector<Function*> Ctors = ParseGlobalCtors(GCL);
+ bool MadeChange = false;
+ if (Ctors.empty()) return false;
+
+ // Loop over global ctors, optimizing them when we can.
+ for (unsigned i = 0; i != Ctors.size(); ++i) {
+ Function *F = Ctors[i];
+    // If we found a null terminator in the middle of the list, prune off the
+    // rest of the list.
+ if (F == 0) {
+ if (i != Ctors.size()-1) {
+ Ctors.resize(i+1);
+ MadeChange = true;
+ }
+ break;
+ }
+
+ // We cannot simplify external ctor functions.
+ if (F->empty()) continue;
+
+ // If we can evaluate the ctor at compile time, do.
+ if (EvaluateStaticConstructor(F)) {
+ Ctors.erase(Ctors.begin()+i);
+ MadeChange = true;
+ --i;
+ ++NumCtorsEvaluated;
+ continue;
+ }
+ }
+
+ if (!MadeChange) return false;
+
+ GCL = InstallGlobalCtors(GCL, Ctors);
+ return true;
+}
+
+bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
+ bool Changed = false;
+
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E;) {
+ Module::alias_iterator J = I++;
+ // Aliases without names cannot be referenced outside this module.
+ if (!J->hasName() && !J->isDeclaration())
+ J->setLinkage(GlobalValue::InternalLinkage);
+ // If the aliasee may change at link time, nothing can be done - bail out.
+ if (J->mayBeOverridden())
+ continue;
+
+ Constant *Aliasee = J->getAliasee();
+ GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
+ Target->removeDeadConstantUsers();
+ bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse();
+
+ // Make all users of the alias use the aliasee instead.
+ if (!J->use_empty()) {
+ J->replaceAllUsesWith(Aliasee);
+ ++NumAliasesResolved;
+ Changed = true;
+ }
+
+ // If the aliasee has internal linkage, give it the name and linkage
+ // of the alias, and delete the alias. This turns:
+ // define internal ... @f(...)
+ // @a = alias ... @f
+ // into:
+ // define ... @a(...)
+ if (!Target->hasLocalLinkage())
+ continue;
+
+ // The transform is only useful if the alias does not have internal linkage.
+ if (J->hasLocalLinkage())
+ continue;
+
+ // Do not perform the transform if multiple aliases potentially target the
+ // aliasee. This check also ensures that it is safe to replace the section
+ // and other attributes of the aliasee with those of the alias.
+ if (!hasOneUse)
+ continue;
+
+ // Give the aliasee the name, linkage and other attributes of the alias.
+ Target->takeName(J);
+ Target->setLinkage(J->getLinkage());
+ Target->GlobalValue::copyAttributesFrom(J);
+
+ // Delete the alias.
+ M.getAliasList().erase(J);
+ ++NumAliasesRemoved;
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool GlobalOpt::runOnModule(Module &M) {
+ bool Changed = false;
+
+  // Try to find the llvm.global_ctors list.
+ GlobalVariable *GlobalCtors = FindGlobalCtors(M);
+
+ bool LocalChange = true;
+ while (LocalChange) {
+ LocalChange = false;
+
+    // Delete trivially dead functions and promote ccc functions to fastcc.
+ LocalChange |= OptimizeFunctions(M);
+
+ // Optimize global_ctors list.
+ if (GlobalCtors)
+ LocalChange |= OptimizeGlobalCtorsList(GlobalCtors);
+
+ // Optimize non-address-taken globals.
+ LocalChange |= OptimizeGlobalVars(M);
+
+ // Resolve aliases, when possible.
+ LocalChange |= OptimizeGlobalAliases(M);
+ Changed |= LocalChange;
+ }
+
+ // TODO: Move all global ctors functions to the end of the module for code
+ // layout.
+
+ return Changed;
+}
diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp
new file mode 100644
index 0000000..2dc8558
--- /dev/null
+++ b/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -0,0 +1,277 @@
+//===-- IPConstantPropagation.cpp - Propagate constants through calls -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an _extremely_ simple interprocedural constant
+// propagation pass. It could certainly be improved in many different ways,
+// like using a worklist. This pass makes arguments dead, but does not remove
+// them. The existing dead argument elimination pass should be run after this
+// to clean up the mess.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ipconstprop"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+STATISTIC(NumArgumentsProped, "Number of args turned into constants");
+STATISTIC(NumReturnValProped, "Number of return values turned into constants");
+
+namespace {
+ /// IPCP - The interprocedural constant propagation pass
+ ///
+ struct VISIBILITY_HIDDEN IPCP : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ IPCP() : ModulePass(&ID) {}
+
+ bool runOnModule(Module &M);
+ private:
+ bool PropagateConstantsIntoArguments(Function &F);
+ bool PropagateConstantReturn(Function &F);
+ };
+}
+
+char IPCP::ID = 0;
+static RegisterPass<IPCP>
+X("ipconstprop", "Interprocedural constant propagation");
+
+ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); }
+
+bool IPCP::runOnModule(Module &M) {
+ bool Changed = false;
+ bool LocalChange = true;
+
+ // FIXME: instead of using smart algorithms, we just iterate until we stop
+ // making changes.
+ while (LocalChange) {
+ LocalChange = false;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!I->isDeclaration()) {
+        // Delete any dead constant users hanging off the function.
+ I->removeDeadConstantUsers();
+ if (I->hasLocalLinkage())
+ LocalChange |= PropagateConstantsIntoArguments(*I);
+ Changed |= PropagateConstantReturn(*I);
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
+/// PropagateConstantsIntoArguments - Look at all uses of the specified
+/// function. If all uses are direct call sites, and all pass a particular
+/// constant in for an argument, propagate that constant in as the argument.
+///
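+/// E.g. (illustrative): if an internal function @f(i32 %x) is only ever
+/// called as "call void @f(i32 42)", every use of %x inside @f is replaced
+/// with the constant i32 42 (deadargelim then removes the dead argument).
+///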
+bool IPCP::PropagateConstantsIntoArguments(Function &F) {
+  if (F.arg_empty() || F.use_empty()) return false; // No args or users? Exit.
+
+ // For each argument, keep track of its constant value and whether it is a
+ // constant or not. The bool is driven to true when found to be non-constant.
+ SmallVector<std::pair<Constant*, bool>, 16> ArgumentConstants;
+ ArgumentConstants.resize(F.arg_size());
+
+ unsigned NumNonconstant = 0;
+ for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) {
+    // If used by something other than a call or invoke instruction, do not
+    // transform.
+ if (!isa<CallInst>(*UI) && !isa<InvokeInst>(*UI))
+ return false;
+
+ CallSite CS = CallSite::get(cast<Instruction>(*UI));
+ if (!CS.isCallee(UI))
+ return false;
+
+ // Check out all of the potentially constant arguments. Note that we don't
+ // inspect varargs here.
+ CallSite::arg_iterator AI = CS.arg_begin();
+ Function::arg_iterator Arg = F.arg_begin();
+ for (unsigned i = 0, e = ArgumentConstants.size(); i != e;
+ ++i, ++AI, ++Arg) {
+
+ // If this argument is known non-constant, ignore it.
+ if (ArgumentConstants[i].second)
+ continue;
+
+ Constant *C = dyn_cast<Constant>(*AI);
+ if (C && ArgumentConstants[i].first == 0) {
+ ArgumentConstants[i].first = C; // First constant seen.
+ } else if (C && ArgumentConstants[i].first == C) {
+ // Still the constant value we think it is.
+ } else if (*AI == &*Arg) {
+ // Ignore recursive calls passing argument down.
+ } else {
+ // Argument became non-constant. If all arguments are non-constant now,
+ // give up on this function.
+ if (++NumNonconstant == ArgumentConstants.size())
+ return false;
+ ArgumentConstants[i].second = true;
+ }
+ }
+ }
+
+ // If we got to this point, there is a constant argument!
+ assert(NumNonconstant != ArgumentConstants.size());
+ bool MadeChange = false;
+ Function::arg_iterator AI = F.arg_begin();
+ for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) {
+ // Do we have a constant argument?
+ if (ArgumentConstants[i].second || AI->use_empty())
+ continue;
+
+ Value *V = ArgumentConstants[i].first;
+ if (V == 0) V = UndefValue::get(AI->getType());
+ AI->replaceAllUsesWith(V);
+ ++NumArgumentsProped;
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+
+// Check to see if this function returns one or more constants. If so, replace
+// all callers that use those return values with the constant value. This will
+// leave in the actual return values and instructions, but deadargelim will
+// clean that up.
+//
+// Additionally if a function always returns one of its arguments directly,
+// callers will be updated to use the value they pass in directly instead of
+// using the return value.
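+//
+// E.g. (illustrative): if every "ret" in @f returns the constant i32 7, each
+// caller's use of the call result is replaced with 7; if @f always returns
+// its first argument, each caller instead uses the value it passed in.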
+bool IPCP::PropagateConstantReturn(Function &F) {
+ if (F.getReturnType() == Type::VoidTy)
+ return false; // No return value.
+
+ // If this function could be overridden later in the link stage, we can't
+ // propagate information about its results into callers.
+ if (F.mayBeOverridden())
+ return false;
+
+ // Check to see if this function returns a constant.
+ SmallVector<Value *,4> RetVals;
+ const StructType *STy = dyn_cast<StructType>(F.getReturnType());
+ if (STy)
+ for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i)
+ RetVals.push_back(UndefValue::get(STy->getElementType(i)));
+ else
+ RetVals.push_back(UndefValue::get(F.getReturnType()));
+
+ unsigned NumNonConstant = 0;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ for (unsigned i = 0, e = RetVals.size(); i != e; ++i) {
+ // Already found conflicting return values?
+ Value *RV = RetVals[i];
+ if (!RV)
+ continue;
+
+ // Find the returned value
+ Value *V;
+ if (!STy)
+ V = RI->getOperand(i);
+ else
+ V = FindInsertedValue(RI->getOperand(0), i);
+
+ if (V) {
+ // Ignore undefs, we can change them into anything
+ if (isa<UndefValue>(V))
+ continue;
+
+ // Try to see if all the rets return the same constant or argument.
+ if (isa<Constant>(V) || isa<Argument>(V)) {
+ if (isa<UndefValue>(RV)) {
+ // No value found yet? Try the current one.
+ RetVals[i] = V;
+ continue;
+ }
+ // Returning the same value? Good.
+ if (RV == V)
+ continue;
+ }
+ }
+ // Different or no known return value? Don't propagate this return
+ // value.
+ RetVals[i] = 0;
+ // All values non constant? Stop looking.
+ if (++NumNonConstant == RetVals.size())
+ return false;
+ }
+ }
+
+ // If we got here, the function returns at least one constant value. Loop
+ // over all users, replacing any uses of the return value with the returned
+ // constant.
+ bool MadeChange = false;
+ for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) {
+ CallSite CS = CallSite::get(*UI);
+ Instruction* Call = CS.getInstruction();
+
+ // Not a call instruction or a call instruction that's not calling F
+ // directly?
+ if (!Call || !CS.isCallee(UI))
+ continue;
+
+ // Call result not used?
+ if (Call->use_empty())
+ continue;
+
+ MadeChange = true;
+
+ if (STy == 0) {
+ Value* New = RetVals[0];
+ if (Argument *A = dyn_cast<Argument>(New))
+ // Was an argument returned? Then find the corresponding argument in
+ // the call instruction and use that.
+ New = CS.getArgument(A->getArgNo());
+ Call->replaceAllUsesWith(New);
+ continue;
+ }
+
+ for (Value::use_iterator I = Call->use_begin(), E = Call->use_end();
+ I != E;) {
+ Instruction *Ins = dyn_cast<Instruction>(*I);
+
+ // Increment now, so we can remove the use
+ ++I;
+
+ // Not an instruction? Ignore
+ if (!Ins)
+ continue;
+
+ // Find the index of the retval to replace with
+ int index = -1;
+ if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins))
+ if (EV->hasIndices())
+ index = *EV->idx_begin();
+
+ // If this use uses a specific return value, and we have a replacement,
+ // replace it.
+ if (index != -1) {
+ Value *New = RetVals[index];
+ if (New) {
+ if (Argument *A = dyn_cast<Argument>(New))
+ // Was an argument returned? Then find the corresponding argument in
+ // the call instruction and use that.
+ New = CS.getArgument(A->getArgNo());
+ Ins->replaceAllUsesWith(New);
+ Ins->eraseFromParent();
+ }
+ }
+ }
+ }
+
+ if (MadeChange) ++NumReturnValProped;
+ return MadeChange;
+}
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
new file mode 100644
index 0000000..43066076
--- /dev/null
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -0,0 +1,75 @@
+//===-- IPO.cpp -----------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the C bindings for libLLVMIPO.a, which provides
+// several transformations over the LLVM intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/IPO.h"
+#include "llvm/PassManager.h"
+#include "llvm/Transforms/IPO.h"
+
+using namespace llvm;
+
+void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createArgumentPromotionPass());
+}
+
+void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createConstantMergePass());
+}
+
+void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadArgEliminationPass());
+}
+
+void LLVMAddDeadTypeEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadTypeEliminationPass());
+}
+
+void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createFunctionAttrsPass());
+}
+
+void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createFunctionInliningPass());
+}
+
+void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGlobalDCEPass());
+}
+
+void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGlobalOptimizerPass());
+}
+
+void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIPConstantPropagationPass());
+}
+
+void LLVMAddLowerSetJmpPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerSetJmpPass());
+}
+
+void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPruneEHPass());
+}
+
+void LLVMAddRaiseAllocationsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createRaiseAllocationsPass());
+}
+
+void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createStripDeadPrototypesPass());
+}
+
+void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createStripSymbolsPass());
+}
diff --git a/lib/Transforms/IPO/IndMemRemoval.cpp b/lib/Transforms/IPO/IndMemRemoval.cpp
new file mode 100644
index 0000000..b55dea2
--- /dev/null
+++ b/lib/Transforms/IPO/IndMemRemoval.cpp
@@ -0,0 +1,89 @@
+//===-- IndMemRemoval.cpp - Remove indirect allocations and frees ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass finds places where the memory allocation functions malloc and
+// free may escape into indirect calls. Some transforms are much easier (or
+// only possible) when malloc and free are never called indirectly, so this
+// pass finds the places where the address of a memory function is taken and
+// constructs a bounce function that calls the real function directly.
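+//
+// For illustration, the bounce function built for free looks roughly like:
+//   define linkonce void @free_llvm_bounce(i8* %p) {
+//   entry:
+//     free i8* %p
+//     ret void
+//   }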
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "indmemrem"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumBounceSites, "Number of sites modified");
+STATISTIC(NumBounce , "Number of bounce functions created");
+
+namespace {
+ class VISIBILITY_HIDDEN IndMemRemPass : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ IndMemRemPass() : ModulePass(&ID) {}
+
+ virtual bool runOnModule(Module &M);
+ };
+} // end anonymous namespace
+
+char IndMemRemPass::ID = 0;
+static RegisterPass<IndMemRemPass>
+X("indmemrem","Indirect Malloc and Free Removal");
+
+bool IndMemRemPass::runOnModule(Module &M) {
+  // In theory, all direct calls of malloc and free should be promoted
+  // to intrinsics. Therefore, this pass finds where the address of free
+  // or malloc is taken and replaces those uses with bounce functions,
+  // ensuring that every malloc and free that can occur happens through
+  // an intrinsic.
+ bool changed = false;
+ if (Function* F = M.getFunction("free")) {
+ if (F->isDeclaration() && F->arg_size() == 1 && !F->use_empty()) {
+ Function* FN = Function::Create(F->getFunctionType(),
+ GlobalValue::LinkOnceAnyLinkage,
+ "free_llvm_bounce", &M);
+ BasicBlock* bb = BasicBlock::Create("entry",FN);
+ Instruction* R = ReturnInst::Create(bb);
+ new FreeInst(FN->arg_begin(), R);
+ ++NumBounce;
+ NumBounceSites += F->getNumUses();
+ F->replaceAllUsesWith(FN);
+ changed = true;
+ }
+ }
+ if (Function* F = M.getFunction("malloc")) {
+ if (F->isDeclaration() && F->arg_size() == 1 && !F->use_empty()) {
+ Function* FN = Function::Create(F->getFunctionType(),
+ GlobalValue::LinkOnceAnyLinkage,
+ "malloc_llvm_bounce", &M);
+ FN->setDoesNotAlias(0);
+ BasicBlock* bb = BasicBlock::Create("entry",FN);
+ Instruction* c = CastInst::CreateIntegerCast(
+ FN->arg_begin(), Type::Int32Ty, false, "c", bb);
+ Instruction* a = new MallocInst(Type::Int8Ty, c, "m", bb);
+ ReturnInst::Create(a, bb);
+ ++NumBounce;
+ NumBounceSites += F->getNumUses();
+ F->replaceAllUsesWith(FN);
+ changed = true;
+ }
+ }
+ return changed;
+}
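+
+// For reference, the malloc bounce function built above has roughly this
+// shape in IR (a sketch; the argument type follows the module's original
+// "malloc" declaration, here assumed to take an i64):
+//
+//   define linkonce i8* @malloc_llvm_bounce(i64 %n) {
+//   entry:
+//     %c = trunc i64 %n to i32
+//     %m = malloc i8, i32 %c
+//     ret i8* %m
+//   }
+//
+// All users of the address of @malloc are then redirected to the bounce
+// function, so the allocation happens through a direct MallocInst that
+// later passes can reason about.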
+
+ModulePass *llvm::createIndMemRemPass() {
+ return new IndMemRemPass();
+}
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
new file mode 100644
index 0000000..5f9ea54
--- /dev/null
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -0,0 +1,75 @@
+//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a custom inliner that handles only functions that
+// are marked as "always inline".
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/CallingConv.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/Utils/InlineCost.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+
+namespace {
+
+  // AlwaysInliner only inlines functions that are marked as "always inline".
+ class VISIBILITY_HIDDEN AlwaysInliner : public Inliner {
+ // Functions that are never inlined
+ SmallPtrSet<const Function*, 16> NeverInline;
+ InlineCostAnalyzer CA;
+ public:
+ // Use extremely low threshold.
+ AlwaysInliner() : Inliner(&ID, -2000000000) {}
+ static char ID; // Pass identification, replacement for typeid
+ InlineCost getInlineCost(CallSite CS) {
+ return CA.getInlineCost(CS, NeverInline);
+ }
+ float getInlineFudgeFactor(CallSite CS) {
+ return CA.getInlineFudgeFactor(CS);
+ }
+ void resetCachedCostInfo(Function *Caller) {
+ return CA.resetCachedCostInfo(Caller);
+ }
+ virtual bool doFinalization(CallGraph &CG) {
+ return removeDeadFunctions(CG, &NeverInline);
+ }
+ virtual bool doInitialization(CallGraph &CG);
+ };
+}
+
+char AlwaysInliner::ID = 0;
+static RegisterPass<AlwaysInliner>
+X("always-inline", "Inliner for always_inline functions");
+
+Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); }
+
+// doInitialization - Initializes the set of functions that have not
+// been annotated with the "always inline" attribute.
+bool AlwaysInliner::doInitialization(CallGraph &CG) {
+ Module &M = CG.getModule();
+
+ for (Module::iterator I = M.begin(), E = M.end();
+ I != E; ++I)
+ if (!I->isDeclaration() && !I->hasFnAttr(Attribute::AlwaysInline))
+ NeverInline.insert(I);
+
+ return false;
+}
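+
+// With the -2000000000 threshold, the "Cost >= Threshold * FudgeFactor"
+// test in Inliner::shouldInline() (see Inliner.cpp) rejects essentially
+// every call site with a numeric cost; only InlineCost "always" results
+// survive, i.e. call sites whose callee looks like this sketch:
+//
+//   define internal i32 @helper(i32 %x) alwaysinline {
+//     ; ...
+//   }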
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
new file mode 100644
index 0000000..e107a00
--- /dev/null
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -0,0 +1,106 @@
+//===- InlineSimple.cpp - Code to perform simple function inlining --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bottom-up inlining of functions into callers.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/CallingConv.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/Utils/InlineCost.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+
+namespace {
+
+ class VISIBILITY_HIDDEN SimpleInliner : public Inliner {
+ // Functions that are never inlined
+ SmallPtrSet<const Function*, 16> NeverInline;
+ InlineCostAnalyzer CA;
+ public:
+ SimpleInliner() : Inliner(&ID) {}
+ SimpleInliner(int Threshold) : Inliner(&ID, Threshold) {}
+ static char ID; // Pass identification, replacement for typeid
+ InlineCost getInlineCost(CallSite CS) {
+ return CA.getInlineCost(CS, NeverInline);
+ }
+ float getInlineFudgeFactor(CallSite CS) {
+ return CA.getInlineFudgeFactor(CS);
+ }
+ void resetCachedCostInfo(Function *Caller) {
+ CA.resetCachedCostInfo(Caller);
+ }
+ virtual bool doInitialization(CallGraph &CG);
+ };
+}
+
+char SimpleInliner::ID = 0;
+static RegisterPass<SimpleInliner>
+X("inline", "Function Integration/Inlining");
+
+Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
+
+Pass *llvm::createFunctionInliningPass(int Threshold) {
+ return new SimpleInliner(Threshold);
+}
+
+// doInitialization - Initializes the set of functions that have been
+// annotated with the noinline attribute.
+bool SimpleInliner::doInitialization(CallGraph &CG) {
+
+ Module &M = CG.getModule();
+
+ for (Module::iterator I = M.begin(), E = M.end();
+ I != E; ++I)
+ if (!I->isDeclaration() && I->hasFnAttr(Attribute::NoInline))
+ NeverInline.insert(I);
+
+ // Get llvm.noinline
+ GlobalVariable *GV = M.getNamedGlobal("llvm.noinline");
+
+ if (GV == 0)
+ return false;
+
+ // Don't crash on invalid code
+ if (!GV->hasInitializer())
+ return false;
+
+ const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+
+ if (InitList == 0)
+ return false;
+
+ // Iterate over each element and add to the NeverInline set
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+
+ // Get Source
+ const Constant *Elt = InitList->getOperand(i);
+
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Elt))
+ if (CE->getOpcode() == Instruction::BitCast)
+ Elt = CE->getOperand(0);
+
+ // Insert into set of functions to never inline
+ if (const Function *F = dyn_cast<Function>(Elt))
+ NeverInline.insert(F);
+ }
+
+ return false;
+}
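+
+// A sketch of the (legacy) llvm.noinline list consumed above, for a module
+// that wants to keep @f away from the inliner:
+//
+//   @llvm.noinline = appending global [1 x i8*]
+//                    [i8* bitcast (i32 (i32)* @f to i8*)]
+//
+// Each element is looked through one BitCast constant expression, and any
+// Function found is added to the NeverInline set.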
+
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
new file mode 100644
index 0000000..b382837
--- /dev/null
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -0,0 +1,278 @@
+//===- Inliner.cpp - Code common to all inliners --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the mechanics of inlining: visiting every call
+// without missing any and keeping the call graph up to date. The decisions
+// about which calls are profitable to inline are made elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
+
+static cl::opt<int>
+InlineLimit("inline-threshold", cl::Hidden, cl::init(200),
+ cl::desc("Control the amount of inlining to perform (default = 200)"));
+
+Inliner::Inliner(void *ID)
+ : CallGraphSCCPass(ID), InlineThreshold(InlineLimit) {}
+
+Inliner::Inliner(void *ID, int Threshold)
+ : CallGraphSCCPass(ID), InlineThreshold(Threshold) {}
+
+/// getAnalysisUsage - For this class, we declare that we require and preserve
+/// the call graph. If the derived class implements this method, it should
+/// always explicitly call the implementation here.
+void Inliner::getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.addRequired<TargetData>();
+ CallGraphSCCPass::getAnalysisUsage(Info);
+}
+
+// InlineCallIfPossible - If it is possible to inline the specified call site,
+// do so and update the CallGraph for this operation.
+bool Inliner::InlineCallIfPossible(CallSite CS, CallGraph &CG,
+ const SmallPtrSet<Function*, 8> &SCCFunctions,
+ const TargetData &TD) {
+ Function *Callee = CS.getCalledFunction();
+ Function *Caller = CS.getCaller();
+
+ if (!InlineFunction(CS, &CG, &TD)) return false;
+
+ // If the inlined function had a higher stack protection level than the
+ // calling function, then bump up the caller's stack protection level.
+ if (Callee->hasFnAttr(Attribute::StackProtectReq))
+ Caller->addFnAttr(Attribute::StackProtectReq);
+ else if (Callee->hasFnAttr(Attribute::StackProtect) &&
+ !Caller->hasFnAttr(Attribute::StackProtectReq))
+ Caller->addFnAttr(Attribute::StackProtect);
+
+ // If we inlined the last possible call site to the function, delete the
+ // function body now.
+ if (Callee->use_empty() && (Callee->hasLocalLinkage() ||
+ Callee->hasAvailableExternallyLinkage()) &&
+ !SCCFunctions.count(Callee)) {
+ DOUT << " -> Deleting dead function: " << Callee->getName() << "\n";
+ CallGraphNode *CalleeNode = CG[Callee];
+
+ // Remove any call graph edges from the callee to its callees.
+ CalleeNode->removeAllCalledFunctions();
+
+ resetCachedCostInfo(CalleeNode->getFunction());
+
+    // Remove the callee's node from the call graph and delete it.
+ delete CG.removeFunctionFromModule(CalleeNode);
+ ++NumDeleted;
+ }
+ return true;
+}
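+
+// The stack-protector propagation above, spelled out as a small table:
+//
+//   callee attribute   caller before   caller after inlining
+//   sspreq             (anything)      sspreq
+//   ssp                (none)          ssp
+//   ssp                sspreq          sspreq (unchanged)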
+
+/// shouldInline - Return true if the inliner should attempt to inline
+/// at the given CallSite.
+bool Inliner::shouldInline(CallSite CS) {
+ InlineCost IC = getInlineCost(CS);
+ float FudgeFactor = getInlineFudgeFactor(CS);
+
+ if (IC.isAlways()) {
+ DOUT << " Inlining: cost=always"
+ << ", Call: " << *CS.getInstruction();
+ return true;
+ }
+
+ if (IC.isNever()) {
+ DOUT << " NOT Inlining: cost=never"
+ << ", Call: " << *CS.getInstruction();
+ return false;
+ }
+
+ int Cost = IC.getValue();
+ int CurrentThreshold = InlineThreshold;
+ Function *Fn = CS.getCaller();
+ if (Fn && !Fn->isDeclaration()
+ && Fn->hasFnAttr(Attribute::OptimizeForSize)
+ && InlineThreshold != 50) {
+ CurrentThreshold = 50;
+ }
+
+ if (Cost >= (int)(CurrentThreshold * FudgeFactor)) {
+ DOUT << " NOT Inlining: cost=" << Cost
+ << ", Call: " << *CS.getInstruction();
+ return false;
+ } else {
+ DOUT << " Inlining: cost=" << Cost
+ << ", Call: " << *CS.getInstruction();
+ return true;
+ }
+}
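+
+// A worked example of the test above, using the default -inline-threshold=200
+// and an assumed fudge factor of 1.0: a call site with Cost = 180 satisfies
+// 180 < 200 * 1.0, so it is inlined. The same call site in a caller marked
+// OptimizeForSize drops the threshold to 50, and 180 >= 50 * 1.0, so the
+// call is left alone.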
+
+bool Inliner::runOnSCC(const std::vector<CallGraphNode*> &SCC) {
+ CallGraph &CG = getAnalysis<CallGraph>();
+ TargetData &TD = getAnalysis<TargetData>();
+
+ SmallPtrSet<Function*, 8> SCCFunctions;
+ DOUT << "Inliner visiting SCC:";
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ Function *F = SCC[i]->getFunction();
+ if (F) SCCFunctions.insert(F);
+ DOUT << " " << (F ? F->getName() : "INDIRECTNODE");
+ }
+
+ // Scan through and identify all call sites ahead of time so that we only
+ // inline call sites in the original functions, not call sites that result
+ // from inlining other functions.
+ std::vector<CallSite> CallSites;
+
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ if (Function *F = SCC[i]->getFunction())
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+ CallSite CS = CallSite::get(I);
+ if (CS.getInstruction() && !isa<DbgInfoIntrinsic>(I) &&
+ (!CS.getCalledFunction() ||
+ !CS.getCalledFunction()->isDeclaration()))
+ CallSites.push_back(CS);
+ }
+
+ DOUT << ": " << CallSites.size() << " call sites.\n";
+
+ // Now that we have all of the call sites, move the ones to functions in the
+ // current SCC to the end of the list.
+ unsigned FirstCallInSCC = CallSites.size();
+ for (unsigned i = 0; i < FirstCallInSCC; ++i)
+ if (Function *F = CallSites[i].getCalledFunction())
+ if (SCCFunctions.count(F))
+ std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
+
+ // Now that we have all of the call sites, loop over them and inline them if
+ // it looks profitable to do so.
+ bool Changed = false;
+ bool LocalChange;
+ do {
+ LocalChange = false;
+    // The enclosing do/while loop exists because inlining can cause indirect
+    // calls to become direct calls, exposing new inlining opportunities.
+ for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi)
+ if (Function *Callee = CallSites[CSi].getCalledFunction()) {
+ // Calls to external functions are never inlinable.
+ if (Callee->isDeclaration()) {
+ if (SCC.size() == 1) {
+ std::swap(CallSites[CSi], CallSites.back());
+ CallSites.pop_back();
+ } else {
+ // Keep the 'in SCC / not in SCC' boundary correct.
+ CallSites.erase(CallSites.begin()+CSi);
+ }
+ --CSi;
+ continue;
+ }
+
+ // If the policy determines that we should inline this function,
+ // try to do so.
+ CallSite CS = CallSites[CSi];
+ if (shouldInline(CS)) {
+ Function *Caller = CS.getCaller();
+ // Attempt to inline the function...
+ if (InlineCallIfPossible(CS, CG, SCCFunctions, TD)) {
+ // Remove any cached cost info for this caller, as inlining the
+ // callee has increased the size of the caller (which may be the
+ // same as the callee).
+ resetCachedCostInfo(Caller);
+
+ // Remove this call site from the list. If possible, use
+ // swap/pop_back for efficiency, but do not use it if doing so would
+ // move a call site to a function in this SCC before the
+ // 'FirstCallInSCC' barrier.
+ if (SCC.size() == 1) {
+ std::swap(CallSites[CSi], CallSites.back());
+ CallSites.pop_back();
+ } else {
+ CallSites.erase(CallSites.begin()+CSi);
+ }
+ --CSi;
+
+ ++NumInlined;
+ Changed = true;
+ LocalChange = true;
+ }
+ }
+ }
+ } while (LocalChange);
+
+ return Changed;
+}
+
+// doFinalization - Remove now-dead linkonce functions at the end of
+// processing to avoid breaking the SCC traversal.
+bool Inliner::doFinalization(CallGraph &CG) {
+ return removeDeadFunctions(CG);
+}
+
+/// removeDeadFunctions - Remove dead functions that are not included in the
+/// DNR (Do Not Remove) list.
+bool Inliner::removeDeadFunctions(CallGraph &CG,
+ SmallPtrSet<const Function *, 16> *DNR) {
+ std::set<CallGraphNode*> FunctionsToRemove;
+
+ // Scan for all of the functions, looking for ones that should now be removed
+ // from the program. Insert the dead ones in the FunctionsToRemove set.
+ for (CallGraph::iterator I = CG.begin(), E = CG.end(); I != E; ++I) {
+ CallGraphNode *CGN = I->second;
+ if (Function *F = CGN ? CGN->getFunction() : 0) {
+ // If the only remaining users of the function are dead constants, remove
+ // them.
+ F->removeDeadConstantUsers();
+
+ if (DNR && DNR->count(F))
+ continue;
+
+ if ((F->hasLinkOnceLinkage() || F->hasLocalLinkage()) &&
+ F->use_empty()) {
+
+ // Remove any call graph edges from the function to its callees.
+ CGN->removeAllCalledFunctions();
+
+ // Remove any edges from the external node to the function's call graph
+ // node. These edges might have been made irrelegant due to
+ // optimization of the program.
+ CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);
+
+        // Record the callee's node; it is removed from the call graph below.
+ FunctionsToRemove.insert(CGN);
+ }
+ }
+ }
+
+ // Now that we know which functions to delete, do so. We didn't want to do
+ // this inline, because that would invalidate our CallGraph::iterator
+ // objects. :(
+ bool Changed = false;
+ for (std::set<CallGraphNode*>::iterator I = FunctionsToRemove.begin(),
+ E = FunctionsToRemove.end(); I != E; ++I) {
+ resetCachedCostInfo((*I)->getFunction());
+ delete CG.removeFunctionFromModule(*I);
+ ++NumDeleted;
+ Changed = true;
+ }
+
+ return Changed;
+}
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
new file mode 100644
index 0000000..5093ae9
--- /dev/null
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -0,0 +1,184 @@
+//===-- Internalize.cpp - Mark functions internal -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for a
+// main function. If a main function is found, all other functions and all
+// global variables with initializers are marked as internal.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "internalize"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumAliases , "Number of aliases internalized");
+STATISTIC(NumFunctions, "Number of functions internalized");
+STATISTIC(NumGlobals , "Number of global vars internalized");
+
+// APIFile - A file which contains a list of symbols that should not be
+// marked internal.
+static cl::opt<std::string>
+APIFile("internalize-public-api-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of symbol names to preserve"));
+
+// APIList - A list of symbols that should not be marked internal.
+static cl::list<std::string>
+APIList("internalize-public-api-list", cl::value_desc("list"),
+ cl::desc("A list of symbol names to preserve"),
+ cl::CommaSeparated);
+
+namespace {
+ class VISIBILITY_HIDDEN InternalizePass : public ModulePass {
+ std::set<std::string> ExternalNames;
+    /// If no API symbols were specified and a main function is defined,
+    /// assume that main is the only API symbol.
+ bool AllButMain;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit InternalizePass(bool AllButMain = true);
+ explicit InternalizePass(const std::vector <const char *>& exportList);
+ void LoadFile(const char *Filename);
+ virtual bool runOnModule(Module &M);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<CallGraph>();
+ }
+ };
+} // end anonymous namespace
+
+char InternalizePass::ID = 0;
+static RegisterPass<InternalizePass>
+X("internalize", "Internalize Global Symbols");
+
+InternalizePass::InternalizePass(bool AllButMain)
+ : ModulePass(&ID), AllButMain(AllButMain){
+ if (!APIFile.empty()) // If a filename is specified, use it.
+ LoadFile(APIFile.c_str());
+ if (!APIList.empty()) // If a list is specified, use it as well.
+ ExternalNames.insert(APIList.begin(), APIList.end());
+}
+
+InternalizePass::InternalizePass(const std::vector<const char *>&exportList)
+ : ModulePass(&ID), AllButMain(false){
+ for(std::vector<const char *>::const_iterator itr = exportList.begin();
+ itr != exportList.end(); itr++) {
+ ExternalNames.insert(*itr);
+ }
+}
+
+void InternalizePass::LoadFile(const char *Filename) {
+ // Load the APIFile...
+ std::ifstream In(Filename);
+ if (!In.good()) {
+ cerr << "WARNING: Internalize couldn't load file '" << Filename
+ << "'! Continuing as if it's empty.\n";
+ return; // Just continue as if the file were empty
+ }
+ while (In) {
+ std::string Symbol;
+ In >> Symbol;
+ if (!Symbol.empty())
+ ExternalNames.insert(Symbol);
+ }
+}
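+
+// The file read above is a plain whitespace-separated list of symbol names,
+// one token per symbol; for example (a sketch):
+//
+//   main
+//   my_public_entry
+//   lib_init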
+
+bool InternalizePass::runOnModule(Module &M) {
+ CallGraph *CG = getAnalysisIfAvailable<CallGraph>();
+ CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
+
+ if (ExternalNames.empty()) {
+    // Return if we're not in 'all but main' mode and have no external API.
+ if (!AllButMain)
+ return false;
+    // If no list or file of symbols was specified, check to see if there is a
+    // "main" symbol defined in the module. If so, use it; otherwise do not
+    // internalize the module, as it is probably a library.
+ //
+ Function *MainFunc = M.getFunction("main");
+ if (MainFunc == 0 || MainFunc->isDeclaration())
+ return false; // No main found, must be a library...
+
+ // Preserve main, internalize all else.
+ ExternalNames.insert(MainFunc->getName());
+ }
+
+ bool Changed = false;
+
+ // Mark all functions not in the api as internal.
+ // FIXME: maybe use private linkage?
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!I->isDeclaration() && // Function must be defined here
+ !I->hasLocalLinkage() && // Can't already have internal linkage
+ !ExternalNames.count(I->getName())) {// Not marked to keep external?
+ I->setLinkage(GlobalValue::InternalLinkage);
+ // Remove a callgraph edge from the external node to this function.
+ if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
+ Changed = true;
+ ++NumFunctions;
+ DOUT << "Internalizing func " << I->getName() << "\n";
+ }
+
+ // Never internalize the llvm.used symbol. It is used to implement
+ // attribute((used)).
+ ExternalNames.insert("llvm.used");
+
+ // Never internalize anchors used by the machine module info, else the info
+ // won't find them. (see MachineModuleInfo.)
+ ExternalNames.insert("llvm.dbg.compile_units");
+ ExternalNames.insert("llvm.dbg.global_variables");
+ ExternalNames.insert("llvm.dbg.subprograms");
+ ExternalNames.insert("llvm.global_ctors");
+ ExternalNames.insert("llvm.global_dtors");
+ ExternalNames.insert("llvm.noinline");
+ ExternalNames.insert("llvm.global.annotations");
+
+ // Mark all global variables with initializers that are not in the api as
+ // internal as well.
+ // FIXME: maybe use private linkage?
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ if (!I->isDeclaration() && !I->hasLocalLinkage() &&
+ !ExternalNames.count(I->getName())) {
+ I->setLinkage(GlobalValue::InternalLinkage);
+ Changed = true;
+ ++NumGlobals;
+ DOUT << "Internalized gvar " << I->getName() << "\n";
+ }
+
+ // Mark all aliases that are not in the api as internal as well.
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E; ++I)
+ if (!I->isDeclaration() && !I->hasInternalLinkage() &&
+ !ExternalNames.count(I->getName())) {
+ I->setLinkage(GlobalValue::InternalLinkage);
+ Changed = true;
+ ++NumAliases;
+ DOUT << "Internalized alias " << I->getName() << "\n";
+ }
+
+ return Changed;
+}
+
+ModulePass *llvm::createInternalizePass(bool AllButMain) {
+ return new InternalizePass(AllButMain);
+}
+
+ModulePass *llvm::createInternalizePass(const std::vector <const char *> &el) {
+ return new InternalizePass(el);
+}
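+
+// Typical invocations through opt — a sketch using the options declared
+// above:
+//
+//   opt -internalize -internalize-public-api-list=main,foo in.bc -o out.bc
+//   opt -internalize -internalize-public-api-file=syms.txt in.bc -o out.bc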
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
new file mode 100644
index 0000000..0c65443
--- /dev/null
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -0,0 +1,261 @@
+//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass wrapper around the ExtractLoop() scalar transformation to extract each
+// top-level loop into its own new function. If the loop is the ONLY loop in a
+// given function, it is not touched. This pass is most useful for debugging
+// via bugpoint.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-extract"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/FunctionUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumExtracted, "Number of loops extracted");
+
+namespace {
+ // FIXME: This is not a function pass, but the PassManager doesn't allow
+ // Module passes to require FunctionPasses, so we can't get loop info if we're
+ // not a function pass.
+ struct VISIBILITY_HIDDEN LoopExtractor : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ unsigned NumLoops;
+
+ explicit LoopExtractor(unsigned numLoops = ~0)
+ : FunctionPass(&ID), NumLoops(numLoops) {}
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(BreakCriticalEdgesID);
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<LoopInfo>();
+ }
+ };
+}
+
+char LoopExtractor::ID = 0;
+static RegisterPass<LoopExtractor>
+X("loop-extract", "Extract loops into new functions");
+
+namespace {
+ /// SingleLoopExtractor - For bugpoint.
+ struct SingleLoopExtractor : public LoopExtractor {
+ static char ID; // Pass identification, replacement for typeid
+ SingleLoopExtractor() : LoopExtractor(1) {}
+ };
+} // End anonymous namespace
+
+char SingleLoopExtractor::ID = 0;
+static RegisterPass<SingleLoopExtractor>
+Y("loop-extract-single", "Extract at most one loop into a new function");
+
+// createLoopExtractorPass - This pass extracts all natural loops from the
+// program into a function if it can.
+//
+FunctionPass *llvm::createLoopExtractorPass() { return new LoopExtractor(); }
+
+bool LoopExtractor::runOnFunction(Function &F) {
+ LoopInfo &LI = getAnalysis<LoopInfo>();
+
+ // If this function has no loops, there is nothing to do.
+ if (LI.empty())
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTree>();
+
+ // If there is more than one top-level loop in this function, extract all of
+ // the loops.
+ bool Changed = false;
+ if (LI.end()-LI.begin() > 1) {
+ for (LoopInfo::iterator i = LI.begin(), e = LI.end(); i != e; ++i) {
+ if (NumLoops == 0) return Changed;
+ --NumLoops;
+ Changed |= ExtractLoop(DT, *i) != 0;
+ ++NumExtracted;
+ }
+ } else {
+ // Otherwise there is exactly one top-level loop. If this function is more
+ // than a minimal wrapper around the loop, extract the loop.
+ Loop *TLL = *LI.begin();
+ bool ShouldExtractLoop = false;
+
+ // Extract the loop if the entry block doesn't branch to the loop header.
+ TerminatorInst *EntryTI = F.getEntryBlock().getTerminator();
+ if (!isa<BranchInst>(EntryTI) ||
+ !cast<BranchInst>(EntryTI)->isUnconditional() ||
+ EntryTI->getSuccessor(0) != TLL->getHeader())
+ ShouldExtractLoop = true;
+ else {
+ // Check to see if any exits from the loop are more than just return
+ // blocks.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ TLL->getExitBlocks(ExitBlocks);
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (!isa<ReturnInst>(ExitBlocks[i]->getTerminator())) {
+ ShouldExtractLoop = true;
+ break;
+ }
+ }
+
+ if (ShouldExtractLoop) {
+ if (NumLoops == 0) return Changed;
+ --NumLoops;
+ Changed |= ExtractLoop(DT, TLL) != 0;
+ ++NumExtracted;
+ } else {
+ // Okay, this function is a minimal container around the specified loop.
+ // If we extract the loop, we will continue to just keep extracting it
+ // infinitely... so don't extract it. However, if the loop contains any
+ // subloops, extract them.
+ for (Loop::iterator i = TLL->begin(), e = TLL->end(); i != e; ++i) {
+ if (NumLoops == 0) return Changed;
+ --NumLoops;
+ Changed |= ExtractLoop(DT, *i) != 0;
+ ++NumExtracted;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+// createSingleLoopExtractorPass - This pass extracts one natural loop from the
+// program into a function if it can. This is used by bugpoint.
+//
+FunctionPass *llvm::createSingleLoopExtractorPass() {
+ return new SingleLoopExtractor();
+}
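+
+// Both variants are reachable through opt — a sketch:
+//
+//   opt -loop-extract in.bc -o out.bc         # extract every top-level loop
+//   opt -loop-extract-single in.bc -o out.bc  # extract at most one loop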
+
+
+// BlockFile - A file which contains a list of blocks that should not be
+// extracted.
+static cl::opt<std::string>
+BlockFile("extract-blocks-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of basic blocks to not extract"),
+ cl::Hidden);
+
+namespace {
+ /// BlockExtractorPass - This pass is used by bugpoint to extract all blocks
+ /// from the module into their own functions except for those specified by the
+ /// BlocksToNotExtract list.
+ class BlockExtractorPass : public ModulePass {
+ void LoadFile(const char *Filename);
+
+ std::vector<BasicBlock*> BlocksToNotExtract;
+ std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit BlockExtractorPass(const std::vector<BasicBlock*> &B)
+ : ModulePass(&ID), BlocksToNotExtract(B) {
+ if (!BlockFile.empty())
+ LoadFile(BlockFile.c_str());
+ }
+ BlockExtractorPass() : ModulePass(&ID) {}
+
+ bool runOnModule(Module &M);
+ };
+}
+
+char BlockExtractorPass::ID = 0;
+static RegisterPass<BlockExtractorPass>
+XX("extract-blocks", "Extract Basic Blocks From Module (for bugpoint use)");
+
+// createBlockExtractorPass - This pass extracts all blocks (except those
+// specified in the argument list) from the functions in the module.
+//
+ModulePass *llvm::createBlockExtractorPass(const std::vector<BasicBlock*> &BTNE)
+{
+ return new BlockExtractorPass(BTNE);
+}
+
+void BlockExtractorPass::LoadFile(const char *Filename) {
+ // Load the BlockFile...
+ std::ifstream In(Filename);
+ if (!In.good()) {
+ cerr << "WARNING: BlockExtractor couldn't load file '" << Filename
+ << "'!\n";
+ return;
+ }
+ while (In) {
+ std::string FunctionName, BlockName;
+ In >> FunctionName;
+ In >> BlockName;
+ if (!BlockName.empty())
+ BlocksToNotExtractByName.push_back(
+ std::make_pair(FunctionName, BlockName));
+ }
+}
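+
+// The block file read above holds whitespace-separated pairs of names,
+// function first and block second; for example (a sketch):
+//
+//   main entry
+//   foo bb3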
+
+bool BlockExtractorPass::runOnModule(Module &M) {
+ std::set<BasicBlock*> TranslatedBlocksToNotExtract;
+ for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
+ BasicBlock *BB = BlocksToNotExtract[i];
+ Function *F = BB->getParent();
+
+ // Map the corresponding function in this module.
+ Function *MF = M.getFunction(F->getName());
+ assert(MF->getFunctionType() == F->getFunctionType() && "Wrong function?");
+
+ // Figure out which index the basic block is in its function.
+ Function::iterator BBI = MF->begin();
+ std::advance(BBI, std::distance(F->begin(), Function::iterator(BB)));
+ TranslatedBlocksToNotExtract.insert(BBI);
+ }
+
+ while (!BlocksToNotExtractByName.empty()) {
+    // There's no way to find BBs by name without looking at every BB inside
+    // every Function. Fortunately, this list is always empty except when
+    // used by bugpoint, in which case correctness is more important than
+    // performance.
+
+ std::string &FuncName = BlocksToNotExtractByName.back().first;
+ std::string &BlockName = BlocksToNotExtractByName.back().second;
+
+ for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
+ Function &F = *FI;
+ if (F.getName() != FuncName) continue;
+
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ BasicBlock &BB = *BI;
+ if (BB.getName() != BlockName) continue;
+
+ TranslatedBlocksToNotExtract.insert(BI);
+ }
+ }
+
+ BlocksToNotExtractByName.pop_back();
+ }
+
+ // Now that we know which blocks to not extract, figure out which ones we WANT
+ // to extract.
+ std::vector<BasicBlock*> BlocksToExtract;
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ if (!TranslatedBlocksToNotExtract.count(BB))
+ BlocksToExtract.push_back(BB);
+
+ for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i)
+ ExtractBasicBlock(BlocksToExtract[i]);
+
+ return !BlocksToExtract.empty();
+}
diff --git a/lib/Transforms/IPO/LowerSetJmp.cpp b/lib/Transforms/IPO/LowerSetJmp.cpp
new file mode 100644
index 0000000..dfc040b
--- /dev/null
+++ b/lib/Transforms/IPO/LowerSetJmp.cpp
@@ -0,0 +1,536 @@
+//===- LowerSetJmp.cpp - Code pertaining to lowering set/long jumps -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering of setjmp and longjmp to use the
+// LLVM invoke and unwind instructions as necessary.
+//
+// Lowering of longjmp is fairly trivial. We replace the call with a
+// call to the LLVM library function "__llvm_sjljeh_throw_longjmp()".
+// This unwinds the stack for us, calling all of the destructors for
+// objects allocated on the stack.
+//
+// At a setjmp call, the basic block is split and the setjmp removed.
+// The calls in a function that contains a setjmp are converted to invokes
+// whose except part checks whether a longjmp exception was thrown and,
+// if so, whether it is handled in this function. If it is, the handler
+// receives the value returned by the longjmp and control goes to where the
+// basic block was split. Invoke instructions are handled similarly, with
+// the original except block being executed if the exception isn't a
+// longjmp handled by this function.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FIXME: This pass doesn't deal with PHI nodes just yet. That is,
+// we expect this to occur before SSAification is done. This would seem
+// to make sense, but in general, it might be a good idea to make this
+// pass invokable via the "opt" command at will.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowersetjmp"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(LongJmpsTransformed, "Number of longjmps transformed");
+STATISTIC(SetJmpsTransformed , "Number of setjmps transformed");
+STATISTIC(CallsTransformed , "Number of calls invokified");
+STATISTIC(InvokesTransformed , "Number of invokes modified");
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ // LowerSetJmp pass implementation.
+ class VISIBILITY_HIDDEN LowerSetJmp : public ModulePass,
+ public InstVisitor<LowerSetJmp> {
+ // LLVM library functions...
+ Constant *InitSJMap; // __llvm_sjljeh_init_setjmpmap
+ Constant *DestroySJMap; // __llvm_sjljeh_destroy_setjmpmap
+ Constant *AddSJToMap; // __llvm_sjljeh_add_setjmp_to_map
+ Constant *ThrowLongJmp; // __llvm_sjljeh_throw_longjmp
+ Constant *TryCatchLJ; // __llvm_sjljeh_try_catching_longjmp_exception
+ Constant *IsLJException; // __llvm_sjljeh_is_longjmp_exception
+ Constant *GetLJValue; // __llvm_sjljeh_get_longjmp_value
+
+ typedef std::pair<SwitchInst*, CallInst*> SwitchValuePair;
+
+ // Keep track of those basic blocks reachable via a depth-first search of
+ // the CFG from a setjmp call. We only need to transform those "call" and
+ // "invoke" instructions that are reachable from the setjmp call site.
+ std::set<BasicBlock*> DFSBlocks;
+
+ // The setjmp map is going to hold information about which setjmps
+ // were called (each setjmp gets its own number) and with which
+ // buffer it was called.
+ std::map<Function*, AllocaInst*> SJMap;
+
+ // The rethrow basic block map holds the basic block to branch to if
+ // the exception isn't handled in the current function and needs to
+ // be rethrown.
+ std::map<const Function*, BasicBlock*> RethrowBBMap;
+
+ // The preliminary basic block map holds a basic block that grabs the
+ // exception and determines if it's handled by the current function.
+ std::map<const Function*, BasicBlock*> PrelimBBMap;
+
+ // The switch/value map holds a switch inst/call inst pair. The
+ // switch inst controls which handler (if any) gets called and the
+ // value is the value returned to that handler by the call to
+ // __llvm_sjljeh_get_longjmp_value.
+ std::map<const Function*, SwitchValuePair> SwitchValMap;
+
+ // A map of which setjmps we've seen so far in a function.
+ std::map<const Function*, unsigned> SetJmpIDMap;
+
+ AllocaInst* GetSetJmpMap(Function* Func);
+ BasicBlock* GetRethrowBB(Function* Func);
+ SwitchValuePair GetSJSwitch(Function* Func, BasicBlock* Rethrow);
+
+ void TransformLongJmpCall(CallInst* Inst);
+ void TransformSetJmpCall(CallInst* Inst);
+
+ bool IsTransformableFunction(const std::string& Name);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ LowerSetJmp() : ModulePass(&ID) {}
+
+ void visitCallInst(CallInst& CI);
+ void visitInvokeInst(InvokeInst& II);
+ void visitReturnInst(ReturnInst& RI);
+ void visitUnwindInst(UnwindInst& UI);
+
+ bool runOnModule(Module& M);
+ bool doInitialization(Module& M);
+ };
+} // end anonymous namespace
+
+char LowerSetJmp::ID = 0;
+static RegisterPass<LowerSetJmp> X("lowersetjmp", "Lower Set Jump");
+
+// run - Run the transformation on the program. We grab the function
+// prototypes for longjmp and setjmp. If they are used in the program,
+// then we can go directly to the places they're at and transform them.
+bool LowerSetJmp::runOnModule(Module& M) {
+ bool Changed = false;
+
+ // These are what the functions are called.
+ Function* SetJmp = M.getFunction("llvm.setjmp");
+ Function* LongJmp = M.getFunction("llvm.longjmp");
+
+ // This program doesn't have longjmp and setjmp calls.
+ if ((!LongJmp || LongJmp->use_empty()) &&
+ (!SetJmp || SetJmp->use_empty())) return false;
+
+ // Initialize some values and functions we'll need to transform the
+ // setjmp/longjmp functions.
+ doInitialization(M);
+
+ if (SetJmp) {
+ for (Value::use_iterator B = SetJmp->use_begin(), E = SetJmp->use_end();
+ B != E; ++B) {
+ BasicBlock* BB = cast<Instruction>(*B)->getParent();
+ for (df_ext_iterator<BasicBlock*> I = df_ext_begin(BB, DFSBlocks),
+ E = df_ext_end(BB, DFSBlocks); I != E; ++I)
+ /* empty */;
+ }
+
+ while (!SetJmp->use_empty()) {
+ assert(isa<CallInst>(SetJmp->use_back()) &&
+ "User of setjmp intrinsic not a call?");
+ TransformSetJmpCall(cast<CallInst>(SetJmp->use_back()));
+ Changed = true;
+ }
+ }
+
+ if (LongJmp)
+ while (!LongJmp->use_empty()) {
+ assert(isa<CallInst>(LongJmp->use_back()) &&
+ "User of longjmp intrinsic not a call?");
+ TransformLongJmpCall(cast<CallInst>(LongJmp->use_back()));
+ Changed = true;
+ }
+
+ // Now go through the affected functions and convert calls and invokes
+ // to new invokes...
+ for (std::map<Function*, AllocaInst*>::iterator
+ B = SJMap.begin(), E = SJMap.end(); B != E; ++B) {
+ Function* F = B->first;
+ for (Function::iterator BB = F->begin(), BE = F->end(); BB != BE; ++BB)
+ for (BasicBlock::iterator IB = BB->begin(), IE = BB->end(); IB != IE; ) {
+ visit(*IB++);
+ if (IB != BB->end() && IB->getParent() != BB)
+ break; // The next instruction got moved to a different block!
+ }
+ }
+
+ DFSBlocks.clear();
+ SJMap.clear();
+ RethrowBBMap.clear();
+ PrelimBBMap.clear();
+ SwitchValMap.clear();
+ SetJmpIDMap.clear();
+
+ return Changed;
+}
+
+// doInitialization - For the lower setjmp/longjmp pass, this ensures that a
+// module contains declarations for the intrinsic functions we are going
+// to call to convert longjmp and setjmp calls.
+//
+// This function is always successful, unless it isn't.
+bool LowerSetJmp::doInitialization(Module& M)
+{
+ const Type *SBPTy = PointerType::getUnqual(Type::Int8Ty);
+ const Type *SBPPTy = PointerType::getUnqual(SBPTy);
+
+ // N.B. See llvm/runtime/GCCLibraries/libexception/SJLJ-Exception.h for
+ // a description of the following library functions.
+
+ // void __llvm_sjljeh_init_setjmpmap(void**)
+ InitSJMap = M.getOrInsertFunction("__llvm_sjljeh_init_setjmpmap",
+ Type::VoidTy, SBPPTy, (Type *)0);
+ // void __llvm_sjljeh_destroy_setjmpmap(void**)
+ DestroySJMap = M.getOrInsertFunction("__llvm_sjljeh_destroy_setjmpmap",
+ Type::VoidTy, SBPPTy, (Type *)0);
+
+ // void __llvm_sjljeh_add_setjmp_to_map(void**, void*, unsigned)
+ AddSJToMap = M.getOrInsertFunction("__llvm_sjljeh_add_setjmp_to_map",
+ Type::VoidTy, SBPPTy, SBPTy,
+ Type::Int32Ty, (Type *)0);
+
+  // void __llvm_sjljeh_throw_longjmp(void*, int)
+ ThrowLongJmp = M.getOrInsertFunction("__llvm_sjljeh_throw_longjmp",
+ Type::VoidTy, SBPTy, Type::Int32Ty,
+ (Type *)0);
+
+ // unsigned __llvm_sjljeh_try_catching_longjmp_exception(void **)
+ TryCatchLJ =
+ M.getOrInsertFunction("__llvm_sjljeh_try_catching_longjmp_exception",
+ Type::Int32Ty, SBPPTy, (Type *)0);
+
+ // bool __llvm_sjljeh_is_longjmp_exception()
+ IsLJException = M.getOrInsertFunction("__llvm_sjljeh_is_longjmp_exception",
+ Type::Int1Ty, (Type *)0);
+
+ // int __llvm_sjljeh_get_longjmp_value()
+ GetLJValue = M.getOrInsertFunction("__llvm_sjljeh_get_longjmp_value",
+ Type::Int32Ty, (Type *)0);
+ return true;
+}
+
+// IsTransformableFunction - Return true if the function name isn't one
+// of the ones we don't want transformed. Currently, don't transform any
+// "llvm.{setjmp,longjmp}" functions or any of the setjmp/longjmp error
+// handling functions (beginning with "__llvm_sjljeh_"); they don't throw
+// exceptions.
+bool LowerSetJmp::IsTransformableFunction(const std::string& Name) {
+ std::string SJLJEh("__llvm_sjljeh");
+
+ if (Name.size() > SJLJEh.size())
+ return std::string(Name.begin(), Name.begin() + SJLJEh.size()) != SJLJEh;
+
+ return true;
+}
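+
+// For example, "printf" and "foo" are transformable, while
+// "__llvm_sjljeh_throw_longjmp" matches the prefix check above and is left
+// untouched.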
+
+// TransformLongJmpCall - Transform a longjmp call into a call to the
+// internal __llvm_sjljeh_throw_longjmp function. It then takes care of
+// throwing the exception for us.
+void LowerSetJmp::TransformLongJmpCall(CallInst* Inst)
+{
+ const Type* SBPTy = PointerType::getUnqual(Type::Int8Ty);
+
+ // Create the call to "__llvm_sjljeh_throw_longjmp". This takes the
+ // same parameters as "longjmp", except that the buffer is cast to a
+ // char*. It returns "void", so it doesn't need to replace any of
+ // Inst's uses and doesn't get a name.
+ CastInst* CI =
+ new BitCastInst(Inst->getOperand(1), SBPTy, "LJBuf", Inst);
+ SmallVector<Value *, 2> Args;
+ Args.push_back(CI);
+ Args.push_back(Inst->getOperand(2));
+ CallInst::Create(ThrowLongJmp, Args.begin(), Args.end(), "", Inst);
+
+ SwitchValuePair& SVP = SwitchValMap[Inst->getParent()->getParent()];
+
+ // If the function has a setjmp call in it (they are transformed first)
+ // we should branch to the basic block that determines if this longjmp
+ // is applicable here. Otherwise, issue an unwind.
+ if (SVP.first)
+ BranchInst::Create(SVP.first->getParent(), Inst);
+ else
+ new UnwindInst(Inst);
+
+ // Remove all insts after the branch/unwind inst. Go from back to front to
+ // avoid replaceAllUsesWith if possible.
+ BasicBlock *BB = Inst->getParent();
+ Instruction *Removed;
+ do {
+ Removed = &BB->back();
+ // If the removed instructions have any users, replace them now.
+ if (!Removed->use_empty())
+ Removed->replaceAllUsesWith(UndefValue::get(Removed->getType()));
+ Removed->eraseFromParent();
+ } while (Removed != Inst);
+
+ ++LongJmpsTransformed;
+}
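+
+// Sketch of the rewrite for a longjmp in a function with no setjmp:
+//
+//   call void @llvm.longjmp(i8* %buf, i32 %val)
+//
+// becomes
+//
+//   %LJBuf = bitcast i8* %buf to i8*
+//   call void @__llvm_sjljeh_throw_longjmp(i8* %LJBuf, i32 %val)
+//   unwind                      ; or a branch to the setjmp decision block
+//
+// and every instruction after the original call is removed.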
+
+// GetSetJmpMap - Retrieve (create and initialize, if necessary) the
+// setjmp map. This map is going to hold information about which setjmps
+// were called (each setjmp gets its own number) and with which buffer it
+// was called. There can be only one!
+AllocaInst* LowerSetJmp::GetSetJmpMap(Function* Func)
+{
+ if (SJMap[Func]) return SJMap[Func];
+
+ // Insert the setjmp map initialization before the first instruction in
+ // the function.
+ Instruction* Inst = Func->getEntryBlock().begin();
+ assert(Inst && "Couldn't find even ONE instruction in entry block!");
+
+ // Fill in the alloca and call to initialize the SJ map.
+ const Type *SBPTy = PointerType::getUnqual(Type::Int8Ty);
+ AllocaInst* Map = new AllocaInst(SBPTy, 0, "SJMap", Inst);
+ CallInst::Create(InitSJMap, Map, "", Inst);
+ return SJMap[Func] = Map;
+}
+
+// GetRethrowBB - Only one rethrow basic block is needed per function.
+// If this is a longjmp exception but not handled in this block, this BB
+// performs the rethrow.
+BasicBlock* LowerSetJmp::GetRethrowBB(Function* Func)
+{
+ if (RethrowBBMap[Func]) return RethrowBBMap[Func];
+
+ // The basic block we're going to jump to if we need to rethrow the
+ // exception.
+ BasicBlock* Rethrow = BasicBlock::Create("RethrowExcept", Func);
+
+ // Fill in the "Rethrow" BB with a call to rethrow the exception. This
+ // is the last instruction in the BB since at this point the runtime
+ // should exit this function and go to the next function.
+ new UnwindInst(Rethrow);
+ return RethrowBBMap[Func] = Rethrow;
+}
+
+// GetSJSwitch - Return the switch statement that controls which handler
+// (if any) gets called and the value returned to that handler.
+LowerSetJmp::SwitchValuePair LowerSetJmp::GetSJSwitch(Function* Func,
+ BasicBlock* Rethrow)
+{
+ if (SwitchValMap[Func].first) return SwitchValMap[Func];
+
+ BasicBlock* LongJmpPre = BasicBlock::Create("LongJmpBlkPre", Func);
+
+ // Keep track of the preliminary basic block for some of the other
+ // transformations.
+ PrelimBBMap[Func] = LongJmpPre;
+
+ // Grab the exception.
+ CallInst* Cond = CallInst::Create(IsLJException, "IsLJExcept", LongJmpPre);
+
+ // The "decision basic block" gets the number associated with the
+ // setjmp call returning to switch on and the value returned by
+ // longjmp.
+ BasicBlock* DecisionBB = BasicBlock::Create("LJDecisionBB", Func);
+
+ BranchInst::Create(DecisionBB, Rethrow, Cond, LongJmpPre);
+
+ // Fill in the "decision" basic block.
+ CallInst* LJVal = CallInst::Create(GetLJValue, "LJVal", DecisionBB);
+ CallInst* SJNum = CallInst::Create(TryCatchLJ, GetSetJmpMap(Func), "SJNum",
+ DecisionBB);
+
+ SwitchInst* SI = SwitchInst::Create(SJNum, Rethrow, 0, DecisionBB);
+ return SwitchValMap[Func] = SwitchValuePair(SI, LJVal);
+}
+
+// TransformSetJmpCall - The setjmp call is a bit trickier to transform.
+// We're going to convert all setjmp calls to nops. Then all "call" and
+// "invoke" instructions in the function are converted to "invoke" where
+// the "except" branch is used when returning from a longjmp call.
+void LowerSetJmp::TransformSetJmpCall(CallInst* Inst)
+{
+ BasicBlock* ABlock = Inst->getParent();
+ Function* Func = ABlock->getParent();
+
+ // Add this setjmp to the setjmp map.
+ const Type* SBPTy = PointerType::getUnqual(Type::Int8Ty);
+ CastInst* BufPtr =
+ new BitCastInst(Inst->getOperand(1), SBPTy, "SBJmpBuf", Inst);
+ std::vector<Value*> Args =
+ make_vector<Value*>(GetSetJmpMap(Func), BufPtr,
+ ConstantInt::get(Type::Int32Ty,
+ SetJmpIDMap[Func]++), 0);
+ CallInst::Create(AddSJToMap, Args.begin(), Args.end(), "", Inst);
+
+ // We are guaranteed that there are no values live across basic blocks
+ // (because we are "not in SSA form" yet), but there can still be values live
+ // in basic blocks. Because of this, splitting the setjmp block can cause
+ // values above the setjmp to not dominate uses which are after the setjmp
+ // call. For all of these occasions, we must spill the value to the stack.
+ //
+ std::set<Instruction*> InstrsAfterCall;
+
+ // The call is probably very close to the end of the basic block, for the
+ // common usage pattern of: 'if (setjmp(...))', so keep track of the
+ // instructions after the call.
+ for (BasicBlock::iterator I = ++BasicBlock::iterator(Inst), E = ABlock->end();
+ I != E; ++I)
+ InstrsAfterCall.insert(I);
+
+ for (BasicBlock::iterator II = ABlock->begin();
+ II != BasicBlock::iterator(Inst); ++II)
+ // Loop over all of the uses of instruction. If any of them are after the
+ // call, "spill" the value to the stack.
+ for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
+ UI != E; ++UI)
+ if (cast<Instruction>(*UI)->getParent() != ABlock ||
+ InstrsAfterCall.count(cast<Instruction>(*UI))) {
+ DemoteRegToStack(*II);
+ break;
+ }
+ InstrsAfterCall.clear();
+
+ // Change the setjmp call into a branch statement. We'll remove the
+ // setjmp call in a little bit. No worries.
+ BasicBlock* SetJmpContBlock = ABlock->splitBasicBlock(Inst);
+ assert(SetJmpContBlock && "Couldn't split setjmp BB!!");
+
+ SetJmpContBlock->setName(ABlock->getName()+"SetJmpCont");
+
+ // Add the SetJmpContBlock to the set of blocks reachable from a setjmp.
+ DFSBlocks.insert(SetJmpContBlock);
+
+ // This PHI node will be in the new block created from the
+ // splitBasicBlock call.
+ PHINode* PHI = PHINode::Create(Type::Int32Ty, "SetJmpReturn", Inst);
+
+ // Coming from a call to setjmp, the return is 0.
+ PHI->addIncoming(ConstantInt::getNullValue(Type::Int32Ty), ABlock);
+
+ // Add the case for this setjmp's number...
+ SwitchValuePair SVP = GetSJSwitch(Func, GetRethrowBB(Func));
+ SVP.first->addCase(ConstantInt::get(Type::Int32Ty, SetJmpIDMap[Func] - 1),
+ SetJmpContBlock);
+
+ // Value coming from the handling of the exception.
+ PHI->addIncoming(SVP.second, SVP.second->getParent());
+
+ // Replace all uses of this instruction with the PHI node created by
+ // the eradication of setjmp.
+ Inst->replaceAllUsesWith(PHI);
+ Inst->eraseFromParent();
+
+ ++SetJmpsTransformed;
+}
+
+// visitCallInst - This converts all LLVM call instructions into invoke
+// instructions. The except part of the invoke goes to the "LongJmpBlkPre"
+// block, which grabs the exception and proceeds to determine if it's a
+// longjmp exception or not.
+void LowerSetJmp::visitCallInst(CallInst& CI)
+{
+ if (CI.getCalledFunction())
+ if (!IsTransformableFunction(CI.getCalledFunction()->getName()) ||
+ CI.getCalledFunction()->isIntrinsic()) return;
+
+ BasicBlock* OldBB = CI.getParent();
+
+ // If not reachable from a setjmp call, don't transform.
+ if (!DFSBlocks.count(OldBB)) return;
+
+ BasicBlock* NewBB = OldBB->splitBasicBlock(CI);
+ assert(NewBB && "Couldn't split BB of \"call\" instruction!!");
+ DFSBlocks.insert(NewBB);
+ NewBB->setName("Call2Invoke");
+
+ Function* Func = OldBB->getParent();
+
+ // Construct the new "invoke" instruction.
+ TerminatorInst* Term = OldBB->getTerminator();
+ std::vector<Value*> Params(CI.op_begin() + 1, CI.op_end());
+ InvokeInst* II =
+ InvokeInst::Create(CI.getCalledValue(), NewBB, PrelimBBMap[Func],
+ Params.begin(), Params.end(), CI.getName(), Term);
+ II->setCallingConv(CI.getCallingConv());
+ II->setAttributes(CI.getAttributes());
+
+ // Replace the old call inst with the invoke inst and remove the call.
+ CI.replaceAllUsesWith(II);
+ CI.eraseFromParent();
+
+ // The old terminator is useless now that we have the invoke inst.
+ Term->eraseFromParent();
+ ++CallsTransformed;
+}
+
+// visitInvokeInst - Converting the "invoke" instruction is fairly
+// straightforward. The old exception part is replaced by a query asking
+// whether this is a longjmp exception. If it is, control goes to the longjmp
+// exception blocks. Otherwise, control is passed to the old exception block.
+void LowerSetJmp::visitInvokeInst(InvokeInst& II)
+{
+ if (II.getCalledFunction())
+ if (!IsTransformableFunction(II.getCalledFunction()->getName()) ||
+ II.getCalledFunction()->isIntrinsic()) return;
+
+ BasicBlock* BB = II.getParent();
+
+ // If not reachable from a setjmp call, don't transform.
+ if (!DFSBlocks.count(BB)) return;
+
+ BasicBlock* ExceptBB = II.getUnwindDest();
+
+ Function* Func = BB->getParent();
+ BasicBlock* NewExceptBB = BasicBlock::Create("InvokeExcept", Func);
+
+ // If this is a longjmp exception, then branch to the preliminary BB of
+ // the longjmp exception handling. Otherwise, go to the old exception.
+ CallInst* IsLJExcept = CallInst::Create(IsLJException, "IsLJExcept",
+ NewExceptBB);
+
+ BranchInst::Create(PrelimBBMap[Func], ExceptBB, IsLJExcept, NewExceptBB);
+
+ II.setUnwindDest(NewExceptBB);
+ ++InvokesTransformed;
+}
+
+// visitReturnInst - We want to destroy the setjmp map upon exit from the
+// function.
+void LowerSetJmp::visitReturnInst(ReturnInst &RI) {
+ Function* Func = RI.getParent()->getParent();
+ CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &RI);
+}
+
+// visitUnwindInst - We want to destroy the setjmp map upon exit from the
+// function.
+void LowerSetJmp::visitUnwindInst(UnwindInst &UI) {
+ Function* Func = UI.getParent()->getParent();
+ CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &UI);
+}
+
+ModulePass *llvm::createLowerSetJmpPass() {
+ return new LowerSetJmp();
+}
+
diff --git a/lib/Transforms/IPO/Makefile b/lib/Transforms/IPO/Makefile
new file mode 100644
index 0000000..5c42374
--- /dev/null
+++ b/lib/Transforms/IPO/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/IPO/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMipo
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
new file mode 100644
index 0000000..17bc2d4
--- /dev/null
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -0,0 +1,377 @@
+//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for equivalent functions that are mergeable and folds them.
+//
+// A Function will not be analyzed if:
+// * it is overridable at runtime (except for weak linkage), or
+// * it is used by anything other than the callee parameter of a call/invoke
+//
+// A hash is computed from the function, based on its type and number of
+// basic blocks.
+//
+// Once all hashes are computed, we perform an expensive equality comparison
+// on each function pair. This takes n^2/2 comparisons per bucket, so it's
+// important that the hash function be high quality. The equality comparison
+// iterates through each instruction in each basic block.
+//
+// When a match is found, the functions are folded. We can only fold two
+// functions when we know that the definition of one of them is not
+// overridable.
+// * fold a function marked internal by replacing all of its users.
+// * fold extern or weak functions by replacing them with a global alias
+//
+//===----------------------------------------------------------------------===//
+//
+// Future work:
+//
+// * fold vector<T*>::push_back and vector<S*>::push_back.
+//
+// These two functions have different types, but in a way that doesn't matter
+// to us. As long as we never see an S or T itself, using S* and S** is the
+// same as using a T* and T**.
+//
+// * virtual functions.
+//
+// Many functions have their address taken by the virtual function table for
+// the object they belong to. However, as long as it's only used for a lookup
+// and call, this is irrelevant, and we'd like to fold such implementations.
+//
+//===----------------------------------------------------------------------===//
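+// Illustrative example (hypothetical IR, not from this file):
+//
+//   define internal i32 @foo(i32 %x) { %y = add i32 %x, 1  ret i32 %y }
+//   define internal i32 @bar(i32 %x) { %y = add i32 %x, 1  ret i32 %y }
+//
+// The two bodies compare equal, so @bar is erased and all of its callers are
+// rewritten to call @foo. Had @bar been external or weak instead, it would
+// have been kept visible as a global alias pointing at @foo.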
+
+#define DEBUG_TYPE "mergefunc"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Constants.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <map>
+#include <vector>
+using namespace llvm;
+
+STATISTIC(NumFunctionsMerged, "Number of functions merged");
+STATISTIC(NumMergeFails, "Number of identical function pairings not merged");
+
+namespace {
+ struct VISIBILITY_HIDDEN MergeFunctions : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ MergeFunctions() : ModulePass((intptr_t)&ID) {}
+
+ bool runOnModule(Module &M);
+ };
+}
+
+char MergeFunctions::ID = 0;
+static RegisterPass<MergeFunctions>
+X("mergefunc", "Merge Functions");
+
+ModulePass *llvm::createMergeFunctionsPass() {
+ return new MergeFunctions();
+}
+
+static unsigned long hash(const Function *F) {
+ return F->size() ^ reinterpret_cast<unsigned long>(F->getType());
+ //return F->size() ^ F->arg_size() ^ F->getReturnType();
+}
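+// Note: the hash above only mixes the basic-block count with the (uniqued)
+// type pointer, so two functions with the same type and the same number of
+// blocks always land in the same bucket and are then compared by equals()
+// below.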
+
+static bool compare(const Value *V, const Value *U) {
+ assert(!isa<BasicBlock>(V) && !isa<BasicBlock>(U) &&
+ "Must not compare basic blocks.");
+
+ assert(V->getType() == U->getType() &&
+ "Two of the same operation have operands of different type.");
+
+ // TODO: If the constant is an expression of F, we should accept that it's
+ // equal to the same expression in terms of G.
+ if (isa<Constant>(V))
+ return V == U;
+
+ // The caller has ensured that ValueMap[V] != U. Since Arguments are
+ // pre-loaded into the ValueMap, and Instructions are added as we go, we know
+  // that this can only be a mismatch.
+ if (isa<Instruction>(V) || isa<Argument>(V))
+ return false;
+
+ if (isa<InlineAsm>(V) && isa<InlineAsm>(U)) {
+ const InlineAsm *IAF = cast<InlineAsm>(V);
+ const InlineAsm *IAG = cast<InlineAsm>(U);
+ return IAF->getAsmString() == IAG->getAsmString() &&
+ IAF->getConstraintString() == IAG->getConstraintString();
+ }
+
+ return false;
+}
+
+static bool equals(const BasicBlock *BB1, const BasicBlock *BB2,
+ DenseMap<const Value *, const Value *> &ValueMap,
+ DenseMap<const Value *, const Value *> &SpeculationMap) {
+  // Speculatively add it anyway. If it's false, we'll notice a difference
+  // later, and this won't matter.
+ ValueMap[BB1] = BB2;
+
+ BasicBlock::const_iterator FI = BB1->begin(), FE = BB1->end();
+ BasicBlock::const_iterator GI = BB2->begin(), GE = BB2->end();
+
+ do {
+ if (!FI->isSameOperationAs(const_cast<Instruction *>(&*GI)))
+ return false;
+
+ if (FI->getNumOperands() != GI->getNumOperands())
+ return false;
+
+ if (ValueMap[FI] == GI) {
+ ++FI, ++GI;
+ continue;
+ }
+
+ if (ValueMap[FI] != NULL)
+ return false;
+
+ for (unsigned i = 0, e = FI->getNumOperands(); i != e; ++i) {
+ Value *OpF = FI->getOperand(i);
+ Value *OpG = GI->getOperand(i);
+
+ if (ValueMap[OpF] == OpG)
+ continue;
+
+ if (ValueMap[OpF] != NULL)
+ return false;
+
+ assert(OpF->getType() == OpG->getType() &&
+ "Two of the same operation has operands of different type.");
+
+ if (OpF->getValueID() != OpG->getValueID())
+ return false;
+
+ if (isa<PHINode>(FI)) {
+ if (SpeculationMap[OpF] == NULL)
+ SpeculationMap[OpF] = OpG;
+ else if (SpeculationMap[OpF] != OpG)
+ return false;
+ continue;
+ } else if (isa<BasicBlock>(OpF)) {
+ assert(isa<TerminatorInst>(FI) &&
+ "BasicBlock referenced by non-Terminator non-PHI");
+ // This call changes the ValueMap, hence we can't use
+ // Value *& = ValueMap[...]
+ if (!equals(cast<BasicBlock>(OpF), cast<BasicBlock>(OpG), ValueMap,
+ SpeculationMap))
+ return false;
+ } else {
+ if (!compare(OpF, OpG))
+ return false;
+ }
+
+ ValueMap[OpF] = OpG;
+ }
+
+ ValueMap[FI] = GI;
+ ++FI, ++GI;
+ } while (FI != FE && GI != GE);
+
+ return FI == FE && GI == GE;
+}
+
+static bool equals(const Function *F, const Function *G) {
+ // We need to recheck everything, but check the things that weren't included
+ // in the hash first.
+
+ if (F->getAttributes() != G->getAttributes())
+ return false;
+
+ if (F->hasGC() != G->hasGC())
+ return false;
+
+ if (F->hasGC() && F->getGC() != G->getGC())
+ return false;
+
+ if (F->hasSection() != G->hasSection())
+ return false;
+
+ if (F->hasSection() && F->getSection() != G->getSection())
+ return false;
+
+ // TODO: if it's internal and only used in direct calls, we could handle this
+ // case too.
+ if (F->getCallingConv() != G->getCallingConv())
+ return false;
+
+ // TODO: We want to permit cases where two functions take T* and S* but
+ // only load or store them into T** and S**.
+ if (F->getType() != G->getType())
+ return false;
+
+ DenseMap<const Value *, const Value *> ValueMap;
+ DenseMap<const Value *, const Value *> SpeculationMap;
+ ValueMap[F] = G;
+
+ assert(F->arg_size() == G->arg_size() &&
+ "Identical functions have a different number of args.");
+
+ for (Function::const_arg_iterator fi = F->arg_begin(), gi = G->arg_begin(),
+ fe = F->arg_end(); fi != fe; ++fi, ++gi)
+ ValueMap[fi] = gi;
+
+ if (!equals(&F->getEntryBlock(), &G->getEntryBlock(), ValueMap,
+ SpeculationMap))
+ return false;
+
+ for (DenseMap<const Value *, const Value *>::iterator
+ I = SpeculationMap.begin(), E = SpeculationMap.end(); I != E; ++I) {
+ if (ValueMap[I->first] != I->second)
+ return false;
+ }
+
+ return true;
+}
+
+static bool fold(std::vector<Function *> &FnVec, unsigned i, unsigned j) {
+ if (FnVec[i]->mayBeOverridden() && !FnVec[j]->mayBeOverridden())
+ std::swap(FnVec[i], FnVec[j]);
+
+ Function *F = FnVec[i];
+ Function *G = FnVec[j];
+
+ if (!F->mayBeOverridden()) {
+ if (G->hasLocalLinkage()) {
+ F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+ G->replaceAllUsesWith(F);
+ G->eraseFromParent();
+ ++NumFunctionsMerged;
+ return true;
+ }
+
+ if (G->hasExternalLinkage() || G->hasWeakLinkage()) {
+ GlobalAlias *GA = new GlobalAlias(G->getType(), G->getLinkage(), "",
+ F, G->getParent());
+ F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+ GA->takeName(G);
+ GA->setVisibility(G->getVisibility());
+ G->replaceAllUsesWith(GA);
+ G->eraseFromParent();
+ ++NumFunctionsMerged;
+ return true;
+ }
+ }
+
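+  // Sketch of the weak/weak case below: neither body may simply be deleted,
+  // because another module could override either symbol at link time.
+  // Instead, F's body is renamed "folded.<name>", given external linkage,
+  // and both original names become weak aliases pointing at it.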
+ if (F->hasWeakLinkage() && G->hasWeakLinkage()) {
+ GlobalAlias *GA_F = new GlobalAlias(F->getType(), F->getLinkage(), "",
+ 0, F->getParent());
+ GA_F->takeName(F);
+ GA_F->setVisibility(F->getVisibility());
+ F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+ F->replaceAllUsesWith(GA_F);
+ F->setName("folded." + GA_F->getName());
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ GA_F->setAliasee(F);
+
+ GlobalAlias *GA_G = new GlobalAlias(G->getType(), G->getLinkage(), "",
+ F, G->getParent());
+ GA_G->takeName(G);
+ GA_G->setVisibility(G->getVisibility());
+ G->replaceAllUsesWith(GA_G);
+ G->eraseFromParent();
+
+ ++NumFunctionsMerged;
+ return true;
+ }
+
+ DOUT << "Failed on " << F->getName() << " and " << G->getName() << "\n";
+
+ ++NumMergeFails;
+ return false;
+}
+
+static bool hasAddressTaken(User *U) {
+ for (User::use_iterator I = U->use_begin(), E = U->use_end(); I != E; ++I) {
+ User *Use = *I;
+
+ // 'call (bitcast @F to ...)' happens a lot.
+ while (isa<ConstantExpr>(Use) && Use->hasOneUse()) {
+ Use = *Use->use_begin();
+ }
+
+ if (isa<ConstantExpr>(Use)) {
+ if (hasAddressTaken(Use))
+ return true;
+ }
+
+ if (!isa<CallInst>(Use) && !isa<InvokeInst>(Use))
+ return true;
+
+ // Make sure we aren't passing U as a parameter to call instead of the
+ // callee.
+ if (CallSite(cast<Instruction>(Use)).hasArgument(U))
+ return true;
+ }
+
+ return false;
+}
+
+bool MergeFunctions::runOnModule(Module &M) {
+ bool Changed = false;
+
+ std::map<unsigned long, std::vector<Function *> > FnMap;
+
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ if (F->isDeclaration() || F->isIntrinsic())
+ continue;
+
+ if (!F->hasLocalLinkage() && !F->hasExternalLinkage() &&
+ !F->hasWeakLinkage())
+ continue;
+
+ if (hasAddressTaken(F))
+ continue;
+
+ FnMap[hash(F)].push_back(F);
+ }
+
+  // TODO: instead of running in a loop, we could also fold functions in
+  // callgraph order. Constructing the callgraph probably isn't cheaper than
+  // just running in a loop.
+
+ bool LocalChanged;
+ do {
+ LocalChanged = false;
+ for (std::map<unsigned long, std::vector<Function *> >::iterator
+ I = FnMap.begin(), E = FnMap.end(); I != E; ++I) {
+ DOUT << "size: " << FnMap.size() << "\n";
+ std::vector<Function *> &FnVec = I->second;
+ DOUT << "hash (" << I->first << "): " << FnVec.size() << "\n";
+
+ for (int i = 0, e = FnVec.size(); i != e; ++i) {
+ for (int j = i + 1; j != e; ++j) {
+ bool isEqual = equals(FnVec[i], FnVec[j]);
+
+ DOUT << " " << FnVec[i]->getName()
+ << (isEqual ? " == " : " != ")
+ << FnVec[j]->getName() << "\n";
+
+ if (isEqual) {
+ if (fold(FnVec, i, j)) {
+ LocalChanged = true;
+ FnVec.erase(FnVec.begin() + j);
+ --j, --e;
+ }
+ }
+ }
+ }
+
+ }
+ Changed |= LocalChanged;
+ } while (LocalChanged);
+
+ return Changed;
+}
diff --git a/lib/Transforms/IPO/PartialSpecialization.cpp b/lib/Transforms/IPO/PartialSpecialization.cpp
new file mode 100644
index 0000000..0e1fdb9
--- /dev/null
+++ b/lib/Transforms/IPO/PartialSpecialization.cpp
@@ -0,0 +1,191 @@
+//===-- PartialSpecialization.cpp - Specialize for common constants--------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass finds function arguments that are often a common constant and
+// specializes a version of the called function for that constant.
+//
+// This pass simply does the cloning for functions it specializes. It depends
+// on IPSCCP and DAE to clean up the results.
+//
+// The initial heuristic favors constant arguments that are used in control
+// flow.
+//
+//===----------------------------------------------------------------------===//
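+// Illustrative example (hypothetical code, not from this file): for
+//
+//   void draw(Shape *s, int aa) { if (aa) { ... } else { ... } }
+//
+// where a sufficient fraction of a sufficient number of calls pass aa == 0,
+// the pass clones draw() with aa bound to the constant 0 and gives the clone
+// internal linkage; IPSCCP and DAE are expected to fold the branch and drop
+// the dead parameter afterwards.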
+
+#define DEBUG_TYPE "partialspecialization"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constant.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DenseSet.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(numSpecialized, "Number of specialized functions created");
+
+// A function must be called at least this many times before specialization
+// is considered.
+static const int CallsMin = 5;
+
+// At least 10% of the calls must pass the same constant for it to be worth
+// specializing on.
+static const double ConstValPercent = .1;
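+// Worked example of the thresholds (illustrative): with total == 40 direct
+// calls, a constant must appear at more than 40 * 0.1 == 4 call sites, and
+// at fewer than all 40, before runOnModule below will specialize on it.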
+
+namespace {
+ class VISIBILITY_HIDDEN PartSpec : public ModulePass {
+ void scanForInterest(Function&, SmallVector<int, 6>&);
+ int scanDistribution(Function&, int, std::map<Constant*, int>&);
+ public :
+ static char ID; // Pass identification, replacement for typeid
+ PartSpec() : ModulePass(&ID) {}
+ bool runOnModule(Module &M);
+ };
+}
+
+char PartSpec::ID = 0;
+static RegisterPass<PartSpec>
+X("partialspecialization", "Partial Specialization");
+
+// Specialize F by replacing the arguments (keys) in replacements with the
+// constants (values). Replace all calls to F with those constants with
+// a call to the specialized function. Returns the specialized function
+static Function*
+SpecializeFunction(Function* F,
+ DenseMap<const Value*, Value*>& replacements) {
+ // arg numbers of deleted arguments
+ DenseSet<unsigned> deleted;
+ for (DenseMap<const Value*, Value*>::iterator
+ repb = replacements.begin(), repe = replacements.end();
+ repb != repe; ++repb)
+ deleted.insert(cast<Argument>(repb->first)->getArgNo());
+
+ Function* NF = CloneFunction(F, replacements);
+ NF->setLinkage(GlobalValue::InternalLinkage);
+ F->getParent()->getFunctionList().push_back(NF);
+
+ for (Value::use_iterator ii = F->use_begin(), ee = F->use_end();
+ ii != ee; ) {
+ Value::use_iterator i = ii;
+ ++ii;
+ if (isa<CallInst>(i) || isa<InvokeInst>(i)) {
+ CallSite CS(cast<Instruction>(i));
+ if (CS.getCalledFunction() == F) {
+
+ SmallVector<Value*, 6> args;
+ for (unsigned x = 0; x < CS.arg_size(); ++x)
+ if (!deleted.count(x))
+ args.push_back(CS.getArgument(x));
+ Value* NCall;
+ if (CallInst *CI = dyn_cast<CallInst>(i)) {
+ NCall = CallInst::Create(NF, args.begin(), args.end(),
+ CI->getName(), CI);
+ cast<CallInst>(NCall)->setTailCall(CI->isTailCall());
+ cast<CallInst>(NCall)->setCallingConv(CI->getCallingConv());
+ } else {
+ InvokeInst *II = cast<InvokeInst>(i);
+ NCall = InvokeInst::Create(NF, II->getNormalDest(),
+ II->getUnwindDest(),
+ args.begin(), args.end(),
+ II->getName(), II);
+ cast<InvokeInst>(NCall)->setCallingConv(II->getCallingConv());
+ }
+ CS.getInstruction()->replaceAllUsesWith(NCall);
+ CS.getInstruction()->eraseFromParent();
+ }
+ }
+ }
+ return NF;
+}
+
+
+bool PartSpec::runOnModule(Module &M) {
+ bool Changed = false;
+ for (Module::iterator I = M.begin(); I != M.end(); ++I) {
+ Function &F = *I;
+ if (F.isDeclaration() || F.mayBeOverridden()) continue;
+ SmallVector<int, 6> interestingArgs;
+ scanForInterest(F, interestingArgs);
+
+ // Find the first interesting Argument that we can specialize on
+ // If there are multiple interesting Arguments, then those will be found
+ // when processing the cloned function.
+ bool breakOuter = false;
+ for (unsigned int x = 0; !breakOuter && x < interestingArgs.size(); ++x) {
+ std::map<Constant*, int> distribution;
+ int total = scanDistribution(F, interestingArgs[x], distribution);
+ if (total > CallsMin)
+ for (std::map<Constant*, int>::iterator ii = distribution.begin(),
+ ee = distribution.end(); ii != ee; ++ii)
+ if (total > ii->second && ii->first &&
+ ii->second > total * ConstValPercent) {
+ DenseMap<const Value*, Value*> m;
+ Function::arg_iterator arg = F.arg_begin();
+ for (int y = 0; y < interestingArgs[x]; ++y)
+ ++arg;
+ m[&*arg] = ii->first;
+ SpecializeFunction(&F, m);
+ ++numSpecialized;
+ breakOuter = true;
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+/// scanForInterest - This function decides which arguments would be worth
+/// specializing on.
+void PartSpec::scanForInterest(Function& F, SmallVector<int, 6>& args) {
+ for(Function::arg_iterator ii = F.arg_begin(), ee = F.arg_end();
+ ii != ee; ++ii) {
+ for(Value::use_iterator ui = ii->use_begin(), ue = ii->use_end();
+ ui != ue; ++ui) {
+
+ bool interesting = false;
+
+ if (isa<CmpInst>(ui)) interesting = true;
+ else if (isa<CallInst>(ui))
+ interesting = ui->getOperand(0) == ii;
+ else if (isa<InvokeInst>(ui))
+ interesting = ui->getOperand(0) == ii;
+ else if (isa<SwitchInst>(ui)) interesting = true;
+ else if (isa<BranchInst>(ui)) interesting = true;
+
+ if (interesting) {
+ args.push_back(std::distance(F.arg_begin(), ii));
+ break;
+ }
+ }
+ }
+}
+
+/// scanDistribution - Construct a histogram of the constant values passed
+/// to F in argument position arg.
+int PartSpec::scanDistribution(Function& F, int arg,
+ std::map<Constant*, int>& dist) {
+ bool hasIndirect = false;
+ int total = 0;
+ for(Value::use_iterator ii = F.use_begin(), ee = F.use_end();
+ ii != ee; ++ii)
+ if ((isa<CallInst>(ii) || isa<InvokeInst>(ii))
+ && ii->getOperand(0) == &F) {
+ ++dist[dyn_cast<Constant>(ii->getOperand(arg + 1))];
+ ++total;
+ } else
+ hasIndirect = true;
+
+ // Preserve the original address taken function even if all other uses
+ // will be specialized.
+ if (hasIndirect) ++total;
+ return total;
+}
+
+ModulePass* llvm::createPartialSpecializationPass() { return new PartSpec(); }
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
new file mode 100644
index 0000000..2b52f46
--- /dev/null
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -0,0 +1,255 @@
+//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, turning invoke instructions into calls, iff the callee cannot
+// throw an exception, and marking functions 'nounwind' if they cannot throw.
+// It implements this as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
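+// Illustrative example (hypothetical IR, not from this file): once @g is
+// known not to throw,
+//
+//   invoke void @g() to label %cont unwind label %lpad
+//
+// is rewritten by SimplifyFunction below as
+//
+//   call void @g()
+//   br label %cont
+//
+// after which %lpad may become dead and is deleted.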
+
+#define DEBUG_TYPE "prune-eh"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumRemoved, "Number of invokes removed");
+STATISTIC(NumUnreach, "Number of noreturn calls optimized");
+
+namespace {
+ struct VISIBILITY_HIDDEN PruneEH : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ PruneEH() : CallGraphSCCPass(&ID) {}
+
+ // runOnSCC - Analyze the SCC, performing the transformation if possible.
+ bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
+
+ bool SimplifyFunction(Function *F);
+ void DeleteBasicBlock(BasicBlock *BB);
+ };
+}
+
+char PruneEH::ID = 0;
+static RegisterPass<PruneEH>
+X("prune-eh", "Remove unused exception handling info");
+
+Pass *llvm::createPruneEHPass() { return new PruneEH(); }
+
+
+bool PruneEH::runOnSCC(const std::vector<CallGraphNode *> &SCC) {
+ SmallPtrSet<CallGraphNode *, 8> SCCNodes;
+ CallGraph &CG = getAnalysis<CallGraph>();
+ bool MadeChange = false;
+
+ // Fill SCCNodes with the elements of the SCC. Used for quickly
+ // looking up whether a given CallGraphNode is in this SCC.
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ SCCNodes.insert(SCC[i]);
+
+ // First pass, scan all of the functions in the SCC, simplifying them
+ // according to what we know.
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ if (Function *F = SCC[i]->getFunction())
+ MadeChange |= SimplifyFunction(F);
+
+ // Next, check to see if any callees might throw or if there are any external
+ // functions in this SCC: if so, we cannot prune any functions in this SCC.
+ // Definitions that are weak and not declared non-throwing might be
+ // overridden at linktime with something that throws, so assume that.
+ // If this SCC includes the unwind instruction, we KNOW it throws, so
+ // obviously the SCC might throw.
+ //
+ bool SCCMightUnwind = false, SCCMightReturn = false;
+ for (unsigned i = 0, e = SCC.size();
+ (!SCCMightUnwind || !SCCMightReturn) && i != e; ++i) {
+ Function *F = SCC[i]->getFunction();
+ if (F == 0) {
+ SCCMightUnwind = true;
+ SCCMightReturn = true;
+ } else if (F->isDeclaration() || F->mayBeOverridden()) {
+ SCCMightUnwind |= !F->doesNotThrow();
+ SCCMightReturn |= !F->doesNotReturn();
+ } else {
+ bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
+ bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();
+
+ if (!CheckUnwind && !CheckReturn)
+ continue;
+
+ // Check to see if this function performs an unwind or calls an
+ // unwinding function.
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ if (CheckUnwind && isa<UnwindInst>(BB->getTerminator())) {
+ // Uses unwind!
+ SCCMightUnwind = true;
+ } else if (CheckReturn && isa<ReturnInst>(BB->getTerminator())) {
+ SCCMightReturn = true;
+ }
+
+ // Invoke instructions don't allow unwinding to continue, so we are
+ // only interested in call instructions.
+ if (CheckUnwind && !SCCMightUnwind)
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (CI->doesNotThrow()) {
+ // This call cannot throw.
+ } else if (Function *Callee = CI->getCalledFunction()) {
+ CallGraphNode *CalleeNode = CG[Callee];
+ // If the callee is outside our current SCC then we may
+ // throw because it might.
+ if (!SCCNodes.count(CalleeNode)) {
+ SCCMightUnwind = true;
+ break;
+ }
+ } else {
+ // Indirect call, it might throw.
+ SCCMightUnwind = true;
+ break;
+ }
+ }
+ if (SCCMightUnwind && SCCMightReturn) break;
+ }
+ }
+ }
+
+ // If the SCC doesn't unwind or doesn't throw, note this fact.
+ if (!SCCMightUnwind || !SCCMightReturn)
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ Attributes NewAttributes = Attribute::None;
+
+ if (!SCCMightUnwind)
+ NewAttributes |= Attribute::NoUnwind;
+ if (!SCCMightReturn)
+ NewAttributes |= Attribute::NoReturn;
+
+ const AttrListPtr &PAL = SCC[i]->getFunction()->getAttributes();
+ const AttrListPtr &NPAL = PAL.addAttr(~0, NewAttributes);
+ if (PAL != NPAL) {
+ MadeChange = true;
+ SCC[i]->getFunction()->setAttributes(NPAL);
+ }
+ }
+
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ // Convert any invoke instructions to non-throwing functions in this node
+ // into call instructions with a branch. This makes the exception blocks
+ // dead.
+ if (Function *F = SCC[i]->getFunction())
+ MadeChange |= SimplifyFunction(F);
+ }
+
+ return MadeChange;
+}
+
+
+// SimplifyFunction - Given information about callees, simplify the specified
+// function if we have invokes to non-unwinding functions or code after calls to
+// no-return functions.
+bool PruneEH::SimplifyFunction(Function *F) {
+ CallGraph &CG = getAnalysis<CallGraph>();
+ CallGraphNode *CGN = CG[F];
+
+ bool MadeChange = false;
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ if (II->doesNotThrow()) {
+ SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
+ // Insert a call instruction before the invoke.
+ CallInst *Call = CallInst::Create(II->getCalledValue(),
+ Args.begin(), Args.end(), "", II);
+ Call->takeName(II);
+ Call->setCallingConv(II->getCallingConv());
+ Call->setAttributes(II->getAttributes());
+
+ // Anything that used the value produced by the invoke instruction
+ // now uses the value produced by the call instruction.
+ II->replaceAllUsesWith(Call);
+ BasicBlock *UnwindBlock = II->getUnwindDest();
+ UnwindBlock->removePredecessor(II->getParent());
+
+ // Fix up the call graph.
+ CGN->replaceCallSite(II, Call);
+
+ // Insert a branch to the normal destination right before the
+ // invoke.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Finally, delete the invoke instruction!
+ BB->getInstList().pop_back();
+
+ // If the unwind block is now dead, nuke it.
+ if (pred_begin(UnwindBlock) == pred_end(UnwindBlock))
+ DeleteBasicBlock(UnwindBlock); // Delete the new BB.
+
+ ++NumRemoved;
+ MadeChange = true;
+ }
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+ if (CallInst *CI = dyn_cast<CallInst>(I++))
+ if (CI->doesNotReturn() && !isa<UnreachableInst>(I)) {
+ // This call calls a function that cannot return. Insert an
+ // unreachable instruction after it and simplify the code. Do this
+ // by splitting the BB, adding the unreachable, then deleting the
+ // new BB.
+ BasicBlock *New = BB->splitBasicBlock(I);
+
+ // Remove the uncond branch and add an unreachable.
+ BB->getInstList().pop_back();
+ new UnreachableInst(BB);
+
+ DeleteBasicBlock(New); // Delete the new BB.
+ MadeChange = true;
+ ++NumUnreach;
+ break;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// DeleteBasicBlock - remove the specified basic block from the program,
+/// updating the callgraph to reflect any now-obsolete edges due to calls that
+/// exist in the BB.
+void PruneEH::DeleteBasicBlock(BasicBlock *BB) {
+ assert(pred_begin(BB) == pred_end(BB) && "BB is not dead!");
+ CallGraph &CG = getAnalysis<CallGraph>();
+
+ CallGraphNode *CGN = CG[BB->getParent()];
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
+ --I;
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (!isa<DbgInfoIntrinsic>(I))
+ CGN->removeCallEdgeFor(CI);
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+ CGN->removeCallEdgeFor(II);
+ if (!I->use_empty())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ }
+
+ // Get the list of successors of this block.
+ std::vector<BasicBlock*> Succs(succ_begin(BB), succ_end(BB));
+
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+ Succs[i]->removePredecessor(BB);
+
+ BB->eraseFromParent();
+}
diff --git a/lib/Transforms/IPO/RaiseAllocations.cpp b/lib/Transforms/IPO/RaiseAllocations.cpp
new file mode 100644
index 0000000..a81bbdb
--- /dev/null
+++ b/lib/Transforms/IPO/RaiseAllocations.cpp
@@ -0,0 +1,251 @@
+//===- RaiseAllocations.cpp - Convert @malloc & @free calls to insts ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RaiseAllocations pass, which converts malloc and
+// free calls into malloc and free instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "raiseallocs"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumRaised, "Number of allocations raised");
+
+namespace {
+ // RaiseAllocations - Turn @malloc and @free calls into the appropriate
+ // instruction.
+ //
+ class VISIBILITY_HIDDEN RaiseAllocations : public ModulePass {
+ Function *MallocFunc; // Functions in the module we are processing
+ Function *FreeFunc; // Initialized by doPassInitializationVirt
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ RaiseAllocations()
+ : ModulePass(&ID), MallocFunc(0), FreeFunc(0) {}
+
+ // doPassInitialization - For the raise allocations pass, this finds a
+ // declaration for malloc and free if they exist.
+ //
+ void doInitialization(Module &M);
+
+ // run - This method does the actual work of converting instructions over.
+ //
+ bool runOnModule(Module &M);
+ };
+} // end anonymous namespace
+
+char RaiseAllocations::ID = 0;
+static RegisterPass<RaiseAllocations>
+X("raiseallocs", "Raise allocations from calls to instructions");
+
+// createRaiseAllocationsPass - The interface to this file...
+ModulePass *llvm::createRaiseAllocationsPass() {
+ return new RaiseAllocations();
+}
+
+
+// If the module has a symbol table, it may contain references to the malloc
+// and free functions. If this is the case, grab the function pointers that
+// the module is using.
+//
+// Lookup @malloc and @free in the symbol table, for later use. If they don't
+// exist, or are not external, we do not worry about converting calls to that
+// function into the appropriate instruction.
+//
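+// For example, each of these common C declarations is accepted below
+// (the first one assuming a 64-bit size_t):
+//
+//   void *malloc(unsigned long);   // matches i8* (i64)
+//   void *malloc(unsigned);        // matches i8* (i32)
+//   void *malloc();                // matches i8* (...)
+//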
+void RaiseAllocations::doInitialization(Module &M) {
+
+ // Get Malloc and free prototypes if they exist!
+ MallocFunc = M.getFunction("malloc");
+ if (MallocFunc) {
+ const FunctionType* TyWeHave = MallocFunc->getFunctionType();
+
+ // Get the expected prototype for malloc
+ const FunctionType *Malloc1Type =
+ FunctionType::get(PointerType::getUnqual(Type::Int8Ty),
+ std::vector<const Type*>(1, Type::Int64Ty), false);
+
+    // Check to see if we got the expected malloc
+ if (TyWeHave != Malloc1Type) {
+ // Check to see if the prototype is wrong, giving us sbyte*(uint) * malloc
+ // This handles the common declaration of: 'void *malloc(unsigned);'
+ const FunctionType *Malloc2Type =
+ FunctionType::get(PointerType::getUnqual(Type::Int8Ty),
+ std::vector<const Type*>(1, Type::Int32Ty), false);
+ if (TyWeHave != Malloc2Type) {
+ // Check to see if the prototype is missing, giving us
+ // sbyte*(...) * malloc
+ // This handles the common declaration of: 'void *malloc();'
+ const FunctionType *Malloc3Type =
+ FunctionType::get(PointerType::getUnqual(Type::Int8Ty),
+ std::vector<const Type*>(), true);
+ if (TyWeHave != Malloc3Type)
+ // Give up
+ MallocFunc = 0;
+ }
+ }
+ }
+
+ FreeFunc = M.getFunction("free");
+ if (FreeFunc) {
+ const FunctionType* TyWeHave = FreeFunc->getFunctionType();
+
+ // Get the expected prototype for void free(i8*)
+ const FunctionType *Free1Type = FunctionType::get(Type::VoidTy,
+ std::vector<const Type*>(1, PointerType::getUnqual(Type::Int8Ty)), false);
+
+ if (TyWeHave != Free1Type) {
+ // Check to see if the prototype was forgotten, giving us
+ // void (...) * free
+ // This handles the common forward declaration of: 'void free();'
+ const FunctionType* Free2Type = FunctionType::get(Type::VoidTy,
+ std::vector<const Type*>(),true);
+
+ if (TyWeHave != Free2Type) {
+ // One last try, check to see if we can find free as
+ // int (...)* free. This handles the case where NOTHING was declared.
+ const FunctionType* Free3Type = FunctionType::get(Type::Int32Ty,
+ std::vector<const Type*>(),true);
+
+ if (TyWeHave != Free3Type) {
+ // Give up.
+ FreeFunc = 0;
+ }
+ }
+ }
+ }
+
+ // Don't mess with locally defined versions of these functions...
+ if (MallocFunc && !MallocFunc->isDeclaration()) MallocFunc = 0;
+ if (FreeFunc && !FreeFunc->isDeclaration()) FreeFunc = 0;
+}
+
+// run - Transform calls into instructions...
+//
+bool RaiseAllocations::runOnModule(Module &M) {
+ // Find the malloc/free prototypes...
+ doInitialization(M);
+
+ bool Changed = false;
+
+ // First, process all of the malloc calls...
+ if (MallocFunc) {
+ std::vector<User*> Users(MallocFunc->use_begin(), MallocFunc->use_end());
+ std::vector<Value*> EqPointers; // Values equal to MallocFunc
+ while (!Users.empty()) {
+ User *U = Users.back();
+ Users.pop_back();
+
+ if (Instruction *I = dyn_cast<Instruction>(U)) {
+ CallSite CS = CallSite::get(I);
+ if (CS.getInstruction() && !CS.arg_empty() &&
+ (CS.getCalledFunction() == MallocFunc ||
+ std::find(EqPointers.begin(), EqPointers.end(),
+ CS.getCalledValue()) != EqPointers.end())) {
+
+ Value *Source = *CS.arg_begin();
+
+ // If no prototype was provided for malloc, we may need to cast the
+ // source size.
+ if (Source->getType() != Type::Int32Ty)
+ Source =
+ CastInst::CreateIntegerCast(Source, Type::Int32Ty, false/*ZExt*/,
+ "MallocAmtCast", I);
+
+ MallocInst *MI = new MallocInst(Type::Int8Ty, Source, "", I);
+ MI->takeName(I);
+ I->replaceAllUsesWith(MI);
+
+ // If the old instruction was an invoke, add an unconditional branch
+ // before the invoke, which will become the new terminator.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+ BranchInst::Create(II->getNormalDest(), I);
+
+ // Delete the old call site
+ I->eraseFromParent();
+ Changed = true;
+ ++NumRaised;
+ }
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) {
+ Users.insert(Users.end(), GV->use_begin(), GV->use_end());
+ EqPointers.push_back(GV);
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->isCast()) {
+ Users.insert(Users.end(), CE->use_begin(), CE->use_end());
+ EqPointers.push_back(CE);
+ }
+ }
+ }
+ }
+
+ // Next, process all free calls...
+ if (FreeFunc) {
+ std::vector<User*> Users(FreeFunc->use_begin(), FreeFunc->use_end());
+ std::vector<Value*> EqPointers; // Values equal to FreeFunc
+
+ while (!Users.empty()) {
+ User *U = Users.back();
+ Users.pop_back();
+
+ if (Instruction *I = dyn_cast<Instruction>(U)) {
+ if (isa<InvokeInst>(I))
+ continue;
+ CallSite CS = CallSite::get(I);
+ if (CS.getInstruction() && !CS.arg_empty() &&
+ (CS.getCalledFunction() == FreeFunc ||
+ std::find(EqPointers.begin(), EqPointers.end(),
+ CS.getCalledValue()) != EqPointers.end())) {
+
+ // If no prototype was provided for free, we may need to cast the
+ // source pointer. This should be really uncommon, but it's necessary
+ // just in case we are dealing with weird code like this:
+ // free((long)ptr);
+ //
+ Value *Source = *CS.arg_begin();
+ if (!isa<PointerType>(Source->getType()))
+ Source = new IntToPtrInst(Source,
+ PointerType::getUnqual(Type::Int8Ty),
+ "FreePtrCast", I);
+ new FreeInst(Source, I);
+
+ // If the old instruction was an invoke, add an unconditional branch
+ // before the invoke, which will become the new terminator.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+ BranchInst::Create(II->getNormalDest(), I);
+
+ // Delete the old call site
+ if (I->getType() != Type::VoidTy)
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+ Changed = true;
+ ++NumRaised;
+ }
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) {
+ Users.insert(Users.end(), GV->use_begin(), GV->use_end());
+ EqPointers.push_back(GV);
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->isCast()) {
+ Users.insert(Users.end(), CE->use_begin(), CE->use_end());
+ EqPointers.push_back(CE);
+ }
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Transforms/IPO/StripDeadPrototypes.cpp b/lib/Transforms/IPO/StripDeadPrototypes.cpp
new file mode 100644
index 0000000..a94d78e
--- /dev/null
+++ b/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -0,0 +1,72 @@
+//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for
+// dead declarations, and removes them. Dead declarations are declarations of
+// functions for which no implementation is available (i.e., declarations for
+// unused library functions).
+//
+//===----------------------------------------------------------------------===//
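+// For example (illustrative): a leftover 'declare i32 @printf(i8*, ...)'
+// with no remaining uses is erased, as is an unused external global
+// variable declaration.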
+
+#define DEBUG_TYPE "strip-dead-prototypes"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
+
+namespace {
+
+/// @brief Pass to remove unused function declarations.
+class VISIBILITY_HIDDEN StripDeadPrototypesPass : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ StripDeadPrototypesPass() : ModulePass(&ID) { }
+ virtual bool runOnModule(Module &M);
+};
+
+} // end anonymous namespace
+
+char StripDeadPrototypesPass::ID = 0;
+static RegisterPass<StripDeadPrototypesPass>
+X("strip-dead-prototypes", "Strip Unused Function Prototypes");
+
+bool StripDeadPrototypesPass::runOnModule(Module &M) {
+ bool MadeChange = false;
+
+ // Erase dead function prototypes.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ Function *F = I++;
+ // Function must be a prototype and unused.
+ if (F->isDeclaration() && F->use_empty()) {
+ F->eraseFromParent();
+ ++NumDeadPrototypes;
+ MadeChange = true;
+ }
+ }
+
+ // Erase dead global var prototypes.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ) {
+ GlobalVariable *GV = I++;
+ // Global must be a prototype and unused.
+ if (GV->isDeclaration() && GV->use_empty())
+ GV->eraseFromParent();
+ }
+
+ // Return an indication of whether we changed anything or not.
+ return MadeChange;
+}
+
+ModulePass *llvm::createStripDeadPrototypesPass() {
+ return new StripDeadPrototypesPass();
+}
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
new file mode 100644
index 0000000..ab8fe5f
--- /dev/null
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -0,0 +1,415 @@
+//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The StripSymbols transformation implements code stripping. Specifically, it
+// can delete:
+//
+// * names for virtual registers
+// * symbols for internal globals and functions
+// * debug information
+//
+// Note that this transformation makes code much less readable, so it should
+// only be used in situations where the 'strip' utility would be used, such as
+// reducing code size or making it harder to reverse engineer code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallPtrSet.h"
+using namespace llvm;
+
+namespace {
+ class VISIBILITY_HIDDEN StripSymbols : public ModulePass {
+ bool OnlyDebugInfo;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripSymbols(bool ODI = false)
+ : ModulePass(&ID), OnlyDebugInfo(ODI) {}
+
+ virtual bool runOnModule(Module &M);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+
+ class VISIBILITY_HIDDEN StripNonDebugSymbols : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripNonDebugSymbols()
+ : ModulePass(&ID) {}
+
+ virtual bool runOnModule(Module &M);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+
+ class VISIBILITY_HIDDEN StripDebugDeclare : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripDebugDeclare()
+ : ModulePass(&ID) {}
+
+ virtual bool runOnModule(Module &M);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char StripSymbols::ID = 0;
+static RegisterPass<StripSymbols>
+X("strip", "Strip all symbols from a module");
+
+ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
+ return new StripSymbols(OnlyDebugInfo);
+}
+
+char StripNonDebugSymbols::ID = 0;
+static RegisterPass<StripNonDebugSymbols>
+Y("strip-nondebug", "Strip all symbols, except dbg symbols, from a module");
+
+ModulePass *llvm::createStripNonDebugSymbolsPass() {
+ return new StripNonDebugSymbols();
+}
+
+char StripDebugDeclare::ID = 0;
+static RegisterPass<StripDebugDeclare>
+Z("strip-debug-declare", "Strip all llvm.dbg.declare intrinsics");
+
+ModulePass *llvm::createStripDebugDeclarePass() {
+ return new StripDebugDeclare();
+}
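+// These passes are exposed through opt by their registered names, e.g.
+// (illustrative invocations):
+//
+//   opt -strip in.bc -o out.bc
+//   opt -strip-nondebug in.bc -o out.bc
+//   opt -strip-debug-declare in.bc -o out.bc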
+
+/// OnlyUsedBy - Return true if V is only used by Usr.
+static bool OnlyUsedBy(Value *V, Value *Usr) {
+ for(Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) {
+ User *U = *I;
+ if (U != Usr)
+ return false;
+ }
+ return true;
+}
+
+static void RemoveDeadConstant(Constant *C) {
+ assert(C->use_empty() && "Constant is not dead!");
+ SmallPtrSet<Constant *, 4> Operands;
+ for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
+ if (isa<DerivedType>(C->getOperand(i)->getType()) &&
+ OnlyUsedBy(C->getOperand(i), C))
+ Operands.insert(C->getOperand(i));
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (!GV->hasLocalLinkage()) return; // Don't delete non static globals.
+ GV->eraseFromParent();
+ }
+ else if (!isa<Function>(C))
+ if (isa<CompositeType>(C->getType()))
+ C->destroyConstant();
+
+ // If the constant referenced anything, see if we can delete it as well.
+ for (SmallPtrSet<Constant *, 4>::iterator OI = Operands.begin(),
+ OE = Operands.end(); OI != OE; ++OI)
+ RemoveDeadConstant(*OI);
+}
+
+// Strip the symbol table of its names.
+//
+static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
+ for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) {
+ Value *V = VI->getValue();
+ ++VI;
+ if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) {
+ if (!PreserveDbgInfo || strncmp(V->getNameStart(), "llvm.dbg", 8))
+ // Set name to "", removing from symbol table!
+ V->setName("");
+ }
+ }
+}
+
+// Strip the symbol table of its names.
+static void StripTypeSymtab(TypeSymbolTable &ST, bool PreserveDbgInfo) {
+ for (TypeSymbolTable::iterator TI = ST.begin(), E = ST.end(); TI != E; ) {
+ if (PreserveDbgInfo && strncmp(TI->first.c_str(), "llvm.dbg", 8) == 0)
+ ++TI;
+ else
+ ST.remove(TI++);
+ }
+}
+
+/// Find values that are marked as llvm.used.
+void findUsedValues(Module &M,
+ SmallPtrSet<const GlobalValue*, 8>& llvmUsedValues) {
+ if (GlobalVariable *LLVMUsed = M.getGlobalVariable("llvm.used")) {
+ llvmUsedValues.insert(LLVMUsed);
+ // Collect values that are preserved as per explicit request.
+ // llvm.used is used to list these values.
+ if (ConstantArray *Inits =
+ dyn_cast<ConstantArray>(LLVMUsed->getInitializer())) {
+ for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(Inits->getOperand(i)))
+ llvmUsedValues.insert(GV);
+ else if (ConstantExpr *CE =
+ dyn_cast<ConstantExpr>(Inits->getOperand(i)))
+ if (CE->getOpcode() == Instruction::BitCast)
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(CE->getOperand(0)))
+ llvmUsedValues.insert(GV);
+ }
+ }
+ }
+}
+
+/// StripSymbolNames - Strip symbol names.
+bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
+
+ SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+ findUsedValues(M, llvmUsedValues);
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
+ if (!PreserveDbgInfo || strncmp(I->getNameStart(), "llvm.dbg", 8))
+ I->setName(""); // Internal symbols can't participate in linkage
+ }
+
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
+ if (!PreserveDbgInfo || strncmp(I->getNameStart(), "llvm.dbg", 8))
+ I->setName(""); // Internal symbols can't participate in linkage
+ StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo);
+ }
+
+ // Remove all names from types.
+ StripTypeSymtab(M.getTypeSymbolTable(), PreserveDbgInfo);
+
+ return true;
+}
+
+// StripDebugInfo - Strip debug info in the module if it exists.
+// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and
+// llvm.dbg.region.end calls, and any globals they point to if now dead.
+bool StripDebugInfo(Module &M) {
+
+ SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+ findUsedValues(M, llvmUsedValues);
+
+ // Delete all dbg variables.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(I);
+ if (!GV) continue;
+ if (!GV->use_empty() && llvmUsedValues.count(I) == 0) {
+ if (strncmp(GV->getNameStart(), "llvm.dbg", 8) == 0) {
+ GV->replaceAllUsesWith(UndefValue::get(GV->getType()));
+ }
+ }
+ }
+
+ Function *FuncStart = M.getFunction("llvm.dbg.func.start");
+ Function *StopPoint = M.getFunction("llvm.dbg.stoppoint");
+ Function *RegionStart = M.getFunction("llvm.dbg.region.start");
+ Function *RegionEnd = M.getFunction("llvm.dbg.region.end");
+ Function *Declare = M.getFunction("llvm.dbg.declare");
+
+ std::vector<Constant*> DeadConstants;
+
+ // Remove all of the calls to the debugger intrinsics, and remove them from
+ // the module.
+ if (FuncStart) {
+ while (!FuncStart->use_empty()) {
+ CallInst *CI = cast<CallInst>(FuncStart->use_back());
+ Value *Arg = CI->getOperand(1);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg))
+ DeadConstants.push_back(C);
+ }
+ FuncStart->eraseFromParent();
+ }
+ if (StopPoint) {
+ while (!StopPoint->use_empty()) {
+ CallInst *CI = cast<CallInst>(StopPoint->use_back());
+ Value *Arg = CI->getOperand(3);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg))
+ DeadConstants.push_back(C);
+ }
+ StopPoint->eraseFromParent();
+ }
+ if (RegionStart) {
+ while (!RegionStart->use_empty()) {
+ CallInst *CI = cast<CallInst>(RegionStart->use_back());
+ Value *Arg = CI->getOperand(1);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg))
+ DeadConstants.push_back(C);
+ }
+ RegionStart->eraseFromParent();
+ }
+ if (RegionEnd) {
+ while (!RegionEnd->use_empty()) {
+ CallInst *CI = cast<CallInst>(RegionEnd->use_back());
+ Value *Arg = CI->getOperand(1);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg))
+ DeadConstants.push_back(C);
+ }
+ RegionEnd->eraseFromParent();
+ }
+ if (Declare) {
+ while (!Declare->use_empty()) {
+ CallInst *CI = cast<CallInst>(Declare->use_back());
+ Value *Arg1 = CI->getOperand(1);
+ Value *Arg2 = CI->getOperand(2);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg1->use_empty()) {
+ if (Constant *C = dyn_cast<Constant>(Arg1))
+ DeadConstants.push_back(C);
+ else
+ RecursivelyDeleteTriviallyDeadInstructions(Arg1);
+ }
+ if (Arg2->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg2))
+ DeadConstants.push_back(C);
+ }
+ Declare->eraseFromParent();
+ }
+
+ // llvm.dbg.compile_units and llvm.dbg.subprograms are marked as linkonce
+ // but since we are removing all debug information, make them internal now.
+ // FIXME: Use private linkage maybe?
+ if (Constant *C = M.getNamedGlobal("llvm.dbg.compile_units"))
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+ GV->setLinkage(GlobalValue::InternalLinkage);
+
+ if (Constant *C = M.getNamedGlobal("llvm.dbg.subprograms"))
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+ GV->setLinkage(GlobalValue::InternalLinkage);
+
+ if (Constant *C = M.getNamedGlobal("llvm.dbg.global_variables"))
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+ GV->setLinkage(GlobalValue::InternalLinkage);
+
+ // Delete all dbg variables.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(I);
+ if (!GV) continue;
+ if (GV->use_empty() && llvmUsedValues.count(I) == 0
+ && (!GV->hasSection()
+ || strcmp(GV->getSection().c_str(), "llvm.metadata") == 0))
+ DeadConstants.push_back(GV);
+ }
+
+ if (DeadConstants.empty())
+ return false;
+
+ // Delete any internal globals that were only used by the debugger intrinsics.
+ while (!DeadConstants.empty()) {
+ Constant *C = DeadConstants.back();
+ DeadConstants.pop_back();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->hasLocalLinkage())
+ RemoveDeadConstant(GV);
+ }
+ else
+ RemoveDeadConstant(C);
+ }
+
+ // Remove all llvm.dbg types.
+ TypeSymbolTable &ST = M.getTypeSymbolTable();
+ for (TypeSymbolTable::iterator TI = ST.begin(), TE = ST.end(); TI != TE; ) {
+ if (!strncmp(TI->first.c_str(), "llvm.dbg.", 9))
+ ST.remove(TI++);
+ else
+ ++TI;
+ }
+
+ return true;
+}
+
+bool StripSymbols::runOnModule(Module &M) {
+ bool Changed = false;
+ Changed |= StripDebugInfo(M);
+ if (!OnlyDebugInfo)
+ Changed |= StripSymbolNames(M, false);
+ return Changed;
+}
+
+bool StripNonDebugSymbols::runOnModule(Module &M) {
+ return StripSymbolNames(M, true);
+}
+
+bool StripDebugDeclare::runOnModule(Module &M) {
+
+ Function *Declare = M.getFunction("llvm.dbg.declare");
+ std::vector<Constant*> DeadConstants;
+
+ if (Declare) {
+ while (!Declare->use_empty()) {
+ CallInst *CI = cast<CallInst>(Declare->use_back());
+ Value *Arg1 = CI->getOperand(1);
+ Value *Arg2 = CI->getOperand(2);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg1->use_empty()) {
+ if (Constant *C = dyn_cast<Constant>(Arg1))
+ DeadConstants.push_back(C);
+ else
+ RecursivelyDeleteTriviallyDeadInstructions(Arg1);
+ }
+ if (Arg2->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg2))
+ DeadConstants.push_back(C);
+ }
+ Declare->eraseFromParent();
+ }
+
+ // Delete all llvm.dbg.global_variables.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(I);
+ if (!GV) continue;
+ if (GV->use_empty() && GV->hasName()
+ && strncmp(GV->getNameStart(), "llvm.dbg.global_variable", 24) == 0)
+ DeadConstants.push_back(GV);
+ }
+
+ while (!DeadConstants.empty()) {
+ Constant *C = DeadConstants.back();
+ DeadConstants.pop_back();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->hasLocalLinkage())
+ RemoveDeadConstant(GV);
+ }
+ else
+ RemoveDeadConstant(C);
+ }
+
+ return true;
+}
diff --git a/lib/Transforms/IPO/StructRetPromotion.cpp b/lib/Transforms/IPO/StructRetPromotion.cpp
new file mode 100644
index 0000000..9f54388
--- /dev/null
+++ b/lib/Transforms/IPO/StructRetPromotion.cpp
@@ -0,0 +1,351 @@
+//===-- StructRetPromotion.cpp - Promote sret arguments ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass finds functions that return a struct (using a pointer to the struct
+// as the first argument of the function, marked with the 'sret' attribute) and
+// replaces them with a new function that simply returns each of the elements of
+// that struct (using multiple return values).
+//
+// This pass works under a number of conditions:
+// 1. The returned struct must not contain other structs
+// 2. The returned struct must only be used to load values from
+// 3. The placeholder struct passed in is the result of an alloca
+//
+//===----------------------------------------------------------------------===//
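+// Illustrative example (hypothetical IR, not from this file): given
+//
+//   %pair = type { i32, i32 }
+//   define internal void @f(%pair* sret %out, i32 %n) { ... }
+//
+// where every caller passes a fresh alloca and only loads from it, the
+// function is rewritten as
+//
+//   define internal %pair @f(i32 %n) { ... }
+//
+// returning the struct directly, and the sret pointer argument disappears.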
+
+#define DEBUG_TYPE "sretpromotion"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumRejectedSRETUses, "Number of srets rejected due to unexpected uses");
+STATISTIC(NumSRET, "Number of srets promoted");
+namespace {
+ /// SRETPromotion - This pass removes sret parameter and updates
+ /// function to use multiple return value.
+ ///
+ struct VISIBILITY_HIDDEN SRETPromotion : public CallGraphSCCPass {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnSCC(const std::vector<CallGraphNode *> &SCC);
+ static char ID; // Pass identification, replacement for typeid
+ SRETPromotion() : CallGraphSCCPass(&ID) {}
+
+ private:
+ bool PromoteReturn(CallGraphNode *CGN);
+ bool isSafeToUpdateAllCallers(Function *F);
+ Function *cloneFunctionBody(Function *F, const StructType *STy);
+ void updateCallSites(Function *F, Function *NF);
+ bool nestedStructType(const StructType *STy);
+ };
+}
+
+char SRETPromotion::ID = 0;
+static RegisterPass<SRETPromotion>
+X("sretpromotion", "Promote sret arguments to multiple ret values");
+
+Pass *llvm::createStructRetPromotionPass() {
+ return new SRETPromotion();
+}
+
+bool SRETPromotion::runOnSCC(const std::vector<CallGraphNode *> &SCC) {
+ bool Changed = false;
+
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ Changed |= PromoteReturn(SCC[i]);
+
+ return Changed;
+}
+
+/// PromoteReturn - This method promotes a function that uses a StructRet
+/// parameter into a function that uses multiple return values.
+bool SRETPromotion::PromoteReturn(CallGraphNode *CGN) {
+ Function *F = CGN->getFunction();
+
+ if (!F || F->isDeclaration() || !F->hasLocalLinkage())
+ return false;
+
+  // Make sure that the function returns a struct.
+ if (F->arg_size() == 0 || !F->hasStructRetAttr() || F->doesNotReturn())
+ return false;
+
+ DOUT << "SretPromotion: Looking at sret function " << F->getNameStart() << "\n";
+
+ assert (F->getReturnType() == Type::VoidTy && "Invalid function return type");
+ Function::arg_iterator AI = F->arg_begin();
+ const llvm::PointerType *FArgType = dyn_cast<PointerType>(AI->getType());
+ assert (FArgType && "Invalid sret parameter type");
+ const llvm::StructType *STy =
+ dyn_cast<StructType>(FArgType->getElementType());
+ assert (STy && "Invalid sret parameter element type");
+
+ // Check if it is ok to perform this promotion.
+  if (!isSafeToUpdateAllCallers(F)) {
+ DOUT << "SretPromotion: Not all callers can be updated\n";
+ NumRejectedSRETUses++;
+ return false;
+ }
+
+ DOUT << "SretPromotion: sret argument will be promoted\n";
+ NumSRET++;
+ // [1] Replace use of sret parameter
+ AllocaInst *TheAlloca = new AllocaInst (STy, NULL, "mrv",
+ F->getEntryBlock().begin());
+ Value *NFirstArg = F->arg_begin();
+ NFirstArg->replaceAllUsesWith(TheAlloca);
+
+ // [2] Find and replace ret instructions
+ for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
+ for(BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) {
+ Instruction *I = BI;
+ ++BI;
+ if (isa<ReturnInst>(I)) {
+ Value *NV = new LoadInst(TheAlloca, "mrv.ld", I);
+ ReturnInst *NR = ReturnInst::Create(NV, I);
+ I->replaceAllUsesWith(NR);
+ I->eraseFromParent();
+ }
+ }
+
+ // [3] Create the new function body and insert it into the module.
+ Function *NF = cloneFunctionBody(F, STy);
+
+ // [4] Update all call sites to use new function
+ updateCallSites(F, NF);
+
+ F->eraseFromParent();
+ getAnalysis<CallGraph>().changeFunction(F, NF);
+ return true;
+}
+
+// Check if it is ok to perform this promotion.
+bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) {
+
+ if (F->use_empty())
+ // No users. OK to modify signature.
+ return true;
+
+ for (Value::use_iterator FnUseI = F->use_begin(), FnUseE = F->use_end();
+ FnUseI != FnUseE; ++FnUseI) {
+    // If the function is passed in as an argument to (possibly) another
+    // function, we can't change it!
+ CallSite CS = CallSite::get(*FnUseI);
+ Instruction *Call = CS.getInstruction();
+    // If the function is used by something other than a call or invoke
+    // instruction, we can't change it!
+ if (!Call || !CS.isCallee(FnUseI))
+ return false;
+ CallSite::arg_iterator AI = CS.arg_begin();
+ Value *FirstArg = *AI;
+
+ if (!isa<AllocaInst>(FirstArg))
+ return false;
+
+ // Check FirstArg's users.
+ for (Value::use_iterator ArgI = FirstArg->use_begin(),
+ ArgE = FirstArg->use_end(); ArgI != ArgE; ++ArgI) {
+
+      // If a user of FirstArg is a CallInst that does not correspond to the
+      // current call site, then this function F is not suitable for sret
+      // promotion.
+ if (CallInst *CI = dyn_cast<CallInst>(ArgI)) {
+ if (CI != Call)
+ return false;
+ }
+      // If a user of FirstArg is a GEP, then all of the GEP's users must be
+      // LoadInsts; otherwise this function F is not suitable for sret
+      // promotion.
+ else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(ArgI)) {
+ // TODO : Use dom info and insert PHINodes to collect get results
+ // from multiple call sites for this GEP.
+ if (GEP->getParent() != Call->getParent())
+ return false;
+ for (Value::use_iterator GEPI = GEP->use_begin(), GEPE = GEP->use_end();
+ GEPI != GEPE; ++GEPI)
+ if (!isa<LoadInst>(GEPI))
+ return false;
+ }
+ // Any other FirstArg users make this function unsuitable for sret
+ // promotion.
+ else
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// cloneFunctionBody - Create a new function based on F and insert it into
+/// the module. Remove the first argument. Use STy as the return type for
+/// the new function.
+Function *SRETPromotion::cloneFunctionBody(Function *F,
+ const StructType *STy) {
+
+ const FunctionType *FTy = F->getFunctionType();
+ std::vector<const Type*> Params;
+
+ // Attributes - Keep track of the parameter attributes for the arguments.
+ SmallVector<AttributeWithIndex, 8> AttributesVec;
+ const AttrListPtr &PAL = F->getAttributes();
+
+ // Add any return attributes.
+ if (Attributes attrs = PAL.getRetAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+
+ // Skip first argument.
+ Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ ++I;
+ // Attribute index 0 is reserved for the return type; index 1 belongs to
+ // the first (sret) argument, which is being removed.
+ unsigned ParamIndex = 2;
+ while (I != E) {
+ Params.push_back(I->getType());
+ if (Attributes Attrs = PAL.getParamAttributes(ParamIndex))
+ AttributesVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs));
+ ++I;
+ ++ParamIndex;
+ }
+
+ // Add any fn attributes.
+ if (Attributes attrs = PAL.getFnAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+
+ FunctionType *NFTy = FunctionType::get(STy, Params, FTy->isVarArg());
+ Function *NF = Function::Create(NFTy, F->getLinkage());
+ NF->takeName(F);
+ NF->copyAttributesFrom(F);
+ NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()));
+ F->getParent()->getFunctionList().insert(F, NF);
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+ // Replace arguments
+ I = F->arg_begin();
+ E = F->arg_end();
+ Function::arg_iterator NI = NF->arg_begin();
+ ++I;
+ while (I != E) {
+ I->replaceAllUsesWith(NI);
+ NI->takeName(I);
+ ++I;
+ ++NI;
+ }
+
+ return NF;
+}
+
+/// updateCallSites - Update all sites that call F to use NF.
+void SRETPromotion::updateCallSites(Function *F, Function *NF) {
+ CallGraph &CG = getAnalysis<CallGraph>();
+ SmallVector<Value*, 16> Args;
+
+ // Attributes - Keep track of the parameter attributes for the arguments.
+ SmallVector<AttributeWithIndex, 8> ArgAttrsVec;
+
+ while (!F->use_empty()) {
+ CallSite CS = CallSite::get(*F->use_begin());
+ Instruction *Call = CS.getInstruction();
+
+ const AttrListPtr &PAL = F->getAttributes();
+ // Add any return attributes.
+ if (Attributes attrs = PAL.getRetAttributes())
+ ArgAttrsVec.push_back(AttributeWithIndex::get(0, attrs));
+
+ // Copy the arguments, skipping the first (sret) one.
+ CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+ Value *FirstCArg = *AI;
+ ++AI;
+ // Attribute index 0 is reserved for the return type; index 1 belongs to
+ // the first (sret) argument, which is being removed.
+ unsigned ParamIndex = 2;
+ while (AI != AE) {
+ Args.push_back(*AI);
+ if (Attributes Attrs = PAL.getParamAttributes(ParamIndex))
+ ArgAttrsVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs));
+ ++ParamIndex;
+ ++AI;
+ }
+
+ // Add any function attributes.
+ if (Attributes attrs = PAL.getFnAttributes())
+ ArgAttrsVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+ AttrListPtr NewPAL = AttrListPtr::get(ArgAttrsVec.begin(), ArgAttrsVec.end());
+
+ // Build new call instruction.
+ Instruction *New;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args.begin(), Args.end(), "", Call);
+ cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+ cast<InvokeInst>(New)->setAttributes(NewPAL);
+ } else {
+ New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+ cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+ cast<CallInst>(New)->setAttributes(NewPAL);
+ if (cast<CallInst>(Call)->isTailCall())
+ cast<CallInst>(New)->setTailCall();
+ }
+ Args.clear();
+ ArgAttrsVec.clear();
+ New->takeName(Call);
+
+ // Update the callgraph to know that the callsite has been transformed.
+ CG[Call->getParent()->getParent()]->replaceCallSite(Call, New);
+
+ // Update all users of the sret parameter to extract values via extractvalue.
+ for (Value::use_iterator UI = FirstCArg->use_begin(),
+ UE = FirstCArg->use_end(); UI != UE; ) {
+ User *U2 = *UI++;
+ CallInst *C2 = dyn_cast<CallInst>(U2);
+ if (C2 && (C2 == Call))
+ continue;
+ else if (GetElementPtrInst *UGEP = dyn_cast<GetElementPtrInst>(U2)) {
+ ConstantInt *Idx = dyn_cast<ConstantInt>(UGEP->getOperand(2));
+ assert(Idx && "Unexpected getelementptr index!");
+ Value *GR = ExtractValueInst::Create(New, Idx->getZExtValue(),
+ "evi", UGEP);
+ while(!UGEP->use_empty()) {
+ // isSafeToUpdateAllCallers has checked that all GEP uses are
+ // LoadInsts
+ LoadInst *L = cast<LoadInst>(*UGEP->use_begin());
+ L->replaceAllUsesWith(GR);
+ L->eraseFromParent();
+ }
+ UGEP->eraseFromParent();
+ }
+ else assert(0 && "Unexpected sret parameter use");
+ }
+ Call->eraseFromParent();
+ }
+}
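+
+// As an illustration (the struct type and value names here are made up),
+// promotion rewrites a call site such as
+//
+//   %res = alloca { i32, i32 }
+//   call void @foo({ i32, i32 }* %res)        ; sret parameter
+//   %p = getelementptr { i32, i32 }* %res, i32 0, i32 1
+//   %v = load i32* %p
+//
+// into
+//
+//   %mrv = call { i32, i32 } @foo()
+//   %v = extractvalue { i32, i32 } %mrv, 1
+//
+// while the callee itself gains an alloca that is loaded and returned in
+// place of the old "ret void".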
+
+/// nestedStructType - Return true if STy includes any
+/// other aggregate types
+bool SRETPromotion::nestedStructType(const StructType *STy) {
+ unsigned Num = STy->getNumElements();
+ for (unsigned i = 0; i < Num; i++) {
+ const Type *Ty = STy->getElementType(i);
+ if (!Ty->isSingleValueType() && Ty != Type::VoidTy)
+ return true;
+ }
+ return false;
+}
diff --git a/lib/Transforms/Instrumentation/BlockProfiling.cpp b/lib/Transforms/Instrumentation/BlockProfiling.cpp
new file mode 100644
index 0000000..2bd9809
--- /dev/null
+++ b/lib/Transforms/Instrumentation/BlockProfiling.cpp
@@ -0,0 +1,126 @@
+//===- BlockProfiling.cpp - Insert counters for block profiling -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass instruments the specified program with counters for basic block or
+// function profiling. This is the most basic form of profiling, which can tell
+// which blocks are hot, but cannot reliably detect hot paths through the CFG.
+// Block profiling counts the number of times each basic block executes, and
+// function profiling counts the number of times each function is called.
+//
+// Note that this implementation is very naive. Control equivalent regions of
+// the CFG should not require duplicate counters, but we do put duplicate
+// counters in.
+//
+//===----------------------------------------------------------------------===//
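+
+// Schematically, for function profiling the pass below produces (a sketch;
+// the names match the code):
+//
+//   @FuncProfCounters = internal global [N x i32] zeroinitializer
+//   ; entry of the i'th function:  FuncProfCounters[i] += 1
+//   ; start of main:               call @llvm_start_func_profiling(...)
+//
+// Block profiling is analogous, with one counter per basic block.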
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "RSProfiling.h"
+#include "ProfilingUtils.h"
+using namespace llvm;
+
+namespace {
+ class VISIBILITY_HIDDEN FunctionProfiler : public RSProfilers_std {
+ public:
+ static char ID;
+ bool runOnModule(Module &M);
+ };
+}
+
+char FunctionProfiler::ID = 0;
+
+static RegisterPass<FunctionProfiler>
+X("insert-function-profiling",
+ "Insert instrumentation for function profiling");
+static RegisterAnalysisGroup<RSProfilers> XG(X);
+
+ModulePass *llvm::createFunctionProfilerPass() {
+ return new FunctionProfiler();
+}
+
+bool FunctionProfiler::runOnModule(Module &M) {
+ Function *Main = M.getFunction("main");
+ if (Main == 0) {
+ cerr << "WARNING: cannot insert function profiling into a module"
+ << " with no main function!\n";
+ return false; // No main, no instrumentation!
+ }
+
+ unsigned NumFunctions = 0;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!I->isDeclaration())
+ ++NumFunctions;
+
+ const Type *ATy = ArrayType::get(Type::Int32Ty, NumFunctions);
+ GlobalVariable *Counters =
+ new GlobalVariable(ATy, false, GlobalValue::InternalLinkage,
+ Constant::getNullValue(ATy), "FuncProfCounters", &M);
+
+ // Instrument all of the functions...
+ unsigned i = 0;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!I->isDeclaration())
+ // Insert counter at the start of the function
+ IncrementCounterInBlock(&I->getEntryBlock(), i++, Counters);
+
+ // Add the initialization call to main.
+ InsertProfilingInitCall(Main, "llvm_start_func_profiling", Counters);
+ return true;
+}
+
+
+namespace {
+ class BlockProfiler : public RSProfilers_std {
+ bool runOnModule(Module &M);
+ public:
+ static char ID;
+ };
+}
+
+char BlockProfiler::ID = 0;
+static RegisterPass<BlockProfiler>
+Y("insert-block-profiling", "Insert instrumentation for block profiling");
+static RegisterAnalysisGroup<RSProfilers> YG(Y);
+
+ModulePass *llvm::createBlockProfilerPass() { return new BlockProfiler(); }
+
+bool BlockProfiler::runOnModule(Module &M) {
+ Function *Main = M.getFunction("main");
+ if (Main == 0) {
+ cerr << "WARNING: cannot insert block profiling into a module"
+ << " with no main function!\n";
+ return false; // No main, no instrumentation!
+ }
+
+ unsigned NumBlocks = 0;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ NumBlocks += I->size();
+
+ const Type *ATy = ArrayType::get(Type::Int32Ty, NumBlocks);
+ GlobalVariable *Counters =
+ new GlobalVariable(ATy, false, GlobalValue::InternalLinkage,
+ Constant::getNullValue(ATy), "BlockProfCounters", &M);
+
+ // Instrument all of the blocks...
+ unsigned i = 0;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ for (Function::iterator BB = I->begin(), E = I->end(); BB != E; ++BB)
+ // Insert counter at the start of the block
+ IncrementCounterInBlock(BB, i++, Counters);
+
+ // Add the initialization call to main.
+ InsertProfilingInitCall(Main, "llvm_start_block_profiling", Counters);
+ return true;
+}
+
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
new file mode 100644
index 0000000..d7c518d
--- /dev/null
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_llvm_library(LLVMInstrumentation
+ BlockProfiling.cpp
+ EdgeProfiling.cpp
+ ProfilingUtils.cpp
+ RSProfiling.cpp
+ )
diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
new file mode 100644
index 0000000..0831f3b
--- /dev/null
+++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
@@ -0,0 +1,101 @@
+//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass instruments the specified program with counters for edge profiling.
+// Edge profiling can give a reasonable approximation of the hot paths through a
+// program, and is used for a wide variety of program transformations.
+//
+// Note that this implementation is very naive. We insert a counter for *every*
+// edge in the program, instead of using control flow information to prune the
+// number of counters inserted.
+//
+//===----------------------------------------------------------------------===//
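+
+// For instance, given a conditional branch A -> {B, C} where B also has a
+// second predecessor, the edge A->B is critical: a counter in A would also
+// count A->C, and a counter in B would also count B's other incoming edge.
+// Splitting inserts a fresh block on A->B, which can hold a counter that
+// counts exactly that one edge.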
+
+#include "ProfilingUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include <set>
+using namespace llvm;
+
+namespace {
+ class VISIBILITY_HIDDEN EdgeProfiler : public ModulePass {
+ bool runOnModule(Module &M);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ EdgeProfiler() : ModulePass(&ID) {}
+ };
+}
+
+char EdgeProfiler::ID = 0;
+static RegisterPass<EdgeProfiler>
+X("insert-edge-profiling", "Insert instrumentation for edge profiling");
+
+ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); }
+
+bool EdgeProfiler::runOnModule(Module &M) {
+ Function *Main = M.getFunction("main");
+ if (Main == 0) {
+ cerr << "WARNING: cannot insert edge profiling into a module"
+ << " with no main function!\n";
+ return false; // No main, no instrumentation!
+ }
+
+ std::set<BasicBlock*> BlocksToInstrument;
+ unsigned NumEdges = 0;
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ // Keep track of which blocks need to be instrumented. We don't want to
+ // instrument blocks that are added as the result of breaking critical
+ // edges!
+ BlocksToInstrument.insert(BB);
+ NumEdges += BB->getTerminator()->getNumSuccessors();
+ }
+
+ const Type *ATy = ArrayType::get(Type::Int32Ty, NumEdges);
+ GlobalVariable *Counters =
+ new GlobalVariable(ATy, false, GlobalValue::InternalLinkage,
+ Constant::getNullValue(ATy), "EdgeProfCounters", &M);
+
+ // Instrument all of the edges...
+ unsigned i = 0;
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ if (BlocksToInstrument.count(BB)) { // Don't instrument inserted blocks
+ // Okay, we have to add a counter for each outgoing edge. If the
+ // outgoing edge is not critical, don't split it; just insert the counter
+ // in the source or destination of the edge.
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+ // If the edge is critical, split it.
+ SplitCriticalEdge(TI, s, this);
+
+ // Okay, we are guaranteed that the edge is no longer critical. If we
+ // only have a single successor, insert the counter in this block,
+ // otherwise insert it in the successor block.
+ if (TI->getNumSuccessors() == 1) {
+ // Insert counter at the start of the block
+ IncrementCounterInBlock(BB, i++, Counters);
+ } else {
+ // Insert counter at the start of the successor block
+ IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters);
+ }
+ }
+ }
+
+ // Add the initialization call to main.
+ InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters);
+ return true;
+}
+
diff --git a/lib/Transforms/Instrumentation/Makefile b/lib/Transforms/Instrumentation/Makefile
new file mode 100644
index 0000000..6cbc7a9
--- /dev/null
+++ b/lib/Transforms/Instrumentation/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Instrumentation/Makefile -------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMInstrumentation
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
new file mode 100644
index 0000000..48071f1
--- /dev/null
+++ b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
@@ -0,0 +1,120 @@
+//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a few helper functions which are used by profile
+// instrumentation code to instrument the code. This allows the profiler pass
+// to worry about *what* to insert, and these functions take care of *how* to do
+// it.
+//
+//===----------------------------------------------------------------------===//
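+
+// The profiling runtime entry point inserted into main has, in C terms, the
+// signature implied by the getOrInsertFunction call below:
+//
+//   int llvm_start_xxx_profiling(int argc, char **argv,
+//                                unsigned *counters, unsigned num);
+//
+// where xxx stands for whatever FnName each instrumentation pass passes in.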
+
+#include "ProfilingUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+
+void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
+ GlobalValue *Array) {
+ const Type *ArgVTy =
+ PointerType::getUnqual(PointerType::getUnqual(Type::Int8Ty));
+ const PointerType *UIntPtr = PointerType::getUnqual(Type::Int32Ty);
+ Module &M = *MainFn->getParent();
+ Constant *InitFn = M.getOrInsertFunction(FnName, Type::Int32Ty, Type::Int32Ty,
+ ArgVTy, UIntPtr, Type::Int32Ty,
+ (Type *)0);
+
+ // This could force argc and argv into programs that wouldn't otherwise have
+ // them, but instead we just pass null values in.
+ std::vector<Value*> Args(4);
+ Args[0] = Constant::getNullValue(Type::Int32Ty);
+ Args[1] = Constant::getNullValue(ArgVTy);
+
+ // Skip over any allocas in the entry block.
+ BasicBlock *Entry = MainFn->begin();
+ BasicBlock::iterator InsertPos = Entry->begin();
+ while (isa<AllocaInst>(InsertPos)) ++InsertPos;
+
+ std::vector<Constant*> GEPIndices(2, Constant::getNullValue(Type::Int32Ty));
+ unsigned NumElements = 0;
+ if (Array) {
+ Args[2] = ConstantExpr::getGetElementPtr(Array, &GEPIndices[0],
+ GEPIndices.size());
+ NumElements =
+ cast<ArrayType>(Array->getType()->getElementType())->getNumElements();
+ } else {
+ // If this profiling instrumentation doesn't have a constant array, just
+ // pass null.
+ Args[2] = ConstantPointerNull::get(UIntPtr);
+ }
+ Args[3] = ConstantInt::get(Type::Int32Ty, NumElements);
+
+ Instruction *InitCall = CallInst::Create(InitFn, Args.begin(), Args.end(),
+ "newargc", InsertPos);
+
+ // If argc or argv are not available in main, just pass null values in.
+ Function::arg_iterator AI;
+ switch (MainFn->arg_size()) {
+ default:
+ case 2:
+ AI = MainFn->arg_begin(); ++AI;
+ if (AI->getType() != ArgVTy) {
+ Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy,
+ false);
+ InitCall->setOperand(2,
+ CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall));
+ } else {
+ InitCall->setOperand(2, AI);
+ }
+ /* FALL THROUGH */
+
+ case 1:
+ AI = MainFn->arg_begin();
+ // If the program looked at argc, have it look at the return value of the
+ // init call instead.
+ if (AI->getType() != Type::Int32Ty) {
+ Instruction::CastOps opcode;
+ if (!AI->use_empty()) {
+ opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true);
+ AI->replaceAllUsesWith(
+ CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos));
+ }
+ opcode = CastInst::getCastOpcode(AI, true, Type::Int32Ty, true);
+ InitCall->setOperand(1,
+ CastInst::Create(opcode, AI, Type::Int32Ty, "argc.cast", InitCall));
+ } else {
+ AI->replaceAllUsesWith(InitCall);
+ InitCall->setOperand(1, AI);
+ }
+
+ case 0: break;
+ }
+}
+
+void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+ GlobalValue *CounterArray) {
+ // Insert the increment after any alloca or PHI instructions...
+ BasicBlock::iterator InsertPos = BB->getFirstNonPHI();
+ while (isa<AllocaInst>(InsertPos))
+ ++InsertPos;
+
+ // Create the getelementptr constant expression
+ std::vector<Constant*> Indices(2);
+ Indices[0] = Constant::getNullValue(Type::Int32Ty);
+ Indices[1] = ConstantInt::get(Type::Int32Ty, CounterNum);
+ Constant *ElementPtr =
+ ConstantExpr::getGetElementPtr(CounterArray, &Indices[0], Indices.size());
+
+ // Load, increment and store the value back.
+ Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
+ Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal,
+ ConstantInt::get(Type::Int32Ty, 1),
+ "NewFuncCounter", InsertPos);
+ new StoreInst(NewVal, ElementPtr, InsertPos);
+}
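+
+// The sequence emitted at the insertion point is, schematically:
+//
+//   %OldFuncCounter = load i32* <&CounterArray[CounterNum]>
+//   %NewFuncCounter = add i32 %OldFuncCounter, 1
+//   store i32 %NewFuncCounter, i32* <&CounterArray[CounterNum]>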
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.h b/lib/Transforms/Instrumentation/ProfilingUtils.h
new file mode 100644
index 0000000..94efffe
--- /dev/null
+++ b/lib/Transforms/Instrumentation/ProfilingUtils.h
@@ -0,0 +1,31 @@
+//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a few helper functions which are used by profile
+// instrumentation code to instrument the code. This allows the profiler pass
+// to worry about *what* to insert, and these functions take care of *how* to do
+// it.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PROFILINGUTILS_H
+#define PROFILINGUTILS_H
+
+namespace llvm {
+ class Function;
+ class GlobalValue;
+ class BasicBlock;
+
+ void InsertProfilingInitCall(Function *MainFn, const char *FnName,
+ GlobalValue *Arr = 0);
+ void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+ GlobalValue *CounterArray);
+}
+
+#endif
diff --git a/lib/Transforms/Instrumentation/RSProfiling.cpp b/lib/Transforms/Instrumentation/RSProfiling.cpp
new file mode 100644
index 0000000..c6cf4df
--- /dev/null
+++ b/lib/Transforms/Instrumentation/RSProfiling.cpp
@@ -0,0 +1,653 @@
+//===- RSProfiling.cpp - Various profiling using random sampling ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// These passes implement random-sampling-based profiling. Different methods
+// of choosing when to sample are supported, as well as different types of
+// profiling. This is done as two passes. The first is a sequence of profiling
+// passes which insert profiling into the program, and remember what they
+// inserted.
+//
+// The second stage duplicates all instructions in a function, ignoring the
+// profiling code, then connects the two versions together at the entry and at
+// backedges. At each connection point a choice is made as to whether to jump
+// to the profiled code (take a sample) or execute the unprofiled code.
+//
+// It is highly recommended that after this pass one runs mem2reg and adce
+// (instcombine, load-vn, gdce, and dse are also good to run afterwards).
+//
+// This design is intended to make the profiling passes independent of the RS
+// framework, but any profiling pass that implements the RSProfiling interface
+// is compatible with the rs framework (and thus can be sampled)
+//
+// TODO: obviously the block and function profiling are almost identical to the
+// existing ones, so they can be unified (especially since these passes are valid
+// without the rs framework).
+// TODO: Fix choice code so that frequency is not hard coded
+//
+//===----------------------------------------------------------------------===//
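+
+// Sketch of one transformed backedge B -> A (primes denote the unprofiled
+// duplicates): a new block C is placed on B -> A and falls through into the
+// duplicate A', so a sampled iteration drops back into unprofiled code, while
+// the corresponding C' on B' -> A' holds the Chooser-controlled conditional
+// that either jumps to A (take a sample) or stays in A'.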
+
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "RSProfiling.h"
+#include <set>
+#include <map>
+#include <queue>
+using namespace llvm;
+
+namespace {
+ enum RandomMeth {
+ GBV, GBVO, HOSTCC
+ };
+}
+
+static cl::opt<RandomMeth> RandomMethod("profile-randomness",
+ cl::desc("How to randomly choose to profile:"),
+ cl::values(
+ clEnumValN(GBV, "global", "global counter"),
+ clEnumValN(GBVO, "ra_global",
+ "register allocated global counter"),
+ clEnumValN(HOSTCC, "rdcc", "cycle counter"),
+ clEnumValEnd));
+
+namespace {
+ /// NullProfilerRS - The basic profiler that does nothing. It is the default
+ /// profiler and thus terminates RSProfiler chains. It is useful for
+ /// measuring framework overhead
+ class VISIBILITY_HIDDEN NullProfilerRS : public RSProfilers {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ bool isProfiling(Value* v) {
+ return false;
+ }
+ bool runOnModule(Module &M) {
+ return false;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+static RegisterAnalysisGroup<RSProfilers> A("Profiling passes");
+static RegisterPass<NullProfilerRS> NP("insert-null-profiling-rs",
+ "Measure profiling framework overhead");
+static RegisterAnalysisGroup<RSProfilers, true> NPT(NP);
+
+namespace {
+ /// Chooser - Something that chooses when to take a sample of the profiled code
+ class VISIBILITY_HIDDEN Chooser {
+ public:
+ /// ProcessChoicePoint - is called for each basic block inserted to choose
+ /// between normal and sample code
+ virtual void ProcessChoicePoint(BasicBlock*) = 0;
+ /// PrepFunction - is called once per function before other work is done.
+ /// This gives the opportunity to insert new allocas and such.
+ virtual void PrepFunction(Function*) = 0;
+ virtual ~Chooser() {}
+ };
+
+ //Things that implement sampling policies
+ //A global value that is read-mod-stored to choose when to sample.
+ //A sample is taken when the global counter hits 0
+ class VISIBILITY_HIDDEN GlobalRandomCounter : public Chooser {
+ GlobalVariable* Counter;
+ Value* ResetValue;
+ const Type* T;
+ public:
+ GlobalRandomCounter(Module& M, const Type* t, uint64_t resetval);
+ virtual ~GlobalRandomCounter();
+ virtual void PrepFunction(Function* F);
+ virtual void ProcessChoicePoint(BasicBlock* bb);
+ };
+
+ //Same as GRC, but allows register allocation of the global counter
+ class VISIBILITY_HIDDEN GlobalRandomCounterOpt : public Chooser {
+ GlobalVariable* Counter;
+ Value* ResetValue;
+ AllocaInst* AI;
+ const Type* T;
+ public:
+ GlobalRandomCounterOpt(Module& M, const Type* t, uint64_t resetval);
+ virtual ~GlobalRandomCounterOpt();
+ virtual void PrepFunction(Function* F);
+ virtual void ProcessChoicePoint(BasicBlock* bb);
+ };
+
+ //Use the cycle counter intrinsic as a source of pseudo randomness when
+ //deciding when to sample.
+ class VISIBILITY_HIDDEN CycleCounter : public Chooser {
+ uint64_t rm;
+ Constant *F;
+ public:
+ CycleCounter(Module& m, uint64_t resetmask);
+ virtual ~CycleCounter();
+ virtual void PrepFunction(Function* F);
+ virtual void ProcessChoicePoint(BasicBlock* bb);
+ };
+
+ /// ProfilerRS - Insert the random sampling framework
+ struct VISIBILITY_HIDDEN ProfilerRS : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ ProfilerRS() : FunctionPass(&ID) {}
+
+ std::map<Value*, Value*> TransCache;
+ std::set<BasicBlock*> ChoicePoints;
+ Chooser* c;
+
+ //Translate and duplicate values for the new profile-free version of the code
+ Value* Translate(Value* v);
+ //Duplicate an entire function (without profiling)
+ void Duplicate(Function& F, RSProfilers& LI);
+ //Called once for each backedge; handles the insertion of choice points and
+ //the interconnection of the two versions of the code
+ void ProcessBackEdge(BasicBlock* src, BasicBlock* dst, Function& F);
+ bool runOnFunction(Function& F);
+ bool doInitialization(Module &M);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ };
+}
+
+static RegisterPass<ProfilerRS>
+X("insert-rs-profiling-framework",
+ "Insert random sampling instrumentation framework");
+
+char RSProfilers::ID = 0;
+char NullProfilerRS::ID = 0;
+char ProfilerRS::ID = 0;
+
+//Local utilities
+static void ReplacePhiPred(BasicBlock* btarget,
+ BasicBlock* bold, BasicBlock* bnew);
+
+static void CollapsePhi(BasicBlock* btarget, BasicBlock* bsrc);
+
+template<class T>
+static void recBackEdge(BasicBlock* bb, T& BackEdges,
+ std::map<BasicBlock*, int>& color,
+ std::map<BasicBlock*, int>& depth,
+ std::map<BasicBlock*, int>& finish,
+ int& time);
+
+//find the back edges and where they go to
+template<class T>
+static void getBackEdges(Function& F, T& BackEdges);
+
+
+///////////////////////////////////////
+// Methods of choosing when to profile
+///////////////////////////////////////
+
+GlobalRandomCounter::GlobalRandomCounter(Module& M, const Type* t,
+ uint64_t resetval) : T(t) {
+ ConstantInt* Init = ConstantInt::get(T, resetval);
+ ResetValue = Init;
+ Counter = new GlobalVariable(T, false, GlobalValue::InternalLinkage,
+ Init, "RandomSteeringCounter", &M);
+}
+
+GlobalRandomCounter::~GlobalRandomCounter() {}
+
+void GlobalRandomCounter::PrepFunction(Function* F) {}
+
+void GlobalRandomCounter::ProcessChoicePoint(BasicBlock* bb) {
+ BranchInst* t = cast<BranchInst>(bb->getTerminator());
+
+ //decrement counter
+ LoadInst* l = new LoadInst(Counter, "counter", t);
+
+ ICmpInst* s = new ICmpInst(ICmpInst::ICMP_EQ, l, ConstantInt::get(T, 0),
+ "countercc", t);
+
+ Value* nv = BinaryOperator::CreateSub(l, ConstantInt::get(T, 1),
+ "counternew", t);
+ new StoreInst(nv, Counter, t);
+ t->setCondition(s);
+
+ //reset counter
+ BasicBlock* oldnext = t->getSuccessor(0);
+ BasicBlock* resetblock = BasicBlock::Create("reset", oldnext->getParent(),
+ oldnext);
+ TerminatorInst* t2 = BranchInst::Create(oldnext, resetblock);
+ t->setSuccessor(0, resetblock);
+ new StoreInst(ResetValue, Counter, t2);
+ ReplacePhiPred(oldnext, bb, resetblock);
+}
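+
+// After rewriting, a choice point ends, roughly, with:
+//
+//   %counter    = load i32* @RandomSteeringCounter
+//   %countercc  = icmp eq i32 %counter, 0
+//   %counternew = sub i32 %counter, 1
+//   store i32 %counternew, i32* @RandomSteeringCounter
+//   br i1 %countercc, label %reset, label %<old successor>
+//
+// with the %reset block storing the reset value back before continuing.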
+
+GlobalRandomCounterOpt::GlobalRandomCounterOpt(Module& M, const Type* t,
+ uint64_t resetval)
+ : AI(0), T(t) {
+ ConstantInt* Init = ConstantInt::get(T, resetval);
+ ResetValue = Init;
+ Counter = new GlobalVariable(T, false, GlobalValue::InternalLinkage,
+ Init, "RandomSteeringCounter", &M);
+}
+
+GlobalRandomCounterOpt::~GlobalRandomCounterOpt() {}
+
+void GlobalRandomCounterOpt::PrepFunction(Function* F) {
+ //make a local temporary to cache the global
+ BasicBlock& bb = F->getEntryBlock();
+ BasicBlock::iterator InsertPt = bb.begin();
+ AI = new AllocaInst(T, 0, "localcounter", InsertPt);
+ LoadInst* l = new LoadInst(Counter, "counterload", InsertPt);
+ new StoreInst(l, AI, InsertPt);
+
+ //around every call, invoke, unwind, and return, copy the local counter
+ //to/from the global variable
+ for(Function::iterator fib = F->begin(), fie = F->end();
+ fib != fie; ++fib)
+ for(BasicBlock::iterator bib = fib->begin(), bie = fib->end();
+ bib != bie; ++bib)
+ if (isa<CallInst>(bib)) {
+ LoadInst* l = new LoadInst(AI, "counter", bib);
+ new StoreInst(l, Counter, bib);
+ l = new LoadInst(Counter, "counter", ++bib);
+ new StoreInst(l, AI, bib--);
+ } else if (isa<InvokeInst>(bib)) {
+ LoadInst* l = new LoadInst(AI, "counter", bib);
+ new StoreInst(l, Counter, bib);
+
+ BasicBlock* bb = cast<InvokeInst>(bib)->getNormalDest();
+ BasicBlock::iterator i = bb->getFirstNonPHI();
+ l = new LoadInst(Counter, "counter", i);
+ new StoreInst(l, AI, i); //refresh the local counter on the normal path too
+
+ bb = cast<InvokeInst>(bib)->getUnwindDest();
+ i = bb->getFirstNonPHI();
+ l = new LoadInst(Counter, "counter", i);
+ new StoreInst(l, AI, i);
+ } else if (isa<UnwindInst>(&*bib) || isa<ReturnInst>(&*bib)) {
+ LoadInst* l = new LoadInst(AI, "counter", bib);
+ new StoreInst(l, Counter, bib);
+ }
+}
+
+void GlobalRandomCounterOpt::ProcessChoicePoint(BasicBlock* bb) {
+ BranchInst* t = cast<BranchInst>(bb->getTerminator());
+
+ //decrement counter
+ LoadInst* l = new LoadInst(AI, "counter", t);
+
+ ICmpInst* s = new ICmpInst(ICmpInst::ICMP_EQ, l, ConstantInt::get(T, 0),
+ "countercc", t);
+
+ Value* nv = BinaryOperator::CreateSub(l, ConstantInt::get(T, 1),
+ "counternew", t);
+ new StoreInst(nv, AI, t);
+ t->setCondition(s);
+
+ //reset counter
+ BasicBlock* oldnext = t->getSuccessor(0);
+ BasicBlock* resetblock = BasicBlock::Create("reset", oldnext->getParent(),
+ oldnext);
+ TerminatorInst* t2 = BranchInst::Create(oldnext, resetblock);
+ t->setSuccessor(0, resetblock);
+ new StoreInst(ResetValue, AI, t2);
+ ReplacePhiPred(oldnext, bb, resetblock);
+}
+
+
+CycleCounter::CycleCounter(Module& m, uint64_t resetmask) : rm(resetmask) {
+ F = Intrinsic::getDeclaration(&m, Intrinsic::readcyclecounter);
+}
+
+CycleCounter::~CycleCounter() {}
+
+void CycleCounter::PrepFunction(Function* F) {}
+
+void CycleCounter::ProcessChoicePoint(BasicBlock* bb) {
+ BranchInst* t = cast<BranchInst>(bb->getTerminator());
+
+ CallInst* c = CallInst::Create(F, "rdcc", t);
+ BinaryOperator* b =
+ BinaryOperator::CreateAnd(c, ConstantInt::get(Type::Int64Ty, rm),
+ "mrdcc", t);
+
+ ICmpInst *s = new ICmpInst(ICmpInst::ICMP_EQ, b,
+ ConstantInt::get(Type::Int64Ty, 0),
+ "mrdccc", t);
+
+ t->setCondition(s);
+}
+
+///////////////////////////////////////
+// Profiling:
+///////////////////////////////////////
+bool RSProfilers_std::isProfiling(Value* v) {
+ if (profcode.find(v) != profcode.end())
+ return true;
+ //else
+ RSProfilers& LI = getAnalysis<RSProfilers>();
+ return LI.isProfiling(v);
+}
+
+void RSProfilers_std::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+ GlobalValue *CounterArray) {
+ // Insert the increment after any alloca or PHI instructions...
+ BasicBlock::iterator InsertPos = BB->getFirstNonPHI();
+ while (isa<AllocaInst>(InsertPos))
+ ++InsertPos;
+
+ // Create the getelementptr constant expression
+ std::vector<Constant*> Indices(2);
+ Indices[0] = Constant::getNullValue(Type::Int32Ty);
+ Indices[1] = ConstantInt::get(Type::Int32Ty, CounterNum);
+ Constant *ElementPtr = ConstantExpr::getGetElementPtr(CounterArray,
+ &Indices[0], 2);
+
+ // Load, increment and store the value back.
+ Value *OldVal = new LoadInst(ElementPtr, "OldCounter", InsertPos);
+ profcode.insert(OldVal);
+ Value *NewVal = BinaryOperator::CreateAdd(OldVal,
+ ConstantInt::get(Type::Int32Ty, 1),
+ "NewCounter", InsertPos);
+ profcode.insert(NewVal);
+ profcode.insert(new StoreInst(NewVal, ElementPtr, InsertPos));
+}
+
+void RSProfilers_std::getAnalysisUsage(AnalysisUsage &AU) const {
+ //grab any outstanding profiler, or get the null one
+ AU.addRequired<RSProfilers>();
+}
+
+///////////////////////////////////////
+// RS Framework
+///////////////////////////////////////
+
+Value* ProfilerRS::Translate(Value* v) {
+ if(TransCache[v])
+ return TransCache[v];
+
+ if (BasicBlock* bb = dyn_cast<BasicBlock>(v)) {
+ if (bb == &bb->getParent()->getEntryBlock())
+ TransCache[bb] = bb; //don't translate entry block
+ else
+ TransCache[bb] = BasicBlock::Create("dup_" + bb->getName(),
+ bb->getParent(), NULL);
+ return TransCache[bb];
+ } else if (Instruction* i = dyn_cast<Instruction>(v)) {
+ //we have already translated this
+ //do not translate entry block allocas
+ if(&i->getParent()->getParent()->getEntryBlock() == i->getParent()) {
+ TransCache[i] = i;
+ return i;
+ } else {
+ //translate this
+ Instruction* i2 = i->clone();
+ if (i->hasName())
+ i2->setName("dup_" + i->getName());
+ TransCache[i] = i2;
+ //NumNewInst++;
+ for (unsigned x = 0; x < i2->getNumOperands(); ++x)
+ i2->setOperand(x, Translate(i2->getOperand(x)));
+ return i2;
+ }
+ } else if (isa<Function>(v) || isa<Constant>(v) || isa<Argument>(v)) {
+ TransCache[v] = v;
+ return v;
+ }
+ assert(0 && "Value not handled");
+ return 0;
+}
+
+void ProfilerRS::Duplicate(Function& F, RSProfilers& LI)
+{
+ //perform a breadth first search, building up a duplicate of the code
+ std::queue<BasicBlock*> worklist;
+ std::set<BasicBlock*> seen;
+
+ //This loop ensures proper BB order, to help performance
+ for (Function::iterator fib = F.begin(), fie = F.end(); fib != fie; ++fib)
+ worklist.push(fib);
+ while (!worklist.empty()) {
+ Translate(worklist.front());
+ worklist.pop();
+ }
+
+ //remember that reg2mem created a new entry block we don't want to duplicate
+ worklist.push(F.getEntryBlock().getTerminator()->getSuccessor(0));
+ seen.insert(&F.getEntryBlock());
+
+ while (!worklist.empty()) {
+ BasicBlock* bb = worklist.front();
+ worklist.pop();
+ if(seen.find(bb) == seen.end()) {
+ BasicBlock* bbtarget = cast<BasicBlock>(Translate(bb));
+ BasicBlock::InstListType& instlist = bbtarget->getInstList();
+ for (BasicBlock::iterator iib = bb->begin(), iie = bb->end();
+ iib != iie; ++iib) {
+ //NumOldInst++;
+ if (!LI.isProfiling(&*iib)) {
+ Instruction* i = cast<Instruction>(Translate(iib));
+ instlist.insert(bbtarget->end(), i);
+ }
+ }
+ //update search state
+ seen.insert(bb);
+ TerminatorInst* ti = bb->getTerminator();
+ for (unsigned x = 0; x < ti->getNumSuccessors(); ++x) {
+ BasicBlock* bbs = ti->getSuccessor(x);
+ if (seen.find(bbs) == seen.end()) {
+ worklist.push(bbs);
+ }
+ }
+ }
+ }
+}
+
+void ProfilerRS::ProcessBackEdge(BasicBlock* src, BasicBlock* dst, Function& F) {
+ //given a backedge from B -> A, and translations A' and B',
+ //a: insert C and C'
+ //b: add branches in C to A and A' and in C' to A and A'
+ //c: mod terminators@B, replace A with C
+ //d: mod terminators@B', replace A' with C'
+ //e: mod phis@A for pred B to be pred C
+ // if multiple entries, simplify to one
+ //f: mod phis@A' for pred B' to be pred C'
+ // if multiple entries, simplify to one
+ //g: for all phis@A with pred C using x
+ // add in edge from C' using x'
+ // add in edge from C using x in A'
+
+ //a:
+ Function::iterator BBN = src; ++BBN;
+ BasicBlock* bbC = BasicBlock::Create("choice", &F, BBN);
+ //ChoicePoints.insert(bbC);
+ BBN = cast<BasicBlock>(Translate(src));
+ BasicBlock* bbCp = BasicBlock::Create("choice", &F, ++BBN);
+ ChoicePoints.insert(bbCp);
+
+ //b:
+ BranchInst::Create(cast<BasicBlock>(Translate(dst)), bbC);
+ BranchInst::Create(dst, cast<BasicBlock>(Translate(dst)),
+ ConstantInt::get(Type::Int1Ty, true), bbCp);
+ //c:
+ {
+ TerminatorInst* iB = src->getTerminator();
+ for (unsigned x = 0; x < iB->getNumSuccessors(); ++x)
+ if (iB->getSuccessor(x) == dst)
+ iB->setSuccessor(x, bbC);
+ }
+ //d:
+ {
+ TerminatorInst* iBp = cast<TerminatorInst>(Translate(src->getTerminator()));
+ for (unsigned x = 0; x < iBp->getNumSuccessors(); ++x)
+ if (iBp->getSuccessor(x) == cast<BasicBlock>(Translate(dst)))
+ iBp->setSuccessor(x, bbCp);
+ }
+ //e:
+ ReplacePhiPred(dst, src, bbC);
+ //src could be a switch, in which case we are replacing several edges with one
+ //thus collapse those edges into the Phi
+ CollapsePhi(dst, bbC);
+ //f:
+ ReplacePhiPred(cast<BasicBlock>(Translate(dst)),
+ cast<BasicBlock>(Translate(src)),bbCp);
+ CollapsePhi(cast<BasicBlock>(Translate(dst)), bbCp);
+ //g:
+ for(BasicBlock::iterator ib = dst->begin(), ie = dst->end(); ib != ie;
+ ++ib)
+ if (PHINode* phi = dyn_cast<PHINode>(&*ib)) {
+ for(unsigned x = 0; x < phi->getNumIncomingValues(); ++x)
+ if(bbC == phi->getIncomingBlock(x)) {
+ phi->addIncoming(Translate(phi->getIncomingValue(x)), bbCp);
+ cast<PHINode>(Translate(phi))->addIncoming(phi->getIncomingValue(x),
+ bbC);
+ }
+ phi->removeIncomingValue(bbC);
+ }
+}
+
+bool ProfilerRS::runOnFunction(Function& F) {
+ if (!F.isDeclaration()) {
+ std::set<std::pair<BasicBlock*, BasicBlock*> > BackEdges;
+ RSProfilers& LI = getAnalysis<RSProfilers>();
+
+ getBackEdges(F, BackEdges);
+ Duplicate(F, LI);
+ //assume that stuff worked. now connect the duplicated basic blocks
+ //with the originals in such a way as to preserve ssa. yuk!
+ for (std::set<std::pair<BasicBlock*, BasicBlock*> >::iterator
+ ib = BackEdges.begin(), ie = BackEdges.end(); ib != ie; ++ib)
+ ProcessBackEdge(ib->first, ib->second, F);
+
+ //oh, and add the edge from the reg2mem created entry node to the
+ //duplicated second node
+ TerminatorInst* T = F.getEntryBlock().getTerminator();
+ ReplaceInstWithInst(T, BranchInst::Create(T->getSuccessor(0),
+ cast<BasicBlock>(
+ Translate(T->getSuccessor(0))),
+ ConstantInt::get(Type::Int1Ty,
+ true)));
+
+ //do whatever is needed now that the function is duplicated
+ c->PrepFunction(&F);
+
+ //add entry node to choice points
+ ChoicePoints.insert(&F.getEntryBlock());
+
+ for (std::set<BasicBlock*>::iterator
+ ii = ChoicePoints.begin(), ie = ChoicePoints.end(); ii != ie; ++ii)
+ c->ProcessChoicePoint(*ii);
+
+ ChoicePoints.clear();
+ TransCache.clear();
+
+ return true;
+ }
+ return false;
+}
+
+bool ProfilerRS::doInitialization(Module &M) {
+ switch (RandomMethod) {
+ case GBV:
+ c = new GlobalRandomCounter(M, Type::Int32Ty, (1 << 14) - 1);
+ break;
+ case GBVO:
+ c = new GlobalRandomCounterOpt(M, Type::Int32Ty, (1 << 14) - 1);
+ break;
+ case HOSTCC:
+ c = new CycleCounter(M, (1 << 14) - 1);
+ break;
+ }
+ return true;
+}
+
+void ProfilerRS::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<RSProfilers>();
+ AU.addRequiredID(DemoteRegisterToMemoryID);
+}
+
+///////////////////////////////////////
+// Utilities:
+///////////////////////////////////////
+static void ReplacePhiPred(BasicBlock* btarget,
+ BasicBlock* bold, BasicBlock* bnew) {
+ for(BasicBlock::iterator ib = btarget->begin(), ie = btarget->end();
+ ib != ie; ++ib)
+ if (PHINode* phi = dyn_cast<PHINode>(&*ib)) {
+ for(unsigned x = 0; x < phi->getNumIncomingValues(); ++x)
+ if(bold == phi->getIncomingBlock(x))
+ phi->setIncomingBlock(x, bnew);
+ }
+}
+
+static void CollapsePhi(BasicBlock* btarget, BasicBlock* bsrc) {
+ for(BasicBlock::iterator ib = btarget->begin(), ie = btarget->end();
+ ib != ie; ++ib)
+ if (PHINode* phi = dyn_cast<PHINode>(&*ib)) {
+ std::map<BasicBlock*, Value*> counter;
+ for(unsigned i = 0; i < phi->getNumIncomingValues(); ) {
+ if (counter[phi->getIncomingBlock(i)]) {
+ assert(phi->getIncomingValue(i) == counter[phi->getIncomingBlock(i)]);
+ phi->removeIncomingValue(i, false);
+ } else {
+ counter[phi->getIncomingBlock(i)] = phi->getIncomingValue(i);
+ ++i;
+ }
+ }
+ }
+}
+
+template<class T>
+static void recBackEdge(BasicBlock* bb, T& BackEdges,
+ std::map<BasicBlock*, int>& color,
+ std::map<BasicBlock*, int>& depth,
+ std::map<BasicBlock*, int>& finish,
+ int& time)
+{
+ color[bb] = 1;
+ ++time;
+ depth[bb] = time;
+ TerminatorInst* t= bb->getTerminator();
+ for(unsigned i = 0; i < t->getNumSuccessors(); ++i) {
+ BasicBlock* bbnew = t->getSuccessor(i);
+ if (color[bbnew] == 0)
+ recBackEdge(bbnew, BackEdges, color, depth, finish, time);
+ else if (color[bbnew] == 1) {
+ BackEdges.insert(std::make_pair(bb, bbnew));
+ //NumBackEdges++;
+ }
+ }
+ color[bb] = 2;
+ ++time;
+ finish[bb] = time;
+}
+
+
+
+//find the back edges and where they go to
+template<class T>
+static void getBackEdges(Function& F, T& BackEdges) {
+ std::map<BasicBlock*, int> color;
+ std::map<BasicBlock*, int> depth;
+ std::map<BasicBlock*, int> finish;
+ int time = 0;
+ recBackEdge(&F.getEntryBlock(), BackEdges, color, depth, finish, time);
+ DOUT << F.getName() << " " << BackEdges.size() << "\n";
+}
+
+
+//Creation functions
+ModulePass* llvm::createNullProfilerRSPass() {
+ return new NullProfilerRS();
+}
+
+FunctionPass* llvm::createRSProfilingPass() {
+ return new ProfilerRS();
+}
diff --git a/lib/Transforms/Instrumentation/RSProfiling.h b/lib/Transforms/Instrumentation/RSProfiling.h
new file mode 100644
index 0000000..8bbe7c7
--- /dev/null
+++ b/lib/Transforms/Instrumentation/RSProfiling.h
@@ -0,0 +1,31 @@
+//===- RSProfiling.h - Various profiling using random sampling ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// See notes in RSProfiling.cpp
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/RSProfiling.h"
+#include <set>
+
+namespace llvm {
+ /// RSProfilers_std - a simple support class for profilers that handles most
+ /// of the work of chaining and tracking inserted code.
+ struct RSProfilers_std : public RSProfilers {
+ static char ID;
+ std::set<Value*> profcode;
+ // Look up values in profcode
+ virtual bool isProfiling(Value* v);
+ // handles required chaining
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ // places counter updates in basic blocks and records added instructions in
+ // profcode
+ void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+ GlobalValue *CounterArray);
+ };
+}
diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile
new file mode 100644
index 0000000..5fe1eeb
--- /dev/null
+++ b/lib/Transforms/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Transforms/Makefile -----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+PARALLEL_DIRS = Utils Instrumentation Scalar IPO Hello
+
+include $(LEVEL)/Makefile.config
+
+# No support for plugins on windows targets
+ifeq ($(OS), $(filter $(OS), Cygwin MingW))
+ PARALLEL_DIRS := $(filter-out Hello, $(PARALLEL_DIRS))
+endif
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 0000000..9c55f66
--- /dev/null
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,98 @@
+//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass. This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+//
+//===----------------------------------------------------------------------===//
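+
+// For example, a loop that only feeds a value never used afterwards, such as
+//
+//   for (i = 0; i != n; ++i) sum += a[i];   // 'sum' unused after the loop
+//
+// has its loads and adds removed: they are not reachable from any live root,
+// even though each iteration "uses" the value produced by the previous one,
+// which defeats a simple use-count-based DCE.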
+
+#define DEBUG_TYPE "adce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace {
+ struct VISIBILITY_HIDDEN ADCE : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ ADCE() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function& F);
+
+ virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+ AU.setPreservesCFG();
+ }
+
+ };
+}
+
+char ADCE::ID = 0;
+static RegisterPass<ADCE> X("adce", "Aggressive Dead Code Elimination");
+
+bool ADCE::runOnFunction(Function& F) {
+ SmallPtrSet<Instruction*, 128> alive;
+ SmallVector<Instruction*, 128> worklist;
+
+ // Collect the set of "root" instructions that are known live.
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (isa<TerminatorInst>(I.getInstructionIterator()) ||
+ isa<DbgInfoIntrinsic>(I.getInstructionIterator()) ||
+ I->mayHaveSideEffects()) {
+ alive.insert(I.getInstructionIterator());
+ worklist.push_back(I.getInstructionIterator());
+ }
+
+ // Propagate liveness backwards to operands.
+ while (!worklist.empty()) {
+ Instruction* curr = worklist.back();
+ worklist.pop_back();
+
+ for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
+ OI != OE; ++OI)
+ if (Instruction* Inst = dyn_cast<Instruction>(OI))
+ if (alive.insert(Inst))
+ worklist.push_back(Inst);
+ }
+
+ // The inverse of the live set is the dead set. These are those instructions
+ // which have no side effects and do not influence the control flow or return
+ // value of the function, and may therefore be deleted safely.
+ // NOTE: We reuse the worklist vector here for memory efficiency.
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (!alive.count(I.getInstructionIterator())) {
+ worklist.push_back(I.getInstructionIterator());
+ I->dropAllReferences();
+ }
+
+ for (SmallVector<Instruction*, 128>::iterator I = worklist.begin(),
+ E = worklist.end(); I != E; ++I) {
+ NumRemoved++;
+ (*I)->eraseFromParent();
+ }
+
+ return !worklist.empty();
+}
+
+FunctionPass *llvm::createAggressiveDCEPass() {
+ return new ADCE();
+}
diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
new file mode 100644
index 0000000..fb9b880
--- /dev/null
+++ b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
@@ -0,0 +1,148 @@
+//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a very simple profile guided basic block placement
+// algorithm. The idea is to put frequently executed blocks together at the
+// start of the function, and hopefully increase the number of fall-through
+// conditional branches. If there is no profile information for a particular
+// function, this pass basically orders blocks in depth-first order.
+//
+// The algorithm implemented here is basically "Algo1" from "Profile Guided Code
+// Positioning" by Pettis and Hansen, except that it uses basic block counts
+// instead of edge counts. This should be improved in many ways, but is very
+// simple for now.
+//
+// Basically we "place" the entry block, then loop over all successors in a DFO,
+// placing the most frequently executed successor until we run out of blocks. I
+// told you this was _extremely_ simplistic. :) This is also much slower than it
+// could be. When it becomes important, this pass will be rewritten to use a
+// better algorithm, and then we can worry about efficiency.
+//
+//===----------------------------------------------------------------------===//
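+
+// For example, if the profile says the 'else' arm of a diamond is the hot
+// successor, that block is spliced directly after its predecessor so the
+// conditional branch can fall through into it, while the cold arm sinks
+// toward the end of the function.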
+
+#define DEBUG_TYPE "block-placement"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Scalar.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumMoved, "Number of basic blocks moved");
+
+namespace {
+ struct VISIBILITY_HIDDEN BlockPlacement : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BlockPlacement() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<ProfileInfo>();
+ //AU.addPreserved<ProfileInfo>(); // Does this work?
+ }
+ private:
+ /// PI - The profile information that is guiding us.
+ ///
+ ProfileInfo *PI;
+
+ /// NumMovedBlocks - Every time we move a block, increment this counter.
+ ///
+ unsigned NumMovedBlocks;
+
+ /// PlacedBlocks - Every time we place a block, remember it so we don't get
+ /// into infinite loops.
+ std::set<BasicBlock*> PlacedBlocks;
+
+ /// InsertPos - This is an iterator to the next place we want to insert a
+ /// block.
+ Function::iterator InsertPos;
+
+ /// PlaceBlocks - Recursively place the specified blocks and any unplaced
+ /// successors.
+ void PlaceBlocks(BasicBlock *BB);
+ };
+}
+
+char BlockPlacement::ID = 0;
+static RegisterPass<BlockPlacement>
+X("block-placement", "Profile Guided Basic Block Placement");
+
+FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); }
+
+bool BlockPlacement::runOnFunction(Function &F) {
+ PI = &getAnalysis<ProfileInfo>();
+
+ NumMovedBlocks = 0;
+ InsertPos = F.begin();
+
+ // Recursively place all blocks.
+ PlaceBlocks(F.begin());
+
+ PlacedBlocks.clear();
+ NumMoved += NumMovedBlocks;
+ return NumMovedBlocks != 0;
+}
+
+
+/// PlaceBlocks - Recursively place the specified blocks and any unplaced
+/// successors.
+void BlockPlacement::PlaceBlocks(BasicBlock *BB) {
+ assert(!PlacedBlocks.count(BB) && "Already placed this block!");
+ PlacedBlocks.insert(BB);
+
+ // Place the specified block.
+ if (&*InsertPos != BB) {
+ // Use splice to move the block into the right place. This avoids having to
+ // remove the block from the function and then re-add it, which would cause
+ // a bunch of entirely pointless symbol table traffic.
+ Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList();
+ Blocks.splice(InsertPos, Blocks, BB);
+
+ ++NumMovedBlocks;
+ } else {
+ // This block is already in the right place, we don't have to do anything.
+ ++InsertPos;
+ }
+
+ // Keep placing successors until we run out of ones to place. Note that this
+ // loop is very inefficient (N^2) for blocks with many successors, like switch
+ // statements. FIXME!
+ while (1) {
+ // Okay, now place any unplaced successors.
+ succ_iterator SI = succ_begin(BB), E = succ_end(BB);
+
+ // Scan for the first unplaced successor.
+ for (; SI != E && PlacedBlocks.count(*SI); ++SI)
+ /*empty*/;
+ if (SI == E) return; // No more successors to place.
+
+ unsigned MaxExecutionCount = PI->getExecutionCount(*SI);
+ BasicBlock *MaxSuccessor = *SI;
+
+ // Scan for more frequently executed successors
+ for (; SI != E; ++SI)
+ if (!PlacedBlocks.count(*SI)) {
+ unsigned Count = PI->getExecutionCount(*SI);
+ if (Count > MaxExecutionCount ||
+ // Prefer to not disturb the code.
+ (Count == MaxExecutionCount && *SI == &*InsertPos)) {
+ MaxExecutionCount = Count;
+ MaxSuccessor = *SI;
+ }
+ }
+
+ // Now that we picked the maximally executed successor, place it.
+ PlaceBlocks(MaxSuccessor);
+ }
+}
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
new file mode 100644
index 0000000..7a7c48b
--- /dev/null
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -0,0 +1,33 @@
+add_llvm_library(LLVMScalarOpts
+ ADCE.cpp
+ BasicBlockPlacement.cpp
+ CodeGenPrepare.cpp
+ CondPropagate.cpp
+ ConstantProp.cpp
+ DCE.cpp
+ DeadStoreElimination.cpp
+ GVN.cpp
+ GVNPRE.cpp
+ IndVarSimplify.cpp
+ InstructionCombining.cpp
+ JumpThreading.cpp
+ LICM.cpp
+ LoopDeletion.cpp
+ LoopIndexSplit.cpp
+ LoopRotation.cpp
+ LoopStrengthReduce.cpp
+ LoopUnroll.cpp
+ LoopUnswitch.cpp
+ MemCpyOptimizer.cpp
+ PredicateSimplifier.cpp
+ Reassociate.cpp
+ Reg2Mem.cpp
+ SCCP.cpp
+ Scalar.cpp
+ ScalarReplAggregates.cpp
+ SimplifyCFGPass.cpp
+ SimplifyHalfPowrLibCalls.cpp
+ SimplifyLibCalls.cpp
+ TailDuplication.cpp
+ TailRecursionElimination.cpp
+ )
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
new file mode 100644
index 0000000..342b1e5
--- /dev/null
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -0,0 +1,873 @@
+//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation. This works around limitations in it's
+// basic-block-at-a-time approach. It should eventually be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "codegenprepare"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> FactorCommonPreds("split-critical-paths-tweak",
+ cl::init(false), cl::Hidden);
+
+namespace {
+ class VISIBILITY_HIDDEN CodeGenPrepare : public FunctionPass {
+ /// TLI - Keep a pointer of a TargetLowering to consult for determining
+ /// transformation profitability.
+ const TargetLowering *TLI;
+
+ /// BackEdges - Keep a set of all the loop back edges.
+ ///
+ SmallSet<std::pair<const BasicBlock*, const BasicBlock*>, 8> BackEdges;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit CodeGenPrepare(const TargetLowering *tli = 0)
+ : FunctionPass(&ID), TLI(tli) {}
+ bool runOnFunction(Function &F);
+
+ private:
+ bool EliminateMostlyEmptyBlocks(Function &F);
+ bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
+ void EliminateMostlyEmptyBlock(BasicBlock *BB);
+ bool OptimizeBlock(BasicBlock &BB);
+ bool OptimizeMemoryInst(Instruction *I, Value *Addr, const Type *AccessTy,
+ DenseMap<Value*,Value*> &SunkAddrs);
+ bool OptimizeInlineAsmInst(Instruction *I, CallSite CS,
+ DenseMap<Value*,Value*> &SunkAddrs);
+ bool OptimizeExtUses(Instruction *I);
+ void findLoopBackEdges(const Function &F);
+ };
+}
+
+char CodeGenPrepare::ID = 0;
+static RegisterPass<CodeGenPrepare> X("codegenprepare",
+ "Optimize for code generation");
+
+FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) {
+ return new CodeGenPrepare(TLI);
+}
+
+/// findLoopBackEdges - Do a DFS walk to find loop back edges.
+///
+void CodeGenPrepare::findLoopBackEdges(const Function &F) {
+ SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
+
+ BackEdges.insert(Edges.begin(), Edges.end());
+}
+
+
+bool CodeGenPrepare::runOnFunction(Function &F) {
+ bool EverMadeChange = false;
+
+ // First pass, eliminate blocks that contain only PHI nodes and an
+ // unconditional branch.
+ EverMadeChange |= EliminateMostlyEmptyBlocks(F);
+
+ // Now find loop back edges.
+ findLoopBackEdges(F);
+
+ bool MadeChange = true;
+ while (MadeChange) {
+ MadeChange = false;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ MadeChange |= OptimizeBlock(*BB);
+ EverMadeChange |= MadeChange;
+ }
+ return EverMadeChange;
+}
+
+/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes,
+/// debug info directives, and an unconditional branch. Passes before isel
+/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for
+/// isel. Start by eliminating these blocks so we can split edges the way we
+/// want them split.
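+///
+/// For example (an illustrative sketch added for exposition; the block and
+/// value names are hypothetical):
+///   bb:                                ; preds = %x, %y
+///     %p = phi i32 [ %a, %x ], [ %b, %y ]
+///     br label %dest
+/// can be merged into %dest when %p is only used by phi nodes in %dest.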
+bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) {
+ bool MadeChange = false;
+ // Note that this intentionally skips the entry block.
+ for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
+ BasicBlock *BB = I++;
+
+ // If this block doesn't end with an uncond branch, ignore it.
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isUnconditional())
+ continue;
+
+ // If the instruction before the branch (skipping debug info) isn't a phi
+ // node, then other stuff is happening here.
+ BasicBlock::iterator BBI = BI;
+ if (BBI != BB->begin()) {
+ --BBI;
+ while (isa<DbgInfoIntrinsic>(BBI)) {
+ if (BBI == BB->begin())
+ break;
+ --BBI;
+ }
+ if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+ continue;
+ }
+
+ // Do not break infinite loops.
+ BasicBlock *DestBB = BI->getSuccessor(0);
+ if (DestBB == BB)
+ continue;
+
+ if (!CanMergeBlocks(BB, DestBB))
+ continue;
+
+ EliminateMostlyEmptyBlock(BB);
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a
+/// single uncond branch between them, and BB contains no other non-phi
+/// instructions.
+bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
+ const BasicBlock *DestBB) const {
+ // We only want to eliminate blocks whose phi nodes are used by phi nodes in
+  // the successor. If there are more complex conditions (e.g. preheaders),
+ // don't mess around with them.
+ BasicBlock::const_iterator BBI = BB->begin();
+ while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+ for (Value::use_const_iterator UI = PN->use_begin(), E = PN->use_end();
+ UI != E; ++UI) {
+ const Instruction *User = cast<Instruction>(*UI);
+ if (User->getParent() != DestBB || !isa<PHINode>(User))
+ return false;
+      // If User is a PHINode inside DestBB, check its incoming values. If an
+      // incoming value is not from BB, then this is a complex condition
+      // (e.g. preheaders) we want to avoid here.
+ if (User->getParent() == DestBB) {
+ if (const PHINode *UPN = dyn_cast<PHINode>(User))
+ for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
+ Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
+ if (Insn && Insn->getParent() == BB &&
+ Insn->getParent() != UPN->getIncomingBlock(I))
+ return false;
+ }
+ }
+ }
+ }
+
+ // If BB and DestBB contain any common predecessors, then the phi nodes in BB
+ // and DestBB may have conflicting incoming values for the block. If so, we
+ // can't merge the block.
+ const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
+ if (!DestBBPN) return true; // no conflict.
+
+ // Collect the preds of BB.
+ SmallPtrSet<const BasicBlock*, 16> BBPreds;
+ if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+ // It is faster to get preds from a PHI than with pred_iterator.
+ for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+ BBPreds.insert(BBPN->getIncomingBlock(i));
+ } else {
+ BBPreds.insert(pred_begin(BB), pred_end(BB));
+ }
+
+ // Walk the preds of DestBB.
+ for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
+ if (BBPreds.count(Pred)) { // Common predecessor?
+ BBI = DestBB->begin();
+ while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+ const Value *V1 = PN->getIncomingValueForBlock(Pred);
+ const Value *V2 = PN->getIncomingValueForBlock(BB);
+
+ // If V2 is a phi node in BB, look up what the mapped value will be.
+ if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
+ if (V2PN->getParent() == BB)
+ V2 = V2PN->getIncomingValueForBlock(Pred);
+
+ // If there is a conflict, bail out.
+ if (V1 != V2) return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+
+/// EliminateMostlyEmptyBlock - Eliminate a basic block that has only phi's and
+/// an unconditional branch in it.
+void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
+ BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+ BasicBlock *DestBB = BI->getSuccessor(0);
+
+ DOUT << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB;
+
+ // If the destination block has a single pred, then this is a trivial edge,
+ // just collapse it.
+ if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
+ if (SinglePred != DestBB) {
+ // Remember if SinglePred was the entry block of the function. If so, we
+ // will need to move BB back to the entry position.
+ bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+ MergeBasicBlockIntoOnlyPred(DestBB);
+
+ if (isEntry && BB != &BB->getParent()->getEntryBlock())
+ BB->moveBefore(&BB->getParent()->getEntryBlock());
+
+ DOUT << "AFTER:\n" << *DestBB << "\n\n\n";
+ return;
+ }
+ }
+
+ // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB
+ // to handle the new incoming edges it is about to have.
+ PHINode *PN;
+ for (BasicBlock::iterator BBI = DestBB->begin();
+ (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+ // Remove the incoming value for BB, and remember it.
+ Value *InVal = PN->removeIncomingValue(BB, false);
+
+ // Two options: either the InVal is a phi node defined in BB or it is some
+ // value that dominates BB.
+ PHINode *InValPhi = dyn_cast<PHINode>(InVal);
+ if (InValPhi && InValPhi->getParent() == BB) {
+ // Add all of the input values of the input PHI as inputs of this phi.
+ for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
+ PN->addIncoming(InValPhi->getIncomingValue(i),
+ InValPhi->getIncomingBlock(i));
+ } else {
+ // Otherwise, add one instance of the dominating value for each edge that
+ // we will be adding.
+ if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+ for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+ PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
+ } else {
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ PN->addIncoming(InVal, *PI);
+ }
+ }
+ }
+
+ // The PHIs are now updated, change everything that refers to BB to use
+ // DestBB and remove BB.
+ BB->replaceAllUsesWith(DestBB);
+ BB->eraseFromParent();
+
+ DOUT << "AFTER:\n" << *DestBB << "\n\n\n";
+}
+
+
+/// SplitEdgeNicely - Split the critical edge from TI to its specified
+/// successor if it will improve codegen. We only do this if the successor has
+/// phi nodes (otherwise critical edges are ok). If there is already another
+/// predecessor of the succ that is empty (and thus has no phi nodes), use it
+/// instead of introducing a new block.
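+///
+/// For example (illustrative): when splitting the edge from TIBB to Dest, if
+/// another predecessor of Dest is an empty block that just does
+/// "br label %Dest" and feeds Dest's phis the same values TIBB would, the
+/// terminator is retargeted to that block instead of a new ".critedge" one.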
+static void SplitEdgeNicely(TerminatorInst *TI, unsigned SuccNum,
+ SmallSet<std::pair<const BasicBlock*,
+ const BasicBlock*>, 8> &BackEdges,
+ Pass *P) {
+ BasicBlock *TIBB = TI->getParent();
+ BasicBlock *Dest = TI->getSuccessor(SuccNum);
+ assert(isa<PHINode>(Dest->begin()) &&
+ "This should only be called if Dest has a PHI!");
+
+ // Do not split edges to EH landing pads.
+ if (InvokeInst *Invoke = dyn_cast<InvokeInst>(TI)) {
+ if (Invoke->getSuccessor(1) == Dest)
+ return;
+ }
+
+ // As a hack, never split backedges of loops. Even though the copy for any
+ // PHIs inserted on the backedge would be dead for exits from the loop, we
+ // assume that the cost of *splitting* the backedge would be too high.
+ if (BackEdges.count(std::make_pair(TIBB, Dest)))
+ return;
+
+ if (!FactorCommonPreds) {
+ /// TIPHIValues - This array is lazily computed to determine the values of
+ /// PHIs in Dest that TI would provide.
+ SmallVector<Value*, 32> TIPHIValues;
+
+ // Check to see if Dest has any blocks that can be used as a split edge for
+ // this terminator.
+ for (pred_iterator PI = pred_begin(Dest), E = pred_end(Dest); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ // To be usable, the pred has to end with an uncond branch to the dest.
+ BranchInst *PredBr = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (!PredBr || !PredBr->isUnconditional())
+ continue;
+ // Must be empty other than the branch and debug info.
+ BasicBlock::iterator I = Pred->begin();
+ while (isa<DbgInfoIntrinsic>(I))
+ I++;
+ if (dyn_cast<Instruction>(I) != PredBr)
+ continue;
+ // Cannot be the entry block; its label does not get emitted.
+ if (Pred == &(Dest->getParent()->getEntryBlock()))
+ continue;
+
+ // Finally, since we know that Dest has phi nodes in it, we have to make
+ // sure that jumping to Pred will have the same effect as going to Dest in
+ // terms of PHI values.
+ PHINode *PN;
+ unsigned PHINo = 0;
+ bool FoundMatch = true;
+ for (BasicBlock::iterator I = Dest->begin();
+ (PN = dyn_cast<PHINode>(I)); ++I, ++PHINo) {
+ if (PHINo == TIPHIValues.size())
+ TIPHIValues.push_back(PN->getIncomingValueForBlock(TIBB));
+
+ // If the PHI entry doesn't work, we can't use this pred.
+ if (TIPHIValues[PHINo] != PN->getIncomingValueForBlock(Pred)) {
+ FoundMatch = false;
+ break;
+ }
+ }
+
+ // If we found a workable predecessor, change TI to branch to Succ.
+ if (FoundMatch) {
+ Dest->removePredecessor(TIBB);
+ TI->setSuccessor(SuccNum, Pred);
+ return;
+ }
+ }
+
+ SplitCriticalEdge(TI, SuccNum, P, true);
+ return;
+ }
+
+ PHINode *PN;
+ SmallVector<Value*, 8> TIPHIValues;
+ for (BasicBlock::iterator I = Dest->begin();
+ (PN = dyn_cast<PHINode>(I)); ++I)
+ TIPHIValues.push_back(PN->getIncomingValueForBlock(TIBB));
+
+ SmallVector<BasicBlock*, 8> IdenticalPreds;
+ for (pred_iterator PI = pred_begin(Dest), E = pred_end(Dest); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (BackEdges.count(std::make_pair(Pred, Dest)))
+ continue;
+    if (Pred == TIBB)
+ IdenticalPreds.push_back(Pred);
+ else {
+ bool Identical = true;
+ unsigned PHINo = 0;
+ for (BasicBlock::iterator I = Dest->begin();
+ (PN = dyn_cast<PHINode>(I)); ++I, ++PHINo)
+ if (TIPHIValues[PHINo] != PN->getIncomingValueForBlock(Pred)) {
+ Identical = false;
+ break;
+ }
+ if (Identical)
+ IdenticalPreds.push_back(Pred);
+ }
+ }
+
+ assert(!IdenticalPreds.empty());
+ SplitBlockPredecessors(Dest, &IdenticalPreds[0], IdenticalPreds.size(),
+ ".critedge", P);
+}
+
+
+/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
+/// copy (e.g. it's casting from one pointer type to another, int->uint, or
+/// int->sbyte on PPC), sink it into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+///
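+/// For example (an illustrative sketch; the names are hypothetical):
+///   entry:
+///     %p = bitcast i8* %q to i32*
+///     br label %use
+///   use:
+///     %v = load i32* %p
+/// A copy of the bitcast is inserted into %use next to the load, so both
+/// values live in the same block and coalesce into one virtual register.
+///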
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+  // Get the source and destination value types to check for a noop copy.
+ MVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
+ MVT DstVT = TLI.getValueType(CI->getType());
+
+  // An fp<->int conversion is not a noop copy.
+ if (SrcVT.isInteger() != DstVT.isInteger())
+ return false;
+
+ // If this is an extension, it will be a zero or sign extension, which
+ // isn't a noop.
+ if (SrcVT.bitsLT(DstVT)) return false;
+
+ // If these values will be promoted, find out what they will be promoted
+ // to. This helps us consider truncates on PPC as noop copies when they
+ // are.
+ if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote)
+ SrcVT = TLI.getTypeToTransformTo(SrcVT);
+ if (TLI.getTypeAction(DstVT) == TargetLowering::Promote)
+ DstVT = TLI.getTypeToTransformTo(DstVT);
+
+ // If, after promotion, these are the same types, this is a noop copy.
+ if (SrcVT != DstVT)
+ return false;
+
+ BasicBlock *DefBB = CI->getParent();
+
+ /// InsertedCasts - Only insert a cast in each block once.
+ DenseMap<BasicBlock*, CastInst*> InsertedCasts;
+
+ bool MadeChange = false;
+ for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
+ UI != E; ) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Figure out which BB this cast is used in. For PHI's this is the
+ // appropriate predecessor block.
+ BasicBlock *UserBB = User->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ UserBB = PN->getIncomingBlock(UI);
+ }
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ // If this user is in the same block as the cast, don't change the cast.
+ if (UserBB == DefBB) continue;
+
+ // If we have already inserted a cast into this block, use it.
+ CastInst *&InsertedCast = InsertedCasts[UserBB];
+
+ if (!InsertedCast) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+ InsertedCast =
+ CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "",
+ InsertPt);
+ MadeChange = true;
+ }
+
+ // Replace a use of the cast with a use of the new cast.
+ TheUse = InsertedCast;
+ }
+
+ // If we removed all uses, nuke the cast.
+ if (CI->use_empty()) {
+ CI->eraseFromParent();
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce
+/// the number of virtual registers that must be created and coalesced. This is
+/// a clear win except on targets with multiple condition code registers
+/// (PowerPC), where it might lose; some adjustment may be wanted there.
+///
+/// Return true if any changes are made.
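+///
+/// For example (illustrative, hypothetical names): if
+///   bb0:  %c = icmp eq i32 %a, %b
+///   bb1:  br i1 %c, label %t, label %f
+/// a copy of the icmp is placed into bb1, letting isel fold the compare and
+/// branch together instead of carrying %c in a register across the edge.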
+static bool OptimizeCmpExpression(CmpInst *CI) {
+ BasicBlock *DefBB = CI->getParent();
+
+ /// InsertedCmp - Only insert a cmp in each block once.
+ DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
+
+ bool MadeChange = false;
+ for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
+ UI != E; ) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ // Don't bother for PHI nodes.
+ if (isa<PHINode>(User))
+ continue;
+
+ // Figure out which BB this cmp is used in.
+ BasicBlock *UserBB = User->getParent();
+
+ // If this user is in the same block as the cmp, don't change the cmp.
+ if (UserBB == DefBB) continue;
+
+ // If we have already inserted a cmp into this block, use it.
+ CmpInst *&InsertedCmp = InsertedCmps[UserBB];
+
+ if (!InsertedCmp) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+ InsertedCmp =
+ CmpInst::Create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0),
+ CI->getOperand(1), "", InsertPt);
+ MadeChange = true;
+ }
+
+ // Replace a use of the cmp with a use of the new cmp.
+ TheUse = InsertedCmp;
+ }
+
+ // If we removed all uses, nuke the cmp.
+ if (CI->use_empty())
+ CI->eraseFromParent();
+
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Optimization
+//===----------------------------------------------------------------------===//
+
+/// IsNonLocalValue - Return true if the specified value is defined in a
+/// different basic block than BB.
+static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() != BB;
+ return false;
+}
+
+/// OptimizeMemoryInst - Load and store instructions often have addressing
+/// modes that can do significant amounts of computation. As such,
+/// instruction selection will try to get the load or store to do as much
+/// computation as possible for the program. The problem is that isel can only
+/// see within a single block. As such, we sink as much legal addressing mode
+/// stuff into the block as possible.
+///
+/// This method is used to optimize both load/store and inline asms with memory
+/// operands.
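+///
+/// For example (an illustrative sketch; names are hypothetical): if
+///   %a = getelementptr i32* %base, i64 %i
+/// is computed in another block and loaded from here, the address is rebuilt
+/// beside the load as "sunkaddr" ptrtoint/mul/add/inttoptr instructions that
+/// isel can fold into the load's addressing mode.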
+bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+ const Type *AccessTy,
+ DenseMap<Value*,Value*> &SunkAddrs) {
+ // Figure out what addressing mode will be built up for this operation.
+ SmallVector<Instruction*, 16> AddrModeInsts;
+ ExtAddrMode AddrMode = AddressingModeMatcher::Match(Addr, AccessTy,MemoryInst,
+ AddrModeInsts, *TLI);
+
+  // Check to see if any of the instructions subsumed by this addr mode are
+ // non-local to I's BB.
+ bool AnyNonLocal = false;
+ for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
+ if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) {
+ AnyNonLocal = true;
+ break;
+ }
+ }
+
+ // If all the instructions matched are already in this BB, don't do anything.
+ if (!AnyNonLocal) {
+ DEBUG(cerr << "CGP: Found local addrmode: " << AddrMode << "\n");
+ return false;
+ }
+
+ // Insert this computation right after this user. Since our caller is
+  // scanning from the top of the BB to the bottom, any reuse of the expr is
+ // guaranteed to happen later.
+ BasicBlock::iterator InsertPt = MemoryInst;
+
+  // Now that we have determined the addressing expression we want to use and
+  // know that we have to sink it into this block, check to see if we have
+  // already done this for some other load/store instr in this block. If so,
+  // reuse the computation.
+ Value *&SunkAddr = SunkAddrs[Addr];
+ if (SunkAddr) {
+ DEBUG(cerr << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
+ << *MemoryInst);
+ if (SunkAddr->getType() != Addr->getType())
+ SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt);
+ } else {
+ DEBUG(cerr << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
+ << *MemoryInst);
+ const Type *IntPtrTy = TLI->getTargetData()->getIntPtrType();
+
+ Value *Result = 0;
+ // Start with the scale value.
+ if (AddrMode.Scale) {
+ Value *V = AddrMode.ScaledReg;
+ if (V->getType() == IntPtrTy) {
+ // done.
+ } else if (isa<PointerType>(V->getType())) {
+ V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+ } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+ cast<IntegerType>(V->getType())->getBitWidth()) {
+ V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt);
+ } else {
+ V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt);
+ }
+ if (AddrMode.Scale != 1)
+ V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy,
+ AddrMode.Scale),
+ "sunkaddr", InsertPt);
+ Result = V;
+ }
+
+ // Add in the base register.
+ if (AddrMode.BaseReg) {
+ Value *V = AddrMode.BaseReg;
+ if (V->getType() != IntPtrTy)
+ V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+ if (Result)
+ Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+ else
+ Result = V;
+ }
+
+ // Add in the BaseGV if present.
+ if (AddrMode.BaseGV) {
+ Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr",
+ InsertPt);
+ if (Result)
+ Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+ else
+ Result = V;
+ }
+
+ // Add in the Base Offset if present.
+ if (AddrMode.BaseOffs) {
+ Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+ if (Result)
+ Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+ else
+ Result = V;
+ }
+
+ if (Result == 0)
+ SunkAddr = Constant::getNullValue(Addr->getType());
+ else
+ SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt);
+ }
+
+ MemoryInst->replaceUsesOfWith(Addr, SunkAddr);
+
+ if (Addr->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(Addr);
+ return true;
+}
+
+/// OptimizeInlineAsmInst - If there are any memory operands, use
+/// OptimizeMemoryInst to sink their address computations into the block when
+/// possible / profitable.
+bool CodeGenPrepare::OptimizeInlineAsmInst(Instruction *I, CallSite CS,
+ DenseMap<Value*,Value*> &SunkAddrs) {
+ bool MadeChange = false;
+ InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+
+ // Do a prepass over the constraints, canonicalizing them, and building up the
+ // ConstraintOperands list.
+ std::vector<InlineAsm::ConstraintInfo>
+ ConstraintInfos = IA->ParseConstraints();
+
+ /// ConstraintOperands - Information about all of the constraints.
+ std::vector<TargetLowering::AsmOperandInfo> ConstraintOperands;
+ unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
+ for (unsigned i = 0, e = ConstraintInfos.size(); i != e; ++i) {
+ ConstraintOperands.
+ push_back(TargetLowering::AsmOperandInfo(ConstraintInfos[i]));
+ TargetLowering::AsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+ // Compute the value type for each operand.
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ if (OpInfo.isIndirect)
+ OpInfo.CallOperandVal = CS.getArgument(ArgNo++);
+ break;
+ case InlineAsm::isInput:
+ OpInfo.CallOperandVal = CS.getArgument(ArgNo++);
+ break;
+ case InlineAsm::isClobber:
+ // Nothing to do.
+ break;
+ }
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI->ComputeConstraintToUse(OpInfo, SDValue(),
+ OpInfo.ConstraintType == TargetLowering::C_Memory);
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+ OpInfo.isIndirect) {
+ Value *OpVal = OpInfo.CallOperandVal;
+ MadeChange |= OptimizeMemoryInst(I, OpVal, OpVal->getType(), SunkAddrs);
+ }
+ }
+
+ return MadeChange;
+}
+
+bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
+ BasicBlock *DefBB = I->getParent();
+
+  // If both the result of the {s|z}xt and its source are live out, rewrite
+  // all other uses of the source with the result of the extension.
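+  // For example (illustrative, hypothetical names): given
+  //   %s = zext i16 %x to i32
+  // where both %x and %s are used in other blocks, those uses of %x are
+  // rewritten to "trunc i32 %s to i16" so only %s is live out of this block.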
+ Value *Src = I->getOperand(0);
+ if (Src->hasOneUse())
+ return false;
+
+ // Only do this xform if truncating is free.
+ if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
+ return false;
+
+ // Only safe to perform the optimization if the source is also defined in
+ // this block.
+ if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
+ return false;
+
+ bool DefIsLiveOut = false;
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Figure out which BB this ext is used in.
+ BasicBlock *UserBB = User->getParent();
+ if (UserBB == DefBB) continue;
+ DefIsLiveOut = true;
+ break;
+ }
+ if (!DefIsLiveOut)
+ return false;
+
+  // Make sure none of the uses are PHI nodes.
+ for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
+ UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ BasicBlock *UserBB = User->getParent();
+ if (UserBB == DefBB) continue;
+ // Be conservative. We don't want this xform to end up introducing
+ // reloads just before load / store instructions.
+ if (isa<PHINode>(User) || isa<LoadInst>(User) || isa<StoreInst>(User))
+ return false;
+ }
+
+  // InsertedTruncs - Only insert one trunc in each block.
+ DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
+
+ bool MadeChange = false;
+ for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
+ UI != E; ++UI) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Figure out which BB this ext is used in.
+ BasicBlock *UserBB = User->getParent();
+ if (UserBB == DefBB) continue;
+
+ // Both src and def are live in this block. Rewrite the use.
+ Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
+
+ if (!InsertedTrunc) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+ InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt);
+ }
+
+ // Replace a use of the {s|z}ext source with a use of the result.
+ TheUse = InsertedTrunc;
+
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+// In this pass we look for GEP and cast instructions that are used
+// across basic blocks and rewrite them to improve basic-block-at-a-time
+// selection.
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+ bool MadeChange = false;
+
+ // Split all critical edges where the dest block has a PHI.
+ TerminatorInst *BBTI = BB.getTerminator();
+ if (BBTI->getNumSuccessors() > 1) {
+ for (unsigned i = 0, e = BBTI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *SuccBB = BBTI->getSuccessor(i);
+ if (isa<PHINode>(SuccBB->begin()) && isCriticalEdge(BBTI, i, true))
+ SplitEdgeNicely(BBTI, i, BackEdges, this);
+ }
+ }
+
+ // Keep track of non-local addresses that have been sunk into this block.
+ // This allows us to avoid inserting duplicate code for blocks with multiple
+ // load/stores of the same address.
+ DenseMap<Value*, Value*> SunkAddrs;
+
+ for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ) {
+ Instruction *I = BBI++;
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ // If the source of the cast is a constant, then this should have
+ // already been constant folded. The only reason NOT to constant fold
+ // it is if something (e.g. LSR) was careful to place the constant
+      // evaluation in a block other than the one that uses it (e.g. to hoist
+ // the address of globals out of a loop). If this is the case, we don't
+ // want to forward-subst the cast.
+ if (isa<Constant>(CI->getOperand(0)))
+ continue;
+
+ bool Change = false;
+ if (TLI) {
+ Change = OptimizeNoopCopyExpression(CI, *TLI);
+ MadeChange |= Change;
+ }
+
+ if (!Change && (isa<ZExtInst>(I) || isa<SExtInst>(I)))
+ MadeChange |= OptimizeExtUses(I);
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
+ MadeChange |= OptimizeCmpExpression(CI);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (TLI)
+ MadeChange |= OptimizeMemoryInst(I, I->getOperand(0), LI->getType(),
+ SunkAddrs);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (TLI)
+ MadeChange |= OptimizeMemoryInst(I, SI->getOperand(1),
+ SI->getOperand(0)->getType(),
+ SunkAddrs);
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ if (GEPI->hasAllZeroIndices()) {
+        // A GEP with all zero indices is equivalent to a bitcast of its
+        // pointer operand.
+ Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
+ GEPI->getName(), GEPI);
+ GEPI->replaceAllUsesWith(NC);
+ GEPI->eraseFromParent();
+ MadeChange = true;
+ BBI = NC;
+ }
+ } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      // If we found an inline asm expression, and if the target knows how to
+ // lower it to normal LLVM code, do so now.
+ if (TLI && isa<InlineAsm>(CI->getCalledValue()))
+ if (const TargetAsmInfo *TAI =
+ TLI->getTargetMachine().getTargetAsmInfo()) {
+ if (TAI->ExpandInlineAsm(CI)) {
+ BBI = BB.begin();
+ // Avoid processing instructions out of order, which could cause
+ // reuse before a value is defined.
+ SunkAddrs.clear();
+ } else
+ // Sink address computing for memory operands into the block.
+ MadeChange |= OptimizeInlineAsmInst(I, &(*CI), SunkAddrs);
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/lib/Transforms/Scalar/CondPropagate.cpp b/lib/Transforms/Scalar/CondPropagate.cpp
new file mode 100644
index 0000000..c85d031
--- /dev/null
+++ b/lib/Transforms/Scalar/CondPropagate.cpp
@@ -0,0 +1,295 @@
+//===-- CondPropagate.cpp - Propagate Conditional Expressions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass propagates information about conditional expressions through the
+// program, allowing it to eliminate conditional branches in some cases.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "condprop"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+
+STATISTIC(NumBrThread, "Number of CFG edges threaded through branches");
+STATISTIC(NumSwThread, "Number of CFG edges threaded through switches");
+
+namespace {
+ struct VISIBILITY_HIDDEN CondProp : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CondProp() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(BreakCriticalEdgesID);
+ //AU.addRequired<DominanceFrontier>();
+ }
+
+ private:
+ bool MadeChange;
+ SmallVector<BasicBlock *, 4> DeadBlocks;
+ void SimplifyBlock(BasicBlock *BB);
+ void SimplifyPredecessors(BranchInst *BI);
+ void SimplifyPredecessors(SwitchInst *SI);
+ void RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB);
+ bool RevectorBlockTo(BasicBlock *FromBB, Value *Cond, BranchInst *BI);
+ };
+}
+
+char CondProp::ID = 0;
+static RegisterPass<CondProp> X("condprop", "Conditional Propagation");
+
+FunctionPass *llvm::createCondPropagationPass() {
+ return new CondProp();
+}
+
+bool CondProp::runOnFunction(Function &F) {
+ bool EverMadeChange = false;
+ DeadBlocks.clear();
+
+ // While we are simplifying blocks, keep iterating.
+ do {
+ MadeChange = false;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E;)
+ SimplifyBlock(BB++);
+ EverMadeChange = EverMadeChange || MadeChange;
+ } while (MadeChange);
+
+ if (EverMadeChange) {
+ while (!DeadBlocks.empty()) {
+ BasicBlock *BB = DeadBlocks.back(); DeadBlocks.pop_back();
+ DeleteDeadBlock(BB);
+ }
+ }
+ return EverMadeChange;
+}
+
+void CondProp::SimplifyBlock(BasicBlock *BB) {
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ // If this is a conditional branch based on a phi node that is defined in
+ // this block, see if we can simplify predecessors of this block.
+ if (BI->isConditional() && isa<PHINode>(BI->getCondition()) &&
+ cast<PHINode>(BI->getCondition())->getParent() == BB)
+ SimplifyPredecessors(BI);
+
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ if (isa<PHINode>(SI->getCondition()) &&
+ cast<PHINode>(SI->getCondition())->getParent() == BB)
+ SimplifyPredecessors(SI);
+ }
+
+ // If possible, simplify the terminator of this block.
+ if (ConstantFoldTerminator(BB))
+ MadeChange = true;
+
+ // If this block ends with an unconditional branch and the only successor has
+ // only this block as a predecessor, merge the two blocks together.
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+ if (BI->isUnconditional() && BI->getSuccessor(0)->getSinglePredecessor() &&
+ BB != BI->getSuccessor(0)) {
+ BasicBlock *Succ = BI->getSuccessor(0);
+
+ // If Succ has any PHI nodes, they are all single-entry PHI's. Eliminate
+ // them.
+ FoldSingleEntryPHINodes(Succ);
+
+ // Remove BI.
+ BI->eraseFromParent();
+
+ // Move over all of the instructions.
+ BB->getInstList().splice(BB->end(), Succ->getInstList());
+
+ // Any phi nodes that had entries for Succ now have entries from BB.
+ Succ->replaceAllUsesWith(BB);
+
+ // Succ is now dead, but we cannot delete it without potentially
+ // invalidating iterators elsewhere. Just insert an unreachable
+ // instruction in it and delete this block later on.
+ new UnreachableInst(Succ);
+ DeadBlocks.push_back(Succ);
+ MadeChange = true;
+ }
+}
+
+// SimplifyPredecessors(branches) - We know that BI is a conditional branch
+// based on a PHI node defined in this block. If the phi node contains constant
+// operands, then the blocks corresponding to those operands can be modified to
+// jump directly to the destination instead of going through this block.
+void CondProp::SimplifyPredecessors(BranchInst *BI) {
+  // TODO: We currently only handle the most trivial case, where the PHI node has
+ // one use (the branch), and is the only instruction besides the branch and dbg
+ // intrinsics in the block.
+ PHINode *PN = cast<PHINode>(BI->getCondition());
+
+ if (PN->getNumIncomingValues() == 1) {
+ // Eliminate single-entry PHI nodes.
+ FoldSingleEntryPHINodes(PN->getParent());
+ return;
+ }
+
+
+ if (!PN->hasOneUse()) return;
+
+ BasicBlock *BB = BI->getParent();
+ if (&*BB->begin() != PN)
+ return;
+ BasicBlock::iterator BBI = BB->begin();
+ BasicBlock::iterator BBE = BB->end();
+ while (BBI != BBE && isa<DbgInfoIntrinsic>(++BBI)) /* empty */;
+ if (&*BBI != BI)
+ return;
+
+ // Ok, we have this really simple case, walk the PHI operands, looking for
+ // constants. Walk from the end to remove operands from the end when
+ // possible, and to avoid invalidating "i".
+ for (unsigned i = PN->getNumIncomingValues(); i != 0; --i) {
+ Value *InVal = PN->getIncomingValue(i-1);
+ if (!RevectorBlockTo(PN->getIncomingBlock(i-1), InVal, BI))
+ continue;
+
+ ++NumBrThread;
+
+ // If there were two predecessors before this simplification, or if the
+ // PHI node contained all the same value except for the one we just
+ // substituted, the PHI node may be deleted. Don't iterate through it the
+ // last time.
+ if (BI->getCondition() != PN) return;
+ }
+}
+
+// SimplifyPredecessors(switch) - We know that SI is switch based on a PHI node
+// defined in this block. If the phi node contains constant operands, then the
+// blocks corresponding to those operands can be modified to jump directly to
+// the destination instead of going through this block.
+void CondProp::SimplifyPredecessors(SwitchInst *SI) {
+  // TODO: We currently only handle the most trivial case, where the PHI node has
+ // one use (the branch), and is the only instruction besides the branch and
+ // dbg intrinsics in the block.
+ PHINode *PN = cast<PHINode>(SI->getCondition());
+ if (!PN->hasOneUse()) return;
+
+ BasicBlock *BB = SI->getParent();
+ if (&*BB->begin() != PN)
+ return;
+ BasicBlock::iterator BBI = BB->begin();
+ BasicBlock::iterator BBE = BB->end();
+ while (BBI != BBE && isa<DbgInfoIntrinsic>(++BBI)) /* empty */;
+ if (&*BBI != SI)
+ return;
+
+ bool RemovedPreds = false;
+
+ // Ok, we have this really simple case, walk the PHI operands, looking for
+ // constants. Walk from the end to remove operands from the end when
+ // possible, and to avoid invalidating "i".
+ for (unsigned i = PN->getNumIncomingValues(); i != 0; --i)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(PN->getIncomingValue(i-1))) {
+ // If we have a constant, forward the edge from its current to its
+ // ultimate destination.
+ unsigned DestCase = SI->findCaseValue(CI);
+ RevectorBlockTo(PN->getIncomingBlock(i-1),
+ SI->getSuccessor(DestCase));
+ ++NumSwThread;
+ RemovedPreds = true;
+
+ // If there were two predecessors before this simplification, or if the
+ // PHI node contained all the same value except for the one we just
+ // substituted, the PHI node may be deleted. Don't iterate through it the
+ // last time.
+ if (SI->getCondition() != PN) return;
+ }
+}
+
+
+// RevectorBlockTo - Revector the unconditional branch at the end of FromBB to
+// the ToBB block, which is one of the successors of its current successor.
+void CondProp::RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB) {
+ BranchInst *FromBr = cast<BranchInst>(FromBB->getTerminator());
+ assert(FromBr->isUnconditional() && "FromBB should end with uncond br!");
+
+ // Get the old block we are threading through.
+ BasicBlock *OldSucc = FromBr->getSuccessor(0);
+
+ // OldSucc had multiple successors. If ToBB has multiple predecessors, then
+ // the edge between them would be critical, which we already took care of.
+  // If ToBB has a single-operand PHI node, take care of it here.
+ FoldSingleEntryPHINodes(ToBB);
+
+ // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
+ OldSucc->removePredecessor(FromBB);
+
+ // Change FromBr to branch to the new destination.
+ FromBr->setSuccessor(0, ToBB);
+
+ MadeChange = true;
+}
+
+bool CondProp::RevectorBlockTo(BasicBlock *FromBB, Value *Cond, BranchInst *BI){
+ BranchInst *FromBr = cast<BranchInst>(FromBB->getTerminator());
+ if (!FromBr->isUnconditional())
+ return false;
+
+ // Get the old block we are threading through.
+ BasicBlock *OldSucc = FromBr->getSuccessor(0);
+
+ // If the condition is a constant, simply revector the unconditional branch at
+ // the end of FromBB to one of the successors of its current successor.
+ if (ConstantInt *CB = dyn_cast<ConstantInt>(Cond)) {
+ BasicBlock *ToBB = BI->getSuccessor(CB->isZero());
+
+ // OldSucc had multiple successors. If ToBB has multiple predecessors, then
+ // the edge between them would be critical, which we already took care of.
+    // If ToBB has a single-operand PHI node, take care of it here.
+ FoldSingleEntryPHINodes(ToBB);
+
+ // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
+ OldSucc->removePredecessor(FromBB);
+
+ // Change FromBr to branch to the new destination.
+ FromBr->setSuccessor(0, ToBB);
+ } else {
+ BasicBlock *Succ0 = BI->getSuccessor(0);
+ // Do not perform transform if the new destination has PHI nodes. The
+ // transform will add new preds to the PHI's.
+ if (isa<PHINode>(Succ0->begin()))
+ return false;
+
+ BasicBlock *Succ1 = BI->getSuccessor(1);
+ if (isa<PHINode>(Succ1->begin()))
+ return false;
+
+ // Insert the new conditional branch.
+ BranchInst::Create(Succ0, Succ1, Cond, FromBr);
+
+ FoldSingleEntryPHINodes(Succ0);
+ FoldSingleEntryPHINodes(Succ1);
+
+ // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
+ OldSucc->removePredecessor(FromBB);
+
+ // Delete the old branch.
+ FromBr->eraseFromParent();
+ }
+
+ MadeChange = true;
+ return true;
+}
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 0000000..b933488
--- /dev/null
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,90 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+// * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+//    * This pass has a habit of making definitions dead. It is a good idea
+// to run a DIE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "constprop"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Constant.h"
+#include "llvm/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+ struct VISIBILITY_HIDDEN ConstantPropagation : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ ConstantPropagation() : FunctionPass(&ID) {}
+
+ bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ }
+ };
+}
+
+char ConstantPropagation::ID = 0;
+static RegisterPass<ConstantPropagation>
+X("constprop", "Simple constant propagation");
+
+FunctionPass *llvm::createConstantPropagationPass() {
+ return new ConstantPropagation();
+}
+
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+ // Initialize the worklist to all of the instructions ready to process...
+ std::set<Instruction*> WorkList;
+ for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+ WorkList.insert(&*i);
+ }
+ bool Changed = false;
+
+ while (!WorkList.empty()) {
+ Instruction *I = *WorkList.begin();
+ WorkList.erase(WorkList.begin()); // Get an element from the worklist...
+
+ if (!I->use_empty()) // Don't muck with dead instructions...
+ if (Constant *C = ConstantFoldInstruction(I)) {
+ // Add all of the users of this instruction to the worklist, they might
+ // be constant propagatable now...
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI)
+ WorkList.insert(cast<Instruction>(*UI));
+
+ // Replace all of the uses of a variable with uses of the constant.
+ I->replaceAllUsesWith(C);
+
+ // Remove the dead instruction.
+ WorkList.erase(I);
+ I->eraseFromParent();
+
+ // We made a change to the function...
+ Changed = true;
+ ++NumInstKilled;
+ }
+ }
+ return Changed;
+}
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 0000000..8bb504c
--- /dev/null
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,133 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead. Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
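+// For example (illustrative): in
+//   %a = add i32 %x, 1     ; only used by %b
+//   %b = mul i32 %a, 2     ; unused
+// a single DIE sweep removes %b; DCE's worklist then also notices that %a
+// has become dead and removes it.
+//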
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
+STATISTIC(DCEEliminated, "Number of insts removed");
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ // DeadInstElimination pass implementation
+ //
+ struct VISIBILITY_HIDDEN DeadInstElimination : public BasicBlockPass {
+ static char ID; // Pass identification, replacement for typeid
+ DeadInstElimination() : BasicBlockPass(&ID) {}
+ virtual bool runOnBasicBlock(BasicBlock &BB) {
+ bool Changed = false;
+ for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+ Instruction *Inst = DI++;
+ if (isInstructionTriviallyDead(Inst)) {
+ Inst->eraseFromParent();
+ Changed = true;
+ ++DIEEliminated;
+ }
+ }
+ return Changed;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ }
+ };
+}
+
+char DeadInstElimination::ID = 0;
+static RegisterPass<DeadInstElimination>
+X("die", "Dead Instruction Elimination");
+
+Pass *llvm::createDeadInstEliminationPass() {
+ return new DeadInstElimination();
+}
+
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ // DeadCodeElimination pass implementation
+ //
+ struct DCE : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DCE() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ }
+ };
+}
+
+char DCE::ID = 0;
+static RegisterPass<DCE> Y("dce", "Dead Code Elimination");
+
+bool DCE::runOnFunction(Function &F) {
+ // Start out with all of the instructions in the worklist...
+ std::vector<Instruction*> WorkList;
+ for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i)
+ WorkList.push_back(&*i);
+
+ // Loop over the worklist finding instructions that are dead. If they are
+ // dead make them drop all of their uses, making other instructions
+ // potentially dead, and work until the worklist is empty.
+ //
+ bool MadeChange = false;
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.back();
+ WorkList.pop_back();
+
+ if (isInstructionTriviallyDead(I)) { // If the instruction is dead.
+ // Loop over all of the values that the instruction uses, if there are
+ // instructions being used, add them to the worklist, because they might
+ // go dead after this one is removed.
+ //
+ for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+ if (Instruction *Used = dyn_cast<Instruction>(*OI))
+ WorkList.push_back(Used);
+
+ // Remove the instruction.
+ I->eraseFromParent();
+
+ // Remove the instruction from the worklist if it still exists in it.
+ for (std::vector<Instruction*>::iterator WI = WorkList.begin();
+ WI != WorkList.end(); ) {
+ if (*WI == I)
+ WI = WorkList.erase(WI);
+ else
+ ++WI;
+ }
+
+ MadeChange = true;
+ ++DCEEliminated;
+ }
+ }
+ return MadeChange;
+}
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+ return new DCE();
+}
+
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
new file mode 100644
index 0000000..b923c92
--- /dev/null
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -0,0 +1,461 @@
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal. Doing so would be pretty trivial.
+//
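+// For example (illustrative): in
+//   store i32 1, i32* %p
+//   store i32 2, i32* %p
+// the first store is dead, since the second overwrites it before any
+// intervening load, and this pass deletes it.
+//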
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dse"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther , "Number of other instrs removed");
+
+namespace {
+ struct VISIBILITY_HIDDEN DSE : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DSE() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F) {
+ bool Changed = false;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ Changed |= runOnBasicBlock(*I);
+ return Changed;
+ }
+
+ bool runOnBasicBlock(BasicBlock &BB);
+ bool handleFreeWithNonTrivialDependency(FreeInst *F, MemDepResult Dep);
+ bool handleEndBlock(BasicBlock &BB);
+ bool RemoveUndeadPointers(Value* Ptr, uint64_t killPointerSize,
+ BasicBlock::iterator& BBI,
+ SmallPtrSet<Value*, 64>& deadPointers);
+ void DeleteDeadInstruction(Instruction *I,
+ SmallPtrSet<Value*, 64> *deadPointers = 0);
+
+
+    // getAnalysisUsage - We require the dominator tree, target data, alias
+    // analysis, and memory dependence analysis, and preserve them where
+    // possible.
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<TargetData>();
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<MemoryDependenceAnalysis>();
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<AliasAnalysis>();
+ AU.addPreserved<MemoryDependenceAnalysis>();
+ }
+ };
+}
+
+char DSE::ID = 0;
+static RegisterPass<DSE> X("dse", "Dead Store Elimination");
+
+FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
+
+bool DSE::runOnBasicBlock(BasicBlock &BB) {
+ MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+ TargetData &TD = getAnalysis<TargetData>();
+
+ bool MadeChange = false;
+
+ // Do a top-down walk on the BB
+ for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+ Instruction *Inst = BBI++;
+
+    // If we find a store or a free, get its memory dependence.
+ if (!isa<StoreInst>(Inst) && !isa<FreeInst>(Inst))
+ continue;
+
+ // Don't molest volatile stores or do queries that will return "clobber".
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ if (SI->isVolatile())
+ continue;
+
+ MemDepResult InstDep = MD.getDependency(Inst);
+
+ // Ignore non-local stores.
+ // FIXME: cross-block DSE would be fun. :)
+ if (InstDep.isNonLocal()) continue;
+
+ // Handle frees whose dependencies are non-trivial.
+ if (FreeInst *FI = dyn_cast<FreeInst>(Inst)) {
+ MadeChange |= handleFreeWithNonTrivialDependency(FI, InstDep);
+ continue;
+ }
+
+ StoreInst *SI = cast<StoreInst>(Inst);
+
+ // If not a definite must-alias dependency, ignore it.
+ if (!InstDep.isDef())
+ continue;
+
+ // If this is a store-store dependence, then the previous store is dead so
+ // long as this store is at least as big as it.
+ if (StoreInst *DepStore = dyn_cast<StoreInst>(InstDep.getInst()))
+ if (TD.getTypeStoreSize(DepStore->getOperand(0)->getType()) <=
+ TD.getTypeStoreSize(SI->getOperand(0)->getType())) {
+ // Delete the store and now-dead instructions that feed it.
+ DeleteDeadInstruction(DepStore);
+ NumFastStores++;
+ MadeChange = true;
+
+ if (BBI != BB.begin())
+ --BBI;
+ continue;
+ }
+
+ // If we're storing the same value back to a pointer that we just
+ // loaded from, then the store can be removed.
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) {
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+ SI->getOperand(0) == DepLoad) {
+ DeleteDeadInstruction(SI);
+ if (BBI != BB.begin())
+ --BBI;
+ NumFastStores++;
+ MadeChange = true;
+ continue;
+ }
+ }
+ }
+
+ // If this block ends in a return, unwind, or unreachable, all allocas are
+ // dead at its end, which means stores to them are also dead.
+ if (BB.getTerminator()->getNumSuccessors() == 0)
+ MadeChange |= handleEndBlock(BB);
+
+ return MadeChange;
+}
+
+/// handleFreeWithNonTrivialDependency - Handle frees of entire structures whose
+/// dependency is a store to a field of that structure.
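+///
+/// For example (illustrative, hypothetical names): in
+///   store i32 0, i32* %field      ; %field points into %obj
+///   free i8* %obj
+/// the store is dead because the whole object is freed immediately
+/// afterward, so it can be deleted.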
+bool DSE::handleFreeWithNonTrivialDependency(FreeInst *F, MemDepResult Dep) {
+ AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+ StoreInst *Dependency = dyn_cast_or_null<StoreInst>(Dep.getInst());
+ if (!Dependency || Dependency->isVolatile())
+ return false;
+
+ Value *DepPointer = Dependency->getPointerOperand()->getUnderlyingObject();
+
+ // Check for aliasing.
+ if (AA.alias(F->getPointerOperand(), 1, DepPointer, 1) !=
+ AliasAnalysis::MustAlias)
+ return false;
+
+ // DCE instructions only used to calculate that store
+ DeleteDeadInstruction(Dependency);
+ NumFastStores++;
+ return true;
+}
+
+/// handleEndBlock - Remove dead stores to stack-allocated locations in the
+/// function end block. Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+bool DSE::handleEndBlock(BasicBlock &BB) {
+ TargetData &TD = getAnalysis<TargetData>();
+ AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+ bool MadeChange = false;
+
+ // Pointers alloca'd in this function are dead in the end block
+ SmallPtrSet<Value*, 64> deadPointers;
+
+ // Find all of the alloca'd pointers in the entry block.
+ BasicBlock *Entry = BB.getParent()->begin();
+ for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ deadPointers.insert(AI);
+
+ // Treat byval arguments the same, stores to them are dead at the end of the
+ // function.
+ for (Function::arg_iterator AI = BB.getParent()->arg_begin(),
+ AE = BB.getParent()->arg_end(); AI != AE; ++AI)
+ if (AI->hasByValAttr())
+ deadPointers.insert(AI);
+
+ // Scan the basic block backwards
+ for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+ --BBI;
+
+ // If we find a store whose pointer is dead.
+ if (StoreInst* S = dyn_cast<StoreInst>(BBI)) {
+ if (!S->isVolatile()) {
+ // See through pointer-to-pointer bitcasts
+ Value* pointerOperand = S->getPointerOperand()->getUnderlyingObject();
+
+ // Alloca'd pointers or byval arguments (which are functionally like
+ // alloca's) are valid candidates for removal.
+ if (deadPointers.count(pointerOperand)) {
+ // DCE instructions only used to calculate that store.
+ BBI++;
+ DeleteDeadInstruction(S, &deadPointers);
+ NumFastStores++;
+ MadeChange = true;
+ }
+ }
+
+ continue;
+ }
+
+ // We can also remove memcpy's to local variables at the end of a function.
+ if (MemCpyInst *M = dyn_cast<MemCpyInst>(BBI)) {
+ Value *dest = M->getDest()->getUnderlyingObject();
+
+ if (deadPointers.count(dest)) {
+ BBI++;
+ DeleteDeadInstruction(M, &deadPointers);
+ NumFastOther++;
+ MadeChange = true;
+ continue;
+ }
+
+ // Because a memcpy is also a load, we can't skip it if we didn't remove
+ // it.
+ }
+
+ Value* killPointer = 0;
+ uint64_t killPointerSize = ~0UL;
+
+ // If we encounter a use of the pointer, it is no longer considered dead
+ if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+ // However, if this load is unused and not volatile, we can go ahead and
+ // remove it, and not have to worry about it making our pointer undead!
+ if (L->use_empty() && !L->isVolatile()) {
+ BBI++;
+ DeleteDeadInstruction(L, &deadPointers);
+ NumFastOther++;
+ MadeChange = true;
+ continue;
+ }
+
+ killPointer = L->getPointerOperand();
+ } else if (VAArgInst* V = dyn_cast<VAArgInst>(BBI)) {
+ killPointer = V->getOperand(0);
+ } else if (isa<MemCpyInst>(BBI) &&
+ isa<ConstantInt>(cast<MemCpyInst>(BBI)->getLength())) {
+ killPointer = cast<MemCpyInst>(BBI)->getSource();
+ killPointerSize = cast<ConstantInt>(
+ cast<MemCpyInst>(BBI)->getLength())->getZExtValue();
+ } else if (AllocaInst* A = dyn_cast<AllocaInst>(BBI)) {
+ deadPointers.erase(A);
+
+ // Dead alloca's can be DCE'd when we reach them
+ if (A->use_empty()) {
+ BBI++;
+ DeleteDeadInstruction(A, &deadPointers);
+ NumFastOther++;
+ MadeChange = true;
+ }
+
+ continue;
+ } else if (CallSite::get(BBI).getInstruction() != 0) {
+ // If this call does not access memory, it can't
+ // be undeadifying any of our pointers.
+ CallSite CS = CallSite::get(BBI);
+ if (AA.doesNotAccessMemory(CS))
+ continue;
+
+ unsigned modRef = 0;
+ unsigned other = 0;
+
+ // Remove any pointers made undead by the call from the dead set
+ std::vector<Value*> dead;
+ for (SmallPtrSet<Value*, 64>::iterator I = deadPointers.begin(),
+ E = deadPointers.end(); I != E; ++I) {
+ // HACK: if we detect that our AA is imprecise, it's not
+ // worth it to scan the rest of the deadPointers set. Just
+ // assume that the AA will return ModRef for everything, and
+ // go ahead and bail.
+ if (modRef >= 16 && other == 0) {
+ deadPointers.clear();
+ return MadeChange;
+ }
+
+ // Get size information for the alloca
+ unsigned pointerSize = ~0U;
+ if (AllocaInst* A = dyn_cast<AllocaInst>(*I)) {
+ if (ConstantInt* C = dyn_cast<ConstantInt>(A->getArraySize()))
+ pointerSize = C->getZExtValue() *
+ TD.getTypeAllocSize(A->getAllocatedType());
+ } else {
+ const PointerType* PT = cast<PointerType>(
+ cast<Argument>(*I)->getType());
+ pointerSize = TD.getTypeAllocSize(PT->getElementType());
+ }
+
+ // See if the call site touches it
+ AliasAnalysis::ModRefResult A = AA.getModRefInfo(CS, *I, pointerSize);
+
+ if (A == AliasAnalysis::ModRef)
+ modRef++;
+ else
+ other++;
+
+ if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref)
+ dead.push_back(*I);
+ }
+
+ for (std::vector<Value*>::iterator I = dead.begin(), E = dead.end();
+ I != E; ++I)
+ deadPointers.erase(*I);
+
+ continue;
+ } else if (isInstructionTriviallyDead(BBI)) {
+ // For any non-memory-affecting non-terminators, DCE them as we reach them
+ Instruction *Inst = BBI;
+ BBI++;
+ DeleteDeadInstruction(Inst, &deadPointers);
+ NumFastOther++;
+ MadeChange = true;
+ continue;
+ }
+
+ if (!killPointer)
+ continue;
+
+ killPointer = killPointer->getUnderlyingObject();
+
+ // Deal with undead pointers
+ MadeChange |= RemoveUndeadPointers(killPointer, killPointerSize, BBI,
+ deadPointers);
+ }
+
+ return MadeChange;
+}
+
+/// RemoveUndeadPointers - Check for uses of a pointer that make it
+/// undead when scanning for dead stores to alloca's.
+bool DSE::RemoveUndeadPointers(Value* killPointer, uint64_t killPointerSize,
+ BasicBlock::iterator &BBI,
+ SmallPtrSet<Value*, 64>& deadPointers) {
+ TargetData &TD = getAnalysis<TargetData>();
+ AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+ // If the kill pointer can be easily reduced to an alloca,
+ // don't bother doing extraneous AA queries.
+ if (deadPointers.count(killPointer)) {
+ deadPointers.erase(killPointer);
+ return false;
+ }
+
+ // A global can't be in the dead pointer set.
+ if (isa<GlobalValue>(killPointer))
+ return false;
+
+ bool MadeChange = false;
+
+ SmallVector<Value*, 16> undead;
+
+ for (SmallPtrSet<Value*, 64>::iterator I = deadPointers.begin(),
+ E = deadPointers.end(); I != E; ++I) {
+ // Get size information for the alloca or argument.
+ unsigned pointerSize = ~0U;
+ if (AllocaInst* A = dyn_cast<AllocaInst>(*I)) {
+ if (ConstantInt* C = dyn_cast<ConstantInt>(A->getArraySize()))
+ pointerSize = C->getZExtValue() *
+ TD.getTypeAllocSize(A->getAllocatedType());
+ } else {
+ const PointerType* PT = cast<PointerType>(cast<Argument>(*I)->getType());
+ pointerSize = TD.getTypeAllocSize(PT->getElementType());
+ }
+
+ // See if this pointer could alias it
+ AliasAnalysis::AliasResult A = AA.alias(*I, pointerSize,
+ killPointer, killPointerSize);
+
+ // If it must-alias and a store, we can delete it
+ if (isa<StoreInst>(BBI) && A == AliasAnalysis::MustAlias) {
+ StoreInst* S = cast<StoreInst>(BBI);
+
+ // Remove it!
+ BBI++;
+ DeleteDeadInstruction(S, &deadPointers);
+ NumFastStores++;
+ MadeChange = true;
+
+ continue;
+
+ // Otherwise, it is undead
+ } else if (A != AliasAnalysis::NoAlias)
+ undead.push_back(*I);
+ }
+
+ for (SmallVector<Value*, 16>::iterator I = undead.begin(), E = undead.end();
+ I != E; ++I)
+ deadPointers.erase(*I);
+
+ return MadeChange;
+}
+
+/// DeleteDeadInstruction - Delete this instruction. Before we do, go through
+/// and zero out all the operands of this instruction. If any of them become
+/// dead, delete them and the computation tree that feeds them.
+///
+/// If ValueSet is non-null, remove any deleted instructions from it as well.
+///
+void DSE::DeleteDeadInstruction(Instruction *I,
+ SmallPtrSet<Value*, 64> *ValueSet) {
+ SmallVector<Instruction*, 32> NowDeadInsts;
+
+ NowDeadInsts.push_back(I);
+ --NumFastOther;
+
+ // Before we touch this instruction, remove it from memdep!
+ MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>();
+ while (!NowDeadInsts.empty()) {
+ Instruction *DeadInst = NowDeadInsts.back();
+ NowDeadInsts.pop_back();
+
+ ++NumFastOther;
+
+ // This instruction is dead, zap it, in stages. Start by removing it from
+ // MemDep, which needs to know the operands and needs it to be in the
+ // function.
+ MDA.removeInstruction(DeadInst);
+
+ for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+ Value *Op = DeadInst->getOperand(op);
+ DeadInst->setOperand(op, 0);
+
+ // If this operand just became dead, add it to the NowDeadInsts list.
+ if (!Op->use_empty()) continue;
+
+ if (Instruction *OpI = dyn_cast<Instruction>(Op))
+ if (isInstructionTriviallyDead(OpI))
+ NowDeadInsts.push_back(OpI);
+ }
+
+ DeadInst->eraseFromParent();
+
+ if (ValueSet) ValueSet->erase(DeadInst);
+ }
+}
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
new file mode 100644
index 0000000..733dfa9
--- /dev/null
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -0,0 +1,1738 @@
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions. It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
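+// As an illustrative example (hypothetical IR, not from the sources): given
+//
+//   %x = add i32 %a, %b
+//   %y = add i32 %a, %b
+//
+// both adds receive the same value number, so %y is replaced by %x and
+// deleted.
+//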
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gvn"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Value.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cstdio>
+using namespace llvm;
+
+STATISTIC(NumGVNInstr, "Number of instructions deleted");
+STATISTIC(NumGVNLoad, "Number of loads deleted");
+STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
+STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumPRELoad, "Number of loads PRE'd");
+
+static cl::opt<bool> EnablePRE("enable-pre",
+ cl::init(true), cl::Hidden);
+cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
+
+//===----------------------------------------------------------------------===//
+// ValueTable Class
+//===----------------------------------------------------------------------===//
+
+/// This class holds the mapping between values and value numbers. It is used
+/// as an efficient mechanism to determine the expression-wise equivalence of
+/// two values.
+namespace {
+ struct VISIBILITY_HIDDEN Expression {
+ enum ExpressionOpcode { ADD, SUB, MUL, UDIV, SDIV, FDIV, UREM, SREM,
+ FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ,
+ ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE,
+ ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ,
+ FCMPOGT, FCMPOGE, FCMPOLT, FCMPOLE, FCMPONE,
+ FCMPORD, FCMPUNO, FCMPUEQ, FCMPUGT, FCMPUGE,
+ FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT,
+ SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI,
+ FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT,
+ PTRTOINT, INTTOPTR, BITCAST, GEP, CALL, CONSTANT,
+ EMPTY, TOMBSTONE };
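+ // EMPTY and TOMBSTONE never correspond to real instructions; they exist
+ // only to serve as the DenseMap empty and tombstone sentinel keys below.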
+
+ ExpressionOpcode opcode;
+ const Type* type;
+ uint32_t firstVN;
+ uint32_t secondVN;
+ uint32_t thirdVN;
+ SmallVector<uint32_t, 4> varargs;
+ Value* function;
+
+ Expression() { }
+ Expression(ExpressionOpcode o) : opcode(o) { }
+
+ bool operator==(const Expression &other) const {
+ if (opcode != other.opcode)
+ return false;
+ else if (opcode == EMPTY || opcode == TOMBSTONE)
+ return true;
+ else if (type != other.type)
+ return false;
+ else if (function != other.function)
+ return false;
+ else if (firstVN != other.firstVN)
+ return false;
+ else if (secondVN != other.secondVN)
+ return false;
+ else if (thirdVN != other.thirdVN)
+ return false;
+ else {
+ if (varargs.size() != other.varargs.size())
+ return false;
+
+ for (size_t i = 0; i < varargs.size(); ++i)
+ if (varargs[i] != other.varargs[i])
+ return false;
+
+ return true;
+ }
+ }
+
+ bool operator!=(const Expression &other) const {
+ return !(*this == other);
+ }
+ };
+
+ class VISIBILITY_HIDDEN ValueTable {
+ private:
+ DenseMap<Value*, uint32_t> valueNumbering;
+ DenseMap<Expression, uint32_t> expressionNumbering;
+ AliasAnalysis* AA;
+ MemoryDependenceAnalysis* MD;
+ DominatorTree* DT;
+
+ uint32_t nextValueNumber;
+
+ Expression::ExpressionOpcode getOpcode(BinaryOperator* BO);
+ Expression::ExpressionOpcode getOpcode(CmpInst* C);
+ Expression::ExpressionOpcode getOpcode(CastInst* C);
+ Expression create_expression(BinaryOperator* BO);
+ Expression create_expression(CmpInst* C);
+ Expression create_expression(ShuffleVectorInst* V);
+ Expression create_expression(ExtractElementInst* C);
+ Expression create_expression(InsertElementInst* V);
+ Expression create_expression(SelectInst* V);
+ Expression create_expression(CastInst* C);
+ Expression create_expression(GetElementPtrInst* G);
+ Expression create_expression(CallInst* C);
+ Expression create_expression(Constant* C);
+ public:
+ ValueTable() : nextValueNumber(1) { }
+ uint32_t lookup_or_add(Value* V);
+ uint32_t lookup(Value* V) const;
+ void add(Value* V, uint32_t num);
+ void clear();
+ void erase(Value* v);
+ unsigned size();
+ void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
+ AliasAnalysis *getAliasAnalysis() const { return AA; }
+ void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
+ void setDomTree(DominatorTree* D) { DT = D; }
+ uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
+ void verifyRemoved(const Value *) const;
+ };
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<Expression> {
+ static inline Expression getEmptyKey() {
+ return Expression(Expression::EMPTY);
+ }
+
+ static inline Expression getTombstoneKey() {
+ return Expression(Expression::TOMBSTONE);
+ }
+
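+ // Combine every field with a multiply-by-37 rolling hash; the two pointer
+ // fields are pre-mixed by xoring shifted copies of themselves so pointer
+ // alignment does not leave the low bits constant.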
+ static unsigned getHashValue(const Expression e) {
+ unsigned hash = e.opcode;
+
+ hash = e.firstVN + hash * 37;
+ hash = e.secondVN + hash * 37;
+ hash = e.thirdVN + hash * 37;
+
+ hash = ((unsigned)((uintptr_t)e.type >> 4) ^
+ (unsigned)((uintptr_t)e.type >> 9)) +
+ hash * 37;
+
+ for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(),
+ E = e.varargs.end(); I != E; ++I)
+ hash = *I + hash * 37;
+
+ hash = ((unsigned)((uintptr_t)e.function >> 4) ^
+ (unsigned)((uintptr_t)e.function >> 9)) +
+ hash * 37;
+
+ return hash;
+ }
+ static bool isEqual(const Expression &LHS, const Expression &RHS) {
+ return LHS == RHS;
+ }
+ static bool isPod() { return true; }
+};
+}
+
+//===----------------------------------------------------------------------===//
+// ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+Expression::ExpressionOpcode ValueTable::getOpcode(BinaryOperator* BO) {
+ switch(BO->getOpcode()) {
+ default: // THIS SHOULD NEVER HAPPEN
+ assert(0 && "Binary operator with unknown opcode?");
+ case Instruction::Add: return Expression::ADD;
+ case Instruction::Sub: return Expression::SUB;
+ case Instruction::Mul: return Expression::MUL;
+ case Instruction::UDiv: return Expression::UDIV;
+ case Instruction::SDiv: return Expression::SDIV;
+ case Instruction::FDiv: return Expression::FDIV;
+ case Instruction::URem: return Expression::UREM;
+ case Instruction::SRem: return Expression::SREM;
+ case Instruction::FRem: return Expression::FREM;
+ case Instruction::Shl: return Expression::SHL;
+ case Instruction::LShr: return Expression::LSHR;
+ case Instruction::AShr: return Expression::ASHR;
+ case Instruction::And: return Expression::AND;
+ case Instruction::Or: return Expression::OR;
+ case Instruction::Xor: return Expression::XOR;
+ }
+}
+
+Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) {
+ if (isa<ICmpInst>(C) || isa<VICmpInst>(C)) {
+ switch (C->getPredicate()) {
+ default: // THIS SHOULD NEVER HAPPEN
+ assert(0 && "Comparison with unknown predicate?");
+ case ICmpInst::ICMP_EQ: return Expression::ICMPEQ;
+ case ICmpInst::ICMP_NE: return Expression::ICMPNE;
+ case ICmpInst::ICMP_UGT: return Expression::ICMPUGT;
+ case ICmpInst::ICMP_UGE: return Expression::ICMPUGE;
+ case ICmpInst::ICMP_ULT: return Expression::ICMPULT;
+ case ICmpInst::ICMP_ULE: return Expression::ICMPULE;
+ case ICmpInst::ICMP_SGT: return Expression::ICMPSGT;
+ case ICmpInst::ICMP_SGE: return Expression::ICMPSGE;
+ case ICmpInst::ICMP_SLT: return Expression::ICMPSLT;
+ case ICmpInst::ICMP_SLE: return Expression::ICMPSLE;
+ }
+ }
+ assert((isa<FCmpInst>(C) || isa<VFCmpInst>(C)) && "Unknown compare");
+ switch (C->getPredicate()) {
+ default: // THIS SHOULD NEVER HAPPEN
+ assert(0 && "Comparison with unknown predicate?");
+ case FCmpInst::FCMP_OEQ: return Expression::FCMPOEQ;
+ case FCmpInst::FCMP_OGT: return Expression::FCMPOGT;
+ case FCmpInst::FCMP_OGE: return Expression::FCMPOGE;
+ case FCmpInst::FCMP_OLT: return Expression::FCMPOLT;
+ case FCmpInst::FCMP_OLE: return Expression::FCMPOLE;
+ case FCmpInst::FCMP_ONE: return Expression::FCMPONE;
+ case FCmpInst::FCMP_ORD: return Expression::FCMPORD;
+ case FCmpInst::FCMP_UNO: return Expression::FCMPUNO;
+ case FCmpInst::FCMP_UEQ: return Expression::FCMPUEQ;
+ case FCmpInst::FCMP_UGT: return Expression::FCMPUGT;
+ case FCmpInst::FCMP_UGE: return Expression::FCMPUGE;
+ case FCmpInst::FCMP_ULT: return Expression::FCMPULT;
+ case FCmpInst::FCMP_ULE: return Expression::FCMPULE;
+ case FCmpInst::FCMP_UNE: return Expression::FCMPUNE;
+ }
+}
+
+Expression::ExpressionOpcode ValueTable::getOpcode(CastInst* C) {
+ switch(C->getOpcode()) {
+ default: // THIS SHOULD NEVER HAPPEN
+ assert(0 && "Cast operator with unknown opcode?");
+ case Instruction::Trunc: return Expression::TRUNC;
+ case Instruction::ZExt: return Expression::ZEXT;
+ case Instruction::SExt: return Expression::SEXT;
+ case Instruction::FPToUI: return Expression::FPTOUI;
+ case Instruction::FPToSI: return Expression::FPTOSI;
+ case Instruction::UIToFP: return Expression::UITOFP;
+ case Instruction::SIToFP: return Expression::SITOFP;
+ case Instruction::FPTrunc: return Expression::FPTRUNC;
+ case Instruction::FPExt: return Expression::FPEXT;
+ case Instruction::PtrToInt: return Expression::PTRTOINT;
+ case Instruction::IntToPtr: return Expression::INTTOPTR;
+ case Instruction::BitCast: return Expression::BITCAST;
+ }
+}
+
+Expression ValueTable::create_expression(CallInst* C) {
+ Expression e;
+
+ e.type = C->getType();
+ e.firstVN = 0;
+ e.secondVN = 0;
+ e.thirdVN = 0;
+ e.function = C->getCalledFunction();
+ e.opcode = Expression::CALL;
+
+ for (CallInst::op_iterator I = C->op_begin()+1, E = C->op_end();
+ I != E; ++I)
+ e.varargs.push_back(lookup_or_add(*I));
+
+ return e;
+}
+
+Expression ValueTable::create_expression(BinaryOperator* BO) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(BO->getOperand(0));
+ e.secondVN = lookup_or_add(BO->getOperand(1));
+ e.thirdVN = 0;
+ e.function = 0;
+ e.type = BO->getType();
+ e.opcode = getOpcode(BO);
+
+ return e;
+}
+
+Expression ValueTable::create_expression(CmpInst* C) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(C->getOperand(0));
+ e.secondVN = lookup_or_add(C->getOperand(1));
+ e.thirdVN = 0;
+ e.function = 0;
+ e.type = C->getType();
+ e.opcode = getOpcode(C);
+
+ return e;
+}
+
+Expression ValueTable::create_expression(CastInst* C) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(C->getOperand(0));
+ e.secondVN = 0;
+ e.thirdVN = 0;
+ e.function = 0;
+ e.type = C->getType();
+ e.opcode = getOpcode(C);
+
+ return e;
+}
+
+Expression ValueTable::create_expression(ShuffleVectorInst* S) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(S->getOperand(0));
+ e.secondVN = lookup_or_add(S->getOperand(1));
+ e.thirdVN = lookup_or_add(S->getOperand(2));
+ e.function = 0;
+ e.type = S->getType();
+ e.opcode = Expression::SHUFFLE;
+
+ return e;
+}
+
+Expression ValueTable::create_expression(ExtractElementInst* E) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(E->getOperand(0));
+ e.secondVN = lookup_or_add(E->getOperand(1));
+ e.thirdVN = 0;
+ e.function = 0;
+ e.type = E->getType();
+ e.opcode = Expression::EXTRACT;
+
+ return e;
+}
+
+Expression ValueTable::create_expression(InsertElementInst* I) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(I->getOperand(0));
+ e.secondVN = lookup_or_add(I->getOperand(1));
+ e.thirdVN = lookup_or_add(I->getOperand(2));
+ e.function = 0;
+ e.type = I->getType();
+ e.opcode = Expression::INSERT;
+
+ return e;
+}
+
+Expression ValueTable::create_expression(SelectInst* I) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(I->getCondition());
+ e.secondVN = lookup_or_add(I->getTrueValue());
+ e.thirdVN = lookup_or_add(I->getFalseValue());
+ e.function = 0;
+ e.type = I->getType();
+ e.opcode = Expression::SELECT;
+
+ return e;
+}
+
+Expression ValueTable::create_expression(GetElementPtrInst* G) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(G->getPointerOperand());
+ e.secondVN = 0;
+ e.thirdVN = 0;
+ e.function = 0;
+ e.type = G->getType();
+ e.opcode = Expression::GEP;
+
+ for (GetElementPtrInst::op_iterator I = G->idx_begin(), E = G->idx_end();
+ I != E; ++I)
+ e.varargs.push_back(lookup_or_add(*I));
+
+ return e;
+}
+
+//===----------------------------------------------------------------------===//
+// ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+/// add - Insert a value into the table with a specified value number.
+void ValueTable::add(Value* V, uint32_t num) {
+ valueNumbering.insert(std::make_pair(V, num));
+}
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t ValueTable::lookup_or_add(Value* V) {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ if (VI != valueNumbering.end())
+ return VI->second;
+
+ if (CallInst* C = dyn_cast<CallInst>(V)) {
+ if (AA->doesNotAccessMemory(C)) {
+ Expression e = create_expression(C);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (AA->onlyReadsMemory(C)) {
+ Expression e = create_expression(C);
+
+ if (expressionNumbering.find(e) == expressionNumbering.end()) {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+
+ MemDepResult local_dep = MD->getDependency(C);
+
+ if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+
+ if (local_dep.isDef()) {
+ CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
+
+ if (local_cdep->getNumOperands() != C->getNumOperands()) {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+
+ for (unsigned i = 1; i < C->getNumOperands(); ++i) {
+ uint32_t c_vn = lookup_or_add(C->getOperand(i));
+ uint32_t cd_vn = lookup_or_add(local_cdep->getOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookup_or_add(local_cdep);
+ valueNumbering.insert(std::make_pair(V, v));
+ return v;
+ }
+
+ // Non-local case.
+ const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(CallSite(C));
+ // FIXME: call/call dependencies for readonly calls should return def, not
+ // clobber! Move the checking logic to MemDep!
+ CallInst* cdep = 0;
+
+ // Check to see if we have a single dominating call instruction that is
+ // identical to C.
+ for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+ const MemoryDependenceAnalysis::NonLocalDepEntry *I = &deps[i];
+ // Ignore non-local dependencies.
+ if (I->second.isNonLocal())
+ continue;
+
+ // We don't handle non-dependencies. If we already have a call, reject
+ // instruction dependencies.
+ if (I->second.isClobber() || cdep != 0) {
+ cdep = 0;
+ break;
+ }
+
+ CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->second.getInst());
+ // FIXME: All duplicated with non-local case.
+ if (NonLocalDepCall && DT->properlyDominates(I->first, C->getParent())){
+ cdep = NonLocalDepCall;
+ continue;
+ }
+
+ cdep = 0;
+ break;
+ }
+
+ if (!cdep) {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+
+ if (cdep->getNumOperands() != C->getNumOperands()) {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+ for (unsigned i = 1; i < C->getNumOperands(); ++i) {
+ uint32_t c_vn = lookup_or_add(C->getOperand(i));
+ uint32_t cd_vn = lookup_or_add(cdep->getOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookup_or_add(cdep);
+ valueNumbering.insert(std::make_pair(V, v));
+ return v;
+
+ } else {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+ } else if (BinaryOperator* BO = dyn_cast<BinaryOperator>(V)) {
+ Expression e = create_expression(BO);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (CmpInst* C = dyn_cast<CmpInst>(V)) {
+ Expression e = create_expression(C);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (ShuffleVectorInst* U = dyn_cast<ShuffleVectorInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (ExtractElementInst* U = dyn_cast<ExtractElementInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (InsertElementInst* U = dyn_cast<InsertElementInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (SelectInst* U = dyn_cast<SelectInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (CastInst* U = dyn_cast<CastInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+}
+
+/// lookup - Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t ValueTable::lookup(Value* V) const {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ assert(VI != valueNumbering.end() && "Value not numbered?");
+ return VI->second;
+}
+
+/// clear - Remove all entries from the ValueTable
+void ValueTable::clear() {
+ valueNumbering.clear();
+ expressionNumbering.clear();
+ nextValueNumber = 1;
+}
+
+/// erase - Remove a value from the value numbering
+void ValueTable::erase(Value* V) {
+ valueNumbering.erase(V);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void ValueTable::verifyRemoved(const Value *V) const {
+ for (DenseMap<Value*, uint32_t>::iterator
+ I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+ assert(I->first != V && "Inst still occurs in value numbering map!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
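+// ValueNumberScope - A per-block table mapping value numbers to their leader
+// value in that block, chained through 'parent' to the scope of the block's
+// immediate dominator so that lookups can walk up the dominator tree.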
+namespace {
+ struct VISIBILITY_HIDDEN ValueNumberScope {
+ ValueNumberScope* parent;
+ DenseMap<uint32_t, Value*> table;
+
+ ValueNumberScope(ValueNumberScope* p) : parent(p) { }
+ };
+}
+
+namespace {
+
+ class VISIBILITY_HIDDEN GVN : public FunctionPass {
+ bool runOnFunction(Function &F);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ GVN() : FunctionPass(&ID) { }
+
+ private:
+ MemoryDependenceAnalysis *MD;
+ DominatorTree *DT;
+
+ ValueTable VN;
+ DenseMap<BasicBlock*, ValueNumberScope*> localAvail;
+
+ typedef DenseMap<Value*, SmallPtrSet<Instruction*, 4> > PhiMapType;
+ PhiMapType phiMap;
+
+
+ // This transformation requires dominator info.
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<MemoryDependenceAnalysis>();
+ AU.addRequired<AliasAnalysis>();
+
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<AliasAnalysis>();
+ }
+
+ // Helper functions
+ // FIXME: eliminate or document these better
+ bool processLoad(LoadInst* L,
+ SmallVectorImpl<Instruction*> &toErase);
+ bool processInstruction(Instruction* I,
+ SmallVectorImpl<Instruction*> &toErase);
+ bool processNonLocalLoad(LoadInst* L,
+ SmallVectorImpl<Instruction*> &toErase);
+ bool processBlock(BasicBlock* BB);
+ Value *GetValueForBlock(BasicBlock *BB, Instruction* orig,
+ DenseMap<BasicBlock*, Value*> &Phis,
+ bool top_level = false);
+ void dump(DenseMap<uint32_t, Value*>& d);
+ bool iterateOnFunction(Function &F);
+ Value* CollapsePhi(PHINode* p);
+ bool isSafeReplacement(PHINode* p, Instruction* inst);
+ bool performPRE(Function& F);
+ Value* lookupNumber(BasicBlock* BB, uint32_t num);
+ bool mergeBlockIntoPredecessor(BasicBlock* BB);
+ Value* AttemptRedundancyElimination(Instruction* orig, unsigned valno);
+ void cleanupGlobalSets();
+ void verifyRemoved(const Instruction *I) const;
+ };
+
+ char GVN::ID = 0;
+}
+
+// createGVNPass - The public interface to this file...
+FunctionPass *llvm::createGVNPass() { return new GVN(); }
+
+static RegisterPass<GVN> X("gvn",
+ "Global Value Numbering");
+
+void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+ printf("{\n");
+ for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+ E = d.end(); I != E; ++I) {
+ printf("%d\n", I->first);
+ I->second->dump();
+ }
+ printf("}\n");
+}
+
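+/// CollapsePhi - If this PHI trivially reduces to a single value (all of its
+/// incoming values are the same), return that value when it is safe to use as
+/// a replacement for the PHI; otherwise return null.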
+Value* GVN::CollapsePhi(PHINode* p) {
+ Value* constVal = p->hasConstantValue();
+ if (!constVal) return 0;
+
+ Instruction* inst = dyn_cast<Instruction>(constVal);
+ if (!inst)
+ return constVal;
+
+ if (DT->dominates(inst, p))
+ if (isSafeReplacement(p, inst))
+ return inst;
+ return 0;
+}
+
+bool GVN::isSafeReplacement(PHINode* p, Instruction* inst) {
+ if (!isa<PHINode>(inst))
+ return true;
+
+ for (Instruction::use_iterator UI = p->use_begin(), E = p->use_end();
+ UI != E; ++UI)
+ if (PHINode* use_phi = dyn_cast<PHINode>(UI))
+ if (use_phi->getParent() == inst->getParent())
+ return false;
+
+ return true;
+}
+
+/// GetValueForBlock - Get the value to use within the specified basic block.
+/// Available values are in Phis.
+Value *GVN::GetValueForBlock(BasicBlock *BB, Instruction* orig,
+ DenseMap<BasicBlock*, Value*> &Phis,
+ bool top_level) {
+
+ // If we have already computed this value, return the previously computed val.
+ DenseMap<BasicBlock*, Value*>::iterator V = Phis.find(BB);
+ if (V != Phis.end() && !top_level) return V->second;
+
+ // If the block is unreachable, just return undef, since this path
+ // can't actually occur at runtime.
+ if (!DT->isReachableFromEntry(BB))
+ return Phis[BB] = UndefValue::get(orig->getType());
+
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ Value *ret = GetValueForBlock(Pred, orig, Phis);
+ Phis[BB] = ret;
+ return ret;
+ }
+
+ // Get the number of predecessors of this block so we can reserve space later.
+ // If there is already a PHI in it, use the #preds from it, otherwise count.
+ // Getting it from the PHI is constant time.
+ unsigned NumPreds;
+ if (PHINode *ExistingPN = dyn_cast<PHINode>(BB->begin()))
+ NumPreds = ExistingPN->getNumIncomingValues();
+ else
+ NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+
+ // Otherwise, this block has multiple predecessors, so we need to insert a
+ // PHI node. Do so now, then get values to fill in its incoming values.
+ PHINode *PN = PHINode::Create(orig->getType(), orig->getName()+".rle",
+ BB->begin());
+ PN->reserveOperandSpace(NumPreds);
+
+ Phis.insert(std::make_pair(BB, PN));
+
+ // Fill in the incoming values for the block.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ Value* val = GetValueForBlock(*PI, orig, Phis);
+ PN->addIncoming(val, *PI);
+ }
+
+ VN.getAliasAnalysis()->copyValue(orig, PN);
+
+ // Attempt to collapse PHI nodes that are trivially redundant
+ Value* v = CollapsePhi(PN);
+ if (!v) {
+ // Cache our phi construction results
+ if (LoadInst* L = dyn_cast<LoadInst>(orig))
+ phiMap[L->getPointerOperand()].insert(PN);
+ else
+ phiMap[orig].insert(PN);
+
+ return PN;
+ }
+
+ PN->replaceAllUsesWith(v);
+ if (isa<PointerType>(v->getType()))
+ MD->invalidateCachedPointerInfo(v);
+
+ for (DenseMap<BasicBlock*, Value*>::iterator I = Phis.begin(),
+ E = Phis.end(); I != E; ++I)
+ if (I->second == PN)
+ I->second = v;
+
+ DEBUG(cerr << "GVN removed: " << *PN);
+ MD->removeInstruction(PN);
+ PN->eraseFromParent();
+ DEBUG(verifyRemoved(PN));
+
+ Phis[BB] = v;
+ return v;
+}
+
+/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block. As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
+/// map is actually a four-state map with the following values:
+/// 0) we know the block *is not* fully available.
+/// 1) we know the block *is* fully available.
+/// 2) we do not know whether the block is fully available or not, but we are
+/// currently speculating that it will be.
+/// 3) we are speculating for this block and have used that to speculate for
+/// other blocks.
+static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
+ DenseMap<BasicBlock*, char> &FullyAvailableBlocks) {
+ // Optimistically assume that the block is fully available and check to see
+ // if we already know about this block in one lookup.
+ std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV =
+ FullyAvailableBlocks.insert(std::make_pair(BB, 2));
+
+ // If the entry already existed for this block, return the precomputed value.
+ if (!IV.second) {
+ // If this is a speculative "available" value, mark it as being used for
+ // speculation of other blocks.
+ if (IV.first->second == 2)
+ IV.first->second = 3;
+ return IV.first->second != 0;
+ }
+
+ // Otherwise, see if it is fully available in all predecessors.
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+ // If this block has no predecessors, it isn't live-in here.
+ if (PI == PE)
+ goto SpeculationFailure;
+
+ for (; PI != PE; ++PI)
+ // If the value isn't fully available in one of our predecessors, then it
+ // isn't fully available in this block either. Undo our previous
+ // optimistic assumption and bail out.
+ if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks))
+ goto SpeculationFailure;
+
+ return true;
+
+// SpeculationFailure - If we get here, we found out that this is not, after
+// all, a fully-available block. We have a problem if we speculated on this and
+// used the speculation to mark other blocks as available.
+SpeculationFailure:
+ char &BBVal = FullyAvailableBlocks[BB];
+
+ // If we didn't speculate on this, just return with it set to false.
+ if (BBVal == 2) {
+ BBVal = 0;
+ return false;
+ }
+
+ // If we did speculate on this value, we could have blocks set to 1 that are
+ // incorrect. Walk the (transitive) successors of this block and mark them as
+ // 0 if set to one.
+ SmallVector<BasicBlock*, 32> BBWorklist;
+ BBWorklist.push_back(BB);
+
+ while (!BBWorklist.empty()) {
+ BasicBlock *Entry = BBWorklist.pop_back_val();
+ // Note that this sets blocks to 0 (unavailable) if they happen to not
+ // already be in FullyAvailableBlocks. This is safe.
+ char &EntryVal = FullyAvailableBlocks[Entry];
+ if (EntryVal == 0) continue; // Already unavailable.
+
+ // Mark as unavailable.
+ EntryVal = 0;
+
+ for (succ_iterator I = succ_begin(Entry), E = succ_end(Entry); I != E; ++I)
+ BBWorklist.push_back(*I);
+ }
+
+ return false;
+}
+
+/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are
+/// non-local by performing PHI construction.
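+///
+/// As an illustrative example (hypothetical IR, not from the sources), in
+///   if (c) { store i32 1, i32* %p } else { store i32 2, i32* %p }
+///   %v = load i32* %p
+/// both stores feed the load, so the load is fully redundant and can be
+/// replaced by a phi of 1 and 2 at the join point.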
+bool GVN::processNonLocalLoad(LoadInst *LI,
+ SmallVectorImpl<Instruction*> &toErase) {
+ // Find the non-local dependencies of the load.
+ SmallVector<MemoryDependenceAnalysis::NonLocalDepEntry, 64> Deps;
+ MD->getNonLocalPointerDependency(LI->getOperand(0), true, LI->getParent(),
+ Deps);
+ //DEBUG(cerr << "INVESTIGATING NONLOCAL LOAD: " << Deps.size() << *LI);
+
+ // If we had to process more than one hundred blocks to find the
+ // dependencies, this load isn't worth worrying about. Optimizing
+ // it will be too expensive.
+ if (Deps.size() > 100)
+ return false;
+
+ // If we had a phi translation failure, we'll have a single entry which is a
+ // clobber in the current block. Reject this early.
+ if (Deps.size() == 1 && Deps[0].second.isClobber())
+ return false;
+
+ // Filter out useless results (non-locals, etc). Keep track of the blocks
+ // where we have a value available in repl, also keep track of whether we see
+ // dependencies that produce an unknown value for the load (such as a call
+ // that could potentially clobber the load).
+ SmallVector<std::pair<BasicBlock*, Value*>, 16> ValuesPerBlock;
+ SmallVector<BasicBlock*, 16> UnavailableBlocks;
+
+ for (unsigned i = 0, e = Deps.size(); i != e; ++i) {
+ BasicBlock *DepBB = Deps[i].first;
+ MemDepResult DepInfo = Deps[i].second;
+
+ if (DepInfo.isClobber()) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+
+ Instruction *DepInst = DepInfo.getInst();
+
+ // Loading the allocation -> undef.
+ if (isa<AllocationInst>(DepInst)) {
+ ValuesPerBlock.push_back(std::make_pair(DepBB,
+ UndefValue::get(LI->getType())));
+ continue;
+ }
+
+ if (StoreInst* S = dyn_cast<StoreInst>(DepInst)) {
+ // Reject loads and stores that are to the same address but are of
+ // different types.
+ // NOTE: 403.gcc does have this case (e.g. in readonly_fields_p) because
+ // of bitfield access, it would be interesting to optimize for it at some
+ // point.
+ if (S->getOperand(0)->getType() != LI->getType()) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+
+ ValuesPerBlock.push_back(std::make_pair(DepBB, S->getOperand(0)));
+
+ } else if (LoadInst* LD = dyn_cast<LoadInst>(DepInst)) {
+ if (LD->getType() != LI->getType()) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+ ValuesPerBlock.push_back(std::make_pair(DepBB, LD));
+ } else {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+ }
+
+ // If we have no predecessors that produce a known value for this load, exit
+ // early.
+ if (ValuesPerBlock.empty()) return false;
+
+ // If all of the instructions we depend on produce a known value for this
+ // load, then it is fully redundant and we can use PHI insertion to compute
+ // its value. Insert PHIs and remove the fully redundant value now.
+ if (UnavailableBlocks.empty()) {
+ // Use cached PHI construction information from previous runs
+ SmallPtrSet<Instruction*, 4> &p = phiMap[LI->getPointerOperand()];
+ // FIXME: What does phiMap do? Are we positive it isn't getting invalidated?
+ for (SmallPtrSet<Instruction*, 4>::iterator I = p.begin(), E = p.end();
+ I != E; ++I) {
+ if ((*I)->getParent() == LI->getParent()) {
+ DEBUG(cerr << "GVN REMOVING NONLOCAL LOAD #1: " << *LI);
+ LI->replaceAllUsesWith(*I);
+ if (isa<PointerType>((*I)->getType()))
+ MD->invalidateCachedPointerInfo(*I);
+ toErase.push_back(LI);
+ NumGVNLoad++;
+ return true;
+ }
+
+ ValuesPerBlock.push_back(std::make_pair((*I)->getParent(), *I));
+ }
+
+ DEBUG(cerr << "GVN REMOVING NONLOCAL LOAD: " << *LI);
+
+ DenseMap<BasicBlock*, Value*> BlockReplValues;
+ BlockReplValues.insert(ValuesPerBlock.begin(), ValuesPerBlock.end());
+ // Perform PHI construction.
+ Value* v = GetValueForBlock(LI->getParent(), LI, BlockReplValues, true);
+ LI->replaceAllUsesWith(v);
+
+ if (isa<PHINode>(v))
+ v->takeName(LI);
+ if (isa<PointerType>(v->getType()))
+ MD->invalidateCachedPointerInfo(v);
+ toErase.push_back(LI);
+ NumGVNLoad++;
+ return true;
+ }
+
+ if (!EnablePRE || !EnableLoadPRE)
+ return false;
+
+ // Okay, we have *some* definitions of the value. This means that the value
+ // is available in some of our (transitive) predecessors. Let's think about
+ // doing PRE of this load. This will involve inserting a new load into the
+ // predecessor when it's not available. We could do this in general, but
+ // prefer to not increase code size. As such, we only do this when we know
+ // that we only have to insert *one* load (which means we're basically moving
+ // the load, not inserting a new one).
+
+ SmallPtrSet<BasicBlock *, 4> Blockers;
+ for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
+ Blockers.insert(UnavailableBlocks[i]);
+
+ // Let's find the first basic block with more than one predecessor. Walk
+ // backwards through predecessors if needed.
+ BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *TmpBB = LoadBB;
+
+ bool isSinglePred = false;
+ while (TmpBB->getSinglePredecessor()) {
+ isSinglePred = true;
+ TmpBB = TmpBB->getSinglePredecessor();
+ if (!TmpBB) // If we haven't found any, bail now.
+ return false;
+ if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+ return false;
+ if (Blockers.count(TmpBB))
+ return false;
+ }
+
+ assert(TmpBB);
+ LoadBB = TmpBB;
+
+ // If we have a repl set with LI itself in it, this means we have a loop where
+ // at least one of the values is LI. Since this means that we won't be able
+ // to eliminate LI even if we insert uses in the other predecessors, we will
+ // end up increasing code size. Reject this by scanning for LI.
+ for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+ if (ValuesPerBlock[i].second == LI)
+ return false;
+
+ if (isSinglePred) {
+ bool isHot = false;
+ for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+ if (Instruction *I = dyn_cast<Instruction>(ValuesPerBlock[i].second))
+ // "Hot" Instruction is in some loop (because it dominates its dep.
+ // instruction).
+ if (DT->dominates(LI, I)) {
+ isHot = true;
+ break;
+ }
+
+ // We are interested only in "hot" instructions. We don't want to do any
+ // mis-optimizations here.
+ if (!isHot)
+ return false;
+ }
+
+ // Okay, we have some hope :). Check to see if the loaded value is fully
+ // available in all but one predecessor.
+ // FIXME: If we could restructure the CFG, we could make a common pred with
+ // all the preds that don't have an available LI and insert a new load into
+ // that one block.
+ BasicBlock *UnavailablePred = 0;
+
+ DenseMap<BasicBlock*, char> FullyAvailableBlocks;
+ for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+ FullyAvailableBlocks[ValuesPerBlock[i].first] = true;
+ for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
+ FullyAvailableBlocks[UnavailableBlocks[i]] = false;
+
+ for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
+ PI != E; ++PI) {
+ if (IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks))
+ continue;
+
+ // If this load is not available in multiple predecessors, reject it.
+ if (UnavailablePred && UnavailablePred != *PI)
+ return false;
+ UnavailablePred = *PI;
+ }
+
+ assert(UnavailablePred != 0 &&
+ "Fully available value should be eliminated above!");
+
+ // If the loaded pointer is a PHI node defined in this block, do PHI
+ // translation to get its value in the predecessor.
+ Value *LoadPtr = LI->getOperand(0)->DoPHITranslation(LoadBB, UnavailablePred);
+
+ // Make sure the value is live in the predecessor. If it was defined by a
+ // non-PHI instruction in this block, we don't know how to recompute it above.
+ if (Instruction *LPInst = dyn_cast<Instruction>(LoadPtr))
+ if (!DT->dominates(LPInst->getParent(), UnavailablePred)) {
+ DEBUG(cerr << "COULDN'T PRE LOAD BECAUSE PTR IS UNAVAILABLE IN PRED: "
+ << *LPInst << *LI << "\n");
+ return false;
+ }
+
+ // We don't currently handle critical edges :(
+ if (UnavailablePred->getTerminator()->getNumSuccessors() != 1) {
+ DEBUG(cerr << "COULD NOT PRE LOAD BECAUSE OF CRITICAL EDGE '"
+ << UnavailablePred->getName() << "': " << *LI);
+ return false;
+ }
+
+ // Okay, we can eliminate this load by inserting a reload in the predecessor
+ // and using PHI construction to get the value in the other predecessors, do
+ // it.
+ DEBUG(cerr << "GVN REMOVING PRE LOAD: " << *LI);
+
+ Value *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
+ LI->getAlignment(),
+ UnavailablePred->getTerminator());
+
+ SmallPtrSet<Instruction*, 4> &p = phiMap[LI->getPointerOperand()];
+ for (SmallPtrSet<Instruction*, 4>::iterator I = p.begin(), E = p.end();
+ I != E; ++I)
+ ValuesPerBlock.push_back(std::make_pair((*I)->getParent(), *I));
+
+ DenseMap<BasicBlock*, Value*> BlockReplValues;
+ BlockReplValues.insert(ValuesPerBlock.begin(), ValuesPerBlock.end());
+ BlockReplValues[UnavailablePred] = NewLoad;
+
+ // Perform PHI construction.
+ Value* v = GetValueForBlock(LI->getParent(), LI, BlockReplValues, true);
+ LI->replaceAllUsesWith(v);
+ if (isa<PHINode>(v))
+ v->takeName(LI);
+ if (isa<PointerType>(v->getType()))
+ MD->invalidateCachedPointerInfo(v);
+ toErase.push_back(LI);
+ NumPRELoad++;
+ return true;
+}
+
+/// processLoad - Attempt to eliminate a load, first by eliminating it
+/// locally, and then attempting non-local elimination if that fails.
+bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
+ if (L->isVolatile())
+ return false;
+
+ Value* pointer = L->getPointerOperand();
+
+ // ... to a pointer that has been loaded from before...
+ MemDepResult dep = MD->getDependency(L);
+
+ // If the value isn't available, don't do anything!
+ if (dep.isClobber()) {
+ DEBUG(
+ // fast print dep, using operator<< on instruction would be too slow
+ DOUT << "GVN: load ";
+ WriteAsOperand(*DOUT.stream(), L);
+ Instruction *I = dep.getInst();
+ DOUT << " is clobbered by " << *I;
+ );
+ return false;
+ }
+
+ // If it is defined in another block, try harder.
+ if (dep.isNonLocal())
+ return processNonLocalLoad(L, toErase);
+
+ Instruction *DepInst = dep.getInst();
+ if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Only forward substitute stores to loads of the same type.
+ // FIXME: Could do better!
+ if (DepSI->getPointerOperand()->getType() != pointer->getType())
+ return false;
+
+ // Remove it!
+ L->replaceAllUsesWith(DepSI->getOperand(0));
+ if (isa<PointerType>(DepSI->getOperand(0)->getType()))
+ MD->invalidateCachedPointerInfo(DepSI->getOperand(0));
+ toErase.push_back(L);
+ NumGVNLoad++;
+ return true;
+ }
+
+ if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Only forward substitute stores to loads of the same type.
+ // FIXME: Could do better! load i32 -> load i8 -> truncate on little endian.
+ if (DepLI->getType() != L->getType())
+ return false;
+
+ // Remove it!
+ L->replaceAllUsesWith(DepLI);
+ if (isa<PointerType>(DepLI->getType()))
+ MD->invalidateCachedPointerInfo(DepLI);
+ toErase.push_back(L);
+ NumGVNLoad++;
+ return true;
+ }
+
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading from a fresh allocation with no
+ // intervening stores, for example.
+ if (isa<AllocationInst>(DepInst)) {
+ L->replaceAllUsesWith(UndefValue::get(L->getType()));
+ toErase.push_back(L);
+ NumGVNLoad++;
+ return true;
+ }
+
+ return false;
+}
+
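+/// lookupNumber - Look up the leader for a value number in BB's local scope,
+/// then walk the parent chain (the immediate dominators) until a binding is
+/// found or the chain is exhausted.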
+Value* GVN::lookupNumber(BasicBlock* BB, uint32_t num) {
+ DenseMap<BasicBlock*, ValueNumberScope*>::iterator I = localAvail.find(BB);
+ if (I == localAvail.end())
+ return 0;
+
+ ValueNumberScope* locals = I->second;
+
+ while (locals) {
+ DenseMap<uint32_t, Value*>::iterator I = locals->table.find(num);
+ if (I != locals->table.end())
+ return I->second;
+ else
+ locals = locals->parent;
+ }
+
+ return 0;
+}
+
+/// AttemptRedundancyElimination - If the "fast path" of redundancy elimination
+/// by inheritance from the dominator fails, see if we can perform phi
+/// construction to eliminate the redundancy.
+Value* GVN::AttemptRedundancyElimination(Instruction* orig, unsigned valno) {
+ BasicBlock* BaseBlock = orig->getParent();
+
+ SmallPtrSet<BasicBlock*, 4> Visited;
+ SmallVector<BasicBlock*, 8> Stack;
+ Stack.push_back(BaseBlock);
+
+ DenseMap<BasicBlock*, Value*> Results;
+
+ // Walk backwards through our predecessors, looking for instances of the
+ // value number we're looking for. Instances are recorded in the Results
+ // map, which is then used to perform phi construction.
+ while (!Stack.empty()) {
+ BasicBlock* Current = Stack.back();
+ Stack.pop_back();
+
+ // If we've walked all the way to a proper dominator, then give up. Cases
+ // where the instance is in the dominator will have been caught by the fast
+ // path, and any cases that require phi construction further than this are
+ // probably not worth it anyways. Note that this is a SIGNIFICANT compile
+ // time improvement.
+ if (DT->properlyDominates(Current, orig->getParent())) return 0;
+
+ DenseMap<BasicBlock*, ValueNumberScope*>::iterator LA =
+ localAvail.find(Current);
+ if (LA == localAvail.end()) return 0;
+ DenseMap<uint32_t, Value*>::iterator V = LA->second->table.find(valno);
+
+ if (V != LA->second->table.end()) {
+ // Found an instance, record it.
+ Results.insert(std::make_pair(Current, V->second));
+ continue;
+ }
+
+ // If we reach the beginning of the function, then give up.
+ if (pred_begin(Current) == pred_end(Current))
+ return 0;
+
+ for (pred_iterator PI = pred_begin(Current), PE = pred_end(Current);
+ PI != PE; ++PI)
+ if (Visited.insert(*PI))
+ Stack.push_back(*PI);
+ }
+
+ // If we didn't find instances, give up. Otherwise, perform phi construction.
+ if (Results.size() == 0)
+ return 0;
+ else
+ return GetValueForBlock(BaseBlock, orig, Results, true);
+}
+
+/// processInstruction - When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+bool GVN::processInstruction(Instruction *I,
+ SmallVectorImpl<Instruction*> &toErase) {
+ if (LoadInst* L = dyn_cast<LoadInst>(I)) {
+ bool changed = processLoad(L, toErase);
+
+ if (!changed) {
+ unsigned num = VN.lookup_or_add(L);
+ localAvail[I->getParent()]->table.insert(std::make_pair(num, L));
+ }
+
+ return changed;
+ }
+
+ uint32_t nextNum = VN.getNextUnusedValueNumber();
+ unsigned num = VN.lookup_or_add(I);
+
+ if (BranchInst* BI = dyn_cast<BranchInst>(I)) {
+ localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+
+ if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ return false;
+
+ Value* branchCond = BI->getCondition();
+ uint32_t condVN = VN.lookup_or_add(branchCond);
+
+ BasicBlock* trueSucc = BI->getSuccessor(0);
+ BasicBlock* falseSucc = BI->getSuccessor(1);
+
+ if (trueSucc->getSinglePredecessor())
+ localAvail[trueSucc]->table[condVN] = ConstantInt::getTrue();
+ if (falseSucc->getSinglePredecessor())
+ localAvail[falseSucc]->table[condVN] = ConstantInt::getFalse();
+
+ return false;
+
+ // Allocations are always uniquely numbered, so we can save time and memory
+ // by fast-failing them.
+ } else if (isa<AllocationInst>(I) || isa<TerminatorInst>(I)) {
+ localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+ return false;
+ }
+
+ // Collapse PHI nodes
+ if (PHINode* p = dyn_cast<PHINode>(I)) {
+ Value* constVal = CollapsePhi(p);
+
+ if (constVal) {
+ for (PhiMapType::iterator PI = phiMap.begin(), PE = phiMap.end();
+ PI != PE; ++PI)
+ PI->second.erase(p);
+
+ p->replaceAllUsesWith(constVal);
+ if (isa<PointerType>(constVal->getType()))
+ MD->invalidateCachedPointerInfo(constVal);
+ VN.erase(p);
+
+ toErase.push_back(p);
+ } else {
+ localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+ }
+
+ // If the number we were assigned was a brand new VN, then we don't
+ // need to do a lookup to see if the number already exists
+ // somewhere in the domtree: it can't!
+ } else if (num == nextNum) {
+ localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+
+ // Perform fast-path value-number based elimination of values inherited from
+ // dominators.
+ } else if (Value* repl = lookupNumber(I->getParent(), num)) {
+ // Remove it!
+ VN.erase(I);
+ I->replaceAllUsesWith(repl);
+ if (isa<PointerType>(repl->getType()))
+ MD->invalidateCachedPointerInfo(repl);
+ toErase.push_back(I);
+ return true;
+
+#if 0
+ // Perform slow-path value-number based elimination with phi construction.
+ } else if (Value* repl = AttemptRedundancyElimination(I, num)) {
+ // Remove it!
+ VN.erase(I);
+ I->replaceAllUsesWith(repl);
+ if (isa<PointerType>(repl->getType()))
+ MD->invalidateCachedPointerInfo(repl);
+ toErase.push_back(I);
+ return true;
+#endif
+ } else {
+ localAvail[I->getParent()]->table.insert(std::make_pair(num, I));
+ }
+
+ return false;
+}
+
+/// runOnFunction - This is the main transformation entry point for a function.
+bool GVN::runOnFunction(Function& F) {
+ MD = &getAnalysis<MemoryDependenceAnalysis>();
+ DT = &getAnalysis<DominatorTree>();
+ VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
+ VN.setMemDep(MD);
+ VN.setDomTree(DT);
+
+ bool changed = false;
+ bool shouldContinue = true;
+
+ // Merge unconditional branches, allowing PRE to catch more
+ // optimization opportunities.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
+ BasicBlock* BB = FI;
+ ++FI;
+ bool removedBlock = MergeBlockIntoPredecessor(BB, this);
+ if (removedBlock) NumGVNBlocks++;
+
+ changed |= removedBlock;
+ }
+
+ unsigned Iteration = 0;
+
+ while (shouldContinue) {
+ DEBUG(cerr << "GVN iteration: " << Iteration << "\n");
+ shouldContinue = iterateOnFunction(F);
+ changed |= shouldContinue;
+ ++Iteration;
+ }
+
+ if (EnablePRE) {
+ bool PREChanged = true;
+ while (PREChanged) {
+ PREChanged = performPRE(F);
+ changed |= PREChanged;
+ }
+ }
+ // FIXME: Should perform GVN again after PRE does something. PRE can move
+ // computations into blocks where they become fully redundant. Note that
+ // we can't do this until PRE's critical edge splitting updates memdep.
+ // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+ cleanupGlobalSets();
+
+ return changed;
+}
+
+
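+/// processBlock - Value-number each instruction in BB in order, eagerly
+/// erasing any instructions that processInstruction reports as redundant.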
+bool GVN::processBlock(BasicBlock* BB) {
+ // FIXME: Kill off toErase by doing erasing eagerly in a helper function (and
+ // incrementing BI before processing an instruction).
+ SmallVector<Instruction*, 8> toErase;
+ bool changed_function = false;
+
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE;) {
+ changed_function |= processInstruction(BI, toErase);
+ if (toErase.empty()) {
+ ++BI;
+ continue;
+ }
+
+ // If we need some instructions deleted, do it now.
+ NumGVNInstr += toErase.size();
+
+ // Avoid iterator invalidation.
+ bool AtStart = BI == BB->begin();
+ if (!AtStart)
+ --BI;
+
+ for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
+ E = toErase.end(); I != E; ++I) {
+ DEBUG(cerr << "GVN removed: " << **I);
+ MD->removeInstruction(*I);
+ (*I)->eraseFromParent();
+ DEBUG(verifyRemoved(*I));
+ }
+ toErase.clear();
+
+ if (AtStart)
+ BI = BB->begin();
+ else
+ ++BI;
+ }
+
+ return changed_function;
+}
+
+/// performPRE - Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
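+///
+/// Illustrative diamond (hypothetical IR, not from the sources): if %a + %b
+/// is available in one predecessor of the join block but not in the other,
+/// clone the add into the predecessor that lacks it and replace the add at
+/// the join point with a phi of the two predecessor values.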
+bool GVN::performPRE(Function& F) {
+ bool Changed = false;
+ SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
+ DenseMap<BasicBlock*, Value*> predMap;
+ for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+ DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+ BasicBlock* CurrentBlock = *DI;
+
+ // Nothing to PRE in the entry block.
+ if (CurrentBlock == &F.getEntryBlock()) continue;
+
+ for (BasicBlock::iterator BI = CurrentBlock->begin(),
+ BE = CurrentBlock->end(); BI != BE; ) {
+ Instruction *CurInst = BI++;
+
+ if (isa<AllocationInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+ isa<PHINode>(CurInst) || (CurInst->getType() == Type::VoidTy) ||
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ isa<DbgInfoIntrinsic>(CurInst))
+ continue;
+
+ uint32_t valno = VN.lookup(CurInst);
+
+ // Look for the predecessors for PRE opportunities. We're
+ // only trying to solve the basic diamond case, where
+ // a value is computed in the successor and one predecessor,
+ // but not the other. We also explicitly disallow cases
+ // where the successor is its own predecessor, because they're
+ // more complicated to get right.
+ unsigned numWith = 0;
+ unsigned numWithout = 0;
+ BasicBlock* PREPred = 0;
+ predMap.clear();
+
+ for (pred_iterator PI = pred_begin(CurrentBlock),
+ PE = pred_end(CurrentBlock); PI != PE; ++PI) {
+ // We're not interested in PRE where the block is its
+ // own predecessor, or in blocks with predecessors
+ // that are not reachable.
+ if (*PI == CurrentBlock) {
+ numWithout = 2;
+ break;
+ } else if (!localAvail.count(*PI)) {
+ numWithout = 2;
+ break;
+ }
+
+ DenseMap<uint32_t, Value*>::iterator predV =
+ localAvail[*PI]->table.find(valno);
+ if (predV == localAvail[*PI]->table.end()) {
+ PREPred = *PI;
+ numWithout++;
+ } else if (predV->second == CurInst) {
+ numWithout = 2;
+ } else {
+ predMap[*PI] = predV->second;
+ numWith++;
+ }
+ }
+
+ // Don't do PRE when it might increase code size, i.e. when
+ // we would need to insert instructions in more than one pred.
+ if (numWithout != 1 || numWith == 0)
+ continue;
+
+ // We can't do PRE safely on a critical edge, so instead we schedule
+ // the edge to be split and perform the PRE the next time we iterate
+ // on the function.
+ unsigned succNum = 0;
+ for (unsigned i = 0, e = PREPred->getTerminator()->getNumSuccessors();
+ i != e; ++i)
+ if (PREPred->getTerminator()->getSuccessor(i) == CurrentBlock) {
+ succNum = i;
+ break;
+ }
+
+ if (isCriticalEdge(PREPred->getTerminator(), succNum)) {
+ toSplit.push_back(std::make_pair(PREPred->getTerminator(), succNum));
+ continue;
+ }
+
+ // Instantiate the expression in the predecessor that lacked it.
+ // Because we are going top-down through the block, all value numbers
+ // will be available in the predecessor by the time we need them. Any
+ // that weren't originally present will have been instantiated earlier
+ // in this loop.
+ Instruction* PREInstr = CurInst->clone();
+ bool success = true;
+ for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) {
+ Value *Op = PREInstr->getOperand(i);
+ if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+ continue;
+
+ if (Value *V = lookupNumber(PREPred, VN.lookup(Op))) {
+ PREInstr->setOperand(i, V);
+ } else {
+ success = false;
+ break;
+ }
+ }
+
+ // Fail out if we encounter an operand that is not available in
+ // the PRE predecessor. This is typically because of loads which
+ // are not value numbered precisely.
+ if (!success) {
+ delete PREInstr;
+ DEBUG(verifyRemoved(PREInstr));
+ continue;
+ }
+
+ PREInstr->insertBefore(PREPred->getTerminator());
+ PREInstr->setName(CurInst->getName() + ".pre");
+ predMap[PREPred] = PREInstr;
+ VN.add(PREInstr, valno);
+ NumGVNPRE++;
+
+ // Update the availability map to include the new instruction.
+ localAvail[PREPred]->table.insert(std::make_pair(valno, PREInstr));
+
+ // Create a PHI to make the value available in this block.
+ PHINode* Phi = PHINode::Create(CurInst->getType(),
+ CurInst->getName() + ".pre-phi",
+ CurrentBlock->begin());
+ for (pred_iterator PI = pred_begin(CurrentBlock),
+ PE = pred_end(CurrentBlock); PI != PE; ++PI)
+ Phi->addIncoming(predMap[*PI], *PI);
+
+ VN.add(Phi, valno);
+ localAvail[CurrentBlock]->table[valno] = Phi;
+
+ CurInst->replaceAllUsesWith(Phi);
+ if (isa<PointerType>(Phi->getType()))
+ MD->invalidateCachedPointerInfo(Phi);
+ VN.erase(CurInst);
+
+ DEBUG(cerr << "GVN PRE removed: " << *CurInst);
+      MD->removeInstruction(CurInst);
+      DEBUG(verifyRemoved(CurInst));
+      CurInst->eraseFromParent();
+ Changed = true;
+ }
+ }
+
+ for (SmallVector<std::pair<TerminatorInst*, unsigned>, 4>::iterator
+ I = toSplit.begin(), E = toSplit.end(); I != E; ++I)
+ SplitCriticalEdge(I->first, I->second, this);
+
+ return Changed || toSplit.size();
+}
+
+/// iterateOnFunction - Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+ cleanupGlobalSets();
+
+ for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
+ DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
+ if (DI->getIDom())
+ localAvail[DI->getBlock()] =
+ new ValueNumberScope(localAvail[DI->getIDom()->getBlock()]);
+ else
+ localAvail[DI->getBlock()] = new ValueNumberScope(0);
+ }
+
+ // Top-down walk of the dominator tree
+ bool changed = false;
+#if 0
+ // Needed for value numbering with phi construction to work.
+ ReversePostOrderTraversal<Function*> RPOT(&F);
+ for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(),
+ RE = RPOT.end(); RI != RE; ++RI)
+ changed |= processBlock(*RI);
+#else
+ for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
+ DE = df_end(DT->getRootNode()); DI != DE; ++DI)
+ changed |= processBlock(DI->getBlock());
+#endif
+
+ return changed;
+}
+
+void GVN::cleanupGlobalSets() {
+ VN.clear();
+ phiMap.clear();
+
+ for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator
+ I = localAvail.begin(), E = localAvail.end(); I != E; ++I)
+ delete I->second;
+ localAvail.clear();
+}
+
+/// verifyRemoved - Verify that the specified instruction does not occur in our
+/// internal data structures.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+ VN.verifyRemoved(Inst);
+
+ // Walk through the PHI map to make sure the instruction isn't hiding in there
+ // somewhere.
+ for (PhiMapType::iterator
+ I = phiMap.begin(), E = phiMap.end(); I != E; ++I) {
+ assert(I->first != Inst && "Inst is still a key in PHI map!");
+
+ for (SmallPtrSet<Instruction*, 4>::iterator
+ II = I->second.begin(), IE = I->second.end(); II != IE; ++II) {
+ assert(*II != Inst && "Inst is still a value in PHI map!");
+ }
+ }
+
+ // Walk through the value number scope to make sure the instruction isn't
+ // ferreted away in it.
+ for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator
+ I = localAvail.begin(), E = localAvail.end(); I != E; ++I) {
+ const ValueNumberScope *VNS = I->second;
+
+ while (VNS) {
+ for (DenseMap<uint32_t, Value*>::iterator
+ II = VNS->table.begin(), IE = VNS->table.end(); II != IE; ++II) {
+ assert(II->second != Inst && "Inst still in value numbering scope!");
+ }
+
+ VNS = VNS->parent;
+ }
+ }
+}
diff --git a/lib/Transforms/Scalar/GVNPRE.cpp b/lib/Transforms/Scalar/GVNPRE.cpp
new file mode 100644
index 0000000..e3b0937
--- /dev/null
+++ b/lib/Transforms/Scalar/GVNPRE.cpp
@@ -0,0 +1,1885 @@
+//===- GVNPRE.cpp - Eliminate redundant values and expressions ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a hybrid of global value numbering and partial redundancy
+// elimination, known as GVN-PRE. It performs partial redundancy elimination on
+// values, rather than lexical expressions, allowing a more comprehensive view
+// the optimization. It replaces redundant values with uses of earlier
+// occurences of the same value. While this is beneficial in that it eliminates
+// unneeded computation, it also increases register pressure by creating large
+// live ranges, and should be used with caution on platforms that are very
+// sensitive to register pressure.
+//
+// Note that this pass does the value numbering itself, it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gvnpre"
+#include "llvm/Value.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <deque>
+#include <map>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// ValueTable Class
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Expression - A canonicalized form of an instruction, used by ValueTable
+/// below as an efficient mechanism to determine the expression-wise
+/// equivalence of two values.
+
+struct Expression {
+ enum ExpressionOpcode { ADD, SUB, MUL, UDIV, SDIV, FDIV, UREM, SREM,
+ FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ,
+ ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE,
+ ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ,
+ FCMPOGT, FCMPOGE, FCMPOLT, FCMPOLE, FCMPONE,
+ FCMPORD, FCMPUNO, FCMPUEQ, FCMPUGT, FCMPUGE,
+ FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT,
+ SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI,
+ FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT,
+ PTRTOINT, INTTOPTR, BITCAST, GEP, EMPTY,
+ TOMBSTONE };
+
+ ExpressionOpcode opcode;
+ const Type* type;
+ uint32_t firstVN;
+ uint32_t secondVN;
+ uint32_t thirdVN;
+ SmallVector<uint32_t, 4> varargs;
+
+ Expression() { }
+ explicit Expression(ExpressionOpcode o) : opcode(o) { }
+
+ bool operator==(const Expression &other) const {
+ if (opcode != other.opcode)
+ return false;
+ else if (opcode == EMPTY || opcode == TOMBSTONE)
+ return true;
+ else if (type != other.type)
+ return false;
+ else if (firstVN != other.firstVN)
+ return false;
+ else if (secondVN != other.secondVN)
+ return false;
+ else if (thirdVN != other.thirdVN)
+ return false;
+ else {
+ if (varargs.size() != other.varargs.size())
+ return false;
+
+ for (size_t i = 0; i < varargs.size(); ++i)
+ if (varargs[i] != other.varargs[i])
+ return false;
+
+ return true;
+ }
+ }
+
+  bool operator!=(const Expression &other) const {
+    return !(*this == other);
+  }
+};
+
+}
+
+namespace {
+ class VISIBILITY_HIDDEN ValueTable {
+ private:
+ DenseMap<Value*, uint32_t> valueNumbering;
+ DenseMap<Expression, uint32_t> expressionNumbering;
+
+ uint32_t nextValueNumber;
+
+ Expression::ExpressionOpcode getOpcode(BinaryOperator* BO);
+ Expression::ExpressionOpcode getOpcode(CmpInst* C);
+ Expression::ExpressionOpcode getOpcode(CastInst* C);
+ Expression create_expression(BinaryOperator* BO);
+ Expression create_expression(CmpInst* C);
+ Expression create_expression(ShuffleVectorInst* V);
+ Expression create_expression(ExtractElementInst* C);
+ Expression create_expression(InsertElementInst* V);
+ Expression create_expression(SelectInst* V);
+ Expression create_expression(CastInst* C);
+ Expression create_expression(GetElementPtrInst* G);
+ public:
+ ValueTable() { nextValueNumber = 1; }
+ uint32_t lookup_or_add(Value* V);
+ uint32_t lookup(Value* V) const;
+ void add(Value* V, uint32_t num);
+ void clear();
+ void erase(Value* v);
+ unsigned size();
+ };
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<Expression> {
+ static inline Expression getEmptyKey() {
+ return Expression(Expression::EMPTY);
+ }
+
+ static inline Expression getTombstoneKey() {
+ return Expression(Expression::TOMBSTONE);
+ }
+
+ static unsigned getHashValue(const Expression e) {
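+    // A simple polynomial rolling hash (base 37). For example (hypothetical
+    // value numbers), a binary op over value numbers 3 and 5 hashes to
+    // roughly ((opcode * 37 + 3) * 37 + 5) * 37 + 0, further folded with the
+    // type pointer bits and any varargs below.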
+ unsigned hash = e.opcode;
+
+ hash = e.firstVN + hash * 37;
+ hash = e.secondVN + hash * 37;
+ hash = e.thirdVN + hash * 37;
+
+ hash = ((unsigned)((uintptr_t)e.type >> 4) ^
+ (unsigned)((uintptr_t)e.type >> 9)) +
+ hash * 37;
+
+ for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(),
+ E = e.varargs.end(); I != E; ++I)
+ hash = *I + hash * 37;
+
+ return hash;
+ }
+ static bool isEqual(const Expression &LHS, const Expression &RHS) {
+ return LHS == RHS;
+ }
+ static bool isPod() { return true; }
+};
+}
+
+//===----------------------------------------------------------------------===//
+// ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+Expression::ExpressionOpcode
+ ValueTable::getOpcode(BinaryOperator* BO) {
+ switch(BO->getOpcode()) {
+ case Instruction::Add:
+ return Expression::ADD;
+ case Instruction::Sub:
+ return Expression::SUB;
+ case Instruction::Mul:
+ return Expression::MUL;
+ case Instruction::UDiv:
+ return Expression::UDIV;
+ case Instruction::SDiv:
+ return Expression::SDIV;
+ case Instruction::FDiv:
+ return Expression::FDIV;
+ case Instruction::URem:
+ return Expression::UREM;
+ case Instruction::SRem:
+ return Expression::SREM;
+ case Instruction::FRem:
+ return Expression::FREM;
+ case Instruction::Shl:
+ return Expression::SHL;
+ case Instruction::LShr:
+ return Expression::LSHR;
+ case Instruction::AShr:
+ return Expression::ASHR;
+ case Instruction::And:
+ return Expression::AND;
+ case Instruction::Or:
+ return Expression::OR;
+ case Instruction::Xor:
+ return Expression::XOR;
+
+ // THIS SHOULD NEVER HAPPEN
+ default:
+ assert(0 && "Binary operator with unknown opcode?");
+ return Expression::ADD;
+ }
+}
+
+Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) {
+ if (C->getOpcode() == Instruction::ICmp) {
+ switch (C->getPredicate()) {
+ case ICmpInst::ICMP_EQ:
+ return Expression::ICMPEQ;
+ case ICmpInst::ICMP_NE:
+ return Expression::ICMPNE;
+ case ICmpInst::ICMP_UGT:
+ return Expression::ICMPUGT;
+ case ICmpInst::ICMP_UGE:
+ return Expression::ICMPUGE;
+ case ICmpInst::ICMP_ULT:
+ return Expression::ICMPULT;
+ case ICmpInst::ICMP_ULE:
+ return Expression::ICMPULE;
+ case ICmpInst::ICMP_SGT:
+ return Expression::ICMPSGT;
+ case ICmpInst::ICMP_SGE:
+ return Expression::ICMPSGE;
+ case ICmpInst::ICMP_SLT:
+ return Expression::ICMPSLT;
+ case ICmpInst::ICMP_SLE:
+ return Expression::ICMPSLE;
+
+ // THIS SHOULD NEVER HAPPEN
+ default:
+ assert(0 && "Comparison with unknown predicate?");
+ return Expression::ICMPEQ;
+ }
+ } else {
+ switch (C->getPredicate()) {
+ case FCmpInst::FCMP_OEQ:
+ return Expression::FCMPOEQ;
+ case FCmpInst::FCMP_OGT:
+ return Expression::FCMPOGT;
+ case FCmpInst::FCMP_OGE:
+ return Expression::FCMPOGE;
+ case FCmpInst::FCMP_OLT:
+ return Expression::FCMPOLT;
+ case FCmpInst::FCMP_OLE:
+ return Expression::FCMPOLE;
+ case FCmpInst::FCMP_ONE:
+ return Expression::FCMPONE;
+ case FCmpInst::FCMP_ORD:
+ return Expression::FCMPORD;
+ case FCmpInst::FCMP_UNO:
+ return Expression::FCMPUNO;
+ case FCmpInst::FCMP_UEQ:
+ return Expression::FCMPUEQ;
+ case FCmpInst::FCMP_UGT:
+ return Expression::FCMPUGT;
+ case FCmpInst::FCMP_UGE:
+ return Expression::FCMPUGE;
+ case FCmpInst::FCMP_ULT:
+ return Expression::FCMPULT;
+ case FCmpInst::FCMP_ULE:
+ return Expression::FCMPULE;
+ case FCmpInst::FCMP_UNE:
+ return Expression::FCMPUNE;
+
+ // THIS SHOULD NEVER HAPPEN
+ default:
+ assert(0 && "Comparison with unknown predicate?");
+ return Expression::FCMPOEQ;
+ }
+ }
+}
+
+Expression::ExpressionOpcode
+ ValueTable::getOpcode(CastInst* C) {
+ switch(C->getOpcode()) {
+ case Instruction::Trunc:
+ return Expression::TRUNC;
+ case Instruction::ZExt:
+ return Expression::ZEXT;
+ case Instruction::SExt:
+ return Expression::SEXT;
+ case Instruction::FPToUI:
+ return Expression::FPTOUI;
+ case Instruction::FPToSI:
+ return Expression::FPTOSI;
+ case Instruction::UIToFP:
+ return Expression::UITOFP;
+ case Instruction::SIToFP:
+ return Expression::SITOFP;
+ case Instruction::FPTrunc:
+ return Expression::FPTRUNC;
+ case Instruction::FPExt:
+ return Expression::FPEXT;
+ case Instruction::PtrToInt:
+ return Expression::PTRTOINT;
+ case Instruction::IntToPtr:
+ return Expression::INTTOPTR;
+ case Instruction::BitCast:
+ return Expression::BITCAST;
+
+ // THIS SHOULD NEVER HAPPEN
+ default:
+ assert(0 && "Cast operator with unknown opcode?");
+ return Expression::BITCAST;
+ }
+}
+
+Expression ValueTable::create_expression(BinaryOperator* BO) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(BO->getOperand(0));
+ e.secondVN = lookup_or_add(BO->getOperand(1));
+ e.thirdVN = 0;
+ e.type = BO->getType();
+ e.opcode = getOpcode(BO);
+
+ return e;
+}
+
+Expression ValueTable::create_expression(CmpInst* C) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(C->getOperand(0));
+ e.secondVN = lookup_or_add(C->getOperand(1));
+ e.thirdVN = 0;
+ e.type = C->getType();
+ e.opcode = getOpcode(C);
+
+ return e;
+}
+
+Expression ValueTable::create_expression(CastInst* C) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(C->getOperand(0));
+ e.secondVN = 0;
+ e.thirdVN = 0;
+ e.type = C->getType();
+ e.opcode = getOpcode(C);
+
+ return e;
+}
+
+Expression ValueTable::create_expression(ShuffleVectorInst* S) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(S->getOperand(0));
+ e.secondVN = lookup_or_add(S->getOperand(1));
+ e.thirdVN = lookup_or_add(S->getOperand(2));
+ e.type = S->getType();
+ e.opcode = Expression::SHUFFLE;
+
+ return e;
+}
+
+Expression ValueTable::create_expression(ExtractElementInst* E) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(E->getOperand(0));
+ e.secondVN = lookup_or_add(E->getOperand(1));
+ e.thirdVN = 0;
+ e.type = E->getType();
+ e.opcode = Expression::EXTRACT;
+
+ return e;
+}
+
+Expression ValueTable::create_expression(InsertElementInst* I) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(I->getOperand(0));
+ e.secondVN = lookup_or_add(I->getOperand(1));
+ e.thirdVN = lookup_or_add(I->getOperand(2));
+ e.type = I->getType();
+ e.opcode = Expression::INSERT;
+
+ return e;
+}
+
+Expression ValueTable::create_expression(SelectInst* I) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(I->getCondition());
+ e.secondVN = lookup_or_add(I->getTrueValue());
+ e.thirdVN = lookup_or_add(I->getFalseValue());
+ e.type = I->getType();
+ e.opcode = Expression::SELECT;
+
+ return e;
+}
+
+Expression ValueTable::create_expression(GetElementPtrInst* G) {
+ Expression e;
+
+ e.firstVN = lookup_or_add(G->getPointerOperand());
+ e.secondVN = 0;
+ e.thirdVN = 0;
+ e.type = G->getType();
+ e.opcode = Expression::GEP;
+
+ for (GetElementPtrInst::op_iterator I = G->idx_begin(), E = G->idx_end();
+ I != E; ++I)
+ e.varargs.push_back(lookup_or_add(*I));
+
+ return e;
+}
+
+//===----------------------------------------------------------------------===//
+// ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
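+///
+/// Each handled instruction kind is canonicalized into an Expression key, so
+/// structurally identical expressions share a value number; anything
+/// unhandled (loads, calls, etc.) receives a fresh number of its own.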
+uint32_t ValueTable::lookup_or_add(Value* V) {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ if (VI != valueNumbering.end())
+ return VI->second;
+
+ if (BinaryOperator* BO = dyn_cast<BinaryOperator>(V)) {
+ Expression e = create_expression(BO);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (CmpInst* C = dyn_cast<CmpInst>(V)) {
+ Expression e = create_expression(C);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (ShuffleVectorInst* U = dyn_cast<ShuffleVectorInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (ExtractElementInst* U = dyn_cast<ExtractElementInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (InsertElementInst* U = dyn_cast<InsertElementInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (SelectInst* U = dyn_cast<SelectInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (CastInst* U = dyn_cast<CastInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(V)) {
+ Expression e = create_expression(U);
+
+ DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+ if (EI != expressionNumbering.end()) {
+ valueNumbering.insert(std::make_pair(V, EI->second));
+ return EI->second;
+ } else {
+ expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+
+ return nextValueNumber++;
+ }
+ } else {
+ valueNumbering.insert(std::make_pair(V, nextValueNumber));
+ return nextValueNumber++;
+ }
+}
+
+/// lookup - Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t ValueTable::lookup(Value* V) const {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ if (VI != valueNumbering.end())
+ return VI->second;
+ else
+ assert(0 && "Value not numbered?");
+
+ return 0;
+}
+
+/// add - Add the specified value with the given value number, removing
+/// its old number, if any
+void ValueTable::add(Value* V, uint32_t num) {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ if (VI != valueNumbering.end())
+ valueNumbering.erase(VI);
+ valueNumbering.insert(std::make_pair(V, num));
+}
+
+/// clear - Remove all entries from the ValueTable
+void ValueTable::clear() {
+ valueNumbering.clear();
+ expressionNumbering.clear();
+ nextValueNumber = 1;
+}
+
+/// erase - Remove a value from the value numbering
+void ValueTable::erase(Value* V) {
+ valueNumbering.erase(V);
+}
+
+/// size - Return an upper bound on the assigned value numbers, suitable
+/// for sizing bit vectors indexed by value number
+unsigned ValueTable::size() {
+  // NOTE: zero is never assigned, so this is one more than the count
+  return nextValueNumber;
+}
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// ValueNumberedSet Class
+//===----------------------------------------------------------------------===//
+
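+// A set of Values paired with a BitVector indexed by value number. Note that
+// insert() does not set the corresponding bit: callers pair insert() with an
+// explicit set() once the value number is known (see val_replace below).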
+class ValueNumberedSet {
+ private:
+ SmallPtrSet<Value*, 8> contents;
+ BitVector numbers;
+ public:
+ ValueNumberedSet() { numbers.resize(1); }
+ ValueNumberedSet(const ValueNumberedSet& other) {
+ numbers = other.numbers;
+ contents = other.contents;
+ }
+
+ typedef SmallPtrSet<Value*, 8>::iterator iterator;
+
+ iterator begin() { return contents.begin(); }
+ iterator end() { return contents.end(); }
+
+ bool insert(Value* v) { return contents.insert(v); }
+ void insert(iterator I, iterator E) { contents.insert(I, E); }
+ void erase(Value* v) { contents.erase(v); }
+ unsigned count(Value* v) { return contents.count(v); }
+ size_t size() { return contents.size(); }
+
+ void set(unsigned i) {
+ if (i >= numbers.size())
+ numbers.resize(i+1);
+
+ numbers.set(i);
+ }
+
+ void operator=(const ValueNumberedSet& other) {
+ contents = other.contents;
+ numbers = other.numbers;
+ }
+
+ void reset(unsigned i) {
+ if (i < numbers.size())
+ numbers.reset(i);
+ }
+
+ bool test(unsigned i) {
+ if (i >= numbers.size())
+ return false;
+
+ return numbers.test(i);
+ }
+
+ void clear() {
+ contents.clear();
+ numbers.clear();
+ }
+};
+
+}
+
+//===----------------------------------------------------------------------===//
+// GVNPRE Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+ class VISIBILITY_HIDDEN GVNPRE : public FunctionPass {
+ bool runOnFunction(Function &F);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ GVNPRE() : FunctionPass(&ID) {}
+
+ private:
+ ValueTable VN;
+ SmallVector<Instruction*, 8> createdExpressions;
+
+ DenseMap<BasicBlock*, ValueNumberedSet> availableOut;
+ DenseMap<BasicBlock*, ValueNumberedSet> anticipatedIn;
+ DenseMap<BasicBlock*, ValueNumberedSet> generatedPhis;
+
+    // This transformation requires dominator info
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequiredID(BreakCriticalEdgesID);
+ AU.addRequired<UnifyFunctionExitNodes>();
+ AU.addRequired<DominatorTree>();
+ }
+
+    // Helper functions
+ // FIXME: eliminate or document these better
+    void dump(ValueNumberedSet& s) const;
+    void clean(ValueNumberedSet& set);
+    Value* find_leader(ValueNumberedSet& vals, uint32_t v);
+    Value* phi_translate(Value* V, BasicBlock* pred, BasicBlock* succ);
+    void phi_translate_set(ValueNumberedSet& anticIn, BasicBlock* pred,
+                           BasicBlock* succ, ValueNumberedSet& out);
+
+    void topo_sort(ValueNumberedSet& set,
+                   SmallVector<Value*, 8>& vec);
+
+    void cleanup();
+    bool elimination();
+
+    void val_insert(ValueNumberedSet& s, Value* v);
+    void val_replace(ValueNumberedSet& s, Value* v);
+    bool dependsOnInvoke(Value* V);
+ void buildsets_availout(BasicBlock::iterator I,
+ ValueNumberedSet& currAvail,
+ ValueNumberedSet& currPhis,
+ ValueNumberedSet& currExps,
+ SmallPtrSet<Value*, 16>& currTemps);
+ bool buildsets_anticout(BasicBlock* BB,
+ ValueNumberedSet& anticOut,
+ SmallPtrSet<BasicBlock*, 8>& visited);
+ unsigned buildsets_anticin(BasicBlock* BB,
+ ValueNumberedSet& anticOut,
+ ValueNumberedSet& currExps,
+ SmallPtrSet<Value*, 16>& currTemps,
+ SmallPtrSet<BasicBlock*, 8>& visited);
+    void buildsets(Function& F);
+
+    void insertion_pre(Value* e, BasicBlock* BB,
+                       DenseMap<BasicBlock*, Value*>& avail,
+                       std::map<BasicBlock*, ValueNumberedSet>& new_sets);
+    unsigned insertion_mergepoint(SmallVector<Value*, 8>& workList,
+                                  df_iterator<DomTreeNode*>& D,
+                                  std::map<BasicBlock*, ValueNumberedSet>& new_sets);
+    bool insertion(Function& F);
+
+ };
+
+ char GVNPRE::ID = 0;
+
+}
+
+// createGVNPREPass - The public interface to this file...
+FunctionPass *llvm::createGVNPREPass() { return new GVNPRE(); }
+
+static RegisterPass<GVNPRE> X("gvnpre",
+ "Global Value Numbering/Partial Redundancy Elimination");
+
+
+STATISTIC(NumInsertedVals, "Number of values inserted");
+STATISTIC(NumInsertedPhis, "Number of PHI nodes inserted");
+STATISTIC(NumEliminated, "Number of redundant instructions eliminated");
+
+/// find_leader - Given a set and a value number, return the first
+/// element of the set with that value number, or 0 if no such element
+/// is present
+Value* GVNPRE::find_leader(ValueNumberedSet& vals, uint32_t v) {
+ if (!vals.test(v))
+ return 0;
+
+ for (ValueNumberedSet::iterator I = vals.begin(), E = vals.end();
+ I != E; ++I)
+ if (v == VN.lookup(*I))
+ return *I;
+
+ assert(0 && "No leader found, but present bit is set?");
+ return 0;
+}
+
+/// val_insert - Insert a value into a set only if there is not a value
+/// with the same value number already in the set
+void GVNPRE::val_insert(ValueNumberedSet& s, Value* v) {
+ uint32_t num = VN.lookup(v);
+ if (!s.test(num))
+ s.insert(v);
+}
+
+/// val_replace - Insert a value into a set, replacing any values already in
+/// the set that have the same value number
+void GVNPRE::val_replace(ValueNumberedSet& s, Value* v) {
+ if (s.count(v)) return;
+
+ uint32_t num = VN.lookup(v);
+ Value* leader = find_leader(s, num);
+ if (leader != 0)
+ s.erase(leader);
+ s.insert(v);
+ s.set(num);
+}
+
+/// phi_translate - Given a value, its parent block, and a predecessor of its
+/// parent, translate the value into a form legal for the predecessor block.
+/// This means translating its operands (and recursively, their operands)
+/// through any phi nodes in the parent into values available in the
+/// predecessor.
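+///
+/// For example (hypothetical): if succ contains "%x = phi [ %x1, pred ], ..."
+/// and V is "%t = add %x, %y", translating %t into pred yields the expression
+/// "add %x1, %y", or its existing leader in AVAIL_OUT[pred] if one exists.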
+Value* GVNPRE::phi_translate(Value* V, BasicBlock* pred, BasicBlock* succ) {
+ if (V == 0)
+ return 0;
+
+ // Unary Operations
+ if (CastInst* U = dyn_cast<CastInst>(V)) {
+ Value* newOp1 = 0;
+ if (isa<Instruction>(U->getOperand(0)))
+ newOp1 = phi_translate(U->getOperand(0), pred, succ);
+ else
+ newOp1 = U->getOperand(0);
+
+ if (newOp1 == 0)
+ return 0;
+
+ if (newOp1 != U->getOperand(0)) {
+ Instruction* newVal = 0;
+ if (CastInst* C = dyn_cast<CastInst>(U))
+ newVal = CastInst::Create(C->getOpcode(),
+ newOp1, C->getType(),
+ C->getName()+".expr");
+
+ uint32_t v = VN.lookup_or_add(newVal);
+
+ Value* leader = find_leader(availableOut[pred], v);
+ if (leader == 0) {
+ createdExpressions.push_back(newVal);
+ return newVal;
+ } else {
+ VN.erase(newVal);
+ delete newVal;
+ return leader;
+ }
+ }
+
+ // Binary Operations
+  } else if (isa<BinaryOperator>(V) || isa<CmpInst>(V) ||
+ isa<ExtractElementInst>(V)) {
+ User* U = cast<User>(V);
+
+ Value* newOp1 = 0;
+ if (isa<Instruction>(U->getOperand(0)))
+ newOp1 = phi_translate(U->getOperand(0), pred, succ);
+ else
+ newOp1 = U->getOperand(0);
+
+ if (newOp1 == 0)
+ return 0;
+
+ Value* newOp2 = 0;
+ if (isa<Instruction>(U->getOperand(1)))
+ newOp2 = phi_translate(U->getOperand(1), pred, succ);
+ else
+ newOp2 = U->getOperand(1);
+
+ if (newOp2 == 0)
+ return 0;
+
+ if (newOp1 != U->getOperand(0) || newOp2 != U->getOperand(1)) {
+ Instruction* newVal = 0;
+ if (BinaryOperator* BO = dyn_cast<BinaryOperator>(U))
+ newVal = BinaryOperator::Create(BO->getOpcode(),
+ newOp1, newOp2,
+ BO->getName()+".expr");
+ else if (CmpInst* C = dyn_cast<CmpInst>(U))
+ newVal = CmpInst::Create(C->getOpcode(),
+ C->getPredicate(),
+ newOp1, newOp2,
+ C->getName()+".expr");
+ else if (ExtractElementInst* E = dyn_cast<ExtractElementInst>(U))
+ newVal = new ExtractElementInst(newOp1, newOp2, E->getName()+".expr");
+
+ uint32_t v = VN.lookup_or_add(newVal);
+
+ Value* leader = find_leader(availableOut[pred], v);
+ if (leader == 0) {
+ createdExpressions.push_back(newVal);
+ return newVal;
+ } else {
+ VN.erase(newVal);
+ delete newVal;
+ return leader;
+ }
+ }
+
+ // Ternary Operations
+ } else if (isa<ShuffleVectorInst>(V) || isa<InsertElementInst>(V) ||
+ isa<SelectInst>(V)) {
+ User* U = cast<User>(V);
+
+ Value* newOp1 = 0;
+ if (isa<Instruction>(U->getOperand(0)))
+ newOp1 = phi_translate(U->getOperand(0), pred, succ);
+ else
+ newOp1 = U->getOperand(0);
+
+ if (newOp1 == 0)
+ return 0;
+
+ Value* newOp2 = 0;
+ if (isa<Instruction>(U->getOperand(1)))
+ newOp2 = phi_translate(U->getOperand(1), pred, succ);
+ else
+ newOp2 = U->getOperand(1);
+
+ if (newOp2 == 0)
+ return 0;
+
+ Value* newOp3 = 0;
+ if (isa<Instruction>(U->getOperand(2)))
+ newOp3 = phi_translate(U->getOperand(2), pred, succ);
+ else
+ newOp3 = U->getOperand(2);
+
+ if (newOp3 == 0)
+ return 0;
+
+ if (newOp1 != U->getOperand(0) ||
+ newOp2 != U->getOperand(1) ||
+ newOp3 != U->getOperand(2)) {
+ Instruction* newVal = 0;
+ if (ShuffleVectorInst* S = dyn_cast<ShuffleVectorInst>(U))
+ newVal = new ShuffleVectorInst(newOp1, newOp2, newOp3,
+ S->getName() + ".expr");
+ else if (InsertElementInst* I = dyn_cast<InsertElementInst>(U))
+ newVal = InsertElementInst::Create(newOp1, newOp2, newOp3,
+ I->getName() + ".expr");
+ else if (SelectInst* I = dyn_cast<SelectInst>(U))
+ newVal = SelectInst::Create(newOp1, newOp2, newOp3,
+ I->getName() + ".expr");
+
+ uint32_t v = VN.lookup_or_add(newVal);
+
+ Value* leader = find_leader(availableOut[pred], v);
+ if (leader == 0) {
+ createdExpressions.push_back(newVal);
+ return newVal;
+ } else {
+ VN.erase(newVal);
+ delete newVal;
+ return leader;
+ }
+ }
+
+ // Varargs operators
+ } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(V)) {
+ Value* newOp1 = 0;
+ if (isa<Instruction>(U->getPointerOperand()))
+ newOp1 = phi_translate(U->getPointerOperand(), pred, succ);
+ else
+ newOp1 = U->getPointerOperand();
+
+ if (newOp1 == 0)
+ return 0;
+
+ bool changed_idx = false;
+ SmallVector<Value*, 4> newIdx;
+ for (GetElementPtrInst::op_iterator I = U->idx_begin(), E = U->idx_end();
+ I != E; ++I)
+ if (isa<Instruction>(*I)) {
+ Value* newVal = phi_translate(*I, pred, succ);
+ newIdx.push_back(newVal);
+ if (newVal != *I)
+ changed_idx = true;
+ } else {
+ newIdx.push_back(*I);
+ }
+
+ if (newOp1 != U->getPointerOperand() || changed_idx) {
+ Instruction* newVal =
+ GetElementPtrInst::Create(newOp1,
+ newIdx.begin(), newIdx.end(),
+ U->getName()+".expr");
+
+ uint32_t v = VN.lookup_or_add(newVal);
+
+ Value* leader = find_leader(availableOut[pred], v);
+ if (leader == 0) {
+ createdExpressions.push_back(newVal);
+ return newVal;
+ } else {
+ VN.erase(newVal);
+ delete newVal;
+ return leader;
+ }
+ }
+
+ // PHI Nodes
+ } else if (PHINode* P = dyn_cast<PHINode>(V)) {
+ if (P->getParent() == succ)
+ return P->getIncomingValueForBlock(pred);
+ }
+
+ return V;
+}
+
+/// phi_translate_set - Perform phi translation on every element of a set
+void GVNPRE::phi_translate_set(ValueNumberedSet& anticIn,
+ BasicBlock* pred, BasicBlock* succ,
+ ValueNumberedSet& out) {
+ for (ValueNumberedSet::iterator I = anticIn.begin(),
+ E = anticIn.end(); I != E; ++I) {
+ Value* V = phi_translate(*I, pred, succ);
+ if (V != 0 && !out.test(VN.lookup_or_add(V))) {
+ out.insert(V);
+ out.set(VN.lookup(V));
+ }
+ }
+}
+
+/// dependsOnInvoke - Test if a value has a phi node as an operand, any of
+/// whose inputs is an invoke instruction. If this is true, we cannot safely
+/// PRE the instruction or anything that depends on it.
+bool GVNPRE::dependsOnInvoke(Value* V) {
+ if (PHINode* p = dyn_cast<PHINode>(V)) {
+ for (PHINode::op_iterator I = p->op_begin(), E = p->op_end(); I != E; ++I)
+ if (isa<InvokeInst>(*I))
+ return true;
+ return false;
+ } else {
+ return false;
+ }
+}
+
+/// clean - Remove all non-opaque values from the set whose operands are not
+/// themselves in the set, as well as all values that depend on invokes (see
+/// above)
+void GVNPRE::clean(ValueNumberedSet& set) {
+ SmallVector<Value*, 8> worklist;
+ worklist.reserve(set.size());
+ topo_sort(set, worklist);
+
+ for (unsigned i = 0; i < worklist.size(); ++i) {
+ Value* v = worklist[i];
+
+ // Handle unary ops
+ if (CastInst* U = dyn_cast<CastInst>(v)) {
+ bool lhsValid = !isa<Instruction>(U->getOperand(0));
+ lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+ if (lhsValid)
+ lhsValid = !dependsOnInvoke(U->getOperand(0));
+
+ if (!lhsValid) {
+ set.erase(U);
+ set.reset(VN.lookup(U));
+ }
+
+ // Handle binary ops
+ } else if (isa<BinaryOperator>(v) || isa<CmpInst>(v) ||
+ isa<ExtractElementInst>(v)) {
+ User* U = cast<User>(v);
+
+ bool lhsValid = !isa<Instruction>(U->getOperand(0));
+ lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+ if (lhsValid)
+ lhsValid = !dependsOnInvoke(U->getOperand(0));
+
+ bool rhsValid = !isa<Instruction>(U->getOperand(1));
+ rhsValid |= set.test(VN.lookup(U->getOperand(1)));
+ if (rhsValid)
+ rhsValid = !dependsOnInvoke(U->getOperand(1));
+
+ if (!lhsValid || !rhsValid) {
+ set.erase(U);
+ set.reset(VN.lookup(U));
+ }
+
+ // Handle ternary ops
+ } else if (isa<ShuffleVectorInst>(v) || isa<InsertElementInst>(v) ||
+ isa<SelectInst>(v)) {
+ User* U = cast<User>(v);
+
+ bool lhsValid = !isa<Instruction>(U->getOperand(0));
+ lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+ if (lhsValid)
+ lhsValid = !dependsOnInvoke(U->getOperand(0));
+
+ bool rhsValid = !isa<Instruction>(U->getOperand(1));
+ rhsValid |= set.test(VN.lookup(U->getOperand(1)));
+ if (rhsValid)
+ rhsValid = !dependsOnInvoke(U->getOperand(1));
+
+ bool thirdValid = !isa<Instruction>(U->getOperand(2));
+ thirdValid |= set.test(VN.lookup(U->getOperand(2)));
+ if (thirdValid)
+ thirdValid = !dependsOnInvoke(U->getOperand(2));
+
+ if (!lhsValid || !rhsValid || !thirdValid) {
+ set.erase(U);
+ set.reset(VN.lookup(U));
+ }
+
+ // Handle varargs ops
+ } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(v)) {
+ bool ptrValid = !isa<Instruction>(U->getPointerOperand());
+ ptrValid |= set.test(VN.lookup(U->getPointerOperand()));
+ if (ptrValid)
+ ptrValid = !dependsOnInvoke(U->getPointerOperand());
+
+ bool varValid = true;
+ for (GetElementPtrInst::op_iterator I = U->idx_begin(), E = U->idx_end();
+ I != E; ++I)
+ if (varValid) {
+ varValid &= !isa<Instruction>(*I) || set.test(VN.lookup(*I));
+ varValid &= !dependsOnInvoke(*I);
+ }
+
+ if (!ptrValid || !varValid) {
+ set.erase(U);
+ set.reset(VN.lookup(U));
+ }
+ }
+ }
+}
+
+/// topo_sort - Given a set of values, sort them by topological
+/// order into the provided vector.
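+///
+/// "Topological" here means that the leader of each operand's value number
+/// (when it is in the set) is emitted before its users; the sort is an
+/// iterative DFS rather than a recursive one.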
+void GVNPRE::topo_sort(ValueNumberedSet& set, SmallVector<Value*, 8>& vec) {
+ SmallPtrSet<Value*, 16> visited;
+ SmallVector<Value*, 8> stack;
+ for (ValueNumberedSet::iterator I = set.begin(), E = set.end();
+ I != E; ++I) {
+ if (visited.count(*I) == 0)
+ stack.push_back(*I);
+
+ while (!stack.empty()) {
+ Value* e = stack.back();
+
+ // Handle unary ops
+ if (CastInst* U = dyn_cast<CastInst>(e)) {
+ Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+
+ if (l != 0 && isa<Instruction>(l) &&
+ visited.count(l) == 0)
+ stack.push_back(l);
+ else {
+ vec.push_back(e);
+ visited.insert(e);
+ stack.pop_back();
+ }
+
+ // Handle binary ops
+ } else if (isa<BinaryOperator>(e) || isa<CmpInst>(e) ||
+ isa<ExtractElementInst>(e)) {
+ User* U = cast<User>(e);
+ Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+ Value* r = find_leader(set, VN.lookup(U->getOperand(1)));
+
+ if (l != 0 && isa<Instruction>(l) &&
+ visited.count(l) == 0)
+ stack.push_back(l);
+ else if (r != 0 && isa<Instruction>(r) &&
+ visited.count(r) == 0)
+ stack.push_back(r);
+ else {
+ vec.push_back(e);
+ visited.insert(e);
+ stack.pop_back();
+ }
+
+ // Handle ternary ops
+ } else if (isa<InsertElementInst>(e) || isa<ShuffleVectorInst>(e) ||
+ isa<SelectInst>(e)) {
+ User* U = cast<User>(e);
+ Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+ Value* r = find_leader(set, VN.lookup(U->getOperand(1)));
+ Value* m = find_leader(set, VN.lookup(U->getOperand(2)));
+
+ if (l != 0 && isa<Instruction>(l) &&
+ visited.count(l) == 0)
+ stack.push_back(l);
+ else if (r != 0 && isa<Instruction>(r) &&
+ visited.count(r) == 0)
+ stack.push_back(r);
+ else if (m != 0 && isa<Instruction>(m) &&
+ visited.count(m) == 0)
+ stack.push_back(m);
+ else {
+ vec.push_back(e);
+ visited.insert(e);
+ stack.pop_back();
+ }
+
+ // Handle vararg ops
+ } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(e)) {
+ Value* p = find_leader(set, VN.lookup(U->getPointerOperand()));
+
+ if (p != 0 && isa<Instruction>(p) &&
+ visited.count(p) == 0)
+ stack.push_back(p);
+ else {
+ bool push_va = false;
+ for (GetElementPtrInst::op_iterator I = U->idx_begin(),
+ E = U->idx_end(); I != E; ++I) {
+ Value * v = find_leader(set, VN.lookup(*I));
+ if (v != 0 && isa<Instruction>(v) && visited.count(v) == 0) {
+ stack.push_back(v);
+ push_va = true;
+ }
+ }
+
+ if (!push_va) {
+ vec.push_back(e);
+ visited.insert(e);
+ stack.pop_back();
+ }
+ }
+
+ // Handle opaque ops
+ } else {
+ visited.insert(e);
+ vec.push_back(e);
+ stack.pop_back();
+ }
+ }
+
+ stack.clear();
+ }
+}
+
+/// dump - Dump a set of values to standard error
+void GVNPRE::dump(ValueNumberedSet& s) const {
+ DOUT << "{ ";
+ for (ValueNumberedSet::iterator I = s.begin(), E = s.end();
+ I != E; ++I) {
+ DOUT << "" << VN.lookup(*I) << ": ";
+ DEBUG((*I)->dump());
+ }
+ DOUT << "}\n\n";
+}
+
+/// elimination - Phase 3 of the main algorithm. Perform full redundancy
+/// elimination by walking the dominator tree and removing any instruction that
+/// is dominated by another instruction with the same value number.
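+///
+/// For example (hypothetical): if "%a = add i32 %x, %y" dominates
+/// "%b = add i32 %x, %y", every use of %b is rewritten to use %a and %b is
+/// erased.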
+bool GVNPRE::elimination() {
+ bool changed_function = false;
+
+ SmallVector<std::pair<Instruction*, Value*>, 8> replace;
+ SmallVector<Instruction*, 8> erase;
+
+ DominatorTree& DT = getAnalysis<DominatorTree>();
+
+ for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+ E = df_end(DT.getRootNode()); DI != E; ++DI) {
+ BasicBlock* BB = DI->getBlock();
+
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE; ++BI) {
+
+ if (isa<BinaryOperator>(BI) || isa<CmpInst>(BI) ||
+ isa<ShuffleVectorInst>(BI) || isa<InsertElementInst>(BI) ||
+ isa<ExtractElementInst>(BI) || isa<SelectInst>(BI) ||
+ isa<CastInst>(BI) || isa<GetElementPtrInst>(BI)) {
+
+ if (availableOut[BB].test(VN.lookup(BI)) &&
+ !availableOut[BB].count(BI)) {
+ Value *leader = find_leader(availableOut[BB], VN.lookup(BI));
+ if (Instruction* Instr = dyn_cast<Instruction>(leader))
+ if (Instr->getParent() != 0 && Instr != BI) {
+ replace.push_back(std::make_pair(BI, leader));
+ erase.push_back(BI);
+ ++NumEliminated;
+ }
+ }
+ }
+ }
+ }
+
+ while (!replace.empty()) {
+ std::pair<Instruction*, Value*> rep = replace.back();
+ replace.pop_back();
+ rep.first->replaceAllUsesWith(rep.second);
+ changed_function = true;
+ }
+
+ for (SmallVector<Instruction*, 8>::iterator I = erase.begin(),
+ E = erase.end(); I != E; ++I)
+ (*I)->eraseFromParent();
+
+ return changed_function;
+}
+
+/// cleanup - Delete any extraneous values that were created to represent
+/// expressions without leaders.
+void GVNPRE::cleanup() {
+ while (!createdExpressions.empty()) {
+ Instruction* I = createdExpressions.back();
+ createdExpressions.pop_back();
+
+ delete I;
+ }
+}
+
+/// buildsets_availout - When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+void GVNPRE::buildsets_availout(BasicBlock::iterator I,
+ ValueNumberedSet& currAvail,
+ ValueNumberedSet& currPhis,
+ ValueNumberedSet& currExps,
+ SmallPtrSet<Value*, 16>& currTemps) {
+ // Handle PHI nodes
+ if (PHINode* p = dyn_cast<PHINode>(I)) {
+ unsigned num = VN.lookup_or_add(p);
+
+ currPhis.insert(p);
+ currPhis.set(num);
+
+ // Handle unary ops
+ } else if (CastInst* U = dyn_cast<CastInst>(I)) {
+ Value* leftValue = U->getOperand(0);
+
+ unsigned num = VN.lookup_or_add(U);
+
+ if (isa<Instruction>(leftValue))
+ if (!currExps.test(VN.lookup(leftValue))) {
+ currExps.insert(leftValue);
+ currExps.set(VN.lookup(leftValue));
+ }
+
+ if (!currExps.test(num)) {
+ currExps.insert(U);
+ currExps.set(num);
+ }
+
+ // Handle binary ops
+ } else if (isa<BinaryOperator>(I) || isa<CmpInst>(I) ||
+ isa<ExtractElementInst>(I)) {
+ User* U = cast<User>(I);
+ Value* leftValue = U->getOperand(0);
+ Value* rightValue = U->getOperand(1);
+
+ unsigned num = VN.lookup_or_add(U);
+
+ if (isa<Instruction>(leftValue))
+ if (!currExps.test(VN.lookup(leftValue))) {
+ currExps.insert(leftValue);
+ currExps.set(VN.lookup(leftValue));
+ }
+
+ if (isa<Instruction>(rightValue))
+ if (!currExps.test(VN.lookup(rightValue))) {
+ currExps.insert(rightValue);
+ currExps.set(VN.lookup(rightValue));
+ }
+
+ if (!currExps.test(num)) {
+ currExps.insert(U);
+ currExps.set(num);
+ }
+
+ // Handle ternary ops
+ } else if (isa<InsertElementInst>(I) || isa<ShuffleVectorInst>(I) ||
+ isa<SelectInst>(I)) {
+ User* U = cast<User>(I);
+ Value* leftValue = U->getOperand(0);
+ Value* rightValue = U->getOperand(1);
+ Value* thirdValue = U->getOperand(2);
+
+    unsigned num = VN.lookup_or_add(U);
+
+ if (isa<Instruction>(leftValue))
+ if (!currExps.test(VN.lookup(leftValue))) {
+ currExps.insert(leftValue);
+ currExps.set(VN.lookup(leftValue));
+ }
+ if (isa<Instruction>(rightValue))
+ if (!currExps.test(VN.lookup(rightValue))) {
+ currExps.insert(rightValue);
+ currExps.set(VN.lookup(rightValue));
+ }
+ if (isa<Instruction>(thirdValue))
+ if (!currExps.test(VN.lookup(thirdValue))) {
+ currExps.insert(thirdValue);
+ currExps.set(VN.lookup(thirdValue));
+ }
+
+ if (!currExps.test(num)) {
+ currExps.insert(U);
+ currExps.set(num);
+ }
+
+ // Handle vararg ops
+ } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(I)) {
+ Value* ptrValue = U->getPointerOperand();
+
+    unsigned num = VN.lookup_or_add(U);
+
+ if (isa<Instruction>(ptrValue))
+ if (!currExps.test(VN.lookup(ptrValue))) {
+ currExps.insert(ptrValue);
+ currExps.set(VN.lookup(ptrValue));
+ }
+
+ for (GetElementPtrInst::op_iterator OI = U->idx_begin(), OE = U->idx_end();
+ OI != OE; ++OI)
+ if (isa<Instruction>(*OI) && !currExps.test(VN.lookup(*OI))) {
+ currExps.insert(*OI);
+ currExps.set(VN.lookup(*OI));
+ }
+
+ if (!currExps.test(VN.lookup(U))) {
+ currExps.insert(U);
+ currExps.set(num);
+ }
+
+ // Handle opaque ops
+ } else if (!I->isTerminator()){
+ VN.lookup_or_add(I);
+
+ currTemps.insert(I);
+ }
+
+ if (!I->isTerminator())
+ if (!currAvail.test(VN.lookup(I))) {
+ currAvail.insert(I);
+ currAvail.set(VN.lookup(I));
+ }
+}
+
+/// buildsets_anticout - When walking the CFG in postorder, calculate the
+/// ANTIC_OUT set as a function of the ANTIC_IN sets of the block's successors.
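+///
+/// Informally, as a sketch of the dataflow equation (not a quotation from the
+/// GVN-PRE literature):
+///   one successor S:  ANTIC_OUT[BB] = phi_translate(ANTIC_IN[S])
+///   otherwise:        ANTIC_OUT[BB] = intersection of ANTIC_IN over succs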
+bool GVNPRE::buildsets_anticout(BasicBlock* BB,
+ ValueNumberedSet& anticOut,
+ SmallPtrSet<BasicBlock*, 8>& visited) {
+ if (BB->getTerminator()->getNumSuccessors() == 1) {
+ if (BB->getTerminator()->getSuccessor(0) != BB &&
+ visited.count(BB->getTerminator()->getSuccessor(0)) == 0) {
+ return true;
+ }
+ else {
+ phi_translate_set(anticipatedIn[BB->getTerminator()->getSuccessor(0)],
+ BB, BB->getTerminator()->getSuccessor(0), anticOut);
+ }
+ } else if (BB->getTerminator()->getNumSuccessors() > 1) {
+ BasicBlock* first = BB->getTerminator()->getSuccessor(0);
+ for (ValueNumberedSet::iterator I = anticipatedIn[first].begin(),
+ E = anticipatedIn[first].end(); I != E; ++I) {
+ anticOut.insert(*I);
+ anticOut.set(VN.lookup(*I));
+ }
+
+ for (unsigned i = 1; i < BB->getTerminator()->getNumSuccessors(); ++i) {
+ BasicBlock* currSucc = BB->getTerminator()->getSuccessor(i);
+ ValueNumberedSet& succAnticIn = anticipatedIn[currSucc];
+
+ SmallVector<Value*, 16> temp;
+
+ for (ValueNumberedSet::iterator I = anticOut.begin(),
+ E = anticOut.end(); I != E; ++I)
+ if (!succAnticIn.test(VN.lookup(*I)))
+ temp.push_back(*I);
+
+ for (SmallVector<Value*, 16>::iterator I = temp.begin(), E = temp.end();
+ I != E; ++I) {
+ anticOut.erase(*I);
+ anticOut.reset(VN.lookup(*I));
+ }
+ }
+ }
+
+ return false;
+}
+
+/// buildsets_anticin - Walk the CFG in postorder, calculating ANTIC_OUT for
+/// each block. ANTIC_IN is then a function of ANTIC_OUT and the GEN
+/// sets populated in buildsets_availout.
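+///
+/// Informally: ANTIC_IN[BB] = clean((ANTIC_OUT[BB] union EXP_GEN[BB]) minus
+/// TMP_GEN[BB]), taken over value numbers. The return value encodes the
+/// outcome: 0 = deferred (a needed successor was not yet visited), 1 =
+/// recomputed but unchanged, 2 = recomputed and changed.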
+unsigned GVNPRE::buildsets_anticin(BasicBlock* BB,
+ ValueNumberedSet& anticOut,
+ ValueNumberedSet& currExps,
+ SmallPtrSet<Value*, 16>& currTemps,
+ SmallPtrSet<BasicBlock*, 8>& visited) {
+ ValueNumberedSet& anticIn = anticipatedIn[BB];
+ unsigned old = anticIn.size();
+
+ bool defer = buildsets_anticout(BB, anticOut, visited);
+ if (defer)
+ return 0;
+
+ anticIn.clear();
+
+ for (ValueNumberedSet::iterator I = anticOut.begin(),
+ E = anticOut.end(); I != E; ++I) {
+ anticIn.insert(*I);
+ anticIn.set(VN.lookup(*I));
+ }
+ for (ValueNumberedSet::iterator I = currExps.begin(),
+ E = currExps.end(); I != E; ++I) {
+ if (!anticIn.test(VN.lookup(*I))) {
+ anticIn.insert(*I);
+ anticIn.set(VN.lookup(*I));
+ }
+ }
+
+ for (SmallPtrSet<Value*, 16>::iterator I = currTemps.begin(),
+ E = currTemps.end(); I != E; ++I) {
+ anticIn.erase(*I);
+ anticIn.reset(VN.lookup(*I));
+ }
+
+ clean(anticIn);
+ anticOut.clear();
+
+ if (old != anticIn.size())
+ return 2;
+ else
+ return 1;
+}
+
+/// buildsets - Phase 1 of the main algorithm. Construct the AVAIL_OUT
+/// and the ANTIC_IN sets.
+void GVNPRE::buildsets(Function& F) {
+ DenseMap<BasicBlock*, ValueNumberedSet> generatedExpressions;
+ DenseMap<BasicBlock*, SmallPtrSet<Value*, 16> > generatedTemporaries;
+
+ DominatorTree &DT = getAnalysis<DominatorTree>();
+
+ // Phase 1, Part 1: calculate AVAIL_OUT
+
+ // Top-down walk of the dominator tree
+ for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+ E = df_end(DT.getRootNode()); DI != E; ++DI) {
+
+ // Get the sets to update for this block
+ ValueNumberedSet& currExps = generatedExpressions[DI->getBlock()];
+ ValueNumberedSet& currPhis = generatedPhis[DI->getBlock()];
+ SmallPtrSet<Value*, 16>& currTemps = generatedTemporaries[DI->getBlock()];
+ ValueNumberedSet& currAvail = availableOut[DI->getBlock()];
+
+ BasicBlock* BB = DI->getBlock();
+
+ // A block inherits AVAIL_OUT from its dominator
+ if (DI->getIDom() != 0)
+ currAvail = availableOut[DI->getIDom()->getBlock()];
+
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE; ++BI)
+ buildsets_availout(BI, currAvail, currPhis, currExps,
+ currTemps);
+
+ }
+
+ // Phase 1, Part 2: calculate ANTIC_IN
+
+ SmallPtrSet<BasicBlock*, 8> visited;
+ SmallPtrSet<BasicBlock*, 4> block_changed;
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+ block_changed.insert(FI);
+
+ bool changed = true;
+ unsigned iterations = 0;
+
+ while (changed) {
+ changed = false;
+ ValueNumberedSet anticOut;
+
+ // Postorder walk of the CFG
+ for (po_iterator<BasicBlock*> BBI = po_begin(&F.getEntryBlock()),
+ BBE = po_end(&F.getEntryBlock()); BBI != BBE; ++BBI) {
+ BasicBlock* BB = *BBI;
+
+ if (block_changed.count(BB) != 0) {
+        unsigned ret = buildsets_anticin(BB, anticOut,
+                                         generatedExpressions[BB],
+                                         generatedTemporaries[BB], visited);
+
+ if (ret == 0) {
+ changed = true;
+ continue;
+ } else {
+ visited.insert(BB);
+
+ if (ret == 2)
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+ PI != PE; ++PI) {
+ block_changed.insert(*PI);
+ }
+ else
+ block_changed.erase(BB);
+
+ changed |= (ret == 2);
+ }
+ }
+ }
+
+ iterations++;
+ }
+}
+
+/// insertion_pre - When a partial redundancy has been identified, eliminate it
+/// by inserting appropriate values into the predecessors and a phi node in
+/// the main block
+void GVNPRE::insertion_pre(Value* e, BasicBlock* BB,
+ DenseMap<BasicBlock*, Value*>& avail,
+ std::map<BasicBlock*, ValueNumberedSet>& new_sets) {
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+ Value* e2 = avail[*PI];
+ if (!availableOut[*PI].test(VN.lookup(e2))) {
+ User* U = cast<User>(e2);
+
+ Value* s1 = 0;
+ if (isa<BinaryOperator>(U->getOperand(0)) ||
+ isa<CmpInst>(U->getOperand(0)) ||
+ isa<ShuffleVectorInst>(U->getOperand(0)) ||
+ isa<ExtractElementInst>(U->getOperand(0)) ||
+ isa<InsertElementInst>(U->getOperand(0)) ||
+ isa<SelectInst>(U->getOperand(0)) ||
+ isa<CastInst>(U->getOperand(0)) ||
+ isa<GetElementPtrInst>(U->getOperand(0)))
+ s1 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(0)));
+ else
+ s1 = U->getOperand(0);
+
+ Value* s2 = 0;
+
+ if (isa<BinaryOperator>(U) ||
+ isa<CmpInst>(U) ||
+ isa<ShuffleVectorInst>(U) ||
+ isa<ExtractElementInst>(U) ||
+ isa<InsertElementInst>(U) ||
+ isa<SelectInst>(U)) {
+ if (isa<BinaryOperator>(U->getOperand(1)) ||
+ isa<CmpInst>(U->getOperand(1)) ||
+ isa<ShuffleVectorInst>(U->getOperand(1)) ||
+ isa<ExtractElementInst>(U->getOperand(1)) ||
+ isa<InsertElementInst>(U->getOperand(1)) ||
+ isa<SelectInst>(U->getOperand(1)) ||
+ isa<CastInst>(U->getOperand(1)) ||
+ isa<GetElementPtrInst>(U->getOperand(1))) {
+ s2 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(1)));
+ } else {
+ s2 = U->getOperand(1);
+ }
+ }
+
+ // Ternary Operators
+ Value* s3 = 0;
+ if (isa<ShuffleVectorInst>(U) ||
+ isa<InsertElementInst>(U) ||
+ isa<SelectInst>(U)) {
+ if (isa<BinaryOperator>(U->getOperand(2)) ||
+ isa<CmpInst>(U->getOperand(2)) ||
+ isa<ShuffleVectorInst>(U->getOperand(2)) ||
+ isa<ExtractElementInst>(U->getOperand(2)) ||
+ isa<InsertElementInst>(U->getOperand(2)) ||
+ isa<SelectInst>(U->getOperand(2)) ||
+ isa<CastInst>(U->getOperand(2)) ||
+ isa<GetElementPtrInst>(U->getOperand(2))) {
+ s3 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(2)));
+ } else {
+ s3 = U->getOperand(2);
+ }
+ }
+
+ // Vararg operators
+ SmallVector<Value*, 4> sVarargs;
+ if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(U)) {
+ for (GetElementPtrInst::op_iterator OI = G->idx_begin(),
+ OE = G->idx_end(); OI != OE; ++OI) {
+ if (isa<BinaryOperator>(*OI) ||
+ isa<CmpInst>(*OI) ||
+ isa<ShuffleVectorInst>(*OI) ||
+ isa<ExtractElementInst>(*OI) ||
+ isa<InsertElementInst>(*OI) ||
+ isa<SelectInst>(*OI) ||
+ isa<CastInst>(*OI) ||
+ isa<GetElementPtrInst>(*OI)) {
+ sVarargs.push_back(find_leader(availableOut[*PI],
+ VN.lookup(*OI)));
+ } else {
+ sVarargs.push_back(*OI);
+ }
+ }
+ }
+
+ Value* newVal = 0;
+ if (BinaryOperator* BO = dyn_cast<BinaryOperator>(U))
+ newVal = BinaryOperator::Create(BO->getOpcode(), s1, s2,
+ BO->getName()+".gvnpre",
+ (*PI)->getTerminator());
+ else if (CmpInst* C = dyn_cast<CmpInst>(U))
+ newVal = CmpInst::Create(C->getOpcode(), C->getPredicate(), s1, s2,
+ C->getName()+".gvnpre",
+ (*PI)->getTerminator());
+ else if (ShuffleVectorInst* S = dyn_cast<ShuffleVectorInst>(U))
+ newVal = new ShuffleVectorInst(s1, s2, s3, S->getName()+".gvnpre",
+ (*PI)->getTerminator());
+ else if (InsertElementInst* S = dyn_cast<InsertElementInst>(U))
+ newVal = InsertElementInst::Create(s1, s2, s3, S->getName()+".gvnpre",
+ (*PI)->getTerminator());
+ else if (ExtractElementInst* S = dyn_cast<ExtractElementInst>(U))
+ newVal = new ExtractElementInst(s1, s2, S->getName()+".gvnpre",
+ (*PI)->getTerminator());
+ else if (SelectInst* S = dyn_cast<SelectInst>(U))
+ newVal = SelectInst::Create(s1, s2, s3, S->getName()+".gvnpre",
+ (*PI)->getTerminator());
+ else if (CastInst* C = dyn_cast<CastInst>(U))
+ newVal = CastInst::Create(C->getOpcode(), s1, C->getType(),
+ C->getName()+".gvnpre",
+ (*PI)->getTerminator());
+ else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(U))
+ newVal = GetElementPtrInst::Create(s1, sVarargs.begin(), sVarargs.end(),
+ G->getName()+".gvnpre",
+ (*PI)->getTerminator());
+
+ VN.add(newVal, VN.lookup(U));
+
+ ValueNumberedSet& predAvail = availableOut[*PI];
+ val_replace(predAvail, newVal);
+ val_replace(new_sets[*PI], newVal);
+ predAvail.set(VN.lookup(newVal));
+
+ DenseMap<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+ if (av != avail.end())
+ avail.erase(av);
+ avail.insert(std::make_pair(*PI, newVal));
+
+ ++NumInsertedVals;
+ }
+ }
+
+ PHINode* p = 0;
+
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+ if (p == 0)
+ p = PHINode::Create(avail[*PI]->getType(), "gvnpre-join", BB->begin());
+
+ p->addIncoming(avail[*PI], *PI);
+ }
+
+ VN.add(p, VN.lookup(e));
+ val_replace(availableOut[BB], p);
+ availableOut[BB].set(VN.lookup(e));
+ generatedPhis[BB].insert(p);
+ generatedPhis[BB].set(VN.lookup(e));
+ new_sets[BB].insert(p);
+ new_sets[BB].set(VN.lookup(e));
+
+ ++NumInsertedPhis;
+}
+
+/// insertion_mergepoint - When walking the dom tree, check at each merge
+/// block for the possibility of a partial redundancy. If present, eliminate it
+unsigned GVNPRE::insertion_mergepoint(SmallVector<Value*, 8>& workList,
+ df_iterator<DomTreeNode*>& D,
+                                      std::map<BasicBlock*, ValueNumberedSet>& new_sets) {
+ bool changed_function = false;
+ bool new_stuff = false;
+
+ BasicBlock* BB = D->getBlock();
+ for (unsigned i = 0; i < workList.size(); ++i) {
+ Value* e = workList[i];
+
+ if (isa<BinaryOperator>(e) || isa<CmpInst>(e) ||
+ isa<ExtractElementInst>(e) || isa<InsertElementInst>(e) ||
+ isa<ShuffleVectorInst>(e) || isa<SelectInst>(e) || isa<CastInst>(e) ||
+ isa<GetElementPtrInst>(e)) {
+ if (availableOut[D->getIDom()->getBlock()].test(VN.lookup(e)))
+ continue;
+
+ DenseMap<BasicBlock*, Value*> avail;
+ bool by_some = false;
+ bool all_same = true;
+ Value * first_s = 0;
+
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;
+ ++PI) {
+ Value *e2 = phi_translate(e, *PI, BB);
+ Value *e3 = find_leader(availableOut[*PI], VN.lookup(e2));
+
+ if (e3 == 0) {
+ DenseMap<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+ if (av != avail.end())
+ avail.erase(av);
+ avail.insert(std::make_pair(*PI, e2));
+ all_same = false;
+ } else {
+ DenseMap<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+ if (av != avail.end())
+ avail.erase(av);
+ avail.insert(std::make_pair(*PI, e3));
+
+ by_some = true;
+ if (first_s == 0)
+ first_s = e3;
+ else if (first_s != e3)
+ all_same = false;
+ }
+ }
+
+ if (by_some && !all_same &&
+ !generatedPhis[BB].test(VN.lookup(e))) {
+ insertion_pre(e, BB, avail, new_sets);
+
+ changed_function = true;
+ new_stuff = true;
+ }
+ }
+ }
+
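+  // Pack the two outcomes into a bitmask for the caller: bit 0 means the
+  // function was changed, bit 1 means new values became available and another
+  // insertion pass is worthwhile.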
+ unsigned retval = 0;
+ if (changed_function)
+ retval += 1;
+ if (new_stuff)
+ retval += 2;
+
+ return retval;
+}
+
+/// insertion - Phase 2 of the main algorithm. Walk the dominator tree looking
+/// for merge points. When one is found, check for a partial redundancy. If one
+/// is present, eliminate it. Repeat this walk until no changes are made.
+bool GVNPRE::insertion(Function& F) {
+ bool changed_function = false;
+
+ DominatorTree &DT = getAnalysis<DominatorTree>();
+
+ std::map<BasicBlock*, ValueNumberedSet> new_sets;
+ bool new_stuff = true;
+ while (new_stuff) {
+ new_stuff = false;
+ for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+ E = df_end(DT.getRootNode()); DI != E; ++DI) {
+ BasicBlock* BB = DI->getBlock();
+
+ if (BB == 0)
+ continue;
+
+ ValueNumberedSet& availOut = availableOut[BB];
+ ValueNumberedSet& anticIn = anticipatedIn[BB];
+
+ // Replace leaders with leaders inherited from dominator
+ if (DI->getIDom() != 0) {
+ ValueNumberedSet& dom_set = new_sets[DI->getIDom()->getBlock()];
+ for (ValueNumberedSet::iterator I = dom_set.begin(),
+ E = dom_set.end(); I != E; ++I) {
+ val_replace(new_sets[BB], *I);
+ val_replace(availOut, *I);
+ }
+ }
+
+ // If there is more than one predecessor...
+ if (pred_begin(BB) != pred_end(BB) && ++pred_begin(BB) != pred_end(BB)) {
+ SmallVector<Value*, 8> workList;
+ workList.reserve(anticIn.size());
+ topo_sort(anticIn, workList);
+
+ unsigned result = insertion_mergepoint(workList, DI, new_sets);
+ if (result & 1)
+ changed_function = true;
+ if (result & 2)
+ new_stuff = true;
+ }
+ }
+ }
+
+ return changed_function;
+}
+
+// GVNPRE::runOnFunction - This is the main transformation entry point for a
+// function.
+//
+bool GVNPRE::runOnFunction(Function &F) {
+ // Clean out global sets from any previous functions
+ VN.clear();
+ createdExpressions.clear();
+ availableOut.clear();
+ anticipatedIn.clear();
+ generatedPhis.clear();
+
+ bool changed_function = false;
+
+ // Phase 1: BuildSets
+ // This phase calculates the AVAIL_OUT and ANTIC_IN sets
+ buildsets(F);
+
+ // Phase 2: Insert
+ // This phase inserts values to make partially redundant values
+ // fully redundant
+ changed_function |= insertion(F);
+
+ // Phase 3: Eliminate
+ // This phase performs trivial full redundancy elimination
+ changed_function |= elimination();
+
+ // Phase 4: Cleanup
+ // This phase cleans up values that were created solely
+ // as leaders for expressions
+ cleanup();
+
+ return changed_function;
+}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
new file mode 100644
index 0000000..ca7aa7b
--- /dev/null
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -0,0 +1,880 @@
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// This transformation makes the following changes to each loop with an
+// identifiable induction variable:
+// 1. All loops are transformed to have a SINGLE canonical induction variable
+// which starts at zero and steps by one.
+// 2. The canonical induction variable is guaranteed to be the first PHI node
+// in the loop header block.
+// 3. Any pointer arithmetic recurrences are raised to use array subscripts.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+// 1. The exit condition for the loop is canonicalized to compare the
+// induction value against the exit value. This turns loops like:
+// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+// 2. Any use outside of the loop of an expression derived from the indvar
+// is changed to compute the derived value outside of the loop, eliminating
+// the dependence on the exit value of the induction variable. If the only
+// purpose of the loop is to compute the exit value of some derived
+// expression, this transformation will make the loop dead.
+//
+// This transformation should be followed by strength reduction after all of the
+// desired loop transformations have been performed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "indvars"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+STATISTIC(NumRemoved , "Number of aux indvars removed");
+STATISTIC(NumInserted, "Number of canonical indvars added");
+STATISTIC(NumReplaced, "Number of exit values replaced");
+STATISTIC(NumLFTR , "Number of loop exit tests replaced");
+
+namespace {
+ class VISIBILITY_HIDDEN IndVarSimplify : public LoopPass {
+ IVUsers *IU;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ bool Changed;
+ public:
+
+ static char ID; // Pass identification, replacement for typeid
+ IndVarSimplify() : LoopPass(&ID) {}
+
+ virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequiredID(LCSSAID);
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<IVUsers>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreserved<IVUsers>();
+ AU.addPreservedID(LCSSAID);
+ AU.setPreservesCFG();
+ }
+
+ private:
+
+ void RewriteNonIntegerIVs(Loop *L);
+
+ ICmpInst *LinearFunctionTestReplace(Loop *L, SCEVHandle BackedgeTakenCount,
+ Value *IndVar,
+ BasicBlock *ExitingBlock,
+ BranchInst *BI,
+ SCEVExpander &Rewriter);
+ void RewriteLoopExitValues(Loop *L, const SCEV *BackedgeTakenCount);
+
+ void RewriteIVExpressions(Loop *L, const Type *LargestType,
+ SCEVExpander &Rewriter);
+
+ void SinkUnusedInvariants(Loop *L, SCEVExpander &Rewriter);
+
+ void FixUsesBeforeDefs(Loop *L, SCEVExpander &Rewriter);
+
+ void HandleFloatingPointIV(Loop *L, PHINode *PH);
+ };
+}
+
+char IndVarSimplify::ID = 0;
+static RegisterPass<IndVarSimplify>
+X("indvars", "Canonicalize Induction Variables");
+
+Pass *llvm::createIndVarSimplifyPass() {
+ return new IndVarSimplify();
+}
+
+/// LinearFunctionTestReplace - This method rewrites the exit condition of the
+/// loop to be a canonical != comparison against the incremented loop induction
+/// variable. This pass is able to rewrite the exit tests of any loop where the
+/// SCEV analysis can determine a loop-invariant trip count of the loop, which
+/// is actually a much broader range than just linear tests.
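+///
+/// For example (an illustrative sketch): for a loop whose backedge-taken
+/// count SCEV evaluates to %n - 1, the exit branch is rewritten to test
+///   %exitcond = icmp ne i32 %indvar.next, %n
+/// against the incremented canonical induction variable.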
+ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
+ SCEVHandle BackedgeTakenCount,
+ Value *IndVar,
+ BasicBlock *ExitingBlock,
+ BranchInst *BI,
+ SCEVExpander &Rewriter) {
+  // If the exiting block is not the same as the backedge block, we must compare
+  // against the preincremented value; otherwise we prefer to compare against
+  // the post-incremented value.
+ Value *CmpIndVar;
+ SCEVHandle RHS = BackedgeTakenCount;
+ if (ExitingBlock == L->getLoopLatch()) {
+ // Add one to the "backedge-taken" count to get the trip count.
+ // If this addition may overflow, we have to be more pessimistic and
+ // cast the induction variable before doing the add.
+ SCEVHandle Zero = SE->getIntegerSCEV(0, BackedgeTakenCount->getType());
+ SCEVHandle N =
+ SE->getAddExpr(BackedgeTakenCount,
+ SE->getIntegerSCEV(1, BackedgeTakenCount->getType()));
+ if ((isa<SCEVConstant>(N) && !N->isZero()) ||
+ SE->isLoopGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) {
+ // No overflow. Cast the sum.
+ RHS = SE->getTruncateOrZeroExtend(N, IndVar->getType());
+ } else {
+ // Potential overflow. Cast before doing the add.
+ RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
+ IndVar->getType());
+ RHS = SE->getAddExpr(RHS,
+ SE->getIntegerSCEV(1, IndVar->getType()));
+ }
+
+ // The BackedgeTaken expression contains the number of times that the
+ // backedge branches to the loop header. This is one less than the
+ // number of times the loop executes, so use the incremented indvar.
+ CmpIndVar = L->getCanonicalInductionVariableIncrement();
+ } else {
+ // We have to use the preincremented value...
+ RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
+ IndVar->getType());
+ CmpIndVar = IndVar;
+ }
+
+ // Expand the code for the iteration count into the preheader of the loop.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ Value *ExitCnt = Rewriter.expandCodeFor(RHS, CmpIndVar->getType(),
+ Preheader->getTerminator());
+
+ // Insert a new icmp_ne or icmp_eq instruction before the branch.
+ ICmpInst::Predicate Opcode;
+ if (L->contains(BI->getSuccessor(0)))
+ Opcode = ICmpInst::ICMP_NE;
+ else
+ Opcode = ICmpInst::ICMP_EQ;
+
+ DOUT << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar // includes a newline
+ << " op:\t"
+ << (Opcode == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
+ << " RHS:\t" << *RHS << "\n";
+
+ ICmpInst *Cond = new ICmpInst(Opcode, CmpIndVar, ExitCnt, "exitcond", BI);
+
+ Instruction *OrigCond = cast<Instruction>(BI->getCondition());
+ // It's tempting to use replaceAllUsesWith here to fully replace the old
+ // comparison, but that's not immediately safe, since users of the old
+ // comparison may not be dominated by the new comparison. Instead, just
+ // update the branch to use the new comparison; in the common case this
+  // will make the old comparison dead.
+ BI->setCondition(Cond);
+ RecursivelyDeleteTriviallyDeadInstructions(OrigCond);
+
+ ++NumLFTR;
+ Changed = true;
+ return Cond;
+}
+
+/// RewriteLoopExitValues - Check to see if this loop has a computable
+/// loop-invariant execution count. If so, this means that we can compute the
+/// final value of any expressions that are recurrent in the loop, and
+/// substitute the exit values from the loop into any instructions outside of
+/// the loop that use the final values of the current expressions.
+///
+/// This is mostly redundant with the regular IndVarSimplify activities that
+/// happen later, except that it's more powerful in some cases, because it's
+/// able to brute-force evaluate arbitrary instructions as long as they have
+/// constant operands at the beginning of the loop.
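+///
+/// For example (an illustrative sketch): in
+///   for (i = 0; i != 10; ++i) sum += 2;
+///   use(sum);
+/// the trip count is 10, so the use outside the loop can be rewritten to
+/// use(20), potentially leaving the loop itself dead.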
+void IndVarSimplify::RewriteLoopExitValues(Loop *L,
+ const SCEV *BackedgeTakenCount) {
+  // Verify the input to the pass is already in LCSSA form.
+ assert(L->isLCSSAForm());
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // Scan all of the instructions in the loop, looking at those that have
+ // extra-loop users and which are recurrences.
+ SCEVExpander Rewriter(*SE);
+
+ // We insert the code into the preheader of the loop if the loop contains
+ // multiple exit blocks, or in the exit block if there is exactly one.
+ BasicBlock *BlockToInsertInto;
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() == 1)
+ BlockToInsertInto = ExitBlocks[0];
+ else
+ BlockToInsertInto = Preheader;
+ BasicBlock::iterator InsertPt = BlockToInsertInto->getFirstNonPHI();
+
+ std::map<Instruction*, Value*> ExitValues;
+
+ // Find all values that are computed inside the loop, but used outside of it.
+ // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
+ // the exit blocks of the loop to find them.
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBB = ExitBlocks[i];
+
+    // If there are no PHI nodes in this exit block, then no values defined
+    // inside the loop are used on this path; skip it.
+ PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+ if (!PN) continue;
+
+ unsigned NumPreds = PN->getNumIncomingValues();
+
+ // Iterate over all of the PHI nodes.
+ BasicBlock::iterator BBI = ExitBB->begin();
+ while ((PN = dyn_cast<PHINode>(BBI++))) {
+ if (PN->use_empty())
+ continue; // dead use, don't replace it
+ // Iterate over all of the values in all the PHI nodes.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+        // If the value being merged in is not an integer or pointer, or is
+        // not defined in the loop, skip it.
+ Value *InVal = PN->getIncomingValue(i);
+ if (!isa<Instruction>(InVal) ||
+ // SCEV only supports integer expressions for now.
+ (!isa<IntegerType>(InVal->getType()) &&
+ !isa<PointerType>(InVal->getType())))
+ continue;
+
+ // If this pred is for a subloop, not L itself, skip it.
+ if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+ continue; // The Block is in a subloop, skip it.
+
+ // Check that InVal is defined in the loop.
+ Instruction *Inst = cast<Instruction>(InVal);
+ if (!L->contains(Inst->getParent()))
+ continue;
+
+ // Okay, this instruction has a user outside of the current loop
+ // and varies predictably *inside* the loop. Evaluate the value it
+ // contains when the loop exits, if possible.
+ SCEVHandle ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ if (!ExitValue->isLoopInvariant(L))
+ continue;
+
+ Changed = true;
+ ++NumReplaced;
+
+        // See if we already computed the exit value for the instruction; if
+        // so, just reuse it.
+ Value *&ExitVal = ExitValues[Inst];
+ if (!ExitVal)
+ ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), InsertPt);
+
+ DOUT << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal
+ << " LoopVal = " << *Inst << "\n";
+
+ PN->setIncomingValue(i, ExitVal);
+
+ // If this instruction is dead now, delete it.
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+
+        // See if this is a single-entry LCSSA PHI node. If so, we can (and
+        // have to) remove the PHI entirely. This is safe, because ExitVal
+        // won't be variant in the loop, so we don't need an LCSSA phi node
+        // anymore.
+ if (NumPreds == 1) {
+ PN->replaceAllUsesWith(ExitVal);
+ RecursivelyDeleteTriviallyDeadInstructions(PN);
+ break;
+ }
+ }
+ }
+ }
+}
+
+void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
+ // First step. Check to see if there are any floating-point recurrences.
+ // If there are, change them into integer recurrences, permitting analysis by
+ // the SCEV routines.
+ //
+ BasicBlock *Header = L->getHeader();
+
+ SmallVector<WeakVH, 8> PHIs;
+ for (BasicBlock::iterator I = Header->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PHIs.push_back(PN);
+
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i]))
+ HandleFloatingPointIV(L, PN);
+
+  // If the loop previously had a floating-point IV, ScalarEvolution
+  // may not have been able to compute a trip count. Now that we've done some
+  // rewriting, the trip count may be computable.
+ if (Changed)
+ SE->forgetLoopBackedgeTakenCount(L);
+}
+
+bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+ IU = &getAnalysis<IVUsers>();
+ LI = &getAnalysis<LoopInfo>();
+ SE = &getAnalysis<ScalarEvolution>();
+ Changed = false;
+
+ // If there are any floating-point recurrences, attempt to
+ // transform them to use integer recurrences.
+ RewriteNonIntegerIVs(L);
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null
+ SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+
+ // Check to see if this loop has a computable loop-invariant execution count.
+ // If so, this means that we can compute the final value of any expressions
+ // that are recurrent in the loop, and substitute the exit values from the
+ // loop into any instructions outside of the loop that use the final values of
+ // the current expressions.
+ //
+ if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ RewriteLoopExitValues(L, BackedgeTakenCount);
+
+ // Compute the type of the largest recurrence expression, and decide whether
+ // a canonical induction variable should be inserted.
+ const Type *LargestType = 0;
+ bool NeedCannIV = false;
+ if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ LargestType = BackedgeTakenCount->getType();
+ LargestType = SE->getEffectiveSCEVType(LargestType);
+ // If we have a known trip count and a single exit block, we'll be
+ // rewriting the loop exit test condition below, which requires a
+ // canonical induction variable.
+ if (ExitingBlock)
+ NeedCannIV = true;
+ }
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ SCEVHandle Stride = IU->StrideOrder[i];
+ const Type *Ty = SE->getEffectiveSCEVType(Stride->getType());
+ if (!LargestType ||
+ SE->getTypeSizeInBits(Ty) >
+ SE->getTypeSizeInBits(LargestType))
+ LargestType = Ty;
+
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[i]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+
+ if (!SI->second->Users.empty())
+ NeedCannIV = true;
+ }
+
+ // Create a rewriter object which we'll use to transform the code with.
+ SCEVExpander Rewriter(*SE);
+
+  // Now that we know the largest of the induction variable expressions
+ // in this loop, insert a canonical induction variable of the largest size.
+ Value *IndVar = 0;
+ if (NeedCannIV) {
+ IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L,LargestType);
+ ++NumInserted;
+ Changed = true;
+ DOUT << "INDVARS: New CanIV: " << *IndVar;
+ }
+
+ // If we have a trip count expression, rewrite the loop's exit condition
+ // using it. We can currently only handle loops with a single exit.
+ ICmpInst *NewICmp = 0;
+ if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && ExitingBlock) {
+ assert(NeedCannIV &&
+ "LinearFunctionTestReplace requires a canonical induction variable");
+ // Can't rewrite non-branch yet.
+ if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
+ NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+ ExitingBlock, BI, Rewriter);
+ }
+
+ Rewriter.setInsertionPoint(Header->getFirstNonPHI());
+
+ // Rewrite IV-derived expressions. Clears the rewriter cache.
+ RewriteIVExpressions(L, LargestType, Rewriter);
+
+ // The Rewriter may only be used for isInsertedInstruction queries from this
+ // point on.
+
+ // Loop-invariant instructions in the preheader that aren't used in the
+ // loop may be sunk below the loop to reduce register pressure.
+ SinkUnusedInvariants(L, Rewriter);
+
+ // Reorder instructions to avoid use-before-def conditions.
+ FixUsesBeforeDefs(L, Rewriter);
+
+ // For completeness, inform IVUsers of the IV use in the newly-created
+ // loop exit test instruction.
+ if (NewICmp)
+ IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)));
+
+ // Clean up dead instructions.
+ DeleteDeadPHIs(L->getHeader());
+ // Check a post-condition.
+ assert(L->isLCSSAForm() && "Indvars did not leave the loop in lcssa form!");
+ return Changed;
+}
+
+void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType,
+ SCEVExpander &Rewriter) {
+ SmallVector<WeakVH, 16> DeadInsts;
+
+ // Rewrite all induction variable expressions in terms of the canonical
+ // induction variable.
+ //
+ // If there were induction variables of other sizes or offsets, manually
+ // add the offsets to the primary induction variable and cast, avoiding
+ // the need for the code evaluation methods to insert induction variables
+ // of different sizes.
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ SCEVHandle Stride = IU->StrideOrder[i];
+
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[i]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+ ilist<IVStrideUse> &List = SI->second->Users;
+ for (ilist<IVStrideUse>::iterator UI = List.begin(),
+ E = List.end(); UI != E; ++UI) {
+ SCEVHandle Offset = UI->getOffset();
+ Value *Op = UI->getOperandValToReplace();
+ Instruction *User = UI->getUser();
+ bool isSigned = UI->isSigned();
+
+ // Compute the final addrec to expand into code.
+ SCEVHandle AR = IU->getReplacementExpr(*UI);
+
+ // FIXME: It is an extremely bad idea to indvar substitute anything more
+ // complex than affine induction variables. Doing so will put expensive
+      // polynomial evaluations inside of the loop, and the strength reduction
+      // pass
+ // currently can only reduce affine polynomials. For now just disable
+ // indvar subst on anything more complex than an affine addrec, unless
+ // it can be expanded to a trivial value.
+ if (!Stride->isLoopInvariant(L) &&
+ !isa<SCEVConstant>(AR) &&
+ L->contains(User->getParent()))
+ continue;
+
+ Value *NewVal = 0;
+ if (AR->isLoopInvariant(L)) {
+ BasicBlock::iterator I = Rewriter.getInsertionPoint();
+ // Expand loop-invariant values in the loop preheader. They will
+ // be sunk to the exit block later, if possible.
+ NewVal =
+ Rewriter.expandCodeFor(AR, LargestType,
+ L->getLoopPreheader()->getTerminator());
+ Rewriter.setInsertionPoint(I);
+ ++NumReplaced;
+ } else {
+ const Type *IVTy = Offset->getType();
+ const Type *UseTy = Op->getType();
+
+ // Promote the Offset and Stride up to the canonical induction
+ // variable's bit width.
+ SCEVHandle PromotedOffset = Offset;
+ SCEVHandle PromotedStride = Stride;
+ if (SE->getTypeSizeInBits(IVTy) != SE->getTypeSizeInBits(LargestType)) {
+ // It doesn't matter for correctness whether zero or sign extension
+ // is used here, since the value is truncated away below, but if the
+ // value is signed, sign extension is more likely to be folded.
+ if (isSigned) {
+ PromotedOffset = SE->getSignExtendExpr(PromotedOffset, LargestType);
+ PromotedStride = SE->getSignExtendExpr(PromotedStride, LargestType);
+ } else {
+ PromotedOffset = SE->getZeroExtendExpr(PromotedOffset, LargestType);
+ // If the stride is obviously negative, use sign extension to
+ // produce things like x-1 instead of x+255.
+ if (isa<SCEVConstant>(PromotedStride) &&
+ cast<SCEVConstant>(PromotedStride)
+ ->getValue()->getValue().isNegative())
+ PromotedStride = SE->getSignExtendExpr(PromotedStride,
+ LargestType);
+ else
+ PromotedStride = SE->getZeroExtendExpr(PromotedStride,
+ LargestType);
+ }
+ }
+
+ // Create the SCEV representing the offset from the canonical
+ // induction variable, still in the canonical induction variable's
+ // type, so that all expanded arithmetic is done in the same type.
+ SCEVHandle NewAR = SE->getAddRecExpr(SE->getIntegerSCEV(0, LargestType),
+ PromotedStride, L);
+ // Add the PromotedOffset as a separate step, because it may not be
+ // loop-invariant.
+ NewAR = SE->getAddExpr(NewAR, PromotedOffset);
+
+ // Expand the addrec into instructions.
+ Value *V = Rewriter.expandCodeFor(NewAR);
+
+ // Insert an explicit cast if necessary to truncate the value
+ // down to the original stride type. This is done outside of
+ // SCEVExpander because in SCEV expressions, a truncate of an
+ // addrec is always folded.
+ if (LargestType != IVTy) {
+ if (SE->getTypeSizeInBits(IVTy) != SE->getTypeSizeInBits(LargestType))
+ NewAR = SE->getTruncateExpr(NewAR, IVTy);
+ if (Rewriter.isInsertedExpression(NewAR))
+ V = Rewriter.expandCodeFor(NewAR);
+ else {
+ V = Rewriter.InsertCastOfTo(CastInst::getCastOpcode(V, false,
+ IVTy, false),
+ V, IVTy);
+ assert(!isa<SExtInst>(V) && !isa<ZExtInst>(V) &&
+ "LargestType wasn't actually the largest type!");
+ // Force the rewriter to use this trunc whenever this addrec
+ // appears so that it doesn't insert new phi nodes or
+ // arithmetic in a different type.
+ Rewriter.addInsertedValue(V, NewAR);
+ }
+ }
+
+ DOUT << "INDVARS: Made offset-and-trunc IV for offset "
+ << *IVTy << " " << *Offset << ": ";
+ DEBUG(WriteAsOperand(*DOUT, V, false));
+ DOUT << "\n";
+
+ // Now expand it into actual Instructions and patch it into place.
+ NewVal = Rewriter.expandCodeFor(AR, UseTy);
+ }
+
+ // Patch the new value into place.
+ if (Op->hasName())
+ NewVal->takeName(Op);
+ User->replaceUsesOfWith(Op, NewVal);
+ UI->setOperandValToReplace(NewVal);
+ DOUT << "INDVARS: Rewrote IV '" << *AR << "' " << *Op
+ << " into = " << *NewVal << "\n";
+ ++NumRemoved;
+ Changed = true;
+
+ // The old value may be dead now.
+ DeadInsts.push_back(Op);
+ }
+ }
+
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted in the loop below, causing the AssertingVH in the cache to
+ // trigger.
+ Rewriter.clear();
+ // Now that we're done iterating through lists, clean up any instructions
+ // which are now dead.
+ while (!DeadInsts.empty()) {
+ Instruction *Inst = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
+ if (Inst)
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+}
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
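+///
+/// For example (an illustrative sketch): if the rewriter emitted
+///   %cnt = mul i32 %n, 4
+/// in the preheader and %cnt is used only in the exit block, the mul is moved
+/// into the exit block so it does not hold a register across the loop body.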
+void IndVarSimplify::SinkUnusedInvariants(Loop *L, SCEVExpander &Rewriter) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ if (!ExitBlock) return;
+
+ Instruction *NonPHI = ExitBlock->getFirstNonPHI();
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock::iterator I = Preheader->getTerminator();
+ while (I != Preheader->begin()) {
+ --I;
+ // New instructions were inserted at the end of the preheader. Only
+ // consider those new instructions.
+ if (!Rewriter.isInsertedInstruction(I))
+ break;
+ // Determine if there is a use in or before the loop (direct or
+ // otherwise).
+ bool UsedInLoop = false;
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI) {
+ BasicBlock *UseBB = cast<Instruction>(UI)->getParent();
+ if (PHINode *P = dyn_cast<PHINode>(UI)) {
+ unsigned i =
+ PHINode::getIncomingValueNumForOperand(UI.getOperandNo());
+ UseBB = P->getIncomingBlock(i);
+ }
+ if (UseBB == Preheader || L->contains(UseBB)) {
+ UsedInLoop = true;
+ break;
+ }
+ }
+ // If there is, the def must remain in the preheader.
+ if (UsedInLoop)
+ continue;
+ // Otherwise, sink it to the exit block.
+ Instruction *ToMove = I;
+ bool Done = false;
+ if (I != Preheader->begin())
+ --I;
+ else
+ Done = true;
+ ToMove->moveBefore(NonPHI);
+ if (Done)
+ break;
+ }
+}
+
+/// Re-schedule the inserted instructions to put defs before uses. This
+/// fixes problems that arise when SCEV expressions contain loop-variant
+/// values unrelated to the induction variable which are defined inside the
+/// loop. FIXME: It would be better to insert instructions in the right
+/// place so that this step isn't needed.
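+///
+/// For example (an illustrative sketch): if expansion emitted
+///   %sum = add i32 %x, %y
+/// at a point that precedes the definition of %y inside the loop, this pass
+/// moves %sum (and, recursively, its dependents) to just after that
+/// definition.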
+void IndVarSimplify::FixUsesBeforeDefs(Loop *L, SCEVExpander &Rewriter) {
+ // Visit all the blocks in the loop in pre-order dom-tree dfs order.
+ DominatorTree *DT = &getAnalysis<DominatorTree>();
+ std::map<Instruction *, unsigned> NumPredsLeft;
+ SmallVector<DomTreeNode *, 16> Worklist;
+ Worklist.push_back(DT->getNode(L->getHeader()));
+ do {
+ DomTreeNode *Node = Worklist.pop_back_val();
+ for (DomTreeNode::iterator I = Node->begin(), E = Node->end(); I != E; ++I)
+ if (L->contains((*I)->getBlock()))
+ Worklist.push_back(*I);
+ BasicBlock *BB = Node->getBlock();
+ // Visit all the instructions in the block top down.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // Count the number of operands that aren't properly dominating.
+ unsigned NumPreds = 0;
+ if (Rewriter.isInsertedInstruction(I) && !isa<PHINode>(I))
+ for (User::op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ if (L->contains(Inst->getParent()) && !NumPredsLeft.count(Inst))
+ ++NumPreds;
+ NumPredsLeft[I] = NumPreds;
+ // Notify uses of the position of this instruction, and move the
+ // users (and their dependents, recursively) into place after this
+ // instruction if it is their last outstanding operand.
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI) {
+ Instruction *Inst = cast<Instruction>(UI);
+ std::map<Instruction *, unsigned>::iterator Z = NumPredsLeft.find(Inst);
+ if (Z != NumPredsLeft.end() && Z->second != 0 && --Z->second == 0) {
+ SmallVector<Instruction *, 4> UseWorkList;
+ UseWorkList.push_back(Inst);
+ BasicBlock::iterator InsertPt = I;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(InsertPt))
+ InsertPt = II->getNormalDest()->begin();
+ else
+ ++InsertPt;
+ while (isa<PHINode>(InsertPt)) ++InsertPt;
+ do {
+ Instruction *Use = UseWorkList.pop_back_val();
+ Use->moveBefore(InsertPt);
+ NumPredsLeft.erase(Use);
+ for (Value::use_iterator IUI = Use->use_begin(),
+ IUE = Use->use_end(); IUI != IUE; ++IUI) {
+ Instruction *IUIInst = cast<Instruction>(IUI);
+ if (L->contains(IUIInst->getParent()) &&
+ Rewriter.isInsertedInstruction(IUIInst) &&
+ !isa<PHINode>(IUIInst))
+ UseWorkList.push_back(IUIInst);
+ }
+ } while (!UseWorkList.empty());
+ }
+ }
+ }
+ } while (!Worklist.empty());
+}
+
+/// Return true if it is OK to use SIToFPInst for an induction variable
+/// with given initial and exit values.
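+/// For example (an illustrative sketch): with InitV = 0.0 and ExitV = 100.0
+/// the iteration range is far below the signed 32-bit maximum, so SIToFPInst
+/// is acceptable; a negative initial or exit value likewise forces the
+/// signed form.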
+static bool useSIToFPInst(ConstantFP &InitV, ConstantFP &ExitV,
+ uint64_t intIV, uint64_t intEV) {
+
+ if (InitV.getValueAPF().isNegative() || ExitV.getValueAPF().isNegative())
+ return true;
+
+ // If the iteration range can be handled by SIToFPInst then use it.
+ APInt Max = APInt::getSignedMaxValue(32);
+ if (Max.getZExtValue() > static_cast<uint64_t>(abs64(intEV - intIV)))
+ return true;
+
+ return false;
+}
+
+/// convertToInt - Convert APF to an integer, if possible.
+static bool convertToInt(const APFloat &APF, uint64_t *intVal) {
+
+ bool isExact = false;
+ if (&APF.getSemantics() == &APFloat::PPCDoubleDouble)
+ return false;
+ if (APF.convertToInteger(intVal, 32, APF.isNegative(),
+ APFloat::rmTowardZero, &isExact)
+ != APFloat::opOK)
+ return false;
+ if (!isExact)
+ return false;
+ return true;
+
+}
+
+/// HandleFloatingPointIV - If the loop has a floating-point induction variable
+/// then insert a corresponding integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+/// bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+/// bar((double)i);
+///
+void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PH) {
+
+ unsigned IncomingEdge = L->contains(PH->getIncomingBlock(0));
+ unsigned BackEdge = IncomingEdge^1;
+
+ // Check incoming value.
+  ConstantFP *InitValue =
+    dyn_cast<ConstantFP>(PH->getIncomingValue(IncomingEdge));
+ if (!InitValue) return;
+ uint64_t newInitValue = Type::Int32Ty->getPrimitiveSizeInBits();
+ if (!convertToInt(InitValue->getValueAPF(), &newInitValue))
+ return;
+
+  // Check IV increment. Reject this PH if the increment operation is not
+  // an add or the increment value cannot be represented by an integer.
+ BinaryOperator *Incr =
+ dyn_cast<BinaryOperator>(PH->getIncomingValue(BackEdge));
+ if (!Incr) return;
+ if (Incr->getOpcode() != Instruction::Add) return;
+ ConstantFP *IncrValue = NULL;
+ unsigned IncrVIndex = 1;
+ if (Incr->getOperand(1) == PH)
+ IncrVIndex = 0;
+ IncrValue = dyn_cast<ConstantFP>(Incr->getOperand(IncrVIndex));
+ if (!IncrValue) return;
+ uint64_t newIncrValue = Type::Int32Ty->getPrimitiveSizeInBits();
+ if (!convertToInt(IncrValue->getValueAPF(), &newIncrValue))
+ return;
+
+  // Check Incr uses. One user is PH and the other user is the exit condition
+  // used by the conditional terminator.
+ Value::use_iterator IncrUse = Incr->use_begin();
+ Instruction *U1 = cast<Instruction>(IncrUse++);
+ if (IncrUse == Incr->use_end()) return;
+ Instruction *U2 = cast<Instruction>(IncrUse++);
+ if (IncrUse != Incr->use_end()) return;
+
+ // Find exit condition.
+ FCmpInst *EC = dyn_cast<FCmpInst>(U1);
+ if (!EC)
+ EC = dyn_cast<FCmpInst>(U2);
+ if (!EC) return;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(EC->getParent()->getTerminator())) {
+ if (!BI->isConditional()) return;
+ if (BI->getCondition() != EC) return;
+ }
+
+  // Find the exit value. If the exit value cannot be represented as an
+  // integer then do not handle this floating point PH.
+ ConstantFP *EV = NULL;
+ unsigned EVIndex = 1;
+ if (EC->getOperand(1) == Incr)
+ EVIndex = 0;
+ EV = dyn_cast<ConstantFP>(EC->getOperand(EVIndex));
+ if (!EV) return;
+ uint64_t intEV = Type::Int32Ty->getPrimitiveSizeInBits();
+ if (!convertToInt(EV->getValueAPF(), &intEV))
+ return;
+
+ // Find new predicate for integer comparison.
+ CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+ switch (EC->getPredicate()) {
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ:
+ NewPred = CmpInst::ICMP_EQ;
+ break;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT:
+ NewPred = CmpInst::ICMP_UGT;
+ break;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE:
+ NewPred = CmpInst::ICMP_UGE;
+ break;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT:
+ NewPred = CmpInst::ICMP_ULT;
+ break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE:
+ NewPred = CmpInst::ICMP_ULE;
+ break;
+ default:
+ break;
+ }
+ if (NewPred == CmpInst::BAD_ICMP_PREDICATE) return;
+
+ // Insert new integer induction variable.
+ PHINode *NewPHI = PHINode::Create(Type::Int32Ty,
+ PH->getName()+".int", PH);
+ NewPHI->addIncoming(ConstantInt::get(Type::Int32Ty, newInitValue),
+ PH->getIncomingBlock(IncomingEdge));
+
+ Value *NewAdd = BinaryOperator::CreateAdd(NewPHI,
+ ConstantInt::get(Type::Int32Ty,
+ newIncrValue),
+ Incr->getName()+".int", Incr);
+ NewPHI->addIncoming(NewAdd, PH->getIncomingBlock(BackEdge));
+
+  // The back edge is edge 1 of NewPHI, whatever it may have been in the
+  // original PHI.
+ ConstantInt *NewEV = ConstantInt::get(Type::Int32Ty, intEV);
+ Value *LHS = (EVIndex == 1 ? NewPHI->getIncomingValue(1) : NewEV);
+ Value *RHS = (EVIndex == 1 ? NewEV : NewPHI->getIncomingValue(1));
+ ICmpInst *NewEC = new ICmpInst(NewPred, LHS, RHS, EC->getNameStart(),
+ EC->getParent()->getTerminator());
+
+  // In the following deletions, PH may become dead and may be deleted.
+ // Use a WeakVH to observe whether this happens.
+ WeakVH WeakPH = PH;
+
+  // Delete the old floating-point exit comparison instruction.
+ NewEC->takeName(EC);
+ EC->replaceAllUsesWith(NewEC);
+ RecursivelyDeleteTriviallyDeadInstructions(EC);
+
+  // Delete the old floating-point increment instruction.
+ Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+ RecursivelyDeleteTriviallyDeadInstructions(Incr);
+
+  // Replace the floating-point induction variable, if it isn't already
+  // deleted. Give SIToFPInst preference over UIToFPInst because it is
+  // faster on widely used platforms.
+ if (WeakPH && !PH->use_empty()) {
+ if (useSIToFPInst(*InitValue, *EV, newInitValue, intEV)) {
+ SIToFPInst *Conv = new SIToFPInst(NewPHI, PH->getType(), "indvar.conv",
+ PH->getParent()->getFirstNonPHI());
+ PH->replaceAllUsesWith(Conv);
+ } else {
+ UIToFPInst *Conv = new UIToFPInst(NewPHI, PH->getType(), "indvar.conv",
+ PH->getParent()->getFirstNonPHI());
+ PH->replaceAllUsesWith(Conv);
+ }
+ RecursivelyDeleteTriviallyDeadInstructions(PH);
+ }
+
+ // Add a new IVUsers entry for the newly-created integer PHI.
+ IU->AddUsersIfInteresting(NewPHI);
+}
diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp
new file mode 100644
index 0000000..e6f854f
--- /dev/null
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -0,0 +1,12919 @@
+//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simpler
+// instructions. This pass does not modify the CFG. This pass is where
+// algebraic simplification happens.
+//
+// This pass combines things like:
+// %Y = add i32 %X, 1
+// %Z = add i32 %Y, 1
+// into:
+// %Z = add i32 %X, 2
+//
+// This is a simple worklist driven algorithm.
+//
+// This pass guarantees that the following canonicalizations are performed on
+// the program:
+// 1. If a binary operator has a constant operand, it is moved to the RHS
+// 2. Bitwise operators with constant operands are always grouped so that
+// shifts are performed first, then or's, then and's, then xor's.
+// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
+// 4. All cmp instructions on boolean values are replaced with logical ops
+// 5. add X, X is represented as (X*2) => (X << 1)
+// 6. Multiplies with a power-of-two constant argument are transformed into
+// shifts.
+// ... etc.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <climits>
+#include <sstream>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumCombined , "Number of insts combined");
+STATISTIC(NumConstProp, "Number of constant folds");
+STATISTIC(NumDeadInst , "Number of dead inst eliminated");
+STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+STATISTIC(NumSunkInst , "Number of instructions sunk");
+
+namespace {
+ class VISIBILITY_HIDDEN InstCombiner
+ : public FunctionPass,
+ public InstVisitor<InstCombiner, Instruction*> {
+ // Worklist of all of the instructions that need to be simplified.
+ SmallVector<Instruction*, 256> Worklist;
+ DenseMap<Instruction*, unsigned> WorklistMap;
+ TargetData *TD;
+ bool MustPreserveLCSSA;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ InstCombiner() : FunctionPass(&ID) {}
+
+ /// AddToWorkList - Add the specified instruction to the worklist if it
+ /// isn't already in it.
+ void AddToWorkList(Instruction *I) {
+ if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second)
+ Worklist.push_back(I);
+ }
+
+ // RemoveFromWorkList - remove I from the worklist if it exists.
+ void RemoveFromWorkList(Instruction *I) {
+ DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
+ if (It == WorklistMap.end()) return; // Not in worklist.
+
+ // Don't bother moving everything down, just null out the slot.
+ Worklist[It->second] = 0;
+
+ WorklistMap.erase(It);
+ }
+
+ Instruction *RemoveOneFromWorkList() {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+ WorklistMap.erase(I);
+ return I;
+ }
+
+
+ /// AddUsersToWorkList - When an instruction is simplified, add all users of
+ /// the instruction to the work lists because they might get more simplified
+ /// now.
+ ///
+ void AddUsersToWorkList(Value &I) {
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE; ++UI)
+ AddToWorkList(cast<Instruction>(*UI));
+ }
+
+ /// AddUsesToWorkList - When an instruction is simplified, add operands to
+ /// the work lists because they might get more simplified now.
+ ///
+ void AddUsesToWorkList(Instruction &I) {
+ for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(*i))
+ AddToWorkList(Op);
+ }
+
+ /// AddSoonDeadInstToWorklist - The specified instruction is about to become
+ /// dead. Add all of its operands to the worklist, turning them into
+ /// undef's to reduce the number of uses of those instructions.
+ ///
+ /// Return the specified operand before it is turned into an undef.
+ ///
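+    /// For example (a sketch, assuming both operands are instructions): for a
+    /// soon-dead "%s = add i32 %a, %b", calling this with op == 0 queues %a
+    /// and %b for revisiting, rewrites both operands of %s to undef, and
+    /// returns the original %a.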
+ Value *AddSoonDeadInstToWorklist(Instruction &I, unsigned op) {
+ Value *R = I.getOperand(op);
+
+ for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(*i)) {
+ AddToWorkList(Op);
+ // Set the operand to undef to drop the use.
+ *i = UndefValue::get(Op->getType());
+ }
+
+ return R;
+ }
+
+ public:
+ virtual bool runOnFunction(Function &F);
+
+ bool DoOneIteration(Function &F, unsigned ItNum);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ AU.addPreservedID(LCSSAID);
+ AU.setPreservesCFG();
+ }
+
+ TargetData &getTargetData() const { return *TD; }
+
+ // Visitation implementation - Implement instruction combining for different
+ // instruction types. The semantics are as follows:
+ // Return Value:
+ // null - No change was made
+ // I - Change was made, I is still valid, I may be dead though
+ // otherwise - Change was made, replace I with returned instruction
+ //
+ Instruction *visitAdd(BinaryOperator &I);
+ Instruction *visitSub(BinaryOperator &I);
+ Instruction *visitMul(BinaryOperator &I);
+ Instruction *visitURem(BinaryOperator &I);
+ Instruction *visitSRem(BinaryOperator &I);
+ Instruction *visitFRem(BinaryOperator &I);
+ bool SimplifyDivRemOfSelect(BinaryOperator &I);
+ Instruction *commonRemTransforms(BinaryOperator &I);
+ Instruction *commonIRemTransforms(BinaryOperator &I);
+ Instruction *commonDivTransforms(BinaryOperator &I);
+ Instruction *commonIDivTransforms(BinaryOperator &I);
+ Instruction *visitUDiv(BinaryOperator &I);
+ Instruction *visitSDiv(BinaryOperator &I);
+ Instruction *visitFDiv(BinaryOperator &I);
+ Instruction *FoldAndOfICmps(Instruction &I, ICmpInst *LHS, ICmpInst *RHS);
+ Instruction *visitAnd(BinaryOperator &I);
+ Instruction *FoldOrOfICmps(Instruction &I, ICmpInst *LHS, ICmpInst *RHS);
+ Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op,
+ Value *A, Value *B, Value *C);
+ Instruction *visitOr (BinaryOperator &I);
+ Instruction *visitXor(BinaryOperator &I);
+ Instruction *visitShl(BinaryOperator &I);
+ Instruction *visitAShr(BinaryOperator &I);
+ Instruction *visitLShr(BinaryOperator &I);
+ Instruction *commonShiftTransforms(BinaryOperator &I);
+ Instruction *FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI,
+ Constant *RHSC);
+ Instruction *visitFCmpInst(FCmpInst &I);
+ Instruction *visitICmpInst(ICmpInst &I);
+ Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
+ Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
+ Instruction *LHS,
+ ConstantInt *RHS);
+ Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+ ConstantInt *DivRHS);
+
+ Instruction *FoldGEPICmp(User *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond, Instruction &I);
+ Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+ BinaryOperator &I);
+ Instruction *commonCastTransforms(CastInst &CI);
+ Instruction *commonIntCastTransforms(CastInst &CI);
+ Instruction *commonPointerCastTransforms(CastInst &CI);
+ Instruction *visitTrunc(TruncInst &CI);
+ Instruction *visitZExt(ZExtInst &CI);
+ Instruction *visitSExt(SExtInst &CI);
+ Instruction *visitFPTrunc(FPTruncInst &CI);
+ Instruction *visitFPExt(CastInst &CI);
+ Instruction *visitFPToUI(FPToUIInst &FI);
+ Instruction *visitFPToSI(FPToSIInst &FI);
+ Instruction *visitUIToFP(CastInst &CI);
+ Instruction *visitSIToFP(CastInst &CI);
+ Instruction *visitPtrToInt(PtrToIntInst &CI);
+ Instruction *visitIntToPtr(IntToPtrInst &CI);
+ Instruction *visitBitCast(BitCastInst &CI);
+ Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+ Instruction *FI);
+ Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*);
+ Instruction *visitSelectInst(SelectInst &SI);
+ Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+ Instruction *visitCallInst(CallInst &CI);
+ Instruction *visitInvokeInst(InvokeInst &II);
+ Instruction *visitPHINode(PHINode &PN);
+ Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+ Instruction *visitAllocationInst(AllocationInst &AI);
+ Instruction *visitFreeInst(FreeInst &FI);
+ Instruction *visitLoadInst(LoadInst &LI);
+ Instruction *visitStoreInst(StoreInst &SI);
+ Instruction *visitBranchInst(BranchInst &BI);
+ Instruction *visitSwitchInst(SwitchInst &SI);
+ Instruction *visitInsertElementInst(InsertElementInst &IE);
+ Instruction *visitExtractElementInst(ExtractElementInst &EI);
+ Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+ Instruction *visitExtractValueInst(ExtractValueInst &EV);
+
+ // visitInstruction - Specify what to return for unhandled instructions...
+ Instruction *visitInstruction(Instruction &I) { return 0; }
+
+ private:
+ Instruction *visitCallSite(CallSite CS);
+ bool transformConstExprCastCall(CallSite CS);
+ Instruction *transformCallThroughTrampoline(CallSite CS);
+ Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
+ bool DoXform = true);
+ bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
+ DbgDeclareInst *hasOneUsePlusDeclare(Value *V);
+
+
+ public:
+ // InsertNewInstBefore - insert an instruction New before instruction Old
+ // in the program. Add the new instruction to the worklist.
+ //
+ Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
+ assert(New && New->getParent() == 0 &&
+ "New instruction already inserted into a basic block!");
+ BasicBlock *BB = Old.getParent();
+ BB->getInstList().insert(&Old, New); // Insert inst
+ AddToWorkList(New);
+ return New;
+ }
+
+ /// InsertCastBefore - Insert a cast of V to TY before the instruction POS.
+ /// This also adds the cast to the worklist. Finally, this returns the
+ /// cast.
+ Value *InsertCastBefore(Instruction::CastOps opc, Value *V, const Type *Ty,
+ Instruction &Pos) {
+ if (V->getType() == Ty) return V;
+
+ if (Constant *CV = dyn_cast<Constant>(V))
+ return ConstantExpr::getCast(opc, CV, Ty);
+
+ Instruction *C = CastInst::Create(opc, V, Ty, V->getName(), &Pos);
+ AddToWorkList(C);
+ return C;
+ }
+
+ Value *InsertBitCastBefore(Value *V, const Type *Ty, Instruction &Pos) {
+ return InsertCastBefore(Instruction::BitCast, V, Ty, Pos);
+ }
+
+
+ // ReplaceInstUsesWith - This method is to be used when an instruction is
+    // found to be dead, replaceable with another preexisting expression. Here
+ // we add all uses of I to the worklist, replace all uses of I with the new
+ // value, then return I, so that the inst combiner will know that I was
+ // modified.
+ //
+ Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
+ AddUsersToWorkList(I); // Add all modified instrs to worklist
+ if (&I != V) {
+ I.replaceAllUsesWith(V);
+ return &I;
+ } else {
+ // If we are replacing the instruction with itself, this must be in a
+ // segment of unreachable code, so just clobber the instruction.
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ return &I;
+ }
+ }
+
+ // EraseInstFromFunction - When dealing with an instruction that has side
+ // effects or produces a void value, we can't rely on DCE to delete the
+ // instruction. Instead, visit methods should return the value returned by
+ // this function.
+ Instruction *EraseInstFromFunction(Instruction &I) {
+ assert(I.use_empty() && "Cannot erase instruction that is used!");
+ AddUsesToWorkList(I);
+ RemoveFromWorkList(&I);
+ I.eraseFromParent();
+      return 0;  // Don't do anything with the erased instruction.
+ }
+
+ void ComputeMaskedBits(Value *V, const APInt &Mask, APInt &KnownZero,
+ APInt &KnownOne, unsigned Depth = 0) const {
+ return llvm::ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth);
+ }
+
+ bool MaskedValueIsZero(Value *V, const APInt &Mask,
+ unsigned Depth = 0) const {
+ return llvm::MaskedValueIsZero(V, Mask, TD, Depth);
+ }
+ unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0) const {
+ return llvm::ComputeNumSignBits(Op, TD, Depth);
+ }
+
+ private:
+
+ /// SimplifyCommutative - This performs a few simplifications for
+ /// commutative operators.
+ bool SimplifyCommutative(BinaryOperator &I);
+
+ /// SimplifyCompare - This reorders the operands of a CmpInst to get them in
+ /// most-complex to least-complex order.
+ bool SimplifyCompare(CmpInst &I);
+
+ /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
+ /// based on the demanded bits.
+ Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+ APInt& KnownZero, APInt& KnownOne,
+ unsigned Depth);
+ bool SimplifyDemandedBits(Use &U, APInt DemandedMask,
+ APInt& KnownZero, APInt& KnownOne,
+ unsigned Depth=0);
+
+ /// SimplifyDemandedInstructionBits - Inst is an integer instruction that
+ /// SimplifyDemandedBits knows about. See if the instruction has any
+ /// properties that allow us to simplify its operands.
+ bool SimplifyDemandedInstructionBits(Instruction &Inst);
+
+ Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+ APInt& UndefElts, unsigned Depth = 0);
+
+ // FoldOpIntoPhi - Given a binary operator or cast instruction which has a
+ // PHI node as operand #0, see if we can fold the instruction into the PHI
+ // (which is only possible if all operands to the PHI are constants).
+ Instruction *FoldOpIntoPhi(Instruction &I);
+
+ // FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
+ // operator and they all are only used by the PHI, PHI together their
+ // inputs, and do the operation once, to the result of the PHI.
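+    // For example (an illustrative sketch): a PHI whose incoming values are
+    // two "trunc i32 ... to i8" instructions, each used only by the PHI,
+    // becomes a PHI over the i32 inputs followed by a single trunc of the
+    // PHI result.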
+ Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
+ Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
+ Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
+
+
+ Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
+ ConstantInt *AndRHS, BinaryOperator &TheAnd);
+
+ Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
+ bool isSub, Instruction &I);
+ Instruction *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
+ bool isSigned, bool Inside, Instruction &IB);
+ Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocationInst &AI);
+ Instruction *MatchBSwap(BinaryOperator &I);
+ bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+ Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
+ Instruction *SimplifyMemSet(MemSetInst *MI);
+
+
+ Value *EvaluateInDifferentType(Value *V, const Type *Ty, bool isSigned);
+
+ bool CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
+ unsigned CastOpc, int &NumCastsRemoved);
+ unsigned GetOrEnforceKnownAlignment(Value *V,
+ unsigned PrefAlign = 0);
+
+ };
+}
+
+char InstCombiner::ID = 0;
+static RegisterPass<InstCombiner>
+X("instcombine", "Combine redundant instructions");
+
+// getComplexity: Assign a complexity or rank value to LLVM Values...
+// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst
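+// For example, in "add i32 %x, 1" where %x is an argument, %x ranks 3 while
+// the constant 1 ranks 1, so canonicalization keeps the constant on the RHS.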
+static unsigned getComplexity(Value *V) {
+ if (isa<Instruction>(V)) {
+ if (BinaryOperator::isNeg(V) || BinaryOperator::isNot(V))
+ return 3;
+ return 4;
+ }
+ if (isa<Argument>(V)) return 3;
+ return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
+}
+
+// isOnlyUse - Return true if this instruction will be deleted if we stop using
+// it.
+static bool isOnlyUse(Value *V) {
+ return V->hasOneUse() || isa<Constant>(V);
+}
+
+// getPromotedType - Return the specified type promoted as it would be to pass
+// through a va_arg area...
+static const Type *getPromotedType(const Type *Ty) {
+ if (const IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
+ if (ITy->getBitWidth() < 32)
+ return Type::Int32Ty;
+ }
+ return Ty;
+}
+
+/// getBitCastOperand - If the specified operand is a CastInst, a constant
+/// expression bitcast, or a GetElementPtrInst with all zero indices, return the
+/// operand value, otherwise return null.
+static Value *getBitCastOperand(Value *V) {
+ if (BitCastInst *I = dyn_cast<BitCastInst>(V))
+ // BitCastInst?
+ return I->getOperand(0);
+ else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+ // GetElementPtrInst?
+ if (GEP->hasAllZeroIndices())
+ return GEP->getOperand(0);
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->getOpcode() == Instruction::BitCast)
+ // BitCast ConstantExp?
+ return CE->getOperand(0);
+ else if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // GetElementPtr ConstantExp?
+ for (User::op_iterator I = CE->op_begin() + 1, E = CE->op_end();
+ I != E; ++I) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(I);
+ if (!CI || !CI->isZero())
+ // Any non-zero indices? Not cast-like.
+ return 0;
+ }
+ // All-zero indices? This is just like casting.
+ return CE->getOperand(0);
+ }
+ }
+ return 0;
+}
+
+/// This function is a wrapper around CastInst::isEliminableCastPair. It
+/// simply extracts arguments and returns what that function returns.
+static Instruction::CastOps
+isEliminableCastPair(
+ const CastInst *CI, ///< The first cast instruction
+ unsigned opcode, ///< The opcode of the second cast instruction
+ const Type *DstTy, ///< The target type for the second cast instruction
+ TargetData *TD ///< The target data for pointer size
+) {
+
+ const Type *SrcTy = CI->getOperand(0)->getType(); // A from above
+ const Type *MidTy = CI->getType(); // B from above
+
+ // Get the opcodes of the two Cast instructions
+ Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
+ Instruction::CastOps secondOp = Instruction::CastOps(opcode);
+
+ unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
+ DstTy, TD->getIntPtrType());
+
+ // We don't want to form an inttoptr or ptrtoint that converts to an integer
+ // type that differs from the pointer size.
+ if ((Res == Instruction::IntToPtr && SrcTy != TD->getIntPtrType()) ||
+ (Res == Instruction::PtrToInt && DstTy != TD->getIntPtrType()))
+ Res = 0;
+
+ return Instruction::CastOps(Res);
+}
+
+/// ValueRequiresCast - Return true if the cast from "V to Ty" actually results
+/// in any code being generated. It does not require codegen if V is simple
+/// enough or if the cast can be folded into other casts.
+static bool ValueRequiresCast(Instruction::CastOps opcode, const Value *V,
+ const Type *Ty, TargetData *TD) {
+ if (V->getType() == Ty || isa<Constant>(V)) return false;
+
+ // If this is another cast that can be eliminated, it isn't codegen either.
+ if (const CastInst *CI = dyn_cast<CastInst>(V))
+ if (isEliminableCastPair(CI, opcode, Ty, TD))
+ return false;
+ return true;
+}
+
+// SimplifyCommutative - This performs a few simplifications for commutative
+// operators:
+//
+// 1. Order operands such that they are listed from right (least complex) to
+// left (most complex). This puts constants before unary operators before
+// binary operators.
+//
+// 2. Transform: (op (op V, C1), C2) ==> (op V, (op C1, C2))
+// 3. Transform: (op (op V1, C1), (op V2, C2)) ==> (op (op V1, V2), (op C1,C2))
+//
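+// For example (an illustrative sketch): transform 2 turns
+// (add (add %x, 1), 2) into (add %x, 3), and transform 3 turns
+// (add (add %x, 1), (add %y, 2)) into (add (add %x, %y), 3).
+//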
+bool InstCombiner::SimplifyCommutative(BinaryOperator &I) {
+ bool Changed = false;
+ if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1)))
+ Changed = !I.swapOperands();
+
+ if (!I.isAssociative()) return Changed;
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ if (BinaryOperator *Op = dyn_cast<BinaryOperator>(I.getOperand(0)))
+ if (Op->getOpcode() == Opcode && isa<Constant>(Op->getOperand(1))) {
+ if (isa<Constant>(I.getOperand(1))) {
+ Constant *Folded = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(I.getOperand(1)),
+ cast<Constant>(Op->getOperand(1)));
+ I.setOperand(0, Op->getOperand(0));
+ I.setOperand(1, Folded);
+ return true;
+ } else if (BinaryOperator *Op1=dyn_cast<BinaryOperator>(I.getOperand(1)))
+ if (Op1->getOpcode() == Opcode && isa<Constant>(Op1->getOperand(1)) &&
+ isOnlyUse(Op) && isOnlyUse(Op1)) {
+ Constant *C1 = cast<Constant>(Op->getOperand(1));
+ Constant *C2 = cast<Constant>(Op1->getOperand(1));
+
+ // Fold (op (op V1, C1), (op V2, C2)) ==> (op (op V1, V2), (op C1,C2))
+ Constant *Folded = ConstantExpr::get(I.getOpcode(), C1, C2);
+ Instruction *New = BinaryOperator::Create(Opcode, Op->getOperand(0),
+ Op1->getOperand(0),
+ Op1->getName(), &I);
+ AddToWorkList(New);
+ I.setOperand(0, New);
+ I.setOperand(1, Folded);
+ return true;
+ }
+ }
+ return Changed;
+}
+
+/// SimplifyCompare - For a CmpInst this function just orders the operands
+/// so that they are listed from right (least complex) to left (most complex).
+/// This puts constants before unary operators before binary operators.
+bool InstCombiner::SimplifyCompare(CmpInst &I) {
+ if (getComplexity(I.getOperand(0)) >= getComplexity(I.getOperand(1)))
+ return false;
+ I.swapOperands();
+ // Compare instructions are not associative so there's nothing else we can do.
+ return true;
+}
+
+// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction
+// if the LHS is a constant zero (which is the 'negate' form).
+//
+static inline Value *dyn_castNegVal(Value *V) {
+ if (BinaryOperator::isNeg(V))
+ return BinaryOperator::getNegArgument(V);
+
+ // Constants can be considered to be negated values if they can be folded.
+ if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+ return ConstantExpr::getNeg(C);
+
+ if (ConstantVector *C = dyn_cast<ConstantVector>(V))
+ if (C->getType()->getElementType()->isInteger())
+ return ConstantExpr::getNeg(C);
+
+ return 0;
+}
+
+static inline Value *dyn_castNotVal(Value *V) {
+ if (BinaryOperator::isNot(V))
+ return BinaryOperator::getNotArgument(V);
+
+ // Constants can be considered to be not'ed values...
+ if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+ return ConstantInt::get(~C->getValue());
+ return 0;
+}
+
+// dyn_castFoldableMul - If this value is a multiply that can be folded into
+// other computations (because it has a constant operand), return the
+// non-constant operand of the multiply, and set CST to point to the multiplier.
+// Otherwise, return null.
+//
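+// For example, given a single-use "%m = shl i32 %x, 3", this returns %x and
+// sets CST to 8, since the shift is equivalent to multiplying by 1 << 3.
+//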
+static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) {
+ if (V->hasOneUse() && V->getType()->isInteger())
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->getOpcode() == Instruction::Mul)
+ if ((CST = dyn_cast<ConstantInt>(I->getOperand(1))))
+ return I->getOperand(0);
+ if (I->getOpcode() == Instruction::Shl)
+ if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) {
+ // The multiplier is really 1 << CST.
+ uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+ uint32_t CSTVal = CST->getLimitedValue(BitWidth);
+ CST = ConstantInt::get(APInt(BitWidth, 1).shl(CSTVal));
+ return I->getOperand(0);
+ }
+ }
+ return 0;
+}
+
+/// dyn_castGetElementPtr - If this is a getelementptr instruction or constant
+/// expression, return it.
+static User *dyn_castGetElementPtr(Value *V) {
+ if (isa<GetElementPtrInst>(V)) return cast<User>(V);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::GetElementPtr)
+ return cast<User>(V);
+  return 0;
+}
+
+/// getOpcode - If this is an Instruction or a ConstantExpr, return the
+/// opcode value. Otherwise return UserOp1.
+static unsigned getOpcode(const Value *V) {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return I->getOpcode();
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ return CE->getOpcode();
+ // Use UserOp1 to mean there's no opcode.
+ return Instruction::UserOp1;
+}
+
+/// AddOne - Add one to a ConstantInt
+static ConstantInt *AddOne(ConstantInt *C) {
+ APInt Val(C->getValue());
+ return ConstantInt::get(++Val);
+}
+/// SubOne - Subtract one from a ConstantInt
+static ConstantInt *SubOne(ConstantInt *C) {
+ APInt Val(C->getValue());
+ return ConstantInt::get(--Val);
+}
+/// Add - Add two ConstantInts together
+static ConstantInt *Add(ConstantInt *C1, ConstantInt *C2) {
+ return ConstantInt::get(C1->getValue() + C2->getValue());
+}
+/// And - Bitwise AND two ConstantInts together
+static ConstantInt *And(ConstantInt *C1, ConstantInt *C2) {
+ return ConstantInt::get(C1->getValue() & C2->getValue());
+}
+/// Subtract - Subtract one ConstantInt from another
+static ConstantInt *Subtract(ConstantInt *C1, ConstantInt *C2) {
+ return ConstantInt::get(C1->getValue() - C2->getValue());
+}
+/// Multiply - Multiply two ConstantInts together
+static ConstantInt *Multiply(ConstantInt *C1, ConstantInt *C2) {
+ return ConstantInt::get(C1->getValue() * C2->getValue());
+}
+/// MultiplyOverflows - True if the multiply cannot be represented in an
+/// integer of this size.
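+/// For example, in 8 bits, 16 * 16 = 256 overflows both signed and unsigned,
+/// while 16 * 7 = 112 fits in both.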
+static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
+ uint32_t W = C1->getBitWidth();
+ APInt LHSExt = C1->getValue(), RHSExt = C2->getValue();
+ if (sign) {
+ LHSExt.sext(W * 2);
+ RHSExt.sext(W * 2);
+ } else {
+ LHSExt.zext(W * 2);
+ RHSExt.zext(W * 2);
+ }
+
+ APInt MulExt = LHSExt * RHSExt;
+
+ if (sign) {
+ APInt Min = APInt::getSignedMinValue(W).sext(W * 2);
+ APInt Max = APInt::getSignedMaxValue(W).sext(W * 2);
+ return MulExt.slt(Min) || MulExt.sgt(Max);
+ } else
+ return MulExt.ugt(APInt::getLowBitsSet(W * 2, W));
+}
+
+
+/// ShrinkDemandedConstant - Check to see if the specified operand of the
+/// specified instruction is a constant integer. If so, check to see if there
+/// are any bits set in the constant that are not demanded. If so, shrink the
+/// constant and return true.
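+/// For example, if I is 'and X, 255' and only the low four bits of the
+/// result are demanded, the constant is shrunk to 15.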
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
+ APInt Demanded) {
+ assert(I && "No instruction?");
+ assert(OpNo < I->getNumOperands() && "Operand index too large");
+
+ // If the operand is not a constant integer, nothing to do.
+ ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
+ if (!OpC) return false;
+
+ // If there are no bits set that aren't demanded, nothing to do.
+ Demanded.zextOrTrunc(OpC->getValue().getBitWidth());
+ if ((~Demanded & OpC->getValue()) == 0)
+ return false;
+
+ // This instruction is producing bits that are not demanded. Shrink the RHS.
+ Demanded &= OpC->getValue();
+ I->setOperand(OpNo, ConstantInt::get(Demanded));
+ return true;
+}
+
+// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a
+// set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
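+// For example, for 4-bit values with KnownZero = 0100 and KnownOne = 0001,
+// the unknown bits are 1010; since the sign bit is unknown, Min = 1001 (-7)
+// and Max = 0011 (+3).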
+static void ComputeSignedMinMaxValuesFromKnownBits(const APInt& KnownZero,
+ const APInt& KnownOne,
+ APInt& Min, APInt& Max) {
+ assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
+ KnownZero.getBitWidth() == Min.getBitWidth() &&
+ KnownZero.getBitWidth() == Max.getBitWidth() &&
+ "KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+ APInt UnknownBits = ~(KnownZero|KnownOne);
+
+ // The minimum value is when all unknown bits are zeros, EXCEPT for the sign
+ // bit if it is unknown.
+ Min = KnownOne;
+ Max = KnownOne|UnknownBits;
+
+ if (UnknownBits.isNegative()) { // Sign bit is unknown
+ Min.set(Min.getBitWidth()-1);
+ Max.clear(Max.getBitWidth()-1);
+ }
+}
+
+// ComputeUnsignedMinMaxValuesFromKnownBits - Given an unsigned integer type and
+// a set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
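+// For the same 4-bit example as above (KnownZero = 0100, KnownOne = 0001),
+// Min = 0001 (1) and Max = 1011 (11).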
+static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
+ const APInt &KnownOne,
+ APInt &Min, APInt &Max) {
+ assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
+ KnownZero.getBitWidth() == Min.getBitWidth() &&
+ KnownZero.getBitWidth() == Max.getBitWidth() &&
+ "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+ APInt UnknownBits = ~(KnownZero|KnownOne);
+
+ // The minimum value is when the unknown bits are all zeros.
+ Min = KnownOne;
+ // The maximum value is when the unknown bits are all ones.
+ Max = KnownOne|UnknownBits;
+}
+
+/// SimplifyDemandedInstructionBits - Inst is an integer instruction that
+/// SimplifyDemandedBits knows about. See if the instruction has any
+/// properties that allow us to simplify its operands.
+bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
+ unsigned BitWidth = cast<IntegerType>(Inst.getType())->getBitWidth();
+ APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
+
+ Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask,
+ KnownZero, KnownOne, 0);
+ if (V == 0) return false;
+ if (V == &Inst) return true;
+ ReplaceInstUsesWith(Inst, V);
+ return true;
+}
+
+/// SimplifyDemandedBits - This form of SimplifyDemandedBits simplifies the
+/// specified instruction operand if possible, updating it in place. It returns
+/// true if it made any change and false otherwise.
+bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
+ APInt &KnownZero, APInt &KnownOne,
+ unsigned Depth) {
+ Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask,
+ KnownZero, KnownOne, Depth);
+ if (NewVal == 0) return false;
+ U.set(NewVal);
+ return true;
+}
+
+
+/// SimplifyDemandedUseBits - This function attempts to replace V with a simpler
+/// value based on the demanded bits. When this function is called, it is known
+/// that only the bits set in DemandedMask of the result of V are ever used
+/// downstream. Consequently, depending on the mask and V, it may be possible
+/// to replace V with a constant or one of its operands. In such cases, this
+/// function does the replacement and returns the simplified value. In all
+/// other cases, it returns null after analyzing the expression and setting
+/// KnownOne to all the bits that are known to be one in the expression and
+/// KnownZero to all the bits that are known to be zero in the expression.
+/// These are provided to potentially allow the
+/// caller (which might recursively be SimplifyDemandedBits itself) to simplify
+/// the expression. KnownOne and KnownZero always follow the invariant that
+/// KnownOne & KnownZero == 0. That is, a bit can't be both 1 and 0. Note that
+/// the bits in KnownOne and KnownZero may only be accurate for those bits set
+/// in DemandedMask. Note also that the bitwidth of V, DemandedMask, KnownZero
+/// and KnownOne must all be the same.
+///
+/// This returns null if it did not change anything and it permits no
+/// simplification. This returns V itself if it did some simplification of V's
+/// operands based on the information about what bits are demanded. This returns
+/// some other non-null value if it found out that V is equal to another value
+/// in the context where the specified bits are demanded, but not for all users.
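+///
+/// For example, if V is 'and X, 0xFF00' and DemandedMask covers only the low
+/// byte, every demanded bit of V is known zero, so V can be replaced by the
+/// zero constant of its type.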
+Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+ APInt &KnownZero, APInt &KnownOne,
+ unsigned Depth) {
+ assert(V != 0 && "Null pointer of Value???");
+ assert(Depth <= 6 && "Limit Search Depth");
+ uint32_t BitWidth = DemandedMask.getBitWidth();
+ const Type *VTy = V->getType();
+ assert((TD || !isa<PointerType>(VTy)) &&
+ "SimplifyDemandedBits needs to know bit widths!");
+ assert((!TD || TD->getTypeSizeInBits(VTy) == BitWidth) &&
+ (!isa<IntegerType>(VTy) ||
+ VTy->getPrimitiveSizeInBits() == BitWidth) &&
+ KnownZero.getBitWidth() == BitWidth &&
+ KnownOne.getBitWidth() == BitWidth &&
+ "Value *V, DemandedMask, KnownZero and KnownOne \
+ must have same BitWidth");
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ // We know all of the bits for a constant!
+ KnownOne = CI->getValue() & DemandedMask;
+ KnownZero = ~KnownOne & DemandedMask;
+ return 0;
+ }
+ if (isa<ConstantPointerNull>(V)) {
+ // We know all of the bits for a constant!
+ KnownOne.clear();
+ KnownZero = DemandedMask;
+ return 0;
+ }
+
+ KnownZero.clear();
+ KnownOne.clear();
+ if (DemandedMask == 0) { // Not demanding any bits from V.
+ if (isa<UndefValue>(V))
+ return 0;
+ return UndefValue::get(VTy);
+ }
+
+ if (Depth == 6) // Limit search depth.
+ return 0;
+
+ APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+ APInt &RHSKnownZero = KnownZero, &RHSKnownOne = KnownOne;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+ return 0; // Only analyze instructions.
+ }
+
+ // If there are multiple uses of this value and we aren't at the root, then
+ // we can't do any simplifications of the operands, because DemandedMask
+ // only reflects the bits demanded by *one* of the users.
+ if (Depth != 0 && !I->hasOneUse()) {
+    // Despite the fact that we can't simplify this instruction in all users'
+    // contexts, we can at least compute the known zero/one bits, and we can
+ // do simplifications that apply to *just* the one user if we know that
+ // this instruction has a simpler value in that context.
+ if (I->getOpcode() == Instruction::And) {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ ComputeMaskedBits(I->getOperand(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1);
+ ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero,
+ LHSKnownZero, LHSKnownOne, Depth+1);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
+ (DemandedMask & ~LHSKnownZero))
+ return I->getOperand(0);
+ if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
+ (DemandedMask & ~RHSKnownZero))
+ return I->getOperand(1);
+
+ // If all of the demanded bits in the inputs are known zeros, return zero.
+ if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+ return Constant::getNullValue(VTy);
+
+ } else if (I->getOpcode() == Instruction::Or) {
+ // We can simplify (X|Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ // If either the LHS or the RHS are One, the result is One.
+ ComputeMaskedBits(I->getOperand(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1);
+ ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne,
+ LHSKnownZero, LHSKnownOne, Depth+1);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
+ (DemandedMask & ~LHSKnownOne))
+ return I->getOperand(0);
+ if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
+ (DemandedMask & ~RHSKnownOne))
+ return I->getOperand(1);
+
+ // If all of the potentially set bits on one side are known to be set on
+ // the other side, just use the 'other' side.
+ if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
+ (DemandedMask & (~RHSKnownZero)))
+ return I->getOperand(0);
+ if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
+ (DemandedMask & (~LHSKnownZero)))
+ return I->getOperand(1);
+ }
+
+ // Compute the KnownZero/KnownOne bits to simplify things downstream.
+ ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth);
+ return 0;
+ }
+
+ // If this is the root being simplified, allow it to have multiple uses,
+ // just set the DemandedMask to all bits so that we can try to simplify the
+ // operands. This allows visitTruncInst (for example) to simplify the
+ // operand of a trunc without duplicating all the logic below.
+ if (Depth == 0 && !V->hasOneUse())
+ DemandedMask = APInt::getAllOnesValue(BitWidth);
+
+ switch (I->getOpcode()) {
+ default:
+ ComputeMaskedBits(I, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+ break;
+ case Instruction::And:
+ // If either the LHS or the RHS are Zero, the result is zero.
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
+ LHSKnownZero, LHSKnownOne, Depth+1))
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and'.
+ if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
+ (DemandedMask & ~LHSKnownZero))
+ return I->getOperand(0);
+ if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
+ (DemandedMask & ~RHSKnownZero))
+ return I->getOperand(1);
+
+ // If all of the demanded bits in the inputs are known zeros, return zero.
+ if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+ return Constant::getNullValue(VTy);
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero))
+ return I;
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ RHSKnownOne &= LHSKnownOne;
+ // Output known-0 are known to be clear if zero in either the LHS | RHS.
+ RHSKnownZero |= LHSKnownZero;
+ break;
+ case Instruction::Or:
+ // If either the LHS or the RHS are One, the result is One.
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
+ LHSKnownZero, LHSKnownOne, Depth+1))
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'or'.
+ if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
+ (DemandedMask & ~LHSKnownOne))
+ return I->getOperand(0);
+ if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
+ (DemandedMask & ~RHSKnownOne))
+ return I->getOperand(1);
+
+ // If all of the potentially set bits on one side are known to be set on
+ // the other side, just use the 'other' side.
+ if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
+ (DemandedMask & (~RHSKnownZero)))
+ return I->getOperand(0);
+ if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
+ (DemandedMask & (~LHSKnownZero)))
+ return I->getOperand(1);
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return I;
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ RHSKnownZero &= LHSKnownZero;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ RHSKnownOne |= LHSKnownOne;
+ break;
+ case Instruction::Xor: {
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+ LHSKnownZero, LHSKnownOne, Depth+1))
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'xor'.
+ if ((DemandedMask & RHSKnownZero) == DemandedMask)
+ return I->getOperand(0);
+ if ((DemandedMask & LHSKnownZero) == DemandedMask)
+ return I->getOperand(1);
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt KnownZeroOut = (RHSKnownZero & LHSKnownZero) |
+ (RHSKnownOne & LHSKnownOne);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ APInt KnownOneOut = (RHSKnownZero & LHSKnownOne) |
+ (RHSKnownOne & LHSKnownZero);
+
+ // If all of the demanded bits are known to be zero on one side or the
+ // other, turn this into an *inclusive* or.
+ // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+ if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) {
+ Instruction *Or =
+ BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+ I->getName());
+ return InsertNewInstBefore(Or, *I);
+ }
+
+ // If all of the demanded bits on one side are known, and all of the set
+ // bits on that side are also known to be set on the other side, turn this
+ // into an AND, as we know the bits will be cleared.
+ // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+ if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) {
+ // all known
+ if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) {
+ Constant *AndC = ConstantInt::get(~RHSKnownOne & DemandedMask);
+ Instruction *And =
+ BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
+ return InsertNewInstBefore(And, *I);
+ }
+ }
+
+ // If the RHS is a constant, see if we can simplify it.
+ // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return I;
+
+ RHSKnownZero = KnownZeroOut;
+ RHSKnownOne = KnownOneOut;
+ break;
+ }
+ case Instruction::Select:
+ if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ LHSKnownZero, LHSKnownOne, Depth+1))
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask) ||
+ ShrinkDemandedConstant(I, 2, DemandedMask))
+ return I;
+
+ // Only known if known in both the LHS and RHS.
+ RHSKnownOne &= LHSKnownOne;
+ RHSKnownZero &= LHSKnownZero;
+ break;
+ case Instruction::Trunc: {
+ unsigned truncBf = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ DemandedMask.zext(truncBf);
+ RHSKnownZero.zext(truncBf);
+ RHSKnownOne.zext(truncBf);
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1))
+ return I;
+ DemandedMask.trunc(BitWidth);
+ RHSKnownZero.trunc(BitWidth);
+ RHSKnownOne.trunc(BitWidth);
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ break;
+ }
+ case Instruction::BitCast:
+ if (!I->getOperand(0)->getType()->isInteger())
+      return 0;    // vector->int or fp->int?
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1))
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ break;
+ case Instruction::ZExt: {
+ // Compute the bits in the result that are not present in the input.
+ unsigned SrcBitWidth =I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+
+ DemandedMask.trunc(SrcBitWidth);
+ RHSKnownZero.trunc(SrcBitWidth);
+ RHSKnownOne.trunc(SrcBitWidth);
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1))
+ return I;
+ DemandedMask.zext(BitWidth);
+ RHSKnownZero.zext(BitWidth);
+ RHSKnownOne.zext(BitWidth);
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ // The top bits are known to be zero.
+ RHSKnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ break;
+ }
+ case Instruction::SExt: {
+ // Compute the bits in the result that are not present in the input.
+ unsigned SrcBitWidth =I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+
+ APInt InputDemandedBits = DemandedMask &
+ APInt::getLowBitsSet(BitWidth, SrcBitWidth);
+
+ APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth));
+ // If any of the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ if ((NewBits & DemandedMask) != 0)
+ InputDemandedBits.set(SrcBitWidth-1);
+
+ InputDemandedBits.trunc(SrcBitWidth);
+ RHSKnownZero.trunc(SrcBitWidth);
+ RHSKnownOne.trunc(SrcBitWidth);
+ if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits,
+ RHSKnownZero, RHSKnownOne, Depth+1))
+ return I;
+ InputDemandedBits.zext(BitWidth);
+ RHSKnownZero.zext(BitWidth);
+ RHSKnownOne.zext(BitWidth);
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+
+ // If the input sign bit is known zero, or if the NewBits are not demanded
+ // convert this into a zero extension.
+ if (RHSKnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) {
+ // Convert to ZExt cast
+ CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
+ return InsertNewInstBefore(NewCast, *I);
+ } else if (RHSKnownOne[SrcBitWidth-1]) { // Input sign bit known set
+ RHSKnownOne |= NewBits;
+ }
+ break;
+ }
+ case Instruction::Add: {
+    // Figure out what the input bits are. If the top bits of the add result
+    // are not demanded, then the add doesn't demand them from its input
+    // either.
+ unsigned NLZ = DemandedMask.countLeadingZeros();
+
+ // If there is a constant on the RHS, there are a variety of xformations
+ // we can do.
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ // If null, this should be simplified elsewhere. Some of the xforms here
+ // won't work if the RHS is zero.
+ if (RHS->isZero())
+ break;
+
+ // If the top bit of the output is demanded, demand everything from the
+ // input. Otherwise, we demand all the input bits except NLZ top bits.
+ APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ));
+
+ // Find information about known zero/one bits in the input.
+ if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits,
+ LHSKnownZero, LHSKnownOne, Depth+1))
+ return I;
+
+ // If the RHS of the add has bits set that can't affect the input, reduce
+ // the constant.
+ if (ShrinkDemandedConstant(I, 1, InDemandedBits))
+ return I;
+
+ // Avoid excess work.
+ if (LHSKnownZero == 0 && LHSKnownOne == 0)
+ break;
+
+ // Turn it into OR if input bits are zero.
+ if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) {
+ Instruction *Or =
+ BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+ I->getName());
+ return InsertNewInstBefore(Or, *I);
+ }
+
+ // We can say something about the output known-zero and known-one bits,
+ // depending on potential carries from the input constant and the
+ // unknowns. For example if the LHS is known to have at most the 0x0F0F0
+ // bits set and the RHS constant is 0x01001, then we know we have a known
+ // one mask of 0x00001 and a known zero mask of 0xE0F0E.
+
+ // To compute this, we first compute the potential carry bits. These are
+ // the bits which may be modified. I'm not aware of a better way to do
+ // this scan.
+ const APInt &RHSVal = RHS->getValue();
+ APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal));
+
+ // Now that we know which bits have carries, compute the known-1/0 sets.
+
+ // Bits are known one if they are known zero in one operand and one in the
+ // other, and there is no input carry.
+ RHSKnownOne = ((LHSKnownZero & RHSVal) |
+ (LHSKnownOne & ~RHSVal)) & ~CarryBits;
+
+ // Bits are known zero if they are known zero in both operands and there
+ // is no input carry.
+ RHSKnownZero = LHSKnownZero & ~RHSVal & ~CarryBits;
+ } else {
+ // If the high-bits of this ADD are not demanded, then it does not demand
+ // the high bits of its LHS or RHS.
+ if (DemandedMask[BitWidth-1] == 0) {
+        // Right fill the mask of bits for this ADD to demand the most
+        // significant demanded bit and all those below it.
+ APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
+ LHSKnownZero, LHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
+ LHSKnownZero, LHSKnownOne, Depth+1))
+ return I;
+ }
+ }
+ break;
+ }
+ case Instruction::Sub:
+ // If the high-bits of this SUB are not demanded, then it does not demand
+ // the high bits of its LHS or RHS.
+ if (DemandedMask[BitWidth-1] == 0) {
+      // Right fill the mask of bits for this SUB to demand the most
+      // significant demanded bit and all those below it.
+ uint32_t NLZ = DemandedMask.countLeadingZeros();
+ APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
+ LHSKnownZero, LHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
+ LHSKnownZero, LHSKnownOne, Depth+1))
+ return I;
+ }
+ // Otherwise just hand the sub off to ComputeMaskedBits to fill in
+ // the known zeros and ones.
+ ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+ break;
+ case Instruction::Shl:
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+ APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+ RHSKnownZero, RHSKnownOne, Depth+1))
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ RHSKnownZero <<= ShiftAmt;
+ RHSKnownOne <<= ShiftAmt;
+ // low bits known zero.
+ if (ShiftAmt)
+ RHSKnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ }
+ break;
+ case Instruction::LShr:
+ // For a logical shift right
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+
+ // Unsigned shift right.
+ APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+ RHSKnownZero, RHSKnownOne, Depth+1))
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ RHSKnownZero = APIntOps::lshr(RHSKnownZero, ShiftAmt);
+ RHSKnownOne = APIntOps::lshr(RHSKnownOne, ShiftAmt);
+ if (ShiftAmt) {
+ // Compute the new bits that are at the top now.
+ APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+ RHSKnownZero |= HighBits; // high bits known zero.
+ }
+ }
+ break;
+ case Instruction::AShr:
+ // If this is an arithmetic shift right and only the low-bit is set, we can
+ // always convert this into a logical shr, even if the shift amount is
+ // variable. The low bit of the shift cannot be an input sign bit unless
+ // the shift amount is >= the size of the datatype, which is undefined.
+ if (DemandedMask == 1) {
+ // Perform the logical shift right.
+ Instruction *NewVal = BinaryOperator::CreateLShr(
+ I->getOperand(0), I->getOperand(1), I->getName());
+ return InsertNewInstBefore(NewVal, *I);
+ }
+
+ // If the sign bit is the only bit demanded by this ashr, then there is no
+ // need to do it, the shift doesn't change the high bit.
+ if (DemandedMask.isSignBit())
+ return I->getOperand(0);
+
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint32_t ShiftAmt = SA->getLimitedValue(BitWidth);
+
+ // Signed shift right.
+ APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+ // If any of the "high bits" are demanded, we should set the sign bit as
+ // demanded.
+ if (DemandedMask.countLeadingZeros() <= ShiftAmt)
+ DemandedMaskIn.set(BitWidth-1);
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+ RHSKnownZero, RHSKnownOne, Depth+1))
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ // Compute the new bits that are at the top now.
+ APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+ RHSKnownZero = APIntOps::lshr(RHSKnownZero, ShiftAmt);
+ RHSKnownOne = APIntOps::lshr(RHSKnownOne, ShiftAmt);
+
+ // Handle the sign bits.
+ APInt SignBit(APInt::getSignBit(BitWidth));
+ // Adjust to where it is now in the mask.
+ SignBit = APIntOps::lshr(SignBit, ShiftAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (BitWidth <= ShiftAmt || RHSKnownZero[BitWidth-ShiftAmt-1] ||
+ (HighBits & ~DemandedMask) == HighBits) {
+ // Perform the logical shift right.
+ Instruction *NewVal = BinaryOperator::CreateLShr(
+ I->getOperand(0), SA, I->getName());
+ return InsertNewInstBefore(NewVal, *I);
+ } else if ((RHSKnownOne & SignBit) != 0) { // New bits are known one.
+ RHSKnownOne |= HighBits;
+ }
+ }
+ break;
+ case Instruction::SRem:
+ if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ APInt RA = Rem->getValue().abs();
+ if (RA.isPowerOf2()) {
+        if (DemandedMask.ult(RA)) // srem won't affect demanded bits
+ return I->getOperand(0);
+
+ APInt LowBits = RA - 1;
+ APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
+ if (SimplifyDemandedBits(I->getOperandUse(0), Mask2,
+ LHSKnownZero, LHSKnownOne, Depth+1))
+ return I;
+
+ if (LHSKnownZero[BitWidth-1] || ((LHSKnownZero & LowBits) == LowBits))
+ LHSKnownZero |= ~LowBits;
+
+ KnownZero |= LHSKnownZero & DemandedMask;
+
+ assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+ }
+ }
+ break;
+ case Instruction::URem: {
+ APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes,
+ KnownZero2, KnownOne2, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), AllOnes,
+ KnownZero2, KnownOne2, Depth+1))
+ return I;
+
+    // KnownZero2 now describes the divisor (the operand simplified last);
+    // a urem result is always smaller than its divisor, so the result
+    // shares the divisor's known-zero high bits.
+    unsigned Leaders = KnownZero2.countLeadingOnes();
+ KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+ break;
+ }
+ case Instruction::Call:
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::bswap: {
+ // If the only bits demanded come from one byte of the bswap result,
+ // just shift the input byte into position to eliminate the bswap.
+ unsigned NLZ = DemandedMask.countLeadingZeros();
+ unsigned NTZ = DemandedMask.countTrailingZeros();
+
+ // Round NTZ down to the next byte. If we have 11 trailing zeros, then
+ // we need all the bits down to bit 8. Likewise, round NLZ. If we
+ // have 14 leading zeros, round to 8.
+ NLZ &= ~7;
+ NTZ &= ~7;
+ // If we need exactly one byte, we can do this transformation.
+ if (BitWidth-NLZ-NTZ == 8) {
+ unsigned ResultBit = NTZ;
+ unsigned InputBit = BitWidth-NTZ-8;
+
+ // Replace this with either a left or right shift to get the byte into
+ // the right place.
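+        // For example, for a 32-bit bswap where only bits 8-15 of the result
+        // are demanded (NLZ = 16, NTZ = 8), ResultBit is 8 and InputBit is
+        // 16, so we emit an 'lshr' by 8.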
+ Instruction *NewVal;
+ if (InputBit > ResultBit)
+ NewVal = BinaryOperator::CreateLShr(I->getOperand(1),
+ ConstantInt::get(I->getType(), InputBit-ResultBit));
+ else
+ NewVal = BinaryOperator::CreateShl(I->getOperand(1),
+ ConstantInt::get(I->getType(), ResultBit-InputBit));
+ NewVal->takeName(I);
+ return InsertNewInstBefore(NewVal, *I);
+ }
+
+ // TODO: Could compute known zero/one bits based on the input.
+ break;
+ }
+ }
+ }
+ ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+ break;
+ }
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) {
+ Constant *C = ConstantInt::get(RHSKnownOne);
+ if (isa<PointerType>(V->getType()))
+ C = ConstantExpr::getIntToPtr(C, V->getType());
+ return C;
+ }
+  return 0;
+}
+
+
+/// SimplifyDemandedVectorElts - The specified value produces a vector with
+/// any number of elements. DemandedElts contains the set of elements that are
+/// actually used by the caller. This method analyzes which elements of the
+/// operand are undef and returns that information in UndefElts.
+///
+/// If the information about demanded elements can be used to simplify the
+/// operation, the operation is simplified, then the resultant value is
+/// returned. This returns null if no change was made.
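+///
+/// For example, an insertelement into a lane that DemandedElts does not
+/// include can be dropped entirely in favor of its vector operand.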
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+ APInt& UndefElts,
+ unsigned Depth) {
+ unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
+ APInt EltMask(APInt::getAllOnesValue(VWidth));
+ assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
+
+ if (isa<UndefValue>(V)) {
+ // If the entire vector is undefined, just return this info.
+ UndefElts = EltMask;
+ return 0;
+ } else if (DemandedElts == 0) { // If nothing is demanded, provide undef.
+ UndefElts = EltMask;
+ return UndefValue::get(V->getType());
+ }
+
+ UndefElts = 0;
+ if (ConstantVector *CP = dyn_cast<ConstantVector>(V)) {
+ const Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+ Constant *Undef = UndefValue::get(EltTy);
+
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0; i != VWidth; ++i)
+ if (!DemandedElts[i]) { // If not demanded, set to undef.
+ Elts.push_back(Undef);
+ UndefElts.set(i);
+ } else if (isa<UndefValue>(CP->getOperand(i))) { // Already undef.
+ Elts.push_back(Undef);
+ UndefElts.set(i);
+ } else { // Otherwise, defined.
+ Elts.push_back(CP->getOperand(i));
+ }
+
+ // If we changed the constant, return it.
+ Constant *NewCP = ConstantVector::get(Elts);
+ return NewCP != CP ? NewCP : 0;
+ } else if (isa<ConstantAggregateZero>(V)) {
+ // Simplify the CAZ to a ConstantVector where the non-demanded elements are
+ // set to undef.
+
+ // Check if this is identity. If so, return 0 since we are not simplifying
+ // anything.
+    if (DemandedElts == EltMask)
+ return 0;
+
+ const Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+ Constant *Zero = Constant::getNullValue(EltTy);
+ Constant *Undef = UndefValue::get(EltTy);
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Constant *Elt = DemandedElts[i] ? Zero : Undef;
+ Elts.push_back(Elt);
+ }
+ UndefElts = DemandedElts ^ EltMask;
+ return ConstantVector::get(Elts);
+ }
+
+ // Limit search depth.
+ if (Depth == 10)
+ return 0;
+
+  // If multiple users are using the root value, proceed with
+ // simplification conservatively assuming that all elements
+ // are needed.
+ if (!V->hasOneUse()) {
+ // Quit if we find multiple users of a non-root value though.
+ // They'll be handled when it's their turn to be visited by
+ // the main instcombine process.
+ if (Depth != 0)
+ // TODO: Just compute the UndefElts information recursively.
+ return 0;
+
+ // Conservatively assume that all elements are needed.
+ DemandedElts = EltMask;
+ }
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return 0; // Only analyze instructions.
+
+ bool MadeChange = false;
+ APInt UndefElts2(VWidth, 0);
+ Value *TmpV;
+ switch (I->getOpcode()) {
+ default: break;
+
+ case Instruction::InsertElement: {
+    // If this is a variable index, we don't know which element it overwrites,
+    // so demand exactly the same input as we produce.
+ ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
+ if (Idx == 0) {
+ // Note that we can't propagate undef elt info, because we don't know
+ // which elt is getting updated.
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+ UndefElts2, Depth+1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+ break;
+ }
+
+ // If this is inserting an element that isn't demanded, remove this
+ // insertelement.
+ unsigned IdxNo = Idx->getZExtValue();
+ if (IdxNo >= VWidth || !DemandedElts[IdxNo])
+ return AddSoonDeadInstToWorklist(*I, 0);
+
+ // Otherwise, the element inserted overwrites whatever was there, so the
+ // input demanded set is simpler than the output set.
+ APInt DemandedElts2 = DemandedElts;
+ DemandedElts2.clear(IdxNo);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2,
+ UndefElts, Depth+1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+ // The inserted element is defined.
+ UndefElts.clear(IdxNo);
+ break;
+ }
+ case Instruction::ShuffleVector: {
+ ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
+ uint64_t LHSVWidth =
+ cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements();
+ APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0);
+ for (unsigned i = 0; i < VWidth; i++) {
+ if (DemandedElts[i]) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal != -1u) {
+ assert(MaskVal < LHSVWidth * 2 &&
+ "shufflevector mask index out of range!");
+ if (MaskVal < LHSVWidth)
+ LeftDemanded.set(MaskVal);
+ else
+ RightDemanded.set(MaskVal - LHSVWidth);
+ }
+ }
+ }
+
+ APInt UndefElts4(LHSVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded,
+ UndefElts4, Depth+1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+ APInt UndefElts3(LHSVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded,
+ UndefElts3, Depth+1);
+ if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+ bool NewUndefElts = false;
+ for (unsigned i = 0; i < VWidth; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal == -1u) {
+ UndefElts.set(i);
+ } else if (MaskVal < LHSVWidth) {
+ if (UndefElts4[MaskVal]) {
+ NewUndefElts = true;
+ UndefElts.set(i);
+ }
+ } else {
+ if (UndefElts3[MaskVal - LHSVWidth]) {
+ NewUndefElts = true;
+ UndefElts.set(i);
+ }
+ }
+ }
+
+ if (NewUndefElts) {
+ // Add additional discovered undefs.
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0; i < VWidth; ++i) {
+ if (UndefElts[i])
+ Elts.push_back(UndefValue::get(Type::Int32Ty));
+ else
+ Elts.push_back(ConstantInt::get(Type::Int32Ty,
+ Shuffle->getMaskValue(i)));
+ }
+ I->setOperand(2, ConstantVector::get(Elts));
+ MadeChange = true;
+ }
+ break;
+ }
+ case Instruction::BitCast: {
+ // Vector->vector casts only.
+ const VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
+ if (!VTy) break;
+ unsigned InVWidth = VTy->getNumElements();
+ APInt InputDemandedElts(InVWidth, 0);
+ unsigned Ratio;
+
+ if (VWidth == InVWidth) {
+ // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
+ // elements as are demanded of us.
+ Ratio = 1;
+ InputDemandedElts = DemandedElts;
+ } else if (VWidth > InVWidth) {
+ // Untested so far.
+ break;
+
+ // If there are more elements in the result than there are in the source,
+ // then an input element is live if any of the corresponding output
+ // elements are live.
+ Ratio = VWidth/InVWidth;
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+ if (DemandedElts[OutIdx])
+ InputDemandedElts.set(OutIdx/Ratio);
+ }
+ } else {
+ // Untested so far.
+ break;
+
+ // If there are more elements in the source than there are in the result,
+ // then an input element is live if the corresponding output element is
+ // live.
+ Ratio = InVWidth/VWidth;
+ for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+ if (DemandedElts[InIdx/Ratio])
+ InputDemandedElts.set(InIdx);
+ }
+
+    // Simplify the input vector based on the demanded elements.
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
+ UndefElts2, Depth+1);
+ if (TmpV) {
+ I->setOperand(0, TmpV);
+ MadeChange = true;
+ }
+
+ UndefElts = UndefElts2;
+ if (VWidth > InVWidth) {
+ assert(0 && "Unimp");
+ // If there are more elements in the result than there are in the source,
+ // then an output element is undef if the corresponding input element is
+ // undef.
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+ if (UndefElts2[OutIdx/Ratio])
+ UndefElts.set(OutIdx);
+ } else if (VWidth < InVWidth) {
+ assert(0 && "Unimp");
+ // If there are more elements in the source than there are in the result,
+ // then a result element is undef if all of the corresponding input
+ // elements are undef.
+ UndefElts = ~0ULL >> (64-VWidth); // Start out all undef.
+ for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+ if (!UndefElts2[InIdx]) // Not undef?
+ UndefElts.clear(InIdx/Ratio); // Clear undef bit.
+ }
+ break;
+ }
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+    // Note that div/rem are absent from this list: they demand all of their
+    // input elements, since simplifying an operand to undef could introduce
+    // a divide by zero.
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+ UndefElts, Depth+1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
+ UndefElts2, Depth+1);
+ if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+ // Output elements are undefined if both are undefined. Consider things
+ // like undef&0. The result is known zero, not undef.
+ UndefElts &= UndefElts2;
+ break;
+
+ case Instruction::Call: {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ if (!II) break;
+ switch (II->getIntrinsicID()) {
+ default: break;
+
+ // Binary vector operations that work column-wise. A dest element is a
+ // function of the corresponding input elements from the two inputs.
+ case Intrinsic::x86_sse_sub_ss:
+ case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse_min_ss:
+ case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse2_sub_sd:
+ case Intrinsic::x86_sse2_mul_sd:
+ case Intrinsic::x86_sse2_min_sd:
+ case Intrinsic::x86_sse2_max_sd:
+ TmpV = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
+ UndefElts, Depth+1);
+ if (TmpV) { II->setOperand(1, TmpV); MadeChange = true; }
+ TmpV = SimplifyDemandedVectorElts(II->getOperand(2), DemandedElts,
+ UndefElts2, Depth+1);
+ if (TmpV) { II->setOperand(2, TmpV); MadeChange = true; }
+
+ // If only the low elt is demanded and this is a scalarizable intrinsic,
+ // scalarize it now.
+ if (DemandedElts == 1) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::x86_sse_sub_ss:
+ case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse2_sub_sd:
+ case Intrinsic::x86_sse2_mul_sd:
+ // TODO: Lower MIN/MAX/ABS/etc
+ Value *LHS = II->getOperand(1);
+ Value *RHS = II->getOperand(2);
+ // Extract the element as scalars.
+ LHS = InsertNewInstBefore(new ExtractElementInst(LHS, 0U,"tmp"), *II);
+ RHS = InsertNewInstBefore(new ExtractElementInst(RHS, 0U,"tmp"), *II);
+
+ switch (II->getIntrinsicID()) {
+ default: assert(0 && "Case stmts out of sync!");
+ case Intrinsic::x86_sse_sub_ss:
+ case Intrinsic::x86_sse2_sub_sd:
+ TmpV = InsertNewInstBefore(BinaryOperator::CreateSub(LHS, RHS,
+ II->getName()), *II);
+ break;
+ case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse2_mul_sd:
+ TmpV = InsertNewInstBefore(BinaryOperator::CreateMul(LHS, RHS,
+ II->getName()), *II);
+ break;
+ }
+
+ Instruction *New =
+ InsertElementInst::Create(UndefValue::get(II->getType()), TmpV, 0U,
+ II->getName());
+ InsertNewInstBefore(New, *II);
+ AddSoonDeadInstToWorklist(*II, 0);
+ return New;
+ }
+ }
+
+ // Output elements are undefined if both are undefined. Consider things
+ // like undef&0. The result is known zero, not undef.
+ UndefElts &= UndefElts2;
+ break;
+ }
+ break;
+ }
+ }
+ return MadeChange ? I : 0;
+}
+
+
+/// AssociativeOpt - Perform an optimization on an associative operator. This
+/// function is designed to check a chain of associative operators for a
+/// potential to apply a certain optimization. Since the optimization may be
+/// applicable if the expression was reassociated, this checks the chain, then
+/// reassociates the expression as necessary to expose the optimization
+/// opportunity. This makes use of a special Functor, which must define
+/// 'shouldApply' and 'apply' methods.
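+///
+/// For example, the AddRHS functor below uses this to rewrite ((Y + X) + X)
+/// as (Y + (X << 1)): the chain is reassociated so that both copies of X
+/// meet, and 'apply' then performs the X + X --> X << 1 fold.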
+///
+template<typename Functor>
+static Instruction *AssociativeOpt(BinaryOperator &Root, const Functor &F) {
+ unsigned Opcode = Root.getOpcode();
+ Value *LHS = Root.getOperand(0);
+
+ // Quick check, see if the immediate LHS matches...
+ if (F.shouldApply(LHS))
+ return F.apply(Root);
+
+ // Otherwise, if the LHS is not of the same opcode as the root, return.
+ Instruction *LHSI = dyn_cast<Instruction>(LHS);
+ while (LHSI && LHSI->getOpcode() == Opcode && LHSI->hasOneUse()) {
+ // Should we apply this transform to the RHS?
+ bool ShouldApply = F.shouldApply(LHSI->getOperand(1));
+
+ // If not to the RHS, check to see if we should apply to the LHS...
+ if (!ShouldApply && F.shouldApply(LHSI->getOperand(0))) {
+ cast<BinaryOperator>(LHSI)->swapOperands(); // Make the LHS the RHS
+ ShouldApply = true;
+ }
+
+ // If the functor wants to apply the optimization to the RHS of LHSI,
+ // reassociate the expression from ((? op A) op B) to (? op (A op B))
+ if (ShouldApply) {
+ // Now all of the instructions are in the current basic block, go ahead
+ // and perform the reassociation.
+ Instruction *TmpLHSI = cast<Instruction>(Root.getOperand(0));
+
+ // First move the selected RHS to the LHS of the root...
+ Root.setOperand(0, LHSI->getOperand(1));
+
+ // Make what used to be the LHS of the root be the user of the root...
+ Value *ExtraOperand = TmpLHSI->getOperand(1);
+ if (&Root == TmpLHSI) {
+ Root.replaceAllUsesWith(Constant::getNullValue(TmpLHSI->getType()));
+ return 0;
+ }
+ Root.replaceAllUsesWith(TmpLHSI); // Users now use TmpLHSI
+ TmpLHSI->setOperand(1, &Root); // TmpLHSI now uses the root
+ BasicBlock::iterator ARI = &Root; ++ARI;
+ TmpLHSI->moveBefore(ARI); // Move TmpLHSI to after Root
+ ARI = Root;
+
+ // Now propagate the ExtraOperand down the chain of instructions until we
+ // get to LHSI.
+ while (TmpLHSI != LHSI) {
+ Instruction *NextLHSI = cast<Instruction>(TmpLHSI->getOperand(0));
+ // Move the instruction to immediately before the chain we are
+ // constructing to avoid breaking dominance properties.
+ NextLHSI->moveBefore(ARI);
+ ARI = NextLHSI;
+
+ Value *NextOp = NextLHSI->getOperand(1);
+ NextLHSI->setOperand(1, ExtraOperand);
+ TmpLHSI = NextLHSI;
+ ExtraOperand = NextOp;
+ }
+
+ // Now that the instructions are reassociated, have the functor perform
+ // the transformation...
+ return F.apply(Root);
+ }
+
+ LHSI = dyn_cast<Instruction>(LHSI->getOperand(0));
+ }
+ return 0;
+}
+
+namespace {
+
+// AddRHS - Implements: X + X --> X << 1
+struct AddRHS {
+ Value *RHS;
+ AddRHS(Value *rhs) : RHS(rhs) {}
+ bool shouldApply(Value *LHS) const { return LHS == RHS; }
+ Instruction *apply(BinaryOperator &Add) const {
+ return BinaryOperator::CreateShl(Add.getOperand(0),
+ ConstantInt::get(Add.getType(), 1));
+ }
+};
+
+// AddMaskingAnd - Implements (A & C1)+(B & C2) --> (A & C1)|(B & C2)
+// iff C1&C2 == 0
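+// For example, (A & 0xF0) + (B & 0x0F) --> (A & 0xF0) | (B & 0x0F).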
+struct AddMaskingAnd {
+ Constant *C2;
+ AddMaskingAnd(Constant *c) : C2(c) {}
+ bool shouldApply(Value *LHS) const {
+ ConstantInt *C1;
+ return match(LHS, m_And(m_Value(), m_ConstantInt(C1))) &&
+ ConstantExpr::getAnd(C1, C2)->isNullValue();
+ }
+ Instruction *apply(BinaryOperator &Add) const {
+ return BinaryOperator::CreateOr(Add.getOperand(0), Add.getOperand(1));
+ }
+};
+
+}
+
+static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
+ InstCombiner *IC) {
+ if (CastInst *CI = dyn_cast<CastInst>(&I)) {
+ return IC->InsertCastBefore(CI->getOpcode(), SO, I.getType(), I);
+ }
+
+ // Figure out if the constant is the left or the right argument.
+ bool ConstIsRHS = isa<Constant>(I.getOperand(1));
+ Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
+
+ if (Constant *SOC = dyn_cast<Constant>(SO)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
+ return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
+ }
+
+ Value *Op0 = SO, *Op1 = ConstOperand;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+ Instruction *New;
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
+ New = BinaryOperator::Create(BO->getOpcode(), Op0, Op1,SO->getName()+".op");
+ else if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+ New = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), Op0, Op1,
+ SO->getName()+".cmp");
+ else {
+ assert(0 && "Unknown binary instruction type!");
+ abort();
+ }
+ return IC->InsertNewInstBefore(New, I);
+}
+
+// FoldOpIntoSelect - Given an instruction with a select as one operand and a
+// constant as the other operand, try to fold the binary operator into the
+// select arguments. This also works for Cast instructions, which obviously do
+// not have a second operand.
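+// For example, 'add (select C, 1, 2), 8' becomes 'select C, 9, 10', with
+// both arms folded to constants.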
+static Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
+ InstCombiner *IC) {
+ // Don't modify shared select instructions
+ if (!SI->hasOneUse()) return 0;
+ Value *TV = SI->getOperand(1);
+ Value *FV = SI->getOperand(2);
+
+ if (isa<Constant>(TV) || isa<Constant>(FV)) {
+ // Bool selects with constant operands can be folded to logical ops.
+ if (SI->getType() == Type::Int1Ty) return 0;
+
+ Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, IC);
+ Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, IC);
+
+ return SelectInst::Create(SI->getCondition(), SelectTrueVal,
+ SelectFalseVal);
+ }
+ return 0;
+}
+
+
+/// FoldOpIntoPhi - Given a binary operator or cast instruction which has a PHI
+/// node as operand #0, see if we can fold the instruction into the PHI (which
+/// is only possible if all operands to the PHI are constants).
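+/// For example, 'add (phi [1, BB0], [2, BB1]), 8' becomes
+/// 'phi [9, BB0], [10, BB1]'.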
+Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
+ PHINode *PN = cast<PHINode>(I.getOperand(0));
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ if (!PN->hasOneUse() || NumPHIValues == 0) return 0;
+
+ // Check to see if all of the operands of the PHI are constants. If there is
+  // one non-constant value, remember the BB it is in. If there is more than
+  // one or if *it* is a PHI, bail out.
+ BasicBlock *NonConstBB = 0;
+ for (unsigned i = 0; i != NumPHIValues; ++i)
+ if (!isa<Constant>(PN->getIncomingValue(i))) {
+ if (NonConstBB) return 0; // More than one non-const value.
+ if (isa<PHINode>(PN->getIncomingValue(i))) return 0; // Itself a phi.
+ NonConstBB = PN->getIncomingBlock(i);
+
+ // If the incoming non-constant value is in I's block, we have an infinite
+ // loop.
+ if (NonConstBB == I.getParent())
+ return 0;
+ }
+
+ // If there is exactly one non-constant value, we can insert a copy of the
+ // operation in that block. However, if this is a critical edge, we would be
+  // inserting the computation on some other paths (e.g. inside a loop). Only
+ // do this if the pred block is unconditionally branching into the phi block.
+ if (NonConstBB) {
+ BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
+ if (!BI || !BI->isUnconditional()) return 0;
+ }
+
+ // Okay, we can do the transformation: create the new PHI node.
+ PHINode *NewPN = PHINode::Create(I.getType(), "");
+ NewPN->reserveOperandSpace(PN->getNumOperands()/2);
+ InsertNewInstBefore(NewPN, *PN);
+ NewPN->takeName(PN);
+
+ // Next, add all of the operands to the PHI.
+ if (I.getNumOperands() == 2) {
+ Constant *C = cast<Constant>(I.getOperand(1));
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV = 0;
+ if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) {
+ if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+ InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
+ else
+ InV = ConstantExpr::get(I.getOpcode(), InC, C);
+ } else {
+ assert(PN->getIncomingBlock(i) == NonConstBB);
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
+ InV = BinaryOperator::Create(BO->getOpcode(),
+ PN->getIncomingValue(i), C, "phitmp",
+ NonConstBB->getTerminator());
+ else if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+ InV = CmpInst::Create(CI->getOpcode(),
+ CI->getPredicate(),
+ PN->getIncomingValue(i), C, "phitmp",
+ NonConstBB->getTerminator());
+ else
+ assert(0 && "Unknown binop!");
+
+ AddToWorkList(cast<Instruction>(InV));
+ }
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ } else {
+ CastInst *CI = cast<CastInst>(&I);
+ const Type *RetTy = CI->getType();
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV;
+ if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) {
+ InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
+ } else {
+ assert(PN->getIncomingBlock(i) == NonConstBB);
+ InV = CastInst::Create(CI->getOpcode(), PN->getIncomingValue(i),
+ I.getType(), "phitmp",
+ NonConstBB->getTerminator());
+ AddToWorkList(cast<Instruction>(InV));
+ }
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ }
+ return ReplaceInstUsesWith(I, NewPN);
+}
+
+
+/// WillNotOverflowSignedAdd - Return true if we can prove that:
+/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS))
+/// This basically requires proving that the add in the original type would not
+/// overflow to change the sign bit or have a carry out.
+bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) {
+ // There are different heuristics we can use for this. Here are some simple
+ // ones.
+
+  // Add has the property that adding any two 2's complement numbers can
+  // produce at most one carry into the sign bit. As such, if LHS and RHS
+  // each have at least two sign bits, we know the addition of the two
+  // values will sign extend fine.
+ if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1)
+ return true;
+
+
+ // If one of the operands only has one non-zero bit, and if the other operand
+ // has a known-zero bit in a more significant place than it (not including the
+ // sign bit) the ripple may go up to and fill the zero, but won't change the
+ // sign. For example, (X & ~4) + 1.
+
+ // TODO: Implement.
+
+ return false;
+}
+
+
+Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
+ bool Changed = SimplifyCommutative(I);
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+ if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
+ // X + undef -> undef
+ if (isa<UndefValue>(RHS))
+ return ReplaceInstUsesWith(I, RHS);
+
+ // X + 0 --> X
+ if (!I.getType()->isFPOrFPVector()) { // NOTE: -0 + +0 = +0.
+ if (RHSC->isNullValue())
+ return ReplaceInstUsesWith(I, LHS);
+ } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
+ if (CFP->isExactlyValue(ConstantFP::getNegativeZero
+ (I.getType())->getValueAPF()))
+ return ReplaceInstUsesWith(I, LHS);
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHSC)) {
+ // X + (signbit) --> X ^ signbit
+ const APInt& Val = CI->getValue();
+ uint32_t BitWidth = Val.getBitWidth();
+ if (Val == APInt::getSignBit(BitWidth))
+ return BinaryOperator::CreateXor(LHS, RHS);
+
+ // See if SimplifyDemandedBits can simplify this. This handles stuff like
+ // (X & 254)+1 -> (X&254)|1
+ if (!isa<VectorType>(I.getType()) && SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // zext(i1) - 1 -> select i1, 0, -1
+ if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
+ if (CI->isAllOnesValue() &&
+ ZI->getOperand(0)->getType() == Type::Int1Ty)
+ return SelectInst::Create(ZI->getOperand(0),
+ Constant::getNullValue(I.getType()),
+ ConstantInt::getAllOnesValue(I.getType()));
+ }
+
+ if (isa<PHINode>(LHS))
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+
+ ConstantInt *XorRHS = 0;
+ Value *XorLHS = 0;
+ if (isa<ConstantInt>(RHSC) &&
+ match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) {
+ uint32_t TySizeBits = I.getType()->getPrimitiveSizeInBits();
+ const APInt& RHSVal = cast<ConstantInt>(RHSC)->getValue();
+
+ uint32_t Size = TySizeBits / 2;
+ APInt C0080Val(APInt(TySizeBits, 1ULL).shl(Size - 1));
+ APInt CFF80Val(-C0080Val);
+ do {
+ if (TySizeBits > Size) {
+ // If we have ADD(XOR(AND(X, 0xFF), 0x80), 0xF..F80), it's a sext.
+ // If we have ADD(XOR(AND(X, 0xFF), 0xF..F80), 0x80), it's a sext.
+ if ((RHSVal == CFF80Val && XorRHS->getValue() == C0080Val) ||
+ (RHSVal == C0080Val && XorRHS->getValue() == CFF80Val)) {
+ // This is a sign extend if the top bits are known zero.
+ if (!MaskedValueIsZero(XorLHS,
+ APInt::getHighBitsSet(TySizeBits, TySizeBits - Size)))
+ Size = 0; // Not a sign ext, but can't be any others either.
+ break;
+ }
+ }
+ Size >>= 1;
+ C0080Val = APIntOps::lshr(C0080Val, Size);
+ CFF80Val = APIntOps::ashr(CFF80Val, Size);
+ } while (Size >= 1);
+
+ // FIXME: This shouldn't be necessary. When the backends can handle types
+ // with funny bit widths then this switch statement should be removed. It
+ // is just here to get the size of the "middle" type back up to something
+ // that the back ends can handle.
+ const Type *MiddleType = 0;
+ switch (Size) {
+ default: break;
+ case 32: MiddleType = Type::Int32Ty; break;
+ case 16: MiddleType = Type::Int16Ty; break;
+ case 8: MiddleType = Type::Int8Ty; break;
+ }
+ if (MiddleType) {
+ Instruction *NewTrunc = new TruncInst(XorLHS, MiddleType, "sext");
+ InsertNewInstBefore(NewTrunc, I);
+ return new SExtInst(NewTrunc, I.getType(), I.getName());
+ }
+ }
+ }
+
+ if (I.getType() == Type::Int1Ty)
+ return BinaryOperator::CreateXor(LHS, RHS);
+
+ // X + X --> X << 1
+ if (I.getType()->isInteger()) {
+ if (Instruction *Result = AssociativeOpt(I, AddRHS(RHS))) return Result;
+
+ if (Instruction *RHSI = dyn_cast<Instruction>(RHS)) {
+ if (RHSI->getOpcode() == Instruction::Sub)
+ if (LHS == RHSI->getOperand(1)) // A + (B - A) --> B
+ return ReplaceInstUsesWith(I, RHSI->getOperand(0));
+ }
+ if (Instruction *LHSI = dyn_cast<Instruction>(LHS)) {
+ if (LHSI->getOpcode() == Instruction::Sub)
+ if (RHS == LHSI->getOperand(1)) // (B - A) + A --> B
+ return ReplaceInstUsesWith(I, LHSI->getOperand(0));
+ }
+ }
+
+ // -A + B --> B - A
+ // -A + -B --> -(A + B)
+ if (Value *LHSV = dyn_castNegVal(LHS)) {
+ if (LHS->getType()->isIntOrIntVector()) {
+ if (Value *RHSV = dyn_castNegVal(RHS)) {
+ Instruction *NewAdd = BinaryOperator::CreateAdd(LHSV, RHSV, "sum");
+ InsertNewInstBefore(NewAdd, I);
+ return BinaryOperator::CreateNeg(NewAdd);
+ }
+ }
+
+ return BinaryOperator::CreateSub(RHS, LHSV);
+ }
+
+ // A + -B --> A - B
+ if (!isa<Constant>(RHS))
+ if (Value *V = dyn_castNegVal(RHS))
+ return BinaryOperator::CreateSub(LHS, V);
+
+ ConstantInt *C2;
+ if (Value *X = dyn_castFoldableMul(LHS, C2)) {
+ if (X == RHS) // X*C + X --> X * (C+1)
+ return BinaryOperator::CreateMul(RHS, AddOne(C2));
+
+ // X*C1 + X*C2 --> X * (C1+C2)
+ ConstantInt *C1;
+ if (X == dyn_castFoldableMul(RHS, C1))
+ return BinaryOperator::CreateMul(X, Add(C1, C2));
+ }
+
+ // X + X*C --> X * (C+1)
+ if (dyn_castFoldableMul(RHS, C2) == LHS)
+ return BinaryOperator::CreateMul(LHS, AddOne(C2));
+
+ // X + ~X --> -1 since ~X = -X-1
+ if (dyn_castNotVal(LHS) == RHS || dyn_castNotVal(RHS) == LHS)
+ return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+ // (A & C1)+(B & C2) --> (A & C1)|(B & C2) iff C1&C2 == 0
+ if (match(RHS, m_And(m_Value(), m_ConstantInt(C2))))
+ if (Instruction *R = AssociativeOpt(I, AddMaskingAnd(C2)))
+ return R;
+
+ // A+B --> A|B iff A and B have no bits set in common.
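+ // For example (illustrative), (X & 0xF0) + (Y & 0x0F) can never carry, so
+ // it computes the same value as (X & 0xF0) | (Y & 0x0F).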
+ if (const IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
+ APInt Mask = APInt::getAllOnesValue(IT->getBitWidth());
+ APInt LHSKnownOne(IT->getBitWidth(), 0);
+ APInt LHSKnownZero(IT->getBitWidth(), 0);
+ ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne);
+ if (LHSKnownZero != 0) {
+ APInt RHSKnownOne(IT->getBitWidth(), 0);
+ APInt RHSKnownZero(IT->getBitWidth(), 0);
+ ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne);
+
+ // No bits in common -> bitwise or.
+ if ((LHSKnownZero|RHSKnownZero).isAllOnesValue())
+ return BinaryOperator::CreateOr(LHS, RHS);
+ }
+ }
+
+ // W*X + Y*Z --> W * (X+Z) iff W == Y
+ if (I.getType()->isIntOrIntVector()) {
+ Value *W, *X, *Y, *Z;
+ if (match(LHS, m_Mul(m_Value(W), m_Value(X))) &&
+ match(RHS, m_Mul(m_Value(Y), m_Value(Z)))) {
+ if (W != Y) {
+ if (W == Z) {
+ std::swap(Y, Z);
+ } else if (Y == X) {
+ std::swap(W, X);
+ } else if (X == Z) {
+ std::swap(Y, Z);
+ std::swap(W, X);
+ }
+ }
+
+ if (W == Y) {
+ Value *NewAdd = InsertNewInstBefore(BinaryOperator::CreateAdd(X, Z,
+ LHS->getName()), I);
+ return BinaryOperator::CreateMul(W, NewAdd);
+ }
+ }
+ }
+
+ if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
+ Value *X = 0;
+ if (match(LHS, m_Not(m_Value(X)))) // ~X + C --> (C-1) - X
+ return BinaryOperator::CreateSub(SubOne(CRHS), X);
+
+ // (X & FF00) + xx00 -> (X+xx00) & FF00
+ if (LHS->hasOneUse() && match(LHS, m_And(m_Value(X), m_ConstantInt(C2)))) {
+ Constant *Anded = And(CRHS, C2);
+ if (Anded == CRHS) {
+ // See if all bits from the first bit set in the Add RHS up are included
+ // in the mask. First, get the rightmost bit.
+ const APInt& AddRHSV = CRHS->getValue();
+
+ // Form a mask of all bits from the lowest bit added through the top.
+ APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1));
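+ // e.g. (illustrative, i32) for AddRHSV == 0x0300: the lowest set bit is
+ // 0x0100, so AddRHSHighBits becomes ~0x00FF == 0xFFFFFF00.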
+
+ // See if the and mask includes all of these bits.
+ APInt AddRHSHighBitsAnd(AddRHSHighBits & C2->getValue());
+
+ if (AddRHSHighBits == AddRHSHighBitsAnd) {
+ // Okay, the xform is safe. Insert the new add pronto.
+ Value *NewAdd = InsertNewInstBefore(BinaryOperator::CreateAdd(X, CRHS,
+ LHS->getName()), I);
+ return BinaryOperator::CreateAnd(NewAdd, C2);
+ }
+ }
+ }
+
+ // Try to fold constant add into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+ }
+
+ // add (cast *A to intptrtype) B -->
+ // cast (GEP (cast *A to i8*) B) to intptrtype
+ {
+ CastInst *CI = dyn_cast<CastInst>(LHS);
+ Value *Other = RHS;
+ if (!CI) {
+ CI = dyn_cast<CastInst>(RHS);
+ Other = LHS;
+ }
+ if (CI && CI->getType()->isSized() &&
+ (CI->getType()->getPrimitiveSizeInBits() ==
+ TD->getIntPtrType()->getPrimitiveSizeInBits())
+ && isa<PointerType>(CI->getOperand(0)->getType())) {
+ unsigned AS =
+ cast<PointerType>(CI->getOperand(0)->getType())->getAddressSpace();
+ Value *I2 = InsertBitCastBefore(CI->getOperand(0),
+ PointerType::get(Type::Int8Ty, AS), I);
+ I2 = InsertNewInstBefore(GetElementPtrInst::Create(I2, Other, "ctg2"), I);
+ return new PtrToIntInst(I2, CI->getType());
+ }
+ }
+
+ // add (select X 0 (sub n A)) A --> select X A n
+ {
+ SelectInst *SI = dyn_cast<SelectInst>(LHS);
+ Value *A = RHS;
+ if (!SI) {
+ SI = dyn_cast<SelectInst>(RHS);
+ A = LHS;
+ }
+ if (SI && SI->hasOneUse()) {
+ Value *TV = SI->getTrueValue();
+ Value *FV = SI->getFalseValue();
+ Value *N;
+
+ // Can we fold the add into the argument of the select?
+ // We check both true and false select arguments for a matching subtract.
+ if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the true select value.
+ return SelectInst::Create(SI->getCondition(), N, A);
+ if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the false select value.
+ return SelectInst::Create(SI->getCondition(), A, N);
+ }
+ }
+
+ // Check for X+0.0. Simplify it to X if we know X is not -0.0.
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
+ if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS))
+ return ReplaceInstUsesWith(I, LHS);
+
+ // Check for (add (sext x), y), see if we can merge this into an
+ // integer add followed by a sext.
+ if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) {
+ // (add (sext x), cst) --> (sext (add x, cst'))
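+ // e.g. (illustrative) add i32 (sext i8 %x), 5 --> sext (add i8 %x, 5),
+ // valid because sext(trunc(5)) round-trips and the narrow add is checked
+ // not to overflow.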
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
+ Constant *CI =
+ ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
+ if (LHSConv->hasOneUse() &&
+ ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
+ WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+ // Insert the new, smaller add.
+ Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0),
+ CI, "addconv");
+ InsertNewInstBefore(NewAdd, I);
+ return new SExtInst(NewAdd, I.getType());
+ }
+ }
+
+ // (add (sext x), (sext y)) --> (sext (add int x, y))
+ if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of sexts), and if the
+ // integer add will not overflow.
+ if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0))) {
+ // Insert the new integer add.
+ Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0),
+ "addconv");
+ InsertNewInstBefore(NewAdd, I);
+ return new SExtInst(NewAdd, I.getType());
+ }
+ }
+ }
+
+ // Check for (add double (sitofp x), y), see if we can merge this into an
+ // integer add followed by a promotion.
+ if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+ // (add double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
+ // ... if the constant fits in the integer value. This is useful for things
+ // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
+ // requires a constant pool load, and generally allows the add to be better
+ // instcombined.
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
+ Constant *CI =
+ ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
+ if (LHSConv->hasOneUse() &&
+ ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+ WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+ // Insert the new integer add.
+ Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0),
+ CI, "addconv");
+ InsertNewInstBefore(NewAdd, I);
+ return new SIToFPInst(NewAdd, I.getType());
+ }
+ }
+
+ // (add double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
+ if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of int->fp conversions),
+ // and if the integer add will not overflow.
+ if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0))) {
+ // Insert the new integer add.
+ Instruction *NewAdd = BinaryOperator::CreateAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0),
+ "addconv");
+ InsertNewInstBefore(NewAdd, I);
+ return new SIToFPInst(NewAdd, I.getType());
+ }
+ }
+ }
+
+ return Changed ? &I : 0;
+}
+
+Instruction *InstCombiner::visitSub(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Op0 == Op1 && // sub X, X -> 0
+ !I.getType()->isFPOrFPVector())
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ // If this is a 'B = x-(-A)', change to B = x+A...
+ if (Value *V = dyn_castNegVal(Op1))
+ return BinaryOperator::CreateAdd(Op0, V);
+
+ if (isa<UndefValue>(Op0))
+ return ReplaceInstUsesWith(I, Op0); // undef - X -> undef
+ if (isa<UndefValue>(Op1))
+ return ReplaceInstUsesWith(I, Op1); // X - undef -> undef
+
+ if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) {
+ // Replace (-1 - A) with (~A)...
+ if (C->isAllOnesValue())
+ return BinaryOperator::CreateNot(Op1);
+
+ // C - ~X == X + (1+C)
+ Value *X = 0;
+ if (match(Op1, m_Not(m_Value(X))))
+ return BinaryOperator::CreateAdd(X, AddOne(C));
+
+ // -(X >>u 31) -> (X >>s 31)
+ // -(X >>s 31) -> (X >>u 31)
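+ // For i32 (illustrative): X >>u 31 is 0 or 1, so its negation is 0 or -1,
+ // which is exactly X >>s 31; the other direction is symmetric.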
+ if (C->isZero()) {
+ if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op1)) {
+ if (SI->getOpcode() == Instruction::LShr) {
+ if (ConstantInt *CU = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+ // Check to see if we are shifting out everything but the sign bit.
+ if (CU->getLimitedValue(SI->getType()->getPrimitiveSizeInBits()) ==
+ SI->getType()->getPrimitiveSizeInBits()-1) {
+ // Ok, the transformation is safe. Insert AShr.
+ return BinaryOperator::Create(Instruction::AShr,
+ SI->getOperand(0), CU, SI->getName());
+ }
+ }
+ }
+ else if (SI->getOpcode() == Instruction::AShr) {
+ if (ConstantInt *CU = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+ // Check to see if we are shifting out everything but the sign bit.
+ if (CU->getLimitedValue(SI->getType()->getPrimitiveSizeInBits()) ==
+ SI->getType()->getPrimitiveSizeInBits()-1) {
+ // Ok, the transformation is safe. Insert LShr.
+ return BinaryOperator::CreateLShr(
+ SI->getOperand(0), CU, SI->getName());
+ }
+ }
+ }
+ }
+ }
+
+ // Try to fold constant sub into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+ }
+
+ if (I.getType() == Type::Int1Ty)
+ return BinaryOperator::CreateXor(Op0, Op1);
+
+ if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) {
+ if (Op1I->getOpcode() == Instruction::Add &&
+ !Op0->getType()->isFPOrFPVector()) {
+ if (Op1I->getOperand(0) == Op0) // X-(X+Y) == -Y
+ return BinaryOperator::CreateNeg(Op1I->getOperand(1), I.getName());
+ else if (Op1I->getOperand(1) == Op0) // X-(Y+X) == -Y
+ return BinaryOperator::CreateNeg(Op1I->getOperand(0), I.getName());
+ else if (ConstantInt *CI1 = dyn_cast<ConstantInt>(I.getOperand(0))) {
+ if (ConstantInt *CI2 = dyn_cast<ConstantInt>(Op1I->getOperand(1)))
+ // C1-(X+C2) --> (C1-C2)-X
+ return BinaryOperator::CreateSub(Subtract(CI1, CI2),
+ Op1I->getOperand(0));
+ }
+ }
+
+ if (Op1I->hasOneUse()) {
+ // Replace (x - (y - z)) with (x + (z - y)) if the (y - z) subexpression
+ // is not used by anyone else...
+ //
+ if (Op1I->getOpcode() == Instruction::Sub &&
+ !Op1I->getType()->isFPOrFPVector()) {
+ // Swap the two operands of the subexpr...
+ Value *IIOp0 = Op1I->getOperand(0), *IIOp1 = Op1I->getOperand(1);
+ Op1I->setOperand(0, IIOp1);
+ Op1I->setOperand(1, IIOp0);
+
+ // Create the new top level add instruction...
+ return BinaryOperator::CreateAdd(Op0, Op1);
+ }
+
+ // Replace (A - (A & B)) with (A & ~B) if this is the only use of (A&B)...
+ //
+ if (Op1I->getOpcode() == Instruction::And &&
+ (Op1I->getOperand(0) == Op0 || Op1I->getOperand(1) == Op0)) {
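+ // getOperand(0)==Op0 yields index 1 and vice versa, selecting whichever
+ // operand of the AND is not Op0.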
+ Value *OtherOp = Op1I->getOperand(Op1I->getOperand(0) == Op0);
+
+ Value *NewNot =
+ InsertNewInstBefore(BinaryOperator::CreateNot(OtherOp, "B.not"), I);
+ return BinaryOperator::CreateAnd(Op0, NewNot);
+ }
+
+ // 0 - (X sdiv C) -> (X sdiv -C)
+ if (Op1I->getOpcode() == Instruction::SDiv)
+ if (ConstantInt *CSI = dyn_cast<ConstantInt>(Op0))
+ if (CSI->isZero())
+ if (Constant *DivRHS = dyn_cast<Constant>(Op1I->getOperand(1)))
+ return BinaryOperator::CreateSDiv(Op1I->getOperand(0),
+ ConstantExpr::getNeg(DivRHS));
+
+ // X - X*C --> X * (1-C)
+ ConstantInt *C2 = 0;
+ if (dyn_castFoldableMul(Op1I, C2) == Op0) {
+ Constant *CP1 = Subtract(ConstantInt::get(I.getType(), 1), C2);
+ return BinaryOperator::CreateMul(Op0, CP1);
+ }
+ }
+ }
+
+ if (!Op0->getType()->isFPOrFPVector())
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+ if (Op0I->getOpcode() == Instruction::Add) {
+ if (Op0I->getOperand(0) == Op1) // (Y+X)-Y == X
+ return ReplaceInstUsesWith(I, Op0I->getOperand(1));
+ else if (Op0I->getOperand(1) == Op1) // (X+Y)-Y == X
+ return ReplaceInstUsesWith(I, Op0I->getOperand(0));
+ } else if (Op0I->getOpcode() == Instruction::Sub) {
+ if (Op0I->getOperand(0) == Op1) // (X-Y)-X == -Y
+ return BinaryOperator::CreateNeg(Op0I->getOperand(1), I.getName());
+ }
+ }
+
+ ConstantInt *C1;
+ if (Value *X = dyn_castFoldableMul(Op0, C1)) {
+ if (X == Op1) // X*C - X --> X * (C-1)
+ return BinaryOperator::CreateMul(Op1, SubOne(C1));
+
+ ConstantInt *C2; // X*C1 - X*C2 -> X * (C1-C2)
+ if (X == dyn_castFoldableMul(Op1, C2))
+ return BinaryOperator::CreateMul(X, Subtract(C1, C2));
+ }
+ return 0;
+}
+
+/// isSignBitCheck - Given an exploded icmp instruction, return true if the
+/// comparison only checks the sign bit. If so, set TrueIfSigned to whether
+/// the comparison is true when the input value is negative (i.e. its sign
+/// bit is set).
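+/// For example (illustrative, i8): "X u> 127" is a sign bit check with
+/// TrueIfSigned == true, since it holds exactly when bit 7 of X is set.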
+static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
+ bool &TrueIfSigned) {
+ switch (pred) {
+ case ICmpInst::ICMP_SLT: // True if LHS s< 0
+ TrueIfSigned = true;
+ return RHS->isZero();
+ case ICmpInst::ICMP_SLE: // True if LHS s<= RHS and RHS == -1
+ TrueIfSigned = true;
+ return RHS->isAllOnesValue();
+ case ICmpInst::ICMP_SGT: // True if LHS s> -1
+ TrueIfSigned = false;
+ return RHS->isAllOnesValue();
+ case ICmpInst::ICMP_UGT:
+ // True if LHS u> RHS and RHS == high-bit-mask - 1
+ TrueIfSigned = true;
+ return RHS->getValue() ==
+ APInt::getSignedMaxValue(RHS->getType()->getPrimitiveSizeInBits());
+ case ICmpInst::ICMP_UGE:
+ // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
+ TrueIfSigned = true;
+ return RHS->getValue().isSignBit();
+ default:
+ return false;
+ }
+}
+
+Instruction *InstCombiner::visitMul(BinaryOperator &I) {
+ bool Changed = SimplifyCommutative(I);
+ Value *Op0 = I.getOperand(0);
+
+ if (isa<UndefValue>(I.getOperand(1))) // undef * X -> 0
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ // Simplify mul instructions with a constant RHS...
+ if (Constant *Op1 = dyn_cast<Constant>(I.getOperand(1))) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+
+ // ((X << C1)*C2) == (X * (C2 << C1))
+ if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0))
+ if (SI->getOpcode() == Instruction::Shl)
+ if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1)))
+ return BinaryOperator::CreateMul(SI->getOperand(0),
+ ConstantExpr::getShl(CI, ShOp));
+
+ if (CI->isZero())
+ return ReplaceInstUsesWith(I, Op1); // X * 0 == 0
+ if (CI->equalsInt(1)) // X * 1 == X
+ return ReplaceInstUsesWith(I, Op0);
+ if (CI->isAllOnesValue()) // X * -1 == 0 - X
+ return BinaryOperator::CreateNeg(Op0, I.getName());
+
+ const APInt& Val = cast<ConstantInt>(CI)->getValue();
+ if (Val.isPowerOf2()) { // Replace X*(2^C) with X << C
+ return BinaryOperator::CreateShl(Op0,
+ ConstantInt::get(Op0->getType(), Val.logBase2()));
+ }
+ } else if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1)) {
+ if (Op1F->isNullValue())
+ return ReplaceInstUsesWith(I, Op1);
+
+ // "In IEEE floating point, x*1 is not equivalent to x for nans. However,
+ // ANSI says we can drop signals, so we can do this anyway." (from GCC)
+ if (Op1F->isExactlyValue(1.0))
+ return ReplaceInstUsesWith(I, Op0); // Eliminate 'mul double %X, 1.0'
+ } else if (isa<VectorType>(Op1->getType())) {
+ if (isa<ConstantAggregateZero>(Op1))
+ return ReplaceInstUsesWith(I, Op1);
+
+ if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1)) {
+ if (Op1V->isAllOnesValue()) // X * -1 == 0 - X
+ return BinaryOperator::CreateNeg(Op0, I.getName());
+
+ // As above, vector X*splat(1.0) -> X in all defined cases.
+ if (Constant *Splat = Op1V->getSplatValue()) {
+ if (ConstantFP *F = dyn_cast<ConstantFP>(Splat))
+ if (F->isExactlyValue(1.0))
+ return ReplaceInstUsesWith(I, Op0);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Splat))
+ if (CI->equalsInt(1))
+ return ReplaceInstUsesWith(I, Op0);
+ }
+ }
+ }
+
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
+ if (Op0I->getOpcode() == Instruction::Add && Op0I->hasOneUse() &&
+ isa<ConstantInt>(Op0I->getOperand(1)) && isa<ConstantInt>(Op1)) {
+ // Canonicalize (X+C1)*C2 -> X*C2+C1*C2.
+ Instruction *Add = BinaryOperator::CreateMul(Op0I->getOperand(0),
+ Op1, "tmp");
+ InsertNewInstBefore(Add, I);
+ Value *C1C2 = ConstantExpr::getMul(Op1,
+ cast<Constant>(Op0I->getOperand(1)));
+ return BinaryOperator::CreateAdd(Add, C1C2);
+ }
+
+ // Try to fold constant mul into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+
+ if (isa<PHINode>(Op0))
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ }
+
+ if (Value *Op0v = dyn_castNegVal(Op0)) // -X * -Y = X*Y
+ if (Value *Op1v = dyn_castNegVal(I.getOperand(1)))
+ return BinaryOperator::CreateMul(Op0v, Op1v);
+
+ // (X / Y) * Y = X - (X % Y)
+ // (X / Y) * -Y = (X % Y) - X
+ {
+ Value *Op1 = I.getOperand(1);
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
+ if (!BO ||
+ (BO->getOpcode() != Instruction::UDiv &&
+ BO->getOpcode() != Instruction::SDiv)) {
+ Op1 = Op0;
+ BO = dyn_cast<BinaryOperator>(I.getOperand(1));
+ }
+ Value *Neg = dyn_castNegVal(Op1);
+ if (BO && BO->hasOneUse() &&
+ (BO->getOperand(1) == Op1 || BO->getOperand(1) == Neg) &&
+ (BO->getOpcode() == Instruction::UDiv ||
+ BO->getOpcode() == Instruction::SDiv)) {
+ Value *Op0BO = BO->getOperand(0), *Op1BO = BO->getOperand(1);
+
+ Instruction *Rem;
+ if (BO->getOpcode() == Instruction::UDiv)
+ Rem = BinaryOperator::CreateURem(Op0BO, Op1BO);
+ else
+ Rem = BinaryOperator::CreateSRem(Op0BO, Op1BO);
+
+ InsertNewInstBefore(Rem, I);
+ Rem->takeName(BO);
+
+ if (Op1BO == Op1)
+ return BinaryOperator::CreateSub(Op0BO, Rem);
+ else
+ return BinaryOperator::CreateSub(Rem, Op0BO);
+ }
+ }
+
+ if (I.getType() == Type::Int1Ty)
+ return BinaryOperator::CreateAnd(Op0, I.getOperand(1));
+
+ // If one of the operands of the multiply is a cast from a boolean value, then
+ // we know the bool is either zero or one, so this is a 'masking' multiply.
+ // See if we can simplify things based on how the boolean was originally
+ // formed.
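+ // e.g. (illustrative, i32): X * zext(Y s< 0) becomes (ashr Y, 31) & X;
+ // the arithmetic shift smears the sign bit into an all-zero/all-one mask.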
+ CastInst *BoolCast = 0;
+ if (ZExtInst *CI = dyn_cast<ZExtInst>(Op0))
+ if (CI->getOperand(0)->getType() == Type::Int1Ty)
+ BoolCast = CI;
+ if (!BoolCast)
+ if (ZExtInst *CI = dyn_cast<ZExtInst>(I.getOperand(1)))
+ if (CI->getOperand(0)->getType() == Type::Int1Ty)
+ BoolCast = CI;
+ if (BoolCast) {
+ if (ICmpInst *SCI = dyn_cast<ICmpInst>(BoolCast->getOperand(0))) {
+ Value *SCIOp0 = SCI->getOperand(0), *SCIOp1 = SCI->getOperand(1);
+ const Type *SCOpTy = SCIOp0->getType();
+ bool TIS = false;
+
+ // If the icmp is true iff the sign bit of X is set, then convert this
+ // multiply into a shift/and combination.
+ if (isa<ConstantInt>(SCIOp1) &&
+ isSignBitCheck(SCI->getPredicate(), cast<ConstantInt>(SCIOp1), TIS) &&
+ TIS) {
+ // Shift the X value right to turn it into "all signbits".
+ Constant *Amt = ConstantInt::get(SCIOp0->getType(),
+ SCOpTy->getPrimitiveSizeInBits()-1);
+ Value *V =
+ InsertNewInstBefore(
+ BinaryOperator::Create(Instruction::AShr, SCIOp0, Amt,
+ BoolCast->getOperand(0)->getName()+
+ ".mask"), I);
+
+ // If the multiply type is not the same as the source type, sign extend
+ // or truncate to the multiply type.
+ if (I.getType() != V->getType()) {
+ uint32_t SrcBits = V->getType()->getPrimitiveSizeInBits();
+ uint32_t DstBits = I.getType()->getPrimitiveSizeInBits();
+ Instruction::CastOps opcode =
+ (SrcBits == DstBits ? Instruction::BitCast :
+ (SrcBits < DstBits ? Instruction::SExt : Instruction::Trunc));
+ V = InsertCastBefore(opcode, V, I.getType(), I);
+ }
+
+ Value *OtherOp = Op0 == BoolCast ? I.getOperand(1) : Op0;
+ return BinaryOperator::CreateAnd(V, OtherOp);
+ }
+ }
+ }
+
+ return Changed ? &I : 0;
+}
+
+/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select
+/// instruction.
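+/// Since dividing or taking a remainder by zero is undefined, a select
+/// divisor with a constant-zero arm can be assumed to take the other arm;
+/// e.g. udiv X, (Cond ? 0 : Y) becomes udiv X, Y, and Cond can be assumed
+/// false in other users above this point.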
+bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
+ SelectInst *SI = cast<SelectInst>(I.getOperand(1));
+
+ // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
+ int NonNullOperand = -1;
+ if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1)))
+ if (ST->isNullValue())
+ NonNullOperand = 2;
+ // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
+ if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2)))
+ if (ST->isNullValue())
+ NonNullOperand = 1;
+
+ if (NonNullOperand == -1)
+ return false;
+
+ Value *SelectCond = SI->getOperand(0);
+
+ // Change the div/rem to use 'Y' instead of the select.
+ I.setOperand(1, SI->getOperand(NonNullOperand));
+
+ // Okay, we know we can replace the operand of the div/rem with 'Y' with no
+ // problem. However, the select, or the condition of the select, may have
+ // multiple uses. Based on our knowledge that the operand must be non-zero,
+ // propagate the known value for the select into other uses of it, and
+ // propagate a known value of the condition into its other users.
+
+ // If the select and its condition each have only a single use, don't
+ // bother with this; exit early.
+ if (SI->use_empty() && SelectCond->hasOneUse())
+ return true;
+
+ // Scan the current block backward, looking for other uses of SI.
+ BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin();
+
+ while (BBI != BBFront) {
+ --BBI;
+ // If we found a call to a function, we can't assume it will return, so
+ // information from below it cannot be propagated above it.
+ if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI))
+ break;
+
+ // Replace uses of the select or its condition with the known values.
+ for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
+ I != E; ++I) {
+ if (*I == SI) {
+ *I = SI->getOperand(NonNullOperand);
+ AddToWorkList(BBI);
+ } else if (*I == SelectCond) {
+ *I = NonNullOperand == 1 ? ConstantInt::getTrue() :
+ ConstantInt::getFalse();
+ AddToWorkList(BBI);
+ }
+ }
+
+ // Once we have passed the instruction itself, quit looking for it.
+ if (&*BBI == SI)
+ SI = 0;
+ if (&*BBI == SelectCond)
+ SelectCond = 0;
+
+ // If we ran out of things to eliminate, break out of the loop.
+ if (SelectCond == 0 && SI == 0)
+ break;
+ }
+ return true;
+}
+
+
+/// This function implements the transforms on div instructions that work
+/// regardless of the kind of div instruction it is (udiv, sdiv, or fdiv). It is
+/// used by the visitors to those instructions.
+/// @brief Transforms common to all three div instructions
+Instruction *InstCombiner::commonDivTransforms(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // undef / X -> 0 for integer.
+ // undef / X -> undef for FP (the undef could be a snan).
+ if (isa<UndefValue>(Op0)) {
+ if (Op0->getType()->isFPOrFPVector())
+ return ReplaceInstUsesWith(I, Op0);
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ }
+
+ // X / undef -> undef
+ if (isa<UndefValue>(Op1))
+ return ReplaceInstUsesWith(I, Op1);
+
+ return 0;
+}
+
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those integer
+/// division instructions.
+/// @brief Common integer divide transforms
+Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // (sdiv X, X) --> 1 (udiv X, X) --> 1
+ if (Op0 == Op1) {
+ if (const VectorType *Ty = dyn_cast<VectorType>(I.getType())) {
+ ConstantInt *CI = ConstantInt::get(Ty->getElementType(), 1);
+ std::vector<Constant*> Elts(Ty->getNumElements(), CI);
+ return ReplaceInstUsesWith(I, ConstantVector::get(Elts));
+ }
+
+ ConstantInt *CI = ConstantInt::get(I.getType(), 1);
+ return ReplaceInstUsesWith(I, CI);
+ }
+
+ if (Instruction *Common = commonDivTransforms(I))
+ return Common;
+
+ // Handle cases involving: [su]div X, (select Cond, Y, Z)
+ // This does not apply for fdiv.
+ if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+ return &I;
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ // div X, 1 == X
+ if (RHS->equalsInt(1))
+ return ReplaceInstUsesWith(I, Op0);
+
+ // (X / C1) / C2 -> X / (C1*C2)
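+ // e.g. (X udiv 10) udiv 20 --> X udiv 200. If C1*C2 does not fit in the
+ // type, X is necessarily smaller than the combined divisor, so the
+ // quotient is 0.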
+ if (Instruction *LHS = dyn_cast<Instruction>(Op0))
+ if (Instruction::BinaryOps(LHS->getOpcode()) == I.getOpcode())
+ if (ConstantInt *LHSRHS = dyn_cast<ConstantInt>(LHS->getOperand(1))) {
+ if (MultiplyOverflows(RHS, LHSRHS, I.getOpcode()==Instruction::SDiv))
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ else
+ return BinaryOperator::Create(I.getOpcode(), LHS->getOperand(0),
+ Multiply(RHS, LHSRHS));
+ }
+
+ if (!RHS->isZero()) { // avoid X udiv 0
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+ if (isa<PHINode>(Op0))
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ }
+ }
+
+ // 0 / X == 0, we don't need to preserve faults!
+ if (ConstantInt *LHS = dyn_cast<ConstantInt>(Op0))
+ if (LHS->equalsInt(0))
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ // For i1, division by zero is undefined, so this must be division by one.
+ if (I.getType() == Type::Int1Ty)
+ return ReplaceInstUsesWith(I, Op0);
+
+ if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1)) {
+ if (ConstantInt *X = cast_or_null<ConstantInt>(Op1V->getSplatValue()))
+ // div X, 1 == X
+ if (X->isOne())
+ return ReplaceInstUsesWith(I, Op0);
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Handle the integer div common cases
+ if (Instruction *Common = commonIDivTransforms(I))
+ return Common;
+
+ if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
+ // X udiv 2^C -> X >> C
+ // Check to see if this is an unsigned division with an exact power of 2,
+ // if so, convert to a right shift.
+ if (C->getValue().isPowerOf2()) // 0 not included in isPowerOf2
+ return BinaryOperator::CreateLShr(Op0,
+ ConstantInt::get(Op0->getType(), C->getValue().logBase2()));
+
+ // X udiv C, where C >= signbit
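+ // The quotient here can only be 0 or 1 (2*C would already overflow), so
+ // e.g. for i8 (illustrative), X udiv 200 becomes (X u< 200) ? 0 : 1.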
+ if (C->getValue().isNegative()) {
+ Value *IC = InsertNewInstBefore(new ICmpInst(ICmpInst::ICMP_ULT, Op0, C),
+ I);
+ return SelectInst::Create(IC, Constant::getNullValue(I.getType()),
+ ConstantInt::get(I.getType(), 1));
+ }
+ }
+
+ // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
+ if (BinaryOperator *RHSI = dyn_cast<BinaryOperator>(I.getOperand(1))) {
+ if (RHSI->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(RHSI->getOperand(0))) {
+ const APInt& C1 = cast<ConstantInt>(RHSI->getOperand(0))->getValue();
+ if (C1.isPowerOf2()) {
+ Value *N = RHSI->getOperand(1);
+ const Type *NTy = N->getType();
+ if (uint32_t C2 = C1.logBase2()) {
+ Constant *C2V = ConstantInt::get(NTy, C2);
+ N = InsertNewInstBefore(BinaryOperator::CreateAdd(N, C2V, "tmp"), I);
+ }
+ return BinaryOperator::CreateLShr(Op0, N);
+ }
+ }
+ }
+
+ // udiv X, (select Cond, C1, C2) -->
+ // select Cond, (lshr X, log2(C1)), (lshr X, log2(C2))
+ // where C1 and C2 are powers of two.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (ConstantInt *STO = dyn_cast<ConstantInt>(SI->getOperand(1)))
+ if (ConstantInt *SFO = dyn_cast<ConstantInt>(SI->getOperand(2))) {
+ const APInt &TVA = STO->getValue(), &FVA = SFO->getValue();
+ if (TVA.isPowerOf2() && FVA.isPowerOf2()) {
+ // Compute the shift amounts
+ uint32_t TSA = TVA.logBase2(), FSA = FVA.logBase2();
+ // Construct the "on true" case of the select
+ Constant *TC = ConstantInt::get(Op0->getType(), TSA);
+ Instruction *TSI = BinaryOperator::CreateLShr(
+ Op0, TC, SI->getName()+".t");
+ TSI = InsertNewInstBefore(TSI, I);
+
+ // Construct the "on false" case of the select
+ Constant *FC = ConstantInt::get(Op0->getType(), FSA);
+ Instruction *FSI = BinaryOperator::CreateLShr(
+ Op0, FC, SI->getName()+".f");
+ FSI = InsertNewInstBefore(FSI, I);
+
+ // Construct the select instruction and return it.
+ return SelectInst::Create(SI->getOperand(0), TSI, FSI, SI->getName());
+ }
+ }
+ return 0;
+}
+
+Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Handle the integer div common cases
+ if (Instruction *Common = commonIDivTransforms(I))
+ return Common;
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ // sdiv X, -1 == -X
+ if (RHS->isAllOnesValue())
+ return BinaryOperator::CreateNeg(Op0);
+ }
+
+ // If the sign bits of both operands are zero (i.e. we can prove they are
+ // unsigned inputs), turn this into a udiv.
+ if (I.getType()->isInteger()) {
+ APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
+ if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
+ // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
+ return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ }
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
+ return commonDivTransforms(I);
+}
+
+/// This function implements the transforms on rem instructions that work
+/// regardless of the kind of rem instruction it is (urem, srem, or frem). It
+/// is used by the visitors to those instructions.
+/// @brief Transforms common to all three rem instructions
+Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (isa<UndefValue>(Op0)) { // undef % X -> 0 for integer
+ if (I.getType()->isFPOrFPVector())
+ return ReplaceInstUsesWith(I, Op0); // undef % X -> undef for FP (could be SNaN)
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ }
+ if (isa<UndefValue>(Op1))
+ return ReplaceInstUsesWith(I, Op1); // X % undef -> undef
+
+ // Handle cases involving: rem X, (select Cond, Y, Z)
+ if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+ return &I;
+
+ return 0;
+}
+
+/// This function implements the transforms common to both integer remainder
+/// instructions (urem and srem). It is called by the visitors to those integer
+/// remainder instructions.
+/// @brief Common integer remainder transforms
+Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Instruction *common = commonRemTransforms(I))
+ return common;
+
+ // 0 % X == 0 for integer, we don't need to preserve faults!
+ if (Constant *LHS = dyn_cast<Constant>(Op0))
+ if (LHS->isNullValue())
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ // X % 0 == undef, we don't need to preserve faults!
+ if (RHS->equalsInt(0))
+ return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
+
+ if (RHS->equalsInt(1)) // X % 1 == 0
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+ } else if (isa<PHINode>(Op0I)) {
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ }
+
+ // See if we can fold away this rem instruction.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+ }
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitURem(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Instruction *common = commonIRemTransforms(I))
+ return common;
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ // X urem 2^C -> X & (2^C-1)
+ // Check to see if this is an unsigned remainder with an exact power of 2,
+ // if so, convert to a bitwise and.
+ if (ConstantInt *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue().isPowerOf2())
+ return BinaryOperator::CreateAnd(Op0, SubOne(C));
+ }
+
+ if (Instruction *RHSI = dyn_cast<Instruction>(I.getOperand(1))) {
+ // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1)
+ if (RHSI->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(RHSI->getOperand(0))) {
+ if (cast<ConstantInt>(RHSI->getOperand(0))->getValue().isPowerOf2()) {
+ Constant *N1 = ConstantInt::getAllOnesValue(I.getType());
+ Value *Add = InsertNewInstBefore(BinaryOperator::CreateAdd(RHSI, N1,
+ "tmp"), I);
+ return BinaryOperator::CreateAnd(Op0, Add);
+ }
+ }
+ }
+
+ // urem X, (select Cond, C1, C2) --> select Cond, (and X, C1-1), (and X, C2-1)
+ // where C1 and C2 are powers of two.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) {
+ if (ConstantInt *STO = dyn_cast<ConstantInt>(SI->getOperand(1)))
+ if (ConstantInt *SFO = dyn_cast<ConstantInt>(SI->getOperand(2))) {
+ // STO == 0 and SFO == 0 handled above.
+ if ((STO->getValue().isPowerOf2()) &&
+ (SFO->getValue().isPowerOf2())) {
+ Value *TrueAnd = InsertNewInstBefore(
+ BinaryOperator::CreateAnd(Op0, SubOne(STO), SI->getName()+".t"), I);
+ Value *FalseAnd = InsertNewInstBefore(
+ BinaryOperator::CreateAnd(Op0, SubOne(SFO), SI->getName()+".f"), I);
+ return SelectInst::Create(SI->getOperand(0), TrueAnd, FalseAnd);
+ }
+ }
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Handle the integer rem common cases
+ if (Instruction *common = commonIRemTransforms(I))
+ return common;
+
+ if (Value *RHSNeg = dyn_castNegVal(Op1))
+ if (!isa<Constant>(RHSNeg) ||
+ (isa<ConstantInt>(RHSNeg) &&
+ cast<ConstantInt>(RHSNeg)->getValue().isStrictlyPositive())) {
+ // X % -Y -> X % Y
+ AddUsesToWorkList(I);
+ I.setOperand(1, RHSNeg);
+ return &I;
+ }
+
+ // If the sign bits of both operands are zero (i.e. we can prove they are
+ // unsigned inputs), turn this into a urem.
+ if (I.getType()->isInteger()) {
+ APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
+ if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
+ // X srem Y -> X urem Y, iff X and Y don't have sign bit set
+ return BinaryOperator::CreateURem(Op0, Op1, I.getName());
+ }
+ }
+
+ // If it's a constant vector, flip any negative values positive.
+ if (ConstantVector *RHSV = dyn_cast<ConstantVector>(Op1)) {
+ unsigned VWidth = RHSV->getNumOperands();
+
+ bool hasNegative = false;
+ for (unsigned i = 0; !hasNegative && i != VWidth; ++i)
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i)))
+ if (RHS->getValue().isNegative())
+ hasNegative = true;
+
+ if (hasNegative) {
+ std::vector<Constant *> Elts(VWidth);
+ for (unsigned i = 0; i != VWidth; ++i) {
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i))) {
+ if (RHS->getValue().isNegative())
+ Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
+ else
+ Elts[i] = RHS;
+ }
+ }
+
+ Constant *NewRHSV = ConstantVector::get(Elts);
+ if (NewRHSV != RHSV) {
+ AddUsesToWorkList(I);
+ I.setOperand(1, NewRHSV);
+ return &I;
+ }
+ }
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
+ return commonRemTransforms(I);
+}
+
+// isOneBitSet - Return true if there is exactly one bit set in the specified
+// constant.
+static bool isOneBitSet(const ConstantInt *CI) {
+ return CI->getValue().isPowerOf2();
+}
+
+// isHighOnes - Return true if the constant is of the form 1+0+.
+// This is the same as lowones(~X).
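+// For example (i16, illustrative): 0xFF00 qualifies, since ~0xFF00 + 1 ==
+// 0x0100 is a power of two, while 0xF00F does not.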
+static bool isHighOnes(const ConstantInt *CI) {
+ return (~CI->getValue() + 1).isPowerOf2();
+}
+
+/// getICmpCode - Encode a icmp predicate into a three bit mask. These bits
+/// are carefully arranged to allow folding of expressions such as:
+///
+/// (A < B) | (A > B) --> (A != B)
+///
+/// Note that this is only valid if the first and second predicates have the
+/// same sign. It is illegal to mix them, e.g. (A u< B) | (A s> B).
+///
+/// Three bits are used to represent the condition, as follows:
+/// 0 A > B
+/// 1 A == B
+/// 2 A < B
+///
+/// <=> Value Definition
+/// 000 0 Always false
+/// 001 1 A > B
+/// 010 2 A == B
+/// 011 3 A >= B
+/// 100 4 A < B
+/// 101 5 A != B
+/// 110 6 A <= B
+/// 111 7 Always true
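+/// Logical operators then act bitwise on these codes; e.g. combining
+/// (A == B) with (A < B) by OR gives 010 | 100 = 110, i.e. A <= B.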
+///
+static unsigned getICmpCode(const ICmpInst *ICI) {
+ switch (ICI->getPredicate()) {
+ // False -> 0
+ case ICmpInst::ICMP_UGT: return 1; // 001
+ case ICmpInst::ICMP_SGT: return 1; // 001
+ case ICmpInst::ICMP_EQ: return 2; // 010
+ case ICmpInst::ICMP_UGE: return 3; // 011
+ case ICmpInst::ICMP_SGE: return 3; // 011
+ case ICmpInst::ICMP_ULT: return 4; // 100
+ case ICmpInst::ICMP_SLT: return 4; // 100
+ case ICmpInst::ICMP_NE: return 5; // 101
+ case ICmpInst::ICMP_ULE: return 6; // 110
+ case ICmpInst::ICMP_SLE: return 6; // 110
+ // True -> 7
+ default:
+ assert(0 && "Invalid ICmp predicate!");
+ return 0;
+ }
+}
+
+/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp
+/// predicate into a three bit mask. It also returns whether it is an ordered
+/// predicate by reference.
+static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) {
+ isOrdered = false;
+ switch (CC) {
+ case FCmpInst::FCMP_ORD: isOrdered = true; return 0; // 000
+ case FCmpInst::FCMP_UNO: return 0; // 000
+ case FCmpInst::FCMP_OGT: isOrdered = true; return 1; // 001
+ case FCmpInst::FCMP_UGT: return 1; // 001
+ case FCmpInst::FCMP_OEQ: isOrdered = true; return 2; // 010
+ case FCmpInst::FCMP_UEQ: return 2; // 010
+ case FCmpInst::FCMP_OGE: isOrdered = true; return 3; // 011
+ case FCmpInst::FCMP_UGE: return 3; // 011
+ case FCmpInst::FCMP_OLT: isOrdered = true; return 4; // 100
+ case FCmpInst::FCMP_ULT: return 4; // 100
+ case FCmpInst::FCMP_ONE: isOrdered = true; return 5; // 101
+ case FCmpInst::FCMP_UNE: return 5; // 101
+ case FCmpInst::FCMP_OLE: isOrdered = true; return 6; // 110
+ case FCmpInst::FCMP_ULE: return 6; // 110
+ // True -> 7
+ default:
+ // Not expecting FCMP_FALSE or FCMP_TRUE here.
+ assert(0 && "Unexpected FCmp predicate!");
+ return 0;
+ }
+}
+
+/// getICmpValue - This is the complement of getICmpCode, which turns a
+/// three-bit code and two operands into either a constant true or false, or
+/// a brand new ICmp instruction. The sign is passed in to determine which kind
+/// of predicate to use in the new icmp instruction.
+static Value *getICmpValue(bool sign, unsigned code, Value *LHS, Value *RHS) {
+ switch (code) {
+ default: assert(0 && "Illegal ICmp code!");
+ case 0: return ConstantInt::getFalse();
+ case 1:
+ if (sign)
+ return new ICmpInst(ICmpInst::ICMP_SGT, LHS, RHS);
+ else
+ return new ICmpInst(ICmpInst::ICMP_UGT, LHS, RHS);
+ case 2: return new ICmpInst(ICmpInst::ICMP_EQ, LHS, RHS);
+ case 3:
+ if (sign)
+ return new ICmpInst(ICmpInst::ICMP_SGE, LHS, RHS);
+ else
+ return new ICmpInst(ICmpInst::ICMP_UGE, LHS, RHS);
+ case 4:
+ if (sign)
+ return new ICmpInst(ICmpInst::ICMP_SLT, LHS, RHS);
+ else
+ return new ICmpInst(ICmpInst::ICMP_ULT, LHS, RHS);
+ case 5: return new ICmpInst(ICmpInst::ICMP_NE, LHS, RHS);
+ case 6:
+ if (sign)
+ return new ICmpInst(ICmpInst::ICMP_SLE, LHS, RHS);
+ else
+ return new ICmpInst(ICmpInst::ICMP_ULE, LHS, RHS);
+ case 7: return ConstantInt::getTrue();
+ }
+}
+
+/// getFCmpValue - This is the complement of getFCmpCode, which turns a
+/// three-bit code and two operands into either a constant true or a brand
+/// new FCmp instruction. isordered is passed in to determine which kind of
+/// predicate to use in the new fcmp instruction.
+static Value *getFCmpValue(bool isordered, unsigned code,
+ Value *LHS, Value *RHS) {
+ switch (code) {
+ default: assert(0 && "Illegal FCmp code!");
+ case 0:
+ if (isordered)
+ return new FCmpInst(FCmpInst::FCMP_ORD, LHS, RHS);
+ else
+ return new FCmpInst(FCmpInst::FCMP_UNO, LHS, RHS);
+ case 1:
+ if (isordered)
+ return new FCmpInst(FCmpInst::FCMP_OGT, LHS, RHS);
+ else
+ return new FCmpInst(FCmpInst::FCMP_UGT, LHS, RHS);
+ case 2:
+ if (isordered)
+ return new FCmpInst(FCmpInst::FCMP_OEQ, LHS, RHS);
+ else
+ return new FCmpInst(FCmpInst::FCMP_UEQ, LHS, RHS);
+ case 3:
+ if (isordered)
+ return new FCmpInst(FCmpInst::FCMP_OGE, LHS, RHS);
+ else
+ return new FCmpInst(FCmpInst::FCMP_UGE, LHS, RHS);
+ case 4:
+ if (isordered)
+ return new FCmpInst(FCmpInst::FCMP_OLT, LHS, RHS);
+ else
+ return new FCmpInst(FCmpInst::FCMP_ULT, LHS, RHS);
+ case 5:
+ if (isordered)
+ return new FCmpInst(FCmpInst::FCMP_ONE, LHS, RHS);
+ else
+ return new FCmpInst(FCmpInst::FCMP_UNE, LHS, RHS);
+ case 6:
+ if (isordered)
+ return new FCmpInst(FCmpInst::FCMP_OLE, LHS, RHS);
+ else
+ return new FCmpInst(FCmpInst::FCMP_ULE, LHS, RHS);
+ case 7: return ConstantInt::getTrue();
+ }
+}
+
+/// PredicatesFoldable - Return true if both predicates match sign or if at
+/// least one of them is an equality comparison (which is signless).
+static bool PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) {
+ return (ICmpInst::isSignedPredicate(p1) == ICmpInst::isSignedPredicate(p2)) ||
+ (ICmpInst::isSignedPredicate(p1) && ICmpInst::isEquality(p2)) ||
+ (ICmpInst::isSignedPredicate(p2) && ICmpInst::isEquality(p1));
+}
+
+namespace {
+// FoldICmpLogical - Implements (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+struct FoldICmpLogical {
+ InstCombiner &IC;
+ Value *LHS, *RHS;
+ ICmpInst::Predicate pred;
+ FoldICmpLogical(InstCombiner &ic, ICmpInst *ICI)
+ : IC(ic), LHS(ICI->getOperand(0)), RHS(ICI->getOperand(1)),
+ pred(ICI->getPredicate()) {}
+ bool shouldApply(Value *V) const {
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(V))
+ if (PredicatesFoldable(pred, ICI->getPredicate()))
+ return ((ICI->getOperand(0) == LHS && ICI->getOperand(1) == RHS) ||
+ (ICI->getOperand(0) == RHS && ICI->getOperand(1) == LHS));
+ return false;
+ }
+ Instruction *apply(Instruction &Log) const {
+ ICmpInst *ICI = cast<ICmpInst>(Log.getOperand(0));
+ if (ICI->getOperand(0) != LHS) {
+ assert(ICI->getOperand(1) == LHS);
+ ICI->swapOperands(); // Swap the LHS and RHS of the ICmp
+ }
+
+ ICmpInst *RHSICI = cast<ICmpInst>(Log.getOperand(1));
+ unsigned LHSCode = getICmpCode(ICI);
+ unsigned RHSCode = getICmpCode(RHSICI);
+ unsigned Code;
+ switch (Log.getOpcode()) {
+ case Instruction::And: Code = LHSCode & RHSCode; break;
+ case Instruction::Or: Code = LHSCode | RHSCode; break;
+ case Instruction::Xor: Code = LHSCode ^ RHSCode; break;
+ default: assert(0 && "Illegal logical opcode!"); return 0;
+ }
+
+ bool isSigned = ICmpInst::isSignedPredicate(RHSICI->getPredicate()) ||
+ ICmpInst::isSignedPredicate(ICI->getPredicate());
+
+ Value *RV = getICmpValue(isSigned, Code, LHS, RHS);
+ if (Instruction *I = dyn_cast<Instruction>(RV))
+ return I;
+ // Otherwise, it's a constant boolean value...
+ return IC.ReplaceInstUsesWith(Log, RV);
+ }
+};
+} // end anonymous namespace
+
+// OptAndOp - This handles expressions of the form ((val OP C1) & C2), where
+// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is
+// guaranteed to be a binary operator.
+Instruction *InstCombiner::OptAndOp(Instruction *Op,
+ ConstantInt *OpRHS,
+ ConstantInt *AndRHS,
+ BinaryOperator &TheAnd) {
+ Value *X = Op->getOperand(0);
+ Constant *Together = 0;
+ if (!Op->isShift())
+ Together = And(AndRHS, OpRHS);
+
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ if (Op->hasOneUse()) {
+ // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
+ Instruction *And = BinaryOperator::CreateAnd(X, AndRHS);
+ InsertNewInstBefore(And, TheAnd);
+ And->takeName(Op);
+ return BinaryOperator::CreateXor(And, Together);
+ }
+ break;
+ case Instruction::Or:
+ if (Together == AndRHS) // (X | C) & C --> C
+ return ReplaceInstUsesWith(TheAnd, AndRHS);
+
+ if (Op->hasOneUse() && Together != OpRHS) {
+ // (X | C1) & C2 --> (X | (C1&C2)) & C2
+ Instruction *Or = BinaryOperator::CreateOr(X, Together);
+ InsertNewInstBefore(Or, TheAnd);
+ Or->takeName(Op);
+ return BinaryOperator::CreateAnd(Or, AndRHS);
+ }
+ break;
+ case Instruction::Add:
+ if (Op->hasOneUse()) {
+ // Adding one to a single-bit bit-field should be turned into an XOR
+ // of the bit. The first thing to check is whether this AND is with a
+ // single-bit constant.
+ const APInt& AndRHSV = cast<ConstantInt>(AndRHS)->getValue();
+
+ // If there is only one bit set...
+ if (isOneBitSet(cast<ConstantInt>(AndRHS))) {
+ // Ok, at this point, we know that we are masking the result of the
+ // ADD down to exactly one bit. If the constant we are adding has
+ // no bits set below this bit, then we can eliminate the ADD.
+ const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue();
+
+ // Check to see if any bits below the one bit set in AndRHSV are set.
+ if ((AddRHS & (AndRHSV-1)) == 0) {
+ // If not, the only thing that can affect the output of the AND is
+ // the bit specified by AndRHSV. If that bit is set, the effect of
+ // the XOR is to toggle the bit. If it is clear, then the ADD has
+ // no effect.
+ if ((AddRHS & AndRHSV) == 0) { // Bit is not set, noop
+ TheAnd.setOperand(0, X);
+ return &TheAnd;
+ } else {
+ // Pull the XOR out of the AND.
+ Instruction *NewAnd = BinaryOperator::CreateAnd(X, AndRHS);
+ InsertNewInstBefore(NewAnd, TheAnd);
+ NewAnd->takeName(Op);
+ return BinaryOperator::CreateXor(NewAnd, AndRHS);
+ }
+ }
+ }
+ }
+ break;
+
+ case Instruction::Shl: {
+ // We know that the AND will not produce any of the bits shifted in, so if
+ // the anded constant includes them, clear them now!
+ //
+ uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+ uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+ APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal));
+ ConstantInt *CI = ConstantInt::get(AndRHS->getValue() & ShlMask);
+
+ if (CI->getValue() == ShlMask) {
+ // Masking out bits that the shift already masks
+ return ReplaceInstUsesWith(TheAnd, Op); // No need for the and.
+ } else if (CI != AndRHS) { // Reducing bits set in and.
+ TheAnd.setOperand(1, CI);
+ return &TheAnd;
+ }
+ break;
+ }
+ case Instruction::LShr:
+ {
+ // We know that the AND will not produce any of the bits shifted in, so if
+ // the anded constant includes them, clear them now! This only applies to
+ // unsigned shifts, because a signed shr may bring in set bits!
+ //
+ uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+ uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+ APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+ ConstantInt *CI = ConstantInt::get(AndRHS->getValue() & ShrMask);
+
+ if (CI->getValue() == ShrMask) {
+ // Masking out bits that the shift already masks.
+ return ReplaceInstUsesWith(TheAnd, Op);
+ } else if (CI != AndRHS) {
+ TheAnd.setOperand(1, CI); // Reduce bits set in and cst.
+ return &TheAnd;
+ }
+ break;
+ }
+ case Instruction::AShr:
+ // Signed shr.
+ // See if this is shifting in some sign extension, then masking it out
+ // with an and.
+ if (Op->hasOneUse()) {
+ uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+ uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+ APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+ Constant *C = ConstantInt::get(AndRHS->getValue() & ShrMask);
+ if (C == AndRHS) { // Masking out bits shifted in.
+ // (Val ashr C1) & C2 -> (Val lshr C1) & C2
+ // Make the argument unsigned.
+ Value *ShVal = Op->getOperand(0);
+ ShVal = InsertNewInstBefore(
+ BinaryOperator::CreateLShr(ShVal, OpRHS,
+ Op->getName()), TheAnd);
+ return BinaryOperator::CreateAnd(ShVal, AndRHS, TheAnd.getName());
+ }
+ }
+ break;
+ }
+ return 0;
+}
+
+
+/// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is
+/// true, otherwise (V < Lo || V >= Hi). In practice, we emit the more
+/// efficient (V-Lo) <u Hi-Lo. This method expects that Lo <= Hi. isSigned
+/// indicates whether to treat V, Lo, and Hi as signed or not. IB is the
+/// location to insert new instructions.
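+/// For example (illustrative, unsigned): the inside test 5 <= V && V < 10
+/// becomes (V - 5) u< 5, a single subtract plus one unsigned compare.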
+Instruction *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
+ bool isSigned, bool Inside,
+ Instruction &IB) {
+ assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ?
+ ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() &&
+ "Lo is not <= Hi in range emission code!");
+
+ if (Inside) {
+ if (Lo == Hi) // Trivially false.
+ return new ICmpInst(ICmpInst::ICMP_NE, V, V);
+
+ // V >= Min && V < Hi --> V < Hi
+ if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
+ ICmpInst::Predicate pred = (isSigned ?
+ ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT);
+ return new ICmpInst(pred, V, Hi);
+ }
+
+ // Emit V-Lo <u Hi-Lo
+ Constant *NegLo = ConstantExpr::getNeg(Lo);
+ Instruction *Add = BinaryOperator::CreateAdd(V, NegLo, V->getName()+".off");
+ InsertNewInstBefore(Add, IB);
+ Constant *UpperBound = ConstantExpr::getAdd(NegLo, Hi);
+ return new ICmpInst(ICmpInst::ICMP_ULT, Add, UpperBound);
+ }
+
+ if (Lo == Hi) // Trivially true.
+ return new ICmpInst(ICmpInst::ICMP_EQ, V, V);
+
+ // V < Min || V >= Hi -> V > Hi-1
+ Hi = SubOne(cast<ConstantInt>(Hi));
+ if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
+ ICmpInst::Predicate pred = (isSigned ?
+ ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
+ return new ICmpInst(pred, V, Hi);
+ }
+
+ // Emit V-Lo >u Hi-1-Lo
+ // Note that Hi has already had one subtracted from it, above.
+ ConstantInt *NegLo = cast<ConstantInt>(ConstantExpr::getNeg(Lo));
+ Instruction *Add = BinaryOperator::CreateAdd(V, NegLo, V->getName()+".off");
+ InsertNewInstBefore(Add, IB);
+ Constant *LowerBound = ConstantExpr::getAdd(NegLo, Hi);
+ return new ICmpInst(ICmpInst::ICMP_UGT, Add, LowerBound);
+}
+
+// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with
+// any number of 0s on either side. The 1s are allowed to wrap from LSB to
+// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is
+// not, since all 1s are not contiguous.
+static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
+ const APInt& V = Val->getValue();
+ uint32_t BitWidth = Val->getType()->getBitWidth();
+ if (!APIntOps::isShiftedMask(BitWidth, V)) return false;
+
+ // look for the first zero bit after the run of ones
+ MB = BitWidth - ((V - 1) ^ V).countLeadingZeros();
+ // look for the first non-zero bit
+ ME = V.getActiveBits();
+ return true;
+}
+
+/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask,
+/// where isSub determines whether the operator is a sub. If we can fold one of
+/// the following xforms:
+///
+/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
+/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+///
+/// return (A +/- B).
+///
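+/// For example (i16, illustrative): ((A | 0xFF00) + B) & 0x00FF folds to
+/// (A + B) & 0x00FF, because bits above the mask can only carry upward.
+///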
+Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
+ ConstantInt *Mask, bool isSub,
+ Instruction &I) {
+ Instruction *LHSI = dyn_cast<Instruction>(LHS);
+ if (!LHSI || LHSI->getNumOperands() != 2 ||
+ !isa<ConstantInt>(LHSI->getOperand(1))) return 0;
+
+ ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1));
+
+ switch (LHSI->getOpcode()) {
+ default: return 0;
+ case Instruction::And:
+ if (And(N, Mask) == Mask) {
+ // If the AndRHS is a power of two minus one (0+1+), this is simple.
+ if ((Mask->getValue().countLeadingZeros() +
+ Mask->getValue().countPopulation()) ==
+ Mask->getValue().getBitWidth())
+ break;
+
+ // Otherwise, if Mask is 0+1+0+, and if B is known to be zero in the
+ // low 0+ part, we don't need any explicit masks to take those bits out
+ // of A. If that is all N is, ignore it.
+ uint32_t MB = 0, ME = 0;
+ if (isRunOfOnes(Mask, MB, ME)) { // begin/end bit of run, inclusive
+ uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth();
+ APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1));
+ if (MaskedValueIsZero(RHS, Mask))
+ break;
+ }
+ }
+ return 0;
+ case Instruction::Or:
+ case Instruction::Xor:
+ // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0
+ if ((Mask->getValue().countLeadingZeros() +
+ Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()
+ && And(N, Mask)->isZero())
+ break;
+ return 0;
+ }
+
+ Instruction *New;
+ if (isSub)
+ New = BinaryOperator::CreateSub(LHSI->getOperand(0), RHS, "fold");
+ else
+ New = BinaryOperator::CreateAdd(LHSI->getOperand(0), RHS, "fold");
+ return InsertNewInstBefore(New, I);
+}
+
+/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible.
+Instruction *InstCombiner::FoldAndOfICmps(Instruction &I,
+ ICmpInst *LHS, ICmpInst *RHS) {
+ Value *Val, *Val2;
+ ConstantInt *LHSCst, *RHSCst;
+ ICmpInst::Predicate LHSCC, RHSCC;
+
+ // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
+ if (!match(LHS, m_ICmp(LHSCC, m_Value(Val), m_ConstantInt(LHSCst))) ||
+ !match(RHS, m_ICmp(RHSCC, m_Value(Val2), m_ConstantInt(RHSCst))))
+ return 0;
+
+ // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
+ // where C is a power of 2
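+ // e.g. (A u< 8) & (B u< 8) --> (A|B) u< 8: a value is below 8 exactly
+ // when bits 3 and up are clear, and OR just unions the set bits.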
+ if (LHSCst == RHSCst && LHSCC == RHSCC && LHSCC == ICmpInst::ICMP_ULT &&
+ LHSCst->getValue().isPowerOf2()) {
+ Instruction *NewOr = BinaryOperator::CreateOr(Val, Val2);
+ InsertNewInstBefore(NewOr, I);
+ return new ICmpInst(LHSCC, NewOr, LHSCst);
+ }
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
+ if (Val != Val2) return 0;
+
+ // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
+ if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
+ RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
+ LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
+ RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+ return 0;
+
+ // We can't fold (ugt x, C) & (sgt x, C2).
+ if (!PredicatesFoldable(LHSCC, RHSCC))
+ return 0;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (ICmpInst::isSignedPredicate(LHSCC) ||
+ (ICmpInst::isEquality(LHSCC) &&
+ ICmpInst::isSignedPredicate(RHSCC)))
+ ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+ else
+ ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSCst, RHSCst);
+ std::swap(LHSCC, RHSCC);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and ANDing the result
+ // together. Because of the above check, we know that we only have
+ // icmp eq, icmp ne, icmp [su]lt, and icmp [su]gt here. We also know
+ // (from the FoldICmpLogical check above) that the two constants
+ // are not equal and that the larger constant is on the RHS.
+ assert(LHSCst != RHSCst && "Compares not folded above?");
+
+ switch (LHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X == 13 & X == 15) -> false
+ case ICmpInst::ICMP_UGT: // (X == 13 & X > 15) -> false
+ case ICmpInst::ICMP_SGT: // (X == 13 & X > 15) -> false
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13
+ case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13
+ case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13
+ return ReplaceInstUsesWith(I, LHS);
+ }
+ case ICmpInst::ICMP_NE:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_ULT:
+ if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
+ return new ICmpInst(ICmpInst::ICMP_ULT, Val, LHSCst);
+ break; // (X != 13 & X u< 15) -> no change
+ case ICmpInst::ICMP_SLT:
+ if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
+ return new ICmpInst(ICmpInst::ICMP_SLT, Val, LHSCst);
+ break; // (X != 13 & X s< 15) -> no change
+ case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15
+ case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_NE:
+ if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
+ Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+ Instruction *Add = BinaryOperator::CreateAdd(Val, AddCST,
+ Val->getName()+".off");
+ InsertNewInstBefore(Add, I);
+ return new ICmpInst(ICmpInst::ICMP_UGT, Add,
+ ConstantInt::get(Add->getType(), 1));
+ }
+ break; // (X != 13 & X != 15) -> no change
+ }
+ break;
+ case ICmpInst::ICMP_ULT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 & X == 15) -> false
+ case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ case ICmpInst::ICMP_SGT: // (X u< 13 & X s> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13
+ case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_SLT: // (X u< 13 & X s< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_SLT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s< 13 & X == 15) -> false
+ case ICmpInst::ICMP_SGT: // (X s< 13 & X s> 15) -> false
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ case ICmpInst::ICMP_UGT: // (X s< 13 & X u> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13
+ case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_ULT: // (X s< 13 & X u< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_UGT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_SGT: // (X u> 13 & X s> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE:
+ if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
+ return new ICmpInst(LHSCC, Val, RHSCst);
+ break; // (X u> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
+ return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true, I);
+ case ICmpInst::ICMP_SLT: // (X u> 13 & X s< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_UGT: // (X s> 13 & X u> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE:
+ if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
+ return new ICmpInst(LHSCC, Val, RHSCst);
+ break; // (X s> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
+ return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, true, true, I);
+ case ICmpInst::ICMP_ULT: // (X s> 13 & X u< 15) -> no change
+ break;
+ }
+ break;
+ }
+
+ return 0;
+}
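+
+ // A sketch of the range case above (illustrative): for i32 %x,
+ //   %a = icmp ugt i32 %x, 13
+ //   %b = icmp ult i32 %x, 15
+ //   %c = and i1 %a, %b
+ // collapses to %off = add i32 %x, -14; %c = icmp ult i32 %off, 1,
+ // a single range test around 14.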
+
+
+Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
+ bool Changed = SimplifyCommutative(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (isa<UndefValue>(Op1)) // X & undef -> 0
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ // and X, X = X
+ if (Op0 == Op1)
+ return ReplaceInstUsesWith(I, Op1);
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (!isa<VectorType>(I.getType())) {
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+ } else {
+ if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
+ if (CP->isAllOnesValue()) // X & <-1,-1> -> X
+ return ReplaceInstUsesWith(I, I.getOperand(0));
+ } else if (isa<ConstantAggregateZero>(Op1)) {
+ return ReplaceInstUsesWith(I, Op1); // X & <0,0> -> <0,0>
+ }
+ }
+
+ if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
+ const APInt& AndRHSMask = AndRHS->getValue();
+ APInt NotAndRHS(~AndRHSMask);
+
+ // Optimize a variety of ((val OP C1) & C2) combinations...
+ if (isa<BinaryOperator>(Op0)) {
+ Instruction *Op0I = cast<Instruction>(Op0);
+ Value *Op0LHS = Op0I->getOperand(0);
+ Value *Op0RHS = Op0I->getOperand(1);
+ switch (Op0I->getOpcode()) {
+ case Instruction::Xor:
+ case Instruction::Or:
+ // If the mask is only needed on one incoming arm, push it up.
+ if (Op0I->hasOneUse()) {
+ if (MaskedValueIsZero(Op0LHS, NotAndRHS)) {
+ // Not masking anything out for the LHS, move to RHS.
+ Instruction *NewRHS = BinaryOperator::CreateAnd(Op0RHS, AndRHS,
+ Op0RHS->getName()+".masked");
+ InsertNewInstBefore(NewRHS, I);
+ return BinaryOperator::Create(
+ cast<BinaryOperator>(Op0I)->getOpcode(), Op0LHS, NewRHS);
+ }
+ if (!isa<Constant>(Op0RHS) &&
+ MaskedValueIsZero(Op0RHS, NotAndRHS)) {
+ // Not masking anything out for the RHS, move to LHS.
+ Instruction *NewLHS = BinaryOperator::CreateAnd(Op0LHS, AndRHS,
+ Op0LHS->getName()+".masked");
+ InsertNewInstBefore(NewLHS, I);
+ return BinaryOperator::Create(
+ cast<BinaryOperator>(Op0I)->getOpcode(), NewLHS, Op0RHS);
+ }
+ }
+
+ break;
+ case Instruction::Add:
+ // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS.
+ // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
+ // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
+ if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I))
+ return BinaryOperator::CreateAnd(V, AndRHS);
+ if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I))
+ return BinaryOperator::CreateAnd(V, AndRHS); // Add commutes
+ break;
+
+ case Instruction::Sub:
+ // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS.
+ // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
+ // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
+ if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))
+ return BinaryOperator::CreateAnd(V, AndRHS);
+
+ // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS
+ // has 1's for all bits that the subtraction with A might affect.
+ if (Op0I->hasOneUse()) {
+ uint32_t BitWidth = AndRHSMask.getBitWidth();
+ uint32_t Zeros = AndRHSMask.countLeadingZeros();
+ APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros);
+
+ ConstantInt *A = dyn_cast<ConstantInt>(Op0LHS);
+ if (!(A && A->isZero()) && // avoid infinite recursion.
+ MaskedValueIsZero(Op0LHS, Mask)) {
+ Instruction *NewNeg = BinaryOperator::CreateNeg(Op0RHS);
+ InsertNewInstBefore(NewNeg, I);
+ return BinaryOperator::CreateAnd(NewNeg, AndRHS);
+ }
+ }
+ break;
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ // (1 << x) & 1 --> zext(x == 0)
+ // (1 >> x) & 1 --> zext(x == 0)
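+ // (Illustrative: any nonzero shift moves the set bit away from bit 0,
+ // so the result is 1 exactly when x == 0.)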
+ if (AndRHSMask == 1 && Op0LHS == AndRHS) {
+ Instruction *NewICmp = new ICmpInst(ICmpInst::ICMP_EQ, Op0RHS,
+ Constant::getNullValue(I.getType()));
+ InsertNewInstBefore(NewICmp, I);
+ return new ZExtInst(NewICmp, I.getType());
+ }
+ break;
+ }
+
+ if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
+ if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I))
+ return Res;
+ } else if (CastInst *CI = dyn_cast<CastInst>(Op0)) {
+ // If this is an integer truncation or change from signed-to-unsigned, and
+ // if the source is an and/or with immediate, transform it. This
+ // frequently occurs for bitfield accesses.
+ if (Instruction *CastOp = dyn_cast<Instruction>(CI->getOperand(0))) {
+ if ((isa<TruncInst>(CI) || isa<BitCastInst>(CI)) &&
+ CastOp->getNumOperands() == 2)
+ if (ConstantInt *AndCI = dyn_cast<ConstantInt>(CastOp->getOperand(1))) {
+ if (CastOp->getOpcode() == Instruction::And) {
+ // Change: and (cast (and X, C1) to T), C2
+ // into : and (cast X to T), trunc_or_bitcast(C1)&C2
+ // This will fold the two constants together, which may allow
+ // other simplifications.
+ Instruction *NewCast = CastInst::CreateTruncOrBitCast(
+ CastOp->getOperand(0), I.getType(),
+ CastOp->getName()+".shrunk");
+ NewCast = InsertNewInstBefore(NewCast, I);
+ // trunc_or_bitcast(C1)&C2
+ Constant *C3 = ConstantExpr::getTruncOrBitCast(AndCI,I.getType());
+ C3 = ConstantExpr::getAnd(C3, AndRHS);
+ return BinaryOperator::CreateAnd(NewCast, C3);
+ } else if (CastOp->getOpcode() == Instruction::Or) {
+ // Change: and (cast (or X, C1) to T), C2
+ // into : trunc(C1)&C2 iff trunc(C1)&C2 == C2
+ Constant *C3 = ConstantExpr::getTruncOrBitCast(AndCI,I.getType());
+ if (ConstantExpr::getAnd(C3, AndRHS) == AndRHS) // trunc(C1)&C2
+ return ReplaceInstUsesWith(I, AndRHS);
+ }
+ }
+ }
+ }
+
+ // Try to fold constant and into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+ if (isa<PHINode>(Op0))
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ }
+
+ Value *Op0NotVal = dyn_castNotVal(Op0);
+ Value *Op1NotVal = dyn_castNotVal(Op1);
+
+ if (Op0NotVal == Op1 || Op1NotVal == Op0) // A & ~A == ~A & A == 0
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ // (~A & ~B) == (~(A | B)) - De Morgan's Law
+ if (Op0NotVal && Op1NotVal && isOnlyUse(Op0) && isOnlyUse(Op1)) {
+ Instruction *Or = BinaryOperator::CreateOr(Op0NotVal, Op1NotVal,
+ I.getName()+".demorgan");
+ InsertNewInstBefore(Or, I);
+ return BinaryOperator::CreateNot(Or);
+ }
+
+ {
+ Value *A = 0, *B = 0, *C = 0, *D = 0;
+ if (match(Op0, m_Or(m_Value(A), m_Value(B)))) {
+ if (A == Op1 || B == Op1) // (A | ?) & A --> A
+ return ReplaceInstUsesWith(I, Op1);
+
+ // (A|B) & ~(A&B) -> A^B
+ if (match(Op1, m_Not(m_And(m_Value(C), m_Value(D))))) {
+ if ((A == C && B == D) || (A == D && B == C))
+ return BinaryOperator::CreateXor(A, B);
+ }
+ }
+
+ if (match(Op1, m_Or(m_Value(A), m_Value(B)))) {
+ if (A == Op0 || B == Op0) // A & (A | ?) --> A
+ return ReplaceInstUsesWith(I, Op0);
+
+ // ~(A&B) & (A|B) -> A^B
+ if (match(Op0, m_Not(m_And(m_Value(C), m_Value(D))))) {
+ if ((A == C && B == D) || (A == D && B == C))
+ return BinaryOperator::CreateXor(A, B);
+ }
+ }
+
+ if (Op0->hasOneUse() &&
+ match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (A == Op1) { // (A^B)&A -> A&(A^B)
+ I.swapOperands(); // Simplify below
+ std::swap(Op0, Op1);
+ } else if (B == Op1) { // (A^B)&B -> B&(B^A)
+ cast<BinaryOperator>(Op0)->swapOperands();
+ I.swapOperands(); // Simplify below
+ std::swap(Op0, Op1);
+ }
+ }
+
+ if (Op1->hasOneUse() &&
+ match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+ if (B == Op0) { // B&(A^B) -> B&(B^A)
+ cast<BinaryOperator>(Op1)->swapOperands();
+ std::swap(A, B);
+ }
+ if (A == Op0) { // A&(A^B) -> A & ~B
+ Instruction *NotB = BinaryOperator::CreateNot(B, "tmp");
+ InsertNewInstBefore(NotB, I);
+ return BinaryOperator::CreateAnd(A, NotB);
+ }
+ }
+
+ // (A&((~A)|B)) -> A&B
+ if (match(Op0, m_Or(m_Not(m_Specific(Op1)), m_Value(A))) ||
+ match(Op0, m_Or(m_Value(A), m_Not(m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(A, Op1);
+ if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) ||
+ match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0)))))
+ return BinaryOperator::CreateAnd(A, Op0);
+ }
+
+ if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1)) {
+ // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+ if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+ return R;
+
+ if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0))
+ if (Instruction *Res = FoldAndOfICmps(I, LHS, RHS))
+ return Res;
+ }
+
+ // fold (and (cast A), (cast B)) -> (cast (and A, B))
+ if (CastInst *Op0C = dyn_cast<CastInst>(Op0))
+ if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+ if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind ?
+ const Type *SrcTy = Op0C->getOperand(0)->getType();
+ if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+ // Only do this if the casts both really cause code to be generated.
+ ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0),
+ I.getType(), TD) &&
+ ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0),
+ I.getType(), TD)) {
+ Instruction *NewOp = BinaryOperator::CreateAnd(Op0C->getOperand(0),
+ Op1C->getOperand(0),
+ I.getName());
+ InsertNewInstBefore(NewOp, I);
+ return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+ }
+ }
+
+ // (X >> Z) & (Y >> Z) -> (X&Y) >> Z for all shifts.
+ if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+ if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+ if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
+ SI0->getOperand(1) == SI1->getOperand(1) &&
+ (SI0->hasOneUse() || SI1->hasOneUse())) {
+ Instruction *NewOp =
+ InsertNewInstBefore(BinaryOperator::CreateAnd(SI0->getOperand(0),
+ SI1->getOperand(0),
+ SI0->getName()), I);
+ return BinaryOperator::Create(SI1->getOpcode(), NewOp,
+ SI1->getOperand(1));
+ }
+ }
+
+ // If and'ing two fcmp instructions, try to combine them into one.
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) {
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) {
+ if (LHS->getPredicate() == FCmpInst::FCMP_ORD &&
+ RHS->getPredicate() == FCmpInst::FCMP_ORD) {
+ // (fcmp ord x, c) & (fcmp ord y, c) -> (fcmp ord x, y)
+ if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
+ if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
+ // If either of the constants are nans, then the whole thing returns
+ // false.
+ if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ return new FCmpInst(FCmpInst::FCMP_ORD, LHS->getOperand(0),
+ RHS->getOperand(0));
+ }
+ } else {
+ Value *Op0LHS, *Op0RHS, *Op1LHS, *Op1RHS;
+ FCmpInst::Predicate Op0CC, Op1CC;
+ if (match(Op0, m_FCmp(Op0CC, m_Value(Op0LHS), m_Value(Op0RHS))) &&
+ match(Op1, m_FCmp(Op1CC, m_Value(Op1LHS), m_Value(Op1RHS)))) {
+ if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+ // Swap RHS operands to match LHS.
+ Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+ std::swap(Op1LHS, Op1RHS);
+ }
+ if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
+ // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
+ if (Op0CC == Op1CC)
+ return new FCmpInst((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS);
+ else if (Op0CC == FCmpInst::FCMP_FALSE ||
+ Op1CC == FCmpInst::FCMP_FALSE)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ else if (Op0CC == FCmpInst::FCMP_TRUE)
+ return ReplaceInstUsesWith(I, Op1);
+ else if (Op1CC == FCmpInst::FCMP_TRUE)
+ return ReplaceInstUsesWith(I, Op0);
+ bool Op0Ordered;
+ bool Op1Ordered;
+ unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
+ unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
+ if (Op1Pred == 0) {
+ std::swap(Op0, Op1);
+ std::swap(Op0Pred, Op1Pred);
+ std::swap(Op0Ordered, Op1Ordered);
+ }
+ if (Op0Pred == 0) {
+ // uno && ueq -> uno && (uno || eq) -> ueq
+ // ord && olt -> ord && (ord && lt) -> olt
+ if (Op0Ordered == Op1Ordered)
+ return ReplaceInstUsesWith(I, Op1);
+ // uno && oeq -> uno && (ord && eq) -> false
+ // uno && ord -> false
+ if (!Op0Ordered)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ // ord && ueq -> ord && (uno || eq) -> oeq
+ return cast<Instruction>(getFCmpValue(true, Op1Pred,
+ Op0LHS, Op0RHS));
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return Changed ? &I : 0;
+}
+
+/// CollectBSwapParts - Analyze the specified subexpression and see if it is
+/// capable of providing pieces of a bswap. The subexpression provides pieces
+/// of a bswap if it is proven that each of the non-zero bytes in the output of
+/// the expression came from the corresponding "byte swapped" byte in some other
+/// value. For example, if the current subexpression is "(shl i32 %X, 24)" then
+/// we know that the expression deposits the low byte of %X into the high byte
+/// of the bswap result and that all other bytes are zero. This expression is
+/// accepted, the high byte of ByteValues is set to %X to indicate a correct
+/// match.
+///
+/// This function returns true if the match was unsuccessful and false if it
+/// succeeded.
+/// On entry to the function the "OverallLeftShift" is a signed integer value
+/// indicating the number of bytes that the subexpression is later shifted. For
+/// example, if the expression is later right shifted by 16 bits, the
+/// OverallLeftShift value would be -2 on entry. This is used to specify which
+/// byte of ByteValues is actually being set.
+///
+/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding
+/// byte is masked to zero by a user. For example, in (X & 255), X will be
+/// processed with a bytemask of 1. Because the bytemask is 32 bits wide, this
+/// limits the function to values of up to 32 bytes (256 bits). ByteMask is
+/// always in the local (OverallLeftShift) coordinate space.
+///
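+/// As a worked illustration (not part of the original algorithm notes): for
+/// an i32 input, ByteMask starts as 0xF; recursing through "lshr i32 %X, 8"
+/// shifts the mask to 0xE, and a subsequent "and" with 0x0000FF00 would then
+/// clear every remaining mask bit except the one for byte 1.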
+static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
+ SmallVector<Value*, 8> &ByteValues) {
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ // If this is an or instruction, it may be an inner node of the bswap.
+ if (I->getOpcode() == Instruction::Or) {
+ return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+ ByteValues) ||
+ CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask,
+ ByteValues);
+ }
+
+ // If this is a logical shift by a constant multiple of 8, recurse with
+ // OverallLeftShift and ByteMask adjusted.
+ if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
+ unsigned ShAmt =
+ cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
+ // Ensure the shift amount is defined and of a byte value.
+ if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size()))
+ return true;
+
+ unsigned ByteShift = ShAmt >> 3;
+ if (I->getOpcode() == Instruction::Shl) {
+ // X << 2 -> collect(X, +2)
+ OverallLeftShift += ByteShift;
+ ByteMask >>= ByteShift;
+ } else {
+ // X >>u 2 -> collect(X, -2)
+ OverallLeftShift -= ByteShift;
+ ByteMask <<= ByteShift;
+ ByteMask &= (~0U >> (32-ByteValues.size()));
+ }
+
+ if (OverallLeftShift >= (int)ByteValues.size()) return true;
+ if (OverallLeftShift <= -(int)ByteValues.size()) return true;
+
+ return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+ ByteValues);
+ }
+
+ // If this is a logical 'and' with a mask that clears bytes, clear the
+ // corresponding bytes in ByteMask.
+ if (I->getOpcode() == Instruction::And &&
+ isa<ConstantInt>(I->getOperand(1))) {
+ // Scan every byte of the and mask, seeing if the byte is either 0 or 255.
+ unsigned NumBytes = ByteValues.size();
+ APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255);
+ const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
+
+ for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) {
+ // If this byte is masked out by a later operation, we don't care what
+ // the and mask is.
+ if ((ByteMask & (1 << i)) == 0)
+ continue;
+
+ // If the AndMask is all zeros for this byte, clear the bit.
+ APInt MaskB = AndMask & Byte;
+ if (MaskB == 0) {
+ ByteMask &= ~(1U << i);
+ continue;
+ }
+
+ // If the AndMask is not all ones for this byte, it's not a bytezap.
+ if (MaskB != Byte)
+ return true;
+
+ // Otherwise, this byte is kept.
+ }
+
+ return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+ ByteValues);
+ }
+ }
+
+ // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
+ // the input value to the bswap. Some observations: 1) if more than one byte
+ // is demanded from this input, then it could not be successfully assembled
+ // into a byteswap. At least one of the two bytes would not be aligned
+ // with its ultimate destination.
+ if (!isPowerOf2_32(ByteMask)) return true;
+ unsigned InputByteNo = CountTrailingZeros_32(ByteMask);
+
+ // 2) The input and ultimate destinations must line up: if byte 3 of an i32
+ // is demanded, it needs to go into byte 0 of the result. This means that the
+ // byte needs to be shifted until it lands in the right byte bucket. The
+ // shift amount depends on the position: if the byte is coming from the high
+ // part of the value (e.g. byte 3) then it must be shifted right. If from the
+ // low part, it must be shifted left.
+ unsigned DestByteNo = InputByteNo + OverallLeftShift;
+ if (ByteValues.size()-1-DestByteNo != InputByteNo)
+ return true;
+
+ // If the destination byte value is already defined, the values are or'd
+ // together, which isn't a bswap (unless it's an or of the same bits).
+ if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V)
+ return true;
+ ByteValues[DestByteNo] = V;
+ return false;
+}
+
+/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom.
+/// If so, insert the new bswap intrinsic and return it.
+Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
+ const IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
+ if (!ITy || ITy->getBitWidth() % 16 ||
+ // ByteMask only allows up to 32-byte values.
+ ITy->getBitWidth() > 32*8)
+ return 0; // Can only bswap pairs of bytes. Can't do vectors.
+
+ /// ByteValues - For each byte of the result, we keep track of which value
+ /// defines each byte.
+ SmallVector<Value*, 8> ByteValues;
+ ByteValues.resize(ITy->getBitWidth()/8);
+
+ // Try to find all the pieces corresponding to the bswap.
+ uint32_t ByteMask = ~0U >> (32-ByteValues.size());
+ if (CollectBSwapParts(&I, 0, ByteMask, ByteValues))
+ return 0;
+
+ // Pull out the value that provides byte 0 of the result.
+ Value *V = ByteValues[0];
+ if (V == 0) return 0; // Didn't find a byte? Must be zero.
+
+ // Check to make sure that all of the bytes come from the same value.
+ for (unsigned i = 1, e = ByteValues.size(); i != e; ++i)
+ if (ByteValues[i] != V)
+ return 0;
+ const Type *Tys[] = { ITy };
+ Module *M = I.getParent()->getParent()->getParent();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
+ return CallInst::Create(F, V);
+}
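+
+ // For instance (illustrative), this turns the canonical i16 idiom
+ //   %hi = shl i16 %x, 8
+ //   %lo = lshr i16 %x, 8
+ //   %r  = or i16 %hi, %lo
+ // into a single call to the llvm.bswap.i16 intrinsic.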
+
+/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D). Check
+/// if A is (cond?-1:0) and either B or D is ~(cond?-1:0) or (cond?0:-1); if
+/// so, we can simplify this expression to "cond ? C : D" or "cond ? C : B".
+static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
+ Value *C, Value *D) {
+ // If A is not a select of -1/0, this cannot match.
+ Value *Cond = 0;
+ if (!match(A, m_SelectCst<-1, 0>(m_Value(Cond))))
+ return 0;
+
+ // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B.
+ if (match(D, m_SelectCst<0, -1>(m_Specific(Cond))))
+ return SelectInst::Create(Cond, C, B);
+ if (match(D, m_Not(m_SelectCst<-1, 0>(m_Specific(Cond)))))
+ return SelectInst::Create(Cond, C, B);
+ // ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D.
+ if (match(B, m_SelectCst<0, -1>(m_Specific(Cond))))
+ return SelectInst::Create(Cond, C, D);
+ if (match(B, m_Not(m_SelectCst<-1, 0>(m_Specific(Cond)))))
+ return SelectInst::Create(Cond, C, D);
+ return 0;
+}
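+
+ // A minimal sketch of the pattern (illustrative): with
+ //   %a = select i1 %cond, i32 -1, i32 0
+ //   %d = select i1 %cond, i32 0, i32 -1
+ // the expression (%a & %c) | (%b & %d) yields %c when %cond is true and
+ // %b when it is false, i.e. select %cond, %c, %b.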
+
+/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
+Instruction *InstCombiner::FoldOrOfICmps(Instruction &I,
+ ICmpInst *LHS, ICmpInst *RHS) {
+ Value *Val, *Val2;
+ ConstantInt *LHSCst, *RHSCst;
+ ICmpInst::Predicate LHSCC, RHSCC;
+
+ // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
+ if (!match(LHS, m_ICmp(LHSCC, m_Value(Val), m_ConstantInt(LHSCst))) ||
+ !match(RHS, m_ICmp(RHSCC, m_Value(Val2), m_ConstantInt(RHSCst))))
+ return 0;
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
+ if (Val != Val2) return 0;
+
+ // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
+ if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
+ RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
+ LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
+ RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+ return 0;
+
+ // We can't fold (ugt x, C) | (sgt x, C2).
+ if (!PredicatesFoldable(LHSCC, RHSCC))
+ return 0;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (ICmpInst::isSignedPredicate(LHSCC) ||
+ (ICmpInst::isEquality(LHSCC) &&
+ ICmpInst::isSignedPredicate(RHSCC)))
+ ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+ else
+ ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSCst, RHSCst);
+ std::swap(LHSCC, RHSCC);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and or'ing the result
+ // together. Because of the above check, we know that we only have
+ // ICMP_EQ, ICMP_NE, ICMP_[US]LT, and ICMP_[US]GT here. We also know
+ // (from the FoldICmpLogical check above) that the two constants are
+ // not equal.
+ assert(LHSCst != RHSCst && "Compares not folded above?");
+
+ switch (LHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ if (LHSCst == SubOne(RHSCst)) { // (X == 13 | X == 14) -> X-13 <u 2
+ Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+ Instruction *Add = BinaryOperator::CreateAdd(Val, AddCST,
+ Val->getName()+".off");
+ InsertNewInstBefore(Add, I);
+ AddCST = Subtract(AddOne(RHSCst), LHSCst);
+ return new ICmpInst(ICmpInst::ICMP_ULT, Add, AddCST);
+ }
+ break; // (X == 13 | X == 15) -> no change
+ case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
+ case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X == 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15
+ case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15
+ return ReplaceInstUsesWith(I, RHS);
+ }
+ break;
+ case ICmpInst::ICMP_NE:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X != 13 | X == 15) -> X != 13
+ case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13
+ case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true
+ case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true
+ case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ }
+ break;
+ case ICmpInst::ICMP_ULT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
+ break;
+ case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
+ // If RHSCst is UMAXINT, "X u> UMAXINT" is always false. Not handling
+ // this can cause overflow.
+ if (RHSCst->isMaxValue(false))
+ return ReplaceInstUsesWith(I, LHS);
+ return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), false, false, I);
+ case ICmpInst::ICMP_SGT: // (X u< 13 | X s> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_SLT: // (X u< 13 | X s< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_SLT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
+ break;
+ case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2
+ // If RHSCst is SMAXINT, "X s> SMAXINT" is always false. Not handling
+ // this can cause overflow.
+ if (RHSCst->isMaxValue(true))
+ return ReplaceInstUsesWith(I, LHS);
+ return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), true, false, I);
+ case ICmpInst::ICMP_UGT: // (X s< 13 | X u> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_ULT: // (X s< 13 | X u< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_UGT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u> 13 | X == 15) -> X u> 13
+ case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_SGT: // (X u> 13 | X s> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true
+ case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s> 13 | X == 15) -> X > 13
+ case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_UGT: // (X s> 13 | X u> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true
+ case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change
+ break;
+ }
+ break;
+ }
+ return 0;
+}
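+
+ // Worked example for the equality case above (illustrative): in
+ // (X == 13 | X == 14), the add X, -13 yields 0 for X == 13, 1 for
+ // X == 14, and (with wrapping) at least 2 for anything else, so the
+ // whole 'or' becomes icmp ult (add X, -13), 2.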
+
+/// FoldOrWithConstants - This helper function folds:
+///
+/// ((A | B) & C1) | (B & C2)
+///
+/// into:
+///
+/// (A & C1) | B
+///
+/// when the XOR of the two constants is "all ones" (-1).
+Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op,
+ Value *A, Value *B, Value *C) {
+ ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
+ if (!CI1) return 0;
+
+ Value *V1 = 0;
+ ConstantInt *CI2 = 0;
+ if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return 0;
+
+ APInt Xor = CI1->getValue() ^ CI2->getValue();
+ if (!Xor.isAllOnesValue()) return 0;
+
+ if (V1 == A || V1 == B) {
+ Instruction *NewOp =
+ InsertNewInstBefore(BinaryOperator::CreateAnd((V1 == A) ? B : A, CI1), I);
+ return BinaryOperator::CreateOr(NewOp, V1);
+ }
+
+ return 0;
+}
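+
+ // For example (illustrative): with C1 == 1 and C2 == -2 (1 ^ -2 == -1),
+ // ((A | B) & 1) | (B & -2) keeps bit 0 from A|B and all other bits from
+ // B, which is exactly (A & 1) | B.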
+
+Instruction *InstCombiner::visitOr(BinaryOperator &I) {
+ bool Changed = SimplifyCommutative(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (isa<UndefValue>(Op1)) // X | undef -> -1
+ return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+ // or X, X = X
+ if (Op0 == Op1)
+ return ReplaceInstUsesWith(I, Op0);
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (!isa<VectorType>(I.getType())) {
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+ } else if (isa<ConstantAggregateZero>(Op1)) {
+ return ReplaceInstUsesWith(I, Op0); // X | <0,0> -> X
+ } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
+ if (CP->isAllOnesValue()) // X | <-1,-1> -> <-1,-1>
+ return ReplaceInstUsesWith(I, I.getOperand(1));
+ }
+
+ // or X, -1 == -1; the cases below fold (X & C1) | C2 and (X ^ C1) | C2.
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ ConstantInt *C1 = 0; Value *X = 0;
+ // (X & C1) | C2 --> (X | C2) & (C1|C2)
+ if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
+ Instruction *Or = BinaryOperator::CreateOr(X, RHS);
+ InsertNewInstBefore(Or, I);
+ Or->takeName(Op0);
+ return BinaryOperator::CreateAnd(Or,
+ ConstantInt::get(RHS->getValue() | C1->getValue()));
+ }
+
+ // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
+ if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
+ Instruction *Or = BinaryOperator::CreateOr(X, RHS);
+ InsertNewInstBefore(Or, I);
+ Or->takeName(Op0);
+ return BinaryOperator::CreateXor(Or,
+ ConstantInt::get(C1->getValue() & ~RHS->getValue()));
+ }
+
+ // Try to fold constant and into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+ if (isa<PHINode>(Op0))
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ }
+
+ Value *A = 0, *B = 0;
+ ConstantInt *C1 = 0, *C2 = 0;
+
+ if (match(Op0, m_And(m_Value(A), m_Value(B))))
+ if (A == Op1 || B == Op1) // (A & ?) | A --> A
+ return ReplaceInstUsesWith(I, Op1);
+ if (match(Op1, m_And(m_Value(A), m_Value(B))))
+ if (A == Op0 || B == Op0) // A | (A & ?) --> A
+ return ReplaceInstUsesWith(I, Op0);
+
+ // (A | B) | C and A | (B | C) -> bswap if possible.
+ // (A >> B) | (C << D) and (A << B) | (C >> D) -> bswap if possible.
+ if (match(Op0, m_Or(m_Value(), m_Value())) ||
+ match(Op1, m_Or(m_Value(), m_Value())) ||
+ (match(Op0, m_Shift(m_Value(), m_Value())) &&
+ match(Op1, m_Shift(m_Value(), m_Value())))) {
+ if (Instruction *BSwap = MatchBSwap(I))
+ return BSwap;
+ }
+
+ // (X^C)|Y -> (X|Y)^C iff Y&C == 0
+ if (Op0->hasOneUse() && match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
+ MaskedValueIsZero(Op1, C1->getValue())) {
+ Instruction *NOr = BinaryOperator::CreateOr(A, Op1);
+ InsertNewInstBefore(NOr, I);
+ NOr->takeName(Op0);
+ return BinaryOperator::CreateXor(NOr, C1);
+ }
+
+ // Y|(X^C) -> (X|Y)^C iff Y&C == 0
+ if (Op1->hasOneUse() && match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
+ MaskedValueIsZero(Op0, C1->getValue())) {
+ Instruction *NOr = BinaryOperator::CreateOr(A, Op0);
+ InsertNewInstBefore(NOr, I);
+ NOr->takeName(Op0);
+ return BinaryOperator::CreateXor(NOr, C1);
+ }
+
+ // (A & C)|(B & D)
+ Value *C = 0, *D = 0;
+ if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+ match(Op1, m_And(m_Value(B), m_Value(D)))) {
+ Value *V1 = 0, *V2 = 0, *V3 = 0;
+ C1 = dyn_cast<ConstantInt>(C);
+ C2 = dyn_cast<ConstantInt>(D);
+ if (C1 && C2) { // (A & C1)|(B & C2)
+ // If we have: ((V + N) & C1) | (V & C2)
+ // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+ // replace with V+N.
+ if (C1->getValue() == ~C2->getValue()) {
+ if ((C2->getValue() & (C2->getValue()+1)) == 0 && // C2 == 0+1+
+ match(A, m_Add(m_Value(V1), m_Value(V2)))) {
+ // Add commutes, try both ways.
+ if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
+ return ReplaceInstUsesWith(I, A);
+ if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
+ return ReplaceInstUsesWith(I, A);
+ }
+ // Or commutes, try both ways.
+ if ((C1->getValue() & (C1->getValue()+1)) == 0 &&
+ match(B, m_Add(m_Value(V1), m_Value(V2)))) {
+ // Add commutes, try both ways.
+ if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
+ return ReplaceInstUsesWith(I, B);
+ if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
+ return ReplaceInstUsesWith(I, B);
+ }
+ }
+ V1 = 0; V2 = 0; V3 = 0;
+ }
+
+ // Check to see if we have any common things being and'ed. If so, find the
+ // terms for V1 & (V2|V3).
+ if (isOnlyUse(Op0) || isOnlyUse(Op1)) {
+ if (A == B) // (A & C)|(A & D) == A & (C|D)
+ V1 = A, V2 = C, V3 = D;
+ else if (A == D) // (A & C)|(B & A) == A & (B|C)
+ V1 = A, V2 = B, V3 = C;
+ else if (C == B) // (A & C)|(C & D) == C & (A|D)
+ V1 = C, V2 = A, V3 = D;
+ else if (C == D) // (A & C)|(B & C) == C & (A|B)
+ V1 = C, V2 = A, V3 = B;
+
+ if (V1) {
+ Value *Or =
+ InsertNewInstBefore(BinaryOperator::CreateOr(V2, V3, "tmp"), I);
+ return BinaryOperator::CreateAnd(V1, Or);
+ }
+ }
+
+ // (A & (C0?-1:0)) | (B & ~(C0?-1:0)) -> C0 ? A : B, and commuted variants
+ if (Instruction *Match = MatchSelectFromAndOr(A, B, C, D))
+ return Match;
+ if (Instruction *Match = MatchSelectFromAndOr(B, A, D, C))
+ return Match;
+ if (Instruction *Match = MatchSelectFromAndOr(C, B, A, D))
+ return Match;
+ if (Instruction *Match = MatchSelectFromAndOr(D, A, B, C))
+ return Match;
+
+ // ((A&~B)|(~A&B)) -> A^B
+ if ((match(C, m_Not(m_Specific(D))) &&
+ match(B, m_Not(m_Specific(A)))))
+ return BinaryOperator::CreateXor(A, D);
+ // ((~B&A)|(~A&B)) -> A^B
+ if ((match(A, m_Not(m_Specific(D))) &&
+ match(B, m_Not(m_Specific(C)))))
+ return BinaryOperator::CreateXor(C, D);
+ // ((A&~B)|(B&~A)) -> A^B
+ if ((match(C, m_Not(m_Specific(B))) &&
+ match(D, m_Not(m_Specific(A)))))
+ return BinaryOperator::CreateXor(A, B);
+ // ((~B&A)|(B&~A)) -> A^B
+ if ((match(A, m_Not(m_Specific(B))) &&
+ match(D, m_Not(m_Specific(C)))))
+ return BinaryOperator::CreateXor(C, B);
+ }
+
+ // (X >> Z) | (Y >> Z) -> (X|Y) >> Z for all shifts.
+ if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+ if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+ if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
+ SI0->getOperand(1) == SI1->getOperand(1) &&
+ (SI0->hasOneUse() || SI1->hasOneUse())) {
+ Instruction *NewOp =
+ InsertNewInstBefore(BinaryOperator::CreateOr(SI0->getOperand(0),
+ SI1->getOperand(0),
+ SI0->getName()), I);
+ return BinaryOperator::Create(SI1->getOpcode(), NewOp,
+ SI1->getOperand(1));
+ }
+ }
+
+ // ((A|B)&1)|(B&-2) -> (A&1) | B
+ if (match(Op0, m_And(m_Or(m_Value(A), m_Value(B)), m_Value(C))) ||
+ match(Op0, m_And(m_Value(C), m_Or(m_Value(A), m_Value(B))))) {
+ Instruction *Ret = FoldOrWithConstants(I, Op1, A, B, C);
+ if (Ret) return Ret;
+ }
+ // (B&-2)|((A|B)&1) -> (A&1) | B
+ if (match(Op1, m_And(m_Or(m_Value(A), m_Value(B)), m_Value(C))) ||
+ match(Op1, m_And(m_Value(C), m_Or(m_Value(A), m_Value(B))))) {
+ Instruction *Ret = FoldOrWithConstants(I, Op0, A, B, C);
+ if (Ret) return Ret;
+ }
+
+ if (match(Op0, m_Not(m_Value(A)))) { // ~A | Op1
+ if (A == Op1) // ~A | A == -1
+ return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+ } else {
+ A = 0;
+ }
+ // Note, A is still live here!
+ if (match(Op1, m_Not(m_Value(B)))) { // Op0 | ~B
+ if (Op0 == B)
+ return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+ // (~A | ~B) == (~(A & B)) - De Morgan's Law
+ if (A && isOnlyUse(Op0) && isOnlyUse(Op1)) {
+ Value *And = InsertNewInstBefore(BinaryOperator::CreateAnd(A, B,
+ I.getName()+".demorgan"), I);
+ return BinaryOperator::CreateNot(And);
+ }
+ }
+
+ // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
+ if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) {
+ if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+ return R;
+
+ if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+ if (Instruction *Res = FoldOrOfICmps(I, LHS, RHS))
+ return Res;
+ }
+
+ // fold (or (cast A), (cast B)) -> (cast (or A, B))
+ if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+ if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+ if (Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ?
+ if (!isa<ICmpInst>(Op0C->getOperand(0)) ||
+ !isa<ICmpInst>(Op1C->getOperand(0))) {
+ const Type *SrcTy = Op0C->getOperand(0)->getType();
+ if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+ // Only do this if the casts both really cause code to be
+ // generated.
+ ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0),
+ I.getType(), TD) &&
+ ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0),
+ I.getType(), TD)) {
+ Instruction *NewOp = BinaryOperator::CreateOr(Op0C->getOperand(0),
+ Op1C->getOperand(0),
+ I.getName());
+ InsertNewInstBefore(NewOp, I);
+ return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+ }
+ }
+ }
+ }
+
+ // (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y)
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) {
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) {
+ if (LHS->getPredicate() == FCmpInst::FCMP_UNO &&
+ RHS->getPredicate() == FCmpInst::FCMP_UNO &&
+ LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) {
+ if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
+ if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
+ // If either of the constants are nans, then the whole thing returns
+ // true.
+ if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+
+ // Otherwise, no need to compare the two constants, compare the
+ // rest.
+ return new FCmpInst(FCmpInst::FCMP_UNO, LHS->getOperand(0),
+ RHS->getOperand(0));
+ }
+ } else {
+ Value *Op0LHS, *Op0RHS, *Op1LHS, *Op1RHS;
+ FCmpInst::Predicate Op0CC, Op1CC;
+ if (match(Op0, m_FCmp(Op0CC, m_Value(Op0LHS), m_Value(Op0RHS))) &&
+ match(Op1, m_FCmp(Op1CC, m_Value(Op1LHS), m_Value(Op1RHS)))) {
+ if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+ // Swap RHS operands to match LHS.
+ Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+ std::swap(Op1LHS, Op1RHS);
+ }
+ if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
+ // Simplify (fcmp cc0 x, y) | (fcmp cc1 x, y).
+ if (Op0CC == Op1CC)
+ return new FCmpInst((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS);
+ else if (Op0CC == FCmpInst::FCMP_TRUE ||
+ Op1CC == FCmpInst::FCMP_TRUE)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ else if (Op0CC == FCmpInst::FCMP_FALSE)
+ return ReplaceInstUsesWith(I, Op1);
+ else if (Op1CC == FCmpInst::FCMP_FALSE)
+ return ReplaceInstUsesWith(I, Op0);
+ bool Op0Ordered;
+ bool Op1Ordered;
+ unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
+ unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
+ if (Op0Ordered == Op1Ordered) {
+ // If both are ordered or unordered, return a new fcmp with
+ // or'ed predicates.
+ Value *RV = getFCmpValue(Op0Ordered, Op0Pred|Op1Pred,
+ Op0LHS, Op0RHS);
+ if (Instruction *I = dyn_cast<Instruction>(RV))
+ return I;
+ // Otherwise, it's a constant boolean value...
+ return ReplaceInstUsesWith(I, RV);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return Changed ? &I : 0;
+}
+
+namespace {
+
+// XorSelf - Implements: X ^ X --> 0
+struct XorSelf {
+ Value *RHS;
+ XorSelf(Value *rhs) : RHS(rhs) {}
+ bool shouldApply(Value *LHS) const { return LHS == RHS; }
+ Instruction *apply(BinaryOperator &Xor) const {
+ return &Xor;
+ }
+};
+
+}
+
+Instruction *InstCombiner::visitXor(BinaryOperator &I) {
+ bool Changed = SimplifyCommutative(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (isa<UndefValue>(Op1)) {
+ if (isa<UndefValue>(Op0))
+ // Handle undef ^ undef -> 0 special case. This is a common
+ // idiom (misuse).
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ return ReplaceInstUsesWith(I, Op1); // X ^ undef -> undef
+ }
+
+ // xor X, X = 0, even if X is nested in a sequence of Xor's.
+ if (Instruction *Result = AssociativeOpt(I, XorSelf(Op1))) {
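+ // The self-assignment below only exists to silence an unused-variable
+ // warning when asserts are compiled out (NDEBUG builds).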
+ assert(Result == &I && "AssociativeOpt didn't work?"); Result=Result;
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ }
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (!isa<VectorType>(I.getType())) {
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+ } else if (isa<ConstantAggregateZero>(Op1)) {
+ return ReplaceInstUsesWith(I, Op0); // X ^ <0,0> -> X
+ }
+
+ // Is this a ~ operation?
+ if (Value *NotOp = dyn_castNotVal(&I)) {
+ // ~(~X & Y) --> (X | ~Y) - De Morgan's Law
+ // ~(~X | Y) --> (X & ~Y) - De Morgan's Law
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) {
+ if (Op0I->getOpcode() == Instruction::And ||
+ Op0I->getOpcode() == Instruction::Or) {
+ if (dyn_castNotVal(Op0I->getOperand(1))) Op0I->swapOperands();
+ if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) {
+ Instruction *NotY =
+ BinaryOperator::CreateNot(Op0I->getOperand(1),
+ Op0I->getOperand(1)->getName()+".not");
+ InsertNewInstBefore(NotY, I);
+ if (Op0I->getOpcode() == Instruction::And)
+ return BinaryOperator::CreateOr(Op0NotVal, NotY);
+ else
+ return BinaryOperator::CreateAnd(Op0NotVal, NotY);
+ }
+ }
+ }
+ }
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ if (RHS == ConstantInt::getTrue() && Op0->hasOneUse()) {
+ // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Op0))
+ return new ICmpInst(ICI->getInversePredicate(),
+ ICI->getOperand(0), ICI->getOperand(1));
+
+ if (FCmpInst *FCI = dyn_cast<FCmpInst>(Op0))
+ return new FCmpInst(FCI->getInversePredicate(),
+ FCI->getOperand(0), FCI->getOperand(1));
+ }
+
+ // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
+ if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+ if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
+ if (CI->hasOneUse() && Op0C->hasOneUse()) {
+ Instruction::CastOps Opcode = Op0C->getOpcode();
+ if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+ if (RHS == ConstantExpr::getCast(Opcode, ConstantInt::getTrue(),
+ Op0C->getDestTy())) {
+ Instruction *NewCI = InsertNewInstBefore(CmpInst::Create(
+ CI->getOpcode(), CI->getInversePredicate(),
+ CI->getOperand(0), CI->getOperand(1)), I);
+ NewCI->takeName(CI);
+ return CastInst::Create(Opcode, NewCI, Op0C->getType());
+ }
+ }
+ }
+ }
+ }
+
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+ // ~(c-X) == X-c-1 == X+(-c-1)
+ if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue())
+ if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) {
+ Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
+ Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C,
+ ConstantInt::get(I.getType(), 1));
+ return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS);
+ }
+
+ if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+ if (Op0I->getOpcode() == Instruction::Add) {
+ // ~(X+c) --> (-c-1)-X
+ if (RHS->isAllOnesValue()) {
+ Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
+ return BinaryOperator::CreateSub(
+ ConstantExpr::getSub(NegOp0CI,
+ ConstantInt::get(I.getType(), 1)),
+ Op0I->getOperand(0));
+ } else if (RHS->getValue().isSignBit()) {
+ // (X + C) ^ signbit -> (X + C + signbit)
+ Constant *C = ConstantInt::get(RHS->getValue() + Op0CI->getValue());
+ return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
+
+ }
+ } else if (Op0I->getOpcode() == Instruction::Or) {
+ // (X|C1)^C2 -> X^(C1^C2) iff (X&C1) == 0
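+ // E.g. (illustrative): if X's two low bits are known zero, then
+ // (X|3)^5 == X^6, i.e. X^(C1^C2).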
+ if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue())) {
+ Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+ // Anything in both C1 and C2 is known to be zero, remove it from
+ // NewRHS.
+ Constant *CommonBits = And(Op0CI, RHS);
+ NewRHS = ConstantExpr::getAnd(NewRHS,
+ ConstantExpr::getNot(CommonBits));
+ AddToWorkList(Op0I);
+ I.setOperand(0, Op0I->getOperand(0));
+ I.setOperand(1, NewRHS);
+ return &I;
+ }
+ }
+ }
+ }
+
+ // Try to fold constant and into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+ if (isa<PHINode>(Op0))
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ }
+
+ if (Value *X = dyn_castNotVal(Op0)) // ~A ^ A == -1
+ if (X == Op1)
+ return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+ if (Value *X = dyn_castNotVal(Op1)) // A ^ ~A == -1
+ if (X == Op0)
+ return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+ BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1);
+ if (Op1I) {
+ Value *A, *B;
+ if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
+ if (A == Op0) { // B^(B|A) == (A|B)^B
+ Op1I->swapOperands();
+ I.swapOperands();
+ std::swap(Op0, Op1);
+ } else if (B == Op0) { // B^(A|B) == (A|B)^B
+ I.swapOperands(); // Simplified below.
+ std::swap(Op0, Op1);
+ }
+ } else if (match(Op1I, m_Xor(m_Specific(Op0), m_Value(B)))) {
+ return ReplaceInstUsesWith(I, B); // A^(A^B) == B
+ } else if (match(Op1I, m_Xor(m_Value(A), m_Specific(Op0)))) {
+ return ReplaceInstUsesWith(I, A); // A^(B^A) == B
+ } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) && Op1I->hasOneUse()){
+ if (A == Op0) { // A^(A&B) -> A^(B&A)
+ Op1I->swapOperands();
+ std::swap(A, B);
+ }
+ if (B == Op0) { // A^(B&A) -> (B&A)^A
+ I.swapOperands(); // Simplified below.
+ std::swap(Op0, Op1);
+ }
+ }
+ }
+
+ BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0);
+ if (Op0I) {
+ Value *A, *B;
+ if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && Op0I->hasOneUse()) {
+ if (A == Op1) // (B|A)^B == (A|B)^B
+ std::swap(A, B);
+ if (B == Op1) { // (A|B)^B == A & ~B
+ Instruction *NotB =
+ InsertNewInstBefore(BinaryOperator::CreateNot(Op1, "tmp"), I);
+ return BinaryOperator::CreateAnd(A, NotB);
+ }
+ } else if (match(Op0I, m_Xor(m_Specific(Op1), m_Value(B)))) {
+ return ReplaceInstUsesWith(I, B); // (A^B)^A == B
+ } else if (match(Op0I, m_Xor(m_Value(A), m_Specific(Op1)))) {
+ return ReplaceInstUsesWith(I, A); // (B^A)^A == B
+ } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && Op0I->hasOneUse()){
+ if (A == Op1) // (A&B)^A -> (B&A)^A
+ std::swap(A, B);
+ if (B == Op1 && // (B&A)^A == ~B & A
+ !isa<ConstantInt>(Op1)) { // Canonical form is (B&C)^C
+ Instruction *N =
+ InsertNewInstBefore(BinaryOperator::CreateNot(A, "tmp"), I);
+ return BinaryOperator::CreateAnd(N, Op1);
+ }
+ }
+ }
+
+ // (X >> Z) ^ (Y >> Z) -> (X^Y) >> Z for all shifts.
+ if (Op0I && Op1I && Op0I->isShift() &&
+ Op0I->getOpcode() == Op1I->getOpcode() &&
+ Op0I->getOperand(1) == Op1I->getOperand(1) &&
+ (Op0I->hasOneUse() || Op1I->hasOneUse())) {
+ Instruction *NewOp =
+ InsertNewInstBefore(BinaryOperator::CreateXor(Op0I->getOperand(0),
+ Op1I->getOperand(0),
+ Op0I->getName()), I);
+ return BinaryOperator::Create(Op1I->getOpcode(), NewOp,
+ Op1I->getOperand(1));
+ }
+
+ if (Op0I && Op1I) {
+ Value *A, *B, *C, *D;
+ // (A & B)^(A | B) -> A ^ B
+ if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
+ if ((A == C && B == D) || (A == D && B == C))
+ return BinaryOperator::CreateXor(A, B);
+ }
+ // (A | B)^(A & B) -> A ^ B
+ if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+ if ((A == C && B == D) || (A == D && B == C))
+ return BinaryOperator::CreateXor(A, B);
+ }
+
+ // (A & B)^(C & D)
+ if ((Op0I->hasOneUse() || Op1I->hasOneUse()) &&
+ match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+ // (X & Y)^(X & Z) -> (Y^Z) & X
+ Value *X = 0, *Y = 0, *Z = 0;
+ if (A == C)
+ X = A, Y = B, Z = D;
+ else if (A == D)
+ X = A, Y = B, Z = C;
+ else if (B == C)
+ X = B, Y = A, Z = D;
+ else if (B == D)
+ X = B, Y = A, Z = C;
+
+ if (X) {
+ Instruction *NewOp =
+ InsertNewInstBefore(BinaryOperator::CreateXor(Y, Z, Op0->getName()), I);
+ return BinaryOperator::CreateAnd(NewOp, X);
+ }
+ }
+ }
+
+ // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+ if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+ if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+ return R;
+
+ // fold (xor (cast A), (cast B)) -> (cast (xor A, B))
+ if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+ if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+ if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind?
+ const Type *SrcTy = Op0C->getOperand(0)->getType();
+ if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+ // Only do this if the casts both really cause code to be generated.
+ ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0),
+ I.getType(), TD) &&
+ ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0),
+ I.getType(), TD)) {
+ Instruction *NewOp = BinaryOperator::CreateXor(Op0C->getOperand(0),
+ Op1C->getOperand(0),
+ I.getName());
+ InsertNewInstBefore(NewOp, I);
+ return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+ }
+ }
+ }
+
+ return Changed ? &I : 0;
+}
+
+/// AddWithOverflow - Compute Result = In1+In2, returning true if the result
+/// overflowed for this type.
+static bool AddWithOverflow(ConstantInt *&Result, ConstantInt *In1,
+ ConstantInt *In2, bool IsSigned = false) {
+ Result = cast<ConstantInt>(Add(In1, In2));
+
+ if (IsSigned) {
+ if (In2->getValue().isNegative())
+ return Result->getValue().sgt(In1->getValue());
+ return Result->getValue().slt(In1->getValue());
+ }
+ return Result->getValue().ult(In1->getValue());
+}
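+
+ // E.g. (illustrative, i8): 100 + 50 wraps to -106; In2 is positive and
+ // -106 s< 100, so signed overflow is reported.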
+
+/// SubWithOverflow - Compute Result = In1-In2, returning true if the result
+/// overflowed for this type.
+static bool SubWithOverflow(ConstantInt *&Result, ConstantInt *In1,
+ ConstantInt *In2, bool IsSigned = false) {
+ Result = cast<ConstantInt>(Subtract(In1, In2));
+
+ if (IsSigned) {
+ if (In2->getValue().isNegative())
+ return Result->getValue().slt(In1->getValue());
+ return Result->getValue().sgt(In1->getValue());
+ }
+ return Result->getValue().ugt(In1->getValue());
+}
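+
+ // E.g. (illustrative, i8): -100 - 50 wraps to 106; In2 is positive and
+ // 106 s> -100, so signed overflow is reported.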
+
+/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the
+/// code necessary to compute the offset from the base pointer (without adding
+/// in the base pointer). Return the result as a signed integer of intptr size.
+static Value *EmitGEPOffset(User *GEP, Instruction &I, InstCombiner &IC) {
+ TargetData &TD = IC.getTargetData();
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ const Type *IntPtrTy = TD.getIntPtrType();
+ Value *Result = Constant::getNullValue(IntPtrTy);
+
+ // Build a mask for high order bits.
+ unsigned IntPtrWidth = TD.getPointerSizeInBits();
+ uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+
+ for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
+ ++i, ++GTI) {
+ Value *Op = *i;
+ uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
+ if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
+ if (OpC->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+
+ if (ConstantInt *RC = dyn_cast<ConstantInt>(Result))
+ Result = ConstantInt::get(RC->getValue() + APInt(IntPtrWidth, Size));
+ else
+ Result = IC.InsertNewInstBefore(
+ BinaryOperator::CreateAdd(Result,
+ ConstantInt::get(IntPtrTy, Size),
+ GEP->getName()+".offs"), I);
+ continue;
+ }
+
+ Constant *Scale = ConstantInt::get(IntPtrTy, Size);
+ Constant *OC = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/);
+ Scale = ConstantExpr::getMul(OC, Scale);
+ if (Constant *RC = dyn_cast<Constant>(Result))
+ Result = ConstantExpr::getAdd(RC, Scale);
+ else {
+ // Emit an add instruction.
+ Result = IC.InsertNewInstBefore(
+ BinaryOperator::CreateAdd(Result, Scale,
+ GEP->getName()+".offs"), I);
+ }
+ continue;
+ }
+ // Convert to correct type.
+ if (Op->getType() != IntPtrTy) {
+ if (Constant *OpC = dyn_cast<Constant>(Op))
+ Op = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true);
+ else
+ Op = IC.InsertNewInstBefore(CastInst::CreateIntegerCast(Op, IntPtrTy,
+ true,
+ Op->getName()+".c"), I);
+ }
+ if (Size != 1) {
+ Constant *Scale = ConstantInt::get(IntPtrTy, Size);
+ if (Constant *OpC = dyn_cast<Constant>(Op))
+ Op = ConstantExpr::getMul(OpC, Scale);
+ else // We'll let instcombine(mul) convert this to a shl if possible.
+ Op = IC.InsertNewInstBefore(BinaryOperator::CreateMul(Op, Scale,
+ GEP->getName()+".idx"), I);
+ }
+
+ // Emit an add instruction.
+ if (isa<Constant>(Op) && isa<Constant>(Result))
+ Result = ConstantExpr::getAdd(cast<Constant>(Op),
+ cast<Constant>(Result));
+ else
+ Result = IC.InsertNewInstBefore(BinaryOperator::CreateAdd(Op, Result,
+ GEP->getName()+".offs"), I);
+ }
+ return Result;
+}
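+
+ // A sketch of the emitted code (illustrative, assuming i32 elements and
+ // 32-bit pointers): for "getelementptr [10 x i32]* %A, i32 0, i32 %i" the
+ // offset is materialized as "mul i32 %i, 4" plus an add of the constant
+ // part, which later instcombine iterations can fold further.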
+
+
+/// EvaluateGEPOffsetExpression - Return a value that can be used to compare
+/// the *offset* implied by a GEP to zero. For example, if we have &A[i], we
+/// want to return 'i' for "icmp ne i, 0". Note that, in general, indices can
+/// be complex, and scales are involved. The above expression would also be
+/// legal to codegen as "icmp ne (i*4), 0" (assuming A is a pointer to i32).
+/// This latter form is less amenable to optimization though, and we are
+/// allowed to generate the first by knowing that pointer arithmetic doesn't
+/// overflow.
+///
+/// If we can't emit an optimized form for this expression, this returns null.
+///
+static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
+ InstCombiner &IC) {
+ TargetData &TD = IC.getTargetData();
+ gep_type_iterator GTI = gep_type_begin(GEP);
+
+ // Check to see if this gep only has a single variable index. If so, and if
+ // any constant indices are a multiple of its scale, then we can compute this
+ // in terms of the scale of the variable index. For example, if the GEP
+ // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
+ // because the expression will cross zero at the same point.
+ unsigned i, e = GEP->getNumOperands();
+ int64_t Offset = 0;
+ for (i = 1; i != e; ++i, ++GTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+ // Compute the aggregate offset of constant indices.
+ if (CI->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+ } else {
+ uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*CI->getSExtValue();
+ }
+ } else {
+ // Found our variable index.
+ break;
+ }
+ }
+
+ // If there are no variable indices, we must have a constant offset, just
+ // evaluate it the general way.
+ if (i == e) return 0;
+
+ Value *VariableIdx = GEP->getOperand(i);
+ // Determine the scale factor of the variable element. For example, this is
+ // 4 if the variable index is into an array of i32.
+ uint64_t VariableScale = TD.getTypeAllocSize(GTI.getIndexedType());
+
+ // Verify that there are no other variable indices. If there are, give up
+ // and let the caller emit the offset the hard way.
+ for (++i, ++GTI; i != e; ++i, ++GTI) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!CI) return 0;
+
+ // Compute the aggregate offset of constant indices.
+ if (CI->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+ } else {
+ uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*CI->getSExtValue();
+ }
+ }
+
+ // Okay, we know we have a single variable index, which must be a
+ // pointer/array/vector index. If there is no offset, life is simple, return
+ // the index.
+ unsigned IntPtrWidth = TD.getPointerSizeInBits();
+ if (Offset == 0) {
+    // Cast to the intptr type in case a truncation occurs. If an extension is
+    // needed, we don't need to bother extending: the extension won't affect
+    // where the computation crosses zero.
+ if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth)
+ VariableIdx = new TruncInst(VariableIdx, TD.getIntPtrType(),
+ VariableIdx->getNameStart(), &I);
+ return VariableIdx;
+ }
+
+  // Otherwise, there is a constant offset. The computation we will do will be
+  // modulo the pointer size, so get that size.
+ uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+
+ Offset &= PtrSizeMask;
+ VariableScale &= PtrSizeMask;
+
+ // To do this transformation, any constant index must be a multiple of the
+ // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
+ // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
+ // multiple of the variable scale.
+ int64_t NewOffs = Offset / (int64_t)VariableScale;
+ if (Offset != NewOffs*(int64_t)VariableScale)
+ return 0;
+
+ // Okay, we can do this evaluation. Start by converting the index to intptr.
+ const Type *IntPtrTy = TD.getIntPtrType();
+ if (VariableIdx->getType() != IntPtrTy)
+ VariableIdx = CastInst::CreateIntegerCast(VariableIdx, IntPtrTy,
+ true /*SExt*/,
+ VariableIdx->getNameStart(), &I);
+ Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
+ return BinaryOperator::CreateAdd(VariableIdx, OffsetVal, "offset", &I);
+}
+
+
+/// FoldGEPICmp - Fold comparisons between a GEP instruction and something
+/// else. At this point we know that the GEP is on the LHS of the comparison.
+Instruction *InstCombiner::FoldGEPICmp(User *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond,
+ Instruction &I) {
+ assert(dyn_castGetElementPtr(GEPLHS) && "LHS is not a getelementptr!");
+
+ // Look through bitcasts.
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(RHS))
+ RHS = BCI->getOperand(0);
+
+ Value *PtrBase = GEPLHS->getOperand(0);
+ if (PtrBase == RHS) {
+ // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
+ // This transformation (ignoring the base and scales) is valid because we
+ // know pointers can't overflow. See if we can output an optimized form.
+ Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, I, *this);
+
+ // If not, synthesize the offset the hard way.
+ if (Offset == 0)
+ Offset = EmitGEPOffset(GEPLHS, I, *this);
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
+ Constant::getNullValue(Offset->getType()));
+ } else if (User *GEPRHS = dyn_castGetElementPtr(RHS)) {
+ // If the base pointers are different, but the indices are the same, just
+ // compare the base pointer.
+ if (PtrBase != GEPRHS->getOperand(0)) {
+ bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
+ IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
+ GEPRHS->getOperand(0)->getType();
+ if (IndicesTheSame)
+ for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+ if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+ IndicesTheSame = false;
+ break;
+ }
+
+ // If all indices are the same, just compare the base pointers.
+ if (IndicesTheSame)
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond),
+ GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+
+ // Otherwise, the base pointers are different and the indices are
+ // different, bail out.
+ return 0;
+ }
+
+ // If one of the GEPs has all zero indices, recurse.
+ bool AllZeros = true;
+ for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+ if (!isa<Constant>(GEPLHS->getOperand(i)) ||
+ !cast<Constant>(GEPLHS->getOperand(i))->isNullValue()) {
+ AllZeros = false;
+ break;
+ }
+ if (AllZeros)
+ return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+ ICmpInst::getSwappedPredicate(Cond), I);
+
+ // If the other GEP has all zero indices, recurse.
+ AllZeros = true;
+ for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+ if (!isa<Constant>(GEPRHS->getOperand(i)) ||
+ !cast<Constant>(GEPRHS->getOperand(i))->isNullValue()) {
+ AllZeros = false;
+ break;
+ }
+ if (AllZeros)
+ return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+
+ if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+ // If the GEPs only differ by one index, compare it.
+ unsigned NumDifferences = 0; // Keep track of # differences.
+ unsigned DiffOperand = 0; // The operand that differs.
+ for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+ if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+ if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() !=
+ GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) {
+ // Irreconcilable differences.
+ NumDifferences = 2;
+ break;
+ } else {
+ if (NumDifferences++) break;
+ DiffOperand = i;
+ }
+ }
+
+ if (NumDifferences == 0) // SAME GEP?
+ return ReplaceInstUsesWith(I, // No comparison is needed here.
+ ConstantInt::get(Type::Int1Ty,
+ ICmpInst::isTrueWhenEqual(Cond)));
+
+ else if (NumDifferences == 1) {
+ Value *LHSV = GEPLHS->getOperand(DiffOperand);
+ Value *RHSV = GEPRHS->getOperand(DiffOperand);
+ // Make sure we do a signed comparison here.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+ }
+ }
+
+ // Only lower this if the icmp is the only user of the GEP or if we expect
+ // the result to fold to a constant!
+ if ((isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+ (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
+      // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)) ---> (OFFSET1 cmp OFFSET2)
+ Value *L = EmitGEPOffset(GEPLHS, I, *this);
+ Value *R = EmitGEPOffset(GEPRHS, I, *this);
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+ }
+ }
+ return 0;
+}
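+// Illustrative example of the fold above (an editor's sketch, not from the
+// patch): when two GEPs share a base pointer and differ in a single index,
+//   icmp ult (gep i32* %P, i64 %i), (gep i32* %P, i64 %j)
+// becomes a signed compare of the indices:
+//   icmp slt i64 %i, %j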
+
+/// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
+///
+Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
+ Instruction *LHSI,
+ Constant *RHSC) {
+ if (!isa<ConstantFP>(RHSC)) return 0;
+ const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
+
+ // Get the width of the mantissa. We don't want to hack on conversions that
+ // might lose information from the integer, e.g. "i64 -> float"
+ int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
+ if (MantissaWidth == -1) return 0; // Unknown.
+
+  // Check that the input is converted from an integer type that is small
+  // enough to preserve all bits. TODO: check here for "known" sign bits.
+  // This would allow us to handle, e.g., (fptosi (x >>s 62) to float) if x
+  // is i64.
+ unsigned InputSize = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
+
+ // If this is a uitofp instruction, we need an extra bit to hold the sign.
+ bool LHSUnsigned = isa<UIToFPInst>(LHSI);
+ if (LHSUnsigned)
+ ++InputSize;
+
+ // If the conversion would lose info, don't hack on this.
+ if ((int)InputSize > MantissaWidth)
+ return 0;
+
+ // Otherwise, we can potentially simplify the comparison. We know that it
+ // will always come through as an integer value and we know the constant is
+  // not a NaN (it would have been previously simplified).
+ assert(!RHS.isNaN() && "NaN comparison not already folded!");
+
+ ICmpInst::Predicate Pred;
+ switch (I.getPredicate()) {
+ default: assert(0 && "Unexpected predicate!");
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_OEQ:
+ Pred = ICmpInst::ICMP_EQ;
+ break;
+ case FCmpInst::FCMP_UGT:
+ case FCmpInst::FCMP_OGT:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
+ break;
+ case FCmpInst::FCMP_UGE:
+ case FCmpInst::FCMP_OGE:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
+ break;
+ case FCmpInst::FCMP_ULT:
+ case FCmpInst::FCMP_OLT:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
+ break;
+ case FCmpInst::FCMP_ULE:
+ case FCmpInst::FCMP_OLE:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
+ break;
+ case FCmpInst::FCMP_UNE:
+ case FCmpInst::FCMP_ONE:
+ Pred = ICmpInst::ICMP_NE;
+ break;
+ case FCmpInst::FCMP_ORD:
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ case FCmpInst::FCMP_UNO:
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ }
+
+ const IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+
+ // Now we know that the APFloat is a normal number, zero or inf.
+
+ // See if the FP constant is too large for the integer. For example,
+ // comparing an i8 to 300.0.
+ unsigned IntWidth = IntTy->getPrimitiveSizeInBits();
+
+ if (!LHSUnsigned) {
+ // If the RHS value is > SignedMax, fold the comparison. This handles +INF
+ // and large values.
+ APFloat SMax(RHS.getSemantics(), APFloat::fcZero, false);
+ SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
+ Pred == ICmpInst::ICMP_SLE)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ }
+ } else {
+ // If the RHS value is > UnsignedMax, fold the comparison. This handles
+ // +INF and large values.
+ APFloat UMax(RHS.getSemantics(), APFloat::fcZero, false);
+ UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
+ APFloat::rmNearestTiesToEven);
+ if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
+ Pred == ICmpInst::ICMP_ULE)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ }
+ }
+
+ if (!LHSUnsigned) {
+ // See if the RHS value is < SignedMin.
+ APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
+ SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
+ Pred == ICmpInst::ICMP_SGE)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ }
+ }
+
+ // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
+ // [0, UMAX], but it may still be fractional. See if it is fractional by
+ // casting the FP value to the integer value and back, checking for equality.
+ // Don't do this for zero, because -0.0 is not fractional.
+ Constant *RHSInt = LHSUnsigned
+ ? ConstantExpr::getFPToUI(RHSC, IntTy)
+ : ConstantExpr::getFPToSI(RHSC, IntTy);
+ if (!RHS.isZero()) {
+ bool Equal = LHSUnsigned
+ ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
+ : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
+ if (!Equal) {
+          // If we had a comparison against a fractional value, we have to
+          // adjust the compare predicate and sometimes the value. RHSInt is
+          // RHSC rounded towards zero at this point.
+ switch (Pred) {
+ default: assert(0 && "Unexpected integer comparison!");
+ case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ case ICmpInst::ICMP_ULE:
+ // (float)int <= 4.4 --> int <= 4
+ // (float)int <= -4.4 --> false
+ if (RHS.isNegative())
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ break;
+ case ICmpInst::ICMP_SLE:
+ // (float)int <= 4.4 --> int <= 4
+ // (float)int <= -4.4 --> int < -4
+ if (RHS.isNegative())
+ Pred = ICmpInst::ICMP_SLT;
+ break;
+ case ICmpInst::ICMP_ULT:
+ // (float)int < -4.4 --> false
+ // (float)int < 4.4 --> int <= 4
+ if (RHS.isNegative())
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ Pred = ICmpInst::ICMP_ULE;
+ break;
+ case ICmpInst::ICMP_SLT:
+ // (float)int < -4.4 --> int < -4
+ // (float)int < 4.4 --> int <= 4
+ if (!RHS.isNegative())
+ Pred = ICmpInst::ICMP_SLE;
+ break;
+ case ICmpInst::ICMP_UGT:
+ // (float)int > 4.4 --> int > 4
+ // (float)int > -4.4 --> true
+ if (RHS.isNegative())
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ break;
+ case ICmpInst::ICMP_SGT:
+ // (float)int > 4.4 --> int > 4
+ // (float)int > -4.4 --> int >= -4
+ if (RHS.isNegative())
+ Pred = ICmpInst::ICMP_SGE;
+ break;
+ case ICmpInst::ICMP_UGE:
+ // (float)int >= -4.4 --> true
+ // (float)int >= 4.4 --> int > 4
+ if (!RHS.isNegative())
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ Pred = ICmpInst::ICMP_UGT;
+ break;
+ case ICmpInst::ICMP_SGE:
+ // (float)int >= -4.4 --> int >= -4
+ // (float)int >= 4.4 --> int > 4
+ if (!RHS.isNegative())
+ Pred = ICmpInst::ICMP_SGT;
+ break;
+ }
+ }
+ }
+
+ // Lower this FP comparison into an appropriate integer version of the
+ // comparison.
+ return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
+}
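+// Illustrative example (an editor's sketch, not from the patch): a float has
+// a 24-bit mantissa, so an i16 input converts losslessly, and
+//   fcmp olt (sitofp i16 %x to float), 4.4
+// becomes, after the fractional-value predicate adjustment above,
+//   icmp sle i16 %x, 4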
+
+Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
+ bool Changed = SimplifyCompare(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Fold trivial predicates.
+ if (I.getPredicate() == FCmpInst::FCMP_FALSE)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ if (I.getPredicate() == FCmpInst::FCMP_TRUE)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+
+ // Simplify 'fcmp pred X, X'
+ if (Op0 == Op1) {
+ switch (I.getPredicate()) {
+ default: assert(0 && "Unknown predicate!");
+ case FCmpInst::FCMP_UEQ: // True if unordered or equal
+ case FCmpInst::FCMP_UGE: // True if unordered, greater than, or equal
+ case FCmpInst::FCMP_ULE: // True if unordered, less than, or equal
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ case FCmpInst::FCMP_OGT: // True if ordered and greater than
+ case FCmpInst::FCMP_OLT: // True if ordered and less than
+ case FCmpInst::FCMP_ONE: // True if ordered and operands are unequal
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+
+ case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
+ case FCmpInst::FCMP_ULT: // True if unordered or less than
+ case FCmpInst::FCMP_UGT: // True if unordered or greater than
+ case FCmpInst::FCMP_UNE: // True if unordered or not equal
+ // Canonicalize these to be 'fcmp uno %X, 0.0'.
+ I.setPredicate(FCmpInst::FCMP_UNO);
+ I.setOperand(1, Constant::getNullValue(Op0->getType()));
+ return &I;
+
+ case FCmpInst::FCMP_ORD: // True if ordered (no nans)
+ case FCmpInst::FCMP_OEQ: // True if ordered and equal
+ case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal
+ case FCmpInst::FCMP_OLE: // True if ordered and less than or equal
+ // Canonicalize these to be 'fcmp ord %X, 0.0'.
+ I.setPredicate(FCmpInst::FCMP_ORD);
+ I.setOperand(1, Constant::getNullValue(Op0->getType()));
+ return &I;
+ }
+ }
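+  // For example (illustrative): "fcmp ult float %x, %x" can only be true if
+  // %x is a NaN, so it canonicalizes to "fcmp uno float %x, 0.0".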
+
+ if (isa<UndefValue>(Op1)) // fcmp pred X, undef -> undef
+ return ReplaceInstUsesWith(I, UndefValue::get(Type::Int1Ty));
+
+ // Handle fcmp with constant RHS
+ if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+ // If the constant is a nan, see if we can fold the comparison based on it.
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
+ if (CFP->getValueAPF().isNaN()) {
+ if (FCmpInst::isOrdered(I.getPredicate())) // True if ordered and...
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ assert(FCmpInst::isUnordered(I.getPredicate()) &&
+ "Comparison must be either ordered or unordered!");
+ // True if unordered.
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ }
+ }
+
+ if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+ switch (LHSI->getOpcode()) {
+ case Instruction::PHI:
+ // Only fold fcmp into the PHI if the phi and fcmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ break;
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ if (Instruction *NV = FoldFCmp_IntToFP_Cst(I, LHSI, RHSC))
+ return NV;
+ break;
+ case Instruction::Select:
+ // If either operand of the select is a constant, we can fold the
+ // comparison into the select arms, which will cause one to be
+ // constant folded and the select turned into a bitwise or.
+ Value *Op1 = 0, *Op2 = 0;
+ if (LHSI->hasOneUse()) {
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+ // Fold the known value into the constant operand.
+ Op1 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+ // Insert a new FCmp of the other select operand.
+ Op2 = InsertNewInstBefore(new FCmpInst(I.getPredicate(),
+ LHSI->getOperand(2), RHSC,
+ I.getName()), I);
+ } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+ // Fold the known value into the constant operand.
+ Op2 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+ // Insert a new FCmp of the other select operand.
+ Op1 = InsertNewInstBefore(new FCmpInst(I.getPredicate(),
+ LHSI->getOperand(1), RHSC,
+ I.getName()), I);
+ }
+ }
+
+ if (Op1)
+ return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+ break;
+ }
+ }
+
+ return Changed ? &I : 0;
+}
+
+Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
+ bool Changed = SimplifyCompare(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ const Type *Ty = Op0->getType();
+
+ // icmp X, X
+ if (Op0 == Op1)
+ return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty,
+ I.isTrueWhenEqual()));
+
+ if (isa<UndefValue>(Op1)) // X icmp undef -> undef
+ return ReplaceInstUsesWith(I, UndefValue::get(Type::Int1Ty));
+
+ // icmp <global/alloca*/null>, <global/alloca*/null> - Global/Stack value
+ // addresses never equal each other! We already know that Op0 != Op1.
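+  // For example (illustrative): "icmp eq i8* %stack_buf, @global_var" folds
+  // to false, given %stack_buf is an alloca and @global_var is a global.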
+ if ((isa<GlobalValue>(Op0) || isa<AllocaInst>(Op0) ||
+ isa<ConstantPointerNull>(Op0)) &&
+ (isa<GlobalValue>(Op1) || isa<AllocaInst>(Op1) ||
+ isa<ConstantPointerNull>(Op1)))
+ return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty,
+ !I.isTrueWhenEqual()));
+
+ // icmp's with boolean values can always be turned into bitwise operations
+ if (Ty == Type::Int1Ty) {
+ switch (I.getPredicate()) {
+ default: assert(0 && "Invalid icmp instruction!");
+ case ICmpInst::ICMP_EQ: { // icmp eq i1 A, B -> ~(A^B)
+ Instruction *Xor = BinaryOperator::CreateXor(Op0, Op1, I.getName()+"tmp");
+ InsertNewInstBefore(Xor, I);
+ return BinaryOperator::CreateNot(Xor);
+ }
+    case ICmpInst::ICMP_NE:                  // icmp ne i1 A, B -> A^B
+ return BinaryOperator::CreateXor(Op0, Op1);
+
+ case ICmpInst::ICMP_UGT:
+ std::swap(Op0, Op1); // Change icmp ugt -> icmp ult
+ // FALL THROUGH
+    case ICmpInst::ICMP_ULT: { // icmp ult i1 A, B -> ~A & B
+ Instruction *Not = BinaryOperator::CreateNot(Op0, I.getName()+"tmp");
+ InsertNewInstBefore(Not, I);
+ return BinaryOperator::CreateAnd(Not, Op1);
+ }
+ case ICmpInst::ICMP_SGT:
+ std::swap(Op0, Op1); // Change icmp sgt -> icmp slt
+ // FALL THROUGH
+ case ICmpInst::ICMP_SLT: { // icmp slt i1 A, B -> A & ~B
+ Instruction *Not = BinaryOperator::CreateNot(Op1, I.getName()+"tmp");
+ InsertNewInstBefore(Not, I);
+ return BinaryOperator::CreateAnd(Not, Op0);
+ }
+ case ICmpInst::ICMP_UGE:
+ std::swap(Op0, Op1); // Change icmp uge -> icmp ule
+ // FALL THROUGH
+ case ICmpInst::ICMP_ULE: { // icmp ule i1 A, B -> ~A | B
+ Instruction *Not = BinaryOperator::CreateNot(Op0, I.getName()+"tmp");
+ InsertNewInstBefore(Not, I);
+ return BinaryOperator::CreateOr(Not, Op1);
+ }
+ case ICmpInst::ICMP_SGE:
+ std::swap(Op0, Op1); // Change icmp sge -> icmp sle
+ // FALL THROUGH
+ case ICmpInst::ICMP_SLE: { // icmp sle i1 A, B -> A | ~B
+ Instruction *Not = BinaryOperator::CreateNot(Op1, I.getName()+"tmp");
+ InsertNewInstBefore(Not, I);
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+ }
+ }
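+  // For example (illustrative): "icmp ult i1 %a, %b" is true only for
+  // %a == 0, %b == 1, so it becomes "and (xor i1 %a, true), %b".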
+
+ unsigned BitWidth = 0;
+ if (TD)
+ BitWidth = TD->getTypeSizeInBits(Ty);
+ else if (isa<IntegerType>(Ty))
+ BitWidth = Ty->getPrimitiveSizeInBits();
+
+ bool isSignBit = false;
+
+ // See if we are doing a comparison with a constant.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ Value *A = 0, *B = 0;
+
+ // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B)
+ if (I.isEquality() && CI->isNullValue() &&
+ match(Op0, m_Sub(m_Value(A), m_Value(B)))) {
+ // (icmp cond A B) if cond is equality
+ return new ICmpInst(I.getPredicate(), A, B);
+ }
+
+ // If we have an icmp le or icmp ge instruction, turn it into the
+ // appropriate icmp lt or icmp gt instruction. This allows us to rely on
+ // them being folded in the code below.
+ switch (I.getPredicate()) {
+ default: break;
+ case ICmpInst::ICMP_ULE:
+ if (CI->isMaxValue(false)) // A <=u MAX -> TRUE
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ return new ICmpInst(ICmpInst::ICMP_ULT, Op0, AddOne(CI));
+ case ICmpInst::ICMP_SLE:
+ if (CI->isMaxValue(true)) // A <=s MAX -> TRUE
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ return new ICmpInst(ICmpInst::ICMP_SLT, Op0, AddOne(CI));
+ case ICmpInst::ICMP_UGE:
+ if (CI->isMinValue(false)) // A >=u MIN -> TRUE
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      return new ICmpInst(ICmpInst::ICMP_UGT, Op0, SubOne(CI));
+ case ICmpInst::ICMP_SGE:
+ if (CI->isMinValue(true)) // A >=s MIN -> TRUE
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ return new ICmpInst(ICmpInst::ICMP_SGT, Op0, SubOne(CI));
+ }
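+    // For example (illustrative): "icmp sle i32 %x, 41" becomes
+    // "icmp slt i32 %x, 42", so later code only has to handle lt/gt forms.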
+
+ // If this comparison is a normal comparison, it demands all
+ // bits, if it is a sign bit comparison, it only demands the sign bit.
+ bool UnusedBit;
+ isSignBit = isSignBitCheck(I.getPredicate(), CI, UnusedBit);
+ }
+
+ // See if we can fold the comparison based on range information we can get
+ // by checking whether bits are known to be zero or one in the input.
+ if (BitWidth != 0) {
+ APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
+ APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);
+
+ if (SimplifyDemandedBits(I.getOperandUse(0),
+ isSignBit ? APInt::getSignBit(BitWidth)
+ : APInt::getAllOnesValue(BitWidth),
+ Op0KnownZero, Op0KnownOne, 0))
+ return &I;
+ if (SimplifyDemandedBits(I.getOperandUse(1),
+ APInt::getAllOnesValue(BitWidth),
+ Op1KnownZero, Op1KnownOne, 0))
+ return &I;
+
+ // Given the known and unknown bits, compute a range that the LHS could be
+ // in. Compute the Min, Max and RHS values based on the known bits. For the
+ // EQ and NE we use unsigned values.
+ APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
+ APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
+ if (ICmpInst::isSignedPredicate(I.getPredicate())) {
+ ComputeSignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
+ Op0Min, Op0Max);
+ ComputeSignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
+ Op1Min, Op1Max);
+ } else {
+ ComputeUnsignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
+ Op0Min, Op0Max);
+ ComputeUnsignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
+ Op1Min, Op1Max);
+ }
+
+ // If Min and Max are known to be the same, then SimplifyDemandedBits
+ // figured out that the LHS is a constant. Just constant fold this now so
+ // that code below can assume that Min != Max.
+ if (!isa<Constant>(Op0) && Op0Min == Op0Max)
+ return new ICmpInst(I.getPredicate(), ConstantInt::get(Op0Min), Op1);
+ if (!isa<Constant>(Op1) && Op1Min == Op1Max)
+ return new ICmpInst(I.getPredicate(), Op0, ConstantInt::get(Op1Min));
+
+ // Based on the range information we know about the LHS, see if we can
+ // simplify this comparison. For example, (x&4) < 8 is always true.
+ switch (I.getPredicate()) {
+ default: assert(0 && "Unknown icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ break;
+ case ICmpInst::ICMP_NE:
+ if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ break;
+ case ICmpInst::ICMP_ULT:
+ if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ if (Op1Max == Op0Min+1) // A <u C -> A == C-1 if min(A)+1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, SubOne(CI));
+
+ // (x <u 2147483648) -> (x >s -1) -> true if sign bit clear
+ if (CI->isMinValue(true))
+ return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
+ ConstantInt::getAllOnesValue(Op0->getType()));
+ }
+ break;
+ case ICmpInst::ICMP_UGT:
+ if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Max.ule(Op1Min))          // A >u B -> false if max(A) <= min(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+
+ if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Min == Op0Max-1)        // A >u C -> A == C+1 if max(A)-1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, AddOne(CI));
+
+ // (x >u 2147483647) -> (x <s 0) -> true if sign bit set
+ if (CI->isMaxValue(true))
+ return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
+ ConstantInt::getNullValue(Op0->getType()));
+ }
+ break;
+ case ICmpInst::ICMP_SLT:
+      if (Op0Max.slt(Op1Min))          // A <s B -> true if max(A) < min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (Op0Min.sge(Op1Max))          // A <s B -> false if min(A) >= max(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ if (Op1Max == Op0Min+1) // A <s C -> A == C-1 if min(A)+1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, SubOne(CI));
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+
+ if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ if (Op1Min == Op0Max-1) // A >s C -> A == C+1 if max(A)-1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, AddOne(CI));
+ }
+ break;
+ case ICmpInst::ICMP_SGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
+ if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ break;
+ case ICmpInst::ICMP_SLE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
+ if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ break;
+ case ICmpInst::ICMP_UGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
+ if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ break;
+ case ICmpInst::ICMP_ULE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
+ if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ break;
+ }
+
+ // Turn a signed comparison into an unsigned one if both operands
+ // are known to have the same sign.
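+    // For example (illustrative): if the sign bits of %x and %y are both
+    // known zero, "icmp slt i32 %x, %y" becomes "icmp ult i32 %x, %y".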
+ if (I.isSignedPredicate() &&
+ ((Op0KnownZero.isNegative() && Op1KnownZero.isNegative()) ||
+ (Op0KnownOne.isNegative() && Op1KnownOne.isNegative())))
+ return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+ }
+
+ // Test if the ICmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (I.hasOneUse())
+ if (SelectInst *SI = dyn_cast<SelectInst>(*I.use_begin()))
+ if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
+ (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
+ return 0;
+
+ // See if we are doing a comparison between a constant and an instruction that
+ // can be folded into the comparison.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ // Since the RHS is a ConstantInt (CI), if the left hand side is an
+ // instruction, see if that instruction also has constants so that the
+ // instruction can be folded into the icmp
+ if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+ if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI))
+ return Res;
+ }
+
+ // Handle icmp with constant (but not simple integer constant) RHS
+ if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+ if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+ switch (LHSI->getOpcode()) {
+ case Instruction::GetElementPtr:
+ if (RHSC->isNullValue()) {
+ // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
+ bool isAllZeros = true;
+ for (unsigned i = 1, e = LHSI->getNumOperands(); i != e; ++i)
+ if (!isa<Constant>(LHSI->getOperand(i)) ||
+ !cast<Constant>(LHSI->getOperand(i))->isNullValue()) {
+ isAllZeros = false;
+ break;
+ }
+ if (isAllZeros)
+ return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ }
+ break;
+
+ case Instruction::PHI:
+        // Only fold icmp into the PHI if the phi and icmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ break;
+ case Instruction::Select: {
+ // If either operand of the select is a constant, we can fold the
+ // comparison into the select arms, which will cause one to be
+ // constant folded and the select turned into a bitwise or.
+ Value *Op1 = 0, *Op2 = 0;
+ if (LHSI->hasOneUse()) {
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+ // Fold the known value into the constant operand.
+ Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ // Insert a new ICmp of the other select operand.
+ Op2 = InsertNewInstBefore(new ICmpInst(I.getPredicate(),
+ LHSI->getOperand(2), RHSC,
+ I.getName()), I);
+ } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+ // Fold the known value into the constant operand.
+ Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ // Insert a new ICmp of the other select operand.
+ Op1 = InsertNewInstBefore(new ICmpInst(I.getPredicate(),
+ LHSI->getOperand(1), RHSC,
+ I.getName()), I);
+ }
+ }
+
+ if (Op1)
+ return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+ break;
+ }
+ case Instruction::Malloc:
+ // If we have (malloc != null), and if the malloc has a single use, we
+ // can assume it is successful and remove the malloc.
+ if (LHSI->hasOneUse() && isa<ConstantPointerNull>(RHSC)) {
+ AddToWorkList(LHSI);
+ return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty,
+ !I.isTrueWhenEqual()));
+ }
+ break;
+ }
+ }
+
+ // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
+ if (User *GEP = dyn_castGetElementPtr(Op0))
+ if (Instruction *NI = FoldGEPICmp(GEP, Op1, I.getPredicate(), I))
+ return NI;
+ if (User *GEP = dyn_castGetElementPtr(Op1))
+ if (Instruction *NI = FoldGEPICmp(GEP, Op0,
+ ICmpInst::getSwappedPredicate(I.getPredicate()), I))
+ return NI;
+
+ // Test to see if the operands of the icmp are casted versions of other
+ // values. If the ptr->ptr cast can be stripped off both arguments, we do so
+ // now.
+ if (BitCastInst *CI = dyn_cast<BitCastInst>(Op0)) {
+ if (isa<PointerType>(Op0->getType()) &&
+ (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
+ // We keep moving the cast from the left operand over to the right
+ // operand, where it can often be eliminated completely.
+ Op0 = CI->getOperand(0);
+
+ // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
+ // so eliminate it as well.
+ if (BitCastInst *CI2 = dyn_cast<BitCastInst>(Op1))
+ Op1 = CI2->getOperand(0);
+
+ // If Op1 is a constant, we can fold the cast into the constant.
+ if (Op0->getType() != Op1->getType()) {
+ if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+ Op1 = ConstantExpr::getBitCast(Op1C, Op0->getType());
+ } else {
+ // Otherwise, cast the RHS right before the icmp
+ Op1 = InsertBitCastBefore(Op1, Op0->getType(), I);
+ }
+ }
+ return new ICmpInst(I.getPredicate(), Op0, Op1);
+ }
+ }
+
+ if (isa<CastInst>(Op0)) {
+ // Handle the special case of: icmp (cast bool to X), <cst>
+ // This comes up when you have code like
+ // int X = A < B;
+ // if (X) ...
+ // For generality, we handle any zero-extension of any operand comparison
+ // with a constant or another cast from the same type.
+ if (isa<ConstantInt>(Op1) || isa<CastInst>(Op1))
+ if (Instruction *R = visitICmpInstWithCastAndCast(I))
+ return R;
+ }
+
+ // See if it's the same type of instruction on the left and right.
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+ if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) {
+ if (Op0I->getOpcode() == Op1I->getOpcode() && Op0I->hasOneUse() &&
+ Op1I->hasOneUse() && Op0I->getOperand(1) == Op1I->getOperand(1)) {
+ switch (Op0I->getOpcode()) {
+ default: break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Xor:
+ if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
+ return new ICmpInst(I.getPredicate(), Op0I->getOperand(0),
+ Op1I->getOperand(0));
+ // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+ if (CI->getValue().isSignBit()) {
+ ICmpInst::Predicate Pred = I.isSignedPredicate()
+ ? I.getUnsignedPredicate()
+ : I.getSignedPredicate();
+ return new ICmpInst(Pred, Op0I->getOperand(0),
+ Op1I->getOperand(0));
+ }
+
+ if (CI->getValue().isMaxSignedValue()) {
+ ICmpInst::Predicate Pred = I.isSignedPredicate()
+ ? I.getUnsignedPredicate()
+ : I.getSignedPredicate();
+ Pred = I.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, Op0I->getOperand(0),
+ Op1I->getOperand(0));
+ }
+ }
+ break;
+ case Instruction::Mul:
+ if (!I.isEquality())
+ break;
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+ // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
+ // Mask = -1 >> count-trailing-zeros(Cst).
+ if (!CI->isZero() && !CI->isOne()) {
+ const APInt &AP = CI->getValue();
+ ConstantInt *Mask = ConstantInt::get(
+ APInt::getLowBitsSet(AP.getBitWidth(),
+ AP.getBitWidth() -
+ AP.countTrailingZeros()));
+ Instruction *And1 = BinaryOperator::CreateAnd(Op0I->getOperand(0),
+ Mask);
+ Instruction *And2 = BinaryOperator::CreateAnd(Op1I->getOperand(0),
+ Mask);
+ InsertNewInstBefore(And1, I);
+ InsertNewInstBefore(And2, I);
+ return new ICmpInst(I.getPredicate(), And1, And2);
+ }
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ // ~x < ~y --> y < x
+ { Value *A, *B;
+ if (match(Op0, m_Not(m_Value(A))) &&
+ match(Op1, m_Not(m_Value(B))))
+ return new ICmpInst(I.getPredicate(), B, A);
+ }
+
+ if (I.isEquality()) {
+ Value *A, *B, *C, *D;
+
+ // -x == -y --> x == y
+ if (match(Op0, m_Neg(m_Value(A))) &&
+ match(Op1, m_Neg(m_Value(B))))
+ return new ICmpInst(I.getPredicate(), A, B);
+
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
+ Value *OtherVal = A == Op1 ? B : A;
+ return new ICmpInst(I.getPredicate(), OtherVal,
+ Constant::getNullValue(A->getType()));
+ }
+
+ if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
+ // A^c1 == C^c2 --> A == C^(c1^c2)
+ ConstantInt *C1, *C2;
+ if (match(B, m_ConstantInt(C1)) &&
+ match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) {
+ Constant *NC = ConstantInt::get(C1->getValue() ^ C2->getValue());
+ Instruction *Xor = BinaryOperator::CreateXor(C, NC, "tmp");
+ return new ICmpInst(I.getPredicate(), A,
+ InsertNewInstBefore(Xor, I));
+ }
+
+ // A^B == A^D -> B == D
+ if (A == C) return new ICmpInst(I.getPredicate(), B, D);
+ if (A == D) return new ICmpInst(I.getPredicate(), B, C);
+ if (B == C) return new ICmpInst(I.getPredicate(), A, D);
+ if (B == D) return new ICmpInst(I.getPredicate(), A, C);
+ }
+ }
+
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+ (A == Op0 || B == Op0)) {
+ // A == (A^B) -> B == 0
+ Value *OtherVal = A == Op0 ? B : A;
+ return new ICmpInst(I.getPredicate(), OtherVal,
+ Constant::getNullValue(A->getType()));
+ }
+
+ // (A-B) == A -> B == 0
+ if (match(Op0, m_Sub(m_Specific(Op1), m_Value(B))))
+ return new ICmpInst(I.getPredicate(), B,
+ Constant::getNullValue(B->getType()));
+
+ // A == (A-B) -> B == 0
+ if (match(Op1, m_Sub(m_Specific(Op0), m_Value(B))))
+ return new ICmpInst(I.getPredicate(), B,
+ Constant::getNullValue(B->getType()));
+
+ // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
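+    // For example (illustrative):
+    //   icmp eq (and i32 %x, %z), (and i32 %y, %z)
+    //     --> icmp eq (and (xor i32 %x, %y), %z), 0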
+ if (Op0->hasOneUse() && Op1->hasOneUse() &&
+ match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_And(m_Value(C), m_Value(D)))) {
+ Value *X = 0, *Y = 0, *Z = 0;
+
+ if (A == C) {
+ X = B; Y = D; Z = A;
+ } else if (A == D) {
+ X = B; Y = C; Z = A;
+ } else if (B == C) {
+ X = A; Y = D; Z = B;
+ } else if (B == D) {
+ X = A; Y = C; Z = B;
+ }
+
+ if (X) { // Build (X^Y) & Z
+ Op1 = InsertNewInstBefore(BinaryOperator::CreateXor(X, Y, "tmp"), I);
+ Op1 = InsertNewInstBefore(BinaryOperator::CreateAnd(Op1, Z, "tmp"), I);
+ I.setOperand(0, Op1);
+ I.setOperand(1, Constant::getNullValue(Op1->getType()));
+ return &I;
+ }
+ }
+ }
+ return Changed ? &I : 0;
+}
+
+
+/// FoldICmpDivCst - Fold "icmp pred ([su]div X, DivRHS), CmpRHS" where DivRHS
+/// and CmpRHS are both known to be integer constants.
+Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+ ConstantInt *DivRHS) {
+ ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1));
+ const APInt &CmpRHSV = CmpRHS->getValue();
+
+ // FIXME: If the operand types don't match the type of the divide
+ // then don't attempt this transform. The code below doesn't have the
+ // logic to deal with a signed divide and an unsigned compare (and
+ // vice versa). This is because (x /s C1) <s C2 produces different
+ // results than (x /s C1) <u C2 or (x /u C1) <s C2 or even
+ // (x /u C1) <u C2. Simply casting the operands and result won't
+ // work. :( The if statement below tests that condition and bails
+ // if it finds it.
+ bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv;
+ if (!ICI.isEquality() && DivIsSigned != ICI.isSignedPredicate())
+ return 0;
+ if (DivRHS->isZero())
+ return 0; // The ProdOV computation fails on divide by zero.
+ if (DivIsSigned && DivRHS->isAllOnesValue())
+ return 0; // The overflow computation also screws up here
+ if (DivRHS->isOne())
+ return 0; // Not worth bothering, and eliminates some funny cases
+ // with INT_MIN.
+
+ // Compute Prod = CI * DivRHS. We are essentially solving an equation
+ // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and
+ // C2 (CI). By solving for X we can turn this into a range check
+ // instead of computing a divide.
+ ConstantInt *Prod = Multiply(CmpRHS, DivRHS);
+
+ // Determine if the product overflows by seeing if the product is
+ // not equal to the divide. Make sure we do the same kind of divide
+ // as in the LHS instruction that we're folding.
+ bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) :
+ ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;
+
+ // Get the ICmp opcode
+ ICmpInst::Predicate Pred = ICI.getPredicate();
+
+ // Figure out the interval that is being checked. For example, a comparison
+ // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
+ // Compute this interval based on the constants involved and the signedness of
+ // the compare/divide. This computes a half-open interval, keeping track of
+  // whether either value in the interval overflows. After analysis, each
+  // overflow variable is set to 0 if its corresponding bound variable is
+  // valid, -1 if it overflowed off the bottom end, or +1 if it overflowed off
+  // the top end.
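+  // For example (illustrative), "X /u 5 == 3" gives Prod = 15 and the
+  // half-open interval [15, 20), so the equality becomes a range test of X
+  // against [15, 20).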
+ int LoOverflow = 0, HiOverflow = 0;
+ ConstantInt *LoBound = 0, *HiBound = 0;
+
+ if (!DivIsSigned) { // udiv
+ // e.g. X/5 op 3 --> [15, 20)
+ LoBound = Prod;
+ HiOverflow = LoOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = AddWithOverflow(HiBound, LoBound, DivRHS, false);
+ } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0.
+ if (CmpRHSV == 0) { // (X / pos) op 0
+ // Can't overflow. e.g. X/2 op 0 --> [-1, 2)
+ LoBound = cast<ConstantInt>(ConstantExpr::getNeg(SubOne(DivRHS)));
+ HiBound = DivRHS;
+ } else if (CmpRHSV.isStrictlyPositive()) { // (X / pos) op pos
+ LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
+ HiOverflow = LoOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = AddWithOverflow(HiBound, Prod, DivRHS, true);
+ } else { // (X / pos) op neg
+ // e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
+ HiBound = AddOne(Prod);
+ LoOverflow = HiOverflow = ProdOV ? -1 : 0;
+ if (!LoOverflow) {
+ ConstantInt* DivNeg = cast<ConstantInt>(ConstantExpr::getNeg(DivRHS));
+ LoOverflow = AddWithOverflow(LoBound, HiBound, DivNeg,
+ true) ? -1 : 0;
+ }
+ }
+ } else if (DivRHS->getValue().isNegative()) { // Divisor is < 0.
+ if (CmpRHSV == 0) { // (X / neg) op 0
+ // e.g. X/-5 op 0 --> [-4, 5)
+ LoBound = AddOne(DivRHS);
+ HiBound = cast<ConstantInt>(ConstantExpr::getNeg(DivRHS));
+ if (HiBound == DivRHS) { // -INTMIN = INTMIN
+ HiOverflow = 1; // [INTMIN+1, overflow)
+ HiBound = 0; // e.g. X/INTMIN = 0 --> X > INTMIN
+ }
+ } else if (CmpRHSV.isStrictlyPositive()) { // (X / neg) op pos
+ // e.g. X/-5 op 3 --> [-19, -14)
+ HiBound = AddOne(Prod);
+ HiOverflow = LoOverflow = ProdOV ? -1 : 0;
+ if (!LoOverflow)
+ LoOverflow = AddWithOverflow(LoBound, HiBound, DivRHS, true) ? -1 : 0;
+ } else { // (X / neg) op neg
+ LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
+ LoOverflow = HiOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = SubWithOverflow(HiBound, Prod, DivRHS, true);
+ }
+
+ // Dividing by a negative swaps the condition. LT <-> GT
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ Value *X = DivI->getOperand(0);
+ switch (Pred) {
+ default: assert(0 && "Unhandled icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ if (LoOverflow && HiOverflow)
+ return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+ else if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X, LoBound);
+ else if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X, HiBound);
+ else
+ return InsertRangeTest(X, LoBound, HiBound, DivIsSigned, true, ICI);
+ case ICmpInst::ICMP_NE:
+ if (LoOverflow && HiOverflow)
+ return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+ else if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X, LoBound);
+ else if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X, HiBound);
+ else
+ return InsertRangeTest(X, LoBound, HiBound, DivIsSigned, false, ICI);
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT:
+ if (LoOverflow == +1) // Low bound is greater than input range.
+ return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+ if (LoOverflow == -1) // Low bound is less than input range.
+ return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+ return new ICmpInst(Pred, X, LoBound);
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ if (HiOverflow == +1) // High bound greater than input range.
+ return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+ else if (HiOverflow == -1) // High bound less than input range.
+ return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+ if (Pred == ICmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
+ else
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
+ }
+}
+
+
+/// visitICmpInstWithInstAndIntCst - Handle "icmp (instr, intcst)".
+///
+Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
+ Instruction *LHSI,
+ ConstantInt *RHS) {
+ const APInt &RHSV = RHS->getValue();
+
+ switch (LHSI->getOpcode()) {
+ case Instruction::Trunc:
+ if (ICI.isEquality() && LHSI->hasOneUse()) {
+ // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
+ // of the high bits truncated out of x are known.
+ unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(),
+ SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits));
+ APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
+ ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne);
+
+ // If all the high bits are known, we can do this xform.
+ if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
+ // Pull in the high bits from known-ones set.
+ APInt NewRHS(RHS->getValue());
+ NewRHS.zext(SrcBits);
+ NewRHS |= KnownOne;
+ return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+ ConstantInt::get(NewRHS));
+ }
+ }
+ break;
+
+ case Instruction::Xor: // (icmp pred (xor X, XorCST), CI)
+ if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
+      // If this is a comparison that tests the signbit (X < 0) or (X > -1),
+ // fold the xor.
+ if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) ||
+ (ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) {
+ Value *CompareVal = LHSI->getOperand(0);
+
+ // If the sign bit of the XorCST is not set, there is no change to
+ // the operation, just stop using the Xor.
+ if (!XorCST->getValue().isNegative()) {
+ ICI.setOperand(0, CompareVal);
+ AddToWorkList(LHSI);
+ return &ICI;
+ }
+
+ // Was the old condition true if the operand is positive?
+ bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT;
+
+ // If so, the new one isn't.
+ isTrueIfPositive ^= true;
+
+ if (isTrueIfPositive)
+ return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal, SubOne(RHS));
+ else
+ return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal, AddOne(RHS));
+ }
+
+ if (LHSI->hasOneUse()) {
+ // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit))
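+        // For example (illustrative):
+        //   icmp ult (xor i32 %x, -2147483648), 10
+        //     --> icmp slt i32 %x, -2147483638
+        // since flipping the sign bit maps unsigned order onto signed order.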
+ if (!ICI.isEquality() && XorCST->getValue().isSignBit()) {
+ const APInt &SignBit = XorCST->getValue();
+ ICmpInst::Predicate Pred = ICI.isSignedPredicate()
+ ? ICI.getUnsignedPredicate()
+ : ICI.getSignedPredicate();
+ return new ICmpInst(Pred, LHSI->getOperand(0),
+ ConstantInt::get(RHSV ^ SignBit));
+ }
+
+ // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A)
+ if (!ICI.isEquality() && XorCST->getValue().isMaxSignedValue()) {
+ const APInt &NotSignBit = XorCST->getValue();
+ ICmpInst::Predicate Pred = ICI.isSignedPredicate()
+ ? ICI.getUnsignedPredicate()
+ : ICI.getSignedPredicate();
+ Pred = ICI.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, LHSI->getOperand(0),
+ ConstantInt::get(RHSV ^ NotSignBit));
+ }
+ }
+ }
+ break;
+ case Instruction::And: // (icmp pred (and X, AndCST), RHS)
+ if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) &&
+ LHSI->getOperand(0)->hasOneUse()) {
+ ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1));
+
+ // If the LHS is an AND of a truncating cast, we can widen the
+ // and/compare to be the input width without changing the value
+ // produced, eliminating a cast.
+ if (TruncInst *Cast = dyn_cast<TruncInst>(LHSI->getOperand(0))) {
+ // We can do this transformation if either the AND constant does not
+ // have its sign bit set or if it is an equality comparison.
+ // Extending a relational comparison when we're checking the sign
+ // bit would not work.
+ if (Cast->hasOneUse() &&
+ (ICI.isEquality() ||
+ (AndCST->getValue().isNonNegative() && RHSV.isNonNegative()))) {
+ uint32_t BitWidth =
+ cast<IntegerType>(Cast->getOperand(0)->getType())->getBitWidth();
+ APInt NewCST = AndCST->getValue();
+ NewCST.zext(BitWidth);
+ APInt NewCI = RHSV;
+ NewCI.zext(BitWidth);
+ Instruction *NewAnd =
+ BinaryOperator::CreateAnd(Cast->getOperand(0),
+ ConstantInt::get(NewCST),LHSI->getName());
+ InsertNewInstBefore(NewAnd, ICI);
+ return new ICmpInst(ICI.getPredicate(), NewAnd,
+ ConstantInt::get(NewCI));
+ }
+ }
+
+ // If this is: (X >> C1) & C2 != C3 (where any shift and any compare
+ // could exist), turn it into (X & (C2 << C1)) != (C3 << C1). This
+ // happens a LOT in code produced by the C front-end, for bitfield
+ // access.
+ BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0));
+ if (Shift && !Shift->isShift())
+ Shift = 0;
+
+ ConstantInt *ShAmt;
+ ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0;
+ const Type *Ty = Shift ? Shift->getType() : 0; // Type of the shift.
+ const Type *AndTy = AndCST->getType(); // Type of the and.
+
+      // We can fold this as long as we can't shift unknown bits
+      // into the mask. This can only happen with signed right
+      // shifts, as they sign-extend.
+ if (ShAmt) {
+ bool CanFold = Shift->isLogicalShift();
+ if (!CanFold) {
+ // To test for the bad case of the signed shr, see if any
+ // of the bits shifted in could be tested after the mask.
+ uint32_t TyBits = Ty->getPrimitiveSizeInBits();
+ int ShAmtVal = TyBits - ShAmt->getLimitedValue(TyBits);
+
+ uint32_t BitWidth = AndTy->getPrimitiveSizeInBits();
+ if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
+ AndCST->getValue()) == 0)
+ CanFold = true;
+ }
+
+ if (CanFold) {
+ Constant *NewCst;
+ if (Shift->getOpcode() == Instruction::Shl)
+ NewCst = ConstantExpr::getLShr(RHS, ShAmt);
+ else
+ NewCst = ConstantExpr::getShl(RHS, ShAmt);
+
+ // Check to see if we are shifting out any of the bits being
+ // compared.
+ if (ConstantExpr::get(Shift->getOpcode(), NewCst, ShAmt) != RHS) {
+ // If we shifted bits out, the fold is not going to work out.
+ // As a special case, check to see if this means that the
+ // result is always true or false now.
+ if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
+ return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+ if (ICI.getPredicate() == ICmpInst::ICMP_NE)
+ return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+ } else {
+ ICI.setOperand(1, NewCst);
+ Constant *NewAndCST;
+ if (Shift->getOpcode() == Instruction::Shl)
+ NewAndCST = ConstantExpr::getLShr(AndCST, ShAmt);
+ else
+ NewAndCST = ConstantExpr::getShl(AndCST, ShAmt);
+ LHSI->setOperand(1, NewAndCST);
+ LHSI->setOperand(0, Shift->getOperand(0));
+ AddToWorkList(Shift); // Shift is dead.
+ AddUsesToWorkList(ICI);
+ return &ICI;
+ }
+ }
+ }
+
+      // Turn ((X >> Y) & C) == 0  into  (X & (C << Y)) == 0.  The latter is
+ // preferable because it allows the C<<Y expression to be hoisted out
+ // of a loop if Y is invariant and X is not.
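+      // For example (illustrative):
+      //   icmp eq (and (lshr i32 %x, %y), 7), 0
+      //     --> icmp eq (and i32 %x, (shl i32 7, %y)), 0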
+ if (Shift && Shift->hasOneUse() && RHSV == 0 &&
+ ICI.isEquality() && !Shift->isArithmeticShift() &&
+ !isa<Constant>(Shift->getOperand(0))) {
+ // Compute C << Y.
+ Value *NS;
+ if (Shift->getOpcode() == Instruction::LShr) {
+ NS = BinaryOperator::CreateShl(AndCST,
+ Shift->getOperand(1), "tmp");
+ } else {
+ // Insert a logical shift.
+ NS = BinaryOperator::CreateLShr(AndCST,
+ Shift->getOperand(1), "tmp");
+ }
+ InsertNewInstBefore(cast<Instruction>(NS), ICI);
+
+ // Compute X & (C << Y).
+ Instruction *NewAnd =
+ BinaryOperator::CreateAnd(Shift->getOperand(0), NS, LHSI->getName());
+ InsertNewInstBefore(NewAnd, ICI);
+
+ ICI.setOperand(0, NewAnd);
+ return &ICI;
+ }
+ }
+ break;
+
+ case Instruction::Shl: { // (icmp pred (shl X, ShAmt), CI)
+ ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+ if (!ShAmt) break;
+
+ uint32_t TypeBits = RHSV.getBitWidth();
+
+ // Check that the shift amount is in range. If not, don't perform
+ // undefined shifts. When the shift is visited it will be
+ // simplified.
+ if (ShAmt->uge(TypeBits))
+ break;
+
+ if (ICI.isEquality()) {
+ // If we are comparing against bits always shifted out, the
+ // comparison cannot succeed.
+ Constant *Comp =
+ ConstantExpr::getShl(ConstantExpr::getLShr(RHS, ShAmt), ShAmt);
+ if (Comp != RHS) {// Comparing against a bit that we know is zero.
+ bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+ Constant *Cst = ConstantInt::get(Type::Int1Ty, IsICMP_NE);
+ return ReplaceInstUsesWith(ICI, Cst);
+ }
+
+ if (LHSI->hasOneUse()) {
+ // Otherwise strength reduce the shift into an and.
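+        // For example (illustrative): "icmp eq (shl i32 %x, 3), 40"
+        //   --> "icmp eq (and i32 %x, 536870911), 5".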
+ uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+ Constant *Mask =
+ ConstantInt::get(APInt::getLowBitsSet(TypeBits, TypeBits-ShAmtVal));
+
+ Instruction *AndI =
+ BinaryOperator::CreateAnd(LHSI->getOperand(0),
+ Mask, LHSI->getName()+".mask");
+ Value *And = InsertNewInstBefore(AndI, ICI);
+ return new ICmpInst(ICI.getPredicate(), And,
+ ConstantInt::get(RHSV.lshr(ShAmtVal)));
+ }
+ }
+
+ // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
+ bool TrueIfSigned = false;
+ if (LHSI->hasOneUse() &&
+ isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) {
+ // (X << 31) <s 0 --> (X&1) != 0
+ Constant *Mask = ConstantInt::get(APInt(TypeBits, 1) <<
+ (TypeBits-ShAmt->getZExtValue()-1));
+ Instruction *AndI =
+ BinaryOperator::CreateAnd(LHSI->getOperand(0),
+ Mask, LHSI->getName()+".mask");
+ Value *And = InsertNewInstBefore(AndI, ICI);
+
+ return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
+ And, Constant::getNullValue(And->getType()));
+ }
+ break;
+ }
+
+ case Instruction::LShr: // (icmp pred (shr X, ShAmt), CI)
+ case Instruction::AShr: {
+ // Only handle equality comparisons of shift-by-constant.
+ ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+ if (!ShAmt || !ICI.isEquality()) break;
+
+ // Check that the shift amount is in range. If not, don't perform
+ // undefined shifts. When the shift is visited it will be
+ // simplified.
+ uint32_t TypeBits = RHSV.getBitWidth();
+ if (ShAmt->uge(TypeBits))
+ break;
+
+ uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+
+ // If we are comparing against bits always shifted out, the
+ // comparison cannot succeed.
+ APInt Comp = RHSV << ShAmtVal;
+ if (LHSI->getOpcode() == Instruction::LShr)
+ Comp = Comp.lshr(ShAmtVal);
+ else
+ Comp = Comp.ashr(ShAmtVal);
+
+ if (Comp != RHSV) { // Comparing against a bit that we know is zero.
+ bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+ Constant *Cst = ConstantInt::get(Type::Int1Ty, IsICMP_NE);
+ return ReplaceInstUsesWith(ICI, Cst);
+ }
+
+ // Otherwise, check to see if the bits shifted out are known to be zero.
+ // If so, we can compare against the unshifted value:
+ // (X & 4) >> 1 == 2 --> (X & 4) == 4.
+ if (LHSI->hasOneUse() &&
+ MaskedValueIsZero(LHSI->getOperand(0),
+ APInt::getLowBitsSet(Comp.getBitWidth(), ShAmtVal))) {
+ return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+ ConstantExpr::getShl(RHS, ShAmt));
+ }
+
+ if (LHSI->hasOneUse()) {
+ // Otherwise strength reduce the shift into an and.
+ APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
+ Constant *Mask = ConstantInt::get(Val);
+
+ Instruction *AndI =
+ BinaryOperator::CreateAnd(LHSI->getOperand(0),
+ Mask, LHSI->getName()+".mask");
+ Value *And = InsertNewInstBefore(AndI, ICI);
+ return new ICmpInst(ICI.getPredicate(), And,
+ ConstantExpr::getShl(RHS, ShAmt));
+ }
+ break;
+ }
+
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ // Fold: icmp pred ([us]div X, C1), C2 -> range test
+ // Fold this div into the comparison, producing a range check.
+    // Determine, based on the divide type, what range is being
+    // checked.  If there is an overflow on the low or high side, remember
+    // it; otherwise compute the range [low, hi) bounding the new value.
+ // See: InsertRangeTest above for the kinds of replacements possible.
+ if (ConstantInt *DivRHS = dyn_cast<ConstantInt>(LHSI->getOperand(1)))
+ if (Instruction *R = FoldICmpDivCst(ICI, cast<BinaryOperator>(LHSI),
+ DivRHS))
+ return R;
+ break;
+
+ case Instruction::Add:
+    // Fold: icmp pred (add X, C1), C2
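+    // For example (illustrative): "icmp ult (add i32 %x, 5), 5" has the
+    // range [0, 5) - 5 = [-5, 0), whose upper bound is the minimum value,
+    // so it becomes "icmp uge i32 %x, -5".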
+
+ if (!ICI.isEquality()) {
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+ if (!LHSC) break;
+ const APInt &LHSV = LHSC->getValue();
+
+ ConstantRange CR = ICI.makeConstantRange(ICI.getPredicate(), RHSV)
+ .subtract(LHSV);
+
+ if (ICI.isSignedPredicate()) {
+ if (CR.getLower().isSignBit()) {
+ return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0),
+ ConstantInt::get(CR.getUpper()));
+ } else if (CR.getUpper().isSignBit()) {
+ return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0),
+ ConstantInt::get(CR.getLower()));
+ }
+ } else {
+ if (CR.getLower().isMinValue()) {
+ return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0),
+ ConstantInt::get(CR.getUpper()));
+ } else if (CR.getUpper().isMinValue()) {
+ return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0),
+ ConstantInt::get(CR.getLower()));
+ }
+ }
+ }
+ break;
+ }
+
+ // Simplify icmp_eq and icmp_ne instructions with integer constant RHS.
+ if (ICI.isEquality()) {
+ bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+
+ // If the first operand is (add|sub|and|or|xor|rem) with a constant, and
+ // the second operand is a constant, simplify a bit.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(LHSI)) {
+ switch (BO->getOpcode()) {
+ case Instruction::SRem:
+ // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
+        if (RHSV == 0 && isa<ConstantInt>(BO->getOperand(1)) &&
+            BO->hasOneUse()) {
+ const APInt &V = cast<ConstantInt>(BO->getOperand(1))->getValue();
+ if (V.sgt(APInt(V.getBitWidth(), 1)) && V.isPowerOf2()) {
+ Instruction *NewRem =
+ BinaryOperator::CreateURem(BO->getOperand(0), BO->getOperand(1),
+ BO->getName());
+ InsertNewInstBefore(NewRem, ICI);
+ return new ICmpInst(ICI.getPredicate(), NewRem,
+ Constant::getNullValue(BO->getType()));
+ }
+ }
+ break;
+ case Instruction::Add:
+ // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
+ if (ConstantInt *BOp1C = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ if (BO->hasOneUse())
+ return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+ Subtract(RHS, BOp1C));
+ } else if (RHSV == 0) {
+ // Replace ((add A, B) != 0) with (A != -B) if A or B is
+ // efficiently invertible, or if the add has just this one use.
+ Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
+
+ if (Value *NegVal = dyn_castNegVal(BOp1))
+ return new ICmpInst(ICI.getPredicate(), BOp0, NegVal);
+ else if (Value *NegVal = dyn_castNegVal(BOp0))
+ return new ICmpInst(ICI.getPredicate(), NegVal, BOp1);
+ else if (BO->hasOneUse()) {
+ Instruction *Neg = BinaryOperator::CreateNeg(BOp1);
+ InsertNewInstBefore(Neg, ICI);
+ Neg->takeName(BO);
+ return new ICmpInst(ICI.getPredicate(), BOp0, Neg);
+ }
+ }
+ break;
+ case Instruction::Xor:
+ // For the xor case, we can xor two constants together, eliminating
+ // the explicit xor.
+ if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1)))
+ return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+ ConstantExpr::getXor(RHS, BOC));
+
+ // FALLTHROUGH
+ case Instruction::Sub:
+ // Replace (([sub|xor] A, B) != 0) with (A != B)
+ if (RHSV == 0)
+ return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+ BO->getOperand(1));
+ break;
+
+ case Instruction::Or:
+ // If bits are being or'd in that are not present in the constant we
+ // are comparing against, then the comparison could never succeed!
+ if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) {
+ Constant *NotCI = ConstantExpr::getNot(RHS);
+ if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
+ return ReplaceInstUsesWith(ICI, ConstantInt::get(Type::Int1Ty,
+ isICMP_NE));
+ }
+ break;
+
+ case Instruction::And:
+ if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ // If bits are being compared against that are and'd out, then the
+ // comparison can never succeed!
+ if ((RHSV & ~BOC->getValue()) != 0)
+ return ReplaceInstUsesWith(ICI, ConstantInt::get(Type::Int1Ty,
+ isICMP_NE));
+
+ // If we have ((X & C) == C), turn it into ((X & C) != 0).
+ if (RHS == BOC && RHSV.isPowerOf2())
+ return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ :
+ ICmpInst::ICMP_NE, LHSI,
+ Constant::getNullValue(RHS->getType()));
+
+          // Replace ((and X, (1 << size(X)-1)) != 0) with (X s< 0).
+ if (BOC->getValue().isSignBit()) {
+ Value *X = BO->getOperand(0);
+ Constant *Zero = Constant::getNullValue(X->getType());
+ ICmpInst::Predicate pred = isICMP_NE ?
+ ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
+ return new ICmpInst(pred, X, Zero);
+ }
+
+ // ((X & ~7) == 0) --> X < 8
+ if (RHSV == 0 && isHighOnes(BOC)) {
+ Value *X = BO->getOperand(0);
+ Constant *NegX = ConstantExpr::getNeg(BOC);
+ ICmpInst::Predicate pred = isICMP_NE ?
+ ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ return new ICmpInst(pred, X, NegX);
+ }
+        }
+        break;
+      default: break;
+ }
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) {
+ // Handle icmp {eq|ne} <intrinsic>, intcst.
+ if (II->getIntrinsicID() == Intrinsic::bswap) {
+ AddToWorkList(II);
+ ICI.setOperand(0, II->getOperand(1));
+ ICI.setOperand(1, ConstantInt::get(RHSV.byteSwap()));
+ return &ICI;
+ }
+ }
+ }
+ return 0;
+}
+
+/// visitICmpInstWithCastAndCast - Handle icmp (cast x to y), (cast/cst).
+/// We only handle extending casts so far.
+///
+Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
+ const CastInst *LHSCI = cast<CastInst>(ICI.getOperand(0));
+ Value *LHSCIOp = LHSCI->getOperand(0);
+ const Type *SrcTy = LHSCIOp->getType();
+ const Type *DestTy = LHSCI->getType();
+ Value *RHSCIOp;
+
+ // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
+ // integer type is the same size as the pointer type.
+ if (LHSCI->getOpcode() == Instruction::PtrToInt &&
+ getTargetData().getPointerSizeInBits() ==
+ cast<IntegerType>(DestTy)->getBitWidth()) {
+ Value *RHSOp = 0;
+ if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
+ RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
+ } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) {
+ RHSOp = RHSC->getOperand(0);
+ // If the pointer types don't match, insert a bitcast.
+ if (LHSCIOp->getType() != RHSOp->getType())
+ RHSOp = InsertBitCastBefore(RHSOp, LHSCIOp->getType(), ICI);
+ }
+
+ if (RHSOp)
+ return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp);
+ }
+
+ // The code below only handles extension cast instructions, so far.
+ // Enforce this.
+ if (LHSCI->getOpcode() != Instruction::ZExt &&
+ LHSCI->getOpcode() != Instruction::SExt)
+ return 0;
+
+ bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt;
+ bool isSignedCmp = ICI.isSignedPredicate();
+
+ if (CastInst *CI = dyn_cast<CastInst>(ICI.getOperand(1))) {
+ // Not an extension from the same type?
+ RHSCIOp = CI->getOperand(0);
+ if (RHSCIOp->getType() != LHSCIOp->getType())
+ return 0;
+
+ // If the signedness of the two casts doesn't agree (i.e. one is a sext
+ // and the other is a zext), then we can't handle this.
+ if (CI->getOpcode() != LHSCI->getOpcode())
+ return 0;
+
+ // Deal with equality cases early.
+ if (ICI.isEquality())
+ return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+
+ // A signed comparison of sign extended values simplifies into a
+ // signed comparison.
+ if (isSignedCmp && isSignedExt)
+ return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+
+ // The other three cases all fold into an unsigned comparison.
+ return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, RHSCIOp);
+ }
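+  // For example, the cast-of-cast path above turns
+  //   icmp ult (zext i8 %a to i32), (zext i8 %b to i32)
+  // into (icmp ult i8 %a, %b).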
+
+ // If we aren't dealing with a constant on the RHS, exit early
+ ConstantInt *CI = dyn_cast<ConstantInt>(ICI.getOperand(1));
+ if (!CI)
+ return 0;
+
+ // Compute the constant that would happen if we truncated to SrcTy then
+ // reextended to DestTy.
+ Constant *Res1 = ConstantExpr::getTrunc(CI, SrcTy);
+ Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy);
+
+ // If the re-extended constant didn't change...
+ if (Res2 == CI) {
+ // Make sure that sign of the Cmp and the sign of the Cast are the same.
+ // For example, we might have:
+ // %A = sext short %X to uint
+ // %B = icmp ugt uint %A, 1330
+ // It is incorrect to transform this into
+ // %B = icmp ugt short %X, 1330
+  // because %A may have a negative value.
+ //
+ // However, we allow this when the compare is EQ/NE, because they are
+ // signless.
+ if (isSignedExt == isSignedCmp || ICI.isEquality())
+ return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1);
+ return 0;
+ }
+
+ // The re-extended constant changed so the constant cannot be represented
+ // in the shorter type. Consequently, we cannot emit a simple comparison.
+
+  // First, handle some easy cases. We know the result cannot be equal at this
+  // point, so handle the ICI.isEquality() cases first.
+ if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
+ return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+ if (ICI.getPredicate() == ICmpInst::ICMP_NE)
+ return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
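+  // For example, 300 does not round-trip through i8, so
+  // (icmp eq (zext i8 %x to i32), 300) folds directly to false.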
+
+  // Evaluate the comparison for LT (we invert for GT below). The LE and GE
+  // cases should have been folded away previously and should not reach here.
+ Value *Result;
+ if (isSignedCmp) {
+ // We're performing a signed comparison.
+ if (cast<ConstantInt>(CI)->getValue().isNegative())
+ Result = ConstantInt::getFalse(); // X < (small) --> false
+ else
+ Result = ConstantInt::getTrue(); // X < (large) --> true
+ } else {
+ // We're performing an unsigned comparison.
+ if (isSignedExt) {
+ // We're performing an unsigned comp with a sign extended value.
+ // This is true if the input is >= 0. [aka >s -1]
+ Constant *NegOne = ConstantInt::getAllOnesValue(SrcTy);
+ Result = InsertNewInstBefore(new ICmpInst(ICmpInst::ICMP_SGT, LHSCIOp,
+ NegOne, ICI.getName()), ICI);
+ } else {
+ // Unsigned extend & unsigned compare -> always true.
+ Result = ConstantInt::getTrue();
+ }
+ }
+
+ // Finally, return the value computed.
+ if (ICI.getPredicate() == ICmpInst::ICMP_ULT ||
+ ICI.getPredicate() == ICmpInst::ICMP_SLT)
+ return ReplaceInstUsesWith(ICI, Result);
+
+ assert((ICI.getPredicate()==ICmpInst::ICMP_UGT ||
+ ICI.getPredicate()==ICmpInst::ICMP_SGT) &&
+ "ICmp should be folded!");
+ if (Constant *CI = dyn_cast<Constant>(Result))
+ return ReplaceInstUsesWith(ICI, ConstantExpr::getNot(CI));
+ return BinaryOperator::CreateNot(Result);
+}
+
+Instruction *InstCombiner::visitShl(BinaryOperator &I) {
+ return commonShiftTransforms(I);
+}
+
+Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
+ return commonShiftTransforms(I);
+}
+
+Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
+ if (Instruction *R = commonShiftTransforms(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0);
+
+  // ashr int -1, X = -1 (for any arithmetic shift right of ~0)
+ if (ConstantInt *CSI = dyn_cast<ConstantInt>(Op0))
+ if (CSI->isAllOnesValue())
+ return ReplaceInstUsesWith(I, CSI);
+
+ // See if we can turn a signed shr into an unsigned shr.
+ if (!isa<VectorType>(I.getType())) {
+ if (MaskedValueIsZero(Op0,
+ APInt::getSignBit(I.getType()->getPrimitiveSizeInBits())))
+ return BinaryOperator::CreateLShr(Op0, I.getOperand(1));
+
+ // Arithmetic shifting an all-sign-bit value is a no-op.
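+    // e.g. %s = sext i1 %b to i32 has 32 sign bits, so (ashr %s, %n) is %s.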
+ unsigned NumSignBits = ComputeNumSignBits(Op0);
+ if (NumSignBits == Op0->getType()->getPrimitiveSizeInBits())
+ return ReplaceInstUsesWith(I, Op0);
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
+ assert(I.getOperand(1)->getType() == I.getOperand(0)->getType());
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // shl X, 0 == X and shr X, 0 == X
+ // shl 0, X == 0 and shr 0, X == 0
+ if (Op1 == Constant::getNullValue(Op1->getType()) ||
+ Op0 == Constant::getNullValue(Op0->getType()))
+ return ReplaceInstUsesWith(I, Op0);
+
+ if (isa<UndefValue>(Op0)) {
+ if (I.getOpcode() == Instruction::AShr) // undef >>s X -> undef
+ return ReplaceInstUsesWith(I, Op0);
+ else // undef << X -> 0, undef >>u X -> 0
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ }
+ if (isa<UndefValue>(Op1)) {
+ if (I.getOpcode() == Instruction::AShr) // X >>s undef -> X
+ return ReplaceInstUsesWith(I, Op0);
+ else // X << undef, X >>u undef -> 0
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ }
+
+ // See if we can fold away this shift.
+ if (!isa<VectorType>(I.getType()) && SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Try to fold constant and into select arguments.
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+
+ if (ConstantInt *CUI = dyn_cast<ConstantInt>(Op1))
+ if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
+ return Res;
+ return 0;
+}
+
+Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+ BinaryOperator &I) {
+ bool isLeftShift = I.getOpcode() == Instruction::Shl;
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ uint32_t TypeBits = Op0->getType()->getPrimitiveSizeInBits();
+
+  // shl uint X, 32 = 0 and shr ubyte Y, 9 = 0, ... just don't eliminate an
+  // ashr of a signed value; it is clamped to a shift by TypeBits-1 instead.
+ //
+ if (Op1->uge(TypeBits)) {
+ if (I.getOpcode() != Instruction::AShr)
+ return ReplaceInstUsesWith(I, Constant::getNullValue(Op0->getType()));
+ else {
+ I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1));
+ return &I;
+ }
+ }
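+  // e.g. on i32, (shl i32 %X, 34) and (lshr i32 %X, 34) fold to zero, while
+  // (ashr i32 %X, 34) becomes (ashr i32 %X, 31).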
+
+ // ((X*C1) << C2) == (X * (C1 << C2))
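+  // e.g. ((%X * 3) << 2) becomes (%X * 12).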
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
+ if (BO->getOpcode() == Instruction::Mul && isLeftShift)
+ if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
+ return BinaryOperator::CreateMul(BO->getOperand(0),
+ ConstantExpr::getShl(BOOp, Op1));
+
+ // Try to fold constant and into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+ return R;
+ if (isa<PHINode>(Op0))
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+
+ // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
+ if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) {
+ Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0));
+ // If 'shift2' is an ashr, we would have to get the sign bit into a funny
+ // place. Don't try to do this transformation in this case. Also, we
+ // require that the input operand is a shift-by-constant so that we have
+ // confidence that the shifts will get folded together. We could do this
+ // xform in more cases, but it is unlikely to be profitable.
+ if (TrOp && I.isLogicalShift() && TrOp->isShift() &&
+ isa<ConstantInt>(TrOp->getOperand(1))) {
+ // Okay, we'll do this xform. Make the shift of shift.
+ Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType());
+ Instruction *NSh = BinaryOperator::Create(I.getOpcode(), TrOp, ShAmt,
+ I.getName());
+ InsertNewInstBefore(NSh, I); // (shift2 (shift1 & 0x00FF), c2)
+
+ // For logical shifts, the truncation has the effect of making the high
+ // part of the register be zeros. Emulate this by inserting an AND to
+ // clear the top bits as needed. This 'and' will usually be zapped by
+ // other xforms later if dead.
+ unsigned SrcSize = TrOp->getType()->getPrimitiveSizeInBits();
+ unsigned DstSize = TI->getType()->getPrimitiveSizeInBits();
+ APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize));
+
+      // The mask we constructed says what the trunc would do if it occurred
+      // between the shifts. We want to know the effect *after* the second
+ // shift. We know that it is a logical shift by a constant, so adjust the
+ // mask as appropriate.
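+      // e.g. for an i32 -> i16 trunc with shift2 = (shl 8): MaskV starts as
+      // 0x0000FFFF and becomes 0x00FFFF00 after the adjustment below.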
+ if (I.getOpcode() == Instruction::Shl)
+ MaskV <<= Op1->getZExtValue();
+ else {
+ assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift");
+ MaskV = MaskV.lshr(Op1->getZExtValue());
+ }
+
+ Instruction *And = BinaryOperator::CreateAnd(NSh, ConstantInt::get(MaskV),
+ TI->getName());
+ InsertNewInstBefore(And, I); // shift1 & 0x00FF
+
+ // Return the value truncated to the interesting size.
+ return new TruncInst(And, I.getType());
+ }
+ }
+
+ if (Op0->hasOneUse()) {
+ if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
+ // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
+ Value *V1, *V2;
+ ConstantInt *CC;
+ switch (Op0BO->getOpcode()) {
+ default: break;
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // These operators commute.
+ // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C)
+ if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
+ match(Op0BO->getOperand(1), m_Shr(m_Value(V1), m_Specific(Op1)))){
+ Instruction *YS = BinaryOperator::CreateShl(
+ Op0BO->getOperand(0), Op1,
+ Op0BO->getName());
+ InsertNewInstBefore(YS, I); // (Y << C)
+ Instruction *X =
+ BinaryOperator::Create(Op0BO->getOpcode(), YS, V1,
+ Op0BO->getOperand(1)->getName());
+ InsertNewInstBefore(X, I); // (X + (Y << C))
+ uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(
+ APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
+ }
+
+ // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C))
+ Value *Op0BOOp1 = Op0BO->getOperand(1);
+ if (isLeftShift && Op0BOOp1->hasOneUse() &&
+ match(Op0BOOp1,
+ m_And(m_Shr(m_Value(V1), m_Specific(Op1)),
+ m_ConstantInt(CC))) &&
+ cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse()) {
+ Instruction *YS = BinaryOperator::CreateShl(
+ Op0BO->getOperand(0), Op1,
+ Op0BO->getName());
+ InsertNewInstBefore(YS, I); // (Y << C)
+ Instruction *XM =
+ BinaryOperator::CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+ V1->getName()+".mask");
+ InsertNewInstBefore(XM, I); // X & (CC << C)
+
+ return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
+ }
+ }
+
+ // FALL THROUGH.
+ case Instruction::Sub: {
+ // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
+ if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+ match(Op0BO->getOperand(0), m_Shr(m_Value(V1), m_Specific(Op1)))){
+ Instruction *YS = BinaryOperator::CreateShl(
+ Op0BO->getOperand(1), Op1,
+ Op0BO->getName());
+ InsertNewInstBefore(YS, I); // (Y << C)
+ Instruction *X =
+ BinaryOperator::Create(Op0BO->getOpcode(), V1, YS,
+ Op0BO->getOperand(0)->getName());
+ InsertNewInstBefore(X, I); // (X + (Y << C))
+ uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(
+ APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
+ }
+
+ // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C)
+ if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+ match(Op0BO->getOperand(0),
+ m_And(m_Shr(m_Value(V1), m_Value(V2)),
+ m_ConstantInt(CC))) && V2 == Op1 &&
+ cast<BinaryOperator>(Op0BO->getOperand(0))
+ ->getOperand(0)->hasOneUse()) {
+ Instruction *YS = BinaryOperator::CreateShl(
+ Op0BO->getOperand(1), Op1,
+ Op0BO->getName());
+ InsertNewInstBefore(YS, I); // (Y << C)
+ Instruction *XM =
+ BinaryOperator::CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+ V1->getName()+".mask");
+ InsertNewInstBefore(XM, I); // X & (CC << C)
+
+ return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
+ }
+
+ break;
+ }
+ }
+
+
+      // If the operand is a bitwise operator with a constant RHS, and the
+ // shift is the only use, we can pull it out of the shift.
+ if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) {
+ bool isValid = true; // Valid only for And, Or, Xor
+ bool highBitSet = false; // Transform if high bit of constant set?
+
+ switch (Op0BO->getOpcode()) {
+ default: isValid = false; break; // Do not perform transform!
+ case Instruction::Add:
+ isValid = isLeftShift;
+ break;
+ case Instruction::Or:
+ case Instruction::Xor:
+ highBitSet = false;
+ break;
+ case Instruction::And:
+ highBitSet = true;
+ break;
+ }
+
+ // If this is a signed shift right, and the high bit is modified
+ // by the logical operation, do not perform the transformation.
+ // The highBitSet boolean indicates the value of the high bit of
+ // the constant which would cause it to be modified for this
+ // operation.
+ //
+ if (isValid && I.getOpcode() == Instruction::AShr)
+ isValid = Op0C->getValue()[TypeBits-1] == highBitSet;
+
+ if (isValid) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
+
+ Instruction *NewShift =
+ BinaryOperator::Create(I.getOpcode(), Op0BO->getOperand(0), Op1);
+ InsertNewInstBefore(NewShift, I);
+ NewShift->takeName(Op0BO);
+
+ return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
+ NewRHS);
+ }
+ }
+ }
+ }
+
+ // Find out if this is a shift of a shift by a constant.
+ BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
+ if (ShiftOp && !ShiftOp->isShift())
+ ShiftOp = 0;
+
+ if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
+ ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
+ uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
+ uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits);
+ assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
+ if (ShiftAmt1 == 0) return 0; // Will be simplified in the future.
+ Value *X = ShiftOp->getOperand(0);
+
+ uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift.
+
+ const IntegerType *Ty = cast<IntegerType>(I.getType());
+
+ // Check for (X << c1) << c2 and (X >> c1) >> c2
+ if (I.getOpcode() == ShiftOp->getOpcode()) {
+      // If this is an oversized composite shift, then unsigned shifts become
+      // zero and ashr saturates.
+ if (AmtSum >= TypeBits) {
+ if (I.getOpcode() != Instruction::AShr)
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ AmtSum = TypeBits-1; // Saturate to 31 for i32 ashr.
+ }
+
+ return BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, AmtSum));
+ } else if (ShiftOp->getOpcode() == Instruction::LShr &&
+ I.getOpcode() == Instruction::AShr) {
+ if (AmtSum >= TypeBits)
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ // ((X >>u C1) >>s C2) -> (X >>u (C1+C2)) since C1 != 0.
+ return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
+ } else if (ShiftOp->getOpcode() == Instruction::AShr &&
+ I.getOpcode() == Instruction::LShr) {
+ // ((X >>s C1) >>u C2) -> ((X >>s (C1+C2)) & mask) since C1 != 0.
+ if (AmtSum >= TypeBits)
+ AmtSum = TypeBits-1;
+
+ Instruction *Shift =
+ BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
+ InsertNewInstBefore(Shift, I);
+
+ APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+ return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+ }
+
+ // Okay, if we get here, one shift must be left, and the other shift must be
+ // right. See if the amounts are equal.
+ if (ShiftAmt1 == ShiftAmt2) {
+ // If we have ((X >>? C) << C), turn this into X & (-1 << C).
+ if (I.getOpcode() == Instruction::Shl) {
+ APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Mask));
+ }
+ // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
+ if (I.getOpcode() == Instruction::LShr) {
+ APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Mask));
+ }
+ // We can simplify ((X << C) >>s C) into a trunc + sext.
+ // NOTE: we could do this for any C, but that would make 'unusual' integer
+ // types. For now, just stick to ones well-supported by the code
+ // generators.
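+      // e.g. on i32 with ShiftAmt1 = 24:
+      //   ((X << 24) >>s 24) --> sext (trunc i32 X to i8) to i32.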
+ const Type *SExtType = 0;
+ switch (Ty->getBitWidth() - ShiftAmt1) {
+ case 1 :
+ case 8 :
+ case 16 :
+ case 32 :
+ case 64 :
+ case 128:
+ SExtType = IntegerType::get(Ty->getBitWidth() - ShiftAmt1);
+ break;
+ default: break;
+ }
+ if (SExtType) {
+ Instruction *NewTrunc = new TruncInst(X, SExtType, "sext");
+ InsertNewInstBefore(NewTrunc, I);
+ return new SExtInst(NewTrunc, Ty);
+ }
+ // Otherwise, we can't handle it yet.
+ } else if (ShiftAmt1 < ShiftAmt2) {
+ uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1;
+
+ // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2)
+ if (I.getOpcode() == Instruction::Shl) {
+ assert(ShiftOp->getOpcode() == Instruction::LShr ||
+ ShiftOp->getOpcode() == Instruction::AShr);
+ Instruction *Shift =
+ BinaryOperator::CreateShl(X, ConstantInt::get(Ty, ShiftDiff));
+ InsertNewInstBefore(Shift, I);
+
+ APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
+ return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+ }
+
+ // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2)
+ if (I.getOpcode() == Instruction::LShr) {
+ assert(ShiftOp->getOpcode() == Instruction::Shl);
+ Instruction *Shift =
+ BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, ShiftDiff));
+ InsertNewInstBefore(Shift, I);
+
+ APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+ return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+ }
+
+ // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in.
+ } else {
+ assert(ShiftAmt2 < ShiftAmt1);
+ uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2;
+
+ // (X >>? C1) << C2 --> X >>? (C1-C2) & (-1 << C2)
+ if (I.getOpcode() == Instruction::Shl) {
+ assert(ShiftOp->getOpcode() == Instruction::LShr ||
+ ShiftOp->getOpcode() == Instruction::AShr);
+ Instruction *Shift =
+ BinaryOperator::Create(ShiftOp->getOpcode(), X,
+ ConstantInt::get(Ty, ShiftDiff));
+ InsertNewInstBefore(Shift, I);
+
+ APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
+ return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+ }
+
+ // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2)
+ if (I.getOpcode() == Instruction::LShr) {
+ assert(ShiftOp->getOpcode() == Instruction::Shl);
+ Instruction *Shift =
+ BinaryOperator::CreateShl(X, ConstantInt::get(Ty, ShiftDiff));
+ InsertNewInstBefore(Shift, I);
+
+ APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+ return BinaryOperator::CreateAnd(Shift, ConstantInt::get(Mask));
+ }
+
+      // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in.
+ }
+ }
+ return 0;
+}
+
+
+/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear
+/// expression. If so, decompose it, returning some value X, such that Val is
+/// X*Scale+Offset.
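+/// For example, given (add (shl %X, 2), 12) this returns %X with Scale = 4
+/// and Offset = 12.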
+///
+static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
+ int &Offset) {
+ assert(Val->getType() == Type::Int32Ty && "Unexpected allocation size type!");
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ Offset = CI->getZExtValue();
+ Scale = 0;
+ return ConstantInt::get(Type::Int32Ty, 0);
+ } else if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (I->getOpcode() == Instruction::Shl) {
+ // This is a value scaled by '1 << the shift amt'.
+ Scale = 1U << RHS->getZExtValue();
+ Offset = 0;
+ return I->getOperand(0);
+ } else if (I->getOpcode() == Instruction::Mul) {
+ // This value is scaled by 'RHS'.
+ Scale = RHS->getZExtValue();
+ Offset = 0;
+ return I->getOperand(0);
+ } else if (I->getOpcode() == Instruction::Add) {
+ // We have X+C. Check to see if we really have (X*C2)+C1,
+ // where C1 is divisible by C2.
+ unsigned SubScale;
+ Value *SubVal =
+ DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
+ Offset += RHS->getZExtValue();
+ Scale = SubScale;
+ return SubVal;
+ }
+ }
+ }
+
+ // Otherwise, we can't look past this.
+ Scale = 1;
+ Offset = 0;
+ return Val;
+}
+
+
+/// PromoteCastOfAllocation - If we find a cast of an allocation instruction,
+/// try to eliminate the cast by moving the type information into the alloc.
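+/// For example (a sketch): a bitcast of (alloca i8, i32 (mul i32 %n, 2)) to
+/// i16* can become (alloca i16, i32 %n), eliminating the cast.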
+Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
+ AllocationInst &AI) {
+ const PointerType *PTy = cast<PointerType>(CI.getType());
+
+ // Remove any uses of AI that are dead.
+ assert(!CI.use_empty() && "Dead instructions should be removed earlier!");
+
+ for (Value::use_iterator UI = AI.use_begin(), E = AI.use_end(); UI != E; ) {
+ Instruction *User = cast<Instruction>(*UI++);
+ if (isInstructionTriviallyDead(User)) {
+ while (UI != E && *UI == User)
+ ++UI; // If this instruction uses AI more than once, don't break UI.
+
+ ++NumDeadInst;
+ DOUT << "IC: DCE: " << *User;
+ EraseInstFromFunction(*User);
+ }
+ }
+
+ // Get the type really allocated and the type casted to.
+ const Type *AllocElTy = AI.getAllocatedType();
+ const Type *CastElTy = PTy->getElementType();
+ if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0;
+
+ unsigned AllocElTyAlign = TD->getABITypeAlignment(AllocElTy);
+ unsigned CastElTyAlign = TD->getABITypeAlignment(CastElTy);
+ if (CastElTyAlign < AllocElTyAlign) return 0;
+
+ // If the allocation has multiple uses, only promote it if we are strictly
+ // increasing the alignment of the resultant allocation. If we keep it the
+ // same, we open the door to infinite loops of various kinds. (A reference
+ // from a dbg.declare doesn't count as a use for this purpose.)
+ if (!AI.hasOneUse() && !hasOneUsePlusDeclare(&AI) &&
+ CastElTyAlign == AllocElTyAlign) return 0;
+
+ uint64_t AllocElTySize = TD->getTypeAllocSize(AllocElTy);
+ uint64_t CastElTySize = TD->getTypeAllocSize(CastElTy);
+ if (CastElTySize == 0 || AllocElTySize == 0) return 0;
+
+ // See if we can satisfy the modulus by pulling a scale out of the array
+ // size argument.
+ unsigned ArraySizeScale;
+ int ArrayOffset;
+ Value *NumElements = // See if the array size is a decomposable linear expr.
+ DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
+
+  // If we can now satisfy the modulus by using a non-1 scale, we really can
+ // do the xform.
+ if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
+ (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return 0;
+
+ unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
+ Value *Amt = 0;
+ if (Scale == 1) {
+ Amt = NumElements;
+ } else {
+ // If the allocation size is constant, form a constant mul expression
+ Amt = ConstantInt::get(Type::Int32Ty, Scale);
+ if (isa<ConstantInt>(NumElements))
+ Amt = Multiply(cast<ConstantInt>(NumElements), cast<ConstantInt>(Amt));
+ // otherwise multiply the amount and the number of elements
+ else {
+ Instruction *Tmp = BinaryOperator::CreateMul(Amt, NumElements, "tmp");
+ Amt = InsertNewInstBefore(Tmp, AI);
+ }
+ }
+
+ if (int Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+ Value *Off = ConstantInt::get(Type::Int32Ty, Offset, true);
+ Instruction *Tmp = BinaryOperator::CreateAdd(Amt, Off, "tmp");
+ Amt = InsertNewInstBefore(Tmp, AI);
+ }
+
+ AllocationInst *New;
+ if (isa<MallocInst>(AI))
+ New = new MallocInst(CastElTy, Amt, AI.getAlignment());
+ else
+ New = new AllocaInst(CastElTy, Amt, AI.getAlignment());
+ InsertNewInstBefore(New, AI);
+ New->takeName(&AI);
+
+ // If the allocation has one real use plus a dbg.declare, just remove the
+ // declare.
+ if (DbgDeclareInst *DI = hasOneUsePlusDeclare(&AI)) {
+ EraseInstFromFunction(*DI);
+ }
+ // If the allocation has multiple real uses, insert a cast and change all
+ // things that used it to use the new cast. This will also hack on CI, but it
+ // will die soon.
+ else if (!AI.hasOneUse()) {
+ AddUsesToWorkList(AI);
+ // New is the allocation instruction, pointer typed. AI is the original
+ // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
+ CastInst *NewCast = new BitCastInst(New, AI.getType(), "tmpcast");
+ InsertNewInstBefore(NewCast, AI);
+ AI.replaceAllUsesWith(NewCast);
+ }
+ return ReplaceInstUsesWith(CI, New);
+}
+
+/// CanEvaluateInDifferentType - Return true if we can take the specified value
+/// and return it as type Ty without inserting any new casts and without
+/// changing the computed value. This is used by code that tries to decide
+/// whether promoting or shrinking integer operations to wider or smaller types
+/// will allow us to eliminate a truncate or extend.
+///
+/// This is a truncation operation if Ty is smaller than V->getType(), or an
+/// extension operation if Ty is larger.
+///
+/// If CastOpc is a truncation, then Ty will be a type smaller than V. We
+/// should return true if trunc(V) can be computed by computing V in the smaller
+/// type. If V is an instruction, then trunc(inst(x,y)) can be computed as
+/// inst(trunc(x),trunc(y)), which only makes sense if x and y can be
+/// efficiently truncated.
+///
+/// If CastOpc is a sext or zext, we are asking if the low bits of the value
+/// can be computed in a larger type, which is then and'd or sext_in_reg'd to
+/// get the final result.
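+///
+/// For example, (trunc (and (zext i8 %a to i32), 15) to i8) can be evaluated
+/// as (and i8 %a, 15), removing both casts.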
+bool InstCombiner::CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
+ unsigned CastOpc,
+ int &NumCastsRemoved){
+ // We can always evaluate constants in another type.
+ if (isa<ConstantInt>(V))
+ return true;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ const IntegerType *OrigTy = cast<IntegerType>(V->getType());
+
+ // If this is an extension or truncate, we can often eliminate it.
+ if (isa<TruncInst>(I) || isa<ZExtInst>(I) || isa<SExtInst>(I)) {
+ // If this is a cast from the destination type, we can trivially eliminate
+ // it, and this will remove a cast overall.
+ if (I->getOperand(0)->getType() == Ty) {
+ // If the first operand is itself a cast, and is eliminable, do not count
+ // this as an eliminable cast. We would prefer to eliminate those two
+ // casts first.
+ if (!isa<CastInst>(I->getOperand(0)) && I->hasOneUse())
+ ++NumCastsRemoved;
+ return true;
+ }
+ }
+
+ // We can't extend or shrink something that has multiple uses: doing so would
+ // require duplicating the instruction in general, which isn't profitable.
+ if (!I->hasOneUse()) return false;
+
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // These operators can all arbitrarily be extended or truncated.
+ return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
+ NumCastsRemoved) &&
+ CanEvaluateInDifferentType(I->getOperand(1), Ty, CastOpc,
+ NumCastsRemoved);
+
+ case Instruction::Shl:
+ // If we are truncating the result of this SHL, and if it's a shift of a
+ // constant amount, we can always perform a SHL in a smaller type.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint32_t BitWidth = Ty->getBitWidth();
+ if (BitWidth < OrigTy->getBitWidth() &&
+ CI->getLimitedValue(BitWidth) < BitWidth)
+ return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
+ NumCastsRemoved);
+ }
+ break;
+ case Instruction::LShr:
+ // If this is a truncate of a logical shr, we can truncate it to a smaller
+ // lshr iff we know that the bits we would otherwise be shifting in are
+ // already zeros.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint32_t OrigBitWidth = OrigTy->getBitWidth();
+ uint32_t BitWidth = Ty->getBitWidth();
+ if (BitWidth < OrigBitWidth &&
+ MaskedValueIsZero(I->getOperand(0),
+ APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+ CI->getLimitedValue(BitWidth) < BitWidth) {
+ return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
+ NumCastsRemoved);
+ }
+ }
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Trunc:
+ // If this is the same kind of case as our original (e.g. zext+zext), we
+ // can safely replace it. Note that replacing it does not reduce the number
+ // of casts in the input.
+ if (Opc == CastOpc)
+ return true;
+
+ // sext (zext ty1), ty2 -> zext ty2
+ if (CastOpc == Instruction::SExt && Opc == Instruction::ZExt)
+ return true;
+ break;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ return CanEvaluateInDifferentType(SI->getTrueValue(), Ty, CastOpc,
+ NumCastsRemoved) &&
+ CanEvaluateInDifferentType(SI->getFalseValue(), Ty, CastOpc,
+ NumCastsRemoved);
+ }
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands.
+ PHINode *PN = cast<PHINode>(I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!CanEvaluateInDifferentType(PN->getIncomingValue(i), Ty, CastOpc,
+ NumCastsRemoved))
+ return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ break;
+ }
+
+ return false;
+}
+
+/// EvaluateInDifferentType - Given an expression that
+/// CanEvaluateInDifferentType returns true for, actually insert the code to
+/// evaluate the expression.
+Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty,
+ bool isSigned) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ return ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
+
+ // Otherwise, it must be an instruction.
+ Instruction *I = cast<Instruction>(V);
+ Instruction *Res = 0;
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl: {
+ Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
+ Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+ Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // If the source type of the cast is the type we're trying for then we can
+ // just return the source. There's no need to insert it because it is not
+ // new.
+ if (I->getOperand(0)->getType() == Ty)
+ return I->getOperand(0);
+
+ // Otherwise, must be the same type of cast, so just reinsert a new one.
+ Res = CastInst::Create(cast<CastInst>(I)->getOpcode(), I->getOperand(0),
+ Ty);
+ break;
+ case Instruction::Select: {
+ Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+ Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
+ Res = SelectInst::Create(I->getOperand(0), True, False);
+ break;
+ }
+ case Instruction::PHI: {
+ PHINode *OPN = cast<PHINode>(I);
+ PHINode *NPN = PHINode::Create(Ty);
+ for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
+ Value *V =EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
+ NPN->addIncoming(V, OPN->getIncomingBlock(i));
+ }
+ Res = NPN;
+ break;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ assert(0 && "Unreachable!");
+ break;
+ }
+
+ Res->takeName(I);
+ return InsertNewInstBefore(Res, *I);
+}
+
+/// @brief Implement the transforms common to all CastInst visitors.
+Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
+ Value *Src = CI.getOperand(0);
+
+ // Many cases of "cast of a cast" are eliminable. If it's eliminable we just
+ // eliminate it now.
+ if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
+ if (Instruction::CastOps opc =
+ isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) {
+ // The first cast (CSrc) is eliminable so we need to fix up or replace
+ // the second cast (CI). CSrc will then have a good chance of being dead.
+ return CastInst::Create(opc, CSrc->getOperand(0), CI.getType());
+ }
+ }
+
+ // If we are casting a select then fold the cast into the select
+ if (SelectInst *SI = dyn_cast<SelectInst>(Src))
+ if (Instruction *NV = FoldOpIntoSelect(CI, SI, this))
+ return NV;
+
+ // If we are casting a PHI then fold the cast into the PHI
+ if (isa<PHINode>(Src))
+ if (Instruction *NV = FoldOpIntoPhi(CI))
+ return NV;
+
+ return 0;
+}
+
+/// FindElementAtOffset - Given a type and a constant offset, determine whether
+/// or not there is a sequence of GEP indices into the type that will land us at
+/// the specified offset. If so, fill them into NewIndices and return the
+/// resultant element type, otherwise return null.
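+/// For example, for Ty = {i32, [4 x i8]} and Offset = 6, this fills in the
+/// indices [0, 1, 2] and returns i8: byte 6 is element 2 of the array field.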
+static const Type *FindElementAtOffset(const Type *Ty, int64_t Offset,
+ SmallVectorImpl<Value*> &NewIndices,
+ const TargetData *TD) {
+ if (!Ty->isSized()) return 0;
+
+ // Start with the index over the outer type. Note that the type size
+ // might be zero (even if the offset isn't zero) if the indexed type
+ // is something like [0 x {int, int}]
+ const Type *IntPtrTy = TD->getIntPtrType();
+ int64_t FirstIdx = 0;
+ if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
+ FirstIdx = Offset/TySize;
+ Offset -= FirstIdx*TySize;
+
+ // Handle hosts where % returns negative instead of values [0..TySize).
+ if (Offset < 0) {
+ --FirstIdx;
+ Offset += TySize;
+ assert(Offset >= 0);
+ }
+ assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
+ }
+
+ NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
+
+ // Index into the types. If we fail, set OrigBase to null.
+ while (Offset) {
+ // Indexing into tail padding between struct/array elements.
+ if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty))
+ return 0;
+
+ if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = TD->getStructLayout(STy);
+ assert(Offset < (int64_t)SL->getSizeInBytes() &&
+ "Offset must stay within the indexed type");
+
+ unsigned Elt = SL->getElementContainingOffset(Offset);
+ NewIndices.push_back(ConstantInt::get(Type::Int32Ty, Elt));
+
+ Offset -= SL->getElementOffset(Elt);
+ Ty = STy->getElementType(Elt);
+ } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ uint64_t EltSize = TD->getTypeAllocSize(AT->getElementType());
+ assert(EltSize && "Cannot index into a zero-sized array");
+ NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
+ Offset %= EltSize;
+ Ty = AT->getElementType();
+ } else {
+ // Otherwise, we can't index into the middle of this atomic type, bail.
+ return 0;
+ }
+ }
+
+ return Ty;
+}
+
+/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
+Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
+ Value *Src = CI.getOperand(0);
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
+ // If casting the result of a getelementptr instruction with no offset, turn
+ // this into a cast of the original pointer!
+ if (GEP->hasAllZeroIndices()) {
+ // Changing the cast operand is usually not a good idea but it is safe
+ // here because the pointer operand is being replaced with another
+ // pointer operand so the opcode doesn't need to change.
+ AddToWorkList(GEP);
+ CI.setOperand(0, GEP->getOperand(0));
+ return &CI;
+ }
+
+ // If the GEP has a single use, and the base pointer is a bitcast, and the
+ // GEP computes a constant offset, see if we can convert these three
+ // instructions into fewer. This typically happens with unions and other
+ // non-type-safe code.
+ if (GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0))) {
+ if (GEP->hasAllConstantIndices()) {
+ // We are guaranteed to get a constant from EmitGEPOffset.
+ ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(GEP, CI, *this));
+ int64_t Offset = OffsetV->getSExtValue();
+
+ // Get the base pointer input of the bitcast, and the type it points to.
+ Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
+ const Type *GEPIdxTy =
+ cast<PointerType>(OrigBase->getType())->getElementType();
+ SmallVector<Value*, 8> NewIndices;
+ if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices, TD)) {
+ // If we were able to index down into an element, create the GEP
+ // and bitcast the result. This eliminates one bitcast, potentially
+ // two.
+ Instruction *NGEP = GetElementPtrInst::Create(OrigBase,
+ NewIndices.begin(),
+ NewIndices.end(), "");
+ InsertNewInstBefore(NGEP, CI);
+ NGEP->takeName(GEP);
+
+ if (isa<BitCastInst>(CI))
+ return new BitCastInst(NGEP, CI.getType());
+ assert(isa<PtrToIntInst>(CI));
+ return new PtrToIntInst(NGEP, CI.getType());
+ }
+ }
+ }
+ }
+
+ return commonCastTransforms(CI);
+}
+
+/// isSafeIntegerType - Return true if this is a basic integer type, not a crazy
+/// type like i42. We don't want to introduce operations on random non-legal
+/// integer types where they don't already exist in the code. In the future,
+/// we should consider making this based on target data, so that 32-bit targets
+/// won't get i64 operations etc.
+static bool isSafeIntegerType(const Type *Ty) {
+ switch (Ty->getPrimitiveSizeInBits()) {
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Only the TRUNC, ZEXT, SEXT, and BITCAST casts can have both their operand
+/// and result as integer types. This function implements the common
+/// transforms for all those
+/// cases.
+/// @brief Implement the transforms common to CastInst with integer operands
+Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
+ if (Instruction *Result = commonCastTransforms(CI))
+ return Result;
+
+ Value *Src = CI.getOperand(0);
+ const Type *SrcTy = Src->getType();
+ const Type *DestTy = CI.getType();
+ uint32_t SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ uint32_t DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+ // See if we can simplify any instructions used by the LHS whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(CI))
+ return &CI;
+
+ // If the source isn't an instruction or has more than one use then we
+ // can't do anything more.
+ Instruction *SrcI = dyn_cast<Instruction>(Src);
+ if (!SrcI || !Src->hasOneUse())
+ return 0;
+
+ // Attempt to propagate the cast into the instruction for int->int casts.
+ int NumCastsRemoved = 0;
+ if (!isa<BitCastInst>(CI) &&
+ // Only do this if the dest type is a simple type, don't convert the
+ // expression tree to something weird like i93 unless the source is also
+ // strange.
+ (isSafeIntegerType(DestTy) || !isSafeIntegerType(SrcI->getType())) &&
+ CanEvaluateInDifferentType(SrcI, cast<IntegerType>(DestTy),
+ CI.getOpcode(), NumCastsRemoved)) {
+    // If this cast is a truncate, evaluating in a different type always
+ // eliminates the cast, so it is always a win. If this is a zero-extension,
+ // we need to do an AND to maintain the clear top-part of the computation,
+ // so we require that the input have eliminated at least one cast. If this
+ // is a sign extension, we insert two new casts (to do the extension) so we
+ // require that two casts have been eliminated.
+ bool DoXForm = false;
+ bool JustReplace = false;
+ switch (CI.getOpcode()) {
+ default:
+ // All the others use floating point so we shouldn't actually
+ // get here because of the check above.
+ assert(0 && "Unknown cast type");
+ case Instruction::Trunc:
+ DoXForm = true;
+ break;
+ case Instruction::ZExt: {
+ DoXForm = NumCastsRemoved >= 1;
+ if (!DoXForm && 0) {
+ // If it's unnecessary to issue an AND to clear the high bits, it's
+ // always profitable to do this xform.
+ Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, false);
+ APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
+ if (MaskedValueIsZero(TryRes, Mask))
+ return ReplaceInstUsesWith(CI, TryRes);
+
+ if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
+ if (TryI->use_empty())
+ EraseInstFromFunction(*TryI);
+ }
+ break;
+ }
+ case Instruction::SExt: {
+ DoXForm = NumCastsRemoved >= 2;
+ if (!DoXForm && !isa<TruncInst>(SrcI) && 0) {
+ // If we do not have to emit the truncate + sext pair, then it's always
+ // profitable to do this xform.
+ //
+ // It's not safe to eliminate the trunc + sext pair if one of the
+ // eliminated cast is a truncate. e.g.
+ // t2 = trunc i32 t1 to i16
+ // t3 = sext i16 t2 to i32
+ // !=
+ // i32 t1
+ Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, true);
+ unsigned NumSignBits = ComputeNumSignBits(TryRes);
+ if (NumSignBits > (DestBitSize - SrcBitSize))
+ return ReplaceInstUsesWith(CI, TryRes);
+
+ if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
+ if (TryI->use_empty())
+ EraseInstFromFunction(*TryI);
+ }
+ break;
+ }
+ }
+
+ if (DoXForm) {
+ DOUT << "ICE: EvaluateInDifferentType converting expression type to avoid"
+ << " cast: " << CI;
+ Value *Res = EvaluateInDifferentType(SrcI, DestTy,
+ CI.getOpcode() == Instruction::SExt);
+ if (JustReplace)
+ // Just replace this cast with the result.
+ return ReplaceInstUsesWith(CI, Res);
+
+ assert(Res->getType() == DestTy);
+ switch (CI.getOpcode()) {
+ default: assert(0 && "Unknown cast type!");
+ case Instruction::Trunc:
+ case Instruction::BitCast:
+ // Just replace this cast with the result.
+ return ReplaceInstUsesWith(CI, Res);
+ case Instruction::ZExt: {
+ assert(SrcBitSize < DestBitSize && "Not a zext?");
+
+ // If the high bits are already zero, just replace this cast with the
+ // result.
+ APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
+ if (MaskedValueIsZero(Res, Mask))
+ return ReplaceInstUsesWith(CI, Res);
+
+ // We need to emit an AND to clear the high bits.
+ Constant *C = ConstantInt::get(APInt::getLowBitsSet(DestBitSize,
+ SrcBitSize));
+ return BinaryOperator::CreateAnd(Res, C);
+ }
+ case Instruction::SExt: {
+ // If the high bits are already filled with sign bit, just replace this
+ // cast with the result.
+ unsigned NumSignBits = ComputeNumSignBits(Res);
+ if (NumSignBits > (DestBitSize - SrcBitSize))
+ return ReplaceInstUsesWith(CI, Res);
+
+ // We need to emit a cast to truncate, then a cast to sext.
+ return CastInst::Create(Instruction::SExt,
+ InsertCastBefore(Instruction::Trunc, Res, Src->getType(),
+ CI), DestTy);
+ }
+ }
+ }
+ }
+
+ Value *Op0 = SrcI->getNumOperands() > 0 ? SrcI->getOperand(0) : 0;
+ Value *Op1 = SrcI->getNumOperands() > 1 ? SrcI->getOperand(1) : 0;
+
+ switch (SrcI->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // If we are discarding information, rewrite.
+ if (DestBitSize <= SrcBitSize && DestBitSize != 1) {
+ // Don't insert two casts if they cannot be eliminated. We allow
+ // two casts to be inserted if the sizes are the same. This could
+ // only be converting signedness, which is a noop.
+ if (DestBitSize == SrcBitSize ||
+ !ValueRequiresCast(CI.getOpcode(), Op1, DestTy,TD) ||
+ !ValueRequiresCast(CI.getOpcode(), Op0, DestTy, TD)) {
+ Instruction::CastOps opcode = CI.getOpcode();
+ Value *Op0c = InsertCastBefore(opcode, Op0, DestTy, *SrcI);
+ Value *Op1c = InsertCastBefore(opcode, Op1, DestTy, *SrcI);
+ return BinaryOperator::Create(
+ cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
+ }
+ }
+
+ // cast (xor bool X, true) to int --> xor (cast bool X to int), 1
+ if (isa<ZExtInst>(CI) && SrcBitSize == 1 &&
+ SrcI->getOpcode() == Instruction::Xor &&
+ Op1 == ConstantInt::getTrue() &&
+ (!Op0->hasOneUse() || !isa<CmpInst>(Op0))) {
+ Value *New = InsertCastBefore(Instruction::ZExt, Op0, DestTy, CI);
+ return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1));
+ }
+ break;
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // If we are just changing the sign, rewrite.
+ if (DestBitSize == SrcBitSize) {
+ // Don't insert two casts if they cannot be eliminated. We allow
+ // two casts to be inserted if the sizes are the same. This could
+ // only be converting signedness, which is a noop.
+ if (!ValueRequiresCast(CI.getOpcode(), Op1, DestTy, TD) ||
+ !ValueRequiresCast(CI.getOpcode(), Op0, DestTy, TD)) {
+ Value *Op0c = InsertCastBefore(Instruction::BitCast,
+ Op0, DestTy, *SrcI);
+ Value *Op1c = InsertCastBefore(Instruction::BitCast,
+ Op1, DestTy, *SrcI);
+ return BinaryOperator::Create(
+ cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
+ }
+ }
+ break;
+
+ case Instruction::Shl:
+ // Allow changing the sign of the source operand. Do not allow
+ // changing the size of the shift, UNLESS the shift amount is a
+ // constant. We must not change variable sized shifts to a smaller
+ // size, because it is undefined to shift more bits out than exist
+ // in the value.
+ if (DestBitSize == SrcBitSize ||
+ (DestBitSize < SrcBitSize && isa<Constant>(Op1))) {
+ Instruction::CastOps opcode = (DestBitSize == SrcBitSize ?
+ Instruction::BitCast : Instruction::Trunc);
+ Value *Op0c = InsertCastBefore(opcode, Op0, DestTy, *SrcI);
+ Value *Op1c = InsertCastBefore(opcode, Op1, DestTy, *SrcI);
+ return BinaryOperator::CreateShl(Op0c, Op1c);
+ }
+ break;
+ case Instruction::AShr:
+ // If this is a signed shr, and if all bits shifted in are about to be
+ // truncated off, turn it into an unsigned shr to allow greater
+ // simplifications.
+ if (DestBitSize < SrcBitSize &&
+ isa<ConstantInt>(Op1)) {
+ uint32_t ShiftAmt = cast<ConstantInt>(Op1)->getLimitedValue(SrcBitSize);
+ if (SrcBitSize > ShiftAmt && SrcBitSize-ShiftAmt >= DestBitSize) {
+ // Insert the new logical shift right.
+ return BinaryOperator::CreateLShr(Op0, Op1);
+ }
+ }
+ break;
+ }
+ return 0;
+}
+
+Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
+ if (Instruction *Result = commonIntCastTransforms(CI))
+ return Result;
+
+ Value *Src = CI.getOperand(0);
+ const Type *Ty = CI.getType();
+ uint32_t DestBitWidth = Ty->getPrimitiveSizeInBits();
+ uint32_t SrcBitWidth = cast<IntegerType>(Src->getType())->getBitWidth();
+
+ // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0)
+ if (DestBitWidth == 1) {
+ Constant *One = ConstantInt::get(Src->getType(), 1);
+ Src = InsertNewInstBefore(BinaryOperator::CreateAnd(Src, One, "tmp"), CI);
+ Value *Zero = Constant::getNullValue(Src->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
+ }
+
+ // Optimize trunc(lshr(), c) to pull the shift through the truncate.
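+  // e.g. for i32 -> i16 with a shift of 4: if bits 16..19 of %X are known
+  // zero, (trunc (lshr i32 %X, 4) to i16) becomes (lshr (trunc %X to i16), 4).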
+ ConstantInt *ShAmtV = 0;
+ Value *ShiftOp = 0;
+ if (Src->hasOneUse() &&
+ match(Src, m_LShr(m_Value(ShiftOp), m_ConstantInt(ShAmtV)))) {
+ uint32_t ShAmt = ShAmtV->getLimitedValue(SrcBitWidth);
+
+ // Get a mask for the bits shifting in.
+ APInt Mask(APInt::getLowBitsSet(SrcBitWidth, ShAmt).shl(DestBitWidth));
+ if (MaskedValueIsZero(ShiftOp, Mask)) {
+ if (ShAmt >= DestBitWidth) // All zeros.
+ return ReplaceInstUsesWith(CI, Constant::getNullValue(Ty));
+
+ // Okay, we can shrink this. Truncate the input, then return a new
+ // shift.
+ Value *V1 = InsertCastBefore(Instruction::Trunc, ShiftOp, Ty, CI);
+ Value *V2 = ConstantExpr::getTrunc(ShAmtV, Ty);
+ return BinaryOperator::CreateLShr(V1, V2);
+ }
+ }
+
+ return 0;
+}
+
+/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations
+/// in order to eliminate the icmp.
+Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
+ bool DoXform) {
+  // If we are just checking for an icmp eq of a single bit and zext'ing it
+ // to an integer, then shift the bit to the appropriate place and then
+ // cast to integer to avoid the comparison.
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
+ const APInt &Op1CV = Op1C->getValue();
+
+ // zext (x <s 0) to i32 --> x>>u31 true if signbit set.
+ // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
+ if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
+        (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) {
+ if (!DoXform) return ICI;
+
+ Value *In = ICI->getOperand(0);
+ Value *Sh = ConstantInt::get(In->getType(),
+ In->getType()->getPrimitiveSizeInBits()-1);
+ In = InsertNewInstBefore(BinaryOperator::CreateLShr(In, Sh,
+ In->getName()+".lobit"),
+ CI);
+ if (In->getType() != CI.getType())
+ In = CastInst::CreateIntegerCast(In, CI.getType(),
+ false/*ZExt*/, "tmp", &CI);
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = InsertNewInstBefore(BinaryOperator::CreateXor(In, One,
+ In->getName()+".not"),
+ CI);
+ }
+
+ return ReplaceInstUsesWith(CI, In);
+ }
+
+ // zext (X == 0) to i32 --> X^1 iff X has only the low bit set.
+ // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+ // zext (X == 1) to i32 --> X iff X has only the low bit set.
+ // zext (X == 2) to i32 --> X>>1 iff X has only the 2nd bit set.
+ // zext (X != 0) to i32 --> X iff X has only the low bit set.
+ // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
+ // zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
+ // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+ if ((Op1CV == 0 || Op1CV.isPowerOf2()) &&
+ // This only works for EQ and NE
+ ICI->isEquality()) {
+      // If Op1C is some other power of two, convert:
+ uint32_t BitWidth = Op1C->getType()->getBitWidth();
+ APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ APInt TypeMask(APInt::getAllOnesValue(BitWidth));
+ ComputeMaskedBits(ICI->getOperand(0), TypeMask, KnownZero, KnownOne);
+
+ APInt KnownZeroMask(~KnownZero);
+ if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
+ if (!DoXform) return ICI;
+
+ bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE;
+ if (Op1CV != 0 && (Op1CV != KnownZeroMask)) {
+ // (X&4) == 2 --> false
+ // (X&4) != 2 --> true
+ Constant *Res = ConstantInt::get(Type::Int1Ty, isNE);
+ Res = ConstantExpr::getZExt(Res, CI.getType());
+ return ReplaceInstUsesWith(CI, Res);
+ }
+
+ uint32_t ShiftAmt = KnownZeroMask.logBase2();
+ Value *In = ICI->getOperand(0);
+ if (ShiftAmt) {
+ // Perform a logical shr by shiftamt.
+ // Insert the shift to put the result in the low bit.
+ In = InsertNewInstBefore(BinaryOperator::CreateLShr(In,
+ ConstantInt::get(In->getType(), ShiftAmt),
+ In->getName()+".lobit"), CI);
+ }
+
+ if ((Op1CV != 0) == isNE) { // Toggle the low bit.
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = BinaryOperator::CreateXor(In, One, "tmp");
+ InsertNewInstBefore(cast<Instruction>(In), CI);
+ }
+
+ if (CI.getType() == In->getType())
+ return ReplaceInstUsesWith(CI, In);
+ else
+ return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
+ }
+ }
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
+ // If one of the common conversions will work...
+ if (Instruction *Result = commonIntCastTransforms(CI))
+ return Result;
+
+ Value *Src = CI.getOperand(0);
+
+ // If this is a TRUNC followed by a ZEXT, we are dealing with integer types;
+ // if the sizes are just right, we can convert this into a logical 'and',
+ // which will be much cheaper than the pair of casts.
+ if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast
+ // Get the sizes of the types involved. We know that the intermediate type
+ // will be smaller than A or C, but don't know the relation between A and C.
+ Value *A = CSrc->getOperand(0);
+ unsigned SrcSize = A->getType()->getPrimitiveSizeInBits();
+ unsigned MidSize = CSrc->getType()->getPrimitiveSizeInBits();
+ unsigned DstSize = CI.getType()->getPrimitiveSizeInBits();
+ // If we're actually extending zero bits, then if
+ // SrcSize < DstSize: zext(a & mask)
+ // SrcSize == DstSize: a & mask
+ // SrcSize > DstSize: trunc(a) & mask
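+ // For example, zext(trunc i32 %a to i8) to i16 has SrcSize 32, MidSize 8,
+ // and DstSize 16, so it becomes (trunc i32 %a to i16) & 0xFF.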
+ if (SrcSize < DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ Constant *AndConst = ConstantInt::get(AndValue);
+ Instruction *And =
+ BinaryOperator::CreateAnd(A, AndConst, CSrc->getName()+".mask");
+ InsertNewInstBefore(And, CI);
+ return new ZExtInst(And, CI.getType());
+ } else if (SrcSize == DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ return BinaryOperator::CreateAnd(A, ConstantInt::get(AndValue));
+ } else if (SrcSize > DstSize) {
+ Instruction *Trunc = new TruncInst(A, CI.getType(), "tmp");
+ InsertNewInstBefore(Trunc, CI);
+ APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+ return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(AndValue));
+ }
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+ return transformZExtICmp(ICI, CI);
+
+ BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
+ if (SrcI && SrcI->getOpcode() == Instruction::Or) {
+ // zext (or icmp, icmp) --> or (zext icmp), (zext icmp) if at least one
+ // of the (zext icmp) will be transformed.
+ ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
+ ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
+ if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+ (transformZExtICmp(LHS, CI, false) ||
+ transformZExtICmp(RHS, CI, false))) {
+ Value *LCast = InsertCastBefore(Instruction::ZExt, LHS, CI.getType(), CI);
+ Value *RCast = InsertCastBefore(Instruction::ZExt, RHS, CI.getType(), CI);
+ return BinaryOperator::Create(Instruction::Or, LCast, RCast);
+ }
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitSExt(SExtInst &CI) {
+ if (Instruction *I = commonIntCastTransforms(CI))
+ return I;
+
+ Value *Src = CI.getOperand(0);
+
+ // Canonicalize sign-extend from i1 to a select.
+ if (Src->getType() == Type::Int1Ty)
+ return SelectInst::Create(Src,
+ ConstantInt::getAllOnesValue(CI.getType()),
+ Constant::getNullValue(CI.getType()));
+
+ // See if the value being truncated is already sign extended. If so, just
+ // eliminate the trunc/sext pair.
+ if (getOpcode(Src) == Instruction::Trunc) {
+ Value *Op = cast<User>(Src)->getOperand(0);
+ unsigned OpBits = cast<IntegerType>(Op->getType())->getBitWidth();
+ unsigned MidBits = cast<IntegerType>(Src->getType())->getBitWidth();
+ unsigned DestBits = cast<IntegerType>(CI.getType())->getBitWidth();
+ unsigned NumSignBits = ComputeNumSignBits(Op);
+
+ if (OpBits == DestBits) {
+ // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
+ // bits, the trunc/sext pair does not change the value.
+ if (NumSignBits > DestBits-MidBits)
+ return ReplaceInstUsesWith(CI, Op);
+ } else if (OpBits < DestBits) {
+ // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
+ // bits, just sext from i32.
+ if (NumSignBits > OpBits-MidBits)
+ return new SExtInst(Op, CI.getType(), "tmp");
+ } else {
+ // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
+ // bits, just truncate to i32.
+ if (NumSignBits > OpBits-MidBits)
+ return new TruncInst(Op, CI.getType(), "tmp");
+ }
+ }
+
+ // If the input is a shl/ashr pair of the same constant, then this is a sign
+ // extension from a smaller value. If we could trust arbitrary bitwidth
+ // integers, we could turn this into a truncate to the smaller bit and then
+ // use a sext for the whole extension. Since we don't, look deeper and check
+ // for a truncate. If the source and dest are the same type, eliminate the
+ // trunc and extend and just do shifts. For example, turn:
+ // %a = trunc i32 %i to i8
+ // %b = shl i8 %a, 6
+ // %c = ashr i8 %b, 6
+ // %d = sext i8 %c to i32
+ // into:
+ // %a = shl i32 %i, 30
+ // %d = ashr i32 %a, 30
+ Value *A = 0;
+ ConstantInt *BA = 0, *CA = 0;
+ if (match(Src, m_AShr(m_Shl(m_Value(A), m_ConstantInt(BA)),
+ m_ConstantInt(CA))) &&
+ BA == CA && isa<TruncInst>(A)) {
+ Value *I = cast<TruncInst>(A)->getOperand(0);
+ if (I->getType() == CI.getType()) {
+ unsigned MidSize = Src->getType()->getPrimitiveSizeInBits();
+ unsigned SrcDstSize = CI.getType()->getPrimitiveSizeInBits();
+ unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
+ Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
+ I = InsertNewInstBefore(BinaryOperator::CreateShl(I, ShAmtV,
+ CI.getName()), CI);
+ return BinaryOperator::CreateAShr(I, ShAmtV);
+ }
+ }
+
+ return 0;
+}
+
+/// FitsInFPType - Return a Constant* for the specified FP constant if it fits
+/// in the specified FP type without changing its value.
+static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+ bool losesInfo;
+ APFloat F = CFP->getValueAPF();
+ (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
+ if (!losesInfo)
+ return ConstantFP::get(F);
+ return 0;
+}
+
+/// LookThroughFPExtensions - If this is an fp extension instruction, look
+/// through it until we get the source value.
+static Value *LookThroughFPExtensions(Value *V) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (I->getOpcode() == Instruction::FPExt)
+ return LookThroughFPExtensions(I->getOperand(0));
+
+ // If this value is a constant, return the constant in the smallest FP type
+ // that can accurately represent it. This allows us to turn
+ // (float)((double)X+2.0) into X+2.0f.
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+ if (CFP->getType() == Type::PPC_FP128Ty)
+ return V; // No constant folding of this.
+ // See if the value can be truncated to float and then reextended.
+ if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle))
+ return V;
+ if (CFP->getType() == Type::DoubleTy)
+ return V; // Won't shrink.
+ if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble))
+ return V;
+ // Don't try to shrink to various long double types.
+ }
+
+ return V;
+}
+
+Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+
+ // If we have fptrunc(add (fpextend x), (fpextend y)), where x and y are
+ // smaller than the destination type, we can eliminate the truncate by doing
+ // the add as the smaller type. This applies to add/sub/mul/div as well as
+ // many builtins (sqrt, etc).
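+ // For example, (float)((double)x + (double)y), with x and y floats, can be
+ // computed directly as x + y in float.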
+ BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));
+ if (OpI && OpI->hasOneUse()) {
+ switch (OpI->getOpcode()) {
+ default: break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ const Type *SrcTy = OpI->getType();
+ Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0));
+ Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1));
+ if (LHSTrunc->getType() != SrcTy &&
+ RHSTrunc->getType() != SrcTy) {
+ unsigned DstSize = CI.getType()->getPrimitiveSizeInBits();
+ // If the source types were both smaller than the destination type of
+ // the cast, do this xform.
+ if (LHSTrunc->getType()->getPrimitiveSizeInBits() <= DstSize &&
+ RHSTrunc->getType()->getPrimitiveSizeInBits() <= DstSize) {
+ LHSTrunc = InsertCastBefore(Instruction::FPExt, LHSTrunc,
+ CI.getType(), CI);
+ RHSTrunc = InsertCastBefore(Instruction::FPExt, RHSTrunc,
+ CI.getType(), CI);
+ return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc);
+ }
+ }
+ break;
+ }
+ }
+ return 0;
+}
+
+Instruction *InstCombiner::visitFPExt(CastInst &CI) {
+ return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
+ Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
+ if (OpI == 0)
+ return commonCastTransforms(FI);
+
+ // fptoui(uitofp(X)) --> X
+ // fptoui(sitofp(X)) --> X
+ // This is safe if the intermediate type has enough bits in its mantissa to
+ // accurately represent all values of X. For example, do not do this with
+ // i64->float->i64. This is also safe in the sitofp case, because any
+ // negative 'X' value would cause an undefined result for the fptoui.
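+ // For example, fptoui(uitofp(i16 X)) through float is safe: float has a
+ // 24-bit mantissa and can represent every i16 value exactly.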
+ if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
+ OpI->getOperand(0)->getType() == FI.getType() &&
+ (int)FI.getType()->getPrimitiveSizeInBits() < /*extra bit for sign */
+ OpI->getType()->getFPMantissaWidth())
+ return ReplaceInstUsesWith(FI, OpI->getOperand(0));
+
+ return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
+ Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
+ if (OpI == 0)
+ return commonCastTransforms(FI);
+
+ // fptosi(sitofp(X)) --> X
+ // fptosi(uitofp(X)) --> X
+ // This is safe if the intermediate type has enough bits in its mantissa to
+ // accurately represent all values of X. For example, do not do this with
+ // i64->float->i64. This is also safe in the uitofp case, because an 'X'
+ // value with the sign bit set would make this fptosi undefined anyway.
+ if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
+ OpI->getOperand(0)->getType() == FI.getType() &&
+ (int)FI.getType()->getPrimitiveSizeInBits() <=
+ OpI->getType()->getFPMantissaWidth())
+ return ReplaceInstUsesWith(FI, OpI->getOperand(0));
+
+ return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
+ return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
+ return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
+ // If the destination integer type is smaller than the intptr_t type for
+ // this target, do a ptrtoint to intptr_t then do a trunc. This allows the
+ // trunc to be exposed to other transforms. Don't do this for extending
+ // ptrtoint's, because we don't know if the target sign or zero extends its
+ // pointers.
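+ // For example, with 64-bit pointers, ptrtoint i8* %p to i32 becomes a
+ // ptrtoint of %p to i64 followed by a trunc to i32.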
+ if (CI.getType()->getPrimitiveSizeInBits() < TD->getPointerSizeInBits()) {
+ Value *P = InsertNewInstBefore(new PtrToIntInst(CI.getOperand(0),
+ TD->getIntPtrType(),
+ "tmp"), CI);
+ return new TruncInst(P, CI.getType());
+ }
+
+ return commonPointerCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
+ // If the source integer type is larger than the intptr_t type for
+ // this target, do a trunc to the intptr_t type, then inttoptr of it. This
+ // allows the trunc to be exposed to other transforms. Don't do this for
+ // extending inttoptr's, because we don't know if the target sign or zero
+ // extends integers to pointers.
+ if (CI.getOperand(0)->getType()->getPrimitiveSizeInBits() >
+ TD->getPointerSizeInBits()) {
+ Value *P = InsertNewInstBefore(new TruncInst(CI.getOperand(0),
+ TD->getIntPtrType(),
+ "tmp"), CI);
+ return new IntToPtrInst(P, CI.getType());
+ }
+
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+
+ const Type *DestPointee = cast<PointerType>(CI.getType())->getElementType();
+ if (!DestPointee->isSized()) return 0;
+
+ // If this is inttoptr(add (ptrtoint x), cst), try to turn this into a GEP.
+ ConstantInt *Cst;
+ Value *X;
+ if (match(CI.getOperand(0), m_Add(m_Cast<PtrToIntInst>(m_Value(X)),
+ m_ConstantInt(Cst)))) {
+ // If the source and destination operands have the same type, see if this
+ // is a single-index GEP.
+ if (X->getType() == CI.getType()) {
+ // Get the size of the pointee type.
+ uint64_t Size = TD->getTypeAllocSize(DestPointee);
+
+ // Convert the constant to intptr type.
+ APInt Offset = Cst->getValue();
+ Offset.sextOrTrunc(TD->getPointerSizeInBits());
+
+ // If Offset is evenly divisible by Size, we can do this xform.
+ if (Size && !APIntOps::srem(Offset, APInt(Offset.getBitWidth(), Size))){
+ Offset = APIntOps::sdiv(Offset, APInt(Offset.getBitWidth(), Size));
+ return GetElementPtrInst::Create(X, ConstantInt::get(Offset));
+ }
+ }
+ // TODO: Could handle other cases, e.g. where add is indexing into field of
+ // struct etc.
+ } else if (CI.getOperand(0)->hasOneUse() &&
+ match(CI.getOperand(0), m_Add(m_Value(X), m_ConstantInt(Cst)))) {
+ // Otherwise, if this is inttoptr(add x, cst), try to turn this into an
+ // "inttoptr+GEP" instead of "add+intptr".
+
+ // Get the size of the pointee type.
+ uint64_t Size = TD->getTypeAllocSize(DestPointee);
+
+ // Convert the constant to intptr type.
+ APInt Offset = Cst->getValue();
+ Offset.sextOrTrunc(TD->getPointerSizeInBits());
+
+ // If Offset is evenly divisible by Size, we can do this xform.
+ if (Size && !APIntOps::srem(Offset, APInt(Offset.getBitWidth(), Size))){
+ Offset = APIntOps::sdiv(Offset, APInt(Offset.getBitWidth(), Size));
+
+ Instruction *P = InsertNewInstBefore(new IntToPtrInst(X, CI.getType(),
+ "tmp"), CI);
+ return GetElementPtrInst::Create(P, ConstantInt::get(Offset), "tmp");
+ }
+ }
+ return 0;
+}
+
+Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
+ // If the operands are integer typed then apply the integer transforms,
+ // otherwise just apply the common ones.
+ Value *Src = CI.getOperand(0);
+ const Type *SrcTy = Src->getType();
+ const Type *DestTy = CI.getType();
+
+ if (SrcTy->isInteger() && DestTy->isInteger()) {
+ if (Instruction *Result = commonIntCastTransforms(CI))
+ return Result;
+ } else if (isa<PointerType>(SrcTy)) {
+ if (Instruction *I = commonPointerCastTransforms(CI))
+ return I;
+ } else {
+ if (Instruction *Result = commonCastTransforms(CI))
+ return Result;
+ }
+
+ // Get rid of casts from one type to the same type. These are useless and can
+ // be replaced by the operand.
+ if (DestTy == Src->getType())
+ return ReplaceInstUsesWith(CI, Src);
+
+ if (const PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
+ const PointerType *SrcPTy = cast<PointerType>(SrcTy);
+ const Type *DstElTy = DstPTy->getElementType();
+ const Type *SrcElTy = SrcPTy->getElementType();
+
+ // If the address spaces don't match, don't eliminate the bitcast, which is
+ // required for changing types.
+ if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace())
+ return 0;
+
+ // If we are casting a malloc or alloca to a pointer to a type of the same
+ // size, rewrite the allocation instruction to allocate the "right" type.
+ if (AllocationInst *AI = dyn_cast<AllocationInst>(Src))
+ if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
+ return V;
+
+ // If the source and destination are pointers, and this cast is equivalent
+ // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
+ // This can enhance SROA and other transforms that want type-safe pointers.
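+ // For example, bitcast [10 x i32]* %p to i32* can become
+ //   getelementptr [10 x i32]* %p, i32 0, i32 0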
+ Constant *ZeroUInt = Constant::getNullValue(Type::Int32Ty);
+ unsigned NumZeros = 0;
+ while (SrcElTy != DstElTy &&
+ isa<CompositeType>(SrcElTy) && !isa<PointerType>(SrcElTy) &&
+ SrcElTy->getNumContainedTypes() /* not "{}" */) {
+ SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt);
+ ++NumZeros;
+ }
+
+ // If we found a path from the src to dest, create the getelementptr now.
+ if (SrcElTy == DstElTy) {
+ SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt);
+ return GetElementPtrInst::Create(Src, Idxs.begin(), Idxs.end(), "",
+ ((Instruction*) NULL));
+ }
+ }
+
+ if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) {
+ if (SVI->hasOneUse()) {
+ // Okay, we have (bitconvert (shuffle ..)). Check to see if this is
+ // a bitconvert to a vector with the same # elts.
+ if (isa<VectorType>(DestTy) &&
+ cast<VectorType>(DestTy)->getNumElements() ==
+ SVI->getType()->getNumElements() &&
+ SVI->getType()->getNumElements() ==
+ cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) {
+ CastInst *Tmp;
+ // If either of the operands is a cast from CI.getType(), then
+ // evaluating the shuffle in the casted destination's type will allow
+ // us to eliminate at least one cast.
+ if (((Tmp = dyn_cast<CastInst>(SVI->getOperand(0))) &&
+ Tmp->getOperand(0)->getType() == DestTy) ||
+ ((Tmp = dyn_cast<CastInst>(SVI->getOperand(1))) &&
+ Tmp->getOperand(0)->getType() == DestTy)) {
+ Value *LHS = InsertCastBefore(Instruction::BitCast,
+ SVI->getOperand(0), DestTy, CI);
+ Value *RHS = InsertCastBefore(Instruction::BitCast,
+ SVI->getOperand(1), DestTy, CI);
+ // Return a new shuffle vector. Use the same element ID's, as we
+ // know the vector types match #elts.
+ return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2));
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+/// GetSelectFoldableOperands - We want to turn code that looks like this:
+/// %C = or %A, %B
+/// %D = select %cond, %C, %A
+/// into:
+/// %C = select %cond, %B, 0
+/// %D = or %A, %C
+///
+/// Assuming that the specified instruction is an operand to the select, return
+/// a bitmask indicating which operands of this instruction are foldable if they
+/// equal the other incoming value of the select.
+///
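+/// The low bit of the returned mask marks operand 0 as foldable and the next
+/// bit marks operand 1, so a return value of 3 means either operand may fold.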
+static unsigned GetSelectFoldableOperands(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return 3; // Can fold through either operand.
+ case Instruction::Sub: // Can only fold on the amount subtracted.
+ case Instruction::Shl: // Can only fold on the shift amount.
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return 1;
+ default:
+ return 0; // Cannot fold
+ }
+}
+
+/// GetSelectFoldableConstant - For the same transformation as the previous
+/// function, return the identity constant that goes into the select.
+static Constant *GetSelectFoldableConstant(Instruction *I) {
+ switch (I->getOpcode()) {
+ default: assert(0 && "This cannot happen!"); abort();
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return Constant::getNullValue(I->getType());
+ case Instruction::And:
+ return Constant::getAllOnesValue(I->getType());
+ case Instruction::Mul:
+ return ConstantInt::get(I->getType(), 1);
+ }
+}
+
+/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI
+/// have the same opcode and only one use each. Try to simplify this.
+Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+ Instruction *FI) {
+ if (TI->getNumOperands() == 1) {
+ // If these are casts from the same source type, we can merge them into
+ // a cast of a select of the inputs.
+ if (TI->isCast()) {
+ if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType())
+ return 0;
+ } else {
+ return 0; // unknown unary op.
+ }
+
+ // Fold this by inserting a select from the input values.
+ SelectInst *NewSI = SelectInst::Create(SI.getCondition(), TI->getOperand(0),
+ FI->getOperand(0), SI.getName()+".v");
+ InsertNewInstBefore(NewSI, SI);
+ return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
+ TI->getType());
+ }
+
+ // Only handle binary operators here.
+ if (!isa<BinaryOperator>(TI))
+ return 0;
+
+ // Figure out if the operations have any operands in common.
+ Value *MatchOp, *OtherOpT, *OtherOpF;
+ bool MatchIsOpZero;
+ if (TI->getOperand(0) == FI->getOperand(0)) {
+ MatchOp = TI->getOperand(0);
+ OtherOpT = TI->getOperand(1);
+ OtherOpF = FI->getOperand(1);
+ MatchIsOpZero = true;
+ } else if (TI->getOperand(1) == FI->getOperand(1)) {
+ MatchOp = TI->getOperand(1);
+ OtherOpT = TI->getOperand(0);
+ OtherOpF = FI->getOperand(0);
+ MatchIsOpZero = false;
+ } else if (!TI->isCommutative()) {
+ return 0;
+ } else if (TI->getOperand(0) == FI->getOperand(1)) {
+ MatchOp = TI->getOperand(0);
+ OtherOpT = TI->getOperand(1);
+ OtherOpF = FI->getOperand(0);
+ MatchIsOpZero = true;
+ } else if (TI->getOperand(1) == FI->getOperand(0)) {
+ MatchOp = TI->getOperand(1);
+ OtherOpT = TI->getOperand(0);
+ OtherOpF = FI->getOperand(1);
+ MatchIsOpZero = true;
+ } else {
+ return 0;
+ }
+
+ // If we reach here, they do have operations in common.
+ SelectInst *NewSI = SelectInst::Create(SI.getCondition(), OtherOpT,
+ OtherOpF, SI.getName()+".v");
+ InsertNewInstBefore(NewSI, SI);
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
+ if (MatchIsOpZero)
+ return BinaryOperator::Create(BO->getOpcode(), MatchOp, NewSI);
+ else
+ return BinaryOperator::Create(BO->getOpcode(), NewSI, MatchOp);
+ }
+ assert(0 && "Shouldn't get here");
+ return 0;
+}
+
+static bool isSelect01(Constant *C1, Constant *C2) {
+ ConstantInt *C1I = dyn_cast<ConstantInt>(C1);
+ if (!C1I)
+ return false;
+ ConstantInt *C2I = dyn_cast<ConstantInt>(C2);
+ if (!C2I)
+ return false;
+ return (C1I->isZero() || C1I->isOne()) && (C2I->isZero() || C2I->isOne());
+}
+
+/// FoldSelectIntoOp - Try to fold the select into one of the operands to
+/// facilitate further optimization.
+Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
+ Value *FalseVal) {
+ // See the comment above GetSelectFoldableOperands for a description of the
+ // transformation we are doing here.
+ if (Instruction *TVI = dyn_cast<Instruction>(TrueVal)) {
+ if (TVI->hasOneUse() && TVI->getNumOperands() == 2 &&
+ !isa<Constant>(FalseVal)) {
+ if (unsigned SFO = GetSelectFoldableOperands(TVI)) {
+ unsigned OpToFold = 0;
+ if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
+ OpToFold = 1;
+ } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
+ OpToFold = 2;
+ }
+
+ if (OpToFold) {
+ Constant *C = GetSelectFoldableConstant(TVI);
+ Value *OOp = TVI->getOperand(2-OpToFold);
+ // Avoid creating select between 2 constants unless it's selecting
+ // between 0 and 1.
+ if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
+ Instruction *NewSel = SelectInst::Create(SI.getCondition(), OOp, C);
+ InsertNewInstBefore(NewSel, SI);
+ NewSel->takeName(TVI);
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TVI))
+ return BinaryOperator::Create(BO->getOpcode(), FalseVal, NewSel);
+ assert(0 && "Unknown instruction!!");
+ }
+ }
+ }
+ }
+ }
+
+ if (Instruction *FVI = dyn_cast<Instruction>(FalseVal)) {
+ if (FVI->hasOneUse() && FVI->getNumOperands() == 2 &&
+ !isa<Constant>(TrueVal)) {
+ if (unsigned SFO = GetSelectFoldableOperands(FVI)) {
+ unsigned OpToFold = 0;
+ if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
+ OpToFold = 1;
+ } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
+ OpToFold = 2;
+ }
+
+ if (OpToFold) {
+ Constant *C = GetSelectFoldableConstant(FVI);
+ Value *OOp = FVI->getOperand(2-OpToFold);
+ // Avoid creating select between 2 constants unless it's selecting
+ // between 0 and 1.
+ if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
+ Instruction *NewSel = SelectInst::Create(SI.getCondition(), C, OOp);
+ InsertNewInstBefore(NewSel, SI);
+ NewSel->takeName(FVI);
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FVI))
+ return BinaryOperator::Create(BO->getOpcode(), TrueVal, NewSel);
+ assert(0 && "Unknown instruction!!");
+ }
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+/// visitSelectInstWithICmp - Visit a SelectInst that has an
+/// ICmpInst as its first operand.
+///
+Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
+ ICmpInst *ICI) {
+ bool Changed = false;
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+
+ // Check cases where the comparison is with a constant that
+ // can be adjusted to fit the min/max idiom. We may edit ICI in
+ // place here, so make sure the select is the only user.
+ if (ICI->hasOneUse())
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS)) {
+ switch (Pred) {
+ default: break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: {
+ // X < MIN ? T : F --> F
+ if (CI->isMinValue(Pred == ICmpInst::ICMP_SLT))
+ return ReplaceInstUsesWith(SI, FalseVal);
+ // X < C ? X : C-1 --> X > C-1 ? C-1 : X
+ Constant *AdjustedRHS = SubOne(CI);
+ if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+ (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ CmpRHS = AdjustedRHS;
+ std::swap(FalseVal, TrueVal);
+ ICI->setPredicate(Pred);
+ ICI->setOperand(1, CmpRHS);
+ SI.setOperand(1, TrueVal);
+ SI.setOperand(2, FalseVal);
+ Changed = true;
+ }
+ break;
+ }
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT: {
+ // X > MAX ? T : F --> F
+ if (CI->isMaxValue(Pred == ICmpInst::ICMP_SGT))
+ return ReplaceInstUsesWith(SI, FalseVal);
+ // X > C ? X : C+1 --> X < C+1 ? C+1 : X
+ Constant *AdjustedRHS = AddOne(CI);
+ if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+ (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ CmpRHS = AdjustedRHS;
+ std::swap(FalseVal, TrueVal);
+ ICI->setPredicate(Pred);
+ ICI->setOperand(1, CmpRHS);
+ SI.setOperand(1, TrueVal);
+ SI.setOperand(2, FalseVal);
+ Changed = true;
+ }
+ break;
+ }
+ }
+
+ // (x <s 0) ? -1 : 0 -> ashr x, 31        -> all ones if the sign bit is set
+ // (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if the sign bit is clear
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (match(TrueVal, m_ConstantInt<-1>()) &&
+ match(FalseVal, m_ConstantInt<0>()))
+ Pred = ICI->getPredicate();
+ else if (match(TrueVal, m_ConstantInt<0>()) &&
+ match(FalseVal, m_ConstantInt<-1>()))
+ Pred = CmpInst::getInversePredicate(ICI->getPredicate());
+
+ if (Pred != CmpInst::BAD_ICMP_PREDICATE) {
+ // If we are just checking the sign bit and selecting between -1 and 0,
+ // shift the sign bit into place with an ashr and then sign extend,
+ // avoiding the comparison.
+ const APInt &Op1CV = CI->getValue();
+
+ // sext (x <s 0) to i32 --> x>>s31 true if signbit set.
+ // sext (x >s -1) to i32 --> (x>>s31)^-1 true if signbit clear.
+ if ((Pred == ICmpInst::ICMP_SLT && Op1CV == 0) ||
+ (Pred == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) {
+ Value *In = ICI->getOperand(0);
+ Value *Sh = ConstantInt::get(In->getType(),
+ In->getType()->getPrimitiveSizeInBits()-1);
+ In = InsertNewInstBefore(BinaryOperator::CreateAShr(In, Sh,
+ In->getName()+".lobit"),
+ *ICI);
+ if (In->getType() != SI.getType())
+ In = CastInst::CreateIntegerCast(In, SI.getType(),
+ true/*SExt*/, "tmp", ICI);
+
+ if (Pred == ICmpInst::ICMP_SGT)
+ In = InsertNewInstBefore(BinaryOperator::CreateNot(In,
+ In->getName()+".not"), *ICI);
+
+ return ReplaceInstUsesWith(SI, In);
+ }
+ }
+ }
+
+ if (CmpLHS == TrueVal && CmpRHS == FalseVal) {
+ // Transform (X == Y) ? X : Y -> Y
+ if (Pred == ICmpInst::ICMP_EQ)
+ return ReplaceInstUsesWith(SI, FalseVal);
+ // Transform (X != Y) ? X : Y -> X
+ if (Pred == ICmpInst::ICMP_NE)
+ return ReplaceInstUsesWith(SI, TrueVal);
+ /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
+
+ } else if (CmpLHS == FalseVal && CmpRHS == TrueVal) {
+ // Transform (X == Y) ? Y : X -> X
+ if (Pred == ICmpInst::ICMP_EQ)
+ return ReplaceInstUsesWith(SI, FalseVal);
+ // Transform (X != Y) ? Y : X -> Y
+ if (Pred == ICmpInst::ICMP_NE)
+ return ReplaceInstUsesWith(SI, TrueVal);
+ /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
+ }
+
+ /// NOTE: if we wanted to, this is where to detect integer ABS
+
+ return Changed ? &SI : 0;
+}
+
+Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+
+ // select true, X, Y -> X
+ // select false, X, Y -> Y
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CondVal))
+ return ReplaceInstUsesWith(SI, C->getZExtValue() ? TrueVal : FalseVal);
+
+ // select C, X, X -> X
+ if (TrueVal == FalseVal)
+ return ReplaceInstUsesWith(SI, TrueVal);
+
+ if (isa<UndefValue>(TrueVal)) // select C, undef, X -> X
+ return ReplaceInstUsesWith(SI, FalseVal);
+ if (isa<UndefValue>(FalseVal)) // select C, X, undef -> X
+ return ReplaceInstUsesWith(SI, TrueVal);
+ if (isa<UndefValue>(CondVal)) { // select undef, X, Y -> X or Y
+ if (isa<Constant>(TrueVal))
+ return ReplaceInstUsesWith(SI, TrueVal);
+ else
+ return ReplaceInstUsesWith(SI, FalseVal);
+ }
+
+ if (SI.getType() == Type::Int1Ty) {
+ if (ConstantInt *C = dyn_cast<ConstantInt>(TrueVal)) {
+ if (C->getZExtValue()) {
+ // Change: A = select B, true, C --> A = or B, C
+ return BinaryOperator::CreateOr(CondVal, FalseVal);
+ } else {
+ // Change: A = select B, false, C --> A = and !B, C
+ Value *NotCond =
+ InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
+ "not."+CondVal->getName()), SI);
+ return BinaryOperator::CreateAnd(NotCond, FalseVal);
+ }
+ } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
+ if (C->getZExtValue() == false) {
+ // Change: A = select B, C, false --> A = and B, C
+ return BinaryOperator::CreateAnd(CondVal, TrueVal);
+ } else {
+ // Change: A = select B, C, true --> A = or !B, C
+ Value *NotCond =
+ InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
+ "not."+CondVal->getName()), SI);
+ return BinaryOperator::CreateOr(NotCond, TrueVal);
+ }
+ }
+
+ // select a, b, a -> a&b
+ // select a, a, b -> a|b
+ if (CondVal == TrueVal)
+ return BinaryOperator::CreateOr(CondVal, FalseVal);
+ else if (CondVal == FalseVal)
+ return BinaryOperator::CreateAnd(CondVal, TrueVal);
+ }
+
+ // Selecting between two integer constants?
+ if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
+ if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal)) {
+ // select C, 1, 0 -> zext C to int
+ if (FalseValC->isZero() && TrueValC->getValue() == 1) {
+ return CastInst::Create(Instruction::ZExt, CondVal, SI.getType());
+ } else if (TrueValC->isZero() && FalseValC->getValue() == 1) {
+ // select C, 0, 1 -> zext !C to int
+ Value *NotCond =
+ InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
+ "not."+CondVal->getName()), SI);
+ return CastInst::Create(Instruction::ZExt, NotCond, SI.getType());
+ }
+
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition())) {
+
+ // (x <s 0) ? -1 : 0 -> ashr x, 31
+ if (TrueValC->isAllOnesValue() && FalseValC->isZero())
+ if (ConstantInt *CmpCst = dyn_cast<ConstantInt>(IC->getOperand(1))) {
+ if (IC->getPredicate() == ICmpInst::ICMP_SLT && CmpCst->isZero()) {
+ // The comparison constant and the result are not necessarily the
+ // same width. Make an all-ones value by inserting an AShr.
+ Value *X = IC->getOperand(0);
+ uint32_t Bits = X->getType()->getPrimitiveSizeInBits();
+ Constant *ShAmt = ConstantInt::get(X->getType(), Bits-1);
+ Instruction *SRA = BinaryOperator::Create(Instruction::AShr, X,
+ ShAmt, "ones");
+ InsertNewInstBefore(SRA, SI);
+
+ // Then cast to the appropriate width.
+ return CastInst::CreateIntegerCast(SRA, SI.getType(), true);
+ }
+ }
+
+ // If one of the constants is zero (we know they can't both be) and we
+ // have an icmp instruction with zero, and we have an 'and' with the
+ // non-constant value, eliminate this whole mess. This corresponds to
+ // cases like this: ((X & 4) ? 4 : 0)
+ if (TrueValC->isZero() || FalseValC->isZero())
+ if (IC->isEquality() && isa<ConstantInt>(IC->getOperand(1)) &&
+ cast<Constant>(IC->getOperand(1))->isNullValue())
+ if (Instruction *ICA = dyn_cast<Instruction>(IC->getOperand(0)))
+ if (ICA->getOpcode() == Instruction::And &&
+ isa<ConstantInt>(ICA->getOperand(1)) &&
+ (ICA->getOperand(1) == TrueValC ||
+ ICA->getOperand(1) == FalseValC) &&
+ isOneBitSet(cast<ConstantInt>(ICA->getOperand(1)))) {
+ // Okay, now we know that everything is set up, we just don't
+ // know whether we have an icmp_ne or icmp_eq and whether the
+ // true or false val is the zero.
+ bool ShouldNotVal = !TrueValC->isZero();
+ ShouldNotVal ^= IC->getPredicate() == ICmpInst::ICMP_NE;
+ Value *V = ICA;
+ if (ShouldNotVal)
+ V = InsertNewInstBefore(BinaryOperator::Create(
+ Instruction::Xor, V, ICA->getOperand(1)), SI);
+ return ReplaceInstUsesWith(SI, V);
+ }
+ }
+ }
+
+ // See if we are selecting two values based on a comparison of the two values.
+ if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
+ if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) {
+ // Transform (X == Y) ? X : Y -> Y
+ if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+ // This is not safe in general for floating point:
+ // consider X== -0, Y== +0.
+ // It becomes safe if either operand is a nonzero constant.
+ ConstantFP *CFPt, *CFPf;
+ if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+ !CFPt->getValueAPF().isZero()) ||
+ ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+ !CFPf->getValueAPF().isZero()))
+ return ReplaceInstUsesWith(SI, FalseVal);
+ }
+ // Transform (X != Y) ? X : Y -> X
+ if (FCI->getPredicate() == FCmpInst::FCMP_ONE)
+ return ReplaceInstUsesWith(SI, TrueVal);
+ // NOTE: if we wanted to, this is where to detect MIN/MAX
+
+ } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
+ // Transform (X == Y) ? Y : X -> X
+ if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+ // This is not safe in general for floating point:
+ // consider X== -0, Y== +0.
+ // It becomes safe if either operand is a nonzero constant.
+ ConstantFP *CFPt, *CFPf;
+ if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+ !CFPt->getValueAPF().isZero()) ||
+ ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+ !CFPf->getValueAPF().isZero()))
+ return ReplaceInstUsesWith(SI, FalseVal);
+ }
+ // Transform (X != Y) ? Y : X -> Y
+ if (FCI->getPredicate() == FCmpInst::FCMP_ONE)
+ return ReplaceInstUsesWith(SI, TrueVal);
+ // NOTE: if we wanted to, this is where to detect MIN/MAX
+ }
+ // NOTE: if we wanted to, this is where to detect ABS
+ }
+
+ // See if we are selecting two values based on a comparison of the two values.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
+ if (Instruction *Result = visitSelectInstWithICmp(SI, ICI))
+ return Result;
+
+ if (Instruction *TI = dyn_cast<Instruction>(TrueVal))
+ if (Instruction *FI = dyn_cast<Instruction>(FalseVal))
+ if (TI->hasOneUse() && FI->hasOneUse()) {
+ Instruction *AddOp = 0, *SubOp = 0;
+
+ // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+ if (TI->getOpcode() == FI->getOpcode())
+ if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+ return IV;
+
+ // Turn select C, (X+Y), (X-Y) --> (X+(select C, Y, (-Y))). This is
+ // even legal for FP.
+ if (TI->getOpcode() == Instruction::Sub &&
+ FI->getOpcode() == Instruction::Add) {
+ AddOp = FI; SubOp = TI;
+ } else if (FI->getOpcode() == Instruction::Sub &&
+ TI->getOpcode() == Instruction::Add) {
+ AddOp = TI; SubOp = FI;
+ }
+
+ if (AddOp) {
+ Value *OtherAddOp = 0;
+ if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+ OtherAddOp = AddOp->getOperand(1);
+ } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+ OtherAddOp = AddOp->getOperand(0);
+ }
+
+ if (OtherAddOp) {
+ // So at this point we know we have (Y -> OtherAddOp):
+ // select C, (add X, Y), (sub X, Z)
+ Value *NegVal; // Compute -Z
+ if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
+ NegVal = ConstantExpr::getNeg(C);
+ } else {
+ NegVal = InsertNewInstBefore(
+ BinaryOperator::CreateNeg(SubOp->getOperand(1), "tmp"), SI);
+ }
+
+ Value *NewTrueOp = OtherAddOp;
+ Value *NewFalseOp = NegVal;
+ if (AddOp != TI)
+ std::swap(NewTrueOp, NewFalseOp);
+ Instruction *NewSel =
+ SelectInst::Create(CondVal, NewTrueOp,
+ NewFalseOp, SI.getName() + ".p");
+
+ NewSel = InsertNewInstBefore(NewSel, SI);
+ return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
+ }
+ }
+ }
+
+ // See if we can fold the select into one of our operands.
+ if (SI.getType()->isInteger()) {
+ Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal);
+ if (FoldI)
+ return FoldI;
+ }
+
+ if (BinaryOperator::isNot(CondVal)) {
+ SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
+ SI.setOperand(1, FalseVal);
+ SI.setOperand(2, TrueVal);
+ return &SI;
+ }
+
+ return 0;
+}
+
+/// EnforceKnownAlignment - If the specified pointer points to an object that
+/// we control, modify the object's alignment to PrefAlign. This isn't
+/// often possible though. If alignment is important, a more reliable approach
+/// is to simply align all global variables and allocation instructions to
+/// their preferred alignment from the beginning.
+///
+static unsigned EnforceKnownAlignment(Value *V,
+ unsigned Align, unsigned PrefAlign) {
+
+ User *U = dyn_cast<User>(V);
+ if (!U) return Align;
+
+ switch (getOpcode(U)) {
+ default: break;
+ case Instruction::BitCast:
+ return EnforceKnownAlignment(U->getOperand(0), Align, PrefAlign);
+ case Instruction::GetElementPtr: {
+ // If all indexes are zero, it is just the alignment of the base pointer.
+ bool AllZeroOperands = true;
+ for (User::op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i)
+ if (!isa<Constant>(*i) ||
+ !cast<Constant>(*i)->isNullValue()) {
+ AllZeroOperands = false;
+ break;
+ }
+
+ if (AllZeroOperands) {
+ // Treat this like a bitcast.
+ return EnforceKnownAlignment(U->getOperand(0), Align, PrefAlign);
+ }
+ break;
+ }
+ }
+
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // If there is a large requested alignment and we can, bump up the alignment
+ // of the global.
+ if (!GV->isDeclaration()) {
+ if (GV->getAlignment() >= PrefAlign)
+ Align = GV->getAlignment();
+ else {
+ GV->setAlignment(PrefAlign);
+ Align = PrefAlign;
+ }
+ }
+ } else if (AllocationInst *AI = dyn_cast<AllocationInst>(V)) {
+ // If there is a requested alignment and if this is an alloca, round up. We
+ // don't do this for malloc, because some systems can't respect the request.
+ if (isa<AllocaInst>(AI)) {
+ if (AI->getAlignment() >= PrefAlign)
+ Align = AI->getAlignment();
+ else {
+ AI->setAlignment(PrefAlign);
+ Align = PrefAlign;
+ }
+ }
+ }
+
+ return Align;
+}
+
+/// GetOrEnforceKnownAlignment - If the specified pointer has an alignment that
+/// we can determine, return it, otherwise return 0. If PrefAlign is specified,
+/// and it is more than the alignment of the ultimate object, see if we can
+/// increase the alignment of the ultimate object, making this check succeed.
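+/// For example, if the low three bits of the pointer are known to be zero,
+/// the computed alignment is at least 8.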
+unsigned InstCombiner::GetOrEnforceKnownAlignment(Value *V,
+ unsigned PrefAlign) {
+ unsigned BitWidth = TD ? TD->getTypeSizeInBits(V->getType()) :
+ sizeof(PrefAlign) * CHAR_BIT;
+ APInt Mask = APInt::getAllOnesValue(BitWidth);
+ APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ ComputeMaskedBits(V, Mask, KnownZero, KnownOne);
+ unsigned TrailZ = KnownZero.countTrailingOnes();
+ unsigned Align = 1u << std::min(BitWidth - 1, TrailZ);
+
+ if (PrefAlign > Align)
+ Align = EnforceKnownAlignment(V, Align, PrefAlign);
+
+ // Return whatever alignment we ended up with.
+ return Align;
+}
+
+Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
+ unsigned DstAlign = GetOrEnforceKnownAlignment(MI->getOperand(1));
+ unsigned SrcAlign = GetOrEnforceKnownAlignment(MI->getOperand(2));
+ unsigned MinAlign = std::min(DstAlign, SrcAlign);
+ unsigned CopyAlign = MI->getAlignment();
+
+ if (CopyAlign < MinAlign) {
+ MI->setAlignment(MinAlign);
+ return MI;
+ }
+
+ // If the MemCpyInst length is 1/2/4/8 bytes, then replace the memcpy with
+ // a load/store pair.
+ ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getOperand(3));
+ if (MemOpLength == 0) return 0;
+
+ // Source and destination pointer types are always "i8*" for the intrinsic.
+ // See if the size is something we can handle with a single primitive
+ // load/store.
+ // A single load+store correctly handles overlapping memory in the memmove
+ // case.
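+ // For example, an 8-byte memcpy can become an i64 load of the source
+ // followed by an i64 store to the destination (unless a better element
+ // type is found below).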
+ unsigned Size = MemOpLength->getZExtValue();
+ if (Size == 0) return MI; // Delete this mem transfer.
+
+ if (Size > 8 || (Size&(Size-1)))
+ return 0; // If not 1/2/4/8 bytes, exit.
+
+ // Use an integer load+store unless we can find something better.
+ const Type *NewPtrTy = PointerType::getUnqual(IntegerType::get(Size<<3));
+
+ // Memcpy forces the use of i8* for the source and destination. That means
+ // that if you're using memcpy to move one double around, you'll get a cast
+ // from double* to i8*. We'd much rather use a double load+store than an
+ // i64 load+store here, because this improves the odds that the source or
+ // dest address will be promotable. See if we can find a better type than the
+ // integer datatype.
+ if (Value *Op = getBitCastOperand(MI->getOperand(1))) {
+ const Type *SrcETy = cast<PointerType>(Op->getType())->getElementType();
+ if (SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
+ // The SrcETy might be something like {{{double}}} or [1 x double]. Rip
+ // down through these levels if so.
+ while (!SrcETy->isSingleValueType()) {
+ if (const StructType *STy = dyn_cast<StructType>(SrcETy)) {
+ if (STy->getNumElements() == 1)
+ SrcETy = STy->getElementType(0);
+ else
+ break;
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) {
+ if (ATy->getNumElements() == 1)
+ SrcETy = ATy->getElementType();
+ else
+ break;
+ } else
+ break;
+ }
+
+ if (SrcETy->isSingleValueType())
+ NewPtrTy = PointerType::getUnqual(SrcETy);
+ }
+ }
+
+ // If the memcpy/memmove provides better alignment info than we can
+ // infer, use it.
+ SrcAlign = std::max(SrcAlign, CopyAlign);
+ DstAlign = std::max(DstAlign, CopyAlign);
+
+ Value *Src = InsertBitCastBefore(MI->getOperand(2), NewPtrTy, *MI);
+ Value *Dest = InsertBitCastBefore(MI->getOperand(1), NewPtrTy, *MI);
+ Instruction *L = new LoadInst(Src, "tmp", false, SrcAlign);
+ InsertNewInstBefore(L, *MI);
+ InsertNewInstBefore(new StoreInst(L, Dest, false, DstAlign), *MI);
+
+ // Set the size of the copy to 0, it will be deleted on the next iteration.
+ MI->setOperand(3, Constant::getNullValue(MemOpLength->getType()));
+ return MI;
+}
+
+Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
+ unsigned Alignment = GetOrEnforceKnownAlignment(MI->getDest());
+ if (MI->getAlignment() < Alignment) {
+ MI->setAlignment(Alignment);
+ return MI;
+ }
+
+ // Extract the length and alignment and fill if they are constant.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
+ ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
+ if (!LenC || !FillC || FillC->getType() != Type::Int8Ty)
+ return 0;
+ uint64_t Len = LenC->getZExtValue();
+ Alignment = MI->getAlignment();
+
+ // If the length is zero, this is a no-op
+ if (Len == 0) return MI; // memset(d,c,0,a) -> noop
+
+ // memset(s,c,n) -> store s, c (for n=1,2,4,8)
+ if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
+ const Type *ITy = IntegerType::get(Len*8); // n=1 -> i8.
+
+ Value *Dest = MI->getDest();
+ Dest = InsertBitCastBefore(Dest, PointerType::getUnqual(ITy), *MI);
+
+ // Alignment 0 means the same as alignment 1 for memset, but not for store.
+ if (Alignment == 0) Alignment = 1;
+
+ // Extract the fill value and store.
+ uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
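+ // The multiply replicates the fill byte across the word; e.g. a fill byte
+ // of 0xAB with Len == 4 stores the i32 value 0xABABABAB (the constant is
+ // truncated to ITy).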
+ InsertNewInstBefore(new StoreInst(ConstantInt::get(ITy, Fill), Dest, false,
+ Alignment), *MI);
+
+ // Set the size of the copy to 0, it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(LenC->getType()));
+ return MI;
+ }
+
+ return 0;
+}
+
+
+/// visitCallInst - CallInst simplification. This mostly only handles folding
+/// of intrinsic instructions. For normal calls, it allows visitCallSite to do
+/// the heavy lifting.
+///
+Instruction *InstCombiner::visitCallInst(CallInst &CI) {
+ // If the caller function is nounwind, mark the call as nounwind, even if the
+ // callee isn't.
+ if (CI.getParent()->getParent()->doesNotThrow() &&
+ !CI.doesNotThrow()) {
+ CI.setDoesNotThrow();
+ return &CI;
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
+ if (!II) return visitCallSite(&CI);
+
+ // Intrinsics cannot occur in an invoke, so handle them here instead of in
+ // visitCallSite.
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+ bool Changed = false;
+
+ // memmove/cpy/set of zero bytes is a noop.
+ if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
+ if (NumBytes->isNullValue()) return EraseInstFromFunction(CI);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
+ if (CI->getZExtValue() == 1) {
+ // Replace the instruction with just byte operations. We would
+ // transform other cases to loads/stores, but we don't know if
+ // alignment is sufficient.
+ }
+ }
+
+ // If we have a memmove and the source operand is a constant global,
+ // then the source and dest pointers can't alias, so we can change this
+ // into a call to memcpy.
+ if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+ if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
+ if (GVSrc->isConstant()) {
+ Module *M = CI.getParent()->getParent()->getParent();
+ Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+ const Type *Tys[1];
+ Tys[0] = CI.getOperand(3)->getType();
+ CI.setOperand(0,
+ Intrinsic::getDeclaration(M, MemCpyID, Tys, 1));
+ Changed = true;
+ }
+
+ // memmove(x,x,size) -> noop.
+ if (MMI->getSource() == MMI->getDest())
+ return EraseInstFromFunction(CI);
+ }
+
+ // If we can determine a pointer alignment that is bigger than currently
+ // set, update the alignment.
+ if (isa<MemTransferInst>(MI)) {
+ if (Instruction *I = SimplifyMemTransfer(MI))
+ return I;
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
+ if (Instruction *I = SimplifyMemSet(MSI))
+ return I;
+ }
+
+ if (Changed) return II;
+ }
+
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::bswap:
+ // bswap(bswap(x)) -> x
+ if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getOperand(1)))
+ if (Operand->getIntrinsicID() == Intrinsic::bswap)
+ return ReplaceInstUsesWith(CI, Operand->getOperand(1));
+ break;
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::x86_sse_loadu_ps:
+ case Intrinsic::x86_sse2_loadu_pd:
+ case Intrinsic::x86_sse2_loadu_dq:
+ // Turn PPC lvx -> load if the pointer is known aligned.
+ // Turn X86 loadups -> load if the pointer is known aligned.
+ if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
+ Value *Ptr = InsertBitCastBefore(II->getOperand(1),
+ PointerType::getUnqual(II->getType()),
+ CI);
+ return new LoadInst(Ptr);
+ }
+ break;
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ // Turn stvx -> store if the pointer is known aligned.
+ if (GetOrEnforceKnownAlignment(II->getOperand(2), 16) >= 16) {
+ const Type *OpPtrTy =
+ PointerType::getUnqual(II->getOperand(1)->getType());
+ Value *Ptr = InsertBitCastBefore(II->getOperand(2), OpPtrTy, CI);
+ return new StoreInst(II->getOperand(1), Ptr);
+ }
+ break;
+ case Intrinsic::x86_sse_storeu_ps:
+ case Intrinsic::x86_sse2_storeu_pd:
+ case Intrinsic::x86_sse2_storeu_dq:
+ // Turn X86 storeu -> store if the pointer is known aligned.
+ if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
+ const Type *OpPtrTy =
+ PointerType::getUnqual(II->getOperand(2)->getType());
+ Value *Ptr = InsertBitCastBefore(II->getOperand(1), OpPtrTy, CI);
+ return new StoreInst(II->getOperand(2), Ptr);
+ }
+ break;
+
+ case Intrinsic::x86_sse_cvttss2si: {
+ // This intrinsic only demands the 0th element of its input vector. If
+ // we can simplify the input based on that, do so now.
+ unsigned VWidth =
+ cast<VectorType>(II->getOperand(1)->getType())->getNumElements();
+ APInt DemandedElts(VWidth, 1);
+ APInt UndefElts(VWidth, 0);
+ if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
+ UndefElts)) {
+ II->setOperand(1, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::ppc_altivec_vperm:
+ // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+ if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getOperand(3))) {
+ assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
+
+ // Check that all of the elements are integer constants or undefs.
+ bool AllEltsOk = true;
+ for (unsigned i = 0; i != 16; ++i) {
+ if (!isa<ConstantInt>(Mask->getOperand(i)) &&
+ !isa<UndefValue>(Mask->getOperand(i))) {
+ AllEltsOk = false;
+ break;
+ }
+ }
+
+ if (AllEltsOk) {
+ // Cast the input vectors to byte vectors.
+ Value *Op0 =InsertBitCastBefore(II->getOperand(1),Mask->getType(),CI);
+ Value *Op1 =InsertBitCastBefore(II->getOperand(2),Mask->getType(),CI);
+ Value *Result = UndefValue::get(Op0->getType());
+
+ // Only extract each element once.
+ Value *ExtractedElts[32];
+ memset(ExtractedElts, 0, sizeof(ExtractedElts));
+
+ for (unsigned i = 0; i != 16; ++i) {
+ if (isa<UndefValue>(Mask->getOperand(i)))
+ continue;
+ unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue();
+ Idx &= 31; // Match the hardware behavior.
+
+ if (ExtractedElts[Idx] == 0) {
+ Instruction *Elt =
+ new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
+ InsertNewInstBefore(Elt, CI);
+ ExtractedElts[Idx] = Elt;
+ }
+
+ // Insert this value into the result vector.
+ Result = InsertElementInst::Create(Result, ExtractedElts[Idx],
+ i, "tmp");
+ InsertNewInstBefore(cast<Instruction>(Result), CI);
+ }
+ return CastInst::Create(Instruction::BitCast, Result, CI.getType());
+ }
+ }
+ break;
+
+ case Intrinsic::stackrestore: {
+ // If the save is right next to the restore, remove the restore. This can
+ // happen when variable allocas are DCE'd.
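+ // For example (illustrative IR):
+ //   %ss = call i8* @llvm.stacksave()
+ //   call void @llvm.stackrestore(i8* %ss)
+ // Here the restore does nothing and can be erased.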
+ if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getOperand(1))) {
+ if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+ BasicBlock::iterator BI = SS;
+ if (&*++BI == II)
+ return EraseInstFromFunction(CI);
+ }
+ }
+
+ // Scan down this block to see if there is another stack restore in the
+ // same block without an intervening call/alloca.
+ BasicBlock::iterator BI = II;
+ TerminatorInst *TI = II->getParent()->getTerminator();
+ bool CannotRemove = false;
+ for (++BI; &*BI != TI; ++BI) {
+ if (isa<AllocaInst>(BI)) {
+ CannotRemove = true;
+ break;
+ }
+ if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
+ // If there is a stackrestore below this one, remove this one.
+ if (II->getIntrinsicID() == Intrinsic::stackrestore)
+ return EraseInstFromFunction(CI);
+ // Otherwise, ignore the intrinsic.
+ } else {
+ // If we found a non-intrinsic call, we can't remove the stack
+ // restore.
+ CannotRemove = true;
+ break;
+ }
+ }
+ }
+
+ // If the stack restore is in a return/unwind block and if there are no
+ // allocas or calls between the restore and the return, nuke the restore.
+ if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)))
+ return EraseInstFromFunction(CI);
+ break;
+ }
+ }
+
+ return visitCallSite(II);
+}
+
+// InvokeInst simplification
+//
+Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
+ return visitCallSite(&II);
+}
+
+/// isSafeToEliminateVarargsCast - If this cast does not affect the value
+/// passed through the varargs area, we can eliminate the use of the cast.
+static bool isSafeToEliminateVarargsCast(const CallSite CS,
+ const CastInst * const CI,
+ const TargetData * const TD,
+ const int ix) {
+ if (!CI->isLosslessCast())
+ return false;
+
+ // The size of ByVal arguments is derived from the type, so we
+ // can't change to a type with a different size. If the size were
+ // passed explicitly we could avoid this check.
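+ // For example, an i32* argument bitcast to i8* and passed through the
+ // varargs area can instead be passed as the original i32*.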
+ if (!CS.paramHasAttr(ix, Attribute::ByVal))
+ return true;
+
+ const Type* SrcTy =
+ cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
+ const Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
+ if (!SrcTy->isSized() || !DstTy->isSized())
+ return false;
+ if (TD->getTypeAllocSize(SrcTy) != TD->getTypeAllocSize(DstTy))
+ return false;
+ return true;
+}
+
+// visitCallSite - Improvements for call and invoke instructions.
+//
+Instruction *InstCombiner::visitCallSite(CallSite CS) {
+ bool Changed = false;
+
+ // If the callee is a constexpr cast of a function, attempt to move the cast
+ // to the arguments of the call/invoke.
+ if (transformConstExprCastCall(CS)) return 0;
+
+ Value *Callee = CS.getCalledValue();
+
+ if (Function *CalleeF = dyn_cast<Function>(Callee))
+ if (CalleeF->getCallingConv() != CS.getCallingConv()) {
+ Instruction *OldCall = CS.getInstruction();
+ // If the call and callee calling conventions don't match, this call must
+ // be unreachable, as the call is undefined.
+ new StoreInst(ConstantInt::getTrue(),
+ UndefValue::get(PointerType::getUnqual(Type::Int1Ty)),
+ OldCall);
+ if (!OldCall->use_empty())
+ OldCall->replaceAllUsesWith(UndefValue::get(OldCall->getType()));
+ if (isa<CallInst>(OldCall)) // Not worth removing an invoke here.
+ return EraseInstFromFunction(*OldCall);
+ return 0;
+ }
+
+ if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ // This instruction is not reachable, just remove it. We insert a store to
+ // undef so that we know that this code is not reachable, despite the fact
+ // that we can't modify the CFG here.
+ new StoreInst(ConstantInt::getTrue(),
+ UndefValue::get(PointerType::getUnqual(Type::Int1Ty)),
+ CS.getInstruction());
+
+ if (!CS.getInstruction()->use_empty())
+ CS.getInstruction()->
+ replaceAllUsesWith(UndefValue::get(CS.getInstruction()->getType()));
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ // Don't break the CFG, insert a dummy cond branch.
+ BranchInst::Create(II->getNormalDest(), II->getUnwindDest(),
+ ConstantInt::getTrue(), II);
+ }
+ return EraseInstFromFunction(*CS.getInstruction());
+ }
+
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(Callee))
+ if (IntrinsicInst *In = dyn_cast<IntrinsicInst>(BC->getOperand(0)))
+ if (In->getIntrinsicID() == Intrinsic::init_trampoline)
+ return transformCallThroughTrampoline(CS);
+
+ const PointerType *PTy = cast<PointerType>(Callee->getType());
+ const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ if (FTy->isVarArg()) {
+ int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 3 : 1);
+ // See if we can optimize any arguments passed through the varargs area of
+ // the call.
+ for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(),
+ E = CS.arg_end(); I != E; ++I, ++ix) {
+ CastInst *CI = dyn_cast<CastInst>(*I);
+ if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) {
+ *I = CI->getOperand(0);
+ Changed = true;
+ }
+ }
+ }
+
+ if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) {
+ // Inline asm calls cannot throw - mark them 'nounwind'.
+ CS.setDoesNotThrow();
+ Changed = true;
+ }
+
+ return Changed ? CS.getInstruction() : 0;
+}
+
+// transformConstExprCastCall - If the callee is a constexpr cast of a function,
+// attempt to move the cast to the arguments of the call/invoke.
+//
+bool InstCombiner::transformConstExprCastCall(CallSite CS) {
+ if (!isa<ConstantExpr>(CS.getCalledValue())) return false;
+ ConstantExpr *CE = cast<ConstantExpr>(CS.getCalledValue());
+ if (CE->getOpcode() != Instruction::BitCast ||
+ !isa<Function>(CE->getOperand(0)))
+ return false;
+ Function *Callee = cast<Function>(CE->getOperand(0));
+ Instruction *Caller = CS.getInstruction();
+ const AttrListPtr &CallerPAL = CS.getAttributes();
+
+ // Okay, this is a cast from a function to a different type. Unless doing so
+ // would cause a type conversion of one of our arguments, change this call to
+ // be a direct call with arguments cast to the appropriate types.
+ //
+ const FunctionType *FT = Callee->getFunctionType();
+ const Type *OldRetTy = Caller->getType();
+ const Type *NewRetTy = FT->getReturnType();
+
+ if (isa<StructType>(NewRetTy))
+ return false; // TODO: Handle multiple return values.
+
+ // Check to see if we are changing the return type...
+ if (OldRetTy != NewRetTy) {
+ if (Callee->isDeclaration() &&
+ // Conversion is ok if changing from one pointer type to another or from
+ // a pointer to an integer of the same size.
+ !((isa<PointerType>(OldRetTy) || OldRetTy == TD->getIntPtrType()) &&
+ (isa<PointerType>(NewRetTy) || NewRetTy == TD->getIntPtrType())))
+ return false; // Cannot transform this return value.
+
+ if (!Caller->use_empty() &&
+ // void -> non-void is handled specially
+ NewRetTy != Type::VoidTy && !CastInst::isCastable(NewRetTy, OldRetTy))
+ return false; // Cannot transform this return value.
+
+ if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
+ Attributes RAttrs = CallerPAL.getRetAttributes();
+ if (RAttrs & Attribute::typeIncompatible(NewRetTy))
+ return false; // Attribute not compatible with transformed value.
+ }
+
+ // If the callsite is an invoke instruction, and the return value is used by
+ // a PHI node in a successor, we cannot change the return type of the call
+ // because there is no place to put the cast instruction (without breaking
+ // the critical edge). Bail out in this case.
+ if (!Caller->use_empty())
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
+ for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
+ UI != E; ++UI)
+ if (PHINode *PN = dyn_cast<PHINode>(*UI))
+ if (PN->getParent() == II->getNormalDest() ||
+ PN->getParent() == II->getUnwindDest())
+ return false;
+ }
+
+ unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+ unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
+
+ CallSite::arg_iterator AI = CS.arg_begin();
+ for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
+ const Type *ParamTy = FT->getParamType(i);
+ const Type *ActTy = (*AI)->getType();
+
+ if (!CastInst::isCastable(ActTy, ParamTy))
+ return false; // Cannot transform this parameter value.
+
+ if (CallerPAL.getParamAttributes(i + 1)
+ & Attribute::typeIncompatible(ParamTy))
+ return false; // Attribute not compatible with transformed value.
+
+ // Converting from one pointer type to another or between a pointer and an
+ // integer of the same size is safe even if we do not have a body.
+ bool isConvertible = ActTy == ParamTy ||
+ ((isa<PointerType>(ParamTy) || ParamTy == TD->getIntPtrType()) &&
+ (isa<PointerType>(ActTy) || ActTy == TD->getIntPtrType()));
+ if (Callee->isDeclaration() && !isConvertible) return false;
+ }
+
+ if (FT->getNumParams() < NumActualArgs && !FT->isVarArg() &&
+ Callee->isDeclaration())
+ return false; // Do not delete arguments unless we have a function body.
+
+ if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
+ !CallerPAL.isEmpty())
+ // In this case we have more arguments than the new function type, but we
+ // won't be dropping them. Check that these extra arguments have attributes
+ // that are compatible with being a vararg call argument.
+ for (unsigned i = CallerPAL.getNumSlots(); i; --i) {
+ if (CallerPAL.getSlot(i - 1).Index <= FT->getNumParams())
+ break;
+ Attributes PAttrs = CallerPAL.getSlot(i - 1).Attrs;
+ if (PAttrs & Attribute::VarArgsIncompatible)
+ return false;
+ }
+
+ // Okay, we decided that this is a safe thing to do: go ahead and start
+ // inserting cast instructions as necessary...
+ std::vector<Value*> Args;
+ Args.reserve(NumActualArgs);
+ SmallVector<AttributeWithIndex, 8> attrVec;
+ attrVec.reserve(NumCommonArgs);
+
+ // Get any return attributes.
+ Attributes RAttrs = CallerPAL.getRetAttributes();
+
+ // If the return value is not being used, the type may not be compatible
+ // with the existing attributes. Wipe out any problematic attributes.
+ RAttrs &= ~Attribute::typeIncompatible(NewRetTy);
+
+ // Add the new return attributes.
+ if (RAttrs)
+ attrVec.push_back(AttributeWithIndex::get(0, RAttrs));
+
+ AI = CS.arg_begin();
+ for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
+ const Type *ParamTy = FT->getParamType(i);
+ if ((*AI)->getType() == ParamTy) {
+ Args.push_back(*AI);
+ } else {
+ Instruction::CastOps opcode = CastInst::getCastOpcode(*AI,
+ false, ParamTy, false);
+ CastInst *NewCast = CastInst::Create(opcode, *AI, ParamTy, "tmp");
+ Args.push_back(InsertNewInstBefore(NewCast, *Caller));
+ }
+
+ // Add any parameter attributes.
+ if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1))
+ attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
+ }
+
+ // If the function takes more arguments than the call was taking, add them
+ // now...
+ for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i)
+ Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+
+ // If we are removing arguments to the function, emit an obnoxious warning...
+ if (FT->getNumParams() < NumActualArgs) {
+ if (!FT->isVarArg()) {
+ cerr << "WARNING: While resolving call to function '"
+ << Callee->getName() << "' arguments were dropped!\n";
+ } else {
+ // Add all of the arguments in their promoted form to the arg list...
+ for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
+ const Type *PTy = getPromotedType((*AI)->getType());
+ if (PTy != (*AI)->getType()) {
+ // Must promote to pass through va_arg area!
+ Instruction::CastOps opcode = CastInst::getCastOpcode(*AI, false,
+ PTy, false);
+ Instruction *Cast = CastInst::Create(opcode, *AI, PTy, "tmp");
+ InsertNewInstBefore(Cast, *Caller);
+ Args.push_back(Cast);
+ } else {
+ Args.push_back(*AI);
+ }
+
+ // Add any parameter attributes.
+ if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1))
+ attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
+ }
+ }
+ }
+
+ if (Attributes FnAttrs = CallerPAL.getFnAttributes())
+ attrVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+
+ if (NewRetTy == Type::VoidTy)
+ Caller->setName(""); // Void type should not have a name.
+
+ const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec.begin(),attrVec.end());
+
+ Instruction *NC;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ NC = InvokeInst::Create(Callee, II->getNormalDest(), II->getUnwindDest(),
+ Args.begin(), Args.end(),
+ Caller->getName(), Caller);
+ cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
+ cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);
+ } else {
+ NC = CallInst::Create(Callee, Args.begin(), Args.end(),
+ Caller->getName(), Caller);
+ CallInst *CI = cast<CallInst>(Caller);
+ if (CI->isTailCall())
+ cast<CallInst>(NC)->setTailCall();
+ cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
+ cast<CallInst>(NC)->setAttributes(NewCallerPAL);
+ }
+
+ // Insert a cast of the return type as necessary.
+ Value *NV = NC;
+ if (OldRetTy != NV->getType() && !Caller->use_empty()) {
+ if (NV->getType() != Type::VoidTy) {
+ Instruction::CastOps opcode = CastInst::getCastOpcode(NC, false,
+ OldRetTy, false);
+ NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp");
+
+      // If this is an invoke instruction, we should insert it after the first
+      // non-phi instruction in the normal successor block.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ BasicBlock::iterator I = II->getNormalDest()->getFirstNonPHI();
+ InsertNewInstBefore(NC, *I);
+ } else {
+ // Otherwise, it's a call, just insert cast right after the call instr
+ InsertNewInstBefore(NC, *Caller);
+ }
+ AddUsersToWorkList(*Caller);
+ } else {
+ NV = UndefValue::get(Caller->getType());
+ }
+ }
+
+ if (Caller->getType() != Type::VoidTy && !Caller->use_empty())
+ Caller->replaceAllUsesWith(NV);
+ Caller->eraseFromParent();
+ RemoveFromWorkList(Caller);
+ return true;
+}
+
+// transformCallThroughTrampoline - Turn a call to a function created by the
+// init_trampoline intrinsic into a direct call to the underlying function.
+//
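+// Sketch of the pattern handled here (illustrative only): given
+//   %p = bitcast i8* %tramp to i32 (i32)*
+//   %r = call i32 %p(i32 %x)
+// where %tramp is the result of llvm.init.trampoline for a function
+//   i32 @f(i8* nest %chain, i32 %x)
+// with nest value %nval, the call becomes
+//   %r = call i32 @f(i8* nest %nval, i32 %x)
+//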
+Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
+ Value *Callee = CS.getCalledValue();
+ const PointerType *PTy = cast<PointerType>(Callee->getType());
+ const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ const AttrListPtr &Attrs = CS.getAttributes();
+
+ // If the call already has the 'nest' attribute somewhere then give up -
+ // otherwise 'nest' would occur twice after splicing in the chain.
+ if (Attrs.hasAttrSomewhere(Attribute::Nest))
+ return 0;
+
+ IntrinsicInst *Tramp =
+ cast<IntrinsicInst>(cast<BitCastInst>(Callee)->getOperand(0));
+
+ Function *NestF = cast<Function>(Tramp->getOperand(2)->stripPointerCasts());
+ const PointerType *NestFPTy = cast<PointerType>(NestF->getType());
+ const FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
+
+ const AttrListPtr &NestAttrs = NestF->getAttributes();
+ if (!NestAttrs.isEmpty()) {
+ unsigned NestIdx = 1;
+ const Type *NestTy = 0;
+ Attributes NestAttr = Attribute::None;
+
+ // Look for a parameter marked with the 'nest' attribute.
+ for (FunctionType::param_iterator I = NestFTy->param_begin(),
+ E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
+ if (NestAttrs.paramHasAttr(NestIdx, Attribute::Nest)) {
+ // Record the parameter type and any other attributes.
+ NestTy = *I;
+ NestAttr = NestAttrs.getParamAttributes(NestIdx);
+ break;
+ }
+
+ if (NestTy) {
+ Instruction *Caller = CS.getInstruction();
+ std::vector<Value*> NewArgs;
+ NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1);
+
+ SmallVector<AttributeWithIndex, 8> NewAttrs;
+ NewAttrs.reserve(Attrs.getNumSlots() + 1);
+
+ // Insert the nest argument into the call argument list, which may
+ // mean appending it. Likewise for attributes.
+
+ // Add any result attributes.
+ if (Attributes Attr = Attrs.getRetAttributes())
+ NewAttrs.push_back(AttributeWithIndex::get(0, Attr));
+
+ {
+ unsigned Idx = 1;
+ CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+ do {
+ if (Idx == NestIdx) {
+ // Add the chain argument and attributes.
+ Value *NestVal = Tramp->getOperand(3);
+ if (NestVal->getType() != NestTy)
+ NestVal = new BitCastInst(NestVal, NestTy, "nest", Caller);
+ NewArgs.push_back(NestVal);
+ NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr));
+ }
+
+ if (I == E)
+ break;
+
+ // Add the original argument and attributes.
+ NewArgs.push_back(*I);
+ if (Attributes Attr = Attrs.getParamAttributes(Idx))
+ NewAttrs.push_back
+ (AttributeWithIndex::get(Idx + (Idx >= NestIdx), Attr));
+
+ ++Idx, ++I;
+ } while (1);
+ }
+
+ // Add any function attributes.
+ if (Attributes Attr = Attrs.getFnAttributes())
+ NewAttrs.push_back(AttributeWithIndex::get(~0, Attr));
+
+ // The trampoline may have been bitcast to a bogus type (FTy).
+ // Handle this by synthesizing a new function type, equal to FTy
+ // with the chain parameter inserted.
+
+ std::vector<const Type*> NewTypes;
+ NewTypes.reserve(FTy->getNumParams()+1);
+
+ // Insert the chain's type into the list of parameter types, which may
+ // mean appending it.
+ {
+ unsigned Idx = 1;
+ FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end();
+
+ do {
+ if (Idx == NestIdx)
+ // Add the chain's type.
+ NewTypes.push_back(NestTy);
+
+ if (I == E)
+ break;
+
+ // Add the original type.
+ NewTypes.push_back(*I);
+
+ ++Idx, ++I;
+ } while (1);
+ }
+
+ // Replace the trampoline call with a direct call. Let the generic
+ // code sort out any function type mismatches.
+ FunctionType *NewFTy =
+ FunctionType::get(FTy->getReturnType(), NewTypes, FTy->isVarArg());
+ Constant *NewCallee = NestF->getType() == PointerType::getUnqual(NewFTy) ?
+ NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy));
+ const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs.begin(),NewAttrs.end());
+
+ Instruction *NewCaller;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ NewCaller = InvokeInst::Create(NewCallee,
+ II->getNormalDest(), II->getUnwindDest(),
+ NewArgs.begin(), NewArgs.end(),
+ Caller->getName(), Caller);
+ cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
+ cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
+ } else {
+ NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end(),
+ Caller->getName(), Caller);
+ if (cast<CallInst>(Caller)->isTailCall())
+ cast<CallInst>(NewCaller)->setTailCall();
+ cast<CallInst>(NewCaller)->
+ setCallingConv(cast<CallInst>(Caller)->getCallingConv());
+ cast<CallInst>(NewCaller)->setAttributes(NewPAL);
+ }
+ if (Caller->getType() != Type::VoidTy && !Caller->use_empty())
+ Caller->replaceAllUsesWith(NewCaller);
+ Caller->eraseFromParent();
+ RemoveFromWorkList(Caller);
+ return 0;
+ }
+ }
+
+ // Replace the trampoline call with a direct call. Since there is no 'nest'
+ // parameter, there is no need to adjust the argument list. Let the generic
+ // code sort out any function type mismatches.
+ Constant *NewCallee =
+ NestF->getType() == PTy ? NestF : ConstantExpr::getBitCast(NestF, PTy);
+ CS.setCalledFunction(NewCallee);
+ return CS.getInstruction();
+}
+
+/// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(c,d)]
+/// and if a/b/c/d and the add's all have a single use, turn this into two phi's
+/// and a single binop.
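+///
+/// For example (illustrative IR), with %a1 = add i32 %l1, %r1 in %bb1 and
+/// %a2 = add i32 %l2, %r2 in %bb2:
+///   %x = phi i32 [ %a1, %bb1 ], [ %a2, %bb2 ]
+/// becomes
+///   %lhs = phi i32 [ %l1, %bb1 ], [ %l2, %bb2 ]
+///   %rhs = phi i32 [ %r1, %bb1 ], [ %r2, %bb2 ]
+///   %x = add i32 %lhs, %rhs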
+Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
+ Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+ assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
+ unsigned Opc = FirstInst->getOpcode();
+ Value *LHSVal = FirstInst->getOperand(0);
+ Value *RHSVal = FirstInst->getOperand(1);
+
+ const Type *LHSType = LHSVal->getType();
+ const Type *RHSType = RHSVal->getType();
+
+ // Scan to see if all operands are the same opcode, all have one use, and all
+ // kill their operands (i.e. the operands have one use).
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+ if (!I || I->getOpcode() != Opc || !I->hasOneUse() ||
+        // Verify that the operand types match so we don't fold cmps of
+        // different types or GEPs with different index types.
+ I->getOperand(0)->getType() != LHSType ||
+ I->getOperand(1)->getType() != RHSType)
+ return 0;
+
+ // If they are CmpInst instructions, check their predicates
+ if (Opc == Instruction::ICmp || Opc == Instruction::FCmp)
+ if (cast<CmpInst>(I)->getPredicate() !=
+ cast<CmpInst>(FirstInst)->getPredicate())
+ return 0;
+
+ // Keep track of which operand needs a phi node.
+ if (I->getOperand(0) != LHSVal) LHSVal = 0;
+ if (I->getOperand(1) != RHSVal) RHSVal = 0;
+ }
+
+ // Otherwise, this is safe to transform!
+
+ Value *InLHS = FirstInst->getOperand(0);
+ Value *InRHS = FirstInst->getOperand(1);
+ PHINode *NewLHS = 0, *NewRHS = 0;
+ if (LHSVal == 0) {
+ NewLHS = PHINode::Create(LHSType,
+ FirstInst->getOperand(0)->getName() + ".pn");
+ NewLHS->reserveOperandSpace(PN.getNumOperands()/2);
+ NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
+ InsertNewInstBefore(NewLHS, PN);
+ LHSVal = NewLHS;
+ }
+
+ if (RHSVal == 0) {
+ NewRHS = PHINode::Create(RHSType,
+ FirstInst->getOperand(1)->getName() + ".pn");
+ NewRHS->reserveOperandSpace(PN.getNumOperands()/2);
+ NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
+ InsertNewInstBefore(NewRHS, PN);
+ RHSVal = NewRHS;
+ }
+
+ // Add all operands to the new PHIs.
+ if (NewLHS || NewRHS) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
+ if (NewLHS) {
+ Value *NewInLHS = InInst->getOperand(0);
+ NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
+ }
+ if (NewRHS) {
+ Value *NewInRHS = InInst->getOperand(1);
+ NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
+ }
+ }
+ }
+
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
+ return BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
+ CmpInst *CIOp = cast<CmpInst>(FirstInst);
+ return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), LHSVal,
+ RHSVal);
+}
+
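+/// FoldPHIArgGEPIntoPHI - If all incoming values of a PHI node are single-use
+/// getelementptr instructions with matching result types and operand counts,
+/// create PHI nodes for any operands that differ and emit one GEP of the new
+/// PHIs, analogous to FoldPHIArgBinOpIntoPHI above.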
+Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
+ GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
+
+ SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
+ FirstInst->op_end());
+ // This is true if all GEP bases are allocas and if all indices into them are
+ // constants.
+ bool AllBasePointersAreAllocas = true;
+
+  // Scan to see if all incoming values are single-use getelementptrs with
+  // the same result type and operand count.
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() ||
+ GEP->getNumOperands() != FirstInst->getNumOperands())
+ return 0;
+
+ // Keep track of whether or not all GEPs are of alloca pointers.
+ if (AllBasePointersAreAllocas &&
+ (!isa<AllocaInst>(GEP->getOperand(0)) ||
+ !GEP->hasAllConstantIndices()))
+ AllBasePointersAreAllocas = false;
+
+ // Compare the operand lists.
+ for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
+ if (FirstInst->getOperand(op) == GEP->getOperand(op))
+ continue;
+
+      // Don't merge two GEPs when two operands differ (introducing phi nodes)
+      // if either GEP has a constant for the index.  The index may be
+ // substantially cheaper to compute for the constants, so making it a
+ // variable index could pessimize the path. This also handles the case
+ // for struct indices, which must always be constant.
+ if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
+ isa<ConstantInt>(GEP->getOperand(op)))
+ return 0;
+
+ if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
+ return 0;
+ FixedOperands[op] = 0; // Needs a PHI.
+ }
+ }
+
+ // If all of the base pointers of the PHI'd GEPs are from allocas, don't
+ // bother doing this transformation. At best, this will just save a bit of
+ // offset calculation, but all the predecessors will have to materialize the
+ // stack address into a register anyway. We'd actually rather *clone* the
+ // load up into the predecessors so that we have a load of a gep of an alloca,
+ // which can usually all be folded into the load.
+ if (AllBasePointersAreAllocas)
+ return 0;
+
+ // Otherwise, this is safe to transform. Insert PHI nodes for each operand
+ // that is variable.
+ SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
+
+ bool HasAnyPHIs = false;
+ for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
+ if (FixedOperands[i]) continue; // operand doesn't need a phi.
+ Value *FirstOp = FirstInst->getOperand(i);
+ PHINode *NewPN = PHINode::Create(FirstOp->getType(),
+ FirstOp->getName()+".pn");
+ InsertNewInstBefore(NewPN, PN);
+
+ NewPN->reserveOperandSpace(e);
+ NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
+ OperandPhis[i] = NewPN;
+ FixedOperands[i] = NewPN;
+ HasAnyPHIs = true;
+ }
+
+
+ // Add all operands to the new PHIs.
+ if (HasAnyPHIs) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ BasicBlock *InBB = PN.getIncomingBlock(i);
+
+ for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
+ if (PHINode *OpPhi = OperandPhis[op])
+ OpPhi->addIncoming(InGEP->getOperand(op), InBB);
+ }
+ }
+
+ Value *Base = FixedOperands[0];
+ return GetElementPtrInst::Create(Base, FixedOperands.begin()+1,
+ FixedOperands.end());
+}
+
+
+/// isSafeAndProfitableToSinkLoad - Return true if we know that it is safe to
+/// sink the load out of the block that defines it. This means that it must be
+/// obvious the value of the load is not changed from the point of the load to
+/// the end of the block it is in.
+///
+/// Finally, it is safe, but not profitable, to sink a load targeting a
+/// non-address-taken alloca.  Doing so would prevent us from promoting the
+/// alloca to a register.
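+///
+/// For example (illustrative): a load such as
+///   %a = alloca i32
+///   ...
+///   %v = load i32* %a
+/// is reported as unprofitable to sink when %a's address never escapes, since
+/// sinking the load would block promoting %a to a register.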
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
+ BasicBlock::iterator BBI = L, E = L->getParent()->end();
+
+ for (++BBI; BBI != E; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+  // Check for a non-address-taken alloca.  If the alloca is not already
+  // address-taken, sinking isn't profitable: it would block promoting the
+  // alloca to a register.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
+ bool isAddressTaken = false;
+ for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+ UI != E; ++UI) {
+ if (isa<LoadInst>(UI)) continue;
+ if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+ // If storing TO the alloca, then the address isn't taken.
+ if (SI->getOperand(1) == AI) continue;
+ }
+ isAddressTaken = true;
+ break;
+ }
+
+ if (!isAddressTaken && AI->isStaticAlloca())
+ return false;
+ }
+
+ // If this load is a load from a GEP with a constant offset from an alloca,
+ // then we don't want to sink it. In its present form, it will be
+ // load [constant stack offset]. Sinking it will cause us to have to
+ // materialize the stack addresses in each predecessor in a register only to
+ // do a shared load from register in the successor.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
+ if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
+ return false;
+
+ return true;
+}
+
+
+// FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
+// operator and they all are only used by the PHI, PHI together their
+// inputs, and do the operation once, to the result of the PHI.
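+//
+// For example (illustrative IR), with %t1 = sext i16 %v1 to i32 in %bb1 and
+// %t2 = sext i16 %v2 to i32 in %bb2:
+//   %x = phi i32 [ %t1, %bb1 ], [ %t2, %bb2 ]
+// becomes
+//   %p = phi i16 [ %v1, %bb1 ], [ %v2, %bb2 ]
+//   %x = sext i16 %p to i32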
+Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
+ Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+
+ // Scan the instruction, looking for input operations that can be folded away.
+ // If all input operands to the phi are the same instruction (e.g. a cast from
+ // the same type or "+42") we can pull the operation through the PHI, reducing
+ // code size and simplifying code.
+ Constant *ConstantOp = 0;
+ const Type *CastSrcTy = 0;
+ bool isVolatile = false;
+ if (isa<CastInst>(FirstInst)) {
+ CastSrcTy = FirstInst->getOperand(0)->getType();
+ } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
+ // Can fold binop, compare or shift here if the RHS is a constant,
+ // otherwise call FoldPHIArgBinOpIntoPHI.
+ ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
+ if (ConstantOp == 0)
+ return FoldPHIArgBinOpIntoPHI(PN);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(FirstInst)) {
+ isVolatile = LI->isVolatile();
+ // We can't sink the load if the loaded value could be modified between the
+ // load and the PHI.
+ if (LI->getParent() != PN.getIncomingBlock(0) ||
+ !isSafeAndProfitableToSinkLoad(LI))
+ return 0;
+
+ // If the PHI is of volatile loads and the load block has multiple
+ // successors, sinking it would remove a load of the volatile value from
+ // the path through the other successor.
+ if (isVolatile &&
+ LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+ return 0;
+
+ } else if (isa<GetElementPtrInst>(FirstInst)) {
+ return FoldPHIArgGEPIntoPHI(PN);
+ } else {
+ return 0; // Cannot fold this operation.
+ }
+
+ // Check to see if all arguments are the same operation.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ if (!isa<Instruction>(PN.getIncomingValue(i))) return 0;
+ Instruction *I = cast<Instruction>(PN.getIncomingValue(i));
+ if (!I->hasOneUse() || !I->isSameOperationAs(FirstInst))
+ return 0;
+ if (CastSrcTy) {
+ if (I->getOperand(0)->getType() != CastSrcTy)
+ return 0; // Cast operation must match.
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ // We can't sink the load if the loaded value could be modified between
+ // the load and the PHI.
+ if (LI->isVolatile() != isVolatile ||
+ LI->getParent() != PN.getIncomingBlock(i) ||
+ !isSafeAndProfitableToSinkLoad(LI))
+ return 0;
+
+ // If the PHI is of volatile loads and the load block has multiple
+ // successors, sinking it would remove a load of the volatile value from
+ // the path through the other successor.
+ if (isVolatile &&
+ LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+ return 0;
+
+ } else if (I->getOperand(1) != ConstantOp) {
+ return 0;
+ }
+ }
+
+ // Okay, they are all the same operation. Create a new PHI node of the
+ // correct type, and PHI together all of the LHS's of the instructions.
+ PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
+ PN.getName()+".in");
+ NewPN->reserveOperandSpace(PN.getNumOperands()/2);
+
+ Value *InVal = FirstInst->getOperand(0);
+ NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+
+ // Add all operands to the new PHI.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
+ if (NewInVal != InVal)
+ InVal = 0;
+ NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+ }
+
+ Value *PhiVal;
+ if (InVal) {
+ // The new PHI unions all of the same values together. This is really
+ // common, so we handle it intelligently here for compile-time speed.
+ PhiVal = InVal;
+ delete NewPN;
+ } else {
+ InsertNewInstBefore(NewPN, PN);
+ PhiVal = NewPN;
+ }
+
+ // Insert and return the new operation.
+ if (CastInst* FirstCI = dyn_cast<CastInst>(FirstInst))
+ return CastInst::Create(FirstCI->getOpcode(), PhiVal, PN.getType());
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
+ return BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
+ if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
+ return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+ PhiVal, ConstantOp);
+ assert(isa<LoadInst>(FirstInst) && "Unknown operation");
+
+ // If this was a volatile load that we are merging, make sure to loop through
+ // and mark all the input loads as non-volatile. If we don't do this, we will
+ // insert a new volatile load and the old ones will not be deletable.
+ if (isVolatile)
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false);
+
+ return new LoadInst(PhiVal, "", isVolatile);
+}
+
+/// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
+/// that is dead.
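+///
+/// For example (illustrative): if %x is only used by %y and %y only by %x in
+///   %x = phi i32 [ %y, %bb1 ], [ 0, %bb0 ]
+///   %y = phi i32 [ %x, %bb2 ]
+/// then the two phis form a dead cycle and can be removed.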
+static bool DeadPHICycle(PHINode *PN,
+ SmallPtrSet<PHINode*, 16> &PotentiallyDeadPHIs) {
+ if (PN->use_empty()) return true;
+ if (!PN->hasOneUse()) return false;
+
+ // Remember this node, and if we find the cycle, return.
+ if (!PotentiallyDeadPHIs.insert(PN))
+ return true;
+
+ // Don't scan crazily complex things.
+ if (PotentiallyDeadPHIs.size() == 16)
+ return false;
+
+ if (PHINode *PU = dyn_cast<PHINode>(PN->use_back()))
+ return DeadPHICycle(PU, PotentiallyDeadPHIs);
+
+ return false;
+}
+
+/// PHIsEqualValue - Return true if this phi node is always equal to
+/// NonPhiInVal. This happens with mutually cyclic phi nodes like:
+/// z = some value; x = phi (y, z); y = phi (x, z)
+static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
+ SmallPtrSet<PHINode*, 16> &ValueEqualPHIs) {
+ // See if we already saw this PHI node.
+ if (!ValueEqualPHIs.insert(PN))
+ return true;
+
+ // Don't scan crazily complex things.
+ if (ValueEqualPHIs.size() == 16)
+ return false;
+
+ // Scan the operands to see if they are either phi nodes or are equal to
+ // the value.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Op = PN->getIncomingValue(i);
+ if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
+ if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
+ return false;
+ } else if (Op != NonPhiInVal)
+ return false;
+ }
+
+ return true;
+}
+
+
+// PHINode simplification
+//
+Instruction *InstCombiner::visitPHINode(PHINode &PN) {
+ // If LCSSA is around, don't mess with Phi nodes
+ if (MustPreserveLCSSA) return 0;
+
+ if (Value *V = PN.hasConstantValue())
+ return ReplaceInstUsesWith(PN, V);
+
+ // If all PHI operands are the same operation, pull them through the PHI,
+ // reducing code size.
+ if (isa<Instruction>(PN.getIncomingValue(0)) &&
+ isa<Instruction>(PN.getIncomingValue(1)) &&
+ cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
+ cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
+      // FIXME: The hasOneUse check will fail for PHIs that use the value more
+      // than once through the PHI itself, e.g. when the same incoming value
+      // appears on multiple edges.
+ PN.getIncomingValue(0)->hasOneUse())
+ if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
+ return Result;
+
+ // If this is a trivial cycle in the PHI node graph, remove it. Basically, if
+ // this PHI only has a single use (a PHI), and if that PHI only has one use (a
+ // PHI)... break the cycle.
+ if (PN.hasOneUse()) {
+ Instruction *PHIUser = cast<Instruction>(PN.use_back());
+ if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
+ SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
+ PotentiallyDeadPHIs.insert(&PN);
+ if (DeadPHICycle(PU, PotentiallyDeadPHIs))
+ return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+
+ // If this phi has a single use, and if that use just computes a value for
+ // the next iteration of a loop, delete the phi. This occurs with unused
+ // induction variables, e.g. "for (int j = 0; ; ++j);". Detecting this
+ // common case here is good because the only other things that catch this
+ // are induction variable analysis (sometimes) and ADCE, which is only run
+ // late.
+ if (PHIUser->hasOneUse() &&
+ (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
+ PHIUser->use_back() == &PN) {
+ return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+ }
+
+ // We sometimes end up with phi cycles that non-obviously end up being the
+ // same value, for example:
+ // z = some value; x = phi (y, z); y = phi (x, z)
+ // where the phi nodes don't necessarily need to be in the same block. Do a
+  // quick check to see if the PHI node only contains a single non-phi value;
+  // if so, scan to see if the phi cycle is actually equal to that value.
+ {
+ unsigned InValNo = 0, NumOperandVals = PN.getNumIncomingValues();
+ // Scan for the first non-phi operand.
+ while (InValNo != NumOperandVals &&
+ isa<PHINode>(PN.getIncomingValue(InValNo)))
+ ++InValNo;
+
+ if (InValNo != NumOperandVals) {
+ Value *NonPhiInVal = PN.getOperand(InValNo);
+
+      // Scan the rest of the operands to see if there are any conflicts; if so,
+      // there is no need to recursively scan other phis.
+ for (++InValNo; InValNo != NumOperandVals; ++InValNo) {
+ Value *OpVal = PN.getIncomingValue(InValNo);
+ if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
+ break;
+ }
+
+ // If we scanned over all operands, then we have one unique value plus
+ // phi values. Scan PHI nodes to see if they all merge in each other or
+ // the value.
+ if (InValNo == NumOperandVals) {
+ SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
+ if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
+ return ReplaceInstUsesWith(PN, NonPhiInVal);
+ }
+ }
+ }
+ return 0;
+}
+
+static Value *InsertCastToIntPtrTy(Value *V, const Type *DTy,
+ Instruction *InsertPoint,
+ InstCombiner *IC) {
+ unsigned PtrSize = DTy->getPrimitiveSizeInBits();
+ unsigned VTySize = V->getType()->getPrimitiveSizeInBits();
+  // We must cast correctly to the pointer type. Ensure that we
+  // sign extend the integer value if it is smaller, since it is
+  // used for address computation.
+ Instruction::CastOps opcode =
+ (VTySize < PtrSize ? Instruction::SExt :
+ (VTySize == PtrSize ? Instruction::BitCast : Instruction::Trunc));
+ return IC->InsertCastBefore(opcode, V, DTy, *InsertPoint);
+}
+
+
+Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ Value *PtrOp = GEP.getOperand(0);
+  // Is it 'getelementptr %P, i32 0' or 'getelementptr %P'?
+  // If so, eliminate the noop.
+ if (GEP.getNumOperands() == 1)
+ return ReplaceInstUsesWith(GEP, PtrOp);
+
+ if (isa<UndefValue>(GEP.getOperand(0)))
+ return ReplaceInstUsesWith(GEP, UndefValue::get(GEP.getType()));
+
+ bool HasZeroPointerIndex = false;
+ if (Constant *C = dyn_cast<Constant>(GEP.getOperand(1)))
+ HasZeroPointerIndex = C->isNullValue();
+
+ if (GEP.getNumOperands() == 2 && HasZeroPointerIndex)
+ return ReplaceInstUsesWith(GEP, PtrOp);
+
+ // Eliminate unneeded casts for indices.
+ bool MadeChange = false;
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (User::op_iterator i = GEP.op_begin() + 1, e = GEP.op_end();
+ i != e; ++i, ++GTI) {
+ if (isa<SequentialType>(*GTI)) {
+ if (CastInst *CI = dyn_cast<CastInst>(*i)) {
+ if (CI->getOpcode() == Instruction::ZExt ||
+ CI->getOpcode() == Instruction::SExt) {
+ const Type *SrcTy = CI->getOperand(0)->getType();
+ // We can eliminate a cast from i32 to i64 iff the target
+ // is a 32-bit pointer target.
+ if (SrcTy->getPrimitiveSizeInBits() >= TD->getPointerSizeInBits()) {
+ MadeChange = true;
+ *i = CI->getOperand(0);
+ }
+ }
+ }
+ // If we are using a wider index than needed for this platform, shrink it
+ // to what we need. If narrower, sign-extend it to what we need.
+ // If the incoming value needs a cast instruction,
+ // insert it. This explicit cast can make subsequent optimizations more
+ // obvious.
+ Value *Op = *i;
+ if (TD->getTypeSizeInBits(Op->getType()) > TD->getPointerSizeInBits()) {
+ if (Constant *C = dyn_cast<Constant>(Op)) {
+ *i = ConstantExpr::getTrunc(C, TD->getIntPtrType());
+ MadeChange = true;
+ } else {
+ Op = InsertCastBefore(Instruction::Trunc, Op, TD->getIntPtrType(),
+ GEP);
+ *i = Op;
+ MadeChange = true;
+ }
+ } else if (TD->getTypeSizeInBits(Op->getType()) < TD->getPointerSizeInBits()) {
+ if (Constant *C = dyn_cast<Constant>(Op)) {
+ *i = ConstantExpr::getSExt(C, TD->getIntPtrType());
+ MadeChange = true;
+ } else {
+ Op = InsertCastBefore(Instruction::SExt, Op, TD->getIntPtrType(),
+ GEP);
+ *i = Op;
+ MadeChange = true;
+ }
+ }
+ }
+ }
+ if (MadeChange) return &GEP;
+
+ // Combine Indices - If the source pointer to this getelementptr instruction
+ // is a getelementptr instruction, combine the indices of the two
+ // getelementptr instructions into a single instruction.
+ //
+ SmallVector<Value*, 8> SrcGEPOperands;
+ if (User *Src = dyn_castGetElementPtr(PtrOp))
+ SrcGEPOperands.append(Src->op_begin(), Src->op_end());
+
+ if (!SrcGEPOperands.empty()) {
+    // Note that if our source is a gep chain itself, we wait for that
+    // chain to be resolved before we perform this transformation.  This
+    // avoids creating a TON of code in some cases.
+ //
+ if (isa<GetElementPtrInst>(SrcGEPOperands[0]) &&
+ cast<Instruction>(SrcGEPOperands[0])->getNumOperands() == 2)
+ return 0; // Wait until our source is folded to completion.
+
+ SmallVector<Value*, 8> Indices;
+
+ // Find out whether the last index in the source GEP is a sequential idx.
+ bool EndsWithSequential = false;
+ for (gep_type_iterator I = gep_type_begin(*cast<User>(PtrOp)),
+ E = gep_type_end(*cast<User>(PtrOp)); I != E; ++I)
+ EndsWithSequential = !isa<StructType>(*I);
+
+ // Can we combine the two pointer arithmetics offsets?
+ if (EndsWithSequential) {
+ // Replace: gep (gep %P, long B), long A, ...
+ // With: T = long A+B; gep %P, T, ...
+ //
+ Value *Sum, *SO1 = SrcGEPOperands.back(), *GO1 = GEP.getOperand(1);
+ if (SO1 == Constant::getNullValue(SO1->getType())) {
+ Sum = GO1;
+ } else if (GO1 == Constant::getNullValue(GO1->getType())) {
+ Sum = SO1;
+ } else {
+ // If they aren't the same type, convert both to an integer of the
+ // target's pointer size.
+ if (SO1->getType() != GO1->getType()) {
+ if (Constant *SO1C = dyn_cast<Constant>(SO1)) {
+ SO1 = ConstantExpr::getIntegerCast(SO1C, GO1->getType(), true);
+ } else if (Constant *GO1C = dyn_cast<Constant>(GO1)) {
+ GO1 = ConstantExpr::getIntegerCast(GO1C, SO1->getType(), true);
+ } else {
+ unsigned PS = TD->getPointerSizeInBits();
+ if (TD->getTypeSizeInBits(SO1->getType()) == PS) {
+ // Convert GO1 to SO1's type.
+ GO1 = InsertCastToIntPtrTy(GO1, SO1->getType(), &GEP, this);
+
+ } else if (TD->getTypeSizeInBits(GO1->getType()) == PS) {
+ // Convert SO1 to GO1's type.
+ SO1 = InsertCastToIntPtrTy(SO1, GO1->getType(), &GEP, this);
+ } else {
+ const Type *PT = TD->getIntPtrType();
+ SO1 = InsertCastToIntPtrTy(SO1, PT, &GEP, this);
+ GO1 = InsertCastToIntPtrTy(GO1, PT, &GEP, this);
+ }
+ }
+ }
+ if (isa<Constant>(SO1) && isa<Constant>(GO1))
+ Sum = ConstantExpr::getAdd(cast<Constant>(SO1), cast<Constant>(GO1));
+ else {
+ Sum = BinaryOperator::CreateAdd(SO1, GO1, PtrOp->getName()+".sum");
+ InsertNewInstBefore(cast<Instruction>(Sum), GEP);
+ }
+ }
+
+ // Recycle the GEP we already have if possible.
+ if (SrcGEPOperands.size() == 2) {
+ GEP.setOperand(0, SrcGEPOperands[0]);
+ GEP.setOperand(1, Sum);
+ return &GEP;
+ } else {
+ Indices.insert(Indices.end(), SrcGEPOperands.begin()+1,
+ SrcGEPOperands.end()-1);
+ Indices.push_back(Sum);
+ Indices.insert(Indices.end(), GEP.op_begin()+2, GEP.op_end());
+ }
+ } else if (isa<Constant>(*GEP.idx_begin()) &&
+ cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+ SrcGEPOperands.size() != 1) {
+ // Otherwise we can do the fold if the first index of the GEP is a zero
+ Indices.insert(Indices.end(), SrcGEPOperands.begin()+1,
+ SrcGEPOperands.end());
+ Indices.insert(Indices.end(), GEP.idx_begin()+1, GEP.idx_end());
+ }
+
+ if (!Indices.empty())
+ return GetElementPtrInst::Create(SrcGEPOperands[0], Indices.begin(),
+ Indices.end(), GEP.getName());
+
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(PtrOp)) {
+ // GEP of global variable. If all of the indices for this GEP are
+ // constants, we can promote this to a constexpr instead of an instruction.
+
+ // Scan for nonconstants...
+ SmallVector<Constant*, 8> Indices;
+ User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end();
+ for (; I != E && isa<Constant>(*I); ++I)
+ Indices.push_back(cast<Constant>(*I));
+
+ if (I == E) { // If they are all constants...
+ Constant *CE = ConstantExpr::getGetElementPtr(GV,
+ &Indices[0],Indices.size());
+
+ // Replace all uses of the GEP with the new constexpr...
+ return ReplaceInstUsesWith(GEP, CE);
+ }
+ } else if (Value *X = getBitCastOperand(PtrOp)) { // Is the operand a cast?
+ if (!isa<PointerType>(X->getType())) {
+ // Not interesting. Source pointer must be a cast from pointer.
+ } else if (HasZeroPointerIndex) {
+ // transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
+ // into : GEP [10 x i8]* X, i32 0, ...
+ //
+ // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
+ // into : GEP i8* X, ...
+ //
+ // This occurs when the program declares an array extern like "int X[];"
+ const PointerType *CPTy = cast<PointerType>(PtrOp->getType());
+ const PointerType *XTy = cast<PointerType>(X->getType());
+ if (const ArrayType *CATy =
+ dyn_cast<ArrayType>(CPTy->getElementType())) {
+ // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
+ if (CATy->getElementType() == XTy->getElementType()) {
+ // -> GEP i8* X, ...
+ SmallVector<Value*, 8> Indices(GEP.idx_begin()+1, GEP.idx_end());
+ return GetElementPtrInst::Create(X, Indices.begin(), Indices.end(),
+ GEP.getName());
+ } else if (const ArrayType *XATy =
+ dyn_cast<ArrayType>(XTy->getElementType())) {
+ // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
+ if (CATy->getElementType() == XATy->getElementType()) {
+ // -> GEP [10 x i8]* X, i32 0, ...
+ // At this point, we know that the cast source type is a pointer
+ // to an array of the same type as the destination pointer
+ // array. Because the array type is never stepped over (there
+ // is a leading zero) we can fold the cast into this GEP.
+ GEP.setOperand(0, X);
+ return &GEP;
+ }
+ }
+ }
+ } else if (GEP.getNumOperands() == 2) {
+ // Transform things like:
+ // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
+ // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
+ const Type *SrcElTy = cast<PointerType>(X->getType())->getElementType();
+ const Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
+ if (isa<ArrayType>(SrcElTy) &&
+ TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
+ TD->getTypeAllocSize(ResElTy)) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::Int32Ty);
+ Idx[1] = GEP.getOperand(1);
+ Value *V = InsertNewInstBefore(
+ GetElementPtrInst::Create(X, Idx, Idx + 2, GEP.getName()), GEP);
+ // V and GEP are both pointer types --> BitCast
+ return new BitCastInst(V, GEP.getType());
+ }
+
+ // Transform things like:
+ // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
+ // (where tmp = 8*tmp2) into:
+ // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
+
+ if (isa<ArrayType>(SrcElTy) && ResElTy == Type::Int8Ty) {
+ uint64_t ArrayEltSize =
+ TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
+
+ // Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We
+ // allow either a mul, shift, or constant here.
+ Value *NewIdx = 0;
+ ConstantInt *Scale = 0;
+ if (ArrayEltSize == 1) {
+ NewIdx = GEP.getOperand(1);
+ Scale = ConstantInt::get(NewIdx->getType(), 1);
+ } else if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP.getOperand(1))) {
+ NewIdx = ConstantInt::get(CI->getType(), 1);
+ Scale = CI;
+ } else if (Instruction *Inst =dyn_cast<Instruction>(GEP.getOperand(1))){
+ if (Inst->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(Inst->getOperand(1))) {
+ ConstantInt *ShAmt = cast<ConstantInt>(Inst->getOperand(1));
+ uint32_t ShAmtVal = ShAmt->getLimitedValue(64);
+ Scale = ConstantInt::get(Inst->getType(), 1ULL << ShAmtVal);
+ NewIdx = Inst->getOperand(0);
+ } else if (Inst->getOpcode() == Instruction::Mul &&
+ isa<ConstantInt>(Inst->getOperand(1))) {
+ Scale = cast<ConstantInt>(Inst->getOperand(1));
+ NewIdx = Inst->getOperand(0);
+ }
+ }
+
+ // If the index will be to exactly the right offset with the scale taken
+ // out, perform the transformation. Note, we don't know whether Scale is
+ // signed or not. We'll use unsigned version of division/modulo
+ // operation after making sure Scale doesn't have the sign bit set.
+ if (ArrayEltSize && Scale && Scale->getSExtValue() >= 0LL &&
+ Scale->getZExtValue() % ArrayEltSize == 0) {
+ Scale = ConstantInt::get(Scale->getType(),
+ Scale->getZExtValue() / ArrayEltSize);
+ if (Scale->getZExtValue() != 1) {
+ Constant *C = ConstantExpr::getIntegerCast(Scale, NewIdx->getType(),
+ false /*ZExt*/);
+ Instruction *Sc = BinaryOperator::CreateMul(NewIdx, C, "idxscale");
+ NewIdx = InsertNewInstBefore(Sc, GEP);
+ }
+
+ // Insert the new GEP instruction.
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::Int32Ty);
+ Idx[1] = NewIdx;
+ Instruction *NewGEP =
+ GetElementPtrInst::Create(X, Idx, Idx + 2, GEP.getName());
+ NewGEP = InsertNewInstBefore(NewGEP, GEP);
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return new BitCastInst(NewGEP, GEP.getType());
+ }
+ }
+ }
+ }
+
+ /// See if we can simplify:
+ /// X = bitcast A to B*
+ /// Y = gep X, <...constant indices...>
+ /// into a gep of the original struct. This is important for SROA and alias
+ /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
+ if (!isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices()) {
+ // Determine how much the GEP moves the pointer. We are guaranteed to get
+ // a constant back from EmitGEPOffset.
+ ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(&GEP, GEP, *this));
+ int64_t Offset = OffsetV->getSExtValue();
+
+ // If this GEP instruction doesn't move the pointer, just replace the GEP
+ // with a bitcast of the real input to the dest type.
+ if (Offset == 0) {
+ // If the bitcast is of an allocation, and the allocation will be
+ // converted to match the type of the cast, don't touch this.
+ if (isa<AllocationInst>(BCI->getOperand(0))) {
+ // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+ if (Instruction *I = visitBitCast(*BCI)) {
+ if (I != BCI) {
+ I->takeName(BCI);
+ BCI->getParent()->getInstList().insert(BCI, I);
+ ReplaceInstUsesWith(*BCI, I);
+ }
+ return &GEP;
+ }
+ }
+ return new BitCastInst(BCI->getOperand(0), GEP.getType());
+ }
+
+ // Otherwise, if the offset is non-zero, we need to find out if there is a
+ // field at Offset in 'A's type. If so, we can pull the cast through the
+ // GEP.
+ SmallVector<Value*, 8> NewIndices;
+ const Type *InTy =
+ cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
+ if (FindElementAtOffset(InTy, Offset, NewIndices, TD)) {
+ Instruction *NGEP =
+ GetElementPtrInst::Create(BCI->getOperand(0), NewIndices.begin(),
+ NewIndices.end());
+ if (NGEP->getType() == GEP.getType()) return NGEP;
+ InsertNewInstBefore(NGEP, GEP);
+ NGEP->takeName(&GEP);
+ return new BitCastInst(NGEP, GEP.getType());
+ }
+ }
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitAllocationInst(AllocationInst &AI) {
+ // Convert: malloc Ty, C - where C is a constant != 1 into: malloc [C x Ty], 1
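+  // For example (illustrative): "malloc i32, i32 4" becomes
+  //   %t = malloc [4 x i32]
+  //   %v = getelementptr [4 x i32]* %t, i32 0, i32 0
+  // and all uses of the original malloc are replaced with %v.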
+ if (AI.isArrayAllocation()) { // Check C != 1
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
+ const Type *NewTy =
+ ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
+ AllocationInst *New = 0;
+
+ // Create and insert the replacement instruction...
+ if (isa<MallocInst>(AI))
+ New = new MallocInst(NewTy, 0, AI.getAlignment(), AI.getName());
+ else {
+ assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!");
+ New = new AllocaInst(NewTy, 0, AI.getAlignment(), AI.getName());
+ }
+
+ InsertNewInstBefore(New, AI);
+
+ // Scan to the end of the allocation instructions, to skip over a block of
+ // allocas if possible...also skip interleaved debug info
+ //
+ BasicBlock::iterator It = New;
+ while (isa<AllocationInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It;
+
+      // Now that It is pointing to the first non-allocation-inst in the block,
+ // insert our getelementptr instruction...
+ //
+ Value *NullIdx = Constant::getNullValue(Type::Int32Ty);
+ Value *Idx[2];
+ Idx[0] = NullIdx;
+ Idx[1] = NullIdx;
+ Value *V = GetElementPtrInst::Create(New, Idx, Idx + 2,
+ New->getName()+".sub", It);
+
+ // Now make everything use the getelementptr instead of the original
+ // allocation.
+ return ReplaceInstUsesWith(AI, V);
+ } else if (isa<UndefValue>(AI.getArraySize())) {
+ return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+ }
+ }
+
+ if (isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized()) {
+ // If alloca'ing a zero byte object, replace the alloca with a null pointer.
+ // Note that we only do this for alloca's, because malloc should allocate
+ // and return a unique pointer, even for a zero byte allocation.
+ if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
+ return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+
+ // If the alignment is 0 (unspecified), assign it the preferred alignment.
+ if (AI.getAlignment() == 0)
+ AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitFreeInst(FreeInst &FI) {
+ Value *Op = FI.getOperand(0);
+
+ // free undef -> unreachable.
+ if (isa<UndefValue>(Op)) {
+ // Insert a new store to null because we cannot modify the CFG here.
+ new StoreInst(ConstantInt::getTrue(),
+ UndefValue::get(PointerType::getUnqual(Type::Int1Ty)), &FI);
+ return EraseInstFromFunction(FI);
+ }
+
+ // If we have 'free null' delete the instruction. This can happen in stl code
+ // when lots of inlining happens.
+ if (isa<ConstantPointerNull>(Op))
+ return EraseInstFromFunction(FI);
+
+ // Change free <ty>* (cast <ty2>* X to <ty>*) into free <ty2>* X
+ if (BitCastInst *CI = dyn_cast<BitCastInst>(Op)) {
+ FI.setOperand(0, CI->getOperand(0));
+ return &FI;
+ }
+
+ // Change free (gep X, 0,0,0,0) into free(X)
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+ if (GEPI->hasAllZeroIndices()) {
+ AddToWorkList(GEPI);
+ FI.setOperand(0, GEPI->getOperand(0));
+ return &FI;
+ }
+ }
+
+ // Change free(malloc) into nothing, if the malloc has a single use.
+ if (MallocInst *MI = dyn_cast<MallocInst>(Op))
+ if (MI->hasOneUse()) {
+ EraseInstFromFunction(FI);
+ return EraseInstFromFunction(*MI);
+ }
+
+ return 0;
+}
+
+
+/// InstCombineLoadCast - Fold 'load (cast P)' -> 'cast (load P)' when possible.
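+///
+/// For example (illustrative, pointee types of equal size):
+///   %c = bitcast <2 x i16>* %P to i32*
+///   %v = load i32* %c
+/// becomes
+///   %w = load <2 x i16>* %P
+///   %v = bitcast <2 x i16> %w to i32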
+static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
+ const TargetData *TD) {
+ User *CI = cast<User>(LI.getOperand(0));
+ Value *CastOp = CI->getOperand(0);
+
+ if (TD) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(CI)) {
+      // Instead of loading a constant C string, use the corresponding integer
+      // value directly if the string is short enough.
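+      // For example (illustrative, little-endian): loading an i32 through a
+      // cast of a constant global holding "abc\00" yields the integer
+      // 0x00636261 directly.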
+ std::string Str;
+ if (GetConstantStringInfo(CE->getOperand(0), Str) && !Str.empty()) {
+ unsigned len = Str.length();
+ const Type *Ty = cast<PointerType>(CE->getType())->getElementType();
+ unsigned numBits = Ty->getPrimitiveSizeInBits();
+ // Replace LI with immediate integer store.
+ if ((numBits >> 3) == len + 1) {
+ APInt StrVal(numBits, 0);
+ APInt SingleChar(numBits, 0);
+ if (TD->isLittleEndian()) {
+ for (signed i = len-1; i >= 0; i--) {
+ SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+ StrVal = (StrVal << 8) | SingleChar;
+ }
+ } else {
+ for (unsigned i = 0; i < len; i++) {
+ SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+ StrVal = (StrVal << 8) | SingleChar;
+ }
+          // Append the terminating NUL at the end.
+ SingleChar = 0;
+ StrVal = (StrVal << 8) | SingleChar;
+ }
+ Value *NL = ConstantInt::get(StrVal);
+ return IC.ReplaceInstUsesWith(LI, NL);
+ }
+ }
+ }
+ }
+
+ const PointerType *DestTy = cast<PointerType>(CI->getType());
+ const Type *DestPTy = DestTy->getElementType();
+ if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
+
+ // If the address spaces don't match, don't eliminate the cast.
+ if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
+ return 0;
+
+ const Type *SrcPTy = SrcTy->getElementType();
+
+ if (DestPTy->isInteger() || isa<PointerType>(DestPTy) ||
+ isa<VectorType>(DestPTy)) {
+ // If the source is an array, the code below will not succeed. Check to
+ // see if a trivial 'gep P, 0, 0' will help matters. Only do this for
+ // constants.
+ if (const ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
+ if (Constant *CSrc = dyn_cast<Constant>(CastOp))
+ if (ASrcTy->getNumElements() != 0) {
+ Value *Idxs[2];
+ Idxs[0] = Idxs[1] = Constant::getNullValue(Type::Int32Ty);
+ CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2);
+ SrcTy = cast<PointerType>(CastOp->getType());
+ SrcPTy = SrcTy->getElementType();
+ }
+
+ if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy) ||
+ isa<VectorType>(SrcPTy)) &&
+        // Do not allow turning this into a load of an integer, which is then
+        // cast to a pointer; this pessimizes pointer analysis a lot.
+ (isa<PointerType>(SrcPTy) == isa<PointerType>(LI.getType())) &&
+ IC.getTargetData().getTypeSizeInBits(SrcPTy) ==
+ IC.getTargetData().getTypeSizeInBits(DestPTy)) {
+
+ // Okay, we are casting from one integer or pointer type to another of
+ // the same size. Instead of casting the pointer before the load, cast
+ // the result of the loaded value.
+ Value *NewLoad = IC.InsertNewInstBefore(new LoadInst(CastOp,
+ CI->getName(),
+ LI.isVolatile()),LI);
+ // Now cast the result of the load.
+ return new BitCastInst(NewLoad, LI.getType());
+ }
+ }
+ }
+ return 0;
+}
+
+/// isSafeToLoadUnconditionally - Return true if we know that executing a load
+/// from this value cannot trap. If it is not obviously safe to load from the
+/// specified pointer, we do a quick local scan of the basic block containing
+/// ScanFrom, to determine if the address is already accessed.
+static bool isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom) {
+ // If it is an alloca it is always safe to load from.
+ if (isa<AllocaInst>(V)) return true;
+
+ // If it is a global variable it is mostly safe to load from.
+ if (const GlobalValue *GV = dyn_cast<GlobalVariable>(V))
+ // Don't try to evaluate aliases. External weak GV can be null.
+ return !isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage();
+
+  // Otherwise, be a little bit aggressive: scan the local block to see if the
+  // pointer is already being loaded or stored
+ // from/to. If so, the previous load or store would have already trapped,
+ // so there is no harm doing an extra load (also, CSE will later eliminate
+ // the load entirely).
+ BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin();
+
+ while (BBI != E) {
+ --BBI;
+
+ // If we see a free or a call (which might do a free) the pointer could be
+ // marked invalid.
+ if (isa<FreeInst>(BBI) ||
+ (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)))
+ return false;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI->getOperand(0) == V) return true;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+ if (SI->getOperand(1) == V) return true;
+ }
+
+ }
+ return false;
+}
+
+Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
+ Value *Op = LI.getOperand(0);
+
+ // Attempt to improve the alignment.
+ unsigned KnownAlign =
+ GetOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()));
+ if (KnownAlign >
+ (LI.getAlignment() == 0 ? TD->getABITypeAlignment(LI.getType()) :
+ LI.getAlignment()))
+ LI.setAlignment(KnownAlign);
+
+ // load (cast X) --> cast (load X) iff safe
+ if (isa<CastInst>(Op))
+ if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+ return Res;
+
+ // None of the following transforms are legal for volatile loads.
+ if (LI.isVolatile()) return 0;
+
+ // Do really simple store-to-load forwarding and load CSE, to catch cases
+  // where there are several consecutive memory accesses to the same location,
+ // separated by a few arithmetic operations.
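+  // For example (illustrative):
+  //   store i32 %v, i32* %P
+  //   %x = add i32 %v, 1
+  //   %y = load i32* %P
+  // Here the load can simply be replaced with %v.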
+ BasicBlock::iterator BBI = &LI;
+ if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
+ return ReplaceInstUsesWith(LI, AvailableVal);
+
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+ const Value *GEPI0 = GEPI->getOperand(0);
+ // TODO: Consider a target hook for valid address spaces for this xform.
+ if (isa<ConstantPointerNull>(GEPI0) &&
+ cast<PointerType>(GEPI0->getType())->getAddressSpace() == 0) {
+ // Insert a new store to null instruction before the load to indicate
+ // that this code is not reachable. We do this instead of inserting
+ // an unreachable instruction directly because we cannot modify the
+ // CFG.
+ new StoreInst(UndefValue::get(LI.getType()),
+ Constant::getNullValue(Op->getType()), &LI);
+ return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ }
+ }
+
+ if (Constant *C = dyn_cast<Constant>(Op)) {
+ // load null/undef -> undef
+ // TODO: Consider a target hook for valid address spaces for this xform.
+ if (isa<UndefValue>(C) || (C->isNullValue() &&
+ cast<PointerType>(Op->getType())->getAddressSpace() == 0)) {
+ // Insert a new store to null instruction before the load to indicate that
+ // this code is not reachable. We do this instead of inserting an
+ // unreachable instruction directly because we cannot modify the CFG.
+ new StoreInst(UndefValue::get(LI.getType()),
+ Constant::getNullValue(Op->getType()), &LI);
+ return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ }
+
+ // Instcombine load (constant global) into the value loaded.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ return ReplaceInstUsesWith(LI, GV->getInitializer());
+
+ // Instcombine load (constantexpr_GEP global, 0, ...) into the value loaded.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ if (Constant *V =
+ ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
+ return ReplaceInstUsesWith(LI, V);
+ if (CE->getOperand(0)->isNullValue()) {
+ // Insert a new store to null instruction before the load to indicate
+ // that this code is not reachable. We do this instead of inserting
+ // an unreachable instruction directly because we cannot modify the
+ // CFG.
+ new StoreInst(UndefValue::get(LI.getType()),
+ Constant::getNullValue(Op->getType()), &LI);
+ return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ }
+
+ } else if (CE->isCast()) {
+ if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+ return Res;
+ }
+ }
+ }
+
+ // If this load comes from anywhere in a constant global, and if the global
+ // is all undef or zero, we know what it loads.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op->getUnderlyingObject())){
+ if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+ if (GV->getInitializer()->isNullValue())
+ return ReplaceInstUsesWith(LI, Constant::getNullValue(LI.getType()));
+ else if (isa<UndefValue>(GV->getInitializer()))
+ return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ }
+ }
+
+ if (Op->hasOneUse()) {
+ // Change select and PHI nodes to select values instead of addresses: this
+    // helps alias analysis out a lot, allows many other simplifications, and
+ // exposes redundancy in the code.
+ //
+ // Note that we cannot do the transformation unless we know that the
+ // introduced loads cannot trap! Something like this is valid as long as
+ // the condition is always false: load (select bool %C, int* null, int* %G),
+ // but it would not be valid if we transformed it to load from null
+ // unconditionally.
+ //
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
+ // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
+ if (isSafeToLoadUnconditionally(SI->getOperand(1), SI) &&
+ isSafeToLoadUnconditionally(SI->getOperand(2), SI)) {
+ Value *V1 = InsertNewInstBefore(new LoadInst(SI->getOperand(1),
+ SI->getOperand(1)->getName()+".val"), LI);
+ Value *V2 = InsertNewInstBefore(new LoadInst(SI->getOperand(2),
+ SI->getOperand(2)->getName()+".val"), LI);
+ return SelectInst::Create(SI->getCondition(), V1, V2);
+ }
+
+ // load (select (cond, null, P)) -> load P
+ if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
+ if (C->isNullValue()) {
+ LI.setOperand(0, SI->getOperand(2));
+ return &LI;
+ }
+
+ // load (select (cond, P, null)) -> load P
+ if (Constant *C = dyn_cast<Constant>(SI->getOperand(2)))
+ if (C->isNullValue()) {
+ LI.setOperand(0, SI->getOperand(1));
+ return &LI;
+ }
+ }
+ }
+ return 0;
+}
+
+/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
+/// when possible. This makes it generally easy to do alias analysis and/or
+/// SROA/mem2reg of the memory object.
+static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
+ User *CI = cast<User>(SI.getOperand(1));
+ Value *CastOp = CI->getOperand(0);
+
+ const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+ const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
+ if (SrcTy == 0) return 0;
+
+ const Type *SrcPTy = SrcTy->getElementType();
+
+ if (!DestPTy->isInteger() && !isa<PointerType>(DestPTy))
+ return 0;
+
+ /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
+ /// to its first element. This allows us to handle things like:
+ /// store i32 xxx, (bitcast {foo*, float}* %P to i32*)
+ /// on 32-bit hosts.
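+ /// For example (an illustrative sketch, assumed 2.x IR syntax and names),
+ /// the noop gep turns
+ /// store i32 %x, i32* (bitcast {i32, float}* %P to i32*)
+ /// into
+ /// %P0 = getelementptr {i32, float}* %P, i32 0, i32 0
+ /// store i32 %x, i32* %P0
+ /// so no cast of the pointer %P is needed at all.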
+ SmallVector<Value*, 4> NewGEPIndices;
+
+ // If the source is an array, the code below will not succeed. Check to
+ // see if a trivial 'gep P, 0, 0' will help matters. Only do this for
+ // constants.
+ if (isa<ArrayType>(SrcPTy) || isa<StructType>(SrcPTy)) {
+ // Index through pointer.
+ Constant *Zero = Constant::getNullValue(Type::Int32Ty);
+ NewGEPIndices.push_back(Zero);
+
+ while (1) {
+ if (const StructType *STy = dyn_cast<StructType>(SrcPTy)) {
+ if (!STy->getNumElements()) /* Struct can be empty {} */
+ break;
+ NewGEPIndices.push_back(Zero);
+ SrcPTy = STy->getElementType(0);
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
+ NewGEPIndices.push_back(Zero);
+ SrcPTy = ATy->getElementType();
+ } else {
+ break;
+ }
+ }
+
+ SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace());
+ }
+
+ if (!SrcPTy->isInteger() && !isa<PointerType>(SrcPTy))
+ return 0;
+
+ // If the pointers point into different address spaces or if they point to
+ // values with different sizes, we can't do the transformation.
+ if (SrcTy->getAddressSpace() !=
+ cast<PointerType>(CI->getType())->getAddressSpace() ||
+ IC.getTargetData().getTypeSizeInBits(SrcPTy) !=
+ IC.getTargetData().getTypeSizeInBits(DestPTy))
+ return 0;
+
+ // Okay, we are casting from one integer or pointer type to another of
+ // the same size. Instead of casting the pointer before
+ // the store, cast the value to be stored.
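+ // For example (illustrative, assuming 32-bit pointers so the sizes match):
+ // store i32 %x, i32* (bitcast float** %P to i32*)
+ // -->
+ // %x.c = inttoptr i32 %x to float*
+ // store float* %x.c, float** %P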
+ Value *NewCast;
+ Value *SIOp0 = SI.getOperand(0);
+ Instruction::CastOps opcode = Instruction::BitCast;
+ const Type* CastSrcTy = SIOp0->getType();
+ const Type* CastDstTy = SrcPTy;
+ if (isa<PointerType>(CastDstTy)) {
+ if (CastSrcTy->isInteger())
+ opcode = Instruction::IntToPtr;
+ } else if (isa<IntegerType>(CastDstTy)) {
+ if (isa<PointerType>(SIOp0->getType()))
+ opcode = Instruction::PtrToInt;
+ }
+
+ // SIOp0 is a pointer to aggregate and this is a store to the first field,
+ // emit a GEP to index into its first field.
+ if (!NewGEPIndices.empty()) {
+ if (Constant *C = dyn_cast<Constant>(CastOp))
+ CastOp = ConstantExpr::getGetElementPtr(C, &NewGEPIndices[0],
+ NewGEPIndices.size());
+ else
+ CastOp = IC.InsertNewInstBefore(
+ GetElementPtrInst::Create(CastOp, NewGEPIndices.begin(),
+ NewGEPIndices.end()), SI);
+ }
+
+ if (Constant *C = dyn_cast<Constant>(SIOp0))
+ NewCast = ConstantExpr::getCast(opcode, C, CastDstTy);
+ else
+ NewCast = IC.InsertNewInstBefore(
+ CastInst::Create(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"),
+ SI);
+ return new StoreInst(NewCast, CastOp);
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+/// %t0 = getelementptr \@a, 0, 3
+/// store i32 0, i32* %t0
+/// %t1 = getelementptr \@a, 0, 3
+/// %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+ // Test if the values are trivially equivalent.
+ if (A == B) return true;
+
+ // Test if the values come from identical arithmetic instructions.
+ if (isa<BinaryOperator>(A) ||
+ isa<CastInst>(A) ||
+ isa<PHINode>(A) ||
+ isa<GetElementPtrInst>(A))
+ if (Instruction *BI = dyn_cast<Instruction>(B))
+ if (cast<Instruction>(A)->isIdenticalTo(BI))
+ return true;
+
+ // Otherwise they may not be equivalent.
+ return false;
+}
+
+// If this instruction has two uses, one of which is a llvm.dbg.declare,
+// return the llvm.dbg.declare.
+DbgDeclareInst *InstCombiner::hasOneUsePlusDeclare(Value *V) {
+ if (!V->hasNUses(2))
+ return 0;
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+ UI != E; ++UI) {
+ if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(UI))
+ return DI;
+ if (isa<BitCastInst>(UI) && UI->hasOneUse()) {
+ if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(UI->use_begin()))
+ return DI;
+ }
+ }
+ return 0;
+}
+
+Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
+ Value *Val = SI.getOperand(0);
+ Value *Ptr = SI.getOperand(1);
+
+ if (isa<UndefValue>(Ptr)) { // store X, undef -> noop (even if volatile)
+ EraseInstFromFunction(SI);
+ ++NumCombined;
+ return 0;
+ }
+
+ // If the RHS is an alloca with a single use, zapify the store, making the
+ // alloca dead.
+ // If the RHS is an alloca with two uses, the other one being a
+ // llvm.dbg.declare, zapify the store and the declare, making the
+ // alloca dead. We must do this to prevent declares from affecting
+ // codegen.
+ if (!SI.isVolatile()) {
+ if (Ptr->hasOneUse()) {
+ if (isa<AllocaInst>(Ptr)) {
+ EraseInstFromFunction(SI);
+ ++NumCombined;
+ return 0;
+ }
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (isa<AllocaInst>(GEP->getOperand(0))) {
+ if (GEP->getOperand(0)->hasOneUse()) {
+ EraseInstFromFunction(SI);
+ ++NumCombined;
+ return 0;
+ }
+ if (DbgDeclareInst *DI = hasOneUsePlusDeclare(GEP->getOperand(0))) {
+ EraseInstFromFunction(*DI);
+ EraseInstFromFunction(SI);
+ ++NumCombined;
+ return 0;
+ }
+ }
+ }
+ }
+ if (DbgDeclareInst *DI = hasOneUsePlusDeclare(Ptr)) {
+ EraseInstFromFunction(*DI);
+ EraseInstFromFunction(SI);
+ ++NumCombined;
+ return 0;
+ }
+ }
+
+ // Attempt to improve the alignment.
+ unsigned KnownAlign =
+ GetOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()));
+ if (KnownAlign >
+ (SI.getAlignment() == 0 ? TD->getABITypeAlignment(Val->getType()) :
+ SI.getAlignment()))
+ SI.setAlignment(KnownAlign);
+
+ // Do really simple DSE, to catch cases where there are several consecutive
+ // stores to the same location, separated by a few arithmetic operations. This
+ // situation often occurs with bitfield accesses.
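+ // For example (illustrative), the first store below is dead and is zapped
+ // even though arithmetic separates the two stores:
+ // store i32 %old, i32* %p
+ // %new = or i32 %old, 1
+ // store i32 %new, i32* %p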
+ BasicBlock::iterator BBI = &SI;
+ for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
+ --ScanInsts) {
+ --BBI;
+ // Don't count debug info directives, lest they affect codegen,
+ // and skip pointer-to-pointer bitcasts, which are NOPs.
+ // It is necessary for correctness to skip those that feed into a
+ // llvm.dbg.declare, as these are not present when debugging is off.
+ if (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && isa<PointerType>(BBI->getType()))) {
+ ScanInsts++;
+ continue;
+ }
+
+ if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
+ // Prev store isn't volatile, and stores to the same location?
+ if (!PrevSI->isVolatile() && equivalentAddressValues(PrevSI->getOperand(1),
+ SI.getOperand(1))) {
+ ++NumDeadStore;
+ ++BBI;
+ EraseInstFromFunction(*PrevSI);
+ continue;
+ }
+ break;
+ }
+
+ // If this is a load, we have to stop. However, if the loaded value is from
+ // the pointer we're loading and is producing the pointer we're storing,
+ // then *this* store is dead (X = load P; store X -> P).
+ if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) &&
+ !SI.isVolatile()) {
+ EraseInstFromFunction(SI);
+ ++NumCombined;
+ return 0;
+ }
+ // Otherwise, this is a load from some other location. Stores before it
+ // may not be dead.
+ break;
+ }
+
+ // Don't skip over loads or things that can modify memory.
+ if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
+ break;
+ }
+
+
+ if (SI.isVolatile()) return 0; // Don't hack volatile stores.
+
+ // store X, null -> turns into 'unreachable' in SimplifyCFG
+ if (isa<ConstantPointerNull>(Ptr)) {
+ if (!isa<UndefValue>(Val)) {
+ SI.setOperand(0, UndefValue::get(Val->getType()));
+ if (Instruction *U = dyn_cast<Instruction>(Val))
+ AddToWorkList(U); // Dropped a use.
+ ++NumCombined;
+ }
+ return 0; // Do not modify these!
+ }
+
+ // store undef, Ptr -> noop
+ if (isa<UndefValue>(Val)) {
+ EraseInstFromFunction(SI);
+ ++NumCombined;
+ return 0;
+ }
+
+ // If the pointer destination is a cast, see if we can fold the cast into the
+ // source instead.
+ if (isa<CastInst>(Ptr))
+ if (Instruction *Res = InstCombineStoreToCast(*this, SI))
+ return Res;
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+ if (CE->isCast())
+ if (Instruction *Res = InstCombineStoreToCast(*this, SI))
+ return Res;
+
+
+ // If this store is the last instruction in the basic block (possibly
+ // excepting debug info instructions and the pointer bitcasts that feed
+ // into them), and if the block ends with an unconditional branch, try
+ // to move it to the successor block.
+ BBI = &SI;
+ do {
+ ++BBI;
+ } while (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && isa<PointerType>(BBI->getType())));
+ if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
+ if (BI->isUnconditional())
+ if (SimplifyStoreAtEndOfBlock(SI))
+ return 0; // xform done!
+
+ return 0;
+}
+
+/// SimplifyStoreAtEndOfBlock - Turn things like:
+/// if () { *P = v1; } else { *P = v2 }
+/// into a phi node with a store in the successor.
+///
+/// Simplify things like:
+/// *P = v1; if () { *P = v2; }
+/// into a phi node with a store in the successor.
+///
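+/// Illustrative result (hypothetical value and block names): both original
+/// stores are removed and the successor block receives
+/// %storemerge = phi i32 [ %v1, %then ], [ %v2, %else ]
+/// store i32 %storemerge, i32* %P
+///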
+bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
+ BasicBlock *StoreBB = SI.getParent();
+
+ // Check to see if the successor block has exactly two incoming edges. If
+ // so, see if the other predecessor contains a store to the same location.
+ // if so, insert a PHI node (if needed) and move the stores down.
+ BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
+
+ // Determine whether Dest has exactly two predecessors and, if so, compute
+ // the other predecessor.
+ pred_iterator PI = pred_begin(DestBB);
+ BasicBlock *OtherBB = 0;
+ if (*PI != StoreBB)
+ OtherBB = *PI;
+ ++PI;
+ if (PI == pred_end(DestBB))
+ return false;
+
+ if (*PI != StoreBB) {
+ if (OtherBB)
+ return false;
+ OtherBB = *PI;
+ }
+ if (++PI != pred_end(DestBB))
+ return false;
+
+ // Bail out if the relevant blocks aren't all distinct (this can happen,
+ // for example, if SI is in an infinite loop)
+ if (StoreBB == DestBB || OtherBB == DestBB)
+ return false;
+
+ // Verify that the other block ends in a branch and is not otherwise empty.
+ BasicBlock::iterator BBI = OtherBB->getTerminator();
+ BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
+ if (!OtherBr || BBI == OtherBB->begin())
+ return false;
+
+ // If the other block ends in an unconditional branch, check for the 'if then
+ // else' case. There is an instruction before the branch.
+ StoreInst *OtherStore = 0;
+ if (OtherBr->isUnconditional()) {
+ --BBI;
+ // Skip over debugging info.
+ while (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && isa<PointerType>(BBI->getType()))) {
+ if (BBI==OtherBB->begin())
+ return false;
+ --BBI;
+ }
+ // If this isn't a store, or isn't a store to the same location, bail out.
+ OtherStore = dyn_cast<StoreInst>(BBI);
+ if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1))
+ return false;
+ } else {
+ // Otherwise, the other block ended with a conditional branch. If one of the
+ // destinations is StoreBB, then we have the if/then case.
+ if (OtherBr->getSuccessor(0) != StoreBB &&
+ OtherBr->getSuccessor(1) != StoreBB)
+ return false;
+
+ // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
+ // if/then triangle. See if there is a store to the same ptr as SI that
+ // lives in OtherBB.
+ for (;; --BBI) {
+ // Check to see if we find the matching store.
+ if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
+ if (OtherStore->getOperand(1) != SI.getOperand(1))
+ return false;
+ break;
+ }
+ // If we find something that may be using or overwriting the stored
+ // value, or if we run out of instructions, we can't do the xform.
+ if (BBI->mayReadFromMemory() || BBI->mayWriteToMemory() ||
+ BBI == OtherBB->begin())
+ return false;
+ }
+
+ // In order to eliminate the store in OtherBr, we have to
+ // make sure nothing reads or overwrites the stored value in
+ // StoreBB.
+ for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
+ // FIXME: This should really be AA driven.
+ if (I->mayReadFromMemory() || I->mayWriteToMemory())
+ return false;
+ }
+ }
+
+ // Insert a PHI node now if we need it.
+ Value *MergedVal = OtherStore->getOperand(0);
+ if (MergedVal != SI.getOperand(0)) {
+ PHINode *PN = PHINode::Create(MergedVal->getType(), "storemerge");
+ PN->reserveOperandSpace(2);
+ PN->addIncoming(SI.getOperand(0), SI.getParent());
+ PN->addIncoming(OtherStore->getOperand(0), OtherBB);
+ MergedVal = InsertNewInstBefore(PN, DestBB->front());
+ }
+
+ // Advance to a place where it is safe to insert the new store and
+ // insert it.
+ BBI = DestBB->getFirstNonPHI();
+ InsertNewInstBefore(new StoreInst(MergedVal, SI.getOperand(1),
+ OtherStore->isVolatile()), *BBI);
+
+ // Nuke the old stores.
+ EraseInstFromFunction(SI);
+ EraseInstFromFunction(*OtherStore);
+ ++NumCombined;
+ return true;
+}
+
+
+Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
+ // Change br (not X), label True, label False to: br X, label False, True
+ Value *X = 0;
+ BasicBlock *TrueDest;
+ BasicBlock *FalseDest;
+ if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) &&
+ !isa<Constant>(X)) {
+ // Swap Destinations and condition...
+ BI.setCondition(X);
+ BI.setSuccessor(0, FalseDest);
+ BI.setSuccessor(1, TrueDest);
+ return &BI;
+ }
+
+ // Canonicalize fcmp_one -> fcmp_oeq
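+ // e.g. (illustrative): br i1 (fcmp one %X, %Y), label %T, label %F
+ // --> br i1 (fcmp oeq %X, %Y), label %F, label %T (destinations swapped)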
+ FCmpInst::Predicate FPred; Value *Y;
+ if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
+ TrueDest, FalseDest)))
+ if ((FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
+ FPred == FCmpInst::FCMP_OGE) && BI.getCondition()->hasOneUse()) {
+ FCmpInst *I = cast<FCmpInst>(BI.getCondition());
+ FCmpInst::Predicate NewPred = FCmpInst::getInversePredicate(FPred);
+ Instruction *NewSCC = new FCmpInst(NewPred, X, Y, "", I);
+ NewSCC->takeName(I);
+ // Swap Destinations and condition...
+ BI.setCondition(NewSCC);
+ BI.setSuccessor(0, FalseDest);
+ BI.setSuccessor(1, TrueDest);
+ RemoveFromWorkList(I);
+ I->eraseFromParent();
+ AddToWorkList(NewSCC);
+ return &BI;
+ }
+
+ // Canonicalize icmp_ne -> icmp_eq
+ ICmpInst::Predicate IPred;
+ if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
+ TrueDest, FalseDest)))
+ if ((IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE ||
+ IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
+ IPred == ICmpInst::ICMP_SGE) && BI.getCondition()->hasOneUse()) {
+ ICmpInst *I = cast<ICmpInst>(BI.getCondition());
+ ICmpInst::Predicate NewPred = ICmpInst::getInversePredicate(IPred);
+ Instruction *NewSCC = new ICmpInst(NewPred, X, Y, "", I);
+ NewSCC->takeName(I);
+ // Swap Destinations and condition...
+ BI.setCondition(NewSCC);
+ BI.setSuccessor(0, FalseDest);
+ BI.setSuccessor(1, TrueDest);
+ RemoveFromWorkList(I);
+ I->eraseFromParent();
+ AddToWorkList(NewSCC);
+ return &BI;
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
+ Value *Cond = SI.getCondition();
+ if (Instruction *I = dyn_cast<Instruction>(Cond)) {
+ if (I->getOpcode() == Instruction::Add)
+ if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ // change 'switch (X+4) case 1:' into 'switch (X) case -3'
+ for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2)
+ SI.setOperand(i,ConstantExpr::getSub(cast<Constant>(SI.getOperand(i)),
+ AddRHS));
+ SI.setOperand(0, I->getOperand(0));
+ AddToWorkList(I);
+ return &SI;
+ }
+ }
+ return 0;
+}
+
+Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
+ Value *Agg = EV.getAggregateOperand();
+
+ if (!EV.hasIndices())
+ return ReplaceInstUsesWith(EV, Agg);
+
+ if (Constant *C = dyn_cast<Constant>(Agg)) {
+ if (isa<UndefValue>(C))
+ return ReplaceInstUsesWith(EV, UndefValue::get(EV.getType()));
+
+ if (isa<ConstantAggregateZero>(C))
+ return ReplaceInstUsesWith(EV, Constant::getNullValue(EV.getType()));
+
+ if (isa<ConstantArray>(C) || isa<ConstantStruct>(C)) {
+ // Extract the element indexed by the first index out of the constant
+ Value *V = C->getOperand(*EV.idx_begin());
+ if (EV.getNumIndices() > 1)
+ // Extract the remaining indices out of the constant indexed by the
+ // first index
+ return ExtractValueInst::Create(V, EV.idx_begin() + 1, EV.idx_end());
+ else
+ return ReplaceInstUsesWith(EV, V);
+ }
+ return 0; // Can't handle other constants
+ }
+ if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
+ // We're extracting from an insertvalue instruction, compare the indices
+ const unsigned *exti, *exte, *insi, *inse;
+ for (exti = EV.idx_begin(), insi = IV->idx_begin(),
+ exte = EV.idx_end(), inse = IV->idx_end();
+ exti != exte && insi != inse;
+ ++exti, ++insi) {
+ if (*insi != *exti)
+ // The insert and extract reference different elements.
+ // This means the extract is not influenced by the insert, and we can
+ // replace the aggregate operand of the extract with the aggregate
+ // operand of the insert. i.e., replace
+ // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+ // %E = extractvalue { i32, { i32 } } %I, 0
+ // with
+ // %E = extractvalue { i32, { i32 } } %A, 0
+ return ExtractValueInst::Create(IV->getAggregateOperand(),
+ EV.idx_begin(), EV.idx_end());
+ }
+ if (exti == exte && insi == inse)
+ // Both iterators are at the end: Index lists are identical. Replace
+ // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+ // %C = extractvalue { i32, { i32 } } %B, 1, 0
+ // with "i32 42"
+ return ReplaceInstUsesWith(EV, IV->getInsertedValueOperand());
+ if (exti == exte) {
+ // The extract list is a prefix of the insert list. i.e. replace
+ // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+ // %E = extractvalue { i32, { i32 } } %I, 1
+ // with
+ // %X = extractvalue { i32, { i32 } } %A, 1
+ // %E = insertvalue { i32 } %X, i32 42, 0
+ // by switching the order of the insert and extract (though the
+ // insertvalue should be left in, since it may have other uses).
+ Value *NewEV = InsertNewInstBefore(
+ ExtractValueInst::Create(IV->getAggregateOperand(),
+ EV.idx_begin(), EV.idx_end()),
+ EV);
+ return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
+ insi, inse);
+ }
+ if (insi == inse)
+ // The insert list is a prefix of the extract list
+ // We can simply remove the common indices from the extract and make it
+ // operate on the inserted value instead of the insertvalue result.
+ // i.e., replace
+ // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+ // %E = extractvalue { i32, { i32 } } %I, 1, 0
+ // with
+ // %E = extractvalue { i32 } { i32 42 }, 0
+ return ExtractValueInst::Create(IV->getInsertedValueOperand(),
+ exti, exte);
+ }
+ // Can't simplify extracts from other values. Note that nested extracts are
+ // already simplified implicitly by the above (extract ( extract (insert) )
+ // will be translated into extract ( insert ( extract ) ) first and then just
+ // the value inserted, if appropriate).
+ return 0;
+}
+
+/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
+/// is to leave as a vector operation.
+static bool CheapToScalarize(Value *V, bool isConstant) {
+ if (isa<ConstantAggregateZero>(V))
+ return true;
+ if (ConstantVector *C = dyn_cast<ConstantVector>(V)) {
+ if (isConstant) return true;
+ // If all elts are the same, we can extract.
+ Constant *Op0 = C->getOperand(0);
+ for (unsigned i = 1; i < C->getNumOperands(); ++i)
+ if (C->getOperand(i) != Op0)
+ return false;
+ return true;
+ }
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Insertelement gets simplified to the inserted element, or is deleted if
+ // this is a constant-idx extractelement from a constant-idx insertelement.
+ if (I->getOpcode() == Instruction::InsertElement && isConstant &&
+ isa<ConstantInt>(I->getOperand(2)))
+ return true;
+ if (I->getOpcode() == Instruction::Load && I->hasOneUse())
+ return true;
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
+ if (BO->hasOneUse() &&
+ (CheapToScalarize(BO->getOperand(0), isConstant) ||
+ CheapToScalarize(BO->getOperand(1), isConstant)))
+ return true;
+ if (CmpInst *CI = dyn_cast<CmpInst>(I))
+ if (CI->hasOneUse() &&
+ (CheapToScalarize(CI->getOperand(0), isConstant) ||
+ CheapToScalarize(CI->getOperand(1), isConstant)))
+ return true;
+
+ return false;
+}
+
+/// Read and decode a shufflevector mask.
+///
+/// It turns undef elements into values that are larger than the number of
+/// elements in the input.
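+///
+/// For example (illustrative), a <4 x i32> shuffle with mask
+/// <i32 0, i32 5, i32 undef, i32 2> decodes to {0, 5, 8, 2}, where
+/// 8 == 2*NElts deliberately marks the undef lane as out of range.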
+static std::vector<unsigned> getShuffleMask(const ShuffleVectorInst *SVI) {
+ unsigned NElts = SVI->getType()->getNumElements();
+ if (isa<ConstantAggregateZero>(SVI->getOperand(2)))
+ return std::vector<unsigned>(NElts, 0);
+ if (isa<UndefValue>(SVI->getOperand(2)))
+ return std::vector<unsigned>(NElts, 2*NElts);
+
+ std::vector<unsigned> Result;
+ const ConstantVector *CP = cast<ConstantVector>(SVI->getOperand(2));
+ for (User::const_op_iterator i = CP->op_begin(), e = CP->op_end(); i!=e; ++i)
+ if (isa<UndefValue>(*i))
+ Result.push_back(NElts*2); // undef -> 2*NElts (out of range)
+ else
+ Result.push_back(cast<ConstantInt>(*i)->getZExtValue());
+ return Result;
+}
+
+/// FindScalarElement - Given a vector and an element number, see if the scalar
+/// value is already around as a register, for example if it were inserted then
+/// extracted from the vector.
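+///
+/// For example (illustrative):
+/// %V = insertelement <4 x float> %A, float %X, i32 2
+/// FindScalarElement(%V, 2) returns %X, while FindScalarElement(%V, 0)
+/// recurses into %A.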
+static Value *FindScalarElement(Value *V, unsigned EltNo) {
+ assert(isa<VectorType>(V->getType()) && "Not looking at a vector?");
+ const VectorType *PTy = cast<VectorType>(V->getType());
+ unsigned Width = PTy->getNumElements();
+ if (EltNo >= Width) // Out of range access.
+ return UndefValue::get(PTy->getElementType());
+
+ if (isa<UndefValue>(V))
+ return UndefValue::get(PTy->getElementType());
+ else if (isa<ConstantAggregateZero>(V))
+ return Constant::getNullValue(PTy->getElementType());
+ else if (ConstantVector *CP = dyn_cast<ConstantVector>(V))
+ return CP->getOperand(EltNo);
+ else if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert to a variable element, we don't know what it is.
+ if (!isa<ConstantInt>(III->getOperand(2)))
+ return 0;
+ unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
+
+ // If this is an insert to the element we are looking for, return the
+ // inserted value.
+ if (EltNo == IIElt)
+ return III->getOperand(1);
+
+ // Otherwise, the insertelement doesn't modify the value, recurse on its
+ // vector input.
+ return FindScalarElement(III->getOperand(0), EltNo);
+ } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
+ unsigned LHSWidth =
+ cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements();
+ unsigned InEl = getShuffleMask(SVI)[EltNo];
+ if (InEl < LHSWidth)
+ return FindScalarElement(SVI->getOperand(0), InEl);
+ else if (InEl < LHSWidth*2)
+ return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth);
+ else
+ return UndefValue::get(PTy->getElementType());
+ }
+
+ // Otherwise, we don't know.
+ return 0;
+}
+
+Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+ // If vector val is undef, replace extract with scalar undef.
+ if (isa<UndefValue>(EI.getOperand(0)))
+ return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+
+ // If vector val is constant 0, replace extract with scalar 0.
+ if (isa<ConstantAggregateZero>(EI.getOperand(0)))
+ return ReplaceInstUsesWith(EI, Constant::getNullValue(EI.getType()));
+
+ if (ConstantVector *C = dyn_cast<ConstantVector>(EI.getOperand(0))) {
+ // If vector val is constant with all elements the same, replace EI with
+ // that element. When the elements are not identical, we cannot replace yet
+ // (we do that below, but only when the index is constant).
+ Constant *op0 = C->getOperand(0);
+ for (unsigned i = 1; i < C->getNumOperands(); ++i)
+ if (C->getOperand(i) != op0) {
+ op0 = 0;
+ break;
+ }
+ if (op0)
+ return ReplaceInstUsesWith(EI, op0);
+ }
+
+ // If extracting a specified index from the vector, see if we can recursively
+ // find a previously computed scalar that was inserted into the vector.
+ if (ConstantInt *IdxC = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+ unsigned IndexVal = IdxC->getZExtValue();
+ unsigned VectorWidth =
+ cast<VectorType>(EI.getOperand(0)->getType())->getNumElements();
+
+ // If this is extracting an invalid index, turn this into undef, to avoid
+ // crashing the code below.
+ if (IndexVal >= VectorWidth)
+ return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+
+ // This instruction only demands the single element from the input vector.
+ // If the input vector has a single use, simplify it based on this use
+ // property.
+ if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) {
+ APInt UndefElts(VectorWidth, 0);
+ APInt DemandedMask(VectorWidth, 1 << IndexVal);
+ if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
+ DemandedMask, UndefElts)) {
+ EI.setOperand(0, V);
+ return &EI;
+ }
+ }
+
+ if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal))
+ return ReplaceInstUsesWith(EI, Elt);
+
+ // If this extractelement is directly using a bitcast from a vector of
+ // the same number of elements, see if we can find the source element from
+ // it. In this case, we will end up needing to bitcast the scalars.
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
+ if (const VectorType *VT =
+ dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
+ if (VT->getNumElements() == VectorWidth)
+ if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
+ return new BitCastInst(Elt, EI.getType());
+ }
+ }
+
+ if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
+ if (I->hasOneUse()) {
+ // Push extractelement into predecessor operation if legal and
+ // profitable to do so
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ bool isConstantElt = isa<ConstantInt>(EI.getOperand(1));
+ if (CheapToScalarize(BO, isConstantElt)) {
+ ExtractElementInst *newEI0 =
+ new ExtractElementInst(BO->getOperand(0), EI.getOperand(1),
+ EI.getName()+".lhs");
+ ExtractElementInst *newEI1 =
+ new ExtractElementInst(BO->getOperand(1), EI.getOperand(1),
+ EI.getName()+".rhs");
+ InsertNewInstBefore(newEI0, EI);
+ InsertNewInstBefore(newEI1, EI);
+ return BinaryOperator::Create(BO->getOpcode(), newEI0, newEI1);
+ }
+ } else if (isa<LoadInst>(I)) {
+ unsigned AS =
+ cast<PointerType>(I->getOperand(0)->getType())->getAddressSpace();
+ Value *Ptr = InsertBitCastBefore(I->getOperand(0),
+ PointerType::get(EI.getType(), AS),EI);
+ GetElementPtrInst *GEP =
+ GetElementPtrInst::Create(Ptr, EI.getOperand(1), I->getName()+".gep");
+ InsertNewInstBefore(GEP, EI);
+ return new LoadInst(GEP);
+ }
+ }
+ if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
+ // Extracting the inserted element?
+ if (IE->getOperand(2) == EI.getOperand(1))
+ return ReplaceInstUsesWith(EI, IE->getOperand(1));
+ // If the inserted and extracted elements are constants, they must not
+ // be the same value, extract from the pre-inserted value instead.
+ if (isa<Constant>(IE->getOperand(2)) &&
+ isa<Constant>(EI.getOperand(1))) {
+ AddUsesToWorkList(EI);
+ EI.setOperand(0, IE->getOperand(0));
+ return &EI;
+ }
+ } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+ // If this is extracting an element from a shufflevector, figure out where
+ // it came from and extract from the appropriate input element instead.
+ if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+ unsigned SrcIdx = getShuffleMask(SVI)[Elt->getZExtValue()];
+ Value *Src;
+ unsigned LHSWidth =
+ cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements();
+
+ if (SrcIdx < LHSWidth)
+ Src = SVI->getOperand(0);
+ else if (SrcIdx < LHSWidth*2) {
+ SrcIdx -= LHSWidth;
+ Src = SVI->getOperand(1);
+ } else {
+ return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+ }
+ return new ExtractElementInst(Src, SrcIdx);
+ }
+ }
+ }
+ return 0;
+}
+
+/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns
+/// elements from either LHS or RHS, return the shuffle mask and true.
+/// Otherwise, return false.
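+/// For example (illustrative), inserting lane 0 of RHS into lane 1 of a
+/// 4-element LHS yields the mask <i32 0, i32 4, i32 2, i32 3>, since RHS
+/// lanes are numbered NumElts through 2*NumElts-1.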
+static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+ std::vector<Constant*> &Mask) {
+ assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
+ "Invalid CollectSingleShuffleElements");
+ unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, UndefValue::get(Type::Int32Ty));
+ return true;
+ } else if (V == LHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantInt::get(Type::Int32Ty, i));
+ return true;
+ } else if (V == RHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantInt::get(Type::Int32Ty, i+NumElts));
+ return true;
+ } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (!isa<ConstantInt>(IdxOp))
+ return false;
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector.
+ // Okay, we can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted undef.
+ Mask[InsertedIdx] = UndefValue::get(Type::Int32Ty);
+ return true;
+ }
+ } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+ if (isa<ConstantInt>(EI->getOperand(1)) &&
+ EI->getOperand(0)->getType() == V->getType()) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+
+ // This must be extracting from either LHS or RHS.
+ if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+ // Okay, we can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted value.
+ if (EI->getOperand(0) == LHS) {
+ Mask[InsertedIdx % NumElts] =
+ ConstantInt::get(Type::Int32Ty, ExtractedIdx);
+ } else {
+ assert(EI->getOperand(0) == RHS);
+ Mask[InsertedIdx % NumElts] =
+ ConstantInt::get(Type::Int32Ty, ExtractedIdx+NumElts);
+
+ }
+ return true;
+ }
+ }
+ }
+ }
+ }
+ // TODO: Handle shufflevector here!
+
+ return false;
+}
+
+/// CollectShuffleElements - We are building a shuffle of V, using RHS as the
+/// RHS of the shuffle instruction, if it is not null. Return a shuffle mask
+/// that computes V and the LHS value of the shuffle.
+static Value *CollectShuffleElements(Value *V, std::vector<Constant*> &Mask,
+ Value *&RHS) {
+ assert(isa<VectorType>(V->getType()) &&
+ (RHS == 0 || V->getType() == RHS->getType()) &&
+ "Invalid shuffle!");
+ unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, UndefValue::get(Type::Int32Ty));
+ return V;
+ } else if (isa<ConstantAggregateZero>(V)) {
+ Mask.assign(NumElts, ConstantInt::get(Type::Int32Ty, 0));
+ return V;
+ } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+ if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+ EI->getOperand(0)->getType() == V->getType()) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ // Either the extracted from or inserted into vector must be RHSVec,
+ // otherwise we'd end up with a shuffle of three inputs.
+ if (EI->getOperand(0) == RHS || RHS == 0) {
+ RHS = EI->getOperand(0);
+ Value *V = CollectShuffleElements(VecOp, Mask, RHS);
+ Mask[InsertedIdx % NumElts] =
+ ConstantInt::get(Type::Int32Ty, NumElts+ExtractedIdx);
+ return V;
+ }
+
+ if (VecOp == RHS) {
+ Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS);
+ // Everything but the extracted element is replaced with the RHS.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (i != InsertedIdx)
+ Mask[i] = ConstantInt::get(Type::Int32Ty, NumElts+i);
+ }
+ return V;
+ }
+
+ // If this insertelement is a chain that comes from exactly these two
+ // vectors, return the vector and the effective shuffle.
+ if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask))
+ return EI->getOperand(0);
+
+ }
+ }
+ }
+ // TODO: Handle shufflevector here!
+
+ // Otherwise, can't do anything fancy. Return an identity vector.
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantInt::get(Type::Int32Ty, i));
+ return V;
+}
+
+Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
+ Value *VecOp = IE.getOperand(0);
+ Value *ScalarOp = IE.getOperand(1);
+ Value *IdxOp = IE.getOperand(2);
+
+ // Inserting an undef or into an undefined place, remove this.
+ if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp))
+ ReplaceInstUsesWith(IE, VecOp);
+
+ // If the inserted element was extracted from some other vector, and if the
+ // indexes are constant, try to turn this into a shufflevector operation.
+ if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+ if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+ EI->getOperand(0)->getType() == IE.getType()) {
+ unsigned NumVectorElts = IE.getType()->getNumElements();
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ if (ExtractedIdx >= NumVectorElts) // Out of range extract.
+ return ReplaceInstUsesWith(IE, VecOp);
+
+ if (InsertedIdx >= NumVectorElts) // Out of range insert.
+ return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+
+ // If we are extracting a value from a vector, then inserting it right
+ // back into the same place, just use the input vector.
+ if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
+ return ReplaceInstUsesWith(IE, VecOp);
+
+ // We could theoretically do this for ANY input. However, doing so could
+ // turn chains of insertelement instructions into a chain of shufflevector
+ // instructions, and right now we do not merge shufflevectors. As such,
+ // only do this in a situation where it is clear that there is benefit.
+ if (isa<UndefValue>(VecOp) || isa<ConstantAggregateZero>(VecOp)) {
+ // Turn this into shuffle(EIOp0, VecOp, Mask). The result has all of
+ // the values of VecOp, except the one read from EIOp0.
+ // Build a new shuffle mask.
+ std::vector<Constant*> Mask;
+ if (isa<UndefValue>(VecOp))
+ Mask.assign(NumVectorElts, UndefValue::get(Type::Int32Ty));
+ else {
+ assert(isa<ConstantAggregateZero>(VecOp) && "Unknown thing");
+ Mask.assign(NumVectorElts, ConstantInt::get(Type::Int32Ty,
+ NumVectorElts));
+ }
+ Mask[InsertedIdx] = ConstantInt::get(Type::Int32Ty, ExtractedIdx);
+ return new ShuffleVectorInst(EI->getOperand(0), VecOp,
+ ConstantVector::get(Mask));
+ }
+
+ // If this insertelement isn't used by some other insertelement, turn it
+ // (and any insertelements it points to), into one big shuffle.
+ if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) {
+ std::vector<Constant*> Mask;
+ Value *RHS = 0;
+ Value *LHS = CollectShuffleElements(&IE, Mask, RHS);
+ if (RHS == 0) RHS = UndefValue::get(LHS->getType());
+ // We now have a shuffle of LHS, RHS, Mask.
+ return new ShuffleVectorInst(LHS, RHS, ConstantVector::get(Mask));
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+ Value *LHS = SVI.getOperand(0);
+ Value *RHS = SVI.getOperand(1);
+ std::vector<unsigned> Mask = getShuffleMask(&SVI);
+
+ bool MadeChange = false;
+
+ // Undefined shuffle mask -> undefined value.
+ if (isa<UndefValue>(SVI.getOperand(2)))
+ return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+
+ unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements();
+
+ if (VWidth != cast<VectorType>(LHS->getType())->getNumElements())
+ return 0;
+
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
+ LHS = SVI.getOperand(0);
+ RHS = SVI.getOperand(1);
+ MadeChange = true;
+ }
+
+ // Canonicalize shuffle(x,x,mask) -> shuffle(x, undef, mask')
+ // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef, mask').
+ if (LHS == RHS || isa<UndefValue>(LHS)) {
+ if (isa<UndefValue>(LHS) && LHS == RHS) {
+ // shuffle(undef,undef,mask) -> undef.
+ return ReplaceInstUsesWith(SVI, LHS);
+ }
+
+ // Remap any references to RHS to use LHS.
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] >= 2*e)
+ Elts.push_back(UndefValue::get(Type::Int32Ty));
+ else {
+ if ((Mask[i] >= e && isa<UndefValue>(RHS)) ||
+ (Mask[i] < e && isa<UndefValue>(LHS))) {
+ Mask[i] = 2*e; // Turn into undef.
+ Elts.push_back(UndefValue::get(Type::Int32Ty));
+ } else {
+ Mask[i] = Mask[i] % e; // Force to LHS.
+ Elts.push_back(ConstantInt::get(Type::Int32Ty, Mask[i]));
+ }
+ }
+ }
+ SVI.setOperand(0, SVI.getOperand(1));
+ SVI.setOperand(1, UndefValue::get(RHS->getType()));
+ SVI.setOperand(2, ConstantVector::get(Elts));
+ LHS = SVI.getOperand(0);
+ RHS = SVI.getOperand(1);
+ MadeChange = true;
+ }
+
+ // Analyze the shuffle: are the LHS or RHS identity shuffles?
+ bool isLHSID = true, isRHSID = true;
+
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] >= e*2) continue; // Ignore undef values.
+ // Is this an identity shuffle of the LHS value?
+ isLHSID &= (Mask[i] == i);
+
+ // Is this an identity shuffle of the RHS value?
+ isRHSID &= (Mask[i]-e == i);
+ }
+
+ // Eliminate identity shuffles.
+ if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
+ if (isRHSID) return ReplaceInstUsesWith(SVI, RHS);
+
+ // If the LHS is a shufflevector itself, see if we can combine it with this
+ // one without producing an unusual shuffle. Here we are really conservative:
+ // we are absolutely afraid of producing a shuffle mask not in the input
+ // program, because the code gen may not be smart enough to turn a merged
+ // shuffle into two specific shuffles: it may produce worse code. As such,
+ // we only merge two shuffles if the result is one of the two input shuffle
+ // masks. In this case, merging the shuffles just removes one instruction,
+ // which we know is safe. This is good for things like turning:
+ // (splat(splat)) -> splat.
+ if (ShuffleVectorInst *LHSSVI = dyn_cast<ShuffleVectorInst>(LHS)) {
+ if (isa<UndefValue>(RHS)) {
+ std::vector<unsigned> LHSMask = getShuffleMask(LHSSVI);
+
+ std::vector<unsigned> NewMask;
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i)
+ if (Mask[i] >= 2*e)
+ NewMask.push_back(2*e);
+ else
+ NewMask.push_back(LHSMask[Mask[i]]);
+
+ // If the result mask is equal to the src shuffle or this shuffle mask, do
+ // the replacement.
+ if (NewMask == LHSMask || NewMask == Mask) {
+ unsigned LHSInNElts =
+ cast<VectorType>(LHSSVI->getOperand(0)->getType())->getNumElements();
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0, e = NewMask.size(); i != e; ++i) {
+ if (NewMask[i] >= LHSInNElts*2) {
+ Elts.push_back(UndefValue::get(Type::Int32Ty));
+ } else {
+ Elts.push_back(ConstantInt::get(Type::Int32Ty, NewMask[i]));
+ }
+ }
+ return new ShuffleVectorInst(LHSSVI->getOperand(0),
+ LHSSVI->getOperand(1),
+ ConstantVector::get(Elts));
+ }
+ }
+ }
+
+ return MadeChange ? &SVI : 0;
+}
+
+
+
+
+/// TryToSinkInstruction - Try to move the specified instruction from its
+/// current block into the beginning of DestBlock, which can only happen if it's
+/// safe to move the instruction past all of the instructions between it and the
+/// end of its block.
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+ assert(I->hasOneUse() && "Invariants didn't hold!");
+
+ // Cannot move control-flow-involving, volatile loads, vaarg, etc.
+ if (isa<PHINode>(I) || I->mayHaveSideEffects() || isa<TerminatorInst>(I))
+ return false;
+
+ // Do not sink alloca instructions out of the entry block.
+ if (isa<AllocaInst>(I) && I->getParent() ==
+ &DestBlock->getParent()->getEntryBlock())
+ return false;
+
+ // We can only sink load instructions if there is nothing between the load and
+ // the end of block that could change the value.
+ if (I->mayReadFromMemory()) {
+ for (BasicBlock::iterator Scan = I, E = I->getParent()->end();
+ Scan != E; ++Scan)
+ if (Scan->mayWriteToMemory())
+ return false;
+ }
+
+ BasicBlock::iterator InsertPos = DestBlock->getFirstNonPHI();
+
+ CopyPrecedingStopPoint(I, InsertPos);
+ I->moveBefore(InsertPos);
+ ++NumSunkInst;
+ return true;
+}
+
+
+/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding
+/// all reachable code to the worklist.
+///
+/// This has a couple of tricks to make the code faster and more powerful. In
+/// particular, we constant fold and DCE instructions as we go, to avoid adding
+/// them to the worklist (this significantly speeds up instcombine on code where
+/// many instructions are dead or constant). Additionally, if we find a branch
+/// whose condition is a known constant, we only visit the reachable successors.
+///
+static void AddReachableCodeToWorklist(BasicBlock *BB,
+ SmallPtrSet<BasicBlock*, 64> &Visited,
+ InstCombiner &IC,
+ const TargetData *TD) {
+ SmallVector<BasicBlock*, 256> Worklist;
+ Worklist.push_back(BB);
+
+ while (!Worklist.empty()) {
+ BB = Worklist.back();
+ Worklist.pop_back();
+
+ // We have now visited this block! If we've already been here, ignore it.
+ if (!Visited.insert(BB)) continue;
+
+ DbgInfoIntrinsic *DBI_Prev = NULL;
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+ Instruction *Inst = BBI++;
+
+ // DCE instruction if trivially dead.
+ if (isInstructionTriviallyDead(Inst)) {
+ ++NumDeadInst;
+ DOUT << "IC: DCE: " << *Inst;
+ Inst->eraseFromParent();
+ continue;
+ }
+
+ // ConstantProp instruction if trivially constant.
+ if (Constant *C = ConstantFoldInstruction(Inst, TD)) {
+ DOUT << "IC: ConstFold to: " << *C << " from: " << *Inst;
+ Inst->replaceAllUsesWith(C);
+ ++NumConstProp;
+ Inst->eraseFromParent();
+ continue;
+ }
+
+ // If there are two consecutive llvm.dbg.stoppoint calls then
+ // it is likely that the optimizer deleted code in between these
+ // two intrinsics.
+ DbgInfoIntrinsic *DBI_Next = dyn_cast<DbgInfoIntrinsic>(Inst);
+ if (DBI_Next) {
+ if (DBI_Prev
+ && DBI_Prev->getIntrinsicID() == llvm::Intrinsic::dbg_stoppoint
+ && DBI_Next->getIntrinsicID() == llvm::Intrinsic::dbg_stoppoint) {
+ IC.RemoveFromWorkList(DBI_Prev);
+ DBI_Prev->eraseFromParent();
+ }
+ DBI_Prev = DBI_Next;
+ } else {
+ DBI_Prev = 0;
+ }
+
+ IC.AddToWorkList(Inst);
+ }
+
+ // Recursively visit successors. If this is a branch or switch on a
+ // constant, only visit the reachable successor.
+ TerminatorInst *TI = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
+ bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
+ BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
+ Worklist.push_back(ReachableBB);
+ continue;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
+ // See if this is an explicit destination.
+ for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
+ if (SI->getCaseValue(i) == Cond) {
+ BasicBlock *ReachableBB = SI->getSuccessor(i);
+ Worklist.push_back(ReachableBB);
+ continue;
+ }
+
+ // Otherwise it is the default destination.
+ Worklist.push_back(SI->getSuccessor(0));
+ continue;
+ }
+ }
+
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ Worklist.push_back(TI->getSuccessor(i));
+ }
+}
+
+bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
+ bool Changed = false;
+ TD = &getAnalysis<TargetData>();
+
+ DEBUG(DOUT << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+ << F.getNameStr() << "\n");
+
+ {
+ // Do a depth-first traversal of the function, populate the worklist with
+ // the reachable instructions. Ignore blocks that are not reachable. Keep
+ // track of which blocks we visit.
+ SmallPtrSet<BasicBlock*, 64> Visited;
+ AddReachableCodeToWorklist(F.begin(), Visited, *this, TD);
+
+ // Do a quick scan over the function. If we find any blocks that are
+ // unreachable, remove any instructions inside of them. This prevents
+ // the instcombine code from having to deal with some bad special cases.
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (!Visited.count(BB)) {
+ Instruction *Term = BB->getTerminator();
+ while (Term != BB->begin()) { // Remove instrs bottom-up
+ BasicBlock::iterator I = Term; --I;
+
+ DOUT << "IC: DCE: " << *I;
+ // A debug intrinsic shouldn't force another iteration if we weren't
+ // going to do one without it.
+ if (!isa<DbgInfoIntrinsic>(I)) {
+ ++NumDeadInst;
+ Changed = true;
+ }
+ if (!I->use_empty())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+ }
+ }
+ }
+
+ while (!Worklist.empty()) {
+ Instruction *I = RemoveOneFromWorkList();
+ if (I == 0) continue; // skip null values.
+
+ // Check to see if we can DCE the instruction.
+ if (isInstructionTriviallyDead(I)) {
+ // Add operands to the worklist.
+ if (I->getNumOperands() < 4)
+ AddUsesToWorkList(*I);
+ ++NumDeadInst;
+
+ DOUT << "IC: DCE: " << *I;
+
+ I->eraseFromParent();
+ RemoveFromWorkList(I);
+ Changed = true;
+ continue;
+ }
+
+ // Instruction isn't dead, see if we can constant propagate it.
+ if (Constant *C = ConstantFoldInstruction(I, TD)) {
+ DOUT << "IC: ConstFold to: " << *C << " from: " << *I;
+
+ // Add operands to the worklist.
+ AddUsesToWorkList(*I);
+ ReplaceInstUsesWith(*I, C);
+
+ ++NumConstProp;
+ I->eraseFromParent();
+ RemoveFromWorkList(I);
+ Changed = true;
+ continue;
+ }
+
+ if (TD &&
+ (I->getType()->getTypeID() == Type::VoidTyID ||
+ I->isTrapping())) {
+ // See if we can constant fold its operands.
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(i))
+ if (Constant *NewC = ConstantFoldConstantExpression(CE, TD))
+ if (NewC != CE) {
+ i->set(NewC);
+ Changed = true;
+ }
+ }
+
+ // See if we can trivially sink this instruction to a successor basic block.
+ if (I->hasOneUse()) {
+ BasicBlock *BB = I->getParent();
+ BasicBlock *UserParent = cast<Instruction>(I->use_back())->getParent();
+ if (UserParent != BB) {
+ bool UserIsSuccessor = false;
+ // See if the user is one of our successors.
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+ if (*SI == UserParent) {
+ UserIsSuccessor = true;
+ break;
+ }
+
+ // If the user is one of our immediate successors, and if that successor
+ // only has us as a predecessor (we'd have to split the critical edge
+ // otherwise), we can keep going.
+ if (UserIsSuccessor && !isa<PHINode>(I->use_back()) &&
+ next(pred_begin(UserParent)) == pred_end(UserParent))
+ // Okay, the CFG is simple enough, try to sink this instruction.
+ Changed |= TryToSinkInstruction(I, UserParent);
+ }
+ }
+
+ // Now that we have an instruction, try combining it to simplify it...
+#ifndef NDEBUG
+ std::string OrigI;
+#endif
+ DEBUG(std::ostringstream SS; I->print(SS); OrigI = SS.str(););
+ if (Instruction *Result = visit(*I)) {
+ ++NumCombined;
+ // Should we replace the old instruction with a new one?
+ if (Result != I) {
+ DOUT << "IC: Old = " << *I
+ << " New = " << *Result;
+
+ // Everything uses the new instruction now.
+ I->replaceAllUsesWith(Result);
+
+ // Push the new instruction and any users onto the worklist.
+ AddToWorkList(Result);
+ AddUsersToWorkList(*Result);
+
+ // Move the name to the new instruction first.
+ Result->takeName(I);
+
+ // Insert the new instruction into the basic block...
+ BasicBlock *InstParent = I->getParent();
+ BasicBlock::iterator InsertPos = I;
+
+ if (!isa<PHINode>(Result)) // If combining a PHI, don't insert
+ while (isa<PHINode>(InsertPos)) // middle of a block of PHIs.
+ ++InsertPos;
+
+ InstParent->getInstList().insert(InsertPos, Result);
+
+ // Make sure that we reprocess all operands now that we reduced their
+ // use counts.
+ AddUsesToWorkList(*I);
+
+ // Instructions can end up on the worklist more than once. Make sure
+ // we do not process an instruction that has been deleted.
+ RemoveFromWorkList(I);
+
+ // Erase the old instruction.
+ InstParent->getInstList().erase(I);
+ } else {
+#ifndef NDEBUG
+ DOUT << "IC: Mod = " << OrigI
+ << " New = " << *I;
+#endif
+
+ // If the instruction was modified, it's possible that it is now dead.
+ // if so, remove it.
+ if (isInstructionTriviallyDead(I)) {
+ // Make sure we process all operands now that we are reducing their
+ // use counts.
+ AddUsesToWorkList(*I);
+
+ // Instructions may end up in the worklist more than once. Erase all
+ // occurrences of this instruction.
+ RemoveFromWorkList(I);
+ I->eraseFromParent();
+ } else {
+ AddToWorkList(I);
+ AddUsersToWorkList(*I);
+ }
+ }
+ Changed = true;
+ }
+ }
+
+ assert(WorklistMap.empty() && "Worklist empty, but map not?");
+
+ // Do an explicit clear, this shrinks the map if needed.
+ WorklistMap.clear();
+ return Changed;
+}
+
+
+bool InstCombiner::runOnFunction(Function &F) {
+ MustPreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ bool EverMadeChange = false;
+
+ // Iterate while there is work to do.
+ unsigned Iteration = 0;
+ while (DoOneIteration(F, Iteration++))
+ EverMadeChange = true;
+ return EverMadeChange;
+}
+
+FunctionPass *llvm::createInstructionCombiningPass() {
+ return new InstCombiner();
+}
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
new file mode 100644
index 0000000..c0ca2df
--- /dev/null
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -0,0 +1,954 @@
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jump-threading"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
+using namespace llvm;
+
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds, "Number of terminators folded");
+
+static cl::opt<unsigned>
+Threshold("jump-threading-threshold",
+ cl::desc("Max block size to duplicate for jump threading"),
+ cl::init(6), cl::Hidden);
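+// The threshold can be tuned on the opt command line, e.g. (illustrative):
+// opt -jump-threading -jump-threading-threshold=10 in.bc -o out.bc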
+
+namespace {
+ /// This pass performs 'jump threading', which looks at blocks that have
+ /// multiple predecessors and multiple successors. If one or more of the
+ /// predecessors of the block can be proven to always jump to one of the
+ /// successors, we forward the edge from the predecessor to the successor by
+ /// duplicating the contents of this block.
+ ///
+ /// An example of when this can occur is code like this:
+ ///
+ /// if () { ...
+ /// X = 4;
+ /// }
+ /// if (X < 3) {
+ ///
+ /// In this case, the unconditional branch at the end of the first if can be
+ /// revectored to the false side of the second if.
+ ///
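+ /// After threading (an illustrative sketch of the result), the then-block
+ /// branches straight to the false successor of the 'X < 3' test, so that
+ /// comparison is never evaluated on the threaded path.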
+ class VISIBILITY_HIDDEN JumpThreading : public FunctionPass {
+ TargetData *TD;
+#ifdef NDEBUG
+ SmallPtrSet<BasicBlock*, 16> LoopHeaders;
+#else
+ SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders;
+#endif
+ public:
+ static char ID; // Pass identification
+ JumpThreading() : FunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ }
+
+ bool runOnFunction(Function &F);
+ void FindLoopHeaders(Function &F);
+
+ bool ProcessBlock(BasicBlock *BB);
+ bool ThreadEdge(BasicBlock *BB, BasicBlock *PredBB, BasicBlock *SuccBB,
+ unsigned JumpThreadCost);
+ BasicBlock *FactorCommonPHIPreds(PHINode *PN, Constant *CstVal);
+ bool ProcessBranchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB);
+ bool ProcessSwitchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB);
+
+ bool ProcessJumpOnPHI(PHINode *PN);
+ bool ProcessBranchOnLogical(Value *V, BasicBlock *BB, bool isAnd);
+ bool ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB);
+
+ bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
+ };
+}
+
+char JumpThreading::ID = 0;
+static RegisterPass<JumpThreading>
+X("jump-threading", "Jump Threading");
+
+// Public interface to the Jump Threading pass
+FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
+
+/// runOnFunction - Top level algorithm.
+///
+bool JumpThreading::runOnFunction(Function &F) {
+ DOUT << "Jump threading on function '" << F.getNameStart() << "'\n";
+ TD = &getAnalysis<TargetData>();
+
+ FindLoopHeaders(F);
+
+ bool AnotherIteration = true, EverChanged = false;
+ while (AnotherIteration) {
+ AnotherIteration = false;
+ bool Changed = false;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
+ BasicBlock *BB = I;
+ while (ProcessBlock(BB))
+ Changed = true;
+
+ ++I;
+
+ // If the block is trivially dead, zap it. This eliminates the successor
+ // edges, which simplifies the CFG.
+ if (pred_begin(BB) == pred_end(BB) &&
+ BB != &BB->getParent()->getEntryBlock()) {
+ DOUT << " JT: Deleting dead block '" << BB->getNameStart()
+ << "' with terminator: " << *BB->getTerminator();
+ LoopHeaders.erase(BB);
+ DeleteDeadBlock(BB);
+ Changed = true;
+ }
+ }
+ AnotherIteration = Changed;
+ EverChanged |= Changed;
+ }
+
+ LoopHeaders.clear();
+ return EverChanged;
+}
+
+/// FindLoopHeaders - We do not want jump threading to turn proper loop
+/// structures into irreducible loops. Doing this breaks up the loop nesting
+/// hierarchy and pessimizes later transformations. To prevent this from
+/// happening, we first have to find the loop headers. Here we approximate this
+/// by finding targets of backedges in the CFG.
+///
+/// Note that there definitely are cases when we want to allow threading of
+/// edges across a loop header. For example, threading a jump from outside the
+/// loop (the preheader) to an exit block of the loop is definitely profitable.
+/// It is also almost always profitable to thread backedges from within the loop
+/// to exit blocks, and is often profitable to thread backedges to other blocks
+/// within the loop (forming a nested loop). This simple analysis is not rich
+/// enough to track all of these properties and keep it up-to-date as the CFG
+/// mutates, so we don't allow any of these transformations.
+///
+void JumpThreading::FindLoopHeaders(Function &F) {
+ SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
+
+ for (unsigned i = 0, e = Edges.size(); i != e; ++i)
+ LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second));
+}
+
+
+/// FactorCommonPHIPreds - If there are multiple preds with the same incoming
+/// value for the PHI, factor them together so we get one block to thread for
+/// the whole group.
+/// This is important for things like "phi i1 [true, true, false, true, x]"
+/// where we only need to clone the block for the true blocks once.
+///
+BasicBlock *JumpThreading::FactorCommonPHIPreds(PHINode *PN, Constant *CstVal) {
+ SmallVector<BasicBlock*, 16> CommonPreds;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == CstVal)
+ CommonPreds.push_back(PN->getIncomingBlock(i));
+
+ if (CommonPreds.size() == 1)
+ return CommonPreds[0];
+
+ DOUT << " Factoring out " << CommonPreds.size()
+ << " common predecessors.\n";
+ return SplitBlockPredecessors(PN->getParent(),
+ &CommonPreds[0], CommonPreds.size(),
+ ".thr_comm", this);
+}
+
+
+/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
+/// thread across it.
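+///
+/// For example (illustrative arithmetic): a block with two adds and one
+/// non-intrinsic call costs 1 + 1 + 4 = 6 units; if the block ends in a
+/// switch, the bonus brings that back down to 0.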
+static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
+ // Ignore PHI nodes; these will be flattened when duplication happens.
+ BasicBlock::const_iterator I = BB->getFirstNonPHI();
+
+ // Sum up the cost of each instruction until we get to the terminator. Don't
+ // include the terminator because the copy won't include it.
+ unsigned Size = 0;
+ for (; !isa<TerminatorInst>(I); ++I) {
+ // Debugger intrinsics don't incur code size.
+ if (isa<DbgInfoIntrinsic>(I)) continue;
+
+ // If this is a pointer->pointer bitcast, it is free.
+ if (isa<BitCastInst>(I) && isa<PointerType>(I->getType()))
+ continue;
+
+ // All other instructions count for at least one unit.
+ ++Size;
+
+ // Calls are more expensive. If they are non-intrinsic calls, we model them
+ // as having cost of 4. If they are a non-vector intrinsic, we model them
+ // as having cost of 2 total, and if they are a vector intrinsic, we model
+ // them as having cost 1.
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (!isa<IntrinsicInst>(CI))
+ Size += 3;
+ else if (!isa<VectorType>(CI->getType()))
+ Size += 1;
+ }
+ }
+
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to happen.
+ if (isa<SwitchInst>(I))
+ Size = Size > 6 ? Size-6 : 0;
+
+ return Size;
+}
+
+/// ProcessBlock - If there are any predecessors whose control can be threaded
+/// through to a successor, transform them now.
+bool JumpThreading::ProcessBlock(BasicBlock *BB) {
+ // If this block has a single predecessor, and if that pred has a single
+ // successor, merge the blocks. This encourages recursive jump threading
+ // because now the condition in this block can be threaded through
+ // predecessors of our predecessor block.
+ if (BasicBlock *SinglePred = BB->getSinglePredecessor())
+ if (SinglePred->getTerminator()->getNumSuccessors() == 1 &&
+ SinglePred != BB) {
+ // If SinglePred was a loop header, BB becomes one.
+ if (LoopHeaders.erase(SinglePred))
+ LoopHeaders.insert(BB);
+
+ // Remember if SinglePred was the entry block of the function. If so, we
+ // will need to move BB back to the entry position.
+ bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+ MergeBasicBlockIntoOnlyPred(BB);
+
+ if (isEntry && BB != &BB->getParent()->getEntryBlock())
+ BB->moveBefore(&BB->getParent()->getEntryBlock());
+ return true;
+ }
+
+ // See if this block ends with a branch or switch. If so, see if the
+ // condition is a phi node. If so, and if an entry of the phi node is a
+ // constant, we can thread the block.
+ Value *Condition;
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ // Can't thread an unconditional jump.
+ if (BI->isUnconditional()) return false;
+ Condition = BI->getCondition();
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
+ Condition = SI->getCondition();
+ else
+ return false; // Must be an invoke.
+
+ // If the terminator of this block is branching on a constant, simplify the
+ // terminator to an unconditional branch. This can occur due to threading in
+ // other blocks.
+ if (isa<ConstantInt>(Condition)) {
+ DOUT << " In block '" << BB->getNameStart()
+ << "' folding terminator: " << *BB->getTerminator();
+ ++NumFolds;
+ ConstantFoldTerminator(BB);
+ return true;
+ }
+
+ // If the terminator is branching on an undef, we can pick any of the
+ // successors to branch to. Since this is arbitrary, we pick the successor
+ // with the fewest predecessors. This should reduce the in-degree of the
+ // others.
+ if (isa<UndefValue>(Condition)) {
+ TerminatorInst *BBTerm = BB->getTerminator();
+ unsigned MinSucc = 0;
+ BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
+ // Compute the successor with the minimum number of predecessors.
+ unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ TestBB = BBTerm->getSuccessor(i);
+ unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ if (NumPreds < MinNumPreds) {
+ MinSucc = i;
+ MinNumPreds = NumPreds;
+ }
+ }
+
+ // Fold the branch/switch.
+ for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ if (i == MinSucc) continue;
+ BBTerm->getSuccessor(i)->removePredecessor(BB);
+ }
+
+ DOUT << " In block '" << BB->getNameStart()
+ << "' folding undef terminator: " << *BBTerm;
+ BranchInst::Create(BBTerm->getSuccessor(MinSucc), BBTerm);
+ BBTerm->eraseFromParent();
+ return true;
+ }
+
+ Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+ // If the condition is an instruction defined in another block, see if a
+ // predecessor has the same condition:
+ // br COND, BBX, BBY
+ // BBX:
+ // br COND, BBZ, BBW
+ if (!Condition->hasOneUse() && // Multiple uses.
+ (CondInst == 0 || CondInst->getParent() != BB)) { // Non-local definition.
+ pred_iterator PI = pred_begin(BB), E = pred_end(BB);
+ if (isa<BranchInst>(BB->getTerminator())) {
+ for (; PI != E; ++PI)
+ if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+ if (PBI->isConditional() && PBI->getCondition() == Condition &&
+ ProcessBranchOnDuplicateCond(*PI, BB))
+ return true;
+ } else {
+ assert(isa<SwitchInst>(BB->getTerminator()) && "Unknown jump terminator");
+ for (; PI != E; ++PI)
+ if (SwitchInst *PSI = dyn_cast<SwitchInst>((*PI)->getTerminator()))
+ if (PSI->getCondition() == Condition &&
+ ProcessSwitchOnDuplicateCond(*PI, BB))
+ return true;
+ }
+ }
+
+ // If there is only a single predecessor of this block, nothing to fold.
+ if (BB->getSinglePredecessor())
+ return false;
+
+ // All the rest of our checks depend on the condition being an instruction.
+ if (CondInst == 0)
+ return false;
+
+ // See if this is a phi node in the current block.
+ if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+ if (PN->getParent() == BB)
+ return ProcessJumpOnPHI(PN);
+
+ // If this is a conditional branch whose condition is and/or of a phi, try to
+ // simplify it.
+ if ((CondInst->getOpcode() == Instruction::And ||
+ CondInst->getOpcode() == Instruction::Or) &&
+ isa<BranchInst>(BB->getTerminator()) &&
+ ProcessBranchOnLogical(CondInst, BB,
+ CondInst->getOpcode() == Instruction::And))
+ return true;
+
+ // If we have "br (phi != 42)" and the phi node has any constant values as
+ // operands, we can thread through this block.
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst))
+ if (isa<PHINode>(CondCmp->getOperand(0)) &&
+ isa<Constant>(CondCmp->getOperand(1)) &&
+ ProcessBranchOnCompare(CondCmp, BB))
+ return true;
+
+ // Check for some cases that are worth simplifying. Right now we want to look
+ // for loads that are used by a switch or by the condition for the branch. If
+ // we see one, check to see if it's partially redundant. If so, insert a PHI
+ // which can then be used to thread the values.
+ //
+ // This is particularly important because reg2mem inserts loads and stores all
+ // over the place, and this blocks jump threading if we don't zap them.
+ Value *SimplifyValue = CondInst;
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+ if (isa<Constant>(CondCmp->getOperand(1)))
+ SimplifyValue = CondCmp->getOperand(0);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+ if (SimplifyPartiallyRedundantLoad(LI))
+ return true;
+
+ // TODO: If we have: "br (X > 0)" and we have a predecessor where we know
+ // "(X == 4)" thread through this block.
+
+ return false;
+}
+
+/// ProcessBranchOnDuplicateCond - We found a block and a predecessor of that
+/// block that jump on exactly the same condition. This means that we almost
+/// always know the direction of the edge in the DESTBB:
+/// PREDBB:
+/// br COND, DESTBB, BBY
+/// DESTBB:
+/// br COND, BBZ, BBW
+///
+/// If DESTBB has multiple predecessors, we can't just constant fold the branch
+/// in DESTBB; we have to thread over it.
+bool JumpThreading::ProcessBranchOnDuplicateCond(BasicBlock *PredBB,
+ BasicBlock *BB) {
+ BranchInst *PredBI = cast<BranchInst>(PredBB->getTerminator());
+
+ // If both successors of PredBB go to DESTBB, we don't know anything. We can
+ // fold the branch to an unconditional one, which allows other recursive
+ // simplifications.
+ bool BranchDir;
+ if (PredBI->getSuccessor(1) != BB)
+ BranchDir = true;
+ else if (PredBI->getSuccessor(0) != BB)
+ BranchDir = false;
+ else {
+ DOUT << " In block '" << PredBB->getNameStart()
+ << "' folding terminator: " << *PredBB->getTerminator();
+ ++NumFolds;
+ ConstantFoldTerminator(PredBB);
+ return true;
+ }
+
+ BranchInst *DestBI = cast<BranchInst>(BB->getTerminator());
+
+ // If the dest block has one predecessor, just fix the branch condition to a
+ // constant and fold it.
+ if (BB->getSinglePredecessor()) {
+ DOUT << " In block '" << BB->getNameStart()
+ << "' folding condition to '" << BranchDir << "': "
+ << *BB->getTerminator();
+ ++NumFolds;
+ DestBI->setCondition(ConstantInt::get(Type::Int1Ty, BranchDir));
+ ConstantFoldTerminator(BB);
+ return true;
+ }
+
+ // Otherwise we need to thread from PredBB to DestBB's successor which
+ // involves code duplication. Check to see if it is worth it.
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+ if (JumpThreadCost > Threshold) {
+ DOUT << " Not threading BB '" << BB->getNameStart()
+ << "' - Cost is too high: " << JumpThreadCost << "\n";
+ return false;
+ }
+
+ // Next, figure out which successor we are threading to.
+ BasicBlock *SuccBB = DestBI->getSuccessor(!BranchDir);
+
+ // Ok, try to thread it!
+ return ThreadEdge(BB, PredBB, SuccBB, JumpThreadCost);
+}
+
+/// ProcessSwitchOnDuplicateCond - We found a block and a predecessor of that
+/// block that switch on exactly the same condition. This means that we almost
+/// always know the direction of the edge in the DESTBB:
+/// PREDBB:
+/// switch COND [... DESTBB, BBY ... ]
+/// DESTBB:
+/// switch COND [... BBZ, BBW ]
+///
+/// Optimizing switches like this is very important, because simplifycfg builds
+/// switches out of repeated 'if' conditions.
+bool JumpThreading::ProcessSwitchOnDuplicateCond(BasicBlock *PredBB,
+ BasicBlock *DestBB) {
+ // Can't thread edge to self.
+ if (PredBB == DestBB)
+ return false;
+
+
+ SwitchInst *PredSI = cast<SwitchInst>(PredBB->getTerminator());
+ SwitchInst *DestSI = cast<SwitchInst>(DestBB->getTerminator());
+
+ // There are a variety of optimizations that we can potentially do on these
+ // blocks: we order them from most to least preferable.
+
+ // If DESTBB *just* contains the switch, then we can forward edges from PREDBB
+ // directly to their destination. This does not introduce *any* code size
+ // growth. Skip debug info first.
+ BasicBlock::iterator BBI = DestBB->begin();
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+
+ // FIXME: Thread if it just contains a PHI.
+ if (isa<SwitchInst>(BBI)) {
+ bool MadeChange = false;
+ // Ignore the default edge for now.
+ for (unsigned i = 1, e = DestSI->getNumSuccessors(); i != e; ++i) {
+ ConstantInt *DestVal = DestSI->getCaseValue(i);
+ BasicBlock *DestSucc = DestSI->getSuccessor(i);
+
+ // Okay, DestSI has a case for 'DestVal' that goes to 'DestSucc'. See if
+ // PredSI has an explicit case for it. If so, forward. If it is covered
+ // by the default case, we can't update PredSI.
+ unsigned PredCase = PredSI->findCaseValue(DestVal);
+ if (PredCase == 0) continue;
+
+ // If PredSI doesn't go to DestBB on this value, then it won't reach the
+ // case on this condition.
+ if (PredSI->getSuccessor(PredCase) != DestBB &&
+ DestSI->getSuccessor(i) != DestBB)
+ continue;
+
+ // Otherwise, we're safe to make the change. Make sure that the edge from
+ // DestSI to DestSucc is not critical and has no PHI nodes.
+ DOUT << "FORWARDING EDGE " << *DestVal << " FROM: " << *PredSI;
+ DOUT << "THROUGH: " << *DestSI;
+
+ // If the destination has PHI nodes, just split the edge for updating
+ // simplicity.
+ if (isa<PHINode>(DestSucc->begin()) && !DestSucc->getSinglePredecessor()) {
+ SplitCriticalEdge(DestSI, i, this);
+ DestSucc = DestSI->getSuccessor(i);
+ }
+ FoldSingleEntryPHINodes(DestSucc);
+ PredSI->setSuccessor(PredCase, DestSucc);
+ MadeChange = true;
+ }
+
+ if (MadeChange)
+ return true;
+ }
+
+ return false;
+}
+
+
+/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+/// load instruction, eliminate it by replacing it with a PHI node. This is an
+/// important optimization that encourages jump threading, and needs to be run
+/// interlaced with other jump threading tasks.
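+///
+/// A minimal sketch (hypothetical IR):
+///   pred1: %v = load i32* %P ... br label %merge
+///   pred2: ... br label %merge ; %P not loaded here
+///   merge: %x = load i32* %P ; partially redundant
+/// A reload %x.pr is inserted at the end of pred2, and the load in merge is
+/// replaced by %x = phi i32 [ %v, %pred1 ], [ %x.pr, %pred2 ].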
+bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+ // Don't hack volatile loads.
+ if (LI->isVolatile()) return false;
+
+ // If the load is defined in a block with exactly one predecessor, it can't be
+ // partially redundant.
+ BasicBlock *LoadBB = LI->getParent();
+ if (LoadBB->getSinglePredecessor())
+ return false;
+
+ Value *LoadedPtr = LI->getOperand(0);
+
+ // If the loaded operand is defined in the LoadBB, it can't be available.
+ // FIXME: Could do PHI translation, that would be fun :)
+ if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
+ if (PtrOp->getParent() == LoadBB)
+ return false;
+
+ // Scan a few instructions up from the load, to see if it is obviously live at
+ // the entry to its block.
+ BasicBlock::iterator BBIt = LI;
+
+ if (Value *AvailableVal = FindAvailableLoadedValue(LoadedPtr, LoadBB,
+ BBIt, 6)) {
+ // If the value of the load is locally available within the block, just use
+ // it. This frequently occurs for reg2mem'd allocas.
+ //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
+
+ // If the returned value is the load itself, replace with an undef. This can
+ // only happen in dead loops.
+ if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+ LI->replaceAllUsesWith(AvailableVal);
+ LI->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, if we scanned the whole block and got to the top of the block,
+ // we know the block is locally transparent to the load. If not, something
+ // might clobber its value.
+ if (BBIt != LoadBB->begin())
+ return false;
+
+
+ SmallPtrSet<BasicBlock*, 8> PredsScanned;
+ typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+ AvailablePredsTy AvailablePreds;
+ BasicBlock *OneUnavailablePred = 0;
+
+ // If we got here, the loaded value is transparent through to the start of the
+ // block. Check to see if it is available in any of the predecessor blocks.
+ for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+ PI != PE; ++PI) {
+ BasicBlock *PredBB = *PI;
+
+ // If we already scanned this predecessor, skip it.
+ if (!PredsScanned.insert(PredBB))
+ continue;
+
+ // Scan the predecessor to see if the value is available in the pred.
+ BBIt = PredBB->end();
+ Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6);
+ if (!PredAvailable) {
+ OneUnavailablePred = PredBB;
+ continue;
+ }
+
+ // If so, this load is partially redundant. Remember this info so that we
+ // can create a PHI node.
+ AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+ }
+
+ // If the loaded value isn't available in any predecessor, it isn't partially
+ // redundant.
+ if (AvailablePreds.empty()) return false;
+
+ // Okay, the loaded value is available in at least one (and maybe all!)
+ // predecessors. If the value is unavailable in more than one unique
+ // predecessor, we want to insert a merge block for those common predecessors.
+ // This ensures that we only have to insert one reload, thus not increasing
+ // code size.
+ BasicBlock *UnavailablePred = 0;
+
+ // If there is exactly one predecessor where the value is unavailable, the
+ // already computed 'OneUnavailablePred' block is it. If it ends in an
+ // unconditional branch, we know that it isn't a critical edge.
+ if (PredsScanned.size() == AvailablePreds.size()+1 &&
+ OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+ UnavailablePred = OneUnavailablePred;
+ } else if (PredsScanned.size() != AvailablePreds.size()) {
+ // Otherwise, we had multiple unavailable predecessors or we had a critical
+ // edge from the one.
+ SmallVector<BasicBlock*, 8> PredsToSplit;
+ SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+ for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i)
+ AvailablePredSet.insert(AvailablePreds[i].first);
+
+ // Add all the unavailable predecessors to the PredsToSplit list.
+ for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+ PI != PE; ++PI)
+ if (!AvailablePredSet.count(*PI))
+ PredsToSplit.push_back(*PI);
+
+ // Split them out to their own block.
+ UnavailablePred =
+ SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(),
+ "thread-split", this);
+ }
+
+ // If the value isn't available in all predecessors, then there will be
+ // exactly one where it isn't available. Insert a load on that edge and add
+ // it to the AvailablePreds list.
+ if (UnavailablePred) {
+ assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+ "Can't handle critical edge here!");
+ Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr",
+ UnavailablePred->getTerminator());
+ AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+ }
+
+ // Now we know that each predecessor of this block has a value in
+ // AvailablePreds, sort them for efficient access as we're walking the preds.
+ array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+
+ // Create a PHI node at the start of the block for the PRE'd load value.
+ PHINode *PN = PHINode::Create(LI->getType(), "", LoadBB->begin());
+ PN->takeName(LI);
+
+ // Insert new entries into the PHI for each predecessor. A single block may
+ // have multiple entries here.
+ for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E;
+ ++PI) {
+ AvailablePredsTy::iterator I =
+ std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+ std::make_pair(*PI, (Value*)0));
+
+ assert(I != AvailablePreds.end() && I->first == *PI &&
+ "Didn't find entry for predecessor!");
+
+ PN->addIncoming(I->second, I->first);
+ }
+
+ //cerr << "PRE: " << *LI << *PN << "\n";
+
+ LI->replaceAllUsesWith(PN);
+ LI->eraseFromParent();
+
+ return true;
+}
+
+
+/// ProcessJumpOnPHI - We have a conditional branch or switch on a PHI node in
+/// the current block. See if there are any simplifications we can do based on
+/// inputs to the phi node.
+///
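+/// A small sketch (hypothetical IR): given
+///   %cond = phi i1 [ true, %A ], [ %x, %B ]
+///   br i1 %cond, label %T, label %F
+/// the edge from %A is known to reach %T, so %A can be threaded there
+/// directly, bypassing this block's branch.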
+bool JumpThreading::ProcessJumpOnPHI(PHINode *PN) {
+ // See if the phi node has any constant values. If so, we can determine where
+ // the corresponding predecessor will branch.
+ ConstantInt *PredCst = 0;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if ((PredCst = dyn_cast<ConstantInt>(PN->getIncomingValue(i))))
+ break;
+
+ // If no incoming value has a constant, we don't know the destination of any
+ // predecessors.
+ if (PredCst == 0)
+ return false;
+
+ // See if the cost of duplicating this block is low enough.
+ BasicBlock *BB = PN->getParent();
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+ if (JumpThreadCost > Threshold) {
+ DOUT << " Not threading BB '" << BB->getNameStart()
+ << "' - Cost is too high: " << JumpThreadCost << "\n";
+ return false;
+ }
+
+ // If so, we can actually do this threading. Merge any common predecessors
+ // that will act the same.
+ BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+
+ // Next, figure out which successor we are threading to.
+ BasicBlock *SuccBB;
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+ SuccBB = BI->getSuccessor(PredCst == ConstantInt::getFalse());
+ else {
+ SwitchInst *SI = cast<SwitchInst>(BB->getTerminator());
+ SuccBB = SI->getSuccessor(SI->findCaseValue(PredCst));
+ }
+
+ // Ok, try to thread it!
+ return ThreadEdge(BB, PredBB, SuccBB, JumpThreadCost);
+}
+
+/// ProcessBranchOnLogical - PN's basic block contains a conditional branch
+/// whose condition is an AND/OR where one side is PN. If PN has constant
+/// operands that permit us to evaluate the condition for some operand, thread
+/// through the block. For example with:
+/// br (and X, phi(Y, Z, false))
+/// the predecessor corresponding to the 'false' will always jump to the false
+/// destination of the branch.
+///
+bool JumpThreading::ProcessBranchOnLogical(Value *V, BasicBlock *BB,
+ bool isAnd) {
+ // If this is a binary operator tree of the same AND/OR opcode, check the
+ // LHS/RHS.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+ if ((isAnd && BO->getOpcode() == Instruction::And) ||
+ (!isAnd && BO->getOpcode() == Instruction::Or)) {
+ if (ProcessBranchOnLogical(BO->getOperand(0), BB, isAnd))
+ return true;
+ if (ProcessBranchOnLogical(BO->getOperand(1), BB, isAnd))
+ return true;
+ }
+
+ // If this isn't a PHI node, we can't handle it.
+ PHINode *PN = dyn_cast<PHINode>(V);
+ if (!PN || PN->getParent() != BB) return false;
+
+ // We can only do the simplification for phi nodes of 'false' with AND or
+ // 'true' with OR. See if we have any entries in the phi for this.
+ unsigned PredNo = ~0U;
+ ConstantInt *PredCst = ConstantInt::get(Type::Int1Ty, !isAnd);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (PN->getIncomingValue(i) == PredCst) {
+ PredNo = i;
+ break;
+ }
+ }
+
+ // If no match, bail out.
+ if (PredNo == ~0U)
+ return false;
+
+ // See if the cost of duplicating this block is low enough.
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+ if (JumpThreadCost > Threshold) {
+ DOUT << " Not threading BB '" << BB->getNameStart()
+ << "' - Cost is too high: " << JumpThreadCost << "\n";
+ return false;
+ }
+
+ // If so, we can actually do this threading. Merge any common predecessors
+ // that will act the same.
+ BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+
+ // Next, figure out which successor we are threading to. If this was an AND,
+ // the constant must be FALSE, and we must be targeting the 'false' block.
+ // If this is an OR, the constant must be TRUE, and we must be targeting the
+ // 'true' block.
+ BasicBlock *SuccBB = BB->getTerminator()->getSuccessor(isAnd);
+
+ // Ok, try to thread it!
+ return ThreadEdge(BB, PredBB, SuccBB, JumpThreadCost);
+}
+
+/// ProcessBranchOnCompare - We found a branch on a comparison between a phi
+/// node and a constant. If the PHI node contains any constants as inputs, we
+/// can fold the compare for that edge and thread through it.
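+///
+/// A small sketch (hypothetical IR): given
+///   %p = phi i32 [ 42, %A ], [ %x, %B ]
+///   %c = icmp eq i32 %p, 42
+///   br i1 %c, label %T, label %F
+/// the compare folds to true along the edge from %A, so that edge can be
+/// threaded straight to %T.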
+bool JumpThreading::ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB) {
+ PHINode *PN = cast<PHINode>(Cmp->getOperand(0));
+ Constant *RHS = cast<Constant>(Cmp->getOperand(1));
+
+ // If the phi isn't in the current block, an incoming edge to this block
+ // doesn't control the destination.
+ if (PN->getParent() != BB)
+ return false;
+
+ // We can do this simplification if any comparisons fold to true or false.
+ // See if any do.
+ Constant *PredCst = 0;
+ bool TrueDirection = false;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ PredCst = dyn_cast<Constant>(PN->getIncomingValue(i));
+ if (PredCst == 0) continue;
+
+ Constant *Res;
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Cmp))
+ Res = ConstantExpr::getICmp(ICI->getPredicate(), PredCst, RHS);
+ else
+ Res = ConstantExpr::getFCmp(cast<FCmpInst>(Cmp)->getPredicate(),
+ PredCst, RHS);
+ // If this folded to a constant int, we know the direction of this edge.
+ if (ConstantInt *ResC = dyn_cast<ConstantInt>(Res)) {
+ TrueDirection = ResC->getZExtValue();
+ break;
+ }
+ // If this folded to undef, just go the false way.
+ if (isa<UndefValue>(Res)) {
+ TrueDirection = false;
+ break;
+ }
+
+ // Otherwise, we can't fold this input.
+ PredCst = 0;
+ }
+
+ // If no match, bail out.
+ if (PredCst == 0)
+ return false;
+
+ // See if the cost of duplicating this block is low enough.
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+ if (JumpThreadCost > Threshold) {
+ DOUT << " Not threading BB '" << BB->getNameStart()
+ << "' - Cost is too high: " << JumpThreadCost << "\n";
+ return false;
+ }
+
+ // If so, we can actually do this threading. Merge any common predecessors
+ // that will act the same.
+ BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
+
+ // Next, get our successor.
+ BasicBlock *SuccBB = BB->getTerminator()->getSuccessor(!TrueDirection);
+
+ // Ok, try to thread it!
+ return ThreadEdge(BB, PredBB, SuccBB, JumpThreadCost);
+}
+
+
+/// ThreadEdge - We have decided that it is safe and profitable to thread an
+/// edge from PredBB to SuccBB across BB. Transform the IR to reflect this
+/// change.
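+///
+/// Schematically (hypothetical blocks): PredBB -> BB -> {SuccBB, Other}
+/// becomes PredBB -> BB.thread -> SuccBB, where BB.thread is a copy of BB
+/// and BB itself stays in place for its remaining predecessors.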
+bool JumpThreading::ThreadEdge(BasicBlock *BB, BasicBlock *PredBB,
+ BasicBlock *SuccBB, unsigned JumpThreadCost) {
+
+ // If threading to the same block as we come from, we would loop forever.
+ if (SuccBB == BB) {
+ DOUT << " Not threading across BB '" << BB->getNameStart()
+ << "' - would thread to self!\n";
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB)) {
+ DOUT << " Not threading from '" << PredBB->getNameStart()
+ << "' across loop header BB '" << BB->getNameStart()
+ << "' to dest BB '" << SuccBB->getNameStart()
+ << "' - it might create an irreducible loop!\n";
+ return false;
+ }
+
+ // And finally, do it!
+ DOUT << " Threading edge from '" << PredBB->getNameStart() << "' to '"
+ << SuccBB->getNameStart() << "' with cost: " << JumpThreadCost
+ << ", across block:\n "
+ << *BB << "\n";
+
+ // Jump threading cannot update SSA properties correctly if the values
+ // defined in the duplicated block are used outside of the block itself. For
+ // this reason, we spill all values that are used outside of BB to the stack.
+ for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+ if (!I->isUsedOutsideOfBlock(BB))
+ continue;
+
+ // We found a use of I outside of BB. Create a new stack slot to
+ // break this inter-block usage pattern.
+ DemoteRegToStack(*I);
+ }
+
+ // We are going to have to map operands from the original block, BB, to the
+ // new copy of the block, NewBB. If there are PHI nodes in BB, evaluate them
+ // to account for entry from PredBB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock *NewBB =
+ BasicBlock::Create(BB->getName()+".thread", BB->getParent(), BB);
+ NewBB->moveAfter(PredBB);
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; !isa<TerminatorInst>(BI); ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getNameStart());
+ NewBB->getInstList().push_back(New);
+ ValueMapping[BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i)))
+ if (Value *Remapped = ValueMapping[Inst])
+ New->setOperand(i, Remapped);
+ }
+
+ // We didn't copy the terminator from BB over to NewBB, because there is now
+ // an unconditional jump to SuccBB. Insert the unconditional jump.
+ BranchInst::Create(SuccBB, NewBB);
+
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+ // PHI nodes for NewBB now.
+ for (BasicBlock::iterator PNI = SuccBB->begin(); isa<PHINode>(PNI); ++PNI) {
+ PHINode *PN = cast<PHINode>(PNI);
+ // Ok, we have a PHI node. Figure out what the incoming value was for the
+ // DestBlock.
+ Value *IV = PN->getIncomingValueForBlock(BB);
+
+ // Remap the value if necessary.
+ if (Instruction *Inst = dyn_cast<Instruction>(IV))
+ if (Value *MappedIV = ValueMapping[Inst])
+ IV = MappedIV;
+ PN->addIncoming(IV, NewBB);
+ }
+
+ // Ok, NewBB is good to go. Update the terminator of PredBB to jump to
+ // NewBB instead of BB. This eliminates predecessors from BB, which requires
+ // us to simplify any PHI nodes in BB.
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // At this point, the IR is fully up to date and consistent. Do a quick scan
+ // over the new instructions and zap any that are constants or dead. This
+ // frequently happens because of phi translation.
+ BI = NewBB->begin();
+ for (BasicBlock::iterator E = NewBB->end(); BI != E; ) {
+ Instruction *Inst = BI++;
+ if (Constant *C = ConstantFoldInstruction(Inst, TD)) {
+ Inst->replaceAllUsesWith(C);
+ Inst->eraseFromParent();
+ continue;
+ }
+
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+
+ // Threaded an edge!
+ ++NumThreads;
+ return true;
+}
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 0000000..1021469
--- /dev/null
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,885 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible. It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe. This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+// 1. Moving loop invariant loads and calls out of loops. If we can determine
+// that a load or call inside of a loop never aliases anything stored to,
+// we can hoist it or sink it like any other instruction.
+// 2. Scalar Promotion of Memory - If there is a store instruction inside of
+// the loop, we try to move the store to happen AFTER the loop instead of
+// inside of the loop. This can only happen if a few conditions are true:
+// A. The pointer stored through is loop invariant
+// B. There are no stores or loads in the loop which _may_ alias the
+// pointer. There are no calls in the loop which mod/ref the pointer.
+// If these conditions are true, we can promote the loads and stores in the
+// loop of the pointer to use a temporary alloca'd variable. We then use
+// the mem2reg functionality to construct the appropriate SSA form for the
+// variable.
+//
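+// As an illustrative sketch of scalar promotion (hypothetical C, not part
+// of the pass):
+//
+//   for (...) { *P += A[i]; } // *P must-aliased, P loop invariant
+//
+// is transformed into:
+//
+//   tmp = *P;
+//   for (...) { tmp += A[i]; }
+//   *P = tmp;
+//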
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "licm"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumSunk , "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted , "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted , "Number of memory locations promoted to registers");
+
+static cl::opt<bool>
+DisablePromotion("disable-licm-promotion", cl::Hidden,
+ cl::desc("Disable memory promotion in LICM pass"));
+
+// This feature is currently disabled by default because CodeGen is not yet
+// capable of rematerializing these constants in PIC mode, so it can lead to
+// degraded performance. Compile test/CodeGen/X86/remat-constant.ll with
+// -relocation-model=pic to see an example of this.
+static cl::opt<bool>
+EnableLICMConstantMotion("enable-licm-constant-variables", cl::Hidden,
+ cl::desc("Enable hoisting/sinking of constant "
+ "global variables"));
+
+namespace {
+ struct VISIBILITY_HIDDEN LICM : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ LICM() : LoopPass(&ID) {}
+
+ virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ ///
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<DominanceFrontier>(); // For scalar promotion (mem2reg)
+ AU.addRequired<AliasAnalysis>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.addPreserved<DominanceFrontier>();
+ }
+
+ bool doFinalization() {
+ // Free the values stored in the map
+ for (std::map<Loop *, AliasSetTracker *>::iterator
+ I = LoopToAliasMap.begin(), E = LoopToAliasMap.end(); I != E; ++I)
+ delete I->second;
+
+ LoopToAliasMap.clear();
+ return false;
+ }
+
+ private:
+ // Various analyses that we use...
+ AliasAnalysis *AA; // Current AliasAnalysis information
+ LoopInfo *LI; // Current LoopInfo
+ DominatorTree *DT; // Dominator Tree for the current Loop...
+ DominanceFrontier *DF; // Current Dominance Frontier
+
+ // State that is updated as we process loops
+ bool Changed; // Set to true when we change anything.
+ BasicBlock *Preheader; // The preheader block of the current loop...
+ Loop *CurLoop; // The current loop we are working on...
+ AliasSetTracker *CurAST; // AliasSet information for the current loop...
+ std::map<Loop *, AliasSetTracker *> LoopToAliasMap;
+
+ /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+ void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L);
+
+ /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+ /// set.
+ void deleteAnalysisValue(Value *V, Loop *L);
+
+ /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+ /// dominated by the specified block, and that are in the current loop) in
+ /// reverse depth first order w.r.t the DominatorTree. This allows us to
+ /// visit uses before definitions, allowing us to sink a loop body in one
+ /// pass without iteration.
+ ///
+ void SinkRegion(DomTreeNode *N);
+
+ /// HoistRegion - Walk the specified region of the CFG (defined by all
+ /// blocks dominated by the specified block, and that are in the current
+ /// loop) in depth first order w.r.t the DominatorTree. This allows us to
+ /// visit definitions before uses, allowing us to hoist a loop body in one
+ /// pass without iteration.
+ ///
+ void HoistRegion(DomTreeNode *N);
+
+ /// inSubLoop - Little predicate that returns true if the specified basic
+ /// block is in a subloop of the current one, not the current one itself.
+ ///
+ bool inSubLoop(BasicBlock *BB) {
+ assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+ for (Loop::iterator I = CurLoop->begin(), E = CurLoop->end(); I != E; ++I)
+ if ((*I)->contains(BB))
+ return true; // A subloop actually contains this block!
+ return false;
+ }
+
+ /// isExitBlockDominatedByBlockInLoop - This method checks to see if the
+ /// specified exit block of the loop is dominated by the specified block
+ /// that is in the body of the loop. We use these constraints to
+ /// dramatically limit the amount of the dominator tree that needs to be
+ /// searched.
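+ ///
+ /// For instance (hypothetical blocks): if the idom chain of exit block E
+ /// is E -> B -> header, the walk from E finds B before the loop header,
+ /// so B dominates E; if the header is reached first, it does not.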
+ bool isExitBlockDominatedByBlockInLoop(BasicBlock *ExitBlock,
+ BasicBlock *BlockInLoop) const {
+ // If the block in the loop is the loop header, it must be dominated!
+ BasicBlock *LoopHeader = CurLoop->getHeader();
+ if (BlockInLoop == LoopHeader)
+ return true;
+
+ DomTreeNode *BlockInLoopNode = DT->getNode(BlockInLoop);
+ DomTreeNode *IDom = DT->getNode(ExitBlock);
+
+ // Because the exit block is not in the loop, we know we have to get _at
+ // least_ its immediate dominator.
+ do {
+ // Get next Immediate Dominator.
+ IDom = IDom->getIDom();
+
+ // If we have reached the header of the loop, then the instruction's block
+ // did not dominate the exit node, so we can't hoist it.
+ if (IDom->getBlock() == LoopHeader)
+ return false;
+
+ } while (IDom != BlockInLoopNode);
+
+ return true;
+ }
+
+ /// sink - When an instruction is found to only be used outside of the loop,
+ /// this function moves it to the exit blocks and patches up SSA form as
+ /// needed.
+ ///
+ void sink(Instruction &I);
+
+ /// hoist - When an instruction is found to use only loop invariant operands
+ /// and it is safe to hoist, this function is called to do the dirty work.
+ ///
+ void hoist(Instruction &I);
+
+ /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it
+ /// is not a trapping instruction or if it is a trapping instruction and is
+ /// guaranteed to execute.
+ ///
+ bool isSafeToExecuteUnconditionally(Instruction &I);
+
+ /// pointerInvalidatedByLoop - Return true if the body of this loop may
+ /// store into the memory location pointed to by V.
+ ///
+ bool pointerInvalidatedByLoop(Value *V, unsigned Size) {
+ // Check to see if any of the basic blocks in CurLoop invalidate *V.
+ return CurAST->getAliasSetForPointer(V, Size).isMod();
+ }
+
+ bool canSinkOrHoistInst(Instruction &I);
+ bool isLoopInvariantInst(Instruction &I);
+ bool isNotUsedInLoop(Instruction &I);
+
+ /// PromoteValuesInLoop - Look at the stores in the loop and promote as many
+ /// to scalars as we can.
+ ///
+ void PromoteValuesInLoop();
+
+ /// FindPromotableValuesInLoop - Check the current loop for stores to
+ /// must-aliased pointers that are never loaded or stored through may-alias
+ /// relationships. For each one found, create an alloca for the value, add
+ /// it to the PromotedValues list, and record the mapping from value to
+ /// alloca.
+ ///
+ void FindPromotableValuesInLoop(
+ std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues,
+ std::map<Value*, AllocaInst*> &Val2AlMap);
+ };
+}
+
+char LICM::ID = 0;
+static RegisterPass<LICM> X("licm", "Loop Invariant Code Motion");
+
+Pass *llvm::createLICMPass() { return new LICM(); }
+
+/// Hoist expressions out of the specified loop. Note, alias info for inner
+/// loop is not preserved so it is not a good idea to run LICM multiple
+/// times on one loop.
+///
+bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+ Changed = false;
+
+ // Get our Loop and Alias Analysis information...
+ LI = &getAnalysis<LoopInfo>();
+ AA = &getAnalysis<AliasAnalysis>();
+ DF = &getAnalysis<DominanceFrontier>();
+ DT = &getAnalysis<DominatorTree>();
+
+ CurAST = new AliasSetTracker(*AA);
+ // Collect Alias info from subloops
+ for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end();
+ LoopItr != LoopItrE; ++LoopItr) {
+ Loop *InnerL = *LoopItr;
+ AliasSetTracker *InnerAST = LoopToAliasMap[InnerL];
+ assert(InnerAST && "Where is my AST?");
+
+ // What if the inner loop was modified by other passes?
+ CurAST->add(*InnerAST);
+ }
+
+ CurLoop = L;
+
+ // Get the preheader block to move instructions into...
+ Preheader = L->getLoopPreheader();
+ assert(Preheader && "Preheader insertion pass guarantees we have a preheader!");
+
+ // Loop over the body of this loop, looking for calls, invokes, and stores.
+ // Because subloops have already been incorporated into AST, we skip blocks in
+ // subloops.
+ //
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops...
+ CurAST->add(*BB); // Incorporate the specified basic block
+ }
+
+ // We want to visit all of the instructions in this loop... that are not part
+ // of our subloops (they have already had their invariants hoisted out of
+ // their loop, into this loop, so there is no need to process the BODIES of
+ // the subloops).
+ //
+ // Traverse the body of the loop in depth first order on the dominator tree so
+ // that we are guaranteed to see definitions before we see uses. This allows
+ // us to sink instructions in one pass, without iteration. After sinking
+ // instructions, we perform another pass to hoist them out of the loop.
+ //
+ SinkRegion(DT->getNode(L->getHeader()));
+ HoistRegion(DT->getNode(L->getHeader()));
+
+ // Now that all loop invariants have been removed from the loop, promote any
+ // memory references to scalars that we can...
+ if (!DisablePromotion)
+ PromoteValuesInLoop();
+
+ // Clear out the loop state information for the next iteration.
+ CurLoop = 0;
+ Preheader = 0;
+
+ LoopToAliasMap[L] = CurAST;
+ return Changed;
+}
+
+/// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in
+/// reverse depth first order w.r.t the DominatorTree. This allows us to visit
+/// uses before definitions, allowing us to sink a loop body in one pass without
+/// iteration.
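+/// For example (hypothetical values): given %a = ...; %b = use(%a), where %b
+/// is only used outside the loop, visiting %b first sinks it, which leaves
+/// %a unused inside the loop and sinkable in the same pass.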
+///
+void LICM::SinkRegion(DomTreeNode *N) {
+ assert(N != 0 && "Null dominator tree node?");
+ BasicBlock *BB = N->getBlock();
+
+ // If this subregion is not in the top level loop at all, exit.
+ if (!CurLoop->contains(BB)) return;
+
+ // We are processing blocks in reverse dfo, so process children first...
+ const std::vector<DomTreeNode*> &Children = N->getChildren();
+ for (unsigned i = 0, e = Children.size(); i != e; ++i)
+ SinkRegion(Children[i]);
+
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (inSubLoop(BB)) return;
+
+ for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
+ Instruction &I = *--II;
+
+ // Check to see if we can sink this instruction to the exit blocks
+ // of the loop. We can do this if all of the users of the instruction are
+ // outside of the loop. In this case, it doesn't even matter if the
+ // operands of the instruction are loop invariant.
+ //
+ if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) {
+ ++II;
+ sink(I);
+ }
+ }
+}
+
+
+/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in depth
+/// first order w.r.t the DominatorTree. This allows us to visit definitions
+/// before uses, allowing us to hoist a loop body in one pass without iteration.
+///
+void LICM::HoistRegion(DomTreeNode *N) {
+ assert(N != 0 && "Null dominator tree node?");
+ BasicBlock *BB = N->getBlock();
+
+ // If this subregion is not in the top level loop at all, exit.
+ if (!CurLoop->contains(BB)) return;
+
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (!inSubLoop(BB))
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
+ Instruction &I = *II++;
+
+ // Try hoisting the instruction out to the preheader. We can only do this
+ // if all of the operands of the instruction are loop invariant and if it
+ // is safe to hoist the instruction.
+ //
+ if (isLoopInvariantInst(I) && canSinkOrHoistInst(I) &&
+ isSafeToExecuteUnconditionally(I))
+ hoist(I);
+ }
+
+ const std::vector<DomTreeNode*> &Children = N->getChildren();
+ for (unsigned i = 0, e = Children.size(); i != e; ++i)
+ HoistRegion(Children[i]);
+}
+
+/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
+/// instruction.
+///
+bool LICM::canSinkOrHoistInst(Instruction &I) {
+ // Loads have extra constraints we have to verify before we can hoist them.
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (LI->isVolatile())
+ return false; // Don't hoist volatile loads!
+
+ // Loads from constant memory are always safe to move, even if they end up
+ // in the same alias set as something that ends up being modified.
+ if (EnableLICMConstantMotion &&
+ AA->pointsToConstantMemory(LI->getOperand(0)))
+ return true;
+
+ // Don't hoist loads which have may-aliased stores in loop.
+ unsigned Size = 0;
+ if (LI->getType()->isSized())
+ Size = AA->getTargetData().getTypeStoreSize(LI->getType());
+ return !pointerInvalidatedByLoop(LI->getOperand(0), Size);
+ } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // Handle obvious cases efficiently.
+ AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI);
+ if (Behavior == AliasAnalysis::DoesNotAccessMemory)
+ return true;
+ else if (Behavior == AliasAnalysis::OnlyReadsMemory) {
+ // If this call only reads from memory and there are no writes to memory
+ // in the loop, we can hoist or sink the call as appropriate.
+ bool FoundMod = false;
+ for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+ I != E; ++I) {
+ AliasSet &AS = *I;
+ if (!AS.isForwardingAliasSet() && AS.isMod()) {
+ FoundMod = true;
+ break;
+ }
+ }
+ if (!FoundMod) return true;
+ }
+
+ // FIXME: This should use mod/ref information to see if we can hoist or sink
+ // the call.
+
+ return false;
+ }
+
+ // Otherwise these instructions are hoistable/sinkable
+ return isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+ isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+ isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I);
+}
+
+/// isNotUsedInLoop - Return true if the only users of this instruction are
+/// outside of the loop. If this is true, we can sink the instruction to the
+/// exit blocks of the loop.
+///
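+/// Note the PHI subtlety: a use by a PHI node counts as a use in the
+/// corresponding incoming block. E.g., for a hypothetical
+/// %p = phi [ %I, %latch ] in an exit block, %latch being inside the
+/// loop means I is still considered used in the loop.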
+bool LICM::isNotUsedInLoop(Instruction &I) {
+ for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ // PHI node uses occur in predecessor blocks!
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == &I)
+ if (CurLoop->contains(PN->getIncomingBlock(i)))
+ return false;
+ } else if (CurLoop->contains(User->getParent())) {
+ return false;
+ }
+ }
+ return true;
+}
+
+
+/// isLoopInvariantInst - Return true if all operands of this instruction are
+/// loop invariant. We also filter out non-hoistable instructions here just for
+/// efficiency.
+///
+bool LICM::isLoopInvariantInst(Instruction &I) {
+ // The instruction is loop invariant if all of its operands are loop-invariant
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+ if (!CurLoop->isLoopInvariant(I.getOperand(i)))
+ return false;
+
+ // If we got this far, the instruction is loop invariant!
+ return true;
+}
+
+/// sink - When an instruction is found to only be used outside of the loop,
+/// this function moves it to the exit blocks and patches up SSA form as needed.
+/// This method is guaranteed to remove the original instruction from its
+/// position, and may either delete it or move it to outside of the loop.
+///
+void LICM::sink(Instruction &I) {
+ DOUT << "LICM sinking instruction: " << I;
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ CurLoop->getExitBlocks(ExitBlocks);
+
+ if (isa<LoadInst>(I)) ++NumMovedLoads;
+ else if (isa<CallInst>(I)) ++NumMovedCalls;
+ ++NumSunk;
+ Changed = true;
+
+ // The case where there is only a single exit node of this loop is common
+ // enough that we handle it as a special (more efficient) case. It is more
+ // efficient to handle because there are no PHI nodes that need to be placed.
+ if (ExitBlocks.size() == 1) {
+ if (!isExitBlockDominatedByBlockInLoop(ExitBlocks[0], I.getParent())) {
+ // Instruction is not used, just delete it.
+ CurAST->deleteValue(&I);
+ if (!I.use_empty()) // If I has users in unreachable blocks, eliminate.
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ I.eraseFromParent();
+ } else {
+ // Move the instruction to the start of the exit block, after any PHI
+ // nodes in it.
+ I.removeFromParent();
+
+ BasicBlock::iterator InsertPt = ExitBlocks[0]->getFirstNonPHI();
+ ExitBlocks[0]->getInstList().insert(InsertPt, &I);
+ }
+ } else if (ExitBlocks.empty()) {
+ // The instruction is actually dead if there ARE NO exit blocks.
+ CurAST->deleteValue(&I);
+ if (!I.use_empty()) // If I has users in unreachable blocks, eliminate.
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ I.eraseFromParent();
+ } else {
+ // Otherwise, if we have multiple exits, use the PromoteMem2Reg function to
+ // do all of the hard work of inserting PHI nodes as necessary. We convert
+ // the value into a stack object to get it to do this.
+
+ // Firstly, we create a stack object to hold the value...
+ AllocaInst *AI = 0;
+
+ if (I.getType() != Type::VoidTy) {
+ AI = new AllocaInst(I.getType(), 0, I.getName(),
+ I.getParent()->getParent()->getEntryBlock().begin());
+ CurAST->add(AI);
+ }
+
+ // Secondly, insert load instructions for each use of the instruction
+ // outside of the loop.
+ while (!I.use_empty()) {
+ Instruction *U = cast<Instruction>(I.use_back());
+
+ // If the user is a PHI Node, we actually have to insert load instructions
+ // in all predecessor blocks, not in the PHI block itself!
+ if (PHINode *UPN = dyn_cast<PHINode>(U)) {
+ // Only insert into each predecessor once, so that we don't have
+ // different incoming values from the same block!
+ std::map<BasicBlock*, Value*> InsertedBlocks;
+ for (unsigned i = 0, e = UPN->getNumIncomingValues(); i != e; ++i)
+ if (UPN->getIncomingValue(i) == &I) {
+ BasicBlock *Pred = UPN->getIncomingBlock(i);
+ Value *&PredVal = InsertedBlocks[Pred];
+ if (!PredVal) {
+ // Insert a new load instruction right before the terminator in
+ // the predecessor block.
+ PredVal = new LoadInst(AI, "", Pred->getTerminator());
+ CurAST->add(cast<LoadInst>(PredVal));
+ }
+
+ UPN->setIncomingValue(i, PredVal);
+ }
+
+ } else {
+ LoadInst *L = new LoadInst(AI, "", U);
+ U->replaceUsesOfWith(&I, L);
+ CurAST->add(L);
+ }
+ }
+
+ // Thirdly, insert a copy of the instruction in each exit block of the loop
+ // that is dominated by the instruction, storing the result into the memory
+ // location. Be careful not to insert the instruction into any particular
+ // basic block more than once.
+ std::set<BasicBlock*> InsertedBlocks;
+ BasicBlock *InstOrigBB = I.getParent();
+
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = ExitBlocks[i];
+
+ if (isExitBlockDominatedByBlockInLoop(ExitBlock, InstOrigBB)) {
+ // If we haven't already processed this exit block, do so now.
+ if (InsertedBlocks.insert(ExitBlock).second) {
+ // Insert the code after the last PHI node...
+ BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI();
+
+ // If this is the first exit block processed, just move the original
+ // instruction, otherwise clone the original instruction and insert
+ // the copy.
+ Instruction *New;
+ if (InsertedBlocks.size() == 1) {
+ I.removeFromParent();
+ ExitBlock->getInstList().insert(InsertPt, &I);
+ New = &I;
+ } else {
+ New = I.clone();
+ CurAST->copyValue(&I, New);
+ if (!I.getName().empty())
+ New->setName(I.getName()+".le");
+ ExitBlock->getInstList().insert(InsertPt, New);
+ }
+
+ // Now that we have inserted the instruction, store it into the alloca
+ if (AI) new StoreInst(New, AI, InsertPt);
+ }
+ }
+ }
+
+ // If the instruction doesn't dominate any exit blocks, it must be dead.
+ if (InsertedBlocks.empty()) {
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ }
+
+ // Finally, promote the value to SSA form.
+ if (AI) {
+ std::vector<AllocaInst*> Allocas;
+ Allocas.push_back(AI);
+ PromoteMemToReg(Allocas, *DT, *DF, CurAST);
+ }
+ }
+}
+
+/// hoist - When an instruction is found to use only loop invariant operands
+/// and it is safe to hoist, this function is called to do the dirty work.
+///
+void LICM::hoist(Instruction &I) {
+ DOUT << "LICM hoisting to " << Preheader->getName() << ": " << I;
+
+ // Remove the instruction from its current basic block... but don't delete the
+ // instruction.
+ I.removeFromParent();
+
+ // Insert the new node in Preheader, before the terminator.
+ Preheader->getInstList().insert(Preheader->getTerminator(), &I);
+
+ if (isa<LoadInst>(I)) ++NumMovedLoads;
+ else if (isa<CallInst>(I)) ++NumMovedCalls;
+ ++NumHoisted;
+ Changed = true;
+}
+
+/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is
+/// not a trapping instruction or if it is a trapping instruction and is
+/// guaranteed to execute.
+///
+bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
+ // If it is not a trapping instruction, it is always safe to hoist.
+ if (!Inst.isTrapping()) return true;
+
+ // Otherwise we have to check to make sure that the instruction dominates all
+ // of the exit blocks. If it doesn't, then there is a path out of the loop
+ // which does not execute this instruction, so we can't hoist it.
+
+ // If the instruction is in the header block for the loop (which is very
+ // common), it is always guaranteed to dominate the exit blocks. Since this
+ // is a common case, and can save some work, check it now.
+ if (Inst.getParent() == CurLoop->getHeader())
+ return true;
+
+ // It's always safe to load from a global or alloca.
+ if (isa<LoadInst>(Inst))
+ if (isa<AllocationInst>(Inst.getOperand(0)) ||
+ isa<GlobalVariable>(Inst.getOperand(0)))
+ return true;
+
+ // Get the exit blocks for the current loop.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ CurLoop->getExitBlocks(ExitBlocks);
+
+ // For each exit block, get the DT node and walk up the DT until the
+ // instruction's basic block is found or we exit the loop.
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (!isExitBlockDominatedByBlockInLoop(ExitBlocks[i], Inst.getParent()))
+ return false;
+
+ return true;
+}
+
+
+/// PromoteValuesInLoop - Try to promote memory values to scalars by sinking
+/// stores out of the loop and moving loads to before the loop. We do this by
+/// looping over the stores in the loop, looking for stores to Must pointers
+/// which are loop invariant. We promote these memory locations to use allocas
+/// instead. These allocas can easily be raised to register values by the
+/// PromoteMem2Reg functionality.
+///
+void LICM::PromoteValuesInLoop() {
+ // PromotedValues - List of values that are promoted out of the loop. Each
+ // value has an alloca instruction for it, and a canonical version of the
+ // pointer.
+ std::vector<std::pair<AllocaInst*, Value*> > PromotedValues;
+ std::map<Value*, AllocaInst*> ValueToAllocaMap; // Map of ptr to alloca
+
+ FindPromotableValuesInLoop(PromotedValues, ValueToAllocaMap);
+ if (ValueToAllocaMap.empty()) return; // Nothing to promote.
+
+ Changed = true;
+ NumPromoted += PromotedValues.size();
+
+ std::vector<Value*> PointerValueNumbers;
+
+ // Emit a copy from the value into the alloca'd value in the loop preheader
+ TerminatorInst *LoopPredInst = Preheader->getTerminator();
+ for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) {
+ Value *Ptr = PromotedValues[i].second;
+
+ // If we are promoting a pointer value, update alias information for the
+ // inserted load.
+ Value *LoadValue = 0;
+ if (isa<PointerType>(cast<PointerType>(Ptr->getType())->getElementType())) {
+ // Locate a load or store through the pointer, and assign the same value
+ // to LI as we are loading or storing. Since we know that the value is
+ // stored in this loop, this will always succeed.
+ for (Value::use_iterator UI = Ptr->use_begin(), E = Ptr->use_end();
+ UI != E; ++UI)
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ LoadValue = LI;
+ break;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+ if (SI->getOperand(1) == Ptr) {
+ LoadValue = SI->getOperand(0);
+ break;
+ }
+ }
+ assert(LoadValue && "No store through the pointer found!");
+ PointerValueNumbers.push_back(LoadValue); // Remember this for later.
+ }
+
+ // Load from the memory we are promoting.
+ LoadInst *LI = new LoadInst(Ptr, Ptr->getName()+".promoted", LoopPredInst);
+
+ if (LoadValue) CurAST->copyValue(LoadValue, LI);
+
+ // Store into the temporary alloca.
+ new StoreInst(LI, PromotedValues[i].first, LoopPredInst);
+ }
+
+ // Scan the basic blocks in the loop, replacing uses of our pointers with
+ // uses of the allocas in question.
+ //
+ for (Loop::block_iterator I = CurLoop->block_begin(),
+ E = CurLoop->block_end(); I != E; ++I) {
+ BasicBlock *BB = *I;
+    // Rewrite all loads and stores through the promoted pointers in the block...
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+ if (LoadInst *L = dyn_cast<LoadInst>(II)) {
+ std::map<Value*, AllocaInst*>::iterator
+ I = ValueToAllocaMap.find(L->getOperand(0));
+ if (I != ValueToAllocaMap.end())
+ L->setOperand(0, I->second); // Rewrite load instruction...
+ } else if (StoreInst *S = dyn_cast<StoreInst>(II)) {
+ std::map<Value*, AllocaInst*>::iterator
+ I = ValueToAllocaMap.find(S->getOperand(1));
+ if (I != ValueToAllocaMap.end())
+ S->setOperand(1, I->second); // Rewrite store instruction...
+ }
+ }
+ }
+
+ // Now that the body of the loop uses the allocas instead of the original
+ // memory locations, insert code to copy the alloca value back into the
+ // original memory location on all exits from the loop. Note that we only
+ // want to insert one copy of the code in each exit block, though the loop may
+ // exit to the same block more than once.
+ //
+ SmallPtrSet<BasicBlock*, 16> ProcessedBlocks;
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ CurLoop->getExitBlocks(ExitBlocks);
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ if (!ProcessedBlocks.insert(ExitBlocks[i]))
+ continue;
+
+ // Copy all of the allocas into their memory locations.
+ BasicBlock::iterator BI = ExitBlocks[i]->getFirstNonPHI();
+ Instruction *InsertPos = BI;
+ unsigned PVN = 0;
+ for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) {
+ // Load from the alloca.
+ LoadInst *LI = new LoadInst(PromotedValues[i].first, "", InsertPos);
+
+ // If this is a pointer type, update alias info appropriately.
+ if (isa<PointerType>(LI->getType()))
+ CurAST->copyValue(PointerValueNumbers[PVN++], LI);
+
+ // Store into the memory we promoted.
+ new StoreInst(LI, PromotedValues[i].second, InsertPos);
+ }
+ }
+
+ // Now that we have done the deed, use the mem2reg functionality to promote
+ // all of the new allocas we just created into real SSA registers.
+ //
+ std::vector<AllocaInst*> PromotedAllocas;
+ PromotedAllocas.reserve(PromotedValues.size());
+ for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i)
+ PromotedAllocas.push_back(PromotedValues[i].first);
+ PromoteMemToReg(PromotedAllocas, *DT, *DF, CurAST);
+}
+
+/// FindPromotableValuesInLoop - Check the current loop for stores to definite
+/// (must-alias) pointers that are not loaded or stored through may-aliases
+/// and are safe for promotion. For each such value, create an alloca, add it
+/// to the PromotedValues list, and keep track of the mapping from value to
+/// alloca.
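+///
+/// For example (a sketch; 'G' is a hypothetical global), if the loop's only
+/// accesses to G are direct loads and stores, G's alias set is a must-alias
+/// set, and at least one access is guaranteed to execute, this creates an
+/// alloca "G.tmp", maps G and every pointer in its alias set to that alloca,
+/// and records the (alloca, G) pair for promotion.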
+void LICM::FindPromotableValuesInLoop(
+ std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues,
+ std::map<Value*, AllocaInst*> &ValueToAllocaMap) {
+ Instruction *FnStart = CurLoop->getHeader()->getParent()->begin()->begin();
+
+ // Loop over all of the alias sets in the tracker object.
+ for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+ I != E; ++I) {
+ AliasSet &AS = *I;
+ // We can promote this alias set if it has a store, if it is a "Must" alias
+ // set, if the pointer is loop invariant, and if we are not eliminating any
+ // volatile loads or stores.
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+ AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
+ continue;
+
+ assert(!AS.empty() &&
+ "Must alias set should have at least one pointer element in it!");
+ Value *V = AS.begin()->getValue();
+
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes.
+ {
+ bool PointerOk = true;
+ for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I)
+ if (V->getType() != I->getValue()->getType()) {
+ PointerOk = false;
+ break;
+ }
+ if (!PointerOk)
+ continue;
+ }
+
+ // It isn't safe to promote a load/store from the loop if the load/store is
+ // conditional. For example, turning:
+ //
+ // for () { if (c) *P += 1; }
+ //
+ // into:
+ //
+ // tmp = *P; for () { if (c) tmp +=1; } *P = tmp;
+ //
+ // is not safe, because *P may only be valid to access if 'c' is true.
+ //
+ // It is safe to promote P if all uses are direct load/stores and if at
+ // least one is guaranteed to be executed.
+ bool GuaranteedToExecute = false;
+ bool InvalidInst = false;
+ for (Value::use_iterator UI = V->use_begin(), UE = V->use_end();
+ UI != UE; ++UI) {
+ // Ignore instructions not in this loop.
+ Instruction *Use = dyn_cast<Instruction>(*UI);
+ if (!Use || !CurLoop->contains(Use->getParent()))
+ continue;
+
+ if (!isa<LoadInst>(Use) && !isa<StoreInst>(Use)) {
+ InvalidInst = true;
+ break;
+ }
+
+ if (!GuaranteedToExecute)
+ GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use);
+ }
+
+    // If there is a non-load/store instruction in the loop, we can't promote
+ // it. If there isn't a guaranteed-to-execute instruction, we can't
+ // promote.
+ if (InvalidInst || !GuaranteedToExecute)
+ continue;
+
+ const Type *Ty = cast<PointerType>(V->getType())->getElementType();
+ AllocaInst *AI = new AllocaInst(Ty, 0, V->getName()+".tmp", FnStart);
+ PromotedValues.push_back(std::make_pair(AI, V));
+
+ // Update the AST and alias analysis.
+ CurAST->copyValue(V, AI);
+
+ for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I)
+ ValueToAllocaMap.insert(std::make_pair(I->getValue(), AI));
+
+ DOUT << "LICM: Promoting value: " << *V << "\n";
+ }
+}
+
+/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
+ AliasSetTracker *AST = LoopToAliasMap[L];
+ if (!AST)
+ return;
+
+ AST->copyValue(From, To);
+}
+
+/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+/// set.
+void LICM::deleteAnalysisValue(Value *V, Loop *L) {
+ AliasSetTracker *AST = LoopToAliasMap[L];
+ if (!AST)
+ return;
+
+ AST->deleteValue(V);
+}
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
new file mode 100644
index 0000000..6512672
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -0,0 +1,280 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-delete"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+namespace {
+ class VISIBILITY_HIDDEN LoopDeletion : public LoopPass {
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDeletion() : LoopPass(&ID) {}
+
+ // Possibly eliminate loop L if it is dead.
+ bool runOnLoop(Loop* L, LPPassManager& LPM);
+
+ bool SingleDominatingExit(Loop* L,
+ SmallVector<BasicBlock*, 4>& exitingBlocks);
+ bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks,
+ SmallVector<BasicBlock*, 4>& exitBlocks);
+ bool IsLoopInvariantInst(Instruction *I, Loop* L);
+
+ virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<LoopInfo>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+
+ AU.addPreserved<ScalarEvolution>();
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<LoopInfo>();
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<DominanceFrontier>();
+ }
+ };
+}
+
+char LoopDeletion::ID = 0;
+static RegisterPass<LoopDeletion> X("loop-deletion", "Delete dead loops");
+
+Pass* llvm::createLoopDeletionPass() {
+ return new LoopDeletion();
+}
+
+/// SingleDominatingExit - Checks that there is only a single block that
+/// branches out of the loop, and that it also dominates the latch block. Loops
+/// with multiple or non-latch-dominating exiting blocks could be dead, but we'd
+/// have to do more extensive analysis to make sure, for instance, that the
+/// control flow logic involved was or could be made loop-invariant.
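+///
+/// For example (a sketch, not from this pass): in a rotated loop whose latch
+/// ends in "br i1 %done, label %exit, label %header", the latch is the only
+/// exiting block, and a block trivially dominates itself.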
+bool LoopDeletion::SingleDominatingExit(Loop* L,
+ SmallVector<BasicBlock*, 4>& exitingBlocks) {
+
+ if (exitingBlocks.size() != 1)
+ return false;
+
+ BasicBlock* latch = L->getLoopLatch();
+ if (!latch)
+ return false;
+
+ DominatorTree& DT = getAnalysis<DominatorTree>();
+ return DT.dominates(exitingBlocks[0], latch);
+}
+
+/// IsLoopInvariantInst - Checks if an instruction is invariant with respect to
+/// a loop, which is defined as being true if all of its operands are defined
+/// outside of the loop. These instructions can be hoisted out of the loop
+/// if their results are needed. This could be made more aggressive by
+/// recursively checking the operands for invariance, but it's not clear that
+/// it's worth it.
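+///
+/// For example (a sketch): in "for (i = ...) { t = a + b; ... }", where 'a'
+/// and 'b' are both defined before the loop, 't' counts as loop invariant
+/// under this definition even though it is defined inside the loop.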
+bool LoopDeletion::IsLoopInvariantInst(Instruction *I, Loop* L) {
+ // PHI nodes are not loop invariant if defined in the loop.
+ if (isa<PHINode>(I) && L->contains(I->getParent()))
+ return false;
+
+  // The instruction is loop invariant if all of its operands are loop-invariant.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (!L->isLoopInvariant(I->getOperand(i)))
+ return false;
+
+ // If we got this far, the instruction is loop invariant!
+ return true;
+}
+
+/// IsLoopDead - Determines if a loop is dead. This assumes that we've already
+/// checked for unique exit and exiting blocks, and that the code is in LCSSA
+/// form.
+bool LoopDeletion::IsLoopDead(Loop* L,
+ SmallVector<BasicBlock*, 4>& exitingBlocks,
+ SmallVector<BasicBlock*, 4>& exitBlocks) {
+ BasicBlock* exitingBlock = exitingBlocks[0];
+ BasicBlock* exitBlock = exitBlocks[0];
+
+ // Make sure that all PHI entries coming from the loop are loop invariant.
+ // Because the code is in LCSSA form, any values used outside of the loop
+ // must pass through a PHI in the exit block, meaning that this check is
+ // sufficient to guarantee that no loop-variant values are used outside
+ // of the loop.
+ BasicBlock::iterator BI = exitBlock->begin();
+ while (PHINode* P = dyn_cast<PHINode>(BI)) {
+ Value* incoming = P->getIncomingValueForBlock(exitingBlock);
+ if (Instruction* I = dyn_cast<Instruction>(incoming))
+ if (!IsLoopInvariantInst(I, L))
+ return false;
+
+ BI++;
+ }
+
+ // Make sure that no instructions in the block have potential side-effects.
+ // This includes instructions that could write to memory, and loads that are
+ // marked volatile. This could be made more aggressive by using aliasing
+ // information to identify readonly and readnone calls.
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI) {
+ for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+ BI != BE; ++BI) {
+ if (BI->mayHaveSideEffects())
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// runOnLoop - Remove dead loops, by which we mean loops that do not impact
+/// the observable behavior of the program other than through finite running
+/// time. Note that we ensure this never removes a loop that might be
+/// infinite, as doing so could change the halting/non-halting nature of a
+/// program.
+/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
+/// in order to make various safety checks work.
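+///
+/// For example (a sketch): "for (i = 0; i != 16; ++i) { t = i * 2; }" has a
+/// computable trip count, no side effects, and no values used after the
+/// loop, so the preheader can be rewired to branch directly to the exit
+/// block and the loop deleted.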
+bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
+ // We can only remove the loop if there is a preheader that we can
+ // branch from after removing it.
+ BasicBlock* preheader = L->getLoopPreheader();
+ if (!preheader)
+ return false;
+
+ // We can't remove loops that contain subloops. If the subloops were dead,
+ // they would already have been removed in earlier executions of this pass.
+ if (L->begin() != L->end())
+ return false;
+
+ SmallVector<BasicBlock*, 4> exitingBlocks;
+ L->getExitingBlocks(exitingBlocks);
+
+ SmallVector<BasicBlock*, 4> exitBlocks;
+ L->getUniqueExitBlocks(exitBlocks);
+
+ // We require that the loop only have a single exit block. Otherwise, we'd
+ // be in the situation of needing to be able to solve statically which exit
+ // block will be branched to, or trying to preserve the branching logic in
+ // a loop invariant manner.
+ if (exitBlocks.size() != 1)
+ return false;
+
+ // Loops with multiple exits or exits that don't dominate the latch
+ // are too complicated to handle correctly.
+ if (!SingleDominatingExit(L, exitingBlocks))
+ return false;
+
+ // Finally, we have to check that the loop really is dead.
+ if (!IsLoopDead(L, exitingBlocks, exitBlocks))
+ return false;
+
+ // Don't remove loops for which we can't solve the trip count.
+ // They could be infinite, in which case we'd be changing program behavior.
+ ScalarEvolution& SE = getAnalysis<ScalarEvolution>();
+ SCEVHandle S = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(S))
+ return false;
+
+ // Now that we know the removal is safe, remove the loop by changing the
+ // branch from the preheader to go to the single exit block.
+ BasicBlock* exitBlock = exitBlocks[0];
+ BasicBlock* exitingBlock = exitingBlocks[0];
+
+ // Because we're deleting a large chunk of code at once, the sequence in which
+ // we remove things is very important to avoid invalidation issues. Don't
+ // mess with this unless you have good reason and know what you're doing.
+
+ // Move simple loop-invariant expressions out of the loop, since they
+ // might be needed by the exit phis.
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI)
+ for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+ BI != BE; ) {
+ Instruction* I = BI++;
+ if (!I->use_empty() && IsLoopInvariantInst(I, L))
+ I->moveBefore(preheader->getTerminator());
+ }
+
+ // Connect the preheader directly to the exit block.
+ TerminatorInst* TI = preheader->getTerminator();
+ TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+
+ // Rewrite phis in the exit block to get their inputs from
+ // the preheader instead of the exiting block.
+ BasicBlock::iterator BI = exitBlock->begin();
+ while (PHINode* P = dyn_cast<PHINode>(BI)) {
+ P->replaceUsesOfWith(exitingBlock, preheader);
+ BI++;
+ }
+
+ // Update the dominator tree and remove the instructions and blocks that will
+ // be deleted from the reference counting scheme.
+ DominatorTree& DT = getAnalysis<DominatorTree>();
+ DominanceFrontier* DF = getAnalysisIfAvailable<DominanceFrontier>();
+ SmallPtrSet<DomTreeNode*, 8> ChildNodes;
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI) {
+ // Move all of the block's children to be children of the preheader, which
+ // allows us to remove the domtree entry for the block.
+ ChildNodes.insert(DT[*LI]->begin(), DT[*LI]->end());
+ for (SmallPtrSet<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(),
+ DE = ChildNodes.end(); DI != DE; ++DI) {
+ DT.changeImmediateDominator(*DI, DT[preheader]);
+ if (DF) DF->changeImmediateDominator((*DI)->getBlock(), preheader, &DT);
+ }
+
+ ChildNodes.clear();
+ DT.eraseNode(*LI);
+ if (DF) DF->removeBlock(*LI);
+
+ // Remove the block from the reference counting scheme, so that we can
+ // delete it freely later.
+ (*LI)->dropAllReferences();
+ }
+
+ // Tell ScalarEvolution that the loop is deleted. Do this before
+ // deleting the loop so that ScalarEvolution can look at the loop
+ // to determine what it needs to clean up.
+ SE.forgetLoopBackedgeTakenCount(L);
+
+ // Erase the instructions and the blocks without having to worry
+ // about ordering because we already dropped the references.
+ // NOTE: This iteration is safe because erasing the block does not remove its
+ // entry from the loop's block list. We do that in the next section.
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI)
+ (*LI)->eraseFromParent();
+
+  // Finally, remove the blocks from loopinfo. This has to happen late because
+ // otherwise our loop iterators won't work.
+ LoopInfo& loopInfo = getAnalysis<LoopInfo>();
+ SmallPtrSet<BasicBlock*, 8> blocks;
+ blocks.insert(L->block_begin(), L->block_end());
+ for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
+ E = blocks.end(); I != E; ++I)
+ loopInfo.removeBlock(*I);
+
+ // The last step is to inform the loop pass manager that we've
+ // eliminated this loop.
+ LPM.deleteLoopFromQueue(L);
+
+ NumDeleted++;
+
+ return true;
+}
diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp
new file mode 100644
index 0000000..9c78596
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp
@@ -0,0 +1,1237 @@
+//===- LoopIndexSplit.cpp - Loop Index Splitting Pass ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop Index Splitting Pass. This pass handles three
+// kinds of loops.
+//
+// [1] A loop may be eliminated if the body is executed exactly once.
+// For example,
+//
+// for (i = 0; i < N; ++i) {
+// if (i == X) {
+// body;
+// }
+// }
+//
+// is transformed to
+//
+// i = X;
+// body;
+//
+// [2] A loop's iteration space may be shrunk if the loop body is executed
+// for a proper sub-range of the loop's iteration space. For example,
+//
+// for (i = 0; i < N; ++i) {
+// if (i > A && i < B) {
+// ...
+// }
+// }
+//
+// is transformed to iterate from A to B, if A > 0 and B < N.
+//
+// [3] A loop may be split if the loop body is dominated by a branch.
+// For example,
+//
+// for (i = LB; i < UB; ++i) { if (i < SV) A; else B; }
+//
+// is transformed into
+//
+// AEV = BSV = SV
+// for (i = LB; i < min(UB, AEV); ++i)
+// A;
+// for (i = max(LB, BSV); i < UB; ++i)
+// B;
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-index-split"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+STATISTIC(NumIndexSplit, "Number of loops index split");
+STATISTIC(NumIndexSplitRemoved, "Number of loops eliminated by loop index split");
+STATISTIC(NumRestrictBounds, "Number of loop iteration spaces restricted");
+
+namespace {
+
+ class VISIBILITY_HIDDEN LoopIndexSplit : public LoopPass {
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopIndexSplit() : LoopPass(&ID) {}
+
+ // Index split Loop L. Return true if loop is split.
+ bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<ScalarEvolution>();
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ AU.addRequired<LoopInfo>();
+ AU.addPreserved<LoopInfo>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<DominanceFrontier>();
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<DominanceFrontier>();
+ }
+
+ private:
+ /// processOneIterationLoop -- Eliminate loop if loop body is executed
+ /// only once. For example,
+ /// for (i = 0; i < N; ++i) {
+ /// if ( i == X) {
+ /// ...
+ /// }
+ /// }
+ ///
+ bool processOneIterationLoop();
+
+    // --- Routines used by updateLoopIterationSpace() ---
+
+ /// updateLoopIterationSpace -- Update loop's iteration space if loop
+ /// body is executed for certain IV range only. For example,
+ ///
+ /// for (i = 0; i < N; ++i) {
+ /// if ( i > A && i < B) {
+ /// ...
+ /// }
+ /// }
+    /// is transformed to iterate from A to B, if A > 0 and B < N.
+ ///
+ bool updateLoopIterationSpace();
+
+ /// restrictLoopBound - Op dominates loop body. Op compares an IV based value
+ /// with a loop invariant value. Update loop's lower and upper bound based on
+ /// the loop invariant value.
+ bool restrictLoopBound(ICmpInst &Op);
+
+    // --- Routines used by splitLoop() ---
+
+ bool splitLoop();
+
+ /// removeBlocks - Remove basic block DeadBB and all blocks dominated by
+    /// DeadBB. This routine is used to remove the split condition's dead
+    /// branch, which is dominated by DeadBB. LiveBB dominates the split
+    /// condition's other branch.
+ void removeBlocks(BasicBlock *DeadBB, Loop *LP, BasicBlock *LiveBB);
+
+ /// moveExitCondition - Move exit condition EC into split condition block.
+ void moveExitCondition(BasicBlock *CondBB, BasicBlock *ActiveBB,
+ BasicBlock *ExitBB, ICmpInst *EC, ICmpInst *SC,
+ PHINode *IV, Instruction *IVAdd, Loop *LP,
+                           unsigned ExitValueNum);
+
+ /// updatePHINodes - CFG has been changed.
+ /// Before
+ /// - ExitBB's single predecessor was Latch
+ /// - Latch's second successor was Header
+ /// Now
+    /// - ExitBB's single predecessor is Header
+    /// - Latch's one and only successor is Header
+    ///
+    /// Update ExitBB's PHINodes to reflect this change.
+ void updatePHINodes(BasicBlock *ExitBB, BasicBlock *Latch,
+ BasicBlock *Header,
+ PHINode *IV, Instruction *IVIncrement, Loop *LP);
+
+    // --- Utility routines ---
+
+    /// cleanBlock - A block is considered clean if all non-terminator
+ /// instructions are either PHINodes or IV based values.
+ bool cleanBlock(BasicBlock *BB);
+
+    /// IVisLT - If Op is comparing an IV based value with a loop invariant and
+    /// the IV based value is less than the loop invariant then return the loop
+ /// invariant. Otherwise return NULL.
+ Value * IVisLT(ICmpInst &Op);
+
+    /// IVisLE - If Op is comparing an IV based value with a loop invariant and
+    /// the IV based value is less than or equal to the loop invariant then
+ /// return the loop invariant. Otherwise return NULL.
+ Value * IVisLE(ICmpInst &Op);
+
+    /// IVisGT - If Op is comparing an IV based value with a loop invariant and
+    /// the IV based value is greater than the loop invariant then return the loop
+ /// invariant. Otherwise return NULL.
+ Value * IVisGT(ICmpInst &Op);
+
+    /// IVisGE - If Op is comparing an IV based value with a loop invariant and
+    /// the IV based value is greater than or equal to the loop invariant then
+ /// return the loop invariant. Otherwise return NULL.
+ Value * IVisGE(ICmpInst &Op);
+
+ private:
+
+ // Current Loop information.
+ Loop *L;
+ LPPassManager *LPM;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ DominanceFrontier *DF;
+
+ PHINode *IndVar;
+ ICmpInst *ExitCondition;
+ ICmpInst *SplitCondition;
+ Value *IVStartValue;
+ Value *IVExitValue;
+ Instruction *IVIncrement;
+ SmallPtrSet<Value *, 4> IVBasedValues;
+ };
+}
+
+char LoopIndexSplit::ID = 0;
+static RegisterPass<LoopIndexSplit>
+X("loop-index-split", "Index Split Loops");
+
+Pass *llvm::createLoopIndexSplitPass() {
+ return new LoopIndexSplit();
+}
+
+// Index split Loop L. Return true if loop is split.
+bool LoopIndexSplit::runOnLoop(Loop *IncomingLoop, LPPassManager &LPM_Ref) {
+ L = IncomingLoop;
+ LPM = &LPM_Ref;
+
+ // FIXME - Nested loops make dominator info updates tricky.
+ if (!L->getSubLoops().empty())
+ return false;
+
+ DT = &getAnalysis<DominatorTree>();
+ LI = &getAnalysis<LoopInfo>();
+ DF = &getAnalysis<DominanceFrontier>();
+
+ // Initialize loop data.
+ IndVar = L->getCanonicalInductionVariable();
+ if (!IndVar) return false;
+
+ bool P1InLoop = L->contains(IndVar->getIncomingBlock(1));
+ IVStartValue = IndVar->getIncomingValue(!P1InLoop);
+ IVIncrement = dyn_cast<Instruction>(IndVar->getIncomingValue(P1InLoop));
+ if (!IVIncrement) return false;
+
+ IVBasedValues.clear();
+ IVBasedValues.insert(IndVar);
+ IVBasedValues.insert(IVIncrement);
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ for(BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end();
+ BI != BE; ++BI) {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BI))
+ if (BO != IVIncrement
+ && (BO->getOpcode() == Instruction::Add
+ || BO->getOpcode() == Instruction::Sub))
+ if (IVBasedValues.count(BO->getOperand(0))
+ && L->isLoopInvariant(BO->getOperand(1)))
+ IVBasedValues.insert(BO);
+ }
+
+ // Reject loop if loop exit condition is not suitable.
+ BasicBlock *ExitingBlock = L->getExitingBlock();
+ if (!ExitingBlock)
+ return false;
+ BranchInst *EBR = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!EBR) return false;
+ ExitCondition = dyn_cast<ICmpInst>(EBR->getCondition());
+ if (!ExitCondition) return false;
+ if (ExitingBlock != L->getLoopLatch()) return false;
+ IVExitValue = ExitCondition->getOperand(1);
+ if (!L->isLoopInvariant(IVExitValue))
+ IVExitValue = ExitCondition->getOperand(0);
+ if (!L->isLoopInvariant(IVExitValue))
+ return false;
+
+  // If the start value is greater than the exit value and the induction
+  // variable increments by 1, then we are potentially dealing with an
+  // infinite loop. Do not index split this loop.
+ if (ConstantInt *SV = dyn_cast<ConstantInt>(IVStartValue))
+ if (ConstantInt *EV = dyn_cast<ConstantInt>(IVExitValue))
+ if (SV->getSExtValue() > EV->getSExtValue())
+ return false;
+
+ if (processOneIterationLoop())
+ return true;
+
+ if (updateLoopIterationSpace())
+ return true;
+
+ if (splitLoop())
+ return true;
+
+ return false;
+}
+
+// --- Helper routines ---
+// isUsedOutsideLoop - Returns true iff V is used outside the loop L.
+static bool isUsedOutsideLoop(Value *V, Loop *L) {
+ for(Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+ if (!L->contains(cast<Instruction>(*UI)->getParent()))
+ return true;
+ return false;
+}
+
+// Return V+1
+static Value *getPlusOne(Value *V, bool Sign, Instruction *InsertPt) {
+ ConstantInt *One = ConstantInt::get(V->getType(), 1, Sign);
+ return BinaryOperator::CreateAdd(V, One, "lsp", InsertPt);
+}
+
+// Return V-1
+static Value *getMinusOne(Value *V, bool Sign, Instruction *InsertPt) {
+ ConstantInt *One = ConstantInt::get(V->getType(), 1, Sign);
+ return BinaryOperator::CreateSub(V, One, "lsp", InsertPt);
+}
+
+// Return min(V1, V2)
+static Value *getMin(Value *V1, Value *V2, bool Sign, Instruction *InsertPt) {
+
+ Value *C = new ICmpInst(Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ V1, V2, "lsp", InsertPt);
+ return SelectInst::Create(C, V1, V2, "lsp", InsertPt);
+}
+
+// Return max(V1, V2)
+static Value *getMax(Value *V1, Value *V2, bool Sign, Instruction *InsertPt) {
+
+ Value *C = new ICmpInst(Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ V1, V2, "lsp", InsertPt);
+ return SelectInst::Create(C, V2, V1, "lsp", InsertPt);
+}
+
+/// processOneIterationLoop -- Eliminate loop if loop body is executed
+/// only once. For example,
+/// for (i = 0; i < N; ++i) {
+/// if ( i == X) {
+/// ...
+/// }
+/// }
+///
+bool LoopIndexSplit::processOneIterationLoop() {
+ SplitCondition = NULL;
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
+ BranchInst *BR = dyn_cast<BranchInst>(Header->getTerminator());
+ if (!BR) return false;
+ if (!isa<BranchInst>(Latch->getTerminator())) return false;
+ if (BR->isUnconditional()) return false;
+ SplitCondition = dyn_cast<ICmpInst>(BR->getCondition());
+ if (!SplitCondition) return false;
+ if (SplitCondition == ExitCondition) return false;
+ if (SplitCondition->getPredicate() != ICmpInst::ICMP_EQ) return false;
+ if (BR->getOperand(1) != Latch) return false;
+ if (!IVBasedValues.count(SplitCondition->getOperand(0))
+ && !IVBasedValues.count(SplitCondition->getOperand(1)))
+ return false;
+
+ // If IV is used outside the loop then this loop traversal is required.
+ // FIXME: Calculate and use last IV value.
+ if (isUsedOutsideLoop(IVIncrement, L))
+ return false;
+
+ // If BR operands are not IV or not loop invariants then skip this loop.
+ Value *OPV = SplitCondition->getOperand(0);
+ Value *SplitValue = SplitCondition->getOperand(1);
+ if (!L->isLoopInvariant(SplitValue))
+ std::swap(OPV, SplitValue);
+ if (!L->isLoopInvariant(SplitValue))
+ return false;
+ Instruction *OPI = dyn_cast<Instruction>(OPV);
+ if (!OPI)
+ return false;
+ if (OPI->getParent() != Header || isUsedOutsideLoop(OPI, L))
+ return false;
+ Value *StartValue = IVStartValue;
+  Value *ExitValue = IVExitValue;
+
+ if (OPV != IndVar) {
+ // If BR operand is IV based then use this operand to calculate
+ // effective conditions for loop body.
+ BinaryOperator *BOPV = dyn_cast<BinaryOperator>(OPV);
+ if (!BOPV)
+ return false;
+ if (BOPV->getOpcode() != Instruction::Add)
+ return false;
+ StartValue = BinaryOperator::CreateAdd(OPV, StartValue, "" , BR);
+ ExitValue = BinaryOperator::CreateAdd(OPV, ExitValue, "" , BR);
+ }
+
+ if (!cleanBlock(Header))
+ return false;
+
+ if (!cleanBlock(Latch))
+ return false;
+
+ // If the merge point for BR is not loop latch then skip this loop.
+ if (BR->getSuccessor(0) != Latch) {
+ DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0));
+ assert (DF0 != DF->end() && "Unable to find dominance frontier");
+ if (!DF0->second.count(Latch))
+ return false;
+ }
+
+ if (BR->getSuccessor(1) != Latch) {
+ DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1));
+ assert (DF1 != DF->end() && "Unable to find dominance frontier");
+ if (!DF1->second.count(Latch))
+ return false;
+ }
+
+  // Now the current loop, L, contains a compare instruction that compares
+  // the induction variable, IndVar, against a loop invariant, and the entire
+  // (i.e. meaningful) loop body is dominated by this compare instruction. In
+  // such a case, eliminate the loop structure surrounding the loop body. For
+  // example,
+ // for (int i = start; i < end; ++i) {
+ // if ( i == somevalue) {
+ // loop_body
+ // }
+ // }
+ // can be transformed into
+ // if (somevalue >= start && somevalue < end) {
+ // i = somevalue;
+ // loop_body
+ // }
+
+ // Replace index variable with split value in loop body. Loop body is executed
+ // only when index variable is equal to split value.
+ IndVar->replaceAllUsesWith(SplitValue);
+
+ // Replace split condition in header.
+ // Transform
+ // SplitCondition : icmp eq i32 IndVar, SplitValue
+ // into
+ // c1 = icmp uge i32 SplitValue, StartValue
+ // c2 = icmp ult i32 SplitValue, ExitValue
+ // and i32 c1, c2
+ Instruction *C1 = new ICmpInst(ExitCondition->isSignedPredicate() ?
+ ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE,
+ SplitValue, StartValue, "lisplit", BR);
+
+ CmpInst::Predicate C2P = ExitCondition->getPredicate();
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ if (LatchBR->getOperand(0) != Header)
+ C2P = CmpInst::getInversePredicate(C2P);
+ Instruction *C2 = new ICmpInst(C2P, SplitValue, ExitValue, "lisplit", BR);
+ Instruction *NSplitCond = BinaryOperator::CreateAnd(C1, C2, "lisplit", BR);
+
+ SplitCondition->replaceAllUsesWith(NSplitCond);
+ SplitCondition->eraseFromParent();
+
+ // Remove Latch to Header edge.
+ BasicBlock *LatchSucc = NULL;
+ Header->removePredecessor(Latch);
+ for (succ_iterator SI = succ_begin(Latch), E = succ_end(Latch);
+ SI != E; ++SI) {
+ if (Header != *SI)
+ LatchSucc = *SI;
+ }
+
+ // Clean up latch block.
+ Value *LatchBRCond = LatchBR->getCondition();
+ LatchBR->setUnconditionalDest(LatchSucc);
+ RecursivelyDeleteTriviallyDeadInstructions(LatchBRCond);
+
+ LPM->deleteLoopFromQueue(L);
+
+ // Update Dominator Info.
+  // The only CFG change made is the removal of the Latch to Header edge. This
+  // does not change the dominator tree because Latch did not dominate
+ // Header.
+ if (DF) {
+ DominanceFrontier::iterator HeaderDF = DF->find(Header);
+ if (HeaderDF != DF->end())
+ DF->removeFromFrontier(HeaderDF, Header);
+
+ DominanceFrontier::iterator LatchDF = DF->find(Latch);
+ if (LatchDF != DF->end())
+ DF->removeFromFrontier(LatchDF, Header);
+ }
+
+ ++NumIndexSplitRemoved;
+ return true;
+}
+
+/// restrictLoopBound - Op dominates loop body. Op compares an IV based value
+/// with a loop invariant value. Update loop's lower and upper bound based on
+/// the loop invariant value.
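+///
+/// For example (a sketch): with exit condition "i < UB" and a dominating
+/// guard "i < B", where B is loop invariant, the new upper bound becomes
+/// min(B, UB), computed in the preheader.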
+bool LoopIndexSplit::restrictLoopBound(ICmpInst &Op) {
+ bool Sign = Op.isSignedPredicate();
+ Instruction *PHTerm = L->getLoopPreheader()->getTerminator();
+
+ if (IVisGT(*ExitCondition) || IVisGE(*ExitCondition)) {
+ BranchInst *EBR =
+ cast<BranchInst>(ExitCondition->getParent()->getTerminator());
+ ExitCondition->setPredicate(ExitCondition->getInversePredicate());
+ BasicBlock *T = EBR->getSuccessor(0);
+ EBR->setSuccessor(0, EBR->getSuccessor(1));
+ EBR->setSuccessor(1, T);
+ }
+
+ // New upper and lower bounds.
+ Value *NLB = NULL;
+ Value *NUB = NULL;
+ if (Value *V = IVisLT(Op)) {
+ // Restrict upper bound.
+ if (IVisLE(*ExitCondition))
+ V = getMinusOne(V, Sign, PHTerm);
+ NUB = getMin(V, IVExitValue, Sign, PHTerm);
+ } else if (Value *V = IVisLE(Op)) {
+ // Restrict upper bound.
+ if (IVisLT(*ExitCondition))
+ V = getPlusOne(V, Sign, PHTerm);
+ NUB = getMin(V, IVExitValue, Sign, PHTerm);
+ } else if (Value *V = IVisGT(Op)) {
+ // Restrict lower bound.
+ V = getPlusOne(V, Sign, PHTerm);
+ NLB = getMax(V, IVStartValue, Sign, PHTerm);
+ } else if (Value *V = IVisGE(Op))
+ // Restrict lower bound.
+ NLB = getMax(V, IVStartValue, Sign, PHTerm);
+
+ if (!NLB && !NUB)
+ return false;
+
+ if (NLB) {
+ unsigned i = IndVar->getBasicBlockIndex(L->getLoopPreheader());
+ IndVar->setIncomingValue(i, NLB);
+ }
+
+ if (NUB) {
+ unsigned i = (ExitCondition->getOperand(0) != IVExitValue);
+ ExitCondition->setOperand(i, NUB);
+ }
+ return true;
+}
+
+/// updateLoopIterationSpace -- Update loop's iteration space if loop
+/// body is executed for certain IV range only. For example,
+///
+/// for (i = 0; i < N; ++i) {
+/// if ( i > A && i < B) {
+/// ...
+/// }
+/// }
+/// is transformed to iterate from A to B, if A > 0 and B < N.
+///
+bool LoopIndexSplit::updateLoopIterationSpace() {
+ SplitCondition = NULL;
+ if (ExitCondition->getPredicate() == ICmpInst::ICMP_NE
+ || ExitCondition->getPredicate() == ICmpInst::ICMP_EQ)
+ return false;
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
+ BranchInst *BR = dyn_cast<BranchInst>(Header->getTerminator());
+ if (!BR) return false;
+ if (!isa<BranchInst>(Latch->getTerminator())) return false;
+ if (BR->isUnconditional()) return false;
+ BinaryOperator *AND = dyn_cast<BinaryOperator>(BR->getCondition());
+ if (!AND) return false;
+ if (AND->getOpcode() != Instruction::And) return false;
+ ICmpInst *Op0 = dyn_cast<ICmpInst>(AND->getOperand(0));
+ ICmpInst *Op1 = dyn_cast<ICmpInst>(AND->getOperand(1));
+ if (!Op0 || !Op1)
+ return false;
+ IVBasedValues.insert(AND);
+ IVBasedValues.insert(Op0);
+ IVBasedValues.insert(Op1);
+ if (!cleanBlock(Header)) return false;
+ BasicBlock *ExitingBlock = ExitCondition->getParent();
+ if (!cleanBlock(ExitingBlock)) return false;
+
+ // If the merge point for BR is not loop latch then skip this loop.
+ if (BR->getSuccessor(0) != Latch) {
+ DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0));
+ assert (DF0 != DF->end() && "Unable to find dominance frontier");
+ if (!DF0->second.count(Latch))
+ return false;
+ }
+
+ if (BR->getSuccessor(1) != Latch) {
+ DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1));
+ assert (DF1 != DF->end() && "Unable to find dominance frontier");
+ if (!DF1->second.count(Latch))
+ return false;
+ }
+
+  // Verify that the loop exiting block has only two predecessors, where one
+  // pred is the split condition block. The other predecessor will become the
+  // exiting block's dominator after the CFG is updated. TODO: Handle CFGs
+  // where the exiting block has more than two predecessors. This requires
+  // extra work in updating dominator information.
+ BasicBlock *ExitingBBPred = NULL;
+ for (pred_iterator PI = pred_begin(ExitingBlock), PE = pred_end(ExitingBlock);
+ PI != PE; ++PI) {
+ BasicBlock *BB = *PI;
+ if (Header == BB)
+ continue;
+ if (ExitingBBPred)
+ return false;
+ else
+ ExitingBBPred = BB;
+ }
+
+ if (!restrictLoopBound(*Op0))
+ return false;
+
+ if (!restrictLoopBound(*Op1))
+ return false;
+
+ // Update CFG.
+ if (BR->getSuccessor(0) == ExitingBlock)
+ BR->setUnconditionalDest(BR->getSuccessor(1));
+ else
+ BR->setUnconditionalDest(BR->getSuccessor(0));
+
+ AND->eraseFromParent();
+ if (Op0->use_empty())
+ Op0->eraseFromParent();
+ if (Op1->use_empty())
+ Op1->eraseFromParent();
+
+  // Update dominator info. Now, ExitingBlock has only one predecessor,
+  // ExitingBBPred, and it is ExitingBlock's immediate dominator.
+ DT->changeImmediateDominator(ExitingBlock, ExitingBBPred);
+
+ BasicBlock *ExitBlock = ExitingBlock->getTerminator()->getSuccessor(1);
+ if (L->contains(ExitBlock))
+ ExitBlock = ExitingBlock->getTerminator()->getSuccessor(0);
+
+  // If ExitingBlock is a member of a loop basic block's dominance frontier,
+  // then replace it in that frontier with the members of ExitingBlock's own
+  // dominance frontier.
+ DominanceFrontier::iterator ExitingBlockDF = DF->find(ExitingBlock);
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ if (BB == Header || BB == ExitingBlock)
+ continue;
+ DominanceFrontier::iterator BBDF = DF->find(BB);
+ DominanceFrontier::DomSetType::iterator DomSetI = BBDF->second.begin();
+ DominanceFrontier::DomSetType::iterator DomSetE = BBDF->second.end();
+ while (DomSetI != DomSetE) {
+ DominanceFrontier::DomSetType::iterator CurrentItr = DomSetI;
+ ++DomSetI;
+ BasicBlock *DFBB = *CurrentItr;
+ if (DFBB == ExitingBlock) {
+ BBDF->second.erase(DFBB);
+ for (DominanceFrontier::DomSetType::iterator
+ EBI = ExitingBlockDF->second.begin(),
+ EBE = ExitingBlockDF->second.end(); EBI != EBE; ++EBI)
+ BBDF->second.insert(*EBI);
+ }
+ }
+ }
+ NumRestrictBounds++;
+ return true;
+}
+
+/// removeBlocks - Remove basic block DeadBB and all blocks dominated by DeadBB.
+/// This routine is used to remove the split condition's dead branch, which is
+/// dominated by DeadBB. LiveBB dominates the split condition's other branch.
+void LoopIndexSplit::removeBlocks(BasicBlock *DeadBB, Loop *LP,
+ BasicBlock *LiveBB) {
+
+ // First update DeadBB's dominance frontier.
+ SmallVector<BasicBlock *, 8> FrontierBBs;
+ DominanceFrontier::iterator DeadBBDF = DF->find(DeadBB);
+ if (DeadBBDF != DF->end()) {
+ SmallVector<BasicBlock *, 8> PredBlocks;
+
+ DominanceFrontier::DomSetType DeadBBSet = DeadBBDF->second;
+ for (DominanceFrontier::DomSetType::iterator DeadBBSetI = DeadBBSet.begin(),
+ DeadBBSetE = DeadBBSet.end(); DeadBBSetI != DeadBBSetE; ++DeadBBSetI)
+ {
+ BasicBlock *FrontierBB = *DeadBBSetI;
+ FrontierBBs.push_back(FrontierBB);
+
+      // Remove any PHI incoming edges from blocks dominated by DeadBB.
+ PredBlocks.clear();
+ for(pred_iterator PI = pred_begin(FrontierBB), PE = pred_end(FrontierBB);
+ PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (P == DeadBB || DT->dominates(DeadBB, P))
+ PredBlocks.push_back(P);
+ }
+
+ for(BasicBlock::iterator FBI = FrontierBB->begin(), FBE = FrontierBB->end();
+ FBI != FBE; ++FBI) {
+ if (PHINode *PN = dyn_cast<PHINode>(FBI)) {
+ for(SmallVector<BasicBlock *, 8>::iterator PI = PredBlocks.begin(),
+ PE = PredBlocks.end(); PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ PN->removeIncomingValue(P);
+ }
+ }
+ else
+ break;
+ }
+ }
+ }
+
+ // Now remove DeadBB and all nodes dominated by DeadBB in df order.
+ SmallVector<BasicBlock *, 32> WorkList;
+ DomTreeNode *DN = DT->getNode(DeadBB);
+ for (df_iterator<DomTreeNode*> DI = df_begin(DN),
+ E = df_end(DN); DI != E; ++DI) {
+ BasicBlock *BB = DI->getBlock();
+ WorkList.push_back(BB);
+ BB->replaceAllUsesWith(UndefValue::get(Type::LabelTy));
+ }
+
+ while (!WorkList.empty()) {
+ BasicBlock *BB = WorkList.back(); WorkList.pop_back();
+ LPM->deleteSimpleAnalysisValue(BB, LP);
+ for(BasicBlock::iterator BBI = BB->begin(), BBE = BB->end();
+ BBI != BBE; ) {
+ Instruction *I = BBI;
+ ++BBI;
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ LPM->deleteSimpleAnalysisValue(I, LP);
+ I->eraseFromParent();
+ }
+ DT->eraseNode(BB);
+ DF->removeBlock(BB);
+ LI->removeBlock(BB);
+ BB->eraseFromParent();
+ }
+
+ // Update Frontier BBs' dominator info.
+ while (!FrontierBBs.empty()) {
+ BasicBlock *FBB = FrontierBBs.back(); FrontierBBs.pop_back();
+ BasicBlock *NewDominator = FBB->getSinglePredecessor();
+ if (!NewDominator) {
+ pred_iterator PI = pred_begin(FBB), PE = pred_end(FBB);
+ NewDominator = *PI;
+ ++PI;
+ if (NewDominator != LiveBB) {
+ for(; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (P == LiveBB) {
+ NewDominator = LiveBB;
+ break;
+ }
+ NewDominator = DT->findNearestCommonDominator(NewDominator, P);
+ }
+ }
+ }
+ assert (NewDominator && "Unable to fix dominator info.");
+ DT->changeImmediateDominator(FBB, NewDominator);
+ DF->changeImmediateDominator(FBB, NewDominator, DT);
+ }
+
+}
+
+/// moveExitCondition - Move exit condition EC into split condition block CondBB.
+void LoopIndexSplit::moveExitCondition(BasicBlock *CondBB, BasicBlock *ActiveBB,
+ BasicBlock *ExitBB, ICmpInst *EC,
+ ICmpInst *SC, PHINode *IV,
+ Instruction *IVAdd, Loop *LP,
+ unsigned ExitValueNum) {
+
+ BasicBlock *ExitingBB = EC->getParent();
+ Instruction *CurrentBR = CondBB->getTerminator();
+
+ // Move exit condition into split condition block.
+ EC->moveBefore(CurrentBR);
+ EC->setOperand(ExitValueNum == 0 ? 1 : 0, IV);
+
+ // Move exiting block's branch into split condition block. Update its branch
+ // destination.
+ BranchInst *ExitingBR = cast<BranchInst>(ExitingBB->getTerminator());
+ ExitingBR->moveBefore(CurrentBR);
+ BasicBlock *OrigDestBB = NULL;
+ if (ExitingBR->getSuccessor(0) == ExitBB) {
+ OrigDestBB = ExitingBR->getSuccessor(1);
+ ExitingBR->setSuccessor(1, ActiveBB);
+ }
+ else {
+ OrigDestBB = ExitingBR->getSuccessor(0);
+ ExitingBR->setSuccessor(0, ActiveBB);
+ }
+
+ // Remove split condition and current split condition branch.
+ SC->eraseFromParent();
+ CurrentBR->eraseFromParent();
+
+ // Connect exiting block to original destination.
+ BranchInst::Create(OrigDestBB, ExitingBB);
+
+ // Update PHINodes
+ updatePHINodes(ExitBB, ExitingBB, CondBB, IV, IVAdd, LP);
+
+ // Fix dominator info.
+ // ExitBB is now dominated by CondBB
+ DT->changeImmediateDominator(ExitBB, CondBB);
+ DF->changeImmediateDominator(ExitBB, CondBB, DT);
+
+ // Blocks outside the loop may have been in the dominance frontier of blocks
+ // inside the condition; this is now impossible because the blocks inside the
+  // condition no longer dominate the exit. Remove the relevant blocks from
+ // the dominance frontiers.
+ for (Loop::block_iterator I = LP->block_begin(), E = LP->block_end();
+ I != E; ++I) {
+ if (*I == CondBB || !DT->dominates(CondBB, *I)) continue;
+ DominanceFrontier::iterator BBDF = DF->find(*I);
+ DominanceFrontier::DomSetType::iterator DomSetI = BBDF->second.begin();
+ DominanceFrontier::DomSetType::iterator DomSetE = BBDF->second.end();
+ while (DomSetI != DomSetE) {
+ DominanceFrontier::DomSetType::iterator CurrentItr = DomSetI;
+ ++DomSetI;
+ BasicBlock *DFBB = *CurrentItr;
+ if (!LP->contains(DFBB))
+ BBDF->second.erase(DFBB);
+ }
+ }
+}
+
+/// updatePHINodes - CFG has been changed.
+/// Before
+/// - ExitBB's single predecessor was Latch
+/// - Latch's second successor was Header
+/// Now
+/// - ExitBB's single predecessor is Header
+/// - Latch's one and only successor is Header
+///
+/// Update ExitBB's PHINodes to reflect this change.
+void LoopIndexSplit::updatePHINodes(BasicBlock *ExitBB, BasicBlock *Latch,
+ BasicBlock *Header,
+ PHINode *IV, Instruction *IVIncrement,
+ Loop *LP) {
+
+ for (BasicBlock::iterator BI = ExitBB->begin(), BE = ExitBB->end();
+ BI != BE; ) {
+ PHINode *PN = dyn_cast<PHINode>(BI);
+ ++BI;
+ if (!PN)
+ break;
+
+ Value *V = PN->getIncomingValueForBlock(Latch);
+ if (PHINode *PHV = dyn_cast<PHINode>(V)) {
+      // PHV is in Latch. PHV has two uses: one in an ExitBB PHINode (PN)
+      // and one in a Header PHINode, which becomes the new incoming value
+      // for PN.
+ Value *NewV = NULL;
+ for (Value::use_iterator UI = PHV->use_begin(), E = PHV->use_end();
+ UI != E; ++UI)
+ if (PHINode *U = dyn_cast<PHINode>(*UI))
+ if (LP->contains(U->getParent())) {
+ NewV = U;
+ break;
+ }
+
+ // Add incoming value from header only if PN has any use inside the loop.
+ if (NewV)
+ PN->addIncoming(NewV, Header);
+
+ } else if (Instruction *PHI = dyn_cast<Instruction>(V)) {
+      // If this instruction is IVIncrement then IV is the new incoming value
+      // from the header; otherwise this instruction must itself be the
+      // incoming value from the header because the loop is in LCSSA form.
+ if (PHI == IVIncrement)
+ PN->addIncoming(IV, Header);
+ else
+ PN->addIncoming(V, Header);
+ } else
+ // Otherwise this is an incoming value from header because loop is in
+ // LCSSA form.
+ PN->addIncoming(V, Header);
+
+ // Remove incoming value from Latch.
+ PN->removeIncomingValue(Latch);
+ }
+}
+
+bool LoopIndexSplit::splitLoop() {
+ SplitCondition = NULL;
+ if (ExitCondition->getPredicate() == ICmpInst::ICMP_NE
+ || ExitCondition->getPredicate() == ICmpInst::ICMP_EQ)
+ return false;
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BranchInst *SBR = NULL; // Split Condition Branch
+ BranchInst *EBR = cast<BranchInst>(ExitCondition->getParent()->getTerminator());
+  // If the exiting block includes loop-variant instructions then this
+ // loop may not be split safely.
+ BasicBlock *ExitingBlock = ExitCondition->getParent();
+ if (!cleanBlock(ExitingBlock)) return false;
+
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I) {
+ BranchInst *BR = dyn_cast<BranchInst>((*I)->getTerminator());
+ if (!BR || BR->isUnconditional()) continue;
+ ICmpInst *CI = dyn_cast<ICmpInst>(BR->getCondition());
+ if (!CI || CI == ExitCondition
+ || CI->getPredicate() == ICmpInst::ICMP_NE
+ || CI->getPredicate() == ICmpInst::ICMP_EQ)
+ continue;
+
+ // Unable to handle triangle loops at the moment.
+    // In a triangle loop, the split condition is in the header and one of
+    // the split destinations is the loop latch. If the split condition is
+    // EQ then such loops are already handled in processOneIterationLoop().
+ if (Header == (*I)
+ && (Latch == BR->getSuccessor(0) || Latch == BR->getSuccessor(1)))
+ continue;
+
+ // If the block does not dominate the latch then this is not a diamond.
+    // Such a loop may not benefit from index splitting.
+ if (!DT->dominates((*I), Latch))
+ continue;
+
+    // If the split condition branch's successors do not have the split
+    // condition block as their single predecessor, then it is not possible
+    // to remove the inactive branch.
+ if (!BR->getSuccessor(0)->getSinglePredecessor()
+ || !BR->getSuccessor(1)->getSinglePredecessor())
+ return false;
+
+ // If the merge point for BR is not loop latch then skip this condition.
+ if (BR->getSuccessor(0) != Latch) {
+ DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0));
+ assert (DF0 != DF->end() && "Unable to find dominance frontier");
+ if (!DF0->second.count(Latch))
+ continue;
+ }
+
+ if (BR->getSuccessor(1) != Latch) {
+ DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1));
+ assert (DF1 != DF->end() && "Unable to find dominance frontier");
+ if (!DF1->second.count(Latch))
+ continue;
+ }
+ SplitCondition = CI;
+ SBR = BR;
+ break;
+ }
+
+ if (!SplitCondition)
+ return false;
+
+ // If the predicate sign does not match then skip.
+ if (ExitCondition->isSignedPredicate() != SplitCondition->isSignedPredicate())
+ return false;
+
+ unsigned EVOpNum = (ExitCondition->getOperand(1) == IVExitValue);
+ unsigned SVOpNum = IVBasedValues.count(SplitCondition->getOperand(0));
+ Value *SplitValue = SplitCondition->getOperand(SVOpNum);
+ if (!L->isLoopInvariant(SplitValue))
+ return false;
+ if (!IVBasedValues.count(SplitCondition->getOperand(!SVOpNum)))
+ return false;
+
+ // Normalize loop conditions so that it is easier to calculate new loop
+ // bounds.
+ if (IVisGT(*ExitCondition) || IVisGE(*ExitCondition)) {
+ ExitCondition->setPredicate(ExitCondition->getInversePredicate());
+ BasicBlock *T = EBR->getSuccessor(0);
+ EBR->setSuccessor(0, EBR->getSuccessor(1));
+ EBR->setSuccessor(1, T);
+ }
+
+ if (IVisGT(*SplitCondition) || IVisGE(*SplitCondition)) {
+ SplitCondition->setPredicate(SplitCondition->getInversePredicate());
+ BasicBlock *T = SBR->getSuccessor(0);
+ SBR->setSuccessor(0, SBR->getSuccessor(1));
+ SBR->setSuccessor(1, T);
+ }
+
+ //[*] Calculate new loop bounds.
+ Value *AEV = SplitValue;
+ Value *BSV = SplitValue;
+ bool Sign = SplitCondition->isSignedPredicate();
+ Instruction *PHTerm = L->getLoopPreheader()->getTerminator();
+
+ if (IVisLT(*ExitCondition)) {
+ if (IVisLT(*SplitCondition)) {
+ /* Do nothing */
+ }
+ else if (IVisLE(*SplitCondition)) {
+ AEV = getPlusOne(SplitValue, Sign, PHTerm);
+ BSV = getPlusOne(SplitValue, Sign, PHTerm);
+ } else {
+ assert (0 && "Unexpected split condition!");
+ }
+ }
+ else if (IVisLE(*ExitCondition)) {
+ if (IVisLT(*SplitCondition)) {
+ AEV = getMinusOne(SplitValue, Sign, PHTerm);
+ }
+ else if (IVisLE(*SplitCondition)) {
+ BSV = getPlusOne(SplitValue, Sign, PHTerm);
+ } else {
+ assert (0 && "Unexpected split condition!");
+ }
+ } else {
+ assert (0 && "Unexpected exit condition!");
+ }
+ AEV = getMin(AEV, IVExitValue, Sign, PHTerm);
+ BSV = getMax(BSV, IVStartValue, Sign, PHTerm);
+
+ // [*] Clone Loop
+ DenseMap<const Value *, Value *> ValueMap;
+ Loop *BLoop = CloneLoop(L, LPM, LI, ValueMap, this);
+ Loop *ALoop = L;
+
+ // [*] ALoop's exiting edge enters BLoop's header.
+ // ALoop's original exit block becomes BLoop's exit block.
+ PHINode *B_IndVar = cast<PHINode>(ValueMap[IndVar]);
+ BasicBlock *A_ExitingBlock = ExitCondition->getParent();
+ BranchInst *A_ExitInsn =
+ dyn_cast<BranchInst>(A_ExitingBlock->getTerminator());
+ assert (A_ExitInsn && "Unable to find suitable loop exit branch");
+ BasicBlock *B_ExitBlock = A_ExitInsn->getSuccessor(1);
+ BasicBlock *B_Header = BLoop->getHeader();
+ if (ALoop->contains(B_ExitBlock)) {
+ B_ExitBlock = A_ExitInsn->getSuccessor(0);
+ A_ExitInsn->setSuccessor(0, B_Header);
+ } else
+ A_ExitInsn->setSuccessor(1, B_Header);
+
+ // [*] Update ALoop's exit value using new exit value.
+ ExitCondition->setOperand(EVOpNum, AEV);
+
+  // [*] Update BLoop's header phi nodes. Remove incoming PHINode values from
+  // the original loop's preheader. Add incoming PHINode values from
+  // ALoop's exiting block. Update BLoop header's dominator info.
+
+ // Collect inverse map of Header PHINodes.
+ DenseMap<Value *, Value *> InverseMap;
+ for (BasicBlock::iterator BI = ALoop->getHeader()->begin(),
+ BE = ALoop->getHeader()->end(); BI != BE; ++BI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+ PHINode *PNClone = cast<PHINode>(ValueMap[PN]);
+ InverseMap[PNClone] = PN;
+ } else
+ break;
+ }
+
+ BasicBlock *A_Preheader = ALoop->getLoopPreheader();
+ for (BasicBlock::iterator BI = B_Header->begin(), BE = B_Header->end();
+ BI != BE; ++BI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+ // Remove incoming value from original preheader.
+ PN->removeIncomingValue(A_Preheader);
+
+ // Add incoming value from A_ExitingBlock.
+ if (PN == B_IndVar)
+ PN->addIncoming(BSV, A_ExitingBlock);
+ else {
+ PHINode *OrigPN = cast<PHINode>(InverseMap[PN]);
+ Value *V2 = NULL;
+      // If the loop header is also the loop's exiting block then OrigPN
+      // itself is the incoming value for BLoop's header.
+ if (A_ExitingBlock == ALoop->getHeader())
+ V2 = OrigPN;
+ else
+ V2 = OrigPN->getIncomingValueForBlock(A_ExitingBlock);
+ PN->addIncoming(V2, A_ExitingBlock);
+ }
+ } else
+ break;
+ }
+
+ DT->changeImmediateDominator(B_Header, A_ExitingBlock);
+ DF->changeImmediateDominator(B_Header, A_ExitingBlock, DT);
+
+  // [*] Update BLoop's exit block. Its new predecessor is BLoop's exiting
+  // block. Remove incoming PHINode values from ALoop's exiting block and
+  // add the corresponding values coming from BLoop's exiting block. Update
+  // BLoop exit block's dominator info.
+ BasicBlock *B_ExitingBlock = cast<BasicBlock>(ValueMap[A_ExitingBlock]);
+ for (BasicBlock::iterator BI = B_ExitBlock->begin(), BE = B_ExitBlock->end();
+ BI != BE; ++BI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+ PN->addIncoming(ValueMap[PN->getIncomingValueForBlock(A_ExitingBlock)],
+ B_ExitingBlock);
+ PN->removeIncomingValue(A_ExitingBlock);
+ } else
+ break;
+ }
+
+ DT->changeImmediateDominator(B_ExitBlock, B_ExitingBlock);
+ DF->changeImmediateDominator(B_ExitBlock, B_ExitingBlock, DT);
+
+ //[*] Split ALoop's exit edge. This creates a new block which
+  // serves two purposes. The first is to hold PHINode definitions
+  // that keep ALoop in LCSSA form. The second is to act
+ // as a preheader for BLoop.
+ BasicBlock *A_ExitBlock = SplitEdge(A_ExitingBlock, B_Header, this);
+
+ //[*] Preserve ALoop's LCSSA form. Create new forwarding PHINodes
+ // in A_ExitBlock to redefine outgoing PHI definitions from ALoop.
+ for(BasicBlock::iterator BI = B_Header->begin(), BE = B_Header->end();
+ BI != BE; ++BI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+ Value *V1 = PN->getIncomingValueForBlock(A_ExitBlock);
+ PHINode *newPHI = PHINode::Create(PN->getType(), PN->getName());
+ newPHI->addIncoming(V1, A_ExitingBlock);
+ A_ExitBlock->getInstList().push_front(newPHI);
+ PN->removeIncomingValue(A_ExitBlock);
+ PN->addIncoming(newPHI, A_ExitBlock);
+ } else
+ break;
+ }
+
+ //[*] Eliminate split condition's inactive branch from ALoop.
+ BasicBlock *A_SplitCondBlock = SplitCondition->getParent();
+ BranchInst *A_BR = cast<BranchInst>(A_SplitCondBlock->getTerminator());
+ BasicBlock *A_InactiveBranch = NULL;
+ BasicBlock *A_ActiveBranch = NULL;
+ A_ActiveBranch = A_BR->getSuccessor(0);
+ A_InactiveBranch = A_BR->getSuccessor(1);
+ A_BR->setUnconditionalDest(A_ActiveBranch);
+ removeBlocks(A_InactiveBranch, L, A_ActiveBranch);
+
+  //[*] Eliminate split condition's inactive branch from BLoop.
+ BasicBlock *B_SplitCondBlock = cast<BasicBlock>(ValueMap[A_SplitCondBlock]);
+ BranchInst *B_BR = cast<BranchInst>(B_SplitCondBlock->getTerminator());
+ BasicBlock *B_InactiveBranch = NULL;
+ BasicBlock *B_ActiveBranch = NULL;
+ B_ActiveBranch = B_BR->getSuccessor(1);
+ B_InactiveBranch = B_BR->getSuccessor(0);
+ B_BR->setUnconditionalDest(B_ActiveBranch);
+ removeBlocks(B_InactiveBranch, BLoop, B_ActiveBranch);
+
+ BasicBlock *A_Header = ALoop->getHeader();
+ if (A_ExitingBlock == A_Header)
+ return true;
+
+ //[*] Move exit condition into split condition block to avoid
+  // executing a dead loop iteration.
+ ICmpInst *B_ExitCondition = cast<ICmpInst>(ValueMap[ExitCondition]);
+ Instruction *B_IndVarIncrement = cast<Instruction>(ValueMap[IVIncrement]);
+ ICmpInst *B_SplitCondition = cast<ICmpInst>(ValueMap[SplitCondition]);
+
+ moveExitCondition(A_SplitCondBlock, A_ActiveBranch, A_ExitBlock, ExitCondition,
+ cast<ICmpInst>(SplitCondition), IndVar, IVIncrement,
+ ALoop, EVOpNum);
+
+ moveExitCondition(B_SplitCondBlock, B_ActiveBranch,
+ B_ExitBlock, B_ExitCondition,
+ B_SplitCondition, B_IndVar, B_IndVarIncrement,
+ BLoop, EVOpNum);
+
+ NumIndexSplit++;
+ return true;
+}
+
+/// cleanBlock - A block is considered clean if all non-terminator instructions
+/// are either PHINodes or IV based values.
+bool LoopIndexSplit::cleanBlock(BasicBlock *BB) {
+ Instruction *Terminator = BB->getTerminator();
+ for(BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE; ++BI) {
+ Instruction *I = BI;
+
+ if (isa<PHINode>(I) || I == Terminator || I == ExitCondition
+ || I == SplitCondition || IVBasedValues.count(I)
+ || isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ if (I->mayHaveSideEffects())
+ return false;
+
+    // If I is used only inside this block then it is OK.
+ bool usedOutsideBB = false;
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI) {
+ Instruction *U = cast<Instruction>(UI);
+ if (U->getParent() != BB)
+ usedOutsideBB = true;
+ }
+ if (!usedOutsideBB)
+ continue;
+
+    // Otherwise we have an instruction that may not allow loop splitting.
+ return false;
+ }
+ return true;
+}
+
+/// IVisLT - If Op is comparing an IV based value with a loop invariant and
+/// the IV based value is less than the loop invariant then return the loop
+/// invariant. Otherwise return NULL.
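+///
+/// For example (a sketch): for "icmp slt %iv.next, %n", where %iv.next is IV
+/// based and %n is loop invariant, this returns %n; the swapped form
+/// "icmp sgt %n, %iv.next" also returns %n.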
+Value * LoopIndexSplit::IVisLT(ICmpInst &Op) {
+ ICmpInst::Predicate P = Op.getPredicate();
+ if ((P == ICmpInst::ICMP_SLT || P == ICmpInst::ICMP_ULT)
+ && IVBasedValues.count(Op.getOperand(0))
+ && L->isLoopInvariant(Op.getOperand(1)))
+ return Op.getOperand(1);
+
+ if ((P == ICmpInst::ICMP_SGT || P == ICmpInst::ICMP_UGT)
+ && IVBasedValues.count(Op.getOperand(1))
+ && L->isLoopInvariant(Op.getOperand(0)))
+ return Op.getOperand(0);
+
+ return NULL;
+}
+
+/// IVisLE - If Op compares an IV-based value with a loop invariant and
+/// the IV-based value is less than or equal to the loop invariant then
+/// return the loop invariant. Otherwise return NULL.
+Value * LoopIndexSplit::IVisLE(ICmpInst &Op) {
+ ICmpInst::Predicate P = Op.getPredicate();
+ if ((P == ICmpInst::ICMP_SLE || P == ICmpInst::ICMP_ULE)
+ && IVBasedValues.count(Op.getOperand(0))
+ && L->isLoopInvariant(Op.getOperand(1)))
+ return Op.getOperand(1);
+
+ if ((P == ICmpInst::ICMP_SGE || P == ICmpInst::ICMP_UGE)
+ && IVBasedValues.count(Op.getOperand(1))
+ && L->isLoopInvariant(Op.getOperand(0)))
+ return Op.getOperand(0);
+
+ return NULL;
+}
+
+/// IVisGT - If Op compares an IV-based value with a loop invariant and
+/// the IV-based value is greater than the loop invariant then return the
+/// loop invariant. Otherwise return NULL.
+Value * LoopIndexSplit::IVisGT(ICmpInst &Op) {
+ ICmpInst::Predicate P = Op.getPredicate();
+ if ((P == ICmpInst::ICMP_SGT || P == ICmpInst::ICMP_UGT)
+ && IVBasedValues.count(Op.getOperand(0))
+ && L->isLoopInvariant(Op.getOperand(1)))
+ return Op.getOperand(1);
+
+ if ((P == ICmpInst::ICMP_SLT || P == ICmpInst::ICMP_ULT)
+ && IVBasedValues.count(Op.getOperand(1))
+ && L->isLoopInvariant(Op.getOperand(0)))
+ return Op.getOperand(0);
+
+ return NULL;
+}
+
+/// IVisGE - If Op compares an IV-based value with a loop invariant and
+/// the IV-based value is greater than or equal to the loop invariant then
+/// return the loop invariant. Otherwise return NULL.
+Value * LoopIndexSplit::IVisGE(ICmpInst &Op) {
+ ICmpInst::Predicate P = Op.getPredicate();
+ if ((P == ICmpInst::ICMP_SGE || P == ICmpInst::ICMP_UGE)
+ && IVBasedValues.count(Op.getOperand(0))
+ && L->isLoopInvariant(Op.getOperand(1)))
+ return Op.getOperand(1);
+
+ if ((P == ICmpInst::ICMP_SLE || P == ICmpInst::ICMP_ULE)
+ && IVBasedValues.count(Op.getOperand(1))
+ && L->isLoopInvariant(Op.getOperand(0)))
+ return Op.getOperand(0);
+
+ return NULL;
+}
+
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
new file mode 100644
index 0000000..a088230
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -0,0 +1,572 @@
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
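+//
+// A rough sketch of the effect (hypothetical C-level view, not the IR the
+// pass operates on):
+//
+//   while (cond) { body; }          // exit test sits in the loop header
+//
+// becomes, after one rotation,
+//
+//   if (cond)                       // cloned header test guards loop entry
+//     do { body; } while (cond);    // old header becomes the loop latch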
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-rotate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+#define MAX_HEADER_SIZE 16
+
+STATISTIC(NumRotated, "Number of loops rotated");
+namespace {
+
+ class VISIBILITY_HIDDEN RenameData {
+ public:
+ RenameData(Instruction *O, Value *P, Instruction *H)
+ : Original(O), PreHeader(P), Header(H) { }
+ public:
+ Instruction *Original; // Original instruction
+ Value *PreHeader; // Original pre-header replacement
+ Instruction *Header; // New header replacement
+ };
+
+ class VISIBILITY_HIDDEN LoopRotate : public LoopPass {
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopRotate() : LoopPass(&ID) {}
+
+ // Rotate Loop L as many times as possible. Return true if
+ // loop is rotated at least once.
+ bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ // LCSSA form makes instruction renaming easier.
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<ScalarEvolution>();
+ AU.addPreserved<LoopInfo>();
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<DominanceFrontier>();
+ }
+
+ // Helper functions
+
+ /// Do actual work
+ bool rotateLoop(Loop *L, LPPassManager &LPM);
+
+ /// Initialize local data
+ void initialize();
+
+ /// Make sure all Exit block PHINodes have required incoming values.
+ /// If incoming value is constant or defined outside the loop then
+ /// PHINode may not have an entry for original pre-header.
+ void updateExitBlock();
+
+ /// Return true if this instruction is used outside original header.
+ bool usedOutsideOriginalHeader(Instruction *In);
+
+ /// Find Replacement information for instruction. Return NULL if it is
+ /// not available.
+ const RenameData *findReplacementData(Instruction *I);
+
+    /// After loop rotation, the loop pre-header has multiple successors.
+ /// Insert one forwarding basic block to ensure that loop pre-header
+ /// has only one successor.
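+    /// Sketch (hypothetical CFG): if the old pre-header P branches to both
+    /// NewHeader and Exit, a fresh block with an unconditional branch to
+    /// NewHeader is inserted so that P reaches NewHeader only through it,
+    /// restoring a single-successor pre-header.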
+ void preserveCanonicalLoopForm(LPPassManager &LPM);
+
+ private:
+
+ Loop *L;
+ BasicBlock *OrigHeader;
+ BasicBlock *OrigPreHeader;
+ BasicBlock *OrigLatch;
+ BasicBlock *NewHeader;
+ BasicBlock *Exit;
+ LPPassManager *LPM_Ptr;
+ SmallVector<RenameData, MAX_HEADER_SIZE> LoopHeaderInfo;
+ };
+}
+
+char LoopRotate::ID = 0;
+static RegisterPass<LoopRotate> X("loop-rotate", "Rotate Loops");
+
+Pass *llvm::createLoopRotatePass() { return new LoopRotate(); }
+
+/// Rotate Loop L as many times as possible. Return true if
+/// loop is rotated at least once.
+bool LoopRotate::runOnLoop(Loop *Lp, LPPassManager &LPM) {
+
+ bool RotatedOneLoop = false;
+ initialize();
+ LPM_Ptr = &LPM;
+
+ // One loop can be rotated multiple times.
+ while (rotateLoop(Lp,LPM)) {
+ RotatedOneLoop = true;
+ initialize();
+ }
+
+ return RotatedOneLoop;
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) {
+ L = Lp;
+
+ OrigHeader = L->getHeader();
+ OrigPreHeader = L->getLoopPreheader();
+ OrigLatch = L->getLoopLatch();
+
+ // If loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1)
+ return false;
+
+ assert(OrigHeader && OrigLatch && OrigPreHeader &&
+ "Loop is not in canonical form");
+
+  // If the loop header is not one of the loop exit blocks then
+  // either this loop is already rotated or it is not
+  // suitable for loop rotation transformations.
+ if (!L->isLoopExit(OrigHeader))
+ return false;
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI)
+ return false;
+ assert(BI->isConditional() && "Branch Instruction is not conditional");
+
+  // Updating PHI nodes in loops with multiple exits adds complexity.
+  // Keep it simple, and restrict loop rotation to loops with one exit only.
+  // In the future, lift this restriction and add support for multiple exits
+  // if required.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() > 1)
+ return false;
+
+ // Check size of original header and reject
+ // loop if it is very big.
+ unsigned Size = 0;
+
+ // FIXME: Use common api to estimate size.
+ for (BasicBlock::const_iterator OI = OrigHeader->begin(),
+ OE = OrigHeader->end(); OI != OE; ++OI) {
+ if (isa<PHINode>(OI))
+ continue; // PHI nodes don't count.
+ if (isa<DbgInfoIntrinsic>(OI))
+ continue; // Debug intrinsics don't count as size.
+ Size++;
+ }
+
+ if (Size > MAX_HEADER_SIZE)
+ return false;
+
+ // Now, this loop is suitable for rotation.
+
+  // Find the new loop header. NewHeader is the header's one and only
+  // successor that is inside the loop. The header's other successor is
+  // outside the loop. Otherwise the loop is not suitable for rotation.
+ Exit = BI->getSuccessor(0);
+ NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that new header has exactly one predecessor. Remove any
+ // single entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Copy PHI nodes and other instructions from original header
+ // into original pre-header. Unlike original header, original pre-header is
+ // not a member of loop.
+ //
+ // New loop header is one and only successor of original header that
+ // is inside the loop. All other original header successors are outside
+ // the loop. Copy PHI Nodes from original header into new loop header.
+ // Add second incoming value, from original loop pre-header into these phi
+  // nodes. If a value defined in the original header is used outside the
+  // original header then the new loop header will need new phi nodes with
+  // two incoming values: one definition from the original header and a
+  // second definition from the original loop pre-header.
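+  //
+  // For instance (hypothetical IR): a header PHI such as
+  //   %i = phi i32 [ 0, %preheader ], [ %i.next, %latch ]
+  // is not cloned; its pre-header value (0) is propagated directly, and a
+  // new PHI in the new header merges %i.next (incoming via the original
+  // header) with 0 (incoming from the original pre-header).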
+
+ // Remove terminator from Original pre-header. Original pre-header will
+ // receive a clone of original header terminator as a new terminator.
+ OrigPreHeader->getInstList().pop_back();
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ PHINode *PN = 0;
+ for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+ // PHI nodes are not copied into original pre-header. Instead their values
+ // are directly propagated.
+ Value *NPV = PN->getIncomingValueForBlock(OrigPreHeader);
+
+ // Create new PHI node with two incoming values for NewHeader.
+ // One incoming value is from OrigLatch (through OrigHeader) and
+ // second incoming value is from original pre-header.
+ PHINode *NH = PHINode::Create(PN->getType(), PN->getName(),
+ NewHeader->begin());
+ NH->addIncoming(PN->getIncomingValueForBlock(OrigLatch), OrigHeader);
+ NH->addIncoming(NPV, OrigPreHeader);
+
+ // "In" can be replaced by NH at various places.
+ LoopHeaderInfo.push_back(RenameData(PN, NPV, NH));
+ }
+
+ // Now, handle non-phi instructions.
+ for (; I != E; ++I) {
+ Instruction *In = I;
+ assert(!isa<PHINode>(In) && "PHINode is not expected here");
+
+ // This is not a PHI instruction. Insert its clone into original pre-header.
+ // If this instruction is using a value from same basic block then
+ // update it to use value from cloned instruction.
+ Instruction *C = In->clone();
+ C->setName(In->getName());
+ OrigPreHeader->getInstList().push_back(C);
+
+ for (unsigned opi = 0, e = In->getNumOperands(); opi != e; ++opi) {
+ Instruction *OpInsn = dyn_cast<Instruction>(In->getOperand(opi));
+ if (!OpInsn) continue; // Ignore non-instruction values.
+ if (const RenameData *D = findReplacementData(OpInsn))
+ C->setOperand(opi, D->PreHeader);
+ }
+
+ // If this instruction is used outside this basic block then
+ // create new PHINode for this instruction.
+ Instruction *NewHeaderReplacement = NULL;
+ if (usedOutsideOriginalHeader(In)) {
+ PHINode *PN = PHINode::Create(In->getType(), In->getName(),
+ NewHeader->begin());
+ PN->addIncoming(In, OrigHeader);
+ PN->addIncoming(C, OrigPreHeader);
+ NewHeaderReplacement = PN;
+ }
+ LoopHeaderInfo.push_back(RenameData(In, C, NewHeaderReplacement));
+ }
+
+  // Rename uses of original header instructions to reflect their new
+  // definitions (either from the original pre-header node or from newly
+  // created new header PHINodes).
+ //
+ // Original header instructions are used in
+ // 1) Original header:
+ //
+  //      If the instruction is used by non-phi instructions then it is using
+  //      a definition from the original header itself. Do not replace this
+  //      use with a definition from the new header or original pre-header.
+  //
+  //      If the instruction is used by a phi node then it is an incoming
+  //      value. Rename its use to reflect the new definition from the new
+  //      pre-header or new header.
+ //
+ // 2) Inside loop but not in original header
+ //
+ // Replace this use to reflect definition from new header.
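+  //
+  // 3) Inside the exit block or beyond it
+  //
+  //      Exit block PHI nodes (LCSSA) receive a second incoming value from
+  //      the original pre-header; uses further outside are routed through a
+  //      new PHI node created in the exit block. Both cases are handled
+  //      below.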
+ for (unsigned LHI = 0, LHI_E = LoopHeaderInfo.size(); LHI != LHI_E; ++LHI) {
+ const RenameData &ILoopHeaderInfo = LoopHeaderInfo[LHI];
+
+ if (!ILoopHeaderInfo.Header)
+ continue;
+
+ Instruction *OldPhi = ILoopHeaderInfo.Original;
+ Instruction *NewPhi = ILoopHeaderInfo.Header;
+
+ // Before replacing uses, collect them first, so that iterator is
+ // not invalidated.
+ SmallVector<Instruction *, 16> AllUses;
+ for (Value::use_iterator UI = OldPhi->use_begin(), UE = OldPhi->use_end();
+ UI != UE; ++UI)
+ AllUses.push_back(cast<Instruction>(UI));
+
+ for (SmallVector<Instruction *, 16>::iterator UI = AllUses.begin(),
+ UE = AllUses.end(); UI != UE; ++UI) {
+ Instruction *U = *UI;
+ BasicBlock *Parent = U->getParent();
+
+ // Used inside original header
+ if (Parent == OrigHeader) {
+ // Do not rename uses inside original header non-phi instructions.
+ PHINode *PU = dyn_cast<PHINode>(U);
+ if (!PU)
+ continue;
+
+ // Do not rename uses inside original header phi nodes, if the
+ // incoming value is for new header.
+ if (PU->getBasicBlockIndex(NewHeader) != -1
+ && PU->getIncomingValueForBlock(NewHeader) == U)
+ continue;
+
+ U->replaceUsesOfWith(OldPhi, NewPhi);
+ continue;
+ }
+
+ // Used inside loop, but not in original header.
+ if (L->contains(U->getParent())) {
+ if (U != NewPhi)
+ U->replaceUsesOfWith(OldPhi, NewPhi);
+ continue;
+ }
+
+ // Used inside Exit Block. Since we are in LCSSA form, U must be PHINode.
+ if (U->getParent() == Exit) {
+ assert(isa<PHINode>(U) && "Use in Exit Block that is not PHINode");
+
+ PHINode *UPhi = cast<PHINode>(U);
+ // UPhi already has one incoming argument from original header.
+ // Add second incoming argument from new Pre header.
+ UPhi->addIncoming(ILoopHeaderInfo.PreHeader, OrigPreHeader);
+ } else {
+        // Used outside Exit block. Create a new PHI node in the exit block
+        // to receive the value from the new pre-header and the original
+        // header.
+ PHINode *PN = PHINode::Create(U->getType(), U->getName(),
+ Exit->begin());
+ PN->addIncoming(ILoopHeaderInfo.PreHeader, OrigPreHeader);
+ PN->addIncoming(OldPhi, OrigHeader);
+ U->replaceUsesOfWith(OldPhi, PN);
+ }
+ }
+ }
+
+ /// Make sure all Exit block PHINodes have required incoming values.
+ updateExitBlock();
+
+ // Update CFG
+
+ // Removing incoming branch from loop preheader to original header.
+ // Now original header is inside the loop.
+ for (BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ I != E; ++I)
+ if (PHINode *PN = dyn_cast<PHINode>(I))
+ PN->removeIncomingValue(OrigPreHeader);
+
+ // Make NewHeader as the new header for the loop.
+ L->moveToHeader(NewHeader);
+
+ preserveCanonicalLoopForm(LPM);
+
+ NumRotated++;
+ return true;
+}
+
+/// Make sure all Exit block PHINodes have required incoming values.
+/// If incoming value is constant or defined outside the loop then
+/// PHINode may not have an entry for original pre-header.
+void LoopRotate::updateExitBlock() {
+
+ for (BasicBlock::iterator I = Exit->begin(), E = Exit->end();
+ I != E; ++I) {
+
+ PHINode *PN = dyn_cast<PHINode>(I);
+ if (!PN)
+ break;
+
+ // There is already one incoming value from original pre-header block.
+ if (PN->getBasicBlockIndex(OrigPreHeader) != -1)
+ continue;
+
+ const RenameData *ILoopHeaderInfo;
+ Value *V = PN->getIncomingValueForBlock(OrigHeader);
+ if (isa<Instruction>(V) &&
+ (ILoopHeaderInfo = findReplacementData(cast<Instruction>(V)))) {
+ assert(ILoopHeaderInfo->PreHeader && "Missing New Preheader Instruction");
+ PN->addIncoming(ILoopHeaderInfo->PreHeader, OrigPreHeader);
+ } else {
+ PN->addIncoming(V, OrigPreHeader);
+ }
+ }
+}
+
+/// Initialize local data
+void LoopRotate::initialize() {
+ L = NULL;
+ OrigHeader = NULL;
+ OrigPreHeader = NULL;
+ NewHeader = NULL;
+ Exit = NULL;
+
+ LoopHeaderInfo.clear();
+}
+
+/// Return true if this instruction is used by any instructions in the loop that
+/// aren't in original header.
+bool LoopRotate::usedOutsideOriginalHeader(Instruction *In) {
+ for (Value::use_iterator UI = In->use_begin(), UE = In->use_end();
+ UI != UE; ++UI) {
+ BasicBlock *UserBB = cast<Instruction>(UI)->getParent();
+ if (UserBB != OrigHeader && L->contains(UserBB))
+ return true;
+ }
+
+ return false;
+}
+
+/// Find Replacement information for instruction. Return NULL if it is
+/// not available.
+const RenameData *LoopRotate::findReplacementData(Instruction *In) {
+
+ // Since LoopHeaderInfo is small, linear walk is OK.
+ for (unsigned LHI = 0, LHI_E = LoopHeaderInfo.size(); LHI != LHI_E; ++LHI) {
+ const RenameData &ILoopHeaderInfo = LoopHeaderInfo[LHI];
+ if (ILoopHeaderInfo.Original == In)
+ return &ILoopHeaderInfo;
+ }
+ return NULL;
+}
+
+/// After loop rotation, the loop pre-header has multiple successors.
+/// Insert one forwarding basic block to ensure that the loop pre-header
+/// has only one successor.
+void LoopRotate::preserveCanonicalLoopForm(LPPassManager &LPM) {
+
+ // Right now original pre-header has two successors, new header and
+ // exit block. Insert new block between original pre-header and
+ // new header such that loop's new pre-header has only one successor.
+ BasicBlock *NewPreHeader = BasicBlock::Create("bb.nph",
+ OrigHeader->getParent(),
+ NewHeader);
+ LoopInfo &LI = LPM.getAnalysis<LoopInfo>();
+ if (Loop *PL = LI.getLoopFor(OrigPreHeader))
+ PL->addBasicBlockToLoop(NewPreHeader, LI.getBase());
+ BranchInst::Create(NewHeader, NewPreHeader);
+
+ BranchInst *OrigPH_BI = cast<BranchInst>(OrigPreHeader->getTerminator());
+ if (OrigPH_BI->getSuccessor(0) == NewHeader)
+ OrigPH_BI->setSuccessor(0, NewPreHeader);
+ else {
+ assert(OrigPH_BI->getSuccessor(1) == NewHeader &&
+ "Unexpected original pre-header terminator");
+ OrigPH_BI->setSuccessor(1, NewPreHeader);
+ }
+
+ for (BasicBlock::iterator I = NewHeader->begin(), E = NewHeader->end();
+ I != E; ++I) {
+ PHINode *PN = dyn_cast<PHINode>(I);
+ if (!PN)
+ break;
+
+ int index = PN->getBasicBlockIndex(OrigPreHeader);
+ assert(index != -1 && "Expected incoming value from Original PreHeader");
+ PN->setIncomingBlock(index, NewPreHeader);
+ assert(PN->getBasicBlockIndex(OrigPreHeader) == -1 &&
+ "Expected only one incoming value from Original PreHeader");
+ }
+
+ if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
+ DT->addNewBlock(NewPreHeader, OrigPreHeader);
+ DT->changeImmediateDominator(L->getHeader(), NewPreHeader);
+ DT->changeImmediateDominator(Exit, OrigPreHeader);
+ for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+ BI != BE; ++BI) {
+ BasicBlock *B = *BI;
+ if (L->getHeader() != B) {
+ DomTreeNode *Node = DT->getNode(B);
+ if (Node && Node->getBlock() == OrigHeader)
+ DT->changeImmediateDominator(*BI, L->getHeader());
+ }
+ }
+ DT->changeImmediateDominator(OrigHeader, OrigLatch);
+ }
+
+ if (DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>()) {
+ // New Preheader's dominance frontier is Exit block.
+ DominanceFrontier::DomSetType NewPHSet;
+ NewPHSet.insert(Exit);
+ DF->addBasicBlock(NewPreHeader, NewPHSet);
+
+ // New Header's dominance frontier now includes itself and Exit block
+ DominanceFrontier::iterator HeadI = DF->find(L->getHeader());
+ if (HeadI != DF->end()) {
+ DominanceFrontier::DomSetType & HeaderSet = HeadI->second;
+ HeaderSet.clear();
+ HeaderSet.insert(L->getHeader());
+ HeaderSet.insert(Exit);
+ } else {
+ DominanceFrontier::DomSetType HeaderSet;
+ HeaderSet.insert(L->getHeader());
+ HeaderSet.insert(Exit);
+ DF->addBasicBlock(L->getHeader(), HeaderSet);
+ }
+
+ // Original header (new Loop Latch)'s dominance frontier is Exit.
+ DominanceFrontier::iterator LatchI = DF->find(L->getLoopLatch());
+ if (LatchI != DF->end()) {
+      DominanceFrontier::DomSetType &LatchSet = LatchI->second;
+      LatchSet.clear();
+ LatchSet.insert(Exit);
+ } else {
+ DominanceFrontier::DomSetType LatchSet;
+ LatchSet.insert(Exit);
+      DF->addBasicBlock(L->getLoopLatch(), LatchSet);
+ }
+
+ // If a loop block dominates new loop latch then its frontier is
+ // new header and Exit.
+ BasicBlock *NewLatch = L->getLoopLatch();
+ DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
+ for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+ BI != BE; ++BI) {
+ BasicBlock *B = *BI;
+ if (DT->dominates(B, NewLatch)) {
+ DominanceFrontier::iterator BDFI = DF->find(B);
+ if (BDFI != DF->end()) {
+          DominanceFrontier::DomSetType &BSet = BDFI->second;
+ BSet.clear();
+ BSet.insert(L->getHeader());
+ BSet.insert(Exit);
+ } else {
+ DominanceFrontier::DomSetType BSet;
+ BSet.insert(L->getHeader());
+ BSet.insert(Exit);
+ DF->addBasicBlock(B, BSet);
+ }
+ }
+ }
+ }
+
+ // Preserve canonical loop form, which means Exit block should
+ // have only one predecessor.
+ BasicBlock *NExit = SplitEdge(L->getLoopLatch(), Exit, this);
+
+ // Preserve LCSSA.
+ BasicBlock::iterator I = Exit->begin(), E = Exit->end();
+ PHINode *PN = NULL;
+ for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+ unsigned N = PN->getNumIncomingValues();
+ for (unsigned index = 0; index < N; ++index)
+ if (PN->getIncomingBlock(index) == NExit) {
+ PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName(),
+ NExit->begin());
+ NewPN->addIncoming(PN->getIncomingValue(index), L->getLoopLatch());
+ PN->setIncomingValue(index, NewPN);
+ PN->setIncomingBlock(index, NExit);
+ break;
+ }
+ }
+
+ assert(NewHeader && L->getHeader() == NewHeader &&
+ "Invalid loop header after loop rotation");
+ assert(NewPreHeader && L->getLoopPreheader() == NewPreHeader &&
+ "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() &&
+ "Invalid loop latch after loop rotation");
+}
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
new file mode 100644
index 0000000..92270b5
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -0,0 +1,2605 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs a strength reduction on array references inside loops
+// that have the loop induction variable as one or more of their components;
+// it rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
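+//
+// A small illustrative example (hypothetical C, not code from this file):
+//
+//   for (i = 0; i < n; ++i)     =>   for (p = a; p != a + n; ++p)
+//     sum += a[i];                     sum += *p;
+//
+// The address computation a + i*sizeof(*a) implicit in a[i] is strength-
+// reduced to a simple pointer increment.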
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-reduce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumReduced , "Number of IV uses strength reduced");
+STATISTIC(NumInserted, "Number of PHIs inserted");
+STATISTIC(NumVariable, "Number of PHIs with variable strides");
+STATISTIC(NumEliminated, "Number of strides eliminated");
+STATISTIC(NumShadow, "Number of Shadow IVs optimized");
+STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses");
+STATISTIC(NumLoopCond, "Number of loop terminating conds optimized");
+
+static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
+ cl::init(false),
+ cl::Hidden);
+
+namespace {
+
+ struct BasedUser;
+
+  /// IVExpr - This structure keeps track of one IV expression inserted during
+  /// StrengthReduceStridedIVUsers. It contains the stride, the common base, as
+  /// well as the PHI node and increment value created for rewrite.
+ struct VISIBILITY_HIDDEN IVExpr {
+ SCEVHandle Stride;
+ SCEVHandle Base;
+ PHINode *PHI;
+
+ IVExpr(const SCEVHandle &stride, const SCEVHandle &base, PHINode *phi)
+ : Stride(stride), Base(base), PHI(phi) {}
+ };
+
+  /// IVsOfOneStride - This structure keeps track of all IV expressions
+  /// inserted during StrengthReduceStridedIVUsers for a particular stride
+  /// of the IV.
+ struct VISIBILITY_HIDDEN IVsOfOneStride {
+ std::vector<IVExpr> IVs;
+
+ void addIV(const SCEVHandle &Stride, const SCEVHandle &Base, PHINode *PHI) {
+ IVs.push_back(IVExpr(Stride, Base, PHI));
+ }
+ };
+
+ class VISIBILITY_HIDDEN LoopStrengthReduce : public LoopPass {
+ IVUsers *IU;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ bool Changed;
+
+ /// IVsByStride - Keep track of all IVs that have been inserted for a
+ /// particular stride.
+ std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
+
+ /// StrideNoReuse - Keep track of all the strides whose ivs cannot be
+ /// reused (nor should they be rewritten to reuse other strides).
+ SmallSet<SCEVHandle, 4> StrideNoReuse;
+
+ /// DeadInsts - Keep track of instructions we may have made dead, so that
+ /// we can remove them after we are done working.
+ SmallVector<WeakVH, 16> DeadInsts;
+
+    /// TLI - Keep a pointer to a TargetLowering to consult for determining
+    /// transformation profitability.
+ const TargetLowering *TLI;
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ explicit LoopStrengthReduce(const TargetLowering *tli = NULL) :
+ LoopPass(&ID), TLI(tli) {
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ // We split critical edges, so we change the CFG. However, we do update
+ // many analyses if they are around.
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreserved<LoopInfo>();
+ AU.addPreserved<DominanceFrontier>();
+ AU.addPreserved<DominatorTree>();
+
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.addRequired<IVUsers>();
+ AU.addPreserved<IVUsers>();
+ }
+
+ private:
+ ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
+ IVStrideUse* &CondUse,
+ const SCEVHandle* &CondStride);
+
+ void OptimizeIndvars(Loop *L);
+ void OptimizeLoopCountIV(Loop *L);
+ void OptimizeLoopTermCond(Loop *L);
+
+    /// OptimizeShadowIV - If IV is used in an int-to-float cast
+    /// inside the loop then try to eliminate the cast operation.
+ void OptimizeShadowIV(Loop *L);
+
+ /// OptimizeSMax - Rewrite the loop's terminating condition
+ /// if it uses an smax computation.
+ ICmpInst *OptimizeSMax(Loop *L, ICmpInst *Cond,
+ IVStrideUse* &CondUse);
+
+ bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
+ const SCEVHandle *&CondStride);
+ bool RequiresTypeConversion(const Type *Ty, const Type *NewTy);
+ SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&,
+ IVExpr&, const Type*,
+ const std::vector<BasedUser>& UsersToProcess);
+ bool ValidScale(bool, int64_t,
+ const std::vector<BasedUser>& UsersToProcess);
+ bool ValidOffset(bool, int64_t, int64_t,
+ const std::vector<BasedUser>& UsersToProcess);
+ SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
+ IVUsersOfOneStride &Uses,
+ Loop *L,
+ bool &AllUsesAreAddresses,
+ bool &AllUsesAreOutsideLoop,
+ std::vector<BasedUser> &UsersToProcess);
+ bool ShouldUseFullStrengthReductionMode(
+ const std::vector<BasedUser> &UsersToProcess,
+ const Loop *L,
+ bool AllUsesAreAddresses,
+ SCEVHandle Stride);
+ void PrepareToStrengthReduceFully(
+ std::vector<BasedUser> &UsersToProcess,
+ SCEVHandle Stride,
+ SCEVHandle CommonExprs,
+ const Loop *L,
+ SCEVExpander &PreheaderRewriter);
+ void PrepareToStrengthReduceFromSmallerStride(
+ std::vector<BasedUser> &UsersToProcess,
+ Value *CommonBaseV,
+ const IVExpr &ReuseIV,
+ Instruction *PreInsertPt);
+ void PrepareToStrengthReduceWithNewPhi(
+ std::vector<BasedUser> &UsersToProcess,
+ SCEVHandle Stride,
+ SCEVHandle CommonExprs,
+ Value *CommonBaseV,
+ Instruction *IVIncInsertPt,
+ const Loop *L,
+ SCEVExpander &PreheaderRewriter);
+ void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
+ IVUsersOfOneStride &Uses,
+ Loop *L);
+ void DeleteTriviallyDeadInstructions();
+ };
+}
+
+char LoopStrengthReduce::ID = 0;
+static RegisterPass<LoopStrengthReduce>
+X("loop-reduce", "Loop Strength Reduction");
+
+Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
+ return new LoopStrengthReduce(TLI);
+}
+
+/// DeleteTriviallyDeadInstructions - If any of the instructions in the
+/// specified set are trivially dead, delete them and see if this makes any of
+/// their operands subsequently dead.
+void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
+ if (DeadInsts.empty()) return;
+
+ while (!DeadInsts.empty()) {
+ Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.back());
+ DeadInsts.pop_back();
+
+ if (I == 0 || !isInstructionTriviallyDead(I))
+ continue;
+
+ for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
+ if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+ *OI = 0;
+ if (U->use_empty())
+ DeadInsts.push_back(U);
+ }
+ }
+
+ I->eraseFromParent();
+ Changed = true;
+ }
+}
+
+/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
+/// subexpression that is an AddRec from a loop other than L. An outer loop
+/// of L is OK, but not an inner loop nor a disjoint loop.
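+/// For example, with nested loops L1 { L2 } (hypothetical): when L is L2,
+/// an AddRec over L1 is acceptable since L1 is an outer loop, but when L
+/// is L1, an AddRec over L2 makes this function return true.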
+static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
+ // This is very common, put it first.
+ if (isa<SCEVConstant>(S))
+ return false;
+ if (const SCEVCommutativeExpr *AE = dyn_cast<SCEVCommutativeExpr>(S)) {
+ for (unsigned int i=0; i< AE->getNumOperands(); i++)
+ if (containsAddRecFromDifferentLoop(AE->getOperand(i), L))
+ return true;
+ return false;
+ }
+ if (const SCEVAddRecExpr *AE = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (const Loop *newLoop = AE->getLoop()) {
+ if (newLoop == L)
+ return false;
+ // if newLoop is an outer loop of L, this is OK.
+ if (!LoopInfoBase<BasicBlock>::isNotAlreadyContainedIn(L, newLoop))
+ return false;
+ }
+ return true;
+ }
+ if (const SCEVUDivExpr *DE = dyn_cast<SCEVUDivExpr>(S))
+ return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
+ containsAddRecFromDifferentLoop(DE->getRHS(), L);
+#if 0
+ // SCEVSDivExpr has been backed out temporarily, but will be back; we'll
+ // need this when it is.
+ if (const SCEVSDivExpr *DE = dyn_cast<SCEVSDivExpr>(S))
+ return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
+ containsAddRecFromDifferentLoop(DE->getRHS(), L);
+#endif
+ if (const SCEVCastExpr *CE = dyn_cast<SCEVCastExpr>(S))
+ return containsAddRecFromDifferentLoop(CE->getOperand(), L);
+ return false;
+}
+
+/// isAddressUse - Returns true if the specified instruction is using the
+/// specified value as an address.
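+/// For example (hypothetical IR), in "store i32 %v, i32* %p" the operand
+/// %p is used as an address, while the stored value %v is not.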
+static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+ bool isAddress = isa<LoadInst>(Inst);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getOperand(1) == OperandVal)
+ isAddress = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ // Addressing modes can also be folded into prefetches and a variety
+ // of intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::prefetch:
+ case Intrinsic::x86_sse2_loadu_dq:
+ case Intrinsic::x86_sse2_loadu_pd:
+ case Intrinsic::x86_sse_loadu_ps:
+ case Intrinsic::x86_sse_storeu_ps:
+ case Intrinsic::x86_sse2_storeu_pd:
+ case Intrinsic::x86_sse2_storeu_dq:
+ case Intrinsic::x86_sse2_storel_dq:
+ if (II->getOperand(1) == OperandVal)
+ isAddress = true;
+ break;
+ }
+ }
+ return isAddress;
+}
+
+/// getAccessType - Return the type of the memory being accessed.
+static const Type *getAccessType(const Instruction *Inst) {
+ const Type *AccessTy = Inst->getType();
+ if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ AccessTy = SI->getOperand(0)->getType();
+ else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ // Addressing modes can also be folded into prefetches and a variety
+ // of intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::x86_sse_storeu_ps:
+ case Intrinsic::x86_sse2_storeu_pd:
+ case Intrinsic::x86_sse2_storeu_dq:
+ case Intrinsic::x86_sse2_storel_dq:
+ AccessTy = II->getOperand(1)->getType();
+ break;
+ }
+ }
+ return AccessTy;
+}
+
+namespace {
+ /// BasedUser - For a particular base value, keep information about how we've
+ /// partitioned the expression so far.
+ struct BasedUser {
+ /// SE - The current ScalarEvolution object.
+ ScalarEvolution *SE;
+
+ /// Base - The Base value for the PHI node that needs to be inserted for
+ /// this use. As the use is processed, information gets moved from this
+ /// field to the Imm field (below). BasedUser values are sorted by this
+ /// field.
+ SCEVHandle Base;
+
+ /// Inst - The instruction using the induction variable.
+ Instruction *Inst;
+
+ /// OperandValToReplace - The operand value of Inst to replace with the
+ /// EmittedBase.
+ Value *OperandValToReplace;
+
+ /// isSigned - The stride (and thus also the Base) of this use may be in
+ /// a narrower type than the use itself (OperandValToReplace->getType()).
+ /// When this is the case, the isSigned field indicates whether the
+ /// IV expression should be signed-extended instead of zero-extended to
+ /// fit the type of the use.
+ bool isSigned;
+
+ /// Imm - The immediate value that should be added to the base immediately
+ /// before Inst, because it will be folded into the imm field of the
+ /// instruction. This is also sometimes used for loop-variant values that
+ /// must be added inside the loop.
+ SCEVHandle Imm;
+
+ /// Phi - The induction variable that performs the striding that
+ /// should be used for this user.
+ PHINode *Phi;
+
+ // isUseOfPostIncrementedValue - True if this should use the
+ // post-incremented version of this IV, not the preincremented version.
+ // This can only be set in special cases, such as the terminating setcc
+ // instruction for a loop and uses outside the loop that are dominated by
+ // the loop.
+ bool isUseOfPostIncrementedValue;
+
+ BasedUser(IVStrideUse &IVSU, ScalarEvolution *se)
+ : SE(se), Base(IVSU.getOffset()), Inst(IVSU.getUser()),
+ OperandValToReplace(IVSU.getOperandValToReplace()),
+ isSigned(IVSU.isSigned()),
+ Imm(SE->getIntegerSCEV(0, Base->getType())),
+ isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {}
+
+ // Once we rewrite the code to insert the new IVs we want, update the
+ // operands of Inst to use the new expression 'NewBase', with 'Imm' added
+ // to it.
+ void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+ Instruction *InsertPt,
+ SCEVExpander &Rewriter, Loop *L, Pass *P,
+ LoopInfo &LI,
+ SmallVectorImpl<WeakVH> &DeadInsts);
+
+ Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
+ const Type *Ty,
+ SCEVExpander &Rewriter,
+ Instruction *IP, Loop *L,
+ LoopInfo &LI);
+ void dump() const;
+ };
+}
+
+void BasedUser::dump() const {
+ cerr << " Base=" << *Base;
+ cerr << " Imm=" << *Imm;
+ cerr << " Inst: " << *Inst;
+}
+
+Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
+ const Type *Ty,
+ SCEVExpander &Rewriter,
+ Instruction *IP, Loop *L,
+ LoopInfo &LI) {
+  // Figure out where we *really* want to insert this code. In particular, if
+  // the user is inside of a loop that is nested inside of L, we really don't
+  // want to insert this expression before the user; we'd rather pull it out
+  // of as many loops as possible.
+ Instruction *BaseInsertPt = IP;
+
+ // Figure out the most-nested loop that IP is in.
+ Loop *InsertLoop = LI.getLoopFor(IP->getParent());
+
+  // If InsertLoop is not L, and InsertLoop is nested inside of L, figure out
+  // the preheader of the outer-most loop in which NewBase is loop invariant.
+ if (L->contains(IP->getParent()))
+ while (InsertLoop && NewBase->isLoopInvariant(InsertLoop)) {
+ BaseInsertPt = InsertLoop->getLoopPreheader()->getTerminator();
+ InsertLoop = InsertLoop->getParentLoop();
+ }
+
+ Value *Base = Rewriter.expandCodeFor(NewBase, 0, BaseInsertPt);
+
+ SCEVHandle NewValSCEV = SE->getUnknown(Base);
+
+ // If there is no immediate value, skip the next part.
+ if (!Imm->isZero()) {
+ // If we are inserting the base and imm values in the same block, make sure
+ // to adjust the IP position if insertion reused a result.
+ if (IP == BaseInsertPt)
+ IP = Rewriter.getInsertionPoint();
+
+ // Always emit the immediate (if non-zero) into the same block as the user.
+ NewValSCEV = SE->getAddExpr(NewValSCEV, Imm);
+ }
+
+ if (isSigned)
+ NewValSCEV = SE->getTruncateOrSignExtend(NewValSCEV, Ty);
+ else
+ NewValSCEV = SE->getTruncateOrZeroExtend(NewValSCEV, Ty);
+
+ return Rewriter.expandCodeFor(NewValSCEV, Ty, IP);
+}
+
+
+// Once we rewrite the code to insert the new IVs we want, update the
+// operands of Inst to use the new expression 'NewBase', with 'Imm' added
+// to it. NewBasePt is the last instruction which contributes to the
+// value of NewBase in the case that it's a different instruction from
+// the PHI that NewBase is computed from, or null otherwise.
+//
+void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+ Instruction *NewBasePt,
+ SCEVExpander &Rewriter, Loop *L, Pass *P,
+ LoopInfo &LI,
+ SmallVectorImpl<WeakVH> &DeadInsts) {
+ if (!isa<PHINode>(Inst)) {
+ // By default, insert code at the user instruction.
+ BasicBlock::iterator InsertPt = Inst;
+
+ // However, if the Operand is itself an instruction, the (potentially
+ // complex) inserted code may be shared by many users. Because of this, we
+ // want to emit code for the computation of the operand right before its old
+ // computation. This is usually safe, because we obviously used to use the
+ // computation when it was computed in its current block. However, in some
+ // cases (e.g. use of a post-incremented induction variable) the NewBase
+ // value will be pinned to live somewhere after the original computation.
+ // In this case, we have to back off.
+ //
+ // If this is a use outside the loop (which means after, since it is based
+ // on a loop indvar) we use the post-incremented value, so that we don't
+ // artificially make the preinc value live out the bottom of the loop.
+ if (!isUseOfPostIncrementedValue && L->contains(Inst->getParent())) {
+ if (NewBasePt && isa<PHINode>(OperandValToReplace)) {
+ InsertPt = NewBasePt;
+ ++InsertPt;
+ } else if (Instruction *OpInst
+ = dyn_cast<Instruction>(OperandValToReplace)) {
+ InsertPt = OpInst;
+ while (isa<PHINode>(InsertPt)) ++InsertPt;
+ }
+ }
+ Value *NewVal = InsertCodeForBaseAtPosition(NewBase,
+ OperandValToReplace->getType(),
+ Rewriter, InsertPt, L, LI);
+ // Replace the use of the operand Value with the new Phi we just created.
+ Inst->replaceUsesOfWith(OperandValToReplace, NewVal);
+
+ DOUT << " Replacing with ";
+ DEBUG(WriteAsOperand(*DOUT, NewVal, /*PrintType=*/false));
+ DOUT << ", which has value " << *NewBase << " plus IMM " << *Imm << "\n";
+ return;
+ }
+
+ // PHI nodes are more complex. We have to insert one copy of the NewBase+Imm
+ // expression into each operand block that uses it. Note that PHI nodes can
+ // have multiple entries for the same predecessor. We use a map to make sure
+ // that a PHI node only has a single Value* for each predecessor (which also
+ // prevents us from inserting duplicate code in some blocks).
+ DenseMap<BasicBlock*, Value*> InsertedCode;
+ PHINode *PN = cast<PHINode>(Inst);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (PN->getIncomingValue(i) == OperandValToReplace) {
+ // If the original expression is outside the loop, put the replacement
+ // code in the same place as the original expression,
+ // which need not be an immediate predecessor of this PHI. This way we
+ // need only one copy of it even if it is referenced multiple times in
+ // the PHI. We don't do this when the original expression is inside the
+ // loop because multiple copies sometimes do useful sinking of code in
+ // that case(?).
+ Instruction *OldLoc = dyn_cast<Instruction>(OperandValToReplace);
+ if (L->contains(OldLoc->getParent())) {
+ // If this is a critical edge, split the edge so that we do not insert
+ // the code on all predecessor/successor paths. We do this unless this
+ // is the canonical backedge for this loop, as this can make some
+ // inserted code be in an illegal position.
+ BasicBlock *PHIPred = PN->getIncomingBlock(i);
+ if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 &&
+ (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) {
+
+ // First step, split the critical edge.
+ SplitCriticalEdge(PHIPred, PN->getParent(), P, false);
+
+ // Next step: move the basic block. In particular, if the PHI node
+        // is outside of the loop, and PHIPred is in the loop, we want to
+        // move the block to be immediately before the PHI block, not
+        // immediately after PHIPred.
+ if (L->contains(PHIPred) && !L->contains(PN->getParent())) {
+ BasicBlock *NewBB = PN->getIncomingBlock(i);
+ NewBB->moveBefore(PN->getParent());
+ }
+
+ // Splitting the edge can reduce the number of PHI entries we have.
+ e = PN->getNumIncomingValues();
+ }
+ }
+ Value *&Code = InsertedCode[PN->getIncomingBlock(i)];
+ if (!Code) {
+ // Insert the code into the end of the predecessor block.
+ Instruction *InsertPt = (L->contains(OldLoc->getParent())) ?
+ PN->getIncomingBlock(i)->getTerminator() :
+ OldLoc->getParent()->getTerminator();
+ Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(),
+ Rewriter, InsertPt, L, LI);
+
+ DOUT << " Changing PHI use to ";
+ DEBUG(WriteAsOperand(*DOUT, Code, /*PrintType=*/false));
+ DOUT << ", which has value " << *NewBase << " plus IMM " << *Imm << "\n";
+ }
+
+ // Replace the use of the operand Value with the new Phi we just created.
+ PN->setIncomingValue(i, Code);
+ Rewriter.clear();
+ }
+ }
+
+ // PHI node might have become a constant value after SplitCriticalEdge.
+ DeadInsts.push_back(Inst);
+}
+
+
+/// fitsInAddressMode - Return true if V can be subsumed within an addressing
+/// mode, and does not need to be put in a register first.
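+/// For example, a small constant offset such as 16 can typically be folded
+/// into a reg+imm addressing mode; without target info, the fallback below
+/// accepts only PPC-like sign-extended 16-bit immediates.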
+static bool fitsInAddressMode(const SCEVHandle &V, const Type *AccessTy,
+ const TargetLowering *TLI, bool HasBaseReg) {
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) {
+ int64_t VC = SC->getValue()->getSExtValue();
+ if (TLI) {
+ TargetLowering::AddrMode AM;
+ AM.BaseOffs = VC;
+ AM.HasBaseReg = HasBaseReg;
+ return TLI->isLegalAddressingMode(AM, AccessTy);
+ } else {
+ // Defaults to PPC. PPC allows a sign-extended 16-bit immediate field.
+ return (VC > -(1 << 16) && VC < (1 << 16)-1);
+ }
+ }
+
+ if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V))
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(SU->getValue())) {
+ if (TLI) {
+ TargetLowering::AddrMode AM;
+ AM.BaseGV = GV;
+ AM.HasBaseReg = HasBaseReg;
+ return TLI->isLegalAddressingMode(AM, AccessTy);
+ } else {
+ // Default: assume global addresses are not legal.
+ }
+ }
+
+ return false;
+}
+
+/// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are
+/// loop varying to the Imm operand.
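+/// For example, if Val is (%n + %lv) where %n is loop invariant and %lv is
+/// loop variant (hypothetical values), %lv is added to Imm and Val is
+/// reduced to %n.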
+static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
+ Loop *L, ScalarEvolution *SE) {
+ if (Val->isLoopInvariant(L)) return; // Nothing to do.
+
+ if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
+ std::vector<SCEVHandle> NewOps;
+ NewOps.reserve(SAE->getNumOperands());
+
+ for (unsigned i = 0; i != SAE->getNumOperands(); ++i)
+ if (!SAE->getOperand(i)->isLoopInvariant(L)) {
+ // If this is a loop-variant expression, it must stay in the immediate
+ // field of the expression.
+ Imm = SE->getAddExpr(Imm, SAE->getOperand(i));
+ } else {
+ NewOps.push_back(SAE->getOperand(i));
+ }
+
+ if (NewOps.empty())
+ Val = SE->getIntegerSCEV(0, Val->getType());
+ else
+ Val = SE->getAddExpr(NewOps);
+ } else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
+ // Try to pull immediates out of the start value of nested addrec's.
+ SCEVHandle Start = SARE->getStart();
+ MoveLoopVariantsToImmediateField(Start, Imm, L, SE);
+
+ std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
+ Ops[0] = Start;
+ Val = SE->getAddRecExpr(Ops, SARE->getLoop());
+ } else {
+ // Otherwise, all of Val is variant, move the whole thing over.
+ Imm = SE->getAddExpr(Imm, Val);
+ Val = SE->getIntegerSCEV(0, Val->getType());
+ }
+}
+
+
+/// MoveImmediateValues - Look at Val, and pull out any additions of constants
+/// that can fit into the immediate field of instructions in the target.
+/// Accumulate these immediate values into the Imm value.
+static void MoveImmediateValues(const TargetLowering *TLI,
+ const Type *AccessTy,
+ SCEVHandle &Val, SCEVHandle &Imm,
+ bool isAddress, Loop *L,
+ ScalarEvolution *SE) {
+ if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
+ std::vector<SCEVHandle> NewOps;
+ NewOps.reserve(SAE->getNumOperands());
+
+ for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
+ SCEVHandle NewOp = SAE->getOperand(i);
+ MoveImmediateValues(TLI, AccessTy, NewOp, Imm, isAddress, L, SE);
+
+ if (!NewOp->isLoopInvariant(L)) {
+ // If this is a loop-variant expression, it must stay in the immediate
+ // field of the expression.
+ Imm = SE->getAddExpr(Imm, NewOp);
+ } else {
+ NewOps.push_back(NewOp);
+ }
+ }
+
+ if (NewOps.empty())
+ Val = SE->getIntegerSCEV(0, Val->getType());
+ else
+ Val = SE->getAddExpr(NewOps);
+ return;
+ } else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
+ // Try to pull immediates out of the start value of nested addrec's.
+ SCEVHandle Start = SARE->getStart();
+ MoveImmediateValues(TLI, AccessTy, Start, Imm, isAddress, L, SE);
+
+ if (Start != SARE->getStart()) {
+ std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
+ Ops[0] = Start;
+ Val = SE->getAddRecExpr(Ops, SARE->getLoop());
+ }
+ return;
+ } else if (const SCEVMulExpr *SME = dyn_cast<SCEVMulExpr>(Val)) {
+ // Transform "8 * (4 + v)" -> "32 + 8*V" if "32" fits in the immed field.
+ if (isAddress &&
+ fitsInAddressMode(SME->getOperand(0), AccessTy, TLI, false) &&
+ SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) {
+
+ SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
+ SCEVHandle NewOp = SME->getOperand(1);
+ MoveImmediateValues(TLI, AccessTy, NewOp, SubImm, isAddress, L, SE);
+
+ // If we extracted something out of the subexpressions, see if we can
+ // simplify this!
+ if (NewOp != SME->getOperand(1)) {
+ // Scale SubImm up by "8". If the result is a target constant, we are
+ // good.
+ SubImm = SE->getMulExpr(SubImm, SME->getOperand(0));
+ if (fitsInAddressMode(SubImm, AccessTy, TLI, false)) {
+ // Accumulate the immediate.
+ Imm = SE->getAddExpr(Imm, SubImm);
+
+ // Update what is left of 'Val'.
+ Val = SE->getMulExpr(SME->getOperand(0), NewOp);
+ return;
+ }
+ }
+ }
+ }
+
+ // Loop-variant expressions must stay in the immediate field of the
+ // expression.
+ if ((isAddress && fitsInAddressMode(Val, AccessTy, TLI, false)) ||
+ !Val->isLoopInvariant(L)) {
+ Imm = SE->getAddExpr(Imm, Val);
+ Val = SE->getIntegerSCEV(0, Val->getType());
+ return;
+ }
+
+ // Otherwise, no immediates to move.
+}
+
+static void MoveImmediateValues(const TargetLowering *TLI,
+ Instruction *User,
+ SCEVHandle &Val, SCEVHandle &Imm,
+ bool isAddress, Loop *L,
+ ScalarEvolution *SE) {
+ const Type *AccessTy = getAccessType(User);
+ MoveImmediateValues(TLI, AccessTy, Val, Imm, isAddress, L, SE);
+}
+
+/// SeparateSubExprs - Decompose Expr into all of the subexpressions that are
+/// added together. This is used to reassociate common addition subexprs
+/// together for maximal sharing when rewriting bases.
+static void SeparateSubExprs(std::vector<SCEVHandle> &SubExprs,
+ SCEVHandle Expr,
+ ScalarEvolution *SE) {
+ if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(Expr)) {
+ for (unsigned j = 0, e = AE->getNumOperands(); j != e; ++j)
+ SeparateSubExprs(SubExprs, AE->getOperand(j), SE);
+ } else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Expr)) {
+ SCEVHandle Zero = SE->getIntegerSCEV(0, Expr->getType());
+ if (SARE->getOperand(0) == Zero) {
+ SubExprs.push_back(Expr);
+ } else {
+ // Compute the addrec with zero as its base.
+ std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
+ Ops[0] = Zero; // Start with zero base.
+ SubExprs.push_back(SE->getAddRecExpr(Ops, SARE->getLoop()));
+
+
+ SeparateSubExprs(SubExprs, SARE->getOperand(0), SE);
+ }
+ } else if (!Expr->isZero()) {
+ // Do not add zero.
+ SubExprs.push_back(Expr);
+ }
+}
+
+// This is logically local to the following function, but C++ says we have
+// to make it file scope.
+struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
+
+/// RemoveCommonExpressionsFromUseBases - Look through all of the Bases of all
+/// the Uses, removing any common subexpressions, except that if all such
+/// subexpressions can be folded into an addressing mode for all uses inside
+/// the loop (this case is referred to as "free" in comments herein) we do
+/// not remove anything. This looks for things like (a+b+c) and
+/// (a+c+d) and computes the common (a+c) subexpression. The common expression
+/// is *removed* from the Bases and returned.
+static SCEVHandle
+RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
+ ScalarEvolution *SE, Loop *L,
+ const TargetLowering *TLI) {
+ unsigned NumUses = Uses.size();
+
+ // Only one use? This is a very common case, so we handle it specially and
+ // cheaply.
+ SCEVHandle Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
+ SCEVHandle Result = Zero;
+ SCEVHandle FreeResult = Zero;
+ if (NumUses == 1) {
+ // If the use is inside the loop, use its base, regardless of what it is:
+ // it is clearly shared across all the IV's. If the use is outside the loop
+ // (which means after it) we don't want to factor anything *into* the loop,
+ // so just use 0 as the base.
+ if (L->contains(Uses[0].Inst->getParent()))
+ std::swap(Result, Uses[0].Base);
+ return Result;
+ }
+
+ // To find common subexpressions, count how many of Uses use each expression.
+ // If any subexpressions are used Uses.size() times, they are common.
+ // Also track whether all uses of each expression can be moved into an
+  // addressing mode "for free"; such expressions are left within the loop.
+ // struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
+ std::map<SCEVHandle, SubExprUseData> SubExpressionUseData;
+
+ // UniqueSubExprs - Keep track of all of the subexpressions we see in the
+ // order we see them.
+ std::vector<SCEVHandle> UniqueSubExprs;
+
+ std::vector<SCEVHandle> SubExprs;
+ unsigned NumUsesInsideLoop = 0;
+ for (unsigned i = 0; i != NumUses; ++i) {
+ // If the user is outside the loop, just ignore it for base computation.
+ // Since the user is outside the loop, it must be *after* the loop (if it
+ // were before, it could not be based on the loop IV). We don't want users
+ // after the loop to affect base computation of values *inside* the loop,
+ // because we can always add their offsets to the result IV after the loop
+ // is done, ensuring we get good code inside the loop.
+ if (!L->contains(Uses[i].Inst->getParent()))
+ continue;
+ NumUsesInsideLoop++;
+
+ // If the base is zero (which is common), return zero now, there are no
+ // CSEs we can find.
+ if (Uses[i].Base == Zero) return Zero;
+
+ // If this use is as an address we may be able to put CSEs in the addressing
+ // mode rather than hoisting them.
+ bool isAddrUse = isAddressUse(Uses[i].Inst, Uses[i].OperandValToReplace);
+ // We may need the AccessTy below, but only when isAddrUse, so compute it
+ // only in that case.
+ const Type *AccessTy = 0;
+ if (isAddrUse)
+ AccessTy = getAccessType(Uses[i].Inst);
+
+ // Split the expression into subexprs.
+ SeparateSubExprs(SubExprs, Uses[i].Base, SE);
+ // Add one to SubExpressionUseData.Count for each subexpr present, and
+ // if the subexpr is not a valid immediate within an addressing mode use,
+ // set SubExpressionUseData.notAllUsesAreFree. We definitely want to
+ // hoist these out of the loop (if they are common to all uses).
+ for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
+ if (++SubExpressionUseData[SubExprs[j]].Count == 1)
+ UniqueSubExprs.push_back(SubExprs[j]);
+ if (!isAddrUse || !fitsInAddressMode(SubExprs[j], AccessTy, TLI, false))
+ SubExpressionUseData[SubExprs[j]].notAllUsesAreFree = true;
+ }
+ SubExprs.clear();
+ }
+
+ // Now that we know how many times each is used, build Result. Iterate over
+ // UniqueSubexprs so that we have a stable ordering.
+ for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
+ std::map<SCEVHandle, SubExprUseData>::iterator I =
+ SubExpressionUseData.find(UniqueSubExprs[i]);
+ assert(I != SubExpressionUseData.end() && "Entry not found?");
+ if (I->second.Count == NumUsesInsideLoop) { // Found CSE!
+ if (I->second.notAllUsesAreFree)
+ Result = SE->getAddExpr(Result, I->first);
+ else
+ FreeResult = SE->getAddExpr(FreeResult, I->first);
+ } else
+ // Remove non-cse's from SubExpressionUseData.
+ SubExpressionUseData.erase(I);
+ }
+
+ if (FreeResult != Zero) {
+ // We have some subexpressions that can be subsumed into addressing
+ // modes in every use inside the loop. However, it's possible that
+ // there are so many of them that the combined FreeResult cannot
+ // be subsumed, or that the target cannot handle both a FreeResult
+ // and a Result in the same instruction (for example because it would
+ // require too many registers). Check this.
+ for (unsigned i=0; i<NumUses; ++i) {
+ if (!L->contains(Uses[i].Inst->getParent()))
+ continue;
+ // We know this is an addressing mode use; if there are any uses that
+ // are not, FreeResult would be Zero.
+ const Type *AccessTy = getAccessType(Uses[i].Inst);
+ if (!fitsInAddressMode(FreeResult, AccessTy, TLI, Result!=Zero)) {
+ // FIXME: could split up FreeResult into pieces here, some hoisted
+ // and some not. There is no obvious advantage to this.
+ Result = SE->getAddExpr(Result, FreeResult);
+ FreeResult = Zero;
+ break;
+ }
+ }
+ }
+
+ // If we found no CSE's, return now.
+ if (Result == Zero) return Result;
+
+ // If we still have a FreeResult, remove its subexpressions from
+ // SubExpressionUseData. This means they will remain in the use Bases.
+ if (FreeResult != Zero) {
+ SeparateSubExprs(SubExprs, FreeResult, SE);
+ for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
+ std::map<SCEVHandle, SubExprUseData>::iterator I =
+ SubExpressionUseData.find(SubExprs[j]);
+ SubExpressionUseData.erase(I);
+ }
+ SubExprs.clear();
+ }
+
+ // Otherwise, remove all of the CSE's we found from each of the base values.
+ for (unsigned i = 0; i != NumUses; ++i) {
+ // Uses outside the loop don't necessarily include the common base, but
+ // the final IV value coming into those uses does. Instead of trying to
+ // remove the pieces of the common base, which might not be there,
+ // subtract off the base to compensate for this.
+ if (!L->contains(Uses[i].Inst->getParent())) {
+ Uses[i].Base = SE->getMinusSCEV(Uses[i].Base, Result);
+ continue;
+ }
+
+ // Split the expression into subexprs.
+ SeparateSubExprs(SubExprs, Uses[i].Base, SE);
+
+ // Remove any common subexpressions.
+ for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
+ if (SubExpressionUseData.count(SubExprs[j])) {
+ SubExprs.erase(SubExprs.begin()+j);
+ --j; --e;
+ }
+
+ // Finally, add the non-shared expressions together.
+ if (SubExprs.empty())
+ Uses[i].Base = Zero;
+ else
+ Uses[i].Base = SE->getAddExpr(SubExprs);
+ SubExprs.clear();
+ }
+
+ return Result;
+}
+
+/// ValidScale - Check whether the given Scale is valid for all loads and
+/// stores in UsersToProcess.
+///
+bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale,
+ const std::vector<BasedUser>& UsersToProcess) {
+ if (!TLI)
+ return true;
+
+ for (unsigned i = 0, e = UsersToProcess.size(); i!=e; ++i) {
+ // If this is a load or other access, pass the type of the access in.
+ const Type *AccessTy = Type::VoidTy;
+ if (isAddressUse(UsersToProcess[i].Inst,
+ UsersToProcess[i].OperandValToReplace))
+ AccessTy = getAccessType(UsersToProcess[i].Inst);
+ else if (isa<PHINode>(UsersToProcess[i].Inst))
+ continue;
+
+ TargetLowering::AddrMode AM;
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
+ AM.BaseOffs = SC->getValue()->getSExtValue();
+ AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero();
+ AM.Scale = Scale;
+
+ // If load[imm+r*scale] is illegal, bail out.
+ if (!TLI->isLegalAddressingMode(AM, AccessTy))
+ return false;
+ }
+ return true;
+}
+
+/// ValidOffset - Check whether the given Offset is valid for all loads and
+/// stores in UsersToProcess.
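+///
+/// This differs from ValidScale in that the candidate Offset is folded
+/// into the immediate as well, i.e. it conceptually checks
+/// load [base + index*Scale + imm + Offset].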
+///
+bool LoopStrengthReduce::ValidOffset(bool HasBaseReg,
+ int64_t Offset,
+ int64_t Scale,
+ const std::vector<BasedUser>& UsersToProcess) {
+ if (!TLI)
+ return true;
+
+ for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) {
+ // If this is a load or other access, pass the type of the access in.
+ const Type *AccessTy = Type::VoidTy;
+ if (isAddressUse(UsersToProcess[i].Inst,
+ UsersToProcess[i].OperandValToReplace))
+ AccessTy = getAccessType(UsersToProcess[i].Inst);
+ else if (isa<PHINode>(UsersToProcess[i].Inst))
+ continue;
+
+ TargetLowering::AddrMode AM;
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
+ AM.BaseOffs = SC->getValue()->getSExtValue();
+ AM.BaseOffs = (uint64_t)AM.BaseOffs + (uint64_t)Offset;
+ AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero();
+ AM.Scale = Scale;
+
+ // If load[imm+r*scale] is illegal, bail out.
+ if (!TLI->isLegalAddressingMode(AM, AccessTy))
+ return false;
+ }
+ return true;
+}
+
+/// RequiresTypeConversion - Returns true if converting Ty1 to Ty2 is not
+/// a nop.
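+///
+/// For example, on a target where truncation is free
+/// (TLI->isTruncateFree), converting i64 to i32 is a nop for our
+/// purposes, while converting i32 to i64 is not.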
+bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1,
+ const Type *Ty2) {
+ if (Ty1 == Ty2)
+ return false;
+ Ty1 = SE->getEffectiveSCEVType(Ty1);
+ Ty2 = SE->getEffectiveSCEVType(Ty2);
+ if (Ty1 == Ty2)
+ return false;
+ if (Ty1->canLosslesslyBitCastTo(Ty2))
+ return false;
+ if (TLI && TLI->isTruncateFree(Ty1, Ty2))
+ return false;
+ return true;
+}
+
+/// CheckForIVReuse - Returns the multiple if the stride is a multiple
+/// of a previous stride and it is a legal value for the target addressing
+/// mode scale component and optional base reg. This allows the users of
+/// this stride to be rewritten as prev iv * factor. It returns 0 if no
+/// reuse is possible. Factors can be negative on some targets, e.g. ARM.
+///
+/// If all uses are outside the loop, we don't require that all multiplies
+/// be folded into the addressing mode, nor even that the factor be constant;
+/// a multiply (executed once) outside the loop is better than another IV
+/// within. Well, usually.
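+///
+/// For example, if an IV with constant stride 4 already exists and the
+/// current stride is 8, the returned factor is 2 and the current stride's
+/// users can be rewritten as prev iv * 2, with the multiply folded into
+/// the scale field of the addressing mode where the target allows it.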
+SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
+ bool AllUsesAreAddresses,
+ bool AllUsesAreOutsideLoop,
+ const SCEVHandle &Stride,
+ IVExpr &IV, const Type *Ty,
+ const std::vector<BasedUser>& UsersToProcess) {
+ if (StrideNoReuse.count(Stride))
+ return SE->getIntegerSCEV(0, Stride->getType());
+
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
+ int64_t SInt = SC->getValue()->getSExtValue();
+ for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+ NewStride != e; ++NewStride) {
+ std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
+ IVsByStride.find(IU->StrideOrder[NewStride]);
+ if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
+ StrideNoReuse.count(SI->first))
+ continue;
+ int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+ if (SI->first != Stride &&
+ (unsigned(abs64(SInt)) < SSInt || (SInt % SSInt) != 0))
+ continue;
+ int64_t Scale = SInt / SSInt;
+ // Check that this stride is valid for all the types used for loads and
+ // stores; if it can be used for some and not others, we might as well use
+ // the original stride everywhere, since we have to create the IV for it
+ // anyway. If the scale is 1, then we don't need to worry about folding
+ // multiplications.
+ if (Scale == 1 ||
+ (AllUsesAreAddresses &&
+ ValidScale(HasBaseReg, Scale, UsersToProcess))) {
+ // Prefer to reuse an IV with a base of zero.
+ for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+ IE = SI->second.IVs.end(); II != IE; ++II)
+ // Only reuse previous IV if it would not require a type conversion
+ // and if the base difference can be folded.
+ if (II->Base->isZero() &&
+ !RequiresTypeConversion(II->Base->getType(), Ty)) {
+ IV = *II;
+ return SE->getIntegerSCEV(Scale, Stride->getType());
+ }
+ // Otherwise, settle for an IV with a foldable base.
+ if (AllUsesAreAddresses)
+ for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+ IE = SI->second.IVs.end(); II != IE; ++II)
+ // Only reuse previous IV if it would not require a type conversion
+ // and if the base difference can be folded.
+ if (SE->getEffectiveSCEVType(II->Base->getType()) ==
+ SE->getEffectiveSCEVType(Ty) &&
+ isa<SCEVConstant>(II->Base)) {
+ int64_t Base =
+ cast<SCEVConstant>(II->Base)->getValue()->getSExtValue();
+ if (Base > INT32_MIN && Base <= INT32_MAX &&
+ ValidOffset(HasBaseReg, -Base * Scale,
+ Scale, UsersToProcess)) {
+ IV = *II;
+ return SE->getIntegerSCEV(Scale, Stride->getType());
+ }
+ }
+ }
+ }
+ } else if (AllUsesAreOutsideLoop) {
+    // Accept nonconstant strides here; it is almost always right to
+    // substitute an existing IV if we can.
+ for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+ NewStride != e; ++NewStride) {
+ std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
+ IVsByStride.find(IU->StrideOrder[NewStride]);
+ if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
+ continue;
+ int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+ if (SI->first != Stride && SSInt != 1)
+ continue;
+ for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+ IE = SI->second.IVs.end(); II != IE; ++II)
+ // Accept nonzero base here.
+ // Only reuse previous IV if it would not require a type conversion.
+ if (!RequiresTypeConversion(II->Base->getType(), Ty)) {
+ IV = *II;
+ return Stride;
+ }
+ }
+ // Special case, old IV is -1*x and this one is x. Can treat this one as
+ // -1*old.
+ for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+ NewStride != e; ++NewStride) {
+ std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
+ IVsByStride.find(IU->StrideOrder[NewStride]);
+ if (SI == IVsByStride.end())
+ continue;
+ if (const SCEVMulExpr *ME = dyn_cast<SCEVMulExpr>(SI->first))
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(ME->getOperand(0)))
+ if (Stride == ME->getOperand(1) &&
+ SC->getValue()->getSExtValue() == -1LL)
+ for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+ IE = SI->second.IVs.end(); II != IE; ++II)
+ // Accept nonzero base here.
+ // Only reuse previous IV if it would not require type conversion.
+ if (!RequiresTypeConversion(II->Base->getType(), Ty)) {
+ IV = *II;
+ return SE->getIntegerSCEV(-1LL, Stride->getType());
+ }
+ }
+ }
+ return SE->getIntegerSCEV(0, Stride->getType());
+}
+
+/// PartitionByIsUseOfPostIncrementedValue - Simple boolean predicate that
+/// returns true if Val's isUseOfPostIncrementedValue is true.
+static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) {
+ return Val.isUseOfPostIncrementedValue;
+}
+
+/// isNonConstantNegative - Return true if the specified scev is negated, but
+/// not a constant.
+static bool isNonConstantNegative(const SCEVHandle &Expr) {
+ const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Expr);
+ if (!Mul) return false;
+
+ // If there is a constant factor, it will be first.
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
+ if (!SC) return false;
+
+  // Return true if the value is negative; this matches things like (-42 * V).
+ return SC->getValue()->getValue().isNegative();
+}
+
+// CollectIVUsers - Transform our list of users and offsets to a bit more
+// complex table. In this new vector, each 'BasedUser' contains 'Base', the base
+// of the strided accesses, as well as the old information from Uses. We
+// progressively move information from the Base field to the Imm field, until
+// we eventually have the full access expression to rewrite the use.
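+//
+// For example, a use whose Base is (A + 4) may end up as Base = A with
+// Imm = 4 once we know the +4 can live in the instruction's immediate
+// field.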
+SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
+ IVUsersOfOneStride &Uses,
+ Loop *L,
+ bool &AllUsesAreAddresses,
+ bool &AllUsesAreOutsideLoop,
+ std::vector<BasedUser> &UsersToProcess) {
+ // FIXME: Generalize to non-affine IV's.
+ if (!Stride->isLoopInvariant(L))
+ return SE->getIntegerSCEV(0, Stride->getType());
+
+ UsersToProcess.reserve(Uses.Users.size());
+ for (ilist<IVStrideUse>::iterator I = Uses.Users.begin(),
+ E = Uses.Users.end(); I != E; ++I) {
+ UsersToProcess.push_back(BasedUser(*I, SE));
+
+ // Move any loop variant operands from the offset field to the immediate
+ // field of the use, so that we don't try to use something before it is
+ // computed.
+ MoveLoopVariantsToImmediateField(UsersToProcess.back().Base,
+ UsersToProcess.back().Imm, L, SE);
+ assert(UsersToProcess.back().Base->isLoopInvariant(L) &&
+ "Base value is not loop invariant!");
+ }
+
+ // We now have a whole bunch of uses of like-strided induction variables, but
+ // they might all have different bases. We want to emit one PHI node for this
+ // stride which we fold as many common expressions (between the IVs) into as
+ // possible. Start by identifying the common expressions in the base values
+ // for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find
+ // "A+B"), emit it to the preheader, then remove the expression from the
+ // UsersToProcess base values.
+ SCEVHandle CommonExprs =
+ RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L, TLI);
+
+ // Next, figure out what we can represent in the immediate fields of
+ // instructions. If we can represent anything there, move it to the imm
+ // fields of the BasedUsers. We do this so that it increases the commonality
+ // of the remaining uses.
+ unsigned NumPHI = 0;
+ bool HasAddress = false;
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+ // If the user is not in the current loop, this means it is using the exit
+    // value of the IV. Do not put anything in the base; make sure it's all in
+ // the immediate field to allow as much factoring as possible.
+ if (!L->contains(UsersToProcess[i].Inst->getParent())) {
+ UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm,
+ UsersToProcess[i].Base);
+ UsersToProcess[i].Base =
+ SE->getIntegerSCEV(0, UsersToProcess[i].Base->getType());
+ } else {
+ // Not all uses are outside the loop.
+ AllUsesAreOutsideLoop = false;
+
+ // Addressing modes can be folded into loads and stores. Be careful that
+      // the store is through the expression, not of the expression, though.
+ bool isPHI = false;
+ bool isAddress = isAddressUse(UsersToProcess[i].Inst,
+ UsersToProcess[i].OperandValToReplace);
+ if (isa<PHINode>(UsersToProcess[i].Inst)) {
+ isPHI = true;
+ ++NumPHI;
+ }
+
+ if (isAddress)
+ HasAddress = true;
+
+ // If this use isn't an address, then not all uses are addresses.
+ if (!isAddress && !isPHI)
+ AllUsesAreAddresses = false;
+
+ MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base,
+ UsersToProcess[i].Imm, isAddress, L, SE);
+ }
+ }
+
+  // If one of the uses is a PHI node and all other uses are addresses, still
+ // allow iv reuse. Essentially we are trading one constant multiplication
+ // for one fewer iv.
+ if (NumPHI > 1)
+ AllUsesAreAddresses = false;
+
+ // There are no in-loop address uses.
+ if (AllUsesAreAddresses && (!HasAddress && !AllUsesAreOutsideLoop))
+ AllUsesAreAddresses = false;
+
+ return CommonExprs;
+}
+
+/// ShouldUseFullStrengthReductionMode - Test whether full strength-reduction
+/// is valid and profitable for the given set of users of a stride. In
+/// full strength-reduction mode, all addresses at the current stride are
+/// strength-reduced all the way down to pointer arithmetic.
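+///
+/// For example, rather than computing A + i*S for each access of A[i],
+/// full mode gives each distinct base its own IV (conceptually a pointer
+/// starting at A) that is simply advanced by S every iteration.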
+///
+bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
+ const std::vector<BasedUser> &UsersToProcess,
+ const Loop *L,
+ bool AllUsesAreAddresses,
+ SCEVHandle Stride) {
+ if (!EnableFullLSRMode)
+ return false;
+
+ // The heuristics below aim to avoid increasing register pressure, but
+ // fully strength-reducing all the addresses increases the number of
+ // add instructions, so don't do this when optimizing for size.
+ // TODO: If the loop is large, the savings due to simpler addresses
+  // may outweigh the costs of the extra increment instructions.
+ if (L->getHeader()->getParent()->hasFnAttr(Attribute::OptimizeForSize))
+ return false;
+
+ // TODO: For now, don't do full strength reduction if there could
+ // potentially be greater-stride multiples of the current stride
+ // which could reuse the current stride IV.
+ if (IU->StrideOrder.back() != Stride)
+ return false;
+
+ // Iterate through the uses to find conditions that automatically rule out
+ // full-lsr mode.
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
+ const SCEV *Base = UsersToProcess[i].Base;
+ const SCEV *Imm = UsersToProcess[i].Imm;
+ // If any users have a loop-variant component, they can't be fully
+ // strength-reduced.
+ if (Imm && !Imm->isLoopInvariant(L))
+ return false;
+    // If there are two users with the same base and the difference between
+ // the two Imm values can't be folded into the address, full
+ // strength reduction would increase register pressure.
+ do {
+ const SCEV *CurImm = UsersToProcess[i].Imm;
+ if ((CurImm || Imm) && CurImm != Imm) {
+ if (!CurImm) CurImm = SE->getIntegerSCEV(0, Stride->getType());
+ if (!Imm) Imm = SE->getIntegerSCEV(0, Stride->getType());
+ const Instruction *Inst = UsersToProcess[i].Inst;
+ const Type *AccessTy = getAccessType(Inst);
+ SCEVHandle Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
+ if (!Diff->isZero() &&
+ (!AllUsesAreAddresses ||
+ !fitsInAddressMode(Diff, AccessTy, TLI, /*HasBaseReg=*/true)))
+ return false;
+ }
+ } while (++i != e && Base == UsersToProcess[i].Base);
+ }
+
+ // If there's exactly one user in this stride, fully strength-reducing it
+ // won't increase register pressure. If it's starting from a non-zero base,
+ // it'll be simpler this way.
+ if (UsersToProcess.size() == 1 && !UsersToProcess[0].Base->isZero())
+ return true;
+
+ // Otherwise, if there are any users in this stride that don't require
+ // a register for their base, full strength-reduction will increase
+ // register pressure.
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+ if (UsersToProcess[i].Base->isZero())
+ return false;
+
+ // Otherwise, go for it.
+ return true;
+}
+
+/// InsertAffinePhi - Create and insert a PHI node for an induction variable
+/// with the specified start and step values in the specified loop.
+///
+/// If NegateStride is true, the stride should be negated by using a
+/// subtract instead of an add.
+///
+/// Return the created phi node.
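+///
+/// The emitted pattern is roughly:
+///
+///   header:
+///     %lsr.iv = phi [ Start, %preheader ], [ %lsr.iv.next, %latch ]
+///   ...
+///   IVIncInsertPt (usually the latch terminator):
+///     %lsr.iv.next = add %lsr.iv, Step   ; a sub if Step is negative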
+///
+static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
+ Instruction *IVIncInsertPt,
+ const Loop *L,
+ SCEVExpander &Rewriter) {
+ assert(Start->isLoopInvariant(L) && "New PHI start is not loop invariant!");
+ assert(Step->isLoopInvariant(L) && "New PHI stride is not loop invariant!");
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ const Type *Ty = Start->getType();
+ Ty = Rewriter.SE.getEffectiveSCEVType(Ty);
+
+ PHINode *PN = PHINode::Create(Ty, "lsr.iv", Header->begin());
+ PN->addIncoming(Rewriter.expandCodeFor(Start, Ty, Preheader->getTerminator()),
+ Preheader);
+
+ // If the stride is negative, insert a sub instead of an add for the
+ // increment.
+ bool isNegative = isNonConstantNegative(Step);
+ SCEVHandle IncAmount = Step;
+ if (isNegative)
+ IncAmount = Rewriter.SE.getNegativeSCEV(Step);
+
+ // Insert an add instruction right before the terminator corresponding
+ // to the back-edge or just before the only use. The location is determined
+ // by the caller and passed in as IVIncInsertPt.
+ Value *StepV = Rewriter.expandCodeFor(IncAmount, Ty,
+ Preheader->getTerminator());
+ Instruction *IncV;
+ if (isNegative) {
+ IncV = BinaryOperator::CreateSub(PN, StepV, "lsr.iv.next",
+ IVIncInsertPt);
+ } else {
+ IncV = BinaryOperator::CreateAdd(PN, StepV, "lsr.iv.next",
+ IVIncInsertPt);
+ }
+ if (!isa<ConstantInt>(StepV)) ++NumVariable;
+
+ PN->addIncoming(IncV, LatchBlock);
+
+ ++NumInserted;
+ return PN;
+}
+
+static void SortUsersToProcess(std::vector<BasedUser> &UsersToProcess) {
+ // We want to emit code for users inside the loop first. To do this, we
+ // rearrange BasedUser so that the entries at the end have
+ // isUseOfPostIncrementedValue = false, because we pop off the end of the
+ // vector (so we handle them first).
+ std::partition(UsersToProcess.begin(), UsersToProcess.end(),
+ PartitionByIsUseOfPostIncrementedValue);
+
+ // Sort this by base, so that things with the same base are handled
+ // together. By partitioning first and stable-sorting later, we are
+ // guaranteed that within each base we will pop off users from within the
+ // loop before users outside of the loop with a particular base.
+ //
+ // We would like to use stable_sort here, but we can't. The problem is that
+  // SCEVHandles don't have a deterministic ordering w.r.t. each other, so
+ // we don't have anything to do a '<' comparison on. Because we think the
+ // number of uses is small, do a horrible bubble sort which just relies on
+ // ==.
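+  //
+  // For example, bases [A, B, A, C] end up grouped as [A, A, B, C], with
+  // each group kept in order of first appearance.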
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+ // Get a base value.
+ SCEVHandle Base = UsersToProcess[i].Base;
+
+ // Compact everything with this base to be consecutive with this one.
+ for (unsigned j = i+1; j != e; ++j) {
+ if (UsersToProcess[j].Base == Base) {
+ std::swap(UsersToProcess[i+1], UsersToProcess[j]);
+ ++i;
+ }
+ }
+ }
+}
+
+/// PrepareToStrengthReduceFully - Prepare to fully strength-reduce
+/// UsersToProcess, meaning lowering addresses all the way down to direct
+/// pointer arithmetic.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceFully(
+ std::vector<BasedUser> &UsersToProcess,
+ SCEVHandle Stride,
+ SCEVHandle CommonExprs,
+ const Loop *L,
+ SCEVExpander &PreheaderRewriter) {
+ DOUT << " Fully reducing all users\n";
+
+ // Rewrite the UsersToProcess records, creating a separate PHI for each
+ // unique Base value.
+ Instruction *IVIncInsertPt = L->getLoopLatch()->getTerminator();
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
+ // TODO: The uses are grouped by base, but not sorted. We arbitrarily
+ // pick the first Imm value here to start with, and adjust it for the
+ // other uses.
+ SCEVHandle Imm = UsersToProcess[i].Imm;
+ SCEVHandle Base = UsersToProcess[i].Base;
+ SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm);
+ PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L,
+ PreheaderRewriter);
+ // Loop over all the users with the same base.
+ do {
+ UsersToProcess[i].Base = SE->getIntegerSCEV(0, Stride->getType());
+ UsersToProcess[i].Imm = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
+ UsersToProcess[i].Phi = Phi;
+ assert(UsersToProcess[i].Imm->isLoopInvariant(L) &&
+ "ShouldUseFullStrengthReductionMode should reject this!");
+ } while (++i != e && Base == UsersToProcess[i].Base);
+ }
+}
+
+/// FindIVIncInsertPt - Return the location to insert the increment instruction.
+/// If the only use is a use of the postinc value (it must be the loop
+/// termination condition), then insert it just before that use.
+static Instruction *FindIVIncInsertPt(std::vector<BasedUser> &UsersToProcess,
+ const Loop *L) {
+ if (UsersToProcess.size() == 1 &&
+ UsersToProcess[0].isUseOfPostIncrementedValue &&
+ L->contains(UsersToProcess[0].Inst->getParent()))
+ return UsersToProcess[0].Inst;
+ return L->getLoopLatch()->getTerminator();
+}
+
+/// PrepareToStrengthReduceWithNewPhi - Insert a new induction variable for the
+/// given users to share.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
+ std::vector<BasedUser> &UsersToProcess,
+ SCEVHandle Stride,
+ SCEVHandle CommonExprs,
+ Value *CommonBaseV,
+ Instruction *IVIncInsertPt,
+ const Loop *L,
+ SCEVExpander &PreheaderRewriter) {
+ DOUT << " Inserting new PHI:\n";
+
+ PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV),
+ Stride, IVIncInsertPt, L,
+ PreheaderRewriter);
+
+ // Remember this in case a later stride is multiple of this.
+ IVsByStride[Stride].addIV(Stride, CommonExprs, Phi);
+
+ // All the users will share this new IV.
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+ UsersToProcess[i].Phi = Phi;
+
+ DOUT << " IV=";
+ DEBUG(WriteAsOperand(*DOUT, Phi, /*PrintType=*/false));
+ DOUT << "\n";
+}
+
+/// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to
+/// reuse an induction variable with a stride that is a factor of the current
+/// induction variable.
+///
+void
+LoopStrengthReduce::PrepareToStrengthReduceFromSmallerStride(
+ std::vector<BasedUser> &UsersToProcess,
+ Value *CommonBaseV,
+ const IVExpr &ReuseIV,
+ Instruction *PreInsertPt) {
+ DOUT << " Rewriting in terms of existing IV of STRIDE " << *ReuseIV.Stride
+ << " and BASE " << *ReuseIV.Base << "\n";
+
+ // All the users will share the reused IV.
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+ UsersToProcess[i].Phi = ReuseIV.PHI;
+
+ Constant *C = dyn_cast<Constant>(CommonBaseV);
+ if (C &&
+ (!C->isNullValue() &&
+ !fitsInAddressMode(SE->getUnknown(CommonBaseV), CommonBaseV->getType(),
+ TLI, false)))
+ // We want the common base emitted into the preheader! This is just
+    // using cast as a copy, so a BitCast (no-op cast) is appropriate.
+ CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(),
+ "commonbase", PreInsertPt);
+}
+
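+/// IsImmFoldedIntoAddrMode - Return true if the given GlobalValue and/or
+/// constant Offset can be folded into the addressing mode of every
+/// pre-increment use in UsersToProcess, i.e. whether each use can still be
+/// matched as something like load [reg + GV + Offset].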
+static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset,
+ const Type *AccessTy,
+ std::vector<BasedUser> &UsersToProcess,
+ const TargetLowering *TLI) {
+ SmallVector<Instruction*, 16> AddrModeInsts;
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+ if (UsersToProcess[i].isUseOfPostIncrementedValue)
+ continue;
+ ExtAddrMode AddrMode =
+ AddressingModeMatcher::Match(UsersToProcess[i].OperandValToReplace,
+ AccessTy, UsersToProcess[i].Inst,
+ AddrModeInsts, *TLI);
+ if (GV && GV != AddrMode.BaseGV)
+ return false;
+ if (Offset && !AddrMode.BaseOffs)
+      // FIXME: How to accurately check that the immediate offset is folded?
+ return false;
+ AddrModeInsts.clear();
+ }
+ return true;
+}
+
+/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
+/// stride of IV. All of the users may have different starting values, and this
+/// may not be the only stride.
+void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
+ IVUsersOfOneStride &Uses,
+ Loop *L) {
+ // If all the users are moved to another stride, then there is nothing to do.
+ if (Uses.Users.empty())
+ return;
+
+ // Keep track if every use in UsersToProcess is an address. If they all are,
+ // we may be able to rewrite the entire collection of them in terms of a
+ // smaller-stride IV.
+ bool AllUsesAreAddresses = true;
+
+ // Keep track if every use of a single stride is outside the loop. If so,
+ // we want to be more aggressive about reusing a smaller-stride IV; a
+ // multiply outside the loop is better than another IV inside. Well, usually.
+ bool AllUsesAreOutsideLoop = true;
+
+ // Transform our list of users and offsets to a bit more complex table. In
+  // this new vector, each 'BasedUser' contains 'Base', the base of the
+  // strided access, as well as the old information from Uses. We progressively
+ // move information from the Base field to the Imm field, until we eventually
+ // have the full access expression to rewrite the use.
+ std::vector<BasedUser> UsersToProcess;
+ SCEVHandle CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses,
+ AllUsesAreOutsideLoop,
+ UsersToProcess);
+
+ // Sort the UsersToProcess array so that users with common bases are
+ // next to each other.
+ SortUsersToProcess(UsersToProcess);
+
+ // If we managed to find some expressions in common, we'll need to carry
+ // their value in a register and add it in for each use. This will take up
+ // a register operand, which potentially restricts what stride values are
+ // valid.
+ bool HaveCommonExprs = !CommonExprs->isZero();
+ const Type *ReplacedTy = CommonExprs->getType();
+
+ // If all uses are addresses, consider sinking the immediate part of the
+ // common expression back into uses if they can fit in the immediate fields.
+ if (TLI && HaveCommonExprs && AllUsesAreAddresses) {
+ SCEVHandle NewCommon = CommonExprs;
+ SCEVHandle Imm = SE->getIntegerSCEV(0, ReplacedTy);
+ MoveImmediateValues(TLI, Type::VoidTy, NewCommon, Imm, true, L, SE);
+ if (!Imm->isZero()) {
+ bool DoSink = true;
+
+ // If the immediate part of the common expression is a GV, check if it's
+ // possible to fold it into the target addressing mode.
+ GlobalValue *GV = 0;
+ if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(Imm))
+ GV = dyn_cast<GlobalValue>(SU->getValue());
+ int64_t Offset = 0;
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Imm))
+ Offset = SC->getValue()->getSExtValue();
+ if (GV || Offset)
+ // Pass VoidTy as the AccessTy to be conservative, because
+ // there could be multiple access types among all the uses.
+ DoSink = IsImmFoldedIntoAddrMode(GV, Offset, Type::VoidTy,
+ UsersToProcess, TLI);
+
+ if (DoSink) {
+ DOUT << " Sinking " << *Imm << " back down into uses\n";
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+ UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, Imm);
+ CommonExprs = NewCommon;
+ HaveCommonExprs = !CommonExprs->isZero();
+ ++NumImmSunk;
+ }
+ }
+ }
+
+ // Now that we know what we need to do, insert the PHI node itself.
+ //
+ DOUT << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE "
+ << *Stride << ":\n"
+ << " Common base: " << *CommonExprs << "\n";
+
+ SCEVExpander Rewriter(*SE);
+ SCEVExpander PreheaderRewriter(*SE);
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ Instruction *PreInsertPt = Preheader->getTerminator();
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ Instruction *IVIncInsertPt = LatchBlock->getTerminator();
+
+ Value *CommonBaseV = Constant::getNullValue(ReplacedTy);
+
+ SCEVHandle RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy);
+ IVExpr ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
+ SE->getIntegerSCEV(0, Type::Int32Ty),
+ 0);
+
+ /// Choose a strength-reduction strategy and prepare for it by creating
+ /// the necessary PHIs and adjusting the bookkeeping.
+ if (ShouldUseFullStrengthReductionMode(UsersToProcess, L,
+ AllUsesAreAddresses, Stride)) {
+ PrepareToStrengthReduceFully(UsersToProcess, Stride, CommonExprs, L,
+ PreheaderRewriter);
+ } else {
+ // Emit the initial base value into the loop preheader.
+ CommonBaseV = PreheaderRewriter.expandCodeFor(CommonExprs, ReplacedTy,
+ PreInsertPt);
+
+ // If all uses are addresses, check if it is possible to reuse an IV. The
+ // new IV must have a stride that is a multiple of the old stride; the
+ // multiple must be a number that can be encoded in the scale field of the
+ // target addressing mode; and we must have a valid instruction after this
+ // substitution, including the immediate field, if any.
+ RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses,
+ AllUsesAreOutsideLoop,
+ Stride, ReuseIV, ReplacedTy,
+ UsersToProcess);
+ if (!RewriteFactor->isZero())
+ PrepareToStrengthReduceFromSmallerStride(UsersToProcess, CommonBaseV,
+ ReuseIV, PreInsertPt);
+ else {
+ IVIncInsertPt = FindIVIncInsertPt(UsersToProcess, L);
+ PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
+ CommonBaseV, IVIncInsertPt,
+ L, PreheaderRewriter);
+ }
+ }
+
+ // Process all the users now, replacing their strided uses with
+ // strength-reduced forms. This outer loop handles all bases, the inner
+ // loop handles all users of a particular base.
+ while (!UsersToProcess.empty()) {
+ SCEVHandle Base = UsersToProcess.back().Base;
+ Instruction *Inst = UsersToProcess.back().Inst;
+
+ // Emit the code for Base into the preheader.
+ Value *BaseV = 0;
+ if (!Base->isZero()) {
+ BaseV = PreheaderRewriter.expandCodeFor(Base, 0, PreInsertPt);
+
+ DOUT << " INSERTING code for BASE = " << *Base << ":";
+ if (BaseV->hasName())
+ DOUT << " Result value name = %" << BaseV->getNameStr();
+ DOUT << "\n";
+
+ // If BaseV is a non-zero constant, make sure that it gets inserted into
+ // the preheader, instead of being forward substituted into the uses. We
+ // do this by forcing a BitCast (noop cast) to be inserted into the
+ // preheader in this case.
+ if (!fitsInAddressMode(Base, getAccessType(Inst), TLI, false)) {
+ // We want this constant emitted into the preheader! This is just
+        // using cast as a copy, so a BitCast (no-op cast) is appropriate.
+ BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert",
+ PreInsertPt);
+ }
+ }
+
+ // Emit the code to add the immediate offset to the Phi value, just before
+ // the instructions that we identified as using this stride and base.
+ do {
+ // FIXME: Use emitted users to emit other users.
+ BasedUser &User = UsersToProcess.back();
+
+ DOUT << " Examining ";
+ if (User.isUseOfPostIncrementedValue)
+ DOUT << "postinc";
+ else
+ DOUT << "preinc";
+ DOUT << " use ";
+ DEBUG(WriteAsOperand(*DOUT, UsersToProcess.back().OperandValToReplace,
+ /*PrintType=*/false));
+ DOUT << " in Inst: " << *(User.Inst);
+
+ // If this instruction wants to use the post-incremented value, move it
+ // after the post-inc and use its value instead of the PHI.
+ Value *RewriteOp = User.Phi;
+ if (User.isUseOfPostIncrementedValue) {
+ RewriteOp = User.Phi->getIncomingValueForBlock(LatchBlock);
+ // If this user is in the loop, make sure it is the last thing in the
+ // loop to ensure it is dominated by the increment. In case it's the
+ // only use of the iv, the increment instruction is already before the
+ // use.
+ if (L->contains(User.Inst->getParent()) && User.Inst != IVIncInsertPt)
+ User.Inst->moveBefore(IVIncInsertPt);
+ }
+
+ SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
+
+ if (SE->getEffectiveSCEVType(RewriteOp->getType()) !=
+ SE->getEffectiveSCEVType(ReplacedTy)) {
+ assert(SE->getTypeSizeInBits(RewriteOp->getType()) >
+ SE->getTypeSizeInBits(ReplacedTy) &&
+ "Unexpected widening cast!");
+ RewriteExpr = SE->getTruncateExpr(RewriteExpr, ReplacedTy);
+ }
+
+ // If we had to insert new instructions for RewriteOp, we have to
+ // consider that they may not have been able to end up immediately
+ // next to RewriteOp, because non-PHI instructions may never precede
+ // PHI instructions in a block. In this case, remember where the last
+ // instruction was inserted so that if we're replacing a different
+ // PHI node, we can use the later point to expand the final
+ // RewriteExpr.
+ Instruction *NewBasePt = dyn_cast<Instruction>(RewriteOp);
+ if (RewriteOp == User.Phi) NewBasePt = 0;
+
+ // Clear the SCEVExpander's expression map so that we are guaranteed
+ // to have the code emitted where we expect it.
+ Rewriter.clear();
+
+ // If we are reusing the iv, then it must be multiplied by a constant
+ // factor to take advantage of the addressing mode scale component.
+ if (!RewriteFactor->isZero()) {
+ // If we're reusing an IV with a nonzero base (currently this happens
+        // only when all reuses are outside the loop), subtract that base here.
+ // The base has been used to initialize the PHI node but we don't want
+ // it here.
+ if (!ReuseIV.Base->isZero()) {
+ SCEVHandle typedBase = ReuseIV.Base;
+ if (SE->getEffectiveSCEVType(RewriteExpr->getType()) !=
+ SE->getEffectiveSCEVType(ReuseIV.Base->getType())) {
+ // It's possible the original IV is a larger type than the new IV,
+ // in which case we have to truncate the Base. We checked in
+ // RequiresTypeConversion that this is valid.
+ assert(SE->getTypeSizeInBits(RewriteExpr->getType()) <
+ SE->getTypeSizeInBits(ReuseIV.Base->getType()) &&
+ "Unexpected lengthening conversion!");
+ typedBase = SE->getTruncateExpr(ReuseIV.Base,
+ RewriteExpr->getType());
+ }
+ RewriteExpr = SE->getMinusSCEV(RewriteExpr, typedBase);
+ }
+
+ // Multiply old variable, with base removed, by new scale factor.
+ RewriteExpr = SE->getMulExpr(RewriteFactor,
+ RewriteExpr);
+
+ // The common base is emitted in the loop preheader. But since we
+ // are reusing an IV, it has not been used to initialize the PHI node.
+ // Add it to the expression used to rewrite the uses.
+ // When this use is outside the loop, we earlier subtracted the
+ // common base, and are adding it back here. Use the same expression
+ // as before, rather than CommonBaseV, so DAGCombiner will zap it.
+ if (!CommonExprs->isZero()) {
+ if (L->contains(User.Inst->getParent()))
+ RewriteExpr = SE->getAddExpr(RewriteExpr,
+ SE->getUnknown(CommonBaseV));
+ else
+ RewriteExpr = SE->getAddExpr(RewriteExpr, CommonExprs);
+ }
+ }
+
+ // Now that we know what we need to do, insert code before User for the
+ // immediate and any loop-variant expressions.
+ if (BaseV)
+ // Add BaseV to the PHI value if needed.
+ RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(BaseV));
+
+ User.RewriteInstructionToUseNewBase(RewriteExpr, NewBasePt,
+ Rewriter, L, this, *LI,
+ DeadInsts);
+
+ // Mark old value we replaced as possibly dead, so that it is eliminated
+ // if we just replaced the last use of that value.
+ DeadInsts.push_back(User.OperandValToReplace);
+
+ UsersToProcess.pop_back();
+ ++NumReduced;
+
+ // If there are any more users to process with the same base, process them
+ // now. We sorted by base above, so we just have to check the last elt.
+ } while (!UsersToProcess.empty() && UsersToProcess.back().Base == Base);
+ // TODO: Next, find out which base index is the most common, pull it out.
+ }
+
+  // IMPORTANT TODO: Figure out how to partition the IVs with this stride,
+  // but different starting values, into different PHIs.
+}
+
+/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
+/// set the IV user and stride information and return true, otherwise return
+/// false.
+bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
+ const SCEVHandle *&CondStride) {
+ for (unsigned Stride = 0, e = IU->StrideOrder.size();
+ Stride != e && !CondUse; ++Stride) {
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+
+ for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+ E = SI->second->Users.end(); UI != E; ++UI)
+ if (UI->getUser() == Cond) {
+ // NOTE: we could handle setcc instructions with multiple uses here, but
+        // InstCombine does it as well for simple uses; it's not clear that it
+ // occurs enough in real life to handle.
+ CondUse = UI;
+ CondStride = &SI->first;
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+  // Constant strides come first, and are in turn sorted by their absolute
+  // values. If the absolute values are the same, positive strides come first.
+ // e.g.
+ // 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
+ struct StrideCompare {
+ const ScalarEvolution *SE;
+ explicit StrideCompare(const ScalarEvolution *se) : SE(se) {}
+
+ bool operator()(const SCEVHandle &LHS, const SCEVHandle &RHS) {
+ const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
+ const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+ if (LHSC && RHSC) {
+ int64_t LV = LHSC->getValue()->getSExtValue();
+ int64_t RV = RHSC->getValue()->getSExtValue();
+ uint64_t ALV = (LV < 0) ? -LV : LV;
+ uint64_t ARV = (RV < 0) ? -RV : RV;
+ if (ALV == ARV) {
+ if (LV != RV)
+ return LV > RV;
+ } else {
+ return ALV < ARV;
+ }
+
+ // If it's the same value but different type, sort by bit width so
+ // that we emit larger induction variables before smaller
+ // ones, letting the smaller be re-written in terms of larger ones.
+ return SE->getTypeSizeInBits(RHS->getType()) <
+ SE->getTypeSizeInBits(LHS->getType());
+ }
+ return LHSC && !RHSC;
+ }
+ };
+}
+
+/// ChangeCompareStride - If a loop termination compare instruction is the
+/// only use of its stride, and the comparison is against a constant value,
+/// try to eliminate the stride by moving the compare instruction to another
+/// stride and change its constant operand accordingly. e.g.
+///
+/// loop:
+/// ...
+/// v1 = v1 + 3
+/// v2 = v2 + 1
+/// if (v2 < 10) goto loop
+/// =>
+/// loop:
+/// ...
+/// v1 = v1 + 3
+/// if (v1 < 30) goto loop
+ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
+ IVStrideUse* &CondUse,
+ const SCEVHandle* &CondStride) {
+ // If there's only one stride in the loop, there's nothing to do here.
+ if (IU->StrideOrder.size() < 2)
+ return Cond;
+ // If there are other users of the condition's stride, don't bother
+ // trying to change the condition because the stride will still
+ // remain.
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator I =
+ IU->IVUsesByStride.find(*CondStride);
+ if (I == IU->IVUsesByStride.end() ||
+ I->second->Users.size() != 1)
+ return Cond;
+ // Only handle constant strides for now.
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride);
+ if (!SC) return Cond;
+
+ ICmpInst::Predicate Predicate = Cond->getPredicate();
+ int64_t CmpSSInt = SC->getValue()->getSExtValue();
+ unsigned BitWidth = SE->getTypeSizeInBits((*CondStride)->getType());
+ uint64_t SignBit = 1ULL << (BitWidth-1);
+ const Type *CmpTy = Cond->getOperand(0)->getType();
+ const Type *NewCmpTy = NULL;
+ unsigned TyBits = SE->getTypeSizeInBits(CmpTy);
+ unsigned NewTyBits = 0;
+ SCEVHandle *NewStride = NULL;
+ Value *NewCmpLHS = NULL;
+ Value *NewCmpRHS = NULL;
+ int64_t Scale = 1;
+ SCEVHandle NewOffset = SE->getIntegerSCEV(0, CmpTy);
+
+ if (ConstantInt *C = dyn_cast<ConstantInt>(Cond->getOperand(1))) {
+ int64_t CmpVal = C->getValue().getSExtValue();
+
+    // Check the stride constant and the comparison constant signs to detect
+ // overflow.
+ if ((CmpVal & SignBit) != (CmpSSInt & SignBit))
+ return Cond;
+
+ // Look for a suitable stride / iv as replacement.
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[i]);
+ if (!isa<SCEVConstant>(SI->first))
+ continue;
+ int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+ if (SSInt == CmpSSInt ||
+ abs64(SSInt) < abs64(CmpSSInt) ||
+ (SSInt % CmpSSInt) != 0)
+ continue;
+
+ Scale = SSInt / CmpSSInt;
+ int64_t NewCmpVal = CmpVal * Scale;
+ APInt Mul = APInt(BitWidth*2, CmpVal, true);
+ Mul = Mul * APInt(BitWidth*2, Scale, true);
+ // Check for overflow.
+ if (!Mul.isSignedIntN(BitWidth))
+ continue;
+ // Check for overflow in the stride's type too.
+ if (!Mul.isSignedIntN(SE->getTypeSizeInBits(SI->first->getType())))
+ continue;
+
+ // Watch out for overflow.
+ if (ICmpInst::isSignedPredicate(Predicate) &&
+ (CmpVal & SignBit) != (NewCmpVal & SignBit))
+ continue;
+
+ if (NewCmpVal == CmpVal)
+ continue;
+      // Pick the best iv to use, trying to avoid a cast.
+ NewCmpLHS = NULL;
+ for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+ E = SI->second->Users.end(); UI != E; ++UI) {
+ Value *Op = UI->getOperandValToReplace();
+
+ // If the IVStrideUse implies a cast, check for an actual cast which
+ // can be used to find the original IV expression.
+ if (SE->getEffectiveSCEVType(Op->getType()) !=
+ SE->getEffectiveSCEVType(SI->first->getType())) {
+ CastInst *CI = dyn_cast<CastInst>(Op);
+ // If it's not a simple cast, it's complicated.
+ if (!CI)
+ continue;
+ // If it's a cast from a type other than the stride type,
+ // it's complicated.
+ if (CI->getOperand(0)->getType() != SI->first->getType())
+ continue;
+ // Ok, we found the IV expression in the stride's type.
+ Op = CI->getOperand(0);
+ }
+
+ NewCmpLHS = Op;
+ if (NewCmpLHS->getType() == CmpTy)
+ break;
+ }
+ if (!NewCmpLHS)
+ continue;
+
+ NewCmpTy = NewCmpLHS->getType();
+ NewTyBits = SE->getTypeSizeInBits(NewCmpTy);
+ const Type *NewCmpIntTy = IntegerType::get(NewTyBits);
+ if (RequiresTypeConversion(NewCmpTy, CmpTy)) {
+ // Check if it is possible to rewrite it using
+ // an iv / stride of a smaller integer type.
+ unsigned Bits = NewTyBits;
+ if (ICmpInst::isSignedPredicate(Predicate))
+ --Bits;
+ uint64_t Mask = (1ULL << Bits) - 1;
+ if (((uint64_t)NewCmpVal & Mask) != (uint64_t)NewCmpVal)
+ continue;
+ }
+
+      // Don't rewrite if the use offset is non-constant and the new type
+      // differs from the old one.
+ // FIXME: too conservative?
+ if (NewTyBits != TyBits && !isa<SCEVConstant>(CondUse->getOffset()))
+ continue;
+
+ bool AllUsesAreAddresses = true;
+ bool AllUsesAreOutsideLoop = true;
+ std::vector<BasedUser> UsersToProcess;
+ SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
+ AllUsesAreAddresses,
+ AllUsesAreOutsideLoop,
+ UsersToProcess);
+ // Avoid rewriting the compare instruction with an iv of new stride
+ // if it's likely the new stride uses will be rewritten using the
+ // stride of the compare instruction.
+ if (AllUsesAreAddresses &&
+ ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
+ continue;
+
+ // Avoid rewriting the compare instruction with an iv which has
+ // implicit extension or truncation built into it.
+ // TODO: This is over-conservative.
+ if (SE->getTypeSizeInBits(CondUse->getOffset()->getType()) != TyBits)
+ continue;
+
+ // If scale is negative, use swapped predicate unless it's testing
+ // for equality.
+ if (Scale < 0 && !Cond->isEquality())
+ Predicate = ICmpInst::getSwappedPredicate(Predicate);
+
+ NewStride = &IU->StrideOrder[i];
+ if (!isa<PointerType>(NewCmpTy))
+ NewCmpRHS = ConstantInt::get(NewCmpTy, NewCmpVal);
+ else {
+ ConstantInt *CI = ConstantInt::get(NewCmpIntTy, NewCmpVal);
+ NewCmpRHS = ConstantExpr::getIntToPtr(CI, NewCmpTy);
+ }
+ NewOffset = TyBits == NewTyBits
+ ? SE->getMulExpr(CondUse->getOffset(),
+ SE->getConstant(ConstantInt::get(CmpTy, Scale)))
+ : SE->getConstant(ConstantInt::get(NewCmpIntTy,
+ cast<SCEVConstant>(CondUse->getOffset())->getValue()
+ ->getSExtValue()*Scale));
+ break;
+ }
+ }
+
+  // Forgo this transformation if the increment happens to be
+ // unfortunately positioned after the condition, and the condition
+ // has multiple uses which prevent it from being moved immediately
+ // before the branch. See
+ // test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-*.ll
+ // for an example of this situation.
+ if (!Cond->hasOneUse()) {
+ for (BasicBlock::iterator I = Cond, E = Cond->getParent()->end();
+ I != E; ++I)
+ if (I == NewCmpLHS)
+ return Cond;
+ }
+
+ if (NewCmpRHS) {
+ // Create a new compare instruction using new stride / iv.
+ ICmpInst *OldCond = Cond;
+ // Insert new compare instruction.
+ Cond = new ICmpInst(Predicate, NewCmpLHS, NewCmpRHS,
+ L->getHeader()->getName() + ".termcond",
+ OldCond);
+
+ // Remove the old compare instruction. The old indvar is probably dead too.
+ DeadInsts.push_back(CondUse->getOperandValToReplace());
+ OldCond->replaceAllUsesWith(Cond);
+ OldCond->eraseFromParent();
+
+ IU->IVUsesByStride[*NewStride]->addUser(NewOffset, Cond, NewCmpLHS, false);
+ CondUse = &IU->IVUsesByStride[*NewStride]->Users.back();
+ CondStride = NewStride;
+ ++NumEliminated;
+ Changed = true;
+ }
+
+ return Cond;
+}
+
+/// OptimizeSMax - Rewrite the loop's terminating condition if it uses
+/// an smax computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+///
+/// where the comparison is signed, the trip count isn't just 'n', because
+/// 'n' could be negative. And unfortunately this can come up even for loops
+/// where the user didn't use a C do-while loop. For example, seemingly
+/// well-behaved top-test loops will commonly be lowered like this:
+///
+/// if (n > 0) {
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+/// }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// signed-max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+/// i = 0;
+/// smax = n < 1 ? 1 : n;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i != smax);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analysis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting such loops and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
+/// the instructions for the maximum computation.
+///
+ICmpInst *LoopStrengthReduce::OptimizeSMax(Loop *L, ICmpInst *Cond,
+ IVStrideUse* &CondUse) {
+ // Check that the loop matches the pattern we're looking for.
+ if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+ Cond->getPredicate() != CmpInst::ICMP_NE)
+ return Cond;
+
+ SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+ if (!Sel || !Sel->hasOneUse()) return Cond;
+
+ SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return Cond;
+ SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
+
+ // Add one to the backedge-taken count to get the trip count.
+ SCEVHandle IterationCount = SE->getAddExpr(BackedgeTakenCount, One);
+
+ // Check for a max calculation that matches the pattern.
+ const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(IterationCount);
+ if (!SMax || SMax != SE->getSCEV(Sel)) return Cond;
+
+ SCEVHandle SMaxLHS = SMax->getOperand(0);
+ SCEVHandle SMaxRHS = SMax->getOperand(1);
+ if (!SMaxLHS || SMaxLHS != One) return Cond;
+
+ // Check the relevant induction variable for conformance to
+ // the pattern.
+ SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ if (!AR || !AR->isAffine() ||
+ AR->getStart() != One ||
+ AR->getStepRecurrence(*SE) != One)
+ return Cond;
+
+ assert(AR->getLoop() == L &&
+ "Loop condition operand is an addrec in a different loop!");
+
+ // Check the right operand of the select, and remember it, as it will
+ // be used in the new comparison instruction.
+ Value *NewRHS = 0;
+ if (SE->getSCEV(Sel->getOperand(1)) == SMaxRHS)
+ NewRHS = Sel->getOperand(1);
+ else if (SE->getSCEV(Sel->getOperand(2)) == SMaxRHS)
+ NewRHS = Sel->getOperand(2);
+ if (!NewRHS) return Cond;
+
+ // Ok, everything looks ok to change the condition into an SLT or SGE and
+ // delete the max calculation.
+ ICmpInst *NewCond =
+ new ICmpInst(Cond->getPredicate() == CmpInst::ICMP_NE ?
+ CmpInst::ICMP_SLT :
+ CmpInst::ICMP_SGE,
+ Cond->getOperand(0), NewRHS, "scmp", Cond);
+
+ // Delete the max calculation instructions.
+ Cond->replaceAllUsesWith(NewCond);
+ CondUse->setUser(NewCond);
+ Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+ Cond->eraseFromParent();
+ Sel->eraseFromParent();
+ if (Cmp->use_empty())
+ Cmp->eraseFromParent();
+ return NewCond;
+}
+
+/// OptimizeShadowIV - If the IV is used in an int-to-float cast
+/// inside the loop, then try to eliminate the cast operation.
+void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
+
+ SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return;
+
+ for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e;
+ ++Stride) {
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+ if (!isa<SCEVConstant>(SI->first))
+ continue;
+
+ for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+ E = SI->second->Users.end(); UI != E; /* empty */) {
+ ilist<IVStrideUse>::iterator CandidateUI = UI;
+ ++UI;
+ Instruction *ShadowUse = CandidateUI->getUser();
+ const Type *DestTy = NULL;
+
+ /* If shadow use is a int->float cast then insert a second IV
+ to eliminate this cast.
+
+ for (unsigned i = 0; i < n; ++i)
+ foo((double)i);
+
+ is transformed into
+
+ double d = 0.0;
+ for (unsigned i = 0; i < n; ++i, ++d)
+ foo(d);
+ */
+ if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
+ DestTy = UCast->getDestTy();
+ else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
+ DestTy = SCast->getDestTy();
+ if (!DestTy) continue;
+
+ if (TLI) {
+ // If target does not support DestTy natively then do not apply
+ // this transformation.
+ MVT DVT = TLI->getValueType(DestTy);
+ if (!TLI->isTypeLegal(DVT)) continue;
+ }
+
+ PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+ if (!PH) continue;
+ if (PH->getNumIncomingValues() != 2) continue;
+
+ const Type *SrcTy = PH->getType();
+ int Mantissa = DestTy->getFPMantissaWidth();
+ if (Mantissa == -1) continue;
+ if ((int)SE->getTypeSizeInBits(SrcTy) > Mantissa)
+ continue;
+
+ unsigned Entry, Latch;
+ if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+ Entry = 0;
+ Latch = 1;
+ } else {
+ Entry = 1;
+ Latch = 0;
+ }
+
+ ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+ if (!Init) continue;
+ ConstantFP *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
+
+ BinaryOperator *Incr =
+ dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+ if (!Incr) continue;
+ if (Incr->getOpcode() != Instruction::Add
+ && Incr->getOpcode() != Instruction::Sub)
+ continue;
+
+ /* Initialize new IV, double d = 0.0 in above example. */
+ ConstantInt *C = NULL;
+ if (Incr->getOperand(0) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+ else if (Incr->getOperand(1) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+ else
+ continue;
+
+ if (!C) continue;
+
+ /* Add new PHINode. */
+ PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
+
+ /* create new increment. '++d' in above example. */
+ ConstantFP *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+ BinaryOperator *NewIncr =
+ BinaryOperator::Create(Incr->getOpcode(),
+ NewPH, CFP, "IV.S.next.", Incr);
+
+ NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+ NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+ /* Remove cast operation */
+ ShadowUse->replaceAllUsesWith(NewPH);
+ ShadowUse->eraseFromParent();
+ NumShadow++;
+ break;
+ }
+ }
+}
+
+// OptimizeIndvars - Now that IVUsesByStride is set up with all of the indvar
+// uses in the loop, look to see if we can eliminate some, in favor of using
+// common indvars for the different uses.
+void LoopStrengthReduce::OptimizeIndvars(Loop *L) {
+  // TODO: implement optimizations here.
+
+ OptimizeShadowIV(L);
+}
+
+/// OptimizeLoopTermCond - Change loop terminating condition to use the
+/// postinc iv when possible.
+void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
+ // Finally, get the terminating condition for the loop if possible. If we
+ // can, we want to change it to use a post-incremented version of its
+ // induction variable, to allow coalescing the live ranges for the IV into
+ // one register value.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ BasicBlock *ExitBlock = L->getExitingBlock();
+ if (!ExitBlock)
+ // Multiple exits, just look at the exit in the latch block if there is one.
+ ExitBlock = LatchBlock;
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitBlock->getTerminator());
+ if (!TermBr)
+ return;
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+ return;
+
+ // Search IVUsesByStride to find Cond's IVUse if there is one.
+ IVStrideUse *CondUse = 0;
+ const SCEVHandle *CondStride = 0;
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+ if (!FindIVUserForCond(Cond, CondUse, CondStride))
+ return; // setcc doesn't use the IV.
+
+ if (ExitBlock != LatchBlock) {
+ if (!Cond->hasOneUse())
+ // See below, we don't want the condition to be cloned.
+ return;
+
+    // If the exiting block is the latch block, we know it's safe and
+    // profitable to transform the icmp to use the post-inc iv. Otherwise do
+    // so only if it would not reuse another iv and its iv would be reused by
+    // other uses. We are optimizing for the case where the icmp is the only
+    // use of the iv.
+ IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[*CondStride];
+ for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
+ E = StrideUses.Users.end(); I != E; ++I) {
+ if (I->getUser() == Cond)
+ continue;
+ if (!I->isUseOfPostIncrementedValue())
+ return;
+ }
+
+ // FIXME: This is expensive, and worse still ChangeCompareStride does a
+ // similar check. Can we perform all the icmp related transformations after
+ // StrengthReduceStridedIVUsers?
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride)) {
+ int64_t SInt = SC->getValue()->getSExtValue();
+ for (unsigned NewStride = 0, ee = IU->StrideOrder.size(); NewStride != ee;
+ ++NewStride) {
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[NewStride]);
+ if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
+ continue;
+ int64_t SSInt =
+ cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+ if (SSInt == SInt)
+ return; // This can definitely be reused.
+ if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
+ continue;
+ int64_t Scale = SSInt / SInt;
+ bool AllUsesAreAddresses = true;
+ bool AllUsesAreOutsideLoop = true;
+ std::vector<BasedUser> UsersToProcess;
+ SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
+ AllUsesAreAddresses,
+ AllUsesAreOutsideLoop,
+ UsersToProcess);
+ // Avoid rewriting the compare instruction with an iv of new stride
+ // if it's likely the new stride uses will be rewritten using the
+ // stride of the compare instruction.
+ if (AllUsesAreAddresses &&
+ ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
+ return;
+ }
+ }
+
+ StrideNoReuse.insert(*CondStride);
+ }
+
+ // If the trip count is computed in terms of an smax (due to ScalarEvolution
+ // being unable to find a sufficient guard, for example), change the loop
+ // comparison to use SLT instead of NE.
+ Cond = OptimizeSMax(L, Cond, CondUse);
+
+ // If possible, change stride and operands of the compare instruction to
+ // eliminate one stride.
+ if (ExitBlock == LatchBlock)
+ Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
+
+ // It's possible for the setcc instruction to be anywhere in the loop, and
+ // possible for it to have multiple users. If it is not immediately before
+ // the latch block branch, move it.
+ if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) {
+ if (Cond->hasOneUse()) { // Condition has a single use, just move it.
+ Cond->moveBefore(TermBr);
+ } else {
+ // Otherwise, clone the terminating condition and insert into the loopend.
+ Cond = cast<ICmpInst>(Cond->clone());
+ Cond->setName(L->getHeader()->getName() + ".termcond");
+ LatchBlock->getInstList().insert(TermBr, Cond);
+
+ // Clone the IVUse, as the old use still exists!
+ IU->IVUsesByStride[*CondStride]->addUser(CondUse->getOffset(), Cond,
+ CondUse->getOperandValToReplace(),
+ false);
+ CondUse = &IU->IVUsesByStride[*CondStride]->Users.back();
+ }
+ }
+
+ // If we get to here, we know that we can transform the setcc instruction to
+ // use the post-incremented version of the IV, allowing us to coalesce the
+ // live ranges for the IV correctly.
+ CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), *CondStride));
+ CondUse->setIsUseOfPostIncrementedValue(true);
+ Changed = true;
+
+ ++NumLoopCond;
+}
+
+// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding
+// when to exit the loop is used only for that purpose, try to rearrange things
+// so it counts down to a test against zero.
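+//
+// For example (illustratively):
+//   for (i = start; i != end; ++i) ...   // i used only by the exit test
+// becomes
+//   for (n = end - start; n != 0; --n) ...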
+void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
+
+ // If the number of times the loop is executed isn't computable, give up.
+ SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return;
+
+ // Get the terminating condition for the loop if possible (this isn't
+ // necessarily in the latch, or a block that's a predecessor of the header).
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() != 1) return;
+
+ // Okay, there is one exit block. Try to find the condition that causes the
+ // loop to be exited.
+ BasicBlock *ExitBlock = ExitBlocks[0];
+
+ BasicBlock *ExitingBlock = 0;
+ for (pred_iterator PI = pred_begin(ExitBlock), E = pred_end(ExitBlock);
+ PI != E; ++PI)
+ if (L->contains(*PI)) {
+ if (ExitingBlock == 0)
+ ExitingBlock = *PI;
+ else
+ return; // More than one block exiting!
+ }
+ assert(ExitingBlock && "No exits from loop, something is broken!");
+
+ // Okay, we've computed the exiting block. See what condition causes us to
+ // exit.
+ //
+ // FIXME: we should be able to handle switch instructions (with a single exit)
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (TermBr == 0) return;
+ assert(TermBr->isConditional() && "If unconditional, it can't be in loop!");
+ if (!isa<ICmpInst>(TermBr->getCondition()))
+ return;
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+
+ // Handle only tests for equality for the moment, and only stride 1.
+ if (Cond->getPredicate() != CmpInst::ICMP_EQ)
+ return;
+ SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
+ if (!AR || !AR->isAffine() || AR->getStepRecurrence(*SE) != One)
+ return;
+ // If the RHS of the comparison is defined inside the loop, the rewrite
+ // cannot be done.
+ if (Instruction *CR = dyn_cast<Instruction>(Cond->getOperand(1)))
+ if (L->contains(CR->getParent()))
+ return;
+
+ // Make sure the IV is only used for counting. Value may be preinc or
+ // postinc; 2 uses in either case.
+ if (!Cond->getOperand(0)->hasNUses(2))
+ return;
+ PHINode *phi = dyn_cast<PHINode>(Cond->getOperand(0));
+ Instruction *incr;
+ if (phi && phi->getParent()==L->getHeader()) {
+    // Value tested is preinc. Find the increment.
+ // A CmpInst is not a BinaryOperator; we depend on this.
+ Instruction::use_iterator UI = phi->use_begin();
+ incr = dyn_cast<BinaryOperator>(UI);
+ if (!incr)
+ incr = dyn_cast<BinaryOperator>(++UI);
+ // 1 use for postinc value, the phi. Unnecessarily conservative?
+ if (!incr || !incr->hasOneUse() || incr->getOpcode()!=Instruction::Add)
+ return;
+ } else {
+ // Value tested is postinc. Find the phi node.
+ incr = dyn_cast<BinaryOperator>(Cond->getOperand(0));
+ if (!incr || incr->getOpcode()!=Instruction::Add)
+ return;
+
+ Instruction::use_iterator UI = Cond->getOperand(0)->use_begin();
+ phi = dyn_cast<PHINode>(UI);
+ if (!phi)
+ phi = dyn_cast<PHINode>(++UI);
+ // 1 use for preinc value, the increment.
+ if (!phi || phi->getParent()!=L->getHeader() || !phi->hasOneUse())
+ return;
+ }
+
+ // Replace the increment with a decrement.
+ BinaryOperator *decr =
+ BinaryOperator::Create(Instruction::Sub, incr->getOperand(0),
+ incr->getOperand(1), "tmp", incr);
+ incr->replaceAllUsesWith(decr);
+ incr->eraseFromParent();
+
+ // Substitute endval-startval for the original startval, and 0 for the
+ // original endval. Since we're only testing for equality this is OK even
+ // if the computation wraps around.
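+  // E.g. with startval 5 and endval 13 the new IV counts down from 8 to 0,
+  // preserving the original 8 iterations (illustrative numbers).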
+ BasicBlock *Preheader = L->getLoopPreheader();
+ Instruction *PreInsertPt = Preheader->getTerminator();
+ int inBlock = L->contains(phi->getIncomingBlock(0)) ? 1 : 0;
+ Value *startVal = phi->getIncomingValue(inBlock);
+ Value *endVal = Cond->getOperand(1);
+  // FIXME: check for the case where both are constant.
+ ConstantInt* Zero = ConstantInt::get(Cond->getOperand(1)->getType(), 0);
+ BinaryOperator *NewStartVal =
+ BinaryOperator::Create(Instruction::Sub, endVal, startVal,
+ "tmp", PreInsertPt);
+ phi->setIncomingValue(inBlock, NewStartVal);
+ Cond->setOperand(1, Zero);
+
+ Changed = true;
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
+
+ IU = &getAnalysis<IVUsers>();
+ LI = &getAnalysis<LoopInfo>();
+ DT = &getAnalysis<DominatorTree>();
+ SE = &getAnalysis<ScalarEvolution>();
+ Changed = false;
+
+ if (!IU->IVUsesByStride.empty()) {
+#ifndef NDEBUG
+ DOUT << "\nLSR on \"" << L->getHeader()->getParent()->getNameStart()
+ << "\" ";
+ DEBUG(L->dump());
+#endif
+
+ // Sort the StrideOrder so we process larger strides first.
+ std::stable_sort(IU->StrideOrder.begin(), IU->StrideOrder.end(),
+ StrideCompare(SE));
+
+ // Optimize induction variables. Some indvar uses can be transformed to use
+ // strides that will be needed for other purposes. A common example of this
+ // is the exit test for the loop, which can often be rewritten to use the
+ // computation of some other indvar to decide when to terminate the loop.
+ OptimizeIndvars(L);
+
+ // Change loop terminating condition to use the postinc iv when possible
+ // and optimize loop terminating compare. FIXME: Move this after
+ // StrengthReduceStridedIVUsers?
+ OptimizeLoopTermCond(L);
+
+ // FIXME: We can shrink overlarge IV's here. e.g. if the code has
+ // computation in i64 values and the target doesn't support i64, demote
+ // the computation to 32-bit if safe.
+
+ // FIXME: Attempt to reuse values across multiple IV's. In particular, we
+ // could have something like "for(i) { foo(i*8); bar(i*16) }", which should
+ // be codegened as "for (j = 0;; j+=8) { foo(j); bar(j+j); }" on X86/PPC.
+ // Need to be careful that IV's are all the same type. Only works for
+ // intptr_t indvars.
+
+ // IVsByStride keeps IVs for one particular loop.
+ assert(IVsByStride.empty() && "Stale entries in IVsByStride?");
+
+ // Note: this processes each stride/type pair individually. All users
+ // passed into StrengthReduceStridedIVUsers have the same type AND stride.
+ // Also, note that we iterate over IVUsesByStride indirectly by using
+ // StrideOrder. This extra layer of indirection makes the ordering of
+ // strides deterministic - not dependent on map order.
+ for (unsigned Stride = 0, e = IU->StrideOrder.size();
+ Stride != e; ++Stride) {
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+ // FIXME: Generalize to non-affine IV's.
+ if (!SI->first->isLoopInvariant(L))
+ continue;
+ StrengthReduceStridedIVUsers(SI->first, *SI->second, L);
+ }
+ }
+
+ // After all sharing is done, see if we can adjust the loop to test against
+ // zero instead of counting up to a maximum. This is usually faster.
+ OptimizeLoopCountIV(L);
+
+ // We're done analyzing this loop; release all the state we built up for it.
+ IVsByStride.clear();
+ StrideNoReuse.clear();
+
+ // Clean up after ourselves
+ if (!DeadInsts.empty())
+ DeleteTriviallyDeadInstructions();
+
+ // At this point, it is worth checking to see if any recurrence PHIs are also
+ // dead, so that we can remove them as well.
+ DeleteDeadPHIs(L->getHeader());
+
+ return Changed;
+}
diff --git a/lib/Transforms/Scalar/LoopUnroll.cpp b/lib/Transforms/Scalar/LoopUnroll.cpp
new file mode 100644
index 0000000..23757cd
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopUnroll.cpp
@@ -0,0 +1,183 @@
+//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller. It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <climits>
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+UnrollThreshold("unroll-threshold", cl::init(100), cl::Hidden,
+ cl::desc("The cut-off point for automatic loop unrolling"));
+
+static cl::opt<unsigned>
+UnrollCount("unroll-count", cl::init(0), cl::Hidden,
+ cl::desc("Use this unroll count for all loops, for testing purposes"));
+
+static cl::opt<bool>
+UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden,
+ cl::desc("Allows loops to be partially unrolled until "
+ "-unroll-threshold loop size is reached."));
+
+namespace {
+ class VISIBILITY_HIDDEN LoopUnroll : public LoopPass {
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopUnroll() : LoopPass(&ID) {}
+
+    /// A magic value for use with the -unroll-threshold option to indicate
+ /// that the loop unroll should be performed regardless of how much
+ /// code expansion would result.
+ static const unsigned NoThreshold = UINT_MAX;
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ ///
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<LoopInfo>();
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<LoopInfo>();
+ // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
+ // If loop unroll does not preserve dom info then LCSSA pass on next
+ // loop will receive invalid dom info.
+ // For now, recreate dom info, if loop is unrolled.
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<DominanceFrontier>();
+ }
+ };
+}
+
+char LoopUnroll::ID = 0;
+static RegisterPass<LoopUnroll> X("loop-unroll", "Unroll loops");
+
+Pass *llvm::createLoopUnrollPass() { return new LoopUnroll(); }
+
+/// ApproximateLoopSize - Approximate the size of the loop.
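+/// As an illustrative tally, a block with one plain add, a single-use GEP
+/// feeding a load, and one non-intrinsic call would contribute
+/// 1 + 0 + 1 + 10 = 12 to the size under the rules below.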
+static unsigned ApproximateLoopSize(const Loop *L) {
+ unsigned Size = 0;
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ Instruction *Term = BB->getTerminator();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (isa<PHINode>(I) && BB == L->getHeader()) {
+ // Ignore PHI nodes in the header.
+ } else if (I->hasOneUse() && I->use_back() == Term) {
+ // Ignore instructions only used by the loop terminator.
+ } else if (isa<DbgInfoIntrinsic>(I)) {
+        // Ignore debug instructions.
+      } else if (isa<GetElementPtrInst>(I) && I->hasOneUse()) {
+        // Ignore GEPs, as they are generally subsumed into a load or store.
+ } else if (isa<CallInst>(I)) {
+ // Estimate size overhead introduced by call instructions which
+ // is higher than other instructions. Here 3 and 10 are magic
+ // numbers that help one isolated test case from PR2067 without
+ // negatively impacting measured benchmarks.
+ if (isa<IntrinsicInst>(I))
+ Size = Size + 3;
+ else
+ Size = Size + 10;
+ } else {
+ ++Size;
+ }
+
+      // TODO: Ignore expressions derived from PHIs and constants if the
+      // incoming value of the phi is a constant, or if the operation is
+      // associative. This will catch induction variables.
+ }
+ }
+
+ return Size;
+}
+
+bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+ assert(L->isLCSSAForm());
+ LoopInfo *LI = &getAnalysis<LoopInfo>();
+
+ BasicBlock *Header = L->getHeader();
+ DOUT << "Loop Unroll: F[" << Header->getParent()->getName()
+ << "] Loop %" << Header->getName() << "\n";
+
+ // Find trip count
+ unsigned TripCount = L->getSmallConstantTripCount();
+ unsigned Count = UnrollCount;
+
+ // Automatically select an unroll count.
+ if (Count == 0) {
+    // Conservative heuristic: if we know the trip count, see if we can
+    // completely unroll it (subject to the threshold, checked below);
+    // otherwise the partial-unroll path below tries the largest divisor of
+    // the trip count whose unrolled size is still under the threshold.
+ if (TripCount != 0) {
+ Count = TripCount;
+ } else {
+ return false;
+ }
+ }
+
+ // Enforce the threshold.
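+  // Illustrative numbers: with TripCount 12, LoopSize 30, and the default
+  // threshold of 100, a full unroll has size 360 and is rejected; with
+  // -unroll-allow-partial, Count becomes 100/30 == 3, which divides 12,
+  // for an unrolled size of 90.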
+ if (UnrollThreshold != NoThreshold) {
+ unsigned LoopSize = ApproximateLoopSize(L);
+ DOUT << " Loop Size = " << LoopSize << "\n";
+ uint64_t Size = (uint64_t)LoopSize*Count;
+ if (TripCount != 1 && Size > UnrollThreshold) {
+ DOUT << " Too large to fully unroll with count: " << Count
+ << " because size: " << Size << ">" << UnrollThreshold << "\n";
+ if (UnrollAllowPartial) {
+      // Reduce the unroll count to a divisor of TripCount for partial unrolling.
+ Count = UnrollThreshold / LoopSize;
+ while (Count != 0 && TripCount%Count != 0) {
+ Count--;
+ }
+ if (Count < 2) {
+ DOUT << " could not unroll partially\n";
+ return false;
+ } else {
+ DOUT << " partially unrolling with count: " << Count << "\n";
+ }
+ } else {
+ DOUT << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n";
+ return false;
+ }
+ }
+ }
+
+ // Unroll the loop.
+ Function *F = L->getHeader()->getParent();
+ if (!UnrollLoop(L, Count, LI, &LPM))
+ return false;
+
+ // FIXME: Reconstruct dom info, because it is not preserved properly.
+ DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
+ if (DT) {
+ DT->runOnFunction(*F);
+ DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>();
+ if (DF)
+ DF->runOnFunction(*F);
+ }
+ return true;
+}
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
new file mode 100644
index 0000000..e3e881f
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -0,0 +1,1098 @@
+//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to have multiple loops. For example, it turns the left into the right code:
+//
+// for (...) if (lic)
+// A for (...)
+// if (lic) A; B; C
+// B else
+// C for (...)
+// A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unswitch"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+ cl::init(10), cl::Hidden);
+
+namespace {
+ class VISIBILITY_HIDDEN LoopUnswitch : public LoopPass {
+ LoopInfo *LI; // Loop information
+ LPPassManager *LPM;
+
+    // LoopProcessWorklist - Used to check if the second loop needs processing
+    // after RewriteLoopBodyWithConditionConstant rewrites the first loop.
+ std::vector<Loop*> LoopProcessWorklist;
+ SmallPtrSet<Value *,8> UnswitchedVals;
+
+ bool OptimizeForSize;
+ bool redoLoop;
+
+ Loop *currentLoop;
+ DominanceFrontier *DF;
+ DominatorTree *DT;
+ BasicBlock *loopHeader;
+ BasicBlock *loopPreheader;
+
+ // LoopBlocks contains all of the basic blocks of the loop, including the
+ // preheader of the loop, the body of the loop, and the exit blocks of the
+ // loop, in that order.
+ std::vector<BasicBlock*> LoopBlocks;
+    // NewBlocks contains cloned copies of the basic blocks in LoopBlocks.
+ std::vector<BasicBlock*> NewBlocks;
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ explicit LoopUnswitch(bool Os = false) :
+ LoopPass(&ID), OptimizeForSize(Os), redoLoop(false),
+ currentLoop(NULL), DF(NULL), DT(NULL), loopHeader(NULL),
+ loopPreheader(NULL) {}
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM);
+ bool processCurrentLoop();
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ ///
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequired<LoopInfo>();
+ AU.addPreserved<LoopInfo>();
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<DominanceFrontier>();
+ }
+
+ private:
+
+ /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist,
+ /// remove it.
+ void RemoveLoopFromWorklist(Loop *L) {
+ std::vector<Loop*>::iterator I = std::find(LoopProcessWorklist.begin(),
+ LoopProcessWorklist.end(), L);
+ if (I != LoopProcessWorklist.end())
+ LoopProcessWorklist.erase(I);
+ }
+
+ void initLoopData() {
+ loopHeader = currentLoop->getHeader();
+ loopPreheader = currentLoop->getLoopPreheader();
+ }
+
+ /// Split all of the edges from inside the loop to their exit blocks.
+ /// Update the appropriate Phi nodes as we do so.
+ void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks);
+
+ bool UnswitchIfProfitable(Value *LoopCond, Constant *Val);
+ unsigned getLoopUnswitchCost(Value *LIC);
+ void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock);
+ void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L);
+
+ void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val, bool isEqual);
+
+ void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ BasicBlock *TrueDest,
+ BasicBlock *FalseDest,
+ Instruction *InsertPt);
+
+ void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+ void RemoveBlockIfDead(BasicBlock *BB,
+ std::vector<Instruction*> &Worklist, Loop *l);
+ void RemoveLoopFromHierarchy(Loop *L);
+ bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0,
+ BasicBlock **LoopExit = 0);
+
+ };
+}
+char LoopUnswitch::ID = 0;
+static RegisterPass<LoopUnswitch> X("loop-unswitch", "Unswitch loops");
+
+Pass *llvm::createLoopUnswitchPass(bool Os) {
+ return new LoopUnswitch(Os);
+}
+
+/// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is
+/// invariant in the loop, or has an invariant piece, return the invariant.
+/// Otherwise, return null.
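+/// For example, given a condition "%cmp & %inv" where %inv is loop-invariant
+/// and %cmp is not, this returns %inv (an illustrative case).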
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+ // Constants should be folded, not unswitched on!
+ if (isa<Constant>(Cond)) return 0;
+
+ // TODO: Handle: br (VARIANT|INVARIANT).
+ // TODO: Hoist simple expressions out of loops.
+ if (L->isLoopInvariant(Cond)) return Cond;
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+ if (BO->getOpcode() == Instruction::And ||
+ BO->getOpcode() == Instruction::Or) {
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed))
+ return LHS;
+ if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed))
+ return RHS;
+ }
+
+ return 0;
+}
+
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+ LI = &getAnalysis<LoopInfo>();
+ LPM = &LPM_Ref;
+ DF = getAnalysisIfAvailable<DominanceFrontier>();
+ DT = getAnalysisIfAvailable<DominatorTree>();
+ currentLoop = L;
+ Function *F = currentLoop->getHeader()->getParent();
+ bool Changed = false;
+ do {
+ assert(currentLoop->isLCSSAForm());
+ redoLoop = false;
+ Changed |= processCurrentLoop();
+ } while(redoLoop);
+
+ if (Changed) {
+ // FIXME: Reconstruct dom info, because it is not preserved properly.
+ if (DT)
+ DT->runOnFunction(*F);
+ if (DF)
+ DF->runOnFunction(*F);
+ }
+ return Changed;
+}
+
+/// processCurrentLoop - Do the actual work of unswitching the current loop,
+/// if possible and profitable.
+bool LoopUnswitch::processCurrentLoop() {
+ bool Changed = false;
+
+ // Loop over all of the basic blocks in the loop. If we find an interior
+ // block that is branching on a loop-invariant condition, we can unswitch this
+ // loop.
+ for (Loop::block_iterator I = currentLoop->block_begin(),
+ E = currentLoop->block_end();
+ I != E; ++I) {
+ TerminatorInst *TI = (*I)->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ // If this isn't branching on an invariant condition, we can't unswitch
+ // it.
+ if (BI->isConditional()) {
+ // See if this, or some part of it, is loop invariant. If so, we can
+ // unswitch on it if we desire.
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+ currentLoop, Changed);
+ if (LoopCond && UnswitchIfProfitable(LoopCond,
+ ConstantInt::getTrue())) {
+ ++NumBranches;
+ return true;
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+ currentLoop, Changed);
+ if (LoopCond && SI->getNumCases() > 1) {
+ // Find a value to unswitch on:
+      // FIXME: this should choose the most expensive case!
+ Constant *UnswitchVal = SI->getCaseValue(1);
+ // Do not process same value again and again.
+ if (!UnswitchedVals.insert(UnswitchVal))
+ continue;
+
+ if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
+ ++NumSwitches;
+ return true;
+ }
+ }
+ }
+
+ // Scan the instructions to check for unswitchable values.
+ for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+ BBI != E; ++BBI)
+ if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+ currentLoop, Changed);
+ if (LoopCond && UnswitchIfProfitable(LoopCond,
+ ConstantInt::getTrue())) {
+ ++NumSelects;
+ return true;
+ }
+ }
+ }
+ return Changed;
+}
+
+/// isTrivialLoopExitBlock - Check to see if all paths from BB either:
+/// 1. Exit the loop with no side effects.
+/// 2. Branch to the latch block with no side-effects.
+///
+/// If these conditions are true, we return true and set ExitBB to the block we
+/// exit through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+ BasicBlock *&ExitBB,
+ std::set<BasicBlock*> &Visited) {
+ if (!Visited.insert(BB).second) {
+ // Already visited and Ok, end of recursion.
+ return true;
+ } else if (!L->contains(BB)) {
+    // Otherwise, this is a loop exit, which is fine so long as it is the
+    // first exit we encounter.
+ if (ExitBB != 0) return false;
+ ExitBB = BB;
+ return true;
+ }
+
+ // Otherwise, this is an unvisited intra-loop node. Check all successors.
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+ // Check to see if the successor is a trivial loop exit.
+ if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+ return false;
+ }
+
+ // Okay, everything after this looks good, check to make sure that this block
+ // doesn't include any side effects.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (I->mayHaveSideEffects())
+ return false;
+
+ return true;
+}
+
+/// isTrivialLoopExitBlock - Return true if the specified block unconditionally
+/// leads to an exit from the specified loop, and has no side-effects in the
+/// process. If so, return the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+ std::set<BasicBlock*> Visited;
+ Visited.insert(L->getHeader()); // Branches to header are ok.
+ BasicBlock *ExitBB = 0;
+ if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+ return ExitBB;
+ return 0;
+}
+
+/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is
+/// trivial: that is, that the condition controls whether or not the loop does
+/// anything at all. If this is a trivial condition, unswitching produces no
+/// code duplications (equivalently, it produces a simpler loop and a new empty
+/// loop, which gets deleted).
+///
+/// If this is a trivial condition, return true, otherwise return false. When
+/// returning true, this sets Cond and Val to the condition that controls the
+/// trivial condition: when Cond dynamically equals Val, the loop is known to
+/// exit. Finally, this sets LoopExit to the BB that the loop exits to when
+/// Cond == Val.
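+/// For example, a header ending in "br %inv, %exit, %body", where the path
+/// through %exit has no side effects, is trivial with Val == true
+/// (an illustrative case).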
+///
+bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
+ BasicBlock **LoopExit) {
+ BasicBlock *Header = currentLoop->getHeader();
+ TerminatorInst *HeaderTerm = Header->getTerminator();
+
+ BasicBlock *LoopExitBB = 0;
+ if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) {
+ // If the header block doesn't end with a conditional branch on Cond, we
+ // can't handle it.
+ if (!BI->isConditional() || BI->getCondition() != Cond)
+ return false;
+
+ // Check to see if a successor of the branch is guaranteed to go to the
+  // latch block or exit through a single exit block without having any
+ // side-effects. If so, determine the value of Cond that causes it to do
+ // this.
+ if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ BI->getSuccessor(0)))) {
+ if (Val) *Val = ConstantInt::getTrue();
+ } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ BI->getSuccessor(1)))) {
+ if (Val) *Val = ConstantInt::getFalse();
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) {
+ // If this isn't a switch on Cond, we can't handle it.
+ if (SI->getCondition() != Cond) return false;
+
+ // Check to see if a successor of the switch is guaranteed to go to the
+  // latch block or exit through a single exit block without having any
+ // side-effects. If so, determine the value of Cond that causes it to do
+ // this. Note that we can't trivially unswitch on the default case.
+ for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
+ if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ SI->getSuccessor(i)))) {
+ // Okay, we found a trivial case, remember the value that is trivial.
+ if (Val) *Val = SI->getCaseValue(i);
+ break;
+ }
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit block
+ // contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ if (LoopExit) *LoopExit = LoopExitBB;
+
+ // We already know that nothing uses any scalar values defined inside of this
+ // loop. As such, we just have to check to see if this loop will execute any
+ // side-effecting instructions (e.g. stores, calls, volatile loads) in the
+ // part of the loop that the code *would* execute. We already checked the
+ // tail, check the header now.
+ for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I)
+ if (I->mayHaveSideEffects())
+ return false;
+ return true;
+}
+
+/// getLoopUnswitchCost - Return the cost (code size growth) that will happen if
+/// we choose to unswitch current loop on the specified value.
+///
+unsigned LoopUnswitch::getLoopUnswitchCost(Value *LIC) {
+ // If the condition is trivial, always unswitch. There is no code growth for
+ // this case.
+ if (IsTrivialUnswitchCondition(LIC))
+ return 0;
+
+ // FIXME: This is really overly conservative. However, more liberal
+ // estimations have thus far resulted in excessive unswitching, which is bad
+ // both in compile time and in code size. This should be replaced once
+  // someone figures out how to make a good estimate.
+ return currentLoop->getBlocks().size();
+
+ unsigned Cost = 0;
+ // FIXME: this is brain dead. It should take into consideration code
+ // shrinkage.
+ for (Loop::block_iterator I = currentLoop->block_begin(),
+ E = currentLoop->block_end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+    // Do not include empty blocks in the cost calculation. These happen due
+    // to loop canonicalization and will be removed.
+ if (BB->begin() == BasicBlock::iterator(BB->getTerminator()))
+ continue;
+
+ // Count basic blocks.
+ ++Cost;
+ }
+
+ return Cost;
+}
+
+/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when
+/// LoopCond == Val to simplify the loop. If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
+ initLoopData();
+ Function *F = loopHeader->getParent();
+
+ // Check to see if it would be profitable to unswitch current loop.
+ unsigned Cost = getLoopUnswitchCost(LoopCond);
+
+ // Do not do non-trivial unswitch while optimizing for size.
+ if (Cost && OptimizeForSize)
+ return false;
+ if (Cost && !F->isDeclaration() && F->hasFnAttr(Attribute::OptimizeForSize))
+ return false;
+
+ if (Cost > Threshold) {
+ // FIXME: this should estimate growth by the amount of code shared by the
+ // resultant unswitched loops.
+ //
+ DOUT << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName() << ", cost too high: "
+ << currentLoop->getBlocks().size() << "\n";
+ return false;
+ }
+
+ Constant *CondVal;
+ BasicBlock *ExitBlock;
+ if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) {
+ UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock);
+ } else {
+ UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
+ }
+
+ return true;
+}
+
+// RemapInstruction - Convert the instruction operands from referencing the
+// current values into those specified by ValueMap.
+//
+static inline void RemapInstruction(Instruction *I,
+ DenseMap<const Value *, Value*> &ValueMap) {
+ for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
+ Value *Op = I->getOperand(op);
+ DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op);
+ if (It != ValueMap.end()) Op = It->second;
+ I->setOperand(op, Op);
+ }
+}
+
+/// CloneLoop - Recursively clone the specified loop and all of its children,
+/// mapping the blocks with the specified map.
+static Loop *CloneLoop(Loop *L, Loop *PL, DenseMap<const Value*, Value*> &VM,
+ LoopInfo *LI, LPPassManager *LPM) {
+ Loop *New = new Loop();
+
+ LPM->insertLoop(New, PL);
+
+ // Add all of the blocks in L to the new loop.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ if (LI->getLoopFor(*I) == L)
+ New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase());
+
+ // Add all of the subloops to the new loop.
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ CloneLoop(*I, New, VM, LI, LPM);
+
+ return New;
+}
+
+/// EmitPreheaderBranchOnCondition - Emit a conditional branch on LIC: if
+/// LIC == Val, branch to TrueDest, otherwise branch to FalseDest. Insert the
+/// code immediately before InsertPt.
+void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ BasicBlock *TrueDest,
+ BasicBlock *FalseDest,
+ Instruction *InsertPt) {
+ // Insert a conditional branch on LIC to the two preheaders. The original
+ // code is the true version and the new code is the false version.
+ Value *BranchVal = LIC;
+ if (!isa<ConstantInt>(Val) || Val->getType() != Type::Int1Ty)
+ BranchVal = new ICmpInst(ICmpInst::ICMP_EQ, LIC, Val, "tmp", InsertPt);
+ else if (Val != ConstantInt::getTrue())
+ // We want to enter the new loop when the condition is true.
+ std::swap(TrueDest, FalseDest);
+
+ // Insert the new branch.
+ BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt);
+}
+
+/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable
+/// condition in it (a cond branch from its header block to its latch block,
+/// where the path through the loop that doesn't execute its body has no
+/// side-effects), unswitch it. This doesn't involve any code duplication, just
+/// moving the conditional branch outside of the loop and updating loop info.
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
+ Constant *Val,
+ BasicBlock *ExitBlock) {
+ DOUT << "loop-unswitch: Trivial-Unswitch loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << L->getHeader()->getParent()->getName()
+ << " on cond: " << *Val << " == " << *Cond << "\n";
+
+ // First step, split the preheader, so that we know that there is a safe place
+ // to insert the conditional branch. We will change loopPreheader to have a
+ // conditional branch on Cond.
+ BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we should
+ // short-circuit to.
+
+ // Split this block now, so that the loop maintains its exit block, and so
+ // that the jump from the preheader can execute the contents of the exit block
+ // without actually branching to it (the exit block should be dominated by the
+ // loop header, not the preheader).
+ assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+ BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this);
+
+ // Okay, now we have a position to branch from and a position to branch to,
+ // insert the new conditional branch.
+ EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH,
+ loopPreheader->getTerminator());
+ LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
+ loopPreheader->getTerminator()->eraseFromParent();
+
+ // We need to reprocess this loop, it could be unswitched again.
+ redoLoop = true;
+
+ // Now that we know that the loop is never entered when this condition is a
+ // particular value, rewrite the loop with this info. We know that this will
+ // at least eliminate the old branch.
+ RewriteLoopBodyWithConditionConstant(L, Cond, Val, false);
+ ++NumTrivial;
+}
+
+/// SplitExitEdges - Split all of the edges from inside the loop to their exit
+/// blocks. Update the appropriate Phi nodes as we do so.
+void LoopUnswitch::SplitExitEdges(Loop *L,
+                               const SmallVector<BasicBlock *, 8> &ExitBlocks) {
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = ExitBlocks[i];
+ std::vector<BasicBlock*> Preds(pred_begin(ExitBlock), pred_end(ExitBlock));
+
+ for (unsigned j = 0, e = Preds.size(); j != e; ++j) {
+ BasicBlock* NewExitBlock = SplitEdge(Preds[j], ExitBlock, this);
+ BasicBlock* StartBlock = Preds[j];
+ BasicBlock* EndBlock;
+ if (NewExitBlock->getSinglePredecessor() == ExitBlock) {
+ EndBlock = NewExitBlock;
+ NewExitBlock = EndBlock->getSinglePredecessor();
+ } else {
+ EndBlock = ExitBlock;
+ }
+
+ std::set<PHINode*> InsertedPHIs;
+ PHINode* OldLCSSA = 0;
+ for (BasicBlock::iterator I = EndBlock->begin();
+ (OldLCSSA = dyn_cast<PHINode>(I)); ++I) {
+ Value* OldValue = OldLCSSA->getIncomingValueForBlock(NewExitBlock);
+ PHINode* NewLCSSA = PHINode::Create(OldLCSSA->getType(),
+ OldLCSSA->getName() + ".us-lcssa",
+ NewExitBlock->getTerminator());
+ NewLCSSA->addIncoming(OldValue, StartBlock);
+ OldLCSSA->setIncomingValue(OldLCSSA->getBasicBlockIndex(NewExitBlock),
+ NewLCSSA);
+ InsertedPHIs.insert(NewLCSSA);
+ }
+
+ BasicBlock::iterator InsertPt = EndBlock->getFirstNonPHI();
+ for (BasicBlock::iterator I = NewExitBlock->begin();
+ (OldLCSSA = dyn_cast<PHINode>(I)) && InsertedPHIs.count(OldLCSSA) == 0;
+ ++I) {
+ PHINode *NewLCSSA = PHINode::Create(OldLCSSA->getType(),
+ OldLCSSA->getName() + ".us-lcssa",
+ InsertPt);
+ OldLCSSA->replaceAllUsesWith(NewLCSSA);
+ NewLCSSA->addIncoming(OldLCSSA, NewExitBlock);
+      }
+    }
+  }
+}
+
+/// UnswitchNontrivialCondition - We determined that the loop is profitable
+/// to unswitch when LIC equals Val. Split it into loop versions and test the
+/// condition outside of either loop.
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
+ Loop *L) {
+ Function *F = loopHeader->getParent();
+ DOUT << "loop-unswitch: Unswitching loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName()
+ << " when '" << *Val << "' == " << *LIC << "\n";
+
+ LoopBlocks.clear();
+ NewBlocks.clear();
+
+ // First step, split the preheader and exit blocks, and add these blocks to
+ // the LoopBlocks list.
+ BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this);
+ LoopBlocks.push_back(NewPreheader);
+
+ // We want the loop to come after the preheader, but before the exit blocks.
+ LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Split all of the edges from inside the loop to their exit blocks. Update
+ // the appropriate Phi nodes as we do so.
+ SplitExitEdges(L, ExitBlocks);
+
+ // The exit blocks may have been changed due to edge splitting, recompute.
+ ExitBlocks.clear();
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Add exit blocks to the loop blocks.
+ LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+
+ // Next step, clone all of the basic blocks that make up the loop (including
+ // the loop preheader and exit blocks), keeping track of the mapping between
+ // the instructions and blocks.
+ NewBlocks.reserve(LoopBlocks.size());
+ DenseMap<const Value*, Value*> ValueMap;
+ for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
+ BasicBlock *New = CloneBasicBlock(LoopBlocks[i], ValueMap, ".us", F);
+ NewBlocks.push_back(New);
+ ValueMap[LoopBlocks[i]] = New; // Keep the BB mapping.
+ LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], New, L);
+ }
+
+ // Splice the newly inserted blocks into the function right before the
+ // original preheader.
+ F->getBasicBlockList().splice(LoopBlocks[0], F->getBasicBlockList(),
+ NewBlocks[0], F->end());
+
+ // Now we create the new Loop object for the versioned loop.
+ Loop *NewLoop = CloneLoop(L, L->getParentLoop(), ValueMap, LI, LPM);
+ Loop *ParentLoop = L->getParentLoop();
+ if (ParentLoop) {
+ // Make sure to add the cloned preheader and exit blocks to the parent loop
+ // as well.
+ ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase());
+ }
+
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *NewExit = cast<BasicBlock>(ValueMap[ExitBlocks[i]]);
+ // The new exit block should be in the same loop as the old one.
+ if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+ ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase());
+
+ assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+ // If the successor of the exit block had PHI nodes, add an entry for
+ // NewExit.
+ PHINode *PN;
+ for (BasicBlock::iterator I = ExitSucc->begin();
+ (PN = dyn_cast<PHINode>(I)); ++I) {
+ Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
+ DenseMap<const Value *, Value*>::iterator It = ValueMap.find(V);
+ if (It != ValueMap.end()) V = It->second;
+ PN->addIncoming(V, NewExit);
+ }
+ }
+
+ // Rewrite the code to refer to itself.
+ for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+ for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+ E = NewBlocks[i]->end(); I != E; ++I)
+ RemapInstruction(I, ValueMap);
+
+ // Rewrite the original preheader to select between versions of the loop.
+ BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
+ assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+ "Preheader splitting did not work correctly!");
+
+ // Emit the new branch that selects between the two versions of this loop.
+ EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR);
+ LPM->deleteSimpleAnalysisValue(OldBR, L);
+ OldBR->eraseFromParent();
+
+ LoopProcessWorklist.push_back(NewLoop);
+ redoLoop = true;
+
+ // Now we rewrite the original code to know that the condition is true and the
+ // new code to know that the condition is false.
+  RewriteLoopBodyWithConditionConstant(L, LIC, Val, false);
+
+ // It's possible that simplifying one loop could cause the other to be
+ // deleted. If so, don't simplify it.
+ if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop)
+ RewriteLoopBodyWithConditionConstant(NewLoop, LIC, Val, true);
+}
+
+/// RemoveFromWorklist - Remove all instances of I from the worklist vector
+/// specified.
+static void RemoveFromWorklist(Instruction *I,
+ std::vector<Instruction*> &Worklist) {
+ std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(),
+ Worklist.end(), I);
+ while (WI != Worklist.end()) {
+ unsigned Offset = WI-Worklist.begin();
+ Worklist.erase(WI);
+ WI = std::find(Worklist.begin()+Offset, Worklist.end(), I);
+ }
+}
+
+/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the
+/// program, replacing all uses with V and update the worklist.
+static void ReplaceUsesOfWith(Instruction *I, Value *V,
+ std::vector<Instruction*> &Worklist,
+ Loop *L, LPPassManager *LPM) {
+ DOUT << "Replace with '" << *V << "': " << *I;
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+
+ // Add users to the worklist which may be simplified now.
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI)
+ Worklist.push_back(cast<Instruction>(*UI));
+ LPM->deleteSimpleAnalysisValue(I, L);
+ RemoveFromWorklist(I, Worklist);
+ I->replaceAllUsesWith(V);
+ I->eraseFromParent();
+ ++NumSimplify;
+}
+
+/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop
+/// information, and remove any dead successors it has.
+///
+void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
+ std::vector<Instruction*> &Worklist,
+ Loop *L) {
+ if (pred_begin(BB) != pred_end(BB)) {
+ // This block isn't dead, since an edge to BB was just removed, see if there
+ // are any easy simplifications we can do now.
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ // If it has one pred, fold phi nodes in BB.
+ while (isa<PHINode>(BB->begin()))
+ ReplaceUsesOfWith(BB->begin(),
+ cast<PHINode>(BB->begin())->getIncomingValue(0),
+ Worklist, L, LPM);
+
+ // If this is the header of a loop and the only pred is the latch, we now
+ // have an unreachable loop.
+ if (Loop *L = LI->getLoopFor(BB))
+ if (loopHeader == BB && L->contains(Pred)) {
+ // Remove the branch from the latch to the header block, this makes
+ // the header dead, which will make the latch dead (because the header
+ // dominates the latch).
+ LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L);
+ Pred->getTerminator()->eraseFromParent();
+ new UnreachableInst(Pred);
+
+ // The loop is now broken, remove it from LI.
+ RemoveLoopFromHierarchy(L);
+
+ // Reprocess the header, which now IS dead.
+ RemoveBlockIfDead(BB, Worklist, L);
+ return;
+ }
+
+      // If pred ends in an uncond branch, add it to the worklist so that
+ // the two blocks will get merged.
+ if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
+ if (BI->isUnconditional())
+ Worklist.push_back(BI);
+ }
+ return;
+ }
+
+ DOUT << "Nuking dead block: " << *BB;
+
+ // Remove the instructions in the basic block from the worklist.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ RemoveFromWorklist(I, Worklist);
+
+ // Anything that uses the instructions in this basic block should have their
+ // uses replaced with undefs.
+ if (!I->use_empty())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ }
+
+ // If this is the edge to the header block for a loop, remove the loop and
+ // promote all subloops.
+ if (Loop *BBLoop = LI->getLoopFor(BB)) {
+ if (BBLoop->getLoopLatch() == BB)
+ RemoveLoopFromHierarchy(BBLoop);
+ }
+
+ // Remove the block from the loop info, which removes it from any loops it
+ // was in.
+ LI->removeBlock(BB);
+
+ // Remove phi node entries in successors for this block.
+ TerminatorInst *TI = BB->getTerminator();
+ SmallVector<BasicBlock*, 4> Succs;
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ Succs.push_back(TI->getSuccessor(i));
+ TI->getSuccessor(i)->removePredecessor(BB);
+ }
+
+  // Unique the successors, removing duplicate entries.
+ array_pod_sort(Succs.begin(), Succs.end());
+ Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end());
+
+ // Remove the basic block, including all of the instructions contained in it.
+ LPM->deleteSimpleAnalysisValue(BB, L);
+ BB->eraseFromParent();
+ // Remove successor blocks here that are not dead, so that we know we only
+ // have dead blocks in this list. Nondead blocks have a way of becoming dead,
+ // then getting removed before we revisit them, which is badness.
+ //
+ for (unsigned i = 0; i != Succs.size(); ++i)
+ if (pred_begin(Succs[i]) != pred_end(Succs[i])) {
+ // One exception is loop headers. If this block was the preheader for a
+ // loop, then we DO want to visit the loop so the loop gets deleted.
+      // We know that if the successor is a loop header, this block had to be
+      // its preheader: the case where this was the latch block was handled
+      // above, and headers can only have two predecessors.
+ if (!LI->isLoopHeader(Succs[i])) {
+ Succs.erase(Succs.begin()+i);
+ --i;
+ }
+ }
+
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+ RemoveBlockIfDead(Succs[i], Worklist, L);
+}
+
+/// RemoveLoopFromHierarchy - We have discovered that the specified loop has
+/// become unwrapped, either because the backedge was deleted, or because the
+/// edge into the header was removed. If the edge into the header from the
+/// latch block was removed, the loop is unwrapped but its subloops are still
+/// alive, so they are just reparented. If the loops are actually dead, they
+/// will be removed later.
+void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) {
+ LPM->deleteLoopFromQueue(L);
+ RemoveLoopFromWorklist(L);
+}
+
+// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has
+// the value specified by Val in the specified loop, or we know it does NOT have
+// that value. Rewrite any uses of LIC or of properties correlated to it.
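+// For example, the loop version entered when LIC == Val can replace uses of
+// LIC with Val outright, while the other version only learns that LIC is
+// not Val, which still suffices to delete a matching switch case
+// (an illustrative summary of the two paths below).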
+void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val,
+ bool IsEqual) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // FIXME: Support correlated properties, like:
+ // for (...)
+ // if (li1 < li2)
+ // ...
+ // if (li1 > li2)
+ // ...
+
+ // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
+ // selects, switches.
+ std::vector<User*> Users(LIC->use_begin(), LIC->use_end());
+ std::vector<Instruction*> Worklist;
+
+ // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
+ // in the loop with the appropriate one directly.
+ if (IsEqual || (isa<ConstantInt>(Val) && Val->getType() == Type::Int1Ty)) {
+ Value *Replacement;
+ if (IsEqual)
+ Replacement = Val;
+ else
+ Replacement = ConstantInt::get(Type::Int1Ty,
+ !cast<ConstantInt>(Val)->getZExtValue());
+
+ for (unsigned i = 0, e = Users.size(); i != e; ++i)
+ if (Instruction *U = cast<Instruction>(Users[i])) {
+ if (!L->contains(U->getParent()))
+ continue;
+ U->replaceUsesOfWith(LIC, Replacement);
+ Worklist.push_back(U);
+ }
+ } else {
+ // Otherwise, we don't know the precise value of LIC, but we do know that it
+ // is certainly NOT "Val". As such, simplify any uses in the loop that we
+ // can. This case occurs when we unswitch switch statements.
+ for (unsigned i = 0, e = Users.size(); i != e; ++i)
+ if (Instruction *U = cast<Instruction>(Users[i])) {
+ if (!L->contains(U->getParent()))
+ continue;
+
+ Worklist.push_back(U);
+
+ // If we know that LIC is not Val, use this info to simplify code.
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(U)) {
+ for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) {
+ if (SI->getCaseValue(i) == Val) {
+ // Found a dead case value. Don't remove PHI nodes in the
+ // successor if they become single-entry, those PHI nodes may
+ // be in the Users list.
+
+ // FIXME: This is a hack. We need to keep the successor around
+ // and hooked up so as to preserve the loop structure, because
+ // trying to update it is complicated. So instead we preserve the
+            // loop structure and put the block on a dead code path.
+
+ BasicBlock *SISucc = SI->getSuccessor(i);
+ BasicBlock* Old = SI->getParent();
+ BasicBlock* Split = SplitBlock(Old, SI, this);
+
+ Instruction* OldTerm = Old->getTerminator();
+ BranchInst::Create(Split, SISucc,
+ ConstantInt::getTrue(), OldTerm);
+
+ LPM->deleteSimpleAnalysisValue(Old->getTerminator(), L);
+ Old->getTerminator()->eraseFromParent();
+
+ PHINode *PN;
+ for (BasicBlock::iterator II = SISucc->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ Value *InVal = PN->removeIncomingValue(Split, false);
+ PN->addIncoming(InVal, Old);
+ }
+
+ SI->removeCase(i);
+ break;
+ }
+ }
+ }
+
+ // TODO: We could do other simplifications, for example, turning
+ // LIC == Val -> false.
+ }
+ }
+
+ SimplifyCode(Worklist, L);
+}
+
+/// SimplifyCode - Okay, now that we have simplified some instructions in the
+/// loop, walk over it and constant prop, dce, and fold control flow where
+/// possible. Note that this is effectively a very simple loop-structure-aware
+/// optimizer. During processing of this loop, L could very well be deleted, so
+/// it must not be used.
+///
+/// FIXME: When the loop optimizer is more mature, separate this out to a new
+/// pass.
+///
+void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ // Simple constant folding.
+ if (Constant *C = ConstantFoldInstruction(I)) {
+ ReplaceUsesOfWith(I, C, Worklist, L, LPM);
+ continue;
+ }
+
+ // Simple DCE.
+ if (isInstructionTriviallyDead(I)) {
+ DOUT << "Remove dead instruction '" << *I;
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+ LPM->deleteSimpleAnalysisValue(I, L);
+ RemoveFromWorklist(I, Worklist);
+ I->eraseFromParent();
+ ++NumSimplify;
+ continue;
+ }
+
+ // Special case hacks that appear commonly in unswitched code.
+ switch (I->getOpcode()) {
+ case Instruction::Select:
+ if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(0))) {
+ ReplaceUsesOfWith(I, I->getOperand(!CB->getZExtValue()+1), Worklist, L,
+ LPM);
+ continue;
+ }
+ break;
+ case Instruction::And:
+ if (isa<ConstantInt>(I->getOperand(0)) &&
+ I->getOperand(0)->getType() == Type::Int1Ty) // constant -> RHS
+ cast<BinaryOperator>(I)->swapOperands();
+ if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1)))
+ if (CB->getType() == Type::Int1Ty) {
+ if (CB->isOne()) // X & 1 -> X
+ ReplaceUsesOfWith(I, I->getOperand(0), Worklist, L, LPM);
+ else // X & 0 -> 0
+ ReplaceUsesOfWith(I, I->getOperand(1), Worklist, L, LPM);
+ continue;
+ }
+ break;
+ case Instruction::Or:
+ if (isa<ConstantInt>(I->getOperand(0)) &&
+ I->getOperand(0)->getType() == Type::Int1Ty) // constant -> RHS
+ cast<BinaryOperator>(I)->swapOperands();
+ if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1)))
+ if (CB->getType() == Type::Int1Ty) {
+ if (CB->isOne()) // X | 1 -> 1
+ ReplaceUsesOfWith(I, I->getOperand(1), Worklist, L, LPM);
+ else // X | 0 -> X
+ ReplaceUsesOfWith(I, I->getOperand(0), Worklist, L, LPM);
+ continue;
+ }
+ break;
+ case Instruction::Br: {
+ BranchInst *BI = cast<BranchInst>(I);
+ if (BI->isUnconditional()) {
+ // If BI's parent is the only pred of the successor, fold the two blocks
+ // together.
+ BasicBlock *Pred = BI->getParent();
+ BasicBlock *Succ = BI->getSuccessor(0);
+ BasicBlock *SinglePred = Succ->getSinglePredecessor();
+ if (!SinglePred) continue; // Nothing to do.
+ assert(SinglePred == Pred && "CFG broken");
+
+ DOUT << "Merging blocks: " << Pred->getName() << " <- "
+ << Succ->getName() << "\n";
+
+ // Resolve any single entry PHI nodes in Succ.
+ while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
+ ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
+
+ // Move all of the successor contents from Succ to Pred.
+ Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(),
+ Succ->end());
+ LPM->deleteSimpleAnalysisValue(BI, L);
+ BI->eraseFromParent();
+ RemoveFromWorklist(BI, Worklist);
+
+ // If Succ has any successors with PHI nodes, update them to have
+ // entries coming from Pred instead of Succ.
+ Succ->replaceAllUsesWith(Pred);
+
+ // Remove Succ from the loop tree.
+ LI->removeBlock(Succ);
+ LPM->deleteSimpleAnalysisValue(Succ, L);
+ Succ->eraseFromParent();
+ ++NumSimplify;
+ } else if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){
+ // Conditional branch. Turn it into an unconditional branch, then
+ // remove dead blocks.
+ break; // FIXME: Enable.
+
+ DOUT << "Folded branch: " << *BI;
+ BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue());
+ BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue());
+ DeadSucc->removePredecessor(BI->getParent(), true);
+ Worklist.push_back(BranchInst::Create(LiveSucc, BI));
+ LPM->deleteSimpleAnalysisValue(BI, L);
+ BI->eraseFromParent();
+ RemoveFromWorklist(BI, Worklist);
+ ++NumSimplify;
+
+ RemoveBlockIfDead(DeadSucc, Worklist, L);
+ }
+ break;
+ }
+ }
+ }
+}
diff --git a/lib/Transforms/Scalar/Makefile b/lib/Transforms/Scalar/Makefile
new file mode 100644
index 0000000..cc42fd0
--- /dev/null
+++ b/lib/Transforms/Scalar/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Scalar/Makefile ----------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMScalarOpts
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
new file mode 100644
index 0000000..5cf0518
--- /dev/null
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -0,0 +1,741 @@
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memset's.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "memcpyopt"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetData.h"
+#include <list>
+using namespace llvm;
+
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+
+/// isBytewiseValue - If the specified value can be set by repeating the same
+/// byte in memory, return the i8 value that it is represented with. This is
+/// true for all i8 values obviously, but is also true for i32 0, i32 -1,
+/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated
+/// byte store (e.g. i16 0x1234), return null.
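+/// For example, i32 0x01010101 splits into equal halves 0x0101/0x0101 and
+/// then 0x01/0x01, so it splats to the byte 0x01, while i16 0x1234 fails at
+/// the first split.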
+static Value *isBytewiseValue(Value *V) {
+ // All byte-wide stores are splatable, even of arbitrary variables.
+ if (V->getType() == Type::Int8Ty) return V;
+
+ // Constant float and double values can be handled as integer values if the
+ // corresponding integer value is "byteable". An important case is 0.0.
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+ if (CFP->getType() == Type::FloatTy)
+ V = ConstantExpr::getBitCast(CFP, Type::Int32Ty);
+ if (CFP->getType() == Type::DoubleTy)
+ V = ConstantExpr::getBitCast(CFP, Type::Int64Ty);
+ // Don't handle long double formats, which have strange constraints.
+ }
+
+ // We can handle constant integers that are power of two in size and a
+ // multiple of 8 bits.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ unsigned Width = CI->getBitWidth();
+ if (isPowerOf2_32(Width) && Width > 8) {
+ // We can handle this value if the recursive binary decomposition is the
+ // same at all levels.
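+      // For illustration (not from the original source): i32 0xF0F0F0F0
+      // halves to 0xF0F0 / 0xF0F0 and then to 0xF0 / 0xF0, so we return the
+      // i8 value 0xF0, while i16 0x1234 halves to 0x12 / 0x34 and is
+      // rejected.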
+ APInt Val = CI->getValue();
+ APInt Val2;
+ while (Val.getBitWidth() != 8) {
+ unsigned NextWidth = Val.getBitWidth()/2;
+ Val2 = Val.lshr(NextWidth);
+ Val2.trunc(Val.getBitWidth()/2);
+ Val.trunc(Val.getBitWidth()/2);
+
+ // If the top/bottom halves aren't the same, reject it.
+ if (Val != Val2)
+ return 0;
+ }
+ return ConstantInt::get(Val);
+ }
+ }
+
+ // Conceptually, we could handle things like:
+ // %a = zext i8 %X to i16
+ // %b = shl i16 %a, 8
+ // %c = or i16 %a, %b
+ // but until there is an example that actually needs this, it doesn't seem
+ // worth worrying about.
+ return 0;
+}
+
+static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
+ bool &VariableIdxFound, TargetData &TD) {
+  // Skip over the first Idx-1 indices.
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (unsigned i = 1; i != Idx; ++i, ++GTI)
+ /*skip along*/;
+
+ // Compute the offset implied by the rest of the indices.
+ int64_t Offset = 0;
+ for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (OpC == 0)
+      return VariableIdxFound = true; // Returned offset is meaningless here.
+ if (OpC->isZero()) continue; // No offset.
+
+ // Handle struct indices, which add their field offset to the pointer.
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+ continue;
+ }
+
+ // Otherwise, we have a sequential type like an array or vector. Multiply
+ // the index by the ElementSize.
+ uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*OpC->getSExtValue();
+ }
+
+ return Offset;
+}
+
+/// IsPointerOffset - Return true if Ptr2 is provably equal to Ptr1 plus a
+/// constant offset, and return that constant offset. For example, Ptr1 might
+/// be &A[42], and Ptr2 might be &A[40]; with 4-byte elements the offset would
+/// be -8.
+static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
+ TargetData &TD) {
+ // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
+ // base. After that base, they may have some number of common (and
+  // potentially variable) indices. After those, any remaining indices must be
+  // constant, and those constants determine the pointers' offset from each
+  // other. We handle no other cases at this point.
+ GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
+ GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
+ if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
+ return false;
+
+ // Skip any common indices and track the GEP types.
+ unsigned Idx = 1;
+ for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
+ if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
+ break;
+
+ bool VariableIdxFound = false;
+ int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
+ int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
+ if (VariableIdxFound) return false;
+
+ Offset = Offset2-Offset1;
+ return true;
+}
+
+
+/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value.
+/// This allows us to analyze stores like:
+/// store 0 -> P+1
+/// store 0 -> P+0
+/// store 0 -> P+3
+/// store 0 -> P+2
+/// which sometimes happens with stores to arrays of structs etc. When we see
+/// the first store, we make a range [1, 2). The second store extends the range
+/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
+/// two ranges into [0, 3) which is memset'able.
+namespace {
+struct MemsetRange {
+  // Start/End - A semi-open interval that describes the span this range
+  // covers.
+ // The range is closed at the start and open at the end: [Start, End).
+ int64_t Start, End;
+
+ /// StartPtr - The getelementptr instruction that points to the start of the
+ /// range.
+ Value *StartPtr;
+
+ /// Alignment - The known alignment of the first store.
+ unsigned Alignment;
+
+ /// TheStores - The actual stores that make up this range.
+ SmallVector<StoreInst*, 16> TheStores;
+
+ bool isProfitableToUseMemset(const TargetData &TD) const;
+
+};
+} // end anon namespace
+
+bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const {
+  // If we found at least 8 stores to merge, or at least 64 bytes, use memset.
+ if (TheStores.size() >= 8 || End-Start >= 64) return true;
+
+ // Assume that the code generator is capable of merging pairs of stores
+ // together if it wants to.
+ if (TheStores.size() <= 2) return false;
+
+ // If we have fewer than 8 stores, it can still be worthwhile to do this.
+ // For example, merging 4 i8 stores into an i32 store is useful almost always.
+ // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
+ // memset will be split into 2 32-bit stores anyway) and doing so can
+ // pessimize the llvm optimizer.
+ //
+ // Since we don't have perfect knowledge here, make some assumptions: assume
+ // the maximum GPR width is the same size as the pointer size and assume that
+ // this width can be stored. If so, check to see whether we will end up
+ // actually reducing the number of stores used.
+ unsigned Bytes = unsigned(End-Start);
+ unsigned NumPointerStores = Bytes/TD.getPointerSize();
+
+  // Assume the remaining bytes, if any, are stored a byte at a time.
+ unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
+
+ // If we will reduce the # stores (according to this heuristic), do the
+ // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+ // etc.
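+  //
+  // For illustration (not from the original source): with 4-byte pointers, a
+  // 16-byte span gives NumPointerStores = 4 and NumByteStores = 0, so under
+  // this heuristic the memset wins only if it replaces more than 4 stores.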
+ return TheStores.size() > NumPointerStores+NumByteStores;
+}
+
+
+namespace {
+class MemsetRanges {
+ /// Ranges - A sorted list of the memset ranges. We use std::list here
+ /// because each element is relatively large and expensive to copy.
+ std::list<MemsetRange> Ranges;
+ typedef std::list<MemsetRange>::iterator range_iterator;
+ TargetData &TD;
+public:
+ MemsetRanges(TargetData &td) : TD(td) {}
+
+ typedef std::list<MemsetRange>::const_iterator const_iterator;
+ const_iterator begin() const { return Ranges.begin(); }
+ const_iterator end() const { return Ranges.end(); }
+ bool empty() const { return Ranges.empty(); }
+
+ void addStore(int64_t OffsetFromFirst, StoreInst *SI);
+};
+
+} // end anon namespace
+
+
+/// addStore - Add a new store to the MemsetRanges data structure. This adds a
+/// new range for the specified store at the specified offset, merging into
+/// existing ranges as appropriate.
+void MemsetRanges::addStore(int64_t Start, StoreInst *SI) {
+ int64_t End = Start+TD.getTypeStoreSize(SI->getOperand(0)->getType());
+
+ // Do a linear search of the ranges to see if this can be joined and/or to
+ // find the insertion point in the list. We keep the ranges sorted for
+  // simplicity here. This is a linear search of a linked list, which is ugly;
+  // however, the number of ranges is limited, so this won't get crazy slow.
+ range_iterator I = Ranges.begin(), E = Ranges.end();
+
+ while (I != E && Start > I->End)
+ ++I;
+
+  // We now know either that I == E, in which case we didn't find anything to
+  // merge with, or that Start <= I->End. If End < I->Start or I == E, then we
+  // need to insert a new range. Handle this now.
+ if (I == E || End < I->Start) {
+ MemsetRange &R = *Ranges.insert(I, MemsetRange());
+ R.Start = Start;
+ R.End = End;
+ R.StartPtr = SI->getPointerOperand();
+ R.Alignment = SI->getAlignment();
+ R.TheStores.push_back(SI);
+ return;
+ }
+
+ // This store overlaps with I, add it.
+ I->TheStores.push_back(SI);
+
+ // At this point, we may have an interval that completely contains our store.
+ // If so, just add it to the interval and return.
+ if (I->Start <= Start && I->End >= End)
+ return;
+
+  // Now we know that Start <= I->End and End >= I->Start, so the new store
+  // overlaps I but is not entirely contained within it.
+
+  // See if the new store extends the start of I. In this case, it couldn't
+  // possibly cause I to join the prior range, because otherwise we would have
+  // stopped on *it*.
+ if (Start < I->Start) {
+ I->Start = Start;
+ I->StartPtr = SI->getPointerOperand();
+ }
+
+ // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+ // is in or right at the end of I), and that End >= I->Start. Extend I out to
+ // End.
+ if (End > I->End) {
+ I->End = End;
+ range_iterator NextI = I;
+ while (++NextI != E && End >= NextI->Start) {
+ // Merge the range in.
+ I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+ if (NextI->End > I->End)
+ I->End = NextI->End;
+ Ranges.erase(NextI);
+ NextI = I;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MemCpyOpt Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+ class VISIBILITY_HIDDEN MemCpyOpt : public FunctionPass {
+ bool runOnFunction(Function &F);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ MemCpyOpt() : FunctionPass(&ID) {}
+
+ private:
+    // This transformation requires dominator info
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<MemoryDependenceAnalysis>();
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<TargetData>();
+ AU.addPreserved<AliasAnalysis>();
+ AU.addPreserved<MemoryDependenceAnalysis>();
+ AU.addPreserved<TargetData>();
+ }
+
+    // Helper functions
+ bool processStore(StoreInst *SI, BasicBlock::iterator& BBI);
+ bool processMemCpy(MemCpyInst* M);
+ bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C);
+ bool iterateOnFunction(Function &F);
+ };
+
+ char MemCpyOpt::ID = 0;
+}
+
+// createMemCpyOptPass - The public interface to this file...
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
+
+static RegisterPass<MemCpyOpt> X("memcpyopt",
+ "MemCpy Optimization");
+
+
+
+/// processStore - When scanning forward over instructions, we look for
+/// patterns to fold away. In particular, this looks for stores to
+/// neighboring locations of memory. If it sees enough consecutive ones
+/// (currently 4) it attempts to merge them together into a memcpy/memset.
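+///
+/// For illustration (not from the original source), a run of stores such as
+///   store i8 0, i8* %p
+///   store i8 0, i8* %p1    ; %p1 = getelementptr i8* %p, i32 1
+///   store i8 0, i8* %p2    ; %p2 = getelementptr i8* %p, i32 2
+///   store i8 0, i8* %p3    ; %p3 = getelementptr i8* %p, i32 3
+/// can be collapsed into a single llvm.memset of length 4.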
+bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
+ if (SI->isVolatile()) return false;
+
+ // There are two cases that are interesting for this code to handle: memcpy
+ // and memset. Right now we only handle memset.
+
+  // Ensure that the value being stored is something that can be memset a
+  // byte at a time, like "0" or "-1" of any width, as well as things like
+  // 0xA0A0A0A0 and 0.0.
+ Value *ByteVal = isBytewiseValue(SI->getOperand(0));
+ if (!ByteVal)
+ return false;
+
+ TargetData &TD = getAnalysis<TargetData>();
+ AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  // Okay, so we now have a single store of a splatable value. Scan to find
+  // all subsequent stores of the same value at some offset from the same
+  // pointer.
+ // Join these together into ranges, so we can decide whether contiguous blocks
+ // are stored.
+ MemsetRanges Ranges(TD);
+
+ Value *StartPtr = SI->getPointerOperand();
+
+ BasicBlock::iterator BI = SI;
+ for (++BI; !isa<TerminatorInst>(BI); ++BI) {
+ if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) {
+      // If the call is readnone, ignore it; otherwise bail out. We don't even
+      // allow readonly here because we don't want something like:
+      // A[1] = 2; strlen(A); A[2] = 2; -> memset(A, ...); strlen(A).
+ if (AA.getModRefBehavior(CallSite::get(BI)) ==
+ AliasAnalysis::DoesNotAccessMemory)
+ continue;
+
+ // TODO: If this is a memset, try to join it in.
+
+ break;
+ } else if (isa<VAArgInst>(BI) || isa<LoadInst>(BI))
+ break;
+
+ // If this is a non-store instruction it is fine, ignore it.
+ StoreInst *NextStore = dyn_cast<StoreInst>(BI);
+ if (NextStore == 0) continue;
+
+ // If this is a store, see if we can merge it in.
+ if (NextStore->isVolatile()) break;
+
+ // Check to see if this stored value is of the same byte-splattable value.
+ if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ int64_t Offset;
+ if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, TD))
+ break;
+
+ Ranges.addStore(Offset, NextStore);
+ }
+
+ // If we have no ranges, then we just had a single store with nothing that
+ // could be merged in. This is a very common case of course.
+ if (Ranges.empty())
+ return false;
+
+  // If we had at least one store that could be merged in, add the starting
+  // store as well. As a small compile-time optimization, we avoid doing this
+  // unless there is at least something interesting to merge with.
+ Ranges.addStore(0, SI);
+
+
+ Function *MemSetF = 0;
+
+ // Now that we have full information about ranges, loop over the ranges and
+ // emit memset's for anything big enough to be worthwhile.
+ bool MadeChange = false;
+ for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
+ I != E; ++I) {
+ const MemsetRange &Range = *I;
+
+ if (Range.TheStores.size() == 1) continue;
+
+ // If it is profitable to lower this range to memset, do so now.
+ if (!Range.isProfitableToUseMemset(TD))
+ continue;
+
+ // Otherwise, we do want to transform this! Create a new memset. We put
+ // the memset right before the first instruction that isn't part of this
+    // memset block. This ensures that the memset is dominated by any addressing
+ // instruction needed by the start of the block.
+ BasicBlock::iterator InsertPt = BI;
+
+ if (MemSetF == 0) {
+ const Type *Tys[] = {Type::Int64Ty};
+ MemSetF = Intrinsic::getDeclaration(SI->getParent()->getParent()
+ ->getParent(), Intrinsic::memset,
+ Tys, 1);
+ }
+
+ // Get the starting pointer of the block.
+ StartPtr = Range.StartPtr;
+
+ // Cast the start ptr to be i8* as memset requires.
+ const Type *i8Ptr = PointerType::getUnqual(Type::Int8Ty);
+ if (StartPtr->getType() != i8Ptr)
+ StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getNameStart(),
+ InsertPt);
+
+ Value *Ops[] = {
+ StartPtr, ByteVal, // Start, value
+ ConstantInt::get(Type::Int64Ty, Range.End-Range.Start), // size
+ ConstantInt::get(Type::Int32Ty, Range.Alignment) // align
+ };
+ Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt);
+ DEBUG(cerr << "Replace stores:\n";
+ for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
+ cerr << *Range.TheStores[i];
+      cerr << "With: " << *C); C=C;  // Keep C "used" when DEBUG compiles away.
+
+ // Don't invalidate the iterator
+ BBI = BI;
+
+ // Zap all the stores.
+ for (SmallVector<StoreInst*, 16>::const_iterator SI = Range.TheStores.begin(),
+ SE = Range.TheStores.end(); SI != SE; ++SI)
+ (*SI)->eraseFromParent();
+ ++NumMemSetInfer;
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+
+/// performCallSlotOptzn - takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a call slot optimization by having
+/// the call write its result directly into the destination of the memcpy.
+bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
+ // The general transformation to keep in mind is
+ //
+ // call @func(..., src, ...)
+ // memcpy(dest, src, ...)
+ //
+ // ->
+ //
+ // memcpy(dest, src, ...)
+ // call @func(..., dest, ...)
+ //
+ // Since moving the memcpy is technically awkward, we additionally check that
+ // src only holds uninitialized values at the moment of the call, meaning that
+ // the memcpy can be discarded rather than moved.
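+  //
+  // For illustration (not from the original source), in C terms:
+  //   struct S tmp; f(&tmp); memcpy(&dest, &tmp, sizeof tmp);
+  // becomes f(&dest) with the memcpy dropped, provided tmp is an alloca used
+  // nowhere else and dest is at least sizeof tmp bytes.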
+
+ // Deliberately get the source and destination with bitcasts stripped away,
+ // because we'll need to do type comparisons based on the underlying type.
+ Value* cpyDest = cpy->getDest();
+ Value* cpySrc = cpy->getSource();
+ CallSite CS = CallSite::get(C);
+
+ // We need to be able to reason about the size of the memcpy, so we require
+ // that it be a constant.
+ ConstantInt* cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
+ if (!cpyLength)
+ return false;
+
+ // Require that src be an alloca. This simplifies the reasoning considerably.
+ AllocaInst* srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+ if (!srcAlloca)
+ return false;
+
+ // Check that all of src is copied to dest.
+ TargetData& TD = getAnalysis<TargetData>();
+
+ ConstantInt* srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+ if (!srcArraySize)
+ return false;
+
+ uint64_t srcSize = TD.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+ srcArraySize->getZExtValue();
+
+ if (cpyLength->getZExtValue() < srcSize)
+ return false;
+
+ // Check that accessing the first srcSize bytes of dest will not cause a
+ // trap. Otherwise the transform is invalid since it might cause a trap
+ // to occur earlier than it otherwise would.
+ if (AllocaInst* A = dyn_cast<AllocaInst>(cpyDest)) {
+ // The destination is an alloca. Check it is larger than srcSize.
+ ConstantInt* destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
+ if (!destArraySize)
+ return false;
+
+ uint64_t destSize = TD.getTypeAllocSize(A->getAllocatedType()) *
+ destArraySize->getZExtValue();
+
+ if (destSize < srcSize)
+ return false;
+ } else if (Argument* A = dyn_cast<Argument>(cpyDest)) {
+ // If the destination is an sret parameter then only accesses that are
+ // outside of the returned struct type can trap.
+ if (!A->hasStructRetAttr())
+ return false;
+
+ const Type* StructTy = cast<PointerType>(A->getType())->getElementType();
+ uint64_t destSize = TD.getTypeAllocSize(StructTy);
+
+ if (destSize < srcSize)
+ return false;
+ } else {
+ return false;
+ }
+
+ // Check that src is not accessed except via the call and the memcpy. This
+ // guarantees that it holds only undefined values when passed in (so the final
+ // memcpy can be dropped), that it is not read or written between the call and
+ // the memcpy, and that writing beyond the end of it is undefined.
+ SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(),
+ srcAlloca->use_end());
+ while (!srcUseList.empty()) {
+ User* UI = srcUseList.back();
+ srcUseList.pop_back();
+
+ if (isa<BitCastInst>(UI)) {
+ for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+ I != E; ++I)
+ srcUseList.push_back(*I);
+ } else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(UI)) {
+ if (G->hasAllZeroIndices())
+ for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+ I != E; ++I)
+ srcUseList.push_back(*I);
+ else
+ return false;
+ } else if (UI != C && UI != cpy) {
+ return false;
+ }
+ }
+
+ // Since we're changing the parameter to the callsite, we need to make sure
+ // that what would be the new parameter dominates the callsite.
+ DominatorTree& DT = getAnalysis<DominatorTree>();
+ if (Instruction* cpyDestInst = dyn_cast<Instruction>(cpyDest))
+ if (!DT.dominates(cpyDestInst, C))
+ return false;
+
+ // In addition to knowing that the call does not access src in some
+ // unexpected manner, for example via a global, which we deduce from
+ // the use analysis, we also need to know that it does not sneakily
+ // access dest. We rely on AA to figure this out for us.
+ AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+ if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) !=
+ AliasAnalysis::NoModRef)
+ return false;
+
+ // All the checks have passed, so do the transformation.
+ bool changedArgument = false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+ if (cpySrc->getType() != cpyDest->getType())
+ cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+ cpyDest->getName(), C);
+ changedArgument = true;
+ if (CS.getArgument(i)->getType() != cpyDest->getType())
+ CS.setArgument(i, CastInst::CreatePointerCast(cpyDest,
+ CS.getArgument(i)->getType(), cpyDest->getName(), C));
+ else
+ CS.setArgument(i, cpyDest);
+ }
+
+ if (!changedArgument)
+ return false;
+
+ // Drop any cached information about the call, because we may have changed
+ // its dependence information by changing its parameter.
+ MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+ MD.removeInstruction(C);
+
+ // Remove the memcpy
+ MD.removeInstruction(cpy);
+ cpy->eraseFromParent();
+ NumMemCpyInstr++;
+
+ return true;
+}
+
+/// processMemCpy - perform simplification of memcpy's. If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances).
+/// This allows later passes to remove the first memcpy altogether.
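+///
+/// For illustration (not from the original source), in IR-like pseudocode:
+///   call void @llvm.memcpy.i64(i8* %Y, i8* %X, i64 64, i32 8)  ; A: X -> Y
+///   call void @llvm.memcpy.i64(i8* %Z, i8* %Y, i64 64, i32 8)  ; B: Y -> Z
+/// B is rewritten to copy from %X directly, after which A may be dead.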
+bool MemCpyOpt::processMemCpy(MemCpyInst* M) {
+ MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+
+  // There are two possible optimizations we can do for memcpy:
+  //   a) memcpy-memcpy xform which exposes redundancy for DSE
+ // b) call-memcpy xform for return slot optimization
+ MemDepResult dep = MD.getDependency(M);
+ if (!dep.isClobber())
+ return false;
+ if (!isa<MemCpyInst>(dep.getInst())) {
+ if (CallInst* C = dyn_cast<CallInst>(dep.getInst()))
+ return performCallSlotOptzn(M, C);
+ return false;
+ }
+
+ MemCpyInst* MDep = cast<MemCpyInst>(dep.getInst());
+
+  // We can only transform memcpy's where the dest of one is the source of the
+  // other.
+ if (M->getSource() != MDep->getDest())
+ return false;
+
+  // Second, the lengths of the memcpy's must be the same, or the preceding one
+ // must be larger than the following one.
+ ConstantInt* C1 = dyn_cast<ConstantInt>(MDep->getLength());
+ ConstantInt* C2 = dyn_cast<ConstantInt>(M->getLength());
+ if (!C1 || !C2)
+ return false;
+
+ uint64_t DepSize = C1->getValue().getZExtValue();
+ uint64_t CpySize = C2->getValue().getZExtValue();
+
+ if (DepSize < CpySize)
+ return false;
+
+ // Finally, we have to make sure that the dest of the second does not
+ // alias the source of the first
+ AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+ if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) !=
+ AliasAnalysis::NoAlias)
+ return false;
+ else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) !=
+ AliasAnalysis::NoAlias)
+ return false;
+ else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize)
+ != AliasAnalysis::NoAlias)
+ return false;
+
+ // If all checks passed, then we can transform these memcpy's
+ const Type *Tys[1];
+ Tys[0] = M->getLength()->getType();
+ Function* MemCpyFun = Intrinsic::getDeclaration(
+ M->getParent()->getParent()->getParent(),
+ M->getIntrinsicID(), Tys, 1);
+
+ Value *Args[4] = {
+ M->getRawDest(), MDep->getRawSource(), M->getLength(), M->getAlignmentCst()
+ };
+
+ CallInst* C = CallInst::Create(MemCpyFun, Args, Args+4, "", M);
+
+
+ // If C and M don't interfere, then this is a valid transformation. If they
+ // did, this would mean that the two sources overlap, which would be bad.
+ if (MD.getDependency(C) == dep) {
+ MD.removeInstruction(M);
+ M->eraseFromParent();
+ NumMemCpyInstr++;
+ return true;
+ }
+
+ // Otherwise, there was no point in doing this, so we remove the call we
+ // inserted and act like nothing happened.
+ MD.removeInstruction(C);
+ C->eraseFromParent();
+ return false;
+}
+
+// MemCpyOpt::runOnFunction - This is the main transformation entry point for a
+// function.
+//
+bool MemCpyOpt::runOnFunction(Function& F) {
+
+ bool changed = false;
+ bool shouldContinue = true;
+
+ while (shouldContinue) {
+ shouldContinue = iterateOnFunction(F);
+ changed |= shouldContinue;
+ }
+
+ return changed;
+}
+
+
+// MemCpyOpt::iterateOnFunction - Executes one iteration of MemCpyOpt
+bool MemCpyOpt::iterateOnFunction(Function &F) {
+ bool changed_function = false;
+
+  // Walk all instructions in the function
+ for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE;) {
+ // Avoid invalidating the iterator
+ Instruction* I = BI++;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ changed_function |= processStore(SI, BI);
+ else if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
+ changed_function |= processMemCpy(M);
+ }
+ }
+ }
+
+ return changed_function;
+}
diff --git a/lib/Transforms/Scalar/PredicateSimplifier.cpp b/lib/Transforms/Scalar/PredicateSimplifier.cpp
new file mode 100644
index 0000000..a7e4d6e
--- /dev/null
+++ b/lib/Transforms/Scalar/PredicateSimplifier.cpp
@@ -0,0 +1,2725 @@
+//===-- PredicateSimplifier.cpp - Path Sensitive Simplifier ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Path-sensitive optimizer. In a branch where x == y, replace uses of
+// x with y. Permits further optimization, such as the elimination of
+// the unreachable call:
+//
+// void test(int *p, int *q)
+// {
+// if (p != q)
+// return;
+//
+// if (*p != *q)
+// foo(); // unreachable
+// }
+//
+//===----------------------------------------------------------------------===//
+//
+// The InequalityGraph focuses on four properties: equals, not equals,
+// less-than and less-than-or-equal-to. The greater-than forms are also held
+// just to allow walking from a lesser node to a greater one. These properties
+// are stored in a lattice; LE can become LT or EQ, NE can become LT or GT.
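+//
+// For illustration (not from the original source): if %x LE %y is known and
+// we later learn %x NE %y, the relation tightens to LT; learning %x GE %y
+// instead would tighten it to EQ, which is represented by merging the nodes.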
+//
+// These relationships define a graph between values of the same type. Each
+// Value is stored in a map table that retrieves the associated Node. This
+// is how EQ relationships are stored; the map contains pointers from equal
+// Values to the same node. The node contains the most canonical Value* form
+// and the list of known relationships with other nodes.
+//
+// If two nodes are known to be unequal, then they will contain pointers to
+// each other with an "NE" relationship. If node getNode(%x) is less than
+// getNode(%y), then the %x node will contain <%y, GT> and %y will contain
+// <%x, LT>. This allows us to tie nodes together into a graph like this:
+//
+// %a < %b < %c < %d
+//
+// with four nodes representing the properties. The InequalityGraph provides
+// querying with "isRelatedBy" and mutators "addEquality" and "addInequality".
+// To find a relationship, we start with one of the nodes and binary search
+// through its list to find where the relationships with the second node start.
+// Then we iterate through those to find the first relationship that dominates
+// our context node.
+//
+// To create these properties, we wait until a branch or switch instruction
+// implies that a particular value is true (or false). The VRPSolver is
+// responsible for analyzing the variable and seeing what new inferences
+// can be made from each property. For example:
+//
+// %P = icmp ne i32* %ptr, null
+// %a = and i1 %P, %Q
+// br i1 %a label %cond_true, label %cond_false
+//
+// For the true branch, the VRPSolver will start with %a EQ true and look at
+// the definition of %a and find that it can infer that %P and %Q are both
+// true. From %P being true, it can infer that %ptr NE null. For the false
+// branch it can't infer anything from the "and" instruction.
+//
+// Besides branches, we can also infer properties from instructions that may
+// have undefined behaviour in certain cases. For example, the divisor of
+// a division may never be zero. After the division instruction, we may assume
+// that the divisor is not equal to zero.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ValueRanges class stores the known integer bounds of a Value. When we
+// encounter i8 %a u< %b, the ValueRanges stores that %a = [0, 254] and
+// %b = [1, 255].
+//
+// It never stores an empty range, because that means that the code is
+// unreachable; unreachability is better represented by UnreachableBlocks. It
+// never stores a single-element range, since that's an equality relationship
+// better stored in the InequalityGraph.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "predsimplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <deque>
+#include <stack>
+using namespace llvm;
+
+STATISTIC(NumVarsReplaced, "Number of argument substitutions");
+STATISTIC(NumInstruction , "Number of instructions removed");
+STATISTIC(NumSimple , "Number of simple replacements");
+STATISTIC(NumBlocks , "Number of blocks marked unreachable");
+STATISTIC(NumSnuggle , "Number of comparisons snuggled");
+
+namespace {
+ class DomTreeDFS {
+ public:
+ class Node {
+ friend class DomTreeDFS;
+ public:
+ typedef std::vector<Node *>::iterator iterator;
+ typedef std::vector<Node *>::const_iterator const_iterator;
+
+ unsigned getDFSNumIn() const { return DFSin; }
+ unsigned getDFSNumOut() const { return DFSout; }
+
+ BasicBlock *getBlock() const { return BB; }
+
+ iterator begin() { return Children.begin(); }
+ iterator end() { return Children.end(); }
+
+ const_iterator begin() const { return Children.begin(); }
+ const_iterator end() const { return Children.end(); }
+
+ bool dominates(const Node *N) const {
+ return DFSin <= N->DFSin && DFSout >= N->DFSout;
+ }
+
+ bool DominatedBy(const Node *N) const {
+ return N->dominates(this);
+ }
+
+ /// Sorts by the number of descendants. With this, you can iterate
+ /// through a sorted list and the first matching entry is the most
+ /// specific match for your basic block. The order provided is stable;
+ /// DomTreeDFS::Nodes with the same number of descendants are sorted by
+ /// DFS in number.
+ bool operator<(const Node &N) const {
+ unsigned spread = DFSout - DFSin;
+ unsigned N_spread = N.DFSout - N.DFSin;
+ if (spread == N_spread) return DFSin < N.DFSin;
+ return spread < N_spread;
+ }
+ bool operator>(const Node &N) const { return N < *this; }
+
+ private:
+ unsigned DFSin, DFSout;
+ BasicBlock *BB;
+
+ std::vector<Node *> Children;
+ };
+
+ // XXX: this may be slow. Instead of using "new" for each node, consider
+ // putting them in a vector to keep them contiguous.
+ explicit DomTreeDFS(DominatorTree *DT) {
+ std::stack<std::pair<Node *, DomTreeNode *> > S;
+
+ Entry = new Node;
+ Entry->BB = DT->getRootNode()->getBlock();
+ S.push(std::make_pair(Entry, DT->getRootNode()));
+
+ NodeMap[Entry->BB] = Entry;
+
+ while (!S.empty()) {
+ std::pair<Node *, DomTreeNode *> &Pair = S.top();
+ Node *N = Pair.first;
+ DomTreeNode *DTNode = Pair.second;
+ S.pop();
+
+ for (DomTreeNode::iterator I = DTNode->begin(), E = DTNode->end();
+ I != E; ++I) {
+ Node *NewNode = new Node;
+ NewNode->BB = (*I)->getBlock();
+ N->Children.push_back(NewNode);
+ S.push(std::make_pair(NewNode, *I));
+
+ NodeMap[NewNode->BB] = NewNode;
+ }
+ }
+
+ renumber();
+
+#ifndef NDEBUG
+ DEBUG(dump());
+#endif
+ }
+
+#ifndef NDEBUG
+ virtual
+#endif
+ ~DomTreeDFS() {
+ std::stack<Node *> S;
+
+ S.push(Entry);
+ while (!S.empty()) {
+ Node *N = S.top(); S.pop();
+
+ for (Node::iterator I = N->begin(), E = N->end(); I != E; ++I)
+ S.push(*I);
+
+ delete N;
+ }
+ }
+
+ /// getRootNode - This returns the entry node for the CFG of the function.
+ Node *getRootNode() const { return Entry; }
+
+ /// getNodeForBlock - return the node for the specified basic block.
+ Node *getNodeForBlock(BasicBlock *BB) const {
+ if (!NodeMap.count(BB)) return 0;
+ return const_cast<DomTreeDFS*>(this)->NodeMap[BB];
+ }
+
+    /// dominates - returns true if the basic block of I1 dominates the basic
+    /// block of I2. If the instructions belong to the same basic block, the
+    /// instruction that comes first sequentially in the block is considered
+    /// dominating.
+ bool dominates(Instruction *I1, Instruction *I2) {
+ BasicBlock *BB1 = I1->getParent(),
+ *BB2 = I2->getParent();
+ if (BB1 == BB2) {
+ if (isa<TerminatorInst>(I1)) return false;
+ if (isa<TerminatorInst>(I2)) return true;
+ if ( isa<PHINode>(I1) && !isa<PHINode>(I2)) return true;
+ if (!isa<PHINode>(I1) && isa<PHINode>(I2)) return false;
+
+ for (BasicBlock::const_iterator I = BB2->begin(), E = BB2->end();
+ I != E; ++I) {
+ if (&*I == I1) return true;
+ else if (&*I == I2) return false;
+ }
+ assert(!"Instructions not found in parent BasicBlock?");
+ } else {
+ Node *Node1 = getNodeForBlock(BB1),
+ *Node2 = getNodeForBlock(BB2);
+ return Node1 && Node2 && Node1->dominates(Node2);
+ }
+ return false; // Not reached
+ }
+
+ private:
+ /// renumber - calculates the depth first search numberings and applies
+    /// them to the nodes.
+ void renumber() {
+ std::stack<std::pair<Node *, Node::iterator> > S;
+ unsigned n = 0;
+
+ Entry->DFSin = ++n;
+ S.push(std::make_pair(Entry, Entry->begin()));
+
+ while (!S.empty()) {
+ std::pair<Node *, Node::iterator> &Pair = S.top();
+ Node *N = Pair.first;
+ Node::iterator &I = Pair.second;
+
+ if (I == N->end()) {
+ N->DFSout = ++n;
+ S.pop();
+ } else {
+ Node *Next = *I++;
+ Next->DFSin = ++n;
+ S.push(std::make_pair(Next, Next->begin()));
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ virtual void dump() const {
+ dump(*cerr.stream());
+ }
+
+ void dump(std::ostream &os) const {
+ os << "Predicate simplifier DomTreeDFS: \n";
+ dump(Entry, 0, os);
+ os << "\n\n";
+ }
+
+ void dump(Node *N, int depth, std::ostream &os) const {
+ ++depth;
+ for (int i = 0; i < depth; ++i) { os << " "; }
+ os << "[" << depth << "] ";
+
+ os << N->getBlock()->getName() << " (" << N->getDFSNumIn()
+ << ", " << N->getDFSNumOut() << ")\n";
+
+ for (Node::iterator I = N->begin(), E = N->end(); I != E; ++I)
+ dump(*I, depth, os);
+ }
+#endif
+
+ Node *Entry;
+ std::map<BasicBlock *, Node *> NodeMap;
+ };
+
+ // SLT SGT ULT UGT EQ
+ // 0 1 0 1 0 -- GT 10
+ // 0 1 0 1 1 -- GE 11
+ // 0 1 1 0 0 -- SGTULT 12
+ // 0 1 1 0 1 -- SGEULE 13
+ // 0 1 1 1 0 -- SGT 14
+ // 0 1 1 1 1 -- SGE 15
+ // 1 0 0 1 0 -- SLTUGT 18
+ // 1 0 0 1 1 -- SLEUGE 19
+ // 1 0 1 0 0 -- LT 20
+ // 1 0 1 0 1 -- LE 21
+ // 1 0 1 1 0 -- SLT 22
+ // 1 0 1 1 1 -- SLE 23
+ // 1 1 0 1 0 -- UGT 26
+ // 1 1 0 1 1 -- UGE 27
+ // 1 1 1 0 0 -- ULT 28
+ // 1 1 1 0 1 -- ULE 29
+ // 1 1 1 1 0 -- NE 30
+ enum LatticeBits {
+ EQ_BIT = 1, UGT_BIT = 2, ULT_BIT = 4, SGT_BIT = 8, SLT_BIT = 16
+ };
+ enum LatticeVal {
+ GT = SGT_BIT | UGT_BIT,
+ GE = GT | EQ_BIT,
+ LT = SLT_BIT | ULT_BIT,
+ LE = LT | EQ_BIT,
+ NE = SLT_BIT | SGT_BIT | ULT_BIT | UGT_BIT,
+ SGTULT = SGT_BIT | ULT_BIT,
+ SGEULE = SGTULT | EQ_BIT,
+ SLTUGT = SLT_BIT | UGT_BIT,
+ SLEUGE = SLTUGT | EQ_BIT,
+ ULT = SLT_BIT | SGT_BIT | ULT_BIT,
+ UGT = SLT_BIT | SGT_BIT | UGT_BIT,
+ SLT = SLT_BIT | ULT_BIT | UGT_BIT,
+ SGT = SGT_BIT | ULT_BIT | UGT_BIT,
+ SLE = SLT | EQ_BIT,
+ SGE = SGT | EQ_BIT,
+ ULE = ULT | EQ_BIT,
+ UGE = UGT | EQ_BIT
+ };
+
+#ifndef NDEBUG
+ /// validPredicate - determines whether a given value is actually a lattice
+ /// value. Only used in assertions or debugging.
+ static bool validPredicate(LatticeVal LV) {
+ switch (LV) {
+ case GT: case GE: case LT: case LE: case NE:
+ case SGTULT: case SGT: case SGEULE:
+ case SLTUGT: case SLT: case SLEUGE:
+ case ULT: case UGT:
+ case SLE: case SGE: case ULE: case UGE:
+ return true;
+ default:
+ return false;
+ }
+ }
+#endif
+
+ /// reversePredicate - reverse the direction of the inequality
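+  /// For illustration (not from the original source): reversing SGTULT
+  /// yields SLTUGT, and reversing SLT yields SGT (the unconstrained unsigned
+  /// half is restored by the fix-ups below); EQ_BIT is preserved, so LE
+  /// reverses to GE.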
+ static LatticeVal reversePredicate(LatticeVal LV) {
+ unsigned reverse = LV ^ (SLT_BIT|SGT_BIT|ULT_BIT|UGT_BIT); //preserve EQ_BIT
+
+ if ((reverse & (SLT_BIT|SGT_BIT)) == 0)
+ reverse |= (SLT_BIT|SGT_BIT);
+
+ if ((reverse & (ULT_BIT|UGT_BIT)) == 0)
+ reverse |= (ULT_BIT|UGT_BIT);
+
+ LatticeVal Rev = static_cast<LatticeVal>(reverse);
+ assert(validPredicate(Rev) && "Failed reversing predicate.");
+ return Rev;
+ }
+
+ /// ValueNumbering stores the scope-specific value numbers for a given Value.
+ class VISIBILITY_HIDDEN ValueNumbering {
+
+ /// VNPair is a tuple of {Value, index number, DomTreeDFS::Node}. It
+ /// includes the comparison operators necessary to allow you to store it
+ /// in a sorted vector.
+ class VISIBILITY_HIDDEN VNPair {
+ public:
+ Value *V;
+ unsigned index;
+ DomTreeDFS::Node *Subtree;
+
+ VNPair(Value *V, unsigned index, DomTreeDFS::Node *Subtree)
+ : V(V), index(index), Subtree(Subtree) {}
+
+ bool operator==(const VNPair &RHS) const {
+ return V == RHS.V && Subtree == RHS.Subtree;
+ }
+
+ bool operator<(const VNPair &RHS) const {
+ if (V != RHS.V) return V < RHS.V;
+ return *Subtree < *RHS.Subtree;
+ }
+
+ bool operator<(Value *RHS) const {
+ return V < RHS;
+ }
+
+ bool operator>(Value *RHS) const {
+ return V > RHS;
+ }
+
+ friend bool operator<(Value *RHS, const VNPair &pair) {
+ return pair.operator>(RHS);
+ }
+ };
+
+ typedef std::vector<VNPair> VNMapType;
+ VNMapType VNMap;
+
+ /// The canonical choice for value number at index.
+ std::vector<Value *> Values;
+
+ DomTreeDFS *DTDFS;
+
+ public:
+#ifndef NDEBUG
+ virtual ~ValueNumbering() {}
+ virtual void dump() {
+ dump(*cerr.stream());
+ }
+
+ void dump(std::ostream &os) {
+ for (unsigned i = 1; i <= Values.size(); ++i) {
+ os << i << " = ";
+ WriteAsOperand(os, Values[i-1]);
+ os << " {";
+ for (unsigned j = 0; j < VNMap.size(); ++j) {
+ if (VNMap[j].index == i) {
+ WriteAsOperand(os, VNMap[j].V);
+ os << " (" << VNMap[j].Subtree->getDFSNumIn() << ") ";
+ }
+ }
+ os << "}\n";
+ }
+ }
+#endif
+
+ /// compare - returns true if V1 is a better canonical value than V2.
+ bool compare(Value *V1, Value *V2) const {
+ if (isa<Constant>(V1))
+ return !isa<Constant>(V2);
+ else if (isa<Constant>(V2))
+ return false;
+ else if (isa<Argument>(V1))
+ return !isa<Argument>(V2);
+ else if (isa<Argument>(V2))
+ return false;
+
+ Instruction *I1 = dyn_cast<Instruction>(V1);
+ Instruction *I2 = dyn_cast<Instruction>(V2);
+
+ if (!I1 || !I2)
+ return V1->getNumUses() < V2->getNumUses();
+
+ return DTDFS->dominates(I1, I2);
+ }
+
+ ValueNumbering(DomTreeDFS *DTDFS) : DTDFS(DTDFS) {}
+
+ /// valueNumber - finds the value number for V under the Subtree. If
+ /// there is no value number, returns zero.
+ unsigned valueNumber(Value *V, DomTreeDFS::Node *Subtree) {
+ if (!(isa<Constant>(V) || isa<Argument>(V) || isa<Instruction>(V))
+ || V->getType() == Type::VoidTy) return 0;
+
+ VNMapType::iterator E = VNMap.end();
+ VNPair pair(V, 0, Subtree);
+ VNMapType::iterator I = std::lower_bound(VNMap.begin(), E, pair);
+ while (I != E && I->V == V) {
+ if (I->Subtree->dominates(Subtree))
+ return I->index;
+ ++I;
+ }
+ return 0;
+ }
+
+ /// getOrInsertVN - always returns a value number, creating it if necessary.
+ unsigned getOrInsertVN(Value *V, DomTreeDFS::Node *Subtree) {
+ if (unsigned n = valueNumber(V, Subtree))
+ return n;
+ else
+ return newVN(V);
+ }
+
+ /// newVN - creates a new value number. Value V must not already have a
+ /// value number assigned.
+ unsigned newVN(Value *V) {
+ assert((isa<Constant>(V) || isa<Argument>(V) || isa<Instruction>(V)) &&
+ "Bad Value for value numbering.");
+ assert(V->getType() != Type::VoidTy && "Won't value number a void value");
+
+ Values.push_back(V);
+
+ VNPair pair = VNPair(V, Values.size(), DTDFS->getRootNode());
+ VNMapType::iterator I = std::lower_bound(VNMap.begin(), VNMap.end(), pair);
+ assert((I == VNMap.end() || value(I->index) != V) &&
+ "Attempt to create a duplicate value number.");
+ VNMap.insert(I, pair);
+
+ return Values.size();
+ }
+
+ /// value - returns the Value associated with a value number.
+ Value *value(unsigned index) const {
+ assert(index != 0 && "Zero index is reserved for not found.");
+ assert(index <= Values.size() && "Index out of range.");
+ return Values[index-1];
+ }
+
+ /// canonicalize - return a Value that is equal to V under Subtree.
+ Value *canonicalize(Value *V, DomTreeDFS::Node *Subtree) {
+ if (isa<Constant>(V)) return V;
+
+ if (unsigned n = valueNumber(V, Subtree))
+ return value(n);
+ else
+ return V;
+ }
+
+ /// addEquality - adds that value V belongs to the set of equivalent
+ /// values defined by value number n under Subtree.
+ void addEquality(unsigned n, Value *V, DomTreeDFS::Node *Subtree) {
+ assert(canonicalize(value(n), Subtree) == value(n) &&
+ "Node's 'canonical' choice isn't best within this subtree.");
+
+ // Suppose that we are given "%x -> node #1 (%y)". The problem is that
+ // we may already have "%z -> node #2 (%x)" somewhere above us in the
+ // graph. We need to find those edges and add "%z -> node #1 (%y)"
+ // to keep the lookups canonical.
+
+ std::vector<Value *> ToRepoint(1, V);
+
+ if (unsigned Conflict = valueNumber(V, Subtree)) {
+ for (VNMapType::iterator I = VNMap.begin(), E = VNMap.end();
+ I != E; ++I) {
+ if (I->index == Conflict && I->Subtree->dominates(Subtree))
+ ToRepoint.push_back(I->V);
+ }
+ }
+
+ for (std::vector<Value *>::iterator VI = ToRepoint.begin(),
+ VE = ToRepoint.end(); VI != VE; ++VI) {
+ Value *V = *VI;
+
+ VNPair pair(V, n, Subtree);
+ VNMapType::iterator B = VNMap.begin(), E = VNMap.end();
+ VNMapType::iterator I = std::lower_bound(B, E, pair);
+ if (I != E && I->V == V && I->Subtree == Subtree)
+ I->index = n; // Update best choice
+ else
+ VNMap.insert(I, pair); // New Value
+
+ // XXX: we currently don't have to worry about updating values with
+ // more specific Subtrees, but we will need to for PHI node support.
+
+#ifndef NDEBUG
+ Value *V_n = value(n);
+ if (isa<Constant>(V) && isa<Constant>(V_n)) {
+ assert(V == V_n && "Constant equals different constant?");
+ }
+#endif
+ }
+ }
+
+ /// remove - removes all references to value V.
+ void remove(Value *V) {
+ VNMapType::iterator B = VNMap.begin(), E = VNMap.end();
+ VNPair pair(V, 0, DTDFS->getRootNode());
+ VNMapType::iterator J = std::upper_bound(B, E, pair);
+ VNMapType::iterator I = J;
+
+ while (I != B && (I == E || I->V == V)) --I;
+
+ VNMap.erase(I, J);
+ }
+ };
+
+ /// The InequalityGraph stores the relationships between values.
+ /// Each Value in the graph is assigned to a Node. Nodes are pointer
+ /// comparable for equality. The caller is expected to maintain the logical
+ /// consistency of the system.
+ ///
+ /// The InequalityGraph class may invalidate Node*s after any mutator call.
+ /// @brief The InequalityGraph stores the relationships between values.
+ class VISIBILITY_HIDDEN InequalityGraph {
+ ValueNumbering &VN;
+ DomTreeDFS::Node *TreeRoot;
+
+ InequalityGraph(); // DO NOT IMPLEMENT
+ InequalityGraph(InequalityGraph &); // DO NOT IMPLEMENT
+ public:
+ InequalityGraph(ValueNumbering &VN, DomTreeDFS::Node *TreeRoot)
+ : VN(VN), TreeRoot(TreeRoot) {}
+
+ class Node;
+
+    /// An Edge is contained inside a Node, making one end of the edge
+    /// implicit, and contains a pointer to the other end. The edge holds a
+    /// lattice value specifying the relationship and a DomTreeDFS::Node
+    /// specifying the root in the dominator tree to which this edge applies.
+ class VISIBILITY_HIDDEN Edge {
+ public:
+ Edge(unsigned T, LatticeVal V, DomTreeDFS::Node *ST)
+ : To(T), LV(V), Subtree(ST) {}
+
+ unsigned To;
+ LatticeVal LV;
+ DomTreeDFS::Node *Subtree;
+
+ bool operator<(const Edge &edge) const {
+ if (To != edge.To) return To < edge.To;
+ return *Subtree < *edge.Subtree;
+ }
+
+ bool operator<(unsigned to) const {
+ return To < to;
+ }
+
+ bool operator>(unsigned to) const {
+ return To > to;
+ }
+
+ friend bool operator<(unsigned to, const Edge &edge) {
+ return edge.operator>(to);
+ }
+ };
+
+ /// A single node in the InequalityGraph. This stores the canonical Value
+ /// for the node, as well as the relationships with the neighbours.
+ ///
+ /// @brief A single node in the InequalityGraph.
+ class VISIBILITY_HIDDEN Node {
+ friend class InequalityGraph;
+
+ typedef SmallVector<Edge, 4> RelationsType;
+ RelationsType Relations;
+
+ // TODO: can this idea improve performance?
+ //friend class std::vector<Node>;
+ //Node(Node &N) { RelationsType.swap(N.RelationsType); }
+
+ public:
+ typedef RelationsType::iterator iterator;
+ typedef RelationsType::const_iterator const_iterator;
+
+#ifndef NDEBUG
+ virtual ~Node() {}
+ virtual void dump() const {
+ dump(*cerr.stream());
+ }
+ private:
+ void dump(std::ostream &os) const {
+ static const std::string names[32] =
+ { "000000", "000001", "000002", "000003", "000004", "000005",
+ "000006", "000007", "000008", "000009", " >", " >=",
+ " s>u<", "s>=u<=", " s>", " s>=", "000016", "000017",
+ " s<u>", "s<=u>=", " <", " <=", " s<", " s<=",
+ "000024", "000025", " u>", " u>=", " u<", " u<=",
+ " !=", "000031" };
+ for (Node::const_iterator NI = begin(), NE = end(); NI != NE; ++NI) {
+ os << names[NI->LV] << " " << NI->To
+ << " (" << NI->Subtree->getDFSNumIn() << "), ";
+ }
+ }
+ public:
+#endif
+
+ iterator begin() { return Relations.begin(); }
+ iterator end() { return Relations.end(); }
+ const_iterator begin() const { return Relations.begin(); }
+ const_iterator end() const { return Relations.end(); }
+
+ iterator find(unsigned n, DomTreeDFS::Node *Subtree) {
+ iterator E = end();
+ for (iterator I = std::lower_bound(begin(), E, n);
+ I != E && I->To == n; ++I) {
+ if (Subtree->DominatedBy(I->Subtree))
+ return I;
+ }
+ return E;
+ }
+
+ const_iterator find(unsigned n, DomTreeDFS::Node *Subtree) const {
+ const_iterator E = end();
+ for (const_iterator I = std::lower_bound(begin(), E, n);
+ I != E && I->To == n; ++I) {
+ if (Subtree->DominatedBy(I->Subtree))
+ return I;
+ }
+ return E;
+ }
+
+ /// update - updates the lattice value for a given node, creating a new
+ /// entry if one doesn't exist. The new lattice value must not be
+ /// inconsistent with any previously existing value.
+ void update(unsigned n, LatticeVal R, DomTreeDFS::Node *Subtree) {
+ assert(validPredicate(R) && "Invalid predicate.");
+
+ Edge edge(n, R, Subtree);
+ iterator B = begin(), E = end();
+ iterator I = std::lower_bound(B, E, edge);
+
+ iterator J = I;
+ while (J != E && J->To == n) {
+ if (Subtree->DominatedBy(J->Subtree))
+ break;
+ ++J;
+ }
+
+ if (J != E && J->To == n) {
+ edge.LV = static_cast<LatticeVal>(J->LV & R);
+        assert(validPredicate(edge.LV) &&
+               "Invalid intersection of lattice values.");
+
+ if (edge.LV == J->LV)
+ return; // This update adds nothing new.
+ }
+
+ if (I != B) {
+ // We also have to tighten any edge beneath our update.
+ for (iterator K = I - 1; K->To == n; --K) {
+ if (K->Subtree->DominatedBy(Subtree)) {
+ LatticeVal LV = static_cast<LatticeVal>(K->LV & edge.LV);
+            assert(validPredicate(LV) &&
+                   "Invalid intersection of lattice values");
+ K->LV = LV;
+ }
+ if (K == B) break;
+ }
+ }
+
+ // Insert new edge at Subtree if it isn't already there.
+ if (I == E || I->To != n || Subtree != I->Subtree)
+ Relations.insert(I, edge);
+ }
+ };
+
+ private:
+
+ std::vector<Node> Nodes;
+
+ public:
+ /// node - returns the node object at a given value number. The pointer
+ /// returned may be invalidated on the next call to node().
+ Node *node(unsigned index) {
+ assert(VN.value(index)); // This triggers the necessary checks.
+ if (Nodes.size() < index) Nodes.resize(index);
+ return &Nodes[index-1];
+ }
+
+ /// isRelatedBy - true iff n1 op n2
+ bool isRelatedBy(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+ LatticeVal LV) {
+ if (n1 == n2) return LV & EQ_BIT;
+
+ Node *N1 = node(n1);
+ Node::iterator I = N1->find(n2, Subtree), E = N1->end();
+ if (I != E) return (I->LV & LV) == I->LV;
+
+ return false;
+ }
+
+ // The add* methods assume that your input is logically valid and may
+ // assertion-fail or infinitely loop if you attempt a contradiction.
+
+ /// addInequality - Sets n1 op n2.
+ /// It is also an error to call this on an inequality that is already true.
+ void addInequality(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+ LatticeVal LV1) {
+ assert(n1 != n2 && "A node can't be inequal to itself.");
+
+ if (LV1 != NE)
+ assert(!isRelatedBy(n1, n2, Subtree, reversePredicate(LV1)) &&
+ "Contradictory inequality.");
+
+ // Suppose we're adding %n1 < %n2. Find all the %a < %n1 and
+ // add %a < %n2 too. This keeps the graph fully connected.
+ if (LV1 != NE) {
+ // Break up the relationship into signed and unsigned comparison parts.
+ // If the signed parts of %a op1 %n1 match that of %n1 op2 %n2, and
+ // op1 and op2 aren't NE, then add %a op3 %n2. The new relationship
+ // should have the EQ_BIT iff it's set for both op1 and op2.
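+      //
+      // For illustration (not from the original source): if %a SLE %n1 is
+      // already known and we are adding %n1 SLT %n2, the signed halves agree,
+      // so %a SLT %n2 is recorded too (without EQ_BIT, which the new relation
+      // lacks).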
+
+ unsigned LV1_s = LV1 & (SLT_BIT|SGT_BIT);
+ unsigned LV1_u = LV1 & (ULT_BIT|UGT_BIT);
+
+ for (Node::iterator I = node(n1)->begin(), E = node(n1)->end(); I != E; ++I) {
+ if (I->LV != NE && I->To != n2) {
+
+ DomTreeDFS::Node *Local_Subtree = NULL;
+ if (Subtree->DominatedBy(I->Subtree))
+ Local_Subtree = Subtree;
+ else if (I->Subtree->DominatedBy(Subtree))
+ Local_Subtree = I->Subtree;
+
+ if (Local_Subtree) {
+ unsigned new_relationship = 0;
+ LatticeVal ILV = reversePredicate(I->LV);
+ unsigned ILV_s = ILV & (SLT_BIT|SGT_BIT);
+ unsigned ILV_u = ILV & (ULT_BIT|UGT_BIT);
+
+ if (LV1_s != (SLT_BIT|SGT_BIT) && ILV_s == LV1_s)
+ new_relationship |= ILV_s;
+ if (LV1_u != (ULT_BIT|UGT_BIT) && ILV_u == LV1_u)
+ new_relationship |= ILV_u;
+
+ if (new_relationship) {
+ if ((new_relationship & (SLT_BIT|SGT_BIT)) == 0)
+ new_relationship |= (SLT_BIT|SGT_BIT);
+ if ((new_relationship & (ULT_BIT|UGT_BIT)) == 0)
+ new_relationship |= (ULT_BIT|UGT_BIT);
+ if ((LV1 & EQ_BIT) && (ILV & EQ_BIT))
+ new_relationship |= EQ_BIT;
+
+ LatticeVal NewLV = static_cast<LatticeVal>(new_relationship);
+
+ node(I->To)->update(n2, NewLV, Local_Subtree);
+ node(n2)->update(I->To, reversePredicate(NewLV), Local_Subtree);
+ }
+ }
+ }
+ }
+
+ for (Node::iterator I = node(n2)->begin(), E = node(n2)->end(); I != E; ++I) {
+ if (I->LV != NE && I->To != n1) {
+ DomTreeDFS::Node *Local_Subtree = NULL;
+ if (Subtree->DominatedBy(I->Subtree))
+ Local_Subtree = Subtree;
+ else if (I->Subtree->DominatedBy(Subtree))
+ Local_Subtree = I->Subtree;
+
+ if (Local_Subtree) {
+ unsigned new_relationship = 0;
+ unsigned ILV_s = I->LV & (SLT_BIT|SGT_BIT);
+ unsigned ILV_u = I->LV & (ULT_BIT|UGT_BIT);
+
+ if (LV1_s != (SLT_BIT|SGT_BIT) && ILV_s == LV1_s)
+ new_relationship |= ILV_s;
+
+ if (LV1_u != (ULT_BIT|UGT_BIT) && ILV_u == LV1_u)
+ new_relationship |= ILV_u;
+
+ if (new_relationship) {
+ if ((new_relationship & (SLT_BIT|SGT_BIT)) == 0)
+ new_relationship |= (SLT_BIT|SGT_BIT);
+ if ((new_relationship & (ULT_BIT|UGT_BIT)) == 0)
+ new_relationship |= (ULT_BIT|UGT_BIT);
+ if ((LV1 & EQ_BIT) && (I->LV & EQ_BIT))
+ new_relationship |= EQ_BIT;
+
+ LatticeVal NewLV = static_cast<LatticeVal>(new_relationship);
+
+ node(n1)->update(I->To, NewLV, Local_Subtree);
+ node(I->To)->update(n1, reversePredicate(NewLV), Local_Subtree);
+ }
+ }
+ }
+ }
+ }
+
+ node(n1)->update(n2, LV1, Subtree);
+ node(n2)->update(n1, reversePredicate(LV1), Subtree);
+ }
+
+ /// remove - removes a node from the graph by removing all references to
+ /// and from it.
+ void remove(unsigned n) {
+ Node *N = node(n);
+ for (Node::iterator NI = N->begin(), NE = N->end(); NI != NE; ++NI) {
+ Node::iterator Iter = node(NI->To)->find(n, TreeRoot);
+ do {
+ node(NI->To)->Relations.erase(Iter);
+ Iter = node(NI->To)->find(n, TreeRoot);
+ } while (Iter != node(NI->To)->end());
+ }
+ N->Relations.clear();
+ }
+
+#ifndef NDEBUG
+ virtual ~InequalityGraph() {}
+ virtual void dump() {
+ dump(*cerr.stream());
+ }
+
+ void dump(std::ostream &os) {
+ for (unsigned i = 1; i <= Nodes.size(); ++i) {
+ os << i << " = {";
+ node(i)->dump(os);
+ os << "}\n";
+ }
+ }
+#endif
+ };
+
+ class VRPSolver;
+
+ /// ValueRanges tracks the known integer ranges and anti-ranges of the nodes
+ /// in the InequalityGraph.
+ class VISIBILITY_HIDDEN ValueRanges {
+ ValueNumbering &VN;
+ TargetData *TD;
+
+ class VISIBILITY_HIDDEN ScopedRange {
+ typedef std::vector<std::pair<DomTreeDFS::Node *, ConstantRange> >
+ RangeListType;
+ RangeListType RangeList;
+
+ static bool swo(const std::pair<DomTreeDFS::Node *, ConstantRange> &LHS,
+ const std::pair<DomTreeDFS::Node *, ConstantRange> &RHS) {
+ return *LHS.first < *RHS.first;
+ }
+
+ public:
+#ifndef NDEBUG
+ virtual ~ScopedRange() {}
+ virtual void dump() const {
+ dump(*cerr.stream());
+ }
+
+ void dump(std::ostream &os) const {
+ os << "{";
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ os << &I->second << " (" << I->first->getDFSNumIn() << "), ";
+ }
+ os << "}";
+ }
+#endif
+
+ typedef RangeListType::iterator iterator;
+ typedef RangeListType::const_iterator const_iterator;
+
+ iterator begin() { return RangeList.begin(); }
+ iterator end() { return RangeList.end(); }
+ const_iterator begin() const { return RangeList.begin(); }
+ const_iterator end() const { return RangeList.end(); }
+
+ iterator find(DomTreeDFS::Node *Subtree) {
+ static ConstantRange empty(1, false);
+ iterator E = end();
+ iterator I = std::lower_bound(begin(), E,
+ std::make_pair(Subtree, empty), swo);
+
+ while (I != E && !I->first->dominates(Subtree)) ++I;
+ return I;
+ }
+
+ const_iterator find(DomTreeDFS::Node *Subtree) const {
+ static const ConstantRange empty(1, false);
+ const_iterator E = end();
+ const_iterator I = std::lower_bound(begin(), E,
+ std::make_pair(Subtree, empty), swo);
+
+ while (I != E && !I->first->dominates(Subtree)) ++I;
+ return I;
+ }
+
+ void update(const ConstantRange &CR, DomTreeDFS::Node *Subtree) {
+ assert(!CR.isEmptySet() && "Empty ConstantRange.");
+ assert(!CR.isSingleElement() && "Refusing to store single element.");
+
+ static ConstantRange empty(1, false);
+ iterator E = end();
+ iterator I =
+ std::lower_bound(begin(), E, std::make_pair(Subtree, empty), swo);
+
+ if (I != end() && I->first == Subtree) {
+ ConstantRange CR2 = I->second.maximalIntersectWith(CR);
+          assert(!CR2.isEmptySet() && !CR2.isSingleElement() &&
+                 "Invalid intersection of ranges.");
+ I->second = CR2;
+ } else
+ RangeList.insert(I, std::make_pair(Subtree, CR));
+ }
+ };
+
+ std::vector<ScopedRange> Ranges;
+
+ void update(unsigned n, const ConstantRange &CR, DomTreeDFS::Node *Subtree){
+ if (CR.isFullSet()) return;
+ if (Ranges.size() < n) Ranges.resize(n);
+ Ranges[n-1].update(CR, Subtree);
+ }
+
+ /// create - Creates a ConstantRange that matches the given LatticeVal
+ /// relation with a given integer.
+ ConstantRange create(LatticeVal LV, const ConstantRange &CR) {
+ assert(!CR.isEmptySet() && "Can't deal with empty set.");
+
+ if (LV == NE)
+ return makeConstantRange(ICmpInst::ICMP_NE, CR);
+
+ unsigned LV_s = LV & (SGT_BIT|SLT_BIT);
+ unsigned LV_u = LV & (UGT_BIT|ULT_BIT);
+ bool hasEQ = LV & EQ_BIT;
+
+ ConstantRange Range(CR.getBitWidth());
+
+ if (LV_s == SGT_BIT) {
+ Range = Range.maximalIntersectWith(makeConstantRange(
+ hasEQ ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_SGT, CR));
+ } else if (LV_s == SLT_BIT) {
+ Range = Range.maximalIntersectWith(makeConstantRange(
+ hasEQ ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_SLT, CR));
+ }
+
+ if (LV_u == UGT_BIT) {
+ Range = Range.maximalIntersectWith(makeConstantRange(
+ hasEQ ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_UGT, CR));
+ } else if (LV_u == ULT_BIT) {
+ Range = Range.maximalIntersectWith(makeConstantRange(
+ hasEQ ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT, CR));
+ }
+
+ return Range;
+ }
+
+ /// makeConstantRange - Creates a ConstantRange representing the set of all
+    /// values that match the ICmpInst::Predicate with any of the values in CR.
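+    /// For illustration (not from the original source): for i8 and ICMP_ULT
+    /// with CR = [5, 10), the result is [0, 9), i.e. every value u< 9, the
+    /// largest element of CR.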
+ ConstantRange makeConstantRange(ICmpInst::Predicate ICmpOpcode,
+ const ConstantRange &CR) {
+ uint32_t W = CR.getBitWidth();
+ switch (ICmpOpcode) {
+ default: assert(!"Invalid ICmp opcode to makeConstantRange()");
+ case ICmpInst::ICMP_EQ:
+ return ConstantRange(CR.getLower(), CR.getUpper());
+ case ICmpInst::ICMP_NE:
+ if (CR.isSingleElement())
+ return ConstantRange(CR.getUpper(), CR.getLower());
+ return ConstantRange(W);
+ case ICmpInst::ICMP_ULT:
+ return ConstantRange(APInt::getMinValue(W), CR.getUnsignedMax());
+ case ICmpInst::ICMP_SLT:
+ return ConstantRange(APInt::getSignedMinValue(W), CR.getSignedMax());
+ case ICmpInst::ICMP_ULE: {
+ APInt UMax(CR.getUnsignedMax());
+ if (UMax.isMaxValue())
+ return ConstantRange(W);
+ return ConstantRange(APInt::getMinValue(W), UMax + 1);
+ }
+ case ICmpInst::ICMP_SLE: {
+ APInt SMax(CR.getSignedMax());
+ if (SMax.isMaxSignedValue() || (SMax+1).isMaxSignedValue())
+ return ConstantRange(W);
+ return ConstantRange(APInt::getSignedMinValue(W), SMax + 1);
+ }
+ case ICmpInst::ICMP_UGT:
+ return ConstantRange(CR.getUnsignedMin() + 1, APInt::getNullValue(W));
+ case ICmpInst::ICMP_SGT:
+ return ConstantRange(CR.getSignedMin() + 1,
+ APInt::getSignedMinValue(W));
+ case ICmpInst::ICMP_UGE: {
+ APInt UMin(CR.getUnsignedMin());
+ if (UMin.isMinValue())
+ return ConstantRange(W);
+ return ConstantRange(UMin, APInt::getNullValue(W));
+ }
+ case ICmpInst::ICMP_SGE: {
+ APInt SMin(CR.getSignedMin());
+ if (SMin.isMinSignedValue())
+ return ConstantRange(W);
+ return ConstantRange(SMin, APInt::getSignedMinValue(W));
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ bool isCanonical(Value *V, DomTreeDFS::Node *Subtree) {
+ return V == VN.canonicalize(V, Subtree);
+ }
+#endif
+
+ public:
+
+ ValueRanges(ValueNumbering &VN, TargetData *TD) : VN(VN), TD(TD) {}
+
+#ifndef NDEBUG
+ virtual ~ValueRanges() {}
+
+ virtual void dump() const {
+ dump(*cerr.stream());
+ }
+
+ void dump(std::ostream &os) const {
+ for (unsigned i = 0, e = Ranges.size(); i != e; ++i) {
+ os << (i+1) << " = ";
+ Ranges[i].dump(os);
+ os << "\n";
+ }
+ }
+#endif
+
+ /// range - looks up the ConstantRange associated with a value number.
+ ConstantRange range(unsigned n, DomTreeDFS::Node *Subtree) {
+ assert(VN.value(n)); // performs range checks
+
+ if (n <= Ranges.size()) {
+ ScopedRange::iterator I = Ranges[n-1].find(Subtree);
+ if (I != Ranges[n-1].end()) return I->second;
+ }
+
+      return range(VN.value(n));
+ }
+
+ /// range - determine a range from a Value without performing any lookups.
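+    /// A ConstantInt or a null pointer yields a singleton range; any other
+    /// Value yields the full set for its type's width.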
+ ConstantRange range(Value *V) const {
+ if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+ return ConstantRange(C->getValue());
+ else if (isa<ConstantPointerNull>(V))
+ return ConstantRange(APInt::getNullValue(typeToWidth(V->getType())));
+ else
+ return ConstantRange(typeToWidth(V->getType()));
+ }
+
+    /// typeToWidth - returns the number of bits necessary to store a value
+    /// of this type, or zero if unknown.
+ uint32_t typeToWidth(const Type *Ty) const {
+ if (TD)
+ return TD->getTypeSizeInBits(Ty);
+ else
+ return Ty->getPrimitiveSizeInBits();
+ }
+
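+    /// isRelatedBy - true iff every value in CR1 is related to every value
+    /// in CR2 by LV. For example, with CR1 = [0, 5) and CR2 = [10, 20), ULT
+    /// holds because umax(CR1) = 4 is u< 10 = umin(CR2).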
+ static bool isRelatedBy(const ConstantRange &CR1, const ConstantRange &CR2,
+ LatticeVal LV) {
+ switch (LV) {
+ default: assert(!"Impossible lattice value!");
+ case NE:
+ return CR1.maximalIntersectWith(CR2).isEmptySet();
+ case ULT:
+ return CR1.getUnsignedMax().ult(CR2.getUnsignedMin());
+ case ULE:
+ return CR1.getUnsignedMax().ule(CR2.getUnsignedMin());
+ case UGT:
+ return CR1.getUnsignedMin().ugt(CR2.getUnsignedMax());
+ case UGE:
+ return CR1.getUnsignedMin().uge(CR2.getUnsignedMax());
+ case SLT:
+ return CR1.getSignedMax().slt(CR2.getSignedMin());
+ case SLE:
+ return CR1.getSignedMax().sle(CR2.getSignedMin());
+ case SGT:
+ return CR1.getSignedMin().sgt(CR2.getSignedMax());
+ case SGE:
+ return CR1.getSignedMin().sge(CR2.getSignedMax());
+      case LT:
+        return CR1.getUnsignedMax().ult(CR2.getUnsignedMin()) &&
+               CR1.getSignedMax().slt(CR2.getSignedMin());
+      case LE:
+        return CR1.getUnsignedMax().ule(CR2.getUnsignedMin()) &&
+               CR1.getSignedMax().sle(CR2.getSignedMin());
+ case GT:
+ return CR1.getUnsignedMin().ugt(CR2.getUnsignedMax()) &&
+ CR1.getSignedMin().sgt(CR2.getSignedMax());
+ case GE:
+ return CR1.getUnsignedMin().uge(CR2.getUnsignedMax()) &&
+ CR1.getSignedMin().sge(CR2.getSignedMax());
+ case SLTUGT:
+ return CR1.getSignedMax().slt(CR2.getSignedMin()) &&
+ CR1.getUnsignedMin().ugt(CR2.getUnsignedMax());
+ case SLEUGE:
+ return CR1.getSignedMax().sle(CR2.getSignedMin()) &&
+ CR1.getUnsignedMin().uge(CR2.getUnsignedMax());
+ case SGTULT:
+ return CR1.getSignedMin().sgt(CR2.getSignedMax()) &&
+ CR1.getUnsignedMax().ult(CR2.getUnsignedMin());
+ case SGEULE:
+ return CR1.getSignedMin().sge(CR2.getSignedMax()) &&
+ CR1.getUnsignedMax().ule(CR2.getUnsignedMin());
+ }
+ }
+
+ bool isRelatedBy(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+ LatticeVal LV) {
+ ConstantRange CR1 = range(n1, Subtree);
+ ConstantRange CR2 = range(n2, Subtree);
+
+ // True iff all values in CR1 are LV to all values in CR2.
+ return isRelatedBy(CR1, CR2, LV);
+ }
+
+ void addToWorklist(Value *V, Constant *C, ICmpInst::Predicate Pred,
+ VRPSolver *VRP);
+ void markBlock(VRPSolver *VRP);
+
+ void mergeInto(Value **I, unsigned n, unsigned New,
+ DomTreeDFS::Node *Subtree, VRPSolver *VRP) {
+ ConstantRange CR_New = range(New, Subtree);
+ ConstantRange Merged = CR_New;
+
+ for (; n != 0; ++I, --n) {
+ unsigned i = VN.valueNumber(*I, Subtree);
+ ConstantRange CR_Kill = i ? range(i, Subtree) : range(*I);
+ if (CR_Kill.isFullSet()) continue;
+ Merged = Merged.maximalIntersectWith(CR_Kill);
+ }
+
+ if (Merged.isFullSet() || Merged == CR_New) return;
+
+ applyRange(New, Merged, Subtree, VRP);
+ }
+
+ void applyRange(unsigned n, const ConstantRange &CR,
+ DomTreeDFS::Node *Subtree, VRPSolver *VRP) {
+ ConstantRange Merged = CR.maximalIntersectWith(range(n, Subtree));
+ if (Merged.isEmptySet()) {
+ markBlock(VRP);
+ return;
+ }
+
+ if (const APInt *I = Merged.getSingleElement()) {
+ Value *V = VN.value(n); // XXX: redesign worklist.
+ const Type *Ty = V->getType();
+ if (Ty->isInteger()) {
+ addToWorklist(V, ConstantInt::get(*I), ICmpInst::ICMP_EQ, VRP);
+ return;
+ } else if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) {
+ assert(*I == 0 && "Pointer is null but not zero?");
+ addToWorklist(V, ConstantPointerNull::get(PTy),
+ ICmpInst::ICMP_EQ, VRP);
+ return;
+ }
+ }
+
+ update(n, Merged, Subtree);
+ }
+
+ void addNotEquals(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+ VRPSolver *VRP) {
+ ConstantRange CR1 = range(n1, Subtree);
+ ConstantRange CR2 = range(n2, Subtree);
+
+ uint32_t W = CR1.getBitWidth();
+
+ if (const APInt *I = CR1.getSingleElement()) {
+ if (CR2.isFullSet()) {
+ ConstantRange NewCR2(CR1.getUpper(), CR1.getLower());
+ applyRange(n2, NewCR2, Subtree, VRP);
+ } else if (*I == CR2.getLower()) {
+ APInt NewLower(CR2.getLower() + 1),
+ NewUpper(CR2.getUpper());
+ if (NewLower == NewUpper)
+ NewLower = NewUpper = APInt::getMinValue(W);
+
+ ConstantRange NewCR2(NewLower, NewUpper);
+ applyRange(n2, NewCR2, Subtree, VRP);
+ } else if (*I == CR2.getUpper() - 1) {
+ APInt NewLower(CR2.getLower()),
+ NewUpper(CR2.getUpper() - 1);
+ if (NewLower == NewUpper)
+ NewLower = NewUpper = APInt::getMinValue(W);
+
+ ConstantRange NewCR2(NewLower, NewUpper);
+ applyRange(n2, NewCR2, Subtree, VRP);
+ }
+ }
+
+ if (const APInt *I = CR2.getSingleElement()) {
+ if (CR1.isFullSet()) {
+ ConstantRange NewCR1(CR2.getUpper(), CR2.getLower());
+ applyRange(n1, NewCR1, Subtree, VRP);
+ } else if (*I == CR1.getLower()) {
+ APInt NewLower(CR1.getLower() + 1),
+ NewUpper(CR1.getUpper());
+ if (NewLower == NewUpper)
+ NewLower = NewUpper = APInt::getMinValue(W);
+
+ ConstantRange NewCR1(NewLower, NewUpper);
+ applyRange(n1, NewCR1, Subtree, VRP);
+ } else if (*I == CR1.getUpper() - 1) {
+ APInt NewLower(CR1.getLower()),
+ NewUpper(CR1.getUpper() - 1);
+ if (NewLower == NewUpper)
+ NewLower = NewUpper = APInt::getMinValue(W);
+
+ ConstantRange NewCR1(NewLower, NewUpper);
+ applyRange(n1, NewCR1, Subtree, VRP);
+ }
+ }
+ }
+
+ void addInequality(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+ LatticeVal LV, VRPSolver *VRP) {
+ assert(!isRelatedBy(n1, n2, Subtree, LV) && "Asked to do useless work.");
+
+ if (LV == NE) {
+ addNotEquals(n1, n2, Subtree, VRP);
+ return;
+ }
+
+ ConstantRange CR1 = range(n1, Subtree);
+ ConstantRange CR2 = range(n2, Subtree);
+
+ if (!CR1.isSingleElement()) {
+ ConstantRange NewCR1 = CR1.maximalIntersectWith(create(LV, CR2));
+ if (NewCR1 != CR1)
+ applyRange(n1, NewCR1, Subtree, VRP);
+ }
+
+ if (!CR2.isSingleElement()) {
+ ConstantRange NewCR2 = CR2.maximalIntersectWith(
+ create(reversePredicate(LV), CR1));
+ if (NewCR2 != CR2)
+ applyRange(n2, NewCR2, Subtree, VRP);
+ }
+ }
+ };
+
+  /// UnreachableBlocks keeps track of blocks that are for one reason or
+  /// another discovered to be unreachable. This is used to cull the graph
+  /// when analyzing instructions, and to mark blocks with the "unreachable"
+  /// terminator instruction after the pass has finished running.
+ class VISIBILITY_HIDDEN UnreachableBlocks {
+ private:
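+    // Kept sorted by pointer value so that mark() and isDead() can use
+    // std::lower_bound for logarithmic membership tests.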
+ std::vector<BasicBlock *> DeadBlocks;
+
+ public:
+ /// mark - mark a block as dead
+ void mark(BasicBlock *BB) {
+ std::vector<BasicBlock *>::iterator E = DeadBlocks.end();
+ std::vector<BasicBlock *>::iterator I =
+ std::lower_bound(DeadBlocks.begin(), E, BB);
+
+ if (I == E || *I != BB) DeadBlocks.insert(I, BB);
+ }
+
+ /// isDead - returns whether a block is known to be dead already
+ bool isDead(BasicBlock *BB) {
+ std::vector<BasicBlock *>::iterator E = DeadBlocks.end();
+ std::vector<BasicBlock *>::iterator I =
+ std::lower_bound(DeadBlocks.begin(), E, BB);
+
+ return I != E && *I == BB;
+ }
+
+ /// kill - replace the dead blocks' terminator with an UnreachableInst.
+ bool kill() {
+ bool modified = false;
+ for (std::vector<BasicBlock *>::iterator I = DeadBlocks.begin(),
+ E = DeadBlocks.end(); I != E; ++I) {
+ BasicBlock *BB = *I;
+
+ DOUT << "unreachable block: " << BB->getName() << "\n";
+
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+ SI != SE; ++SI) {
+ BasicBlock *Succ = *SI;
+ Succ->removePredecessor(BB);
+ }
+
+ TerminatorInst *TI = BB->getTerminator();
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ TI->eraseFromParent();
+ new UnreachableInst(BB);
+ ++NumBlocks;
+ modified = true;
+ }
+ DeadBlocks.clear();
+ return modified;
+ }
+ };
+
+ /// VRPSolver keeps track of how changes to one variable affect other
+ /// variables, and forwards changes along to the InequalityGraph. It
+ /// also maintains the correct choice for "canonical" in the IG.
+ /// @brief VRPSolver calculates inferences from a new relationship.
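+  /// Typical use: construct a VRPSolver over a context (a basic block or
+  /// instruction), add() one or more relationships, then solve() to
+  /// propagate their consequences.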
+ class VISIBILITY_HIDDEN VRPSolver {
+ private:
+ friend class ValueRanges;
+
+ struct Operation {
+ Value *LHS, *RHS;
+ ICmpInst::Predicate Op;
+
+ BasicBlock *ContextBB; // XXX use a DomTreeDFS::Node instead
+ Instruction *ContextInst;
+ };
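+    // Pending facts, processed FIFO: solve() pops from the front while add()
+    // pushes to the back, so consequences are explored breadth-first.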
+ std::deque<Operation> WorkList;
+
+ ValueNumbering &VN;
+ InequalityGraph &IG;
+ UnreachableBlocks &UB;
+ ValueRanges &VR;
+ DomTreeDFS *DTDFS;
+ DomTreeDFS::Node *Top;
+ BasicBlock *TopBB;
+ Instruction *TopInst;
+ bool &modified;
+
+ typedef InequalityGraph::Node Node;
+
+ // below - true if the Instruction is dominated by the current context
+ // block or instruction
+ bool below(Instruction *I) {
+ BasicBlock *BB = I->getParent();
+ if (TopInst && TopInst->getParent() == BB) {
+ if (isa<TerminatorInst>(TopInst)) return false;
+ if (isa<TerminatorInst>(I)) return true;
+ if ( isa<PHINode>(TopInst) && !isa<PHINode>(I)) return true;
+ if (!isa<PHINode>(TopInst) && isa<PHINode>(I)) return false;
+
+ for (BasicBlock::const_iterator Iter = BB->begin(), E = BB->end();
+ Iter != E; ++Iter) {
+ if (&*Iter == TopInst) return true;
+ else if (&*Iter == I) return false;
+ }
+ assert(!"Instructions not found in parent BasicBlock?");
+ } else {
+ DomTreeDFS::Node *Node = DTDFS->getNodeForBlock(BB);
+ if (!Node) return false;
+ return Top->dominates(Node);
+ }
+ return false; // Not reached
+ }
+
+ // aboveOrBelow - true if the Instruction either dominates or is dominated
+ // by the current context block or instruction
+ bool aboveOrBelow(Instruction *I) {
+ BasicBlock *BB = I->getParent();
+ DomTreeDFS::Node *Node = DTDFS->getNodeForBlock(BB);
+ if (!Node) return false;
+
+ return Top == Node || Top->dominates(Node) || Node->dominates(Top);
+ }
+
+ bool makeEqual(Value *V1, Value *V2) {
+ DOUT << "makeEqual(" << *V1 << ", " << *V2 << ")\n";
+ DOUT << "context is ";
+ if (TopInst) DOUT << "I: " << *TopInst << "\n";
+ else DOUT << "BB: " << TopBB->getName()
+ << "(" << Top->getDFSNumIn() << ")\n";
+
+ assert(V1->getType() == V2->getType() &&
+ "Can't make two values with different types equal.");
+
+ if (V1 == V2) return true;
+
+ if (isa<Constant>(V1) && isa<Constant>(V2))
+ return false;
+
+ unsigned n1 = VN.valueNumber(V1, Top), n2 = VN.valueNumber(V2, Top);
+
+ if (n1 && n2) {
+ if (n1 == n2) return true;
+ if (IG.isRelatedBy(n1, n2, Top, NE)) return false;
+ }
+
+ if (n1) assert(V1 == VN.value(n1) && "Value isn't canonical.");
+ if (n2) assert(V2 == VN.value(n2) && "Value isn't canonical.");
+
+ assert(!VN.compare(V2, V1) && "Please order parameters to makeEqual.");
+
+ assert(!isa<Constant>(V2) && "Tried to remove a constant.");
+
+ SetVector<unsigned> Remove;
+ if (n2) Remove.insert(n2);
+
+ if (n1 && n2) {
+ // Suppose we're being told that %x == %y, and %x <= %z and %y >= %z.
+ // We can't just merge %x and %y because the relationship with %z would
+ // be EQ and that's invalid. What we're doing is looking for any nodes
+ // %z such that %x <= %z and %y >= %z, and vice versa.
+
+ Node::iterator end = IG.node(n2)->end();
+
+ // Find the intersection between N1 and N2 which is dominated by
+ // Top. If we find %x where N1 <= %x <= N2 (or >=) then add %x to
+ // Remove.
+ for (Node::iterator I = IG.node(n1)->begin(), E = IG.node(n1)->end();
+ I != E; ++I) {
+ if (!(I->LV & EQ_BIT) || !Top->DominatedBy(I->Subtree)) continue;
+
+ unsigned ILV_s = I->LV & (SLT_BIT|SGT_BIT);
+ unsigned ILV_u = I->LV & (ULT_BIT|UGT_BIT);
+ Node::iterator NI = IG.node(n2)->find(I->To, Top);
+ if (NI != end) {
+ LatticeVal NILV = reversePredicate(NI->LV);
+ unsigned NILV_s = NILV & (SLT_BIT|SGT_BIT);
+ unsigned NILV_u = NILV & (ULT_BIT|UGT_BIT);
+
+ if ((ILV_s != (SLT_BIT|SGT_BIT) && ILV_s == NILV_s) ||
+ (ILV_u != (ULT_BIT|UGT_BIT) && ILV_u == NILV_u))
+ Remove.insert(I->To);
+ }
+ }
+
+ // See if one of the nodes about to be removed is actually a better
+ // canonical choice than n1.
+ unsigned orig_n1 = n1;
+ SetVector<unsigned>::iterator DontRemove = Remove.end();
+ for (SetVector<unsigned>::iterator I = Remove.begin()+1 /* skip n2 */,
+ E = Remove.end(); I != E; ++I) {
+ unsigned n = *I;
+ Value *V = VN.value(n);
+ if (VN.compare(V, V1)) {
+ V1 = V;
+ n1 = n;
+ DontRemove = I;
+ }
+ }
+ if (DontRemove != Remove.end()) {
+ unsigned n = *DontRemove;
+ Remove.remove(n);
+ Remove.insert(orig_n1);
+ }
+ }
+
+ // We'd like to allow makeEqual on two values to perform a simple
+ // substitution without creating nodes in the IG whenever possible.
+ //
+ // The first iteration through this loop operates on V2 before going
+ // through the Remove list and operating on those too. If all of the
+ // iterations performed simple replacements then we exit early.
+ bool mergeIGNode = false;
+ unsigned i = 0;
+ for (Value *R = V2; i == 0 || i < Remove.size(); ++i) {
+ if (i) R = VN.value(Remove[i]); // skip n2.
+
+ // Try to replace the whole instruction. If we can, we're done.
+ Instruction *I2 = dyn_cast<Instruction>(R);
+ if (I2 && below(I2)) {
+ std::vector<Instruction *> ToNotify;
+ for (Value::use_iterator UI = R->use_begin(), UE = R->use_end();
+ UI != UE;) {
+ Use &TheUse = UI.getUse();
+ ++UI;
+ if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser()))
+ ToNotify.push_back(I);
+ }
+
+ DOUT << "Simply removing " << *I2
+ << ", replacing with " << *V1 << "\n";
+ I2->replaceAllUsesWith(V1);
+ // leave it dead; it'll get erased later.
+ ++NumInstruction;
+ modified = true;
+
+ for (std::vector<Instruction *>::iterator II = ToNotify.begin(),
+ IE = ToNotify.end(); II != IE; ++II) {
+ opsToDef(*II);
+ }
+
+ continue;
+ }
+
+ // Otherwise, replace all dominated uses.
+ for (Value::use_iterator UI = R->use_begin(), UE = R->use_end();
+ UI != UE;) {
+ Use &TheUse = UI.getUse();
+ ++UI;
+ if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+ if (below(I)) {
+ TheUse.set(V1);
+ modified = true;
+ ++NumVarsReplaced;
+ opsToDef(I);
+ }
+ }
+ }
+
+ // If that killed the instruction, stop here.
+ if (I2 && isInstructionTriviallyDead(I2)) {
+ DOUT << "Killed all uses of " << *I2
+ << ", replacing with " << *V1 << "\n";
+ continue;
+ }
+
+ // If we make it to here, then we will need to create a node for N1.
+ // Otherwise, we can skip out early!
+ mergeIGNode = true;
+ }
+
+ if (!isa<Constant>(V1)) {
+ if (Remove.empty()) {
+ VR.mergeInto(&V2, 1, VN.getOrInsertVN(V1, Top), Top, this);
+ } else {
+ std::vector<Value*> RemoveVals;
+ RemoveVals.reserve(Remove.size());
+
+ for (SetVector<unsigned>::iterator I = Remove.begin(),
+ E = Remove.end(); I != E; ++I) {
+ Value *V = VN.value(*I);
+ if (!V->use_empty())
+ RemoveVals.push_back(V);
+ }
+ VR.mergeInto(&RemoveVals[0], RemoveVals.size(),
+ VN.getOrInsertVN(V1, Top), Top, this);
+ }
+ }
+
+ if (mergeIGNode) {
+ // Create N1.
+ if (!n1) n1 = VN.getOrInsertVN(V1, Top);
+ IG.node(n1); // Ensure that IG.Nodes won't get resized
+
+ // Migrate relationships from removed nodes to N1.
+ for (SetVector<unsigned>::iterator I = Remove.begin(), E = Remove.end();
+ I != E; ++I) {
+ unsigned n = *I;
+ for (Node::iterator NI = IG.node(n)->begin(), NE = IG.node(n)->end();
+ NI != NE; ++NI) {
+ if (NI->Subtree->DominatedBy(Top)) {
+ if (NI->To == n1) {
+ assert((NI->LV & EQ_BIT) && "Node inequal to itself.");
+ continue;
+ }
+ if (Remove.count(NI->To))
+ continue;
+
+ IG.node(NI->To)->update(n1, reversePredicate(NI->LV), Top);
+ IG.node(n1)->update(NI->To, NI->LV, Top);
+ }
+ }
+ }
+
+ // Point V2 (and all items in Remove) to N1.
+ if (!n2)
+ VN.addEquality(n1, V2, Top);
+ else {
+ for (SetVector<unsigned>::iterator I = Remove.begin(),
+ E = Remove.end(); I != E; ++I) {
+ VN.addEquality(n1, VN.value(*I), Top);
+ }
+ }
+
+        // If !Remove.empty() then V2 == VN.value(Remove[0]).
+        // Even when Remove is empty, we still want to process V2.
+ i = 0;
+ for (Value *R = V2; i == 0 || i < Remove.size(); ++i) {
+ if (i) R = VN.value(Remove[i]); // skip n2.
+
+ if (Instruction *I2 = dyn_cast<Instruction>(R)) {
+ if (aboveOrBelow(I2))
+ defToOps(I2);
+ }
+          for (Value::use_iterator UI = R->use_begin(), UE = R->use_end();
+ UI != UE;) {
+ Use &TheUse = UI.getUse();
+ ++UI;
+ if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+ if (aboveOrBelow(I))
+ opsToDef(I);
+ }
+ }
+ }
+ }
+
+ // re-opsToDef all dominated users of V1.
+ if (Instruction *I = dyn_cast<Instruction>(V1)) {
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE;) {
+ Use &TheUse = UI.getUse();
+ ++UI;
+ Value *V = TheUse.getUser();
+ if (!V->use_empty()) {
+ if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+ if (aboveOrBelow(Inst))
+ opsToDef(Inst);
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+
+    /// cmpInstToLattice - converts an ICmpInst::Predicate to a lattice
+    /// value. Requires that the predicate have a matching lattice value;
+    /// does not accept ICMP_EQ.
+ static LatticeVal cmpInstToLattice(ICmpInst::Predicate Pred) {
+ switch (Pred) {
+ case ICmpInst::ICMP_EQ:
+ assert(!"No matching lattice value.");
+ return static_cast<LatticeVal>(EQ_BIT);
+ default:
+ assert(!"Invalid 'icmp' predicate.");
+ case ICmpInst::ICMP_NE:
+ return NE;
+ case ICmpInst::ICMP_UGT:
+ return UGT;
+ case ICmpInst::ICMP_UGE:
+ return UGE;
+ case ICmpInst::ICMP_ULT:
+ return ULT;
+ case ICmpInst::ICMP_ULE:
+ return ULE;
+ case ICmpInst::ICMP_SGT:
+ return SGT;
+ case ICmpInst::ICMP_SGE:
+ return SGE;
+ case ICmpInst::ICMP_SLT:
+ return SLT;
+ case ICmpInst::ICMP_SLE:
+ return SLE;
+ }
+ }
+
+ public:
+ VRPSolver(ValueNumbering &VN, InequalityGraph &IG, UnreachableBlocks &UB,
+ ValueRanges &VR, DomTreeDFS *DTDFS, bool &modified,
+ BasicBlock *TopBB)
+ : VN(VN),
+ IG(IG),
+ UB(UB),
+ VR(VR),
+ DTDFS(DTDFS),
+ Top(DTDFS->getNodeForBlock(TopBB)),
+ TopBB(TopBB),
+ TopInst(NULL),
+ modified(modified)
+ {
+ assert(Top && "VRPSolver created for unreachable basic block.");
+ }
+
+ VRPSolver(ValueNumbering &VN, InequalityGraph &IG, UnreachableBlocks &UB,
+ ValueRanges &VR, DomTreeDFS *DTDFS, bool &modified,
+ Instruction *TopInst)
+ : VN(VN),
+ IG(IG),
+ UB(UB),
+ VR(VR),
+ DTDFS(DTDFS),
+ Top(DTDFS->getNodeForBlock(TopInst->getParent())),
+ TopBB(TopInst->getParent()),
+ TopInst(TopInst),
+ modified(modified)
+ {
+ assert(Top && "VRPSolver created for unreachable basic block.");
+ assert(Top->getBlock() == TopInst->getParent() && "Context mismatch.");
+ }
+
+ bool isRelatedBy(Value *V1, Value *V2, ICmpInst::Predicate Pred) const {
+ if (Constant *C1 = dyn_cast<Constant>(V1))
+ if (Constant *C2 = dyn_cast<Constant>(V2))
+ return ConstantExpr::getCompare(Pred, C1, C2) ==
+ ConstantInt::getTrue();
+
+ unsigned n1 = VN.valueNumber(V1, Top);
+ unsigned n2 = VN.valueNumber(V2, Top);
+
+ if (n1 && n2) {
+ if (n1 == n2) return Pred == ICmpInst::ICMP_EQ ||
+ Pred == ICmpInst::ICMP_ULE ||
+ Pred == ICmpInst::ICMP_UGE ||
+ Pred == ICmpInst::ICMP_SLE ||
+ Pred == ICmpInst::ICMP_SGE;
+ if (Pred == ICmpInst::ICMP_EQ) return false;
+ if (IG.isRelatedBy(n1, n2, Top, cmpInstToLattice(Pred))) return true;
+ if (VR.isRelatedBy(n1, n2, Top, cmpInstToLattice(Pred))) return true;
+ }
+
+ if ((n1 && !n2 && isa<Constant>(V2)) ||
+ (n2 && !n1 && isa<Constant>(V1))) {
+ ConstantRange CR1 = n1 ? VR.range(n1, Top) : VR.range(V1);
+ ConstantRange CR2 = n2 ? VR.range(n2, Top) : VR.range(V2);
+
+ if (Pred == ICmpInst::ICMP_EQ)
+ return CR1.isSingleElement() &&
+ CR1.getSingleElement() == CR2.getSingleElement();
+
+ return VR.isRelatedBy(CR1, CR2, cmpInstToLattice(Pred));
+ }
+ if (Pred == ICmpInst::ICMP_EQ) return V1 == V2;
+ return false;
+ }
+
+ /// add - adds a new property to the work queue
+ void add(Value *V1, Value *V2, ICmpInst::Predicate Pred,
+ Instruction *I = NULL) {
+ DOUT << "adding " << *V1 << " " << Pred << " " << *V2;
+ if (I) DOUT << " context: " << *I;
+ else DOUT << " default context (" << Top->getDFSNumIn() << ")";
+ DOUT << "\n";
+
+ assert(V1->getType() == V2->getType() &&
+ "Can't relate two values with different types.");
+
+ WorkList.push_back(Operation());
+ Operation &O = WorkList.back();
+ O.LHS = V1, O.RHS = V2, O.Op = Pred, O.ContextInst = I;
+ O.ContextBB = I ? I->getParent() : TopBB;
+ }
+
+ /// defToOps - Given an instruction definition that we've learned something
+ /// new about, find any new relationships between its operands.
+ void defToOps(Instruction *I) {
+ Instruction *NewContext = below(I) ? I : TopInst;
+ Value *Canonical = VN.canonicalize(I, Top);
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ const Type *Ty = BO->getType();
+ assert(!Ty->isFPOrFPVector() && "Float in work queue!");
+
+ Value *Op0 = VN.canonicalize(BO->getOperand(0), Top);
+ Value *Op1 = VN.canonicalize(BO->getOperand(1), Top);
+
+ // TODO: "and i32 -1, %x" EQ %y then %x EQ %y.
+
+ switch (BO->getOpcode()) {
+ case Instruction::And: {
+ // "and i32 %a, %b" EQ -1 then %a EQ -1 and %b EQ -1
+ ConstantInt *CI = ConstantInt::getAllOnesValue(Ty);
+ if (Canonical == CI) {
+ add(CI, Op0, ICmpInst::ICMP_EQ, NewContext);
+ add(CI, Op1, ICmpInst::ICMP_EQ, NewContext);
+ }
+ } break;
+ case Instruction::Or: {
+ // "or i32 %a, %b" EQ 0 then %a EQ 0 and %b EQ 0
+ Constant *Zero = Constant::getNullValue(Ty);
+ if (Canonical == Zero) {
+ add(Zero, Op0, ICmpInst::ICMP_EQ, NewContext);
+ add(Zero, Op1, ICmpInst::ICMP_EQ, NewContext);
+ }
+ } break;
+ case Instruction::Xor: {
+ // "xor i32 %c, %a" EQ %b then %a EQ %c ^ %b
+ // "xor i32 %c, %a" EQ %c then %a EQ 0
+ // "xor i32 %c, %a" NE %c then %a NE 0
+ // Repeat the above, with order of operands reversed.
+ Value *LHS = Op0;
+ Value *RHS = Op1;
+ if (!isa<Constant>(LHS)) std::swap(LHS, RHS);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Canonical)) {
+ if (ConstantInt *Arg = dyn_cast<ConstantInt>(LHS)) {
+ add(RHS, ConstantInt::get(CI->getValue() ^ Arg->getValue()),
+ ICmpInst::ICMP_EQ, NewContext);
+ }
+ }
+ if (Canonical == LHS) {
+ if (isa<ConstantInt>(Canonical))
+ add(RHS, Constant::getNullValue(Ty), ICmpInst::ICMP_EQ,
+ NewContext);
+ } else if (isRelatedBy(LHS, Canonical, ICmpInst::ICMP_NE)) {
+ add(RHS, Constant::getNullValue(Ty), ICmpInst::ICMP_NE,
+ NewContext);
+ }
+ } break;
+ default:
+ break;
+ }
+ } else if (ICmpInst *IC = dyn_cast<ICmpInst>(I)) {
+ // "icmp ult i32 %a, %y" EQ true then %a u< y
+ // etc.
+
+ if (Canonical == ConstantInt::getTrue()) {
+ add(IC->getOperand(0), IC->getOperand(1), IC->getPredicate(),
+ NewContext);
+ } else if (Canonical == ConstantInt::getFalse()) {
+ add(IC->getOperand(0), IC->getOperand(1),
+ ICmpInst::getInversePredicate(IC->getPredicate()), NewContext);
+ }
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ if (I->getType()->isFPOrFPVector()) return;
+
+ // Given: "%a = select i1 %x, i32 %b, i32 %c"
+ // %a EQ %b and %b NE %c then %x EQ true
+ // %a EQ %c and %b NE %c then %x EQ false
+
+ Value *True = SI->getTrueValue();
+ Value *False = SI->getFalseValue();
+ if (isRelatedBy(True, False, ICmpInst::ICMP_NE)) {
+ if (Canonical == VN.canonicalize(True, Top) ||
+ isRelatedBy(Canonical, False, ICmpInst::ICMP_NE))
+ add(SI->getCondition(), ConstantInt::getTrue(),
+ ICmpInst::ICMP_EQ, NewContext);
+ else if (Canonical == VN.canonicalize(False, Top) ||
+ isRelatedBy(Canonical, True, ICmpInst::ICMP_NE))
+ add(SI->getCondition(), ConstantInt::getFalse(),
+ ICmpInst::ICMP_EQ, NewContext);
+ }
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ for (GetElementPtrInst::op_iterator OI = GEPI->idx_begin(),
+ OE = GEPI->idx_end(); OI != OE; ++OI) {
+ ConstantInt *Op = dyn_cast<ConstantInt>(VN.canonicalize(*OI, Top));
+ if (!Op || !Op->isZero()) return;
+ }
+ // TODO: The GEPI indices are all zero. Copy from definition to operand,
+ // jumping the type plane as needed.
+ if (isRelatedBy(GEPI, Constant::getNullValue(GEPI->getType()),
+ ICmpInst::ICMP_NE)) {
+ Value *Ptr = GEPI->getPointerOperand();
+ add(Ptr, Constant::getNullValue(Ptr->getType()), ICmpInst::ICMP_NE,
+ NewContext);
+ }
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ const Type *SrcTy = CI->getSrcTy();
+
+ unsigned ci = VN.getOrInsertVN(CI, Top);
+ uint32_t W = VR.typeToWidth(SrcTy);
+ if (!W) return;
+ ConstantRange CR = VR.range(ci, Top);
+
+ if (CR.isFullSet()) return;
+
+ switch (CI->getOpcode()) {
+ default: break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ VR.applyRange(VN.getOrInsertVN(CI->getOperand(0), Top),
+ CR.truncate(W), Top, this);
+ break;
+ case Instruction::BitCast:
+ VR.applyRange(VN.getOrInsertVN(CI->getOperand(0), Top),
+ CR, Top, this);
+ break;
+ }
+ }
+ }
+
+ /// opsToDef - A new relationship was discovered involving one of this
+ /// instruction's operands. Find any new relationship involving the
+ /// definition, or another operand.
+ void opsToDef(Instruction *I) {
+ Instruction *NewContext = below(I) ? I : TopInst;
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ Value *Op0 = VN.canonicalize(BO->getOperand(0), Top);
+ Value *Op1 = VN.canonicalize(BO->getOperand(1), Top);
+
+ if (ConstantInt *CI0 = dyn_cast<ConstantInt>(Op0))
+ if (ConstantInt *CI1 = dyn_cast<ConstantInt>(Op1)) {
+ add(BO, ConstantExpr::get(BO->getOpcode(), CI0, CI1),
+ ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+
+ // "%y = and i1 true, %x" then %x EQ %y
+ // "%y = or i1 false, %x" then %x EQ %y
+ // "%x = add i32 %y, 0" then %x EQ %y
+ // "%x = mul i32 %y, 0" then %x EQ 0
+
+ Instruction::BinaryOps Opcode = BO->getOpcode();
+ const Type *Ty = BO->getType();
+ assert(!Ty->isFPOrFPVector() && "Float in work queue!");
+
+ Constant *Zero = Constant::getNullValue(Ty);
+ Constant *One = ConstantInt::get(Ty, 1);
+ ConstantInt *AllOnes = ConstantInt::getAllOnesValue(Ty);
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ if (Op1 == Zero) {
+ add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ break;
+ case Instruction::Sub:
+ if (Op1 == Zero) {
+ add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ if (ConstantInt *CI0 = dyn_cast<ConstantInt>(Op0)) {
+ unsigned n_ci0 = VN.getOrInsertVN(Op1, Top);
+ ConstantRange CR = VR.range(n_ci0, Top);
+ if (!CR.isFullSet()) {
+ CR.subtract(CI0->getValue());
+ unsigned n_bo = VN.getOrInsertVN(BO, Top);
+ VR.applyRange(n_bo, CR, Top, this);
+ return;
+ }
+ }
+ if (ConstantInt *CI1 = dyn_cast<ConstantInt>(Op1)) {
+ unsigned n_ci1 = VN.getOrInsertVN(Op0, Top);
+ ConstantRange CR = VR.range(n_ci1, Top);
+ if (!CR.isFullSet()) {
+ CR.subtract(CI1->getValue());
+ unsigned n_bo = VN.getOrInsertVN(BO, Top);
+ VR.applyRange(n_bo, CR, Top, this);
+ return;
+ }
+ }
+ break;
+ case Instruction::Or:
+ if (Op0 == AllOnes || Op1 == AllOnes) {
+ add(BO, AllOnes, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ if (Op0 == Zero) {
+ add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ } else if (Op1 == Zero) {
+ add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ break;
+ case Instruction::Add:
+ if (ConstantInt *CI0 = dyn_cast<ConstantInt>(Op0)) {
+ unsigned n_ci0 = VN.getOrInsertVN(Op1, Top);
+ ConstantRange CR = VR.range(n_ci0, Top);
+ if (!CR.isFullSet()) {
+ CR.subtract(-CI0->getValue());
+ unsigned n_bo = VN.getOrInsertVN(BO, Top);
+ VR.applyRange(n_bo, CR, Top, this);
+ return;
+ }
+ }
+ if (ConstantInt *CI1 = dyn_cast<ConstantInt>(Op1)) {
+ unsigned n_ci1 = VN.getOrInsertVN(Op0, Top);
+ ConstantRange CR = VR.range(n_ci1, Top);
+ if (!CR.isFullSet()) {
+ CR.subtract(-CI1->getValue());
+ unsigned n_bo = VN.getOrInsertVN(BO, Top);
+ VR.applyRange(n_bo, CR, Top, this);
+ return;
+ }
+ }
+ // fall-through
+ case Instruction::Xor:
+ if (Op0 == Zero) {
+ add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ } else if (Op1 == Zero) {
+ add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ break;
+ case Instruction::And:
+ if (Op0 == AllOnes) {
+ add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ } else if (Op1 == AllOnes) {
+ add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ if (Op0 == Zero || Op1 == Zero) {
+ add(BO, Zero, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ break;
+ case Instruction::Mul:
+ if (Op0 == Zero || Op1 == Zero) {
+ add(BO, Zero, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ if (Op0 == One) {
+ add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ } else if (Op1 == One) {
+ add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+ return;
+ }
+ break;
+ }
+
+ // "%x = add i32 %y, %z" and %x EQ %y then %z EQ 0
+ // "%x = add i32 %y, %z" and %x EQ %z then %y EQ 0
+ // "%x = shl i32 %y, %z" and %x EQ %y and %y NE 0 then %z EQ 0
+ // "%x = udiv i32 %y, %z" and %x EQ %y and %y NE 0 then %z EQ 1
+
+ Value *Known = Op0, *Unknown = Op1,
+ *TheBO = VN.canonicalize(BO, Top);
+ if (Known != TheBO) std::swap(Known, Unknown);
+ if (Known == TheBO) {
+ switch (Opcode) {
+ default: break;
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ if (!isRelatedBy(Known, Zero, ICmpInst::ICMP_NE)) break;
+ // otherwise, fall-through.
+ case Instruction::Sub:
+ if (Unknown == Op0) break;
+ // otherwise, fall-through.
+ case Instruction::Xor:
+ case Instruction::Add:
+ add(Unknown, Zero, ICmpInst::ICMP_EQ, NewContext);
+ break;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ if (Unknown == Op1) break;
+ if (isRelatedBy(Known, Zero, ICmpInst::ICMP_NE))
+ add(Unknown, One, ICmpInst::ICMP_EQ, NewContext);
+ break;
+ }
+ }
+
+ // TODO: "%a = add i32 %b, 1" and %b > %z then %a >= %z.
+
+ } else if (ICmpInst *IC = dyn_cast<ICmpInst>(I)) {
+ // "%a = icmp ult i32 %b, %c" and %b u< %c then %a EQ true
+ // "%a = icmp ult i32 %b, %c" and %b u>= %c then %a EQ false
+ // etc.
+
+ Value *Op0 = VN.canonicalize(IC->getOperand(0), Top);
+ Value *Op1 = VN.canonicalize(IC->getOperand(1), Top);
+
+ ICmpInst::Predicate Pred = IC->getPredicate();
+ if (isRelatedBy(Op0, Op1, Pred))
+ add(IC, ConstantInt::getTrue(), ICmpInst::ICMP_EQ, NewContext);
+ else if (isRelatedBy(Op0, Op1, ICmpInst::getInversePredicate(Pred)))
+ add(IC, ConstantInt::getFalse(), ICmpInst::ICMP_EQ, NewContext);
+
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ if (I->getType()->isFPOrFPVector()) return;
+
+ // Given: "%a = select i1 %x, i32 %b, i32 %c"
+ // %x EQ true then %a EQ %b
+ // %x EQ false then %a EQ %c
+ // %b EQ %c then %a EQ %b
+
+ Value *Canonical = VN.canonicalize(SI->getCondition(), Top);
+ if (Canonical == ConstantInt::getTrue()) {
+ add(SI, SI->getTrueValue(), ICmpInst::ICMP_EQ, NewContext);
+ } else if (Canonical == ConstantInt::getFalse()) {
+ add(SI, SI->getFalseValue(), ICmpInst::ICMP_EQ, NewContext);
+ } else if (VN.canonicalize(SI->getTrueValue(), Top) ==
+ VN.canonicalize(SI->getFalseValue(), Top)) {
+ add(SI, SI->getTrueValue(), ICmpInst::ICMP_EQ, NewContext);
+ }
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ const Type *DestTy = CI->getDestTy();
+ if (DestTy->isFPOrFPVector()) return;
+
+ Value *Op = VN.canonicalize(CI->getOperand(0), Top);
+ Instruction::CastOps Opcode = CI->getOpcode();
+
+ if (Constant *C = dyn_cast<Constant>(Op)) {
+ add(CI, ConstantExpr::getCast(Opcode, C, DestTy),
+ ICmpInst::ICMP_EQ, NewContext);
+ }
+
+ uint32_t W = VR.typeToWidth(DestTy);
+ unsigned ci = VN.getOrInsertVN(CI, Top);
+ ConstantRange CR = VR.range(VN.getOrInsertVN(Op, Top), Top);
+
+ if (!CR.isFullSet()) {
+ switch (Opcode) {
+ default: break;
+ case Instruction::ZExt:
+ VR.applyRange(ci, CR.zeroExtend(W), Top, this);
+ break;
+ case Instruction::SExt:
+ VR.applyRange(ci, CR.signExtend(W), Top, this);
+ break;
+ case Instruction::Trunc: {
+ ConstantRange Result = CR.truncate(W);
+ if (!Result.isFullSet())
+ VR.applyRange(ci, Result, Top, this);
+ } break;
+ case Instruction::BitCast:
+ VR.applyRange(ci, CR, Top, this);
+ break;
+ // TODO: other casts?
+ }
+ }
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ for (GetElementPtrInst::op_iterator OI = GEPI->idx_begin(),
+ OE = GEPI->idx_end(); OI != OE; ++OI) {
+ ConstantInt *Op = dyn_cast<ConstantInt>(VN.canonicalize(*OI, Top));
+ if (!Op || !Op->isZero()) return;
+ }
+ // TODO: The GEPI indices are all zero. Copy from operand to definition,
+ // jumping the type plane as needed.
+ Value *Ptr = GEPI->getPointerOperand();
+ if (isRelatedBy(Ptr, Constant::getNullValue(Ptr->getType()),
+ ICmpInst::ICMP_NE)) {
+ add(GEPI, Constant::getNullValue(GEPI->getType()), ICmpInst::ICMP_NE,
+ NewContext);
+ }
+ }
+ }
+
+ /// solve - process the work queue
+ void solve() {
+ //DOUT << "WorkList entry, size: " << WorkList.size() << "\n";
+ while (!WorkList.empty()) {
+ //DOUT << "WorkList size: " << WorkList.size() << "\n";
+
+ Operation &O = WorkList.front();
+ TopInst = O.ContextInst;
+ TopBB = O.ContextBB;
+ Top = DTDFS->getNodeForBlock(TopBB); // XXX move this into Context
+
+ O.LHS = VN.canonicalize(O.LHS, Top);
+ O.RHS = VN.canonicalize(O.RHS, Top);
+
+ assert(O.LHS == VN.canonicalize(O.LHS, Top) && "Canonicalize isn't.");
+ assert(O.RHS == VN.canonicalize(O.RHS, Top) && "Canonicalize isn't.");
+
+ DOUT << "solving " << *O.LHS << " " << O.Op << " " << *O.RHS;
+ if (O.ContextInst) DOUT << " context inst: " << *O.ContextInst;
+ else DOUT << " context block: " << O.ContextBB->getName();
+ DOUT << "\n";
+
+ DEBUG(VN.dump());
+ DEBUG(IG.dump());
+ DEBUG(VR.dump());
+
+ // If they're both Constant, skip it. Check for contradiction and mark
+ // the BB as unreachable if so.
+ if (Constant *CI_L = dyn_cast<Constant>(O.LHS)) {
+ if (Constant *CI_R = dyn_cast<Constant>(O.RHS)) {
+ if (ConstantExpr::getCompare(O.Op, CI_L, CI_R) ==
+ ConstantInt::getFalse())
+ UB.mark(TopBB);
+
+ WorkList.pop_front();
+ continue;
+ }
+ }
+
+ if (VN.compare(O.LHS, O.RHS)) {
+ std::swap(O.LHS, O.RHS);
+ O.Op = ICmpInst::getSwappedPredicate(O.Op);
+ }
+
+ if (O.Op == ICmpInst::ICMP_EQ) {
+ if (!makeEqual(O.RHS, O.LHS))
+ UB.mark(TopBB);
+ } else {
+ LatticeVal LV = cmpInstToLattice(O.Op);
+
+ if ((LV & EQ_BIT) &&
+ isRelatedBy(O.LHS, O.RHS, ICmpInst::getSwappedPredicate(O.Op))) {
+ if (!makeEqual(O.RHS, O.LHS))
+ UB.mark(TopBB);
+ } else {
+ if (isRelatedBy(O.LHS, O.RHS, ICmpInst::getInversePredicate(O.Op))){
+ UB.mark(TopBB);
+ WorkList.pop_front();
+ continue;
+ }
+
+ unsigned n1 = VN.getOrInsertVN(O.LHS, Top);
+ unsigned n2 = VN.getOrInsertVN(O.RHS, Top);
+
+ if (n1 == n2) {
+ if (O.Op != ICmpInst::ICMP_UGE && O.Op != ICmpInst::ICMP_ULE &&
+ O.Op != ICmpInst::ICMP_SGE && O.Op != ICmpInst::ICMP_SLE)
+ UB.mark(TopBB);
+
+ WorkList.pop_front();
+ continue;
+ }
+
+ if (VR.isRelatedBy(n1, n2, Top, LV) ||
+ IG.isRelatedBy(n1, n2, Top, LV)) {
+ WorkList.pop_front();
+ continue;
+ }
+
+ VR.addInequality(n1, n2, Top, LV, this);
+ if ((!isa<ConstantInt>(O.RHS) && !isa<ConstantInt>(O.LHS)) ||
+ LV == NE)
+ IG.addInequality(n1, n2, Top, LV);
+
+ if (Instruction *I1 = dyn_cast<Instruction>(O.LHS)) {
+ if (aboveOrBelow(I1))
+ defToOps(I1);
+ }
+ if (isa<Instruction>(O.LHS) || isa<Argument>(O.LHS)) {
+ for (Value::use_iterator UI = O.LHS->use_begin(),
+ UE = O.LHS->use_end(); UI != UE;) {
+ Use &TheUse = UI.getUse();
+ ++UI;
+ if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+ if (aboveOrBelow(I))
+ opsToDef(I);
+ }
+ }
+ }
+ if (Instruction *I2 = dyn_cast<Instruction>(O.RHS)) {
+ if (aboveOrBelow(I2))
+ defToOps(I2);
+ }
+ if (isa<Instruction>(O.RHS) || isa<Argument>(O.RHS)) {
+ for (Value::use_iterator UI = O.RHS->use_begin(),
+ UE = O.RHS->use_end(); UI != UE;) {
+ Use &TheUse = UI.getUse();
+ ++UI;
+ if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+ if (aboveOrBelow(I))
+ opsToDef(I);
+ }
+ }
+ }
+ }
+ }
+ WorkList.pop_front();
+ }
+ }
+ };
+
+ void ValueRanges::addToWorklist(Value *V, Constant *C,
+ ICmpInst::Predicate Pred, VRPSolver *VRP) {
+ VRP->add(V, C, Pred, VRP->TopInst);
+ }
+
+ void ValueRanges::markBlock(VRPSolver *VRP) {
+ VRP->UB.mark(VRP->TopBB);
+ }
+
+ /// PredicateSimplifier - This class is a simplifier that replaces
+ /// one equivalent variable with another. It also tracks what
+  /// can't be equal and will solve icmp instructions when possible.
+ /// @brief Root of the predicate simplifier optimization.
+ class VISIBILITY_HIDDEN PredicateSimplifier : public FunctionPass {
+ DomTreeDFS *DTDFS;
+ bool modified;
+ ValueNumbering *VN;
+ InequalityGraph *IG;
+ UnreachableBlocks UB;
+ ValueRanges *VR;
+
+ std::vector<DomTreeDFS::Node *> WorkList;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ PredicateSimplifier() : FunctionPass(&ID) {}
+
+ bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(BreakCriticalEdgesID);
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<TargetData>();
+ AU.addPreserved<TargetData>();
+ }
+
+ private:
+ /// Forwards - Adds new properties to VRPSolver and uses them to
+ /// simplify instructions. Because new properties sometimes apply to
+ /// a transition from one BasicBlock to another, this will use the
+ /// PredicateSimplifier::proceedToSuccessor(s) interface to enter the
+ /// basic block.
+ /// @brief Performs abstract execution of the program.
+ class VISIBILITY_HIDDEN Forwards : public InstVisitor<Forwards> {
+ friend class InstVisitor<Forwards>;
+ PredicateSimplifier *PS;
+ DomTreeDFS::Node *DTNode;
+
+ public:
+ ValueNumbering &VN;
+ InequalityGraph &IG;
+ UnreachableBlocks &UB;
+ ValueRanges &VR;
+
+ Forwards(PredicateSimplifier *PS, DomTreeDFS::Node *DTNode)
+ : PS(PS), DTNode(DTNode), VN(*PS->VN), IG(*PS->IG), UB(PS->UB),
+ VR(*PS->VR) {}
+
+ void visitTerminatorInst(TerminatorInst &TI);
+ void visitBranchInst(BranchInst &BI);
+ void visitSwitchInst(SwitchInst &SI);
+
+ void visitAllocaInst(AllocaInst &AI);
+ void visitLoadInst(LoadInst &LI);
+ void visitStoreInst(StoreInst &SI);
+
+ void visitSExtInst(SExtInst &SI);
+ void visitZExtInst(ZExtInst &ZI);
+
+ void visitBinaryOperator(BinaryOperator &BO);
+ void visitICmpInst(ICmpInst &IC);
+ };
+
+    // Used by terminator instructions to proceed from the current basic
+    // block to the next. Queues each dominator-tree child of "Current" on
+    // the WorkList; runOnFunction later calls visitBasicBlock on each.
+ void proceedToSuccessors(DomTreeDFS::Node *Current) {
+ for (DomTreeDFS::Node::iterator I = Current->begin(),
+ E = Current->end(); I != E; ++I) {
+ WorkList.push_back(*I);
+ }
+ }
+
+ void proceedToSuccessor(DomTreeDFS::Node *Next) {
+ WorkList.push_back(Next);
+ }
+
+ // Visits each instruction in the basic block.
+ void visitBasicBlock(DomTreeDFS::Node *Node) {
+ BasicBlock *BB = Node->getBlock();
+ DOUT << "Entering Basic Block: " << BB->getName()
+ << " (" << Node->getDFSNumIn() << ")\n";
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ visitInstruction(I++, Node);
+ }
+ }
+
+ // Tries to simplify each Instruction and add new properties.
+ void visitInstruction(Instruction *I, DomTreeDFS::Node *DT) {
+ DOUT << "Considering instruction " << *I << "\n";
+ DEBUG(VN->dump());
+ DEBUG(IG->dump());
+ DEBUG(VR->dump());
+
+ // Sometimes instructions are killed in earlier analysis.
+ if (isInstructionTriviallyDead(I)) {
+ ++NumSimple;
+ modified = true;
+ if (unsigned n = VN->valueNumber(I, DTDFS->getRootNode()))
+ if (VN->value(n) == I) IG->remove(n);
+ VN->remove(I);
+ I->eraseFromParent();
+ return;
+ }
+
+#ifndef NDEBUG
+ // Try to replace the whole instruction.
+ Value *V = VN->canonicalize(I, DT);
+ assert(V == I && "Late instruction canonicalization.");
+ if (V != I) {
+ modified = true;
+ ++NumInstruction;
+ DOUT << "Removing " << *I << ", replacing with " << *V << "\n";
+ if (unsigned n = VN->valueNumber(I, DTDFS->getRootNode()))
+ if (VN->value(n) == I) IG->remove(n);
+ VN->remove(I);
+ I->replaceAllUsesWith(V);
+ I->eraseFromParent();
+ return;
+ }
+
+ // Try to substitute operands.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *Oper = I->getOperand(i);
+ Value *V = VN->canonicalize(Oper, DT);
+ assert(V == Oper && "Late operand canonicalization.");
+ if (V != Oper) {
+ modified = true;
+ ++NumVarsReplaced;
+ DOUT << "Resolving " << *I;
+ I->setOperand(i, V);
+ DOUT << " into " << *I;
+ }
+ }
+#endif
+
+ std::string name = I->getParent()->getName();
+ DOUT << "push (%" << name << ")\n";
+ Forwards visit(this, DT);
+ visit.visit(*I);
+ DOUT << "pop (%" << name << ")\n";
+ }
+ };
+
+ bool PredicateSimplifier::runOnFunction(Function &F) {
+ DominatorTree *DT = &getAnalysis<DominatorTree>();
+ DTDFS = new DomTreeDFS(DT);
+ TargetData *TD = &getAnalysis<TargetData>();
+
+ DOUT << "Entering Function: " << F.getName() << "\n";
+
+ modified = false;
+ DomTreeDFS::Node *Root = DTDFS->getRootNode();
+ VN = new ValueNumbering(DTDFS);
+ IG = new InequalityGraph(*VN, Root);
+ VR = new ValueRanges(*VN, TD);
+ WorkList.push_back(Root);
+
+ do {
+ DomTreeDFS::Node *DTNode = WorkList.back();
+ WorkList.pop_back();
+ if (!UB.isDead(DTNode->getBlock())) visitBasicBlock(DTNode);
+ } while (!WorkList.empty());
+
+ delete DTDFS;
+ delete VR;
+ delete IG;
+ delete VN;
+
+ modified |= UB.kill();
+
+ return modified;
+ }
+
+ void PredicateSimplifier::Forwards::visitTerminatorInst(TerminatorInst &TI) {
+ PS->proceedToSuccessors(DTNode);
+ }
+
+ void PredicateSimplifier::Forwards::visitBranchInst(BranchInst &BI) {
+ if (BI.isUnconditional()) {
+ PS->proceedToSuccessors(DTNode);
+ return;
+ }
+
+ Value *Condition = BI.getCondition();
+ BasicBlock *TrueDest = BI.getSuccessor(0);
+ BasicBlock *FalseDest = BI.getSuccessor(1);
+
+ if (isa<Constant>(Condition) || TrueDest == FalseDest) {
+ PS->proceedToSuccessors(DTNode);
+ return;
+ }
+
+ for (DomTreeDFS::Node::iterator I = DTNode->begin(), E = DTNode->end();
+ I != E; ++I) {
+ BasicBlock *Dest = (*I)->getBlock();
+ DOUT << "Branch thinking about %" << Dest->getName()
+ << "(" << PS->DTDFS->getNodeForBlock(Dest)->getDFSNumIn() << ")\n";
+
+ if (Dest == TrueDest) {
+ DOUT << "(" << DTNode->getBlock()->getName() << ") true set:\n";
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, Dest);
+ VRP.add(ConstantInt::getTrue(), Condition, ICmpInst::ICMP_EQ);
+ VRP.solve();
+ DEBUG(VN.dump());
+ DEBUG(IG.dump());
+ DEBUG(VR.dump());
+ } else if (Dest == FalseDest) {
+ DOUT << "(" << DTNode->getBlock()->getName() << ") false set:\n";
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, Dest);
+ VRP.add(ConstantInt::getFalse(), Condition, ICmpInst::ICMP_EQ);
+ VRP.solve();
+ DEBUG(VN.dump());
+ DEBUG(IG.dump());
+ DEBUG(VR.dump());
+ }
+
+ PS->proceedToSuccessor(*I);
+ }
+ }
+
+ void PredicateSimplifier::Forwards::visitSwitchInst(SwitchInst &SI) {
+ Value *Condition = SI.getCondition();
+
+    // Set the EQ property in each of the case BBs, and the NE properties
+    // in the default BB.
+
+ for (DomTreeDFS::Node::iterator I = DTNode->begin(), E = DTNode->end();
+ I != E; ++I) {
+ BasicBlock *BB = (*I)->getBlock();
+ DOUT << "Switch thinking about BB %" << BB->getName()
+ << "(" << PS->DTDFS->getNodeForBlock(BB)->getDFSNumIn() << ")\n";
+
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, BB);
+ if (BB == SI.getDefaultDest()) {
+ for (unsigned i = 1, e = SI.getNumCases(); i < e; ++i)
+ if (SI.getSuccessor(i) != BB)
+ VRP.add(Condition, SI.getCaseValue(i), ICmpInst::ICMP_NE);
+ VRP.solve();
+ } else if (ConstantInt *CI = SI.findCaseDest(BB)) {
+ VRP.add(Condition, CI, ICmpInst::ICMP_EQ);
+ VRP.solve();
+ }
+ PS->proceedToSuccessor(*I);
+ }
+ }
+
+ void PredicateSimplifier::Forwards::visitAllocaInst(AllocaInst &AI) {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &AI);
+ VRP.add(Constant::getNullValue(AI.getType()), &AI, ICmpInst::ICMP_NE);
+ VRP.solve();
+ }
+
+ void PredicateSimplifier::Forwards::visitLoadInst(LoadInst &LI) {
+ Value *Ptr = LI.getPointerOperand();
+ // avoid "load i8* null" -> null NE null.
+ if (isa<Constant>(Ptr)) return;
+
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &LI);
+ VRP.add(Constant::getNullValue(Ptr->getType()), Ptr, ICmpInst::ICMP_NE);
+ VRP.solve();
+ }
+
+ void PredicateSimplifier::Forwards::visitStoreInst(StoreInst &SI) {
+ Value *Ptr = SI.getPointerOperand();
+ if (isa<Constant>(Ptr)) return;
+
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &SI);
+ VRP.add(Constant::getNullValue(Ptr->getType()), Ptr, ICmpInst::ICMP_NE);
+ VRP.solve();
+ }
+
+ void PredicateSimplifier::Forwards::visitSExtInst(SExtInst &SI) {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &SI);
+ uint32_t SrcBitWidth = cast<IntegerType>(SI.getSrcTy())->getBitWidth();
+ uint32_t DstBitWidth = cast<IntegerType>(SI.getDestTy())->getBitWidth();
+ APInt Min(APInt::getHighBitsSet(DstBitWidth, DstBitWidth-SrcBitWidth+1));
+ APInt Max(APInt::getLowBitsSet(DstBitWidth, SrcBitWidth-1));
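+    // E.g. for "sext i8 %x to i32", Min = 0xFFFFFF80 (-128) and Max = 0x7F
+    // (127), pinning the result to the signed range of the source type.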
+ VRP.add(ConstantInt::get(Min), &SI, ICmpInst::ICMP_SLE);
+ VRP.add(ConstantInt::get(Max), &SI, ICmpInst::ICMP_SGE);
+ VRP.solve();
+ }
+
+ void PredicateSimplifier::Forwards::visitZExtInst(ZExtInst &ZI) {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &ZI);
+ uint32_t SrcBitWidth = cast<IntegerType>(ZI.getSrcTy())->getBitWidth();
+ uint32_t DstBitWidth = cast<IntegerType>(ZI.getDestTy())->getBitWidth();
+ APInt Max(APInt::getLowBitsSet(DstBitWidth, SrcBitWidth));
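+    // E.g. for "zext i8 %x to i32", Max = 0xFF, so the result is u<= 255.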
+ VRP.add(ConstantInt::get(Max), &ZI, ICmpInst::ICMP_UGE);
+ VRP.solve();
+ }
+
+ void PredicateSimplifier::Forwards::visitBinaryOperator(BinaryOperator &BO) {
+ Instruction::BinaryOps ops = BO.getOpcode();
+
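+    // A division or remainder that executes implies its divisor was
+    // non-zero; record that fact first.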
+ switch (ops) {
+ default: break;
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::UDiv:
+ case Instruction::SDiv: {
+ Value *Divisor = BO.getOperand(1);
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+ VRP.add(Constant::getNullValue(Divisor->getType()), Divisor,
+ ICmpInst::ICMP_NE);
+ VRP.solve();
+ break;
+ }
+ }
+
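+    // Then record simple relations between the result and its operands,
+    // e.g. the result of "lshr i32 %x, %y" is always u<= %x.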
+ switch (ops) {
+ default: break;
+ case Instruction::Shl: {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+ VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_UGE);
+ VRP.solve();
+ } break;
+ case Instruction::AShr: {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+ VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_SLE);
+ VRP.solve();
+ } break;
+ case Instruction::LShr:
+ case Instruction::UDiv: {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+ VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_ULE);
+ VRP.solve();
+ } break;
+ case Instruction::URem: {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+ VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_ULE);
+ VRP.solve();
+ } break;
+ case Instruction::And: {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+ VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_ULE);
+ VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_ULE);
+ VRP.solve();
+ } break;
+ case Instruction::Or: {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+ VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_UGE);
+ VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_UGE);
+ VRP.solve();
+ } break;
+ }
+ }
+
+ void PredicateSimplifier::Forwards::visitICmpInst(ICmpInst &IC) {
+ // If possible, squeeze the ICmp predicate into something simpler.
+    // E.g., if %x = [0, 4) and we're being asked "icmp uge %x, 3" then
+    // change the predicate to eq.
+
+ // XXX: once we do full PHI handling, modifying the instruction in the
+ // Forwards visitor will cause missed optimizations.
+
+ ICmpInst::Predicate Pred = IC.getPredicate();
+
+ switch (Pred) {
+ default: break;
+ case ICmpInst::ICMP_ULE: Pred = ICmpInst::ICMP_ULT; break;
+ case ICmpInst::ICMP_UGE: Pred = ICmpInst::ICMP_UGT; break;
+ case ICmpInst::ICMP_SLE: Pred = ICmpInst::ICMP_SLT; break;
+ case ICmpInst::ICMP_SGE: Pred = ICmpInst::ICMP_SGT; break;
+ }
+ if (Pred != IC.getPredicate()) {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &IC);
+ if (VRP.isRelatedBy(IC.getOperand(1), IC.getOperand(0),
+ ICmpInst::ICMP_NE)) {
+ ++NumSnuggle;
+ PS->modified = true;
+ IC.setPredicate(Pred);
+ }
+ }
+
+ Pred = IC.getPredicate();
+
+ if (ConstantInt *Op1 = dyn_cast<ConstantInt>(IC.getOperand(1))) {
+ ConstantInt *NextVal = 0;
+ switch (Pred) {
+ default: break;
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_ULT:
+ if (Op1->getValue() != 0)
+ NextVal = ConstantInt::get(Op1->getValue()-1);
+ break;
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_UGT:
+ if (!Op1->getValue().isAllOnesValue())
+ NextVal = ConstantInt::get(Op1->getValue()+1);
+ break;
+ }
+
+ if (NextVal) {
+ VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &IC);
+ if (VRP.isRelatedBy(IC.getOperand(0), NextVal,
+ ICmpInst::getInversePredicate(Pred))) {
+ ICmpInst *NewIC = new ICmpInst(ICmpInst::ICMP_EQ, IC.getOperand(0),
+ NextVal, "", &IC);
+ NewIC->takeName(&IC);
+ IC.replaceAllUsesWith(NewIC);
+
+ // XXX: prove this isn't necessary
+ if (unsigned n = VN.valueNumber(&IC, PS->DTDFS->getRootNode()))
+ if (VN.value(n) == &IC) IG.remove(n);
+ VN.remove(&IC);
+
+ IC.eraseFromParent();
+ ++NumSnuggle;
+ PS->modified = true;
+ }
+ }
+ }
+ }
+}
+
+char PredicateSimplifier::ID = 0;
+static RegisterPass<PredicateSimplifier>
+X("predsimplify", "Predicate Simplifier");
+
+FunctionPass *llvm::createPredicateSimplifierPass() {
+ return new PredicateSimplifier();
+}
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
new file mode 100644
index 0000000..293cf92
--- /dev/null
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -0,0 +1,896 @@
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE...
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of the current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
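+//
+// Operands are then sorted by decreasing rank, so low-rank values such as
+// constants and arguments sink toward the end of the expression, where they
+// can pair up and fold.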
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reassociate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumLinear , "Number of insts linearized");
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr trees annihilated");
+STATISTIC(NumFactor , "Number of multiplies factored");
+
+namespace {
+ struct VISIBILITY_HIDDEN ValueEntry {
+ unsigned Rank;
+ Value *Op;
+ ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {}
+ };
+ inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) {
+ return LHS.Rank > RHS.Rank; // Sort so that highest rank goes to start.
+ }
+}
+
+#ifndef NDEBUG
+/// PrintOps - Print out the expression identified in the Ops list.
+///
+static void PrintOps(Instruction *I, const std::vector<ValueEntry> &Ops) {
+ Module *M = I->getParent()->getParent()->getParent();
+ cerr << Instruction::getOpcodeName(I->getOpcode()) << " "
+ << *Ops[0].Op->getType();
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ WriteAsOperand(*cerr.stream() << " ", Ops[i].Op, false, M);
+ cerr << "," << Ops[i].Rank;
+ }
+}
+#endif
+
+namespace {
+ class VISIBILITY_HIDDEN Reassociate : public FunctionPass {
+ std::map<BasicBlock*, unsigned> RankMap;
+ std::map<AssertingVH<>, unsigned> ValueRankMap;
+ bool MadeChange;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ Reassociate() : FunctionPass(&ID) {}
+
+ bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ }
+ private:
+ void BuildRankMap(Function &F);
+ unsigned getRank(Value *V);
+ void ReassociateExpression(BinaryOperator *I);
+ void RewriteExprTree(BinaryOperator *I, std::vector<ValueEntry> &Ops,
+ unsigned Idx = 0);
+ Value *OptimizeExpression(BinaryOperator *I, std::vector<ValueEntry> &Ops);
+ void LinearizeExprTree(BinaryOperator *I, std::vector<ValueEntry> &Ops);
+ void LinearizeExpr(BinaryOperator *I);
+ Value *RemoveFactorFromExpression(Value *V, Value *Factor);
+ void ReassociateBB(BasicBlock *BB);
+
+ void RemoveDeadBinaryOp(Value *V);
+ };
+}
+
+char Reassociate::ID = 0;
+static RegisterPass<Reassociate> X("reassociate", "Reassociate expressions");
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
+
+void Reassociate::RemoveDeadBinaryOp(Value *V) {
+ Instruction *Op = dyn_cast<Instruction>(V);
+  if (!Op || (!isa<BinaryOperator>(Op) && !isa<CmpInst>(Op)) ||
+      !Op->use_empty())
+ return;
+
+ Value *LHS = Op->getOperand(0), *RHS = Op->getOperand(1);
+ RemoveDeadBinaryOp(LHS);
+ RemoveDeadBinaryOp(RHS);
+}
+
+
+static bool isUnmovableInstruction(Instruction *I) {
+ if (I->getOpcode() == Instruction::PHI ||
+ I->getOpcode() == Instruction::Alloca ||
+ I->getOpcode() == Instruction::Load ||
+ I->getOpcode() == Instruction::Malloc ||
+ I->getOpcode() == Instruction::Invoke ||
+ (I->getOpcode() == Instruction::Call &&
+ !isa<DbgInfoIntrinsic>(I)) ||
+ I->getOpcode() == Instruction::UDiv ||
+ I->getOpcode() == Instruction::SDiv ||
+ I->getOpcode() == Instruction::FDiv ||
+ I->getOpcode() == Instruction::URem ||
+ I->getOpcode() == Instruction::SRem ||
+ I->getOpcode() == Instruction::FRem)
+ return true;
+ return false;
+}
+
+void Reassociate::BuildRankMap(Function &F) {
+ unsigned i = 2;
+
+ // Assign distinct ranks to function arguments
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+ ValueRankMap[&*I] = ++i;
+
+ ReversePostOrderTraversal<Function*> RPOT(&F);
+ for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
+ E = RPOT.end(); I != E; ++I) {
+ BasicBlock *BB = *I;
+ unsigned BBRank = RankMap[BB] = ++i << 16;
+
+ // Walk the basic block, adding precomputed ranks for any instructions that
+ // we cannot move. This ensures that the ranks for these instructions are
+ // all different in the block.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (isUnmovableInstruction(I))
+ ValueRankMap[&*I] = ++BBRank;
+ }
+}
+
+unsigned Reassociate::getRank(Value *V) {
+ if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument...
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I == 0) return 0; // Otherwise it's a global or constant, rank 0.
+
+ unsigned &CachedRank = ValueRankMap[I];
+ if (CachedRank) return CachedRank; // Rank already known?
+
+  // If this is an expression, return 1+MAX(rank(LHS), rank(RHS)) so that
+ // we can reassociate expressions for code motion! Since we do not recurse
+ // for PHI nodes, we cannot have infinite recursion here, because there
+ // cannot be loops in the value graph that do not go through PHI nodes.
+ unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
+ for (unsigned i = 0, e = I->getNumOperands();
+ i != e && Rank != MaxRank; ++i)
+ Rank = std::max(Rank, getRank(I->getOperand(i)));
+
+ // If this is a not or neg instruction, do not count it for rank. This
+ // assures us that X and ~X will have the same rank.
+ if (!I->getType()->isInteger() ||
+ (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I)))
+ ++Rank;
+
+ //DOUT << "Calculated Rank[" << V->getName() << "] = "
+ // << Rank << "\n";
+
+ return CachedRank = Rank;
+}
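+
+// A worked example (hypothetical values): if rank(%a) == 3 and rank(%b) == 4,
+// then %t = add i32 %a, %b gets rank max(3,4)+1 == 5, while the 'not'
+// %n = xor i32 %t, -1 skips the increment and also gets rank 5, so %t and ~%t
+// rank the same, as intended.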
+
+/// isReassociableOp - Return V as a BinaryOperator if it is an instruction of
+/// the specified opcode with at most one use, otherwise return null.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+ if ((V->hasOneUse() || V->use_empty()) && isa<Instruction>(V) &&
+ cast<Instruction>(V)->getOpcode() == Opcode)
+ return cast<BinaryOperator>(V);
+ return 0;
+}
+
+/// LowerNegateToMultiply - Replace 0-X with X*-1.
+///
+static Instruction *LowerNegateToMultiply(Instruction *Neg,
+ std::map<AssertingVH<>, unsigned> &ValueRankMap) {
+ Constant *Cst = ConstantInt::getAllOnesValue(Neg->getType());
+
+ Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
+ ValueRankMap.erase(Neg);
+ Res->takeName(Neg);
+ Neg->replaceAllUsesWith(Res);
+ Neg->eraseFromParent();
+ return Res;
+}
+
+// Given an expression of the form '(A+B)+(D+C)', turn it into '(((A+B)+C)+D)'.
+// Note that if D is also part of the expression tree, we recurse to linearize
+// it as well. Apart from that case, this does not recurse into A, B, or C.
+void Reassociate::LinearizeExpr(BinaryOperator *I) {
+ BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
+ BinaryOperator *RHS = cast<BinaryOperator>(I->getOperand(1));
+ assert(isReassociableOp(LHS, I->getOpcode()) &&
+ isReassociableOp(RHS, I->getOpcode()) &&
+ "Not an expression that needs linearization?");
+
+ DOUT << "Linear" << *LHS << *RHS << *I;
+
+  // Move the RHS instruction to live immediately before I, to avoid breaking
+  // dominator properties.
+ RHS->moveBefore(I);
+
+ // Move operands around to do the linearization.
+ I->setOperand(1, RHS->getOperand(0));
+ RHS->setOperand(0, LHS);
+ I->setOperand(0, RHS);
+
+ ++NumLinear;
+ MadeChange = true;
+ DOUT << "Linearized: " << *I;
+
+ // If D is part of this expression tree, tail recurse.
+ if (isReassociableOp(I->getOperand(1), I->getOpcode()))
+ LinearizeExpr(I);
+}
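+
+// For illustration (hypothetical IR), one rotation step turns
+//   %t1 = add i32 %a, %b
+//   %t2 = add i32 %d, %c
+//   %i  = add i32 %t1, %t2
+// into
+//   %t2 = add i32 %t1, %c
+//   %i  = add i32 %t2, %d
+// after which %d is linearized further only if it is itself part of the tree.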
+
+
+/// LinearizeExprTree - Given an associative binary expression tree, traverse
+/// all of the uses putting it into canonical form. This forces a left-linear
+/// form of the expression (((a+b)+c)+d), and collects information about the
+/// rank of the non-tree operands.
+///
+/// NOTE: This intentionally destroys the expression tree operands (turning
+/// them into undef values) to reduce the #uses of the values. This means that
+/// the caller MUST use something like RewriteExprTree to put the values back
+/// in.
+///
+void Reassociate::LinearizeExprTree(BinaryOperator *I,
+ std::vector<ValueEntry> &Ops) {
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ unsigned Opcode = I->getOpcode();
+
+ // First step, linearize the expression if it is in ((A+B)+(C+D)) form.
+ BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode);
+ BinaryOperator *RHSBO = isReassociableOp(RHS, Opcode);
+
+ // If this is a multiply expression tree and it contains internal negations,
+ // transform them into multiplies by -1 so they can be reassociated.
+ if (I->getOpcode() == Instruction::Mul) {
+ if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) {
+ LHS = LowerNegateToMultiply(cast<Instruction>(LHS), ValueRankMap);
+ LHSBO = isReassociableOp(LHS, Opcode);
+ }
+ if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) {
+ RHS = LowerNegateToMultiply(cast<Instruction>(RHS), ValueRankMap);
+ RHSBO = isReassociableOp(RHS, Opcode);
+ }
+ }
+
+ if (!LHSBO) {
+ if (!RHSBO) {
+      // Neither the LHS nor the RHS is part of the tree, so this is a leaf.
+      // As such, just remember these operands and their rank.
+ Ops.push_back(ValueEntry(getRank(LHS), LHS));
+ Ops.push_back(ValueEntry(getRank(RHS), RHS));
+
+ // Clear the leaves out.
+ I->setOperand(0, UndefValue::get(I->getType()));
+ I->setOperand(1, UndefValue::get(I->getType()));
+ return;
+ } else {
+ // Turn X+(Y+Z) -> (Y+Z)+X
+ std::swap(LHSBO, RHSBO);
+ std::swap(LHS, RHS);
+      bool Success = !I->swapOperands();
+      assert(Success && "swapOperands failed");
+      Success = false; // Quiet an unused-variable warning in NDEBUG builds.
+ MadeChange = true;
+ }
+ } else if (RHSBO) {
+    // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees that the RHS is not
+    // part of the expression tree.
+ LinearizeExpr(I);
+ LHS = LHSBO = cast<BinaryOperator>(I->getOperand(0));
+ RHS = I->getOperand(1);
+ RHSBO = 0;
+ }
+
+ // Okay, now we know that the LHS is a nested expression and that the RHS is
+ // not. Perform reassociation.
+ assert(!isReassociableOp(RHS, Opcode) && "LinearizeExpr failed!");
+
+ // Move LHS right before I to make sure that the tree expression dominates all
+ // values.
+ LHSBO->moveBefore(I);
+
+ // Linearize the expression tree on the LHS.
+ LinearizeExprTree(LHSBO, Ops);
+
+ // Remember the RHS operand and its rank.
+ Ops.push_back(ValueEntry(getRank(RHS), RHS));
+
+ // Clear the RHS leaf out.
+ I->setOperand(1, UndefValue::get(I->getType()));
+}
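+
+// For illustration (hypothetical IR): linearizing %i in
+//   %t = add i32 %a, %b
+//   %i = add i32 %t, %c
+// leaves Ops = [%a, %b, %c] (each with its rank) and clears the leaves, so
+// %t becomes 'add i32 undef, undef' and %i becomes 'add i32 %t, undef';
+// RewriteExprTree must run before this IR is usable again.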
+
+// RewriteExprTree - Now that the operands for this expression tree are
+// linearized and optimized, emit them in-order. This function is written to be
+// tail recursive.
+void Reassociate::RewriteExprTree(BinaryOperator *I,
+ std::vector<ValueEntry> &Ops,
+ unsigned i) {
+ if (i+2 == Ops.size()) {
+ if (I->getOperand(0) != Ops[i].Op ||
+ I->getOperand(1) != Ops[i+1].Op) {
+ Value *OldLHS = I->getOperand(0);
+ DOUT << "RA: " << *I;
+ I->setOperand(0, Ops[i].Op);
+ I->setOperand(1, Ops[i+1].Op);
+ DOUT << "TO: " << *I;
+ MadeChange = true;
+ ++NumChanged;
+
+      // If we reassociated a tree to fewer operands (e.g. (1+a+2) -> (a+3)),
+      // delete the extra, now dead, nodes.
+ RemoveDeadBinaryOp(OldLHS);
+ }
+ return;
+ }
+ assert(i+2 < Ops.size() && "Ops index out of range!");
+
+ if (I->getOperand(1) != Ops[i].Op) {
+ DOUT << "RA: " << *I;
+ I->setOperand(1, Ops[i].Op);
+ DOUT << "TO: " << *I;
+ MadeChange = true;
+ ++NumChanged;
+ }
+
+ BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
+ assert(LHS->getOpcode() == I->getOpcode() &&
+ "Improper expression tree!");
+
+  // Compact the tree instructions together to guarantee that the expression
+  // tree is dominated by all of Ops.
+ LHS->moveBefore(I);
+ RewriteExprTree(LHS, Ops, i+1);
+}
+
+
+
+// NegateValue - Insert instructions before the instruction pointed to by BI
+// that compute the negated version of the specified value. The negated value
+// is returned, and BI is left pointing at the instruction that should be
+// processed next by the reassociation pass.
+//
+static Value *NegateValue(Value *V, Instruction *BI) {
+  // We are trying to expose opportunities for reassociation. One of the things
+ // that we want to do to achieve this is to push a negation as deep into an
+ // expression chain as possible, to expose the add instructions. In practice,
+ // this means that we turn this:
+ // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
+  // so that later, e.g., Y = 12+X can be reassociated with the -12 to
+  // eliminate the constants. We assume that instcombine will clean up the
+  // mess later if we introduce tons of unnecessary negation instructions...
+ //
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (I->getOpcode() == Instruction::Add && I->hasOneUse()) {
+ // Push the negates through the add.
+ I->setOperand(0, NegateValue(I->getOperand(0), BI));
+ I->setOperand(1, NegateValue(I->getOperand(1), BI));
+
+ // We must move the add instruction here, because the neg instructions do
+ // not dominate the old add instruction in general. By moving it, we are
+ // assured that the neg instructions we just inserted dominate the
+ // instruction we are about to insert after them.
+ //
+ I->moveBefore(BI);
+ I->setName(I->getName()+".neg");
+ return I;
+ }
+
+ // Insert a 'neg' instruction that subtracts the value from zero to get the
+ // negation.
+ //
+ return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI);
+}
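+
+// For illustration (hypothetical IR): negating the one-use add %s in
+//   %s = add i32 %a, %b
+// pushes the negation down and produces
+//   %a.neg = sub i32 0, %a
+//   %b.neg = sub i32 0, %b
+//   %s.neg = add i32 %a.neg, %b.neg   ; the original add, moved and renamed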
+
+/// ShouldBreakUpSubtract - Return true if we should break up this subtract of
+/// X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+ // If this is a negation, we can't split it up!
+ if (BinaryOperator::isNeg(Sub))
+ return false;
+
+  // Don't bother to break this up unless either operand is a reassociable add
+  // or subtract, or the subtract's only user is a reassociable add or subtract.
+ if (isReassociableOp(Sub->getOperand(0), Instruction::Add) ||
+ isReassociableOp(Sub->getOperand(0), Instruction::Sub))
+ return true;
+ if (isReassociableOp(Sub->getOperand(1), Instruction::Add) ||
+ isReassociableOp(Sub->getOperand(1), Instruction::Sub))
+ return true;
+ if (Sub->hasOneUse() &&
+ (isReassociableOp(Sub->use_back(), Instruction::Add) ||
+ isReassociableOp(Sub->use_back(), Instruction::Sub)))
+ return true;
+
+ return false;
+}
+
+/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is
+/// only used by an add, transform this into (X+(0-Y)) to promote better
+/// reassociation.
+static Instruction *BreakUpSubtract(Instruction *Sub,
+ std::map<AssertingVH<>, unsigned> &ValueRankMap) {
+ // Convert a subtract into an add and a neg instruction... so that sub
+ // instructions can be commuted with other add instructions...
+ //
+ // Calculate the negative value of Operand 1 of the sub instruction...
+ // and set it as the RHS of the add instruction we just made...
+ //
+ Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
+ Instruction *New =
+ BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
+ New->takeName(Sub);
+
+ // Everyone now refers to the add instruction.
+ ValueRankMap.erase(Sub);
+ Sub->replaceAllUsesWith(New);
+ Sub->eraseFromParent();
+
+ DOUT << "Negated: " << *New;
+ return New;
+}
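+
+// For illustration (hypothetical IR):
+//   %x = sub i32 %a, %b
+// becomes
+//   %b.neg = sub i32 0, %b
+//   %x     = add i32 %a, %b.neg
+// so the resulting add can commute with neighboring adds during reassociation.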
+
+/// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used
+/// by one, change this into a multiply by a constant to assist with further
+/// reassociation.
+static Instruction *ConvertShiftToMul(Instruction *Shl,
+ std::map<AssertingVH<>, unsigned> &ValueRankMap) {
+ // If an operand of this shift is a reassociable multiply, or if the shift
+ // is used by a reassociable multiply or add, turn into a multiply.
+ if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) ||
+ (Shl->hasOneUse() &&
+ (isReassociableOp(Shl->use_back(), Instruction::Mul) ||
+ isReassociableOp(Shl->use_back(), Instruction::Add)))) {
+ Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+ MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+
+ Instruction *Mul = BinaryOperator::CreateMul(Shl->getOperand(0), MulCst,
+ "", Shl);
+ ValueRankMap.erase(Shl);
+ Mul->takeName(Shl);
+ Shl->replaceAllUsesWith(Mul);
+ Shl->eraseFromParent();
+ return Mul;
+ }
+ return 0;
+}
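+
+// For illustration (hypothetical IR): given a reassociable multiply user,
+//   %t = shl i32 %x, 3
+// is rewritten to
+//   %t = mul i32 %x, 8
+// since (1 << 3) == 8.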
+
+// Scan backwards and forwards among values with the same rank as element i to
+// see if X exists. If X does not exist, return i.
+static unsigned FindInOperandList(std::vector<ValueEntry> &Ops, unsigned i,
+ Value *X) {
+ unsigned XRank = Ops[i].Rank;
+ unsigned e = Ops.size();
+ for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j)
+ if (Ops[j].Op == X)
+ return j;
+ // Scan backwards
+ for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j)
+ if (Ops[j].Op == X)
+ return j;
+ return i;
+}
+
+/// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together
+/// and returning the result. Insert the tree before I.
+static Value *EmitAddTreeOfValues(Instruction *I, std::vector<Value*> &Ops) {
+ if (Ops.size() == 1) return Ops.back();
+
+ Value *V1 = Ops.back();
+ Ops.pop_back();
+ Value *V2 = EmitAddTreeOfValues(I, Ops);
+ return BinaryOperator::CreateAdd(V2, V1, "tmp", I);
+}
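+
+// For illustration (hypothetical values; names are auto-uniqued):
+// Ops = [%a, %b, %c] emits roughly
+//   %tmp  = add i32 %a, %b
+//   %tmp1 = add i32 %tmp, %c
+// i.e. the left-linear chain ((%a + %b) + %c), inserted before I.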
+
+/// RemoveFactorFromExpression - If V is an expression tree that is a
+/// multiplication sequence, and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
+Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+ if (!BO) return 0;
+
+ std::vector<ValueEntry> Factors;
+ LinearizeExprTree(BO, Factors);
+
+ bool FoundFactor = false;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i)
+ if (Factors[i].Op == Factor) {
+ FoundFactor = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+ if (!FoundFactor) {
+ // Make sure to restore the operands to the expression tree.
+ RewriteExprTree(BO, Factors);
+ return 0;
+ }
+
+ if (Factors.size() == 1) return Factors[0].Op;
+
+ RewriteExprTree(BO, Factors);
+ return BO;
+}
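+
+// For illustration (hypothetical values): with V == %a*%b*%c as a reassociable
+// multiply tree and Factor == %b, the factor list becomes [%a, %c] and the
+// rewritten tree computing %a*%c is returned. If Factor does not occur, the
+// tree is restored unchanged and null is returned.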
+
+/// FindSingleUseMultiplyFactors - If V is a single-use multiply, recursively
+/// add its operands as factors, otherwise add V to the list of factors.
+static void FindSingleUseMultiplyFactors(Value *V,
+ std::vector<Value*> &Factors) {
+ BinaryOperator *BO;
+ if ((!V->hasOneUse() && !V->use_empty()) ||
+ !(BO = dyn_cast<BinaryOperator>(V)) ||
+ BO->getOpcode() != Instruction::Mul) {
+ Factors.push_back(V);
+ return;
+ }
+
+ // Otherwise, add the LHS and RHS to the list of factors.
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
+}
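+
+// For illustration (hypothetical values): for the single-use tree
+// (%a * %b) * %c this produces Factors == [%c, %b, %a], since each call
+// pushes the right operand before recursing into the left.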
+
+
+
+Value *Reassociate::OptimizeExpression(BinaryOperator *I,
+ std::vector<ValueEntry> &Ops) {
+ // Now that we have the linearized expression tree, try to optimize it.
+ // Start by folding any constants that we found.
+ bool IterateOptimization = false;
+ if (Ops.size() == 1) return Ops[0].Op;
+
+ unsigned Opcode = I->getOpcode();
+
+ if (Constant *V1 = dyn_cast<Constant>(Ops[Ops.size()-2].Op))
+ if (Constant *V2 = dyn_cast<Constant>(Ops.back().Op)) {
+ Ops.pop_back();
+ Ops.back().Op = ConstantExpr::get(Opcode, V1, V2);
+ return OptimizeExpression(I, Ops);
+ }
+
+ // Check for destructive annihilation due to a constant being used.
+ if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Ops.back().Op))
+ switch (Opcode) {
+ default: break;
+ case Instruction::And:
+ if (CstVal->isZero()) { // ... & 0 -> 0
+ ++NumAnnihil;
+ return CstVal;
+ } else if (CstVal->isAllOnesValue()) { // ... & -1 -> ...
+ Ops.pop_back();
+ }
+ break;
+ case Instruction::Mul:
+ if (CstVal->isZero()) { // ... * 0 -> 0
+ ++NumAnnihil;
+ return CstVal;
+      } else if (CstVal->isOne()) {
+ Ops.pop_back(); // ... * 1 -> ...
+ }
+ break;
+ case Instruction::Or:
+ if (CstVal->isAllOnesValue()) { // ... | -1 -> -1
+ ++NumAnnihil;
+ return CstVal;
+ }
+ // FALLTHROUGH!
+ case Instruction::Add:
+ case Instruction::Xor:
+ if (CstVal->isZero()) // ... [|^+] 0 -> ...
+ Ops.pop_back();
+ break;
+ }
+ if (Ops.size() == 1) return Ops[0].Op;
+
+  // Handle destructive annihilation due to identities between elements in the
+  // argument list here.
+ switch (Opcode) {
+ default: break;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+ // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ // First, check for X and ~X in the operand list.
+ assert(i < Ops.size());
+ if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^.
+ Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX != i) {
+ if (Opcode == Instruction::And) { // ...&X&~X = 0
+ ++NumAnnihil;
+ return Constant::getNullValue(X->getType());
+ } else if (Opcode == Instruction::Or) { // ...|X|~X = -1
+ ++NumAnnihil;
+ return ConstantInt::getAllOnesValue(X->getType());
+ }
+ }
+ }
+
+ // Next, check for duplicate pairs of values, which we assume are next to
+ // each other, due to our sorting criteria.
+ assert(i < Ops.size());
+ if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+ if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+ // Drop duplicate values.
+ Ops.erase(Ops.begin()+i);
+ --i; --e;
+ IterateOptimization = true;
+ ++NumAnnihil;
+ } else {
+ assert(Opcode == Instruction::Xor);
+ if (e == 2) {
+ ++NumAnnihil;
+ return Constant::getNullValue(Ops[0].Op->getType());
+ }
+ // ... X^X -> ...
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+ i -= 1; e -= 2;
+ IterateOptimization = true;
+ ++NumAnnihil;
+ }
+ }
+ }
+ break;
+
+ case Instruction::Add:
+ // Scan the operand lists looking for X and -X pairs. If we find any, we
+ // can simplify the expression. X+-X == 0.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ assert(i < Ops.size());
+ // Check for X and -X in the operand list.
+ if (BinaryOperator::isNeg(Ops[i].Op)) {
+ Value *X = BinaryOperator::getNegArgument(Ops[i].Op);
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX != i) {
+ // Remove X and -X from the operand list.
+ if (Ops.size() == 2) {
+ ++NumAnnihil;
+ return Constant::getNullValue(X->getType());
+ } else {
+ Ops.erase(Ops.begin()+i);
+ if (i < FoundX)
+ --FoundX;
+ else
+ --i; // Need to back up an extra one.
+ Ops.erase(Ops.begin()+FoundX);
+ IterateOptimization = true;
+ ++NumAnnihil;
+ --i; // Revisit element.
+ e -= 2; // Removed two elements.
+ }
+ }
+ }
+ }
+
+
+ // Scan the operand list, checking to see if there are any common factors
+ // between operands. Consider something like A*A+A*B*C+D. We would like to
+ // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+ // To efficiently find this, we count the number of times a factor occurs
+ // for any ADD operands that are MULs.
+ std::map<Value*, unsigned> FactorOccurrences;
+ unsigned MaxOcc = 0;
+ Value *MaxOccVal = 0;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op)) {
+ if (BOp->getOpcode() == Instruction::Mul && BOp->use_empty()) {
+ // Compute all of the factors of this added value.
+ std::vector<Value*> Factors;
+ FindSingleUseMultiplyFactors(BOp, Factors);
+ assert(Factors.size() > 1 && "Bad linearize!");
+
+ // Add one to FactorOccurrences for each unique factor in this op.
+ if (Factors.size() == 2) {
+ unsigned Occ = ++FactorOccurrences[Factors[0]];
+ if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[0]; }
+ if (Factors[0] != Factors[1]) { // Don't double count A*A.
+ Occ = ++FactorOccurrences[Factors[1]];
+ if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[1]; }
+ }
+ } else {
+ std::set<Value*> Duplicates;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ if (Duplicates.insert(Factors[i]).second) {
+ unsigned Occ = ++FactorOccurrences[Factors[i]];
+ if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[i]; }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // If any factor occurred more than one time, we can pull it out.
+ if (MaxOcc > 1) {
+ DOUT << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << "\n";
+
+    // Create a new instruction that uses the MaxOccVal twice. Without this,
+    // removing a factor from an expression could drop a use of MaxOccVal,
+    // which can cause RemoveFactorFromExpression on successive values to
+    // behave differently.
+ Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal);
+ std::vector<Value*> NewMulOps;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+ NewMulOps.push_back(V);
+ Ops.erase(Ops.begin()+i);
+ --i; --e;
+ }
+ }
+
+ // No need for extra uses anymore.
+ delete DummyInst;
+
+ unsigned NumAddedValues = NewMulOps.size();
+ Value *V = EmitAddTreeOfValues(I, NewMulOps);
+ Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
+
+ // Now that we have inserted V and its sole use, optimize it. This allows
+ // us to handle cases that require multiple factoring steps, such as this:
+ // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
+ if (NumAddedValues > 1)
+ ReassociateExpression(cast<BinaryOperator>(V));
+
+ ++NumFactor;
+
+ if (Ops.empty())
+ return V2;
+
+ // Add the new value to the list of things being added.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+
+ // Rewrite the tree so that there is now a use of V.
+ RewriteExprTree(I, Ops);
+ return OptimizeExpression(I, Ops);
+ }
+ break;
+ //case Instruction::Mul:
+ }
+
+ if (IterateOptimization)
+ return OptimizeExpression(I, Ops);
+ return 0;
+}
+
+
+/// ReassociateBB - Inspect all of the instructions in this basic block,
+/// reassociating them as we go.
+void Reassociate::ReassociateBB(BasicBlock *BB) {
+ for (BasicBlock::iterator BBI = BB->begin(); BBI != BB->end(); ) {
+ Instruction *BI = BBI++;
+ if (BI->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(BI->getOperand(1)))
+ if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) {
+ MadeChange = true;
+ BI = NI;
+ }
+
+ // Reject cases where it is pointless to do this.
+ if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPoint() ||
+ isa<VectorType>(BI->getType()))
+ continue; // Floating point ops are not associative.
+
+ // If this is a subtract instruction which is not already in negate form,
+ // see if we can convert it to X+-Y.
+ if (BI->getOpcode() == Instruction::Sub) {
+ if (ShouldBreakUpSubtract(BI)) {
+ BI = BreakUpSubtract(BI, ValueRankMap);
+ MadeChange = true;
+ } else if (BinaryOperator::isNeg(BI)) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(BI->getOperand(1), Instruction::Mul) &&
+ (!BI->hasOneUse() ||
+ !isReassociableOp(BI->use_back(), Instruction::Mul))) {
+ BI = LowerNegateToMultiply(BI, ValueRankMap);
+ MadeChange = true;
+ }
+ }
+ }
+
+ // If this instruction is a commutative binary operator, process it.
+ if (!BI->isAssociative()) continue;
+ BinaryOperator *I = cast<BinaryOperator>(BI);
+
+ // If this is an interior node of a reassociable tree, ignore it until we
+ // get to the root of the tree, to avoid N^2 analysis.
+ if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode()))
+ continue;
+
+ // If this is an add tree that is used by a sub instruction, ignore it
+ // until we process the subtract.
+ if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
+ cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
+ continue;
+
+ ReassociateExpression(I);
+ }
+}
+
+void Reassociate::ReassociateExpression(BinaryOperator *I) {
+
+  // First, walk the expression tree, linearizing the tree and collecting all
+  // of the operand information.
+ std::vector<ValueEntry> Ops;
+ LinearizeExprTree(I, Ops);
+
+ DOUT << "RAIn:\t"; DEBUG(PrintOps(I, Ops)); DOUT << "\n";
+
+ // Now that we have linearized the tree to a list and have gathered all of
+ // the operands and their ranks, sort the operands by their rank. Use a
+ // stable_sort so that values with equal ranks will have their relative
+ // positions maintained (and so the compiler is deterministic). Note that
+ // this sorts so that the highest ranking values end up at the beginning of
+ // the vector.
+ std::stable_sort(Ops.begin(), Ops.end());
+
+ // OptimizeExpression - Now that we have the expression tree in a convenient
+ // sorted form, optimize it globally if possible.
+ if (Value *V = OptimizeExpression(I, Ops)) {
+    // This expression tree simplified to something that isn't a tree;
+    // eliminate it.
+ DOUT << "Reassoc to scalar: " << *V << "\n";
+ I->replaceAllUsesWith(V);
+ RemoveDeadBinaryOp(I);
+ return;
+ }
+
+ // We want to sink immediates as deeply as possible except in the case where
+ // this is a multiply tree used only by an add, and the immediate is a -1.
+ // In this case we reassociate to put the negation on the outside so that we
+ // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+ if (I->getOpcode() == Instruction::Mul && I->hasOneUse() &&
+ cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(Ops.back().Op) &&
+ cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+ Ops.insert(Ops.begin(), Ops.back());
+ Ops.pop_back();
+ }
+
+ DOUT << "RAOut:\t"; DEBUG(PrintOps(I, Ops)); DOUT << "\n";
+
+ if (Ops.size() == 1) {
+    // This expression tree simplified to something that isn't a tree;
+    // eliminate it.
+ I->replaceAllUsesWith(Ops[0].Op);
+ RemoveDeadBinaryOp(I);
+ } else {
+ // Now that we ordered and optimized the expressions, splat them back into
+ // the expression tree, removing any unneeded nodes.
+ RewriteExprTree(I, Ops);
+ }
+}
+
+
+bool Reassociate::runOnFunction(Function &F) {
+ // Recalculate the rank map for F
+ BuildRankMap(F);
+
+ MadeChange = false;
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+ ReassociateBB(FI);
+
+ // We are done with the rank map...
+ RankMap.clear();
+ ValueRankMap.clear();
+ return MadeChange;
+}
+
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 0000000..46b2952
--- /dev/null
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,125 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references. It is intended to be
+// the inverse of PromoteMemoryToRegister. By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
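+// For illustration (hypothetical IR; the names are made up), a value %x
+// defined in the entry block and used in a later block:
+//   %x = add i32 %a, %b            ; entry
+//   %y = add i32 %x, 1             ; next
+// is demoted to roughly:
+//   %x.slot = alloca i32           ; entry
+//   %x = add i32 %a, %b
+//   store i32 %x, i32* %x.slot
+//   %x.reload = load i32* %x.slot  ; next
+//   %y = add i32 %x.reload, 1
+//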
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reg2mem"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CFG.h"
+#include <list>
+using namespace llvm;
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
+namespace {
+ struct VISIBILITY_HIDDEN RegToMem : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ RegToMem() : FunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(BreakCriticalEdgesID);
+ AU.addPreservedID(BreakCriticalEdgesID);
+ }
+
+ bool valueEscapes(Instruction* i) {
+ BasicBlock* bb = i->getParent();
+ for (Value::use_iterator ii = i->use_begin(), ie = i->use_end();
+ ii != ie; ++ii)
+ if (cast<Instruction>(*ii)->getParent() != bb ||
+ isa<PHINode>(*ii))
+ return true;
+ return false;
+ }
+
+ virtual bool runOnFunction(Function &F) {
+ if (!F.isDeclaration()) {
+ // Insert all new allocas into entry block.
+ BasicBlock* BBEntry = &F.getEntryBlock();
+ assert(pred_begin(BBEntry) == pred_end(BBEntry) &&
+ "Entry block to function must not have predecessors!");
+
+      // Find the first non-alloca instruction and create an insertion point.
+      // This is safe if the block is well-formed: it always has a terminator;
+      // otherwise we'll trip an assertion.
+ BasicBlock::iterator I = BBEntry->begin();
+ while (isa<AllocaInst>(I)) ++I;
+
+ CastInst *AllocaInsertionPoint =
+ CastInst::Create(Instruction::BitCast,
+ Constant::getNullValue(Type::Int32Ty), Type::Int32Ty,
+ "reg2mem alloca point", I);
+
+      // Find the escaping instructions, but don't create stack slots for
+      // allocas already in the entry block.
+ std::list<Instruction*> worklist;
+ for (Function::iterator ibb = F.begin(), ibe = F.end();
+ ibb != ibe; ++ibb)
+ for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+ iib != iie; ++iib) {
+ if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
+ valueEscapes(iib)) {
+ worklist.push_front(&*iib);
+ }
+ }
+
+ // Demote escaped instructions
+ NumRegsDemoted += worklist.size();
+ for (std::list<Instruction*>::iterator ilb = worklist.begin(),
+ ile = worklist.end(); ilb != ile; ++ilb)
+ DemoteRegToStack(**ilb, false, AllocaInsertionPoint);
+
+ worklist.clear();
+
+      // Find all phi nodes.
+ for (Function::iterator ibb = F.begin(), ibe = F.end();
+ ibb != ibe; ++ibb)
+ for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+ iib != iie; ++iib)
+ if (isa<PHINode>(iib))
+ worklist.push_front(&*iib);
+
+ // Demote phi nodes
+ NumPhisDemoted += worklist.size();
+ for (std::list<Instruction*>::iterator ilb = worklist.begin(),
+ ile = worklist.end(); ilb != ile; ++ilb)
+ DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint);
+
+ return true;
+ }
+ return false;
+ }
+ };
+}
+
+char RegToMem::ID = 0;
+static RegisterPass<RegToMem>
+X("reg2mem", "Demote all values to stack slots");
+
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+//
+const PassInfo *const llvm::DemoteRegisterToMemoryID = &X;
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+ return new RegToMem();
+}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
new file mode 100644
index 0000000..d73519c
--- /dev/null
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -0,0 +1,1855 @@
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+// * Assumes values are constant unless proven otherwise
+// * Assumes BasicBlocks are dead unless proven otherwise
+// * Proves values to be constant, and replaces them with constants
+// * Proves conditional branches to be unconditional
+//
+// Notice that:
+// * This pass has a habit of making definitions be dead. It is a good idea
+// to run a DCE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sccp"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumDeadBlocks , "Number of basic blocks unreachable by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+
+namespace {
+/// LatticeVal class - This class represents the different lattice values that
+/// an LLVM value may occupy. It is a simple class with value semantics.
+///
+class VISIBILITY_HIDDEN LatticeVal {
+ enum {
+ /// undefined - This LLVM Value has no known value yet.
+ undefined,
+
+ /// constant - This LLVM Value has a specific constant value.
+ constant,
+
+ /// forcedconstant - This LLVM Value was thought to be undef until
+ /// ResolvedUndefsIn. This is treated just like 'constant', but if merged
+ /// with another (different) constant, it goes to overdefined, instead of
+ /// asserting.
+ forcedconstant,
+
+ /// overdefined - This instruction is not known to be constant, and we know
+ /// it has a value.
+ overdefined
+ } LatticeValue; // The current lattice position
+
+ Constant *ConstantVal; // If Constant value, the current value
+public:
+ inline LatticeVal() : LatticeValue(undefined), ConstantVal(0) {}
+
+ // markOverdefined - Return true if this is a new status to be in...
+ inline bool markOverdefined() {
+ if (LatticeValue != overdefined) {
+ LatticeValue = overdefined;
+ return true;
+ }
+ return false;
+ }
+
+ // markConstant - Return true if this is a new status for us.
+ inline bool markConstant(Constant *V) {
+ if (LatticeValue != constant) {
+ if (LatticeValue == undefined) {
+ LatticeValue = constant;
+ assert(V && "Marking constant with NULL");
+ ConstantVal = V;
+ } else {
+ assert(LatticeValue == forcedconstant &&
+ "Cannot move from overdefined to constant!");
+ // Stay at forcedconstant if the constant is the same.
+ if (V == ConstantVal) return false;
+
+ // Otherwise, we go to overdefined. Assumptions made based on the
+ // forced value are possibly wrong. Assuming this is another constant
+ // could expose a contradiction.
+ LatticeValue = overdefined;
+ }
+ return true;
+ } else {
+ assert(ConstantVal == V && "Marking constant with different value");
+ }
+ return false;
+ }
+
+ inline void markForcedConstant(Constant *V) {
+ assert(LatticeValue == undefined && "Can't force a defined value!");
+ LatticeValue = forcedconstant;
+ ConstantVal = V;
+ }
+
+ inline bool isUndefined() const { return LatticeValue == undefined; }
+ inline bool isConstant() const {
+ return LatticeValue == constant || LatticeValue == forcedconstant;
+ }
+ inline bool isOverdefined() const { return LatticeValue == overdefined; }
+
+ inline Constant *getConstant() const {
+ assert(isConstant() && "Cannot get the constant of a non-constant!");
+ return ConstantVal;
+ }
+};
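+
+// Example transitions (illustrative): a LatticeVal starts 'undefined'; a call
+// to markConstant(C) moves it to 'constant' C; a later markConstant(D) with
+// D != C asserts unless the value was 'forcedconstant', in which case it
+// falls to 'overdefined'. markOverdefined() is a legal downward move from any
+// state.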
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+ DenseSet<BasicBlock*> BBExecutable;// The basic blocks that are executable
+ std::map<Value*, LatticeVal> ValueState; // The state each value is in.
+
+  /// TrackedGlobals - If we are tracking any values for the contents of a
+  /// global variable, we keep a mapping from the constant accessor to the
+  /// element of the global, to the currently known value. If the value
+  /// becomes overdefined, its entry is simply removed from this map.
+ DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;
+
+ /// TrackedRetVals - If we are tracking arguments into and the return
+ /// value out of a function, it will have an entry in this map, indicating
+ /// what the known return value for the function is.
+ DenseMap<Function*, LatticeVal> TrackedRetVals;
+
+ /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+ /// that return multiple values.
+ DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;
+
+ // The reason for two worklists is that overdefined is the lowest state
+ // on the lattice, and moving things to overdefined as fast as possible
+ // makes SCCP converge much faster.
+ // By having a separate worklist, we accomplish this because everything
+ // possibly overdefined will become overdefined at the soonest possible
+ // point.
+ SmallVector<Value*, 64> OverdefinedInstWorkList;
+ SmallVector<Value*, 64> InstWorkList;
+
+
+ SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list
+
+ /// UsersOfOverdefinedPHIs - Keep track of any users of PHI nodes that are not
+ /// overdefined, despite the fact that the PHI node is overdefined.
+ std::multimap<PHINode*, Instruction*> UsersOfOverdefinedPHIs;
+
+ /// KnownFeasibleEdges - Entries in this set are edges which have already had
+ /// PHI nodes retriggered.
+ typedef std::pair<BasicBlock*, BasicBlock*> Edge;
+ DenseSet<Edge> KnownFeasibleEdges;
+public:
+
+ /// MarkBlockExecutable - This method can be used by clients to mark all of
+ /// the blocks that are known to be intrinsically live in the processed unit.
+ void MarkBlockExecutable(BasicBlock *BB) {
+ DOUT << "Marking Block Executable: " << BB->getNameStart() << "\n";
+ BBExecutable.insert(BB); // Basic block is executable!
+ BBWorkList.push_back(BB); // Add the block to the work list!
+ }
+
+ /// TrackValueOfGlobalVariable - Clients can use this method to
+ /// inform the SCCPSolver that it should track loads and stores to the
+ /// specified global variable if it can. This is only legal to call if
+ /// performing Interprocedural SCCP.
+ void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+ const Type *ElTy = GV->getType()->getElementType();
+ if (ElTy->isFirstClassType()) {
+ LatticeVal &IV = TrackedGlobals[GV];
+ if (!isa<UndefValue>(GV->getInitializer()))
+ IV.markConstant(GV->getInitializer());
+ }
+ }
+
+ /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+ /// and out of the specified function (which cannot have its address taken),
+ /// this method must be called.
+ void AddTrackedFunction(Function *F) {
+ assert(F->hasLocalLinkage() && "Can only track internal functions!");
+ // Add an entry, F -> undef.
+ if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
+ LatticeVal()));
+ } else
+ TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+ }
+
+ /// Solve - Solve for constants and executable blocks.
+ ///
+ void Solve();
+
+ /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+ /// that branches on undef values cannot reach any of their successors.
+ /// However, this is not a safe assumption. After we solve dataflow, this
+  /// method should be used to handle this. If it returns true, the solver
+  /// should be rerun.
+ bool ResolvedUndefsIn(Function &F);
+
+ bool isBlockExecutable(BasicBlock *BB) const {
+ return BBExecutable.count(BB);
+ }
+
+ /// getValueMapping - Once we have solved for constants, return the mapping of
+ /// LLVM values to LatticeVals.
+ std::map<Value*, LatticeVal> &getValueMapping() {
+ return ValueState;
+ }
+
+ /// getTrackedRetVals - Get the inferred return value map.
+ ///
+ const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
+ return TrackedRetVals;
+ }
+
+ /// getTrackedGlobals - Get and return the set of inferred initializers for
+ /// global variables.
+ const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
+ return TrackedGlobals;
+ }
+
+ inline void markOverdefined(Value *V) {
+ markOverdefined(ValueState[V], V);
+ }
+
+private:
+ // markConstant - Make a value be marked as "constant". If the value
+ // is not already a constant, add it to the instruction work list so that
+ // the users of the instruction are updated later.
+ //
+ inline void markConstant(LatticeVal &IV, Value *V, Constant *C) {
+ if (IV.markConstant(C)) {
+ DOUT << "markConstant: " << *C << ": " << *V;
+ InstWorkList.push_back(V);
+ }
+ }
+
+ inline void markForcedConstant(LatticeVal &IV, Value *V, Constant *C) {
+ IV.markForcedConstant(C);
+ DOUT << "markForcedConstant: " << *C << ": " << *V;
+ InstWorkList.push_back(V);
+ }
+
+ inline void markConstant(Value *V, Constant *C) {
+ markConstant(ValueState[V], V, C);
+ }
+
+ // markOverdefined - Make a value be marked as "overdefined". If the
+ // value is not already overdefined, add it to the overdefined instruction
+ // work list so that the users of the instruction are updated later.
+ inline void markOverdefined(LatticeVal &IV, Value *V) {
+ if (IV.markOverdefined()) {
+ DEBUG(DOUT << "markOverdefined: ";
+ if (Function *F = dyn_cast<Function>(V))
+ DOUT << "Function '" << F->getName() << "'\n";
+ else
+ DOUT << *V);
+ // Only instructions go on the work list
+ OverdefinedInstWorkList.push_back(V);
+ }
+ }
+
+ inline void mergeInValue(LatticeVal &IV, Value *V, LatticeVal &MergeWithV) {
+ if (IV.isOverdefined() || MergeWithV.isUndefined())
+ return; // Noop.
+ if (MergeWithV.isOverdefined())
+ markOverdefined(IV, V);
+ else if (IV.isUndefined())
+ markConstant(IV, V, MergeWithV.getConstant());
+ else if (IV.getConstant() != MergeWithV.getConstant())
+ markOverdefined(IV, V);
+ }
+
+ inline void mergeInValue(Value *V, LatticeVal &MergeWithV) {
+ return mergeInValue(ValueState[V], V, MergeWithV);
+ }
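+
+  // For illustration (hypothetical values): merging constant 4 into an
+  // undefined value yields constant 4; merging constant 5 into constant 4
+  // yields overdefined; merging an undefined value into anything is a noop.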
+
+
+  // getValueState - Return the LatticeVal object that corresponds to the value.
+  // This function is necessary because not all values should start out in the
+  // undefined state... Arguments should be overdefined, and
+  // constants should be marked as constants. If a value is not known to be an
+  // Instruction object, then use this accessor to get its value from the map.
+ //
+ inline LatticeVal &getValueState(Value *V) {
+ std::map<Value*, LatticeVal>::iterator I = ValueState.find(V);
+ if (I != ValueState.end()) return I->second; // Common case, in the map
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ if (isa<UndefValue>(V)) {
+ // Nothing to do, remain undefined.
+ } else {
+ LatticeVal &LV = ValueState[C];
+ LV.markConstant(C); // Constants are constant
+ return LV;
+ }
+ }
+    // All others are undefined by default...
+ return ValueState[V];
+ }
+
+ // markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+ // work list if it is not already executable...
+ //
+ void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+ return; // This edge is already known to be executable!
+
+ if (BBExecutable.count(Dest)) {
+ DOUT << "Marking Edge Executable: " << Source->getNameStart()
+ << " -> " << Dest->getNameStart() << "\n";
+
+ // The destination is already executable, but we just made an edge
+ // feasible that wasn't before. Revisit the PHI nodes in the block
+ // because they have potentially new operands.
+ for (BasicBlock::iterator I = Dest->begin(); isa<PHINode>(I); ++I)
+ visitPHINode(*cast<PHINode>(I));
+
+ } else {
+ MarkBlockExecutable(Dest);
+ }
+ }
+
+ // getFeasibleSuccessors - Return a vector of booleans to indicate which
+ // successors are reachable from a given terminator instruction.
+ //
+ void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs);
+
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible...
+ //
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
+  // OperandChangedState - This method is invoked on all of the users of an
+  // instruction whose state has just changed in some way. Based on this
+  // information, we need to update the specified user of this instruction.
+ //
+ void OperandChangedState(User *U) {
+ // Only instructions use other variable values!
+ Instruction &I = cast<Instruction>(*U);
+ if (BBExecutable.count(I.getParent())) // Inst is executable?
+ visit(I);
+ }
+
+private:
+ friend class InstVisitor<SCCPSolver>;
+
+ // visit implementations - Something changed in this instruction... Either an
+ // operand made a transition, or the instruction is newly executable. Change
+ // the value type of I to reflect these changes if appropriate.
+ //
+ void visitPHINode(PHINode &I);
+
+ // Terminators
+ void visitReturnInst(ReturnInst &I);
+ void visitTerminatorInst(TerminatorInst &TI);
+
+ void visitCastInst(CastInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitBinaryOperator(Instruction &I);
+ void visitCmpInst(CmpInst &I);
+ void visitExtractElementInst(ExtractElementInst &I);
+ void visitInsertElementInst(InsertElementInst &I);
+ void visitShuffleVectorInst(ShuffleVectorInst &I);
+ void visitExtractValueInst(ExtractValueInst &EVI);
+ void visitInsertValueInst(InsertValueInst &IVI);
+
+ // Instructions that cannot be folded away...
+ void visitStoreInst (Instruction &I);
+ void visitLoadInst (LoadInst &I);
+ void visitGetElementPtrInst(GetElementPtrInst &I);
+ void visitCallInst (CallInst &I) { visitCallSite(CallSite::get(&I)); }
+ void visitInvokeInst (InvokeInst &II) {
+ visitCallSite(CallSite::get(&II));
+ visitTerminatorInst(II);
+ }
+ void visitCallSite (CallSite CS);
+ void visitUnwindInst (TerminatorInst &I) { /*returns void*/ }
+ void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+ void visitAllocationInst(Instruction &I) { markOverdefined(&I); }
+ void visitVANextInst (Instruction &I) { markOverdefined(&I); }
+ void visitVAArgInst (Instruction &I) { markOverdefined(&I); }
+ void visitFreeInst (Instruction &I) { /*returns void*/ }
+
+ void visitInstruction(Instruction &I) {
+ // If a new instruction is added to LLVM that we don't handle...
+ cerr << "SCCP: Don't know how to handle: " << I;
+ markOverdefined(&I); // Just in case
+ }
+};
+
+} // end anonymous namespace
+
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.
+//
+void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+ SmallVector<bool, 16> &Succs) {
+ Succs.resize(TI.getNumSuccessors());
+ if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
+ if (BI->isUnconditional()) {
+ Succs[0] = true;
+ } else {
+ LatticeVal &BCValue = getValueState(BI->getCondition());
+ if (BCValue.isOverdefined() ||
+ (BCValue.isConstant() && !isa<ConstantInt>(BCValue.getConstant()))) {
+ // Overdefined condition variables, and branches on unfoldable constant
+ // conditions, mean the branch could go either way.
+ Succs[0] = Succs[1] = true;
+ } else if (BCValue.isConstant()) {
+ // Constant condition variables mean the branch can only go a single way
+ Succs[BCValue.getConstant() == ConstantInt::getFalse()] = true;
+ }
+ }
+ } else if (isa<InvokeInst>(&TI)) {
+    // An invoke instruction's successors are always executable.
+ Succs[0] = Succs[1] = true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) {
+ LatticeVal &SCValue = getValueState(SI->getCondition());
+ if (SCValue.isOverdefined() || // Overdefined condition?
+ (SCValue.isConstant() && !isa<ConstantInt>(SCValue.getConstant()))) {
+ // All destinations are executable!
+ Succs.assign(TI.getNumSuccessors(), true);
+ } else if (SCValue.isConstant())
+ Succs[SI->findCaseValue(cast<ConstantInt>(SCValue.getConstant()))] = true;
+ } else {
+ assert(0 && "SCCP: Don't know how to handle this terminator!");
+ }
+}
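+
+// For illustration (hypothetical values): for 'br i1 %c, label %t, label %f',
+// a constant-true %c marks only %t feasible, a constant-false %c only %f, and
+// an overdefined %c marks both successors feasible.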
+
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible...
+//
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+ assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+ // Make sure the source basic block is executable!!
+ if (!BBExecutable.count(From)) return false;
+
+ // Check to make sure this edge itself is actually feasible now...
+ TerminatorInst *TI = From->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isUnconditional())
+ return true;
+ else {
+ LatticeVal &BCValue = getValueState(BI->getCondition());
+ if (BCValue.isOverdefined()) {
+ // Overdefined condition variables mean the branch could go either way.
+ return true;
+ } else if (BCValue.isConstant()) {
+ // Not branching on an evaluatable constant?
+ if (!isa<ConstantInt>(BCValue.getConstant())) return true;
+
+ // Constant condition variables mean the branch can only go a single way
+ return BI->getSuccessor(BCValue.getConstant() ==
+ ConstantInt::getFalse()) == To;
+ }
+ return false;
+ }
+ } else if (isa<InvokeInst>(TI)) {
+    // An invoke instruction's successors are always executable.
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ LatticeVal &SCValue = getValueState(SI->getCondition());
+ if (SCValue.isOverdefined()) { // Overdefined condition?
+ // All destinations are executable!
+ return true;
+ } else if (SCValue.isConstant()) {
+ Constant *CPV = SCValue.getConstant();
+ if (!isa<ConstantInt>(CPV))
+ return true; // not a foldable constant?
+
+      // Make sure to skip the "default value", which isn't a case value.
+ for (unsigned i = 1, E = SI->getNumSuccessors(); i != E; ++i)
+ if (SI->getSuccessorValue(i) == CPV) // Found the taken branch...
+ return SI->getSuccessor(i) == To;
+
+ // Constant value not equal to any of the branches... must execute
+ // default branch then...
+ return SI->getDefaultDest() == To;
+ }
+ return false;
+ } else {
+ cerr << "Unknown terminator instruction: " << *TI;
+ abort();
+ }
+}
+
+// visit Implementations - Something changed in this instruction... Either an
+// operand made a transition, or the instruction is newly executable. Change
+// the value type of I to reflect these changes if appropriate. This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges two constants in, and has conflicting values coming
+// from different branches, or if the PHI node merges in an overdefined
+// value, then the PHI node becomes overdefined.
+// 2. If a phi node merges only constants in, and they all agree on value, the
+// PHI node becomes a constant value equal to that.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+// destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+// successors executable.
+//
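+// For illustration (hypothetical values): a PHI merging constant 4 along two
+// feasible edges stays constant 4; merging constants 4 and 5 goes to
+// overdefined; incoming values along infeasible edges are ignored entirely.
+//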
+void SCCPSolver::visitPHINode(PHINode &PN) {
+ LatticeVal &PNIV = getValueState(&PN);
+ if (PNIV.isOverdefined()) {
+ // There may be instructions using this PHI node that are not overdefined
+ // themselves. If so, make sure that they know that the PHI node operand
+ // changed.
+ std::multimap<PHINode*, Instruction*>::iterator I, E;
+ tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN);
+ if (I != E) {
+ SmallVector<Instruction*, 16> Users;
+ for (; I != E; ++I) Users.push_back(I->second);
+ while (!Users.empty()) {
+ visit(Users.back());
+ Users.pop_back();
+ }
+ }
+ return; // Quick exit
+ }
+
+ // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+ // and slow us down a lot. Just mark them overdefined.
+ if (PN.getNumIncomingValues() > 64) {
+ markOverdefined(PNIV, &PN);
+ return;
+ }
+
+ // Look at all of the executable operands of the PHI node. If any of them
+ // are overdefined, the PHI becomes overdefined as well. If they are all
+ // constant, and they agree with each other, the PHI becomes the identical
+ // constant. If they are constant and don't agree, the PHI is overdefined.
+ // If there are no executable operands, the PHI remains undefined.
+ //
+ Constant *OperandVal = 0;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LatticeVal &IV = getValueState(PN.getIncomingValue(i));
+ if (IV.isUndefined()) continue; // Doesn't influence PHI node.
+
+ if (isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) {
+ if (IV.isOverdefined()) { // PHI node becomes overdefined!
+ markOverdefined(&PN);
+ return;
+ }
+
+ if (OperandVal == 0) { // Grab the first value...
+ OperandVal = IV.getConstant();
+ } else { // Another value is being merged in!
+ // There is already a reachable operand. If we conflict with it,
+ // then the PHI node becomes overdefined. If we agree with it, we
+ // can continue on.
+
+ // Check to see if there are two different constants merging...
+ if (IV.getConstant() != OperandVal) {
+ // Yes there is. This means the PHI node is not constant.
+ // You must be overdefined poor PHI.
+ //
+ markOverdefined(&PN); // The PHI node now becomes overdefined
+ return; // I'm done analyzing you
+ }
+ }
+ }
+ }
+
+  // If we exited the loop, this means that the PHI node only has constant
+  // arguments that agree with each other (and OperandVal is the constant) or
+ // OperandVal is null because there are no defined incoming arguments. If
+ // this is the case, the PHI remains undefined.
+ //
+ if (OperandVal)
+ markConstant(&PN, OperandVal); // Acquire operand value
+}
+
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+ if (I.getNumOperands() == 0) return; // Ret void
+
+ Function *F = I.getParent()->getParent();
+ // If we are tracking the return value of this function, merge it in.
+ if (!F->hasLocalLinkage())
+ return;
+
+ if (!TrackedRetVals.empty() && I.getNumOperands() == 1) {
+ DenseMap<Function*, LatticeVal>::iterator TFRVI =
+ TrackedRetVals.find(F);
+ if (TFRVI != TrackedRetVals.end() &&
+ !TFRVI->second.isOverdefined()) {
+ LatticeVal &IV = getValueState(I.getOperand(0));
+ mergeInValue(TFRVI->second, F, IV);
+ return;
+ }
+ }
+
+ // Handle functions that return multiple values.
+ if (!TrackedMultipleRetVals.empty() && I.getNumOperands() > 1) {
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ DenseMap<std::pair<Function*, unsigned>, LatticeVal>::iterator
+ It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+ if (It == TrackedMultipleRetVals.end()) break;
+ mergeInValue(It->second, F, getValueState(I.getOperand(i)));
+ }
+ } else if (!TrackedMultipleRetVals.empty() &&
+ I.getNumOperands() == 1 &&
+ isa<StructType>(I.getOperand(0)->getType())) {
+ for (unsigned i = 0, e = I.getOperand(0)->getType()->getNumContainedTypes();
+ i != e; ++i) {
+ DenseMap<std::pair<Function*, unsigned>, LatticeVal>::iterator
+ It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+ if (It == TrackedMultipleRetVals.end()) break;
+ Value *Val = FindInsertedValue(I.getOperand(0), i);
+ mergeInValue(It->second, F, getValueState(Val));
+ }
+ }
+}
+
+void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+ SmallVector<bool, 16> SuccFeasible;
+ getFeasibleSuccessors(TI, SuccFeasible);
+
+ BasicBlock *BB = TI.getParent();
+
+ // Mark all feasible successors executable...
+ for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+ if (SuccFeasible[i])
+ markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SCCPSolver::visitCastInst(CastInst &I) {
+ Value *V = I.getOperand(0);
+ LatticeVal &VState = getValueState(V);
+ if (VState.isOverdefined()) // Inherit overdefinedness of operand
+ markOverdefined(&I);
+ else if (VState.isConstant()) // Propagate constant value
+ markConstant(&I, ConstantExpr::getCast(I.getOpcode(),
+ VState.getConstant(), I.getType()));
+}
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+ Value *Aggr = EVI.getAggregateOperand();
+
+ // If the operand to the extractvalue is an undef, the result is undef.
+ if (isa<UndefValue>(Aggr))
+ return;
+
+ // Currently only handle single-index extractvalues.
+ if (EVI.getNumIndices() != 1) {
+ markOverdefined(&EVI);
+ return;
+ }
+
+ Function *F = 0;
+ if (CallInst *CI = dyn_cast<CallInst>(Aggr))
+ F = CI->getCalledFunction();
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(Aggr))
+ F = II->getCalledFunction();
+
+ // TODO: If IPSCCP resolves the callee of this function, we could propagate a
+ // result back!
+ if (F == 0 || TrackedMultipleRetVals.empty()) {
+ markOverdefined(&EVI);
+ return;
+ }
+
+ // See if we are tracking the result of the callee. If not tracking this
+ // function (for example, it is a declaration) just move to overdefined.
+ if (!TrackedMultipleRetVals.count(std::make_pair(F, *EVI.idx_begin()))) {
+ markOverdefined(&EVI);
+ return;
+ }
+
+ // Otherwise, the value will be merged in here as a result of CallSite
+ // handling.
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+ Value *Aggr = IVI.getAggregateOperand();
+ Value *Val = IVI.getInsertedValueOperand();
+
+ // If the operands to the insertvalue are undef, the result is undef.
+ if (isa<UndefValue>(Aggr) && isa<UndefValue>(Val))
+ return;
+
+ // Currently only handle single-index insertvalues.
+ if (IVI.getNumIndices() != 1) {
+ markOverdefined(&IVI);
+ return;
+ }
+
+ // Currently only handle insertvalue instructions that are in a single-use
+ // chain that builds up a return value.
+ for (const InsertValueInst *TmpIVI = &IVI; ; ) {
+ if (!TmpIVI->hasOneUse()) {
+ markOverdefined(&IVI);
+ return;
+ }
+ const Value *V = *TmpIVI->use_begin();
+ if (isa<ReturnInst>(V))
+ break;
+ TmpIVI = dyn_cast<InsertValueInst>(V);
+ if (!TmpIVI) {
+ markOverdefined(&IVI);
+ return;
+ }
+ }
+
+ // See if we are tracking the result of the callee.
+ Function *F = IVI.getParent()->getParent();
+ DenseMap<std::pair<Function*, unsigned>, LatticeVal>::iterator
+ It = TrackedMultipleRetVals.find(std::make_pair(F, *IVI.idx_begin()));
+
+ // Merge in the inserted member value.
+ if (It != TrackedMultipleRetVals.end())
+ mergeInValue(It->second, F, getValueState(Val));
+
+ // Mark the aggregate result of the IVI overdefined; any tracking that we do
+ // will be done on the individual member values.
+ markOverdefined(&IVI);
+}
+
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+ LatticeVal &CondValue = getValueState(I.getCondition());
+ if (CondValue.isUndefined())
+ return;
+ if (CondValue.isConstant()) {
+ if (ConstantInt *CondCB = dyn_cast<ConstantInt>(CondValue.getConstant())){
+ mergeInValue(&I, getValueState(CondCB->getZExtValue() ? I.getTrueValue()
+ : I.getFalseValue()));
+ return;
+ }
+ }
+
+ // Otherwise, the condition is overdefined or a constant we can't evaluate.
+ // See if we can produce something better than overdefined based on the T/F
+ // value.
+ LatticeVal &TVal = getValueState(I.getTrueValue());
+ LatticeVal &FVal = getValueState(I.getFalseValue());
+
+ // select ?, C, C -> C.
+ if (TVal.isConstant() && FVal.isConstant() &&
+ TVal.getConstant() == FVal.getConstant()) {
+ markConstant(&I, FVal.getConstant());
+ return;
+ }
+
+ if (TVal.isUndefined()) { // select ?, undef, X -> X.
+ mergeInValue(&I, FVal);
+ } else if (FVal.isUndefined()) { // select ?, X, undef -> X.
+ mergeInValue(&I, TVal);
+ } else {
+ markOverdefined(&I);
+ }
+}
+
+// Handle BinaryOperators and Shift Instructions...
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ LatticeVal &V1State = getValueState(I.getOperand(0));
+ LatticeVal &V2State = getValueState(I.getOperand(1));
+
+ if (V1State.isOverdefined() || V2State.isOverdefined()) {
+ // If this is an AND or OR with 0 or -1, it doesn't matter that the other
+ // operand is overdefined.
+ if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) {
+ LatticeVal *NonOverdefVal = 0;
+ if (!V1State.isOverdefined()) {
+ NonOverdefVal = &V1State;
+ } else if (!V2State.isOverdefined()) {
+ NonOverdefVal = &V2State;
+ }
+
+ if (NonOverdefVal) {
+ if (NonOverdefVal->isUndefined()) {
+ // Could annihilate value.
+ if (I.getOpcode() == Instruction::And)
+ markConstant(IV, &I, Constant::getNullValue(I.getType()));
+ else if (const VectorType *PT = dyn_cast<VectorType>(I.getType()))
+ markConstant(IV, &I, ConstantVector::getAllOnesValue(PT));
+ else
+ markConstant(IV, &I, ConstantInt::getAllOnesValue(I.getType()));
+ return;
+ } else {
+ if (I.getOpcode() == Instruction::And) {
+ if (NonOverdefVal->getConstant()->isNullValue()) {
+ markConstant(IV, &I, NonOverdefVal->getConstant());
+ return; // X and 0 = 0
+ }
+ } else {
+ if (ConstantInt *CI =
+ dyn_cast<ConstantInt>(NonOverdefVal->getConstant()))
+ if (CI->isAllOnesValue()) {
+ markConstant(IV, &I, NonOverdefVal->getConstant());
+ return; // X or -1 = -1
+ }
+ }
+ }
+ }
+ }
+
+
+ // If both operands are PHI nodes, it is possible that this instruction has
+ // a constant value, despite the fact that the PHI node doesn't. Check for
+ // this condition now.
+ if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
+ if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
+ if (PN1->getParent() == PN2->getParent()) {
+ // Since the two PHI nodes are in the same basic block, they must have
+ // entries for the same predecessors. Walk the predecessor list, and
+ // if all of the incoming values are constants, and the result of
+ // evaluating this expression with all incoming value pairs is the
+ // same, then this expression is a constant even though the PHI node
+ // is not a constant!
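+          // Hypothetical example of the case being handled:
+          //   %a = phi i32 [ 1, %bb1 ], [ 2, %bb2 ]
+          //   %b = phi i32 [ 3, %bb1 ], [ 2, %bb2 ]
+          //   %s = add i32 %a, %b
+          // Evaluating per incoming edge gives 1+3 = 4 and 2+2 = 4, so %s is
+          // the constant 4 even though %a and %b are both overdefined.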
+ LatticeVal Result;
+ for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
+ LatticeVal &In1 = getValueState(PN1->getIncomingValue(i));
+ BasicBlock *InBlock = PN1->getIncomingBlock(i);
+ LatticeVal &In2 =
+ getValueState(PN2->getIncomingValueForBlock(InBlock));
+
+ if (In1.isOverdefined() || In2.isOverdefined()) {
+ Result.markOverdefined();
+ break; // Cannot fold this operation over the PHI nodes!
+ } else if (In1.isConstant() && In2.isConstant()) {
+ Constant *V = ConstantExpr::get(I.getOpcode(), In1.getConstant(),
+ In2.getConstant());
+ if (Result.isUndefined())
+ Result.markConstant(V);
+ else if (Result.isConstant() && Result.getConstant() != V) {
+ Result.markOverdefined();
+ break;
+ }
+ }
+ }
+
+ // If we found a constant value here, then we know the instruction is
+ // constant despite the fact that the PHI nodes are overdefined.
+ if (Result.isConstant()) {
+ markConstant(IV, &I, Result.getConstant());
+ // Remember that this instruction is virtually using the PHI node
+ // operands.
+ UsersOfOverdefinedPHIs.insert(std::make_pair(PN1, &I));
+ UsersOfOverdefinedPHIs.insert(std::make_pair(PN2, &I));
+ return;
+ } else if (Result.isUndefined()) {
+ return;
+ }
+
+ // Okay, this really is overdefined now. Since we might have
+ // speculatively thought that this was not overdefined before, and
+ // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
+ // make sure to clean out any entries that we put there, for
+ // efficiency.
+ std::multimap<PHINode*, Instruction*>::iterator It, E;
+ tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN1);
+ while (It != E) {
+ if (It->second == &I) {
+ UsersOfOverdefinedPHIs.erase(It++);
+ } else
+ ++It;
+ }
+ tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN2);
+ while (It != E) {
+ if (It->second == &I) {
+ UsersOfOverdefinedPHIs.erase(It++);
+ } else
+ ++It;
+ }
+ }
+
+ markOverdefined(IV, &I);
+ } else if (V1State.isConstant() && V2State.isConstant()) {
+ markConstant(IV, &I, ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
+ V2State.getConstant()));
+ }
+}
+
+// Handle ICmpInst instruction...
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ LatticeVal &V1State = getValueState(I.getOperand(0));
+ LatticeVal &V2State = getValueState(I.getOperand(1));
+
+ if (V1State.isOverdefined() || V2State.isOverdefined()) {
+ // If both operands are PHI nodes, it is possible that this instruction has
+ // a constant value, despite the fact that the PHI node doesn't. Check for
+ // this condition now.
+ if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
+ if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
+ if (PN1->getParent() == PN2->getParent()) {
+ // Since the two PHI nodes are in the same basic block, they must have
+ // entries for the same predecessors. Walk the predecessor list, and
+ // if all of the incoming values are constants, and the result of
+ // evaluating this expression with all incoming value pairs is the
+ // same, then this expression is a constant even though the PHI node
+ // is not a constant!
+ LatticeVal Result;
+ for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
+ LatticeVal &In1 = getValueState(PN1->getIncomingValue(i));
+ BasicBlock *InBlock = PN1->getIncomingBlock(i);
+ LatticeVal &In2 =
+ getValueState(PN2->getIncomingValueForBlock(InBlock));
+
+ if (In1.isOverdefined() || In2.isOverdefined()) {
+ Result.markOverdefined();
+ break; // Cannot fold this operation over the PHI nodes!
+ } else if (In1.isConstant() && In2.isConstant()) {
+ Constant *V = ConstantExpr::getCompare(I.getPredicate(),
+ In1.getConstant(),
+ In2.getConstant());
+ if (Result.isUndefined())
+ Result.markConstant(V);
+ else if (Result.isConstant() && Result.getConstant() != V) {
+ Result.markOverdefined();
+ break;
+ }
+ }
+ }
+
+ // If we found a constant value here, then we know the instruction is
+ // constant despite the fact that the PHI nodes are overdefined.
+ if (Result.isConstant()) {
+ markConstant(IV, &I, Result.getConstant());
+ // Remember that this instruction is virtually using the PHI node
+ // operands.
+ UsersOfOverdefinedPHIs.insert(std::make_pair(PN1, &I));
+ UsersOfOverdefinedPHIs.insert(std::make_pair(PN2, &I));
+ return;
+ } else if (Result.isUndefined()) {
+ return;
+ }
+
+ // Okay, this really is overdefined now. Since we might have
+ // speculatively thought that this was not overdefined before, and
+ // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
+ // make sure to clean out any entries that we put there, for
+ // efficiency.
+ std::multimap<PHINode*, Instruction*>::iterator It, E;
+ tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN1);
+ while (It != E) {
+ if (It->second == &I) {
+ UsersOfOverdefinedPHIs.erase(It++);
+ } else
+ ++It;
+ }
+ tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN2);
+ while (It != E) {
+ if (It->second == &I) {
+ UsersOfOverdefinedPHIs.erase(It++);
+ } else
+ ++It;
+ }
+ }
+
+ markOverdefined(IV, &I);
+ } else if (V1State.isConstant() && V2State.isConstant()) {
+ markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(),
+ V1State.getConstant(),
+ V2State.getConstant()));
+ }
+}
+
+void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
+  // FIXME: SCCP does not handle vectors properly.
+ markOverdefined(&I);
+ return;
+
+#if 0
+ LatticeVal &ValState = getValueState(I.getOperand(0));
+ LatticeVal &IdxState = getValueState(I.getOperand(1));
+
+ if (ValState.isOverdefined() || IdxState.isOverdefined())
+ markOverdefined(&I);
+  else if (ValState.isConstant() && IdxState.isConstant())
+ markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(),
+ IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
+  // FIXME: SCCP does not handle vectors properly.
+ markOverdefined(&I);
+ return;
+#if 0
+ LatticeVal &ValState = getValueState(I.getOperand(0));
+ LatticeVal &EltState = getValueState(I.getOperand(1));
+ LatticeVal &IdxState = getValueState(I.getOperand(2));
+
+ if (ValState.isOverdefined() || EltState.isOverdefined() ||
+ IdxState.isOverdefined())
+ markOverdefined(&I);
+  else if (ValState.isConstant() && EltState.isConstant() &&
+           IdxState.isConstant())
+ markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(),
+ EltState.getConstant(),
+ IdxState.getConstant()));
+ else if (ValState.isUndefined() && EltState.isConstant() &&
+ IdxState.isConstant())
+ markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()),
+ EltState.getConstant(),
+ IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
+  // FIXME: SCCP does not handle vectors properly.
+ markOverdefined(&I);
+ return;
+#if 0
+ LatticeVal &V1State = getValueState(I.getOperand(0));
+ LatticeVal &V2State = getValueState(I.getOperand(1));
+ LatticeVal &MaskState = getValueState(I.getOperand(2));
+
+ if (MaskState.isUndefined() ||
+ (V1State.isUndefined() && V2State.isUndefined()))
+ return; // Undefined output if mask or both inputs undefined.
+
+ if (V1State.isOverdefined() || V2State.isOverdefined() ||
+ MaskState.isOverdefined()) {
+ markOverdefined(&I);
+ } else {
+ // A mix of constant/undef inputs.
+ Constant *V1 = V1State.isConstant() ?
+ V1State.getConstant() : UndefValue::get(I.getType());
+ Constant *V2 = V2State.isConstant() ?
+ V2State.getConstant() : UndefValue::get(I.getType());
+ Constant *Mask = MaskState.isConstant() ?
+ MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType());
+ markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask));
+ }
+#endif
+}
+
+// Handle getelementptr instructions... if all operands are constants then we
+// can turn this into a getelementptr ConstantExpr.
+//
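+// Illustrative example (not from the original source; '@arr' hypothetical):
+//   %p = getelementptr [4 x i32]* @arr, i32 0, i32 2
+// with all operands constant folds into the equivalent ConstantExpr GEP.
+//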
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ SmallVector<Constant*, 8> Operands;
+ Operands.reserve(I.getNumOperands());
+
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ LatticeVal &State = getValueState(I.getOperand(i));
+ if (State.isUndefined())
+ return; // Operands are not resolved yet...
+ else if (State.isOverdefined()) {
+ markOverdefined(IV, &I);
+ return;
+ }
+ assert(State.isConstant() && "Unknown state!");
+ Operands.push_back(State.getConstant());
+ }
+
+ Constant *Ptr = Operands[0];
+ Operands.erase(Operands.begin()); // Erase the pointer from idx list...
+
+ markConstant(IV, &I, ConstantExpr::getGetElementPtr(Ptr, &Operands[0],
+ Operands.size()));
+}
+
+void SCCPSolver::visitStoreInst(Instruction &SI) {
+ if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
+ return;
+ GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
+ DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
+ if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;
+
+ // Get the value we are storing into the global.
+ LatticeVal &PtrVal = getValueState(SI.getOperand(0));
+
+ mergeInValue(I->second, GV, PtrVal);
+ if (I->second.isOverdefined())
+ TrackedGlobals.erase(I); // No need to keep tracking this!
+}
+
+
+// Handle load instructions. If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ LatticeVal &PtrVal = getValueState(I.getOperand(0));
+ if (PtrVal.isUndefined()) return; // The pointer is not resolved yet!
+ if (PtrVal.isConstant() && !I.isVolatile()) {
+ Value *Ptr = PtrVal.getConstant();
+ // TODO: Consider a target hook for valid address spaces for this xform.
+ if (isa<ConstantPointerNull>(Ptr) &&
+ cast<PointerType>(Ptr->getType())->getAddressSpace() == 0) {
+ // load null -> null
+ markConstant(IV, &I, Constant::getNullValue(I.getType()));
+ return;
+ }
+
+ // Transform load (constant global) into the value loaded.
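+    // For example (hypothetical IR), given '@g = internal constant i32 7',
+    // 'load i32* @g' folds to the constant 7.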
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ if (GV->isConstant()) {
+ if (GV->hasDefinitiveInitializer()) {
+ markConstant(IV, &I, GV->getInitializer());
+ return;
+ }
+ } else if (!TrackedGlobals.empty()) {
+ // If we are tracking this global, merge in the known value for it.
+ DenseMap<GlobalVariable*, LatticeVal>::iterator It =
+ TrackedGlobals.find(GV);
+ if (It != TrackedGlobals.end()) {
+ mergeInValue(IV, &I, It->second);
+ return;
+ }
+ }
+ }
+
+ // Transform load (constantexpr_GEP global, 0, ...) into the value loaded.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+ if (CE->getOpcode() == Instruction::GetElementPtr)
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ if (Constant *V =
+ ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE)) {
+ markConstant(IV, &I, V);
+ return;
+ }
+ }
+
+ // Otherwise we cannot say for certain what value this load will produce.
+ // Bail out.
+ markOverdefined(IV, &I);
+}
+
+void SCCPSolver::visitCallSite(CallSite CS) {
+ Function *F = CS.getCalledFunction();
+ Instruction *I = CS.getInstruction();
+
+  // The common case is that we aren't tracking the callee, either because we
+  // are not doing interprocedural analysis, or because the callee is
+  // indirect or external. Handle these cases first.
+ if (F == 0 || !F->hasLocalLinkage()) {
+CallOverdefined:
+ // Void return and not tracking callee, just bail.
+ if (I->getType() == Type::VoidTy) return;
+
+ // Otherwise, if we have a single return value case, and if the function is
+ // a declaration, maybe we can constant fold it.
+ if (!isa<StructType>(I->getType()) && F && F->isDeclaration() &&
+ canConstantFoldCallTo(F)) {
+
+ SmallVector<Constant*, 8> Operands;
+ for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+ AI != E; ++AI) {
+ LatticeVal &State = getValueState(*AI);
+ if (State.isUndefined())
+ return; // Operands are not resolved yet.
+ else if (State.isOverdefined()) {
+ markOverdefined(I);
+ return;
+ }
+ assert(State.isConstant() && "Unknown state!");
+ Operands.push_back(State.getConstant());
+ }
+
+ // If we can constant fold this, mark the result of the call as a
+ // constant.
+ if (Constant *C = ConstantFoldCall(F, Operands.data(), Operands.size())) {
+ markConstant(I, C);
+ return;
+ }
+ }
+
+ // Otherwise, we don't know anything about this call, mark it overdefined.
+ markOverdefined(I);
+ return;
+ }
+
+ // If this is a single/zero retval case, see if we're tracking the function.
+ DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+ if (TFRVI != TrackedRetVals.end()) {
+ // If so, propagate the return value of the callee into this call result.
+ mergeInValue(I, TFRVI->second);
+ } else if (isa<StructType>(I->getType())) {
+    // Check to see if we're tracking this callee; if not, handle it in the
+    // common path above.
+ DenseMap<std::pair<Function*, unsigned>, LatticeVal>::iterator
+ TMRVI = TrackedMultipleRetVals.find(std::make_pair(F, 0));
+ if (TMRVI == TrackedMultipleRetVals.end())
+ goto CallOverdefined;
+
+ // If we are tracking this callee, propagate the return values of the call
+ // into this call site. We do this by walking all the uses. Single-index
+ // ExtractValueInst uses can be tracked; anything more complicated is
+ // currently handled conservatively.
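+    // Illustrative sketch (names hypothetical):
+    //   %ret = call { i32, i32 } @callee()
+    //   %x = extractvalue { i32, i32 } %ret, 0  ; merged from (callee, 0)
+    //   %y = extractvalue { i32, i32 } %ret, 1  ; merged from (callee, 1)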
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI) {
+ if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(*UI)) {
+ if (EVI->getNumIndices() == 1) {
+ mergeInValue(EVI,
+ TrackedMultipleRetVals[std::make_pair(F, *EVI->idx_begin())]);
+ continue;
+ }
+ }
+ // The aggregate value is used in a way not handled here. Assume nothing.
+ markOverdefined(*UI);
+ }
+ } else {
+ // Otherwise we're not tracking this callee, so handle it in the
+ // common path above.
+ goto CallOverdefined;
+ }
+
+ // Finally, if this is the first call to the function hit, mark its entry
+ // block executable.
+ if (!BBExecutable.count(F->begin()))
+ MarkBlockExecutable(F->begin());
+
+ // Propagate information from this call site into the callee.
+ CallSite::arg_iterator CAI = CS.arg_begin();
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+ AI != E; ++AI, ++CAI) {
+ LatticeVal &IV = ValueState[AI];
+ if (!IV.isOverdefined())
+ mergeInValue(IV, AI, getValueState(*CAI));
+ }
+}
+
+
+void SCCPSolver::Solve() {
+ // Process the work lists until they are empty!
+ while (!BBWorkList.empty() || !InstWorkList.empty() ||
+ !OverdefinedInstWorkList.empty()) {
+ // Process the instruction work list...
+ while (!OverdefinedInstWorkList.empty()) {
+ Value *I = OverdefinedInstWorkList.back();
+ OverdefinedInstWorkList.pop_back();
+
+ DOUT << "\nPopped off OI-WL: " << *I;
+
+ // "I" got into the work list because it either made the transition from
+ // bottom to constant
+ //
+ // Anything on this worklist that is overdefined need not be visited
+ // since all of its users will have already been marked as overdefined
+ // Update all of the users of this instruction's value...
+ //
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI)
+ OperandChangedState(*UI);
+ }
+ // Process the instruction work list...
+ while (!InstWorkList.empty()) {
+ Value *I = InstWorkList.back();
+ InstWorkList.pop_back();
+
+ DOUT << "\nPopped off I-WL: " << *I;
+
+ // "I" got into the work list because it either made the transition from
+ // bottom to constant
+ //
+ // Anything on this worklist that is overdefined need not be visited
+ // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value...
+ //
+ if (!getValueState(I).isOverdefined())
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI)
+ OperandChangedState(*UI);
+ }
+
+ // Process the basic block work list...
+ while (!BBWorkList.empty()) {
+ BasicBlock *BB = BBWorkList.back();
+ BBWorkList.pop_back();
+
+ DOUT << "\nPopped off BBWL: " << *BB;
+
+ // Notify all instructions in this basic block that they are newly
+ // executable.
+ visit(BB);
+ }
+ }
+}
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption. After we solve dataflow, this
+/// method should be used to handle this. If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking one of
+/// the edges from its block as feasible, even though the condition does not
+/// prove that it is. This allows SCCP to find the rest of the CFG and only
+/// slightly pessimizes the analysis results (by marking one, potentially
+/// infeasible, edge feasible). This cannot usefully modify the constraints on
+/// the condition of the branch, as that would impact other users of the value.
+///
+/// This scan also checks for values that use undefs, whose results are actually
+/// defined. For example, 'zext i8 undef to i32' should produce all zeros
+/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
+/// even if X isn't defined.
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (!BBExecutable.count(BB))
+ continue;
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // Look for instructions which produce undef values.
+ if (I->getType() == Type::VoidTy) continue;
+
+ LatticeVal &LV = getValueState(I);
+ if (!LV.isUndefined()) continue;
+
+ // Get the lattice values of the first two operands for use below.
+ LatticeVal &Op0LV = getValueState(I->getOperand(0));
+ LatticeVal Op1LV;
+ if (I->getNumOperands() == 2) {
+ // If this is a two-operand instruction, and if both operands are
+ // undefs, the result stays undef.
+ Op1LV = getValueState(I->getOperand(1));
+ if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ continue;
+ }
+
+      // If this is an instruction whose result is defined even if the input
+      // is not fully defined, propagate the information.
+ const Type *ITy = I->getType();
+ switch (I->getOpcode()) {
+ default: break; // Leave the instruction as an undef.
+ case Instruction::ZExt:
+ // After a zero extend, we know the top part is zero. SExt doesn't have
+ // to be handled here, because we don't know whether the top part is 1's
+ // or 0's.
+ assert(Op0LV.isUndefined());
+ markForcedConstant(LV, I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Mul:
+ case Instruction::And:
+ // undef * X -> 0. X could be zero.
+ // undef & X -> 0. X could be zero.
+ markForcedConstant(LV, I, Constant::getNullValue(ITy));
+ return true;
+
+ case Instruction::Or:
+ // undef | X -> -1. X could be -1.
+ if (const VectorType *PTy = dyn_cast<VectorType>(ITy))
+ markForcedConstant(LV, I, ConstantVector::getAllOnesValue(PTy));
+ else
+ markForcedConstant(LV, I, ConstantInt::getAllOnesValue(ITy));
+ return true;
+
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // X / undef -> undef. No change.
+ // X % undef -> undef. No change.
+ if (Op1LV.isUndefined()) break;
+
+ // undef / X -> 0. X could be maxint.
+ // undef % X -> 0. X could be 1.
+ markForcedConstant(LV, I, Constant::getNullValue(ITy));
+ return true;
+
+ case Instruction::AShr:
+ // undef >>s X -> undef. No change.
+ if (Op0LV.isUndefined()) break;
+
+ // X >>s undef -> X. X could be 0, X could have the high-bit known set.
+ if (Op0LV.isConstant())
+ markForcedConstant(LV, I, Op0LV.getConstant());
+ else
+ markOverdefined(LV, I);
+ return true;
+ case Instruction::LShr:
+ case Instruction::Shl:
+ // undef >> X -> undef. No change.
+ // undef << X -> undef. No change.
+ if (Op0LV.isUndefined()) break;
+
+ // X >> undef -> 0. X could be 0.
+ // X << undef -> 0. X could be 0.
+ markForcedConstant(LV, I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Select:
+ // undef ? X : Y -> X or Y. There could be commonality between X/Y.
+ if (Op0LV.isUndefined()) {
+ if (!Op1LV.isConstant()) // Pick the constant one if there is any.
+ Op1LV = getValueState(I->getOperand(2));
+ } else if (Op1LV.isUndefined()) {
+ // c ? undef : undef -> undef. No change.
+ Op1LV = getValueState(I->getOperand(2));
+ if (Op1LV.isUndefined())
+ break;
+ // Otherwise, c ? undef : x -> x.
+ } else {
+ // Leave Op1LV as Operand(1)'s LatticeValue.
+ }
+
+ if (Op1LV.isConstant())
+ markForcedConstant(LV, I, Op1LV.getConstant());
+ else
+ markOverdefined(LV, I);
+ return true;
+ case Instruction::Call:
+ // If a call has an undef result, it is because it is constant foldable
+ // but one of the inputs was undef. Just force the result to
+ // overdefined.
+ markOverdefined(LV, I);
+ return true;
+ }
+ }
+
+ TerminatorInst *TI = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (!BI->isConditional()) continue;
+ if (!getValueState(BI->getCondition()).isUndefined())
+ continue;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (SI->getNumSuccessors() < 2) // No cases.
+        continue;
+ if (!getValueState(SI->getCondition()).isUndefined())
+ continue;
+ } else {
+ continue;
+ }
+
+ // If the edge to the second successor isn't thought to be feasible yet,
+ // mark it so now. We pick the second one so that this goes to some
+ // enumerated value in a switch instead of going to the default destination.
+ if (KnownFeasibleEdges.count(Edge(BB, TI->getSuccessor(1))))
+ continue;
+
+ // Otherwise, it isn't already thought to be feasible. Mark it as such now
+ // and return. This will make other blocks reachable, which will allow new
+ // values to be discovered and existing ones to be moved in the lattice.
+ markEdgeExecutable(BB, TI->getSuccessor(1));
+
+    // This must be a conditional branch or switch on undef. At this point,
+ // force the old terminator to branch to the first successor. This is
+ // required because we are now influencing the dataflow of the function with
+ // the assumption that this edge is taken. If we leave the branch condition
+ // as undef, then further analysis could think the undef went another way
+ // leading to an inconsistent set of conclusions.
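+    // Illustrative example (not from the original source):
+    //   br i1 undef, label %T, label %F
+    // has its %F edge (successor 1) marked executable above, and is rewritten
+    // below to branch on 'false', so later analysis agrees with that choice.
+    // (For a switch, successor 1 is the first enumerated case; successor 0
+    // is the default destination, which setting the case value also avoids.)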
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ BI->setCondition(ConstantInt::getFalse());
+ } else {
+ SwitchInst *SI = cast<SwitchInst>(TI);
+ SI->setCondition(SI->getCaseValue(1));
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ //
+ /// SCCP Class - This class uses the SCCPSolver to implement a per-function
+ /// Sparse Conditional Constant Propagator.
+ ///
+ struct VISIBILITY_HIDDEN SCCP : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ SCCP() : FunctionPass(&ID) {}
+
+ // runOnFunction - Run the Sparse Conditional Constant Propagation
+ // algorithm, and return true if the function was modified.
+ //
+ bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ }
+ };
+} // end anonymous namespace
+
+char SCCP::ID = 0;
+static RegisterPass<SCCP>
+X("sccp", "Sparse Conditional Constant Propagation");
+
+// createSCCPPass - This is the public interface to this file...
+FunctionPass *llvm::createSCCPPass() {
+ return new SCCP();
+}
+
+
+// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+//
+bool SCCP::runOnFunction(Function &F) {
+ DOUT << "SCCP on function '" << F.getNameStart() << "'\n";
+ SCCPSolver Solver;
+
+ // Mark the first block of the function as being executable.
+ Solver.MarkBlockExecutable(F.begin());
+
+ // Mark all arguments to the function as being overdefined.
+ for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI)
+ Solver.markOverdefined(AI);
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ while (ResolvedUndefs) {
+ Solver.Solve();
+ DOUT << "RESOLVING UNDEFs\n";
+ ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+ }
+
+ bool MadeChanges = false;
+
+ // If we decided that there are basic blocks that are dead in this function,
+ // delete their contents now. Note that we cannot actually delete the blocks,
+ // as we cannot modify the CFG of the function.
+ //
+ SmallVector<Instruction*, 512> Insts;
+ std::map<Value*, LatticeVal> &Values = Solver.getValueMapping();
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (!Solver.isBlockExecutable(BB)) {
+ DOUT << " BasicBlock Dead:" << *BB;
+ ++NumDeadBlocks;
+
+      // Delete the instructions backwards; this reduces the number of
+      // def-use and use-def chain updates that have to be performed.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->getTerminator();
+ I != E; ++I)
+ Insts.push_back(I);
+ while (!Insts.empty()) {
+ Instruction *I = Insts.back();
+ Insts.pop_back();
+ if (!I->use_empty())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ BB->getInstList().erase(I);
+ MadeChanges = true;
+ ++NumInstRemoved;
+ }
+ } else {
+      // Iterate over all of the instructions in the function, replacing them
+      // with constants where we have found them to have constant values.
+ //
+ for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+ Instruction *Inst = BI++;
+ if (Inst->getType() == Type::VoidTy ||
+ isa<TerminatorInst>(Inst))
+ continue;
+
+ LatticeVal &IV = Values[Inst];
+ if (!IV.isConstant() && !IV.isUndefined())
+ continue;
+
+ Constant *Const = IV.isConstant()
+ ? IV.getConstant() : UndefValue::get(Inst->getType());
+ DOUT << " Constant: " << *Const << " = " << *Inst;
+
+      // Replace all of the uses of the instruction with uses of the constant.
+ Inst->replaceAllUsesWith(Const);
+
+ // Delete the instruction.
+ Inst->eraseFromParent();
+
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++NumInstRemoved;
+ }
+ }
+
+ return MadeChanges;
+}
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ //
+ /// IPSCCP Class - This class implements interprocedural Sparse Conditional
+ /// Constant Propagation.
+ ///
+ struct VISIBILITY_HIDDEN IPSCCP : public ModulePass {
+ static char ID;
+ IPSCCP() : ModulePass(&ID) {}
+ bool runOnModule(Module &M);
+ };
+} // end anonymous namespace
+
+char IPSCCP::ID = 0;
+static RegisterPass<IPSCCP>
+Y("ipsccp", "Interprocedural Sparse Conditional Constant Propagation");
+
+// createIPSCCPPass - This is the public interface to this file...
+ModulePass *llvm::createIPSCCPPass() {
+ return new IPSCCP();
+}
+
+
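+// AddressIsTaken - Conservatively determine whether the address of GV
+// escapes. Illustrative examples (not from the original source):
+//   store i32* @g, i32** %p   ; stores the address       -> taken
+//   call void @f(i32* @g)     ; passed as an argument    -> taken
+//   %v = load i32* @g         ; a plain non-volatile load does not, by
+//                             ; itself, take the address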
+static bool AddressIsTaken(GlobalValue *GV) {
+ // Delete any dead constantexpr klingons.
+ GV->removeDeadConstantUsers();
+
+ for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+ UI != E; ++UI)
+ if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+ if (SI->getOperand(0) == GV || SI->isVolatile())
+ return true; // Storing addr of GV.
+ } else if (isa<InvokeInst>(*UI) || isa<CallInst>(*UI)) {
+ // Make sure we are calling the function, not passing the address.
+ CallSite CS = CallSite::get(cast<Instruction>(*UI));
+ if (CS.hasArgument(GV))
+ return true;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ if (LI->isVolatile())
+ return true;
+ } else {
+ return true;
+ }
+ return false;
+}
+
+bool IPSCCP::runOnModule(Module &M) {
+ SCCPSolver Solver;
+
+  // Loop over all functions, marking the arguments of any function that has
+  // its address taken or that is external as overdefined.
+  //
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+ if (!F->hasLocalLinkage() || AddressIsTaken(F)) {
+ if (!F->isDeclaration())
+ Solver.MarkBlockExecutable(F->begin());
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+ AI != E; ++AI)
+ Solver.markOverdefined(AI);
+ } else {
+ Solver.AddTrackedFunction(F);
+ }
+
+  // Loop over global variables. We inform the solver about any internal
+  // global variable that does not have its address taken; through such
+  // variables we can propagate constants.
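+  // For example (hypothetical), '@flag = internal global i1 false' whose
+  // address never escapes can be tracked; if every store to it stores a
+  // constant, loads of @flag fold and the dead global is deleted at the
+  // bottom of this function.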
+ for (Module::global_iterator G = M.global_begin(), E = M.global_end();
+ G != E; ++G)
+ if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G))
+ Solver.TrackValueOfGlobalVariable(G);
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ while (ResolvedUndefs) {
+ Solver.Solve();
+
+ DOUT << "RESOLVING UNDEFS\n";
+ ResolvedUndefs = false;
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+ ResolvedUndefs |= Solver.ResolvedUndefsIn(*F);
+ }
+
+ bool MadeChanges = false;
+
+  // Iterate over all of the instructions in the module, replacing them with
+  // constants where we have found them to have constant values.
+ //
+ SmallVector<Instruction*, 512> Insts;
+ SmallVector<BasicBlock*, 512> BlocksToErase;
+ std::map<Value*, LatticeVal> &Values = Solver.getValueMapping();
+
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+ AI != E; ++AI)
+ if (!AI->use_empty()) {
+ LatticeVal &IV = Values[AI];
+ if (IV.isConstant() || IV.isUndefined()) {
+ Constant *CST = IV.isConstant() ?
+ IV.getConstant() : UndefValue::get(AI->getType());
+ DOUT << "*** Arg " << *AI << " = " << *CST <<"\n";
+
+          // Replace all of the uses of the argument with uses of the
+          // constant.
+ AI->replaceAllUsesWith(CST);
+ ++IPNumArgsElimed;
+ }
+ }
+
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ if (!Solver.isBlockExecutable(BB)) {
+ DOUT << " BasicBlock Dead:" << *BB;
+ ++IPNumDeadBlocks;
+
+        // Delete the instructions backwards; this reduces the number of
+        // def-use and use-def chain updates that have to be performed.
+ TerminatorInst *TI = BB->getTerminator();
+ for (BasicBlock::iterator I = BB->begin(), E = TI; I != E; ++I)
+ Insts.push_back(I);
+
+ while (!Insts.empty()) {
+ Instruction *I = Insts.back();
+ Insts.pop_back();
+ if (!I->use_empty())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ BB->getInstList().erase(I);
+ MadeChanges = true;
+ ++IPNumInstRemoved;
+ }
+
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *Succ = TI->getSuccessor(i);
+ if (!Succ->empty() && isa<PHINode>(Succ->begin()))
+ TI->getSuccessor(i)->removePredecessor(BB);
+ }
+ if (!TI->use_empty())
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ BB->getInstList().erase(TI);
+
+ if (&*BB != &F->front())
+ BlocksToErase.push_back(BB);
+ else
+ new UnreachableInst(BB);
+
+ } else {
+ for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+ Instruction *Inst = BI++;
+ if (Inst->getType() == Type::VoidTy)
+ continue;
+
+ LatticeVal &IV = Values[Inst];
+ if (!IV.isConstant() && !IV.isUndefined())
+ continue;
+
+ Constant *Const = IV.isConstant()
+ ? IV.getConstant() : UndefValue::get(Inst->getType());
+ DOUT << " Constant: " << *Const << " = " << *Inst;
+
+          // Replace all of the uses of the instruction with uses of the
+          // constant.
+ Inst->replaceAllUsesWith(Const);
+
+ // Delete the instruction.
+ if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+ Inst->eraseFromParent();
+
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++IPNumInstRemoved;
+ }
+ }
+
+ // Now that all instructions in the function are constant folded, erase dead
+ // blocks, because we can now use ConstantFoldTerminator to get rid of
+ // in-edges.
+ for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+ // If there are any PHI nodes in this successor, drop entries for BB now.
+ BasicBlock *DeadBB = BlocksToErase[i];
+ while (!DeadBB->use_empty()) {
+ Instruction *I = cast<Instruction>(DeadBB->use_back());
+ bool Folded = ConstantFoldTerminator(I->getParent());
+ if (!Folded) {
+ // The constant folder may not have been able to fold the terminator
+ // if this is a branch or switch on undef. Fold it manually as a
+ // branch to the first successor.
+#ifndef NDEBUG
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
+ "Branch should be foldable!");
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+ } else {
+ assert(0 && "Didn't fold away reference to block!");
+ }
+#endif
+
+ // Make this an uncond branch to the first successor.
+ TerminatorInst *TI = I->getParent()->getTerminator();
+ BranchInst::Create(TI->getSuccessor(0), TI);
+
+ // Remove entries in successor phi nodes to remove edges.
+ for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
+ TI->getSuccessor(i)->removePredecessor(TI->getParent());
+
+ // Remove the old terminator.
+ TI->eraseFromParent();
+ }
+ }
+
+ // Finally, delete the basic block.
+ F->getBasicBlockList().erase(DeadBB);
+ }
+ BlocksToErase.clear();
+ }
+
+ // If we inferred constant or undef return values for a function, we replaced
+ // all call uses with the inferred value. This means we don't need to bother
+ // actually returning anything from the function. Replace all return
+ // instructions with return undef.
+ // TODO: Process multiple value ret instructions also.
+ const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
+ for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
+ E = RV.end(); I != E; ++I)
+ if (!I->second.isOverdefined() &&
+ I->first->getReturnType() != Type::VoidTy) {
+ Function *F = I->first;
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+ if (!isa<UndefValue>(RI->getOperand(0)))
+ RI->setOperand(0, UndefValue::get(F->getReturnType()));
+ }
+
+  // If we inferred constant or undef values for global variables, we can
+  // delete the globals and any stores that remain to them.
+ const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
+ for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
+ E = TG.end(); I != E; ++I) {
+ GlobalVariable *GV = I->first;
+ assert(!I->second.isOverdefined() &&
+ "Overdefined values should have been taken out of the map!");
+ DOUT << "Found that GV '" << GV->getNameStart() << "' is constant!\n";
+ while (!GV->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(GV->use_back());
+ SI->eraseFromParent();
+ }
+ M.getGlobalList().erase(GV);
+ ++IPNumGlobalConst;
+ }
+
+ return MadeChanges;
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
new file mode 100644
index 0000000..5669da0
--- /dev/null
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -0,0 +1,111 @@
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the C bindings for libLLVMScalarOpts.a, which implements
+// several scalar transformations over the LLVM intermediate representation.
+//
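+// Illustrative use from C (a minimal sketch; PM setup uses the llvm-c Core
+// API, and 'Mod' stands for an existing LLVMModuleRef):
+//
+//   LLVMPassManagerRef PM = LLVMCreatePassManager();
+//   LLVMAddSCCPPass(PM);
+//   LLVMRunPassManager(PM, Mod);
+//   LLVMDisposePassManager(PM);
+//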
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/PassManager.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveDCEPass());
+}
+
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCFGSimplificationPass());
+}
+
+void LLVMAddCondPropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCondPropagationPass());
+}
+
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadStoreEliminationPass());
+}
+
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNPass());
+}
+
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIndVarSimplifyPass());
+}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createInstructionCombiningPass());
+}
+
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createJumpThreadingPass());
+}
+
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLICMPass());
+}
+
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopDeletionPass());
+}
+
+void LLVMAddLoopIndexSplitPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopIndexSplitPass());
+}
+
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRotatePass());
+}
+
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollPass());
+}
+
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnswitchPass());
+}
+
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMemCpyOptPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createReassociatePass());
+}
+
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSCCPPass());
+}
+
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScalarReplAggregatesPass());
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSimplifyLibCallsPass());
+}
+
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTailCallEliminationPass());
+}
+
+void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createConstantPropagationPass());
+}
+
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDemoteRegisterToMemoryPass());
+}
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
new file mode 100644
index 0000000..9935f12
--- /dev/null
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -0,0 +1,1820 @@
+//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation implements the well-known scalar replacement of
+// aggregates transformation. It breaks up alloca instructions of aggregate
+// type (structure or array) into individual alloca instructions for each
+// member (if possible). Then, if possible, it transforms the individual
+// alloca instructions into nice clean scalar SSA form.
+//
+// This combines a simple SRoA algorithm with the Mem2Reg algorithm because
+// the two often interact, especially for C++ programs. As such, iterating
+// between SRoA and Mem2Reg until we run out of things to promote works well.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scalarrepl"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+STATISTIC(NumReplaced, "Number of allocas broken up");
+STATISTIC(NumPromoted, "Number of allocas promoted");
+STATISTIC(NumConverted, "Number of aggregates converted to scalar");
+STATISTIC(NumGlobals, "Number of allocas copied from constant global");
+
+namespace {
+ struct VISIBILITY_HIDDEN SROA : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ explicit SROA(signed T = -1) : FunctionPass(&ID) {
+ if (T == -1)
+ SRThreshold = 128;
+ else
+ SRThreshold = T;
+ }
+
+ bool runOnFunction(Function &F);
+
+ bool performScalarRepl(Function &F);
+ bool performPromotion(Function &F);
+
+    // getAnalysisUsage - This pass requires dominance information for
+    // promotion, but we know it will not alter the CFG, so say so.
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<DominanceFrontier>();
+ AU.addRequired<TargetData>();
+ AU.setPreservesCFG();
+ }
+
+ private:
+ TargetData *TD;
+
+ /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
+ /// information about the uses. All these fields are initialized to false
+ /// and set to true when something is learned.
+ struct AllocaInfo {
+ /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
+ bool isUnsafe : 1;
+
+ /// needsCleanup - This is set to true if there is some use of the alloca
+ /// that requires cleanup.
+ bool needsCleanup : 1;
+
+ /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
+ bool isMemCpySrc : 1;
+
+ /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
+ bool isMemCpyDst : 1;
+
+ AllocaInfo()
+ : isUnsafe(false), needsCleanup(false),
+ isMemCpySrc(false), isMemCpyDst(false) {}
+ };
+
+ unsigned SRThreshold;
+
+ void MarkUnsafe(AllocaInfo &I) { I.isUnsafe = true; }
+
+ int isSafeAllocaToScalarRepl(AllocationInst *AI);
+
+ void isSafeUseOfAllocation(Instruction *User, AllocationInst *AI,
+ AllocaInfo &Info);
+ void isSafeElementUse(Value *Ptr, bool isFirstElt, AllocationInst *AI,
+ AllocaInfo &Info);
+ void isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocationInst *AI,
+ unsigned OpNo, AllocaInfo &Info);
+ void isSafeUseOfBitCastedAllocation(BitCastInst *User, AllocationInst *AI,
+ AllocaInfo &Info);
+
+ void DoScalarReplacement(AllocationInst *AI,
+ std::vector<AllocationInst*> &WorkList);
+ void CleanupGEP(GetElementPtrInst *GEP);
+ void CleanupAllocaUsers(AllocationInst *AI);
+ AllocaInst *AddNewAlloca(Function &F, const Type *Ty, AllocationInst *Base);
+
+ void RewriteBitCastUserOfAlloca(Instruction *BCInst, AllocationInst *AI,
+ SmallVector<AllocaInst*, 32> &NewElts);
+
+ void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst,
+ AllocationInst *AI,
+ SmallVector<AllocaInst*, 32> &NewElts);
+ void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocationInst *AI,
+ SmallVector<AllocaInst*, 32> &NewElts);
+ void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocationInst *AI,
+ SmallVector<AllocaInst*, 32> &NewElts);
+
+ bool CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy,
+ bool &SawVec, uint64_t Offset, unsigned AllocaSize);
+ void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
+ Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType,
+ uint64_t Offset, IRBuilder<> &Builder);
+ Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
+ uint64_t Offset, IRBuilder<> &Builder);
+ static Instruction *isOnlyCopiedFromConstantGlobal(AllocationInst *AI);
+ };
+}
+
+char SROA::ID = 0;
+static RegisterPass<SROA> X("scalarrepl", "Scalar Replacement of Aggregates");
+
+// Public interface to the ScalarReplAggregates pass
+FunctionPass *llvm::createScalarReplAggregatesPass(signed int Threshold) {
+ return new SROA(Threshold);
+}
+
+
+bool SROA::runOnFunction(Function &F) {
+ TD = &getAnalysis<TargetData>();
+
+ bool Changed = performPromotion(F);
+ while (1) {
+ bool LocalChange = performScalarRepl(F);
+ if (!LocalChange) break; // No need to repromote if no scalarrepl
+ Changed = true;
+ LocalChange = performPromotion(F);
+ if (!LocalChange) break; // No need to re-scalarrepl if no promotion
+ }
+
+ return Changed;
+}
+
+
+bool SROA::performPromotion(Function &F) {
+ std::vector<AllocaInst*> Allocas;
+ DominatorTree &DT = getAnalysis<DominatorTree>();
+ DominanceFrontier &DF = getAnalysis<DominanceFrontier>();
+
+ BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
+
+ bool Changed = false;
+
+ while (1) {
+ Allocas.clear();
+
+    // Find allocas that are safe to promote by looking at all instructions
+    // in the entry node.
+ for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
+ if (isAllocaPromotable(AI))
+ Allocas.push_back(AI);
+
+ if (Allocas.empty()) break;
+
+ PromoteMemToReg(Allocas, DT, DF);
+ NumPromoted += Allocas.size();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// getNumSAElements - Return the number of elements in the specified struct
+/// or array type.
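+/// For example (illustrative), both '{ i32, float }' and '[2 x i32]' have
+/// two elements.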
+static uint64_t getNumSAElements(const Type *T) {
+ if (const StructType *ST = dyn_cast<StructType>(T))
+ return ST->getNumElements();
+ return cast<ArrayType>(T)->getNumElements();
+}
+
+// performScalarRepl - This algorithm is a simple worklist driven algorithm,
+// which runs on all of the malloc/alloca instructions in the function, removing
+// them if they are only used by getelementptr instructions.
+//
+bool SROA::performScalarRepl(Function &F) {
+ std::vector<AllocationInst*> WorkList;
+
+  // Scan the entry basic block, adding any allocas and mallocs to the
+  // worklist.
+ BasicBlock &BB = F.getEntryBlock();
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
+ if (AllocationInst *A = dyn_cast<AllocationInst>(I))
+ WorkList.push_back(A);
+
+ // Process the worklist
+ bool Changed = false;
+ while (!WorkList.empty()) {
+ AllocationInst *AI = WorkList.back();
+ WorkList.pop_back();
+
+ // Handle dead allocas trivially. These can be formed by SROA'ing arrays
+ // with unused elements.
+ if (AI->use_empty()) {
+ AI->eraseFromParent();
+ continue;
+ }
+
+ // If this alloca is impossible for us to promote, reject it early.
+ if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
+ continue;
+
+ // Check to see if this allocation is only modified by a memcpy/memmove from
+ // a constant global. If this is the case, we can change all users to use
+ // the constant global instead. This is commonly produced by the CFE by
+ // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
+ // is only subsequently read.
+ if (Instruction *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) {
+ DOUT << "Found alloca equal to global: " << *AI;
+ DOUT << " memcpy = " << *TheCopy;
+ Constant *TheSrc = cast<Constant>(TheCopy->getOperand(2));
+ AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
+ TheCopy->eraseFromParent(); // Don't mutate the global.
+ AI->eraseFromParent();
+ ++NumGlobals;
+ Changed = true;
+ continue;
+ }
+
+ // Check to see if we can perform the core SROA transformation. We cannot
+ // transform the allocation instruction if it is an array allocation
+ // (allocations OF arrays are ok though), and an allocation of a scalar
+ // value cannot be decomposed at all.
+ uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType());
+
+ // Do not promote any struct whose size is too big.
+ if (AllocaSize > SRThreshold) continue;
+
+ if ((isa<StructType>(AI->getAllocatedType()) ||
+ isa<ArrayType>(AI->getAllocatedType())) &&
+        // Do not promote any struct into more than SRThreshold/4 (by default
+        // 32) separate vars.
+ getNumSAElements(AI->getAllocatedType()) <= SRThreshold/4) {
+ // Check that all of the users of the allocation are capable of being
+ // transformed.
+ switch (isSafeAllocaToScalarRepl(AI)) {
+ default: assert(0 && "Unexpected value!");
+ case 0: // Not safe to scalar replace.
+ break;
+ case 1: // Safe, but requires cleanup/canonicalizations first
+ CleanupAllocaUsers(AI);
+ // FALL THROUGH.
+ case 3: // Safe to scalar replace.
+ DoScalarReplacement(AI, WorkList);
+ Changed = true;
+ continue;
+ }
+ }
+
+    // See if we can turn this aggregate value (potentially with casts) into
+    // a simple scalar value that can be mem2reg'd into a register.
+ // IsNotTrivial tracks whether this is something that mem2reg could have
+ // promoted itself. If so, we don't want to transform it needlessly. Note
+ // that we can't just check based on the type: the alloca may be of an i32
+ // but that has pointer arithmetic to set byte 3 of it or something.
+ bool IsNotTrivial = false;
+ const Type *VectorTy = 0;
+ bool HadAVector = false;
+ if (CanConvertToScalar(AI, IsNotTrivial, VectorTy, HadAVector,
+ 0, unsigned(AllocaSize)) && IsNotTrivial) {
+ AllocaInst *NewAI;
+ // If we were able to find a vector type that can handle this with
+ // insert/extract elements, and if there was at least one use that had
+ // a vector type, promote this to a vector. We don't want to promote
+ // random stuff that doesn't use vectors (e.g. <9 x double>) because then
+ // we just get a lot of insert/extracts. If at least one vector is
+ // involved, then we probably really do have a union of vector/array.
+ if (VectorTy && isa<VectorType>(VectorTy) && HadAVector) {
+ DOUT << "CONVERT TO VECTOR: " << *AI << " TYPE = " << *VectorTy <<"\n";
+
+ // Create and insert the vector alloca.
+ NewAI = new AllocaInst(VectorTy, 0, "", AI->getParent()->begin());
+ ConvertUsesToScalar(AI, NewAI, 0);
+ } else {
+ DOUT << "CONVERT TO SCALAR INTEGER: " << *AI << "\n";
+
+ // Create and insert the integer alloca.
+ const Type *NewTy = IntegerType::get(AllocaSize*8);
+ NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
+ ConvertUsesToScalar(AI, NewAI, 0);
+ }
+ NewAI->takeName(AI);
+ AI->eraseFromParent();
+ ++NumConverted;
+ Changed = true;
+ continue;
+ }
+
+ // Otherwise, couldn't process this alloca.
+ }
+
+ return Changed;
+}
+
+/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl
+/// predicate, do SROA now.
+void SROA::DoScalarReplacement(AllocationInst *AI,
+ std::vector<AllocationInst*> &WorkList) {
+ DOUT << "Found inst to SROA: " << *AI;
+ SmallVector<AllocaInst*, 32> ElementAllocas;
+ if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+ ElementAllocas.reserve(ST->getNumContainedTypes());
+ for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
+ AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0,
+ AI->getAlignment(),
+ AI->getName() + "." + utostr(i), AI);
+ ElementAllocas.push_back(NA);
+ WorkList.push_back(NA); // Add to worklist for recursive processing
+ }
+ } else {
+ const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
+ ElementAllocas.reserve(AT->getNumElements());
+ const Type *ElTy = AT->getElementType();
+ for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+ AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(),
+ AI->getName() + "." + utostr(i), AI);
+ ElementAllocas.push_back(NA);
+ WorkList.push_back(NA); // Add to worklist for recursive processing
+ }
+ }
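+
+  // Illustrative result (names hypothetical): SROA of 'alloca { i32, float }'
+  // named %a creates 'alloca i32' (%a.0) and 'alloca float' (%a.1), each
+  // added to the worklist for recursive processing.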
+
+ // Now that we have created the alloca instructions that we want to use,
+ // expand the getelementptr instructions to use them.
+ //
+ while (!AI->use_empty()) {
+ Instruction *User = cast<Instruction>(AI->use_back());
+ if (BitCastInst *BCInst = dyn_cast<BitCastInst>(User)) {
+ RewriteBitCastUserOfAlloca(BCInst, AI, ElementAllocas);
+ BCInst->eraseFromParent();
+ continue;
+ }
+
+ // Replace:
+ // %res = load { i32, i32 }* %alloc
+ // with:
+ // %load.0 = load i32* %alloc.0
+    //   %insert.0 = insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0
+ // %load.1 = load i32* %alloc.1
+ // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
+ // (Also works for arrays instead of structs)
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ Value *Insert = UndefValue::get(LI->getType());
+ for (unsigned i = 0, e = ElementAllocas.size(); i != e; ++i) {
+ Value *Load = new LoadInst(ElementAllocas[i], "load", LI);
+ Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI);
+ }
+ LI->replaceAllUsesWith(Insert);
+ LI->eraseFromParent();
+ continue;
+ }
+
+ // Replace:
+ // store { i32, i32 } %val, { i32, i32 }* %alloc
+ // with:
+ // %val.0 = extractvalue { i32, i32 } %val, 0
+ // store i32 %val.0, i32* %alloc.0
+ // %val.1 = extractvalue { i32, i32 } %val, 1
+ // store i32 %val.1, i32* %alloc.1
+ // (Also works for arrays instead of structs)
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ Value *Val = SI->getOperand(0);
+ for (unsigned i = 0, e = ElementAllocas.size(); i != e; ++i) {
+ Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI);
+ new StoreInst(Extract, ElementAllocas[i], SI);
+ }
+ SI->eraseFromParent();
+ continue;
+ }
+
+ GetElementPtrInst *GEPI = cast<GetElementPtrInst>(User);
+ // We now know that the GEP is of the form: GEP <ptr>, 0, <cst>
+ unsigned Idx =
+ (unsigned)cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+
+ assert(Idx < ElementAllocas.size() && "Index out of range?");
+ AllocaInst *AllocaToUse = ElementAllocas[Idx];
+
+ Value *RepValue;
+ if (GEPI->getNumOperands() == 3) {
+ // Do not insert a new getelementptr instruction with zero indices, only
+ // to have it optimized out later.
+ RepValue = AllocaToUse;
+ } else {
+      // We are indexing deeply into the structure, so we still need a
+      // getelementptr instruction to finish the indexing. This may be
+      // expanded itself once the worklist is rerun.
+ //
+ SmallVector<Value*, 8> NewArgs;
+ NewArgs.push_back(Constant::getNullValue(Type::Int32Ty));
+ NewArgs.append(GEPI->op_begin()+3, GEPI->op_end());
+ RepValue = GetElementPtrInst::Create(AllocaToUse, NewArgs.begin(),
+ NewArgs.end(), "", GEPI);
+ RepValue->takeName(GEPI);
+ }
+
+ // If this GEP is to the start of the aggregate, check for memcpys.
+ if (Idx == 0 && GEPI->hasAllZeroIndices())
+ RewriteBitCastUserOfAlloca(GEPI, AI, ElementAllocas);
+
+ // Move all of the users over to the new GEP.
+ GEPI->replaceAllUsesWith(RepValue);
+ // Delete the old GEP
+ GEPI->eraseFromParent();
+ }
+
+ // Finally, delete the Alloca instruction
+ AI->eraseFromParent();
+ NumReplaced++;
+}
+
+
+/// isSafeElementUse - Check to see if this use is an allowed use for a
+/// getelementptr instruction of an array aggregate allocation. isFirstElt
+/// indicates whether Ptr is known to point to the start of the aggregate.
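+/// For illustration (example IR, not from the original source): given
+///   %elt = getelementptr [4 x i32]* %A, i32 0, i32 1
+/// a use such as "%v = load i32* %elt" is safe, while
+/// "store i32* %elt, i32** %P" is not, since it stores the pointer itself.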
+///
+void SROA::isSafeElementUse(Value *Ptr, bool isFirstElt, AllocationInst *AI,
+ AllocaInfo &Info) {
+ for (Value::use_iterator I = Ptr->use_begin(), E = Ptr->use_end();
+ I != E; ++I) {
+ Instruction *User = cast<Instruction>(*I);
+ switch (User->getOpcode()) {
+ case Instruction::Load: break;
+ case Instruction::Store:
+ // Store is ok if storing INTO the pointer, not storing the pointer
+ if (User->getOperand(0) == Ptr) return MarkUnsafe(Info);
+ break;
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(User);
+ bool AreAllZeroIndices = isFirstElt;
+ if (GEP->getNumOperands() > 1) {
+ if (!isa<ConstantInt>(GEP->getOperand(1)) ||
+ !cast<ConstantInt>(GEP->getOperand(1))->isZero())
+ // Using pointer arithmetic to navigate the array.
+ return MarkUnsafe(Info);
+
+ if (AreAllZeroIndices)
+ AreAllZeroIndices = GEP->hasAllZeroIndices();
+ }
+ isSafeElementUse(GEP, AreAllZeroIndices, AI, Info);
+ if (Info.isUnsafe) return;
+ break;
+ }
+ case Instruction::BitCast:
+ if (isFirstElt) {
+ isSafeUseOfBitCastedAllocation(cast<BitCastInst>(User), AI, Info);
+ if (Info.isUnsafe) return;
+ break;
+ }
+ DOUT << " Transformation preventing inst: " << *User;
+ return MarkUnsafe(Info);
+ case Instruction::Call:
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+ if (isFirstElt) {
+ isSafeMemIntrinsicOnAllocation(MI, AI, I.getOperandNo(), Info);
+ if (Info.isUnsafe) return;
+ break;
+ }
+ }
+ DOUT << " Transformation preventing inst: " << *User;
+ return MarkUnsafe(Info);
+ default:
+ DOUT << " Transformation preventing inst: " << *User;
+ return MarkUnsafe(Info);
+ }
+ }
+ return; // All users look ok :)
+}
+
+/// AllUsersAreLoads - Return true if all users of this value are loads.
+static bool AllUsersAreLoads(Value *Ptr) {
+ for (Value::use_iterator I = Ptr->use_begin(), E = Ptr->use_end();
+ I != E; ++I)
+ if (cast<Instruction>(*I)->getOpcode() != Instruction::Load)
+ return false;
+ return true;
+}
+
+/// isSafeUseOfAllocation - Check to see if this user is an allowed use for an
+/// aggregate allocation.
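+/// For example (illustrative), the only getelementptr form accepted below is
+///   getelementptr { i32, i32 }* %A, i32 0, i32 <cst>
+/// i.e. a zero first index followed by constant element indices.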
+///
+void SROA::isSafeUseOfAllocation(Instruction *User, AllocationInst *AI,
+ AllocaInfo &Info) {
+ if (BitCastInst *C = dyn_cast<BitCastInst>(User))
+ return isSafeUseOfBitCastedAllocation(C, AI, Info);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User))
+ if (!LI->isVolatile())
+ return; // Loads (returning a first-class aggregate) are always rewritable
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User))
+ if (!SI->isVolatile() && SI->getOperand(0) != AI)
+ return; // Store is ok if storing INTO the pointer, not storing the pointer
+
+ GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User);
+ if (GEPI == 0)
+ return MarkUnsafe(Info);
+
+ gep_type_iterator I = gep_type_begin(GEPI), E = gep_type_end(GEPI);
+
+ // The GEP is not safe to transform if not of the form "GEP <ptr>, 0, <cst>".
+ if (I == E ||
+ I.getOperand() != Constant::getNullValue(I.getOperand()->getType())) {
+ return MarkUnsafe(Info);
+ }
+
+ ++I;
+ if (I == E) return MarkUnsafe(Info); // ran out of GEP indices??
+
+ bool IsAllZeroIndices = true;
+
+ // If the first index is a non-constant index into an array, see if we can
+ // handle it as a special case.
+ if (const ArrayType *AT = dyn_cast<ArrayType>(*I)) {
+ if (!isa<ConstantInt>(I.getOperand())) {
+ IsAllZeroIndices = false;
+ uint64_t NumElements = AT->getNumElements();
+
+ // If this is an array index and the index is not constant, we cannot
+ // promote... that is unless the array has exactly one or two elements in
+ // it, in which case we CAN promote it, but we have to canonicalize this
+ // out if this is the only problem.
+ if ((NumElements == 1 || NumElements == 2) &&
+ AllUsersAreLoads(GEPI)) {
+ Info.needsCleanup = true;
+ return; // Canonicalization required!
+ }
+ return MarkUnsafe(Info);
+ }
+ }
+
+ // Walk through the GEP type indices, checking the types that this indexes
+ // into.
+ for (; I != E; ++I) {
+ // Ignore struct elements, no extra checking needed for these.
+ if (isa<StructType>(*I))
+ continue;
+
+ ConstantInt *IdxVal = dyn_cast<ConstantInt>(I.getOperand());
+ if (!IdxVal) return MarkUnsafe(Info);
+
+ // Are all indices still zero?
+ IsAllZeroIndices &= IdxVal->isZero();
+
+ if (const ArrayType *AT = dyn_cast<ArrayType>(*I)) {
+ // This GEP indexes an array. Verify that this is an in-range constant
+ // integer. Specifically, consider A[0][i]. We cannot know that the user
+ // isn't doing invalid things like allowing i to index an out-of-range
+ // subscript that accesses A[1]. Because of this, we have to reject SROA
+ // of any accesses into structs where any of the components are variables.
+ if (IdxVal->getZExtValue() >= AT->getNumElements())
+ return MarkUnsafe(Info);
+ } else if (const VectorType *VT = dyn_cast<VectorType>(*I)) {
+ if (IdxVal->getZExtValue() >= VT->getNumElements())
+ return MarkUnsafe(Info);
+ }
+ }
+
+ // If there are any non-simple uses of this getelementptr, make sure to reject
+ // them.
+ return isSafeElementUse(GEPI, IsAllZeroIndices, AI, Info);
+}
+
+/// isSafeMemIntrinsicOnAllocation - Check whether the specified memory
+/// intrinsic can be promoted by SROA, marking Info unsafe if not. At this
+/// point, we know that the operand of the memintrinsic is a pointer to the
+/// beginning of the allocation.
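+///
+/// For example (illustrative): for "%A = alloca { i32, i32 }", a memcpy is
+/// promotable only if its length is the constant 8, covering the whole
+/// aggregate; a variable or partial length marks the alloca unsafe.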
+void SROA::isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocationInst *AI,
+ unsigned OpNo, AllocaInfo &Info) {
+ // If not constant length, give up.
+ ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
+ if (!Length) return MarkUnsafe(Info);
+
+ // If not the whole aggregate, give up.
+ if (Length->getZExtValue() !=
+ TD->getTypeAllocSize(AI->getType()->getElementType()))
+ return MarkUnsafe(Info);
+
+ // We only know about memcpy/memset/memmove.
+ if (!isa<MemIntrinsic>(MI))
+ return MarkUnsafe(Info);
+
+ // Otherwise, we can transform it. Determine whether this is a memcpy/set
+ // into or out of the aggregate.
+ if (OpNo == 1)
+ Info.isMemCpyDst = true;
+ else {
+ assert(OpNo == 2);
+ Info.isMemCpySrc = true;
+ }
+}
+
+/// isSafeUseOfBitCastedAllocation - Check whether all users of this bitcast
+/// are safe uses of the allocation, marking Info unsafe if any is not.
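+///
+/// For example (illustrative), a whole-alloca integer store through a cast:
+///   %c = bitcast { i32, i32 }* %A to i64*
+///   store i64 %v, i64* %c
+/// is allowed, and is treated like the memcpy-destination case below.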
+void SROA::isSafeUseOfBitCastedAllocation(BitCastInst *BC, AllocationInst *AI,
+ AllocaInfo &Info) {
+ for (Value::use_iterator UI = BC->use_begin(), E = BC->use_end();
+ UI != E; ++UI) {
+ if (BitCastInst *BCU = dyn_cast<BitCastInst>(UI)) {
+ isSafeUseOfBitCastedAllocation(BCU, AI, Info);
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(UI)) {
+ isSafeMemIntrinsicOnAllocation(MI, AI, UI.getOperandNo(), Info);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+ if (SI->isVolatile())
+ return MarkUnsafe(Info);
+
+ // If storing the entire alloca in one chunk through a bitcasted pointer
+ // to integer, we can transform it. This happens (for example) when you
+ // cast a {i32,i32}* to i64* and store through it. This is similar to the
+ // memcpy case and occurs in various "byval" cases and emulated memcpys.
+ if (isa<IntegerType>(SI->getOperand(0)->getType()) &&
+ TD->getTypeAllocSize(SI->getOperand(0)->getType()) ==
+ TD->getTypeAllocSize(AI->getType()->getElementType())) {
+ Info.isMemCpyDst = true;
+ continue;
+ }
+ return MarkUnsafe(Info);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
+ if (LI->isVolatile())
+ return MarkUnsafe(Info);
+
+ // If loading the entire alloca in one chunk through a bitcasted pointer
+ // to integer, we can transform it. This happens (for example) when you
+ // cast a {i32,i32}* to i64* and load through it. This is similar to the
+ // memcpy case and occurs in various "byval" cases and emulated memcpys.
+ if (isa<IntegerType>(LI->getType()) &&
+ TD->getTypeAllocSize(LI->getType()) ==
+ TD->getTypeAllocSize(AI->getType()->getElementType())) {
+ Info.isMemCpySrc = true;
+ continue;
+ }
+ return MarkUnsafe(Info);
+ } else if (isa<DbgInfoIntrinsic>(UI)) {
+ // If one user is DbgInfoIntrinsic then check if all users are
+ // DbgInfoIntrinsics.
+ if (OnlyUsedByDbgInfoIntrinsics(BC)) {
+ Info.needsCleanup = true;
+ return;
+ } else
+ MarkUnsafe(Info);
+ } else {
+ return MarkUnsafe(Info);
+ }
+ if (Info.isUnsafe) return;
+ }
+}
+
+/// RewriteBitCastUserOfAlloca - BCInst (transitively) bitcasts AI, or indexes
+/// to its first element. Transform users of the cast to use the new values
+/// instead.
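+///
+/// Illustrative example: if %A has been scalarized into %A.0 and %A.1, a user
+/// "%c = bitcast { i32, i32 }* %A to i64*" is rewritten so that the loads,
+/// stores, and mem intrinsics through %c operate on %A.0 and %A.1 directly,
+/// via the helpers dispatched below.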
+void SROA::RewriteBitCastUserOfAlloca(Instruction *BCInst, AllocationInst *AI,
+ SmallVector<AllocaInst*, 32> &NewElts) {
+ Value::use_iterator UI = BCInst->use_begin(), UE = BCInst->use_end();
+ while (UI != UE) {
+ Instruction *User = cast<Instruction>(*UI++);
+ if (BitCastInst *BCU = dyn_cast<BitCastInst>(User)) {
+ RewriteBitCastUserOfAlloca(BCU, AI, NewElts);
+ if (BCU->use_empty()) BCU->eraseFromParent();
+ continue;
+ }
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+ // This must be memcpy/memmove/memset of the entire aggregate.
+ // Split into one per element.
+ RewriteMemIntrinUserOfAlloca(MI, BCInst, AI, NewElts);
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ // If this is a store of the entire alloca from an integer, rewrite it.
+ RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
+ continue;
+ }
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ // If this is a load of the entire alloca to an integer, rewrite it.
+ RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
+ continue;
+ }
+
+ // Otherwise it must be some other user of a gep of the first pointer. Just
+ // leave these alone.
+ continue;
+ }
+}
+
+/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
+/// Rewrite it to copy or set the elements of the scalarized memory.
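+///
+/// For example (illustrative), a memcpy into "%A = alloca { i32, i32 }" from
+/// %src becomes, per element, roughly:
+///   %src.0 = getelementptr { i32, i32 }* %src, i32 0, i32 0
+///   %v.0 = load i32* %src.0
+///   store i32 %v.0, i32* %A.0
+/// since each element here is a single value type.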
+void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst,
+ AllocationInst *AI,
+ SmallVector<AllocaInst*, 32> &NewElts) {
+
+ // If this is a memcpy/memmove, construct the other pointer as the
+ // appropriate type. The "Other" pointer is the pointer that goes to memory
+ // that doesn't have anything to do with the alloca that we are promoting. For
+ // memset, this Value* stays null.
+ Value *OtherPtr = 0;
+ unsigned MemAlignment = MI->getAlignment();
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcpy
+ if (BCInst == MTI->getRawDest())
+ OtherPtr = MTI->getRawSource();
+ else {
+ assert(BCInst == MTI->getRawSource());
+ OtherPtr = MTI->getRawDest();
+ }
+ }
+
+ // If there is an "other" pointer, we want to convert it to the same pointer
+ // type as AI has, so we can GEP through it safely.
+ if (OtherPtr) {
+ // It is likely that OtherPtr is a bitcast, if so, remove it.
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(OtherPtr))
+ OtherPtr = BC->getOperand(0);
+ // All zero GEPs are effectively bitcasts.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(OtherPtr))
+ if (GEP->hasAllZeroIndices())
+ OtherPtr = GEP->getOperand(0);
+
+ if (ConstantExpr *BCE = dyn_cast<ConstantExpr>(OtherPtr))
+ if (BCE->getOpcode() == Instruction::BitCast)
+ OtherPtr = BCE->getOperand(0);
+
+ // If the pointer is not the right type, insert a bitcast to the right
+ // type.
+ if (OtherPtr->getType() != AI->getType())
+ OtherPtr = new BitCastInst(OtherPtr, AI->getType(), OtherPtr->getName(),
+ MI);
+ }
+
+ // Process each element of the aggregate.
+ Value *TheFn = MI->getOperand(0);
+ const Type *BytePtrTy = MI->getRawDest()->getType();
+ bool SROADest = MI->getRawDest() == BCInst;
+
+ Constant *Zero = Constant::getNullValue(Type::Int32Ty);
+
+ for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+ // If this is a memcpy/memmove, emit a GEP of the other element address.
+ Value *OtherElt = 0;
+ unsigned OtherEltAlign = MemAlignment;
+
+ if (OtherPtr) {
+ Value *Idx[2] = { Zero, ConstantInt::get(Type::Int32Ty, i) };
+ OtherElt = GetElementPtrInst::Create(OtherPtr, Idx, Idx + 2,
+ OtherPtr->getNameStr()+"."+utostr(i),
+ MI);
+ uint64_t EltOffset;
+ const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
+ if (const StructType *ST =
+ dyn_cast<StructType>(OtherPtrTy->getElementType())) {
+ EltOffset = TD->getStructLayout(ST)->getElementOffset(i);
+ } else {
+ const Type *EltTy =
+ cast<SequentialType>(OtherPtr->getType())->getElementType();
+ EltOffset = TD->getTypeAllocSize(EltTy)*i;
+ }
+
+ // The alignment of the other pointer is the guaranteed alignment of the
+ // element, which is affected by both the known alignment of the whole
+ // mem intrinsic and the alignment of the element. If the alignment of
+ // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the
+ // known alignment is just 4 bytes.
+ OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset);
+ }
+
+ Value *EltPtr = NewElts[i];
+ const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
+
+ // If we got down to a scalar, insert a load or store as appropriate.
+ if (EltTy->isSingleValueType()) {
+ if (isa<MemTransferInst>(MI)) {
+ if (SROADest) {
+ // From Other to Alloca.
+ Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI);
+ new StoreInst(Elt, EltPtr, MI);
+ } else {
+ // From Alloca to Other.
+ Value *Elt = new LoadInst(EltPtr, "tmp", MI);
+ new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI);
+ }
+ continue;
+ }
+ assert(isa<MemSetInst>(MI));
+
+ // If the stored element is zero (common case), just store a null
+ // constant.
+ Constant *StoreVal;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getOperand(2))) {
+ if (CI->isZero()) {
+ StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0>
+ } else {
+ // If EltTy is a vector type, get the element type.
+ const Type *ValTy = EltTy;
+ if (const VectorType *VTy = dyn_cast<VectorType>(ValTy))
+ ValTy = VTy->getElementType();
+
+ // Construct an integer with the right value.
+ unsigned EltSize = TD->getTypeSizeInBits(ValTy);
+ APInt OneVal(EltSize, CI->getZExtValue());
+ APInt TotalVal(OneVal);
+ // Set each byte.
+ for (unsigned i = 0; 8*i < EltSize; ++i) {
+ TotalVal = TotalVal.shl(8);
+ TotalVal |= OneVal;
+ }
+
+ // Convert the integer value to the appropriate type.
+ StoreVal = ConstantInt::get(TotalVal);
+ if (isa<PointerType>(ValTy))
+ StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy);
+ else if (ValTy->isFloatingPoint())
+ StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy);
+ assert(StoreVal->getType() == ValTy && "Type mismatch!");
+
+ // If the requested value was a vector constant, create it.
+ if (EltTy != ValTy) {
+ unsigned NumElts = cast<VectorType>(ValTy)->getNumElements();
+ SmallVector<Constant*, 16> Elts(NumElts, StoreVal);
+ StoreVal = ConstantVector::get(&Elts[0], NumElts);
+ }
+ }
+ new StoreInst(StoreVal, EltPtr, MI);
+ continue;
+ }
+ // Otherwise, if we're storing a byte variable, use a memset call for
+ // this element.
+ }
+
+ // Cast the element pointer to BytePtrTy.
+ if (EltPtr->getType() != BytePtrTy)
+ EltPtr = new BitCastInst(EltPtr, BytePtrTy, EltPtr->getNameStr(), MI);
+
+ // Cast the other pointer (if we have one) to BytePtrTy.
+ if (OtherElt && OtherElt->getType() != BytePtrTy)
+ OtherElt = new BitCastInst(OtherElt, BytePtrTy,OtherElt->getNameStr(),
+ MI);
+
+ unsigned EltSize = TD->getTypeAllocSize(EltTy);
+
+ // Finally, insert the meminst for this element.
+ if (isa<MemTransferInst>(MI)) {
+ Value *Ops[] = {
+ SROADest ? EltPtr : OtherElt, // Dest ptr
+ SROADest ? OtherElt : EltPtr, // Src ptr
+ ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size
+ ConstantInt::get(Type::Int32Ty, OtherEltAlign) // Align
+ };
+ CallInst::Create(TheFn, Ops, Ops + 4, "", MI);
+ } else {
+ assert(isa<MemSetInst>(MI));
+ Value *Ops[] = {
+ EltPtr, MI->getOperand(2), // Dest, Value,
+ ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size
+ Zero // Align
+ };
+ CallInst::Create(TheFn, Ops, Ops + 4, "", MI);
+ }
+ }
+ MI->eraseFromParent();
+}
+
+/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that
+/// overwrites the entire allocation. Extract out the pieces of the stored
+/// integer and store them individually.
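+///
+/// Illustrative sketch (little-endian): "store i64 %v, i64* %c" over an
+/// { i32, i32 } alloca becomes roughly:
+///   %lo = trunc i64 %v to i32
+///   store i32 %lo, i32* %A.0
+///   %sh = lshr i64 %v, 32
+///   %hi = trunc i64 %sh to i32
+///   store i32 %hi, i32* %A.1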
+void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI,
+ AllocationInst *AI,
+ SmallVector<AllocaInst*, 32> &NewElts){
+ // Extract each element out of the integer according to its structure offset
+ // and store the element value to the individual alloca.
+ Value *SrcVal = SI->getOperand(0);
+ const Type *AllocaEltTy = AI->getType()->getElementType();
+ uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+
+ // If this isn't a store of an integer to the whole alloca, it may be a store
+ // to the first element. Just ignore the store in this case and normal SROA
+ // will handle it.
+ if (!isa<IntegerType>(SrcVal->getType()) ||
+ TD->getTypeAllocSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+ return;
+ // Handle tail padding by extending the operand
+ if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+ SrcVal = new ZExtInst(SrcVal, IntegerType::get(AllocaSizeBits), "", SI);
+
+ DOUT << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << *SI;
+
+ // There are two forms here: AI could be an array or struct. Both cases
+ // have different ways to compute the element offset.
+ if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+ const StructLayout *Layout = TD->getStructLayout(EltSTy);
+
+ for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+ // Get the number of bits to shift SrcVal to get the value.
+ const Type *FieldTy = EltSTy->getElementType(i);
+ uint64_t Shift = Layout->getElementOffsetInBits(i);
+
+ if (TD->isBigEndian())
+ Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy);
+
+ Value *EltVal = SrcVal;
+ if (Shift) {
+ Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+ EltVal = BinaryOperator::CreateLShr(EltVal, ShiftVal,
+ "sroa.store.elt", SI);
+ }
+
+ // Truncate down to an integer of the right size.
+ uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+
+ // Ignore zero sized fields like {}, they obviously contain no data.
+ if (FieldSizeBits == 0) continue;
+
+ if (FieldSizeBits != AllocaSizeBits)
+ EltVal = new TruncInst(EltVal, IntegerType::get(FieldSizeBits), "", SI);
+ Value *DestField = NewElts[i];
+ if (EltVal->getType() == FieldTy) {
+ // Storing to an integer field of this size, just do it.
+ } else if (FieldTy->isFloatingPoint() || isa<VectorType>(FieldTy)) {
+ // Bitcast to the right element type (for fp/vector values).
+ EltVal = new BitCastInst(EltVal, FieldTy, "", SI);
+ } else {
+ // Otherwise, bitcast the dest pointer (for aggregates).
+ DestField = new BitCastInst(DestField,
+ PointerType::getUnqual(EltVal->getType()),
+ "", SI);
+ }
+ new StoreInst(EltVal, DestField, SI);
+ }
+
+ } else {
+ const ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
+ const Type *ArrayEltTy = ATy->getElementType();
+ uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+ uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy);
+
+ uint64_t Shift;
+
+ if (TD->isBigEndian())
+ Shift = AllocaSizeBits-ElementOffset;
+ else
+ Shift = 0;
+
+ for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+ // Ignore zero sized fields like {}, they obviously contain no data.
+ if (ElementSizeBits == 0) continue;
+
+ Value *EltVal = SrcVal;
+ if (Shift) {
+ Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+ EltVal = BinaryOperator::CreateLShr(EltVal, ShiftVal,
+ "sroa.store.elt", SI);
+ }
+
+ // Truncate down to an integer of the right size.
+ if (ElementSizeBits != AllocaSizeBits)
+ EltVal = new TruncInst(EltVal, IntegerType::get(ElementSizeBits),"",SI);
+ Value *DestField = NewElts[i];
+ if (EltVal->getType() == ArrayEltTy) {
+ // Storing to an integer field of this size, just do it.
+ } else if (ArrayEltTy->isFloatingPoint() || isa<VectorType>(ArrayEltTy)) {
+ // Bitcast to the right element type (for fp/vector values).
+ EltVal = new BitCastInst(EltVal, ArrayEltTy, "", SI);
+ } else {
+ // Otherwise, bitcast the dest pointer (for aggregates).
+ DestField = new BitCastInst(DestField,
+ PointerType::getUnqual(EltVal->getType()),
+ "", SI);
+ }
+ new StoreInst(EltVal, DestField, SI);
+
+ if (TD->isBigEndian())
+ Shift -= ElementOffset;
+ else
+ Shift += ElementOffset;
+ }
+ }
+
+ SI->eraseFromParent();
+}
+
+/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to
+/// an integer. Load the individual pieces to form the aggregate value.
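+///
+/// Illustrative sketch (little-endian): "%v = load i64* %c" over an
+/// { i32, i32 } alloca becomes roughly:
+///   %lo = load i32* %A.0
+///   %lo.z = zext i32 %lo to i64
+///   %hi = load i32* %A.1
+///   %hi.z = zext i32 %hi to i64
+///   %hi.s = shl i64 %hi.z, 32
+///   %v = or i64 %hi.s, %lo.z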
+void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocationInst *AI,
+ SmallVector<AllocaInst*, 32> &NewElts) {
+ // Extract each element out of the NewElts according to its structure offset
+ // and form the result value.
+ const Type *AllocaEltTy = AI->getType()->getElementType();
+ uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+
+ // If this isn't a load of the whole alloca to an integer, it may be a load
+ // of the first element. Just ignore the load in this case and normal SROA
+ // will handle it.
+ if (!isa<IntegerType>(LI->getType()) ||
+ TD->getTypeAllocSizeInBits(LI->getType()) != AllocaSizeBits)
+ return;
+
+ DOUT << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << *LI;
+
+ // There are two forms here: AI could be an array or struct. Both cases
+ // have different ways to compute the element offset.
+ const StructLayout *Layout = 0;
+ uint64_t ArrayEltBitOffset = 0;
+ if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+ Layout = TD->getStructLayout(EltSTy);
+ } else {
+ const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
+ ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+ }
+
+ Value *ResultVal = Constant::getNullValue(IntegerType::get(AllocaSizeBits));
+
+ for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+ // Load the value from the alloca. If the NewElt is an aggregate, cast
+ // the pointer to an integer of the same size before doing the load.
+ Value *SrcField = NewElts[i];
+ const Type *FieldTy =
+ cast<PointerType>(SrcField->getType())->getElementType();
+ uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+
+ // Ignore zero sized fields like {}, they obviously contain no data.
+ if (FieldSizeBits == 0) continue;
+
+ const IntegerType *FieldIntTy = IntegerType::get(FieldSizeBits);
+ if (!isa<IntegerType>(FieldTy) && !FieldTy->isFloatingPoint() &&
+ !isa<VectorType>(FieldTy))
+ SrcField = new BitCastInst(SrcField, PointerType::getUnqual(FieldIntTy),
+ "", LI);
+ SrcField = new LoadInst(SrcField, "sroa.load.elt", LI);
+
+ // If SrcField is a fp or vector of the right size but that isn't an
+ // integer type, bitcast to an integer so we can shift it.
+ if (SrcField->getType() != FieldIntTy)
+ SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI);
+
+ // Zero extend the field to be the same size as the final alloca so that
+ // we can shift and insert it.
+ if (SrcField->getType() != ResultVal->getType())
+ SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI);
+
+ // Determine the number of bits to shift SrcField.
+ uint64_t Shift;
+ if (Layout) // Struct case.
+ Shift = Layout->getElementOffsetInBits(i);
+ else // Array case.
+ Shift = i*ArrayEltBitOffset;
+
+ if (TD->isBigEndian())
+ Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
+
+ if (Shift) {
+ Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift);
+ SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
+ }
+
+ ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
+ }
+
+ // Handle tail padding by truncating the result
+ if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
+ ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
+
+ LI->replaceAllUsesWith(ResultVal);
+ LI->eraseFromParent();
+}
+
+
+/// HasPadding - Return true if the specified type has any structure or
+/// alignment padding, false otherwise.
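+/// For example (illustrative, assuming a typical 32-bit layout): { i8, i32 }
+/// has three bytes of padding between its fields and { i32, i8 } has tail
+/// padding, so both return true, while { i32, i32 } has none.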
+static bool HasPadding(const Type *Ty, const TargetData &TD) {
+ if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = TD.getStructLayout(STy);
+ unsigned PrevFieldBitOffset = 0;
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
+
+ // Padding in sub-elements?
+ if (HasPadding(STy->getElementType(i), TD))
+ return true;
+
+ // Check to see if there is any padding between this element and the
+ // previous one.
+ if (i) {
+ unsigned PrevFieldEnd =
+ PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1));
+ if (PrevFieldEnd < FieldBitOffset)
+ return true;
+ }
+
+ PrevFieldBitOffset = FieldBitOffset;
+ }
+
+ // Check for tail padding.
+ if (unsigned EltCount = STy->getNumElements()) {
+ unsigned PrevFieldEnd = PrevFieldBitOffset +
+ TD.getTypeSizeInBits(STy->getElementType(EltCount-1));
+ if (PrevFieldEnd < SL->getSizeInBits())
+ return true;
+ }
+
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ return HasPadding(ATy->getElementType(), TD);
+ } else if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ return HasPadding(VTy->getElementType(), TD);
+ }
+ return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
+}
+
+/// isSafeAllocaToScalarRepl - Check to see if the specified allocation of
+/// an aggregate can be broken down into elements. Return 0 if not, 3 if safe,
+/// or 1 if safe after canonicalization has been performed.
+///
+int SROA::isSafeAllocaToScalarRepl(AllocationInst *AI) {
+ // Loop over the use list of the alloca. We can only transform it if all of
+ // the users are safe to transform.
+ AllocaInfo Info;
+
+ for (Value::use_iterator I = AI->use_begin(), E = AI->use_end();
+ I != E; ++I) {
+ isSafeUseOfAllocation(cast<Instruction>(*I), AI, Info);
+ if (Info.isUnsafe) {
+ DOUT << "Cannot transform: " << *AI << " due to user: " << **I;
+ return 0;
+ }
+ }
+
+ // Okay, we know all the users are promotable. If the aggregate is a memcpy
+ // source and destination, we have to be careful. In particular, the memcpy
+ // could be moving around elements that live in structure padding of the LLVM
+ // types, but may actually be used. In these cases, we refuse to promote the
+ // struct.
+ if (Info.isMemCpySrc && Info.isMemCpyDst &&
+ HasPadding(AI->getType()->getElementType(), *TD))
+ return 0;
+
+ // If we require cleanup, return 1, otherwise return 3.
+ return Info.needsCleanup ? 1 : 3;
+}
+
+/// CleanupGEP - GEP is a use of an Alloca which can be promoted after the GEP
+/// is canonicalized here.
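+///
+/// For example (illustrative), given
+///   %p = getelementptr [2 x i32]* %A, i32 0, i32 %i
+/// every "load i32* %p" is rewritten below into two constant-index loads and
+/// a select on whether %i is nonzero.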
+void SROA::CleanupGEP(GetElementPtrInst *GEPI) {
+ gep_type_iterator I = gep_type_begin(GEPI);
+ ++I;
+
+ const ArrayType *AT = dyn_cast<ArrayType>(*I);
+ if (!AT)
+ return;
+
+ uint64_t NumElements = AT->getNumElements();
+
+ if (isa<ConstantInt>(I.getOperand()))
+ return;
+
+ if (NumElements == 1) {
+ GEPI->setOperand(2, Constant::getNullValue(Type::Int32Ty));
+ return;
+ }
+
+ assert(NumElements == 2 && "Unhandled case!");
+ // All users of the GEP must be loads. At each use of the GEP, insert
+ // two loads of the appropriate indexed GEP and select between them.
+ Value *IsOne = new ICmpInst(ICmpInst::ICMP_NE, I.getOperand(),
+ Constant::getNullValue(I.getOperand()->getType()),
+ "isone", GEPI);
+ // Insert the new GEP instructions, which are properly indexed.
+ SmallVector<Value*, 8> Indices(GEPI->op_begin()+1, GEPI->op_end());
+ Indices[1] = Constant::getNullValue(Type::Int32Ty);
+ Value *ZeroIdx = GetElementPtrInst::Create(GEPI->getOperand(0),
+ Indices.begin(),
+ Indices.end(),
+ GEPI->getName()+".0", GEPI);
+ Indices[1] = ConstantInt::get(Type::Int32Ty, 1);
+ Value *OneIdx = GetElementPtrInst::Create(GEPI->getOperand(0),
+ Indices.begin(),
+ Indices.end(),
+ GEPI->getName()+".1", GEPI);
+ // Replace all loads of the variable index GEP with loads from both
+ // indexes and a select.
+ while (!GEPI->use_empty()) {
+ LoadInst *LI = cast<LoadInst>(GEPI->use_back());
+ Value *Zero = new LoadInst(ZeroIdx, LI->getName()+".0", LI);
+ Value *One = new LoadInst(OneIdx , LI->getName()+".1", LI);
+ Value *R = SelectInst::Create(IsOne, One, Zero, LI->getName(), LI);
+ LI->replaceAllUsesWith(R);
+ LI->eraseFromParent();
+ }
+ GEPI->eraseFromParent();
+}
+
+
+/// CleanupAllocaUsers - If SROA reported that it can promote the specified
+/// allocation, but only if cleaned up, perform the cleanups required.
+void SROA::CleanupAllocaUsers(AllocationInst *AI) {
+ // At this point, we know that the end result will be SROA'd and promoted, so
+ // we can insert ugly code if required so long as sroa+mem2reg will clean it
+ // up.
+ for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+ UI != E; ) {
+ User *U = *UI++;
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U))
+ CleanupGEP(GEPI);
+ else if (Instruction *I = dyn_cast<Instruction>(U)) {
+ SmallVector<DbgInfoIntrinsic *, 2> DbgInUses;
+ if (!isa<StoreInst>(I) && OnlyUsedByDbgInfoIntrinsics(I, &DbgInUses)) {
+ // Safe to remove debug info uses.
+ while (!DbgInUses.empty()) {
+ DbgInfoIntrinsic *DI = DbgInUses.back(); DbgInUses.pop_back();
+ DI->eraseFromParent();
+ }
+ I->eraseFromParent();
+ }
+ }
+ }
+}
+
+/// MergeInType - Add the 'In' type to the accumulated vector type (VecTy) so
+/// far at the offset specified by Offset (which is specified in bytes).
+///
+/// There are two cases we handle here:
+/// 1) A union of vector types of the same size and potentially its elements.
+/// Here we turn element accesses into insert/extract element operations.
+/// This promotes a <4 x float> with a store of float to the third element
+/// into a <4 x float> that uses insert element.
+/// 2) A fully general blob of memory, which we turn into some (potentially
+/// large) integer type with extract and insert operations where the loads
+/// and stores would mutate the memory.
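+///
+/// For example (illustrative): a 16-byte alloca that sees a store of float at
+/// Offset 8 is compatible with VecTy = <4 x float>; the access maps to
+/// element 2 of the vector.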
+static void MergeInType(const Type *In, uint64_t Offset, const Type *&VecTy,
+ unsigned AllocaSize, const TargetData &TD) {
+ // If this could be contributing to a vector, analyze it.
+ if (VecTy != Type::VoidTy) { // either null or a vector type.
+
+ // If the In type is a vector that is the same size as the alloca, see if it
+ // matches the existing VecTy.
+ if (const VectorType *VInTy = dyn_cast<VectorType>(In)) {
+ if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
+ // If we're storing/loading a vector of the right size, allow it as a
+ // vector. If this the first vector we see, remember the type so that
+ // we know the element size.
+ if (VecTy == 0)
+ VecTy = VInTy;
+ return;
+ }
+ } else if (In == Type::FloatTy || In == Type::DoubleTy ||
+ (isa<IntegerType>(In) && In->getPrimitiveSizeInBits() >= 8 &&
+ isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
+ // If we're accessing something that could be an element of a vector, see
+ // if the implied vector agrees with what we already have and if Offset is
+ // compatible with it.
+ unsigned EltSize = In->getPrimitiveSizeInBits()/8;
+ if (Offset % EltSize == 0 &&
+ AllocaSize % EltSize == 0 &&
+ (VecTy == 0 ||
+ cast<VectorType>(VecTy)->getElementType()
+ ->getPrimitiveSizeInBits()/8 == EltSize)) {
+ if (VecTy == 0)
+ VecTy = VectorType::get(In, AllocaSize/EltSize);
+ return;
+ }
+ }
+ }
+
+ // Otherwise, we have a case that we can't handle with an optimized vector
+ // form. We can still turn this into a large integer.
+ VecTy = Type::VoidTy;
+}
+
+/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all
+/// its accesses to use a single vector type, return true, and set VecTy to
+/// the new type. If we could convert the alloca into a single promotable
+/// integer, return true but set VecTy to VoidTy. Further, if the use is not a
+/// completely trivial use that mem2reg could promote, set IsNotTrivial. Offset
+/// is the current offset from the base of the alloca being analyzed.
+///
+/// If we see at least one access to the value as a vector type, set the
+/// SawVec flag.
+///
+bool SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy,
+ bool &SawVec, uint64_t Offset,
+ unsigned AllocaSize) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ // Don't break volatile loads.
+ if (LI->isVolatile())
+ return false;
+ MergeInType(LI->getType(), Offset, VecTy, AllocaSize, *TD);
+ SawVec |= isa<VectorType>(LI->getType());
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ // Storing the pointer, not into the value?
+ if (SI->getOperand(0) == V || SI->isVolatile()) return false;
+ MergeInType(SI->getOperand(0)->getType(), Offset, VecTy, AllocaSize, *TD);
+ SawVec |= isa<VectorType>(SI->getOperand(0)->getType());
+ continue;
+ }
+
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+ if (!CanConvertToScalar(BCI, IsNotTrivial, VecTy, SawVec, Offset,
+ AllocaSize))
+ return false;
+ IsNotTrivial = true;
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+ // If this is a GEP with variable indices, we can't handle it.
+ if (!GEP->hasAllConstantIndices())
+ return false;
+
+ // Compute the offset that this GEP adds to the pointer.
+ SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+ uint64_t GEPOffset = TD->getIndexedOffset(GEP->getOperand(0)->getType(),
+ &Indices[0], Indices.size());
+ // See if all uses can be converted.
+ if (!CanConvertToScalar(GEP, IsNotTrivial, VecTy, SawVec,Offset+GEPOffset,
+ AllocaSize))
+ return false;
+ IsNotTrivial = true;
+ continue;
+ }
+
+ // If this is a constant sized memset of a constant value (e.g. 0) we can
+ // handle it.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+ // Store of constant value and constant size.
+ if (isa<ConstantInt>(MSI->getValue()) &&
+ isa<ConstantInt>(MSI->getLength())) {
+ IsNotTrivial = true;
+ continue;
+ }
+ }
+
+ // If this is a memcpy or memmove into or out of the whole allocation, we
+ // can handle it like a load or store of the scalar type.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+ if (ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()))
+ if (Len->getZExtValue() == AllocaSize && Offset == 0) {
+ IsNotTrivial = true;
+ continue;
+ }
+ }
+
+ // Ignore dbg intrinsic.
+ if (isa<DbgInfoIntrinsic>(User))
+ continue;
+
+ // Otherwise, we cannot handle this!
+ return false;
+ }
+
+ return true;
+}
+
+
+/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
+/// directly. This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right. By the end of this, there should be no uses of Ptr.
+void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) {
+ while (!Ptr->use_empty()) {
+ Instruction *User = cast<Instruction>(Ptr->use_back());
+
+ if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
+ ConvertUsesToScalar(CI, NewAI, Offset);
+ CI->eraseFromParent();
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+ // Compute the offset that this GEP adds to the pointer.
+ SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+ uint64_t GEPOffset = TD->getIndexedOffset(GEP->getOperand(0)->getType(),
+ &Indices[0], Indices.size());
+ ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
+ GEP->eraseFromParent();
+ continue;
+ }
+
+ IRBuilder<> Builder(User->getParent(), User);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ // The load is a bit extract from NewAI shifted right by Offset bits.
+ Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp");
+ Value *NewLoadVal
+ = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
+ LI->replaceAllUsesWith(NewLoadVal);
+ LI->eraseFromParent();
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ assert(SI->getOperand(0) != Ptr && "Consistency error!");
+ Value *Old = Builder.CreateLoad(NewAI, (NewAI->getName()+".in").c_str());
+ Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
+ Builder);
+ Builder.CreateStore(New, NewAI);
+ SI->eraseFromParent();
+ continue;
+ }
+
+ // If this is a constant sized memset of a constant value (e.g. 0) we can
+ // transform it into a store of the expanded constant value.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+ assert(MSI->getRawDest() == Ptr && "Consistency error!");
+ unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ if (NumBytes != 0) {
+ unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue();
+
+ // Compute the value replicated the right number of times.
+ APInt APVal(NumBytes*8, Val);
+
+ // Splat the value if non-zero.
+ if (Val)
+ for (unsigned i = 1; i != NumBytes; ++i)
+ APVal |= APVal << 8;
+
+ Value *Old = Builder.CreateLoad(NewAI, (NewAI->getName()+".in").c_str());
+ Value *New = ConvertScalar_InsertValue(ConstantInt::get(APVal), Old,
+ Offset, Builder);
+ Builder.CreateStore(New, NewAI);
+ }
+ MSI->eraseFromParent();
+ continue;
+ }
+
+ // If this is a memcpy or memmove into or out of the whole allocation, we
+ // can handle it like a load or store of the scalar type.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+ assert(Offset == 0 && "must be store to start of alloca");
+
+ // If the source and destination both refer to the same alloca, then this
+ // is a noop copy-to-self; just delete it. Otherwise, emit a load and store
+ // as appropriate.
+ AllocaInst *OrigAI = cast<AllocaInst>(Ptr->getUnderlyingObject());
+
+ if (MTI->getSource()->getUnderlyingObject() != OrigAI) {
+ // Dest must be OrigAI, change this to be a load from the original
+ // pointer (bitcasted), then a store to our new alloca.
+ assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
+ Value *SrcPtr = MTI->getSource();
+ SrcPtr = Builder.CreateBitCast(SrcPtr, NewAI->getType());
+
+ LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
+ SrcVal->setAlignment(MTI->getAlignment());
+ Builder.CreateStore(SrcVal, NewAI);
+ } else if (MTI->getDest()->getUnderlyingObject() != OrigAI) {
+ // Src must be OrigAI, change this to be a load from NewAI then a store
+ // through the original dest pointer (bitcasted).
+ assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
+ LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
+
+ Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), NewAI->getType());
+ StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
+ NewStore->setAlignment(MTI->getAlignment());
+ } else {
+ // Noop transfer. Src == Dst
+ }
+
+ MTI->eraseFromParent();
+ continue;
+ }
+
+ // If user is a dbg info intrinsic then it is safe to remove it.
+ if (isa<DbgInfoIntrinsic>(User)) {
+ User->eraseFromParent();
+ continue;
+ }
+
+ assert(0 && "Unsupported operation!");
+ abort();
+ }
+}
+
+/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
+/// or vector value FromVal, extracting the bits from the offset specified by
+/// Offset. This returns the value, which is of type ToType.
+///
+/// This happens when we are converting an "integer union" to a single
+/// integer scalar, or when we are converting a "vector union" to a vector with
+/// insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
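+///
+/// Illustrative sketch (little-endian): extracting an i16 at Offset 16 from
+/// an i64 value %val becomes roughly:
+///   %s = lshr i64 %val, 16
+///   %t = trunc i64 %s to i16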
+Value *SROA::ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
+ uint64_t Offset, IRBuilder<> &Builder) {
+ // If the load is of the whole new alloca, no conversion is needed.
+ if (FromVal->getType() == ToType && Offset == 0)
+ return FromVal;
+
+ // If the result alloca is a vector type, this is either an element
+ // access or a bitcast to another vector type of the same size.
+ if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) {
+ if (isa<VectorType>(ToType))
+ return Builder.CreateBitCast(FromVal, ToType, "tmp");
+
+ // Otherwise it must be an element access.
+ unsigned Elt = 0;
+ if (Offset) {
+ unsigned EltSize = TD->getTypeAllocSizeInBits(VTy->getElementType());
+ Elt = Offset/EltSize;
+ assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
+ }
+ // Return the element extracted out of it.
+ Value *V = Builder.CreateExtractElement(FromVal,
+ ConstantInt::get(Type::Int32Ty,Elt),
+ "tmp");
+ if (V->getType() != ToType)
+ V = Builder.CreateBitCast(V, ToType, "tmp");
+ return V;
+ }
+
+ // If ToType is a first class aggregate, extract out each of the pieces and
+ // use insertvalue's to form the FCA.
+ if (const StructType *ST = dyn_cast<StructType>(ToType)) {
+ const StructLayout &Layout = *TD->getStructLayout(ST);
+ Value *Res = UndefValue::get(ST);
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
+ Offset+Layout.getElementOffsetInBits(i),
+ Builder);
+ Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
+ }
+ return Res;
+ }
+
+ if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
+ uint64_t EltSize = TD->getTypeAllocSizeInBits(AT->getElementType());
+ Value *Res = UndefValue::get(AT);
+ for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+ Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
+ Offset+i*EltSize, Builder);
+ Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
+ }
+ return Res;
+ }
+
+ // Otherwise, this must be a union that was converted to an integer value.
+ const IntegerType *NTy = cast<IntegerType>(FromVal->getType());
+
+ // If this is a big-endian system and the load is narrower than the
+ // full alloca type, we need to do a shift to get the right bits.
+ int ShAmt = 0;
+ if (TD->isBigEndian()) {
+ // On big-endian machines, the lowest bit is stored at the bit offset
+ // from the pointer given by getTypeStoreSizeInBits. This matters for
+ // integers with a bitwidth that is not a multiple of 8.
+ ShAmt = TD->getTypeStoreSizeInBits(NTy) -
+ TD->getTypeStoreSizeInBits(ToType) - Offset;
+ } else {
+ ShAmt = Offset;
+ }
+
+ // Note: we support negative shift amounts (handled with shl), which are
+ // not defined for lshr. We do this to support (f.e.) loads off the end
+ // of a structure where only some bits are used.
+ if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
+ FromVal = Builder.CreateLShr(FromVal, ConstantInt::get(FromVal->getType(),
+ ShAmt), "tmp");
+ else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
+ FromVal = Builder.CreateShl(FromVal, ConstantInt::get(FromVal->getType(),
+ -ShAmt), "tmp");
+
+ // Finally, unconditionally truncate the integer to the right width.
+ unsigned LIBitWidth = TD->getTypeSizeInBits(ToType);
+ if (LIBitWidth < NTy->getBitWidth())
+ FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(LIBitWidth), "tmp");
+ else if (LIBitWidth > NTy->getBitWidth())
+ FromVal = Builder.CreateZExt(FromVal, IntegerType::get(LIBitWidth), "tmp");
+
+ // If the result is an integer, this is a trunc or bitcast.
+ if (isa<IntegerType>(ToType)) {
+ // Should be done.
+ } else if (ToType->isFloatingPoint() || isa<VectorType>(ToType)) {
+ // Just do a bitcast, we know the sizes match up.
+ FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp");
+ } else {
+ // Otherwise must be a pointer.
+ FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp");
+ }
+ assert(FromVal->getType() == ToType && "Didn't convert right?");
+ return FromVal;
+}
+
+
+/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
+/// or vector value "Old" at the offset specified by Offset.
+///
+/// This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
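+///
+/// Illustrative sketch (little-endian): inserting an i16 %sv at Offset 16
+/// into an i64 %old becomes roughly:
+///   %z = zext i16 %sv to i64
+///   %s = shl i64 %z, 16
+///   %m = and i64 %old, -4294901761   ; clear bits 16..31
+///   %r = or i64 %m, %s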
+Value *SROA::ConvertScalar_InsertValue(Value *SV, Value *Old,
+ uint64_t Offset, IRBuilder<> &Builder) {
+
+ // Convert the stored type to the actual type, shift it left to insert, and
+ // then 'or' it into place.
+ const Type *AllocaType = Old->getType();
+
+ if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
+ uint64_t VecSize = TD->getTypeAllocSizeInBits(VTy);
+ uint64_t ValSize = TD->getTypeAllocSizeInBits(SV->getType());
+
+ // Changing the whole vector with memset or with an access of a different
+ // vector type?
+ if (ValSize == VecSize)
+ return Builder.CreateBitCast(SV, AllocaType, "tmp");
+
+ uint64_t EltSize = TD->getTypeAllocSizeInBits(VTy->getElementType());
+
+ // Must be an element insertion.
+ unsigned Elt = Offset/EltSize;
+
+ if (SV->getType() != VTy->getElementType())
+ SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp");
+
+ SV = Builder.CreateInsertElement(Old, SV,
+ ConstantInt::get(Type::Int32Ty, Elt),
+ "tmp");
+ return SV;
+ }
+
+ // If SV is a first-class aggregate value, insert each value recursively.
+ if (const StructType *ST = dyn_cast<StructType>(SV->getType())) {
+ const StructLayout &Layout = *TD->getStructLayout(ST);
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+ Old = ConvertScalar_InsertValue(Elt, Old,
+ Offset+Layout.getElementOffsetInBits(i),
+ Builder);
+ }
+ return Old;
+ }
+
+ if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+ uint64_t EltSize = TD->getTypeAllocSizeInBits(AT->getElementType());
+ for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+ Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+ Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
+ }
+ return Old;
+ }
+
+ // If SV is a float, convert it to the appropriate integer type.
+ // If it is a pointer, do the same.
+ unsigned SrcWidth = TD->getTypeSizeInBits(SV->getType());
+ unsigned DestWidth = TD->getTypeSizeInBits(AllocaType);
+ unsigned SrcStoreWidth = TD->getTypeStoreSizeInBits(SV->getType());
+ unsigned DestStoreWidth = TD->getTypeStoreSizeInBits(AllocaType);
+ if (SV->getType()->isFloatingPoint() || isa<VectorType>(SV->getType()))
+ SV = Builder.CreateBitCast(SV, IntegerType::get(SrcWidth), "tmp");
+ else if (isa<PointerType>(SV->getType()))
+ SV = Builder.CreatePtrToInt(SV, TD->getIntPtrType(), "tmp");
+
+ // Zero extend or truncate the value if needed.
+ if (SV->getType() != AllocaType) {
+ if (SV->getType()->getPrimitiveSizeInBits() <
+ AllocaType->getPrimitiveSizeInBits())
+ SV = Builder.CreateZExt(SV, AllocaType, "tmp");
+ else {
+ // Truncation may be needed if storing more than the alloca can hold
+ // (undefined behavior).
+ SV = Builder.CreateTrunc(SV, AllocaType, "tmp");
+ SrcWidth = DestWidth;
+ SrcStoreWidth = DestStoreWidth;
+ }
+ }
+
+ // If this is a big-endian system and the store is narrower than the
+ // full alloca type, we need to do a shift to get the right bits.
+ int ShAmt = 0;
+ if (TD->isBigEndian()) {
+ // On big-endian machines, the lowest bit is stored at the bit offset
+ // from the pointer given by getTypeStoreSizeInBits. This matters for
+ // integers with a bitwidth that is not a multiple of 8.
+ ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
+ } else {
+ ShAmt = Offset;
+ }
+
+ // Note: we support negative shift amounts (handled with lshr), which are
+ // not defined for shl. We do this to support (f.e.) stores off the end
+ // of a structure where only some bits in the structure are set.
+ APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
+ if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
+ SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt), "tmp");
+ Mask <<= ShAmt;
+ } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
+ SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt), "tmp");
+ Mask = Mask.lshr(-ShAmt);
+ }
+
+ // Mask out the bits we are about to insert from the old value, and or
+ // in the new bits.
+ if (SrcWidth != DestWidth) {
+ assert(DestWidth > SrcWidth);
+ Old = Builder.CreateAnd(Old, ConstantInt::get(~Mask), "mask");
+ SV = Builder.CreateOr(Old, SV, "ins");
+ }
+ return SV;
+}
+
+
+
+/// PointsToConstantGlobal - Return true if V (possibly indirectly) points to
+/// some part of a constant global variable. This intentionally only accepts
+/// constant expressions because we can't rewrite arbitrary instructions.
+static bool PointsToConstantGlobal(Value *V) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ return GV->isConstant();
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::BitCast ||
+ CE->getOpcode() == Instruction::GetElementPtr)
+ return PointsToConstantGlobal(CE->getOperand(0));
+ return false;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
+/// pointer to an alloca. Ignore any reads of the pointer; return false if we
+/// see any stores or other unknown uses. If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with isOffset) but otherwise traverse
+/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
+/// the alloca, and if the source pointer is a pointer to a constant global, we
+/// can optimize this.
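+///
+/// For example (illustrative): an alloca whose only write is a memcpy whose
+/// source is a bitcast or all-zero GEP of a constant global @G qualifies, and
+/// TheCopy is set to that memcpy call.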
+static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy,
+ bool isOffset) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI))
+ // Ignore non-volatile loads, they are always ok.
+ if (!LI->isVolatile())
+ continue;
+
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(*UI)) {
+ // If uses of the bitcast are ok, we are ok.
+ if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset))
+ return false;
+ continue;
+ }
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) {
+ // If the GEP has all zero indices, it doesn't offset the pointer. If it
+ // doesn't, it does.
+ if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy,
+ isOffset || !GEP->hasAllZeroIndices()))
+ return false;
+ continue;
+ }
+
+ // If this isn't our memcpy/memmove, reject it as something we can't
+ // handle.
+ if (!isa<MemTransferInst>(*UI))
+ return false;
+
+ // If we already have seen a copy, reject the second one.
+ if (TheCopy) return false;
+
+ // If the pointer has been offset from the start of the alloca, we can't
+ // safely handle this.
+ if (isOffset) return false;
+
+ // If the memintrinsic isn't using the alloca as the dest, reject it.
+ if (UI.getOperandNo() != 1) return false;
+
+ MemIntrinsic *MI = cast<MemIntrinsic>(*UI);
+
+ // If the source of the memcpy/move is not a constant global, reject it.
+ if (!PointsToConstantGlobal(MI->getOperand(2)))
+ return false;
+
+ // Otherwise, the transform is safe. Remember the copy instruction.
+ TheCopy = MI;
+ }
+ return true;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Return the copying instruction if the
+/// specified alloca is only modified by a copy from a constant global, or
+/// null otherwise. If we can prove this, we can replace any uses of the
+/// alloca with uses of the global directly.
+Instruction *SROA::isOnlyCopiedFromConstantGlobal(AllocationInst *AI) {
+ Instruction *TheCopy = 0;
+ if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false))
+ return TheCopy;
+ return 0;
+}
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 0000000..b499279
--- /dev/null
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,238 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations. For example:
+//
+// * Removes basic blocks with no predecessors.
+// * Merges a basic block into its predecessor if there is only one and the
+// predecessor only has one successor.
+// * Eliminates PHI nodes for basic blocks with a single predecessor.
+// * Eliminates a basic block that only contains an unconditional branch.
+// * Changes invoke instructions to nounwind functions to be calls.
+// * Changes things like "if (x) if (y)" into "if (x&y)".
+// * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Attributes.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+namespace {
+ struct VISIBILITY_HIDDEN CFGSimplifyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CFGSimplifyPass() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F);
+ };
+}
+
+char CFGSimplifyPass::ID = 0;
+static RegisterPass<CFGSimplifyPass> X("simplifycfg", "Simplify the CFG");
+
+// Public interface to the CFGSimplification pass
+FunctionPass *llvm::createCFGSimplificationPass() {
+ return new CFGSimplifyPass();
+}
+
+/// ChangeToUnreachable - Insert an unreachable instruction before the specified
+/// instruction, making it and the rest of the code in the block dead.
+static void ChangeToUnreachable(Instruction *I) {
+ BasicBlock *BB = I->getParent();
+ // Loop over all of the successors, removing BB's entry from any PHI
+ // nodes.
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+ (*SI)->removePredecessor(BB);
+
+ new UnreachableInst(I);
+
+ // All instructions after this are dead.
+ BasicBlock::iterator BBI = I, BBE = BB->end();
+ while (BBI != BBE) {
+ if (!BBI->use_empty())
+ BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BB->getInstList().erase(BBI++);
+ }
+}
+
+/// ChangeToCall - Convert the specified invoke into a normal call.
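+/// For example (illustrative):
+///   invoke void @f() to label %ok unwind label %uw   ; @f is nounwind
+/// becomes
+///   call void @f()
+///   br label %ok
+/// and %uw loses this block as a predecessor.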
+static void ChangeToCall(InvokeInst *II) {
+ BasicBlock *BB = II->getParent();
+ SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
+ CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args.begin(),
+ Args.end(), "", II);
+ NewCall->takeName(II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ II->replaceAllUsesWith(NewCall);
+
+ // Follow the call by a branch to the normal destination.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Update PHI nodes in the unwind destination
+ II->getUnwindDest()->removePredecessor(BB);
+ BB->getInstList().erase(II);
+}
+
+static bool MarkAliveBlocks(BasicBlock *BB,
+ SmallPtrSet<BasicBlock*, 128> &Reachable) {
+
+ SmallVector<BasicBlock*, 128> Worklist;
+ Worklist.push_back(BB);
+ bool Changed = false;
+ while (!Worklist.empty()) {
+ BB = Worklist.back();
+ Worklist.pop_back();
+
+ if (!Reachable.insert(BB))
+ continue;
+
+ // Do a quick scan of the basic block, turning any obviously unreachable
+ // instructions into LLVM unreachable insts. The instruction combining pass
+ // canonicalizes unreachable insts into stores to null or undef.
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+ if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+ if (CI->doesNotReturn()) {
+ // If we found a call to a no-return function, insert an unreachable
+ // instruction after it. Make sure there isn't *already* one there
+ // though.
+ ++BBI;
+ if (!isa<UnreachableInst>(BBI)) {
+ ChangeToUnreachable(BBI);
+ Changed = true;
+ }
+ break;
+ }
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
+ if (isa<ConstantPointerNull>(SI->getOperand(1)) ||
+ isa<UndefValue>(SI->getOperand(1))) {
+ ChangeToUnreachable(SI);
+ Changed = true;
+ break;
+ }
+ }
+
+ // Turn invokes that call 'nounwind' functions into ordinary calls.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ if (II->doesNotThrow()) {
+ ChangeToCall(II);
+ Changed = true;
+ }
+
+ Changed |= ConstantFoldTerminator(BB);
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+ Worklist.push_back(*SI);
+ }
+ return Changed;
+}
+
+/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// if they are in a dead cycle. Return true if a change was made, false
+/// otherwise.
+static bool RemoveUnreachableBlocksFromFn(Function &F) {
+ SmallPtrSet<BasicBlock*, 128> Reachable;
+ bool Changed = MarkAliveBlocks(F.begin(), Reachable);
+
+  // If every block was found to be reachable, there is nothing to remove.
+  if (Reachable.size() == F.size())
+ return Changed;
+
+ assert(Reachable.size() < F.size());
+ NumSimpl += F.size()-Reachable.size();
+
+ // Loop over all of the basic blocks that are not reachable, dropping all of
+ // their internal references...
+ for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+ if (Reachable.count(BB))
+ continue;
+
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+ if (Reachable.count(*SI))
+ (*SI)->removePredecessor(BB);
+ BB->dropAllReferences();
+ }
+
+ for (Function::iterator I = ++F.begin(); I != F.end();)
+ if (!Reachable.count(I))
+ I = F.getBasicBlockList().erase(I);
+ else
+ ++I;
+
+ return true;
+}
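+
+// The two-phase teardown above matters for dead cycles: if blocks %a and %b
+// branch only to each other and neither is reachable from the entry block,
+// each still references the other, so we drop references from both blocks
+// first and only then erase them.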
+
+/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool IterativeSimplifyCFG(Function &F) {
+ bool Changed = false;
+ bool LocalChange = true;
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks (except the first one) and remove them
+ // if they are unneeded...
+ //
+ for (Function::iterator BBIt = ++F.begin(); BBIt != F.end(); ) {
+ if (SimplifyCFG(BBIt++)) {
+ LocalChange = true;
+ ++NumSimpl;
+ }
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
+// It is possible that we may require multiple passes over the code to fully
+// simplify the CFG.
+//
+bool CFGSimplifyPass::runOnFunction(Function &F) {
+ bool EverChanged = RemoveUnreachableBlocksFromFn(F);
+ EverChanged |= IterativeSimplifyCFG(F);
+
+ // If neither pass changed anything, we're done.
+ if (!EverChanged) return false;
+
+ // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens,
+ // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should
+ // iterate between the two optimizations. We structure the code like this to
+  // avoid rerunning IterativeSimplifyCFG if the second pass of
+ // RemoveUnreachableBlocksFromFn doesn't do anything.
+ if (!RemoveUnreachableBlocksFromFn(F))
+ return true;
+
+ do {
+ EverChanged = IterativeSimplifyCFG(F);
+ EverChanged |= RemoveUnreachableBlocksFromFn(F);
+ } while (EverChanged);
+
+ return true;
+}
diff --git a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
new file mode 100644
index 0000000..4aad17d
--- /dev/null
+++ b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
@@ -0,0 +1,159 @@
+//===- SimplifyHalfPowrLibCalls.cpp - Optimize specific half_powr calls ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple pass that applies an experimental
+// transformation on calls to specific functions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplify-libcalls-halfpowr"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+namespace {
+  /// This pass optimizes well-known half_powr function calls.
+ ///
+ class VISIBILITY_HIDDEN SimplifyHalfPowrLibCalls : public FunctionPass {
+ const TargetData *TD;
+ public:
+ static char ID; // Pass identification
+ SimplifyHalfPowrLibCalls() : FunctionPass(&ID) {}
+
+ bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ }
+
+ Instruction *
+ InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs,
+ Instruction *InsertPt);
+ };
+ char SimplifyHalfPowrLibCalls::ID = 0;
+} // end anonymous namespace.
+
+static RegisterPass<SimplifyHalfPowrLibCalls>
+X("simplify-libcalls-halfpowr", "Simplify half_powr library calls");
+
+// Public interface to the Simplify HalfPowr LibCalls pass.
+FunctionPass *llvm::createSimplifyHalfPowrLibCallsPass() {
+ return new SimplifyHalfPowrLibCalls();
+}
+
+/// InlineHalfPowrs - Inline a sequence of adjacent half_powr calls, rearranging
+/// their control flow to better facilitate subsequent optimization.
+Instruction *
+SimplifyHalfPowrLibCalls::InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs,
+ Instruction *InsertPt) {
+ std::vector<BasicBlock *> Bodies;
+ BasicBlock *NewBlock = 0;
+
+ for (unsigned i = 0, e = HalfPowrs.size(); i != e; ++i) {
+ CallInst *Call = cast<CallInst>(HalfPowrs[i]);
+ Function *Callee = Call->getCalledFunction();
+
+    // Minimally sanity-check the CFG of half_powr to ensure that it contains
+    // the kind of code we expect.  If we're running this pass, we have
+    // reason to believe it will be what we expect.
+ Function::iterator I = Callee->begin();
+ BasicBlock *Prologue = I++;
+ if (I == Callee->end()) break;
+ BasicBlock *SubnormalHandling = I++;
+ if (I == Callee->end()) break;
+ BasicBlock *Body = I++;
+ if (I != Callee->end()) break;
+ if (SubnormalHandling->getSinglePredecessor() != Prologue)
+ break;
+ BranchInst *PBI = dyn_cast<BranchInst>(Prologue->getTerminator());
+ if (!PBI || !PBI->isConditional())
+ break;
+ BranchInst *SNBI = dyn_cast<BranchInst>(SubnormalHandling->getTerminator());
+ if (!SNBI || SNBI->isConditional())
+ break;
+ if (!isa<ReturnInst>(Body->getTerminator()))
+ break;
+
+ Instruction *NextInst = next(BasicBlock::iterator(Call));
+
+ // Inline the call, taking care of what code ends up where.
+ NewBlock = SplitBlock(NextInst->getParent(), NextInst, this);
+
+    bool B = InlineFunction(Call, 0, TD);
+    assert(B && "half_powr didn't inline?");
+    B=B; // Reference B so Release builds don't warn about an unused variable.
+
+ BasicBlock *NewBody = NewBlock->getSinglePredecessor();
+ assert(NewBody);
+ Bodies.push_back(NewBody);
+ }
+
+ if (!NewBlock)
+ return InsertPt;
+
+ // Put the code for all the bodies into one block, to facilitate
+ // subsequent optimization.
+ (void)SplitEdge(NewBlock->getSinglePredecessor(), NewBlock, this);
+ for (unsigned i = 0, e = Bodies.size(); i != e; ++i) {
+ BasicBlock *Body = Bodies[i];
+ Instruction *FNP = Body->getFirstNonPHI();
+ // Splice the insts from body into NewBlock.
+ NewBlock->getInstList().splice(NewBlock->begin(), Body->getInstList(),
+ FNP, Body->getTerminator());
+ }
+
+ return NewBlock->begin();
+}
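+
+// The CFG shape checked above, sketched:
+//
+//   Prologue --(cond br)--> SubnormalHandling
+//      |                         |
+//      +--------> Body <--(br)---+      Body ends in 'ret'.
+//
+// Any deviation from this three-block form makes the loop bail out early.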
+
+/// runOnFunction - Top level algorithm.
+///
+bool SimplifyHalfPowrLibCalls::runOnFunction(Function &F) {
+ TD = &getAnalysis<TargetData>();
+
+ bool Changed = false;
+ std::vector<Instruction *> HalfPowrs;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // Look for calls.
+ bool IsHalfPowr = false;
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        // Look for direct calls to functions with external linkage.
+ Function *Callee = CI->getCalledFunction();
+ if (Callee && Callee->hasExternalLinkage()) {
+ // Look for calls with well-known names.
+ const char *CalleeName = Callee->getNameStart();
+ if (strcmp(CalleeName, "__half_powrf4") == 0)
+ IsHalfPowr = true;
+ }
+ }
+ if (IsHalfPowr)
+ HalfPowrs.push_back(I);
+ // We're looking for sequences of up to three such calls, which we'll
+ // simplify as a group.
+ if ((!IsHalfPowr && !HalfPowrs.empty()) || HalfPowrs.size() == 3) {
+ I = InlineHalfPowrs(HalfPowrs, I);
+ E = I->getParent()->end();
+ HalfPowrs.clear();
+ Changed = true;
+ }
+ }
+ assert(HalfPowrs.empty() && "Block had no terminator!");
+ }
+
+ return Changed;
+}
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
new file mode 100644
index 0000000..4b00640
--- /dev/null
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -0,0 +1,2429 @@
+//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple pass that applies a variety of small
+// optimizations for calls to specific well-known function calls (e.g. runtime
+// library functions). For example, a call to the function "exit(3)" that
+// occurs within the main() function can be transformed into a simple "return 3"
+// instruction. Any optimization that takes this form (replace call to library
+// function with simpler code that provides the same result) belongs in this
+// file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplify-libcalls"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Config/config.h"
+using namespace llvm;
+
+STATISTIC(NumSimplified, "Number of library calls simplified");
+STATISTIC(NumAnnotated, "Number of attributes added to library functions");
+
+//===----------------------------------------------------------------------===//
+// Optimizer Base Class
+//===----------------------------------------------------------------------===//
+
+/// This class is the abstract base class for the set of optimizations that
+/// corresponds to one library call.
+namespace {
+class VISIBILITY_HIDDEN LibCallOptimization {
+protected:
+ Function *Caller;
+ const TargetData *TD;
+public:
+ LibCallOptimization() { }
+ virtual ~LibCallOptimization() {}
+
+  /// CallOptimizer - This pure virtual method is implemented by subclasses to
+ /// do various optimizations. If this returns null then no transformation was
+ /// performed. If it returns CI, then it transformed the call and CI is to be
+ /// deleted. If it returns something else, replace CI with the new value and
+ /// delete CI.
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
+ =0;
+
+ Value *OptimizeCall(CallInst *CI, const TargetData &TD, IRBuilder<> &B) {
+ Caller = CI->getParent()->getParent();
+ this->TD = &TD;
+ return CallOptimizer(CI->getCalledFunction(), CI, B);
+ }
+
+ /// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
+ Value *CastToCStr(Value *V, IRBuilder<> &B);
+
+ /// EmitStrLen - Emit a call to the strlen function to the builder, for the
+ /// specified pointer. Ptr is required to be some pointer type, and the
+ /// return value has 'intptr_t' type.
+ Value *EmitStrLen(Value *Ptr, IRBuilder<> &B);
+
+ /// EmitMemCpy - Emit a call to the memcpy function to the builder. This
+ /// always expects that the size has type 'intptr_t' and Dst/Src are pointers.
+ Value *EmitMemCpy(Value *Dst, Value *Src, Value *Len,
+ unsigned Align, IRBuilder<> &B);
+
+ /// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
+ /// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
+ Value *EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B);
+
+ /// EmitMemCmp - Emit a call to the memcmp function.
+ Value *EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B);
+
+ /// EmitMemSet - Emit a call to the memset function
+ Value *EmitMemSet(Value *Dst, Value *Val, Value *Len, IRBuilder<> &B);
+
+  /// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name'
+  /// (e.g. 'floor'). This function is known to take a single argument of type
+  /// matching 'Op' and to return one value of the same type. If 'Op' is a
+  /// long double, an 'l' suffix is appended to the name; if 'Op' is a float,
+  /// an 'f' suffix is appended.
+ Value *EmitUnaryFloatFnCall(Value *Op, const char *Name, IRBuilder<> &B);
+
+ /// EmitPutChar - Emit a call to the putchar function. This assumes that Char
+ /// is an integer.
+ void EmitPutChar(Value *Char, IRBuilder<> &B);
+
+ /// EmitPutS - Emit a call to the puts function. This assumes that Str is
+ /// some pointer.
+ void EmitPutS(Value *Str, IRBuilder<> &B);
+
+ /// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
+ /// an i32, and File is a pointer to FILE.
+ void EmitFPutC(Value *Char, Value *File, IRBuilder<> &B);
+
+  /// EmitFPutS - Emit a call to the fputs function.  Str is required to be a
+ /// pointer and File is a pointer to FILE.
+ void EmitFPutS(Value *Str, Value *File, IRBuilder<> &B);
+
+ /// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is
+ /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
+ void EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B);
+
+};
+} // End anonymous namespace.
+
+/// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
+Value *LibCallOptimization::CastToCStr(Value *V, IRBuilder<> &B) {
+ return B.CreateBitCast(V, PointerType::getUnqual(Type::Int8Ty), "cstr");
+}
+
+/// EmitStrLen - Emit a call to the strlen function to the builder, for the
+/// specified pointer. This always returns an integer value of size intptr_t.
+Value *LibCallOptimization::EmitStrLen(Value *Ptr, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ AttributeWithIndex AWI[2];
+ AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+ AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
+ Attribute::NoUnwind);
+
+ Constant *StrLen =M->getOrInsertFunction("strlen", AttrListPtr::get(AWI, 2),
+ TD->getIntPtrType(),
+ PointerType::getUnqual(Type::Int8Ty),
+ NULL);
+ return B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen");
+}
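+
+// As a sketch of what this emits (hypothetical names, i64 intptr_t target):
+//   declare i64 @strlen(i8* nocapture) nounwind readonly
+//   %strlen = call i64 @strlen(i8* %cstr)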
+
+/// EmitMemCpy - Emit a call to the memcpy function to the builder. This always
+/// expects that the size has type 'intptr_t' and Dst/Src are pointers.
+Value *LibCallOptimization::EmitMemCpy(Value *Dst, Value *Src, Value *Len,
+ unsigned Align, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ Intrinsic::ID IID = Intrinsic::memcpy;
+ const Type *Tys[1];
+ Tys[0] = Len->getType();
+ Value *MemCpy = Intrinsic::getDeclaration(M, IID, Tys, 1);
+ return B.CreateCall4(MemCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Len,
+ ConstantInt::get(Type::Int32Ty, Align));
+}
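+
+// Sketch of the emitted call (hypothetical operands, i64 intptr_t target):
+//   call void @llvm.memcpy.i64(i8* %dst, i8* %src, i64 %len, i32 1)
+// The intrinsic is selected by the type of Len, so a 32-bit length would
+// yield @llvm.memcpy.i32 instead.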
+
+/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
+/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
+Value *LibCallOptimization::EmitMemChr(Value *Ptr, Value *Val,
+ Value *Len, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ AttributeWithIndex AWI;
+ AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
+
+ Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(&AWI, 1),
+ PointerType::getUnqual(Type::Int8Ty),
+ PointerType::getUnqual(Type::Int8Ty),
+ Type::Int32Ty, TD->getIntPtrType(),
+ NULL);
+ return B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr");
+}
+
+/// EmitMemCmp - Emit a call to the memcmp function.
+Value *LibCallOptimization::EmitMemCmp(Value *Ptr1, Value *Ptr2,
+ Value *Len, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ AttributeWithIndex AWI[3];
+ AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+ AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
+ AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
+ Attribute::NoUnwind);
+
+ Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI, 3),
+ Type::Int32Ty,
+ PointerType::getUnqual(Type::Int8Ty),
+ PointerType::getUnqual(Type::Int8Ty),
+ TD->getIntPtrType(), NULL);
+ return B.CreateCall3(MemCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B),
+ Len, "memcmp");
+}
+
+/// EmitMemSet - Emit a call to the memset function
+Value *LibCallOptimization::EmitMemSet(Value *Dst, Value *Val,
+ Value *Len, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ Intrinsic::ID IID = Intrinsic::memset;
+ const Type *Tys[1];
+ Tys[0] = Len->getType();
+ Value *MemSet = Intrinsic::getDeclaration(M, IID, Tys, 1);
+ Value *Align = ConstantInt::get(Type::Int32Ty, 1);
+ return B.CreateCall4(MemSet, CastToCStr(Dst, B), Val, Len, Align);
+}
+
+/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name'
+/// (e.g. 'floor'). This function is known to take a single argument of type
+/// matching 'Op' and to return one value of the same type. If 'Op' is a long
+/// double, an 'l' suffix is appended to the name; if 'Op' is a float, an 'f'
+/// suffix is appended.
+Value *LibCallOptimization::EmitUnaryFloatFnCall(Value *Op, const char *Name,
+ IRBuilder<> &B) {
+ char NameBuffer[20];
+ if (Op->getType() != Type::DoubleTy) {
+ // If we need to add a suffix, copy into NameBuffer.
+ unsigned NameLen = strlen(Name);
+ assert(NameLen < sizeof(NameBuffer)-2);
+ memcpy(NameBuffer, Name, NameLen);
+ if (Op->getType() == Type::FloatTy)
+ NameBuffer[NameLen] = 'f'; // floorf
+ else
+ NameBuffer[NameLen] = 'l'; // floorl
+ NameBuffer[NameLen+1] = 0;
+ Name = NameBuffer;
+ }
+
+ Module *M = Caller->getParent();
+ Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+ Op->getType(), NULL);
+ return B.CreateCall(Callee, Op, Name);
+}
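+
+// E.g. with Name == "floor": a double operand calls floor, a float operand
+// calls floorf, and any other FP type (treated as long double) calls floorl.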
+
+/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
+/// is an integer.
+void LibCallOptimization::EmitPutChar(Value *Char, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ Value *F = M->getOrInsertFunction("putchar", Type::Int32Ty,
+ Type::Int32Ty, NULL);
+ B.CreateCall(F, B.CreateIntCast(Char, Type::Int32Ty, "chari"), "putchar");
+}
+
+/// EmitPutS - Emit a call to the puts function. This assumes that Str is
+/// some pointer.
+void LibCallOptimization::EmitPutS(Value *Str, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ AttributeWithIndex AWI[2];
+ AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+ AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+
+ Value *F = M->getOrInsertFunction("puts", AttrListPtr::get(AWI, 2),
+ Type::Int32Ty,
+ PointerType::getUnqual(Type::Int8Ty), NULL);
+ B.CreateCall(F, CastToCStr(Str, B), "puts");
+}
+
+/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
+/// an integer and File is a pointer to FILE.
+void LibCallOptimization::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ AttributeWithIndex AWI[2];
+ AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
+ AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ Constant *F;
+ if (isa<PointerType>(File->getType()))
+ F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI, 2), Type::Int32Ty,
+ Type::Int32Ty, File->getType(), NULL);
+
+ else
+ F = M->getOrInsertFunction("fputc", Type::Int32Ty, Type::Int32Ty,
+ File->getType(), NULL);
+ Char = B.CreateIntCast(Char, Type::Int32Ty, "chari");
+ B.CreateCall2(F, Char, File, "fputc");
+}
+
+/// EmitFPutS - Emit a call to the fputs function.  Str is required to be a
+/// pointer and File is a pointer to FILE.
+void LibCallOptimization::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ AttributeWithIndex AWI[3];
+ AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+ AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
+ AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ Constant *F;
+ if (isa<PointerType>(File->getType()))
+ F = M->getOrInsertFunction("fputs", AttrListPtr::get(AWI, 3), Type::Int32Ty,
+ PointerType::getUnqual(Type::Int8Ty),
+ File->getType(), NULL);
+ else
+ F = M->getOrInsertFunction("fputs", Type::Int32Ty,
+ PointerType::getUnqual(Type::Int8Ty),
+ File->getType(), NULL);
+ B.CreateCall2(F, CastToCStr(Str, B), File, "fputs");
+}
+
+/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is
+/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
+void LibCallOptimization::EmitFWrite(Value *Ptr, Value *Size, Value *File,
+ IRBuilder<> &B) {
+ Module *M = Caller->getParent();
+ AttributeWithIndex AWI[3];
+ AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
+ AWI[1] = AttributeWithIndex::get(4, Attribute::NoCapture);
+ AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ Constant *F;
+ if (isa<PointerType>(File->getType()))
+ F = M->getOrInsertFunction("fwrite", AttrListPtr::get(AWI, 3),
+ TD->getIntPtrType(),
+ PointerType::getUnqual(Type::Int8Ty),
+ TD->getIntPtrType(), TD->getIntPtrType(),
+ File->getType(), NULL);
+ else
+ F = M->getOrInsertFunction("fwrite", TD->getIntPtrType(),
+ PointerType::getUnqual(Type::Int8Ty),
+ TD->getIntPtrType(), TD->getIntPtrType(),
+ File->getType(), NULL);
+ B.CreateCall4(F, CastToCStr(Ptr, B), Size,
+ ConstantInt::get(TD->getIntPtrType(), 1), File);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// GetStringLengthH - If we can compute the length of the string pointed to by
+/// the specified pointer, return 'len+1'. If we can't, return 0.
+static uint64_t GetStringLengthH(Value *V, SmallPtrSet<PHINode*, 32> &PHIs) {
+ // Look through noop bitcast instructions.
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(V))
+ return GetStringLengthH(BCI->getOperand(0), PHIs);
+
+ // If this is a PHI node, there are two cases: either we have already seen it
+ // or we haven't.
+ if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ if (!PHIs.insert(PN))
+ return ~0ULL; // already in the set.
+
+ // If it was new, see if all the input strings are the same length.
+ uint64_t LenSoFar = ~0ULL;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ uint64_t Len = GetStringLengthH(PN->getIncomingValue(i), PHIs);
+ if (Len == 0) return 0; // Unknown length -> unknown.
+
+ if (Len == ~0ULL) continue;
+
+ if (Len != LenSoFar && LenSoFar != ~0ULL)
+ return 0; // Disagree -> unknown.
+ LenSoFar = Len;
+ }
+
+ // Success, all agree.
+ return LenSoFar;
+ }
+
+  // strlen(select(c,x,y)) is known when strlen(x) and strlen(y) agree; a
+  // dead (~0ULL) side defers to the other.
+ if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+ uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs);
+ if (Len1 == 0) return 0;
+ uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs);
+ if (Len2 == 0) return 0;
+ if (Len1 == ~0ULL) return Len2;
+ if (Len2 == ~0ULL) return Len1;
+ if (Len1 != Len2) return 0;
+ return Len1;
+ }
+
+ // If the value is not a GEP instruction nor a constant expression with a
+ // GEP instruction, then return unknown.
+ User *GEP = 0;
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(V)) {
+ GEP = GEPI;
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->getOpcode() != Instruction::GetElementPtr)
+ return 0;
+ GEP = CE;
+ } else {
+ return 0;
+ }
+
+  // Make sure the GEP has exactly two indices (three operands in total).
+  if (GEP->getNumOperands() != 3)
+ return 0;
+
+ // Check to make sure that the first operand of the GEP is an integer and
+ // has value 0 so that we are sure we're indexing into the initializer.
+ if (ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(1))) {
+ if (!Idx->isZero())
+ return 0;
+ } else
+ return 0;
+
+ // If the second index isn't a ConstantInt, then this is a variable index
+ // into the array. If this occurs, we can't say anything meaningful about
+ // the string.
+ uint64_t StartIdx = 0;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(2)))
+ StartIdx = CI->getZExtValue();
+ else
+ return 0;
+
+  // The GEP, whether an instruction or a constant expression, must reference
+  // a global variable that is a constant with an initializer.  The referenced
+  // constant initializer is the array that we'll use for optimization.
+ GlobalVariable* GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
+ if (!GV || !GV->isConstant() || !GV->hasInitializer())
+ return 0;
+ Constant *GlobalInit = GV->getInitializer();
+
+ // Handle the ConstantAggregateZero case, which is a degenerate case. The
+ // initializer is constant zero so the length of the string must be zero.
+ if (isa<ConstantAggregateZero>(GlobalInit))
+ return 1; // Len = 0 offset by 1.
+
+  // Must be a ConstantArray of i8.
+  ConstantArray *Array = dyn_cast<ConstantArray>(GlobalInit);
+  if (!Array || Array->getType()->getElementType() != Type::Int8Ty)
+    return 0; // Not an i8 string -> unknown.
+
+ // Get the number of elements in the array
+ uint64_t NumElts = Array->getType()->getNumElements();
+
+ // Traverse the constant array from StartIdx (derived above) which is
+ // the place the GEP refers to in the array.
+ for (unsigned i = StartIdx; i != NumElts; ++i) {
+ Constant *Elt = Array->getOperand(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI) // This array isn't suitable, non-int initializer.
+ return 0;
+ if (CI->isZero())
+ return i-StartIdx+1; // We found end of string, success!
+ }
+
+ return 0; // The array isn't null terminated, conservatively return 'unknown'.
+}
+
+/// GetStringLength - If we can compute the length of the string pointed to by
+/// the specified pointer, return 'len+1'. If we can't, return 0.
+static uint64_t GetStringLength(Value *V) {
+ if (!isa<PointerType>(V->getType())) return 0;
+
+ SmallPtrSet<PHINode*, 32> PHIs;
+ uint64_t Len = GetStringLengthH(V, PHIs);
+  // If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so
+  // treat it as an empty string and return 1 (just the nul terminator).
+  return Len == ~0ULL ? 1 : Len;
+}
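+
+// Example of the 'len+1' convention: for a GEP to element 0 of the constant
+// global [4 x i8] c"abc\00", this returns 4; for a pointer that isn't a
+// constant, nul-terminated array it returns 0 ("unknown").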
+
+/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero.
+static bool IsOnlyUsedInZeroEqualityComparison(Value *V) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+ UI != E; ++UI) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+ if (IC->isEquality())
+ if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+ if (C->isNullValue())
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
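+
+// E.g. this returns true when every use looks like 'memcmp(a,b,n) == 0' or
+// '!= 0', and false as soon as the result is stored, returned, or compared
+// against a nonzero value.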
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous LibCall Optimizations
+//===----------------------------------------------------------------------===//
+
+namespace {
+//===---------------------------------------===//
+// 'exit' Optimizations
+
+/// ExitOpt - int main() { exit(4); } --> int main() { return 4; }
+struct VISIBILITY_HIDDEN ExitOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify we have a reasonable prototype for exit.
+ if (Callee->arg_size() == 0 || !CI->use_empty())
+ return 0;
+
+ // Verify the caller is main, and that the result type of main matches the
+ // argument type of exit.
+ if (!Caller->isName("main") || !Caller->hasExternalLinkage() ||
+ Caller->getReturnType() != CI->getOperand(1)->getType())
+ return 0;
+
+ TerminatorInst *OldTI = CI->getParent()->getTerminator();
+
+ // Create the return after the call.
+ ReturnInst *RI = B.CreateRet(CI->getOperand(1));
+
+ // Drop all successor phi node entries.
+ for (unsigned i = 0, e = OldTI->getNumSuccessors(); i != e; ++i)
+ OldTI->getSuccessor(i)->removePredecessor(CI->getParent());
+
+ // Erase all instructions from after our return instruction until the end of
+ // the block.
+ BasicBlock::iterator FirstDead = RI; ++FirstDead;
+ CI->getParent()->getInstList().erase(FirstDead, CI->getParent()->end());
+ return CI;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// String and Memory LibCall Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'strcat' Optimizations
+
+struct VISIBILITY_HIDDEN StrCatOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strcat" function prototype.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getReturnType() != PointerType::getUnqual(Type::Int8Ty) ||
+ FT->getParamType(0) != FT->getReturnType() ||
+ FT->getParamType(1) != FT->getReturnType())
+ return 0;
+
+ // Extract some information from the instruction
+ Value *Dst = CI->getOperand(1);
+ Value *Src = CI->getOperand(2);
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0) return 0;
+ --Len; // Unbias length.
+
+ // Handle the simple, do-nothing case: strcat(x, "") -> x
+ if (Len == 0)
+ return Dst;
+
+ EmitStrLenMemCpy(Src, Dst, Len, B);
+ return Dst;
+ }
+
+ void EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) {
+ // We need to find the end of the destination string. That's where the
+ // memory is to be moved to. We just generate a call to strlen.
+ Value *DstLen = EmitStrLen(Dst, B);
+
+ // Now that we have the destination's length, we must index into the
+ // destination's pointer to get the actual memcpy destination (end of
+ // the string .. we're concatenating).
+ Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
+
+ // We have enough information to now generate the memcpy call to do the
+ // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
+ EmitMemCpy(CpyDst, Src, ConstantInt::get(TD->getIntPtrType(), Len+1), 1, B);
+ }
+};
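+
+// Sketch of the lowering above (hypothetical IR, i64 intptr_t target):
+//   strcat(dst, "ab")
+// becomes
+//   %len = call i64 @strlen(i8* %dst)
+//   %endptr = getelementptr i8* %dst, i64 %len
+//   call void @llvm.memcpy.i64(i8* %endptr, i8* @str, i64 3, i32 1)
+// copying three bytes so the nul terminator comes along.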
+
+//===---------------------------------------===//
+// 'strncat' Optimizations
+
+struct VISIBILITY_HIDDEN StrNCatOpt : public StrCatOpt {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strncat" function prototype.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 ||
+ FT->getReturnType() != PointerType::getUnqual(Type::Int8Ty) ||
+ FT->getParamType(0) != FT->getReturnType() ||
+ FT->getParamType(1) != FT->getReturnType() ||
+ !isa<IntegerType>(FT->getParamType(2)))
+ return 0;
+
+ // Extract some information from the instruction
+ Value *Dst = CI->getOperand(1);
+ Value *Src = CI->getOperand(2);
+ uint64_t Len;
+
+ // We don't do anything if length is not constant
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3)))
+ Len = LengthArg->getZExtValue();
+ else
+ return 0;
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen == 0) return 0;
+ --SrcLen; // Unbias length.
+
+ // Handle the simple, do-nothing cases:
+ // strncat(x, "", c) -> x
+ // strncat(x, c, 0) -> x
+ if (SrcLen == 0 || Len == 0) return Dst;
+
+ // We don't optimize this case
+ if (Len < SrcLen) return 0;
+
+ // strncat(x, s, c) -> strcat(x, s)
+ // s is constant so the strcat can be optimized further
+ EmitStrLenMemCpy(Src, Dst, SrcLen, B);
+ return Dst;
+ }
+};
+
+//===---------------------------------------===//
+// 'strchr' Optimizations
+
+struct VISIBILITY_HIDDEN StrChrOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strchr" function prototype.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getReturnType() != PointerType::getUnqual(Type::Int8Ty) ||
+ FT->getParamType(0) != FT->getReturnType())
+ return 0;
+
+ Value *SrcStr = CI->getOperand(1);
+
+ // If the second operand is non-constant, see if we can compute the length
+ // of the input string and turn this into memchr.
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getOperand(2));
+ if (CharC == 0) {
+ uint64_t Len = GetStringLength(SrcStr);
+ if (Len == 0 || FT->getParamType(1) != Type::Int32Ty) // memchr needs i32.
+ return 0;
+
+ return EmitMemChr(SrcStr, CI->getOperand(2), // include nul.
+ ConstantInt::get(TD->getIntPtrType(), Len), B);
+ }
+
+ // Otherwise, the character is a constant, see if the first argument is
+ // a string literal. If so, we can constant fold.
+ std::string Str;
+ if (!GetConstantStringInfo(SrcStr, Str))
+ return 0;
+
+ // strchr can find the nul character.
+ Str += '\0';
+ char CharValue = CharC->getSExtValue();
+
+ // Compute the offset.
+ uint64_t i = 0;
+ while (1) {
+ if (i == Str.size()) // Didn't find the char. strchr returns null.
+ return Constant::getNullValue(CI->getType());
+ // Did we find our match?
+ if (Str[i] == CharValue)
+ break;
+ ++i;
+ }
+
+ // strchr(s+n,c) -> gep(s+n+i,c)
+ Value *Idx = ConstantInt::get(Type::Int64Ty, i);
+ return B.CreateGEP(SrcStr, Idx, "strchr");
+ }
+};
+
+//===---------------------------------------===//
+// 'strcmp' Optimizations
+
+struct VISIBILITY_HIDDEN StrCmpOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strcmp" function prototype.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 || FT->getReturnType() != Type::Int32Ty ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty))
+ return 0;
+
+ Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2);
+ if (Str1P == Str2P) // strcmp(x,x) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ std::string Str1, Str2;
+ bool HasStr1 = GetConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = GetConstantStringInfo(Str2P, Str2);
+
+ if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x
+ return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType());
+
+ if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+ return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+ // strcmp(x, y) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2)
+ return ConstantInt::get(CI->getType(), strcmp(Str1.c_str(),Str2.c_str()));
+
+ // strcmp(P, "x") -> memcmp(P, "x", 2)
+ uint64_t Len1 = GetStringLength(Str1P);
+ uint64_t Len2 = GetStringLength(Str2P);
+ if (Len1 || Len2) {
+      // Choose the smaller Len, excluding 0, which means 'unknown'.
+ if (!Len1 || (Len2 && Len2 < Len1))
+ Len1 = Len2;
+ return EmitMemCmp(Str1P, Str2P,
+ ConstantInt::get(TD->getIntPtrType(), Len1), B);
+ }
+
+ return 0;
+ }
+};
+
+//===---------------------------------------===//
+// 'strncmp' Optimizations
+
+struct VISIBILITY_HIDDEN StrNCmpOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strncmp" function prototype.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != Type::Int32Ty ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty) ||
+ !isa<IntegerType>(FT->getParamType(2)))
+ return 0;
+
+ Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2);
+ if (Str1P == Str2P) // strncmp(x,x,n) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ // Get the length argument if it is constant.
+ uint64_t Length;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3)))
+ Length = LengthArg->getZExtValue();
+ else
+ return 0;
+
+ if (Length == 0) // strncmp(x,y,0) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ std::string Str1, Str2;
+ bool HasStr1 = GetConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = GetConstantStringInfo(Str2P, Str2);
+
+ if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> *x
+ return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType());
+
+ if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
+ return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+    // strncmp(x, y, n) -> cnst  (if both x and y are constant strings)
+ if (HasStr1 && HasStr2)
+ return ConstantInt::get(CI->getType(),
+ strncmp(Str1.c_str(), Str2.c_str(), Length));
+ return 0;
+ }
+};
+
+
+//===---------------------------------------===//
+// 'strcpy' Optimizations
+
+struct VISIBILITY_HIDDEN StrCpyOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strcpy" function prototype.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty))
+ return 0;
+
+ Value *Dst = CI->getOperand(1), *Src = CI->getOperand(2);
+ if (Dst == Src) // strcpy(x,x) -> x
+ return Src;
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0) return 0;
+
+ // We have enough information to now generate the memcpy call to do the
+ // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
+ EmitMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(), Len), 1, B);
+ return Dst;
+ }
+};
+
+//===---------------------------------------===//
+// 'strncpy' Optimizations
+
+struct VISIBILITY_HIDDEN StrNCpyOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty) ||
+ !isa<IntegerType>(FT->getParamType(2)))
+ return 0;
+
+ Value *Dst = CI->getOperand(1);
+ Value *Src = CI->getOperand(2);
+ Value *LenOp = CI->getOperand(3);
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen == 0) return 0;
+ --SrcLen;
+
+ if (SrcLen == 0) {
+ // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+ EmitMemSet(Dst, ConstantInt::get(Type::Int8Ty, '\0'), LenOp, B);
+ return Dst;
+ }
+
+ uint64_t Len;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
+ Len = LengthArg->getZExtValue();
+ else
+ return 0;
+
+ if (Len == 0) return Dst; // strncpy(x, y, 0) -> x
+
+ // Let strncpy handle the zero padding
+ if (Len > SrcLen+1) return 0;
+
+ // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
+ EmitMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(), Len), 1, B);
+
+ return Dst;
+ }
+};
+
+//===---------------------------------------===//
+// 'strlen' Optimizations
+
+struct VISIBILITY_HIDDEN StrLenOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 1 ||
+ FT->getParamType(0) != PointerType::getUnqual(Type::Int8Ty) ||
+ !isa<IntegerType>(FT->getReturnType()))
+ return 0;
+
+ Value *Src = CI->getOperand(1);
+
+ // Constant folding: strlen("xyz") -> 3
+ if (uint64_t Len = GetStringLength(Src))
+ return ConstantInt::get(CI->getType(), Len-1);
+
+ // Handle strlen(p) != 0.
+ if (!IsOnlyUsedInZeroEqualityComparison(CI)) return 0;
+
+ // strlen(x) != 0 --> *x != 0
+ // strlen(x) == 0 --> *x == 0
+ return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
+ }
+};
+
+//===---------------------------------------===//
+// 'strto*' Optimizations
+
+struct VISIBILITY_HIDDEN StrToOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
+ !isa<PointerType>(FT->getParamType(0)) ||
+ !isa<PointerType>(FT->getParamType(1)))
+ return 0;
+
+ Value *EndPtr = CI->getOperand(2);
+ if (isa<ConstantPointerNull>(EndPtr)) {
+ CI->setOnlyReadsMemory();
+ CI->addAttribute(1, Attribute::NoCapture);
+ }
+
+ return 0;
+ }
+};
+
+
+//===---------------------------------------===//
+// 'memcmp' Optimizations
+
+struct VISIBILITY_HIDDEN MemCmpOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || !isa<PointerType>(FT->getParamType(0)) ||
+ !isa<PointerType>(FT->getParamType(1)) ||
+ FT->getReturnType() != Type::Int32Ty)
+ return 0;
+
+ Value *LHS = CI->getOperand(1), *RHS = CI->getOperand(2);
+
+ if (LHS == RHS) // memcmp(s,s,x) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // Make sure we have a constant length.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getOperand(3));
+ if (!LenC) return 0;
+ uint64_t Len = LenC->getZExtValue();
+
+ if (Len == 0) // memcmp(s1,s2,0) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ if (Len == 1) { // memcmp(S1,S2,1) -> *LHS - *RHS
+ Value *LHSV = B.CreateLoad(CastToCStr(LHS, B), "lhsv");
+ Value *RHSV = B.CreateLoad(CastToCStr(RHS, B), "rhsv");
+ return B.CreateSExt(B.CreateSub(LHSV, RHSV, "chardiff"), CI->getType());
+ }
+
+ // memcmp(S1,S2,2) != 0 -> (*(short*)LHS ^ *(short*)RHS) != 0
+ // memcmp(S1,S2,4) != 0 -> (*(int*)LHS ^ *(int*)RHS) != 0
+ if ((Len == 2 || Len == 4) && IsOnlyUsedInZeroEqualityComparison(CI)) {
+ const Type *PTy = PointerType::getUnqual(Len == 2 ?
+ Type::Int16Ty : Type::Int32Ty);
+ LHS = B.CreateBitCast(LHS, PTy, "tmp");
+ RHS = B.CreateBitCast(RHS, PTy, "tmp");
+ LoadInst *LHSV = B.CreateLoad(LHS, "lhsv");
+ LoadInst *RHSV = B.CreateLoad(RHS, "rhsv");
+ LHSV->setAlignment(1); RHSV->setAlignment(1); // Unaligned loads.
+ return B.CreateZExt(B.CreateXor(LHSV, RHSV, "shortdiff"), CI->getType());
+ }
+
+ return 0;
+ }
+};
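+
+// The Len == 2/4 case relies on equality-only uses: the XOR of the two
+// unaligned loads is zero exactly when all bytes match, e.g. (sketch)
+//   memcmp(p, q, 4) == 0   -->   (*(i32*)p ^ *(i32*)q) == 0
+// which is why IsOnlyUsedInZeroEqualityComparison gates the transform.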
+
+//===---------------------------------------===//
+// 'memcpy' Optimizations
+
+struct VISIBILITY_HIDDEN MemCpyOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !isa<PointerType>(FT->getParamType(0)) ||
+ !isa<PointerType>(FT->getParamType(1)) ||
+ FT->getParamType(2) != TD->getIntPtrType())
+ return 0;
+
+ // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+ EmitMemCpy(CI->getOperand(1), CI->getOperand(2), CI->getOperand(3), 1, B);
+ return CI->getOperand(1);
+ }
+};
+
+//===---------------------------------------===//
+// 'memmove' Optimizations
+
+struct VISIBILITY_HIDDEN MemMoveOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !isa<PointerType>(FT->getParamType(0)) ||
+ !isa<PointerType>(FT->getParamType(1)) ||
+ FT->getParamType(2) != TD->getIntPtrType())
+ return 0;
+
+ // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+ Module *M = Caller->getParent();
+ Intrinsic::ID IID = Intrinsic::memmove;
+ const Type *Tys[1];
+ Tys[0] = TD->getIntPtrType();
+ Value *MemMove = Intrinsic::getDeclaration(M, IID, Tys, 1);
+ Value *Dst = CastToCStr(CI->getOperand(1), B);
+ Value *Src = CastToCStr(CI->getOperand(2), B);
+ Value *Size = CI->getOperand(3);
+ Value *Align = ConstantInt::get(Type::Int32Ty, 1);
+ B.CreateCall4(MemMove, Dst, Src, Size, Align);
+ return CI->getOperand(1);
+ }
+};
+
+//===---------------------------------------===//
+// 'memset' Optimizations
+
+struct VISIBILITY_HIDDEN MemSetOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !isa<PointerType>(FT->getParamType(0)) ||
+ FT->getParamType(1) != TD->getIntPtrType() ||
+ FT->getParamType(2) != TD->getIntPtrType())
+ return 0;
+
+ // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+ Value *Val = B.CreateTrunc(CI->getOperand(2), Type::Int8Ty);
+ EmitMemSet(CI->getOperand(1), Val, CI->getOperand(3), B);
+ return CI->getOperand(1);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'pow*' Optimizations
+
+struct VISIBILITY_HIDDEN PowOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ // Just make sure this has 2 arguments of the same FP type, which match the
+ // result type.
+ if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ !FT->getParamType(0)->isFloatingPoint())
+ return 0;
+
+ Value *Op1 = CI->getOperand(1), *Op2 = CI->getOperand(2);
+ if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+ if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0
+ return Op1C;
+ if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x)
+ return EmitUnaryFloatFnCall(Op2, "exp2", B);
+ }
+
+ ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
+ if (Op2C == 0) return 0;
+
+ if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
+ return ConstantFP::get(CI->getType(), 1.0);
+
+ if (Op2C->isExactlyValue(0.5)) {
+ // FIXME: This is not safe for -0.0 and -inf. This can only be done when
+ // 'unsafe' math optimizations are allowed.
+ // x pow(x, 0.5) sqrt(x)
+ // ---------------------------------------------
+ // -0.0 +0.0 -0.0
+ // -inf +inf NaN
+#if 0
+ // pow(x, 0.5) -> sqrt(x)
+ return B.CreateCall(get_sqrt(), Op1, "sqrt");
+#endif
+ }
+
+ if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x
+ return Op1;
+ if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x
+ return B.CreateMul(Op1, Op1, "pow2");
+ if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
+ return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
+ return 0;
+ }
+};
+
+//===---------------------------------------===//
+// 'exp2' Optimizations
+
+struct VISIBILITY_HIDDEN Exp2Opt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ // Just make sure this has 1 argument of FP type, which matches the
+ // result type.
+ if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isFloatingPoint())
+ return 0;
+
+ Value *Op = CI->getOperand(1);
+ // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
+ // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
+ Value *LdExpArg = 0;
+ if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
+ if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
+ LdExpArg = B.CreateSExt(OpC->getOperand(0), Type::Int32Ty, "tmp");
+ } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
+ if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
+ LdExpArg = B.CreateZExt(OpC->getOperand(0), Type::Int32Ty, "tmp");
+ }
+
+ if (LdExpArg) {
+ const char *Name;
+ if (Op->getType() == Type::FloatTy)
+ Name = "ldexpf";
+ else if (Op->getType() == Type::DoubleTy)
+ Name = "ldexp";
+ else
+ Name = "ldexpl";
+
+ Constant *One = ConstantFP::get(APFloat(1.0f));
+ if (Op->getType() != Type::FloatTy)
+ One = ConstantExpr::getFPExtend(One, Op->getType());
+
+ Module *M = Caller->getParent();
+ Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+ Op->getType(), Type::Int32Ty,NULL);
+ return B.CreateCall2(Callee, One, LdExpArg);
+ }
+ return 0;
+ }
+};
+
+
+//===---------------------------------------===//
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+
+struct VISIBILITY_HIDDEN UnaryDoubleFPOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 1 || FT->getReturnType() != Type::DoubleTy ||
+ FT->getParamType(0) != Type::DoubleTy)
+ return 0;
+
+ // If this is something like 'floor((double)floatval)', convert to floorf.
+ FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getOperand(1));
+ if (Cast == 0 || Cast->getOperand(0)->getType() != Type::FloatTy)
+ return 0;
+
+ // floor((double)floatval) -> (double)floorf(floatval)
+ Value *V = Cast->getOperand(0);
+ V = EmitUnaryFloatFnCall(V, Callee->getNameStart(), B);
+ return B.CreateFPExt(V, Type::DoubleTy);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Integer Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'ffs*' Optimizations
+
+struct VISIBILITY_HIDDEN FFSOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has one integer argument and an i32 result.
+ if (FT->getNumParams() != 1 || FT->getReturnType() != Type::Int32Ty ||
+ !isa<IntegerType>(FT->getParamType(0)))
+ return 0;
+
+ Value *Op = CI->getOperand(1);
+
+    // Constant fold.  Note: don't shadow the CallInst parameter 'CI' here;
+    // the replacement value must have the call's return type (i32).
+    if (ConstantInt *CInt = dyn_cast<ConstantInt>(Op)) {
+      if (CInt->getValue() == 0) // ffs(0) -> 0.
+        return Constant::getNullValue(CI->getType());
+      return ConstantInt::get(Type::Int32Ty, // ffs(c) -> cttz(c)+1
+                              CInt->getValue().countTrailingZeros()+1);
+    }
+
+ // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+ const Type *ArgType = Op->getType();
+ Value *F = Intrinsic::getDeclaration(Callee->getParent(),
+ Intrinsic::cttz, &ArgType, 1);
+ Value *V = B.CreateCall(F, Op, "cttz");
+ V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp");
+ V = B.CreateIntCast(V, Type::Int32Ty, false, "tmp");
+
+ Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp");
+ return B.CreateSelect(Cond, V, ConstantInt::get(Type::Int32Ty, 0));
+ }
+};
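+
+// Worked example: ffs(8) sees 8 = 0b1000, cttz gives 3, and the expansion
+// yields select(8 != 0, 3+1, 0) = 4, matching libc; ffs(0) takes the other
+// select arm and yields 0.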
+
+//===---------------------------------------===//
+// 'isdigit' Optimizations
+
+struct VISIBILITY_HIDDEN IsDigitOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ // We require integer(i32)
+ if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+ FT->getParamType(0) != Type::Int32Ty)
+ return 0;
+
+ // isdigit(c) -> (c-'0') <u 10
+ Value *Op = CI->getOperand(1);
+ Op = B.CreateSub(Op, ConstantInt::get(Type::Int32Ty, '0'), "isdigittmp");
+ Op = B.CreateICmpULT(Op, ConstantInt::get(Type::Int32Ty, 10), "isdigit");
+ return B.CreateZExt(Op, CI->getType());
+ }
+};
+
+//===---------------------------------------===//
+// 'isascii' Optimizations
+
+struct VISIBILITY_HIDDEN IsAsciiOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ // We require integer(i32)
+ if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+ FT->getParamType(0) != Type::Int32Ty)
+ return 0;
+
+ // isascii(c) -> c <u 128
+ Value *Op = CI->getOperand(1);
+ Op = B.CreateICmpULT(Op, ConstantInt::get(Type::Int32Ty, 128), "isascii");
+ return B.CreateZExt(Op, CI->getType());
+ }
+};
+
+//===---------------------------------------===//
+// 'abs', 'labs', 'llabs' Optimizations
+
+struct VISIBILITY_HIDDEN AbsOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ // We require integer(integer) where the types agree.
+ if (FT->getNumParams() != 1 || !isa<IntegerType>(FT->getReturnType()) ||
+ FT->getParamType(0) != FT->getReturnType())
+ return 0;
+
+ // abs(x) -> x >s -1 ? x : -x
+ Value *Op = CI->getOperand(1);
+ Value *Pos = B.CreateICmpSGT(Op,ConstantInt::getAllOnesValue(Op->getType()),
+ "ispos");
+ Value *Neg = B.CreateNeg(Op, "neg");
+ return B.CreateSelect(Pos, Op, Neg);
+ }
+};
+
+
+//===---------------------------------------===//
+// 'toascii' Optimizations
+
+struct VISIBILITY_HIDDEN ToAsciiOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ const FunctionType *FT = Callee->getFunctionType();
+ // We require i32(i32)
+ if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != Type::Int32Ty)
+ return 0;
+
+    // toascii(c) -> c & 0x7f
+ return B.CreateAnd(CI->getOperand(1), ConstantInt::get(CI->getType(),0x7F));
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'printf' Optimizations
+
+struct VISIBILITY_HIDDEN PrintFOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Require one fixed pointer argument and an integer/void result.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() < 1 || !isa<PointerType>(FT->getParamType(0)) ||
+ !(isa<IntegerType>(FT->getReturnType()) ||
+ FT->getReturnType() == Type::VoidTy))
+ return 0;
+
+ // Check for a fixed format string.
+ std::string FormatStr;
+ if (!GetConstantStringInfo(CI->getOperand(1), FormatStr))
+ return 0;
+
+ // Empty format string -> noop.
+ if (FormatStr.empty()) // Tolerate printf's declared void.
+ return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 0);
+
+ // printf("x") -> putchar('x'), even for '%'.
+ if (FormatStr.size() == 1) {
+ EmitPutChar(ConstantInt::get(Type::Int32Ty, FormatStr[0]), B);
+ return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 1);
+ }
+
+ // printf("foo\n") --> puts("foo")
+ if (FormatStr[FormatStr.size()-1] == '\n' &&
+ FormatStr.find('%') == std::string::npos) { // no format characters.
+ // Create a string literal with no \n on it. We expect the constant merge
+ // pass to be run after this pass, to merge duplicate strings.
+ FormatStr.erase(FormatStr.end()-1);
+ Constant *C = ConstantArray::get(FormatStr, true);
+ C = new GlobalVariable(C->getType(), true,GlobalVariable::InternalLinkage,
+ C, "str", Callee->getParent());
+ EmitPutS(C, B);
+ return CI->use_empty() ? (Value*)CI :
+ ConstantInt::get(CI->getType(), FormatStr.size()+1);
+ }
+
+ // Optimize specific format strings.
+ // printf("%c", chr) --> putchar(*(i8*)dst)
+ if (FormatStr == "%c" && CI->getNumOperands() > 2 &&
+ isa<IntegerType>(CI->getOperand(2)->getType())) {
+ EmitPutChar(CI->getOperand(2), B);
+ return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 1);
+ }
+
+ // printf("%s\n", str) --> puts(str)
+ if (FormatStr == "%s\n" && CI->getNumOperands() > 2 &&
+ isa<PointerType>(CI->getOperand(2)->getType()) &&
+ CI->use_empty()) {
+ EmitPutS(CI->getOperand(2), B);
+ return CI;
+ }
+ return 0;
+ }
+};
+
+//===---------------------------------------===//
+// 'sprintf' Optimizations
+
+struct VISIBILITY_HIDDEN SPrintFOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Require two fixed pointer arguments and an integer result.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+ !isa<PointerType>(FT->getParamType(1)) ||
+ !isa<IntegerType>(FT->getReturnType()))
+ return 0;
+
+ // Check for a fixed format string.
+ std::string FormatStr;
+ if (!GetConstantStringInfo(CI->getOperand(2), FormatStr))
+ return 0;
+
+ // If we just have a format string (nothing else crazy) transform it.
+ if (CI->getNumOperands() == 3) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+ if (FormatStr[i] == '%')
+ return 0; // we found a format specifier, bail out.
+
+ // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+ EmitMemCpy(CI->getOperand(1), CI->getOperand(2), // Copy the nul byte.
+ ConstantInt::get(TD->getIntPtrType(), FormatStr.size()+1),1,B);
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4)
+ return 0;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!isa<IntegerType>(CI->getOperand(3)->getType())) return 0;
+ Value *V = B.CreateTrunc(CI->getOperand(3), Type::Int8Ty, "char");
+ Value *Ptr = CastToCStr(CI->getOperand(1), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(Ptr, ConstantInt::get(Type::Int32Ty, 1), "nul");
+ B.CreateStore(Constant::getNullValue(Type::Int8Ty), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+ // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+ if (!isa<PointerType>(CI->getOperand(3)->getType())) return 0;
+
+ Value *Len = EmitStrLen(CI->getOperand(3), B);
+ Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1),
+ "leninc");
+ EmitMemCpy(CI->getOperand(1), CI->getOperand(3), IncLen, 1, B);
+
+ // The sprintf result is the unincremented number of bytes in the string.
+ return B.CreateIntCast(Len, CI->getType(), false);
+ }
+ return 0;
+ }
+};
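+
+// For illustration, the sprintf rewrites sketched in C (memcpy stands in for
+// the llvm.memcpy intrinsic actually emitted):
+//
+//   sprintf(dst, "abc");    // --> memcpy(dst, "abc", 4);   result 3
+//   sprintf(dst, "%c", c);  // --> dst[0] = c; dst[1] = 0;  result 1
+//   sprintf(dst, "%s", s);  // --> memcpy(dst, s, strlen(s)+1);
+//                           //     result strlen(s)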
+
+//===---------------------------------------===//
+// 'fwrite' Optimizations
+
+struct VISIBILITY_HIDDEN FWriteOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require four fixed arguments (pointer, integer, integer, pointer) and
+    // an integer result.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 4 || !isa<PointerType>(FT->getParamType(0)) ||
+ !isa<IntegerType>(FT->getParamType(1)) ||
+ !isa<IntegerType>(FT->getParamType(2)) ||
+ !isa<PointerType>(FT->getParamType(3)) ||
+ !isa<IntegerType>(FT->getReturnType()))
+ return 0;
+
+ // Get the element size and count.
+ ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getOperand(2));
+ ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getOperand(3));
+ if (!SizeC || !CountC) return 0;
+ uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
+
+ // If this is writing zero records, remove the call (it's a noop).
+ if (Bytes == 0)
+ return ConstantInt::get(CI->getType(), 0);
+
+ // If this is writing one byte, turn it into fputc.
+ if (Bytes == 1) { // fwrite(S,1,1,F) -> fputc(S[0],F)
+ Value *Char = B.CreateLoad(CastToCStr(CI->getOperand(1), B), "char");
+ EmitFPutC(Char, CI->getOperand(4), B);
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ return 0;
+ }
+};
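+
+// For illustration, in C terms (the size and count must be compile-time
+// constants for these to fire):
+//
+//   fwrite(p, 0, n, f);  // --> removed;  result 0
+//   fwrite(p, 1, 1, f);  // --> fputc(*(const char *)p, f);  result 1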
+
+//===---------------------------------------===//
+// 'fputs' Optimizations
+
+struct VISIBILITY_HIDDEN FPutsOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two pointers. Also, we can't optimize if the return value is
+    // used.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+ !isa<PointerType>(FT->getParamType(1)) ||
+ !CI->use_empty())
+ return 0;
+
+ // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+ uint64_t Len = GetStringLength(CI->getOperand(1));
+ if (!Len) return 0;
+ EmitFWrite(CI->getOperand(1), ConstantInt::get(TD->getIntPtrType(), Len-1),
+ CI->getOperand(2), B);
+ return CI; // Known to have no uses (see above).
+ }
+};
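+
+// For illustration: when the argument is a constant string of known length
+// (and the fputs result is unused), the call becomes, per the comment above,
+// roughly
+//
+//   fputs("hello", f);  // --> fwrite("hello", 1, 5, f);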
+
+//===---------------------------------------===//
+// 'fprintf' Optimizations
+
+struct VISIBILITY_HIDDEN FPrintFOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed pointer parameters and an integer result.
+ const FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 || !isa<PointerType>(FT->getParamType(0)) ||
+ !isa<PointerType>(FT->getParamType(1)) ||
+ !isa<IntegerType>(FT->getReturnType()))
+ return 0;
+
+ // All the optimizations depend on the format string.
+ std::string FormatStr;
+ if (!GetConstantStringInfo(CI->getOperand(2), FormatStr))
+ return 0;
+
+ // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+ if (CI->getNumOperands() == 3) {
+ for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+ if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
+ return 0; // We found a format specifier.
+
+ EmitFWrite(CI->getOperand(2), ConstantInt::get(TD->getIntPtrType(),
+ FormatStr.size()),
+ CI->getOperand(1), B);
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+        CI->getNumOperands() < 4)
+ return 0;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // fprintf(F, "%c", chr) --> *(i8*)dst = chr
+ if (!isa<IntegerType>(CI->getOperand(3)->getType())) return 0;
+ EmitFPutC(CI->getOperand(3), CI->getOperand(1), B);
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+ // fprintf(F, "%s", str) -> fputs(str, F)
+ if (!isa<PointerType>(CI->getOperand(3)->getType()) || !CI->use_empty())
+ return 0;
+ EmitFPutS(CI->getOperand(3), CI->getOperand(1), B);
+ return CI;
+ }
+ return 0;
+ }
+};
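+
+// For illustration, the fprintf rewrites sketched in C:
+//
+//   fprintf(f, "foo");    // --> fwrite("foo", 3, 1, f);  result 3
+//   fprintf(f, "%c", c);  // --> fputc(c, f);             result 1
+//   fprintf(f, "%s", s);  // --> fputs(s, f);  (only if result is unused)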
+
+} // end anonymous namespace.
+
+//===----------------------------------------------------------------------===//
+// SimplifyLibCalls Pass Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// This pass optimizes well-known library functions from libc and libm.
+ ///
+ class VISIBILITY_HIDDEN SimplifyLibCalls : public FunctionPass {
+ StringMap<LibCallOptimization*> Optimizations;
+ // Miscellaneous LibCall Optimizations
+ ExitOpt Exit;
+ // String and Memory LibCall Optimizations
+ StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrCmpOpt StrCmp;
+ StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrNCpyOpt StrNCpy; StrLenOpt StrLen;
+ StrToOpt StrTo; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove;
+ MemSetOpt MemSet;
+ // Math Library Optimizations
+ PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP;
+ // Integer Optimizations
+ FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
+ ToAsciiOpt ToAscii;
+ // Formatting and IO Optimizations
+ SPrintFOpt SPrintF; PrintFOpt PrintF;
+ FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF;
+
+ bool Modified; // This is only used by doInitialization.
+ public:
+ static char ID; // Pass identification
+ SimplifyLibCalls() : FunctionPass(&ID) {}
+
+ void InitOptimizations();
+ bool runOnFunction(Function &F);
+
+ void setDoesNotAccessMemory(Function &F);
+ void setOnlyReadsMemory(Function &F);
+ void setDoesNotThrow(Function &F);
+ void setDoesNotCapture(Function &F, unsigned n);
+ void setDoesNotAlias(Function &F, unsigned n);
+ bool doInitialization(Module &M);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ }
+ };
+ char SimplifyLibCalls::ID = 0;
+} // end anonymous namespace.
+
+static RegisterPass<SimplifyLibCalls>
+X("simplify-libcalls", "Simplify well-known library calls");
+
+// Public interface to the Simplify LibCalls pass.
+FunctionPass *llvm::createSimplifyLibCallsPass() {
+ return new SimplifyLibCalls();
+}
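+
+// A minimal usage sketch (hypothetical driver code, not part of this file):
+//
+//   PassManager PM;
+//   PM.add(new TargetData(&M));  // required by the pass's getAnalysisUsage
+//   PM.add(createSimplifyLibCallsPass());
+//   PM.run(M);
+//
+// or, from the command line, via `opt -simplify-libcalls`.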
+
+/// InitOptimizations - Populate the Optimizations map with all the
+/// optimizations we know.
+void SimplifyLibCalls::InitOptimizations() {
+ // Miscellaneous LibCall Optimizations
+ Optimizations["exit"] = &Exit;
+
+ // String and Memory LibCall Optimizations
+ Optimizations["strcat"] = &StrCat;
+ Optimizations["strncat"] = &StrNCat;
+ Optimizations["strchr"] = &StrChr;
+ Optimizations["strcmp"] = &StrCmp;
+ Optimizations["strncmp"] = &StrNCmp;
+ Optimizations["strcpy"] = &StrCpy;
+ Optimizations["strncpy"] = &StrNCpy;
+ Optimizations["strlen"] = &StrLen;
+ Optimizations["strtol"] = &StrTo;
+ Optimizations["strtod"] = &StrTo;
+ Optimizations["strtof"] = &StrTo;
+ Optimizations["strtoul"] = &StrTo;
+ Optimizations["strtoll"] = &StrTo;
+ Optimizations["strtold"] = &StrTo;
+ Optimizations["strtoull"] = &StrTo;
+ Optimizations["memcmp"] = &MemCmp;
+ Optimizations["memcpy"] = &MemCpy;
+ Optimizations["memmove"] = &MemMove;
+ Optimizations["memset"] = &MemSet;
+
+ // Math Library Optimizations
+ Optimizations["powf"] = &Pow;
+ Optimizations["pow"] = &Pow;
+ Optimizations["powl"] = &Pow;
+ Optimizations["llvm.pow.f32"] = &Pow;
+ Optimizations["llvm.pow.f64"] = &Pow;
+ Optimizations["llvm.pow.f80"] = &Pow;
+ Optimizations["llvm.pow.f128"] = &Pow;
+ Optimizations["llvm.pow.ppcf128"] = &Pow;
+ Optimizations["exp2l"] = &Exp2;
+ Optimizations["exp2"] = &Exp2;
+ Optimizations["exp2f"] = &Exp2;
+ Optimizations["llvm.exp2.ppcf128"] = &Exp2;
+ Optimizations["llvm.exp2.f128"] = &Exp2;
+ Optimizations["llvm.exp2.f80"] = &Exp2;
+ Optimizations["llvm.exp2.f64"] = &Exp2;
+ Optimizations["llvm.exp2.f32"] = &Exp2;
+
+#ifdef HAVE_FLOORF
+ Optimizations["floor"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_CEILF
+ Optimizations["ceil"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_ROUNDF
+ Optimizations["round"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_RINTF
+ Optimizations["rint"] = &UnaryDoubleFP;
+#endif
+#ifdef HAVE_NEARBYINTF
+ Optimizations["nearbyint"] = &UnaryDoubleFP;
+#endif
+
+ // Integer Optimizations
+ Optimizations["ffs"] = &FFS;
+ Optimizations["ffsl"] = &FFS;
+ Optimizations["ffsll"] = &FFS;
+ Optimizations["abs"] = &Abs;
+ Optimizations["labs"] = &Abs;
+ Optimizations["llabs"] = &Abs;
+ Optimizations["isdigit"] = &IsDigit;
+ Optimizations["isascii"] = &IsAscii;
+ Optimizations["toascii"] = &ToAscii;
+
+ // Formatting and IO Optimizations
+ Optimizations["sprintf"] = &SPrintF;
+ Optimizations["printf"] = &PrintF;
+ Optimizations["fwrite"] = &FWrite;
+ Optimizations["fputs"] = &FPuts;
+ Optimizations["fprintf"] = &FPrintF;
+}
+
+
+/// runOnFunction - Top level algorithm.
+///
+bool SimplifyLibCalls::runOnFunction(Function &F) {
+ if (Optimizations.empty())
+ InitOptimizations();
+
+ const TargetData &TD = getAnalysis<TargetData>();
+
+ IRBuilder<> Builder;
+
+ bool Changed = false;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+ // Ignore non-calls.
+ CallInst *CI = dyn_cast<CallInst>(I++);
+ if (!CI) continue;
+
+ // Ignore indirect calls and calls to non-external functions.
+ Function *Callee = CI->getCalledFunction();
+ if (Callee == 0 || !Callee->isDeclaration() ||
+ !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage()))
+ continue;
+
+ // Ignore unknown calls.
+ const char *CalleeName = Callee->getNameStart();
+ StringMap<LibCallOptimization*>::iterator OMI =
+ Optimizations.find(CalleeName, CalleeName+Callee->getNameLen());
+ if (OMI == Optimizations.end()) continue;
+
+ // Set the builder to the instruction after the call.
+ Builder.SetInsertPoint(BB, I);
+
+ // Try to optimize this call.
+ Value *Result = OMI->second->OptimizeCall(CI, TD, Builder);
+ if (Result == 0) continue;
+
+ DEBUG(DOUT << "SimplifyLibCalls simplified: " << *CI;
+ DOUT << " into: " << *Result << "\n");
+
+ // Something changed!
+ Changed = true;
+ ++NumSimplified;
+
+ // Inspect the instruction after the call (which was potentially just
+ // added) next.
+ I = CI; ++I;
+
+ if (CI != Result && !CI->use_empty()) {
+ CI->replaceAllUsesWith(Result);
+ if (!Result->hasName())
+ Result->takeName(CI);
+ }
+ CI->eraseFromParent();
+ }
+ }
+ return Changed;
+}
+
+// Utility methods for doInitialization.
+
+void SimplifyLibCalls::setDoesNotAccessMemory(Function &F) {
+ if (!F.doesNotAccessMemory()) {
+ F.setDoesNotAccessMemory();
+ ++NumAnnotated;
+ Modified = true;
+ }
+}
+void SimplifyLibCalls::setOnlyReadsMemory(Function &F) {
+ if (!F.onlyReadsMemory()) {
+ F.setOnlyReadsMemory();
+ ++NumAnnotated;
+ Modified = true;
+ }
+}
+void SimplifyLibCalls::setDoesNotThrow(Function &F) {
+ if (!F.doesNotThrow()) {
+ F.setDoesNotThrow();
+ ++NumAnnotated;
+ Modified = true;
+ }
+}
+void SimplifyLibCalls::setDoesNotCapture(Function &F, unsigned n) {
+ if (!F.doesNotCapture(n)) {
+ F.setDoesNotCapture(n);
+ ++NumAnnotated;
+ Modified = true;
+ }
+}
+void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) {
+ if (!F.doesNotAlias(n)) {
+ F.setDoesNotAlias(n);
+ ++NumAnnotated;
+ Modified = true;
+ }
+}
+
+/// doInitialization - Add attributes to well-known functions.
+///
+bool SimplifyLibCalls::doInitialization(Module &M) {
+ Modified = false;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ Function &F = *I;
+ if (!F.isDeclaration())
+ continue;
+
+ unsigned NameLen = F.getNameLen();
+ if (!NameLen)
+ continue;
+
+ const FunctionType *FTy = F.getFunctionType();
+
+ const char *NameStr = F.getNameStart();
+ switch (NameStr[0]) {
+ case 's':
+ if (NameLen == 6 && !strcmp(NameStr, "strlen")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 6 && !strcmp(NameStr, "strcpy")) ||
+ (NameLen == 6 && !strcmp(NameStr, "stpcpy")) ||
+ (NameLen == 6 && !strcmp(NameStr, "strcat")) ||
+ (NameLen == 6 && !strcmp(NameStr, "strtol")) ||
+ (NameLen == 6 && !strcmp(NameStr, "strtod")) ||
+ (NameLen == 6 && !strcmp(NameStr, "strtof")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strtoul")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strtoll")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strtold")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strncat")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strncpy")) ||
+ (NameLen == 8 && !strcmp(NameStr, "strtoull"))) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 7 && !strcmp(NameStr, "strxfrm")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 6 && !strcmp(NameStr, "strcmp")) ||
+ (NameLen == 6 && !strcmp(NameStr, "strspn")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strncmp")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strcspn")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strcoll")) ||
+ (NameLen == 10 && !strcmp(NameStr, "strcasecmp")) ||
+ (NameLen == 11 && !strcmp(NameStr, "strncasecmp"))) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 6 && !strcmp(NameStr, "strstr")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strpbrk"))) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 6 && !strcmp(NameStr, "strtok")) ||
+ (NameLen == 8 && !strcmp(NameStr, "strtok_r"))) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "scanf")) ||
+ (NameLen == 6 && !strcmp(NameStr, "setbuf")) ||
+ (NameLen == 7 && !strcmp(NameStr, "setvbuf"))) {
+ if (FTy->getNumParams() < 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 6 && !strcmp(NameStr, "strdup")) ||
+ (NameLen == 7 && !strcmp(NameStr, "strndup"))) {
+ if (FTy->getNumParams() < 1 ||
+ !isa<PointerType>(FTy->getReturnType()) ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 4 && !strcmp(NameStr, "stat")) ||
+ (NameLen == 6 && !strcmp(NameStr, "sscanf")) ||
+ (NameLen == 7 && !strcmp(NameStr, "sprintf")) ||
+ (NameLen == 7 && !strcmp(NameStr, "statvfs"))) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 8 && !strcmp(NameStr, "snprintf")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(2)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 3);
+ } else if (NameLen == 9 && !strcmp(NameStr, "setitimer")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(1)) ||
+ !isa<PointerType>(FTy->getParamType(2)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ setDoesNotCapture(F, 3);
+ } else if (NameLen == 6 && !strcmp(NameStr, "system")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ // May throw; "system" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'm':
+ if (NameLen == 6 && !strcmp(NameStr, "memcmp")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 6 && !strcmp(NameStr, "memchr")) ||
+ (NameLen == 7 && !strcmp(NameStr, "memrchr"))) {
+ if (FTy->getNumParams() != 3)
+ continue;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ } else if ((NameLen == 4 && !strcmp(NameStr, "modf")) ||
+ (NameLen == 5 && !strcmp(NameStr, "modff")) ||
+ (NameLen == 5 && !strcmp(NameStr, "modfl")) ||
+ (NameLen == 6 && !strcmp(NameStr, "memcpy")) ||
+ (NameLen == 7 && !strcmp(NameStr, "memccpy")) ||
+ (NameLen == 7 && !strcmp(NameStr, "memmove"))) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 8 && !strcmp(NameStr, "memalign")) {
+ if (!isa<PointerType>(FTy->getReturnType()))
+ continue;
+ setDoesNotAlias(F, 0);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "mkdir")) ||
+ (NameLen == 6 && !strcmp(NameStr, "mktime"))) {
+ if (FTy->getNumParams() == 0 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'r':
+ if (NameLen == 7 && !strcmp(NameStr, "realloc")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getReturnType()))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ } else if (NameLen == 4 && !strcmp(NameStr, "read")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ // May throw; "read" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "rmdir")) ||
+ (NameLen == 6 && !strcmp(NameStr, "rewind")) ||
+ (NameLen == 6 && !strcmp(NameStr, "remove")) ||
+ (NameLen == 8 && !strcmp(NameStr, "realpath"))) {
+ if (FTy->getNumParams() < 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 6 && !strcmp(NameStr, "rename")) ||
+ (NameLen == 8 && !strcmp(NameStr, "readlink"))) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ }
+ break;
+ case 'w':
+ if (NameLen == 5 && !strcmp(NameStr, "write")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ // May throw; "write" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 2);
+ }
+ break;
+ case 'b':
+ if (NameLen == 5 && !strcmp(NameStr, "bcopy")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 4 && !strcmp(NameStr, "bcmp")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setOnlyReadsMemory(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 5 && !strcmp(NameStr, "bzero")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'c':
+ if (NameLen == 6 && !strcmp(NameStr, "calloc")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getReturnType()))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "chmod")) ||
+ (NameLen == 5 && !strcmp(NameStr, "chown")) ||
+ (NameLen == 7 && !strcmp(NameStr, "ctermid")) ||
+ (NameLen == 8 && !strcmp(NameStr, "clearerr")) ||
+ (NameLen == 8 && !strcmp(NameStr, "closedir"))) {
+ if (FTy->getNumParams() == 0 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'a':
+ if ((NameLen == 4 && !strcmp(NameStr, "atoi")) ||
+ (NameLen == 4 && !strcmp(NameStr, "atol")) ||
+ (NameLen == 4 && !strcmp(NameStr, "atof")) ||
+ (NameLen == 5 && !strcmp(NameStr, "atoll"))) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setOnlyReadsMemory(F);
+ setDoesNotCapture(F, 1);
+ } else if (NameLen == 6 && !strcmp(NameStr, "access")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'f':
+ if (NameLen == 5 && !strcmp(NameStr, "fopen")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getReturnType()) ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 6 && !strcmp(NameStr, "fdopen")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getReturnType()) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 4 && !strcmp(NameStr, "feof")) ||
+ (NameLen == 4 && !strcmp(NameStr, "free")) ||
+ (NameLen == 5 && !strcmp(NameStr, "fseek")) ||
+ (NameLen == 5 && !strcmp(NameStr, "ftell")) ||
+ (NameLen == 5 && !strcmp(NameStr, "fgetc")) ||
+ (NameLen == 6 && !strcmp(NameStr, "fseeko")) ||
+ (NameLen == 6 && !strcmp(NameStr, "ftello")) ||
+ (NameLen == 6 && !strcmp(NameStr, "fileno")) ||
+ (NameLen == 6 && !strcmp(NameStr, "fflush")) ||
+ (NameLen == 6 && !strcmp(NameStr, "fclose")) ||
+ (NameLen == 7 && !strcmp(NameStr, "fsetpos")) ||
+ (NameLen == 9 && !strcmp(NameStr, "flockfile")) ||
+ (NameLen == 11 && !strcmp(NameStr, "funlockfile")) ||
+ (NameLen == 12 && !strcmp(NameStr, "ftrylockfile"))) {
+ if (FTy->getNumParams() == 0 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if (NameLen == 6 && !strcmp(NameStr, "ferror")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setOnlyReadsMemory(F);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "fputc")) ||
+ (NameLen == 5 && !strcmp(NameStr, "fstat")) ||
+ (NameLen == 5 && !strcmp(NameStr, "frexp")) ||
+ (NameLen == 6 && !strcmp(NameStr, "frexpf")) ||
+ (NameLen == 6 && !strcmp(NameStr, "frexpl")) ||
+ (NameLen == 8 && !strcmp(NameStr, "fstatvfs"))) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 5 && !strcmp(NameStr, "fgets")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(2)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 3);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "fread")) ||
+ (NameLen == 6 && !strcmp(NameStr, "fwrite"))) {
+ if (FTy->getNumParams() != 4 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(3)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 4);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "fputs")) ||
+ (NameLen == 6 && !strcmp(NameStr, "fscanf")) ||
+ (NameLen == 7 && !strcmp(NameStr, "fprintf")) ||
+ (NameLen == 7 && !strcmp(NameStr, "fgetpos"))) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ }
+ break;
+ case 'g':
+ if ((NameLen == 4 && !strcmp(NameStr, "getc")) ||
+ (NameLen == 10 && !strcmp(NameStr, "getlogin_r")) ||
+ (NameLen == 13 && !strcmp(NameStr, "getc_unlocked"))) {
+ if (FTy->getNumParams() == 0 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if (NameLen == 6 && !strcmp(NameStr, "getenv")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setOnlyReadsMemory(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 4 && !strcmp(NameStr, "gets")) ||
+ (NameLen == 7 && !strcmp(NameStr, "getchar"))) {
+ setDoesNotThrow(F);
+ } else if (NameLen == 9 && !strcmp(NameStr, "getitimer")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 8 && !strcmp(NameStr, "getpwnam")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'u':
+ if (NameLen == 6 && !strcmp(NameStr, "ungetc")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "uname")) ||
+ (NameLen == 6 && !strcmp(NameStr, "unlink")) ||
+ (NameLen == 8 && !strcmp(NameStr, "unsetenv"))) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "utime")) ||
+ (NameLen == 6 && !strcmp(NameStr, "utimes"))) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ }
+ break;
+ case 'p':
+ if (NameLen == 4 && !strcmp(NameStr, "putc")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 4 && !strcmp(NameStr, "puts")) ||
+ (NameLen == 6 && !strcmp(NameStr, "printf")) ||
+ (NameLen == 6 && !strcmp(NameStr, "perror"))) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 5 && !strcmp(NameStr, "pread")) ||
+ (NameLen == 6 && !strcmp(NameStr, "pwrite"))) {
+ if (FTy->getNumParams() != 4 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ // May throw; these are valid pthread cancellation points.
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 7 && !strcmp(NameStr, "putchar")) {
+ setDoesNotThrow(F);
+ } else if (NameLen == 5 && !strcmp(NameStr, "popen")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getReturnType()) ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 6 && !strcmp(NameStr, "pclose")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'v':
+ if (NameLen == 6 && !strcmp(NameStr, "vscanf")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 7 && !strcmp(NameStr, "vsscanf")) ||
+ (NameLen == 7 && !strcmp(NameStr, "vfscanf"))) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(1)) ||
+ !isa<PointerType>(FTy->getParamType(2)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 6 && !strcmp(NameStr, "valloc")) {
+ if (!isa<PointerType>(FTy->getReturnType()))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ } else if (NameLen == 7 && !strcmp(NameStr, "vprintf")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 8 && !strcmp(NameStr, "vfprintf")) ||
+ (NameLen == 8 && !strcmp(NameStr, "vsprintf"))) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 9 && !strcmp(NameStr, "vsnprintf")) {
+ if (FTy->getNumParams() != 4 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(2)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 3);
+ }
+ break;
+ case 'o':
+ if (NameLen == 4 && !strcmp(NameStr, "open")) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ // May throw; "open" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 1);
+ } else if (NameLen == 7 && !strcmp(NameStr, "opendir")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getReturnType()) ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 't':
+ if (NameLen == 7 && !strcmp(NameStr, "tmpfile")) {
+ if (!isa<PointerType>(FTy->getReturnType()))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ } else if (NameLen == 5 && !strcmp(NameStr, "times")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'h':
+ if ((NameLen == 5 && !strcmp(NameStr, "htonl")) ||
+ (NameLen == 5 && !strcmp(NameStr, "htons"))) {
+ setDoesNotThrow(F);
+ setDoesNotAccessMemory(F);
+ }
+ break;
+ case 'n':
+ if ((NameLen == 5 && !strcmp(NameStr, "ntohl")) ||
+ (NameLen == 5 && !strcmp(NameStr, "ntohs"))) {
+ setDoesNotThrow(F);
+ setDoesNotAccessMemory(F);
+ }
+ break;
+ case 'l':
+ if (NameLen == 5 && !strcmp(NameStr, "lstat")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 6 && !strcmp(NameStr, "lchown")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ case 'q':
+ if (NameLen == 5 && !strcmp(NameStr, "qsort")) {
+ if (FTy->getNumParams() != 4 ||
+ !isa<PointerType>(FTy->getParamType(3)))
+ continue;
+ // May throw; places call through function pointer.
+ setDoesNotCapture(F, 4);
+ }
+ break;
+ case '_':
+ if ((NameLen == 8 && !strcmp(NameStr, "__strdup")) ||
+ (NameLen == 9 && !strcmp(NameStr, "__strndup"))) {
+ if (FTy->getNumParams() < 1 ||
+ !isa<PointerType>(FTy->getReturnType()) ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ } else if (NameLen == 10 && !strcmp(NameStr, "__strtok_r")) {
+ if (FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 8 && !strcmp(NameStr, "_IO_getc")) {
+ if (FTy->getNumParams() != 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if (NameLen == 8 && !strcmp(NameStr, "_IO_putc")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ }
+ break;
+ case 1:
+ if (NameLen == 15 && !strcmp(NameStr, "\1__isoc99_scanf")) {
+ if (FTy->getNumParams() < 1 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if ((NameLen == 7 && !strcmp(NameStr, "\1stat64")) ||
+ (NameLen == 8 && !strcmp(NameStr, "\1lstat64")) ||
+ (NameLen == 10 && !strcmp(NameStr, "\1statvfs64")) ||
+ (NameLen == 16 && !strcmp(NameStr, "\1__isoc99_sscanf"))) {
+        if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 8 && !strcmp(NameStr, "\1fopen64")) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getReturnType()) ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ } else if ((NameLen == 9 && !strcmp(NameStr, "\1fseeko64")) ||
+ (NameLen == 9 && !strcmp(NameStr, "\1ftello64"))) {
+ if (FTy->getNumParams() == 0 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ } else if (NameLen == 10 && !strcmp(NameStr, "\1tmpfile64")) {
+ if (!isa<PointerType>(FTy->getReturnType()))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ } else if ((NameLen == 8 && !strcmp(NameStr, "\1fstat64")) ||
+ (NameLen == 11 && !strcmp(NameStr, "\1fstatvfs64"))) {
+ if (FTy->getNumParams() != 2 ||
+ !isa<PointerType>(FTy->getParamType(1)))
+ continue;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ } else if (NameLen == 7 && !strcmp(NameStr, "\1open64")) {
+ if (FTy->getNumParams() < 2 ||
+ !isa<PointerType>(FTy->getParamType(0)))
+ continue;
+ // May throw; "open" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 1);
+ }
+ break;
+ }
+ }
+ return Modified;
+}
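+
+// As an illustration of doInitialization's effect (a sketch; the exact
+// attribute spelling and integer widths are assumptions): a declaration such
+// as
+//
+//   declare i64 @strlen(i8*)
+//
+// becomes, roughly,
+//
+//   declare i64 @strlen(i8* nocapture) nounwind readonly
+//
+// reflecting the setOnlyReadsMemory/setDoesNotThrow/setDoesNotCapture calls
+// made for "strlen" above.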
+
+// TODO:
+// Additional cases that we need to add to this file:
+//
+// cbrt:
+// * cbrt(expN(X)) -> expN(x/3)
+// * cbrt(sqrt(x)) -> pow(x,1/6)
+// * cbrt(sqrt(x)) -> pow(x,1/9)
+//
+// cos, cosf, cosl:
+// * cos(-x) -> cos(x)
+//
+// exp, expf, expl:
+// * exp(log(x)) -> x
+//
+// log, logf, logl:
+// * log(exp(x)) -> x
+// * log(x**y) -> y*log(x)
+// * log(exp(y)) -> y*log(e)
+// * log(exp2(y)) -> y*log(2)
+// * log(exp10(y)) -> y*log(10)
+// * log(sqrt(x)) -> 0.5*log(x)
+// * log(pow(x,y)) -> y*log(x)
+//
+// lround, lroundf, lroundl:
+// * lround(cnst) -> cnst'
+//
+// memcmp:
+// * memcmp(x,y,l) -> cnst
+// (if all arguments are constant and strlen(x) <= l and strlen(y) <= l)
+//
+// pow, powf, powl:
+// * pow(exp(x),y) -> exp(x*y)
+// * pow(sqrt(x),y) -> pow(x,y*0.5)
+// * pow(pow(x,y),z)-> pow(x,y*z)
+//
+// puts:
+// * puts("") -> putchar("\n")
+//
+// round, roundf, roundl:
+// * round(cnst) -> cnst'
+//
+// signbit:
+// * signbit(cnst) -> cnst'
+// * signbit(nncst) -> 0 (if pstv is a non-negative constant)
+//
+// sqrt, sqrtf, sqrtl:
+// * sqrt(expN(x)) -> expN(x*0.5)
+// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
+// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
+//
+// stpcpy:
+// * stpcpy(str, "literal") ->
+// llvm.memcpy(str,"literal",strlen("literal")+1,1)
+// strrchr:
+// * strrchr(s,c) -> reverse_offset_of_in(c,s)
+// (if c is a constant integer and s is a constant string)
+// * strrchr(s1,0) -> strchr(s1,0)
+//
+// strpbrk:
+// * strpbrk(s,a) -> offset_in_for(s,a)
+// (if s and a are both constant strings)
+// * strpbrk(s,"") -> 0
+// * strpbrk(s,a) -> strchr(s,a[0]) (if a is constant string of length 1)
+//
+// strspn, strcspn:
+// * strspn(s,a) -> const_int (if both args are constant)
+// * strspn("",a) -> 0
+// * strspn(s,"") -> 0
+// * strcspn(s,a) -> const_int (if both args are constant)
+// * strcspn("",a) -> 0
+// * strcspn(s,"") -> strlen(a)
+//
+// strstr:
+// * strstr(x,x) -> x
+// * strstr(s1,s2) -> offset_of_s2_in(s1)
+// (if s1 and s2 are constant strings)
+//
+// tan, tanf, tanl:
+// * tan(atan(x)) -> x
+//
+// trunc, truncf, truncl:
+// * trunc(cnst) -> cnst'
+//
+//
diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp
new file mode 100644
index 0000000..99a7dee
--- /dev/null
+++ b/lib/Transforms/Scalar/TailDuplication.cpp
@@ -0,0 +1,365 @@
+//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a limited form of tail duplication, intended to simplify
+// CFGs by removing some unconditional branches. This pass is necessary to
+// straighten out loops created by the C front-end, but also is capable of
+// making other code nicer. After this pass is run, the CFG simplify pass
+// should be run to clean up the mess.
+//
+// This pass could be enhanced in the future to use profile information to be
+// more aggressive.
+//
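+// For illustration, a sketch of the basic transformation on IR-like
+// pseudocode (hypothetical block names):
+//
+//   entry:  br label %tail        becomes      entry:  %x = add i32 %a, %b
+//   tail:   %x = add i32 %a, %b                        ret i32 %x
+//           ret i32 %x
+//
+// (%tail itself survives only while it still has other predecessors.)
+//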
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailduplicate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constant.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumEliminated, "Number of unconditional branches eliminated");
+
+static cl::opt<unsigned>
+TailDupThreshold("taildup-threshold",
+ cl::desc("Max block size to tail duplicate"),
+ cl::init(1), cl::Hidden);
+
+namespace {
+ class VISIBILITY_HIDDEN TailDup : public FunctionPass {
+ bool runOnFunction(Function &F);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ TailDup() : FunctionPass(&ID) {}
+
+ private:
+ inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned);
+ inline void eliminateUnconditionalBranch(BranchInst *BI);
+ SmallPtrSet<BasicBlock*, 4> CycleDetector;
+ };
+}
+
+char TailDup::ID = 0;
+static RegisterPass<TailDup> X("tailduplicate", "Tail Duplication");
+
+// Public interface to the Tail Duplication pass
+FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); }
+
+/// runOnFunction - Top level algorithm - Loop over each unconditional branch in
+/// the function, eliminating it if it looks attractive enough. CycleDetector
+/// prevents infinite loops by checking that we aren't redirecting a branch to
+/// a place it already pointed to earlier; see PR 2323.
+bool TailDup::runOnFunction(Function &F) {
+ bool Changed = false;
+ CycleDetector.clear();
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+ if (shouldEliminateUnconditionalBranch(I->getTerminator(),
+ TailDupThreshold)) {
+ eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator()));
+ Changed = true;
+ } else {
+ ++I;
+ CycleDetector.clear();
+ }
+ }
+ return Changed;
+}
+
+/// shouldEliminateUnconditionalBranch - Return true if this branch looks
+/// attractive to eliminate.  We eliminate the branch if the destination basic
+/// block has at most Threshold instructions in it, not counting PHI nodes or
+/// debug intrinsics.  Since one of the counted instructions is the
+/// terminator, this bounds how many instructions we add to the new block.
+///
+/// We don't count PHI nodes in the count since they will be removed when the
+/// contents of the block are copied over.
+///
+bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI,
+ unsigned Threshold) {
+ BranchInst *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isUnconditional()) return false; // Not an uncond branch!
+
+ BasicBlock *Dest = BI->getSuccessor(0);
+ if (Dest == BI->getParent()) return false; // Do not loop infinitely!
+
+ // Do not inline a block if we will just get another branch to the same block!
+ TerminatorInst *DTI = Dest->getTerminator();
+ if (BranchInst *DBI = dyn_cast<BranchInst>(DTI))
+ if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest)
+ return false; // Do not loop infinitely!
+
+ // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack,
+ // because doing so would require breaking critical edges. This should be
+ // fixed eventually.
+ if (!DTI->use_empty())
+ return false;
+
+  // Do not bother with blocks with only a single predecessor: the CFG
+  // simplification pass will fold these two blocks together!
+ pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest);
+ ++PI;
+ if (PI == PE) return false; // Exactly one predecessor!
+
+ BasicBlock::iterator I = Dest->getFirstNonPHI();
+
+ for (unsigned Size = 0; I != Dest->end(); ++I) {
+ if (Size == Threshold) return false; // The block is too large.
+
+ // Don't tail duplicate call instructions. They are very large compared to
+ // other instructions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) return false;
+
+    // Also don't tail duplicate allocation instructions (alloca and malloc).
+ if (isa<AllocationInst>(I)) return false;
+
+ // Some vector instructions can expand into a number of instructions.
+ if (isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<InsertElementInst>(I)) return false;
+
+ // Only count instructions that are not debugger intrinsics.
+ if (!isa<DbgInfoIntrinsic>(I)) ++Size;
+ }
+
+  // Do not tail duplicate a block with a large number of successors into
+  // predecessors when the block also has many other predecessors.  This can
+  // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in
+  // cases that have a large number of indirect gotos.
+ unsigned NumSuccs = DTI->getNumSuccessors();
+ if (NumSuccs > 8) {
+ unsigned TooMany = 128;
+ if (NumSuccs >= TooMany) return false;
+ TooMany = TooMany/NumSuccs;
+ for (; PI != PE; ++PI)
+ if (TooMany-- == 0) return false;
+ }
+
+ // If this unconditional branch is a fall-through, be careful about
+ // tail duplicating it. In particular, we don't want to taildup it if the
+ // original block will still be there after taildup is completed: doing so
+ // would eliminate the fall-through, requiring unconditional branches.
+ Function::iterator DestI = Dest;
+ if (&*--DestI == BI->getParent()) {
+    // The uncond branch is a fall-through.  Tail duplication of the block
+    // will eliminate the fall-through-ness and end up cloning the terminator
+ // at the end of the Dest block. Since the original Dest block will
+ // continue to exist, this means that one or the other will not be able to
+ // fall through. One typical example that this helps with is code like:
+ // if (a)
+ // foo();
+ // if (b)
+ // foo();
+ // Cloning the 'if b' block into the end of the first foo block is messy.
+
+ // The messy case is when the fall-through block falls through to other
+ // blocks. This is what we would be preventing if we cloned the block.
+ DestI = Dest;
+ if (++DestI != Dest->getParent()->end()) {
+ BasicBlock *DestSucc = DestI;
+ // If any of Dest's successors are fall-throughs, don't do this xform.
+ for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest);
+ SI != SE; ++SI)
+ if (*SI == DestSucc)
+ return false;
+ }
+ }
+
+ // Finally, check that we haven't redirected to this target block earlier;
+ // there are cases where we loop forever if we don't check this (PR 2323).
+ if (!CycleDetector.insert(Dest))
+ return false;
+
+ return true;
+}
+
+/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to
+/// DstBlock, and that SrcBlock is not the only predecessor of DstBlock. If we
+/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and
+/// DstBlock, return it.
+static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock,
+ BasicBlock *DstBlock) {
+ // SrcBlock must have a single predecessor.
+ pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock);
+ if (PI == PE || ++PI != PE) return 0;
+
+ BasicBlock *SrcPred = *pred_begin(SrcBlock);
+
+ // Look at the predecessors of DstBlock. One of them will be SrcBlock. If
+ // there is only one other pred, get it, otherwise we can't handle it.
+ PI = pred_begin(DstBlock); PE = pred_end(DstBlock);
+ BasicBlock *DstOtherPred = 0;
+ if (*PI == SrcBlock) {
+ if (++PI == PE) return 0;
+ DstOtherPred = *PI;
+ if (++PI != PE) return 0;
+ } else {
+ DstOtherPred = *PI;
+ if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0;
+ }
+
+ // We can handle two situations here: "if then" and "if then else" blocks. An
+ // 'if then' situation is just where DstOtherPred == SrcPred.
+ if (DstOtherPred == SrcPred)
+ return SrcPred;
+
+ // Check to see if we have an "if then else" situation, which means that
+ // DstOtherPred will have a single predecessor and it will be SrcPred.
+ PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred);
+ if (PI != PE && *PI == SrcPred) {
+ if (++PI != PE) return 0; // Not a single pred.
+    return SrcPred; // Otherwise it's an "if then else"; return the 'if'.
+ }
+
+ // Otherwise, this is something we can't handle.
+ return 0;
+}
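+
+// For illustration, the two CFG shapes FindObviousSharedDomOf recognizes
+// (it returns SrcPred in both cases):
+//
+//        SrcPred                  SrcPred
+//        /     \                  /     \
+//   SrcBlock    |            SrcBlock  DstOtherPred
+//        \     /                  \     /
+//        DstBlock                 DstBlock
+//
+//       "if then"              "if then else"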
+
+
+/// eliminateUnconditionalBranch - Clone the instructions from the destination
+/// block into the source block, eliminating the specified unconditional branch.
+/// If the destination block defines values used by successors of the dest
+/// block, we may need to insert PHI nodes.
+///
+void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) {
+ BasicBlock *SourceBlock = Branch->getParent();
+ BasicBlock *DestBlock = Branch->getSuccessor(0);
+ assert(SourceBlock != DestBlock && "Our predicate is broken!");
+
+ DOUT << "TailDuplication[" << SourceBlock->getParent()->getName()
+ << "]: Eliminating branch: " << *Branch;
+
+ // See if we can avoid duplicating code by moving it up to a dominator of both
+ // blocks.
+ if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) {
+ DOUT << "Found shared dominator: " << DomBlock->getName() << "\n";
+
+ // If there are non-phi instructions in DestBlock that have no operands
+ // defined in DestBlock, and if the instruction has no side effects, we can
+ // move the instruction to DomBlock instead of duplicating it.
+ BasicBlock::iterator BBI = DestBlock->getFirstNonPHI();
+ while (!isa<TerminatorInst>(BBI)) {
+ Instruction *I = BBI++;
+
+ bool CanHoist = !I->isTrapping() && !I->mayHaveSideEffects();
+ if (CanHoist) {
+ for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+ if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op)))
+ if (OpI->getParent() == DestBlock ||
+ (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) {
+ CanHoist = false;
+ break;
+ }
+ if (CanHoist) {
+ // Remove from DestBlock, move right before the term in DomBlock.
+ DestBlock->getInstList().remove(I);
+ DomBlock->getInstList().insert(DomBlock->getTerminator(), I);
+ DOUT << "Hoisted: " << *I;
+ }
+ }
+ }
+ }
+
+  // Tail duplication cannot update SSA properties correctly if the values
+ // defined in the duplicated tail are used outside of the tail itself. For
+ // this reason, we spill all values that are used outside of the tail to the
+ // stack.
+ for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I)
+ if (I->isUsedOutsideOfBlock(DestBlock)) {
+ // We found a use outside of the tail. Create a new stack slot to
+ // break this inter-block usage pattern.
+ DemoteRegToStack(*I);
+ }
+
+ // We are going to have to map operands from the original block B to the new
+ // copy of the block B'. If there are PHI nodes in the DestBlock, these PHI
+ // nodes also define part of this mapping. Loop over these PHI nodes, adding
+ // them to our mapping.
+ //
+ std::map<Value*, Value*> ValueMapping;
+
+ BasicBlock::iterator BI = DestBlock->begin();
+ bool HadPHINodes = isa<PHINode>(BI);
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock);
+
+ // Clone the non-phi instructions of the dest block into the source block,
+ // keeping track of the mapping...
+ //
+ for (; BI != DestBlock->end(); ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ SourceBlock->getInstList().push_back(New);
+ ValueMapping[BI] = New;
+ }
+
+ // Now that we have built the mapping information and cloned all of the
+ // instructions (giving us a new terminator, among other things), walk the new
+ // instructions, rewriting references of old instructions to use new
+ // instructions.
+ //
+ BI = Branch; ++BI; // Get an iterator to the first new instruction
+ for (; BI != SourceBlock->end(); ++BI)
+ for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i)
+ if (Value *Remapped = ValueMapping[BI->getOperand(i)])
+ BI->setOperand(i, Remapped);
+
+ // Next we check to see if any of the successors of DestBlock had PHI nodes.
+ // If so, we need to add entries to the PHI nodes for SourceBlock now.
+ for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock);
+ SI != SE; ++SI) {
+ BasicBlock *Succ = *SI;
+ for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) {
+ PHINode *PN = cast<PHINode>(PNI);
+ // Ok, we have a PHI node. Figure out what the incoming value was for the
+ // DestBlock.
+ Value *IV = PN->getIncomingValueForBlock(DestBlock);
+
+ // Remap the value if necessary...
+ if (Value *MappedIV = ValueMapping[IV])
+ IV = MappedIV;
+ PN->addIncoming(IV, SourceBlock);
+ }
+ }
+
+ // Next, remove the old branch instruction, and any PHI node entries that we
+ // had.
+ BI = Branch; ++BI; // Get an iterator to the first new instruction
+ DestBlock->removePredecessor(SourceBlock); // Remove entries in PHI nodes...
+ SourceBlock->getInstList().erase(Branch); // Destroy the uncond branch...
+
+ // Final step: now that we have finished everything up, walk the cloned
+ // instructions one last time, constant propagating and DCE'ing them, because
+ // they may not be needed anymore.
+ //
+ if (HadPHINodes) {
+ while (BI != SourceBlock->end()) {
+ Instruction *Inst = BI++;
+ if (isInstructionTriviallyDead(Inst))
+ Inst->eraseFromParent();
+ else if (Constant *C = ConstantFoldInstruction(Inst)) {
+ Inst->replaceAllUsesWith(C);
+ Inst->eraseFromParent();
+ }
+ }
+ }
+
+ ++NumEliminated; // We just killed a branch!
+}
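+
+// For illustration of the PHI handling above (a sketch with hypothetical
+// names): if DestBlock defined %v and a successor merged it via
+//
+//   %p = phi i32 [ %v, %DestBlock ], ...
+//
+// then after cloning, the successor's PHI gains an entry for the clone:
+//
+//   %p = phi i32 [ %v, %DestBlock ], [ %v.clone, %SourceBlock ], ...
+//
+// Values of DestBlock used outside the block were demoted to stack slots
+// beforehand, so the result remains valid SSA.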
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 0000000..682d069
--- /dev/null
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,479 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop. This pass also implements the following extensions to the basic
+// algorithm:
+//
+// 1. Trivial instructions between the call and return do not prevent the
+// transformation from taking place, though currently the analysis cannot
+// support moving any really useful instructions (only dead ones).
+// 2. This pass transforms functions that are prevented from being tail
+// recursive by an associative expression to use an accumulator variable,
+// thus compiling the typical naive factorial or 'fib' implementation into
+// efficient code.
+// 3. TRE is performed if the function returns void, if the return
+// returns the result returned by the call, or if the function returns a
+// run-time constant on all exits from the function. It is possible, though
+// unlikely, that the return returns something else (like constant 0), and
+// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
+// the function return the exact same value.
+// 4. If it can prove that callees do not access their caller's stack frame,
+// they are marked as eligible for tail call elimination (by the code
+// generator).
+//
+// There are several improvements that could be made:
+//
+// 1. If the function has any alloca instructions, these instructions will be
+// moved out of the entry block of the function, causing them to be
+// evaluated each time through the tail recursion. Safely keeping allocas
+//    in the entry block requires analysis to prove that the tail-called
+// function does not read or write the stack object.
+// 2. Tail recursion is only performed if the call immediately precedes the
+// return instruction. It's possible that there could be a jump between
+// the call and the return.
+// 3. There can be intervening operations between the call and the return that
+//    prevent the TRE from occurring. For example, there could be GEPs and
+// stores to memory that will not be read or written by the call. This
+// requires some substantial analysis (such as with DSA) to prove safe to
+// move ahead of the call, but doing so could allow many more TREs to be
+// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+// 4. The algorithm we use to detect if callees access their caller stack
+// frames is very primitive.
+//
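+// For illustration, extension #2 above applied to the classic factorial
+// (sketched in C; the pass performs the equivalent rewrite on the IR):
+//
+//   int fact(int n) {                    int fact(int n) {
+//     if (n <= 1) return 1;                int acc = 1;
+//     return n * fact(n - 1);              while (n > 1) { acc *= n; --n; }
+//   }                                      return acc;
+//                                        }
+//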
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailcallelim"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+namespace {
+ struct VISIBILITY_HIDDEN TailCallElim : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ TailCallElim() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F);
+
+ private:
+ bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ std::vector<PHINode*> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail);
+ bool CanMoveAboveCall(Instruction *I, CallInst *CI);
+ Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
+ };
+}
+
+char TailCallElim::ID = 0;
+static RegisterPass<TailCallElim> X("tailcallelim", "Tail Call Elimination");
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+ return new TailCallElim();
+}
+
+
+/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by
+/// callees of this function. We only do very simple analysis right now, this
+/// could be expanded in the future to use mod/ref information for particular
+/// call sites if desired.
+static bool AllocaMightEscapeToCalls(AllocaInst *AI) {
+ // FIXME: do simple 'address taken' analysis.
+ return true;
+}
+
+/// CheckForEscapingAllocas - Scan the specified basic block for alloca
+/// instructions. If it contains any that might be accessed by calls, return
+/// true.
+static bool CheckForEscapingAllocas(BasicBlock *BB,
+ bool &CannotTCETailMarkedCall) {
+ bool RetVal = false;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ RetVal |= AllocaMightEscapeToCalls(AI);
+
+ // If this alloca is in the body of the function, or if it is a variable
+ // sized allocation, we cannot tail call eliminate calls marked 'tail'
+ // with this mechanism.
+ if (BB != &BB->getParent()->getEntryBlock() ||
+ !isa<ConstantInt>(AI->getArraySize()))
+ CannotTCETailMarkedCall = true;
+ }
+ return RetVal;
+}
+
+bool TailCallElim::runOnFunction(Function &F) {
+ // If this function is a varargs function, we won't be able to PHI the args
+ // right, so don't even try to convert it...
+ if (F.getFunctionType()->isVarArg()) return false;
+
+ BasicBlock *OldEntry = 0;
+ bool TailCallsAreMarkedTail = false;
+ std::vector<PHINode*> ArgumentPHIs;
+ bool MadeChange = false;
+
+ bool FunctionContainsEscapingAllocas = false;
+
+  // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls
+  // marked with the 'tail' attribute, because doing so would cause the stack
+  // size to increase (a true tail call would deallocate variable sized
+  // allocas; this transformation doesn't).
+ bool CannotTCETailMarkedCall = false;
+
+ // Loop over the function, looking for any returning blocks, and keeping track
+ // of whether this function has any non-trivially used allocas.
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall)
+ break;
+
+ FunctionContainsEscapingAllocas |=
+ CheckForEscapingAllocas(BB, CannotTCETailMarkedCall);
+ }
+
+ /// FIXME: The code generator produces really bad code when an 'escaping
+ /// alloca' is changed from being a static alloca to being a dynamic alloca.
+ /// Until this is resolved, disable this transformation if that would ever
+ /// happen. This bug is PR962.
+ if (FunctionContainsEscapingAllocas)
+ return false;
+
+
+ // Second pass, change any tail calls to loops.
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator()))
+ MadeChange |= ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs,CannotTCETailMarkedCall);
+
+ // If we eliminated any tail recursions, it's possible that we inserted some
+ // silly PHI nodes which just merge an initial value (the incoming operand)
+ // with themselves. Check to see if we did and clean up our mess if so. This
+ // occurs when a function passes an argument straight through to its tail
+ // call.
+ if (!ArgumentPHIs.empty()) {
+ for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
+ PHINode *PN = ArgumentPHIs[i];
+
+      // If the PHI node resolves to a single incoming value, replace it with
+      // that value.
+ if (Value *PNV = PN->hasConstantValue()) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+ }
+
+ // Finally, if this function contains no non-escaping allocas, mark all calls
+ // in the function as eligible for tail calls (there is no stack memory for
+ // them to access).
+ if (!FunctionContainsEscapingAllocas)
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ CI->setTailCall();
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+
+/// CanMoveAboveCall - Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
+ // FIXME: We can move load/store/call/free instructions above the call if the
+ // call does not mod/ref the memory location being processed.
+ if (I->mayHaveSideEffects() || isa<LoadInst>(I))
+ return false;
+
+ // Otherwise, if this is a side-effect free instruction, check to make sure
+ // that it does not use the return value of the call. If it doesn't use the
+ // return value of the call, it must only use things that are defined before
+ // the call, or movable instructions between the call and the instruction
+ // itself.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (I->getOperand(i) == CI)
+ return false;
+ return true;
+}
+
+// isDynamicConstant - Return true if the specified value is the same when the
+// return would exit as it was when the initial iteration of the recursive
+// function was executed.
+//
+// We currently handle static constants and arguments that are not modified as
+// part of the recursion.
+//
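+// For example (sketch): in
+//   int f(int n, int step) { return n ? step + f(n - 1, step) : 0; }
+// the argument 'step' is passed through to the recursive call unchanged, so
+// it is dynamically constant; 'n' is not.
+//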
+static bool isDynamicConstant(Value *V, CallInst *CI) {
+ if (isa<Constant>(V)) return true; // Static constants are always dyn consts
+
+ // Check to see if this is an immutable argument, if so, the value
+ // will be available to initialize the accumulator.
+ if (Argument *Arg = dyn_cast<Argument>(V)) {
+ // Figure out which argument number this is...
+ unsigned ArgNo = 0;
+ Function *F = CI->getParent()->getParent();
+ for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI)
+ ++ArgNo;
+
+    // If we are passing this argument into the call as the corresponding
+    // argument operand, then the argument is dynamically constant.
+    // Otherwise, we cannot transform this function safely.
+ if (CI->getOperand(ArgNo+1) == Arg)
+ return true;
+ }
+ // Not a constant or immutable argument, we can't safely transform.
+ return false;
+}
+
+// getCommonReturnValue - Check to see if the function containing the specified
+// return instruction and tail call consistently returns the same
+// runtime-constant value at all exit points. If so, return the returned value.
+//
+static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) {
+ Function *F = TheRI->getParent()->getParent();
+ Value *ReturnedValue = 0;
+
+  // TODO: Handle multiple-value ret instructions.
+ if (isa<StructType>(F->getReturnType()))
+ return 0;
+
+ for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+ if (RI != TheRI) {
+ Value *RetOp = RI->getOperand(0);
+
+ // We can only perform this transformation if the value returned is
+ // evaluatable at the start of the initial invocation of the function,
+ // instead of at the end of the evaluation.
+ //
+ if (!isDynamicConstant(RetOp, CI))
+ return 0;
+
+ if (ReturnedValue && RetOp != ReturnedValue)
+ return 0; // Cannot transform if differing values are returned.
+ ReturnedValue = RetOp;
+ }
+ return ReturnedValue;
+}
+
+/// CanTransformAccumulatorRecursion - If the specified instruction can be
+/// transformed using accumulator recursion elimination, return the constant
+/// which is the start of the accumulator value. Otherwise return null.
+///
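+/// For instance (editor's sketch): in 'fact(n) = n * fact(n - 1)' the
+/// multiply follows the recursive call; it is associative, its only user is
+/// the return, and every other exit returns the constant 1, so 1 becomes the
+/// accumulator's initial value.
+///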
+Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
+ CallInst *CI) {
+ if (!I->isAssociative()) return 0;
+ assert(I->getNumOperands() == 2 &&
+ "Associative operations should have 2 args!");
+
+ // Exactly one operand should be the result of the call instruction...
+ if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+ (I->getOperand(0) != CI && I->getOperand(1) != CI))
+ return 0;
+
+ // The only user of this instruction we allow is a single return instruction.
+ if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back()))
+ return 0;
+
+ // Ok, now we have to check all of the other return instructions in this
+ // function. If they return non-constants or differing values, then we cannot
+ // transform the function safely.
+ return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI);
+}
+
+bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ std::vector<PHINode*> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail) {
+ BasicBlock *BB = Ret->getParent();
+ Function *F = BB->getParent();
+
+ if (&BB->front() == Ret) // Make sure there is something before the ret...
+ return false;
+
+  // If the return is in the entry block, then making this transformation would
+  // turn infinite recursion into an infinite loop.  This transformation is ok
+  // in theory, but breaks some code like:
+  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+  // We disable this xform in this case, because the code generator will lower
+  // the call to fabs into inline code.
+ if (BB == &F->getEntryBlock())
+ return false;
+
+ // Scan backwards from the return, checking to see if there is a tail call in
+ // this block. If so, set CI to it.
+ CallInst *CI;
+ BasicBlock::iterator BBI = Ret;
+ while (1) {
+ CI = dyn_cast<CallInst>(BBI);
+ if (CI && CI->getCalledFunction() == F)
+ break;
+
+ if (BBI == BB->begin())
+ return false; // Didn't find a potential tail call.
+ --BBI;
+ }
+
+ // If this call is marked as a tail call, and if there are dynamic allocas in
+ // the function, we cannot perform this optimization.
+ if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+ return false;
+
+ // If we are introducing accumulator recursion to eliminate associative
+ // operations after the call instruction, this variable contains the initial
+ // value for the accumulator. If this value is set, we actually perform
+ // accumulator recursion elimination instead of simple tail recursion
+ // elimination.
+ Value *AccumulatorRecursionEliminationInitVal = 0;
+ Instruction *AccumulatorRecursionInstr = 0;
+
+ // Ok, we found a potential tail call. We can currently only transform the
+ // tail call if all of the instructions between the call and the return are
+ // movable to above the call itself, leaving the call next to the return.
+ // Check that this is the case now.
+ for (BBI = CI, ++BBI; &*BBI != Ret; ++BBI)
+ if (!CanMoveAboveCall(BBI, CI)) {
+ // If we can't move the instruction above the call, it might be because it
+      // is an associative operation that could be transformed using accumulator
+ // recursion elimination. Check to see if this is the case, and if so,
+ // remember the initial accumulator value for later.
+ if ((AccumulatorRecursionEliminationInitVal =
+ CanTransformAccumulatorRecursion(BBI, CI))) {
+ // Yes, this is accumulator recursion. Remember which instruction
+ // accumulates.
+ AccumulatorRecursionInstr = BBI;
+ } else {
+ return false; // Otherwise, we cannot eliminate the tail recursion!
+ }
+ }
+
+  // We can only transform call/return pairs that either ignore the return
+  // value of the call and return void, ignore the value of the call and
+  // return a constant, return the value returned by the tail call, or are
+  // being transformed by accumulator recursion elimination.
+ if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
+ !isa<UndefValue>(Ret->getReturnValue()) &&
+ AccumulatorRecursionEliminationInitVal == 0 &&
+ !getCommonReturnValue(Ret, CI))
+ return false;
+
+ // OK! We can transform this tail call. If this is the first one found,
+ // create the new entry block, allowing us to branch back to the old entry.
+ if (OldEntry == 0) {
+ OldEntry = &F->getEntryBlock();
+ BasicBlock *NewEntry = BasicBlock::Create("", F, OldEntry);
+ NewEntry->takeName(OldEntry);
+ OldEntry->setName("tailrecurse");
+ BranchInst::Create(OldEntry, NewEntry);
+
+ // If this tail call is marked 'tail' and if there are any allocas in the
+ // entry block, move them up to the new entry block.
+ TailCallsAreMarkedTail = CI->isTailCall();
+ if (TailCallsAreMarkedTail)
+ // Move all fixed sized allocas from OldEntry to NewEntry.
+ for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
+ NEBI = NewEntry->begin(); OEBI != E; )
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+ if (isa<ConstantInt>(AI->getArraySize()))
+ AI->moveBefore(NEBI);
+
+ // Now that we have created a new block, which jumps to the entry
+ // block, insert a PHI node for each argument of the function.
+ // For now, we initialize each PHI to only have the real arguments
+ // which are passed in.
+ Instruction *InsertPos = OldEntry->begin();
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I) {
+ PHINode *PN = PHINode::Create(I->getType(),
+ I->getName() + ".tr", InsertPos);
+ I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+ PN->addIncoming(I, NewEntry);
+ ArgumentPHIs.push_back(PN);
+ }
+ }
+
+ // If this function has self recursive calls in the tail position where some
+ // are marked tail and some are not, only transform one flavor or another. We
+ // have to choose whether we move allocas in the entry block to the new entry
+ // block or not, so we can't make a good choice for both. NOTE: We could do
+ // slightly better here in the case that the function has no entry block
+ // allocas.
+ if (TailCallsAreMarkedTail && !CI->isTailCall())
+ return false;
+
+ // Ok, now that we know we have a pseudo-entry block WITH all of the
+ // required PHI nodes, add entries into the PHI node for the actual
+ // parameters passed into the tail-recursive call.
+ for (unsigned i = 0, e = CI->getNumOperands()-1; i != e; ++i)
+ ArgumentPHIs[i]->addIncoming(CI->getOperand(i+1), BB);
+
+ // If we are introducing an accumulator variable to eliminate the recursion,
+ // do so now. Note that we _know_ that no subsequent tail recursion
+ // eliminations will happen on this function because of the way the
+ // accumulator recursion predicate is set up.
+ //
+ if (AccumulatorRecursionEliminationInitVal) {
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ // Start by inserting a new PHI node for the accumulator.
+ PHINode *AccPN = PHINode::Create(AccRecInstr->getType(), "accumulator.tr",
+ OldEntry->begin());
+
+ // Loop over all of the predecessors of the tail recursion block. For the
+ // real entry into the function we seed the PHI with the initial value,
+ // computed earlier. For any other existing branches to this block (due to
+ // other tail recursions eliminated) the accumulator is not modified.
+ // Because we haven't added the branch in the current block to OldEntry yet,
+ // it will not show up as a predecessor.
+ for (pred_iterator PI = pred_begin(OldEntry), PE = pred_end(OldEntry);
+ PI != PE; ++PI) {
+ if (*PI == &F->getEntryBlock())
+ AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, *PI);
+ else
+ AccPN->addIncoming(AccPN, *PI);
+ }
+
+ // Add an incoming argument for the current block, which is computed by our
+ // associative accumulator instruction.
+ AccPN->addIncoming(AccRecInstr, BB);
+
+ // Next, rewrite the accumulator recursion instruction so that it does not
+ // use the result of the call anymore, instead, use the PHI node we just
+ // inserted.
+ AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+
+    // Finally, rewrite any return instructions in the function to return the
+    // PHI node instead of the "initval" that they do currently. This loop
+    // will actually rewrite the return value we are destroying, but that's
+    // ok.
+ for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+ RI->setOperand(0, AccPN);
+ ++NumAccumAdded;
+ }
+
+ // Now that all of the PHI nodes are in place, remove the call and
+ // ret instructions, replacing them with an unconditional branch.
+ BranchInst::Create(OldEntry, Ret);
+ BB->getInstList().erase(Ret); // Remove return.
+ BB->getInstList().erase(CI); // Remove call.
+ ++NumEliminated;
+ return true;
+}
diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp
new file mode 100644
index 0000000..71049fa
--- /dev/null
+++ b/lib/Transforms/Utils/AddrModeMatcher.cpp
@@ -0,0 +1,594 @@
+//===- AddrModeMatcher.cpp - Addressing mode matching facility --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements target addressing mode matcher class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instruction.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+void ExtAddrMode::print(OStream &OS) const {
+ bool NeedPlus = false;
+ OS << "[";
+ if (BaseGV) {
+ OS << (NeedPlus ? " + " : "")
+ << "GV:";
+ WriteAsOperand(*OS.stream(), BaseGV, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+
+ if (BaseOffs)
+ OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true;
+
+ if (BaseReg) {
+ OS << (NeedPlus ? " + " : "")
+ << "Base:";
+ WriteAsOperand(*OS.stream(), BaseReg, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+ if (Scale) {
+ OS << (NeedPlus ? " + " : "")
+ << Scale << "*";
+ WriteAsOperand(*OS.stream(), ScaledReg, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+
+ OS << ']';
+}
+
+void ExtAddrMode::dump() const {
+ print(cerr);
+ cerr << '\n';
+}
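+
+// Editor's sketch of the format print() emits: an ExtAddrMode with BaseGV
+// @g, BaseOffs 8, Scale 2 and ScaledReg %idx renders roughly as
+//   [GV:@g + 8 + 2*%idx]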
+
+
+/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
+/// Return true and update AddrMode if this addr mode is legal for the target,
+/// false if not.
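+///
+/// For example (sketch): starting from an empty addressing mode, matching
+/// ScaleReg '%x = add %a, 4' at Scale 2 can commit [%a*2 + 8], folding the
+/// add's constant through the scale as the code below does.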
+bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
+ unsigned Depth) {
+ // If Scale is 1, then this is the same as adding ScaleReg to the addressing
+ // mode. Just process that directly.
+ if (Scale == 1)
+ return MatchAddr(ScaleReg, Depth);
+
+ // If the scale is 0, it takes nothing to add this.
+ if (Scale == 0)
+ return true;
+
+ // If we already have a scale of this value, we can add to it, otherwise, we
+ // need an available scale field.
+ if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
+ return false;
+
+ ExtAddrMode TestAddrMode = AddrMode;
+
+ // Add scale to turn X*4+X*3 -> X*7. This could also do things like
+ // [A+B + A*7] -> [B+A*8].
+ TestAddrMode.Scale += Scale;
+ TestAddrMode.ScaledReg = ScaleReg;
+
+ // If the new address isn't legal, bail out.
+ if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy))
+ return false;
+
+ // It was legal, so commit it.
+ AddrMode = TestAddrMode;
+
+ // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
+ // to see if ScaleReg is actually X+C. If so, we can turn this into adding
+ // X*Scale + C*Scale to addr mode.
+ ConstantInt *CI = 0; Value *AddLHS = 0;
+ if (isa<Instruction>(ScaleReg) && // not a constant expr.
+ match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
+ TestAddrMode.ScaledReg = AddLHS;
+ TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
+
+ // If this addressing mode is legal, commit it and remember that we folded
+ // this instruction.
+ if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
+ AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
+ AddrMode = TestAddrMode;
+ return true;
+ }
+ }
+
+ // Otherwise, not (x+c)*scale, just return what we have.
+ return true;
+}
+
+/// MightBeFoldableInst - This is a little filter, which returns true if an
+/// addressing computation involving I might be folded into a load/store
+/// accessing it. This doesn't need to be perfect, but needs to accept at least
+/// the set of instructions that MatchOperationAddr can.
+static bool MightBeFoldableInst(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ // Don't touch identity bitcasts.
+ if (I->getType() == I->getOperand(0)->getType())
+ return false;
+ return isa<PointerType>(I->getType()) || isa<IntegerType>(I->getType());
+ case Instruction::PtrToInt:
+ // PtrToInt is always a noop, as we know that the int type is pointer sized.
+ return true;
+ case Instruction::IntToPtr:
+ // We know the input is intptr_t, so this is foldable.
+ return true;
+ case Instruction::Add:
+ return true;
+ case Instruction::Mul:
+ case Instruction::Shl:
+ // Can only handle X*C and X << C.
+ return isa<ConstantInt>(I->getOperand(1));
+ case Instruction::GetElementPtr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+/// MatchOperationAddr - Given an instruction or constant expr, see if we can
+/// fold the operation into the addressing mode. If so, update the addressing
+/// mode and return true, otherwise return false without modifying AddrMode.
+bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
+ unsigned Depth) {
+ // Avoid exponential behavior on extremely deep expression trees.
+ if (Depth >= 5) return false;
+
+ switch (Opcode) {
+ case Instruction::PtrToInt:
+ // PtrToInt is always a noop, as we know that the int type is pointer sized.
+ return MatchAddr(AddrInst->getOperand(0), Depth);
+ case Instruction::IntToPtr:
+ // This inttoptr is a no-op if the integer type is pointer sized.
+ if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
+ TLI.getPointerTy())
+ return MatchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ case Instruction::BitCast:
+ // BitCast is always a noop, and we can handle it as long as it is
+ // int->int or pointer->pointer (we don't want int<->fp or something).
+ if ((isa<PointerType>(AddrInst->getOperand(0)->getType()) ||
+ isa<IntegerType>(AddrInst->getOperand(0)->getType())) &&
+ // Don't touch identity bitcasts. These were probably put here by LSR,
+ // and we don't want to mess around with them. Assume it knows what it
+ // is doing.
+ AddrInst->getOperand(0)->getType() != AddrInst->getType())
+ return MatchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ case Instruction::Add: {
+ // Check to see if we can merge in the RHS then the LHS. If so, we win.
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+ if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
+ MatchAddr(AddrInst->getOperand(0), Depth+1))
+ return true;
+
+ // Restore the old addr mode info.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+
+ // Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
+ if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
+ MatchAddr(AddrInst->getOperand(1), Depth+1))
+ return true;
+
+ // Otherwise we definitely can't merge the ADD in.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ break;
+ }
+ //case Instruction::Or:
+ // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
+ //break;
+ case Instruction::Mul:
+ case Instruction::Shl: {
+ // Can only handle X*C and X << C.
+ ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+ if (!RHS) return false;
+ int64_t Scale = RHS->getSExtValue();
+ if (Opcode == Instruction::Shl)
+ Scale = 1 << Scale;
+
+ return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+ }
+ case Instruction::GetElementPtr: {
+    // Scan the GEP. We check whether it contains only constant offsets and
+    // at most one variable offset.
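+    // Sketch (hypothetical layout): for 'gep %S* %p, 0, 1, %i' where field 1
+    // starts at byte offset 4 and the indexed element type is 4 bytes wide,
+    // this scan yields ConstantOffset = 4 and records %i as the single
+    // variable index with scale 4.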
+ int VariableOperand = -1;
+ unsigned VariableScale = 0;
+
+ int64_t ConstantOffset = 0;
+ const TargetData *TD = TLI.getTargetData();
+ gep_type_iterator GTI = gep_type_begin(AddrInst);
+ for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = TD->getStructLayout(STy);
+ unsigned Idx =
+ cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
+ ConstantOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+ ConstantOffset += CI->getSExtValue()*TypeSize;
+ } else if (TypeSize) { // Scales of zero don't do anything.
+ // We only allow one variable index at the moment.
+ if (VariableOperand != -1)
+ return false;
+
+ // Remember the variable index.
+ VariableOperand = i;
+ VariableScale = TypeSize;
+ }
+ }
+ }
+
+ // A common case is for the GEP to only do a constant offset. In this case,
+ // just add it to the disp field and check validity.
+ if (VariableOperand == -1) {
+ AddrMode.BaseOffs += ConstantOffset;
+ if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
+ // Check to see if we can fold the base pointer in too.
+ if (MatchAddr(AddrInst->getOperand(0), Depth+1))
+ return true;
+ }
+ AddrMode.BaseOffs -= ConstantOffset;
+ return false;
+ }
+
+ // Save the valid addressing mode in case we can't match.
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ // See if the scale and offset amount is valid for this target.
+ AddrMode.BaseOffs += ConstantOffset;
+
+ // Match the base operand of the GEP.
+ if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
+ // If it couldn't be matched, just stuff the value in a register.
+ if (AddrMode.HasBaseReg) {
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ return false;
+ }
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = AddrInst->getOperand(0);
+ }
+
+ // Match the remaining variable portion of the GEP.
+ if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+ Depth)) {
+ // If it couldn't be matched, try stuffing the base into a register
+ // instead of matching it, and retrying the match of the scale.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ if (AddrMode.HasBaseReg)
+ return false;
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = AddrInst->getOperand(0);
+ AddrMode.BaseOffs += ConstantOffset;
+ if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
+ VariableScale, Depth)) {
+ // If even that didn't work, bail.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ return false;
+ }
+ }
+
+ return true;
+ }
+ }
+ return false;
+}
+
+/// MatchAddr - If we can, try to add the value of 'Addr' into the current
+/// addressing mode. If Addr can't be added to AddrMode this returns false and
+/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type
+/// or intptr_t for the target.
+///
+bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
+ // Fold in immediates if legal for the target.
+ AddrMode.BaseOffs += CI->getSExtValue();
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+ return true;
+ AddrMode.BaseOffs -= CI->getSExtValue();
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
+ // If this is a global variable, try to fold it into the addressing mode.
+ if (AddrMode.BaseGV == 0) {
+ AddrMode.BaseGV = GV;
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+ return true;
+ AddrMode.BaseGV = 0;
+ }
+ } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ // Check to see if it is possible to fold this operation.
+ if (MatchOperationAddr(I, I->getOpcode(), Depth)) {
+ // Okay, it's possible to fold this. Check to see if it is actually
+ // *profitable* to do so. We use a simple cost model to avoid increasing
+ // register pressure too much.
+ if (I->hasOneUse() ||
+ IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
+ AddrModeInsts.push_back(I);
+ return true;
+ }
+
+ // It isn't profitable to do this, roll back.
+ //cerr << "NOT FOLDING: " << *I;
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+ if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
+ return true;
+ } else if (isa<ConstantPointerNull>(Addr)) {
+ // Null pointer gets folded without affecting the addressing mode.
+ return true;
+ }
+
+  // Worst case, the target should support [reg] addressing modes. :)
+ if (!AddrMode.HasBaseReg) {
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = Addr;
+ // Still check for legality in case the target supports [imm] but not [i+r].
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+ return true;
+ AddrMode.HasBaseReg = false;
+ AddrMode.BaseReg = 0;
+ }
+
+ // If the base register is already taken, see if we can do [r+r].
+ if (AddrMode.Scale == 0) {
+ AddrMode.Scale = 1;
+ AddrMode.ScaledReg = Addr;
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+ return true;
+ AddrMode.Scale = 0;
+ AddrMode.ScaledReg = 0;
+ }
+ // Couldn't match.
+ return false;
+}
+
+
+/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
+/// inline asm call are due to memory operands. If so, return true, otherwise
+/// return false.
+static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
+ const TargetLowering &TLI) {
+ std::vector<InlineAsm::ConstraintInfo>
+ Constraints = IA->ParseConstraints();
+
+ unsigned ArgNo = 1; // ArgNo - The operand of the CallInst.
+ for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+ TargetLowering::AsmOperandInfo OpInfo(Constraints[i]);
+
+ // Compute the value type for each operand.
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ if (OpInfo.isIndirect)
+ OpInfo.CallOperandVal = CI->getOperand(ArgNo++);
+ break;
+ case InlineAsm::isInput:
+ OpInfo.CallOperandVal = CI->getOperand(ArgNo++);
+ break;
+ case InlineAsm::isClobber:
+ // Nothing to do.
+ break;
+ }
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(OpInfo, SDValue(),
+ OpInfo.ConstraintType == TargetLowering::C_Memory);
+
+ // If this asm operand is our Value*, and if it isn't an indirect memory
+ // operand, we can't fold it!
+ if (OpInfo.CallOperandVal == OpVal &&
+ (OpInfo.ConstraintType != TargetLowering::C_Memory ||
+ !OpInfo.isIndirect))
+ return false;
+ }
+
+ return true;
+}
+
+
+/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
+/// memory use. If we find an obviously non-foldable instruction, return true.
+/// Add the ultimately found memory instructions to MemoryUses.
+static bool FindAllMemoryUses(Instruction *I,
+ SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
+ SmallPtrSet<Instruction*, 16> &ConsideredInsts,
+ const TargetLowering &TLI) {
+ // If we already considered this instruction, we're done.
+ if (!ConsideredInsts.insert(I))
+ return false;
+
+ // If this is an obviously unfoldable instruction, bail out.
+ if (!MightBeFoldableInst(I))
+ return true;
+
+ // Loop over all the uses, recursively processing them.
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+ if (UI.getOperandNo() == 0) return true; // Storing addr, not into addr.
+ MemoryUses.push_back(std::make_pair(SI, UI.getOperandNo()));
+ continue;
+ }
+
+ if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+ InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
+ if (IA == 0) return true;
+
+ // If this is a memory operand, we're cool, otherwise bail out.
+ if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
+ return true;
+ continue;
+ }
+
+ if (FindAllMemoryUses(cast<Instruction>(*UI), MemoryUses, ConsideredInsts,
+ TLI))
+ return true;
+ }
+
+ return false;
+}
+
+
+/// ValueAlreadyLiveAtInst - Return true if Val is already known to be live at
+/// the use site that we're folding it into. If so, there is no cost to
+/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values
+/// that we know are live at the instruction already.
+bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
+ Value *KnownLive2) {
+ // If Val is either of the known-live values, we know it is live!
+ if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
+ return true;
+
+ // All values other than instructions and arguments (e.g. constants) are live.
+ if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
+
+ // If Val is a constant sized alloca in the entry block, it is live, this is
+ // true because it is just a reference to the stack/frame pointer, which is
+ // live for the whole function.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
+ if (AI->isStaticAlloca())
+ return true;
+
+ // Check to see if this value is already used in the memory instruction's
+ // block. If so, it's already live into the block at the very least, so we
+ // can reasonably fold it.
+ BasicBlock *MemBB = MemoryInst->getParent();
+ for (Value::use_iterator UI = Val->use_begin(), E = Val->use_end();
+ UI != E; ++UI)
+ // We know that uses of arguments and instructions have to be instructions.
+ if (cast<Instruction>(*UI)->getParent() == MemBB)
+ return true;
+
+ return false;
+}
+
+
+
+/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
+/// mode of the machine to fold the specified instruction into a load or store
+/// that ultimately uses it. However, the specified instruction has multiple
+/// uses. Given this, it may actually increase register pressure to fold it
+/// into the load. For example, consider this code:
+///
+/// X = ...
+/// Y = X+1
+/// use(Y) -> nonload/store
+/// Z = Y+1
+/// load Z
+///
+/// In this case, Y has multiple uses, and can be folded into the load of Z
+/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
+/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
+/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
+/// number of computations either.
+///
+/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
+/// X was live across 'load Z' for other reasons, we actually *would* want to
+/// fold the addressing mode in the Z case. This would make Y die earlier.
+bool AddressingModeMatcher::
+IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
+ ExtAddrMode &AMAfter) {
+ if (IgnoreProfitability) return true;
+
+ // AMBefore is the addressing mode before this instruction was folded into it,
+ // and AMAfter is the addressing mode after the instruction was folded. Get
+ // the set of registers referenced by AMAfter and subtract out those
+ // referenced by AMBefore: this is the set of values which folding in this
+ // address extends the lifetime of.
+ //
+ // Note that there are only two potential values being referenced here,
+ // BaseReg and ScaleReg (global addresses are always available, as are any
+ // folded immediates).
+ Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
+
+ // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
+ // lifetime wasn't extended by adding this instruction.
+ if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+ BaseReg = 0;
+ if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+ ScaledReg = 0;
+
+  // If folding this instruction (and its subexprs) didn't extend any live
+ // ranges, we're ok with it.
+ if (BaseReg == 0 && ScaledReg == 0)
+ return true;
+
+ // If all uses of this instruction are ultimately load/store/inlineasm's,
+ // check to see if their addressing modes will include this instruction. If
+ // so, we can fold it into all uses, so it doesn't matter if it has multiple
+ // uses.
+ SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
+ SmallPtrSet<Instruction*, 16> ConsideredInsts;
+ if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
+ return false; // Has a non-memory, non-foldable use!
+
+ // Now that we know that all uses of this instruction are part of a chain of
+ // computation involving only operations that could theoretically be folded
+ // into a memory use, loop over each of these uses and see if they could
+ // *actually* fold the instruction.
+ SmallVector<Instruction*, 32> MatchedAddrModeInsts;
+ for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
+ Instruction *User = MemoryUses[i].first;
+ unsigned OpNo = MemoryUses[i].second;
+
+ // Get the access type of this use. If the use isn't a pointer, we don't
+ // know what it accesses.
+ Value *Address = User->getOperand(OpNo);
+ if (!isa<PointerType>(Address->getType()))
+ return false;
+ const Type *AddressAccessTy =
+ cast<PointerType>(Address->getType())->getElementType();
+
+ // Do a match against the root of this address, ignoring profitability. This
+ // will tell us if the addressing mode for the memory operation will
+ // *actually* cover the shared instruction.
+ ExtAddrMode Result;
+ AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
+ MemoryInst, Result);
+ Matcher.IgnoreProfitability = true;
+ bool Success = Matcher.MatchAddr(Address, 0);
+ Success = Success; assert(Success && "Couldn't select *anything*?");
+
+ // If the match didn't cover I, then it won't be shared by it.
+ if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
+ I) == MatchedAddrModeInsts.end())
+ return false;
+
+ MatchedAddrModeInsts.clear();
+ }
+
+ return true;
+}
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
new file mode 100644
index 0000000..6d1180d
--- /dev/null
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -0,0 +1,622 @@
+//===-- BasicBlockUtils.cpp - BasicBlock Utilities ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs manipulations on basic blocks and the
+// instructions contained within them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Constant.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/ValueHandle.h"
+#include <algorithm>
+using namespace llvm;
+
+/// DeleteDeadBlock - Delete the specified block, which must have no
+/// predecessors.
+void llvm::DeleteDeadBlock(BasicBlock *BB) {
+ assert((pred_begin(BB) == pred_end(BB) ||
+ // Can delete self loop.
+ BB->getSinglePredecessor() == BB) && "Block is not dead!");
+ TerminatorInst *BBTerm = BB->getTerminator();
+
+ // Loop through all of our successors and make sure they know that one
+ // of their predecessors is going away.
+ for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i)
+ BBTerm->getSuccessor(i)->removePredecessor(BB);
+
+ // Zap all the instructions in the block.
+ while (!BB->empty()) {
+ Instruction &I = BB->back();
+    // If this instruction is used, replace uses with an arbitrary value.
+    // Because control flow can't get here, we don't care what we replace the
+    // value with. Note that since this block is unreachable and all values
+    // contained within it must dominate their uses, all uses will eventually
+    // be removed (they are themselves dead).
+ if (!I.use_empty())
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ BB->getInstList().pop_back();
+ }
+
+ // Zap the block!
+ BB->eraseFromParent();
+}
+
+/// FoldSingleEntryPHINodes - We know that BB has one predecessor. If there are
+/// any single-entry PHI nodes in it, fold them away. This handles the case
+/// when all entries to the PHI nodes in a block are guaranteed equal, such as
+/// when the block has exactly one predecessor.
+void llvm::FoldSingleEntryPHINodes(BasicBlock *BB) {
+ if (!isa<PHINode>(BB->begin()))
+ return;
+
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ PN->eraseFromParent();
+ }
+}
+
+
+/// DeleteDeadPHIs - Examine each PHI in the given block and delete it if it
+/// is dead. Also recursively delete any operands that become dead as
+/// a result. This includes tracing the def-use list from the PHI to see if
+/// it is ultimately unused or if it reaches an unused cycle.
+void llvm::DeleteDeadPHIs(BasicBlock *BB) {
+ // Recursively deleting a PHI may cause multiple PHIs to be deleted
+ // or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
+ SmallVector<WeakVH, 8> PHIs;
+ for (BasicBlock::iterator I = BB->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PHIs.push_back(PN);
+
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
+ RecursivelyDeleteDeadPHINode(PN);
+}
+
+/// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
+/// if possible. The return value indicates success or failure.
+bool llvm::MergeBlockIntoPredecessor(BasicBlock* BB, Pass* P) {
+ pred_iterator PI(pred_begin(BB)), PE(pred_end(BB));
+ // Can't merge the entry block.
+ if (pred_begin(BB) == pred_end(BB)) return false;
+
+ BasicBlock *PredBB = *PI++;
+  for (; PI != PE; ++PI) // Search all predecessors, see if they are all the same.
+ if (*PI != PredBB) {
+ PredBB = 0; // There are multiple different predecessors...
+ break;
+ }
+
+ // Can't merge if there are multiple predecessors.
+ if (!PredBB) return false;
+ // Don't break self-loops.
+ if (PredBB == BB) return false;
+ // Don't break invokes.
+ if (isa<InvokeInst>(PredBB->getTerminator())) return false;
+
+ succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));
+ BasicBlock* OnlySucc = BB;
+ for (; SI != SE; ++SI)
+ if (*SI != OnlySucc) {
+ OnlySucc = 0; // There are multiple distinct successors!
+ break;
+ }
+
+ // Can't merge if there are multiple successors.
+ if (!OnlySucc) return false;
+
+ // Can't merge if there is PHI loop.
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == PN)
+ return false;
+ } else
+ break;
+ }
+
+ // Begin by getting rid of unneeded PHIs.
+ while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ BB->getInstList().pop_front(); // Delete the phi node...
+ }
+
+ // Delete the unconditional branch from the predecessor...
+ PredBB->getInstList().pop_back();
+
+ // Move all definitions in the successor to the predecessor...
+ PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
+
+ // Make all PHI nodes that referred to BB now refer to Pred as their
+ // source...
+ BB->replaceAllUsesWith(PredBB);
+
+  // If the predecessor lacks a name, inherit the merged block's name.
+ if (!PredBB->hasName())
+ PredBB->takeName(BB);
+
+ // Finally, erase the old block and update dominator info.
+ if (P) {
+ if (DominatorTree* DT = P->getAnalysisIfAvailable<DominatorTree>()) {
+ DomTreeNode* DTN = DT->getNode(BB);
+ DomTreeNode* PredDTN = DT->getNode(PredBB);
+
+ if (DTN) {
+ SmallPtrSet<DomTreeNode*, 8> Children(DTN->begin(), DTN->end());
+ for (SmallPtrSet<DomTreeNode*, 8>::iterator DI = Children.begin(),
+ DE = Children.end(); DI != DE; ++DI)
+ DT->changeImmediateDominator(*DI, PredDTN);
+
+ DT->eraseNode(BB);
+ }
+ }
+ }
+
+ BB->eraseFromParent();
+
+ return true;
+}
+
+/// ReplaceInstWithValue - Replace all uses of an instruction (specified by BI)
+/// with a value, then remove and delete the original instruction.
+///
+void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
+ BasicBlock::iterator &BI, Value *V) {
+ Instruction &I = *BI;
+ // Replaces all of the uses of the instruction with uses of the value
+ I.replaceAllUsesWith(V);
+
+ // Make sure to propagate a name if there is one already.
+ if (I.hasName() && !V->hasName())
+ V->takeName(&I);
+
+ // Delete the unnecessary instruction now...
+ BI = BIL.erase(BI);
+}
+
+
+/// ReplaceInstWithInst - Replace the instruction specified by BI with the
+/// instruction specified by I. The original instruction is deleted and BI is
+/// updated to point to the new instruction.
+///
+void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
+ BasicBlock::iterator &BI, Instruction *I) {
+ assert(I->getParent() == 0 &&
+ "ReplaceInstWithInst: Instruction already inserted into basic block!");
+
+ // Insert the new instruction into the basic block...
+ BasicBlock::iterator New = BIL.insert(BI, I);
+
+ // Replace all uses of the old instruction, and delete it.
+ ReplaceInstWithValue(BIL, BI, I);
+
+ // Move BI back to point to the newly inserted instruction
+ BI = New;
+}
+
+/// ReplaceInstWithInst - Replace the instruction specified by From with the
+/// instruction specified by To.
+///
+void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
+ BasicBlock::iterator BI(From);
+ ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
+}
+
+/// RemoveSuccessor - Change the specified terminator instruction such that its
+/// successor SuccNum no longer exists. Because this reduces the outgoing
+/// degree of the current basic block, the actual terminator instruction itself
+/// may have to be changed. In the case where the last successor of the block
+/// is deleted, a return instruction is inserted in its place, which can cause
+/// a surprising change in program behavior.
+///
+void llvm::RemoveSuccessor(TerminatorInst *TI, unsigned SuccNum) {
+ assert(SuccNum < TI->getNumSuccessors() &&
+ "Trying to remove a nonexistant successor!");
+
+ // If our old successor block contains any PHI nodes, remove the entry in the
+ // PHI nodes that comes from this branch...
+ //
+ BasicBlock *BB = TI->getParent();
+ TI->getSuccessor(SuccNum)->removePredecessor(BB);
+
+ TerminatorInst *NewTI = 0;
+ switch (TI->getOpcode()) {
+ case Instruction::Br:
+ // If this is a conditional branch... convert to unconditional branch.
+ if (TI->getNumSuccessors() == 2) {
+ cast<BranchInst>(TI)->setUnconditionalDest(TI->getSuccessor(1-SuccNum));
+ } else { // Otherwise convert to a return instruction...
+ Value *RetVal = 0;
+
+      // Create a value to return... if the function doesn't return void...
+ if (BB->getParent()->getReturnType() != Type::VoidTy)
+ RetVal = Constant::getNullValue(BB->getParent()->getReturnType());
+
+ // Create the return...
+ NewTI = ReturnInst::Create(RetVal);
+ }
+ break;
+
+ case Instruction::Invoke: // Should convert to call
+ case Instruction::Switch: // Should remove entry
+ default:
+ case Instruction::Ret: // Cannot happen, has no successors!
+ assert(0 && "Unhandled terminator instruction type in RemoveSuccessor!");
+ abort();
+ }
+
+ if (NewTI) // If it's a different instruction, replace.
+ ReplaceInstWithInst(TI, NewTI);
+}
+
+/// SplitEdge - Split the edge connecting the specified blocks. Pass P must
+/// not be NULL.
+BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
+ TerminatorInst *LatchTerm = BB->getTerminator();
+ unsigned SuccNum = 0;
+#ifndef NDEBUG
+ unsigned e = LatchTerm->getNumSuccessors();
+#endif
+ for (unsigned i = 0; ; ++i) {
+ assert(i != e && "Didn't find edge?");
+ if (LatchTerm->getSuccessor(i) == Succ) {
+ SuccNum = i;
+ break;
+ }
+ }
+
+ // If this is a critical edge, let SplitCriticalEdge do it.
+ if (SplitCriticalEdge(BB->getTerminator(), SuccNum, P))
+ return LatchTerm->getSuccessor(SuccNum);
+
+ // If the edge isn't critical, then BB has a single successor or Succ has a
+ // single pred. Split the block.
+ BasicBlock::iterator SplitPoint;
+ if (BasicBlock *SP = Succ->getSinglePredecessor()) {
+ // If the successor only has a single pred, split the top of the successor
+ // block.
+ assert(SP == BB && "CFG broken");
+ SP = NULL;
+ return SplitBlock(Succ, Succ->begin(), P);
+ } else {
+ // Otherwise, if BB has a single successor, split it at the bottom of the
+ // block.
+ assert(BB->getTerminator()->getNumSuccessors() == 1 &&
+ "Should have a single succ!");
+ return SplitBlock(BB, BB->getTerminator(), P);
+ }
+}
+
+/// SplitBlock - Split the specified block at the specified instruction -
+/// everything before SplitPt stays in Old and everything starting with
+/// SplitPt moves to a new block. The two blocks are joined by an
+/// unconditional branch and the loop info is updated.
+///
+BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
+ BasicBlock::iterator SplitIt = SplitPt;
+ while (isa<PHINode>(SplitIt))
+ ++SplitIt;
+ BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split");
+
+ // The new block lives in whichever loop the old one did.
+ if (LoopInfo* LI = P->getAnalysisIfAvailable<LoopInfo>())
+ if (Loop *L = LI->getLoopFor(Old))
+ L->addBasicBlockToLoop(New, LI->getBase());
+
+  if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) {
+    // Old dominates New. The new node dominates all other nodes dominated by
+    // Old.
+ DomTreeNode *OldNode = DT->getNode(Old);
+ std::vector<DomTreeNode *> Children;
+ for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
+ I != E; ++I)
+ Children.push_back(*I);
+
+ DomTreeNode *NewNode = DT->addNewBlock(New,Old);
+
+ for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
+ E = Children.end(); I != E; ++I)
+ DT->changeImmediateDominator(*I, NewNode);
+ }
+
+ if (DominanceFrontier *DF = P->getAnalysisIfAvailable<DominanceFrontier>())
+ DF->splitBlock(Old);
+
+ return New;
+}
+
+
+/// SplitBlockPredecessors - This method transforms BB by introducing a new
+/// basic block into the function, and moving some of the predecessors of BB to
+/// be predecessors of the new block. The new predecessors are indicated by the
+/// Preds array, which has NumPreds elements in it. The new block is given a
+/// suffix of 'Suffix'.
+///
+/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree and
+/// DominanceFrontier, but no other analyses.
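+///
+/// Typical invocation (editor's sketch; names are hypothetical):
+///   BasicBlock *Preds[] = { LoopPred1, LoopPred2 };
+///   BasicBlock *PreHdr = SplitBlockPredecessors(Header, Preds, 2, ".ph", P);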
+BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
+ BasicBlock *const *Preds,
+ unsigned NumPreds, const char *Suffix,
+ Pass *P) {
+ // Create new basic block, insert right before the original block.
+ BasicBlock *NewBB =
+ BasicBlock::Create(BB->getName()+Suffix, BB->getParent(), BB);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI = BranchInst::Create(BB, NewBB);
+
+ // Move the edges from Preds to point to NewBB instead of BB.
+ for (unsigned i = 0; i != NumPreds; ++i)
+ Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
+
+ // Update dominator tree and dominator frontier if available.
+ DominatorTree *DT = P ? P->getAnalysisIfAvailable<DominatorTree>() : 0;
+ if (DT)
+ DT->splitBlock(NewBB);
+ if (DominanceFrontier *DF = P ? P->getAnalysisIfAvailable<DominanceFrontier>():0)
+ DF->splitBlock(NewBB);
+ AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : 0;
+
+
+ // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
+ // node becomes an incoming value for BB's phi node. However, if the Preds
+ // list is empty, we need to insert dummy entries into the PHI nodes in BB to
+ // account for the newly created predecessor.
+ if (NumPreds == 0) {
+ // Insert dummy values as the incoming value.
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
+ cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
+ return NewBB;
+ }
+
+ // Otherwise, create a new PHI node in NewBB for each PHI node in BB.
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ) {
+ PHINode *PN = cast<PHINode>(I++);
+
+ // Check to see if all of the values coming in are the same. If so, we
+ // don't need to create a new PHI node.
+ Value *InVal = PN->getIncomingValueForBlock(Preds[0]);
+ for (unsigned i = 1; i != NumPreds; ++i)
+ if (InVal != PN->getIncomingValueForBlock(Preds[i])) {
+ InVal = 0;
+ break;
+ }
+
+ if (InVal) {
+ // If all incoming values for the new PHI would be the same, just don't
+ // make a new PHI. Instead, just remove the incoming values from the old
+ // PHI.
+ for (unsigned i = 0; i != NumPreds; ++i)
+ PN->removeIncomingValue(Preds[i], false);
+ } else {
+ // If the values coming into the block are not the same, we need a PHI.
+ // Create the new PHI node, insert it into NewBB at the end of the block
+ PHINode *NewPHI =
+ PHINode::Create(PN->getType(), PN->getName()+".ph", BI);
+ if (AA) AA->copyValue(PN, NewPHI);
+
+ // Move all of the PHI values for 'Preds' to the new PHI.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+ Value *V = PN->removeIncomingValue(Preds[i], false);
+ NewPHI->addIncoming(V, Preds[i]);
+ }
+ InVal = NewPHI;
+ }
+
+ // Add an incoming value to the PHI node in the loop for the preheader
+ // edge.
+ PN->addIncoming(InVal, NewBB);
+
+ // Check to see if we can eliminate this phi node.
+ if (Value *V = PN->hasConstantValue(DT != 0)) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || DT == 0 || DT->dominates(I, PN)) {
+ PN->replaceAllUsesWith(V);
+ if (AA) AA->deleteValue(PN);
+ PN->eraseFromParent();
+ }
+ }
+ }
+
+ return NewBB;
+}
+
+/// FindFunctionBackedges - Analyze the specified function to find all of the
+/// loop backedges in the function and return them. This is a relatively cheap
+/// (compared to computing dominators and loop info) analysis.
+///
+/// The output is added to Result, as pairs of <from,to> edge info.
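+///
+/// Usage sketch:
+///   SmallVector<std::pair<const BasicBlock*, const BasicBlock*>, 32> Edges;
+///   FindFunctionBackedges(F, Edges);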
+void llvm::FindFunctionBackedges(const Function &F,
+ SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) {
+ const BasicBlock *BB = &F.getEntryBlock();
+ if (succ_begin(BB) == succ_end(BB))
+ return;
+
+ SmallPtrSet<const BasicBlock*, 8> Visited;
+ SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack;
+ SmallPtrSet<const BasicBlock*, 8> InStack;
+
+ Visited.insert(BB);
+ VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
+ InStack.insert(BB);
+ do {
+ std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back();
+ const BasicBlock *ParentBB = Top.first;
+ succ_const_iterator &I = Top.second;
+
+ bool FoundNew = false;
+ while (I != succ_end(ParentBB)) {
+ BB = *I++;
+ if (Visited.insert(BB)) {
+ FoundNew = true;
+ break;
+ }
+ // Successor is in VisitStack, it's a back edge.
+ if (InStack.count(BB))
+ Result.push_back(std::make_pair(ParentBB, BB));
+ }
+
+ if (FoundNew) {
+      // Go down one level if there is an unvisited successor.
+ InStack.insert(BB);
+ VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
+ } else {
+ // Go up one level.
+ InStack.erase(VisitStack.pop_back_val().first);
+ }
+ } while (!VisitStack.empty());
+}
+
+
+
+/// AreEquivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+/// %t0 = getelementptr \@a, 0, 3
+/// store i32 0, i32* %t0
+/// %t1 = getelementptr \@a, 0, 3
+/// %t2 = load i32* %t1
+///
+static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
+ // Test if the values are trivially equivalent.
+ if (A == B) return true;
+
+  // Test if the values come from identical arithmetic instructions.
+ if (isa<BinaryOperator>(A) || isa<CastInst>(A) ||
+ isa<PHINode>(A) || isa<GetElementPtrInst>(A))
+ if (const Instruction *BI = dyn_cast<Instruction>(B))
+ if (cast<Instruction>(A)->isIdenticalTo(BI))
+ return true;
+
+ // Otherwise they may not be equivalent.
+ return false;
+}
+
+/// FindAvailableLoadedValue - Scan the ScanBB block backwards (starting at the
+/// instruction before ScanFrom) checking to see if we have the value at the
+/// memory address *Ptr locally available within a small number of instructions.
+/// If the value is available, return it.
+///
+/// If not, return the iterator for the last validated instruction that the
+/// value would be live through. If we scanned the entire block and didn't
+/// find something that invalidates *Ptr or provides it, ScanFrom is left at
+/// begin() and this returns null. ScanFrom could also be left pointing just
+/// past an instruction that may clobber *Ptr, in which case null is returned
+/// as well.
+///
+/// MaxInstsToScan specifies the maximum instructions to scan in the block. If
+/// it is set to 0, it will scan the whole block. You can also optionally
+/// specify an alias analysis implementation, which makes this more precise.
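+///
+/// For example (sketch), scanning backwards from the load in
+///   store i32 7, i32* %p
+///   %v = load i32* %p
+/// returns the stored value 'i32 7' without touching memory.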
+Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
+ BasicBlock::iterator &ScanFrom,
+ unsigned MaxInstsToScan,
+ AliasAnalysis *AA) {
+ if (MaxInstsToScan == 0) MaxInstsToScan = ~0U;
+
+ // If we're using alias analysis to disambiguate get the size of *Ptr.
+ unsigned AccessSize = 0;
+ if (AA) {
+ const Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
+ AccessSize = AA->getTargetData().getTypeStoreSizeInBits(AccessTy);
+ }
+
+ while (ScanFrom != ScanBB->begin()) {
+ // We must ignore debug info directives when counting (otherwise they
+ // would affect codegen).
+ Instruction *Inst = --ScanFrom;
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+ // We skip pointer-to-pointer bitcasts, which are NOPs.
+ // It is necessary for correctness to skip those that feed into a
+ // llvm.dbg.declare, as these are not present when debugging is off.
+ if (isa<BitCastInst>(Inst) && isa<PointerType>(Inst->getType()))
+ continue;
+
+ // Restore ScanFrom to expected value in case next test succeeds
+ ScanFrom++;
+
+ // Don't scan huge blocks.
+ if (MaxInstsToScan-- == 0) return 0;
+
+ --ScanFrom;
+ // If this is a load of Ptr, the loaded value is available.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ if (AreEquivalentAddressValues(LI->getOperand(0), Ptr))
+ return LI;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ // If this is a store through Ptr, the value is available!
+ if (AreEquivalentAddressValues(SI->getOperand(1), Ptr))
+ return SI->getOperand(0);
+
+ // If Ptr is an alloca and this is a store to a different alloca, ignore
+ // the store. This is a trivial form of alias analysis that is important
+ // for reg2mem'd code.
+ if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) &&
+ (isa<AllocaInst>(SI->getOperand(1)) ||
+ isa<GlobalVariable>(SI->getOperand(1))))
+ continue;
+
+ // If we have alias analysis and it says the store won't modify the loaded
+ // value, ignore the store.
+ if (AA &&
+ (AA->getModRefInfo(SI, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+ continue;
+
+      // Otherwise the store may or may not alias the pointer; bail out.
+ ++ScanFrom;
+ return 0;
+ }
+
+ // If this is some other instruction that may clobber Ptr, bail out.
+ if (Inst->mayWriteToMemory()) {
+ // If alias analysis claims that it really won't modify the load,
+ // ignore it.
+ if (AA &&
+ (AA->getModRefInfo(Inst, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+ continue;
+
+ // May modify the pointer, bail out.
+ ++ScanFrom;
+ return 0;
+ }
+ }
+
+  // We got to the start of the block without finding the value; we're done
+  // with this block.
+ return 0;
+}
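+
+// A minimal usage sketch (hypothetical): forwarding an available value to a
+// redundant load. `LI` is an assumed LoadInst the caller is visiting.
+//
+//   BasicBlock::iterator ScanFrom = LI;
+//   if (Value *Avail = FindAvailableLoadedValue(LI->getOperand(0),
+//                                               LI->getParent(), ScanFrom,
+//                                               6 /*MaxInstsToScan*/,
+//                                               0 /*no AliasAnalysis*/)) {
+//     LI->replaceAllUsesWith(Avail);
+//     LI->eraseFromParent();
+//   }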
+
+/// CopyPrecedingStopPoint - If I is immediately preceded by a StopPoint,
+/// make a copy of the stoppoint before InsertPos (presumably before copying
+/// or moving I).
+void llvm::CopyPrecedingStopPoint(Instruction *I,
+ BasicBlock::iterator InsertPos) {
+ if (I != I->getParent()->begin()) {
+ BasicBlock::iterator BBI = I; --BBI;
+ if (DbgStopPointInst *DSPI = dyn_cast<DbgStopPointInst>(BBI)) {
+ CallInst *newDSPI = DSPI->clone();
+ newDSPI->insertBefore(InsertPos);
+ }
+ }
+}
diff --git a/lib/Transforms/Utils/BasicInliner.cpp b/lib/Transforms/Utils/BasicInliner.cpp
new file mode 100644
index 0000000..1650cfa
--- /dev/null
+++ b/lib/Transforms/Utils/BasicInliner.cpp
@@ -0,0 +1,181 @@
+//===- BasicInliner.cpp - Basic function level inliner --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a simple function based inliner that does not use
+// call graph information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "basicinliner"
+
+#include "llvm/Module.h"
+#include "llvm/Function.h"
+#include "llvm/Transforms/Utils/BasicInliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+BasicInlineThreshold("basic-inline-threshold", cl::Hidden, cl::init(200),
+ cl::desc("Control the amount of basic inlining to perform (default = 200)"));
+
+namespace llvm {
+
+  /// BasicInlinerImpl - BasicInliner implementation class. This hides the
+  /// containers used by the basic inliner from the public interface.
+ struct VISIBILITY_HIDDEN BasicInlinerImpl {
+
+ BasicInlinerImpl(const BasicInlinerImpl&); // DO NOT IMPLEMENT
+    void operator=(const BasicInlinerImpl&); // DO NOT IMPLEMENT
+ public:
+ BasicInlinerImpl(TargetData *T) : TD(T) {}
+
+ /// addFunction - Add function into the list of functions to process.
+ /// All functions must be inserted using this interface before invoking
+ /// inlineFunctions().
+ void addFunction(Function *F) {
+ Functions.push_back(F);
+ }
+
+    /// neverInlineFunction - Sometimes a function is never to be inlined,
+    /// for one reason or another.
+ void neverInlineFunction(Function *F) {
+ NeverInline.insert(F);
+ }
+
+    /// inlineFunctions - Walk all call sites in all functions supplied by
+ /// client. Inline as many call sites as possible. Delete completely
+ /// inlined functions.
+ void inlineFunctions();
+
+ private:
+ TargetData *TD;
+ std::vector<Function *> Functions;
+ SmallPtrSet<const Function *, 16> NeverInline;
+ SmallPtrSet<Function *, 8> DeadFunctions;
+ InlineCostAnalyzer CA;
+ };
+
+/// inlineFunctions - Walk all call sites in all functions supplied by
+/// client. Inline as many call sites as possible. Delete completely
+/// inlined functions.
+void BasicInlinerImpl::inlineFunctions() {
+
+ // Scan through and identify all call sites ahead of time so that we only
+ // inline call sites in the original functions, not call sites that result
+ // from inlining other functions.
+ std::vector<CallSite> CallSites;
+
+ for (std::vector<Function *>::iterator FI = Functions.begin(),
+ FE = Functions.end(); FI != FE; ++FI) {
+ Function *F = *FI;
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+ CallSite CS = CallSite::get(I);
+ if (CS.getInstruction() && CS.getCalledFunction()
+ && !CS.getCalledFunction()->isDeclaration())
+ CallSites.push_back(CS);
+ }
+ }
+
+  DOUT << "Found: " << CallSites.size() << " call sites.\n";
+
+ // Inline call sites.
+ bool Changed = false;
+ do {
+ Changed = false;
+ for (unsigned index = 0; index != CallSites.size() && !CallSites.empty();
+ ++index) {
+ CallSite CS = CallSites[index];
+ if (Function *Callee = CS.getCalledFunction()) {
+
+ // Eliminate calls that are never inlinable.
+ if (Callee->isDeclaration() ||
+ CS.getInstruction()->getParent()->getParent() == Callee) {
+ CallSites.erase(CallSites.begin() + index);
+ --index;
+ continue;
+ }
+ InlineCost IC = CA.getInlineCost(CS, NeverInline);
+ if (IC.isAlways()) {
+          DOUT << " Inlining: cost=always"
+               << ", call: " << *CS.getInstruction();
+        } else if (IC.isNever()) {
+          DOUT << " NOT Inlining: cost=never"
+               << ", call: " << *CS.getInstruction();
+ continue;
+ } else {
+ int Cost = IC.getValue();
+
+ if (Cost >= (int) BasicInlineThreshold) {
+ DOUT << " NOT Inlining: cost = " << Cost
+ << ", call: " << *CS.getInstruction();
+ continue;
+ } else {
+ DOUT << " Inlining: cost = " << Cost
+ << ", call: " << *CS.getInstruction();
+ }
+ }
+
+ // Inline
+ if (InlineFunction(CS, NULL, TD)) {
+ if (Callee->use_empty() && (Callee->hasLocalLinkage() ||
+ Callee->hasAvailableExternallyLinkage()))
+ DeadFunctions.insert(Callee);
+ Changed = true;
+ CallSites.erase(CallSites.begin() + index);
+ --index;
+ }
+ }
+ }
+ } while (Changed);
+
+ // Remove completely inlined functions from module.
+ for(SmallPtrSet<Function *, 8>::iterator I = DeadFunctions.begin(),
+ E = DeadFunctions.end(); I != E; ++I) {
+ Function *D = *I;
+ Module *M = D->getParent();
+ M->getFunctionList().remove(D);
+ }
+}
+
+BasicInliner::BasicInliner(TargetData *TD) {
+ Impl = new BasicInlinerImpl(TD);
+}
+
+BasicInliner::~BasicInliner() {
+ delete Impl;
+}
+
+/// addFunction - Add function into the list of functions to process.
+/// All functions must be inserted using this interface before invoking
+/// inlineFunctions().
+void BasicInliner::addFunction(Function *F) {
+ Impl->addFunction(F);
+}
+
+/// neverInlineFunction - Sometimes a function is never to be inlined, for one
+/// reason or another.
+void BasicInliner::neverInlineFunction(Function *F) {
+ Impl->neverInlineFunction(F);
+}
+
+/// inlineFunctions - Walk all call sites in all functions supplied by
+/// client. Inline as many call sites as possible. Delete completely
+/// inlined functions.
+void BasicInliner::inlineFunctions() {
+ Impl->inlineFunctions();
+}
+
+}
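+
+// A minimal usage sketch (hypothetical; `M` is an assumed Module* and `TD` an
+// assumed TargetData*): hand every defined function to the inliner, then let
+// it inline call sites and drop fully inlined callees.
+//
+//   BasicInliner BI(TD);
+//   for (Module::iterator F = M->begin(), E = M->end(); F != E; ++F)
+//     if (!F->isDeclaration())
+//       BI.addFunction(F);
+//   BI.inlineFunctions();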
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
new file mode 100644
index 0000000..c4fd1ea
--- /dev/null
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -0,0 +1,282 @@
+//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
+// inserting a dummy basic block. This pass may be "required" by passes that
+// cannot deal with critical edges. For this usage, the structure type is
+// forward declared. This pass obviously invalidates the CFG, but can update
+// forward dominator (set, immediate dominators, tree, and frontier)
+// information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "break-crit-edges"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumBroken, "Number of blocks inserted");
+
+namespace {
+ struct VISIBILITY_HIDDEN BreakCriticalEdges : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BreakCriticalEdges() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<DominanceFrontier>();
+ AU.addPreserved<LoopInfo>();
+
+ // No loop canonicalization guarantees are broken by this pass.
+ AU.addPreservedID(LoopSimplifyID);
+ }
+ };
+}
+
+char BreakCriticalEdges::ID = 0;
+static RegisterPass<BreakCriticalEdges>
+X("break-crit-edges", "Break critical edges in CFG");
+
+// Publicly exposed interface to this pass.
+const PassInfo *const llvm::BreakCriticalEdgesID = &X;
+FunctionPass *llvm::createBreakCriticalEdgesPass() {
+ return new BreakCriticalEdges();
+}
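+
+// A sketch of how a client pass typically "requires" this pass, per the file
+// header above (the enclosing pass class here is hypothetical):
+//
+//   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+//     AU.addRequiredID(BreakCriticalEdgesID);  // break critical edges first
+//   }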
+
+// runOnFunction - Loop over all of the edges in the CFG, breaking critical
+// edges as they are found.
+//
+bool BreakCriticalEdges::runOnFunction(Function &F) {
+ bool Changed = false;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ TerminatorInst *TI = I->getTerminator();
+ if (TI->getNumSuccessors() > 1)
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (SplitCriticalEdge(TI, i, this)) {
+ ++NumBroken;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Implementation of the external critical edge manipulation functions
+//===----------------------------------------------------------------------===//
+
+// isCriticalEdge - Return true if the specified edge is a critical edge.
+// Critical edges are edges from a block with multiple successors to a block
+// with multiple predecessors.
+//
+bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
+ bool AllowIdenticalEdges) {
+ assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!");
+ if (TI->getNumSuccessors() == 1) return false;
+
+ const BasicBlock *Dest = TI->getSuccessor(SuccNum);
+ pred_const_iterator I = pred_begin(Dest), E = pred_end(Dest);
+
+ // If there is more than one predecessor, this is a critical edge...
+ assert(I != E && "No preds, but we have an edge to the block?");
+ const BasicBlock *FirstPred = *I;
+ ++I; // Skip one edge due to the incoming arc from TI.
+ if (!AllowIdenticalEdges)
+ return I != E;
+
+ // If AllowIdenticalEdges is true, then we allow this edge to be considered
+ // non-critical iff all preds come from TI's block.
+ while (I != E) {
+ if (*I != FirstPred)
+ return true;
+    // Note: leave this as-is until no one still compiles with either gcc 4.0.1
+    // or Xcode 2. It seems to work around the pred_iterator assert in PR 2207.
+ E = pred_end(*I);
+ ++I;
+ }
+ return false;
+}
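+
+// For illustration (hypothetical IR): the edge %a -> %merge below is critical,
+// because %a has two successors and %merge has two predecessors:
+//
+//   a:      br i1 %c, label %merge, label %b
+//   b:      br label %merge
+//   merge:  ; preds = %a, %b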
+
+/// SplitCriticalEdge - If this edge is a critical edge, insert a new node to
+/// split the critical edge. This will update DominatorTree and
+/// DominanceFrontier information if they are available, so calling this
+/// function will not invalidate either of them. This returns true if the edge
+/// was split, false otherwise. If MergeIdenticalEdges is set, all other edges
+/// from TIBB to DestBB are redirected through the new block as well, so they
+/// all reach DestBB via a single block.
+//
+bool llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, Pass *P,
+ bool MergeIdenticalEdges) {
+ if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return false;
+ BasicBlock *TIBB = TI->getParent();
+ BasicBlock *DestBB = TI->getSuccessor(SuccNum);
+
+ // Create a new basic block, linking it into the CFG.
+ BasicBlock *NewBB = BasicBlock::Create(TIBB->getName() + "." +
+ DestBB->getName() + "_crit_edge");
+ // Create our unconditional branch...
+ BranchInst::Create(DestBB, NewBB);
+
+ // Branch to the new block, breaking the edge.
+ TI->setSuccessor(SuccNum, NewBB);
+
+ // Insert the block into the function... right after the block TI lives in.
+ Function &F = *TIBB->getParent();
+ Function::iterator FBBI = TIBB;
+ F.getBasicBlockList().insert(++FBBI, NewBB);
+
+ // If there are any PHI nodes in DestBB, we need to update them so that they
+ // merge incoming values from NewBB instead of from TIBB.
+ //
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ // We no longer enter through TIBB, now we come in through NewBB. Revector
+ // exactly one entry in the PHI node that used to come from TIBB to come
+ // from NewBB.
+ int BBIdx = PN->getBasicBlockIndex(TIBB);
+ PN->setIncomingBlock(BBIdx, NewBB);
+ }
+
+ // If there are any other edges from TIBB to DestBB, update those to go
+ // through the split block, making those edges non-critical as well (and
+ // reducing the number of phi entries in the DestBB if relevant).
+ if (MergeIdenticalEdges) {
+ for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
+ if (TI->getSuccessor(i) != DestBB) continue;
+
+ // Remove an entry for TIBB from DestBB phi nodes.
+ DestBB->removePredecessor(TIBB);
+
+ // We found another edge to DestBB, go to NewBB instead.
+ TI->setSuccessor(i, NewBB);
+ }
+ }
+
+ // If we don't have a pass object, we can't update anything...
+ if (P == 0) return true;
+
+ // Now update analysis information. Since the only predecessor of NewBB is
+ // the TIBB, TIBB clearly dominates NewBB. TIBB usually doesn't dominate
+ // anything, as there are other successors of DestBB. However, if all other
+ // predecessors of DestBB are already dominated by DestBB (e.g. DestBB is a
+ // loop header) then NewBB dominates DestBB.
+ SmallVector<BasicBlock*, 8> OtherPreds;
+
+ for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); I != E; ++I)
+ if (*I != NewBB)
+ OtherPreds.push_back(*I);
+
+ bool NewBBDominatesDestBB = true;
+
+ // Should we update DominatorTree information?
+ if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) {
+ DomTreeNode *TINode = DT->getNode(TIBB);
+
+ // The new block is not the immediate dominator for any other nodes, but
+ // TINode is the immediate dominator for the new node.
+ //
+ if (TINode) { // Don't break unreachable code!
+ DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB);
+ DomTreeNode *DestBBNode = 0;
+
+ // If NewBBDominatesDestBB hasn't been computed yet, do so with DT.
+ if (!OtherPreds.empty()) {
+ DestBBNode = DT->getNode(DestBB);
+ while (!OtherPreds.empty() && NewBBDominatesDestBB) {
+ if (DomTreeNode *OPNode = DT->getNode(OtherPreds.back()))
+ NewBBDominatesDestBB = DT->dominates(DestBBNode, OPNode);
+ OtherPreds.pop_back();
+ }
+ OtherPreds.clear();
+ }
+
+ // If NewBBDominatesDestBB, then NewBB dominates DestBB, otherwise it
+ // doesn't dominate anything.
+ if (NewBBDominatesDestBB) {
+ if (!DestBBNode) DestBBNode = DT->getNode(DestBB);
+ DT->changeImmediateDominator(DestBBNode, NewBBNode);
+ }
+ }
+ }
+
+ // Should we update DominanceFrontier information?
+ if (DominanceFrontier *DF = P->getAnalysisIfAvailable<DominanceFrontier>()) {
+ // If NewBBDominatesDestBB hasn't been computed yet, do so with DF.
+ if (!OtherPreds.empty()) {
+ // FIXME: IMPLEMENT THIS!
+ assert(0 && "Requiring domfrontiers but not idom/domtree/domset."
+ " not implemented yet!");
+ }
+
+ // Since the new block is dominated by its only predecessor TIBB,
+ // it cannot be in any block's dominance frontier. If NewBB dominates
+ // DestBB, its dominance frontier is the same as DestBB's, otherwise it is
+ // just {DestBB}.
+ if (NewBBDominatesDestBB) {
+ DominanceFrontier::iterator I = DF->find(DestBB);
+ if (I != DF->end()) {
+ DF->addBasicBlock(NewBB, I->second);
+
+ if (I->second.count(DestBB)) {
+ // However NewBB's frontier does not include DestBB.
+ DominanceFrontier::iterator NF = DF->find(NewBB);
+ DF->removeFromFrontier(NF, DestBB);
+ }
+ }
+ else
+ DF->addBasicBlock(NewBB, DominanceFrontier::DomSetType());
+ } else {
+ DominanceFrontier::DomSetType NewDFSet;
+ NewDFSet.insert(DestBB);
+ DF->addBasicBlock(NewBB, NewDFSet);
+ }
+ }
+
+ // Update LoopInfo if it is around.
+ if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) {
+    // If one block or the other is not in a loop, the new block is not in a
+    // loop either, and thus LI doesn't need to be updated.
+ if (Loop *TIL = LI->getLoopFor(TIBB))
+ if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
+ if (TIL == DestLoop) {
+          // Both blocks are in the same loop, so NewBB joins that loop.
+ DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+ } else if (TIL->contains(DestLoop->getHeader())) {
+ // Edge from an outer loop to an inner loop. Add to the outer loop.
+ TIL->addBasicBlockToLoop(NewBB, LI->getBase());
+ } else if (DestLoop->contains(TIL->getHeader())) {
+ // Edge from an inner loop to an outer loop. Add to the outer loop.
+ DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+ } else {
+ // Edge from two loops with no containment relation. Because these
+ // are natural loops, we know that the destination block must be the
+ // header of its loop (adding a branch into a loop elsewhere would
+ // create an irreducible loop).
+ assert(DestLoop->getHeader() == DestBB &&
+ "Should not create irreducible loops!");
+ if (Loop *P = DestLoop->getParentLoop())
+ P->addBasicBlockToLoop(NewBB, LI->getBase());
+ }
+ }
+ }
+ return true;
+}
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
new file mode 100644
index 0000000..6628b4b
--- /dev/null
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -0,0 +1,27 @@
+add_llvm_library(LLVMTransformUtils
+ AddrModeMatcher.cpp
+ BasicBlockUtils.cpp
+ BasicInliner.cpp
+ BreakCriticalEdges.cpp
+ CloneFunction.cpp
+ CloneLoop.cpp
+ CloneModule.cpp
+ CloneTrace.cpp
+ CodeExtractor.cpp
+ DemoteRegToStack.cpp
+ InlineCost.cpp
+ InlineFunction.cpp
+ LCSSA.cpp
+ Local.cpp
+ LoopSimplify.cpp
+ LowerAllocations.cpp
+ LowerInvoke.cpp
+ LowerSwitch.cpp
+ Mem2Reg.cpp
+ PromoteMemoryToRegister.cpp
+ SimplifyCFG.cpp
+ UnifyFunctionExitNodes.cpp
+ UnrollLoop.cpp
+ ValueMapper.cpp
+ InstructionNamer.cpp
+ )
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
new file mode 100644
index 0000000..d0fdefa
--- /dev/null
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -0,0 +1,533 @@
+//===- CloneFunction.cpp - Clone a function into another function ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneFunctionInto interface, which is used as the
+// low-level function cloner. This is used by the CloneFunction and function
+// inliner to do the dirty work of copying the body of a function around.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include <map>
+using namespace llvm;
+
+// CloneBasicBlock - See comments in Cloning.h
+BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
+ DenseMap<const Value*, Value*> &ValueMap,
+ const char *NameSuffix, Function *F,
+ ClonedCodeInfo *CodeInfo) {
+ BasicBlock *NewBB = BasicBlock::Create("", F);
+ if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+ bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+
+ // Loop over all instructions, and copy them over.
+ for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
+ II != IE; ++II) {
+ Instruction *NewInst = II->clone();
+ if (II->hasName())
+ NewInst->setName(II->getName()+NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ ValueMap[II] = NewInst; // Add instruction map to value.
+
+ hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (isa<ConstantInt>(AI->getArraySize()))
+ hasStaticAllocas = true;
+ else
+ hasDynamicAllocas = true;
+ }
+ }
+
+ if (CodeInfo) {
+ CodeInfo->ContainsCalls |= hasCalls;
+ CodeInfo->ContainsUnwinds |= isa<UnwindInst>(BB->getTerminator());
+ CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
+ BB != &BB->getParent()->getEntryBlock();
+ }
+ return NewBB;
+}
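+
+// A usage sketch (hypothetical): clone one block and remap its operands. This
+// assumes every non-constant operand of the block's instructions is defined
+// in the block itself or already present in ValueMap, since RemapInstruction
+// asserts on unmapped values.
+//
+//   DenseMap<const Value*, Value*> ValueMap;
+//   BasicBlock *Copy = CloneBasicBlock(BB, ValueMap, ".copy", BB->getParent());
+//   for (BasicBlock::iterator I = Copy->begin(), E = Copy->end(); I != E; ++I)
+//     RemapInstruction(I, ValueMap);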
+
+// Clone OldFunc into NewFunc, transforming the old arguments into references
+// to ValueMap values.
+//
+void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
+ DenseMap<const Value*, Value*> &ValueMap,
+ std::vector<ReturnInst*> &Returns,
+ const char *NameSuffix, ClonedCodeInfo *CodeInfo) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+#ifndef NDEBUG
+ for (Function::const_arg_iterator I = OldFunc->arg_begin(),
+ E = OldFunc->arg_end(); I != E; ++I)
+ assert(ValueMap.count(I) && "No mapping from source argument specified!");
+#endif
+
+ // Clone any attributes.
+ if (NewFunc->arg_size() == OldFunc->arg_size())
+ NewFunc->copyAttributesFrom(OldFunc);
+ else {
+    // Some arguments were deleted via the ValueMap, so copy the surviving
+    // arguments' attributes over one by one.
+ for (Function::const_arg_iterator I = OldFunc->arg_begin(),
+ E = OldFunc->arg_end(); I != E; ++I)
+ if (Argument* Anew = dyn_cast<Argument>(ValueMap[I]))
+ Anew->addAttr( OldFunc->getAttributes()
+ .getParamAttributes(I->getArgNo() + 1));
+ NewFunc->setAttributes(NewFunc->getAttributes()
+ .addAttr(0, OldFunc->getAttributes()
+ .getRetAttributes()));
+ NewFunc->setAttributes(NewFunc->getAttributes()
+ .addAttr(~0, OldFunc->getAttributes()
+ .getFnAttributes()));
+
+ }
+
+ // Loop over all of the basic blocks in the function, cloning them as
+ // appropriate. Note that we save BE this way in order to handle cloning of
+ // recursive functions into themselves.
+ //
+ for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+ BI != BE; ++BI) {
+ const BasicBlock &BB = *BI;
+
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB = CloneBasicBlock(&BB, ValueMap, NameSuffix, NewFunc,
+ CodeInfo);
+ ValueMap[&BB] = CBB; // Add basic block mapping.
+
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
+ Returns.push_back(RI);
+ }
+
+ // Loop over all of the instructions in the function, fixing up operand
+ // references as we go. This uses ValueMap to do all the hard work.
+ //
+ for (Function::iterator BB = cast<BasicBlock>(ValueMap[OldFunc->begin()]),
+ BE = NewFunc->end(); BB != BE; ++BB)
+ // Loop over all instructions, fixing each one as we find it...
+ for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II)
+ RemapInstruction(II, ValueMap);
+}
+
+/// CloneFunction - Return a copy of the specified function, but without
+/// embedding the function into another module. Also, any references specified
+/// in the ValueMap are changed to refer to their mapped value instead of the
+/// original one. If any of the arguments to the function are in the ValueMap,
+/// the arguments are deleted from the resultant function. The ValueMap is
+/// updated to include mappings from all of the instructions and basic blocks
+/// the function from their old to new values.
+///
+Function *llvm::CloneFunction(const Function *F,
+ DenseMap<const Value*, Value*> &ValueMap,
+ ClonedCodeInfo *CodeInfo) {
+ std::vector<const Type*> ArgTypes;
+
+  // The user might be deleting arguments to the function by specifying them in
+  // the ValueMap. If so, we must not add those arguments to the argument type
+  // vector.
+  //
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I)
+ if (ValueMap.count(I) == 0) // Haven't mapped the argument to anything yet?
+ ArgTypes.push_back(I->getType());
+
+ // Create a new function type...
+ FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
+ ArgTypes, F->getFunctionType()->isVarArg());
+
+ // Create the new function...
+ Function *NewF = Function::Create(FTy, F->getLinkage(), F->getName());
+
+ // Loop over the arguments, copying the names of the mapped arguments over...
+ Function::arg_iterator DestI = NewF->arg_begin();
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I)
+ if (ValueMap.count(I) == 0) { // Is this argument preserved?
+ DestI->setName(I->getName()); // Copy the name over...
+ ValueMap[I] = DestI++; // Add mapping to ValueMap
+ }
+
+ std::vector<ReturnInst*> Returns; // Ignore returns cloned...
+ CloneFunctionInto(NewF, F, ValueMap, Returns, "", CodeInfo);
+ return NewF;
+}
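+
+// A usage sketch (hypothetical; `F` is an assumed Function*): clone F and
+// insert the copy into the same module under a new name.
+//
+//   DenseMap<const Value*, Value*> ValueMap;
+//   Function *NewF = CloneFunction(F, ValueMap);
+//   NewF->setName(F->getName() + ".clone");
+//   F->getParent()->getFunctionList().push_back(NewF);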
+
+namespace {
+ /// PruningFunctionCloner - This class is a private class used to implement
+ /// the CloneAndPruneFunctionInto method.
+ struct VISIBILITY_HIDDEN PruningFunctionCloner {
+ Function *NewFunc;
+ const Function *OldFunc;
+ DenseMap<const Value*, Value*> &ValueMap;
+ std::vector<ReturnInst*> &Returns;
+ const char *NameSuffix;
+ ClonedCodeInfo *CodeInfo;
+ const TargetData *TD;
+ Value *DbgFnStart;
+ public:
+ PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
+ DenseMap<const Value*, Value*> &valueMap,
+ std::vector<ReturnInst*> &returns,
+ const char *nameSuffix,
+ ClonedCodeInfo *codeInfo,
+ const TargetData *td)
+ : NewFunc(newFunc), OldFunc(oldFunc), ValueMap(valueMap), Returns(returns),
+ NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td), DbgFnStart(NULL) {
+ }
+
+ /// CloneBlock - The specified block is found to be reachable, clone it and
+ /// anything that it can reach.
+ void CloneBlock(const BasicBlock *BB,
+ std::vector<const BasicBlock*> &ToClone);
+
+ public:
+ /// ConstantFoldMappedInstruction - Constant fold the specified instruction,
+ /// mapping its operands through ValueMap if they are available.
+ Constant *ConstantFoldMappedInstruction(const Instruction *I);
+ };
+}
+
+/// CloneBlock - The specified block is found to be reachable, clone it and
+/// anything that it can reach.
+void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
+ std::vector<const BasicBlock*> &ToClone){
+ Value *&BBEntry = ValueMap[BB];
+
+ // Have we already cloned this block?
+ if (BBEntry) return;
+
+ // Nope, clone it now.
+ BasicBlock *NewBB;
+ BBEntry = NewBB = BasicBlock::Create();
+ if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+ bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+
+ // Loop over all instructions, and copy them over, DCE'ing as we go. This
+ // loop doesn't include the terminator.
+ for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end();
+ II != IE; ++II) {
+    // If this instruction constant folds, don't bother cloning it; instead,
+    // just add the folded constant to the value map.
+ if (Constant *C = ConstantFoldMappedInstruction(II)) {
+ ValueMap[II] = C;
+ continue;
+ }
+
+ // Do not clone llvm.dbg.region.end. It will be adjusted by the inliner.
+ if (const DbgFuncStartInst *DFSI = dyn_cast<DbgFuncStartInst>(II)) {
+ if (DbgFnStart == NULL) {
+ DISubprogram SP(cast<GlobalVariable>(DFSI->getSubprogram()));
+ if (SP.describes(BB->getParent()))
+ DbgFnStart = DFSI->getSubprogram();
+ }
+ }
+ if (const DbgRegionEndInst *DREIS = dyn_cast<DbgRegionEndInst>(II)) {
+ if (DREIS->getContext() == DbgFnStart)
+ continue;
+ }
+
+ Instruction *NewInst = II->clone();
+ if (II->hasName())
+ NewInst->setName(II->getName()+NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ ValueMap[II] = NewInst; // Add instruction map to value.
+
+ hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (isa<ConstantInt>(AI->getArraySize()))
+ hasStaticAllocas = true;
+ else
+ hasDynamicAllocas = true;
+ }
+ }
+
+ // Finally, clone over the terminator.
+ const TerminatorInst *OldTI = BB->getTerminator();
+ bool TerminatorDone = false;
+ if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
+ if (BI->isConditional()) {
+ // If the condition was a known constant in the callee...
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ // Or is a known constant in the caller...
+ if (Cond == 0)
+ Cond = dyn_cast_or_null<ConstantInt>(ValueMap[BI->getCondition()]);
+
+ // Constant fold to uncond branch!
+ if (Cond) {
+ BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
+ ValueMap[OldTI] = BranchInst::Create(Dest, NewBB);
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
+ }
+ }
+ } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
+ // If switching on a value known constant in the caller.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (Cond == 0) // Or known constant after constant prop in the callee...
+ Cond = dyn_cast_or_null<ConstantInt>(ValueMap[SI->getCondition()]);
+ if (Cond) { // Constant fold to uncond branch!
+ BasicBlock *Dest = SI->getSuccessor(SI->findCaseValue(Cond));
+ ValueMap[OldTI] = BranchInst::Create(Dest, NewBB);
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
+ }
+ }
+
+ if (!TerminatorDone) {
+ Instruction *NewInst = OldTI->clone();
+ if (OldTI->hasName())
+ NewInst->setName(OldTI->getName()+NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ ValueMap[OldTI] = NewInst; // Add instruction map to value.
+
+ // Recursively clone any reachable successor blocks.
+ const TerminatorInst *TI = BB->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ ToClone.push_back(TI->getSuccessor(i));
+ }
+
+ if (CodeInfo) {
+ CodeInfo->ContainsCalls |= hasCalls;
+ CodeInfo->ContainsUnwinds |= isa<UnwindInst>(OldTI);
+ CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
+ BB != &BB->getParent()->front();
+ }
+
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(NewBB->getTerminator()))
+ Returns.push_back(RI);
+}
+
+/// ConstantFoldMappedInstruction - Constant fold the specified instruction,
+/// mapping its operands through ValueMap if they are available.
+Constant *PruningFunctionCloner::
+ConstantFoldMappedInstruction(const Instruction *I) {
+ SmallVector<Constant*, 8> Ops;
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i),
+ ValueMap)))
+ Ops.push_back(Op);
+ else
+      return 0;  // Not all operands are constant!
+
+ if (const CmpInst *CI = dyn_cast<CmpInst>(I))
+ return ConstantFoldCompareInstOperands(CI->getPredicate(),
+ &Ops[0], Ops.size(), TD);
+
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
+ if (!LI->isVolatile() && CE->getOpcode() == Instruction::GetElementPtr)
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(),
+ CE);
+
+ return ConstantFoldInstOperands(I->getOpcode(), I->getType(), &Ops[0],
+ Ops.size(), TD);
+}
+
+/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
+/// except that it does some simple constant prop and DCE on the fly. The
+/// effect of this is to copy significantly less code in cases where (for
+/// example) a function call with constant arguments is inlined, and those
+/// constant arguments cause a significant amount of code in the callee to be
+/// dead. Since this doesn't produce an exact copy of the input, it can't be
+/// used for things like CloneFunction or CloneModule.
+void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
+ DenseMap<const Value*, Value*> &ValueMap,
+ std::vector<ReturnInst*> &Returns,
+ const char *NameSuffix,
+ ClonedCodeInfo *CodeInfo,
+ const TargetData *TD) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+#ifndef NDEBUG
+ for (Function::const_arg_iterator II = OldFunc->arg_begin(),
+ E = OldFunc->arg_end(); II != E; ++II)
+ assert(ValueMap.count(II) && "No mapping from source argument specified!");
+#endif
+
+ PruningFunctionCloner PFC(NewFunc, OldFunc, ValueMap, Returns,
+ NameSuffix, CodeInfo, TD);
+
+ // Clone the entry block, and anything recursively reachable from it.
+ std::vector<const BasicBlock*> CloneWorklist;
+ CloneWorklist.push_back(&OldFunc->getEntryBlock());
+ while (!CloneWorklist.empty()) {
+ const BasicBlock *BB = CloneWorklist.back();
+ CloneWorklist.pop_back();
+ PFC.CloneBlock(BB, CloneWorklist);
+ }
+
+ // Loop over all of the basic blocks in the old function. If the block was
+ // reachable, we have cloned it and the old block is now in the value map:
+ // insert it into the new function in the right order. If not, ignore it.
+ //
+ // Defer PHI resolution until rest of function is resolved.
+ std::vector<const PHINode*> PHIToResolve;
+ for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+ BI != BE; ++BI) {
+ BasicBlock *NewBB = cast_or_null<BasicBlock>(ValueMap[BI]);
+ if (NewBB == 0) continue; // Dead block.
+
+ // Add the new block to the new function.
+ NewFunc->getBasicBlockList().push_back(NewBB);
+
+ // Loop over all of the instructions in the block, fixing up operand
+ // references as we go. This uses ValueMap to do all the hard work.
+ //
+ BasicBlock::iterator I = NewBB->begin();
+
+ // Handle PHI nodes specially, as we have to remove references to dead
+ // blocks.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ // Skip over all PHI nodes, remembering them for later.
+ BasicBlock::const_iterator OldI = BI->begin();
+ for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI)
+ PHIToResolve.push_back(cast<PHINode>(OldI));
+ }
+
+ // Otherwise, remap the rest of the instructions normally.
+ for (; I != NewBB->end(); ++I)
+ RemapInstruction(I, ValueMap);
+ }
+
+ // Defer PHI resolution until rest of function is resolved, PHI resolution
+ // requires the CFG to be up-to-date.
+ for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
+ const PHINode *OPN = PHIToResolve[phino];
+ unsigned NumPreds = OPN->getNumIncomingValues();
+ const BasicBlock *OldBB = OPN->getParent();
+ BasicBlock *NewBB = cast<BasicBlock>(ValueMap[OldBB]);
+
+ // Map operands for blocks that are live and remove operands for blocks
+ // that are dead.
+ for (; phino != PHIToResolve.size() &&
+ PHIToResolve[phino]->getParent() == OldBB; ++phino) {
+ OPN = PHIToResolve[phino];
+ PHINode *PN = cast<PHINode>(ValueMap[OPN]);
+ for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
+ if (BasicBlock *MappedBlock =
+ cast_or_null<BasicBlock>(ValueMap[PN->getIncomingBlock(pred)])) {
+ Value *InVal = MapValue(PN->getIncomingValue(pred), ValueMap);
+ assert(InVal && "Unknown input value?");
+ PN->setIncomingValue(pred, InVal);
+ PN->setIncomingBlock(pred, MappedBlock);
+ } else {
+ PN->removeIncomingValue(pred, false);
+          --pred, --e;  // Revisit this index with the shifted-down entry.
+ }
+ }
+ }
+
+ // The loop above has removed PHI entries for those blocks that are dead
+ // and has updated others. However, if a block is live (i.e. copied over)
+ // but its terminator has been changed to not go to this block, then our
+ // phi nodes will have invalid entries. Update the PHI nodes in this
+ // case.
+ PHINode *PN = cast<PHINode>(NewBB->begin());
+ NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB));
+ if (NumPreds != PN->getNumIncomingValues()) {
+ assert(NumPreds < PN->getNumIncomingValues());
+ // Count how many times each predecessor comes to this block.
+ std::map<BasicBlock*, unsigned> PredCount;
+ for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
+ PI != E; ++PI)
+ --PredCount[*PI];
+
+ // Figure out how many entries to remove from each PHI.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ ++PredCount[PN->getIncomingBlock(i)];
+
+ // At this point, the excess predecessor entries are positive in the
+ // map. Loop over all of the PHIs and remove excess predecessor
+ // entries.
+ BasicBlock::iterator I = NewBB->begin();
+ for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+ for (std::map<BasicBlock*, unsigned>::iterator PCI =PredCount.begin(),
+ E = PredCount.end(); PCI != E; ++PCI) {
+ BasicBlock *Pred = PCI->first;
+ for (unsigned NumToRemove = PCI->second; NumToRemove; --NumToRemove)
+ PN->removeIncomingValue(Pred, false);
+ }
+ }
+ }
+
+ // If the loops above have made these phi nodes have 0 or 1 operand,
+ // replace them with undef or the input value. We must do this for
+ // correctness, because 0-operand phis are not valid.
+ PN = cast<PHINode>(NewBB->begin());
+ if (PN->getNumIncomingValues() == 0) {
+ BasicBlock::iterator I = NewBB->begin();
+ BasicBlock::const_iterator OldI = OldBB->begin();
+ while ((PN = dyn_cast<PHINode>(I++))) {
+ Value *NV = UndefValue::get(PN->getType());
+ PN->replaceAllUsesWith(NV);
+ assert(ValueMap[OldI] == PN && "ValueMap mismatch");
+ ValueMap[OldI] = NV;
+ PN->eraseFromParent();
+ ++OldI;
+ }
+ }
+ // NOTE: We cannot eliminate single entry phi nodes here, because of
+ // ValueMap. Single entry phi nodes can have multiple ValueMap entries
+ // pointing at them. Thus, deleting one would require scanning the ValueMap
+ // to update any entries in it that would require that. This would be
+ // really slow.
+ }
+
+  // Now that the inlined function body has been fully constructed, go through
+  // and zap unconditional fall-through branches. This happens all the time
+  // when specializing code: code specialization turns conditional branches
+  // into unconditional branches, and this code folds them.
+ Function::iterator I = cast<BasicBlock>(ValueMap[&OldFunc->getEntryBlock()]);
+ while (I != NewFunc->end()) {
+ BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
+ if (!BI || BI->isConditional()) { ++I; continue; }
+
+ // Note that we can't eliminate uncond branches if the destination has
+ // single-entry PHI nodes. Eliminating the single-entry phi nodes would
+ // require scanning the ValueMap to update any entries that point to the phi
+ // node.
+ BasicBlock *Dest = BI->getSuccessor(0);
+ if (!Dest->getSinglePredecessor() || isa<PHINode>(Dest->begin())) {
+ ++I; continue;
+ }
+
+ // We know all single-entry PHI nodes in the inlined function have been
+ // removed, so we just need to splice the blocks.
+ BI->eraseFromParent();
+
+ // Move all the instructions in the succ to the pred.
+ I->getInstList().splice(I->end(), Dest->getInstList());
+
+ // Make all PHI nodes that referred to Dest now refer to I as their source.
+ Dest->replaceAllUsesWith(I);
+
+ // Remove the dest block.
+ Dest->eraseFromParent();
+
+ // Do not increment I, iteratively merge all things this block branches to.
+ }
+}
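+
+// A hedged sketch of the typical inliner-style call (hypothetical names:
+// `Callee`, `NewF`, `TD`; Callee is assumed to take a single i32 parameter,
+// pre-mapped to a constant so the cloner can fold branches on it):
+//
+//   DenseMap<const Value*, Value*> ValueMap;
+//   ValueMap[Callee->arg_begin()] = ConstantInt::get(Type::Int32Ty, 42);
+//   std::vector<ReturnInst*> Returns;
+//   CloneAndPruneFunctionInto(NewF, Callee, ValueMap, Returns, ".i", 0, TD);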
diff --git a/lib/Transforms/Utils/CloneLoop.cpp b/lib/Transforms/Utils/CloneLoop.cpp
new file mode 100644
index 0000000..7e000a1
--- /dev/null
+++ b/lib/Transforms/Utils/CloneLoop.cpp
@@ -0,0 +1,152 @@
+//===- CloneLoop.cpp - Clone loop nest ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneLoop interface which makes a copy of a loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/ADT/DenseMap.h"
+
+
+using namespace llvm;
+
+/// CloneDominatorInfo - Clone the basic block's dominator tree node and, if
+/// available, its dominance frontier info. The basic block is expected to
+/// have been cloned already.
+static void CloneDominatorInfo(BasicBlock *BB,
+ DenseMap<const Value *, Value *> &ValueMap,
+ DominatorTree *DT,
+ DominanceFrontier *DF) {
+
+ assert (DT && "DominatorTree is not available");
+ DenseMap<const Value *, Value*>::iterator BI = ValueMap.find(BB);
+ assert (BI != ValueMap.end() && "BasicBlock clone is missing");
+ BasicBlock *NewBB = cast<BasicBlock>(BI->second);
+
+ // NewBB already got dominator info.
+ if (DT->getNode(NewBB))
+ return;
+
+ assert (DT->getNode(BB) && "BasicBlock does not have dominator info");
+  // The entry block is not expected here. Infinite loops are not to be cloned.
+ assert (DT->getNode(BB)->getIDom() && "BasicBlock does not have immediate dominator");
+ BasicBlock *BBDom = DT->getNode(BB)->getIDom()->getBlock();
+
+ // NewBB's dominator is either BB's dominator or BB's dominator's clone.
+ BasicBlock *NewBBDom = BBDom;
+ DenseMap<const Value *, Value*>::iterator BBDomI = ValueMap.find(BBDom);
+ if (BBDomI != ValueMap.end()) {
+ NewBBDom = cast<BasicBlock>(BBDomI->second);
+ if (!DT->getNode(NewBBDom))
+ CloneDominatorInfo(BBDom, ValueMap, DT, DF);
+ }
+ DT->addNewBlock(NewBB, NewBBDom);
+
+  // Copy the cloned dominance frontier set.
+ if (DF) {
+ DominanceFrontier::DomSetType NewDFSet;
+ DominanceFrontier::iterator DFI = DF->find(BB);
+ if ( DFI != DF->end()) {
+ DominanceFrontier::DomSetType S = DFI->second;
+ for (DominanceFrontier::DomSetType::iterator I = S.begin(), E = S.end();
+ I != E; ++I) {
+ BasicBlock *DB = *I;
+ DenseMap<const Value*, Value*>::iterator IDM = ValueMap.find(DB);
+ if (IDM != ValueMap.end())
+ NewDFSet.insert(cast<BasicBlock>(IDM->second));
+ else
+ NewDFSet.insert(DB);
+ }
+ }
+ DF->addBasicBlock(NewBB, NewDFSet);
+ }
+}
+
+/// CloneLoop - Clone the loop nest rooted at OrigL, cloning dominator info and
+/// populating ValueMap with the old-block to new-block mapping.
+Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager *LPM, LoopInfo *LI,
+ DenseMap<const Value *, Value *> &ValueMap, Pass *P) {
+
+ DominatorTree *DT = NULL;
+ DominanceFrontier *DF = NULL;
+ if (P) {
+ DT = P->getAnalysisIfAvailable<DominatorTree>();
+ DF = P->getAnalysisIfAvailable<DominanceFrontier>();
+ }
+
+ SmallVector<BasicBlock *, 16> NewBlocks;
+
+ // Populate loop nest.
+ SmallVector<Loop *, 8> LoopNest;
+ LoopNest.push_back(OrigL);
+
+ Loop *NewParentLoop = NULL;
+ while (!LoopNest.empty()) {
+ Loop *L = LoopNest.pop_back_val();
+ Loop *NewLoop = new Loop();
+
+ if (!NewParentLoop)
+ NewParentLoop = NewLoop;
+
+ LPM->insertLoop(NewLoop, L->getParentLoop());
+
+ // Clone Basic Blocks.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ BasicBlock *NewBB = CloneBasicBlock(BB, ValueMap, ".clone");
+ ValueMap[BB] = NewBB;
+ if (P)
+ LPM->cloneBasicBlockSimpleAnalysis(BB, NewBB, L);
+ NewLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+ NewBlocks.push_back(NewBB);
+ }
+
+ // Clone dominator info.
+ if (DT)
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ CloneDominatorInfo(BB, ValueMap, DT, DF);
+ }
+
+ // Process sub loops
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ LoopNest.push_back(*I);
+ }
+
+ // Remap instructions to reference operands from ValueMap.
+ for(SmallVector<BasicBlock *, 16>::iterator NBItr = NewBlocks.begin(),
+ NBE = NewBlocks.end(); NBItr != NBE; ++NBItr) {
+ BasicBlock *NB = *NBItr;
+ for(BasicBlock::iterator BI = NB->begin(), BE = NB->end();
+ BI != BE; ++BI) {
+ Instruction *Insn = BI;
+ for (unsigned index = 0, num_ops = Insn->getNumOperands();
+ index != num_ops; ++index) {
+ Value *Op = Insn->getOperand(index);
+ DenseMap<const Value *, Value *>::iterator OpItr = ValueMap.find(Op);
+ if (OpItr != ValueMap.end())
+ Insn->setOperand(index, OpItr->second);
+ }
+ }
+ }
+
+ BasicBlock *Latch = OrigL->getLoopLatch();
+ Function *F = Latch->getParent();
+ F->getBasicBlockList().insert(OrigL->getHeader(),
+ NewBlocks.begin(), NewBlocks.end());
+
+ return NewParentLoop;
+}
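+
+// A usage sketch (hypothetical): from inside a LoopPass::runOnLoop(Loop *L,
+// LPPassManager &LPM), clone L together with its dominator info.
+//
+//   DenseMap<const Value*, Value*> ValueMap;
+//   LoopInfo *LI = &getAnalysis<LoopInfo>();
+//   Loop *NewL = CloneLoop(L, &LPM, LI, ValueMap, this);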
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
new file mode 100644
index 0000000..337fa8a
--- /dev/null
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -0,0 +1,126 @@
+//===- CloneModule.cpp - Clone an entire module ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneModule interface which makes a copy of an
+// entire module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Constant.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+/// CloneModule - Return an exact copy of the specified module. This is not as
+/// easy as it might seem because we have to worry about making copies of global
+/// variables and functions, and making their (initializers and references,
+/// respectively) refer to the right globals.
+///
+Module *llvm::CloneModule(const Module *M) {
+ // Create the value map that maps things from the old module over to the new
+ // module.
+ DenseMap<const Value*, Value*> ValueMap;
+ return CloneModule(M, ValueMap);
+}
+
+Module *llvm::CloneModule(const Module *M,
+ DenseMap<const Value*, Value*> &ValueMap) {
+ // First off, we need to create the new module...
+ Module *New = new Module(M->getModuleIdentifier());
+ New->setDataLayout(M->getDataLayout());
+ New->setTargetTriple(M->getTargetTriple());
+ New->setModuleInlineAsm(M->getModuleInlineAsm());
+
+ // Copy all of the type symbol table entries over.
+ const TypeSymbolTable &TST = M->getTypeSymbolTable();
+ for (TypeSymbolTable::const_iterator TI = TST.begin(), TE = TST.end();
+ TI != TE; ++TI)
+ New->addTypeName(TI->first, TI->second);
+
+ // Copy all of the dependent libraries over.
+ for (Module::lib_iterator I = M->lib_begin(), E = M->lib_end(); I != E; ++I)
+ New->addLibrary(*I);
+
+ // Loop over all of the global variables, making corresponding globals in the
+ // new module. Here we add them to the ValueMap and to the new Module. We
+ // don't worry about attributes or initializers, they will come later.
+ //
+ for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ I != E; ++I) {
+ GlobalVariable *GV = new GlobalVariable(I->getType()->getElementType(),
+ false,
+ GlobalValue::ExternalLinkage, 0,
+ I->getName(), New);
+ GV->setAlignment(I->getAlignment());
+ ValueMap[I] = GV;
+ }
+
+ // Loop over the functions in the module, making external functions as before
+ for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+ Function *NF =
+ Function::Create(cast<FunctionType>(I->getType()->getElementType()),
+ GlobalValue::ExternalLinkage, I->getName(), New);
+ NF->copyAttributesFrom(I);
+ ValueMap[I] = NF;
+ }
+
+ // Loop over the aliases in the module
+ for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E; ++I)
+ ValueMap[I] = new GlobalAlias(I->getType(), GlobalAlias::ExternalLinkage,
+ I->getName(), NULL, New);
+
+  // Now that all of the things that a global variable initializer can refer
+  // to have been created, loop through and copy the global variable
+  // initializers over. We also set the attributes on the globals now.
+ //
+ for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ I != E; ++I) {
+ GlobalVariable *GV = cast<GlobalVariable>(ValueMap[I]);
+ if (I->hasInitializer())
+ GV->setInitializer(cast<Constant>(MapValue(I->getInitializer(),
+ ValueMap)));
+ GV->setLinkage(I->getLinkage());
+ GV->setThreadLocal(I->isThreadLocal());
+ GV->setConstant(I->isConstant());
+ }
+
+ // Similarly, copy over function bodies now...
+ //
+ for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+ Function *F = cast<Function>(ValueMap[I]);
+ if (!I->isDeclaration()) {
+ Function::arg_iterator DestI = F->arg_begin();
+ for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end();
+ ++J) {
+ DestI->setName(J->getName());
+ ValueMap[J] = DestI++;
+ }
+
+ std::vector<ReturnInst*> Returns; // Ignore returns cloned...
+ CloneFunctionInto(F, I, ValueMap, Returns);
+ }
+
+ F->setLinkage(I->getLinkage());
+ }
+
+ // And aliases
+ for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E; ++I) {
+ GlobalAlias *GA = cast<GlobalAlias>(ValueMap[I]);
+ GA->setLinkage(I->getLinkage());
+ if (const Constant* C = I->getAliasee())
+ GA->setAliasee(cast<Constant>(MapValue(C, ValueMap)));
+ }
+
+ return New;
+}
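+
+// A usage sketch (hypothetical): take a private copy of module `M`, experiment
+// on the copy, and throw it away; M itself is left untouched.
+//
+//   Module *Copy = CloneModule(M);
+//   // ... run speculative transformations on Copy ...
+//   delete Copy;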
diff --git a/lib/Transforms/Utils/CloneTrace.cpp b/lib/Transforms/Utils/CloneTrace.cpp
new file mode 100644
index 0000000..0711139
--- /dev/null
+++ b/lib/Transforms/Utils/CloneTrace.cpp
@@ -0,0 +1,119 @@
+//===- CloneTrace.cpp - Clone a trace -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneTrace interface, which is used when writing
+// runtime optimizations. It takes a vector of basic blocks clones the basic
+// blocks, removes internal phi nodes, adds it to the same function as the
+// original (although there is no jump to it) and returns the new vector of
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Trace.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+//Clones the trace (a vector of basic blocks)
+std::vector<BasicBlock *>
+llvm::CloneTrace(const std::vector<BasicBlock*> &origTrace) {
+ std::vector<BasicBlock *> clonedTrace;
+ DenseMap<const Value*, Value*> ValueMap;
+
+ //First, loop over all the Basic Blocks in the trace and copy
+ //them using CloneBasicBlock. Also fix the phi nodes during
+ //this loop. To fix the phi nodes, we delete incoming branches
+ //that are not in the trace.
+ for (std::vector<BasicBlock *>::const_iterator T = origTrace.begin(),
+ End = origTrace.end(); T != End; ++T) {
+
+ //Clone Basic Block
+ BasicBlock *clonedBlock =
+ CloneBasicBlock(*T, ValueMap, ".tr", (*T)->getParent());
+
+ //Add it to our new trace
+ clonedTrace.push_back(clonedBlock);
+
+ //Add this new mapping to our Value Map
+ ValueMap[*T] = clonedBlock;
+
+ //Loop over the phi instructions and delete operands
+ //that are from blocks not in the trace
+ //only do this if we are NOT the first block
+ if (T != origTrace.begin()) {
+      for (BasicBlock::iterator I = clonedBlock->begin(); isa<PHINode>(I); ) {
+        //advance the iterator before erasing, so it stays valid
+        PHINode *PN = cast<PHINode>(I++);
+        //get incoming value for the previous BB
+        Value *V = PN->getIncomingValueForBlock(*(T-1));
+        assert(V && "No incoming value from a BasicBlock in our trace!");
+
+        //remap our phi node to point to incoming value
+        ValueMap[PN] = V;
+
+        //remove phi node
+        clonedBlock->getInstList().erase(PN);
+ }
+ }
+ }
+
+ //Second loop to do the remapping
+ for (std::vector<BasicBlock *>::const_iterator BB = clonedTrace.begin(),
+ BE = clonedTrace.end(); BB != BE; ++BB) {
+ for (BasicBlock::iterator I = (*BB)->begin(); I != (*BB)->end(); ++I) {
+ //Loop over all the operands of the instruction
+ for (unsigned op=0, E = I->getNumOperands(); op != E; ++op) {
+ const Value *Op = I->getOperand(op);
+
+ //Get it out of the value map
+ Value *V = ValueMap[Op];
+
+      //If it's not in the value map, then it's outside our trace, so ignore it
+ if (V != 0)
+ I->setOperand(op,V);
+ }
+ }
+ }
+
+ //return new vector of basic blocks
+ return clonedTrace;
+}
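+
+//Usage sketch (hypothetical; Header and Body are BasicBlock* along one path
+//through a function):
+//
+//  std::vector<BasicBlock*> Blocks;
+//  Blocks.push_back(Header);
+//  Blocks.push_back(Body);
+//  std::vector<BasicBlock*> Cloned = CloneTrace(Blocks);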
+
+/// CloneTraceInto - Clone T into NewFunc. Original<->clone mapping is
+/// saved in ValueMap.
+///
+void llvm::CloneTraceInto(Function *NewFunc, Trace &T,
+ DenseMap<const Value*, Value*> &ValueMap,
+ const char *NameSuffix) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+ // Loop over all of the basic blocks in the trace, cloning them as
+ // appropriate.
+ //
+ for (Trace::const_iterator BI = T.begin(), BE = T.end(); BI != BE; ++BI) {
+ const BasicBlock *BB = *BI;
+
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB = CloneBasicBlock(BB, ValueMap, NameSuffix, NewFunc);
+ ValueMap[BB] = CBB; // Add basic block mapping.
+ }
+
+ // Loop over all of the instructions in the new function, fixing up operand
+ // references as we go. This uses ValueMap to do all the hard work.
+ //
+ for (Function::iterator BB =
+ cast<BasicBlock>(ValueMap[T.getEntryBasicBlock()]),
+ BE = NewFunc->end(); BB != BE; ++BB)
+ // Loop over all instructions, fixing each one as we find it...
+ for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II)
+ RemapInstruction(II, ValueMap);
+}
+
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
new file mode 100644
index 0000000..6d5904e
--- /dev/null
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -0,0 +1,746 @@
+//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interface to tear out a code region, such as an
+// individual loop or a parallel section, into a new function, replacing it with
+// a call to the new function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+// Provide a command-line option to aggregate function arguments into a struct
+// for functions produced by the code extractor. This is useful when converting
+// extracted functions to pthread-based code, as only one argument (void*) can
+// be passed in to pthread_create().
+static cl::opt<bool>
+AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
+ cl::desc("Aggregate arguments to code-extracted functions"));
+
+namespace {
+ class VISIBILITY_HIDDEN CodeExtractor {
+ typedef std::vector<Value*> Values;
+ std::set<BasicBlock*> BlocksToExtract;
+ DominatorTree* DT;
+ bool AggregateArgs;
+ unsigned NumExitBlocks;
+ const Type *RetTy;
+ public:
+ CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false)
+ : DT(dt), AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {}
+
+ Function *ExtractCodeRegion(const std::vector<BasicBlock*> &code);
+
+ bool isEligible(const std::vector<BasicBlock*> &code);
+
+ private:
+ /// definedInRegion - Return true if the specified value is defined in the
+ /// extracted region.
+ bool definedInRegion(Value *V) const {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (BlocksToExtract.count(I->getParent()))
+ return true;
+ return false;
+ }
+
+ /// definedInCaller - Return true if the specified value is defined in the
+ /// function being code extracted, but not in the region being extracted.
+ /// These values must be passed in as live-ins to the function.
+ bool definedInCaller(Value *V) const {
+ if (isa<Argument>(V)) return true;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (!BlocksToExtract.count(I->getParent()))
+ return true;
+ return false;
+ }
+
+ void severSplitPHINodes(BasicBlock *&Header);
+ void splitReturnBlocks();
+ void findInputsOutputs(Values &inputs, Values &outputs);
+
+ Function *constructFunction(const Values &inputs,
+ const Values &outputs,
+ BasicBlock *header,
+ BasicBlock *newRootNode, BasicBlock *newHeader,
+ Function *oldFunction, Module *M);
+
+ void moveCodeToFunction(Function *newFunction);
+
+ void emitCallAndSwitchStatement(Function *newFunction,
+ BasicBlock *newHeader,
+ Values &inputs,
+ Values &outputs);
+
+ };
+}
+
+/// severSplitPHINodes - If a PHI node has multiple inputs from outside of the
+/// region, we need to split the entry block of the region so that the PHI node
+/// is easier to deal with.
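+///
+/// For example (illustrative): given a header PHI
+///   %p = phi i32 [ %a, %out1 ], [ %b, %out2 ], [ %c, %in ]
+/// where %out1 and %out2 are outside the region and %in is inside, the header
+/// is split after its PHIs; the second half becomes the new region header, and
+/// a new PHI there merges %p (from the first half) with the internal value %c.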
+void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
+ bool HasPredsFromRegion = false;
+ unsigned NumPredsOutsideRegion = 0;
+
+ if (Header != &Header->getParent()->getEntryBlock()) {
+ PHINode *PN = dyn_cast<PHINode>(Header->begin());
+ if (!PN) return; // No PHI nodes.
+
+ // If the header node contains any PHI nodes, check to see if there is more
+ // than one entry from outside the region. If so, we need to sever the
+ // header block into two.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (BlocksToExtract.count(PN->getIncomingBlock(i)))
+ HasPredsFromRegion = true;
+ else
+ ++NumPredsOutsideRegion;
+
+ // If there is one (or fewer) predecessor from outside the region, we don't
+ // need to do anything special.
+ if (NumPredsOutsideRegion <= 1) return;
+ }
+
+ // Otherwise, we need to split the header block into two pieces: one
+ // containing PHI nodes merging values from outside of the region, and a
+ // second that contains all of the code for the block and merges back any
+ // incoming values from inside of the region.
+ BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI();
+ BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs,
+ Header->getName()+".ce");
+
+ // We only want to code extract the second block now, and it becomes the new
+ // header of the region.
+ BasicBlock *OldPred = Header;
+ BlocksToExtract.erase(OldPred);
+ BlocksToExtract.insert(NewBB);
+ Header = NewBB;
+
+  // Okay, update dominator sets. The blocks that dominate the new one are the
+  // blocks that dominate the old header plus the new block itself.
+ if (DT)
+ DT->splitBlock(NewBB);
+
+ // Okay, now we need to adjust the PHI nodes and any branches from within the
+ // region to go to the new header block instead of the old header block.
+ if (HasPredsFromRegion) {
+ PHINode *PN = cast<PHINode>(OldPred->begin());
+ // Loop over all of the predecessors of OldPred that are in the region,
+ // changing them to branch to NewBB instead.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+ TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator();
+ TI->replaceUsesOfWith(OldPred, NewBB);
+ }
+
+    // Okay, everything within the region is now branching to the right block;
+    // we just have to update the PHI nodes now, inserting PHI nodes into NewBB.
+ for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
+ PHINode *PN = cast<PHINode>(AfterPHIs);
+ // Create a new PHI node in the new region, which has an incoming value
+ // from OldPred of PN.
+ PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName()+".ce",
+ NewBB->begin());
+ NewPN->addIncoming(PN, OldPred);
+
+      // Loop over all of the incoming values in PN, moving them to NewPN if they
+ // are from the extracted region.
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+ NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
+ PN->removeIncomingValue(i);
+ --i;
+ }
+ }
+ }
+ }
+}
+
+void CodeExtractor::splitReturnBlocks() {
+ for (std::set<BasicBlock*>::iterator I = BlocksToExtract.begin(),
+ E = BlocksToExtract.end(); I != E; ++I)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator()))
+ (*I)->splitBasicBlock(RI, (*I)->getName()+".ret");
+}
+
+// findInputsOutputs - Find inputs to, outputs from the code region.
+//
+void CodeExtractor::findInputsOutputs(Values &inputs, Values &outputs) {
+ std::set<BasicBlock*> ExitBlocks;
+ for (std::set<BasicBlock*>::const_iterator ci = BlocksToExtract.begin(),
+ ce = BlocksToExtract.end(); ci != ce; ++ci) {
+ BasicBlock *BB = *ci;
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // If a used value is defined outside the region, it's an input. If an
+ // instruction is used outside the region, it's an output.
+ for (User::op_iterator O = I->op_begin(), E = I->op_end(); O != E; ++O)
+ if (definedInCaller(*O))
+ inputs.push_back(*O);
+
+ // Consider uses of this instruction (outputs).
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI)
+ if (!definedInRegion(*UI)) {
+ outputs.push_back(I);
+ break;
+ }
+ } // for: insts
+
+ // Keep track of the exit blocks from the region.
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (!BlocksToExtract.count(TI->getSuccessor(i)))
+ ExitBlocks.insert(TI->getSuccessor(i));
+ } // for: basic blocks
+
+ NumExitBlocks = ExitBlocks.size();
+
+ // Eliminate duplicates.
+ std::sort(inputs.begin(), inputs.end());
+ inputs.erase(std::unique(inputs.begin(), inputs.end()), inputs.end());
+ std::sort(outputs.begin(), outputs.end());
+ outputs.erase(std::unique(outputs.begin(), outputs.end()), outputs.end());
+}
+
+/// constructFunction - make a function based on inputs and outputs, as follows:
+/// f(in0, ..., inN, out0, ..., outN)
+///
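+/// (With argument aggregation this becomes, roughly,
+///  f(struct {in0, ..., inN, out0, ..., outN} *).)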
+Function *CodeExtractor::constructFunction(const Values &inputs,
+ const Values &outputs,
+ BasicBlock *header,
+ BasicBlock *newRootNode,
+ BasicBlock *newHeader,
+ Function *oldFunction,
+ Module *M) {
+ DOUT << "inputs: " << inputs.size() << "\n";
+ DOUT << "outputs: " << outputs.size() << "\n";
+
+  // The new function returns an unsigned value; outputs are passed back by
+  // reference.
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: RetTy = Type::VoidTy; break;
+ case 2: RetTy = Type::Int1Ty; break;
+ default: RetTy = Type::Int16Ty; break;
+ }
+
+ std::vector<const Type*> paramTy;
+
+ // Add the types of the input values to the function's argument list
+ for (Values::const_iterator i = inputs.begin(),
+ e = inputs.end(); i != e; ++i) {
+ const Value *value = *i;
+ DOUT << "value used in func: " << *value << "\n";
+ paramTy.push_back(value->getType());
+ }
+
+ // Add the types of the output values to the function's argument list.
+ for (Values::const_iterator I = outputs.begin(), E = outputs.end();
+ I != E; ++I) {
+ DOUT << "instr used in func: " << **I << "\n";
+ if (AggregateArgs)
+ paramTy.push_back((*I)->getType());
+ else
+ paramTy.push_back(PointerType::getUnqual((*I)->getType()));
+ }
+
+ DOUT << "Function type: " << *RetTy << " f(";
+ for (std::vector<const Type*>::iterator i = paramTy.begin(),
+ e = paramTy.end(); i != e; ++i)
+ DOUT << **i << ", ";
+ DOUT << ")\n";
+
+ if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ PointerType *StructPtr = PointerType::getUnqual(StructType::get(paramTy));
+ paramTy.clear();
+ paramTy.push_back(StructPtr);
+ }
+ const FunctionType *funcType = FunctionType::get(RetTy, paramTy, false);
+
+ // Create the new function
+ Function *newFunction = Function::Create(funcType,
+ GlobalValue::InternalLinkage,
+ oldFunction->getName() + "_" +
+ header->getName(), M);
+ // If the old function is no-throw, so is the new one.
+ if (oldFunction->doesNotThrow())
+ newFunction->setDoesNotThrow(true);
+
+ newFunction->getBasicBlockList().push_back(newRootNode);
+
+ // Create an iterator to name all of the arguments we inserted.
+ Function::arg_iterator AI = newFunction->arg_begin();
+
+ // Rewrite all users of the inputs in the extracted region to use the
+ // arguments (or appropriate addressing into struct) instead.
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ Value *RewriteVal;
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::Int32Ty);
+ Idx[1] = ConstantInt::get(Type::Int32Ty, i);
+ std::string GEPname = "gep_" + inputs[i]->getName();
+ TerminatorInst *TI = newFunction->begin()->getTerminator();
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(AI, Idx, Idx+2,
+ GEPname, TI);
+ RewriteVal = new LoadInst(GEP, "load" + GEPname, TI);
+ } else
+ RewriteVal = AI++;
+
+ std::vector<User*> Users(inputs[i]->use_begin(), inputs[i]->use_end());
+ for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end();
+ use != useE; ++use)
+ if (Instruction* inst = dyn_cast<Instruction>(*use))
+ if (BlocksToExtract.count(inst->getParent()))
+ inst->replaceUsesOfWith(inputs[i], RewriteVal);
+ }
+
+ // Set names for input and output arguments.
+ if (!AggregateArgs) {
+ AI = newFunction->arg_begin();
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
+ AI->setName(inputs[i]->getName());
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
+ AI->setName(outputs[i]->getName()+".out");
+ }
+
+ // Rewrite branches to basic blocks outside of the loop to new dummy blocks
+ // within the new function. This must be done before we lose track of which
+ // blocks were originally in the code region.
+ std::vector<User*> Users(header->use_begin(), header->use_end());
+ for (unsigned i = 0, e = Users.size(); i != e; ++i)
+ // The BasicBlock which contains the branch is not in the region
+ // modify the branch target to a new block
+ if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i]))
+ if (!BlocksToExtract.count(TI->getParent()) &&
+ TI->getParent()->getParent() == oldFunction)
+ TI->replaceUsesOfWith(header, newHeader);
+
+ return newFunction;
+}
+
+/// emitCallAndSwitchStatement - This method sets up the caller side by adding
+/// the call instruction, splitting any PHI nodes in the header block as
+/// necessary.
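+///
+/// Sketch of the caller-side result (illustrative, for three or more exits):
+///   %t = call i16 @extracted(...)
+///   switch i16 %t, label %exit.0 [ i16 1, label %exit.1, ... ]
+/// where each destination is one of the region's original exit blocks.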
+void CodeExtractor::
+emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
+ Values &inputs, Values &outputs) {
+  // Emit a call to the new function, passing in either a pointer to the
+  // argument struct (if aggregating parameters) or the plain inputs plus
+  // allocated memory for the outputs.
+ std::vector<Value*> params, StructValues, ReloadOutputs;
+
+ // Add inputs as params, or to be filled into the struct
+ for (Values::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i)
+ if (AggregateArgs)
+ StructValues.push_back(*i);
+ else
+ params.push_back(*i);
+
+ // Create allocas for the outputs
+ for (Values::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) {
+ if (AggregateArgs) {
+ StructValues.push_back(*i);
+ } else {
+ AllocaInst *alloca =
+ new AllocaInst((*i)->getType(), 0, (*i)->getName()+".loc",
+ codeReplacer->getParent()->begin()->begin());
+ ReloadOutputs.push_back(alloca);
+ params.push_back(alloca);
+ }
+ }
+
+ AllocaInst *Struct = 0;
+ if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ std::vector<const Type*> ArgTypes;
+ for (Values::iterator v = StructValues.begin(),
+ ve = StructValues.end(); v != ve; ++v)
+ ArgTypes.push_back((*v)->getType());
+
+ // Allocate a struct at the beginning of this function
+ Type *StructArgTy = StructType::get(ArgTypes);
+ Struct =
+ new AllocaInst(StructArgTy, 0, "structArg",
+ codeReplacer->getParent()->begin()->begin());
+ params.push_back(Struct);
+
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::Int32Ty);
+ Idx[1] = ConstantInt::get(Type::Int32Ty, i);
+ GetElementPtrInst *GEP =
+ GetElementPtrInst::Create(Struct, Idx, Idx + 2,
+ "gep_" + StructValues[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ StoreInst *SI = new StoreInst(StructValues[i], GEP);
+ codeReplacer->getInstList().push_back(SI);
+ }
+ }
+
+ // Emit the call to the function
+ CallInst *call = CallInst::Create(newFunction, params.begin(), params.end(),
+ NumExitBlocks > 1 ? "targetBlock" : "");
+ codeReplacer->getInstList().push_back(call);
+
+ Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
+ unsigned FirstOut = inputs.size();
+ if (!AggregateArgs)
+ std::advance(OutputArgBegin, inputs.size());
+
+ // Reload the outputs passed in by reference
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+ Value *Output = 0;
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::Int32Ty);
+ Idx[1] = ConstantInt::get(Type::Int32Ty, FirstOut + i);
+ GetElementPtrInst *GEP
+ = GetElementPtrInst::Create(Struct, Idx, Idx + 2,
+ "gep_reload_" + outputs[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ Output = GEP;
+ } else {
+ Output = ReloadOutputs[i];
+ }
+ LoadInst *load = new LoadInst(Output, outputs[i]->getName()+".reload");
+ codeReplacer->getInstList().push_back(load);
+ std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end());
+ for (unsigned u = 0, e = Users.size(); u != e; ++u) {
+ Instruction *inst = cast<Instruction>(Users[u]);
+ if (!BlocksToExtract.count(inst->getParent()))
+ inst->replaceUsesOfWith(outputs[i], load);
+ }
+ }
+
+ // Now we can emit a switch statement using the call as a value.
+ SwitchInst *TheSwitch =
+ SwitchInst::Create(ConstantInt::getNullValue(Type::Int16Ty),
+ codeReplacer, 0, codeReplacer);
+
+ // Since there may be multiple exits from the original region, make the new
+  // function return an unsigned exit number and switch on it. This loop iterates
+ // over all of the blocks in the extracted region, updating any terminator
+ // instructions in the to-be-extracted region that branch to blocks that are
+ // not in the region to be extracted.
+ std::map<BasicBlock*, BasicBlock*> ExitBlockMap;
+
+ unsigned switchVal = 0;
+ for (std::set<BasicBlock*>::const_iterator i = BlocksToExtract.begin(),
+ e = BlocksToExtract.end(); i != e; ++i) {
+ TerminatorInst *TI = (*i)->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (!BlocksToExtract.count(TI->getSuccessor(i))) {
+ BasicBlock *OldTarget = TI->getSuccessor(i);
+ // add a new basic block which returns the appropriate value
+ BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
+ if (!NewTarget) {
+ // If we don't already have an exit stub for this non-extracted
+ // destination, create one now!
+ NewTarget = BasicBlock::Create(OldTarget->getName() + ".exitStub",
+ newFunction);
+ unsigned SuccNum = switchVal++;
+
+ Value *brVal = 0;
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: break; // No value needed.
+ case 2: // Conditional branch, return a bool
+ brVal = ConstantInt::get(Type::Int1Ty, !SuccNum);
+ break;
+ default:
+ brVal = ConstantInt::get(Type::Int16Ty, SuccNum);
+ break;
+ }
+
+ ReturnInst *NTRet = ReturnInst::Create(brVal, NewTarget);
+
+ // Update the switch instruction.
+ TheSwitch->addCase(ConstantInt::get(Type::Int16Ty, SuccNum),
+ OldTarget);
+
+ // Restore values just before we exit
+ Function::arg_iterator OAI = OutputArgBegin;
+ for (unsigned out = 0, e = outputs.size(); out != e; ++out) {
+ // For an invoke, the normal destination is the only one that is
+ // dominated by the result of the invocation
+ BasicBlock *DefBlock = cast<Instruction>(outputs[out])->getParent();
+
+ bool DominatesDef = true;
+
+ if (InvokeInst *Invoke = dyn_cast<InvokeInst>(outputs[out])) {
+ DefBlock = Invoke->getNormalDest();
+
+ // Make sure we are looking at the original successor block, not
+ // at a newly inserted exit block, which won't be in the dominator
+ // info.
+ for (std::map<BasicBlock*, BasicBlock*>::iterator I =
+ ExitBlockMap.begin(), E = ExitBlockMap.end(); I != E; ++I)
+ if (DefBlock == I->second) {
+ DefBlock = I->first;
+ break;
+ }
+
+ // In the extract block case, if the block we are extracting ends
+ // with an invoke instruction, make sure that we don't emit a
+ // store of the invoke value for the unwind block.
+ if (!DT && DefBlock != OldTarget)
+ DominatesDef = false;
+ }
+
+ if (DT)
+ DominatesDef = DT->dominates(DefBlock, OldTarget);
+
+ if (DominatesDef) {
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::Int32Ty);
+ Idx[1] = ConstantInt::get(Type::Int32Ty,FirstOut+out);
+ GetElementPtrInst *GEP =
+ GetElementPtrInst::Create(OAI, Idx, Idx + 2,
+ "gep_" + outputs[out]->getName(),
+ NTRet);
+ new StoreInst(outputs[out], GEP, NTRet);
+ } else {
+ new StoreInst(outputs[out], OAI, NTRet);
+ }
+ }
+ // Advance output iterator even if we don't emit a store
+ if (!AggregateArgs) ++OAI;
+ }
+ }
+
+ // rewrite the original branch instruction with this new target
+ TI->setSuccessor(i, NewTarget);
+ }
+ }
+
+ // Now that we've done the deed, simplify the switch instruction.
+ const Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
+ switch (NumExitBlocks) {
+ case 0:
+    // There are no successors (other than the block containing the switch
+    // itself), which means that previously this was the last part of the
+    // function, and hence this should be rewritten as a `ret'.
+
+ // Check if the function should return a value
+ if (OldFnRetTy == Type::VoidTy) {
+ ReturnInst::Create(0, TheSwitch); // Return void
+ } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
+ // return what we have
+ ReturnInst::Create(TheSwitch->getCondition(), TheSwitch);
+ } else {
+      // Otherwise we must have extracted code containing an unwind or
+      // something; just return a null value of the function's return type.
+ ReturnInst::Create(Constant::getNullValue(OldFnRetTy), TheSwitch);
+ }
+
+ TheSwitch->eraseFromParent();
+ break;
+ case 1:
+ // Only a single destination, change the switch into an unconditional
+ // branch.
+ BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
+ TheSwitch->eraseFromParent();
+ break;
+ case 2:
+ BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
+ call, TheSwitch);
+ TheSwitch->eraseFromParent();
+ break;
+ default:
+ // Otherwise, make the default destination of the switch instruction be one
+ // of the other successors.
+ TheSwitch->setOperand(0, call);
+ TheSwitch->setSuccessor(0, TheSwitch->getSuccessor(NumExitBlocks));
+ TheSwitch->removeCase(NumExitBlocks); // Remove redundant case
+ break;
+ }
+}
+
+void CodeExtractor::moveCodeToFunction(Function *newFunction) {
+ Function *oldFunc = (*BlocksToExtract.begin())->getParent();
+ Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
+ Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
+
+ for (std::set<BasicBlock*>::const_iterator i = BlocksToExtract.begin(),
+ e = BlocksToExtract.end(); i != e; ++i) {
+ // Delete the basic block from the old function, and the list of blocks
+ oldBlocks.remove(*i);
+
+ // Insert this basic block into the new function
+ newBlocks.push_back(*i);
+ }
+}
+
+/// ExtractCodeRegion - Removes a code region (such as a loop) from a function,
+/// replacing it with a call to a new function. Returns a pointer to the new
+/// function.
+///
+/// algorithm:
+///
+/// find inputs and outputs for the region
+///
+/// for inputs: add to function as args, map input instr* to arg#
+/// for outputs: add allocas for scalars,
+/// add to func as args, map output instr* to arg#
+///
+/// rewrite func to use argument #s instead of instr*
+///
+/// for each scalar output in the function: at every exit, store intermediate
+/// computed result back into memory.
+///
+Function *CodeExtractor::
+ExtractCodeRegion(const std::vector<BasicBlock*> &code) {
+ if (!isEligible(code))
+ return 0;
+
+ // 1) Find inputs, outputs
+ // 2) Construct new function
+ // * Add allocas for defs, pass as args by reference
+ // * Pass in uses as args
+ // 3) Move code region, add call instr to func
+ //
+ BlocksToExtract.insert(code.begin(), code.end());
+
+ Values inputs, outputs;
+
+ // Assumption: this is a single-entry code region, and the header is the first
+ // block in the region.
+ BasicBlock *header = code[0];
+
+ for (unsigned i = 1, e = code.size(); i != e; ++i)
+ for (pred_iterator PI = pred_begin(code[i]), E = pred_end(code[i]);
+ PI != E; ++PI)
+ assert(BlocksToExtract.count(*PI) &&
+ "No blocks in this region may have entries from outside the region"
+ " except for the first block!");
+
+ // If we have to split PHI nodes or the entry block, do so now.
+ severSplitPHINodes(header);
+
+ // If we have any return instructions in the region, split those blocks so
+ // that the return is not in the region.
+ splitReturnBlocks();
+
+ Function *oldFunction = header->getParent();
+
+  // This block takes the place of the original region
+ BasicBlock *codeReplacer = BasicBlock::Create("codeRepl", oldFunction,
+ header);
+
+ // The new function needs a root node because other nodes can branch to the
+ // head of the region, but the entry node of a function cannot have preds.
+ BasicBlock *newFuncRoot = BasicBlock::Create("newFuncRoot");
+ newFuncRoot->getInstList().push_back(BranchInst::Create(header));
+
+ // Find inputs to, outputs from the code region.
+ findInputsOutputs(inputs, outputs);
+
+ // Construct new function based on inputs/outputs & add allocas for all defs.
+ Function *newFunction = constructFunction(inputs, outputs, header,
+ newFuncRoot,
+ codeReplacer, oldFunction,
+ oldFunction->getParent());
+
+ emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
+
+ moveCodeToFunction(newFunction);
+
+ // Loop over all of the PHI nodes in the header block, and change any
+ // references to the old incoming edge to be the new incoming edge.
+ for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!BlocksToExtract.count(PN->getIncomingBlock(i)))
+ PN->setIncomingBlock(i, newFuncRoot);
+ }
+
+ // Look at all successors of the codeReplacer block. If any of these blocks
+ // had PHI nodes in them, we need to update the "from" block to be the code
+ // replacer, not the original block in the extracted region.
+ std::vector<BasicBlock*> Succs(succ_begin(codeReplacer),
+ succ_end(codeReplacer));
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+ for (BasicBlock::iterator I = Succs[i]->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ std::set<BasicBlock*> ProcessedPreds;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
+ if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second)
+ PN->setIncomingBlock(i, codeReplacer);
+ else {
+ // There were multiple entries in the PHI for this block, now there
+ // is only one, so remove the duplicated entries.
+ PN->removeIncomingValue(i, false);
+ --i; --e;
+ }
+ }
+ }
+
+ //cerr << "NEW FUNCTION: " << *newFunction;
+ // verifyFunction(*newFunction);
+
+ // cerr << "OLD FUNCTION: " << *oldFunction;
+ // verifyFunction(*oldFunction);
+
+ DEBUG(if (verifyFunction(*newFunction)) abort());
+ return newFunction;
+}
+
+bool CodeExtractor::isEligible(const std::vector<BasicBlock*> &code) {
+ // Deny code region if it contains allocas or vastarts.
+ for (std::vector<BasicBlock*>::const_iterator BB = code.begin(), e=code.end();
+ BB != e; ++BB)
+ for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end();
+ I != Ie; ++I)
+ if (isa<AllocaInst>(*I))
+ return false;
+ else if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (const Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::vastart)
+ return false;
+ return true;
+}
+
+
+/// ExtractCodeRegion - slurp a sequence of basic blocks into a brand new
+/// function
+///
+Function* llvm::ExtractCodeRegion(DominatorTree &DT,
+ const std::vector<BasicBlock*> &code,
+ bool AggregateArgs) {
+ return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code);
+}
+
+/// ExtractLoop - slurp a natural loop into a brand new function
+///
+Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) {
+ return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks());
+}
+
+/// ExtractBasicBlock - slurp a basic block into a brand new function
+///
+Function* llvm::ExtractBasicBlock(BasicBlock *BB, bool AggregateArgs) {
+ std::vector<BasicBlock*> Blocks;
+ Blocks.push_back(BB);
+ return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(Blocks);
+}
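+
+// Usage sketch (illustrative): a pass holding a DominatorTree DT can pull a
+// natural loop L into its own function via
+//   if (Function *NewF = llvm::ExtractLoop(DT, L, false))
+//     ...;  // L's blocks now live in NewF; the old code is a call to it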
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
new file mode 100644
index 0000000..b8dd754
--- /dev/null
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -0,0 +1,144 @@
+//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the function DemoteRegToStack(). This function takes a
+// virtual register computed by an Instruction and replaces it with a slot in
+// the stack frame, allocated via alloca. It returns the pointer to the
+// AllocaInst inserted. After this function is called on an instruction, we are
+// guaranteed that the only user of the instruction is a store that is
+// immediately after it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include <map>
+using namespace llvm;
+
+/// DemoteRegToStack - This function takes a virtual register computed by an
+/// Instruction and replaces it with a slot in the stack frame, allocated via
+/// alloca. This allows the CFG to be changed around without fear of
+/// invalidating the SSA information for the value. It returns the pointer to
+/// the alloca inserted to create a stack slot for I.
+///
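+/// Roughly (illustrative):  %v = add i32 %a, %b  becomes
+///   %v.reg2mem = alloca i32              ; in the entry block
+///   %v = add i32 %a, %b
+///   store i32 %v, i32* %v.reg2mem        ; immediately after the def
+/// and each former use of %v loads from %v.reg2mem instead.
+///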
+AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
+ Instruction *AllocaPoint) {
+ if (I.use_empty()) {
+ I.eraseFromParent();
+ return 0;
+ }
+
+ // Create a stack slot to hold the value.
+ AllocaInst *Slot;
+ if (AllocaPoint) {
+ Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem", AllocaPoint);
+ } else {
+ Function *F = I.getParent()->getParent();
+ Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem",
+ F->getEntryBlock().begin());
+ }
+
+ // Change all of the users of the instruction to read from the stack slot
+ // instead.
+ while (!I.use_empty()) {
+ Instruction *U = cast<Instruction>(I.use_back());
+ if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ // If this is a PHI node, we can't insert a load of the value before the
+ // use. Instead, insert the load in the predecessor block corresponding
+ // to the incoming value.
+ //
+ // Note that if there are multiple edges from a basic block to this PHI
+      // node, we cannot insert multiple loads. The problem is that the resultant
+ // PHI node will have multiple values (from each load) coming in from the
+ // same block, which is illegal SSA form. For this reason, we keep track
+ // and reuse loads we insert.
+ std::map<BasicBlock*, Value*> Loads;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == &I) {
+ Value *&V = Loads[PN->getIncomingBlock(i)];
+ if (V == 0) {
+ // Insert the load into the predecessor block
+ V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads,
+ PN->getIncomingBlock(i)->getTerminator());
+ }
+ PN->setIncomingValue(i, V);
+ }
+
+ } else {
+ // If this is a normal instruction, just insert a load.
+ Value *V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, U);
+ U->replaceUsesOfWith(&I, V);
+ }
+ }
+
+
+ // Insert stores of the computed value into the stack slot. We have to be
+  // careful if I is an invoke instruction though, because we can't insert the
+ // store AFTER the terminator instruction.
+ BasicBlock::iterator InsertPt;
+ if (!isa<TerminatorInst>(I)) {
+ InsertPt = &I;
+ ++InsertPt;
+ } else {
+ // We cannot demote invoke instructions to the stack if their normal edge
+ // is critical.
+ InvokeInst &II = cast<InvokeInst>(I);
+ assert(II.getNormalDest()->getSinglePredecessor() &&
+ "Cannot demote invoke with a critical successor!");
+ InsertPt = II.getNormalDest()->begin();
+ }
+
+ for (; isa<PHINode>(InsertPt); ++InsertPt)
+ /* empty */; // Don't insert before any PHI nodes.
+ new StoreInst(&I, Slot, InsertPt);
+
+ return Slot;
+}
+
+
+/// DemotePHIToStack - This function takes a virtual register computed by a phi
+/// node and replaces it with a slot in the stack frame, allocated via alloca.
+/// The phi node is deleted and it returns the pointer to the alloca inserted.
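+/// (Sketch: a store of each incoming value is inserted at the end of its
+/// predecessor block, and the PHI itself is replaced by a load of the slot.)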
+AllocaInst* llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
+ if (P->use_empty()) {
+ P->eraseFromParent();
+ return 0;
+ }
+
+ // Create a stack slot to hold the value.
+ AllocaInst *Slot;
+ if (AllocaPoint) {
+ Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem", AllocaPoint);
+ } else {
+ Function *F = P->getParent()->getParent();
+ Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem",
+ F->getEntryBlock().begin());
+ }
+
+ // Iterate over each operand, insert store in each predecessor.
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) {
+ assert(II->getParent() != P->getIncomingBlock(i) &&
+ "Invoke edge not supported yet"); II=II;
+ }
+ new StoreInst(P->getIncomingValue(i), Slot,
+ P->getIncomingBlock(i)->getTerminator());
+ }
+
+ // Insert load in place of the phi and replace all uses.
+ Value *V = new LoadInst(Slot, P->getName()+".reload", P);
+ P->replaceAllUsesWith(V);
+
+ // Delete phi.
+ P->eraseFromParent();
+
+ return Slot;
+}
diff --git a/lib/Transforms/Utils/InlineCost.cpp b/lib/Transforms/Utils/InlineCost.cpp
new file mode 100644
index 0000000..87aff01
--- /dev/null
+++ b/lib/Transforms/Utils/InlineCost.cpp
@@ -0,0 +1,315 @@
+//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inline cost analysis.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Transforms/Utils/InlineCost.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/CallingConv.h"
+#include "llvm/IntrinsicInst.h"
+
+using namespace llvm;
+
+// CountCodeReductionForConstant - Figure out an approximation for how many
+// instructions will be constant folded if the specified value is constant.
+//
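+// For example (illustrative): a value feeding one conditional branch and one
+// three-successor switch yields an estimate of 40 + (3-1)*40 = 120 units.
+//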
+unsigned InlineCostAnalyzer::FunctionInfo::
+ CountCodeReductionForConstant(Value *V) {
+ unsigned Reduction = 0;
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
+ if (isa<BranchInst>(*UI))
+ Reduction += 40; // Eliminating a conditional branch is a big win
+ else if (SwitchInst *SI = dyn_cast<SwitchInst>(*UI))
+ // Eliminating a switch is a big win, proportional to the number of edges
+ // deleted.
+ Reduction += (SI->getNumSuccessors()-1) * 40;
+ else if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+ // Turning an indirect call into a direct call is a BIG win
+ Reduction += CI->getCalledValue() == V ? 500 : 0;
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) {
+ // Turning an indirect call into a direct call is a BIG win
+ Reduction += II->getCalledValue() == V ? 500 : 0;
+ } else {
+ // Figure out if this instruction will be removed due to simple constant
+ // propagation.
+ Instruction &Inst = cast<Instruction>(**UI);
+ bool AllOperandsConstant = true;
+ for (unsigned i = 0, e = Inst.getNumOperands(); i != e; ++i)
+ if (!isa<Constant>(Inst.getOperand(i)) && Inst.getOperand(i) != V) {
+ AllOperandsConstant = false;
+ break;
+ }
+
+ if (AllOperandsConstant) {
+ // We will get to remove this instruction...
+ Reduction += 7;
+
+ // And any other instructions that use it which become constants
+ // themselves.
+ Reduction += CountCodeReductionForConstant(&Inst);
+ }
+ }
+
+ return Reduction;
+}
+
+// CountCodeReductionForAlloca - Figure out an approximation of how much smaller
+// the function will be if it is inlined into a context where an argument
+// becomes an alloca.
+//
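+// For example (illustrative): a pointer argument used by two loads and one
+// store, with no other users, is credited 3 * 10 = 30 units.
+//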
+unsigned InlineCostAnalyzer::FunctionInfo::
+ CountCodeReductionForAlloca(Value *V) {
+ if (!isa<PointerType>(V->getType())) return 0; // Not a pointer
+ unsigned Reduction = 0;
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
+ Instruction *I = cast<Instruction>(*UI);
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ Reduction += 10;
+ else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ // If the GEP has variable indices, we won't be able to do much with it.
+ if (!GEP->hasAllConstantIndices())
+ Reduction += CountCodeReductionForAlloca(GEP)+15;
+ } else {
+ // If there is some other strange instruction, we're not going to be able
+ // to do much if we inline this.
+ return 0;
+ }
+ }
+
+ return Reduction;
+}
+
+/// analyzeFunction - Fill in the current structure with information gleaned
+/// from the specified function.
+void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F) {
+ unsigned NumInsts = 0, NumBlocks = 0, NumVectorInsts = 0;
+
+ // Look at the size of the callee. Each basic block counts as 20 units, and
+ // each instruction counts as 5.
+ for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
+ II != E; ++II) {
+ if (isa<PHINode>(II)) continue; // PHI nodes don't count.
+
+ // Special handling for calls.
+ if (isa<CallInst>(II) || isa<InvokeInst>(II)) {
+ if (isa<DbgInfoIntrinsic>(II))
+ continue; // Debug intrinsics don't count as size.
+
+ CallSite CS = CallSite::get(const_cast<Instruction*>(&*II));
+
+ // If this function contains a call to setjmp or _setjmp, never inline
+ // it. This is a hack because we depend on the user marking their local
+ // variables as volatile if they are live across a setjmp call, and they
+ // probably won't do this in callers.
+ if (Function *F = CS.getCalledFunction())
+ if (F->isDeclaration() &&
+ (F->isName("setjmp") || F->isName("_setjmp"))) {
+ NeverInline = true;
+ return;
+ }
+
+ // Calls often compile into many machine instructions. Bump up their
+ // cost to reflect this.
+ if (!isa<IntrinsicInst>(II))
+ NumInsts += 5;
+ }
+
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (!AI->isStaticAlloca())
+ this->usesDynamicAlloca = true;
+ }
+
+ if (isa<ExtractElementInst>(II) || isa<VectorType>(II->getType()))
+ ++NumVectorInsts;
+
+ // Noop casts, including ptr <-> int, don't count.
+ if (const CastInst *CI = dyn_cast<CastInst>(II)) {
+ if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) ||
+ isa<PtrToIntInst>(CI))
+ continue;
+ } else if (const GetElementPtrInst *GEPI =
+ dyn_cast<GetElementPtrInst>(II)) {
+ // If a GEP has all constant indices, it will probably be folded with
+ // a load/store.
+ if (GEPI->hasAllConstantIndices())
+ continue;
+ }
+
+ ++NumInsts;
+ }
+
+ ++NumBlocks;
+ }
+
+ this->NumBlocks = NumBlocks;
+ this->NumInsts = NumInsts;
+ this->NumVectorInsts = NumVectorInsts;
+
+ // Check out all of the arguments to the function, figuring out how much
+ // code can be eliminated if one of the arguments is a constant.
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
+ ArgumentWeights.push_back(ArgInfo(CountCodeReductionForConstant(I),
+ CountCodeReductionForAlloca(I)));
+}
+
+
+
+// getInlineCost - The heuristic used to determine if we should inline the
+// function call or not.
+//
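+// Illustrative arithmetic (not normative): an internal callee with a single
+// use starts at -15000 (nearly always inlined), while a coldcc callee starts
+// at +2000; each argument then subtracts at least 20 from the cost.
+//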
+InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS,
+ SmallPtrSet<const Function *, 16> &NeverInline) {
+ Instruction *TheCall = CS.getInstruction();
+ Function *Callee = CS.getCalledFunction();
+ Function *Caller = TheCall->getParent()->getParent();
+
+ // Don't inline functions which can be redefined at link-time to mean
+ // something else.
+ if (Callee->mayBeOverridden() ||
+ // Don't inline functions marked noinline.
+ Callee->hasFnAttr(Attribute::NoInline) || NeverInline.count(Callee))
+ return llvm::InlineCost::getNever();
+
+ // InlineCost - This value measures how good of an inline candidate this call
+  // site is to inline. A lower inline cost makes it more likely for the call to
+ // be inlined. This value may go negative.
+ //
+ int InlineCost = 0;
+
+ // If there is only one call of the function, and it has internal linkage,
+ // make it almost guaranteed to be inlined.
+ //
+ if ((Callee->hasLocalLinkage() || Callee->hasAvailableExternallyLinkage()) &&
+ Callee->hasOneUse())
+ InlineCost -= 15000;
+
+ // If this function uses the coldcc calling convention, prefer not to inline
+ // it.
+ if (Callee->getCallingConv() == CallingConv::Cold)
+ InlineCost += 2000;
+
+  // If the instruction after the call is unreachable, or if the normal
+  // destination of the invoke is an unreachable instruction, the function is
+  // effectively noreturn. As such, there is little point in inlining this.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+ if (isa<UnreachableInst>(II->getNormalDest()->begin()))
+ InlineCost += 10000;
+ } else if (isa<UnreachableInst>(++BasicBlock::iterator(TheCall)))
+ InlineCost += 10000;
+
+ // Get information about the callee...
+ FunctionInfo &CalleeFI = CachedFunctionInfo[Callee];
+
+ // If we haven't calculated this information yet, do so now.
+ if (CalleeFI.NumBlocks == 0)
+ CalleeFI.analyzeFunction(Callee);
+
+ // If we should never inline this, return a huge cost.
+ if (CalleeFI.NeverInline)
+ return InlineCost::getNever();
+
+ // FIXME: It would be nice to kill off CalleeFI.NeverInline. Then we
+ // could move this up and avoid computing the FunctionInfo for
+ // things we are going to just return always inline for. This
+ // requires handling setjmp somewhere else, however.
+ if (!Callee->isDeclaration() && Callee->hasFnAttr(Attribute::AlwaysInline))
+ return InlineCost::getAlways();
+
+ if (CalleeFI.usesDynamicAlloca) {
+    // Get information about the caller...
+ FunctionInfo &CallerFI = CachedFunctionInfo[Caller];
+
+ // If we haven't calculated this information yet, do so now.
+ if (CallerFI.NumBlocks == 0)
+ CallerFI.analyzeFunction(Caller);
+
+ // Don't inline a callee with dynamic alloca into a caller without them.
+ // Functions containing dynamic alloca's are inefficient in various ways;
+ // don't create more inefficiency.
+ if (!CallerFI.usesDynamicAlloca)
+ return InlineCost::getNever();
+ }
+
+ // Add to the inline quality for properties that make the call valuable to
+ // inline. This includes factors that indicate that the result of inlining
+ // the function will be optimizable. Currently this just looks at arguments
+ // passed into the function.
+ //
+ unsigned ArgNo = 0;
+ for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+ I != E; ++I, ++ArgNo) {
+ // Each argument passed in has a cost at both the caller and the callee
+ // sides. This favors functions that take many arguments over functions
+ // that take few arguments.
+ InlineCost -= 20;
+
+ // If this is a function being passed in, it is very likely that we will be
+ // able to turn an indirect function call into a direct function call.
+ if (isa<Function>(I))
+ InlineCost -= 100;
+
+ // If an alloca is passed in, inlining this function is likely to allow
+ // significant future optimization possibilities (like scalar promotion, and
+ // scalarization), so encourage the inlining of the function.
+ //
+ else if (isa<AllocaInst>(I)) {
+ if (ArgNo < CalleeFI.ArgumentWeights.size())
+ InlineCost -= CalleeFI.ArgumentWeights[ArgNo].AllocaWeight;
+
+ // If this is a constant being passed into the function, use the argument
+ // weights calculated for the callee to determine how much will be folded
+ // away with this information.
+ } else if (isa<Constant>(I)) {
+ if (ArgNo < CalleeFI.ArgumentWeights.size())
+ InlineCost -= CalleeFI.ArgumentWeights[ArgNo].ConstantWeight;
+ }
+ }
+
+ // Now that we have considered all of the factors that make the call site more
+ // likely to be inlined, look at factors that make us not want to inline it.
+
+ // Don't inline into something too big, which would make it bigger.
+ //
+ InlineCost += Caller->size()/15;
+
+ // Look at the size of the callee. Each instruction counts as 5.
+ InlineCost += CalleeFI.NumInsts*5;
+
+ return llvm::InlineCost::get(InlineCost);
+}
+
+// getInlineFudgeFactor - Return a > 1.0 factor if the inliner should use a
+// higher threshold to determine if the function call should be inlined.
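+//
+// For example (illustrative): a single-block callee in which vector
+// instructions exceed half of all instructions gets 1.0 + 0.5 + 2.0 = 3.5.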
+float InlineCostAnalyzer::getInlineFudgeFactor(CallSite CS) {
+ Function *Callee = CS.getCalledFunction();
+
+ // Get information about the callee...
+ FunctionInfo &CalleeFI = CachedFunctionInfo[Callee];
+
+ // If we haven't calculated this information yet, do so now.
+ if (CalleeFI.NumBlocks == 0)
+ CalleeFI.analyzeFunction(Callee);
+
+ float Factor = 1.0f;
+ // Single BB functions are often written to be inlined.
+ if (CalleeFI.NumBlocks == 1)
+ Factor += 0.5f;
+
+  // Be more aggressive if the function contains a good chunk of vector
+  // instructions (at least 10% of all instructions).
+ if (CalleeFI.NumVectorInsts > CalleeFI.NumInsts/2)
+ Factor += 2.0f;
+ else if (CalleeFI.NumVectorInsts > CalleeFI.NumInsts/10)
+ Factor += 1.5f;
+ return Factor;
+}
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
new file mode 100644
index 0000000..4989c00
--- /dev/null
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -0,0 +1,656 @@
+//===- InlineFunction.cpp - Code to perform function inlining -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inlining of a function into a call site, resolving
+// parameters and the return value as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Attributes.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CallSite.h"
+using namespace llvm;
+
+bool llvm::InlineFunction(CallInst *CI, CallGraph *CG, const TargetData *TD) {
+ return InlineFunction(CallSite(CI), CG, TD);
+}
+bool llvm::InlineFunction(InvokeInst *II, CallGraph *CG, const TargetData *TD) {
+ return InlineFunction(CallSite(II), CG, TD);
+}
+
+/// HandleInlinedInvoke - If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes and turn unwind
+/// instructions into branches to the invoke unwind dest.
+///
+/// II is the invoke instruction being inlined. FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
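+///
+/// Sketch (illustrative): after inlining  invoke @f ... unwind label %lpad,
+/// every cloned call that may throw becomes  invoke ... unwind label %lpad,
+/// and every cloned 'unwind' terminator becomes  br label %lpad.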
+static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
+ ClonedCodeInfo &InlinedCodeInfo,
+ CallGraph *CG) {
+ BasicBlock *InvokeDest = II->getUnwindDest();
+ std::vector<Value*> InvokeDestPHIValues;
+
+ // If there are PHI nodes in the unwind destination block, we need to
+ // keep track of which values came into them from this invoke, then remove
+ // the entry for this block.
+ BasicBlock *InvokeBlock = II->getParent();
+ for (BasicBlock::iterator I = InvokeDest->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ // Save the value to use for this edge.
+ InvokeDestPHIValues.push_back(PN->getIncomingValueForBlock(InvokeBlock));
+ }
+
+ Function *Caller = FirstNewBlock->getParent();
+
+ // The inlined code is currently at the end of the function, scan from the
+ // start of the inlined code to its end, checking for stuff we need to
+ // rewrite.
+ if (InlinedCodeInfo.ContainsCalls || InlinedCodeInfo.ContainsUnwinds) {
+ for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+ BB != E; ++BB) {
+ if (InlinedCodeInfo.ContainsCalls) {
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ){
+ Instruction *I = BBI++;
+
+ // We only need to check for function calls: inlined invoke
+ // instructions require no special handling.
+ if (!isa<CallInst>(I)) continue;
+ CallInst *CI = cast<CallInst>(I);
+
+ // If this call cannot unwind, don't convert it to an invoke.
+ if (CI->doesNotThrow())
+ continue;
+
+ // Convert this function call into an invoke instruction.
+ // First, split the basic block.
+ BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc");
+
+ // Next, create the new invoke instruction, inserting it at the end
+ // of the old basic block.
+ SmallVector<Value*, 8> InvokeArgs(CI->op_begin()+1, CI->op_end());
+ InvokeInst *II =
+ InvokeInst::Create(CI->getCalledValue(), Split, InvokeDest,
+ InvokeArgs.begin(), InvokeArgs.end(),
+ CI->getName(), BB->getTerminator());
+ II->setCallingConv(CI->getCallingConv());
+ II->setAttributes(CI->getAttributes());
+
+ // Make sure that anything using the call now uses the invoke!
+ CI->replaceAllUsesWith(II);
+
+ // Update the callgraph.
+ if (CG) {
+ // We should be able to do this:
+ // (*CG)[Caller]->replaceCallSite(CI, II);
+ // but that fails if the old call site isn't in the call graph,
+ // which, because of LLVM bug 3601, it sometimes isn't.
+ CallGraphNode *CGN = (*CG)[Caller];
+ for (CallGraphNode::iterator NI = CGN->begin(), NE = CGN->end();
+ NI != NE; ++NI) {
+ if (NI->first == CI) {
+ NI->first = II;
+ break;
+ }
+ }
+ }
+
+ // Delete the unconditional branch inserted by splitBasicBlock
+ BB->getInstList().pop_back();
+ Split->getInstList().pop_front(); // Delete the original call
+
+ // Update any PHI nodes in the exceptional block to indicate that
+ // there is now a new entry in them.
+ unsigned i = 0;
+ for (BasicBlock::iterator I = InvokeDest->begin();
+ isa<PHINode>(I); ++I, ++i) {
+ PHINode *PN = cast<PHINode>(I);
+ PN->addIncoming(InvokeDestPHIValues[i], BB);
+ }
+
+ // This basic block is now complete, start scanning the next one.
+ break;
+ }
+ }
+
+ if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+ // An UnwindInst requires special handling when it gets inlined into an
+ // invoke site. Once this happens, we know that the unwind would cause
+ // a control transfer to the invoke exception destination, so we can
+ // transform it into a direct branch to the exception destination.
+ BranchInst::Create(InvokeDest, UI);
+
+ // Delete the unwind instruction!
+ UI->eraseFromParent();
+
+ // Update any PHI nodes in the exceptional block to indicate that
+ // there is now a new entry in them.
+ unsigned i = 0;
+ for (BasicBlock::iterator I = InvokeDest->begin();
+ isa<PHINode>(I); ++I, ++i) {
+ PHINode *PN = cast<PHINode>(I);
+ PN->addIncoming(InvokeDestPHIValues[i], BB);
+ }
+ }
+ }
+ }
+
+ // Now that everything is happy, we have one final detail. The PHI nodes in
+ // the exception destination block still have entries due to the original
+ // invoke instruction. Eliminate these entries (which might even delete the
+ // PHI node) now.
+ InvokeDest->removePredecessor(II->getParent());
+}
+
+/// UpdateCallGraphAfterInlining - Once we have cloned code over from a callee
+/// into the caller, update the specified callgraph to reflect the changes we
+/// made. Note that it's possible that not all code was copied over, so only
+/// some edges of the callgraph may remain.
+static void UpdateCallGraphAfterInlining(CallSite CS,
+ Function::iterator FirstNewBlock,
+ DenseMap<const Value*, Value*> &ValueMap,
+ CallGraph &CG) {
+ const Function *Caller = CS.getInstruction()->getParent()->getParent();
+ const Function *Callee = CS.getCalledFunction();
+ CallGraphNode *CalleeNode = CG[Callee];
+ CallGraphNode *CallerNode = CG[Caller];
+
+ // Since we inlined some uninlined call sites in the callee into the caller,
+ // add edges from the caller to all of the callees of the callee.
+ CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end();
+
+ // Consider the case where CalleeNode == CallerNode.
+ CallGraphNode::CalledFunctionsVector CallCache;
+ if (CalleeNode == CallerNode) {
+ CallCache.assign(I, E);
+ I = CallCache.begin();
+ E = CallCache.end();
+ }
+
+ for (; I != E; ++I) {
+ const Instruction *OrigCall = I->first.getInstruction();
+
+ DenseMap<const Value*, Value*>::iterator VMI = ValueMap.find(OrigCall);
+ // Only copy the edge if the call was inlined!
+ if (VMI != ValueMap.end() && VMI->second) {
+ // If the call was inlined, but then constant folded, there is no edge to
+ // add. Check for this case.
+ if (Instruction *NewCall = dyn_cast<Instruction>(VMI->second))
+ CallerNode->addCalledFunction(CallSite::get(NewCall), I->second);
+ }
+ }
+ // Update the call graph by deleting the edge from Callee to Caller. We must
+ // do this after the loop above in case Caller and Callee are the same.
+ CallerNode->removeCallEdgeFor(CS);
+}
+
+/// findFnRegionEndMarker - This is a utility routine that is used by
+/// InlineFunction. Return llvm.dbg.region.end intrinsic that corresponds
+/// to the llvm.dbg.func.start of the function F. Otherwise return NULL.
+static const DbgRegionEndInst *findFnRegionEndMarker(const Function *F) {
+
+ GlobalVariable *FnStart = NULL;
+ const DbgRegionEndInst *FnEnd = NULL;
+ for (Function::const_iterator FI = F->begin(), FE =F->end(); FI != FE; ++FI)
+ for (BasicBlock::const_iterator BI = FI->begin(), BE = FI->end(); BI != BE;
+ ++BI) {
+ if (FnStart == NULL) {
+ if (const DbgFuncStartInst *FSI = dyn_cast<DbgFuncStartInst>(BI)) {
+ DISubprogram SP(cast<GlobalVariable>(FSI->getSubprogram()));
+ assert (SP.isNull() == false && "Invalid llvm.dbg.func.start");
+ if (SP.describes(F))
+ FnStart = SP.getGV();
+ }
+ } else {
+ if (const DbgRegionEndInst *REI = dyn_cast<DbgRegionEndInst>(BI))
+ if (REI->getContext() == FnStart)
+ FnEnd = REI;
+ }
+ }
+ return FnEnd;
+}
+
+// InlineFunction - This function inlines the called function into the basic
+// block of the caller. This returns false if it is not possible to inline this
+// call. The program is still in a well defined state if this occurs though.
+//
+// Note that this only does one level of inlining. For example, if the
+// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+// exists in the instruction stream. Similarly, this will inline a recursive
+// function by one level.
+//
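+// Typical use (illustrative):
+//   if (InlineFunction(CS, CG, TD))
+//     ...;  // the callee's body has been cloned into CS's caller
+//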
+bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD) {
+ Instruction *TheCall = CS.getInstruction();
+ assert(TheCall->getParent() && TheCall->getParent()->getParent() &&
+ "Instruction not in function!");
+
+ const Function *CalledFunc = CS.getCalledFunction();
+ if (CalledFunc == 0 || // Can't inline external function or indirect
+ CalledFunc->isDeclaration() || // call, or call to a vararg function!
+ CalledFunc->getFunctionType()->isVarArg()) return false;
+
+
+ // If the call to the callee is not a tail call, we must clear the 'tail'
+ // flags on any calls that we inline.
+ bool MustClearTailCallFlags =
+ !(isa<CallInst>(TheCall) && cast<CallInst>(TheCall)->isTailCall());
+
+ // If the call to the callee cannot throw, set the 'nounwind' flag on any
+ // calls that we inline.
+ bool MarkNoUnwind = CS.doesNotThrow();
+
+ BasicBlock *OrigBB = TheCall->getParent();
+ Function *Caller = OrigBB->getParent();
+
+ // GC poses two hazards to inlining, which only occur when the callee has GC:
+ // 1. If the caller has no GC, then the callee's GC must be propagated to the
+ // caller.
+ // 2. If the caller has a differing GC, it is invalid to inline.
+ if (CalledFunc->hasGC()) {
+ if (!Caller->hasGC())
+ Caller->setGC(CalledFunc->getGC());
+ else if (CalledFunc->getGC() != Caller->getGC())
+ return false;
+ }
+
+ // Get an iterator to the last basic block in the function, which will have
+ // the new function inlined after it.
+ //
+ Function::iterator LastBlock = &Caller->back();
+
+ // Make sure to capture all of the return instructions from the cloned
+ // function.
+ std::vector<ReturnInst*> Returns;
+ ClonedCodeInfo InlinedFunctionInfo;
+ Function::iterator FirstNewBlock;
+
+ { // Scope to destroy ValueMap after cloning.
+ DenseMap<const Value*, Value*> ValueMap;
+
+ assert(CalledFunc->arg_size() == CS.arg_size() &&
+ "No varargs calls can be inlined!");
+
+ // Calculate the vector of arguments to pass into the function cloner, which
+ // matches up the formal to the actual argument values.
+ CallSite::arg_iterator AI = CS.arg_begin();
+ unsigned ArgNo = 0;
+ for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+ E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
+ Value *ActualArg = *AI;
+
+      // When byval arguments are actually inlined, we need to make the copy implied
+ // by them explicit. However, we don't do this if the callee is readonly
+ // or readnone, because the copy would be unneeded: the callee doesn't
+ // modify the struct.
+ if (CalledFunc->paramHasAttr(ArgNo+1, Attribute::ByVal) &&
+ !CalledFunc->onlyReadsMemory()) {
+ const Type *AggTy = cast<PointerType>(I->getType())->getElementType();
+ const Type *VoidPtrTy = PointerType::getUnqual(Type::Int8Ty);
+
+ // Create the alloca. If we have TargetData, use nice alignment.
+ unsigned Align = 1;
+ if (TD) Align = TD->getPrefTypeAlignment(AggTy);
+ Value *NewAlloca = new AllocaInst(AggTy, 0, Align, I->getName(),
+ Caller->begin()->begin());
+ // Emit a memcpy.
+ const Type *Tys[] = { Type::Int64Ty };
+ Function *MemCpyFn = Intrinsic::getDeclaration(Caller->getParent(),
+ Intrinsic::memcpy,
+ Tys, 1);
+ Value *DestCast = new BitCastInst(NewAlloca, VoidPtrTy, "tmp", TheCall);
+ Value *SrcCast = new BitCastInst(*AI, VoidPtrTy, "tmp", TheCall);
+
+ Value *Size;
+ if (TD == 0)
+ Size = ConstantExpr::getSizeOf(AggTy);
+ else
+ Size = ConstantInt::get(Type::Int64Ty, TD->getTypeStoreSize(AggTy));
+
+ // Always generate a memcpy of alignment 1 here because we don't know
+ // the alignment of the src pointer. Other optimizations can infer
+ // better alignment.
+ Value *CallArgs[] = {
+ DestCast, SrcCast, Size, ConstantInt::get(Type::Int32Ty, 1)
+ };
+ CallInst *TheMemCpy =
+ CallInst::Create(MemCpyFn, CallArgs, CallArgs+4, "", TheCall);
+
+ // If we have a call graph, update it.
+ if (CG) {
+ CallGraphNode *MemCpyCGN = CG->getOrInsertFunction(MemCpyFn);
+ CallGraphNode *CallerNode = (*CG)[Caller];
+ CallerNode->addCalledFunction(TheMemCpy, MemCpyCGN);
+ }
+
+ // Uses of the argument in the function should use our new alloca
+ // instead.
+ ActualArg = NewAlloca;
+ }
+
+ ValueMap[I] = ActualArg;
+ }
+
+    // Adjust llvm.dbg.region.end. If CalledFunc has a region end marker, clone
+    // that marker after the next stop point at the call site. The function body
+    // cloner does not clone the original region end marker from CalledFunc.
+    // This ensures that the inlined function's scope ends at the right place.
+ const DbgRegionEndInst *DREI = findFnRegionEndMarker(CalledFunc);
+ if (DREI) {
+ for (BasicBlock::iterator BI = TheCall,
+ BE = TheCall->getParent()->end(); BI != BE; ++BI) {
+ if (DbgStopPointInst *DSPI = dyn_cast<DbgStopPointInst>(BI)) {
+ if (DbgRegionEndInst *NewDREI =
+ dyn_cast<DbgRegionEndInst>(DREI->clone()))
+ NewDREI->insertAfter(DSPI);
+ break;
+ }
+ }
+ }
+
+ // We want the inliner to prune the code as it copies. We would LOVE to
+ // have no dead or constant instructions leftover after inlining occurs
+ // (which can happen, e.g., because an argument was constant), but we'll be
+ // happy with whatever the cloner can do.
+ CloneAndPruneFunctionInto(Caller, CalledFunc, ValueMap, Returns, ".i",
+ &InlinedFunctionInfo, TD);
+
+ // Remember the first block that is newly cloned over.
+ FirstNewBlock = LastBlock; ++FirstNewBlock;
+
+ // Update the callgraph if requested.
+ if (CG)
+ UpdateCallGraphAfterInlining(CS, FirstNewBlock, ValueMap, *CG);
+ }
+
+ // If there are any alloca instructions in the block that used to be the entry
+ // block for the callee, move them to the entry block of the caller. First
+ // calculate which instruction they should be inserted before. We insert the
+ // instructions at the end of the current alloca list.
+ //
+ {
+ BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+ for (BasicBlock::iterator I = FirstNewBlock->begin(),
+ E = FirstNewBlock->end(); I != E; )
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I++)) {
+ // If the alloca is now dead, remove it. This often occurs due to code
+ // specialization.
+ if (AI->use_empty()) {
+ AI->eraseFromParent();
+ continue;
+ }
+
+ if (isa<Constant>(AI->getArraySize())) {
+ // Scan for the block of allocas that we can move over, and move them
+ // all at once.
+ while (isa<AllocaInst>(I) &&
+ isa<Constant>(cast<AllocaInst>(I)->getArraySize()))
+ ++I;
+
+ // Transfer all of the allocas over in a block. Using splice means
+ // that the instructions aren't removed from the symbol table, then
+ // reinserted.
+ Caller->getEntryBlock().getInstList().splice(
+ InsertPoint,
+ FirstNewBlock->getInstList(),
+ AI, I);
+ }
+ }
+ }
+
+ // If the inlined code contained dynamic alloca instructions, wrap the inlined
+ // code with llvm.stacksave/llvm.stackrestore intrinsics.
+ if (InlinedFunctionInfo.ContainsDynamicAllocas) {
+ Module *M = Caller->getParent();
+ // Get the two intrinsics we care about.
+ Constant *StackSave, *StackRestore;
+ StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+ StackRestore = Intrinsic::getDeclaration(M, Intrinsic::stackrestore);
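+
+    // The transformed region ends up shaped like this (an illustrative
+    // sketch, not emitted verbatim):
+    //   %savedstack = call i8* @llvm.stacksave()
+    //   ...inlined body containing the dynamic allocas...
+    //   call void @llvm.stackrestore(i8* %savedstack)  ; before each return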
+
+ // If we are preserving the callgraph, add edges to the stacksave/restore
+ // functions for the calls we insert.
+ CallGraphNode *StackSaveCGN = 0, *StackRestoreCGN = 0, *CallerNode = 0;
+ if (CG) {
+ // We know that StackSave/StackRestore are Function*'s, because they are
+ // intrinsics which must have the right types.
+ StackSaveCGN = CG->getOrInsertFunction(cast<Function>(StackSave));
+ StackRestoreCGN = CG->getOrInsertFunction(cast<Function>(StackRestore));
+ CallerNode = (*CG)[Caller];
+ }
+
+ // Insert the llvm.stacksave.
+ CallInst *SavedPtr = CallInst::Create(StackSave, "savedstack",
+ FirstNewBlock->begin());
+ if (CG) CallerNode->addCalledFunction(SavedPtr, StackSaveCGN);
+
+ // Insert a call to llvm.stackrestore before any return instructions in the
+ // inlined function.
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", Returns[i]);
+ if (CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+ }
+
+ // Count the number of StackRestore calls we insert.
+ unsigned NumStackRestores = Returns.size();
+
+ // If we are inlining an invoke instruction, insert restores before each
+ // unwind. These unwinds will be rewritten into branches later.
+ if (InlinedFunctionInfo.ContainsUnwinds && isa<InvokeInst>(TheCall)) {
+ for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+ BB != E; ++BB)
+ if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+ CallInst::Create(StackRestore, SavedPtr, "", UI);
+ ++NumStackRestores;
+ }
+ }
+ }
+
+  // If we are inlining a tail call instruction through a call site that isn't
+  // marked 'tail', we must remove the tail marker from any calls in the
+  // inlined code. Also, calls inlined through a 'nounwind' call site should
+  // be marked 'nounwind'.
+ if (InlinedFunctionInfo.ContainsCalls &&
+ (MustClearTailCallFlags || MarkNoUnwind)) {
+ for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+ BB != E; ++BB)
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (MustClearTailCallFlags)
+ CI->setTailCall(false);
+ if (MarkNoUnwind)
+ CI->setDoesNotThrow();
+ }
+ }
+
+ // If we are inlining through a 'nounwind' call site then any inlined 'unwind'
+ // instructions are unreachable.
+ if (InlinedFunctionInfo.ContainsUnwinds && MarkNoUnwind)
+ for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+ BB != E; ++BB) {
+ TerminatorInst *Term = BB->getTerminator();
+ if (isa<UnwindInst>(Term)) {
+ new UnreachableInst(Term);
+ BB->getInstList().erase(Term);
+ }
+ }
+
+ // If we are inlining for an invoke instruction, we must make sure to rewrite
+ // any inlined 'unwind' instructions into branches to the invoke exception
+ // destination, and call instructions into invoke instructions.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
+ HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo, CG);
+
+ // If we cloned in _exactly one_ basic block, and if that block ends in a
+ // return instruction, we splice the body of the inlined callee directly into
+ // the calling basic block.
+ if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
+ // Move all of the instructions right before the call.
+ OrigBB->getInstList().splice(TheCall, FirstNewBlock->getInstList(),
+ FirstNewBlock->begin(), FirstNewBlock->end());
+ // Remove the cloned basic block.
+ Caller->getBasicBlockList().pop_back();
+
+ // If the call site was an invoke instruction, add a branch to the normal
+ // destination.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
+ BranchInst::Create(II->getNormalDest(), TheCall);
+
+ // If the return instruction returned a value, replace uses of the call with
+ // uses of the returned value.
+ if (!TheCall->use_empty()) {
+ ReturnInst *R = Returns[0];
+ if (TheCall == R->getReturnValue())
+ TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ else
+ TheCall->replaceAllUsesWith(R->getReturnValue());
+ }
+ // Since we are now done with the Call/Invoke, we can delete it.
+ TheCall->eraseFromParent();
+
+ // Since we are now done with the return instruction, delete it also.
+ Returns[0]->eraseFromParent();
+
+ // We are now done with the inlining.
+ return true;
+ }
+
+  // Otherwise, we have the normal case of more than one block to inline or
+  // multiple return sites.
+
+ // We want to clone the entire callee function into the hole between the
+ // "starter" and "ender" blocks. How we accomplish this depends on whether
+ // this is an invoke instruction or a call instruction.
+ BasicBlock *AfterCallBB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+
+ // Add an unconditional branch to make this look like the CallInst case...
+ BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall);
+
+    // Split the basic block. This guarantees that no PHI nodes will have to
+    // be updated due to new incoming edges, and makes the invoke case more
+    // symmetric to the call case.
+ AfterCallBB = OrigBB->splitBasicBlock(NewBr,
+ CalledFunc->getName()+".exit");
+
+ } else { // It's a call
+ // If this is a call instruction, we need to split the basic block that
+ // the call lives in.
+ //
+ AfterCallBB = OrigBB->splitBasicBlock(TheCall,
+ CalledFunc->getName()+".exit");
+ }
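+
+  // At this point the CFG is shaped as sketched below (names illustrative);
+  // the code that follows retargets the branch and splices the blocks in:
+  //   OrigBB:            ...  br label %AfterCallBB
+  //   AfterCallBB:       the code that followed the call
+  //   <inlined blocks>   currently sitting at the end of the Caller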
+
+ // Change the branch that used to go to AfterCallBB to branch to the first
+ // basic block of the inlined function.
+ //
+ TerminatorInst *Br = OrigBB->getTerminator();
+ assert(Br && Br->getOpcode() == Instruction::Br &&
+ "splitBasicBlock broken!");
+ Br->setOperand(0, FirstNewBlock);
+
+
+ // Now that the function is correct, make it a little bit nicer. In
+ // particular, move the basic blocks inserted from the end of the function
+ // into the space made by splitting the source basic block.
+ Caller->getBasicBlockList().splice(AfterCallBB, Caller->getBasicBlockList(),
+ FirstNewBlock, Caller->end());
+
+ // Handle all of the return instructions that we just cloned in, and eliminate
+ // any users of the original call/invoke instruction.
+ const Type *RTy = CalledFunc->getReturnType();
+
+ if (Returns.size() > 1) {
+ // The PHI node should go at the front of the new basic block to merge all
+ // possible incoming values.
+ PHINode *PHI = 0;
+ if (!TheCall->use_empty()) {
+ PHI = PHINode::Create(RTy, TheCall->getName(),
+ AfterCallBB->begin());
+ // Anything that used the result of the function call should now use the
+ // PHI node as their operand.
+ TheCall->replaceAllUsesWith(PHI);
+ }
+
+ // Loop over all of the return instructions adding entries to the PHI node
+ // as appropriate.
+ if (PHI) {
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *RI = Returns[i];
+ assert(RI->getReturnValue()->getType() == PHI->getType() &&
+ "Ret value not consistent in function!");
+ PHI->addIncoming(RI->getReturnValue(), RI->getParent());
+ }
+ }
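+
+    // With two return sites returning %rv1 and %rv2 (illustrative names),
+    // the merge block now begins with a sketch like:
+    //   %res = phi i32 [ %rv1, %ret.bb1 ], [ %rv2, %ret.bb2 ]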
+
+ // Add a branch to the merge points and remove return instructions.
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *RI = Returns[i];
+ BranchInst::Create(AfterCallBB, RI);
+ RI->eraseFromParent();
+ }
+ } else if (!Returns.empty()) {
+ // Otherwise, if there is exactly one return value, just replace anything
+ // using the return value of the call with the computed value.
+ if (!TheCall->use_empty()) {
+ if (TheCall == Returns[0]->getReturnValue())
+ TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ else
+ TheCall->replaceAllUsesWith(Returns[0]->getReturnValue());
+ }
+
+ // Splice the code from the return block into the block that it will return
+ // to, which contains the code that was after the call.
+ BasicBlock *ReturnBB = Returns[0]->getParent();
+ AfterCallBB->getInstList().splice(AfterCallBB->begin(),
+ ReturnBB->getInstList());
+
+ // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
+ ReturnBB->replaceAllUsesWith(AfterCallBB);
+
+    // Delete the return instruction, then erase the now-empty ReturnBB.
+ Returns[0]->eraseFromParent();
+ ReturnBB->eraseFromParent();
+ } else if (!TheCall->use_empty()) {
+ // No returns, but something is using the return value of the call. Just
+ // nuke the result.
+ TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ }
+
+ // Since we are now done with the Call/Invoke, we can delete it.
+ TheCall->eraseFromParent();
+
+ // We should always be able to fold the entry block of the function into the
+ // single predecessor of the block...
+ assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
+ BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
+
+ // Splice the code entry block into calling block, right before the
+ // unconditional branch.
+ OrigBB->getInstList().splice(Br, CalleeEntry->getInstList());
+ CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes
+
+ // Remove the unconditional branch.
+ OrigBB->getInstList().erase(Br);
+
+ // Now we can remove the CalleeEntry block, which is now empty.
+ Caller->getBasicBlockList().erase(CalleeEntry);
+
+ return true;
+}
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
new file mode 100644
index 0000000..4f8a160
--- /dev/null
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -0,0 +1,63 @@
+//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that gives instructions names; this is mostly
+// useful when diffing the effect of an optimization, because deleting an
+// unnamed instruction can change all other instruction numbering, making the
+// diff very noisy.
+//
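+// Typical invocation (a sketch): opt -instnamer input.bc -o output.bc
+//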
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+using namespace llvm;
+
+namespace {
+ struct InstNamer : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ InstNamer() : FunctionPass(&ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ bool runOnFunction(Function &F) {
+ for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
+ AI != AE; ++AI)
+ if (!AI->hasName() && AI->getType() != Type::VoidTy)
+ AI->setName("tmp");
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (!BB->hasName())
+ BB->setName("BB");
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (!I->hasName() && I->getType() != Type::VoidTy)
+ I->setName("tmp");
+ }
+ return true;
+ }
+ };
+
+ char InstNamer::ID = 0;
+ static RegisterPass<InstNamer> X("instnamer",
+ "Assign names to anonymous instructions");
+}
+
+
+const PassInfo *const llvm::InstructionNamerID = &X;
+//===----------------------------------------------------------------------===//
+//
+// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
+//
+FunctionPass *llvm::createInstructionNamerPass() {
+ return new InstNamer();
+}
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
new file mode 100644
index 0000000..7d4f3a3
--- /dev/null
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -0,0 +1,276 @@
+//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops by placing phi nodes at the end of the loops for
+// all values that are live across the loop boundary. For example, it turns
+// the left into the right code:
+//
+//   for (...)                for (...)
+//     if (c)                   if (c)
+//       X1 = ...                 X1 = ...
+//     else                     else
+//       X2 = ...                 X2 = ...
+//     X3 = phi(X1, X2)         X3 = phi(X1, X2)
+//     ... = X3 + 4             X4 = phi(X3)
+//                              ... = X4 + 4
+//
+// This is still valid LLVM; the extra phi nodes are purely redundant, and will
+// be trivially eliminated by InstCombine. The major benefit of this
+// transformation is that it makes many other loop optimizations, such as
+// LoopUnswitching, simpler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lcssa"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/PredIteratorCache.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumLCSSA, "Number of variables live out of a loop");
+
+namespace {
+ struct VISIBILITY_HIDDEN LCSSA : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ LCSSA() : LoopPass(&ID) {}
+
+ // Cached analysis information for the current function.
+ LoopInfo *LI;
+ DominatorTree *DT;
+ std::vector<BasicBlock*> LoopBlocks;
+ PredIteratorCache PredCache;
+
+ virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ void ProcessInstruction(Instruction* Instr,
+ const SmallVector<BasicBlock*, 8>& exitBlocks);
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG. It maintains both of these,
+ /// as well as the CFG. It also requires dominator information.
+ ///
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequired<LoopInfo>();
+ AU.addPreserved<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.addPreserved<DominatorTree>();
+
+ // Request DominanceFrontier now, even though LCSSA does
+ // not use it. This allows Pass Manager to schedule Dominance
+ // Frontier early enough such that one LPPassManager can handle
+ // multiple loop transformation passes.
+ AU.addRequired<DominanceFrontier>();
+ AU.addPreserved<DominanceFrontier>();
+ }
+ private:
+ void getLoopValuesUsedOutsideLoop(Loop *L,
+ SetVector<Instruction*> &AffectedValues,
+ const SmallVector<BasicBlock*, 8>& exitBlocks);
+
+ Value *GetValueForBlock(DomTreeNode *BB, Instruction *OrigInst,
+ DenseMap<DomTreeNode*, Value*> &Phis);
+
+ /// inLoop - returns true if the given block is within the current loop
+ bool inLoop(BasicBlock* B) {
+ return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B);
+ }
+ };
+}
+
+char LCSSA::ID = 0;
+static RegisterPass<LCSSA> X("lcssa", "Loop-Closed SSA Form Pass");
+
+Pass *llvm::createLCSSAPass() { return new LCSSA(); }
+const PassInfo *const llvm::LCSSAID = &X;
+
+/// runOnLoop - Process the given loop, inserting LCSSA PHI nodes as needed.
+bool LCSSA::runOnLoop(Loop *L, LPPassManager &LPM) {
+ PredCache.clear();
+
+ LI = &LPM.getAnalysis<LoopInfo>();
+ DT = &getAnalysis<DominatorTree>();
+
+ // Speed up queries by creating a sorted list of blocks
+ LoopBlocks.clear();
+ LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+ std::sort(LoopBlocks.begin(), LoopBlocks.end());
+
+ SmallVector<BasicBlock*, 8> exitBlocks;
+ L->getExitBlocks(exitBlocks);
+
+ SetVector<Instruction*> AffectedValues;
+ getLoopValuesUsedOutsideLoop(L, AffectedValues, exitBlocks);
+
+ // If no values are affected, we can save a lot of work, since we know that
+ // nothing will be changed.
+ if (AffectedValues.empty())
+ return false;
+
+ // Iterate over all affected values for this loop and insert Phi nodes
+ // for them in the appropriate exit blocks
+
+ for (SetVector<Instruction*>::iterator I = AffectedValues.begin(),
+ E = AffectedValues.end(); I != E; ++I)
+ ProcessInstruction(*I, exitBlocks);
+
+ assert(L->isLCSSAForm());
+
+ return true;
+}
+
+/// ProcessInstruction - Given a live-out instruction, insert LCSSA PHI nodes
+/// and rewrite all of its out-of-loop uses.
+void LCSSA::ProcessInstruction(Instruction *Instr,
+ const SmallVector<BasicBlock*, 8>& exitBlocks) {
+ ++NumLCSSA; // We are applying the transformation
+
+ // Keep track of the blocks that have the value available already.
+ DenseMap<DomTreeNode*, Value*> Phis;
+
+ DomTreeNode *InstrNode = DT->getNode(Instr->getParent());
+
+  // Insert the LCSSA phis into the exit blocks (dominated by the value), and
+  // add them to the Phis map.
+ for (SmallVector<BasicBlock*, 8>::const_iterator BBI = exitBlocks.begin(),
+ BBE = exitBlocks.end(); BBI != BBE; ++BBI) {
+ BasicBlock *BB = *BBI;
+ DomTreeNode *ExitBBNode = DT->getNode(BB);
+ Value *&Phi = Phis[ExitBBNode];
+ if (!Phi && DT->dominates(InstrNode, ExitBBNode)) {
+ PHINode *PN = PHINode::Create(Instr->getType(), Instr->getName()+".lcssa",
+ BB->begin());
+ PN->reserveOperandSpace(PredCache.GetNumPreds(BB));
+
+ // Remember that this phi makes the value alive in this block.
+ Phi = PN;
+
+ // Add inputs from inside the loop for this PHI.
+ for (BasicBlock** PI = PredCache.GetPreds(BB); *PI; ++PI)
+ PN->addIncoming(Instr, *PI);
+ }
+ }
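+
+  // For a value %x (illustrative name) live out through a dominated exit
+  // block, the exit now begins with a sketch like:
+  //   exit:
+  //     %x.lcssa = phi i32 [ %x, %pred1 ], [ %x, %pred2 ]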
+
+
+ // Record all uses of Instr outside the loop. We need to rewrite these. The
+ // LCSSA phis won't be included because they use the value in the loop.
+ for (Value::use_iterator UI = Instr->use_begin(), E = Instr->use_end();
+ UI != E;) {
+ BasicBlock *UserBB = cast<Instruction>(*UI)->getParent();
+ if (PHINode *P = dyn_cast<PHINode>(*UI)) {
+ UserBB = P->getIncomingBlock(UI);
+ }
+
+ // If the user is in the loop, don't rewrite it!
+ if (UserBB == Instr->getParent() || inLoop(UserBB)) {
+ ++UI;
+ continue;
+ }
+
+ // Otherwise, patch up uses of the value with the appropriate LCSSA Phi,
+ // inserting PHI nodes into join points where needed.
+ Value *Val = GetValueForBlock(DT->getNode(UserBB), Instr, Phis);
+
+ // Preincrement the iterator to avoid invalidating it when we change the
+ // value.
+ Use &U = UI.getUse();
+ ++UI;
+ U.set(Val);
+ }
+}
+
+/// getLoopValuesUsedOutsideLoop - Return any values defined in the loop that
+/// are used by instructions outside of it.
+void LCSSA::getLoopValuesUsedOutsideLoop(Loop *L,
+ SetVector<Instruction*> &AffectedValues,
+ const SmallVector<BasicBlock*, 8>& exitBlocks) {
+ // FIXME: For large loops, we may be able to avoid a lot of use-scanning
+ // by using dominance information. In particular, if a block does not
+ // dominate any of the loop exits, then none of the values defined in the
+ // block could be used outside the loop.
+ for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
+ BB != BE; ++BB) {
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ++I)
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+ ++UI) {
+ BasicBlock *UserBB = cast<Instruction>(*UI)->getParent();
+ if (PHINode* p = dyn_cast<PHINode>(*UI)) {
+ UserBB = p->getIncomingBlock(UI);
+ }
+
+ if (*BB != UserBB && !inLoop(UserBB)) {
+ AffectedValues.insert(I);
+ break;
+ }
+ }
+ }
+}
+
+/// GetValueForBlock - Get the value to use within the specified basic block.
+/// Available values are cached in Phis.
+Value *LCSSA::GetValueForBlock(DomTreeNode *BB, Instruction *OrigInst,
+ DenseMap<DomTreeNode*, Value*> &Phis) {
+ // If there is no dominator info for this BB, it is unreachable.
+ if (BB == 0)
+ return UndefValue::get(OrigInst->getType());
+
+ // If we have already computed this value, return the previously computed val.
+ if (Phis.count(BB)) return Phis[BB];
+
+ DomTreeNode *IDom = BB->getIDom();
+
+ // Otherwise, there are two cases: we either have to insert a PHI node or we
+ // don't. We need to insert a PHI node if this block is not dominated by one
+ // of the exit nodes from the loop (the loop could have multiple exits, and
+ // though the value defined *inside* the loop dominated all its uses, each
+ // exit by itself may not dominate all the uses).
+ //
+ // The simplest way to check for this condition is by checking to see if the
+ // idom is in the loop. If so, we *know* that none of the exit blocks
+ // dominate this block. Note that we *know* that the block defining the
+ // original instruction is in the idom chain, because if it weren't, then the
+ // original value didn't dominate this use.
+ if (!inLoop(IDom->getBlock())) {
+    // The idom is not in the loop, so we must still be "below" the exit block
+    // and must be fully dominated by the value live in the idom.
+ Value* val = GetValueForBlock(IDom, OrigInst, Phis);
+ Phis.insert(std::make_pair(BB, val));
+ return val;
+ }
+
+ BasicBlock *BBN = BB->getBlock();
+
+ // Otherwise, the idom is the loop, so we need to insert a PHI node. Do so
+ // now, then get values to fill in the incoming values for the PHI.
+ PHINode *PN = PHINode::Create(OrigInst->getType(),
+ OrigInst->getName() + ".lcssa", BBN->begin());
+ PN->reserveOperandSpace(PredCache.GetNumPreds(BBN));
+ Phis.insert(std::make_pair(BB, PN));
+
+ // Fill in the incoming values for the block.
+ for (BasicBlock** PI = PredCache.GetPreds(BBN); *PI; ++PI)
+ PN->addIncoming(GetValueForBlock(DT->getNode(*PI), OrigInst, Phis), *PI);
+ return PN;
+}
+
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
new file mode 100644
index 0000000..94483b8
--- /dev/null
+++ b/lib/Transforms/Utils/Local.cpp
@@ -0,0 +1,338 @@
+//===-- Local.cpp - Functions to perform local transformations ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform various local transformations to the
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Local constant propagation.
+//
+
+// ConstantFoldTerminator - If a terminator instruction is predicated on a
+// constant value, convert it into an unconditional branch to the constant
+// destination.
+//
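+// For example (an illustrative sketch of two of the cases handled below):
+//   br bool true, label %T, label %F            -->  br label %T
+//   switch i32 1, label %D [ i32 1, label %C ]  -->  br label %C
+//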
+bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
+ TerminatorInst *T = BB->getTerminator();
+
+ // Branch - See if we are conditional jumping on constant
+ if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+ if (BI->isUnconditional()) return false; // Can't optimize uncond branch
+ BasicBlock *Dest1 = BI->getSuccessor(0);
+ BasicBlock *Dest2 = BI->getSuccessor(1);
+
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
+ // Are we branching on constant?
+ // YES. Change to unconditional branch...
+ BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2;
+ BasicBlock *OldDest = Cond->getZExtValue() ? Dest2 : Dest1;
+
+ //cerr << "Function: " << T->getParent()->getParent()
+ // << "\nRemoving branch from " << T->getParent()
+ // << "\n\nTo: " << OldDest << endl;
+
+      // Let the basic block know that we are letting go of it. Based on this,
+      // it will adjust its PHI nodes.
+ assert(BI->getParent() && "Terminator not inserted in block!");
+ OldDest->removePredecessor(BI->getParent());
+
+ // Set the unconditional destination, and change the insn to be an
+ // unconditional branch.
+ BI->setUnconditionalDest(Destination);
+ return true;
+ } else if (Dest2 == Dest1) { // Conditional branch to same location?
+      // This branch matches something like this:
+      //     br bool %cond, label %Dest, label %Dest
+      // and changes it into:  br label %Dest
+
+ // Let the basic block know that we are letting go of one copy of it.
+ assert(BI->getParent() && "Terminator not inserted in block!");
+ Dest1->removePredecessor(BI->getParent());
+
+ // Change a conditional branch to unconditional.
+ BI->setUnconditionalDest(Dest1);
+ return true;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+ // If we are switching on a constant, we can convert the switch into a
+ // single branch instruction!
+ ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition());
+ BasicBlock *TheOnlyDest = SI->getSuccessor(0); // The default dest
+ BasicBlock *DefaultDest = TheOnlyDest;
+ assert(TheOnlyDest == SI->getDefaultDest() &&
+ "Default destination is not successor #0?");
+
+ // Figure out which case it goes to...
+ for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i) {
+ // Found case matching a constant operand?
+ if (SI->getSuccessorValue(i) == CI) {
+ TheOnlyDest = SI->getSuccessor(i);
+ break;
+ }
+
+ // Check to see if this branch is going to the same place as the default
+ // dest. If so, eliminate it as an explicit compare.
+ if (SI->getSuccessor(i) == DefaultDest) {
+ // Remove this entry...
+ DefaultDest->removePredecessor(SI->getParent());
+ SI->removeCase(i);
+ --i; --e; // Don't skip an entry...
+ continue;
+ }
+
+ // Otherwise, check to see if the switch only branches to one destination.
+      // We do this by resetting "TheOnlyDest" to null when we find two non-equal
+ // destinations.
+ if (SI->getSuccessor(i) != TheOnlyDest) TheOnlyDest = 0;
+ }
+
+ if (CI && !TheOnlyDest) {
+ // Branching on a constant, but not any of the cases, go to the default
+ // successor.
+ TheOnlyDest = SI->getDefaultDest();
+ }
+
+ // If we found a single destination that we can fold the switch into, do so
+ // now.
+ if (TheOnlyDest) {
+      // Insert the new branch...
+ BranchInst::Create(TheOnlyDest, SI);
+ BasicBlock *BB = SI->getParent();
+
+ // Remove entries from PHI nodes which we no longer branch to...
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ // Found case matching a constant operand?
+ BasicBlock *Succ = SI->getSuccessor(i);
+ if (Succ == TheOnlyDest)
+ TheOnlyDest = 0; // Don't modify the first branch to TheOnlyDest
+ else
+ Succ->removePredecessor(BB);
+ }
+
+ // Delete the old switch...
+ BB->getInstList().erase(SI);
+ return true;
+ } else if (SI->getNumSuccessors() == 2) {
+ // Otherwise, we can fold this switch into a conditional branch
+ // instruction if it has only one non-default destination.
+ Value *Cond = new ICmpInst(ICmpInst::ICMP_EQ, SI->getCondition(),
+ SI->getSuccessorValue(1), "cond", SI);
+ // Insert the new branch...
+ BranchInst::Create(SI->getSuccessor(1), SI->getSuccessor(0), Cond, SI);
+
+ // Delete the old switch...
+ SI->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Local dead code elimination...
+//
+
+/// isInstructionTriviallyDead - Return true if the result produced by the
+/// instruction is not used, and the instruction has no side effects.
+///
+bool llvm::isInstructionTriviallyDead(Instruction *I) {
+ if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
+
+ // We don't want debug info removed by anything this general.
+ if (isa<DbgInfoIntrinsic>(I)) return false;
+
+ if (!I->mayHaveSideEffects()) return true;
+
+ // Special case intrinsics that "may have side effects" but can be deleted
+ // when dead.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ // Safe to delete llvm.stacksave if dead.
+ if (II->getIntrinsicID() == Intrinsic::stacksave)
+ return true;
+ return false;
+}
+
+/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
+/// trivially dead instruction, delete it. If that makes any of its operands
+/// trivially dead, delete them too, recursively.
+void llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || !I->use_empty() || !isInstructionTriviallyDead(I))
+ return;
+
+ SmallVector<Instruction*, 16> DeadInsts;
+ DeadInsts.push_back(I);
+
+ while (!DeadInsts.empty()) {
+ I = DeadInsts.pop_back_val();
+
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, 0);
+
+ if (!OpV->use_empty()) continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI))
+ DeadInsts.push_back(OpI);
+ }
+
+ I->eraseFromParent();
+ }
+}
+
+/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
+/// dead PHI node, due to being a def-use chain of single-use nodes that
+/// either forms a cycle or is terminated by a trivially dead instruction,
+/// delete it. If that makes any of its operands trivially dead, delete them
+/// too, recursively.
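+///
+/// For instance (an illustrative sketch), a two-node cycle of single-use
+/// PHIs:
+///   %a = phi i32 [ %b, %bb1 ]   ; only user is %b
+///   %b = phi i32 [ %a, %bb2 ]   ; only user is %a
+/// is effectively dead, so both nodes get removed.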
+void
+llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) {
+
+ // We can remove a PHI if it is on a cycle in the def-use graph
+ // where each node in the cycle has degree one, i.e. only one use,
+ // and is an instruction with no side effects.
+ if (!PN->hasOneUse())
+ return;
+
+ SmallPtrSet<PHINode *, 4> PHIs;
+ PHIs.insert(PN);
+ for (Instruction *J = cast<Instruction>(*PN->use_begin());
+ J->hasOneUse() && !J->mayHaveSideEffects();
+ J = cast<Instruction>(*J->use_begin()))
+ // If we find a PHI more than once, we're on a cycle that
+ // won't prove fruitful.
+ if (PHINode *JP = dyn_cast<PHINode>(J))
+ if (!PHIs.insert(cast<PHINode>(JP))) {
+ // Break the cycle and delete the PHI and its operands.
+ JP->replaceAllUsesWith(UndefValue::get(JP->getType()));
+ RecursivelyDeleteTriviallyDeadInstructions(JP);
+ break;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Graph Restructuring...
+//
+
+/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
+/// predecessor is known to have one successor (DestBB!). Eliminate the edge
+/// between them, moving the instructions in the predecessor into DestBB and
+/// deleting the predecessor block.
+///
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB) {
+  // If DestBB has single-entry PHI nodes, fold them.
+ while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+ Value *NewVal = PN->getIncomingValue(0);
+ // Replace self referencing PHI with undef, it must be dead.
+ if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+ PN->replaceAllUsesWith(NewVal);
+ PN->eraseFromParent();
+ }
+
+ BasicBlock *PredBB = DestBB->getSinglePredecessor();
+ assert(PredBB && "Block doesn't have a single predecessor!");
+
+ // Splice all the instructions from PredBB to DestBB.
+ PredBB->getTerminator()->eraseFromParent();
+ DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+
+ // Anything that branched to PredBB now branches to DestBB.
+ PredBB->replaceAllUsesWith(DestBB);
+
+  // Nuke PredBB.
+ PredBB->eraseFromParent();
+}
+
+/// OnlyUsedByDbgInfoIntrinsics - Return true if the instruction I is only used
+/// by DbgInfoIntrinsics. If DbgInUses is specified then the vector is filled
+/// with the DbgInfoIntrinsics that use the instruction I.
+bool llvm::OnlyUsedByDbgInfoIntrinsics(Instruction *I,
+ SmallVectorImpl<DbgInfoIntrinsic *> *DbgInUses) {
+ if (DbgInUses)
+ DbgInUses->clear();
+
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+ ++UI) {
+ if (DbgInfoIntrinsic *DI = dyn_cast<DbgInfoIntrinsic>(*UI)) {
+ if (DbgInUses)
+ DbgInUses->push_back(DI);
+ } else {
+ if (DbgInUses)
+ DbgInUses->clear();
+ return false;
+ }
+ }
+ return true;
+}
+
+/// UserIsDebugInfo - Return true if U is a constant expr used by
+/// llvm.dbg.variable or llvm.dbg.global_variable.
+bool llvm::UserIsDebugInfo(User *U) {
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(U);
+
+ if (!CE || CE->getNumUses() != 1)
+ return false;
+
+ Constant *Init = dyn_cast<Constant>(CE->use_back());
+ if (!Init || Init->getNumUses() != 1)
+ return false;
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(Init->use_back());
+ if (!GV || !GV->hasInitializer() || GV->getInitializer() != Init)
+ return false;
+
+ DIVariable DV(GV);
+ if (!DV.isNull())
+ return true; // User is llvm.dbg.variable
+
+ DIGlobalVariable DGV(GV);
+ if (!DGV.isNull())
+ return true; // User is llvm.dbg.global_variable
+
+ return false;
+}
+
+/// RemoveDbgInfoUser - Remove a User that represents debug info.
+void llvm::RemoveDbgInfoUser(User *U) {
+ assert (UserIsDebugInfo(U) && "Unexpected User!");
+ ConstantExpr *CE = cast<ConstantExpr>(U);
+ while (!CE->use_empty()) {
+ Constant *C = cast<Constant>(CE->use_back());
+ while (!C->use_empty()) {
+ GlobalVariable *GV = cast<GlobalVariable>(C->use_back());
+ GV->eraseFromParent();
+ }
+ C->destroyConstant();
+ }
+ CE->destroyConstant();
+}
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
new file mode 100644
index 0000000..03d273d
--- /dev/null
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -0,0 +1,600 @@
+//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs several transformations to transform natural loops into a
+// simpler form, which makes subsequent analyses and transformations simpler and
+// more effective.
+//
+// Loop pre-header insertion guarantees that there is a single, non-critical
+// entry edge from outside of the loop to the loop header. This simplifies a
+// number of analyses and transformations, such as LICM.
+//
+// Loop exit-block insertion guarantees that all exit blocks from the loop
+// (blocks which are outside of the loop that have predecessors inside of the
+// loop) only have predecessors from inside of the loop (and are thus dominated
+// by the loop header). This simplifies transformations such as store-sinking
+// that are built into LICM.
+//
+// This pass also guarantees that loops will have exactly one backedge.
+//
+// Note that the simplifycfg pass will clean up blocks which are split out but
+// end up being unnecessary, so usage of this pass should not pessimize
+// generated code.
+//
+// This pass obviously modifies the CFG, but updates loop information and
+// dominator information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loopsimplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+using namespace llvm;
+
+STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
+STATISTIC(NumNested , "Number of nested loops split out");
+
+namespace {
+ struct VISIBILITY_HIDDEN LoopSimplify : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LoopSimplify() : FunctionPass(&ID) {}
+
+ // AA - If we have an alias analysis object to update, this is it, otherwise
+ // this is null.
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ // We need loop information to identify the loops...
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+
+ AU.addPreserved<LoopInfo>();
+ AU.addPreserved<DominatorTree>();
+ AU.addPreserved<DominanceFrontier>();
+ AU.addPreserved<AliasAnalysis>();
+ AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
+ }
+
+ /// verifyAnalysis() - Verify loop nest.
+ void verifyAnalysis() const {
+#ifndef NDEBUG
+ LoopInfo *NLI = &getAnalysis<LoopInfo>();
+ for (LoopInfo::iterator I = NLI->begin(), E = NLI->end(); I != E; ++I)
+ (*I)->verifyLoop();
+#endif
+ }
+
+ private:
+ bool ProcessLoop(Loop *L);
+ BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit);
+ void InsertPreheaderForLoop(Loop *L);
+ Loop *SeparateNestedLoop(Loop *L);
+ void InsertUniqueBackedgeBlock(Loop *L);
+ void PlaceSplitBlockCarefully(BasicBlock *NewBB,
+ SmallVectorImpl<BasicBlock*> &SplitPreds,
+ Loop *L);
+ };
+}
+
+char LoopSimplify::ID = 0;
+static RegisterPass<LoopSimplify>
+X("loopsimplify", "Canonicalize natural loops", true);
+
+// Publicly exposed interface to pass...
+const PassInfo *const llvm::LoopSimplifyID = &X;
+FunctionPass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
+
+/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
+/// it in any convenient order) inserting preheaders...
+///
+bool LoopSimplify::runOnFunction(Function &F) {
+ bool Changed = false;
+ LI = &getAnalysis<LoopInfo>();
+ AA = getAnalysisIfAvailable<AliasAnalysis>();
+ DT = &getAnalysis<DominatorTree>();
+
+ // Check to see that no blocks (other than the header) in loops have
+ // predecessors that are not in loops. This is not valid for natural loops,
+ // but can occur if the blocks are unreachable. Since they are unreachable we
+ // can just shamelessly destroy their terminators to make them not branch into
+ // the loop!
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ // This case can only occur for unreachable blocks. Blocks that are
+ // unreachable can't be in loops, so filter those blocks out.
+ if (LI->getLoopFor(BB)) continue;
+
+ bool BlockUnreachable = false;
+ TerminatorInst *TI = BB->getTerminator();
+
+    // Check to see if this block branches into any loop other than through a
+    // top-level loop's header.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ // If this successor is not in a loop, BB is clearly ok.
+ Loop *L = LI->getLoopFor(TI->getSuccessor(i));
+ if (!L) continue;
+
+ // If the succ is the loop header, and if L is a top-level loop, then this
+ // is an entrance into a loop through the header, which is also ok.
+ if (L->getHeader() == TI->getSuccessor(i) && L->getParentLoop() == 0)
+ continue;
+
+ // Otherwise, this is an entrance into a loop from some place invalid.
+ // Either the loop structure is invalid and this is not a natural loop (in
+ // which case the compiler is buggy somewhere else) or BB is unreachable.
+ BlockUnreachable = true;
+ break;
+ }
+
+ // If this block is ok, check the next one.
+ if (!BlockUnreachable) continue;
+
+ // Otherwise, this block is dead. To clean up the CFG and to allow later
+ // loop transformations to ignore this case, we delete the edges into the
+ // loop by replacing the terminator.
+
+ // Remove PHI entries from the successors.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ TI->getSuccessor(i)->removePredecessor(BB);
+
+ // Add a new unreachable instruction before the old terminator.
+ new UnreachableInst(TI);
+
+ // Delete the dead terminator.
+ if (AA) AA->deleteValue(TI);
+ if (!TI->use_empty())
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ TI->eraseFromParent();
+ Changed |= true;
+ }
+
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ Changed |= ProcessLoop(*I);
+
+ return Changed;
+}
+
+/// ProcessLoop - Walk the loop structure in depth first order, ensuring that
+/// all loops have preheaders.
+///
+bool LoopSimplify::ProcessLoop(Loop *L) {
+ bool Changed = false;
+ReprocessLoop:
+
+ // Canonicalize inner loops before outer loops. Inner loop canonicalization
+ // can provide work for the outer loop to canonicalize.
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ Changed |= ProcessLoop(*I);
+
+ assert(L->getBlocks()[0] == L->getHeader() &&
+ "Header isn't first block in loop?");
+
+ // Does the loop already have a preheader? If so, don't insert one.
+ if (L->getLoopPreheader() == 0) {
+ InsertPreheaderForLoop(L);
+ NumInserted++;
+ Changed = true;
+ }
+
+ // Next, check to make sure that all exit nodes of the loop only have
+ // predecessors that are inside of the loop. This check guarantees that the
+ // loop preheader/header will dominate the exit blocks. If the exit block has
+ // predecessors from outside of the loop, split the edge now.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+
+ SetVector<BasicBlock*> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+ for (SetVector<BasicBlock*>::iterator I = ExitBlockSet.begin(),
+ E = ExitBlockSet.end(); I != E; ++I) {
+ BasicBlock *ExitBlock = *I;
+ for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock);
+ PI != PE; ++PI)
+ // Must be exactly this loop: no subloops, parent loops, or non-loop preds
+ // allowed.
+ if (!L->contains(*PI)) {
+ RewriteLoopExitBlock(L, ExitBlock);
+ NumInserted++;
+ Changed = true;
+ break;
+ }
+ }
+
+ // If the header has more than two predecessors at this point (from the
+ // preheader and from multiple backedges), we must adjust the loop.
+ unsigned NumBackedges = L->getNumBackEdges();
+ if (NumBackedges != 1) {
+ // If this is really a nested loop, rip it out into a child loop. Don't do
+ // this for loops with a giant number of backedges, just factor them into a
+ // common backedge instead.
+ if (NumBackedges < 8) {
+ if (Loop *NL = SeparateNestedLoop(L)) {
+ ++NumNested;
+ // This is a big restructuring change, reprocess the whole loop.
+ ProcessLoop(NL);
+ Changed = true;
+        // GCC doesn't eliminate this tail recursion, hence the goto.
+ goto ReprocessLoop;
+ }
+ }
+
+ // If we either couldn't, or didn't want to, identify nesting of the loops,
+ // insert a new block that all backedges target, then make it jump to the
+ // loop header.
+ InsertUniqueBackedgeBlock(L);
+ NumInserted++;
+ Changed = true;
+ }
+
+ // Scan over the PHI nodes in the loop header. Since they now have only two
+ // incoming values (the loop is canonicalized), we may have simplified the PHI
+ // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
+ PHINode *PN;
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ (PN = dyn_cast<PHINode>(I++)); )
+ if (Value *V = PN->hasConstantValue()) {
+ if (AA) AA->deleteValue(PN);
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
+/// preheader, this method is called to insert one. This method has two phases:
+/// preheader insertion and analysis updating.
+///
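+/// For a header reached from outside blocks %e1 and %e2 (illustrative names),
+/// both entry edges get redirected through one new block, a sketch like:
+///   header.preheader:                            ; preds = %e1, %e2
+///     br label %header
+///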
+void LoopSimplify::InsertPreheaderForLoop(Loop *L) {
+ BasicBlock *Header = L->getHeader();
+
+ // Compute the set of predecessors of the loop that are not in the loop.
+ SmallVector<BasicBlock*, 8> OutsideBlocks;
+ for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+ PI != PE; ++PI)
+ if (!L->contains(*PI)) // Coming in from outside the loop?
+ OutsideBlocks.push_back(*PI); // Keep track of it...
+
+ // Split out the loop pre-header.
+ BasicBlock *NewBB =
+ SplitBlockPredecessors(Header, &OutsideBlocks[0], OutsideBlocks.size(),
+ ".preheader", this);
+
+
+ //===--------------------------------------------------------------------===//
+ // Update analysis results now that we have performed the transformation
+ //
+
+ // We know that we have loop information to update... update it now.
+ if (Loop *Parent = L->getParentLoop())
+ Parent->addBasicBlockToLoop(NewBB, LI->getBase());
+
+ // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+ // code layout too horribly.
+ PlaceSplitBlockCarefully(NewBB, OutsideBlocks, L);
+}
+
+/// RewriteLoopExitBlock - Ensure that the loop preheader dominates all exit
+/// blocks. This method is used to split exit blocks that have predecessors
+/// outside of the loop.
+BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) {
+ SmallVector<BasicBlock*, 8> LoopBlocks;
+ for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I)
+ if (L->contains(*I))
+ LoopBlocks.push_back(*I);
+
+ assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");
+ BasicBlock *NewBB = SplitBlockPredecessors(Exit, &LoopBlocks[0],
+ LoopBlocks.size(), ".loopexit",
+ this);
+
+ // Update Loop Information - we know that the new block will be in whichever
+ // loop the Exit block is in. Note that it may not be in that immediate loop,
+ // if the successor is some other loop header. In that case, we continue
+ // walking up the loop tree to find a loop that contains both the successor
+ // block and the predecessor block.
+ Loop *SuccLoop = LI->getLoopFor(Exit);
+ while (SuccLoop && !SuccLoop->contains(L->getHeader()))
+ SuccLoop = SuccLoop->getParentLoop();
+ if (SuccLoop)
+ SuccLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+
+ return NewBB;
+}
+
+/// AddBlockAndPredsToSet - Add the specified block, and all of its
+/// predecessors, to the specified set, if it's not already in there. Stop
+/// predecessor traversal when we reach StopBlock.
+static void AddBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
+ std::set<BasicBlock*> &Blocks) {
+ std::vector<BasicBlock *> WorkList;
+ WorkList.push_back(InputBB);
+ do {
+ BasicBlock *BB = WorkList.back(); WorkList.pop_back();
+ if (Blocks.insert(BB).second && BB != StopBlock)
+      // If BB has not already been processed and it is not the stop block,
+      // insert its predecessors into the work list.
+ for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ BasicBlock *WBB = *I;
+ WorkList.push_back(WBB);
+ }
+ } while(!WorkList.empty());
+}
+
+/// FindPHIToPartitionLoops - The first part of loop-nestification is to find a
+/// PHI node that tells us how to partition the loops.
+static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT,
+ AliasAnalysis *AA) {
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
+ PHINode *PN = cast<PHINode>(I);
+ ++I;
+ if (Value *V = PN->hasConstantValue())
+ if (!isa<Instruction>(V) || DT->dominates(cast<Instruction>(V), PN)) {
+ // This is a degenerate PHI already, don't modify it!
+ PN->replaceAllUsesWith(V);
+ if (AA) AA->deleteValue(PN);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // Scan this PHI node looking for a use of the PHI node by itself.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == PN &&
+ L->contains(PN->getIncomingBlock(i)))
+ // We found something tasty to remove.
+ return PN;
+ }
+ return 0;
+}
+
+// PlaceSplitBlockCarefully - If it is not already positioned there, move the
+// new block to right after one of the 'outside' blocks. This prevents the
+// preheader from being placed inside the loop body, e.g. when the loop hasn't
+// been rotated.
+void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB,
+ SmallVectorImpl<BasicBlock*> &SplitPreds,
+ Loop *L) {
+ // Check to see if NewBB is already well placed.
+ Function::iterator BBI = NewBB; --BBI;
+ for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+ if (&*BBI == SplitPreds[i])
+ return;
+ }
+
+ // If it isn't already after an outside block, move it after one. This is
+ // always good as it makes the uncond branch from the outside block into a
+ // fall-through.
+
+ // Figure out *which* outside block to put this after. Prefer an outside
+ // block that neighbors a BB actually in the loop.
+ BasicBlock *FoundBB = 0;
+ for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+ Function::iterator BBI = SplitPreds[i];
+ if (++BBI != NewBB->getParent()->end() &&
+ L->contains(BBI)) {
+ FoundBB = SplitPreds[i];
+ break;
+ }
+ }
+
+ // If our heuristic for a *good* bb to place this after doesn't find
+ // anything, just pick something. It's likely better than leaving it within
+ // the loop.
+ if (!FoundBB)
+ FoundBB = SplitPreds[0];
+ NewBB->moveAfter(FoundBB);
+}
+
+
+/// SeparateNestedLoop - If this loop has multiple backedges, try to pull one of
+/// them out into a nested loop. This is important for code that looks like
+/// this:
+///
+///  Loop:
+///    ...
+///    br cond, Loop, Next
+///    ...
+///    br cond2, Loop, Out
+///
+/// To identify this common case, we look at the PHI nodes in the header of the
+/// loop. PHI nodes with unchanging values on one backedge correspond to values
+/// that change in the "outer" loop, but not in the "inner" loop.
+///
+/// If we are able to separate out a loop, return the new outer loop that was
+/// created.
+///
+Loop *LoopSimplify::SeparateNestedLoop(Loop *L) {
+ PHINode *PN = FindPHIToPartitionLoops(L, DT, AA);
+ if (PN == 0) return 0; // No known way to partition.
+
+ // Pull out all predecessors that have varying values in the loop. This
+ // handles the case when a PHI node has multiple instances of itself as
+ // arguments.
+ SmallVector<BasicBlock*, 8> OuterLoopPreds;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) != PN ||
+ !L->contains(PN->getIncomingBlock(i)))
+ OuterLoopPreds.push_back(PN->getIncomingBlock(i));
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *NewBB = SplitBlockPredecessors(Header, &OuterLoopPreds[0],
+ OuterLoopPreds.size(),
+ ".outer", this);
+
+ // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+ // code layout too horribly.
+ PlaceSplitBlockCarefully(NewBB, OuterLoopPreds, L);
+
+ // Create the new outer loop.
+ Loop *NewOuter = new Loop();
+
+ // Change the parent loop to use the outer loop as its child now.
+ if (Loop *Parent = L->getParentLoop())
+ Parent->replaceChildLoopWith(L, NewOuter);
+ else
+ LI->changeTopLevelLoop(L, NewOuter);
+
+ // This block is going to be our new header block: add it to this loop and all
+ // parent loops.
+ NewOuter->addBasicBlockToLoop(NewBB, LI->getBase());
+
+ // L is now a subloop of our outer loop.
+ NewOuter->addChildLoop(L);
+
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ NewOuter->addBlockEntry(*I);
+
+ // Determine which blocks should stay in L and which should be moved out to
+ // the Outer loop now.
+ std::set<BasicBlock*> BlocksInL;
+ for (pred_iterator PI = pred_begin(Header), E = pred_end(Header); PI!=E; ++PI)
+ if (DT->dominates(Header, *PI))
+ AddBlockAndPredsToSet(*PI, Header, BlocksInL);
+
+
+ // Scan all of the loop children of L, moving them to OuterLoop if they are
+ // not part of the inner loop.
+ const std::vector<Loop*> &SubLoops = L->getSubLoops();
+ for (size_t I = 0; I != SubLoops.size(); )
+ if (BlocksInL.count(SubLoops[I]->getHeader()))
+ ++I; // Loop remains in L
+ else
+ NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I));
+
+ // Now that we know which blocks are in L and which need to be moved to
+ // OuterLoop, move any blocks that need it.
+ for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
+ BasicBlock *BB = L->getBlocks()[i];
+ if (!BlocksInL.count(BB)) {
+ // Move this block to the parent, updating the exit blocks sets
+ L->removeBlockFromLoop(BB);
+ if ((*LI)[BB] == L)
+ LI->changeLoopFor(BB, NewOuter);
+ --i;
+ }
+ }
+
+ return NewOuter;
+}
+
+
+
+/// InsertUniqueBackedgeBlock - This method is called when the specified loop
+/// has more than one backedge in it. If this occurs, revector all of these
+/// backedges to target a new basic block and have that block branch to the loop
+/// header. This ensures that loops have exactly one backedge.
+///
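+/// With latch blocks %l1 and %l2 (illustrative names), the result is a sketch
+/// like:
+///   header.backedge:                             ; preds = %l1, %l2
+///     br label %header
+///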
+void LoopSimplify::InsertUniqueBackedgeBlock(Loop *L) {
+ assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
+
+ // Get information about the loop
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock *Header = L->getHeader();
+ Function *F = Header->getParent();
+
+ // Figure out which basic blocks contain back-edges to the loop header.
+ std::vector<BasicBlock*> BackedgeBlocks;
+ for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I)
+ if (*I != Preheader) BackedgeBlocks.push_back(*I);
+
+ // Create and insert the new backedge block...
+ BasicBlock *BEBlock = BasicBlock::Create(Header->getName()+".backedge", F);
+ BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
+
+ // Move the new backedge block to right after the last backedge block.
+ Function::iterator InsertPos = BackedgeBlocks.back(); ++InsertPos;
+ F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
+
+ // Now that the block has been inserted into the function, create PHI nodes in
+ // the backedge block which correspond to any PHI nodes in the header block.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName()+".be",
+ BETerminator);
+ NewPN->reserveOperandSpace(BackedgeBlocks.size());
+ if (AA) AA->copyValue(PN, NewPN);
+
+ // Loop over the PHI node, moving all entries except the one for the
+ // preheader over to the new PHI node.
+ unsigned PreheaderIdx = ~0U;
+ bool HasUniqueIncomingValue = true;
+ Value *UniqueValue = 0;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *IBB = PN->getIncomingBlock(i);
+ Value *IV = PN->getIncomingValue(i);
+ if (IBB == Preheader) {
+ PreheaderIdx = i;
+ } else {
+ NewPN->addIncoming(IV, IBB);
+ if (HasUniqueIncomingValue) {
+ if (UniqueValue == 0)
+ UniqueValue = IV;
+ else if (UniqueValue != IV)
+ HasUniqueIncomingValue = false;
+ }
+ }
+ }
+
+ // Delete all of the incoming values from the old PN except the preheader's
+ assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
+ if (PreheaderIdx != 0) {
+ PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
+ PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
+ }
+    // Nuke all entries except the zeroth.
+ for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
+ PN->removeIncomingValue(e-i, false);
+
+ // Finally, add the newly constructed PHI node as the entry for the BEBlock.
+ PN->addIncoming(NewPN, BEBlock);
+
+ // As an optimization, if all incoming values in the new PhiNode (which is a
+ // subset of the incoming values of the old PHI node) have the same value,
+ // eliminate the PHI Node.
+ if (HasUniqueIncomingValue) {
+ NewPN->replaceAllUsesWith(UniqueValue);
+ if (AA) AA->deleteValue(NewPN);
+ BEBlock->getInstList().erase(NewPN);
+ }
+ }
+
+  // Now that all of the PHI nodes have been inserted and adjusted, modify the
+  // backedge blocks to jump to the BEBlock instead of the header.
+ for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
+ TerminatorInst *TI = BackedgeBlocks[i]->getTerminator();
+ for (unsigned Op = 0, e = TI->getNumSuccessors(); Op != e; ++Op)
+ if (TI->getSuccessor(Op) == Header)
+ TI->setSuccessor(Op, BEBlock);
+ }
+
+ //===--- Update all analyses which we must preserve now -----------------===//
+
+ // Update Loop Information - we know that this block is now in the current
+ // loop and all parent loops.
+ L->addBasicBlockToLoop(BEBlock, LI->getBase());
+
+ // Update dominator information
+ DT->splitBlock(BEBlock);
+ if (DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>())
+ DF->splitBlock(BEBlock);
+}
diff --git a/lib/Transforms/Utils/LowerAllocations.cpp b/lib/Transforms/Utils/LowerAllocations.cpp
new file mode 100644
index 0000000..3249895
--- /dev/null
+++ b/lib/Transforms/Utils/LowerAllocations.cpp
@@ -0,0 +1,177 @@
+//===- LowerAllocations.cpp - Reduce malloc & free insts to calls ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerAllocations transformation is a target-dependent transformation
+// because it depends on the size of data types and alignment constraints.
+//
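+// For illustration, a rough sketch of the rewrite (assuming a target whose
+// intptr_t is i32 and a 4-byte element type):
+//
+//   %p = malloc i32, i32 %n
+//
+// becomes approximately:
+//
+//   %sz = mul i32 %n, 4
+//   %m = tail call i8* @malloc(i32 %sz)
+//   %p = bitcast i8* %m to i32*
+//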
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowerallocs"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumLowered, "Number of allocations lowered");
+
+namespace {
+ /// LowerAllocations - Turn malloc and free instructions into %malloc and
+ /// %free calls.
+ ///
+ class VISIBILITY_HIDDEN LowerAllocations : public BasicBlockPass {
+ Constant *MallocFunc; // Functions in the module we are processing
+ Constant *FreeFunc; // Initialized by doInitialization
+ bool LowerMallocArgToInteger;
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ explicit LowerAllocations(bool LowerToInt = false)
+ : BasicBlockPass(&ID), MallocFunc(0), FreeFunc(0),
+ LowerMallocArgToInteger(LowerToInt) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetData>();
+ AU.setPreservesCFG();
+
+ // This is a cluster of orthogonal Transforms:
+ AU.addPreserved<UnifyFunctionExitNodes>();
+ AU.addPreservedID(PromoteMemoryToRegisterID);
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreservedID(LowerInvokePassID);
+ }
+
+    /// doInitialization - For the lower allocations pass, this ensures that
+ /// a module contains a declaration for a malloc and a free function.
+ ///
+ bool doInitialization(Module &M);
+
+ virtual bool doInitialization(Function &F) {
+ return doInitialization(*F.getParent());
+ }
+
+ /// runOnBasicBlock - This method does the actual work of converting
+ /// instructions over, assuming that the pass has already been initialized.
+ ///
+ bool runOnBasicBlock(BasicBlock &BB);
+ };
+}
+
+char LowerAllocations::ID = 0;
+static RegisterPass<LowerAllocations>
+X("lowerallocs", "Lower allocations from instructions to calls");
+
+// Publicly exposed interface to pass...
+const PassInfo *const llvm::LowerAllocationsID = &X;
+// createLowerAllocationsPass - Interface to this file...
+Pass *llvm::createLowerAllocationsPass(bool LowerMallocArgToInteger) {
+ return new LowerAllocations(LowerMallocArgToInteger);
+}
+
+
+// doInitialization - For the lower allocations pass, this ensures that a
+// module contains a declaration for a malloc and a free function.
+//
+// This function is always successful.
+//
+bool LowerAllocations::doInitialization(Module &M) {
+ const Type *BPTy = PointerType::getUnqual(Type::Int8Ty);
+ // Prototype malloc as "char* malloc(...)", because we don't know in
+ // doInitialization whether size_t is int or long.
+ FunctionType *FT = FunctionType::get(BPTy, std::vector<const Type*>(), true);
+ MallocFunc = M.getOrInsertFunction("malloc", FT);
+ FreeFunc = M.getOrInsertFunction("free" , Type::VoidTy, BPTy, (Type *)0);
+ return true;
+}
+
+// runOnBasicBlock - This method does the actual work of converting
+// instructions over, assuming that the pass has already been initialized.
+//
+bool LowerAllocations::runOnBasicBlock(BasicBlock &BB) {
+ bool Changed = false;
+ assert(MallocFunc && FreeFunc && "Pass not initialized!");
+
+ BasicBlock::InstListType &BBIL = BB.getInstList();
+
+ const TargetData &TD = getAnalysis<TargetData>();
+ const Type *IntPtrTy = TD.getIntPtrType();
+
+ // Loop over all of the instructions, looking for malloc or free instructions
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+ if (MallocInst *MI = dyn_cast<MallocInst>(I)) {
+ const Type *AllocTy = MI->getType()->getElementType();
+
+      // malloc(type) becomes i8* malloc(size)
+ Value *MallocArg;
+ if (LowerMallocArgToInteger)
+ MallocArg = ConstantInt::get(Type::Int64Ty,
+ TD.getTypeAllocSize(AllocTy));
+ else
+ MallocArg = ConstantExpr::getSizeOf(AllocTy);
+ MallocArg = ConstantExpr::getTruncOrBitCast(cast<Constant>(MallocArg),
+ IntPtrTy);
+
+ if (MI->isArrayAllocation()) {
+ if (isa<ConstantInt>(MallocArg) &&
+ cast<ConstantInt>(MallocArg)->isOne()) {
+ MallocArg = MI->getOperand(0); // Operand * 1 = Operand
+ } else if (Constant *CO = dyn_cast<Constant>(MI->getOperand(0))) {
+ CO = ConstantExpr::getIntegerCast(CO, IntPtrTy, false /*ZExt*/);
+ MallocArg = ConstantExpr::getMul(CO, cast<Constant>(MallocArg));
+ } else {
+ Value *Scale = MI->getOperand(0);
+ if (Scale->getType() != IntPtrTy)
+ Scale = CastInst::CreateIntegerCast(Scale, IntPtrTy, false /*ZExt*/,
+ "", I);
+
+ // Multiply it by the array size if necessary...
+ MallocArg = BinaryOperator::Create(Instruction::Mul, Scale,
+ MallocArg, "", I);
+ }
+ }
+
+ // Create the call to Malloc.
+ CallInst *MCall = CallInst::Create(MallocFunc, MallocArg, "", I);
+ MCall->setTailCall();
+
+ // Create a cast instruction to convert to the right type...
+ Value *MCast;
+ if (MCall->getType() != Type::VoidTy)
+ MCast = new BitCastInst(MCall, MI->getType(), "", I);
+ else
+ MCast = Constant::getNullValue(MI->getType());
+
+ // Replace all uses of the old malloc inst with the cast inst
+ MI->replaceAllUsesWith(MCast);
+ I = --BBIL.erase(I); // remove and delete the malloc instr...
+ Changed = true;
+ ++NumLowered;
+ } else if (FreeInst *FI = dyn_cast<FreeInst>(I)) {
+ Value *PtrCast =
+ new BitCastInst(FI->getOperand(0),
+ PointerType::getUnqual(Type::Int8Ty), "", I);
+
+ // Insert a call to the free function...
+ CallInst::Create(FreeFunc, PtrCast, "", I)->setTailCall();
+
+ // Delete the old free instruction
+ I = --BBIL.erase(I);
+ Changed = true;
+ ++NumLowered;
+ }
+ }
+
+ return Changed;
+}
+
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
new file mode 100644
index 0000000..1f6b1a2
--- /dev/null
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -0,0 +1,614 @@
+//===- LowerInvoke.cpp - Eliminate Invoke & Unwind instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which do not yet
+// support stack unwinding. This pass supports two models of exception handling
+// lowering, the 'cheap' support and the 'expensive' support.
+//
+// 'Cheap' exception handling support gives the program the ability to execute
+// any program which does not "throw an exception", by turning 'invoke'
+// instructions into calls and by turning 'unwind' instructions into calls to
+// abort(). If the program does dynamically use the unwind instruction, the
+// program will print a message then abort.
+//
+// 'Expensive' exception handling support gives the full exception handling
+// support to the program at the cost of making the 'invoke' instruction
+// really expensive. It basically inserts setjmp/longjmp calls to emulate the
+// exception handling as necessary.
+//
+// Because the 'expensive' support slows down programs a lot, and EH is only
+// used for a subset of the programs, it must be specifically enabled by an
+// option.
+//
+// Note that after this pass runs the CFG is not entirely accurate (exceptional
+// control flow edges are not correct anymore) so only very simple things should
+// be done after the lowerinvoke pass has run (like generation of native code).
+// This should not be used as a general purpose "my LLVM-to-LLVM pass doesn't
+// support the invoke instruction yet" lowering pass.
+//
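+// For illustration, under the 'cheap' model an invoke such as:
+//
+//   invoke void @foo() to label %normal unwind label %handler
+//
+// becomes, roughly:
+//
+//   call void @foo()
+//   br label %normal
+//
+// and any 'unwind' instruction becomes a call to abort() followed by an
+// (unreachable) return.
+//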
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowerinvoke"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetLowering.h"
+#include <csetjmp>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+STATISTIC(NumUnwinds, "Number of unwinds replaced");
+STATISTIC(NumSpilled, "Number of registers live across unwind edges");
+
+static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support",
+ cl::desc("Make the -lowerinvoke pass insert expensive, but correct, EH code"));
+
+namespace {
+ class VISIBILITY_HIDDEN LowerInvoke : public FunctionPass {
+ // Used for both models.
+ Constant *WriteFn;
+ Constant *AbortFn;
+ Value *AbortMessage;
+ unsigned AbortMessageLength;
+
+ // Used for expensive EH support.
+ const Type *JBLinkTy;
+ GlobalVariable *JBListHead;
+ Constant *SetJmpFn, *LongJmpFn;
+
+ // We peek in TLI to grab the target's jmp_buf size and alignment
+ const TargetLowering *TLI;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LowerInvoke(const TargetLowering *tli = NULL)
+ : FunctionPass(&ID), TLI(tli) { }
+ bool doInitialization(Module &M);
+ bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ // This is a cluster of orthogonal Transforms
+ AU.addPreservedID(PromoteMemoryToRegisterID);
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreservedID(LowerAllocationsID);
+ }
+
+ private:
+ void createAbortMessage(Module *M);
+ void writeAbortMessage(Instruction *IB);
+ bool insertCheapEHSupport(Function &F);
+ void splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes);
+ void rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
+ AllocaInst *InvokeNum, SwitchInst *CatchSwitch);
+ bool insertExpensiveEHSupport(Function &F);
+ };
+}
+
+char LowerInvoke::ID = 0;
+static RegisterPass<LowerInvoke>
+X("lowerinvoke", "Lower invoke and unwind, for unwindless code generators");
+
+const PassInfo *const llvm::LowerInvokePassID = &X;
+
+// Public Interface To the LowerInvoke pass.
+FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) {
+ return new LowerInvoke(TLI);
+}
+
+// doInitialization - Make sure that there is a prototype for abort in the
+// current module.
+bool LowerInvoke::doInitialization(Module &M) {
+ const Type *VoidPtrTy = PointerType::getUnqual(Type::Int8Ty);
+ AbortMessage = 0;
+ if (ExpensiveEHSupport) {
+ // Insert a type for the linked list of jump buffers.
+ unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0;
+ JBSize = JBSize ? JBSize : 200;
+ const Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize);
+
+ { // The type is recursive, so use a type holder.
+ std::vector<const Type*> Elements;
+ Elements.push_back(JmpBufTy);
+ OpaqueType *OT = OpaqueType::get();
+ Elements.push_back(PointerType::getUnqual(OT));
+ PATypeHolder JBLType(StructType::get(Elements));
+ OT->refineAbstractTypeTo(JBLType.get()); // Complete the cycle.
+ JBLinkTy = JBLType.get();
+ M.addTypeName("llvm.sjljeh.jmpbufty", JBLinkTy);
+ }
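+    // Sketched in IR terms, the resulting recursive type is roughly:
+    //   %llvm.sjljeh.jmpbufty = type { [JBSize x i8*], %llvm.sjljeh.jmpbufty* }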
+
+ const Type *PtrJBList = PointerType::getUnqual(JBLinkTy);
+
+ // Now that we've done that, insert the jmpbuf list head global, unless it
+ // already exists.
+ if (!(JBListHead = M.getGlobalVariable("llvm.sjljeh.jblist", PtrJBList))) {
+ JBListHead = new GlobalVariable(PtrJBList, false,
+ GlobalValue::LinkOnceAnyLinkage,
+ Constant::getNullValue(PtrJBList),
+ "llvm.sjljeh.jblist", &M);
+ }
+
+// VisualStudio defines setjmp as _setjmp via #include <csetjmp> / <setjmp.h>,
+// so it looks like Intrinsic::_setjmp
+#if defined(_MSC_VER) && defined(setjmp)
+#define setjmp_undefined_for_visual_studio
+#undef setjmp
+#endif
+
+ SetJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::setjmp);
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_visual_studio)
+// let's return it to _setjmp state in case anyone ever needs it after this
+// point under VisualStudio
+#define setjmp _setjmp
+#endif
+
+ LongJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::longjmp);
+ }
+
+ // We need the 'write' and 'abort' functions for both models.
+ AbortFn = M.getOrInsertFunction("abort", Type::VoidTy, (Type *)0);
+#if 0 // "write" is Unix-specific.. code is going away soon anyway.
+ WriteFn = M.getOrInsertFunction("write", Type::VoidTy, Type::Int32Ty,
+ VoidPtrTy, Type::Int32Ty, (Type *)0);
+#else
+ WriteFn = 0;
+#endif
+ return true;
+}
+
+void LowerInvoke::createAbortMessage(Module *M) {
+ if (ExpensiveEHSupport) {
+ // The abort message for expensive EH support tells the user that the
+ // program 'unwound' without an 'invoke' instruction.
+ Constant *Msg =
+ ConstantArray::get("ERROR: Exception thrown, but not caught!\n");
+ AbortMessageLength = Msg->getNumOperands()-1; // don't include \0
+
+ GlobalVariable *MsgGV = new GlobalVariable(Msg->getType(), true,
+ GlobalValue::InternalLinkage,
+ Msg, "abortmsg", M);
+ std::vector<Constant*> GEPIdx(2, Constant::getNullValue(Type::Int32Ty));
+ AbortMessage = ConstantExpr::getGetElementPtr(MsgGV, &GEPIdx[0], 2);
+ } else {
+ // The abort message for cheap EH support tells the user that EH is not
+ // enabled.
+ Constant *Msg =
+ ConstantArray::get("Exception handler needed, but not enabled. Recompile"
+ " program with -enable-correct-eh-support.\n");
+ AbortMessageLength = Msg->getNumOperands()-1; // don't include \0
+
+ GlobalVariable *MsgGV = new GlobalVariable(Msg->getType(), true,
+ GlobalValue::InternalLinkage,
+ Msg, "abortmsg", M);
+ std::vector<Constant*> GEPIdx(2, Constant::getNullValue(Type::Int32Ty));
+ AbortMessage = ConstantExpr::getGetElementPtr(MsgGV, &GEPIdx[0], 2);
+ }
+}
+
+
+void LowerInvoke::writeAbortMessage(Instruction *IB) {
+#if 0
+ if (AbortMessage == 0)
+ createAbortMessage(IB->getParent()->getParent()->getParent());
+
+ // These are the arguments we WANT...
+ Value* Args[3];
+ Args[0] = ConstantInt::get(Type::Int32Ty, 2);
+ Args[1] = AbortMessage;
+ Args[2] = ConstantInt::get(Type::Int32Ty, AbortMessageLength);
+ (new CallInst(WriteFn, Args, 3, "", IB))->setTailCall();
+#endif
+}
+
+bool LowerInvoke::insertCheapEHSupport(Function &F) {
+ bool Changed = false;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+ std::vector<Value*> CallArgs(II->op_begin()+3, II->op_end());
+ // Insert a normal call instruction...
+ CallInst *NewCall = CallInst::Create(II->getCalledValue(),
+ CallArgs.begin(), CallArgs.end(), "",II);
+ NewCall->takeName(II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ II->replaceAllUsesWith(NewCall);
+
+ // Insert an unconditional branch to the normal destination.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Remove any PHI node entries from the exception destination.
+ II->getUnwindDest()->removePredecessor(BB);
+
+ // Remove the invoke instruction now.
+ BB->getInstList().erase(II);
+
+ ++NumInvokes; Changed = true;
+ } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+ // Insert a new call to write(2, AbortMessage, AbortMessageLength);
+ writeAbortMessage(UI);
+
+ // Insert a call to abort()
+ CallInst::Create(AbortFn, "", UI)->setTailCall();
+
+ // Insert a return instruction. This really should be a "barrier", as it
+ // is unreachable.
+ ReturnInst::Create(F.getReturnType() == Type::VoidTy ? 0 :
+ Constant::getNullValue(F.getReturnType()), UI);
+
+ // Remove the unwind instruction now.
+ BB->getInstList().erase(UI);
+
+ ++NumUnwinds; Changed = true;
+ }
+ return Changed;
+}
+
+/// rewriteExpensiveInvoke - Insert code and hack the function to replace the
+/// specified invoke instruction with a call.
+void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
+ AllocaInst *InvokeNum,
+ SwitchInst *CatchSwitch) {
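+  // Net effect, sketched: InvokeNo is stored (volatile) into InvokeNum so the
+  // catch block's switch can route an unwind back to this invoke's unwind
+  // destination (and is reset to zero on the normal path); the invoke itself
+  // becomes a plain call followed by an unconditional branch to its normal
+  // destination.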
+ ConstantInt *InvokeNoC = ConstantInt::get(Type::Int32Ty, InvokeNo);
+
+ // If the unwind edge has phi nodes, split the edge.
+ if (isa<PHINode>(II->getUnwindDest()->begin())) {
+ SplitCriticalEdge(II, 1, this);
+
+ // If there are any phi nodes left, they must have a single predecessor.
+ while (PHINode *PN = dyn_cast<PHINode>(II->getUnwindDest()->begin())) {
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ PN->eraseFromParent();
+ }
+ }
+
+ // Insert a store of the invoke num before the invoke and store zero into the
+ // location afterward.
+ new StoreInst(InvokeNoC, InvokeNum, true, II); // volatile
+
+ BasicBlock::iterator NI = II->getNormalDest()->getFirstNonPHI();
+ // nonvolatile.
+ new StoreInst(Constant::getNullValue(Type::Int32Ty), InvokeNum, false, NI);
+
+ // Add a switch case to our unwind block.
+ CatchSwitch->addCase(InvokeNoC, II->getUnwindDest());
+
+ // Insert a normal call instruction.
+ std::vector<Value*> CallArgs(II->op_begin()+3, II->op_end());
+ CallInst *NewCall = CallInst::Create(II->getCalledValue(),
+ CallArgs.begin(), CallArgs.end(), "",
+ II);
+ NewCall->takeName(II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ II->replaceAllUsesWith(NewCall);
+
+ // Replace the invoke with an uncond branch.
+ BranchInst::Create(II->getNormalDest(), NewCall->getParent());
+ II->eraseFromParent();
+}
+
+/// MarkBlocksLiveIn - Insert BB and all of its predecessors into LiveBBs until
+/// we reach blocks we've already seen.
+static void MarkBlocksLiveIn(BasicBlock *BB, std::set<BasicBlock*> &LiveBBs) {
+ if (!LiveBBs.insert(BB).second) return; // already been here.
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ MarkBlocksLiveIn(*PI, LiveBBs);
+}
+
+// First thing we need to do is scan the whole function for values that are
+// live across unwind edges. Each value that is live across an unwind edge
+// we spill into a stack location, guaranteeing that there is nothing live
+// across the unwind edge. This process also splits all critical edges
+// coming out of invokes.
+void LowerInvoke::
+splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes) {
+ // First step, split all critical edges from invoke instructions.
+ for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+ InvokeInst *II = Invokes[i];
+ SplitCriticalEdge(II, 0, this);
+ SplitCriticalEdge(II, 1, this);
+ assert(!isa<PHINode>(II->getNormalDest()) &&
+ !isa<PHINode>(II->getUnwindDest()) &&
+ "critical edge splitting left single entry phi nodes?");
+ }
+
+ Function *F = Invokes.back()->getParent()->getParent();
+
+ // To avoid having to handle incoming arguments specially, we lower each arg
+ // to a copy instruction in the entry block. This ensures that the argument
+ // value itself cannot be live across the entry block.
+ BasicBlock::iterator AfterAllocaInsertPt = F->begin()->begin();
+ while (isa<AllocaInst>(AfterAllocaInsertPt) &&
+ isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsertPt)->getArraySize()))
+ ++AfterAllocaInsertPt;
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+ AI != E; ++AI) {
+ // This is always a no-op cast because we're casting AI to AI->getType() so
+ // src and destination types are identical. BitCast is the only possibility.
+ CastInst *NC = new BitCastInst(
+ AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt);
+ AI->replaceAllUsesWith(NC);
+    // Normally it is forbidden to replace a CastInst's operand because it
+ // could cause the opcode to reflect an illegal conversion. However, we're
+ // replacing it here with the same value it was constructed with to simply
+ // make NC its user.
+ NC->setOperand(0, AI);
+ }
+
+ // Finally, scan the code looking for instructions with bad live ranges.
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+ // Ignore obvious cases we don't have to handle. In particular, most
+ // instructions either have no uses or only have a single use inside the
+ // current block. Ignore them quickly.
+ Instruction *Inst = II;
+ if (Inst->use_empty()) continue;
+ if (Inst->hasOneUse() &&
+ cast<Instruction>(Inst->use_back())->getParent() == BB &&
+ !isa<PHINode>(Inst->use_back())) continue;
+
+ // If this is an alloca in the entry block, it's not a real register
+ // value.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+ if (isa<ConstantInt>(AI->getArraySize()) && BB == F->begin())
+ continue;
+
+ // Avoid iterator invalidation by copying users to a temporary vector.
+ std::vector<Instruction*> Users;
+ for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+ UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (User->getParent() != BB || isa<PHINode>(User))
+ Users.push_back(User);
+ }
+
+ // Scan all of the uses and see if the live range is live across an unwind
+ // edge. If we find a use live across an invoke edge, create an alloca
+ // and spill the value.
+ std::set<InvokeInst*> InvokesWithStoreInserted;
+
+ // Find all of the blocks that this value is live in.
+ std::set<BasicBlock*> LiveBBs;
+ LiveBBs.insert(Inst->getParent());
+ while (!Users.empty()) {
+ Instruction *U = Users.back();
+ Users.pop_back();
+
+ if (!isa<PHINode>(U)) {
+ MarkBlocksLiveIn(U->getParent(), LiveBBs);
+ } else {
+ // Uses for a PHI node occur in their predecessor block.
+ PHINode *PN = cast<PHINode>(U);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == Inst)
+ MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
+ }
+ }
+
+ // Now that we know all of the blocks that this thing is live in, see if
+ // it includes any of the unwind locations.
+ bool NeedsSpill = false;
+ for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+ BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
+ if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
+ NeedsSpill = true;
+ }
+ }
+
+ // If we decided we need a spill, do it.
+ if (NeedsSpill) {
+ ++NumSpilled;
+ DemoteRegToStack(*Inst, true);
+ }
+ }
+}
+
+bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
+ std::vector<ReturnInst*> Returns;
+ std::vector<UnwindInst*> Unwinds;
+ std::vector<InvokeInst*> Invokes;
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ // Remember all return instructions in case we insert an invoke into this
+ // function.
+ Returns.push_back(RI);
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+ Invokes.push_back(II);
+ } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+ Unwinds.push_back(UI);
+ }
+
+ if (Unwinds.empty() && Invokes.empty()) return false;
+
+ NumInvokes += Invokes.size();
+ NumUnwinds += Unwinds.size();
+
+ // TODO: This is not an optimal way to do this. In particular, this always
+ // inserts setjmp calls into the entries of functions with invoke instructions
+ // even though there are possibly paths through the function that do not
+ // execute any invokes. In particular, for functions with early exits, e.g.
+ // the 'addMove' method in hexxagon, it would be nice to not have to do the
+ // setjmp stuff on the early exit path. This requires a bit of dataflow, but
+ // would not be too hard to do.
+
+ // If we have an invoke instruction, insert a setjmp that dominates all
+ // invokes. After the setjmp, use a cond branch that goes to the original
+  // code path on zero, and to a designated 'catch' block on nonzero.
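+  //
+  // The rewritten entry block then ends, roughly, like this:
+  //
+  //   %sjret = call i32 @llvm.setjmp(i8* %buf)
+  //   %notunwind = icmp eq i32 %sjret, 0
+  //   br i1 %notunwind, label %setjmp.cont, label %setjmp.catch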
+ Value *OldJmpBufPtr = 0;
+ if (!Invokes.empty()) {
+ // First thing we need to do is scan the whole function for values that are
+ // live across unwind edges. Each value that is live across an unwind edge
+ // we spill into a stack location, guaranteeing that there is nothing live
+ // across the unwind edge. This process also splits all critical edges
+    // coming out of invokes.
+ splitLiveRangesLiveAcrossInvokes(Invokes);
+
+ BasicBlock *EntryBB = F.begin();
+
+ // Create an alloca for the incoming jump buffer ptr and the new jump buffer
+ // that needs to be restored on all exits from the function. This is an
+ // alloca because the value needs to be live across invokes.
+ unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0;
+ AllocaInst *JmpBuf =
+ new AllocaInst(JBLinkTy, 0, Align, "jblink", F.begin()->begin());
+
+ std::vector<Value*> Idx;
+ Idx.push_back(Constant::getNullValue(Type::Int32Ty));
+ Idx.push_back(ConstantInt::get(Type::Int32Ty, 1));
+ OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx.begin(), Idx.end(),
+ "OldBuf", EntryBB->getTerminator());
+
+ // Copy the JBListHead to the alloca.
+ Value *OldBuf = new LoadInst(JBListHead, "oldjmpbufptr", true,
+ EntryBB->getTerminator());
+ new StoreInst(OldBuf, OldJmpBufPtr, true, EntryBB->getTerminator());
+
+ // Add the new jumpbuf to the list.
+ new StoreInst(JmpBuf, JBListHead, true, EntryBB->getTerminator());
+
+ // Create the catch block. The catch block is basically a big switch
+ // statement that goes to all of the invoke catch blocks.
+ BasicBlock *CatchBB = BasicBlock::Create("setjmp.catch", &F);
+
+ // Create an alloca which keeps track of which invoke is currently
+ // executing. For normal calls it contains zero.
+ AllocaInst *InvokeNum = new AllocaInst(Type::Int32Ty, 0, "invokenum",
+ EntryBB->begin());
+ new StoreInst(ConstantInt::get(Type::Int32Ty, 0), InvokeNum, true,
+ EntryBB->getTerminator());
+
+ // Insert a load in the Catch block, and a switch on its value. By default,
+ // we go to a block that just does an unwind (which is the correct action
+ // for a standard call).
+ BasicBlock *UnwindBB = BasicBlock::Create("unwindbb", &F);
+ Unwinds.push_back(new UnwindInst(UnwindBB));
+
+ Value *CatchLoad = new LoadInst(InvokeNum, "invoke.num", true, CatchBB);
+ SwitchInst *CatchSwitch =
+ SwitchInst::Create(CatchLoad, UnwindBB, Invokes.size(), CatchBB);
+
+ // Now that things are set up, insert the setjmp call itself.
+
+ // Split the entry block to insert the conditional branch for the setjmp.
+ BasicBlock *ContBlock = EntryBB->splitBasicBlock(EntryBB->getTerminator(),
+ "setjmp.cont");
+
+ Idx[1] = ConstantInt::get(Type::Int32Ty, 0);
+ Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx.begin(), Idx.end(),
+ "TheJmpBuf",
+ EntryBB->getTerminator());
+ JmpBufPtr = new BitCastInst(JmpBufPtr, PointerType::getUnqual(Type::Int8Ty),
+ "tmp", EntryBB->getTerminator());
+ Value *SJRet = CallInst::Create(SetJmpFn, JmpBufPtr, "sjret",
+ EntryBB->getTerminator());
+
+ // Compare the return value to zero.
+ Value *IsNormal = new ICmpInst(ICmpInst::ICMP_EQ, SJRet,
+ Constant::getNullValue(SJRet->getType()),
+ "notunwind", EntryBB->getTerminator());
+ // Nuke the uncond branch.
+ EntryBB->getTerminator()->eraseFromParent();
+
+ // Put in a new condbranch in its place.
+ BranchInst::Create(ContBlock, CatchBB, IsNormal, EntryBB);
+
+ // At this point, we are all set up, rewrite each invoke instruction.
+ for (unsigned i = 0, e = Invokes.size(); i != e; ++i)
+ rewriteExpensiveInvoke(Invokes[i], i+1, InvokeNum, CatchSwitch);
+ }
+
+ // We know that there is at least one unwind.
+
+  // Create three new blocks: one to load the jmpbuf ptr and compare it
+  // against null, one to do the longjmp, and an error block for the case
+  // where it is null. Add them at the end of the function because they are
+  // not hot.
+ BasicBlock *UnwindHandler = BasicBlock::Create("dounwind", &F);
+ BasicBlock *UnwindBlock = BasicBlock::Create("unwind", &F);
+ BasicBlock *TermBlock = BasicBlock::Create("unwinderror", &F);
+
+ // If this function contains an invoke, restore the old jumpbuf ptr.
+ Value *BufPtr;
+ if (OldJmpBufPtr) {
+ // Before the return, insert a copy from the saved value to the new value.
+ BufPtr = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", UnwindHandler);
+ new StoreInst(BufPtr, JBListHead, UnwindHandler);
+ } else {
+ BufPtr = new LoadInst(JBListHead, "ehlist", UnwindHandler);
+ }
+
+  // Load the JBList; if it's null, then there was no catch!
+ Value *NotNull = new ICmpInst(ICmpInst::ICMP_NE, BufPtr,
+ Constant::getNullValue(BufPtr->getType()),
+ "notnull", UnwindHandler);
+ BranchInst::Create(UnwindBlock, TermBlock, NotNull, UnwindHandler);
+
+ // Create the block to do the longjmp.
+ // Get a pointer to the jmpbuf and longjmp.
+ std::vector<Value*> Idx;
+ Idx.push_back(Constant::getNullValue(Type::Int32Ty));
+ Idx.push_back(ConstantInt::get(Type::Int32Ty, 0));
+ Idx[0] = GetElementPtrInst::Create(BufPtr, Idx.begin(), Idx.end(), "JmpBuf",
+ UnwindBlock);
+ Idx[0] = new BitCastInst(Idx[0], PointerType::getUnqual(Type::Int8Ty),
+ "tmp", UnwindBlock);
+ Idx[1] = ConstantInt::get(Type::Int32Ty, 1);
+ CallInst::Create(LongJmpFn, Idx.begin(), Idx.end(), "", UnwindBlock);
+ new UnreachableInst(UnwindBlock);
+
+ // Set up the term block ("throw without a catch").
+ new UnreachableInst(TermBlock);
+
+ // Insert a new call to write(2, AbortMessage, AbortMessageLength);
+ writeAbortMessage(TermBlock->getTerminator());
+
+ // Insert a call to abort()
+ CallInst::Create(AbortFn, "",
+ TermBlock->getTerminator())->setTailCall();
+
+
+ // Replace all unwinds with a branch to the unwind handler.
+ for (unsigned i = 0, e = Unwinds.size(); i != e; ++i) {
+ BranchInst::Create(UnwindHandler, Unwinds[i]);
+ Unwinds[i]->eraseFromParent();
+ }
+
+ // Finally, for any returns from this function, if this function contains an
+ // invoke, restore the old jmpbuf pointer to its input value.
+ if (OldJmpBufPtr) {
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *R = Returns[i];
+
+ // Before the return, insert a copy from the saved value to the new value.
+ Value *OldBuf = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", true, R);
+ new StoreInst(OldBuf, JBListHead, true, R);
+ }
+ }
+
+ return true;
+}
+
+bool LowerInvoke::runOnFunction(Function &F) {
+ if (ExpensiveEHSupport)
+ return insertExpensiveEHSupport(F);
+ else
+ return insertCheapEHSupport(F);
+}
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
new file mode 100644
index 0000000..1da5936
--- /dev/null
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -0,0 +1,323 @@
+//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerSwitch transformation rewrites switch instructions with a sequence
+// of branches, which allows targets to get away with not implementing the
+// switch instruction until it is convenient.
+//
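+// For example, a switch such as:
+//
+//   switch i32 %x, label %default [ i32 0, label %a
+//                                   i32 1, label %b
+//                                   i32 9, label %c ]
+//
+// becomes, roughly, a balanced tree of comparisons:
+//
+//   NodeBlock:
+//     %Pivot = icmp slt i32 %x, 1
+//     br i1 %Pivot, label %LeafBlock, label %NodeBlock1
+//
+// where each leaf block tests one case value (or range) and otherwise
+// branches to the default destination.
+//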
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+ /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch
+ /// instructions. Note that this cannot be a BasicBlock pass because it
+ /// modifies the CFG!
+ class VISIBILITY_HIDDEN LowerSwitch : public FunctionPass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ LowerSwitch() : FunctionPass(&ID) {}
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ // This is a cluster of orthogonal Transforms
+ AU.addPreserved<UnifyFunctionExitNodes>();
+ AU.addPreservedID(PromoteMemoryToRegisterID);
+ AU.addPreservedID(LowerInvokePassID);
+ AU.addPreservedID(LowerAllocationsID);
+ }
+
+ struct CaseRange {
+ Constant* Low;
+ Constant* High;
+ BasicBlock* BB;
+
+ CaseRange() : Low(0), High(0), BB(0) { }
+ CaseRange(Constant* low, Constant* high, BasicBlock* bb) :
+ Low(low), High(high), BB(bb) { }
+ };
+
+ typedef std::vector<CaseRange> CaseVector;
+ typedef std::vector<CaseRange>::iterator CaseItr;
+ private:
+ void processSwitchInst(SwitchInst *SI);
+
+ BasicBlock* switchConvert(CaseItr Begin, CaseItr End, Value* Val,
+ BasicBlock* OrigBlock, BasicBlock* Default);
+ BasicBlock* newLeafBlock(CaseRange& Leaf, Value* Val,
+ BasicBlock* OrigBlock, BasicBlock* Default);
+ unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
+ };
+
+ /// The comparison function for sorting the switch case values in the vector.
+ /// WARNING: Case ranges should be disjoint!
+ struct CaseCmp {
+ bool operator () (const LowerSwitch::CaseRange& C1,
+ const LowerSwitch::CaseRange& C2) {
+
+ const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+ const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+ return CI1->getValue().slt(CI2->getValue());
+ }
+ };
+}
+
+char LowerSwitch::ID = 0;
+static RegisterPass<LowerSwitch>
+X("lowerswitch", "Lower SwitchInst's to branches");
+
+// Publicly exposed interface to pass...
+const PassInfo *const llvm::LowerSwitchID = &X;
+// createLowerSwitchPass - Interface to this file...
+FunctionPass *llvm::createLowerSwitchPass() {
+ return new LowerSwitch();
+}
+
+bool LowerSwitch::runOnFunction(Function &F) {
+ bool Changed = false;
+
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+ BasicBlock *Cur = I++; // Advance over block so we don't traverse new blocks
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) {
+ Changed = true;
+ processSwitchInst(SI);
+ }
+ }
+
+ return Changed;
+}
+
+// operator<< - Used for debugging purposes.
+//
+static std::ostream& operator<<(std::ostream &O,
+ const LowerSwitch::CaseVector &C) {
+ O << "[";
+
+ for (LowerSwitch::CaseVector::const_iterator B = C.begin(),
+ E = C.end(); B != E; ) {
+ O << *B->Low << " -" << *B->High;
+ if (++B != E) O << ", ";
+ }
+
+ return O << "]";
+}
+
+static OStream& operator<<(OStream &O, const LowerSwitch::CaseVector &C) {
+ if (O.stream()) *O.stream() << C;
+ return O;
+}
+
+// switchConvert - Convert the switch statement into a binary lookup of
+// the case values. The function recursively builds this tree.
+//
+BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
+ Value* Val, BasicBlock* OrigBlock,
+ BasicBlock* Default)
+{
+ unsigned Size = End - Begin;
+
+ if (Size == 1)
+ return newLeafBlock(*Begin, Val, OrigBlock, Default);
+
+ unsigned Mid = Size / 2;
+ std::vector<CaseRange> LHS(Begin, Begin + Mid);
+ DOUT << "LHS: " << LHS << "\n";
+ std::vector<CaseRange> RHS(Begin + Mid, End);
+ DOUT << "RHS: " << RHS << "\n";
+
+ CaseRange& Pivot = *(Begin + Mid);
+ DEBUG(errs() << "Pivot ==> "
+ << cast<ConstantInt>(Pivot.Low)->getValue() << " -"
+ << cast<ConstantInt>(Pivot.High)->getValue() << "\n");
+
+ BasicBlock* LBranch = switchConvert(LHS.begin(), LHS.end(), Val,
+ OrigBlock, Default);
+ BasicBlock* RBranch = switchConvert(RHS.begin(), RHS.end(), Val,
+ OrigBlock, Default);
+
+  // Create a new node that checks if the value is < pivot. Go to the
+  // left branch if so, and to the right branch if not.
+ Function* F = OrigBlock->getParent();
+ BasicBlock* NewNode = BasicBlock::Create("NodeBlock");
+ Function::iterator FI = OrigBlock;
+ F->getBasicBlockList().insert(++FI, NewNode);
+
+ ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT, Val, Pivot.Low, "Pivot");
+ NewNode->getInstList().push_back(Comp);
+ BranchInst::Create(LBranch, RBranch, Comp, NewNode);
+ return NewNode;
+}
+
+// newLeafBlock - Create a new leaf block for the binary lookup tree. It
+// checks if the switch's value == the case's value. If not, then it
+// jumps to the default branch. At this point in the tree, the value
+// can't be another valid case value, so the jump to the "default" branch
+// is warranted.
+//
+BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
+ BasicBlock* OrigBlock,
+ BasicBlock* Default)
+{
+ Function* F = OrigBlock->getParent();
+ BasicBlock* NewLeaf = BasicBlock::Create("LeafBlock");
+ Function::iterator FI = OrigBlock;
+ F->getBasicBlockList().insert(++FI, NewLeaf);
+
+ // Emit comparison
+ ICmpInst* Comp = NULL;
+ if (Leaf.Low == Leaf.High) {
+ // Make the seteq instruction...
+ Comp = new ICmpInst(ICmpInst::ICMP_EQ, Val, Leaf.Low,
+ "SwitchLeaf", NewLeaf);
+ } else {
+ // Make range comparison
+ if (cast<ConstantInt>(Leaf.Low)->isMinValue(true /*isSigned*/)) {
+ // Val >= Min && Val <= Hi --> Val <= Hi
+ Comp = new ICmpInst(ICmpInst::ICMP_SLE, Val, Leaf.High,
+ "SwitchLeaf", NewLeaf);
+ } else if (cast<ConstantInt>(Leaf.Low)->isZero()) {
+ // Val >= 0 && Val <= Hi --> Val <=u Hi
+ Comp = new ICmpInst(ICmpInst::ICMP_ULE, Val, Leaf.High,
+ "SwitchLeaf", NewLeaf);
+ } else {
+ // Emit V-Lo <=u Hi-Lo
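+      // For example, the range [5, 9] becomes (Val - 5) <=u 4, which holds
+      // exactly when 5 <= Val <= 9.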
+ Constant* NegLo = ConstantExpr::getNeg(Leaf.Low);
+ Instruction* Add = BinaryOperator::CreateAdd(Val, NegLo,
+ Val->getName()+".off",
+ NewLeaf);
+ Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High);
+ Comp = new ICmpInst(ICmpInst::ICMP_ULE, Add, UpperBound,
+ "SwitchLeaf", NewLeaf);
+ }
+ }
+
+ // Make the conditional branch...
+ BasicBlock* Succ = Leaf.BB;
+ BranchInst::Create(Succ, Default, Comp, NewLeaf);
+
+ // If there were any PHI nodes in this successor, rewrite one entry
+ // from OrigBlock to come from NewLeaf.
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode* PN = cast<PHINode>(I);
+ // Remove all but one incoming entries from the cluster
+ uint64_t Range = cast<ConstantInt>(Leaf.High)->getSExtValue() -
+ cast<ConstantInt>(Leaf.Low)->getSExtValue();
+ for (uint64_t j = 0; j < Range; ++j) {
+ PN->removeIncomingValue(OrigBlock);
+ }
+
+ int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+ assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+ PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf);
+ }
+
+ return NewLeaf;
+}
+
+// Clusterify - Transform the simple list of cases into a list of CaseRange's
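+// For example, the cases {0 -> %a, 1 -> %a, 2 -> %a, 5 -> %b} cluster into
+// the ranges [0, 2] -> %a and [5, 5] -> %b, needing three compares (a range
+// costs two) instead of four.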
+unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
+ unsigned numCmps = 0;
+
+ // Start with "simple" cases
+ for (unsigned i = 1; i < SI->getNumSuccessors(); ++i)
+ Cases.push_back(CaseRange(SI->getSuccessorValue(i),
+ SI->getSuccessorValue(i),
+ SI->getSuccessor(i)));
+ std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge cases into clusters
+ if (Cases.size()>=2)
+ for (CaseItr I=Cases.begin(), J=next(Cases.begin()); J!=Cases.end(); ) {
+ int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
+ int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+ BasicBlock* nextBB = J->BB;
+ BasicBlock* currentBB = I->BB;
+
+ // If the two neighboring cases go to the same destination, merge them
+ // into a single case.
+ if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
+ I->High = J->High;
+ J = Cases.erase(J);
+ } else {
+ I = J++;
+ }
+ }
+
+ for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+ if (I->Low != I->High)
+ // A range counts double, since it requires two compares.
+ ++numCmps;
+ }
+
+ return numCmps;
+}
+
+// processSwitchInst - Replace the specified switch instruction with a sequence
+// of chained if-then insts in a balanced binary search.
+//
+void LowerSwitch::processSwitchInst(SwitchInst *SI) {
+ BasicBlock *CurBlock = SI->getParent();
+ BasicBlock *OrigBlock = CurBlock;
+ Function *F = CurBlock->getParent();
+ Value *Val = SI->getOperand(0); // The value we are switching on...
+ BasicBlock* Default = SI->getDefaultDest();
+
+ // If there is only the default destination, don't bother with the code below.
+ if (SI->getNumOperands() == 2) {
+ BranchInst::Create(SI->getDefaultDest(), CurBlock);
+ CurBlock->getInstList().erase(SI);
+ return;
+ }
+
+  // Create a new, empty default block so that the new hierarchy of
+  // if-then statements goes to it and the PHI nodes are happy.
+ BasicBlock* NewDefault = BasicBlock::Create("NewDefault");
+ F->getBasicBlockList().insert(Default, NewDefault);
+
+ BranchInst::Create(Default, NewDefault);
+
+ // If there is an entry in any PHI nodes for the default edge, make sure
+ // to update them as well.
+ for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+ assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+ PN->setIncomingBlock((unsigned)BlockIdx, NewDefault);
+ }
+
+ // Prepare cases vector.
+ CaseVector Cases;
+ unsigned numCmps = Clusterify(Cases, SI);
+
+ DOUT << "Clusterify finished. Total clusters: " << Cases.size()
+ << ". Total compares: " << numCmps << "\n";
+ DOUT << "Cases: " << Cases << "\n";
+
+ BasicBlock* SwitchBlock = switchConvert(Cases.begin(), Cases.end(), Val,
+ OrigBlock, NewDefault);
+
+ // Branch to our shiny new if-then stuff...
+ BranchInst::Create(SwitchBlock, OrigBlock);
+
+ // We are now done with the switch instruction, delete it.
+ CurBlock->getInstList().erase(SI);
+}
diff --git a/lib/Transforms/Utils/Makefile b/lib/Transforms/Utils/Makefile
new file mode 100644
index 0000000..d1e9336
--- /dev/null
+++ b/lib/Transforms/Utils/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Utils/Makefile -----------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMTransformUtils
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
new file mode 100644
index 0000000..2b06d77
--- /dev/null
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -0,0 +1,92 @@
+//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is a simple pass wrapper around the PromoteMemToReg function call
+// exposed by the Utils library.
+//
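+// A minimal usage sketch (assuming an llvm::Module &M is in scope; the pass
+// manager schedules the DominatorTree/DominanceFrontier analyses this pass
+// requires):
+//
+//   PassManager PM;
+//   PM.add(createPromoteMemoryToRegisterPass());
+//   PM.run(M);
+//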
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mem2reg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumPromoted, "Number of alloca's promoted");
+
+namespace {
+ struct VISIBILITY_HIDDEN PromotePass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ PromotePass() : FunctionPass(&ID) {}
+
+ // runOnFunction - To run this pass, first we calculate the alloca
+ // instructions that are safe for promotion, then we promote each one.
+ //
+ virtual bool runOnFunction(Function &F);
+
+ // getAnalysisUsage - We need dominance frontiers
+ //
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<DominanceFrontier>();
+ AU.setPreservesCFG();
+ // This is a cluster of orthogonal Transforms
+ AU.addPreserved<UnifyFunctionExitNodes>();
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreservedID(LowerInvokePassID);
+ AU.addPreservedID(LowerAllocationsID);
+ }
+ };
+} // end of anonymous namespace
+
+char PromotePass::ID = 0;
+static RegisterPass<PromotePass> X("mem2reg", "Promote Memory to Register");
+
+bool PromotePass::runOnFunction(Function &F) {
+ std::vector<AllocaInst*> Allocas;
+
+ BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
+
+ bool Changed = false;
+
+ DominatorTree &DT = getAnalysis<DominatorTree>();
+ DominanceFrontier &DF = getAnalysis<DominanceFrontier>();
+
+ while (1) {
+ Allocas.clear();
+
+ // Find allocas that are safe to promote, by looking at all instructions in
+ // the entry node
+ for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
+ if (isAllocaPromotable(AI))
+ Allocas.push_back(AI);
+
+ if (Allocas.empty()) break;
+
+ PromoteMemToReg(Allocas, DT, DF);
+ NumPromoted += Allocas.size();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+// Publicly exposed interface to pass...
+const PassInfo *const llvm::PromoteMemoryToRegisterID = &X;
+// createPromoteMemoryToRegister - Provide an entry point to create this pass.
+//
+FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
+ return new PromotePass();
+}
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
new file mode 100644
index 0000000..b717699
--- /dev/null
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -0,0 +1,1003 @@
+//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file promotes memory references to be register references. It promotes
+// alloca instructions which only have loads and stores as uses. An alloca is
+// transformed by using dominator frontiers to place PHI nodes, then traversing
+// the function in depth-first order to rewrite loads and stores as appropriate.
+// This is just the standard SSA construction algorithm to construct "pruned"
+// SSA form.
+//
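+// For example, an alloca whose stores and loads straddle a CFG diamond:
+//
+//   %x = alloca i32
+//   store i32 1, i32* %x    ; in block %then
+//   store i32 2, i32* %x    ; in block %else
+//   %v = load i32* %x       ; in block %merge
+//
+// is rewritten, roughly, so that the load becomes a PHI node at the join
+// point and the alloca and stores disappear:
+//
+//   %v = phi i32 [ 1, %then ], [ 2, %else ]
+//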
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mem2reg"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
+STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
+STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
+STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
+
+// Provide DenseMapInfo for all pointers.
+namespace llvm {
+template<>
+struct DenseMapInfo<std::pair<BasicBlock*, unsigned> > {
+ typedef std::pair<BasicBlock*, unsigned> EltTy;
+ static inline EltTy getEmptyKey() {
+ return EltTy(reinterpret_cast<BasicBlock*>(-1), ~0U);
+ }
+ static inline EltTy getTombstoneKey() {
+ return EltTy(reinterpret_cast<BasicBlock*>(-2), 0U);
+ }
+ static unsigned getHashValue(const std::pair<BasicBlock*, unsigned> &Val) {
+ return DenseMapInfo<void*>::getHashValue(Val.first) + Val.second*2;
+ }
+ static bool isEqual(const EltTy &LHS, const EltTy &RHS) {
+ return LHS == RHS;
+ }
+ static bool isPod() { return true; }
+};
+}
+
+/// isAllocaPromotable - Return true if this alloca is legal for promotion.
+/// This is true if there are only loads and stores to the alloca.
+///
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+ // FIXME: If the memory unit is of pointer or integer type, we can permit
+ // assignments to subsections of the memory unit.
+
+ // Only allow direct and non-volatile loads and stores...
+ for (Value::use_const_iterator UI = AI->use_begin(), UE = AI->use_end();
+ UI != UE; ++UI) // Loop over all of the uses of the alloca
+ if (const LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+ if (LI->isVolatile())
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+ if (SI->getOperand(0) == AI)
+ return false; // Don't allow a store OF the AI, only INTO the AI.
+ if (SI->isVolatile())
+ return false;
+ } else if (const BitCastInst *BC = dyn_cast<BitCastInst>(*UI)) {
+ // A bitcast that does not feed into debug info inhibits promotion.
+ if (!BC->hasOneUse() || !isa<DbgInfoIntrinsic>(*BC->use_begin()))
+ return false;
+ // If the only use is by debug info, this alloca will not exist in
+ // non-debug code, so don't try to promote; this ensures the same
+ // codegen with debug info. Otherwise, debug info should not
+ // inhibit promotion (but we must examine other uses).
+ if (AI->hasOneUse())
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+namespace {
+ struct AllocaInfo;
+
+ // Data package used by RenamePass()
+ class VISIBILITY_HIDDEN RenamePassData {
+ public:
+ typedef std::vector<Value *> ValVector;
+
+ RenamePassData() {}
+ RenamePassData(BasicBlock *B, BasicBlock *P,
+ const ValVector &V) : BB(B), Pred(P), Values(V) {}
+ BasicBlock *BB;
+ BasicBlock *Pred;
+ ValVector Values;
+
+ void swap(RenamePassData &RHS) {
+ std::swap(BB, RHS.BB);
+ std::swap(Pred, RHS.Pred);
+ Values.swap(RHS.Values);
+ }
+ };
+
+ /// LargeBlockInfo - This assigns and keeps a per-bb relative ordering of
+ /// load/store instructions in the block that directly load or store an alloca.
+ ///
+ /// This functionality is important because it avoids scanning large basic
+ /// blocks multiple times when promoting many allocas in the same block.
+ class VISIBILITY_HIDDEN LargeBlockInfo {
+ /// InstNumbers - For each instruction that we track, keep the index of the
+ /// instruction. The index starts out as the number of the instruction from
+ /// the start of the block.
+ DenseMap<const Instruction *, unsigned> InstNumbers;
+ public:
+
+ /// isInterestingInstruction - This code only looks at accesses to allocas.
+ static bool isInterestingInstruction(const Instruction *I) {
+ return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+ (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+ }
+
+ /// getInstructionIndex - Get or calculate the index of the specified
+ /// instruction.
+ unsigned getInstructionIndex(const Instruction *I) {
+ assert(isInterestingInstruction(I) &&
+ "Not a load/store to/from an alloca?");
+
+ // If we already have this instruction number, return it.
+ DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+ if (It != InstNumbers.end()) return It->second;
+
+ // Scan the whole block to get the instruction. This accumulates
+ // information for every interesting instruction in the block, in order to
+      // avoid gratuitous rescans.
+ const BasicBlock *BB = I->getParent();
+ unsigned InstNo = 0;
+ for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end();
+ BBI != E; ++BBI)
+ if (isInterestingInstruction(BBI))
+ InstNumbers[BBI] = InstNo++;
+ It = InstNumbers.find(I);
+
+ assert(It != InstNumbers.end() && "Didn't insert instruction?");
+ return It->second;
+ }
+
+ void deleteValue(const Instruction *I) {
+ InstNumbers.erase(I);
+ }
+
+ void clear() {
+ InstNumbers.clear();
+ }
+ };
+
+ struct VISIBILITY_HIDDEN PromoteMem2Reg {
+ /// Allocas - The alloca instructions being promoted.
+ ///
+ std::vector<AllocaInst*> Allocas;
+ DominatorTree &DT;
+ DominanceFrontier &DF;
+
+ /// AST - An AliasSetTracker object to update. If null, don't update it.
+ ///
+ AliasSetTracker *AST;
+
+ /// AllocaLookup - Reverse mapping of Allocas.
+ ///
+ std::map<AllocaInst*, unsigned> AllocaLookup;
+
+ /// NewPhiNodes - The PhiNodes we're adding.
+ ///
+ DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*> NewPhiNodes;
+
+ /// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas
+ /// it corresponds to.
+ DenseMap<PHINode*, unsigned> PhiToAllocaMap;
+
+ /// PointerAllocaValues - If we are updating an AliasSetTracker, then for
+ /// each alloca that is of pointer type, we keep track of what to copyValue
+ /// to the inserted PHI nodes here.
+ ///
+ std::vector<Value*> PointerAllocaValues;
+
+ /// Visited - The set of basic blocks the renamer has already visited.
+ ///
+ SmallPtrSet<BasicBlock*, 16> Visited;
+
+ /// BBNumbers - Contains a stable numbering of basic blocks to avoid
+    /// non-deterministic behavior.
+ DenseMap<BasicBlock*, unsigned> BBNumbers;
+
+ /// BBNumPreds - Lazily compute the number of predecessors a block has.
+ DenseMap<const BasicBlock*, unsigned> BBNumPreds;
+ public:
+ PromoteMem2Reg(const std::vector<AllocaInst*> &A, DominatorTree &dt,
+ DominanceFrontier &df, AliasSetTracker *ast)
+ : Allocas(A), DT(dt), DF(df), AST(ast) {}
+
+ void run();
+
+ /// properlyDominates - Return true if I1 properly dominates I2.
+ ///
+ bool properlyDominates(Instruction *I1, Instruction *I2) const {
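+      // An invoke defines its result only along its normal edge, so for
+      // dominance purposes treat the definition as occurring at the start
+      // of the invoke's normal destination block.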
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I1))
+ I1 = II->getNormalDest()->begin();
+ return DT.properlyDominates(I1->getParent(), I2->getParent());
+ }
+
+ /// dominates - Return true if BB1 dominates BB2 using the DominatorTree.
+ ///
+ bool dominates(BasicBlock *BB1, BasicBlock *BB2) const {
+ return DT.dominates(BB1, BB2);
+ }
+
+ private:
+ void RemoveFromAllocasList(unsigned &AllocaIdx) {
+ Allocas[AllocaIdx] = Allocas.back();
+ Allocas.pop_back();
+ --AllocaIdx;
+ }
+
+ unsigned getNumPreds(const BasicBlock *BB) {
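+      // DenseMap value-initializes missing entries to zero, so the count is
+      // stored biased by one to tell "not yet computed" apart from a block
+      // with no predecessors.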
+ unsigned &NP = BBNumPreds[BB];
+ if (NP == 0)
+ NP = std::distance(pred_begin(BB), pred_end(BB))+1;
+ return NP-1;
+ }
+
+ void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
+ AllocaInfo &Info);
+ void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
+ const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
+ SmallPtrSet<BasicBlock*, 32> &LiveInBlocks);
+
+ void RewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
+ LargeBlockInfo &LBI);
+ void PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
+ LargeBlockInfo &LBI);
+
+
+ void RenamePass(BasicBlock *BB, BasicBlock *Pred,
+ RenamePassData::ValVector &IncVals,
+ std::vector<RenamePassData> &Worklist);
+ bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version,
+ SmallPtrSet<PHINode*, 16> &InsertedPHINodes);
+ };
+
+ struct AllocaInfo {
+ std::vector<BasicBlock*> DefiningBlocks;
+ std::vector<BasicBlock*> UsingBlocks;
+
+ StoreInst *OnlyStore;
+ BasicBlock *OnlyBlock;
+ bool OnlyUsedInOneBlock;
+
+ Value *AllocaPointerVal;
+
+ void clear() {
+ DefiningBlocks.clear();
+ UsingBlocks.clear();
+ OnlyStore = 0;
+ OnlyBlock = 0;
+ OnlyUsedInOneBlock = true;
+ AllocaPointerVal = 0;
+ }
+
+ /// AnalyzeAlloca - Scan the uses of the specified alloca, filling in our
+ /// ivars.
+ void AnalyzeAlloca(AllocaInst *AI) {
+ clear();
+
+ // As we scan the uses of the alloca instruction, keep track of stores,
+ // and decide whether all of the loads and stores to the alloca are within
+ // the same basic block.
+ for (Value::use_iterator U = AI->use_begin(), E = AI->use_end();
+ U != E;) {
+ Instruction *User = cast<Instruction>(*U);
+ ++U;
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+        // Remove any uses of this alloca in DbgInfoIntrinsics.
+ assert(BC->hasOneUse() && "Unexpected alloca uses!");
+ DbgInfoIntrinsic *DI = cast<DbgInfoIntrinsic>(*BC->use_begin());
+ DI->eraseFromParent();
+ BC->eraseFromParent();
+ continue;
+ }
+ else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ // Remember the basic blocks which define new values for the alloca
+ DefiningBlocks.push_back(SI->getParent());
+ AllocaPointerVal = SI->getOperand(0);
+ OnlyStore = SI;
+ } else {
+ LoadInst *LI = cast<LoadInst>(User);
+ // Otherwise it must be a load instruction, keep track of variable
+ // reads.
+ UsingBlocks.push_back(LI->getParent());
+ AllocaPointerVal = LI;
+ }
+
+ if (OnlyUsedInOneBlock) {
+ if (OnlyBlock == 0)
+ OnlyBlock = User->getParent();
+ else if (OnlyBlock != User->getParent())
+ OnlyUsedInOneBlock = false;
+ }
+ }
+ }
+ };
+} // end of anonymous namespace
+
+
+void PromoteMem2Reg::run() {
+ Function &F = *DF.getRoot()->getParent();
+
+ if (AST) PointerAllocaValues.resize(Allocas.size());
+
+ AllocaInfo Info;
+ LargeBlockInfo LBI;
+
+ for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
+ AllocaInst *AI = Allocas[AllocaNum];
+
+ assert(isAllocaPromotable(AI) &&
+ "Cannot promote non-promotable alloca!");
+ assert(AI->getParent()->getParent() == &F &&
+ "All allocas should be in the same function, which is the same as DF!");
+
+ if (AI->use_empty()) {
+ // If there are no uses of the alloca, just delete it now.
+ if (AST) AST->deleteValue(AI);
+ AI->eraseFromParent();
+
+ // Remove the alloca from the Allocas list, since it has been processed
+ RemoveFromAllocasList(AllocaNum);
+ ++NumDeadAlloca;
+ continue;
+ }
+
+ // Calculate the set of read and write locations for each alloca. This is
+ // analogous to finding the 'uses' and 'definitions' of each variable.
+ Info.AnalyzeAlloca(AI);
+
+ // If there is only a single store to this value, replace any loads of
+ // it that are directly dominated by the definition with the value stored.
+ if (Info.DefiningBlocks.size() == 1) {
+ RewriteSingleStoreAlloca(AI, Info, LBI);
+
+ // Finally, after the scan, check to see if the store is all that is left.
+ if (Info.UsingBlocks.empty()) {
+ // Remove the (now dead) store and alloca.
+ Info.OnlyStore->eraseFromParent();
+ LBI.deleteValue(Info.OnlyStore);
+
+ if (AST) AST->deleteValue(AI);
+ AI->eraseFromParent();
+ LBI.deleteValue(AI);
+
+ // The alloca has been processed, move on.
+ RemoveFromAllocasList(AllocaNum);
+
+ ++NumSingleStore;
+ continue;
+ }
+ }
+
+ // If the alloca is only read and written in one basic block, just perform a
+ // linear sweep over the block to eliminate it.
+ if (Info.OnlyUsedInOneBlock) {
+ PromoteSingleBlockAlloca(AI, Info, LBI);
+
+ // Finally, after the scan, check to see if the stores are all that is
+ // left.
+ if (Info.UsingBlocks.empty()) {
+
+ // Remove the (now dead) stores and alloca.
+ while (!AI->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(AI->use_back());
+ SI->eraseFromParent();
+ LBI.deleteValue(SI);
+ }
+
+ if (AST) AST->deleteValue(AI);
+ AI->eraseFromParent();
+ LBI.deleteValue(AI);
+
+ // The alloca has been processed, move on.
+ RemoveFromAllocasList(AllocaNum);
+
+ ++NumLocalPromoted;
+ continue;
+ }
+ }
+
+ // If we haven't computed a numbering for the BB's in the function, do so
+ // now.
+ if (BBNumbers.empty()) {
+ unsigned ID = 0;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ BBNumbers[I] = ID++;
+ }
+
+ // If we have an AST to keep updated, remember some pointer value that is
+ // stored into the alloca.
+ if (AST)
+ PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal;
+
+ // Keep the reverse mapping of the 'Allocas' array for the rename pass.
+ AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
+
+ // At this point, we're committed to promoting the alloca using IDF's, and
+ // the standard SSA construction algorithm. Determine which blocks need PHI
+ // nodes and see if we can optimize out some work by avoiding insertion of
+ // dead phi nodes.
+ DetermineInsertionPoint(AI, AllocaNum, Info);
+ }
+
+ if (Allocas.empty())
+ return; // All of the allocas must have been trivial!
+
+ LBI.clear();
+
+
+ // Set the incoming values for the basic block to be undef values for all of
+ // the allocas. We do this in case there is a load of a value that has not
+ // been stored yet; such a load will simply see this undef value.
+ //
+ RenamePassData::ValVector Values(Allocas.size());
+ for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+ Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+ // Walk all basic blocks in the function, performing the SSA rename algorithm
+ // and inserting the phi nodes we marked as necessary.
+ //
+ std::vector<RenamePassData> RenamePassWorkList;
+ RenamePassWorkList.push_back(RenamePassData(F.begin(), 0, Values));
+ while (!RenamePassWorkList.empty()) {
+ RenamePassData RPD;
+ RPD.swap(RenamePassWorkList.back());
+ RenamePassWorkList.pop_back();
+ // RenamePass may add new worklist entries.
+ RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList);
+ }
+
+ // The renamer uses the Visited set to avoid infinite loops. Clear it now.
+ Visited.clear();
+
+ // Remove the allocas themselves from the function.
+ for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
+ Instruction *A = Allocas[i];
+
+ // If there are any uses of the alloca instructions left, they must be in
+ // sections of dead code that were not processed on the dominance frontier.
+ // Just delete the users now.
+ //
+ if (!A->use_empty())
+ A->replaceAllUsesWith(UndefValue::get(A->getType()));
+ if (AST) AST->deleteValue(A);
+ A->eraseFromParent();
+ }
+
+
+ // Loop over all of the PHI nodes and see if there are any that we can get
+ // rid of because they merge all of the same incoming values. This can
+ // happen due to undef values coming into the PHI nodes. This process is
+ // iterative, because eliminating one PHI node can cause others to be removed.
+ bool EliminatedAPHI = true;
+ while (EliminatedAPHI) {
+ EliminatedAPHI = false;
+
+ for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I =
+ NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) {
+ PHINode *PN = I->second;
+
+ // If this PHI node merges one value and/or undefs, get the value.
+ if (Value *V = PN->hasConstantValue(true)) {
+ if (!isa<Instruction>(V) ||
+ properlyDominates(cast<Instruction>(V), PN)) {
+ if (AST && isa<PointerType>(PN->getType()))
+ AST->deleteValue(PN);
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ NewPhiNodes.erase(I++);
+ EliminatedAPHI = true;
+ continue;
+ }
+ }
+ ++I;
+ }
+ }
+
+ // At this point, the renamer has added entries to PHI nodes for all reachable
+ // code. Unfortunately, there may be unreachable blocks which the renamer
+ // hasn't traversed. If this is the case, the PHI nodes may not
+ // have incoming values for all predecessors. Loop over all PHI nodes we have
+ // created, inserting undef values if they are missing any incoming values.
+ //
+ for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I =
+ NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) {
+ // We want to do this once per basic block. As such, only process a block
+ // when we find the PHI that is the first entry in the block.
+ PHINode *SomePHI = I->second;
+ BasicBlock *BB = SomePHI->getParent();
+ if (&BB->front() != SomePHI)
+ continue;
+
+ // Only do work here if the PHI nodes are missing incoming values. We
+ // know that all PHI nodes that were inserted in a block will have the same
+ // number of incoming values, so we can just check any of them.
+ if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
+ continue;
+
+ // Get the preds for BB.
+ SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
+
+ // Ok, now we know that all of the PHI nodes are missing entries for some
+ // basic blocks. Start by sorting the incoming predecessors for efficient
+ // access.
+ std::sort(Preds.begin(), Preds.end());
+
+ // Now we loop through all BB's which have entries in SomePHI and remove
+ // them from the Preds list.
+ for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
+ // Do a log(n) search of the Preds list for the entry we want.
+ SmallVector<BasicBlock*, 16>::iterator EntIt =
+ std::lower_bound(Preds.begin(), Preds.end(),
+ SomePHI->getIncomingBlock(i));
+ assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i)&&
+ "PHI node has entry for a block which is not a predecessor!");
+
+ // Remove the entry
+ Preds.erase(EntIt);
+ }
+
+ // At this point, the blocks left in the preds list must have dummy
+ // entries inserted into every PHI node in the block. Update all the phi
+ // nodes in this block that we are inserting (there could be phis before
+ // mem2reg runs).
+ unsigned NumBadPreds = SomePHI->getNumIncomingValues();
+ BasicBlock::iterator BBI = BB->begin();
+ while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
+ SomePHI->getNumIncomingValues() == NumBadPreds) {
+ Value *UndefVal = UndefValue::get(SomePHI->getType());
+ for (unsigned pred = 0, e = Preds.size(); pred != e; ++pred)
+ SomePHI->addIncoming(UndefVal, Preds[pred]);
+ }
+ }
+
+ NewPhiNodes.clear();
+}
+
+
+/// ComputeLiveInBlocks - Determine which blocks the value is live in. These
+/// are blocks which lead to uses. Knowing this allows us to avoid inserting
+/// PHI nodes into blocks which don't lead to uses (thus, the inserted phi nodes
+/// would be dead).
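+///
+/// For example (hypothetical CFG): if the alloca is stored in block A and
+/// only loaded in block C, with edges A->B->C and a side path A->D that never
+/// reaches the load, then B and C are live-in but D is not, so no PHI node is
+/// needed for frontiers that only cover D.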
+void PromoteMem2Reg::
+ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
+ const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
+ SmallPtrSet<BasicBlock*, 32> &LiveInBlocks) {
+
+ // To determine liveness, we must iterate through the predecessors of blocks
+ // where the def is live. Blocks are added to the worklist if we need to
+ // check their predecessors. Start with all the using blocks.
+ SmallVector<BasicBlock*, 64> LiveInBlockWorklist;
+ LiveInBlockWorklist.insert(LiveInBlockWorklist.end(),
+ Info.UsingBlocks.begin(), Info.UsingBlocks.end());
+
+ // If any of the using blocks is also a definition block, check to see if the
+ // definition occurs before or after the use. If it happens before the use,
+ // the value isn't really live-in.
+ for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
+ BasicBlock *BB = LiveInBlockWorklist[i];
+ if (!DefBlocks.count(BB)) continue;
+
+ // Okay, this is a block that both uses and defines the value. If the first
+ // reference to the alloca is a def (store), then we know it isn't live-in.
+ for (BasicBlock::iterator I = BB->begin(); ; ++I) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getOperand(1) != AI) continue;
+
+ // We found a store to the alloca before a load. The alloca is not
+ // actually live-in here.
+ LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
+ LiveInBlockWorklist.pop_back();
+ --i, --e;
+ break;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (LI->getOperand(0) != AI) continue;
+
+ // Okay, we found a load before a store to the alloca. It is actually
+ // live into this block.
+ break;
+ }
+ }
+ }
+
+ // Now that we have a set of blocks where the phi is live-in, recursively add
+ // their predecessors until we find the full region where the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live in here, insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB))
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+ // live into it too. Add the preds to the worklist unless they are a
+ // defining block.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *P = *PI;
+
+ // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P))
+ continue;
+
+ // Otherwise it is, add to the worklist.
+ LiveInBlockWorklist.push_back(P);
+ }
+ }
+}
+
+/// DetermineInsertionPoint - At this point, we're committed to promoting the
+/// alloca using IDF's, and the standard SSA construction algorithm. Determine
+/// which blocks need phi nodes and see if we can optimize out some work by
+/// avoiding insertion of dead phi nodes.
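+///
+/// Sketch of the iterated step (hypothetical CFG): if the alloca is stored in
+/// blocks B1 and B2 whose dominance frontier contains J, a PHI node is queued
+/// in J; since J now defines the value as well, J's own frontier is processed
+/// in turn, until no new PHI blocks are discovered.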
+void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
+ AllocaInfo &Info) {
+
+ // Unique the set of defining blocks for efficient lookup.
+ SmallPtrSet<BasicBlock*, 32> DefBlocks;
+ DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end());
+
+ // Determine which blocks the value is live in. These are blocks which lead
+ // to uses.
+ SmallPtrSet<BasicBlock*, 32> LiveInBlocks;
+ ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
+
+ // Compute the locations where PhiNodes need to be inserted. Look at the
+ // dominance frontier of EACH basic-block we have a write in.
+ unsigned CurrentVersion = 0;
+ SmallPtrSet<PHINode*, 16> InsertedPHINodes;
+ std::vector<std::pair<unsigned, BasicBlock*> > DFBlocks;
+ while (!Info.DefiningBlocks.empty()) {
+ BasicBlock *BB = Info.DefiningBlocks.back();
+ Info.DefiningBlocks.pop_back();
+
+ // Look up the DF for this write, add it to defining blocks.
+ DominanceFrontier::const_iterator it = DF.find(BB);
+ if (it == DF.end()) continue;
+
+ const DominanceFrontier::DomSetType &S = it->second;
+
+ // In theory we don't need the indirection through the DFBlocks vector.
+ // In practice, the order of calling QueuePhiNode would depend on the
+ // (unspecified) ordering of basic blocks in the dominance frontier,
+ // which would give PHI nodes non-deterministic subscripts. Fix this by
+ // processing blocks in order of their occurrence in the function.
+ for (DominanceFrontier::DomSetType::const_iterator P = S.begin(),
+ PE = S.end(); P != PE; ++P) {
+ // If the frontier block is not in the live-in set for the alloca, don't
+ // bother processing it.
+ if (!LiveInBlocks.count(*P))
+ continue;
+
+ DFBlocks.push_back(std::make_pair(BBNumbers[*P], *P));
+ }
+
+ // Sort by the block ordering in the function.
+ if (DFBlocks.size() > 1)
+ std::sort(DFBlocks.begin(), DFBlocks.end());
+
+ for (unsigned i = 0, e = DFBlocks.size(); i != e; ++i) {
+ BasicBlock *BB = DFBlocks[i].second;
+ if (QueuePhiNode(BB, AllocaNum, CurrentVersion, InsertedPHINodes))
+ Info.DefiningBlocks.push_back(BB);
+ }
+ DFBlocks.clear();
+ }
+}
+
+/// RewriteSingleStoreAlloca - If there is only a single store to this value,
+/// replace any loads of it that are directly dominated by the definition with
+/// the value stored.
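+///
+/// For instance (hypothetical IR): given the single "store i32 %v, i32* %a",
+/// every "load i32* %a" dominated by the store is replaced by %v outright;
+/// loads not dominated by it are recorded in Info.UsingBlocks so the general
+/// PHI-insertion machinery can handle them.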
+void PromoteMem2Reg::RewriteSingleStoreAlloca(AllocaInst *AI,
+ AllocaInfo &Info,
+ LargeBlockInfo &LBI) {
+ StoreInst *OnlyStore = Info.OnlyStore;
+ bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+ BasicBlock *StoreBB = OnlyStore->getParent();
+ int StoreIndex = -1;
+
+ // Clear out UsingBlocks. We will reconstruct it here if needed.
+ Info.UsingBlocks.clear();
+
+ for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ) {
+ Instruction *UserInst = cast<Instruction>(*UI++);
+ if (!isa<LoadInst>(UserInst)) {
+ assert(UserInst == OnlyStore && "Should only have load/stores");
+ continue;
+ }
+ LoadInst *LI = cast<LoadInst>(UserInst);
+
+ // Okay, if we have a load from the alloca, we want to replace it with the
+ // only value stored to the alloca. We can do this if the value is
+ // dominated by the store. If not, we use the rest of the mem2reg machinery
+ // to insert the phi nodes as needed.
+ if (!StoringGlobalVal) { // Non-instructions are always dominated.
+ if (LI->getParent() == StoreBB) {
+ // If we have a use that is in the same block as the store, compare the
+ // indices of the two instructions to see which one came first. If the
+ // load came before the store, we can't handle it.
+ if (StoreIndex == -1)
+ StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+ if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+ // Can't handle this load, bail out.
+ Info.UsingBlocks.push_back(StoreBB);
+ continue;
+ }
+
+ } else if (LI->getParent() != StoreBB &&
+ !dominates(StoreBB, LI->getParent())) {
+ // If the load and store are in different blocks, use BB dominance to
+ // check their relationships. If the store doesn't dom the use, bail
+ // out.
+ Info.UsingBlocks.push_back(LI->getParent());
+ continue;
+ }
+ }
+
+ // Otherwise, we *can* safely rewrite this load.
+ LI->replaceAllUsesWith(OnlyStore->getOperand(0));
+ if (AST && isa<PointerType>(LI->getType()))
+ AST->deleteValue(LI);
+ LI->eraseFromParent();
+ LBI.deleteValue(LI);
+ }
+}
+
+
+/// StoreIndexSearchPredicate - This is a helper predicate used to search by the
+/// first element of a pair.
+struct StoreIndexSearchPredicate {
+ bool operator()(const std::pair<unsigned, StoreInst*> &LHS,
+ const std::pair<unsigned, StoreInst*> &RHS) {
+ return LHS.first < RHS.first;
+ }
+};
+
+/// PromoteSingleBlockAlloca - Many allocas are only used within a single basic
+/// block. If this is the case, avoid traversing the CFG and inserting a lot of
+/// potentially useless PHI nodes by just performing a single linear pass over
+/// the basic block using the Alloca.
+///
+/// If we cannot promote this alloca (because it is read before it is written),
+/// the blocks that still use it are left in Info.UsingBlocks so the general
+/// algorithm can handle them. This is necessary in cases where, due to
+/// control flow, the alloca is potentially undefined on some control flow
+/// paths. e.g. code like this is potentially correct:
+///
+/// for (...) { if (c) { A = undef; undef = B; } }
+///
+/// ... so long as A is not used before undef is set.
+///
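+/// Within one block the rewrite is simple forwarding, e.g. (hypothetical IR):
+///
+///   store i32 1, i32* %a
+///   %v1 = load i32* %a     ; forwarded from the store above: 1
+///   store i32 2, i32* %a
+///   %v2 = load i32* %a     ; forwarded from the nearest earlier store: 2
+///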
+void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
+ LargeBlockInfo &LBI) {
+ // The trickiest case to handle is when we have large blocks. Because of this,
+ // this code is optimized assuming that large blocks happen. This does not
+ // significantly pessimize the small block case. This uses LargeBlockInfo to
+ // make it efficient to get the index of various operations in the block.
+
+ // Clear out UsingBlocks. We will reconstruct it here if needed.
+ Info.UsingBlocks.clear();
+
+ // Walk the use-def list of the alloca, getting the locations of all stores.
+ typedef SmallVector<std::pair<unsigned, StoreInst*>, 64> StoresByIndexTy;
+ StoresByIndexTy StoresByIndex;
+
+ for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+ UI != E; ++UI)
+ if (StoreInst *SI = dyn_cast<StoreInst>(*UI))
+ StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
+
+ // If there are no stores to the alloca, just replace any loads with undef.
+ if (StoresByIndex.empty()) {
+ for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;)
+ if (LoadInst *LI = dyn_cast<LoadInst>(*UI++)) {
+ LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+ if (AST && isa<PointerType>(LI->getType()))
+ AST->deleteValue(LI);
+ LBI.deleteValue(LI);
+ LI->eraseFromParent();
+ }
+ return;
+ }
+
+ // Sort the stores by their index, making it efficient to do a lookup with a
+ // binary search.
+ std::sort(StoresByIndex.begin(), StoresByIndex.end());
+
+ // Walk all of the loads from this alloca, replacing them with the nearest
+ // store above them, if any.
+ for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) {
+ LoadInst *LI = dyn_cast<LoadInst>(*UI++);
+ if (!LI) continue;
+
+ unsigned LoadIdx = LBI.getInstructionIndex(LI);
+
+ // Find the nearest store that has a lower index than this load.
+ StoresByIndexTy::iterator I =
+ std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
+ std::pair<unsigned, StoreInst*>(LoadIdx, 0),
+ StoreIndexSearchPredicate());
+
+ // If there is no store before this load, then we can't promote this load.
+ if (I == StoresByIndex.begin()) {
+ // Can't handle this load, bail out.
+ Info.UsingBlocks.push_back(LI->getParent());
+ continue;
+ }
+
+ // Otherwise, there was a store before this load, the load takes its value.
+ --I;
+ LI->replaceAllUsesWith(I->second->getOperand(0));
+ if (AST && isa<PointerType>(LI->getType()))
+ AST->deleteValue(LI);
+ LI->eraseFromParent();
+ LBI.deleteValue(LI);
+ }
+}
+
+
+// QueuePhiNode - Queues a phi-node to be added to a basic block for a specific
+// alloca. Returns true if there wasn't already a phi-node for that variable.
+//
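+// Each inserted PHI is named "<alloca name>.<version>"; e.g. a hypothetical
+// alloca %x yields PHI nodes %x.0, %x.1, ... as versions are handed out.
+//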
+bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
+ unsigned &Version,
+ SmallPtrSet<PHINode*, 16> &InsertedPHINodes) {
+ // Look up the basic-block in question.
+ PHINode *&PN = NewPhiNodes[std::make_pair(BB, AllocaNo)];
+
+ // If the BB already has a phi node added for the i'th alloca then we're done!
+ if (PN) return false;
+
+ // Create a PhiNode using the dereferenced type... and add the phi-node to the
+ // BasicBlock.
+ PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(),
+ Allocas[AllocaNo]->getName() + "." +
+ utostr(Version++), BB->begin());
+ ++NumPHIInsert;
+ PhiToAllocaMap[PN] = AllocaNo;
+ PN->reserveOperandSpace(getNumPreds(BB));
+
+ InsertedPHINodes.insert(PN);
+
+ if (AST && isa<PointerType>(PN->getType()))
+ AST->copyValue(PointerAllocaValues[AllocaNo], PN);
+
+ return true;
+}
+
+// RenamePass - Recursively traverse the CFG of the function, renaming loads and
+// stores to the allocas which we are promoting. IncomingVals indicates what
+// value each Alloca contains on exit from the predecessor block Pred.
+//
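+// For example (hypothetical): if alloca #0 holds %v on exit from Pred, a load
+// of that alloca in BB is replaced by %v, while a subsequent store of %w just
+// updates IncomingVals[0] to %w for the remainder of BB and its successors.
+//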
+void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
+ RenamePassData::ValVector &IncomingVals,
+ std::vector<RenamePassData> &Worklist) {
+NextIteration:
+ // If we are inserting any phi nodes into this BB, they will already be in the
+ // block.
+ if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
+ // If we have PHI nodes to update, compute the number of edges from Pred to
+ // BB.
+ if (PhiToAllocaMap.count(APN)) {
+ // We want to be able to distinguish between PHI nodes being inserted by
+ // this invocation of mem2reg from those phi nodes that already existed in
+ // the IR before mem2reg was run. We determine that APN is being inserted
+ // because it is missing incoming edges. All other PHI nodes being
+ // inserted by this pass of mem2reg will have the same number of incoming
+ // operands so far. Remember this count.
+ unsigned NewPHINumOperands = APN->getNumOperands();
+
+ unsigned NumEdges = 0;
+ for (succ_iterator I = succ_begin(Pred), E = succ_end(Pred); I != E; ++I)
+ if (*I == BB)
+ ++NumEdges;
+ assert(NumEdges && "Must be at least one edge from Pred to BB!");
+
+ // Add entries for all the phis.
+ BasicBlock::iterator PNI = BB->begin();
+ do {
+ unsigned AllocaNo = PhiToAllocaMap[APN];
+
+ // Add N incoming values to the PHI node.
+ for (unsigned i = 0; i != NumEdges; ++i)
+ APN->addIncoming(IncomingVals[AllocaNo], Pred);
+
+ // The currently active variable for this block is now the PHI.
+ IncomingVals[AllocaNo] = APN;
+
+ // Get the next phi node.
+ ++PNI;
+ APN = dyn_cast<PHINode>(PNI);
+ if (APN == 0) break;
+
+ // Verify that it is missing entries. If not, it is not being inserted
+ // by this mem2reg invocation so we want to ignore it.
+ } while (APN->getNumOperands() == NewPHINumOperands);
+ }
+ }
+
+ // Don't revisit blocks.
+ if (!Visited.insert(BB)) return;
+
+ for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II); ) {
+ Instruction *I = II++; // get the instruction, increment iterator
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
+ if (!Src) continue;
+
+ std::map<AllocaInst*, unsigned>::iterator AI = AllocaLookup.find(Src);
+ if (AI == AllocaLookup.end()) continue;
+
+ Value *V = IncomingVals[AI->second];
+
+ // Anything using the load now uses the current value.
+ LI->replaceAllUsesWith(V);
+ if (AST && isa<PointerType>(LI->getType()))
+ AST->deleteValue(LI);
+ BB->getInstList().erase(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Delete this instruction and mark the name as the current holder of the
+ // value
+ AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
+ if (!Dest) continue;
+
+ std::map<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
+ if (ai == AllocaLookup.end())
+ continue;
+
+ // what value were we writing?
+ IncomingVals[ai->second] = SI->getOperand(0);
+ BB->getInstList().erase(SI);
+ }
+ }
+
+ // 'Recurse' to our successors.
+ succ_iterator I = succ_begin(BB), E = succ_end(BB);
+ if (I == E) return;
+
+ // Keep track of the successors so we don't visit the same successor twice
+ SmallPtrSet<BasicBlock*, 8> VisitedSuccs;
+
+ // Handle the first successor without using the worklist.
+ VisitedSuccs.insert(*I);
+ Pred = BB;
+ BB = *I;
+ ++I;
+
+ for (; I != E; ++I)
+ if (VisitedSuccs.insert(*I))
+ Worklist.push_back(RenamePassData(*I, Pred, IncomingVals));
+
+ goto NextIteration;
+}
+
+/// PromoteMemToReg - Promote the specified list of alloca instructions into
+/// scalar registers, inserting PHI nodes as appropriate. It makes use of
+/// DominanceFrontier information and does not modify the CFG of the function
+/// at all. All allocas must be from the same function.
+///
+/// If AST is specified, the specified tracker is updated to reflect changes
+/// made to the IR.
+///
+void llvm::PromoteMemToReg(const std::vector<AllocaInst*> &Allocas,
+ DominatorTree &DT, DominanceFrontier &DF,
+ AliasSetTracker *AST) {
+ // If there is nothing to do, bail out...
+ if (Allocas.empty()) return;
+
+ PromoteMem2Reg(Allocas, DT, DF, AST).run();
+}
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
new file mode 100644
index 0000000..2cde765
--- /dev/null
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -0,0 +1,2213 @@
+//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Peephole optimize the CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+#include <functional>
+#include <set>
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumSpeculations, "Number of speculatively executed instructions");
+
+/// SafeToMergeTerminators - Return true if it is safe to merge these two
+/// terminator instructions together.
+///
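+/// For instance (hypothetical CFG): if both terminators branch to a block J
+/// containing "phi [ %a, %SI1BB ], [ %b, %SI2BB ]" with %a != %b, merging the
+/// terminators would lose that distinction, so we refuse.
+///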
+static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
+ if (SI1 == SI2) return false; // Can't merge with self!
+
+ // It is not safe to merge these two terminator instructions if they have a common
+ // successor, and if that successor has a PHI node, and if *that* PHI node has
+ // conflicting incoming values from the two switch blocks.
+ BasicBlock *SI1BB = SI1->getParent();
+ BasicBlock *SI2BB = SI2->getParent();
+ SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+
+ for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
+ if (SI1Succs.count(*I))
+ for (BasicBlock::iterator BBI = (*I)->begin();
+ isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ if (PN->getIncomingValueForBlock(SI1BB) !=
+ PN->getIncomingValueForBlock(SI2BB))
+ return false;
+ }
+
+ return true;
+}
+
+/// AddPredecessorToBlock - Update PHI nodes in Succ to indicate that there will
+/// now be entries in it from the 'NewPred' block. The values that will be
+/// flowing into the PHI nodes will be the same as those coming in from
+/// ExistPred, an existing predecessor of Succ.
+static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
+ BasicBlock *ExistPred) {
+ assert(std::find(succ_begin(ExistPred), succ_end(ExistPred), Succ) !=
+ succ_end(ExistPred) && "ExistPred is not a predecessor of Succ!");
+ if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do
+
+ PHINode *PN;
+ for (BasicBlock::iterator I = Succ->begin();
+ (PN = dyn_cast<PHINode>(I)); ++I)
+ PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
+}
+
+/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
+/// almost-empty BB ending in an unconditional branch to Succ, into Succ.
+///
+/// Assumption: Succ is the single successor for BB.
+///
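+/// A conflict arises, for example (hypothetical), when block P is a
+/// predecessor of both BB and Succ and a phi node in Succ expects %a directly
+/// from P but %b along the path through BB; folding BB away would require P
+/// to supply two different values for a single entry.
+///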
+static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
+ assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
+
+ DOUT << "Looking to fold " << BB->getNameStart() << " into "
+ << Succ->getNameStart() << "\n";
+ // Shortcut, if there is only a single predecessor it must be BB and merging
+ // is always safe
+ if (Succ->getSinglePredecessor()) return true;
+
+ typedef SmallPtrSet<Instruction*, 16> InstrSet;
+ InstrSet BBPHIs;
+
+ // Make a list of all phi nodes in BB
+ BasicBlock::iterator BBI = BB->begin();
+ while (isa<PHINode>(*BBI)) BBPHIs.insert(BBI++);
+
+ // Make a list of the predecessors of BB
+ typedef SmallPtrSet<BasicBlock*, 16> BlockSet;
+ BlockSet BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Use that list to make another list of common predecessors of BB and Succ
+ BlockSet CommonPreds;
+ for (pred_iterator PI = pred_begin(Succ), PE = pred_end(Succ);
+ PI != PE; ++PI)
+ if (BBPreds.count(*PI))
+ CommonPreds.insert(*PI);
+
+ // Shortcut, if there are no common predecessors, merging is always safe
+ if (CommonPreds.empty())
+ return true;
+
+ // Look at all the phi nodes in Succ, to see if they present a conflict when
+ // merging these blocks
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // If the incoming value from BB is again a PHINode in
+ // BB which has the same incoming value for *PI as PN does, we can
+ // merge the phi nodes and then the blocks can still be merged
+ PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
+ if (BBPN && BBPN->getParent() == BB) {
+ for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+ PI != PE; PI++) {
+ if (BBPN->getIncomingValueForBlock(*PI)
+ != PN->getIncomingValueForBlock(*PI)) {
+ DOUT << "Can't fold, phi node " << *PN->getNameStart() << " in "
+ << Succ->getNameStart() << " is conflicting with "
+ << BBPN->getNameStart() << " with regard to common predecessor "
+ << (*PI)->getNameStart() << "\n";
+ return false;
+ }
+ }
+ // Remove this phinode from the list of phis in BB, since it has been
+ // handled.
+ BBPHIs.erase(BBPN);
+ } else {
+ Value* Val = PN->getIncomingValueForBlock(BB);
+ for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+ PI != PE; PI++) {
+ // See if the incoming value for the common predecessor is equal to the
+ // one for BB, in which case this phi node will not prevent the merging
+ // of the block.
+ if (Val != PN->getIncomingValueForBlock(*PI)) {
+ DOUT << "Can't fold, phi node " << *PN->getNameStart() << " in "
+ << Succ->getNameStart() << " is conflicting with regard to common "
+ << "predecessor " << (*PI)->getNameStart() << "\n";
+ return false;
+ }
+ }
+ }
+ }
+
+ // If there are any other phi nodes in BB that don't have a phi node in Succ
+ // to merge with, they must be moved to Succ completely. However, for any
+ // predecessors of Succ, such a phi node will gain entries that simply
+ // point to the phi node itself. So, for any common predecessors, this
+ // must not cause conflicts.
+ for (InstrSet::iterator I = BBPHIs.begin(), E = BBPHIs.end();
+ I != E; I++) {
+ PHINode *PN = cast<PHINode>(*I);
+ for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+ PI != PE; PI++)
+ if (PN->getIncomingValueForBlock(*PI) != PN) {
+ DOUT << "Can't fold, phi node " << *PN->getNameStart() << " in "
+ << BB->getNameStart() << " is conflicting with regard to common "
+ << "predecessor " << (*PI)->getNameStart() << "\n";
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// TryToSimplifyUncondBranchFromEmptyBlock - BB contains an unconditional
+/// branch to Succ, and contains no instructions other than PHI nodes and the
+/// branch. If possible, eliminate BB.
+static bool TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
+ BasicBlock *Succ) {
+ // Check to see if merging these blocks would cause conflicts for any of the
+ // phi nodes in BB or Succ. If not, we can safely merge.
+ if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
+
+ DOUT << "Killing Trivial BB: \n" << *BB;
+
+ if (isa<PHINode>(Succ->begin())) {
+ // If there is more than one pred of succ, and there are PHI nodes in
+ // the successor, then we need to add incoming edges for the PHI nodes
+ //
+ const SmallVector<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Loop over all of the PHI nodes in the successor of BB.
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ Value *OldVal = PN->removeIncomingValue(BB, false);
+ assert(OldVal && "No entry in PHI for Pred BB!");
+
+ // If this incoming value is one of the PHI nodes in BB, the new entries
+ // in the PHI node are the entries from the old PHI.
+ if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+ PHINode *OldValPN = cast<PHINode>(OldVal);
+ for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i)
+ // Note that, since we are merging phi nodes and BB and Succ might
+ // have common predecessors, we could end up with a phi node with
+ // identical incoming branches. This will be cleaned up later (and
+ // will trigger asserts if we try to clean it up now, without also
+ // simplifying the corresponding conditional branch).
+ PN->addIncoming(OldValPN->getIncomingValue(i),
+ OldValPN->getIncomingBlock(i));
+ } else {
+ // Add an incoming value for each of the new incoming values.
+ for (unsigned i = 0, e = BBPreds.size(); i != e; ++i)
+ PN->addIncoming(OldVal, BBPreds[i]);
+ }
+ }
+ }
+
+ if (isa<PHINode>(&BB->front())) {
+ SmallVector<BasicBlock*, 16>
+ OldSuccPreds(pred_begin(Succ), pred_end(Succ));
+
+ // Move all PHI nodes in BB to Succ if they are alive, otherwise
+ // delete them.
+ while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+ if (PN->use_empty()) {
+ // Just remove the dead phi. This happens if Succ's PHIs were the only
+ // users of the PHI nodes.
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // The instruction is alive, so this means that BB must dominate all
+ // predecessors of Succ (since all uses of the PN come after its
+ // definition, they lie in Succ or in a block dominated by Succ; if a
+ // predecessor of Succ were not dominated by BB, PN would violate SSA's
+ // def-before-use requirement). Therefore, we can simply move the phi
+ // node to the next block.
+ Succ->getInstList().splice(Succ->begin(),
+ BB->getInstList(), BB->begin());
+
+ // We need to add new entries for the PHI node to account for
+ // predecessors of Succ that the PHI node does not take into
+ // account. At this point, since we know that BB dominates Succ and all
+ // of its predecessors, any newly added incoming edges should use the
+ // PHI node itself as their value, because they are loop back edges.
+ for (unsigned i = 0, e = OldSuccPreds.size(); i != e; ++i)
+ if (OldSuccPreds[i] != BB)
+ PN->addIncoming(PN, OldSuccPreds[i]);
+ }
+ }
+
+ // Everything that jumped to BB now goes to Succ.
+ BB->replaceAllUsesWith(Succ);
+ if (!Succ->hasName()) Succ->takeName(BB);
+ BB->eraseFromParent(); // Delete the old basic block.
+ return true;
+}
+
+/// GetIfCondition - Given a basic block (BB) with two predecessors (and
+/// presumably PHI nodes in it), check to see if the merge at this block is due
+/// to an "if condition". If so, return the boolean condition that determines
+/// which entry into BB will be taken. Also, return by reference the block
+/// that will be entered if the condition is true, and the block that will
+/// be entered if the condition is false.
+///
+///
+static Value *GetIfCondition(BasicBlock *BB,
+ BasicBlock *&IfTrue, BasicBlock *&IfFalse) {
+ assert(std::distance(pred_begin(BB), pred_end(BB)) == 2 &&
+ "Function can only handle blocks with 2 predecessors!");
+ BasicBlock *Pred1 = *pred_begin(BB);
+ BasicBlock *Pred2 = *++pred_begin(BB);
+
+ // We can only handle branches. Other control flow will be lowered to
+ // branches if possible anyway.
+ if (!isa<BranchInst>(Pred1->getTerminator()) ||
+ !isa<BranchInst>(Pred2->getTerminator()))
+ return 0;
+ BranchInst *Pred1Br = cast<BranchInst>(Pred1->getTerminator());
+ BranchInst *Pred2Br = cast<BranchInst>(Pred2->getTerminator());
+
+ // Eliminate code duplication by ensuring that Pred1Br is conditional if
+ // either are.
+ if (Pred2Br->isConditional()) {
+ // If both branches are conditional, we don't have an "if statement". In
+ // reality, we could transform this case, but since the condition will be
+ // required anyway, we stand no chance of eliminating it, so the xform is
+ // probably not profitable.
+ if (Pred1Br->isConditional())
+ return 0;
+
+ std::swap(Pred1, Pred2);
+ std::swap(Pred1Br, Pred2Br);
+ }
+
+ if (Pred1Br->isConditional()) {
+ // If we found a conditional branch predecessor, make sure that it branches
+ // to BB and Pred2Br. If it doesn't, this isn't an "if statement".
+ if (Pred1Br->getSuccessor(0) == BB &&
+ Pred1Br->getSuccessor(1) == Pred2) {
+ IfTrue = Pred1;
+ IfFalse = Pred2;
+ } else if (Pred1Br->getSuccessor(0) == Pred2 &&
+ Pred1Br->getSuccessor(1) == BB) {
+ IfTrue = Pred2;
+ IfFalse = Pred1;
+ } else {
+ // We know that one arm of the conditional goes to BB, so the other must
+ // go somewhere unrelated, and this must not be an "if statement".
+ return 0;
+ }
+
+ // The only thing we have to watch out for here is to make sure that Pred2
+ // doesn't have incoming edges from other blocks. If it does, the condition
+ // doesn't dominate BB.
+ if (++pred_begin(Pred2) != pred_end(Pred2))
+ return 0;
+
+ return Pred1Br->getCondition();
+ }
+
+ // Ok, if we got here, both predecessors end with an unconditional branch to
+ // BB. Don't panic! If both blocks only have a single (identical)
+ // predecessor, and THAT is a conditional branch, then we're all ok!
+ if (pred_begin(Pred1) == pred_end(Pred1) ||
+ ++pred_begin(Pred1) != pred_end(Pred1) ||
+ pred_begin(Pred2) == pred_end(Pred2) ||
+ ++pred_begin(Pred2) != pred_end(Pred2) ||
+ *pred_begin(Pred1) != *pred_begin(Pred2))
+ return 0;
+
+ // Otherwise, if this is a conditional branch, then we can use it!
+ BasicBlock *CommonPred = *pred_begin(Pred1);
+ if (BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator())) {
+ assert(BI->isConditional() && "Two successors but not conditional?");
+ if (BI->getSuccessor(0) == Pred1) {
+ IfTrue = Pred1;
+ IfFalse = Pred2;
+ } else {
+ IfTrue = Pred2;
+ IfFalse = Pred1;
+ }
+ return BI->getCondition();
+ }
+ return 0;
+}
+
+/// DominatesMergePoint - If we have a merge point of an "if condition" as
+/// accepted above, return true if the specified value dominates the block. We
+/// don't handle the true generality of domination here, just a special case
+/// which works well enough for us.
+///
+/// If AggressiveInsts is non-null, and if V does not dominate BB, we check to
+/// see if V (which must be an instruction) is cheap to compute and is
+/// non-trapping. If both are true, the instruction is inserted into the set
+/// and true is returned.
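+///
+/// E.g. (hypothetical): when merging "if (c) x = a + b;", the add is cheap
+/// and non-trapping, so it can be placed in AggressiveInsts and later hoisted
+/// to execute unconditionally, whereas an integer divide (which can trap)
+/// falls into the default case and is rejected.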
+static bool DominatesMergePoint(Value *V, BasicBlock *BB,
+ std::set<Instruction*> *AggressiveInsts) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ // Non-instructions all dominate instructions, but not all constantexprs
+ // can be executed unconditionally.
+ if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
+ if (C->canTrap())
+ return false;
+ return true;
+ }
+ BasicBlock *PBB = I->getParent();
+
+ // We don't want to allow weird loops that might have the "if condition" in
+ // the bottom of this block.
+ if (PBB == BB) return false;
+
+ // If this instruction is defined in a block that contains an unconditional
+ // branch to BB, then it must be in the 'conditional' part of the "if
+ // statement".
+ if (BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator()))
+ if (BI->isUnconditional() && BI->getSuccessor(0) == BB) {
+ if (!AggressiveInsts) return false;
+ // Okay, it looks like the instruction IS in the "condition". Check to
+ // see if it's a cheap instruction to unconditionally compute, and if it
+ // only uses stuff defined outside of the condition. If so, hoist it out.
+ switch (I->getOpcode()) {
+ default: return false; // Cannot hoist this out safely.
+ case Instruction::Load: {
+ // We can hoist loads that are non-volatile and obviously cannot trap.
+ if (cast<LoadInst>(I)->isVolatile())
+ return false;
+ // FIXME: A computation of a constant can trap!
+ if (!isa<AllocaInst>(I->getOperand(0)) &&
+ !isa<Constant>(I->getOperand(0)))
+ return false;
+ // External weak globals may have address 0, so we can't load them.
+ Value *V2 = I->getOperand(0)->getUnderlyingObject();
+ if (V2) {
+ GlobalVariable* GV = dyn_cast<GlobalVariable>(V2);
+ if (GV && GV->hasExternalWeakLinkage())
+ return false;
+ }
+ // Finally, we have to check to make sure there are no instructions
+ // before the load in its basic block, as we are going to hoist the load
+ // out to its predecessor.
+ BasicBlock::iterator IP = PBB->begin();
+ while (isa<DbgInfoIntrinsic>(IP))
+ IP++;
+ if (IP != BasicBlock::iterator(I))
+ return false;
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ if (I->getOperand(0)->getType()->isFPOrFPVector())
+ return false; // FP arithmetic might trap.
+ break; // These are all cheap and non-trapping instructions.
+ }
+
+ // Okay, we can only really hoist these out if their operands are not
+ // defined in the conditional region.
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+ if (!DominatesMergePoint(*i, BB, 0))
+ return false;
+ // Okay, it's safe to do this! Remember this instruction.
+ AggressiveInsts->insert(I);
+ }
+
+ return true;
+}
+
+/// GatherConstantSetEQs - Given a potentially 'or'd together collection of
+/// icmp_eq instructions that compare a value against a constant, return the
+/// value being compared, and stick the constant into the Values vector.
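+/// For example (hypothetical IR): for "or (icmp eq %X, 1), (icmp eq %X, 7)"
+/// this returns %X and pushes the constants 1 and 7 into Values.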
+static Value *GatherConstantSetEQs(Value *V, std::vector<ConstantInt*> &Values){
+ if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+ if (Inst->getOpcode() == Instruction::ICmp &&
+ cast<ICmpInst>(Inst)->getPredicate() == ICmpInst::ICMP_EQ) {
+ if (ConstantInt *C = dyn_cast<ConstantInt>(Inst->getOperand(1))) {
+ Values.push_back(C);
+ return Inst->getOperand(0);
+ } else if (ConstantInt *C = dyn_cast<ConstantInt>(Inst->getOperand(0))) {
+ Values.push_back(C);
+ return Inst->getOperand(1);
+ }
+ } else if (Inst->getOpcode() == Instruction::Or) {
+ if (Value *LHS = GatherConstantSetEQs(Inst->getOperand(0), Values))
+ if (Value *RHS = GatherConstantSetEQs(Inst->getOperand(1), Values))
+ if (LHS == RHS)
+ return LHS;
+ }
+ }
+ return 0;
+}
+
+/// GatherConstantSetNEs - Given a potentially 'and'd together collection of
+/// setne instructions that compare a value against a constant, return the value
+/// being compared, and stick the constant into the Values vector.
+static Value *GatherConstantSetNEs(Value *V, std::vector<ConstantInt*> &Values){
+ if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+ if (Inst->getOpcode() == Instruction::ICmp &&
+ cast<ICmpInst>(Inst)->getPredicate() == ICmpInst::ICMP_NE) {
+ if (ConstantInt *C = dyn_cast<ConstantInt>(Inst->getOperand(1))) {
+ Values.push_back(C);
+ return Inst->getOperand(0);
+ } else if (ConstantInt *C = dyn_cast<ConstantInt>(Inst->getOperand(0))) {
+ Values.push_back(C);
+ return Inst->getOperand(1);
+ }
+ } else if (Inst->getOpcode() == Instruction::And) {
+ if (Value *LHS = GatherConstantSetNEs(Inst->getOperand(0), Values))
+ if (Value *RHS = GatherConstantSetNEs(Inst->getOperand(1), Values))
+ if (LHS == RHS)
+ return LHS;
+ }
+ }
+ return 0;
+}
+
+/// GatherValueComparisons - If the specified Cond is an 'and' or 'or' of a
+/// bunch of comparisons of one value against constants, return the value and
+/// the constants being compared.
+static bool GatherValueComparisons(Instruction *Cond, Value *&CompVal,
+ std::vector<ConstantInt*> &Values) {
+ if (Cond->getOpcode() == Instruction::Or) {
+ CompVal = GatherConstantSetEQs(Cond, Values);
+
+ // Return true to indicate that the condition is true if the CompVal is
+ // equal to one of the constants.
+ return true;
+ } else if (Cond->getOpcode() == Instruction::And) {
+ CompVal = GatherConstantSetNEs(Cond, Values);
+
+ // Return false to indicate that the condition is false if the CompVal is
+ // equal to one of the constants.
+ return false;
+ }
+ return false;
+}
+
+static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
+ Instruction* Cond = 0;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Cond = dyn_cast<Instruction>(SI->getCondition());
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional())
+ Cond = dyn_cast<Instruction>(BI->getCondition());
+ }
+
+ TI->eraseFromParent();
+ if (Cond) RecursivelyDeleteTriviallyDeadInstructions(Cond);
+}
+
+/// isValueEqualityComparison - If the specified terminator checks whether a
+/// value is equal to a constant integer, return that value; otherwise return
+/// null.
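+///
+/// This handles both "switch i32 %x, ..." and a conditional branch on
+/// "icmp eq/ne i32 %x, C" (hypothetical IR), returning %x in either case.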
+static Value *isValueEqualityComparison(TerminatorInst *TI) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ // Do not permit merging of large switch instructions into their
+ // predecessors unless there is only one predecessor.
+ if (SI->getNumSuccessors() * std::distance(pred_begin(SI->getParent()),
+ pred_end(SI->getParent())) > 128)
+ return 0;
+
+ return SI->getCondition();
+ }
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional() && BI->getCondition()->hasOneUse())
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()))
+ if ((ICI->getPredicate() == ICmpInst::ICMP_EQ ||
+ ICI->getPredicate() == ICmpInst::ICMP_NE) &&
+ isa<ConstantInt>(ICI->getOperand(1)))
+ return ICI->getOperand(0);
+ return 0;
+}
+
+/// GetValueEqualityComparisonCases - Given a value comparison instruction,
+/// decode all of the 'cases' that it represents and return the 'default' block.
+static BasicBlock *
+GetValueEqualityComparisonCases(TerminatorInst *TI,
+ std::vector<std::pair<ConstantInt*,
+ BasicBlock*> > &Cases) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Cases.reserve(SI->getNumCases());
+ for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+ Cases.push_back(std::make_pair(SI->getCaseValue(i), SI->getSuccessor(i)));
+ return SI->getDefaultDest();
+ }
+
+ BranchInst *BI = cast<BranchInst>(TI);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ Cases.push_back(std::make_pair(cast<ConstantInt>(ICI->getOperand(1)),
+ BI->getSuccessor(ICI->getPredicate() ==
+ ICmpInst::ICMP_NE)));
+ return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
+}
+
+
+/// EliminateBlockCases - Given a vector of bb/value pairs, remove any entries
+/// in the list that match the specified block.
+static void EliminateBlockCases(BasicBlock *BB,
+ std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases) {
+ for (unsigned i = 0, e = Cases.size(); i != e; ++i)
+ if (Cases[i].second == BB) {
+ Cases.erase(Cases.begin()+i);
+ --i; --e;
+ }
+}
+
+/// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as
+/// well.
+static bool
+ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1,
+ std::vector<std::pair<ConstantInt*, BasicBlock*> > &C2) {
+ std::vector<std::pair<ConstantInt*, BasicBlock*> > *V1 = &C1, *V2 = &C2;
+
+ // Make V1 be smaller than V2.
+ if (V1->size() > V2->size())
+ std::swap(V1, V2);
+
+ if (V1->size() == 0) return false;
+ if (V1->size() == 1) {
+ // Just scan V2.
+ ConstantInt *TheVal = (*V1)[0].first;
+ for (unsigned i = 0, e = V2->size(); i != e; ++i)
+ if (TheVal == (*V2)[i].first)
+ return true;
+ }
+
+ // Otherwise, just sort both lists and compare element by element.
+ std::sort(V1->begin(), V1->end());
+ std::sort(V2->begin(), V2->end());
+ unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
+ while (i1 != e1 && i2 != e2) {
+ if ((*V1)[i1].first == (*V2)[i2].first)
+ return true;
+ if ((*V1)[i1].first < (*V2)[i2].first)
+ ++i1;
+ else
+ ++i2;
+ }
+ return false;
+}
+
+/// SimplifyEqualityComparisonWithOnlyPredecessor - If TI is known to be a
+/// terminator instruction and its block is known to only have a single
+/// predecessor block, check to see if that predecessor is also a value
+/// comparison with the same value, and if that comparison determines the
+/// outcome of this comparison. If so, simplify TI. This does a very limited
+/// form of jump threading.
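+///
+/// Sketch (hypothetical): if Pred does "switch %x" and routes the case
+/// %x == 4 to TI's block, then %x is known to be 4 there, so TI's own
+/// comparison against %x folds to an unconditional branch to the matching
+/// successor.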
+static bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
+ BasicBlock *Pred) {
+ Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
+ if (!PredVal) return false; // Not a value comparison in predecessor.
+
+ Value *ThisVal = isValueEqualityComparison(TI);
+ assert(ThisVal && "This isn't a value comparison!!");
+ if (ThisVal != PredVal) return false; // Different values being compared.
+
+ // Find out information about when control will move from Pred to TI's block.
+ std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases;
+ BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(),
+ PredCases);
+ EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
+
+ // Find information about how control leaves this block.
+ std::vector<std::pair<ConstantInt*, BasicBlock*> > ThisCases;
+ BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
+ EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
+
+ // If TI's block is the default block from Pred's comparison, potentially
+ // simplify TI based on this knowledge.
+ if (PredDef == TI->getParent()) {
+ // If we are here, we know that the value is none of those cases listed in
+ // PredCases. If there are any cases in ThisCases that are in PredCases, we
+ // can simplify TI.
+ if (ValuesOverlap(PredCases, ThisCases)) {
+ if (isa<BranchInst>(TI)) {
+ // Okay, one of the successors of this condbr is dead. Convert it to an
+ // uncond br.
+ assert(ThisCases.size() == 1 && "Branch can only have one case!");
+ // Insert the new branch.
+ Instruction *NI = BranchInst::Create(ThisDef, TI);
+
+ // Remove PHI node entries for the dead edge.
+ ThisCases[0].second->removePredecessor(TI->getParent());
+
+ DOUT << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n";
+
+ EraseTerminatorInstAndDCECond(TI);
+ return true;
+
+ } else {
+ SwitchInst *SI = cast<SwitchInst>(TI);
+ // Okay, TI has cases that are statically dead, prune them away.
+ SmallPtrSet<Constant*, 16> DeadCases;
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ DeadCases.insert(PredCases[i].first);
+
+ DOUT << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI;
+
+ for (unsigned i = SI->getNumCases()-1; i != 0; --i)
+ if (DeadCases.count(SI->getCaseValue(i))) {
+ SI->getSuccessor(i)->removePredecessor(TI->getParent());
+ SI->removeCase(i);
+ }
+
+ DOUT << "Leaving: " << *TI << "\n";
+ return true;
+ }
+ }
+
+ } else {
+ // Otherwise, TI's block must correspond to some matched value. Find out
+ // which value (or set of values) this is.
+ ConstantInt *TIV = 0;
+ BasicBlock *TIBB = TI->getParent();
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ if (PredCases[i].second == TIBB) {
+ if (TIV == 0)
+ TIV = PredCases[i].first;
+ else
+ return false; // Cannot handle multiple values coming to this block.
+ }
+ assert(TIV && "No edge from pred to succ?");
+
+ // Okay, we found the one constant that our value can be if we get into TI's
+ // BB. Find out which successor will unconditionally be branched to.
+ BasicBlock *TheRealDest = 0;
+ for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
+ if (ThisCases[i].first == TIV) {
+ TheRealDest = ThisCases[i].second;
+ break;
+ }
+
+ // If not handled by any explicit cases, it is handled by the default case.
+ if (TheRealDest == 0) TheRealDest = ThisDef;
+
+ // Remove PHI node entries for dead edges.
+ BasicBlock *CheckEdge = TheRealDest;
+ for (succ_iterator SI = succ_begin(TIBB), e = succ_end(TIBB); SI != e; ++SI)
+ if (*SI != CheckEdge)
+ (*SI)->removePredecessor(TIBB);
+ else
+ CheckEdge = 0;
+
+ // Insert the new branch.
+ Instruction *NI = BranchInst::Create(TheRealDest, TI);
+
+ DOUT << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n";
+
+ EraseTerminatorInstAndDCECond(TI);
+ return true;
+ }
+ return false;
+}
+
+namespace {
+ /// ConstantIntOrdering - This class implements a stable ordering of constant
+ /// integers that does not depend on their address. This is important for
+ /// applications that sort ConstantInt's to ensure uniqueness.
+ struct ConstantIntOrdering {
+ bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
+ return LHS->getValue().ult(RHS->getValue());
+ }
+ };
+}
+
+/// FoldValueComparisonIntoPredecessors - The specified terminator is a value
+/// equality comparison instruction (either a switch or a branch on "X == c").
+/// See if any of the predecessors of the terminator block are value comparisons
+/// on the same value. If so, and if safe to do so, fold them together.
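+///
+/// For instance (hypothetical): a predecessor "switch %x" whose default edge
+/// falls into a block ending in "br (icmp eq %x, 10)" can absorb that compare
+/// as one more case of the switch, so control need not pass through the
+/// intermediate block at all.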
+static bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI) {
+ BasicBlock *BB = TI->getParent();
+ Value *CV = isValueEqualityComparison(TI); // CondVal
+ assert(CV && "Not a comparison?");
+ bool Changed = false;
+
+ SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
+ while (!Preds.empty()) {
+ BasicBlock *Pred = Preds.pop_back_val();
+
+ // See if the predecessor is a comparison with the same value.
+ TerminatorInst *PTI = Pred->getTerminator();
+ Value *PCV = isValueEqualityComparison(PTI); // PredCondVal
+
+ if (PCV == CV && SafeToMergeTerminators(TI, PTI)) {
+ // Figure out which 'cases' to copy from TI to PTI.
+ std::vector<std::pair<ConstantInt*, BasicBlock*> > BBCases;
+ BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
+
+ std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases;
+ BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases);
+
+ // Based on whether the default edge from PTI goes to BB or not, fill in
+ // PredCases and PredDefault with the new switch cases we would like to
+ // build.
+ SmallVector<BasicBlock*, 8> NewSuccessors;
+
+ if (PredDefault == BB) {
+ // If this is the default destination from PTI, only the edges in TI
+ // that don't occur in PTI, or that branch to BB will be activated.
+ std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ if (PredCases[i].second != BB)
+ PTIHandled.insert(PredCases[i].first);
+ else {
+ // The default destination is BB, we don't need explicit targets.
+ std::swap(PredCases[i], PredCases.back());
+ PredCases.pop_back();
+ --i; --e;
+ }
+
+ // Reconstruct the new switch statement we will be building.
+ if (PredDefault != BBDefault) {
+ PredDefault->removePredecessor(Pred);
+ PredDefault = BBDefault;
+ NewSuccessors.push_back(BBDefault);
+ }
+ for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+ if (!PTIHandled.count(BBCases[i].first) &&
+ BBCases[i].second != BBDefault) {
+ PredCases.push_back(BBCases[i]);
+ NewSuccessors.push_back(BBCases[i].second);
+ }
+
+ } else {
+ // If this is not the default destination from PTI, only the edges
+ // in TI that occur in PTI with a destination of BB will be
+ // activated.
+ std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ if (PredCases[i].second == BB) {
+ PTIHandled.insert(PredCases[i].first);
+ std::swap(PredCases[i], PredCases.back());
+ PredCases.pop_back();
+ --i; --e;
+ }
+
+ // Okay, now we know which constants were sent to BB from the
+ // predecessor. Figure out where they will all go now.
+ for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+ if (PTIHandled.count(BBCases[i].first)) {
+ // If this is one we are capable of getting...
+ PredCases.push_back(BBCases[i]);
+ NewSuccessors.push_back(BBCases[i].second);
+ PTIHandled.erase(BBCases[i].first);// This constant is taken care of
+ }
+
+ // If there are any constants vectored to BB that TI doesn't handle,
+ // they must go to the default destination of TI.
+ for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
+ PTIHandled.begin(),
+ E = PTIHandled.end(); I != E; ++I) {
+ PredCases.push_back(std::make_pair(*I, BBDefault));
+ NewSuccessors.push_back(BBDefault);
+ }
+ }
+
+ // Okay, at this point, we know which new successor Pred will get. Make
+ // sure we update the number of entries in the PHI nodes for these
+ // successors.
+ for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i)
+ AddPredecessorToBlock(NewSuccessors[i], Pred, BB);
+
+ // Now that the successors are updated, create the new Switch instruction.
+ SwitchInst *NewSI = SwitchInst::Create(CV, PredDefault,
+ PredCases.size(), PTI);
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ NewSI->addCase(PredCases[i].first, PredCases[i].second);
+
+ EraseTerminatorInstAndDCECond(PTI);
+
+ // Okay, last check. If BB is still a successor of NewSI, then we must
+ // have an infinite loop case. If so, add an infinitely looping block
+ // to handle the case to preserve the behavior of the code.
+ BasicBlock *InfLoopBlock = 0;
+ for (unsigned i = 0, e = NewSI->getNumSuccessors(); i != e; ++i)
+ if (NewSI->getSuccessor(i) == BB) {
+ if (InfLoopBlock == 0) {
+ // Insert it at the end of the function, because it's either dead code,
+ // or it won't matter if it's hot. :)
+ InfLoopBlock = BasicBlock::Create("infloop", BB->getParent());
+ BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ }
+ NewSI->setSuccessor(i, InfLoopBlock);
+ }
+
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+/// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
+/// BB2, hoist any common code in the two blocks up into the branch block. The
+/// caller of this function guarantees that BI's block dominates BB1 and BB2.
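+///
+/// Rough sketch on hypothetical IR (names invented): if both successors begin
+/// with an identical "%x = add i32 %a, 1", the add is hoisted into the branch
+/// block ahead of the conditional branch, and both block-local copies are
+/// replaced with the hoisted value.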
+static bool HoistThenElseCodeToIf(BranchInst *BI) {
+ // This does very trivial matching, with limited scanning, to find identical
+ // instructions in the two blocks. In particular, we don't want to get into
+ // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
+ // such, we currently just scan for obviously identical instructions in an
+ // identical order.
+ BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
+ BasicBlock *BB2 = BI->getSuccessor(1); // The false destination.
+
+ BasicBlock::iterator BB1_Itr = BB1->begin();
+ BasicBlock::iterator BB2_Itr = BB2->begin();
+
+ Instruction *I1 = BB1_Itr++, *I2 = BB2_Itr++;
+ while (isa<DbgInfoIntrinsic>(I1))
+ I1 = BB1_Itr++;
+ while (isa<DbgInfoIntrinsic>(I2))
+ I2 = BB2_Itr++;
+ if (I1->getOpcode() != I2->getOpcode() || isa<PHINode>(I1) ||
+ isa<InvokeInst>(I1) || !I1->isIdenticalTo(I2))
+ return false;
+
+ // If we get here, we can hoist at least one instruction.
+ BasicBlock *BIParent = BI->getParent();
+
+ do {
+ // If we are hoisting the terminator instruction, don't move it (that would
+ // make a broken BB); instead, clone it and remove BI.
+ if (isa<TerminatorInst>(I1))
+ goto HoistTerminator;
+
+ // For a normal instruction, we just move one to right before the branch,
+ // then replace all uses of the other with the first. Finally, we remove
+ // the now redundant second instruction.
+ BIParent->getInstList().splice(BI, BB1->getInstList(), I1);
+ if (!I2->use_empty())
+ I2->replaceAllUsesWith(I1);
+ BB2->getInstList().erase(I2);
+
+ I1 = BB1_Itr++;
+ while (isa<DbgInfoIntrinsic>(I1))
+ I1 = BB1_Itr++;
+ I2 = BB2_Itr++;
+ while (isa<DbgInfoIntrinsic>(I2))
+ I2 = BB2_Itr++;
+ } while (I1->getOpcode() == I2->getOpcode() && I1->isIdenticalTo(I2));
+
+ return true;
+
+HoistTerminator:
+ // Okay, it is safe to hoist the terminator.
+ Instruction *NT = I1->clone();
+ BIParent->getInstList().insert(BI, NT);
+ if (NT->getType() != Type::VoidTy) {
+ I1->replaceAllUsesWith(NT);
+ I2->replaceAllUsesWith(NT);
+ NT->takeName(I1);
+ }
+
+ // Hoisting one of the terminators from our successor is a great thing.
+ // Unfortunately, the successors of the if/else blocks may have PHI nodes in
+ // them. If they do, the PHI entries for BB1 and BB2 must agree for every
+ // PHI node; where they disagree, we insert select instructions to compute
+ // the final result.
+ std::map<std::pair<Value*,Value*>, SelectInst*> InsertedSelects;
+ for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+ PHINode *PN;
+ for (BasicBlock::iterator BBI = SI->begin();
+ (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+ Value *BB1V = PN->getIncomingValueForBlock(BB1);
+ Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ if (BB1V != BB2V) {
+ // These values do not agree. Insert a select instruction before NT
+ // that determines the right value.
+ SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
+ if (SI == 0)
+ SI = SelectInst::Create(BI->getCondition(), BB1V, BB2V,
+ BB1V->getName()+"."+BB2V->getName(), NT);
+ // Make the PHI node use the select for all incoming values for BB1/BB2
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2)
+ PN->setIncomingValue(i, SI);
+ }
+ }
+ }
+
+ // Update any PHI nodes in our new successors.
+ for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI)
+ AddPredecessorToBlock(*SI, BIParent, BB1);
+
+ EraseTerminatorInstAndDCECond(BI);
+ return true;
+}
+
+/// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1 and
+/// BB2, where BB2 is the only successor of BB1, hoist simple code (for now,
+/// restricted to a single side-effect-free instruction) from BB1 into the
+/// branch block so that it is speculatively executed.
+static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
+ // Only speculatively execute a single instruction (not counting the
+ // terminator) for now.
+ Instruction *HInst = NULL;
+ Instruction *Term = BB1->getTerminator();
+ for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end();
+ BBI != BBE; ++BBI) {
+ Instruction *I = BBI;
+ // Skip debug info.
+ if (isa<DbgInfoIntrinsic>(I)) continue;
+ if (I == Term) break;
+
+ if (!HInst)
+ HInst = I;
+ else
+ return false;
+ }
+ if (!HInst)
+ return false;
+
+ // Be conservative for now. FP select instruction can often be expensive.
+ Value *BrCond = BI->getCondition();
+ if (isa<Instruction>(BrCond) &&
+ cast<Instruction>(BrCond)->getOpcode() == Instruction::FCmp)
+ return false;
+
+ // If BB1 is actually on the false edge of the conditional branch, remember
+ // to swap the select operands later.
+ bool Invert = false;
+ if (BB1 != BI->getSuccessor(0)) {
+ assert(BB1 == BI->getSuccessor(1) && "No edge from 'if' block?");
+ Invert = true;
+ }
+
+ // Turn
+ // BB:
+ // %t1 = icmp
+ // br i1 %t1, label %BB1, label %BB2
+ // BB1:
+ // %t3 = add %t2, c
+ // br label %BB2
+ // BB2:
+ // =>
+ // BB:
+ // %t1 = icmp
+ // %t4 = add %t2, c
+ // %t3 = select i1 %t1, %t4, %t2
+ switch (HInst->getOpcode()) {
+ default: return false; // Not safe / profitable to hoist.
+ case Instruction::Add:
+ case Instruction::Sub:
+ // FP arithmetic might trap. Not worth doing for vector ops.
+ if (HInst->getType()->isFloatingPoint()
+ || isa<VectorType>(HInst->getType()))
+ return false;
+ break;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ // Don't mess with vector operations.
+ if (isa<VectorType>(HInst->getType()))
+ return false;
+ break; // These are all cheap and non-trapping instructions.
+ }
+
+ // If the instruction is obviously dead, don't try to predicate it.
+ if (HInst->use_empty()) {
+ HInst->eraseFromParent();
+ return true;
+ }
+
+ // Can we speculatively execute the instruction? And what is the value
+ // if the condition is false? Consider the PHI uses: if the incoming values
+ // from the "if" block are all the same V, then V is the value of the
+ // select if the condition is false.
+ BasicBlock *BIParent = BI->getParent();
+ SmallVector<PHINode*, 4> PHIUses;
+ Value *FalseV = NULL;
+
+ BasicBlock *BB2 = BB1->getTerminator()->getSuccessor(0);
+ for (Value::use_iterator UI = HInst->use_begin(), E = HInst->use_end();
+ UI != E; ++UI) {
+ // Reject any user that is not a PHI node in BB2. Such users could only
+ // occur in unreachable blocks, because they would not be dominated by the
+ // instr.
+ PHINode *PN = dyn_cast<PHINode>(UI);
+ if (!PN || PN->getParent() != BB2)
+ return false;
+ PHIUses.push_back(PN);
+
+ Value *PHIV = PN->getIncomingValueForBlock(BIParent);
+ if (!FalseV)
+ FalseV = PHIV;
+ else if (FalseV != PHIV)
+ return false; // Inconsistent value when condition is false.
+ }
+
+ assert(FalseV && "Must have at least one user, and it must be a PHI");
+
+ // Do not hoist the instruction if any of its operands are defined but not
+ // used in this BB. The transformation will prevent the operand from
+ // being sunk into the use block.
+ for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end();
+ i != e; ++i) {
+ Instruction *OpI = dyn_cast<Instruction>(*i);
+ if (OpI && OpI->getParent() == BIParent &&
+ !OpI->isUsedInBasicBlock(BIParent))
+ return false;
+ }
+
+ // If we get here, we can hoist the instruction. Try to place it
+ // before the icmp instruction preceding the conditional branch.
+ BasicBlock::iterator InsertPos = BI;
+ if (InsertPos != BIParent->begin())
+ --InsertPos;
+ // Skip debug info between condition and branch.
+ while (InsertPos != BIParent->begin() && isa<DbgInfoIntrinsic>(InsertPos))
+ --InsertPos;
+ if (InsertPos == BrCond && !isa<PHINode>(BrCond)) {
+ SmallPtrSet<Instruction *, 4> BB1Insns;
+ for (BasicBlock::iterator BB1I = BB1->begin(), BB1E = BB1->end();
+ BB1I != BB1E; ++BB1I)
+ BB1Insns.insert(BB1I);
+ for (Value::use_iterator UI = BrCond->use_begin(), UE = BrCond->use_end();
+ UI != UE; ++UI) {
+ Instruction *Use = cast<Instruction>(*UI);
+ if (BB1Insns.count(Use)) {
+ // BrCond is used by an instruction in BB1, so place the hoisted
+ // instruction just before the branch instruction instead.
+ InsertPos = BI;
+ break;
+ }
+ }
+ } else
+ InsertPos = BI;
+ BIParent->getInstList().splice(InsertPos, BB1->getInstList(), HInst);
+
+ // Create a select whose true value is the speculatively executed value and
+ // false value is the previously determined FalseV.
+ SelectInst *SI;
+ if (Invert)
+ SI = SelectInst::Create(BrCond, FalseV, HInst,
+ FalseV->getName() + "." + HInst->getName(), BI);
+ else
+ SI = SelectInst::Create(BrCond, HInst, FalseV,
+ HInst->getName() + "." + FalseV->getName(), BI);
+
+ // Make the PHI node use the select for all incoming values for "then" and
+ // "if" blocks.
+ for (unsigned i = 0, e = PHIUses.size(); i != e; ++i) {
+ PHINode *PN = PHIUses[i];
+ for (unsigned j = 0, ee = PN->getNumIncomingValues(); j != ee; ++j)
+ if (PN->getIncomingBlock(j) == BB1 ||
+ PN->getIncomingBlock(j) == BIParent)
+ PN->setIncomingValue(j, SI);
+ }
+
+ ++NumSpeculations;
+ return true;
+}
+
+/// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch
+/// across this block.
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+ BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+ unsigned Size = 0;
+
+ for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+ if (isa<DbgInfoIntrinsic>(BBI))
+ continue;
+ if (Size > 10) return false; // Don't clone large BB's.
+ ++Size;
+
+ // We can only support instructions that do not define values that are
+ // live outside of the current basic block.
+ for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
+ UI != E; ++UI) {
+ Instruction *U = cast<Instruction>(*UI);
+ if (U->getParent() != BB || isa<PHINode>(U)) return false;
+ }
+
+ // Looks ok, continue checking.
+ }
+
+ return true;
+}
+
+/// FoldCondBranchOnPHI - If we have a conditional branch on a PHI node value
+/// that is defined in the same block as the branch and if any PHI entries are
+/// constants, thread edges corresponding to that entry to be branches to their
+/// ultimate destination.
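+///
+/// Hypothetical example: given "%c = phi i1 [ true, %P ], [ %v, %Q ]" feeding
+/// "br i1 %c, label %T, label %F" in the same block, the edge from %P is
+/// redirected (via a new split block) straight to %T, since the branch
+/// outcome along that edge is known.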
+static bool FoldCondBranchOnPHI(BranchInst *BI) {
+ BasicBlock *BB = BI->getParent();
+ PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
+ // NOTE: we currently cannot transform this case if the PHI node is used
+ // outside of the block.
+ if (!PN || PN->getParent() != BB || !PN->hasOneUse())
+ return false;
+
+ // Degenerate case of a single entry PHI.
+ if (PN->getNumIncomingValues() == 1) {
+ FoldSingleEntryPHINodes(PN->getParent());
+ return true;
+ }
+
+ // Now we know that this block has multiple preds and two succs.
+ if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
+
+ // Okay, this is a simple enough basic block. See if any phi values are
+ // constants.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ ConstantInt *CB;
+ if ((CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i))) &&
+ CB->getType() == Type::Int1Ty) {
+ // Okay, we now know that all edges from PredBB should be revectored to
+ // branch to RealDest.
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
+
+ if (RealDest == BB) continue; // Skip self loops.
+
+ // The dest block might have PHI nodes, other predecessors and other
+ // difficult cases. Instead of being smart about this, just insert a new
+ // block that jumps to the destination block, effectively splitting
+ // the edge we are about to create.
+ BasicBlock *EdgeBB = BasicBlock::Create(RealDest->getName()+".critedge",
+ RealDest->getParent(), RealDest);
+ BranchInst::Create(RealDest, EdgeBB);
+ PHINode *PN;
+ for (BasicBlock::iterator BBI = RealDest->begin();
+ (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+ Value *V = PN->getIncomingValueForBlock(BB);
+ PN->addIncoming(V, EdgeBB);
+ }
+
+ // BB may have instructions that are being threaded over. Clone these
+ // instructions into EdgeBB. We know that there will be no uses of the
+ // cloned instructions outside of EdgeBB.
+ BasicBlock::iterator InsertPt = EdgeBB->begin();
+ std::map<Value*, Value*> TranslateMap; // Track translated values.
+ for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
+ TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
+ } else {
+ // Clone the instruction.
+ Instruction *N = BBI->clone();
+ if (BBI->hasName()) N->setName(BBI->getName()+".c");
+
+ // Update operands due to translation.
+ for (User::op_iterator i = N->op_begin(), e = N->op_end();
+ i != e; ++i) {
+ std::map<Value*, Value*>::iterator PI =
+ TranslateMap.find(*i);
+ if (PI != TranslateMap.end())
+ *i = PI->second;
+ }
+
+ // Check for trivial simplification.
+ if (Constant *C = ConstantFoldInstruction(N)) {
+ TranslateMap[BBI] = C;
+ delete N; // Constant folded away, don't need actual inst
+ } else {
+ // Insert the new instruction into its new home.
+ EdgeBB->getInstList().insert(InsertPt, N);
+ if (!BBI->use_empty())
+ TranslateMap[BBI] = N;
+ }
+ }
+ }
+
+ // Loop over all of the edges from PredBB to BB, changing them to branch
+ // to EdgeBB instead.
+ TerminatorInst *PredBBTI = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
+ if (PredBBTI->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB);
+ PredBBTI->setSuccessor(i, EdgeBB);
+ }
+
+ // Recurse, simplifying any other constants.
+ return FoldCondBranchOnPHI(BI) | true;
+ }
+ }
+
+ return false;
+}
+
+/// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry
+/// PHI node, see if we can eliminate it.
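+///
+/// Sketch on hypothetical IR: a diamond that merges with
+///   %r = phi i32 [ %a, %then ], [ %b, %else ]
+/// can, when both arms are cheap and side-effect free, be rewritten as
+///   %r = select i1 %cond, i32 %a, i32 %b
+/// after which the conditional control flow becomes removable.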
+static bool FoldTwoEntryPHINode(PHINode *PN) {
+ // Ok, this is a two entry PHI node. Check to see if this is a simple "if
+ // statement", which has a very simple dominance structure. Basically, we
+ // are trying to find the condition that is being branched on, which
+ // subsequently causes this merge to happen. We really want control
+ // dependence information for this check, but simplifycfg can't keep it up
+ // to date, and this catches most of the cases we care about anyway.
+ //
+ BasicBlock *BB = PN->getParent();
+ BasicBlock *IfTrue, *IfFalse;
+ Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
+ if (!IfCond) return false;
+
+ // Okay, we found that we can merge this two-entry phi node into a select.
+ // Doing so would require us to fold *all* two entry phi nodes in this block.
+ // At some point this becomes non-profitable (particularly if the target
+ // doesn't support cmov's). Only do this transformation if there are two or
+ // fewer PHI nodes in this block.
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
+ if (NumPhis > 2)
+ return false;
+
+ DOUT << "FOUND IF CONDITION! " << *IfCond << " T: "
+ << IfTrue->getName() << " F: " << IfFalse->getName() << "\n";
+
+ // Loop over the PHI's seeing if we can promote them all to select
+ // instructions. While we are at it, keep track of the instructions
+ // that need to be moved to the dominating block.
+ std::set<Instruction*> AggressiveInsts;
+
+ BasicBlock::iterator AfterPHIIt = BB->begin();
+ while (isa<PHINode>(AfterPHIIt)) {
+ PHINode *PN = cast<PHINode>(AfterPHIIt++);
+ if (PN->getIncomingValue(0) == PN->getIncomingValue(1)) {
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ } else if (!DominatesMergePoint(PN->getIncomingValue(0), BB,
+ &AggressiveInsts) ||
+ !DominatesMergePoint(PN->getIncomingValue(1), BB,
+ &AggressiveInsts)) {
+ return false;
+ }
+ }
+
+ // If all PHI nodes are promotable, check to make sure that all
+ // instructions in the predecessor blocks can be promoted as well. If
+ // not, we won't be able to get rid of the control flow, so it's not
+ // worth promoting to select instructions.
+ BasicBlock *DomBlock = 0, *IfBlock1 = 0, *IfBlock2 = 0;
+ PN = cast<PHINode>(BB->begin());
+ BasicBlock *Pred = PN->getIncomingBlock(0);
+ if (cast<BranchInst>(Pred->getTerminator())->isUnconditional()) {
+ IfBlock1 = Pred;
+ DomBlock = *pred_begin(Pred);
+ for (BasicBlock::iterator I = Pred->begin();
+ !isa<TerminatorInst>(I); ++I)
+ if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) {
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control
+ // flow, so the xform is not worth it.
+ return false;
+ }
+ }
+
+ Pred = PN->getIncomingBlock(1);
+ if (cast<BranchInst>(Pred->getTerminator())->isUnconditional()) {
+ IfBlock2 = Pred;
+ DomBlock = *pred_begin(Pred);
+ for (BasicBlock::iterator I = Pred->begin();
+ !isa<TerminatorInst>(I); ++I)
+ if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) {
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control
+ // flow, so the xform is not worth it.
+ return false;
+ }
+ }
+
+ // If we can still promote the PHI nodes after this gauntlet of tests,
+ // do all of the PHI's now.
+
+ // Move all 'aggressive' instructions, which are defined in the
+ // conditional parts of the if's up to the dominating block.
+ if (IfBlock1) {
+ DomBlock->getInstList().splice(DomBlock->getTerminator(),
+ IfBlock1->getInstList(),
+ IfBlock1->begin(),
+ IfBlock1->getTerminator());
+ }
+ if (IfBlock2) {
+ DomBlock->getInstList().splice(DomBlock->getTerminator(),
+ IfBlock2->getInstList(),
+ IfBlock2->begin(),
+ IfBlock2->getTerminator());
+ }
+
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ // Change the PHI node into a select instruction.
+ Value *TrueVal =
+ PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
+ Value *FalseVal =
+ PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
+
+ Value *NV = SelectInst::Create(IfCond, TrueVal, FalseVal, "", AfterPHIIt);
+ PN->replaceAllUsesWith(NV);
+ NV->takeName(PN);
+
+ BB->getInstList().erase(PN);
+ }
+ return true;
+}
+
+/// isTerminatorFirstRelevantInsn - Return true if Term is the very first
+/// instruction in its block, ignoring PHI nodes and dbg intrinsics.
+static bool isTerminatorFirstRelevantInsn(BasicBlock *BB, Instruction *Term) {
+ BasicBlock::iterator BBI = Term;
+ while (BBI != BB->begin()) {
+ --BBI;
+ if (!isa<DbgInfoIntrinsic>(BBI))
+ break;
+ }
+
+ if (isa<PHINode>(BBI) || &*BBI == Term || isa<DbgInfoIntrinsic>(BBI))
+ return true;
+ return false;
+}
+
+/// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes
+/// to two returning blocks, try to merge them together into one return,
+/// introducing a select if the return values disagree.
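+///
+/// Sketch on hypothetical IR:
+///   br i1 %c, label %T, label %F
+/// where T is "ret i32 %a" and F is "ret i32 %b", becomes
+///   %retval = select i1 %c, i32 %a, i32 %b
+///   ret i32 %retval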
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
+ assert(BI->isConditional() && "Must be a conditional branch");
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
+ ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
+
+ // Check to ensure both blocks are empty (contain just a return), optionally
+ // with PHI nodes. If there are other instructions, merging would cause extra
+ // computation on one path or the other.
+ if (!isTerminatorFirstRelevantInsn(TrueSucc, TrueRet))
+ return false;
+ if (!isTerminatorFirstRelevantInsn(FalseSucc, FalseRet))
+ return false;
+
+ // Okay, we found a branch that is going to two return nodes. If
+ // there is no return value for this function, just change the
+ // branch into a return.
+ if (FalseRet->getNumOperands() == 0) {
+ TrueSucc->removePredecessor(BI->getParent());
+ FalseSucc->removePredecessor(BI->getParent());
+ ReturnInst::Create(0, BI);
+ EraseTerminatorInstAndDCECond(BI);
+ return true;
+ }
+
+ // Otherwise, figure out what the true and false return values are
+ // so we can insert a new select instruction.
+ Value *TrueValue = TrueRet->getReturnValue();
+ Value *FalseValue = FalseRet->getReturnValue();
+
+ // Unwrap any PHI nodes in the return blocks.
+ if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
+ if (TVPN->getParent() == TrueSucc)
+ TrueValue = TVPN->getIncomingValueForBlock(BI->getParent());
+ if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
+ if (FVPN->getParent() == FalseSucc)
+ FalseValue = FVPN->getIncomingValueForBlock(BI->getParent());
+
+ // In order for this transformation to be safe, we must be able to
+ // unconditionally execute both operands to the return. This is
+ // normally the case, but we could have a potentially-trapping
+ // constant expression that prevents this transformation from being
+ // safe.
+ if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
+ if (TCV->canTrap())
+ return false;
+ if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
+ if (FCV->canTrap())
+ return false;
+
+ // Okay, we collected all the mapped values and checked them for sanity, and
+ // decided to really do this transformation. First, update the CFG.
+ TrueSucc->removePredecessor(BI->getParent());
+ FalseSucc->removePredecessor(BI->getParent());
+
+ // Insert select instructions where needed.
+ Value *BrCond = BI->getCondition();
+ if (TrueValue) {
+ // Insert a select if the results differ.
+ if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
+ } else if (isa<UndefValue>(TrueValue)) {
+ TrueValue = FalseValue;
+ } else {
+ TrueValue = SelectInst::Create(BrCond, TrueValue,
+ FalseValue, "retval", BI);
+ }
+ }
+
+ Value *RI = !TrueValue ?
+ ReturnInst::Create(BI) :
+ ReturnInst::Create(TrueValue, BI);
+
+ DOUT << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+ << "\n " << *BI << "NewRet = " << *RI
+ << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc;
+
+ EraseTerminatorInstAndDCECond(BI);
+
+ return true;
+}
+
+/// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch,
+/// and if a predecessor branches to us and one of our successors, fold the
+/// setcc into the predecessor and use logical operations to pick the right
+/// destination.
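+///
+/// Hypothetical example: if a predecessor ends in "br i1 %p, label %BB,
+/// label %F" and BB contains only "%c = icmp ...; br i1 %c, label %T,
+/// label %F", the icmp is cloned into the predecessor and the branches merge
+/// into "%or.cond = and i1 %p, %c; br i1 %or.cond, label %T, label %F".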
+static bool FoldBranchToCommonDest(BranchInst *BI) {
+ BasicBlock *BB = BI->getParent();
+ Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
+ if (Cond == 0) return false;
+
+
+ // Only allow this if the condition is a simple instruction that can be
+ // executed unconditionally. It must be in the same block as the branch, and
+ // must be at the front of the block.
+ BasicBlock::iterator FrontIt = BB->front();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(FrontIt))
+ ++FrontIt;
+ if ((!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
+ Cond->getParent() != BB || &*FrontIt != Cond || !Cond->hasOneUse()) {
+ return false;
+ }
+
+ // Make sure the instruction after the condition is the cond branch.
+ BasicBlock::iterator CondIt = Cond; ++CondIt;
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(CondIt))
+ ++CondIt;
+ if (&*CondIt != BI) {
+ assert(!isa<DbgInfoIntrinsic>(CondIt) && "Hey do not forget debug info!");
+ return false;
+ }
+
+ // Cond is known to be a compare or binary operator. Check to make sure that
+ // neither operand is a potentially-trapping constant expression.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
+ if (CE->canTrap())
+ return false;
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
+ if (CE->canTrap())
+ return false;
+
+
+ // Finally, don't infinitely unroll conditional loops.
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+ if (TrueDest == BB || FalseDest == BB)
+ return false;
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *PredBlock = *PI;
+ BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
+
+ // Check that we have two conditional branches. If there is a PHI node in
+ // the common successor, verify that the same value flows in from both
+ // blocks.
+ if (PBI == 0 || PBI->isUnconditional() ||
+ !SafeToMergeTerminators(BI, PBI))
+ continue;
+
+ Instruction::BinaryOps Opc;
+ bool InvertPredCond = false;
+
+ if (PBI->getSuccessor(0) == TrueDest)
+ Opc = Instruction::Or;
+ else if (PBI->getSuccessor(1) == FalseDest)
+ Opc = Instruction::And;
+ else if (PBI->getSuccessor(0) == FalseDest)
+ Opc = Instruction::And, InvertPredCond = true;
+ else if (PBI->getSuccessor(1) == TrueDest)
+ Opc = Instruction::Or, InvertPredCond = true;
+ else
+ continue;
+
+ DOUT << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB;
+
+ // If we need to invert the condition in the pred block to match, do so now.
+ if (InvertPredCond) {
+ Value *NewCond =
+ BinaryOperator::CreateNot(PBI->getCondition(),
+ PBI->getCondition()->getName()+".not", PBI);
+ PBI->setCondition(NewCond);
+ BasicBlock *OldTrue = PBI->getSuccessor(0);
+ BasicBlock *OldFalse = PBI->getSuccessor(1);
+ PBI->setSuccessor(0, OldFalse);
+ PBI->setSuccessor(1, OldTrue);
+ }
+
+ // Clone Cond into the predecessor basic block, and or/and the
+ // two conditions together.
+ Instruction *New = Cond->clone();
+ PredBlock->getInstList().insert(PBI, New);
+ New->takeName(Cond);
+ Cond->setName(New->getName()+".old");
+
+ Value *NewCond = BinaryOperator::Create(Opc, PBI->getCondition(),
+ New, "or.cond", PBI);
+ PBI->setCondition(NewCond);
+ if (PBI->getSuccessor(0) == BB) {
+ AddPredecessorToBlock(TrueDest, PredBlock, BB);
+ PBI->setSuccessor(0, TrueDest);
+ }
+ if (PBI->getSuccessor(1) == BB) {
+ AddPredecessorToBlock(FalseDest, PredBlock, BB);
+ PBI->setSuccessor(1, FalseDest);
+ }
+ return true;
+ }
+ return false;
+}
+
+/// SimplifyCondBranchToCondBranch - If we have a conditional branch as a
+/// predecessor of another block, this function tries to simplify it. We know
+/// that PBI and BI are both conditional branches, and BI is in one of the
+/// successor blocks of PBI - PBI branches to BI.
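+///
+/// Hypothetical example: if PBI is "br i1 %x, label %BB, label %D" and BB
+/// contains only "br i1 %y, label %C, label %D", both branches can reach %D,
+/// so PBI is rewritten to "br i1 %brmerge, label %D, label %C" with
+/// "%brmerge = or i1 %x.not, %y.not", leaving BB (probably) dead.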
+static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
+ assert(PBI->isConditional() && BI->isConditional());
+ BasicBlock *BB = BI->getParent();
+
+ // If this block ends with a branch instruction, and if there is a
+ // predecessor that ends on a branch of the same condition, make
+ // this conditional branch redundant.
+ if (PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ // Okay, the outcome of this conditional branch is statically
+ // knowable. If this block had a single pred, handle specially.
+ if (BB->getSinglePredecessor()) {
+ // Turn this into a branch on constant.
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ BI->setCondition(ConstantInt::get(Type::Int1Ty, CondIsTrue));
+ return true; // Nuke the branch on constant.
+ }
+
+ // Otherwise, if there are multiple predecessors, insert a PHI that merges
+ // in the constant and simplify the block result. Subsequent passes of
+ // simplifycfg will thread the block.
+ if (BlockIsSimpleEnoughToThreadThrough(BB)) {
+ PHINode *NewPN = PHINode::Create(Type::Int1Ty,
+ BI->getCondition()->getName() + ".pr",
+ BB->begin());
+ // Okay, we're going to insert the PHI node. Since PBI is not the only
+ // predecessor, compute the PHI'd conditional value for all of the preds.
+ // Any predecessor where the condition is not computable we keep symbolic.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ if ((PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) &&
+ PBI != BI && PBI->isConditional() &&
+ PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ NewPN->addIncoming(ConstantInt::get(Type::Int1Ty,
+ CondIsTrue), *PI);
+ } else {
+ NewPN->addIncoming(BI->getCondition(), *PI);
+ }
+
+ BI->setCondition(NewPN);
+ return true;
+ }
+ }
+
+ // If this is a conditional branch in an empty block, and if any
+ // predecessor is a conditional branch to one of our destinations,
+ // fold the conditions into logical ops and one cond br.
+ BasicBlock::iterator BBI = BB->begin();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+ if (&*BBI != BI)
+ return false;
+
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
+ if (CE->canTrap())
+ return false;
+
+ int PBIOp, BIOp;
+ if (PBI->getSuccessor(0) == BI->getSuccessor(0))
+ PBIOp = BIOp = 0;
+ else if (PBI->getSuccessor(0) == BI->getSuccessor(1))
+ PBIOp = 0, BIOp = 1;
+ else if (PBI->getSuccessor(1) == BI->getSuccessor(0))
+ PBIOp = 1, BIOp = 0;
+ else if (PBI->getSuccessor(1) == BI->getSuccessor(1))
+ PBIOp = BIOp = 1;
+ else
+ return false;
+
+ // Check to make sure that the other destination of this branch
+ // isn't BB itself. If so, this is an infinite loop that will
+ // keep getting unwound.
+ if (PBI->getSuccessor(PBIOp) == BB)
+ return false;
+
+ // Do not perform this transformation if it would require
+ // insertion of a large number of select instructions. For targets
+ // without predication/cmovs, this is a big pessimization.
+ BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator II = CommonDest->begin();
+ isa<PHINode>(II); ++II, ++NumPhis)
+ if (NumPhis > 2) // Disable this xform.
+ return false;
+
+ // Finally, if everything is ok, fold the branches to logical ops.
+ BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
+
+ DOUT << "FOLDING BRs:" << *PBI->getParent()
+ << "AND: " << *BI->getParent();
+
+
+ // If OtherDest *is* BB, then BB is a basic block with a single conditional
+ // branch in it, where one edge (OtherDest) goes back to itself but the other
+ // exits. We don't *know* that the program avoids the infinite loop
+ // (even though that seems likely). If we do this xform naively, we'll end up
+ // recursively unpeeling the loop. Since we know that (after the xform is
+ // done) the block *is* infinite if reached, we just make it an obviously
+ // infinite loop with no cond branch.
+ if (OtherDest == BB) {
+ // Insert it at the end of the function, because it's either dead code,
+ // or it won't matter if it's hot. :)
+ BasicBlock *InfLoopBlock = BasicBlock::Create("infloop", BB->getParent());
+ BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ OtherDest = InfLoopBlock;
+ }
+
+ DOUT << *PBI->getParent()->getParent();
+
+ // BI may have other predecessors. Because of this, we leave
+ // it alone, but modify PBI.
+
+ // Make sure we get to CommonDest on True&True directions.
+ Value *PBICond = PBI->getCondition();
+ if (PBIOp)
+ PBICond = BinaryOperator::CreateNot(PBICond,
+ PBICond->getName()+".not",
+ PBI);
+ Value *BICond = BI->getCondition();
+ if (BIOp)
+ BICond = BinaryOperator::CreateNot(BICond,
+ BICond->getName()+".not",
+ PBI);
+ // Merge the conditions.
+ Value *Cond = BinaryOperator::CreateOr(PBICond, BICond, "brmerge", PBI);
+
+ // Modify PBI to branch on the new condition to the new dests.
+ PBI->setCondition(Cond);
+ PBI->setSuccessor(0, CommonDest);
+ PBI->setSuccessor(1, OtherDest);
+
+ // OtherDest may have phi nodes. If so, add entries from PBI's
+ // block that are identical to the entries for BI's block.
+ PHINode *PN;
+ for (BasicBlock::iterator II = OtherDest->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ Value *V = PN->getIncomingValueForBlock(BB);
+ PN->addIncoming(V, PBI->getParent());
+ }
+
+ // We know that the CommonDest already had an edge from PBI to
+ // it. If it has PHIs though, the PHIs may have different
+ // entries for BB and PBI's BB. If so, insert a select to make
+ // them agree.
+ for (BasicBlock::iterator II = CommonDest->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ Value *BIV = PN->getIncomingValueForBlock(BB);
+ unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN->getIncomingValue(PBBIdx);
+ if (BIV != PBIV) {
+ // Insert a select in PBI to pick the right value.
+ Value *NV = SelectInst::Create(PBICond, PBIV, BIV,
+ PBIV->getName()+".mux", PBI);
+ PN->setIncomingValue(PBBIdx, NV);
+ }
+ }
+
+ DOUT << "INTO: " << *PBI->getParent();
+
+ DOUT << *PBI->getParent()->getParent();
+
+ // This basic block is probably dead. We know it has at least
+ // one fewer predecessor.
+ return true;
+}
+
+
+/// SimplifyCFG - This function is used to do simplification of a CFG. For
+/// example, it adjusts branches to branches to eliminate the extra hop, it
+/// eliminates unreachable basic blocks, and does other "peephole" optimization
+/// of the CFG. It returns true if a modification was made.
+///
+/// WARNING: The entry node of a function may not be simplified.
+///
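+/// A minimal driver sketch (hypothetical; a real caller does more
+/// bookkeeping) iterates to a fixed point, skipping the entry block:
+///   bool LocalChange = true;
+///   while (LocalChange) {
+///     LocalChange = false;
+///     for (Function::iterator BBIt = ++F.begin(); BBIt != F.end(); )
+///       LocalChange |= SimplifyCFG(BBIt++);
+///   }
+///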
+bool llvm::SimplifyCFG(BasicBlock *BB) {
+ bool Changed = false;
+ Function *M = BB->getParent();
+
+ assert(BB && BB->getParent() && "Block not embedded in function!");
+ assert(BB->getTerminator() && "Degenerate basic block encountered!");
+ assert(&BB->getParent()->getEntryBlock() != BB &&
+ "Can't Simplify entry block!");
+
+ // Remove basic blocks that have no predecessors... or that just have
+ // themselves as a predecessor. These are unreachable.
+ if (pred_begin(BB) == pred_end(BB) || BB->getSinglePredecessor() == BB) {
+ DOUT << "Removing BB: \n" << *BB;
+ DeleteDeadBlock(BB);
+ return true;
+ }
+
+ // Check to see if we can constant propagate this terminator instruction
+ // away...
+ Changed |= ConstantFoldTerminator(BB);
+
+ // If there is a trivial two-entry PHI node in this basic block, and we can
+ // eliminate it, do so now.
+ if (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
+ if (PN->getNumIncomingValues() == 2)
+ Changed |= FoldTwoEntryPHINode(PN);
+
+ // If this is a returning block with only PHI nodes in it, fold the return
+ // instruction into any unconditional branch predecessors.
+ //
+ // If any predecessor is a conditional branch that just selects among
+ // different return values, replace the branch/return pair with a select
+ // and return.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ if (isTerminatorFirstRelevantInsn(BB, BB->getTerminator())) {
+ // Find predecessors that end with branches.
+ SmallVector<BasicBlock*, 8> UncondBranchPreds;
+ SmallVector<BranchInst*, 8> CondBranchPreds;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ TerminatorInst *PTI = (*PI)->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
+ if (BI->isUnconditional())
+ UncondBranchPreds.push_back(*PI);
+ else
+ CondBranchPreds.push_back(BI);
+ }
+ }
+
+ // If we found some, do the transformation!
+ if (!UncondBranchPreds.empty()) {
+ while (!UncondBranchPreds.empty()) {
+ BasicBlock *Pred = UncondBranchPreds.pop_back_val();
+ DOUT << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred;
+ Instruction *UncondBranch = Pred->getTerminator();
+ // Clone the return and add it to the end of the predecessor.
+ Instruction *NewRet = RI->clone();
+ Pred->getInstList().push_back(NewRet);
+
+ BasicBlock::iterator BBI = RI;
+ if (BBI != BB->begin()) {
+ // Move region end info into the predecessor.
+ if (DbgRegionEndInst *DREI = dyn_cast<DbgRegionEndInst>(--BBI))
+ DREI->moveBefore(NewRet);
+ }
+
+ // If the return instruction returns a value, and if the value was a
+ // PHI node in "BB", propagate the right value into the return.
+ for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
+ i != e; ++i)
+ if (PHINode *PN = dyn_cast<PHINode>(*i))
+ if (PN->getParent() == BB)
+ *i = PN->getIncomingValueForBlock(Pred);
+
+ // Update any PHI nodes in the returning block to realize that we no
+ // longer branch to them.
+ BB->removePredecessor(Pred);
+ Pred->getInstList().erase(UncondBranch);
+ }
+
+ // If we eliminated all predecessors of the block, delete the block now.
+ if (pred_begin(BB) == pred_end(BB))
+ // We know there are no successors, so just nuke the block.
+ M->getBasicBlockList().erase(BB);
+
+ return true;
+ }
+
+ // Check out all of the conditional branches going to this return
+ // instruction. If any of them just select between returns, change the
+ // branch itself into a select/return pair.
+ while (!CondBranchPreds.empty()) {
+ BranchInst *BI = CondBranchPreds.pop_back_val();
+
+ // Check to see if the non-BB successor is also a return block.
+ if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
+ isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
+ SimplifyCondBranchToTwoReturns(BI))
+ return true;
+ }
+ }
+ } else if (isa<UnwindInst>(BB->begin())) {
+ // Check to see if the first instruction in this block is just an unwind.
+ // If so, replace any invoke instructions which use this as an exception
+ // destination with call instructions, and any unconditional branch
+ // predecessor with an unwind.
+ //
+ SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
+ while (!Preds.empty()) {
+ BasicBlock *Pred = Preds.back();
+ if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) {
+ if (BI->isUnconditional()) {
+ Pred->getInstList().pop_back(); // nuke uncond branch
+ new UnwindInst(Pred); // Use unwind.
+ Changed = true;
+ }
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(Pred->getTerminator()))
+ if (II->getUnwindDest() == BB) {
+ // Insert a new branch instruction before the invoke, because this
+ // is now a fall through...
+ BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+ Pred->getInstList().remove(II); // Take out of symbol table
+
+ // Insert the call now...
+ SmallVector<Value*,8> Args(II->op_begin()+3, II->op_end());
+ CallInst *CI = CallInst::Create(II->getCalledValue(),
+ Args.begin(), Args.end(),
+ II->getName(), BI);
+ CI->setCallingConv(II->getCallingConv());
+ CI->setAttributes(II->getAttributes());
+ // If the invoke produced a value, the Call now does instead
+ II->replaceAllUsesWith(CI);
+ delete II;
+ Changed = true;
+ }
+
+ Preds.pop_back();
+ }
+
+ // If this block is now dead, remove it.
+ if (pred_begin(BB) == pred_end(BB)) {
+ // We know there are no successors, so just nuke the block.
+ M->getBasicBlockList().erase(BB);
+ return true;
+ }
+
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ if (isValueEqualityComparison(SI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred))
+ return SimplifyCFG(BB) | true;
+
+ // If the block only contains the switch, see if we can fold the block
+ // away into any preds.
+ BasicBlock::iterator BBI = BB->begin();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+ if (SI == &*BBI)
+ if (FoldValueComparisonIntoPredecessors(SI))
+ return SimplifyCFG(BB) | true;
+ }
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ if (BI->isUnconditional()) {
+ BasicBlock::iterator BBI = BB->getFirstNonPHI();
+
+ BasicBlock *Succ = BI->getSuccessor(0);
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+ if (BBI->isTerminator() && // Terminator is the only non-phi instruction!
+ Succ != BB) // Don't hurt infinite loops!
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB, Succ))
+ return true;
+
+ } else { // Conditional branch
+ if (isValueEqualityComparison(BI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this
+ // switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred))
+ return SimplifyCFG(BB) | true;
+
+ // This block must be empty, except for the setcond inst, if it exists.
+ BasicBlock::iterator I = BB->begin();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(I))
+ ++I;
+ if (&*I == BI) {
+ if (FoldValueComparisonIntoPredecessors(BI))
+ return SimplifyCFG(BB) | true;
+ } else if (&*I == cast<Instruction>(BI->getCondition())){
+ ++I;
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(I))
+ ++I;
+ if (&*I == BI) {
+ if (FoldValueComparisonIntoPredecessors(BI))
+ return SimplifyCFG(BB) | true;
+ }
+ }
+ }
+
+ // If this is a branch on a phi node in the current block, thread control
+ // through this block if any PHI node entries are constants.
+ if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
+ if (PN->getParent() == BI->getParent())
+ if (FoldCondBranchOnPHI(BI))
+ return SimplifyCFG(BB) | true;
+
+ // If this basic block is ONLY a setcc and a branch, and if a predecessor
+ // branches to us and one of our successors, fold the setcc into the
+ // predecessor and use logical operations to pick the right destination.
+ if (FoldBranchToCommonDest(BI))
+ return SimplifyCFG(BB) | true;
+
+
+ // Scan predecessor blocks for conditional branches.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+ if (PBI != BI && PBI->isConditional())
+ if (SimplifyCondBranchToCondBranch(PBI, BI))
+ return SimplifyCFG(BB) | true;
+ }
+ } else if (isa<UnreachableInst>(BB->getTerminator())) {
+ // If there are any instructions immediately before the unreachable that can
+ // be removed, do so.
+ Instruction *Unreachable = BB->getTerminator();
+ while (Unreachable != BB->begin()) {
+ BasicBlock::iterator BBI = Unreachable;
+ --BBI;
+ // Do not delete instructions that can have side effects, like calls
+ // (which may never return) and volatile loads and stores.
+ if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
+ if (SI->isVolatile())
+ break;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(BBI))
+ if (LI->isVolatile())
+ break;
+
+ // Delete this instruction
+ BB->getInstList().erase(BBI);
+ Changed = true;
+ }
+
+ // If the unreachable instruction is the first in the block, take a gander
+ // at all of the predecessors of this instruction, and simplify them.
+ if (&BB->front() == Unreachable) {
+ SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ TerminatorInst *TI = Preds[i]->getTerminator();
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isUnconditional()) {
+ if (BI->getSuccessor(0) == BB) {
+ new UnreachableInst(TI);
+ TI->eraseFromParent();
+ Changed = true;
+ }
+ } else {
+ if (BI->getSuccessor(0) == BB) {
+ BranchInst::Create(BI->getSuccessor(1), BI);
+ EraseTerminatorInstAndDCECond(BI);
+ Changed = true;
+ } else if (BI->getSuccessor(1) == BB) {
+ BranchInst::Create(BI->getSuccessor(0), BI);
+ EraseTerminatorInstAndDCECond(BI);
+ Changed = true;
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+ if (SI->getSuccessor(i) == BB) {
+ BB->removePredecessor(SI->getParent());
+ SI->removeCase(i);
+ --i; --e;
+ Changed = true;
+ }
+ // If the default destination is unreachable, figure out the most popular
+ // destination and make it the default.
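+ // For example (hypothetical): with cases {1 -> A, 2 -> A, 3 -> B} and the
+ // default pointing at this unreachable block, A is the most popular target,
+ // so A becomes the new default and its explicit case entries are deleted,
+ // leaving just {3 -> B}.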
+ if (SI->getSuccessor(0) == BB) {
+ std::map<BasicBlock*, unsigned> Popularity;
+ for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+ Popularity[SI->getSuccessor(i)]++;
+
+ // Find the most popular block.
+ unsigned MaxPop = 0;
+ BasicBlock *MaxBlock = 0;
+ for (std::map<BasicBlock*, unsigned>::iterator
+ I = Popularity.begin(), E = Popularity.end(); I != E; ++I) {
+ if (I->second > MaxPop) {
+ MaxPop = I->second;
+ MaxBlock = I->first;
+ }
+ }
+ if (MaxBlock) {
+ // Make this the new default, allowing us to delete any explicit
+ // edges to it.
+ SI->setSuccessor(0, MaxBlock);
+ Changed = true;
+
+ // If MaxBlock has PHI nodes in it, remove MaxPop-1 entries from
+ // it.
+ if (isa<PHINode>(MaxBlock->begin()))
+ for (unsigned i = 0; i != MaxPop-1; ++i)
+ MaxBlock->removePredecessor(SI->getParent());
+
+ for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+ if (SI->getSuccessor(i) == MaxBlock) {
+ SI->removeCase(i);
+ --i; --e;
+ }
+ }
+ }
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) {
+ if (II->getUnwindDest() == BB) {
+ // Convert the invoke to a call instruction. This would be a good
+ // place to note that the call does not throw though.
+ BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+ II->removeFromParent(); // Take out of symbol table
+
+ // Insert the call now...
+ SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
+ CallInst *CI = CallInst::Create(II->getCalledValue(),
+ Args.begin(), Args.end(),
+ II->getName(), BI);
+ CI->setCallingConv(II->getCallingConv());
+ CI->setAttributes(II->getAttributes());
+ // If the invoke produced a value, the Call does now instead.
+ II->replaceAllUsesWith(CI);
+ delete II;
+ Changed = true;
+ }
+ }
+ }
+
+ // If this block is now dead, remove it.
+ if (pred_begin(BB) == pred_end(BB)) {
+ // We know there are no successors, so just nuke the block.
+ M->getBasicBlockList().erase(BB);
+ return true;
+ }
+ }
+ }
+
+ // Merge basic blocks into their predecessor if there is only one distinct
+ // pred, and if there is only one distinct successor of the predecessor, and
+ // if there are no PHI nodes.
+ //
+ if (MergeBlockIntoPredecessor(BB))
+ return true;
+
+ // Otherwise, if this block only has a single predecessor, and if that block
+ // is a conditional branch, see if we can hoist any code from this block up
+ // into our predecessor.
+ pred_iterator PI(pred_begin(BB)), PE(pred_end(BB));
+ BasicBlock *OnlyPred = *PI++;
+ for (; PI != PE; ++PI) // Search all predecessors, see if they are all same
+ if (*PI != OnlyPred) {
+ OnlyPred = 0; // There are multiple different predecessors...
+ break;
+ }
+
+ if (OnlyPred)
+ if (BranchInst *BI = dyn_cast<BranchInst>(OnlyPred->getTerminator()))
+ if (BI->isConditional()) {
+ // Get the other block.
+ BasicBlock *OtherBB = BI->getSuccessor(BI->getSuccessor(0) == BB);
+ PI = pred_begin(OtherBB);
+ ++PI;
+
+ if (PI == pred_end(OtherBB)) {
+ // We have a conditional branch to two blocks that are only reachable
+ // from the condbr. We know that the condbr dominates the two blocks,
+ // so see if there is any identical code in the "then" and "else"
+ // blocks. If so, we can hoist it up to the branching block.
+ Changed |= HoistThenElseCodeToIf(BI);
+ } else {
+ BasicBlock* OnlySucc = NULL;
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+ SI != SE; ++SI) {
+ if (!OnlySucc)
+ OnlySucc = *SI;
+ else if (*SI != OnlySucc) {
+ OnlySucc = 0; // There are multiple distinct successors!
+ break;
+ }
+ }
+
+ if (OnlySucc == OtherBB) {
+ // If BB's only successor is the other successor of the predecessor,
+ // i.e. a triangle, see if we can hoist any code from this block up
+ // to the "if" block.
+ Changed |= SpeculativelyExecuteBB(BI, BB);
+ }
+ }
+ }
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ if (BranchInst *BI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+ // Change br (X == 0 | X == 1), T, F into a switch instruction.
+ if (BI->isConditional() && isa<Instruction>(BI->getCondition())) {
+ Instruction *Cond = cast<Instruction>(BI->getCondition());
+ // If this is a bunch of seteq's or'd together, or if it's a bunch of
+ // 'setne's and'ed together, collect them.
+ Value *CompVal = 0;
+ std::vector<ConstantInt*> Values;
+ bool TrueWhenEqual = GatherValueComparisons(Cond, CompVal, Values);
+ if (CompVal && CompVal->getType()->isInteger()) {
+ // There might be duplicate constants in the list, which the switch
+ // instruction can't handle; remove them now.
+ std::sort(Values.begin(), Values.end(), ConstantIntOrdering());
+ Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
+
+ // Figure out which block is which destination.
+ BasicBlock *DefaultBB = BI->getSuccessor(1);
+ BasicBlock *EdgeBB = BI->getSuccessor(0);
+ if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB);
+
+ // Create the new switch instruction now.
+ SwitchInst *New = SwitchInst::Create(CompVal, DefaultBB,
+ Values.size(), BI);
+
+ // Add all of the 'cases' to the switch instruction.
+ for (unsigned i = 0, e = Values.size(); i != e; ++i)
+ New->addCase(Values[i], EdgeBB);
+
+ // We added edges from PI to the EdgeBB. As such, if there were any
+ // PHI nodes in EdgeBB, they need entries to be added corresponding to
+ // the number of edges added.
+ for (BasicBlock::iterator BBI = EdgeBB->begin();
+ isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ Value *InVal = PN->getIncomingValueForBlock(*PI);
+ for (unsigned i = 0, e = Values.size()-1; i != e; ++i)
+ PN->addIncoming(InVal, *PI);
+ }
+
+ // Erase the old branch instruction.
+ EraseTerminatorInstAndDCECond(BI);
+ return true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
new file mode 100644
index 0000000..848f2b8
--- /dev/null
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -0,0 +1,139 @@
+//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to ensure that functions have at most one return
+// instruction in them. Additionally, it keeps track of which node is the new
+// exit node of the CFG. If the CFG contains no return (resp. unwind,
+// unreachable) blocks, the corresponding getReturnBlock (getUnwindBlock,
+// getUnreachableBlock) accessor will return a null pointer.
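+//
+// For example, running "opt -mergereturn" on a function with several ret
+// instructions rewrites them into branches to a single UnifiedReturnBlock
+// (and, when needed, a UnifiedUnwindBlock / UnifiedUnreachableBlock).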
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+char UnifyFunctionExitNodes::ID = 0;
+static RegisterPass<UnifyFunctionExitNodes>
+X("mergereturn", "Unify function exit nodes");
+
+Pass *llvm::createUnifyFunctionExitNodesPass() {
+ return new UnifyFunctionExitNodes();
+}
+
+void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
+ // We preserve the non-critical-edgeness property
+ AU.addPreservedID(BreakCriticalEdgesID);
+ // This is a cluster of orthogonal Transforms
+ AU.addPreservedID(PromoteMemoryToRegisterID);
+ AU.addPreservedID(LowerSwitchID);
+}
+
+// runOnFunction - Unify all exit nodes of the CFG by creating a new
+// BasicBlock, and converting all returns to unconditional branches to this
+// new basic block. Unwind and unreachable blocks are unified the same way.
+//
+// Returns true if the function was modified, false otherwise; if there are
+// no return stmts in the Function, ReturnBlock is set to a null pointer.
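+//
+// A sketch of the effect on hypothetical IR:
+//   bb1: ret i32 %a          bb2: ret i32 %b
+// becomes
+//   bb1: br label %UnifiedReturnBlock
+//   bb2: br label %UnifiedReturnBlock
+//   UnifiedReturnBlock:
+//     %UnifiedRetVal = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
+//     ret i32 %UnifiedRetVal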
+//
+bool UnifyFunctionExitNodes::runOnFunction(Function &F) {
+ // Loop over all of the blocks in a function, tracking all of the blocks that
+ // return.
+ //
+ std::vector<BasicBlock*> ReturningBlocks;
+ std::vector<BasicBlock*> UnwindingBlocks;
+ std::vector<BasicBlock*> UnreachableBlocks;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ if (isa<ReturnInst>(I->getTerminator()))
+ ReturningBlocks.push_back(I);
+ else if (isa<UnwindInst>(I->getTerminator()))
+ UnwindingBlocks.push_back(I);
+ else if (isa<UnreachableInst>(I->getTerminator()))
+ UnreachableBlocks.push_back(I);
+
+ // Handle unwinding blocks first.
+ if (UnwindingBlocks.empty()) {
+ UnwindBlock = 0;
+ } else if (UnwindingBlocks.size() == 1) {
+ UnwindBlock = UnwindingBlocks.front();
+ } else {
+ UnwindBlock = BasicBlock::Create("UnifiedUnwindBlock", &F);
+ new UnwindInst(UnwindBlock);
+
+ for (std::vector<BasicBlock*>::iterator I = UnwindingBlocks.begin(),
+ E = UnwindingBlocks.end(); I != E; ++I) {
+ BasicBlock *BB = *I;
+ BB->getInstList().pop_back(); // Remove the unwind insn
+ BranchInst::Create(UnwindBlock, BB);
+ }
+ }
+
+ // Then unreachable blocks.
+ if (UnreachableBlocks.empty()) {
+ UnreachableBlock = 0;
+ } else if (UnreachableBlocks.size() == 1) {
+ UnreachableBlock = UnreachableBlocks.front();
+ } else {
+ UnreachableBlock = BasicBlock::Create("UnifiedUnreachableBlock", &F);
+ new UnreachableInst(UnreachableBlock);
+
+ for (std::vector<BasicBlock*>::iterator I = UnreachableBlocks.begin(),
+ E = UnreachableBlocks.end(); I != E; ++I) {
+ BasicBlock *BB = *I;
+ BB->getInstList().pop_back(); // Remove the unreachable inst.
+ BranchInst::Create(UnreachableBlock, BB);
+ }
+ }
+
+ // Now handle return blocks.
+ if (ReturningBlocks.empty()) {
+ ReturnBlock = 0;
+ return false; // No blocks return
+ } else if (ReturningBlocks.size() == 1) {
+ ReturnBlock = ReturningBlocks.front(); // Already has a single return block
+ return false;
+ }
+
+ // Otherwise, we need to insert a new basic block into the function, add a PHI
+ // node (if the function returns a value), and convert all of the return
+ // instructions into unconditional branches.
+ //
+ BasicBlock *NewRetBlock = BasicBlock::Create("UnifiedReturnBlock", &F);
+
+ PHINode *PN = 0;
+ if (F.getReturnType() == Type::VoidTy) {
+ ReturnInst::Create(NULL, NewRetBlock);
+ } else {
+ // If the function doesn't return void... add a PHI node to the block...
+ PN = PHINode::Create(F.getReturnType(), "UnifiedRetVal");
+ NewRetBlock->getInstList().push_back(PN);
+ ReturnInst::Create(PN, NewRetBlock);
+ }
+
+ // Loop over all of the blocks, replacing the return instruction with an
+ // unconditional branch.
+ //
+ for (std::vector<BasicBlock*>::iterator I = ReturningBlocks.begin(),
+ E = ReturningBlocks.end(); I != E; ++I) {
+ BasicBlock *BB = *I;
+
+ // Add an incoming element to the PHI node for every return instruction that
+ // is merging into this new block...
+ if (PN)
+ PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+ BB->getInstList().pop_back(); // Remove the return insn
+ BranchInst::Create(NewRetBlock, BB);
+ }
+ ReturnBlock = NewRetBlock;
+ return true;
+}
diff --git a/lib/Transforms/Utils/UnrollLoop.cpp b/lib/Transforms/Utils/UnrollLoop.cpp
new file mode 100644
index 0000000..caef7ec
--- /dev/null
+++ b/lib/Transforms/Utils/UnrollLoop.cpp
@@ -0,0 +1,369 @@
+//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities. It does not define any
+// actual pass or policy, but provides a single function to perform loop
+// unrolling.
+//
+// It works best when loops have been canonicalized by the -indvars pass,
+// allowing it to determine the trip counts of loops easily.
+//
+// The process of unrolling can produce extraneous basic blocks linked with
+// unconditional branches. This will be corrected in the future.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cstdio>
+
+using namespace llvm;
+
+// TODO: Should these be here or in LoopUnroll?
+STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
+STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
+
+/// RemapInstruction - Convert the instruction operands from referencing the
+/// current values into those specified by ValueMap.
+static inline void RemapInstruction(Instruction *I,
+ DenseMap<const Value *, Value*> &ValueMap) {
+ for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
+ Value *Op = I->getOperand(op);
+ DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op);
+ if (It != ValueMap.end()) Op = It->second;
+ I->setOperand(op, Op);
+ }
+}
+
+/// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it
+/// only has one predecessor, and that predecessor only has one successor.
+/// The LoopInfo Analysis that is passed will be kept consistent.
+/// Returns the new combined block.
+static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) {
+ // Merge basic blocks into their predecessor if there is only one distinct
+ // pred, and if there is only one distinct successor of the predecessor, and
+ // if there are no PHI nodes.
+ BasicBlock *OnlyPred = BB->getSinglePredecessor();
+ if (!OnlyPred) return 0;
+
+ if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
+ return 0;
+
+ DOUT << "Merging: " << *BB << "into: " << *OnlyPred;
+
+ // Resolve any PHI nodes at the start of the block. They are all
+ // guaranteed to have exactly one entry if they exist, unless there are
+ // multiple duplicate (but guaranteed to be equal) entries for the
+ // incoming edges. This occurs when there are multiple edges from
+ // OnlyPred to OnlySucc.
+ FoldSingleEntryPHINodes(BB);
+
+ // Delete the unconditional branch from the predecessor...
+ OnlyPred->getInstList().pop_back();
+
+ // Move all definitions in the successor to the predecessor...
+ OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
+
+ // Make all PHI nodes that referred to BB now refer to Pred as their
+ // source...
+ BB->replaceAllUsesWith(OnlyPred);
+
+ std::string OldName = BB->getName();
+
+ // Erase basic block from the function...
+ LI->removeBlock(BB);
+ BB->eraseFromParent();
+
+ // Inherit predecessor's name if it exists...
+ if (!OldName.empty() && !OnlyPred->hasName())
+ OnlyPred->setName(OldName);
+
+ return OnlyPred;
+}
+
+/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns
+/// true if unrolling was successful, or false if the loop was unmodified.
+/// Unrolling can only fail when the loop's latch block is not terminated by a
+/// conditional branch instruction. However, if the trip count (and multiple)
+/// are not known, loop unrolling will mostly produce larger code that is no
+/// faster.
+///
+/// The LoopInfo Analysis that is passed will be kept consistent.
+///
+/// If a LoopPassManager is passed in, and the loop is fully removed, it will be
+/// removed from the LoopPassManager as well. LPM can also be NULL.
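+///
+/// A typical invocation from a loop pass, as a sketch (the factor 4 here is
+/// an arbitrary illustrative choice):
+///
+///   UnrollLoop(L, 4, &getAnalysis<LoopInfo>(), &LPM);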
+bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo *LI,
+                      LPPassManager *LPM) {
+ assert(L->isLCSSAForm());
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+
+ if (!BI || BI->isUnconditional()) {
+ // The loop-rotate pass can be helpful to avoid this in many cases.
+ DOUT << " Can't unroll; loop not terminated by a conditional branch.\n";
+ return false;
+ }
+
+ // Find trip count
+ unsigned TripCount = L->getSmallConstantTripCount();
+ // Find trip multiple if count is not available
+ unsigned TripMultiple = 1;
+ if (TripCount == 0)
+ TripMultiple = L->getSmallConstantTripMultiple();
+
+ if (TripCount != 0)
+ DOUT << " Trip Count = " << TripCount << "\n";
+ if (TripMultiple != 1)
+ DOUT << " Trip Multiple = " << TripMultiple << "\n";
+
+  // Effectively "DCE" unrolled iterations that are beyond the trip count
+  // and will never be executed.
+ if (TripCount != 0 && Count > TripCount)
+ Count = TripCount;
+
+ assert(Count > 0);
+ assert(TripMultiple > 0);
+ assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+ // Are we eliminating the loop control altogether?
+ bool CompletelyUnroll = Count == TripCount;
+
+  // If we know the trip count, the multiple is irrelevant; compute the
+  // breakout trip directly from it.
+ unsigned BreakoutTrip = 0;
+ if (TripCount != 0) {
+ BreakoutTrip = TripCount % Count;
+ TripMultiple = 0;
+ } else {
+ // Figure out what multiple to use.
+ BreakoutTrip = TripMultiple =
+ (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
+ }
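+
+  // As a worked example with illustrative numbers: for TripCount = 10 and
+  // Count = 4, BreakoutTrip is 10 % 4 == 2, so only the latch of the second
+  // unrolled copy keeps its conditional branch; the loop exits there on the
+  // tenth trip. If instead only a multiple is known, e.g. Count = 4 with
+  // TripMultiple = 6, the GCD yields BreakoutTrip = TripMultiple = 2.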
+
+ if (CompletelyUnroll) {
+ DOUT << "COMPLETELY UNROLLING loop %" << Header->getName()
+ << " with trip count " << TripCount << "!\n";
+ } else {
+ DOUT << "UNROLLING loop %" << Header->getName()
+ << " by " << Count;
+ if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
+ DOUT << " with a breakout at trip " << BreakoutTrip;
+ } else if (TripMultiple != 1) {
+ DOUT << " with " << TripMultiple << " trips per branch";
+ }
+ DOUT << "!\n";
+ }
+
+ std::vector<BasicBlock*> LoopBlocks = L->getBlocks();
+
+ bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+ BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+
+ // For the first iteration of the loop, we should use the precloned values for
+ // PHI nodes. Insert associations now.
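+  // Note that seeding LastValueMap below with the identity mapping (I -> I)
+  // lets the first cloned iteration pick up the original loop's values for
+  // loop-carried PHIs.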
+ typedef DenseMap<const Value*, Value*> ValueMapTy;
+ ValueMapTy LastValueMap;
+ std::vector<PHINode*> OrigPHINode;
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ OrigPHINode.push_back(PN);
+ if (Instruction *I =
+ dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock)))
+ if (L->contains(I->getParent()))
+ LastValueMap[I] = I;
+ }
+
+ std::vector<BasicBlock*> Headers;
+ std::vector<BasicBlock*> Latches;
+ Headers.push_back(Header);
+ Latches.push_back(LatchBlock);
+
+ for (unsigned It = 1; It != Count; ++It) {
+ char SuffixBuffer[100];
+ sprintf(SuffixBuffer, ".%d", It);
+
+ std::vector<BasicBlock*> NewBlocks;
+
+ for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(),
+ E = LoopBlocks.end(); BB != E; ++BB) {
+ ValueMapTy ValueMap;
+ BasicBlock *New = CloneBasicBlock(*BB, ValueMap, SuffixBuffer);
+ Header->getParent()->getBasicBlockList().push_back(New);
+
+ // Loop over all of the PHI nodes in the block, changing them to use the
+ // incoming values from the previous block.
+ if (*BB == Header)
+ for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+ PHINode *NewPHI = cast<PHINode>(ValueMap[OrigPHINode[i]]);
+ Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal))
+ if (It > 1 && L->contains(InValI->getParent()))
+ InVal = LastValueMap[InValI];
+ ValueMap[OrigPHINode[i]] = InVal;
+ New->getInstList().erase(NewPHI);
+ }
+
+ // Update our running map of newest clones
+ LastValueMap[*BB] = New;
+ for (ValueMapTy::iterator VI = ValueMap.begin(), VE = ValueMap.end();
+ VI != VE; ++VI)
+ LastValueMap[VI->first] = VI->second;
+
+ L->addBasicBlockToLoop(New, LI->getBase());
+
+      // Add phi entries for newly created values to all exit blocks except
+      // the successor of the latch block; the PHI nodes in that block are
+      // updated separately once unrolling completes.
+ if (*BB != LatchBlock)
+ for (Value::use_iterator UI = (*BB)->use_begin(), UE = (*BB)->use_end();
+ UI != UE;) {
+ Instruction *UseInst = cast<Instruction>(*UI);
+ ++UI;
+ if (isa<PHINode>(UseInst) && !L->contains(UseInst->getParent())) {
+ PHINode *phi = cast<PHINode>(UseInst);
+ Value *Incoming = phi->getIncomingValueForBlock(*BB);
+ phi->addIncoming(Incoming, New);
+ }
+ }
+
+ // Keep track of new headers and latches as we create them, so that
+ // we can insert the proper branches later.
+ if (*BB == Header)
+ Headers.push_back(New);
+ if (*BB == LatchBlock) {
+ Latches.push_back(New);
+
+        // Also, clear out the new latch's back edge so that it doesn't look
+        // like a new loop and is amenable to being merged with adjacent
+        // blocks later on.
+ TerminatorInst *Term = New->getTerminator();
+ assert(L->contains(Term->getSuccessor(!ContinueOnTrue)));
+ assert(Term->getSuccessor(ContinueOnTrue) == LoopExit);
+ Term->setSuccessor(!ContinueOnTrue, NULL);
+ }
+
+ NewBlocks.push_back(New);
+ }
+
+ // Remap all instructions in the most recent iteration
+ for (unsigned i = 0; i < NewBlocks.size(); ++i)
+ for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+ E = NewBlocks[i]->end(); I != E; ++I)
+ RemapInstruction(I, LastValueMap);
+ }
+
+ // The latch block exits the loop. If there are any PHI nodes in the
+ // successor blocks, update them to use the appropriate values computed as the
+ // last iteration of the loop.
+ if (Count != 1) {
+ SmallPtrSet<PHINode*, 8> Users;
+ for (Value::use_iterator UI = LatchBlock->use_begin(),
+ UE = LatchBlock->use_end(); UI != UE; ++UI)
+ if (PHINode *phi = dyn_cast<PHINode>(*UI))
+ Users.insert(phi);
+
+ BasicBlock *LastIterationBB = cast<BasicBlock>(LastValueMap[LatchBlock]);
+ for (SmallPtrSet<PHINode*,8>::iterator SI = Users.begin(), SE = Users.end();
+ SI != SE; ++SI) {
+ PHINode *PN = *SI;
+ Value *InVal = PN->removeIncomingValue(LatchBlock, false);
+ // If this value was defined in the loop, take the value defined by the
+ // last iteration of the loop.
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
+ if (L->contains(InValI->getParent()))
+ InVal = LastValueMap[InVal];
+ }
+ PN->addIncoming(InVal, LastIterationBB);
+ }
+ }
+
+ // Now, if we're doing complete unrolling, loop over the PHI nodes in the
+ // original block, setting them to their incoming values.
+ if (CompletelyUnroll) {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+ PHINode *PN = OrigPHINode[i];
+ PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
+ Header->getInstList().erase(PN);
+ }
+ }
+
+ // Now that all the basic blocks for the unrolled iterations are in place,
+ // set up the branches to connect them.
+ for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+ // The original branch was replicated in each unrolled iteration.
+ BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
+
+ // The branch destination.
+ unsigned j = (i + 1) % e;
+ BasicBlock *Dest = Headers[j];
+ bool NeedConditional = true;
+
+ // For a complete unroll, make the last iteration end with a branch
+ // to the exit block.
+ if (CompletelyUnroll && j == 0) {
+ Dest = LoopExit;
+ NeedConditional = false;
+ }
+
+ // If we know the trip count or a multiple of it, we can safely use an
+ // unconditional branch for some iterations.
+ if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+ NeedConditional = false;
+ }
+
+ if (NeedConditional) {
+ // Update the conditional branch's successor for the following
+ // iteration.
+ Term->setSuccessor(!ContinueOnTrue, Dest);
+ } else {
+ Term->setUnconditionalDest(Dest);
+ // Merge adjacent basic blocks, if possible.
+ if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI)) {
+ std::replace(Latches.begin(), Latches.end(), Dest, Fold);
+ std::replace(Headers.begin(), Headers.end(), Dest, Fold);
+ }
+ }
+ }
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
+ for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
+ BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
+ Instruction *Inst = I++;
+
+ if (isInstructionTriviallyDead(Inst))
+ (*BB)->getInstList().erase(Inst);
+ else if (Constant *C = ConstantFoldInstruction(Inst)) {
+ Inst->replaceAllUsesWith(C);
+ (*BB)->getInstList().erase(Inst);
+ }
+ }
+
+ NumCompletelyUnrolled += CompletelyUnroll;
+ ++NumUnrolled;
+ // Remove the loop from the LoopPassManager if it's completely removed.
+ if (CompletelyUnroll && LPM != NULL)
+ LPM->deleteLoopFromQueue(L);
+
+ // If we didn't completely unroll the loop, it should still be in LCSSA form.
+ if (!CompletelyUnroll)
+ assert(L->isLCSSAForm());
+
+ return true;
+}
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
new file mode 100644
index 0000000..20b676d
--- /dev/null
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -0,0 +1,143 @@
+//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MapValue function, which is shared by various parts of
+// the lib/Transforms/Utils library.
+//
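+//
+// A typical use, as a sketch: seed a ValueMapTy with the old-to-new mappings
+// produced while cloning (e.g. by CloneBasicBlock), then call RemapInstruction
+// on each cloned instruction so that its operands refer to the new values.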
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instruction.h"
+#include "llvm/MDNode.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+Value *llvm::MapValue(const Value *V, ValueMapTy &VM) {
+ Value *&VMSlot = VM[V];
+ if (VMSlot) return VMSlot; // Does it exist in the map yet?
+
+ // NOTE: VMSlot can be invalidated by any reference to VM, which can grow the
+ // DenseMap. This includes any recursive calls to MapValue.
+
+ // Global values do not need to be seeded into the ValueMap if they are using
+ // the identity mapping.
+ if (isa<GlobalValue>(V) || isa<InlineAsm>(V))
+ return VMSlot = const_cast<Value*>(V);
+
+ if (Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V))) {
+ if (isa<ConstantInt>(C) || isa<ConstantFP>(C) ||
+ isa<ConstantPointerNull>(C) || isa<ConstantAggregateZero>(C) ||
+ isa<UndefValue>(C) || isa<MDString>(C))
+ return VMSlot = C; // Primitive constants map directly
+ else if (ConstantArray *CA = dyn_cast<ConstantArray>(C)) {
+ for (User::op_iterator b = CA->op_begin(), i = b, e = CA->op_end();
+ i != e; ++i) {
+ Value *MV = MapValue(*i, VM);
+ if (MV != *i) {
+ // This array must contain a reference to a global, make a new array
+ // and return it.
+ //
+ std::vector<Constant*> Values;
+ Values.reserve(CA->getNumOperands());
+ for (User::op_iterator j = b; j != i; ++j)
+ Values.push_back(cast<Constant>(*j));
+ Values.push_back(cast<Constant>(MV));
+ for (++i; i != e; ++i)
+ Values.push_back(cast<Constant>(MapValue(*i, VM)));
+ return VM[V] = ConstantArray::get(CA->getType(), Values);
+ }
+ }
+ return VM[V] = C;
+
+ } else if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) {
+ for (User::op_iterator b = CS->op_begin(), i = b, e = CS->op_end();
+ i != e; ++i) {
+ Value *MV = MapValue(*i, VM);
+ if (MV != *i) {
+ // This struct must contain a reference to a global, make a new struct
+ // and return it.
+ //
+ std::vector<Constant*> Values;
+ Values.reserve(CS->getNumOperands());
+ for (User::op_iterator j = b; j != i; ++j)
+ Values.push_back(cast<Constant>(*j));
+ Values.push_back(cast<Constant>(MV));
+ for (++i; i != e; ++i)
+ Values.push_back(cast<Constant>(MapValue(*i, VM)));
+ return VM[V] = ConstantStruct::get(CS->getType(), Values);
+ }
+ }
+ return VM[V] = C;
+
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ std::vector<Constant*> Ops;
+ for (User::op_iterator i = CE->op_begin(), e = CE->op_end(); i != e; ++i)
+ Ops.push_back(cast<Constant>(MapValue(*i, VM)));
+ return VM[V] = CE->getWithOperands(Ops);
+ } else if (ConstantVector *CP = dyn_cast<ConstantVector>(C)) {
+ for (User::op_iterator b = CP->op_begin(), i = b, e = CP->op_end();
+ i != e; ++i) {
+ Value *MV = MapValue(*i, VM);
+ if (MV != *i) {
+ // This vector value must contain a reference to a global, make a new
+ // vector constant and return it.
+ //
+ std::vector<Constant*> Values;
+ Values.reserve(CP->getNumOperands());
+ for (User::op_iterator j = b; j != i; ++j)
+ Values.push_back(cast<Constant>(*j));
+ Values.push_back(cast<Constant>(MV));
+ for (++i; i != e; ++i)
+ Values.push_back(cast<Constant>(MapValue(*i, VM)));
+ return VM[V] = ConstantVector::get(Values);
+ }
+ }
+ return VM[V] = C;
+
+ } else if (MDNode *N = dyn_cast<MDNode>(C)) {
+ for (MDNode::const_elem_iterator b = N->elem_begin(), i = b,
+ e = N->elem_end(); i != e; ++i) {
+ if (!*i) continue;
+
+ Value *MV = MapValue(*i, VM);
+ if (MV != *i) {
+ // This MDNode must contain a reference to a global, make a new MDNode
+ // and return it.
+ SmallVector<Value*, 8> Values;
+ Values.reserve(N->getNumElements());
+ for (MDNode::const_elem_iterator j = b; j != i; ++j)
+ Values.push_back(*j);
+ Values.push_back(MV);
+ for (++i; i != e; ++i)
+ Values.push_back(MapValue(*i, VM));
+ return VM[V] = MDNode::get(Values.data(), Values.size());
+ }
+ }
+ return VM[V] = C;
+
+ } else {
+ assert(0 && "Unknown type of constant!");
+ }
+ }
+
+ return 0;
+}
+
+/// RemapInstruction - Convert the instruction operands from referencing the
+/// current values into those specified by ValueMap.
+///
+void llvm::RemapInstruction(Instruction *I, ValueMapTy &ValueMap) {
+ for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) {
+ Value *V = MapValue(*op, ValueMap);
+ assert(V && "Referenced value not in value map!");
+ *op = V;
+ }
+}
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
new file mode 100644
index 0000000..6b369b6
--- /dev/null
+++ b/lib/VMCore/AsmWriter.cpp
@@ -0,0 +1,1880 @@
+//===-- AsmWriter.cpp - Printing LLVM as an assembly file -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library implements the functionality defined in llvm/Assembly/Writer.h
+//
+// Note that these routines must be extremely tolerant of various errors in the
+// LLVM code, because they can be used for debugging transformations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/Assembly/AsmAnnotationWriter.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/MDNode.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+// Make the virtual table appear in this compilation unit.
+AssemblyAnnotationWriter::~AssemblyAnnotationWriter() {}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+static const Module *getModuleFromVal(const Value *V) {
+ if (const Argument *MA = dyn_cast<Argument>(V))
+ return MA->getParent() ? MA->getParent()->getParent() : 0;
+
+ if (const BasicBlock *BB = dyn_cast<BasicBlock>(V))
+ return BB->getParent() ? BB->getParent()->getParent() : 0;
+
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ const Function *M = I->getParent() ? I->getParent()->getParent() : 0;
+ return M ? M->getParent() : 0;
+ }
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+ return GV->getParent();
+ return 0;
+}
+
+// PrintEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
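+// For example, the string "hi\"\n" prints as hi\22\0A, since 0x22 and 0x0A
+// are the hex codes for '"' and '\n'.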
+static void PrintEscapedString(const char *Str, unsigned Length,
+ raw_ostream &Out) {
+ for (unsigned i = 0; i != Length; ++i) {
+ unsigned char C = Str[i];
+ if (isprint(C) && C != '\\' && C != '"')
+ Out << C;
+ else
+ Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F);
+ }
+}
+
+// PrintEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
+static void PrintEscapedString(const std::string &Str, raw_ostream &Out) {
+ PrintEscapedString(Str.c_str(), Str.size(), Out);
+}
+
+enum PrefixType {
+ GlobalPrefix,
+ LabelPrefix,
+ LocalPrefix,
+ NoPrefix
+};
+
+/// PrintLLVMName - Turn the specified name into an 'LLVM name', which is either
+/// prefixed with % (if the string only contains simple characters) or is
+/// surrounded with ""'s (if it has special chars in it). Print it out.
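+///
+/// For example, a local value named "foo.bar" prints as %foo.bar, while one
+/// named "foo bar" (or any name beginning with a digit) is quoted: %"foo bar".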
+static void PrintLLVMName(raw_ostream &OS, const char *NameStr,
+ unsigned NameLen, PrefixType Prefix) {
+ assert(NameStr && "Cannot get empty name!");
+ switch (Prefix) {
+ default: assert(0 && "Bad prefix!");
+ case NoPrefix: break;
+ case GlobalPrefix: OS << '@'; break;
+ case LabelPrefix: break;
+ case LocalPrefix: OS << '%'; break;
+ }
+
+ // Scan the name to see if it needs quotes first.
+ bool NeedsQuotes = isdigit(NameStr[0]);
+ if (!NeedsQuotes) {
+ for (unsigned i = 0; i != NameLen; ++i) {
+ char C = NameStr[i];
+ if (!isalnum(C) && C != '-' && C != '.' && C != '_') {
+ NeedsQuotes = true;
+ break;
+ }
+ }
+ }
+
+ // If we didn't need any quotes, just write out the name in one blast.
+ if (!NeedsQuotes) {
+ OS.write(NameStr, NameLen);
+ return;
+ }
+
+ // Okay, we need quotes. Output the quotes and escape any scary characters as
+ // needed.
+ OS << '"';
+ PrintEscapedString(NameStr, NameLen, OS);
+ OS << '"';
+}
+
+/// PrintLLVMName - Turn the specified name into an 'LLVM name', which is either
+/// prefixed with % (if the string only contains simple characters) or is
+/// surrounded with ""'s (if it has special chars in it). Print it out.
+static void PrintLLVMName(raw_ostream &OS, const Value *V) {
+ PrintLLVMName(OS, V->getNameStart(), V->getNameLen(),
+ isa<GlobalValue>(V) ? GlobalPrefix : LocalPrefix);
+}
+
+//===----------------------------------------------------------------------===//
+// TypePrinting Class: Type printing machinery
+//===----------------------------------------------------------------------===//
+
+static DenseMap<const Type *, std::string> &getTypeNamesMap(void *M) {
+ return *static_cast<DenseMap<const Type *, std::string>*>(M);
+}
+
+void TypePrinting::clear() {
+ getTypeNamesMap(TypeNames).clear();
+}
+
+bool TypePrinting::hasTypeName(const Type *Ty) const {
+ return getTypeNamesMap(TypeNames).count(Ty);
+}
+
+void TypePrinting::addTypeName(const Type *Ty, const std::string &N) {
+ getTypeNamesMap(TypeNames).insert(std::make_pair(Ty, N));
+}
+
+
+TypePrinting::TypePrinting() {
+ TypeNames = new DenseMap<const Type *, std::string>();
+}
+
+TypePrinting::~TypePrinting() {
+ delete &getTypeNamesMap(TypeNames);
+}
+
+/// CalcTypeName - Write the specified type to the specified raw_ostream, making
+/// use of type names or up references to shorten the type name where possible.
+void TypePrinting::CalcTypeName(const Type *Ty,
+ SmallVectorImpl<const Type *> &TypeStack,
+ raw_ostream &OS, bool IgnoreTopLevelName) {
+ // Check to see if the type is named.
+ if (!IgnoreTopLevelName) {
+ DenseMap<const Type *, std::string> &TM = getTypeNamesMap(TypeNames);
+ DenseMap<const Type *, std::string>::iterator I = TM.find(Ty);
+ if (I != TM.end()) {
+ OS << I->second;
+ return;
+ }
+ }
+
+ // Check to see if the Type is already on the stack...
+ unsigned Slot = 0, CurSize = TypeStack.size();
+ while (Slot < CurSize && TypeStack[Slot] != Ty) ++Slot; // Scan for type
+
+ // This is another base case for the recursion. In this case, we know
+ // that we have looped back to a type that we have previously visited.
+ // Generate the appropriate upreference to handle this.
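+  // For example, with type names ignored, the recursive type
+  //   %T = type { %T* }
+  // prints as { \2* }: the \2 refers two levels up the stack, back to the
+  // struct itself.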
+ if (Slot < CurSize) {
+ OS << '\\' << unsigned(CurSize-Slot); // Here's the upreference
+ return;
+ }
+
+ TypeStack.push_back(Ty); // Recursive case: Add us to the stack..
+
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: OS << "void"; break;
+ case Type::FloatTyID: OS << "float"; break;
+ case Type::DoubleTyID: OS << "double"; break;
+ case Type::X86_FP80TyID: OS << "x86_fp80"; break;
+ case Type::FP128TyID: OS << "fp128"; break;
+ case Type::PPC_FP128TyID: OS << "ppc_fp128"; break;
+ case Type::LabelTyID: OS << "label"; break;
+ case Type::MetadataTyID: OS << "metadata"; break;
+ case Type::IntegerTyID:
+ OS << 'i' << cast<IntegerType>(Ty)->getBitWidth();
+ break;
+
+ case Type::FunctionTyID: {
+ const FunctionType *FTy = cast<FunctionType>(Ty);
+ CalcTypeName(FTy->getReturnType(), TypeStack, OS);
+ OS << " (";
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I) {
+ if (I != FTy->param_begin())
+ OS << ", ";
+ CalcTypeName(*I, TypeStack, OS);
+ }
+ if (FTy->isVarArg()) {
+ if (FTy->getNumParams()) OS << ", ";
+ OS << "...";
+ }
+ OS << ')';
+ break;
+ }
+ case Type::StructTyID: {
+ const StructType *STy = cast<StructType>(Ty);
+ if (STy->isPacked())
+ OS << '<';
+ OS << "{ ";
+ for (StructType::element_iterator I = STy->element_begin(),
+ E = STy->element_end(); I != E; ++I) {
+ CalcTypeName(*I, TypeStack, OS);
+ if (next(I) != STy->element_end())
+ OS << ',';
+ OS << ' ';
+ }
+ OS << '}';
+ if (STy->isPacked())
+ OS << '>';
+ break;
+ }
+ case Type::PointerTyID: {
+ const PointerType *PTy = cast<PointerType>(Ty);
+ CalcTypeName(PTy->getElementType(), TypeStack, OS);
+ if (unsigned AddressSpace = PTy->getAddressSpace())
+ OS << " addrspace(" << AddressSpace << ')';
+ OS << '*';
+ break;
+ }
+ case Type::ArrayTyID: {
+ const ArrayType *ATy = cast<ArrayType>(Ty);
+ OS << '[' << ATy->getNumElements() << " x ";
+ CalcTypeName(ATy->getElementType(), TypeStack, OS);
+ OS << ']';
+ break;
+ }
+ case Type::VectorTyID: {
+ const VectorType *PTy = cast<VectorType>(Ty);
+ OS << "<" << PTy->getNumElements() << " x ";
+ CalcTypeName(PTy->getElementType(), TypeStack, OS);
+ OS << '>';
+ break;
+ }
+ case Type::OpaqueTyID:
+ OS << "opaque";
+ break;
+ default:
+ OS << "<unrecognized-type>";
+ break;
+ }
+
+ TypeStack.pop_back(); // Remove self from stack.
+}
+
+/// print - The internal guts of printing out a type that has a
+/// potentially named portion.
+///
+void TypePrinting::print(const Type *Ty, raw_ostream &OS,
+ bool IgnoreTopLevelName) {
+ // Check to see if the type is named.
+ DenseMap<const Type*, std::string> &TM = getTypeNamesMap(TypeNames);
+ if (!IgnoreTopLevelName) {
+ DenseMap<const Type*, std::string>::iterator I = TM.find(Ty);
+ if (I != TM.end()) {
+ OS << I->second;
+ return;
+ }
+ }
+
+ // Otherwise we have a type that has not been named but is a derived type.
+ // Carefully recurse the type hierarchy to print out any contained symbolic
+ // names.
+ SmallVector<const Type *, 16> TypeStack;
+ std::string TypeName;
+
+ raw_string_ostream TypeOS(TypeName);
+ CalcTypeName(Ty, TypeStack, TypeOS, IgnoreTopLevelName);
+ OS << TypeOS.str();
+
+ // Cache type name for later use.
+ if (!IgnoreTopLevelName)
+ TM.insert(std::make_pair(Ty, TypeOS.str()));
+}
+
+namespace {
+ class TypeFinder {
+ // To avoid walking constant expressions multiple times and other IR
+ // objects, we keep several helper maps.
+ DenseSet<const Value*> VisitedConstants;
+ DenseSet<const Type*> VisitedTypes;
+
+ TypePrinting &TP;
+ std::vector<const Type*> &NumberedTypes;
+ public:
+ TypeFinder(TypePrinting &tp, std::vector<const Type*> &numberedTypes)
+ : TP(tp), NumberedTypes(numberedTypes) {}
+
+ void Run(const Module &M) {
+      // Get types from the type symbol table. This gets opaque types
+      // referenced only through derived named types.
+ const TypeSymbolTable &ST = M.getTypeSymbolTable();
+ for (TypeSymbolTable::const_iterator TI = ST.begin(), E = ST.end();
+ TI != E; ++TI)
+ IncorporateType(TI->second);
+
+ // Get types from global variables.
+ for (Module::const_global_iterator I = M.global_begin(),
+ E = M.global_end(); I != E; ++I) {
+ IncorporateType(I->getType());
+ if (I->hasInitializer())
+ IncorporateValue(I->getInitializer());
+ }
+
+ // Get types from aliases.
+ for (Module::const_alias_iterator I = M.alias_begin(),
+ E = M.alias_end(); I != E; ++I) {
+ IncorporateType(I->getType());
+ IncorporateValue(I->getAliasee());
+ }
+
+ // Get types from functions.
+ for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
+ IncorporateType(FI->getType());
+
+ for (Function::const_iterator BB = FI->begin(), E = FI->end();
+ BB != E;++BB)
+ for (BasicBlock::const_iterator II = BB->begin(),
+ E = BB->end(); II != E; ++II) {
+ const Instruction &I = *II;
+ // Incorporate the type of the instruction and all its operands.
+ IncorporateType(I.getType());
+ for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
+ OI != OE; ++OI)
+ IncorporateValue(*OI);
+ }
+ }
+ }
+
+ private:
+ void IncorporateType(const Type *Ty) {
+      // Check to see if we've already visited this type.
+ if (!VisitedTypes.insert(Ty).second)
+ return;
+
+ // If this is a structure or opaque type, add a name for the type.
+ if (((isa<StructType>(Ty) && cast<StructType>(Ty)->getNumElements())
+ || isa<OpaqueType>(Ty)) && !TP.hasTypeName(Ty)) {
+ TP.addTypeName(Ty, "%"+utostr(unsigned(NumberedTypes.size())));
+ NumberedTypes.push_back(Ty);
+ }
+
+ // Recursively walk all contained types.
+ for (Type::subtype_iterator I = Ty->subtype_begin(),
+ E = Ty->subtype_end(); I != E; ++I)
+ IncorporateType(*I);
+ }
+
+ /// IncorporateValue - This method is used to walk operand lists finding
+ /// types hiding in constant expressions and other operands that won't be
+ /// walked in other ways. GlobalValues, basic blocks, instructions, and
+ /// inst operands are all explicitly enumerated.
+ void IncorporateValue(const Value *V) {
+ if (V == 0 || !isa<Constant>(V) || isa<GlobalValue>(V)) return;
+
+ // Already visited?
+ if (!VisitedConstants.insert(V).second)
+ return;
+
+ // Check this type.
+ IncorporateType(V->getType());
+
+ // Look in operands for types.
+ const Constant *C = cast<Constant>(V);
+ for (Constant::const_op_iterator I = C->op_begin(),
+ E = C->op_end(); I != E;++I)
+ IncorporateValue(*I);
+ }
+ };
+} // end anonymous namespace
+
+
+/// AddModuleTypesToPrinter - Add all of the symbolic type names for types in
+/// the specified module to the TypePrinter and all numbered types to it and the
+/// NumberedTypes table.
+static void AddModuleTypesToPrinter(TypePrinting &TP,
+ std::vector<const Type*> &NumberedTypes,
+ const Module *M) {
+ if (M == 0) return;
+
+ // If the module has a symbol table, take all global types and stuff their
+ // names into the TypeNames map.
+ const TypeSymbolTable &ST = M->getTypeSymbolTable();
+ for (TypeSymbolTable::const_iterator TI = ST.begin(), E = ST.end();
+ TI != E; ++TI) {
+ const Type *Ty = cast<Type>(TI->second);
+
+ // As a heuristic, don't insert pointer to primitive types, because
+ // they are used too often to have a single useful name.
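+    // (For instance, giving i32* a name would make every i32* in the module
+    // print under that name, which obscures more than it helps.)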
+ if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) {
+ const Type *PETy = PTy->getElementType();
+ if ((PETy->isPrimitiveType() || PETy->isInteger()) &&
+ !isa<OpaqueType>(PETy))
+ continue;
+ }
+
+ // Likewise don't insert primitives either.
+ if (Ty->isInteger() || Ty->isPrimitiveType())
+ continue;
+
+ // Get the name as a string and insert it into TypeNames.
+ std::string NameStr;
+ raw_string_ostream NameOS(NameStr);
+ PrintLLVMName(NameOS, TI->first.c_str(), TI->first.length(), LocalPrefix);
+ TP.addTypeName(Ty, NameOS.str());
+ }
+
+ // Walk the entire module to find references to unnamed structure and opaque
+ // types. This is required for correctness by opaque types (because multiple
+  // uses of an unnamed opaque type need to be referred to by the same ID) and
+ // it shrinks complex recursive structure types substantially in some cases.
+ TypeFinder(TP, NumberedTypes).Run(*M);
+}
+
+
+/// WriteTypeSymbolic - This attempts to write the specified type as a symbolic
+/// type, iff there is an entry in the module's symbol table for the specified
+/// type or one of its component types.
+///
+void llvm::WriteTypeSymbolic(raw_ostream &OS, const Type *Ty, const Module *M) {
+ TypePrinting Printer;
+ std::vector<const Type*> NumberedTypes;
+ AddModuleTypesToPrinter(Printer, NumberedTypes, M);
+ Printer.print(Ty, OS);
+}
+
+//===----------------------------------------------------------------------===//
+// SlotTracker Class: Enumerate slot numbers for unnamed values
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// This class provides computation of slot numbers for LLVM Assembly writing.
+///
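+/// Unnamed values are simply numbered in order of appearance: module-level
+/// slots (printed as @0, @1, ...) for unnamed globals and functions, and
+/// function-level slots (%0, %1, ...) for unnamed arguments, basic blocks,
+/// and instructions.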
+class SlotTracker {
+public:
+ /// ValueMap - A mapping of Values to slot numbers
+ typedef DenseMap<const Value*, unsigned> ValueMap;
+
+private:
+ /// TheModule - The module for which we are holding slot numbers
+ const Module* TheModule;
+
+ /// TheFunction - The function for which we are holding slot numbers
+ const Function* TheFunction;
+ bool FunctionProcessed;
+
+  /// mMap - The slot map for the module-level data
+ ValueMap mMap;
+ unsigned mNext;
+
+  /// fMap - The slot map for the function-level data
+ ValueMap fMap;
+ unsigned fNext;
+
+public:
+ /// Construct from a module
+ explicit SlotTracker(const Module *M);
+  /// Construct from a function, starting out in the function-incorporation
+  /// state.
+ explicit SlotTracker(const Function *F);
+
+  /// Return the slot number of the specified value, or -1 if the value is
+  /// not in the SlotTracker.
+ int getLocalSlot(const Value *V);
+ int getGlobalSlot(const GlobalValue *V);
+
+ /// If you'd like to deal with a function instead of just a module, use
+ /// this method to get its data into the SlotTracker.
+ void incorporateFunction(const Function *F) {
+ TheFunction = F;
+ FunctionProcessed = false;
+ }
+
+ /// After calling incorporateFunction, use this method to remove the
+ /// most recently incorporated function from the SlotTracker. This
+ /// will reset the state of the machine back to just the module contents.
+ void purgeFunction();
+
+ // Implementation Details
+private:
+ /// This function does the actual initialization.
+ inline void initialize();
+
+ /// CreateModuleSlot - Insert the specified GlobalValue* into the slot table.
+ void CreateModuleSlot(const GlobalValue *V);
+
+ /// CreateFunctionSlot - Insert the specified Value* into the slot table.
+ void CreateFunctionSlot(const Value *V);
+
+ /// Add all of the module level global variables (and their initializers)
+ /// and function declarations, but not the contents of those functions.
+ void processModule();
+
+  /// Add all of the function's arguments, basic blocks, and instructions.
+ void processFunction();
+
+ SlotTracker(const SlotTracker &); // DO NOT IMPLEMENT
+ void operator=(const SlotTracker &); // DO NOT IMPLEMENT
+};
+
+} // end anonymous namespace
+
+
+static SlotTracker *createSlotTracker(const Value *V) {
+ if (const Argument *FA = dyn_cast<Argument>(V))
+ return new SlotTracker(FA->getParent());
+
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return new SlotTracker(I->getParent()->getParent());
+
+ if (const BasicBlock *BB = dyn_cast<BasicBlock>(V))
+ return new SlotTracker(BB->getParent());
+
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ return new SlotTracker(GV->getParent());
+
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
+ return new SlotTracker(GA->getParent());
+
+ if (const Function *Func = dyn_cast<Function>(V))
+ return new SlotTracker(Func);
+
+ return 0;
+}
+
+#if 0
+#define ST_DEBUG(X) cerr << X
+#else
+#define ST_DEBUG(X)
+#endif
+
+// Module level constructor. Causes the contents of the Module (sans functions)
+// to be added to the slot table.
+SlotTracker::SlotTracker(const Module *M)
+ : TheModule(M), TheFunction(0), FunctionProcessed(false), mNext(0), fNext(0) {
+}
+
+// Function level constructor. Causes the contents of the Module and the one
+// function provided to be added to the slot table.
+SlotTracker::SlotTracker(const Function *F)
+ : TheModule(F ? F->getParent() : 0), TheFunction(F), FunctionProcessed(false),
+ mNext(0), fNext(0) {
+}
+
+inline void SlotTracker::initialize() {
+ if (TheModule) {
+ processModule();
+    TheModule = 0;  // Prevent re-processing next time we're called.
+ }
+
+ if (TheFunction && !FunctionProcessed)
+ processFunction();
+}
+
+// Iterate through all the global variables, functions, and global
+// variable initializers and create slots for them.
+void SlotTracker::processModule() {
+ ST_DEBUG("begin processModule!\n");
+
+ // Add all of the unnamed global variables to the value table.
+ for (Module::const_global_iterator I = TheModule->global_begin(),
+ E = TheModule->global_end(); I != E; ++I)
+ if (!I->hasName())
+ CreateModuleSlot(I);
+
+ // Add all the unnamed functions to the table.
+ for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+ I != E; ++I)
+ if (!I->hasName())
+ CreateModuleSlot(I);
+
+ ST_DEBUG("end processModule!\n");
+}
+
+
+// Process the arguments, basic blocks, and instructions of a function.
+void SlotTracker::processFunction() {
+ ST_DEBUG("begin processFunction!\n");
+ fNext = 0;
+
+ // Add all the function arguments with no names.
+  for (Function::const_arg_iterator AI = TheFunction->arg_begin(),
+ AE = TheFunction->arg_end(); AI != AE; ++AI)
+ if (!AI->hasName())
+ CreateFunctionSlot(AI);
+
+ ST_DEBUG("Inserting Instructions:\n");
+
+ // Add all of the basic blocks and instructions with no names.
+ for (Function::const_iterator BB = TheFunction->begin(),
+ E = TheFunction->end(); BB != E; ++BB) {
+ if (!BB->hasName())
+ CreateFunctionSlot(BB);
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (I->getType() != Type::VoidTy && !I->hasName())
+ CreateFunctionSlot(I);
+ }
+
+ FunctionProcessed = true;
+
+ ST_DEBUG("end processFunction!\n");
+}
+
+/// Clean up after incorporating a function. This is the only way to get out of
+/// the function incorporation state that affects get*Slot/Create*Slot. Function
+/// incorporation state is indicated by TheFunction != 0.
+void SlotTracker::purgeFunction() {
+ ST_DEBUG("begin purgeFunction!\n");
+ fMap.clear(); // Simply discard the function level map
+ TheFunction = 0;
+ FunctionProcessed = false;
+ ST_DEBUG("end purgeFunction!\n");
+}
+
+/// getGlobalSlot - Get the slot number of a global value.
+int SlotTracker::getGlobalSlot(const GlobalValue *V) {
+ // Check for uninitialized state and do lazy initialization.
+ initialize();
+
+  // Look up the value in the module-level map.
+ ValueMap::iterator MI = mMap.find(V);
+ return MI == mMap.end() ? -1 : (int)MI->second;
+}
+
+
+/// getLocalSlot - Get the slot number for a value that is local to a function.
+int SlotTracker::getLocalSlot(const Value *V) {
+ assert(!isa<Constant>(V) && "Can't get a constant or global slot with this!");
+
+ // Check for uninitialized state and do lazy initialization.
+ initialize();
+
+ ValueMap::iterator FI = fMap.find(V);
+ return FI == fMap.end() ? -1 : (int)FI->second;
+}
+
+
+/// CreateModuleSlot - Insert the specified GlobalValue* into the slot table.
+void SlotTracker::CreateModuleSlot(const GlobalValue *V) {
+ assert(V && "Can't insert a null Value into SlotTracker!");
+ assert(V->getType() != Type::VoidTy && "Doesn't need a slot!");
+ assert(!V->hasName() && "Doesn't need a slot!");
+
+ unsigned DestSlot = mNext++;
+ mMap[V] = DestSlot;
+
+ ST_DEBUG(" Inserting value [" << V->getType() << "] = " << V << " slot=" <<
+ DestSlot << " [");
+ // G = Global, F = Function, A = Alias, o = other
+ ST_DEBUG((isa<GlobalVariable>(V) ? 'G' :
+ (isa<Function>(V) ? 'F' :
+ (isa<GlobalAlias>(V) ? 'A' : 'o'))) << "]\n");
+}
+
+
+/// CreateFunctionSlot - Create a new slot for the specified value if it has
+/// no name.
+void SlotTracker::CreateFunctionSlot(const Value *V) {
+ assert(V->getType() != Type::VoidTy && !V->hasName() &&
+ "Doesn't need a slot!");
+
+ unsigned DestSlot = fNext++;
+ fMap[V] = DestSlot;
+
+ // G = Global, F = Function, o = other
+ ST_DEBUG(" Inserting value [" << V->getType() << "] = " << V << " slot=" <<
+ DestSlot << " [o]\n");
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// AsmWriter Implementation
+//===----------------------------------------------------------------------===//
+
+static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
+ TypePrinting &TypePrinter,
+ SlotTracker *Machine);
+
+
+
+static const char *getPredicateText(unsigned predicate) {
+ const char * pred = "unknown";
+ switch (predicate) {
+ case FCmpInst::FCMP_FALSE: pred = "false"; break;
+ case FCmpInst::FCMP_OEQ: pred = "oeq"; break;
+ case FCmpInst::FCMP_OGT: pred = "ogt"; break;
+ case FCmpInst::FCMP_OGE: pred = "oge"; break;
+ case FCmpInst::FCMP_OLT: pred = "olt"; break;
+ case FCmpInst::FCMP_OLE: pred = "ole"; break;
+ case FCmpInst::FCMP_ONE: pred = "one"; break;
+ case FCmpInst::FCMP_ORD: pred = "ord"; break;
+ case FCmpInst::FCMP_UNO: pred = "uno"; break;
+ case FCmpInst::FCMP_UEQ: pred = "ueq"; break;
+ case FCmpInst::FCMP_UGT: pred = "ugt"; break;
+ case FCmpInst::FCMP_UGE: pred = "uge"; break;
+ case FCmpInst::FCMP_ULT: pred = "ult"; break;
+ case FCmpInst::FCMP_ULE: pred = "ule"; break;
+ case FCmpInst::FCMP_UNE: pred = "une"; break;
+ case FCmpInst::FCMP_TRUE: pred = "true"; break;
+ case ICmpInst::ICMP_EQ: pred = "eq"; break;
+ case ICmpInst::ICMP_NE: pred = "ne"; break;
+ case ICmpInst::ICMP_SGT: pred = "sgt"; break;
+ case ICmpInst::ICMP_SGE: pred = "sge"; break;
+ case ICmpInst::ICMP_SLT: pred = "slt"; break;
+ case ICmpInst::ICMP_SLE: pred = "sle"; break;
+ case ICmpInst::ICMP_UGT: pred = "ugt"; break;
+ case ICmpInst::ICMP_UGE: pred = "uge"; break;
+ case ICmpInst::ICMP_ULT: pred = "ult"; break;
+ case ICmpInst::ICMP_ULE: pred = "ule"; break;
+ }
+ return pred;
+}
+
+static void WriteConstantInt(raw_ostream &Out, const Constant *CV,
+ TypePrinting &TypePrinter, SlotTracker *Machine) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ if (CI->getType() == Type::Int1Ty) {
+ Out << (CI->getZExtValue() ? "true" : "false");
+ return;
+ }
+ Out << CI->getValue();
+ return;
+ }
+
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+ if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEdouble ||
+ &CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle) {
+ // We would like to output the FP constant value in exponential notation,
+ // but we cannot do this if doing so will lose precision. Check here to
+ // make sure that we only output it in exponential format if we can parse
+ // the value back and get the same value.
+ //
+ bool ignored;
+ bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble;
+ double Val = isDouble ? CFP->getValueAPF().convertToDouble() :
+ CFP->getValueAPF().convertToFloat();
+ std::string StrVal = ftostr(CFP->getValueAPF());
+
+ // Check to make sure that the stringized number is not some string like
+ // "Inf" or NaN, that atof will accept, but the lexer will not. Check
+ // that the string matches the "[-+]?[0-9]" regex.
+ //
+ if ((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+ ((StrVal[0] == '-' || StrVal[0] == '+') &&
+ (StrVal[1] >= '0' && StrVal[1] <= '9'))) {
+ // Reparse stringized version!
+ if (atof(StrVal.c_str()) == Val) {
+ Out << StrVal;
+ return;
+ }
+ }
+ // Otherwise we could not reparse it to exactly the same value, so we must
+ // output the string in hexadecimal format! Note that loading and storing
+ // floating point types changes the bits of NaNs on some hosts, notably
+ // x86, so we must not use these types.
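+      // For example, 1.0 as a double has the bit pattern 0x3FF0000000000000,
+      // which is exactly what this path would emit.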
+ assert(sizeof(double) == sizeof(uint64_t) &&
+ "assuming that double is 64 bits!");
+ char Buffer[40];
+ APFloat apf = CFP->getValueAPF();
+ // Floats are represented in ASCII IR as double, convert.
+ if (!isDouble)
+ apf.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+ &ignored);
+ Out << "0x" <<
+ utohex_buffer(uint64_t(apf.bitcastToAPInt().getZExtValue()),
+ Buffer+40);
+ return;
+ }
+
+ // Some form of long double. These appear as a magic letter identifying
+ // the type, then a fixed number of hex digits.
+ Out << "0x";
+ if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) {
+ Out << 'K';
+ // api needed to prevent premature destruction
+ APInt api = CFP->getValueAPF().bitcastToAPInt();
+ const uint64_t* p = api.getRawData();
+ uint64_t word = p[1];
+ int shiftcount=12;
+ int width = api.getBitWidth();
+ for (int j=0; j<width; j+=4, shiftcount-=4) {
+ unsigned int nibble = (word>>shiftcount) & 15;
+ if (nibble < 10)
+ Out << (unsigned char)(nibble + '0');
+ else
+ Out << (unsigned char)(nibble - 10 + 'A');
+ if (shiftcount == 0 && j+4 < width) {
+ word = *p;
+ shiftcount = 64;
+ if (width-j-4 < 64)
+ shiftcount = width-j-4;
+ }
+ }
+ return;
+ } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad)
+ Out << 'L';
+ else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble)
+ Out << 'M';
+ else
+ assert(0 && "Unsupported floating point type");
+ // api needed to prevent premature destruction
+ APInt api = CFP->getValueAPF().bitcastToAPInt();
+ const uint64_t* p = api.getRawData();
+ uint64_t word = *p;
+ int shiftcount=60;
+ int width = api.getBitWidth();
+ for (int j=0; j<width; j+=4, shiftcount-=4) {
+ unsigned int nibble = (word>>shiftcount) & 15;
+ if (nibble < 10)
+ Out << (unsigned char)(nibble + '0');
+ else
+ Out << (unsigned char)(nibble - 10 + 'A');
+ if (shiftcount == 0 && j+4 < width) {
+ word = *(++p);
+ shiftcount = 64;
+ if (width-j-4 < 64)
+ shiftcount = width-j-4;
+ }
+ }
+ return;
+ }
+
+ if (isa<ConstantAggregateZero>(CV)) {
+ Out << "zeroinitializer";
+ return;
+ }
+
+ if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
+ // As a special case, print the array as a string if it is an array of
+ // i8 with ConstantInt values.
+ //
+ const Type *ETy = CA->getType()->getElementType();
+ if (CA->isString()) {
+ Out << "c\"";
+ PrintEscapedString(CA->getAsString(), Out);
+ Out << '"';
+ } else { // Cannot output in string format...
+ Out << '[';
+ if (CA->getNumOperands()) {
+ TypePrinter.print(ETy, Out);
+ Out << ' ';
+ WriteAsOperandInternal(Out, CA->getOperand(0),
+ TypePrinter, Machine);
+ for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) {
+ Out << ", ";
+ TypePrinter.print(ETy, Out);
+ Out << ' ';
+ WriteAsOperandInternal(Out, CA->getOperand(i), TypePrinter, Machine);
+ }
+ }
+ Out << ']';
+ }
+ return;
+ }
+
+ if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
+ if (CS->getType()->isPacked())
+ Out << '<';
+ Out << '{';
+ unsigned N = CS->getNumOperands();
+ if (N) {
+ Out << ' ';
+ TypePrinter.print(CS->getOperand(0)->getType(), Out);
+ Out << ' ';
+
+ WriteAsOperandInternal(Out, CS->getOperand(0), TypePrinter, Machine);
+
+ for (unsigned i = 1; i < N; i++) {
+ Out << ", ";
+ TypePrinter.print(CS->getOperand(i)->getType(), Out);
+ Out << ' ';
+
+ WriteAsOperandInternal(Out, CS->getOperand(i), TypePrinter, Machine);
+ }
+ Out << ' ';
+ }
+
+ Out << '}';
+ if (CS->getType()->isPacked())
+ Out << '>';
+ return;
+ }
+
+ if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) {
+ const Type *ETy = CP->getType()->getElementType();
+ assert(CP->getNumOperands() > 0 &&
+ "Number of operands for a PackedConst must be > 0");
+ Out << '<';
+ TypePrinter.print(ETy, Out);
+ Out << ' ';
+ WriteAsOperandInternal(Out, CP->getOperand(0), TypePrinter, Machine);
+ for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) {
+ Out << ", ";
+ TypePrinter.print(ETy, Out);
+ Out << ' ';
+ WriteAsOperandInternal(Out, CP->getOperand(i), TypePrinter, Machine);
+ }
+ Out << '>';
+ return;
+ }
+
+ if (isa<ConstantPointerNull>(CV)) {
+ Out << "null";
+ return;
+ }
+
+ if (isa<UndefValue>(CV)) {
+ Out << "undef";
+ return;
+ }
+
+ if (const MDString *S = dyn_cast<MDString>(CV)) {
+ Out << "!\"";
+ PrintEscapedString(S->begin(), S->size(), Out);
+ Out << '"';
+ return;
+ }
+
+ if (const MDNode *N = dyn_cast<MDNode>(CV)) {
+ Out << "!{";
+ for (MDNode::const_elem_iterator I = N->elem_begin(), E = N->elem_end();
+ I != E;) {
+ if (!*I) {
+ Out << "null";
+ } else {
+ TypePrinter.print((*I)->getType(), Out);
+ Out << ' ';
+ WriteAsOperandInternal(Out, *I, TypePrinter, Machine);
+ }
+
+ if (++I != E)
+ Out << ", ";
+ }
+ Out << "}";
+ return;
+ }
+
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+ Out << CE->getOpcodeName();
+ if (CE->isCompare())
+ Out << ' ' << getPredicateText(CE->getPredicate());
+ Out << " (";
+
+ for (User::const_op_iterator OI=CE->op_begin(); OI != CE->op_end(); ++OI) {
+ TypePrinter.print((*OI)->getType(), Out);
+ Out << ' ';
+ WriteAsOperandInternal(Out, *OI, TypePrinter, Machine);
+ if (OI+1 != CE->op_end())
+ Out << ", ";
+ }
+
+ if (CE->hasIndices()) {
+ const SmallVector<unsigned, 4> &Indices = CE->getIndices();
+ for (unsigned i = 0, e = Indices.size(); i != e; ++i)
+ Out << ", " << Indices[i];
+ }
+
+ if (CE->isCast()) {
+ Out << " to ";
+ TypePrinter.print(CE->getType(), Out);
+ }
+
+ Out << ')';
+ return;
+ }
+
+ Out << "<placeholder or erroneous Constant>";
+}
+
+
+/// WriteAsOperand - Write the name of the specified value out to the specified
+/// ostream. This can be useful when you just want to print i32 %reg126, not
+/// the whole instruction that generated it.
+///
+static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
+ TypePrinting &TypePrinter,
+ SlotTracker *Machine) {
+ if (V->hasName()) {
+ PrintLLVMName(Out, V);
+ return;
+ }
+
+ const Constant *CV = dyn_cast<Constant>(V);
+ if (CV && !isa<GlobalValue>(CV)) {
+ WriteConstantInt(Out, CV, TypePrinter, Machine);
+ return;
+ }
+
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+ Out << "asm ";
+ if (IA->hasSideEffects())
+ Out << "sideeffect ";
+ Out << '"';
+ PrintEscapedString(IA->getAsmString(), Out);
+ Out << "\", \"";
+ PrintEscapedString(IA->getConstraintString(), Out);
+ Out << '"';
+ return;
+ }
+
+ char Prefix = '%';
+ int Slot;
+ if (Machine) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ Slot = Machine->getGlobalSlot(GV);
+ Prefix = '@';
+ } else {
+ Slot = Machine->getLocalSlot(V);
+ }
+ } else {
+ Machine = createSlotTracker(V);
+ if (Machine) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ Slot = Machine->getGlobalSlot(GV);
+ Prefix = '@';
+ } else {
+ Slot = Machine->getLocalSlot(V);
+ }
+ } else {
+ Slot = -1;
+ }
+ delete Machine;
+ }
+
+ if (Slot != -1)
+ Out << Prefix << Slot;
+ else
+ Out << "<badref>";
+}
+
+/// WriteAsOperand - Write the name of the specified value out to the specified
+/// ostream. This can be useful when you just want to print i32 %reg126, not
+/// the whole instruction that generated it.
+///
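+/// As a usage sketch: for an i32 argument named "x", calling
+/// WriteAsOperand(errs(), V, /*PrintType=*/true) prints "i32 %x".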
+void llvm::WriteAsOperand(std::ostream &Out, const Value *V, bool PrintType,
+ const Module *Context) {
+ raw_os_ostream OS(Out);
+ WriteAsOperand(OS, V, PrintType, Context);
+}
+
+void llvm::WriteAsOperand(raw_ostream &Out, const Value *V, bool PrintType,
+ const Module *Context) {
+ if (Context == 0) Context = getModuleFromVal(V);
+
+ TypePrinting TypePrinter;
+ std::vector<const Type*> NumberedTypes;
+ AddModuleTypesToPrinter(TypePrinter, NumberedTypes, Context);
+ if (PrintType) {
+ TypePrinter.print(V->getType(), Out);
+ Out << ' ';
+ }
+
+ WriteAsOperandInternal(Out, V, TypePrinter, 0);
+}
+
+
+namespace {
+
+class AssemblyWriter {
+ raw_ostream &Out;
+ SlotTracker &Machine;
+ const Module *TheModule;
+ TypePrinting TypePrinter;
+ AssemblyAnnotationWriter *AnnotationWriter;
+ std::vector<const Type*> NumberedTypes;
+public:
+ inline AssemblyWriter(raw_ostream &o, SlotTracker &Mac, const Module *M,
+ AssemblyAnnotationWriter *AAW)
+ : Out(o), Machine(Mac), TheModule(M), AnnotationWriter(AAW) {
+ AddModuleTypesToPrinter(TypePrinter, NumberedTypes, M);
+ }
+
+ void write(const Module *M) { printModule(M); }
+
+ void write(const GlobalValue *G) {
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(G))
+ printGlobal(GV);
+ else if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(G))
+ printAlias(GA);
+ else if (const Function *F = dyn_cast<Function>(G))
+ printFunction(F);
+ else
+ assert(0 && "Unknown global");
+ }
+
+ void write(const BasicBlock *BB) { printBasicBlock(BB); }
+ void write(const Instruction *I) { printInstruction(*I); }
+
+ void writeOperand(const Value *Op, bool PrintType);
+ void writeParamOperand(const Value *Operand, Attributes Attrs);
+
+ const Module* getModule() { return TheModule; }
+
+private:
+ void printModule(const Module *M);
+ void printTypeSymbolTable(const TypeSymbolTable &ST);
+ void printGlobal(const GlobalVariable *GV);
+ void printAlias(const GlobalAlias *GV);
+ void printFunction(const Function *F);
+ void printArgument(const Argument *FA, Attributes Attrs);
+ void printBasicBlock(const BasicBlock *BB);
+ void printInstruction(const Instruction &I);
+
+ // printInfoComment - Print a little comment after the instruction indicating
+ // which slot it occupies.
+ void printInfoComment(const Value &V);
+};
+} // end of anonymous namespace
+
+
+void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
+ if (Operand == 0) {
+ Out << "<null operand!>";
+ } else {
+ if (PrintType) {
+ TypePrinter.print(Operand->getType(), Out);
+ Out << ' ';
+ }
+ WriteAsOperandInternal(Out, Operand, TypePrinter, &Machine);
+ }
+}
+
+void AssemblyWriter::writeParamOperand(const Value *Operand,
+ Attributes Attrs) {
+ if (Operand == 0) {
+ Out << "<null operand!>";
+ } else {
+ // Print the type
+ TypePrinter.print(Operand->getType(), Out);
+ // Print parameter attributes list
+ if (Attrs != Attribute::None)
+ Out << ' ' << Attribute::getAsString(Attrs);
+ Out << ' ';
+ // Print the operand
+ WriteAsOperandInternal(Out, Operand, TypePrinter, &Machine);
+ }
+}
+
+void AssemblyWriter::printModule(const Module *M) {
+ if (!M->getModuleIdentifier().empty() &&
+ // Don't print the ID if it will start a new line (which would
+ // require a comment char before it).
+ M->getModuleIdentifier().find('\n') == std::string::npos)
+ Out << "; ModuleID = '" << M->getModuleIdentifier() << "'\n";
+
+ if (!M->getDataLayout().empty())
+ Out << "target datalayout = \"" << M->getDataLayout() << "\"\n";
+ if (!M->getTargetTriple().empty())
+ Out << "target triple = \"" << M->getTargetTriple() << "\"\n";
+
+ if (!M->getModuleInlineAsm().empty()) {
+ // Split the string into lines, to make it easier to read the .ll file.
+ std::string Asm = M->getModuleInlineAsm();
+ size_t CurPos = 0;
+ size_t NewLine = Asm.find_first_of('\n', CurPos);
+ while (NewLine != std::string::npos) {
+ // We found a newline, print the portion of the asm string from the
+ // last newline up to this newline.
+ Out << "module asm \"";
+ PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine),
+ Out);
+ Out << "\"\n";
+ CurPos = NewLine+1;
+ NewLine = Asm.find_first_of('\n', CurPos);
+ }
+ Out << "module asm \"";
+ PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out);
+ Out << "\"\n";
+ }
+
+ // Loop over the dependent libraries and emit them.
+ Module::lib_iterator LI = M->lib_begin();
+ Module::lib_iterator LE = M->lib_end();
+ if (LI != LE) {
+ Out << "deplibs = [ ";
+ while (LI != LE) {
+ Out << '"' << *LI << '"';
+ ++LI;
+ if (LI != LE)
+ Out << ", ";
+ }
+ Out << " ]\n";
+ }
+
+ // Loop over the symbol table, emitting all id'd types.
+ printTypeSymbolTable(M->getTypeSymbolTable());
+
+ for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ I != E; ++I)
+ printGlobal(I);
+
+ // Output all aliases.
+ if (!M->alias_empty()) Out << "\n";
+ for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E; ++I)
+ printAlias(I);
+
+ // Output all of the functions.
+ for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
+ printFunction(I);
+}
+
+static void PrintLinkage(GlobalValue::LinkageTypes LT, raw_ostream &Out) {
+ switch (LT) {
+ case GlobalValue::PrivateLinkage: Out << "private "; break;
+ case GlobalValue::InternalLinkage: Out << "internal "; break;
+ case GlobalValue::AvailableExternallyLinkage:
+ Out << "available_externally ";
+ break;
+ case GlobalValue::LinkOnceAnyLinkage: Out << "linkonce "; break;
+ case GlobalValue::LinkOnceODRLinkage: Out << "linkonce_odr "; break;
+ case GlobalValue::WeakAnyLinkage: Out << "weak "; break;
+ case GlobalValue::WeakODRLinkage: Out << "weak_odr "; break;
+ case GlobalValue::CommonLinkage: Out << "common "; break;
+ case GlobalValue::AppendingLinkage: Out << "appending "; break;
+ case GlobalValue::DLLImportLinkage: Out << "dllimport "; break;
+ case GlobalValue::DLLExportLinkage: Out << "dllexport "; break;
+ case GlobalValue::ExternalWeakLinkage: Out << "extern_weak "; break;
+ case GlobalValue::ExternalLinkage: break;
+ case GlobalValue::GhostLinkage:
+ Out << "GhostLinkage not allowed in AsmWriter!\n";
+ abort();
+ }
+}
+
+
+static void PrintVisibility(GlobalValue::VisibilityTypes Vis,
+ raw_ostream &Out) {
+ switch (Vis) {
+ default: assert(0 && "Invalid visibility style!");
+ case GlobalValue::DefaultVisibility: break;
+ case GlobalValue::HiddenVisibility: Out << "hidden "; break;
+ case GlobalValue::ProtectedVisibility: Out << "protected "; break;
+ }
+}
+
+void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
+ if (GV->hasName()) {
+ PrintLLVMName(Out, GV);
+ Out << " = ";
+ }
+
+ if (!GV->hasInitializer() && GV->hasExternalLinkage())
+ Out << "external ";
+
+ PrintLinkage(GV->getLinkage(), Out);
+ PrintVisibility(GV->getVisibility(), Out);
+
+ if (GV->isThreadLocal()) Out << "thread_local ";
+ if (unsigned AddressSpace = GV->getType()->getAddressSpace())
+ Out << "addrspace(" << AddressSpace << ") ";
+ Out << (GV->isConstant() ? "constant " : "global ");
+ TypePrinter.print(GV->getType()->getElementType(), Out);
+
+ if (GV->hasInitializer()) {
+ Out << ' ';
+ writeOperand(GV->getInitializer(), false);
+ }
+
+ if (GV->hasSection())
+ Out << ", section \"" << GV->getSection() << '"';
+ if (GV->getAlignment())
+ Out << ", align " << GV->getAlignment();
+
+ printInfoComment(*GV);
+ Out << '\n';
+}
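+// As an illustrative example (hypothetical global), the output looks like:
+//   @g = internal constant i32 42, align 4          ; <i32*> [#uses=1]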
+
+void AssemblyWriter::printAlias(const GlobalAlias *GA) {
+ // Don't crash when dumping partially built GA
+ if (!GA->hasName())
+ Out << "<<nameless>> = ";
+ else {
+ PrintLLVMName(Out, GA);
+ Out << " = ";
+ }
+ PrintVisibility(GA->getVisibility(), Out);
+
+ Out << "alias ";
+
+ PrintLinkage(GA->getLinkage(), Out);
+
+ const Constant *Aliasee = GA->getAliasee();
+
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Aliasee)) {
+ TypePrinter.print(GV->getType(), Out);
+ Out << ' ';
+ PrintLLVMName(Out, GV);
+ } else if (const Function *F = dyn_cast<Function>(Aliasee)) {
+ TypePrinter.print(F->getFunctionType(), Out);
+ Out << "* ";
+
+ WriteAsOperandInternal(Out, F, TypePrinter, &Machine);
+ } else if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(Aliasee)) {
+ TypePrinter.print(GA->getType(), Out);
+ Out << ' ';
+ PrintLLVMName(Out, GA);
+ } else {
+ const ConstantExpr *CE = cast<ConstantExpr>(Aliasee);
+ // The only valid GEP is an all zero GEP.
+ assert((CE->getOpcode() == Instruction::BitCast ||
+ CE->getOpcode() == Instruction::GetElementPtr) &&
+ "Unsupported aliasee");
+ writeOperand(CE, false);
+ }
+
+ printInfoComment(*GA);
+ Out << '\n';
+}
+
+void AssemblyWriter::printTypeSymbolTable(const TypeSymbolTable &ST) {
+ // Emit all numbered types.
+ for (unsigned i = 0, e = NumberedTypes.size(); i != e; ++i) {
+ Out << "\ttype ";
+
+ // Make sure we print out at least one level of the type structure, so
+ // that we do not get %2 = type %2
+ TypePrinter.printAtLeastOneLevel(NumberedTypes[i], Out);
+ Out << "\t\t; type %" << i << '\n';
+ }
+
+ // Print the named types.
+ for (TypeSymbolTable::const_iterator TI = ST.begin(), TE = ST.end();
+ TI != TE; ++TI) {
+ Out << '\t';
+ PrintLLVMName(Out, &TI->first[0], TI->first.size(), LocalPrefix);
+ Out << " = type ";
+
+ // Make sure we print out at least one level of the type structure, so
+ // that we do not get %FILE = type %FILE
+ TypePrinter.printAtLeastOneLevel(TI->second, Out);
+ Out << '\n';
+ }
+}
+
+/// printFunction - Print all aspects of a function.
+///
+void AssemblyWriter::printFunction(const Function *F) {
+ // Print out the return type and name.
+ Out << '\n';
+
+ if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out);
+
+ if (F->isDeclaration())
+ Out << "declare ";
+ else
+ Out << "define ";
+
+ PrintLinkage(F->getLinkage(), Out);
+ PrintVisibility(F->getVisibility(), Out);
+
+ // Print the calling convention.
+ switch (F->getCallingConv()) {
+ case CallingConv::C: break; // default
+ case CallingConv::Fast: Out << "fastcc "; break;
+ case CallingConv::Cold: Out << "coldcc "; break;
+ case CallingConv::X86_StdCall: Out << "x86_stdcallcc "; break;
+ case CallingConv::X86_FastCall: Out << "x86_fastcallcc "; break;
+ default: Out << "cc" << F->getCallingConv() << " "; break;
+ }
+
+ const FunctionType *FT = F->getFunctionType();
+ const AttrListPtr &Attrs = F->getAttributes();
+ Attributes RetAttrs = Attrs.getRetAttributes();
+ if (RetAttrs != Attribute::None)
+ Out << Attribute::getAsString(Attrs.getRetAttributes()) << ' ';
+ TypePrinter.print(F->getReturnType(), Out);
+ Out << ' ';
+ WriteAsOperandInternal(Out, F, TypePrinter, &Machine);
+ Out << '(';
+ Machine.incorporateFunction(F);
+
+ // Loop over the arguments, printing them...
+
+ unsigned Idx = 1;
+ if (!F->isDeclaration()) {
+ // If this isn't a declaration, print the argument names as well.
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I) {
+ // Insert commas as we go... the first arg doesn't get a comma
+ if (I != F->arg_begin()) Out << ", ";
+ printArgument(I, Attrs.getParamAttributes(Idx));
+ Idx++;
+ }
+ } else {
+ // Otherwise, print the types from the function type.
+ for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+ // Insert commas as we go... the first arg doesn't get a comma
+ if (i) Out << ", ";
+
+ // Output type...
+ TypePrinter.print(FT->getParamType(i), Out);
+
+ Attributes ArgAttrs = Attrs.getParamAttributes(i+1);
+ if (ArgAttrs != Attribute::None)
+ Out << ' ' << Attribute::getAsString(ArgAttrs);
+ }
+ }
+
+ // Finish printing arguments...
+ if (FT->isVarArg()) {
+ if (FT->getNumParams()) Out << ", ";
+ Out << "..."; // Output varargs portion of signature!
+ }
+ Out << ')';
+ Attributes FnAttrs = Attrs.getFnAttributes();
+ if (FnAttrs != Attribute::None)
+ Out << ' ' << Attribute::getAsString(Attrs.getFnAttributes());
+ if (F->hasSection())
+ Out << " section \"" << F->getSection() << '"';
+ if (F->getAlignment())
+ Out << " align " << F->getAlignment();
+ if (F->hasGC())
+ Out << " gc \"" << F->getGC() << '"';
+ if (F->isDeclaration()) {
+ Out << "\n";
+ } else {
+ Out << " {";
+
+ // Output all of its basic blocks... for the function
+ for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I)
+ printBasicBlock(I);
+
+ Out << "}\n";
+ }
+
+ Machine.purgeFunction();
+}
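+// As an illustrative example (hypothetical function), the output looks like:
+//   define internal fastcc i32 @f(i32 %x) nounwind {
+//     ...
+//   }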
+
+/// printArgument - This member is called for every argument that is passed into
+/// the function. Simply print it out.
+///
+void AssemblyWriter::printArgument(const Argument *Arg,
+ Attributes Attrs) {
+ // Output type...
+ TypePrinter.print(Arg->getType(), Out);
+
+ // Output parameter attributes list
+ if (Attrs != Attribute::None)
+ Out << ' ' << Attribute::getAsString(Attrs);
+
+ // Output name, if available...
+ if (Arg->hasName()) {
+ Out << ' ';
+ PrintLLVMName(Out, Arg);
+ }
+}
+
+/// printBasicBlock - This member is called for each basic block in a method.
+///
+void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
+ if (BB->hasName()) { // Print out the label if it exists...
+ Out << "\n";
+ PrintLLVMName(Out, BB->getNameStart(), BB->getNameLen(), LabelPrefix);
+ Out << ':';
+ } else if (!BB->use_empty()) { // Don't print block # of no uses...
+ Out << "\n; <label>:";
+ int Slot = Machine.getLocalSlot(BB);
+ if (Slot != -1)
+ Out << Slot;
+ else
+ Out << "<badref>";
+ }
+
+ if (BB->getParent() == 0)
+ Out << "\t\t; Error: Block without parent!";
+ else if (BB != &BB->getParent()->getEntryBlock()) { // Not the entry block?
+ // Output predecessors for the block...
+ Out << "\t\t;";
+ pred_const_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+ if (PI == PE) {
+ Out << " No predecessors!";
+ } else {
+ Out << " preds = ";
+ writeOperand(*PI, false);
+ for (++PI; PI != PE; ++PI) {
+ Out << ", ";
+ writeOperand(*PI, false);
+ }
+ }
+ }
+
+ Out << "\n";
+
+ if (AnnotationWriter) AnnotationWriter->emitBasicBlockStartAnnot(BB, Out);
+
+ // Output all of the instructions in the basic block...
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ printInstruction(*I);
+
+ if (AnnotationWriter) AnnotationWriter->emitBasicBlockEndAnnot(BB, Out);
+}
+
+
+/// printInfoComment - Print a little comment after the instruction indicating
+/// which slot it occupies.
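+/// A typical result (illustrative) is: "; <i32> [#uses=3]".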
+///
+void AssemblyWriter::printInfoComment(const Value &V) {
+ if (V.getType() != Type::VoidTy) {
+ Out << "\t\t; <";
+ TypePrinter.print(V.getType(), Out);
+ Out << '>';
+
+ if (!V.hasName() && !isa<Instruction>(V)) {
+ int SlotNum;
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(&V))
+ SlotNum = Machine.getGlobalSlot(GV);
+ else
+ SlotNum = Machine.getLocalSlot(&V);
+ if (SlotNum == -1)
+ Out << ":<badref>";
+ else
+ Out << ':' << SlotNum; // Print out the def slot taken.
+ }
+ Out << " [#uses=" << V.getNumUses() << ']'; // Output # uses
+ }
+}
+
+// This member is called for each Instruction in a function.
+void AssemblyWriter::printInstruction(const Instruction &I) {
+ if (AnnotationWriter) AnnotationWriter->emitInstructionAnnot(&I, Out);
+
+ Out << '\t';
+
+ // Print out name if it exists...
+ if (I.hasName()) {
+ PrintLLVMName(Out, &I);
+ Out << " = ";
+ } else if (I.getType() != Type::VoidTy) {
+ // Print out the def slot taken.
+ int SlotNum = Machine.getLocalSlot(&I);
+ if (SlotNum == -1)
+ Out << "<badref> = ";
+ else
+ Out << '%' << SlotNum << " = ";
+ }
+
+ // If this is a volatile load or store, print out the volatile marker.
+ if ((isa<LoadInst>(I) && cast<LoadInst>(I).isVolatile()) ||
+ (isa<StoreInst>(I) && cast<StoreInst>(I).isVolatile())) {
+ Out << "volatile ";
+ } else if (isa<CallInst>(I) && cast<CallInst>(I).isTailCall()) {
+ // If this is a call, check if it's a tail call.
+ Out << "tail ";
+ }
+
+ // Print out the opcode...
+ Out << I.getOpcodeName();
+
+ // Print out the compare instruction predicates
+ if (const CmpInst *CI = dyn_cast<CmpInst>(&I))
+ Out << ' ' << getPredicateText(CI->getPredicate());
+
+ // Print out the type of the operands...
+ const Value *Operand = I.getNumOperands() ? I.getOperand(0) : 0;
+
+ // Special case conditional branches to swizzle the condition out to the front
+ if (isa<BranchInst>(I) && cast<BranchInst>(I).isConditional()) {
+ BranchInst &BI(cast<BranchInst>(I));
+ Out << ' ';
+ writeOperand(BI.getCondition(), true);
+ Out << ", ";
+ writeOperand(BI.getSuccessor(0), true);
+ Out << ", ";
+ writeOperand(BI.getSuccessor(1), true);
+
+ } else if (isa<SwitchInst>(I)) {
+ // Special case switch statement to get formatting nice and correct...
+ Out << ' ';
+ writeOperand(Operand , true);
+ Out << ", ";
+ writeOperand(I.getOperand(1), true);
+ Out << " [";
+
+ for (unsigned op = 2, Eop = I.getNumOperands(); op < Eop; op += 2) {
+ Out << "\n\t\t";
+ writeOperand(I.getOperand(op ), true);
+ Out << ", ";
+ writeOperand(I.getOperand(op+1), true);
+ }
+ Out << "\n\t]";
+ } else if (isa<PHINode>(I)) {
+ Out << ' ';
+ TypePrinter.print(I.getType(), Out);
+ Out << ' ';
+
+ for (unsigned op = 0, Eop = I.getNumOperands(); op < Eop; op += 2) {
+ if (op) Out << ", ";
+ Out << "[ ";
+ writeOperand(I.getOperand(op ), false); Out << ", ";
+ writeOperand(I.getOperand(op+1), false); Out << " ]";
+ }
+ } else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(&I)) {
+ Out << ' ';
+ writeOperand(I.getOperand(0), true);
+ for (const unsigned *i = EVI->idx_begin(), *e = EVI->idx_end(); i != e; ++i)
+ Out << ", " << *i;
+ } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(&I)) {
+ Out << ' ';
+ writeOperand(I.getOperand(0), true); Out << ", ";
+ writeOperand(I.getOperand(1), true);
+ for (const unsigned *i = IVI->idx_begin(), *e = IVI->idx_end(); i != e; ++i)
+ Out << ", " << *i;
+ } else if (isa<ReturnInst>(I) && !Operand) {
+ Out << " void";
+ } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // Print the calling convention being used.
+ switch (CI->getCallingConv()) {
+ case CallingConv::C: break; // default
+ case CallingConv::Fast: Out << " fastcc"; break;
+ case CallingConv::Cold: Out << " coldcc"; break;
+ case CallingConv::X86_StdCall: Out << " x86_stdcallcc"; break;
+ case CallingConv::X86_FastCall: Out << " x86_fastcallcc"; break;
+ default: Out << " cc" << CI->getCallingConv(); break;
+ }
+
+ const PointerType *PTy = cast<PointerType>(Operand->getType());
+ const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ const Type *RetTy = FTy->getReturnType();
+ const AttrListPtr &PAL = CI->getAttributes();
+
+ if (PAL.getRetAttributes() != Attribute::None)
+ Out << ' ' << Attribute::getAsString(PAL.getRetAttributes());
+
+ // If possible, print out the short form of the call instruction. We can
+ // only do this if the first argument is a pointer to a nonvararg function,
+ // and if the return type is not a pointer to a function.
+ //
+ Out << ' ';
+ if (!FTy->isVarArg() &&
+ (!isa<PointerType>(RetTy) ||
+ !isa<FunctionType>(cast<PointerType>(RetTy)->getElementType()))) {
+ TypePrinter.print(RetTy, Out);
+ Out << ' ';
+ writeOperand(Operand, false);
+ } else {
+ writeOperand(Operand, true);
+ }
+ Out << '(';
+ for (unsigned op = 1, Eop = I.getNumOperands(); op < Eop; ++op) {
+ if (op > 1)
+ Out << ", ";
+ writeParamOperand(I.getOperand(op), PAL.getParamAttributes(op));
+ }
+ Out << ')';
+ if (PAL.getFnAttributes() != Attribute::None)
+ Out << ' ' << Attribute::getAsString(PAL.getFnAttributes());
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+ const PointerType *PTy = cast<PointerType>(Operand->getType());
+ const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ const Type *RetTy = FTy->getReturnType();
+ const AttrListPtr &PAL = II->getAttributes();
+
+ // Print the calling convention being used.
+ switch (II->getCallingConv()) {
+ case CallingConv::C: break; // default
+ case CallingConv::Fast: Out << " fastcc"; break;
+ case CallingConv::Cold: Out << " coldcc"; break;
+ case CallingConv::X86_StdCall: Out << " x86_stdcallcc"; break;
+ case CallingConv::X86_FastCall: Out << " x86_fastcallcc"; break;
+ default: Out << " cc" << II->getCallingConv(); break;
+ }
+
+ if (PAL.getRetAttributes() != Attribute::None)
+ Out << ' ' << Attribute::getAsString(PAL.getRetAttributes());
+
+ // If possible, print out the short form of the invoke instruction. We can
+ // only do this if the first argument is a pointer to a nonvararg function,
+ // and if the return type is not a pointer to a function.
+ //
+ Out << ' ';
+ if (!FTy->isVarArg() &&
+ (!isa<PointerType>(RetTy) ||
+ !isa<FunctionType>(cast<PointerType>(RetTy)->getElementType()))) {
+ TypePrinter.print(RetTy, Out);
+ Out << ' ';
+ writeOperand(Operand, false);
+ } else {
+ writeOperand(Operand, true);
+ }
+ Out << '(';
+ for (unsigned op = 3, Eop = I.getNumOperands(); op < Eop; ++op) {
+ if (op > 3)
+ Out << ", ";
+ writeParamOperand(I.getOperand(op), PAL.getParamAttributes(op-2));
+ }
+
+ Out << ')';
+ if (PAL.getFnAttributes() != Attribute::None)
+ Out << ' ' << Attribute::getAsString(PAL.getFnAttributes());
+
+ Out << "\n\t\t\tto ";
+ writeOperand(II->getNormalDest(), true);
+ Out << " unwind ";
+ writeOperand(II->getUnwindDest(), true);
+
+ } else if (const AllocationInst *AI = dyn_cast<AllocationInst>(&I)) {
+ Out << ' ';
+ TypePrinter.print(AI->getType()->getElementType(), Out);
+ if (AI->isArrayAllocation()) {
+ Out << ", ";
+ writeOperand(AI->getArraySize(), true);
+ }
+ if (AI->getAlignment()) {
+ Out << ", align " << AI->getAlignment();
+ }
+ } else if (isa<CastInst>(I)) {
+ if (Operand) {
+ Out << ' ';
+ writeOperand(Operand, true); // Work with broken code
+ }
+ Out << " to ";
+ TypePrinter.print(I.getType(), Out);
+ } else if (isa<VAArgInst>(I)) {
+ if (Operand) {
+ Out << ' ';
+ writeOperand(Operand, true); // Work with broken code
+ }
+ Out << ", ";
+ TypePrinter.print(I.getType(), Out);
+ } else if (Operand) { // Print the normal way.
+
+    // PrintAllTypes - Instructions whose operands all have the same type
+    // omit the type from all but the first operand.  If the instruction has
+    // operands of differing types (for example br), then they are all printed.
+ bool PrintAllTypes = false;
+ const Type *TheType = Operand->getType();
+
+ // Select, Store and ShuffleVector always print all types.
+ if (isa<SelectInst>(I) || isa<StoreInst>(I) || isa<ShuffleVectorInst>(I)
+ || isa<ReturnInst>(I)) {
+ PrintAllTypes = true;
+ } else {
+ for (unsigned i = 1, E = I.getNumOperands(); i != E; ++i) {
+ Operand = I.getOperand(i);
+        // Note that Operand shouldn't be null, but the test helps make dump()
+        // more tolerant of malformed IR.
+ if (Operand && Operand->getType() != TheType) {
+ PrintAllTypes = true; // We have differing types! Print them all!
+ break;
+ }
+ }
+ }
+
+ if (!PrintAllTypes) {
+ Out << ' ';
+ TypePrinter.print(TheType, Out);
+ }
+
+ Out << ' ';
+ for (unsigned i = 0, E = I.getNumOperands(); i != E; ++i) {
+ if (i) Out << ", ";
+ writeOperand(I.getOperand(i), PrintAllTypes);
+ }
+ }
+
+ // Print post operand alignment for load/store
+ if (isa<LoadInst>(I) && cast<LoadInst>(I).getAlignment()) {
+ Out << ", align " << cast<LoadInst>(I).getAlignment();
+ } else if (isa<StoreInst>(I) && cast<StoreInst>(I).getAlignment()) {
+ Out << ", align " << cast<StoreInst>(I).getAlignment();
+ }
+
+ printInfoComment(I);
+ Out << '\n';
+}
+
+
+//===----------------------------------------------------------------------===//
+// External Interface declarations
+//===----------------------------------------------------------------------===//
+
+void Module::print(std::ostream &o, AssemblyAnnotationWriter *AAW) const {
+ raw_os_ostream OS(o);
+ print(OS, AAW);
+}
+void Module::print(raw_ostream &OS, AssemblyAnnotationWriter *AAW) const {
+ SlotTracker SlotTable(this);
+ AssemblyWriter W(OS, SlotTable, this, AAW);
+ W.write(this);
+}
+
+void Type::print(std::ostream &o) const {
+ raw_os_ostream OS(o);
+ print(OS);
+}
+
+void Type::print(raw_ostream &OS) const {
+ if (this == 0) {
+ OS << "<null Type>";
+ return;
+ }
+ TypePrinting().print(this, OS);
+}
+
+void Value::print(raw_ostream &OS, AssemblyAnnotationWriter *AAW) const {
+ if (this == 0) {
+ OS << "printing a <null> value\n";
+ return;
+ }
+
+ if (const Instruction *I = dyn_cast<Instruction>(this)) {
+ const Function *F = I->getParent() ? I->getParent()->getParent() : 0;
+ SlotTracker SlotTable(F);
+ AssemblyWriter W(OS, SlotTable, F ? F->getParent() : 0, AAW);
+ W.write(I);
+ } else if (const BasicBlock *BB = dyn_cast<BasicBlock>(this)) {
+ SlotTracker SlotTable(BB->getParent());
+ AssemblyWriter W(OS, SlotTable,
+ BB->getParent() ? BB->getParent()->getParent() : 0, AAW);
+ W.write(BB);
+ } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(this)) {
+ SlotTracker SlotTable(GV->getParent());
+ AssemblyWriter W(OS, SlotTable, GV->getParent(), AAW);
+ W.write(GV);
+ } else if (const Constant *C = dyn_cast<Constant>(this)) {
+ TypePrinting TypePrinter;
+ TypePrinter.print(C->getType(), OS);
+ OS << ' ';
+ WriteConstantInt(OS, C, TypePrinter, 0);
+ } else if (const Argument *A = dyn_cast<Argument>(this)) {
+ WriteAsOperand(OS, this, true,
+ A->getParent() ? A->getParent()->getParent() : 0);
+ } else if (isa<InlineAsm>(this)) {
+ WriteAsOperand(OS, this, true, 0);
+ } else {
+ assert(0 && "Unknown value to print out!");
+ }
+}
+
+void Value::print(std::ostream &O, AssemblyAnnotationWriter *AAW) const {
+ raw_os_ostream OS(O);
+ print(OS, AAW);
+}
+
+// Value::dump - allow easy printing of Values from the debugger.
+void Value::dump() const { print(errs()); errs() << '\n'; }
+
+// Type::dump - allow easy printing of Types from the debugger.
+// This one uses type names from the given context module
+void Type::dump(const Module *Context) const {
+ WriteTypeSymbolic(errs(), this, Context);
+ errs() << '\n';
+}
+
+// Type::dump - allow easy printing of Types from the debugger.
+void Type::dump() const { dump(0); }
+
+// Module::dump() - Allow printing of Modules from the debugger.
+void Module::dump() const { print(errs(), 0); }
diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp
new file mode 100644
index 0000000..5a8fad9
--- /dev/null
+++ b/lib/VMCore/Attributes.cpp
@@ -0,0 +1,310 @@
+//===-- Attributes.cpp - Implement AttributesList -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AttributesList class and Attribute utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Attributes.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/ManagedStatic.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Attribute Function Definitions
+//===----------------------------------------------------------------------===//
+
+std::string Attribute::getAsString(Attributes Attrs) {
+ std::string Result;
+ if (Attrs & Attribute::ZExt)
+ Result += "zeroext ";
+ if (Attrs & Attribute::SExt)
+ Result += "signext ";
+ if (Attrs & Attribute::NoReturn)
+ Result += "noreturn ";
+ if (Attrs & Attribute::NoUnwind)
+ Result += "nounwind ";
+ if (Attrs & Attribute::InReg)
+ Result += "inreg ";
+ if (Attrs & Attribute::NoAlias)
+ Result += "noalias ";
+ if (Attrs & Attribute::NoCapture)
+ Result += "nocapture ";
+ if (Attrs & Attribute::StructRet)
+ Result += "sret ";
+ if (Attrs & Attribute::ByVal)
+ Result += "byval ";
+ if (Attrs & Attribute::Nest)
+ Result += "nest ";
+ if (Attrs & Attribute::ReadNone)
+ Result += "readnone ";
+ if (Attrs & Attribute::ReadOnly)
+ Result += "readonly ";
+ if (Attrs & Attribute::OptimizeForSize)
+ Result += "optsize ";
+ if (Attrs & Attribute::NoInline)
+ Result += "noinline ";
+ if (Attrs & Attribute::AlwaysInline)
+ Result += "alwaysinline ";
+ if (Attrs & Attribute::StackProtect)
+ Result += "ssp ";
+ if (Attrs & Attribute::StackProtectReq)
+ Result += "sspreq ";
+ if (Attrs & Attribute::Alignment) {
+ Result += "align ";
+ Result += utostr(Attribute::getAlignmentFromAttrs(Attrs));
+ Result += " ";
+ }
+ // Trim the trailing space.
+ assert(!Result.empty() && "Unknown attribute!");
+ Result.erase(Result.end()-1);
+ return Result;
+}
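+// For example (illustrative), getAsString(Attribute::NoAlias | Attribute::ByVal)
+// returns "noalias byval".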
+
+Attributes Attribute::typeIncompatible(const Type *Ty) {
+ Attributes Incompatible = None;
+
+ if (!Ty->isInteger())
+ // Attributes that only apply to integers.
+ Incompatible |= SExt | ZExt;
+
+ if (!isa<PointerType>(Ty))
+ // Attributes that only apply to pointers.
+ Incompatible |= ByVal | Nest | NoAlias | StructRet | NoCapture;
+
+ return Incompatible;
+}
+
+//===----------------------------------------------------------------------===//
+// AttributeListImpl Definition
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class AttributeListImpl : public FoldingSetNode {
+ unsigned RefCount;
+
+  // AttributesList is uniqued; these should not be publicly available.
+ void operator=(const AttributeListImpl &); // Do not implement
+ AttributeListImpl(const AttributeListImpl &); // Do not implement
+ ~AttributeListImpl(); // Private implementation
+public:
+ SmallVector<AttributeWithIndex, 4> Attrs;
+
+ AttributeListImpl(const AttributeWithIndex *Attr, unsigned NumAttrs)
+ : Attrs(Attr, Attr+NumAttrs) {
+ RefCount = 0;
+ }
+
+ void AddRef() { ++RefCount; }
+ void DropRef() { if (--RefCount == 0) delete this; }
+
+ void Profile(FoldingSetNodeID &ID) const {
+ Profile(ID, Attrs.data(), Attrs.size());
+ }
+ static void Profile(FoldingSetNodeID &ID, const AttributeWithIndex *Attr,
+ unsigned NumAttrs) {
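+    // Hash each (Attrs, Index) pair as a single 64-bit integer: the attribute
+    // bits go in the high 32 bits, the slot index in the low 32 bits.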
+ for (unsigned i = 0; i != NumAttrs; ++i)
+ ID.AddInteger(uint64_t(Attr[i].Attrs) << 32 | unsigned(Attr[i].Index));
+ }
+};
+}
+
+static ManagedStatic<FoldingSet<AttributeListImpl> > AttributesLists;
+
+AttributeListImpl::~AttributeListImpl() {
+ AttributesLists->RemoveNode(this);
+}
+
+
+AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs) {
+ // If there are no attributes then return a null AttributesList pointer.
+ if (NumAttrs == 0)
+ return AttrListPtr();
+
+#ifndef NDEBUG
+ for (unsigned i = 0; i != NumAttrs; ++i) {
+ assert(Attrs[i].Attrs != Attribute::None &&
+ "Pointless attribute!");
+ assert((!i || Attrs[i-1].Index < Attrs[i].Index) &&
+ "Misordered AttributesList!");
+ }
+#endif
+
+ // Otherwise, build a key to look up the existing attributes.
+ FoldingSetNodeID ID;
+ AttributeListImpl::Profile(ID, Attrs, NumAttrs);
+ void *InsertPos;
+ AttributeListImpl *PAL =
+ AttributesLists->FindNodeOrInsertPos(ID, InsertPos);
+
+ // If we didn't find any existing attributes of the same shape then
+ // create a new one and insert it.
+ if (!PAL) {
+ PAL = new AttributeListImpl(Attrs, NumAttrs);
+ AttributesLists->InsertNode(PAL, InsertPos);
+ }
+
+ // Return the AttributesList that we found or created.
+ return AttrListPtr(PAL);
+}
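+// Example usage (an illustrative sketch; 'AWI' and the chosen attributes are
+// hypothetical).  Entries must be in increasing index order, with ~0U denoting
+// the function itself:
+//   AttributeWithIndex AWI[2];
+//   AWI[0] = AttributeWithIndex::get(1, Attribute::ByVal);      // param #1
+//   AWI[1] = AttributeWithIndex::get(~0U, Attribute::NoUnwind); // function
+//   AttrListPtr PAL = AttrListPtr::get(AWI, 2);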
+
+
+//===----------------------------------------------------------------------===//
+// AttrListPtr Method Implementations
+//===----------------------------------------------------------------------===//
+
+AttrListPtr::AttrListPtr(AttributeListImpl *LI) : AttrList(LI) {
+ if (LI) LI->AddRef();
+}
+
+AttrListPtr::AttrListPtr(const AttrListPtr &P) : AttrList(P.AttrList) {
+ if (AttrList) AttrList->AddRef();
+}
+
+const AttrListPtr &AttrListPtr::operator=(const AttrListPtr &RHS) {
+ if (AttrList == RHS.AttrList) return *this;
+ if (AttrList) AttrList->DropRef();
+ AttrList = RHS.AttrList;
+ if (AttrList) AttrList->AddRef();
+ return *this;
+}
+
+AttrListPtr::~AttrListPtr() {
+ if (AttrList) AttrList->DropRef();
+}
+
+/// getNumSlots - Return the number of slots used in this attribute list.
+/// This is the number of arguments that have an attribute set on them
+/// (including the function itself).
+unsigned AttrListPtr::getNumSlots() const {
+ return AttrList ? AttrList->Attrs.size() : 0;
+}
+
+/// getSlot - Return the AttributeWithIndex at the specified slot. This
+/// holds a number plus a set of attributes.
+const AttributeWithIndex &AttrListPtr::getSlot(unsigned Slot) const {
+ assert(AttrList && Slot < AttrList->Attrs.size() && "Slot # out of range!");
+ return AttrList->Attrs[Slot];
+}
+
+
+/// getAttributes - The attributes for the specified index are
+/// returned. Attributes for the result are denoted with Idx = 0.
+/// Function attributes are denoted with Idx = ~0.
+Attributes AttrListPtr::getAttributes(unsigned Idx) const {
+ if (AttrList == 0) return Attribute::None;
+
+ const SmallVector<AttributeWithIndex, 4> &Attrs = AttrList->Attrs;
+ for (unsigned i = 0, e = Attrs.size(); i != e && Attrs[i].Index <= Idx; ++i)
+ if (Attrs[i].Index == Idx)
+ return Attrs[i].Attrs;
+ return Attribute::None;
+}
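+// For example (illustrative), getAttributes(0) yields the attributes of the
+// return value and getAttributes(1) those of the first parameter.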
+
+/// hasAttrSomewhere - Return true if the specified attribute is set for at
+/// least one parameter or for the return value.
+bool AttrListPtr::hasAttrSomewhere(Attributes Attr) const {
+ if (AttrList == 0) return false;
+
+ const SmallVector<AttributeWithIndex, 4> &Attrs = AttrList->Attrs;
+ for (unsigned i = 0, e = Attrs.size(); i != e; ++i)
+ if (Attrs[i].Attrs & Attr)
+ return true;
+ return false;
+}
+
+
+AttrListPtr AttrListPtr::addAttr(unsigned Idx, Attributes Attrs) const {
+ Attributes OldAttrs = getAttributes(Idx);
+#ifndef NDEBUG
+ // FIXME it is not obvious how this should work for alignment.
+ // For now, say we can't change a known alignment.
+ Attributes OldAlign = OldAttrs & Attribute::Alignment;
+ Attributes NewAlign = Attrs & Attribute::Alignment;
+ assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
+ "Attempt to change alignment!");
+#endif
+
+ Attributes NewAttrs = OldAttrs | Attrs;
+ if (NewAttrs == OldAttrs)
+ return *this;
+
+ SmallVector<AttributeWithIndex, 8> NewAttrList;
+ if (AttrList == 0)
+ NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs));
+ else {
+ const SmallVector<AttributeWithIndex, 4> &OldAttrList = AttrList->Attrs;
+ unsigned i = 0, e = OldAttrList.size();
+ // Copy attributes for arguments before this one.
+ for (; i != e && OldAttrList[i].Index < Idx; ++i)
+ NewAttrList.push_back(OldAttrList[i]);
+
+ // If there are attributes already at this index, merge them in.
+ if (i != e && OldAttrList[i].Index == Idx) {
+ Attrs |= OldAttrList[i].Attrs;
+ ++i;
+ }
+
+ NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs));
+
+ // Copy attributes for arguments after this one.
+ NewAttrList.insert(NewAttrList.end(),
+ OldAttrList.begin()+i, OldAttrList.end());
+ }
+
+ return get(NewAttrList.data(), NewAttrList.size());
+}
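+// Example usage (illustrative): adding 'nest' to parameter #1 of an existing
+// list leaves the original untouched and returns the updated list:
+//   AttrListPtr NewPAL = PAL.addAttr(1, Attribute::Nest);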
+
+AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const {
+#ifndef NDEBUG
+ // FIXME it is not obvious how this should work for alignment.
+ // For now, say we can't pass in alignment, which no current use does.
+ assert(!(Attrs & Attribute::Alignment) && "Attempt to exclude alignment!");
+#endif
+ if (AttrList == 0) return AttrListPtr();
+
+ Attributes OldAttrs = getAttributes(Idx);
+ Attributes NewAttrs = OldAttrs & ~Attrs;
+ if (NewAttrs == OldAttrs)
+ return *this;
+
+ SmallVector<AttributeWithIndex, 8> NewAttrList;
+ const SmallVector<AttributeWithIndex, 4> &OldAttrList = AttrList->Attrs;
+ unsigned i = 0, e = OldAttrList.size();
+
+ // Copy attributes for arguments before this one.
+ for (; i != e && OldAttrList[i].Index < Idx; ++i)
+ NewAttrList.push_back(OldAttrList[i]);
+
+ // If there are attributes already at this index, merge them in.
+ assert(OldAttrList[i].Index == Idx && "Attribute isn't set?");
+ Attrs = OldAttrList[i].Attrs & ~Attrs;
+ ++i;
+ if (Attrs) // If any attributes left for this parameter, add them.
+ NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs));
+
+ // Copy attributes for arguments after this one.
+ NewAttrList.insert(NewAttrList.end(),
+ OldAttrList.begin()+i, OldAttrList.end());
+
+ return get(NewAttrList.data(), NewAttrList.size());
+}
+
+void AttrListPtr::dump() const {
+ cerr << "PAL[ ";
+ for (unsigned i = 0; i < getNumSlots(); ++i) {
+ const AttributeWithIndex &PAWI = getSlot(i);
+ cerr << "{" << PAWI.Index << "," << PAWI.Attrs << "} ";
+ }
+
+ cerr << "]\n";
+}
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
new file mode 100644
index 0000000..dd36607
--- /dev/null
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -0,0 +1,430 @@
+//===-- AutoUpgrade.cpp - Implement auto-upgrade helper functions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the auto-upgrade helper functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/AutoUpgrade.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/ADT/SmallVector.h"
+#include <cstring>
+using namespace llvm;
+
+
+static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
+ assert(F && "Illegal to upgrade a non-existent Function.");
+
+ // Get the Function's name.
+ const std::string& Name = F->getName();
+
+ // Convenience
+ const FunctionType *FTy = F->getFunctionType();
+
+  // Quickly eliminate it if it's not a candidate.
+ if (Name.length() <= 8 || Name[0] != 'l' || Name[1] != 'l' ||
+ Name[2] != 'v' || Name[3] != 'm' || Name[4] != '.')
+ return false;
+
+ Module *M = F->getParent();
+ switch (Name[5]) {
+ default: break;
+ case 'a':
+    // This upgrades the llvm.atomic.lcs, llvm.atomic.las, and llvm.atomic.lss
+    // intrinsics, as well as atomics with default address spaces, to their new
+    // function names (e.g. llvm.atomic.add.i32 => llvm.atomic.add.i32.p0i32).
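+    // As an illustrative example, llvm.atomic.las.i32 is renamed to
+    // llvm.atomic.load.add.i32.p0i32.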
+ if (Name.compare(5,7,"atomic.",7) == 0) {
+ if (Name.compare(12,3,"lcs",3) == 0) {
+ std::string::size_type delim = Name.find('.',12);
+ F->setName("llvm.atomic.cmp.swap" + Name.substr(delim) +
+ ".p0" + Name.substr(delim+1));
+ NewFn = F;
+ return true;
+ }
+ else if (Name.compare(12,3,"las",3) == 0) {
+ std::string::size_type delim = Name.find('.',12);
+ F->setName("llvm.atomic.load.add"+Name.substr(delim)
+ + ".p0" + Name.substr(delim+1));
+ NewFn = F;
+ return true;
+ }
+ else if (Name.compare(12,3,"lss",3) == 0) {
+ std::string::size_type delim = Name.find('.',12);
+ F->setName("llvm.atomic.load.sub"+Name.substr(delim)
+ + ".p0" + Name.substr(delim+1));
+ NewFn = F;
+ return true;
+ }
+ else if (Name.rfind(".p") == std::string::npos) {
+ // We don't have an address space qualifier so this has be upgraded
+ // to the new name. Copy the type name at the end of the intrinsic
+ // and add to it
+ std::string::size_type delim = Name.find_last_of('.');
+ assert(delim != std::string::npos && "can not find type");
+ F->setName(Name + ".p0" + Name.substr(delim+1));
+ NewFn = F;
+ return true;
+ }
+ }
+ break;
+ case 'b':
+ // This upgrades the name of the llvm.bswap intrinsic function to only use
+ // a single type name for overloading. We only care about the old format
+ // 'llvm.bswap.i*.i*', so check for 'bswap.' and then for there being
+ // a '.' after 'bswap.'
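+    // For example (illustrative), llvm.bswap.i32.i32 becomes llvm.bswap.i32.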
+ if (Name.compare(5,6,"bswap.",6) == 0) {
+ std::string::size_type delim = Name.find('.',11);
+
+ if (delim != std::string::npos) {
+ // Construct the new name as 'llvm.bswap' + '.i*'
+ F->setName(Name.substr(0,10)+Name.substr(delim));
+ NewFn = F;
+ return true;
+ }
+ }
+ break;
+
+ case 'c':
+ // We only want to fix the 'llvm.ct*' intrinsics which do not have the
+ // correct return type, so we check for the name, and then check if the
+ // return type does not match the parameter type.
+ if ( (Name.compare(5,5,"ctpop",5) == 0 ||
+ Name.compare(5,4,"ctlz",4) == 0 ||
+ Name.compare(5,4,"cttz",4) == 0) &&
+ FTy->getReturnType() != FTy->getParamType(0)) {
+ // We first need to change the name of the old (bad) intrinsic, because
+ // its type is incorrect, but we cannot overload that name. We
+ // arbitrarily unique it here allowing us to construct a correctly named
+ // and typed function below.
+ F->setName("");
+
+ // Now construct the new intrinsic with the correct name and type. We
+ // leave the old function around in order to query its type, whatever it
+ // may be, and correctly convert up to the new type.
+ NewFn = cast<Function>(M->getOrInsertFunction(Name,
+ FTy->getParamType(0),
+ FTy->getParamType(0),
+ (Type *)0));
+ return true;
+ }
+ break;
+
+ case 'p':
+ // This upgrades the llvm.part.select overloaded intrinsic names to only
+ // use one type specifier in the name. We only care about the old format
+    // 'llvm.part.select.i*.i*', which we handle the same way as bswap above.
+ if (Name.compare(5,12,"part.select.",12) == 0) {
+ std::string::size_type delim = Name.find('.',17);
+
+ if (delim != std::string::npos) {
+ // Construct a new name as 'llvm.part.select' + '.i*'
+ F->setName(Name.substr(0,16)+Name.substr(delim));
+ NewFn = F;
+ return true;
+ }
+ break;
+ }
+
+    // This upgrades the llvm.part.set intrinsics similarly to the above; however,
+ // we care about 'llvm.part.set.i*.i*.i*', but only the first two types
+ // must match. There is an additional type specifier after these two
+ // matching types that we must retain when upgrading. Thus, we require
+ // finding 2 periods, not just one, after the intrinsic name.
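+    // For example (illustrative), llvm.part.set.i32.i32.i16 becomes
+    // llvm.part.set.i32.i16.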
+ if (Name.compare(5,9,"part.set.",9) == 0) {
+ std::string::size_type delim = Name.find('.',14);
+
+ if (delim != std::string::npos &&
+ Name.find('.',delim+1) != std::string::npos) {
+        // Construct a new name as 'llvm.part.set' + '.i*.i*'
+ F->setName(Name.substr(0,13)+Name.substr(delim));
+ NewFn = F;
+ return true;
+ }
+ break;
+ }
+
+ break;
+ case 'x':
+ // This fixes all MMX shift intrinsic instructions to take a
+ // v1i64 instead of a v2i32 as the second parameter.
+ if (Name.compare(5,10,"x86.mmx.ps",10) == 0 &&
+ (Name.compare(13,4,"psll", 4) == 0 ||
+ Name.compare(13,4,"psra", 4) == 0 ||
+ Name.compare(13,4,"psrl", 4) == 0) && Name[17] != 'i') {
+
+ const llvm::Type *VT = VectorType::get(IntegerType::get(64), 1);
+
+ // We don't have to do anything if the parameter already has
+ // the correct type.
+ if (FTy->getParamType(1) == VT)
+ break;
+
+ // We first need to change the name of the old (bad) intrinsic, because
+ // its type is incorrect, but we cannot overload that name. We
+ // arbitrarily unique it here allowing us to construct a correctly named
+ // and typed function below.
+ F->setName("");
+
+ assert(FTy->getNumParams() == 2 && "MMX shift intrinsics take 2 args!");
+
+ // Now construct the new intrinsic with the correct name and type. We
+ // leave the old function around in order to query its type, whatever it
+ // may be, and correctly convert up to the new type.
+ NewFn = cast<Function>(M->getOrInsertFunction(Name,
+ FTy->getReturnType(),
+ FTy->getParamType(0),
+ VT,
+ (Type *)0));
+ return true;
+ } else if (Name.compare(5,17,"x86.sse2.loadh.pd",17) == 0 ||
+ Name.compare(5,17,"x86.sse2.loadl.pd",17) == 0 ||
+ Name.compare(5,16,"x86.sse2.movl.dq",16) == 0 ||
+ Name.compare(5,15,"x86.sse2.movs.d",15) == 0 ||
+ Name.compare(5,16,"x86.sse2.shuf.pd",16) == 0 ||
+ Name.compare(5,18,"x86.sse2.unpckh.pd",18) == 0 ||
+ Name.compare(5,18,"x86.sse2.unpckl.pd",18) == 0 ||
+ Name.compare(5,20,"x86.sse2.punpckh.qdq",20) == 0 ||
+ Name.compare(5,20,"x86.sse2.punpckl.qdq",20) == 0) {
+ // Calls to these intrinsics are transformed into ShuffleVector's.
+ NewFn = 0;
+ return true;
+ }
+
+ break;
+ }
+
+ // This may not belong here. This function is effectively being overloaded
+ // to both detect an intrinsic which needs upgrading, and to provide the
+ // upgraded form of the intrinsic. We should perhaps have two separate
+ // functions for this.
+ return false;
+}
+
+bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
+ NewFn = 0;
+ bool Upgraded = UpgradeIntrinsicFunction1(F, NewFn);
+
+ // Upgrade intrinsic attributes. This does not change the function.
+ if (NewFn)
+ F = NewFn;
+ if (unsigned id = F->getIntrinsicID())
+ F->setAttributes(Intrinsic::getAttributes((Intrinsic::ID)id));
+ return Upgraded;
+}
+
+// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call to
+// the upgraded intrinsic.  All argument and return casting must be provided
+// in order to seamlessly integrate with existing context.
+void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
+ Function *F = CI->getCalledFunction();
+ assert(F && "CallInst has no function associated with it.");
+
+ if (!NewFn) {
+ bool isLoadH = false, isLoadL = false, isMovL = false;
+ bool isMovSD = false, isShufPD = false;
+ bool isUnpckhPD = false, isUnpcklPD = false;
+ bool isPunpckhQPD = false, isPunpcklQPD = false;
+ if (strcmp(F->getNameStart(), "llvm.x86.sse2.loadh.pd") == 0)
+ isLoadH = true;
+ else if (strcmp(F->getNameStart(), "llvm.x86.sse2.loadl.pd") == 0)
+ isLoadL = true;
+ else if (strcmp(F->getNameStart(), "llvm.x86.sse2.movl.dq") == 0)
+ isMovL = true;
+ else if (strcmp(F->getNameStart(), "llvm.x86.sse2.movs.d") == 0)
+ isMovSD = true;
+ else if (strcmp(F->getNameStart(), "llvm.x86.sse2.shuf.pd") == 0)
+ isShufPD = true;
+ else if (strcmp(F->getNameStart(), "llvm.x86.sse2.unpckh.pd") == 0)
+ isUnpckhPD = true;
+ else if (strcmp(F->getNameStart(), "llvm.x86.sse2.unpckl.pd") == 0)
+ isUnpcklPD = true;
+ else if (strcmp(F->getNameStart(), "llvm.x86.sse2.punpckh.qdq") == 0)
+ isPunpckhQPD = true;
+ else if (strcmp(F->getNameStart(), "llvm.x86.sse2.punpckl.qdq") == 0)
+ isPunpcklQPD = true;
+
+ if (isLoadH || isLoadL || isMovL || isMovSD || isShufPD ||
+ isUnpckhPD || isUnpcklPD || isPunpckhQPD || isPunpcklQPD) {
+ std::vector<Constant*> Idxs;
+ Value *Op0 = CI->getOperand(1);
+ ShuffleVectorInst *SI = NULL;
+ if (isLoadH || isLoadL) {
+ Value *Op1 = UndefValue::get(Op0->getType());
+ Value *Addr = new BitCastInst(CI->getOperand(2),
+ PointerType::getUnqual(Type::DoubleTy),
+ "upgraded.", CI);
+ Value *Load = new LoadInst(Addr, "upgraded.", false, 8, CI);
+ Value *Idx = ConstantInt::get(Type::Int32Ty, 0);
+ Op1 = InsertElementInst::Create(Op1, Load, Idx, "upgraded.", CI);
+
+ if (isLoadH) {
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 0));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 2));
+ } else {
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 2));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 1));
+ }
+ Value *Mask = ConstantVector::get(Idxs);
+ SI = new ShuffleVectorInst(Op0, Op1, Mask, "upgraded.", CI);
+ } else if (isMovL) {
+ Constant *Zero = ConstantInt::get(Type::Int32Ty, 0);
+ Idxs.push_back(Zero);
+ Idxs.push_back(Zero);
+ Idxs.push_back(Zero);
+ Idxs.push_back(Zero);
+ Value *ZeroV = ConstantVector::get(Idxs);
+
+ Idxs.clear();
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 4));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 5));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 2));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 3));
+ Value *Mask = ConstantVector::get(Idxs);
+ SI = new ShuffleVectorInst(ZeroV, Op0, Mask, "upgraded.", CI);
+ } else if (isMovSD ||
+ isUnpckhPD || isUnpcklPD || isPunpckhQPD || isPunpcklQPD) {
+ Value *Op1 = CI->getOperand(2);
+ if (isMovSD) {
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 2));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 1));
+ } else if (isUnpckhPD || isPunpckhQPD) {
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 1));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 3));
+ } else {
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 0));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, 2));
+ }
+ Value *Mask = ConstantVector::get(Idxs);
+ SI = new ShuffleVectorInst(Op0, Op1, Mask, "upgraded.", CI);
+ } else if (isShufPD) {
+ Value *Op1 = CI->getOperand(2);
+ unsigned MaskVal = cast<ConstantInt>(CI->getOperand(3))->getZExtValue();
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, MaskVal & 1));
+ Idxs.push_back(ConstantInt::get(Type::Int32Ty, ((MaskVal >> 1) & 1)+2));
+ Value *Mask = ConstantVector::get(Idxs);
+ SI = new ShuffleVectorInst(Op0, Op1, Mask, "upgraded.", CI);
+ }
+
+ assert(SI && "Unexpected!");
+
+ // Handle any uses of the old CallInst.
+ if (!CI->use_empty())
+ // Replace all uses of the old call with the new cast which has the
+ // correct type.
+ CI->replaceAllUsesWith(SI);
+
+ // Clean up the old call now that it has been completely upgraded.
+ CI->eraseFromParent();
+ } else {
+ assert(0 && "Unknown function for CallInst upgrade.");
+ }
+ return;
+ }
+
+ switch (NewFn->getIntrinsicID()) {
+ default: assert(0 && "Unknown function for CallInst upgrade.");
+ case Intrinsic::x86_mmx_psll_d:
+ case Intrinsic::x86_mmx_psll_q:
+ case Intrinsic::x86_mmx_psll_w:
+ case Intrinsic::x86_mmx_psra_d:
+ case Intrinsic::x86_mmx_psra_w:
+ case Intrinsic::x86_mmx_psrl_d:
+ case Intrinsic::x86_mmx_psrl_q:
+ case Intrinsic::x86_mmx_psrl_w: {
+ Value *Operands[2];
+
+ Operands[0] = CI->getOperand(1);
+
+ // Cast the second parameter to the correct type.
+ BitCastInst *BC = new BitCastInst(CI->getOperand(2),
+ NewFn->getFunctionType()->getParamType(1),
+ "upgraded.", CI);
+ Operands[1] = BC;
+
+ // Construct a new CallInst
+ CallInst *NewCI = CallInst::Create(NewFn, Operands, Operands+2,
+ "upgraded."+CI->getName(), CI);
+ NewCI->setTailCall(CI->isTailCall());
+ NewCI->setCallingConv(CI->getCallingConv());
+
+ // Handle any uses of the old CallInst.
+ if (!CI->use_empty())
+ // Replace all uses of the old call with the new cast which has the
+ // correct type.
+ CI->replaceAllUsesWith(NewCI);
+
+ // Clean up the old call now that it has been completely upgraded.
+ CI->eraseFromParent();
+ break;
+ }
+ case Intrinsic::ctlz:
+ case Intrinsic::ctpop:
+ case Intrinsic::cttz: {
+ // Build a small vector of the 1..(N-1) operands, which are the
+ // parameters.
+ SmallVector<Value*, 8> Operands(CI->op_begin()+1, CI->op_end());
+
+ // Construct a new CallInst
+ CallInst *NewCI = CallInst::Create(NewFn, Operands.begin(), Operands.end(),
+ "upgraded."+CI->getName(), CI);
+ NewCI->setTailCall(CI->isTailCall());
+ NewCI->setCallingConv(CI->getCallingConv());
+
+ // Handle any uses of the old CallInst.
+ if (!CI->use_empty()) {
+ // Check for sign extend parameter attributes on the return values.
+ bool SrcSExt = NewFn->getAttributes().paramHasAttr(0, Attribute::SExt);
+ bool DestSExt = F->getAttributes().paramHasAttr(0, Attribute::SExt);
+
+ // Construct an appropriate cast from the new return type to the old.
+ CastInst *RetCast = CastInst::Create(
+ CastInst::getCastOpcode(NewCI, SrcSExt,
+ F->getReturnType(),
+ DestSExt),
+ NewCI, F->getReturnType(),
+ NewCI->getName(), CI);
+ NewCI->moveBefore(RetCast);
+
+ // Replace all uses of the old call with the new cast which has the
+ // correct type.
+ CI->replaceAllUsesWith(RetCast);
+ }
+
+ // Clean up the old call now that it has been completely upgraded.
+ CI->eraseFromParent();
+ }
+ break;
+ }
+}
+
+// This tests each Function to determine if it needs upgrading. When we find
+// one we are interested in, we then upgrade all calls to reflect the new
+// function.
+void llvm::UpgradeCallsToIntrinsic(Function* F) {
+ assert(F && "Illegal attempt to upgrade a non-existent intrinsic.");
+
+  // Upgrade the function and check if it is a totally new function.
+ Function* NewFn;
+ if (UpgradeIntrinsicFunction(F, NewFn)) {
+ if (NewFn != F) {
+ // Replace all uses to the old function with the new one if necessary.
+ for (Value::use_iterator UI = F->use_begin(), UE = F->use_end();
+ UI != UE; ) {
+ if (CallInst* CI = dyn_cast<CallInst>(*UI++))
+ UpgradeIntrinsicCall(CI, NewFn);
+ }
+ // Remove old function, no longer used, from the module.
+ F->eraseFromParent();
+ }
+ }
+}
diff --git a/lib/VMCore/BasicBlock.cpp b/lib/VMCore/BasicBlock.cpp
new file mode 100644
index 0000000..3065766
--- /dev/null
+++ b/lib/VMCore/BasicBlock.cpp
@@ -0,0 +1,274 @@
+//===-- BasicBlock.cpp - Implement BasicBlock related methods -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BasicBlock class for the VMCore library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/Compiler.h"
+#include "SymbolTableListTraitsImpl.h"
+#include <algorithm>
+using namespace llvm;
+
+ValueSymbolTable *BasicBlock::getValueSymbolTable() {
+ if (Function *F = getParent())
+ return &F->getValueSymbolTable();
+ return 0;
+}
+
+// Explicit instantiation of SymbolTableListTraits since some of the methods
+// are not in the public header file...
+template class SymbolTableListTraits<Instruction, BasicBlock>;
+
+
+BasicBlock::BasicBlock(const std::string &Name, Function *NewParent,
+ BasicBlock *InsertBefore)
+ : Value(Type::LabelTy, Value::BasicBlockVal), Parent(0) {
+
+ // Make sure that we get added to a function
+ LeakDetector::addGarbageObject(this);
+
+ if (InsertBefore) {
+ assert(NewParent &&
+ "Cannot insert block before another block with no function!");
+ NewParent->getBasicBlockList().insert(InsertBefore, this);
+ } else if (NewParent) {
+ NewParent->getBasicBlockList().push_back(this);
+ }
+
+ setName(Name);
+}
+
+
+BasicBlock::~BasicBlock() {
+ assert(getParent() == 0 && "BasicBlock still linked into the program!");
+ dropAllReferences();
+ InstList.clear();
+}
+
+void BasicBlock::setParent(Function *parent) {
+ if (getParent())
+ LeakDetector::addGarbageObject(this);
+
+ // Set Parent=parent, updating instruction symtab entries as appropriate.
+ InstList.setSymTabObject(&Parent, parent);
+
+ if (getParent())
+ LeakDetector::removeGarbageObject(this);
+}
+
+void BasicBlock::removeFromParent() {
+ getParent()->getBasicBlockList().remove(this);
+}
+
+void BasicBlock::eraseFromParent() {
+ getParent()->getBasicBlockList().erase(this);
+}
+
+/// moveBefore - Unlink this basic block from its current function and
+/// insert it into the function that MovePos lives in, right before MovePos.
+void BasicBlock::moveBefore(BasicBlock *MovePos) {
+ MovePos->getParent()->getBasicBlockList().splice(MovePos,
+ getParent()->getBasicBlockList(), this);
+}
+
+/// moveAfter - Unlink this basic block from its current function and
+/// insert it into the function that MovePos lives in, right after MovePos.
+void BasicBlock::moveAfter(BasicBlock *MovePos) {
+ Function::iterator I = MovePos;
+ MovePos->getParent()->getBasicBlockList().splice(++I,
+ getParent()->getBasicBlockList(), this);
+}
+
+
+TerminatorInst *BasicBlock::getTerminator() {
+ if (InstList.empty()) return 0;
+ return dyn_cast<TerminatorInst>(&InstList.back());
+}
+
+const TerminatorInst *BasicBlock::getTerminator() const {
+ if (InstList.empty()) return 0;
+ return dyn_cast<TerminatorInst>(&InstList.back());
+}
+
+Instruction* BasicBlock::getFirstNonPHI() {
+ BasicBlock::iterator i = begin();
+ // All valid basic blocks should have a terminator,
+ // which is not a PHINode. If we have an invalid basic
+ // block we'll get an assertion failure when dereferencing
+ // a past-the-end iterator.
+ while (isa<PHINode>(i)) ++i;
+ return &*i;
+}
+
+void BasicBlock::dropAllReferences() {
+  for (iterator I = begin(), E = end(); I != E; ++I)
+ I->dropAllReferences();
+}
+
+/// getSinglePredecessor - If this basic block has a single predecessor block,
+/// return the block, otherwise return a null pointer.
+BasicBlock *BasicBlock::getSinglePredecessor() {
+ pred_iterator PI = pred_begin(this), E = pred_end(this);
+ if (PI == E) return 0; // No preds.
+ BasicBlock *ThePred = *PI;
+ ++PI;
+ return (PI == E) ? ThePred : 0 /*multiple preds*/;
+}
+
+/// getUniquePredecessor - If this basic block has a unique predecessor block,
+/// return the block, otherwise return a null pointer.
+/// Note that unique predecessor doesn't mean single edge, there can be
+/// multiple edges from the unique predecessor to this block (for example
+/// a switch statement with multiple cases having the same destination).
+BasicBlock *BasicBlock::getUniquePredecessor() {
+ pred_iterator PI = pred_begin(this), E = pred_end(this);
+ if (PI == E) return 0; // No preds.
+ BasicBlock *PredBB = *PI;
+ ++PI;
+ for (;PI != E; ++PI) {
+ if (*PI != PredBB)
+ return 0;
+ // The same predecessor appears multiple times in the predecessor list.
+ // This is OK.
+ }
+ return PredBB;
+}
+
+/// removePredecessor - This method is used to notify a BasicBlock that the
+/// specified Predecessor of the block is no longer able to reach it. This is
+/// not used to update the predecessor list itself; rather, it updates the
+/// PHI nodes that reside in the block.  Note that this should be
+/// called while the predecessor still refers to this block.
+///
+void BasicBlock::removePredecessor(BasicBlock *Pred,
+ bool DontDeleteUselessPHIs) {
+ assert((hasNUsesOrMore(16)||// Reduce cost of this assertion for complex CFGs.
+ find(pred_begin(this), pred_end(this), Pred) != pred_end(this)) &&
+ "removePredecessor: BB is not a predecessor!");
+
+ if (InstList.empty()) return;
+ PHINode *APN = dyn_cast<PHINode>(&front());
+ if (!APN) return; // Quick exit.
+
+ // If there are exactly two predecessors, then we want to nuke the PHI nodes
+  // altogether.  However, we cannot do this in this case:
+ //
+ // Loop:
+ // %x = phi [X, Loop]
+ // %x2 = add %x, 1 ;; This would become %x2 = add %x2, 1
+ // br Loop ;; %x2 does not dominate all uses
+ //
+ // This is because the PHI node input is actually taken from the predecessor
+ // basic block. The only case this can happen is with a self loop, so we
+ // check for this case explicitly now.
+ //
+ unsigned max_idx = APN->getNumIncomingValues();
+ assert(max_idx != 0 && "PHI Node in block with 0 predecessors!?!?!");
+ if (max_idx == 2) {
+ BasicBlock *Other = APN->getIncomingBlock(APN->getIncomingBlock(0) == Pred);
+
+ // Disable PHI elimination!
+ if (this == Other) max_idx = 3;
+ }
+
+ // <= Two predecessors BEFORE I remove one?
+ if (max_idx <= 2 && !DontDeleteUselessPHIs) {
+ // Yup, loop through and nuke the PHI nodes
+ while (PHINode *PN = dyn_cast<PHINode>(&front())) {
+ // Remove the predecessor first.
+ PN->removeIncomingValue(Pred, !DontDeleteUselessPHIs);
+
+      // If the PHI _HAD_ two incoming values, replace it with its now *single* value
+ if (max_idx == 2) {
+ if (PN->getOperand(0) != PN)
+ PN->replaceAllUsesWith(PN->getOperand(0));
+ else
+ // We are left with an infinite loop with no entries: kill the PHI.
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ getInstList().pop_front(); // Remove the PHI node
+ }
+
+ // If the PHI node already only had one entry, it got deleted by
+ // removeIncomingValue.
+ }
+ } else {
+    // Okay, now we know that we need to remove predecessor Pred from all
+    // PHI nodes.  Iterate over each PHI node, fixing them up.
+ PHINode *PN;
+ for (iterator II = begin(); (PN = dyn_cast<PHINode>(II)); ) {
+ ++II;
+ PN->removeIncomingValue(Pred, false);
+ // If all incoming values to the Phi are the same, we can replace the Phi
+ // with that value.
+ Value* PNV = 0;
+ if (!DontDeleteUselessPHIs && (PNV = PN->hasConstantValue())) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+ }
+}
+
+
+/// splitBasicBlock - This splits a basic block into two at the specified
+/// instruction. Note that all instructions BEFORE the specified iterator stay
+/// as part of the original basic block, an unconditional branch is added to
+/// the new BB, and the rest of the instructions in the BB are moved to the new
+/// BB, including the old terminator. This invalidates the iterator.
+///
+/// Note that this only works on well formed basic blocks (must have a
+/// terminator), and 'I' must not be the end of instruction list (which would
+/// cause a degenerate basic block to be formed, having a terminator inside of
+/// the basic block).
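+///
+/// As an illustrative example (hypothetical IR), splitting before the store in:
+///   bb:       %x = add i32 %a, 1
+///             store i32 %x, i32* %p
+///             ret void
+/// yields:
+///   bb:       %x = add i32 %a, 1
+///             br label %bb.split
+///   bb.split: store i32 %x, i32* %p
+///             ret void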
+///
+BasicBlock *BasicBlock::splitBasicBlock(iterator I, const std::string &BBName) {
+ assert(getTerminator() && "Can't use splitBasicBlock on degenerate BB!");
+ assert(I != InstList.end() &&
+ "Trying to get me to create degenerate basic block!");
+
+ BasicBlock *InsertBefore = next(Function::iterator(this))
+ .getNodePtrUnchecked();
+ BasicBlock *New = BasicBlock::Create(BBName, getParent(), InsertBefore);
+
+ // Move all of the specified instructions from the original basic block into
+ // the new basic block.
+ New->getInstList().splice(New->end(), this->getInstList(), I, end());
+
+ // Add a branch instruction to the newly formed basic block.
+ BranchInst::Create(New, this);
+
+ // Now we must loop through all of the successors of the New block (which
+ // _were_ the successors of the 'this' block), and update any PHI nodes in
+ // successors. If there were PHI nodes in the successors, then they need to
+ // know that incoming branches will be from New, not from Old.
+ //
+ for (succ_iterator I = succ_begin(New), E = succ_end(New); I != E; ++I) {
+ // Loop over any phi nodes in the basic block, updating the BB field of
+ // incoming values...
+ BasicBlock *Successor = *I;
+ PHINode *PN;
+ for (BasicBlock::iterator II = Successor->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ int IDX = PN->getBasicBlockIndex(this);
+ while (IDX != -1) {
+ PN->setIncomingBlock((unsigned)IDX, New);
+ IDX = PN->getBasicBlockIndex(this);
+ }
+ }
+ }
+ return New;
+}
diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt
new file mode 100644
index 0000000..d78e093
--- /dev/null
+++ b/lib/VMCore/CMakeLists.txt
@@ -0,0 +1,30 @@
+add_llvm_library(LLVMCore
+ AsmWriter.cpp
+ Attributes.cpp
+ AutoUpgrade.cpp
+ BasicBlock.cpp
+ ConstantFold.cpp
+ Constants.cpp
+ Core.cpp
+ Dominators.cpp
+ Function.cpp
+ Globals.cpp
+ InlineAsm.cpp
+ Instruction.cpp
+ Instructions.cpp
+ IntrinsicInst.cpp
+ LeakDetector.cpp
+ Mangler.cpp
+ Module.cpp
+ ModuleProvider.cpp
+ Pass.cpp
+ PassManager.cpp
+ PrintModulePass.cpp
+ Type.cpp
+ TypeSymbolTable.cpp
+ Use.cpp
+ Value.cpp
+ ValueSymbolTable.cpp
+ ValueTypes.cpp
+ Verifier.cpp
+ )
diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp
new file mode 100644
index 0000000..7e4902f
--- /dev/null
+++ b/lib/VMCore/ConstantFold.cpp
@@ -0,0 +1,1681 @@
+//===- ConstantFold.cpp - LLVM constant folder ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements folding of constants for LLVM. This implements the
+// (internal) ConstantFold.h interface, which is used by the
+// ConstantExpr::get* methods to automatically fold constants when possible.
+//
+// Constant folding is currently implemented in two pieces: the
+// template-based folder for simple primitive constants like ConstantInt, and
+// the special case hackery that we use to symbolically evaluate expressions
+// that use ConstantExprs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ConstantFold.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include <limits>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// ConstantFold*Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+/// BitCastConstantVector - Convert the specified ConstantVector node to the
+/// specified vector type. At this point, we know that the elements of the
+/// input vector constant are all simple integer or FP values.
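+///
+/// For example (illustrative): bitcasting <2 x i32> <i32 1, i32 2> to
+/// <2 x float> folds each element with ConstantExpr::getBitCast; a cast that
+/// changes the element count (e.g. <2 x i32> to <4 x i16>) returns null here,
+/// since folding it would require endianness information.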
+static Constant *BitCastConstantVector(ConstantVector *CV,
+ const VectorType *DstTy) {
+ // If this cast changes element count then we can't handle it here:
+ // doing so requires endianness information. This should be handled by
+ // Analysis/ConstantFolding.cpp
+ unsigned NumElts = DstTy->getNumElements();
+ if (NumElts != CV->getNumOperands())
+ return 0;
+
+ // Verify that all elements of the input are simple.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!isa<ConstantInt>(CV->getOperand(i)) &&
+ !isa<ConstantFP>(CV->getOperand(i)))
+ return 0;
+ }
+
+ // Bitcast each element now.
+ std::vector<Constant*> Result;
+ const Type *DstEltTy = DstTy->getElementType();
+ for (unsigned i = 0; i != NumElts; ++i)
+ Result.push_back(ConstantExpr::getBitCast(CV->getOperand(i), DstEltTy));
+ return ConstantVector::get(Result);
+}
+
+/// This function determines which opcode to use to fold two constant cast
+/// expressions together. It uses CastInst::isEliminableCastPair to determine
+/// the opcode. Consequently it's just a wrapper around that function.
+/// @brief Determine if it is valid to fold a cast of a cast
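+///
+/// For example (illustrative), zext i8 to i32 followed by trunc i32 to i16
+/// collapses to a single zext i8 to i16; when the pair cannot be collapsed,
+/// isEliminableCastPair (and therefore this function) returns 0.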
+static unsigned
+foldConstantCastPair(
+ unsigned opc, ///< opcode of the second cast constant expression
+ const ConstantExpr*Op, ///< the first cast constant expression
+ const Type *DstTy ///< destination type of the first cast
+) {
+ assert(Op && Op->isCast() && "Can't fold cast of cast without a cast!");
+ assert(DstTy && DstTy->isFirstClassType() && "Invalid cast destination type");
+ assert(CastInst::isCast(opc) && "Invalid cast opcode");
+
+ // Get the types and opcodes for the two Cast constant expressions.
+ const Type *SrcTy = Op->getOperand(0)->getType();
+ const Type *MidTy = Op->getType();
+ Instruction::CastOps firstOp = Instruction::CastOps(Op->getOpcode());
+ Instruction::CastOps secondOp = Instruction::CastOps(opc);
+
+ // Let CastInst::isEliminableCastPair do the heavy lifting.
+ return CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy,
+ Type::Int64Ty);
+}
+
+static Constant *FoldBitCast(Constant *V, const Type *DestTy) {
+ const Type *SrcTy = V->getType();
+ if (SrcTy == DestTy)
+ return V; // no-op cast
+
+ // Check to see if we are casting a pointer to an aggregate to a pointer to
+ // the first element. If so, return the appropriate GEP instruction.
+ if (const PointerType *PTy = dyn_cast<PointerType>(V->getType()))
+ if (const PointerType *DPTy = dyn_cast<PointerType>(DestTy))
+ if (PTy->getAddressSpace() == DPTy->getAddressSpace()) {
+ SmallVector<Value*, 8> IdxList;
+ IdxList.push_back(Constant::getNullValue(Type::Int32Ty));
+ const Type *ElTy = PTy->getElementType();
+ while (ElTy != DPTy->getElementType()) {
+ if (const StructType *STy = dyn_cast<StructType>(ElTy)) {
+ if (STy->getNumElements() == 0) break;
+ ElTy = STy->getElementType(0);
+ IdxList.push_back(Constant::getNullValue(Type::Int32Ty));
+ } else if (const SequentialType *STy =
+ dyn_cast<SequentialType>(ElTy)) {
+ if (isa<PointerType>(ElTy)) break; // Can't index into pointers!
+ ElTy = STy->getElementType();
+ IdxList.push_back(IdxList[0]);
+ } else {
+ break;
+ }
+ }
+
+ if (ElTy == DPTy->getElementType())
+ return ConstantExpr::getGetElementPtr(V, &IdxList[0], IdxList.size());
+ }
+
+ // Handle casts from one vector constant to another. We know that the src
+ // and dest type have the same size (otherwise it's an illegal cast).
+ if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
+ if (const VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
+ assert(DestPTy->getBitWidth() == SrcTy->getBitWidth() &&
+ "Not cast between same sized vectors!");
+ SrcTy = NULL;
+ // First, check for null. Undef is already handled.
+ if (isa<ConstantAggregateZero>(V))
+ return Constant::getNullValue(DestTy);
+
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(V))
+ return BitCastConstantVector(CV, DestPTy);
+ }
+
+ // Canonicalize scalar-to-vector bitcasts into vector-to-vector bitcasts
+ // This allows for other simplifications (although some of them
+ // can only be handled by Analysis/ConstantFolding.cpp).
+ if (isa<ConstantInt>(V) || isa<ConstantFP>(V))
+ return ConstantExpr::getBitCast(ConstantVector::get(&V, 1), DestPTy);
+ }
+
+ // Finally, implement bitcast folding now. The code below doesn't handle
+ // every bitcast correctly; only the simple scalar and pointer cases are
+ // folded here.
+ if (isa<ConstantPointerNull>(V)) // ptr->ptr cast.
+ return ConstantPointerNull::get(cast<PointerType>(DestTy));
+
+ // Handle integral constant input.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ if (DestTy->isInteger())
+ // Integral -> Integral. This is a no-op because the bit widths must
+ // be the same. Consequently, we just fold to V.
+ return V;
+
+ if (DestTy->isFloatingPoint())
+ return ConstantFP::get(APFloat(CI->getValue(),
+ DestTy != Type::PPC_FP128Ty));
+
+ // Otherwise, can't fold this (vector?)
+ return 0;
+ }
+
+ // Handle ConstantFP input.
+ if (const ConstantFP *FP = dyn_cast<ConstantFP>(V))
+ // FP -> Integral.
+ return ConstantInt::get(FP->getValueAPF().bitcastToAPInt());
+
+ return 0;
+}
+
+
+Constant *llvm::ConstantFoldCastInstruction(unsigned opc, const Constant *V,
+ const Type *DestTy) {
+ if (isa<UndefValue>(V)) {
+ // zext(undef) = 0, because the top bits will be zero.
+ // sext(undef) = 0, because the top bits will all be the same.
+ // [us]itofp(undef) = 0, because the result value is bounded.
+ if (opc == Instruction::ZExt || opc == Instruction::SExt ||
+ opc == Instruction::UIToFP || opc == Instruction::SIToFP)
+ return Constant::getNullValue(DestTy);
+ return UndefValue::get(DestTy);
+ }
+ // No compile-time operations on this type yet.
+ if (V->getType() == Type::PPC_FP128Ty || DestTy == Type::PPC_FP128Ty)
+ return 0;
+
+ // If the cast operand is a constant expression, there are a few things we can
+ // do to try to simplify it.
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->isCast()) {
+ // Try hard to fold cast of cast because they are often eliminable.
+ if (unsigned newOpc = foldConstantCastPair(opc, CE, DestTy))
+ return ConstantExpr::getCast(newOpc, CE->getOperand(0), DestTy);
+ } else if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // If all of the indexes in the GEP are null values, there is no pointer
+ // adjustment going on. We might as well cast the source pointer.
+ bool isAllNull = true;
+ for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+ if (!CE->getOperand(i)->isNullValue()) {
+ isAllNull = false;
+ break;
+ }
+ if (isAllNull)
+ // This is casting one pointer type to another, always BitCast
+ return ConstantExpr::getPointerCast(CE->getOperand(0), DestTy);
+ }
+ }
+
+ // We actually have to do a cast now. Perform the cast according to the
+ // opcode specified.
+ switch (opc) {
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ if (const ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
+ bool ignored;
+ APFloat Val = FPC->getValueAPF();
+ Val.convert(DestTy == Type::FloatTy ? APFloat::IEEEsingle :
+ DestTy == Type::DoubleTy ? APFloat::IEEEdouble :
+ DestTy == Type::X86_FP80Ty ? APFloat::x87DoubleExtended :
+ DestTy == Type::FP128Ty ? APFloat::IEEEquad :
+ APFloat::Bogus,
+ APFloat::rmNearestTiesToEven, &ignored);
+ return ConstantFP::get(Val);
+ }
+ return 0; // Can't fold.
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ if (const ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
+ const APFloat &V = FPC->getValueAPF();
+ bool ignored;
+ uint64_t x[2];
+ uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+ (void) V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
+ APFloat::rmTowardZero, &ignored);
+ APInt Val(DestBitWidth, 2, x);
+ return ConstantInt::get(Val);
+ }
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
+ std::vector<Constant*> res;
+ const VectorType *DestVecTy = cast<VectorType>(DestTy);
+ const Type *DstEltTy = DestVecTy->getElementType();
+ for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
+ res.push_back(ConstantExpr::getCast(opc, CV->getOperand(i), DstEltTy));
+ return ConstantVector::get(DestVecTy, res);
+ }
+ return 0; // Can't fold.
+ case Instruction::IntToPtr: // always treated as unsigned
+ if (V->isNullValue()) // Is it an integral null value?
+ return ConstantPointerNull::get(cast<PointerType>(DestTy));
+ return 0; // Other integral constants cannot be folded
+ case Instruction::PtrToInt: // always treated as unsigned
+ if (V->isNullValue()) // Is it a null pointer value?
+ return ConstantInt::get(DestTy, 0);
+ return 0; // Other pointer constants cannot be folded
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ APInt api = CI->getValue();
+ const uint64_t zero[] = {0, 0};
+ APFloat apf = APFloat(APInt(DestTy->getPrimitiveSizeInBits(),
+ 2, zero));
+ (void)apf.convertFromAPInt(api,
+ opc==Instruction::SIToFP,
+ APFloat::rmNearestTiesToEven);
+ return ConstantFP::get(apf);
+ }
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
+ std::vector<Constant*> res;
+ const VectorType *DestVecTy = cast<VectorType>(DestTy);
+ const Type *DstEltTy = DestVecTy->getElementType();
+ for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
+ res.push_back(ConstantExpr::getCast(opc, CV->getOperand(i), DstEltTy));
+ return ConstantVector::get(DestVecTy, res);
+ }
+ return 0;
+ case Instruction::ZExt:
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ uint32_t BitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+ APInt Result(CI->getValue());
+ Result.zext(BitWidth);
+ return ConstantInt::get(Result);
+ }
+ return 0;
+ case Instruction::SExt:
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ uint32_t BitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+ APInt Result(CI->getValue());
+ Result.sext(BitWidth);
+ return ConstantInt::get(Result);
+ }
+ return 0;
+ case Instruction::Trunc:
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ uint32_t BitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+ APInt Result(CI->getValue());
+ Result.trunc(BitWidth);
+ return ConstantInt::get(Result);
+ }
+ return 0;
+ case Instruction::BitCast:
+ return FoldBitCast(const_cast<Constant*>(V), DestTy);
+ default:
+ assert(!"Invalid CE CastInst opcode");
+ break;
+ }
+
+ assert(0 && "Failed to cast constant expression");
+ return 0;
+}
+
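+// Illustrative examples for the select folding below: select i1 true, i32 7,
+// i32 9 folds to i32 7, and an undef condition folds to the first operand
+// (either choice is legal when the condition is undefined).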
+Constant *llvm::ConstantFoldSelectInstruction(const Constant *Cond,
+ const Constant *V1,
+ const Constant *V2) {
+ if (const ConstantInt *CB = dyn_cast<ConstantInt>(Cond))
+ return const_cast<Constant*>(CB->getZExtValue() ? V1 : V2);
+
+ if (isa<UndefValue>(V1)) return const_cast<Constant*>(V2);
+ if (isa<UndefValue>(V2)) return const_cast<Constant*>(V1);
+ if (isa<UndefValue>(Cond)) return const_cast<Constant*>(V1);
+ if (V1 == V2) return const_cast<Constant*>(V1);
+ return 0;
+}
+
+Constant *llvm::ConstantFoldExtractElementInstruction(const Constant *Val,
+ const Constant *Idx) {
+ if (isa<UndefValue>(Val)) // ee(undef, x) -> undef
+ return UndefValue::get(cast<VectorType>(Val->getType())->getElementType());
+ if (Val->isNullValue()) // ee(zero, x) -> zero
+ return Constant::getNullValue(
+ cast<VectorType>(Val->getType())->getElementType());
+
+ if (const ConstantVector *CVal = dyn_cast<ConstantVector>(Val)) {
+ if (const ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx)) {
+ return CVal->getOperand(CIdx->getZExtValue());
+ } else if (isa<UndefValue>(Idx)) {
+ // ee({w,x,y,z}, undef) -> w (an arbitrary value).
+ return CVal->getOperand(0);
+ }
+ }
+ return 0;
+}
+
+Constant *llvm::ConstantFoldInsertElementInstruction(const Constant *Val,
+ const Constant *Elt,
+ const Constant *Idx) {
+ const ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx);
+ if (!CIdx) return 0;
+ APInt idxVal = CIdx->getValue();
+ if (isa<UndefValue>(Val)) {
+ // Insertion of scalar constant into vector undef
+ // Optimize away insertion of undef
+ if (isa<UndefValue>(Elt))
+ return const_cast<Constant*>(Val);
+ // Otherwise break the aggregate undef into multiple undefs and do
+ // the insertion
+ unsigned numOps =
+ cast<VectorType>(Val->getType())->getNumElements();
+ std::vector<Constant*> Ops;
+ Ops.reserve(numOps);
+ for (unsigned i = 0; i < numOps; ++i) {
+ const Constant *Op =
+ (idxVal == i) ? Elt : UndefValue::get(Elt->getType());
+ Ops.push_back(const_cast<Constant*>(Op));
+ }
+ return ConstantVector::get(Ops);
+ }
+ if (isa<ConstantAggregateZero>(Val)) {
+ // Insertion of scalar constant into vector aggregate zero
+ // Optimize away insertion of zero
+ if (Elt->isNullValue())
+ return const_cast<Constant*>(Val);
+ // Otherwise break the aggregate zero into multiple zeros and do
+ // the insertion
+ unsigned numOps =
+ cast<VectorType>(Val->getType())->getNumElements();
+ std::vector<Constant*> Ops;
+ Ops.reserve(numOps);
+ for (unsigned i = 0; i < numOps; ++i) {
+ const Constant *Op =
+ (idxVal == i) ? Elt : Constant::getNullValue(Elt->getType());
+ Ops.push_back(const_cast<Constant*>(Op));
+ }
+ return ConstantVector::get(Ops);
+ }
+ if (const ConstantVector *CVal = dyn_cast<ConstantVector>(Val)) {
+ // Insertion of scalar constant into vector constant
+ std::vector<Constant*> Ops;
+ Ops.reserve(CVal->getNumOperands());
+ for (unsigned i = 0; i < CVal->getNumOperands(); ++i) {
+ const Constant *Op =
+ (idxVal == i) ? Elt : cast<Constant>(CVal->getOperand(i));
+ Ops.push_back(const_cast<Constant*>(Op));
+ }
+ return ConstantVector::get(Ops);
+ }
+
+ return 0;
+}
+
+/// GetVectorElement - If C is a ConstantVector, ConstantAggregateZero or Undef
+/// return the specified element value. Otherwise return null.
+static Constant *GetVectorElement(const Constant *C, unsigned EltNo) {
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(C))
+ return CV->getOperand(EltNo);
+
+ const Type *EltTy = cast<VectorType>(C->getType())->getElementType();
+ if (isa<ConstantAggregateZero>(C))
+ return Constant::getNullValue(EltTy);
+ if (isa<UndefValue>(C))
+ return UndefValue::get(EltTy);
+ return 0;
+}
+
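+// Illustrative example for the shuffle folding below: with
+//   V1 = <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+//   V2 = <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// the mask <2 x i32> <i32 0, i32 5> selects element 0 of V1 and element
+// 5 - 4 = 1 of V2, folding to <2 x i32> <i32 0, i32 5>; a mask element of 8
+// or more would become undef.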
+Constant *llvm::ConstantFoldShuffleVectorInstruction(const Constant *V1,
+ const Constant *V2,
+ const Constant *Mask) {
+ // Undefined shuffle mask -> undefined value.
+ if (isa<UndefValue>(Mask)) return UndefValue::get(V1->getType());
+
+ unsigned MaskNumElts = cast<VectorType>(Mask->getType())->getNumElements();
+ unsigned SrcNumElts = cast<VectorType>(V1->getType())->getNumElements();
+ const Type *EltTy = cast<VectorType>(V1->getType())->getElementType();
+
+ // Loop over the shuffle mask, evaluating each element.
+ SmallVector<Constant*, 32> Result;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ Constant *InElt = GetVectorElement(Mask, i);
+ if (InElt == 0) return 0;
+
+ if (isa<UndefValue>(InElt))
+ InElt = UndefValue::get(EltTy);
+ else if (ConstantInt *CI = dyn_cast<ConstantInt>(InElt)) {
+ unsigned Elt = CI->getZExtValue();
+ if (Elt >= SrcNumElts*2)
+ InElt = UndefValue::get(EltTy);
+ else if (Elt >= SrcNumElts)
+ InElt = GetVectorElement(V2, Elt - SrcNumElts);
+ else
+ InElt = GetVectorElement(V1, Elt);
+ if (InElt == 0) return 0;
+ } else {
+ // Unknown value.
+ return 0;
+ }
+ Result.push_back(InElt);
+ }
+
+ return ConstantVector::get(&Result[0], Result.size());
+}
+
+Constant *llvm::ConstantFoldExtractValueInstruction(const Constant *Agg,
+ const unsigned *Idxs,
+ unsigned NumIdx) {
+ // Base case: no indices, so return the entire value.
+ if (NumIdx == 0)
+ return const_cast<Constant *>(Agg);
+
+ if (isa<UndefValue>(Agg)) // ev(undef, x) -> undef
+ return UndefValue::get(ExtractValueInst::getIndexedType(Agg->getType(),
+ Idxs,
+ Idxs + NumIdx));
+
+ if (isa<ConstantAggregateZero>(Agg)) // ev(0, x) -> 0
+ return
+ Constant::getNullValue(ExtractValueInst::getIndexedType(Agg->getType(),
+ Idxs,
+ Idxs + NumIdx));
+
+ // Otherwise recurse.
+ return ConstantFoldExtractValueInstruction(Agg->getOperand(*Idxs),
+ Idxs+1, NumIdx-1);
+}
+
+Constant *llvm::ConstantFoldInsertValueInstruction(const Constant *Agg,
+ const Constant *Val,
+ const unsigned *Idxs,
+ unsigned NumIdx) {
+ // Base case: no indices, so replace the entire value.
+ if (NumIdx == 0)
+ return const_cast<Constant *>(Val);
+
+ if (isa<UndefValue>(Agg)) {
+ // Insertion of constant into aggregate undef
+ // Optimize away insertion of undef
+ if (isa<UndefValue>(Val))
+ return const_cast<Constant*>(Agg);
+ // Otherwise break the aggregate undef into multiple undefs and do
+ // the insertion
+ const CompositeType *AggTy = cast<CompositeType>(Agg->getType());
+ unsigned numOps;
+ if (const ArrayType *AR = dyn_cast<ArrayType>(AggTy))
+ numOps = AR->getNumElements();
+ else
+ numOps = cast<StructType>(AggTy)->getNumElements();
+ std::vector<Constant*> Ops(numOps);
+ for (unsigned i = 0; i < numOps; ++i) {
+ const Type *MemberTy = AggTy->getTypeAtIndex(i);
+ const Constant *Op =
+ (*Idxs == i) ?
+ ConstantFoldInsertValueInstruction(UndefValue::get(MemberTy),
+ Val, Idxs+1, NumIdx-1) :
+ UndefValue::get(MemberTy);
+ Ops[i] = const_cast<Constant*>(Op);
+ }
+ if (isa<StructType>(AggTy))
+ return ConstantStruct::get(Ops);
+ else
+ return ConstantArray::get(cast<ArrayType>(AggTy), Ops);
+ }
+ if (isa<ConstantAggregateZero>(Agg)) {
+ // Insertion of constant into aggregate zero
+ // Optimize away insertion of zero
+ if (Val->isNullValue())
+ return const_cast<Constant*>(Agg);
+ // Otherwise break the aggregate zero into multiple zeros and do
+ // the insertion
+ const CompositeType *AggTy = cast<CompositeType>(Agg->getType());
+ unsigned numOps;
+ if (const ArrayType *AR = dyn_cast<ArrayType>(AggTy))
+ numOps = AR->getNumElements();
+ else
+ numOps = cast<StructType>(AggTy)->getNumElements();
+ std::vector<Constant*> Ops(numOps);
+ for (unsigned i = 0; i < numOps; ++i) {
+ const Type *MemberTy = AggTy->getTypeAtIndex(i);
+ const Constant *Op =
+ (*Idxs == i) ?
+ ConstantFoldInsertValueInstruction(Constant::getNullValue(MemberTy),
+ Val, Idxs+1, NumIdx-1) :
+ Constant::getNullValue(MemberTy);
+ Ops[i] = const_cast<Constant*>(Op);
+ }
+ if (isa<StructType>(AggTy))
+ return ConstantStruct::get(Ops);
+ else
+ return ConstantArray::get(cast<ArrayType>(AggTy), Ops);
+ }
+ if (isa<ConstantStruct>(Agg) || isa<ConstantArray>(Agg)) {
+ // Insertion of constant into aggregate constant
+ std::vector<Constant*> Ops(Agg->getNumOperands());
+ for (unsigned i = 0; i < Agg->getNumOperands(); ++i) {
+ const Constant *Op =
+ (*Idxs == i) ?
+ ConstantFoldInsertValueInstruction(Agg->getOperand(i),
+ Val, Idxs+1, NumIdx-1) :
+ Agg->getOperand(i);
+ Ops[i] = const_cast<Constant*>(Op);
+ }
+ Constant *C;
+ if (isa<StructType>(Agg->getType()))
+ C = ConstantStruct::get(Ops);
+ else
+ C = ConstantArray::get(cast<ArrayType>(Agg->getType()), Ops);
+ return C;
+ }
+
+ return 0;
+}
+
+/// EvalVectorOp - Given two vector constants and a function pointer, apply the
+/// function pointer to each element pair, producing a new ConstantVector
+/// constant. Either or both of V1 and V2 may be NULL, meaning a
+/// ConstantAggregateZero operand.
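+///
+/// For example (illustrative), adding <2 x i32> <i32 1, i32 2> to a
+/// ConstantAggregateZero operand (passed in as NULL) applies getAdd to each
+/// element against zero, yielding <2 x i32> <i32 1, i32 2>.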
+static Constant *EvalVectorOp(const ConstantVector *V1,
+ const ConstantVector *V2,
+ const VectorType *VTy,
+ Constant *(*FP)(Constant*, Constant*)) {
+ std::vector<Constant*> Res;
+ const Type *EltTy = VTy->getElementType();
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
+ const Constant *C1 = V1 ? V1->getOperand(i) : Constant::getNullValue(EltTy);
+ const Constant *C2 = V2 ? V2->getOperand(i) : Constant::getNullValue(EltTy);
+ Res.push_back(FP(const_cast<Constant*>(C1),
+ const_cast<Constant*>(C2)));
+ }
+ return ConstantVector::get(Res);
+}
+
+Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
+ const Constant *C1,
+ const Constant *C2) {
+ // No compile-time operations on this type yet.
+ if (C1->getType() == Type::PPC_FP128Ty)
+ return 0;
+
+ // Handle UndefValue up front
+ if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
+ switch (Opcode) {
+ case Instruction::Xor:
+ if (isa<UndefValue>(C1) && isa<UndefValue>(C2))
+ // Handle undef ^ undef -> 0 special case. This is a common
+ // idiom (misuse).
+ return Constant::getNullValue(C1->getType());
+ // Fallthrough
+ case Instruction::Add:
+ case Instruction::Sub:
+ return UndefValue::get(C1->getType());
+ case Instruction::Mul:
+ case Instruction::And:
+ return Constant::getNullValue(C1->getType());
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ if (!isa<UndefValue>(C2)) // undef / X -> 0
+ return Constant::getNullValue(C1->getType());
+ return const_cast<Constant*>(C2); // X / undef -> undef
+ case Instruction::Or: // X | undef -> -1
+ if (const VectorType *PTy = dyn_cast<VectorType>(C1->getType()))
+ return ConstantVector::getAllOnesValue(PTy);
+ return ConstantInt::getAllOnesValue(C1->getType());
+ case Instruction::LShr:
+ if (isa<UndefValue>(C2) && isa<UndefValue>(C1))
+ return const_cast<Constant*>(C1); // undef lshr undef -> undef
+ return Constant::getNullValue(C1->getType()); // X lshr undef -> 0
+ // undef lshr X -> 0
+ case Instruction::AShr:
+ if (!isa<UndefValue>(C2))
+ return const_cast<Constant*>(C1); // undef ashr X --> undef
+ else if (isa<UndefValue>(C1))
+ return const_cast<Constant*>(C1); // undef ashr undef -> undef
+ else
+ return const_cast<Constant*>(C1); // X ashr undef --> X
+ case Instruction::Shl:
+ // undef << X -> 0 or X << undef -> 0
+ return Constant::getNullValue(C1->getType());
+ }
+ }
+
+ // Handle simplifications of the RHS when a constant int.
+ if (const ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
+ switch (Opcode) {
+ case Instruction::Add:
+ if (CI2->equalsInt(0)) return const_cast<Constant*>(C1); // X + 0 == X
+ break;
+ case Instruction::Sub:
+ if (CI2->equalsInt(0)) return const_cast<Constant*>(C1); // X - 0 == X
+ break;
+ case Instruction::Mul:
+ if (CI2->equalsInt(0)) return const_cast<Constant*>(C2); // X * 0 == 0
+ if (CI2->equalsInt(1))
+ return const_cast<Constant*>(C1); // X * 1 == X
+ break;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ if (CI2->equalsInt(1))
+ return const_cast<Constant*>(C1); // X / 1 == X
+ if (CI2->equalsInt(0))
+ return UndefValue::get(CI2->getType()); // X / 0 == undef
+ break;
+ case Instruction::URem:
+ case Instruction::SRem:
+ if (CI2->equalsInt(1))
+ return Constant::getNullValue(CI2->getType()); // X % 1 == 0
+ if (CI2->equalsInt(0))
+ return UndefValue::get(CI2->getType()); // X % 0 == undef
+ break;
+ case Instruction::And:
+ if (CI2->isZero()) return const_cast<Constant*>(C2); // X & 0 == 0
+ if (CI2->isAllOnesValue())
+ return const_cast<Constant*>(C1); // X & -1 == X
+
+ if (const ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
+ // (zext i32 to i64) & 4294967295 -> (zext i32 to i64)
+ if (CE1->getOpcode() == Instruction::ZExt) {
+ unsigned DstWidth = CI2->getType()->getBitWidth();
+ unsigned SrcWidth =
+ CE1->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ APInt PossiblySetBits(APInt::getLowBitsSet(DstWidth, SrcWidth));
+ if ((PossiblySetBits & CI2->getValue()) == PossiblySetBits)
+ return const_cast<Constant*>(C1);
+ }
+
+ // If and'ing the address of a global with a constant, fold it.
+ if (CE1->getOpcode() == Instruction::PtrToInt &&
+ isa<GlobalValue>(CE1->getOperand(0))) {
+ GlobalValue *GV = cast<GlobalValue>(CE1->getOperand(0));
+
+ // Functions are at least 4-byte aligned.
+ unsigned GVAlign = GV->getAlignment();
+ if (isa<Function>(GV))
+ GVAlign = std::max(GVAlign, 4U);
+
+ if (GVAlign > 1) {
+ unsigned DstWidth = CI2->getType()->getBitWidth();
+ unsigned SrcWidth = std::min(DstWidth, Log2_32(GVAlign));
+ APInt BitsNotSet(APInt::getLowBitsSet(DstWidth, SrcWidth));
+
+ // If checking bits we know are clear, return zero.
+ if ((CI2->getValue() & BitsNotSet) == CI2->getValue())
+ return Constant::getNullValue(CI2->getType());
+ }
+ }
+ }
+ break;
+ case Instruction::Or:
+ if (CI2->equalsInt(0)) return const_cast<Constant*>(C1); // X | 0 == X
+ if (CI2->isAllOnesValue())
+ return const_cast<Constant*>(C2); // X | -1 == -1
+ break;
+ case Instruction::Xor:
+ if (CI2->equalsInt(0)) return const_cast<Constant*>(C1); // X ^ 0 == X
+ break;
+ case Instruction::AShr:
+ // ashr (zext C to Ty), C2 -> lshr (zext C, CSA), C2
+ if (const ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1))
+ if (CE1->getOpcode() == Instruction::ZExt) // Top bits known zero.
+ return ConstantExpr::getLShr(const_cast<Constant*>(C1),
+ const_cast<Constant*>(C2));
+ break;
+ }
+ }
+
+ // At this point we know neither constant is an UndefValue.
+ if (const ConstantInt *CI1 = dyn_cast<ConstantInt>(C1)) {
+ if (const ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
+ using namespace APIntOps;
+ const APInt &C1V = CI1->getValue();
+ const APInt &C2V = CI2->getValue();
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::Add:
+ return ConstantInt::get(C1V + C2V);
+ case Instruction::Sub:
+ return ConstantInt::get(C1V - C2V);
+ case Instruction::Mul:
+ return ConstantInt::get(C1V * C2V);
+ case Instruction::UDiv:
+ assert(!CI2->isNullValue() && "Div by zero handled above");
+ return ConstantInt::get(C1V.udiv(C2V));
+ case Instruction::SDiv:
+ assert(!CI2->isNullValue() && "Div by zero handled above");
+ if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
+ return UndefValue::get(CI1->getType()); // MIN_INT / -1 -> undef
+ return ConstantInt::get(C1V.sdiv(C2V));
+ case Instruction::URem:
+ assert(!CI2->isNullValue() && "Div by zero handled above");
+ return ConstantInt::get(C1V.urem(C2V));
+ case Instruction::SRem:
+ assert(!CI2->isNullValue() && "Div by zero handled above");
+ if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
+ return UndefValue::get(CI1->getType()); // MIN_INT % -1 -> undef
+ return ConstantInt::get(C1V.srem(C2V));
+ case Instruction::And:
+ return ConstantInt::get(C1V & C2V);
+ case Instruction::Or:
+ return ConstantInt::get(C1V | C2V);
+ case Instruction::Xor:
+ return ConstantInt::get(C1V ^ C2V);
+ case Instruction::Shl: {
+ uint32_t shiftAmt = C2V.getZExtValue();
+ if (shiftAmt < C1V.getBitWidth())
+ return ConstantInt::get(C1V.shl(shiftAmt));
+ else
+ return UndefValue::get(C1->getType()); // too big shift is undef
+ }
+ case Instruction::LShr: {
+ uint32_t shiftAmt = C2V.getZExtValue();
+ if (shiftAmt < C1V.getBitWidth())
+ return ConstantInt::get(C1V.lshr(shiftAmt));
+ else
+ return UndefValue::get(C1->getType()); // too big shift is undef
+ }
+ case Instruction::AShr: {
+ uint32_t shiftAmt = C2V.getZExtValue();
+ if (shiftAmt < C1V.getBitWidth())
+ return ConstantInt::get(C1V.ashr(shiftAmt));
+ else
+ return UndefValue::get(C1->getType()); // too big shift is undef
+ }
+ }
+ }
+ } else if (const ConstantFP *CFP1 = dyn_cast<ConstantFP>(C1)) {
+ if (const ConstantFP *CFP2 = dyn_cast<ConstantFP>(C2)) {
+ APFloat C1V = CFP1->getValueAPF();
+ APFloat C2V = CFP2->getValueAPF();
+ APFloat C3V = C1V; // copy for modification
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::Add:
+ (void)C3V.add(C2V, APFloat::rmNearestTiesToEven);
+ return ConstantFP::get(C3V);
+ case Instruction::Sub:
+ (void)C3V.subtract(C2V, APFloat::rmNearestTiesToEven);
+ return ConstantFP::get(C3V);
+ case Instruction::Mul:
+ (void)C3V.multiply(C2V, APFloat::rmNearestTiesToEven);
+ return ConstantFP::get(C3V);
+ case Instruction::FDiv:
+ (void)C3V.divide(C2V, APFloat::rmNearestTiesToEven);
+ return ConstantFP::get(C3V);
+ case Instruction::FRem:
+ (void)C3V.mod(C2V, APFloat::rmNearestTiesToEven);
+ return ConstantFP::get(C3V);
+ }
+ }
+ } else if (const VectorType *VTy = dyn_cast<VectorType>(C1->getType())) {
+ const ConstantVector *CP1 = dyn_cast<ConstantVector>(C1);
+ const ConstantVector *CP2 = dyn_cast<ConstantVector>(C2);
+ if ((CP1 != NULL || isa<ConstantAggregateZero>(C1)) &&
+ (CP2 != NULL || isa<ConstantAggregateZero>(C2))) {
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::Add:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getAdd);
+ case Instruction::Sub:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getSub);
+ case Instruction::Mul:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getMul);
+ case Instruction::UDiv:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getUDiv);
+ case Instruction::SDiv:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getSDiv);
+ case Instruction::FDiv:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getFDiv);
+ case Instruction::URem:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getURem);
+ case Instruction::SRem:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getSRem);
+ case Instruction::FRem:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getFRem);
+ case Instruction::And:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getAnd);
+ case Instruction::Or:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getOr);
+ case Instruction::Xor:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getXor);
+ case Instruction::LShr:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getLShr);
+ case Instruction::AShr:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getAShr);
+ case Instruction::Shl:
+ return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getShl);
+ }
+ }
+ }
+
+ if (isa<ConstantExpr>(C1)) {
+ // There are many possible foldings we could do here. We should probably
+ // at least fold add of a pointer with an integer into the appropriate
+ // getelementptr. This will improve alias analysis a bit.
+ } else if (isa<ConstantExpr>(C2)) {
+ // If C2 is a constant expr and C1 isn't, flop them around and fold the
+ // other way if possible.
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // No change of opcode required.
+ return ConstantFoldBinaryInstruction(Opcode, C2, C1);
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Sub:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ default: // These instructions cannot be flopped around.
+ break;
+ }
+ }
+
+ // We don't know how to fold this.
+ return 0;
+}
+
+/// isMaybeZeroSizedType - This type is zero sized if it's an array or
+/// structure of zero sized types. The only leaf zero sized type is an empty
+/// structure.
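+/// For example (illustrative), {} and [4 x {}] are zero sized, while an
+/// OpaqueType conservatively counts as "maybe zero sized" because its layout
+/// is unknown.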
+static bool isMaybeZeroSizedType(const Type *Ty) {
+ if (isa<OpaqueType>(Ty)) return true; // Can't say.
+ if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+
+ // If all of the elements have zero size, this does too.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ if (!isMaybeZeroSizedType(STy->getElementType(i))) return false;
+ return true;
+
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ return isMaybeZeroSizedType(ATy->getElementType());
+ }
+ return false;
+}
+
+/// IdxCompare - Compare the two constants as though they were getelementptr
+/// indices. This allows coercion of the types to be the same thing.
+///
+/// If the two constants are the "same" (after coercion), return 0. If the
+/// first is less than the second, return -1, if the second is less than the
+/// first, return 1. If the constants are not integral, return -2.
+///
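+/// For example (illustrative): comparing indices i32 2 and i64 5 sign
+/// extends both to i64 and returns -1; if either index is not a ConstantInt
+/// (e.g. a ConstantExpr), the result is -2 ("don't know").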
+static int IdxCompare(Constant *C1, Constant *C2, const Type *ElTy) {
+ if (C1 == C2) return 0;
+
+ // Ok, we found a different index. If they are not ConstantInt, we can't do
+ // anything with them.
+ if (!isa<ConstantInt>(C1) || !isa<ConstantInt>(C2))
+ return -2; // don't know!
+
+ // Ok, we have two differing integer indices. Sign extend them to be the same
+ // type. Long is always big enough, so we use it.
+ if (C1->getType() != Type::Int64Ty)
+ C1 = ConstantExpr::getSExt(C1, Type::Int64Ty);
+
+ if (C2->getType() != Type::Int64Ty)
+ C2 = ConstantExpr::getSExt(C2, Type::Int64Ty);
+
+ if (C1 == C2) return 0; // They are equal
+
+ // If the type being indexed over is really just a zero sized type, there is
+ // no pointer difference being made here.
+ if (isMaybeZeroSizedType(ElTy))
+ return -2; // dunno.
+
+ // If they are really different, now that they are the same type, then we
+ // found a difference!
+ if (cast<ConstantInt>(C1)->getSExtValue() <
+ cast<ConstantInt>(C2)->getSExtValue())
+ return -1;
+ else
+ return 1;
+}
+
+/// evaluateFCmpRelation - This function determines if there is anything we can
+/// decide about the two constants provided. This doesn't need to handle simple
+/// things like ConstantFP comparisons, but should instead handle ConstantExprs.
+/// If we can determine that the two constants have a particular relation to
+/// each other, we should return the corresponding FCmpInst predicate,
+/// otherwise return FCmpInst::BAD_FCMP_PREDICATE. This is used below in
+/// ConstantFoldCompareInstruction.
+///
+/// To simplify this code we canonicalize the relation so that the first
+/// operand is always the most "complex" of the two. We consider ConstantFP
+/// to be the simplest, and ConstantExprs to be the most complex.
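+///
+/// For example (illustrative), relating 1.0 and 2.0 distills to FCMP_OLT via
+/// the standard folder, while anything indeterminate comes back as
+/// BAD_FCMP_PREDICATE.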
+static FCmpInst::Predicate evaluateFCmpRelation(const Constant *V1,
+ const Constant *V2) {
+ assert(V1->getType() == V2->getType() &&
+ "Cannot compare values of different types!");
+
+ // No compile-time operations on this type yet.
+ if (V1->getType() == Type::PPC_FP128Ty)
+ return FCmpInst::BAD_FCMP_PREDICATE;
+
+ // Handle degenerate case quickly
+ if (V1 == V2) return FCmpInst::FCMP_OEQ;
+
+ if (!isa<ConstantExpr>(V1)) {
+ if (!isa<ConstantExpr>(V2)) {
+ // We distilled this down to a simple case. Use the standard constant
+ // folder for a few cases.
+ ConstantInt *R = 0;
+ Constant *C1 = const_cast<Constant*>(V1);
+ Constant *C2 = const_cast<Constant*>(V2);
+ R = dyn_cast<ConstantInt>(
+ ConstantExpr::getFCmp(FCmpInst::FCMP_OEQ, C1, C2));
+ if (R && !R->isZero())
+ return FCmpInst::FCMP_OEQ;
+ R = dyn_cast<ConstantInt>(
+ ConstantExpr::getFCmp(FCmpInst::FCMP_OLT, C1, C2));
+ if (R && !R->isZero())
+ return FCmpInst::FCMP_OLT;
+ R = dyn_cast<ConstantInt>(
+ ConstantExpr::getFCmp(FCmpInst::FCMP_OGT, C1, C2));
+ if (R && !R->isZero())
+ return FCmpInst::FCMP_OGT;
+
+ // Nothing more we can do
+ return FCmpInst::BAD_FCMP_PREDICATE;
+ }
+
+ // If the first operand is simple and the second is a ConstantExpr, swap
+ // the operands.
+ FCmpInst::Predicate SwappedRelation = evaluateFCmpRelation(V2, V1);
+ if (SwappedRelation != FCmpInst::BAD_FCMP_PREDICATE)
+ return FCmpInst::getSwappedPredicate(SwappedRelation);
+ } else {
+ // Ok, the LHS is known to be a constantexpr. The RHS can be any of a
+ // constantexpr or a simple constant.
+ const ConstantExpr *CE1 = cast<ConstantExpr>(V1);
+ switch (CE1->getOpcode()) {
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ // We might be able to do something with these but we don't right now.
+ break;
+ default:
+ break;
+ }
+ }
+ // There are MANY other foldings that we could perform here. They will
+ // probably be added on demand, as they seem needed.
+ return FCmpInst::BAD_FCMP_PREDICATE;
+}
+
+/// evaluateICmpRelation - This function determines if there is anything we can
+/// decide about the two constants provided. This doesn't need to handle simple
+/// things like integer comparisons, but should instead handle ConstantExprs
+/// and GlobalValues. If we can determine that the two constants have a
+/// particular relation to each other, we should return the corresponding ICmp
+/// predicate, otherwise return ICmpInst::BAD_ICMP_PREDICATE.
+///
+/// To simplify this code we canonicalize the relation so that the first
+/// operand is always the most "complex" of the two. We consider simple
+/// constants (like ConstantInt) to be the simplest, followed by
+/// GlobalValues, followed by ConstantExpr's (the most complex).
+///
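+/// For example (illustrative), comparing two distinct non-weak, non-alias
+/// GlobalValues yields ICMP_NE (distinct globals cannot share an address),
+/// and comparing such a global against a null pointer also yields ICMP_NE
+/// (a global's address is never null).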
+static ICmpInst::Predicate evaluateICmpRelation(const Constant *V1,
+ const Constant *V2,
+ bool isSigned) {
+ assert(V1->getType() == V2->getType() &&
+ "Cannot compare different types of values!");
+ if (V1 == V2) return ICmpInst::ICMP_EQ;
+
+ if (!isa<ConstantExpr>(V1) && !isa<GlobalValue>(V1)) {
+ if (!isa<GlobalValue>(V2) && !isa<ConstantExpr>(V2)) {
+ // We distilled this down to a simple case, use the standard constant
+ // folder.
+ ConstantInt *R = 0;
+ Constant *C1 = const_cast<Constant*>(V1);
+ Constant *C2 = const_cast<Constant*>(V2);
+ ICmpInst::Predicate pred = ICmpInst::ICMP_EQ;
+ R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, C1, C2));
+ if (R && !R->isZero())
+ return pred;
+ pred = isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, C1, C2));
+ if (R && !R->isZero())
+ return pred;
+ pred = isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, C1, C2));
+ if (R && !R->isZero())
+ return pred;
+
+ // If we couldn't figure it out, bail.
+ return ICmpInst::BAD_ICMP_PREDICATE;
+ }
+
+ // If the first operand is simple, swap operands.
+ ICmpInst::Predicate SwappedRelation =
+ evaluateICmpRelation(V2, V1, isSigned);
+ if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
+ return ICmpInst::getSwappedPredicate(SwappedRelation);
+
+ } else if (const GlobalValue *CPR1 = dyn_cast<GlobalValue>(V1)) {
+ if (isa<ConstantExpr>(V2)) { // Swap as necessary.
+ ICmpInst::Predicate SwappedRelation =
+ evaluateICmpRelation(V2, V1, isSigned);
+ if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
+ return ICmpInst::getSwappedPredicate(SwappedRelation);
+ else
+ return ICmpInst::BAD_ICMP_PREDICATE;
+ }
+
+ // Now we know that the RHS is a GlobalValue or a simple constant, and a
+ // simple constant (since the types must match) can only be a
+ // ConstantPointerNull.
+ if (const GlobalValue *CPR2 = dyn_cast<GlobalValue>(V2)) {
+ // Don't try to decide equality of aliases.
+ if (!isa<GlobalAlias>(CPR1) && !isa<GlobalAlias>(CPR2))
+ if (!CPR1->hasExternalWeakLinkage() || !CPR2->hasExternalWeakLinkage())
+ return ICmpInst::ICMP_NE;
+ } else {
+ assert(isa<ConstantPointerNull>(V2) && "Canonicalization guarantee!");
+ // GlobalVals can never be null. Don't try to evaluate aliases.
+ if (!CPR1->hasExternalWeakLinkage() && !isa<GlobalAlias>(CPR1))
+ return ICmpInst::ICMP_NE;
+ }
+ } else {
+ // Ok, the LHS is known to be a constantexpr. The RHS can be any of a
+ // constantexpr, a CPR, or a simple constant.
+ const ConstantExpr *CE1 = cast<ConstantExpr>(V1);
+ const Constant *CE1Op0 = CE1->getOperand(0);
+
+ switch (CE1->getOpcode()) {
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ break; // We can't evaluate floating point casts or truncations.
+
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::BitCast:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // If the cast is not actually changing bits, and the second operand is a
+ // null pointer, do the comparison with the pre-casted value.
+ if (V2->isNullValue() &&
+ (isa<PointerType>(CE1->getType()) || CE1->getType()->isInteger())) {
+ bool sgnd = isSigned;
+ if (CE1->getOpcode() == Instruction::ZExt) isSigned = false;
+ if (CE1->getOpcode() == Instruction::SExt) isSigned = true;
+ return evaluateICmpRelation(CE1Op0,
+ Constant::getNullValue(CE1Op0->getType()),
+ sgnd);
+ }
+
+ // If the dest type is a pointer type, and the RHS is a constantexpr cast
+ // from the same type as the src of the LHS, evaluate the inputs. This is
+ // important for things like "icmp eq (cast 4 to int*), (cast 5 to int*)",
+ // which happens a lot in compilers with tagged integers.
+ if (const ConstantExpr *CE2 = dyn_cast<ConstantExpr>(V2))
+ if (CE2->isCast() && isa<PointerType>(CE1->getType()) &&
+ CE1->getOperand(0)->getType() == CE2->getOperand(0)->getType() &&
+ CE1->getOperand(0)->getType()->isInteger()) {
+ bool sgnd = isSigned;
+ if (CE1->getOpcode() == Instruction::ZExt) isSigned = false;
+ if (CE1->getOpcode() == Instruction::SExt) isSigned = true;
+ return evaluateICmpRelation(CE1->getOperand(0), CE2->getOperand(0),
+ sgnd);
+ }
+ break;
+
+ case Instruction::GetElementPtr:
+ // Ok, since this is a getelementptr, we know that the constant has a
+ // pointer type. Check the various cases.
+ if (isa<ConstantPointerNull>(V2)) {
+ // If we are comparing a GEP to a null pointer, check to see if the base
+ // of the GEP equals the null pointer.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) {
+ if (GV->hasExternalWeakLinkage())
+ // Weak linkage GVals could be zero or not. We're comparing that
+ // to a null pointer, so it's greater-or-equal.
+ return isSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
+ else
+ // If it's not weak linkage, the GVal must have a non-zero address,
+ // so the result is greater-than.
+ return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ } else if (isa<ConstantPointerNull>(CE1Op0)) {
+ // If we are indexing from a null pointer, check to see if we have any
+ // non-zero indices.
+ for (unsigned i = 1, e = CE1->getNumOperands(); i != e; ++i)
+ if (!CE1->getOperand(i)->isNullValue())
+ // Offsetting from null, must not be equal.
+ return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ // Only zero indexes from null, must still be zero.
+ return ICmpInst::ICMP_EQ;
+ }
+ // Otherwise, we can't really say if the first operand is null or not.
+ } else if (const GlobalValue *CPR2 = dyn_cast<GlobalValue>(V2)) {
+ if (isa<ConstantPointerNull>(CE1Op0)) {
+ if (CPR2->hasExternalWeakLinkage())
+ // Weak linkage GVals could be zero or not. We're comparing it to
+ // a null pointer, so it's less-or-equal.
+ return isSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
+ else
+ // If it's not weak linkage, the GVal must have a non-zero address,
+ // so the result is less-than.
+ return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ } else if (const GlobalValue *CPR1 = dyn_cast<GlobalValue>(CE1Op0)) {
+ if (CPR1 == CPR2) {
+ // If this is a getelementptr of the same global, then the result must
+ // differ from the global itself. Because the types must match, the
+ // getelementptr could only have at most one index, and because we fold
+ // getelementptr's with a single zero index, it must be nonzero.
+ assert(CE1->getNumOperands() == 2 &&
+ !CE1->getOperand(1)->isNullValue() &&
+ "Suprising getelementptr!");
+ return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ } else {
+ // If they are different globals, we don't know what the value is,
+ // but they can't be equal.
+ return ICmpInst::ICMP_NE;
+ }
+ }
+ } else {
+ const ConstantExpr *CE2 = cast<ConstantExpr>(V2);
+ const Constant *CE2Op0 = CE2->getOperand(0);
+
+ // There are MANY other foldings that we could perform here. They will
+ // probably be added on demand, as they seem needed.
+ switch (CE2->getOpcode()) {
+ default: break;
+ case Instruction::GetElementPtr:
+ // By far the most common case to handle is when the base pointers are
+ // obviously to the same or different globals.
+ if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) {
+ if (CE1Op0 != CE2Op0) // Don't know relative ordering, but not equal
+ return ICmpInst::ICMP_NE;
+ // Ok, we know that both getelementptr instructions are based on the
+ // same global. From this, we can precisely determine the relative
+ // ordering of the resultant pointers.
+ unsigned i = 1;
+
+ // Compare all of the operands the GEP's have in common.
+ gep_type_iterator GTI = gep_type_begin(CE1);
+ for (;i != CE1->getNumOperands() && i != CE2->getNumOperands();
+ ++i, ++GTI)
+ switch (IdxCompare(CE1->getOperand(i), CE2->getOperand(i),
+ GTI.getIndexedType())) {
+ case -1: return isSigned ? ICmpInst::ICMP_SLT:ICmpInst::ICMP_ULT;
+ case 1: return isSigned ? ICmpInst::ICMP_SGT:ICmpInst::ICMP_UGT;
+ case -2: return ICmpInst::BAD_ICMP_PREDICATE;
+ }
+
+ // Ok, we ran out of things they have in common. If any leftovers
+ // are non-zero then we have a difference, otherwise we are equal.
+ for (; i < CE1->getNumOperands(); ++i)
+ if (!CE1->getOperand(i)->isNullValue()) {
+ if (isa<ConstantInt>(CE1->getOperand(i)))
+ return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ else
+ return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
+ }
+
+ for (; i < CE2->getNumOperands(); ++i)
+ if (!CE2->getOperand(i)->isNullValue()) {
+ if (isa<ConstantInt>(CE2->getOperand(i)))
+ return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ else
+ return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
+ }
+ return ICmpInst::ICMP_EQ;
+ }
+ }
+ }
+ default:
+ break;
+ }
+ }
+
+ return ICmpInst::BAD_ICMP_PREDICATE;
+}
+
+Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
+ const Constant *C1,
+ const Constant *C2) {
+ // Fold FCMP_FALSE/FCMP_TRUE unconditionally.
+ if (pred == FCmpInst::FCMP_FALSE) {
+ if (const VectorType *VT = dyn_cast<VectorType>(C1->getType()))
+ return Constant::getNullValue(VectorType::getInteger(VT));
+ else
+ return ConstantInt::getFalse();
+ }
+
+ if (pred == FCmpInst::FCMP_TRUE) {
+ if (const VectorType *VT = dyn_cast<VectorType>(C1->getType()))
+ return Constant::getAllOnesValue(VectorType::getInteger(VT));
+ else
+ return ConstantInt::getTrue();
+ }
+
+ // Handle some degenerate cases first
+ if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
+ // vicmp/vfcmp -> [vector] undef
+ if (const VectorType *VTy = dyn_cast<VectorType>(C1->getType()))
+ return UndefValue::get(VectorType::getInteger(VTy));
+
+ // icmp/fcmp -> i1 undef
+ return UndefValue::get(Type::Int1Ty);
+ }
+
+ // No compile-time operations on this type yet.
+ if (C1->getType() == Type::PPC_FP128Ty)
+ return 0;
+
+ // icmp eq/ne(null,GV) -> false/true
+ if (C1->isNullValue()) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C2))
+ // Don't try to evaluate aliases. External weak GV can be null.
+ if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage()) {
+ if (pred == ICmpInst::ICMP_EQ)
+ return ConstantInt::getFalse();
+ else if (pred == ICmpInst::ICMP_NE)
+ return ConstantInt::getTrue();
+ }
+ // icmp eq/ne(GV,null) -> false/true
+ } else if (C2->isNullValue()) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C1))
+ // Don't try to evaluate aliases. External weak GV can be null.
+ if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage()) {
+ if (pred == ICmpInst::ICMP_EQ)
+ return ConstantInt::getFalse();
+ else if (pred == ICmpInst::ICMP_NE)
+ return ConstantInt::getTrue();
+ }
+ }
+
+ if (isa<ConstantInt>(C1) && isa<ConstantInt>(C2)) {
+ APInt V1 = cast<ConstantInt>(C1)->getValue();
+ APInt V2 = cast<ConstantInt>(C2)->getValue();
+ switch (pred) {
+ default: assert(0 && "Invalid ICmp Predicate"); return 0;
+ case ICmpInst::ICMP_EQ: return ConstantInt::get(Type::Int1Ty, V1 == V2);
+ case ICmpInst::ICMP_NE: return ConstantInt::get(Type::Int1Ty, V1 != V2);
+ case ICmpInst::ICMP_SLT:return ConstantInt::get(Type::Int1Ty, V1.slt(V2));
+ case ICmpInst::ICMP_SGT:return ConstantInt::get(Type::Int1Ty, V1.sgt(V2));
+ case ICmpInst::ICMP_SLE:return ConstantInt::get(Type::Int1Ty, V1.sle(V2));
+ case ICmpInst::ICMP_SGE:return ConstantInt::get(Type::Int1Ty, V1.sge(V2));
+ case ICmpInst::ICMP_ULT:return ConstantInt::get(Type::Int1Ty, V1.ult(V2));
+ case ICmpInst::ICMP_UGT:return ConstantInt::get(Type::Int1Ty, V1.ugt(V2));
+ case ICmpInst::ICMP_ULE:return ConstantInt::get(Type::Int1Ty, V1.ule(V2));
+ case ICmpInst::ICMP_UGE:return ConstantInt::get(Type::Int1Ty, V1.uge(V2));
+ }
+ } else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) {
+ APFloat C1V = cast<ConstantFP>(C1)->getValueAPF();
+ APFloat C2V = cast<ConstantFP>(C2)->getValueAPF();
+ APFloat::cmpResult R = C1V.compare(C2V);
+ switch (pred) {
+ default: assert(0 && "Invalid FCmp Predicate"); return 0;
+ case FCmpInst::FCMP_FALSE: return ConstantInt::getFalse();
+ case FCmpInst::FCMP_TRUE: return ConstantInt::getTrue();
+ case FCmpInst::FCMP_UNO:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpUnordered);
+ case FCmpInst::FCMP_ORD:
+ return ConstantInt::get(Type::Int1Ty, R!=APFloat::cmpUnordered);
+ case FCmpInst::FCMP_UEQ:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpUnordered ||
+ R==APFloat::cmpEqual);
+ case FCmpInst::FCMP_OEQ:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpEqual);
+ case FCmpInst::FCMP_UNE:
+ return ConstantInt::get(Type::Int1Ty, R!=APFloat::cmpEqual);
+ case FCmpInst::FCMP_ONE:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpLessThan ||
+ R==APFloat::cmpGreaterThan);
+ case FCmpInst::FCMP_ULT:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpUnordered ||
+ R==APFloat::cmpLessThan);
+ case FCmpInst::FCMP_OLT:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpLessThan);
+ case FCmpInst::FCMP_UGT:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpUnordered ||
+ R==APFloat::cmpGreaterThan);
+ case FCmpInst::FCMP_OGT:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpGreaterThan);
+ case FCmpInst::FCMP_ULE:
+ return ConstantInt::get(Type::Int1Ty, R!=APFloat::cmpGreaterThan);
+ case FCmpInst::FCMP_OLE:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpLessThan ||
+ R==APFloat::cmpEqual);
+ case FCmpInst::FCMP_UGE:
+ return ConstantInt::get(Type::Int1Ty, R!=APFloat::cmpLessThan);
+ case FCmpInst::FCMP_OGE:
+ return ConstantInt::get(Type::Int1Ty, R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpEqual);
+ }
+ } else if (isa<VectorType>(C1->getType())) {
+ SmallVector<Constant*, 16> C1Elts, C2Elts;
+ C1->getVectorElements(C1Elts);
+ C2->getVectorElements(C2Elts);
+
+ // If we can constant fold the comparison of each element, constant fold
+ // the whole vector comparison.
+ SmallVector<Constant*, 4> ResElts;
+ const Type *InEltTy = C1Elts[0]->getType();
+ bool isFP = InEltTy->isFloatingPoint();
+ const Type *ResEltTy = InEltTy;
+ if (isFP)
+ ResEltTy = IntegerType::get(InEltTy->getPrimitiveSizeInBits());
+
+ for (unsigned i = 0, e = C1Elts.size(); i != e; ++i) {
+ // Compare the elements, producing an i1 result or constant expr.
+ Constant *C;
+ if (isFP)
+ C = ConstantExpr::getFCmp(pred, C1Elts[i], C2Elts[i]);
+ else
+ C = ConstantExpr::getICmp(pred, C1Elts[i], C2Elts[i]);
+
+ // If it is a bool or undef result, convert to the dest type.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+ if (CI->isZero())
+ ResElts.push_back(Constant::getNullValue(ResEltTy));
+ else
+ ResElts.push_back(Constant::getAllOnesValue(ResEltTy));
+ } else if (isa<UndefValue>(C)) {
+ ResElts.push_back(UndefValue::get(ResEltTy));
+ } else {
+ break;
+ }
+ }
+
+ if (ResElts.size() == C1Elts.size())
+ return ConstantVector::get(&ResElts[0], ResElts.size());
+ }
+
+ if (C1->getType()->isFloatingPoint()) {
+ int Result = -1; // -1 = unknown, 0 = known false, 1 = known true.
+ switch (evaluateFCmpRelation(C1, C2)) {
+ default: assert(0 && "Unknown relation!");
+ case FCmpInst::FCMP_UNO:
+ case FCmpInst::FCMP_ORD:
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_UNE:
+ case FCmpInst::FCMP_ULT:
+ case FCmpInst::FCMP_UGT:
+ case FCmpInst::FCMP_ULE:
+ case FCmpInst::FCMP_UGE:
+ case FCmpInst::FCMP_TRUE:
+ case FCmpInst::FCMP_FALSE:
+ case FCmpInst::BAD_FCMP_PREDICATE:
+ break; // Couldn't determine anything about these constants.
+ case FCmpInst::FCMP_OEQ: // We know that C1 == C2
+ Result = (pred == FCmpInst::FCMP_UEQ || pred == FCmpInst::FCMP_OEQ ||
+ pred == FCmpInst::FCMP_ULE || pred == FCmpInst::FCMP_OLE ||
+ pred == FCmpInst::FCMP_UGE || pred == FCmpInst::FCMP_OGE);
+ break;
+ case FCmpInst::FCMP_OLT: // We know that C1 < C2
+ Result = (pred == FCmpInst::FCMP_UNE || pred == FCmpInst::FCMP_ONE ||
+ pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT ||
+ pred == FCmpInst::FCMP_ULE || pred == FCmpInst::FCMP_OLE);
+ break;
+ case FCmpInst::FCMP_OGT: // We know that C1 > C2
+ Result = (pred == FCmpInst::FCMP_UNE || pred == FCmpInst::FCMP_ONE ||
+ pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT ||
+ pred == FCmpInst::FCMP_UGE || pred == FCmpInst::FCMP_OGE);
+ break;
+ case FCmpInst::FCMP_OLE: // We know that C1 <= C2
+ // We can only partially decide this relation.
+ if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT)
+ Result = 0;
+ else if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT)
+ Result = 1;
+ break;
+ case FCmpInst::FCMP_OGE: // We know that C1 >= C2
+ // We can only partially decide this relation.
+ if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT)
+ Result = 0;
+ else if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT)
+ Result = 1;
+ break;
+ case FCmpInst::FCMP_ONE: // We know that C1 != C2
+ // We can only partially decide this relation.
+ if (pred == FCmpInst::FCMP_OEQ || pred == FCmpInst::FCMP_UEQ)
+ Result = 0;
+ else if (pred == FCmpInst::FCMP_ONE || pred == FCmpInst::FCMP_UNE)
+ Result = 1;
+ break;
+ }
+
+ // If we evaluated the result, return it now.
+ if (Result != -1) {
+ if (const VectorType *VT = dyn_cast<VectorType>(C1->getType())) {
+ if (Result == 0)
+ return Constant::getNullValue(VectorType::getInteger(VT));
+ else
+ return Constant::getAllOnesValue(VectorType::getInteger(VT));
+ }
+ return ConstantInt::get(Type::Int1Ty, Result);
+ }
+
+ } else {
+ // Evaluate the relation between the two constants, per the predicate.
+ int Result = -1; // -1 = unknown, 0 = known false, 1 = known true.
+ switch (evaluateICmpRelation(C1, C2, CmpInst::isSigned(pred))) {
+ default: assert(0 && "Unknown relational!");
+ case ICmpInst::BAD_ICMP_PREDICATE:
+ break; // Couldn't determine anything about these constants.
+ case ICmpInst::ICMP_EQ: // We know the constants are equal!
+ // If we know the constants are equal, we can decide the result of this
+ // computation precisely.
+ Result = (pred == ICmpInst::ICMP_EQ ||
+ pred == ICmpInst::ICMP_ULE ||
+ pred == ICmpInst::ICMP_SLE ||
+ pred == ICmpInst::ICMP_UGE ||
+ pred == ICmpInst::ICMP_SGE);
+ break;
+ case ICmpInst::ICMP_ULT:
+ // If we know that C1 < C2, we can decide the result of this computation
+ // precisely.
+ Result = (pred == ICmpInst::ICMP_ULT ||
+ pred == ICmpInst::ICMP_NE ||
+ pred == ICmpInst::ICMP_ULE);
+ break;
+ case ICmpInst::ICMP_SLT:
+ // If we know that C1 < C2, we can decide the result of this computation
+ // precisely.
+ Result = (pred == ICmpInst::ICMP_SLT ||
+ pred == ICmpInst::ICMP_NE ||
+ pred == ICmpInst::ICMP_SLE);
+ break;
+ case ICmpInst::ICMP_UGT:
+ // If we know that C1 > C2, we can decide the result of this computation
+ // precisely.
+ Result = (pred == ICmpInst::ICMP_UGT ||
+ pred == ICmpInst::ICMP_NE ||
+ pred == ICmpInst::ICMP_UGE);
+ break;
+ case ICmpInst::ICMP_SGT:
+ // If we know that C1 > C2, we can decide the result of this computation
+ // precisely.
+ Result = (pred == ICmpInst::ICMP_SGT ||
+ pred == ICmpInst::ICMP_NE ||
+ pred == ICmpInst::ICMP_SGE);
+ break;
+ case ICmpInst::ICMP_ULE:
+ // If we know that C1 <= C2, we can only partially decide this relation.
+ if (pred == ICmpInst::ICMP_UGT) Result = 0;
+ if (pred == ICmpInst::ICMP_ULT) Result = 1;
+ break;
+ case ICmpInst::ICMP_SLE:
+ // If we know that C1 <= C2, we can only partially decide this relation.
+ if (pred == ICmpInst::ICMP_SGT) Result = 0;
+ if (pred == ICmpInst::ICMP_SLT) Result = 1;
+ break;
+
+ case ICmpInst::ICMP_UGE:
+ // If we know that C1 >= C2, we can only partially decide this relation.
+ if (pred == ICmpInst::ICMP_ULT) Result = 0;
+ if (pred == ICmpInst::ICMP_UGT) Result = 1;
+ break;
+ case ICmpInst::ICMP_SGE:
+ // If we know that C1 >= C2, we can only partially decide this relation.
+ if (pred == ICmpInst::ICMP_SLT) Result = 0;
+ if (pred == ICmpInst::ICMP_SGT) Result = 1;
+ break;
+
+ case ICmpInst::ICMP_NE:
+ // If we know that C1 != C2, we can only partially decide this relation.
+ if (pred == ICmpInst::ICMP_EQ) Result = 0;
+ if (pred == ICmpInst::ICMP_NE) Result = 1;
+ break;
+ }
+
+ // If we evaluated the result, return it now.
+ if (Result != -1) {
+ if (const VectorType *VT = dyn_cast<VectorType>(C1->getType())) {
+ if (Result == 0)
+ return Constant::getNullValue(VT);
+ else
+ return Constant::getAllOnesValue(VT);
+ }
+ return ConstantInt::get(Type::Int1Ty, Result);
+ }
+
+ if (!isa<ConstantExpr>(C1) && isa<ConstantExpr>(C2)) {
+ // If C2 is a constant expr and C1 isn't, flop them around and fold the
+ // other way if possible.
+ switch (pred) {
+ case ICmpInst::ICMP_EQ:
+ case ICmpInst::ICMP_NE:
+ // No change of predicate required.
+ return ConstantFoldCompareInstruction(pred, C2, C1);
+
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_ULE:
+ case ICmpInst::ICMP_SLE:
+ case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_SGE:
+ // Change the predicate as necessary to swap the operands.
+ pred = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)pred);
+ return ConstantFoldCompareInstruction(pred, C2, C1);
+
+      default: // These predicates cannot be swapped.
+ break;
+ }
+ }
+ }
+ return 0;
+}
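+
+// A small sketch of the partial folding above (hypothetical constants C1 and
+// C2; the calls used are the public ConstantExpr entry points). When
+// evaluateICmpRelation only establishes C1 u<= C2, a UGT query folds to
+// false and a ULT query folds to true, while ULE itself stays unresolved:
+//   ConstantExpr::getICmp(ICmpInst::ICMP_UGT, C1, C2);  // i1 false
+//   ConstantExpr::getICmp(ICmpInst::ICMP_ULT, C1, C2);  // i1 true
+//   ConstantExpr::getICmp(ICmpInst::ICMP_ULE, C1, C2);  // stays symbolic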
+
+Constant *llvm::ConstantFoldGetElementPtr(const Constant *C,
+ Constant* const *Idxs,
+ unsigned NumIdx) {
+ if (NumIdx == 0 ||
+ (NumIdx == 1 && Idxs[0]->isNullValue()))
+ return const_cast<Constant*>(C);
+
+ if (isa<UndefValue>(C)) {
+ const PointerType *Ptr = cast<PointerType>(C->getType());
+ const Type *Ty = GetElementPtrInst::getIndexedType(Ptr,
+ (Value **)Idxs,
+ (Value **)Idxs+NumIdx);
+ assert(Ty != 0 && "Invalid indices for GEP!");
+ return UndefValue::get(PointerType::get(Ty, Ptr->getAddressSpace()));
+ }
+
+ Constant *Idx0 = Idxs[0];
+ if (C->isNullValue()) {
+ bool isNull = true;
+ for (unsigned i = 0, e = NumIdx; i != e; ++i)
+ if (!Idxs[i]->isNullValue()) {
+ isNull = false;
+ break;
+ }
+ if (isNull) {
+ const PointerType *Ptr = cast<PointerType>(C->getType());
+ const Type *Ty = GetElementPtrInst::getIndexedType(Ptr,
+ (Value**)Idxs,
+ (Value**)Idxs+NumIdx);
+ assert(Ty != 0 && "Invalid indices for GEP!");
+ return
+ ConstantPointerNull::get(PointerType::get(Ty,Ptr->getAddressSpace()));
+ }
+ }
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(const_cast<Constant*>(C))) {
+    // Combine Indices - If the source pointer to this getelementptr
+    // expression is itself a getelementptr constant expression, combine the
+    // indices of the two expressions into a single getelementptr.
+ //
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ const Type *LastTy = 0;
+ for (gep_type_iterator I = gep_type_begin(CE), E = gep_type_end(CE);
+ I != E; ++I)
+ LastTy = *I;
+
+ if ((LastTy && isa<ArrayType>(LastTy)) || Idx0->isNullValue()) {
+ SmallVector<Value*, 16> NewIndices;
+ NewIndices.reserve(NumIdx + CE->getNumOperands());
+ for (unsigned i = 1, e = CE->getNumOperands()-1; i != e; ++i)
+ NewIndices.push_back(CE->getOperand(i));
+
+      // Add the last index of the source GEP to the first index of the new
+      // GEP, taking care to handle the case where the two indices have
+      // different types.
+      Constant *Combined = CE->getOperand(CE->getNumOperands()-1);
+ if (!Idx0->isNullValue()) {
+ const Type *IdxTy = Combined->getType();
+ if (IdxTy != Idx0->getType()) {
+ Constant *C1 = ConstantExpr::getSExtOrBitCast(Idx0, Type::Int64Ty);
+ Constant *C2 = ConstantExpr::getSExtOrBitCast(Combined,
+ Type::Int64Ty);
+ Combined = ConstantExpr::get(Instruction::Add, C1, C2);
+ } else {
+ Combined =
+ ConstantExpr::get(Instruction::Add, Idx0, Combined);
+ }
+ }
+
+ NewIndices.push_back(Combined);
+ NewIndices.insert(NewIndices.end(), Idxs+1, Idxs+NumIdx);
+ return ConstantExpr::getGetElementPtr(CE->getOperand(0), &NewIndices[0],
+ NewIndices.size());
+ }
+ }
+
+ // Implement folding of:
+ // int* getelementptr ([2 x int]* cast ([3 x int]* %X to [2 x int]*),
+ // long 0, long 0)
+ // To: int* getelementptr ([3 x int]* %X, long 0, long 0)
+ //
+ if (CE->isCast() && NumIdx > 1 && Idx0->isNullValue()) {
+ if (const PointerType *SPT =
+ dyn_cast<PointerType>(CE->getOperand(0)->getType()))
+ if (const ArrayType *SAT = dyn_cast<ArrayType>(SPT->getElementType()))
+ if (const ArrayType *CAT =
+ dyn_cast<ArrayType>(cast<PointerType>(C->getType())->getElementType()))
+ if (CAT->getElementType() == SAT->getElementType())
+ return ConstantExpr::getGetElementPtr(
+ (Constant*)CE->getOperand(0), Idxs, NumIdx);
+ }
+
+ // Fold: getelementptr (i8* inttoptr (i64 1 to i8*), i32 -1)
+ // Into: inttoptr (i64 0 to i8*)
+ // This happens with pointers to member functions in C++.
+ if (CE->getOpcode() == Instruction::IntToPtr && NumIdx == 1 &&
+ isa<ConstantInt>(CE->getOperand(0)) && isa<ConstantInt>(Idxs[0]) &&
+ cast<PointerType>(CE->getType())->getElementType() == Type::Int8Ty) {
+ Constant *Base = CE->getOperand(0);
+ Constant *Offset = Idxs[0];
+
+ // Convert the smaller integer to the larger type.
+ if (Offset->getType()->getPrimitiveSizeInBits() <
+ Base->getType()->getPrimitiveSizeInBits())
+ Offset = ConstantExpr::getSExt(Offset, Base->getType());
+ else if (Base->getType()->getPrimitiveSizeInBits() <
+ Offset->getType()->getPrimitiveSizeInBits())
+ Base = ConstantExpr::getZExt(Base, Offset->getType());
+
+ Base = ConstantExpr::getAdd(Base, Offset);
+ return ConstantExpr::getIntToPtr(Base, CE->getType());
+ }
+ }
+ return 0;
+}
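+
+// A sketch of the nested-GEP combining performed above, at the IR level
+// (hypothetical global @A of type [10 x [5 x i32]]*):
+//   i32* getelementptr (getelementptr @A, i64 0, i64 1), i64 0, i64 2
+// folds into the single expression:
+//   i32* getelementptr @A, i64 0, i64 1, i64 2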
+
diff --git a/lib/VMCore/ConstantFold.h b/lib/VMCore/ConstantFold.h
new file mode 100644
index 0000000..49aea11
--- /dev/null
+++ b/lib/VMCore/ConstantFold.h
@@ -0,0 +1,60 @@
+//===-- ConstantFold.h - Internal Constant Folding Interface ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the (internal) constant folding interfaces for LLVM. These
+// interfaces are used by the ConstantExpr::get* methods to automatically fold
+// constants when possible.
+//
+// These operators may return a null object if they don't know how to perform
+// the specified operation on the specified constant types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CONSTANTFOLDING_H
+#define CONSTANTFOLDING_H
+
+namespace llvm {
+ class Value;
+ class Constant;
+ class Type;
+
+ // Constant fold various types of instruction...
+ Constant *ConstantFoldCastInstruction(
+ unsigned opcode, ///< The opcode of the cast
+ const Constant *V, ///< The source constant
+ const Type *DestTy ///< The destination type
+ );
+ Constant *ConstantFoldSelectInstruction(const Constant *Cond,
+ const Constant *V1,
+ const Constant *V2);
+ Constant *ConstantFoldExtractElementInstruction(const Constant *Val,
+ const Constant *Idx);
+ Constant *ConstantFoldInsertElementInstruction(const Constant *Val,
+ const Constant *Elt,
+ const Constant *Idx);
+ Constant *ConstantFoldShuffleVectorInstruction(const Constant *V1,
+ const Constant *V2,
+ const Constant *Mask);
+ Constant *ConstantFoldExtractValueInstruction(const Constant *Agg,
+ const unsigned *Idxs,
+ unsigned NumIdx);
+ Constant *ConstantFoldInsertValueInstruction(const Constant *Agg,
+ const Constant *Val,
+ const unsigned* Idxs,
+ unsigned NumIdx);
+ Constant *ConstantFoldBinaryInstruction(unsigned Opcode, const Constant *V1,
+ const Constant *V2);
+ Constant *ConstantFoldCompareInstruction(unsigned short predicate,
+ const Constant *C1,
+ const Constant *C2);
+ Constant *ConstantFoldGetElementPtr(const Constant *C,
+ Constant* const *Idxs, unsigned NumIdx);
+} // End llvm namespace
+
+#endif
diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp
new file mode 100644
index 0000000..97f3ac9
--- /dev/null
+++ b/lib/VMCore/Constants.cpp
@@ -0,0 +1,2832 @@
+//===-- Constants.cpp - Implement Constant nodes --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Constant* classes...
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "ConstantFold.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/MDNode.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Constant Class
+//===----------------------------------------------------------------------===//
+
+void Constant::destroyConstantImpl() {
+ // When a Constant is destroyed, there may be lingering
+ // references to the constant by other constants in the constant pool. These
+ // constants are implicitly dependent on the module that is being deleted,
+  // but they don't know that. Because we only find out when this constant
+  // is deleted, we must now notify all of our users (which should only be
+  // Constants) that they are, in fact, invalid now and should be deleted.
+ //
+ while (!use_empty()) {
+ Value *V = use_back();
+#ifndef NDEBUG // Only in assertion-enabled builds...
+ if (!isa<Constant>(V))
+ DOUT << "While deleting: " << *this
+ << "\n\nUse still stuck around after Def is destroyed: "
+ << *V << "\n\n";
+#endif
+ assert(isa<Constant>(V) && "References remain to Constant being destroyed");
+ Constant *CV = cast<Constant>(V);
+ CV->destroyConstant();
+
+ // The constant should remove itself from our use list...
+ assert((use_empty() || use_back() != V) && "Constant not removed!");
+ }
+
+  // Value has no outstanding references; it is safe to delete it now...
+ delete this;
+}
+
+/// canTrap - Return true if evaluation of this constant could trap. This is
+/// true for things like constant expressions that could divide by zero.
+bool Constant::canTrap() const {
+ assert(getType()->isFirstClassType() && "Cannot evaluate aggregate vals!");
+ // The only thing that could possibly trap are constant exprs.
+ const ConstantExpr *CE = dyn_cast<ConstantExpr>(this);
+ if (!CE) return false;
+
+ // ConstantExpr traps if any operands can trap.
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (getOperand(i)->canTrap())
+ return true;
+
+ // Otherwise, only specific operations can trap.
+ switch (CE->getOpcode()) {
+ default:
+ return false;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ // Div and rem can trap if the RHS is not known to be non-zero.
+ if (!isa<ConstantInt>(getOperand(1)) || getOperand(1)->isNullValue())
+ return true;
+ return false;
+ }
+}
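+
+// A minimal sketch (hypothetical values): division by a zero constant is the
+// canonical trapping constant expression.
+//   Constant *One  = ConstantInt::get(Type::Int32Ty, 1);
+//   Constant *Zero = ConstantInt::get(Type::Int32Ty, 0);
+//   ConstantExpr::getSDiv(One, Zero)->canTrap();   // yields true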
+
+/// ContainsRelocations - Return true if the constant value contains
+/// relocations which cannot be resolved at compile time. The Kind argument
+/// is used to filter out all but the 'interesting' sorts of relocations.
+bool Constant::ContainsRelocations(unsigned Kind) const {
+ if (const GlobalValue* GV = dyn_cast<GlobalValue>(this)) {
+ bool isLocal = GV->hasLocalLinkage();
+ if ((Kind & Reloc::Local) && isLocal) {
+      // The global has local linkage and 'local' relocations were requested.
+ return true;
+ }
+
+ if ((Kind & Reloc::Global) && !isLocal) {
+      // The global has non-local linkage and 'global' relocations were
+      // requested.
+ return true;
+ }
+
+ return false;
+ }
+
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (getOperand(i)->ContainsRelocations(Kind))
+ return true;
+
+ return false;
+}
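+
+// Sketch (hypothetical global G with external linkage):
+//   G->ContainsRelocations(Reloc::Global);  // true
+//   G->ContainsRelocations(Reloc::Local);   // false
+// For a local-linkage (static) global the two answers are reversed.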
+
+// Static constructor to create a '0' constant of arbitrary type...
+Constant *Constant::getNullValue(const Type *Ty) {
+ static uint64_t zero[2] = {0, 0};
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID:
+ return ConstantInt::get(Ty, 0);
+ case Type::FloatTyID:
+ return ConstantFP::get(APFloat(APInt(32, 0)));
+ case Type::DoubleTyID:
+ return ConstantFP::get(APFloat(APInt(64, 0)));
+ case Type::X86_FP80TyID:
+ return ConstantFP::get(APFloat(APInt(80, 2, zero)));
+ case Type::FP128TyID:
+ return ConstantFP::get(APFloat(APInt(128, 2, zero), true));
+ case Type::PPC_FP128TyID:
+ return ConstantFP::get(APFloat(APInt(128, 2, zero)));
+ case Type::PointerTyID:
+ return ConstantPointerNull::get(cast<PointerType>(Ty));
+ case Type::StructTyID:
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ return ConstantAggregateZero::get(Ty);
+ default:
+ // Function, Label, or Opaque type?
+ assert(!"Cannot create a null constant of that type!");
+ return 0;
+ }
+}
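+
+// Sketch of what the cases above produce (SomeStructTy stands for any
+// struct type):
+//   Constant::getNullValue(Type::Int32Ty);   // i32 0
+//   Constant::getNullValue(Type::DoubleTy);  // double +0.0
+//   Constant::getNullValue(SomeStructTy);    // a ConstantAggregateZero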
+
+Constant *Constant::getAllOnesValue(const Type *Ty) {
+ if (const IntegerType* ITy = dyn_cast<IntegerType>(Ty))
+ return ConstantInt::get(APInt::getAllOnesValue(ITy->getBitWidth()));
+ return ConstantVector::getAllOnesValue(cast<VectorType>(Ty));
+}
+
+// Static constructor to create an integral constant with all bits set
+ConstantInt *ConstantInt::getAllOnesValue(const Type *Ty) {
+ if (const IntegerType* ITy = dyn_cast<IntegerType>(Ty))
+ return ConstantInt::get(APInt::getAllOnesValue(ITy->getBitWidth()));
+ return 0;
+}
+
+/// @returns the value for a vector integer constant of the given type that
+/// has all its bits set to true.
+/// @brief Get the all ones value
+ConstantVector *ConstantVector::getAllOnesValue(const VectorType *Ty) {
+ std::vector<Constant*> Elts;
+ Elts.resize(Ty->getNumElements(),
+ ConstantInt::getAllOnesValue(Ty->getElementType()));
+ assert(Elts[0] && "Not a vector integer type!");
+ return cast<ConstantVector>(ConstantVector::get(Elts));
+}
+
+
+/// getVectorElements - This method, which is only valid on constants of
+/// vector type, returns the elements of the vector in the specified
+/// SmallVector. This handles breaking down a vector undef into undef
+/// elements, etc. For constant exprs and other cases we can't handle, we
+/// return an empty vector.
+void Constant::getVectorElements(SmallVectorImpl<Constant*> &Elts) const {
+ assert(isa<VectorType>(getType()) && "Not a vector constant!");
+
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(this)) {
+ for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i)
+ Elts.push_back(CV->getOperand(i));
+ return;
+ }
+
+ const VectorType *VT = cast<VectorType>(getType());
+ if (isa<ConstantAggregateZero>(this)) {
+ Elts.assign(VT->getNumElements(),
+ Constant::getNullValue(VT->getElementType()));
+ return;
+ }
+
+ if (isa<UndefValue>(this)) {
+ Elts.assign(VT->getNumElements(), UndefValue::get(VT->getElementType()));
+ return;
+ }
+
+ // Unknown type, must be constant expr etc.
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// ConstantInt
+//===----------------------------------------------------------------------===//
+
+ConstantInt::ConstantInt(const IntegerType *Ty, const APInt& V)
+ : Constant(Ty, ConstantIntVal, 0, 0), Val(V) {
+ assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type");
+}
+
+ConstantInt *ConstantInt::TheTrueVal = 0;
+ConstantInt *ConstantInt::TheFalseVal = 0;
+
+namespace llvm {
+ void CleanupTrueFalse(void *) {
+ ConstantInt::ResetTrueFalse();
+ }
+}
+
+static ManagedCleanup<llvm::CleanupTrueFalse> TrueFalseCleanup;
+
+ConstantInt *ConstantInt::CreateTrueFalseVals(bool WhichOne) {
+ assert(TheTrueVal == 0 && TheFalseVal == 0);
+ TheTrueVal = get(Type::Int1Ty, 1);
+ TheFalseVal = get(Type::Int1Ty, 0);
+
+ // Ensure that llvm_shutdown nulls out TheTrueVal/TheFalseVal.
+ TrueFalseCleanup.Register();
+
+ return WhichOne ? TheTrueVal : TheFalseVal;
+}
+
+
+namespace {
+ struct DenseMapAPIntKeyInfo {
+ struct KeyTy {
+ APInt val;
+ const Type* type;
+ KeyTy(const APInt& V, const Type* Ty) : val(V), type(Ty) {}
+ KeyTy(const KeyTy& that) : val(that.val), type(that.type) {}
+ bool operator==(const KeyTy& that) const {
+ return type == that.type && this->val == that.val;
+ }
+ bool operator!=(const KeyTy& that) const {
+ return !this->operator==(that);
+ }
+ };
+ static inline KeyTy getEmptyKey() { return KeyTy(APInt(1,0), 0); }
+ static inline KeyTy getTombstoneKey() { return KeyTy(APInt(1,1), 0); }
+ static unsigned getHashValue(const KeyTy &Key) {
+ return DenseMapInfo<void*>::getHashValue(Key.type) ^
+ Key.val.getHashValue();
+ }
+ static bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
+ return LHS == RHS;
+ }
+ static bool isPod() { return false; }
+ };
+}
+
+
+typedef DenseMap<DenseMapAPIntKeyInfo::KeyTy, ConstantInt*,
+ DenseMapAPIntKeyInfo> IntMapTy;
+static ManagedStatic<IntMapTy> IntConstants;
+
+ConstantInt *ConstantInt::get(const Type *Ty, uint64_t V, bool isSigned) {
+ const IntegerType *ITy = cast<IntegerType>(Ty);
+ return get(APInt(ITy->getBitWidth(), V, isSigned));
+}
+
+// Get a ConstantInt from an APInt. Note that the key stored in the DenseMap
+// is a DenseMapAPIntKeyInfo::KeyTy, which provides operator== and operator!=
+// so that the DenseMap never attempts to compare APInts of different widths;
+// doing so would violate an APInt class invariant and trigger an assertion.
+ConstantInt *ConstantInt::get(const APInt& V) {
+ // Get the corresponding integer type for the bit width of the value.
+ const IntegerType *ITy = IntegerType::get(V.getBitWidth());
+ // get an existing value or the insertion position
+ DenseMapAPIntKeyInfo::KeyTy Key(V, ITy);
+ ConstantInt *&Slot = (*IntConstants)[Key];
+ // if it exists, return it.
+ if (Slot)
+ return Slot;
+ // otherwise create a new one, insert it, and return it.
+ return Slot = new ConstantInt(ITy, V);
+}
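+
+// Sketch of the uniquing guarantee (hypothetical values): both calls below
+// return the same pointer, so ConstantInts may be compared by address.
+//   ConstantInt *A = ConstantInt::get(Type::Int32Ty, 42);
+//   ConstantInt *B = ConstantInt::get(APInt(32, 42));
+//   assert(A == B && "uniqued in IntConstants");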
+
+//===----------------------------------------------------------------------===//
+// ConstantFP
+//===----------------------------------------------------------------------===//
+
+static const fltSemantics *TypeToFloatSemantics(const Type *Ty) {
+ if (Ty == Type::FloatTy)
+ return &APFloat::IEEEsingle;
+ if (Ty == Type::DoubleTy)
+ return &APFloat::IEEEdouble;
+ if (Ty == Type::X86_FP80Ty)
+ return &APFloat::x87DoubleExtended;
+  if (Ty == Type::FP128Ty)
+ return &APFloat::IEEEquad;
+
+ assert(Ty == Type::PPC_FP128Ty && "Unknown FP format");
+ return &APFloat::PPCDoubleDouble;
+}
+
+ConstantFP::ConstantFP(const Type *Ty, const APFloat& V)
+ : Constant(Ty, ConstantFPVal, 0, 0), Val(V) {
+ assert(&V.getSemantics() == TypeToFloatSemantics(Ty) &&
+ "FP type Mismatch");
+}
+
+bool ConstantFP::isNullValue() const {
+ return Val.isZero() && !Val.isNegative();
+}
+
+ConstantFP *ConstantFP::getNegativeZero(const Type *Ty) {
+  APFloat apf = cast<ConstantFP>(Constant::getNullValue(Ty))->getValueAPF();
+ apf.changeSign();
+ return ConstantFP::get(apf);
+}
+
+bool ConstantFP::isExactlyValue(const APFloat& V) const {
+ return Val.bitwiseIsEqual(V);
+}
+
+namespace {
+ struct DenseMapAPFloatKeyInfo {
+ struct KeyTy {
+ APFloat val;
+ KeyTy(const APFloat& V) : val(V){}
+ KeyTy(const KeyTy& that) : val(that.val) {}
+ bool operator==(const KeyTy& that) const {
+ return this->val.bitwiseIsEqual(that.val);
+ }
+ bool operator!=(const KeyTy& that) const {
+ return !this->operator==(that);
+ }
+ };
+ static inline KeyTy getEmptyKey() {
+ return KeyTy(APFloat(APFloat::Bogus,1));
+ }
+ static inline KeyTy getTombstoneKey() {
+ return KeyTy(APFloat(APFloat::Bogus,2));
+ }
+ static unsigned getHashValue(const KeyTy &Key) {
+ return Key.val.getHashValue();
+ }
+ static bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
+ return LHS == RHS;
+ }
+ static bool isPod() { return false; }
+ };
+}
+
+//---- ConstantFP::get() implementation...
+//
+typedef DenseMap<DenseMapAPFloatKeyInfo::KeyTy, ConstantFP*,
+ DenseMapAPFloatKeyInfo> FPMapTy;
+
+static ManagedStatic<FPMapTy> FPConstants;
+
+ConstantFP *ConstantFP::get(const APFloat &V) {
+ DenseMapAPFloatKeyInfo::KeyTy Key(V);
+ ConstantFP *&Slot = (*FPConstants)[Key];
+ if (Slot) return Slot;
+
+ const Type *Ty;
+ if (&V.getSemantics() == &APFloat::IEEEsingle)
+ Ty = Type::FloatTy;
+ else if (&V.getSemantics() == &APFloat::IEEEdouble)
+ Ty = Type::DoubleTy;
+ else if (&V.getSemantics() == &APFloat::x87DoubleExtended)
+ Ty = Type::X86_FP80Ty;
+ else if (&V.getSemantics() == &APFloat::IEEEquad)
+ Ty = Type::FP128Ty;
+ else {
+    assert(&V.getSemantics() == &APFloat::PPCDoubleDouble &&
+           "Unknown FP format");
+ Ty = Type::PPC_FP128Ty;
+ }
+
+ return Slot = new ConstantFP(Ty, V);
+}
+
+/// get() - This returns a constant fp for the specified value in the
+/// specified type. This should only be used for simple constant values like
+/// 2.0/1.0 etc, that are known-valid both as double and as the target format.
+ConstantFP *ConstantFP::get(const Type *Ty, double V) {
+ APFloat FV(V);
+ bool ignored;
+ FV.convert(*TypeToFloatSemantics(Ty), APFloat::rmNearestTiesToEven, &ignored);
+ return get(FV);
+}
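+
+// Sketch: the double is rounded into the target format before uniquing, so
+//   ConstantFP::get(Type::FloatTy, 2.0);   // exact in IEEEsingle
+// is safe for simple values, while a double that is not exactly
+// representable as float is silently rounded.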
+
+//===----------------------------------------------------------------------===//
+// ConstantXXX Classes
+//===----------------------------------------------------------------------===//
+
+
+ConstantArray::ConstantArray(const ArrayType *T,
+ const std::vector<Constant*> &V)
+ : Constant(T, ConstantArrayVal,
+ OperandTraits<ConstantArray>::op_end(this) - V.size(),
+ V.size()) {
+ assert(V.size() == T->getNumElements() &&
+ "Invalid initializer vector for constant array");
+ Use *OL = OperandList;
+ for (std::vector<Constant*>::const_iterator I = V.begin(), E = V.end();
+ I != E; ++I, ++OL) {
+ Constant *C = *I;
+ assert((C->getType() == T->getElementType() ||
+ (T->isAbstract() &&
+ C->getType()->getTypeID() == T->getElementType()->getTypeID())) &&
+ "Initializer for array element doesn't match array element type!");
+ *OL = C;
+ }
+}
+
+
+ConstantStruct::ConstantStruct(const StructType *T,
+ const std::vector<Constant*> &V)
+ : Constant(T, ConstantStructVal,
+ OperandTraits<ConstantStruct>::op_end(this) - V.size(),
+ V.size()) {
+ assert(V.size() == T->getNumElements() &&
+ "Invalid initializer vector for constant structure");
+ Use *OL = OperandList;
+ for (std::vector<Constant*>::const_iterator I = V.begin(), E = V.end();
+ I != E; ++I, ++OL) {
+ Constant *C = *I;
+ assert((C->getType() == T->getElementType(I-V.begin()) ||
+ ((T->getElementType(I-V.begin())->isAbstract() ||
+ C->getType()->isAbstract()) &&
+ T->getElementType(I-V.begin())->getTypeID() ==
+ C->getType()->getTypeID())) &&
+ "Initializer for struct element doesn't match struct element type!");
+ *OL = C;
+ }
+}
+
+
+ConstantVector::ConstantVector(const VectorType *T,
+ const std::vector<Constant*> &V)
+ : Constant(T, ConstantVectorVal,
+ OperandTraits<ConstantVector>::op_end(this) - V.size(),
+ V.size()) {
+ Use *OL = OperandList;
+ for (std::vector<Constant*>::const_iterator I = V.begin(), E = V.end();
+ I != E; ++I, ++OL) {
+ Constant *C = *I;
+ assert((C->getType() == T->getElementType() ||
+ (T->isAbstract() &&
+ C->getType()->getTypeID() == T->getElementType()->getTypeID())) &&
+ "Initializer for vector element doesn't match vector element type!");
+ *OL = C;
+ }
+}
+
+
+namespace llvm {
+// We declare several classes private to this file, so use an anonymous
+// namespace
+namespace {
+
+/// UnaryConstantExpr - This class is private to Constants.cpp, and is used
+/// behind the scenes to implement unary constant exprs.
+class VISIBILITY_HIDDEN UnaryConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+public:
+ // allocate space for exactly one operand
+ void *operator new(size_t s) {
+ return User::operator new(s, 1);
+ }
+ UnaryConstantExpr(unsigned Opcode, Constant *C, const Type *Ty)
+ : ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
+ Op<0>() = C;
+ }
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// BinaryConstantExpr - This class is private to Constants.cpp, and is used
+/// behind the scenes to implement binary constant exprs.
+class VISIBILITY_HIDDEN BinaryConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+public:
+ // allocate space for exactly two operands
+ void *operator new(size_t s) {
+ return User::operator new(s, 2);
+ }
+ BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2)
+ : ConstantExpr(C1->getType(), Opcode, &Op<0>(), 2) {
+ Op<0>() = C1;
+ Op<1>() = C2;
+ }
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// SelectConstantExpr - This class is private to Constants.cpp, and is used
+/// behind the scenes to implement select constant exprs.
+class VISIBILITY_HIDDEN SelectConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+public:
+ // allocate space for exactly three operands
+ void *operator new(size_t s) {
+ return User::operator new(s, 3);
+ }
+ SelectConstantExpr(Constant *C1, Constant *C2, Constant *C3)
+ : ConstantExpr(C2->getType(), Instruction::Select, &Op<0>(), 3) {
+ Op<0>() = C1;
+ Op<1>() = C2;
+ Op<2>() = C3;
+ }
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// ExtractElementConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// extractelement constant exprs.
+class VISIBILITY_HIDDEN ExtractElementConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+public:
+ // allocate space for exactly two operands
+ void *operator new(size_t s) {
+ return User::operator new(s, 2);
+ }
+ ExtractElementConstantExpr(Constant *C1, Constant *C2)
+ : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
+ Instruction::ExtractElement, &Op<0>(), 2) {
+ Op<0>() = C1;
+ Op<1>() = C2;
+ }
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// InsertElementConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// insertelement constant exprs.
+class VISIBILITY_HIDDEN InsertElementConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+public:
+ // allocate space for exactly three operands
+ void *operator new(size_t s) {
+ return User::operator new(s, 3);
+ }
+ InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3)
+ : ConstantExpr(C1->getType(), Instruction::InsertElement,
+ &Op<0>(), 3) {
+ Op<0>() = C1;
+ Op<1>() = C2;
+ Op<2>() = C3;
+ }
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// ShuffleVectorConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// shufflevector constant exprs.
+class VISIBILITY_HIDDEN ShuffleVectorConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+public:
+ // allocate space for exactly three operands
+ void *operator new(size_t s) {
+ return User::operator new(s, 3);
+ }
+ ShuffleVectorConstantExpr(Constant *C1, Constant *C2, Constant *C3)
+ : ConstantExpr(VectorType::get(
+ cast<VectorType>(C1->getType())->getElementType(),
+ cast<VectorType>(C3->getType())->getNumElements()),
+ Instruction::ShuffleVector,
+ &Op<0>(), 3) {
+ Op<0>() = C1;
+ Op<1>() = C2;
+ Op<2>() = C3;
+ }
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// ExtractValueConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// extractvalue constant exprs.
+class VISIBILITY_HIDDEN ExtractValueConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+public:
+ // allocate space for exactly one operand
+ void *operator new(size_t s) {
+ return User::operator new(s, 1);
+ }
+ ExtractValueConstantExpr(Constant *Agg,
+ const SmallVector<unsigned, 4> &IdxList,
+ const Type *DestTy)
+ : ConstantExpr(DestTy, Instruction::ExtractValue, &Op<0>(), 1),
+ Indices(IdxList) {
+ Op<0>() = Agg;
+ }
+
+ /// Indices - These identify which value to extract.
+ const SmallVector<unsigned, 4> Indices;
+
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// InsertValueConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// insertvalue constant exprs.
+class VISIBILITY_HIDDEN InsertValueConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+public:
+  // allocate space for exactly two operands
+ void *operator new(size_t s) {
+ return User::operator new(s, 2);
+ }
+ InsertValueConstantExpr(Constant *Agg, Constant *Val,
+ const SmallVector<unsigned, 4> &IdxList,
+ const Type *DestTy)
+ : ConstantExpr(DestTy, Instruction::InsertValue, &Op<0>(), 2),
+ Indices(IdxList) {
+ Op<0>() = Agg;
+ Op<1>() = Val;
+ }
+
+ /// Indices - These identify the position for the insertion.
+ const SmallVector<unsigned, 4> Indices;
+
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+
+/// GetElementPtrConstantExpr - This class is private to Constants.cpp, and is
+/// used behind the scenes to implement getelementptr constant exprs.
+class VISIBILITY_HIDDEN GetElementPtrConstantExpr : public ConstantExpr {
+ GetElementPtrConstantExpr(Constant *C, const std::vector<Constant*> &IdxList,
+ const Type *DestTy);
+public:
+ static GetElementPtrConstantExpr *Create(Constant *C,
+ const std::vector<Constant*>&IdxList,
+ const Type *DestTy) {
+ return new(IdxList.size() + 1)
+ GetElementPtrConstantExpr(C, IdxList, DestTy);
+ }
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+// CompareConstantExpr - This class is private to Constants.cpp, and is used
+// behind the scenes to implement ICmp and FCmp constant expressions. This is
+// needed in order to store the predicate value for these instructions.
+struct VISIBILITY_HIDDEN CompareConstantExpr : public ConstantExpr {
+ void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ // allocate space for exactly two operands
+ void *operator new(size_t s) {
+ return User::operator new(s, 2);
+ }
+ unsigned short predicate;
+ CompareConstantExpr(const Type *ty, Instruction::OtherOps opc,
+ unsigned short pred, Constant* LHS, Constant* RHS)
+ : ConstantExpr(ty, opc, &Op<0>(), 2), predicate(pred) {
+ Op<0>() = LHS;
+ Op<1>() = RHS;
+ }
+ /// Transparently provide more efficient getOperand methods.
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+} // end anonymous namespace
+
+template <>
+struct OperandTraits<UnaryConstantExpr> : FixedNumOperandTraits<1> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(UnaryConstantExpr, Value)
+
+template <>
+struct OperandTraits<BinaryConstantExpr> : FixedNumOperandTraits<2> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BinaryConstantExpr, Value)
+
+template <>
+struct OperandTraits<SelectConstantExpr> : FixedNumOperandTraits<3> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SelectConstantExpr, Value)
+
+template <>
+struct OperandTraits<ExtractElementConstantExpr> : FixedNumOperandTraits<2> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractElementConstantExpr, Value)
+
+template <>
+struct OperandTraits<InsertElementConstantExpr> : FixedNumOperandTraits<3> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertElementConstantExpr, Value)
+
+template <>
+struct OperandTraits<ShuffleVectorConstantExpr> : FixedNumOperandTraits<3> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ShuffleVectorConstantExpr, Value)
+
+template <>
+struct OperandTraits<ExtractValueConstantExpr> : FixedNumOperandTraits<1> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractValueConstantExpr, Value)
+
+template <>
+struct OperandTraits<InsertValueConstantExpr> : FixedNumOperandTraits<2> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueConstantExpr, Value)
+
+template <>
+struct OperandTraits<GetElementPtrConstantExpr> : VariadicOperandTraits<1> {
+};
+
+GetElementPtrConstantExpr::GetElementPtrConstantExpr
+ (Constant *C,
+ const std::vector<Constant*> &IdxList,
+ const Type *DestTy)
+ : ConstantExpr(DestTy, Instruction::GetElementPtr,
+ OperandTraits<GetElementPtrConstantExpr>::op_end(this)
+ - (IdxList.size()+1),
+ IdxList.size()+1) {
+ OperandList[0] = C;
+ for (unsigned i = 0, E = IdxList.size(); i != E; ++i)
+ OperandList[i+1] = IdxList[i];
+}
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrConstantExpr, Value)
+
+
+template <>
+struct OperandTraits<CompareConstantExpr> : FixedNumOperandTraits<2> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value)
+
+
+} // End llvm namespace
+
+
+// Utility function for determining if a ConstantExpr is a CastOp or not. This
+// can't be inline because we don't want to #include Instruction.h into
+// Constant.h
+bool ConstantExpr::isCast() const {
+ return Instruction::isCast(getOpcode());
+}
+
+bool ConstantExpr::isCompare() const {
+ return getOpcode() == Instruction::ICmp || getOpcode() == Instruction::FCmp ||
+ getOpcode() == Instruction::VICmp || getOpcode() == Instruction::VFCmp;
+}
+
+bool ConstantExpr::hasIndices() const {
+ return getOpcode() == Instruction::ExtractValue ||
+ getOpcode() == Instruction::InsertValue;
+}
+
+const SmallVector<unsigned, 4> &ConstantExpr::getIndices() const {
+ if (const ExtractValueConstantExpr *EVCE =
+ dyn_cast<ExtractValueConstantExpr>(this))
+ return EVCE->Indices;
+
+ return cast<InsertValueConstantExpr>(this)->Indices;
+}
+
+/// ConstantExpr::get* - Return some common constants without having to
+/// specify the full Instruction::OPCODE identifier.
+///
+Constant *ConstantExpr::getNeg(Constant *C) {
+ return get(Instruction::Sub,
+ ConstantExpr::getZeroValueForNegationExpr(C->getType()),
+ C);
+}
+Constant *ConstantExpr::getNot(Constant *C) {
+ assert((isa<IntegerType>(C->getType()) ||
+ cast<VectorType>(C->getType())->getElementType()->isInteger()) &&
+ "Cannot NOT a nonintegral value!");
+ return get(Instruction::Xor, C,
+ Constant::getAllOnesValue(C->getType()));
+}
+Constant *ConstantExpr::getAdd(Constant *C1, Constant *C2) {
+ return get(Instruction::Add, C1, C2);
+}
+Constant *ConstantExpr::getSub(Constant *C1, Constant *C2) {
+ return get(Instruction::Sub, C1, C2);
+}
+Constant *ConstantExpr::getMul(Constant *C1, Constant *C2) {
+ return get(Instruction::Mul, C1, C2);
+}
+Constant *ConstantExpr::getUDiv(Constant *C1, Constant *C2) {
+ return get(Instruction::UDiv, C1, C2);
+}
+Constant *ConstantExpr::getSDiv(Constant *C1, Constant *C2) {
+ return get(Instruction::SDiv, C1, C2);
+}
+Constant *ConstantExpr::getFDiv(Constant *C1, Constant *C2) {
+ return get(Instruction::FDiv, C1, C2);
+}
+Constant *ConstantExpr::getURem(Constant *C1, Constant *C2) {
+ return get(Instruction::URem, C1, C2);
+}
+Constant *ConstantExpr::getSRem(Constant *C1, Constant *C2) {
+ return get(Instruction::SRem, C1, C2);
+}
+Constant *ConstantExpr::getFRem(Constant *C1, Constant *C2) {
+ return get(Instruction::FRem, C1, C2);
+}
+Constant *ConstantExpr::getAnd(Constant *C1, Constant *C2) {
+ return get(Instruction::And, C1, C2);
+}
+Constant *ConstantExpr::getOr(Constant *C1, Constant *C2) {
+ return get(Instruction::Or, C1, C2);
+}
+Constant *ConstantExpr::getXor(Constant *C1, Constant *C2) {
+ return get(Instruction::Xor, C1, C2);
+}
+unsigned ConstantExpr::getPredicate() const {
+ assert(getOpcode() == Instruction::FCmp ||
+ getOpcode() == Instruction::ICmp ||
+ getOpcode() == Instruction::VFCmp ||
+ getOpcode() == Instruction::VICmp);
+  return static_cast<const CompareConstantExpr*>(this)->predicate;
+}
+Constant *ConstantExpr::getShl(Constant *C1, Constant *C2) {
+ return get(Instruction::Shl, C1, C2);
+}
+Constant *ConstantExpr::getLShr(Constant *C1, Constant *C2) {
+ return get(Instruction::LShr, C1, C2);
+}
+Constant *ConstantExpr::getAShr(Constant *C1, Constant *C2) {
+ return get(Instruction::AShr, C1, C2);
+}
+
+/// getWithOperandReplaced - Return a constant expression identical to this
+/// one, but with the specified operand set to the specified value.
+Constant *
+ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
+ assert(OpNo < getNumOperands() && "Operand num is out of range!");
+ assert(Op->getType() == getOperand(OpNo)->getType() &&
+ "Replacing operand with value of different type!");
+ if (getOperand(OpNo) == Op)
+ return const_cast<ConstantExpr*>(this);
+
+ Constant *Op0, *Op1, *Op2;
+ switch (getOpcode()) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ return ConstantExpr::getCast(getOpcode(), Op, getType());
+ case Instruction::Select:
+ Op0 = (OpNo == 0) ? Op : getOperand(0);
+ Op1 = (OpNo == 1) ? Op : getOperand(1);
+ Op2 = (OpNo == 2) ? Op : getOperand(2);
+ return ConstantExpr::getSelect(Op0, Op1, Op2);
+ case Instruction::InsertElement:
+ Op0 = (OpNo == 0) ? Op : getOperand(0);
+ Op1 = (OpNo == 1) ? Op : getOperand(1);
+ Op2 = (OpNo == 2) ? Op : getOperand(2);
+ return ConstantExpr::getInsertElement(Op0, Op1, Op2);
+ case Instruction::ExtractElement:
+ Op0 = (OpNo == 0) ? Op : getOperand(0);
+ Op1 = (OpNo == 1) ? Op : getOperand(1);
+ return ConstantExpr::getExtractElement(Op0, Op1);
+ case Instruction::ShuffleVector:
+ Op0 = (OpNo == 0) ? Op : getOperand(0);
+ Op1 = (OpNo == 1) ? Op : getOperand(1);
+ Op2 = (OpNo == 2) ? Op : getOperand(2);
+ return ConstantExpr::getShuffleVector(Op0, Op1, Op2);
+ case Instruction::GetElementPtr: {
+ SmallVector<Constant*, 8> Ops;
+ Ops.resize(getNumOperands()-1);
+ for (unsigned i = 1, e = getNumOperands(); i != e; ++i)
+ Ops[i-1] = getOperand(i);
+ if (OpNo == 0)
+ return ConstantExpr::getGetElementPtr(Op, &Ops[0], Ops.size());
+ Ops[OpNo-1] = Op;
+ return ConstantExpr::getGetElementPtr(getOperand(0), &Ops[0], Ops.size());
+ }
+ default:
+ assert(getNumOperands() == 2 && "Must be binary operator?");
+ Op0 = (OpNo == 0) ? Op : getOperand(0);
+ Op1 = (OpNo == 1) ? Op : getOperand(1);
+ return ConstantExpr::get(getOpcode(), Op0, Op1);
+ }
+}
+
+/// getWithOperands - This returns the current constant expression with the
+/// operands replaced with the specified values. The specified operands must
+/// match count and type with the existing ones.
+Constant *ConstantExpr::
+getWithOperands(Constant* const *Ops, unsigned NumOps) const {
+ assert(NumOps == getNumOperands() && "Operand count mismatch!");
+ bool AnyChange = false;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ assert(Ops[i]->getType() == getOperand(i)->getType() &&
+ "Operand type mismatch!");
+ AnyChange |= Ops[i] != getOperand(i);
+ }
+ if (!AnyChange) // No operands changed, return self.
+ return const_cast<ConstantExpr*>(this);
+
+ switch (getOpcode()) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ return ConstantExpr::getCast(getOpcode(), Ops[0], getType());
+ case Instruction::Select:
+ return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
+ case Instruction::InsertElement:
+ return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]);
+ case Instruction::ExtractElement:
+ return ConstantExpr::getExtractElement(Ops[0], Ops[1]);
+ case Instruction::ShuffleVector:
+ return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
+ case Instruction::GetElementPtr:
+ return ConstantExpr::getGetElementPtr(Ops[0], &Ops[1], NumOps-1);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::VICmp:
+ case Instruction::VFCmp:
+ return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1]);
+ default:
+ assert(getNumOperands() == 2 && "Must be binary operator?");
+ return ConstantExpr::get(getOpcode(), Ops[0], Ops[1]);
+ }
+}
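+
+// Sketch (hypothetical operands X and Y of matching types): replacing the
+// operands of a binary constant expression CE re-runs the ConstantExpr::get
+// path, so the result may be a freshly folded constant rather than a new
+// expression node.
+//   Constant *Ops[] = { X, Y };
+//   Constant *NewC = CE->getWithOperands(Ops, 2);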
+
+
+//===----------------------------------------------------------------------===//
+// isValueValidForType implementations
+
+bool ConstantInt::isValueValidForType(const Type *Ty, uint64_t Val) {
+ unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); // assert okay
+ if (Ty == Type::Int1Ty)
+ return Val == 0 || Val == 1;
+ if (NumBits >= 64)
+ return true; // always true, has to fit in largest type
+  uint64_t Max = (1ull << NumBits) - 1; // NumBits < 64 here, shift is safe.
+ return Val <= Max;
+}
+
+bool ConstantInt::isValueValidForType(const Type *Ty, int64_t Val) {
+ unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); // assert okay
+ if (Ty == Type::Int1Ty)
+ return Val == 0 || Val == 1 || Val == -1;
+ if (NumBits >= 64)
+ return true; // always true, has to fit in largest type
+ int64_t Min = -(1ll << (NumBits-1));
+ int64_t Max = (1ll << (NumBits-1)) - 1;
+ return (Val >= Min && Val <= Max);
+}
+
+bool ConstantFP::isValueValidForType(const Type *Ty, const APFloat& Val) {
+ // convert modifies in place, so make a copy.
+ APFloat Val2 = APFloat(Val);
+ bool losesInfo;
+ switch (Ty->getTypeID()) {
+ default:
+ return false; // These can't be represented as floating point!
+
+ // FIXME rounding mode needs to be more flexible
+ case Type::FloatTyID: {
+ if (&Val2.getSemantics() == &APFloat::IEEEsingle)
+ return true;
+ Val2.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &losesInfo);
+ return !losesInfo;
+ }
+ case Type::DoubleTyID: {
+ if (&Val2.getSemantics() == &APFloat::IEEEsingle ||
+ &Val2.getSemantics() == &APFloat::IEEEdouble)
+ return true;
+ Val2.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &losesInfo);
+ return !losesInfo;
+ }
+ case Type::X86_FP80TyID:
+ return &Val2.getSemantics() == &APFloat::IEEEsingle ||
+ &Val2.getSemantics() == &APFloat::IEEEdouble ||
+ &Val2.getSemantics() == &APFloat::x87DoubleExtended;
+ case Type::FP128TyID:
+ return &Val2.getSemantics() == &APFloat::IEEEsingle ||
+ &Val2.getSemantics() == &APFloat::IEEEdouble ||
+ &Val2.getSemantics() == &APFloat::IEEEquad;
+ case Type::PPC_FP128TyID:
+ return &Val2.getSemantics() == &APFloat::IEEEsingle ||
+ &Val2.getSemantics() == &APFloat::IEEEdouble ||
+ &Val2.getSemantics() == &APFloat::PPCDoubleDouble;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Factory Function Implementation
+
+
+// ConstantCreator - A class used by ValueMap* to create constants. It should
+// be partially specialized whenever something unusual must be done to
+// interface with a constant's constructor. The number of operands each
+// create method allocates is determined by the ConstantTraits template.
+//
+namespace llvm {
+ template<class ValType>
+ struct ConstantTraits;
+
+ template<typename T, typename Alloc>
+ struct VISIBILITY_HIDDEN ConstantTraits< std::vector<T, Alloc> > {
+ static unsigned uses(const std::vector<T, Alloc>& v) {
+ return v.size();
+ }
+ };
+
+ template<class ConstantClass, class TypeClass, class ValType>
+ struct VISIBILITY_HIDDEN ConstantCreator {
+ static ConstantClass *create(const TypeClass *Ty, const ValType &V) {
+ return new(ConstantTraits<ValType>::uses(V)) ConstantClass(Ty, V);
+ }
+ };
+
+ template<class ConstantClass, class TypeClass>
+ struct VISIBILITY_HIDDEN ConvertConstantType {
+ static void convert(ConstantClass *OldC, const TypeClass *NewTy) {
+ assert(0 && "This type cannot be converted!\n");
+ abort();
+ }
+ };
+
+ template<class ValType, class TypeClass, class ConstantClass,
+ bool HasLargeKey = false /*true for arrays and structs*/ >
+ class VISIBILITY_HIDDEN ValueMap : public AbstractTypeUser {
+ public:
+ typedef std::pair<const Type*, ValType> MapKey;
+ typedef std::map<MapKey, Constant *> MapTy;
+ typedef std::map<Constant*, typename MapTy::iterator> InverseMapTy;
+ typedef std::map<const Type*, typename MapTy::iterator> AbstractTypeMapTy;
+ private:
+    /// Map - This is the main map from the element descriptor to the
+    /// Constants. This is the primary way we avoid creating two constants
+    /// of the same shape.
+ MapTy Map;
+
+ /// InverseMap - If "HasLargeKey" is true, this contains an inverse mapping
+ /// from the constants to their element in Map. This is important for
+ /// removal of constants from the array, which would otherwise have to scan
+ /// through the map with very large keys.
+ InverseMapTy InverseMap;
+
+ /// AbstractTypeMap - Map for abstract type constants.
+ ///
+ AbstractTypeMapTy AbstractTypeMap;
+
+ public:
+ typename MapTy::iterator map_end() { return Map.end(); }
+
+ /// InsertOrGetItem - Return an iterator for the specified element.
+ /// If the element exists in the map, the returned iterator points to the
+ /// entry and Exists=true. If not, the iterator points to the newly
+ /// inserted entry and returns Exists=false. Newly inserted entries have
+ /// I->second == 0, and should be filled in.
+ typename MapTy::iterator InsertOrGetItem(std::pair<MapKey, Constant *>
+ &InsertVal,
+ bool &Exists) {
+ std::pair<typename MapTy::iterator, bool> IP = Map.insert(InsertVal);
+ Exists = !IP.second;
+ return IP.first;
+ }
+
+private:
+ typename MapTy::iterator FindExistingElement(ConstantClass *CP) {
+ if (HasLargeKey) {
+ typename InverseMapTy::iterator IMI = InverseMap.find(CP);
+ assert(IMI != InverseMap.end() && IMI->second != Map.end() &&
+ IMI->second->second == CP &&
+ "InverseMap corrupt!");
+ return IMI->second;
+ }
+
+ typename MapTy::iterator I =
+ Map.find(MapKey(static_cast<const TypeClass*>(CP->getRawType()),
+ getValType(CP)));
+ if (I == Map.end() || I->second != CP) {
+ // FIXME: This should not use a linear scan. If this gets to be a
+ // performance problem, someone should look at this.
+ for (I = Map.begin(); I != Map.end() && I->second != CP; ++I)
+ /* empty */;
+ }
+ return I;
+ }
+public:
+
+ /// getOrCreate - Return the specified constant from the map, creating it if
+ /// necessary.
+ ConstantClass *getOrCreate(const TypeClass *Ty, const ValType &V) {
+ MapKey Lookup(Ty, V);
+ typename MapTy::iterator I = Map.find(Lookup);
+ // Is it in the map?
+ if (I != Map.end())
+ return static_cast<ConstantClass *>(I->second);
+
+ // If no preexisting value, create one now...
+ ConstantClass *Result =
+ ConstantCreator<ConstantClass,TypeClass,ValType>::create(Ty, V);
+
+ assert(Result->getType() == Ty && "Type specified is not correct!");
+ I = Map.insert(I, std::make_pair(MapKey(Ty, V), Result));
+
+ if (HasLargeKey) // Remember the reverse mapping if needed.
+ InverseMap.insert(std::make_pair(Result, I));
+
+ // If the type of the constant is abstract, make sure that an entry exists
+ // for it in the AbstractTypeMap.
+ if (Ty->isAbstract()) {
+ typename AbstractTypeMapTy::iterator TI = AbstractTypeMap.find(Ty);
+
+ if (TI == AbstractTypeMap.end()) {
+ // Add ourselves to the ATU list of the type.
+ cast<DerivedType>(Ty)->addAbstractTypeUser(this);
+
+ AbstractTypeMap.insert(TI, std::make_pair(Ty, I));
+ }
+ }
+ return Result;
+ }
+
+ void remove(ConstantClass *CP) {
+ typename MapTy::iterator I = FindExistingElement(CP);
+ assert(I != Map.end() && "Constant not found in constant table!");
+ assert(I->second == CP && "Didn't find correct element?");
+
+ if (HasLargeKey) // Remember the reverse mapping if needed.
+ InverseMap.erase(CP);
+
+ // Now that we found the entry, make sure this isn't the entry that
+ // the AbstractTypeMap points to.
+ const TypeClass *Ty = static_cast<const TypeClass *>(I->first.first);
+ if (Ty->isAbstract()) {
+ assert(AbstractTypeMap.count(Ty) &&
+ "Abstract type not in AbstractTypeMap?");
+ typename MapTy::iterator &ATMEntryIt = AbstractTypeMap[Ty];
+ if (ATMEntryIt == I) {
+ // Yes, we are removing the representative entry for this type.
+ // See if there are any other entries of the same type.
+ typename MapTy::iterator TmpIt = ATMEntryIt;
+
+ // First check the entry before this one...
+ if (TmpIt != Map.begin()) {
+ --TmpIt;
+ if (TmpIt->first.first != Ty) // Not the same type, move back...
+ ++TmpIt;
+ }
+
+ // If we didn't find the same type, try to move forward...
+ if (TmpIt == ATMEntryIt) {
+ ++TmpIt;
+ if (TmpIt == Map.end() || TmpIt->first.first != Ty)
+ --TmpIt; // No entry afterwards with the same type
+ }
+
+ // If there is another entry in the map of the same abstract type,
+ // update the AbstractTypeMap entry now.
+ if (TmpIt != ATMEntryIt) {
+ ATMEntryIt = TmpIt;
+ } else {
+ // Otherwise, we are removing the last instance of this type
+ // from the table. Remove from the ATM, and from user list.
+ cast<DerivedType>(Ty)->removeAbstractTypeUser(this);
+ AbstractTypeMap.erase(Ty);
+ }
+ }
+ }
+
+ Map.erase(I);
+ }
+
+
+ /// MoveConstantToNewSlot - If we are about to change C to be the element
+ /// specified by I, update our internal data structures to reflect this
+ /// fact.
+ void MoveConstantToNewSlot(ConstantClass *C, typename MapTy::iterator I) {
+ // First, remove the old location of the specified constant in the map.
+ typename MapTy::iterator OldI = FindExistingElement(C);
+ assert(OldI != Map.end() && "Constant not found in constant table!");
+ assert(OldI->second == C && "Didn't find correct element?");
+
+ // If this constant is the representative element for its abstract type,
+ // update the AbstractTypeMap so that the representative element is I.
+ if (C->getType()->isAbstract()) {
+ typename AbstractTypeMapTy::iterator ATI =
+ AbstractTypeMap.find(C->getType());
+ assert(ATI != AbstractTypeMap.end() &&
+ "Abstract type not in AbstractTypeMap?");
+ if (ATI->second == OldI)
+ ATI->second = I;
+ }
+
+ // Remove the old entry from the map.
+ Map.erase(OldI);
+
+ // Update the inverse map so that we know that this constant is now
+ // located at descriptor I.
+ if (HasLargeKey) {
+ assert(I->second == C && "Bad inversemap entry!");
+ InverseMap[C] = I;
+ }
+ }
+
+ void refineAbstractType(const DerivedType *OldTy, const Type *NewTy) {
+ typename AbstractTypeMapTy::iterator I =
+ AbstractTypeMap.find(cast<Type>(OldTy));
+
+ assert(I != AbstractTypeMap.end() &&
+ "Abstract type not in AbstractTypeMap?");
+
+ // Convert a constant at a time until the last one is gone. The last one
+ // leaving will remove() itself, causing the AbstractTypeMapEntry to be
+ // eliminated eventually.
+ do {
+ ConvertConstantType<ConstantClass,
+ TypeClass>::convert(
+ static_cast<ConstantClass *>(I->second->second),
+ cast<TypeClass>(NewTy));
+
+ I = AbstractTypeMap.find(cast<Type>(OldTy));
+ } while (I != AbstractTypeMap.end());
+ }
+
+ // If the type became concrete without being refined to any other existing
+ // type, we just remove ourselves from the ATU list.
+ void typeBecameConcrete(const DerivedType *AbsTy) {
+ AbsTy->removeAbstractTypeUser(this);
+ }
+
+ void dump() const {
+ DOUT << "Constant.cpp: ValueMap\n";
+ }
+ };
+}
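+
+// Sketch of how the specializations below use ValueMap: each constant kind
+// keeps one static map keyed by (type, value descriptor), and getOrCreate
+// hands back the uniqued node. For instance, the very next map is used as
+//   AggZeroConstants->getOrCreate(Ty, 0);
+// so there is exactly one ConstantAggregateZero per type.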
+
+
+
+//---- ConstantAggregateZero::get() implementation...
+//
+namespace llvm {
+ // ConstantAggregateZero does not take extra "value" argument...
+ template<class ValType>
+ struct ConstantCreator<ConstantAggregateZero, Type, ValType> {
+ static ConstantAggregateZero *create(const Type *Ty, const ValType &V){
+ return new ConstantAggregateZero(Ty);
+ }
+ };
+
+ template<>
+ struct ConvertConstantType<ConstantAggregateZero, Type> {
+ static void convert(ConstantAggregateZero *OldC, const Type *NewTy) {
+ // Make everyone now use a constant of the new type...
+ Constant *New = ConstantAggregateZero::get(NewTy);
+ assert(New != OldC && "Didn't replace constant??");
+ OldC->uncheckedReplaceAllUsesWith(New);
+ OldC->destroyConstant(); // This constant is now dead, destroy it.
+ }
+ };
+}
+
+static ManagedStatic<ValueMap<char, Type,
+ ConstantAggregateZero> > AggZeroConstants;
+
+static char getValType(ConstantAggregateZero *CPZ) { return 0; }
+
+ConstantAggregateZero *ConstantAggregateZero::get(const Type *Ty) {
+ assert((isa<StructType>(Ty) || isa<ArrayType>(Ty) || isa<VectorType>(Ty)) &&
+ "Cannot create an aggregate zero of non-aggregate type!");
+ return AggZeroConstants->getOrCreate(Ty, 0);
+}
+
+/// destroyConstant - Remove the constant from the constant table...
+///
+void ConstantAggregateZero::destroyConstant() {
+ AggZeroConstants->remove(this);
+ destroyConstantImpl();
+}
+
+//---- ConstantArray::get() implementation...
+//
+namespace llvm {
+ template<>
+ struct ConvertConstantType<ConstantArray, ArrayType> {
+ static void convert(ConstantArray *OldC, const ArrayType *NewTy) {
+ // Make everyone now use a constant of the new type...
+ std::vector<Constant*> C;
+ for (unsigned i = 0, e = OldC->getNumOperands(); i != e; ++i)
+ C.push_back(cast<Constant>(OldC->getOperand(i)));
+ Constant *New = ConstantArray::get(NewTy, C);
+ assert(New != OldC && "Didn't replace constant??");
+ OldC->uncheckedReplaceAllUsesWith(New);
+ OldC->destroyConstant(); // This constant is now dead, destroy it.
+ }
+ };
+}
+
+static std::vector<Constant*> getValType(ConstantArray *CA) {
+ std::vector<Constant*> Elements;
+ Elements.reserve(CA->getNumOperands());
+ for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
+ Elements.push_back(cast<Constant>(CA->getOperand(i)));
+ return Elements;
+}
+
+typedef ValueMap<std::vector<Constant*>, ArrayType,
+ ConstantArray, true /*largekey*/> ArrayConstantsTy;
+static ManagedStatic<ArrayConstantsTy> ArrayConstants;
+
+Constant *ConstantArray::get(const ArrayType *Ty,
+ const std::vector<Constant*> &V) {
+ // If this is an all-zero array, return a ConstantAggregateZero object
+ if (!V.empty()) {
+ Constant *C = V[0];
+ if (!C->isNullValue())
+ return ArrayConstants->getOrCreate(Ty, V);
+ for (unsigned i = 1, e = V.size(); i != e; ++i)
+ if (V[i] != C)
+ return ArrayConstants->getOrCreate(Ty, V);
+ }
+ return ConstantAggregateZero::get(Ty);
+}
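+
+// Sketch of the all-zero collapse above (hypothetical element values):
+//   std::vector<Constant*> Z(4, Constant::getNullValue(Type::Int32Ty));
+//   ConstantArray::get(ArrayType::get(Type::Int32Ty, 4), Z);
+// returns a ConstantAggregateZero, never a ConstantArray node.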
+
+/// destroyConstant - Remove the constant from the constant table...
+///
+void ConstantArray::destroyConstant() {
+ ArrayConstants->remove(this);
+ destroyConstantImpl();
+}
+
+/// ConstantArray::get(const string&, bool) - Return an array that is
+/// initialized to contain the specified string. If AddNull is true, a null
+/// terminator is appended so the array may be used as a natural C string;
+/// otherwise the array contains exactly the characters of Str.
+///
+Constant *ConstantArray::get(const std::string &Str, bool AddNull) {
+ std::vector<Constant*> ElementVals;
+ for (unsigned i = 0; i < Str.length(); ++i)
+ ElementVals.push_back(ConstantInt::get(Type::Int8Ty, Str[i]));
+
+ // Add a null terminator to the string...
+ if (AddNull) {
+ ElementVals.push_back(ConstantInt::get(Type::Int8Ty, 0));
+ }
+
+ ArrayType *ATy = ArrayType::get(Type::Int8Ty, ElementVals.size());
+ return ConstantArray::get(ATy, ElementVals);
+}
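+
+// Sketch: ConstantArray::get("hi", true) builds [3 x i8] c"hi\00", suitable
+// for backing a C string, while ConstantArray::get("hi", false) builds the
+// unterminated [2 x i8] c"hi".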
+
+/// isString - This method returns true if the array is an array of i8, and
+/// if the elements of the array are all ConstantInts.
+bool ConstantArray::isString() const {
+ // Check the element type for i8...
+ if (getType()->getElementType() != Type::Int8Ty)
+ return false;
+ // Check the elements to make sure they are all integers, not constant
+ // expressions.
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (!isa<ConstantInt>(getOperand(i)))
+ return false;
+ return true;
+}
+
+/// isCString - This method returns true if the array is a string (see
+/// isString), ends in a null byte \\0, and contains no other null bytes.
+bool ConstantArray::isCString() const {
+ // Check the element type for i8...
+ if (getType()->getElementType() != Type::Int8Ty)
+ return false;
+ Constant *Zero = Constant::getNullValue(getOperand(0)->getType());
+ // Last element must be a null.
+ if (getOperand(getNumOperands()-1) != Zero)
+ return false;
+ // Other elements must be non-null integers.
+ for (unsigned i = 0, e = getNumOperands()-1; i != e; ++i) {
+ if (!isa<ConstantInt>(getOperand(i)))
+ return false;
+ if (getOperand(i) == Zero)
+ return false;
+ }
+ return true;
+}
+
+
+/// getAsString - If the element type of this array is i8, then this method
+/// converts the array to a std::string and returns it. Otherwise, it
+/// asserts.
+///
+std::string ConstantArray::getAsString() const {
+ assert(isString() && "Not a string!");
+ std::string Result;
+ Result.reserve(getNumOperands());
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ Result.push_back((char)cast<ConstantInt>(getOperand(i))->getZExtValue());
+ return Result;
+}
+
+
+//---- ConstantStruct::get() implementation...
+//
+
+namespace llvm {
+ template<>
+ struct ConvertConstantType<ConstantStruct, StructType> {
+ static void convert(ConstantStruct *OldC, const StructType *NewTy) {
+ // Make everyone now use a constant of the new type...
+ std::vector<Constant*> C;
+ for (unsigned i = 0, e = OldC->getNumOperands(); i != e; ++i)
+ C.push_back(cast<Constant>(OldC->getOperand(i)));
+ Constant *New = ConstantStruct::get(NewTy, C);
+ assert(New != OldC && "Didn't replace constant??");
+
+ OldC->uncheckedReplaceAllUsesWith(New);
+ OldC->destroyConstant(); // This constant is now dead, destroy it.
+ }
+ };
+}
+
+typedef ValueMap<std::vector<Constant*>, StructType,
+ ConstantStruct, true /*largekey*/> StructConstantsTy;
+static ManagedStatic<StructConstantsTy> StructConstants;
+
+static std::vector<Constant*> getValType(ConstantStruct *CS) {
+ std::vector<Constant*> Elements;
+ Elements.reserve(CS->getNumOperands());
+ for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i)
+ Elements.push_back(cast<Constant>(CS->getOperand(i)));
+ return Elements;
+}
+
+Constant *ConstantStruct::get(const StructType *Ty,
+ const std::vector<Constant*> &V) {
+ // Create a ConstantAggregateZero value if all elements are zeros...
+ for (unsigned i = 0, e = V.size(); i != e; ++i)
+ if (!V[i]->isNullValue())
+ return StructConstants->getOrCreate(Ty, V);
+
+ return ConstantAggregateZero::get(Ty);
+}
+
+Constant *ConstantStruct::get(const std::vector<Constant*> &V, bool packed) {
+ std::vector<const Type*> StructEls;
+ StructEls.reserve(V.size());
+ for (unsigned i = 0, e = V.size(); i != e; ++i)
+ StructEls.push_back(V[i]->getType());
+ return get(StructType::get(StructEls, packed), V);
+}
+
+// destroyConstant - Remove the constant from the constant table...
+//
+void ConstantStruct::destroyConstant() {
+ StructConstants->remove(this);
+ destroyConstantImpl();
+}
+
+//---- ConstantVector::get() implementation...
+//
+namespace llvm {
+ template<>
+ struct ConvertConstantType<ConstantVector, VectorType> {
+ static void convert(ConstantVector *OldC, const VectorType *NewTy) {
+ // Make everyone now use a constant of the new type...
+ std::vector<Constant*> C;
+ for (unsigned i = 0, e = OldC->getNumOperands(); i != e; ++i)
+ C.push_back(cast<Constant>(OldC->getOperand(i)));
+ Constant *New = ConstantVector::get(NewTy, C);
+ assert(New != OldC && "Didn't replace constant??");
+ OldC->uncheckedReplaceAllUsesWith(New);
+ OldC->destroyConstant(); // This constant is now dead, destroy it.
+ }
+ };
+}
+
+static std::vector<Constant*> getValType(ConstantVector *CP) {
+ std::vector<Constant*> Elements;
+ Elements.reserve(CP->getNumOperands());
+ for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
+ Elements.push_back(CP->getOperand(i));
+ return Elements;
+}
+
+static ManagedStatic<ValueMap<std::vector<Constant*>, VectorType,
+ ConstantVector> > VectorConstants;
+
+Constant *ConstantVector::get(const VectorType *Ty,
+ const std::vector<Constant*> &V) {
+ assert(!V.empty() && "Vectors can't be empty");
+  // If this is an all-undef or all-zero vector, return a
+ // ConstantAggregateZero or UndefValue.
+ Constant *C = V[0];
+ bool isZero = C->isNullValue();
+ bool isUndef = isa<UndefValue>(C);
+
+ if (isZero || isUndef) {
+ for (unsigned i = 1, e = V.size(); i != e; ++i)
+ if (V[i] != C) {
+ isZero = isUndef = false;
+ break;
+ }
+ }
+
+ if (isZero)
+ return ConstantAggregateZero::get(Ty);
+ if (isUndef)
+ return UndefValue::get(Ty);
+ return VectorConstants->getOrCreate(Ty, V);
+}
+
+Constant *ConstantVector::get(const std::vector<Constant*> &V) {
+ assert(!V.empty() && "Cannot infer type if V is empty");
+  return get(VectorType::get(V.front()->getType(), V.size()), V);
+}
+
+// destroyConstant - Remove the constant from the constant table...
+//
+void ConstantVector::destroyConstant() {
+ VectorConstants->remove(this);
+ destroyConstantImpl();
+}
+
+/// This function will return true iff every element in this vector constant
+/// is set to all ones.
+/// @returns true iff this constant's elements are all set to all ones.
+/// @brief Determine if the value is all ones.
+bool ConstantVector::isAllOnesValue() const {
+ // Check out first element.
+ const Constant *Elt = getOperand(0);
+ const ConstantInt *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI || !CI->isAllOnesValue()) return false;
+ // Then make sure all remaining elements point to the same value.
+ for (unsigned I = 1, E = getNumOperands(); I < E; ++I) {
+ if (getOperand(I) != Elt) return false;
+ }
+ return true;
+}
+
+/// getSplatValue - If this is a splat constant, where all of the
+/// elements have the same value, return that value. Otherwise return null.
+Constant *ConstantVector::getSplatValue() {
+ // Check out first element.
+ Constant *Elt = getOperand(0);
+ // Then make sure all remaining elements point to the same value.
+ for (unsigned I = 1, E = getNumOperands(); I < E; ++I)
+ if (getOperand(I) != Elt) return 0;
+ return Elt;
+}
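+
+// Illustrative behavior (editorial note): for the splat constant
+// <4 x i32> <i32 7, i32 7, i32 7, i32 7>, getSplatValue() returns the i32 7
+// constant; for <2 x i32> <i32 1, i32 2> it returns null.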
+
+//---- ConstantPointerNull::get() implementation...
+//
+
+namespace llvm {
+ // ConstantPointerNull does not take extra "value" argument...
+ template<class ValType>
+ struct ConstantCreator<ConstantPointerNull, PointerType, ValType> {
+ static ConstantPointerNull *create(const PointerType *Ty, const ValType &V){
+ return new ConstantPointerNull(Ty);
+ }
+ };
+
+ template<>
+ struct ConvertConstantType<ConstantPointerNull, PointerType> {
+ static void convert(ConstantPointerNull *OldC, const PointerType *NewTy) {
+ // Make everyone now use a constant of the new type...
+ Constant *New = ConstantPointerNull::get(NewTy);
+ assert(New != OldC && "Didn't replace constant??");
+ OldC->uncheckedReplaceAllUsesWith(New);
+ OldC->destroyConstant(); // This constant is now dead, destroy it.
+ }
+ };
+}
+
+static ManagedStatic<ValueMap<char, PointerType,
+ ConstantPointerNull> > NullPtrConstants;
+
+static char getValType(ConstantPointerNull *) {
+ return 0;
+}
+
+
+ConstantPointerNull *ConstantPointerNull::get(const PointerType *Ty) {
+ return NullPtrConstants->getOrCreate(Ty, 0);
+}
+
+// destroyConstant - Remove the constant from the constant table...
+//
+void ConstantPointerNull::destroyConstant() {
+ NullPtrConstants->remove(this);
+ destroyConstantImpl();
+}
+
+
+//---- UndefValue::get() implementation...
+//
+
+namespace llvm {
+ // UndefValue does not take extra "value" argument...
+ template<class ValType>
+ struct ConstantCreator<UndefValue, Type, ValType> {
+ static UndefValue *create(const Type *Ty, const ValType &V) {
+ return new UndefValue(Ty);
+ }
+ };
+
+ template<>
+ struct ConvertConstantType<UndefValue, Type> {
+ static void convert(UndefValue *OldC, const Type *NewTy) {
+ // Make everyone now use a constant of the new type.
+ Constant *New = UndefValue::get(NewTy);
+ assert(New != OldC && "Didn't replace constant??");
+ OldC->uncheckedReplaceAllUsesWith(New);
+ OldC->destroyConstant(); // This constant is now dead, destroy it.
+ }
+ };
+}
+
+static ManagedStatic<ValueMap<char, Type, UndefValue> > UndefValueConstants;
+
+static char getValType(UndefValue *) {
+ return 0;
+}
+
+
+UndefValue *UndefValue::get(const Type *Ty) {
+ return UndefValueConstants->getOrCreate(Ty, 0);
+}
+
+// destroyConstant - Remove the constant from the constant table.
+//
+void UndefValue::destroyConstant() {
+ UndefValueConstants->remove(this);
+ destroyConstantImpl();
+}
+
+//---- MDString::get() implementation
+//
+
+MDString::MDString(const char *begin, const char *end)
+ : Constant(Type::MetadataTy, MDStringVal, 0, 0),
+ StrBegin(begin), StrEnd(end) {}
+
+static ManagedStatic<StringMap<MDString*> > MDStringCache;
+
+MDString *MDString::get(const char *StrBegin, const char *StrEnd) {
+ StringMapEntry<MDString *> &Entry = MDStringCache->GetOrCreateValue(StrBegin,
+ StrEnd);
+ MDString *&S = Entry.getValue();
+ if (!S) S = new MDString(Entry.getKeyData(),
+ Entry.getKeyData() + Entry.getKeyLength());
+ return S;
+}
+
+void MDString::destroyConstant() {
+ MDStringCache->erase(MDStringCache->find(StrBegin, StrEnd));
+ destroyConstantImpl();
+}
+
+//---- MDNode::get() implementation
+//
+
+static ManagedStatic<FoldingSet<MDNode> > MDNodeSet;
+
+MDNode::MDNode(Value*const* Vals, unsigned NumVals)
+ : Constant(Type::MetadataTy, MDNodeVal, 0, 0) {
+ for (unsigned i = 0; i != NumVals; ++i)
+ Node.push_back(ElementVH(Vals[i], this));
+}
+
+void MDNode::Profile(FoldingSetNodeID &ID) const {
+ for (const_elem_iterator I = elem_begin(), E = elem_end(); I != E; ++I)
+ ID.AddPointer(*I);
+}
+
+MDNode *MDNode::get(Value*const* Vals, unsigned NumVals) {
+ FoldingSetNodeID ID;
+ for (unsigned i = 0; i != NumVals; ++i)
+ ID.AddPointer(Vals[i]);
+
+ void *InsertPoint;
+ if (MDNode *N = MDNodeSet->FindNodeOrInsertPos(ID, InsertPoint))
+ return N;
+
+ // InsertPoint will have been set by the FindNodeOrInsertPos call.
+ MDNode *N = new(0) MDNode(Vals, NumVals);
+ MDNodeSet->InsertNode(N, InsertPoint);
+ return N;
+}
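+
+// Editorial note: nodes are uniqued through the FoldingSet, so two calls
+// with identical operand lists return the same node:
+//   MDNode *A = MDNode::get(Vals, NumVals);
+//   MDNode *B = MDNode::get(Vals, NumVals);
+//   assert(A == B && "MDNodes are uniqued");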
+
+void MDNode::destroyConstant() {
+ MDNodeSet->RemoveNode(this);
+ destroyConstantImpl();
+}
+
+//---- ConstantExpr::get() implementations...
+//
+
+namespace {
+
+struct ExprMapKeyType {
+ typedef SmallVector<unsigned, 4> IndexList;
+
+ ExprMapKeyType(unsigned opc,
+ const std::vector<Constant*> &ops,
+ unsigned short pred = 0,
+ const IndexList &inds = IndexList())
+ : opcode(opc), predicate(pred), operands(ops), indices(inds) {}
+ uint16_t opcode;
+ uint16_t predicate;
+ std::vector<Constant*> operands;
+ IndexList indices;
+ bool operator==(const ExprMapKeyType& that) const {
+ return this->opcode == that.opcode &&
+ this->predicate == that.predicate &&
+ this->operands == that.operands &&
+ this->indices == that.indices;
+ }
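+  // Strict weak ordering, lexicographic over (opcode, predicate, operands,
+  // indices); needed so keys can live in an ordered map (editorial note).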
+ bool operator<(const ExprMapKeyType & that) const {
+ return this->opcode < that.opcode ||
+ (this->opcode == that.opcode && this->predicate < that.predicate) ||
+ (this->opcode == that.opcode && this->predicate == that.predicate &&
+ this->operands < that.operands) ||
+ (this->opcode == that.opcode && this->predicate == that.predicate &&
+ this->operands == that.operands && this->indices < that.indices);
+ }
+
+ bool operator!=(const ExprMapKeyType& that) const {
+ return !(*this == that);
+ }
+};
+
+}
+
+namespace llvm {
+ template<>
+ struct ConstantCreator<ConstantExpr, Type, ExprMapKeyType> {
+ static ConstantExpr *create(const Type *Ty, const ExprMapKeyType &V,
+ unsigned short pred = 0) {
+ if (Instruction::isCast(V.opcode))
+ return new UnaryConstantExpr(V.opcode, V.operands[0], Ty);
+ if ((V.opcode >= Instruction::BinaryOpsBegin &&
+ V.opcode < Instruction::BinaryOpsEnd))
+ return new BinaryConstantExpr(V.opcode, V.operands[0], V.operands[1]);
+ if (V.opcode == Instruction::Select)
+ return new SelectConstantExpr(V.operands[0], V.operands[1],
+ V.operands[2]);
+ if (V.opcode == Instruction::ExtractElement)
+ return new ExtractElementConstantExpr(V.operands[0], V.operands[1]);
+ if (V.opcode == Instruction::InsertElement)
+ return new InsertElementConstantExpr(V.operands[0], V.operands[1],
+ V.operands[2]);
+ if (V.opcode == Instruction::ShuffleVector)
+ return new ShuffleVectorConstantExpr(V.operands[0], V.operands[1],
+ V.operands[2]);
+ if (V.opcode == Instruction::InsertValue)
+ return new InsertValueConstantExpr(V.operands[0], V.operands[1],
+ V.indices, Ty);
+ if (V.opcode == Instruction::ExtractValue)
+ return new ExtractValueConstantExpr(V.operands[0], V.indices, Ty);
+ if (V.opcode == Instruction::GetElementPtr) {
+ std::vector<Constant*> IdxList(V.operands.begin()+1, V.operands.end());
+ return GetElementPtrConstantExpr::Create(V.operands[0], IdxList, Ty);
+ }
+
+ // The compare instructions are weird. We have to encode the predicate
+ // value and it is combined with the instruction opcode by multiplying
+ // the opcode by one hundred. We must decode this to get the predicate.
+ if (V.opcode == Instruction::ICmp)
+ return new CompareConstantExpr(Ty, Instruction::ICmp, V.predicate,
+ V.operands[0], V.operands[1]);
+ if (V.opcode == Instruction::FCmp)
+ return new CompareConstantExpr(Ty, Instruction::FCmp, V.predicate,
+ V.operands[0], V.operands[1]);
+ if (V.opcode == Instruction::VICmp)
+ return new CompareConstantExpr(Ty, Instruction::VICmp, V.predicate,
+ V.operands[0], V.operands[1]);
+ if (V.opcode == Instruction::VFCmp)
+ return new CompareConstantExpr(Ty, Instruction::VFCmp, V.predicate,
+ V.operands[0], V.operands[1]);
+ assert(0 && "Invalid ConstantExpr!");
+ return 0;
+ }
+ };
+
+ template<>
+ struct ConvertConstantType<ConstantExpr, Type> {
+ static void convert(ConstantExpr *OldC, const Type *NewTy) {
+ Constant *New;
+ switch (OldC->getOpcode()) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ New = ConstantExpr::getCast(OldC->getOpcode(), OldC->getOperand(0),
+ NewTy);
+ break;
+ case Instruction::Select:
+ New = ConstantExpr::getSelectTy(NewTy, OldC->getOperand(0),
+ OldC->getOperand(1),
+ OldC->getOperand(2));
+ break;
+ default:
+ assert(OldC->getOpcode() >= Instruction::BinaryOpsBegin &&
+ OldC->getOpcode() < Instruction::BinaryOpsEnd);
+ New = ConstantExpr::getTy(NewTy, OldC->getOpcode(), OldC->getOperand(0),
+ OldC->getOperand(1));
+ break;
+ case Instruction::GetElementPtr:
+ // Make everyone now use a constant of the new type...
+ std::vector<Value*> Idx(OldC->op_begin()+1, OldC->op_end());
+ New = ConstantExpr::getGetElementPtrTy(NewTy, OldC->getOperand(0),
+ &Idx[0], Idx.size());
+ break;
+ }
+
+ assert(New != OldC && "Didn't replace constant??");
+ OldC->uncheckedReplaceAllUsesWith(New);
+ OldC->destroyConstant(); // This constant is now dead, destroy it.
+ }
+ };
+} // end namespace llvm
+
+
+static ExprMapKeyType getValType(ConstantExpr *CE) {
+ std::vector<Constant*> Operands;
+ Operands.reserve(CE->getNumOperands());
+ for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i)
+ Operands.push_back(cast<Constant>(CE->getOperand(i)));
+ return ExprMapKeyType(CE->getOpcode(), Operands,
+ CE->isCompare() ? CE->getPredicate() : 0,
+ CE->hasIndices() ?
+ CE->getIndices() : SmallVector<unsigned, 4>());
+}
+
+static ManagedStatic<ValueMap<ExprMapKeyType, Type,
+ ConstantExpr> > ExprConstants;
+
+/// This is a utility function to handle folding of casts and lookup of the
+/// cast in the ExprConstants map. It is used by the various get* methods below.
+static inline Constant *getFoldedCast(
+ Instruction::CastOps opc, Constant *C, const Type *Ty) {
+ assert(Ty->isFirstClassType() && "Cannot cast to an aggregate type!");
+ // Fold a few common cases
+ if (Constant *FC = ConstantFoldCastInstruction(opc, C, Ty))
+ return FC;
+
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> argVec(1, C);
+ ExprMapKeyType Key(opc, argVec);
+ return ExprConstants->getOrCreate(Ty, Key);
+}
+
+Constant *ConstantExpr::getCast(unsigned oc, Constant *C, const Type *Ty) {
+ Instruction::CastOps opc = Instruction::CastOps(oc);
+ assert(Instruction::isCast(opc) && "opcode out of range");
+ assert(C && Ty && "Null arguments to getCast");
+ assert(Ty->isFirstClassType() && "Cannot cast to an aggregate type!");
+
+ switch (opc) {
+ default:
+ assert(0 && "Invalid cast opcode");
+ break;
+ case Instruction::Trunc: return getTrunc(C, Ty);
+ case Instruction::ZExt: return getZExt(C, Ty);
+ case Instruction::SExt: return getSExt(C, Ty);
+ case Instruction::FPTrunc: return getFPTrunc(C, Ty);
+ case Instruction::FPExt: return getFPExtend(C, Ty);
+ case Instruction::UIToFP: return getUIToFP(C, Ty);
+ case Instruction::SIToFP: return getSIToFP(C, Ty);
+ case Instruction::FPToUI: return getFPToUI(C, Ty);
+ case Instruction::FPToSI: return getFPToSI(C, Ty);
+ case Instruction::PtrToInt: return getPtrToInt(C, Ty);
+ case Instruction::IntToPtr: return getIntToPtr(C, Ty);
+ case Instruction::BitCast: return getBitCast(C, Ty);
+ }
+ return 0;
+}
+
+Constant *ConstantExpr::getZExtOrBitCast(Constant *C, const Type *Ty) {
+ if (C->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return getCast(Instruction::BitCast, C, Ty);
+ return getCast(Instruction::ZExt, C, Ty);
+}
+
+Constant *ConstantExpr::getSExtOrBitCast(Constant *C, const Type *Ty) {
+ if (C->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return getCast(Instruction::BitCast, C, Ty);
+ return getCast(Instruction::SExt, C, Ty);
+}
+
+Constant *ConstantExpr::getTruncOrBitCast(Constant *C, const Type *Ty) {
+ if (C->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return getCast(Instruction::BitCast, C, Ty);
+ return getCast(Instruction::Trunc, C, Ty);
+}
+
+Constant *ConstantExpr::getPointerCast(Constant *S, const Type *Ty) {
+ assert(isa<PointerType>(S->getType()) && "Invalid cast");
+ assert((Ty->isInteger() || isa<PointerType>(Ty)) && "Invalid cast");
+
+ if (Ty->isInteger())
+ return getCast(Instruction::PtrToInt, S, Ty);
+ return getCast(Instruction::BitCast, S, Ty);
+}
+
+Constant *ConstantExpr::getIntegerCast(Constant *C, const Type *Ty,
+ bool isSigned) {
+ assert(C->getType()->isInteger() && Ty->isInteger() && "Invalid cast");
+ unsigned SrcBits = C->getType()->getPrimitiveSizeInBits();
+ unsigned DstBits = Ty->getPrimitiveSizeInBits();
+ Instruction::CastOps opcode =
+ (SrcBits == DstBits ? Instruction::BitCast :
+ (SrcBits > DstBits ? Instruction::Trunc :
+ (isSigned ? Instruction::SExt : Instruction::ZExt)));
+ return getCast(opcode, C, Ty);
+}
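+
+// Editorial example: casting an i8 constant to i32 with isSigned=true
+// selects SExt and isSigned=false selects ZExt; i32 to i8 selects Trunc;
+// i32 to i32 becomes a no-op BitCast that simply returns the operand.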
+
+Constant *ConstantExpr::getFPCast(Constant *C, const Type *Ty) {
+ assert(C->getType()->isFloatingPoint() && Ty->isFloatingPoint() &&
+ "Invalid cast");
+ unsigned SrcBits = C->getType()->getPrimitiveSizeInBits();
+ unsigned DstBits = Ty->getPrimitiveSizeInBits();
+ if (SrcBits == DstBits)
+ return C; // Avoid a useless cast
+ Instruction::CastOps opcode =
+ (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt);
+ return getCast(opcode, C, Ty);
+}
+
+Constant *ConstantExpr::getTrunc(Constant *C, const Type *Ty) {
+ assert(C->getType()->isInteger() && "Trunc operand must be integer");
+  assert(Ty->isInteger() && "Trunc produces only integer");
+ assert(C->getType()->getPrimitiveSizeInBits() > Ty->getPrimitiveSizeInBits()&&
+ "SrcTy must be larger than DestTy for Trunc!");
+
+ return getFoldedCast(Instruction::Trunc, C, Ty);
+}
+
+Constant *ConstantExpr::getSExt(Constant *C, const Type *Ty) {
+  assert(C->getType()->isInteger() && "SExt operand must be integral");
+ assert(Ty->isInteger() && "SExt produces only integer");
+ assert(C->getType()->getPrimitiveSizeInBits() < Ty->getPrimitiveSizeInBits()&&
+ "SrcTy must be smaller than DestTy for SExt!");
+
+ return getFoldedCast(Instruction::SExt, C, Ty);
+}
+
+Constant *ConstantExpr::getZExt(Constant *C, const Type *Ty) {
+  assert(C->getType()->isInteger() && "ZExt operand must be integral");
+ assert(Ty->isInteger() && "ZExt produces only integer");
+ assert(C->getType()->getPrimitiveSizeInBits() < Ty->getPrimitiveSizeInBits()&&
+ "SrcTy must be smaller than DestTy for ZExt!");
+
+ return getFoldedCast(Instruction::ZExt, C, Ty);
+}
+
+Constant *ConstantExpr::getFPTrunc(Constant *C, const Type *Ty) {
+ assert(C->getType()->isFloatingPoint() && Ty->isFloatingPoint() &&
+ C->getType()->getPrimitiveSizeInBits() > Ty->getPrimitiveSizeInBits()&&
+ "This is an illegal floating point truncation!");
+ return getFoldedCast(Instruction::FPTrunc, C, Ty);
+}
+
+Constant *ConstantExpr::getFPExtend(Constant *C, const Type *Ty) {
+ assert(C->getType()->isFloatingPoint() && Ty->isFloatingPoint() &&
+ C->getType()->getPrimitiveSizeInBits() < Ty->getPrimitiveSizeInBits()&&
+ "This is an illegal floating point extension!");
+ return getFoldedCast(Instruction::FPExt, C, Ty);
+}
+
+Constant *ConstantExpr::getUIToFP(Constant *C, const Type *Ty) {
+#ifndef NDEBUG
+ bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+ bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert between scalar and vector");
+ assert(C->getType()->isIntOrIntVector() && Ty->isFPOrFPVector() &&
+ "This is an illegal uint to floating point cast!");
+ return getFoldedCast(Instruction::UIToFP, C, Ty);
+}
+
+Constant *ConstantExpr::getSIToFP(Constant *C, const Type *Ty) {
+#ifndef NDEBUG
+ bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+ bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert between scalar and vector");
+ assert(C->getType()->isIntOrIntVector() && Ty->isFPOrFPVector() &&
+ "This is an illegal sint to floating point cast!");
+ return getFoldedCast(Instruction::SIToFP, C, Ty);
+}
+
+Constant *ConstantExpr::getFPToUI(Constant *C, const Type *Ty) {
+#ifndef NDEBUG
+ bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+ bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert between scalar and vector");
+ assert(C->getType()->isFPOrFPVector() && Ty->isIntOrIntVector() &&
+ "This is an illegal floating point to uint cast!");
+ return getFoldedCast(Instruction::FPToUI, C, Ty);
+}
+
+Constant *ConstantExpr::getFPToSI(Constant *C, const Type *Ty) {
+#ifndef NDEBUG
+ bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+ bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert between scalar and vector");
+ assert(C->getType()->isFPOrFPVector() && Ty->isIntOrIntVector() &&
+ "This is an illegal floating point to sint cast!");
+ return getFoldedCast(Instruction::FPToSI, C, Ty);
+}
+
+Constant *ConstantExpr::getPtrToInt(Constant *C, const Type *DstTy) {
+ assert(isa<PointerType>(C->getType()) && "PtrToInt source must be pointer");
+ assert(DstTy->isInteger() && "PtrToInt destination must be integral");
+ return getFoldedCast(Instruction::PtrToInt, C, DstTy);
+}
+
+Constant *ConstantExpr::getIntToPtr(Constant *C, const Type *DstTy) {
+ assert(C->getType()->isInteger() && "IntToPtr source must be integral");
+ assert(isa<PointerType>(DstTy) && "IntToPtr destination must be a pointer");
+ return getFoldedCast(Instruction::IntToPtr, C, DstTy);
+}
+
+Constant *ConstantExpr::getBitCast(Constant *C, const Type *DstTy) {
+ // BitCast implies a no-op cast of type only. No bits change. However, you
+ // can't cast pointers to anything but pointers.
+#ifndef NDEBUG
+ const Type *SrcTy = C->getType();
+ assert((isa<PointerType>(SrcTy) == isa<PointerType>(DstTy)) &&
+ "BitCast cannot cast pointer to non-pointer and vice versa");
+
+ // Now we know we're not dealing with mismatched pointer casts (ptr->nonptr
+ // or nonptr->ptr). For all the other types, the cast is okay if source and
+ // destination bit widths are identical.
+ unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DstBitSize = DstTy->getPrimitiveSizeInBits();
+#endif
+ assert(SrcBitSize == DstBitSize && "BitCast requires types of same width");
+
+ // It is common to ask for a bitcast of a value to its own type, handle this
+ // speedily.
+ if (C->getType() == DstTy) return C;
+
+ return getFoldedCast(Instruction::BitCast, C, DstTy);
+}
+
+Constant *ConstantExpr::getAlignOf(const Type *Ty) {
+  // alignof is implemented as: (i32) gep ({i8,Ty}*)null, 0, 1
+ const Type *AligningTy = StructType::get(Type::Int8Ty, Ty, NULL);
+ Constant *NullPtr = getNullValue(AligningTy->getPointerTo());
+ Constant *Zero = ConstantInt::get(Type::Int32Ty, 0);
+ Constant *One = ConstantInt::get(Type::Int32Ty, 1);
+ Constant *Indices[2] = { Zero, One };
+ Constant *GEP = getGetElementPtr(NullPtr, Indices, 2);
+ return getCast(Instruction::PtrToInt, GEP, Type::Int32Ty);
+}
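+
+// Editorial note: in the non-packed struct {i8, Ty}, the second field is
+// placed at the first offset after the leading i8 that satisfies Ty's
+// alignment, so the gep from a null base evaluates to exactly alignof(Ty).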
+
+Constant *ConstantExpr::getSizeOf(const Type *Ty) {
+ // sizeof is implemented as: (i64) gep (Ty*)null, 1
+ Constant *GEPIdx = ConstantInt::get(Type::Int32Ty, 1);
+ Constant *GEP =
+ getGetElementPtr(getNullValue(PointerType::getUnqual(Ty)), &GEPIdx, 1);
+ return getCast(Instruction::PtrToInt, GEP, Type::Int64Ty);
+}
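+
+// Editorial note: element 1 of a Ty array based at null lives at offset
+// 1 * sizeof(Ty), so the ptrtoint of this gep folds to the type's
+// allocation size.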
+
+Constant *ConstantExpr::getTy(const Type *ReqTy, unsigned Opcode,
+ Constant *C1, Constant *C2) {
+ // Check the operands for consistency first
+ assert(Opcode >= Instruction::BinaryOpsBegin &&
+ Opcode < Instruction::BinaryOpsEnd &&
+ "Invalid opcode in binary constant expression");
+ assert(C1->getType() == C2->getType() &&
+ "Operand types in binary constant expression should match");
+
+ if (ReqTy == C1->getType() || ReqTy == Type::Int1Ty)
+ if (Constant *FC = ConstantFoldBinaryInstruction(Opcode, C1, C2))
+ return FC; // Fold a few common cases...
+
+ std::vector<Constant*> argVec(1, C1); argVec.push_back(C2);
+ ExprMapKeyType Key(Opcode, argVec);
+ return ExprConstants->getOrCreate(ReqTy, Key);
+}
+
+Constant *ConstantExpr::getCompareTy(unsigned short predicate,
+ Constant *C1, Constant *C2) {
+ bool isVectorType = C1->getType()->getTypeID() == Type::VectorTyID;
+ switch (predicate) {
+ default: assert(0 && "Invalid CmpInst predicate");
+ case CmpInst::FCMP_FALSE: case CmpInst::FCMP_OEQ: case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLT: case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ONE: case CmpInst::FCMP_ORD: case CmpInst::FCMP_UNO:
+ case CmpInst::FCMP_UEQ: case CmpInst::FCMP_UGT: case CmpInst::FCMP_UGE:
+ case CmpInst::FCMP_ULT: case CmpInst::FCMP_ULE: case CmpInst::FCMP_UNE:
+ case CmpInst::FCMP_TRUE:
+ return isVectorType ? getVFCmp(predicate, C1, C2)
+ : getFCmp(predicate, C1, C2);
+ case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return isVectorType ? getVICmp(predicate, C1, C2)
+ : getICmp(predicate, C1, C2);
+ }
+}
+
+Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2) {
+#ifndef NDEBUG
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ assert(C1->getType() == C2->getType() && "Op types should be identical!");
+ assert((C1->getType()->isInteger() || C1->getType()->isFloatingPoint() ||
+ isa<VectorType>(C1->getType())) &&
+ "Tried to create an arithmetic operation on a non-arithmetic type!");
+ break;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ assert(C1->getType() == C2->getType() && "Op types should be identical!");
+ assert((C1->getType()->isInteger() || (isa<VectorType>(C1->getType()) &&
+ cast<VectorType>(C1->getType())->getElementType()->isInteger())) &&
+ "Tried to create an arithmetic operation on a non-arithmetic type!");
+ break;
+ case Instruction::FDiv:
+ assert(C1->getType() == C2->getType() && "Op types should be identical!");
+ assert((C1->getType()->isFloatingPoint() || (isa<VectorType>(C1->getType())
+ && cast<VectorType>(C1->getType())->getElementType()->isFloatingPoint()))
+ && "Tried to create an arithmetic operation on a non-arithmetic type!");
+ break;
+ case Instruction::URem:
+ case Instruction::SRem:
+ assert(C1->getType() == C2->getType() && "Op types should be identical!");
+ assert((C1->getType()->isInteger() || (isa<VectorType>(C1->getType()) &&
+ cast<VectorType>(C1->getType())->getElementType()->isInteger())) &&
+ "Tried to create an arithmetic operation on a non-arithmetic type!");
+ break;
+ case Instruction::FRem:
+ assert(C1->getType() == C2->getType() && "Op types should be identical!");
+ assert((C1->getType()->isFloatingPoint() || (isa<VectorType>(C1->getType())
+ && cast<VectorType>(C1->getType())->getElementType()->isFloatingPoint()))
+ && "Tried to create an arithmetic operation on a non-arithmetic type!");
+ break;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ assert(C1->getType() == C2->getType() && "Op types should be identical!");
+ assert((C1->getType()->isInteger() || isa<VectorType>(C1->getType())) &&
+ "Tried to create a logical operation on a non-integral type!");
+ break;
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ assert(C1->getType() == C2->getType() && "Op types should be identical!");
+ assert(C1->getType()->isIntOrIntVector() &&
+ "Tried to create a shift operation on a non-integer type!");
+ break;
+ default:
+ break;
+ }
+#endif
+
+ return getTy(C1->getType(), Opcode, C1, C2);
+}
+
+Constant *ConstantExpr::getCompare(unsigned short pred,
+ Constant *C1, Constant *C2) {
+ assert(C1->getType() == C2->getType() && "Op types should be identical!");
+ return getCompareTy(pred, C1, C2);
+}
+
+Constant *ConstantExpr::getSelectTy(const Type *ReqTy, Constant *C,
+ Constant *V1, Constant *V2) {
+  assert(!SelectInst::areInvalidOperands(C, V1, V2) &&
+         "Invalid select operands");
+
+ if (ReqTy == V1->getType())
+ if (Constant *SC = ConstantFoldSelectInstruction(C, V1, V2))
+ return SC; // Fold common cases
+
+ std::vector<Constant*> argVec(3, C);
+ argVec[1] = V1;
+ argVec[2] = V2;
+ ExprMapKeyType Key(Instruction::Select, argVec);
+ return ExprConstants->getOrCreate(ReqTy, Key);
+}
+
+Constant *ConstantExpr::getGetElementPtrTy(const Type *ReqTy, Constant *C,
+ Value* const *Idxs,
+ unsigned NumIdx) {
+ assert(GetElementPtrInst::getIndexedType(C->getType(), Idxs,
+ Idxs+NumIdx) ==
+ cast<PointerType>(ReqTy)->getElementType() &&
+ "GEP indices invalid!");
+
+ if (Constant *FC = ConstantFoldGetElementPtr(C, (Constant**)Idxs, NumIdx))
+ return FC; // Fold a few common cases...
+
+ assert(isa<PointerType>(C->getType()) &&
+ "Non-pointer type for constant GetElementPtr expression");
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> ArgVec;
+ ArgVec.reserve(NumIdx+1);
+ ArgVec.push_back(C);
+ for (unsigned i = 0; i != NumIdx; ++i)
+ ArgVec.push_back(cast<Constant>(Idxs[i]));
+ const ExprMapKeyType Key(Instruction::GetElementPtr, ArgVec);
+ return ExprConstants->getOrCreate(ReqTy, Key);
+}
+
+Constant *ConstantExpr::getGetElementPtr(Constant *C, Value* const *Idxs,
+ unsigned NumIdx) {
+ // Get the result type of the getelementptr!
+ const Type *Ty =
+ GetElementPtrInst::getIndexedType(C->getType(), Idxs, Idxs+NumIdx);
+ assert(Ty && "GEP indices invalid!");
+ unsigned As = cast<PointerType>(C->getType())->getAddressSpace();
+ return getGetElementPtrTy(PointerType::get(Ty, As), C, Idxs, NumIdx);
+}
+
+Constant *ConstantExpr::getGetElementPtr(Constant *C, Constant* const *Idxs,
+ unsigned NumIdx) {
+ return getGetElementPtr(C, (Value* const *)Idxs, NumIdx);
+}
+
+
+Constant *
+ConstantExpr::getICmp(unsigned short pred, Constant* LHS, Constant* RHS) {
+ assert(LHS->getType() == RHS->getType());
+ assert(pred >= ICmpInst::FIRST_ICMP_PREDICATE &&
+ pred <= ICmpInst::LAST_ICMP_PREDICATE && "Invalid ICmp Predicate");
+
+ if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
+ return FC; // Fold a few common cases...
+
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> ArgVec;
+ ArgVec.push_back(LHS);
+ ArgVec.push_back(RHS);
+ // Get the key type with both the opcode and predicate
+ const ExprMapKeyType Key(Instruction::ICmp, ArgVec, pred);
+ return ExprConstants->getOrCreate(Type::Int1Ty, Key);
+}
+
+Constant *
+ConstantExpr::getFCmp(unsigned short pred, Constant* LHS, Constant* RHS) {
+ assert(LHS->getType() == RHS->getType());
+ assert(pred <= FCmpInst::LAST_FCMP_PREDICATE && "Invalid FCmp Predicate");
+
+ if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
+ return FC; // Fold a few common cases...
+
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> ArgVec;
+ ArgVec.push_back(LHS);
+ ArgVec.push_back(RHS);
+ // Get the key type with both the opcode and predicate
+ const ExprMapKeyType Key(Instruction::FCmp, ArgVec, pred);
+ return ExprConstants->getOrCreate(Type::Int1Ty, Key);
+}
+
+Constant *
+ConstantExpr::getVICmp(unsigned short pred, Constant* LHS, Constant* RHS) {
+ assert(isa<VectorType>(LHS->getType()) && LHS->getType() == RHS->getType() &&
+ "Tried to create vicmp operation on non-vector type!");
+ assert(pred >= ICmpInst::FIRST_ICMP_PREDICATE &&
+ pred <= ICmpInst::LAST_ICMP_PREDICATE && "Invalid VICmp Predicate");
+
+ const VectorType *VTy = cast<VectorType>(LHS->getType());
+ const Type *EltTy = VTy->getElementType();
+ unsigned NumElts = VTy->getNumElements();
+
+ // See if we can fold the element-wise comparison of the LHS and RHS.
+ SmallVector<Constant *, 16> LHSElts, RHSElts;
+ LHS->getVectorElements(LHSElts);
+ RHS->getVectorElements(RHSElts);
+
+ if (!LHSElts.empty() && !RHSElts.empty()) {
+ SmallVector<Constant *, 16> Elts;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *FC = ConstantFoldCompareInstruction(pred, LHSElts[i],
+ RHSElts[i]);
+ if (ConstantInt *FCI = dyn_cast_or_null<ConstantInt>(FC)) {
+ if (FCI->getZExtValue())
+ Elts.push_back(ConstantInt::getAllOnesValue(EltTy));
+ else
+ Elts.push_back(ConstantInt::get(EltTy, 0ULL));
+ } else if (FC && isa<UndefValue>(FC)) {
+ Elts.push_back(UndefValue::get(EltTy));
+ } else {
+ break;
+ }
+ }
+ if (Elts.size() == NumElts)
+ return ConstantVector::get(&Elts[0], Elts.size());
+ }
+
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> ArgVec;
+ ArgVec.push_back(LHS);
+ ArgVec.push_back(RHS);
+ // Get the key type with both the opcode and predicate
+ const ExprMapKeyType Key(Instruction::VICmp, ArgVec, pred);
+ return ExprConstants->getOrCreate(LHS->getType(), Key);
+}
+
+Constant *
+ConstantExpr::getVFCmp(unsigned short pred, Constant* LHS, Constant* RHS) {
+ assert(isa<VectorType>(LHS->getType()) &&
+ "Tried to create vfcmp operation on non-vector type!");
+ assert(LHS->getType() == RHS->getType());
+ assert(pred <= FCmpInst::LAST_FCMP_PREDICATE && "Invalid VFCmp Predicate");
+
+ const VectorType *VTy = cast<VectorType>(LHS->getType());
+ unsigned NumElts = VTy->getNumElements();
+ const Type *EltTy = VTy->getElementType();
+ const Type *REltTy = IntegerType::get(EltTy->getPrimitiveSizeInBits());
+ const Type *ResultTy = VectorType::get(REltTy, NumElts);
+
+ // See if we can fold the element-wise comparison of the LHS and RHS.
+ SmallVector<Constant *, 16> LHSElts, RHSElts;
+ LHS->getVectorElements(LHSElts);
+ RHS->getVectorElements(RHSElts);
+
+ if (!LHSElts.empty() && !RHSElts.empty()) {
+ SmallVector<Constant *, 16> Elts;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *FC = ConstantFoldCompareInstruction(pred, LHSElts[i],
+ RHSElts[i]);
+ if (ConstantInt *FCI = dyn_cast_or_null<ConstantInt>(FC)) {
+ if (FCI->getZExtValue())
+ Elts.push_back(ConstantInt::getAllOnesValue(REltTy));
+ else
+ Elts.push_back(ConstantInt::get(REltTy, 0ULL));
+ } else if (FC && isa<UndefValue>(FC)) {
+ Elts.push_back(UndefValue::get(REltTy));
+ } else {
+ break;
+ }
+ }
+ if (Elts.size() == NumElts)
+ return ConstantVector::get(&Elts[0], Elts.size());
+ }
+
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> ArgVec;
+ ArgVec.push_back(LHS);
+ ArgVec.push_back(RHS);
+ // Get the key type with both the opcode and predicate
+ const ExprMapKeyType Key(Instruction::VFCmp, ArgVec, pred);
+ return ExprConstants->getOrCreate(ResultTy, Key);
+}
+
+Constant *ConstantExpr::getExtractElementTy(const Type *ReqTy, Constant *Val,
+ Constant *Idx) {
+ if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx))
+ return FC; // Fold a few common cases...
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> ArgVec(1, Val);
+ ArgVec.push_back(Idx);
+  const ExprMapKeyType Key(Instruction::ExtractElement, ArgVec);
+ return ExprConstants->getOrCreate(ReqTy, Key);
+}
+
+Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx) {
+ assert(isa<VectorType>(Val->getType()) &&
+ "Tried to create extractelement operation on non-vector type!");
+ assert(Idx->getType() == Type::Int32Ty &&
+ "Extractelement index must be i32 type!");
+ return getExtractElementTy(cast<VectorType>(Val->getType())->getElementType(),
+ Val, Idx);
+}
+
+Constant *ConstantExpr::getInsertElementTy(const Type *ReqTy, Constant *Val,
+ Constant *Elt, Constant *Idx) {
+ if (Constant *FC = ConstantFoldInsertElementInstruction(Val, Elt, Idx))
+ return FC; // Fold a few common cases...
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> ArgVec(1, Val);
+ ArgVec.push_back(Elt);
+ ArgVec.push_back(Idx);
+  const ExprMapKeyType Key(Instruction::InsertElement, ArgVec);
+ return ExprConstants->getOrCreate(ReqTy, Key);
+}
+
+Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt,
+ Constant *Idx) {
+ assert(isa<VectorType>(Val->getType()) &&
+ "Tried to create insertelement operation on non-vector type!");
+ assert(Elt->getType() == cast<VectorType>(Val->getType())->getElementType()
+ && "Insertelement types must match!");
+ assert(Idx->getType() == Type::Int32Ty &&
+ "Insertelement index must be i32 type!");
+ return getInsertElementTy(Val->getType(), Val, Elt, Idx);
+}
+
+Constant *ConstantExpr::getShuffleVectorTy(const Type *ReqTy, Constant *V1,
+ Constant *V2, Constant *Mask) {
+ if (Constant *FC = ConstantFoldShuffleVectorInstruction(V1, V2, Mask))
+ return FC; // Fold a few common cases...
+ // Look up the constant in the table first to ensure uniqueness
+ std::vector<Constant*> ArgVec(1, V1);
+ ArgVec.push_back(V2);
+ ArgVec.push_back(Mask);
+  const ExprMapKeyType Key(Instruction::ShuffleVector, ArgVec);
+ return ExprConstants->getOrCreate(ReqTy, Key);
+}
+
+Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2,
+ Constant *Mask) {
+ assert(ShuffleVectorInst::isValidOperands(V1, V2, Mask) &&
+ "Invalid shuffle vector constant expr operands!");
+
+ unsigned NElts = cast<VectorType>(Mask->getType())->getNumElements();
+ const Type *EltTy = cast<VectorType>(V1->getType())->getElementType();
+ const Type *ShufTy = VectorType::get(EltTy, NElts);
+ return getShuffleVectorTy(ShufTy, V1, V2, Mask);
+}
+
+Constant *ConstantExpr::getInsertValueTy(const Type *ReqTy, Constant *Agg,
+ Constant *Val,
+ const unsigned *Idxs, unsigned NumIdx) {
+ assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs,
+ Idxs+NumIdx) == Val->getType() &&
+ "insertvalue indices invalid!");
+ assert(Agg->getType() == ReqTy &&
+ "insertvalue type invalid!");
+ assert(Agg->getType()->isFirstClassType() &&
+ "Non-first-class type for constant InsertValue expression");
+ Constant *FC = ConstantFoldInsertValueInstruction(Agg, Val, Idxs, NumIdx);
+ assert(FC && "InsertValue constant expr couldn't be folded!");
+ return FC;
+}
+
+Constant *ConstantExpr::getInsertValue(Constant *Agg, Constant *Val,
+ const unsigned *IdxList, unsigned NumIdx) {
+ assert(Agg->getType()->isFirstClassType() &&
+ "Tried to create insertelement operation on non-first-class type!");
+
+ const Type *ReqTy = Agg->getType();
+#ifndef NDEBUG
+ const Type *ValTy =
+ ExtractValueInst::getIndexedType(Agg->getType(), IdxList, IdxList+NumIdx);
+#endif
+ assert(ValTy == Val->getType() && "insertvalue indices invalid!");
+ return getInsertValueTy(ReqTy, Agg, Val, IdxList, NumIdx);
+}
+
+Constant *ConstantExpr::getExtractValueTy(const Type *ReqTy, Constant *Agg,
+ const unsigned *Idxs, unsigned NumIdx) {
+ assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs,
+ Idxs+NumIdx) == ReqTy &&
+ "extractvalue indices invalid!");
+ assert(Agg->getType()->isFirstClassType() &&
+ "Non-first-class type for constant extractvalue expression");
+ Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs, NumIdx);
+ assert(FC && "ExtractValue constant expr couldn't be folded!");
+ return FC;
+}
+
+Constant *ConstantExpr::getExtractValue(Constant *Agg,
+ const unsigned *IdxList, unsigned NumIdx) {
+ assert(Agg->getType()->isFirstClassType() &&
+ "Tried to create extractelement operation on non-first-class type!");
+
+ const Type *ReqTy =
+ ExtractValueInst::getIndexedType(Agg->getType(), IdxList, IdxList+NumIdx);
+ assert(ReqTy && "extractvalue indices invalid!");
+ return getExtractValueTy(ReqTy, Agg, IdxList, NumIdx);
+}
+
+Constant *ConstantExpr::getZeroValueForNegationExpr(const Type *Ty) {
+ if (const VectorType *PTy = dyn_cast<VectorType>(Ty))
+ if (PTy->getElementType()->isFloatingPoint()) {
+ std::vector<Constant*> zeros(PTy->getNumElements(),
+ ConstantFP::getNegativeZero(PTy->getElementType()));
+ return ConstantVector::get(PTy, zeros);
+ }
+
+ if (Ty->isFloatingPoint())
+ return ConstantFP::getNegativeZero(Ty);
+
+ return Constant::getNullValue(Ty);
+}
+
+// destroyConstant - Remove the constant from the constant table...
+//
+void ConstantExpr::destroyConstant() {
+ ExprConstants->remove(this);
+ destroyConstantImpl();
+}
+
+const char *ConstantExpr::getOpcodeName() const {
+ return Instruction::getOpcodeName(getOpcode());
+}
+
+//===----------------------------------------------------------------------===//
+// replaceUsesOfWithOnConstant implementations
+
+/// replaceUsesOfWithOnConstant - Update this constant array to change uses of
+/// 'From' to be uses of 'To'. This must update the uniquing data structures
+/// etc.
+///
+/// Note that we intentionally replace all uses of From with To here. Consider
+/// a large array that uses 'From' 1000 times. By handling this case all here,
+/// ConstantArray::replaceUsesOfWithOnConstant is only invoked once, and that
+/// single invocation handles all 1000 uses. Handling them one at a time would
+/// work, but would be really slow because it would have to unique each updated
+/// array instance.
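+///
+/// For example (editorial note), replacing the hypothetical global @g with
+/// @h in [4 x i8*] [i8* @g, i8* @g, i8* @g, i8* @g] rebuilds and re-uniques
+/// the array once rather than four times.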
+void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
+ Use *U) {
+ assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
+ Constant *ToC = cast<Constant>(To);
+
+ std::pair<ArrayConstantsTy::MapKey, Constant*> Lookup;
+ Lookup.first.first = getType();
+ Lookup.second = this;
+
+ std::vector<Constant*> &Values = Lookup.first.second;
+ Values.reserve(getNumOperands()); // Build replacement array.
+
+ // Fill values with the modified operands of the constant array. Also,
+ // compute whether this turns into an all-zeros array.
+ bool isAllZeros = false;
+ unsigned NumUpdated = 0;
+ if (!ToC->isNullValue()) {
+ for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
+ Constant *Val = cast<Constant>(O->get());
+ if (Val == From) {
+ Val = ToC;
+ ++NumUpdated;
+ }
+ Values.push_back(Val);
+ }
+ } else {
+ isAllZeros = true;
+ for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
+ Constant *Val = cast<Constant>(O->get());
+ if (Val == From) {
+ Val = ToC;
+ ++NumUpdated;
+ }
+ Values.push_back(Val);
+ if (isAllZeros) isAllZeros = Val->isNullValue();
+ }
+ }
+
+ Constant *Replacement = 0;
+ if (isAllZeros) {
+ Replacement = ConstantAggregateZero::get(getType());
+ } else {
+ // Check to see if we have this array type already.
+ bool Exists;
+ ArrayConstantsTy::MapTy::iterator I =
+ ArrayConstants->InsertOrGetItem(Lookup, Exists);
+
+ if (Exists) {
+ Replacement = I->second;
+ } else {
+ // Okay, the new shape doesn't exist in the system yet. Instead of
+ // creating a new constant array, inserting it, replaceallusesof'ing the
+ // old with the new, then deleting the old... just update the current one
+ // in place!
+ ArrayConstants->MoveConstantToNewSlot(this, I);
+
+ // Update to the new value. Optimize for the case when we have a single
+ // operand that we're changing, but handle bulk updates efficiently.
+ if (NumUpdated == 1) {
+ unsigned OperandToUpdate = U-OperandList;
+ assert(getOperand(OperandToUpdate) == From &&
+ "ReplaceAllUsesWith broken!");
+ setOperand(OperandToUpdate, ToC);
+ } else {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (getOperand(i) == From)
+ setOperand(i, ToC);
+ }
+ return;
+ }
+ }
+
+ // Otherwise, I do need to replace this with an existing value.
+ assert(Replacement != this && "I didn't contain From!");
+
+ // Everyone using this now uses the replacement.
+ uncheckedReplaceAllUsesWith(Replacement);
+
+ // Delete the old constant!
+ destroyConstant();
+}
+
+void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
+ Use *U) {
+ assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
+ Constant *ToC = cast<Constant>(To);
+
+ unsigned OperandToUpdate = U-OperandList;
+ assert(getOperand(OperandToUpdate) == From && "ReplaceAllUsesWith broken!");
+
+ std::pair<StructConstantsTy::MapKey, Constant*> Lookup;
+ Lookup.first.first = getType();
+ Lookup.second = this;
+ std::vector<Constant*> &Values = Lookup.first.second;
+ Values.reserve(getNumOperands()); // Build replacement struct.
+
+
+ // Fill values with the modified operands of the constant struct. Also,
+ // compute whether this turns into an all-zeros struct.
+ bool isAllZeros = false;
+ if (!ToC->isNullValue()) {
+ for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O)
+ Values.push_back(cast<Constant>(O->get()));
+ } else {
+ isAllZeros = true;
+ for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
+ Constant *Val = cast<Constant>(O->get());
+ Values.push_back(Val);
+ if (isAllZeros) isAllZeros = Val->isNullValue();
+ }
+ }
+ Values[OperandToUpdate] = ToC;
+
+ Constant *Replacement = 0;
+ if (isAllZeros) {
+ Replacement = ConstantAggregateZero::get(getType());
+ } else {
+    // Check to see if we have this struct type already.
+ bool Exists;
+ StructConstantsTy::MapTy::iterator I =
+ StructConstants->InsertOrGetItem(Lookup, Exists);
+
+ if (Exists) {
+ Replacement = I->second;
+ } else {
+ // Okay, the new shape doesn't exist in the system yet. Instead of
+ // creating a new constant struct, inserting it, replaceallusesof'ing the
+ // old with the new, then deleting the old... just update the current one
+ // in place!
+ StructConstants->MoveConstantToNewSlot(this, I);
+
+ // Update to the new value.
+ setOperand(OperandToUpdate, ToC);
+ return;
+ }
+ }
+
+ assert(Replacement != this && "I didn't contain From!");
+
+ // Everyone using this now uses the replacement.
+ uncheckedReplaceAllUsesWith(Replacement);
+
+ // Delete the old constant!
+ destroyConstant();
+}
+
+void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To,
+ Use *U) {
+ assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
+
+ std::vector<Constant*> Values;
+  Values.reserve(getNumOperands()); // Build replacement vector...
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ Constant *Val = getOperand(i);
+ if (Val == From) Val = cast<Constant>(To);
+ Values.push_back(Val);
+ }
+
+ Constant *Replacement = ConstantVector::get(getType(), Values);
+ assert(Replacement != this && "I didn't contain From!");
+
+ // Everyone using this now uses the replacement.
+ uncheckedReplaceAllUsesWith(Replacement);
+
+ // Delete the old constant!
+ destroyConstant();
+}
+
+void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV,
+ Use *U) {
+ assert(isa<Constant>(ToV) && "Cannot make Constant refer to non-constant!");
+ Constant *To = cast<Constant>(ToV);
+
+ Constant *Replacement = 0;
+ if (getOpcode() == Instruction::GetElementPtr) {
+ SmallVector<Constant*, 8> Indices;
+ Constant *Pointer = getOperand(0);
+ Indices.reserve(getNumOperands()-1);
+ if (Pointer == From) Pointer = To;
+
+ for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
+ Constant *Val = getOperand(i);
+ if (Val == From) Val = To;
+ Indices.push_back(Val);
+ }
+ Replacement = ConstantExpr::getGetElementPtr(Pointer,
+ &Indices[0], Indices.size());
+ } else if (getOpcode() == Instruction::ExtractValue) {
+ Constant *Agg = getOperand(0);
+ if (Agg == From) Agg = To;
+
+ const SmallVector<unsigned, 4> &Indices = getIndices();
+ Replacement = ConstantExpr::getExtractValue(Agg,
+ &Indices[0], Indices.size());
+ } else if (getOpcode() == Instruction::InsertValue) {
+ Constant *Agg = getOperand(0);
+ Constant *Val = getOperand(1);
+ if (Agg == From) Agg = To;
+ if (Val == From) Val = To;
+
+ const SmallVector<unsigned, 4> &Indices = getIndices();
+ Replacement = ConstantExpr::getInsertValue(Agg, Val,
+ &Indices[0], Indices.size());
+ } else if (isCast()) {
+ assert(getOperand(0) == From && "Cast only has one use!");
+ Replacement = ConstantExpr::getCast(getOpcode(), To, getType());
+ } else if (getOpcode() == Instruction::Select) {
+ Constant *C1 = getOperand(0);
+ Constant *C2 = getOperand(1);
+ Constant *C3 = getOperand(2);
+ if (C1 == From) C1 = To;
+ if (C2 == From) C2 = To;
+ if (C3 == From) C3 = To;
+ Replacement = ConstantExpr::getSelect(C1, C2, C3);
+ } else if (getOpcode() == Instruction::ExtractElement) {
+ Constant *C1 = getOperand(0);
+ Constant *C2 = getOperand(1);
+ if (C1 == From) C1 = To;
+ if (C2 == From) C2 = To;
+ Replacement = ConstantExpr::getExtractElement(C1, C2);
+ } else if (getOpcode() == Instruction::InsertElement) {
+ Constant *C1 = getOperand(0);
+ Constant *C2 = getOperand(1);
+    Constant *C3 = getOperand(2);
+ if (C1 == From) C1 = To;
+ if (C2 == From) C2 = To;
+ if (C3 == From) C3 = To;
+ Replacement = ConstantExpr::getInsertElement(C1, C2, C3);
+ } else if (getOpcode() == Instruction::ShuffleVector) {
+ Constant *C1 = getOperand(0);
+ Constant *C2 = getOperand(1);
+ Constant *C3 = getOperand(2);
+ if (C1 == From) C1 = To;
+ if (C2 == From) C2 = To;
+ if (C3 == From) C3 = To;
+ Replacement = ConstantExpr::getShuffleVector(C1, C2, C3);
+ } else if (isCompare()) {
+ Constant *C1 = getOperand(0);
+ Constant *C2 = getOperand(1);
+ if (C1 == From) C1 = To;
+ if (C2 == From) C2 = To;
+ if (getOpcode() == Instruction::ICmp)
+ Replacement = ConstantExpr::getICmp(getPredicate(), C1, C2);
+ else if (getOpcode() == Instruction::FCmp)
+ Replacement = ConstantExpr::getFCmp(getPredicate(), C1, C2);
+ else if (getOpcode() == Instruction::VICmp)
+ Replacement = ConstantExpr::getVICmp(getPredicate(), C1, C2);
+ else {
+ assert(getOpcode() == Instruction::VFCmp);
+ Replacement = ConstantExpr::getVFCmp(getPredicate(), C1, C2);
+ }
+ } else if (getNumOperands() == 2) {
+ Constant *C1 = getOperand(0);
+ Constant *C2 = getOperand(1);
+ if (C1 == From) C1 = To;
+ if (C2 == From) C2 = To;
+ Replacement = ConstantExpr::get(getOpcode(), C1, C2);
+ } else {
+ assert(0 && "Unknown ConstantExpr type!");
+ return;
+ }
+
+ assert(Replacement != this && "I didn't contain From!");
+
+ // Everyone using this now uses the replacement.
+ uncheckedReplaceAllUsesWith(Replacement);
+
+ // Delete the old constant!
+ destroyConstant();
+}
+
+void MDNode::replaceElement(Value *From, Value *To) {
+ SmallVector<Value*, 4> Values;
+ Values.reserve(getNumElements()); // Build replacement array...
+ for (unsigned i = 0, e = getNumElements(); i != e; ++i) {
+ Value *Val = getElement(i);
+ if (Val == From) Val = To;
+ Values.push_back(Val);
+ }
+
+ MDNode *Replacement = MDNode::get(&Values[0], Values.size());
+ assert(Replacement != this && "I didn't contain From!");
+
+ uncheckedReplaceAllUsesWith(Replacement);
+
+ destroyConstant();
+}
diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp
new file mode 100644
index 0000000..f85dbe7
--- /dev/null
+++ b/lib/VMCore/Core.cpp
@@ -0,0 +1,1450 @@
+//===-- Core.cpp ----------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the C bindings for libLLVMCore.a, which implements
+// the LLVM intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Core.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/CallSite.h"
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+
+using namespace llvm;
+
+
+/*===-- Error handling ----------------------------------------------------===*/
+
+void LLVMDisposeMessage(char *Message) {
+ free(Message);
+}
+
+
+/*===-- Operations on modules ---------------------------------------------===*/
+
+LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) {
+ return wrap(new Module(ModuleID));
+}
+
+void LLVMDisposeModule(LLVMModuleRef M) {
+ delete unwrap(M);
+}
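+
+// Editorial example of the expected lifecycle from C (not part of the
+// original source):
+//   LLVMModuleRef M = LLVMModuleCreateWithName("demo");
+//   LLVMDumpModule(M);   /* any use of the module goes here */
+//   LLVMDisposeModule(M);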
+
+/*--.. Data layout .........................................................--*/
+const char * LLVMGetDataLayout(LLVMModuleRef M) {
+ return unwrap(M)->getDataLayout().c_str();
+}
+
+void LLVMSetDataLayout(LLVMModuleRef M, const char *DataLayoutStr) {
+  unwrap(M)->setDataLayout(DataLayoutStr);
+}
+
+/*--.. Target triple .......................................................--*/
+const char * LLVMGetTarget(LLVMModuleRef M) {
+ return unwrap(M)->getTargetTriple().c_str();
+}
+
+void LLVMSetTarget(LLVMModuleRef M, const char *Triple) {
+ unwrap(M)->setTargetTriple(Triple);
+}
+
+/*--.. Type names ..........................................................--*/
+int LLVMAddTypeName(LLVMModuleRef M, const char *Name, LLVMTypeRef Ty) {
+ return unwrap(M)->addTypeName(Name, unwrap(Ty));
+}
+
+void LLVMDeleteTypeName(LLVMModuleRef M, const char *Name) {
+ std::string N(Name);
+
+ TypeSymbolTable &TST = unwrap(M)->getTypeSymbolTable();
+  for (TypeSymbolTable::iterator I = TST.begin(), E = TST.end(); I != E; ++I)
+    if (I->first == N) {
+      TST.remove(I);  // The iterator is invalidated by remove, so stop here.
+      break;
+    }
+}
+
+void LLVMDumpModule(LLVMModuleRef M) {
+ unwrap(M)->dump();
+}
+
+
+/*===-- Operations on types -----------------------------------------------===*/
+
+/*--.. Operations on all types (mostly) ....................................--*/
+
+LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
+ return static_cast<LLVMTypeKind>(unwrap(Ty)->getTypeID());
+}
+
+/*--.. Operations on integer types .........................................--*/
+
+LLVMTypeRef LLVMInt1Type(void) { return (LLVMTypeRef) Type::Int1Ty; }
+LLVMTypeRef LLVMInt8Type(void) { return (LLVMTypeRef) Type::Int8Ty; }
+LLVMTypeRef LLVMInt16Type(void) { return (LLVMTypeRef) Type::Int16Ty; }
+LLVMTypeRef LLVMInt32Type(void) { return (LLVMTypeRef) Type::Int32Ty; }
+LLVMTypeRef LLVMInt64Type(void) { return (LLVMTypeRef) Type::Int64Ty; }
+
+LLVMTypeRef LLVMIntType(unsigned NumBits) {
+ return wrap(IntegerType::get(NumBits));
+}
+
+unsigned LLVMGetIntTypeWidth(LLVMTypeRef IntegerTy) {
+ return unwrap<IntegerType>(IntegerTy)->getBitWidth();
+}
+
+/*--.. Operations on real types ............................................--*/
+
+LLVMTypeRef LLVMFloatType(void) { return (LLVMTypeRef) Type::FloatTy; }
+LLVMTypeRef LLVMDoubleType(void) { return (LLVMTypeRef) Type::DoubleTy; }
+LLVMTypeRef LLVMX86FP80Type(void) { return (LLVMTypeRef) Type::X86_FP80Ty; }
+LLVMTypeRef LLVMFP128Type(void) { return (LLVMTypeRef) Type::FP128Ty; }
+LLVMTypeRef LLVMPPCFP128Type(void) { return (LLVMTypeRef) Type::PPC_FP128Ty; }
+
+/*--.. Operations on function types ........................................--*/
+
+LLVMTypeRef LLVMFunctionType(LLVMTypeRef ReturnType,
+ LLVMTypeRef *ParamTypes, unsigned ParamCount,
+ int IsVarArg) {
+ std::vector<const Type*> Tys;
+ for (LLVMTypeRef *I = ParamTypes, *E = ParamTypes + ParamCount; I != E; ++I)
+ Tys.push_back(unwrap(*I));
+
+ return wrap(FunctionType::get(unwrap(ReturnType), Tys, IsVarArg != 0));
+}
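+
+// Editorial example: building the type i32 (i8*, i32) through the C API:
+//   LLVMTypeRef Params[] = { LLVMPointerType(LLVMInt8Type(), 0),
+//                            LLVMInt32Type() };
+//   LLVMTypeRef FTy = LLVMFunctionType(LLVMInt32Type(), Params, 2,
+//                                      /*IsVarArg=*/0);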
+
+int LLVMIsFunctionVarArg(LLVMTypeRef FunctionTy) {
+ return unwrap<FunctionType>(FunctionTy)->isVarArg();
+}
+
+LLVMTypeRef LLVMGetReturnType(LLVMTypeRef FunctionTy) {
+ return wrap(unwrap<FunctionType>(FunctionTy)->getReturnType());
+}
+
+unsigned LLVMCountParamTypes(LLVMTypeRef FunctionTy) {
+ return unwrap<FunctionType>(FunctionTy)->getNumParams();
+}
+
+void LLVMGetParamTypes(LLVMTypeRef FunctionTy, LLVMTypeRef *Dest) {
+ FunctionType *Ty = unwrap<FunctionType>(FunctionTy);
+ for (FunctionType::param_iterator I = Ty->param_begin(),
+ E = Ty->param_end(); I != E; ++I)
+ *Dest++ = wrap(*I);
+}
+
+/*--.. Operations on struct types ..........................................--*/
+
+LLVMTypeRef LLVMStructType(LLVMTypeRef *ElementTypes,
+ unsigned ElementCount, int Packed) {
+ std::vector<const Type*> Tys;
+ for (LLVMTypeRef *I = ElementTypes,
+ *E = ElementTypes + ElementCount; I != E; ++I)
+ Tys.push_back(unwrap(*I));
+
+ return wrap(StructType::get(Tys, Packed != 0));
+}
+
+unsigned LLVMCountStructElementTypes(LLVMTypeRef StructTy) {
+ return unwrap<StructType>(StructTy)->getNumElements();
+}
+
+void LLVMGetStructElementTypes(LLVMTypeRef StructTy, LLVMTypeRef *Dest) {
+ StructType *Ty = unwrap<StructType>(StructTy);
+  for (StructType::element_iterator I = Ty->element_begin(),
+                                    E = Ty->element_end(); I != E; ++I)
+ *Dest++ = wrap(*I);
+}
+
+int LLVMIsPackedStruct(LLVMTypeRef StructTy) {
+ return unwrap<StructType>(StructTy)->isPacked();
+}
+
+/*--.. Operations on array, pointer, and vector types (sequence types) .....--*/
+
+LLVMTypeRef LLVMArrayType(LLVMTypeRef ElementType, unsigned ElementCount) {
+ return wrap(ArrayType::get(unwrap(ElementType), ElementCount));
+}
+
+LLVMTypeRef LLVMPointerType(LLVMTypeRef ElementType, unsigned AddressSpace) {
+ return wrap(PointerType::get(unwrap(ElementType), AddressSpace));
+}
+
+LLVMTypeRef LLVMVectorType(LLVMTypeRef ElementType, unsigned ElementCount) {
+ return wrap(VectorType::get(unwrap(ElementType), ElementCount));
+}
+
+LLVMTypeRef LLVMGetElementType(LLVMTypeRef Ty) {
+ return wrap(unwrap<SequentialType>(Ty)->getElementType());
+}
+
+unsigned LLVMGetArrayLength(LLVMTypeRef ArrayTy) {
+ return unwrap<ArrayType>(ArrayTy)->getNumElements();
+}
+
+unsigned LLVMGetPointerAddressSpace(LLVMTypeRef PointerTy) {
+ return unwrap<PointerType>(PointerTy)->getAddressSpace();
+}
+
+unsigned LLVMGetVectorSize(LLVMTypeRef VectorTy) {
+ return unwrap<VectorType>(VectorTy)->getNumElements();
+}
+
+/*--.. Operations on other types ...........................................--*/
+
+LLVMTypeRef LLVMVoidType(void) { return (LLVMTypeRef) Type::VoidTy; }
+LLVMTypeRef LLVMLabelType(void) { return (LLVMTypeRef) Type::LabelTy; }
+
+LLVMTypeRef LLVMOpaqueType(void) {
+ return wrap(llvm::OpaqueType::get());
+}
+
+/*--.. Operations on type handles ..........................................--*/
+
+LLVMTypeHandleRef LLVMCreateTypeHandle(LLVMTypeRef PotentiallyAbstractTy) {
+ return wrap(new PATypeHolder(unwrap(PotentiallyAbstractTy)));
+}
+
+void LLVMDisposeTypeHandle(LLVMTypeHandleRef TypeHandle) {
+ delete unwrap(TypeHandle);
+}
+
+LLVMTypeRef LLVMResolveTypeHandle(LLVMTypeHandleRef TypeHandle) {
+ return wrap(unwrap(TypeHandle)->get());
+}
+
+void LLVMRefineType(LLVMTypeRef AbstractTy, LLVMTypeRef ConcreteTy) {
+ unwrap<DerivedType>(AbstractTy)->refineAbstractTypeTo(unwrap(ConcreteTy));
+}
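+
+/* The handle/refine API above exists to let C clients build recursive
+   types.  Illustrative sketch of constructing %t = type { %t* }:
+
+     LLVMTypeHandleRef H = LLVMCreateTypeHandle(LLVMOpaqueType());
+     LLVMTypeRef Elt = LLVMPointerType(LLVMResolveTypeHandle(H), 0);
+     LLVMTypeRef S = LLVMStructType(&Elt, 1, 0);
+     LLVMRefineType(LLVMResolveTypeHandle(H), S);
+     S = LLVMResolveTypeHandle(H);  // the concrete, self-referential type
+     LLVMDisposeTypeHandle(H);
+*/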
+
+
+/*===-- Operations on values ----------------------------------------------===*/
+
+/*--.. Operations on all values ............................................--*/
+
+LLVMTypeRef LLVMTypeOf(LLVMValueRef Val) {
+ return wrap(unwrap(Val)->getType());
+}
+
+const char *LLVMGetValueName(LLVMValueRef Val) {
+ return unwrap(Val)->getNameStart();
+}
+
+void LLVMSetValueName(LLVMValueRef Val, const char *Name) {
+ unwrap(Val)->setName(Name);
+}
+
+void LLVMDumpValue(LLVMValueRef Val) {
+ unwrap(Val)->dump();
+}
+
+
+/*--.. Conversion functions ................................................--*/
+
+#define LLVM_DEFINE_VALUE_CAST(name) \
+ LLVMValueRef LLVMIsA##name(LLVMValueRef Val) { \
+ return wrap(static_cast<Value*>(dyn_cast_or_null<name>(unwrap(Val)))); \
+ }
+
+LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DEFINE_VALUE_CAST)
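+
+/* The macro above stamps out one checked-downcast function per Value
+   subclass, e.g. LLVMIsAConstantInt(LLVMValueRef); each returns its
+   argument when the value has the named class and NULL otherwise.
+   Illustrative use:
+
+     if (LLVMIsAConstantInt(V))
+       fold_constant(V);  // fold_constant is a hypothetical caller helper
+*/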
+
+
+/*--.. Operations on constants of any type .................................--*/
+
+LLVMValueRef LLVMConstNull(LLVMTypeRef Ty) {
+ return wrap(Constant::getNullValue(unwrap(Ty)));
+}
+
+LLVMValueRef LLVMConstAllOnes(LLVMTypeRef Ty) {
+ return wrap(Constant::getAllOnesValue(unwrap(Ty)));
+}
+
+LLVMValueRef LLVMGetUndef(LLVMTypeRef Ty) {
+ return wrap(UndefValue::get(unwrap(Ty)));
+}
+
+int LLVMIsConstant(LLVMValueRef Val) {
+ return isa<Constant>(unwrap(Val));
+}
+
+int LLVMIsNull(LLVMValueRef Val) {
+ if (Constant *C = dyn_cast<Constant>(unwrap(Val)))
+ return C->isNullValue();
+ return false;
+}
+
+int LLVMIsUndef(LLVMValueRef Val) {
+ return isa<UndefValue>(unwrap(Val));
+}
+
+/*--.. Operations on scalar constants ......................................--*/
+
+LLVMValueRef LLVMConstInt(LLVMTypeRef IntTy, unsigned long long N,
+ int SignExtend) {
+ return wrap(ConstantInt::get(unwrap<IntegerType>(IntTy), N, SignExtend != 0));
+}
+
+static const fltSemantics &SemanticsForType(Type *Ty) {
+ assert(Ty->isFloatingPoint() && "Type is not floating point!");
+ if (Ty == Type::FloatTy)
+ return APFloat::IEEEsingle;
+ if (Ty == Type::DoubleTy)
+ return APFloat::IEEEdouble;
+ if (Ty == Type::X86_FP80Ty)
+ return APFloat::x87DoubleExtended;
+ if (Ty == Type::FP128Ty)
+ return APFloat::IEEEquad;
+ if (Ty == Type::PPC_FP128Ty)
+ return APFloat::PPCDoubleDouble;
+ return APFloat::Bogus;
+}
+
+LLVMValueRef LLVMConstReal(LLVMTypeRef RealTy, double N) {
+ APFloat APN(N);
+ bool ignored;
+ APN.convert(SemanticsForType(unwrap(RealTy)), APFloat::rmNearestTiesToEven,
+ &ignored);
+ return wrap(ConstantFP::get(APN));
+}
+
+LLVMValueRef LLVMConstRealOfString(LLVMTypeRef RealTy, const char *Text) {
+ return wrap(ConstantFP::get(APFloat(SemanticsForType(unwrap(RealTy)), Text)));
+}
+
+/*--.. Operations on composite constants ...................................--*/
+
+LLVMValueRef LLVMConstString(const char *Str, unsigned Length,
+ int DontNullTerminate) {
+ /* Inverted the sense of AddNull because ', 0)' is a
+ better mnemonic for null termination than ', 1)'. */
+ return wrap(ConstantArray::get(std::string(Str, Length),
+ DontNullTerminate == 0));
+}
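+
+/* Illustrative: with the inverted flag, passing 0 requests the trailing
+   NUL and 1 suppresses it:
+
+     LLVMConstString("hi", 2, 0);  // [3 x i8] c"hi\00"
+     LLVMConstString("hi", 2, 1);  // [2 x i8] c"hi"
+*/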
+
+LLVMValueRef LLVMConstArray(LLVMTypeRef ElementTy,
+ LLVMValueRef *ConstantVals, unsigned Length) {
+ return wrap(ConstantArray::get(ArrayType::get(unwrap(ElementTy), Length),
+ unwrap<Constant>(ConstantVals, Length),
+ Length));
+}
+
+LLVMValueRef LLVMConstStruct(LLVMValueRef *ConstantVals, unsigned Count,
+ int Packed) {
+ return wrap(ConstantStruct::get(unwrap<Constant>(ConstantVals, Count),
+ Count, Packed != 0));
+}
+
+LLVMValueRef LLVMConstVector(LLVMValueRef *ScalarConstantVals, unsigned Size) {
+ return wrap(ConstantVector::get(unwrap<Constant>(ScalarConstantVals, Size),
+ Size));
+}
+
+/*--.. Constant expressions ................................................--*/
+
+LLVMValueRef LLVMAlignOf(LLVMTypeRef Ty) {
+ return wrap(ConstantExpr::getAlignOf(unwrap(Ty)));
+}
+
+LLVMValueRef LLVMSizeOf(LLVMTypeRef Ty) {
+ return wrap(ConstantExpr::getSizeOf(unwrap(Ty)));
+}
+
+LLVMValueRef LLVMConstNeg(LLVMValueRef ConstantVal) {
+ return wrap(ConstantExpr::getNeg(unwrap<Constant>(ConstantVal)));
+}
+
+LLVMValueRef LLVMConstNot(LLVMValueRef ConstantVal) {
+ return wrap(ConstantExpr::getNot(unwrap<Constant>(ConstantVal)));
+}
+
+LLVMValueRef LLVMConstAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getAdd(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getSub(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getMul(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstUDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getUDiv(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstSDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getSDiv(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getFDiv(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstURem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getURem(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstSRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getSRem(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getFRem(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstAnd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getAnd(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstOr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getOr(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstXor(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getXor(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstICmp(LLVMIntPredicate Predicate,
+ LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getICmp(Predicate,
+ unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFCmp(LLVMRealPredicate Predicate,
+ LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getFCmp(Predicate,
+ unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstShl(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getShl(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstLShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getLShr(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstAShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+ return wrap(ConstantExpr::getAShr(unwrap<Constant>(LHSConstant),
+ unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal,
+ LLVMValueRef *ConstantIndices, unsigned NumIndices) {
+ return wrap(ConstantExpr::getGetElementPtr(unwrap<Constant>(ConstantVal),
+ unwrap<Constant>(ConstantIndices,
+ NumIndices),
+ NumIndices));
+}
+
+LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getTrunc(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstSExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getSExt(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstZExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getZExt(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstFPTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getFPTrunc(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstFPExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getFPExtend(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstUIToFP(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getUIToFP(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstSIToFP(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getSIToFP(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstFPToUI(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getFPToUI(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstFPToSI(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getFPToSI(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstPtrToInt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getPtrToInt(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstIntToPtr(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getIntToPtr(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getBitCast(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstSelect(LLVMValueRef ConstantCondition,
+ LLVMValueRef ConstantIfTrue,
+ LLVMValueRef ConstantIfFalse) {
+ return wrap(ConstantExpr::getSelect(unwrap<Constant>(ConstantCondition),
+ unwrap<Constant>(ConstantIfTrue),
+ unwrap<Constant>(ConstantIfFalse)));
+}
+
+LLVMValueRef LLVMConstExtractElement(LLVMValueRef VectorConstant,
+ LLVMValueRef IndexConstant) {
+ return wrap(ConstantExpr::getExtractElement(unwrap<Constant>(VectorConstant),
+ unwrap<Constant>(IndexConstant)));
+}
+
+LLVMValueRef LLVMConstInsertElement(LLVMValueRef VectorConstant,
+ LLVMValueRef ElementValueConstant,
+ LLVMValueRef IndexConstant) {
+ return wrap(ConstantExpr::getInsertElement(unwrap<Constant>(VectorConstant),
+ unwrap<Constant>(ElementValueConstant),
+ unwrap<Constant>(IndexConstant)));
+}
+
+LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant,
+ LLVMValueRef VectorBConstant,
+ LLVMValueRef MaskConstant) {
+ return wrap(ConstantExpr::getShuffleVector(unwrap<Constant>(VectorAConstant),
+ unwrap<Constant>(VectorBConstant),
+ unwrap<Constant>(MaskConstant)));
+}
+
+LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList,
+ unsigned NumIdx) {
+ return wrap(ConstantExpr::getExtractValue(unwrap<Constant>(AggConstant),
+ IdxList, NumIdx));
+}
+
+LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant,
+ LLVMValueRef ElementValueConstant,
+ unsigned *IdxList, unsigned NumIdx) {
+ return wrap(ConstantExpr::getInsertValue(unwrap<Constant>(AggConstant),
+ unwrap<Constant>(ElementValueConstant),
+ IdxList, NumIdx));
+}
+
+LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty, const char *AsmString,
+ const char *Constraints, int HasSideEffects) {
+ return wrap(InlineAsm::get(dyn_cast<FunctionType>(unwrap(Ty)), AsmString,
+ Constraints, HasSideEffects));
+}
+
+/*--.. Operations on global variables, functions, and aliases (globals) ....--*/
+
+LLVMModuleRef LLVMGetGlobalParent(LLVMValueRef Global) {
+ return wrap(unwrap<GlobalValue>(Global)->getParent());
+}
+
+int LLVMIsDeclaration(LLVMValueRef Global) {
+ return unwrap<GlobalValue>(Global)->isDeclaration();
+}
+
+LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
+ return static_cast<LLVMLinkage>(unwrap<GlobalValue>(Global)->getLinkage());
+}
+
+void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
+ unwrap<GlobalValue>(Global)
+ ->setLinkage(static_cast<GlobalValue::LinkageTypes>(Linkage));
+}
+
+const char *LLVMGetSection(LLVMValueRef Global) {
+ return unwrap<GlobalValue>(Global)->getSection().c_str();
+}
+
+void LLVMSetSection(LLVMValueRef Global, const char *Section) {
+ unwrap<GlobalValue>(Global)->setSection(Section);
+}
+
+LLVMVisibility LLVMGetVisibility(LLVMValueRef Global) {
+ return static_cast<LLVMVisibility>(
+ unwrap<GlobalValue>(Global)->getVisibility());
+}
+
+void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz) {
+ unwrap<GlobalValue>(Global)
+ ->setVisibility(static_cast<GlobalValue::VisibilityTypes>(Viz));
+}
+
+unsigned LLVMGetAlignment(LLVMValueRef Global) {
+ return unwrap<GlobalValue>(Global)->getAlignment();
+}
+
+void LLVMSetAlignment(LLVMValueRef Global, unsigned Bytes) {
+ unwrap<GlobalValue>(Global)->setAlignment(Bytes);
+}
+
+/*--.. Operations on global variables ......................................--*/
+
+LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) {
+ return wrap(new GlobalVariable(unwrap(Ty), false,
+ GlobalValue::ExternalLinkage, 0, Name,
+ unwrap(M)));
+}
+
+LLVMValueRef LLVMGetNamedGlobal(LLVMModuleRef M, const char *Name) {
+ return wrap(unwrap(M)->getNamedGlobal(Name));
+}
+
+LLVMValueRef LLVMGetFirstGlobal(LLVMModuleRef M) {
+ Module *Mod = unwrap(M);
+ Module::global_iterator I = Mod->global_begin();
+ if (I == Mod->global_end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) {
+ Module *Mod = unwrap(M);
+ Module::global_iterator I = Mod->global_end();
+ if (I == Mod->global_begin())
+ return 0;
+ return wrap(--I);
+}
+
+LLVMValueRef LLVMGetNextGlobal(LLVMValueRef GlobalVar) {
+ GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar);
+ Module::global_iterator I = GV;
+ if (++I == GV->getParent()->global_end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMValueRef LLVMGetPreviousGlobal(LLVMValueRef GlobalVar) {
+ GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar);
+ Module::global_iterator I = GV;
+ if (I == GV->getParent()->global_begin())
+ return 0;
+ return wrap(--I);
+}
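+
+/* Illustrative forward-iteration idiom over a module's globals; the same
+   first/next pattern recurs below for functions, parameters, basic blocks,
+   and instructions:
+
+     for (LLVMValueRef G = LLVMGetFirstGlobal(M); G;
+          G = LLVMGetNextGlobal(G))
+       visit_global(G);  // visit_global is a hypothetical callback
+*/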
+
+void LLVMDeleteGlobal(LLVMValueRef GlobalVar) {
+ unwrap<GlobalVariable>(GlobalVar)->eraseFromParent();
+}
+
+LLVMValueRef LLVMGetInitializer(LLVMValueRef GlobalVar) {
+ return wrap(unwrap<GlobalVariable>(GlobalVar)->getInitializer());
+}
+
+void LLVMSetInitializer(LLVMValueRef GlobalVar, LLVMValueRef ConstantVal) {
+ unwrap<GlobalVariable>(GlobalVar)
+ ->setInitializer(unwrap<Constant>(ConstantVal));
+}
+
+int LLVMIsThreadLocal(LLVMValueRef GlobalVar) {
+ return unwrap<GlobalVariable>(GlobalVar)->isThreadLocal();
+}
+
+void LLVMSetThreadLocal(LLVMValueRef GlobalVar, int IsThreadLocal) {
+ unwrap<GlobalVariable>(GlobalVar)->setThreadLocal(IsThreadLocal != 0);
+}
+
+int LLVMIsGlobalConstant(LLVMValueRef GlobalVar) {
+ return unwrap<GlobalVariable>(GlobalVar)->isConstant();
+}
+
+void LLVMSetGlobalConstant(LLVMValueRef GlobalVar, int IsConstant) {
+ unwrap<GlobalVariable>(GlobalVar)->setConstant(IsConstant != 0);
+}
+
+/*--.. Operations on aliases ......................................--*/
+
+LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee,
+ const char *Name) {
+ return wrap(new GlobalAlias(unwrap(Ty), GlobalValue::ExternalLinkage, Name,
+ unwrap<Constant>(Aliasee), unwrap (M)));
+}
+
+/*--.. Operations on functions .............................................--*/
+
+LLVMValueRef LLVMAddFunction(LLVMModuleRef M, const char *Name,
+ LLVMTypeRef FunctionTy) {
+ return wrap(Function::Create(unwrap<FunctionType>(FunctionTy),
+ GlobalValue::ExternalLinkage, Name, unwrap(M)));
+}
+
+LLVMValueRef LLVMGetNamedFunction(LLVMModuleRef M, const char *Name) {
+ return wrap(unwrap(M)->getFunction(Name));
+}
+
+LLVMValueRef LLVMGetFirstFunction(LLVMModuleRef M) {
+ Module *Mod = unwrap(M);
+ Module::iterator I = Mod->begin();
+ if (I == Mod->end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) {
+ Module *Mod = unwrap(M);
+ Module::iterator I = Mod->end();
+ if (I == Mod->begin())
+ return 0;
+ return wrap(--I);
+}
+
+LLVMValueRef LLVMGetNextFunction(LLVMValueRef Fn) {
+ Function *Func = unwrap<Function>(Fn);
+ Module::iterator I = Func;
+ if (++I == Func->getParent()->end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMValueRef LLVMGetPreviousFunction(LLVMValueRef Fn) {
+ Function *Func = unwrap<Function>(Fn);
+ Module::iterator I = Func;
+ if (I == Func->getParent()->begin())
+ return 0;
+ return wrap(--I);
+}
+
+void LLVMDeleteFunction(LLVMValueRef Fn) {
+ unwrap<Function>(Fn)->eraseFromParent();
+}
+
+unsigned LLVMGetIntrinsicID(LLVMValueRef Fn) {
+ if (Function *F = dyn_cast<Function>(unwrap(Fn)))
+ return F->getIntrinsicID();
+ return 0;
+}
+
+unsigned LLVMGetFunctionCallConv(LLVMValueRef Fn) {
+ return unwrap<Function>(Fn)->getCallingConv();
+}
+
+void LLVMSetFunctionCallConv(LLVMValueRef Fn, unsigned CC) {
+ return unwrap<Function>(Fn)->setCallingConv(CC);
+}
+
+const char *LLVMGetGC(LLVMValueRef Fn) {
+ Function *F = unwrap<Function>(Fn);
+ return F->hasGC()? F->getGC() : 0;
+}
+
+void LLVMSetGC(LLVMValueRef Fn, const char *GC) {
+ Function *F = unwrap<Function>(Fn);
+ if (GC)
+ F->setGC(GC);
+ else
+ F->clearGC();
+}
+
+void LLVMAddFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA) {
+ Function *Func = unwrap<Function>(Fn);
+ const AttrListPtr PAL = Func->getAttributes();
+ const AttrListPtr PALnew = PAL.addAttr(0, PA);
+ Func->setAttributes(PALnew);
+}
+
+void LLVMRemoveFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA) {
+ Function *Func = unwrap<Function>(Fn);
+ const AttrListPtr PAL = Func->getAttributes();
+ const AttrListPtr PALnew = PAL.removeAttr(0, PA);
+ Func->setAttributes(PALnew);
+}
+
+/*--.. Operations on parameters ............................................--*/
+
+unsigned LLVMCountParams(LLVMValueRef FnRef) {
+ // This function is strictly redundant to
+ // LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(FnRef)))
+ return unwrap<Function>(FnRef)->arg_size();
+}
+
+void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) {
+ Function *Fn = unwrap<Function>(FnRef);
+ for (Function::arg_iterator I = Fn->arg_begin(),
+ E = Fn->arg_end(); I != E; I++)
+ *ParamRefs++ = wrap(I);
+}
+
+LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) {
+ Function::arg_iterator AI = unwrap<Function>(FnRef)->arg_begin();
+ while (index-- > 0)
+ ++AI;
+ return wrap(AI);
+}
+
+LLVMValueRef LLVMGetParamParent(LLVMValueRef V) {
+ return wrap(unwrap<Argument>(V)->getParent());
+}
+
+LLVMValueRef LLVMGetFirstParam(LLVMValueRef Fn) {
+ Function *Func = unwrap<Function>(Fn);
+ Function::arg_iterator I = Func->arg_begin();
+ if (I == Func->arg_end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) {
+ Function *Func = unwrap<Function>(Fn);
+ Function::arg_iterator I = Func->arg_end();
+ if (I == Func->arg_begin())
+ return 0;
+ return wrap(--I);
+}
+
+LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) {
+ Argument *A = unwrap<Argument>(Arg);
+ Function::arg_iterator I = A;
+ if (++I == A->getParent()->arg_end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) {
+ Argument *A = unwrap<Argument>(Arg);
+ Function::arg_iterator I = A;
+ if (I == A->getParent()->arg_begin())
+ return 0;
+ return wrap(--I);
+}
+
+void LLVMAddAttribute(LLVMValueRef Arg, LLVMAttribute PA) {
+ unwrap<Argument>(Arg)->addAttr(PA);
+}
+
+void LLVMRemoveAttribute(LLVMValueRef Arg, LLVMAttribute PA) {
+ unwrap<Argument>(Arg)->removeAttr(PA);
+}
+
+void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
+ unwrap<Argument>(Arg)->addAttr(
+ Attribute::constructAlignmentFromInt(align));
+}
+
+/*--.. Operations on basic blocks ..........................................--*/
+
+LLVMValueRef LLVMBasicBlockAsValue(LLVMBasicBlockRef BB) {
+ return wrap(static_cast<Value*>(unwrap(BB)));
+}
+
+int LLVMValueIsBasicBlock(LLVMValueRef Val) {
+ return isa<BasicBlock>(unwrap(Val));
+}
+
+LLVMBasicBlockRef LLVMValueAsBasicBlock(LLVMValueRef Val) {
+ return wrap(unwrap<BasicBlock>(Val));
+}
+
+LLVMValueRef LLVMGetBasicBlockParent(LLVMBasicBlockRef BB) {
+ return wrap(unwrap(BB)->getParent());
+}
+
+unsigned LLVMCountBasicBlocks(LLVMValueRef FnRef) {
+ return unwrap<Function>(FnRef)->size();
+}
+
+void LLVMGetBasicBlocks(LLVMValueRef FnRef, LLVMBasicBlockRef *BasicBlocksRefs){
+ Function *Fn = unwrap<Function>(FnRef);
+ for (Function::iterator I = Fn->begin(), E = Fn->end(); I != E; I++)
+ *BasicBlocksRefs++ = wrap(I);
+}
+
+LLVMBasicBlockRef LLVMGetEntryBasicBlock(LLVMValueRef Fn) {
+ return wrap(&unwrap<Function>(Fn)->getEntryBlock());
+}
+
+LLVMBasicBlockRef LLVMGetFirstBasicBlock(LLVMValueRef Fn) {
+ Function *Func = unwrap<Function>(Fn);
+ Function::iterator I = Func->begin();
+ if (I == Func->end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) {
+ Function *Func = unwrap<Function>(Fn);
+ Function::iterator I = Func->end();
+ if (I == Func->begin())
+ return 0;
+ return wrap(--I);
+}
+
+LLVMBasicBlockRef LLVMGetNextBasicBlock(LLVMBasicBlockRef BB) {
+ BasicBlock *Block = unwrap(BB);
+ Function::iterator I = Block;
+ if (++I == Block->getParent()->end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMBasicBlockRef LLVMGetPreviousBasicBlock(LLVMBasicBlockRef BB) {
+ BasicBlock *Block = unwrap(BB);
+ Function::iterator I = Block;
+ if (I == Block->getParent()->begin())
+ return 0;
+ return wrap(--I);
+}
+
+LLVMBasicBlockRef LLVMAppendBasicBlock(LLVMValueRef FnRef, const char *Name) {
+ return wrap(BasicBlock::Create(Name, unwrap<Function>(FnRef)));
+}
+
+LLVMBasicBlockRef LLVMInsertBasicBlock(LLVMBasicBlockRef InsertBeforeBBRef,
+ const char *Name) {
+ BasicBlock *InsertBeforeBB = unwrap(InsertBeforeBBRef);
+ return wrap(BasicBlock::Create(Name, InsertBeforeBB->getParent(),
+ InsertBeforeBB));
+}
+
+void LLVMDeleteBasicBlock(LLVMBasicBlockRef BBRef) {
+ unwrap(BBRef)->eraseFromParent();
+}
+
+/*--.. Operations on instructions ..........................................--*/
+
+LLVMBasicBlockRef LLVMGetInstructionParent(LLVMValueRef Inst) {
+ return wrap(unwrap<Instruction>(Inst)->getParent());
+}
+
+LLVMValueRef LLVMGetFirstInstruction(LLVMBasicBlockRef BB) {
+ BasicBlock *Block = unwrap(BB);
+ BasicBlock::iterator I = Block->begin();
+ if (I == Block->end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) {
+ BasicBlock *Block = unwrap(BB);
+ BasicBlock::iterator I = Block->end();
+ if (I == Block->begin())
+ return 0;
+ return wrap(--I);
+}
+
+LLVMValueRef LLVMGetNextInstruction(LLVMValueRef Inst) {
+ Instruction *Instr = unwrap<Instruction>(Inst);
+ BasicBlock::iterator I = Instr;
+ if (++I == Instr->getParent()->end())
+ return 0;
+ return wrap(I);
+}
+
+LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) {
+ Instruction *Instr = unwrap<Instruction>(Inst);
+ BasicBlock::iterator I = Instr;
+ if (I == Instr->getParent()->begin())
+ return 0;
+ return wrap(--I);
+}
+
+/*--.. Call and invoke instructions ........................................--*/
+
+unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr) {
+ Value *V = unwrap(Instr);
+ if (CallInst *CI = dyn_cast<CallInst>(V))
+ return CI->getCallingConv();
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(V))
+ return II->getCallingConv();
+ assert(0 && "LLVMGetInstructionCallConv applies only to call and invoke!");
+ return 0;
+}
+
+void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
+ Value *V = unwrap(Instr);
+ if (CallInst *CI = dyn_cast<CallInst>(V))
+ return CI->setCallingConv(CC);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(V))
+ return II->setCallingConv(CC);
+ assert(0 && "LLVMSetInstructionCallConv applies only to call and invoke!");
+}
+
+void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
+ LLVMAttribute PA) {
+ CallSite Call = CallSite(unwrap<Instruction>(Instr));
+ Call.setAttributes(
+ Call.getAttributes().addAttr(index, PA));
+}
+
+void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
+ LLVMAttribute PA) {
+ CallSite Call = CallSite(unwrap<Instruction>(Instr));
+ Call.setAttributes(
+ Call.getAttributes().removeAttr(index, PA));
+}
+
+void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
+ unsigned align) {
+ CallSite Call = CallSite(unwrap<Instruction>(Instr));
+ Call.setAttributes(
+ Call.getAttributes().addAttr(index,
+ Attribute::constructAlignmentFromInt(align)));
+}
+
+/*--.. Operations on call instructions (only) ..............................--*/
+
+int LLVMIsTailCall(LLVMValueRef Call) {
+ return unwrap<CallInst>(Call)->isTailCall();
+}
+
+void LLVMSetTailCall(LLVMValueRef Call, int isTailCall) {
+ unwrap<CallInst>(Call)->setTailCall(isTailCall);
+}
+
+/*--.. Operations on phi nodes .............................................--*/
+
+void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues,
+ LLVMBasicBlockRef *IncomingBlocks, unsigned Count) {
+ PHINode *PhiVal = unwrap<PHINode>(PhiNode);
+ for (unsigned I = 0; I != Count; ++I)
+ PhiVal->addIncoming(unwrap(IncomingValues[I]), unwrap(IncomingBlocks[I]));
+}
+
+unsigned LLVMCountIncoming(LLVMValueRef PhiNode) {
+ return unwrap<PHINode>(PhiNode)->getNumIncomingValues();
+}
+
+LLVMValueRef LLVMGetIncomingValue(LLVMValueRef PhiNode, unsigned Index) {
+ return wrap(unwrap<PHINode>(PhiNode)->getIncomingValue(Index));
+}
+
+LLVMBasicBlockRef LLVMGetIncomingBlock(LLVMValueRef PhiNode, unsigned Index) {
+ return wrap(unwrap<PHINode>(PhiNode)->getIncomingBlock(Index));
+}
+
+
+/*===-- Instruction builders ----------------------------------------------===*/
+
+LLVMBuilderRef LLVMCreateBuilder(void) {
+ return wrap(new IRBuilder<>());
+}
+
+void LLVMPositionBuilder(LLVMBuilderRef Builder, LLVMBasicBlockRef Block,
+ LLVMValueRef Instr) {
+ BasicBlock *BB = unwrap(Block);
+ Instruction *I = Instr? unwrap<Instruction>(Instr) : (Instruction*) BB->end();
+ unwrap(Builder)->SetInsertPoint(BB, I);
+}
+
+void LLVMPositionBuilderBefore(LLVMBuilderRef Builder, LLVMValueRef Instr) {
+ Instruction *I = unwrap<Instruction>(Instr);
+ unwrap(Builder)->SetInsertPoint(I->getParent(), I);
+}
+
+void LLVMPositionBuilderAtEnd(LLVMBuilderRef Builder, LLVMBasicBlockRef Block) {
+ BasicBlock *BB = unwrap(Block);
+ unwrap(Builder)->SetInsertPoint(BB);
+}
+
+LLVMBasicBlockRef LLVMGetInsertBlock(LLVMBuilderRef Builder) {
+ return wrap(unwrap(Builder)->GetInsertBlock());
+}
+
+void LLVMClearInsertionPosition(LLVMBuilderRef Builder) {
+ unwrap(Builder)->ClearInsertionPoint();
+}
+
+void LLVMInsertIntoBuilder(LLVMBuilderRef Builder, LLVMValueRef Instr) {
+ unwrap(Builder)->Insert(unwrap<Instruction>(Instr));
+}
+
+void LLVMDisposeBuilder(LLVMBuilderRef Builder) {
+ delete unwrap(Builder);
+}
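+
+/* Illustrative end-to-end sketch of the builder functions defined below:
+   emitting the body of an i32 (i32, i32) addition function.  Assumes Fn
+   was created with LLVMAddFunction and has that type.
+
+     LLVMBuilderRef B = LLVMCreateBuilder();
+     LLVMBasicBlockRef Entry = LLVMAppendBasicBlock(Fn, "entry");
+     LLVMPositionBuilderAtEnd(B, Entry);
+     LLVMValueRef Sum = LLVMBuildAdd(B, LLVMGetParam(Fn, 0),
+                                     LLVMGetParam(Fn, 1), "sum");
+     LLVMBuildRet(B, Sum);
+     LLVMDisposeBuilder(B);
+*/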
+
+/*--.. Instruction builders ................................................--*/
+
+LLVMValueRef LLVMBuildRetVoid(LLVMBuilderRef B) {
+ return wrap(unwrap(B)->CreateRetVoid());
+}
+
+LLVMValueRef LLVMBuildRet(LLVMBuilderRef B, LLVMValueRef V) {
+ return wrap(unwrap(B)->CreateRet(unwrap(V)));
+}
+
+LLVMValueRef LLVMBuildBr(LLVMBuilderRef B, LLVMBasicBlockRef Dest) {
+ return wrap(unwrap(B)->CreateBr(unwrap(Dest)));
+}
+
+LLVMValueRef LLVMBuildCondBr(LLVMBuilderRef B, LLVMValueRef If,
+ LLVMBasicBlockRef Then, LLVMBasicBlockRef Else) {
+ return wrap(unwrap(B)->CreateCondBr(unwrap(If), unwrap(Then), unwrap(Else)));
+}
+
+LLVMValueRef LLVMBuildSwitch(LLVMBuilderRef B, LLVMValueRef V,
+ LLVMBasicBlockRef Else, unsigned NumCases) {
+ return wrap(unwrap(B)->CreateSwitch(unwrap(V), unwrap(Else), NumCases));
+}
+
+LLVMValueRef LLVMBuildInvoke(LLVMBuilderRef B, LLVMValueRef Fn,
+ LLVMValueRef *Args, unsigned NumArgs,
+ LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateInvoke(unwrap(Fn), unwrap(Then), unwrap(Catch),
+ unwrap(Args), unwrap(Args) + NumArgs,
+ Name));
+}
+
+LLVMValueRef LLVMBuildUnwind(LLVMBuilderRef B) {
+ return wrap(unwrap(B)->CreateUnwind());
+}
+
+LLVMValueRef LLVMBuildUnreachable(LLVMBuilderRef B) {
+ return wrap(unwrap(B)->CreateUnreachable());
+}
+
+void LLVMAddCase(LLVMValueRef Switch, LLVMValueRef OnVal,
+ LLVMBasicBlockRef Dest) {
+ unwrap<SwitchInst>(Switch)->addCase(unwrap<ConstantInt>(OnVal), unwrap(Dest));
+}
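+
+/* Illustrative: a switch is created with a case-count hint and populated
+   one case at a time (B, V, and the target blocks are assumed to exist):
+
+     LLVMValueRef Sw = LLVMBuildSwitch(B, V, DefaultBB, 2);
+     LLVMAddCase(Sw, LLVMConstInt(LLVMInt32Type(), 0, 0), ZeroBB);
+     LLVMAddCase(Sw, LLVMConstInt(LLVMInt32Type(), 1, 0), OneBB);
+*/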
+
+/*--.. Arithmetic ..........................................................--*/
+
+LLVMValueRef LLVMBuildAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateAdd(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateSub(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateMul(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildUDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateUDiv(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildSDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateSDiv(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateFDiv(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildURem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateURem(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildSRem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateSRem(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFRem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateFRem(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildShl(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateShl(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildLShr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateLShr(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildAShr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateAShr(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildAnd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateAnd(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildOr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateOr(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildXor(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateXor(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
+ return wrap(unwrap(B)->CreateNeg(unwrap(V), Name));
+}
+
+LLVMValueRef LLVMBuildNot(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
+ return wrap(unwrap(B)->CreateNot(unwrap(V), Name));
+}
+
+/*--.. Memory ..............................................................--*/
+
+LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateMalloc(unwrap(Ty), 0, Name));
+}
+
+LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
+ LLVMValueRef Val, const char *Name) {
+ return wrap(unwrap(B)->CreateMalloc(unwrap(Ty), unwrap(Val), Name));
+}
+
+LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), 0, Name));
+}
+
+LLVMValueRef LLVMBuildArrayAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
+ LLVMValueRef Val, const char *Name) {
+ return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), unwrap(Val), Name));
+}
+
+LLVMValueRef LLVMBuildFree(LLVMBuilderRef B, LLVMValueRef PointerVal) {
+ return wrap(unwrap(B)->CreateFree(unwrap(PointerVal)));
+}
+
+
+LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateLoad(unwrap(PointerVal), Name));
+}
+
+LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMValueRef PointerVal) {
+ return wrap(unwrap(B)->CreateStore(unwrap(Val), unwrap(PointerVal)));
+}
+
+LLVMValueRef LLVMBuildGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
+ LLVMValueRef *Indices, unsigned NumIndices,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateGEP(unwrap(Pointer), unwrap(Indices),
+ unwrap(Indices) + NumIndices, Name));
+}
+
+/*--.. Casts ...............................................................--*/
+
+LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateTrunc(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildZExt(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateZExt(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildSExt(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateSExt(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildFPToUI(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateFPToUI(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildFPToSI(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateFPToSI(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildUIToFP(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateUIToFP(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildSIToFP(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateSIToFP(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildFPTrunc(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateFPTrunc(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildFPExt(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateFPExt(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildPtrToInt(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreatePtrToInt(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildIntToPtr(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateIntToPtr(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildBitCast(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateBitCast(unwrap(Val), unwrap(DestTy), Name));
+}
+
+/*--.. Comparisons .........................................................--*/
+
+LLVMValueRef LLVMBuildICmp(LLVMBuilderRef B, LLVMIntPredicate Op,
+ LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateICmp(static_cast<ICmpInst::Predicate>(Op),
+ unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFCmp(LLVMBuilderRef B, LLVMRealPredicate Op,
+ LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateFCmp(static_cast<FCmpInst::Predicate>(Op),
+ unwrap(LHS), unwrap(RHS), Name));
+}
+
+/*--.. Miscellaneous instructions ..........................................--*/
+
+LLVMValueRef LLVMBuildPhi(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name) {
+ return wrap(unwrap(B)->CreatePHI(unwrap(Ty), Name));
+}
+
+LLVMValueRef LLVMBuildCall(LLVMBuilderRef B, LLVMValueRef Fn,
+ LLVMValueRef *Args, unsigned NumArgs,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateCall(unwrap(Fn), unwrap(Args),
+ unwrap(Args) + NumArgs, Name));
+}
+
+LLVMValueRef LLVMBuildSelect(LLVMBuilderRef B, LLVMValueRef If,
+ LLVMValueRef Then, LLVMValueRef Else,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateSelect(unwrap(If), unwrap(Then), unwrap(Else),
+ Name));
+}
+
+LLVMValueRef LLVMBuildVAArg(LLVMBuilderRef B, LLVMValueRef List,
+ LLVMTypeRef Ty, const char *Name) {
+ return wrap(unwrap(B)->CreateVAArg(unwrap(List), unwrap(Ty), Name));
+}
+
+LLVMValueRef LLVMBuildExtractElement(LLVMBuilderRef B, LLVMValueRef VecVal,
+ LLVMValueRef Index, const char *Name) {
+ return wrap(unwrap(B)->CreateExtractElement(unwrap(VecVal), unwrap(Index),
+ Name));
+}
+
+LLVMValueRef LLVMBuildInsertElement(LLVMBuilderRef B, LLVMValueRef VecVal,
+ LLVMValueRef EltVal, LLVMValueRef Index,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateInsertElement(unwrap(VecVal), unwrap(EltVal),
+ unwrap(Index), Name));
+}
+
+LLVMValueRef LLVMBuildShuffleVector(LLVMBuilderRef B, LLVMValueRef V1,
+ LLVMValueRef V2, LLVMValueRef Mask,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateShuffleVector(unwrap(V1), unwrap(V2),
+ unwrap(Mask), Name));
+}
+
+LLVMValueRef LLVMBuildExtractValue(LLVMBuilderRef B, LLVMValueRef AggVal,
+ unsigned Index, const char *Name) {
+ return wrap(unwrap(B)->CreateExtractValue(unwrap(AggVal), Index, Name));
+}
+
+LLVMValueRef LLVMBuildInsertValue(LLVMBuilderRef B, LLVMValueRef AggVal,
+ LLVMValueRef EltVal, unsigned Index,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateInsertValue(unwrap(AggVal), unwrap(EltVal),
+ Index, Name));
+}
+
+
+/*===-- Module providers --------------------------------------------------===*/
+
+LLVMModuleProviderRef
+LLVMCreateModuleProviderForExistingModule(LLVMModuleRef M) {
+ return wrap(new ExistingModuleProvider(unwrap(M)));
+}
+
+void LLVMDisposeModuleProvider(LLVMModuleProviderRef MP) {
+ delete unwrap(MP);
+}
+
+
+/*===-- Memory buffers ----------------------------------------------------===*/
+
+int LLVMCreateMemoryBufferWithContentsOfFile(const char *Path,
+ LLVMMemoryBufferRef *OutMemBuf,
+ char **OutMessage) {
+ std::string Error;
+ if (MemoryBuffer *MB = MemoryBuffer::getFile(Path, &Error)) {
+ *OutMemBuf = wrap(MB);
+ return 0;
+ }
+
+ *OutMessage = strdup(Error.c_str());
+ return 1;
+}
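+
+/* Illustrative error-handling pattern for this entry point ("input.bc" is
+   a made-up path); a zero return means success, otherwise the caller owns
+   the strdup'ed message:
+
+     LLVMMemoryBufferRef MB;
+     char *Msg;
+     if (LLVMCreateMemoryBufferWithContentsOfFile("input.bc", &MB, &Msg)) {
+       fprintf(stderr, "error: %s\n", Msg);  // <stdio.h>
+       free(Msg);                            // <stdlib.h>
+     } else {
+       // ... use MB ..., then:
+       LLVMDisposeMemoryBuffer(MB);
+     }
+*/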
+
+int LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf,
+ char **OutMessage) {
+ if (MemoryBuffer *MB = MemoryBuffer::getSTDIN()) {
+ *OutMemBuf = wrap(MB);
+ return 0;
+ }
+
+ *OutMessage = strdup("stdin is empty.");
+ return 1;
+}
+
+void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf) {
+ delete unwrap(MemBuf);
+}
diff --git a/lib/VMCore/Dominators.cpp b/lib/VMCore/Dominators.cpp
new file mode 100644
index 0000000..735a70c
--- /dev/null
+++ b/lib/VMCore/Dominators.cpp
@@ -0,0 +1,287 @@
+//===- Dominators.cpp - Dominator Calculation -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// forward dominators. Postdominators are available in libanalysis, but are not
+// included in libvmcore, because they are not needed. Forward dominators are
+// needed to support the Verifier pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/Streams.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace llvm {
+static std::ostream &operator<<(std::ostream &o,
+ const std::set<BasicBlock*> &BBs) {
+ for (std::set<BasicBlock*>::const_iterator I = BBs.begin(), E = BBs.end();
+ I != E; ++I)
+ if (*I)
+ WriteAsOperand(o, *I, false);
+ else
+ o << " <<exit node>>";
+ return o;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// DominatorTree Implementation
+//===----------------------------------------------------------------------===//
+//
+// Provide public access to DominatorTree information. Implementation details
+// can be found in DominatorInternals.h.
+//
+//===----------------------------------------------------------------------===//
+
+TEMPLATE_INSTANTIATION(class DomTreeNodeBase<BasicBlock>);
+TEMPLATE_INSTANTIATION(class DominatorTreeBase<BasicBlock>);
+
+char DominatorTree::ID = 0;
+static RegisterPass<DominatorTree>
+E("domtree", "Dominator Tree Construction", true, true);
+
+bool DominatorTree::runOnFunction(Function &F) {
+ DT->recalculate(F);
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// DominanceFrontier Implementation
+//===----------------------------------------------------------------------===//
+
+char DominanceFrontier::ID = 0;
+static RegisterPass<DominanceFrontier>
+G("domfrontier", "Dominance Frontier Construction", true, true);
+
+// NewBB is split and now it has one successor. Update the dominance frontier to
+// reflect this change.
+void DominanceFrontier::splitBlock(BasicBlock *NewBB) {
+ assert(NewBB->getTerminator()->getNumSuccessors() == 1
+ && "NewBB should have a single successor!");
+ BasicBlock *NewBBSucc = NewBB->getTerminator()->getSuccessor(0);
+
+ std::vector<BasicBlock*> PredBlocks;
+ for (pred_iterator PI = pred_begin(NewBB), PE = pred_end(NewBB);
+ PI != PE; ++PI)
+ PredBlocks.push_back(*PI);
+
+ if (PredBlocks.empty())
+ // If NewBB does not have any predecessors then it is an entry block.
+ // In this case, NewBB and its successor NewBBSucc dominate all
+ // other blocks.
+ return;
+
+ // NewBBSucc inherits original NewBB frontier.
+ DominanceFrontier::iterator NewBBI = find(NewBB);
+ if (NewBBI != end()) {
+ DominanceFrontier::DomSetType NewBBSet = NewBBI->second;
+ DominanceFrontier::DomSetType NewBBSuccSet;
+ NewBBSuccSet.insert(NewBBSet.begin(), NewBBSet.end());
+ addBasicBlock(NewBBSucc, NewBBSuccSet);
+ }
+
+ // If NewBB dominates NewBBSucc, then DF(NewBB) is now going to be the
+ // DF(PredBlocks[0]) without the stuff that the new block does not dominate
+ // a predecessor of.
+ DominatorTree &DT = getAnalysis<DominatorTree>();
+ if (DT.dominates(NewBB, NewBBSucc)) {
+ DominanceFrontier::iterator DFI = find(PredBlocks[0]);
+ if (DFI != end()) {
+ DominanceFrontier::DomSetType Set = DFI->second;
+ // Filter out stuff in Set that we do not dominate a predecessor of.
+ for (DominanceFrontier::DomSetType::iterator SetI = Set.begin(),
+ E = Set.end(); SetI != E;) {
+ bool DominatesPred = false;
+ for (pred_iterator PI = pred_begin(*SetI), E = pred_end(*SetI);
+ PI != E; ++PI)
+ if (DT.dominates(NewBB, *PI))
+ DominatesPred = true;
+ if (!DominatesPred)
+ Set.erase(SetI++);
+ else
+ ++SetI;
+ }
+
+ if (NewBBI != end()) {
+ for (DominanceFrontier::DomSetType::iterator SetI = Set.begin(),
+ E = Set.end(); SetI != E; ++SetI) {
+ BasicBlock *SB = *SetI;
+ addToFrontier(NewBBI, SB);
+ }
+ } else
+ addBasicBlock(NewBB, Set);
+ }
+
+ } else {
+ // DF(NewBB) is {NewBBSucc} because NewBB does not strictly dominate
+ // NewBBSucc, but it does dominate itself (and there is an edge (NewBB ->
+ // NewBBSucc)). NewBBSucc is the single successor of NewBB.
+ DominanceFrontier::DomSetType NewDFSet;
+ NewDFSet.insert(NewBBSucc);
+ addBasicBlock(NewBB, NewDFSet);
+ }
+
+ // Now we must loop over all of the dominance frontiers in the function,
+ // replacing occurrences of NewBBSucc with NewBB in some cases. All
+ // blocks that dominate a block in PredBlocks and contained NewBBSucc in
+ // their dominance frontier must be updated to contain NewBB instead.
+ //
+ for (Function::iterator FI = NewBB->getParent()->begin(),
+ FE = NewBB->getParent()->end(); FI != FE; ++FI) {
+ DominanceFrontier::iterator DFI = find(FI);
+ if (DFI == end()) continue; // unreachable block.
+
+ // Only consider nodes that have NewBBSucc in their dominator frontier.
+ if (!DFI->second.count(NewBBSucc)) continue;
+
+ // Verify whether this block dominates a block in predblocks. If not, do
+ // not update it.
+ bool BlockDominatesAny = false;
+ for (std::vector<BasicBlock*>::const_iterator BI = PredBlocks.begin(),
+ BE = PredBlocks.end(); BI != BE; ++BI) {
+ if (DT.dominates(FI, *BI)) {
+ BlockDominatesAny = true;
+ break;
+ }
+ }
+
+ // If NewBBSucc should not stay in our dominator frontier, remove it.
+ // We remove it unless there is a predecessor of NewBBSucc that we
+ // dominate, but we don't strictly dominate NewBBSucc.
+ bool ShouldRemove = true;
+ if ((BasicBlock*)FI == NewBBSucc || !DT.dominates(FI, NewBBSucc)) {
+ // Okay, we know that PredDom does not strictly dominate NewBBSucc.
+ // Check to see if it dominates any predecessors of NewBBSucc.
+ for (pred_iterator PI = pred_begin(NewBBSucc),
+ E = pred_end(NewBBSucc); PI != E; ++PI)
+ if (DT.dominates(FI, *PI)) {
+ ShouldRemove = false;
+ break;
+ }
+ }
+
+ if (ShouldRemove)
+ removeFromFrontier(DFI, NewBBSucc);
+ if (BlockDominatesAny && (&*FI == NewBB || !DT.dominates(FI, NewBB)))
+ addToFrontier(DFI, NewBB);
+ }
+}
+
+namespace {
+ class DFCalculateWorkObject {
+ public:
+ DFCalculateWorkObject(BasicBlock *B, BasicBlock *P,
+ const DomTreeNode *N,
+ const DomTreeNode *PN)
+ : currentBB(B), parentBB(P), Node(N), parentNode(PN) {}
+ BasicBlock *currentBB;
+ BasicBlock *parentBB;
+ const DomTreeNode *Node;
+ const DomTreeNode *parentNode;
+ };
+}
+
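+// The worklist loop below implements the standard dominance-frontier
+// recurrence (Cytron et al.):
+//
+//   DF(X) = DFlocal(X) union DFup(Z), for each child Z of X in the dom tree
+//
+// where DFlocal(X) is the set of CFG successors of X not immediately
+// dominated by X, and DFup(Z) is the subset of DF(Z) not properly dominated
+// by X.  The explicit workList replaces the textbook post-order recursion.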
+const DominanceFrontier::DomSetType &
+DominanceFrontier::calculate(const DominatorTree &DT,
+ const DomTreeNode *Node) {
+ BasicBlock *BB = Node->getBlock();
+ DomSetType *Result = NULL;
+
+ std::vector<DFCalculateWorkObject> workList;
+ SmallPtrSet<BasicBlock *, 32> visited;
+
+ workList.push_back(DFCalculateWorkObject(BB, NULL, Node, NULL));
+ do {
+ DFCalculateWorkObject *currentW = &workList.back();
+ assert (currentW && "Missing work object.");
+
+ BasicBlock *currentBB = currentW->currentBB;
+ BasicBlock *parentBB = currentW->parentBB;
+ const DomTreeNode *currentNode = currentW->Node;
+ const DomTreeNode *parentNode = currentW->parentNode;
+ assert (currentBB && "Invalid work object. Missing current Basic Block");
+ assert (currentNode && "Invalid work object. Missing current Node");
+ DomSetType &S = Frontiers[currentBB];
+
+ // Visit each block only once.
+ if (visited.count(currentBB) == 0) {
+ visited.insert(currentBB);
+
+ // Loop over CFG successors to calculate DFlocal[currentNode]
+ for (succ_iterator SI = succ_begin(currentBB), SE = succ_end(currentBB);
+ SI != SE; ++SI) {
+ // Does Node immediately dominate this successor?
+ if (DT[*SI]->getIDom() != currentNode)
+ S.insert(*SI);
+ }
+ }
+
+ // At this point, S is DFlocal. Now we union in DFup's of our children...
+ // Loop through and visit the nodes that Node immediately dominates (Node's
+ // children in the IDomTree)
+ bool visitChild = false;
+ for (DomTreeNode::const_iterator NI = currentNode->begin(),
+ NE = currentNode->end(); NI != NE; ++NI) {
+ DomTreeNode *IDominee = *NI;
+ BasicBlock *childBB = IDominee->getBlock();
+ if (visited.count(childBB) == 0) {
+ workList.push_back(DFCalculateWorkObject(childBB, currentBB,
+ IDominee, currentNode));
+ visitChild = true;
+ }
+ }
+
+ // If no unvisited children were pushed above, this block is finished:
+ // propagate its DFup contribution to the parent (if any) and pop it
+ // from the workList.
+ if (!visitChild) {
+
+ if (!parentBB) {
+ Result = &S;
+ break;
+ }
+
+ DomSetType::const_iterator CDFI = S.begin(), CDFE = S.end();
+ DomSetType &parentSet = Frontiers[parentBB];
+ for (; CDFI != CDFE; ++CDFI) {
+ if (!DT.properlyDominates(parentNode, DT[*CDFI]))
+ parentSet.insert(*CDFI);
+ }
+ workList.pop_back();
+ }
+
+ } while (!workList.empty());
+
+ return *Result;
+}
+
+void DominanceFrontierBase::print(std::ostream &o, const Module* ) const {
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ o << " DomFrontier for BB";
+ if (I->first)
+ WriteAsOperand(o, I->first, false);
+ else
+ o << " <<exit node>>";
+ o << " is:\t" << I->second << "\n";
+ }
+}
+
+void DominanceFrontierBase::dump() {
+ print (llvm::cerr);
+}
+
diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp
new file mode 100644
index 0000000..3a991f6
--- /dev/null
+++ b/lib/VMCore/Function.cpp
@@ -0,0 +1,367 @@
+//===-- Function.cpp - Implement the Function class -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Function class for the VMCore library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/StringPool.h"
+#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+
+// Explicit instantiations of SymbolTableListTraits since some of the methods
+// are not in the public header file...
+template class SymbolTableListTraits<Argument, Function>;
+template class SymbolTableListTraits<BasicBlock, Function>;
+
+//===----------------------------------------------------------------------===//
+// Argument Implementation
+//===----------------------------------------------------------------------===//
+
+Argument::Argument(const Type *Ty, const std::string &Name, Function *Par)
+ : Value(Ty, Value::ArgumentVal) {
+ Parent = 0;
+
+ // Make sure that we get added to a function
+ LeakDetector::addGarbageObject(this);
+
+ if (Par)
+ Par->getArgumentList().push_back(this);
+ setName(Name);
+}
+
+void Argument::setParent(Function *parent) {
+ if (getParent())
+ LeakDetector::addGarbageObject(this);
+ Parent = parent;
+ if (getParent())
+ LeakDetector::removeGarbageObject(this);
+}
+
+/// getArgNo - Return the index of this formal argument in its containing
+/// function. For example in "void foo(int a, float b)" a is 0 and b is 1.
+unsigned Argument::getArgNo() const {
+ const Function *F = getParent();
+ assert(F && "Argument is not in a function");
+
+ Function::const_arg_iterator AI = F->arg_begin();
+ unsigned ArgIdx = 0;
+ for (; &*AI != this; ++AI)
+ ++ArgIdx;
+
+ return ArgIdx;
+}
+
+/// hasByValAttr - Return true if this argument has the byval attribute on it
+/// in its containing function.
+bool Argument::hasByValAttr() const {
+ if (!isa<PointerType>(getType())) return false;
+ return getParent()->paramHasAttr(getArgNo()+1, Attribute::ByVal);
+}
+
+/// hasNoAliasAttr - Return true if this argument has the noalias attribute on
+/// it in its containing function.
+bool Argument::hasNoAliasAttr() const {
+ if (!isa<PointerType>(getType())) return false;
+ return getParent()->paramHasAttr(getArgNo()+1, Attribute::NoAlias);
+}
+
+/// hasNoCaptureAttr - Return true if this argument has the nocapture attribute
+/// on it in its containing function.
+bool Argument::hasNoCaptureAttr() const {
+ if (!isa<PointerType>(getType())) return false;
+ return getParent()->paramHasAttr(getArgNo()+1, Attribute::NoCapture);
+}
+
+/// hasSRetAttr - Return true if this argument has the sret attribute on
+/// it in its containing function.
+bool Argument::hasStructRetAttr() const {
+ if (!isa<PointerType>(getType())) return false;
+ if (this != getParent()->arg_begin())
+ return false; // StructRet param must be first param
+ return getParent()->paramHasAttr(1, Attribute::StructRet);
+}
+
+/// addAttr - Add an Attribute to an argument
+void Argument::addAttr(Attributes attr) {
+ getParent()->addAttribute(getArgNo() + 1, attr);
+}
+
+/// removeAttr - Remove a Attribute from an argument
+void Argument::removeAttr(Attributes attr) {
+ getParent()->removeAttribute(getArgNo() + 1, attr);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Helper Methods in Function
+//===----------------------------------------------------------------------===//
+
+const FunctionType *Function::getFunctionType() const {
+ return cast<FunctionType>(getType()->getElementType());
+}
+
+bool Function::isVarArg() const {
+ return getFunctionType()->isVarArg();
+}
+
+const Type *Function::getReturnType() const {
+ return getFunctionType()->getReturnType();
+}
+
+void Function::removeFromParent() {
+ getParent()->getFunctionList().remove(this);
+}
+
+void Function::eraseFromParent() {
+ getParent()->getFunctionList().erase(this);
+}
+
+//===----------------------------------------------------------------------===//
+// Function Implementation
+//===----------------------------------------------------------------------===//
+
+Function::Function(const FunctionType *Ty, LinkageTypes Linkage,
+ const std::string &name, Module *ParentModule)
+ : GlobalValue(PointerType::getUnqual(Ty),
+ Value::FunctionVal, 0, 0, Linkage, name) {
+ assert(FunctionType::isValidReturnType(getReturnType()) &&
+ !isa<OpaqueType>(getReturnType()) && "invalid return type");
+ SymTab = new ValueSymbolTable();
+
+ // If the function has arguments, mark them as lazily built.
+ if (Ty->getNumParams())
+ SubclassData = 1; // Set the "has lazy arguments" bit.
+
+ // Make sure that we get added to a module
+ LeakDetector::addGarbageObject(this);
+
+ if (ParentModule)
+ ParentModule->getFunctionList().push_back(this);
+
+ // Ensure intrinsics have the right parameter attributes.
+ if (unsigned IID = getIntrinsicID())
+ setAttributes(Intrinsic::getAttributes(Intrinsic::ID(IID)));
+}
+
+Function::~Function() {
+ dropAllReferences(); // After this it is safe to delete instructions.
+
+ // Delete all of the method arguments and unlink from symbol table...
+ ArgumentList.clear();
+ delete SymTab;
+
+ // Remove the function from the on-the-side GC table.
+ clearGC();
+}
+
+void Function::BuildLazyArguments() const {
+ // Create the arguments vector, all arguments start out unnamed.
+ const FunctionType *FT = getFunctionType();
+ for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+ assert(FT->getParamType(i) != Type::VoidTy &&
+ "Cannot have void typed arguments!");
+ ArgumentList.push_back(new Argument(FT->getParamType(i)));
+ }
+
+ // Clear the lazy arguments bit.
+ const_cast<Function*>(this)->SubclassData &= ~1;
+}
+
+size_t Function::arg_size() const {
+ return getFunctionType()->getNumParams();
+}
+bool Function::arg_empty() const {
+ return getFunctionType()->getNumParams() == 0;
+}
+
+void Function::setParent(Module *parent) {
+ if (getParent())
+ LeakDetector::addGarbageObject(this);
+ Parent = parent;
+ if (getParent())
+ LeakDetector::removeGarbageObject(this);
+}
+
+// dropAllReferences() - This function causes all the subinstructions to "let
+// go" of all references that they are maintaining. This allows one to
+// 'delete' a whole class at a time, even though there may be circular
+// references... first all references are dropped, and all use counts go to
+// zero. Then everything is deleted for real. Note that no operations are
+// valid on an object that has "dropped all references", except operator
+// delete.
+//
+void Function::dropAllReferences() {
+ for (iterator I = begin(), E = end(); I != E; ++I)
+ I->dropAllReferences();
+ BasicBlocks.clear(); // Delete all basic blocks...
+}
+
+void Function::addAttribute(unsigned i, Attributes attr) {
+ AttrListPtr PAL = getAttributes();
+ PAL = PAL.addAttr(i, attr);
+ setAttributes(PAL);
+}
+
+void Function::removeAttribute(unsigned i, Attributes attr) {
+ AttrListPtr PAL = getAttributes();
+ PAL = PAL.removeAttr(i, attr);
+ setAttributes(PAL);
+}
+
+// Maintain the GC name for each function in an on-the-side table. This saves
+// allocating an additional word in Function for programs which do not use GC
+// (i.e., most programs) at the cost of increased overhead for clients which do
+// use GC.
+static DenseMap<const Function*,PooledStringPtr> *GCNames;
+static StringPool *GCNamePool;
+
+bool Function::hasGC() const {
+ return GCNames && GCNames->count(this);
+}
+
+const char *Function::getGC() const {
+ assert(hasGC() && "Function has no collector");
+ return *(*GCNames)[this];
+}
+
+void Function::setGC(const char *Str) {
+ if (!GCNamePool)
+ GCNamePool = new StringPool();
+ if (!GCNames)
+ GCNames = new DenseMap<const Function*,PooledStringPtr>();
+ (*GCNames)[this] = GCNamePool->intern(Str);
+}
+
+void Function::clearGC() {
+ if (GCNames) {
+ GCNames->erase(this);
+ if (GCNames->empty()) {
+ delete GCNames;
+ GCNames = 0;
+ if (GCNamePool->empty()) {
+ delete GCNamePool;
+ GCNamePool = 0;
+ }
+ }
+ }
+}
+
+/// copyAttributesFrom - copy all additional attributes (those not needed to
+/// create a Function) from the Function Src to this one.
+void Function::copyAttributesFrom(const GlobalValue *Src) {
+ assert(isa<Function>(Src) && "Expected a Function!");
+ GlobalValue::copyAttributesFrom(Src);
+ const Function *SrcF = cast<Function>(Src);
+ setCallingConv(SrcF->getCallingConv());
+ setAttributes(SrcF->getAttributes());
+ if (SrcF->hasGC())
+ setGC(SrcF->getGC());
+ else
+ clearGC();
+}
+
+/// getIntrinsicID - This method returns the ID number of the intrinsic that
+/// this function represents, or Intrinsic::not_intrinsic if the function is
+/// not an intrinsic or has no name. Intrinsic::not_intrinsic is defined to
+/// be zero to allow easy checking for whether a function is intrinsic or
+/// not. The particular intrinsic functions which correspond to this value
+/// are defined in llvm/Intrinsics.h.
+///
+unsigned Function::getIntrinsicID() const {
+ const ValueName *ValName = this->getValueName();
+ if (!ValName)
+ return 0;
+ unsigned Len = ValName->getKeyLength();
+ const char *Name = ValName->getKeyData();
+
+ if (Len < 5 || Name[4] != '.' || Name[0] != 'l' || Name[1] != 'l'
+ || Name[2] != 'v' || Name[3] != 'm')
+ return 0; // All intrinsics start with 'llvm.'
+
+#define GET_FUNCTION_RECOGNIZER
+#include "llvm/Intrinsics.gen"
+#undef GET_FUNCTION_RECOGNIZER
+ return 0;
+}
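+
+// Usage sketch (illustrative): intrinsic IDs are nonzero, so the result can
+// be tested directly as a boolean, as the Function constructor above does:
+//
+//   if (unsigned IID = F->getIntrinsicID())  // hypothetical Function *F
+//     ...F is a declaration of the llvm.* intrinsic numbered IID...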
+
+std::string Intrinsic::getName(ID id, const Type **Tys, unsigned numTys) {
+ assert(id < num_intrinsics && "Invalid intrinsic ID!");
+ const char * const Table[] = {
+ "not_intrinsic",
+#define GET_INTRINSIC_NAME_TABLE
+#include "llvm/Intrinsics.gen"
+#undef GET_INTRINSIC_NAME_TABLE
+ };
+ if (numTys == 0)
+ return Table[id];
+ std::string Result(Table[id]);
+ for (unsigned i = 0; i < numTys; ++i) {
+ if (const PointerType* PTyp = dyn_cast<PointerType>(Tys[i])) {
+ Result += ".p" + llvm::utostr(PTyp->getAddressSpace()) +
+ MVT::getMVT(PTyp->getElementType()).getMVTString();
+ }
+ else if (Tys[i])
+ Result += "." + MVT::getMVT(Tys[i]).getMVTString();
+ }
+ return Result;
+}
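+
+// Example (illustrative): overloaded intrinsics get one type suffix per
+// overloaded type, so with Tys[0] == Type::Int32Ty,
+// getName(Intrinsic::ctpop, Tys, 1) produces "llvm.ctpop.i32", while a
+// pointer type contributes ".p<address space><pointee MVT>" instead.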
+
+const FunctionType *Intrinsic::getType(ID id, const Type **Tys,
+ unsigned numTys) {
+ const Type *ResultTy = NULL;
+ std::vector<const Type*> ArgTys;
+ bool IsVarArg = false;
+
+#define GET_INTRINSIC_GENERATOR
+#include "llvm/Intrinsics.gen"
+#undef GET_INTRINSIC_GENERATOR
+
+ return FunctionType::get(ResultTy, ArgTys, IsVarArg);
+}
+
+bool Intrinsic::isOverloaded(ID id) {
+ const bool OTable[] = {
+ false,
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "llvm/Intrinsics.gen"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+ };
+ return OTable[id];
+}
+
+/// This defines the "Intrinsic::getAttributes(ID id)" method.
+#define GET_INTRINSIC_ATTRIBUTES
+#include "llvm/Intrinsics.gen"
+#undef GET_INTRINSIC_ATTRIBUTES
+
+Function *Intrinsic::getDeclaration(Module *M, ID id, const Type **Tys,
+ unsigned numTys) {
+ // There can never be multiple globals with the same name but different
+ // types, because each intrinsic has a specific type.
+ return
+ cast<Function>(M->getOrInsertFunction(getName(id, Tys, numTys),
+ getType(id, Tys, numTys)));
+}
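+
+// Usage sketch (illustrative), continuing the example above:
+//
+//   const Type *Tys[] = { Type::Int32Ty };
+//   Function *F = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys, 1);
+//
+// Because getName() mangles the overloaded types into the symbol name, each
+// distinct instantiation gets its own declaration, and the cast<Function>
+// above is safe per the comment in getDeclaration.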
+
+// This defines the "Intrinsic::getIntrinsicForGCCBuiltin()" method.
+#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+#include "llvm/Intrinsics.gen"
+#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+
+// vim: sw=2 ai
diff --git a/lib/VMCore/Globals.cpp b/lib/VMCore/Globals.cpp
new file mode 100644
index 0000000..5abe1f9
--- /dev/null
+++ b/lib/VMCore/Globals.cpp
@@ -0,0 +1,273 @@
+//===-- Globals.cpp - Implement the GlobalValue & GlobalVariable class ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GlobalValue & GlobalVariable classes for the VMCore
+// library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/LeakDetector.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// GlobalValue Class
+//===----------------------------------------------------------------------===//
+
+/// removeDeadUsersOfConstant - If the specified constantexpr is dead, remove
+/// it. This involves recursively eliminating any dead users of the
+/// constantexpr.
+static bool removeDeadUsersOfConstant(const Constant *C) {
+ if (isa<GlobalValue>(C)) return false; // Cannot remove this
+
+ while (!C->use_empty()) {
+ const Constant *User = dyn_cast<Constant>(C->use_back());
+ if (!User) return false; // Non-constant usage.
+ if (!removeDeadUsersOfConstant(User))
+ return false; // Constant wasn't dead
+ }
+
+ const_cast<Constant*>(C)->destroyConstant();
+ return true;
+}
+
+/// removeDeadConstantUsers - If there are any dead constant users dangling
+/// off of this global value, remove them. This method is useful for clients
+/// that want to check to see if a global is unused, but don't want to deal
+/// with potentially dead constants hanging off of the globals.
+void GlobalValue::removeDeadConstantUsers() const {
+ Value::use_const_iterator I = use_begin(), E = use_end();
+ Value::use_const_iterator LastNonDeadUser = E;
+ while (I != E) {
+ if (const Constant *User = dyn_cast<Constant>(*I)) {
+ if (!removeDeadUsersOfConstant(User)) {
+ // If the constant wasn't dead, remember that this was the last live use
+ // and move on to the next constant.
+ LastNonDeadUser = I;
+ ++I;
+ } else {
+ // If the constant was dead, then the iterator is invalidated.
+ if (LastNonDeadUser == E) {
+ I = use_begin();
+ if (I == E) break;
+ } else {
+ I = LastNonDeadUser;
+ ++I;
+ }
+ }
+ } else {
+ LastNonDeadUser = I;
+ ++I;
+ }
+ }
+}
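+
+// Usage sketch (illustrative): the typical caller pattern when testing
+// whether a global can be deleted is
+//
+//   GV->removeDeadConstantUsers();  // hypothetical const GlobalValue *GV
+//   if (GV->use_empty())
+//     ...no live users remain; GV is safe to drop...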
+
+/// Override destroyConstant to make sure it doesn't get called on
+/// GlobalValues, because they shouldn't be treated like other constants.
+void GlobalValue::destroyConstant() {
+ assert(0 && "You can't GV->destroyConstant()!");
+ abort();
+}
+
+/// copyAttributesFrom - copy all additional attributes (those not needed to
+/// create a GlobalValue) from the GlobalValue Src to this one.
+void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
+ setAlignment(Src->getAlignment());
+ setSection(Src->getSection());
+ setVisibility(Src->getVisibility());
+}
+
+
+//===----------------------------------------------------------------------===//
+// GlobalVariable Implementation
+//===----------------------------------------------------------------------===//
+
+GlobalVariable::GlobalVariable(const Type *Ty, bool constant, LinkageTypes Link,
+ Constant *InitVal, const std::string &Name,
+ Module *ParentModule, bool ThreadLocal,
+ unsigned AddressSpace)
+ : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal,
+ OperandTraits<GlobalVariable>::op_begin(this),
+ InitVal != 0, Link, Name),
+ isConstantGlobal(constant), isThreadLocalSymbol(ThreadLocal) {
+ if (InitVal) {
+ assert(InitVal->getType() == Ty &&
+ "Initializer should be the same type as the GlobalVariable!");
+ Op<0>() = InitVal;
+ }
+
+ LeakDetector::addGarbageObject(this);
+
+ if (ParentModule)
+ ParentModule->getGlobalList().push_back(this);
+}
+
+GlobalVariable::GlobalVariable(const Type *Ty, bool constant, LinkageTypes Link,
+ Constant *InitVal, const std::string &Name,
+ GlobalVariable *Before, bool ThreadLocal,
+ unsigned AddressSpace)
+ : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal,
+ OperandTraits<GlobalVariable>::op_begin(this),
+ InitVal != 0, Link, Name),
+ isConstantGlobal(constant), isThreadLocalSymbol(ThreadLocal) {
+ if (InitVal) {
+ assert(InitVal->getType() == Ty &&
+ "Initializer should be the same type as the GlobalVariable!");
+ Op<0>() = InitVal;
+ }
+
+ LeakDetector::addGarbageObject(this);
+
+ if (Before)
+ Before->getParent()->getGlobalList().insert(Before, this);
+}
+
+void GlobalVariable::setParent(Module *parent) {
+ if (getParent())
+ LeakDetector::addGarbageObject(this);
+ Parent = parent;
+ if (getParent())
+ LeakDetector::removeGarbageObject(this);
+}
+
+void GlobalVariable::removeFromParent() {
+ getParent()->getGlobalList().remove(this);
+}
+
+void GlobalVariable::eraseFromParent() {
+ getParent()->getGlobalList().erase(this);
+}
+
+void GlobalVariable::replaceUsesOfWithOnConstant(Value *From, Value *To,
+ Use *U) {
+ // If you call this, then you better know this GVar has a constant
+ // initializer worth replacing. Enforce that here.
+ assert(getNumOperands() == 1 &&
+ "Attempt to replace uses of Constants on a GVar with no initializer");
+
+ // And, since you know it has an initializer, the From value better be
+ // the initializer :)
+ assert(getOperand(0) == From &&
+ "Attempt to replace wrong constant initializer in GVar");
+
+ // And, you better have a constant for the replacement value
+ assert(isa<Constant>(To) &&
+ "Attempt to replace GVar initializer with non-constant");
+
+ // Okay, preconditions out of the way, replace the constant initializer.
+ this->setOperand(0, cast<Constant>(To));
+}
+
+/// copyAttributesFrom - copy all additional attributes (those not needed to
+/// create a GlobalVariable) from the GlobalVariable Src to this one.
+void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
+ assert(isa<GlobalVariable>(Src) && "Expected a GlobalVariable!");
+ GlobalValue::copyAttributesFrom(Src);
+ const GlobalVariable *SrcVar = cast<GlobalVariable>(Src);
+ setThreadLocal(SrcVar->isThreadLocal());
+}
+
+
+//===----------------------------------------------------------------------===//
+// GlobalAlias Implementation
+//===----------------------------------------------------------------------===//
+
+GlobalAlias::GlobalAlias(const Type *Ty, LinkageTypes Link,
+ const std::string &Name, Constant* aliasee,
+ Module *ParentModule)
+ : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name) {
+ LeakDetector::addGarbageObject(this);
+
+ if (aliasee)
+ assert(aliasee->getType() == Ty && "Alias and aliasee types should match!");
+ Op<0>() = aliasee;
+
+ if (ParentModule)
+ ParentModule->getAliasList().push_back(this);
+}
+
+void GlobalAlias::setParent(Module *parent) {
+ if (getParent())
+ LeakDetector::addGarbageObject(this);
+ Parent = parent;
+ if (getParent())
+ LeakDetector::removeGarbageObject(this);
+}
+
+void GlobalAlias::removeFromParent() {
+ getParent()->getAliasList().remove(this);
+}
+
+void GlobalAlias::eraseFromParent() {
+ getParent()->getAliasList().erase(this);
+}
+
+bool GlobalAlias::isDeclaration() const {
+ const GlobalValue* AV = getAliasedGlobal();
+ if (AV)
+ return AV->isDeclaration();
+ else
+ return false;
+}
+
+void GlobalAlias::setAliasee(Constant *Aliasee)
+{
+ if (Aliasee)
+ assert(Aliasee->getType() == getType() &&
+ "Alias and aliasee types should match!");
+
+ setOperand(0, Aliasee);
+}
+
+const GlobalValue *GlobalAlias::getAliasedGlobal() const {
+ const Constant *C = getAliasee();
+ if (C) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return GV;
+ else {
+ const ConstantExpr *CE = 0;
+ if ((CE = dyn_cast<ConstantExpr>(C)) &&
+ (CE->getOpcode() == Instruction::BitCast ||
+ CE->getOpcode() == Instruction::GetElementPtr))
+ return dyn_cast<GlobalValue>(CE->getOperand(0));
+ else
+ assert(0 && "Unsupported aliasee");
+ }
+ }
+ return 0;
+}
+
+const GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) const {
+ SmallPtrSet<const GlobalValue*, 3> Visited;
+
+ // Check if we need to stop early.
+ if (stopOnWeak && mayBeOverridden())
+ return this;
+
+ const GlobalValue *GV = getAliasedGlobal();
+ Visited.insert(GV);
+
+ // Iterate over aliasing chain, stopping on weak alias if necessary.
+ while (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) {
+ if (stopOnWeak && GA->mayBeOverridden())
+ break;
+
+ GV = GA->getAliasedGlobal();
+
+ if (!Visited.insert(GV))
+ return NULL;
+ }
+
+ return GV;
+}
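+
+// Behavior sketch (illustrative): for an alias chain A -> B -> C,
+// A->resolveAliasedGlobal() returns C. With stopOnWeak set, resolution stops
+// at the first link that mayBeOverridden(); a cycle such as A -> B -> A
+// fails the Visited.insert check and yields NULL.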
diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp
new file mode 100644
index 0000000..524e294
--- /dev/null
+++ b/lib/VMCore/InlineAsm.cpp
@@ -0,0 +1,231 @@
+//===-- InlineAsm.cpp - Implement the InlineAsm class ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the InlineAsm class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InlineAsm.h"
+#include "llvm/DerivedTypes.h"
+#include <algorithm>
+#include <cctype>
+#include <cstdlib>  // for atoi(), used by the matching-constraint parser
+using namespace llvm;
+
+// Implement the first virtual method in this class in this file so the
+// InlineAsm vtable is emitted here.
+InlineAsm::~InlineAsm() {
+}
+
+
+// NOTE: when memoizing the function type, we have to be careful to handle the
+// case when the type gets refined.
+
+InlineAsm *InlineAsm::get(const FunctionType *Ty, const std::string &AsmString,
+ const std::string &Constraints, bool hasSideEffects) {
+ // FIXME: memoize!
+ return new InlineAsm(Ty, AsmString, Constraints, hasSideEffects);
+}
+
+InlineAsm::InlineAsm(const FunctionType *Ty, const std::string &asmString,
+ const std::string &constraints, bool hasSideEffects)
+ : Value(PointerType::getUnqual(Ty),
+ Value::InlineAsmVal),
+ AsmString(asmString),
+ Constraints(constraints), HasSideEffects(hasSideEffects) {
+
+ // Do various checks on the constraint string and type.
+ assert(Verify(Ty, constraints) && "Function type not legal for constraints!");
+}
+
+const FunctionType *InlineAsm::getFunctionType() const {
+ return cast<FunctionType>(getType()->getElementType());
+}
+
+/// Parse - Analyze the specified string (e.g. "==&{eax}") and fill in the
+/// fields in this structure. If the constraint string is not understood,
+/// return true, otherwise return false.
+bool InlineAsm::ConstraintInfo::Parse(const std::string &Str,
+ std::vector<InlineAsm::ConstraintInfo> &ConstraintsSoFar) {
+ std::string::const_iterator I = Str.begin(), E = Str.end();
+
+ // Initialize
+ Type = isInput;
+ isEarlyClobber = false;
+ MatchingInput = -1;
+ isCommutative = false;
+ isIndirect = false;
+
+ // Parse prefixes.
+ if (*I == '~') {
+ Type = isClobber;
+ ++I;
+ } else if (*I == '=') {
+ ++I;
+ Type = isOutput;
+ }
+
+ if (*I == '*') {
+ isIndirect = true;
+ ++I;
+ }
+
+ if (I == E) return true; // Just a prefix, like "==" or "~".
+
+ // Parse the modifiers.
+ bool DoneWithModifiers = false;
+ while (!DoneWithModifiers) {
+ switch (*I) {
+ default:
+ DoneWithModifiers = true;
+ break;
+ case '&': // Early clobber.
+ if (Type != isOutput || // Cannot early clobber anything but output.
+ isEarlyClobber) // Reject &&&&&&
+ return true;
+ isEarlyClobber = true;
+ break;
+ case '%': // Commutative.
+ if (Type == isClobber || // Cannot commute clobbers.
+ isCommutative) // Reject %%%%%
+ return true;
+ isCommutative = true;
+ break;
+ case '#': // Comment.
+ case '*': // Register preferencing.
+ return true; // Not supported.
+ }
+
+ if (!DoneWithModifiers) {
+ ++I;
+ if (I == E) return true; // Just prefixes and modifiers!
+ }
+ }
+
+ // Parse the various constraints.
+ while (I != E) {
+ if (*I == '{') { // Physical register reference.
+ // Find the end of the register name.
+ std::string::const_iterator ConstraintEnd = std::find(I+1, E, '}');
+ if (ConstraintEnd == E) return true; // "{foo"
+ Codes.push_back(std::string(I, ConstraintEnd+1));
+ I = ConstraintEnd+1;
+ } else if (isdigit(*I)) { // Matching Constraint
+ // Maximal munch numbers.
+ std::string::const_iterator NumStart = I;
+ while (I != E && isdigit(*I))
+ ++I;
+ Codes.push_back(std::string(NumStart, I));
+ unsigned N = atoi(Codes.back().c_str());
+ // Check that this is a valid matching constraint!
+ if (N >= ConstraintsSoFar.size() || ConstraintsSoFar[N].Type != isOutput||
+ Type != isInput)
+ return true; // Invalid constraint number.
+
+ // If Operand N already has a matching input, reject this. An output
+ // can't be constrained to the same value as multiple inputs.
+ if (ConstraintsSoFar[N].hasMatchingInput())
+ return true;
+
+ // Note that operand #n has a matching input.
+ ConstraintsSoFar[N].MatchingInput = ConstraintsSoFar.size();
+ } else {
+ // Single letter constraint.
+ Codes.push_back(std::string(I, I+1));
+ ++I;
+ }
+ }
+
+ return false;
+}
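+
+// Example (illustrative): in the constraint string "=r,0", the input's "0"
+// is a matching constraint tying it to output operand #0; the loop above
+// records the pairing by setting MatchingInput (here, 1) on that output's
+// ConstraintInfo.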
+
+std::vector<InlineAsm::ConstraintInfo>
+InlineAsm::ParseConstraints(const std::string &Constraints) {
+ std::vector<ConstraintInfo> Result;
+
+ // Scan the constraints string.
+ for (std::string::const_iterator I = Constraints.begin(),
+ E = Constraints.end(); I != E; ) {
+ ConstraintInfo Info;
+
+ // Find the end of this constraint.
+ std::string::const_iterator ConstraintEnd = std::find(I, E, ',');
+
+ if (ConstraintEnd == I || // Empty constraint like ",,"
+ Info.Parse(std::string(I, ConstraintEnd), Result)) {
+ Result.clear(); // Erroneous constraint?
+ break;
+ }
+
+ Result.push_back(Info);
+
+ // ConstraintEnd may be either the next comma or the end of the string. In
+ // the former case, we skip the comma.
+ I = ConstraintEnd;
+ if (I != E) {
+ ++I;
+ if (I == E) { Result.clear(); break; } // don't allow "xyz,"
+ }
+ }
+
+ return Result;
+}
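+
+// Example (illustrative): ParseConstraints("=r,r,~{memory}") yields three
+// ConstraintInfos: an output register ("=r"), an input register ("r"), and
+// a memory clobber ("~{memory}"). A malformed string such as "r," (trailing
+// comma) produces an empty vector.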
+
+
+/// Verify - Verify that the specified constraint string is reasonable for the
+/// specified function type, and otherwise validate the constraint string.
+bool InlineAsm::Verify(const FunctionType *Ty, const std::string &ConstStr) {
+ if (Ty->isVarArg()) return false;
+
+ std::vector<ConstraintInfo> Constraints = ParseConstraints(ConstStr);
+
+ // Error parsing constraints.
+ if (Constraints.empty() && !ConstStr.empty()) return false;
+
+ unsigned NumOutputs = 0, NumInputs = 0, NumClobbers = 0;
+ unsigned NumIndirect = 0;
+
+ for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+ switch (Constraints[i].Type) {
+ case InlineAsm::isOutput:
+ if ((NumInputs-NumIndirect) != 0 || NumClobbers != 0)
+ return false; // outputs before inputs and clobbers.
+ if (!Constraints[i].isIndirect) {
+ ++NumOutputs;
+ break;
+ }
+ ++NumIndirect;
+ // FALLTHROUGH for Indirect Outputs.
+ case InlineAsm::isInput:
+ if (NumClobbers) return false; // inputs before clobbers.
+ ++NumInputs;
+ break;
+ case InlineAsm::isClobber:
+ ++NumClobbers;
+ break;
+ }
+ }
+
+ switch (NumOutputs) {
+ case 0:
+ if (Ty->getReturnType() != Type::VoidTy) return false;
+ break;
+ case 1:
+ if (isa<StructType>(Ty->getReturnType())) return false;
+ break;
+ default:
+ const StructType *STy = dyn_cast<StructType>(Ty->getReturnType());
+ if (STy == 0 || STy->getNumElements() != NumOutputs)
+ return false;
+ break;
+ }
+
+ if (Ty->getNumParams() != NumInputs) return false;
+ return true;
+}
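+
+// Example (illustrative): a function type i32 (i32) verifies against "=r,r"
+// (one direct output matching the scalar return, one input matching the one
+// parameter), whereas "=r,=r,r" has two direct outputs and therefore
+// requires a two-element struct return type.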
+
diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp
new file mode 100644
index 0000000..9e030b7
--- /dev/null
+++ b/lib/VMCore/Instruction.cpp
@@ -0,0 +1,387 @@
+//===-- Instruction.cpp - Implement the Instruction class -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Instruction class for the VMCore library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Type.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/LeakDetector.h"
+using namespace llvm;
+
+Instruction::Instruction(const Type *ty, unsigned it, Use *Ops, unsigned NumOps,
+ Instruction *InsertBefore)
+ : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(0) {
+ // Make sure that we get added to a basicblock
+ LeakDetector::addGarbageObject(this);
+
+ // If requested, insert this instruction into a basic block...
+ if (InsertBefore) {
+ assert(InsertBefore->getParent() &&
+ "Instruction to insert before is not in a basic block!");
+ InsertBefore->getParent()->getInstList().insert(InsertBefore, this);
+ }
+}
+
+Instruction::Instruction(const Type *ty, unsigned it, Use *Ops, unsigned NumOps,
+ BasicBlock *InsertAtEnd)
+ : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(0) {
+ // Make sure that we get added to a basicblock
+ LeakDetector::addGarbageObject(this);
+
+ // append this instruction into the basic block
+ assert(InsertAtEnd && "Basic block to append to may not be NULL!");
+ InsertAtEnd->getInstList().push_back(this);
+}
+
+
+// Out of line virtual method, so the vtable, etc has a home.
+Instruction::~Instruction() {
+ assert(Parent == 0 && "Instruction still linked in the program!");
+}
+
+
+void Instruction::setParent(BasicBlock *P) {
+ if (getParent()) {
+ if (!P) LeakDetector::addGarbageObject(this);
+ } else {
+ if (P) LeakDetector::removeGarbageObject(this);
+ }
+
+ Parent = P;
+}
+
+void Instruction::removeFromParent() {
+ getParent()->getInstList().remove(this);
+}
+
+void Instruction::eraseFromParent() {
+ getParent()->getInstList().erase(this);
+}
+
+/// insertBefore - Insert an unlinked instruction into a basic block
+/// immediately before the specified instruction.
+void Instruction::insertBefore(Instruction *InsertPos) {
+ InsertPos->getParent()->getInstList().insert(InsertPos, this);
+}
+
+/// insertAfter - Insert an unlinked instruction into a basic block
+/// immediately after the specified instruction.
+void Instruction::insertAfter(Instruction *InsertPos) {
+ InsertPos->getParent()->getInstList().insertAfter(InsertPos, this);
+}
+
+/// moveBefore - Unlink this instruction from its current basic block and
+/// insert it into the basic block that MovePos lives in, right before
+/// MovePos.
+void Instruction::moveBefore(Instruction *MovePos) {
+ MovePos->getParent()->getInstList().splice(MovePos,getParent()->getInstList(),
+ this);
+}
+
+
+const char *Instruction::getOpcodeName(unsigned OpCode) {
+ switch (OpCode) {
+ // Terminators
+ case Ret: return "ret";
+ case Br: return "br";
+ case Switch: return "switch";
+ case Invoke: return "invoke";
+ case Unwind: return "unwind";
+ case Unreachable: return "unreachable";
+
+ // Standard binary operators...
+ case Add: return "add";
+ case Sub: return "sub";
+ case Mul: return "mul";
+ case UDiv: return "udiv";
+ case SDiv: return "sdiv";
+ case FDiv: return "fdiv";
+ case URem: return "urem";
+ case SRem: return "srem";
+ case FRem: return "frem";
+
+ // Logical operators...
+ case And: return "and";
+ case Or : return "or";
+ case Xor: return "xor";
+
+ // Memory instructions...
+ case Malloc: return "malloc";
+ case Free: return "free";
+ case Alloca: return "alloca";
+ case Load: return "load";
+ case Store: return "store";
+ case GetElementPtr: return "getelementptr";
+
+ // Convert instructions...
+ case Trunc: return "trunc";
+ case ZExt: return "zext";
+ case SExt: return "sext";
+ case FPTrunc: return "fptrunc";
+ case FPExt: return "fpext";
+ case FPToUI: return "fptoui";
+ case FPToSI: return "fptosi";
+ case UIToFP: return "uitofp";
+ case SIToFP: return "sitofp";
+ case IntToPtr: return "inttoptr";
+ case PtrToInt: return "ptrtoint";
+ case BitCast: return "bitcast";
+
+ // Other instructions...
+ case ICmp: return "icmp";
+ case FCmp: return "fcmp";
+ case VICmp: return "vicmp";
+ case VFCmp: return "vfcmp";
+ case PHI: return "phi";
+ case Select: return "select";
+ case Call: return "call";
+ case Shl: return "shl";
+ case LShr: return "lshr";
+ case AShr: return "ashr";
+ case VAArg: return "va_arg";
+ case ExtractElement: return "extractelement";
+ case InsertElement: return "insertelement";
+ case ShuffleVector: return "shufflevector";
+ case ExtractValue: return "extractvalue";
+ case InsertValue: return "insertvalue";
+
+ default: return "<Invalid operator> ";
+ }
+
+ return 0;
+}
+
+/// isIdenticalTo - Return true if the specified instruction is exactly
+/// identical to the current one. This means that all operands match and any
+/// extra information (e.g. load is volatile) agree.
+bool Instruction::isIdenticalTo(const Instruction *I) const {
+ if (getOpcode() != I->getOpcode() ||
+ getNumOperands() != I->getNumOperands() ||
+ getType() != I->getType())
+ return false;
+
+ // We have two instructions of identical opcode and #operands. Check to see
+ // if all operands are the same.
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (getOperand(i) != I->getOperand(i))
+ return false;
+
+ // Check special state that is a part of some instructions.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(this))
+ return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
+ LI->getAlignment() == cast<LoadInst>(I)->getAlignment();
+ if (const StoreInst *SI = dyn_cast<StoreInst>(this))
+ return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
+ SI->getAlignment() == cast<StoreInst>(I)->getAlignment();
+ if (const CmpInst *CI = dyn_cast<CmpInst>(this))
+ return CI->getPredicate() == cast<CmpInst>(I)->getPredicate();
+ if (const CallInst *CI = dyn_cast<CallInst>(this))
+ return CI->isTailCall() == cast<CallInst>(I)->isTailCall() &&
+ CI->getCallingConv() == cast<CallInst>(I)->getCallingConv() &&
+ CI->getAttributes().getRawPointer() ==
+ cast<CallInst>(I)->getAttributes().getRawPointer();
+ if (const InvokeInst *CI = dyn_cast<InvokeInst>(this))
+ return CI->getCallingConv() == cast<InvokeInst>(I)->getCallingConv() &&
+ CI->getAttributes().getRawPointer() ==
+ cast<InvokeInst>(I)->getAttributes().getRawPointer();
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(this)) {
+ if (IVI->getNumIndices() != cast<InsertValueInst>(I)->getNumIndices())
+ return false;
+ for (unsigned i = 0, e = IVI->getNumIndices(); i != e; ++i)
+ if (IVI->idx_begin()[i] != cast<InsertValueInst>(I)->idx_begin()[i])
+ return false;
+ return true;
+ }
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(this)) {
+ if (EVI->getNumIndices() != cast<ExtractValueInst>(I)->getNumIndices())
+ return false;
+ for (unsigned i = 0, e = EVI->getNumIndices(); i != e; ++i)
+ if (EVI->idx_begin()[i] != cast<ExtractValueInst>(I)->idx_begin()[i])
+ return false;
+ return true;
+ }
+
+ return true;
+}
+
+// isSameOperationAs
+bool Instruction::isSameOperationAs(const Instruction *I) const {
+ if (getOpcode() != I->getOpcode() || getType() != I->getType() ||
+ getNumOperands() != I->getNumOperands())
+ return false;
+
+ // We have two instructions of identical opcode and #operands. Check to see
+ // if all operands are the same type
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (getOperand(i)->getType() != I->getOperand(i)->getType())
+ return false;
+
+ // Check special state that is a part of some instructions.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(this))
+ return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
+ LI->getAlignment() == cast<LoadInst>(I)->getAlignment();
+ if (const StoreInst *SI = dyn_cast<StoreInst>(this))
+ return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
+ SI->getAlignment() == cast<StoreInst>(I)->getAlignment();
+ if (const CmpInst *CI = dyn_cast<CmpInst>(this))
+ return CI->getPredicate() == cast<CmpInst>(I)->getPredicate();
+ if (const CallInst *CI = dyn_cast<CallInst>(this))
+ return CI->isTailCall() == cast<CallInst>(I)->isTailCall() &&
+ CI->getCallingConv() == cast<CallInst>(I)->getCallingConv() &&
+ CI->getAttributes().getRawPointer() ==
+ cast<CallInst>(I)->getAttributes().getRawPointer();
+ if (const InvokeInst *CI = dyn_cast<InvokeInst>(this))
+ return CI->getCallingConv() == cast<InvokeInst>(I)->getCallingConv() &&
+ CI->getAttributes().getRawPointer() ==
+ cast<InvokeInst>(I)->getAttributes().getRawPointer();
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(this)) {
+ if (IVI->getNumIndices() != cast<InsertValueInst>(I)->getNumIndices())
+ return false;
+ for (unsigned i = 0, e = IVI->getNumIndices(); i != e; ++i)
+ if (IVI->idx_begin()[i] != cast<InsertValueInst>(I)->idx_begin()[i])
+ return false;
+ return true;
+ }
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(this)) {
+ if (EVI->getNumIndices() != cast<ExtractValueInst>(I)->getNumIndices())
+ return false;
+ for (unsigned i = 0, e = EVI->getNumIndices(); i != e; ++i)
+ if (EVI->idx_begin()[i] != cast<ExtractValueInst>(I)->idx_begin()[i])
+ return false;
+ return true;
+ }
+
+ return true;
+}
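+
+// Note (editorial): isIdenticalTo requires the operand Values themselves to
+// match, while isSameOperationAs only requires matching operand types; e.g.
+// "add i32 %a, %b" and "add i32 %c, %d" are the same operation but are
+// identical only if %a == %c and %b == %d.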
+
+/// isUsedOutsideOfBlock - Return true if there are any uses of this
+/// instruction outside of the specified block. Note that PHI nodes are
+/// considered to evaluate their operands in the corresponding predecessor
+/// block.
+bool Instruction::isUsedOutsideOfBlock(const BasicBlock *BB) const {
+ for (use_const_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+ // PHI nodes use values in the corresponding predecessor block. For other
+ // instructions, just check to see whether the parent of the use matches up.
+ const PHINode *PN = dyn_cast<PHINode>(*UI);
+ if (PN == 0) {
+ if (cast<Instruction>(*UI)->getParent() != BB)
+ return true;
+ continue;
+ }
+
+ if (PN->getIncomingBlock(UI) != BB)
+ return true;
+ }
+ return false;
+}
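+
+// Example (illustrative): if %v is defined in %bb and a PHI node in a
+// successor lists %v with incoming block %bb, that use is attributed to %bb
+// itself, so it alone does not make isUsedOutsideOfBlock(%bb) return true.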
+
+/// mayReadFromMemory - Return true if this instruction may read memory.
+///
+bool Instruction::mayReadFromMemory() const {
+ switch (getOpcode()) {
+ default: return false;
+ case Instruction::Free:
+ case Instruction::VAArg:
+ case Instruction::Load:
+ return true;
+ case Instruction::Call:
+ return !cast<CallInst>(this)->doesNotAccessMemory();
+ case Instruction::Invoke:
+ return !cast<InvokeInst>(this)->doesNotAccessMemory();
+ case Instruction::Store:
+ return cast<StoreInst>(this)->isVolatile();
+ }
+}
+
+/// mayWriteToMemory - Return true if this instruction may modify memory.
+///
+bool Instruction::mayWriteToMemory() const {
+ switch (getOpcode()) {
+ default: return false;
+ case Instruction::Free:
+ case Instruction::Store:
+ case Instruction::VAArg:
+ return true;
+ case Instruction::Call:
+ return !cast<CallInst>(this)->onlyReadsMemory();
+ case Instruction::Invoke:
+ return !cast<InvokeInst>(this)->onlyReadsMemory();
+ case Instruction::Load:
+ return cast<LoadInst>(this)->isVolatile();
+ }
+}
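+
+// Note (editorial): the volatile cases are deliberately cross-listed above:
+// a volatile store counts as a read and a volatile load counts as a write,
+// so passes that query only one of these predicates still treat volatile
+// accesses conservatively.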
+
+/// mayThrow - Return true if this instruction may throw an exception.
+///
+bool Instruction::mayThrow() const {
+ if (const CallInst *CI = dyn_cast<CallInst>(this))
+ return !CI->doesNotThrow();
+ return false;
+}
+
+/// isAssociative - Return true if the instruction is associative:
+///
+/// Associative operators satisfy: x op (y op z) === (x op y) op z
+///
+/// In LLVM, the Add, Mul, And, Or, and Xor operators are associative, when not
+/// applied to floating point types.
+///
+bool Instruction::isAssociative(unsigned Opcode, const Type *Ty) {
+ if (Opcode == And || Opcode == Or || Opcode == Xor)
+ return true;
+
+ // Add/Mul reassociate unless they are FP or FP vectors.
+ if (Opcode == Add || Opcode == Mul)
+ return !Ty->isFPOrFPVector();
+ return false;
+}
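+
+// Example (illustrative): under IEEE-754 doubles, (0.1 + 0.2) + 0.3 rounds
+// to 0.6000000000000001 while 0.1 + (0.2 + 0.3) rounds to 0.6, which is why
+// Add and Mul are not treated as associative on FP or FP-vector types.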
+
+/// isCommutative - Return true if the instruction is commutative:
+///
+/// Commutative operators satisfy: (x op y) === (y op x)
+///
+/// In LLVM, these are the associative operators. Unlike associativity,
+/// commutativity also holds when Add and Mul are applied to floating-point
+/// types.
+///
+bool Instruction::isCommutative(unsigned op) {
+ switch (op) {
+ case Add:
+ case Mul:
+ case And:
+ case Or:
+ case Xor:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// isTrapping - Return true if the instruction may trap.
+///
+bool Instruction::isTrapping(unsigned op) {
+ switch(op) {
+ case UDiv:
+ case SDiv:
+ case FDiv:
+ case URem:
+ case SRem:
+ case FRem:
+ case Load:
+ case Store:
+ case Call:
+ case Invoke:
+ case VAArg:
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
new file mode 100644
index 0000000..fe30271
--- /dev/null
+++ b/lib/VMCore/Instructions.cpp
@@ -0,0 +1,2963 @@
+//===-- Instructions.cpp - Implement the LLVM instructions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements all of the non-inline methods for the LLVM instruction
+// classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// CallSite Class
+//===----------------------------------------------------------------------===//
+
+#define CALLSITE_DELEGATE_GETTER(METHOD) \
+ Instruction *II(getInstruction()); \
+ return isCall() \
+ ? cast<CallInst>(II)->METHOD \
+ : cast<InvokeInst>(II)->METHOD
+
+#define CALLSITE_DELEGATE_SETTER(METHOD) \
+ Instruction *II(getInstruction()); \
+ if (isCall()) \
+ cast<CallInst>(II)->METHOD; \
+ else \
+ cast<InvokeInst>(II)->METHOD
+
+CallSite::CallSite(Instruction *C) {
+ assert((isa<CallInst>(C) || isa<InvokeInst>(C)) && "Not a call!");
+ I.setPointer(C);
+ I.setInt(isa<CallInst>(C));
+}
+unsigned CallSite::getCallingConv() const {
+ CALLSITE_DELEGATE_GETTER(getCallingConv());
+}
+void CallSite::setCallingConv(unsigned CC) {
+ CALLSITE_DELEGATE_SETTER(setCallingConv(CC));
+}
+const AttrListPtr &CallSite::getAttributes() const {
+ CALLSITE_DELEGATE_GETTER(getAttributes());
+}
+void CallSite::setAttributes(const AttrListPtr &PAL) {
+ CALLSITE_DELEGATE_SETTER(setAttributes(PAL));
+}
+bool CallSite::paramHasAttr(uint16_t i, Attributes attr) const {
+ CALLSITE_DELEGATE_GETTER(paramHasAttr(i, attr));
+}
+uint16_t CallSite::getParamAlignment(uint16_t i) const {
+ CALLSITE_DELEGATE_GETTER(getParamAlignment(i));
+}
+bool CallSite::doesNotAccessMemory() const {
+ CALLSITE_DELEGATE_GETTER(doesNotAccessMemory());
+}
+void CallSite::setDoesNotAccessMemory(bool doesNotAccessMemory) {
+ CALLSITE_DELEGATE_SETTER(setDoesNotAccessMemory(doesNotAccessMemory));
+}
+bool CallSite::onlyReadsMemory() const {
+ CALLSITE_DELEGATE_GETTER(onlyReadsMemory());
+}
+void CallSite::setOnlyReadsMemory(bool onlyReadsMemory) {
+ CALLSITE_DELEGATE_SETTER(setOnlyReadsMemory(onlyReadsMemory));
+}
+bool CallSite::doesNotReturn() const {
+ CALLSITE_DELEGATE_GETTER(doesNotReturn());
+}
+void CallSite::setDoesNotReturn(bool doesNotReturn) {
+ CALLSITE_DELEGATE_SETTER(setDoesNotReturn(doesNotReturn));
+}
+bool CallSite::doesNotThrow() const {
+ CALLSITE_DELEGATE_GETTER(doesNotThrow());
+}
+void CallSite::setDoesNotThrow(bool doesNotThrow) {
+ CALLSITE_DELEGATE_SETTER(setDoesNotThrow(doesNotThrow));
+}
+
+bool CallSite::hasArgument(const Value *Arg) const {
+ for (arg_iterator AI = this->arg_begin(), E = this->arg_end(); AI != E; ++AI)
+ if (AI->get() == Arg)
+ return true;
+ return false;
+}
+
+#undef CALLSITE_DELEGATE_GETTER
+#undef CALLSITE_DELEGATE_SETTER
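+
+// Usage sketch (illustrative): CallSite lets passes treat calls and invokes
+// uniformly through the delegating accessors above:
+//
+//   CallSite CS(I);          // I is a CallInst or InvokeInst
+//   if (CS.doesNotThrow())
+//     ...handle a nounwind call site...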
+
+//===----------------------------------------------------------------------===//
+// TerminatorInst Class
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method, so the vtable, etc has a home.
+TerminatorInst::~TerminatorInst() {
+}
+
+//===----------------------------------------------------------------------===//
+// UnaryInstruction Class
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method, so the vtable, etc has a home.
+UnaryInstruction::~UnaryInstruction() {
+}
+
+//===----------------------------------------------------------------------===//
+// SelectInst Class
+//===----------------------------------------------------------------------===//
+
+/// areInvalidOperands - Return a string if the specified operands are invalid
+/// for a select operation, otherwise return null.
+const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
+ if (Op1->getType() != Op2->getType())
+ return "both values to select must have same type";
+
+ if (const VectorType *VT = dyn_cast<VectorType>(Op0->getType())) {
+ // Vector select.
+ if (VT->getElementType() != Type::Int1Ty)
+ return "vector select condition element type must be i1";
+ const VectorType *ET = dyn_cast<VectorType>(Op1->getType());
+ if (ET == 0)
+ return "selected values for vector select must be vectors";
+ if (ET->getNumElements() != VT->getNumElements())
+ return "vector select requires selected vectors to have "
+ "the same vector length as select condition";
+ } else if (Op0->getType() != Type::Int1Ty) {
+ return "select condition must be i1 or <n x i1>";
+ }
+ return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PHINode Class
+//===----------------------------------------------------------------------===//
+
+PHINode::PHINode(const PHINode &PN)
+ : Instruction(PN.getType(), Instruction::PHI,
+ allocHungoffUses(PN.getNumOperands()), PN.getNumOperands()),
+ ReservedSpace(PN.getNumOperands()) {
+ Use *OL = OperandList;
+ for (unsigned i = 0, e = PN.getNumOperands(); i != e; i+=2) {
+ OL[i] = PN.getOperand(i);
+ OL[i+1] = PN.getOperand(i+1);
+ }
+}
+
+PHINode::~PHINode() {
+ if (OperandList)
+ dropHungoffUses(OperandList);
+}
+
+// removeIncomingValue - Remove an incoming value. This is useful if a
+// predecessor basic block is deleted.
+Value *PHINode::removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty) {
+ unsigned NumOps = getNumOperands();
+ Use *OL = OperandList;
+ assert(Idx*2 < NumOps && "BB not in PHI node!");
+ Value *Removed = OL[Idx*2];
+
+ // Move everything after this operand down.
+ //
+ // FIXME: we could just swap with the end of the list, then erase. However,
+ // clients might not expect this to happen. The code as it is thrashes the
+ // use/def lists, which is kinda lame.
+ for (unsigned i = (Idx+1)*2; i != NumOps; i += 2) {
+ OL[i-2] = OL[i];
+ OL[i-2+1] = OL[i+1];
+ }
+
+ // Nuke the last value.
+ OL[NumOps-2].set(0);
+ OL[NumOps-2+1].set(0);
+ NumOperands = NumOps-2;
+
+ // If the PHI node is dead, because it has zero entries, nuke it now.
+ if (NumOps == 2 && DeletePHIIfEmpty) {
+ // If anyone is using this PHI, make them use a dummy value instead...
+ replaceAllUsesWith(UndefValue::get(getType()));
+ eraseFromParent();
+ }
+ return Removed;
+}
+
+/// resizeOperands - This adjusts the length of the operands list according
+/// to the following behavior:
+/// 1. If NumOps == 0, grow the operand list in response to a push_back style
+/// of operation. This grows the number of ops by 1.5 times.
+/// 2. If NumOps > NumOperands, reserve space for NumOps operands.
+/// 3. If NumOps == NumOperands, trim the reserved space.
+///
+void PHINode::resizeOperands(unsigned NumOps) {
+ unsigned e = getNumOperands();
+ if (NumOps == 0) {
+ NumOps = e*3/2;
+ if (NumOps < 4) NumOps = 4; // 4 op PHI nodes are VERY common.
+ } else if (NumOps*2 > NumOperands) {
+ // No resize needed.
+ if (ReservedSpace >= NumOps) return;
+ } else if (NumOps == NumOperands) {
+ if (ReservedSpace == NumOps) return;
+ } else {
+ return;
+ }
+
+ ReservedSpace = NumOps;
+ Use *OldOps = OperandList;
+ Use *NewOps = allocHungoffUses(NumOps);
+ std::copy(OldOps, OldOps + e, NewOps);
+ OperandList = NewOps;
+ if (OldOps) Use::zap(OldOps, OldOps + e, true);
+}
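+
+// Example (illustrative): a push_back-style grow (NumOps == 0) on a PHI
+// with 8 operands (4 incoming value/block pairs) reserves 12; the minimum
+// of 4 operands covers the very common two-predecessor PHI.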
+
+/// hasConstantValue - If the specified PHI node always merges together the same
+/// value, return the value, otherwise return null.
+///
+Value *PHINode::hasConstantValue(bool AllowNonDominatingInstruction) const {
+ // If the PHI node only has one incoming value, eliminate the PHI node...
+ if (getNumIncomingValues() == 1) {
+ if (getIncomingValue(0) != this) // not X = phi X
+ return getIncomingValue(0);
+ else
+ return UndefValue::get(getType()); // Self cycle is dead.
+ }
+
+ // Otherwise if all of the incoming values are the same for the PHI, replace
+ // the PHI node with the incoming value.
+ //
+ Value *InVal = 0;
+ bool HasUndefInput = false;
+ for (unsigned i = 0, e = getNumIncomingValues(); i != e; ++i)
+ if (isa<UndefValue>(getIncomingValue(i))) {
+ HasUndefInput = true;
+ } else if (getIncomingValue(i) != this) { // Not the PHI node itself...
+ if (InVal && getIncomingValue(i) != InVal)
+ return 0; // Not the same, bail out.
+ else
+ InVal = getIncomingValue(i);
+ }
+
+ // The only case that could cause InVal to be null is if we have a PHI node
+ // that only has entries for itself. In this case, there is no entry into the
+ // loop, so kill the PHI.
+ //
+ if (InVal == 0) InVal = UndefValue::get(getType());
+
+ // If we have a PHI node like phi(X, undef, X), where X is defined by some
+ // instruction, we cannot always return X as the result of the PHI node. Only
+ // do this if X is not an instruction (thus it must dominate the PHI block),
+ // or if the client is prepared to deal with this possibility.
+ if (HasUndefInput && !AllowNonDominatingInstruction)
+ if (Instruction *IV = dyn_cast<Instruction>(InVal))
+ // If it's in the entry block, it dominates everything.
+ if (IV->getParent() != &IV->getParent()->getParent()->getEntryBlock() ||
+ isa<InvokeInst>(IV))
+ return 0; // Cannot guarantee that InVal dominates this PHINode.
+
+ // All of the incoming values are the same, return the value now.
+ return InVal;
+}
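+
+// Example (illustrative): for %p = phi i32 [ %x, %a ], [ %x, %b ] this
+// returns %x; for phi i32 [ %x, %a ], [ undef, %b ] it returns %x only when
+// %x is known to dominate the PHI (a non-instruction, or an ordinary
+// instruction in the entry block) or the caller opted in via
+// AllowNonDominatingInstruction.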
+
+
+//===----------------------------------------------------------------------===//
+// CallInst Implementation
+//===----------------------------------------------------------------------===//
+
+CallInst::~CallInst() {
+}
+
+void CallInst::init(Value *Func, Value* const *Params, unsigned NumParams) {
+ assert(NumOperands == NumParams+1 && "NumOperands not set up?");
+ Use *OL = OperandList;
+ OL[0] = Func;
+
+ const FunctionType *FTy =
+ cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
+ FTy = FTy; // silence warning.
+
+ assert((NumParams == FTy->getNumParams() ||
+ (FTy->isVarArg() && NumParams > FTy->getNumParams())) &&
+ "Calling a function with bad signature!");
+ for (unsigned i = 0; i != NumParams; ++i) {
+ assert((i >= FTy->getNumParams() ||
+ FTy->getParamType(i) == Params[i]->getType()) &&
+ "Calling a function with a bad signature!");
+ OL[i+1] = Params[i];
+ }
+}
+
+void CallInst::init(Value *Func, Value *Actual1, Value *Actual2) {
+ assert(NumOperands == 3 && "NumOperands not set up?");
+ Use *OL = OperandList;
+ OL[0] = Func;
+ OL[1] = Actual1;
+ OL[2] = Actual2;
+
+ const FunctionType *FTy =
+ cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
+ FTy = FTy; // silence warning.
+
+ assert((FTy->getNumParams() == 2 ||
+ (FTy->isVarArg() && FTy->getNumParams() < 2)) &&
+ "Calling a function with bad signature");
+ assert((0 >= FTy->getNumParams() ||
+ FTy->getParamType(0) == Actual1->getType()) &&
+ "Calling a function with a bad signature!");
+ assert((1 >= FTy->getNumParams() ||
+ FTy->getParamType(1) == Actual2->getType()) &&
+ "Calling a function with a bad signature!");
+}
+
+void CallInst::init(Value *Func, Value *Actual) {
+ assert(NumOperands == 2 && "NumOperands not set up?");
+ Use *OL = OperandList;
+ OL[0] = Func;
+ OL[1] = Actual;
+
+ const FunctionType *FTy =
+ cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
+ FTy = FTy; // silence warning.
+
+ assert((FTy->getNumParams() == 1 ||
+ (FTy->isVarArg() && FTy->getNumParams() == 0)) &&
+ "Calling a function with bad signature");
+ assert((0 == FTy->getNumParams() ||
+ FTy->getParamType(0) == Actual->getType()) &&
+ "Calling a function with a bad signature!");
+}
+
+void CallInst::init(Value *Func) {
+ assert(NumOperands == 1 && "NumOperands not set up?");
+ Use *OL = OperandList;
+ OL[0] = Func;
+
+ const FunctionType *FTy =
+ cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
+ FTy = FTy; // silence warning.
+
+ assert(FTy->getNumParams() == 0 && "Calling a function with bad signature");
+}
+
+CallInst::CallInst(Value *Func, Value* Actual, const std::string &Name,
+ Instruction *InsertBefore)
+ : Instruction(cast<FunctionType>(cast<PointerType>(Func->getType())
+ ->getElementType())->getReturnType(),
+ Instruction::Call,
+ OperandTraits<CallInst>::op_end(this) - 2,
+ 2, InsertBefore) {
+ init(Func, Actual);
+ setName(Name);
+}
+
+CallInst::CallInst(Value *Func, Value* Actual, const std::string &Name,
+ BasicBlock *InsertAtEnd)
+ : Instruction(cast<FunctionType>(cast<PointerType>(Func->getType())
+ ->getElementType())->getReturnType(),
+ Instruction::Call,
+ OperandTraits<CallInst>::op_end(this) - 2,
+ 2, InsertAtEnd) {
+ init(Func, Actual);
+ setName(Name);
+}
+CallInst::CallInst(Value *Func, const std::string &Name,
+ Instruction *InsertBefore)
+ : Instruction(cast<FunctionType>(cast<PointerType>(Func->getType())
+ ->getElementType())->getReturnType(),
+ Instruction::Call,
+ OperandTraits<CallInst>::op_end(this) - 1,
+ 1, InsertBefore) {
+ init(Func);
+ setName(Name);
+}
+
+CallInst::CallInst(Value *Func, const std::string &Name,
+ BasicBlock *InsertAtEnd)
+ : Instruction(cast<FunctionType>(cast<PointerType>(Func->getType())
+ ->getElementType())->getReturnType(),
+ Instruction::Call,
+ OperandTraits<CallInst>::op_end(this) - 1,
+ 1, InsertAtEnd) {
+ init(Func);
+ setName(Name);
+}
+
+CallInst::CallInst(const CallInst &CI)
+ : Instruction(CI.getType(), Instruction::Call,
+ OperandTraits<CallInst>::op_end(this) - CI.getNumOperands(),
+ CI.getNumOperands()) {
+ setAttributes(CI.getAttributes());
+ SubclassData = CI.SubclassData;
+ Use *OL = OperandList;
+ Use *InOL = CI.OperandList;
+ for (unsigned i = 0, e = CI.getNumOperands(); i != e; ++i)
+ OL[i] = InOL[i];
+}
+
+void CallInst::addAttribute(unsigned i, Attributes attr) {
+ AttrListPtr PAL = getAttributes();
+ PAL = PAL.addAttr(i, attr);
+ setAttributes(PAL);
+}
+
+void CallInst::removeAttribute(unsigned i, Attributes attr) {
+ AttrListPtr PAL = getAttributes();
+ PAL = PAL.removeAttr(i, attr);
+ setAttributes(PAL);
+}
+
+bool CallInst::paramHasAttr(unsigned i, Attributes attr) const {
+ if (AttributeList.paramHasAttr(i, attr))
+ return true;
+ if (const Function *F = getCalledFunction())
+ return F->paramHasAttr(i, attr);
+ return false;
+}
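+
+// Note (editorial): paramHasAttr above consults the call-site attribute
+// list first and then falls back to the callee's declared attributes, so a
+// parameter attribute on the callee is honored even when the call
+// instruction itself carries none.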
+
+
+//===----------------------------------------------------------------------===//
+// InvokeInst Implementation
+//===----------------------------------------------------------------------===//
+
+void InvokeInst::init(Value *Fn, BasicBlock *IfNormal, BasicBlock *IfException,
+ Value* const *Args, unsigned NumArgs) {
+ assert(NumOperands == 3+NumArgs && "NumOperands not set up?");
+ Use *OL = OperandList;
+ OL[0] = Fn;
+ OL[1] = IfNormal;
+ OL[2] = IfException;
+ const FunctionType *FTy =
+ cast<FunctionType>(cast<PointerType>(Fn->getType())->getElementType());
+ FTy = FTy; // silence warning.
+
+ assert(((NumArgs == FTy->getNumParams()) ||
+ (FTy->isVarArg() && NumArgs > FTy->getNumParams())) &&
+ "Calling a function with bad signature");
+
+ for (unsigned i = 0, e = NumArgs; i != e; i++) {
+ assert((i >= FTy->getNumParams() ||
+ FTy->getParamType(i) == Args[i]->getType()) &&
+ "Invoking a function with a bad signature!");
+
+ OL[i+3] = Args[i];
+ }
+}
+
+InvokeInst::InvokeInst(const InvokeInst &II)
+ : TerminatorInst(II.getType(), Instruction::Invoke,
+ OperandTraits<InvokeInst>::op_end(this)
+ - II.getNumOperands(),
+ II.getNumOperands()) {
+ setAttributes(II.getAttributes());
+ SubclassData = II.SubclassData;
+ Use *OL = OperandList, *InOL = II.OperandList;
+ for (unsigned i = 0, e = II.getNumOperands(); i != e; ++i)
+ OL[i] = InOL[i];
+}
+
+BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const {
+ return getSuccessor(idx);
+}
+unsigned InvokeInst::getNumSuccessorsV() const {
+ return getNumSuccessors();
+}
+void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
+ return setSuccessor(idx, B);
+}
+
+bool InvokeInst::paramHasAttr(unsigned i, Attributes attr) const {
+ if (AttributeList.paramHasAttr(i, attr))
+ return true;
+ if (const Function *F = getCalledFunction())
+ return F->paramHasAttr(i, attr);
+ return false;
+}
+
+void InvokeInst::addAttribute(unsigned i, Attributes attr) {
+ AttrListPtr PAL = getAttributes();
+ PAL = PAL.addAttr(i, attr);
+ setAttributes(PAL);
+}
+
+void InvokeInst::removeAttribute(unsigned i, Attributes attr) {
+ AttrListPtr PAL = getAttributes();
+ PAL = PAL.removeAttr(i, attr);
+ setAttributes(PAL);
+}
+
+
+//===----------------------------------------------------------------------===//
+// ReturnInst Implementation
+//===----------------------------------------------------------------------===//
+
+ReturnInst::ReturnInst(const ReturnInst &RI)
+ : TerminatorInst(Type::VoidTy, Instruction::Ret,
+ OperandTraits<ReturnInst>::op_end(this) -
+ RI.getNumOperands(),
+ RI.getNumOperands()) {
+ if (RI.getNumOperands())
+ Op<0>() = RI.Op<0>();
+}
+
+ReturnInst::ReturnInst(Value *retVal, Instruction *InsertBefore)
+ : TerminatorInst(Type::VoidTy, Instruction::Ret,
+ OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+ InsertBefore) {
+ if (retVal)
+ Op<0>() = retVal;
+}
+ReturnInst::ReturnInst(Value *retVal, BasicBlock *InsertAtEnd)
+ : TerminatorInst(Type::VoidTy, Instruction::Ret,
+ OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+ InsertAtEnd) {
+ if (retVal)
+ Op<0>() = retVal;
+}
+ReturnInst::ReturnInst(BasicBlock *InsertAtEnd)
+ : TerminatorInst(Type::VoidTy, Instruction::Ret,
+ OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {
+}
+
+unsigned ReturnInst::getNumSuccessorsV() const {
+ return getNumSuccessors();
+}
+
+/// Out-of-line ReturnInst method, put here so the C++ compiler can choose to
+/// emit the vtable for the class in this translation unit.
+void ReturnInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
+ assert(0 && "ReturnInst has no successors!");
+}
+
+BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const {
+ assert(0 && "ReturnInst has no successors!");
+ abort();
+ return 0;
+}
+
+ReturnInst::~ReturnInst() {
+}
+
+//===----------------------------------------------------------------------===//
+// UnwindInst Implementation
+//===----------------------------------------------------------------------===//
+
+UnwindInst::UnwindInst(Instruction *InsertBefore)
+ : TerminatorInst(Type::VoidTy, Instruction::Unwind, 0, 0, InsertBefore) {
+}
+UnwindInst::UnwindInst(BasicBlock *InsertAtEnd)
+ : TerminatorInst(Type::VoidTy, Instruction::Unwind, 0, 0, InsertAtEnd) {
+}
+
+
+unsigned UnwindInst::getNumSuccessorsV() const {
+ return getNumSuccessors();
+}
+
+void UnwindInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
+ assert(0 && "UnwindInst has no successors!");
+}
+
+BasicBlock *UnwindInst::getSuccessorV(unsigned idx) const {
+ assert(0 && "UnwindInst has no successors!");
+ abort();
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// UnreachableInst Implementation
+//===----------------------------------------------------------------------===//
+
+UnreachableInst::UnreachableInst(Instruction *InsertBefore)
+ : TerminatorInst(Type::VoidTy, Instruction::Unreachable, 0, 0, InsertBefore) {
+}
+UnreachableInst::UnreachableInst(BasicBlock *InsertAtEnd)
+ : TerminatorInst(Type::VoidTy, Instruction::Unreachable, 0, 0, InsertAtEnd) {
+}
+
+unsigned UnreachableInst::getNumSuccessorsV() const {
+ return getNumSuccessors();
+}
+
+void UnreachableInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
+ assert(0 && "UnreachableInst has no successors!");
+}
+
+BasicBlock *UnreachableInst::getSuccessorV(unsigned idx) const {
+ assert(0 && "UnreachableInst has no successors!");
+ abort();
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// BranchInst Implementation
+//===----------------------------------------------------------------------===//
+
+void BranchInst::AssertOK() {
+ if (isConditional())
+ assert(getCondition()->getType() == Type::Int1Ty &&
+ "May only branch on boolean predicates!");
+}
+
+BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
+ : TerminatorInst(Type::VoidTy, Instruction::Br,
+ OperandTraits<BranchInst>::op_end(this) - 1,
+ 1, InsertBefore) {
+ assert(IfTrue != 0 && "Branch destination may not be null!");
+ Op<-1>() = IfTrue;
+}
+BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
+ Instruction *InsertBefore)
+ : TerminatorInst(Type::VoidTy, Instruction::Br,
+ OperandTraits<BranchInst>::op_end(this) - 3,
+ 3, InsertBefore) {
+ Op<-1>() = IfTrue;
+ Op<-2>() = IfFalse;
+ Op<-3>() = Cond;
+#ifndef NDEBUG
+ AssertOK();
+#endif
+}
+
+BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *InsertAtEnd)
+ : TerminatorInst(Type::VoidTy, Instruction::Br,
+ OperandTraits<BranchInst>::op_end(this) - 1,
+ 1, InsertAtEnd) {
+ assert(IfTrue != 0 && "Branch destination may not be null!");
+ Op<-1>() = IfTrue;
+}
+
+BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
+ BasicBlock *InsertAtEnd)
+ : TerminatorInst(Type::VoidTy, Instruction::Br,
+ OperandTraits<BranchInst>::op_end(this) - 3,
+ 3, InsertAtEnd) {
+ Op<-1>() = IfTrue;
+ Op<-2>() = IfFalse;
+ Op<-3>() = Cond;
+#ifndef NDEBUG
+ AssertOK();
+#endif
+}
+
+
+BranchInst::BranchInst(const BranchInst &BI) :
+ TerminatorInst(Type::VoidTy, Instruction::Br,
+ OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
+ BI.getNumOperands()) {
+ Op<-1>() = BI.Op<-1>();
+ if (BI.getNumOperands() != 1) {
+ assert(BI.getNumOperands() == 3 && "BR can have 1 or 3 operands!");
+ Op<-3>() = BI.Op<-3>();
+ Op<-2>() = BI.Op<-2>();
+ }
+}
+
+
+Use* Use::getPrefix() {
+ PointerIntPair<Use**, 2, PrevPtrTag> &PotentialPrefix(this[-1].Prev);
+ if (PotentialPrefix.getOpaqueValue())
+ return 0;
+
+ return reinterpret_cast<Use*>((char*)&PotentialPrefix + 1);
+}
+
+BranchInst::~BranchInst() {
+ if (NumOperands == 1) {
+ if (Use *Prefix = OperandList->getPrefix()) {
+ Op<-1>() = 0;
+ //
+ // mark OperandList to have a special value for scrutiny
+ // by baseclass destructors and operator delete
+ OperandList = Prefix;
+ } else {
+ NumOperands = 3;
+ OperandList = op_begin();
+ }
+ }
+}
+
+
+BasicBlock *BranchInst::getSuccessorV(unsigned idx) const {
+ return getSuccessor(idx);
+}
+unsigned BranchInst::getNumSuccessorsV() const {
+ return getNumSuccessors();
+}
+void BranchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
+ setSuccessor(idx, B);
+}
+
+
+//===----------------------------------------------------------------------===//
+// AllocationInst Implementation
+//===----------------------------------------------------------------------===//
+
+static Value *getAISize(Value *Amt) {
+ if (!Amt)
+ Amt = ConstantInt::get(Type::Int32Ty, 1);
+ else {
+ assert(!isa<BasicBlock>(Amt) &&
+ "Passed basic block into allocation size parameter! Use other ctor");
+ assert(Amt->getType() == Type::Int32Ty &&
+ "Malloc/Allocation array size is not a 32-bit integer!");
+ }
+ return Amt;
+}
+
+AllocationInst::AllocationInst(const Type *Ty, Value *ArraySize, unsigned iTy,
+ unsigned Align, const std::string &Name,
+ Instruction *InsertBefore)
+ : UnaryInstruction(PointerType::getUnqual(Ty), iTy, getAISize(ArraySize),
+ InsertBefore) {
+ setAlignment(Align);
+ assert(Ty != Type::VoidTy && "Cannot allocate void!");
+ setName(Name);
+}
+
+AllocationInst::AllocationInst(const Type *Ty, Value *ArraySize, unsigned iTy,
+ unsigned Align, const std::string &Name,
+ BasicBlock *InsertAtEnd)
+ : UnaryInstruction(PointerType::getUnqual(Ty), iTy, getAISize(ArraySize),
+ InsertAtEnd) {
+ setAlignment(Align);
+ assert(Ty != Type::VoidTy && "Cannot allocate void!");
+ setName(Name);
+}
+
+// Out-of-line virtual method, so the vtable, etc. has a home.
+AllocationInst::~AllocationInst() {
+}
+
+void AllocationInst::setAlignment(unsigned Align) {
+ assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
+ SubclassData = Log2_32(Align) + 1;
+ assert(getAlignment() == Align && "Alignment representation error!");
+}
+
+bool AllocationInst::isArrayAllocation() const {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(getOperand(0)))
+ return CI->getZExtValue() != 1;
+ return true;
+}
+
+const Type *AllocationInst::getAllocatedType() const {
+ return getType()->getElementType();
+}
+
+AllocaInst::AllocaInst(const AllocaInst &AI)
+ : AllocationInst(AI.getType()->getElementType(), (Value*)AI.getOperand(0),
+ Instruction::Alloca, AI.getAlignment()) {
+}
+
+/// isStaticAlloca - Return true if this alloca is in the entry block of the
+/// function and is a constant size. If so, the code generator will fold it
+/// into the prolog/epilog code, so it is basically free.
+bool AllocaInst::isStaticAlloca() const {
+ // Must be constant size.
+ if (!isa<ConstantInt>(getArraySize())) return false;
+
+ // Must be in the entry block.
+ const BasicBlock *Parent = getParent();
+ return Parent == &Parent->getParent()->front();
+}
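+
+// For illustration, this accepts e.g.
+//   entry:
+//     %buf = alloca [16 x i8]   ; constant size, in the entry block
+// but rejects "alloca i8, i32 %n" (non-constant size) and any alloca
+// outside the entry block.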
+
+MallocInst::MallocInst(const MallocInst &MI)
+ : AllocationInst(MI.getType()->getElementType(), (Value*)MI.getOperand(0),
+ Instruction::Malloc, MI.getAlignment()) {
+}
+
+//===----------------------------------------------------------------------===//
+// FreeInst Implementation
+//===----------------------------------------------------------------------===//
+
+void FreeInst::AssertOK() {
+ assert(isa<PointerType>(getOperand(0)->getType()) &&
+ "Can not free something of nonpointer type!");
+}
+
+FreeInst::FreeInst(Value *Ptr, Instruction *InsertBefore)
+ : UnaryInstruction(Type::VoidTy, Free, Ptr, InsertBefore) {
+ AssertOK();
+}
+
+FreeInst::FreeInst(Value *Ptr, BasicBlock *InsertAtEnd)
+ : UnaryInstruction(Type::VoidTy, Free, Ptr, InsertAtEnd) {
+ AssertOK();
+}
+
+
+//===----------------------------------------------------------------------===//
+// LoadInst Implementation
+//===----------------------------------------------------------------------===//
+
+void LoadInst::AssertOK() {
+ assert(isa<PointerType>(getOperand(0)->getType()) &&
+ "Ptr must have pointer type.");
+}
+
+LoadInst::LoadInst(Value *Ptr, const std::string &Name, Instruction *InsertBef)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertBef) {
+ setVolatile(false);
+ setAlignment(0);
+ AssertOK();
+ setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const std::string &Name, BasicBlock *InsertAE)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertAE) {
+ setVolatile(false);
+ setAlignment(0);
+ AssertOK();
+ setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const std::string &Name, bool isVolatile,
+ Instruction *InsertBef)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertBef) {
+ setVolatile(isVolatile);
+ setAlignment(0);
+ AssertOK();
+ setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const std::string &Name, bool isVolatile,
+ unsigned Align, Instruction *InsertBef)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertBef) {
+ setVolatile(isVolatile);
+ setAlignment(Align);
+ AssertOK();
+ setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const std::string &Name, bool isVolatile,
+ unsigned Align, BasicBlock *InsertAE)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertAE) {
+ setVolatile(isVolatile);
+ setAlignment(Align);
+ AssertOK();
+ setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const std::string &Name, bool isVolatile,
+ BasicBlock *InsertAE)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertAE) {
+ setVolatile(isVolatile);
+ setAlignment(0);
+ AssertOK();
+ setName(Name);
+}
+
+
+
+LoadInst::LoadInst(Value *Ptr, const char *Name, Instruction *InsertBef)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertBef) {
+ setVolatile(false);
+ setAlignment(0);
+ AssertOK();
+ if (Name && Name[0]) setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const char *Name, BasicBlock *InsertAE)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertAE) {
+ setVolatile(false);
+ setAlignment(0);
+ AssertOK();
+ if (Name && Name[0]) setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const char *Name, bool isVolatile,
+ Instruction *InsertBef)
+: UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertBef) {
+ setVolatile(isVolatile);
+ setAlignment(0);
+ AssertOK();
+ if (Name && Name[0]) setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const char *Name, bool isVolatile,
+ BasicBlock *InsertAE)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertAE) {
+ setVolatile(isVolatile);
+ setAlignment(0);
+ AssertOK();
+ if (Name && Name[0]) setName(Name);
+}
+
+void LoadInst::setAlignment(unsigned Align) {
+ assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
+ SubclassData = (SubclassData & 1) | ((Log2_32(Align)+1)<<1);
+}
+
+//===----------------------------------------------------------------------===//
+// StoreInst Implementation
+//===----------------------------------------------------------------------===//
+
+void StoreInst::AssertOK() {
+ assert(getOperand(0) && getOperand(1) && "Both operands must be non-null!");
+ assert(isa<PointerType>(getOperand(1)->getType()) &&
+ "Ptr must have pointer type!");
+ assert(getOperand(0)->getType() ==
+ cast<PointerType>(getOperand(1)->getType())->getElementType()
+ && "Ptr must be a pointer to Val type!");
+}
+
+
+StoreInst::StoreInst(Value *val, Value *addr, Instruction *InsertBefore)
+ : Instruction(Type::VoidTy, Store,
+ OperandTraits<StoreInst>::op_begin(this),
+ OperandTraits<StoreInst>::operands(this),
+ InsertBefore) {
+ Op<0>() = val;
+ Op<1>() = addr;
+ setVolatile(false);
+ setAlignment(0);
+ AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, BasicBlock *InsertAtEnd)
+ : Instruction(Type::VoidTy, Store,
+ OperandTraits<StoreInst>::op_begin(this),
+ OperandTraits<StoreInst>::operands(this),
+ InsertAtEnd) {
+ Op<0>() = val;
+ Op<1>() = addr;
+ setVolatile(false);
+ setAlignment(0);
+ AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+ Instruction *InsertBefore)
+ : Instruction(Type::VoidTy, Store,
+ OperandTraits<StoreInst>::op_begin(this),
+ OperandTraits<StoreInst>::operands(this),
+ InsertBefore) {
+ Op<0>() = val;
+ Op<1>() = addr;
+ setVolatile(isVolatile);
+ setAlignment(0);
+ AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+ unsigned Align, Instruction *InsertBefore)
+ : Instruction(Type::VoidTy, Store,
+ OperandTraits<StoreInst>::op_begin(this),
+ OperandTraits<StoreInst>::operands(this),
+ InsertBefore) {
+ Op<0>() = val;
+ Op<1>() = addr;
+ setVolatile(isVolatile);
+ setAlignment(Align);
+ AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+ unsigned Align, BasicBlock *InsertAtEnd)
+ : Instruction(Type::VoidTy, Store,
+ OperandTraits<StoreInst>::op_begin(this),
+ OperandTraits<StoreInst>::operands(this),
+ InsertAtEnd) {
+ Op<0>() = val;
+ Op<1>() = addr;
+ setVolatile(isVolatile);
+ setAlignment(Align);
+ AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+ BasicBlock *InsertAtEnd)
+ : Instruction(Type::VoidTy, Store,
+ OperandTraits<StoreInst>::op_begin(this),
+ OperandTraits<StoreInst>::operands(this),
+ InsertAtEnd) {
+ Op<0>() = val;
+ Op<1>() = addr;
+ setVolatile(isVolatile);
+ setAlignment(0);
+ AssertOK();
+}
+
+void StoreInst::setAlignment(unsigned Align) {
+ assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
+ SubclassData = (SubclassData & 1) | ((Log2_32(Align)+1)<<1);
+}
+
+//===----------------------------------------------------------------------===//
+// GetElementPtrInst Implementation
+//===----------------------------------------------------------------------===//
+
+static unsigned retrieveAddrSpace(const Value *Val) {
+ return cast<PointerType>(Val->getType())->getAddressSpace();
+}
+
+void GetElementPtrInst::init(Value *Ptr, Value* const *Idx, unsigned NumIdx,
+ const std::string &Name) {
+ assert(NumOperands == 1+NumIdx && "NumOperands not initialized?");
+ Use *OL = OperandList;
+ OL[0] = Ptr;
+
+ for (unsigned i = 0; i != NumIdx; ++i)
+ OL[i+1] = Idx[i];
+
+ setName(Name);
+}
+
+void GetElementPtrInst::init(Value *Ptr, Value *Idx, const std::string &Name) {
+ assert(NumOperands == 2 && "NumOperands not initialized?");
+ Use *OL = OperandList;
+ OL[0] = Ptr;
+ OL[1] = Idx;
+
+ setName(Name);
+}
+
+GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI)
+ : Instruction(GEPI.getType(), GetElementPtr,
+ OperandTraits<GetElementPtrInst>::op_end(this)
+ - GEPI.getNumOperands(),
+ GEPI.getNumOperands()) {
+ Use *OL = OperandList;
+ Use *GEPIOL = GEPI.OperandList;
+ for (unsigned i = 0, E = NumOperands; i != E; ++i)
+ OL[i] = GEPIOL[i];
+}
+
+GetElementPtrInst::GetElementPtrInst(Value *Ptr, Value *Idx,
+ const std::string &Name, Instruction *InBe)
+ : Instruction(PointerType::get(checkType(getIndexedType(Ptr->getType(),Idx)),
+ retrieveAddrSpace(Ptr)),
+ GetElementPtr,
+ OperandTraits<GetElementPtrInst>::op_end(this) - 2,
+ 2, InBe) {
+ init(Ptr, Idx, Name);
+}
+
+GetElementPtrInst::GetElementPtrInst(Value *Ptr, Value *Idx,
+ const std::string &Name, BasicBlock *IAE)
+ : Instruction(PointerType::get(checkType(getIndexedType(Ptr->getType(),Idx)),
+ retrieveAddrSpace(Ptr)),
+ GetElementPtr,
+ OperandTraits<GetElementPtrInst>::op_end(this) - 2,
+ 2, IAE) {
+ init(Ptr, Idx, Name);
+}
+
+/// getIndexedType - Returns the type of the element that would be accessed with
+/// a gep instruction with the specified parameters.
+///
+/// The Idxs pointer should point to a contiguous piece of memory containing the
+/// indices, either as Value* or uint64_t.
+///
+/// A null type is returned if the indices are invalid for the specified
+/// pointer type.
+///
+template <typename IndexTy>
+static const Type* getIndexedTypeInternal(const Type *Ptr, IndexTy const *Idxs,
+ unsigned NumIdx) {
+ const PointerType *PTy = dyn_cast<PointerType>(Ptr);
+ if (!PTy) return 0; // Type isn't a pointer type!
+ const Type *Agg = PTy->getElementType();
+
+ // Handle the special case of the empty index set, which is always valid.
+ if (NumIdx == 0)
+ return Agg;
+
+ // If there is at least one index, the top-level type must be sized,
+ // otherwise it cannot be 'stepped over'. We explicitly allow abstract types
+ // (those that contain opaque types) under the assumption that they will be
+ // resolved to sane types later.
+ if (!Agg->isSized() && !Agg->isAbstract())
+ return 0;
+
+ unsigned CurIdx = 1;
+ for (; CurIdx != NumIdx; ++CurIdx) {
+ const CompositeType *CT = dyn_cast<CompositeType>(Agg);
+ if (!CT || isa<PointerType>(CT)) return 0;
+ IndexTy Index = Idxs[CurIdx];
+ if (!CT->indexValid(Index)) return 0;
+ Agg = CT->getTypeAtIndex(Index);
+
+ // If the new type forwards to another type, then it is in the middle
+ // of being refined to another type (and hence, may have dropped all
+ // references to what it was using before). So, use the new forwarded
+ // type.
+ if (const Type *Ty = Agg->getForwardedType())
+ Agg = Ty;
+ }
+ return CurIdx == NumIdx ? Agg : 0;
+}
+
+const Type* GetElementPtrInst::getIndexedType(const Type *Ptr,
+ Value* const *Idxs,
+ unsigned NumIdx) {
+ return getIndexedTypeInternal(Ptr, Idxs, NumIdx);
+}
+
+const Type* GetElementPtrInst::getIndexedType(const Type *Ptr,
+ uint64_t const *Idxs,
+ unsigned NumIdx) {
+ return getIndexedTypeInternal(Ptr, Idxs, NumIdx);
+}
+
+const Type* GetElementPtrInst::getIndexedType(const Type *Ptr, Value *Idx) {
+ const PointerType *PTy = dyn_cast<PointerType>(Ptr);
+ if (!PTy) return 0; // Type isn't a pointer type!
+
+ // Check the pointer index.
+ if (!PTy->indexValid(Idx)) return 0;
+
+ return PTy->getElementType();
+}
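+
+// For illustration, with %T = { i32, [4 x float] } the index list
+// (0, 1, 2) applied to a %T* steps pointer -> struct -> array -> float,
+// so "getelementptr %T* %p, i32 0, i32 1, i32 2" has type float*.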
+
+
+/// hasAllZeroIndices - Return true if all of the indices of this GEP are
+/// zeros. If so, the result pointer and the first operand have the same
+/// value, just potentially different types.
+bool GetElementPtrInst::hasAllZeroIndices() const {
+ for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(getOperand(i))) {
+ if (!CI->isZero()) return false;
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
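+
+// For illustration:
+//   %p = getelementptr { i32, float }* %s, i32 0, i32 0
+// has all-zero indices, so %p points at the same address as %s and
+// differs only in type (i32* instead of { i32, float }*).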
+
+/// hasAllConstantIndices - Return true if all of the indices of this GEP are
+/// constant integers. If so, the result pointer and the first operand have
+/// a constant offset between them.
+bool GetElementPtrInst::hasAllConstantIndices() const {
+ for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
+ if (!isa<ConstantInt>(getOperand(i)))
+ return false;
+ }
+ return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+// ExtractElementInst Implementation
+//===----------------------------------------------------------------------===//
+
+ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
+ const std::string &Name,
+ Instruction *InsertBef)
+ : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+ ExtractElement,
+ OperandTraits<ExtractElementInst>::op_begin(this),
+ 2, InsertBef) {
+ assert(isValidOperands(Val, Index) &&
+ "Invalid extractelement instruction operands!");
+ Op<0>() = Val;
+ Op<1>() = Index;
+ setName(Name);
+}
+
+ExtractElementInst::ExtractElementInst(Value *Val, unsigned IndexV,
+ const std::string &Name,
+ Instruction *InsertBef)
+ : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+ ExtractElement,
+ OperandTraits<ExtractElementInst>::op_begin(this),
+ 2, InsertBef) {
+ Constant *Index = ConstantInt::get(Type::Int32Ty, IndexV);
+ assert(isValidOperands(Val, Index) &&
+ "Invalid extractelement instruction operands!");
+ Op<0>() = Val;
+ Op<1>() = Index;
+ setName(Name);
+}
+
+
+ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
+ const std::string &Name,
+ BasicBlock *InsertAE)
+ : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+ ExtractElement,
+ OperandTraits<ExtractElementInst>::op_begin(this),
+ 2, InsertAE) {
+ assert(isValidOperands(Val, Index) &&
+ "Invalid extractelement instruction operands!");
+
+ Op<0>() = Val;
+ Op<1>() = Index;
+ setName(Name);
+}
+
+ExtractElementInst::ExtractElementInst(Value *Val, unsigned IndexV,
+ const std::string &Name,
+ BasicBlock *InsertAE)
+ : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+ ExtractElement,
+ OperandTraits<ExtractElementInst>::op_begin(this),
+ 2, InsertAE) {
+ Constant *Index = ConstantInt::get(Type::Int32Ty, IndexV);
+ assert(isValidOperands(Val, Index) &&
+ "Invalid extractelement instruction operands!");
+
+ Op<0>() = Val;
+ Op<1>() = Index;
+ setName(Name);
+}
+
+
+bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
+ if (!isa<VectorType>(Val->getType()) || Index->getType() != Type::Int32Ty)
+ return false;
+ return true;
+}
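+
+// For illustration, "extractelement <4 x float> %v, i32 2" is valid;
+// a non-vector first operand or a non-i32 index is rejected.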
+
+
+//===----------------------------------------------------------------------===//
+// InsertElementInst Implementation
+//===----------------------------------------------------------------------===//
+
+InsertElementInst::InsertElementInst(const InsertElementInst &IE)
+ : Instruction(IE.getType(), InsertElement,
+ OperandTraits<InsertElementInst>::op_begin(this), 3) {
+ Op<0>() = IE.Op<0>();
+ Op<1>() = IE.Op<1>();
+ Op<2>() = IE.Op<2>();
+}
+InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index,
+ const std::string &Name,
+ Instruction *InsertBef)
+ : Instruction(Vec->getType(), InsertElement,
+ OperandTraits<InsertElementInst>::op_begin(this),
+ 3, InsertBef) {
+ assert(isValidOperands(Vec, Elt, Index) &&
+ "Invalid insertelement instruction operands!");
+ Op<0>() = Vec;
+ Op<1>() = Elt;
+ Op<2>() = Index;
+ setName(Name);
+}
+
+InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, unsigned IndexV,
+ const std::string &Name,
+ Instruction *InsertBef)
+ : Instruction(Vec->getType(), InsertElement,
+ OperandTraits<InsertElementInst>::op_begin(this),
+ 3, InsertBef) {
+ Constant *Index = ConstantInt::get(Type::Int32Ty, IndexV);
+ assert(isValidOperands(Vec, Elt, Index) &&
+ "Invalid insertelement instruction operands!");
+ Op<0>() = Vec;
+ Op<1>() = Elt;
+ Op<2>() = Index;
+ setName(Name);
+}
+
+
+InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index,
+ const std::string &Name,
+ BasicBlock *InsertAE)
+ : Instruction(Vec->getType(), InsertElement,
+ OperandTraits<InsertElementInst>::op_begin(this),
+ 3, InsertAE) {
+ assert(isValidOperands(Vec, Elt, Index) &&
+ "Invalid insertelement instruction operands!");
+
+ Op<0>() = Vec;
+ Op<1>() = Elt;
+ Op<2>() = Index;
+ setName(Name);
+}
+
+InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, unsigned IndexV,
+ const std::string &Name,
+ BasicBlock *InsertAE)
+: Instruction(Vec->getType(), InsertElement,
+ OperandTraits<InsertElementInst>::op_begin(this),
+ 3, InsertAE) {
+ Constant *Index = ConstantInt::get(Type::Int32Ty, IndexV);
+ assert(isValidOperands(Vec, Elt, Index) &&
+ "Invalid insertelement instruction operands!");
+
+ Op<0>() = Vec;
+ Op<1>() = Elt;
+ Op<2>() = Index;
+ setName(Name);
+}
+
+bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
+ const Value *Index) {
+ if (!isa<VectorType>(Vec->getType()))
+ return false; // First operand of insertelement must be vector type.
+
+ if (Elt->getType() != cast<VectorType>(Vec->getType())->getElementType())
+ return false; // Second operand of insertelement must be vector element type.
+
+ if (Index->getType() != Type::Int32Ty)
+ return false; // Third operand of insertelement must be i32.
+ return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+// ShuffleVectorInst Implementation
+//===----------------------------------------------------------------------===//
+
+ShuffleVectorInst::ShuffleVectorInst(const ShuffleVectorInst &SV)
+ : Instruction(SV.getType(), ShuffleVector,
+ OperandTraits<ShuffleVectorInst>::op_begin(this),
+ OperandTraits<ShuffleVectorInst>::operands(this)) {
+ Op<0>() = SV.Op<0>();
+ Op<1>() = SV.Op<1>();
+ Op<2>() = SV.Op<2>();
+}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
+ const std::string &Name,
+ Instruction *InsertBefore)
+: Instruction(VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
+ cast<VectorType>(Mask->getType())->getNumElements()),
+ ShuffleVector,
+ OperandTraits<ShuffleVectorInst>::op_begin(this),
+ OperandTraits<ShuffleVectorInst>::operands(this),
+ InsertBefore) {
+ assert(isValidOperands(V1, V2, Mask) &&
+ "Invalid shuffle vector instruction operands!");
+ Op<0>() = V1;
+ Op<1>() = V2;
+ Op<2>() = Mask;
+ setName(Name);
+}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
+ const std::string &Name,
+ BasicBlock *InsertAtEnd)
+ : Instruction(V1->getType(), ShuffleVector,
+ OperandTraits<ShuffleVectorInst>::op_begin(this),
+ OperandTraits<ShuffleVectorInst>::operands(this),
+ InsertAtEnd) {
+ assert(isValidOperands(V1, V2, Mask) &&
+ "Invalid shuffle vector instruction operands!");
+
+ Op<0>() = V1;
+ Op<1>() = V2;
+ Op<2>() = Mask;
+ setName(Name);
+}
+
+bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
+ const Value *Mask) {
+ if (!isa<VectorType>(V1->getType()) || V1->getType() != V2->getType())
+ return false;
+
+ const VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
+ if (!isa<Constant>(Mask) || MaskTy == 0 ||
+ MaskTy->getElementType() != Type::Int32Ty)
+ return false;
+ return true;
+}
+
+/// getMaskValue - Return the index from the shuffle mask for the specified
+/// output result. This is either -1 if the element is undef or a number less
+/// than 2*numelements.
+int ShuffleVectorInst::getMaskValue(unsigned i) const {
+ const Constant *Mask = cast<Constant>(getOperand(2));
+ if (isa<UndefValue>(Mask)) return -1;
+ if (isa<ConstantAggregateZero>(Mask)) return 0;
+ const ConstantVector *MaskCV = cast<ConstantVector>(Mask);
+ assert(i < MaskCV->getNumOperands() && "Index out of range");
+
+ if (isa<UndefValue>(MaskCV->getOperand(i)))
+ return -1;
+ return cast<ConstantInt>(MaskCV->getOperand(i))->getZExtValue();
+}
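+
+// For illustration:
+//   %r = shufflevector <2 x i32> %a, <2 x i32> %b,
+//                      <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
+// getMaskValue(0..3) returns 0, 3, -1, 1; values >= 2 (the width of %a)
+// select from the second input vector, %b.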
+
+//===----------------------------------------------------------------------===//
+// InsertValueInst Class
+//===----------------------------------------------------------------------===//
+
+void InsertValueInst::init(Value *Agg, Value *Val, const unsigned *Idx,
+ unsigned NumIdx, const std::string &Name) {
+ assert(NumOperands == 2 && "NumOperands not initialized?");
+ Op<0>() = Agg;
+ Op<1>() = Val;
+
+ Indices.insert(Indices.end(), Idx, Idx + NumIdx);
+ setName(Name);
+}
+
+void InsertValueInst::init(Value *Agg, Value *Val, unsigned Idx,
+ const std::string &Name) {
+ assert(NumOperands == 2 && "NumOperands not initialized?");
+ Op<0>() = Agg;
+ Op<1>() = Val;
+
+ Indices.push_back(Idx);
+ setName(Name);
+}
+
+InsertValueInst::InsertValueInst(const InsertValueInst &IVI)
+ : Instruction(IVI.getType(), InsertValue,
+ OperandTraits<InsertValueInst>::op_begin(this), 2),
+ Indices(IVI.Indices) {
+ Op<0>() = IVI.getOperand(0);
+ Op<1>() = IVI.getOperand(1);
+}
+
+InsertValueInst::InsertValueInst(Value *Agg,
+ Value *Val,
+ unsigned Idx,
+ const std::string &Name,
+ Instruction *InsertBefore)
+ : Instruction(Agg->getType(), InsertValue,
+ OperandTraits<InsertValueInst>::op_begin(this),
+ 2, InsertBefore) {
+ init(Agg, Val, Idx, Name);
+}
+
+InsertValueInst::InsertValueInst(Value *Agg,
+ Value *Val,
+ unsigned Idx,
+ const std::string &Name,
+ BasicBlock *InsertAtEnd)
+ : Instruction(Agg->getType(), InsertValue,
+ OperandTraits<InsertValueInst>::op_begin(this),
+ 2, InsertAtEnd) {
+ init(Agg, Val, Idx, Name);
+}
+
+//===----------------------------------------------------------------------===//
+// ExtractValueInst Class
+//===----------------------------------------------------------------------===//
+
+void ExtractValueInst::init(const unsigned *Idx, unsigned NumIdx,
+ const std::string &Name) {
+ assert(NumOperands == 1 && "NumOperands not initialized?");
+
+ Indices.insert(Indices.end(), Idx, Idx + NumIdx);
+ setName(Name);
+}
+
+void ExtractValueInst::init(unsigned Idx, const std::string &Name) {
+ assert(NumOperands == 1 && "NumOperands not initialized?");
+
+ Indices.push_back(Idx);
+ setName(Name);
+}
+
+ExtractValueInst::ExtractValueInst(const ExtractValueInst &EVI)
+ : UnaryInstruction(EVI.getType(), ExtractValue, EVI.getOperand(0)),
+ Indices(EVI.Indices) {
+}
+
+// getIndexedType - Returns the type of the element that would be extracted
+// with an extractvalue instruction with the specified parameters.
+//
+// A null type is returned if the indices are invalid for the specified
+// aggregate type.
+//
+const Type* ExtractValueInst::getIndexedType(const Type *Agg,
+ const unsigned *Idxs,
+ unsigned NumIdx) {
+ unsigned CurIdx = 0;
+ for (; CurIdx != NumIdx; ++CurIdx) {
+ const CompositeType *CT = dyn_cast<CompositeType>(Agg);
+ if (!CT || isa<PointerType>(CT) || isa<VectorType>(CT)) return 0;
+ unsigned Index = Idxs[CurIdx];
+ if (!CT->indexValid(Index)) return 0;
+ Agg = CT->getTypeAtIndex(Index);
+
+ // If the new type forwards to another type, then it is in the middle
+ // of being refined to another type (and hence, may have dropped all
+ // references to what it was using before). So, use the new forwarded
+ // type.
+ if (const Type *Ty = Agg->getForwardedType())
+ Agg = Ty;
+ }
+ return CurIdx == NumIdx ? Agg : 0;
+}
+
+const Type* ExtractValueInst::getIndexedType(const Type *Agg,
+ unsigned Idx) {
+ return getIndexedType(Agg, &Idx, 1);
+}
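+
+// For illustration, with %T = { i32, { float, [2 x i8] } } the index
+// list (1, 1, 0) selects i8, as in
+//   %c = extractvalue %T %agg, 1, 1, 0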
+
+//===----------------------------------------------------------------------===//
+// BinaryOperator Class
+//===----------------------------------------------------------------------===//
+
+BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
+ const Type *Ty, const std::string &Name,
+ Instruction *InsertBefore)
+ : Instruction(Ty, iType,
+ OperandTraits<BinaryOperator>::op_begin(this),
+ OperandTraits<BinaryOperator>::operands(this),
+ InsertBefore) {
+ Op<0>() = S1;
+ Op<1>() = S2;
+ init(iType);
+ setName(Name);
+}
+
+BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
+ const Type *Ty, const std::string &Name,
+ BasicBlock *InsertAtEnd)
+ : Instruction(Ty, iType,
+ OperandTraits<BinaryOperator>::op_begin(this),
+ OperandTraits<BinaryOperator>::operands(this),
+ InsertAtEnd) {
+ Op<0>() = S1;
+ Op<1>() = S2;
+ init(iType);
+ setName(Name);
+}
+
+
+void BinaryOperator::init(BinaryOps iType) {
+ Value *LHS = getOperand(0), *RHS = getOperand(1);
+ LHS = LHS; RHS = RHS; // Silence warnings.
+ assert(LHS->getType() == RHS->getType() &&
+ "Binary operator operand types must match!");
+#ifndef NDEBUG
+ switch (iType) {
+ case Add: case Sub:
+ case Mul:
+ assert(getType() == LHS->getType() &&
+ "Arithmetic operation should return same type as operands!");
+ assert((getType()->isInteger() || getType()->isFloatingPoint() ||
+ isa<VectorType>(getType())) &&
+ "Tried to create an arithmetic operation on a non-arithmetic type!");
+ break;
+ case UDiv:
+ case SDiv:
+ assert(getType() == LHS->getType() &&
+ "Arithmetic operation should return same type as operands!");
+ assert((getType()->isInteger() || (isa<VectorType>(getType()) &&
+ cast<VectorType>(getType())->getElementType()->isInteger())) &&
+ "Incorrect operand type (not integer) for S/UDIV");
+ break;
+ case FDiv:
+ assert(getType() == LHS->getType() &&
+ "Arithmetic operation should return same type as operands!");
+ assert((getType()->isFloatingPoint() || (isa<VectorType>(getType()) &&
+ cast<VectorType>(getType())->getElementType()->isFloatingPoint()))
+ && "Incorrect operand type (not floating point) for FDIV");
+ break;
+ case URem:
+ case SRem:
+ assert(getType() == LHS->getType() &&
+ "Arithmetic operation should return same type as operands!");
+ assert((getType()->isInteger() || (isa<VectorType>(getType()) &&
+ cast<VectorType>(getType())->getElementType()->isInteger())) &&
+ "Incorrect operand type (not integer) for S/UREM");
+ break;
+ case FRem:
+ assert(getType() == LHS->getType() &&
+ "Arithmetic operation should return same type as operands!");
+ assert((getType()->isFloatingPoint() || (isa<VectorType>(getType()) &&
+ cast<VectorType>(getType())->getElementType()->isFloatingPoint()))
+ && "Incorrect operand type (not floating point) for FREM");
+ break;
+ case Shl:
+ case LShr:
+ case AShr:
+ assert(getType() == LHS->getType() &&
+ "Shift operation should return same type as operands!");
+ assert((getType()->isInteger() ||
+ (isa<VectorType>(getType()) &&
+ cast<VectorType>(getType())->getElementType()->isInteger())) &&
+ "Tried to create a shift operation on a non-integral type!");
+ break;
+ case And: case Or:
+ case Xor:
+ assert(getType() == LHS->getType() &&
+ "Logical operation should return same type as operands!");
+ assert((getType()->isInteger() ||
+ (isa<VectorType>(getType()) &&
+ cast<VectorType>(getType())->getElementType()->isInteger())) &&
+ "Tried to create a logical operation on a non-integral type!");
+ break;
+ default:
+ break;
+ }
+#endif
+}
+
+BinaryOperator *BinaryOperator::Create(BinaryOps Op, Value *S1, Value *S2,
+ const std::string &Name,
+ Instruction *InsertBefore) {
+ assert(S1->getType() == S2->getType() &&
+ "Cannot create binary operator with two operands of differing type!");
+ return new BinaryOperator(Op, S1, S2, S1->getType(), Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::Create(BinaryOps Op, Value *S1, Value *S2,
+ const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ BinaryOperator *Res = Create(Op, S1, S2, Name);
+ InsertAtEnd->getInstList().push_back(Res);
+ return Res;
+}
+
+BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const std::string &Name,
+ Instruction *InsertBefore) {
+ Value *zero = ConstantExpr::getZeroValueForNegationExpr(Op->getType());
+ return new BinaryOperator(Instruction::Sub,
+ zero, Op,
+ Op->getType(), Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ Value *zero = ConstantExpr::getZeroValueForNegationExpr(Op->getType());
+ return new BinaryOperator(Instruction::Sub,
+ zero, Op,
+ Op->getType(), Name, InsertAtEnd);
+}
+
+BinaryOperator *BinaryOperator::CreateNot(Value *Op, const std::string &Name,
+ Instruction *InsertBefore) {
+ Constant *C;
+ if (const VectorType *PTy = dyn_cast<VectorType>(Op->getType())) {
+ C = ConstantInt::getAllOnesValue(PTy->getElementType());
+ C = ConstantVector::get(std::vector<Constant*>(PTy->getNumElements(), C));
+ } else {
+ C = ConstantInt::getAllOnesValue(Op->getType());
+ }
+
+ return new BinaryOperator(Instruction::Xor, Op, C,
+ Op->getType(), Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::CreateNot(Value *Op, const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ Constant *AllOnes;
+ if (const VectorType *PTy = dyn_cast<VectorType>(Op->getType())) {
+ // Create a vector of all ones values.
+ Constant *Elt = ConstantInt::getAllOnesValue(PTy->getElementType());
+ AllOnes =
+ ConstantVector::get(std::vector<Constant*>(PTy->getNumElements(), Elt));
+ } else {
+ AllOnes = ConstantInt::getAllOnesValue(Op->getType());
+ }
+
+ return new BinaryOperator(Instruction::Xor, Op, AllOnes,
+ Op->getType(), Name, InsertAtEnd);
+}
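+
+// For illustration, CreateNot on an i32 operand emits
+//   xor i32 %op, -1
+// and on a <4 x i32> operand a xor against the all-ones splat built above.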
+
+
+// isConstantAllOnes - Helper function for several functions below
+static inline bool isConstantAllOnes(const Value *V) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
+ return CI->isAllOnesValue();
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(V))
+ return CV->isAllOnesValue();
+ return false;
+}
+
+bool BinaryOperator::isNeg(const Value *V) {
+ if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
+ if (Bop->getOpcode() == Instruction::Sub)
+ return Bop->getOperand(0) ==
+ ConstantExpr::getZeroValueForNegationExpr(Bop->getType());
+ return false;
+}
+
+bool BinaryOperator::isNot(const Value *V) {
+ if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
+ return (Bop->getOpcode() == Instruction::Xor &&
+ (isConstantAllOnes(Bop->getOperand(1)) ||
+ isConstantAllOnes(Bop->getOperand(0))));
+ return false;
+}
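+
+// For illustration, isNeg matches "sub i32 0, %x" and isNot matches
+// "xor i32 %x, -1" (with the all-ones constant on either side).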
+
+Value *BinaryOperator::getNegArgument(Value *BinOp) {
+ assert(isNeg(BinOp) && "getNegArgument from non-'neg' instruction!");
+ return cast<BinaryOperator>(BinOp)->getOperand(1);
+}
+
+const Value *BinaryOperator::getNegArgument(const Value *BinOp) {
+ return getNegArgument(const_cast<Value*>(BinOp));
+}
+
+Value *BinaryOperator::getNotArgument(Value *BinOp) {
+ assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!");
+ BinaryOperator *BO = cast<BinaryOperator>(BinOp);
+ Value *Op0 = BO->getOperand(0);
+ Value *Op1 = BO->getOperand(1);
+ if (isConstantAllOnes(Op0)) return Op1;
+
+ assert(isConstantAllOnes(Op1));
+ return Op0;
+}
+
+const Value *BinaryOperator::getNotArgument(const Value *BinOp) {
+ return getNotArgument(const_cast<Value*>(BinOp));
+}
+
+
+// swapOperands - Exchange the two operands to this instruction. This
+// method is safe to use on any binary instruction and does not modify the
+// semantics of the instruction. If the instruction is order-dependent
+// (e.g. SetLT), the opcode is changed.
+//
+bool BinaryOperator::swapOperands() {
+ if (!isCommutative())
+ return true; // Can't commute operands
+ Op<0>().swap(Op<1>());
+ return false;
+}
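+
+// For illustration, swapOperands turns "add i32 %x, %y" into
+// "add i32 %y, %x" and returns false; on a non-commutative opcode such
+// as sub it changes nothing and returns true.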
+
+//===----------------------------------------------------------------------===//
+// CastInst Class
+//===----------------------------------------------------------------------===//
+
+// Just determine if this cast only deals with integral->integral conversion.
+bool CastInst::isIntegerCast() const {
+ switch (getOpcode()) {
+ default: return false;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Trunc:
+ return true;
+ case Instruction::BitCast:
+ return getOperand(0)->getType()->isInteger() && getType()->isInteger();
+ }
+}
+
+bool CastInst::isLosslessCast() const {
+ // Only BitCast can be lossless, exit fast if we're not BitCast
+ if (getOpcode() != Instruction::BitCast)
+ return false;
+
+ // Identity cast is always lossless
+ const Type* SrcTy = getOperand(0)->getType();
+ const Type* DstTy = getType();
+ if (SrcTy == DstTy)
+ return true;
+
+ // Pointer to pointer is always lossless.
+ if (isa<PointerType>(SrcTy))
+ return isa<PointerType>(DstTy);
+ return false; // Other types have no identity values
+}
+
+/// This function determines if the CastInst does not require any bits to be
+/// changed in order to effect the cast. Essentially, it identifies cases where
+/// no code gen is necessary for the cast, hence the name no-op cast. For
+/// example, the following are all no-op casts:
+/// # bitcast i32* %x to i8*
+/// # bitcast <2 x i32> %x to <4 x i16>
+/// # ptrtoint i32* %x to i32 ; on 32-bit platforms only
+/// @brief Determine if a cast is a no-op.
+bool CastInst::isNoopCast(const Type *IntPtrTy) const {
+ switch (getOpcode()) {
+ default:
+ assert(!"Invalid CastOp");
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ return false; // These always modify bits
+ case Instruction::BitCast:
+ return true; // BitCast never modifies bits.
+ case Instruction::PtrToInt:
+ return IntPtrTy->getPrimitiveSizeInBits() ==
+ getType()->getPrimitiveSizeInBits();
+ case Instruction::IntToPtr:
+ return IntPtrTy->getPrimitiveSizeInBits() ==
+ getOperand(0)->getType()->getPrimitiveSizeInBits();
+ }
+}
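+
+// For illustration, on a target whose IntPtrTy is i32,
+//   ptrtoint i8* %p to i32   ; no-op cast
+//   ptrtoint i8* %p to i64   ; not a no-op, result is wider than a pointer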
+
+/// This function determines if a pair of casts can be eliminated and what
+/// opcode should be used in the elimination. This assumes that there are two
+/// instructions like this:
+/// * %F = firstOpcode SrcTy %x to MidTy
+/// * %S = secondOpcode MidTy %F to DstTy
+/// The function returns a resultOpcode so these two casts can be replaced with:
+/// * %Replacement = resultOpcode %SrcTy %x to DstTy
+/// If no such cast is permitted, the function returns 0.
+unsigned CastInst::isEliminableCastPair(
+ Instruction::CastOps firstOp, Instruction::CastOps secondOp,
+ const Type *SrcTy, const Type *MidTy, const Type *DstTy, const Type *IntPtrTy)
+{
+ // Define the 144 possibilities for these two cast instructions. The values
+ // in this matrix determine what to do in a given situation and select the
+ // case in the switch below. The rows correspond to firstOp, the columns
+ // correspond to secondOp. In looking at the table below, keep in mind
+ // the following cast properties:
+ //
+ // Size Compare Source Destination
+ // Operator Src ? Size Type Sign Type Sign
+ // -------- ------------ ------------------- ---------------------
+ // TRUNC > Integer Any Integral Any
+ // ZEXT < Integral Unsigned Integer Any
+ // SEXT < Integral Signed Integer Any
+ // FPTOUI n/a FloatPt n/a Integral Unsigned
+ // FPTOSI n/a FloatPt n/a Integral Signed
+ // UITOFP n/a Integral Unsigned FloatPt n/a
+ // SITOFP n/a Integral Signed FloatPt n/a
+ // FPTRUNC > FloatPt n/a FloatPt n/a
+ // FPEXT < FloatPt n/a FloatPt n/a
+ // PTRTOINT n/a Pointer n/a Integral Unsigned
+ // INTTOPTR n/a Integral Unsigned Pointer n/a
+ // BITCONVERT = FirstClass n/a FirstClass n/a
+ //
+ // NOTE: some transforms are safe, but we consider them to be non-profitable.
+ // For example, we could merge "fptoui double to uint" + "zext uint to ulong",
+ // into "fptoui double to ulong", but this loses information about the range
+ // of the produced value (we no longer know the top-part is all zeros).
+ // Further this conversion is often much more expensive for typical hardware,
+ // and causes issues when building libgcc. We disallow fptosi+sext for the
+ // same reason.
+ const unsigned numCastOps =
+ Instruction::CastOpsEnd - Instruction::CastOpsBegin;
+ static const uint8_t CastResults[numCastOps][numCastOps] = {
+ //             T        F  F  U  S  F  F  P  I  B   -+
+ //             R  Z  S  P  P  I  I  T  P  2  N  T    |
+ //             U  E  E  2  2  2  2  R  E  I  T  C    +- secondOp
+ //             N  X  X  U  S  F  F  N  X  N  2  V    |
+ //             C  T  T  I  I  P  P  C  T  T  P  T   -+
+ { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // Trunc -+
+ { 8, 1, 9,99,99, 2, 0,99,99,99, 2, 3 }, // ZExt |
+ { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3 }, // SExt |
+ { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToUI |
+ { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToSI |
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // UIToFP +- firstOp
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // SIToFP |
+ { 99,99,99, 0, 0,99,99, 1, 0,99,99, 4 }, // FPTrunc |
+ { 99,99,99, 2, 2,99,99,10, 2,99,99, 4 }, // FPExt |
+ { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3 }, // PtrToInt |
+ { 99,99,99,99,99,99,99,99,99,13,99,12 }, // IntToPtr |
+ { 5, 5, 5, 6, 6, 5, 5, 6, 6,11, 5, 1 }, // BitCast -+
+ };
+
+ int ElimCase = CastResults[firstOp-Instruction::CastOpsBegin]
+ [secondOp-Instruction::CastOpsBegin];
+ switch (ElimCase) {
+ case 0:
+ // categorically disallowed
+ return 0;
+ case 1:
+ // allowed, use first cast's opcode
+ return firstOp;
+ case 2:
+ // allowed, use second cast's opcode
+ return secondOp;
+ case 3:
+ // no-op cast in second op implies firstOp as long as the DstTy
+ // is integer
+ if (DstTy->isInteger())
+ return firstOp;
+ return 0;
+ case 4:
+ // no-op cast in second op implies firstOp as long as the DstTy
+ // is floating point
+ if (DstTy->isFloatingPoint())
+ return firstOp;
+ return 0;
+ case 5:
+ // no-op cast in first op implies secondOp as long as the SrcTy
+ // is an integer
+ if (SrcTy->isInteger())
+ return secondOp;
+ return 0;
+ case 6:
+ // no-op cast in first op implies secondOp as long as the SrcTy
+ // is a floating point
+ if (SrcTy->isFloatingPoint())
+ return secondOp;
+ return 0;
+ case 7: {
+ // ptrtoint, inttoptr -> bitcast (ptr -> ptr) if int size is >= ptr size
+ unsigned PtrSize = IntPtrTy->getPrimitiveSizeInBits();
+ unsigned MidSize = MidTy->getPrimitiveSizeInBits();
+ if (MidSize >= PtrSize)
+ return Instruction::BitCast;
+ return 0;
+ }
+ case 8: {
+ // ext, trunc -> bitcast, if the SrcTy and DstTy are same size
+ // ext, trunc -> ext, if sizeof(SrcTy) < sizeof(DstTy)
+ // ext, trunc -> trunc, if sizeof(SrcTy) > sizeof(DstTy)
+ unsigned SrcSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DstSize = DstTy->getPrimitiveSizeInBits();
+ if (SrcSize == DstSize)
+ return Instruction::BitCast;
+ else if (SrcSize < DstSize)
+ return firstOp;
+ return secondOp;
+ }
+ case 9: // zext, sext -> zext, because sext can't sign extend after zext
+ return Instruction::ZExt;
+ case 10:
+ // fpext followed by fptrunc is allowed if it returns to the same type as
+ // the original, in which case it's just a bitcast
+ if (SrcTy == DstTy)
+ return Instruction::BitCast;
+ return 0; // If the types are not the same we can't eliminate it.
+ case 11:
+ // bitcast followed by ptrtoint is allowed as long as the bitcast
+ // is a pointer to pointer cast.
+ if (isa<PointerType>(SrcTy) && isa<PointerType>(MidTy))
+ return secondOp;
+ return 0;
+ case 12:
+ // inttoptr, bitcast -> intptr if bitcast is a ptr to ptr cast
+ if (isa<PointerType>(MidTy) && isa<PointerType>(DstTy))
+ return firstOp;
+ return 0;
+ case 13: {
+ // inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
+ unsigned PtrSize = IntPtrTy->getPrimitiveSizeInBits();
+ unsigned SrcSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DstSize = DstTy->getPrimitiveSizeInBits();
+ if (SrcSize <= PtrSize && SrcSize == DstSize)
+ return Instruction::BitCast;
+ return 0;
+ }
+ case 99:
+ // cast combination can't happen (error in input). This is for all cases
+ // where the MidTy is not the same for the two cast instructions.
+ assert(!"Invalid Cast Combination");
+ return 0;
+ default:
+ assert(!"Error in CastResults table!!!");
+ return 0;
+ }
+ return 0;
+}
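+
+// For illustration, a worked pair through case 8 above:
+//   %t = zext i16 %x to i32 ; firstOp
+//   %r = trunc i32 %t to i8 ; secondOp
+// folds to "trunc i16 %x to i8" since sizeof(i16) > sizeof(i8); had the
+// final type been i16 again, the pair would fold to a bitcast instead.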
+
+CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
+ const std::string &Name, Instruction *InsertBefore) {
+ // Construct and return the appropriate CastInst subclass
+ switch (op) {
+ case Trunc: return new TruncInst (S, Ty, Name, InsertBefore);
+ case ZExt: return new ZExtInst (S, Ty, Name, InsertBefore);
+ case SExt: return new SExtInst (S, Ty, Name, InsertBefore);
+ case FPTrunc: return new FPTruncInst (S, Ty, Name, InsertBefore);
+ case FPExt: return new FPExtInst (S, Ty, Name, InsertBefore);
+ case UIToFP: return new UIToFPInst (S, Ty, Name, InsertBefore);
+ case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore);
+ case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore);
+ case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore);
+ case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
+ case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
+ case BitCast: return new BitCastInst (S, Ty, Name, InsertBefore);
+ default:
+ assert(!"Invalid opcode provided");
+ }
+ return 0;
+}
+
+CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
+ const std::string &Name, BasicBlock *InsertAtEnd) {
+ // Construct and return the appropriate CastInst subclass
+ switch (op) {
+ case Trunc: return new TruncInst (S, Ty, Name, InsertAtEnd);
+ case ZExt: return new ZExtInst (S, Ty, Name, InsertAtEnd);
+ case SExt: return new SExtInst (S, Ty, Name, InsertAtEnd);
+ case FPTrunc: return new FPTruncInst (S, Ty, Name, InsertAtEnd);
+ case FPExt: return new FPExtInst (S, Ty, Name, InsertAtEnd);
+ case UIToFP: return new UIToFPInst (S, Ty, Name, InsertAtEnd);
+ case SIToFP: return new SIToFPInst (S, Ty, Name, InsertAtEnd);
+ case FPToUI: return new FPToUIInst (S, Ty, Name, InsertAtEnd);
+ case FPToSI: return new FPToSIInst (S, Ty, Name, InsertAtEnd);
+ case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertAtEnd);
+ case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertAtEnd);
+ case BitCast: return new BitCastInst (S, Ty, Name, InsertAtEnd);
+ default:
+ assert(!"Invalid opcode provided");
+ }
+ return 0;
+}
+
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, const Type *Ty,
+ const std::string &Name,
+ Instruction *InsertBefore) {
+ if (S->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+ return Create(Instruction::ZExt, S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, const Type *Ty,
+ const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ if (S->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+ return Create(Instruction::ZExt, S, Ty, Name, InsertAtEnd);
+}
+
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, const Type *Ty,
+ const std::string &Name,
+ Instruction *InsertBefore) {
+ if (S->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+ return Create(Instruction::SExt, S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, const Type *Ty,
+ const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ if (S->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+ return Create(Instruction::SExt, S, Ty, Name, InsertAtEnd);
+}
+
+CastInst *CastInst::CreateTruncOrBitCast(Value *S, const Type *Ty,
+ const std::string &Name,
+ Instruction *InsertBefore) {
+ if (S->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+ return Create(Instruction::Trunc, S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateTruncOrBitCast(Value *S, const Type *Ty,
+ const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ if (S->getType()->getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits())
+ return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+ return Create(Instruction::Trunc, S, Ty, Name, InsertAtEnd);
+}
+
+CastInst *CastInst::CreatePointerCast(Value *S, const Type *Ty,
+ const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ assert(isa<PointerType>(S->getType()) && "Invalid cast");
+ assert((Ty->isInteger() || isa<PointerType>(Ty)) &&
+ "Invalid cast");
+
+ if (Ty->isInteger())
+ return Create(Instruction::PtrToInt, S, Ty, Name, InsertAtEnd);
+ return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+}
+
+/// @brief Create a BitCast or a PtrToInt cast instruction
+CastInst *CastInst::CreatePointerCast(Value *S, const Type *Ty,
+ const std::string &Name,
+ Instruction *InsertBefore) {
+ assert(isa<PointerType>(S->getType()) && "Invalid cast");
+ assert((Ty->isInteger() || isa<PointerType>(Ty)) &&
+ "Invalid cast");
+
+ if (Ty->isInteger())
+ return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore);
+ return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateIntegerCast(Value *C, const Type *Ty,
+ bool isSigned, const std::string &Name,
+ Instruction *InsertBefore) {
+ assert(C->getType()->isInteger() && Ty->isInteger() && "Invalid cast");
+ unsigned SrcBits = C->getType()->getPrimitiveSizeInBits();
+ unsigned DstBits = Ty->getPrimitiveSizeInBits();
+ Instruction::CastOps opcode =
+ (SrcBits == DstBits ? Instruction::BitCast :
+ (SrcBits > DstBits ? Instruction::Trunc :
+ (isSigned ? Instruction::SExt : Instruction::ZExt)));
+ return Create(opcode, C, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateIntegerCast(Value *C, const Type *Ty,
+ bool isSigned, const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ assert(C->getType()->isInteger() && Ty->isInteger() && "Invalid cast");
+ unsigned SrcBits = C->getType()->getPrimitiveSizeInBits();
+ unsigned DstBits = Ty->getPrimitiveSizeInBits();
+ Instruction::CastOps opcode =
+ (SrcBits == DstBits ? Instruction::BitCast :
+ (SrcBits > DstBits ? Instruction::Trunc :
+ (isSigned ? Instruction::SExt : Instruction::ZExt)));
+ return Create(opcode, C, Ty, Name, InsertAtEnd);
+}
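+
+// For illustration, CreateIntegerCast picks the opcode from the widths:
+// i16 -> i32 yields ZExt (or SExt when isSigned), i32 -> i16 yields
+// Trunc, and equal widths degenerate to a BitCast.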
+
+CastInst *CastInst::CreateFPCast(Value *C, const Type *Ty,
+ const std::string &Name,
+ Instruction *InsertBefore) {
+ assert(C->getType()->isFloatingPoint() && Ty->isFloatingPoint() &&
+ "Invalid cast");
+ unsigned SrcBits = C->getType()->getPrimitiveSizeInBits();
+ unsigned DstBits = Ty->getPrimitiveSizeInBits();
+ Instruction::CastOps opcode =
+ (SrcBits == DstBits ? Instruction::BitCast :
+ (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt));
+ return Create(opcode, C, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateFPCast(Value *C, const Type *Ty,
+ const std::string &Name,
+ BasicBlock *InsertAtEnd) {
+ assert(C->getType()->isFloatingPoint() && Ty->isFloatingPoint() &&
+ "Invalid cast");
+ unsigned SrcBits = C->getType()->getPrimitiveSizeInBits();
+ unsigned DstBits = Ty->getPrimitiveSizeInBits();
+ Instruction::CastOps opcode =
+ (SrcBits == DstBits ? Instruction::BitCast :
+ (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt));
+ return Create(opcode, C, Ty, Name, InsertAtEnd);
+}
+
+// Check whether it is valid to call getCastOpcode for these types.
+// This routine must be kept in sync with getCastOpcode.
+bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) {
+ if (!SrcTy->isFirstClassType() || !DestTy->isFirstClassType())
+ return false;
+
+ if (SrcTy == DestTy)
+ return true;
+
+ // Get the bit sizes, we'll need these
+ unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); // 0 for ptr/vector
+ unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr/vector
+
+ // Run through the possibilities ...
+ if (DestTy->isInteger()) { // Casting to integral
+ if (SrcTy->isInteger()) { // Casting from integral
+ return true;
+ } else if (SrcTy->isFloatingPoint()) { // Casting from floating pt
+ return true;
+ } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
+ // Casting from vector
+ return DestBits == PTy->getBitWidth();
+ } else { // Casting from something else
+ return isa<PointerType>(SrcTy);
+ }
+ } else if (DestTy->isFloatingPoint()) { // Casting to floating pt
+ if (SrcTy->isInteger()) { // Casting from integral
+ return true;
+ } else if (SrcTy->isFloatingPoint()) { // Casting from floating pt
+ return true;
+ } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
+ // Casting from vector
+ return DestBits == PTy->getBitWidth();
+ } else { // Casting from something else
+ return false;
+ }
+ } else if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
+ // Casting to vector
+ if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
+ // Casting from vector
+ return DestPTy->getBitWidth() == SrcPTy->getBitWidth();
+ } else { // Casting from something else
+ return DestPTy->getBitWidth() == SrcBits;
+ }
+ } else if (isa<PointerType>(DestTy)) { // Casting to pointer
+ if (isa<PointerType>(SrcTy)) { // Casting from pointer
+ return true;
+ } else if (SrcTy->isInteger()) { // Casting from integral
+ return true;
+ } else { // Casting from something else
+ return false;
+ }
+ } else { // Casting to something else
+ return false;
+ }
+}
+
+// Provide a way to get a "cast" where the cast opcode is inferred from the
+// types and size of the operand. This basically parallels the logic in
+// the castIsValid function below. The following invariant should hold:
+//   castIsValid(getCastOpcode(Val, Ty), Val, Ty)
+// never asserts. In other words, this produces a "correct" casting opcode
+// for the arguments passed to it.
+// This routine must be kept in sync with isCastable.
+Instruction::CastOps
+CastInst::getCastOpcode(
+ const Value *Src, bool SrcIsSigned, const Type *DestTy, bool DestIsSigned) {
+ // Get the bit sizes, we'll need these
+ const Type *SrcTy = Src->getType();
+ unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); // 0 for ptr/vector
+ unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr/vector
+
+ assert(SrcTy->isFirstClassType() && DestTy->isFirstClassType() &&
+ "Only first class types are castable!");
+
+ // Run through the possibilities ...
+ if (DestTy->isInteger()) { // Casting to integral
+ if (SrcTy->isInteger()) { // Casting from integral
+ if (DestBits < SrcBits)
+ return Trunc; // int -> smaller int
+ else if (DestBits > SrcBits) { // it's an extension
+ if (SrcIsSigned)
+ return SExt; // signed -> SEXT
+ else
+ return ZExt; // unsigned -> ZEXT
+ } else {
+ return BitCast; // Same size, No-op cast
+ }
+ } else if (SrcTy->isFloatingPoint()) { // Casting from floating pt
+ if (DestIsSigned)
+ return FPToSI; // FP -> sint
+ else
+ return FPToUI; // FP -> uint
+ } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
+ assert(DestBits == PTy->getBitWidth() &&
+ "Casting vector to integer of different width");
+ PTy = NULL;
+ return BitCast; // Same size, no-op cast
+ } else {
+ assert(isa<PointerType>(SrcTy) &&
+ "Casting from a value that is not first-class type");
+ return PtrToInt; // ptr -> int
+ }
+ } else if (DestTy->isFloatingPoint()) { // Casting to floating pt
+ if (SrcTy->isInteger()) { // Casting from integral
+ if (SrcIsSigned)
+ return SIToFP; // sint -> FP
+ else
+ return UIToFP; // uint -> FP
+ } else if (SrcTy->isFloatingPoint()) { // Casting from floating pt
+ if (DestBits < SrcBits) {
+ return FPTrunc; // FP -> smaller FP
+ } else if (DestBits > SrcBits) {
+ return FPExt; // FP -> larger FP
+ } else {
+ return BitCast; // same size, no-op cast
+ }
+ } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
+ assert(DestBits == PTy->getBitWidth() &&
+ "Casting vector to floating point of different width");
+ PTy = NULL;
+ return BitCast; // same size, no-op cast
+ } else {
+ assert(0 && "Casting pointer or non-first class to float");
+ }
+ } else if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
+ if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
+ assert(DestPTy->getBitWidth() == SrcPTy->getBitWidth() &&
+ "Casting vector to vector of different widths");
+ SrcPTy = NULL;
+ return BitCast; // vector -> vector
+ } else if (DestPTy->getBitWidth() == SrcBits) {
+ return BitCast; // float/int -> vector
+ } else {
+ assert(!"Illegal cast to vector (wrong type or size)");
+ }
+ } else if (isa<PointerType>(DestTy)) {
+ if (isa<PointerType>(SrcTy)) {
+ return BitCast; // ptr -> ptr
+ } else if (SrcTy->isInteger()) {
+ return IntToPtr; // int -> ptr
+ } else {
+ assert(!"Casting pointer to other than pointer or int");
+ }
+ } else {
+ assert(!"Casting to type that is not first-class");
+ }
+
+ // If we fall through to here we probably hit an assertion for a bad cast above
+ // and assertions are not turned on. Anything we return is an error, so
+ // BitCast is as good a choice as any.
+ return BitCast;
+}
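+
+// A hypothetical call (names chosen for illustration):
+//   CastInst::getCastOpcode(V, /*SrcIsSigned=*/true,
+//                           Type::Int64Ty, /*DestIsSigned=*/true)
+// yields SExt when V has type i32, Trunc when V is wider than 64 bits,
+// and BitCast when V is already 64 bits wide.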
+
+//===----------------------------------------------------------------------===//
+// CastInst SubClass Constructors
+//===----------------------------------------------------------------------===//
+
+/// Check that the construction parameters for a CastInst are correct. This
+/// could be broken out into the separate constructors but it is useful to have
+/// it in one place and to eliminate the redundant code for getting the sizes
+/// of the types involved.
+bool
+CastInst::castIsValid(Instruction::CastOps op, Value *S, const Type *DstTy) {
+
+ // Check for type sanity on the arguments
+ const Type *SrcTy = S->getType();
+ if (!SrcTy->isFirstClassType() || !DstTy->isFirstClassType())
+ return false;
+
+ // Get the size of the types in bits, we'll need this later
+ unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DstBitSize = DstTy->getPrimitiveSizeInBits();
+
+ // Switch on the opcode provided
+ switch (op) {
+ default: return false; // This is an input error
+ case Instruction::Trunc:
+ return SrcTy->isIntOrIntVector() &&
+ DstTy->isIntOrIntVector() && SrcBitSize > DstBitSize;
+ case Instruction::ZExt:
+ return SrcTy->isIntOrIntVector() &&
+ DstTy->isIntOrIntVector() && SrcBitSize < DstBitSize;
+ case Instruction::SExt:
+ return SrcTy->isIntOrIntVector() &&
+ DstTy->isIntOrIntVector() && SrcBitSize < DstBitSize;
+ case Instruction::FPTrunc:
+ return SrcTy->isFPOrFPVector() &&
+ DstTy->isFPOrFPVector() &&
+ SrcBitSize > DstBitSize;
+ case Instruction::FPExt:
+ return SrcTy->isFPOrFPVector() &&
+ DstTy->isFPOrFPVector() &&
+ SrcBitSize < DstBitSize;
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ if (const VectorType *SVTy = dyn_cast<VectorType>(SrcTy)) {
+ if (const VectorType *DVTy = dyn_cast<VectorType>(DstTy)) {
+ return SVTy->getElementType()->isIntOrIntVector() &&
+ DVTy->getElementType()->isFPOrFPVector() &&
+ SVTy->getNumElements() == DVTy->getNumElements();
+ }
+ }
+ return SrcTy->isIntOrIntVector() && DstTy->isFPOrFPVector();
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ if (const VectorType *SVTy = dyn_cast<VectorType>(SrcTy)) {
+ if (const VectorType *DVTy = dyn_cast<VectorType>(DstTy)) {
+ return SVTy->getElementType()->isFPOrFPVector() &&
+ DVTy->getElementType()->isIntOrIntVector() &&
+ SVTy->getNumElements() == DVTy->getNumElements();
+ }
+ }
+ return SrcTy->isFPOrFPVector() && DstTy->isIntOrIntVector();
+ case Instruction::PtrToInt:
+ return isa<PointerType>(SrcTy) && DstTy->isInteger();
+ case Instruction::IntToPtr:
+ return SrcTy->isInteger() && isa<PointerType>(DstTy);
+ case Instruction::BitCast:
+ // BitCast implies a no-op cast of type only. No bits change.
+ // However, you can't cast pointers to anything but pointers.
+ if (isa<PointerType>(SrcTy) != isa<PointerType>(DstTy))
+ return false;
+
+ // Now we know we're not dealing with a pointer/non-pointer mismatch. In all
+ // these cases, the cast is okay if the source and destination bit widths
+ // are identical.
+ return SrcBitSize == DstBitSize;
+ }
+}
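+
+// For illustration, castIsValid(Instruction::Trunc, <i64 value>, Int32Ty)
+// holds (both integers, 64 > 32), whereas a Trunc from i32 to i64 or a
+// BitCast between i32* and i64 is rejected (the latter mixes pointer and
+// non-pointer).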
+
+TruncInst::TruncInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, Trunc, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc");
+}
+
+TruncInst::TruncInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, Trunc, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc");
+}
+
+ZExtInst::ZExtInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, ZExt, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
+}
+
+ZExtInst::ZExtInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, ZExt, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
+}
+SExtInst::SExtInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, SExt, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
+}
+
+SExtInst::SExtInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, SExt, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
+}
+
+FPTruncInst::FPTruncInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, FPTrunc, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
+}
+
+FPTruncInst::FPTruncInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, FPTrunc, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
+}
+
+FPExtInst::FPExtInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, FPExt, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
+}
+
+FPExtInst::FPExtInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, FPExt, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
+}
+
+UIToFPInst::UIToFPInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, UIToFP, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
+}
+
+UIToFPInst::UIToFPInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, UIToFP, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
+}
+
+SIToFPInst::SIToFPInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, SIToFP, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
+}
+
+SIToFPInst::SIToFPInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, SIToFP, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
+}
+
+FPToUIInst::FPToUIInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, FPToUI, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
+}
+
+FPToUIInst::FPToUIInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, FPToUI, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
+}
+
+FPToSIInst::FPToSIInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, FPToSI, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
+}
+
+FPToSIInst::FPToSIInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, FPToSI, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
+}
+
+PtrToIntInst::PtrToIntInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, PtrToInt, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
+}
+
+PtrToIntInst::PtrToIntInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, PtrToInt, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
+}
+
+IntToPtrInst::IntToPtrInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
+}
+
+IntToPtrInst::IntToPtrInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, IntToPtr, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
+}
+
+BitCastInst::BitCastInst(
+ Value *S, const Type *Ty, const std::string &Name, Instruction *InsertBefore
+) : CastInst(Ty, BitCast, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
+}
+
+BitCastInst::BitCastInst(
+ Value *S, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, BitCast, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
+}
+
+//===----------------------------------------------------------------------===//
+// CmpInst Classes
+//===----------------------------------------------------------------------===//
+
+CmpInst::CmpInst(const Type *ty, OtherOps op, unsigned short predicate,
+ Value *LHS, Value *RHS, const std::string &Name,
+ Instruction *InsertBefore)
+ : Instruction(ty, op,
+ OperandTraits<CmpInst>::op_begin(this),
+ OperandTraits<CmpInst>::operands(this),
+ InsertBefore) {
+ Op<0>() = LHS;
+ Op<1>() = RHS;
+ SubclassData = predicate;
+ setName(Name);
+}
+
+CmpInst::CmpInst(const Type *ty, OtherOps op, unsigned short predicate,
+ Value *LHS, Value *RHS, const std::string &Name,
+ BasicBlock *InsertAtEnd)
+ : Instruction(ty, op,
+ OperandTraits<CmpInst>::op_begin(this),
+ OperandTraits<CmpInst>::operands(this),
+ InsertAtEnd) {
+ Op<0>() = LHS;
+ Op<1>() = RHS;
+ SubclassData = predicate;
+ setName(Name);
+}
+
+CmpInst *
+CmpInst::Create(OtherOps Op, unsigned short predicate, Value *S1, Value *S2,
+ const std::string &Name, Instruction *InsertBefore) {
+ if (Op == Instruction::ICmp) {
+ return new ICmpInst(CmpInst::Predicate(predicate), S1, S2, Name,
+ InsertBefore);
+ }
+ if (Op == Instruction::FCmp) {
+ return new FCmpInst(CmpInst::Predicate(predicate), S1, S2, Name,
+ InsertBefore);
+ }
+ if (Op == Instruction::VICmp) {
+ return new VICmpInst(CmpInst::Predicate(predicate), S1, S2, Name,
+ InsertBefore);
+ }
+ return new VFCmpInst(CmpInst::Predicate(predicate), S1, S2, Name,
+ InsertBefore);
+}
+
+CmpInst *
+CmpInst::Create(OtherOps Op, unsigned short predicate, Value *S1, Value *S2,
+ const std::string &Name, BasicBlock *InsertAtEnd) {
+ if (Op == Instruction::ICmp) {
+ return new ICmpInst(CmpInst::Predicate(predicate), S1, S2, Name,
+ InsertAtEnd);
+ }
+ if (Op == Instruction::FCmp) {
+ return new FCmpInst(CmpInst::Predicate(predicate), S1, S2, Name,
+ InsertAtEnd);
+ }
+ if (Op == Instruction::VICmp) {
+ return new VICmpInst(CmpInst::Predicate(predicate), S1, S2, Name,
+ InsertAtEnd);
+ }
+ return new VFCmpInst(CmpInst::Predicate(predicate), S1, S2, Name,
+ InsertAtEnd);
+}
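+
+// Usage sketch: Create dispatches on the opcode, so one call site can build
+// any comparison kind.  `LHS', `RHS' (of matching type) and `InsertBefore'
+// are assumed to be in scope:
+//
+//   CmpInst *EQ = CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ,
+//                                 LHS, RHS, "eq", InsertBefore);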
+
+void CmpInst::swapOperands() {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(this))
+ IC->swapOperands();
+ else
+ cast<FCmpInst>(this)->swapOperands();
+}
+
+bool CmpInst::isCommutative() {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(this))
+ return IC->isCommutative();
+ return cast<FCmpInst>(this)->isCommutative();
+}
+
+bool CmpInst::isEquality() {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(this))
+ return IC->isEquality();
+ return cast<FCmpInst>(this)->isEquality();
+}
+
+
+CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) {
+ switch (pred) {
+ default: assert(!"Unknown cmp predicate!");
+ case ICMP_EQ: return ICMP_NE;
+ case ICMP_NE: return ICMP_EQ;
+ case ICMP_UGT: return ICMP_ULE;
+ case ICMP_ULT: return ICMP_UGE;
+ case ICMP_UGE: return ICMP_ULT;
+ case ICMP_ULE: return ICMP_UGT;
+ case ICMP_SGT: return ICMP_SLE;
+ case ICMP_SLT: return ICMP_SGE;
+ case ICMP_SGE: return ICMP_SLT;
+ case ICMP_SLE: return ICMP_SGT;
+
+ case FCMP_OEQ: return FCMP_UNE;
+ case FCMP_ONE: return FCMP_UEQ;
+ case FCMP_OGT: return FCMP_ULE;
+ case FCMP_OLT: return FCMP_UGE;
+ case FCMP_OGE: return FCMP_ULT;
+ case FCMP_OLE: return FCMP_UGT;
+ case FCMP_UEQ: return FCMP_ONE;
+ case FCMP_UNE: return FCMP_OEQ;
+ case FCMP_UGT: return FCMP_OLE;
+ case FCMP_ULT: return FCMP_OGE;
+ case FCMP_UGE: return FCMP_OLT;
+ case FCMP_ULE: return FCMP_OGT;
+ case FCMP_ORD: return FCMP_UNO;
+ case FCMP_UNO: return FCMP_ORD;
+ case FCMP_TRUE: return FCMP_FALSE;
+ case FCMP_FALSE: return FCMP_TRUE;
+ }
+}
+
+ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) {
+ switch (pred) {
+ default: assert(! "Unknown icmp predicate!");
+ case ICMP_EQ: case ICMP_NE:
+ case ICMP_SGT: case ICMP_SLT: case ICMP_SGE: case ICMP_SLE:
+ return pred;
+ case ICMP_UGT: return ICMP_SGT;
+ case ICMP_ULT: return ICMP_SLT;
+ case ICMP_UGE: return ICMP_SGE;
+ case ICMP_ULE: return ICMP_SLE;
+ }
+}
+
+ICmpInst::Predicate ICmpInst::getUnsignedPredicate(Predicate pred) {
+ switch (pred) {
+ default: assert(! "Unknown icmp predicate!");
+ case ICMP_EQ: case ICMP_NE:
+ case ICMP_UGT: case ICMP_ULT: case ICMP_UGE: case ICMP_ULE:
+ return pred;
+ case ICMP_SGT: return ICMP_UGT;
+ case ICMP_SLT: return ICMP_ULT;
+ case ICMP_SGE: return ICMP_UGE;
+ case ICMP_SLE: return ICMP_ULE;
+ }
+}
+
+bool ICmpInst::isSignedPredicate(Predicate pred) {
+ switch (pred) {
+ default: assert(! "Unknown icmp predicate!");
+ case ICMP_SGT: case ICMP_SLT: case ICMP_SGE: case ICMP_SLE:
+ return true;
+ case ICMP_EQ: case ICMP_NE: case ICMP_UGT: case ICMP_ULT:
+ case ICMP_UGE: case ICMP_ULE:
+ return false;
+ }
+}
+
+/// Initialize a ConstantRange containing exactly the values X for which
+/// (X pred C) is true.
+///
+ConstantRange
+ICmpInst::makeConstantRange(Predicate pred, const APInt &C) {
+ APInt Lower(C);
+ APInt Upper(C);
+ uint32_t BitWidth = C.getBitWidth();
+ switch (pred) {
+ default: assert(0 && "Invalid ICmp opcode to ConstantRange ctor!");
+ case ICmpInst::ICMP_EQ: Upper++; break;
+ case ICmpInst::ICMP_NE: Lower++; break;
+ case ICmpInst::ICMP_ULT: Lower = APInt::getMinValue(BitWidth); break;
+ case ICmpInst::ICMP_SLT: Lower = APInt::getSignedMinValue(BitWidth); break;
+ case ICmpInst::ICMP_UGT:
+ Lower++; Upper = APInt::getMinValue(BitWidth); // Min = Next(Max)
+ break;
+ case ICmpInst::ICMP_SGT:
+ Lower++; Upper = APInt::getSignedMinValue(BitWidth); // Min = Next(Max)
+ break;
+ case ICmpInst::ICMP_ULE:
+ Lower = APInt::getMinValue(BitWidth); Upper++;
+ break;
+ case ICmpInst::ICMP_SLE:
+ Lower = APInt::getSignedMinValue(BitWidth); Upper++;
+ break;
+ case ICmpInst::ICMP_UGE:
+ Upper = APInt::getMinValue(BitWidth); // Min = Next(Max)
+ break;
+ case ICmpInst::ICMP_SGE:
+ Upper = APInt::getSignedMinValue(BitWidth); // Min = Next(Max)
+ break;
+ }
+ return ConstantRange(Lower, Upper);
+}
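+
+// Worked example: for ICMP_ULT with C = 100 (i8), Lower becomes 0 and Upper
+// stays 100, so the result is the half-open range [0, 100):
+//
+//   ConstantRange R = ICmpInst::makeConstantRange(ICmpInst::ICMP_ULT,
+//                                                 APInt(8, 100));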
+
+CmpInst::Predicate CmpInst::getSwappedPredicate(Predicate pred) {
+ switch (pred) {
+ default: assert(!"Unknown cmp predicate!");
+ case ICMP_EQ: case ICMP_NE:
+ return pred;
+ case ICMP_SGT: return ICMP_SLT;
+ case ICMP_SLT: return ICMP_SGT;
+ case ICMP_SGE: return ICMP_SLE;
+ case ICMP_SLE: return ICMP_SGE;
+ case ICMP_UGT: return ICMP_ULT;
+ case ICMP_ULT: return ICMP_UGT;
+ case ICMP_UGE: return ICMP_ULE;
+ case ICMP_ULE: return ICMP_UGE;
+
+ case FCMP_FALSE: case FCMP_TRUE:
+ case FCMP_OEQ: case FCMP_ONE:
+ case FCMP_UEQ: case FCMP_UNE:
+ case FCMP_ORD: case FCMP_UNO:
+ return pred;
+ case FCMP_OGT: return FCMP_OLT;
+ case FCMP_OLT: return FCMP_OGT;
+ case FCMP_OGE: return FCMP_OLE;
+ case FCMP_OLE: return FCMP_OGE;
+ case FCMP_UGT: return FCMP_ULT;
+ case FCMP_ULT: return FCMP_UGT;
+ case FCMP_UGE: return FCMP_ULE;
+ case FCMP_ULE: return FCMP_UGE;
+ }
+}
+
+bool CmpInst::isUnsigned(unsigned short predicate) {
+ switch (predicate) {
+ default: return false;
+ case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_UGE: return true;
+ }
+}
+
+bool CmpInst::isSigned(unsigned short predicate){
+ switch (predicate) {
+ default: return false;
+ case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_SGE: return true;
+ }
+}
+
+bool CmpInst::isOrdered(unsigned short predicate) {
+ switch (predicate) {
+ default: return false;
+ case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_OGT:
+ case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_OLE:
+ case FCmpInst::FCMP_ORD: return true;
+ }
+}
+
+bool CmpInst::isUnordered(unsigned short predicate) {
+ switch (predicate) {
+ default: return false;
+ case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_UNE: case FCmpInst::FCMP_UGT:
+ case FCmpInst::FCMP_ULT: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_ULE:
+ case FCmpInst::FCMP_UNO: return true;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SwitchInst Implementation
+//===----------------------------------------------------------------------===//
+
+void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumCases) {
+ assert(Value && Default);
+ ReservedSpace = 2+NumCases*2;
+ NumOperands = 2;
+ OperandList = allocHungoffUses(ReservedSpace);
+
+ OperandList[0] = Value;
+ OperandList[1] = Default;
+}
+
+/// SwitchInst ctor - Create a new switch instruction, specifying a value to
+/// switch on and a default destination. The number of additional cases can
+/// be specified here to make memory allocation more efficient. This
+/// constructor can also autoinsert before another instruction.
+SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
+ Instruction *InsertBefore)
+ : TerminatorInst(Type::VoidTy, Instruction::Switch, 0, 0, InsertBefore) {
+ init(Value, Default, NumCases);
+}
+
+/// SwitchInst ctor - Create a new switch instruction, specifying a value to
+/// switch on and a default destination. The number of additional cases can
+/// be specified here to make memory allocation more efficient. This
+/// constructor also autoinserts at the end of the specified BasicBlock.
+SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
+ BasicBlock *InsertAtEnd)
+ : TerminatorInst(Type::VoidTy, Instruction::Switch, 0, 0, InsertAtEnd) {
+ init(Value, Default, NumCases);
+}
+
+SwitchInst::SwitchInst(const SwitchInst &SI)
+ : TerminatorInst(Type::VoidTy, Instruction::Switch,
+ allocHungoffUses(SI.getNumOperands()), SI.getNumOperands()) {
+ Use *OL = OperandList, *InOL = SI.OperandList;
+ for (unsigned i = 0, E = SI.getNumOperands(); i != E; i+=2) {
+ OL[i] = InOL[i];
+ OL[i+1] = InOL[i+1];
+ }
+}
+
+SwitchInst::~SwitchInst() {
+ dropHungoffUses(OperandList);
+}
+
+
+/// addCase - Add an entry to the switch instruction...
+///
+void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
+ unsigned OpNo = NumOperands;
+ if (OpNo+2 > ReservedSpace)
+ resizeOperands(0); // Get more space!
+ // Initialize some new operands.
+ assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
+ NumOperands = OpNo+2;
+ OperandList[OpNo] = OnVal;
+ OperandList[OpNo+1] = Dest;
+}
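+
+// Usage sketch: building a two-case switch via the Create factory.  `Cond'
+// (an i32 Value) and the BasicBlocks `BB', `DefaultBB', `ZeroBB' and `OneBB'
+// are assumed to be in scope:
+//
+//   SwitchInst *SI = SwitchInst::Create(Cond, DefaultBB, 2, BB);
+//   SI->addCase(ConstantInt::get(Type::Int32Ty, 0), ZeroBB);
+//   SI->addCase(ConstantInt::get(Type::Int32Ty, 1), OneBB);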
+
+/// removeCase - This method removes the specified successor from the switch
+/// instruction. Note that this cannot be used to remove the default
+/// destination (successor #0).
+///
+void SwitchInst::removeCase(unsigned idx) {
+ assert(idx != 0 && "Cannot remove the default case!");
+ assert(idx*2 < getNumOperands() && "Successor index out of range!!!");
+
+ unsigned NumOps = getNumOperands();
+ Use *OL = OperandList;
+
+ // Move everything after this operand down.
+ //
+ // FIXME: we could just swap with the end of the list, then erase. However,
+  // clients might not expect this to happen. The code as it is thrashes the
+ // use/def lists, which is kinda lame.
+ for (unsigned i = (idx+1)*2; i != NumOps; i += 2) {
+ OL[i-2] = OL[i];
+ OL[i-2+1] = OL[i+1];
+ }
+
+ // Nuke the last value.
+ OL[NumOps-2].set(0);
+ OL[NumOps-2+1].set(0);
+ NumOperands = NumOps-2;
+}
+
+/// resizeOperands - This adjusts the length of the operands list according to
+/// the following behavior:
+///   1. If NumOps == 0, grow the operand list in response to a push_back style
+///      of operation.  This triples the number of operands.
+/// 2. If NumOps > NumOperands, reserve space for NumOps operands.
+/// 3. If NumOps == NumOperands, trim the reserved space.
+///
+void SwitchInst::resizeOperands(unsigned NumOps) {
+ unsigned e = getNumOperands();
+ if (NumOps == 0) {
+ NumOps = e*3;
+ } else if (NumOps*2 > NumOperands) {
+ // No resize needed.
+ if (ReservedSpace >= NumOps) return;
+ } else if (NumOps == NumOperands) {
+ if (ReservedSpace == NumOps) return;
+ } else {
+ return;
+ }
+
+ ReservedSpace = NumOps;
+ Use *NewOps = allocHungoffUses(NumOps);
+ Use *OldOps = OperandList;
+ for (unsigned i = 0; i != e; ++i) {
+ NewOps[i] = OldOps[i];
+ }
+ OperandList = NewOps;
+ if (OldOps) Use::zap(OldOps, OldOps + e, true);
+}
+
+
+BasicBlock *SwitchInst::getSuccessorV(unsigned idx) const {
+ return getSuccessor(idx);
+}
+unsigned SwitchInst::getNumSuccessorsV() const {
+ return getNumSuccessors();
+}
+void SwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
+ setSuccessor(idx, B);
+}
+
+// Define these methods here so vtables don't get emitted into every translation
+// unit that uses these classes.
+
+GetElementPtrInst *GetElementPtrInst::clone() const {
+ return new(getNumOperands()) GetElementPtrInst(*this);
+}
+
+BinaryOperator *BinaryOperator::clone() const {
+ return Create(getOpcode(), Op<0>(), Op<1>());
+}
+
+FCmpInst* FCmpInst::clone() const {
+ return new FCmpInst(getPredicate(), Op<0>(), Op<1>());
+}
+ICmpInst* ICmpInst::clone() const {
+ return new ICmpInst(getPredicate(), Op<0>(), Op<1>());
+}
+
+VFCmpInst* VFCmpInst::clone() const {
+ return new VFCmpInst(getPredicate(), Op<0>(), Op<1>());
+}
+VICmpInst* VICmpInst::clone() const {
+ return new VICmpInst(getPredicate(), Op<0>(), Op<1>());
+}
+
+ExtractValueInst *ExtractValueInst::clone() const {
+ return new ExtractValueInst(*this);
+}
+InsertValueInst *InsertValueInst::clone() const {
+ return new InsertValueInst(*this);
+}
+
+
+MallocInst *MallocInst::clone() const { return new MallocInst(*this); }
+AllocaInst *AllocaInst::clone() const { return new AllocaInst(*this); }
+FreeInst *FreeInst::clone() const { return new FreeInst(getOperand(0)); }
+LoadInst *LoadInst::clone() const { return new LoadInst(*this); }
+StoreInst *StoreInst::clone() const { return new StoreInst(*this); }
+CastInst *TruncInst::clone() const { return new TruncInst(*this); }
+CastInst *ZExtInst::clone() const { return new ZExtInst(*this); }
+CastInst *SExtInst::clone() const { return new SExtInst(*this); }
+CastInst *FPTruncInst::clone() const { return new FPTruncInst(*this); }
+CastInst *FPExtInst::clone() const { return new FPExtInst(*this); }
+CastInst *UIToFPInst::clone() const { return new UIToFPInst(*this); }
+CastInst *SIToFPInst::clone() const { return new SIToFPInst(*this); }
+CastInst *FPToUIInst::clone() const { return new FPToUIInst(*this); }
+CastInst *FPToSIInst::clone() const { return new FPToSIInst(*this); }
+CastInst *PtrToIntInst::clone() const { return new PtrToIntInst(*this); }
+CastInst *IntToPtrInst::clone() const { return new IntToPtrInst(*this); }
+CastInst *BitCastInst::clone() const { return new BitCastInst(*this); }
+CallInst *CallInst::clone() const {
+ return new(getNumOperands()) CallInst(*this);
+}
+SelectInst *SelectInst::clone() const {
+ return new(getNumOperands()) SelectInst(*this);
+}
+VAArgInst *VAArgInst::clone() const { return new VAArgInst(*this); }
+
+ExtractElementInst *ExtractElementInst::clone() const {
+ return new ExtractElementInst(*this);
+}
+InsertElementInst *InsertElementInst::clone() const {
+ return InsertElementInst::Create(*this);
+}
+ShuffleVectorInst *ShuffleVectorInst::clone() const {
+ return new ShuffleVectorInst(*this);
+}
+PHINode *PHINode::clone() const { return new PHINode(*this); }
+ReturnInst *ReturnInst::clone() const {
+ return new(getNumOperands()) ReturnInst(*this);
+}
+BranchInst *BranchInst::clone() const {
+ unsigned Ops(getNumOperands());
+ return new(Ops, Ops == 1) BranchInst(*this);
+}
+SwitchInst *SwitchInst::clone() const { return new SwitchInst(*this); }
+InvokeInst *InvokeInst::clone() const {
+ return new(getNumOperands()) InvokeInst(*this);
+}
+UnwindInst *UnwindInst::clone() const { return new UnwindInst(); }
+UnreachableInst *UnreachableInst::clone() const { return new UnreachableInst();}
diff --git a/lib/VMCore/IntrinsicInst.cpp b/lib/VMCore/IntrinsicInst.cpp
new file mode 100644
index 0000000..8bdc968
--- /dev/null
+++ b/lib/VMCore/IntrinsicInst.cpp
@@ -0,0 +1,77 @@
+//===-- IntrinsicInst.cpp - Intrinsic Instruction Wrappers ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements methods that make it really easy to deal with intrinsic
+// functions with the isa/dyn_cast family of functions. In particular, this
+// allows you to do things like:
+//
+// if (DbgStopPointInst *SPI = dyn_cast<DbgStopPointInst>(Inst))
+// ... SPI->getFileName() ... SPI->getDirectory() ...
+//
+// All intrinsic function calls are instances of the call instruction, so these
+// are all subclasses of the CallInst class. Note that none of these classes
+// has state or virtual methods, which is an important part of this gross/neat
+// hack working.
+//
+// In some cases, arguments to intrinsics need to be generic and are defined as
+// type pointer to empty struct { }*. To access the real item of interest the
+// cast instruction needs to be stripped away.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+/// DbgInfoIntrinsic - This is the common base class for debug info intrinsics
+///
+
+static Value *CastOperand(Value *C) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+ if (CE->isCast())
+ return CE->getOperand(0);
+ return NULL;
+}
+
+Value *DbgInfoIntrinsic::StripCast(Value *C) {
+ if (Value *CO = CastOperand(C)) {
+ C = StripCast(CO);
+ } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->hasInitializer())
+ if (Value *CO = CastOperand(GV->getInitializer()))
+ C = StripCast(CO);
+ }
+ return dyn_cast<GlobalVariable>(C);
+}
+
+//===----------------------------------------------------------------------===//
+/// DbgStopPointInst - This represents the llvm.dbg.stoppoint instruction.
+///
+
+Value *DbgStopPointInst::getFileName() const {
+ // Once the operand indices are verified, update this assert
+ assert(LLVMDebugVersion == (7 << 16) && "Verify operand indices");
+ GlobalVariable *GV = cast<GlobalVariable>(getContext());
+ if (!GV->hasInitializer()) return NULL;
+ ConstantStruct *CS = cast<ConstantStruct>(GV->getInitializer());
+ return CS->getOperand(3);
+}
+
+Value *DbgStopPointInst::getDirectory() const {
+ // Once the operand indices are verified, update this assert
+ assert(LLVMDebugVersion == (7 << 16) && "Verify operand indices");
+ GlobalVariable *GV = cast<GlobalVariable>(getContext());
+ if (!GV->hasInitializer()) return NULL;
+ ConstantStruct *CS = cast<ConstantStruct>(GV->getInitializer());
+ return CS->getOperand(4);
+}
diff --git a/lib/VMCore/LeakDetector.cpp b/lib/VMCore/LeakDetector.cpp
new file mode 100644
index 0000000..1bf9171
--- /dev/null
+++ b/lib/VMCore/LeakDetector.cpp
@@ -0,0 +1,131 @@
+//===-- LeakDetector.cpp - Implement LeakDetector interface ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LeakDetector class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Value.h"
+using namespace llvm;
+
+namespace {
+ template <class T>
+ struct VISIBILITY_HIDDEN PrinterTrait {
+ static void print(const T* P) { cerr << P; }
+ };
+
+ template<>
+ struct VISIBILITY_HIDDEN PrinterTrait<Value> {
+ static void print(const Value* P) { cerr << *P; }
+ };
+
+ template <typename T>
+ struct VISIBILITY_HIDDEN LeakDetectorImpl {
+ explicit LeakDetectorImpl(const char* const name) : Cache(0), Name(name) { }
+
+ // Because the most common usage pattern, by far, is to add a
+ // garbage object, then remove it immediately, we optimize this
+    // case.  When an object is added, it is not added to the set
+    // immediately; it is stashed in the Cache member.  If it is
+    // immediately removed, no set search need be performed.
+ void addGarbage(const T* o) {
+ if (Cache) {
+ assert(Ts.count(Cache) == 0 && "Object already in set!");
+ Ts.insert(Cache);
+ }
+ Cache = o;
+ }
+
+ void removeGarbage(const T* o) {
+ if (o == Cache)
+ Cache = 0; // Cache hit
+ else
+ Ts.erase(o);
+ }
+
+ bool hasGarbage(const std::string& Message) {
+ addGarbage(0); // Flush the Cache
+
+ assert(Cache == 0 && "No value should be cached anymore!");
+
+ if (!Ts.empty()) {
+ cerr << "Leaked " << Name << " objects found: " << Message << ":\n";
+ for (typename SmallPtrSet<const T*, 8>::iterator I = Ts.begin(),
+ E = Ts.end(); I != E; ++I) {
+ cerr << "\t";
+ PrinterTrait<T>::print(*I);
+ cerr << "\n";
+ }
+ cerr << '\n';
+
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ SmallPtrSet<const T*, 8> Ts;
+ const T* Cache;
+ const char* const Name;
+ };
+
+ static LeakDetectorImpl<void> *Objects;
+ static LeakDetectorImpl<Value> *LLVMObjects;
+
+ static LeakDetectorImpl<void> &getObjects() {
+ if (Objects == 0)
+ Objects = new LeakDetectorImpl<void>("GENERIC");
+ return *Objects;
+ }
+
+ static LeakDetectorImpl<Value> &getLLVMObjects() {
+ if (LLVMObjects == 0)
+ LLVMObjects = new LeakDetectorImpl<Value>("LLVM");
+ return *LLVMObjects;
+ }
+
+ static void clearGarbage() {
+ delete Objects;
+ delete LLVMObjects;
+ Objects = 0;
+ LLVMObjects = 0;
+ }
+}
+
+void LeakDetector::addGarbageObjectImpl(void *Object) {
+ getObjects().addGarbage(Object);
+}
+
+void LeakDetector::addGarbageObjectImpl(const Value *Object) {
+ getLLVMObjects().addGarbage(Object);
+}
+
+void LeakDetector::removeGarbageObjectImpl(void *Object) {
+ getObjects().removeGarbage(Object);
+}
+
+void LeakDetector::removeGarbageObjectImpl(const Value *Object) {
+ getLLVMObjects().removeGarbage(Object);
+}
+
+void LeakDetector::checkForGarbageImpl(const std::string &Message) {
+ // use non-short-circuit version so that both checks are performed
+ if (getObjects().hasGarbage(Message) |
+ getLLVMObjects().hasGarbage(Message))
+ cerr << "\nThis is probably because you removed an object, but didn't "
+ << "delete it. Please check your code for memory leaks.\n";
+
+ // Clear out results so we don't get duplicate warnings on
+ // next call...
+ clearGarbage();
+}
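+
+// Usage sketch (debug builds only): objects are registered when created and
+// deregistered once something takes ownership of them.  `V' is assumed to be
+// a newly created Value:
+//
+//   LeakDetector::addGarbageObject(V);
+//   // ... V is inserted into a BasicBlock, which takes ownership ...
+//   LeakDetector::removeGarbageObject(V);
+//   LeakDetector::checkForGarbage("end of pass");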
diff --git a/lib/VMCore/Makefile b/lib/VMCore/Makefile
new file mode 100644
index 0000000..e9d3dc8
--- /dev/null
+++ b/lib/VMCore/Makefile
@@ -0,0 +1,33 @@
+##===- lib/VMCore/Makefile ------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../..
+LIBRARYNAME = LLVMCore
+BUILD_ARCHIVE = 1
+
+BUILT_SOURCES = $(PROJ_OBJ_ROOT)/include/llvm/Intrinsics.gen
+
+include $(LEVEL)/Makefile.common
+
+GENFILE:=$(PROJ_OBJ_ROOT)/include/llvm/Intrinsics.gen
+
+INTRINSICTD := $(PROJ_SRC_ROOT)/include/llvm/Intrinsics.td
+INTRINSICTDS := $(wildcard $(PROJ_SRC_ROOT)/include/llvm/Intrinsics*.td)
+
+$(ObjDir)/Intrinsics.gen.tmp: $(ObjDir)/.dir $(INTRINSICTDS) $(TBLGEN)
+ $(Echo) Building Intrinsics.gen.tmp from Intrinsics.td
+ $(Verb) $(TableGen) $(call SYSPATH, $(INTRINSICTD)) -o $(call SYSPATH, $@) -gen-intrinsic
+
+$(GENFILE): $(ObjDir)/Intrinsics.gen.tmp
+ $(Verb) $(CMP) -s $@ $< || ( $(CP) $< $@ && \
+ $(EchoCmd) Updated Intrinsics.gen because Intrinsics.gen.tmp \
+ changed significantly. )
+
+install-local:: $(GENFILE)
+ $(Echo) Installing $(PROJ_includedir)/llvm/Intrinsics.gen
+ $(Verb) $(DataInstall) $(GENFILE) $(PROJ_includedir)/llvm/Intrinsics.gen
diff --git a/lib/VMCore/Mangler.cpp b/lib/VMCore/Mangler.cpp
new file mode 100644
index 0000000..0bd190a
--- /dev/null
+++ b/lib/VMCore/Mangler.cpp
@@ -0,0 +1,196 @@
+//===-- Mangler.cpp - Self-contained c/asm llvm name mangler --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Unified name mangler for CWriter and assembly backends.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Mangler.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+using namespace llvm;
+
+static char HexDigit(int V) {
+ return V < 10 ? V+'0' : V+'A'-10;
+}
+
+static std::string MangleLetter(unsigned char C) {
+ char Result[] = { '_', HexDigit(C >> 4), HexDigit(C & 15), '_', 0 };
+ return Result;
+}
+
+/// makeNameProper - We don't want identifier names with non-C-identifier
+/// characters in them, so mangle them as appropriate.
+///
+std::string Mangler::makeNameProper(const std::string &X, const char *Prefix,
+ const char *PrivatePrefix) {
+ if (X.empty()) return X; // Empty names are uniqued by the caller.
+
+ // If PreserveAsmNames is set, names with asm identifiers are not modified.
+ if (PreserveAsmNames && X[0] == 1)
+ return X;
+
+ if (!UseQuotes) {
+ std::string Result;
+
+ // If X does not start with (char)1, add the prefix.
+ bool NeedPrefix = true;
+ std::string::const_iterator I = X.begin();
+ if (*I == 1) {
+ NeedPrefix = false;
+ ++I; // Skip over the marker.
+ }
+
+ // Mangle the first letter specially, don't allow numbers.
+ if (*I >= '0' && *I <= '9')
+ Result += MangleLetter(*I++);
+
+ for (std::string::const_iterator E = X.end(); I != E; ++I) {
+ if (!isCharAcceptable(*I))
+ Result += MangleLetter(*I);
+ else
+ Result += *I;
+ }
+
+ if (NeedPrefix) {
+ if (Prefix)
+ Result = Prefix + Result;
+ if (PrivatePrefix)
+ Result = PrivatePrefix + Result;
+ }
+ return Result;
+ }
+
+ bool NeedPrefix = true;
+ bool NeedQuotes = false;
+ std::string Result;
+ std::string::const_iterator I = X.begin();
+ if (*I == 1) {
+ NeedPrefix = false;
+ ++I; // Skip over the marker.
+ }
+
+ // If the first character is a number, we need quotes.
+ if (*I >= '0' && *I <= '9')
+ NeedQuotes = true;
+
+  // Do an initial scan of the string, checking to see whether we need quotes
+  // or have to escape a '"'.
+ if (!NeedQuotes)
+ for (std::string::const_iterator E = X.end(); I != E; ++I)
+ if (!isCharAcceptable(*I)) {
+ NeedQuotes = true;
+ break;
+ }
+
+ // In the common case, we don't need quotes. Handle this quickly.
+ if (!NeedQuotes) {
+ if (NeedPrefix) {
+ if (Prefix)
+ Result = Prefix + X;
+ else
+ Result = X;
+ if (PrivatePrefix)
+ Result = PrivatePrefix + Result;
+ return Result;
+ } else
+ return X.substr(1);
+ }
+
+ // Otherwise, construct the string the expensive way.
+ for (std::string::const_iterator E = X.end(); I != E; ++I) {
+ if (*I == '"')
+ Result += "_QQ_";
+ else if (*I == '\n')
+ Result += "_NL_";
+ else
+ Result += *I;
+ }
+
+  if (NeedPrefix) {
+    if (Prefix)
+      Result = Prefix + Result;
+    if (PrivatePrefix)
+      Result = PrivatePrefix + Result;
+  }
+ Result = '"' + Result + '"';
+ return Result;
+}
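+
+// Worked example (unquoted path, assuming the private prefix defaults to
+// null): the space in "foo bar" is not an acceptable character, so it is
+// replaced by its hex code:
+//
+//   Mangler Mang(M, "_");
+//   Mang.makeNameProper("foo bar", "_");   // yields "_foo_20_bar"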
+
+/// getTypeID - Return a unique ID for the specified LLVM type.
+///
+unsigned Mangler::getTypeID(const Type *Ty) {
+ unsigned &E = TypeMap[Ty];
+ if (E == 0) E = ++TypeCounter;
+ return E;
+}
+
+std::string Mangler::getValueName(const Value *V) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+ return getValueName(GV);
+
+ std::string &Name = Memo[V];
+ if (!Name.empty())
+ return Name; // Return the already-computed name for V.
+
+ // Always mangle local names.
+ Name = "ltmp_" + utostr(Count++) + "_" + utostr(getTypeID(V->getType()));
+ return Name;
+}
+
+
+std::string Mangler::getValueName(const GlobalValue *GV, const char * Suffix) {
+ // Check to see whether we've already named V.
+ std::string &Name = Memo[GV];
+ if (!Name.empty())
+ return Name; // Return the already-computed name for V.
+
+ // Name mangling occurs as follows:
+  // - If V is an intrinsic function, do not change the name at all.
+  // - Otherwise, mangling occurs if the global collides with an existing name.
+ if (isa<Function>(GV) && cast<Function>(GV)->isIntrinsic()) {
+ Name = GV->getNameStart(); // Is an intrinsic function
+ } else if (!GV->hasName()) {
+ // Must mangle the global into a unique ID.
+ unsigned TypeUniqueID = getTypeID(GV->getType());
+ static unsigned GlobalID = 0;
+ Name = "__unnamed_" + utostr(TypeUniqueID) + "_" + utostr(GlobalID++);
+ } else {
+ if (GV->hasPrivateLinkage())
+ Name = makeNameProper(GV->getName() + Suffix, Prefix, PrivatePrefix);
+ else
+ Name = makeNameProper(GV->getName() + Suffix, Prefix);
+ }
+
+ return Name;
+}
+
+Mangler::Mangler(Module &M, const char *prefix, const char *privatePrefix)
+ : Prefix(prefix), PrivatePrefix (privatePrefix), UseQuotes(false),
+ PreserveAsmNames(false), Count(0), TypeCounter(0) {
+ std::fill(AcceptableChars, array_endof(AcceptableChars), 0);
+
+ // Letters and numbers are acceptable.
+ for (unsigned char X = 'a'; X <= 'z'; ++X)
+ markCharAcceptable(X);
+ for (unsigned char X = 'A'; X <= 'Z'; ++X)
+ markCharAcceptable(X);
+ for (unsigned char X = '0'; X <= '9'; ++X)
+ markCharAcceptable(X);
+
+ // These chars are acceptable.
+ markCharAcceptable('_');
+ markCharAcceptable('$');
+ markCharAcceptable('.');
+}
diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp
new file mode 100644
index 0000000..a598005
--- /dev/null
+++ b/lib/VMCore/Module.cpp
@@ -0,0 +1,381 @@
+//===-- Module.cpp - Implement the Module class ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Module class for the VMCore library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Module.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/LeakDetector.h"
+#include "SymbolTableListTraitsImpl.h"
+#include "llvm/TypeSymbolTable.h"
+#include <algorithm>
+#include <cstdarg>
+#include <cstdlib>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Methods to implement the globals and functions lists.
+//
+
+GlobalVariable *ilist_traits<GlobalVariable>::createSentinel() {
+ GlobalVariable *Ret = new GlobalVariable(Type::Int32Ty, false,
+ GlobalValue::ExternalLinkage);
+ // This should not be garbage monitored.
+ LeakDetector::removeGarbageObject(Ret);
+ return Ret;
+}
+GlobalAlias *ilist_traits<GlobalAlias>::createSentinel() {
+ GlobalAlias *Ret = new GlobalAlias(Type::Int32Ty,
+ GlobalValue::ExternalLinkage);
+ // This should not be garbage monitored.
+ LeakDetector::removeGarbageObject(Ret);
+ return Ret;
+}
+
+// Explicit instantiations of SymbolTableListTraits since some of the methods
+// are not in the public header file.
+template class SymbolTableListTraits<GlobalVariable, Module>;
+template class SymbolTableListTraits<Function, Module>;
+template class SymbolTableListTraits<GlobalAlias, Module>;
+
+//===----------------------------------------------------------------------===//
+// Primitive Module methods.
+//
+
+Module::Module(const std::string &MID)
+ : ModuleID(MID), DataLayout("") {
+ ValSymTab = new ValueSymbolTable();
+ TypeSymTab = new TypeSymbolTable();
+}
+
+Module::~Module() {
+ dropAllReferences();
+ GlobalList.clear();
+ FunctionList.clear();
+ AliasList.clear();
+ LibraryList.clear();
+ delete ValSymTab;
+ delete TypeSymTab;
+}
+
+/// Target endian information...
+Module::Endianness Module::getEndianness() const {
+ std::string temp = DataLayout;
+ Module::Endianness ret = AnyEndianness;
+
+ while (!temp.empty()) {
+ std::string token = getToken(temp, "-");
+
+ if (token[0] == 'e') {
+ ret = LittleEndian;
+ } else if (token[0] == 'E') {
+ ret = BigEndian;
+ }
+ }
+
+ return ret;
+}
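+
+// Usage sketch, assuming the module's data layout has been set through the
+// usual setDataLayout mutator:
+//
+//   M.setDataLayout("e-p:32:32");
+//   Module::Endianness En = M.getEndianness();   // LittleEndian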
+
+/// Target Pointer Size information...
+Module::PointerSize Module::getPointerSize() const {
+ std::string temp = DataLayout;
+ Module::PointerSize ret = AnyPointerSize;
+
+ while (!temp.empty()) {
+ std::string token = getToken(temp, "-");
+ char signal = getToken(token, ":")[0];
+
+ if (signal == 'p') {
+ int size = atoi(getToken(token, ":").c_str());
+ if (size == 32)
+ ret = Pointer32;
+ else if (size == 64)
+ ret = Pointer64;
+ }
+ }
+
+ return ret;
+}
+
+/// getNamedValue - Return the first global value in the module with
+/// the specified name, of arbitrary type. This method returns null
+/// if a global with the specified name is not found.
+GlobalValue *Module::getNamedValue(const std::string &Name) const {
+ return cast_or_null<GlobalValue>(getValueSymbolTable().lookup(Name));
+}
+
+GlobalValue *Module::getNamedValue(const char *Name) const {
+ llvm::Value *V = getValueSymbolTable().lookup(Name, Name+strlen(Name));
+ return cast_or_null<GlobalValue>(V);
+}
+
+//===----------------------------------------------------------------------===//
+// Methods for easy access to the functions in the module.
+//
+
+// getOrInsertFunction - Look up the specified function in the module symbol
+// table. If it does not exist, add a prototype for the function and return
+// it. This is nice because it allows most passes to get away with not handling
+// the symbol table directly for this common task.
+//
+Constant *Module::getOrInsertFunction(const std::string &Name,
+ const FunctionType *Ty,
+ AttrListPtr AttributeList) {
+ // See if we have a definition for the specified function already.
+ GlobalValue *F = getNamedValue(Name);
+ if (F == 0) {
+ // Nope, add it
+ Function *New = Function::Create(Ty, GlobalVariable::ExternalLinkage, Name);
+ if (!New->isIntrinsic()) // Intrinsics get attrs set on construction
+ New->setAttributes(AttributeList);
+ FunctionList.push_back(New);
+ return New; // Return the new prototype.
+ }
+
+ // Okay, the function exists. Does it have externally visible linkage?
+ if (F->hasLocalLinkage()) {
+ // Clear the function's name.
+ F->setName("");
+ // Retry, now there won't be a conflict.
+ Constant *NewF = getOrInsertFunction(Name, Ty);
+ F->setName(&Name[0], Name.size());
+ return NewF;
+ }
+
+ // If the function exists but has the wrong type, return a bitcast to the
+ // right type.
+ if (F->getType() != PointerType::getUnqual(Ty))
+ return ConstantExpr::getBitCast(F, PointerType::getUnqual(Ty));
+
+ // Otherwise, we just found the existing function or a prototype.
+ return F;
+}
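+
+// Usage sketch: fetch-or-create a "void ()" prototype.  The name "my_helper"
+// is purely illustrative:
+//
+//   std::vector<const Type*> NoArgs;
+//   Constant *F = M.getOrInsertFunction(
+//       "my_helper", FunctionType::get(Type::VoidTy, NoArgs, false));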
+
+Constant *Module::getOrInsertTargetIntrinsic(const std::string &Name,
+ const FunctionType *Ty,
+ AttrListPtr AttributeList) {
+ // See if we have a definition for the specified function already.
+ GlobalValue *F = getNamedValue(Name);
+ if (F == 0) {
+ // Nope, add it
+ Function *New = Function::Create(Ty, GlobalVariable::ExternalLinkage, Name);
+ New->setAttributes(AttributeList);
+ FunctionList.push_back(New);
+ return New; // Return the new prototype.
+ }
+
+ // Otherwise, we just found the existing function or a prototype.
+ return F;
+}
+
+Constant *Module::getOrInsertFunction(const std::string &Name,
+ const FunctionType *Ty) {
+ AttrListPtr AttributeList = AttrListPtr::get((AttributeWithIndex *)0, 0);
+ return getOrInsertFunction(Name, Ty, AttributeList);
+}
+
+// getOrInsertFunction - Look up the specified function in the module symbol
+// table. If it does not exist, add a prototype for the function and return it.
+// This version of the method takes a null terminated list of function
+// arguments, which makes it easier for clients to use.
+//
+Constant *Module::getOrInsertFunction(const std::string &Name,
+ AttrListPtr AttributeList,
+ const Type *RetTy, ...) {
+ va_list Args;
+ va_start(Args, RetTy);
+
+ // Build the list of argument types...
+ std::vector<const Type*> ArgTys;
+ while (const Type *ArgTy = va_arg(Args, const Type*))
+ ArgTys.push_back(ArgTy);
+
+ va_end(Args);
+
+ // Build the function type and chain to the other getOrInsertFunction...
+ return getOrInsertFunction(Name, FunctionType::get(RetTy, ArgTys, false),
+ AttributeList);
+}
+
+Constant *Module::getOrInsertFunction(const std::string &Name,
+ const Type *RetTy, ...) {
+ va_list Args;
+ va_start(Args, RetTy);
+
+ // Build the list of argument types...
+ std::vector<const Type*> ArgTys;
+ while (const Type *ArgTy = va_arg(Args, const Type*))
+ ArgTys.push_back(ArgTy);
+
+ va_end(Args);
+
+ // Build the function type and chain to the other getOrInsertFunction...
+ return getOrInsertFunction(Name, FunctionType::get(RetTy, ArgTys, false),
+ AttrListPtr::get((AttributeWithIndex *)0, 0));
+}
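+
+// Usage sketch for the variadic form; note the (Type *)0 terminator that
+// ends the argument type list.  The name "my_fn" is purely illustrative:
+//
+//   Constant *F = M.getOrInsertFunction("my_fn", Type::Int32Ty,
+//                                       Type::Int32Ty, (Type *)0);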
+
+// getFunction - Look up the specified function in the module symbol table.
+// If it does not exist, return null.
+//
+Function *Module::getFunction(const std::string &Name) const {
+ return dyn_cast_or_null<Function>(getNamedValue(Name));
+}
+
+Function *Module::getFunction(const char *Name) const {
+ return dyn_cast_or_null<Function>(getNamedValue(Name));
+}
+
+//===----------------------------------------------------------------------===//
+// Methods for easy access to the global variables in the module.
+//
+
+/// getGlobalVariable - Look up the specified global variable in the module
+/// symbol table. If it does not exist, return null. The type argument
+/// should be the underlying type of the global, i.e., it should not have
+/// the top-level PointerType, which represents the address of the global.
+/// If AllowLocal is set to true, this function will also return globals
+/// that have local linkage. By default, such globals are not returned.
+///
+GlobalVariable *Module::getGlobalVariable(const std::string &Name,
+ bool AllowLocal) const {
+ if (GlobalVariable *Result =
+ dyn_cast_or_null<GlobalVariable>(getNamedValue(Name)))
+ if (AllowLocal || !Result->hasLocalLinkage())
+ return Result;
+ return 0;
+}
+
+/// getOrInsertGlobal - Look up the specified global in the module symbol table.
+/// 1. If it does not exist, add a declaration of the global and return it.
+/// 2. Else, if the global exists but has the wrong type: return the global
+///    with a constantexpr cast to the right type.
+/// 3. Finally, if the existing global is the correct declaration, return the
+/// existing global.
+Constant *Module::getOrInsertGlobal(const std::string &Name, const Type *Ty) {
+ // See if we have a definition for the specified global already.
+ GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(getNamedValue(Name));
+ if (GV == 0) {
+ // Nope, add it
+ GlobalVariable *New =
+ new GlobalVariable(Ty, false, GlobalVariable::ExternalLinkage, 0, Name);
+ GlobalList.push_back(New);
+ return New; // Return the new declaration.
+ }
+
+ // If the variable exists but has the wrong type, return a bitcast to the
+ // right type.
+ if (GV->getType() != PointerType::getUnqual(Ty))
+ return ConstantExpr::getBitCast(GV, PointerType::getUnqual(Ty));
+
+ // Otherwise, we just found the existing function or a prototype.
+ return GV;
+}
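+
+// Usage sketch (the name "my_flag" is purely illustrative):
+//
+//   Constant *G = M.getOrInsertGlobal("my_flag", Type::Int1Ty);
+//   // G is the existing global, a fresh declaration, or a bitcast of a
+//   // differently-typed global with the same name.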
+
+//===----------------------------------------------------------------------===//
+// Methods for easy access to the global variables in the module.
+//
+
+// getNamedAlias - Look up the specified global in the module symbol table.
+// If it does not exist, return null.
+//
+GlobalAlias *Module::getNamedAlias(const std::string &Name) const {
+ return dyn_cast_or_null<GlobalAlias>(getNamedValue(Name));
+}
+
+//===----------------------------------------------------------------------===//
+// Methods for easy access to the types in the module.
+//
+
+
+// addTypeName - Insert an entry in the symbol table mapping Str to Type. If
+// there is already an entry for this name, true is returned and the symbol
+// table is not modified.
+//
+bool Module::addTypeName(const std::string &Name, const Type *Ty) {
+ TypeSymbolTable &ST = getTypeSymbolTable();
+
+ if (ST.lookup(Name)) return true; // Already in symtab...
+
+  // Not in symbol table?  Insert the new name/type mapping now.
+ ST.insert(Name, Ty);
+
+ return false;
+}
+
+/// getTypeByName - Return the type with the specified name in this module, or
+/// null if there is none by that name.
+const Type *Module::getTypeByName(const std::string &Name) const {
+ const TypeSymbolTable &ST = getTypeSymbolTable();
+ return cast_or_null<Type>(ST.lookup(Name));
+}
+
+// getTypeName - If there is at least one entry in the symbol table for the
+// specified type, return the first such name.
+//
+std::string Module::getTypeName(const Type *Ty) const {
+ const TypeSymbolTable &ST = getTypeSymbolTable();
+
+ TypeSymbolTable::const_iterator TI = ST.begin();
+ TypeSymbolTable::const_iterator TE = ST.end();
+  if (TI == TE) return "";  // No names for types
+
+ while (TI != TE && TI->second != Ty)
+ ++TI;
+
+ if (TI != TE) // Must have found an entry!
+ return TI->first;
+ return ""; // Must not have found anything...
+}
+
+//===----------------------------------------------------------------------===//
+// Other module related stuff.
+//
+
+
+// dropAllReferences() - This function causes all the subelements to "let go"
+// of all references that they are maintaining. This allows one to 'delete' a
+// whole module at a time, even though there may be circular references... first
+// all references are dropped, and all use counts go to zero. Then everything
+// is deleted for real. Note that no operations are valid on an object that
+// has "dropped all references", except operator delete.
+//
+void Module::dropAllReferences() {
+  for (Module::iterator I = begin(), E = end(); I != E; ++I)
+    I->dropAllReferences();
+
+  for (Module::global_iterator I = global_begin(), E = global_end(); I != E; ++I)
+    I->dropAllReferences();
+
+  for (Module::alias_iterator I = alias_begin(), E = alias_end(); I != E; ++I)
+ I->dropAllReferences();
+}
+
+void Module::addLibrary(const std::string& Lib) {
+ for (Module::lib_iterator I = lib_begin(), E = lib_end(); I != E; ++I)
+ if (*I == Lib)
+ return;
+ LibraryList.push_back(Lib);
+}
+
+void Module::removeLibrary(const std::string& Lib) {
+ LibraryListType::iterator I = LibraryList.begin();
+ LibraryListType::iterator E = LibraryList.end();
+ for (;I != E; ++I)
+ if (*I == Lib) {
+ LibraryList.erase(I);
+ return;
+ }
+}
diff --git a/lib/VMCore/ModuleProvider.cpp b/lib/VMCore/ModuleProvider.cpp
new file mode 100644
index 0000000..cfff97c
--- /dev/null
+++ b/lib/VMCore/ModuleProvider.cpp
@@ -0,0 +1,26 @@
+//===-- ModuleProvider.cpp - Base implementation for module providers -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Minimal implementation of the abstract interface for providing a module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ModuleProvider.h"
+#include "llvm/Module.h"
+using namespace llvm;
+
+/// ctor - always have a valid Module
+///
+ModuleProvider::ModuleProvider() : TheModule(0) { }
+
+/// dtor - when we leave, we take our Module with us
+///
+ModuleProvider::~ModuleProvider() {
+ delete TheModule;
+}
diff --git a/lib/VMCore/Pass.cpp b/lib/VMCore/Pass.cpp
new file mode 100644
index 0000000..6db5d7e
--- /dev/null
+++ b/lib/VMCore/Pass.cpp
@@ -0,0 +1,323 @@
+//===- Pass.cpp - LLVM Pass Infrastructure Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM Pass infrastructure. It is primarily
+// responsible for ensuring that passes are executed and batched together
+// optimally.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Module.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ManagedStatic.h"
+#include <algorithm>
+#include <map>
+#include <set>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Pass Implementation
+//
+
+// Force out-of-line virtual method.
+Pass::~Pass() {
+ delete Resolver;
+}
+
+// Force out-of-line virtual method.
+ModulePass::~ModulePass() { }
+
+bool Pass::mustPreserveAnalysisID(const PassInfo *AnalysisID) const {
+ return Resolver->getAnalysisIfAvailable(AnalysisID, true) != 0;
+}
+
+// dumpPassStructure - Implement the -debug-passes=Structure option
+void Pass::dumpPassStructure(unsigned Offset) {
+ cerr << std::string(Offset*2, ' ') << getPassName() << "\n";
+}
+
+/// getPassName - Return a nice clean name for a pass.  This is usually
+/// implemented in terms of the name that is registered by one of the
+/// Registration templates, but can be overridden directly.
+///
+const char *Pass::getPassName() const {
+ if (const PassInfo *PI = getPassInfo())
+ return PI->getPassName();
+ return "Unnamed pass: implement Pass::getPassName()";
+}
+
+// print - Print out the internal state of the pass. This is called by Analyze
+// to print out the contents of an analysis. Otherwise it is not necessary to
+// implement this method.
+//
+void Pass::print(std::ostream &O,const Module*) const {
+ O << "Pass::print not implemented for pass: '" << getPassName() << "'!\n";
+}
+
+// dump - call print(cerr);
+void Pass::dump() const {
+ print(*cerr.stream(), 0);
+}
+
+//===----------------------------------------------------------------------===//
+// ImmutablePass Implementation
+//
+// Force out-of-line virtual method.
+ImmutablePass::~ImmutablePass() { }
+
+//===----------------------------------------------------------------------===//
+// FunctionPass Implementation
+//
+
+// run - On a module, we run this pass by initializing, runOnFunction'ing once
+// for every function in the module, then by finalizing.
+//
+bool FunctionPass::runOnModule(Module &M) {
+ bool Changed = doInitialization(M);
+
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!I->isDeclaration()) // Passes are not run on external functions!
+ Changed |= runOnFunction(*I);
+
+ return Changed | doFinalization(M);
+}
+
+// run - On a function, we simply initialize, run the function, then finalize.
+//
+bool FunctionPass::run(Function &F) {
+ // Passes are not run on external functions!
+ if (F.isDeclaration()) return false;
+
+ bool Changed = doInitialization(*F.getParent());
+ Changed |= runOnFunction(F);
+ return Changed | doFinalization(*F.getParent());
+}
+
+//===----------------------------------------------------------------------===//
+// BasicBlockPass Implementation
+//
+
+// To run this pass on a function, we simply call runOnBasicBlock once for each
+// basic block in the function.
+//
+bool BasicBlockPass::runOnFunction(Function &F) {
+ bool Changed = doInitialization(F);
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ Changed |= runOnBasicBlock(*I);
+ return Changed | doFinalization(F);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Registration mechanism
+//
+namespace {
+class PassRegistrar {
+ /// PassInfoMap - Keep track of the passinfo object for each registered llvm
+ /// pass.
+ typedef std::map<intptr_t, const PassInfo*> MapType;
+ MapType PassInfoMap;
+
+ /// AnalysisGroupInfo - Keep track of information for each analysis group.
+ struct AnalysisGroupInfo {
+ const PassInfo *DefaultImpl;
+ std::set<const PassInfo *> Implementations;
+ AnalysisGroupInfo() : DefaultImpl(0) {}
+ };
+
+ /// AnalysisGroupInfoMap - Information for each analysis group.
+ std::map<const PassInfo *, AnalysisGroupInfo> AnalysisGroupInfoMap;
+
+public:
+
+ const PassInfo *GetPassInfo(intptr_t TI) const {
+ MapType::const_iterator I = PassInfoMap.find(TI);
+ return I != PassInfoMap.end() ? I->second : 0;
+ }
+
+ void RegisterPass(const PassInfo &PI) {
+ bool Inserted =
+ PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second;
+ assert(Inserted && "Pass registered multiple times!"); Inserted=Inserted;
+ }
+
+ void UnregisterPass(const PassInfo &PI) {
+ MapType::iterator I = PassInfoMap.find(PI.getTypeInfo());
+ assert(I != PassInfoMap.end() && "Pass registered but not in map!");
+
+ // Remove pass from the map.
+ PassInfoMap.erase(I);
+ }
+
+ void EnumerateWith(PassRegistrationListener *L) {
+ for (MapType::const_iterator I = PassInfoMap.begin(),
+ E = PassInfoMap.end(); I != E; ++I)
+ L->passEnumerate(I->second);
+ }
+
+
+ /// Analysis Group Mechanisms.
+ void RegisterAnalysisGroup(PassInfo *InterfaceInfo,
+ const PassInfo *ImplementationInfo,
+ bool isDefault) {
+ AnalysisGroupInfo &AGI = AnalysisGroupInfoMap[InterfaceInfo];
+ assert(AGI.Implementations.count(ImplementationInfo) == 0 &&
+ "Cannot add a pass to the same analysis group more than once!");
+ AGI.Implementations.insert(ImplementationInfo);
+ if (isDefault) {
+ assert(AGI.DefaultImpl == 0 && InterfaceInfo->getNormalCtor() == 0 &&
+ "Default implementation for analysis group already specified!");
+ assert(ImplementationInfo->getNormalCtor() &&
+ "Cannot specify pass as default if it does not have a default ctor");
+ AGI.DefaultImpl = ImplementationInfo;
+ InterfaceInfo->setNormalCtor(ImplementationInfo->getNormalCtor());
+ }
+ }
+};
+}
+
+static std::vector<PassRegistrationListener*> *Listeners = 0;
+
+// FIXME: This should use ManagedStatic to manage the pass registrar.
+// Unfortunately, we can't do this, because passes are registered with static
+// ctors, and having llvm_shutdown clear this map prevents successful
+// resurrection after llvm_shutdown is run.
+static PassRegistrar *getPassRegistrar() {
+ static PassRegistrar *PassRegistrarObj = 0;
+ if (!PassRegistrarObj)
+ PassRegistrarObj = new PassRegistrar();
+ return PassRegistrarObj;
+}
+
+// getPassInfo - Return the PassInfo data structure that corresponds to this
+// pass...
+const PassInfo *Pass::getPassInfo() const {
+ return lookupPassInfo(PassID);
+}
+
+const PassInfo *Pass::lookupPassInfo(intptr_t TI) {
+ return getPassRegistrar()->GetPassInfo(TI);
+}
+
+void PassInfo::registerPass() {
+ getPassRegistrar()->RegisterPass(*this);
+
+ // Notify any listeners.
+ if (Listeners)
+ for (std::vector<PassRegistrationListener*>::iterator
+ I = Listeners->begin(), E = Listeners->end(); I != E; ++I)
+ (*I)->passRegistered(this);
+}
+
+void PassInfo::unregisterPass() {
+ getPassRegistrar()->UnregisterPass(*this);
+}
+
+//===----------------------------------------------------------------------===//
+// Analysis Group Implementation Code
+//===----------------------------------------------------------------------===//
+
+// RegisterAGBase implementation
+//
+RegisterAGBase::RegisterAGBase(const char *Name, intptr_t InterfaceID,
+ intptr_t PassID, bool isDefault)
+ : PassInfo(Name, InterfaceID),
+ ImplementationInfo(0), isDefaultImplementation(isDefault) {
+
+ InterfaceInfo = const_cast<PassInfo*>(Pass::lookupPassInfo(InterfaceID));
+ if (InterfaceInfo == 0) {
+ // First reference to Interface, register it now.
+ registerPass();
+ InterfaceInfo = this;
+ }
+ assert(isAnalysisGroup() &&
+ "Trying to join an analysis group that is a normal pass!");
+
+ if (PassID) {
+ ImplementationInfo = Pass::lookupPassInfo(PassID);
+ assert(ImplementationInfo &&
+ "Must register pass before adding to AnalysisGroup!");
+
+ // Make sure we keep track of the fact that the implementation implements
+ // the interface.
+ PassInfo *IIPI = const_cast<PassInfo*>(ImplementationInfo);
+ IIPI->addInterfaceImplemented(InterfaceInfo);
+
+ getPassRegistrar()->RegisterAnalysisGroup(InterfaceInfo, IIPI, isDefault);
+ }
+}
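+
+// An illustrative sketch of the analysis-group registration idiom built on
+// RegisterAGBase (the pass names follow lib/Analysis; the exact template
+// arguments are assumptions about this era's Pass.h):
+//
+//   static RegisterPass<BasicAliasAnalysis>
+//   X("basicaa", "Basic Alias Analysis (default AA impl)", false, true);
+//   // Make BasicAliasAnalysis the default implementation of the
+//   // AliasAnalysis group:
+//   static RegisterAnalysisGroup<AliasAnalysis, true> Y(X);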
+
+
+//===----------------------------------------------------------------------===//
+// PassRegistrationListener implementation
+//
+
+// PassRegistrationListener ctor - Add the current object to the list of
+// PassRegistrationListeners...
+PassRegistrationListener::PassRegistrationListener() {
+ if (!Listeners) Listeners = new std::vector<PassRegistrationListener*>();
+ Listeners->push_back(this);
+}
+
+// dtor - Remove object from list of listeners...
+PassRegistrationListener::~PassRegistrationListener() {
+  assert(Listeners && "PassRegistrationListener not registered!");
+  std::vector<PassRegistrationListener*>::iterator I =
+    std::find(Listeners->begin(), Listeners->end(), this);
+  assert(I != Listeners->end() &&
+         "PassRegistrationListener not registered!");
+  Listeners->erase(I);
+
+ if (Listeners->empty()) {
+ delete Listeners;
+ Listeners = 0;
+ }
+}
+
+// enumeratePasses - Iterate over the registered passes, calling the
+// passEnumerate callback on each PassInfo object.
+//
+void PassRegistrationListener::enumeratePasses() {
+ getPassRegistrar()->EnumerateWith(this);
+}
+
+//===----------------------------------------------------------------------===//
+// AnalysisUsage Class Implementation
+//
+
+namespace {
+ struct GetCFGOnlyPasses : public PassRegistrationListener {
+ typedef AnalysisUsage::VectorType VectorType;
+ VectorType &CFGOnlyList;
+ GetCFGOnlyPasses(VectorType &L) : CFGOnlyList(L) {}
+
+ void passEnumerate(const PassInfo *P) {
+ if (P->isCFGOnlyPass())
+ CFGOnlyList.push_back(P);
+ }
+ };
+}
+
+// setPreservesCFG - This function should be called by a pass if, and only if,
+// it does not:
+//
+// 1. Add or remove basic blocks from the function
+// 2. Modify terminator instructions in any way.
+//
+// This function annotates the AnalysisUsage info object to say that analyses
+// that only depend on the CFG are preserved by this pass.
+//
+void AnalysisUsage::setPreservesCFG() {
+ // Since this transformation doesn't modify the CFG, it preserves all analyses
+ // that only depend on the CFG (like dominators, loop info, etc...)
+ GetCFGOnlyPasses(Preserved).enumeratePasses();
+}
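+
+// An illustrative sketch of a pass using this facility ("MyLocalOpt" is a
+// hypothetical pass that only rewrites instructions within blocks):
+//
+//   void MyLocalOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+//     AU.setPreservesCFG();
+//   }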
+
+
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
new file mode 100644
index 0000000..4799915
--- /dev/null
+++ b/lib/VMCore/PassManager.cpp
@@ -0,0 +1,1710 @@
+//===- PassManager.cpp - LLVM Pass Infrastructure Implementation ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM Pass Manager infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/PassManagers.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Module.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm-c/Core.h"
+#include <algorithm>
+#include <cstdio>
+#include <map>
+using namespace llvm;
+
+// See PassManagers.h for Pass Manager infrastructure overview.
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// Pass debugging information. Often it is useful to find out what pass is
+// running when a crash occurs in a utility. When this library is compiled with
+// debugging on, a command line option (--debug-pass) is enabled that causes the
+// pass name to be printed before it executes.
+//
+
+// Different debug levels that can be enabled...
+enum PassDebugLevel {
+ None, Arguments, Structure, Executions, Details
+};
+
+// Always verify dominfo if expensive checking is enabled.
+#ifdef XDEBUG
+bool VerifyDomInfo = true;
+#else
+bool VerifyDomInfo = false;
+#endif
+static cl::opt<bool,true>
+VerifyDomInfoX("verify-dom-info", cl::location(VerifyDomInfo),
+ cl::desc("Verify dominator info (time consuming)"));
+
+static cl::opt<enum PassDebugLevel>
+PassDebugging("debug-pass", cl::Hidden,
+ cl::desc("Print PassManager debugging information"),
+ cl::values(
+ clEnumVal(None , "disable debug output"),
+ clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
+ clEnumVal(Structure , "print pass structure before run()"),
+ clEnumVal(Executions, "print pass name before it is executed"),
+ clEnumVal(Details , "print pass details when it is executed"),
+ clEnumValEnd));
+} // End of llvm namespace
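+
+// For example, 'opt --debug-pass=Structure foo.bc' asks the pass managers to
+// print their pass structure before they run. (The option is registered
+// cl::Hidden, so it does not show up in the default --help listing.)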
+
+void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
+ if (V == 0 && M == 0)
+ OS << "Releasing pass '";
+ else
+ OS << "Running pass '";
+
+ OS << P->getPassName() << "'";
+
+ if (M) {
+ OS << " on module '" << M->getModuleIdentifier() << "'.\n";
+ return;
+ }
+ if (V == 0) {
+ OS << '\n';
+ return;
+ }
+
+ OS << " on ";
+ if (isa<Function>(V))
+ OS << "function";
+ else if (isa<BasicBlock>(V))
+ OS << "basic block";
+ else
+ OS << "value";
+
+ OS << " '";
+ WriteAsOperand(OS, V, /*PrintTy=*/false, M);
+ OS << "'\n";
+}
+
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// BBPassManager
+//
+/// BBPassManager manages BasicBlockPasses. It batches all of the
+/// passes together and sequences them so that one basic block is fully
+/// processed before moving on to the next basic block.
+class VISIBILITY_HIDDEN BBPassManager : public PMDataManager,
+ public FunctionPass {
+
+public:
+ static char ID;
+ explicit BBPassManager(int Depth)
+ : PMDataManager(Depth), FunctionPass(&ID) {}
+
+ /// Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the function, and if so, return true.
+ bool runOnFunction(Function &F);
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ bool doInitialization(Module &M);
+ bool doInitialization(Function &F);
+ bool doFinalization(Module &M);
+ bool doFinalization(Function &F);
+
+ virtual const char *getPassName() const {
+ return "BasicBlock Pass Manager";
+ }
+
+ // Print passes managed by this manager
+ void dumpPassStructure(unsigned Offset) {
+ llvm::cerr << std::string(Offset*2, ' ') << "BasicBlockPass Manager\n";
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ BasicBlockPass *BP = getContainedPass(Index);
+ BP->dumpPassStructure(Offset + 1);
+ dumpLastUses(BP, Offset+1);
+ }
+ }
+
+ BasicBlockPass *getContainedPass(unsigned N) {
+ assert(N < PassVector.size() && "Pass number out of range!");
+ BasicBlockPass *BP = static_cast<BasicBlockPass *>(PassVector[N]);
+ return BP;
+ }
+
+ virtual PassManagerType getPassManagerType() const {
+ return PMT_BasicBlockPassManager;
+ }
+};
+
+char BBPassManager::ID = 0;
+}
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl
+//
+/// FunctionPassManagerImpl manages FPPassManagers
+class FunctionPassManagerImpl : public Pass,
+ public PMDataManager,
+ public PMTopLevelManager {
+public:
+ static char ID;
+ explicit FunctionPassManagerImpl(int Depth) :
+ Pass(&ID), PMDataManager(Depth),
+ PMTopLevelManager(TLM_Function) { }
+
+ /// add - Add a pass to the queue of passes to run. This passes ownership of
+ /// the Pass to the PassManager. When the PassManager is destroyed, the pass
+ /// will be destroyed as well, so there is no need to delete the pass. This
+ /// implies that all passes MUST be allocated with 'new'.
+ void add(Pass *P) {
+ schedulePass(P);
+ }
+
+ /// run - Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the module, and if so, return true.
+ bool run(Function &F);
+
+ /// doInitialization - Run all of the initializers for the function passes.
+ ///
+ bool doInitialization(Module &M);
+
+ /// doFinalization - Run all of the finalizers for the function passes.
+ ///
+ bool doFinalization(Module &M);
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ inline void addTopLevelPass(Pass *P) {
+
+ if (ImmutablePass *IP = dynamic_cast<ImmutablePass *> (P)) {
+
+      // P is an immutable pass and it will be managed by this
+      // top level manager. Set up an analysis resolver to connect them.
+ AnalysisResolver *AR = new AnalysisResolver(*this);
+ P->setResolver(AR);
+ initializeAnalysisImpl(P);
+ addImmutablePass(IP);
+ recordAvailableAnalysis(IP);
+ } else {
+ P->assignPassManager(activeStack);
+ }
+
+ }
+
+ FPPassManager *getContainedManager(unsigned N) {
+ assert(N < PassManagers.size() && "Pass number out of range!");
+ FPPassManager *FP = static_cast<FPPassManager *>(PassManagers[N]);
+ return FP;
+ }
+};
+
+char FunctionPassManagerImpl::ID = 0;
+//===----------------------------------------------------------------------===//
+// MPPassManager
+//
+/// MPPassManager manages ModulePasses and function pass managers.
+/// It batches all Module passes and function pass managers together and
+/// sequences them to process one module.
+class MPPassManager : public Pass, public PMDataManager {
+public:
+ static char ID;
+ explicit MPPassManager(int Depth) :
+ Pass(&ID), PMDataManager(Depth) { }
+
+ // Delete on the fly managers.
+ virtual ~MPPassManager() {
+ for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+ I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+ I != E; ++I) {
+ FunctionPassManagerImpl *FPP = I->second;
+ delete FPP;
+ }
+ }
+
+ /// run - Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the module, and if so, return true.
+ bool runOnModule(Module &M);
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ /// Add RequiredPass into list of lower level passes required by pass P.
+ /// RequiredPass is run on the fly by Pass Manager when P requests it
+ /// through getAnalysis interface.
+ virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
+
+  /// Return the function pass corresponding to PassInfo PI that is required
+  /// by module pass MP. Instantiate the analysis pass on the fly by running
+  /// its runOnFunction() on function F.
+ virtual Pass* getOnTheFlyPass(Pass *MP, const PassInfo *PI, Function &F);
+
+ virtual const char *getPassName() const {
+ return "Module Pass Manager";
+ }
+
+ // Print passes managed by this manager
+ void dumpPassStructure(unsigned Offset) {
+ llvm::cerr << std::string(Offset*2, ' ') << "ModulePass Manager\n";
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ ModulePass *MP = getContainedPass(Index);
+ MP->dumpPassStructure(Offset + 1);
+ if (FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP])
+ FPP->dumpPassStructure(Offset + 2);
+ dumpLastUses(MP, Offset+1);
+ }
+ }
+
+ ModulePass *getContainedPass(unsigned N) {
+ assert(N < PassVector.size() && "Pass number out of range!");
+ return static_cast<ModulePass *>(PassVector[N]);
+ }
+
+ virtual PassManagerType getPassManagerType() const {
+ return PMT_ModulePassManager;
+ }
+
+ private:
+  /// Collection of on-the-fly FPPassManagers. These managers manage
+  /// function passes that are required by module passes.
+ std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
+};
+
+char MPPassManager::ID = 0;
+//===----------------------------------------------------------------------===//
+// PassManagerImpl
+//
+
+/// PassManagerImpl manages MPPassManagers
+class PassManagerImpl : public Pass,
+ public PMDataManager,
+ public PMTopLevelManager {
+
+public:
+ static char ID;
+ explicit PassManagerImpl(int Depth) :
+ Pass(&ID), PMDataManager(Depth), PMTopLevelManager(TLM_Pass) { }
+
+ /// add - Add a pass to the queue of passes to run. This passes ownership of
+ /// the Pass to the PassManager. When the PassManager is destroyed, the pass
+ /// will be destroyed as well, so there is no need to delete the pass. This
+ /// implies that all passes MUST be allocated with 'new'.
+ void add(Pass *P) {
+ schedulePass(P);
+ }
+
+ /// run - Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the module, and if so, return true.
+ bool run(Module &M);
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ inline void addTopLevelPass(Pass *P) {
+ if (ImmutablePass *IP = dynamic_cast<ImmutablePass *> (P)) {
+
+      // P is an immutable pass and it will be managed by this
+      // top level manager. Set up an analysis resolver to connect them.
+ AnalysisResolver *AR = new AnalysisResolver(*this);
+ P->setResolver(AR);
+ initializeAnalysisImpl(P);
+ addImmutablePass(IP);
+ recordAvailableAnalysis(IP);
+ } else {
+ P->assignPassManager(activeStack);
+ }
+ }
+
+ MPPassManager *getContainedManager(unsigned N) {
+ assert(N < PassManagers.size() && "Pass number out of range!");
+ MPPassManager *MP = static_cast<MPPassManager *>(PassManagers[N]);
+ return MP;
+ }
+};
+
+char PassManagerImpl::ID = 0;
+} // End of llvm namespace
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+/// TimingInfo Class - This class is used to calculate information about the
+/// amount of time each pass takes to execute. This only happens when
+/// -time-passes is enabled on the command line.
+///
+class VISIBILITY_HIDDEN TimingInfo {
+ std::map<Pass*, Timer> TimingData;
+ TimerGroup TG;
+
+public:
+ // Use 'create' member to get this.
+ TimingInfo() : TG("... Pass execution timing report ...") {}
+
+ // TimingDtor - Print out information about timing information
+ ~TimingInfo() {
+ // Delete all of the timers...
+ TimingData.clear();
+ // TimerGroup is deleted next, printing the report.
+ }
+
+ // createTheTimeInfo - This method either initializes the TheTimeInfo pointer
+  // to a non-null value (if the -time-passes option is enabled) or it leaves it
+ // null. It may be called multiple times.
+ static void createTheTimeInfo();
+
+ void passStarted(Pass *P) {
+ if (dynamic_cast<PMDataManager *>(P))
+ return;
+
+ std::map<Pass*, Timer>::iterator I = TimingData.find(P);
+ if (I == TimingData.end())
+ I=TimingData.insert(std::make_pair(P, Timer(P->getPassName(), TG))).first;
+ I->second.startTimer();
+ }
+ void passEnded(Pass *P) {
+ if (dynamic_cast<PMDataManager *>(P))
+ return;
+
+ std::map<Pass*, Timer>::iterator I = TimingData.find(P);
+ assert(I != TimingData.end() && "passStarted/passEnded not nested right!");
+ I->second.stopTimer();
+ }
+};
+
+} // End of anon namespace
+
+static TimingInfo *TheTimeInfo;
+
+//===----------------------------------------------------------------------===//
+// PMTopLevelManager implementation
+
+/// Initialize top level manager. Create first pass manager.
+PMTopLevelManager::PMTopLevelManager(enum TopLevelManagerType t) {
+ if (t == TLM_Pass) {
+ MPPassManager *MPP = new MPPassManager(1);
+ MPP->setTopLevelManager(this);
+ addPassManager(MPP);
+ activeStack.push(MPP);
+ } else if (t == TLM_Function) {
+ FPPassManager *FPP = new FPPassManager(1);
+ FPP->setTopLevelManager(this);
+ addPassManager(FPP);
+ activeStack.push(FPP);
+ }
+}
+
+/// Set pass P as the last user of the given analysis passes.
+void PMTopLevelManager::setLastUser(SmallVector<Pass *, 12> &AnalysisPasses,
+ Pass *P) {
+ for (SmallVector<Pass *, 12>::iterator I = AnalysisPasses.begin(),
+ E = AnalysisPasses.end(); I != E; ++I) {
+ Pass *AP = *I;
+ LastUser[AP] = P;
+
+ if (P == AP)
+ continue;
+
+ // If AP is the last user of other passes then make P last user of
+ // such passes.
+ for (DenseMap<Pass *, Pass *>::iterator LUI = LastUser.begin(),
+ LUE = LastUser.end(); LUI != LUE; ++LUI) {
+ if (LUI->second == AP)
+        // The DenseMap iterator is not invalidated here because
+        // this is just updating an existing entry.
+ LastUser[LUI->first] = P;
+ }
+ }
+}
+
+/// Collect passes whose last user is P
+void PMTopLevelManager::collectLastUses(SmallVector<Pass *, 12> &LastUses,
+ Pass *P) {
+ DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
+ InversedLastUser.find(P);
+ if (DMI == InversedLastUser.end())
+ return;
+
+ SmallPtrSet<Pass *, 8> &LU = DMI->second;
+ for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
+ E = LU.end(); I != E; ++I) {
+ LastUses.push_back(*I);
+ }
+
+}
+
+AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
+ AnalysisUsage *AnUsage = NULL;
+ DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
+ if (DMI != AnUsageMap.end())
+ AnUsage = DMI->second;
+ else {
+ AnUsage = new AnalysisUsage();
+ P->getAnalysisUsage(*AnUsage);
+ AnUsageMap[P] = AnUsage;
+ }
+ return AnUsage;
+}
+
+/// Schedule pass P for execution. Make sure that passes required by
+/// P are run before P is run. Update analysis info maintained by
+/// the manager. Remove dead passes. This is a recursive function.
+void PMTopLevelManager::schedulePass(Pass *P) {
+
+  // TODO: Allocate a function manager for this pass; otherwise the required
+  // set may be inserted into the previous function manager.
+
+ // Give pass a chance to prepare the stage.
+ P->preparePassManager(activeStack);
+
+ // If P is an analysis pass and it is available then do not
+ // generate the analysis again. Stale analysis info should not be
+ // available at this point.
+ if (P->getPassInfo() &&
+ P->getPassInfo()->isAnalysis() && findAnalysisPass(P->getPassInfo())) {
+ delete P;
+ return;
+ }
+
+ AnalysisUsage *AnUsage = findAnalysisUsage(P);
+
+ bool checkAnalysis = true;
+ while (checkAnalysis) {
+ checkAnalysis = false;
+
+ const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+ for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
+ E = RequiredSet.end(); I != E; ++I) {
+
+ Pass *AnalysisPass = findAnalysisPass(*I);
+ if (!AnalysisPass) {
+ AnalysisPass = (*I)->createPass();
+        if (P->getPotentialPassManagerType() ==
+            AnalysisPass->getPotentialPassManagerType())
+          // Schedule analysis pass that is managed by the same pass manager.
+          schedulePass(AnalysisPass);
+        else if (P->getPotentialPassManagerType() >
+                 AnalysisPass->getPotentialPassManagerType()) {
+          // Schedule analysis pass that is managed by a new manager.
+          schedulePass(AnalysisPass);
+          // Recheck analysis passes to ensure that required analyses that
+          // are already checked are still available.
+          checkAnalysis = true;
+        } else
+          // Do not schedule this analysis. Lower level analysis
+          // passes are run on the fly.
+          delete AnalysisPass;
+ }
+ }
+ }
+
+ // Now all required passes are available.
+ addTopLevelPass(P);
+}
+
+/// Find the pass that implements Analysis AID. Search immutable
+/// passes and all pass managers. If desired pass is not found
+/// then return NULL.
+Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
+
+ Pass *P = NULL;
+ // Check pass managers
+ for (SmallVector<PMDataManager *, 8>::iterator I = PassManagers.begin(),
+ E = PassManagers.end(); P == NULL && I != E; ++I) {
+ PMDataManager *PMD = *I;
+ P = PMD->findAnalysisPass(AID, false);
+ }
+
+ // Check other pass managers
+ for (SmallVector<PMDataManager *, 8>::iterator
+ I = IndirectPassManagers.begin(),
+ E = IndirectPassManagers.end(); P == NULL && I != E; ++I)
+ P = (*I)->findAnalysisPass(AID, false);
+
+ for (SmallVector<ImmutablePass *, 8>::iterator I = ImmutablePasses.begin(),
+ E = ImmutablePasses.end(); P == NULL && I != E; ++I) {
+ const PassInfo *PI = (*I)->getPassInfo();
+ if (PI == AID)
+ P = *I;
+
+    // If the pass was not found, check the interfaces implemented by the
+    // immutable pass.
+ if (!P) {
+ const std::vector<const PassInfo*> &ImmPI =
+ PI->getInterfacesImplemented();
+ if (std::find(ImmPI.begin(), ImmPI.end(), AID) != ImmPI.end())
+ P = *I;
+ }
+ }
+
+ return P;
+}
+
+// Print passes managed by this top level manager.
+void PMTopLevelManager::dumpPasses() const {
+
+ if (PassDebugging < Structure)
+ return;
+
+ // Print out the immutable passes
+ for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
+ ImmutablePasses[i]->dumpPassStructure(0);
+ }
+
+ // Every class that derives from PMDataManager also derives from Pass
+ // (sometimes indirectly), but there's no inheritance relationship
+ // between PMDataManager and Pass, so we have to dynamic_cast to get
+ // from a PMDataManager* to a Pass*.
+ for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(),
+ E = PassManagers.end(); I != E; ++I)
+ dynamic_cast<Pass *>(*I)->dumpPassStructure(1);
+}
+
+void PMTopLevelManager::dumpArguments() const {
+
+ if (PassDebugging < Arguments)
+ return;
+
+ cerr << "Pass Arguments: ";
+ for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(),
+ E = PassManagers.end(); I != E; ++I)
+ (*I)->dumpPassArguments();
+ cerr << "\n";
+}
+
+void PMTopLevelManager::initializeAllAnalysisInfo() {
+ for (SmallVector<PMDataManager *, 8>::iterator I = PassManagers.begin(),
+ E = PassManagers.end(); I != E; ++I)
+ (*I)->initializeAnalysisInfo();
+
+  // Initialize other pass managers
+ for (SmallVector<PMDataManager *, 8>::iterator I = IndirectPassManagers.begin(),
+ E = IndirectPassManagers.end(); I != E; ++I)
+ (*I)->initializeAnalysisInfo();
+
+ for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
+ DME = LastUser.end(); DMI != DME; ++DMI) {
+ DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
+ InversedLastUser.find(DMI->second);
+ if (InvDMI != InversedLastUser.end()) {
+ SmallPtrSet<Pass *, 8> &L = InvDMI->second;
+ L.insert(DMI->first);
+ } else {
+ SmallPtrSet<Pass *, 8> L; L.insert(DMI->first);
+ InversedLastUser[DMI->second] = L;
+ }
+ }
+}
+
+/// Destructor
+PMTopLevelManager::~PMTopLevelManager() {
+ for (SmallVector<PMDataManager *, 8>::iterator I = PassManagers.begin(),
+ E = PassManagers.end(); I != E; ++I)
+ delete *I;
+
+ for (SmallVector<ImmutablePass *, 8>::iterator
+ I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+ delete *I;
+
+ for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(),
+ DME = AnUsageMap.end(); DMI != DME; ++DMI)
+ delete DMI->second;
+}
+
+//===----------------------------------------------------------------------===//
+// PMDataManager implementation
+
+/// Augment AvailableAnalysis by adding analysis made available by pass P.
+void PMDataManager::recordAvailableAnalysis(Pass *P) {
+ const PassInfo *PI = P->getPassInfo();
+ if (PI == 0) return;
+
+ AvailableAnalysis[PI] = P;
+
+  // This pass is the current implementation of all of the interfaces it
+  // implements as well.
+ const std::vector<const PassInfo*> &II = PI->getInterfacesImplemented();
+ for (unsigned i = 0, e = II.size(); i != e; ++i)
+ AvailableAnalysis[II[i]] = P;
+}
+
+// Return true if P preserves high level analysis used by other
+// passes managed by this manager
+bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+ if (AnUsage->getPreservesAll())
+ return true;
+
+ const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+ for (SmallVector<Pass *, 8>::iterator I = HigherLevelAnalysis.begin(),
+ E = HigherLevelAnalysis.end(); I != E; ++I) {
+ Pass *P1 = *I;
+ if (!dynamic_cast<ImmutablePass*>(P1) &&
+ std::find(PreservedSet.begin(), PreservedSet.end(),
+ P1->getPassInfo()) ==
+ PreservedSet.end())
+ return false;
+ }
+
+ return true;
+}
+
+/// verifyPreservedAnalysis -- Verify analysis preserved by pass P.
+void PMDataManager::verifyPreservedAnalysis(Pass *P) {
+ // Don't do this unless assertions are enabled.
+#ifdef NDEBUG
+ return;
+#endif
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+ const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+
+ // Verify preserved analysis
+ for (AnalysisUsage::VectorType::const_iterator I = PreservedSet.begin(),
+ E = PreservedSet.end(); I != E; ++I) {
+ AnalysisID AID = *I;
+ if (Pass *AP = findAnalysisPass(AID, true))
+ AP->verifyAnalysis();
+ }
+}
+
+/// verifyDomInfo - Verify dominator information if it is available.
+void PMDataManager::verifyDomInfo(Pass &P, Function &F) {
+ if (!VerifyDomInfo || !P.getResolver())
+ return;
+
+ DominatorTree *DT = P.getAnalysisIfAvailable<DominatorTree>();
+ if (!DT)
+ return;
+
+ DominatorTree OtherDT;
+ OtherDT.getBase().recalculate(F);
+ if (DT->compare(OtherDT)) {
+ cerr << "Dominator Information for " << F.getNameStart() << "\n";
+ cerr << "Pass '" << P.getPassName() << "'\n";
+ cerr << "----- Valid -----\n";
+ OtherDT.dump();
+ cerr << "----- Invalid -----\n";
+ DT->dump();
+ assert(0 && "Invalid dominator info");
+ }
+
+ DominanceFrontier *DF = P.getAnalysisIfAvailable<DominanceFrontier>();
+ if (!DF)
+ return;
+
+ DominanceFrontier OtherDF;
+ std::vector<BasicBlock*> DTRoots = DT->getRoots();
+ OtherDF.calculate(*DT, DT->getNode(DTRoots[0]));
+ if (DF->compare(OtherDF)) {
+ cerr << "Dominator Information for " << F.getNameStart() << "\n";
+ cerr << "Pass '" << P.getPassName() << "'\n";
+ cerr << "----- Valid -----\n";
+ OtherDF.dump();
+ cerr << "----- Invalid -----\n";
+ DF->dump();
+ assert(0 && "Invalid dominator info");
+ }
+}
+
+/// Remove Analysis not preserved by Pass P
+void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+ if (AnUsage->getPreservesAll())
+ return;
+
+ const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+ for (std::map<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
+ E = AvailableAnalysis.end(); I != E; ) {
+ std::map<AnalysisID, Pass*>::iterator Info = I++;
+ if (!dynamic_cast<ImmutablePass*>(Info->second)
+ && std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+ PreservedSet.end()) {
+ // Remove this analysis
+ if (PassDebugging >= Details) {
+ Pass *S = Info->second;
+ cerr << " -- '" << P->getPassName() << "' is not preserving '";
+ cerr << S->getPassName() << "'\n";
+ }
+ AvailableAnalysis.erase(Info);
+ }
+ }
+
+ // Check inherited analysis also. If P is not preserving analysis
+ // provided by parent manager then remove it here.
+ for (unsigned Index = 0; Index < PMT_Last; ++Index) {
+
+ if (!InheritedAnalysis[Index])
+ continue;
+
+ for (std::map<AnalysisID, Pass*>::iterator
+ I = InheritedAnalysis[Index]->begin(),
+ E = InheritedAnalysis[Index]->end(); I != E; ) {
+ std::map<AnalysisID, Pass *>::iterator Info = I++;
+ if (!dynamic_cast<ImmutablePass*>(Info->second) &&
+ std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+ PreservedSet.end())
+ // Remove this analysis
+ InheritedAnalysis[Index]->erase(Info);
+ }
+ }
+}
+
+/// Remove analysis passes that are not used any longer
+void PMDataManager::removeDeadPasses(Pass *P, const char *Msg,
+ enum PassDebuggingString DBG_STR) {
+
+ SmallVector<Pass *, 12> DeadPasses;
+
+  // If this is an on-the-fly manager then it does not have a TPM.
+ if (!TPM)
+ return;
+
+ TPM->collectLastUses(DeadPasses, P);
+
+ if (PassDebugging >= Details && !DeadPasses.empty()) {
+ cerr << " -*- '" << P->getPassName();
+ cerr << "' is the last user of following pass instances.";
+ cerr << " Free these instances\n";
+ }
+
+ for (SmallVector<Pass *, 12>::iterator I = DeadPasses.begin(),
+ E = DeadPasses.end(); I != E; ++I) {
+
+ dumpPassInfo(*I, FREEING_MSG, DBG_STR, Msg);
+
+ {
+ // If the pass crashes releasing memory, remember this.
+ PassManagerPrettyStackEntry X(*I);
+
+ if (TheTimeInfo) TheTimeInfo->passStarted(*I);
+ (*I)->releaseMemory();
+ if (TheTimeInfo) TheTimeInfo->passEnded(*I);
+ }
+ if (const PassInfo *PI = (*I)->getPassInfo()) {
+ std::map<AnalysisID, Pass*>::iterator Pos =
+ AvailableAnalysis.find(PI);
+
+      // It is possible that the pass is already removed from AvailableAnalysis.
+ if (Pos != AvailableAnalysis.end())
+ AvailableAnalysis.erase(Pos);
+
+ // Remove all interfaces this pass implements, for which it is also
+ // listed as the available implementation.
+ const std::vector<const PassInfo*> &II = PI->getInterfacesImplemented();
+ for (unsigned i = 0, e = II.size(); i != e; ++i) {
+ Pos = AvailableAnalysis.find(II[i]);
+ if (Pos != AvailableAnalysis.end() && Pos->second == *I)
+ AvailableAnalysis.erase(Pos);
+ }
+ }
+ }
+}
+
+/// Add pass P into the PassVector. Update
+/// AvailableAnalysis appropriately if ProcessAnalysis is true.
+void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
+ // This manager is going to manage pass P. Set up analysis resolver
+ // to connect them.
+ AnalysisResolver *AR = new AnalysisResolver(*this);
+ P->setResolver(AR);
+
+  // If a FunctionPass F is the last user of ModulePass info M
+  // then F's manager, not F, records itself as a last user of M.
+ SmallVector<Pass *, 12> TransferLastUses;
+
+ if (!ProcessAnalysis) {
+ // Add pass
+ PassVector.push_back(P);
+ return;
+ }
+
+ // At the moment, this pass is the last user of all required passes.
+ SmallVector<Pass *, 12> LastUses;
+ SmallVector<Pass *, 8> RequiredPasses;
+ SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable;
+
+ unsigned PDepth = this->getDepth();
+
+ collectRequiredAnalysis(RequiredPasses,
+ ReqAnalysisNotAvailable, P);
+ for (SmallVector<Pass *, 8>::iterator I = RequiredPasses.begin(),
+ E = RequiredPasses.end(); I != E; ++I) {
+ Pass *PRequired = *I;
+ unsigned RDepth = 0;
+
+ assert(PRequired->getResolver() && "Analysis Resolver is not set");
+ PMDataManager &DM = PRequired->getResolver()->getPMDataManager();
+ RDepth = DM.getDepth();
+
+ if (PDepth == RDepth)
+ LastUses.push_back(PRequired);
+ else if (PDepth > RDepth) {
+      // Let the parent claim responsibility for the last use.
+      TransferLastUses.push_back(PRequired);
+      // Keep track of higher level analysis used by this manager.
+      HigherLevelAnalysis.push_back(PRequired);
+    } else
+      assert(0 && "Unable to accommodate Required Pass");
+ }
+
+ // Set P as P's last user until someone starts using P.
+ // However, if P is a Pass Manager then it does not need
+ // to record its last user.
+ if (!dynamic_cast<PMDataManager *>(P))
+ LastUses.push_back(P);
+ TPM->setLastUser(LastUses, P);
+
+ if (!TransferLastUses.empty()) {
+ Pass *My_PM = dynamic_cast<Pass *>(this);
+ TPM->setLastUser(TransferLastUses, My_PM);
+ TransferLastUses.clear();
+ }
+
+  // Now, take care of required analyses that are not available.
+ for (SmallVector<AnalysisID, 8>::iterator
+ I = ReqAnalysisNotAvailable.begin(),
+ E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
+ Pass *AnalysisPass = (*I)->createPass();
+ this->addLowerLevelRequiredPass(P, AnalysisPass);
+ }
+
+ // Take a note of analysis required and made available by this pass.
+ // Remove the analysis not preserved by this pass
+ removeNotPreservedAnalysis(P);
+ recordAvailableAnalysis(P);
+
+ // Add pass
+ PassVector.push_back(P);
+}
+
+
+/// Populate RP with the analysis passes that are required by
+/// pass P and are available. Populate RP_NotAvail with the analysis
+/// passes that are required by pass P but are not available.
+void PMDataManager::collectRequiredAnalysis(SmallVector<Pass *, 8>&RP,
+ SmallVector<AnalysisID, 8> &RP_NotAvail,
+ Pass *P) {
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+ const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+ for (AnalysisUsage::VectorType::const_iterator
+ I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
+ if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+ RP.push_back(AnalysisPass);
+ else
+ RP_NotAvail.push_back(*I);
+ }
+
+ const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+ for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+ E = IDs.end(); I != E; ++I) {
+ if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+ RP.push_back(AnalysisPass);
+ else
+ RP_NotAvail.push_back(*I);
+ }
+}
+
+// All Required analyses should be available to the pass as it runs! Here
+// we fill in the AnalysisImpls member of the pass so that it can
+// successfully use the getAnalysis() method to retrieve the
+// implementations it needs.
+//
+void PMDataManager::initializeAnalysisImpl(Pass *P) {
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+
+ for (AnalysisUsage::VectorType::const_iterator
+ I = AnUsage->getRequiredSet().begin(),
+ E = AnUsage->getRequiredSet().end(); I != E; ++I) {
+ Pass *Impl = findAnalysisPass(*I, true);
+ if (Impl == 0)
+      // This may be an analysis pass that is initialized on the fly.
+ // If that is not the case then it will raise an assert when it is used.
+ continue;
+ AnalysisResolver *AR = P->getResolver();
+ assert(AR && "Analysis Resolver is not set");
+ AR->addAnalysisImplsPair(*I, Impl);
+ }
+}
+
+/// Find the pass that implements Analysis AID. If desired pass is not found
+/// then return NULL.
+Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
+
+  // Check if the AvailableAnalysis map has an entry for AID.
+ std::map<AnalysisID, Pass*>::const_iterator I = AvailableAnalysis.find(AID);
+
+ if (I != AvailableAnalysis.end())
+ return I->second;
+
+ // Search Parents through TopLevelManager
+ if (SearchParent)
+ return TPM->findAnalysisPass(AID);
+
+ return NULL;
+}
+
+// Print list of passes that are last used by P.
+void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const {
+
+  SmallVector<Pass *, 12> LUses;
+
+  // If this is an on-the-fly manager then it does not have a TPM.
+ if (!TPM)
+ return;
+
+ TPM->collectLastUses(LUses, P);
+
+ for (SmallVector<Pass *, 12>::iterator I = LUses.begin(),
+ E = LUses.end(); I != E; ++I) {
+ llvm::cerr << "--" << std::string(Offset*2, ' ');
+ (*I)->dumpPassStructure(0);
+ }
+}
+
+void PMDataManager::dumpPassArguments() const {
+ for (SmallVector<Pass *, 8>::const_iterator I = PassVector.begin(),
+ E = PassVector.end(); I != E; ++I) {
+ if (PMDataManager *PMD = dynamic_cast<PMDataManager *>(*I))
+ PMD->dumpPassArguments();
+ else
+ if (const PassInfo *PI = (*I)->getPassInfo())
+ if (!PI->isAnalysisGroup())
+ cerr << " -" << PI->getPassArgument();
+ }
+}
+
+void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
+ enum PassDebuggingString S2,
+ const char *Msg) {
+ if (PassDebugging < Executions)
+ return;
+ cerr << (void*)this << std::string(getDepth()*2+1, ' ');
+ switch (S1) {
+ case EXECUTION_MSG:
+ cerr << "Executing Pass '" << P->getPassName();
+ break;
+ case MODIFICATION_MSG:
+ cerr << "Made Modification '" << P->getPassName();
+ break;
+ case FREEING_MSG:
+ cerr << " Freeing Pass '" << P->getPassName();
+ break;
+ default:
+ break;
+ }
+ switch (S2) {
+ case ON_BASICBLOCK_MSG:
+ cerr << "' on BasicBlock '" << Msg << "'...\n";
+ break;
+ case ON_FUNCTION_MSG:
+ cerr << "' on Function '" << Msg << "'...\n";
+ break;
+ case ON_MODULE_MSG:
+ cerr << "' on Module '" << Msg << "'...\n";
+ break;
+ case ON_LOOP_MSG:
+ cerr << "' on Loop " << Msg << "'...\n";
+ break;
+ case ON_CG_MSG:
+ cerr << "' on Call Graph " << Msg << "'...\n";
+ break;
+ default:
+ break;
+ }
+}
+
+void PMDataManager::dumpRequiredSet(const Pass *P) const {
+ if (PassDebugging < Details)
+ return;
+
+ AnalysisUsage analysisUsage;
+ P->getAnalysisUsage(analysisUsage);
+ dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
+}
+
+void PMDataManager::dumpPreservedSet(const Pass *P) const {
+ if (PassDebugging < Details)
+ return;
+
+ AnalysisUsage analysisUsage;
+ P->getAnalysisUsage(analysisUsage);
+ dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
+}
+
+void PMDataManager::dumpAnalysisUsage(const char *Msg, const Pass *P,
+ const AnalysisUsage::VectorType &Set) const {
+ assert(PassDebugging >= Details);
+ if (Set.empty())
+ return;
+ cerr << (void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
+ for (unsigned i = 0; i != Set.size(); ++i) {
+ if (i) cerr << ",";
+ cerr << " " << Set[i]->getPassName();
+ }
+ cerr << "\n";
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+/// This should be handled by specific pass manager.
+void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+ if (TPM) {
+ TPM->dumpArguments();
+ TPM->dumpPasses();
+ }
+
+  // A module level pass may require function level analysis info
+  // (e.g. dominator info). The pass manager uses an on-the-fly function pass
+  // manager to provide this on demand. In that case, in pass manager
+  // terminology, the module level pass requires lower level analysis info
+  // managed by a lower level pass manager.
+
+  // When the pass manager is not able to order the required analysis info, it
+  // checks whether any lower level manager will be able to provide the
+  // analysis info on demand.
+#ifndef NDEBUG
+ cerr << "Unable to schedule '" << RequiredPass->getPassName();
+ cerr << "' required by '" << P->getPassName() << "'\n";
+#endif
+ assert(0 && "Unable to schedule pass");
+}
+
+// Destructor
+PMDataManager::~PMDataManager() {
+ for (SmallVector<Pass *, 8>::iterator I = PassVector.begin(),
+ E = PassVector.end(); I != E; ++I)
+ delete *I;
+}
+
+//===----------------------------------------------------------------------===//
+// NOTE: Is this the right place to define this method?
+// getAnalysisIfAvailable - Return analysis result or null if it doesn't exist.
+Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
+ return PM.findAnalysisPass(ID, dir);
+}
+
+Pass *AnalysisResolver::findImplPass(Pass *P, const PassInfo *AnalysisPI,
+ Function &F) {
+ return PM.getOnTheFlyPass(P, AnalysisPI, F);
+}
+
+//===----------------------------------------------------------------------===//
+// BBPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnBasicBlock method. Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool BBPassManager::runOnFunction(Function &F) {
+ if (F.isDeclaration())
+ return false;
+
+ bool Changed = doInitialization(F);
+
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ BasicBlockPass *BP = getContainedPass(Index);
+
+ dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getNameStart());
+ dumpRequiredSet(BP);
+
+ initializeAnalysisImpl(BP);
+
+ {
+ // If the pass crashes, remember this.
+ PassManagerPrettyStackEntry X(BP, *I);
+
+ if (TheTimeInfo) TheTimeInfo->passStarted(BP);
+ Changed |= BP->runOnBasicBlock(*I);
+ if (TheTimeInfo) TheTimeInfo->passEnded(BP);
+ }
+
+ if (Changed)
+ dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
+ I->getNameStart());
+ dumpPreservedSet(BP);
+
+ verifyPreservedAnalysis(BP);
+ removeNotPreservedAnalysis(BP);
+ recordAvailableAnalysis(BP);
+ removeDeadPasses(BP, I->getNameStart(), ON_BASICBLOCK_MSG);
+ }
+
+ return Changed |= doFinalization(F);
+}
+
+// Implement doInitialization and doFinalization
+bool BBPassManager::doInitialization(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+ Changed |= getContainedPass(Index)->doInitialization(M);
+
+ return Changed;
+}
+
+bool BBPassManager::doFinalization(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+ Changed |= getContainedPass(Index)->doFinalization(M);
+
+ return Changed;
+}
+
+bool BBPassManager::doInitialization(Function &F) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ BasicBlockPass *BP = getContainedPass(Index);
+ Changed |= BP->doInitialization(F);
+ }
+
+ return Changed;
+}
+
+bool BBPassManager::doFinalization(Function &F) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ BasicBlockPass *BP = getContainedPass(Index);
+ Changed |= BP->doFinalization(F);
+ }
+
+ return Changed;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManager implementation
+
+/// Create new Function pass manager
+FunctionPassManager::FunctionPassManager(ModuleProvider *P) {
+ FPM = new FunctionPassManagerImpl(0);
+ // FPM is the top level manager.
+ FPM->setTopLevelManager(FPM);
+
+ AnalysisResolver *AR = new AnalysisResolver(*FPM);
+ FPM->setResolver(AR);
+
+ MP = P;
+}
+
+FunctionPassManager::~FunctionPassManager() {
+ delete FPM;
+}
+
+/// add - Add a pass to the queue of passes to run. This passes
+/// ownership of the Pass to the PassManager. When the
+/// PassManager is destroyed, the pass will be destroyed as well, so
+/// there is no need to delete the pass. (TODO delete passes.)
+/// This implies that all passes MUST be allocated with 'new'.
+void FunctionPassManager::add(Pass *P) {
+ FPM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution. Keep
+/// track of whether any of the passes modifies the function, and if
+/// so, return true.
+///
+bool FunctionPassManager::run(Function &F) {
+ std::string errstr;
+ if (MP->materializeFunction(&F, &errstr)) {
+ cerr << "Error reading bitcode file: " << errstr << "\n";
+ abort();
+ }
+ return FPM->run(F);
+}
+
+
+/// doInitialization - Run all of the initializers for the function passes.
+///
+bool FunctionPassManager::doInitialization() {
+ return FPM->doInitialization(*MP->getModule());
+}
+
+/// doFinalization - Run all of the finalizers for the function passes.
+///
+bool FunctionPassManager::doFinalization() {
+ return FPM->doFinalization(*MP->getModule());
+}
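+
+// An illustrative driver sketch for the interface above, assuming an already
+// constructed ModuleProvider *MP (error handling elided):
+//
+//   FunctionPassManager FPM(MP);
+//   FPM.add(createPrintFunctionPass("; processed:\n", &errs(), false));
+//   FPM.doInitialization();
+//   for (Module::iterator F = MP->getModule()->begin(),
+//        E = MP->getModule()->end(); F != E; ++F)
+//     FPM.run(*F);
+//   FPM.doFinalization();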
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl implementation
+//
+bool FunctionPassManagerImpl::doInitialization(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ Changed |= getContainedManager(Index)->doInitialization(M);
+
+ return Changed;
+}
+
+bool FunctionPassManagerImpl::doFinalization(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ Changed |= getContainedManager(Index)->doFinalization(M);
+
+ return Changed;
+}
+
+/// cleanup - After running all passes, clean up pass manager cache.
+void FPPassManager::cleanup() {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ FunctionPass *FP = getContainedPass(Index);
+ AnalysisResolver *AR = FP->getResolver();
+ assert(AR && "Analysis Resolver is not set");
+ AR->clearAnalysisImpls();
+ }
+}
+
+// Execute all the passes managed by this top level manager.
+// Return true if any function is modified by a pass.
+bool FunctionPassManagerImpl::run(Function &F) {
+ bool Changed = false;
+ TimingInfo::createTheTimeInfo();
+
+ dumpArguments();
+ dumpPasses();
+
+ initializeAllAnalysisInfo();
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ Changed |= getContainedManager(Index)->runOnFunction(F);
+
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ getContainedManager(Index)->cleanup();
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// FPPassManager implementation
+
+char FPPassManager::ID = 0;
+/// Print passes managed by this manager
+void FPPassManager::dumpPassStructure(unsigned Offset) {
+ llvm::cerr << std::string(Offset*2, ' ') << "FunctionPass Manager\n";
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ FunctionPass *FP = getContainedPass(Index);
+ FP->dumpPassStructure(Offset + 1);
+ dumpLastUses(FP, Offset+1);
+ }
+}
+
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnFunction method. Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool FPPassManager::runOnFunction(Function &F) {
+ if (F.isDeclaration())
+ return false;
+
+ bool Changed = false;
+
+ // Collect inherited analysis from Module level pass manager.
+ populateInheritedAnalysis(TPM->activeStack);
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ FunctionPass *FP = getContainedPass(Index);
+
+ dumpPassInfo(FP, EXECUTION_MSG, ON_FUNCTION_MSG, F.getNameStart());
+ dumpRequiredSet(FP);
+
+ initializeAnalysisImpl(FP);
+
+ {
+ PassManagerPrettyStackEntry X(FP, F);
+
+ if (TheTimeInfo) TheTimeInfo->passStarted(FP);
+ Changed |= FP->runOnFunction(F);
+ if (TheTimeInfo) TheTimeInfo->passEnded(FP);
+ }
+
+ if (Changed)
+ dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getNameStart());
+ dumpPreservedSet(FP);
+
+ verifyPreservedAnalysis(FP);
+ removeNotPreservedAnalysis(FP);
+ recordAvailableAnalysis(FP);
+ removeDeadPasses(FP, F.getNameStart(), ON_FUNCTION_MSG);
+
+ // If dominator information is available then verify the info if requested.
+ verifyDomInfo(*FP, F);
+ }
+ return Changed;
+}
+
+bool FPPassManager::runOnModule(Module &M) {
+ bool Changed = doInitialization(M);
+
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    Changed |= runOnFunction(*I);
+
+ return Changed |= doFinalization(M);
+}
+
+bool FPPassManager::doInitialization(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+ Changed |= getContainedPass(Index)->doInitialization(M);
+
+ return Changed;
+}
+
+bool FPPassManager::doFinalization(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+ Changed |= getContainedPass(Index)->doFinalization(M);
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// MPPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnModule method. Keep track of whether any of the passes modifies
+/// the module, and if so, return true.
+bool
+MPPassManager::runOnModule(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ ModulePass *MP = getContainedPass(Index);
+
+ dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG,
+ M.getModuleIdentifier().c_str());
+ dumpRequiredSet(MP);
+
+ initializeAnalysisImpl(MP);
+
+ {
+ PassManagerPrettyStackEntry X(MP, M);
+ if (TheTimeInfo) TheTimeInfo->passStarted(MP);
+ Changed |= MP->runOnModule(M);
+ if (TheTimeInfo) TheTimeInfo->passEnded(MP);
+ }
+
+ if (Changed)
+ dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
+ M.getModuleIdentifier().c_str());
+ dumpPreservedSet(MP);
+
+ verifyPreservedAnalysis(MP);
+ removeNotPreservedAnalysis(MP);
+ recordAvailableAnalysis(MP);
+ removeDeadPasses(MP, M.getModuleIdentifier().c_str(), ON_MODULE_MSG);
+ }
+ return Changed;
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+ assert(P->getPotentialPassManagerType() == PMT_ModulePassManager &&
+ "Unable to handle Pass that requires lower level Analysis pass");
+ assert((P->getPotentialPassManagerType() <
+ RequiredPass->getPotentialPassManagerType()) &&
+ "Unable to handle Pass that requires lower level Analysis pass");
+
+ FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
+ if (!FPP) {
+ FPP = new FunctionPassManagerImpl(0);
+ // FPP is the top level manager.
+ FPP->setTopLevelManager(FPP);
+
+ OnTheFlyManagers[P] = FPP;
+ }
+ FPP->add(RequiredPass);
+
+ // Register P as the last user of RequiredPass.
+ SmallVector<Pass *, 12> LU;
+ LU.push_back(RequiredPass);
+ FPP->setLastUser(LU, P);
+}
+
+/// Return the function pass corresponding to PassInfo PI that is required
+/// by module pass MP. Instantiate the analysis pass on the fly by running
+/// its runOnFunction() on function F.
+Pass* MPPassManager::getOnTheFlyPass(Pass *MP, const PassInfo *PI, Function &F){
+ FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
+ assert(FPP && "Unable to find on the fly pass");
+
+ FPP->run(F);
+ return (dynamic_cast<PMTopLevelManager *>(FPP))->findAnalysisPass(PI);
+}
+
+
+//===----------------------------------------------------------------------===//
+// PassManagerImpl implementation
+//
+/// run - Execute all of the passes scheduled for execution. Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManagerImpl::run(Module &M) {
+ bool Changed = false;
+ TimingInfo::createTheTimeInfo();
+
+ dumpArguments();
+ dumpPasses();
+
+ initializeAllAnalysisInfo();
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ Changed |= getContainedManager(Index)->runOnModule(M);
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// PassManager implementation
+
+/// Create new pass manager
+PassManager::PassManager() {
+ PM = new PassManagerImpl(0);
+ // PM is the top level manager
+ PM->setTopLevelManager(PM);
+}
+
+PassManager::~PassManager() {
+ delete PM;
+}
+
+/// add - Add a pass to the queue of passes to run. This passes ownership of
+/// the Pass to the PassManager. When the PassManager is destroyed, the pass
+/// will be destroyed as well, so there is no need to delete the pass. This
+/// implies that all passes MUST be allocated with 'new'.
+void PassManager::add(Pass *P) {
+ PM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution. Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManager::run(Module &M) {
+ return PM->run(M);
+}
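+
+// An illustrative usage sketch of the whole-module interface (names are from
+// this file and PrintModulePass.cpp later in this patch; assumes a loaded
+// Module &M):
+//
+//   PassManager PM;
+//   PM.add(createPrintModulePass(&errs(), false));
+//   PM.run(M);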
+
+//===----------------------------------------------------------------------===//
+// TimingInfo Class - This class is used to calculate information about the
+// amount of time each pass takes to execute. This only happens when
+// -time-passes is enabled on the command line.
+//
+bool llvm::TimePassesIsEnabled = false;
+static cl::opt<bool,true>
+EnableTiming("time-passes", cl::location(TimePassesIsEnabled),
+ cl::desc("Time each pass, printing elapsed time for each on exit"));
+
+// createTheTimeInfo - This method either initializes the TheTimeInfo pointer to
+// a non-null value (if the -time-passes option is enabled) or it leaves it
+// null. It may be called multiple times.
+void TimingInfo::createTheTimeInfo() {
+ if (!TimePassesIsEnabled || TheTimeInfo) return;
+
+ // Constructed the first time this is called, iff -time-passes is enabled.
+ // This guarantees that the object will be constructed before static globals,
+ // thus it will be destroyed before them.
+ static ManagedStatic<TimingInfo> TTI;
+ TheTimeInfo = &*TTI;
+}
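+
+// For example, 'opt -time-passes foo.bc -o /dev/null' prints a per-pass
+// timing report when the tool exits.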
+
+/// If TimingInfo is enabled then start pass timer.
+void StartPassTimer(Pass *P) {
+ if (TheTimeInfo)
+ TheTimeInfo->passStarted(P);
+}
+
+/// If TimingInfo is enabled then stop pass timer.
+void StopPassTimer(Pass *P) {
+ if (TheTimeInfo)
+ TheTimeInfo->passEnded(P);
+}
+
+//===----------------------------------------------------------------------===//
+// PMStack implementation
+//
+
+// Pop Pass Manager from the stack and clear its analysis info.
+void PMStack::pop() {
+
+ PMDataManager *Top = this->top();
+ Top->initializeAnalysisInfo();
+
+ S.pop_back();
+}
+
+// Push PM on the stack and set its top level manager.
+void PMStack::push(PMDataManager *PM) {
+ assert(PM && "Unable to push. Pass Manager expected");
+
+ if (!this->empty()) {
+ PMTopLevelManager *TPM = this->top()->getTopLevelManager();
+
+ assert(TPM && "Unable to find top level manager");
+ TPM->addIndirectPassManager(PM);
+ PM->setTopLevelManager(TPM);
+ }
+
+ S.push_back(PM);
+}
+
+// Dump content of the pass manager stack.
+void PMStack::dump() {
+ for (std::deque<PMDataManager *>::iterator I = S.begin(),
+ E = S.end(); I != E; ++I)
+ printf("%s ", dynamic_cast<Pass *>(*I)->getPassName());
+
+ if (!S.empty())
+ printf("\n");
+}
+
+/// Find appropriate Module Pass Manager in the PM Stack and
+/// add self into that manager.
+void ModulePass::assignPassManager(PMStack &PMS,
+ PassManagerType PreferredType) {
+ // Find Module Pass Manager
+ while(!PMS.empty()) {
+ PassManagerType TopPMType = PMS.top()->getPassManagerType();
+ if (TopPMType == PreferredType)
+ break; // We found desired pass manager
+ else if (TopPMType > PMT_ModulePassManager)
+ PMS.pop(); // Pop children pass managers
+ else
+ break;
+ }
+ assert(!PMS.empty() && "Unable to find appropriate Pass Manager");
+ PMS.top()->add(this);
+}
+
+/// Find appropriate Function Pass Manager or Call Graph Pass Manager
+/// in the PM Stack and add self into that manager.
+void FunctionPass::assignPassManager(PMStack &PMS,
+ PassManagerType PreferredType) {
+
+  // Find Function Pass Manager
+ while(!PMS.empty()) {
+ if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager)
+ PMS.pop();
+ else
+ break;
+ }
+ FPPassManager *FPP = dynamic_cast<FPPassManager *>(PMS.top());
+
+ // Create new Function Pass Manager
+ if (!FPP) {
+ assert(!PMS.empty() && "Unable to create Function Pass Manager");
+ PMDataManager *PMD = PMS.top();
+
+ // [1] Create new Function Pass Manager
+ FPP = new FPPassManager(PMD->getDepth() + 1);
+ FPP->populateInheritedAnalysis(PMS);
+
+ // [2] Set up new manager's top level manager
+ PMTopLevelManager *TPM = PMD->getTopLevelManager();
+ TPM->addIndirectPassManager(FPP);
+
+ // [3] Assign manager to manage this new manager. This may create
+ // and push new managers into PMS
+ FPP->assignPassManager(PMS, PMD->getPassManagerType());
+
+ // [4] Push new manager into PMS
+ PMS.push(FPP);
+ }
+
+ // Assign FPP as the manager of this pass.
+ FPP->add(this);
+}
+
+/// Find the appropriate Basic Block Pass Manager in the PM Stack and
+/// add self into that manager.
+void BasicBlockPass::assignPassManager(PMStack &PMS,
+ PassManagerType PreferredType) {
+ BBPassManager *BBP = NULL;
+
+  // The Basic Block Pass Manager is a leaf pass manager. It does not handle
+  // any other pass manager.
+ if (!PMS.empty())
+ BBP = dynamic_cast<BBPassManager *>(PMS.top());
+
+  // If the leaf manager is not a Basic Block Pass Manager then create a
+  // new Basic Block Pass Manager.
+
+ if (!BBP) {
+ assert(!PMS.empty() && "Unable to create BasicBlock Pass Manager");
+ PMDataManager *PMD = PMS.top();
+
+ // [1] Create new Basic Block Manager
+ BBP = new BBPassManager(PMD->getDepth() + 1);
+
+ // [2] Set up new manager's top level manager
+ // Basic Block Pass Manager does not live by itself
+ PMTopLevelManager *TPM = PMD->getTopLevelManager();
+ TPM->addIndirectPassManager(BBP);
+
+ // [3] Assign manager to manage this new manager. This may create
+ // and push new managers into PMS
+ BBP->assignPassManager(PMS);
+
+ // [4] Push new manager into PMS
+ PMS.push(BBP);
+ }
+
+ // Assign BBP as the manager of this pass.
+ BBP->add(this);
+}
+
+PassManagerBase::~PassManagerBase() {}
+
+/*===-- C Bindings --------------------------------------------------------===*/
+
+LLVMPassManagerRef LLVMCreatePassManager() {
+ return wrap(new PassManager());
+}
+
+LLVMPassManagerRef LLVMCreateFunctionPassManager(LLVMModuleProviderRef P) {
+ return wrap(new FunctionPassManager(unwrap(P)));
+}
+
+int LLVMRunPassManager(LLVMPassManagerRef PM, LLVMModuleRef M) {
+ return unwrap<PassManager>(PM)->run(*unwrap(M));
+}
+
+int LLVMInitializeFunctionPassManager(LLVMPassManagerRef FPM) {
+ return unwrap<FunctionPassManager>(FPM)->doInitialization();
+}
+
+int LLVMRunFunctionPassManager(LLVMPassManagerRef FPM, LLVMValueRef F) {
+ return unwrap<FunctionPassManager>(FPM)->run(*unwrap<Function>(F));
+}
+
+int LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM) {
+ return unwrap<FunctionPassManager>(FPM)->doFinalization();
+}
+
+void LLVMDisposePassManager(LLVMPassManagerRef PM) {
+ delete unwrap(PM);
+}
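+
+// An illustrative sketch of driving the C bindings above (assumes an existing
+// LLVMModuleRef Mod; the llvm-c calls that would add passes are omitted):
+//
+//   LLVMPassManagerRef PM = LLVMCreatePassManager();
+//   LLVMRunPassManager(PM, Mod);  // returns nonzero if the module changed
+//   LLVMDisposePassManager(PM);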
diff --git a/lib/VMCore/PrintModulePass.cpp b/lib/VMCore/PrintModulePass.cpp
new file mode 100644
index 0000000..0a7f449
--- /dev/null
+++ b/lib/VMCore/PrintModulePass.cpp
@@ -0,0 +1,99 @@
+//===--- VMCore/PrintModulePass.cpp - Module/Function Printer -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// PrintModulePass and PrintFunctionPass implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Assembly/PrintModulePass.h"
+
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+
+ class VISIBILITY_HIDDEN PrintModulePass : public ModulePass {
+ raw_ostream *Out; // raw_ostream to print on
+ bool DeleteStream; // Delete the ostream in our dtor?
+ public:
+ static char ID;
+ PrintModulePass() : ModulePass(&ID), Out(&errs()),
+ DeleteStream(false) {}
+ PrintModulePass(raw_ostream *o, bool DS)
+ : ModulePass(&ID), Out(o), DeleteStream(DS) {}
+
+ ~PrintModulePass() {
+ if (DeleteStream) delete Out;
+ }
+
+ bool runOnModule(Module &M) {
+ (*Out) << M;
+ return false;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+
+ class PrintFunctionPass : public FunctionPass {
+ std::string Banner; // String to print before each function
+ raw_ostream *Out; // raw_ostream to print on
+ bool DeleteStream; // Delete the ostream in our dtor?
+ public:
+ static char ID;
+ PrintFunctionPass() : FunctionPass(&ID), Banner(""), Out(&errs()),
+ DeleteStream(false) {}
+ PrintFunctionPass(const std::string &B, raw_ostream *o, bool DS)
+ : FunctionPass(&ID), Banner(B), Out(o), DeleteStream(DS) {}
+
+ inline ~PrintFunctionPass() {
+ if (DeleteStream) delete Out;
+ }
+
+ // runOnFunction - This pass just prints a banner followed by the
+ // function as it's processed.
+ //
+ bool runOnFunction(Function &F) {
+ (*Out) << Banner << static_cast<Value&>(F);
+ return false;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char PrintModulePass::ID = 0;
+static RegisterPass<PrintModulePass>
+X("print-module", "Print module to stderr");
+char PrintFunctionPass::ID = 0;
+static RegisterPass<PrintFunctionPass>
+Y("print-function","Print function to stderr");
+
+/// createPrintModulePass - Create and return a pass that writes the
+/// module to the specified raw_ostream.
+ModulePass *llvm::createPrintModulePass(llvm::raw_ostream *OS,
+ bool DeleteStream) {
+ return new PrintModulePass(OS, DeleteStream);
+}
+
+/// createPrintFunctionPass - Create and return a pass that prints
+/// functions to the specified raw_ostream as they are processed.
+FunctionPass *llvm::createPrintFunctionPass(const std::string &Banner,
+ llvm::raw_ostream *OS,
+ bool DeleteStream) {
+ return new PrintFunctionPass(Banner, OS, DeleteStream);
+}
+
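+// A minimal usage sketch (illustrative only): printing a module to stderr
+// through a PassManager. The pass deletes the stream only when DeleteStream
+// is true, so handing it &errs() with 'false' is safe:
+//
+//   PassManager PM;
+//   PM.add(createPrintModulePass(&errs(), /*DeleteStream=*/false));
+//   PM.run(M);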
diff --git a/lib/VMCore/SymbolTableListTraitsImpl.h b/lib/VMCore/SymbolTableListTraitsImpl.h
new file mode 100644
index 0000000..72687bb
--- /dev/null
+++ b/lib/VMCore/SymbolTableListTraitsImpl.h
@@ -0,0 +1,118 @@
+//===-- llvm/SymbolTableListTraitsImpl.h - Implementation ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the stickier parts of the SymbolTableListTraits class,
+// and is explicitly instantiated where needed to avoid defining all this code
+// in a widely used header.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYMBOLTABLELISTTRAITS_IMPL_H
+#define LLVM_SYMBOLTABLELISTTRAITS_IMPL_H
+
+#include "llvm/SymbolTableListTraits.h"
+#include "llvm/ValueSymbolTable.h"
+
+namespace llvm {
+
+/// setSymTabObject - This is called when (e.g.) the parent of a basic block
+/// changes. This requires us to remove all the instruction symtab entries from
+/// the current function and reinsert them into the new function.
+template<typename ValueSubClass, typename ItemParentClass>
+template<typename TPtr>
+void SymbolTableListTraits<ValueSubClass,ItemParentClass>
+::setSymTabObject(TPtr *Dest, TPtr Src) {
+ // Get the old symtab and value list before doing the assignment.
+ ValueSymbolTable *OldST = TraitsClass::getSymTab(getListOwner());
+
+ // Do it.
+ *Dest = Src;
+
+ // Get the new SymTab object.
+ ValueSymbolTable *NewST = TraitsClass::getSymTab(getListOwner());
+
+ // If there is nothing to do, quick exit.
+ if (OldST == NewST) return;
+
+ // Move all the elements from the old symtab to the new one.
+ iplist<ValueSubClass> &ItemList = TraitsClass::getList(getListOwner());
+ if (ItemList.empty()) return;
+
+ if (OldST) {
+ // Remove all entries from the previous symtab.
+ for (typename iplist<ValueSubClass>::iterator I = ItemList.begin();
+ I != ItemList.end(); ++I)
+ if (I->hasName())
+ OldST->removeValueName(I->getValueName());
+ }
+
+ if (NewST) {
+ // Add all of the items to the new symtab.
+ for (typename iplist<ValueSubClass>::iterator I = ItemList.begin();
+ I != ItemList.end(); ++I)
+ if (I->hasName())
+ NewST->reinsertValue(I);
+ }
+
+}
+
+template<typename ValueSubClass, typename ItemParentClass>
+void SymbolTableListTraits<ValueSubClass,ItemParentClass>
+::addNodeToList(ValueSubClass *V) {
+ assert(V->getParent() == 0 && "Value already in a container!!");
+ ItemParentClass *Owner = getListOwner();
+ V->setParent(Owner);
+ if (V->hasName())
+ if (ValueSymbolTable *ST = TraitsClass::getSymTab(Owner))
+ ST->reinsertValue(V);
+}
+
+template<typename ValueSubClass, typename ItemParentClass>
+void SymbolTableListTraits<ValueSubClass,ItemParentClass>
+::removeNodeFromList(ValueSubClass *V) {
+ V->setParent(0);
+ if (V->hasName())
+ if (ValueSymbolTable *ST = TraitsClass::getSymTab(getListOwner()))
+ ST->removeValueName(V->getValueName());
+}
+
+template<typename ValueSubClass, typename ItemParentClass>
+void SymbolTableListTraits<ValueSubClass,ItemParentClass>
+::transferNodesFromList(ilist_traits<ValueSubClass> &L2,
+ ilist_iterator<ValueSubClass> first,
+ ilist_iterator<ValueSubClass> last) {
+ // We only have to do work here if transferring instructions between BBs
+ ItemParentClass *NewIP = getListOwner(), *OldIP = L2.getListOwner();
+ if (NewIP == OldIP) return; // No work to do at all...
+
+ // We only have to update symbol table entries if we are transferring the
+ // instructions to a different symtab object...
+ ValueSymbolTable *NewST = TraitsClass::getSymTab(NewIP);
+ ValueSymbolTable *OldST = TraitsClass::getSymTab(OldIP);
+ if (NewST != OldST) {
+ for (; first != last; ++first) {
+ ValueSubClass &V = *first;
+ bool HasName = V.hasName();
+ if (OldST && HasName)
+ OldST->removeValueName(V.getValueName());
+ V.setParent(NewIP);
+ if (NewST && HasName)
+ NewST->reinsertValue(&V);
+ }
+ } else {
+ // Just transferring between blocks in the same function, simply update the
+ // parent fields in the instructions...
+ for (; first != last; ++first)
+ first->setParent(NewIP);
+ }
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
new file mode 100644
index 0000000..11b93ff
--- /dev/null
+++ b/lib/VMCore/Type.cpp
@@ -0,0 +1,1457 @@
+//===-- Type.cpp - Implement the Type class -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Type class for the VMCore library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DerivedTypes.h"
+#include "llvm/Constants.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdarg>
+using namespace llvm;
+
+// DEBUG_MERGE_TYPES - Enable this #define to see how and when derived types are
+// created and later destroyed, all in an effort to make sure that there is only
+// a single canonical version of a type.
+//
+// #define DEBUG_MERGE_TYPES 1
+
+AbstractTypeUser::~AbstractTypeUser() {}
+
+
+//===----------------------------------------------------------------------===//
+// Type Class Implementation
+//===----------------------------------------------------------------------===//
+
+// Concrete/Abstract TypeDescriptions - We lazily calculate type descriptions
+// for types as they are needed. Because resolution of types must invalidate
+// all of the abstract type descriptions, we keep them in a separate map to make
+// this easy.
+static ManagedStatic<TypePrinting> ConcreteTypeDescriptions;
+static ManagedStatic<TypePrinting> AbstractTypeDescriptions;
+
+/// Because of the way Type subclasses are allocated, this function is necessary
+/// to use the correct kind of "delete" operator to deallocate the Type object.
+/// Some type objects (FunctionTy, StructTy) allocate additional space after
+/// the space for their derived type to hold the contained types array of
+/// PATypeHandles. Using this allocation scheme means all the PATypeHandles are
+/// allocated with the type object, decreasing allocations and eliminating the
+/// need for a std::vector to be used in the Type class itself.
+/// @brief Type destruction function
+void Type::destroy() const {
+
+ // Structures and Functions allocate their contained types past the end of
+ // the type object itself. These need to be destroyed differently than the
+ // other types.
+ if (isa<FunctionType>(this) || isa<StructType>(this)) {
+ // First, make sure we destruct any PATypeHandles allocated by these
+ // subclasses. They must be manually destructed.
+ for (unsigned i = 0; i < NumContainedTys; ++i)
+ ContainedTys[i].PATypeHandle::~PATypeHandle();
+
+ // Now call the destructor for the subclass directly because we're going
+ // to delete this as an array of char.
+ if (isa<FunctionType>(this))
+ static_cast<const FunctionType*>(this)->FunctionType::~FunctionType();
+ else
+ static_cast<const StructType*>(this)->StructType::~StructType();
+
+ // Finally, remove the memory as an array deallocation of the chars it was
+ // constructed from.
+ operator delete(const_cast<Type *>(this));
+
+ return;
+ }
+
+ // For all the other type subclasses, there are either no contained types or
+ // just one (all Sequentials). For Sequentials, the PATypeHandle is not
+ // allocated past the type object; it's included directly in the SequentialType
+ // class. This means we can safely just do a "normal" delete of this object and
+ // all the destructors that need to run will be run.
+ delete this;
+}
+
+const Type *Type::getPrimitiveType(TypeID IDNumber) {
+ switch (IDNumber) {
+ case VoidTyID : return VoidTy;
+ case FloatTyID : return FloatTy;
+ case DoubleTyID : return DoubleTy;
+ case X86_FP80TyID : return X86_FP80Ty;
+ case FP128TyID : return FP128Ty;
+ case PPC_FP128TyID : return PPC_FP128Ty;
+ case LabelTyID : return LabelTy;
+ case MetadataTyID : return MetadataTy;
+ default:
+ return 0;
+ }
+}
+
+const Type *Type::getVAArgsPromotedType() const {
+ if (ID == IntegerTyID && getSubclassData() < 32)
+ return Type::Int32Ty;
+ else if (ID == FloatTyID)
+ return Type::DoubleTy;
+ else
+ return this;
+}
+
+/// isIntOrIntVector - Return true if this is an integer type or a vector of
+/// integer types.
+///
+bool Type::isIntOrIntVector() const {
+ if (isInteger())
+ return true;
+ if (ID != Type::VectorTyID) return false;
+
+ return cast<VectorType>(this)->getElementType()->isInteger();
+}
+
+/// isFPOrFPVector - Return true if this is a FP type or a vector of FP types.
+///
+bool Type::isFPOrFPVector() const {
+ if (ID == Type::FloatTyID || ID == Type::DoubleTyID ||
+ ID == Type::FP128TyID || ID == Type::X86_FP80TyID ||
+ ID == Type::PPC_FP128TyID)
+ return true;
+ if (ID != Type::VectorTyID) return false;
+
+ return cast<VectorType>(this)->getElementType()->isFloatingPoint();
+}
+
+// canLosslesslyBitCastTo - Return true if this type can be converted to
+// 'Ty' without any reinterpretation of bits. For example, uint to int.
+//
+bool Type::canLosslesslyBitCastTo(const Type *Ty) const {
+ // Identity cast means no change so return true
+ if (this == Ty)
+ return true;
+
+ // They are not convertible unless they are at least first class types
+ if (!this->isFirstClassType() || !Ty->isFirstClassType())
+ return false;
+
+ // Vector -> Vector conversions are always lossless if the two vector types
+ // have the same size, otherwise not.
+ if (const VectorType *thisPTy = dyn_cast<VectorType>(this))
+ if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty))
+ return thisPTy->getBitWidth() == thatPTy->getBitWidth();
+
+ // At this point we have only various mismatches of the first class types
+ // remaining and ptr->ptr. Just select the lossless conversions. Everything
+ // else is not lossless.
+ if (isa<PointerType>(this))
+ return isa<PointerType>(Ty);
+ return false; // Other types have no identity values
+}
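+
+// Examples of the rules above (illustrative only):
+//   <4 x i32> -> <2 x i64>   lossless: both vectors are 128 bits wide
+//   <4 x i32> -> <4 x i64>   not lossless: 128 bits vs. 256 bits
+//   i32*      -> float*      lossless: any pointer to any pointer
+//   i32       -> float       not lossless under this predicate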
+
+unsigned Type::getPrimitiveSizeInBits() const {
+ switch (getTypeID()) {
+ case Type::FloatTyID: return 32;
+ case Type::DoubleTyID: return 64;
+ case Type::X86_FP80TyID: return 80;
+ case Type::FP128TyID: return 128;
+ case Type::PPC_FP128TyID: return 128;
+ case Type::IntegerTyID: return cast<IntegerType>(this)->getBitWidth();
+ case Type::VectorTyID: return cast<VectorType>(this)->getBitWidth();
+ default: return 0;
+ }
+}
+
+/// isSizedDerivedType - Derived types like structures and arrays are sized
+/// iff all of the members of the type are sized as well. Since asking for
+/// their size is relatively uncommon, move this operation out of line.
+bool Type::isSizedDerivedType() const {
+ if (isa<IntegerType>(this))
+ return true;
+
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(this))
+ return ATy->getElementType()->isSized();
+
+ if (const VectorType *PTy = dyn_cast<VectorType>(this))
+ return PTy->getElementType()->isSized();
+
+ if (!isa<StructType>(this))
+ return false;
+
+ // Okay, our struct is sized if all of the elements are...
+ for (subtype_iterator I = subtype_begin(), E = subtype_end(); I != E; ++I)
+ if (!(*I)->isSized())
+ return false;
+
+ return true;
+}
+
+/// getForwardedTypeInternal - This method is used to implement the union-find
+/// algorithm for when a type is being forwarded to another type.
+const Type *Type::getForwardedTypeInternal() const {
+ assert(ForwardType && "This type is not being forwarded to another type!");
+
+ // Check to see if the forwarded type has been forwarded on. If so, collapse
+ // the forwarding links.
+ const Type *RealForwardedType = ForwardType->getForwardedType();
+ if (!RealForwardedType)
+ return ForwardType; // No it's not forwarded again
+
+ // Yes, it is forwarded again. First thing, add the reference to the new
+ // forward type.
+ if (RealForwardedType->isAbstract())
+ cast<DerivedType>(RealForwardedType)->addRef();
+
+ // Now drop the old reference. This could cause ForwardType to get deleted.
+ cast<DerivedType>(ForwardType)->dropRef();
+
+ // Return the updated type.
+ ForwardType = RealForwardedType;
+ return ForwardType;
+}
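+
+// Illustration of the path compression above (not from the original source):
+// if A was refined to B and B was later refined to C, the first query on A
+// rewrites A's ForwardType from B directly to C, so subsequent queries take
+// a single hop; the reference counts are moved so B can be reclaimed once
+// nothing refers to it.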
+
+void Type::refineAbstractType(const DerivedType *OldTy, const Type *NewTy) {
+ abort();
+}
+void Type::typeBecameConcrete(const DerivedType *AbsTy) {
+ abort();
+}
+
+
+std::string Type::getDescription() const {
+ TypePrinting &Map =
+ isAbstract() ? *AbstractTypeDescriptions : *ConcreteTypeDescriptions;
+
+ std::string DescStr;
+ raw_string_ostream DescOS(DescStr);
+ Map.print(this, DescOS);
+ return DescOS.str();
+}
+
+
+bool StructType::indexValid(const Value *V) const {
+ // Structure indexes require 32-bit integer constants.
+ if (V->getType() == Type::Int32Ty)
+ if (const ConstantInt *CU = dyn_cast<ConstantInt>(V))
+ return indexValid(CU->getZExtValue());
+ return false;
+}
+
+bool StructType::indexValid(unsigned V) const {
+ return V < NumContainedTys;
+}
+
+// getTypeAtIndex - Given an index value into the type, return the type of the
+// element. For a structure type, this must be a constant value...
+//
+const Type *StructType::getTypeAtIndex(const Value *V) const {
+ unsigned Idx = (unsigned)cast<ConstantInt>(V)->getZExtValue();
+ return getTypeAtIndex(Idx);
+}
+
+const Type *StructType::getTypeAtIndex(unsigned Idx) const {
+ assert(indexValid(Idx) && "Invalid structure index!");
+ return ContainedTys[Idx];
+}
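+
+// Example (illustrative): given %T = { i32, float* }, getTypeAtIndex(0)
+// yields i32 and getTypeAtIndex(1) yields float*. The Value overload
+// expects an i32 ConstantInt, mirroring how getelementptr indexes structs.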
+
+//===----------------------------------------------------------------------===//
+// Primitive 'Type' data
+//===----------------------------------------------------------------------===//
+
+const Type *Type::VoidTy = new Type(Type::VoidTyID);
+const Type *Type::FloatTy = new Type(Type::FloatTyID);
+const Type *Type::DoubleTy = new Type(Type::DoubleTyID);
+const Type *Type::X86_FP80Ty = new Type(Type::X86_FP80TyID);
+const Type *Type::FP128Ty = new Type(Type::FP128TyID);
+const Type *Type::PPC_FP128Ty = new Type(Type::PPC_FP128TyID);
+const Type *Type::LabelTy = new Type(Type::LabelTyID);
+const Type *Type::MetadataTy = new Type(Type::MetadataTyID);
+
+namespace {
+ struct BuiltinIntegerType : public IntegerType {
+ explicit BuiltinIntegerType(unsigned W) : IntegerType(W) {}
+ };
+}
+const IntegerType *Type::Int1Ty = new BuiltinIntegerType(1);
+const IntegerType *Type::Int8Ty = new BuiltinIntegerType(8);
+const IntegerType *Type::Int16Ty = new BuiltinIntegerType(16);
+const IntegerType *Type::Int32Ty = new BuiltinIntegerType(32);
+const IntegerType *Type::Int64Ty = new BuiltinIntegerType(64);
+
+//===----------------------------------------------------------------------===//
+// Derived Type Constructors
+//===----------------------------------------------------------------------===//
+
+/// isValidReturnType - Return true if the specified type is valid as a return
+/// type.
+bool FunctionType::isValidReturnType(const Type *RetTy) {
+ if (RetTy->isFirstClassType()) {
+ if (const PointerType *PTy = dyn_cast<PointerType>(RetTy))
+ return PTy->getElementType() != Type::MetadataTy;
+ return true;
+ }
+ if (RetTy == Type::VoidTy || RetTy == Type::MetadataTy ||
+ isa<OpaqueType>(RetTy))
+ return true;
+
+ // If this is a multiple return case, verify that each return is a first class
+ // value and that there is at least one value.
+ const StructType *SRetTy = dyn_cast<StructType>(RetTy);
+ if (SRetTy == 0 || SRetTy->getNumElements() == 0)
+ return false;
+
+ for (unsigned i = 0, e = SRetTy->getNumElements(); i != e; ++i)
+ if (!SRetTy->getElementType(i)->isFirstClassType())
+ return false;
+ return true;
+}
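+
+// Examples (illustrative): i32, { i32, i32 }, and void are valid return
+// types; metadata* and the empty struct {} are rejected by the checks above.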
+
+FunctionType::FunctionType(const Type *Result,
+ const std::vector<const Type*> &Params,
+ bool IsVarArgs)
+ : DerivedType(FunctionTyID), isVarArgs(IsVarArgs) {
+ ContainedTys = reinterpret_cast<PATypeHandle*>(this+1);
+ NumContainedTys = Params.size() + 1; // + 1 for result type
+ assert(isValidReturnType(Result) && "invalid return type for function");
+
+
+ bool isAbstract = Result->isAbstract();
+ new (&ContainedTys[0]) PATypeHandle(Result, this);
+
+ for (unsigned i = 0; i != Params.size(); ++i) {
+ assert((Params[i]->isFirstClassType() || isa<OpaqueType>(Params[i])) &&
+ "Function arguments must be value types!");
+ assert((!isa<PointerType>(Params[i]) ||
+ cast<PointerType>(Params[i])->getElementType() != Type::MetadataTy)
+ && "Attempt to use metadata* as function argument type!");
+ new (&ContainedTys[i+1]) PATypeHandle(Params[i], this);
+ isAbstract |= Params[i]->isAbstract();
+ }
+
+ // Calculate whether or not this type is abstract
+ setAbstract(isAbstract);
+}
+
+StructType::StructType(const std::vector<const Type*> &Types, bool isPacked)
+ : CompositeType(StructTyID) {
+ ContainedTys = reinterpret_cast<PATypeHandle*>(this + 1);
+ NumContainedTys = Types.size();
+ setSubclassData(isPacked);
+ bool isAbstract = false;
+ for (unsigned i = 0; i < Types.size(); ++i) {
+ assert(Types[i] && "<null> type for structure field!");
+ assert(Types[i] != Type::VoidTy && "Void type for structure field!");
+ assert(Types[i] != Type::LabelTy && "Label type for structure field!");
+ assert(Types[i] != Type::MetadataTy && "Metadata type for structure field");
+ assert((!isa<PointerType>(Types[i]) ||
+ cast<PointerType>(Types[i])->getElementType() != Type::MetadataTy)
+ && "Type 'metadata*' is invalid for structure field.");
+ new (&ContainedTys[i]) PATypeHandle(Types[i], this);
+ isAbstract |= Types[i]->isAbstract();
+ }
+
+ // Calculate whether or not this type is abstract
+ setAbstract(isAbstract);
+}
+
+ArrayType::ArrayType(const Type *ElType, uint64_t NumEl)
+ : SequentialType(ArrayTyID, ElType) {
+ NumElements = NumEl;
+
+ // Calculate whether or not this type is abstract
+ setAbstract(ElType->isAbstract());
+}
+
+VectorType::VectorType(const Type *ElType, unsigned NumEl)
+ : SequentialType(VectorTyID, ElType) {
+ NumElements = NumEl;
+ setAbstract(ElType->isAbstract());
+ assert(NumEl > 0 && "NumEl of a VectorType must be greater than 0");
+ assert((ElType->isInteger() || ElType->isFloatingPoint() ||
+ isa<OpaqueType>(ElType)) &&
+ "Elements of a VectorType must be a primitive type");
+
+}
+
+
+PointerType::PointerType(const Type *E, unsigned AddrSpace)
+ : SequentialType(PointerTyID, E) {
+ AddressSpace = AddrSpace;
+ // Calculate whether or not this type is abstract
+ setAbstract(E->isAbstract());
+}
+
+OpaqueType::OpaqueType() : DerivedType(OpaqueTyID) {
+ setAbstract(true);
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "Derived new type: " << *this << "\n";
+#endif
+}
+
+void PATypeHolder::destroy() {
+ Ty = 0;
+}
+
+// dropAllTypeUses - When this (abstract) type is resolved to be equal to
+// another (more concrete) type, we must eliminate all references to other
+// types, to avoid some circular reference problems.
+void DerivedType::dropAllTypeUses() {
+ if (NumContainedTys != 0) {
+ // The type must stay abstract. To do this, we insert a pointer to a type
+ // that will never get resolved, thus will always be abstract.
+ static Type *AlwaysOpaqueTy = OpaqueType::get();
+ static PATypeHolder Holder(AlwaysOpaqueTy);
+ ContainedTys[0] = AlwaysOpaqueTy;
+
+ // Change the rest of the types to be Int32Ty's. It doesn't matter what we
+ // pick so long as it doesn't point back to this type. We choose something
+ // concrete to avoid overhead for adding to AbstractTypeUser lists and stuff.
+ for (unsigned i = 1, e = NumContainedTys; i != e; ++i)
+ ContainedTys[i] = Type::Int32Ty;
+ }
+}
+
+
+namespace {
+
+/// TypePromotionGraph and graph traits - this is designed to allow us to do
+/// efficient SCC processing of type graphs. This is the exact same as
+/// GraphTraits<Type*>, except that we pretend that concrete types have no
+/// children to avoid processing them.
+struct TypePromotionGraph {
+ Type *Ty;
+ TypePromotionGraph(Type *T) : Ty(T) {}
+};
+
+}
+
+namespace llvm {
+ template <> struct GraphTraits<TypePromotionGraph> {
+ typedef Type NodeType;
+ typedef Type::subtype_iterator ChildIteratorType;
+
+ static inline NodeType *getEntryNode(TypePromotionGraph G) { return G.Ty; }
+ static inline ChildIteratorType child_begin(NodeType *N) {
+ if (N->isAbstract())
+ return N->subtype_begin();
+ else // No need to process children of concrete types.
+ return N->subtype_end();
+ }
+ static inline ChildIteratorType child_end(NodeType *N) {
+ return N->subtype_end();
+ }
+ };
+}
+
+
+// PromoteAbstractToConcrete - Walk a type graph, calculating whether or not
+// a type is abstract.
+//
+void Type::PromoteAbstractToConcrete() {
+ if (!isAbstract()) return;
+
+ scc_iterator<TypePromotionGraph> SI = scc_begin(TypePromotionGraph(this));
+ scc_iterator<TypePromotionGraph> SE = scc_end (TypePromotionGraph(this));
+
+ for (; SI != SE; ++SI) {
+ std::vector<Type*> &SCC = *SI;
+
+ // Concrete types are leaves in the tree. Since an SCC will either be all
+ // abstract or all concrete, we only need to check one type.
+ if (SCC[0]->isAbstract()) {
+ if (isa<OpaqueType>(SCC[0]))
+ return; // Not going to be concrete, sorry.
+
+ // If all of the children of all of the types in this SCC are concrete,
+ // then this SCC is now concrete as well. If not, neither this SCC, nor
+ // any parent SCCs will be concrete, so we might as well just exit.
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+ for (Type::subtype_iterator CI = SCC[i]->subtype_begin(),
+ E = SCC[i]->subtype_end(); CI != E; ++CI)
+ if ((*CI)->isAbstract())
+ // If the child type is in our SCC, it doesn't make the entire SCC
+ // abstract unless there is a non-SCC abstract type.
+ if (std::find(SCC.begin(), SCC.end(), *CI) == SCC.end())
+ return; // Not going to be concrete, sorry.
+
+ // Okay, we just discovered this whole SCC is now concrete, mark it as
+ // such!
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ assert(SCC[i]->isAbstract() && "Why are we processing concrete types?");
+
+ SCC[i]->setAbstract(false);
+ }
+
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) {
+ assert(!SCC[i]->isAbstract() && "Concrete type became abstract?");
+ // The type just became concrete, notify all users!
+ cast<DerivedType>(SCC[i])->notifyUsesThatTypeBecameConcrete();
+ }
+ }
+ }
+}
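+
+// Example (illustrative): for the mutually recursive pair
+//   %A = { %B* }   and   %B = { %A* }
+// both types sit in one SCC. Once no member of the SCC refers to an
+// abstract type outside it, the whole component is flipped to concrete in
+// a single pass and every AbstractTypeUser is notified.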
+
+
+//===----------------------------------------------------------------------===//
+// Type Structural Equality Testing
+//===----------------------------------------------------------------------===//
+
+// TypesEqual - Two types are considered structurally equal if they have the
+// same "shape": Every level and element of the types have identical primitive
+// IDs, and the graphs have the same edges/nodes in them. Nodes do not have to
+// be pointer-equal to be equivalent, though. This uses an optimistic algorithm
+// that assumes that two graphs are the same until proven otherwise.
+//
+static bool TypesEqual(const Type *Ty, const Type *Ty2,
+ std::map<const Type *, const Type *> &EqTypes) {
+ if (Ty == Ty2) return true;
+ if (Ty->getTypeID() != Ty2->getTypeID()) return false;
+ if (isa<OpaqueType>(Ty))
+ return false; // Two unequal opaque types are never equal
+
+ std::map<const Type*, const Type*>::iterator It = EqTypes.find(Ty);
+ if (It != EqTypes.end())
+ return It->second == Ty2; // Looping back on a type, check for equality
+
+ // Otherwise, add the mapping to the table to make sure we don't get
+ // recursion on the types...
+ EqTypes.insert(It, std::make_pair(Ty, Ty2));
+
+ // Two really annoying special cases that break an otherwise nice, simple
+ // algorithm are the facts that array types have sizes that differentiate
+ // them, and that function types can be varargs or not. Consider this now.
+ //
+ if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
+ const IntegerType *ITy2 = cast<IntegerType>(Ty2);
+ return ITy->getBitWidth() == ITy2->getBitWidth();
+ } else if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) {
+ const PointerType *PTy2 = cast<PointerType>(Ty2);
+ return PTy->getAddressSpace() == PTy2->getAddressSpace() &&
+ TypesEqual(PTy->getElementType(), PTy2->getElementType(), EqTypes);
+ } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructType *STy2 = cast<StructType>(Ty2);
+ if (STy->getNumElements() != STy2->getNumElements()) return false;
+ if (STy->isPacked() != STy2->isPacked()) return false;
+ for (unsigned i = 0, e = STy2->getNumElements(); i != e; ++i)
+ if (!TypesEqual(STy->getElementType(i), STy2->getElementType(i), EqTypes))
+ return false;
+ return true;
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ const ArrayType *ATy2 = cast<ArrayType>(Ty2);
+ return ATy->getNumElements() == ATy2->getNumElements() &&
+ TypesEqual(ATy->getElementType(), ATy2->getElementType(), EqTypes);
+ } else if (const VectorType *PTy = dyn_cast<VectorType>(Ty)) {
+ const VectorType *PTy2 = cast<VectorType>(Ty2);
+ return PTy->getNumElements() == PTy2->getNumElements() &&
+ TypesEqual(PTy->getElementType(), PTy2->getElementType(), EqTypes);
+ } else if (const FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
+ const FunctionType *FTy2 = cast<FunctionType>(Ty2);
+ if (FTy->isVarArg() != FTy2->isVarArg() ||
+ FTy->getNumParams() != FTy2->getNumParams() ||
+ !TypesEqual(FTy->getReturnType(), FTy2->getReturnType(), EqTypes))
+ return false;
+ for (unsigned i = 0, e = FTy2->getNumParams(); i != e; ++i) {
+ if (!TypesEqual(FTy->getParamType(i), FTy2->getParamType(i), EqTypes))
+ return false;
+ }
+ return true;
+ } else {
+ assert(0 && "Unknown derived type!");
+ return false;
+ }
+}
+
+static bool TypesEqual(const Type *Ty, const Type *Ty2) {
+ std::map<const Type *, const Type *> EqTypes;
+ return TypesEqual(Ty, Ty2, EqTypes);
+}
+
+// AbstractTypeHasCycleThrough - Return true if there is a path from CurTy to
+// TargetTy in the type graph. We know that Ty is an abstract type, so if we
+// ever reach a non-abstract type, we know that we don't need to search the
+// subgraph.
+static bool AbstractTypeHasCycleThrough(const Type *TargetTy, const Type *CurTy,
+ SmallPtrSet<const Type*, 128> &VisitedTypes) {
+ if (TargetTy == CurTy) return true;
+ if (!CurTy->isAbstract()) return false;
+
+ if (!VisitedTypes.insert(CurTy))
+ return false; // Already been here.
+
+ for (Type::subtype_iterator I = CurTy->subtype_begin(),
+ E = CurTy->subtype_end(); I != E; ++I)
+ if (AbstractTypeHasCycleThrough(TargetTy, *I, VisitedTypes))
+ return true;
+ return false;
+}
+
+static bool ConcreteTypeHasCycleThrough(const Type *TargetTy, const Type *CurTy,
+ SmallPtrSet<const Type*, 128> &VisitedTypes) {
+ if (TargetTy == CurTy) return true;
+
+ if (!VisitedTypes.insert(CurTy))
+ return false; // Already been here.
+
+ for (Type::subtype_iterator I = CurTy->subtype_begin(),
+ E = CurTy->subtype_end(); I != E; ++I)
+ if (ConcreteTypeHasCycleThrough(TargetTy, *I, VisitedTypes))
+ return true;
+ return false;
+}
+
+/// TypeHasCycleThroughItself - Return true if the specified type has a cycle
+/// back to itself.
+static bool TypeHasCycleThroughItself(const Type *Ty) {
+ SmallPtrSet<const Type*, 128> VisitedTypes;
+
+ if (Ty->isAbstract()) { // Optimized case for abstract types.
+ for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+ I != E; ++I)
+ if (AbstractTypeHasCycleThrough(Ty, *I, VisitedTypes))
+ return true;
+ } else {
+ for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+ I != E; ++I)
+ if (ConcreteTypeHasCycleThrough(Ty, *I, VisitedTypes))
+ return true;
+ }
+ return false;
+}
+
+/// getSubElementHash - Generate a hash value for all of the subtypes of this
+/// type. The hash value is guaranteed to be zero if any of the subtypes is
+/// an opaque type. Otherwise we try to mix them in as well as possible, but do
+/// not look at the subtypes' subtypes.
+static unsigned getSubElementHash(const Type *Ty) {
+ unsigned HashVal = 0;
+ for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+ I != E; ++I) {
+ HashVal *= 32;
+ const Type *SubTy = I->get();
+ HashVal += SubTy->getTypeID();
+ switch (SubTy->getTypeID()) {
+ default: break;
+ case Type::OpaqueTyID: return 0; // Opaque -> hash = 0 no matter what.
+ case Type::IntegerTyID:
+ HashVal ^= (cast<IntegerType>(SubTy)->getBitWidth() << 3);
+ break;
+ case Type::FunctionTyID:
+ HashVal ^= cast<FunctionType>(SubTy)->getNumParams()*2 +
+ cast<FunctionType>(SubTy)->isVarArg();
+ break;
+ case Type::ArrayTyID:
+ HashVal ^= cast<ArrayType>(SubTy)->getNumElements();
+ break;
+ case Type::VectorTyID:
+ HashVal ^= cast<VectorType>(SubTy)->getNumElements();
+ break;
+ case Type::StructTyID:
+ HashVal ^= cast<StructType>(SubTy)->getNumElements();
+ break;
+ case Type::PointerTyID:
+ HashVal ^= cast<PointerType>(SubTy)->getAddressSpace();
+ break;
+ }
+ }
+ return HashVal ? HashVal : 1; // Do not return zero unless opaque subty.
+}
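+
+// Worked example (illustrative): for { i32, float* } the hash mixes the two
+// subtype IDs, the 32-bit width, and the pointer's address space; for
+// { i32, opaque } the result is pinned to 0, the sentinel bucket shared by
+// every type with an unresolved subtype.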
+
+//===----------------------------------------------------------------------===//
+// Derived Type Factory Functions
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class TypeMapBase {
+protected:
+ /// TypesByHash - Keep track of types by their structure hash value. Note
+ /// that we only keep track of types that have cycles through themselves in
+ /// this map.
+ ///
+ std::multimap<unsigned, PATypeHolder> TypesByHash;
+
+public:
+ ~TypeMapBase() {
+ // PATypeHolder won't destroy non-abstract types.
+ // We can't destroy them by simply iterating, because
+ // they may contain references to each other.
+#if 0
+ for (std::multimap<unsigned, PATypeHolder>::iterator I
+ = TypesByHash.begin(), E = TypesByHash.end(); I != E; ++I) {
+ Type *Ty = const_cast<Type*>(I->second.Ty);
+ I->second.destroy();
+ // We can't invoke destroy or delete, because the type may
+ // contain references to already freed types.
+ // So we have to destruct the object the ugly way.
+ if (Ty) {
+ Ty->AbstractTypeUsers.clear();
+ static_cast<const Type*>(Ty)->Type::~Type();
+ operator delete(Ty);
+ }
+ }
+#endif
+ }
+
+ void RemoveFromTypesByHash(unsigned Hash, const Type *Ty) {
+ std::multimap<unsigned, PATypeHolder>::iterator I =
+ TypesByHash.lower_bound(Hash);
+ for (; I != TypesByHash.end() && I->first == Hash; ++I) {
+ if (I->second == Ty) {
+ TypesByHash.erase(I);
+ return;
+ }
+ }
+
+ // This must be due to an opaque type that was resolved. Switch down to the
+ // hash code of zero.
+ assert(Hash && "Didn't find type entry!");
+ RemoveFromTypesByHash(0, Ty);
+ }
+
+ /// TypeBecameConcrete - When Ty gets a notification that TheType just became
+ /// concrete, drop uses and make Ty non-abstract if we should.
+ void TypeBecameConcrete(DerivedType *Ty, const DerivedType *TheType) {
+ // If the element just became concrete, remove 'ty' from the abstract
+ // type user list for the type. Do this for as many times as Ty uses
+ // OldType.
+ for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+ I != E; ++I)
+ if (I->get() == TheType)
+ TheType->removeAbstractTypeUser(Ty);
+
+ // If the type is currently thought to be abstract, rescan all of our
+ // subtypes to see if the type has just become concrete! Note that this
+ // may send out notifications to AbstractTypeUsers that types become
+ // concrete.
+ if (Ty->isAbstract())
+ Ty->PromoteAbstractToConcrete();
+ }
+};
+}
+
+
+// TypeMap - Make sure that only one instance of a particular type may be
+// created on any given run of the compiler... note that this involves updating
+// our map if an abstract type gets refined somehow.
+//
+namespace llvm {
+template<class ValType, class TypeClass>
+class TypeMap : public TypeMapBase {
+ std::map<ValType, PATypeHolder> Map;
+public:
+ typedef typename std::map<ValType, PATypeHolder>::iterator iterator;
+ ~TypeMap() { print("ON EXIT"); }
+
+ inline TypeClass *get(const ValType &V) {
+ iterator I = Map.find(V);
+ return I != Map.end() ? cast<TypeClass>((Type*)I->second.get()) : 0;
+ }
+
+ inline void add(const ValType &V, TypeClass *Ty) {
+ Map.insert(std::make_pair(V, Ty));
+
+ // If this type has a cycle, remember it.
+ TypesByHash.insert(std::make_pair(ValType::hashTypeStructure(Ty), Ty));
+ print("add");
+ }
+
+ /// RefineAbstractType - This method is called after we have merged a type
+ /// with another one. We must now either merge the type away with
+ /// some other type or reinstall it in the map with its new configuration.
+ void RefineAbstractType(TypeClass *Ty, const DerivedType *OldType,
+ const Type *NewType) {
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "RefineAbstractType(" << (void*)OldType << "[" << *OldType
+ << "], " << (void*)NewType << " [" << *NewType << "])\n";
+#endif
+
+ // Otherwise, we are changing one subelement type into another. Clearly the
+ // OldType must have been abstract, making us abstract.
+ assert(Ty->isAbstract() && "Refining a non-abstract type!");
+ assert(OldType != NewType);
+
+ // Make a temporary type holder for the type so that it doesn't disappear on
+ // us when we erase the entry from the map.
+ PATypeHolder TyHolder = Ty;
+
+ // The old record is now out-of-date, because one of the children has been
+ // updated. Remove the obsolete entry from the map.
+ unsigned NumErased = Map.erase(ValType::get(Ty));
+ assert(NumErased && "Element not found!"); NumErased = NumErased;
+
+ // Remember the structural hash for the type before we start hacking on it,
+ // in case we need it later.
+ unsigned OldTypeHash = ValType::hashTypeStructure(Ty);
+
+ // Find the type element we are refining... and change it now!
+ for (unsigned i = 0, e = Ty->getNumContainedTypes(); i != e; ++i)
+ if (Ty->ContainedTys[i] == OldType)
+ Ty->ContainedTys[i] = NewType;
+ unsigned NewTypeHash = ValType::hashTypeStructure(Ty);
+
+ // If there are no cycles going through this node, we can do a simple,
+ // efficient lookup in the map, instead of an inefficient nasty linear
+ // lookup.
+ if (!TypeHasCycleThroughItself(Ty)) {
+ typename std::map<ValType, PATypeHolder>::iterator I;
+ bool Inserted;
+
+ tie(I, Inserted) = Map.insert(std::make_pair(ValType::get(Ty), Ty));
+ if (!Inserted) {
+ // Refined to a different type altogether?
+ RemoveFromTypesByHash(OldTypeHash, Ty);
+
+ // We already have this type in the table. Get rid of the newly refined
+ // type.
+ TypeClass *NewTy = cast<TypeClass>((Type*)I->second.get());
+ Ty->refineAbstractTypeTo(NewTy);
+ return;
+ }
+ } else {
+ // Now we check to see if there is an existing entry in the table which is
+ // structurally identical to the newly refined type. If so, this type
+ // gets refined to the pre-existing type.
+ //
+ std::multimap<unsigned, PATypeHolder>::iterator I, E, Entry;
+ tie(I, E) = TypesByHash.equal_range(NewTypeHash);
+ Entry = E;
+ for (; I != E; ++I) {
+ if (I->second == Ty) {
+ // Remember the position of the old type if we see it in our scan.
+ Entry = I;
+ } else {
+ if (TypesEqual(Ty, I->second)) {
+ TypeClass *NewTy = cast<TypeClass>((Type*)I->second.get());
+
+ // Remove the old entry from TypesByHash. If the hash values differ
+ // now, remove it from the old place. Otherwise, continue scanning
+ // within this hash code to reduce work.
+ if (NewTypeHash != OldTypeHash) {
+ RemoveFromTypesByHash(OldTypeHash, Ty);
+ } else {
+ if (Entry == E) {
+ // Find the location of Ty in the TypesByHash structure if we
+ // haven't seen it already.
+ while (I->second != Ty) {
+ ++I;
+ assert(I != E && "Structure doesn't contain type??");
+ }
+ Entry = I;
+ }
+ TypesByHash.erase(Entry);
+ }
+ Ty->refineAbstractTypeTo(NewTy);
+ return;
+ }
+ }
+ }
+
+ // If there is no existing type of the same structure, we reinsert an
+ // updated record into the map.
+ Map.insert(std::make_pair(ValType::get(Ty), Ty));
+ }
+
+ // If the hash codes differ, update TypesByHash
+ if (NewTypeHash != OldTypeHash) {
+ RemoveFromTypesByHash(OldTypeHash, Ty);
+ TypesByHash.insert(std::make_pair(NewTypeHash, Ty));
+ }
+
+ // If the type is currently thought to be abstract, rescan all of our
+ // subtypes to see if the type has just become concrete! Note that this
+ // may send out notifications to AbstractTypeUsers that types become
+ // concrete.
+ if (Ty->isAbstract())
+ Ty->PromoteAbstractToConcrete();
+ }
+
+ void print(const char *Arg) const {
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "TypeMap<>::" << Arg << " table contents:\n";
+ unsigned i = 0;
+ for (typename std::map<ValType, PATypeHolder>::const_iterator I
+ = Map.begin(), E = Map.end(); I != E; ++I)
+ DOUT << " " << (++i) << ". " << (void*)I->second.get() << " "
+ << *I->second.get() << "\n";
+#endif
+ }
+
+ void dump() const { print("dump output"); }
+};
+}
+
+
+//===----------------------------------------------------------------------===//
+// Function Type Factory and Value Class...
+//
+
+//===----------------------------------------------------------------------===//
+// Integer Type Factory...
+//
+namespace llvm {
+class IntegerValType {
+ uint32_t bits;
+public:
+ IntegerValType(uint32_t numbits) : bits(numbits) {}
+
+ static IntegerValType get(const IntegerType *Ty) {
+ return IntegerValType(Ty->getBitWidth());
+ }
+
+ static unsigned hashTypeStructure(const IntegerType *Ty) {
+ return (unsigned)Ty->getBitWidth();
+ }
+
+ inline bool operator<(const IntegerValType &IVT) const {
+ return bits < IVT.bits;
+ }
+};
+}
+
+static ManagedStatic<TypeMap<IntegerValType, IntegerType> > IntegerTypes;
+
+const IntegerType *IntegerType::get(unsigned NumBits) {
+ assert(NumBits >= MIN_INT_BITS && "bitwidth too small");
+ assert(NumBits <= MAX_INT_BITS && "bitwidth too large");
+
+ // Check for the built-in integer types
+ switch (NumBits) {
+ case 1: return cast<IntegerType>(Type::Int1Ty);
+ case 8: return cast<IntegerType>(Type::Int8Ty);
+ case 16: return cast<IntegerType>(Type::Int16Ty);
+ case 32: return cast<IntegerType>(Type::Int32Ty);
+ case 64: return cast<IntegerType>(Type::Int64Ty);
+ default:
+ break;
+ }
+
+ IntegerValType IVT(NumBits);
+ IntegerType *ITy = IntegerTypes->get(IVT);
+ if (ITy) return ITy; // Found a match, return it!
+
+ // Value not found. Derive a new type!
+ ITy = new IntegerType(NumBits);
+ IntegerTypes->add(IVT, ITy);
+
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "Derived new type: " << *ITy << "\n";
+#endif
+ return ITy;
+}
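+
+// Usage sketch (illustrative): the factory uniques arbitrary widths, so the
+// two calls below return the same object, and unusual widths are fine:
+//
+//   const IntegerType *A = IntegerType::get(36);
+//   const IntegerType *B = IntegerType::get(36);
+//   assert(A == B && "integer types are uniqued");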
+
+bool IntegerType::isPowerOf2ByteWidth() const {
+ unsigned BitWidth = getBitWidth();
+ return (BitWidth > 7) && isPowerOf2_32(BitWidth);
+}
+
+APInt IntegerType::getMask() const {
+ return APInt::getAllOnesValue(getBitWidth());
+}
+
+// FunctionValType - Define a class to hold the key that goes into the TypeMap
+//
+namespace llvm {
+class FunctionValType {
+ const Type *RetTy;
+ std::vector<const Type*> ArgTypes;
+ bool isVarArg;
+public:
+ FunctionValType(const Type *ret, const std::vector<const Type*> &args,
+ bool isVA) : RetTy(ret), ArgTypes(args), isVarArg(isVA) {}
+
+ static FunctionValType get(const FunctionType *FT);
+
+ static unsigned hashTypeStructure(const FunctionType *FT) {
+ unsigned Result = FT->getNumParams()*2 + FT->isVarArg();
+ return Result;
+ }
+
+ inline bool operator<(const FunctionValType &MTV) const {
+ if (RetTy < MTV.RetTy) return true;
+ if (RetTy > MTV.RetTy) return false;
+ if (isVarArg < MTV.isVarArg) return true;
+ if (isVarArg > MTV.isVarArg) return false;
+ if (ArgTypes < MTV.ArgTypes) return true;
+ if (ArgTypes > MTV.ArgTypes) return false;
+ return false;
+ }
+};
+}
+
+// Define the actual map itself now...
+static ManagedStatic<TypeMap<FunctionValType, FunctionType> > FunctionTypes;
+
+FunctionValType FunctionValType::get(const FunctionType *FT) {
+ // Build up a FunctionValType
+ std::vector<const Type *> ParamTypes;
+ ParamTypes.reserve(FT->getNumParams());
+ for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i)
+ ParamTypes.push_back(FT->getParamType(i));
+ return FunctionValType(FT->getReturnType(), ParamTypes, FT->isVarArg());
+}
+
+
+// FunctionType::get - The factory function for the FunctionType class...
+FunctionType *FunctionType::get(const Type *ReturnType,
+ const std::vector<const Type*> &Params,
+ bool isVarArg) {
+ FunctionValType VT(ReturnType, Params, isVarArg);
+ FunctionType *FT = FunctionTypes->get(VT);
+ if (FT)
+ return FT;
+
+ FT = (FunctionType*) operator new(sizeof(FunctionType) +
+ sizeof(PATypeHandle)*(Params.size()+1));
+ new (FT) FunctionType(ReturnType, Params, isVarArg);
+ FunctionTypes->add(VT, FT);
+
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "Derived new type: " << FT << "\n";
+#endif
+ return FT;
+}
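+
+// Usage sketch (illustrative): building 'i32 (i8*, ...)', a varargs
+// function type with a single i8* parameter:
+//
+//   std::vector<const Type*> Params;
+//   Params.push_back(PointerType::get(Type::Int8Ty, 0));
+//   FunctionType *FT = FunctionType::get(Type::Int32Ty, Params, true);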
+
+//===----------------------------------------------------------------------===//
+// Array Type Factory...
+//
+namespace llvm {
+class ArrayValType {
+ const Type *ValTy;
+ uint64_t Size;
+public:
+ ArrayValType(const Type *val, uint64_t sz) : ValTy(val), Size(sz) {}
+
+ static ArrayValType get(const ArrayType *AT) {
+ return ArrayValType(AT->getElementType(), AT->getNumElements());
+ }
+
+ static unsigned hashTypeStructure(const ArrayType *AT) {
+ return (unsigned)AT->getNumElements();
+ }
+
+ inline bool operator<(const ArrayValType &MTV) const {
+ if (Size < MTV.Size) return true;
+ return Size == MTV.Size && ValTy < MTV.ValTy;
+ }
+};
+}
+static ManagedStatic<TypeMap<ArrayValType, ArrayType> > ArrayTypes;
+
+
+ArrayType *ArrayType::get(const Type *ElementType, uint64_t NumElements) {
+ assert(ElementType && "Can't get array of <null> types!");
+ assert(ElementType != Type::VoidTy && "Array of void is not valid!");
+ assert(ElementType != Type::LabelTy && "Array of labels is not valid!");
+ assert(ElementType != Type::MetadataTy && "Array of metadata is not valid!");
+ assert((!isa<PointerType>(ElementType) ||
+ cast<PointerType>(ElementType)->getElementType() != Type::MetadataTy)
+ && "Array of metadata* is not valid!");
+
+ ArrayValType AVT(ElementType, NumElements);
+ ArrayType *AT = ArrayTypes->get(AVT);
+ if (AT) return AT; // Found a match, return it!
+
+ // Value not found. Derive a new type!
+ ArrayTypes->add(AVT, AT = new ArrayType(ElementType, NumElements));
+
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "Derived new type: " << *AT << "\n";
+#endif
+ return AT;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Vector Type Factory...
+//
+namespace llvm {
+class VectorValType {
+ const Type *ValTy;
+ unsigned Size;
+public:
+ VectorValType(const Type *val, unsigned sz) : ValTy(val), Size(sz) {}
+
+ static VectorValType get(const VectorType *PT) {
+ return VectorValType(PT->getElementType(), PT->getNumElements());
+ }
+
+ static unsigned hashTypeStructure(const VectorType *PT) {
+ return PT->getNumElements();
+ }
+
+ inline bool operator<(const VectorValType &MTV) const {
+ if (Size < MTV.Size) return true;
+ return Size == MTV.Size && ValTy < MTV.ValTy;
+ }
+};
+}
+static ManagedStatic<TypeMap<VectorValType, VectorType> > VectorTypes;
+
+
+VectorType *VectorType::get(const Type *ElementType, unsigned NumElements) {
+ assert(ElementType && "Can't get vector of <null> types!");
+
+ VectorValType PVT(ElementType, NumElements);
+ VectorType *PT = VectorTypes->get(PVT);
+ if (PT) return PT; // Found a match, return it!
+
+ // Value not found. Derive a new type!
+ VectorTypes->add(PVT, PT = new VectorType(ElementType, NumElements));
+
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "Derived new type: " << *PT << "\n";
+#endif
+ return PT;
+}
+
+//===----------------------------------------------------------------------===//
+// Struct Type Factory...
+//
+
+namespace llvm {
+// StructValType - Define a class to hold the key that goes into the TypeMap
+//
+class StructValType {
+ std::vector<const Type*> ElTypes;
+ bool packed;
+public:
+ StructValType(const std::vector<const Type*> &args, bool isPacked)
+ : ElTypes(args), packed(isPacked) {}
+
+ static StructValType get(const StructType *ST) {
+ std::vector<const Type *> ElTypes;
+ ElTypes.reserve(ST->getNumElements());
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
+ ElTypes.push_back(ST->getElementType(i));
+
+ return StructValType(ElTypes, ST->isPacked());
+ }
+
+ static unsigned hashTypeStructure(const StructType *ST) {
+ return ST->getNumElements();
+ }
+
+ inline bool operator<(const StructValType &STV) const {
+ if (ElTypes < STV.ElTypes) return true;
+ else if (ElTypes > STV.ElTypes) return false;
+ else return (int)packed < (int)STV.packed;
+ }
+};
+}
+
+static ManagedStatic<TypeMap<StructValType, StructType> > StructTypes;
+
+StructType *StructType::get(const std::vector<const Type*> &ETypes,
+ bool isPacked) {
+ StructValType STV(ETypes, isPacked);
+ StructType *ST = StructTypes->get(STV);
+ if (ST) return ST;
+
+ // Value not found. Derive a new type!
+ ST = (StructType*) operator new(sizeof(StructType) +
+ sizeof(PATypeHandle) * ETypes.size());
+ new (ST) StructType(ETypes, isPacked);
+ StructTypes->add(STV, ST);
+
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "Derived new type: " << *ST << "\n";
+#endif
+ return ST;
+}
+
+StructType *StructType::get(const Type *type, ...) {
+ va_list ap;
+ std::vector<const llvm::Type*> StructFields;
+ va_start(ap, type);
+ while (type) {
+ StructFields.push_back(type);
+ type = va_arg(ap, llvm::Type*);
+ }
+ return llvm::StructType::get(StructFields);
+}
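+
+// Usage sketch (illustrative): the variadic overload needs a trailing NULL
+// to terminate the va_arg loop above:
+//
+//   StructType *Pair = StructType::get(Type::Int32Ty, Type::FloatTy, NULL);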
+
+
+
+//===----------------------------------------------------------------------===//
+// Pointer Type Factory...
+//
+
+// PointerValType - Define a class to hold the key that goes into the TypeMap
+//
+namespace llvm {
+class PointerValType {
+ const Type *ValTy;
+ unsigned AddressSpace;
+public:
+ PointerValType(const Type *val, unsigned as) : ValTy(val), AddressSpace(as) {}
+
+ static PointerValType get(const PointerType *PT) {
+ return PointerValType(PT->getElementType(), PT->getAddressSpace());
+ }
+
+ static unsigned hashTypeStructure(const PointerType *PT) {
+ return getSubElementHash(PT);
+ }
+
+ bool operator<(const PointerValType &MTV) const {
+ if (AddressSpace < MTV.AddressSpace) return true;
+ return AddressSpace == MTV.AddressSpace && ValTy < MTV.ValTy;
+ }
+};
+}
+
+static ManagedStatic<TypeMap<PointerValType, PointerType> > PointerTypes;
+
+PointerType *PointerType::get(const Type *ValueType, unsigned AddressSpace) {
+ assert(ValueType && "Can't get a pointer to <null> type!");
+ assert(ValueType != Type::VoidTy &&
+ "Pointer to void is not valid, use i8* instead!");
+ assert(ValueType != Type::LabelTy && "Pointer to label is not valid!");
+ assert((!isa<PointerType>(ValueType) ||
+ cast<PointerType>(ValueType)->getElementType() != Type::MetadataTy)
+ && "Pointer to metadata* is not valid!");
+ PointerValType PVT(ValueType, AddressSpace);
+
+ PointerType *PT = PointerTypes->get(PVT);
+ if (PT) return PT;
+
+ // Value not found. Derive a new type!
+ PointerTypes->add(PVT, PT = new PointerType(ValueType, AddressSpace));
+
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "Derived new type: " << *PT << "\n";
+#endif
+ return PT;
+}
+
+PointerType *Type::getPointerTo(unsigned addrs) const {
+ return PointerType::get(this, addrs);
+}
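+
+// Usage sketch (illustrative): Type::Int8Ty->getPointerTo(0) yields i8*,
+// and Type::Int8Ty->getPointerTo(1) yields i8 addrspace(1)*.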
+
+//===----------------------------------------------------------------------===//
+// Derived Type Refinement Functions
+//===----------------------------------------------------------------------===//
+
+// removeAbstractTypeUser - Notify an abstract type that a user of the class
+// no longer has a handle to the type. This function is called primarily by
+// the PATypeHandle class. When there are no users of the abstract type, it
+// is annihilated, because there is no way to get a reference to it ever again.
+//
+void Type::removeAbstractTypeUser(AbstractTypeUser *U) const {
+ // Search from back to front because we will notify users from back to
+ // front. Also, it is likely that there will be stack-like behavior for
+ // users that register and unregister themselves.
+ //
+ unsigned i;
+ for (i = AbstractTypeUsers.size(); AbstractTypeUsers[i-1] != U; --i)
+ assert(i != 0 && "AbstractTypeUser not in user list!");
+
+ --i; // Convert to be in range 0 <= i < size()
+ assert(i < AbstractTypeUsers.size() && "Index out of range!"); // Wraparound?
+
+ AbstractTypeUsers.erase(AbstractTypeUsers.begin()+i);
+
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << " remAbstractTypeUser[" << (void*)this << ", "
+ << *this << "][" << i << "] User = " << U << "\n";
+#endif
+
+ if (AbstractTypeUsers.empty() && getRefCount() == 0 && isAbstract()) {
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "DELETEing unused abstract type: <" << *this
+ << ">[" << (void*)this << "]" << "\n";
+#endif
+ this->destroy();
+ }
+}
+
+// refineAbstractTypeTo - This function is used when it is discovered that
+// the 'this' abstract type is actually equivalent to the NewType specified.
+// This causes all users of 'this' to switch to referencing the more concrete
+// type NewType, and causes 'this' to be deleted.
+//
+void DerivedType::refineAbstractTypeTo(const Type *NewType) {
+ assert(isAbstract() && "refineAbstractTypeTo: Current type is not abstract!");
+ assert(this != NewType && "Can't refine to myself!");
+ assert(ForwardType == 0 && "This type has already been refined!");
+
+ // The descriptions may be out of date. Conservatively clear them all!
+ if (AbstractTypeDescriptions.isConstructed())
+ AbstractTypeDescriptions->clear();
+
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "REFINING abstract type [" << (void*)this << " "
+ << *this << "] to [" << (void*)NewType << " "
+ << *NewType << "]!\n";
+#endif
+
+ // Make sure to put the type we are refining to into a holder so that if IT
+ // gets refined, we will not continue using a dead reference...
+ //
+ PATypeHolder NewTy(NewType);
+
+ // Any PATypeHolders referring to this type will now automatically forward to
+ // the type we are resolved to.
+ ForwardType = NewType;
+ if (NewType->isAbstract())
+ cast<DerivedType>(NewType)->addRef();
+
+ // Add a self use of the current type so that we don't delete ourself until
+ // after the function exits.
+ //
+ PATypeHolder CurrentTy(this);
+
+ // To make the situation simpler, we ask the subclass to remove this type from
+ // the type map, and to replace any type uses with uses of non-abstract types.
+ // This dramatically limits the amount of recursive type trouble we can find
+ // ourselves in.
+ dropAllTypeUses();
+
+ // Iterate over all of the uses of this type, invoking the callback. Each user
+ // should remove itself from our use list automatically. We have to check to
+ // make sure that NewTy doesn't _become_ 'this'. If it does, resolving types
+ // will not cause users to drop off of the use list. If we resolve to ourself
+ // we succeed!
+ //
+ while (!AbstractTypeUsers.empty() && NewTy != this) {
+ AbstractTypeUser *User = AbstractTypeUsers.back();
+
+ unsigned OldSize = AbstractTypeUsers.size(); OldSize=OldSize;
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << " REFINING user " << OldSize-1 << "[" << (void*)User
+ << "] of abstract type [" << (void*)this << " "
+ << *this << "] to [" << (void*)NewTy.get() << " "
+ << *NewTy << "]!\n";
+#endif
+ User->refineAbstractType(this, NewTy);
+
+ assert(AbstractTypeUsers.size() != OldSize &&
+ "AbsTyUser did not remove self from user list!");
+ }
+
+ // If we were successful removing all users from the type, 'this' will be
+ // deleted when the last PATypeHolder is destroyed or updated from this type.
+ // This may occur on exit of this function, as the CurrentTy object is
+ // destroyed.
+}
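+
+// Usage sketch (illustrative): refineAbstractTypeTo is what makes recursive
+// types constructible. A linked-list node can be built by introducing an
+// opaque placeholder, using it where the recursion occurs, and then unifying
+// the placeholder with the finished struct:
+//
+//   PATypeHolder NodeTy = OpaqueType::get();
+//   std::vector<const Type*> Fields;
+//   Fields.push_back(Type::Int32Ty);
+//   Fields.push_back(PointerType::get(NodeTy, 0));
+//   StructType *ST = StructType::get(Fields);
+//   cast<OpaqueType>(NodeTy.get())->refineAbstractTypeTo(ST);
+//   // NodeTy now transparently forwards to the concrete, uniqued struct.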
+
+// notifyUsesThatTypeBecameConcrete - Notify AbstractTypeUsers of this type that
+// the current type has transitioned from being abstract to being concrete.
+//
+void DerivedType::notifyUsesThatTypeBecameConcrete() {
+#ifdef DEBUG_MERGE_TYPES
+ DOUT << "typeIsREFINED type: " << (void*)this << " " << *this << "\n";
+#endif
+
+ unsigned OldSize = AbstractTypeUsers.size(); OldSize=OldSize;
+ while (!AbstractTypeUsers.empty()) {
+ AbstractTypeUser *ATU = AbstractTypeUsers.back();
+ ATU->typeBecameConcrete(this);
+
+ assert(AbstractTypeUsers.size() < OldSize-- &&
+ "AbstractTypeUser did not remove itself from the use list!");
+ }
+}
+
+// refineAbstractType - Called when a contained type is found to be more
+// concrete - this could potentially change us from an abstract type to a
+// concrete type.
+//
+void FunctionType::refineAbstractType(const DerivedType *OldType,
+ const Type *NewType) {
+ FunctionTypes->RefineAbstractType(this, OldType, NewType);
+}
+
+void FunctionType::typeBecameConcrete(const DerivedType *AbsTy) {
+ FunctionTypes->TypeBecameConcrete(this, AbsTy);
+}
+
+
+// refineAbstractType - Called when a contained type is found to be more
+// concrete - this could potentially change us from an abstract type to a
+// concrete type.
+//
+void ArrayType::refineAbstractType(const DerivedType *OldType,
+ const Type *NewType) {
+ ArrayTypes->RefineAbstractType(this, OldType, NewType);
+}
+
+void ArrayType::typeBecameConcrete(const DerivedType *AbsTy) {
+ ArrayTypes->TypeBecameConcrete(this, AbsTy);
+}
+
+// refineAbstractType - Called when a contained type is found to be more
+// concrete - this could potentially change us from an abstract type to a
+// concrete type.
+//
+void VectorType::refineAbstractType(const DerivedType *OldType,
+ const Type *NewType) {
+ VectorTypes->RefineAbstractType(this, OldType, NewType);
+}
+
+void VectorType::typeBecameConcrete(const DerivedType *AbsTy) {
+ VectorTypes->TypeBecameConcrete(this, AbsTy);
+}
+
+// refineAbstractType - Called when a contained type is found to be more
+// concrete - this could potentially change us from an abstract type to a
+// concrete type.
+//
+void StructType::refineAbstractType(const DerivedType *OldType,
+ const Type *NewType) {
+ StructTypes->RefineAbstractType(this, OldType, NewType);
+}
+
+void StructType::typeBecameConcrete(const DerivedType *AbsTy) {
+ StructTypes->TypeBecameConcrete(this, AbsTy);
+}
+
+// refineAbstractType - Called when a contained type is found to be more
+// concrete - this could potentially change us from an abstract type to a
+// concrete type.
+//
+void PointerType::refineAbstractType(const DerivedType *OldType,
+ const Type *NewType) {
+ PointerTypes->RefineAbstractType(this, OldType, NewType);
+}
+
+void PointerType::typeBecameConcrete(const DerivedType *AbsTy) {
+ PointerTypes->TypeBecameConcrete(this, AbsTy);
+}
+
+bool SequentialType::indexValid(const Value *V) const {
+ return isa<IntegerType>(V->getType());
+}
+
+namespace llvm {
+std::ostream &operator<<(std::ostream &OS, const Type *T) {
+ if (T == 0)
+ OS << "<null> value!\n";
+ else
+ T->print(OS);
+ return OS;
+}
+
+std::ostream &operator<<(std::ostream &OS, const Type &T) {
+ T.print(OS);
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Type &T) {
+ T.print(OS);
+ return OS;
+}
+}
diff --git a/lib/VMCore/TypeSymbolTable.cpp b/lib/VMCore/TypeSymbolTable.cpp
new file mode 100644
index 0000000..475d719
--- /dev/null
+++ b/lib/VMCore/TypeSymbolTable.cpp
@@ -0,0 +1,165 @@
+//===-- TypeSymbolTable.cpp - Implement the TypeSymbolTable class ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TypeSymbolTable class for the VMCore library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Streams.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_SYMBOL_TABLE 0
+#define DEBUG_ABSTYPE 0
+
+TypeSymbolTable::~TypeSymbolTable() {
+ // Drop all abstract type references in the type plane...
+ for (iterator TI = tmap.begin(), TE = tmap.end(); TI != TE; ++TI) {
+ if (TI->second->isAbstract()) // If abstract, drop the reference...
+ cast<DerivedType>(TI->second)->removeAbstractTypeUser(this);
+ }
+}
+
+std::string TypeSymbolTable::getUniqueName(const std::string &BaseName) const {
+ std::string TryName = BaseName;
+ const_iterator End = tmap.end();
+
+ // See if the name exists
+ while (tmap.find(TryName) != End) // Loop until we find a free
+ TryName = BaseName + utostr(++LastUnique); // name in the symbol table
+ return TryName;
+}
+
+// lookup a type by name - returns null on failure
+Type* TypeSymbolTable::lookup(const std::string& Name) const {
+ const_iterator TI = tmap.find(Name);
+ if (TI != tmap.end())
+ return const_cast<Type*>(TI->second);
+ return 0;
+}
+
+// remove - Remove a type from the symbol table...
+Type* TypeSymbolTable::remove(iterator Entry) {
+ assert(Entry != tmap.end() && "Invalid entry to remove!");
+
+ const Type* Result = Entry->second;
+
+#if DEBUG_SYMBOL_TABLE
+ dump();
+ cerr << " Removing Value: " << Result->getName() << "\n";
+#endif
+
+ tmap.erase(Entry);
+
+ // If we are removing an abstract type, remove the symbol table from its use
+ // list...
+ if (Result->isAbstract()) {
+#if DEBUG_ABSTYPE
+ cerr << "Removing abstract type from symtab"
+ << Result->getDescription()
+ << "\n";
+#endif
+ cast<DerivedType>(Result)->removeAbstractTypeUser(this);
+ }
+
+ return const_cast<Type*>(Result);
+}
+
+
+// insert - Insert a type into the symbol table with the specified name...
+void TypeSymbolTable::insert(const std::string& Name, const Type* T) {
+ assert(T && "Can't insert null type into symbol table!");
+
+ if (tmap.insert(make_pair(Name, T)).second) {
+ // Type inserted fine with no conflict.
+
+#if DEBUG_SYMBOL_TABLE
+ dump();
+ cerr << " Inserted type: " << Name << ": " << T->getDescription() << "\n";
+#endif
+ } else {
+ // There is a naming conflict; rename this type before inserting it.
+ std::string UniqueName = Name;
+ if (lookup(Name))
+ UniqueName = getUniqueName(Name);
+
+#if DEBUG_SYMBOL_TABLE
+ dump();
+ cerr << " Inserting type: " << UniqueName << ": "
+ << T->getDescription() << "\n";
+#endif
+
+ // Insert the tmap entry
+ tmap.insert(make_pair(UniqueName, T));
+ }
+
+ // If we are adding an abstract type, add the symbol table to its use list.
+ if (T->isAbstract()) {
+ cast<DerivedType>(T)->addAbstractTypeUser(this);
+#if DEBUG_ABSTYPE
+ cerr << "Added abstract type to ST: " << T->getDescription() << "\n";
+#endif
+ }
+}
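+
+// Illustrative usage of the conflict handling above (a hedged sketch, not
+// part of this file; the exact suffix chosen depends on LastUnique):
+//
+//   TypeSymbolTable TST;
+//   TST.insert("foo", Type::Int32Ty);  // inserted as "foo"
+//   TST.insert("foo", Type::FloatTy);  // conflict: renamed, e.g. to "foo1"
+//   Type *T = TST.lookup("foo");       // still yields Type::Int32Ty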
+
+// This function is called when one of the types in the type plane is refined
+void TypeSymbolTable::refineAbstractType(const DerivedType *OldType,
+ const Type *NewType) {
+
+ // Loop over all of the types in the symbol table, replacing any references
+ // to OldType with references to NewType. Note that there may be multiple
+ // occurrences, and although we only need to remove one at a time, it's
+ // faster to remove them all in one pass.
+ //
+ for (iterator I = begin(), E = end(); I != E; ++I) {
+ if (I->second == (Type*)OldType) { // FIXME when Types aren't const.
+#if DEBUG_ABSTYPE
+ cerr << "Removing type " << OldType->getDescription() << "\n";
+#endif
+ OldType->removeAbstractTypeUser(this);
+
+ I->second = (Type*)NewType; // TODO FIXME when types aren't const
+ if (NewType->isAbstract()) {
+#if DEBUG_ABSTYPE
+ cerr << "Added type " << NewType->getDescription() << "\n";
+#endif
+ cast<DerivedType>(NewType)->addAbstractTypeUser(this);
+ }
+ }
+ }
+}
+
+
+// Handle the situation where a type becomes concrete after being abstract.
+void TypeSymbolTable::typeBecameConcrete(const DerivedType *AbsTy) {
+ // Loop over all of the types in the symbol table, dropping any abstract
+ // type user entries for AbsTy which occur because there are names for the
+ // type.
+ for (iterator TI = begin(), TE = end(); TI != TE; ++TI)
+ if (TI->second == const_cast<Type*>(static_cast<const Type*>(AbsTy)))
+ AbsTy->removeAbstractTypeUser(this);
+}
+
+static void DumpTypes(const std::pair<const std::string, const Type*>& T ) {
+ cerr << " '" << T.first << "' = ";
+ T.second->dump();
+ cerr << "\n";
+}
+
+void TypeSymbolTable::dump() const {
+ cerr << "TypeSymbolPlane: ";
+ for_each(tmap.begin(), tmap.end(), DumpTypes);
+}
+
+// vim: sw=2 ai
diff --git a/lib/VMCore/Use.cpp b/lib/VMCore/Use.cpp
new file mode 100644
index 0000000..b25415a
--- /dev/null
+++ b/lib/VMCore/Use.cpp
@@ -0,0 +1,233 @@
+//===-- Use.cpp - Implement the Use class ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the algorithm for finding the User of a Use.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/User.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// Use swap Implementation
+//===----------------------------------------------------------------------===//
+
+void Use::swap(Use &RHS) {
+ Value *V1(Val);
+ Value *V2(RHS.Val);
+ if (V1 != V2) {
+ if (V1) {
+ removeFromList();
+ }
+
+ if (V2) {
+ RHS.removeFromList();
+ Val = V2;
+ V2->addUse(*this);
+ } else {
+ Val = 0;
+ }
+
+ if (V1) {
+ RHS.Val = V1;
+ V1->addUse(RHS);
+ } else {
+ RHS.Val = 0;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Use getImpliedUser Implementation
+//===----------------------------------------------------------------------===//
+
+const Use *Use::getImpliedUser() const {
+ const Use *Current = this;
+
+ while (true) {
+ unsigned Tag = (Current++)->Prev.getInt();
+ switch (Tag) {
+ case zeroDigitTag:
+ case oneDigitTag:
+ continue;
+
+ case stopTag: {
+ ++Current;
+ ptrdiff_t Offset = 1;
+ while (true) {
+ unsigned Tag = Current->Prev.getInt();
+ switch (Tag) {
+ case zeroDigitTag:
+ case oneDigitTag:
+ ++Current;
+ Offset = (Offset << 1) + Tag;
+ continue;
+ default:
+ return Current + Offset;
+ }
+ }
+ }
+
+ case fullStopTag:
+ return Current;
+ }
+ }
+}
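+
+// The loop above implements the "waymarking" scheme: the two spare low bits
+// of each Use's Prev pointer hold either a binary digit or a stop marker, so
+// walking forward from any Use reconstructs the offset to the end of the Use
+// array, where the User object lives. A hedged standalone sketch of just the
+// digit-decoding step (over a hypothetical Tags array, not this file's
+// types):
+//
+//   ptrdiff_t Offset = 1;
+//   while (Tags[I] == zeroDigitTag || Tags[I] == oneDigitTag)
+//     Offset = (Offset << 1) + Tags[I++];  // shift in the next binary digit
+//   // Offset now counts the Uses remaining up to the User object.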
+
+//===----------------------------------------------------------------------===//
+// Use initTags Implementation
+//===----------------------------------------------------------------------===//
+
+Use *Use::initTags(Use * const Start, Use *Stop, ptrdiff_t Done) {
+ ptrdiff_t Count = Done;
+ while (Start != Stop) {
+ --Stop;
+ Stop->Val = 0;
+ if (!Count) {
+ Stop->Prev.setFromOpaqueValue(reinterpret_cast<Use**>(Done == 0
+ ? fullStopTag
+ : stopTag));
+ ++Done;
+ Count = Done;
+ } else {
+ Stop->Prev.setFromOpaqueValue(reinterpret_cast<Use**>(Count & 1));
+ Count >>= 1;
+ ++Done;
+ }
+ }
+
+ return Start;
+}
+
+//===----------------------------------------------------------------------===//
+// Use zap Implementation
+//===----------------------------------------------------------------------===//
+
+void Use::zap(Use *Start, const Use *Stop, bool del) {
+ if (del) {
+ while (Start != Stop) {
+ (--Stop)->~Use();
+ }
+ ::operator delete(Start);
+ return;
+ }
+
+ while (Start != Stop) {
+ (Start++)->set(0);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// AugmentedUse layout struct
+//===----------------------------------------------------------------------===//
+
+struct AugmentedUse : Use {
+ PointerIntPair<User*, 1, Tag> ref;
+ AugmentedUse(); // not implemented
+};
+
+
+//===----------------------------------------------------------------------===//
+// Use getUser Implementation
+//===----------------------------------------------------------------------===//
+
+User *Use::getUser() const {
+ const Use *End = getImpliedUser();
+ const PointerIntPair<User*, 1, Tag>& ref(
+ static_cast<const AugmentedUse*>(End - 1)->ref);
+ User *She = ref.getPointer();
+ return ref.getInt()
+ ? She
+ : (User*)End;
+}
+
+//===----------------------------------------------------------------------===//
+// User allocHungoffUses Implementation
+//===----------------------------------------------------------------------===//
+
+Use *User::allocHungoffUses(unsigned N) const {
+ Use *Begin = static_cast<Use*>(::operator new(sizeof(Use) * N
+ + sizeof(AugmentedUse)
+ - sizeof(Use)));
+ Use *End = Begin + N;
+ PointerIntPair<User*, 1, Tag>& ref(static_cast<AugmentedUse&>(End[-1]).ref);
+ ref.setPointer(const_cast<User*>(this));
+ ref.setInt(tagOne);
+ return Use::initTags(Begin, End);
+}
+
+//===----------------------------------------------------------------------===//
+// User operator new Implementations
+//===----------------------------------------------------------------------===//
+
+void *User::operator new(size_t s, unsigned Us) {
+ void *Storage = ::operator new(s + sizeof(Use) * Us);
+ Use *Start = static_cast<Use*>(Storage);
+ Use *End = Start + Us;
+ User *Obj = reinterpret_cast<User*>(End);
+ Obj->OperandList = Start;
+ Obj->NumOperands = Us;
+ Use::initTags(Start, End);
+ return Obj;
+}
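+
+// Resulting layout (illustrative diagram): the Us operands are co-allocated
+// immediately before the User object itself.
+//
+//   [ Use #0 | Use #1 | ... | Use #Us-1 | User object ... ]
+//   ^-- OperandList                      ^-- pointer returned to the caller
+//
+// Placing the Uses directly before the User is what allows getUser() above
+// to recover the owning object from the end of the Use array.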
+
+/// Prefixed allocation - just before the first Use, allocate a NULL pointer.
+/// The destructor can detect its presence and readjust the OperandList
+/// for deletion.
+///
+void *User::operator new(size_t s, unsigned Us, bool Prefix) {
+ // Currently, prefixed allocation is only admissible for
+ // unconditional branch instructions.
+ if (!Prefix)
+ return operator new(s, Us);
+
+ assert(Us == 1 && "Other than one Use allocated?");
+ typedef PointerIntPair<void*, 2, Use::PrevPtrTag> TaggedPrefix;
+ void *Raw = ::operator new(s + sizeof(TaggedPrefix) + sizeof(Use) * Us);
+ TaggedPrefix *Pre = static_cast<TaggedPrefix*>(Raw);
+ Pre->setFromOpaqueValue(0);
+ void *Storage = Pre + 1; // skip over prefix
+ Use *Start = static_cast<Use*>(Storage);
+ Use *End = Start + Us;
+ User *Obj = reinterpret_cast<User*>(End);
+ Obj->OperandList = Start;
+ Obj->NumOperands = Us;
+ Use::initTags(Start, End);
+ return Obj;
+}
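+
+// With Prefix set, the layout gains one tagged null slot (illustrative):
+//
+//   [ TaggedPrefix (null) | Use #0 | User object ... ]
+//
+// operator delete below then checks the low bit of OperandList (tagged by
+// the destructor) to account for this prefix when freeing the allocation.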
+
+//===----------------------------------------------------------------------===//
+// User operator delete Implementation
+//===----------------------------------------------------------------------===//
+
+void User::operator delete(void *Usr) {
+ User *Start = static_cast<User*>(Usr);
+ Use *Storage = static_cast<Use*>(Usr) - Start->NumOperands;
+ //
+ // look for a variadic User
+ if (Storage == Start->OperandList) {
+ ::operator delete(Storage);
+ return;
+ }
+ //
+ // check for the flag whether the destructor has detected a prefixed
+ // allocation, in which case we remove the flag and delete starting
+ // at OperandList
+ if (reinterpret_cast<intptr_t>(Start->OperandList) & 1) {
+ ::operator delete(reinterpret_cast<char*>(Start->OperandList) - 1);
+ return;
+ }
+ //
+ // In all other cases just delete the nullary User (this covers hung-off
+ // uses as well).
+ ::operator delete(Usr);
+}
+
+} // End llvm namespace
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
new file mode 100644
index 0000000..3af161f
--- /dev/null
+++ b/lib/VMCore/Value.cpp
@@ -0,0 +1,581 @@
+//===-- Value.cpp - Implement the Value class -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Value, ValueHandle, and User classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Constant.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/ADT/DenseMap.h"
+#include <algorithm>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Value Class
+//===----------------------------------------------------------------------===//
+
+static inline const Type *checkType(const Type *Ty) {
+ assert(Ty && "Value defined with a null type: Error!");
+ return Ty;
+}
+
+Value::Value(const Type *ty, unsigned scid)
+ : SubclassID(scid), HasValueHandle(0), SubclassData(0), VTy(checkType(ty)),
+ UseList(0), Name(0) {
+ if (isa<CallInst>(this) || isa<InvokeInst>(this))
+ assert((VTy->isFirstClassType() || VTy == Type::VoidTy ||
+ isa<OpaqueType>(ty) || VTy->getTypeID() == Type::StructTyID) &&
+ "invalid CallInst type!");
+ else if (!isa<Constant>(this) && !isa<BasicBlock>(this))
+ assert((VTy->isFirstClassType() || VTy == Type::VoidTy ||
+ isa<OpaqueType>(ty)) &&
+ "Cannot create non-first-class values except for constants!");
+}
+
+Value::~Value() {
+ // Notify all ValueHandles (if present) that this value is going away.
+ if (HasValueHandle)
+ ValueHandleBase::ValueIsDeleted(this);
+
+#ifndef NDEBUG // Only in -g mode...
+ // Check to make sure that there are no uses of this value that are still
+ // around when the value is destroyed. If there are, then we have a dangling
+ // reference and something is wrong. This code is here to print out what is
+ // still being referenced. The value in question should be printed as
+ // a <badref>
+ //
+ if (!use_empty()) {
+ cerr << "While deleting: " << *VTy << " %" << getNameStr() << "\n";
+ for (use_iterator I = use_begin(), E = use_end(); I != E; ++I)
+ cerr << "Use still stuck around after Def is destroyed:"
+ << **I << "\n";
+ }
+#endif
+ assert(use_empty() && "Uses remain when a value is destroyed!");
+
+ // If this value is named, destroy the name. This should not be in a symtab
+ // at this point.
+ if (Name)
+ Name->Destroy();
+
+ // There should be no uses of this object anymore, remove it.
+ LeakDetector::removeGarbageObject(this);
+}
+
+/// hasNUses - Return true if this Value has exactly N users.
+///
+bool Value::hasNUses(unsigned N) const {
+ use_const_iterator UI = use_begin(), E = use_end();
+
+ for (; N; --N, ++UI)
+ if (UI == E) return false; // Too few.
+ return UI == E;
+}
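+
+// Note: hasNUses(N) scans at most N+1 uses and then stops, so e.g.
+// V->hasNUses(0) is equivalent to V->use_empty(); getNumUses() below, by
+// contrast, is linear in the total number of uses.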
+
+/// hasNUsesOrMore - Return true if this value has N users or more. This is
+/// logically equivalent to getNumUses() >= N.
+///
+bool Value::hasNUsesOrMore(unsigned N) const {
+ use_const_iterator UI = use_begin(), E = use_end();
+
+ for (; N; --N, ++UI)
+ if (UI == E) return false; // Too few.
+
+ return true;
+}
+
+/// isUsedInBasicBlock - Return true if this value is used in the specified
+/// basic block.
+bool Value::isUsedInBasicBlock(const BasicBlock *BB) const {
+ for (use_const_iterator I = use_begin(), E = use_end(); I != E; ++I) {
+ const Instruction *User = dyn_cast<Instruction>(*I);
+ if (User && User->getParent() == BB)
+ return true;
+ }
+ return false;
+}
+
+
+/// getNumUses - This method computes the number of uses of this Value. This
+/// is a linear time operation. Use hasOneUse or hasNUses to check for specific
+/// values.
+unsigned Value::getNumUses() const {
+ return (unsigned)std::distance(use_begin(), use_end());
+}
+
+static bool getSymTab(Value *V, ValueSymbolTable *&ST) {
+ ST = 0;
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (BasicBlock *P = I->getParent())
+ if (Function *PP = P->getParent())
+ ST = &PP->getValueSymbolTable();
+ } else if (BasicBlock *BB = dyn_cast<BasicBlock>(V)) {
+ if (Function *P = BB->getParent())
+ ST = &P->getValueSymbolTable();
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ if (Module *P = GV->getParent())
+ ST = &P->getValueSymbolTable();
+ } else if (Argument *A = dyn_cast<Argument>(V)) {
+ if (Function *P = A->getParent())
+ ST = &P->getValueSymbolTable();
+ } else {
+ assert(isa<Constant>(V) && "Unknown value type!");
+ return true; // no name is settable for this.
+ }
+ return false;
+}
+
+/// getNameStart - Return a pointer to a null terminated string for this name.
+/// Note that names can have null characters within the string as well as at
+/// their end. This always returns a non-null pointer.
+const char *Value::getNameStart() const {
+ if (Name == 0) return "";
+ return Name->getKeyData();
+}
+
+/// getNameLen - Return the length of the string, correctly handling nul
+/// characters embedded into them.
+unsigned Value::getNameLen() const {
+ return Name ? Name->getKeyLength() : 0;
+}
+
+/// isName - Return true if this value has the name specified by the provided
+/// nul terminated string.
+bool Value::isName(const char *N) const {
+ unsigned InLen = strlen(N);
+ return InLen == getNameLen() && memcmp(getNameStart(), N, InLen) == 0;
+}
+
+
+std::string Value::getNameStr() const {
+ if (Name == 0) return "";
+ return std::string(Name->getKeyData(),
+ Name->getKeyData()+Name->getKeyLength());
+}
+
+void Value::setName(const std::string &name) {
+ setName(name.data(), name.size());
+}
+
+void Value::setName(const char *Name) {
+ setName(Name, Name ? strlen(Name) : 0);
+}
+
+void Value::setName(const char *NameStr, unsigned NameLen) {
+ if (NameLen == 0 && !hasName()) return;
+ assert(getType() != Type::VoidTy && "Cannot assign a name to void values!");
+
+ // Get the symbol table to update for this object.
+ ValueSymbolTable *ST;
+ if (getSymTab(this, ST))
+ return; // Cannot set a name on this value (e.g. constant).
+
+ if (!ST) { // No symbol table to update? Just do the change.
+ if (NameLen == 0) {
+ // Free the name for this value.
+ Name->Destroy();
+ Name = 0;
+ return;
+ }
+
+ if (Name) {
+ // Name isn't changing?
+ if (NameLen == Name->getKeyLength() &&
+ !memcmp(Name->getKeyData(), NameStr, NameLen))
+ return;
+ Name->Destroy();
+ }
+
+ // NOTE: Could optimize for the case where the name is shrinking, to avoid
+ // deallocating and then reallocating.
+
+ // Create the new name.
+ Name = ValueName::Create(NameStr, NameStr+NameLen);
+ Name->setValue(this);
+ return;
+ }
+
+ // NOTE: Could optimize for the case where the name is shrinking, to avoid
+ // deallocating and then reallocating.
+ if (hasName()) {
+ // Name isn't changing?
+ if (NameLen == Name->getKeyLength() &&
+ !memcmp(Name->getKeyData(), NameStr, NameLen))
+ return;
+
+ // Remove old name.
+ ST->removeValueName(Name);
+ Name->Destroy();
+ Name = 0;
+
+ if (NameLen == 0)
+ return;
+ }
+
+ // Name is changing to something new.
+ Name = ST->createValueName(NameStr, NameLen, this);
+}
+
+
+/// takeName - transfer the name from V to this value, setting V's name to
+/// empty. It is an error to call V->takeName(V).
+void Value::takeName(Value *V) {
+ ValueSymbolTable *ST = 0;
+ // If this value has a name, drop it.
+ if (hasName()) {
+ // Get the symtab this is in.
+ if (getSymTab(this, ST)) {
+ // We can't set a name on this value, but we need to clear V's name if
+ // it has one.
+ if (V->hasName()) V->setName(0, 0);
+ return; // Cannot set a name on this value (e.g. constant).
+ }
+
+ // Remove old name.
+ if (ST)
+ ST->removeValueName(Name);
+ Name->Destroy();
+ Name = 0;
+ }
+
+ // Now we know that this has no name.
+
+ // If V has no name either, we're done.
+ if (!V->hasName()) return;
+
+ // Get this's symtab if we didn't before.
+ if (!ST) {
+ if (getSymTab(this, ST)) {
+ // Clear V's name.
+ V->setName(0, 0);
+ return; // Cannot set a name on this value (e.g. constant).
+ }
+ }
+
+ // Get V's ST; this should always succeed, because V has a name.
+ ValueSymbolTable *VST;
+ bool Failure = getSymTab(V, VST);
+ assert(!Failure && "V has a name, so it should have a ST!");
+ Failure=Failure; // Silence the 'unused variable' warning in -NDEBUG builds.
+
+ // If these values are both in the same symtab, we can do this very fast.
+ // This works even if both values have no symtab yet.
+ if (ST == VST) {
+ // Take the name!
+ Name = V->Name;
+ V->Name = 0;
+ Name->setValue(this);
+ return;
+ }
+
+ // Otherwise, things are slightly more complex. Remove V's name from VST and
+ // then reinsert it into ST.
+
+ if (VST)
+ VST->removeValueName(V->Name);
+ Name = V->Name;
+ V->Name = 0;
+ Name->setValue(this);
+
+ if (ST)
+ ST->reinsertValue(this);
+}
+
+
+// uncheckedReplaceAllUsesWith - This is exactly the same as replaceAllUsesWith,
+// except that it doesn't have all of the asserts. The asserts fail because we
+// are half-way done resolving types, which causes some types to exist as two
+// different Type*'s at the same time. This is a sledgehammer to work around
+// this problem.
+//
+void Value::uncheckedReplaceAllUsesWith(Value *New) {
+ // Notify all ValueHandles (if present) that this value is going away.
+ if (HasValueHandle)
+ ValueHandleBase::ValueIsRAUWd(this, New);
+
+ while (!use_empty()) {
+ Use &U = *UseList;
+ // Must handle Constants specially, we cannot call replaceUsesOfWith on a
+ // constant because they are uniqued.
+ if (Constant *C = dyn_cast<Constant>(U.getUser())) {
+ if (!isa<GlobalValue>(C)) {
+ C->replaceUsesOfWithOnConstant(this, New, &U);
+ continue;
+ }
+ }
+
+ U.set(New);
+ }
+}
+
+void Value::replaceAllUsesWith(Value *New) {
+ assert(New && "Value::replaceAllUsesWith(<null>) is invalid!");
+ assert(New != this && "this->replaceAllUsesWith(this) is NOT valid!");
+ assert(New->getType() == getType() &&
+ "replaceAllUses of value with new value of different type!");
+
+ uncheckedReplaceAllUsesWith(New);
+}
+
+Value *Value::stripPointerCasts() {
+ if (!isa<PointerType>(getType()))
+ return this;
+ Value *V = this;
+ do {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+ if (!CE->getOperand(i)->isNullValue())
+ return V;
+ V = CE->getOperand(0);
+ } else if (CE->getOpcode() == Instruction::BitCast) {
+ V = CE->getOperand(0);
+ } else {
+ return V;
+ }
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+ if (!GEP->hasAllZeroIndices())
+ return V;
+ V = GEP->getOperand(0);
+ } else if (BitCastInst *CI = dyn_cast<BitCastInst>(V)) {
+ V = CI->getOperand(0);
+ } else {
+ return V;
+ }
+ assert(isa<PointerType>(V->getType()) && "Unexpected operand type!");
+ } while (1);
+}
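+
+// Usage sketch (illustrative; the global and the casts are hypothetical IR):
+//
+//   // %p = bitcast of a getelementptr-with-all-zero-indices of global @g
+//   Value *Base = P->stripPointerCasts();  // walks both steps, yields @g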
+
+Value *Value::getUnderlyingObject() {
+ if (!isa<PointerType>(getType()))
+ return this;
+ Value *V = this;
+ unsigned MaxLookup = 6;
+ do {
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I))
+ return V;
+ V = I->getOperand(0);
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->getOpcode() != Instruction::BitCast &&
+ CE->getOpcode() != Instruction::GetElementPtr)
+ return V;
+ V = CE->getOperand(0);
+ } else {
+ return V;
+ }
+ assert(isa<PointerType>(V->getType()) && "Unexpected operand type!");
+ } while (--MaxLookup);
+ return V;
+}
+
+/// DoPHITranslation - If this value is a PHI node with CurBB as its parent,
+/// return the value in the PHI node corresponding to PredBB. If not, return
+/// ourself. This is useful if you want to know the value something has in a
+/// predecessor block.
+Value *Value::DoPHITranslation(const BasicBlock *CurBB,
+ const BasicBlock *PredBB) {
+ PHINode *PN = dyn_cast<PHINode>(this);
+ if (PN && PN->getParent() == CurBB)
+ return PN->getIncomingValueForBlock(PredBB);
+ return this;
+}
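+
+// Usage sketch (illustrative; the block and value names are hypothetical):
+//
+//   // %v = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]   ; in block %merge
+//   Value *InPred = V->DoPHITranslation(MergeBB, BB1);  // yields %a
+//   // For a non-PHI value, or a PHI in another block, it returns the
+//   // value itself unchanged.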
+
+//===----------------------------------------------------------------------===//
+// ValueHandleBase Class
+//===----------------------------------------------------------------------===//
+
+/// ValueHandles - This map keeps track of all of the value handles that are
+/// watching a Value*. The Value::HasValueHandle bit is used to know whether or
+/// not a value has an entry in this map.
+typedef DenseMap<Value*, ValueHandleBase*> ValueHandlesTy;
+static ManagedStatic<ValueHandlesTy> ValueHandles;
+
+/// AddToExistingUseList - Add this ValueHandle to the use list for VP, where
+/// List is known to point into the existing use list.
+void ValueHandleBase::AddToExistingUseList(ValueHandleBase **List) {
+ assert(List && "Handle list is null?");
+
+ // Splice ourselves into the list.
+ Next = *List;
+ *List = this;
+ setPrevPtr(List);
+ if (Next) {
+ Next->setPrevPtr(&Next);
+ assert(VP == Next->VP && "Added to wrong list?");
+ }
+}
+
+/// AddToUseList - Add this ValueHandle to the use list for VP.
+void ValueHandleBase::AddToUseList() {
+ assert(VP && "Null pointer doesn't have a use list!");
+ if (VP->HasValueHandle) {
+ // If this value already has a ValueHandle, then it must be in the
+ // ValueHandles map already.
+ ValueHandleBase *&Entry = (*ValueHandles)[VP];
+ assert(Entry != 0 && "Value doesn't have any handles?");
+ return AddToExistingUseList(&Entry);
+ }
+
+ // Ok, it doesn't have any handles yet, so we must insert it into the
+ // DenseMap. However, doing this insertion could cause the DenseMap to
+ // reallocate itself, which would invalidate all of the PrevP pointers that
+ // point into the old table. Handle this by checking for reallocation and
+ // updating the stale pointers only if needed.
+ ValueHandlesTy &Handles = *ValueHandles;
+ const void *OldBucketPtr = Handles.getPointerIntoBucketsArray();
+
+ ValueHandleBase *&Entry = Handles[VP];
+ assert(Entry == 0 && "Value really did already have handles?");
+ AddToExistingUseList(&Entry);
+ VP->HasValueHandle = true;
+
+ // If reallocation didn't happen or if this was the first insertion, don't
+ // walk the table.
+ if (Handles.isPointerIntoBucketsArray(OldBucketPtr) ||
+ Handles.size() == 1)
+ return;
+
+ // Okay, reallocation did happen. Fix the Prev Pointers.
+ for (ValueHandlesTy::iterator I = Handles.begin(), E = Handles.end();
+ I != E; ++I) {
+ assert(I->second && I->first == I->second->VP && "List invariant broken!");
+ I->second->setPrevPtr(&I->second);
+ }
+}
+
+/// RemoveFromUseList - Remove this ValueHandle from its current use list.
+void ValueHandleBase::RemoveFromUseList() {
+ assert(VP && VP->HasValueHandle && "Pointer doesn't have a use list!");
+
+ // Unlink this from its use list.
+ ValueHandleBase **PrevPtr = getPrevPtr();
+ assert(*PrevPtr == this && "List invariant broken");
+
+ *PrevPtr = Next;
+ if (Next) {
+ assert(Next->getPrevPtr() == &Next && "List invariant broken");
+ Next->setPrevPtr(PrevPtr);
+ return;
+ }
+
+ // If the Next pointer was null, then it is possible that this was the last
+ // ValueHandle watching VP. If so, delete its entry from the ValueHandles
+ // map.
+ ValueHandlesTy &Handles = *ValueHandles;
+ if (Handles.isPointerIntoBucketsArray(PrevPtr)) {
+ Handles.erase(VP);
+ VP->HasValueHandle = false;
+ }
+}
+
+
+void ValueHandleBase::ValueIsDeleted(Value *V) {
+ assert(V->HasValueHandle && "Should only be called if ValueHandles present");
+
+ // Get the linked list base, which is guaranteed to exist since the
+ // HasValueHandle flag is set.
+ ValueHandleBase *Entry = (*ValueHandles)[V];
+ assert(Entry && "Value bit set but no entries exist");
+
+ while (Entry) {
+ // Advance pointer to avoid invalidation.
+ ValueHandleBase *ThisNode = Entry;
+ Entry = Entry->Next;
+
+ switch (ThisNode->getKind()) {
+ case Assert:
+#ifndef NDEBUG // Only in -g mode...
+ cerr << "While deleting: " << *V->getType() << " %" << V->getNameStr()
+ << "\n";
+#endif
+ cerr << "An asserting value handle still pointed to this value!\n";
+ abort();
+ case Weak:
+ // Weak just goes to null, which will unlink it from the list.
+ ThisNode->operator=(0);
+ break;
+ case Callback:
+ // Forward to the subclass's implementation.
+ static_cast<CallbackVH*>(ThisNode)->deleted();
+ break;
+ }
+ }
+
+ // All callbacks and weak references should be dropped by now.
+ assert(!V->HasValueHandle && "All references to V were not removed?");
+}
+
+
+void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
+ assert(Old->HasValueHandle &&"Should only be called if ValueHandles present");
+ assert(Old != New && "Changing value into itself!");
+
+ // Get the linked list base, which is guaranteed to exist since the
+ // HasValueHandle flag is set.
+ ValueHandleBase *Entry = (*ValueHandles)[Old];
+ assert(Entry && "Value bit set but no entries exist");
+
+ while (Entry) {
+ // Advance pointer to avoid invalidation.
+ ValueHandleBase *ThisNode = Entry;
+ Entry = Entry->Next;
+
+ switch (ThisNode->getKind()) {
+ case Assert:
+ // Asserting handle does not follow RAUW implicitly.
+ break;
+ case Weak:
+ // Weak goes to the new value, which will unlink it from Old's list.
+ ThisNode->operator=(New);
+ break;
+ case Callback:
+ // Forward to the subclass's implementation.
+ static_cast<CallbackVH*>(ThisNode)->allUsesReplacedWith(New);
+ break;
+ }
+ }
+}
+
+/// ~CallbackVH. Empty, but defined here to avoid emitting the vtable
+/// more than once.
+CallbackVH::~CallbackVH() {}
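+
+// How the three handle kinds interact with the notifications above, as a
+// hedged sketch (WeakVH and AssertingVH are declared in
+// Support/ValueHandle.h; deleting a Value directly is for illustration only):
+//
+//   WeakVH WH(V);
+//   V->replaceAllUsesWith(NewV);  // ValueIsRAUWd: WH now tracks NewV
+//   delete NewV;                  // ValueIsDeleted: WH becomes null
+//   // An AssertingVH in the same situation would abort() on the deletion.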
+
+
+//===----------------------------------------------------------------------===//
+// User Class
+//===----------------------------------------------------------------------===//
+
+// replaceUsesOfWith - Replaces all references to the "From" definition with
+// references to the "To" definition.
+//
+void User::replaceUsesOfWith(Value *From, Value *To) {
+ if (From == To) return; // Replacing a value with itself is a no-op.
+
+ assert((!isa<Constant>(this) || isa<GlobalValue>(this)) &&
+ "Cannot call User::replaceUsesofWith on a constant!");
+
+ for (unsigned i = 0, E = getNumOperands(); i != E; ++i)
+ if (getOperand(i) == From) { // Is this operand pointing to oldval?
+ // The side effects of this setOperand call include linking to
+ // "To", adding "this" to the uses list of To, and
+ // most importantly, removing "this" from the use list of "From".
+ setOperand(i, To); // Fix it now...
+ }
+}
+
diff --git a/lib/VMCore/ValueSymbolTable.cpp b/lib/VMCore/ValueSymbolTable.cpp
new file mode 100644
index 0000000..eee18a1
--- /dev/null
+++ b/lib/VMCore/ValueSymbolTable.cpp
@@ -0,0 +1,137 @@
+//===-- ValueSymbolTable.cpp - Implement the ValueSymbolTable class -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ValueSymbolTable class for the VMCore library.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "valuesymtab"
+#include "llvm/GlobalValue.h"
+#include "llvm/Type.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+// Class destructor
+ValueSymbolTable::~ValueSymbolTable() {
+#ifndef NDEBUG // Only do this in -g mode...
+ for (iterator VI = vmap.begin(), VE = vmap.end(); VI != VE; ++VI)
+ cerr << "Value still in symbol table! Type = '"
+ << VI->getValue()->getType()->getDescription() << "' Name = '"
+ << VI->getKeyData() << "'\n";
+ assert(vmap.empty() && "Values remain in symbol table!");
+#endif
+}
+
+// lookup a value - Returns null on failure...
+//
+Value *ValueSymbolTable::lookup(const std::string &Name) const {
+ const_iterator VI = vmap.find(Name.data(), Name.data() + Name.size());
+ if (VI != vmap.end()) // We found the symbol
+ return VI->getValue();
+ return 0;
+}
+
+Value *ValueSymbolTable::lookup(const char *NameBegin,
+ const char *NameEnd) const {
+ const_iterator VI = vmap.find(NameBegin, NameEnd);
+ if (VI != vmap.end()) // We found the symbol
+ return VI->getValue();
+ return 0;
+}
+
+// Insert a value into the symbol table with the specified name...
+//
+void ValueSymbolTable::reinsertValue(Value* V) {
+ assert(V->hasName() && "Can't insert nameless Value into symbol table");
+
+ // Try inserting the name, assuming it won't conflict.
+ if (vmap.insert(V->Name)) {
+ //DOUT << " Inserted value: " << V->Name << ": " << *V << "\n";
+ return;
+ }
+
+ // Otherwise, there is a naming conflict. Rename this value.
+ SmallString<128> UniqueName(V->getNameStart(), V->getNameEnd());
+
+ // The name is already used; just free it so we can allocate a new name.
+ V->Name->Destroy();
+
+ unsigned BaseSize = UniqueName.size();
+ while (1) {
+ // Trim any suffix off.
+ UniqueName.resize(BaseSize);
+ UniqueName.append_uint_32(++LastUnique);
+ // Try to insert the vmap entry with this suffix.
+ ValueName &NewName =
+ vmap.GetOrCreateValue(UniqueName.data(),
+ UniqueName.data() + UniqueName.size());
+ if (NewName.getValue() == 0) {
+ // Newly inserted name. Success!
+ NewName.setValue(V);
+ V->Name = &NewName;
+ //DEBUG(DOUT << " Inserted value: " << UniqueName << ": " << *V << "\n");
+ return;
+ }
+ }
+}
+
+void ValueSymbolTable::removeValueName(ValueName *V) {
+ //DEBUG(DOUT << " Removing Value: " << V->getKeyData() << "\n");
+ // Remove the value from the symbol table.
+ vmap.remove(V);
+}
+
+/// createValueName - This method attempts to create a value name and insert
+/// it into the symbol table with the specified name. If it conflicts, it
+/// auto-renames the name and returns that instead.
+ValueName *ValueSymbolTable::createValueName(const char *NameStart,
+ unsigned NameLen, Value *V) {
+ // In the common case, the name is not already in the symbol table.
+ ValueName &Entry = vmap.GetOrCreateValue(NameStart, NameStart+NameLen);
+ if (Entry.getValue() == 0) {
+ Entry.setValue(V);
+ //DEBUG(DOUT << " Inserted value: " << Entry.getKeyData() << ": "
+ // << *V << "\n");
+ return &Entry;
+ }
+
+ // Otherwise, there is a naming conflict. Rename this value.
+ SmallString<128> UniqueName(NameStart, NameStart+NameLen);
+
+ while (1) {
+ // Trim any suffix off.
+ UniqueName.resize(NameLen);
+ UniqueName.append_uint_32(++LastUnique);
+
+ // Try to insert the vmap entry with this suffix.
+ ValueName &NewName =
+ vmap.GetOrCreateValue(UniqueName.data(),
+ UniqueName.data() + UniqueName.size());
+ if (NewName.getValue() == 0) {
+ // Newly inserted name. Success!
+ NewName.setValue(V);
+ //DEBUG(DOUT << " Inserted value: " << UniqueName << ": " << *V << "\n");
+ return &NewName;
+ }
+ }
+}
+
+
+// dump - print out the symbol table
+//
+void ValueSymbolTable::dump() const {
+ //DOUT << "ValueSymbolTable:\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ //DOUT << " '" << I->getKeyData() << "' = ";
+ I->getValue()->dump();
+ //DOUT << "\n";
+ }
+}
diff --git a/lib/VMCore/ValueTypes.cpp b/lib/VMCore/ValueTypes.cpp
new file mode 100644
index 0000000..fe4af05
--- /dev/null
+++ b/lib/VMCore/ValueTypes.cpp
@@ -0,0 +1,185 @@
+//===----------- ValueTypes.cpp - Implementation of MVT methods -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements methods in the CodeGen/ValueTypes.h header.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+using namespace llvm;
+
+MVT MVT::getExtendedIntegerVT(unsigned BitWidth) {
+ MVT VT;
+ VT.LLVMTy = IntegerType::get(BitWidth);
+ assert(VT.isExtended() && "Type is not extended!");
+ return VT;
+}
+
+MVT MVT::getExtendedVectorVT(MVT VT, unsigned NumElements) {
+ MVT ResultVT;
+ ResultVT.LLVMTy = VectorType::get(VT.getTypeForMVT(), NumElements);
+ assert(ResultVT.isExtended() && "Type is not extended!");
+ return ResultVT;
+}
+
+bool MVT::isExtendedFloatingPoint() const {
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy->isFPOrFPVector();
+}
+
+bool MVT::isExtendedInteger() const {
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy->isIntOrIntVector();
+}
+
+bool MVT::isExtendedVector() const {
+ assert(isExtended() && "Type is not extended!");
+ return isa<VectorType>(LLVMTy);
+}
+
+bool MVT::isExtended64BitVector() const {
+ return isExtendedVector() && getSizeInBits() == 64;
+}
+
+bool MVT::isExtended128BitVector() const {
+ return isExtendedVector() && getSizeInBits() == 128;
+}
+
+MVT MVT::getExtendedVectorElementType() const {
+ assert(isExtended() && "Type is not extended!");
+ return MVT::getMVT(cast<VectorType>(LLVMTy)->getElementType());
+}
+
+unsigned MVT::getExtendedVectorNumElements() const {
+ assert(isExtended() && "Type is not extended!");
+ return cast<VectorType>(LLVMTy)->getNumElements();
+}
+
+unsigned MVT::getExtendedSizeInBits() const {
+ assert(isExtended() && "Type is not extended!");
+ if (const IntegerType *ITy = dyn_cast<IntegerType>(LLVMTy))
+ return ITy->getBitWidth();
+ if (const VectorType *VTy = dyn_cast<VectorType>(LLVMTy))
+ return VTy->getBitWidth();
+ assert(false && "Unrecognized extended type!");
+ return 0; // Suppress warnings.
+}
+
+/// getMVTString - This function returns value type as a string, e.g. "i32".
+std::string MVT::getMVTString() const {
+ switch (V) {
+ default:
+ if (isVector())
+ return "v" + utostr(getVectorNumElements()) +
+ getVectorElementType().getMVTString();
+ if (isInteger())
+ return "i" + utostr(getSizeInBits());
+ assert(0 && "Invalid MVT!");
+ return "?";
+ case MVT::i1: return "i1";
+ case MVT::i8: return "i8";
+ case MVT::i16: return "i16";
+ case MVT::i32: return "i32";
+ case MVT::i64: return "i64";
+ case MVT::i128: return "i128";
+ case MVT::f32: return "f32";
+ case MVT::f64: return "f64";
+ case MVT::f80: return "f80";
+ case MVT::f128: return "f128";
+ case MVT::ppcf128: return "ppcf128";
+ case MVT::isVoid: return "isVoid";
+ case MVT::Other: return "ch";
+ case MVT::Flag: return "flag";
+ case MVT::v2i8: return "v2i8";
+ case MVT::v4i8: return "v4i8";
+ case MVT::v2i16: return "v2i16";
+ case MVT::v8i8: return "v8i8";
+ case MVT::v4i16: return "v4i16";
+ case MVT::v2i32: return "v2i32";
+ case MVT::v1i64: return "v1i64";
+ case MVT::v16i8: return "v16i8";
+ case MVT::v8i16: return "v8i16";
+ case MVT::v4i32: return "v4i32";
+ case MVT::v2i64: return "v2i64";
+ case MVT::v2f32: return "v2f32";
+ case MVT::v4f32: return "v4f32";
+ case MVT::v2f64: return "v2f64";
+ case MVT::v3i32: return "v3i32";
+ case MVT::v3f32: return "v3f32";
+ }
+}
+
+/// getTypeForMVT - This method returns an LLVM type corresponding to the
+/// specified MVT. For integer types, this returns an unsigned type. Note
+/// that this will abort for types that cannot be represented.
+const Type *MVT::getTypeForMVT() const {
+ switch (V) {
+ default:
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy;
+ case MVT::isVoid: return Type::VoidTy;
+ case MVT::i1: return Type::Int1Ty;
+ case MVT::i8: return Type::Int8Ty;
+ case MVT::i16: return Type::Int16Ty;
+ case MVT::i32: return Type::Int32Ty;
+ case MVT::i64: return Type::Int64Ty;
+ case MVT::i128: return IntegerType::get(128);
+ case MVT::f32: return Type::FloatTy;
+ case MVT::f64: return Type::DoubleTy;
+ case MVT::f80: return Type::X86_FP80Ty;
+ case MVT::f128: return Type::FP128Ty;
+ case MVT::ppcf128: return Type::PPC_FP128Ty;
+ case MVT::v2i8: return VectorType::get(Type::Int8Ty, 2);
+ case MVT::v4i8: return VectorType::get(Type::Int8Ty, 4);
+ case MVT::v2i16: return VectorType::get(Type::Int16Ty, 2);
+ case MVT::v8i8: return VectorType::get(Type::Int8Ty, 8);
+ case MVT::v4i16: return VectorType::get(Type::Int16Ty, 4);
+ case MVT::v2i32: return VectorType::get(Type::Int32Ty, 2);
+ case MVT::v1i64: return VectorType::get(Type::Int64Ty, 1);
+ case MVT::v16i8: return VectorType::get(Type::Int8Ty, 16);
+ case MVT::v8i16: return VectorType::get(Type::Int16Ty, 8);
+ case MVT::v4i32: return VectorType::get(Type::Int32Ty, 4);
+ case MVT::v2i64: return VectorType::get(Type::Int64Ty, 2);
+ case MVT::v2f32: return VectorType::get(Type::FloatTy, 2);
+ case MVT::v4f32: return VectorType::get(Type::FloatTy, 4);
+ case MVT::v2f64: return VectorType::get(Type::DoubleTy, 2);
+ case MVT::v3i32: return VectorType::get(Type::Int32Ty, 3);
+ case MVT::v3f32: return VectorType::get(Type::FloatTy, 3);
+ }
+}
+
+/// getMVT - Return the value type corresponding to the specified type. This
+/// returns all pointers as MVT::iPTR. If HandleUnknown is true, unknown types
+/// are returned as Other, otherwise they are invalid.
+MVT MVT::getMVT(const Type *Ty, bool HandleUnknown){
+ switch (Ty->getTypeID()) {
+ default:
+ if (HandleUnknown) return MVT::Other;
+ assert(0 && "Unknown type!");
+ return MVT::isVoid;
+ case Type::VoidTyID:
+ return MVT::isVoid;
+ case Type::IntegerTyID:
+ return getIntegerVT(cast<IntegerType>(Ty)->getBitWidth());
+ case Type::FloatTyID: return MVT::f32;
+ case Type::DoubleTyID: return MVT::f64;
+ case Type::X86_FP80TyID: return MVT::f80;
+ case Type::FP128TyID: return MVT::f128;
+ case Type::PPC_FP128TyID: return MVT::ppcf128;
+ case Type::PointerTyID: return MVT::iPTR;
+ case Type::VectorTyID: {
+ const VectorType *VTy = cast<VectorType>(Ty);
+ return getVectorVT(getMVT(VTy->getElementType(), false),
+ VTy->getNumElements());
+ }
+ }
+}
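+
+// A few illustrative mappings through getMVT (sketch, not exhaustive):
+//
+//   MVT::getMVT(Type::FloatTy)                         // MVT::f32
+//   MVT::getMVT(IntegerType::get(64))                  // MVT::i64
+//   MVT::getMVT(VectorType::get(Type::FloatTy, 4))     // MVT::v4f32
+//   MVT::getMVT(PointerType::getUnqual(Type::Int8Ty))  // MVT::iPTR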
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
new file mode 100644
index 0000000..59ec3be
--- /dev/null
+++ b/lib/VMCore/Verifier.cpp
@@ -0,0 +1,1770 @@
+//===-- Verifier.cpp - Implement the Module Verifier -------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the function verifier interface, which can be used for
+// some sanity checking of input to the system.
+//
+// Note that this does not provide full `Java style' security and verification;
+// instead it just tries to ensure that code is well-formed.
+//
+// * Both of a binary operator's parameters are of the same type
+// * Verify that the indices of mem access instructions match other operands
+// * Verify that arithmetic and other things are only performed on first-class
+// types. Verify that shifts & logicals only happen on integrals, for example.
+// * All of the constants in a switch statement are of the correct type
+// * The code is in valid SSA form
+// * It should be illegal to put a label into any other type (like a structure)
+// or to return one. [except constant arrays!]
+// * Only phi nodes can be self referential: 'add i32 %0, %0 ; <int>:0' is bad
+// * PHI nodes must have an entry for each predecessor, with no extras.
+// * PHI nodes must be the first thing in a basic block, all grouped together
+// * PHI nodes must have at least one entry
+// * All basic blocks should only end with terminator insts, not contain them
+// * The entry node to a function must not have predecessors
+// * All Instructions must be embedded into a basic block
+// * Functions cannot take a void-typed parameter
+// * Verify that a function's argument list agrees with its declared type.
+// * It is illegal to specify a name for a void value.
+// * It is illegal to have an internal global value with no initializer
+// * It is illegal to have a ret instruction that returns a value that does not
+// agree with the function return value type.
+// * Function call argument types match the function prototype
+// * All other things that are tested by asserts spread about the code...
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/MDNode.h"
+#include "llvm/Module.h"
+#include "llvm/ModuleProvider.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <sstream>
+#include <cstdarg>
+using namespace llvm;
+
+namespace { // Anonymous namespace for class
+ struct VISIBILITY_HIDDEN PreVerifier : public FunctionPass {
+ static char ID; // Pass ID, replacement for typeid
+
+ PreVerifier() : FunctionPass(&ID) { }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+
+ // Check that the prerequisites for successful DominatorTree construction
+ // are satisfied.
+ bool runOnFunction(Function &F) {
+ bool Broken = false;
+
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ if (I->empty() || !I->back().isTerminator()) {
+ cerr << "Basic Block does not have terminator!\n";
+ WriteAsOperand(*cerr, I, true);
+ cerr << "\n";
+ Broken = true;
+ }
+ }
+
+ if (Broken)
+ abort();
+
+ return false;
+ }
+ };
+}
+
+char PreVerifier::ID = 0;
+static RegisterPass<PreVerifier>
+PreVer("preverify", "Preliminary module verification");
+static const PassInfo *const PreVerifyID = &PreVer;
+
+namespace {
+ struct VISIBILITY_HIDDEN
+ Verifier : public FunctionPass, InstVisitor<Verifier> {
+ static char ID; // Pass ID, replacement for typeid
+ bool Broken; // Is this module found to be broken?
+ bool RealPass; // Are we not being run by a PassManager?
+ VerifierFailureAction action; // What to do if verification fails.
+ Module *Mod; // Module we are verifying right now
+ DominatorTree *DT; // Dominator Tree, caution can be null!
+ std::stringstream msgs; // A stringstream to collect messages
+
+ /// InstInThisBlock - when verifying a basic block, keep track of all of the
+ /// instructions we have seen so far. This allows us to do efficient
+ /// dominance checks for the case when an instruction has an operand that is
+ /// an instruction in the same block.
+ SmallPtrSet<Instruction*, 16> InstsInThisBlock;
+
+ Verifier()
+ : FunctionPass(&ID),
+ Broken(false), RealPass(true), action(AbortProcessAction),
+ DT(0), msgs( std::ios::app | std::ios::out ) {}
+ explicit Verifier(VerifierFailureAction ctn)
+ : FunctionPass(&ID),
+ Broken(false), RealPass(true), action(ctn), DT(0),
+ msgs( std::ios::app | std::ios::out ) {}
+ explicit Verifier(bool AB)
+ : FunctionPass(&ID),
+ Broken(false), RealPass(true),
+ action( AB ? AbortProcessAction : PrintMessageAction), DT(0),
+ msgs( std::ios::app | std::ios::out ) {}
+ explicit Verifier(DominatorTree &dt)
+ : FunctionPass(&ID),
+ Broken(false), RealPass(false), action(PrintMessageAction),
+ DT(&dt), msgs( std::ios::app | std::ios::out ) {}
+
+
+ bool doInitialization(Module &M) {
+ Mod = &M;
+ verifyTypeSymbolTable(M.getTypeSymbolTable());
+
+ // If this is a real pass, in a pass manager, we must abort before
+ // returning back to the pass manager, or else the pass manager may try to
+ // run other passes on the broken module.
+ if (RealPass)
+ return abortIfBroken();
+ return false;
+ }
+
+ bool runOnFunction(Function &F) {
+ // Get dominator information if we are being run by PassManager
+ if (RealPass) DT = &getAnalysis<DominatorTree>();
+
+ Mod = F.getParent();
+
+ visit(F);
+ InstsInThisBlock.clear();
+
+ // If this is a real pass, in a pass manager, we must abort before
+ // returning back to the pass manager, or else the pass manager may try to
+ // run other passes on the broken module.
+ if (RealPass)
+ return abortIfBroken();
+
+ return false;
+ }
+
+ bool doFinalization(Module &M) {
+ // Scan through, checking all of the external functions' linkage now...
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ visitGlobalValue(*I);
+
+ // Check to make sure function prototypes are okay.
+ if (I->isDeclaration()) visitFunction(*I);
+ }
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ visitGlobalVariable(*I);
+
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E; ++I)
+ visitGlobalAlias(*I);
+
+ // If the module is broken, abort at this time.
+ return abortIfBroken();
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredID(PreVerifyID);
+ if (RealPass)
+ AU.addRequired<DominatorTree>();
+ }
+
+ /// abortIfBroken - If the module is broken and we are supposed to abort on
+ /// this condition, do so.
+ ///
+ bool abortIfBroken() {
+ if (!Broken) return false;
+ msgs << "Broken module found, ";
+ switch (action) {
+ default: assert(0 && "Unknown action");
+ case AbortProcessAction:
+ msgs << "compilation aborted!\n";
+ cerr << msgs.str();
+ abort();
+ case PrintMessageAction:
+ msgs << "verification continues.\n";
+ cerr << msgs.str();
+ return false;
+ case ReturnStatusAction:
+ msgs << "compilation terminated.\n";
+ return true;
+ }
+ }
+
+
+ // Verification methods...
+ void verifyTypeSymbolTable(TypeSymbolTable &ST);
+ void visitGlobalValue(GlobalValue &GV);
+ void visitGlobalVariable(GlobalVariable &GV);
+ void visitGlobalAlias(GlobalAlias &GA);
+ void visitFunction(Function &F);
+ void visitBasicBlock(BasicBlock &BB);
+ using InstVisitor<Verifier>::visit;
+
+ void visit(Instruction &I);
+
+ void visitTruncInst(TruncInst &I);
+ void visitZExtInst(ZExtInst &I);
+ void visitSExtInst(SExtInst &I);
+ void visitFPTruncInst(FPTruncInst &I);
+ void visitFPExtInst(FPExtInst &I);
+ void visitFPToUIInst(FPToUIInst &I);
+ void visitFPToSIInst(FPToSIInst &I);
+ void visitUIToFPInst(UIToFPInst &I);
+ void visitSIToFPInst(SIToFPInst &I);
+ void visitIntToPtrInst(IntToPtrInst &I);
+ void visitPtrToIntInst(PtrToIntInst &I);
+ void visitBitCastInst(BitCastInst &I);
+ void visitPHINode(PHINode &PN);
+ void visitBinaryOperator(BinaryOperator &B);
+ void visitICmpInst(ICmpInst &IC);
+ void visitFCmpInst(FCmpInst &FC);
+ void visitExtractElementInst(ExtractElementInst &EI);
+ void visitInsertElementInst(InsertElementInst &EI);
+ void visitShuffleVectorInst(ShuffleVectorInst &EI);
+ void visitVAArgInst(VAArgInst &VAA) { visitInstruction(VAA); }
+ void visitCallInst(CallInst &CI);
+ void visitInvokeInst(InvokeInst &II);
+ void visitGetElementPtrInst(GetElementPtrInst &GEP);
+ void visitLoadInst(LoadInst &LI);
+ void visitStoreInst(StoreInst &SI);
+ void visitInstruction(Instruction &I);
+ void visitTerminatorInst(TerminatorInst &I);
+ void visitReturnInst(ReturnInst &RI);
+ void visitSwitchInst(SwitchInst &SI);
+ void visitSelectInst(SelectInst &SI);
+ void visitUserOp1(Instruction &I);
+ void visitUserOp2(Instruction &I) { visitUserOp1(I); }
+ void visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI);
+ void visitAllocationInst(AllocationInst &AI);
+ void visitExtractValueInst(ExtractValueInst &EVI);
+ void visitInsertValueInst(InsertValueInst &IVI);
+
+ void VerifyCallSite(CallSite CS);
+ bool PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty,
+ int VT, unsigned ArgNo, std::string &Suffix);
+ void VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F,
+ unsigned RetNum, unsigned ParamNum, ...);
+ void VerifyAttrs(Attributes Attrs, const Type *Ty,
+ bool isReturnValue, const Value *V);
+ void VerifyFunctionAttrs(const FunctionType *FT, const AttrListPtr &Attrs,
+ const Value *V);
+ bool VerifyMDNode(const MDNode *N);
+
+ void WriteValue(const Value *V) {
+ if (!V) return;
+ if (isa<Instruction>(V)) {
+ msgs << *V;
+ } else {
+ WriteAsOperand(msgs, V, true, Mod);
+ msgs << "\n";
+ }
+ }
+
+ void WriteType(const Type *T) {
+ if (!T) return;
+ raw_os_ostream RO(msgs);
+ RO << ' ';
+ WriteTypeSymbolic(RO, T, Mod);
+ }
+
+
+ // CheckFailed - A check failed, so print out the condition and the message
+ // that failed. This provides a nice place to put a breakpoint if you want
+ // to see why something is not correct.
+ void CheckFailed(const std::string &Message,
+ const Value *V1 = 0, const Value *V2 = 0,
+ const Value *V3 = 0, const Value *V4 = 0) {
+ msgs << Message << "\n";
+ WriteValue(V1);
+ WriteValue(V2);
+ WriteValue(V3);
+ WriteValue(V4);
+ Broken = true;
+ }
+
+ void CheckFailed( const std::string& Message, const Value* V1,
+ const Type* T2, const Value* V3 = 0 ) {
+ msgs << Message << "\n";
+ WriteValue(V1);
+ WriteType(T2);
+ WriteValue(V3);
+ Broken = true;
+ }
+ };
+} // End anonymous namespace
+
+char Verifier::ID = 0;
+static RegisterPass<Verifier> X("verify", "Module Verifier");
+
+// Assert - We know that cond should be true, if not print an error message.
+#define Assert(C, M) \
+ do { if (!(C)) { CheckFailed(M); return; } } while (0)
+#define Assert1(C, M, V1) \
+ do { if (!(C)) { CheckFailed(M, V1); return; } } while (0)
+#define Assert2(C, M, V1, V2) \
+ do { if (!(C)) { CheckFailed(M, V1, V2); return; } } while (0)
+#define Assert3(C, M, V1, V2, V3) \
+ do { if (!(C)) { CheckFailed(M, V1, V2, V3); return; } } while (0)
+#define Assert4(C, M, V1, V2, V3, V4) \
+ do { if (!(C)) { CheckFailed(M, V1, V2, V3, V4); return; } } while (0)
+
+void Verifier::visit(Instruction &I) {
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+ Assert1(I.getOperand(i) != 0, "Operand is null", &I);
+ InstVisitor<Verifier>::visit(I);
+}
+
+
+void Verifier::visitGlobalValue(GlobalValue &GV) {
+ Assert1(!GV.isDeclaration() ||
+ GV.hasExternalLinkage() ||
+ GV.hasDLLImportLinkage() ||
+ GV.hasExternalWeakLinkage() ||
+ GV.hasGhostLinkage() ||
+ (isa<GlobalAlias>(GV) &&
+ (GV.hasLocalLinkage() || GV.hasWeakLinkage())),
+ "Global is external, but doesn't have external or dllimport or weak linkage!",
+ &GV);
+
+ Assert1(!GV.hasDLLImportLinkage() || GV.isDeclaration(),
+ "Global is marked as dllimport, but not external", &GV);
+
+ Assert1(!GV.hasAppendingLinkage() || isa<GlobalVariable>(GV),
+ "Only global variables can have appending linkage!", &GV);
+
+ if (GV.hasAppendingLinkage()) {
+ GlobalVariable &GVar = cast<GlobalVariable>(GV);
+ Assert1(isa<ArrayType>(GVar.getType()->getElementType()),
+ "Only global arrays can have appending linkage!", &GV);
+ }
+}
+
+void Verifier::visitGlobalVariable(GlobalVariable &GV) {
+ if (GV.hasInitializer()) {
+ Assert1(GV.getInitializer()->getType() == GV.getType()->getElementType(),
+ "Global variable initializer type does not match global "
+ "variable type!", &GV);
+
+ // Verify that any metadata used in a global initializer points only to
+ // other globals.
+ if (MDNode *FirstNode = dyn_cast<MDNode>(GV.getInitializer())) {
+ if (VerifyMDNode(FirstNode)) {
+ SmallVector<const MDNode *, 4> NodesToAnalyze;
+ NodesToAnalyze.push_back(FirstNode);
+ while (!NodesToAnalyze.empty()) {
+ const MDNode *N = NodesToAnalyze.back();
+ NodesToAnalyze.pop_back();
+
+ for (MDNode::const_elem_iterator I = N->elem_begin(),
+ E = N->elem_end(); I != E; ++I)
+ if (const Value *V = *I) {
+ if (const MDNode *Next = dyn_cast<MDNode>(V))
+ NodesToAnalyze.push_back(Next);
+ else
+ Assert3(isa<Constant>(V),
+ "reference to instruction from global metadata node",
+ &GV, N, V);
+ }
+ }
+ }
+ }
+ } else {
+ Assert1(GV.hasExternalLinkage() || GV.hasDLLImportLinkage() ||
+ GV.hasExternalWeakLinkage(),
+ "invalid linkage type for global declaration", &GV);
+ }
+
+ visitGlobalValue(GV);
+}
+
+void Verifier::visitGlobalAlias(GlobalAlias &GA) {
+ Assert1(!GA.getName().empty(),
+ "Alias name cannot be empty!", &GA);
+ Assert1(GA.hasExternalLinkage() || GA.hasLocalLinkage() ||
+ GA.hasWeakLinkage(),
+ "Alias should have external or external weak linkage!", &GA);
+ Assert1(GA.getAliasee(),
+ "Aliasee cannot be NULL!", &GA);
+ Assert1(GA.getType() == GA.getAliasee()->getType(),
+ "Alias and aliasee types should match!", &GA);
+
+ if (!isa<GlobalValue>(GA.getAliasee())) {
+ const ConstantExpr *CE = dyn_cast<ConstantExpr>(GA.getAliasee());
+ Assert1(CE &&
+ (CE->getOpcode() == Instruction::BitCast ||
+ CE->getOpcode() == Instruction::GetElementPtr) &&
+ isa<GlobalValue>(CE->getOperand(0)),
+ "Aliasee should be either GlobalValue or bitcast of GlobalValue",
+ &GA);
+ }
+
+ const GlobalValue* Aliasee = GA.resolveAliasedGlobal(/*stopOnWeak*/ false);
+ Assert1(Aliasee,
+ "Aliasing chain should end with function or global variable", &GA);
+
+ visitGlobalValue(GA);
+}
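+
+// For illustration (editorial sketch, not part of the original source): an
+// alias such as
+//   @a = alias i32* @g
+// is accepted only if @g (or a bitcast/GEP constant of a global) also has
+// type i32*, matching the type of the alias itself.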
+
+void Verifier::verifyTypeSymbolTable(TypeSymbolTable &ST) {
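+  // Currently nothing to check here.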
+}
+
+// VerifyAttrs - Check the given parameter attributes for an argument or return
+// value of the specified type. The value V is printed in error messages.
+void Verifier::VerifyAttrs(Attributes Attrs, const Type *Ty,
+ bool isReturnValue, const Value *V) {
+ if (Attrs == Attribute::None)
+ return;
+
+ if (isReturnValue) {
+ Attributes RetI = Attrs & Attribute::ParameterOnly;
+ Assert1(!RetI, "Attribute " + Attribute::getAsString(RetI) +
+ " does not apply to return values!", V);
+ }
+ Attributes FnCheckAttr = Attrs & Attribute::FunctionOnly;
+ Assert1(!FnCheckAttr, "Attribute " + Attribute::getAsString(FnCheckAttr) +
+ " only applies to functions!", V);
+
+ for (unsigned i = 0;
+ i < array_lengthof(Attribute::MutuallyIncompatible); ++i) {
+ Attributes MutI = Attrs & Attribute::MutuallyIncompatible[i];
+ Assert1(!(MutI & (MutI - 1)), "Attributes " +
+ Attribute::getAsString(MutI) + " are incompatible!", V);
+ }
+
+ Attributes TypeI = Attrs & Attribute::typeIncompatible(Ty);
+ Assert1(!TypeI, "Wrong type for attribute " +
+ Attribute::getAsString(TypeI), V);
+
+ Attributes ByValI = Attrs & Attribute::ByVal;
+ if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) {
+ Assert1(!ByValI || PTy->getElementType()->isSized(),
+ "Attribute " + Attribute::getAsString(ByValI) +
+ " does not support unsized types!", V);
+ } else {
+ Assert1(!ByValI,
+ "Attribute " + Attribute::getAsString(ByValI) +
+ " only applies to parameters with pointer type!", V);
+ }
+}
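+
+// For illustration (editorial sketch, not part of the original source): byval
+// on a non-pointer parameter, e.g. declare void @f(i32 byval), trips the
+// pointer check above, while declare void @f(i32* byval) is accepted.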
+
+// VerifyFunctionAttrs - Check parameter attributes against a function type.
+// The value V is printed in error messages.
+void Verifier::VerifyFunctionAttrs(const FunctionType *FT,
+ const AttrListPtr &Attrs,
+ const Value *V) {
+ if (Attrs.isEmpty())
+ return;
+
+ bool SawNest = false;
+
+ for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
+ const AttributeWithIndex &Attr = Attrs.getSlot(i);
+
+ const Type *Ty;
+ if (Attr.Index == 0)
+ Ty = FT->getReturnType();
+ else if (Attr.Index-1 < FT->getNumParams())
+ Ty = FT->getParamType(Attr.Index-1);
+ else
+ break; // VarArgs attributes, don't verify.
+
+ VerifyAttrs(Attr.Attrs, Ty, Attr.Index == 0, V);
+
+ if (Attr.Attrs & Attribute::Nest) {
+ Assert1(!SawNest, "More than one parameter has attribute nest!", V);
+ SawNest = true;
+ }
+
+ if (Attr.Attrs & Attribute::StructRet)
+ Assert1(Attr.Index == 1, "Attribute sret not on first parameter!", V);
+ }
+
+ Attributes FAttrs = Attrs.getFnAttributes();
+ Assert1(!(FAttrs & (~Attribute::FunctionOnly)),
+ "Attribute " + Attribute::getAsString(FAttrs) +
+ " does not apply to function!", V);
+
+ for (unsigned i = 0;
+ i < array_lengthof(Attribute::MutuallyIncompatible); ++i) {
+ Attributes MutI = FAttrs & Attribute::MutuallyIncompatible[i];
+ Assert1(!(MutI & (MutI - 1)), "Attributes " +
+ Attribute::getAsString(MutI) + " are incompatible!", V);
+ }
+}
+
+static bool VerifyAttributeCount(const AttrListPtr &Attrs, unsigned Params) {
+ if (Attrs.isEmpty())
+ return true;
+
+ unsigned LastSlot = Attrs.getNumSlots() - 1;
+ unsigned LastIndex = Attrs.getSlot(LastSlot).Index;
+ if (LastIndex <= Params
+ || (LastIndex == (unsigned)~0
+ && (LastSlot == 0 || Attrs.getSlot(LastSlot - 1).Index <= Params)))
+ return true;
+
+ return false;
+}
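+
+// For illustration (editorial note, not in the original source): slot Index 0
+// names the return value, indices 1..NumParams name the parameters, and ~0U
+// names the function itself, so for a two-parameter function the last slot
+// index must be at most 2 unless it is the function slot.
+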
+// visitFunction - Verify that a function is ok.
+//
+void Verifier::visitFunction(Function &F) {
+ // Check function arguments.
+ const FunctionType *FT = F.getFunctionType();
+ unsigned NumArgs = F.arg_size();
+
+ Assert2(FT->getNumParams() == NumArgs,
+ "# formal arguments must match # of arguments for function type!",
+ &F, FT);
+ Assert1(F.getReturnType()->isFirstClassType() ||
+ F.getReturnType() == Type::VoidTy ||
+ isa<StructType>(F.getReturnType()),
+ "Functions cannot return aggregate values!", &F);
+
+ Assert1(!F.hasStructRetAttr() || F.getReturnType() == Type::VoidTy,
+ "Invalid struct return type!", &F);
+
+ const AttrListPtr &Attrs = F.getAttributes();
+
+ Assert1(VerifyAttributeCount(Attrs, FT->getNumParams()),
+ "Attributes after last parameter!", &F);
+
+ // Check function attributes.
+ VerifyFunctionAttrs(FT, Attrs, &F);
+
+ // Check that this function meets the restrictions on this calling convention.
+ switch (F.getCallingConv()) {
+ default:
+ break;
+ case CallingConv::C:
+ break;
+ case CallingConv::Fast:
+ case CallingConv::Cold:
+ case CallingConv::X86_FastCall:
+ Assert1(!F.isVarArg(),
+ "Varargs functions must have C calling conventions!", &F);
+ break;
+ }
+
+ bool isLLVMdotName = F.getName().size() >= 5 &&
+ F.getName().substr(0, 5) == "llvm.";
+ if (!isLLVMdotName)
+ Assert1(F.getReturnType() != Type::MetadataTy,
+ "Function may not return metadata unless it's an intrinsic", &F);
+
+ // Check that the argument values match the function type for this function...
+ unsigned i = 0;
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+ I != E; ++I, ++i) {
+ Assert2(I->getType() == FT->getParamType(i),
+ "Argument value does not match function argument type!",
+ I, FT->getParamType(i));
+ Assert1(I->getType()->isFirstClassType(),
+ "Function arguments must have first-class types!", I);
+ if (!isLLVMdotName)
+ Assert2(I->getType() != Type::MetadataTy,
+ "Function takes metadata but isn't an intrinsic", I, &F);
+ }
+
+ if (F.isDeclaration()) {
+ Assert1(F.hasExternalLinkage() || F.hasDLLImportLinkage() ||
+ F.hasExternalWeakLinkage() || F.hasGhostLinkage(),
+ "invalid linkage type for function declaration", &F);
+ } else {
+ // Verify that this function (which has a body) is not named "llvm.*". It
+ // is not legal to define intrinsics.
+ Assert1(!isLLVMdotName, "llvm intrinsics cannot be defined!", &F);
+
+ // Check the entry node
+ BasicBlock *Entry = &F.getEntryBlock();
+ Assert1(pred_begin(Entry) == pred_end(Entry),
+ "Entry block to function must not have predecessors!", Entry);
+ }
+}
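+
+// For illustration (editorial sketch, not part of the original source):
+//   declare fastcc void @f(i32, ...)
+// is rejected by the calling-convention check above, since varargs functions
+// must use the C calling convention.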
+
+
+// visitBasicBlock - Verify that a basic block is well formed...
+//
+void Verifier::visitBasicBlock(BasicBlock &BB) {
+ InstsInThisBlock.clear();
+
+ // Ensure that basic blocks have terminators!
+ Assert1(BB.getTerminator(), "Basic Block does not have terminator!", &BB);
+
+ // Check constraints that this basic block imposes on all of the PHI nodes in
+ // it.
+ if (isa<PHINode>(BB.front())) {
+ SmallVector<BasicBlock*, 8> Preds(pred_begin(&BB), pred_end(&BB));
+ SmallVector<std::pair<BasicBlock*, Value*>, 8> Values;
+ std::sort(Preds.begin(), Preds.end());
+ PHINode *PN;
+ for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast<PHINode>(I));++I) {
+
+ // Ensure that PHI nodes have at least one entry!
+ Assert1(PN->getNumIncomingValues() != 0,
+ "PHI nodes must have at least one entry. If the block is dead, "
+ "the PHI should be removed!", PN);
+ Assert1(PN->getNumIncomingValues() == Preds.size(),
+ "PHINode should have one entry for each predecessor of its "
+ "parent basic block!", PN);
+
+ // Get and sort all incoming values in the PHI node...
+ Values.clear();
+ Values.reserve(PN->getNumIncomingValues());
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ Values.push_back(std::make_pair(PN->getIncomingBlock(i),
+ PN->getIncomingValue(i)));
+ std::sort(Values.begin(), Values.end());
+
+ for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+ // Check to make sure that if there is more than one entry for a
+ // particular basic block in this PHI node, that the incoming values are
+ // all identical.
+ //
+ Assert4(i == 0 || Values[i].first != Values[i-1].first ||
+ Values[i].second == Values[i-1].second,
+ "PHI node has multiple entries for the same basic block with "
+ "different incoming values!", PN, Values[i].first,
+ Values[i].second, Values[i-1].second);
+
+ // Check to make sure that the predecessors and PHI node entries are
+ // matched up.
+ Assert3(Values[i].first == Preds[i],
+ "PHI node entries do not match predecessors!", PN,
+ Values[i].first, Preds[i]);
+ }
+ }
+ }
+}
+
+void Verifier::visitTerminatorInst(TerminatorInst &I) {
+ // Ensure that terminators only exist at the end of the basic block.
+ Assert1(&I == I.getParent()->getTerminator(),
+ "Terminator found in the middle of a basic block!", I.getParent());
+ visitInstruction(I);
+}
+
+void Verifier::visitReturnInst(ReturnInst &RI) {
+ Function *F = RI.getParent()->getParent();
+ unsigned N = RI.getNumOperands();
+ if (F->getReturnType() == Type::VoidTy)
+ Assert2(N == 0,
+ "Found return instr that returns non-void in Function of void "
+ "return type!", &RI, F->getReturnType());
+ else if (N == 1 && F->getReturnType() == RI.getOperand(0)->getType()) {
+ // Exactly one return value and it matches the return type. Good.
+ } else if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+ // The return type is a struct; check for multiple return values.
+ Assert2(STy->getNumElements() == N,
+ "Incorrect number of return values in ret instruction!",
+ &RI, F->getReturnType());
+ for (unsigned i = 0; i != N; ++i)
+ Assert2(STy->getElementType(i) == RI.getOperand(i)->getType(),
+ "Function return type does not match operand "
+ "type of return inst!", &RI, F->getReturnType());
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(F->getReturnType())) {
+ // The return type is an array; check for multiple return values.
+ Assert2(ATy->getNumElements() == N,
+ "Incorrect number of return values in ret instruction!",
+ &RI, F->getReturnType());
+ for (unsigned i = 0; i != N; ++i)
+ Assert2(ATy->getElementType() == RI.getOperand(i)->getType(),
+ "Function return type does not match operand "
+ "type of return inst!", &RI, F->getReturnType());
+ } else {
+ CheckFailed("Function return type does not match operand "
+ "type of return inst!", &RI, F->getReturnType());
+ }
+
+ // Check to make sure that the return value has necessary properties for
+ // terminators...
+ visitTerminatorInst(RI);
+}
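+
+// For illustration (editorial sketch, not part of the original source): in
+// the multi-operand ret form of this era, a function returning {i32, i32}
+// may write
+//   ret i32 %a, i32 %b
+// and the operand count and element types must match the struct exactly.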
+
+void Verifier::visitSwitchInst(SwitchInst &SI) {
+ // Check to make sure that all of the constants in the switch instruction
+ // have the same type as the switched-on value.
+ const Type *SwitchTy = SI.getCondition()->getType();
+ for (unsigned i = 1, e = SI.getNumCases(); i != e; ++i)
+ Assert1(SI.getCaseValue(i)->getType() == SwitchTy,
+ "Switch constants must all be same type as switch value!", &SI);
+
+ visitTerminatorInst(SI);
+}
+
+void Verifier::visitSelectInst(SelectInst &SI) {
+ Assert1(!SelectInst::areInvalidOperands(SI.getOperand(0), SI.getOperand(1),
+ SI.getOperand(2)),
+ "Invalid operands for select instruction!", &SI);
+
+ Assert1(SI.getTrueValue()->getType() == SI.getType(),
+ "Select values must have same type as select instruction!", &SI);
+ visitInstruction(SI);
+}
+
+
+/// visitUserOp1 - User-defined operators shouldn't live beyond the lifetime of
+/// a pass; if any exist, it's an error.
+///
+void Verifier::visitUserOp1(Instruction &I) {
+ Assert1(0, "User-defined operators should not live outside of a pass!", &I);
+}
+
+void Verifier::visitTruncInst(TruncInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ // Get the size of the types in bits, we'll need this later
+ unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+ Assert1(SrcTy->isIntOrIntVector(), "Trunc only operates on integer", &I);
+ Assert1(DestTy->isIntOrIntVector(), "Trunc only produces integer", &I);
+ Assert1(isa<VectorType>(SrcTy) == isa<VectorType>(DestTy),
+ "trunc source and destination must both be a vector or neither", &I);
+ Assert1(SrcBitSize > DestBitSize,"DestTy too big for Trunc", &I);
+
+ visitInstruction(I);
+}
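+
+// For illustration (editorial sketch, not part of the original source):
+//   %t = trunc i32 %x to i8    ; accepted, 32 > 8
+//   %u = trunc i8 %y to i32    ; rejected, widening needs zext/sext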
+
+void Verifier::visitZExtInst(ZExtInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ // Get the size of the types in bits, we'll need this later
+ Assert1(SrcTy->isIntOrIntVector(), "ZExt only operates on integer", &I);
+ Assert1(DestTy->isIntOrIntVector(), "ZExt only produces an integer", &I);
+ Assert1(isa<VectorType>(SrcTy) == isa<VectorType>(DestTy),
+ "zext source and destination must both be a vector or neither", &I);
+ unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+ Assert1(SrcBitSize < DestBitSize,"Type too small for ZExt", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitSExtInst(SExtInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ // Get the size of the types in bits, we'll need this later
+ unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+ Assert1(SrcTy->isIntOrIntVector(), "SExt only operates on integer", &I);
+ Assert1(DestTy->isIntOrIntVector(), "SExt only produces an integer", &I);
+ Assert1(isa<VectorType>(SrcTy) == isa<VectorType>(DestTy),
+ "sext source and destination must both be a vector or neither", &I);
+ Assert1(SrcBitSize < DestBitSize,"Type too small for SExt", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitFPTruncInst(FPTruncInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+ // Get the size of the types in bits, we'll need this later
+ unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+ Assert1(SrcTy->isFPOrFPVector(),"FPTrunc only operates on FP", &I);
+ Assert1(DestTy->isFPOrFPVector(),"FPTrunc only produces an FP", &I);
+ Assert1(isa<VectorType>(SrcTy) == isa<VectorType>(DestTy),
+ "fptrunc source and destination must both be a vector or neither",&I);
+ Assert1(SrcBitSize > DestBitSize,"DestTy too big for FPTrunc", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitFPExtInst(FPExtInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ // Get the size of the types in bits, we'll need this later
+ unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+ Assert1(SrcTy->isFPOrFPVector(),"FPExt only operates on FP", &I);
+ Assert1(DestTy->isFPOrFPVector(),"FPExt only produces an FP", &I);
+ Assert1(isa<VectorType>(SrcTy) == isa<VectorType>(DestTy),
+ "fpext source and destination must both be a vector or neither", &I);
+ Assert1(SrcBitSize < DestBitSize,"DestTy too small for FPExt", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitUIToFPInst(UIToFPInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ bool SrcVec = isa<VectorType>(SrcTy);
+ bool DstVec = isa<VectorType>(DestTy);
+
+ Assert1(SrcVec == DstVec,
+ "UIToFP source and dest must both be vector or scalar", &I);
+ Assert1(SrcTy->isIntOrIntVector(),
+ "UIToFP source must be integer or integer vector", &I);
+ Assert1(DestTy->isFPOrFPVector(),
+ "UIToFP result must be FP or FP vector", &I);
+
+ if (SrcVec && DstVec)
+ Assert1(cast<VectorType>(SrcTy)->getNumElements() ==
+ cast<VectorType>(DestTy)->getNumElements(),
+ "UIToFP source and dest vector length mismatch", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitSIToFPInst(SIToFPInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+  bool SrcVec = isa<VectorType>(SrcTy);
+  bool DstVec = isa<VectorType>(DestTy);
+
+ Assert1(SrcVec == DstVec,
+ "SIToFP source and dest must both be vector or scalar", &I);
+ Assert1(SrcTy->isIntOrIntVector(),
+ "SIToFP source must be integer or integer vector", &I);
+ Assert1(DestTy->isFPOrFPVector(),
+ "SIToFP result must be FP or FP vector", &I);
+
+ if (SrcVec && DstVec)
+ Assert1(cast<VectorType>(SrcTy)->getNumElements() ==
+ cast<VectorType>(DestTy)->getNumElements(),
+ "SIToFP source and dest vector length mismatch", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitFPToUIInst(FPToUIInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ bool SrcVec = isa<VectorType>(SrcTy);
+ bool DstVec = isa<VectorType>(DestTy);
+
+ Assert1(SrcVec == DstVec,
+ "FPToUI source and dest must both be vector or scalar", &I);
+ Assert1(SrcTy->isFPOrFPVector(), "FPToUI source must be FP or FP vector", &I);
+ Assert1(DestTy->isIntOrIntVector(),
+ "FPToUI result must be integer or integer vector", &I);
+
+ if (SrcVec && DstVec)
+ Assert1(cast<VectorType>(SrcTy)->getNumElements() ==
+ cast<VectorType>(DestTy)->getNumElements(),
+ "FPToUI source and dest vector length mismatch", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitFPToSIInst(FPToSIInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ bool SrcVec = isa<VectorType>(SrcTy);
+ bool DstVec = isa<VectorType>(DestTy);
+
+ Assert1(SrcVec == DstVec,
+ "FPToSI source and dest must both be vector or scalar", &I);
+ Assert1(SrcTy->isFPOrFPVector(),
+ "FPToSI source must be FP or FP vector", &I);
+ Assert1(DestTy->isIntOrIntVector(),
+ "FPToSI result must be integer or integer vector", &I);
+
+ if (SrcVec && DstVec)
+ Assert1(cast<VectorType>(SrcTy)->getNumElements() ==
+ cast<VectorType>(DestTy)->getNumElements(),
+ "FPToSI source and dest vector length mismatch", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ Assert1(isa<PointerType>(SrcTy), "PtrToInt source must be pointer", &I);
+ Assert1(DestTy->isInteger(), "PtrToInt result must be integral", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+  Assert1(SrcTy->isInteger(), "IntToPtr source must be an integral type", &I);
+  Assert1(isa<PointerType>(DestTy), "IntToPtr result must be a pointer", &I);
+
+ visitInstruction(I);
+}
+
+void Verifier::visitBitCastInst(BitCastInst &I) {
+ // Get the source and destination types
+ const Type *SrcTy = I.getOperand(0)->getType();
+ const Type *DestTy = I.getType();
+
+ // Get the size of the types in bits, we'll need this later
+ unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+ unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+ // BitCast implies a no-op cast of type only. No bits change.
+ // However, you can't cast pointers to anything but pointers.
+  Assert1(isa<PointerType>(SrcTy) == isa<PointerType>(DestTy),
+          "Bitcast requires both operands to be pointer or neither", &I);
+  Assert1(SrcBitSize == DestBitSize,
+          "Bitcast requires types of same width", &I);
+
+ // Disallow aggregates.
+ Assert1(!SrcTy->isAggregateType(),
+ "Bitcast operand must not be aggregate", &I);
+ Assert1(!DestTy->isAggregateType(),
+ "Bitcast type must not be aggregate", &I);
+
+ visitInstruction(I);
+}
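+
+// For illustration (editorial sketch, not part of the original source):
+//   %ok  = bitcast i32 %x to float   ; accepted, both 32 bits wide
+//   %bad = bitcast i32 %x to i64     ; rejected, width changes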
+
+/// visitPHINode - Ensure that a PHI node is well formed.
+///
+void Verifier::visitPHINode(PHINode &PN) {
+ // Ensure that the PHI nodes are all grouped together at the top of the block.
+ // This can be tested by checking whether the instruction before this is
+ // either nonexistent (because this is begin()) or is a PHI node. If not,
+ // then there is some other instruction before a PHI.
+ Assert2(&PN == &PN.getParent()->front() ||
+ isa<PHINode>(--BasicBlock::iterator(&PN)),
+ "PHI nodes not grouped at top of basic block!",
+ &PN, PN.getParent());
+
+ // Check that all of the operands of the PHI node have the same type as the
+ // result.
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ Assert1(PN.getType() == PN.getIncomingValue(i)->getType(),
+ "PHI node operands are not the same type as the result!", &PN);
+
+ // All other PHI node constraints are checked in the visitBasicBlock method.
+
+ visitInstruction(PN);
+}
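+
+// For illustration (editorial sketch, not part of the original source): the
+// grouping check above rejects
+//   %a = phi i32 [ 0, %entry ], [ %n, %loop ]
+//   %t = add i32 %a, 1
+//   %b = phi i32 [ 1, %entry ], [ %t, %loop ]
+// because %b follows a non-phi instruction.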
+
+void Verifier::VerifyCallSite(CallSite CS) {
+ Instruction *I = CS.getInstruction();
+
+ Assert1(isa<PointerType>(CS.getCalledValue()->getType()),
+ "Called function must be a pointer!", I);
+ const PointerType *FPTy = cast<PointerType>(CS.getCalledValue()->getType());
+ Assert1(isa<FunctionType>(FPTy->getElementType()),
+ "Called function is not pointer to function type!", I);
+
+ const FunctionType *FTy = cast<FunctionType>(FPTy->getElementType());
+
+ // Verify that the correct number of arguments are being passed
+ if (FTy->isVarArg())
+ Assert1(CS.arg_size() >= FTy->getNumParams(),
+ "Called function requires more parameters than were provided!",I);
+ else
+ Assert1(CS.arg_size() == FTy->getNumParams(),
+ "Incorrect number of arguments passed to called function!", I);
+
+ // Verify that all arguments to the call match the function type...
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+ Assert3(CS.getArgument(i)->getType() == FTy->getParamType(i),
+ "Call parameter type does not match function signature!",
+ CS.getArgument(i), FTy->getParamType(i), I);
+
+ const AttrListPtr &Attrs = CS.getAttributes();
+
+ Assert1(VerifyAttributeCount(Attrs, CS.arg_size()),
+ "Attributes after last parameter!", I);
+
+ // Verify call attributes.
+ VerifyFunctionAttrs(FTy, Attrs, I);
+
+ if (FTy->isVarArg())
+ // Check attributes on the varargs part.
+ for (unsigned Idx = 1 + FTy->getNumParams(); Idx <= CS.arg_size(); ++Idx) {
+ Attributes Attr = Attrs.getParamAttributes(Idx);
+
+ VerifyAttrs(Attr, CS.getArgument(Idx-1)->getType(), false, I);
+
+ Attributes VArgI = Attr & Attribute::VarArgsIncompatible;
+ Assert1(!VArgI, "Attribute " + Attribute::getAsString(VArgI) +
+ " cannot be used for vararg call arguments!", I);
+ }
+
+ // Verify that there's no metadata unless it's a direct call to an intrinsic.
+ if (!CS.getCalledFunction() || CS.getCalledFunction()->getName().size() < 5 ||
+ CS.getCalledFunction()->getName().substr(0, 5) != "llvm.") {
+ Assert1(FTy->getReturnType() != Type::MetadataTy,
+ "Only intrinsics may return metadata", I);
+ for (FunctionType::param_iterator PI = FTy->param_begin(),
+ PE = FTy->param_end(); PI != PE; ++PI)
+ Assert1(PI->get() != Type::MetadataTy, "Function has metadata parameter "
+ "but isn't an intrinsic", I);
+ }
+
+ visitInstruction(*I);
+}
+
+void Verifier::visitCallInst(CallInst &CI) {
+ VerifyCallSite(&CI);
+
+ if (Function *F = CI.getCalledFunction())
+ if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
+ visitIntrinsicFunctionCall(ID, CI);
+}
+
+void Verifier::visitInvokeInst(InvokeInst &II) {
+ VerifyCallSite(&II);
+}
+
+/// visitBinaryOperator - Check that both arguments to the binary operator are
+/// of the same type!
+///
+void Verifier::visitBinaryOperator(BinaryOperator &B) {
+ Assert1(B.getOperand(0)->getType() == B.getOperand(1)->getType(),
+ "Both operands to a binary operator are not of the same type!", &B);
+
+ switch (B.getOpcode()) {
+ // Check that logical operators are only used with integral operands.
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ Assert1(B.getType()->isInteger() ||
+ (isa<VectorType>(B.getType()) &&
+ cast<VectorType>(B.getType())->getElementType()->isInteger()),
+ "Logical operators only work with integral types!", &B);
+ Assert1(B.getType() == B.getOperand(0)->getType(),
+ "Logical operators must have same type for operands and result!",
+ &B);
+ break;
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ Assert1(B.getType()->isInteger() ||
+ (isa<VectorType>(B.getType()) &&
+ cast<VectorType>(B.getType())->getElementType()->isInteger()),
+ "Shifts only work with integral types!", &B);
+ Assert1(B.getType() == B.getOperand(0)->getType(),
+ "Shift return type must be same as operands!", &B);
+ /* FALL THROUGH */
+ default:
+ // Arithmetic operators only work on integer or fp values
+ Assert1(B.getType() == B.getOperand(0)->getType(),
+ "Arithmetic operators must have same type for operands and result!",
+ &B);
+ Assert1(B.getType()->isInteger() || B.getType()->isFloatingPoint() ||
+ isa<VectorType>(B.getType()),
+ "Arithmetic operators must have integer, fp, or vector type!", &B);
+ break;
+ }
+
+ visitInstruction(B);
+}
+
+void Verifier::visitICmpInst(ICmpInst& IC) {
+ // Check that the operands are the same type
+ const Type* Op0Ty = IC.getOperand(0)->getType();
+ const Type* Op1Ty = IC.getOperand(1)->getType();
+ Assert1(Op0Ty == Op1Ty,
+ "Both operands to ICmp instruction are not of the same type!", &IC);
+ // Check that the operands are the right type
+ Assert1(Op0Ty->isIntOrIntVector() || isa<PointerType>(Op0Ty),
+ "Invalid operand types for ICmp instruction", &IC);
+
+ visitInstruction(IC);
+}
+
+void Verifier::visitFCmpInst(FCmpInst& FC) {
+ // Check that the operands are the same type
+ const Type* Op0Ty = FC.getOperand(0)->getType();
+ const Type* Op1Ty = FC.getOperand(1)->getType();
+ Assert1(Op0Ty == Op1Ty,
+ "Both operands to FCmp instruction are not of the same type!", &FC);
+ // Check that the operands are the right type
+ Assert1(Op0Ty->isFPOrFPVector(),
+ "Invalid operand types for FCmp instruction", &FC);
+ visitInstruction(FC);
+}
+
+void Verifier::visitExtractElementInst(ExtractElementInst &EI) {
+ Assert1(ExtractElementInst::isValidOperands(EI.getOperand(0),
+ EI.getOperand(1)),
+ "Invalid extractelement operands!", &EI);
+ visitInstruction(EI);
+}
+
+void Verifier::visitInsertElementInst(InsertElementInst &IE) {
+ Assert1(InsertElementInst::isValidOperands(IE.getOperand(0),
+ IE.getOperand(1),
+ IE.getOperand(2)),
+ "Invalid insertelement operands!", &IE);
+ visitInstruction(IE);
+}
+
+void Verifier::visitShuffleVectorInst(ShuffleVectorInst &SV) {
+ Assert1(ShuffleVectorInst::isValidOperands(SV.getOperand(0), SV.getOperand(1),
+ SV.getOperand(2)),
+ "Invalid shufflevector operands!", &SV);
+
+ const VectorType *VTy = dyn_cast<VectorType>(SV.getOperand(0)->getType());
+ Assert1(VTy, "Operands are not a vector type", &SV);
+
+ // Check to see if Mask is valid.
+ if (const ConstantVector *MV = dyn_cast<ConstantVector>(SV.getOperand(2))) {
+ for (unsigned i = 0, e = MV->getNumOperands(); i != e; ++i) {
+ if (ConstantInt* CI = dyn_cast<ConstantInt>(MV->getOperand(i))) {
+ Assert1(!CI->uge(VTy->getNumElements()*2),
+ "Invalid shufflevector shuffle mask!", &SV);
+ } else {
+ Assert1(isa<UndefValue>(MV->getOperand(i)),
+ "Invalid shufflevector shuffle mask!", &SV);
+ }
+ }
+ } else {
+ Assert1(isa<UndefValue>(SV.getOperand(2)) ||
+ isa<ConstantAggregateZero>(SV.getOperand(2)),
+ "Invalid shufflevector shuffle mask!", &SV);
+ }
+
+ visitInstruction(SV);
+}
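+
+// For illustration (editorial sketch, not part of the original source): with
+// two <2 x i32> operands each constant mask element must be undef or less
+// than 4, e.g.
+//   %r = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>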
+
+void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ SmallVector<Value*, 16> Idxs(GEP.idx_begin(), GEP.idx_end());
+ const Type *ElTy =
+ GetElementPtrInst::getIndexedType(GEP.getOperand(0)->getType(),
+ Idxs.begin(), Idxs.end());
+ Assert1(ElTy, "Invalid indices for GEP pointer type!", &GEP);
+ Assert2(isa<PointerType>(GEP.getType()) &&
+ cast<PointerType>(GEP.getType())->getElementType() == ElTy,
+ "GEP is not of right type for indices!", &GEP, ElTy);
+ visitInstruction(GEP);
+}
+
+void Verifier::visitLoadInst(LoadInst &LI) {
+ const Type *ElTy =
+ cast<PointerType>(LI.getOperand(0)->getType())->getElementType();
+ Assert2(ElTy == LI.getType(),
+ "Load result type does not match pointer operand type!", &LI, ElTy);
+ Assert1(ElTy != Type::MetadataTy, "Can't load metadata!", &LI);
+ visitInstruction(LI);
+}
+
+void Verifier::visitStoreInst(StoreInst &SI) {
+ const Type *ElTy =
+ cast<PointerType>(SI.getOperand(1)->getType())->getElementType();
+ Assert2(ElTy == SI.getOperand(0)->getType(),
+ "Stored value type does not match pointer operand type!", &SI, ElTy);
+ Assert1(ElTy != Type::MetadataTy, "Can't store metadata!", &SI);
+ visitInstruction(SI);
+}
+
+void Verifier::visitAllocationInst(AllocationInst &AI) {
+ const PointerType *PTy = AI.getType();
+ Assert1(PTy->getAddressSpace() == 0,
+ "Allocation instruction pointer not in the generic address space!",
+ &AI);
+ Assert1(PTy->getElementType()->isSized(), "Cannot allocate unsized type",
+ &AI);
+ visitInstruction(AI);
+}
+
+void Verifier::visitExtractValueInst(ExtractValueInst &EVI) {
+ Assert1(ExtractValueInst::getIndexedType(EVI.getAggregateOperand()->getType(),
+ EVI.idx_begin(), EVI.idx_end()) ==
+ EVI.getType(),
+ "Invalid ExtractValueInst operands!", &EVI);
+
+ visitInstruction(EVI);
+}
+
+void Verifier::visitInsertValueInst(InsertValueInst &IVI) {
+ Assert1(ExtractValueInst::getIndexedType(IVI.getAggregateOperand()->getType(),
+ IVI.idx_begin(), IVI.idx_end()) ==
+ IVI.getOperand(1)->getType(),
+ "Invalid InsertValueInst operands!", &IVI);
+
+ visitInstruction(IVI);
+}
+
+/// visitInstruction - Verify that an instruction is well formed.
+///
+void Verifier::visitInstruction(Instruction &I) {
+ BasicBlock *BB = I.getParent();
+ Assert1(BB, "Instruction not embedded in basic block!", &I);
+
+ if (!isa<PHINode>(I)) { // Check that non-phi nodes are not self referential
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE; ++UI)
+ Assert1(*UI != (User*)&I || !DT->isReachableFromEntry(BB),
+ "Only PHI nodes may reference their own value!", &I);
+ }
+
+ // Verify that if this is a terminator that it is at the end of the block.
+ if (isa<TerminatorInst>(I))
+ Assert1(BB->getTerminator() == &I, "Terminator not at end of block!", &I);
+
+
+ // Check that void typed values don't have names
+ Assert1(I.getType() != Type::VoidTy || !I.hasName(),
+ "Instruction has a name, but provides a void value!", &I);
+
+ // Check that the return value of the instruction is either void or a legal
+ // value type.
+ Assert1(I.getType() == Type::VoidTy || I.getType()->isFirstClassType()
+ || ((isa<CallInst>(I) || isa<InvokeInst>(I))
+ && isa<StructType>(I.getType())),
+ "Instruction returns a non-scalar type!", &I);
+
+  // Check that the instruction doesn't produce metadata or metadata*. Calls
+  // are already checked against the callee type.
+ Assert1(I.getType() != Type::MetadataTy ||
+ isa<CallInst>(I) || isa<InvokeInst>(I),
+ "Invalid use of metadata!", &I);
+
+ if (const PointerType *PTy = dyn_cast<PointerType>(I.getType()))
+ Assert1(PTy->getElementType() != Type::MetadataTy,
+ "Instructions may not produce pointer to metadata.", &I);
+
+
+ // Check that all uses of the instruction, if they are instructions
+ // themselves, actually have parent basic blocks. If the use is not an
+ // instruction, it is an error!
+ for (User::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE; ++UI) {
+ Assert1(isa<Instruction>(*UI), "Use of instruction is not an instruction!",
+ *UI);
+ Instruction *Used = cast<Instruction>(*UI);
+ Assert2(Used->getParent() != 0, "Instruction referencing instruction not"
+ " embedded in a basic block!", &I, Used);
+ }
+
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ Assert1(I.getOperand(i) != 0, "Instruction has null operand!", &I);
+
+ // Check to make sure that only first-class-values are operands to
+ // instructions.
+ if (!I.getOperand(i)->getType()->isFirstClassType()) {
+ Assert1(0, "Instruction operands must be first-class values!", &I);
+ }
+
+ if (const PointerType *PTy =
+ dyn_cast<PointerType>(I.getOperand(i)->getType()))
+ Assert1(PTy->getElementType() != Type::MetadataTy,
+ "Invalid use of metadata pointer.", &I);
+
+ if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
+ // Check to make sure that the "address of" an intrinsic function is never
+ // taken.
+ Assert1(!F->isIntrinsic() || (i == 0 && isa<CallInst>(I)),
+ "Cannot take the address of an intrinsic!", &I);
+ Assert1(F->getParent() == Mod, "Referencing function in another module!",
+ &I);
+ } else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) {
+ Assert1(OpBB->getParent() == BB->getParent(),
+ "Referring to a basic block in another function!", &I);
+ } else if (Argument *OpArg = dyn_cast<Argument>(I.getOperand(i))) {
+ Assert1(OpArg->getParent() == BB->getParent(),
+ "Referring to an argument in another function!", &I);
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(I.getOperand(i))) {
+ Assert1(GV->getParent() == Mod, "Referencing global in another module!",
+ &I);
+ } else if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i))) {
+ BasicBlock *OpBlock = Op->getParent();
+
+ // Check that a definition dominates all of its uses.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Op)) {
+ // Invoke results are only usable in the normal destination, not in the
+ // exceptional destination.
+ BasicBlock *NormalDest = II->getNormalDest();
+
+ Assert2(NormalDest != II->getUnwindDest(),
+ "No uses of invoke possible due to dominance structure!",
+ Op, &I);
+
+ // PHI nodes differ from other nodes because they actually "use" the
+ // value in the predecessor basic blocks they correspond to.
+ BasicBlock *UseBlock = BB;
+ if (isa<PHINode>(I))
+ UseBlock = cast<BasicBlock>(I.getOperand(i+1));
+
+ if (isa<PHINode>(I) && UseBlock == OpBlock) {
+ // Special case of a phi node in the normal destination or the unwind
+ // destination.
+ Assert2(BB == NormalDest || !DT->isReachableFromEntry(UseBlock),
+ "Invoke result not available in the unwind destination!",
+ Op, &I);
+ } else {
+ Assert2(DT->dominates(NormalDest, UseBlock) ||
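+// For illustration (editorial note, not in the original source): a use such as
+//   Assert2(A == B, "mismatch", A, B);
+// expands to
+//   do { if (!(A == B)) { CheckFailed("mismatch", A, B); return; } } while (0);
+// so a failing check records the message into msgs, sets Broken, and aborts
+// the current visit method.
+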
+ !DT->isReachableFromEntry(UseBlock),
+ "Invoke result does not dominate all uses!", Op, &I);
+
+ // If the normal successor of an invoke instruction has multiple
+ // predecessors, then the normal edge from the invoke is critical,
+ // so the invoke value can only be live if the destination block
+          // dominates all of its predecessors (other than the invoke).
+ if (!NormalDest->getSinglePredecessor() &&
+ DT->isReachableFromEntry(UseBlock))
+ // If it is used by something non-phi, then the other case is that
+ // 'NormalDest' dominates all of its predecessors other than the
+ // invoke. In this case, the invoke value can still be used.
+ for (pred_iterator PI = pred_begin(NormalDest),
+ E = pred_end(NormalDest); PI != E; ++PI)
+ if (*PI != II->getParent() && !DT->dominates(NormalDest, *PI) &&
+ DT->isReachableFromEntry(*PI)) {
+ CheckFailed("Invoke result does not dominate all uses!", Op,&I);
+ return;
+ }
+ }
+ } else if (isa<PHINode>(I)) {
+ // PHI nodes are more difficult than other nodes because they actually
+ // "use" the value in the predecessor basic blocks they correspond to.
+ BasicBlock *PredBB = cast<BasicBlock>(I.getOperand(i+1));
+ Assert2(DT->dominates(OpBlock, PredBB) ||
+ !DT->isReachableFromEntry(PredBB),
+ "Instruction does not dominate all uses!", Op, &I);
+ } else {
+ if (OpBlock == BB) {
+ // If they are in the same basic block, make sure that the definition
+ // comes before the use.
+ Assert2(InstsInThisBlock.count(Op) || !DT->isReachableFromEntry(BB),
+ "Instruction does not dominate all uses!", Op, &I);
+ }
+
+ // Definition must dominate use unless use is unreachable!
+ Assert2(InstsInThisBlock.count(Op) || DT->dominates(Op, &I) ||
+ !DT->isReachableFromEntry(BB),
+ "Instruction does not dominate all uses!", Op, &I);
+ }
+ } else if (isa<InlineAsm>(I.getOperand(i))) {
+ Assert1(i == 0 && (isa<CallInst>(I) || isa<InvokeInst>(I)),
+ "Cannot take the address of an inline asm!", &I);
+ }
+ }
+ InstsInThisBlock.insert(&I);
+}
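+
+// For illustration (editorial sketch, not part of the original source):
+// within a single reachable block,
+//   %y = add i32 %x, 1
+//   %x = add i32 %z, 1
+// trips the "Instruction does not dominate all uses!" check above.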
+
+// Flags used by TableGen to mark intrinsic parameters with the
+// LLVMExtendedElementVectorType and LLVMTruncatedElementVectorType classes.
+static const unsigned ExtendedElementVectorType = 0x40000000;
+static const unsigned TruncatedElementVectorType = 0x20000000;
+
+/// visitIntrinsicFunctionCall - Allow intrinsics to be verified in different
+/// ways.
+///
+void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
+ Function *IF = CI.getCalledFunction();
+ Assert1(IF->isDeclaration(), "Intrinsic functions should never be defined!",
+ IF);
+
+#define GET_INTRINSIC_VERIFIER
+#include "llvm/Intrinsics.gen"
+#undef GET_INTRINSIC_VERIFIER
+
+ switch (ID) {
+ default:
+ break;
+ case Intrinsic::dbg_declare: // llvm.dbg.declare
+ if (Constant *C = dyn_cast<Constant>(CI.getOperand(1)))
+ Assert1(C && !isa<ConstantPointerNull>(C),
+ "invalid llvm.dbg.declare intrinsic call", &CI);
+ break;
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ Assert1(isa<ConstantInt>(CI.getOperand(4)),
+ "alignment argument of memory intrinsics must be a constant int",
+ &CI);
+ break;
+ case Intrinsic::gcroot:
+ case Intrinsic::gcwrite:
+ case Intrinsic::gcread:
+ if (ID == Intrinsic::gcroot) {
+ AllocaInst *AI =
+ dyn_cast<AllocaInst>(CI.getOperand(1)->stripPointerCasts());
+ Assert1(AI && isa<PointerType>(AI->getType()->getElementType()),
+ "llvm.gcroot parameter #1 must be a pointer alloca.", &CI);
+ Assert1(isa<Constant>(CI.getOperand(2)),
+ "llvm.gcroot parameter #2 must be a constant.", &CI);
+ }
+
+ Assert1(CI.getParent()->getParent()->hasGC(),
+ "Enclosing function does not use GC.", &CI);
+ break;
+ case Intrinsic::init_trampoline:
+ Assert1(isa<Function>(CI.getOperand(2)->stripPointerCasts()),
+ "llvm.init_trampoline parameter #2 must resolve to a function.",
+ &CI);
+ break;
+ case Intrinsic::prefetch:
+ Assert1(isa<ConstantInt>(CI.getOperand(2)) &&
+ isa<ConstantInt>(CI.getOperand(3)) &&
+ cast<ConstantInt>(CI.getOperand(2))->getZExtValue() < 2 &&
+ cast<ConstantInt>(CI.getOperand(3))->getZExtValue() < 4,
+ "invalid arguments to llvm.prefetch",
+ &CI);
+ break;
+ case Intrinsic::stackprotector:
+ Assert1(isa<AllocaInst>(CI.getOperand(2)->stripPointerCasts()),
+ "llvm.stackprotector parameter #2 must resolve to an alloca.",
+ &CI);
+ break;
+ }
+}
+
+/// Produce a string to identify an intrinsic parameter or return value.
+/// The ArgNo value numbers the return values from 0 to NumRets-1 and the
+/// parameters beginning with NumRets.
+///
+static std::string IntrinsicParam(unsigned ArgNo, unsigned NumRets) {
+ if (ArgNo < NumRets) {
+ if (NumRets == 1)
+ return "Intrinsic result type";
+ else
+ return "Intrinsic result type #" + utostr(ArgNo);
+ } else
+ return "Intrinsic parameter #" + utostr(ArgNo - NumRets);
+}
+
+bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty,
+ int VT, unsigned ArgNo, std::string &Suffix) {
+ const FunctionType *FTy = F->getFunctionType();
+
+ unsigned NumElts = 0;
+ const Type *EltTy = Ty;
+ const VectorType *VTy = dyn_cast<VectorType>(Ty);
+ if (VTy) {
+ EltTy = VTy->getElementType();
+ NumElts = VTy->getNumElements();
+ }
+
+ const Type *RetTy = FTy->getReturnType();
+ const StructType *ST = dyn_cast<StructType>(RetTy);
+ unsigned NumRets = 1;
+ if (ST)
+ NumRets = ST->getNumElements();
+
+ if (VT < 0) {
+ int Match = ~VT;
+
+ // Check flags that indicate a type that is an integral vector type with
+ // elements that are larger or smaller than the elements of the matched
+ // type.
+ if ((Match & (ExtendedElementVectorType |
+ TruncatedElementVectorType)) != 0) {
+ const IntegerType *IEltTy = dyn_cast<IntegerType>(EltTy);
+ if (!VTy || !IEltTy) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not "
+ "an integral vector type.", F);
+ return false;
+ }
+ // Adjust the current Ty (in the opposite direction) rather than
+ // the type being matched against.
+ if ((Match & ExtendedElementVectorType) != 0) {
+ if ((IEltTy->getBitWidth() & 1) != 0) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " vector "
+ "element bit-width is odd.", F);
+ return false;
+ }
+ Ty = VectorType::getTruncatedElementVectorType(VTy);
+ } else
+ Ty = VectorType::getExtendedElementVectorType(VTy);
+ Match &= ~(ExtendedElementVectorType | TruncatedElementVectorType);
+ }
+
+ if (Match <= static_cast<int>(NumRets - 1)) {
+ if (ST)
+ RetTy = ST->getElementType(Match);
+
+ if (Ty != RetTy) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " does not "
+ "match return type.", F);
+ return false;
+ }
+ } else {
+ if (Ty != FTy->getParamType(Match - 1)) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " does not "
+ "match parameter %" + utostr(Match - 1) + ".", F);
+ return false;
+ }
+ }
+ } else if (VT == MVT::iAny) {
+ if (!EltTy->isInteger()) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not "
+ "an integer type.", F);
+ return false;
+ }
+
+ unsigned GotBits = cast<IntegerType>(EltTy)->getBitWidth();
+ Suffix += ".";
+
+ if (EltTy != Ty)
+ Suffix += "v" + utostr(NumElts);
+
+ Suffix += "i" + utostr(GotBits);
+
+ // Check some constraints on various intrinsics.
+ switch (ID) {
+ default: break; // Not everything needs to be checked.
+ case Intrinsic::bswap:
+ if (GotBits < 16 || GotBits % 16 != 0) {
+ CheckFailed("Intrinsic requires even byte width argument", F);
+ return false;
+ }
+ break;
+ }
+ } else if (VT == MVT::fAny) {
+ if (!EltTy->isFloatingPoint()) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not "
+ "a floating-point type.", F);
+ return false;
+ }
+
+ Suffix += ".";
+
+ if (EltTy != Ty)
+ Suffix += "v" + utostr(NumElts);
+
+ Suffix += MVT::getMVT(EltTy).getMVTString();
+ } else if (VT == MVT::iPTR) {
+ if (!isa<PointerType>(Ty)) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not a "
+ "pointer and a pointer is required.", F);
+ return false;
+ }
+ } else if (VT == MVT::iPTRAny) {
+ // Outside of TableGen, we don't distinguish iPTRAny (to any address space)
+    // and iPTR. In the verifier, we cannot distinguish which case we have,
+    // so allow either case to be legal.
+ if (const PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
+ Suffix += ".p" + utostr(PTyp->getAddressSpace()) +
+ MVT::getMVT(PTyp->getElementType()).getMVTString();
+ } else {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is not a "
+ "pointer and a pointer is required.", F);
+ return false;
+ }
+ } else if (MVT((MVT::SimpleValueType)VT).isVector()) {
+ MVT VVT = MVT((MVT::SimpleValueType)VT);
+
+ // If this is a vector argument, verify the number and type of elements.
+ if (VVT.getVectorElementType() != MVT::getMVT(EltTy)) {
+ CheckFailed("Intrinsic prototype has incorrect vector element type!", F);
+ return false;
+ }
+
+ if (VVT.getVectorNumElements() != NumElts) {
+ CheckFailed("Intrinsic prototype has incorrect number of "
+ "vector elements!", F);
+ return false;
+ }
+ } else if (MVT((MVT::SimpleValueType)VT).getTypeForMVT() != EltTy) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is wrong!", F);
+ return false;
+ } else if (EltTy != Ty) {
+ CheckFailed(IntrinsicParam(ArgNo, NumRets) + " is a vector "
+ "and a scalar is required.", F);
+ return false;
+ }
+
+ return true;
+}
+
+/// VerifyIntrinsicPrototype - TableGen emits calls to this function into
+/// Intrinsics.gen. This implements a little state machine that verifies the
+/// prototype of intrinsics.
+void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F,
+ unsigned RetNum,
+ unsigned ParamNum, ...) {
+ va_list VA;
+ va_start(VA, ParamNum);
+ const FunctionType *FTy = F->getFunctionType();
+
+ // For overloaded intrinsics, the Suffix of the function name must match the
+ // types of the arguments. This variable keeps track of the expected
+ // suffix, to be checked at the end.
+ std::string Suffix;
+
+ if (FTy->getNumParams() + FTy->isVarArg() != ParamNum) {
+ CheckFailed("Intrinsic prototype has incorrect number of arguments!", F);
+ return;
+ }
+
+ const Type *Ty = FTy->getReturnType();
+ const StructType *ST = dyn_cast<StructType>(Ty);
+
+ // Verify the return types.
+ if (ST && ST->getNumElements() != RetNum) {
+ CheckFailed("Intrinsic prototype has incorrect number of return types!", F);
+ return;
+ }
+
+ for (unsigned ArgNo = 0; ArgNo < RetNum; ++ArgNo) {
+ int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative.
+
+ if (ST) Ty = ST->getElementType(ArgNo);
+
+ if (!PerformTypeCheck(ID, F, Ty, VT, ArgNo, Suffix))
+ break;
+ }
+
+ // Verify the parameter types.
+ for (unsigned ArgNo = 0; ArgNo < ParamNum; ++ArgNo) {
+ int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative.
+
+ if (VT == MVT::isVoid && ArgNo > 0) {
+ if (!FTy->isVarArg())
+ CheckFailed("Intrinsic prototype has no '...'!", F);
+ break;
+ }
+
+ if (!PerformTypeCheck(ID, F, FTy->getParamType(ArgNo), VT, ArgNo + RetNum,
+ Suffix))
+ break;
+ }
+
+ va_end(VA);
+
+ // For intrinsics without pointer arguments, if we computed a Suffix then the
+ // intrinsic is overloaded and we need to make sure that the name of the
+ // function is correct. We add the suffix to the name of the intrinsic and
+ // compare against the given function name. If they are not the same, the
+ // function name is invalid. This ensures that overloading of intrinsics
+  // uses a sane and consistent naming convention. Note that an intrinsic with
+  // a pointer argument may or may not be overloaded, so the name is only
+  // checked when a suffix was actually computed.
+ if (!Suffix.empty()) {
+ std::string Name(Intrinsic::getName(ID));
+ if (Name + Suffix != F->getName()) {
+ CheckFailed("Overloaded intrinsic has incorrect suffix: '" +
+ F->getName().substr(Name.length()) + "'. It should be '" +
+ Suffix + "'", F);
+ }
+ }
+
+ // Check parameter attributes.
+ Assert1(F->getAttributes() == Intrinsic::getAttributes(ID),
+ "Intrinsic has wrong parameter attributes!", F);
+}
+
+/// Verify that an MDNode is not cyclic.
+bool Verifier::VerifyMDNode(const MDNode *N) {
+ if (N->elem_empty()) return true;
+
+ // The current DFS path through the nodes. Node and element number.
+ typedef std::pair<const MDNode *, MDNode::const_elem_iterator> Edge;
+ SmallVector<Edge, 8> Path;
+
+ Path.push_back(std::make_pair(N, N->elem_begin()));
+ while (!Path.empty()) {
+ Edge &e = Path.back();
+ const MDNode *&e_N = e.first;
+ MDNode::const_elem_iterator &e_I = e.second;
+
+ if (e_N->elem_end() == e_I) {
+ Path.pop_back();
+ continue;
+ }
+
+ for (MDNode::const_elem_iterator e_E = e_N->elem_end(); e_I != e_E; ++e_I) {
+ if (const MDNode *C = dyn_cast_or_null<MDNode>(e_I->operator Value*())) {
+ // Is child MDNode C already in the Path?
+ for (SmallVectorImpl<Edge>::iterator I = Path.begin(), E = Path.end();
+ I != E; ++I) {
+          if (I->first == C) {
+ CheckFailed("MDNode is cyclic.", C);
+ return false;
+ }
+ }
+
+ Path.push_back(std::make_pair(C, C->elem_begin()));
+ break;
+ }
+ }
+ }
+ return true;
+}
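+
+// For illustration (editorial sketch, not part of the original source;
+// pseudo-notation): two nodes that contain each other, N0 = !{ N1 } and
+// N1 = !{ N0 }, put a node back on the current DFS Path and are reported
+// as "MDNode is cyclic."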
+
+
+//===----------------------------------------------------------------------===//
+// Implement the public interfaces to this file...
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createVerifierPass(VerifierFailureAction action) {
+ return new Verifier(action);
+}
+
+
+/// verifyFunction - Check a function for errors, printing messages on stderr.
+/// Return true if the function is corrupt.
+///
+bool llvm::verifyFunction(const Function &f, VerifierFailureAction action) {
+ Function &F = const_cast<Function&>(f);
+ assert(!F.isDeclaration() && "Cannot verify external functions");
+
+ ExistingModuleProvider MP(F.getParent());
+ FunctionPassManager FPM(&MP);
+ Verifier *V = new Verifier(action);
+ FPM.add(V);
+ FPM.run(F);
+ MP.releaseModule();
+ return V->Broken;
+}
+
+/// verifyModule - Check a module for errors, printing messages on stderr.
+/// Return true if the module is corrupt.
+///
+bool llvm::verifyModule(const Module &M, VerifierFailureAction action,
+ std::string *ErrorInfo) {
+ PassManager PM;
+ Verifier *V = new Verifier(action);
+ PM.add(V);
+ PM.run(const_cast<Module&>(M));
+
+ if (ErrorInfo && V->Broken)
+ *ErrorInfo = V->msgs.str();
+ return V->Broken;
+}
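+
+// A minimal usage sketch (editorial, not part of the original source):
+//   std::string Err;
+//   if (verifyModule(*M, ReturnStatusAction, &Err))
+//     errs() << "broken module: " << Err;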
+
+// vim: sw=2